diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff609588b0c59e387408be531de98e9599a5e1a8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..84a484b51d9c44cf249d16e46dd9352f4529fae3 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3bbb2eb49ee7ab547cf104e08c4a0de0bb4b8eaa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "d4e9ec88be03aebb42041f53f2cbb9bb30afcd0e87a7a62c3ff7f5fe862eba44", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..b592e97ee0a4aee4d7bc3c4cba50deb5e62e7a8c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir @@ -0,0 +1,667 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py\00" +@assertMessage_1 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp25 < ks4\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp5 < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #1 !dbg !9 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %15 = shl i32 %14, 10, !dbg !11 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %17 = shl nuw nsw i32 %16, 3, !dbg !12 + %18 = and i32 %17, 1016, !dbg !12 + %19 = or disjoint i32 %18, %15, !dbg !13 + %20 = or disjoint i32 %19, 1, !dbg !13 + %21 = or disjoint i32 %19, 2, !dbg !13 + %22 = or disjoint i32 %19, 3, !dbg !13 + %23 = or disjoint i32 %19, 4, !dbg !13 + %24 = or disjoint i32 %19, 5, !dbg !13 + %25 = or disjoint i32 %19, 6, !dbg !13 + %26 = or disjoint i32 %19, 7, !dbg !13 + %27 = insertelement <8 x i32> poison, i32 %26, i64 0, !dbg !14 + %28 = insertelement <8 x i32> %27, i32 %25, i64 1, !dbg !14 + %29 = insertelement <8 x i32> %28, i32 %24, i64 2, !dbg !14 + %30 = insertelement <8 x i32> %29, i32 %23, i64 3, !dbg !14 + %31 = insertelement <8 x i32> %30, i32 %22, i64 4, !dbg !14 + %32 = insertelement <8 x i32> %31, i32 %21, i64 5, !dbg !14 + %33 = insertelement <8 x i32> %32, i32 %20, i64 6, !dbg !14 + %34 = insertelement <8 x i32> %33, i32 %19, i64 7, !dbg !14 + %35 = sext <8 x i32> %34 to <8 x i64>, !dbg !14 + %36 = extractelement <8 x i64> %35, i64 7, !dbg !15 + %37 = sdiv i64 %36, %5, !dbg !14 + %38 = extractelement <8 x i64> %35, i64 6, !dbg !15 + %39 = sdiv i64 %38, %5, !dbg !14 + %40 = extractelement <8 x i64> %35, i64 5, !dbg !15 + %41 = sdiv i64 %40, %5, !dbg !14 + %42 = extractelement <8 x i64> %35, i64 4, !dbg !15 + %43 = sdiv i64 %42, %5, !dbg !14 + %44 = extractelement <8 x i64> %35, i64 3, !dbg !15 + %45 = sdiv i64 %44, %5, !dbg !14 + %46 = extractelement <8 x i64> %35, i64 2, !dbg !15 + %47 = sdiv i64 %46, %5, !dbg !14 + %48 = extractelement <8 x i64> %35, i64 1, !dbg !15 + %49 = sdiv i64 %48, %5, !dbg !14 + %50 = extractelement <8 x i64> %35, i64 0, !dbg !15 + %51 = sdiv i64 %50, %5, !dbg !14 + %52 = srem i64 %37, %6, !dbg !16 + %53 = srem i64 %39, %6, !dbg !16 + %54 = srem i64 %41, %6, !dbg !16 + %55 = srem i64 %43, %6, !dbg !16 + %56 = srem i64 %45, %6, !dbg !16 + %57 = srem i64 %47, %6, !dbg !16 + %58 = srem i64 %49, %6, !dbg !16 + %59 = srem i64 %51, %6, !dbg !16 + %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %36, !dbg !15 + %61 = getelementptr bfloat, ptr addrspace(1) %0, i64 %38, !dbg !15 + %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %40, !dbg !15 + %63 = getelementptr bfloat, ptr addrspace(1) %0, i64 %42, !dbg !15 + %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %44, !dbg !15 + %65 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !15 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %48, !dbg !15 + %67 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !15 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %69 = getelementptr i64, ptr addrspace(1) %1, i64 %52, !dbg !18 + %70 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !18 + %71 = getelementptr i64, ptr addrspace(1) %1, i64 %54, !dbg !18 + %72 = getelementptr i64, ptr addrspace(1) %1, i64 %55, !dbg !18 + %73 = getelementptr i64, ptr addrspace(1) %1, i64 %56, !dbg !18 + %74 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !18 + %75 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !18 + %76 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !18 + %77 = insertelement <8 x i32> poison, i32 %19, i64 0, !dbg !19 + %78 = insertelement <8 x i32> %77, i32 %20, i64 1, !dbg !19 + %79 = insertelement <8 x i32> %78, i32 %21, i64 2, !dbg !19 + %80 = insertelement <8 x i32> %79, i32 %22, i64 3, !dbg !19 + %81 = insertelement <8 x i32> %80, i32 %23, i64 4, !dbg !19 + %82 = insertelement <8 x i32> %81, i32 %24, i64 5, !dbg !19 + %83 = insertelement <8 x i32> %82, i32 %25, i64 6, !dbg !19 + %84 = insertelement <8 x i32> %83, i32 %26, i64 7, !dbg !19 + %85 = insertelement <8 x i32> poison, i32 %10, i64 0, !dbg !19 + %86 = shufflevector <8 x i32> %85, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !19 + %87 = icmp slt <8 x i32> %84, %86, !dbg !19 + %88 = extractelement <8 x i1> %87, i64 0, !dbg !17 + %89 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %60, i64 %68, i1 %88) #4, !dbg !17 + %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %91 = extractelement <8 x i1> %87, i64 1, !dbg !17 + %92 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %61, i64 %90, i1 %91) #4, !dbg !17 + %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %94 = extractelement <8 x i1> %87, i64 2, !dbg !17 + %95 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %62, i64 %93, i1 %94) #4, !dbg !17 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %97 = extractelement <8 x i1> %87, i64 3, !dbg !17 + %98 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %63, i64 %96, i1 %97) #4, !dbg !17 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %100 = extractelement <8 x i1> %87, i64 4, !dbg !17 + %101 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %64, i64 %99, i1 %100) #4, !dbg !17 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %103 = extractelement <8 x i1> %87, i64 5, !dbg !17 + %104 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %65, i64 %102, i1 %103) #4, !dbg !17 + %105 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %106 = extractelement <8 x i1> %87, i64 6, !dbg !17 + %107 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %66, i64 %105, i1 %106) #4, !dbg !17 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %109 = extractelement <8 x i1> %87, i64 7, !dbg !17 + %110 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %67, i64 %108, i1 %109) #4, !dbg !17 + %111 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %69, i64 %111, i1 %88) #4, !dbg !20 + %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %114 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %70, i64 %113, i1 %91) #4, !dbg !20 + %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %116 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %71, i64 %115, i1 %94) #4, !dbg !20 + %117 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %118 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %72, i64 %117, i1 %97) #4, !dbg !20 + %119 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %120 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %73, i64 %119, i1 %100) #4, !dbg !20 + %121 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %122 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %74, i64 %121, i1 %103) #4, !dbg !20 + %123 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %75, i64 %123, i1 %106) #4, !dbg !20 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %126 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %76, i64 %125, i1 %109) #4, !dbg !20 + %127 = insertelement <8 x i64> poison, i64 %112, i64 0, !dbg !21 + %128 = insertelement <8 x i64> %127, i64 %114, i64 1, !dbg !21 + %129 = insertelement <8 x i64> %128, i64 %116, i64 2, !dbg !21 + %130 = insertelement <8 x i64> %129, i64 %118, i64 3, !dbg !21 + %131 = insertelement <8 x i64> %130, i64 %120, i64 4, !dbg !21 + %132 = insertelement <8 x i64> %131, i64 %122, i64 5, !dbg !21 + %133 = insertelement <8 x i64> %132, i64 %124, i64 6, !dbg !21 + %134 = insertelement <8 x i64> %133, i64 %126, i64 7, !dbg !21 + %135 = icmp slt <8 x i64> %134, zeroinitializer, !dbg !21 + %136 = insertelement <8 x i64> poison, i64 %7, i64 0, !dbg !22 + %137 = shufflevector <8 x i64> %136, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !22 + %138 = select <8 x i1> %135, <8 x i64> %137, <8 x i64> zeroinitializer, !dbg !22 + %139 = add <8 x i64> %138, %134, !dbg !22 + %140 = icmp slt <8 x i64> %139, zeroinitializer, !dbg !23 + %141 = icmp sge <8 x i64> %139, %137, !dbg !24 + %142 = or <8 x i1> %140, %141, !dbg !25 + %143 = and <8 x i1> %87, %142, !dbg !26 + %144 = bitcast <8 x i1> %143 to i8, !dbg !27 + %.not = icmp eq i8 %144, 0, !dbg !27 + br i1 %.not, label %146, label %145, !dbg !27 + +145: ; preds = %13 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 32, ptr nonnull @assertFunc_0, i64 1), !dbg !27 + unreachable, !dbg !27 + +146: ; preds = %13 + %147 = insertelement <8 x i64> poison, i64 %8, i64 0, !dbg !28 + %148 = shufflevector <8 x i64> %147, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !28 + %149 = srem <8 x i64> %35, %148, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %150 = extractelement <8 x i64> %139, i64 0, !dbg !29 + %151 = mul i64 %150, %8, !dbg !29 + %152 = extractelement <8 x i64> %139, i64 1, !dbg !29 + %153 = mul i64 %152, %8, !dbg !29 + %154 = extractelement <8 x i64> %139, i64 2, !dbg !29 + %155 = mul i64 %154, %8, !dbg !29 + %156 = extractelement <8 x i64> %139, i64 3, !dbg !29 + %157 = mul i64 %156, %8, !dbg !29 + %158 = extractelement <8 x i64> %139, i64 4, !dbg !29 + %159 = mul i64 %158, %8, !dbg !29 + %160 = extractelement <8 x i64> %139, i64 5, !dbg !29 + %161 = mul i64 %160, %8, !dbg !29 + %162 = extractelement <8 x i64> %139, i64 6, !dbg !29 + %163 = mul i64 %162, %8, !dbg !29 + %164 = extractelement <8 x i64> %139, i64 7, !dbg !29 + %165 = mul i64 %164, %8, !dbg !29 + %166 = extractelement <8 x i64> %149, i64 7, !dbg !30 + %167 = getelementptr bfloat, ptr addrspace(1) %2, i64 %166, !dbg !31 + %168 = getelementptr bfloat, ptr addrspace(1) %167, i64 %151, !dbg !31 + %169 = extractelement <8 x i64> %149, i64 6, !dbg !30 + %170 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !31 + %171 = getelementptr bfloat, ptr addrspace(1) %170, i64 %153, !dbg !31 + %172 = extractelement <8 x i64> %149, i64 5, !dbg !30 + %173 = getelementptr bfloat, ptr addrspace(1) %2, i64 %172, !dbg !31 + %174 = getelementptr bfloat, ptr addrspace(1) %173, i64 %155, !dbg !31 + %175 = extractelement <8 x i64> %149, i64 4, !dbg !30 + %176 = getelementptr bfloat, ptr addrspace(1) %2, i64 %175, !dbg !31 + %177 = getelementptr bfloat, ptr addrspace(1) %176, i64 %157, !dbg !31 + %178 = extractelement <8 x i64> %149, i64 3, !dbg !30 + %179 = getelementptr bfloat, ptr addrspace(1) %2, i64 %178, !dbg !31 + %180 = getelementptr bfloat, ptr addrspace(1) %179, i64 %159, !dbg !31 + %181 = extractelement <8 x i64> %149, i64 2, !dbg !30 + %182 = getelementptr bfloat, ptr addrspace(1) %2, i64 %181, !dbg !31 + %183 = getelementptr bfloat, ptr addrspace(1) %182, i64 %161, !dbg !31 + %184 = extractelement <8 x i64> %149, i64 1, !dbg !30 + %185 = getelementptr bfloat, ptr addrspace(1) %2, i64 %184, !dbg !31 + %186 = getelementptr bfloat, ptr addrspace(1) %185, i64 %163, !dbg !31 + %187 = extractelement <8 x i64> %149, i64 0, !dbg !30 + %188 = getelementptr bfloat, ptr addrspace(1) %2, i64 %187, !dbg !31 + %189 = getelementptr bfloat, ptr addrspace(1) %188, i64 %165, !dbg !31 + %190 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %168, i64 %190, i1 %88) #4, !dbg !32 + %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %193 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %171, i64 %192, i1 %91) #4, !dbg !32 + %194 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %195 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %174, i64 %194, i1 %94) #4, !dbg !32 + %196 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %197 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %177, i64 %196, i1 %97) #4, !dbg !32 + %198 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %199 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %180, i64 %198, i1 %100) #4, !dbg !32 + %200 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %201 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %183, i64 %200, i1 %103) #4, !dbg !32 + %202 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %203 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %186, i64 %202, i1 %106) #4, !dbg !32 + %204 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %205 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %189, i64 %204, i1 %109) #4, !dbg !32 + %206 = sdiv i64 %8, 2, !dbg !33 + %207 = sub i64 %8, %206, !dbg !34 + %208 = icmp slt i64 %166, %207, !dbg !35 + %209 = icmp slt i64 %169, %207, !dbg !35 + %210 = icmp slt i64 %172, %207, !dbg !35 + %211 = icmp slt i64 %175, %207, !dbg !35 + %212 = icmp slt i64 %178, %207, !dbg !35 + %213 = icmp slt i64 %181, %207, !dbg !35 + %214 = icmp slt i64 %184, %207, !dbg !35 + %215 = icmp slt i64 %187, %207, !dbg !35 + %216 = sub nsw i64 %36, %166, !dbg !30 + %217 = sub nsw i64 %38, %169, !dbg !30 + %218 = sub nsw i64 %40, %172, !dbg !30 + %219 = sub nsw i64 %42, %175, !dbg !30 + %220 = sub nsw i64 %44, %178, !dbg !30 + %221 = sub nsw i64 %46, %181, !dbg !30 + %222 = sub nsw i64 %48, %184, !dbg !30 + %223 = sub nsw i64 %50, %187, !dbg !30 + %224 = getelementptr bfloat, ptr addrspace(1) %0, i64 %216, !dbg !36 + %225 = getelementptr bfloat, ptr addrspace(1) %224, i64 %206, !dbg !36 + %226 = getelementptr bfloat, ptr addrspace(1) %225, i64 %166, !dbg !36 + %227 = getelementptr bfloat, ptr addrspace(1) %0, i64 %217, !dbg !36 + %228 = getelementptr bfloat, ptr addrspace(1) %227, i64 %206, !dbg !36 + %229 = getelementptr bfloat, ptr addrspace(1) %228, i64 %169, !dbg !36 + %230 = getelementptr bfloat, ptr addrspace(1) %0, i64 %218, !dbg !36 + %231 = getelementptr bfloat, ptr addrspace(1) %230, i64 %206, !dbg !36 + %232 = getelementptr bfloat, ptr addrspace(1) %231, i64 %172, !dbg !36 + %233 = getelementptr bfloat, ptr addrspace(1) %0, i64 %219, !dbg !36 + %234 = getelementptr bfloat, ptr addrspace(1) %233, i64 %206, !dbg !36 + %235 = getelementptr bfloat, ptr addrspace(1) %234, i64 %175, !dbg !36 + %236 = getelementptr bfloat, ptr addrspace(1) %0, i64 %220, !dbg !36 + %237 = getelementptr bfloat, ptr addrspace(1) %236, i64 %206, !dbg !36 + %238 = getelementptr bfloat, ptr addrspace(1) %237, i64 %178, !dbg !36 + %239 = getelementptr bfloat, ptr addrspace(1) %0, i64 %221, !dbg !36 + %240 = getelementptr bfloat, ptr addrspace(1) %239, i64 %206, !dbg !36 + %241 = getelementptr bfloat, ptr addrspace(1) %240, i64 %181, !dbg !36 + %242 = getelementptr bfloat, ptr addrspace(1) %0, i64 %222, !dbg !36 + %243 = getelementptr bfloat, ptr addrspace(1) %242, i64 %206, !dbg !36 + %244 = getelementptr bfloat, ptr addrspace(1) %243, i64 %184, !dbg !36 + %245 = getelementptr bfloat, ptr addrspace(1) %0, i64 %223, !dbg !36 + %246 = getelementptr bfloat, ptr addrspace(1) %245, i64 %206, !dbg !36 + %247 = getelementptr bfloat, ptr addrspace(1) %246, i64 %187, !dbg !36 + %248 = and i1 %88, %208, !dbg !37 + %249 = and i1 %91, %209, !dbg !37 + %250 = and i1 %94, %210, !dbg !37 + %251 = and i1 %97, %211, !dbg !37 + %252 = and i1 %100, %212, !dbg !37 + %253 = and i1 %103, %213, !dbg !37 + %254 = and i1 %106, %214, !dbg !37 + %255 = and i1 %109, %215, !dbg !37 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %226, i64 %256, i1 %248) #4, !dbg !38 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %229, i64 %258, i1 %249) #4, !dbg !38 + %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %232, i64 %260, i1 %250) #4, !dbg !38 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %262, i1 %251) #4, !dbg !38 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %238, i64 %264, i1 %252) #4, !dbg !38 + %266 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %267 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %266, i1 %253) #4, !dbg !38 + %268 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %269 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %244, i64 %268, i1 %254) #4, !dbg !38 + %270 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %271 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %270, i1 %255) #4, !dbg !38 + %272 = insertelement <8 x i64> poison, i64 %207, i64 0, !dbg !39 + %273 = shufflevector <8 x i64> %272, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !39 + %274 = icmp sge <8 x i64> %149, %273, !dbg !39 + %275 = sub i64 %166, %8, !dbg !40 + %276 = sub i64 %169, %8, !dbg !40 + %277 = sub i64 %172, %8, !dbg !40 + %278 = sub i64 %175, %8, !dbg !40 + %279 = sub i64 %178, %8, !dbg !40 + %280 = sub i64 %181, %8, !dbg !40 + %281 = sub i64 %184, %8, !dbg !40 + %282 = sub i64 %187, %8, !dbg !40 + %283 = getelementptr bfloat, ptr addrspace(1) %224, i64 %275, !dbg !41 + %284 = getelementptr bfloat, ptr addrspace(1) %283, i64 %206, !dbg !41 + %285 = getelementptr bfloat, ptr addrspace(1) %227, i64 %276, !dbg !41 + %286 = getelementptr bfloat, ptr addrspace(1) %285, i64 %206, !dbg !41 + %287 = getelementptr bfloat, ptr addrspace(1) %230, i64 %277, !dbg !41 + %288 = getelementptr bfloat, ptr addrspace(1) %287, i64 %206, !dbg !41 + %289 = getelementptr bfloat, ptr addrspace(1) %233, i64 %278, !dbg !41 + %290 = getelementptr bfloat, ptr addrspace(1) %289, i64 %206, !dbg !41 + %291 = getelementptr bfloat, ptr addrspace(1) %236, i64 %279, !dbg !41 + %292 = getelementptr bfloat, ptr addrspace(1) %291, i64 %206, !dbg !41 + %293 = getelementptr bfloat, ptr addrspace(1) %239, i64 %280, !dbg !41 + %294 = getelementptr bfloat, ptr addrspace(1) %293, i64 %206, !dbg !41 + %295 = getelementptr bfloat, ptr addrspace(1) %242, i64 %281, !dbg !41 + %296 = getelementptr bfloat, ptr addrspace(1) %295, i64 %206, !dbg !41 + %297 = getelementptr bfloat, ptr addrspace(1) %245, i64 %282, !dbg !41 + %298 = getelementptr bfloat, ptr addrspace(1) %297, i64 %206, !dbg !41 + %299 = extractelement <8 x i1> %274, i64 7, !dbg !42 + %300 = and i1 %88, %299, !dbg !42 + %301 = extractelement <8 x i1> %274, i64 6, !dbg !42 + %302 = and i1 %91, %301, !dbg !42 + %303 = extractelement <8 x i1> %274, i64 5, !dbg !42 + %304 = and i1 %94, %303, !dbg !42 + %305 = extractelement <8 x i1> %274, i64 4, !dbg !42 + %306 = and i1 %97, %305, !dbg !42 + %307 = extractelement <8 x i1> %274, i64 3, !dbg !42 + %308 = and i1 %100, %307, !dbg !42 + %309 = extractelement <8 x i1> %274, i64 2, !dbg !42 + %310 = and i1 %103, %309, !dbg !42 + %311 = extractelement <8 x i1> %274, i64 1, !dbg !42 + %312 = and i1 %106, %311, !dbg !42 + %313 = extractelement <8 x i1> %274, i64 0, !dbg !42 + %314 = and i1 %109, %313, !dbg !42 + %315 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %316 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %315, i1 %300) #4, !dbg !43 + %317 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %318 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %317, i1 %302) #4, !dbg !43 + %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %320 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %319, i1 %304) #4, !dbg !43 + %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %322 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %321, i1 %306) #4, !dbg !43 + %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %324 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %323, i1 %308) #4, !dbg !43 + %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %326 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %294, i64 %325, i1 %310) #4, !dbg !43 + %327 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %328 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %296, i64 %327, i1 %312) #4, !dbg !43 + %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %298, i64 %329, i1 %314) #4, !dbg !43 + %331 = insertelement <8 x i64> poison, i64 %9, i64 0, !dbg !44 + %332 = shufflevector <8 x i64> %331, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !44 + %333 = select <8 x i1> %135, <8 x i64> %332, <8 x i64> zeroinitializer, !dbg !44 + %334 = add <8 x i64> %333, %134, !dbg !44 + %335 = icmp slt <8 x i64> %334, zeroinitializer, !dbg !45 + %336 = icmp sge <8 x i64> %334, %332, !dbg !46 + %337 = or <8 x i1> %335, %336, !dbg !47 + %338 = and <8 x i1> %87, %337, !dbg !48 + %339 = bitcast <8 x i1> %338 to i8, !dbg !49 + %.not87 = icmp eq i8 %339, 0, !dbg !49 + br i1 %.not87, label %341, label %340, !dbg !49 + +340: ; preds = %146 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 52, ptr nonnull @assertFunc_1, i64 1), !dbg !49 + unreachable, !dbg !49 + +341: ; preds = %146 + %342 = bitcast i16 %271 to bfloat, !dbg !38 + %343 = fpext bfloat %342 to float, !dbg !50 + %344 = fsub float 0.000000e+00, %343, !dbg !51 + %345 = bitcast i16 %330 to bfloat, !dbg !43 + %346 = fpext bfloat %345 to float, !dbg !52 + %347 = select i1 %215, float %344, float %346, !dbg !53 + %348 = bitcast i16 %269 to bfloat, !dbg !38 + %349 = fpext bfloat %348 to float, !dbg !50 + %350 = fsub float 0.000000e+00, %349, !dbg !51 + %351 = bitcast i16 %328 to bfloat, !dbg !43 + %352 = fpext bfloat %351 to float, !dbg !52 + %353 = select i1 %214, float %350, float %352, !dbg !53 + %354 = bitcast i16 %267 to bfloat, !dbg !38 + %355 = fpext bfloat %354 to float, !dbg !50 + %356 = fsub float 0.000000e+00, %355, !dbg !51 + %357 = bitcast i16 %326 to bfloat, !dbg !43 + %358 = fpext bfloat %357 to float, !dbg !52 + %359 = select i1 %213, float %356, float %358, !dbg !53 + %360 = bitcast i16 %265 to bfloat, !dbg !38 + %361 = fpext bfloat %360 to float, !dbg !50 + %362 = fsub float 0.000000e+00, %361, !dbg !51 + %363 = bitcast i16 %324 to bfloat, !dbg !43 + %364 = fpext bfloat %363 to float, !dbg !52 + %365 = select i1 %212, float %362, float %364, !dbg !53 + %366 = bitcast i16 %263 to bfloat, !dbg !38 + %367 = fpext bfloat %366 to float, !dbg !50 + %368 = fsub float 0.000000e+00, %367, !dbg !51 + %369 = bitcast i16 %322 to bfloat, !dbg !43 + %370 = fpext bfloat %369 to float, !dbg !52 + %371 = select i1 %211, float %368, float %370, !dbg !53 + %372 = bitcast i16 %261 to bfloat, !dbg !38 + %373 = fpext bfloat %372 to float, !dbg !50 + %374 = fsub float 0.000000e+00, %373, !dbg !51 + %375 = bitcast i16 %320 to bfloat, !dbg !43 + %376 = fpext bfloat %375 to float, !dbg !52 + %377 = select i1 %210, float %374, float %376, !dbg !53 + %378 = bitcast i16 %259 to bfloat, !dbg !38 + %379 = fpext bfloat %378 to float, !dbg !50 + %380 = fsub float 0.000000e+00, %379, !dbg !51 + %381 = bitcast i16 %318 to bfloat, !dbg !43 + %382 = fpext bfloat %381 to float, !dbg !52 + %383 = select i1 %209, float %380, float %382, !dbg !53 + %384 = bitcast i16 %257 to bfloat, !dbg !38 + %385 = fpext bfloat %384 to float, !dbg !50 + %386 = fsub float 0.000000e+00, %385, !dbg !51 + %387 = bitcast i16 %316 to bfloat, !dbg !43 + %388 = fpext bfloat %387 to float, !dbg !52 + %389 = select i1 %208, float %386, float %388, !dbg !53 + %390 = bitcast i16 %110 to bfloat, !dbg !17 + %391 = fpext bfloat %390 to float, !dbg !54 + %392 = bitcast i16 %107 to bfloat, !dbg !17 + %393 = fpext bfloat %392 to float, !dbg !54 + %394 = bitcast i16 %104 to bfloat, !dbg !17 + %395 = fpext bfloat %394 to float, !dbg !54 + %396 = bitcast i16 %101 to bfloat, !dbg !17 + %397 = fpext bfloat %396 to float, !dbg !54 + %398 = bitcast i16 %98 to bfloat, !dbg !17 + %399 = fpext bfloat %398 to float, !dbg !54 + %400 = bitcast i16 %95 to bfloat, !dbg !17 + %401 = fpext bfloat %400 to float, !dbg !54 + %402 = bitcast i16 %92 to bfloat, !dbg !17 + %403 = fpext bfloat %402 to float, !dbg !54 + %404 = bitcast i16 %89 to bfloat, !dbg !17 + %405 = fpext bfloat %404 to float, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %406 = extractelement <8 x i64> %334, i64 0, !dbg !55 + %407 = mul i64 %406, %8, !dbg !55 + %408 = extractelement <8 x i64> %334, i64 1, !dbg !55 + %409 = mul i64 %408, %8, !dbg !55 + %410 = extractelement <8 x i64> %334, i64 2, !dbg !55 + %411 = mul i64 %410, %8, !dbg !55 + %412 = extractelement <8 x i64> %334, i64 3, !dbg !55 + %413 = mul i64 %412, %8, !dbg !55 + %414 = extractelement <8 x i64> %334, i64 4, !dbg !55 + %415 = mul i64 %414, %8, !dbg !55 + %416 = extractelement <8 x i64> %334, i64 5, !dbg !55 + %417 = mul i64 %416, %8, !dbg !55 + %418 = extractelement <8 x i64> %334, i64 6, !dbg !55 + %419 = mul i64 %418, %8, !dbg !55 + %420 = extractelement <8 x i64> %334, i64 7, !dbg !55 + %421 = mul i64 %420, %8, !dbg !55 + %422 = getelementptr bfloat, ptr addrspace(1) %3, i64 %166, !dbg !56 + %423 = getelementptr bfloat, ptr addrspace(1) %422, i64 %407, !dbg !56 + %424 = getelementptr bfloat, ptr addrspace(1) %3, i64 %169, !dbg !56 + %425 = getelementptr bfloat, ptr addrspace(1) %424, i64 %409, !dbg !56 + %426 = getelementptr bfloat, ptr addrspace(1) %3, i64 %172, !dbg !56 + %427 = getelementptr bfloat, ptr addrspace(1) %426, i64 %411, !dbg !56 + %428 = getelementptr bfloat, ptr addrspace(1) %3, i64 %175, !dbg !56 + %429 = getelementptr bfloat, ptr addrspace(1) %428, i64 %413, !dbg !56 + %430 = getelementptr bfloat, ptr addrspace(1) %3, i64 %178, !dbg !56 + %431 = getelementptr bfloat, ptr addrspace(1) %430, i64 %415, !dbg !56 + %432 = getelementptr bfloat, ptr addrspace(1) %3, i64 %181, !dbg !56 + %433 = getelementptr bfloat, ptr addrspace(1) %432, i64 %417, !dbg !56 + %434 = getelementptr bfloat, ptr addrspace(1) %3, i64 %184, !dbg !56 + %435 = getelementptr bfloat, ptr addrspace(1) %434, i64 %419, !dbg !56 + %436 = getelementptr bfloat, ptr addrspace(1) %3, i64 %187, !dbg !56 + %437 = getelementptr bfloat, ptr addrspace(1) %436, i64 %421, !dbg !56 + %438 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %439 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %423, i64 %438, i1 %88) #4, !dbg !57 + %440 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %441 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %425, i64 %440, i1 %91) #4, !dbg !57 + %442 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %443 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %427, i64 %442, i1 %94) #4, !dbg !57 + %444 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %445 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %429, i64 %444, i1 %97) #4, !dbg !57 + %446 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %447 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %431, i64 %446, i1 %100) #4, !dbg !57 + %448 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %449 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %433, i64 %448, i1 %103) #4, !dbg !57 + %450 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %451 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %435, i64 %450, i1 %106) #4, !dbg !57 + %452 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %453 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %437, i64 %452, i1 %109) #4, !dbg !57 + %454 = insertelement <2 x i16> poison, i16 %191, i64 0, !dbg !32 + %455 = insertelement <2 x i16> %454, i16 %439, i64 1, !dbg !32 + %456 = bitcast <2 x i16> %455 to <2 x bfloat>, !dbg !32 + %457 = fpext <2 x bfloat> %456 to <2 x float>, !dbg !58 + %458 = insertelement <2 x float> poison, float %405, i64 0, !dbg !59 + %459 = insertelement <2 x float> %458, float %389, i64 1, !dbg !59 + %460 = fmul <2 x float> %459, %457, !dbg !59 + %461 = insertelement <2 x i16> poison, i16 %193, i64 0, !dbg !32 + %462 = insertelement <2 x i16> %461, i16 %441, i64 1, !dbg !32 + %463 = bitcast <2 x i16> %462 to <2 x bfloat>, !dbg !32 + %464 = fpext <2 x bfloat> %463 to <2 x float>, !dbg !58 + %465 = insertelement <2 x float> poison, float %403, i64 0, !dbg !59 + %466 = insertelement <2 x float> %465, float %383, i64 1, !dbg !59 + %467 = fmul <2 x float> %466, %464, !dbg !59 + %468 = insertelement <2 x i16> poison, i16 %195, i64 0, !dbg !32 + %469 = insertelement <2 x i16> %468, i16 %443, i64 1, !dbg !32 + %470 = bitcast <2 x i16> %469 to <2 x bfloat>, !dbg !32 + %471 = fpext <2 x bfloat> %470 to <2 x float>, !dbg !58 + %472 = insertelement <2 x float> poison, float %401, i64 0, !dbg !59 + %473 = insertelement <2 x float> %472, float %377, i64 1, !dbg !59 + %474 = fmul <2 x float> %473, %471, !dbg !59 + %475 = insertelement <2 x i16> poison, i16 %197, i64 0, !dbg !32 + %476 = insertelement <2 x i16> %475, i16 %445, i64 1, !dbg !32 + %477 = bitcast <2 x i16> %476 to <2 x bfloat>, !dbg !32 + %478 = fpext <2 x bfloat> %477 to <2 x float>, !dbg !58 + %479 = insertelement <2 x float> poison, float %399, i64 0, !dbg !59 + %480 = insertelement <2 x float> %479, float %371, i64 1, !dbg !59 + %481 = fmul <2 x float> %480, %478, !dbg !59 + %482 = insertelement <2 x i16> poison, i16 %199, i64 0, !dbg !32 + %483 = insertelement <2 x i16> %482, i16 %447, i64 1, !dbg !32 + %484 = bitcast <2 x i16> %483 to <2 x bfloat>, !dbg !32 + %485 = fpext <2 x bfloat> %484 to <2 x float>, !dbg !58 + %486 = insertelement <2 x float> poison, float %397, i64 0, !dbg !59 + %487 = insertelement <2 x float> %486, float %365, i64 1, !dbg !59 + %488 = fmul <2 x float> %487, %485, !dbg !59 + %489 = insertelement <2 x i16> poison, i16 %201, i64 0, !dbg !32 + %490 = insertelement <2 x i16> %489, i16 %449, i64 1, !dbg !32 + %491 = bitcast <2 x i16> %490 to <2 x bfloat>, !dbg !32 + %492 = fpext <2 x bfloat> %491 to <2 x float>, !dbg !58 + %493 = insertelement <2 x float> poison, float %395, i64 0, !dbg !59 + %494 = insertelement <2 x float> %493, float %359, i64 1, !dbg !59 + %495 = fmul <2 x float> %494, %492, !dbg !59 + %496 = insertelement <2 x i16> poison, i16 %203, i64 0, !dbg !32 + %497 = insertelement <2 x i16> %496, i16 %451, i64 1, !dbg !32 + %498 = bitcast <2 x i16> %497 to <2 x bfloat>, !dbg !32 + %499 = fpext <2 x bfloat> %498 to <2 x float>, !dbg !58 + %500 = insertelement <2 x float> poison, float %393, i64 0, !dbg !59 + %501 = insertelement <2 x float> %500, float %353, i64 1, !dbg !59 + %502 = fmul <2 x float> %501, %499, !dbg !59 + %503 = insertelement <2 x i16> poison, i16 %205, i64 0, !dbg !32 + %504 = insertelement <2 x i16> %503, i16 %453, i64 1, !dbg !32 + %505 = bitcast <2 x i16> %504 to <2 x bfloat>, !dbg !32 + %506 = fpext <2 x bfloat> %505 to <2 x float>, !dbg !58 + %507 = insertelement <2 x float> poison, float %391, i64 0, !dbg !59 + %508 = insertelement <2 x float> %507, float %347, i64 1, !dbg !59 + %509 = fmul <2 x float> %508, %506, !dbg !59 + %shift = shufflevector <2 x float> %460, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop = fadd <2 x float> %460, %shift, !dbg !60 + %510 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !60 + %shift66 = shufflevector <2 x float> %467, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop67 = fadd <2 x float> %467, %shift66, !dbg !60 + %511 = extractelement <2 x float> %foldExtExtBinop67, i64 0, !dbg !60 + %shift69 = shufflevector <2 x float> %474, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop70 = fadd <2 x float> %474, %shift69, !dbg !60 + %512 = extractelement <2 x float> %foldExtExtBinop70, i64 0, !dbg !60 + %shift72 = shufflevector <2 x float> %481, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop73 = fadd <2 x float> %481, %shift72, !dbg !60 + %513 = extractelement <2 x float> %foldExtExtBinop73, i64 0, !dbg !60 + %shift75 = shufflevector <2 x float> %488, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop76 = fadd <2 x float> %488, %shift75, !dbg !60 + %514 = extractelement <2 x float> %foldExtExtBinop76, i64 0, !dbg !60 + %shift78 = shufflevector <2 x float> %495, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop79 = fadd <2 x float> %495, %shift78, !dbg !60 + %515 = extractelement <2 x float> %foldExtExtBinop79, i64 0, !dbg !60 + %shift81 = shufflevector <2 x float> %502, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop82 = fadd <2 x float> %502, %shift81, !dbg !60 + %516 = extractelement <2 x float> %foldExtExtBinop82, i64 0, !dbg !60 + %shift84 = shufflevector <2 x float> %509, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop85 = fadd <2 x float> %509, %shift84, !dbg !60 + %517 = extractelement <2 x float> %foldExtExtBinop85, i64 0, !dbg !60 + %518 = getelementptr bfloat, ptr addrspace(1) %4, i64 %36, !dbg !61 + %519 = getelementptr bfloat, ptr addrspace(1) %4, i64 %38, !dbg !61 + %520 = getelementptr bfloat, ptr addrspace(1) %4, i64 %40, !dbg !61 + %521 = getelementptr bfloat, ptr addrspace(1) %4, i64 %42, !dbg !61 + %522 = getelementptr bfloat, ptr addrspace(1) %4, i64 %44, !dbg !61 + %523 = getelementptr bfloat, ptr addrspace(1) %4, i64 %46, !dbg !61 + %524 = getelementptr bfloat, ptr addrspace(1) %4, i64 %48, !dbg !61 + %525 = getelementptr bfloat, ptr addrspace(1) %4, i64 %50, !dbg !61 + %526 = fptrunc float %510 to bfloat, !dbg !62 + %527 = fptrunc float %511 to bfloat, !dbg !62 + %528 = fptrunc float %512 to bfloat, !dbg !62 + %529 = fptrunc float %513 to bfloat, !dbg !62 + %530 = fptrunc float %514 to bfloat, !dbg !62 + %531 = fptrunc float %515 to bfloat, !dbg !62 + %532 = fptrunc float %516 to bfloat, !dbg !62 + %533 = fptrunc float %517 to bfloat, !dbg !62 + %534 = bitcast bfloat %526 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %534, ptr addrspace(1) %518, i1 %88) #4, !dbg !62 + %535 = bitcast bfloat %527 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %535, ptr addrspace(1) %519, i1 %91) #4, !dbg !62 + %536 = bitcast bfloat %528 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %536, ptr addrspace(1) %520, i1 %94) #4, !dbg !62 + %537 = bitcast bfloat %529 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %537, ptr addrspace(1) %521, i1 %97) #4, !dbg !62 + %538 = bitcast bfloat %530 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %538, ptr addrspace(1) %522, i1 %100) #4, !dbg !62 + %539 = bitcast bfloat %531 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %539, ptr addrspace(1) %523, i1 %103) #4, !dbg !62 + %540 = bitcast bfloat %532 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %540, ptr addrspace(1) %524, i1 %106) #4, !dbg !62 + %541 = bitcast bfloat %533 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %541, ptr addrspace(1) %525, i1 %109) #4, !dbg !62 + ret void, !dbg !63 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 23, column: 21, scope: !9) +!15 = !DILocation(line: 26, column: 30, scope: !9) +!16 = !DILocation(line: 23, column: 28, scope: !9) +!17 = !DILocation(line: 26, column: 35, scope: !9) +!18 = !DILocation(line: 27, column: 30, scope: !9) +!19 = !DILocation(line: 21, column: 21, scope: !9) +!20 = !DILocation(line: 27, column: 35, scope: !9) +!21 = !DILocation(line: 30, column: 18, scope: !9) +!22 = !DILocation(line: 31, column: 32, scope: !9) +!23 = !DILocation(line: 32, column: 28, scope: !9) +!24 = !DILocation(line: 32, column: 44, scope: !9) +!25 = !DILocation(line: 32, column: 37, scope: !9) +!26 = !DILocation(line: 32, column: 52, scope: !9) +!27 = !DILocation(line: 32, column: 62, scope: !9) +!28 = !DILocation(line: 24, column: 19, scope: !9) +!29 = !DILocation(line: 33, column: 39, scope: !9) +!30 = !DILocation(line: 40, column: 35, scope: !9) +!31 = !DILocation(line: 33, column: 30, scope: !9) +!32 = !DILocation(line: 33, column: 46, scope: !9) +!33 = !DILocation(line: 38, column: 31, scope: !9) +!34 = !DILocation(line: 38, column: 18, scope: !9) +!35 = !DILocation(line: 39, column: 19, scope: !9) +!36 = !DILocation(line: 40, column: 31, scope: !9) +!37 = !DILocation(line: 40, column: 68, scope: !9) +!38 = !DILocation(line: 40, column: 60, scope: !9) +!39 = !DILocation(line: 44, column: 20, scope: !9) +!40 = !DILocation(line: 47, column: 47, scope: !9) +!41 = !DILocation(line: 47, column: 31, scope: !9) +!42 = !DILocation(line: 47, column: 81, scope: !9) +!43 = !DILocation(line: 47, column: 73, scope: !9) +!44 = !DILocation(line: 51, column: 34, scope: !9) +!45 = !DILocation(line: 52, column: 28, scope: !9) +!46 = !DILocation(line: 52, column: 46, scope: !9) +!47 = !DILocation(line: 52, column: 38, scope: !9) +!48 = !DILocation(line: 52, column: 54, scope: !9) +!49 = !DILocation(line: 52, column: 64, scope: !9) +!50 = !DILocation(line: 40, column: 119, scope: !9) +!51 = !DILocation(line: 41, column: 13, scope: !9) +!52 = !DILocation(line: 47, column: 132, scope: !9) +!53 = !DILocation(line: 0, scope: !9) +!54 = !DILocation(line: 26, column: 75, scope: !9) +!55 = !DILocation(line: 53, column: 40, scope: !9) +!56 = !DILocation(line: 53, column: 31, scope: !9) +!57 = !DILocation(line: 53, column: 48, scope: !9) +!58 = !DILocation(line: 33, column: 86, scope: !9) +!59 = !DILocation(line: 34, column: 18, scope: !9) +!60 = !DILocation(line: 55, column: 19, scope: !9) +!61 = !DILocation(line: 56, column: 25, scope: !9) +!62 = !DILocation(line: 56, column: 37, scope: !9) +!63 = !DILocation(line: 56, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..609d7096e33468c241e69804670882565f812203 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx @@ -0,0 +1,1534 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 118, 121, 47, 99, 118, 121, 111, 113, 103, 55, 106, 122, 101, 97, 100, 97, 114, 114, 103, 103, 103, 120, 104, 115, 50, 100, 106, 109, 120, 117, 103, 120, 121, 118, 106, 104, 105, 109, 55, 50, 118, 102, 115, 116, 113, 99, 52, 105, 111, 52, 113, 115, 101, 99, 106, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 50, 53, 32, 60, 32, 107, 115, 52}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 118, 121, 47, 99, 118, 121, 111, 113, 103, 55, 106, 122, 101, 97, 100, 97, 114, 114, 103, 103, 103, 120, 104, 115, 50, 100, 106, 109, 120, 117, 103, 120, 121, 118, 106, 104, 105, 109, 55, 50, 118, 102, 115, 116, 113, 99, 52, 105, 111, 52, 113, 115, 101, 99, 106, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 53, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9, + .param .u32 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_11, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_12 +) +.reqntid 128 +{ + .reg .pred %p<179>; + .reg .b16 %rs<165>; + .reg .b32 %r<160>; + .reg .b64 %rd<500>; + .loc 1 18 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:18:0 + +// %bb.0: + ld.param.b64 %rd103, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:19:28 + mov.u32 %r26, %ctaid.x; + .loc 1 19 33 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:19:33 + shl.b32 %r27, %r26, 10; + .loc 1 20 36 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:20:36 + mov.u32 %r28, %tid.x; + shl.b32 %r29, %r28, 3; + and.b32 %r30, %r29, 1016; + .loc 1 20 23 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:20:23 + or.b32 %r1, %r30, %r27; + or.b32 %r2, %r1, 1; + or.b32 %r3, %r1, 2; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd7, %r2; + cvt.s64.s32 %rd8, %r1; + or.b64 %rd108, %rd8, %rd103; + and.b64 %rd109, %rd108, -4294967296; + setp.ne.b64 %p9, %rd109, 0; + @%p9 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd484, %rd8, %rd103; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r31, %rd103; + cvt.u32.u64 %r32, %rd8; + div.u32 %r33, %r32, %r31; + cvt.u64.u32 %rd484, %r33; +$L__BB0_3: + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + or.b32 %r4, %r1, 3; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd6, %r3; + or.b64 %rd110, %rd7, %rd103; + and.b64 %rd111, %rd110, -4294967296; + setp.ne.b64 %p10, %rd111, 0; + @%p10 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd485, %rd7, %rd103; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r34, %rd103; + cvt.u32.u64 %r35, %rd7; + div.u32 %r36, %r35, %r34; + cvt.u64.u32 %rd485, %r36; +$L__BB0_6: + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + or.b32 %r5, %r1, 4; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd5, %r4; + or.b64 %rd112, %rd6, %rd103; + and.b64 %rd113, %rd112, -4294967296; + setp.ne.b64 %p11, %rd113, 0; + @%p11 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd486, %rd6, %rd103; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r37, %rd103; + cvt.u32.u64 %r38, %rd6; + div.u32 %r39, %r38, %r37; + cvt.u64.u32 %rd486, %r39; +$L__BB0_9: + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + or.b32 %r6, %r1, 5; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd4, %r5; + or.b64 %rd114, %rd5, %rd103; + and.b64 %rd115, %rd114, -4294967296; + setp.ne.b64 %p12, %rd115, 0; + @%p12 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd487, %rd5, %rd103; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r40, %rd103; + cvt.u32.u64 %r41, %rd5; + div.u32 %r42, %r41, %r40; + cvt.u64.u32 %rd487, %r42; +$L__BB0_12: + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + or.b32 %r7, %r1, 6; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd3, %r6; + or.b64 %rd116, %rd4, %rd103; + and.b64 %rd117, %rd116, -4294967296; + setp.ne.b64 %p13, %rd117, 0; + @%p13 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd488, %rd4, %rd103; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r43, %rd103; + cvt.u32.u64 %r44, %rd4; + div.u32 %r45, %r44, %r43; + cvt.u64.u32 %rd488, %r45; +$L__BB0_15: + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + or.b32 %r8, %r1, 7; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + cvt.s64.s32 %rd2, %r7; + or.b64 %rd118, %rd3, %rd103; + and.b64 %rd119, %rd118, -4294967296; + setp.ne.b64 %p14, %rd119, 0; + @%p14 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd489, %rd3, %rd103; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r46, %rd103; + cvt.u32.u64 %r47, %rd3; + div.u32 %r48, %r47, %r46; + cvt.u64.u32 %rd489, %r48; +$L__BB0_18: + cvt.s64.s32 %rd1, %r8; + or.b64 %rd120, %rd2, %rd103; + and.b64 %rd121, %rd120, -4294967296; + setp.ne.b64 %p15, %rd121, 0; + @%p15 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd490, %rd2, %rd103; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r49, %rd103; + cvt.u32.u64 %r50, %rd2; + div.u32 %r51, %r50, %r49; + cvt.u64.u32 %rd490, %r51; +$L__BB0_21: + .loc 1 0 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0:21 + ld.param.b64 %rd104, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6]; + .loc 1 23 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:21 + or.b64 %rd122, %rd1, %rd103; + and.b64 %rd123, %rd122, -4294967296; + setp.ne.b64 %p16, %rd123, 0; + @%p16 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd491, %rd1, %rd103; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r52, %rd103; + cvt.u32.u64 %r53, %rd1; + div.u32 %r54, %r53, %r52; + cvt.u64.u32 %rd491, %r54; +$L__BB0_24: + .loc 1 23 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:28 + or.b64 %rd124, %rd484, %rd104; + and.b64 %rd125, %rd124, -4294967296; + setp.ne.b64 %p17, %rd125, 0; + @%p17 bra $L__BB0_26; + bra.uni $L__BB0_25; +$L__BB0_26: + rem.s64 %rd492, %rd484, %rd104; + bra.uni $L__BB0_27; +$L__BB0_25: + cvt.u32.u64 %r55, %rd104; + cvt.u32.u64 %r56, %rd484; + rem.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd492, %r57; +$L__BB0_27: + or.b64 %rd126, %rd485, %rd104; + and.b64 %rd127, %rd126, -4294967296; + setp.ne.b64 %p18, %rd127, 0; + @%p18 bra $L__BB0_29; + bra.uni $L__BB0_28; +$L__BB0_29: + rem.s64 %rd493, %rd485, %rd104; + bra.uni $L__BB0_30; +$L__BB0_28: + cvt.u32.u64 %r58, %rd104; + cvt.u32.u64 %r59, %rd485; + rem.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd493, %r60; +$L__BB0_30: + or.b64 %rd128, %rd486, %rd104; + and.b64 %rd129, %rd128, -4294967296; + setp.ne.b64 %p19, %rd129, 0; + @%p19 bra $L__BB0_32; + bra.uni $L__BB0_31; +$L__BB0_32: + rem.s64 %rd494, %rd486, %rd104; + bra.uni $L__BB0_33; +$L__BB0_31: + cvt.u32.u64 %r61, %rd104; + cvt.u32.u64 %r62, %rd486; + rem.u32 %r63, %r62, %r61; + cvt.u64.u32 %rd494, %r63; +$L__BB0_33: + or.b64 %rd130, %rd487, %rd104; + and.b64 %rd131, %rd130, -4294967296; + setp.ne.b64 %p20, %rd131, 0; + @%p20 bra $L__BB0_35; + bra.uni $L__BB0_34; +$L__BB0_35: + rem.s64 %rd495, %rd487, %rd104; + bra.uni $L__BB0_36; +$L__BB0_34: + cvt.u32.u64 %r64, %rd104; + cvt.u32.u64 %r65, %rd487; + rem.u32 %r66, %r65, %r64; + cvt.u64.u32 %rd495, %r66; +$L__BB0_36: + or.b64 %rd132, %rd488, %rd104; + and.b64 %rd133, %rd132, -4294967296; + setp.ne.b64 %p21, %rd133, 0; + @%p21 bra $L__BB0_38; + bra.uni $L__BB0_37; +$L__BB0_38: + rem.s64 %rd496, %rd488, %rd104; + bra.uni $L__BB0_39; +$L__BB0_37: + cvt.u32.u64 %r67, %rd104; + cvt.u32.u64 %r68, %rd488; + rem.u32 %r69, %r68, %r67; + cvt.u64.u32 %rd496, %r69; +$L__BB0_39: + or.b64 %rd134, %rd489, %rd104; + and.b64 %rd135, %rd134, -4294967296; + setp.ne.b64 %p22, %rd135, 0; + @%p22 bra $L__BB0_41; + bra.uni $L__BB0_40; +$L__BB0_41: + rem.s64 %rd497, %rd489, %rd104; + bra.uni $L__BB0_42; +$L__BB0_40: + cvt.u32.u64 %r70, %rd104; + cvt.u32.u64 %r71, %rd489; + rem.u32 %r72, %r71, %r70; + cvt.u64.u32 %rd497, %r72; +$L__BB0_42: + or.b64 %rd136, %rd490, %rd104; + and.b64 %rd137, %rd136, -4294967296; + setp.ne.b64 %p23, %rd137, 0; + @%p23 bra $L__BB0_44; + bra.uni $L__BB0_43; +$L__BB0_44: + rem.s64 %rd498, %rd490, %rd104; + bra.uni $L__BB0_45; +$L__BB0_43: + cvt.u32.u64 %r73, %rd104; + cvt.u32.u64 %r74, %rd490; + rem.u32 %r75, %r74, %r73; + cvt.u64.u32 %rd498, %r75; +$L__BB0_45: + .loc 1 0 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0:28 + ld.param.b32 %r25, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10]; + ld.param.b64 %rd105, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd99, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd98, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0]; + .loc 1 23 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:23:28 + or.b64 %rd138, %rd491, %rd104; + and.b64 %rd139, %rd138, -4294967296; + setp.ne.b64 %p24, %rd139, 0; + @%p24 bra $L__BB0_47; + bra.uni $L__BB0_46; +$L__BB0_47: + rem.s64 %rd499, %rd491, %rd104; + bra.uni $L__BB0_48; +$L__BB0_46: + cvt.u32.u64 %r76, %rd104; + cvt.u32.u64 %r77, %rd491; + rem.u32 %r78, %r77, %r76; + cvt.u64.u32 %rd499, %r78; +$L__BB0_48: + .loc 1 26 30 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:26:30 + shl.b64 %rd196, %rd8, 1; + add.s64 %rd141, %rd98, %rd196; + shl.b64 %rd197, %rd7, 1; + add.s64 %rd144, %rd98, %rd197; + shl.b64 %rd198, %rd6, 1; + add.s64 %rd147, %rd98, %rd198; + shl.b64 %rd199, %rd5, 1; + add.s64 %rd150, %rd98, %rd199; + shl.b64 %rd200, %rd4, 1; + add.s64 %rd153, %rd98, %rd200; + shl.b64 %rd201, %rd3, 1; + add.s64 %rd156, %rd98, %rd201; + shl.b64 %rd202, %rd2, 1; + add.s64 %rd159, %rd98, %rd202; + shl.b64 %rd203, %rd1, 1; + add.s64 %rd162, %rd98, %rd203; + .loc 1 26 35 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:26:35 + // begin inline asm + mov.u64 %rd140, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd140, 1.0; + // end inline asm + .loc 1 27 30 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:27:30 + shl.b64 %rd204, %rd492, 3; + add.s64 %rd166, %rd99, %rd204; + shl.b64 %rd205, %rd493, 3; + add.s64 %rd170, %rd99, %rd205; + shl.b64 %rd206, %rd494, 3; + add.s64 %rd174, %rd99, %rd206; + shl.b64 %rd207, %rd495, 3; + add.s64 %rd178, %rd99, %rd207; + shl.b64 %rd208, %rd496, 3; + add.s64 %rd182, %rd99, %rd208; + shl.b64 %rd209, %rd497, 3; + add.s64 %rd186, %rd99, %rd209; + shl.b64 %rd210, %rd498, 3; + add.s64 %rd190, %rd99, %rd210; + shl.b64 %rd211, %rd499, 3; + add.s64 %rd194, %rd99, %rd211; + .loc 1 21 21 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:21:21 + setp.lt.s32 %p8, %r8, %r25; + setp.lt.s32 %p7, %r7, %r25; + setp.lt.s32 %p6, %r6, %r25; + setp.lt.s32 %p5, %r5, %r25; + setp.lt.s32 %p4, %r4, %r25; + setp.lt.s32 %p3, %r3, %r25; + setp.lt.s32 %p2, %r2, %r25; + setp.lt.s32 %p1, %r1, %r25; + .loc 1 26 35 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:26:35 + // begin inline asm + mov.u16 %rs33, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd141 + 0 ], %rd140; + // end inline asm + // begin inline asm + mov.u64 %rd143, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd143, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs34, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd144 + 0 ], %rd143; + // end inline asm + // begin inline asm + mov.u64 %rd146, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd146, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs35, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd147 + 0 ], %rd146; + // end inline asm + // begin inline asm + mov.u64 %rd149, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd149, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs36, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd150 + 0 ], %rd149; + // end inline asm + // begin inline asm + mov.u64 %rd152, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd152, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs37, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd153 + 0 ], %rd152; + // end inline asm + // begin inline asm + mov.u64 %rd155, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd155, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs38, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd156 + 0 ], %rd155; + // end inline asm + // begin inline asm + mov.u64 %rd158, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd158, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs39, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd159 + 0 ], %rd158; + // end inline asm + // begin inline asm + mov.u64 %rd161, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd161, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs40, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd162 + 0 ], %rd161; + // end inline asm + .loc 1 27 35 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:27:35 + // begin inline asm + mov.u64 %rd164, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd164, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd165, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd165 }, [ %rd166 + 0 ], %rd164; + // end inline asm + // begin inline asm + mov.u64 %rd168, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd168, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd169, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd169 }, [ %rd170 + 0 ], %rd168; + // end inline asm + // begin inline asm + mov.u64 %rd172, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd172, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd173, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd173 }, [ %rd174 + 0 ], %rd172; + // end inline asm + // begin inline asm + mov.u64 %rd176, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd176, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd177, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd177 }, [ %rd178 + 0 ], %rd176; + // end inline asm + // begin inline asm + mov.u64 %rd180, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd180, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd181, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd181 }, [ %rd182 + 0 ], %rd180; + // end inline asm + // begin inline asm + mov.u64 %rd184, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd184, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd185, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd185 }, [ %rd186 + 0 ], %rd184; + // end inline asm + // begin inline asm + mov.u64 %rd188, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd188, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd189, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd189 }, [ %rd190 + 0 ], %rd188; + // end inline asm + // begin inline asm + mov.u64 %rd192, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd192, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd193, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd193 }, [ %rd194 + 0 ], %rd192; + // end inline asm + .loc 1 31 32 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:31:32 + shr.s64 %rd212, %rd173, 63; + and.b64 %rd213, %rd212, %rd105; + shr.s64 %rd214, %rd177, 63; + and.b64 %rd215, %rd214, %rd105; + shr.s64 %rd216, %rd165, 63; + and.b64 %rd217, %rd216, %rd105; + shr.s64 %rd218, %rd169, 63; + and.b64 %rd219, %rd218, %rd105; + shr.s64 %rd220, %rd189, 63; + and.b64 %rd221, %rd220, %rd105; + shr.s64 %rd222, %rd193, 63; + and.b64 %rd223, %rd222, %rd105; + shr.s64 %rd224, %rd181, 63; + and.b64 %rd225, %rd224, %rd105; + shr.s64 %rd226, %rd185, 63; + and.b64 %rd227, %rd226, %rd105; + add.s64 %rd78, %rd227, %rd185; + add.s64 %rd77, %rd225, %rd181; + add.s64 %rd80, %rd223, %rd193; + add.s64 %rd79, %rd221, %rd189; + add.s64 %rd74, %rd219, %rd169; + add.s64 %rd73, %rd217, %rd165; + add.s64 %rd76, %rd215, %rd177; + add.s64 %rd75, %rd213, %rd173; + .loc 1 32 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:28 + setp.lt.s64 %p41, %rd75, 0; + setp.lt.s64 %p42, %rd76, 0; + setp.lt.s64 %p43, %rd73, 0; + setp.lt.s64 %p44, %rd74, 0; + setp.lt.s64 %p45, %rd79, 0; + setp.lt.s64 %p46, %rd80, 0; + setp.lt.s64 %p47, %rd77, 0; + setp.lt.s64 %p48, %rd78, 0; + .loc 1 32 44 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:44 + setp.ge.s64 %p49, %rd75, %rd105; + setp.ge.s64 %p50, %rd76, %rd105; + setp.ge.s64 %p51, %rd73, %rd105; + setp.ge.s64 %p52, %rd74, %rd105; + setp.ge.s64 %p53, %rd79, %rd105; + setp.ge.s64 %p54, %rd80, %rd105; + setp.ge.s64 %p55, %rd77, %rd105; + setp.ge.s64 %p56, %rd78, %rd105; + .loc 1 32 37 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:37 + or.pred %p57, %p48, %p56; + or.pred %p58, %p47, %p55; + or.pred %p59, %p46, %p54; + or.pred %p60, %p45, %p53; + or.pred %p61, %p44, %p52; + or.pred %p62, %p43, %p51; + or.pred %p63, %p42, %p50; + or.pred %p64, %p41, %p49; + .loc 1 32 52 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:52 + and.pred %p65, %p3, %p64; + selp.b16 %rs41, 1, 0, %p65; + shl.b16 %rs42, %rs41, 2; + and.pred %p66, %p4, %p63; + selp.b16 %rs43, -1, 0, %p66; + shl.b16 %rs44, %rs43, 3; + or.b16 %rs45, %rs44, %rs42; + and.pred %p67, %p1, %p62; + selp.b16 %rs46, 1, 0, %p67; + and.pred %p68, %p2, %p61; + selp.b16 %rs47, -1, 0, %p68; + shl.b16 %rs48, %rs47, 1; + or.b16 %rs49, %rs46, %rs48; + and.b16 %rs50, %rs49, 3; + or.b16 %rs51, %rs50, %rs45; + and.b16 %rs52, %rs51, 15; + and.pred %p69, %p7, %p60; + selp.b16 %rs53, 1, 0, %p69; + shl.b16 %rs54, %rs53, 2; + and.pred %p70, %p8, %p59; + selp.b16 %rs55, -1, 0, %p70; + shl.b16 %rs56, %rs55, 3; + or.b16 %rs57, %rs56, %rs54; + and.pred %p71, %p5, %p58; + selp.b16 %rs58, 1, 0, %p71; + and.pred %p72, %p6, %p57; + selp.b16 %rs59, -1, 0, %p72; + shl.b16 %rs60, %rs59, 1; + or.b16 %rs61, %rs58, %rs60; + and.b16 %rs62, %rs61, 3; + or.b16 %rs63, %rs62, %rs57; + shl.b16 %rs64, %rs63, 4; + or.b16 %rs65, %rs52, %rs64; + .loc 1 32 62 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:62 + and.b16 %rs66, %rs65, 255; + setp.eq.b16 %p73, %rs66, 0; + @%p73 bra $L__BB0_50; + bra.uni $L__BB0_49; +$L__BB0_50: + .loc 1 0 62 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0:62 + ld.param.b64 %rd107, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd106, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8]; + ld.param.b64 %rd100, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2]; + .loc 1 24 19 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:24:19 + rem.s64 %rd88, %rd1, %rd106; + rem.s64 %rd87, %rd2, %rd106; + rem.s64 %rd86, %rd3, %rd106; + rem.s64 %rd85, %rd4, %rd106; + rem.s64 %rd84, %rd5, %rd106; + rem.s64 %rd83, %rd6, %rd106; + rem.s64 %rd82, %rd7, %rd106; + rem.s64 %rd81, %rd8, %rd106; + .loc 1 32 62 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:62 + bar.sync 0; + .loc 1 33 39 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:39 + mul.lo.s64 %rd306, %rd73, %rd106; + mul.lo.s64 %rd307, %rd74, %rd106; + mul.lo.s64 %rd308, %rd75, %rd106; + mul.lo.s64 %rd309, %rd76, %rd106; + mul.lo.s64 %rd310, %rd77, %rd106; + mul.lo.s64 %rd311, %rd78, %rd106; + mul.lo.s64 %rd312, %rd79, %rd106; + mul.lo.s64 %rd313, %rd80, %rd106; + .loc 1 33 30 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:30 + shl.b64 %rd314, %rd81, 1; + add.s64 %rd315, %rd100, %rd314; + shl.b64 %rd316, %rd306, 1; + add.s64 %rd235, %rd315, %rd316; + shl.b64 %rd317, %rd82, 1; + add.s64 %rd318, %rd100, %rd317; + shl.b64 %rd319, %rd307, 1; + add.s64 %rd238, %rd318, %rd319; + shl.b64 %rd320, %rd83, 1; + add.s64 %rd321, %rd100, %rd320; + shl.b64 %rd322, %rd308, 1; + add.s64 %rd241, %rd321, %rd322; + shl.b64 %rd323, %rd84, 1; + add.s64 %rd324, %rd100, %rd323; + shl.b64 %rd325, %rd309, 1; + add.s64 %rd244, %rd324, %rd325; + shl.b64 %rd326, %rd85, 1; + add.s64 %rd327, %rd100, %rd326; + shl.b64 %rd328, %rd310, 1; + add.s64 %rd247, %rd327, %rd328; + shl.b64 %rd329, %rd86, 1; + add.s64 %rd330, %rd100, %rd329; + shl.b64 %rd331, %rd311, 1; + add.s64 %rd250, %rd330, %rd331; + shl.b64 %rd332, %rd87, 1; + add.s64 %rd333, %rd100, %rd332; + shl.b64 %rd334, %rd312, 1; + add.s64 %rd253, %rd333, %rd334; + shl.b64 %rd335, %rd88, 1; + add.s64 %rd336, %rd100, %rd335; + shl.b64 %rd337, %rd313, 1; + add.s64 %rd256, %rd336, %rd337; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + // begin inline asm + mov.u64 %rd234, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd234, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs67, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs67 }, [ %rd235 + 0 ], %rd234; + // end inline asm + // begin inline asm + mov.u64 %rd237, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd237, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs68, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs68 }, [ %rd238 + 0 ], %rd237; + // end inline asm + // begin inline asm + mov.u64 %rd240, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd240, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs69, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs69 }, [ %rd241 + 0 ], %rd240; + // end inline asm + // begin inline asm + mov.u64 %rd243, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd243, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs70, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs70 }, [ %rd244 + 0 ], %rd243; + // end inline asm + // begin inline asm + mov.u64 %rd246, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd246, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs71, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs71 }, [ %rd247 + 0 ], %rd246; + // end inline asm + // begin inline asm + mov.u64 %rd249, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd249, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs72, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs72 }, [ %rd250 + 0 ], %rd249; + // end inline asm + // begin inline asm + mov.u64 %rd252, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd252, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs73, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs73 }, [ %rd253 + 0 ], %rd252; + // end inline asm + // begin inline asm + mov.u64 %rd255, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd255, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs74, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs74 }, [ %rd256 + 0 ], %rd255; + // end inline asm + .loc 1 38 31 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:38:31 + shr.u64 %rd338, %rd106, 63; + add.s64 %rd339, %rd106, %rd338; + shr.s64 %rd340, %rd339, 1; + .loc 1 38 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:38:18 + sub.s64 %rd89, %rd106, %rd340; + .loc 1 39 19 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:39:19 + setp.lt.s64 %p106, %rd81, %rd89; + setp.lt.s64 %p107, %rd82, %rd89; + setp.lt.s64 %p108, %rd83, %rd89; + setp.lt.s64 %p109, %rd84, %rd89; + setp.lt.s64 %p110, %rd85, %rd89; + setp.lt.s64 %p111, %rd86, %rd89; + setp.lt.s64 %p112, %rd87, %rd89; + setp.lt.s64 %p113, %rd88, %rd89; + .loc 1 40 35 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:35 + sub.s64 %rd341, %rd8, %rd81; + sub.s64 %rd342, %rd7, %rd82; + sub.s64 %rd343, %rd6, %rd83; + sub.s64 %rd344, %rd5, %rd84; + sub.s64 %rd345, %rd4, %rd85; + sub.s64 %rd346, %rd3, %rd86; + sub.s64 %rd347, %rd2, %rd87; + sub.s64 %rd348, %rd1, %rd88; + .loc 1 40 31 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:31 + shl.b64 %rd349, %rd341, 1; + add.s64 %rd350, %rd98, %rd349; + and.b64 %rd351, %rd339, -2; + add.s64 %rd352, %rd350, %rd351; + add.s64 %rd259, %rd352, %rd314; + shl.b64 %rd353, %rd342, 1; + add.s64 %rd354, %rd98, %rd353; + add.s64 %rd355, %rd354, %rd351; + add.s64 %rd262, %rd355, %rd317; + shl.b64 %rd356, %rd343, 1; + add.s64 %rd357, %rd98, %rd356; + add.s64 %rd358, %rd357, %rd351; + add.s64 %rd265, %rd358, %rd320; + shl.b64 %rd359, %rd344, 1; + add.s64 %rd360, %rd98, %rd359; + add.s64 %rd361, %rd360, %rd351; + add.s64 %rd268, %rd361, %rd323; + shl.b64 %rd362, %rd345, 1; + add.s64 %rd363, %rd98, %rd362; + add.s64 %rd364, %rd363, %rd351; + add.s64 %rd271, %rd364, %rd326; + shl.b64 %rd365, %rd346, 1; + add.s64 %rd366, %rd98, %rd365; + add.s64 %rd367, %rd366, %rd351; + add.s64 %rd274, %rd367, %rd329; + shl.b64 %rd368, %rd347, 1; + add.s64 %rd369, %rd98, %rd368; + add.s64 %rd370, %rd369, %rd351; + add.s64 %rd277, %rd370, %rd332; + shl.b64 %rd371, %rd348, 1; + add.s64 %rd372, %rd98, %rd371; + add.s64 %rd373, %rd372, %rd351; + add.s64 %rd280, %rd373, %rd335; + .loc 1 40 68 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:68 + and.pred %p82, %p1, %p106; + and.pred %p83, %p2, %p107; + and.pred %p84, %p3, %p108; + and.pred %p85, %p4, %p109; + and.pred %p86, %p5, %p110; + and.pred %p87, %p6, %p111; + and.pred %p88, %p7, %p112; + and.pred %p89, %p8, %p113; + .loc 1 40 60 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:60 + // begin inline asm + mov.u64 %rd258, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd258, 1.0; + // end inline asm + mov.b16 %rs76, 0; + // begin inline asm + mov.u16 %rs75, %rs76; + @%p82 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs75 }, [ %rd259 + 0 ], %rd258; + // end inline asm + // begin inline asm + mov.u64 %rd261, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd261, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs77, %rs76; + @%p83 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs77 }, [ %rd262 + 0 ], %rd261; + // end inline asm + // begin inline asm + mov.u64 %rd264, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd264, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs79, %rs76; + @%p84 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs79 }, [ %rd265 + 0 ], %rd264; + // end inline asm + // begin inline asm + mov.u64 %rd267, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd267, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs81, %rs76; + @%p85 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs81 }, [ %rd268 + 0 ], %rd267; + // end inline asm + // begin inline asm + mov.u64 %rd270, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd270, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs83, %rs76; + @%p86 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs83 }, [ %rd271 + 0 ], %rd270; + // end inline asm + // begin inline asm + mov.u64 %rd273, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd273, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs85, %rs76; + @%p87 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs85 }, [ %rd274 + 0 ], %rd273; + // end inline asm + // begin inline asm + mov.u64 %rd276, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd276, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs87, %rs76; + @%p88 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs87 }, [ %rd277 + 0 ], %rd276; + // end inline asm + // begin inline asm + mov.u64 %rd279, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd279, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs89, %rs76; + @%p89 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs89 }, [ %rd280 + 0 ], %rd279; + // end inline asm + .loc 1 44 20 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:44:20 + setp.ge.s64 %p114, %rd88, %rd89; + setp.ge.s64 %p115, %rd87, %rd89; + setp.ge.s64 %p116, %rd86, %rd89; + setp.ge.s64 %p117, %rd85, %rd89; + setp.ge.s64 %p118, %rd84, %rd89; + setp.ge.s64 %p119, %rd83, %rd89; + setp.ge.s64 %p120, %rd82, %rd89; + setp.ge.s64 %p121, %rd81, %rd89; + .loc 1 47 47 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:47 + sub.s64 %rd374, %rd81, %rd106; + sub.s64 %rd375, %rd82, %rd106; + sub.s64 %rd376, %rd83, %rd106; + sub.s64 %rd377, %rd84, %rd106; + sub.s64 %rd378, %rd85, %rd106; + sub.s64 %rd379, %rd86, %rd106; + sub.s64 %rd380, %rd87, %rd106; + sub.s64 %rd381, %rd88, %rd106; + .loc 1 47 31 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:31 + shl.b64 %rd382, %rd374, 1; + add.s64 %rd283, %rd352, %rd382; + shl.b64 %rd383, %rd375, 1; + add.s64 %rd286, %rd355, %rd383; + shl.b64 %rd384, %rd376, 1; + add.s64 %rd289, %rd358, %rd384; + shl.b64 %rd385, %rd377, 1; + add.s64 %rd292, %rd361, %rd385; + shl.b64 %rd386, %rd378, 1; + add.s64 %rd295, %rd364, %rd386; + shl.b64 %rd387, %rd379, 1; + add.s64 %rd298, %rd367, %rd387; + shl.b64 %rd388, %rd380, 1; + add.s64 %rd301, %rd370, %rd388; + shl.b64 %rd389, %rd381, 1; + add.s64 %rd304, %rd373, %rd389; + .loc 1 47 81 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:81 + and.pred %p90, %p1, %p121; + and.pred %p91, %p2, %p120; + and.pred %p92, %p3, %p119; + and.pred %p93, %p4, %p118; + and.pred %p94, %p5, %p117; + and.pred %p95, %p6, %p116; + and.pred %p96, %p7, %p115; + and.pred %p97, %p8, %p114; + .loc 1 47 73 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:73 + // begin inline asm + mov.u64 %rd282, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd282, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs91, %rs76; + @%p90 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd283 + 0 ], %rd282; + // end inline asm + // begin inline asm + mov.u64 %rd285, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd285, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs93, %rs76; + @%p91 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd286 + 0 ], %rd285; + // end inline asm + // begin inline asm + mov.u64 %rd288, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd288, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs95, %rs76; + @%p92 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd289 + 0 ], %rd288; + // end inline asm + // begin inline asm + mov.u64 %rd291, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd291, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs97, %rs76; + @%p93 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd292 + 0 ], %rd291; + // end inline asm + // begin inline asm + mov.u64 %rd294, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd294, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs99, %rs76; + @%p94 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs99 }, [ %rd295 + 0 ], %rd294; + // end inline asm + // begin inline asm + mov.u64 %rd297, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd297, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs101, %rs76; + @%p95 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs101 }, [ %rd298 + 0 ], %rd297; + // end inline asm + // begin inline asm + mov.u64 %rd300, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd300, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs103, %rs76; + @%p96 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs103 }, [ %rd301 + 0 ], %rd300; + // end inline asm + // begin inline asm + mov.u64 %rd303, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd303, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs105, %rs76; + @%p97 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs105 }, [ %rd304 + 0 ], %rd303; + // end inline asm + .loc 1 51 34 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:51:34 + and.b64 %rd391, %rd212, %rd107; + and.b64 %rd393, %rd214, %rd107; + and.b64 %rd395, %rd216, %rd107; + and.b64 %rd397, %rd218, %rd107; + and.b64 %rd399, %rd220, %rd107; + and.b64 %rd401, %rd222, %rd107; + and.b64 %rd403, %rd224, %rd107; + and.b64 %rd405, %rd226, %rd107; + add.s64 %rd95, %rd405, %rd185; + add.s64 %rd94, %rd403, %rd181; + add.s64 %rd97, %rd401, %rd193; + add.s64 %rd96, %rd399, %rd189; + add.s64 %rd91, %rd397, %rd169; + add.s64 %rd90, %rd395, %rd165; + add.s64 %rd93, %rd393, %rd177; + add.s64 %rd92, %rd391, %rd173; + .loc 1 52 28 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:28 + setp.lt.s64 %p122, %rd92, 0; + setp.lt.s64 %p123, %rd93, 0; + setp.lt.s64 %p124, %rd90, 0; + setp.lt.s64 %p125, %rd91, 0; + setp.lt.s64 %p126, %rd96, 0; + setp.lt.s64 %p127, %rd97, 0; + setp.lt.s64 %p128, %rd94, 0; + setp.lt.s64 %p129, %rd95, 0; + .loc 1 52 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:46 + setp.ge.s64 %p130, %rd92, %rd107; + setp.ge.s64 %p131, %rd93, %rd107; + setp.ge.s64 %p132, %rd90, %rd107; + setp.ge.s64 %p133, %rd91, %rd107; + setp.ge.s64 %p134, %rd96, %rd107; + setp.ge.s64 %p135, %rd97, %rd107; + setp.ge.s64 %p136, %rd94, %rd107; + setp.ge.s64 %p137, %rd95, %rd107; + .loc 1 52 38 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:38 + or.pred %p138, %p129, %p137; + or.pred %p139, %p128, %p136; + or.pred %p140, %p127, %p135; + or.pred %p141, %p126, %p134; + or.pred %p142, %p125, %p133; + or.pred %p143, %p124, %p132; + or.pred %p144, %p123, %p131; + or.pred %p145, %p122, %p130; + .loc 1 52 54 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:54 + and.pred %p146, %p3, %p145; + selp.b16 %rs107, 1, 0, %p146; + shl.b16 %rs108, %rs107, 2; + and.pred %p147, %p4, %p144; + selp.b16 %rs109, -1, 0, %p147; + shl.b16 %rs110, %rs109, 3; + or.b16 %rs111, %rs110, %rs108; + and.pred %p148, %p1, %p143; + selp.b16 %rs112, 1, 0, %p148; + and.pred %p149, %p2, %p142; + selp.b16 %rs113, -1, 0, %p149; + shl.b16 %rs114, %rs113, 1; + or.b16 %rs115, %rs112, %rs114; + and.b16 %rs116, %rs115, 3; + or.b16 %rs117, %rs116, %rs111; + and.b16 %rs118, %rs117, 15; + and.pred %p150, %p7, %p141; + selp.b16 %rs119, 1, 0, %p150; + shl.b16 %rs120, %rs119, 2; + and.pred %p151, %p8, %p140; + selp.b16 %rs121, -1, 0, %p151; + shl.b16 %rs122, %rs121, 3; + or.b16 %rs123, %rs122, %rs120; + and.pred %p152, %p5, %p139; + selp.b16 %rs124, 1, 0, %p152; + and.pred %p153, %p6, %p138; + selp.b16 %rs125, -1, 0, %p153; + shl.b16 %rs126, %rs125, 1; + or.b16 %rs127, %rs124, %rs126; + and.b16 %rs128, %rs127, 3; + or.b16 %rs129, %rs128, %rs123; + shl.b16 %rs130, %rs129, 4; + or.b16 %rs131, %rs118, %rs130; + .loc 1 52 64 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:64 + and.b16 %rs132, %rs131, 255; + setp.eq.b16 %p154, %rs132, 0; + @%p154 bra $L__BB0_52; + bra.uni $L__BB0_51; +$L__BB0_52: + .loc 1 0 64 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0:64 + ld.param.b64 %rd102, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd101, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3]; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r79, %rs89; + mov.b32 %r80, 0f00000000; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r81, %r80, %r79; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r82, %rs105; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r83, %r81, %r82, %p113; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r84, %rs87; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r85, %r80, %r84; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r86, %rs103; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r87, %r85, %r86, %p112; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r88, %rs85; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r89, %r80, %r88; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r90, %rs101; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r91, %r89, %r90, %p111; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r92, %rs83; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r93, %r80, %r92; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r94, %rs99; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r95, %r93, %r94, %p110; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r96, %rs81; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r97, %r80, %r96; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r98, %rs97; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r99, %r97, %r98, %p109; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r100, %rs79; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r101, %r80, %r100; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r102, %rs95; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r103, %r101, %r102, %p108; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r104, %rs77; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r105, %r80, %r104; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r106, %rs93; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r107, %r105, %r106, %p107; + .loc 1 40 119 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:40:119 + cvt.f32.bf16 %r108, %rs75; + .loc 1 41 13 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:41:13 + sub.f32 %r109, %r80, %r108; + .loc 1 47 132 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:47:132 + cvt.f32.bf16 %r110, %rs91; + .loc 1 0 0 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:0 + selp.f32 %r111, %r109, %r110, %p106; + .loc 1 26 75 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:26:75 + cvt.f32.bf16 %r112, %rs40; + cvt.f32.bf16 %r113, %rs39; + cvt.f32.bf16 %r114, %rs38; + cvt.f32.bf16 %r115, %rs37; + cvt.f32.bf16 %r116, %rs36; + cvt.f32.bf16 %r117, %rs35; + cvt.f32.bf16 %r118, %rs34; + cvt.f32.bf16 %r119, %rs33; + .loc 1 52 64 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:64 + bar.sync 0; + .loc 1 53 40 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:53:40 + mul.lo.s64 %rd444, %rd90, %rd106; + mul.lo.s64 %rd445, %rd91, %rd106; + mul.lo.s64 %rd446, %rd92, %rd106; + mul.lo.s64 %rd447, %rd93, %rd106; + mul.lo.s64 %rd448, %rd94, %rd106; + mul.lo.s64 %rd449, %rd95, %rd106; + mul.lo.s64 %rd450, %rd96, %rd106; + mul.lo.s64 %rd451, %rd97, %rd106; + .loc 1 53 31 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:53:31 + add.s64 %rd453, %rd101, %rd314; + shl.b64 %rd454, %rd444, 1; + add.s64 %rd413, %rd453, %rd454; + add.s64 %rd456, %rd101, %rd317; + shl.b64 %rd457, %rd445, 1; + add.s64 %rd416, %rd456, %rd457; + add.s64 %rd459, %rd101, %rd320; + shl.b64 %rd460, %rd446, 1; + add.s64 %rd419, %rd459, %rd460; + add.s64 %rd462, %rd101, %rd323; + shl.b64 %rd463, %rd447, 1; + add.s64 %rd422, %rd462, %rd463; + add.s64 %rd465, %rd101, %rd326; + shl.b64 %rd466, %rd448, 1; + add.s64 %rd425, %rd465, %rd466; + add.s64 %rd468, %rd101, %rd329; + shl.b64 %rd469, %rd449, 1; + add.s64 %rd428, %rd468, %rd469; + add.s64 %rd471, %rd101, %rd332; + shl.b64 %rd472, %rd450, 1; + add.s64 %rd431, %rd471, %rd472; + add.s64 %rd474, %rd101, %rd335; + shl.b64 %rd475, %rd451, 1; + add.s64 %rd434, %rd474, %rd475; + .loc 1 53 48 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:53:48 + // begin inline asm + mov.u64 %rd414, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd414, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs133, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs133 }, [ %rd413 + 0 ], %rd414; + // end inline asm + // begin inline asm + mov.u64 %rd417, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd417, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs134, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs134 }, [ %rd416 + 0 ], %rd417; + // end inline asm + // begin inline asm + mov.u64 %rd420, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd420, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs135, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs135 }, [ %rd419 + 0 ], %rd420; + // end inline asm + // begin inline asm + mov.u64 %rd423, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd423, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs136, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs136 }, [ %rd422 + 0 ], %rd423; + // end inline asm + // begin inline asm + mov.u64 %rd426, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd426, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs137, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs137 }, [ %rd425 + 0 ], %rd426; + // end inline asm + // begin inline asm + mov.u64 %rd429, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd429, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs138, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs138 }, [ %rd428 + 0 ], %rd429; + // end inline asm + // begin inline asm + mov.u64 %rd432, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd432, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs139, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs139 }, [ %rd431 + 0 ], %rd432; + // end inline asm + // begin inline asm + mov.u64 %rd435, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd435, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs140, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs140 }, [ %rd434 + 0 ], %rd435; + // end inline asm + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r120, {%rs67, %rs133}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs149, %rs150}, %r120; + cvt.f32.bf16 %r121, %rs149; + cvt.f32.bf16 %r122, %rs150; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r123, %r111, %r122; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r124, {%rs68, %rs134}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs151, %rs152}, %r124; + cvt.f32.bf16 %r125, %rs151; + cvt.f32.bf16 %r126, %rs152; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r127, %r107, %r126; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r128, {%rs69, %rs135}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs153, %rs154}, %r128; + cvt.f32.bf16 %r129, %rs153; + cvt.f32.bf16 %r130, %rs154; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r131, %r103, %r130; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r132, {%rs70, %rs136}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs155, %rs156}, %r132; + cvt.f32.bf16 %r133, %rs155; + cvt.f32.bf16 %r134, %rs156; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r135, %r99, %r134; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r136, {%rs71, %rs137}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs157, %rs158}, %r136; + cvt.f32.bf16 %r137, %rs157; + cvt.f32.bf16 %r138, %rs158; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r139, %r95, %r138; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r140, {%rs72, %rs138}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs159, %rs160}, %r140; + cvt.f32.bf16 %r141, %rs159; + cvt.f32.bf16 %r142, %rs160; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r143, %r91, %r142; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r144, {%rs73, %rs139}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs161, %rs162}, %r144; + cvt.f32.bf16 %r145, %rs161; + cvt.f32.bf16 %r146, %rs162; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r147, %r87, %r146; + .loc 1 33 46 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:46 + mov.b32 %r148, {%rs74, %rs140}; + .loc 1 33 86 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:33:86 + mov.b32 {%rs163, %rs164}, %r148; + cvt.f32.bf16 %r149, %rs163; + cvt.f32.bf16 %r150, %rs164; + .loc 1 34 18 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:34:18 + mul.f32 %r151, %r83, %r150; + .loc 1 55 19 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:55:19 + fma.rn.f32 %r152, %r119, %r121, %r123; + fma.rn.f32 %r153, %r118, %r125, %r127; + fma.rn.f32 %r154, %r117, %r129, %r131; + fma.rn.f32 %r155, %r116, %r133, %r135; + fma.rn.f32 %r156, %r115, %r137, %r139; + fma.rn.f32 %r157, %r114, %r141, %r143; + fma.rn.f32 %r158, %r113, %r145, %r147; + fma.rn.f32 %r159, %r112, %r149, %r151; + .loc 1 56 25 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:56:25 + add.s64 %rd436, %rd102, %rd196; + add.s64 %rd437, %rd102, %rd197; + add.s64 %rd438, %rd102, %rd198; + add.s64 %rd439, %rd102, %rd199; + add.s64 %rd440, %rd102, %rd200; + add.s64 %rd441, %rd102, %rd201; + add.s64 %rd442, %rd102, %rd202; + add.s64 %rd443, %rd102, %rd203; + .loc 1 56 37 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:56:37 + cvt.rn.bf16.f32 %rs141, %r152; + cvt.rn.bf16.f32 %rs142, %r153; + cvt.rn.bf16.f32 %rs143, %r154; + cvt.rn.bf16.f32 %rs144, %r155; + cvt.rn.bf16.f32 %rs145, %r156; + cvt.rn.bf16.f32 %rs146, %r157; + cvt.rn.bf16.f32 %rs147, %r158; + cvt.rn.bf16.f32 %rs148, %r159; + // begin inline asm + @%p1 st.global.b16 [ %rd436 + 0 ], { %rs141 }; + // end inline asm + // begin inline asm + @%p2 st.global.b16 [ %rd437 + 0 ], { %rs142 }; + // end inline asm + // begin inline asm + @%p3 st.global.b16 [ %rd438 + 0 ], { %rs143 }; + // end inline asm + // begin inline asm + @%p4 st.global.b16 [ %rd439 + 0 ], { %rs144 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd440 + 0 ], { %rs145 }; + // end inline asm + // begin inline asm + @%p6 st.global.b16 [ %rd441 + 0 ], { %rs146 }; + // end inline asm + // begin inline asm + @%p7 st.global.b16 [ %rd442 + 0 ], { %rs147 }; + // end inline asm + // begin inline asm + @%p8 st.global.b16 [ %rd443 + 0 ], { %rs148 }; + // end inline asm + .loc 1 56 4 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:56:4 + ret; +$L__BB0_49: + .loc 1 32 62 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:32:62 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd228, assertFunc_0; + cvta.global.u64 %rd229, %rd228; + st.param.b64 [param3], %rd229; + mov.b64 %rd230, assertFile_0; + cvta.global.u64 %rd231, %rd230; + st.param.b64 [param1], %rd231; + mov.b64 %rd232, assertMessage_0; + cvta.global.u64 %rd233, %rd232; + st.param.b64 [param0], %rd233; + st.param.b64 [param4], 1; + st.param.b32 [param2], 32; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_51: + .loc 1 52 64 // cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py:52:64 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd406, assertFunc_1; + cvta.global.u64 %rd407, %rd406; + st.param.b64 [param3], %rd407; + mov.b64 %rd408, assertFile_1; + cvta.global.u64 %rd409, %rd408; + st.param.b64 [param1], %rd409; + mov.b64 %rd410, assertMessage_1; + cvta.global.u64 %rd411, %rd410; + st.param.b64 [param0], %rd411; + st.param.b64 [param4], 1; + st.param.b32 [param2], 52; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 118 +.b8 121 +.b8 111 +.b8 113 +.b8 103 +.b8 55 +.b8 106 +.b8 122 +.b8 101 +.b8 97 +.b8 100 +.b8 97 +.b8 114 +.b8 114 +.b8 103 +.b8 103 +.b8 103 +.b8 120 +.b8 104 +.b8 115 +.b8 50 +.b8 100 +.b8 106 +.b8 109 +.b8 120 +.b8 117 +.b8 103 +.b8 120 +.b8 121 +.b8 118 +.b8 106 +.b8 104 +.b8 105 +.b8 109 +.b8 55 +.b8 50 +.b8 118 +.b8 102 +.b8 115 +.b8 116 +.b8 113 +.b8 99 +.b8 52 +.b8 105 +.b8 111 +.b8 52 +.b8 113 +.b8 115 +.b8 101 +.b8 99 +.b8 106 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 118 +.b8 121 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..ad5af25ad47d3bfe1759cf28be7fdc8085eb58bc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source @@ -0,0 +1,299 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":18:0) +#loc78 = loc("in_ptr0"(#loc)) +#loc79 = loc("in_ptr1"(#loc)) +#loc80 = loc("in_ptr2"(#loc)) +#loc81 = loc("in_ptr3"(#loc)) +#loc82 = loc("out_ptr0"(#loc)) +#loc83 = loc("ks0"(#loc)) +#loc84 = loc("ks1"(#loc)) +#loc85 = loc("ks2"(#loc)) +#loc86 = loc("ks3"(#loc)) +#loc87 = loc("ks4"(#loc)) +#loc88 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_0 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc90) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc91) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc92) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc92) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc93) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<1024xi32> loc(#loc93) + %x2 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc94) + %x2_6 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc94) + %x2_7 = arith.divsi %x2, %x2_6 : tensor<1024xi64> loc(#loc94) + %x2_8 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc95) + %x2_9 = arith.remsi %x2_7, %x2_8 : tensor<1024xi64> loc(#loc95) + %x0 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc96) + %x0_10 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc96) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<1024xi64> loc(#loc96) + %x5 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc97) + %x5_12 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc97) + %x5_13 = arith.divsi %x5, %x5_12 : tensor<1024xi64> loc(#loc97) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc98) + %tmp0_14 = tt.addptr %tmp0, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc98) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc99) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc100) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc101) + %tmp1_17 = tt.addptr %tmp1, %x2_9 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc101) + %tmp1_18 = tt.load %tmp1_17, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc102) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc103) + %tmp3_19 = arith.addi %tmp1_18, %tmp3 : tensor<1024xi64> loc(#loc103) + %tmp4 = arith.constant 0 : i32 loc(#loc104) + %tmp4_20 = arith.extsi %tmp4 : i32 to i64 loc(#loc104) + %tmp4_21 = tt.splat %tmp4_20 : i64 -> tensor<1024xi64> loc(#loc104) + %tmp4_22 = arith.cmpi slt, %tmp1_18, %tmp4_21 : tensor<1024xi64> loc(#loc104) + %tmp5 = arith.select %tmp4_22, %tmp3_19, %tmp1_18 : tensor<1024xi1>, tensor<1024xi64> loc(#loc105) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc18) + %1 = tt.splat %0 : i64 -> tensor<1024xi64> loc(#loc18) + %2 = arith.cmpi sle, %1, %tmp5 : tensor<1024xi64> loc(#loc18) + %3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc19) + %4 = arith.cmpi slt, %tmp5, %3 : tensor<1024xi64> loc(#loc19) + %5 = arith.andi %2, %4 : tensor<1024xi1> loc(#loc20) + %true = arith.constant true loc(#loc21) + %cst = arith.constant dense : tensor<1024xi1> loc(#loc21) + %6 = arith.xori %xmask_5, %cst : tensor<1024xi1> loc(#loc21) + %7 = arith.ori %5, %6 : tensor<1024xi1> loc(#loc22) + tt.assert %7, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1> loc(#loc23) + %tmp7 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc106) + %tmp7_23 = arith.muli %tmp7, %tmp5 : tensor<1024xi64> loc(#loc106) + %tmp7_24 = arith.addi %x0_11, %tmp7_23 : tensor<1024xi64> loc(#loc107) + %tmp7_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc108) + %tmp7_26 = tt.addptr %tmp7_25, %tmp7_24 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc108) + %tmp7_27 = tt.load %tmp7_26, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc109) + %tmp7_28 = arith.extf %tmp7_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc110) + %tmp8 = arith.mulf %tmp0_16, %tmp7_28 : tensor<1024xf32> loc(#loc111) + %tmp10 = arith.constant 0 : i64 loc(#loc112) + %tmp10_29 = arith.constant dense<0> : tensor<1xi64> loc(#loc112) + %tmp11 = arith.constant dense<0> : tensor<1024xi64> loc(#loc113) + %tmp11_30 = arith.cmpi sge, %x0_11, %tmp11 : tensor<1024xi64> loc(#loc113) + %tmp12 = arith.constant 2 : i32 loc(#loc114) + %tmp12_31 = arith.constant 2 : i64 loc(#loc114) + %tmp12_32 = arith.divsi %ks3, %tmp12_31 : i64 loc(#loc114) + %tmp12_33 = arith.constant -1 : i32 loc(#loc115) + %tmp12_34 = arith.constant -1 : i64 loc(#loc115) + %tmp12_35 = arith.muli %tmp12_34, %tmp12_32 : i64 loc(#loc115) + %tmp12_36 = arith.addi %ks3, %tmp12_35 : i64 loc(#loc116) + %tmp13 = tt.splat %tmp12_36 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp13_37 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc117) + %tmp14 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc118) + %tmp14_38 = arith.muli %tmp14, %x5_13 : tensor<1024xi64> loc(#loc118) + %tmp14_39 = arith.constant 2 : i32 loc(#loc119) + %tmp14_40 = arith.constant 2 : i64 loc(#loc119) + %tmp14_41 = arith.divsi %ks3, %tmp14_40 : i64 loc(#loc119) + %tmp14_42 = tt.splat %tmp14_41 : i64 -> tensor<1024xi64> loc(#loc120) + %tmp14_43 = arith.addi %tmp14_38, %tmp14_42 : tensor<1024xi64> loc(#loc120) + %tmp14_44 = arith.addi %tmp14_43, %x0_11 : tensor<1024xi64> loc(#loc121) + %tmp14_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc122) + %tmp14_46 = tt.addptr %tmp14_45, %tmp14_44 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc122) + %tmp14_47 = arith.andi %tmp13_37, %xmask_5 : tensor<1024xi1> loc(#loc123) + %tmp14_48 = arith.constant 0.000000e+00 : f32 loc(#loc124) + %tmp14_49 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc124) + %tmp14_50 = arith.truncf %tmp14_49 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc124) + %tmp14_51 = tt.load %tmp14_46, %tmp14_47, %tmp14_50 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc124) + %tmp14_52 = arith.extf %tmp14_51 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc125) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp15_53 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc126) + %tmp15_54 = arith.subf %tmp15_53, %tmp14_52 : tensor<1024xf32> loc(#loc126) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc127) + %tmp16_55 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc127) + %tmp17 = arith.select %tmp13_37, %tmp15_54, %tmp16_55 : tensor<1024xi1>, tensor<1024xf32> loc(#loc128) + %tmp18 = tt.splat %tmp12_36 : i64 -> tensor<1024xi64> loc(#loc129) + %tmp18_56 = arith.cmpi sge, %x0_11, %tmp18 : tensor<1024xi64> loc(#loc129) + %tmp20 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc130) + %tmp20_57 = arith.cmpi slt, %x0_11, %tmp20 : tensor<1024xi64> loc(#loc130) + %tmp21 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc131) + %tmp21_58 = arith.muli %tmp21, %x5_13 : tensor<1024xi64> loc(#loc131) + %tmp21_59 = arith.constant -1 : i32 loc(#loc132) + %tmp21_60 = arith.constant -1 : i64 loc(#loc132) + %tmp21_61 = arith.muli %tmp21_60, %ks3 : i64 loc(#loc132) + %tmp21_62 = tt.splat %tmp21_61 : i64 -> tensor<1024xi64> loc(#loc133) + %tmp21_63 = arith.addi %x0_11, %tmp21_62 : tensor<1024xi64> loc(#loc133) + %tmp21_64 = arith.constant 2 : i32 loc(#loc134) + %tmp21_65 = arith.constant 2 : i64 loc(#loc134) + %tmp21_66 = arith.divsi %ks3, %tmp21_65 : i64 loc(#loc134) + %tmp21_67 = tt.splat %tmp21_66 : i64 -> tensor<1024xi64> loc(#loc135) + %tmp21_68 = arith.addi %tmp21_63, %tmp21_67 : tensor<1024xi64> loc(#loc135) + %tmp21_69 = arith.addi %tmp21_58, %tmp21_68 : tensor<1024xi64> loc(#loc136) + %tmp21_70 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc137) + %tmp21_71 = tt.addptr %tmp21_70, %tmp21_69 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc137) + %tmp21_72 = arith.andi %tmp18_56, %xmask_5 : tensor<1024xi1> loc(#loc138) + %tmp21_73 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp21_74 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc139) + %tmp21_75 = arith.truncf %tmp21_74 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc139) + %tmp21_76 = tt.load %tmp21_71, %tmp21_72, %tmp21_75 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc139) + %tmp21_77 = arith.extf %tmp21_76 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc140) + %tmp22 = arith.select %tmp13_37, %tmp17, %tmp21_77 : tensor<1024xi1>, tensor<1024xf32> loc(#loc141) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc142) + %tmp24_78 = arith.addi %tmp1_18, %tmp24 : tensor<1024xi64> loc(#loc142) + %tmp25 = arith.select %tmp4_22, %tmp24_78, %tmp1_18 : tensor<1024xi1>, tensor<1024xi64> loc(#loc143) + %c0_i32_79 = arith.constant 0 : i32 loc(#loc62) + %8 = arith.extsi %c0_i32_79 : i32 to i64 loc(#loc62) + %9 = tt.splat %8 : i64 -> tensor<1024xi64> loc(#loc62) + %10 = arith.cmpi sle, %9, %tmp25 : tensor<1024xi64> loc(#loc62) + %11 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc63) + %12 = arith.cmpi slt, %tmp25, %11 : tensor<1024xi64> loc(#loc63) + %13 = arith.andi %10, %12 : tensor<1024xi1> loc(#loc64) + %true_80 = arith.constant true loc(#loc65) + %cst_81 = arith.constant dense : tensor<1024xi1> loc(#loc65) + %14 = arith.xori %xmask_5, %cst_81 : tensor<1024xi1> loc(#loc65) + %15 = arith.ori %13, %14 : tensor<1024xi1> loc(#loc66) + tt.assert %15, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1> loc(#loc67) + %tmp27 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc144) + %tmp27_82 = arith.muli %tmp27, %tmp25 : tensor<1024xi64> loc(#loc144) + %tmp27_83 = arith.addi %x0_11, %tmp27_82 : tensor<1024xi64> loc(#loc145) + %tmp27_84 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc146) + %tmp27_85 = tt.addptr %tmp27_84, %tmp27_83 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc146) + %tmp27_86 = tt.load %tmp27_85, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc147) + %tmp27_87 = arith.extf %tmp27_86 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc148) + %tmp28 = arith.mulf %tmp22, %tmp27_87 : tensor<1024xf32> loc(#loc149) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32> loc(#loc150) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc75) + %17 = tt.addptr %16, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc75) + %18 = arith.truncf %tmp29 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc76) + tt.store %17, %18, %xmask_5 : tensor<1024x!tt.ptr> loc(#loc76) + tt.return loc(#loc77) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":24:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":25:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:75) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:30) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:35) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":29:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":30:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":31:32) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:44) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:52) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:62) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:39) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:86) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":34:18) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":37:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:31) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":39:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:48) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:54) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:68) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:60) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:119) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":41:13) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":42:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":43:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":44:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":46:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:47) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:67) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:60) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:41) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:81) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:73) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:132) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":48:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":50:19) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":51:34) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:28) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:46) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:38) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:56) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:54) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:64) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:40) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:31) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:48) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:88) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":54:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":55:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:37) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:4) +#loc89 = loc("xoffset"(#loc1)) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xindex"(#loc3)) +#loc92 = loc("xindex"(#loc4)) +#loc93 = loc("xmask"(#loc5)) +#loc94 = loc("x2"(#loc6)) +#loc95 = loc("x2"(#loc7)) +#loc96 = loc("x0"(#loc8)) +#loc97 = loc("x5"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp0"(#loc11)) +#loc100 = loc("tmp0"(#loc12)) +#loc101 = loc("tmp1"(#loc13)) +#loc102 = loc("tmp1"(#loc14)) +#loc103 = loc("tmp3"(#loc15)) +#loc104 = loc("tmp4"(#loc16)) +#loc105 = loc("tmp5"(#loc17)) +#loc106 = loc("tmp7"(#loc24)) +#loc107 = loc("tmp7"(#loc25)) +#loc108 = loc("tmp7"(#loc26)) +#loc109 = loc("tmp7"(#loc27)) +#loc110 = loc("tmp7"(#loc28)) +#loc111 = loc("tmp8"(#loc29)) +#loc112 = loc("tmp10"(#loc30)) +#loc113 = loc("tmp11"(#loc31)) +#loc114 = loc("tmp12"(#loc32)) +#loc115 = loc("tmp12"(#loc33)) +#loc116 = loc("tmp12"(#loc34)) +#loc117 = loc("tmp13"(#loc35)) +#loc118 = loc("tmp14"(#loc36)) +#loc119 = loc("tmp14"(#loc37)) +#loc120 = loc("tmp14"(#loc38)) +#loc121 = loc("tmp14"(#loc39)) +#loc122 = loc("tmp14"(#loc40)) +#loc123 = loc("tmp14"(#loc41)) +#loc124 = loc("tmp14"(#loc42)) +#loc125 = loc("tmp14"(#loc43)) +#loc126 = loc("tmp15"(#loc44)) +#loc127 = loc("tmp16"(#loc45)) +#loc128 = loc("tmp17"(#loc46)) +#loc129 = loc("tmp18"(#loc47)) +#loc130 = loc("tmp20"(#loc48)) +#loc131 = loc("tmp21"(#loc49)) +#loc132 = loc("tmp21"(#loc50)) +#loc133 = loc("tmp21"(#loc51)) +#loc134 = loc("tmp21"(#loc52)) +#loc135 = loc("tmp21"(#loc53)) +#loc136 = loc("tmp21"(#loc54)) +#loc137 = loc("tmp21"(#loc55)) +#loc138 = loc("tmp21"(#loc56)) +#loc139 = loc("tmp21"(#loc57)) +#loc140 = loc("tmp21"(#loc58)) +#loc141 = loc("tmp22"(#loc59)) +#loc142 = loc("tmp24"(#loc60)) +#loc143 = loc("tmp25"(#loc61)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp27"(#loc71)) +#loc148 = loc("tmp27"(#loc72)) +#loc149 = loc("tmp28"(#loc73)) +#loc150 = loc("tmp29"(#loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e2b80ea31cf581a4303e6ca487e886d6be2f09c4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,232 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<1024xi1, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<1024xi64, #blocked> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<1024xi64, #blocked> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<1024xi64, #blocked> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<1024xi64, #blocked> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<1024xi64, #blocked> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_1 : tensor<1024xi64, #blocked> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_1 : tensor<1024xi64, #blocked> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<1024xi64, #blocked> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc21) + %3 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1, #blocked> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<1024xi64, #blocked> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<1024xi64, #blocked> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<1024xf32, #blocked> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<1024xi64, #blocked> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64, #blocked> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<1024xi64, #blocked> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<1024xi64, #blocked> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<1024xi64, #blocked> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<1024xi64, #blocked> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc113) + %tmp15 = arith.subf %cst_2, %tmp14_31 : tensor<1024xf32, #blocked> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<1024xi64, #blocked> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<1024xi64, #blocked> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<1024xi64, #blocked> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<1024xi64, #blocked> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<1024xi64, #blocked> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64, #blocked> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<1024xi64, #blocked> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<1024xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<1024xi1, #blocked> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1, #blocked> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<1024xi64, #blocked> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<1024xi64, #blocked> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<1024xf32, #blocked> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32, #blocked> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<1024x!tt.ptr, #blocked> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cfe5e453b6223305b6a81443e0ebdb72e1c00cfd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir @@ -0,0 +1,231 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1024xi64> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %cst_2 = arith.constant dense : tensor<1024xi1> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<1024xi32> to tensor<1024xi64> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<1024xi64> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<1024xi64> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<1024xi64> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<1024xi64> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<1024xi64> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_0 : tensor<1024xi64> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_0 : tensor<1024xi64> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<1024xi64> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<1024xi1> loc(#loc21) + %3 = arith.xori %xmask_6, %cst_2 : tensor<1024xi1> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<1024xi1> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<1024xi64> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<1024xi64> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<1024xf32> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<1024xi64> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<1024xi64> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<1024xi64> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<1024xi64> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<1024xi64> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<1024xi1> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc113) + %tmp15 = arith.subf %cst_1, %tmp14_31 : tensor<1024xf32> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<1024xi64> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<1024xi64> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<1024xi64> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<1024xi1> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<1024xi1>, tensor<1024xf32> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<1024xi64> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_0 : tensor<1024xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<1024xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<1024xi1> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<1024xi64> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<1024xi64> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<1024xf32> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<1024x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vy/cvyoqg7jzeadarrgggxhs2djmxugxyvjhim72vfstqc4io4qsecj.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..21ba6a87b941212418f81877ae9a06c8a637ab8f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52b79a04e302a5ac31258d49754c7843189cd1dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "e06ef592ad45788917e8ca2aaf880ce2d9dc1b17df1f92a71a87d94ada7fc9a8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..c23893a011b75d2577d7878963edf6f2a7d80df6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,2749 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 7, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 252, !dbg !12 + %16 = lshr exact i32 %15, 2, !dbg !12 + %17 = and i32 %14, 127, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = or disjoint i32 %18, 64, !dbg !13 + %20 = icmp slt i32 %18, 128, !dbg !14 + %21 = icmp slt i32 %19, 128, !dbg !14 + %22 = and i32 %14, 3, !dbg !15 + %23 = shl nuw nsw i32 %22, 2, !dbg !15 + %24 = or disjoint i32 %23, 1, !dbg !15 + %25 = or disjoint i32 %23, 2, !dbg !15 + %26 = or disjoint i32 %23, 3, !dbg !15 + %27 = shl i32 %18, 4, !dbg !16 + %28 = shl i32 %19, 4, !dbg !16 + %29 = or disjoint i32 %27, %23, !dbg !17 + %30 = or disjoint i32 %27, %25, !dbg !17 + %31 = or disjoint i32 %28, %23, !dbg !17 + %32 = or disjoint i32 %28, %25, !dbg !17 + %33 = sext i32 %29 to i64, !dbg !18 + %34 = getelementptr i64, ptr addrspace(1) %0, i64 %33, !dbg !18 + %35 = sext i32 %30 to i64, !dbg !18 + %36 = getelementptr i64, ptr addrspace(1) %0, i64 %35, !dbg !18 + %37 = sext i32 %31 to i64, !dbg !18 + %38 = getelementptr i64, ptr addrspace(1) %0, i64 %37, !dbg !18 + %39 = sext i32 %32 to i64, !dbg !18 + %40 = getelementptr i64, ptr addrspace(1) %0, i64 %39, !dbg !18 + %41 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %34, i1 %20) #6, !dbg !19 + %42 = extractvalue { i64, i64 } %41, 0, !dbg !19 + %43 = extractvalue { i64, i64 } %41, 1, !dbg !19 + %44 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %36, i1 %20) #6, !dbg !19 + %45 = extractvalue { i64, i64 } %44, 0, !dbg !19 + %46 = extractvalue { i64, i64 } %44, 1, !dbg !19 + %47 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %38, i1 %21) #6, !dbg !19 + %48 = extractvalue { i64, i64 } %47, 0, !dbg !19 + %49 = extractvalue { i64, i64 } %47, 1, !dbg !19 + %50 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %40, i1 %21) #6, !dbg !19 + %51 = extractvalue { i64, i64 } %50, 0, !dbg !19 + %52 = extractvalue { i64, i64 } %50, 1, !dbg !19 + %53 = and i32 %14, 1, !dbg !20 + %54 = lshr i32 %14, 1, !dbg !20 + %.lobit = and i32 %54, 1, !dbg !20 + %55 = xor i32 %53, 1, !dbg !24 + %56 = xor i32 %.lobit, 1, !dbg !24 + %57 = xor i32 %24, %23, !dbg !25 + %58 = xor i32 %25, %26, !dbg !25 + %59 = trunc i32 %14 to i1, !dbg !26 + %60 = trunc i32 %54 to i1, !dbg !26 + %61 = icmp eq i64 %42, 16384, !dbg !27 + %62 = icmp eq i64 %43, 16384, !dbg !27 + %63 = icmp eq i64 %45, 16384, !dbg !27 + %64 = icmp eq i64 %46, 16384, !dbg !27 + %65 = icmp eq i64 %48, 16384, !dbg !27 + %66 = icmp eq i64 %49, 16384, !dbg !27 + %67 = icmp eq i64 %51, 16384, !dbg !27 + %68 = icmp eq i64 %52, 16384, !dbg !27 + %69 = zext i1 %61 to i32, !dbg !28 + %70 = zext i1 %62 to i32, !dbg !28 + %71 = zext i1 %63 to i32, !dbg !28 + %72 = zext i1 %64 to i32, !dbg !28 + %73 = zext i1 %65 to i32, !dbg !28 + %74 = zext i1 %66 to i32, !dbg !28 + %75 = zext i1 %67 to i32, !dbg !28 + %76 = zext i1 %68 to i32, !dbg !28 + %77 = xor i1 %61, true, !dbg !29 + %78 = and i1 %62, %77, !dbg !29 + %79 = xor i1 %65, true, !dbg !29 + %80 = and i1 %66, %79, !dbg !29 + %.not26 = xor i1 %64, true, !dbg !31 + %81 = or i1 %63, %.not26, !dbg !31 + %.not27 = xor i1 %68, true, !dbg !31 + %82 = or i1 %67, %.not27, !dbg !31 + %83 = xor i32 %69, %70, !dbg !32 + %84 = xor i32 %71, %72, !dbg !32 + %85 = xor i32 %73, %74, !dbg !32 + %86 = xor i32 %75, %76, !dbg !32 + %87 = select i1 %78, i32 %83, i32 0, !dbg !33 + %88 = select i1 %81, i32 %84, i32 0, !dbg !33 + %89 = select i1 %80, i32 %85, i32 0, !dbg !33 + %90 = select i1 %82, i32 %86, i32 0, !dbg !33 + %91 = xor i32 %87, %69, !dbg !34 + %92 = xor i32 %87, %70, !dbg !34 + %93 = xor i32 %88, %71, !dbg !34 + %94 = xor i32 %88, %72, !dbg !34 + %95 = xor i32 %89, %73, !dbg !34 + %96 = xor i32 %89, %74, !dbg !34 + %97 = xor i32 %90, %75, !dbg !34 + %98 = xor i32 %90, %76, !dbg !34 + %99 = select i1 %78, i32 %57, i32 0, !dbg !35 + %100 = select i1 %81, i32 %58, i32 0, !dbg !35 + %101 = select i1 %80, i32 %57, i32 0, !dbg !35 + %102 = select i1 %82, i32 %58, i32 0, !dbg !35 + %103 = xor i32 %99, %23, !dbg !36 + %104 = xor i32 %99, %24, !dbg !36 + %105 = xor i32 %100, %25, !dbg !36 + %106 = xor i32 %100, %26, !dbg !36 + %107 = xor i32 %101, %23, !dbg !36 + %108 = xor i32 %101, %24, !dbg !36 + %109 = xor i32 %102, %25, !dbg !36 + %110 = xor i32 %102, %26, !dbg !36 + %111 = icmp samesign uge i32 %91, %93, !dbg !37 + %112 = icmp ne i32 %91, %93, !dbg !37 + %113 = icmp samesign ule i32 %103, %105, !dbg !37 + %114 = or i1 %113, %112, !dbg !37 + %115 = and i1 %111, %114, !dbg !37 + %.not28 = xor i1 %115, %59, !dbg !37 + %116 = icmp samesign uge i32 %92, %94, !dbg !37 + %117 = icmp ne i32 %92, %94, !dbg !37 + %118 = icmp samesign ule i32 %104, %106, !dbg !37 + %119 = or i1 %118, %117, !dbg !37 + %120 = and i1 %116, %119, !dbg !37 + %.not29 = xor i1 %120, %59, !dbg !37 + %121 = icmp samesign uge i32 %95, %97, !dbg !37 + %122 = icmp ne i32 %95, %97, !dbg !37 + %123 = icmp samesign ule i32 %107, %109, !dbg !37 + %124 = or i1 %123, %122, !dbg !37 + %125 = and i1 %121, %124, !dbg !37 + %.not30 = xor i1 %125, %59, !dbg !37 + %126 = icmp samesign uge i32 %96, %98, !dbg !37 + %127 = icmp ne i32 %96, %98, !dbg !37 + %128 = icmp samesign ule i32 %108, %110, !dbg !37 + %129 = or i1 %128, %127, !dbg !37 + %130 = and i1 %126, %129, !dbg !37 + %.not31 = xor i1 %130, %59, !dbg !37 + %131 = xor i32 %93, %91, !dbg !32 + %132 = xor i32 %94, %92, !dbg !32 + %133 = xor i32 %97, %95, !dbg !32 + %134 = xor i32 %98, %96, !dbg !32 + %135 = select i1 %.not28, i32 0, i32 %131, !dbg !33 + %136 = select i1 %.not29, i32 0, i32 %132, !dbg !33 + %137 = select i1 %.not30, i32 0, i32 %133, !dbg !33 + %138 = select i1 %.not31, i32 0, i32 %134, !dbg !33 + %139 = xor i32 %135, %91, !dbg !34 + %140 = xor i32 %136, %92, !dbg !34 + %141 = xor i32 %135, %93, !dbg !34 + %142 = xor i32 %136, %94, !dbg !34 + %143 = xor i32 %137, %95, !dbg !34 + %144 = xor i32 %138, %96, !dbg !34 + %145 = xor i32 %137, %97, !dbg !34 + %146 = xor i32 %138, %98, !dbg !34 + %147 = xor i32 %105, %103, !dbg !38 + %148 = xor i32 %106, %104, !dbg !38 + %149 = xor i32 %109, %107, !dbg !38 + %150 = xor i32 %110, %108, !dbg !38 + %151 = select i1 %.not28, i32 0, i32 %147, !dbg !35 + %152 = select i1 %.not29, i32 0, i32 %148, !dbg !35 + %153 = select i1 %.not30, i32 0, i32 %149, !dbg !35 + %154 = select i1 %.not31, i32 0, i32 %150, !dbg !35 + %155 = xor i32 %151, %103, !dbg !36 + %156 = xor i32 %152, %104, !dbg !36 + %157 = xor i32 %151, %105, !dbg !36 + %158 = xor i32 %152, %106, !dbg !36 + %159 = xor i32 %153, %107, !dbg !36 + %160 = xor i32 %154, %108, !dbg !36 + %161 = xor i32 %153, %109, !dbg !36 + %162 = xor i32 %154, %110, !dbg !36 + %163 = icmp samesign uge i32 %139, %140, !dbg !37 + %164 = icmp ne i32 %139, %140, !dbg !37 + %165 = icmp samesign ule i32 %155, %156, !dbg !37 + %166 = or i1 %164, %165, !dbg !37 + %167 = and i1 %163, %166, !dbg !37 + %.not32 = xor i1 %167, %59, !dbg !37 + %168 = icmp samesign uge i32 %141, %142, !dbg !37 + %169 = icmp ne i32 %141, %142, !dbg !37 + %170 = icmp samesign ule i32 %157, %158, !dbg !37 + %171 = or i1 %169, %170, !dbg !37 + %172 = and i1 %168, %171, !dbg !37 + %.not33 = xor i1 %172, %59, !dbg !37 + %173 = icmp samesign uge i32 %143, %144, !dbg !37 + %174 = icmp ne i32 %143, %144, !dbg !37 + %175 = icmp samesign ule i32 %159, %160, !dbg !37 + %176 = or i1 %174, %175, !dbg !37 + %177 = and i1 %173, %176, !dbg !37 + %.not34 = xor i1 %177, %59, !dbg !37 + %178 = icmp samesign uge i32 %145, %146, !dbg !37 + %179 = icmp ne i32 %145, %146, !dbg !37 + %180 = icmp samesign ule i32 %161, %162, !dbg !37 + %181 = or i1 %179, %180, !dbg !37 + %182 = and i1 %178, %181, !dbg !37 + %.not35 = xor i1 %182, %59, !dbg !37 + %183 = xor i32 %139, %140, !dbg !32 + %184 = xor i32 %141, %142, !dbg !32 + %185 = xor i32 %143, %144, !dbg !32 + %186 = xor i32 %145, %146, !dbg !32 + %187 = select i1 %.not32, i32 0, i32 %183, !dbg !33 + %188 = select i1 %.not33, i32 0, i32 %184, !dbg !33 + %189 = select i1 %.not34, i32 0, i32 %185, !dbg !33 + %190 = select i1 %.not35, i32 0, i32 %186, !dbg !33 + %191 = xor i32 %187, %139, !dbg !34 + %192 = xor i32 %187, %140, !dbg !34 + %193 = xor i32 %188, %141, !dbg !34 + %194 = xor i32 %188, %142, !dbg !34 + %195 = xor i32 %189, %143, !dbg !34 + %196 = xor i32 %189, %144, !dbg !34 + %197 = xor i32 %190, %145, !dbg !34 + %198 = xor i32 %190, %146, !dbg !34 + %199 = xor i32 %155, %156, !dbg !38 + %200 = xor i32 %157, %158, !dbg !38 + %201 = xor i32 %159, %160, !dbg !38 + %202 = xor i32 %161, %162, !dbg !38 + %203 = select i1 %.not32, i32 0, i32 %199, !dbg !35 + %204 = select i1 %.not33, i32 0, i32 %200, !dbg !35 + %205 = select i1 %.not34, i32 0, i32 %201, !dbg !35 + %206 = select i1 %.not35, i32 0, i32 %202, !dbg !35 + %207 = mul nuw nsw i32 %191, %55, !dbg !39 + %208 = mul nuw nsw i32 %192, %55, !dbg !39 + %209 = mul nuw nsw i32 %193, %55, !dbg !39 + %210 = mul nuw nsw i32 %194, %55, !dbg !39 + %211 = mul nuw nsw i32 %195, %55, !dbg !39 + %212 = mul nuw nsw i32 %196, %55, !dbg !39 + %213 = mul nuw nsw i32 %197, %55, !dbg !39 + %214 = mul nuw nsw i32 %198, %55, !dbg !39 + %215 = mul nuw nsw i32 %191, %53, !dbg !40 + %216 = mul nuw nsw i32 %192, %53, !dbg !40 + %217 = mul nuw nsw i32 %193, %53, !dbg !40 + %218 = mul nuw nsw i32 %194, %53, !dbg !40 + %219 = mul nuw nsw i32 %195, %53, !dbg !40 + %220 = mul nuw nsw i32 %196, %53, !dbg !40 + %221 = mul nuw nsw i32 %197, %53, !dbg !40 + %222 = mul nuw nsw i32 %198, %53, !dbg !40 + %223 = insertelement <4 x i32> poison, i32 %204, i64 0, !dbg !36 + %224 = insertelement <4 x i32> %223, i32 %203, i64 1, !dbg !36 + %225 = shufflevector <4 x i32> %224, <4 x i32> poison, <4 x i32> , !dbg !36 + %226 = insertelement <4 x i32> poison, i32 %155, i64 0, !dbg !36 + %227 = insertelement <4 x i32> %226, i32 %156, i64 1, !dbg !36 + %228 = insertelement <4 x i32> %227, i32 %157, i64 2, !dbg !36 + %229 = insertelement <4 x i32> %228, i32 %158, i64 3, !dbg !36 + %230 = xor <4 x i32> %225, %229, !dbg !36 + %231 = extractelement <4 x i32> %230, i64 0, !dbg !41 + %232 = mul nuw nsw i32 %231, %55, !dbg !42 + %233 = extractelement <4 x i32> %230, i64 1, !dbg !41 + %234 = mul nuw nsw i32 %233, %55, !dbg !42 + %235 = extractelement <4 x i32> %230, i64 2, !dbg !41 + %236 = mul nuw nsw i32 %235, %55, !dbg !42 + %237 = extractelement <4 x i32> %230, i64 3, !dbg !41 + %238 = mul nuw nsw i32 %237, %55, !dbg !42 + %239 = mul nuw nsw i32 %231, %53, !dbg !41 + %240 = mul nuw nsw i32 %233, %53, !dbg !41 + %241 = mul nuw nsw i32 %235, %53, !dbg !41 + %242 = mul nuw nsw i32 %237, %53, !dbg !41 + %243 = insertelement <2 x i1> poison, i1 %60, i64 0, !dbg !37 + %244 = shufflevector <2 x i1> %243, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !37 + %245 = insertelement <4 x i32> poison, i32 %56, i64 0, !dbg !42 + %246 = shufflevector <4 x i32> %245, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !42 + %247 = insertelement <4 x i32> poison, i32 %.lobit, i64 0, !dbg !41 + %248 = shufflevector <4 x i32> %247, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !41 + %249 = insertelement <4 x i32> poison, i32 %55, i64 0, !dbg !39 + %250 = shufflevector <4 x i32> %249, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !39 + %251 = insertelement <4 x i32> poison, i32 %53, i64 0, !dbg !40 + %252 = shufflevector <4 x i32> %251, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !40 + %253 = insertelement <4 x i32> poison, i32 %206, i64 0, !dbg !36 + %254 = insertelement <4 x i32> %253, i32 %205, i64 1, !dbg !36 + %255 = shufflevector <4 x i32> %254, <4 x i32> poison, <4 x i32> , !dbg !36 + %256 = insertelement <4 x i32> poison, i32 %159, i64 0, !dbg !36 + %257 = insertelement <4 x i32> %256, i32 %160, i64 1, !dbg !36 + %258 = insertelement <4 x i32> %257, i32 %161, i64 2, !dbg !36 + %259 = insertelement <4 x i32> %258, i32 %162, i64 3, !dbg !36 + %260 = xor <4 x i32> %255, %259, !dbg !36 + %261 = extractelement <4 x i32> %260, i64 0, !dbg !41 + %262 = mul nuw nsw i32 %261, %55, !dbg !42 + %263 = extractelement <4 x i32> %260, i64 1, !dbg !41 + %264 = mul nuw nsw i32 %263, %55, !dbg !42 + %265 = extractelement <4 x i32> %260, i64 2, !dbg !41 + %266 = mul nuw nsw i32 %265, %55, !dbg !42 + %267 = extractelement <4 x i32> %260, i64 3, !dbg !41 + %268 = mul nuw nsw i32 %267, %55, !dbg !42 + %269 = mul nuw nsw i32 %261, %53, !dbg !41 + %270 = mul nuw nsw i32 %263, %53, !dbg !41 + %271 = mul nuw nsw i32 %265, %53, !dbg !41 + %272 = mul nuw nsw i32 %267, %53, !dbg !41 + %273 = insertelement <4 x i64> poison, i64 %42, i64 0, !dbg !43 + %274 = insertelement <4 x i64> %273, i64 %43, i64 1, !dbg !43 + %275 = insertelement <4 x i64> %274, i64 %45, i64 2, !dbg !43 + %276 = insertelement <4 x i64> %275, i64 %46, i64 3, !dbg !43 + %277 = add <4 x i64> %276, splat (i64 -1), !dbg !43 + %278 = icmp ult <4 x i64> %277, splat (i64 16383), !dbg !43 + %279 = extractelement <4 x i1> %278, i64 0, !dbg !44 + %280 = zext i1 %279 to i32, !dbg !28 + %281 = extractelement <4 x i1> %278, i64 1, !dbg !44 + %282 = zext i1 %281 to i32, !dbg !28 + %283 = extractelement <4 x i1> %278, i64 2, !dbg !45 + %284 = zext i1 %283 to i32, !dbg !28 + %285 = extractelement <4 x i1> %278, i64 3, !dbg !45 + %286 = zext i1 %285 to i32, !dbg !28 + %287 = xor i1 %279, true, !dbg !44 + %288 = and i1 %281, %287, !dbg !44 + %.not = xor i1 %285, true, !dbg !45 + %289 = or i1 %283, %.not, !dbg !45 + %290 = xor i32 %280, %282, !dbg !46 + %291 = xor i32 %284, %286, !dbg !46 + %292 = select i1 %288, i32 %290, i32 0, !dbg !47 + %293 = select i1 %289, i32 %291, i32 0, !dbg !47 + %294 = xor i32 %292, %280, !dbg !48 + %295 = xor i32 %292, %282, !dbg !48 + %296 = xor i32 %293, %284, !dbg !48 + %297 = xor i32 %293, %286, !dbg !48 + %298 = select i1 %288, i32 %57, i32 0, !dbg !49 + %299 = select i1 %289, i32 %58, i32 0, !dbg !49 + %300 = xor i32 %298, %23, !dbg !50 + %301 = xor i32 %298, %24, !dbg !50 + %302 = xor i32 %299, %25, !dbg !50 + %303 = xor i32 %299, %26, !dbg !50 + %304 = insertelement <2 x i32> poison, i32 %294, i64 0, !dbg !26 + %305 = insertelement <2 x i32> %304, i32 %295, i64 1, !dbg !26 + %306 = insertelement <2 x i32> poison, i32 %296, i64 0, !dbg !26 + %307 = insertelement <2 x i32> %306, i32 %297, i64 1, !dbg !26 + %308 = icmp samesign uge <2 x i32> %305, %307, !dbg !26 + %309 = insertelement <2 x i32> %307, i32 %295, i64 1, !dbg !26 + %310 = insertelement <2 x i32> %305, i32 %297, i64 1, !dbg !26 + %311 = icmp ne <2 x i32> %309, %310, !dbg !26 + %312 = insertelement <2 x i32> poison, i32 %300, i64 0, !dbg !26 + %313 = insertelement <2 x i32> %312, i32 %301, i64 1, !dbg !26 + %314 = insertelement <2 x i32> poison, i32 %302, i64 0, !dbg !26 + %315 = insertelement <2 x i32> %314, i32 %303, i64 1, !dbg !26 + %316 = icmp ule <2 x i32> %313, %315, !dbg !26 + %317 = or <2 x i1> %316, %311, !dbg !26 + %318 = and <2 x i1> %308, %317, !dbg !26 + %319 = insertelement <2 x i1> poison, i1 %59, i64 0, !dbg !26 + %320 = shufflevector <2 x i1> %319, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !26 + %321 = xor <2 x i1> %318, %320, !dbg !26 + %322 = xor <2 x i32> %310, %309, !dbg !46 + %323 = select <2 x i1> %321, <2 x i32> zeroinitializer, <2 x i32> %322, !dbg !47 + %324 = extractelement <2 x i32> %323, i64 0, !dbg !48 + %325 = xor i32 %324, %294, !dbg !48 + %326 = extractelement <2 x i32> %323, i64 1, !dbg !48 + %327 = xor i32 %326, %295, !dbg !48 + %328 = xor <2 x i32> %323, %307, !dbg !48 + %329 = xor i32 %302, %300, !dbg !25 + %330 = xor i32 %303, %301, !dbg !25 + %331 = extractelement <2 x i1> %321, i64 0, !dbg !49 + %332 = select i1 %331, i32 0, i32 %329, !dbg !49 + %333 = extractelement <2 x i1> %321, i64 1, !dbg !49 + %334 = select i1 %333, i32 0, i32 %330, !dbg !49 + %335 = xor i32 %332, %300, !dbg !50 + %336 = xor i32 %334, %301, !dbg !50 + %337 = xor i32 %332, %302, !dbg !50 + %338 = xor i32 %334, %303, !dbg !50 + %339 = icmp samesign uge i32 %325, %327, !dbg !26 + %340 = icmp ne i32 %325, %327, !dbg !26 + %341 = icmp samesign ule i32 %335, %336, !dbg !26 + %342 = or i1 %340, %341, !dbg !26 + %343 = and i1 %339, %342, !dbg !26 + %.not6 = xor i1 %343, %59, !dbg !26 + %344 = extractelement <2 x i32> %328, i64 0, !dbg !26 + %345 = extractelement <2 x i32> %328, i64 1, !dbg !26 + %346 = icmp samesign uge i32 %344, %345, !dbg !26 + %347 = icmp ne i32 %344, %345, !dbg !26 + %348 = icmp samesign ule i32 %337, %338, !dbg !26 + %349 = or i1 %347, %348, !dbg !26 + %350 = and i1 %346, %349, !dbg !26 + %.not7 = xor i1 %350, %59, !dbg !26 + %351 = xor i32 %325, %327, !dbg !46 + %352 = xor i32 %344, %345, !dbg !46 + %353 = select i1 %.not6, i32 0, i32 %351, !dbg !47 + %354 = select i1 %.not7, i32 0, i32 %352, !dbg !47 + %355 = xor i32 %353, %325, !dbg !48 + %356 = xor i32 %353, %327, !dbg !48 + %357 = xor i32 %354, %344, !dbg !48 + %358 = xor i32 %354, %345, !dbg !48 + %359 = xor i32 %335, %336, !dbg !25 + %360 = xor i32 %337, %338, !dbg !25 + %361 = select i1 %.not6, i32 0, i32 %359, !dbg !49 + %362 = select i1 %.not7, i32 0, i32 %360, !dbg !49 + %363 = xor i32 %361, %335, !dbg !50 + %364 = xor i32 %361, %336, !dbg !50 + %365 = xor i32 %362, %337, !dbg !50 + %366 = xor i32 %362, %338, !dbg !50 + %367 = mul nuw nsw i32 %355, %55, !dbg !51 + %368 = mul nuw nsw i32 %356, %55, !dbg !51 + %369 = mul nuw nsw i32 %357, %55, !dbg !51 + %370 = mul nuw nsw i32 %358, %55, !dbg !51 + %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !52 + %372 = add i32 %367, %371, !dbg !55 + %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 1, i32 31), !dbg !52 + %374 = add i32 %368, %373, !dbg !55 + %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !52 + %376 = add i32 %369, %375, !dbg !55 + %377 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 1, i32 31), !dbg !52 + %378 = add i32 %370, %377, !dbg !55 + %379 = mul nuw nsw i32 %355, %53, !dbg !56 + %380 = mul nuw nsw i32 %356, %53, !dbg !56 + %381 = mul nuw nsw i32 %357, %53, !dbg !56 + %382 = mul nuw nsw i32 %358, %53, !dbg !56 + %383 = mul nuw nsw i32 %363, %55, !dbg !57 + %384 = mul nuw nsw i32 %364, %55, !dbg !57 + %385 = mul nuw nsw i32 %365, %55, !dbg !57 + %386 = mul nuw nsw i32 %366, %55, !dbg !57 + %387 = mul nuw nsw i32 %363, %53, !dbg !58 + %388 = mul nuw nsw i32 %364, %53, !dbg !58 + %389 = mul nuw nsw i32 %365, %53, !dbg !58 + %390 = mul nuw nsw i32 %366, %53, !dbg !58 + %391 = insertelement <4 x i1> poison, i1 %20, i64 0, !dbg !59 + %392 = shufflevector <4 x i1> %391, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !59 + %393 = select <4 x i1> %392, <4 x i1> %278, <4 x i1> zeroinitializer, !dbg !59 + %394 = insertelement <4 x i64> poison, i64 %48, i64 0, !dbg !43 + %395 = insertelement <4 x i64> %394, i64 %49, i64 1, !dbg !43 + %396 = insertelement <4 x i64> %395, i64 %51, i64 2, !dbg !43 + %397 = insertelement <4 x i64> %396, i64 %52, i64 3, !dbg !43 + %398 = add <4 x i64> %397, splat (i64 -1), !dbg !43 + %399 = icmp ult <4 x i64> %398, splat (i64 16383), !dbg !43 + %400 = extractelement <4 x i1> %399, i64 0, !dbg !44 + %401 = zext i1 %400 to i32, !dbg !28 + %402 = extractelement <4 x i1> %399, i64 1, !dbg !44 + %403 = zext i1 %402 to i32, !dbg !28 + %404 = extractelement <4 x i1> %399, i64 2, !dbg !45 + %405 = zext i1 %404 to i32, !dbg !28 + %406 = extractelement <4 x i1> %399, i64 3, !dbg !45 + %407 = zext i1 %406 to i32, !dbg !28 + %408 = xor i1 %400, true, !dbg !44 + %409 = and i1 %402, %408, !dbg !44 + %.not1 = xor i1 %406, true, !dbg !45 + %410 = or i1 %404, %.not1, !dbg !45 + %411 = xor i32 %401, %403, !dbg !46 + %412 = xor i32 %405, %407, !dbg !46 + %413 = select i1 %409, i32 %411, i32 0, !dbg !47 + %414 = select i1 %410, i32 %412, i32 0, !dbg !47 + %415 = xor i32 %413, %401, !dbg !48 + %416 = xor i32 %413, %403, !dbg !48 + %417 = xor i32 %414, %405, !dbg !48 + %418 = xor i32 %414, %407, !dbg !48 + %419 = select i1 %409, i32 %57, i32 0, !dbg !49 + %420 = select i1 %410, i32 %58, i32 0, !dbg !49 + %421 = xor i32 %419, %23, !dbg !50 + %422 = xor i32 %419, %24, !dbg !50 + %423 = xor i32 %420, %25, !dbg !50 + %424 = xor i32 %420, %26, !dbg !50 + %425 = insertelement <2 x i32> poison, i32 %415, i64 0, !dbg !26 + %426 = insertelement <2 x i32> %425, i32 %416, i64 1, !dbg !26 + %427 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !26 + %428 = insertelement <2 x i32> %427, i32 %418, i64 1, !dbg !26 + %429 = icmp samesign uge <2 x i32> %426, %428, !dbg !26 + %430 = insertelement <2 x i32> %428, i32 %416, i64 1, !dbg !26 + %431 = insertelement <2 x i32> %426, i32 %418, i64 1, !dbg !26 + %432 = icmp ne <2 x i32> %430, %431, !dbg !26 + %433 = insertelement <2 x i32> poison, i32 %421, i64 0, !dbg !26 + %434 = insertelement <2 x i32> %433, i32 %422, i64 1, !dbg !26 + %435 = insertelement <2 x i32> poison, i32 %423, i64 0, !dbg !26 + %436 = insertelement <2 x i32> %435, i32 %424, i64 1, !dbg !26 + %437 = icmp ule <2 x i32> %434, %436, !dbg !26 + %438 = or <2 x i1> %437, %432, !dbg !26 + %439 = and <2 x i1> %429, %438, !dbg !26 + %440 = xor <2 x i1> %439, %320, !dbg !26 + %441 = xor <2 x i32> %431, %430, !dbg !46 + %442 = select <2 x i1> %440, <2 x i32> zeroinitializer, <2 x i32> %441, !dbg !47 + %443 = extractelement <2 x i32> %442, i64 0, !dbg !48 + %444 = xor i32 %443, %415, !dbg !48 + %445 = extractelement <2 x i32> %442, i64 1, !dbg !48 + %446 = xor i32 %445, %416, !dbg !48 + %447 = xor <2 x i32> %442, %428, !dbg !48 + %448 = xor i32 %423, %421, !dbg !25 + %449 = xor i32 %424, %422, !dbg !25 + %450 = extractelement <2 x i1> %440, i64 0, !dbg !49 + %451 = select i1 %450, i32 0, i32 %448, !dbg !49 + %452 = extractelement <2 x i1> %440, i64 1, !dbg !49 + %453 = select i1 %452, i32 0, i32 %449, !dbg !49 + %454 = xor i32 %451, %421, !dbg !50 + %455 = xor i32 %453, %422, !dbg !50 + %456 = xor i32 %451, %423, !dbg !50 + %457 = xor i32 %453, %424, !dbg !50 + %458 = icmp samesign uge i32 %444, %446, !dbg !26 + %459 = icmp ne i32 %444, %446, !dbg !26 + %460 = icmp samesign ule i32 %454, %455, !dbg !26 + %461 = or i1 %459, %460, !dbg !26 + %462 = and i1 %458, %461, !dbg !26 + %.not8 = xor i1 %462, %59, !dbg !26 + %463 = extractelement <2 x i32> %447, i64 0, !dbg !26 + %464 = extractelement <2 x i32> %447, i64 1, !dbg !26 + %465 = icmp samesign uge i32 %463, %464, !dbg !26 + %466 = icmp ne i32 %463, %464, !dbg !26 + %467 = icmp samesign ule i32 %456, %457, !dbg !26 + %468 = or i1 %466, %467, !dbg !26 + %469 = and i1 %465, %468, !dbg !26 + %.not9 = xor i1 %469, %59, !dbg !26 + %470 = xor i32 %444, %446, !dbg !46 + %471 = xor i32 %463, %464, !dbg !46 + %472 = select i1 %.not8, i32 0, i32 %470, !dbg !47 + %473 = select i1 %.not9, i32 0, i32 %471, !dbg !47 + %474 = xor i32 %472, %444, !dbg !48 + %475 = xor i32 %472, %446, !dbg !48 + %476 = xor i32 %473, %463, !dbg !48 + %477 = xor i32 %473, %464, !dbg !48 + %478 = xor i32 %454, %455, !dbg !25 + %479 = xor i32 %456, %457, !dbg !25 + %480 = select i1 %.not8, i32 0, i32 %478, !dbg !49 + %481 = select i1 %.not9, i32 0, i32 %479, !dbg !49 + %482 = xor i32 %480, %454, !dbg !50 + %483 = xor i32 %480, %455, !dbg !50 + %484 = xor i32 %481, %456, !dbg !50 + %485 = xor i32 %481, %457, !dbg !50 + %486 = mul nuw nsw i32 %474, %55, !dbg !51 + %487 = mul nuw nsw i32 %475, %55, !dbg !51 + %488 = mul nuw nsw i32 %476, %55, !dbg !51 + %489 = mul nuw nsw i32 %477, %55, !dbg !51 + %490 = mul nuw nsw i32 %474, %53, !dbg !56 + %491 = mul nuw nsw i32 %475, %53, !dbg !56 + %492 = mul nuw nsw i32 %476, %53, !dbg !56 + %493 = mul nuw nsw i32 %477, %53, !dbg !56 + %494 = mul nuw nsw i32 %482, %55, !dbg !57 + %495 = mul nuw nsw i32 %483, %55, !dbg !57 + %496 = mul nuw nsw i32 %484, %55, !dbg !57 + %497 = mul nuw nsw i32 %485, %55, !dbg !57 + %498 = mul nuw nsw i32 %482, %53, !dbg !58 + %499 = mul nuw nsw i32 %483, %53, !dbg !58 + %500 = mul nuw nsw i32 %484, %53, !dbg !58 + %501 = mul nuw nsw i32 %485, %53, !dbg !58 + %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %486, i32 1, i32 31), !dbg !52 + %503 = add i32 %486, %502, !dbg !55 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !52 + %505 = add i32 %487, %504, !dbg !55 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 1, i32 31), !dbg !52 + %507 = add i32 %488, %506, !dbg !55 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 1, i32 31), !dbg !52 + %509 = add i32 %489, %508, !dbg !55 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 1, i32 31), !dbg !52 + %511 = add i32 %379, %510, !dbg !55 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %380, i32 1, i32 31), !dbg !52 + %513 = add i32 %380, %512, !dbg !55 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %381, i32 1, i32 31), !dbg !52 + %515 = add i32 %381, %514, !dbg !55 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !52 + %517 = add i32 %382, %516, !dbg !55 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %490, i32 1, i32 31), !dbg !52 + %519 = add i32 %490, %518, !dbg !55 + %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 1, i32 31), !dbg !52 + %521 = add i32 %491, %520, !dbg !55 + %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 1, i32 31), !dbg !52 + %523 = add i32 %492, %522, !dbg !55 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 1, i32 31), !dbg !52 + %525 = add i32 %493, %524, !dbg !55 + %526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !52 + %527 = add i32 %383, %526, !dbg !55 + %528 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %384, i32 1, i32 31), !dbg !52 + %529 = add i32 %384, %528, !dbg !55 + %530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 1, i32 31), !dbg !52 + %531 = add i32 %530, %385, !dbg !55 + %532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 1, i32 31), !dbg !52 + %533 = add i32 %532, %386, !dbg !55 + %534 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 1, i32 31), !dbg !52 + %535 = add i32 %534, %494, !dbg !55 + %536 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 1, i32 31), !dbg !52 + %537 = add i32 %536, %495, !dbg !55 + %538 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 1, i32 31), !dbg !52 + %539 = add i32 %538, %496, !dbg !55 + %540 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 1, i32 31), !dbg !52 + %541 = add i32 %540, %497, !dbg !55 + %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 1, i32 31), !dbg !52 + %543 = add i32 %542, %387, !dbg !55 + %544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %388, i32 1, i32 31), !dbg !52 + %545 = add i32 %544, %388, !dbg !55 + %546 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %389, i32 1, i32 31), !dbg !52 + %547 = add i32 %546, %389, !dbg !55 + %548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 1, i32 31), !dbg !52 + %549 = add i32 %548, %390, !dbg !55 + %550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 1, i32 31), !dbg !52 + %551 = add i32 %550, %498, !dbg !55 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 1, i32 31), !dbg !52 + %553 = add i32 %552, %499, !dbg !55 + %554 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 1, i32 31), !dbg !52 + %555 = add i32 %554, %500, !dbg !55 + %556 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %501, i32 1, i32 31), !dbg !52 + %557 = add i32 %556, %501, !dbg !55 + %558 = icmp sge i32 %372, %511, !dbg !26 + %559 = icmp ne i32 %372, %511, !dbg !26 + %560 = icmp sle i32 %527, %543, !dbg !26 + %561 = or i1 %559, %560, !dbg !26 + %562 = and i1 %558, %561, !dbg !26 + %.not10 = xor i1 %562, %60, !dbg !26 + %563 = icmp sge i32 %374, %513, !dbg !26 + %564 = icmp ne i32 %374, %513, !dbg !26 + %565 = icmp sle i32 %529, %545, !dbg !26 + %566 = or i1 %564, %565, !dbg !26 + %567 = and i1 %563, %566, !dbg !26 + %.not11 = xor i1 %567, %60, !dbg !26 + %568 = icmp sge i32 %376, %515, !dbg !26 + %569 = icmp ne i32 %376, %515, !dbg !26 + %570 = icmp sle i32 %531, %547, !dbg !26 + %571 = or i1 %569, %570, !dbg !26 + %572 = and i1 %568, %571, !dbg !26 + %.not12 = xor i1 %572, %60, !dbg !26 + %573 = icmp sge i32 %378, %517, !dbg !26 + %574 = icmp ne i32 %378, %517, !dbg !26 + %575 = icmp sle i32 %533, %549, !dbg !26 + %576 = or i1 %574, %575, !dbg !26 + %577 = and i1 %573, %576, !dbg !26 + %.not13 = xor i1 %577, %60, !dbg !26 + %578 = icmp sge i32 %503, %519, !dbg !26 + %579 = icmp ne i32 %503, %519, !dbg !26 + %580 = icmp sle i32 %535, %551, !dbg !26 + %581 = or i1 %579, %580, !dbg !26 + %582 = and i1 %578, %581, !dbg !26 + %.not14 = xor i1 %582, %60, !dbg !26 + %583 = icmp sge i32 %505, %521, !dbg !26 + %584 = icmp ne i32 %505, %521, !dbg !26 + %585 = icmp sle i32 %537, %553, !dbg !26 + %586 = or i1 %584, %585, !dbg !26 + %587 = and i1 %583, %586, !dbg !26 + %.not15 = xor i1 %587, %60, !dbg !26 + %588 = icmp sge i32 %507, %523, !dbg !26 + %589 = icmp ne i32 %507, %523, !dbg !26 + %590 = icmp sle i32 %539, %555, !dbg !26 + %591 = or i1 %589, %590, !dbg !26 + %592 = and i1 %588, %591, !dbg !26 + %.not16 = xor i1 %592, %60, !dbg !26 + %593 = icmp sge i32 %509, %525, !dbg !26 + %594 = icmp ne i32 %509, %525, !dbg !26 + %595 = icmp sle i32 %541, %557, !dbg !26 + %596 = or i1 %594, %595, !dbg !26 + %597 = and i1 %593, %596, !dbg !26 + %.not17 = xor i1 %597, %60, !dbg !26 + %598 = xor i32 %372, %511, !dbg !46 + %599 = xor i32 %374, %513, !dbg !46 + %600 = xor i32 %376, %515, !dbg !46 + %601 = xor i32 %378, %517, !dbg !46 + %602 = xor i32 %503, %519, !dbg !46 + %603 = xor i32 %505, %521, !dbg !46 + %604 = xor i32 %507, %523, !dbg !46 + %605 = xor i32 %509, %525, !dbg !46 + %606 = select i1 %.not10, i32 0, i32 %598, !dbg !47 + %607 = select i1 %.not11, i32 0, i32 %599, !dbg !47 + %608 = select i1 %.not12, i32 0, i32 %600, !dbg !47 + %609 = select i1 %.not13, i32 0, i32 %601, !dbg !47 + %610 = select i1 %.not14, i32 0, i32 %602, !dbg !47 + %611 = select i1 %.not15, i32 0, i32 %603, !dbg !47 + %612 = select i1 %.not16, i32 0, i32 %604, !dbg !47 + %613 = select i1 %.not17, i32 0, i32 %605, !dbg !47 + %614 = xor i32 %606, %355, !dbg !48 + %615 = xor i32 %607, %356, !dbg !48 + %616 = xor i32 %608, %357, !dbg !48 + %617 = xor i32 %609, %358, !dbg !48 + %618 = xor i32 %610, %474, !dbg !48 + %619 = xor i32 %611, %475, !dbg !48 + %620 = xor i32 %612, %476, !dbg !48 + %621 = xor i32 %613, %477, !dbg !48 + %622 = xor i32 %543, %527, !dbg !25 + %623 = xor i32 %545, %529, !dbg !25 + %624 = xor i32 %547, %531, !dbg !25 + %625 = xor i32 %549, %533, !dbg !25 + %626 = xor i32 %551, %535, !dbg !25 + %627 = xor i32 %553, %537, !dbg !25 + %628 = xor i32 %555, %539, !dbg !25 + %629 = xor i32 %557, %541, !dbg !25 + %630 = select i1 %.not10, i32 0, i32 %622, !dbg !49 + %631 = select i1 %.not11, i32 0, i32 %623, !dbg !49 + %632 = select i1 %.not12, i32 0, i32 %624, !dbg !49 + %633 = select i1 %.not13, i32 0, i32 %625, !dbg !49 + %634 = select i1 %.not14, i32 0, i32 %626, !dbg !49 + %635 = select i1 %.not15, i32 0, i32 %627, !dbg !49 + %636 = select i1 %.not16, i32 0, i32 %628, !dbg !49 + %637 = select i1 %.not17, i32 0, i32 %629, !dbg !49 + %638 = xor i32 %630, %363, !dbg !50 + %639 = xor i32 %631, %364, !dbg !50 + %640 = xor i32 %632, %365, !dbg !50 + %641 = xor i32 %633, %366, !dbg !50 + %642 = xor i32 %634, %482, !dbg !50 + %643 = xor i32 %635, %483, !dbg !50 + %644 = xor i32 %636, %484, !dbg !50 + %645 = xor i32 %637, %485, !dbg !50 + %646 = icmp sge i32 %614, %616, !dbg !26 + %647 = icmp ne i32 %614, %616, !dbg !26 + %648 = icmp sle i32 %638, %640, !dbg !26 + %649 = or i1 %647, %648, !dbg !26 + %650 = and i1 %646, %649, !dbg !26 + %.not18 = xor i1 %650, %60, !dbg !26 + %651 = icmp sge i32 %615, %617, !dbg !26 + %652 = icmp ne i32 %615, %617, !dbg !26 + %653 = icmp sle i32 %639, %641, !dbg !26 + %654 = or i1 %652, %653, !dbg !26 + %655 = and i1 %651, %654, !dbg !26 + %.not19 = xor i1 %655, %60, !dbg !26 + %656 = icmp sge i32 %618, %620, !dbg !26 + %657 = icmp ne i32 %618, %620, !dbg !26 + %658 = icmp sle i32 %642, %644, !dbg !26 + %659 = or i1 %657, %658, !dbg !26 + %660 = and i1 %656, %659, !dbg !26 + %.not20 = xor i1 %660, %60, !dbg !26 + %661 = icmp sge i32 %619, %621, !dbg !26 + %662 = icmp ne i32 %619, %621, !dbg !26 + %663 = icmp sle i32 %643, %645, !dbg !26 + %664 = or i1 %662, %663, !dbg !26 + %665 = and i1 %661, %664, !dbg !26 + %.not21 = xor i1 %665, %60, !dbg !26 + %666 = xor i32 %616, %614, !dbg !46 + %667 = xor i32 %617, %615, !dbg !46 + %668 = xor i32 %620, %618, !dbg !46 + %669 = xor i32 %621, %619, !dbg !46 + %670 = select i1 %.not18, i32 0, i32 %666, !dbg !47 + %671 = select i1 %.not19, i32 0, i32 %667, !dbg !47 + %672 = select i1 %.not20, i32 0, i32 %668, !dbg !47 + %673 = select i1 %.not21, i32 0, i32 %669, !dbg !47 + %674 = xor i32 %670, %614, !dbg !48 + %675 = xor i32 %671, %615, !dbg !48 + %676 = xor i32 %670, %616, !dbg !48 + %677 = xor i32 %671, %617, !dbg !48 + %678 = xor i32 %672, %618, !dbg !48 + %679 = xor i32 %673, %619, !dbg !48 + %680 = xor i32 %672, %620, !dbg !48 + %681 = xor i32 %673, %621, !dbg !48 + %682 = xor i32 %640, %638, !dbg !25 + %683 = xor i32 %641, %639, !dbg !25 + %684 = xor i32 %644, %642, !dbg !25 + %685 = xor i32 %645, %643, !dbg !25 + %686 = select i1 %.not18, i32 0, i32 %682, !dbg !49 + %687 = select i1 %.not19, i32 0, i32 %683, !dbg !49 + %688 = select i1 %.not20, i32 0, i32 %684, !dbg !49 + %689 = select i1 %.not21, i32 0, i32 %685, !dbg !49 + %690 = xor i32 %686, %638, !dbg !50 + %691 = xor i32 %687, %639, !dbg !50 + %692 = xor i32 %686, %640, !dbg !50 + %693 = xor i32 %687, %641, !dbg !50 + %694 = xor i32 %688, %642, !dbg !50 + %695 = xor i32 %689, %643, !dbg !50 + %696 = xor i32 %688, %644, !dbg !50 + %697 = xor i32 %689, %645, !dbg !50 + %698 = icmp sge i32 %674, %675, !dbg !26 + %699 = icmp ne i32 %674, %675, !dbg !26 + %700 = icmp sle i32 %690, %691, !dbg !26 + %701 = or i1 %699, %700, !dbg !26 + %702 = and i1 %698, %701, !dbg !26 + %.not22 = xor i1 %702, %60, !dbg !26 + %703 = icmp sge i32 %676, %677, !dbg !26 + %704 = icmp ne i32 %676, %677, !dbg !26 + %705 = icmp sle i32 %692, %693, !dbg !26 + %706 = or i1 %704, %705, !dbg !26 + %707 = and i1 %703, %706, !dbg !26 + %.not23 = xor i1 %707, %60, !dbg !26 + %708 = icmp sge i32 %678, %679, !dbg !26 + %709 = icmp ne i32 %678, %679, !dbg !26 + %710 = icmp sle i32 %694, %695, !dbg !26 + %711 = or i1 %709, %710, !dbg !26 + %712 = and i1 %708, %711, !dbg !26 + %.not24 = xor i1 %712, %60, !dbg !26 + %713 = icmp sge i32 %680, %681, !dbg !26 + %714 = icmp ne i32 %680, %681, !dbg !26 + %715 = icmp sle i32 %696, %697, !dbg !26 + %716 = or i1 %714, %715, !dbg !26 + %717 = and i1 %713, %716, !dbg !26 + %.not25 = xor i1 %717, %60, !dbg !26 + %718 = xor i32 %675, %674, !dbg !46 + %719 = xor i32 %677, %676, !dbg !46 + %720 = xor i32 %679, %678, !dbg !46 + %721 = xor i32 %681, %680, !dbg !46 + %722 = select i1 %.not22, i32 0, i32 %718, !dbg !47 + %723 = select i1 %.not23, i32 0, i32 %719, !dbg !47 + %724 = select i1 %.not24, i32 0, i32 %720, !dbg !47 + %725 = select i1 %.not25, i32 0, i32 %721, !dbg !47 + %726 = xor i32 %722, %674, !dbg !48 + %727 = xor i32 %722, %675, !dbg !48 + %728 = xor i32 %723, %676, !dbg !48 + %729 = xor i32 %723, %677, !dbg !48 + %730 = xor i32 %724, %678, !dbg !48 + %731 = xor i32 %724, %679, !dbg !48 + %732 = xor i32 %725, %680, !dbg !48 + %733 = xor i32 %725, %681, !dbg !48 + %734 = xor i32 %691, %690, !dbg !25 + %735 = xor i32 %693, %692, !dbg !25 + %736 = xor i32 %695, %694, !dbg !25 + %737 = xor i32 %697, %696, !dbg !25 + %738 = select i1 %.not22, i32 0, i32 %734, !dbg !49 + %739 = select i1 %.not23, i32 0, i32 %735, !dbg !49 + %740 = select i1 %.not24, i32 0, i32 %736, !dbg !49 + %741 = select i1 %.not25, i32 0, i32 %737, !dbg !49 + %742 = xor i32 %738, %690, !dbg !50 + %743 = xor i32 %738, %691, !dbg !50 + %744 = xor i32 %739, %692, !dbg !50 + %745 = xor i32 %739, %693, !dbg !50 + %746 = xor i32 %740, %694, !dbg !50 + %747 = xor i32 %740, %695, !dbg !50 + %748 = xor i32 %741, %696, !dbg !50 + %749 = xor i32 %741, %697, !dbg !50 + %750 = mul nuw nsw i32 %726, %56, !dbg !51 + %751 = mul nuw nsw i32 %727, %56, !dbg !51 + %752 = mul nuw nsw i32 %728, %56, !dbg !51 + %753 = mul nuw nsw i32 %729, %56, !dbg !51 + %754 = mul nuw nsw i32 %730, %56, !dbg !51 + %755 = mul nuw nsw i32 %731, %56, !dbg !51 + %756 = mul nuw nsw i32 %732, %56, !dbg !51 + %757 = mul nuw nsw i32 %733, %56, !dbg !51 + %758 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 2, i32 31), !dbg !52 + %759 = add i32 %750, %758, !dbg !55 + %760 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %751, i32 2, i32 31), !dbg !52 + %761 = add i32 %751, %760, !dbg !55 + %762 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %752, i32 2, i32 31), !dbg !52 + %763 = add i32 %752, %762, !dbg !55 + %764 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %753, i32 2, i32 31), !dbg !52 + %765 = add i32 %753, %764, !dbg !55 + %766 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %754, i32 2, i32 31), !dbg !52 + %767 = add i32 %754, %766, !dbg !55 + %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %755, i32 2, i32 31), !dbg !52 + %769 = add i32 %755, %768, !dbg !55 + %770 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %756, i32 2, i32 31), !dbg !52 + %771 = add i32 %756, %770, !dbg !55 + %772 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %757, i32 2, i32 31), !dbg !52 + %773 = add i32 %757, %772, !dbg !55 + %774 = mul nuw nsw i32 %726, %.lobit, !dbg !56 + %775 = mul nuw nsw i32 %727, %.lobit, !dbg !56 + %776 = mul nuw nsw i32 %728, %.lobit, !dbg !56 + %777 = mul nuw nsw i32 %729, %.lobit, !dbg !56 + %778 = mul nuw nsw i32 %730, %.lobit, !dbg !56 + %779 = mul nuw nsw i32 %731, %.lobit, !dbg !56 + %780 = mul nuw nsw i32 %732, %.lobit, !dbg !56 + %781 = mul nuw nsw i32 %733, %.lobit, !dbg !56 + %782 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %774, i32 2, i32 31), !dbg !52 + %783 = add i32 %774, %782, !dbg !55 + %784 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %775, i32 2, i32 31), !dbg !52 + %785 = add i32 %775, %784, !dbg !55 + %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %776, i32 2, i32 31), !dbg !52 + %787 = add i32 %776, %786, !dbg !55 + %788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %777, i32 2, i32 31), !dbg !52 + %789 = add i32 %777, %788, !dbg !55 + %790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 2, i32 31), !dbg !52 + %791 = add i32 %778, %790, !dbg !55 + %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %779, i32 2, i32 31), !dbg !52 + %793 = add i32 %779, %792, !dbg !55 + %794 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %780, i32 2, i32 31), !dbg !52 + %795 = add i32 %780, %794, !dbg !55 + %796 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %781, i32 2, i32 31), !dbg !52 + %797 = add i32 %781, %796, !dbg !55 + %798 = mul nuw nsw i32 %742, %56, !dbg !57 + %799 = mul nuw nsw i32 %743, %56, !dbg !57 + %800 = mul nuw nsw i32 %744, %56, !dbg !57 + %801 = mul nuw nsw i32 %745, %56, !dbg !57 + %802 = mul nuw nsw i32 %746, %56, !dbg !57 + %803 = mul nuw nsw i32 %747, %56, !dbg !57 + %804 = mul nuw nsw i32 %748, %56, !dbg !57 + %805 = mul nuw nsw i32 %749, %56, !dbg !57 + %806 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %798, i32 2, i32 31), !dbg !52 + %807 = add i32 %806, %798, !dbg !55 + %808 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %799, i32 2, i32 31), !dbg !52 + %809 = add i32 %808, %799, !dbg !55 + %810 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %800, i32 2, i32 31), !dbg !52 + %811 = add i32 %810, %800, !dbg !55 + %812 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %801, i32 2, i32 31), !dbg !52 + %813 = add i32 %812, %801, !dbg !55 + %814 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %802, i32 2, i32 31), !dbg !52 + %815 = add i32 %814, %802, !dbg !55 + %816 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %803, i32 2, i32 31), !dbg !52 + %817 = add i32 %816, %803, !dbg !55 + %818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 2, i32 31), !dbg !52 + %819 = add i32 %818, %804, !dbg !55 + %820 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %805, i32 2, i32 31), !dbg !52 + %821 = add i32 %820, %805, !dbg !55 + %822 = mul nuw nsw i32 %742, %.lobit, !dbg !58 + %823 = mul nuw nsw i32 %743, %.lobit, !dbg !58 + %824 = mul nuw nsw i32 %744, %.lobit, !dbg !58 + %825 = mul nuw nsw i32 %745, %.lobit, !dbg !58 + %826 = mul nuw nsw i32 %746, %.lobit, !dbg !58 + %827 = mul nuw nsw i32 %747, %.lobit, !dbg !58 + %828 = mul nuw nsw i32 %748, %.lobit, !dbg !58 + %829 = mul nuw nsw i32 %749, %.lobit, !dbg !58 + %830 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 2, i32 31), !dbg !52 + %831 = add i32 %830, %822, !dbg !55 + %832 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 2, i32 31), !dbg !52 + %833 = add i32 %832, %823, !dbg !55 + %834 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %824, i32 2, i32 31), !dbg !52 + %835 = add i32 %834, %824, !dbg !55 + %836 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %825, i32 2, i32 31), !dbg !52 + %837 = add i32 %836, %825, !dbg !55 + %838 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %826, i32 2, i32 31), !dbg !52 + %839 = add i32 %838, %826, !dbg !55 + %840 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %827, i32 2, i32 31), !dbg !52 + %841 = add i32 %840, %827, !dbg !55 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %828, i32 2, i32 31), !dbg !52 + %843 = add i32 %842, %828, !dbg !55 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %829, i32 2, i32 31), !dbg !52 + %845 = add i32 %844, %829, !dbg !55 + %846 = icmp slt i32 %759, %783, !dbg !44 + %847 = icmp slt i32 %761, %785, !dbg !44 + %848 = icmp slt i32 %763, %787, !dbg !44 + %849 = icmp slt i32 %765, %789, !dbg !44 + %850 = icmp slt i32 %767, %791, !dbg !44 + %851 = icmp slt i32 %769, %793, !dbg !44 + %852 = icmp slt i32 %771, %795, !dbg !44 + %853 = icmp slt i32 %773, %797, !dbg !44 + %854 = icmp eq i32 %759, %783, !dbg !60 + %855 = icmp eq i32 %761, %785, !dbg !60 + %856 = icmp eq i32 %763, %787, !dbg !60 + %857 = icmp eq i32 %765, %789, !dbg !60 + %858 = icmp eq i32 %767, %791, !dbg !60 + %859 = icmp eq i32 %769, %793, !dbg !60 + %860 = icmp eq i32 %771, %795, !dbg !60 + %861 = icmp eq i32 %773, %797, !dbg !60 + %862 = icmp sgt i32 %807, %831, !dbg !61 + %863 = icmp sgt i32 %809, %833, !dbg !61 + %864 = icmp sgt i32 %811, %835, !dbg !61 + %865 = icmp sgt i32 %813, %837, !dbg !61 + %866 = icmp sgt i32 %815, %839, !dbg !61 + %867 = icmp sgt i32 %817, %841, !dbg !61 + %868 = icmp sgt i32 %819, %843, !dbg !61 + %869 = icmp sgt i32 %821, %845, !dbg !61 + %870 = and i1 %854, %862, !dbg !62 + %871 = and i1 %855, %863, !dbg !62 + %872 = and i1 %856, %864, !dbg !62 + %873 = and i1 %857, %865, !dbg !62 + %874 = and i1 %858, %866, !dbg !62 + %875 = and i1 %859, %867, !dbg !62 + %876 = and i1 %860, %868, !dbg !62 + %877 = and i1 %861, %869, !dbg !62 + %878 = or i1 %846, %870, !dbg !63 + %879 = or i1 %847, %871, !dbg !63 + %880 = or i1 %848, %872, !dbg !63 + %881 = or i1 %849, %873, !dbg !63 + %882 = or i1 %850, %874, !dbg !63 + %883 = or i1 %851, %875, !dbg !63 + %884 = or i1 %852, %876, !dbg !63 + %885 = or i1 %853, %877, !dbg !63 + %886 = xor i32 %759, %783, !dbg !46 + %887 = xor i32 %761, %785, !dbg !46 + %888 = xor i32 %763, %787, !dbg !46 + %889 = xor i32 %765, %789, !dbg !46 + %890 = xor i32 %767, %791, !dbg !46 + %891 = xor i32 %769, %793, !dbg !46 + %892 = xor i32 %771, %795, !dbg !46 + %893 = xor i32 %773, %797, !dbg !46 + %894 = select i1 %878, i32 %886, i32 0, !dbg !47 + %895 = select i1 %879, i32 %887, i32 0, !dbg !47 + %896 = select i1 %880, i32 %888, i32 0, !dbg !47 + %897 = select i1 %881, i32 %889, i32 0, !dbg !47 + %898 = select i1 %882, i32 %890, i32 0, !dbg !47 + %899 = select i1 %883, i32 %891, i32 0, !dbg !47 + %900 = select i1 %884, i32 %892, i32 0, !dbg !47 + %901 = select i1 %885, i32 %893, i32 0, !dbg !47 + %902 = xor i32 %894, %726, !dbg !48 + %903 = xor i32 %895, %727, !dbg !48 + %904 = xor i32 %896, %728, !dbg !48 + %905 = xor i32 %897, %729, !dbg !48 + %906 = xor i32 %898, %730, !dbg !48 + %907 = xor i32 %899, %731, !dbg !48 + %908 = xor i32 %900, %732, !dbg !48 + %909 = xor i32 %901, %733, !dbg !48 + %910 = xor i32 %831, %807, !dbg !25 + %911 = xor i32 %833, %809, !dbg !25 + %912 = xor i32 %835, %811, !dbg !25 + %913 = xor i32 %837, %813, !dbg !25 + %914 = xor i32 %839, %815, !dbg !25 + %915 = xor i32 %841, %817, !dbg !25 + %916 = xor i32 %843, %819, !dbg !25 + %917 = xor i32 %845, %821, !dbg !25 + %918 = select i1 %878, i32 %910, i32 0, !dbg !49 + %919 = select i1 %879, i32 %911, i32 0, !dbg !49 + %920 = select i1 %880, i32 %912, i32 0, !dbg !49 + %921 = select i1 %881, i32 %913, i32 0, !dbg !49 + %922 = select i1 %882, i32 %914, i32 0, !dbg !49 + %923 = select i1 %883, i32 %915, i32 0, !dbg !49 + %924 = select i1 %884, i32 %916, i32 0, !dbg !49 + %925 = select i1 %885, i32 %917, i32 0, !dbg !49 + %926 = xor i32 %918, %742, !dbg !50 + %927 = xor i32 %919, %743, !dbg !50 + %928 = xor i32 %920, %744, !dbg !50 + %929 = xor i32 %921, %745, !dbg !50 + %930 = xor i32 %922, %746, !dbg !50 + %931 = xor i32 %923, %747, !dbg !50 + %932 = xor i32 %924, %748, !dbg !50 + %933 = xor i32 %925, %749, !dbg !50 + %934 = mul nuw nsw i32 %902, %55, !dbg !51 + %935 = mul nuw nsw i32 %903, %55, !dbg !51 + %936 = mul nuw nsw i32 %904, %55, !dbg !51 + %937 = mul nuw nsw i32 %905, %55, !dbg !51 + %938 = mul nuw nsw i32 %906, %55, !dbg !51 + %939 = mul nuw nsw i32 %907, %55, !dbg !51 + %940 = mul nuw nsw i32 %908, %55, !dbg !51 + %941 = mul nuw nsw i32 %909, %55, !dbg !51 + %942 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %934, i32 1, i32 31), !dbg !52 + %943 = add i32 %942, %934, !dbg !55 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !52 + %945 = add i32 %944, %935, !dbg !55 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %936, i32 1, i32 31), !dbg !52 + %947 = add i32 %946, %936, !dbg !55 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %937, i32 1, i32 31), !dbg !52 + %949 = add i32 %948, %937, !dbg !55 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %938, i32 1, i32 31), !dbg !52 + %951 = add i32 %950, %938, !dbg !55 + %952 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %939, i32 1, i32 31), !dbg !52 + %953 = add i32 %952, %939, !dbg !55 + %954 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %940, i32 1, i32 31), !dbg !52 + %955 = add i32 %954, %940, !dbg !55 + %956 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %941, i32 1, i32 31), !dbg !52 + %957 = add i32 %956, %941, !dbg !55 + %958 = mul nuw nsw i32 %902, %53, !dbg !56 + %959 = mul nuw nsw i32 %903, %53, !dbg !56 + %960 = mul nuw nsw i32 %904, %53, !dbg !56 + %961 = mul nuw nsw i32 %905, %53, !dbg !56 + %962 = mul nuw nsw i32 %906, %53, !dbg !56 + %963 = mul nuw nsw i32 %907, %53, !dbg !56 + %964 = mul nuw nsw i32 %908, %53, !dbg !56 + %965 = mul nuw nsw i32 %909, %53, !dbg !56 + %966 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %958, i32 1, i32 31), !dbg !52 + %967 = add i32 %966, %958, !dbg !55 + %968 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !52 + %969 = add i32 %968, %959, !dbg !55 + %970 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %960, i32 1, i32 31), !dbg !52 + %971 = add i32 %970, %960, !dbg !55 + %972 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !52 + %973 = add i32 %972, %961, !dbg !55 + %974 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %962, i32 1, i32 31), !dbg !52 + %975 = add i32 %974, %962, !dbg !55 + %976 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !52 + %977 = add i32 %976, %963, !dbg !55 + %978 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %964, i32 1, i32 31), !dbg !52 + %979 = add i32 %978, %964, !dbg !55 + %980 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %965, i32 1, i32 31), !dbg !52 + %981 = add i32 %980, %965, !dbg !55 + %982 = mul nuw nsw i32 %926, %55, !dbg !57 + %983 = mul nuw nsw i32 %927, %55, !dbg !57 + %984 = mul nuw nsw i32 %928, %55, !dbg !57 + %985 = mul nuw nsw i32 %929, %55, !dbg !57 + %986 = mul nuw nsw i32 %930, %55, !dbg !57 + %987 = mul nuw nsw i32 %931, %55, !dbg !57 + %988 = mul nuw nsw i32 %932, %55, !dbg !57 + %989 = mul nuw nsw i32 %933, %55, !dbg !57 + %990 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %982, i32 1, i32 31), !dbg !52 + %991 = add i32 %990, %982, !dbg !55 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %983, i32 1, i32 31), !dbg !52 + %993 = add i32 %992, %983, !dbg !55 + %994 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %984, i32 1, i32 31), !dbg !52 + %995 = add i32 %994, %984, !dbg !55 + %996 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %985, i32 1, i32 31), !dbg !52 + %997 = add i32 %996, %985, !dbg !55 + %998 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %986, i32 1, i32 31), !dbg !52 + %999 = add i32 %998, %986, !dbg !55 + %1000 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !52 + %1001 = add i32 %1000, %987, !dbg !55 + %1002 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %988, i32 1, i32 31), !dbg !52 + %1003 = add i32 %1002, %988, !dbg !55 + %1004 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %989, i32 1, i32 31), !dbg !52 + %1005 = add i32 %1004, %989, !dbg !55 + %1006 = mul nuw nsw i32 %926, %53, !dbg !58 + %1007 = mul nuw nsw i32 %927, %53, !dbg !58 + %1008 = mul nuw nsw i32 %928, %53, !dbg !58 + %1009 = mul nuw nsw i32 %929, %53, !dbg !58 + %1010 = mul nuw nsw i32 %930, %53, !dbg !58 + %1011 = mul nuw nsw i32 %931, %53, !dbg !58 + %1012 = mul nuw nsw i32 %932, %53, !dbg !58 + %1013 = mul nuw nsw i32 %933, %53, !dbg !58 + %1014 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1006, i32 1, i32 31), !dbg !52 + %1015 = add i32 %1014, %1006, !dbg !55 + %1016 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1007, i32 1, i32 31), !dbg !52 + %1017 = add i32 %1016, %1007, !dbg !55 + %1018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1008, i32 1, i32 31), !dbg !52 + %1019 = add i32 %1018, %1008, !dbg !55 + %1020 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1009, i32 1, i32 31), !dbg !52 + %1021 = add i32 %1020, %1009, !dbg !55 + %1022 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1010, i32 1, i32 31), !dbg !52 + %1023 = add i32 %1022, %1010, !dbg !55 + %1024 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1011, i32 1, i32 31), !dbg !52 + %1025 = add i32 %1024, %1011, !dbg !55 + %1026 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1012, i32 1, i32 31), !dbg !52 + %1027 = add i32 %1026, %1012, !dbg !55 + %1028 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1013, i32 1, i32 31), !dbg !52 + %1029 = add i32 %1028, %1013, !dbg !55 + %1030 = icmp slt i32 %943, %967, !dbg !44 + %1031 = icmp slt i32 %945, %969, !dbg !44 + %1032 = icmp slt i32 %947, %971, !dbg !44 + %1033 = icmp slt i32 %949, %973, !dbg !44 + %1034 = icmp slt i32 %951, %975, !dbg !44 + %1035 = icmp slt i32 %953, %977, !dbg !44 + %1036 = icmp slt i32 %955, %979, !dbg !44 + %1037 = icmp slt i32 %957, %981, !dbg !44 + %1038 = icmp eq i32 %943, %967, !dbg !60 + %1039 = icmp eq i32 %945, %969, !dbg !60 + %1040 = icmp eq i32 %947, %971, !dbg !60 + %1041 = icmp eq i32 %949, %973, !dbg !60 + %1042 = icmp eq i32 %951, %975, !dbg !60 + %1043 = icmp eq i32 %953, %977, !dbg !60 + %1044 = icmp eq i32 %955, %979, !dbg !60 + %1045 = icmp eq i32 %957, %981, !dbg !60 + %1046 = icmp sgt i32 %991, %1015, !dbg !61 + %1047 = icmp sgt i32 %993, %1017, !dbg !61 + %1048 = icmp sgt i32 %995, %1019, !dbg !61 + %1049 = icmp sgt i32 %997, %1021, !dbg !61 + %1050 = icmp sgt i32 %999, %1023, !dbg !61 + %1051 = icmp sgt i32 %1001, %1025, !dbg !61 + %1052 = icmp sgt i32 %1003, %1027, !dbg !61 + %1053 = icmp sgt i32 %1005, %1029, !dbg !61 + %1054 = and i1 %1038, %1046, !dbg !62 + %1055 = and i1 %1039, %1047, !dbg !62 + %1056 = and i1 %1040, %1048, !dbg !62 + %1057 = and i1 %1041, %1049, !dbg !62 + %1058 = and i1 %1042, %1050, !dbg !62 + %1059 = and i1 %1043, %1051, !dbg !62 + %1060 = and i1 %1044, %1052, !dbg !62 + %1061 = and i1 %1045, %1053, !dbg !62 + %1062 = or i1 %1030, %1054, !dbg !63 + %1063 = or i1 %1031, %1055, !dbg !63 + %1064 = or i1 %1032, %1056, !dbg !63 + %1065 = or i1 %1033, %1057, !dbg !63 + %1066 = or i1 %1034, %1058, !dbg !63 + %1067 = or i1 %1035, %1059, !dbg !63 + %1068 = or i1 %1036, %1060, !dbg !63 + %1069 = or i1 %1037, %1061, !dbg !63 + %1070 = xor i32 %967, %943, !dbg !46 + %1071 = xor i32 %969, %945, !dbg !46 + %1072 = xor i32 %971, %947, !dbg !46 + %1073 = xor i32 %973, %949, !dbg !46 + %1074 = xor i32 %975, %951, !dbg !46 + %1075 = xor i32 %977, %953, !dbg !46 + %1076 = xor i32 %979, %955, !dbg !46 + %1077 = xor i32 %981, %957, !dbg !46 + %1078 = select i1 %1062, i32 %1070, i32 0, !dbg !47 + %1079 = select i1 %1063, i32 %1071, i32 0, !dbg !47 + %1080 = select i1 %1064, i32 %1072, i32 0, !dbg !47 + %1081 = select i1 %1065, i32 %1073, i32 0, !dbg !47 + %1082 = select i1 %1066, i32 %1074, i32 0, !dbg !47 + %1083 = select i1 %1067, i32 %1075, i32 0, !dbg !47 + %1084 = select i1 %1068, i32 %1076, i32 0, !dbg !47 + %1085 = select i1 %1069, i32 %1077, i32 0, !dbg !47 + %1086 = xor i32 %1078, %902, !dbg !48 + %1087 = xor i32 %1079, %903, !dbg !48 + %1088 = xor i32 %1080, %904, !dbg !48 + %1089 = xor i32 %1081, %905, !dbg !48 + %1090 = xor i32 %1082, %906, !dbg !48 + %1091 = xor i32 %1083, %907, !dbg !48 + %1092 = xor i32 %1084, %908, !dbg !48 + %1093 = xor i32 %1085, %909, !dbg !48 + %1094 = xor i32 %1015, %991, !dbg !25 + %1095 = xor i32 %1017, %993, !dbg !25 + %1096 = xor i32 %1019, %995, !dbg !25 + %1097 = xor i32 %1021, %997, !dbg !25 + %1098 = xor i32 %1023, %999, !dbg !25 + %1099 = xor i32 %1025, %1001, !dbg !25 + %1100 = xor i32 %1027, %1003, !dbg !25 + %1101 = xor i32 %1029, %1005, !dbg !25 + %1102 = select i1 %1062, i32 %1094, i32 0, !dbg !49 + %1103 = select i1 %1063, i32 %1095, i32 0, !dbg !49 + %1104 = select i1 %1064, i32 %1096, i32 0, !dbg !49 + %1105 = select i1 %1065, i32 %1097, i32 0, !dbg !49 + %1106 = select i1 %1066, i32 %1098, i32 0, !dbg !49 + %1107 = select i1 %1067, i32 %1099, i32 0, !dbg !49 + %1108 = select i1 %1068, i32 %1100, i32 0, !dbg !49 + %1109 = select i1 %1069, i32 %1101, i32 0, !dbg !49 + %1110 = xor i32 %1102, %926, !dbg !50 + %1111 = xor i32 %1103, %927, !dbg !50 + %1112 = xor i32 %1104, %928, !dbg !50 + %1113 = xor i32 %1105, %929, !dbg !50 + %1114 = xor i32 %1106, %930, !dbg !50 + %1115 = xor i32 %1107, %931, !dbg !50 + %1116 = xor i32 %1108, %932, !dbg !50 + %1117 = xor i32 %1109, %933, !dbg !50 + %1118 = icmp slt i32 %1086, %1088, !dbg !44 + %1119 = icmp slt i32 %1087, %1089, !dbg !44 + %1120 = icmp slt i32 %1090, %1092, !dbg !44 + %1121 = icmp slt i32 %1091, %1093, !dbg !44 + %1122 = icmp eq i32 %1086, %1088, !dbg !60 + %1123 = icmp eq i32 %1087, %1089, !dbg !60 + %1124 = icmp eq i32 %1090, %1092, !dbg !60 + %1125 = icmp eq i32 %1091, %1093, !dbg !60 + %1126 = icmp sgt i32 %1110, %1112, !dbg !61 + %1127 = icmp sgt i32 %1111, %1113, !dbg !61 + %1128 = icmp sgt i32 %1114, %1116, !dbg !61 + %1129 = icmp sgt i32 %1115, %1117, !dbg !61 + %1130 = and i1 %1122, %1126, !dbg !62 + %1131 = and i1 %1123, %1127, !dbg !62 + %1132 = and i1 %1124, %1128, !dbg !62 + %1133 = and i1 %1125, %1129, !dbg !62 + %1134 = or i1 %1118, %1130, !dbg !63 + %1135 = or i1 %1119, %1131, !dbg !63 + %1136 = or i1 %1120, %1132, !dbg !63 + %1137 = or i1 %1121, %1133, !dbg !63 + %1138 = xor i32 %1088, %1086, !dbg !46 + %1139 = xor i32 %1089, %1087, !dbg !46 + %1140 = xor i32 %1092, %1090, !dbg !46 + %1141 = xor i32 %1093, %1091, !dbg !46 + %1142 = select i1 %1134, i32 %1138, i32 0, !dbg !47 + %1143 = select i1 %1135, i32 %1139, i32 0, !dbg !47 + %1144 = select i1 %1136, i32 %1140, i32 0, !dbg !47 + %1145 = select i1 %1137, i32 %1141, i32 0, !dbg !47 + %1146 = xor i32 %1142, %1086, !dbg !48 + %1147 = xor i32 %1143, %1087, !dbg !48 + %1148 = xor i32 %1142, %1088, !dbg !48 + %1149 = xor i32 %1143, %1089, !dbg !48 + %1150 = xor i32 %1144, %1090, !dbg !48 + %1151 = xor i32 %1145, %1091, !dbg !48 + %1152 = xor i32 %1144, %1092, !dbg !48 + %1153 = xor i32 %1145, %1093, !dbg !48 + %1154 = xor i32 %1112, %1110, !dbg !25 + %1155 = xor i32 %1113, %1111, !dbg !25 + %1156 = xor i32 %1116, %1114, !dbg !25 + %1157 = xor i32 %1117, %1115, !dbg !25 + %1158 = select i1 %1134, i32 %1154, i32 0, !dbg !49 + %1159 = select i1 %1135, i32 %1155, i32 0, !dbg !49 + %1160 = select i1 %1136, i32 %1156, i32 0, !dbg !49 + %1161 = select i1 %1137, i32 %1157, i32 0, !dbg !49 + %1162 = xor i32 %1158, %1110, !dbg !50 + %1163 = xor i32 %1159, %1111, !dbg !50 + %1164 = xor i32 %1158, %1112, !dbg !50 + %1165 = xor i32 %1159, %1113, !dbg !50 + %1166 = xor i32 %1160, %1114, !dbg !50 + %1167 = xor i32 %1161, %1115, !dbg !50 + %1168 = xor i32 %1160, %1116, !dbg !50 + %1169 = xor i32 %1161, %1117, !dbg !50 + %1170 = insertelement <2 x i32> poison, i32 %1146, i64 0, !dbg !44 + %1171 = insertelement <2 x i32> %1170, i32 %1148, i64 1, !dbg !44 + %1172 = insertelement <2 x i32> poison, i32 %1147, i64 0, !dbg !44 + %1173 = insertelement <2 x i32> %1172, i32 %1149, i64 1, !dbg !44 + %1174 = icmp slt <2 x i32> %1171, %1173, !dbg !44 + %1175 = insertelement <2 x i32> poison, i32 %1150, i64 0, !dbg !44 + %1176 = insertelement <2 x i32> %1175, i32 %1152, i64 1, !dbg !44 + %1177 = insertelement <2 x i32> poison, i32 %1151, i64 0, !dbg !44 + %1178 = insertelement <2 x i32> %1177, i32 %1153, i64 1, !dbg !44 + %1179 = icmp slt <2 x i32> %1176, %1178, !dbg !44 + %1180 = insertelement <2 x i32> %1172, i32 %1148, i64 1, !dbg !60 + %1181 = insertelement <2 x i32> %1170, i32 %1149, i64 1, !dbg !60 + %1182 = icmp eq <2 x i32> %1180, %1181, !dbg !60 + %1183 = insertelement <2 x i32> poison, i32 %1162, i64 0, !dbg !61 + %1184 = insertelement <2 x i32> %1183, i32 %1164, i64 1, !dbg !61 + %1185 = insertelement <2 x i32> poison, i32 %1163, i64 0, !dbg !61 + %1186 = insertelement <2 x i32> %1185, i32 %1165, i64 1, !dbg !61 + %1187 = icmp sgt <2 x i32> %1184, %1186, !dbg !61 + %1188 = and <2 x i1> %1182, %1187, !dbg !62 + %1189 = insertelement <2 x i32> %1177, i32 %1152, i64 1, !dbg !60 + %1190 = insertelement <2 x i32> %1175, i32 %1153, i64 1, !dbg !60 + %1191 = icmp eq <2 x i32> %1189, %1190, !dbg !60 + %1192 = insertelement <2 x i32> poison, i32 %1166, i64 0, !dbg !61 + %1193 = insertelement <2 x i32> %1192, i32 %1168, i64 1, !dbg !61 + %1194 = insertelement <2 x i32> poison, i32 %1167, i64 0, !dbg !61 + %1195 = insertelement <2 x i32> %1194, i32 %1169, i64 1, !dbg !61 + %1196 = icmp sgt <2 x i32> %1193, %1195, !dbg !61 + %1197 = and <2 x i1> %1191, %1196, !dbg !62 + %1198 = or <2 x i1> %1174, %1188, !dbg !63 + %1199 = insertelement <2 x i32> %1183, i32 %1165, i64 1, !dbg !25 + %1200 = insertelement <2 x i32> %1185, i32 %1164, i64 1, !dbg !25 + %1201 = xor <2 x i32> %1199, %1200, !dbg !25 + %1202 = select <2 x i1> %1198, <2 x i32> %1201, <2 x i32> zeroinitializer, !dbg !49 + %1203 = shufflevector <2 x i32> %1202, <2 x i32> poison, <4 x i32> , !dbg !49 + %1204 = insertelement <4 x i32> poison, i32 %1162, i64 0, !dbg !50 + %1205 = insertelement <4 x i32> %1204, i32 %1163, i64 1, !dbg !50 + %1206 = insertelement <4 x i32> %1205, i32 %1164, i64 2, !dbg !50 + %1207 = insertelement <4 x i32> %1206, i32 %1165, i64 3, !dbg !50 + %1208 = xor <4 x i32> %1203, %1207, !dbg !50 + %1209 = or <2 x i1> %1179, %1197, !dbg !63 + %1210 = insertelement <2 x i32> %1192, i32 %1169, i64 1, !dbg !25 + %1211 = insertelement <2 x i32> %1194, i32 %1168, i64 1, !dbg !25 + %1212 = xor <2 x i32> %1210, %1211, !dbg !25 + %1213 = select <2 x i1> %1209, <2 x i32> %1212, <2 x i32> zeroinitializer, !dbg !49 + %1214 = shufflevector <2 x i32> %1213, <2 x i32> poison, <4 x i32> , !dbg !49 + %1215 = insertelement <4 x i32> poison, i32 %1166, i64 0, !dbg !50 + %1216 = insertelement <4 x i32> %1215, i32 %1167, i64 1, !dbg !50 + %1217 = insertelement <4 x i32> %1216, i32 %1168, i64 2, !dbg !50 + %1218 = insertelement <4 x i32> %1217, i32 %1169, i64 3, !dbg !50 + %1219 = xor <4 x i32> %1214, %1218, !dbg !50 + %1220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %207, i32 1, i32 31), !dbg !64 + %1221 = add i32 %1220, %207, !dbg !65 + %1222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 1, i32 31), !dbg !64 + %1223 = add i32 %1222, %208, !dbg !65 + %1224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !64 + %1225 = add i32 %1224, %209, !dbg !65 + %1226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !64 + %1227 = add i32 %1226, %210, !dbg !65 + %1228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !64 + %1229 = add i32 %1228, %211, !dbg !65 + %1230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !64 + %1231 = add i32 %1230, %212, !dbg !65 + %1232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !64 + %1233 = add i32 %1232, %213, !dbg !65 + %1234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !64 + %1235 = add i32 %1234, %214, !dbg !65 + %1236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !64 + %1237 = add i32 %1236, %215, !dbg !65 + %1238 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !64 + %1239 = add i32 %1238, %216, !dbg !65 + %1240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !64 + %1241 = add i32 %1240, %217, !dbg !65 + %1242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 1, i32 31), !dbg !64 + %1243 = add i32 %1242, %218, !dbg !65 + %1244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !64 + %1245 = add i32 %1244, %219, !dbg !65 + %1246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 1, i32 31), !dbg !64 + %1247 = add i32 %1246, %220, !dbg !65 + %1248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !64 + %1249 = add i32 %1248, %221, !dbg !65 + %1250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !64 + %1251 = add i32 %1250, %222, !dbg !65 + %1252 = icmp sge i32 %1221, %1237, !dbg !37 + %1253 = icmp ne i32 %1221, %1237, !dbg !37 + %1254 = icmp sge i32 %1223, %1239, !dbg !37 + %1255 = icmp ne i32 %1223, %1239, !dbg !37 + %1256 = icmp sge i32 %1225, %1241, !dbg !37 + %1257 = icmp ne i32 %1225, %1241, !dbg !37 + %1258 = icmp sge i32 %1227, %1243, !dbg !37 + %1259 = icmp ne i32 %1227, %1243, !dbg !37 + %1260 = icmp sge i32 %1229, %1245, !dbg !37 + %1261 = icmp ne i32 %1229, %1245, !dbg !37 + %1262 = icmp sge i32 %1231, %1247, !dbg !37 + %1263 = icmp ne i32 %1231, %1247, !dbg !37 + %1264 = icmp sge i32 %1233, %1249, !dbg !37 + %1265 = icmp ne i32 %1233, %1249, !dbg !37 + %1266 = icmp sge i32 %1235, %1251, !dbg !37 + %1267 = icmp ne i32 %1235, %1251, !dbg !37 + %1268 = xor i32 %1237, %1221, !dbg !32 + %1269 = xor i32 %1239, %1223, !dbg !32 + %1270 = xor i32 %1241, %1225, !dbg !32 + %1271 = xor i32 %1243, %1227, !dbg !32 + %1272 = xor i32 %1245, %1229, !dbg !32 + %1273 = xor i32 %1247, %1231, !dbg !32 + %1274 = xor i32 %1249, %1233, !dbg !32 + %1275 = xor i32 %1251, %1235, !dbg !32 + %1276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !64 + %1277 = add i32 %1276, %232, !dbg !65 + %1278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 1, i32 31), !dbg !64 + %1279 = add i32 %1278, %234, !dbg !65 + %1280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !64 + %1281 = add i32 %1280, %236, !dbg !65 + %1282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !64 + %1283 = add i32 %1282, %238, !dbg !65 + %1284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %262, i32 1, i32 31), !dbg !64 + %1285 = add i32 %1284, %262, !dbg !65 + %1286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 1, i32 31), !dbg !64 + %1287 = add i32 %1286, %264, !dbg !65 + %1288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !64 + %1289 = add i32 %1288, %266, !dbg !65 + %1290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !64 + %1291 = add i32 %1290, %268, !dbg !65 + %1292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !64 + %1293 = add i32 %1292, %239, !dbg !65 + %1294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !64 + %1295 = add i32 %1294, %240, !dbg !65 + %1296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %241, i32 1, i32 31), !dbg !64 + %1297 = add i32 %1296, %241, !dbg !65 + %1298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 1, i32 31), !dbg !64 + %1299 = add i32 %1298, %242, !dbg !65 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !64 + %1301 = add i32 %1300, %269, !dbg !65 + %1302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %270, i32 1, i32 31), !dbg !64 + %1303 = add i32 %1302, %270, !dbg !65 + %1304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !64 + %1305 = add i32 %1304, %271, !dbg !65 + %1306 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !64 + %1307 = add i32 %1306, %272, !dbg !65 + %1308 = icmp sle i32 %1277, %1293, !dbg !37 + %1309 = or i1 %1253, %1308, !dbg !37 + %1310 = and i1 %1252, %1309, !dbg !37 + %.not36 = xor i1 %1310, %60, !dbg !37 + %1311 = icmp sle i32 %1279, %1295, !dbg !37 + %1312 = or i1 %1255, %1311, !dbg !37 + %1313 = and i1 %1254, %1312, !dbg !37 + %.not37 = xor i1 %1313, %60, !dbg !37 + %1314 = icmp sle i32 %1281, %1297, !dbg !37 + %1315 = or i1 %1257, %1314, !dbg !37 + %1316 = and i1 %1256, %1315, !dbg !37 + %.not38 = xor i1 %1316, %60, !dbg !37 + %1317 = icmp sle i32 %1283, %1299, !dbg !37 + %1318 = or i1 %1259, %1317, !dbg !37 + %1319 = and i1 %1258, %1318, !dbg !37 + %.not39 = xor i1 %1319, %60, !dbg !37 + %1320 = icmp sle i32 %1285, %1301, !dbg !37 + %1321 = or i1 %1261, %1320, !dbg !37 + %1322 = and i1 %1260, %1321, !dbg !37 + %.not40 = xor i1 %1322, %60, !dbg !37 + %1323 = icmp sle i32 %1287, %1303, !dbg !37 + %1324 = or i1 %1263, %1323, !dbg !37 + %1325 = and i1 %1262, %1324, !dbg !37 + %.not41 = xor i1 %1325, %60, !dbg !37 + %1326 = icmp sle i32 %1289, %1305, !dbg !37 + %1327 = or i1 %1265, %1326, !dbg !37 + %1328 = and i1 %1264, %1327, !dbg !37 + %.not42 = xor i1 %1328, %60, !dbg !37 + %1329 = icmp sle i32 %1291, %1307, !dbg !37 + %1330 = or i1 %1267, %1329, !dbg !37 + %1331 = and i1 %1266, %1330, !dbg !37 + %.not43 = xor i1 %1331, %60, !dbg !37 + %1332 = select i1 %.not36, i32 0, i32 %1268, !dbg !33 + %1333 = select i1 %.not37, i32 0, i32 %1269, !dbg !33 + %1334 = select i1 %.not38, i32 0, i32 %1270, !dbg !33 + %1335 = select i1 %.not39, i32 0, i32 %1271, !dbg !33 + %1336 = select i1 %.not40, i32 0, i32 %1272, !dbg !33 + %1337 = select i1 %.not41, i32 0, i32 %1273, !dbg !33 + %1338 = select i1 %.not42, i32 0, i32 %1274, !dbg !33 + %1339 = select i1 %.not43, i32 0, i32 %1275, !dbg !33 + %1340 = xor i32 %1332, %191, !dbg !34 + %1341 = xor i32 %1333, %192, !dbg !34 + %1342 = xor i32 %1334, %193, !dbg !34 + %1343 = xor i32 %1335, %194, !dbg !34 + %1344 = xor i32 %1336, %195, !dbg !34 + %1345 = xor i32 %1337, %196, !dbg !34 + %1346 = xor i32 %1338, %197, !dbg !34 + %1347 = xor i32 %1339, %198, !dbg !34 + %1348 = xor i32 %1293, %1277, !dbg !38 + %1349 = xor i32 %1295, %1279, !dbg !38 + %1350 = xor i32 %1297, %1281, !dbg !38 + %1351 = xor i32 %1299, %1283, !dbg !38 + %1352 = xor i32 %1301, %1285, !dbg !38 + %1353 = xor i32 %1303, %1287, !dbg !38 + %1354 = xor i32 %1305, %1289, !dbg !38 + %1355 = xor i32 %1307, %1291, !dbg !38 + %1356 = insertelement <4 x i1> poison, i1 %.not36, i64 0, !dbg !35 + %1357 = insertelement <4 x i1> %1356, i1 %.not37, i64 1, !dbg !35 + %1358 = insertelement <4 x i1> %1357, i1 %.not38, i64 2, !dbg !35 + %1359 = insertelement <4 x i1> %1358, i1 %.not39, i64 3, !dbg !35 + %1360 = insertelement <4 x i32> poison, i32 %1348, i64 0, !dbg !35 + %1361 = insertelement <4 x i32> %1360, i32 %1349, i64 1, !dbg !35 + %1362 = insertelement <4 x i32> %1361, i32 %1350, i64 2, !dbg !35 + %1363 = insertelement <4 x i32> %1362, i32 %1351, i64 3, !dbg !35 + %1364 = select <4 x i1> %1359, <4 x i32> zeroinitializer, <4 x i32> %1363, !dbg !35 + %1365 = insertelement <4 x i1> poison, i1 %.not40, i64 0, !dbg !35 + %1366 = insertelement <4 x i1> %1365, i1 %.not41, i64 1, !dbg !35 + %1367 = insertelement <4 x i1> %1366, i1 %.not42, i64 2, !dbg !35 + %1368 = insertelement <4 x i1> %1367, i1 %.not43, i64 3, !dbg !35 + %1369 = insertelement <4 x i32> poison, i32 %1352, i64 0, !dbg !35 + %1370 = insertelement <4 x i32> %1369, i32 %1353, i64 1, !dbg !35 + %1371 = insertelement <4 x i32> %1370, i32 %1354, i64 2, !dbg !35 + %1372 = insertelement <4 x i32> %1371, i32 %1355, i64 3, !dbg !35 + %1373 = select <4 x i1> %1368, <4 x i32> zeroinitializer, <4 x i32> %1372, !dbg !35 + %1374 = xor <4 x i32> %1364, %230, !dbg !36 + %1375 = xor <4 x i32> %1373, %260, !dbg !36 + %1376 = icmp sge i32 %1340, %1342, !dbg !37 + %1377 = icmp ne i32 %1340, %1342, !dbg !37 + %shift = shufflevector <4 x i32> %1374, <4 x i32> poison, <4 x i32> , !dbg !37 + %1378 = icmp sle <4 x i32> %1374, %shift, !dbg !37 + %1379 = extractelement <4 x i1> %1378, i64 0, !dbg !37 + %1380 = or i1 %1377, %1379, !dbg !37 + %1381 = and i1 %1376, %1380, !dbg !37 + %1382 = icmp sge i32 %1341, %1343, !dbg !37 + %1383 = icmp ne i32 %1341, %1343, !dbg !37 + %shift109 = shufflevector <4 x i32> %1374, <4 x i32> poison, <4 x i32> , !dbg !37 + %1384 = icmp sle <4 x i32> %1374, %shift109, !dbg !37 + %1385 = extractelement <4 x i1> %1384, i64 1, !dbg !37 + %1386 = or i1 %1383, %1385, !dbg !37 + %1387 = and i1 %1382, %1386, !dbg !37 + %1388 = insertelement <2 x i1> poison, i1 %1387, i64 0, !dbg !37 + %1389 = insertelement <2 x i1> %1388, i1 %1381, i64 1, !dbg !37 + %1390 = xor <2 x i1> %1389, %244, !dbg !37 + %1391 = icmp sge i32 %1344, %1346, !dbg !37 + %1392 = icmp ne i32 %1344, %1346, !dbg !37 + %shift110 = shufflevector <4 x i32> %1375, <4 x i32> poison, <4 x i32> , !dbg !37 + %1393 = icmp sle <4 x i32> %1375, %shift110, !dbg !37 + %1394 = extractelement <4 x i1> %1393, i64 0, !dbg !37 + %1395 = or i1 %1392, %1394, !dbg !37 + %1396 = and i1 %1391, %1395, !dbg !37 + %1397 = icmp sge i32 %1345, %1347, !dbg !37 + %1398 = icmp ne i32 %1345, %1347, !dbg !37 + %shift111 = shufflevector <4 x i32> %1375, <4 x i32> poison, <4 x i32> , !dbg !37 + %1399 = icmp sle <4 x i32> %1375, %shift111, !dbg !37 + %1400 = extractelement <4 x i1> %1399, i64 1, !dbg !37 + %1401 = or i1 %1398, %1400, !dbg !37 + %1402 = and i1 %1397, %1401, !dbg !37 + %1403 = insertelement <2 x i1> poison, i1 %1402, i64 0, !dbg !37 + %1404 = insertelement <2 x i1> %1403, i1 %1396, i64 1, !dbg !37 + %1405 = xor <2 x i1> %1404, %244, !dbg !37 + %1406 = xor i32 %1342, %1340, !dbg !32 + %1407 = xor i32 %1343, %1341, !dbg !32 + %1408 = xor i32 %1346, %1344, !dbg !32 + %1409 = xor i32 %1347, %1345, !dbg !32 + %1410 = extractelement <2 x i1> %1390, i64 1, !dbg !33 + %1411 = select i1 %1410, i32 0, i32 %1406, !dbg !33 + %1412 = extractelement <2 x i1> %1390, i64 0, !dbg !33 + %1413 = select i1 %1412, i32 0, i32 %1407, !dbg !33 + %1414 = extractelement <2 x i1> %1405, i64 1, !dbg !33 + %1415 = select i1 %1414, i32 0, i32 %1408, !dbg !33 + %1416 = extractelement <2 x i1> %1405, i64 0, !dbg !33 + %1417 = select i1 %1416, i32 0, i32 %1409, !dbg !33 + %1418 = xor i32 %1411, %1340, !dbg !34 + %1419 = xor i32 %1413, %1341, !dbg !34 + %1420 = xor i32 %1411, %1342, !dbg !34 + %1421 = xor i32 %1413, %1343, !dbg !34 + %1422 = xor i32 %1415, %1344, !dbg !34 + %1423 = xor i32 %1417, %1345, !dbg !34 + %1424 = xor i32 %1415, %1346, !dbg !34 + %1425 = xor i32 %1417, %1347, !dbg !34 + %1426 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !38 + %1427 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !38 + %1428 = xor <2 x i32> %1426, %1427, !dbg !38 + %1429 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !38 + %1430 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !38 + %1431 = xor <2 x i32> %1429, %1430, !dbg !38 + %1432 = select <2 x i1> %1390, <2 x i32> zeroinitializer, <2 x i32> %1428, !dbg !35 + %1433 = shufflevector <2 x i32> %1432, <2 x i32> poison, <4 x i32> , !dbg !35 + %1434 = select <2 x i1> %1405, <2 x i32> zeroinitializer, <2 x i32> %1431, !dbg !35 + %1435 = shufflevector <2 x i32> %1434, <2 x i32> poison, <4 x i32> , !dbg !35 + %1436 = shufflevector <2 x i32> %1432, <2 x i32> poison, <2 x i32> , !dbg !36 + %1437 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !36 + %1438 = xor <2 x i32> %1436, %1437, !dbg !36 + %1439 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !36 + %1440 = xor <2 x i32> %1432, %1439, !dbg !36 + %1441 = shufflevector <2 x i32> %1434, <2 x i32> poison, <2 x i32> , !dbg !36 + %1442 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !36 + %1443 = xor <2 x i32> %1441, %1442, !dbg !36 + %1444 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !36 + %1445 = xor <2 x i32> %1434, %1444, !dbg !36 + %1446 = icmp sge i32 %1418, %1419, !dbg !37 + %1447 = icmp ne i32 %1418, %1419, !dbg !37 + %1448 = extractelement <2 x i32> %1440, i64 1, !dbg !37 + %1449 = extractelement <2 x i32> %1438, i64 1, !dbg !37 + %1450 = icmp sle i32 %1448, %1449, !dbg !37 + %1451 = or i1 %1447, %1450, !dbg !37 + %1452 = and i1 %1446, %1451, !dbg !37 + %.not48 = xor i1 %1452, %60, !dbg !37 + %1453 = icmp sge i32 %1420, %1421, !dbg !37 + %1454 = icmp ne i32 %1420, %1421, !dbg !37 + %1455 = extractelement <2 x i32> %1440, i64 0, !dbg !37 + %1456 = extractelement <2 x i32> %1438, i64 0, !dbg !37 + %1457 = icmp sle i32 %1456, %1455, !dbg !37 + %1458 = or i1 %1454, %1457, !dbg !37 + %1459 = and i1 %1453, %1458, !dbg !37 + %.not49 = xor i1 %1459, %60, !dbg !37 + %1460 = icmp sge i32 %1422, %1423, !dbg !37 + %1461 = icmp ne i32 %1422, %1423, !dbg !37 + %1462 = extractelement <2 x i32> %1445, i64 1, !dbg !37 + %1463 = extractelement <2 x i32> %1443, i64 1, !dbg !37 + %1464 = icmp sle i32 %1462, %1463, !dbg !37 + %1465 = or i1 %1461, %1464, !dbg !37 + %1466 = and i1 %1460, %1465, !dbg !37 + %.not50 = xor i1 %1466, %60, !dbg !37 + %1467 = icmp sge i32 %1424, %1425, !dbg !37 + %1468 = icmp ne i32 %1424, %1425, !dbg !37 + %1469 = extractelement <2 x i32> %1445, i64 0, !dbg !37 + %1470 = extractelement <2 x i32> %1443, i64 0, !dbg !37 + %1471 = icmp sle i32 %1470, %1469, !dbg !37 + %1472 = or i1 %1468, %1471, !dbg !37 + %1473 = and i1 %1467, %1472, !dbg !37 + %.not51 = xor i1 %1473, %60, !dbg !37 + %1474 = xor i32 %1419, %1418, !dbg !32 + %1475 = xor i32 %1421, %1420, !dbg !32 + %1476 = xor i32 %1423, %1422, !dbg !32 + %1477 = xor i32 %1425, %1424, !dbg !32 + %1478 = select i1 %.not48, i32 0, i32 %1474, !dbg !33 + %1479 = select i1 %.not49, i32 0, i32 %1475, !dbg !33 + %1480 = select i1 %.not50, i32 0, i32 %1476, !dbg !33 + %1481 = select i1 %.not51, i32 0, i32 %1477, !dbg !33 + %1482 = insertelement <2 x i32> poison, i32 %1479, i64 0, !dbg !34 + %1483 = insertelement <2 x i32> %1482, i32 %1478, i64 1, !dbg !34 + %1484 = insertelement <2 x i32> poison, i32 %1420, i64 0, !dbg !34 + %1485 = insertelement <2 x i32> %1484, i32 %1418, i64 1, !dbg !34 + %1486 = xor <2 x i32> %1483, %1485, !dbg !34 + %1487 = insertelement <2 x i32> poison, i32 %1421, i64 0, !dbg !34 + %1488 = insertelement <2 x i32> %1487, i32 %1419, i64 1, !dbg !34 + %1489 = xor <2 x i32> %1483, %1488, !dbg !34 + %1490 = insertelement <2 x i32> poison, i32 %1481, i64 0, !dbg !34 + %1491 = insertelement <2 x i32> %1490, i32 %1480, i64 1, !dbg !34 + %1492 = insertelement <2 x i32> poison, i32 %1424, i64 0, !dbg !34 + %1493 = insertelement <2 x i32> %1492, i32 %1422, i64 1, !dbg !34 + %1494 = xor <2 x i32> %1491, %1493, !dbg !34 + %1495 = insertelement <2 x i32> poison, i32 %1425, i64 0, !dbg !34 + %1496 = insertelement <2 x i32> %1495, i32 %1423, i64 1, !dbg !34 + %1497 = xor <2 x i32> %1491, %1496, !dbg !34 + %1498 = xor i32 %1449, %1448, !dbg !38 + %1499 = xor i32 %1455, %1456, !dbg !38 + %1500 = xor i32 %1463, %1462, !dbg !38 + %1501 = xor i32 %1469, %1470, !dbg !38 + %1502 = insertelement <2 x i1> poison, i1 %.not49, i64 0, !dbg !35 + %1503 = insertelement <2 x i1> %1502, i1 %.not48, i64 1, !dbg !35 + %1504 = insertelement <2 x i32> poison, i32 %1499, i64 0, !dbg !35 + %1505 = insertelement <2 x i32> %1504, i32 %1498, i64 1, !dbg !35 + %1506 = select <2 x i1> %1503, <2 x i32> zeroinitializer, <2 x i32> %1505, !dbg !35 + %1507 = insertelement <2 x i1> poison, i1 %.not51, i64 0, !dbg !35 + %1508 = insertelement <2 x i1> %1507, i1 %.not50, i64 1, !dbg !35 + %1509 = insertelement <2 x i32> poison, i32 %1501, i64 0, !dbg !35 + %1510 = insertelement <2 x i32> %1509, i32 %1500, i64 1, !dbg !35 + %1511 = select <2 x i1> %1508, <2 x i32> zeroinitializer, <2 x i32> %1510, !dbg !35 + %1512 = shufflevector <2 x i32> %1506, <2 x i32> poison, <4 x i32> , !dbg !36 + %1513 = xor <4 x i32> %1433, %1512, !dbg !36 + %1514 = xor <2 x i32> %1506, %1438, !dbg !36 + %1515 = xor <2 x i32> %1506, %1440, !dbg !36 + %1516 = shufflevector <2 x i32> %1511, <2 x i32> poison, <4 x i32> , !dbg !36 + %1517 = xor <4 x i32> %1435, %1516, !dbg !36 + %1518 = xor <2 x i32> %1511, %1443, !dbg !36 + %1519 = xor <2 x i32> %1511, %1445, !dbg !36 + %1520 = extractelement <2 x i32> %1486, i64 1, !dbg !40 + %1521 = mul nuw nsw i32 %1520, %56, !dbg !39 + %1522 = extractelement <2 x i32> %1489, i64 1, !dbg !40 + %1523 = mul nuw nsw i32 %1522, %56, !dbg !39 + %1524 = extractelement <2 x i32> %1486, i64 0, !dbg !40 + %1525 = mul nuw nsw i32 %1524, %56, !dbg !39 + %1526 = extractelement <2 x i32> %1489, i64 0, !dbg !40 + %1527 = mul nuw nsw i32 %1526, %56, !dbg !39 + %1528 = extractelement <2 x i32> %1494, i64 1, !dbg !40 + %1529 = mul nuw nsw i32 %1528, %56, !dbg !39 + %1530 = extractelement <2 x i32> %1497, i64 1, !dbg !40 + %1531 = mul nuw nsw i32 %1530, %56, !dbg !39 + %1532 = extractelement <2 x i32> %1494, i64 0, !dbg !40 + %1533 = mul nuw nsw i32 %1532, %56, !dbg !39 + %1534 = extractelement <2 x i32> %1497, i64 0, !dbg !40 + %1535 = mul nuw nsw i32 %1534, %56, !dbg !39 + %1536 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1521, i32 2, i32 31), !dbg !64 + %1537 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1523, i32 2, i32 31), !dbg !64 + %1538 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1525, i32 2, i32 31), !dbg !64 + %1539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1527, i32 2, i32 31), !dbg !64 + %1540 = insertelement <4 x i32> poison, i32 %1521, i64 0, !dbg !65 + %1541 = insertelement <4 x i32> %1540, i32 %1523, i64 1, !dbg !65 + %1542 = insertelement <4 x i32> %1541, i32 %1525, i64 2, !dbg !65 + %1543 = insertelement <4 x i32> %1542, i32 %1527, i64 3, !dbg !65 + %1544 = insertelement <4 x i32> poison, i32 %1536, i64 0, !dbg !65 + %1545 = insertelement <4 x i32> %1544, i32 %1537, i64 1, !dbg !65 + %1546 = insertelement <4 x i32> %1545, i32 %1538, i64 2, !dbg !65 + %1547 = insertelement <4 x i32> %1546, i32 %1539, i64 3, !dbg !65 + %1548 = add <4 x i32> %1543, %1547, !dbg !65 + %1549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1529, i32 2, i32 31), !dbg !64 + %1550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1531, i32 2, i32 31), !dbg !64 + %1551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1533, i32 2, i32 31), !dbg !64 + %1552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1535, i32 2, i32 31), !dbg !64 + %1553 = insertelement <4 x i32> poison, i32 %1529, i64 0, !dbg !65 + %1554 = insertelement <4 x i32> %1553, i32 %1531, i64 1, !dbg !65 + %1555 = insertelement <4 x i32> %1554, i32 %1533, i64 2, !dbg !65 + %1556 = insertelement <4 x i32> %1555, i32 %1535, i64 3, !dbg !65 + %1557 = insertelement <4 x i32> poison, i32 %1549, i64 0, !dbg !65 + %1558 = insertelement <4 x i32> %1557, i32 %1550, i64 1, !dbg !65 + %1559 = insertelement <4 x i32> %1558, i32 %1551, i64 2, !dbg !65 + %1560 = insertelement <4 x i32> %1559, i32 %1552, i64 3, !dbg !65 + %1561 = add <4 x i32> %1556, %1560, !dbg !65 + %1562 = mul nuw nsw i32 %1520, %.lobit, !dbg !40 + %1563 = mul nuw nsw i32 %1522, %.lobit, !dbg !40 + %1564 = mul nuw nsw i32 %1524, %.lobit, !dbg !40 + %1565 = mul nuw nsw i32 %1526, %.lobit, !dbg !40 + %1566 = mul nuw nsw i32 %1528, %.lobit, !dbg !40 + %1567 = mul nuw nsw i32 %1530, %.lobit, !dbg !40 + %1568 = mul nuw nsw i32 %1532, %.lobit, !dbg !40 + %1569 = mul nuw nsw i32 %1534, %.lobit, !dbg !40 + %1570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1562, i32 2, i32 31), !dbg !64 + %1571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1563, i32 2, i32 31), !dbg !64 + %1572 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1564, i32 2, i32 31), !dbg !64 + %1573 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1565, i32 2, i32 31), !dbg !64 + %1574 = insertelement <4 x i32> poison, i32 %1562, i64 0, !dbg !65 + %1575 = insertelement <4 x i32> %1574, i32 %1563, i64 1, !dbg !65 + %1576 = insertelement <4 x i32> %1575, i32 %1564, i64 2, !dbg !65 + %1577 = insertelement <4 x i32> %1576, i32 %1565, i64 3, !dbg !65 + %1578 = insertelement <4 x i32> poison, i32 %1570, i64 0, !dbg !65 + %1579 = insertelement <4 x i32> %1578, i32 %1571, i64 1, !dbg !65 + %1580 = insertelement <4 x i32> %1579, i32 %1572, i64 2, !dbg !65 + %1581 = insertelement <4 x i32> %1580, i32 %1573, i64 3, !dbg !65 + %1582 = add <4 x i32> %1577, %1581, !dbg !65 + %1583 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1566, i32 2, i32 31), !dbg !64 + %1584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1567, i32 2, i32 31), !dbg !64 + %1585 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1568, i32 2, i32 31), !dbg !64 + %1586 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1569, i32 2, i32 31), !dbg !64 + %1587 = insertelement <4 x i32> poison, i32 %1566, i64 0, !dbg !65 + %1588 = insertelement <4 x i32> %1587, i32 %1567, i64 1, !dbg !65 + %1589 = insertelement <4 x i32> %1588, i32 %1568, i64 2, !dbg !65 + %1590 = insertelement <4 x i32> %1589, i32 %1569, i64 3, !dbg !65 + %1591 = insertelement <4 x i32> poison, i32 %1583, i64 0, !dbg !65 + %1592 = insertelement <4 x i32> %1591, i32 %1584, i64 1, !dbg !65 + %1593 = insertelement <4 x i32> %1592, i32 %1585, i64 2, !dbg !65 + %1594 = insertelement <4 x i32> %1593, i32 %1586, i64 3, !dbg !65 + %1595 = add <4 x i32> %1590, %1594, !dbg !65 + %1596 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <4 x i32> , !dbg !42 + %1597 = mul nuw nsw <4 x i32> %1596, %246, !dbg !42 + %1598 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <4 x i32> , !dbg !42 + %1599 = mul nuw nsw <4 x i32> %1598, %246, !dbg !42 + %1600 = extractelement <4 x i32> %1597, i64 0, !dbg !64 + %1601 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1600, i32 2, i32 31), !dbg !64 + %1602 = extractelement <4 x i32> %1597, i64 1, !dbg !64 + %1603 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1602, i32 2, i32 31), !dbg !64 + %1604 = extractelement <4 x i32> %1597, i64 2, !dbg !64 + %1605 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1604, i32 2, i32 31), !dbg !64 + %1606 = extractelement <4 x i32> %1597, i64 3, !dbg !64 + %1607 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1606, i32 2, i32 31), !dbg !64 + %1608 = insertelement <4 x i32> poison, i32 %1601, i64 0, !dbg !65 + %1609 = insertelement <4 x i32> %1608, i32 %1603, i64 1, !dbg !65 + %1610 = insertelement <4 x i32> %1609, i32 %1605, i64 2, !dbg !65 + %1611 = insertelement <4 x i32> %1610, i32 %1607, i64 3, !dbg !65 + %1612 = add <4 x i32> %1611, %1597, !dbg !65 + %1613 = extractelement <4 x i32> %1599, i64 0, !dbg !64 + %1614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1613, i32 2, i32 31), !dbg !64 + %1615 = extractelement <4 x i32> %1599, i64 1, !dbg !64 + %1616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1615, i32 2, i32 31), !dbg !64 + %1617 = extractelement <4 x i32> %1599, i64 2, !dbg !64 + %1618 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1617, i32 2, i32 31), !dbg !64 + %1619 = extractelement <4 x i32> %1599, i64 3, !dbg !64 + %1620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1619, i32 2, i32 31), !dbg !64 + %1621 = insertelement <4 x i32> poison, i32 %1614, i64 0, !dbg !65 + %1622 = insertelement <4 x i32> %1621, i32 %1616, i64 1, !dbg !65 + %1623 = insertelement <4 x i32> %1622, i32 %1618, i64 2, !dbg !65 + %1624 = insertelement <4 x i32> %1623, i32 %1620, i64 3, !dbg !65 + %1625 = add <4 x i32> %1624, %1599, !dbg !65 + %1626 = mul nuw nsw <4 x i32> %1596, %248, !dbg !41 + %1627 = mul nuw nsw <4 x i32> %1598, %248, !dbg !41 + %1628 = extractelement <4 x i32> %1626, i64 0, !dbg !64 + %1629 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1628, i32 2, i32 31), !dbg !64 + %1630 = extractelement <4 x i32> %1626, i64 1, !dbg !64 + %1631 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1630, i32 2, i32 31), !dbg !64 + %1632 = extractelement <4 x i32> %1626, i64 2, !dbg !64 + %1633 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1632, i32 2, i32 31), !dbg !64 + %1634 = extractelement <4 x i32> %1626, i64 3, !dbg !64 + %1635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1634, i32 2, i32 31), !dbg !64 + %1636 = insertelement <4 x i32> poison, i32 %1629, i64 0, !dbg !65 + %1637 = insertelement <4 x i32> %1636, i32 %1631, i64 1, !dbg !65 + %1638 = insertelement <4 x i32> %1637, i32 %1633, i64 2, !dbg !65 + %1639 = insertelement <4 x i32> %1638, i32 %1635, i64 3, !dbg !65 + %1640 = add <4 x i32> %1639, %1626, !dbg !65 + %1641 = extractelement <4 x i32> %1627, i64 0, !dbg !64 + %1642 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1641, i32 2, i32 31), !dbg !64 + %1643 = extractelement <4 x i32> %1627, i64 1, !dbg !64 + %1644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1643, i32 2, i32 31), !dbg !64 + %1645 = extractelement <4 x i32> %1627, i64 2, !dbg !64 + %1646 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1645, i32 2, i32 31), !dbg !64 + %1647 = extractelement <4 x i32> %1627, i64 3, !dbg !64 + %1648 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1647, i32 2, i32 31), !dbg !64 + %1649 = insertelement <4 x i32> poison, i32 %1642, i64 0, !dbg !65 + %1650 = insertelement <4 x i32> %1649, i32 %1644, i64 1, !dbg !65 + %1651 = insertelement <4 x i32> %1650, i32 %1646, i64 2, !dbg !65 + %1652 = insertelement <4 x i32> %1651, i32 %1648, i64 3, !dbg !65 + %1653 = add <4 x i32> %1652, %1627, !dbg !65 + %1654 = icmp slt <4 x i32> %1548, %1582, !dbg !29 + %1655 = icmp slt <4 x i32> %1561, %1595, !dbg !29 + %1656 = icmp eq <4 x i32> %1548, %1582, !dbg !66 + %1657 = icmp eq <4 x i32> %1561, %1595, !dbg !66 + %1658 = icmp sgt <4 x i32> %1612, %1640, !dbg !67 + %1659 = icmp sgt <4 x i32> %1625, %1653, !dbg !67 + %1660 = and <4 x i1> %1656, %1658, !dbg !68 + %1661 = and <4 x i1> %1657, %1659, !dbg !68 + %1662 = or <4 x i1> %1654, %1660, !dbg !69 + %1663 = or <4 x i1> %1654, %1660, !dbg !69 + %1664 = shufflevector <4 x i1> %1663, <4 x i1> poison, <2 x i32> , !dbg !69 + %1665 = or <4 x i1> %1654, %1660, !dbg !69 + %1666 = shufflevector <4 x i1> %1665, <4 x i1> poison, <2 x i32> , !dbg !69 + %1667 = or <4 x i1> %1655, %1661, !dbg !69 + %1668 = or <4 x i1> %1655, %1661, !dbg !69 + %1669 = shufflevector <4 x i1> %1668, <4 x i1> poison, <2 x i32> , !dbg !69 + %1670 = or <4 x i1> %1655, %1661, !dbg !69 + %1671 = shufflevector <4 x i1> %1670, <4 x i1> poison, <2 x i32> , !dbg !69 + %foldExtExtBinop = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop113 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop115 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop117 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop119 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop121 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop123 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop125 = xor <4 x i32> %1561, %1595, !dbg !32 + %1672 = shufflevector <2 x i1> %1664, <2 x i1> %1666, <2 x i32> , !dbg !33 + %1673 = shufflevector <4 x i32> %foldExtExtBinop115, <4 x i32> %foldExtExtBinop, <2 x i32> , !dbg !33 + %1674 = select <2 x i1> %1672, <2 x i32> %1673, <2 x i32> zeroinitializer, !dbg !33 + %1675 = shufflevector <2 x i1> %1666, <2 x i1> %1664, <2 x i32> , !dbg !33 + %1676 = shufflevector <4 x i32> %foldExtExtBinop117, <4 x i32> %foldExtExtBinop113, <2 x i32> , !dbg !33 + %1677 = select <2 x i1> %1675, <2 x i32> %1676, <2 x i32> zeroinitializer, !dbg !33 + %1678 = shufflevector <2 x i1> %1669, <2 x i1> %1671, <2 x i32> , !dbg !33 + %1679 = shufflevector <4 x i32> %foldExtExtBinop123, <4 x i32> %foldExtExtBinop119, <2 x i32> , !dbg !33 + %1680 = select <2 x i1> %1678, <2 x i32> %1679, <2 x i32> zeroinitializer, !dbg !33 + %1681 = shufflevector <2 x i1> %1671, <2 x i1> %1669, <2 x i32> , !dbg !33 + %1682 = shufflevector <4 x i32> %foldExtExtBinop125, <4 x i32> %foldExtExtBinop121, <2 x i32> , !dbg !33 + %1683 = select <2 x i1> %1681, <2 x i32> %1682, <2 x i32> zeroinitializer, !dbg !33 + %1684 = shufflevector <2 x i32> %1674, <2 x i32> %1677, <2 x i32> , !dbg !34 + %1685 = shufflevector <2 x i32> %1486, <2 x i32> %1489, <2 x i32> , !dbg !34 + %1686 = xor <2 x i32> %1684, %1685, !dbg !34 + %1687 = xor <2 x i32> %1674, %1486, !dbg !34 + %1688 = xor <2 x i32> %1677, %1489, !dbg !34 + %1689 = shufflevector <2 x i32> %1677, <2 x i32> %1674, <2 x i32> , !dbg !34 + %1690 = shufflevector <2 x i32> %1489, <2 x i32> %1486, <2 x i32> , !dbg !34 + %1691 = xor <2 x i32> %1689, %1690, !dbg !34 + %1692 = shufflevector <2 x i32> %1680, <2 x i32> %1683, <2 x i32> , !dbg !34 + %1693 = shufflevector <2 x i32> %1494, <2 x i32> %1497, <2 x i32> , !dbg !34 + %1694 = xor <2 x i32> %1692, %1693, !dbg !34 + %1695 = xor <2 x i32> %1680, %1494, !dbg !34 + %1696 = xor <2 x i32> %1683, %1497, !dbg !34 + %1697 = shufflevector <2 x i32> %1683, <2 x i32> %1680, <2 x i32> , !dbg !34 + %1698 = shufflevector <2 x i32> %1497, <2 x i32> %1494, <2 x i32> , !dbg !34 + %1699 = xor <2 x i32> %1697, %1698, !dbg !34 + %1700 = xor <4 x i32> %1640, %1612, !dbg !38 + %1701 = xor <4 x i32> %1640, %1612, !dbg !38 + %1702 = shufflevector <4 x i32> %1701, <4 x i32> poison, <2 x i32> , !dbg !38 + %1703 = xor <4 x i32> %1640, %1612, !dbg !38 + %1704 = shufflevector <4 x i32> %1703, <4 x i32> poison, <2 x i32> , !dbg !38 + %1705 = xor <4 x i32> %1653, %1625, !dbg !38 + %1706 = xor <4 x i32> %1653, %1625, !dbg !38 + %1707 = shufflevector <4 x i32> %1706, <4 x i32> poison, <2 x i32> , !dbg !38 + %1708 = xor <4 x i32> %1653, %1625, !dbg !38 + %1709 = shufflevector <4 x i32> %1708, <4 x i32> poison, <2 x i32> , !dbg !38 + %1710 = select <4 x i1> %1662, <4 x i32> %1700, <4 x i32> zeroinitializer, !dbg !35 + %1711 = select <2 x i1> %1664, <2 x i32> %1702, <2 x i32> zeroinitializer, !dbg !35 + %1712 = select <2 x i1> %1666, <2 x i32> %1704, <2 x i32> zeroinitializer, !dbg !35 + %1713 = select <4 x i1> %1667, <4 x i32> %1705, <4 x i32> zeroinitializer, !dbg !35 + %1714 = select <2 x i1> %1669, <2 x i32> %1707, <2 x i32> zeroinitializer, !dbg !35 + %1715 = select <2 x i1> %1671, <2 x i32> %1709, <2 x i32> zeroinitializer, !dbg !35 + %1716 = xor <4 x i32> %1513, %1710, !dbg !36 + %1717 = shufflevector <2 x i32> %1712, <2 x i32> %1711, <2 x i32> , !dbg !36 + %1718 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <2 x i32> , !dbg !36 + %1719 = xor <2 x i32> %1717, %1718, !dbg !36 + %1720 = shufflevector <2 x i32> %1711, <2 x i32> %1712, <2 x i32> , !dbg !36 + %1721 = shufflevector <2 x i32> %1514, <2 x i32> %1515, <2 x i32> , !dbg !36 + %1722 = xor <2 x i32> %1720, %1721, !dbg !36 + %1723 = shufflevector <2 x i32> %1712, <2 x i32> %1711, <2 x i32> , !dbg !36 + %1724 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <2 x i32> , !dbg !36 + %1725 = xor <2 x i32> %1723, %1724, !dbg !36 + %1726 = shufflevector <2 x i32> %1711, <2 x i32> %1712, <2 x i32> , !dbg !36 + %1727 = shufflevector <2 x i32> %1514, <2 x i32> %1515, <2 x i32> , !dbg !36 + %1728 = xor <2 x i32> %1726, %1727, !dbg !36 + %1729 = xor <4 x i32> %1517, %1713, !dbg !36 + %1730 = shufflevector <2 x i32> %1715, <2 x i32> %1714, <2 x i32> , !dbg !36 + %1731 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <2 x i32> , !dbg !36 + %1732 = xor <2 x i32> %1730, %1731, !dbg !36 + %1733 = shufflevector <2 x i32> %1714, <2 x i32> %1715, <2 x i32> , !dbg !36 + %1734 = shufflevector <2 x i32> %1518, <2 x i32> %1519, <2 x i32> , !dbg !36 + %1735 = xor <2 x i32> %1733, %1734, !dbg !36 + %1736 = shufflevector <2 x i32> %1715, <2 x i32> %1714, <2 x i32> , !dbg !36 + %1737 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <2 x i32> , !dbg !36 + %1738 = xor <2 x i32> %1736, %1737, !dbg !36 + %1739 = shufflevector <2 x i32> %1714, <2 x i32> %1715, <2 x i32> , !dbg !36 + %1740 = shufflevector <2 x i32> %1518, <2 x i32> %1519, <2 x i32> , !dbg !36 + %1741 = xor <2 x i32> %1739, %1740, !dbg !36 + %1742 = shufflevector <2 x i32> %1687, <2 x i32> %1688, <4 x i32> , !dbg !39 + %1743 = mul nuw nsw <4 x i32> %1742, %250, !dbg !39 + %1744 = shufflevector <2 x i32> %1695, <2 x i32> %1696, <4 x i32> , !dbg !39 + %1745 = mul nuw nsw <4 x i32> %1744, %250, !dbg !39 + %1746 = extractelement <4 x i32> %1743, i64 0, !dbg !64 + %1747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1746, i32 1, i32 31), !dbg !64 + %1748 = extractelement <4 x i32> %1743, i64 1, !dbg !64 + %1749 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1748, i32 1, i32 31), !dbg !64 + %1750 = extractelement <4 x i32> %1743, i64 2, !dbg !64 + %1751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1750, i32 1, i32 31), !dbg !64 + %1752 = extractelement <4 x i32> %1743, i64 3, !dbg !64 + %1753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1752, i32 1, i32 31), !dbg !64 + %1754 = insertelement <4 x i32> poison, i32 %1747, i64 0, !dbg !65 + %1755 = insertelement <4 x i32> %1754, i32 %1749, i64 1, !dbg !65 + %1756 = insertelement <4 x i32> %1755, i32 %1751, i64 2, !dbg !65 + %1757 = insertelement <4 x i32> %1756, i32 %1753, i64 3, !dbg !65 + %1758 = add <4 x i32> %1757, %1743, !dbg !65 + %1759 = extractelement <4 x i32> %1745, i64 0, !dbg !64 + %1760 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1759, i32 1, i32 31), !dbg !64 + %1761 = extractelement <4 x i32> %1745, i64 1, !dbg !64 + %1762 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1761, i32 1, i32 31), !dbg !64 + %1763 = extractelement <4 x i32> %1745, i64 2, !dbg !64 + %1764 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1763, i32 1, i32 31), !dbg !64 + %1765 = extractelement <4 x i32> %1745, i64 3, !dbg !64 + %1766 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1765, i32 1, i32 31), !dbg !64 + %1767 = insertelement <4 x i32> poison, i32 %1760, i64 0, !dbg !65 + %1768 = insertelement <4 x i32> %1767, i32 %1762, i64 1, !dbg !65 + %1769 = insertelement <4 x i32> %1768, i32 %1764, i64 2, !dbg !65 + %1770 = insertelement <4 x i32> %1769, i32 %1766, i64 3, !dbg !65 + %1771 = add <4 x i32> %1770, %1745, !dbg !65 + %1772 = mul nuw nsw <4 x i32> %1742, %252, !dbg !40 + %1773 = mul nuw nsw <4 x i32> %1744, %252, !dbg !40 + %1774 = extractelement <4 x i32> %1772, i64 0, !dbg !64 + %1775 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1774, i32 1, i32 31), !dbg !64 + %1776 = extractelement <4 x i32> %1772, i64 1, !dbg !64 + %1777 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1776, i32 1, i32 31), !dbg !64 + %1778 = extractelement <4 x i32> %1772, i64 2, !dbg !64 + %1779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1778, i32 1, i32 31), !dbg !64 + %1780 = extractelement <4 x i32> %1772, i64 3, !dbg !64 + %1781 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1780, i32 1, i32 31), !dbg !64 + %1782 = insertelement <4 x i32> poison, i32 %1775, i64 0, !dbg !65 + %1783 = insertelement <4 x i32> %1782, i32 %1777, i64 1, !dbg !65 + %1784 = insertelement <4 x i32> %1783, i32 %1779, i64 2, !dbg !65 + %1785 = insertelement <4 x i32> %1784, i32 %1781, i64 3, !dbg !65 + %1786 = add <4 x i32> %1785, %1772, !dbg !65 + %1787 = extractelement <4 x i32> %1773, i64 0, !dbg !64 + %1788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1787, i32 1, i32 31), !dbg !64 + %1789 = extractelement <4 x i32> %1773, i64 1, !dbg !64 + %1790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1789, i32 1, i32 31), !dbg !64 + %1791 = extractelement <4 x i32> %1773, i64 2, !dbg !64 + %1792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1791, i32 1, i32 31), !dbg !64 + %1793 = extractelement <4 x i32> %1773, i64 3, !dbg !64 + %1794 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1793, i32 1, i32 31), !dbg !64 + %1795 = insertelement <4 x i32> poison, i32 %1788, i64 0, !dbg !65 + %1796 = insertelement <4 x i32> %1795, i32 %1790, i64 1, !dbg !65 + %1797 = insertelement <4 x i32> %1796, i32 %1792, i64 2, !dbg !65 + %1798 = insertelement <4 x i32> %1797, i32 %1794, i64 3, !dbg !65 + %1799 = add <4 x i32> %1798, %1773, !dbg !65 + %1800 = extractelement <2 x i32> %1728, i64 1, !dbg !41 + %1801 = mul nuw nsw i32 %1800, %55, !dbg !42 + %1802 = extractelement <2 x i32> %1725, i64 1, !dbg !41 + %1803 = mul nuw nsw i32 %1802, %55, !dbg !42 + %1804 = extractelement <2 x i32> %1728, i64 0, !dbg !41 + %1805 = mul nuw nsw i32 %1804, %55, !dbg !42 + %1806 = extractelement <2 x i32> %1725, i64 0, !dbg !41 + %1807 = mul nuw nsw i32 %1806, %55, !dbg !42 + %1808 = extractelement <2 x i32> %1741, i64 1, !dbg !41 + %1809 = mul nuw nsw i32 %1808, %55, !dbg !42 + %1810 = extractelement <2 x i32> %1738, i64 1, !dbg !41 + %1811 = mul nuw nsw i32 %1810, %55, !dbg !42 + %1812 = extractelement <2 x i32> %1741, i64 0, !dbg !41 + %1813 = mul nuw nsw i32 %1812, %55, !dbg !42 + %1814 = extractelement <2 x i32> %1738, i64 0, !dbg !41 + %1815 = mul nuw nsw i32 %1814, %55, !dbg !42 + %1816 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1801, i32 1, i32 31), !dbg !64 + %1817 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1803, i32 1, i32 31), !dbg !64 + %1818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1805, i32 1, i32 31), !dbg !64 + %1819 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1807, i32 1, i32 31), !dbg !64 + %1820 = insertelement <2 x i32> poison, i32 %1818, i64 0, !dbg !65 + %1821 = insertelement <2 x i32> %1820, i32 %1817, i64 1, !dbg !65 + %1822 = insertelement <2 x i32> poison, i32 %1805, i64 0, !dbg !65 + %1823 = insertelement <2 x i32> %1822, i32 %1803, i64 1, !dbg !65 + %1824 = add <2 x i32> %1821, %1823, !dbg !65 + %1825 = insertelement <4 x i32> poison, i32 %1816, i64 0, !dbg !65 + %1826 = insertelement <4 x i32> %1825, i32 %1817, i64 1, !dbg !65 + %1827 = insertelement <4 x i32> %1826, i32 %1818, i64 2, !dbg !65 + %1828 = insertelement <4 x i32> %1827, i32 %1819, i64 3, !dbg !65 + %1829 = insertelement <4 x i32> poison, i32 %1801, i64 0, !dbg !65 + %1830 = insertelement <4 x i32> %1829, i32 %1803, i64 1, !dbg !65 + %1831 = insertelement <4 x i32> %1830, i32 %1805, i64 2, !dbg !65 + %1832 = insertelement <4 x i32> %1831, i32 %1807, i64 3, !dbg !65 + %1833 = add <4 x i32> %1828, %1832, !dbg !65 + %1834 = insertelement <2 x i32> poison, i32 %1819, i64 0, !dbg !65 + %1835 = insertelement <2 x i32> %1834, i32 %1816, i64 1, !dbg !65 + %1836 = insertelement <2 x i32> poison, i32 %1807, i64 0, !dbg !65 + %1837 = insertelement <2 x i32> %1836, i32 %1801, i64 1, !dbg !65 + %1838 = add <2 x i32> %1835, %1837, !dbg !65 + %1839 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1809, i32 1, i32 31), !dbg !64 + %1840 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1811, i32 1, i32 31), !dbg !64 + %1841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1813, i32 1, i32 31), !dbg !64 + %1842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1815, i32 1, i32 31), !dbg !64 + %1843 = insertelement <2 x i32> poison, i32 %1841, i64 0, !dbg !65 + %1844 = insertelement <2 x i32> %1843, i32 %1840, i64 1, !dbg !65 + %1845 = insertelement <2 x i32> poison, i32 %1813, i64 0, !dbg !65 + %1846 = insertelement <2 x i32> %1845, i32 %1811, i64 1, !dbg !65 + %1847 = add <2 x i32> %1844, %1846, !dbg !65 + %1848 = insertelement <4 x i32> poison, i32 %1839, i64 0, !dbg !65 + %1849 = insertelement <4 x i32> %1848, i32 %1840, i64 1, !dbg !65 + %1850 = insertelement <4 x i32> %1849, i32 %1841, i64 2, !dbg !65 + %1851 = insertelement <4 x i32> %1850, i32 %1842, i64 3, !dbg !65 + %1852 = insertelement <4 x i32> poison, i32 %1809, i64 0, !dbg !65 + %1853 = insertelement <4 x i32> %1852, i32 %1811, i64 1, !dbg !65 + %1854 = insertelement <4 x i32> %1853, i32 %1813, i64 2, !dbg !65 + %1855 = insertelement <4 x i32> %1854, i32 %1815, i64 3, !dbg !65 + %1856 = add <4 x i32> %1851, %1855, !dbg !65 + %1857 = insertelement <2 x i32> poison, i32 %1842, i64 0, !dbg !65 + %1858 = insertelement <2 x i32> %1857, i32 %1839, i64 1, !dbg !65 + %1859 = insertelement <2 x i32> poison, i32 %1815, i64 0, !dbg !65 + %1860 = insertelement <2 x i32> %1859, i32 %1809, i64 1, !dbg !65 + %1861 = add <2 x i32> %1858, %1860, !dbg !65 + %1862 = mul nuw nsw i32 %1800, %53, !dbg !41 + %1863 = mul nuw nsw i32 %1802, %53, !dbg !41 + %1864 = mul nuw nsw i32 %1804, %53, !dbg !41 + %1865 = mul nuw nsw i32 %1806, %53, !dbg !41 + %1866 = mul nuw nsw i32 %1808, %53, !dbg !41 + %1867 = mul nuw nsw i32 %1810, %53, !dbg !41 + %1868 = mul nuw nsw i32 %1812, %53, !dbg !41 + %1869 = mul nuw nsw i32 %1814, %53, !dbg !41 + %1870 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1862, i32 1, i32 31), !dbg !64 + %1871 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1863, i32 1, i32 31), !dbg !64 + %1872 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1864, i32 1, i32 31), !dbg !64 + %1873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1865, i32 1, i32 31), !dbg !64 + %1874 = insertelement <2 x i32> poison, i32 %1872, i64 0, !dbg !65 + %1875 = insertelement <2 x i32> %1874, i32 %1871, i64 1, !dbg !65 + %1876 = insertelement <2 x i32> poison, i32 %1864, i64 0, !dbg !65 + %1877 = insertelement <2 x i32> %1876, i32 %1863, i64 1, !dbg !65 + %1878 = add <2 x i32> %1875, %1877, !dbg !65 + %1879 = insertelement <4 x i32> poison, i32 %1870, i64 0, !dbg !65 + %1880 = insertelement <4 x i32> %1879, i32 %1871, i64 1, !dbg !65 + %1881 = insertelement <4 x i32> %1880, i32 %1872, i64 2, !dbg !65 + %1882 = insertelement <4 x i32> %1881, i32 %1873, i64 3, !dbg !65 + %1883 = insertelement <4 x i32> poison, i32 %1862, i64 0, !dbg !65 + %1884 = insertelement <4 x i32> %1883, i32 %1863, i64 1, !dbg !65 + %1885 = insertelement <4 x i32> %1884, i32 %1864, i64 2, !dbg !65 + %1886 = insertelement <4 x i32> %1885, i32 %1865, i64 3, !dbg !65 + %1887 = add <4 x i32> %1882, %1886, !dbg !65 + %1888 = insertelement <2 x i32> poison, i32 %1873, i64 0, !dbg !65 + %1889 = insertelement <2 x i32> %1888, i32 %1870, i64 1, !dbg !65 + %1890 = insertelement <2 x i32> poison, i32 %1865, i64 0, !dbg !65 + %1891 = insertelement <2 x i32> %1890, i32 %1862, i64 1, !dbg !65 + %1892 = add <2 x i32> %1889, %1891, !dbg !65 + %1893 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1866, i32 1, i32 31), !dbg !64 + %1894 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1867, i32 1, i32 31), !dbg !64 + %1895 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1868, i32 1, i32 31), !dbg !64 + %1896 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1869, i32 1, i32 31), !dbg !64 + %1897 = insertelement <2 x i32> poison, i32 %1895, i64 0, !dbg !65 + %1898 = insertelement <2 x i32> %1897, i32 %1894, i64 1, !dbg !65 + %1899 = insertelement <2 x i32> poison, i32 %1868, i64 0, !dbg !65 + %1900 = insertelement <2 x i32> %1899, i32 %1867, i64 1, !dbg !65 + %1901 = add <2 x i32> %1898, %1900, !dbg !65 + %1902 = insertelement <4 x i32> poison, i32 %1893, i64 0, !dbg !65 + %1903 = insertelement <4 x i32> %1902, i32 %1894, i64 1, !dbg !65 + %1904 = insertelement <4 x i32> %1903, i32 %1895, i64 2, !dbg !65 + %1905 = insertelement <4 x i32> %1904, i32 %1896, i64 3, !dbg !65 + %1906 = insertelement <4 x i32> poison, i32 %1866, i64 0, !dbg !65 + %1907 = insertelement <4 x i32> %1906, i32 %1867, i64 1, !dbg !65 + %1908 = insertelement <4 x i32> %1907, i32 %1868, i64 2, !dbg !65 + %1909 = insertelement <4 x i32> %1908, i32 %1869, i64 3, !dbg !65 + %1910 = add <4 x i32> %1905, %1909, !dbg !65 + %1911 = insertelement <2 x i32> poison, i32 %1896, i64 0, !dbg !65 + %1912 = insertelement <2 x i32> %1911, i32 %1893, i64 1, !dbg !65 + %1913 = insertelement <2 x i32> poison, i32 %1869, i64 0, !dbg !65 + %1914 = insertelement <2 x i32> %1913, i32 %1866, i64 1, !dbg !65 + %1915 = add <2 x i32> %1912, %1914, !dbg !65 + %1916 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1917 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1918 = shufflevector <4 x i1> %1917, <4 x i1> poison, <2 x i32> , !dbg !29 + %1919 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1920 = shufflevector <4 x i1> %1919, <4 x i1> poison, <2 x i32> , !dbg !29 + %1921 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1922 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1923 = shufflevector <4 x i1> %1922, <4 x i1> poison, <2 x i32> , !dbg !29 + %1924 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1925 = shufflevector <4 x i1> %1924, <4 x i1> poison, <2 x i32> , !dbg !29 + %1926 = icmp eq <4 x i32> %1758, %1786, !dbg !66 + %1927 = icmp eq <4 x i32> %1771, %1799, !dbg !66 + %1928 = icmp sgt <4 x i32> %1833, %1887, !dbg !67 + %1929 = icmp sgt <4 x i32> %1856, %1910, !dbg !67 + %1930 = and <4 x i1> %1926, %1928, !dbg !68 + %1931 = and <4 x i1> %1926, %1928, !dbg !68 + %1932 = shufflevector <4 x i1> %1931, <4 x i1> poison, <2 x i32> , !dbg !68 + %1933 = and <4 x i1> %1926, %1928, !dbg !68 + %1934 = shufflevector <4 x i1> %1933, <4 x i1> poison, <2 x i32> , !dbg !68 + %1935 = and <4 x i1> %1927, %1929, !dbg !68 + %1936 = and <4 x i1> %1927, %1929, !dbg !68 + %1937 = shufflevector <4 x i1> %1936, <4 x i1> poison, <2 x i32> , !dbg !68 + %1938 = and <4 x i1> %1927, %1929, !dbg !68 + %1939 = shufflevector <4 x i1> %1938, <4 x i1> poison, <2 x i32> , !dbg !68 + %1940 = or <4 x i1> %1916, %1930, !dbg !69 + %1941 = or <2 x i1> %1918, %1932, !dbg !69 + %1942 = or <2 x i1> %1920, %1934, !dbg !69 + %1943 = shufflevector <2 x i1> %1920, <2 x i1> %1918, <2 x i32> , !dbg !69 + %1944 = shufflevector <2 x i1> %1934, <2 x i1> %1932, <2 x i32> , !dbg !69 + %1945 = or <2 x i1> %1943, %1944, !dbg !69 + %1946 = shufflevector <2 x i1> %1918, <2 x i1> %1920, <2 x i32> , !dbg !69 + %1947 = shufflevector <2 x i1> %1932, <2 x i1> %1934, <2 x i32> , !dbg !69 + %1948 = or <2 x i1> %1946, %1947, !dbg !69 + %1949 = or <4 x i1> %1921, %1935, !dbg !69 + %1950 = or <2 x i1> %1923, %1937, !dbg !69 + %1951 = or <2 x i1> %1925, %1939, !dbg !69 + %1952 = shufflevector <2 x i1> %1925, <2 x i1> %1923, <2 x i32> , !dbg !69 + %1953 = shufflevector <2 x i1> %1939, <2 x i1> %1937, <2 x i32> , !dbg !69 + %1954 = or <2 x i1> %1952, %1953, !dbg !69 + %1955 = shufflevector <2 x i1> %1923, <2 x i1> %1925, <2 x i32> , !dbg !69 + %1956 = shufflevector <2 x i1> %1937, <2 x i1> %1939, <2 x i32> , !dbg !69 + %1957 = or <2 x i1> %1955, %1956, !dbg !69 + %1958 = xor <4 x i32> %1786, %1758, !dbg !32 + %1959 = shufflevector <4 x i32> %1958, <4 x i32> poison, <2 x i32> , !dbg !32 + %1960 = xor <4 x i32> %1786, %1758, !dbg !32 + %1961 = shufflevector <4 x i32> %1960, <4 x i32> poison, <2 x i32> , !dbg !32 + %1962 = xor <4 x i32> %1799, %1771, !dbg !32 + %1963 = shufflevector <4 x i32> %1962, <4 x i32> poison, <2 x i32> , !dbg !32 + %1964 = xor <4 x i32> %1799, %1771, !dbg !32 + %1965 = shufflevector <4 x i32> %1964, <4 x i32> poison, <2 x i32> , !dbg !32 + %1966 = shufflevector <2 x i1> %1948, <2 x i1> %1945, <2 x i32> , !dbg !33 + %1967 = shufflevector <2 x i32> %1959, <2 x i32> %1961, <2 x i32> , !dbg !33 + %1968 = select <2 x i1> %1966, <2 x i32> %1967, <2 x i32> zeroinitializer, !dbg !33 + %1969 = select <2 x i1> %1948, <2 x i32> %1959, <2 x i32> zeroinitializer, !dbg !33 + %1970 = select <2 x i1> %1945, <2 x i32> %1961, <2 x i32> zeroinitializer, !dbg !33 + %1971 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !33 + %1972 = shufflevector <2 x i32> %1961, <2 x i32> %1959, <2 x i32> , !dbg !33 + %1973 = select <2 x i1> %1971, <2 x i32> %1972, <2 x i32> zeroinitializer, !dbg !33 + %1974 = shufflevector <2 x i1> %1957, <2 x i1> %1954, <2 x i32> , !dbg !33 + %1975 = shufflevector <2 x i32> %1963, <2 x i32> %1965, <2 x i32> , !dbg !33 + %1976 = select <2 x i1> %1974, <2 x i32> %1975, <2 x i32> zeroinitializer, !dbg !33 + %1977 = select <2 x i1> %1957, <2 x i32> %1963, <2 x i32> zeroinitializer, !dbg !33 + %1978 = select <2 x i1> %1954, <2 x i32> %1965, <2 x i32> zeroinitializer, !dbg !33 + %1979 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !33 + %1980 = shufflevector <2 x i32> %1965, <2 x i32> %1963, <2 x i32> , !dbg !33 + %1981 = select <2 x i1> %1979, <2 x i32> %1980, <2 x i32> zeroinitializer, !dbg !33 + %1982 = xor <2 x i32> %1968, %1686, !dbg !34 + %1983 = xor <2 x i32> %1969, %1687, !dbg !34 + %1984 = xor <2 x i32> %1970, %1688, !dbg !34 + %1985 = xor <2 x i32> %1973, %1691, !dbg !34 + %1986 = xor <2 x i32> %1976, %1694, !dbg !34 + %1987 = xor <2 x i32> %1977, %1695, !dbg !34 + %1988 = xor <2 x i32> %1978, %1696, !dbg !34 + %1989 = xor <2 x i32> %1981, %1699, !dbg !34 + %1990 = shufflevector <2 x i32> %1892, <2 x i32> %1878, <4 x i32> , !dbg !38 + %1991 = shufflevector <2 x i32> %1838, <2 x i32> %1824, <4 x i32> , !dbg !38 + %1992 = xor <4 x i32> %1990, %1991, !dbg !38 + %1993 = xor <2 x i32> %1878, %1824, !dbg !38 + %1994 = xor <2 x i32> %1892, %1838, !dbg !38 + %1995 = shufflevector <2 x i32> %1915, <2 x i32> %1901, <4 x i32> , !dbg !38 + %1996 = shufflevector <2 x i32> %1861, <2 x i32> %1847, <4 x i32> , !dbg !38 + %1997 = xor <4 x i32> %1995, %1996, !dbg !38 + %1998 = xor <2 x i32> %1901, %1847, !dbg !38 + %1999 = xor <2 x i32> %1915, %1861, !dbg !38 + %2000 = select <4 x i1> %1940, <4 x i32> %1992, <4 x i32> zeroinitializer, !dbg !35 + %2001 = select <2 x i1> %1941, <2 x i32> %1993, <2 x i32> zeroinitializer, !dbg !35 + %2002 = select <2 x i1> %1942, <2 x i32> %1994, <2 x i32> zeroinitializer, !dbg !35 + %2003 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !35 + %2004 = shufflevector <2 x i32> %1993, <2 x i32> %1994, <2 x i32> , !dbg !35 + %2005 = select <2 x i1> %2003, <2 x i32> %2004, <2 x i32> zeroinitializer, !dbg !35 + %2006 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !35 + %2007 = shufflevector <2 x i32> %1994, <2 x i32> %1993, <2 x i32> , !dbg !35 + %2008 = select <2 x i1> %2006, <2 x i32> %2007, <2 x i32> zeroinitializer, !dbg !35 + %2009 = shufflevector <2 x i32> %1994, <2 x i32> %1993, <2 x i32> , !dbg !35 + %2010 = select <2 x i1> %1945, <2 x i32> %2009, <2 x i32> zeroinitializer, !dbg !35 + %2011 = shufflevector <2 x i32> %1993, <2 x i32> %1994, <2 x i32> , !dbg !35 + %2012 = select <2 x i1> %1948, <2 x i32> %2011, <2 x i32> zeroinitializer, !dbg !35 + %2013 = select <4 x i1> %1949, <4 x i32> %1997, <4 x i32> zeroinitializer, !dbg !35 + %2014 = select <2 x i1> %1950, <2 x i32> %1998, <2 x i32> zeroinitializer, !dbg !35 + %2015 = select <2 x i1> %1951, <2 x i32> %1999, <2 x i32> zeroinitializer, !dbg !35 + %2016 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !35 + %2017 = shufflevector <2 x i32> %1998, <2 x i32> %1999, <2 x i32> , !dbg !35 + %2018 = select <2 x i1> %2016, <2 x i32> %2017, <2 x i32> zeroinitializer, !dbg !35 + %2019 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !35 + %2020 = shufflevector <2 x i32> %1999, <2 x i32> %1998, <2 x i32> , !dbg !35 + %2021 = select <2 x i1> %2019, <2 x i32> %2020, <2 x i32> zeroinitializer, !dbg !35 + %2022 = shufflevector <2 x i32> %1999, <2 x i32> %1998, <2 x i32> , !dbg !35 + %2023 = select <2 x i1> %1954, <2 x i32> %2022, <2 x i32> zeroinitializer, !dbg !35 + %2024 = shufflevector <2 x i32> %1998, <2 x i32> %1999, <2 x i32> , !dbg !35 + %2025 = select <2 x i1> %1957, <2 x i32> %2024, <2 x i32> zeroinitializer, !dbg !35 + %2026 = xor <4 x i32> %1716, %2000, !dbg !36 + %2027 = xor <2 x i32> %2005, %1719, !dbg !36 + %2028 = xor <2 x i32> %2008, %1722, !dbg !36 + %2029 = xor <2 x i32> %2010, %1725, !dbg !36 + %2030 = xor <2 x i32> %2012, %1728, !dbg !36 + %2031 = xor <4 x i32> %1729, %2013, !dbg !36 + %2032 = xor <2 x i32> %2018, %1732, !dbg !36 + %2033 = xor <2 x i32> %2021, %1735, !dbg !36 + %2034 = xor <2 x i32> %2023, %1738, !dbg !36 + %2035 = xor <2 x i32> %2025, %1741, !dbg !36 + %2036 = extractelement <2 x i32> %1983, i64 0, !dbg !29 + %2037 = extractelement <2 x i32> %1983, i64 1, !dbg !29 + %2038 = icmp slt i32 %2037, %2036, !dbg !29 + %2039 = extractelement <2 x i32> %1984, i64 0, !dbg !29 + %2040 = extractelement <2 x i32> %1984, i64 1, !dbg !29 + %2041 = icmp slt i32 %2040, %2039, !dbg !29 + %2042 = extractelement <2 x i32> %1987, i64 0, !dbg !29 + %2043 = extractelement <2 x i32> %1987, i64 1, !dbg !29 + %2044 = icmp slt i32 %2043, %2042, !dbg !29 + %2045 = extractelement <2 x i32> %1988, i64 0, !dbg !29 + %2046 = extractelement <2 x i32> %1988, i64 1, !dbg !29 + %2047 = icmp slt i32 %2046, %2045, !dbg !29 + %2048 = icmp eq i32 %2037, %2036, !dbg !66 + %2049 = icmp eq i32 %2040, %2039, !dbg !66 + %2050 = icmp eq i32 %2043, %2042, !dbg !66 + %2051 = icmp eq i32 %2046, %2045, !dbg !66 + %shift127 = shufflevector <2 x i32> %2030, <2 x i32> poison, <2 x i32> , !dbg !67 + %2052 = icmp sgt <2 x i32> %shift127, %2030, !dbg !67 + %2053 = extractelement <2 x i1> %2052, i64 0, !dbg !67 + %shift128 = shufflevector <2 x i32> %2029, <2 x i32> poison, <2 x i32> , !dbg !67 + %2054 = icmp sgt <2 x i32> %shift128, %2029, !dbg !67 + %2055 = extractelement <2 x i1> %2054, i64 0, !dbg !67 + %shift129 = shufflevector <2 x i32> %2035, <2 x i32> poison, <2 x i32> , !dbg !67 + %2056 = icmp sgt <2 x i32> %shift129, %2035, !dbg !67 + %2057 = extractelement <2 x i1> %2056, i64 0, !dbg !67 + %shift130 = shufflevector <2 x i32> %2034, <2 x i32> poison, <2 x i32> , !dbg !67 + %2058 = icmp sgt <2 x i32> %shift130, %2034, !dbg !67 + %2059 = extractelement <2 x i1> %2058, i64 0, !dbg !67 + %2060 = and i1 %2048, %2053, !dbg !68 + %2061 = and i1 %2049, %2055, !dbg !68 + %2062 = and i1 %2050, %2057, !dbg !68 + %2063 = and i1 %2051, %2059, !dbg !68 + %2064 = insertelement <2 x i1> poison, i1 %2038, i64 0, !dbg !69 + %2065 = insertelement <2 x i1> %2064, i1 %2041, i64 1, !dbg !69 + %2066 = insertelement <2 x i1> poison, i1 %2060, i64 0, !dbg !69 + %2067 = insertelement <2 x i1> %2066, i1 %2061, i64 1, !dbg !69 + %2068 = or <2 x i1> %2065, %2067, !dbg !69 + %2069 = insertelement <2 x i1> poison, i1 %2044, i64 0, !dbg !69 + %2070 = insertelement <2 x i1> %2069, i1 %2047, i64 1, !dbg !69 + %2071 = insertelement <2 x i1> poison, i1 %2062, i64 0, !dbg !69 + %2072 = insertelement <2 x i1> %2071, i1 %2063, i64 1, !dbg !69 + %2073 = or <2 x i1> %2070, %2072, !dbg !69 + %2074 = shufflevector <2 x i32> %1983, <2 x i32> %1984, <2 x i32> , !dbg !32 + %2075 = shufflevector <2 x i32> %1983, <2 x i32> %1984, <2 x i32> , !dbg !32 + %2076 = xor <2 x i32> %2074, %2075, !dbg !32 + %2077 = shufflevector <2 x i32> %1987, <2 x i32> %1988, <2 x i32> , !dbg !32 + %2078 = shufflevector <2 x i32> %1987, <2 x i32> %1988, <2 x i32> , !dbg !32 + %2079 = xor <2 x i32> %2077, %2078, !dbg !32 + %2080 = select <2 x i1> %2068, <2 x i32> %2076, <2 x i32> zeroinitializer, !dbg !33 + %2081 = select <2 x i1> %2073, <2 x i32> %2079, <2 x i32> zeroinitializer, !dbg !33 + %2082 = xor <2 x i32> %2080, %1982, !dbg !34 + %2083 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %2084 = xor <2 x i32> %2083, %1983, !dbg !34 + %2085 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> , !dbg !34 + %2086 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> , !dbg !34 + %2087 = xor <2 x i32> %2086, %1984, !dbg !34 + %2088 = xor <2 x i32> %2085, %1985, !dbg !34 + %2089 = xor <2 x i32> %2081, %1986, !dbg !34 + %2090 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %2091 = xor <2 x i32> %2090, %1987, !dbg !34 + %2092 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> , !dbg !34 + %2093 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> , !dbg !34 + %2094 = xor <2 x i32> %2093, %1988, !dbg !34 + %2095 = xor <2 x i32> %2092, %1989, !dbg !34 + %2096 = xor <2 x i32> %2028, %2027, !dbg !38 + %2097 = xor <2 x i32> %2033, %2032, !dbg !38 + %2098 = shufflevector <2 x i1> %2068, <2 x i1> poison, <2 x i32> , !dbg !35 + %2099 = select <2 x i1> %2098, <2 x i32> %2096, <2 x i32> zeroinitializer, !dbg !35 + %2100 = shufflevector <2 x i1> %2073, <2 x i1> poison, <2 x i32> , !dbg !35 + %2101 = select <2 x i1> %2100, <2 x i32> %2097, <2 x i32> zeroinitializer, !dbg !35 + %2102 = shufflevector <2 x i32> %2099, <2 x i32> poison, <4 x i32> , !dbg !36 + %2103 = xor <4 x i32> %2026, %2102, !dbg !36 + %2104 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> , !dbg !36 + %2105 = xor <2 x i32> %2001, %2104, !dbg !36 + %2106 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !36 + %2107 = xor <2 x i32> %2106, %2029, !dbg !36 + %2108 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> , !dbg !36 + %2109 = xor <2 x i32> %2108, %2030, !dbg !36 + %2110 = shufflevector <2 x i32> %2101, <2 x i32> poison, <4 x i32> , !dbg !36 + %2111 = xor <4 x i32> %2031, %2110, !dbg !36 + %2112 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> , !dbg !36 + %2113 = xor <2 x i32> %2014, %2112, !dbg !36 + %2114 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !36 + %2115 = xor <2 x i32> %2114, %2034, !dbg !36 + %2116 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> , !dbg !36 + %2117 = xor <2 x i32> %2116, %2035, !dbg !36 + %2118 = icmp slt <2 x i32> %2084, %2087, !dbg !29 + %2119 = icmp slt <2 x i32> %2091, %2094, !dbg !29 + %2120 = icmp eq <2 x i32> %2082, %2088, !dbg !66 + %2121 = icmp eq <2 x i32> %2089, %2095, !dbg !66 + %2122 = icmp sgt <2 x i32> %2109, %2107, !dbg !67 + %2123 = icmp sgt <2 x i32> %2117, %2115, !dbg !67 + %2124 = and <2 x i1> %2120, %2122, !dbg !68 + %2125 = and <2 x i1> %2121, %2123, !dbg !68 + %2126 = or <2 x i1> %2118, %2124, !dbg !69 + %2127 = or <2 x i1> %2119, %2125, !dbg !69 + %2128 = xor <2 x i32> %2105, %2002, !dbg !38 + %2129 = xor <2 x i32> %2128, %1514, !dbg !38 + %2130 = xor <2 x i32> %2129, %1711, !dbg !38 + %2131 = xor <2 x i32> %2130, %1515, !dbg !38 + %2132 = xor <2 x i32> %2131, %1712, !dbg !38 + %2133 = xor <2 x i32> %2132, %2099, !dbg !38 + %2134 = xor <2 x i32> %2113, %2015, !dbg !38 + %2135 = xor <2 x i32> %2134, %1518, !dbg !38 + %2136 = xor <2 x i32> %2135, %1714, !dbg !38 + %2137 = xor <2 x i32> %2136, %1519, !dbg !38 + %2138 = xor <2 x i32> %2137, %1715, !dbg !38 + %2139 = xor <2 x i32> %2138, %2101, !dbg !38 + %2140 = select <2 x i1> %2126, <2 x i32> %2133, <2 x i32> zeroinitializer, !dbg !35 + %2141 = shufflevector <2 x i32> %2140, <2 x i32> poison, <4 x i32> , !dbg !35 + %2142 = select <2 x i1> %2127, <2 x i32> %2139, <2 x i32> zeroinitializer, !dbg !35 + %2143 = shufflevector <2 x i32> %2142, <2 x i32> poison, <4 x i32> , !dbg !35 + %2144 = xor <4 x i32> %2103, %2141, !dbg !36 + %2145 = xor <4 x i32> %2144, %1374, !dbg !36 + %2146 = xor <4 x i32> %2111, %2143, !dbg !36 + %2147 = xor <4 x i32> %2146, %1375, !dbg !36 + %2148 = insertelement <4 x i1> poison, i1 %21, i64 0, !dbg !59 + %2149 = shufflevector <4 x i1> %2148, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !59 + %2150 = select <4 x i1> %2149, <4 x i1> %399, <4 x i1> zeroinitializer, !dbg !59 + %2151 = bitcast <4 x i1> %393 to i4, !dbg !70 + %2152 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2151), !dbg !70 + %2153 = zext nneg i4 %2152 to i64, !dbg !70 + %2154 = bitcast <4 x i1> %2150 to i4, !dbg !70 + %2155 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2154), !dbg !70 + %2156 = zext nneg i4 %2155 to i64, !dbg !70 + %2157 = zext nneg i4 %2152 to i32, !dbg !72 + %2158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2157, i32 2, i32 31), !dbg !72 + %2159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !72 + %2160 = insertelement <2 x i32> poison, i32 %2158, i64 0, !dbg !72 + %2161 = insertelement <2 x i32> %2160, i32 %2159, i64 1, !dbg !72 + %2162 = bitcast <2 x i32> %2161 to i64, !dbg !72 + %2163 = add i64 %2153, %2162, !dbg !70 + %extelt.offset = lshr i64 %2163, 32, !dbg !72 + %2164 = trunc nuw i64 %extelt.offset to i32, !dbg !72 + %2165 = trunc i64 %2163 to i32, !dbg !72 + %2166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2165, i32 1, i32 31), !dbg !72 + %2167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2164, i32 1, i32 31), !dbg !72 + %2168 = insertelement <2 x i32> poison, i32 %2166, i64 0, !dbg !72 + %2169 = insertelement <2 x i32> %2168, i32 %2167, i64 1, !dbg !72 + %2170 = bitcast <2 x i32> %2169 to i64, !dbg !72 + %2171 = add i64 %2163, %2170, !dbg !70 + %2172 = zext nneg i4 %2155 to i32, !dbg !72 + %2173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2172, i32 2, i32 31), !dbg !72 + %2174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !72 + %2175 = insertelement <2 x i32> poison, i32 %2173, i64 0, !dbg !72 + %2176 = insertelement <2 x i32> %2175, i32 %2174, i64 1, !dbg !72 + %2177 = bitcast <2 x i32> %2176 to i64, !dbg !72 + %2178 = add i64 %2156, %2177, !dbg !70 + %extelt.offset60 = lshr i64 %2178, 32, !dbg !72 + %2179 = trunc nuw i64 %extelt.offset60 to i32, !dbg !72 + %2180 = trunc i64 %2178 to i32, !dbg !72 + %2181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2180, i32 1, i32 31), !dbg !72 + %2182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2179, i32 1, i32 31), !dbg !72 + %2183 = insertelement <2 x i32> poison, i32 %2181, i64 0, !dbg !72 + %2184 = insertelement <2 x i32> %2183, i32 %2182, i64 1, !dbg !72 + %2185 = bitcast <2 x i32> %2184 to i64, !dbg !72 + %2186 = add i64 %2178, %2185, !dbg !70 + %2187 = shl nuw nsw i32 %15, 1, !dbg !73 + %2188 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2187, !dbg !73 + %2189 = insertelement <1 x i64> poison, i64 %2171, i64 0, !dbg !73 + store <1 x i64> %2189, ptr addrspace(3) %2188, align 8, !dbg !73 + %2190 = getelementptr inbounds nuw i8, ptr addrspace(3) %2188, i32 512, !dbg !73 + %2191 = insertelement <1 x i64> poison, i64 %2186, i64 0, !dbg !73 + store <1 x i64> %2191, ptr addrspace(3) %2190, align 8, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + %2192 = shl nuw nsw i32 %17, 3, !dbg !73 + %2193 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2192, !dbg !73 + %2194 = load i64, ptr addrspace(3) %2193, align 8, !dbg !73 + %2195 = insertelement <4 x i1> poison, i1 %61, i64 0, !dbg !74 + %2196 = insertelement <4 x i1> %2195, i1 %62, i64 1, !dbg !74 + %2197 = insertelement <4 x i1> %2196, i1 %63, i64 2, !dbg !74 + %2198 = insertelement <4 x i1> %2197, i1 %64, i64 3, !dbg !74 + %2199 = select <4 x i1> %392, <4 x i1> %2198, <4 x i1> zeroinitializer, !dbg !74 + %2200 = insertelement <4 x i1> poison, i1 %65, i64 0, !dbg !74 + %2201 = insertelement <4 x i1> %2200, i1 %66, i64 1, !dbg !74 + %2202 = insertelement <4 x i1> %2201, i1 %67, i64 2, !dbg !74 + %2203 = insertelement <4 x i1> %2202, i1 %68, i64 3, !dbg !74 + %2204 = select <4 x i1> %2149, <4 x i1> %2203, <4 x i1> zeroinitializer, !dbg !74 + %2205 = bitcast <4 x i1> %2199 to i4, !dbg !75 + %2206 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2205), !dbg !75 + %2207 = zext nneg i4 %2206 to i64, !dbg !75 + %2208 = bitcast <4 x i1> %2204 to i4, !dbg !75 + %2209 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2208), !dbg !75 + %2210 = zext nneg i4 %2209 to i64, !dbg !75 + %2211 = zext nneg i4 %2206 to i32, !dbg !77 + %2212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2211, i32 2, i32 31), !dbg !77 + %2213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %2214 = insertelement <2 x i32> poison, i32 %2212, i64 0, !dbg !77 + %2215 = insertelement <2 x i32> %2214, i32 %2213, i64 1, !dbg !77 + %2216 = bitcast <2 x i32> %2215 to i64, !dbg !77 + %2217 = add i64 %2207, %2216, !dbg !75 + %extelt.offset70 = lshr i64 %2217, 32, !dbg !77 + %2218 = trunc nuw i64 %extelt.offset70 to i32, !dbg !77 + %2219 = trunc i64 %2217 to i32, !dbg !77 + %2220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2219, i32 1, i32 31), !dbg !77 + %2221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2218, i32 1, i32 31), !dbg !77 + %2222 = insertelement <2 x i32> poison, i32 %2220, i64 0, !dbg !77 + %2223 = insertelement <2 x i32> %2222, i32 %2221, i64 1, !dbg !77 + %2224 = bitcast <2 x i32> %2223 to i64, !dbg !77 + %2225 = add i64 %2217, %2224, !dbg !75 + %2226 = zext nneg i4 %2209 to i32, !dbg !77 + %2227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2226, i32 2, i32 31), !dbg !77 + %2228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %2229 = insertelement <2 x i32> poison, i32 %2227, i64 0, !dbg !77 + %2230 = insertelement <2 x i32> %2229, i32 %2228, i64 1, !dbg !77 + %2231 = bitcast <2 x i32> %2230 to i64, !dbg !77 + %2232 = add i64 %2210, %2231, !dbg !75 + %extelt.offset72 = lshr i64 %2232, 32, !dbg !77 + %2233 = trunc nuw i64 %extelt.offset72 to i32, !dbg !77 + %2234 = trunc i64 %2232 to i32, !dbg !77 + %2235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2234, i32 1, i32 31), !dbg !77 + %2236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2233, i32 1, i32 31), !dbg !77 + %2237 = insertelement <2 x i32> poison, i32 %2235, i64 0, !dbg !77 + %2238 = insertelement <2 x i32> %2237, i32 %2236, i64 1, !dbg !77 + %2239 = bitcast <2 x i32> %2238 to i64, !dbg !77 + %2240 = add i64 %2232, %2239, !dbg !75 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %2241 = insertelement <1 x i64> poison, i64 %2225, i64 0, !dbg !78 + store <1 x i64> %2241, ptr addrspace(3) %2188, align 8, !dbg !78 + %2242 = insertelement <1 x i64> poison, i64 %2240, i64 0, !dbg !78 + store <1 x i64> %2242, ptr addrspace(3) %2190, align 8, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %2243 = load i64, ptr addrspace(3) %2193, align 8, !dbg !78 + %2244 = trunc i64 %2171 to i32, !dbg !73 + %2245 = trunc i64 %2186 to i32, !dbg !73 + %2246 = insertelement <4 x i32> poison, i32 %23, i64 0, !dbg !79 + %2247 = insertelement <4 x i32> %2246, i32 %24, i64 1, !dbg !79 + %2248 = insertelement <4 x i32> %2247, i32 %25, i64 2, !dbg !79 + %2249 = insertelement <4 x i32> %2248, i32 %26, i64 3, !dbg !79 + %2250 = insertelement <4 x i32> poison, i32 %2244, i64 0, !dbg !79 + %2251 = shufflevector <4 x i32> %2250, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %2252 = icmp slt <4 x i32> %2249, %2251, !dbg !79 + %2253 = select <4 x i1> %2252, <4 x i32> %1208, <4 x i32> splat (i32 16), !dbg !80 + %2254 = add <4 x i32> %2253, splat (i32 17), !dbg !81 + %2255 = insertelement <4 x i32> poison, i32 %2245, i64 0, !dbg !79 + %2256 = shufflevector <4 x i32> %2255, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %2257 = icmp slt <4 x i32> %2249, %2256, !dbg !79 + %2258 = select <4 x i1> %2257, <4 x i32> %1219, <4 x i32> splat (i32 16), !dbg !80 + %2259 = add <4 x i32> %2258, splat (i32 17), !dbg !81 + %2260 = icmp slt <4 x i32> %2253, zeroinitializer, !dbg !82 + %2261 = select <4 x i1> %2260, <4 x i32> %2254, <4 x i32> %2253, !dbg !83 + %2262 = icmp ugt <4 x i32> %2261, splat (i32 16), !dbg !84 + %2263 = icmp slt <4 x i32> %2258, zeroinitializer, !dbg !82 + %2264 = select <4 x i1> %2263, <4 x i32> %2259, <4 x i32> %2258, !dbg !83 + %2265 = icmp ugt <4 x i32> %2264, splat (i32 16), !dbg !84 + %2266 = bitcast <4 x i1> %2262 to i4, !dbg !85 + %2267 = icmp ne i4 %2266, 0, !dbg !85 + %2268 = and i1 %20, %2267, !dbg !85 + %2269 = bitcast <4 x i1> %2265 to i4, !dbg !85 + %2270 = icmp ne i4 %2269, 0, !dbg !85 + %2271 = and i1 %21, %2270, !dbg !85 + %2272 = or i1 %2268, %2271, !dbg !85 + br i1 %2272, label %2273, label %2274, !dbg !85 + +2273: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !85 + unreachable, !dbg !85 + +2274: ; preds = %11 + %2275 = trunc i64 %2240 to i32, !dbg !78 + %2276 = trunc i64 %2225 to i32, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !85 + %2277 = insertelement <4 x i32> poison, i32 %2276, i64 0, !dbg !86 + %2278 = shufflevector <4 x i32> %2277, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %2279 = icmp slt <4 x i32> %2249, %2278, !dbg !86 + %2280 = select <4 x i1> %2279, <4 x i32> %2145, <4 x i32> splat (i32 16), !dbg !87 + %2281 = add <4 x i32> %2280, splat (i32 17), !dbg !88 + %2282 = icmp slt <4 x i32> %2280, zeroinitializer, !dbg !89 + %2283 = select <4 x i1> %2282, <4 x i32> %2281, <4 x i32> %2280, !dbg !90 + %2284 = icmp ugt <4 x i32> %2283, splat (i32 16), !dbg !91 + %2285 = insertelement <4 x i32> poison, i32 %2275, i64 0, !dbg !86 + %2286 = shufflevector <4 x i32> %2285, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %2287 = icmp slt <4 x i32> %2249, %2286, !dbg !86 + %2288 = select <4 x i1> %2287, <4 x i32> %2147, <4 x i32> splat (i32 16), !dbg !87 + %2289 = add <4 x i32> %2288, splat (i32 17), !dbg !88 + %2290 = icmp slt <4 x i32> %2288, zeroinitializer, !dbg !89 + %2291 = select <4 x i1> %2290, <4 x i32> %2289, <4 x i32> %2288, !dbg !90 + %2292 = icmp ugt <4 x i32> %2291, splat (i32 16), !dbg !91 + %2293 = bitcast <4 x i1> %2284 to i4, !dbg !92 + %2294 = icmp ne i4 %2293, 0, !dbg !92 + %2295 = and i1 %20, %2294, !dbg !92 + %2296 = bitcast <4 x i1> %2292 to i4, !dbg !92 + %2297 = icmp ne i4 %2296, 0, !dbg !92 + %2298 = and i1 %21, %2297, !dbg !92 + %2299 = or i1 %2295, %2298, !dbg !92 + br i1 %2299, label %2300, label %2301, !dbg !92 + +2300: ; preds = %2274 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !92 + unreachable, !dbg !92 + +2301: ; preds = %2274 + %2302 = trunc i64 %2243 to i32, !dbg !78 + %2303 = trunc i64 %2194 to i32, !dbg !73 + %2304 = or disjoint i32 %13, %17, !dbg !13 + %2305 = icmp slt i32 %2304, 128, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !92 + %2306 = sext i32 %2304 to i64, !dbg !93 + %2307 = getelementptr i32, ptr addrspace(1) %1, i64 %2306, !dbg !93 + %2308 = and i32 %14, 128, !dbg !94 + %2309 = icmp eq i32 %2308, 0, !dbg !94 + %2310 = and i1 %2309, %2305, !dbg !94 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %2303, ptr addrspace(1) %2307, i1 %2310) #6, !dbg !94 + %2311 = getelementptr i32, ptr addrspace(1) %2, i64 %2306, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %2302, ptr addrspace(1) %2311, i1 %2310) #6, !dbg !96 + %2312 = getelementptr i32, ptr addrspace(1) %3, i64 %33, !dbg !97 + %2313 = getelementptr i32, ptr addrspace(1) %3, i64 %37, !dbg !97 + %2314 = extractelement <4 x i32> %1208, i64 0, !dbg !98 + %2315 = extractelement <4 x i32> %1208, i64 1, !dbg !98 + %2316 = extractelement <4 x i32> %1208, i64 2, !dbg !98 + %2317 = extractelement <4 x i32> %1208, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2314, i32 %2315, i32 %2316, i32 %2317, ptr addrspace(1) %2312, i1 %20) #6, !dbg !98 + %2318 = extractelement <4 x i32> %1219, i64 0, !dbg !98 + %2319 = extractelement <4 x i32> %1219, i64 1, !dbg !98 + %2320 = extractelement <4 x i32> %1219, i64 2, !dbg !98 + %2321 = extractelement <4 x i32> %1219, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2318, i32 %2319, i32 %2320, i32 %2321, ptr addrspace(1) %2313, i1 %21) #6, !dbg !98 + %2322 = mul i32 %18, 17, !dbg !99 + %2323 = mul i32 %19, 17, !dbg !99 + %2324 = extractelement <4 x i32> %2261, i64 0, !dbg !100 + %2325 = add i32 %2324, %2322, !dbg !100 + %2326 = extractelement <4 x i32> %2261, i64 1, !dbg !100 + %2327 = add i32 %2326, %2322, !dbg !100 + %2328 = extractelement <4 x i32> %2261, i64 2, !dbg !100 + %2329 = add i32 %2328, %2322, !dbg !100 + %2330 = extractelement <4 x i32> %2261, i64 3, !dbg !100 + %2331 = add i32 %2330, %2322, !dbg !100 + %2332 = extractelement <4 x i32> %2264, i64 0, !dbg !100 + %2333 = add i32 %2332, %2323, !dbg !100 + %2334 = extractelement <4 x i32> %2264, i64 1, !dbg !100 + %2335 = add i32 %2334, %2323, !dbg !100 + %2336 = extractelement <4 x i32> %2264, i64 2, !dbg !100 + %2337 = add i32 %2336, %2323, !dbg !100 + %2338 = extractelement <4 x i32> %2264, i64 3, !dbg !100 + %2339 = add i32 %2338, %2323, !dbg !100 + %2340 = sext i32 %2325 to i64, !dbg !101 + %2341 = getelementptr i32, ptr addrspace(1) %4, i64 %2340, !dbg !101 + %2342 = sext i32 %2327 to i64, !dbg !101 + %2343 = getelementptr i32, ptr addrspace(1) %4, i64 %2342, !dbg !101 + %2344 = sext i32 %2329 to i64, !dbg !101 + %2345 = getelementptr i32, ptr addrspace(1) %4, i64 %2344, !dbg !101 + %2346 = sext i32 %2331 to i64, !dbg !101 + %2347 = getelementptr i32, ptr addrspace(1) %4, i64 %2346, !dbg !101 + %2348 = sext i32 %2333 to i64, !dbg !101 + %2349 = getelementptr i32, ptr addrspace(1) %4, i64 %2348, !dbg !101 + %2350 = sext i32 %2335 to i64, !dbg !101 + %2351 = getelementptr i32, ptr addrspace(1) %4, i64 %2350, !dbg !101 + %2352 = sext i32 %2337 to i64, !dbg !101 + %2353 = getelementptr i32, ptr addrspace(1) %4, i64 %2352, !dbg !101 + %2354 = sext i32 %2339 to i64, !dbg !101 + %2355 = getelementptr i32, ptr addrspace(1) %4, i64 %2354, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %2356 = ptrtoint ptr addrspace(1) %2341 to i64, !dbg !102 + %2357 = ptrtoint ptr addrspace(1) %2343 to i64, !dbg !102 + %2358 = ptrtoint ptr addrspace(1) %2345 to i64, !dbg !102 + %2359 = ptrtoint ptr addrspace(1) %2347 to i64, !dbg !102 + %2360 = ptrtoint ptr addrspace(1) %2349 to i64, !dbg !102 + %2361 = ptrtoint ptr addrspace(1) %2351 to i64, !dbg !102 + %2362 = ptrtoint ptr addrspace(1) %2353 to i64, !dbg !102 + %2363 = ptrtoint ptr addrspace(1) %2355 to i64, !dbg !102 + %2364 = shl nuw nsw i32 %14, 9, !dbg !102 + %2365 = and i32 %2364, 12288, !dbg !102 + %2366 = shl nuw nsw i32 %22, 5, !dbg !102 + %2367 = shl nuw nsw i32 %15, 2, !dbg !102 + %2368 = or disjoint i32 %2365, %2366, !dbg !102 + %2369 = xor i32 %2368, %2367, !dbg !102 + %2370 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2369, !dbg !102 + %2371 = insertelement <2 x i64> poison, i64 %2356, i64 0, !dbg !102 + %2372 = insertelement <2 x i64> %2371, i64 %2358, i64 1, !dbg !102 + store <2 x i64> %2372, ptr addrspace(3) %2370, align 16, !dbg !102 + %2373 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 2048, !dbg !102 + %2374 = insertelement <2 x i64> poison, i64 %2357, i64 0, !dbg !102 + %2375 = insertelement <2 x i64> %2374, i64 %2359, i64 1, !dbg !102 + store <2 x i64> %2375, ptr addrspace(3) %2373, align 16, !dbg !102 + %2376 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 1024, !dbg !102 + %2377 = insertelement <2 x i64> poison, i64 %2360, i64 0, !dbg !102 + %2378 = insertelement <2 x i64> %2377, i64 %2362, i64 1, !dbg !102 + store <2 x i64> %2378, ptr addrspace(3) %2376, align 16, !dbg !102 + %2379 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 3072, !dbg !102 + %2380 = insertelement <2 x i64> poison, i64 %2361, i64 0, !dbg !102 + %2381 = insertelement <2 x i64> %2380, i64 %2363, i64 1, !dbg !102 + store <2 x i64> %2381, ptr addrspace(3) %2379, align 16, !dbg !102 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %2382 = shl nuw nsw i32 %14, 11, !dbg !102 + %2383 = and i32 %2382, 12288, !dbg !102 + %2384 = shl nuw nsw i32 %14, 4, !dbg !102 + %2385 = and i32 %2384, 4080, !dbg !102 + %2386 = or disjoint i32 %2383, %2385, !dbg !102 + %2387 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2386, !dbg !102 + %2388 = load i64, ptr addrspace(3) %2387, align 16, !dbg !102 + %2389 = getelementptr inbounds nuw i8, ptr addrspace(3) %2387, i32 8, !dbg !102 + %2390 = load i64, ptr addrspace(3) %2389, align 8, !dbg !102 + %2391 = xor i32 %2386, 32, !dbg !102 + %2392 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2391, !dbg !102 + %2393 = load i64, ptr addrspace(3) %2392, align 16, !dbg !102 + %2394 = getelementptr inbounds nuw i8, ptr addrspace(3) %2392, i32 8, !dbg !102 + %2395 = load i64, ptr addrspace(3) %2394, align 8, !dbg !102 + %2396 = xor i32 %2386, 64, !dbg !102 + %2397 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2396, !dbg !102 + %2398 = load i64, ptr addrspace(3) %2397, align 16, !dbg !102 + %2399 = getelementptr inbounds nuw i8, ptr addrspace(3) %2397, i32 8, !dbg !102 + %2400 = load i64, ptr addrspace(3) %2399, align 8, !dbg !102 + %2401 = xor i32 %2386, 96, !dbg !102 + %2402 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2401, !dbg !102 + %2403 = load i64, ptr addrspace(3) %2402, align 16, !dbg !102 + %2404 = getelementptr inbounds nuw i8, ptr addrspace(3) %2402, i32 8, !dbg !102 + %2405 = load i64, ptr addrspace(3) %2404, align 8, !dbg !102 + %2406 = inttoptr i64 %2388 to ptr addrspace(1), !dbg !102 + %2407 = inttoptr i64 %2390 to ptr addrspace(1), !dbg !102 + %2408 = inttoptr i64 %2393 to ptr addrspace(1), !dbg !102 + %2409 = inttoptr i64 %2395 to ptr addrspace(1), !dbg !102 + %2410 = inttoptr i64 %2398 to ptr addrspace(1), !dbg !102 + %2411 = inttoptr i64 %2400 to ptr addrspace(1), !dbg !102 + %2412 = inttoptr i64 %2403 to ptr addrspace(1), !dbg !102 + %2413 = inttoptr i64 %2405 to ptr addrspace(1), !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2406, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2407, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2408, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2409, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2410, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2411, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2412, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2413, i1 %2305) #6, !dbg !102 + %2414 = getelementptr i32, ptr addrspace(1) %5, i64 %33, !dbg !103 + %2415 = getelementptr i32, ptr addrspace(1) %5, i64 %37, !dbg !103 + %2416 = extractelement <4 x i32> %2145, i64 0, !dbg !104 + %2417 = extractelement <4 x i32> %2145, i64 1, !dbg !104 + %2418 = extractelement <4 x i32> %2145, i64 2, !dbg !104 + %2419 = extractelement <4 x i32> %2145, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2416, i32 %2417, i32 %2418, i32 %2419, ptr addrspace(1) %2414, i1 %20) #6, !dbg !104 + %2420 = extractelement <4 x i32> %2147, i64 0, !dbg !104 + %2421 = extractelement <4 x i32> %2147, i64 1, !dbg !104 + %2422 = extractelement <4 x i32> %2147, i64 2, !dbg !104 + %2423 = extractelement <4 x i32> %2147, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2420, i32 %2421, i32 %2422, i32 %2423, ptr addrspace(1) %2415, i1 %21) #6, !dbg !104 + %2424 = extractelement <4 x i32> %2283, i64 0, !dbg !105 + %2425 = add i32 %2424, %2322, !dbg !105 + %2426 = extractelement <4 x i32> %2283, i64 1, !dbg !105 + %2427 = add i32 %2426, %2322, !dbg !105 + %2428 = extractelement <4 x i32> %2283, i64 2, !dbg !105 + %2429 = add i32 %2428, %2322, !dbg !105 + %2430 = extractelement <4 x i32> %2283, i64 3, !dbg !105 + %2431 = add i32 %2430, %2322, !dbg !105 + %2432 = extractelement <4 x i32> %2291, i64 0, !dbg !105 + %2433 = add i32 %2432, %2323, !dbg !105 + %2434 = extractelement <4 x i32> %2291, i64 1, !dbg !105 + %2435 = add i32 %2434, %2323, !dbg !105 + %2436 = extractelement <4 x i32> %2291, i64 2, !dbg !105 + %2437 = add i32 %2436, %2323, !dbg !105 + %2438 = extractelement <4 x i32> %2291, i64 3, !dbg !105 + %2439 = add i32 %2438, %2323, !dbg !105 + %2440 = sext i32 %2425 to i64, !dbg !106 + %2441 = getelementptr i32, ptr addrspace(1) %6, i64 %2440, !dbg !106 + %2442 = sext i32 %2427 to i64, !dbg !106 + %2443 = getelementptr i32, ptr addrspace(1) %6, i64 %2442, !dbg !106 + %2444 = sext i32 %2429 to i64, !dbg !106 + %2445 = getelementptr i32, ptr addrspace(1) %6, i64 %2444, !dbg !106 + %2446 = sext i32 %2431 to i64, !dbg !106 + %2447 = getelementptr i32, ptr addrspace(1) %6, i64 %2446, !dbg !106 + %2448 = sext i32 %2433 to i64, !dbg !106 + %2449 = getelementptr i32, ptr addrspace(1) %6, i64 %2448, !dbg !106 + %2450 = sext i32 %2435 to i64, !dbg !106 + %2451 = getelementptr i32, ptr addrspace(1) %6, i64 %2450, !dbg !106 + %2452 = sext i32 %2437 to i64, !dbg !106 + %2453 = getelementptr i32, ptr addrspace(1) %6, i64 %2452, !dbg !106 + %2454 = sext i32 %2439 to i64, !dbg !106 + %2455 = getelementptr i32, ptr addrspace(1) %6, i64 %2454, !dbg !106 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %2456 = ptrtoint ptr addrspace(1) %2441 to i64, !dbg !107 + %2457 = ptrtoint ptr addrspace(1) %2443 to i64, !dbg !107 + %2458 = ptrtoint ptr addrspace(1) %2445 to i64, !dbg !107 + %2459 = ptrtoint ptr addrspace(1) %2447 to i64, !dbg !107 + %2460 = ptrtoint ptr addrspace(1) %2449 to i64, !dbg !107 + %2461 = ptrtoint ptr addrspace(1) %2451 to i64, !dbg !107 + %2462 = ptrtoint ptr addrspace(1) %2453 to i64, !dbg !107 + %2463 = ptrtoint ptr addrspace(1) %2455 to i64, !dbg !107 + %2464 = insertelement <2 x i64> poison, i64 %2456, i64 0, !dbg !107 + %2465 = insertelement <2 x i64> %2464, i64 %2458, i64 1, !dbg !107 + store <2 x i64> %2465, ptr addrspace(3) %2370, align 16, !dbg !107 + %2466 = insertelement <2 x i64> poison, i64 %2457, i64 0, !dbg !107 + %2467 = insertelement <2 x i64> %2466, i64 %2459, i64 1, !dbg !107 + store <2 x i64> %2467, ptr addrspace(3) %2373, align 16, !dbg !107 + %2468 = insertelement <2 x i64> poison, i64 %2460, i64 0, !dbg !107 + %2469 = insertelement <2 x i64> %2468, i64 %2462, i64 1, !dbg !107 + store <2 x i64> %2469, ptr addrspace(3) %2376, align 16, !dbg !107 + %2470 = insertelement <2 x i64> poison, i64 %2461, i64 0, !dbg !107 + %2471 = insertelement <2 x i64> %2470, i64 %2463, i64 1, !dbg !107 + store <2 x i64> %2471, ptr addrspace(3) %2379, align 16, !dbg !107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %2472 = load i64, ptr addrspace(3) %2387, align 16, !dbg !107 + %2473 = load i64, ptr addrspace(3) %2389, align 8, !dbg !107 + %2474 = load i64, ptr addrspace(3) %2392, align 16, !dbg !107 + %2475 = load i64, ptr addrspace(3) %2394, align 8, !dbg !107 + %2476 = load i64, ptr addrspace(3) %2397, align 16, !dbg !107 + %2477 = load i64, ptr addrspace(3) %2399, align 8, !dbg !107 + %2478 = load i64, ptr addrspace(3) %2402, align 16, !dbg !107 + %2479 = load i64, ptr addrspace(3) %2404, align 8, !dbg !107 + %2480 = inttoptr i64 %2472 to ptr addrspace(1), !dbg !107 + %2481 = inttoptr i64 %2473 to ptr addrspace(1), !dbg !107 + %2482 = inttoptr i64 %2474 to ptr addrspace(1), !dbg !107 + %2483 = inttoptr i64 %2475 to ptr addrspace(1), !dbg !107 + %2484 = inttoptr i64 %2476 to ptr addrspace(1), !dbg !107 + %2485 = inttoptr i64 %2477 to ptr addrspace(1), !dbg !107 + %2486 = inttoptr i64 %2478 to ptr addrspace(1), !dbg !107 + %2487 = inttoptr i64 %2479 to ptr addrspace(1), !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2480, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2481, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2482, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2483, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2484, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2485, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2486, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2487, i1 %2305) #6, !dbg !107 + ret void, !dbg !108 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i4 @llvm.ctpop.i4(i4) #5 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 34, column: 37, scope: !9) +!18 = !DILocation(line: 34, column: 30, scope: !9) +!19 = !DILocation(line: 34, column: 45, scope: !9) +!20 = !DILocation(line: 627, column: 44, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !9, file: !22, discriminator: 0) +!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!23 = !DILocation(line: 46, column: 71, scope: !9) +!24 = !DILocation(line: 537, column: 21, scope: !21, inlinedAt: !23) +!25 = !DILocation(line: 601, column: 48, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 599, column: 28, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 47, column: 20, scope: !9) +!28 = !DILocation(line: 0, scope: !9) +!29 = !DILocation(line: 574, column: 22, scope: !21, inlinedAt: !30) +!30 = !DILocation(line: 51, column: 71, scope: !9) +!31 = !DILocation(line: 599, column: 19, scope: !21, inlinedAt: !30) +!32 = !DILocation(line: 600, column: 38, scope: !21, inlinedAt: !30) +!33 = !DILocation(line: 600, column: 46, scope: !21, inlinedAt: !30) +!34 = !DILocation(line: 600, column: 15, scope: !21, inlinedAt: !30) +!35 = !DILocation(line: 601, column: 59, scope: !21, inlinedAt: !30) +!36 = !DILocation(line: 601, column: 22, scope: !21, inlinedAt: !30) +!37 = !DILocation(line: 599, column: 28, scope: !21, inlinedAt: !30) +!38 = !DILocation(line: 601, column: 48, scope: !21, inlinedAt: !30) +!39 = !DILocation(line: 538, column: 40, scope: !21, inlinedAt: !30) +!40 = !DILocation(line: 539, column: 41, scope: !21, inlinedAt: !30) +!41 = !DILocation(line: 551, column: 23, scope: !21, inlinedAt: !30) +!42 = !DILocation(line: 548, column: 23, scope: !21, inlinedAt: !30) +!43 = !DILocation(line: 39, column: 18, scope: !9) +!44 = !DILocation(line: 574, column: 22, scope: !21, inlinedAt: !23) +!45 = !DILocation(line: 599, column: 19, scope: !21, inlinedAt: !23) +!46 = !DILocation(line: 600, column: 38, scope: !21, inlinedAt: !23) +!47 = !DILocation(line: 600, column: 46, scope: !21, inlinedAt: !23) +!48 = !DILocation(line: 600, column: 15, scope: !21, inlinedAt: !23) +!49 = !DILocation(line: 601, column: 59, scope: !21, inlinedAt: !23) +!50 = !DILocation(line: 601, column: 22, scope: !21, inlinedAt: !23) +!51 = !DILocation(line: 538, column: 40, scope: !21, inlinedAt: !23) +!52 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !23) +!53 = distinct !DILexicalBlockFile(scope: !9, file: !54, discriminator: 0) +!54 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!55 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !23) +!56 = !DILocation(line: 539, column: 41, scope: !21, inlinedAt: !23) +!57 = !DILocation(line: 548, column: 23, scope: !21, inlinedAt: !23) +!58 = !DILocation(line: 551, column: 23, scope: !21, inlinedAt: !23) +!59 = !DILocation(line: 54, column: 35, scope: !9) +!60 = !DILocation(line: 591, column: 21, scope: !21, inlinedAt: !23) +!61 = !DILocation(line: 594, column: 40, scope: !21, inlinedAt: !23) +!62 = !DILocation(line: 594, column: 29, scope: !21, inlinedAt: !23) +!63 = !DILocation(line: 594, column: 23, scope: !21, inlinedAt: !23) +!64 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !30) +!65 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !30) +!66 = !DILocation(line: 591, column: 21, scope: !21, inlinedAt: !30) +!67 = !DILocation(line: 594, column: 40, scope: !21, inlinedAt: !30) +!68 = !DILocation(line: 594, column: 29, scope: !21, inlinedAt: !30) +!69 = !DILocation(line: 594, column: 23, scope: !21, inlinedAt: !30) +!70 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !71) +!71 = !DILocation(line: 55, column: 26, scope: !9) +!72 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !71) +!73 = !DILocation(line: 60, column: 21, scope: !9) +!74 = !DILocation(line: 58, column: 35, scope: !9) +!75 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !76) +!76 = !DILocation(line: 59, column: 26, scope: !9) +!77 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !76) +!78 = !DILocation(line: 61, column: 21, scope: !9) +!79 = !DILocation(line: 64, column: 19, scope: !9) +!80 = !DILocation(line: 66, column: 35, scope: !9) +!81 = !DILocation(line: 68, column: 20, scope: !9) +!82 = !DILocation(line: 69, column: 20, scope: !9) +!83 = !DILocation(line: 70, column: 35, scope: !9) +!84 = !DILocation(line: 71, column: 38, scope: !9) +!85 = !DILocation(line: 71, column: 63, scope: !9) +!86 = !DILocation(line: 75, column: 19, scope: !9) +!87 = !DILocation(line: 76, column: 35, scope: !9) +!88 = !DILocation(line: 77, column: 20, scope: !9) +!89 = !DILocation(line: 78, column: 20, scope: !9) +!90 = !DILocation(line: 79, column: 35, scope: !9) +!91 = !DILocation(line: 80, column: 38, scope: !9) +!92 = !DILocation(line: 80, column: 63, scope: !9) +!93 = !DILocation(line: 81, column: 25, scope: !9) +!94 = !DILocation(line: 81, column: 37, scope: !9) +!95 = !DILocation(line: 82, column: 25, scope: !9) +!96 = !DILocation(line: 82, column: 37, scope: !9) +!97 = !DILocation(line: 83, column: 25, scope: !9) +!98 = !DILocation(line: 83, column: 47, scope: !9) +!99 = !DILocation(line: 84, column: 52, scope: !9) +!100 = !DILocation(line: 84, column: 49, scope: !9) +!101 = !DILocation(line: 84, column: 25, scope: !9) +!102 = !DILocation(line: 84, column: 85, scope: !9) +!103 = !DILocation(line: 85, column: 25, scope: !9) +!104 = !DILocation(line: 85, column: 47, scope: !9) +!105 = !DILocation(line: 86, column: 49, scope: !9) +!106 = !DILocation(line: 86, column: 25, scope: !9) +!107 = !DILocation(line: 86, column: 85, scope: !9) +!108 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..39ed9252efd84c362182f808f6da6afcff085f76 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,3491 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 117, 98, 47, 99, 117, 98, 99, 122, 97, 98, 114, 102, 50, 112, 116, 114, 121, 113, 50, 97, 116, 104, 110, 109, 114, 117, 50, 98, 121, 105, 112, 98, 103, 97, 114, 105, 111, 97, 51, 52, 54, 50, 112, 117, 120, 118, 50, 106, 119, 118, 54, 118, 109, 52, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 117, 98, 47, 99, 117, 98, 99, 122, 97, 98, 114, 102, 50, 112, 116, 114, 121, 113, 50, 97, 116, 104, 110, 109, 114, 117, 50, 98, 121, 105, 112, 98, 103, 97, 114, 105, 111, 97, 51, 52, 54, 50, 112, 117, 120, 118, 50, 106, 119, 118, 54, 118, 109, 52, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 256 +{ + .reg .pred %p<652>; + .reg .b16 %rs<97>; + .reg .b32 %r<1554>; + .reg .b64 %rd<138>; + .loc 1 18 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:18:0 + +// %bb.0: + ld.param.b64 %rd25, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:24:28 + mov.u32 %r44, %ctaid.x; + .loc 1 24 33 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:24:33 + shl.b32 %r1, %r44, 7; + .loc 1 25 44 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r45, %r2, 2, 6; + and.b32 %r4, %r2, 127; + .loc 1 25 23 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:25:23 + or.b32 %r5, %r45, %r1; + or.b32 %r6, %r5, 64; + .loc 1 26 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:26:21 + setp.lt.s32 %p2, %r5, 128; + setp.lt.s32 %p4, %r6, 128; + .loc 1 27 38 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:27:38 + and.b32 %r7, %r2, 3; + shl.b32 %r24, %r7, 2; + or.b32 %r25, %r24, 1; + or.b32 %r26, %r24, 2; + or.b32 %r27, %r24, 3; + .loc 1 34 40 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:34:40 + shl.b32 %r46, %r5, 4; + shl.b32 %r47, %r6, 4; + .loc 1 34 37 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:34:37 + or.b32 %r48, %r46, %r24; + or.b32 %r49, %r47, %r24; + .loc 1 34 30 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:34:30 + mad.wide.s32 %rd15, %r48, 8, %rd25; + cvt.s64.s32 %rd26, %r46; + cvt.u64.u32 %rd27, %r24; + or.b64 %rd28, %rd26, %rd27; + shl.b64 %rd29, %rd28, 3; + add.s64 %rd30, %rd25, %rd29; + add.s64 %rd18, %rd30, 16; + mad.wide.s32 %rd21, %r49, 8, %rd25; + cvt.s64.s32 %rd31, %r47; + or.b64 %rd32, %rd31, %rd27; + shl.b64 %rd33, %rd32, 3; + add.s64 %rd34, %rd25, %rd33; + add.s64 %rd24, %rd34, 16; + .loc 1 34 45 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:34:45 + // begin inline asm + mov.u64 %rd13, 0x0; + mov.u64 %rd14, 0x0; + @%p2 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd16, 0x0; + mov.u64 %rd17, 0x0; + @%p2 ld.global.v2.b64 { %rd16, %rd17 }, [ %rd18 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + mov.u64 %rd20, 0x0; + @%p4 ld.global.v2.b64 { %rd19, %rd20 }, [ %rd21 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + mov.u64 %rd23, 0x0; + @%p4 ld.global.v2.b64 { %rd22, %rd23 }, [ %rd24 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.b32 %r50, %r2, 1; + shr.u32 %r51, %r2, 1; + bfe.u32 %r52, %r2, 1, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r53, %r50, 1; + xor.b32 %r54, %r52, 1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r55, %r25, %r24; + xor.b32 %r56, %r26, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ne.b32 %p5, %r50, 0; + and.b32 %r57, %r51, 1; + setp.ne.b32 %p6, %r57, 0; +$L__tmp2: + .loc 1 47 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:47:20 + setp.ne.b64 %p7, %rd13, 16384; + setp.eq.b64 %p8, %rd13, 16384; + setp.eq.b64 %p9, %rd14, 16384; + setp.eq.b64 %p10, %rd16, 16384; + setp.ne.b64 %p11, %rd17, 16384; + setp.eq.b64 %p12, %rd17, 16384; + setp.ne.b64 %p13, %rd19, 16384; + setp.eq.b64 %p14, %rd19, 16384; + setp.eq.b64 %p15, %rd20, 16384; + setp.eq.b64 %p16, %rd22, 16384; + setp.ne.b64 %p17, %rd23, 16384; + setp.eq.b64 %p18, %rd23, 16384; + .loc 1 0 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:0 + selp.b32 %r58, 1, 0, %p8; + selp.b32 %r59, 1, 0, %p9; + selp.b32 %r60, 1, 0, %p10; + selp.b32 %r61, 1, 0, %p12; + selp.b32 %r62, 1, 0, %p14; + selp.b32 %r63, 1, 0, %p15; + selp.b32 %r64, 1, 0, %p16; + selp.b32 %r65, 1, 0, %p18; +$L__tmp3: + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + and.pred %p19, %p9, %p7; + and.pred %p20, %p15, %p13; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + or.pred %p21, %p10, %p11; + or.pred %p22, %p16, %p17; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r66, %r58, %r59; + xor.b32 %r67, %r60, %r61; + xor.b32 %r68, %r62, %r63; + xor.b32 %r69, %r64, %r65; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r70, %r66, 0, %p19; + selp.b32 %r71, %r67, 0, %p21; + selp.b32 %r72, %r68, 0, %p20; + selp.b32 %r73, %r69, 0, %p22; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r74, %r70, %r58; + xor.b32 %r75, %r70, %r59; + xor.b32 %r76, %r71, %r60; + xor.b32 %r77, %r71, %r61; + xor.b32 %r78, %r72, %r62; + xor.b32 %r79, %r72, %r63; + xor.b32 %r80, %r73, %r64; + xor.b32 %r81, %r73, %r65; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r82, %r55, 0, %p19; + selp.b32 %r83, %r56, 0, %p21; + selp.b32 %r84, %r55, 0, %p20; + selp.b32 %r85, %r56, 0, %p22; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r86, %r82, %r24; + xor.b32 %r87, %r82, %r25; + xor.b32 %r88, %r83, %r26; + xor.b32 %r89, %r83, %r27; + xor.b32 %r90, %r84, %r24; + xor.b32 %r91, %r84, %r25; + xor.b32 %r92, %r85, %r26; + xor.b32 %r93, %r85, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.ge.u32 %p23, %r74, %r76; + setp.ne.b32 %p24, %r74, %r76; + setp.le.u32 %p25, %r86, %r88; + or.pred %p26, %p25, %p24; + and.pred %p27, %p23, %p26; + xor.pred %p28, %p27, %p5; + setp.ge.u32 %p29, %r75, %r77; + setp.ne.b32 %p30, %r75, %r77; + setp.le.u32 %p31, %r87, %r89; + or.pred %p32, %p31, %p30; + and.pred %p33, %p29, %p32; + xor.pred %p34, %p33, %p5; + setp.ge.u32 %p35, %r78, %r80; + setp.ne.b32 %p36, %r78, %r80; + setp.le.u32 %p37, %r90, %r92; + or.pred %p38, %p37, %p36; + and.pred %p39, %p35, %p38; + xor.pred %p40, %p39, %p5; + setp.ge.u32 %p41, %r79, %r81; + setp.ne.b32 %p42, %r79, %r81; + setp.le.u32 %p43, %r91, %r93; + or.pred %p44, %p43, %p42; + and.pred %p45, %p41, %p44; + xor.pred %p46, %p45, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r94, %r76, %r74; + xor.b32 %r95, %r77, %r75; + xor.b32 %r96, %r80, %r78; + xor.b32 %r97, %r81, %r79; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r98, 0, %r94, %p28; + selp.b32 %r99, 0, %r95, %p34; + selp.b32 %r100, 0, %r96, %p40; + selp.b32 %r101, 0, %r97, %p46; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r102, %r98, %r74; + xor.b32 %r103, %r99, %r75; + xor.b32 %r104, %r98, %r76; + xor.b32 %r105, %r99, %r77; + xor.b32 %r106, %r100, %r78; + xor.b32 %r107, %r101, %r79; + xor.b32 %r108, %r100, %r80; + xor.b32 %r109, %r101, %r81; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r110, %r88, %r86; + xor.b32 %r111, %r89, %r87; + xor.b32 %r112, %r92, %r90; + xor.b32 %r113, %r93, %r91; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r114, 0, %r110, %p28; + selp.b32 %r115, 0, %r111, %p34; + selp.b32 %r116, 0, %r112, %p40; + selp.b32 %r117, 0, %r113, %p46; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r118, %r114, %r86; + xor.b32 %r119, %r115, %r87; + xor.b32 %r120, %r114, %r88; + xor.b32 %r121, %r115, %r89; + xor.b32 %r122, %r116, %r90; + xor.b32 %r123, %r117, %r91; + xor.b32 %r124, %r116, %r92; + xor.b32 %r125, %r117, %r93; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.ge.u32 %p47, %r102, %r103; + setp.ne.b32 %p48, %r102, %r103; + setp.le.u32 %p49, %r118, %r119; + or.pred %p50, %p48, %p49; + and.pred %p51, %p47, %p50; + xor.pred %p52, %p51, %p5; + setp.ge.u32 %p53, %r104, %r105; + setp.ne.b32 %p54, %r104, %r105; + setp.le.u32 %p55, %r120, %r121; + or.pred %p56, %p54, %p55; + and.pred %p57, %p53, %p56; + xor.pred %p58, %p57, %p5; + setp.ge.u32 %p59, %r106, %r107; + setp.ne.b32 %p60, %r106, %r107; + setp.le.u32 %p61, %r122, %r123; + or.pred %p62, %p60, %p61; + and.pred %p63, %p59, %p62; + xor.pred %p64, %p63, %p5; + setp.ge.u32 %p65, %r108, %r109; + setp.ne.b32 %p66, %r108, %r109; + setp.le.u32 %p67, %r124, %r125; + or.pred %p68, %p66, %p67; + and.pred %p69, %p65, %p68; + xor.pred %p70, %p69, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r126, %r102, %r103; + xor.b32 %r127, %r104, %r105; + xor.b32 %r128, %r106, %r107; + xor.b32 %r129, %r108, %r109; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r130, 0, %r126, %p52; + selp.b32 %r131, 0, %r127, %p58; + selp.b32 %r132, 0, %r128, %p64; + selp.b32 %r133, 0, %r129, %p70; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r134, %r130, %r102; + xor.b32 %r135, %r130, %r103; + xor.b32 %r136, %r131, %r104; + xor.b32 %r137, %r131, %r105; + xor.b32 %r138, %r132, %r106; + xor.b32 %r139, %r132, %r107; + xor.b32 %r140, %r133, %r108; + xor.b32 %r141, %r133, %r109; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r142, %r118, %r119; + xor.b32 %r143, %r120, %r121; + xor.b32 %r144, %r122, %r123; + xor.b32 %r145, %r124, %r125; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r146, 0, %r142, %p52; + selp.b32 %r147, 0, %r143, %p58; + selp.b32 %r148, 0, %r144, %p64; + selp.b32 %r149, 0, %r145, %p70; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r150, %r134, %r53; + mul.lo.s32 %r151, %r135, %r53; + mul.lo.s32 %r152, %r136, %r53; + mul.lo.s32 %r153, %r137, %r53; + mul.lo.s32 %r154, %r138, %r53; + mul.lo.s32 %r155, %r139, %r53; + mul.lo.s32 %r156, %r140, %r53; + mul.lo.s32 %r157, %r141, %r53; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r158, %r134, %r50; + mul.lo.s32 %r159, %r135, %r50; + mul.lo.s32 %r160, %r136, %r50; + mul.lo.s32 %r161, %r137, %r50; + mul.lo.s32 %r162, %r138, %r50; + mul.lo.s32 %r163, %r139, %r50; + mul.lo.s32 %r164, %r140, %r50; + mul.lo.s32 %r165, %r141, %r50; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r166, %r147, %r121; + xor.b32 %r167, %r147, %r120; + xor.b32 %r168, %r146, %r119; + xor.b32 %r169, %r146, %r118; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r170, %r169, %r53; + mul.lo.s32 %r171, %r168, %r53; + mul.lo.s32 %r172, %r167, %r53; + mul.lo.s32 %r173, %r166, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r174, %r169, %r50; + mul.lo.s32 %r175, %r168, %r50; + mul.lo.s32 %r176, %r167, %r50; + mul.lo.s32 %r177, %r166, %r50; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r178, %r149, %r125; + xor.b32 %r179, %r149, %r124; + xor.b32 %r180, %r148, %r123; + xor.b32 %r181, %r148, %r122; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r182, %r181, %r53; + mul.lo.s32 %r183, %r180, %r53; + mul.lo.s32 %r184, %r179, %r53; + mul.lo.s32 %r185, %r178, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r186, %r181, %r50; + mul.lo.s32 %r187, %r180, %r50; + mul.lo.s32 %r188, %r179, %r50; + mul.lo.s32 %r189, %r178, %r50; +$L__tmp4: + .loc 1 39 18 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:39:18 + add.s64 %rd35, %rd14, -1; + add.s64 %rd36, %rd16, -1; + add.s64 %rd37, %rd13, -1; + add.s64 %rd38, %rd17, -1; + setp.gt.u64 %p71, %rd38, 16382; + setp.gt.u64 %p72, %rd37, 16382; + setp.lt.u64 %p73, %rd38, 16383; + setp.lt.u64 %p74, %rd36, 16383; + setp.lt.u64 %p75, %rd35, 16383; + setp.lt.u64 %p76, %rd37, 16383; + .loc 1 0 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:0 + selp.b32 %r190, 1, 0, %p76; + selp.b32 %r191, 1, 0, %p75; + selp.b32 %r192, 1, 0, %p74; + selp.b32 %r193, 1, 0, %p73; +$L__tmp5: + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.pred %p77, %p75, %p72; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + or.pred %p78, %p74, %p71; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r194, %r190, %r191; + xor.b32 %r195, %r192, %r193; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r196, %r194, 0, %p77; + selp.b32 %r197, %r195, 0, %p78; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r198, %r196, %r190; + xor.b32 %r199, %r196, %r191; + xor.b32 %r200, %r197, %r192; + xor.b32 %r201, %r197, %r193; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r202, %r55, 0, %p77; + selp.b32 %r203, %r56, 0, %p78; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r204, %r202, %r24; + xor.b32 %r205, %r202, %r25; + xor.b32 %r206, %r203, %r26; + xor.b32 %r207, %r203, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.u32 %p79, %r198, %r200; + setp.ge.u32 %p80, %r199, %r201; + setp.ne.b32 %p81, %r199, %r201; + setp.ne.b32 %p82, %r200, %r198; + setp.le.u32 %p83, %r205, %r207; + setp.le.u32 %p84, %r204, %r206; + or.pred %p85, %p84, %p82; + or.pred %p86, %p83, %p81; + and.pred %p87, %p80, %p86; + and.pred %p88, %p79, %p85; + xor.pred %p89, %p88, %p5; + xor.pred %p90, %p87, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r208, %r198, %r200; + xor.b32 %r209, %r201, %r199; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r210, 0, %r209, %p90; + selp.b32 %r211, 0, %r208, %p89; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r212, %r211, %r198; + xor.b32 %r213, %r210, %r199; + xor.b32 %r214, %r210, %r201; + xor.b32 %r215, %r211, %r200; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r216, %r206, %r204; + xor.b32 %r217, %r207, %r205; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r218, 0, %r216, %p89; + selp.b32 %r219, 0, %r217, %p90; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r220, %r218, %r204; + xor.b32 %r221, %r219, %r205; + xor.b32 %r222, %r218, %r206; + xor.b32 %r223, %r219, %r207; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.u32 %p91, %r212, %r213; + setp.ne.b32 %p92, %r212, %r213; + setp.le.u32 %p93, %r220, %r221; + or.pred %p94, %p92, %p93; + and.pred %p95, %p91, %p94; + xor.pred %p96, %p95, %p5; + setp.ge.u32 %p97, %r215, %r214; + setp.ne.b32 %p98, %r215, %r214; + setp.le.u32 %p99, %r222, %r223; + or.pred %p100, %p98, %p99; + and.pred %p101, %p97, %p100; + xor.pred %p102, %p101, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r224, %r212, %r213; + xor.b32 %r225, %r215, %r214; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r226, 0, %r224, %p96; + selp.b32 %r227, 0, %r225, %p102; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r228, %r226, %r212; + xor.b32 %r229, %r226, %r213; + xor.b32 %r230, %r227, %r215; + xor.b32 %r231, %r227, %r214; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r232, %r220, %r221; + xor.b32 %r233, %r222, %r223; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r234, 0, %r232, %p96; + selp.b32 %r235, 0, %r233, %p102; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r236, %r234, %r220; + xor.b32 %r237, %r234, %r221; + xor.b32 %r238, %r235, %r222; + xor.b32 %r239, %r235, %r223; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r240, %r228, %r53; + mul.lo.s32 %r241, %r229, %r53; + mul.lo.s32 %r242, %r230, %r53; + mul.lo.s32 %r243, %r231, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r244, %r240, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r245, %r240, %r244; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r246, %r241, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r247, %r241, %r246; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r248, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r249, %r242, %r248; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r250, %r243, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r251, %r243, %r250; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r252, %r228, %r50; + mul.lo.s32 %r253, %r229, %r50; + mul.lo.s32 %r254, %r230, %r50; + mul.lo.s32 %r255, %r231, %r50; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r256, %r236, %r53; + mul.lo.s32 %r257, %r237, %r53; + mul.lo.s32 %r258, %r238, %r53; + mul.lo.s32 %r259, %r239, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r260, %r236, %r50; + mul.lo.s32 %r261, %r237, %r50; + mul.lo.s32 %r262, %r238, %r50; + mul.lo.s32 %r263, %r239, %r50; +$L__tmp6: + .loc 1 54 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:54:35 + and.pred %p103, %p2, %p74; + selp.b16 %rs1, 1, 0, %p103; + shl.b16 %rs2, %rs1, 2; + and.pred %p104, %p2, %p73; + selp.b16 %rs3, -1, 0, %p104; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + and.pred %p105, %p2, %p76; + selp.b16 %rs6, 1, 0, %p105; + and.pred %p106, %p2, %p75; + selp.b16 %rs7, -1, 0, %p106; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; + .loc 1 39 18 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:39:18 + add.s64 %rd39, %rd20, -1; + add.s64 %rd40, %rd22, -1; + add.s64 %rd41, %rd19, -1; + add.s64 %rd42, %rd23, -1; + setp.gt.u64 %p107, %rd42, 16382; + setp.gt.u64 %p108, %rd41, 16382; + setp.lt.u64 %p109, %rd42, 16383; + setp.lt.u64 %p110, %rd40, 16383; + setp.lt.u64 %p111, %rd39, 16383; + setp.lt.u64 %p112, %rd41, 16383; + .loc 1 0 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:0 + selp.b32 %r264, 1, 0, %p112; + selp.b32 %r265, 1, 0, %p111; + selp.b32 %r266, 1, 0, %p110; + selp.b32 %r267, 1, 0, %p109; +$L__tmp7: + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.pred %p113, %p111, %p108; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + or.pred %p114, %p110, %p107; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r268, %r264, %r265; + xor.b32 %r269, %r266, %r267; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r270, %r268, 0, %p113; + selp.b32 %r271, %r269, 0, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r272, %r270, %r264; + xor.b32 %r273, %r270, %r265; + xor.b32 %r274, %r271, %r266; + xor.b32 %r275, %r271, %r267; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r276, %r55, 0, %p113; + selp.b32 %r277, %r56, 0, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r278, %r276, %r24; + xor.b32 %r279, %r276, %r25; + xor.b32 %r280, %r277, %r26; + xor.b32 %r281, %r277, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.u32 %p115, %r272, %r274; + setp.ge.u32 %p116, %r273, %r275; + setp.ne.b32 %p117, %r273, %r275; + setp.ne.b32 %p118, %r274, %r272; + setp.le.u32 %p119, %r279, %r281; + setp.le.u32 %p120, %r278, %r280; + or.pred %p121, %p120, %p118; + or.pred %p122, %p119, %p117; + and.pred %p123, %p116, %p122; + and.pred %p124, %p115, %p121; + xor.pred %p125, %p124, %p5; + xor.pred %p126, %p123, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r282, %r272, %r274; + xor.b32 %r283, %r275, %r273; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r284, 0, %r283, %p126; + selp.b32 %r285, 0, %r282, %p125; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r286, %r285, %r272; + xor.b32 %r287, %r284, %r273; + xor.b32 %r288, %r284, %r275; + xor.b32 %r289, %r285, %r274; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r290, %r280, %r278; + xor.b32 %r291, %r281, %r279; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r292, 0, %r290, %p125; + selp.b32 %r293, 0, %r291, %p126; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r294, %r292, %r278; + xor.b32 %r295, %r293, %r279; + xor.b32 %r296, %r292, %r280; + xor.b32 %r297, %r293, %r281; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.u32 %p127, %r286, %r287; + setp.ne.b32 %p128, %r286, %r287; + setp.le.u32 %p129, %r294, %r295; + or.pred %p130, %p128, %p129; + and.pred %p131, %p127, %p130; + xor.pred %p132, %p131, %p5; + setp.ge.u32 %p133, %r289, %r288; + setp.ne.b32 %p134, %r289, %r288; + setp.le.u32 %p135, %r296, %r297; + or.pred %p136, %p134, %p135; + and.pred %p137, %p133, %p136; + xor.pred %p138, %p137, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r298, %r286, %r287; + xor.b32 %r299, %r289, %r288; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r300, 0, %r298, %p132; + selp.b32 %r301, 0, %r299, %p138; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r302, %r300, %r286; + xor.b32 %r303, %r300, %r287; + xor.b32 %r304, %r301, %r289; + xor.b32 %r305, %r301, %r288; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r306, %r294, %r295; + xor.b32 %r307, %r296, %r297; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r308, 0, %r306, %p132; + selp.b32 %r309, 0, %r307, %p138; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r310, %r308, %r294; + xor.b32 %r311, %r308, %r295; + xor.b32 %r312, %r309, %r296; + xor.b32 %r313, %r309, %r297; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r314, %r302, %r53; + mul.lo.s32 %r315, %r303, %r53; + mul.lo.s32 %r316, %r304, %r53; + mul.lo.s32 %r317, %r305, %r53; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r318, %r302, %r50; + mul.lo.s32 %r319, %r303, %r50; + mul.lo.s32 %r320, %r304, %r50; + mul.lo.s32 %r321, %r305, %r50; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r322, %r310, %r53; + mul.lo.s32 %r323, %r311, %r53; + mul.lo.s32 %r324, %r312, %r53; + mul.lo.s32 %r325, %r313, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r326, %r310, %r50; + mul.lo.s32 %r327, %r311, %r50; + mul.lo.s32 %r328, %r312, %r50; + mul.lo.s32 %r329, %r313, %r50; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r330, %r314, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r331, %r314, %r330; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r332, %r315, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r333, %r315, %r332; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r334, %r316, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r335, %r316, %r334; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r336, %r317, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r337, %r317, %r336; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r338, %r252, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r339, %r252, %r338; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r340, %r253, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r341, %r253, %r340; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r342, %r254, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r343, %r254, %r342; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r344, %r255, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r345, %r255, %r344; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r346, %r318, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r347, %r318, %r346; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r348, %r319, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r349, %r319, %r348; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r350, %r320, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r351, %r320, %r350; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r352, %r321, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r353, %r321, %r352; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r354, %r256, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r355, %r256, %r354; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r356, %r257, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r357, %r257, %r356; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r358, %r258, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r359, %r358, %r258; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r360, %r259, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r361, %r360, %r259; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r362, %r322, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r363, %r362, %r322; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r364, %r323, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r365, %r364, %r323; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r366, %r324, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r367, %r366, %r324; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r368, %r325, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r369, %r368, %r325; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r370, %r260, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r371, %r370, %r260; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r372, %r261, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r373, %r372, %r261; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r374, %r262, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r375, %r374, %r262; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r376, %r263, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r377, %r376, %r263; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r378, %r326, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r379, %r378, %r326; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r380, %r327, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r381, %r380, %r327; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r382, %r328, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r383, %r382, %r328; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r384, %r329, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r385, %r384, %r329; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.s32 %p139, %r245, %r339; + setp.ne.b32 %p140, %r245, %r339; + setp.le.s32 %p141, %r355, %r371; + or.pred %p142, %p140, %p141; + and.pred %p143, %p139, %p142; + xor.pred %p144, %p143, %p6; + setp.ge.s32 %p145, %r247, %r341; + setp.ne.b32 %p146, %r247, %r341; + setp.le.s32 %p147, %r357, %r373; + or.pred %p148, %p146, %p147; + and.pred %p149, %p145, %p148; + xor.pred %p150, %p149, %p6; + setp.ge.s32 %p151, %r249, %r343; + setp.ne.b32 %p152, %r249, %r343; + setp.le.s32 %p153, %r359, %r375; + or.pred %p154, %p152, %p153; + and.pred %p155, %p151, %p154; + xor.pred %p156, %p155, %p6; + setp.ge.s32 %p157, %r251, %r345; + setp.ne.b32 %p158, %r251, %r345; + setp.le.s32 %p159, %r361, %r377; + or.pred %p160, %p158, %p159; + and.pred %p161, %p157, %p160; + xor.pred %p162, %p161, %p6; + setp.ge.s32 %p163, %r331, %r347; + setp.ne.b32 %p164, %r331, %r347; + setp.le.s32 %p165, %r363, %r379; + or.pred %p166, %p164, %p165; + and.pred %p167, %p163, %p166; + xor.pred %p168, %p167, %p6; + setp.ge.s32 %p169, %r333, %r349; + setp.ne.b32 %p170, %r333, %r349; + setp.le.s32 %p171, %r365, %r381; + or.pred %p172, %p170, %p171; + and.pred %p173, %p169, %p172; + xor.pred %p174, %p173, %p6; + setp.ge.s32 %p175, %r335, %r351; + setp.ne.b32 %p176, %r335, %r351; + setp.le.s32 %p177, %r367, %r383; + or.pred %p178, %p176, %p177; + and.pred %p179, %p175, %p178; + xor.pred %p180, %p179, %p6; + setp.ge.s32 %p181, %r337, %r353; + setp.ne.b32 %p182, %r337, %r353; + setp.le.s32 %p183, %r369, %r385; + or.pred %p184, %p182, %p183; + and.pred %p185, %p181, %p184; + xor.pred %p186, %p185, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r386, %r245, %r339; + xor.b32 %r387, %r247, %r341; + xor.b32 %r388, %r249, %r343; + xor.b32 %r389, %r251, %r345; + xor.b32 %r390, %r331, %r347; + xor.b32 %r391, %r333, %r349; + xor.b32 %r392, %r335, %r351; + xor.b32 %r393, %r337, %r353; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r394, 0, %r386, %p144; + selp.b32 %r395, 0, %r387, %p150; + selp.b32 %r396, 0, %r388, %p156; + selp.b32 %r397, 0, %r389, %p162; + selp.b32 %r398, 0, %r390, %p168; + selp.b32 %r399, 0, %r391, %p174; + selp.b32 %r400, 0, %r392, %p180; + selp.b32 %r401, 0, %r393, %p186; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r402, %r394, %r228; + xor.b32 %r403, %r395, %r229; + xor.b32 %r404, %r396, %r230; + xor.b32 %r405, %r397, %r231; + xor.b32 %r406, %r398, %r302; + xor.b32 %r407, %r399, %r303; + xor.b32 %r408, %r400, %r304; + xor.b32 %r409, %r401, %r305; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r410, %r371, %r355; + xor.b32 %r411, %r373, %r357; + xor.b32 %r412, %r375, %r359; + xor.b32 %r413, %r377, %r361; + xor.b32 %r414, %r379, %r363; + xor.b32 %r415, %r381, %r365; + xor.b32 %r416, %r383, %r367; + xor.b32 %r417, %r385, %r369; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r418, 0, %r410, %p144; + selp.b32 %r419, 0, %r411, %p150; + selp.b32 %r420, 0, %r412, %p156; + selp.b32 %r421, 0, %r413, %p162; + selp.b32 %r422, 0, %r414, %p168; + selp.b32 %r423, 0, %r415, %p174; + selp.b32 %r424, 0, %r416, %p180; + selp.b32 %r425, 0, %r417, %p186; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r426, %r418, %r236; + xor.b32 %r427, %r419, %r237; + xor.b32 %r428, %r420, %r238; + xor.b32 %r429, %r421, %r239; + xor.b32 %r430, %r422, %r310; + xor.b32 %r431, %r423, %r311; + xor.b32 %r432, %r424, %r312; + xor.b32 %r433, %r425, %r313; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.s32 %p187, %r402, %r404; + setp.ne.b32 %p188, %r402, %r404; + setp.le.s32 %p189, %r426, %r428; + or.pred %p190, %p188, %p189; + and.pred %p191, %p187, %p190; + xor.pred %p192, %p191, %p6; + setp.ge.s32 %p193, %r403, %r405; + setp.ne.b32 %p194, %r403, %r405; + setp.le.s32 %p195, %r427, %r429; + or.pred %p196, %p194, %p195; + and.pred %p197, %p193, %p196; + xor.pred %p198, %p197, %p6; + setp.ge.s32 %p199, %r406, %r408; + setp.ne.b32 %p200, %r406, %r408; + setp.le.s32 %p201, %r430, %r432; + or.pred %p202, %p200, %p201; + and.pred %p203, %p199, %p202; + xor.pred %p204, %p203, %p6; + setp.ge.s32 %p205, %r407, %r409; + setp.ne.b32 %p206, %r407, %r409; + setp.le.s32 %p207, %r431, %r433; + or.pred %p208, %p206, %p207; + and.pred %p209, %p205, %p208; + xor.pred %p210, %p209, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r434, %r404, %r402; + xor.b32 %r435, %r405, %r403; + xor.b32 %r436, %r408, %r406; + xor.b32 %r437, %r409, %r407; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r438, 0, %r434, %p192; + selp.b32 %r439, 0, %r435, %p198; + selp.b32 %r440, 0, %r436, %p204; + selp.b32 %r441, 0, %r437, %p210; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r442, %r438, %r402; + xor.b32 %r443, %r439, %r403; + xor.b32 %r444, %r438, %r404; + xor.b32 %r445, %r439, %r405; + xor.b32 %r446, %r440, %r406; + xor.b32 %r447, %r441, %r407; + xor.b32 %r448, %r440, %r408; + xor.b32 %r449, %r441, %r409; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r450, %r428, %r426; + xor.b32 %r451, %r429, %r427; + xor.b32 %r452, %r432, %r430; + xor.b32 %r453, %r433, %r431; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r454, 0, %r450, %p192; + selp.b32 %r455, 0, %r451, %p198; + selp.b32 %r456, 0, %r452, %p204; + selp.b32 %r457, 0, %r453, %p210; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r458, %r454, %r426; + xor.b32 %r459, %r455, %r427; + xor.b32 %r460, %r454, %r428; + xor.b32 %r461, %r455, %r429; + xor.b32 %r462, %r456, %r430; + xor.b32 %r463, %r457, %r431; + xor.b32 %r464, %r456, %r432; + xor.b32 %r465, %r457, %r433; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.ge.s32 %p211, %r442, %r443; + setp.ne.b32 %p212, %r442, %r443; + setp.le.s32 %p213, %r458, %r459; + or.pred %p214, %p212, %p213; + and.pred %p215, %p211, %p214; + xor.pred %p216, %p215, %p6; + setp.ge.s32 %p217, %r444, %r445; + setp.ne.b32 %p218, %r444, %r445; + setp.le.s32 %p219, %r460, %r461; + or.pred %p220, %p218, %p219; + and.pred %p221, %p217, %p220; + xor.pred %p222, %p221, %p6; + setp.ge.s32 %p223, %r446, %r447; + setp.ne.b32 %p224, %r446, %r447; + setp.le.s32 %p225, %r462, %r463; + or.pred %p226, %p224, %p225; + and.pred %p227, %p223, %p226; + xor.pred %p228, %p227, %p6; + setp.ge.s32 %p229, %r448, %r449; + setp.ne.b32 %p230, %r448, %r449; + setp.le.s32 %p231, %r464, %r465; + or.pred %p232, %p230, %p231; + and.pred %p233, %p229, %p232; + xor.pred %p234, %p233, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r466, %r443, %r442; + xor.b32 %r467, %r445, %r444; + xor.b32 %r468, %r447, %r446; + xor.b32 %r469, %r449, %r448; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r470, 0, %r466, %p216; + selp.b32 %r471, 0, %r467, %p222; + selp.b32 %r472, 0, %r468, %p228; + selp.b32 %r473, 0, %r469, %p234; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r474, %r470, %r442; + xor.b32 %r475, %r470, %r443; + xor.b32 %r476, %r471, %r444; + xor.b32 %r477, %r471, %r445; + xor.b32 %r478, %r472, %r446; + xor.b32 %r479, %r472, %r447; + xor.b32 %r480, %r473, %r448; + xor.b32 %r481, %r473, %r449; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r482, %r459, %r458; + xor.b32 %r483, %r461, %r460; + xor.b32 %r484, %r463, %r462; + xor.b32 %r485, %r465, %r464; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r486, 0, %r482, %p216; + selp.b32 %r487, 0, %r483, %p222; + selp.b32 %r488, 0, %r484, %p228; + selp.b32 %r489, 0, %r485, %p234; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r490, %r486, %r458; + xor.b32 %r491, %r486, %r459; + xor.b32 %r492, %r487, %r460; + xor.b32 %r493, %r487, %r461; + xor.b32 %r494, %r488, %r462; + xor.b32 %r495, %r488, %r463; + xor.b32 %r496, %r489, %r464; + xor.b32 %r497, %r489, %r465; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r498, %r474, %r54; + mul.lo.s32 %r499, %r475, %r54; + mul.lo.s32 %r500, %r476, %r54; + mul.lo.s32 %r501, %r477, %r54; + mul.lo.s32 %r502, %r478, %r54; + mul.lo.s32 %r503, %r479, %r54; + mul.lo.s32 %r504, %r480, %r54; + mul.lo.s32 %r505, %r481, %r54; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r506, %r498, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r507, %r498, %r506; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r508, %r499, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r509, %r499, %r508; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r510, %r500, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r511, %r500, %r510; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r512, %r501, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r513, %r501, %r512; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r514, %r502, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r515, %r502, %r514; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r516, %r503, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r517, %r503, %r516; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r518, %r504, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r519, %r504, %r518; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r520, %r505, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r521, %r505, %r520; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r522, %r474, %r52; + mul.lo.s32 %r523, %r475, %r52; + mul.lo.s32 %r524, %r476, %r52; + mul.lo.s32 %r525, %r477, %r52; + mul.lo.s32 %r526, %r478, %r52; + mul.lo.s32 %r527, %r479, %r52; + mul.lo.s32 %r528, %r480, %r52; + mul.lo.s32 %r529, %r481, %r52; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r530, %r522, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r531, %r522, %r530; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r532, %r523, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r533, %r523, %r532; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r534, %r524, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r535, %r524, %r534; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r536, %r525, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r537, %r525, %r536; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r538, %r526, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r539, %r526, %r538; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r540, %r527, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r541, %r527, %r540; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r542, %r528, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r543, %r528, %r542; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r544, %r529, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r545, %r529, %r544; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r546, %r490, %r54; + mul.lo.s32 %r547, %r491, %r54; + mul.lo.s32 %r548, %r492, %r54; + mul.lo.s32 %r549, %r493, %r54; + mul.lo.s32 %r550, %r494, %r54; + mul.lo.s32 %r551, %r495, %r54; + mul.lo.s32 %r552, %r496, %r54; + mul.lo.s32 %r553, %r497, %r54; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r554, %r546, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r555, %r554, %r546; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r556, %r547, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r557, %r556, %r547; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r558, %r548, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r559, %r558, %r548; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r560, %r549, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r561, %r560, %r549; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r562, %r550, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r563, %r562, %r550; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r564, %r551, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r565, %r564, %r551; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r566, %r552, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r567, %r566, %r552; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r568, %r553, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r569, %r568, %r553; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r570, %r490, %r52; + mul.lo.s32 %r571, %r491, %r52; + mul.lo.s32 %r572, %r492, %r52; + mul.lo.s32 %r573, %r493, %r52; + mul.lo.s32 %r574, %r494, %r52; + mul.lo.s32 %r575, %r495, %r52; + mul.lo.s32 %r576, %r496, %r52; + mul.lo.s32 %r577, %r497, %r52; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r578, %r570, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r579, %r578, %r570; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r580, %r571, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r581, %r580, %r571; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r582, %r572, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r583, %r582, %r572; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r584, %r573, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r585, %r584, %r573; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r586, %r574, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r587, %r586, %r574; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r588, %r575, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r589, %r588, %r575; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r590, %r576, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r591, %r590, %r576; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r592, %r577, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r593, %r592, %r577; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.lt.s32 %p235, %r507, %r531; + setp.lt.s32 %p236, %r509, %r533; + setp.lt.s32 %p237, %r511, %r535; + setp.lt.s32 %p238, %r513, %r537; + setp.lt.s32 %p239, %r515, %r539; + setp.lt.s32 %p240, %r517, %r541; + setp.lt.s32 %p241, %r519, %r543; + setp.lt.s32 %p242, %r521, %r545; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.eq.b32 %p243, %r507, %r531; + setp.eq.b32 %p244, %r509, %r533; + setp.eq.b32 %p245, %r511, %r535; + setp.eq.b32 %p246, %r513, %r537; + setp.eq.b32 %p247, %r515, %r539; + setp.eq.b32 %p248, %r517, %r541; + setp.eq.b32 %p249, %r519, %r543; + setp.eq.b32 %p250, %r521, %r545; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.gt.s32 %p251, %r555, %r579; + setp.gt.s32 %p252, %r557, %r581; + setp.gt.s32 %p253, %r559, %r583; + setp.gt.s32 %p254, %r561, %r585; + setp.gt.s32 %p255, %r563, %r587; + setp.gt.s32 %p256, %r565, %r589; + setp.gt.s32 %p257, %r567, %r591; + setp.gt.s32 %p258, %r569, %r593; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.pred %p259, %p243, %p251; + and.pred %p260, %p244, %p252; + and.pred %p261, %p245, %p253; + and.pred %p262, %p246, %p254; + and.pred %p263, %p247, %p255; + and.pred %p264, %p248, %p256; + and.pred %p265, %p249, %p257; + and.pred %p266, %p250, %p258; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + or.pred %p267, %p235, %p259; + or.pred %p268, %p236, %p260; + or.pred %p269, %p237, %p261; + or.pred %p270, %p238, %p262; + or.pred %p271, %p239, %p263; + or.pred %p272, %p240, %p264; + or.pred %p273, %p241, %p265; + or.pred %p274, %p242, %p266; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r594, %r507, %r531; + xor.b32 %r595, %r509, %r533; + xor.b32 %r596, %r511, %r535; + xor.b32 %r597, %r513, %r537; + xor.b32 %r598, %r515, %r539; + xor.b32 %r599, %r517, %r541; + xor.b32 %r600, %r519, %r543; + xor.b32 %r601, %r521, %r545; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r602, %r594, 0, %p267; + selp.b32 %r603, %r595, 0, %p268; + selp.b32 %r604, %r596, 0, %p269; + selp.b32 %r605, %r597, 0, %p270; + selp.b32 %r606, %r598, 0, %p271; + selp.b32 %r607, %r599, 0, %p272; + selp.b32 %r608, %r600, 0, %p273; + selp.b32 %r609, %r601, 0, %p274; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r610, %r602, %r474; + xor.b32 %r611, %r603, %r475; + xor.b32 %r612, %r604, %r476; + xor.b32 %r613, %r605, %r477; + xor.b32 %r614, %r606, %r478; + xor.b32 %r615, %r607, %r479; + xor.b32 %r616, %r608, %r480; + xor.b32 %r617, %r609, %r481; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r618, %r579, %r555; + xor.b32 %r619, %r581, %r557; + xor.b32 %r620, %r583, %r559; + xor.b32 %r621, %r585, %r561; + xor.b32 %r622, %r587, %r563; + xor.b32 %r623, %r589, %r565; + xor.b32 %r624, %r591, %r567; + xor.b32 %r625, %r593, %r569; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r626, %r618, 0, %p267; + selp.b32 %r627, %r619, 0, %p268; + selp.b32 %r628, %r620, 0, %p269; + selp.b32 %r629, %r621, 0, %p270; + selp.b32 %r630, %r622, 0, %p271; + selp.b32 %r631, %r623, 0, %p272; + selp.b32 %r632, %r624, 0, %p273; + selp.b32 %r633, %r625, 0, %p274; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r634, %r626, %r490; + xor.b32 %r635, %r627, %r491; + xor.b32 %r636, %r628, %r492; + xor.b32 %r637, %r629, %r493; + xor.b32 %r638, %r630, %r494; + xor.b32 %r639, %r631, %r495; + xor.b32 %r640, %r632, %r496; + xor.b32 %r641, %r633, %r497; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r642, %r610, %r53; + mul.lo.s32 %r643, %r611, %r53; + mul.lo.s32 %r644, %r612, %r53; + mul.lo.s32 %r645, %r613, %r53; + mul.lo.s32 %r646, %r614, %r53; + mul.lo.s32 %r647, %r615, %r53; + mul.lo.s32 %r648, %r616, %r53; + mul.lo.s32 %r649, %r617, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r650, %r642, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r651, %r650, %r642; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r652, %r643, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r653, %r652, %r643; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r654, %r644, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r655, %r654, %r644; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r656, %r645, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r657, %r656, %r645; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r658, %r646, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r659, %r658, %r646; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r660, %r647, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r661, %r660, %r647; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r662, %r648, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r663, %r662, %r648; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r664, %r649, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r665, %r664, %r649; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r666, %r610, %r50; + mul.lo.s32 %r667, %r611, %r50; + mul.lo.s32 %r668, %r612, %r50; + mul.lo.s32 %r669, %r613, %r50; + mul.lo.s32 %r670, %r614, %r50; + mul.lo.s32 %r671, %r615, %r50; + mul.lo.s32 %r672, %r616, %r50; + mul.lo.s32 %r673, %r617, %r50; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r674, %r666, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r675, %r674, %r666; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r676, %r667, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r677, %r676, %r667; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r678, %r668, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r679, %r678, %r668; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r680, %r669, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r681, %r680, %r669; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r682, %r670, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r683, %r682, %r670; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r684, %r671, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r685, %r684, %r671; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r686, %r672, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r687, %r686, %r672; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r688, %r673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r689, %r688, %r673; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r690, %r634, %r53; + mul.lo.s32 %r691, %r635, %r53; + mul.lo.s32 %r692, %r636, %r53; + mul.lo.s32 %r693, %r637, %r53; + mul.lo.s32 %r694, %r638, %r53; + mul.lo.s32 %r695, %r639, %r53; + mul.lo.s32 %r696, %r640, %r53; + mul.lo.s32 %r697, %r641, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r698, %r690, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r699, %r698, %r690; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r700, %r691, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r701, %r700, %r691; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r702, %r692, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r703, %r702, %r692; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r704, %r693, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r705, %r704, %r693; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r706, %r694, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r707, %r706, %r694; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r708, %r695, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r709, %r708, %r695; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r710, %r696, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r711, %r710, %r696; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r712, %r697, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r713, %r712, %r697; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + mul.lo.s32 %r714, %r634, %r50; + mul.lo.s32 %r715, %r635, %r50; + mul.lo.s32 %r716, %r636, %r50; + mul.lo.s32 %r717, %r637, %r50; + mul.lo.s32 %r718, %r638, %r50; + mul.lo.s32 %r719, %r639, %r50; + mul.lo.s32 %r720, %r640, %r50; + mul.lo.s32 %r721, %r641, %r50; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r722, %r714, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r723, %r722, %r714; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r724, %r715, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r725, %r724, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r726, %r716, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r727, %r726, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r728, %r717, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r729, %r728, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r730, %r718, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r731, %r730, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r732, %r719, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r733, %r732, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r734, %r720, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r735, %r734, %r720; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + shfl.sync.bfly.b32 %r736, %r721, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + add.s32 %r737, %r736, %r721; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.lt.s32 %p275, %r651, %r675; + setp.lt.s32 %p276, %r653, %r677; + setp.lt.s32 %p277, %r655, %r679; + setp.lt.s32 %p278, %r657, %r681; + setp.lt.s32 %p279, %r659, %r683; + setp.lt.s32 %p280, %r661, %r685; + setp.lt.s32 %p281, %r663, %r687; + setp.lt.s32 %p282, %r665, %r689; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.eq.b32 %p283, %r651, %r675; + setp.eq.b32 %p284, %r653, %r677; + setp.eq.b32 %p285, %r655, %r679; + setp.eq.b32 %p286, %r657, %r681; + setp.eq.b32 %p287, %r659, %r683; + setp.eq.b32 %p288, %r661, %r685; + setp.eq.b32 %p289, %r663, %r687; + setp.eq.b32 %p290, %r665, %r689; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.gt.s32 %p291, %r699, %r723; + setp.gt.s32 %p292, %r701, %r725; + setp.gt.s32 %p293, %r703, %r727; + setp.gt.s32 %p294, %r705, %r729; + setp.gt.s32 %p295, %r707, %r731; + setp.gt.s32 %p296, %r709, %r733; + setp.gt.s32 %p297, %r711, %r735; + setp.gt.s32 %p298, %r713, %r737; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.pred %p299, %p283, %p291; + and.pred %p300, %p284, %p292; + and.pred %p301, %p285, %p293; + and.pred %p302, %p286, %p294; + and.pred %p303, %p287, %p295; + and.pred %p304, %p288, %p296; + and.pred %p305, %p289, %p297; + and.pred %p306, %p290, %p298; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + or.pred %p307, %p275, %p299; + or.pred %p308, %p276, %p300; + or.pred %p309, %p277, %p301; + or.pred %p310, %p278, %p302; + or.pred %p311, %p279, %p303; + or.pred %p312, %p280, %p304; + or.pred %p313, %p281, %p305; + or.pred %p314, %p282, %p306; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r738, %r675, %r651; + xor.b32 %r739, %r677, %r653; + xor.b32 %r740, %r679, %r655; + xor.b32 %r741, %r681, %r657; + xor.b32 %r742, %r683, %r659; + xor.b32 %r743, %r685, %r661; + xor.b32 %r744, %r687, %r663; + xor.b32 %r745, %r689, %r665; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r746, %r738, 0, %p307; + selp.b32 %r747, %r739, 0, %p308; + selp.b32 %r748, %r740, 0, %p309; + selp.b32 %r749, %r741, 0, %p310; + selp.b32 %r750, %r742, 0, %p311; + selp.b32 %r751, %r743, 0, %p312; + selp.b32 %r752, %r744, 0, %p313; + selp.b32 %r753, %r745, 0, %p314; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r754, %r746, %r610; + xor.b32 %r755, %r747, %r611; + xor.b32 %r756, %r748, %r612; + xor.b32 %r757, %r749, %r613; + xor.b32 %r758, %r750, %r614; + xor.b32 %r759, %r751, %r615; + xor.b32 %r760, %r752, %r616; + xor.b32 %r761, %r753, %r617; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r762, %r723, %r699; + xor.b32 %r763, %r725, %r701; + xor.b32 %r764, %r727, %r703; + xor.b32 %r765, %r729, %r705; + xor.b32 %r766, %r731, %r707; + xor.b32 %r767, %r733, %r709; + xor.b32 %r768, %r735, %r711; + xor.b32 %r769, %r737, %r713; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r770, %r762, 0, %p307; + selp.b32 %r771, %r763, 0, %p308; + selp.b32 %r772, %r764, 0, %p309; + selp.b32 %r773, %r765, 0, %p310; + selp.b32 %r774, %r766, 0, %p311; + selp.b32 %r775, %r767, 0, %p312; + selp.b32 %r776, %r768, 0, %p313; + selp.b32 %r777, %r769, 0, %p314; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r778, %r770, %r634; + xor.b32 %r779, %r771, %r635; + xor.b32 %r780, %r772, %r636; + xor.b32 %r781, %r773, %r637; + xor.b32 %r782, %r774, %r638; + xor.b32 %r783, %r775, %r639; + xor.b32 %r784, %r776, %r640; + xor.b32 %r785, %r777, %r641; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.lt.s32 %p315, %r754, %r756; + setp.lt.s32 %p316, %r755, %r757; + setp.lt.s32 %p317, %r758, %r760; + setp.lt.s32 %p318, %r759, %r761; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.eq.b32 %p319, %r754, %r756; + setp.eq.b32 %p320, %r755, %r757; + setp.eq.b32 %p321, %r758, %r760; + setp.eq.b32 %p322, %r759, %r761; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.gt.s32 %p323, %r778, %r780; + setp.gt.s32 %p324, %r779, %r781; + setp.gt.s32 %p325, %r782, %r784; + setp.gt.s32 %p326, %r783, %r785; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + and.pred %p327, %p319, %p323; + and.pred %p328, %p320, %p324; + and.pred %p329, %p321, %p325; + and.pred %p330, %p322, %p326; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + or.pred %p331, %p315, %p327; + or.pred %p332, %p316, %p328; + or.pred %p333, %p317, %p329; + or.pred %p334, %p318, %p330; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r786, %r756, %r754; + xor.b32 %r787, %r757, %r755; + xor.b32 %r788, %r760, %r758; + xor.b32 %r789, %r761, %r759; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r790, %r786, 0, %p331; + selp.b32 %r791, %r787, 0, %p332; + selp.b32 %r792, %r788, 0, %p333; + selp.b32 %r793, %r789, 0, %p334; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r794, %r790, %r754; + xor.b32 %r795, %r791, %r755; + xor.b32 %r796, %r790, %r756; + xor.b32 %r797, %r791, %r757; + xor.b32 %r798, %r792, %r758; + xor.b32 %r799, %r793, %r759; + xor.b32 %r800, %r792, %r760; + xor.b32 %r801, %r793, %r761; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r802, %r780, %r778; + xor.b32 %r803, %r781, %r779; + xor.b32 %r804, %r784, %r782; + xor.b32 %r805, %r785, %r783; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r806, %r802, 0, %p331; + selp.b32 %r807, %r803, 0, %p332; + selp.b32 %r808, %r804, 0, %p333; + selp.b32 %r809, %r805, 0, %p334; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r810, %r806, %r778; + xor.b32 %r811, %r807, %r779; + xor.b32 %r812, %r806, %r780; + xor.b32 %r813, %r807, %r781; + xor.b32 %r814, %r808, %r782; + xor.b32 %r815, %r809, %r783; + xor.b32 %r816, %r808, %r784; + xor.b32 %r817, %r809, %r785; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.lt.s32 %p335, %r796, %r797; + setp.lt.s32 %p336, %r794, %r795; + setp.lt.s32 %p337, %r800, %r801; + setp.lt.s32 %p338, %r798, %r799; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.eq.b32 %p339, %r796, %r797; + setp.eq.b32 %p340, %r795, %r794; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.gt.s32 %p341, %r812, %r813; + setp.gt.s32 %p342, %r810, %r811; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.eq.b32 %p343, %r800, %r801; + setp.eq.b32 %p344, %r799, %r798; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + setp.gt.s32 %p345, %r816, %r817; + setp.gt.s32 %p346, %r814, %r815; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r818, %r813, %r812; + xor.b32 %r819, %r810, %r811; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r820, %r819, 0, %p342; + selp.b32 %r821, %r820, 0, %p340; + selp.b32 %r822, %r819, %r821, %p336; + selp.b32 %r823, %r818, 0, %p341; + selp.b32 %r824, %r823, 0, %p339; + selp.b32 %r825, %r818, %r824, %p335; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r10, %r825, %r812; + xor.b32 %r11, %r825, %r813; + xor.b32 %r8, %r822, %r810; + xor.b32 %r9, %r822, %r811; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r826, %r817, %r816; + xor.b32 %r827, %r814, %r815; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + selp.b32 %r828, %r827, 0, %p346; + selp.b32 %r829, %r828, 0, %p344; + selp.b32 %r830, %r827, %r829, %p338; + selp.b32 %r831, %r826, 0, %p345; + selp.b32 %r832, %r831, 0, %p343; + selp.b32 %r833, %r826, %r832, %p337; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:46:71 ] + xor.b32 %r14, %r833, %r816; + xor.b32 %r15, %r833, %r817; + xor.b32 %r12, %r830, %r814; + xor.b32 %r13, %r830, %r815; +$L__tmp8: + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r834, %r150, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r835, %r834, %r150; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r836, %r151, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r837, %r836, %r151; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r838, %r152, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r839, %r838, %r152; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r840, %r153, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r841, %r840, %r153; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r842, %r154, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r843, %r842, %r154; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r844, %r155, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r845, %r844, %r155; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r846, %r156, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r847, %r846, %r156; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r848, %r157, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r849, %r848, %r157; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r850, %r158, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r851, %r850, %r158; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r852, %r159, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r853, %r852, %r159; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r854, %r160, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r855, %r854, %r160; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r856, %r161, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r857, %r856, %r161; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r858, %r162, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r859, %r858, %r162; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r860, %r163, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r861, %r860, %r163; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r862, %r164, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r863, %r862, %r164; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r864, %r165, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r865, %r864, %r165; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.ge.s32 %p347, %r835, %r851; + setp.ne.b32 %p348, %r835, %r851; + setp.ge.s32 %p349, %r837, %r853; + setp.ne.b32 %p350, %r837, %r853; + setp.ge.s32 %p351, %r839, %r855; + setp.ne.b32 %p352, %r839, %r855; + setp.ge.s32 %p353, %r841, %r857; + setp.ne.b32 %p354, %r841, %r857; + setp.ge.s32 %p355, %r843, %r859; + setp.ne.b32 %p356, %r843, %r859; + setp.ge.s32 %p357, %r845, %r861; + setp.ne.b32 %p358, %r845, %r861; + setp.ge.s32 %p359, %r847, %r863; + setp.ne.b32 %p360, %r847, %r863; + setp.ge.s32 %p361, %r849, %r865; + setp.ne.b32 %p362, %r849, %r865; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r866, %r851, %r835; + xor.b32 %r867, %r853, %r837; + xor.b32 %r868, %r855, %r839; + xor.b32 %r869, %r857, %r841; + xor.b32 %r870, %r859, %r843; + xor.b32 %r871, %r861, %r845; + xor.b32 %r872, %r863, %r847; + xor.b32 %r873, %r865, %r849; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r874, %r170, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r875, %r874, %r170; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r876, %r171, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r877, %r876, %r171; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r878, %r172, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r879, %r878, %r172; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r880, %r173, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r881, %r880, %r173; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r882, %r182, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r883, %r882, %r182; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r884, %r183, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r885, %r884, %r183; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r886, %r184, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r887, %r886, %r184; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r888, %r185, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r889, %r888, %r185; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r890, %r174, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r891, %r890, %r174; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r892, %r175, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r893, %r892, %r175; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r894, %r176, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r895, %r894, %r176; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r896, %r177, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r897, %r896, %r177; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r898, %r186, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r899, %r898, %r186; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r900, %r187, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r901, %r900, %r187; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r902, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r903, %r902, %r188; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r904, %r189, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r905, %r904, %r189; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.le.s32 %p363, %r875, %r891; + or.pred %p364, %p348, %p363; + and.pred %p365, %p347, %p364; + xor.pred %p366, %p365, %p6; + setp.le.s32 %p367, %r877, %r893; + or.pred %p368, %p350, %p367; + and.pred %p369, %p349, %p368; + xor.pred %p370, %p369, %p6; + setp.le.s32 %p371, %r879, %r895; + or.pred %p372, %p352, %p371; + and.pred %p373, %p351, %p372; + xor.pred %p374, %p373, %p6; + setp.le.s32 %p375, %r881, %r897; + or.pred %p376, %p354, %p375; + and.pred %p377, %p353, %p376; + xor.pred %p378, %p377, %p6; + setp.le.s32 %p379, %r883, %r899; + or.pred %p380, %p356, %p379; + and.pred %p381, %p355, %p380; + xor.pred %p382, %p381, %p6; + setp.le.s32 %p383, %r885, %r901; + or.pred %p384, %p358, %p383; + and.pred %p385, %p357, %p384; + xor.pred %p386, %p385, %p6; + setp.le.s32 %p387, %r887, %r903; + or.pred %p388, %p360, %p387; + and.pred %p389, %p359, %p388; + xor.pred %p390, %p389, %p6; + setp.le.s32 %p391, %r889, %r905; + or.pred %p392, %p362, %p391; + and.pred %p393, %p361, %p392; + xor.pred %p394, %p393, %p6; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r906, 0, %r866, %p366; + selp.b32 %r907, 0, %r867, %p370; + selp.b32 %r908, 0, %r868, %p374; + selp.b32 %r909, 0, %r869, %p378; + selp.b32 %r910, 0, %r870, %p382; + selp.b32 %r911, 0, %r871, %p386; + selp.b32 %r912, 0, %r872, %p390; + selp.b32 %r913, 0, %r873, %p394; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r914, %r906, %r134; + xor.b32 %r915, %r907, %r135; + xor.b32 %r916, %r908, %r136; + xor.b32 %r917, %r909, %r137; + xor.b32 %r918, %r910, %r138; + xor.b32 %r919, %r911, %r139; + xor.b32 %r920, %r912, %r140; + xor.b32 %r921, %r913, %r141; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r922, %r891, %r875; + xor.b32 %r923, %r893, %r877; + xor.b32 %r924, %r895, %r879; + xor.b32 %r925, %r897, %r881; + xor.b32 %r926, %r899, %r883; + xor.b32 %r927, %r901, %r885; + xor.b32 %r928, %r903, %r887; + xor.b32 %r929, %r905, %r889; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r930, 0, %r925, %p378; + selp.b32 %r931, 0, %r923, %p370; + selp.b32 %r932, 0, %r924, %p374; + selp.b32 %r933, 0, %r922, %p366; + selp.b32 %r934, 0, %r929, %p394; + selp.b32 %r935, 0, %r927, %p386; + selp.b32 %r936, 0, %r928, %p390; + selp.b32 %r937, 0, %r926, %p382; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r938, %r933, %r169; + xor.b32 %r939, %r932, %r167; + xor.b32 %r940, %r931, %r168; + xor.b32 %r941, %r930, %r166; + xor.b32 %r942, %r937, %r181; + xor.b32 %r943, %r936, %r179; + xor.b32 %r944, %r935, %r180; + xor.b32 %r945, %r934, %r178; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.ge.s32 %p395, %r914, %r916; + setp.ne.b32 %p396, %r914, %r916; + setp.le.s32 %p397, %r940, %r941; + setp.le.s32 %p398, %r938, %r939; + or.pred %p399, %p396, %p398; + and.pred %p400, %p395, %p399; + setp.ge.s32 %p401, %r915, %r917; + setp.ne.b32 %p402, %r915, %r917; + or.pred %p403, %p402, %p397; + and.pred %p404, %p401, %p403; + xor.pred %p405, %p404, %p6; + xor.pred %p406, %p400, %p6; + setp.ge.s32 %p407, %r918, %r920; + setp.ne.b32 %p408, %r918, %r920; + setp.le.s32 %p409, %r944, %r945; + setp.le.s32 %p410, %r942, %r943; + or.pred %p411, %p408, %p410; + and.pred %p412, %p407, %p411; + setp.ge.s32 %p413, %r919, %r921; + setp.ne.b32 %p414, %r919, %r921; + or.pred %p415, %p414, %p409; + and.pred %p416, %p413, %p415; + xor.pred %p417, %p416, %p6; + xor.pred %p418, %p412, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r946, %r916, %r914; + xor.b32 %r947, %r917, %r915; + xor.b32 %r948, %r920, %r918; + xor.b32 %r949, %r921, %r919; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r950, 0, %r946, %p406; + selp.b32 %r951, 0, %r947, %p405; + selp.b32 %r952, 0, %r948, %p418; + selp.b32 %r953, 0, %r949, %p417; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r954, %r950, %r914; + xor.b32 %r955, %r951, %r915; + xor.b32 %r956, %r950, %r916; + xor.b32 %r957, %r951, %r917; + xor.b32 %r958, %r952, %r918; + xor.b32 %r959, %r953, %r919; + xor.b32 %r960, %r952, %r920; + xor.b32 %r961, %r953, %r921; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r962, %r939, %r938; + xor.b32 %r963, %r941, %r940; + xor.b32 %r964, %r943, %r942; + xor.b32 %r965, %r945, %r944; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r966, 0, %r963, %p405; + selp.b32 %r967, 0, %r962, %p406; + selp.b32 %r968, 0, %r965, %p417; + selp.b32 %r969, 0, %r964, %p418; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r970, %r967, %r939; + xor.b32 %r971, %r966, %r940; + xor.b32 %r972, %r966, %r941; + xor.b32 %r973, %r967, %r938; + xor.b32 %r974, %r969, %r943; + xor.b32 %r975, %r968, %r944; + xor.b32 %r976, %r968, %r945; + xor.b32 %r977, %r969, %r942; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.ge.s32 %p419, %r954, %r955; + setp.ne.b32 %p420, %r954, %r955; + setp.le.s32 %p421, %r973, %r971; + or.pred %p422, %p420, %p421; + and.pred %p423, %p419, %p422; + xor.pred %p424, %p423, %p6; + setp.ge.s32 %p425, %r956, %r957; + setp.ne.b32 %p426, %r956, %r957; + setp.le.s32 %p427, %r970, %r972; + or.pred %p428, %p426, %p427; + and.pred %p429, %p425, %p428; + xor.pred %p430, %p429, %p6; + setp.ge.s32 %p431, %r958, %r959; + setp.ne.b32 %p432, %r958, %r959; + setp.le.s32 %p433, %r977, %r975; + or.pred %p434, %p432, %p433; + and.pred %p435, %p431, %p434; + xor.pred %p436, %p435, %p6; + setp.ge.s32 %p437, %r960, %r961; + setp.ne.b32 %p438, %r960, %r961; + setp.le.s32 %p439, %r974, %r976; + or.pred %p440, %p438, %p439; + and.pred %p441, %p437, %p440; + xor.pred %p442, %p441, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r978, %r955, %r954; + xor.b32 %r979, %r957, %r956; + xor.b32 %r980, %r959, %r958; + xor.b32 %r981, %r961, %r960; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r982, 0, %r978, %p424; + selp.b32 %r983, 0, %r979, %p430; + selp.b32 %r984, 0, %r980, %p436; + selp.b32 %r985, 0, %r981, %p442; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r986, %r983, %r956; + xor.b32 %r987, %r982, %r954; + xor.b32 %r988, %r983, %r957; + xor.b32 %r989, %r982, %r955; + xor.b32 %r990, %r985, %r960; + xor.b32 %r991, %r984, %r958; + xor.b32 %r992, %r985, %r961; + xor.b32 %r993, %r984, %r959; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r994, %r971, %r973; + xor.b32 %r995, %r972, %r970; + xor.b32 %r996, %r975, %r977; + xor.b32 %r997, %r976, %r974; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r998, 0, %r995, %p430; + selp.b32 %r999, 0, %r994, %p424; + selp.b32 %r1000, 0, %r997, %p442; + selp.b32 %r1001, 0, %r996, %p436; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r1006, %r999, %r971; + xor.b32 %r1007, %r998, %r970; + xor.b32 %r1008, %r999, %r973; + xor.b32 %r1009, %r998, %r972; + xor.b32 %r1014, %r1001, %r975; + xor.b32 %r1015, %r1000, %r974; + xor.b32 %r1016, %r1001, %r977; + xor.b32 %r1017, %r1000, %r976; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1018, %r987, %r54; + mul.lo.s32 %r1019, %r989, %r54; + mul.lo.s32 %r1020, %r986, %r54; + mul.lo.s32 %r1021, %r988, %r54; + mul.lo.s32 %r1022, %r991, %r54; + mul.lo.s32 %r1023, %r993, %r54; + mul.lo.s32 %r1024, %r990, %r54; + mul.lo.s32 %r1025, %r992, %r54; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1026, %r1018, 2, 31, -1; + shfl.sync.bfly.b32 %r1027, %r1019, 2, 31, -1; + shfl.sync.bfly.b32 %r1028, %r1020, 2, 31, -1; + shfl.sync.bfly.b32 %r1029, %r1021, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1030, %r1019, %r1027; + add.s32 %r1031, %r1021, %r1029; + add.s32 %r1032, %r1018, %r1026; + add.s32 %r1033, %r1020, %r1028; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1034, %r1022, 2, 31, -1; + shfl.sync.bfly.b32 %r1035, %r1023, 2, 31, -1; + shfl.sync.bfly.b32 %r1036, %r1024, 2, 31, -1; + shfl.sync.bfly.b32 %r1037, %r1025, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1038, %r1023, %r1035; + add.s32 %r1039, %r1025, %r1037; + add.s32 %r1040, %r1022, %r1034; + add.s32 %r1041, %r1024, %r1036; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1042, %r987, %r52; + mul.lo.s32 %r1043, %r989, %r52; + mul.lo.s32 %r1044, %r986, %r52; + mul.lo.s32 %r1045, %r988, %r52; + mul.lo.s32 %r1046, %r991, %r52; + mul.lo.s32 %r1047, %r993, %r52; + mul.lo.s32 %r1048, %r990, %r52; + mul.lo.s32 %r1049, %r992, %r52; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1050, %r1042, 2, 31, -1; + shfl.sync.bfly.b32 %r1051, %r1043, 2, 31, -1; + shfl.sync.bfly.b32 %r1052, %r1044, 2, 31, -1; + shfl.sync.bfly.b32 %r1053, %r1045, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1054, %r1043, %r1051; + add.s32 %r1055, %r1045, %r1053; + add.s32 %r1056, %r1042, %r1050; + add.s32 %r1057, %r1044, %r1052; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1058, %r1046, 2, 31, -1; + shfl.sync.bfly.b32 %r1059, %r1047, 2, 31, -1; + shfl.sync.bfly.b32 %r1060, %r1048, 2, 31, -1; + shfl.sync.bfly.b32 %r1061, %r1049, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1062, %r1047, %r1059; + add.s32 %r1063, %r1049, %r1061; + add.s32 %r1064, %r1046, %r1058; + add.s32 %r1065, %r1048, %r1060; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1066, %r1009, %r54; + mul.lo.s32 %r1067, %r1007, %r54; + mul.lo.s32 %r1068, %r1006, %r54; + mul.lo.s32 %r1069, %r1008, %r54; + mul.lo.s32 %r1070, %r1017, %r54; + mul.lo.s32 %r1071, %r1015, %r54; + mul.lo.s32 %r1072, %r1014, %r54; + mul.lo.s32 %r1073, %r1016, %r54; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1074, %r1069, 2, 31, -1; + shfl.sync.bfly.b32 %r1075, %r1068, 2, 31, -1; + shfl.sync.bfly.b32 %r1076, %r1067, 2, 31, -1; + shfl.sync.bfly.b32 %r1077, %r1066, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1078, %r1076, %r1067; + add.s32 %r1079, %r1074, %r1069; + add.s32 %r1080, %r1077, %r1066; + add.s32 %r1081, %r1075, %r1068; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1082, %r1073, 2, 31, -1; + shfl.sync.bfly.b32 %r1083, %r1072, 2, 31, -1; + shfl.sync.bfly.b32 %r1084, %r1071, 2, 31, -1; + shfl.sync.bfly.b32 %r1085, %r1070, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1086, %r1084, %r1071; + add.s32 %r1087, %r1082, %r1073; + add.s32 %r1088, %r1085, %r1070; + add.s32 %r1089, %r1083, %r1072; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1090, %r1009, %r52; + mul.lo.s32 %r1091, %r1007, %r52; + mul.lo.s32 %r1092, %r1006, %r52; + mul.lo.s32 %r1093, %r1008, %r52; + mul.lo.s32 %r1094, %r1017, %r52; + mul.lo.s32 %r1095, %r1015, %r52; + mul.lo.s32 %r1096, %r1014, %r52; + mul.lo.s32 %r1097, %r1016, %r52; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1098, %r1093, 2, 31, -1; + shfl.sync.bfly.b32 %r1099, %r1092, 2, 31, -1; + shfl.sync.bfly.b32 %r1100, %r1091, 2, 31, -1; + shfl.sync.bfly.b32 %r1101, %r1090, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1102, %r1100, %r1091; + add.s32 %r1103, %r1098, %r1093; + add.s32 %r1104, %r1101, %r1090; + add.s32 %r1105, %r1099, %r1092; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1106, %r1097, 2, 31, -1; + shfl.sync.bfly.b32 %r1107, %r1096, 2, 31, -1; + shfl.sync.bfly.b32 %r1108, %r1095, 2, 31, -1; + shfl.sync.bfly.b32 %r1109, %r1094, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + add.s32 %r1110, %r1108, %r1095; + add.s32 %r1111, %r1106, %r1097; + add.s32 %r1112, %r1109, %r1094; + add.s32 %r1113, %r1107, %r1096; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.lt.s32 %p443, %r1033, %r1057; + setp.lt.s32 %p444, %r1032, %r1056; + setp.lt.s32 %p445, %r1031, %r1055; + setp.lt.s32 %p446, %r1030, %r1054; + setp.lt.s32 %p447, %r1041, %r1065; + setp.lt.s32 %p448, %r1040, %r1064; + setp.lt.s32 %p449, %r1039, %r1063; + setp.lt.s32 %p450, %r1038, %r1062; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.eq.b32 %p451, %r1030, %r1054; + setp.eq.b32 %p452, %r1031, %r1055; + setp.eq.b32 %p453, %r1032, %r1056; + setp.eq.b32 %p454, %r1033, %r1057; + setp.eq.b32 %p455, %r1038, %r1062; + setp.eq.b32 %p456, %r1039, %r1063; + setp.eq.b32 %p457, %r1040, %r1064; + setp.eq.b32 %p458, %r1041, %r1065; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + setp.gt.s32 %p459, %r1081, %r1105; + setp.gt.s32 %p460, %r1080, %r1104; + setp.gt.s32 %p461, %r1079, %r1103; + setp.gt.s32 %p462, %r1078, %r1102; + setp.gt.s32 %p463, %r1089, %r1113; + setp.gt.s32 %p464, %r1088, %r1112; + setp.gt.s32 %p465, %r1087, %r1111; + setp.gt.s32 %p466, %r1086, %r1110; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + and.pred %p467, %p454, %p462; + and.pred %p468, %p453, %p461; + and.pred %p469, %p452, %p460; + and.pred %p470, %p451, %p459; + and.pred %p471, %p458, %p466; + and.pred %p472, %p457, %p465; + and.pred %p473, %p456, %p464; + and.pred %p474, %p455, %p463; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + or.pred %p475, %p446, %p470; + or.pred %p476, %p445, %p469; + or.pred %p477, %p444, %p468; + or.pred %p478, %p443, %p467; + or.pred %p479, %p450, %p474; + or.pred %p480, %p449, %p473; + or.pred %p481, %p448, %p472; + or.pred %p482, %p447, %p471; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r1114, %r1030, %r1054; + xor.b32 %r1115, %r1031, %r1055; + xor.b32 %r1116, %r1032, %r1056; + xor.b32 %r1117, %r1033, %r1057; + xor.b32 %r1118, %r1038, %r1062; + xor.b32 %r1119, %r1039, %r1063; + xor.b32 %r1120, %r1040, %r1064; + xor.b32 %r1121, %r1041, %r1065; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r1122, %r1117, 0, %p478; + selp.b32 %r1123, %r1116, 0, %p477; + selp.b32 %r1124, %r1115, 0, %p476; + selp.b32 %r1125, %r1114, 0, %p475; + selp.b32 %r1126, %r1121, 0, %p482; + selp.b32 %r1127, %r1120, 0, %p481; + selp.b32 %r1128, %r1119, 0, %p480; + selp.b32 %r1129, %r1118, 0, %p479; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r1130, %r1125, %r989; + xor.b32 %r1131, %r1123, %r987; + xor.b32 %r1132, %r1122, %r986; + xor.b32 %r1133, %r1124, %r988; + xor.b32 %r1134, %r1129, %r993; + xor.b32 %r1135, %r1127, %r991; + xor.b32 %r1136, %r1126, %r990; + xor.b32 %r1137, %r1128, %r992; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r1138, %r1105, %r1081; + xor.b32 %r1139, %r1102, %r1078; + xor.b32 %r1140, %r1104, %r1080; + xor.b32 %r1141, %r1103, %r1079; + xor.b32 %r1142, %r1113, %r1089; + xor.b32 %r1143, %r1110, %r1086; + xor.b32 %r1144, %r1112, %r1088; + xor.b32 %r1145, %r1111, %r1087; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + selp.b32 %r1146, %r1141, 0, %p477; + selp.b32 %r1147, %r1140, 0, %p476; + selp.b32 %r1148, %r1139, 0, %p478; + selp.b32 %r1149, %r1138, 0, %p475; + selp.b32 %r1150, %r1145, 0, %p481; + selp.b32 %r1151, %r1144, 0, %p480; + selp.b32 %r1152, %r1143, 0, %p482; + selp.b32 %r1153, %r1142, 0, %p479; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + xor.b32 %r1158, %r1149, %r1006; + xor.b32 %r1159, %r1146, %r1008; + xor.b32 %r1160, %r1147, %r1009; + xor.b32 %r1161, %r1148, %r1007; + xor.b32 %r1166, %r1153, %r1014; + xor.b32 %r1167, %r1150, %r1016; + xor.b32 %r1168, %r1151, %r1017; + xor.b32 %r1169, %r1152, %r1015; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1170, %r1133, %r53; + mul.lo.s32 %r1171, %r1132, %r53; + mul.lo.s32 %r1172, %r1130, %r53; + mul.lo.s32 %r1173, %r1131, %r53; + mul.lo.s32 %r1174, %r1137, %r53; + mul.lo.s32 %r1175, %r1136, %r53; + mul.lo.s32 %r1176, %r1134, %r53; + mul.lo.s32 %r1177, %r1135, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1178, %r1173, 1, 31, -1; + shfl.sync.bfly.b32 %r1179, %r1172, 1, 31, -1; + shfl.sync.bfly.b32 %r1180, %r1171, 1, 31, -1; + shfl.sync.bfly.b32 %r1181, %r1170, 1, 31, -1; + shfl.sync.bfly.b32 %r1186, %r1177, 1, 31, -1; + shfl.sync.bfly.b32 %r1187, %r1176, 1, 31, -1; + shfl.sync.bfly.b32 %r1188, %r1175, 1, 31, -1; + shfl.sync.bfly.b32 %r1189, %r1174, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1194, %r1133, %r50; + mul.lo.s32 %r1195, %r1132, %r50; + mul.lo.s32 %r1196, %r1130, %r50; + mul.lo.s32 %r1197, %r1131, %r50; + mul.lo.s32 %r1198, %r1137, %r50; + mul.lo.s32 %r1199, %r1136, %r50; + mul.lo.s32 %r1200, %r1134, %r50; + mul.lo.s32 %r1201, %r1135, %r50; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1202, %r1197, 1, 31, -1; + shfl.sync.bfly.b32 %r1203, %r1196, 1, 31, -1; + shfl.sync.bfly.b32 %r1204, %r1195, 1, 31, -1; + shfl.sync.bfly.b32 %r1205, %r1194, 1, 31, -1; + shfl.sync.bfly.b32 %r1210, %r1201, 1, 31, -1; + shfl.sync.bfly.b32 %r1211, %r1200, 1, 31, -1; + shfl.sync.bfly.b32 %r1212, %r1199, 1, 31, -1; + shfl.sync.bfly.b32 %r1213, %r1198, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1218, %r1159, %r53; + mul.lo.s32 %r1219, %r1158, %r53; + mul.lo.s32 %r1220, %r1161, %r53; + mul.lo.s32 %r1221, %r1160, %r53; + mul.lo.s32 %r1222, %r1167, %r53; + mul.lo.s32 %r1223, %r1166, %r53; + mul.lo.s32 %r1224, %r1169, %r53; + mul.lo.s32 %r1225, %r1168, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1226, %r1218, 1, 31, -1; + shfl.sync.bfly.b32 %r1227, %r1219, 1, 31, -1; + shfl.sync.bfly.b32 %r1228, %r1220, 1, 31, -1; + shfl.sync.bfly.b32 %r1229, %r1221, 1, 31, -1; + shfl.sync.bfly.b32 %r1234, %r1222, 1, 31, -1; + shfl.sync.bfly.b32 %r1235, %r1223, 1, 31, -1; + shfl.sync.bfly.b32 %r1236, %r1224, 1, 31, -1; + shfl.sync.bfly.b32 %r1237, %r1225, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + mul.lo.s32 %r1242, %r1159, %r50; + mul.lo.s32 %r1243, %r1158, %r50; + mul.lo.s32 %r1244, %r1161, %r50; + mul.lo.s32 %r1245, %r1160, %r50; + mul.lo.s32 %r1246, %r1167, %r50; + mul.lo.s32 %r1247, %r1166, %r50; + mul.lo.s32 %r1248, %r1169, %r50; + mul.lo.s32 %r1249, %r1168, %r50; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:51:71 ] + shfl.sync.bfly.b32 %r1250, %r1242, 1, 31, -1; + shfl.sync.bfly.b32 %r1251, %r1243, 1, 31, -1; + shfl.sync.bfly.b32 %r1252, %r1244, 1, 31, -1; + shfl.sync.bfly.b32 %r1253, %r1245, 1, 31, -1; + shfl.sync.bfly.b32 %r1258, %r1246, 1, 31, -1; + shfl.sync.bfly.b32 %r1259, %r1247, 1, 31, -1; + shfl.sync.bfly.b32 %r1260, %r1248, 1, 31, -1; + shfl.sync.bfly.b32 %r1261, %r1249, 1, 31, -1; +$L__tmp9: + .loc 1 54 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:54:35 + and.pred %p555, %p4, %p110; + selp.b16 %rs12, 1, 0, %p555; + shl.b16 %rs13, %rs12, 2; + and.pred %p556, %p4, %p109; + selp.b16 %rs14, -1, 0, %p556; + shl.b16 %rs15, %rs14, 3; + or.b16 %rs16, %rs15, %rs13; + and.pred %p557, %p4, %p112; + selp.b16 %rs17, 1, 0, %p557; + and.pred %p558, %p4, %p111; + selp.b16 %rs18, -1, 0, %p558; + shl.b16 %rs19, %rs18, 1; + or.b16 %rs20, %rs17, %rs19; + and.b16 %rs21, %rs20, 3; + or.b16 %rs22, %rs21, %rs16; +$L__tmp10: + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + and.b16 %rs23, %rs11, 15; + cvt.u32.u16 %r1406, %rs23; + popc.b32 %r1407, %r1406; + cvt.u64.u32 %rd43, %r1407; + and.b16 %rs24, %rs22, 15; + cvt.u32.u16 %r1408, %rs24; + popc.b32 %r1409, %r1408; + cvt.u64.u32 %rd44, %r1409; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + shfl.sync.bfly.b32 %r1410, %r1407, 2, 31, -1; + mov.b32 %r1411, 0; + shfl.sync.bfly.b32 %r1412, %r1411, 2, 31, -1; + cvt.u64.u32 %rd45, %r1410; + cvt.u64.u32 %rd46, %r1412; + shl.b64 %rd47, %rd46, 32; + or.b64 %rd48, %rd45, %rd47; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + add.s64 %rd49, %rd43, %rd48; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + mov.b64 {_, %r1413}, %rd49; + cvt.u32.u64 %r1414, %rd49; + shfl.sync.bfly.b32 %r1415, %r1414, 1, 31, -1; + shfl.sync.bfly.b32 %r1416, %r1413, 1, 31, -1; + cvt.u64.u32 %rd50, %r1415; + cvt.u64.u32 %rd51, %r1416; + shl.b64 %rd52, %rd51, 32; + or.b64 %rd53, %rd50, %rd52; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + add.s64 %rd54, %rd49, %rd53; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + shfl.sync.bfly.b32 %r1417, %r1409, 2, 31, -1; + shfl.sync.bfly.b32 %r1418, %r1411, 2, 31, -1; + cvt.u64.u32 %rd55, %r1417; + cvt.u64.u32 %rd56, %r1418; + shl.b64 %rd57, %rd56, 32; + or.b64 %rd58, %rd55, %rd57; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + add.s64 %rd59, %rd44, %rd58; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + mov.b64 {_, %r1419}, %rd59; + cvt.u32.u64 %r1420, %rd59; + shfl.sync.bfly.b32 %r1421, %r1420, 1, 31, -1; + shfl.sync.bfly.b32 %r1422, %r1419, 1, 31, -1; + cvt.u64.u32 %rd60, %r1421; + cvt.u64.u32 %rd61, %r1422; + shl.b64 %rd62, %rd61, 32; + or.b64 %rd63, %rd60, %rd62; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:55:26 ] + add.s64 %rd64, %rd59, %rd63; +$L__tmp11: + .loc 1 60 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:60:21 + shl.b32 %r1423, %r3, 1; + mov.b32 %r1424, global_smem; + add.s32 %r1425, %r1424, %r1423; + st.shared.b64 [%r1425], %rd54; + st.shared.b64 [%r1425+512], %rd64; + bar.sync 0; + shl.b32 %r1426, %r4, 3; + add.s32 %r1427, %r1424, %r1426; + ld.shared.b64 %rd3, [%r1427]; + .loc 1 58 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:58:35 + and.pred %p559, %p2, %p10; + selp.b16 %rs25, 1, 0, %p559; + shl.b16 %rs26, %rs25, 2; + and.pred %p560, %p2, %p12; + selp.b16 %rs27, -1, 0, %p560; + shl.b16 %rs28, %rs27, 3; + or.b16 %rs29, %rs28, %rs26; + and.pred %p561, %p2, %p8; + selp.b16 %rs30, 1, 0, %p561; + and.pred %p562, %p2, %p9; + selp.b16 %rs31, -1, 0, %p562; + shl.b16 %rs32, %rs31, 1; + or.b16 %rs33, %rs30, %rs32; + and.b16 %rs34, %rs33, 3; + or.b16 %rs35, %rs34, %rs29; + and.pred %p563, %p4, %p16; + selp.b16 %rs36, 1, 0, %p563; + shl.b16 %rs37, %rs36, 2; + and.pred %p564, %p4, %p18; + selp.b16 %rs38, -1, 0, %p564; + shl.b16 %rs39, %rs38, 3; + or.b16 %rs40, %rs39, %rs37; + and.pred %p565, %p4, %p14; + selp.b16 %rs41, 1, 0, %p565; + and.pred %p566, %p4, %p15; + selp.b16 %rs42, -1, 0, %p566; + shl.b16 %rs43, %rs42, 1; + or.b16 %rs44, %rs41, %rs43; + and.b16 %rs45, %rs44, 3; + or.b16 %rs46, %rs45, %rs40; +$L__tmp12: + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + and.b16 %rs47, %rs35, 15; + cvt.u32.u16 %r1428, %rs47; + popc.b32 %r1429, %r1428; + cvt.u64.u32 %rd65, %r1429; + and.b16 %rs48, %rs46, 15; + cvt.u32.u16 %r1430, %rs48; + popc.b32 %r1431, %r1430; + cvt.u64.u32 %rd66, %r1431; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + shfl.sync.bfly.b32 %r1432, %r1429, 2, 31, -1; + shfl.sync.bfly.b32 %r1433, %r1411, 2, 31, -1; + cvt.u64.u32 %rd67, %r1432; + cvt.u64.u32 %rd68, %r1433; + shl.b64 %rd69, %rd68, 32; + or.b64 %rd70, %rd67, %rd69; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + add.s64 %rd71, %rd65, %rd70; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + mov.b64 {_, %r1434}, %rd71; + cvt.u32.u64 %r1435, %rd71; + shfl.sync.bfly.b32 %r1436, %r1435, 1, 31, -1; + shfl.sync.bfly.b32 %r1437, %r1434, 1, 31, -1; + cvt.u64.u32 %rd72, %r1436; + cvt.u64.u32 %rd73, %r1437; + shl.b64 %rd74, %rd73, 32; + or.b64 %rd75, %rd72, %rd74; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + add.s64 %rd4, %rd71, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + shfl.sync.bfly.b32 %r1438, %r1431, 2, 31, -1; + shfl.sync.bfly.b32 %r1439, %r1411, 2, 31, -1; + cvt.u64.u32 %rd76, %r1438; + cvt.u64.u32 %rd77, %r1439; + shl.b64 %rd78, %rd77, 32; + or.b64 %rd79, %rd76, %rd78; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + add.s64 %rd80, %rd66, %rd79; + .loc 3 291 36 // standard.py:291:36 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + mov.b64 {_, %r1440}, %rd80; + cvt.u32.u64 %r1441, %rd80; + shfl.sync.bfly.b32 %r1442, %r1441, 1, 31, -1; + shfl.sync.bfly.b32 %r1443, %r1440, 1, 31, -1; + cvt.u64.u32 %rd81, %r1442; + cvt.u64.u32 %rd82, %r1443; + shl.b64 %rd83, %rd82, 32; + or.b64 %rd84, %rd81, %rd83; + .loc 3 261 15 // standard.py:261:15 @[ cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:59:26 ] + add.s64 %rd5, %rd80, %rd84; +$L__tmp13: + .loc 1 61 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:61:21 + bar.sync 0; + st.shared.b64 [%r1425], %rd4; + st.shared.b64 [%r1425+512], %rd5; + bar.sync 0; + .loc 1 60 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:60:21 + cvt.u32.u64 %r1444, %rd54; + cvt.u32.u64 %r1445, %rd64; + .loc 1 64 19 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:64:19 + setp.lt.s32 %p567, %r26, %r1444; + setp.lt.s32 %p568, %r27, %r1444; + setp.lt.s32 %p569, %r24, %r1444; + setp.lt.s32 %p570, %r25, %r1444; + .loc 1 66 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:66:35 + selp.b32 %r1446, %r9, 16, %p570; + selp.b32 %r1447, %r8, 16, %p569; + selp.b32 %r1448, %r11, 16, %p568; + selp.b32 %r1449, %r10, 16, %p567; + .loc 1 68 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:68:20 + add.s32 %r1450, %r1449, 17; + add.s32 %r1451, %r1448, 17; + add.s32 %r1452, %r1447, 17; + add.s32 %r1453, %r1446, 17; + .loc 1 64 19 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:64:19 + setp.lt.s32 %p571, %r26, %r1445; + setp.lt.s32 %p572, %r27, %r1445; + setp.lt.s32 %p573, %r24, %r1445; + setp.lt.s32 %p574, %r25, %r1445; + .loc 1 66 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:66:35 + selp.b32 %r1454, %r13, 16, %p574; + selp.b32 %r1455, %r12, 16, %p573; + selp.b32 %r1456, %r15, 16, %p572; + selp.b32 %r1457, %r14, 16, %p571; + .loc 1 68 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:68:20 + add.s32 %r1458, %r1457, 17; + add.s32 %r1459, %r1456, 17; + add.s32 %r1460, %r1455, 17; + add.s32 %r1461, %r1454, 17; + .loc 1 69 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:69:20 + setp.lt.s32 %p575, %r1449, 0; + setp.lt.s32 %p576, %r1448, 0; + setp.lt.s32 %p577, %r1447, 0; + setp.lt.s32 %p578, %r1446, 0; + .loc 1 70 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:70:35 + selp.b32 %r29, %r1453, %r1446, %p578; + selp.b32 %r28, %r1452, %r1447, %p577; + selp.b32 %r31, %r1451, %r1448, %p576; + selp.b32 %r30, %r1450, %r1449, %p575; + .loc 1 71 38 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:71:38 + setp.gt.u32 %p579, %r30, 16; + selp.b16 %rs49, 1, 0, %p579; + shl.b16 %rs50, %rs49, 2; + setp.gt.u32 %p580, %r31, 16; + selp.b16 %rs51, -1, 0, %p580; + shl.b16 %rs52, %rs51, 3; + or.b16 %rs53, %rs52, %rs50; + setp.gt.u32 %p581, %r28, 16; + selp.b16 %rs54, 1, 0, %p581; + setp.gt.u32 %p582, %r29, 16; + selp.b16 %rs55, -1, 0, %p582; + shl.b16 %rs56, %rs55, 1; + or.b16 %rs57, %rs54, %rs56; + and.b16 %rs58, %rs57, 3; + or.b16 %rs59, %rs58, %rs53; + .loc 1 69 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:69:20 + setp.lt.s32 %p583, %r1457, 0; + setp.lt.s32 %p584, %r1456, 0; + setp.lt.s32 %p585, %r1455, 0; + setp.lt.s32 %p586, %r1454, 0; + .loc 1 70 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:70:35 + selp.b32 %r33, %r1461, %r1454, %p586; + selp.b32 %r32, %r1460, %r1455, %p585; + selp.b32 %r35, %r1459, %r1456, %p584; + selp.b32 %r34, %r1458, %r1457, %p583; + .loc 1 71 38 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:71:38 + setp.gt.u32 %p587, %r34, 16; + selp.b16 %rs60, 1, 0, %p587; + shl.b16 %rs61, %rs60, 2; + setp.gt.u32 %p588, %r35, 16; + selp.b16 %rs62, -1, 0, %p588; + shl.b16 %rs63, %rs62, 3; + or.b16 %rs64, %rs63, %rs61; + setp.gt.u32 %p589, %r32, 16; + selp.b16 %rs65, 1, 0, %p589; + setp.gt.u32 %p590, %r33, 16; + selp.b16 %rs66, -1, 0, %p590; + shl.b16 %rs67, %rs66, 1; + or.b16 %rs68, %rs65, %rs67; + and.b16 %rs69, %rs68, 3; + or.b16 %rs70, %rs69, %rs64; + .loc 1 71 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:71:63 + and.b16 %rs71, %rs59, 15; + setp.ne.b16 %p591, %rs71, 0; + and.pred %p592, %p2, %p591; + and.b16 %rs72, %rs70, 15; + setp.ne.b16 %p593, %rs72, 0; + and.pred %p594, %p4, %p593; + or.pred %p595, %p592, %p594; + not.pred %p596, %p595; + @%p596 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:0 + xor.b32 %r1002, %r967, %r999; + xor.b32 %r1003, %r966, %r998; + xor.b32 %r1004, %r967, %r998; + xor.b32 %r1005, %r966, %r999; + xor.b32 %r1010, %r969, %r1001; + xor.b32 %r1011, %r968, %r1000; + xor.b32 %r1012, %r969, %r1000; + xor.b32 %r1013, %r968, %r1001; + xor.b32 %r1154, %r1005, %r1149; + xor.b32 %r1155, %r1004, %r1148; + xor.b32 %r1156, %r1003, %r1147; + xor.b32 %r1157, %r1002, %r1146; + xor.b32 %r1162, %r1013, %r1153; + xor.b32 %r1163, %r1012, %r1152; + xor.b32 %r1164, %r1011, %r1151; + xor.b32 %r1165, %r1010, %r1150; + add.s32 %r1182, %r1179, %r1172; + add.s32 %r1183, %r1181, %r1170; + add.s32 %r1184, %r1178, %r1173; + add.s32 %r1185, %r1180, %r1171; + add.s32 %r1190, %r1187, %r1176; + add.s32 %r1191, %r1189, %r1174; + add.s32 %r1192, %r1186, %r1177; + add.s32 %r1193, %r1188, %r1175; + add.s32 %r1206, %r1203, %r1196; + add.s32 %r1207, %r1205, %r1194; + add.s32 %r1208, %r1202, %r1197; + add.s32 %r1209, %r1204, %r1195; + add.s32 %r1214, %r1211, %r1200; + add.s32 %r1215, %r1213, %r1198; + add.s32 %r1216, %r1210, %r1201; + add.s32 %r1217, %r1212, %r1199; + add.s32 %r1230, %r1228, %r1220; + add.s32 %r1231, %r1227, %r1219; + add.s32 %r1232, %r1226, %r1218; + add.s32 %r1233, %r1229, %r1221; + add.s32 %r1238, %r1236, %r1224; + add.s32 %r1239, %r1235, %r1223; + add.s32 %r1240, %r1234, %r1222; + add.s32 %r1241, %r1237, %r1225; + add.s32 %r1254, %r1252, %r1244; + add.s32 %r1255, %r1251, %r1243; + add.s32 %r1256, %r1250, %r1242; + add.s32 %r1257, %r1253, %r1245; + add.s32 %r1262, %r1260, %r1248; + add.s32 %r1263, %r1259, %r1247; + add.s32 %r1264, %r1258, %r1246; + add.s32 %r1265, %r1261, %r1249; + setp.lt.s32 %p483, %r1185, %r1209; + setp.lt.s32 %p484, %r1184, %r1208; + setp.lt.s32 %p485, %r1183, %r1207; + setp.lt.s32 %p486, %r1182, %r1206; + setp.lt.s32 %p487, %r1193, %r1217; + setp.lt.s32 %p488, %r1192, %r1216; + setp.lt.s32 %p489, %r1191, %r1215; + setp.lt.s32 %p490, %r1190, %r1214; + setp.eq.b32 %p491, %r1182, %r1206; + setp.eq.b32 %p492, %r1183, %r1207; + setp.eq.b32 %p493, %r1184, %r1208; + setp.eq.b32 %p494, %r1185, %r1209; + setp.eq.b32 %p495, %r1190, %r1214; + setp.eq.b32 %p496, %r1191, %r1215; + setp.eq.b32 %p497, %r1192, %r1216; + setp.eq.b32 %p498, %r1193, %r1217; + setp.gt.s32 %p499, %r1231, %r1255; + setp.gt.s32 %p500, %r1233, %r1257; + setp.gt.s32 %p501, %r1232, %r1256; + setp.gt.s32 %p502, %r1230, %r1254; + setp.gt.s32 %p503, %r1239, %r1263; + setp.gt.s32 %p504, %r1241, %r1265; + setp.gt.s32 %p505, %r1240, %r1264; + setp.gt.s32 %p506, %r1238, %r1262; + and.pred %p507, %p494, %p502; + and.pred %p508, %p493, %p501; + and.pred %p509, %p492, %p500; + and.pred %p510, %p491, %p499; + and.pred %p511, %p498, %p506; + and.pred %p512, %p497, %p505; + and.pred %p513, %p496, %p504; + and.pred %p514, %p495, %p503; + or.pred %p515, %p486, %p510; + or.pred %p516, %p485, %p509; + or.pred %p517, %p484, %p508; + or.pred %p518, %p483, %p507; + or.pred %p519, %p490, %p514; + or.pred %p520, %p489, %p513; + or.pred %p521, %p488, %p512; + or.pred %p522, %p487, %p511; + xor.b32 %r1266, %r1206, %r1182; + xor.b32 %r1267, %r1207, %r1183; + xor.b32 %r1268, %r1208, %r1184; + xor.b32 %r1269, %r1209, %r1185; + xor.b32 %r1270, %r1214, %r1190; + xor.b32 %r1271, %r1215, %r1191; + xor.b32 %r1272, %r1216, %r1192; + xor.b32 %r1273, %r1217, %r1193; + selp.b32 %r1274, %r1269, 0, %p518; + selp.b32 %r1275, %r1268, 0, %p517; + selp.b32 %r1276, %r1267, 0, %p516; + selp.b32 %r1277, %r1266, 0, %p515; + selp.b32 %r1278, %r1273, 0, %p522; + selp.b32 %r1279, %r1272, 0, %p521; + selp.b32 %r1280, %r1271, 0, %p520; + selp.b32 %r1281, %r1270, 0, %p519; + xor.b32 %r1282, %r1274, %r1132; + xor.b32 %r1283, %r1275, %r1131; + xor.b32 %r1284, %r1277, %r1130; + xor.b32 %r1285, %r1276, %r1133; + xor.b32 %r1286, %r1278, %r1136; + xor.b32 %r1287, %r1279, %r1135; + xor.b32 %r1288, %r1281, %r1134; + xor.b32 %r1289, %r1280, %r1137; + xor.b32 %r1290, %r1254, %r1230; + xor.b32 %r1291, %r1255, %r1231; + xor.b32 %r1292, %r1256, %r1232; + xor.b32 %r1293, %r1257, %r1233; + xor.b32 %r1294, %r1262, %r1238; + xor.b32 %r1295, %r1263, %r1239; + xor.b32 %r1296, %r1264, %r1240; + xor.b32 %r1297, %r1265, %r1241; + selp.b32 %r1298, %r1291, 0, %p515; + selp.b32 %r1299, %r1290, 0, %p518; + selp.b32 %r1300, %r1293, 0, %p516; + selp.b32 %r1301, %r1292, 0, %p517; + selp.b32 %r1302, %r1295, 0, %p519; + selp.b32 %r1303, %r1294, 0, %p522; + selp.b32 %r1304, %r1297, 0, %p520; + selp.b32 %r1305, %r1296, 0, %p521; + xor.b32 %r1306, %r1157, %r1301; + xor.b32 %r1307, %r1156, %r1300; + xor.b32 %r1308, %r1301, %r1159; + xor.b32 %r1309, %r1299, %r1161; + xor.b32 %r1310, %r1298, %r1158; + xor.b32 %r1311, %r1300, %r1160; + xor.b32 %r1312, %r1165, %r1305; + xor.b32 %r1313, %r1164, %r1304; + xor.b32 %r1314, %r1305, %r1167; + xor.b32 %r1315, %r1303, %r1169; + xor.b32 %r1316, %r1302, %r1166; + xor.b32 %r1317, %r1304, %r1168; + setp.lt.s32 %p523, %r1283, %r1282; + setp.lt.s32 %p524, %r1284, %r1285; + setp.lt.s32 %p525, %r1287, %r1286; + setp.lt.s32 %p526, %r1288, %r1289; + setp.eq.b32 %p527, %r1283, %r1282; + setp.eq.b32 %p528, %r1284, %r1285; + setp.eq.b32 %p529, %r1287, %r1286; + setp.eq.b32 %p530, %r1288, %r1289; + setp.gt.s32 %p531, %r1308, %r1309; + setp.gt.s32 %p532, %r1310, %r1311; + setp.gt.s32 %p533, %r1314, %r1315; + setp.gt.s32 %p534, %r1316, %r1317; + and.pred %p535, %p527, %p531; + and.pred %p536, %p528, %p532; + and.pred %p537, %p529, %p533; + and.pred %p538, %p530, %p534; + or.pred %p539, %p523, %p535; + or.pred %p540, %p524, %p536; + or.pred %p541, %p525, %p537; + or.pred %p542, %p526, %p538; + xor.b32 %r1318, %r1282, %r1283; + xor.b32 %r1319, %r1285, %r1284; + xor.b32 %r1320, %r1286, %r1287; + xor.b32 %r1321, %r1289, %r1288; + selp.b32 %r1322, %r1319, 0, %p540; + selp.b32 %r1323, %r1318, 0, %p539; + selp.b32 %r1324, %r1321, 0, %p542; + selp.b32 %r1325, %r1320, 0, %p541; + xor.b32 %r1326, %r1323, %r1282; + xor.b32 %r1327, %r1322, %r1284; + xor.b32 %r1328, %r1323, %r1283; + xor.b32 %r1329, %r1322, %r1285; + xor.b32 %r1330, %r1325, %r1286; + xor.b32 %r1331, %r1324, %r1288; + xor.b32 %r1332, %r1325, %r1287; + xor.b32 %r1333, %r1324, %r1289; + xor.b32 %r1334, %r1311, %r1310; + xor.b32 %r1335, %r1309, %r1308; + xor.b32 %r1336, %r1317, %r1316; + xor.b32 %r1337, %r1315, %r1314; + selp.b32 %r1338, %r1335, 0, %p539; + selp.b32 %r1339, %r1334, 0, %p540; + selp.b32 %r1340, %r1337, 0, %p541; + selp.b32 %r1341, %r1336, 0, %p542; + xor.b32 %r1342, %r1307, %r1339; + xor.b32 %r1343, %r1299, %r1338; + xor.b32 %r1344, %r1343, %r1155; + xor.b32 %r1345, %r1298, %r1339; + xor.b32 %r1346, %r1345, %r1154; + xor.b32 %r1347, %r1306, %r1338; + xor.b32 %r1348, %r1339, %r1311; + xor.b32 %r1349, %r1339, %r1310; + xor.b32 %r1350, %r1338, %r1309; + xor.b32 %r1351, %r1338, %r1308; + xor.b32 %r1352, %r1313, %r1341; + xor.b32 %r1353, %r1303, %r1340; + xor.b32 %r1354, %r1353, %r1163; + xor.b32 %r1355, %r1302, %r1341; + xor.b32 %r1356, %r1355, %r1162; + xor.b32 %r1357, %r1312, %r1340; + xor.b32 %r1358, %r1341, %r1317; + xor.b32 %r1359, %r1341, %r1316; + xor.b32 %r1360, %r1340, %r1315; + xor.b32 %r1361, %r1340, %r1314; + setp.lt.s32 %p543, %r1328, %r1327; + setp.lt.s32 %p544, %r1326, %r1329; + setp.lt.s32 %p545, %r1332, %r1331; + setp.lt.s32 %p546, %r1330, %r1333; + setp.eq.b32 %p547, %r1327, %r1328; + setp.eq.b32 %p548, %r1326, %r1329; + setp.eq.b32 %p549, %r1331, %r1332; + setp.eq.b32 %p550, %r1330, %r1333; + setp.gt.s32 %p551, %r1351, %r1349; + setp.gt.s32 %p552, %r1350, %r1348; + setp.gt.s32 %p553, %r1361, %r1359; + setp.gt.s32 %p554, %r1360, %r1358; + xor.b32 %r1362, %r1343, %r1300; + xor.b32 %r1363, %r1345, %r1301; + xor.b32 %r1364, %r1363, %r1006; + xor.b32 %r1365, %r1362, %r1007; + xor.b32 %r1366, %r1365, %r1148; + xor.b32 %r1367, %r1364, %r1149; + xor.b32 %r1368, %r1367, %r1008; + xor.b32 %r1369, %r1366, %r1009; + xor.b32 %r1370, %r1369, %r1147; + xor.b32 %r1371, %r1368, %r1146; + xor.b32 %r1372, %r1371, %r1338; + xor.b32 %r1373, %r1370, %r1339; + xor.b32 %r1374, %r1353, %r1304; + xor.b32 %r1375, %r1355, %r1305; + xor.b32 %r1376, %r1375, %r1014; + xor.b32 %r1377, %r1374, %r1015; + xor.b32 %r1378, %r1377, %r1152; + xor.b32 %r1379, %r1376, %r1153; + xor.b32 %r1380, %r1379, %r1016; + xor.b32 %r1381, %r1378, %r1017; + xor.b32 %r1382, %r1381, %r1151; + xor.b32 %r1383, %r1380, %r1150; + xor.b32 %r1384, %r1383, %r1340; + xor.b32 %r1385, %r1382, %r1341; + selp.b32 %r1386, %r1373, 0, %p552; + selp.b32 %r1387, %r1386, 0, %p548; + selp.b32 %r1388, %r1373, %r1387, %p544; + selp.b32 %r1389, %r1372, 0, %p551; + selp.b32 %r1390, %r1389, 0, %p547; + selp.b32 %r1391, %r1372, %r1390, %p543; + selp.b32 %r1392, %r1385, 0, %p554; + selp.b32 %r1393, %r1392, 0, %p550; + selp.b32 %r1394, %r1385, %r1393, %p546; + selp.b32 %r1395, %r1384, 0, %p553; + selp.b32 %r1396, %r1395, 0, %p549; + selp.b32 %r1397, %r1384, %r1396, %p545; + xor.b32 %r1398, %r1347, %r1391; + xor.b32 %r1399, %r1346, %r1391; + xor.b32 %r1400, %r1344, %r1388; + xor.b32 %r1401, %r1342, %r1388; + xor.b32 %r19, %r1401, %r941; + xor.b32 %r18, %r1400, %r939; + xor.b32 %r17, %r1399, %r940; + xor.b32 %r16, %r1398, %r938; + xor.b32 %r1402, %r1357, %r1397; + xor.b32 %r1403, %r1356, %r1397; + xor.b32 %r1404, %r1354, %r1394; + xor.b32 %r1405, %r1352, %r1394; + xor.b32 %r23, %r1405, %r945; + xor.b32 %r22, %r1404, %r943; + xor.b32 %r21, %r1403, %r944; + xor.b32 %r20, %r1402, %r942; + .loc 1 61 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:61:21 + ld.shared.b64 %rd6, [%r1427]; + cvt.u32.u64 %r1462, %rd5; + cvt.u32.u64 %r1463, %rd4; + .loc 1 71 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:71:63 + bar.sync 0; + .loc 1 75 19 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:75:19 + setp.lt.s32 %p599, %r26, %r1463; + setp.lt.s32 %p600, %r27, %r1463; + setp.lt.s32 %p601, %r24, %r1463; + setp.lt.s32 %p602, %r25, %r1463; + .loc 1 76 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:76:35 + selp.b32 %r1464, %r17, 16, %p602; + selp.b32 %r1465, %r16, 16, %p601; + selp.b32 %r1466, %r19, 16, %p600; + selp.b32 %r1467, %r18, 16, %p599; + .loc 1 77 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:77:20 + add.s32 %r1468, %r1467, 17; + add.s32 %r1469, %r1466, 17; + add.s32 %r1470, %r1465, 17; + add.s32 %r1471, %r1464, 17; + .loc 1 78 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:78:20 + setp.lt.s32 %p603, %r1467, 0; + setp.lt.s32 %p604, %r1466, 0; + setp.lt.s32 %p605, %r1465, 0; + setp.lt.s32 %p606, %r1464, 0; + .loc 1 79 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:79:35 + selp.b32 %r37, %r1471, %r1464, %p606; + selp.b32 %r36, %r1470, %r1465, %p605; + selp.b32 %r39, %r1469, %r1466, %p604; + selp.b32 %r38, %r1468, %r1467, %p603; + .loc 1 80 38 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:80:38 + setp.gt.u32 %p607, %r38, 16; + selp.b16 %rs73, 1, 0, %p607; + shl.b16 %rs74, %rs73, 2; + setp.gt.u32 %p608, %r39, 16; + selp.b16 %rs75, -1, 0, %p608; + shl.b16 %rs76, %rs75, 3; + or.b16 %rs77, %rs76, %rs74; + setp.gt.u32 %p609, %r36, 16; + selp.b16 %rs78, 1, 0, %p609; + setp.gt.u32 %p610, %r37, 16; + selp.b16 %rs79, -1, 0, %p610; + shl.b16 %rs80, %rs79, 1; + or.b16 %rs81, %rs78, %rs80; + and.b16 %rs82, %rs81, 3; + or.b16 %rs83, %rs82, %rs77; + .loc 1 75 19 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:75:19 + setp.lt.s32 %p611, %r26, %r1462; + setp.lt.s32 %p612, %r27, %r1462; + setp.lt.s32 %p613, %r24, %r1462; + setp.lt.s32 %p614, %r25, %r1462; + .loc 1 76 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:76:35 + selp.b32 %r1472, %r21, 16, %p614; + selp.b32 %r1473, %r20, 16, %p613; + selp.b32 %r1474, %r23, 16, %p612; + selp.b32 %r1475, %r22, 16, %p611; + .loc 1 77 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:77:20 + add.s32 %r1476, %r1475, 17; + add.s32 %r1477, %r1474, 17; + add.s32 %r1478, %r1473, 17; + add.s32 %r1479, %r1472, 17; + .loc 1 78 20 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:78:20 + setp.lt.s32 %p615, %r1475, 0; + setp.lt.s32 %p616, %r1474, 0; + setp.lt.s32 %p617, %r1473, 0; + setp.lt.s32 %p618, %r1472, 0; + .loc 1 79 35 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:79:35 + selp.b32 %r41, %r1479, %r1472, %p618; + selp.b32 %r40, %r1478, %r1473, %p617; + selp.b32 %r43, %r1477, %r1474, %p616; + selp.b32 %r42, %r1476, %r1475, %p615; + .loc 1 80 38 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:80:38 + setp.gt.u32 %p619, %r42, 16; + selp.b16 %rs84, 1, 0, %p619; + shl.b16 %rs85, %rs84, 2; + setp.gt.u32 %p620, %r43, 16; + selp.b16 %rs86, -1, 0, %p620; + shl.b16 %rs87, %rs86, 3; + or.b16 %rs88, %rs87, %rs85; + setp.gt.u32 %p621, %r40, 16; + selp.b16 %rs89, 1, 0, %p621; + setp.gt.u32 %p622, %r41, 16; + selp.b16 %rs90, -1, 0, %p622; + shl.b16 %rs91, %rs90, 1; + or.b16 %rs92, %rs89, %rs91; + and.b16 %rs93, %rs92, 3; + or.b16 %rs94, %rs93, %rs88; + .loc 1 80 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:80:63 + and.b16 %rs95, %rs83, 15; + setp.ne.b16 %p623, %rs95, 0; + and.pred %p624, %p2, %p623; + and.b16 %rs96, %rs94, 15; + setp.ne.b16 %p625, %rs96, 0; + and.pred %p626, %p4, %p625; + or.pred %p627, %p624, %p626; + not.pred %p628, %p627; + @%p628 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:0:63 + ld.param.b64 %rd12, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r48; + cvt.s64.s32 %rd2, %r49; + .loc 1 61 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:61:21 + cvt.u32.u64 %r1481, %rd6; + .loc 1 60 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:60:21 + cvt.u32.u64 %r1480, %rd3; + .loc 1 25 23 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:25:23 + or.b32 %r1514, %r1, %r4; + .loc 1 26 21 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:26:21 + setp.lt.s32 %p633, %r1514, 128; + .loc 1 80 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:80:63 + bar.sync 0; + .loc 1 81 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:81:25 + mul.wide.s32 %rd107, %r1514, 4; + add.s64 %rd85, %rd7, %rd107; + .loc 1 81 37 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:81:37 + and.b32 %r1515, %r2, 128; + setp.eq.b32 %p651, %r1515, 0; + and.pred %p629, %p651, %p633; + // begin inline asm + @%p629 st.global.b32 [ %rd85 + 0 ], { %r1480 }; + // end inline asm + .loc 1 82 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:82:25 + add.s64 %rd86, %rd8, %rd107; + .loc 1 82 37 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:82:37 + // begin inline asm + @%p629 st.global.b32 [ %rd86 + 0 ], { %r1481 }; + // end inline asm + .loc 1 83 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:83:25 + shl.b64 %rd108, %rd1, 2; + add.s64 %rd87, %rd9, %rd108; + shl.b64 %rd109, %rd2, 2; + add.s64 %rd88, %rd9, %rd109; + .loc 1 83 47 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:83:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd87 + 0 ], { %r8, %r9, %r10, %r11 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd88 + 0 ], { %r12, %r13, %r14, %r15 }; + // end inline asm + .loc 1 84 52 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:84:52 + mul.lo.s32 %r1516, %r5, 17; + add.s32 %r1517, %r1516, 1088; + .loc 1 84 49 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:84:49 + add.s32 %r1518, %r28, %r1516; + add.s32 %r1519, %r29, %r1516; + add.s32 %r1520, %r30, %r1516; + add.s32 %r1521, %r31, %r1516; + add.s32 %r1522, %r32, %r1517; + add.s32 %r1523, %r33, %r1517; + add.s32 %r1524, %r34, %r1517; + add.s32 %r1525, %r35, %r1517; + .loc 1 84 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:84:25 + mad.wide.s32 %rd110, %r1518, 4, %rd10; + mad.wide.s32 %rd111, %r1519, 4, %rd10; + mad.wide.s32 %rd112, %r1520, 4, %rd10; + mad.wide.s32 %rd113, %r1521, 4, %rd10; + mad.wide.s32 %rd114, %r1522, 4, %rd10; + mad.wide.s32 %rd115, %r1523, 4, %rd10; + mad.wide.s32 %rd116, %r1524, 4, %rd10; + mad.wide.s32 %rd117, %r1525, 4, %rd10; + .loc 1 84 85 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:84:85 + bar.sync 0; + shl.b32 %r1526, %r2, 9; + and.b32 %r1527, %r1526, 12288; + shl.b32 %r1528, %r7, 5; + shl.b32 %r1529, %r3, 2; + or.b32 %r1530, %r1527, %r1528; + xor.b32 %r1531, %r1530, %r1529; + add.s32 %r1533, %r1424, %r1531; + st.shared.v2.b64 [%r1533], {%rd110, %rd112}; + st.shared.v2.b64 [%r1533+2048], {%rd111, %rd113}; + st.shared.v2.b64 [%r1533+1024], {%rd114, %rd116}; + st.shared.v2.b64 [%r1533+3072], {%rd115, %rd117}; + bar.sync 0; + shl.b32 %r1534, %r2, 11; + and.b32 %r1535, %r1534, 12288; + shl.b32 %r1536, %r2, 4; + and.b32 %r1537, %r1536, 4080; + or.b32 %r1538, %r1535, %r1537; + add.s32 %r1539, %r1424, %r1538; + ld.shared.v2.b64 {%rd89, %rd90}, [%r1539]; + xor.b32 %r1540, %r1538, 32; + add.s32 %r1541, %r1424, %r1540; + ld.shared.v2.b64 {%rd91, %rd92}, [%r1541]; + xor.b32 %r1542, %r1538, 64; + add.s32 %r1543, %r1424, %r1542; + ld.shared.v2.b64 {%rd93, %rd94}, [%r1543]; + xor.b32 %r1544, %r1538, 96; + add.s32 %r1545, %r1424, %r1544; + ld.shared.v2.b64 {%rd95, %rd96}, [%r1545]; + mov.b32 %r1490, 1; + // begin inline asm + @%p633 st.global.b32 [ %rd89 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd90 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd91 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd92 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd93 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd94 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd95 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd96 + 0 ], { %r1490 }; + // end inline asm + .loc 1 85 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:85:25 + add.s64 %rd97, %rd11, %rd108; + add.s64 %rd98, %rd11, %rd109; + .loc 1 85 47 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:85:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd97 + 0 ], { %r16, %r17, %r18, %r19 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd98 + 0 ], { %r20, %r21, %r22, %r23 }; + // end inline asm + .loc 1 86 49 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:86:49 + add.s32 %r1546, %r36, %r1516; + add.s32 %r1547, %r37, %r1516; + add.s32 %r1548, %r38, %r1516; + add.s32 %r1549, %r39, %r1516; + add.s32 %r1550, %r40, %r1517; + add.s32 %r1551, %r41, %r1517; + add.s32 %r1552, %r42, %r1517; + add.s32 %r1553, %r43, %r1517; + .loc 1 86 25 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:86:25 + mad.wide.s32 %rd118, %r1546, 4, %rd12; + mad.wide.s32 %rd119, %r1547, 4, %rd12; + mad.wide.s32 %rd120, %r1548, 4, %rd12; + mad.wide.s32 %rd121, %r1549, 4, %rd12; + mad.wide.s32 %rd122, %r1550, 4, %rd12; + mad.wide.s32 %rd123, %r1551, 4, %rd12; + mad.wide.s32 %rd124, %r1552, 4, %rd12; + mad.wide.s32 %rd125, %r1553, 4, %rd12; + .loc 1 86 85 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:86:85 + bar.sync 0; + st.shared.v2.b64 [%r1533], {%rd118, %rd120}; + st.shared.v2.b64 [%r1533+2048], {%rd119, %rd121}; + st.shared.v2.b64 [%r1533+1024], {%rd122, %rd124}; + st.shared.v2.b64 [%r1533+3072], {%rd123, %rd125}; + bar.sync 0; + ld.shared.v2.b64 {%rd99, %rd100}, [%r1539]; + ld.shared.v2.b64 {%rd101, %rd102}, [%r1541]; + ld.shared.v2.b64 {%rd103, %rd104}, [%r1543]; + ld.shared.v2.b64 {%rd105, %rd106}, [%r1545]; + // begin inline asm + @%p633 st.global.b32 [ %rd99 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd100 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd101 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd102 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd103 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd104 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd105 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd106 + 0 ], { %r1490 }; + // end inline asm + .loc 1 86 4 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd132, assertFunc_0; + cvta.global.u64 %rd133, %rd132; + st.param.b64 [param3], %rd133; + mov.b64 %rd134, assertFile_0; + cvta.global.u64 %rd135, %rd134; + st.param.b64 [param1], %rd135; + mov.b64 %rd136, assertMessage_0; + cvta.global.u64 %rd137, %rd136; + st.param.b64 [param0], %rd137; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd126, assertFunc_1; + cvta.global.u64 %rd127, %rd126; + st.param.b64 [param3], %rd127; + mov.b64 %rd128, assertFile_1; + cvta.global.u64 %rd129, %rd128; + st.param.b64 [param1], %rd129; + mov.b64 %rd130, assertMessage_1; + cvta.global.u64 %rd131, %rd130; + st.param.b64 [param0], %rd131; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 117 +.b8 98 +.b8 99 +.b8 122 +.b8 97 +.b8 98 +.b8 114 +.b8 102 +.b8 50 +.b8 112 +.b8 116 +.b8 114 +.b8 121 +.b8 113 +.b8 50 +.b8 97 +.b8 116 +.b8 104 +.b8 110 +.b8 109 +.b8 114 +.b8 117 +.b8 50 +.b8 98 +.b8 121 +.b8 105 +.b8 112 +.b8 98 +.b8 103 +.b8 97 +.b8 114 +.b8 105 +.b8 111 +.b8 97 +.b8 51 +.b8 52 +.b8 54 +.b8 50 +.b8 112 +.b8 117 +.b8 120 +.b8 118 +.b8 50 +.b8 106 +.b8 119 +.b8 118 +.b8 54 +.b8 118 +.b8 109 +.b8 52 +.b8 99 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 117 +.b8 98 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp9 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..e2212a3bb4d6ad029c5a8eea42e4f92b6fbd62f3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 128 : i32 loc(#loc205) + %xoffset_3 = arith.constant 128 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<128x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<128x1xi32> loc(#loc208) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<128x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<128x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<128x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<128x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<128x16xf32> to tensor<128x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<128x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<128x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<128x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<128x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<128x16xi8> to tensor<128x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<128x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<128x16xi1> to tensor<128x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<128x16xi8> to tensor<128x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<128x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<128x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<128x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<128x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<128x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<128x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<128x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<128x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<128x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<128x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<128x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<128x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<128x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<128x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<128x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<128x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<128x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<128x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<128x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<128x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<128x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<128x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<128x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<128x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<128x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<128x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<128x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<128x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<128x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<128x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<128x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<128x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc91)), %idxs: tensor<128x16xi16> loc("idxs"(#loc91))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc94) + %5 = ub.poison : tensor<128x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi16> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi16>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc102) + %2 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi16> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<1024x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<1024x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<128x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<128x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<128x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<128x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<128x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<128x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<128x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S128_16S__(%idxs) : (tensor<128x16xi16>) -> tensor<128x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi32> loc("input"(#loc166))) -> tensor<1024x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc167) + tt.return %0 : tensor<1024x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc169) + tt.return %1 : tensor<1024x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi16> loc("input"(#loc166))) -> tensor<1024x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc167) + tt.return %0 : tensor<1024x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc169) + tt.return %1 : tensor<1024x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc179))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<128x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<128x16xi32> loc(#loc181) + tt.return %4 : tensor<128x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<128x16xi32> loc(#loc183) + tt.return %5 : tensor<128x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S128_16S__(%input: tensor<128x16xi32> loc("input"(#loc188))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<128x16xi32> loc(#loc189) + tt.return %0 : tensor<128x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc191) + tt.return %1 : tensor<128x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<128x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc185) + tt.return %cst : tensor<128x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi32> loc(#loc187) + tt.return %0 : tensor<128x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S128_16S__(%input: tensor<128x16xi16> loc("input"(#loc188))) -> tensor<128x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<128x16xi16> loc(#loc189) + tt.return %0 : tensor<128x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi16> loc(#loc191) + tt.return %1 : tensor<128x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<128x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<128x16xi16> loc(#loc185) + tt.return %cst : tensor<128x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi16> loc(#loc187) + tt.return %0 : tensor<128x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x16xi32> loc(#loc102) + %3 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<512x2x2xi32> loc("input"(#loc166))) -> tensor<512x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc167) + tt.return %0 : tensor<512x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<512x2xi32> loc(#loc169) + tt.return %1 : tensor<512x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<128x16xi32> loc(#loc102) + %4 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x4xi32> loc("input"(#loc166))) -> tensor<256x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc167) + tt.return %0 : tensor<256x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x4xi32> loc(#loc169) + tt.return %1 : tensor<256x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc102) + %5 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x8xi32> loc("input"(#loc166))) -> tensor<128x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc167) + tt.return %0 : tensor<128x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x8xi32> loc(#loc169) + tt.return %1 : tensor<128x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x16xi64> loc("input"(#loc166))) -> tensor<128xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc167) + tt.return %0 : tensor<128xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xi64> loc(#loc169) + tt.return %1 : tensor<128xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8b92c38164418340bfb0a9e19ece201ebe17120c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [8, 2, 2], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [16, 2, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 2, 2], threadsPerWarp = [32, 1, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [2, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<128x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<128x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<128x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<128x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<128x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<128x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<128x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<128x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<128x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<128x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<128x1xi1, #blocked5> -> tensor<128x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<128x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<128x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<128x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<128x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<128x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<128x16xi16, #blocked> -> tensor<1024x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<1024x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<1024x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<1024x2x1xi16, #blocked4> to tensor<1024x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<1024x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<1024x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<1024x2x1xi16, #blocked4> to tensor<1024x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<128x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<128x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<128x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<128x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<128x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<128x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<128x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<128x16xi1, #blocked> to tensor<128x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<128x16xi1, #blocked> to tensor<128x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<128x1xi64, #blocked5> to tensor<128x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<128x1xi64, #blocked5> to tensor<128x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<128x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<128x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<128x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<128x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<128x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<128x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<128x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<128x1x!tt.ptr, #blocked5>, tensor<128x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<128x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<128x1x!tt.ptr, #blocked5>, tensor<128x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<128x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<128x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<128x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<128x16x!tt.ptr, #blocked> -> tensor<128x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<128x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<128x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<128x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<128x16x!tt.ptr, #blocked> -> tensor<128x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<128x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1a96ef3fe578600bfd505b0253b4c3ee687f5191 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<128x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc129) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<128x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<128x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<128x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<128x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<128x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<128x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<128x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<128x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<128x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<1024x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<1024x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<128x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<128x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<128x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<128x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<128x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<128x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<128x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<128x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<128x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<128x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<128x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<128x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<128x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<128x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<128x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<128x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<128x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<128x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<128x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<128x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<128x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<256x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<256x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<256x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<256x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<128x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<128x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<128x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<128x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<128x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<128x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<128x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<128x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<128x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<128x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<128x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<128x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<128x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<128x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<128x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<128x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<128x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<128x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<128x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<128x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<128x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<128x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<128x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<128x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<128x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<128x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<128x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<128x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<256x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<256x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<256x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<256x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<128x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<128x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<128x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<128x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<128x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<128x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<128x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<128x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<128x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<128x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<128x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<128x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<128x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<128x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<128x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<128x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<128x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<128x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<128x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<128x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<128x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<128x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<128x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<128x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<128x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<128x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<128x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<128x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<128x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<128x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<128x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<128x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<128x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<128x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<128x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<128x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<128x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<128x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<128x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<256x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<256x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<256x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<256x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<128x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<128x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<128x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<128x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<128x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<128x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<128x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<128x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<128x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<128x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<128x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<128x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<128x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<128x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<128x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<128x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<128x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<128x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<128x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<128x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<128x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<128x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<128x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<128x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<128x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<128x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<128x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<128x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<256x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<256x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<256x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<256x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<128x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<128x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<128x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<128x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<128x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<128x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<128x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<128x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<128x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<128x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<128x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<128x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<128x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<128x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<128x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<128x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<128x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<128x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<128x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<128x16xi1> to tensor<128x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<128x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<128x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<128x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<128x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<128x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<128x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<128x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<128x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<128x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<128x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<128x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<128x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<128x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<128x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<128x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<128x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<128x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<128x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<128x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<128x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ub/cubczabrf2ptryq2athnmru2byipbgarioa3462puxv2jwv6vm4c.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c23e548139641523f6132de428d4cd25d89801a9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..164b46c6576832bfa583cfad5547b0577a1200ff Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4787571dad5bed9532fbcbab0e4c45a9b958637 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "fe376ba41d2f05ff2bb7c13c37b09b85f38cb341d26f28c38925cd0f032bd098", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..025433a7ff9c840e195c6d8e5c3bb9269709ee35 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 32, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not = icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg !55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + +119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = !DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) +!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1b11fbf3e2d8add77c3c92486030bfcf91ee7e47 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 118, 114, 47, 99, 118, 114, 104, 110, 114, 109, 112, 103, 121, 120, 119, 117, 51, 52, 120, 108, 101, 99, 108, 101, 101, 51, 116, 116, 52, 107, 101, 109, 111, 108, 100, 107, 106, 55, 105, 97, 109, 52, 117, 99, 105, 97, 116, 104, 111, 109, 105, 114, 118, 108, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:24:21 + setp.lt.u32 %p3, %r49, 32; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, %r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:34 + mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:24:21 + setp.lt.u32 %p10, %r49, 32; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 %rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm + @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:130 + bar.sync 0; + .loc 1 61 29 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:61:94 + and.pred %p27, %p30, %p18; + // begin inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 118 +.b8 114 +.b8 104 +.b8 110 +.b8 114 +.b8 109 +.b8 112 +.b8 103 +.b8 121 +.b8 120 +.b8 119 +.b8 117 +.b8 51 +.b8 52 +.b8 120 +.b8 108 +.b8 101 +.b8 99 +.b8 108 +.b8 101 +.b8 101 +.b8 51 +.b8 116 +.b8 116 +.b8 52 +.b8 107 +.b8 101 +.b8 109 +.b8 111 +.b8 108 +.b8 100 +.b8 107 +.b8 106 +.b8 55 +.b8 105 +.b8 97 +.b8 109 +.b8 52 +.b8 117 +.b8 99 +.b8 105 +.b8 97 +.b8 116 +.b8 104 +.b8 111 +.b8 109 +.b8 105 +.b8 114 +.b8 118 +.b8 108 +.b8 99 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 118 +.b8 114 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..4e0ef11385fb011050e258c0f8cae6472a265690 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) + %_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":36:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:47) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:95) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..684ee4d35f6f0b3a94718da7dae431a8c9aa41ec --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,270 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + %tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":55:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:58) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:94) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..af7ba3811d4ffed8b32881ee0bb0ad683be9f5a8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc1 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc79) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_6 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_7 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_9 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_10 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x32xi64> loc(#loc86) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_15 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc124) + %tmp0_16 = arith.andi %r0_mask_8, %tmp0_15 : tensor<1x32xi1> loc(#loc88) + %tmp0_17 = tt.load %tmp0_14, %tmp0_16, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_17 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_6, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_18 = arith.select %tmp0_16, %tmp4, %_tmp3_6 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_18 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_6: i64 loc(callsite(#loc1 at #loc93)), %tmp3_7: i64 loc(callsite(#loc1 at #loc93))): + %tmp3_8 = arith.addi %tmp3_6, %tmp3_7 : i64 loc(#loc135) + tt.reduce.return %tmp3_8 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_5 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_6 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_8 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_9 = arith.extui %tmp6_8 : i1 to i64 loc(#loc100) + %tmp6_10 = arith.muli %ks0, %tmp6_9 : i64 loc(#loc100) + %tmp6_11 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_12 = arith.addi %tmp6_11, %tmp6_10 : i64 loc(#loc101) + %tmp6_13 = arith.extsi %xoffset : i32 to i64 loc(#loc103) + %tmp6_14 = arith.muli %tmp6_13, %tmp6_12 : i64 loc(#loc103) + %tmp6_15 = arith.extsi %r0_index_6 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_16 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_17 = arith.addi %tmp6_15, %tmp6_16 : tensor<1x32xi64> loc(#loc104) + %tmp6_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_19 = tt.addptr %tmp6_18, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_20 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_21 = arith.andi %r0_mask_7, %tmp6_20 : tensor<1x32xi1> loc(#loc106) + %tmp6_22 = tt.load %tmp6_19, %tmp6_21, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_22 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_23 = arith.cmpi slt, %r0_index_6, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_24 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_25 = arith.select %tmp9_23, %tmp11, %tmp11_24 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_26 = arith.addi %tmp11_25, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_25, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_26, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_27 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_28 = arith.select %fixed, %fixed_27, %quot : i64 loc(#loc134) + %4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_28, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_21, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_13 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_13 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":25:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":25:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:100) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:53) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:58) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vr/cvrhnrmpgyxwu34xleclee3tt4kemoldkj7iam4uciathomirvlc.py":43:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) +#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc79]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, #loc103]) +#loc129 = loc(fused[#loc106, #loc79]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3af5f88620de332581403408fa4f6fdca5a8747b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b3e126b6215f21e799b5f001445fa10b651a8257 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..beec3fa5a32eff013a0bab0800bb387fd59bbb0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "07c7815d2ce5fa33e16044674f04a1dcbb415776e0b5f0da0149af801b6db42c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..5cc41198df34aa2432b108f80761fd83df3aaeda --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,1814 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 5, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = lshr i32 %10, 5, !dbg !9 + %12 = and i32 %10, 31, !dbg !9 + %13 = lshr i32 %10, 2, !dbg !9 + %14 = and i32 %13, 31, !dbg !9 + %15 = or disjoint i32 %9, %12, !dbg !10 + %16 = or disjoint i32 %14, %9, !dbg !10 + %17 = icmp slt i32 %15, 32, !dbg !11 + %18 = icmp slt i32 %16, 32, !dbg !11 + %19 = and i32 %10, 96, !dbg !12 + %20 = lshr exact i32 %19, 5, !dbg !12 + %21 = or disjoint i32 %20, 4, !dbg !12 + %22 = or disjoint i32 %20, 8, !dbg !12 + %23 = or disjoint i32 %20, 12, !dbg !12 + %24 = shl nuw nsw i32 %10, 2, !dbg !12 + %25 = and i32 %24, 12, !dbg !12 + %26 = sdiv i32 %15, 16, !dbg !13 + %27 = mul nuw nsw i32 %20, 17, !dbg !14 + %28 = mul nuw nsw i32 %21, 17, !dbg !14 + %29 = mul nuw nsw i32 %22, 17, !dbg !14 + %30 = mul nuw nsw i32 %23, 17, !dbg !14 + %31 = shl i32 %26, 8, !dbg !15 + %32 = add i32 %31, %15, !dbg !15 + %33 = add i32 %32, %27, !dbg !16 + %34 = add i32 %32, %28, !dbg !16 + %35 = add i32 %32, %29, !dbg !16 + %36 = add i32 %32, %30, !dbg !16 + %37 = sext i32 %33 to i64, !dbg !17 + %38 = getelementptr i32, ptr addrspace(1) %0, i64 %37, !dbg !17 + %39 = sext i32 %34 to i64, !dbg !17 + %40 = getelementptr i32, ptr addrspace(1) %0, i64 %39, !dbg !17 + %41 = sext i32 %35 to i64, !dbg !17 + %42 = getelementptr i32, ptr addrspace(1) %0, i64 %41, !dbg !17 + %43 = sext i32 %36 to i64, !dbg !17 + %44 = getelementptr i32, ptr addrspace(1) %0, i64 %43, !dbg !17 + %45 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %38, i1 %17) #5, !dbg !18 + %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %40, i1 %17) #5, !dbg !18 + %47 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %42, i1 %17) #5, !dbg !18 + %48 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %44, i1 %17) #5, !dbg !18 + %49 = lshr i32 %10, 6, !dbg !19 + %.lobit = and i32 %49, 1, !dbg !19 + %.lobit1 = and i32 %11, 1, !dbg !19 + %50 = xor i32 %.lobit1, 1, !dbg !23 + %51 = xor i32 %.lobit, 1, !dbg !23 + %52 = mul nuw nsw i32 %45, %50, !dbg !24 + %53 = mul nuw nsw i32 %46, %50, !dbg !24 + %54 = mul nuw nsw i32 %47, %50, !dbg !24 + %55 = mul nuw nsw i32 %48, %50, !dbg !24 + %.idx66 = shl nuw nsw i32 %.lobit, 3, !dbg !25 + %56 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx66, !dbg !25 + %.idx67 = shl nuw nsw i32 %12, 6, !dbg !25 + %57 = getelementptr i8, ptr addrspace(3) %56, i32 %.idx67, !dbg !25 + %58 = getelementptr i32, ptr addrspace(3) %57, i32 %.lobit1, !dbg !25 + %59 = insertelement <1 x i32> poison, i32 %52, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %59, i1 true) #5, !dbg !25 + %60 = getelementptr i8, ptr addrspace(3) %57, i32 16, !dbg !25 + %61 = getelementptr i32, ptr addrspace(3) %60, i32 %.lobit1, !dbg !25 + %62 = insertelement <1 x i32> poison, i32 %53, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %62, i1 true) #5, !dbg !25 + %63 = getelementptr i8, ptr addrspace(3) %57, i32 32, !dbg !25 + %64 = getelementptr i32, ptr addrspace(3) %63, i32 %.lobit1, !dbg !25 + %65 = insertelement <1 x i32> poison, i32 %54, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %65, i1 true) #5, !dbg !25 + %66 = getelementptr i8, ptr addrspace(3) %57, i32 48, !dbg !25 + %67 = getelementptr i32, ptr addrspace(3) %66, i32 %.lobit1, !dbg !25 + %68 = insertelement <1 x i32> poison, i32 %55, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %68, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %69 = icmp samesign ult i32 %10, 512, !dbg !25 + %70 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25 + %71 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !25 + %73 = add i32 %72, %71, !dbg !28 + %74 = and i32 %10, 513, !dbg !25 + %75 = icmp eq i32 %74, 0, !dbg !25 + %76 = insertelement <1 x i32> poison, i32 %73, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %76, i1 %75) #5, !dbg !25 + %77 = getelementptr i8, ptr addrspace(3) %70, i32 512, !dbg !25 + %78 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !25 + %80 = add i32 %79, %78, !dbg !28 + %81 = insertelement <1 x i32> poison, i32 %80, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %81, i1 %75) #5, !dbg !25 + %82 = getelementptr i8, ptr addrspace(3) %70, i32 1024, !dbg !25 + %83 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 1, i32 31), !dbg !25 + %85 = add i32 %84, %83, !dbg !28 + %86 = insertelement <1 x i32> poison, i32 %85, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %86, i1 %75) #5, !dbg !25 + %87 = getelementptr i8, ptr addrspace(3) %70, i32 1536, !dbg !25 + %88 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !25 + %90 = add i32 %89, %88, !dbg !28 + %91 = insertelement <1 x i32> poison, i32 %90, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %91, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %92 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %93 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %94 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %95 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %96 = mul nuw nsw i32 %45, %.lobit1, !dbg !29 + %97 = mul nuw nsw i32 %46, %.lobit1, !dbg !29 + %98 = mul nuw nsw i32 %47, %.lobit1, !dbg !29 + %99 = mul nuw nsw i32 %48, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %100 = insertelement <1 x i32> poison, i32 %96, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %100, i1 true) #5, !dbg !25 + %101 = insertelement <1 x i32> poison, i32 %97, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %101, i1 true) #5, !dbg !25 + %102 = insertelement <1 x i32> poison, i32 %98, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %102, i1 true) #5, !dbg !25 + %103 = insertelement <1 x i32> poison, i32 %99, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %103, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %104 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !25 + %106 = add i32 %105, %104, !dbg !28 + %107 = insertelement <1 x i32> poison, i32 %106, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %107, i1 %75) #5, !dbg !25 + %108 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 1, i32 31), !dbg !25 + %110 = add i32 %109, %108, !dbg !28 + %111 = insertelement <1 x i32> poison, i32 %110, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %111, i1 %75) #5, !dbg !25 + %112 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !25 + %114 = add i32 %113, %112, !dbg !28 + %115 = insertelement <1 x i32> poison, i32 %114, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %115, i1 %75) #5, !dbg !25 + %116 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 1, i32 31), !dbg !25 + %118 = add i32 %117, %116, !dbg !28 + %119 = insertelement <1 x i32> poison, i32 %118, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %119, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %120 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %121 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %122 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %123 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %124 = mul nuw nsw i32 %50, %20, !dbg !30 + %125 = mul nuw nsw i32 %21, %50, !dbg !30 + %126 = mul nuw nsw i32 %22, %50, !dbg !30 + %127 = mul nuw nsw i32 %23, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %128 = insertelement <1 x i32> poison, i32 %124, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %128, i1 true) #5, !dbg !25 + %129 = insertelement <1 x i32> poison, i32 %125, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %129, i1 true) #5, !dbg !25 + %130 = insertelement <1 x i32> poison, i32 %126, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %130, i1 true) #5, !dbg !25 + %131 = insertelement <1 x i32> poison, i32 %127, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %131, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %132 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 1, i32 31), !dbg !25 + %134 = add i32 %133, %132, !dbg !28 + %135 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %135, i1 %75) #5, !dbg !25 + %136 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !25 + %138 = add i32 %137, %136, !dbg !28 + %139 = insertelement <1 x i32> poison, i32 %138, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %139, i1 %75) #5, !dbg !25 + %140 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 1, i32 31), !dbg !25 + %142 = add i32 %141, %140, !dbg !28 + %143 = insertelement <1 x i32> poison, i32 %142, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %143, i1 %75) #5, !dbg !25 + %144 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !25 + %146 = add i32 %145, %144, !dbg !28 + %147 = insertelement <1 x i32> poison, i32 %146, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %147, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %148 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %149 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %150 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %151 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %152 = mul nuw nsw i32 %20, %.lobit1, !dbg !31 + %153 = mul nuw nsw i32 %21, %.lobit1, !dbg !31 + %154 = mul nuw nsw i32 %22, %.lobit1, !dbg !31 + %155 = mul nuw nsw i32 %23, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %156 = insertelement <1 x i32> poison, i32 %152, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %156, i1 true) #5, !dbg !25 + %157 = insertelement <1 x i32> poison, i32 %153, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %157, i1 true) #5, !dbg !25 + %158 = insertelement <1 x i32> poison, i32 %154, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %158, i1 true) #5, !dbg !25 + %159 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %159, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %160 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !25 + %162 = add i32 %161, %160, !dbg !28 + %163 = insertelement <1 x i32> poison, i32 %162, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %163, i1 %75) #5, !dbg !25 + %164 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !25 + %166 = add i32 %165, %164, !dbg !28 + %167 = insertelement <1 x i32> poison, i32 %166, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %167, i1 %75) #5, !dbg !25 + %168 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 1, i32 31), !dbg !25 + %170 = add i32 %169, %168, !dbg !28 + %171 = insertelement <1 x i32> poison, i32 %170, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %171, i1 %75) #5, !dbg !25 + %172 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !25 + %174 = add i32 %173, %172, !dbg !28 + %175 = insertelement <1 x i32> poison, i32 %174, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %175, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %176 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %177 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %178 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %179 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %180 = trunc i32 %49 to i1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %181 = shl nuw nsw i32 %12, 2, !dbg !25 + %182 = or disjoint i32 %181, 1, !dbg !25 + %183 = or disjoint i32 %181, 2, !dbg !25 + %184 = or disjoint i32 %181, 3, !dbg !25 + %185 = shl nuw nsw i32 %.lobit1, 7, !dbg !25 + %186 = or disjoint i32 %185, %181, !dbg !25 + %.idx = shl nuw nsw i32 %186, 3, !dbg !25 + %187 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25 + %188 = getelementptr i32, ptr addrspace(3) %187, i32 %.lobit, !dbg !25 + %189 = or disjoint i32 %185, %182, !dbg !25 + %.idx63 = shl nuw nsw i32 %189, 3, !dbg !25 + %190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx63, !dbg !25 + %191 = getelementptr i32, ptr addrspace(3) %190, i32 %.lobit, !dbg !25 + %192 = or disjoint i32 %185, %183, !dbg !25 + %.idx64 = shl nuw nsw i32 %192, 3, !dbg !25 + %193 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx64, !dbg !25 + %194 = getelementptr i32, ptr addrspace(3) %193, i32 %.lobit, !dbg !25 + %195 = or disjoint i32 %185, %184, !dbg !25 + %.idx65 = shl nuw nsw i32 %195, 3, !dbg !25 + %196 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx65, !dbg !25 + %197 = getelementptr i32, ptr addrspace(3) %196, i32 %.lobit, !dbg !25 + %198 = insertelement <2 x i32> poison, i32 %92, i64 0, !dbg !32 + %199 = insertelement <2 x i32> %198, i32 %93, i64 1, !dbg !32 + %200 = insertelement <2 x i32> poison, i32 %120, i64 0, !dbg !32 + %201 = insertelement <2 x i32> %200, i32 %121, i64 1, !dbg !32 + %202 = icmp sge <2 x i32> %199, %201, !dbg !32 + %203 = icmp ne <2 x i32> %199, %201, !dbg !32 + %204 = insertelement <2 x i32> poison, i32 %148, i64 0, !dbg !32 + %205 = insertelement <2 x i32> %204, i32 %149, i64 1, !dbg !32 + %206 = insertelement <2 x i32> poison, i32 %176, i64 0, !dbg !32 + %207 = insertelement <2 x i32> %206, i32 %177, i64 1, !dbg !32 + %208 = icmp sle <2 x i32> %205, %207, !dbg !32 + %209 = or <2 x i1> %203, %208, !dbg !32 + %210 = and <2 x i1> %202, %209, !dbg !32 + %211 = insertelement <2 x i1> poison, i1 %180, i64 0, !dbg !32 + %212 = shufflevector <2 x i1> %211, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !32 + %213 = xor <2 x i1> %210, %212, !dbg !32 + %214 = insertelement <2 x i32> %206, i32 %121, i64 1, !dbg !33 + %215 = insertelement <2 x i32> %204, i32 %93, i64 1, !dbg !33 + %216 = xor <2 x i32> %214, %215, !dbg !33 + %217 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !33 + %218 = insertelement <2 x i32> %217, i32 %120, i64 1, !dbg !33 + %219 = insertelement <2 x i32> poison, i32 %149, i64 0, !dbg !33 + %220 = insertelement <2 x i32> %219, i32 %92, i64 1, !dbg !33 + %221 = xor <2 x i32> %218, %220, !dbg !33 + %222 = select <2 x i1> %213, <2 x i32> zeroinitializer, <2 x i32> %216, !dbg !34 + %223 = shufflevector <2 x i1> %213, <2 x i1> poison, <2 x i32> , !dbg !34 + %224 = select <2 x i1> %223, <2 x i32> zeroinitializer, <2 x i32> %221, !dbg !34 + %225 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !35 + %226 = insertelement <2 x i32> %225, i32 %46, i64 1, !dbg !35 + %227 = xor <2 x i32> %222, %226, !dbg !35 + %228 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !35 + %229 = insertelement <2 x i32> %228, i32 %45, i64 1, !dbg !35 + %230 = xor <2 x i32> %224, %229, !dbg !35 + %231 = extractelement <2 x i32> %230, i64 1, !dbg !29 + %232 = mul nuw nsw i32 %231, %51, !dbg !24 + %233 = extractelement <2 x i32> %227, i64 1, !dbg !29 + %234 = mul nuw nsw i32 %233, %51, !dbg !24 + %235 = insertelement <1 x i32> poison, i32 %232, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %235, i1 true) #5, !dbg !25 + %236 = insertelement <1 x i32> poison, i32 %234, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %236, i1 true) #5, !dbg !25 + %237 = mul nuw nsw i32 %231, %.lobit, !dbg !29 + %238 = mul nuw nsw i32 %233, %.lobit, !dbg !29 + %239 = insertelement <1 x i32> poison, i32 %237, i64 0, !dbg !25 + %240 = insertelement <1 x i32> poison, i32 %238, i64 0, !dbg !25 + %241 = extractelement <2 x i32> %227, i64 0, !dbg !31 + %242 = mul nuw nsw i32 %241, %51, !dbg !30 + %243 = extractelement <2 x i32> %230, i64 0, !dbg !31 + %244 = mul nuw nsw i32 %243, %51, !dbg !30 + %245 = insertelement <1 x i32> poison, i32 %242, i64 0, !dbg !25 + %246 = insertelement <1 x i32> poison, i32 %244, i64 0, !dbg !25 + %247 = mul nuw nsw i32 %241, %.lobit, !dbg !31 + %248 = mul nuw nsw i32 %243, %.lobit, !dbg !31 + %249 = insertelement <1 x i32> poison, i32 %247, i64 0, !dbg !25 + %250 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !25 + %251 = insertelement <2 x i32> poison, i32 %94, i64 0, !dbg !32 + %252 = insertelement <2 x i32> %251, i32 %95, i64 1, !dbg !32 + %253 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !32 + %254 = insertelement <2 x i32> %253, i32 %123, i64 1, !dbg !32 + %255 = icmp sge <2 x i32> %252, %254, !dbg !32 + %256 = icmp ne <2 x i32> %252, %254, !dbg !32 + %257 = insertelement <2 x i32> poison, i32 %150, i64 0, !dbg !32 + %258 = insertelement <2 x i32> %257, i32 %151, i64 1, !dbg !32 + %259 = insertelement <2 x i32> poison, i32 %178, i64 0, !dbg !32 + %260 = insertelement <2 x i32> %259, i32 %179, i64 1, !dbg !32 + %261 = icmp sle <2 x i32> %258, %260, !dbg !32 + %262 = or <2 x i1> %256, %261, !dbg !32 + %263 = and <2 x i1> %255, %262, !dbg !32 + %264 = xor <2 x i1> %263, %212, !dbg !32 + %265 = insertelement <2 x i32> %259, i32 %123, i64 1, !dbg !33 + %266 = insertelement <2 x i32> %257, i32 %95, i64 1, !dbg !33 + %267 = xor <2 x i32> %265, %266, !dbg !33 + %268 = insertelement <2 x i32> poison, i32 %179, i64 0, !dbg !33 + %269 = insertelement <2 x i32> %268, i32 %122, i64 1, !dbg !33 + %270 = insertelement <2 x i32> poison, i32 %151, i64 0, !dbg !33 + %271 = insertelement <2 x i32> %270, i32 %94, i64 1, !dbg !33 + %272 = xor <2 x i32> %269, %271, !dbg !33 + %273 = select <2 x i1> %264, <2 x i32> zeroinitializer, <2 x i32> %267, !dbg !34 + %274 = shufflevector <2 x i1> %264, <2 x i1> poison, <2 x i32> , !dbg !34 + %275 = select <2 x i1> %274, <2 x i32> zeroinitializer, <2 x i32> %272, !dbg !34 + %276 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !35 + %277 = insertelement <2 x i32> %276, i32 %48, i64 1, !dbg !35 + %278 = xor <2 x i32> %273, %277, !dbg !35 + %279 = insertelement <2 x i32> poison, i32 %23, i64 0, !dbg !35 + %280 = insertelement <2 x i32> %279, i32 %47, i64 1, !dbg !35 + %281 = xor <2 x i32> %275, %280, !dbg !35 + %282 = extractelement <2 x i32> %281, i64 1, !dbg !29 + %283 = mul nuw nsw i32 %282, %51, !dbg !24 + %284 = extractelement <2 x i32> %278, i64 1, !dbg !29 + %285 = mul nuw nsw i32 %284, %51, !dbg !24 + %286 = insertelement <1 x i32> poison, i32 %283, i64 0, !dbg !25 + %287 = insertelement <1 x i32> poison, i32 %285, i64 0, !dbg !25 + %288 = mul nuw nsw i32 %282, %.lobit, !dbg !29 + %289 = mul nuw nsw i32 %284, %.lobit, !dbg !29 + %290 = insertelement <1 x i32> poison, i32 %288, i64 0, !dbg !25 + %291 = insertelement <1 x i32> poison, i32 %289, i64 0, !dbg !25 + %292 = extractelement <2 x i32> %278, i64 0, !dbg !31 + %293 = mul nuw nsw i32 %292, %51, !dbg !30 + %294 = extractelement <2 x i32> %281, i64 0, !dbg !31 + %295 = mul nuw nsw i32 %294, %51, !dbg !30 + %296 = insertelement <1 x i32> poison, i32 %293, i64 0, !dbg !25 + %297 = insertelement <1 x i32> poison, i32 %295, i64 0, !dbg !25 + %298 = mul nuw nsw i32 %292, %.lobit, !dbg !31 + %299 = mul nuw nsw i32 %294, %.lobit, !dbg !31 + %300 = insertelement <1 x i32> poison, i32 %298, i64 0, !dbg !25 + %301 = insertelement <1 x i32> poison, i32 %299, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %286, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %287, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %302 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 1, i32 31), !dbg !25 + %304 = add i32 %303, %302, !dbg !28 + %305 = insertelement <1 x i32> poison, i32 %304, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %305, i1 %75) #5, !dbg !25 + %306 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 1, i32 31), !dbg !25 + %308 = add i32 %307, %306, !dbg !28 + %309 = insertelement <1 x i32> poison, i32 %308, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %309, i1 %75) #5, !dbg !25 + %310 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %311 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %310, i32 1, i32 31), !dbg !25 + %312 = add i32 %311, %310, !dbg !28 + %313 = insertelement <1 x i32> poison, i32 %312, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %313, i1 %75) #5, !dbg !25 + %314 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !25 + %316 = add i32 %315, %314, !dbg !28 + %317 = insertelement <1 x i32> poison, i32 %316, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %317, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %318 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %319 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %320 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %321 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %239, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %240, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %290, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %291, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %322 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !25 + %324 = add i32 %323, %322, !dbg !28 + %325 = insertelement <1 x i32> poison, i32 %324, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %325, i1 %75) #5, !dbg !25 + %326 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %326, i32 1, i32 31), !dbg !25 + %328 = add i32 %327, %326, !dbg !28 + %329 = insertelement <1 x i32> poison, i32 %328, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %329, i1 %75) #5, !dbg !25 + %330 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 1, i32 31), !dbg !25 + %332 = add i32 %331, %330, !dbg !28 + %333 = insertelement <1 x i32> poison, i32 %332, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %333, i1 %75) #5, !dbg !25 + %334 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 1, i32 31), !dbg !25 + %336 = add i32 %335, %334, !dbg !28 + %337 = insertelement <1 x i32> poison, i32 %336, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %337, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %338 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %339 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %340 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %341 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %245, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %246, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %296, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %297, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %342 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 1, i32 31), !dbg !25 + %344 = add i32 %343, %342, !dbg !28 + %345 = insertelement <1 x i32> poison, i32 %344, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %345, i1 %75) #5, !dbg !25 + %346 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 1, i32 31), !dbg !25 + %348 = add i32 %347, %346, !dbg !28 + %349 = insertelement <1 x i32> poison, i32 %348, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %349, i1 %75) #5, !dbg !25 + %350 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %351 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 1, i32 31), !dbg !25 + %352 = add i32 %351, %350, !dbg !28 + %353 = insertelement <1 x i32> poison, i32 %352, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %353, i1 %75) #5, !dbg !25 + %354 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 1, i32 31), !dbg !25 + %356 = add i32 %355, %354, !dbg !28 + %357 = insertelement <1 x i32> poison, i32 %356, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %357, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %358 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %359 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %360 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %361 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %249, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %250, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %300, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %301, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %362 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %362, i32 1, i32 31), !dbg !25 + %364 = add i32 %363, %362, !dbg !28 + %365 = insertelement <1 x i32> poison, i32 %364, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %365, i1 %75) #5, !dbg !25 + %366 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !25 + %368 = add i32 %367, %366, !dbg !28 + %369 = insertelement <1 x i32> poison, i32 %368, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %369, i1 %75) #5, !dbg !25 + %370 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 1, i32 31), !dbg !25 + %372 = add i32 %371, %370, !dbg !28 + %373 = insertelement <1 x i32> poison, i32 %372, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %373, i1 %75) #5, !dbg !25 + %374 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %374, i32 1, i32 31), !dbg !25 + %376 = add i32 %375, %374, !dbg !28 + %377 = insertelement <1 x i32> poison, i32 %376, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %377, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %378 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %379 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %380 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %381 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %382 = icmp slt i32 %318, %338, !dbg !36 + %383 = icmp sge i32 %319, %339, !dbg !36 + %384 = icmp slt i32 %320, %340, !dbg !36 + %385 = icmp sge i32 %321, %341, !dbg !36 + %386 = icmp eq i32 %318, %338, !dbg !37 + %387 = icmp ne i32 %319, %339, !dbg !37 + %388 = icmp eq i32 %320, %340, !dbg !37 + %389 = icmp ne i32 %321, %341, !dbg !37 + %390 = icmp sgt i32 %358, %378, !dbg !38 + %391 = icmp sle i32 %359, %379, !dbg !38 + %392 = icmp sgt i32 %360, %380, !dbg !38 + %393 = icmp sle i32 %361, %381, !dbg !38 + %394 = insertelement <2 x i1> poison, i1 %386, i64 0, !dbg !39 + %395 = insertelement <2 x i1> %394, i1 %387, i64 1, !dbg !39 + %396 = insertelement <2 x i1> poison, i1 %390, i64 0, !dbg !39 + %397 = insertelement <2 x i1> %396, i1 %391, i64 1, !dbg !39 + %398 = or <2 x i1> %395, %397, !dbg !39 + %399 = and <2 x i1> %395, %397, !dbg !39 + %400 = shufflevector <2 x i1> %399, <2 x i1> %398, <2 x i32> , !dbg !39 + %401 = insertelement <2 x i1> poison, i1 %388, i64 0, !dbg !40 + %402 = insertelement <2 x i1> %401, i1 %389, i64 1, !dbg !40 + %403 = insertelement <2 x i1> poison, i1 %392, i64 0, !dbg !40 + %404 = insertelement <2 x i1> %403, i1 %393, i64 1, !dbg !40 + %405 = or <2 x i1> %402, %404, !dbg !40 + %406 = and <2 x i1> %402, %404, !dbg !40 + %407 = shufflevector <2 x i1> %406, <2 x i1> %405, <2 x i32> , !dbg !40 + %408 = insertelement <2 x i1> poison, i1 %382, i64 0, !dbg !40 + %409 = insertelement <2 x i1> %408, i1 %383, i64 1, !dbg !40 + %410 = and <2 x i1> %409, %400, !dbg !40 + %411 = or <2 x i1> %409, %400, !dbg !40 + %412 = shufflevector <2 x i1> %411, <2 x i1> %410, <2 x i32> , !dbg !40 + %413 = insertelement <2 x i1> poison, i1 %384, i64 0, !dbg !41 + %414 = insertelement <2 x i1> %413, i1 %385, i64 1, !dbg !41 + %415 = and <2 x i1> %414, %407, !dbg !41 + %416 = or <2 x i1> %414, %407, !dbg !41 + %417 = shufflevector <2 x i1> %416, <2 x i1> %415, <2 x i32> , !dbg !41 + %418 = insertelement <2 x i32> poison, i32 %378, i64 0, !dbg !33 + %419 = insertelement <2 x i32> %418, i32 %339, i64 1, !dbg !33 + %420 = insertelement <2 x i32> poison, i32 %358, i64 0, !dbg !33 + %421 = insertelement <2 x i32> %420, i32 %319, i64 1, !dbg !33 + %422 = xor <2 x i32> %419, %421, !dbg !33 + %423 = insertelement <2 x i32> poison, i32 %379, i64 0, !dbg !33 + %424 = insertelement <2 x i32> %423, i32 %338, i64 1, !dbg !33 + %425 = insertelement <2 x i32> poison, i32 %359, i64 0, !dbg !33 + %426 = insertelement <2 x i32> %425, i32 %318, i64 1, !dbg !33 + %427 = xor <2 x i32> %424, %426, !dbg !33 + %428 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !33 + %429 = insertelement <2 x i32> %428, i32 %341, i64 1, !dbg !33 + %430 = insertelement <2 x i32> poison, i32 %360, i64 0, !dbg !33 + %431 = insertelement <2 x i32> %430, i32 %321, i64 1, !dbg !33 + %432 = xor <2 x i32> %429, %431, !dbg !33 + %433 = insertelement <2 x i32> poison, i32 %381, i64 0, !dbg !33 + %434 = insertelement <2 x i32> %433, i32 %340, i64 1, !dbg !33 + %435 = insertelement <2 x i32> poison, i32 %361, i64 0, !dbg !33 + %436 = insertelement <2 x i32> %435, i32 %320, i64 1, !dbg !33 + %437 = xor <2 x i32> %434, %436, !dbg !33 + %438 = select <2 x i1> %412, <2 x i32> %422, <2 x i32> zeroinitializer, !dbg !34 + %439 = shufflevector <2 x i1> %410, <2 x i1> %411, <2 x i32> , !dbg !34 + %440 = select <2 x i1> %439, <2 x i32> %427, <2 x i32> zeroinitializer, !dbg !34 + %441 = select <2 x i1> %417, <2 x i32> %432, <2 x i32> zeroinitializer, !dbg !34 + %442 = shufflevector <2 x i1> %415, <2 x i1> %416, <2 x i32> , !dbg !34 + %443 = select <2 x i1> %442, <2 x i32> %437, <2 x i32> zeroinitializer, !dbg !34 + %444 = xor <2 x i32> %438, %227, !dbg !35 + %445 = xor <2 x i32> %440, %230, !dbg !35 + %446 = xor <2 x i32> %441, %278, !dbg !35 + %447 = xor <2 x i32> %443, %281, !dbg !35 + %448 = extractelement <2 x i32> %445, i64 1, !dbg !29 + %449 = mul nuw nsw i32 %448, %50, !dbg !24 + %450 = extractelement <2 x i32> %444, i64 1, !dbg !29 + %451 = mul nuw nsw i32 %450, %50, !dbg !24 + %452 = extractelement <2 x i32> %447, i64 1, !dbg !29 + %453 = mul nuw nsw i32 %452, %50, !dbg !24 + %454 = extractelement <2 x i32> %446, i64 1, !dbg !29 + %455 = mul nuw nsw i32 %454, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %456 = insertelement <1 x i32> poison, i32 %449, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %456, i1 true) #5, !dbg !25 + %457 = insertelement <1 x i32> poison, i32 %451, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %457, i1 true) #5, !dbg !25 + %458 = insertelement <1 x i32> poison, i32 %453, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %458, i1 true) #5, !dbg !25 + %459 = insertelement <1 x i32> poison, i32 %455, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %459, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %460 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %461 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %460, i32 1, i32 31), !dbg !25 + %462 = add i32 %461, %460, !dbg !28 + %463 = insertelement <1 x i32> poison, i32 %462, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %463, i1 %75) #5, !dbg !25 + %464 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 1, i32 31), !dbg !25 + %466 = add i32 %465, %464, !dbg !28 + %467 = insertelement <1 x i32> poison, i32 %466, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %467, i1 %75) #5, !dbg !25 + %468 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %469 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %468, i32 1, i32 31), !dbg !25 + %470 = add i32 %469, %468, !dbg !28 + %471 = insertelement <1 x i32> poison, i32 %470, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %471, i1 %75) #5, !dbg !25 + %472 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 1, i32 31), !dbg !25 + %474 = add i32 %473, %472, !dbg !28 + %475 = insertelement <1 x i32> poison, i32 %474, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %475, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %476 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %477 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %478 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %479 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %480 = mul nuw nsw i32 %448, %.lobit1, !dbg !29 + %481 = mul nuw nsw i32 %450, %.lobit1, !dbg !29 + %482 = mul nuw nsw i32 %452, %.lobit1, !dbg !29 + %483 = mul nuw nsw i32 %454, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %484 = insertelement <1 x i32> poison, i32 %480, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %484, i1 true) #5, !dbg !25 + %485 = insertelement <1 x i32> poison, i32 %481, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %485, i1 true) #5, !dbg !25 + %486 = insertelement <1 x i32> poison, i32 %482, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %486, i1 true) #5, !dbg !25 + %487 = insertelement <1 x i32> poison, i32 %483, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %487, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %488 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 1, i32 31), !dbg !25 + %490 = add i32 %489, %488, !dbg !28 + %491 = insertelement <1 x i32> poison, i32 %490, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %491, i1 %75) #5, !dbg !25 + %492 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 1, i32 31), !dbg !25 + %494 = add i32 %493, %492, !dbg !28 + %495 = insertelement <1 x i32> poison, i32 %494, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %495, i1 %75) #5, !dbg !25 + %496 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 1, i32 31), !dbg !25 + %498 = add i32 %497, %496, !dbg !28 + %499 = insertelement <1 x i32> poison, i32 %498, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %499, i1 %75) #5, !dbg !25 + %500 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 1, i32 31), !dbg !25 + %502 = add i32 %501, %500, !dbg !28 + %503 = insertelement <1 x i32> poison, i32 %502, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %503, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %504 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %505 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %506 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %507 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %508 = extractelement <2 x i32> %444, i64 0, !dbg !31 + %509 = mul nuw nsw i32 %508, %50, !dbg !30 + %510 = extractelement <2 x i32> %445, i64 0, !dbg !31 + %511 = mul nuw nsw i32 %510, %50, !dbg !30 + %512 = extractelement <2 x i32> %446, i64 0, !dbg !31 + %513 = mul nuw nsw i32 %512, %50, !dbg !30 + %514 = extractelement <2 x i32> %447, i64 0, !dbg !31 + %515 = mul nuw nsw i32 %514, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %516 = insertelement <1 x i32> poison, i32 %509, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %516, i1 true) #5, !dbg !25 + %517 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %517, i1 true) #5, !dbg !25 + %518 = insertelement <1 x i32> poison, i32 %513, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %518, i1 true) #5, !dbg !25 + %519 = insertelement <1 x i32> poison, i32 %515, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %519, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %520 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %521 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %520, i32 1, i32 31), !dbg !25 + %522 = add i32 %521, %520, !dbg !28 + %523 = insertelement <1 x i32> poison, i32 %522, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %523, i1 %75) #5, !dbg !25 + %524 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %525 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %524, i32 1, i32 31), !dbg !25 + %526 = add i32 %525, %524, !dbg !28 + %527 = insertelement <1 x i32> poison, i32 %526, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %527, i1 %75) #5, !dbg !25 + %528 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %529 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %528, i32 1, i32 31), !dbg !25 + %530 = add i32 %529, %528, !dbg !28 + %531 = insertelement <1 x i32> poison, i32 %530, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %531, i1 %75) #5, !dbg !25 + %532 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %533 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %532, i32 1, i32 31), !dbg !25 + %534 = add i32 %533, %532, !dbg !28 + %535 = insertelement <1 x i32> poison, i32 %534, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %535, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %536 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %537 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %538 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %539 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %540 = mul nuw nsw i32 %508, %.lobit1, !dbg !31 + %541 = mul nuw nsw i32 %510, %.lobit1, !dbg !31 + %542 = mul nuw nsw i32 %512, %.lobit1, !dbg !31 + %543 = mul nuw nsw i32 %514, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %544 = insertelement <1 x i32> poison, i32 %540, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %544, i1 true) #5, !dbg !25 + %545 = insertelement <1 x i32> poison, i32 %541, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %545, i1 true) #5, !dbg !25 + %546 = insertelement <1 x i32> poison, i32 %542, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %546, i1 true) #5, !dbg !25 + %547 = insertelement <1 x i32> poison, i32 %543, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %547, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %548 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %548, i32 1, i32 31), !dbg !25 + %550 = add i32 %549, %548, !dbg !28 + %551 = insertelement <1 x i32> poison, i32 %550, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %551, i1 %75) #5, !dbg !25 + %552 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %552, i32 1, i32 31), !dbg !25 + %554 = add i32 %553, %552, !dbg !28 + %555 = insertelement <1 x i32> poison, i32 %554, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %555, i1 %75) #5, !dbg !25 + %556 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %556, i32 1, i32 31), !dbg !25 + %558 = add i32 %557, %556, !dbg !28 + %559 = insertelement <1 x i32> poison, i32 %558, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %559, i1 %75) #5, !dbg !25 + %560 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %560, i32 1, i32 31), !dbg !25 + %562 = add i32 %561, %560, !dbg !28 + %563 = insertelement <1 x i32> poison, i32 %562, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %563, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %564 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %565 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %566 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %567 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %568 = icmp slt i32 %476, %504, !dbg !36 + %569 = icmp sge i32 %477, %505, !dbg !36 + %570 = icmp slt i32 %478, %506, !dbg !36 + %571 = icmp sge i32 %479, %507, !dbg !36 + %572 = icmp eq i32 %476, %504, !dbg !37 + %573 = icmp ne i32 %477, %505, !dbg !37 + %574 = icmp eq i32 %478, %506, !dbg !37 + %575 = icmp ne i32 %479, %507, !dbg !37 + %576 = icmp sgt i32 %536, %564, !dbg !38 + %577 = icmp sle i32 %537, %565, !dbg !38 + %578 = icmp sgt i32 %538, %566, !dbg !38 + %579 = icmp sle i32 %539, %567, !dbg !38 + %580 = insertelement <2 x i1> poison, i1 %572, i64 0, !dbg !39 + %581 = insertelement <2 x i1> %580, i1 %573, i64 1, !dbg !39 + %582 = insertelement <2 x i1> poison, i1 %576, i64 0, !dbg !39 + %583 = insertelement <2 x i1> %582, i1 %577, i64 1, !dbg !39 + %584 = or <2 x i1> %581, %583, !dbg !39 + %585 = and <2 x i1> %581, %583, !dbg !39 + %586 = shufflevector <2 x i1> %585, <2 x i1> %584, <2 x i32> , !dbg !39 + %587 = insertelement <2 x i1> poison, i1 %574, i64 0, !dbg !40 + %588 = insertelement <2 x i1> %587, i1 %575, i64 1, !dbg !40 + %589 = insertelement <2 x i1> poison, i1 %578, i64 0, !dbg !40 + %590 = insertelement <2 x i1> %589, i1 %579, i64 1, !dbg !40 + %591 = or <2 x i1> %588, %590, !dbg !40 + %592 = and <2 x i1> %588, %590, !dbg !40 + %593 = shufflevector <2 x i1> %592, <2 x i1> %591, <2 x i32> , !dbg !40 + %594 = insertelement <2 x i1> poison, i1 %568, i64 0, !dbg !40 + %595 = insertelement <2 x i1> %594, i1 %569, i64 1, !dbg !40 + %596 = and <2 x i1> %595, %586, !dbg !40 + %597 = or <2 x i1> %595, %586, !dbg !40 + %598 = shufflevector <2 x i1> %597, <2 x i1> %596, <2 x i32> , !dbg !40 + %599 = insertelement <2 x i1> poison, i1 %570, i64 0, !dbg !41 + %600 = insertelement <2 x i1> %599, i1 %571, i64 1, !dbg !41 + %601 = and <2 x i1> %600, %593, !dbg !41 + %602 = or <2 x i1> %600, %593, !dbg !41 + %603 = shufflevector <2 x i1> %602, <2 x i1> %601, <2 x i32> , !dbg !41 + %604 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !33 + %605 = insertelement <2 x i32> %604, i32 %505, i64 1, !dbg !33 + %606 = insertelement <2 x i32> poison, i32 %536, i64 0, !dbg !33 + %607 = insertelement <2 x i32> %606, i32 %477, i64 1, !dbg !33 + %608 = xor <2 x i32> %605, %607, !dbg !33 + %609 = insertelement <2 x i32> poison, i32 %565, i64 0, !dbg !33 + %610 = insertelement <2 x i32> %609, i32 %504, i64 1, !dbg !33 + %611 = insertelement <2 x i32> poison, i32 %537, i64 0, !dbg !33 + %612 = insertelement <2 x i32> %611, i32 %476, i64 1, !dbg !33 + %613 = xor <2 x i32> %610, %612, !dbg !33 + %614 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !33 + %615 = insertelement <2 x i32> %614, i32 %507, i64 1, !dbg !33 + %616 = insertelement <2 x i32> poison, i32 %538, i64 0, !dbg !33 + %617 = insertelement <2 x i32> %616, i32 %479, i64 1, !dbg !33 + %618 = xor <2 x i32> %615, %617, !dbg !33 + %619 = insertelement <2 x i32> poison, i32 %567, i64 0, !dbg !33 + %620 = insertelement <2 x i32> %619, i32 %506, i64 1, !dbg !33 + %621 = insertelement <2 x i32> poison, i32 %539, i64 0, !dbg !33 + %622 = insertelement <2 x i32> %621, i32 %478, i64 1, !dbg !33 + %623 = xor <2 x i32> %620, %622, !dbg !33 + %624 = select <2 x i1> %598, <2 x i32> %608, <2 x i32> zeroinitializer, !dbg !34 + %625 = shufflevector <2 x i1> %596, <2 x i1> %597, <2 x i32> , !dbg !34 + %626 = select <2 x i1> %625, <2 x i32> %613, <2 x i32> zeroinitializer, !dbg !34 + %627 = select <2 x i1> %603, <2 x i32> %618, <2 x i32> zeroinitializer, !dbg !34 + %628 = shufflevector <2 x i1> %601, <2 x i1> %602, <2 x i32> , !dbg !34 + %629 = select <2 x i1> %628, <2 x i32> %623, <2 x i32> zeroinitializer, !dbg !34 + %630 = xor <2 x i32> %624, %444, !dbg !35 + %631 = xor <2 x i32> %626, %445, !dbg !35 + %632 = xor <2 x i32> %627, %446, !dbg !35 + %633 = xor <2 x i32> %629, %447, !dbg !35 + %634 = extractelement <2 x i32> %630, i64 1, !dbg !36 + %635 = extractelement <2 x i32> %631, i64 1, !dbg !36 + %636 = icmp slt i32 %635, %634, !dbg !36 + %637 = extractelement <2 x i32> %632, i64 1, !dbg !36 + %638 = extractelement <2 x i32> %633, i64 1, !dbg !36 + %639 = icmp sge i32 %638, %637, !dbg !36 + %640 = icmp eq <2 x i32> %630, %631, !dbg !37 + %641 = icmp sgt <2 x i32> %630, %631, !dbg !37 + %642 = icmp ne <2 x i32> %632, %633, !dbg !37 + %643 = icmp sle <2 x i32> %632, %633, !dbg !37 + %shift = shufflevector <2 x i1> %640, <2 x i1> poison, <2 x i32> , !dbg !39 + %foldExtExtBinop = and <2 x i1> %shift, %641, !dbg !39 + %644 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !39 + %shift69 = shufflevector <2 x i1> %642, <2 x i1> poison, <2 x i32> , !dbg !40 + %foldExtExtBinop70 = or <2 x i1> %shift69, %643, !dbg !40 + %.not34 = extractelement <2 x i1> %foldExtExtBinop70, i64 0, !dbg !40 + %645 = or i1 %636, %644, !dbg !40 + %.not31 = and i1 %639, %.not34, !dbg !41 + %646 = xor i32 %634, %635, !dbg !33 + %647 = xor i32 %637, %638, !dbg !33 + %648 = select i1 %645, i32 %646, i32 0, !dbg !34 + %649 = select i1 %.not31, i32 %647, i32 0, !dbg !34 + %foldExtExtBinop72 = xor <2 x i32> %631, %630, !dbg !42 + %650 = extractelement <2 x i32> %foldExtExtBinop72, i64 0, !dbg !42 + %foldExtExtBinop74 = xor <2 x i32> %633, %632, !dbg !42 + %651 = extractelement <2 x i32> %foldExtExtBinop74, i64 0, !dbg !42 + %652 = select i1 %645, i32 %650, i32 0, !dbg !43 + %653 = select i1 %.not31, i32 %651, i32 0, !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %654 = insertelement <2 x i32> poison, i32 %648, i64 0, !dbg !35 + %655 = shufflevector <2 x i32> %654, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %656 = shufflevector <2 x i32> %630, <2 x i32> %631, <2 x i32> , !dbg !35 + %657 = xor <2 x i32> %655, %656, !dbg !35 + %658 = insertelement <2 x i32> poison, i32 %649, i64 0, !dbg !35 + %659 = shufflevector <2 x i32> %658, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %660 = shufflevector <2 x i32> %632, <2 x i32> %633, <2 x i32> , !dbg !35 + %661 = xor <2 x i32> %659, %660, !dbg !35 + %662 = insertelement <2 x i32> poison, i32 %652, i64 0, !dbg !44 + %663 = shufflevector <2 x i32> %662, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %664 = shufflevector <2 x i32> %631, <2 x i32> %630, <2 x i32> , !dbg !44 + %665 = xor <2 x i32> %663, %664, !dbg !44 + %666 = insertelement <2 x i32> poison, i32 %653, i64 0, !dbg !44 + %667 = shufflevector <2 x i32> %666, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %668 = shufflevector <2 x i32> %633, <2 x i32> %632, <2 x i32> , !dbg !44 + %669 = xor <2 x i32> %667, %668, !dbg !44 + %670 = extractelement <2 x i32> %657, i64 1, !dbg !29 + %671 = mul nuw nsw i32 %670, %51, !dbg !24 + %672 = extractelement <2 x i32> %657, i64 0, !dbg !29 + %673 = mul nuw nsw i32 %672, %51, !dbg !24 + %674 = extractelement <2 x i32> %661, i64 1, !dbg !29 + %675 = mul nuw nsw i32 %674, %51, !dbg !24 + %676 = extractelement <2 x i32> %661, i64 0, !dbg !29 + %677 = mul nuw nsw i32 %676, %51, !dbg !24 + %678 = insertelement <1 x i32> poison, i32 %671, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %678, i1 true) #5, !dbg !25 + %679 = insertelement <1 x i32> poison, i32 %673, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %679, i1 true) #5, !dbg !25 + %680 = insertelement <1 x i32> poison, i32 %675, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %680, i1 true) #5, !dbg !25 + %681 = insertelement <1 x i32> poison, i32 %677, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %681, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %682 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 1, i32 31), !dbg !25 + %684 = add i32 %683, %682, !dbg !28 + %685 = insertelement <1 x i32> poison, i32 %684, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %685, i1 %75) #5, !dbg !25 + %686 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 1, i32 31), !dbg !25 + %688 = add i32 %687, %686, !dbg !28 + %689 = insertelement <1 x i32> poison, i32 %688, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %689, i1 %75) #5, !dbg !25 + %690 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %691 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %690, i32 1, i32 31), !dbg !25 + %692 = add i32 %691, %690, !dbg !28 + %693 = insertelement <1 x i32> poison, i32 %692, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %693, i1 %75) #5, !dbg !25 + %694 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %694, i32 1, i32 31), !dbg !25 + %696 = add i32 %695, %694, !dbg !28 + %697 = insertelement <1 x i32> poison, i32 %696, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %697, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %698 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %699 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %700 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %701 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %702 = mul nuw nsw i32 %670, %.lobit, !dbg !29 + %703 = mul nuw nsw i32 %672, %.lobit, !dbg !29 + %704 = mul nuw nsw i32 %674, %.lobit, !dbg !29 + %705 = mul nuw nsw i32 %676, %.lobit, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %706 = insertelement <1 x i32> poison, i32 %702, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %706, i1 true) #5, !dbg !25 + %707 = insertelement <1 x i32> poison, i32 %703, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %707, i1 true) #5, !dbg !25 + %708 = insertelement <1 x i32> poison, i32 %704, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %708, i1 true) #5, !dbg !25 + %709 = insertelement <1 x i32> poison, i32 %705, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %709, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %710 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %711 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %710, i32 1, i32 31), !dbg !25 + %712 = add i32 %711, %710, !dbg !28 + %713 = insertelement <1 x i32> poison, i32 %712, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %713, i1 %75) #5, !dbg !25 + %714 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %715 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 1, i32 31), !dbg !25 + %716 = add i32 %715, %714, !dbg !28 + %717 = insertelement <1 x i32> poison, i32 %716, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %717, i1 %75) #5, !dbg !25 + %718 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !25 + %720 = add i32 %719, %718, !dbg !28 + %721 = insertelement <1 x i32> poison, i32 %720, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %721, i1 %75) #5, !dbg !25 + %722 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !25 + %724 = add i32 %723, %722, !dbg !28 + %725 = insertelement <1 x i32> poison, i32 %724, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %725, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %726 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %727 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %728 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %729 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %730 = extractelement <2 x i32> %665, i64 1, !dbg !31 + %731 = mul nuw nsw i32 %730, %51, !dbg !30 + %732 = extractelement <2 x i32> %665, i64 0, !dbg !31 + %733 = mul nuw nsw i32 %732, %51, !dbg !30 + %734 = extractelement <2 x i32> %669, i64 1, !dbg !31 + %735 = mul nuw nsw i32 %734, %51, !dbg !30 + %736 = extractelement <2 x i32> %669, i64 0, !dbg !31 + %737 = mul nuw nsw i32 %736, %51, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %738 = insertelement <1 x i32> poison, i32 %731, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %738, i1 true) #5, !dbg !25 + %739 = insertelement <1 x i32> poison, i32 %733, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %739, i1 true) #5, !dbg !25 + %740 = insertelement <1 x i32> poison, i32 %735, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %740, i1 true) #5, !dbg !25 + %741 = insertelement <1 x i32> poison, i32 %737, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %741, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %742 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %743 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %742, i32 1, i32 31), !dbg !25 + %744 = add i32 %743, %742, !dbg !28 + %745 = insertelement <1 x i32> poison, i32 %744, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %745, i1 %75) #5, !dbg !25 + %746 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 1, i32 31), !dbg !25 + %748 = add i32 %747, %746, !dbg !28 + %749 = insertelement <1 x i32> poison, i32 %748, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %749, i1 %75) #5, !dbg !25 + %750 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 1, i32 31), !dbg !25 + %752 = add i32 %751, %750, !dbg !28 + %753 = insertelement <1 x i32> poison, i32 %752, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %753, i1 %75) #5, !dbg !25 + %754 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %755 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %754, i32 1, i32 31), !dbg !25 + %756 = add i32 %755, %754, !dbg !28 + %757 = insertelement <1 x i32> poison, i32 %756, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %757, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %758 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %759 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %760 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %761 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %762 = mul nuw nsw i32 %730, %.lobit, !dbg !31 + %763 = mul nuw nsw i32 %732, %.lobit, !dbg !31 + %764 = mul nuw nsw i32 %734, %.lobit, !dbg !31 + %765 = mul nuw nsw i32 %736, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %766 = insertelement <1 x i32> poison, i32 %762, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %766, i1 true) #5, !dbg !25 + %767 = insertelement <1 x i32> poison, i32 %763, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %767, i1 true) #5, !dbg !25 + %768 = insertelement <1 x i32> poison, i32 %764, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %768, i1 true) #5, !dbg !25 + %769 = insertelement <1 x i32> poison, i32 %765, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %769, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %770 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 1, i32 31), !dbg !25 + %772 = add i32 %771, %770, !dbg !28 + %773 = insertelement <1 x i32> poison, i32 %772, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %773, i1 %75) #5, !dbg !25 + %774 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %775 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %774, i32 1, i32 31), !dbg !25 + %776 = add i32 %775, %774, !dbg !28 + %777 = insertelement <1 x i32> poison, i32 %776, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %777, i1 %75) #5, !dbg !25 + %778 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 1, i32 31), !dbg !25 + %780 = add i32 %779, %778, !dbg !28 + %781 = insertelement <1 x i32> poison, i32 %780, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %781, i1 %75) #5, !dbg !25 + %782 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %783 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %782, i32 1, i32 31), !dbg !25 + %784 = add i32 %783, %782, !dbg !28 + %785 = insertelement <1 x i32> poison, i32 %784, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %785, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %786 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %787 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %788 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %789 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %790 = insertelement <2 x i32> poison, i32 %699, i64 0, !dbg !36 + %791 = insertelement <2 x i32> %790, i32 %698, i64 1, !dbg !36 + %792 = insertelement <2 x i32> poison, i32 %727, i64 0, !dbg !36 + %793 = insertelement <2 x i32> %792, i32 %726, i64 1, !dbg !36 + %794 = icmp slt <2 x i32> %791, %793, !dbg !36 + %795 = insertelement <2 x i32> poison, i32 %701, i64 0, !dbg !36 + %796 = insertelement <2 x i32> %795, i32 %700, i64 1, !dbg !36 + %797 = insertelement <2 x i32> poison, i32 %729, i64 0, !dbg !36 + %798 = insertelement <2 x i32> %797, i32 %728, i64 1, !dbg !36 + %799 = icmp sge <2 x i32> %796, %798, !dbg !36 + %800 = icmp eq <2 x i32> %791, %793, !dbg !37 + %801 = icmp ne <2 x i32> %796, %798, !dbg !37 + %802 = insertelement <2 x i32> poison, i32 %759, i64 0, !dbg !38 + %803 = insertelement <2 x i32> %802, i32 %758, i64 1, !dbg !38 + %804 = insertelement <2 x i32> poison, i32 %787, i64 0, !dbg !38 + %805 = insertelement <2 x i32> %804, i32 %786, i64 1, !dbg !38 + %806 = icmp sgt <2 x i32> %803, %805, !dbg !38 + %807 = insertelement <2 x i32> poison, i32 %761, i64 0, !dbg !38 + %808 = insertelement <2 x i32> %807, i32 %760, i64 1, !dbg !38 + %809 = insertelement <2 x i32> poison, i32 %789, i64 0, !dbg !38 + %810 = insertelement <2 x i32> %809, i32 %788, i64 1, !dbg !38 + %811 = icmp sle <2 x i32> %808, %810, !dbg !38 + %812 = and <2 x i1> %800, %806, !dbg !39 + %813 = or <2 x i1> %801, %811, !dbg !40 + %814 = or <2 x i1> %794, %812, !dbg !40 + %815 = and <2 x i1> %799, %813, !dbg !41 + %816 = xor <2 x i32> %793, %791, !dbg !33 + %817 = xor <2 x i32> %798, %796, !dbg !33 + %818 = select <2 x i1> %814, <2 x i32> %816, <2 x i32> zeroinitializer, !dbg !34 + %819 = select <2 x i1> %815, <2 x i32> %817, <2 x i32> zeroinitializer, !dbg !34 + %820 = xor <2 x i32> %818, %657, !dbg !35 + %821 = xor <2 x i32> %819, %661, !dbg !35 + %822 = xor <2 x i32> %805, %803, !dbg !42 + %823 = xor <2 x i32> %810, %808, !dbg !42 + %824 = select <2 x i1> %814, <2 x i32> %822, <2 x i32> zeroinitializer, !dbg !43 + %825 = select <2 x i1> %815, <2 x i32> %823, <2 x i32> zeroinitializer, !dbg !43 + %826 = xor <2 x i32> %824, %665, !dbg !44 + %827 = xor <2 x i32> %825, %669, !dbg !44 + %828 = extractelement <2 x i32> %820, i64 1, !dbg !29 + %829 = mul nuw nsw i32 %828, %50, !dbg !24 + %830 = extractelement <2 x i32> %820, i64 0, !dbg !29 + %831 = mul nuw nsw i32 %830, %50, !dbg !24 + %832 = extractelement <2 x i32> %821, i64 1, !dbg !29 + %833 = mul nuw nsw i32 %832, %50, !dbg !24 + %834 = extractelement <2 x i32> %821, i64 0, !dbg !29 + %835 = mul nuw nsw i32 %834, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %836 = insertelement <1 x i32> poison, i32 %829, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %836, i1 true) #5, !dbg !25 + %837 = insertelement <1 x i32> poison, i32 %831, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %837, i1 true) #5, !dbg !25 + %838 = insertelement <1 x i32> poison, i32 %833, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %838, i1 true) #5, !dbg !25 + %839 = insertelement <1 x i32> poison, i32 %835, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %839, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %840 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 1, i32 31), !dbg !25 + %842 = add i32 %841, %840, !dbg !28 + %843 = insertelement <1 x i32> poison, i32 %842, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %843, i1 %75) #5, !dbg !25 + %844 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %845 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %844, i32 1, i32 31), !dbg !25 + %846 = add i32 %845, %844, !dbg !28 + %847 = insertelement <1 x i32> poison, i32 %846, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %847, i1 %75) #5, !dbg !25 + %848 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %849 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %848, i32 1, i32 31), !dbg !25 + %850 = add i32 %849, %848, !dbg !28 + %851 = insertelement <1 x i32> poison, i32 %850, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %851, i1 %75) #5, !dbg !25 + %852 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %853 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %852, i32 1, i32 31), !dbg !25 + %854 = add i32 %853, %852, !dbg !28 + %855 = insertelement <1 x i32> poison, i32 %854, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %855, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %856 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %857 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %858 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %859 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %860 = mul nuw nsw i32 %828, %.lobit1, !dbg !29 + %861 = mul nuw nsw i32 %830, %.lobit1, !dbg !29 + %862 = mul nuw nsw i32 %832, %.lobit1, !dbg !29 + %863 = mul nuw nsw i32 %834, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %864 = insertelement <1 x i32> poison, i32 %860, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %864, i1 true) #5, !dbg !25 + %865 = insertelement <1 x i32> poison, i32 %861, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %865, i1 true) #5, !dbg !25 + %866 = insertelement <1 x i32> poison, i32 %862, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %866, i1 true) #5, !dbg !25 + %867 = insertelement <1 x i32> poison, i32 %863, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %867, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %868 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %869 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %868, i32 1, i32 31), !dbg !25 + %870 = add i32 %869, %868, !dbg !28 + %871 = insertelement <1 x i32> poison, i32 %870, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %871, i1 %75) #5, !dbg !25 + %872 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %872, i32 1, i32 31), !dbg !25 + %874 = add i32 %873, %872, !dbg !28 + %875 = insertelement <1 x i32> poison, i32 %874, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %875, i1 %75) #5, !dbg !25 + %876 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 1, i32 31), !dbg !25 + %878 = add i32 %877, %876, !dbg !28 + %879 = insertelement <1 x i32> poison, i32 %878, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %879, i1 %75) #5, !dbg !25 + %880 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %881 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %880, i32 1, i32 31), !dbg !25 + %882 = add i32 %881, %880, !dbg !28 + %883 = insertelement <1 x i32> poison, i32 %882, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %883, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %884 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %885 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %886 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %887 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %888 = extractelement <2 x i32> %826, i64 1, !dbg !31 + %889 = mul nuw nsw i32 %888, %50, !dbg !30 + %890 = extractelement <2 x i32> %826, i64 0, !dbg !31 + %891 = mul nuw nsw i32 %890, %50, !dbg !30 + %892 = extractelement <2 x i32> %827, i64 1, !dbg !31 + %893 = mul nuw nsw i32 %892, %50, !dbg !30 + %894 = extractelement <2 x i32> %827, i64 0, !dbg !31 + %895 = mul nuw nsw i32 %894, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %896 = insertelement <1 x i32> poison, i32 %889, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %896, i1 true) #5, !dbg !25 + %897 = insertelement <1 x i32> poison, i32 %891, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %897, i1 true) #5, !dbg !25 + %898 = insertelement <1 x i32> poison, i32 %893, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %898, i1 true) #5, !dbg !25 + %899 = insertelement <1 x i32> poison, i32 %895, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %899, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %900 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %901 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %900, i32 1, i32 31), !dbg !25 + %902 = add i32 %901, %900, !dbg !28 + %903 = insertelement <1 x i32> poison, i32 %902, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %903, i1 %75) #5, !dbg !25 + %904 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %905 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %904, i32 1, i32 31), !dbg !25 + %906 = add i32 %905, %904, !dbg !28 + %907 = insertelement <1 x i32> poison, i32 %906, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %907, i1 %75) #5, !dbg !25 + %908 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %909 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %908, i32 1, i32 31), !dbg !25 + %910 = add i32 %909, %908, !dbg !28 + %911 = insertelement <1 x i32> poison, i32 %910, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %911, i1 %75) #5, !dbg !25 + %912 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %913 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %912, i32 1, i32 31), !dbg !25 + %914 = add i32 %913, %912, !dbg !28 + %915 = insertelement <1 x i32> poison, i32 %914, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %915, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %916 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %917 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %918 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %919 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %920 = mul nuw nsw i32 %888, %.lobit1, !dbg !31 + %921 = mul nuw nsw i32 %890, %.lobit1, !dbg !31 + %922 = mul nuw nsw i32 %892, %.lobit1, !dbg !31 + %923 = mul nuw nsw i32 %894, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %924 = insertelement <1 x i32> poison, i32 %920, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %924, i1 true) #5, !dbg !25 + %925 = insertelement <1 x i32> poison, i32 %921, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %925, i1 true) #5, !dbg !25 + %926 = insertelement <1 x i32> poison, i32 %922, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %926, i1 true) #5, !dbg !25 + %927 = insertelement <1 x i32> poison, i32 %923, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %927, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %928 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %929 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %928, i32 1, i32 31), !dbg !25 + %930 = add i32 %929, %928, !dbg !28 + %931 = insertelement <1 x i32> poison, i32 %930, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %931, i1 %75) #5, !dbg !25 + %932 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %933 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %932, i32 1, i32 31), !dbg !25 + %934 = add i32 %933, %932, !dbg !28 + %935 = insertelement <1 x i32> poison, i32 %934, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %935, i1 %75) #5, !dbg !25 + %936 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %937 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %936, i32 1, i32 31), !dbg !25 + %938 = add i32 %937, %936, !dbg !28 + %939 = insertelement <1 x i32> poison, i32 %938, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %939, i1 %75) #5, !dbg !25 + %940 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %941 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %940, i32 1, i32 31), !dbg !25 + %942 = add i32 %941, %940, !dbg !28 + %943 = insertelement <1 x i32> poison, i32 %942, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %943, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %944 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %945 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %946 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %947 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %948 = insertelement <2 x i32> poison, i32 %857, i64 0, !dbg !36 + %949 = insertelement <2 x i32> %948, i32 %856, i64 1, !dbg !36 + %950 = insertelement <2 x i32> poison, i32 %885, i64 0, !dbg !36 + %951 = insertelement <2 x i32> %950, i32 %884, i64 1, !dbg !36 + %952 = icmp slt <2 x i32> %949, %951, !dbg !36 + %953 = insertelement <2 x i32> poison, i32 %859, i64 0, !dbg !36 + %954 = insertelement <2 x i32> %953, i32 %858, i64 1, !dbg !36 + %955 = insertelement <2 x i32> poison, i32 %887, i64 0, !dbg !36 + %956 = insertelement <2 x i32> %955, i32 %886, i64 1, !dbg !36 + %957 = icmp sge <2 x i32> %954, %956, !dbg !36 + %958 = icmp eq <2 x i32> %949, %951, !dbg !37 + %959 = icmp ne <2 x i32> %954, %956, !dbg !37 + %960 = insertelement <2 x i32> poison, i32 %917, i64 0, !dbg !38 + %961 = insertelement <2 x i32> %960, i32 %916, i64 1, !dbg !38 + %962 = insertelement <2 x i32> poison, i32 %945, i64 0, !dbg !38 + %963 = insertelement <2 x i32> %962, i32 %944, i64 1, !dbg !38 + %964 = icmp sgt <2 x i32> %961, %963, !dbg !38 + %965 = insertelement <2 x i32> poison, i32 %919, i64 0, !dbg !38 + %966 = insertelement <2 x i32> %965, i32 %918, i64 1, !dbg !38 + %967 = insertelement <2 x i32> poison, i32 %947, i64 0, !dbg !38 + %968 = insertelement <2 x i32> %967, i32 %946, i64 1, !dbg !38 + %969 = icmp sle <2 x i32> %966, %968, !dbg !38 + %970 = and <2 x i1> %958, %964, !dbg !39 + %971 = or <2 x i1> %959, %969, !dbg !40 + %972 = or <2 x i1> %952, %970, !dbg !40 + %973 = and <2 x i1> %957, %971, !dbg !41 + %974 = xor <2 x i32> %951, %949, !dbg !33 + %975 = xor <2 x i32> %956, %954, !dbg !33 + %976 = select <2 x i1> %972, <2 x i32> %974, <2 x i32> zeroinitializer, !dbg !34 + %977 = select <2 x i1> %973, <2 x i32> %975, <2 x i32> zeroinitializer, !dbg !34 + %978 = xor <2 x i32> %976, %820, !dbg !35 + %979 = xor <2 x i32> %977, %821, !dbg !35 + %980 = xor <2 x i32> %963, %961, !dbg !42 + %981 = xor <2 x i32> %968, %966, !dbg !42 + %982 = select <2 x i1> %972, <2 x i32> %980, <2 x i32> zeroinitializer, !dbg !43 + %983 = select <2 x i1> %973, <2 x i32> %981, <2 x i32> zeroinitializer, !dbg !43 + %984 = xor <2 x i32> %982, %826, !dbg !44 + %985 = xor <2 x i32> %983, %827, !dbg !44 + %986 = icmp slt <2 x i32> %978, %979, !dbg !36 + %987 = icmp eq <2 x i32> %978, %979, !dbg !37 + %988 = icmp sgt <2 x i32> %984, %985, !dbg !38 + %989 = and <2 x i1> %987, %988, !dbg !39 + %990 = or <2 x i1> %986, %989, !dbg !40 + %991 = xor <2 x i32> %979, %978, !dbg !33 + %992 = select <2 x i1> %990, <2 x i32> %991, <2 x i32> zeroinitializer, !dbg !34 + %foldExtExtBinop76 = xor <2 x i32> %992, %978, !dbg !35 + %993 = extractelement <2 x i32> %foldExtExtBinop76, i64 1, !dbg !35 + %foldExtExtBinop78 = xor <2 x i32> %992, %978, !dbg !35 + %994 = extractelement <2 x i32> %foldExtExtBinop78, i64 0, !dbg !35 + %995 = xor <2 x i32> %992, %979, !dbg !35 + %996 = extractelement <2 x i32> %995, i64 0, !dbg !36 + %997 = extractelement <2 x i32> %995, i64 1, !dbg !36 + %998 = xor i32 %994, %993, !dbg !33 + %999 = xor i32 %996, %997, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1000 = xor <2 x i32> %985, %984, !dbg !42 + %1001 = select <2 x i1> %990, <2 x i32> %1000, <2 x i32> zeroinitializer, !dbg !43 + %1002 = shufflevector <2 x i32> %1001, <2 x i32> poison, <4 x i32> , !dbg !44 + %1003 = shufflevector <2 x i32> %984, <2 x i32> %985, <4 x i32> , !dbg !44 + %1004 = xor <4 x i32> %1002, %1003, !dbg !44 + %1005 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> , !dbg !44 + %1006 = shufflevector <2 x i32> %984, <2 x i32> %985, <2 x i32> , !dbg !44 + %1007 = xor <2 x i32> %1005, %1006, !dbg !44 + %1008 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1009 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %1010 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1011 = xor <2 x i32> %1009, %1010, !dbg !44 + %1012 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> , !dbg !44 + %1013 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1014 = xor <2 x i32> %1012, %1013, !dbg !44 + %1015 = shufflevector <2 x i32> %995, <2 x i32> %foldExtExtBinop76, <2 x i32> , !dbg !36 + %1016 = shufflevector <2 x i32> %995, <2 x i32> %foldExtExtBinop78, <2 x i32> , !dbg !36 + %1017 = icmp slt <2 x i32> %1015, %1016, !dbg !36 + %1018 = icmp eq <2 x i32> %1015, %1016, !dbg !37 + %1019 = icmp sgt <2 x i32> %1014, %1011, !dbg !38 + %1020 = and <2 x i1> %1018, %1019, !dbg !39 + %1021 = or <2 x i1> %1017, %1020, !dbg !40 + %1022 = extractelement <2 x i1> %1021, i64 1, !dbg !34 + %1023 = select i1 %1022, i32 %998, i32 0, !dbg !34 + %1024 = extractelement <2 x i1> %1021, i64 0, !dbg !34 + %1025 = select i1 %1024, i32 %999, i32 0, !dbg !34 + %1026 = xor i32 %1023, %993, !dbg !35 + %1027 = xor i32 %1023, %994, !dbg !35 + %1028 = xor i32 %1025, %997, !dbg !35 + %1029 = xor i32 %1025, %996, !dbg !35 + %1030 = xor <2 x i32> %1008, %1007, !dbg !42 + %1031 = xor <2 x i32> %1030, %1001, !dbg !42 + %1032 = select <2 x i1> %1021, <2 x i32> %1031, <2 x i32> zeroinitializer, !dbg !43 + %1033 = shufflevector <2 x i32> %1032, <2 x i32> poison, <4 x i32> , !dbg !43 + %1034 = xor <4 x i32> %1033, %1004, !dbg !44 + %1035 = mul nuw nsw i32 %1026, %51, !dbg !24 + %1036 = mul nuw nsw i32 %1027, %51, !dbg !24 + %1037 = mul nuw nsw i32 %1028, %51, !dbg !24 + %1038 = mul nuw nsw i32 %1029, %51, !dbg !24 + %1039 = insertelement <1 x i32> poison, i32 %1035, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1039, i1 true) #5, !dbg !25 + %1040 = insertelement <1 x i32> poison, i32 %1036, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1040, i1 true) #5, !dbg !25 + %1041 = insertelement <1 x i32> poison, i32 %1037, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1041, i1 true) #5, !dbg !25 + %1042 = insertelement <1 x i32> poison, i32 %1038, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1042, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1043 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1044 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1043, i32 1, i32 31), !dbg !25 + %1045 = add i32 %1044, %1043, !dbg !28 + %1046 = insertelement <1 x i32> poison, i32 %1045, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1046, i1 %75) #5, !dbg !25 + %1047 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1048 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1047, i32 1, i32 31), !dbg !25 + %1049 = add i32 %1048, %1047, !dbg !28 + %1050 = insertelement <1 x i32> poison, i32 %1049, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1050, i1 %75) #5, !dbg !25 + %1051 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1052 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1051, i32 1, i32 31), !dbg !25 + %1053 = add i32 %1052, %1051, !dbg !28 + %1054 = insertelement <1 x i32> poison, i32 %1053, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1054, i1 %75) #5, !dbg !25 + %1055 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1056 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1055, i32 1, i32 31), !dbg !25 + %1057 = add i32 %1056, %1055, !dbg !28 + %1058 = insertelement <1 x i32> poison, i32 %1057, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1058, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1059 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1060 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1061 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1062 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1063 = mul nuw nsw i32 %1026, %.lobit, !dbg !29 + %1064 = mul nuw nsw i32 %1027, %.lobit, !dbg !29 + %1065 = mul nuw nsw i32 %1028, %.lobit, !dbg !29 + %1066 = mul nuw nsw i32 %1029, %.lobit, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1067 = insertelement <1 x i32> poison, i32 %1063, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1067, i1 true) #5, !dbg !25 + %1068 = insertelement <1 x i32> poison, i32 %1064, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1068, i1 true) #5, !dbg !25 + %1069 = insertelement <1 x i32> poison, i32 %1065, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1069, i1 true) #5, !dbg !25 + %1070 = insertelement <1 x i32> poison, i32 %1066, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1070, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1071 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1072 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1071, i32 1, i32 31), !dbg !25 + %1073 = add i32 %1072, %1071, !dbg !28 + %1074 = insertelement <1 x i32> poison, i32 %1073, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1074, i1 %75) #5, !dbg !25 + %1075 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1076 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1075, i32 1, i32 31), !dbg !25 + %1077 = add i32 %1076, %1075, !dbg !28 + %1078 = insertelement <1 x i32> poison, i32 %1077, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1078, i1 %75) #5, !dbg !25 + %1079 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1080 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1079, i32 1, i32 31), !dbg !25 + %1081 = add i32 %1080, %1079, !dbg !28 + %1082 = insertelement <1 x i32> poison, i32 %1081, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1082, i1 %75) #5, !dbg !25 + %1083 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1084 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1083, i32 1, i32 31), !dbg !25 + %1085 = add i32 %1084, %1083, !dbg !28 + %1086 = insertelement <1 x i32> poison, i32 %1085, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1086, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1087 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1088 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1089 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1090 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1091 = extractelement <4 x i32> %1034, i64 0, !dbg !31 + %1092 = mul nuw nsw i32 %1091, %51, !dbg !30 + %1093 = extractelement <4 x i32> %1034, i64 1, !dbg !31 + %1094 = mul nuw nsw i32 %1093, %51, !dbg !30 + %1095 = extractelement <4 x i32> %1034, i64 2, !dbg !31 + %1096 = mul nuw nsw i32 %1095, %51, !dbg !30 + %1097 = extractelement <4 x i32> %1034, i64 3, !dbg !31 + %1098 = mul nuw nsw i32 %1097, %51, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1099 = insertelement <1 x i32> poison, i32 %1092, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1099, i1 true) #5, !dbg !25 + %1100 = insertelement <1 x i32> poison, i32 %1094, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1100, i1 true) #5, !dbg !25 + %1101 = insertelement <1 x i32> poison, i32 %1096, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1101, i1 true) #5, !dbg !25 + %1102 = insertelement <1 x i32> poison, i32 %1098, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1102, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1103 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1103, i32 1, i32 31), !dbg !25 + %1105 = add i32 %1104, %1103, !dbg !28 + %1106 = insertelement <1 x i32> poison, i32 %1105, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1106, i1 %75) #5, !dbg !25 + %1107 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1107, i32 1, i32 31), !dbg !25 + %1109 = add i32 %1108, %1107, !dbg !28 + %1110 = insertelement <1 x i32> poison, i32 %1109, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1110, i1 %75) #5, !dbg !25 + %1111 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1111, i32 1, i32 31), !dbg !25 + %1113 = add i32 %1112, %1111, !dbg !28 + %1114 = insertelement <1 x i32> poison, i32 %1113, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1114, i1 %75) #5, !dbg !25 + %1115 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1115, i32 1, i32 31), !dbg !25 + %1117 = add i32 %1116, %1115, !dbg !28 + %1118 = insertelement <1 x i32> poison, i32 %1117, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1118, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1119 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1120 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1121 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1122 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1123 = mul nuw nsw i32 %1091, %.lobit, !dbg !31 + %1124 = mul nuw nsw i32 %1093, %.lobit, !dbg !31 + %1125 = mul nuw nsw i32 %1095, %.lobit, !dbg !31 + %1126 = mul nuw nsw i32 %1097, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1127 = insertelement <1 x i32> poison, i32 %1123, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1127, i1 true) #5, !dbg !25 + %1128 = insertelement <1 x i32> poison, i32 %1124, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1128, i1 true) #5, !dbg !25 + %1129 = insertelement <1 x i32> poison, i32 %1125, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1129, i1 true) #5, !dbg !25 + %1130 = insertelement <1 x i32> poison, i32 %1126, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1130, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1131 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1131, i32 1, i32 31), !dbg !25 + %1133 = add i32 %1132, %1131, !dbg !28 + %1134 = insertelement <1 x i32> poison, i32 %1133, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1134, i1 %75) #5, !dbg !25 + %1135 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1135, i32 1, i32 31), !dbg !25 + %1137 = add i32 %1136, %1135, !dbg !28 + %1138 = insertelement <1 x i32> poison, i32 %1137, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1138, i1 %75) #5, !dbg !25 + %1139 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1139, i32 1, i32 31), !dbg !25 + %1141 = add i32 %1140, %1139, !dbg !28 + %1142 = insertelement <1 x i32> poison, i32 %1141, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1142, i1 %75) #5, !dbg !25 + %1143 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1143, i32 1, i32 31), !dbg !25 + %1145 = add i32 %1144, %1143, !dbg !28 + %1146 = insertelement <1 x i32> poison, i32 %1145, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1146, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1147 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1148 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1149 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1150 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1151 = insertelement <4 x i32> poison, i32 %1059, i64 0, !dbg !36 + %1152 = insertelement <4 x i32> %1151, i32 %1060, i64 1, !dbg !36 + %1153 = insertelement <4 x i32> %1152, i32 %1061, i64 2, !dbg !36 + %1154 = insertelement <4 x i32> %1153, i32 %1062, i64 3, !dbg !36 + %1155 = insertelement <4 x i32> poison, i32 %1087, i64 0, !dbg !36 + %1156 = insertelement <4 x i32> %1155, i32 %1088, i64 1, !dbg !36 + %1157 = insertelement <4 x i32> %1156, i32 %1089, i64 2, !dbg !36 + %1158 = insertelement <4 x i32> %1157, i32 %1090, i64 3, !dbg !36 + %1159 = icmp slt <4 x i32> %1154, %1158, !dbg !36 + %1160 = icmp eq <4 x i32> %1154, %1158, !dbg !37 + %1161 = insertelement <4 x i32> poison, i32 %1119, i64 0, !dbg !38 + %1162 = insertelement <4 x i32> %1161, i32 %1120, i64 1, !dbg !38 + %1163 = insertelement <4 x i32> %1162, i32 %1121, i64 2, !dbg !38 + %1164 = insertelement <4 x i32> %1163, i32 %1122, i64 3, !dbg !38 + %1165 = insertelement <4 x i32> poison, i32 %1147, i64 0, !dbg !38 + %1166 = insertelement <4 x i32> %1165, i32 %1148, i64 1, !dbg !38 + %1167 = insertelement <4 x i32> %1166, i32 %1149, i64 2, !dbg !38 + %1168 = insertelement <4 x i32> %1167, i32 %1150, i64 3, !dbg !38 + %1169 = icmp sgt <4 x i32> %1164, %1168, !dbg !38 + %1170 = and <4 x i1> %1160, %1169, !dbg !39 + %1171 = or <4 x i1> %1159, %1170, !dbg !40 + %1172 = xor i32 %1087, %1059, !dbg !33 + %1173 = xor i32 %1088, %1060, !dbg !33 + %1174 = xor i32 %1089, %1061, !dbg !33 + %1175 = xor i32 %1090, %1062, !dbg !33 + %1176 = extractelement <4 x i1> %1171, i64 0, !dbg !34 + %1177 = select i1 %1176, i32 %1172, i32 0, !dbg !34 + %1178 = extractelement <4 x i1> %1171, i64 1, !dbg !34 + %1179 = select i1 %1178, i32 %1173, i32 0, !dbg !34 + %1180 = extractelement <4 x i1> %1171, i64 2, !dbg !34 + %1181 = select i1 %1180, i32 %1174, i32 0, !dbg !34 + %1182 = extractelement <4 x i1> %1171, i64 3, !dbg !34 + %1183 = select i1 %1182, i32 %1175, i32 0, !dbg !34 + %1184 = xor i32 %1177, %1026, !dbg !35 + %1185 = xor i32 %1179, %1027, !dbg !35 + %1186 = xor i32 %1181, %1028, !dbg !35 + %1187 = xor i32 %1183, %1029, !dbg !35 + %1188 = xor <4 x i32> %1168, %1164, !dbg !42 + %1189 = select <4 x i1> %1171, <4 x i32> %1188, <4 x i32> zeroinitializer, !dbg !43 + %1190 = xor <4 x i32> %1189, %1034, !dbg !44 + %1191 = mul nuw nsw i32 %1184, %50, !dbg !24 + %1192 = mul nuw nsw i32 %1185, %50, !dbg !24 + %1193 = mul nuw nsw i32 %1186, %50, !dbg !24 + %1194 = mul nuw nsw i32 %1187, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1195 = insertelement <1 x i32> poison, i32 %1191, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1195, i1 true) #5, !dbg !25 + %1196 = insertelement <1 x i32> poison, i32 %1192, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1196, i1 true) #5, !dbg !25 + %1197 = insertelement <1 x i32> poison, i32 %1193, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1197, i1 true) #5, !dbg !25 + %1198 = insertelement <1 x i32> poison, i32 %1194, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1198, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1199 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1199, i32 1, i32 31), !dbg !25 + %1201 = add i32 %1200, %1199, !dbg !28 + %1202 = insertelement <1 x i32> poison, i32 %1201, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1202, i1 %75) #5, !dbg !25 + %1203 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1203, i32 1, i32 31), !dbg !25 + %1205 = add i32 %1204, %1203, !dbg !28 + %1206 = insertelement <1 x i32> poison, i32 %1205, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1206, i1 %75) #5, !dbg !25 + %1207 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1207, i32 1, i32 31), !dbg !25 + %1209 = add i32 %1208, %1207, !dbg !28 + %1210 = insertelement <1 x i32> poison, i32 %1209, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1210, i1 %75) #5, !dbg !25 + %1211 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1211, i32 1, i32 31), !dbg !25 + %1213 = add i32 %1212, %1211, !dbg !28 + %1214 = insertelement <1 x i32> poison, i32 %1213, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1214, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1215 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1216 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1217 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1218 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1219 = mul nuw nsw i32 %1184, %.lobit1, !dbg !29 + %1220 = mul nuw nsw i32 %1185, %.lobit1, !dbg !29 + %1221 = mul nuw nsw i32 %1186, %.lobit1, !dbg !29 + %1222 = mul nuw nsw i32 %1187, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1223 = insertelement <1 x i32> poison, i32 %1219, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1223, i1 true) #5, !dbg !25 + %1224 = insertelement <1 x i32> poison, i32 %1220, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1224, i1 true) #5, !dbg !25 + %1225 = insertelement <1 x i32> poison, i32 %1221, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1225, i1 true) #5, !dbg !25 + %1226 = insertelement <1 x i32> poison, i32 %1222, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1226, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1227 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1227, i32 1, i32 31), !dbg !25 + %1229 = add i32 %1228, %1227, !dbg !28 + %1230 = insertelement <1 x i32> poison, i32 %1229, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1230, i1 %75) #5, !dbg !25 + %1231 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1231, i32 1, i32 31), !dbg !25 + %1233 = add i32 %1232, %1231, !dbg !28 + %1234 = insertelement <1 x i32> poison, i32 %1233, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1234, i1 %75) #5, !dbg !25 + %1235 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1235, i32 1, i32 31), !dbg !25 + %1237 = add i32 %1236, %1235, !dbg !28 + %1238 = insertelement <1 x i32> poison, i32 %1237, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1238, i1 %75) #5, !dbg !25 + %1239 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1239, i32 1, i32 31), !dbg !25 + %1241 = add i32 %1240, %1239, !dbg !28 + %1242 = insertelement <1 x i32> poison, i32 %1241, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1242, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1243 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1244 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1245 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1246 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1247 = extractelement <4 x i32> %1190, i64 0, !dbg !31 + %1248 = mul nuw nsw i32 %1247, %50, !dbg !30 + %1249 = extractelement <4 x i32> %1190, i64 1, !dbg !31 + %1250 = mul nuw nsw i32 %1249, %50, !dbg !30 + %1251 = extractelement <4 x i32> %1190, i64 2, !dbg !31 + %1252 = mul nuw nsw i32 %1251, %50, !dbg !30 + %1253 = extractelement <4 x i32> %1190, i64 3, !dbg !31 + %1254 = mul nuw nsw i32 %1253, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1255 = insertelement <1 x i32> poison, i32 %1248, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1255, i1 true) #5, !dbg !25 + %1256 = insertelement <1 x i32> poison, i32 %1250, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1256, i1 true) #5, !dbg !25 + %1257 = insertelement <1 x i32> poison, i32 %1252, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1257, i1 true) #5, !dbg !25 + %1258 = insertelement <1 x i32> poison, i32 %1254, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1258, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1259, i32 1, i32 31), !dbg !25 + %1261 = add i32 %1260, %1259, !dbg !28 + %1262 = insertelement <1 x i32> poison, i32 %1261, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1262, i1 %75) #5, !dbg !25 + %1263 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1263, i32 1, i32 31), !dbg !25 + %1265 = add i32 %1264, %1263, !dbg !28 + %1266 = insertelement <1 x i32> poison, i32 %1265, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1266, i1 %75) #5, !dbg !25 + %1267 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1268 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1267, i32 1, i32 31), !dbg !25 + %1269 = add i32 %1268, %1267, !dbg !28 + %1270 = insertelement <1 x i32> poison, i32 %1269, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1270, i1 %75) #5, !dbg !25 + %1271 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1271, i32 1, i32 31), !dbg !25 + %1273 = add i32 %1272, %1271, !dbg !28 + %1274 = insertelement <1 x i32> poison, i32 %1273, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1274, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1275 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1276 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1277 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1278 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1279 = mul nuw nsw i32 %1247, %.lobit1, !dbg !31 + %1280 = mul nuw nsw i32 %1249, %.lobit1, !dbg !31 + %1281 = mul nuw nsw i32 %1251, %.lobit1, !dbg !31 + %1282 = mul nuw nsw i32 %1253, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1283 = insertelement <1 x i32> poison, i32 %1279, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1283, i1 true) #5, !dbg !25 + %1284 = insertelement <1 x i32> poison, i32 %1280, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1284, i1 true) #5, !dbg !25 + %1285 = insertelement <1 x i32> poison, i32 %1281, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1285, i1 true) #5, !dbg !25 + %1286 = insertelement <1 x i32> poison, i32 %1282, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1286, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1287 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1287, i32 1, i32 31), !dbg !25 + %1289 = add i32 %1288, %1287, !dbg !28 + %1290 = insertelement <1 x i32> poison, i32 %1289, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1290, i1 %75) #5, !dbg !25 + %1291 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1291, i32 1, i32 31), !dbg !25 + %1293 = add i32 %1292, %1291, !dbg !28 + %1294 = insertelement <1 x i32> poison, i32 %1293, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1294, i1 %75) #5, !dbg !25 + %1295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1295, i32 1, i32 31), !dbg !25 + %1297 = add i32 %1296, %1295, !dbg !28 + %1298 = insertelement <1 x i32> poison, i32 %1297, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1298, i1 %75) #5, !dbg !25 + %1299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1299, i32 1, i32 31), !dbg !25 + %1301 = add i32 %1300, %1299, !dbg !28 + %1302 = insertelement <1 x i32> poison, i32 %1301, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1302, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1303 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1304 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1305 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1306 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1307 = insertelement <4 x i32> poison, i32 %1215, i64 0, !dbg !36 + %1308 = insertelement <4 x i32> %1307, i32 %1216, i64 1, !dbg !36 + %1309 = insertelement <4 x i32> %1308, i32 %1217, i64 2, !dbg !36 + %1310 = insertelement <4 x i32> %1309, i32 %1218, i64 3, !dbg !36 + %1311 = insertelement <4 x i32> poison, i32 %1243, i64 0, !dbg !36 + %1312 = insertelement <4 x i32> %1311, i32 %1244, i64 1, !dbg !36 + %1313 = insertelement <4 x i32> %1312, i32 %1245, i64 2, !dbg !36 + %1314 = insertelement <4 x i32> %1313, i32 %1246, i64 3, !dbg !36 + %1315 = icmp slt <4 x i32> %1310, %1314, !dbg !36 + %1316 = icmp eq <4 x i32> %1310, %1314, !dbg !37 + %1317 = insertelement <4 x i32> poison, i32 %1275, i64 0, !dbg !38 + %1318 = insertelement <4 x i32> %1317, i32 %1276, i64 1, !dbg !38 + %1319 = insertelement <4 x i32> %1318, i32 %1277, i64 2, !dbg !38 + %1320 = insertelement <4 x i32> %1319, i32 %1278, i64 3, !dbg !38 + %1321 = insertelement <4 x i32> poison, i32 %1303, i64 0, !dbg !38 + %1322 = insertelement <4 x i32> %1321, i32 %1304, i64 1, !dbg !38 + %1323 = insertelement <4 x i32> %1322, i32 %1305, i64 2, !dbg !38 + %1324 = insertelement <4 x i32> %1323, i32 %1306, i64 3, !dbg !38 + %1325 = icmp sgt <4 x i32> %1320, %1324, !dbg !38 + %1326 = and <4 x i1> %1316, %1325, !dbg !39 + %1327 = or <4 x i1> %1315, %1326, !dbg !40 + %1328 = xor <4 x i32> %1324, %1320, !dbg !42 + %1329 = select <4 x i1> %1327, <4 x i32> %1328, <4 x i32> zeroinitializer, !dbg !43 + %1330 = xor <4 x i32> %1329, %1190, !dbg !44 + %1331 = insertelement <4 x i1> poison, i1 %17, i64 0, !dbg !45 + %1332 = shufflevector <4 x i1> %1331, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !45 + %1333 = insertelement <4 x i32> poison, i32 %46, i64 0, !dbg !45 + %1334 = insertelement <4 x i32> %1333, i32 %45, i64 1, !dbg !45 + %1335 = insertelement <4 x i32> %1334, i32 %47, i64 2, !dbg !45 + %1336 = insertelement <4 x i32> %1335, i32 %48, i64 3, !dbg !45 + %1337 = sext <4 x i32> %1336 to <4 x i64>, !dbg !45 + %1338 = select <4 x i1> %1332, <4 x i64> %1337, <4 x i64> zeroinitializer, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1339 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1338), !dbg !48 + %1340 = and i32 %11, 3, !dbg !46 + %1341 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %181, !dbg !46 + %1342 = getelementptr i64, ptr addrspace(3) %1341, i32 %1340, !dbg !46 + %1343 = insertelement <1 x i64> poison, i64 %1339, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1342, <1 x i64> %1343, i1 true) #5, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1344 = icmp samesign ult i32 %10, 128, !dbg !46 + %1345 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46 + %1346 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %1345, i1 %1344) #5, !dbg !46 + %extelt.offset = lshr i64 %1346, 32, !dbg !46 + %1347 = trunc nuw i64 %extelt.offset to i32, !dbg !46 + %1348 = trunc i64 %1346 to i32, !dbg !46 + %1349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1348, i32 2, i32 31), !dbg !46 + %1350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1347, i32 2, i32 31), !dbg !46 + %1351 = insertelement <2 x i32> poison, i32 %1349, i64 0, !dbg !46 + %1352 = insertelement <2 x i32> %1351, i32 %1350, i64 1, !dbg !46 + %1353 = bitcast <2 x i32> %1352 to i64, !dbg !46 + %1354 = add i64 %1346, %1353, !dbg !48 + %extelt.offset62 = lshr i64 %1354, 32, !dbg !46 + %1355 = trunc nuw i64 %extelt.offset62 to i32, !dbg !46 + %1356 = trunc i64 %1354 to i32, !dbg !46 + %1357 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1356, i32 1, i32 31), !dbg !46 + %1358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1355, i32 1, i32 31), !dbg !46 + %1359 = insertelement <2 x i32> poison, i32 %1357, i64 0, !dbg !46 + %1360 = insertelement <2 x i32> %1359, i32 %1358, i64 1, !dbg !46 + %1361 = bitcast <2 x i32> %1360 to i64, !dbg !46 + %1362 = add i64 %1354, %1361, !dbg !48 + %1363 = and i32 %10, 899, !dbg !46 + %1364 = icmp eq i32 %1363, 0, !dbg !46 + %1365 = insertelement <1 x i64> poison, i64 %1362, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1345, <1 x i64> %1365, i1 %1364) #5, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1366 = load i64, ptr addrspace(3) %1341, align 16, !dbg !46 + %1367 = trunc i64 %1366 to i32, !dbg !49 + %1368 = shl i32 %16, 4, !dbg !50 + %1369 = or disjoint i32 %1368, %25, !dbg !51 + %1370 = sext i32 %1369 to i64, !dbg !52 + %1371 = getelementptr i32, ptr addrspace(1) %1, i64 %1370, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1372 = shl nuw nsw i32 %10, 4, !dbg !53 + %1373 = and i32 %1372, 2032, !dbg !53 + %1374 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1373, !dbg !53 + store <4 x i32> %1330, ptr addrspace(3) %1374, align 16, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1375 = shl nuw nsw i32 %10, 6, !dbg !53 + %1376 = and i32 %1375, 1536, !dbg !53 + %1377 = and i32 %1372, 112, !dbg !53 + %1378 = shl nuw nsw i32 %19, 2, !dbg !53 + %1379 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1376, !dbg !53 + %1380 = getelementptr inbounds nuw i8, ptr addrspace(3) %1379, i32 %1377, !dbg !53 + %1381 = getelementptr inbounds nuw i8, ptr addrspace(3) %1380, i32 %1378, !dbg !53 + %1382 = ptrtoint ptr addrspace(3) %1381 to i32, !dbg !53 + %1383 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1382) #5, !dbg !53 + %1384 = extractvalue { i32, i32, i32, i32 } %1383, 0, !dbg !53 + %1385 = extractvalue { i32, i32, i32, i32 } %1383, 1, !dbg !53 + %1386 = extractvalue { i32, i32, i32, i32 } %1383, 2, !dbg !53 + %1387 = extractvalue { i32, i32, i32, i32 } %1383, 3, !dbg !53 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1384, i32 %1385, i32 %1386, i32 %1387, ptr addrspace(1) %1371, i1 %18) #5, !dbg !53 + %1388 = sext i32 %15 to i64, !dbg !54 + %1389 = getelementptr i32, ptr addrspace(1) %2, i64 %1388, !dbg !54 + %1390 = icmp eq i32 %19, 0, !dbg !55 + %1391 = and i1 %1390, %17, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1367, ptr addrspace(1) %1389, i1 %1391) #5, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 33, scope: !4) +!9 = !DILocation(line: 25, column: 44, scope: !4) +!10 = !DILocation(line: 25, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 38, scope: !4) +!13 = !DILocation(line: 34, column: 19, scope: !4) +!14 = !DILocation(line: 36, column: 38, scope: !4) +!15 = !DILocation(line: 36, column: 35, scope: !4) +!16 = !DILocation(line: 36, column: 45, scope: !4) +!17 = !DILocation(line: 36, column: 30, scope: !4) +!18 = !DILocation(line: 36, column: 54, scope: !4) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 67, scope: !4) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!45 = !DILocation(line: 44, column: 34, scope: !4) +!46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47) +!47 = !DILocation(line: 45, column: 26, scope: !4) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47) +!49 = !DILocation(line: 48, column: 21, scope: !4) +!50 = !DILocation(line: 49, column: 35, scope: !4) +!51 = !DILocation(line: 49, column: 32, scope: !4) +!52 = !DILocation(line: 49, column: 25, scope: !4) +!53 = !DILocation(line: 49, column: 47, scope: !4) +!54 = !DILocation(line: 50, column: 25, scope: !4) +!55 = !DILocation(line: 50, column: 37, scope: !4) +!56 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fd29b4b8fde1a4d88a514250bd2c516e62e013e3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,2813 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 128 +{ + .reg .pred %p<514>; + .reg .b32 %r<1198>; + .reg .b64 %rd<32>; + .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:28 + mov.u32 %r690, %ctaid.x; + .loc 1 24 33 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:33 + shl.b32 %r691, %r690, 5; + ld.param.b64 %rd12, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 25 44 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:44 + mov.u32 %r692, %tid.x; + and.b32 %r693, %r692, 31; + shr.u32 %r694, %r692, 2; + bfe.u32 %r695, %r692, 2, 5; + .loc 1 25 23 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:25:23 + or.b32 %r696, %r691, %r693; + or.b32 %r697, %r695, %r691; + .loc 1 26 21 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:26:21 + setp.lt.s32 %p1, %r696, 32; + setp.lt.s32 %p344, %r697, 32; + .loc 1 27 38 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:27:38 + and.b32 %r698, %r692, 96; + bfe.u32 %r699, %r692, 5, 2; + or.b32 %r700, %r699, 4; + or.b32 %r701, %r699, 8; + or.b32 %r702, %r699, 12; + shl.b32 %r703, %r692, 2; + and.b32 %r704, %r703, 12; + .loc 1 34 19 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:34:19 + bfe.s32 %r705, %r690, 26, 1; + shr.u32 %r706, %r705, 28; + add.s32 %r707, %r696, %r706; + .loc 1 36 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:35 + shl.b32 %r708, %r707, 4; + and.b32 %r709, %r708, -256; + add.s32 %r710, %r709, %r696; + .loc 1 36 45 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:45 + mad.lo.s32 %r711, %r699, 17, %r710; + add.s32 %r712, %r711, 68; + add.s32 %r713, %r711, 136; + add.s32 %r714, %r711, 204; + .loc 1 36 30 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:30 + mad.wide.s32 %rd1, %r711, 4, %rd10; + mad.wide.s32 %rd2, %r712, 4, %rd10; + mad.wide.s32 %rd3, %r713, 4, %rd10; + mad.wide.s32 %rd4, %r714, 4, %rd10; + .loc 1 36 54 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r3, 0x0; + @%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r4, 0x0; + @%p1 ld.global.b32 { %r4 }, [ %rd4 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shr.u32 %r715, %r692, 6; + bfe.u32 %r716, %r692, 6, 1; + bfe.u32 %r717, %r692, 5, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r718, %r717, 1; + xor.b32 %r719, %r716, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r6, %r1, %r718; + mul.lo.s32 %r8, %r2, %r718; + mul.lo.s32 %r10, %r3, %r718; + mul.lo.s32 %r12, %r4, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shl.b32 %r720, %r716, 3; + mov.b32 %r721, global_smem; + add.s32 %r722, %r721, %r720; + shl.b32 %r723, %r693, 6; + add.s32 %r724, %r722, %r723; + shl.b32 %r725, %r717, 2; + add.s32 %r5, %r724, %r725; + mov.pred %p5, -1; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + add.s32 %r7, %r5, 16; + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r8; + // end inline asm + add.s32 %r9, %r5, 32; + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r10; + // end inline asm + add.s32 %r11, %r5, 48; + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r12; + // end inline asm + bar.sync 0; + setp.lt.u32 %p9, %r692, 512; + add.s32 %r14, %r721, %r703; + // begin inline asm + @%p9 ld.shared.b32 %r13, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r726, %r13, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r16, %r726, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.b32 %r727, %r692, 513; + setp.eq.b32 %p10, %r727, 0; + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r16; + // end inline asm + add.s32 %r18, %r14, 512; + // begin inline asm + @%p9 ld.shared.b32 %r17, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r728, %r17, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r20, %r728, %r17; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r20; + // end inline asm + add.s32 %r22, %r14, 1024; + // begin inline asm + @%p9 ld.shared.b32 %r21, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r729, %r21, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r24, %r729, %r21; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r24; + // end inline asm + add.s32 %r26, %r14, 1536; + // begin inline asm + @%p9 ld.shared.b32 %r25, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r730, %r25, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r28, %r730, %r25; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r28; + // end inline asm + bar.sync 0; + ld.shared.b32 %r731, [%r724]; + ld.shared.b32 %r732, [%r724+16]; + ld.shared.b32 %r733, [%r724+32]; + ld.shared.b32 %r734, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r30, %r1, %r717; + mul.lo.s32 %r32, %r2, %r717; + mul.lo.s32 %r34, %r3, %r717; + mul.lo.s32 %r36, %r4, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r32; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r36; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r37, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r735, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r40, %r735, %r37; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r41, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r736, %r41, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r44, %r736, %r41; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r44; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r45, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r737, %r45, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r48, %r737, %r45; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r48; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r49, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r738, %r49, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r52, %r738, %r49; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r52; + // end inline asm + bar.sync 0; + ld.shared.b32 %r739, [%r724]; + ld.shared.b32 %r740, [%r724+16]; + ld.shared.b32 %r741, [%r724+32]; + ld.shared.b32 %r742, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r54, %r718, %r699; + shl.b32 %r743, %r718, 2; + or.b32 %r56, %r54, %r743; + add.s32 %r58, %r56, %r743; + add.s32 %r60, %r58, %r743; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r54; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r56; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r60; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r61, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r744, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r64, %r744, %r61; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r65, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r745, %r65, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r68, %r745, %r65; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r68; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r69, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r746, %r69, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r72, %r746, %r69; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r72; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r73, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r747, %r73, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r76, %r747, %r73; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r76; + // end inline asm + bar.sync 0; + ld.shared.b32 %r748, [%r724]; + ld.shared.b32 %r749, [%r724+16]; + ld.shared.b32 %r750, [%r724+32]; + ld.shared.b32 %r751, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r78, %r699, %r717; + or.b32 %r80, %r78, %r725; + add.s32 %r82, %r80, %r725; + add.s32 %r84, %r82, %r725; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r78; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r80; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r82; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r84; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r85, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r752, %r85, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r88, %r752, %r85; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r89, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r753, %r89, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r92, %r753, %r89; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r92; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r93, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r754, %r93, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r96, %r754, %r93; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r96; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r97, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r755, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r100, %r755, %r97; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r100; + // end inline asm + bar.sync 0; + ld.shared.b32 %r756, [%r724]; + ld.shared.b32 %r757, [%r724+16]; + ld.shared.b32 %r758, [%r724+32]; + ld.shared.b32 %r759, [%r724+48]; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.b32 %r760, %r715, 1; + setp.ne.b32 %p346, %r760, 0; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + shl.b32 %r761, %r717, 10; + shl.b32 %r762, %r693, 5; + or.b32 %r763, %r761, %r762; + add.s32 %r764, %r721, %r763; + shl.b32 %r765, %r716, 2; + add.s32 %r101, %r764, %r765; + add.s32 %r766, %r764, 8; + add.s32 %r103, %r766, %r765; + add.s32 %r767, %r764, 16; + add.s32 %r105, %r767, %r765; + add.s32 %r768, %r764, 24; + add.s32 %r107, %r768, %r765; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.ge.s32 %p347, %r731, %r739; + setp.ge.s32 %p348, %r732, %r740; + setp.ne.b32 %p349, %r732, %r740; + setp.ne.b32 %p350, %r731, %r739; + setp.le.s32 %p351, %r749, %r757; + setp.le.s32 %p352, %r748, %r756; + or.pred %p353, %p350, %p352; + or.pred %p354, %p349, %p351; + and.pred %p355, %p348, %p354; + and.pred %p356, %p347, %p353; + xor.pred %p357, %p356, %p346; + xor.pred %p358, %p355, %p346; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r769, %r756, %r748; + xor.b32 %r770, %r740, %r732; + xor.b32 %r771, %r757, %r749; + xor.b32 %r772, %r739, %r731; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r773, 0, %r770, %p358; + selp.b32 %r774, 0, %r769, %p357; + selp.b32 %r775, 0, %r772, %p357; + selp.b32 %r776, 0, %r771, %p358; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r777, %r774, %r699; + xor.b32 %r778, %r773, %r2; + xor.b32 %r779, %r776, %r700; + xor.b32 %r780, %r775, %r1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r102, %r780, %r719; + mul.lo.s32 %r104, %r778, %r719; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r102; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r104; + // end inline asm + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r126, %r780, %r716; + mul.lo.s32 %r128, %r778, %r716; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r150, %r777, %r719; + mul.lo.s32 %r152, %r779, %r719; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r174, %r777, %r716; + mul.lo.s32 %r176, %r779, %r716; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.ge.s32 %p359, %r733, %r741; + setp.ge.s32 %p360, %r734, %r742; + setp.ne.b32 %p361, %r734, %r742; + setp.ne.b32 %p362, %r733, %r741; + setp.le.s32 %p363, %r751, %r759; + setp.le.s32 %p364, %r750, %r758; + or.pred %p365, %p362, %p364; + or.pred %p366, %p361, %p363; + and.pred %p367, %p360, %p366; + and.pred %p368, %p359, %p365; + xor.pred %p369, %p368, %p346; + xor.pred %p370, %p367, %p346; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r781, %r758, %r750; + xor.b32 %r782, %r742, %r734; + xor.b32 %r783, %r759, %r751; + xor.b32 %r784, %r741, %r733; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r785, 0, %r782, %p370; + selp.b32 %r786, 0, %r781, %p369; + selp.b32 %r787, 0, %r784, %p369; + selp.b32 %r788, 0, %r783, %p370; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r789, %r786, %r701; + xor.b32 %r790, %r785, %r4; + xor.b32 %r791, %r788, %r702; + xor.b32 %r792, %r787, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r106, %r792, %r719; + mul.lo.s32 %r108, %r790, %r719; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r130, %r792, %r716; + mul.lo.s32 %r132, %r790, %r716; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r154, %r789, %r719; + mul.lo.s32 %r156, %r791, %r719; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r178, %r789, %r716; + mul.lo.s32 %r180, %r791, %r716; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r106; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r108; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r109, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r793, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r112, %r793, %r109; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r112; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r113, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r794, %r113, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r116, %r794, %r113; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r116; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r117, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r795, %r117, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r120, %r795, %r117; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r120; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r121, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r796, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r124, %r796, %r121; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r124; + // end inline asm + bar.sync 0; + ld.shared.b32 %r797, [%r764]; + ld.shared.b32 %r798, [%r764+8]; + ld.shared.b32 %r799, [%r764+16]; + ld.shared.b32 %r800, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r126; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r128; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r130; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r132; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r133, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r801, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r136, %r801, %r133; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r136; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r137, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r802, %r137, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r140, %r802, %r137; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r140; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r141, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r803, %r141, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r144, %r803, %r141; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r144; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r145, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r804, %r145, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r148, %r804, %r145; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r148; + // end inline asm + bar.sync 0; + ld.shared.b32 %r805, [%r764]; + ld.shared.b32 %r806, [%r764+8]; + ld.shared.b32 %r807, [%r764+16]; + ld.shared.b32 %r808, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r150; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r152; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r154; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r156; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r157, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r809, %r157, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r160, %r809, %r157; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r160; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r161, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r810, %r161, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r164, %r810, %r161; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r164; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r165, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r811, %r165, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r168, %r811, %r165; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r168; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r169, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r812, %r169, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r172, %r812, %r169; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r172; + // end inline asm + bar.sync 0; + ld.shared.b32 %r813, [%r764]; + ld.shared.b32 %r814, [%r764+8]; + ld.shared.b32 %r815, [%r764+16]; + ld.shared.b32 %r816, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r174; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r176; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r178; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r180; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r181, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r817, %r181, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r184, %r817, %r181; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r184; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r185, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r818, %r185, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r188, %r818, %r185; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r188; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r189, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r819, %r189, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r192, %r819, %r189; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r192; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r193, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r820, %r193, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r196, %r820, %r193; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r196; + // end inline asm + bar.sync 0; + ld.shared.b32 %r821, [%r764]; + ld.shared.b32 %r822, [%r764+8]; + ld.shared.b32 %r823, [%r764+16]; + ld.shared.b32 %r824, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p371, %r797, %r805; + setp.ge.s32 %p372, %r798, %r806; + setp.lt.s32 %p373, %r799, %r807; + setp.ge.s32 %p374, %r800, %r808; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p375, %r797, %r805; + setp.ne.b32 %p376, %r798, %r806; + setp.eq.b32 %p377, %r799, %r807; + setp.ne.b32 %p378, %r800, %r808; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p379, %r813, %r821; + setp.le.s32 %p380, %r814, %r822; + setp.gt.s32 %p381, %r815, %r823; + setp.le.s32 %p382, %r816, %r824; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p383, %p376, %p380; + and.pred %p384, %p375, %p379; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p385, %p378, %p382; + and.pred %p386, %p377, %p381; + and.pred %p387, %p372, %p383; + or.pred %p388, %p371, %p384; + .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p389, %p374, %p385; + or.pred %p390, %p373, %p386; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r825, %r821, %r813; + xor.b32 %r826, %r806, %r798; + xor.b32 %r827, %r822, %r814; + xor.b32 %r828, %r805, %r797; + xor.b32 %r829, %r823, %r815; + xor.b32 %r830, %r808, %r800; + xor.b32 %r831, %r824, %r816; + xor.b32 %r832, %r807, %r799; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r833, %r826, 0, %p387; + selp.b32 %r834, %r825, 0, %p388; + selp.b32 %r835, %r828, 0, %p388; + selp.b32 %r836, %r827, 0, %p387; + selp.b32 %r837, %r830, 0, %p389; + selp.b32 %r838, %r829, 0, %p390; + selp.b32 %r839, %r832, 0, %p390; + selp.b32 %r840, %r831, 0, %p389; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r841, %r834, %r777; + xor.b32 %r842, %r833, %r778; + xor.b32 %r843, %r836, %r779; + xor.b32 %r844, %r835, %r780; + xor.b32 %r845, %r838, %r789; + xor.b32 %r846, %r837, %r790; + xor.b32 %r847, %r840, %r791; + xor.b32 %r848, %r839, %r792; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r198, %r844, %r718; + mul.lo.s32 %r200, %r842, %r718; + mul.lo.s32 %r202, %r848, %r718; + mul.lo.s32 %r204, %r846, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r198; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r200; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r202; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r204; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r205, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r849, %r205, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r208, %r849, %r205; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r208; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r209, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r850, %r209, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r212, %r850, %r209; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r212; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r213, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r851, %r213, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r216, %r851, %r213; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r216; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r217, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r852, %r217, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r220, %r852, %r217; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r220; + // end inline asm + bar.sync 0; + ld.shared.b32 %r853, [%r724]; + ld.shared.b32 %r854, [%r724+16]; + ld.shared.b32 %r855, [%r724+32]; + ld.shared.b32 %r856, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r222, %r844, %r717; + mul.lo.s32 %r224, %r842, %r717; + mul.lo.s32 %r226, %r848, %r717; + mul.lo.s32 %r228, %r846, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r222; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r224; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r226; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r228; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r229, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r857, %r229, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r232, %r857, %r229; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r232; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r233, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r858, %r233, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r236, %r858, %r233; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r236; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r237, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r859, %r237, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r240, %r859, %r237; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r240; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r241, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r860, %r241, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r244, %r860, %r241; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r244; + // end inline asm + bar.sync 0; + ld.shared.b32 %r861, [%r724]; + ld.shared.b32 %r862, [%r724+16]; + ld.shared.b32 %r863, [%r724+32]; + ld.shared.b32 %r864, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r246, %r841, %r718; + mul.lo.s32 %r248, %r843, %r718; + mul.lo.s32 %r250, %r845, %r718; + mul.lo.s32 %r252, %r847, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r246; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r248; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r250; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r252; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r253, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r865, %r253, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r256, %r865, %r253; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r256; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r257, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r866, %r257, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r260, %r866, %r257; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r260; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r261, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r867, %r261, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r264, %r867, %r261; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r264; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r265, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r868, %r265, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r268, %r868, %r265; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r268; + // end inline asm + bar.sync 0; + ld.shared.b32 %r869, [%r724]; + ld.shared.b32 %r870, [%r724+16]; + ld.shared.b32 %r871, [%r724+32]; + ld.shared.b32 %r872, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r270, %r841, %r717; + mul.lo.s32 %r272, %r843, %r717; + mul.lo.s32 %r274, %r845, %r717; + mul.lo.s32 %r276, %r847, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r270; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r272; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r274; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r276; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r277, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r873, %r277, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r280, %r873, %r277; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r280; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r281, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r874, %r281, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r284, %r874, %r281; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r284; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r285, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r875, %r285, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r288, %r875, %r285; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r288; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r289, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r876, %r289, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r292, %r876, %r289; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r292; + // end inline asm + bar.sync 0; + ld.shared.b32 %r877, [%r724]; + ld.shared.b32 %r878, [%r724+16]; + ld.shared.b32 %r879, [%r724+32]; + ld.shared.b32 %r880, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p391, %r853, %r861; + setp.ge.s32 %p392, %r854, %r862; + setp.lt.s32 %p393, %r855, %r863; + setp.ge.s32 %p394, %r856, %r864; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p395, %r853, %r861; + setp.ne.b32 %p396, %r854, %r862; + setp.eq.b32 %p397, %r855, %r863; + setp.ne.b32 %p398, %r856, %r864; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p399, %r869, %r877; + setp.le.s32 %p400, %r870, %r878; + setp.gt.s32 %p401, %r871, %r879; + setp.le.s32 %p402, %r872, %r880; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p403, %p396, %p400; + and.pred %p404, %p395, %p399; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p405, %p398, %p402; + and.pred %p406, %p397, %p401; + and.pred %p407, %p392, %p403; + or.pred %p408, %p391, %p404; + .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p409, %p394, %p405; + or.pred %p410, %p393, %p406; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r881, %r877, %r869; + xor.b32 %r882, %r862, %r854; + xor.b32 %r883, %r878, %r870; + xor.b32 %r884, %r861, %r853; + xor.b32 %r885, %r879, %r871; + xor.b32 %r886, %r864, %r856; + xor.b32 %r887, %r880, %r872; + xor.b32 %r888, %r863, %r855; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r889, %r882, 0, %p407; + selp.b32 %r890, %r881, 0, %p408; + selp.b32 %r891, %r884, 0, %p408; + selp.b32 %r892, %r883, 0, %p407; + selp.b32 %r893, %r886, 0, %p409; + selp.b32 %r894, %r885, 0, %p410; + selp.b32 %r895, %r888, 0, %p410; + selp.b32 %r896, %r887, 0, %p409; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r897, %r890, %r841; + xor.b32 %r898, %r889, %r842; + xor.b32 %r899, %r892, %r843; + xor.b32 %r900, %r891, %r844; + xor.b32 %r901, %r894, %r845; + xor.b32 %r902, %r893, %r846; + xor.b32 %r903, %r896, %r847; + xor.b32 %r904, %r895, %r848; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p411, %r900, %r898; + setp.ge.s32 %p412, %r904, %r902; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p413, %r898, %r900; + setp.gt.s32 %p414, %r897, %r899; + setp.ne.b32 %p415, %r902, %r904; + setp.le.s32 %p416, %r901, %r903; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p417, %p413, %p414; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p418, %p415, %p416; + or.pred %p419, %p411, %p417; + .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p420, %p412, %p418; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r905, %r898, %r900; + xor.b32 %r906, %r902, %r904; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r907, %r905, 0, %p419; + selp.b32 %r908, %r906, 0, %p420; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r909, %r899, %r897; + xor.b32 %r910, %r903, %r901; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r911, %r909, 0, %p419; + selp.b32 %r912, %r910, 0, %p420; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r913, %r907, %r898; + xor.b32 %r914, %r907, %r900; + xor.b32 %r915, %r908, %r902; + xor.b32 %r916, %r908, %r904; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r917, %r911, %r899; + xor.b32 %r918, %r911, %r897; + xor.b32 %r919, %r912, %r903; + xor.b32 %r920, %r912, %r901; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r294, %r914, %r719; + mul.lo.s32 %r296, %r913, %r719; + mul.lo.s32 %r298, %r916, %r719; + mul.lo.s32 %r300, %r915, %r719; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r294; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r296; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r298; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r300; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r301, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r921, %r301, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r304, %r921, %r301; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r304; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r305, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r922, %r305, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r308, %r922, %r305; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r308; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r309, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r923, %r309, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r312, %r923, %r309; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r312; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r313, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r924, %r313, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r316, %r924, %r313; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r316; + // end inline asm + bar.sync 0; + ld.shared.b32 %r925, [%r764]; + ld.shared.b32 %r926, [%r764+8]; + ld.shared.b32 %r927, [%r764+16]; + ld.shared.b32 %r928, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r318, %r914, %r716; + mul.lo.s32 %r320, %r913, %r716; + mul.lo.s32 %r322, %r916, %r716; + mul.lo.s32 %r324, %r915, %r716; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r318; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r320; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r322; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r324; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r325, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r929, %r325, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r328, %r929, %r325; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r328; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r329, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r930, %r329, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r332, %r930, %r329; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r332; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r333, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r931, %r333, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r336, %r931, %r333; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r336; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r337, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r932, %r337, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r340, %r932, %r337; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r340; + // end inline asm + bar.sync 0; + ld.shared.b32 %r933, [%r764]; + ld.shared.b32 %r934, [%r764+8]; + ld.shared.b32 %r935, [%r764+16]; + ld.shared.b32 %r936, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r342, %r918, %r719; + mul.lo.s32 %r344, %r917, %r719; + mul.lo.s32 %r346, %r920, %r719; + mul.lo.s32 %r348, %r919, %r719; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r342; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r344; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r346; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r348; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r349, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r937, %r349, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r352, %r937, %r349; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r352; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r353, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r938, %r353, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r356, %r938, %r353; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r356; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r357, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r939, %r357, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r360, %r939, %r357; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r360; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r361, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r940, %r361, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r364, %r940, %r361; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r364; + // end inline asm + bar.sync 0; + ld.shared.b32 %r941, [%r764]; + ld.shared.b32 %r942, [%r764+8]; + ld.shared.b32 %r943, [%r764+16]; + ld.shared.b32 %r944, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r366, %r918, %r716; + mul.lo.s32 %r368, %r917, %r716; + mul.lo.s32 %r370, %r920, %r716; + mul.lo.s32 %r372, %r919, %r716; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r366; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r368; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r370; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r372; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r373, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r945, %r373, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r376, %r945, %r373; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r376; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r377, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r946, %r377, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r380, %r946, %r377; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r380; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r381, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r947, %r381, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r384, %r947, %r381; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r384; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r385, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r948, %r385, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r388, %r948, %r385; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r388; + // end inline asm + bar.sync 0; + ld.shared.b32 %r949, [%r764]; + ld.shared.b32 %r950, [%r764+8]; + ld.shared.b32 %r951, [%r764+16]; + ld.shared.b32 %r952, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p421, %r925, %r933; + setp.lt.s32 %p422, %r926, %r934; + setp.ge.s32 %p423, %r927, %r935; + setp.ge.s32 %p424, %r928, %r936; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p425, %r926, %r934; + setp.eq.b32 %p426, %r925, %r933; + setp.ne.b32 %p427, %r928, %r936; + setp.ne.b32 %p428, %r927, %r935; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p429, %r942, %r950; + setp.gt.s32 %p430, %r941, %r949; + setp.le.s32 %p431, %r944, %r952; + setp.le.s32 %p432, %r943, %r951; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p433, %p426, %p430; + and.pred %p434, %p425, %p429; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p435, %p428, %p432; + or.pred %p436, %p427, %p431; + or.pred %p437, %p422, %p434; + or.pred %p438, %p421, %p433; + .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p439, %p424, %p436; + and.pred %p440, %p423, %p435; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r953, %r934, %r926; + xor.b32 %r954, %r933, %r925; + xor.b32 %r955, %r936, %r928; + xor.b32 %r956, %r935, %r927; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r957, %r954, 0, %p438; + selp.b32 %r958, %r953, 0, %p437; + selp.b32 %r959, %r956, 0, %p440; + selp.b32 %r960, %r955, 0, %p439; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r961, %r958, %r913; + xor.b32 %r962, %r957, %r914; + xor.b32 %r963, %r960, %r915; + xor.b32 %r964, %r959, %r916; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r965, %r950, %r942; + xor.b32 %r966, %r949, %r941; + xor.b32 %r967, %r952, %r944; + xor.b32 %r968, %r951, %r943; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r969, %r966, 0, %p438; + selp.b32 %r970, %r965, 0, %p437; + selp.b32 %r971, %r968, 0, %p440; + selp.b32 %r972, %r967, 0, %p439; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r973, %r970, %r917; + xor.b32 %r974, %r969, %r918; + xor.b32 %r975, %r972, %r919; + xor.b32 %r976, %r971, %r920; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r390, %r962, %r718; + mul.lo.s32 %r392, %r961, %r718; + mul.lo.s32 %r394, %r964, %r718; + mul.lo.s32 %r396, %r963, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r390; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r392; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r394; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r396; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r397, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r977, %r397, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r400, %r977, %r397; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r400; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r401, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r978, %r401, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r404, %r978, %r401; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r404; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r405, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r979, %r405, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r408, %r979, %r405; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r408; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r409, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r980, %r409, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r412, %r980, %r409; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r412; + // end inline asm + bar.sync 0; + ld.shared.b32 %r981, [%r724]; + ld.shared.b32 %r982, [%r724+16]; + ld.shared.b32 %r983, [%r724+32]; + ld.shared.b32 %r984, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r414, %r962, %r717; + mul.lo.s32 %r416, %r961, %r717; + mul.lo.s32 %r418, %r964, %r717; + mul.lo.s32 %r420, %r963, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r414; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r416; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r418; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r420; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r421, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r985, %r421, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r424, %r985, %r421; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r424; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r425, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r986, %r425, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r428, %r986, %r425; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r428; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r429, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r987, %r429, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r432, %r987, %r429; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r432; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r433, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r988, %r433, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r436, %r988, %r433; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r436; + // end inline asm + bar.sync 0; + ld.shared.b32 %r989, [%r724]; + ld.shared.b32 %r990, [%r724+16]; + ld.shared.b32 %r991, [%r724+32]; + ld.shared.b32 %r992, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r438, %r974, %r718; + mul.lo.s32 %r440, %r973, %r718; + mul.lo.s32 %r442, %r976, %r718; + mul.lo.s32 %r444, %r975, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r438; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r440; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r442; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r444; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r445, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r993, %r445, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r448, %r993, %r445; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r448; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r449, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r994, %r449, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r452, %r994, %r449; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r452; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r453, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r995, %r453, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r456, %r995, %r453; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r456; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r457, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r996, %r457, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r460, %r996, %r457; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r460; + // end inline asm + bar.sync 0; + ld.shared.b32 %r997, [%r724]; + ld.shared.b32 %r998, [%r724+16]; + ld.shared.b32 %r999, [%r724+32]; + ld.shared.b32 %r1000, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r462, %r974, %r717; + mul.lo.s32 %r464, %r973, %r717; + mul.lo.s32 %r466, %r976, %r717; + mul.lo.s32 %r468, %r975, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r462; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r464; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r466; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r468; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r469, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1001, %r469, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r472, %r1001, %r469; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r472; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r473, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1002, %r473, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r476, %r1002, %r473; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r476; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r477, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1003, %r477, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r480, %r1003, %r477; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r480; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r481, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1004, %r481, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r484, %r1004, %r481; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r484; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1005, [%r724]; + ld.shared.b32 %r1006, [%r724+16]; + ld.shared.b32 %r1007, [%r724+32]; + ld.shared.b32 %r1008, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p441, %r982, %r990; + setp.lt.s32 %p442, %r981, %r989; + setp.ge.s32 %p443, %r984, %r992; + setp.ge.s32 %p444, %r983, %r991; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p445, %r981, %r989; + setp.eq.b32 %p446, %r982, %r990; + setp.ne.b32 %p447, %r983, %r991; + setp.ne.b32 %p448, %r984, %r992; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p449, %r997, %r1005; + setp.gt.s32 %p450, %r998, %r1006; + setp.le.s32 %p451, %r999, %r1007; + setp.le.s32 %p452, %r1000, %r1008; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p453, %p446, %p450; + and.pred %p454, %p445, %p449; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p455, %p448, %p452; + or.pred %p456, %p447, %p451; + or.pred %p457, %p442, %p454; + or.pred %p458, %p441, %p453; + .loc 2 599 19 // triton_helpers.py:599:19 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p459, %p444, %p456; + and.pred %p460, %p443, %p455; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1009, %r989, %r981; + xor.b32 %r1010, %r990, %r982; + xor.b32 %r1011, %r991, %r983; + xor.b32 %r1012, %r992, %r984; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1013, %r1010, 0, %p458; + selp.b32 %r1014, %r1009, 0, %p457; + selp.b32 %r1015, %r1012, 0, %p460; + selp.b32 %r1016, %r1011, 0, %p459; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1017, %r1014, %r962; + xor.b32 %r1018, %r1013, %r961; + xor.b32 %r1019, %r1016, %r964; + xor.b32 %r1020, %r1015, %r963; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1021, %r1006, %r998; + xor.b32 %r1022, %r1005, %r997; + xor.b32 %r1023, %r1008, %r1000; + xor.b32 %r1024, %r1007, %r999; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1025, %r1022, 0, %p457; + selp.b32 %r1026, %r1021, 0, %p458; + selp.b32 %r1027, %r1024, 0, %p459; + selp.b32 %r1028, %r1023, 0, %p460; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1029, %r1026, %r973; + xor.b32 %r1030, %r1025, %r974; + xor.b32 %r1031, %r1028, %r975; + xor.b32 %r1032, %r1027, %r976; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p461, %r1018, %r1020; + setp.lt.s32 %p462, %r1017, %r1019; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p463, %r1017, %r1019; + setp.eq.b32 %p464, %r1018, %r1020; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p465, %r1030, %r1032; + setp.gt.s32 %p466, %r1029, %r1031; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p467, %p464, %p466; + and.pred %p468, %p463, %p465; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p469, %p462, %p468; + or.pred %p470, %p461, %p467; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1033, %r1019, %r1017; + xor.b32 %r1034, %r1020, %r1018; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1035, %r1034, 0, %p470; + selp.b32 %r1036, %r1033, 0, %p469; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1037, %r1036, %r1017; + xor.b32 %r1038, %r1035, %r1018; + xor.b32 %r1039, %r1036, %r1019; + xor.b32 %r1040, %r1035, %r1020; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1041, %r1038, %r1037; + xor.b32 %r1042, %r1040, %r1039; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1043, %r1031, %r1029; + xor.b32 %r1044, %r1032, %r1030; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1045, %r1044, 0, %p469; + selp.b32 %r1046, %r1043, 0, %p470; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1047, %r1046, %r1029; + xor.b32 %r1048, %r1045, %r1032; + xor.b32 %r1049, %r1046, %r1031; + xor.b32 %r1050, %r1045, %r1030; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p471, %r1037, %r1038; + setp.lt.s32 %p472, %r1039, %r1040; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p473, %r1039, %r1040; + setp.eq.b32 %p474, %r1037, %r1038; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p475, %r1048, %r1049; + setp.gt.s32 %p476, %r1050, %r1047; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p477, %p474, %p476; + and.pred %p478, %p473, %p475; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p479, %p472, %p478; + or.pred %p480, %p471, %p477; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1051, %r1041, 0, %p480; + selp.b32 %r1052, %r1042, 0, %p479; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1053, %r1051, %r1037; + xor.b32 %r1054, %r1051, %r1038; + xor.b32 %r1055, %r1052, %r1039; + xor.b32 %r1056, %r1052, %r1040; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1057, %r1030, %r1047; + xor.b32 %r1058, %r1031, %r1048; + xor.b32 %r1059, %r1058, %r1046; + xor.b32 %r1060, %r1057, %r1045; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1061, %r1060, 0, %p480; + selp.b32 %r1062, %r1059, 0, %p479; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1063, %r1062, %r1049; + xor.b32 %r1064, %r1062, %r1048; + xor.b32 %r1065, %r1061, %r1047; + xor.b32 %r1066, %r1061, %r1050; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r486, %r1053, %r719; + mul.lo.s32 %r488, %r1054, %r719; + mul.lo.s32 %r490, %r1055, %r719; + mul.lo.s32 %r492, %r1056, %r719; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r486; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r488; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r490; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r492; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r493, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1067, %r493, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r496, %r1067, %r493; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r496; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r497, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1068, %r497, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r500, %r1068, %r497; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r500; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r501, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1069, %r501, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r504, %r1069, %r501; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r504; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r505, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1070, %r505, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r508, %r1070, %r505; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r508; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1071, [%r764]; + ld.shared.b32 %r1072, [%r764+8]; + ld.shared.b32 %r1073, [%r764+16]; + ld.shared.b32 %r1074, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r510, %r1053, %r716; + mul.lo.s32 %r512, %r1054, %r716; + mul.lo.s32 %r514, %r1055, %r716; + mul.lo.s32 %r516, %r1056, %r716; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r510; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r512; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r514; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r516; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r517, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1075, %r517, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r520, %r1075, %r517; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r520; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r521, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1076, %r521, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r524, %r1076, %r521; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r524; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r525, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1077, %r525, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r528, %r1077, %r525; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r528; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r529, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1078, %r529, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r532, %r1078, %r529; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r532; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1079, [%r764]; + ld.shared.b32 %r1080, [%r764+8]; + ld.shared.b32 %r1081, [%r764+16]; + ld.shared.b32 %r1082, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r534, %r1066, %r719; + mul.lo.s32 %r536, %r1065, %r719; + mul.lo.s32 %r538, %r1064, %r719; + mul.lo.s32 %r540, %r1063, %r719; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r534; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r536; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r538; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r540; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r541, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1083, %r541, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r544, %r1083, %r541; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r544; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r545, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1084, %r545, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r548, %r1084, %r545; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r548; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r549, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1085, %r549, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r552, %r1085, %r549; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r552; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r553, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1086, %r553, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r556, %r1086, %r553; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r556; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1087, [%r764]; + ld.shared.b32 %r1088, [%r764+8]; + ld.shared.b32 %r1089, [%r764+16]; + ld.shared.b32 %r1090, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r558, %r1066, %r716; + mul.lo.s32 %r560, %r1065, %r716; + mul.lo.s32 %r562, %r1064, %r716; + mul.lo.s32 %r564, %r1063, %r716; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r558; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r560; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r562; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r564; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r565, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1091, %r565, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r568, %r1091, %r565; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r568; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r569, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1092, %r569, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r572, %r1092, %r569; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r572; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r573, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1093, %r573, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r576, %r1093, %r573; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r576; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r577, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1094, %r577, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r580, %r1094, %r577; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r580; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1095, [%r764]; + ld.shared.b32 %r1096, [%r764+8]; + ld.shared.b32 %r1097, [%r764+16]; + ld.shared.b32 %r1098, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p481, %r1071, %r1079; + setp.lt.s32 %p482, %r1072, %r1080; + setp.lt.s32 %p483, %r1073, %r1081; + setp.lt.s32 %p484, %r1074, %r1082; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p485, %r1074, %r1082; + setp.eq.b32 %p486, %r1073, %r1081; + setp.eq.b32 %p487, %r1072, %r1080; + setp.eq.b32 %p488, %r1071, %r1079; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p489, %r1090, %r1098; + setp.gt.s32 %p490, %r1089, %r1097; + setp.gt.s32 %p491, %r1088, %r1096; + setp.gt.s32 %p492, %r1087, %r1095; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p493, %p488, %p492; + and.pred %p494, %p487, %p491; + and.pred %p495, %p486, %p490; + and.pred %p496, %p485, %p489; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p497, %p484, %p496; + or.pred %p498, %p483, %p495; + or.pred %p499, %p482, %p494; + or.pred %p500, %p481, %p493; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1099, %r1079, %r1071; + xor.b32 %r1100, %r1080, %r1072; + xor.b32 %r1101, %r1081, %r1073; + xor.b32 %r1102, %r1082, %r1074; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1103, %r1099, 0, %p500; + selp.b32 %r1104, %r1100, 0, %p499; + selp.b32 %r1105, %r1101, 0, %p498; + selp.b32 %r1106, %r1102, 0, %p497; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1107, %r1103, %r1053; + xor.b32 %r1108, %r1104, %r1054; + xor.b32 %r1109, %r1105, %r1055; + xor.b32 %r1110, %r1106, %r1056; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1111, %r1098, %r1090; + xor.b32 %r1112, %r1097, %r1089; + xor.b32 %r1113, %r1096, %r1088; + xor.b32 %r1114, %r1095, %r1087; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1115, %r1114, 0, %p500; + selp.b32 %r1116, %r1113, 0, %p499; + selp.b32 %r1117, %r1112, 0, %p498; + selp.b32 %r1118, %r1111, 0, %p497; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1119, %r1118, %r1063; + xor.b32 %r1120, %r1117, %r1064; + xor.b32 %r1121, %r1116, %r1065; + xor.b32 %r1122, %r1115, %r1066; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r582, %r1107, %r718; + mul.lo.s32 %r584, %r1108, %r718; + mul.lo.s32 %r586, %r1109, %r718; + mul.lo.s32 %r588, %r1110, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r582; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r584; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r586; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r588; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r589, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1123, %r589, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r592, %r1123, %r589; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r592; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r593, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1124, %r593, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r596, %r1124, %r593; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r596; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r597, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1125, %r597, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r600, %r1125, %r597; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r600; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r601, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1126, %r601, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r604, %r1126, %r601; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r604; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1127, [%r724]; + ld.shared.b32 %r1128, [%r724+16]; + ld.shared.b32 %r1129, [%r724+32]; + ld.shared.b32 %r1130, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r606, %r1107, %r717; + mul.lo.s32 %r608, %r1108, %r717; + mul.lo.s32 %r610, %r1109, %r717; + mul.lo.s32 %r612, %r1110, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r606; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r608; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r610; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r612; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r613, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1131, %r613, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r616, %r1131, %r613; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r616; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r617, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1132, %r617, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r620, %r1132, %r617; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r620; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r621, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1133, %r621, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r624, %r1133, %r621; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r624; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r625, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1134, %r625, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r628, %r1134, %r625; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r628; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1135, [%r724]; + ld.shared.b32 %r1136, [%r724+16]; + ld.shared.b32 %r1137, [%r724+32]; + ld.shared.b32 %r1138, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r630, %r1122, %r718; + mul.lo.s32 %r632, %r1121, %r718; + mul.lo.s32 %r634, %r1120, %r718; + mul.lo.s32 %r636, %r1119, %r718; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r630; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r632; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r634; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r636; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r637, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1139, %r637, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r640, %r1139, %r637; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r640; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r641, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1140, %r641, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r644, %r1140, %r641; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r644; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r645, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1141, %r645, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r648, %r1141, %r645; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r648; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r649, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1142, %r649, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r652, %r1142, %r649; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r652; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1143, [%r724]; + ld.shared.b32 %r1144, [%r724+16]; + ld.shared.b32 %r1145, [%r724+32]; + ld.shared.b32 %r1146, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r654, %r1122, %r717; + mul.lo.s32 %r656, %r1121, %r717; + mul.lo.s32 %r658, %r1120, %r717; + mul.lo.s32 %r660, %r1119, %r717; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r654; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r656; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r658; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r660; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r661, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1147, %r661, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r664, %r1147, %r661; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r664; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r665, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1148, %r665, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r668, %r1148, %r665; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r668; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r669, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1149, %r669, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r672, %r1149, %r669; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r672; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r673, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1150, %r673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r676, %r1150, %r673; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r676; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1151, [%r724]; + ld.shared.b32 %r1152, [%r724+16]; + ld.shared.b32 %r1153, [%r724+32]; + ld.shared.b32 %r1154, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p501, %r1130, %r1138; + setp.lt.s32 %p502, %r1129, %r1137; + setp.lt.s32 %p503, %r1128, %r1136; + setp.lt.s32 %p504, %r1127, %r1135; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p505, %r1130, %r1138; + setp.eq.b32 %p506, %r1129, %r1137; + setp.eq.b32 %p507, %r1128, %r1136; + setp.eq.b32 %p508, %r1127, %r1135; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p509, %r1146, %r1154; + setp.gt.s32 %p510, %r1145, %r1153; + setp.gt.s32 %p511, %r1144, %r1152; + setp.gt.s32 %p512, %r1143, %r1151; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1155, %r1154, %r1146; + xor.b32 %r1156, %r1153, %r1145; + xor.b32 %r1157, %r1152, %r1144; + xor.b32 %r1158, %r1151, %r1143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r1159, %r1158, 0, %p512; + selp.b32 %r1160, %r1159, 0, %p508; + selp.b32 %r1161, %r1158, %r1160, %p504; + selp.b32 %r1162, %r1157, 0, %p511; + selp.b32 %r1163, %r1162, 0, %p507; + selp.b32 %r1164, %r1157, %r1163, %p503; + selp.b32 %r1165, %r1156, 0, %p510; + selp.b32 %r1166, %r1165, 0, %p506; + selp.b32 %r1167, %r1156, %r1166, %p502; + selp.b32 %r1168, %r1155, 0, %p509; + selp.b32 %r1169, %r1168, 0, %p505; + selp.b32 %r1170, %r1155, %r1169, %p501; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r1171, %r1170, %r1119; + xor.b32 %r1172, %r1167, %r1120; + xor.b32 %r1173, %r1164, %r1121; + xor.b32 %r1174, %r1161, %r1122; +$L__tmp2: + .loc 1 44 34 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:44:34 + cvt.s64.s32 %rd13, %r3; + cvt.s64.s32 %rd14, %r2; + cvt.s64.s32 %rd15, %r4; + cvt.s64.s32 %rd16, %r1; + selp.b64 %rd17, %rd16, 0, %p1; + selp.b64 %rd18, %rd15, 0, %p1; + selp.b64 %rd19, %rd14, 0, %p1; + selp.b64 %rd20, %rd13, 0, %p1; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd21, %rd19, %rd20; + add.s64 %rd22, %rd17, %rd18; + add.s64 %rd5, %rd21, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s32 %r1175, %r721, %r762; + and.b32 %r1176, %r694, 24; + add.s32 %r677, %r1175, %r1176; + // begin inline asm + @%p5 st.shared.b64 [ %r677 + 0 ], %rd5; + // end inline asm + bar.sync 0; + setp.lt.u32 %p342, %r692, 128; + shl.b32 %r1177, %r692, 3; + add.s32 %r678, %r721, %r1177; + // begin inline asm + @%p342 ld.shared.b64 %rd6, [ %r678 + 0 ]; + // end inline asm + mov.b64 {_, %r1178}, %rd6; + cvt.u32.u64 %r1179, %rd6; + shfl.sync.bfly.b32 %r1180, %r1179, 2, 31, -1; + shfl.sync.bfly.b32 %r1181, %r1178, 2, 31, -1; + cvt.u64.u32 %rd23, %r1180; + cvt.u64.u32 %rd24, %r1181; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd27, %rd6, %rd26; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + mov.b64 {_, %r1182}, %rd27; + cvt.u32.u64 %r1183, %rd27; + shfl.sync.bfly.b32 %r1184, %r1183, 1, 31, -1; + shfl.sync.bfly.b32 %r1185, %r1182, 1, 31, -1; + cvt.u64.u32 %rd28, %r1184; + cvt.u64.u32 %rd29, %r1185; + shl.b64 %rd30, %rd29, 32; + or.b64 %rd31, %rd28, %rd30; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd7, %rd27, %rd31; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + and.b32 %r1186, %r692, 899; + setp.eq.b32 %p343, %r1186, 0; + // begin inline asm + @%p343 st.shared.b64 [ %r678 + 0 ], %rd7; + // end inline asm + bar.sync 0; + ld.shared.b32 %r689, [%r1175]; +$L__tmp4: + .loc 1 49 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:35 + shl.b32 %r1187, %r697, 4; + .loc 1 49 32 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:32 + or.b32 %r1188, %r1187, %r704; + .loc 1 49 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:25 + mad.wide.s32 %rd8, %r1188, 4, %rd11; + .loc 1 49 47 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:47 + bar.sync 0; + shl.b32 %r1189, %r692, 4; + and.b32 %r1190, %r1189, 2032; + add.s32 %r1191, %r721, %r1190; + st.shared.v4.b32 [%r1191], {%r1174, %r1173, %r1172, %r1171}; + bar.sync 0; + shl.b32 %r1192, %r692, 6; + and.b32 %r1193, %r1192, 1536; + and.b32 %r1194, %r1189, 112; + shl.b32 %r1195, %r698, 2; + add.s32 %r1196, %r721, %r1193; + add.s32 %r1197, %r1196, %r1194; + add.s32 %r684, %r1197, %r1195; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r685, %r686, %r687, %r688}, [%r684]; + // end inline asm + // begin inline asm + @%p344 st.global.v4.b32 [ %rd8 + 0 ], { %r685, %r686, %r687, %r688 }; + // end inline asm + .loc 1 50 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:25 + mad.wide.s32 %rd9, %r696, 4, %rd12; + .loc 1 50 37 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:37 + setp.eq.b32 %p513, %r698, 0; + and.pred %p345, %p513, %p1; + // begin inline asm + @%p345 st.global.b32 [ %rd9 + 0 ], { %r689 }; + // end inline asm + .loc 1 50 4 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 120 +.b8 110 +.b8 121 +.b8 103 +.b8 112 +.b8 118 +.b8 112 +.b8 109 +.b8 118 +.b8 114 +.b8 50 +.b8 109 +.b8 120 +.b8 50 +.b8 101 +.b8 54 +.b8 109 +.b8 119 +.b8 103 +.b8 100 +.b8 101 +.b8 111 +.b8 106 +.b8 116 +.b8 104 +.b8 114 +.b8 105 +.b8 114 +.b8 110 +.b8 111 +.b8 103 +.b8 55 +.b8 110 +.b8 109 +.b8 113 +.b8 54 +.b8 109 +.b8 99 +.b8 115 +.b8 105 +.b8 51 +.b8 119 +.b8 118 +.b8 101 +.b8 103 +.b8 118 +.b8 105 +.b8 50 +.b8 115 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 120 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..cf22046031621d0530ce43e748b665ae0b54e655 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 32 : i32 loc(#loc148) + %xoffset_3 = arith.constant 32 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc151) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + %r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<32x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<32x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<32x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<32x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<32x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : tensor<32x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<32x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<32x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<32x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc33) + %6 = tt.addptr %5, %4 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<32x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc38)), %idxs: tensor<32x16xi16> loc("idxs"(#loc38))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc41) + %5 = ub.poison : tensor<32x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi16> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc49) + %2 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi16> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc223) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc126))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc128) + tt.return %4 : tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %5 : tensor<32x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc135))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc136) + tt.return %0 : tensor<32x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc138) + tt.return %1 : tensor<32x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc132) + tt.return %cst : tensor<32x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc134) + tt.return %0 : tensor<32x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc135))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc136) + tt.return %0 : tensor<32x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc138) + tt.return %1 : tensor<32x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc132) + tt.return %cst : tensor<32x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc134) + tt.return %0 : tensor<32x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc49) + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc113))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc114) + tt.return %0 : tensor<128x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc116) + tt.return %1 : tensor<128x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc113))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc114) + tt.return %0 : tensor<64x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc116) + tt.return %1 : tensor<64x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + %5 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc113))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc114) + tt.return %0 : tensor<32x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc116) + tt.return %1 : tensor<32x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc113))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc114) + tt.return %0 : tensor<32xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc116) + tt.return %1 : tensor<32xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":46:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2ccefdce21e6370ba2978054de28c70425395552 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,841 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}> +#linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("out_ptr2"(#loc)) +#loc79 = loc("out_ptr3"(#loc)) +#loc80 = loc("xnumel"(#loc)) +#loc81 = loc("r0_numel"(#loc)) +#loc99 = loc(callsite(#loc19 at #loc20)) +#loc105 = loc("ileft"(#loc28)) +#loc109 = loc("iright"(#loc33)) +#loc118 = loc("left_idx"(#loc42)) +#loc123 = loc("right_idx"(#loc47)) +#loc143 = loc("tmp11"(#loc67)) +#loc149 = loc(callsite(#loc24 at #loc99)) +#loc153 = loc(callsite(#loc1 at #loc143)) +#loc157 = loc(callsite(#loc105 at #loc149)) +#loc161 = loc(callsite(#loc109 at #loc149)) +#loc169 = loc(callsite(#loc118 at #loc149)) +#loc174 = loc(callsite(#loc123 at #loc149)) +#loc194 = loc(callsite(#loc1 at #loc157)) +#loc196 = loc(callsite(#loc1 at #loc161)) +#loc199 = loc(callsite(#loc1 at #loc169)) +#loc202 = loc(callsite(#loc1 at #loc174)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_1 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<272> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc82) + %xoffset_12 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc83) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84) + %xindex_13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc84) + %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc84) + %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc85) + %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<32x1xi32, #blocked1> loc(#loc85) + %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<32x1xi32, #blocked> loc(#loc86) + %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc86) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87) + %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87) + %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87) + %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87) + %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87) + %x0 = arith.remsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc88) + %x1 = arith.divsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc89) + %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90) + %tmp0_26 = tt.broadcast %x0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_29 = arith.muli %x1, %cst_6 : tensor<32x1xi32, #blocked> loc(#loc92) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc94) + %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc94) + %tmp0_34 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc95) + %tmp0_35 = tt.broadcast %xmask_20 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc95) + %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<32x16x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc97) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146) + %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146) + %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146) + %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146) + %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146) + %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146) + %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146) + %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146) + %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146) + %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146) + %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146) + %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc147) + %flip_49 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc148) + %flip_50 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc148) + %y = tt.reshape %tmp0_36 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155) + %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155) + %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155) + %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_54 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_57 = tt.broadcast %ileft_56 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_58 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_61 = tt.broadcast %iright_60 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_62 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc164) + %ileft_63 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_64 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc165) + %iright_65 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc166) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc168) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<256x2x1xi16, #linear1> loc(#loc168) + %input = arith.extsi %left_idx_67 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc197) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc173) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<256x2x1xi16, #linear1> loc(#loc173) + %input_73 = arith.extsi %right_idx_72 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc200) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc177) + %left_idx_78 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_79 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc178) + %right_idx_80 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc179) + %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc179) + %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc180) + %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<32x16xi32, #blocked> loc(#loc181) + %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_85 = arith.andi %eq, %cond_83 : tensor<32x16xi1, #blocked> loc(#loc182) + %cond_86 = arith.andi %eq_82, %cond_84 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_87 = arith.ori %cond, %cond_85 : tensor<32x16xi1, #blocked> loc(#loc183) + %cond_88 = arith.ori %cond_81, %cond_86 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_89 = arith.extui %cond_87 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc184) + %cond_90 = arith.extui %cond_88 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_91 = arith.xori %cond_89, %flip_49 : tensor<32x16xi32, #blocked> loc(#loc184) + %cond_92 = arith.xori %cond_90, %flip_50 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<32x16xi32, #blocked> loc(#loc185) + %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret = arith.xori %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc186) + %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc187) + %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<32x16xi32, #blocked> loc(#loc188) + %ret_97 = ttg.convert_layout %ret_96 : tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191) + %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc191) + %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc147) + %flip_103 = tt.reshape %flip_102 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_104 = tt.reshape %ret_96 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_109 = tt.broadcast %ileft_108 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_110 = arith.muli %y_104, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_113 = tt.broadcast %iright_112 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_114 = tt.reshape %ileft_109 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_115 = tt.reshape %iright_113 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_116 = tt.reshape %new_idxs_101 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_120 = tt.broadcast %left_idx_119 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_124 = tt.broadcast %right_idx_123 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_125 = tt.reshape %left_idx_120 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_126 = tt.reshape %right_idx_124 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_130 = arith.andi %eq_128, %cond_129 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_131 = arith.ori %cond_127, %cond_130 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_132 = arith.extui %cond_131 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_133 = arith.xori %cond_132, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_137 = arith.xori %ret_97, %ret_136 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<32x16xi32, #linear> loc(#loc191) + %y_141 = tt.reshape %ret_137 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_142 = arith.muli %y_141, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_145 = tt.broadcast %ileft_144 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_146 = arith.muli %y_141, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_149 = tt.broadcast %iright_148 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_150 = tt.reshape %ileft_145 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_151 = tt.reshape %iright_149 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_152 = tt.reshape %new_idxs_140 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_156 = tt.broadcast %left_idx_155 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_160 = tt.broadcast %right_idx_159 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_161 = tt.reshape %left_idx_156 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_162 = tt.reshape %right_idx_160 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_166 = arith.andi %eq_164, %cond_165 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_167 = arith.ori %cond_163, %cond_166 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_168 = arith.extui %cond_167 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_169 = arith.xori %cond_168, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_173 = arith.xori %ret_137, %ret_172 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc147) + %flip_178 = tt.reshape %flip_177 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_179 = tt.reshape %ret_173 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_184 = tt.broadcast %ileft_183 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_185 = arith.muli %y_179, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_188 = tt.broadcast %iright_187 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_189 = tt.reshape %ileft_184 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_190 = tt.reshape %iright_188 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_191 = tt.reshape %new_idxs_176 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_195 = tt.broadcast %left_idx_194 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_199 = tt.broadcast %right_idx_198 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_200 = tt.reshape %left_idx_195 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_201 = tt.reshape %right_idx_199 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_205 = arith.andi %eq_203, %cond_204 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_206 = arith.ori %cond_202, %cond_205 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_207 = arith.extui %cond_206 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_208 = arith.xori %cond_207, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_212 = arith.xori %ret_173, %ret_211 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<32x16xi32, #linear> loc(#loc191) + %y_216 = tt.reshape %ret_212 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_220 = tt.broadcast %ileft_219 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_221 = arith.muli %y_216, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_224 = tt.broadcast %iright_223 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_225 = tt.reshape %ileft_220 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_226 = tt.reshape %iright_224 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_227 = tt.reshape %new_idxs_215 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_231 = tt.broadcast %left_idx_230 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_235 = tt.broadcast %right_idx_234 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_236 = tt.reshape %left_idx_231 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_237 = tt.reshape %right_idx_235 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_241 = arith.andi %eq_239, %cond_240 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_242 = arith.ori %cond_238, %cond_241 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_243 = arith.extui %cond_242 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_244 = arith.xori %cond_243, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_248 = arith.xori %ret_212, %ret_247 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<32x16xi32, #linear> loc(#loc191) + %y_252 = tt.reshape %ret_248 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_253 = arith.muli %y_252, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_256 = tt.broadcast %ileft_255 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_257 = arith.muli %y_252, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_260 = tt.broadcast %iright_259 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_261 = tt.reshape %ileft_256 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_262 = tt.reshape %iright_260 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_263 = tt.reshape %new_idxs_251 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_267 = tt.broadcast %left_idx_266 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_271 = tt.broadcast %right_idx_270 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_272 = tt.reshape %left_idx_267 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_273 = tt.reshape %right_idx_271 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_277 = arith.andi %eq_275, %cond_276 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_278 = arith.ori %cond_274, %cond_277 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_279 = arith.extui %cond_278 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_280 = arith.xori %cond_279, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_284 = arith.xori %ret_248, %ret_283 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<32x16xi32, #linear> loc(#loc191) + %y_288 = tt.reshape %ret_284 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc154) + %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193) + %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc158) + %ileft_293 = tt.broadcast %ileft_292 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc159) + %iright_294 = arith.muli %y_288, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc160) + %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195) + %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc162) + %iright_297 = tt.broadcast %iright_296 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc163) + %ileft_298 = tt.reshape %ileft_293 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_299 = tt.reshape %iright_297 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_300 = tt.reshape %new_idxs_287 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc166) + %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc168) + %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198) + %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc170) + %left_idx_304 = tt.broadcast %left_idx_303 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc171) + %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc173) + %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201) + %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc175) + %right_idx_308 = tt.broadcast %right_idx_307 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc176) + %left_idx_309 = tt.reshape %left_idx_304 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_310 = tt.reshape %right_idx_308 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_314 = arith.andi %eq_312, %cond_313 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_315 = arith.ori %cond_311, %cond_314 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_318 = arith.xori %ret_284, %ret_317 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<32x16xi32, #linear> loc(#loc191) + %y_322 = tt.reshape %ret_318 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_326 = tt.broadcast %ileft_325 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_327 = arith.muli %y_322, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_330 = tt.broadcast %iright_329 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_331 = tt.reshape %ileft_326 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_332 = tt.reshape %iright_330 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_333 = tt.reshape %new_idxs_321 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_337 = tt.broadcast %left_idx_336 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_341 = tt.broadcast %right_idx_340 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_342 = tt.reshape %left_idx_337 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_343 = tt.reshape %right_idx_341 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_347 = arith.andi %eq_345, %cond_346 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_348 = arith.ori %cond_344, %cond_347 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_351 = arith.xori %ret_318, %ret_350 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<32x16xi32, #linear> loc(#loc191) + %y_355 = tt.reshape %ret_351 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_359 = tt.broadcast %ileft_358 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_360 = arith.muli %y_355, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_363 = tt.broadcast %iright_362 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_364 = tt.reshape %ileft_359 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_365 = tt.reshape %iright_363 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_366 = tt.reshape %new_idxs_354 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_370 = tt.broadcast %left_idx_369 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_374 = tt.broadcast %right_idx_373 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_375 = tt.reshape %left_idx_370 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_376 = tt.reshape %right_idx_374 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_380 = arith.andi %eq_378, %cond_379 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_381 = arith.ori %cond_377, %cond_380 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_384 = arith.xori %ret_351, %ret_383 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<32x16xi32, #linear> loc(#loc191) + %y_388 = tt.reshape %ret_384 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_389 = arith.muli %y_388, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_392 = tt.broadcast %ileft_391 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_393 = arith.muli %y_388, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_396 = tt.broadcast %iright_395 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_397 = tt.reshape %ileft_392 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_398 = tt.reshape %iright_396 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_399 = tt.reshape %new_idxs_387 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_403 = tt.broadcast %left_idx_402 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_407 = tt.broadcast %right_idx_406 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_408 = tt.reshape %left_idx_403 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_409 = tt.reshape %right_idx_407 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_413 = arith.andi %eq_411, %cond_412 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_414 = arith.ori %cond_410, %cond_413 : tensor<32x16xi1, #linear> loc(#loc183) + %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<32x16xi32, #linear> loc(#loc191) + %tmp7 = arith.extsi %tmp0_36 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc141) + %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc142) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))): + %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192) + tt.reduce.return %tmp11_421 : i64 loc(#loc152) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152) + %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc144) + %tmp14 = arith.trunci %tmp11_418 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc145) + %0 = arith.muli %xindex_19, %cst_4 : tensor<32x1xi32, #blocked1> loc(#loc70) + %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %2 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %3 = arith.addi %1, %2 : tensor<32x16xi32, #blocked1> loc(#loc71) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked1> loc(#loc72) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr, #blocked1>, tensor<32x16xi32, #blocked1> loc(#loc72) + %6 = ttg.convert_layout %new_idxs_417 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc73) + tt.store %5, %6, %tmp0_35 : tensor<32x16x!tt.ptr, #blocked1> loc(#loc73) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc74) + %8 = tt.addptr %7, %xindex_18 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc74) + tt.store %8, %tmp14, %xmask : tensor<32x1x!tt.ptr, #blocked> loc(#loc75) + tt.return loc(#loc76) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc82 = loc("xoffset"(#loc2)) +#loc83 = loc("xoffset"(#loc3)) +#loc84 = loc("xindex"(#loc4)) +#loc85 = loc("xindex"(#loc5)) +#loc86 = loc("xmask"(#loc6)) +#loc87 = loc("r0_index"(#loc7)) +#loc88 = loc("x0"(#loc8)) +#loc89 = loc("x1"(#loc9)) +#loc90 = loc("tmp0"(#loc10)) +#loc91 = loc("tmp0"(#loc11)) +#loc92 = loc("tmp0"(#loc12)) +#loc93 = loc("tmp0"(#loc13)) +#loc94 = loc("tmp0"(#loc14)) +#loc95 = loc("tmp0"(#loc15)) +#loc96 = loc("tmp2"(#loc16)) +#loc97 = loc("tmp4"(#loc17)) +#loc98 = loc("flip"(#loc18)) +#loc100 = loc("flip"(#loc21)) +#loc101 = loc("flip"(#loc22)) +#loc102 = loc("y"(#loc23)) +#loc103 = loc("left_mask"(#loc25)) +#loc104 = loc("ileft"(#loc26)) +#loc106 = loc("ileft"(#loc30)) +#loc107 = loc("ileft"(#loc31)) +#loc108 = loc("iright"(#loc32)) +#loc110 = loc("iright"(#loc34)) +#loc111 = loc("iright"(#loc35)) +#loc112 = loc("ileft"(#loc36)) +#loc113 = loc("iright"(#loc37)) +#loc114 = loc("y_idx"(#loc38)) +#loc115 = loc("left_idx"(#loc39)) +#loc116 = loc("left_idx"(#loc40)) +#loc117 = loc("input"(#loc41)) +#loc119 = loc("left_idx"(#loc43)) +#loc120 = loc("left_idx"(#loc44)) +#loc121 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc46)) +#loc124 = loc("right_idx"(#loc48)) +#loc125 = loc("right_idx"(#loc49)) +#loc126 = loc("left_idx"(#loc50)) +#loc127 = loc("right_idx"(#loc51)) +#loc128 = loc("cond"(#loc52)) +#loc129 = loc("eq"(#loc53)) +#loc130 = loc("cond"(#loc54)) +#loc131 = loc("cond"(#loc55)) +#loc132 = loc("cond"(#loc56)) +#loc133 = loc("cond"(#loc57)) +#loc134 = loc("cond"(#loc58)) +#loc135 = loc("ret"(#loc59)) +#loc136 = loc("ret"(#loc60)) +#loc137 = loc("ret"(#loc61)) +#loc138 = loc("new_idxs"(#loc62)) +#loc139 = loc("new_idxs"(#loc63)) +#loc140 = loc("new_idxs"(#loc64)) +#loc141 = loc("tmp7"(#loc65)) +#loc142 = loc("tmp10"(#loc66)) +#loc144 = loc("tmp11"(#loc68)) +#loc145 = loc("tmp14"(#loc69)) +#loc146 = loc(callsite(#loc98 at #loc99)) +#loc147 = loc(callsite(#loc100 at #loc99)) +#loc148 = loc(callsite(#loc101 at #loc99)) +#loc150 = loc("cond"(#loc128)) +#loc151 = loc("eq"(#loc129)) +#loc152 = loc(callsite(#loc27 at #loc143)) +#loc154 = loc(callsite(#loc102 at #loc149)) +#loc155 = loc(callsite(#loc103 at #loc149)) +#loc156 = loc(callsite(#loc104 at #loc149)) +#loc158 = loc(callsite(#loc106 at #loc149)) +#loc159 = loc(callsite(#loc107 at #loc149)) +#loc160 = loc(callsite(#loc108 at #loc149)) +#loc162 = loc(callsite(#loc110 at #loc149)) +#loc163 = loc(callsite(#loc111 at #loc149)) +#loc164 = loc(callsite(#loc112 at #loc149)) +#loc165 = loc(callsite(#loc113 at #loc149)) +#loc166 = loc(callsite(#loc114 at #loc149)) +#loc167 = loc(callsite(#loc115 at #loc149)) +#loc168 = loc(callsite(#loc116 at #loc149)) +#loc170 = loc(callsite(#loc119 at #loc149)) +#loc171 = loc(callsite(#loc120 at #loc149)) +#loc172 = loc(callsite(#loc121 at #loc149)) +#loc173 = loc(callsite(#loc122 at #loc149)) +#loc175 = loc(callsite(#loc124 at #loc149)) +#loc176 = loc(callsite(#loc125 at #loc149)) +#loc177 = loc(callsite(#loc126 at #loc149)) +#loc178 = loc(callsite(#loc127 at #loc149)) +#loc179 = loc(callsite(#loc150 at #loc149)) +#loc180 = loc(callsite(#loc151 at #loc149)) +#loc181 = loc(callsite(#loc130 at #loc149)) +#loc182 = loc(callsite(#loc131 at #loc149)) +#loc183 = loc(callsite(#loc132 at #loc149)) +#loc184 = loc(callsite(#loc133 at #loc149)) +#loc185 = loc(callsite(#loc134 at #loc149)) +#loc186 = loc(callsite(#loc135 at #loc149)) +#loc187 = loc(callsite(#loc136 at #loc149)) +#loc188 = loc(callsite(#loc137 at #loc149)) +#loc189 = loc(callsite(#loc138 at #loc149)) +#loc190 = loc(callsite(#loc139 at #loc149)) +#loc191 = loc(callsite(#loc140 at #loc149)) +#loc192 = loc(callsite(#loc29 at #loc152)) +#loc193 = loc(callsite(#loc27 at #loc157)) +#loc195 = loc(callsite(#loc27 at #loc161)) +#loc197 = loc(callsite(#loc117 at #loc169)) +#loc198 = loc(callsite(#loc27 at #loc169)) +#loc200 = loc(callsite(#loc117 at #loc174)) +#loc201 = loc(callsite(#loc27 at #loc174)) +#loc203 = loc(callsite(#loc29 at #loc193)) +#loc204 = loc(callsite(#loc29 at #loc195)) +#loc205 = loc(callsite(#loc29 at #loc198)) +#loc206 = loc(callsite(#loc29 at #loc201)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f64385b8d5283abf0a40bc06d05efa33470df930 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,799 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("out_ptr3"(#loc)) +#loc83 = loc("xnumel"(#loc)) +#loc84 = loc("r0_numel"(#loc)) +#loc106 = loc(callsite(#loc23 at #loc2)) +#loc113 = loc("ileft"(#loc32)) +#loc117 = loc("iright"(#loc37)) +#loc126 = loc("left_idx"(#loc46)) +#loc131 = loc("right_idx"(#loc51)) +#loc150 = loc("tmp11"(#loc70)) +#loc157 = loc(callsite(#loc28 at #loc106)) +#loc161 = loc(callsite(#loc1 at #loc150)) +#loc165 = loc(callsite(#loc113 at #loc157)) +#loc169 = loc(callsite(#loc117 at #loc157)) +#loc177 = loc(callsite(#loc126 at #loc157)) +#loc182 = loc(callsite(#loc131 at #loc157)) +#loc202 = loc(callsite(#loc1 at #loc165)) +#loc204 = loc(callsite(#loc1 at #loc169)) +#loc207 = loc(callsite(#loc1 at #loc177)) +#loc210 = loc(callsite(#loc1 at #loc182)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85) + %cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc86) + %tmp0 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc87) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88) + %cst_2 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc89) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc91) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc92) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc93) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc94) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc94) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc89) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96) + %x0 = arith.remsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc97) + %x1 = arith.divsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc98) + %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88) + %tmp0_10 = tt.broadcast %x0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi32> loc(#loc99) + %tmp0_13 = arith.muli %x1, %tmp0 : tensor<32x1xi32> loc(#loc87) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc100) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi32> loc(#loc100) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc101) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc101) + %tmp0_18 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc102) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<32x16x!tt.ptr> loc(#loc102) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc104) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc155) + %flip_23 = tt.reshape %flip_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc156) + %y = tt.reshape %tmp0_19 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc164) + %ileft_24 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_27 = tt.broadcast %ileft_26 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc168) + %iright_28 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_31 = tt.broadcast %iright_30 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_32 = tt.reshape %ileft_27 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_33 = tt.reshape %iright_31 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc174) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc176) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<256x2x1xi16> loc(#loc176) + %input = arith.extsi %left_idx_35 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc205) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc181) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<256x2x1xi16> loc(#loc181) + %input_41 = arith.extsi %right_idx_40 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc208) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc187) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc188) + %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc189) + %cond_48 = arith.andi %eq, %cond_47 : tensor<32x16xi1> loc(#loc190) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc191) + %cond_50 = arith.extui %cond_49 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<32x16xi32> loc(#loc192) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret = arith.xori %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc194) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<32x16xi32> loc(#loc196) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc197) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199) + %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc199) + %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<32x16xi32> loc(#loc199) + %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc155) + %flip_60 = tt.reshape %flip_59 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc156) + %y_61 = tt.reshape %ret_54 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc164) + %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_66 = tt.broadcast %ileft_65 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_67 = arith.muli %y_61, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_70 = tt.broadcast %iright_69 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_71 = tt.reshape %ileft_66 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_72 = tt.reshape %iright_70 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_73 = tt.reshape %new_idxs_58 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_77 = tt.broadcast %left_idx_76 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_81 = tt.broadcast %right_idx_80 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_82 = tt.reshape %left_idx_77 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_83 = tt.reshape %right_idx_81 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc187) + %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc188) + %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc189) + %cond_87 = arith.andi %eq_85, %cond_86 : tensor<32x16xi1> loc(#loc190) + %cond_88 = arith.ori %cond_84, %cond_87 : tensor<32x16xi1> loc(#loc191) + %cond_89 = arith.extui %cond_88 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_90 = arith.xori %cond_89, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc194) + %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_94 = arith.xori %ret_54, %ret_93 : tensor<32x16xi32> loc(#loc196) + %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc197) + %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<32x16xi32> loc(#loc199) + %y_98 = tt.reshape %ret_94 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_99 = arith.muli %y_98, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_102 = tt.broadcast %ileft_101 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_103 = arith.muli %y_98, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_106 = tt.broadcast %iright_105 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_107 = tt.reshape %ileft_102 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_108 = tt.reshape %iright_106 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_109 = tt.reshape %new_idxs_97 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_113 = tt.broadcast %left_idx_112 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_117 = tt.broadcast %right_idx_116 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_118 = tt.reshape %left_idx_113 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_119 = tt.reshape %right_idx_117 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc187) + %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc188) + %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc189) + %cond_123 = arith.andi %eq_121, %cond_122 : tensor<32x16xi1> loc(#loc190) + %cond_124 = arith.ori %cond_120, %cond_123 : tensor<32x16xi1> loc(#loc191) + %cond_125 = arith.extui %cond_124 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_126 = arith.xori %cond_125, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc194) + %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_130 = arith.xori %ret_94, %ret_129 : tensor<32x16xi32> loc(#loc196) + %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc197) + %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<32x16xi32> loc(#loc199) + %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc155) + %flip_135 = tt.reshape %flip_134 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc156) + %y_136 = tt.reshape %ret_130 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc164) + %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_141 = tt.broadcast %ileft_140 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_142 = arith.muli %y_136, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_145 = tt.broadcast %iright_144 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_146 = tt.reshape %ileft_141 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_147 = tt.reshape %iright_145 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_148 = tt.reshape %new_idxs_133 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_152 = tt.broadcast %left_idx_151 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_156 = tt.broadcast %right_idx_155 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_157 = tt.reshape %left_idx_152 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_158 = tt.reshape %right_idx_156 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc187) + %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc188) + %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc189) + %cond_162 = arith.andi %eq_160, %cond_161 : tensor<32x16xi1> loc(#loc190) + %cond_163 = arith.ori %cond_159, %cond_162 : tensor<32x16xi1> loc(#loc191) + %cond_164 = arith.extui %cond_163 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_165 = arith.xori %cond_164, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc194) + %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_169 = arith.xori %ret_130, %ret_168 : tensor<32x16xi32> loc(#loc196) + %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc197) + %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<32x16xi32> loc(#loc199) + %y_173 = tt.reshape %ret_169 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_177 = tt.broadcast %ileft_176 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_178 = arith.muli %y_173, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_181 = tt.broadcast %iright_180 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_182 = tt.reshape %ileft_177 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_183 = tt.reshape %iright_181 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_184 = tt.reshape %new_idxs_172 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_188 = tt.broadcast %left_idx_187 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_192 = tt.broadcast %right_idx_191 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_193 = tt.reshape %left_idx_188 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_194 = tt.reshape %right_idx_192 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc187) + %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc188) + %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc189) + %cond_198 = arith.andi %eq_196, %cond_197 : tensor<32x16xi1> loc(#loc190) + %cond_199 = arith.ori %cond_195, %cond_198 : tensor<32x16xi1> loc(#loc191) + %cond_200 = arith.extui %cond_199 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_201 = arith.xori %cond_200, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc194) + %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_205 = arith.xori %ret_169, %ret_204 : tensor<32x16xi32> loc(#loc196) + %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc197) + %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<32x16xi32> loc(#loc199) + %y_209 = tt.reshape %ret_205 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_210 = arith.muli %y_209, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_213 = tt.broadcast %ileft_212 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_214 = arith.muli %y_209, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_217 = tt.broadcast %iright_216 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_218 = tt.reshape %ileft_213 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_219 = tt.reshape %iright_217 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_220 = tt.reshape %new_idxs_208 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_224 = tt.broadcast %left_idx_223 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_228 = tt.broadcast %right_idx_227 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_229 = tt.reshape %left_idx_224 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_230 = tt.reshape %right_idx_228 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc187) + %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc188) + %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc189) + %cond_234 = arith.andi %eq_232, %cond_233 : tensor<32x16xi1> loc(#loc190) + %cond_235 = arith.ori %cond_231, %cond_234 : tensor<32x16xi1> loc(#loc191) + %cond_236 = arith.extui %cond_235 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_237 = arith.xori %cond_236, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc194) + %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_241 = arith.xori %ret_205, %ret_240 : tensor<32x16xi32> loc(#loc196) + %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc197) + %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<32x16xi32> loc(#loc199) + %y_245 = tt.reshape %ret_241 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc162) + %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc164) + %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<32x2x8xi32> loc(#loc164) + %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc201) + %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc166) + %ileft_250 = tt.broadcast %ileft_249 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc167) + %iright_251 = arith.muli %y_245, %flip_134 : tensor<32x2x8xi32> loc(#loc168) + %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc203) + %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc170) + %iright_254 = tt.broadcast %iright_253 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc171) + %ileft_255 = tt.reshape %ileft_250 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_256 = tt.reshape %iright_254 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_257 = tt.reshape %new_idxs_244 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc174) + %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<32x2x8xi32> loc(#loc176) + %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc206) + %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc178) + %left_idx_261 = tt.broadcast %left_idx_260 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc179) + %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<32x2x8xi32> loc(#loc181) + %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc209) + %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc183) + %right_idx_265 = tt.broadcast %right_idx_264 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc184) + %left_idx_266 = tt.reshape %left_idx_261 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_267 = tt.reshape %right_idx_265 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc187) + %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc188) + %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc189) + %cond_271 = arith.andi %eq_269, %cond_270 : tensor<32x16xi1> loc(#loc190) + %cond_272 = arith.ori %cond_268, %cond_271 : tensor<32x16xi1> loc(#loc191) + %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc194) + %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_275 = arith.xori %ret_241, %ret_274 : tensor<32x16xi32> loc(#loc196) + %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc197) + %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<32x16xi32> loc(#loc199) + %y_279 = tt.reshape %ret_275 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_283 = tt.broadcast %ileft_282 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_284 = arith.muli %y_279, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_287 = tt.broadcast %iright_286 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_288 = tt.reshape %ileft_283 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_289 = tt.reshape %iright_287 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_290 = tt.reshape %new_idxs_278 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_294 = tt.broadcast %left_idx_293 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_298 = tt.broadcast %right_idx_297 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_299 = tt.reshape %left_idx_294 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_300 = tt.reshape %right_idx_298 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc187) + %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc188) + %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc189) + %cond_304 = arith.andi %eq_302, %cond_303 : tensor<32x16xi1> loc(#loc190) + %cond_305 = arith.ori %cond_301, %cond_304 : tensor<32x16xi1> loc(#loc191) + %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc194) + %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_308 = arith.xori %ret_275, %ret_307 : tensor<32x16xi32> loc(#loc196) + %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc197) + %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<32x16xi32> loc(#loc199) + %y_312 = tt.reshape %ret_308 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_316 = tt.broadcast %ileft_315 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_317 = arith.muli %y_312, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_320 = tt.broadcast %iright_319 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_321 = tt.reshape %ileft_316 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_322 = tt.reshape %iright_320 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_323 = tt.reshape %new_idxs_311 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_327 = tt.broadcast %left_idx_326 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_331 = tt.broadcast %right_idx_330 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_332 = tt.reshape %left_idx_327 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_333 = tt.reshape %right_idx_331 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc187) + %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc188) + %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc189) + %cond_337 = arith.andi %eq_335, %cond_336 : tensor<32x16xi1> loc(#loc190) + %cond_338 = arith.ori %cond_334, %cond_337 : tensor<32x16xi1> loc(#loc191) + %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc194) + %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_341 = arith.xori %ret_308, %ret_340 : tensor<32x16xi32> loc(#loc196) + %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc197) + %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<32x16xi32> loc(#loc199) + %y_345 = tt.reshape %ret_341 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_346 = arith.muli %y_345, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_349 = tt.broadcast %ileft_348 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_350 = arith.muli %y_345, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_353 = tt.broadcast %iright_352 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_354 = tt.reshape %ileft_349 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_355 = tt.reshape %iright_353 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_356 = tt.reshape %new_idxs_344 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_360 = tt.broadcast %left_idx_359 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_364 = tt.broadcast %right_idx_363 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_365 = tt.reshape %left_idx_360 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_366 = tt.reshape %right_idx_364 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc187) + %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc188) + %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc189) + %cond_370 = arith.andi %eq_368, %cond_369 : tensor<32x16xi1> loc(#loc190) + %cond_371 = arith.ori %cond_367, %cond_370 : tensor<32x16xi1> loc(#loc191) + %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc197) + %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<32x16xi32> loc(#loc199) + %tmp7 = arith.extsi %tmp0_19 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc149) + %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc86) + %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))): + %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200) + tt.reduce.return %tmp11_379 : i64 loc(#loc160) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc160) + %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc151) + %tmp14 = arith.trunci %tmp11_376 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc152) + %0 = arith.muli %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc73) + %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc74) + %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc74) + %3 = arith.addi %1, %2 : tensor<32x16xi32> loc(#loc74) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc75) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc75) + tt.store %5, %new_idxs_374, %tmp0_18 : tensor<32x16x!tt.ptr> loc(#loc76) + %6 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc77) + %7 = tt.addptr %6, %xindex_6 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc77) + tt.store %7, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":40:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc85 = loc(callsite(#loc1 at #loc2)) +#loc86 = loc("tmp10"(#loc3)) +#loc87 = loc("tmp0"(#loc4)) +#loc88 = loc("tmp0"(#loc5)) +#loc89 = loc("xmask"(#loc6)) +#loc90 = loc("xoffset"(#loc7)) +#loc91 = loc("xoffset"(#loc8)) +#loc92 = loc("xindex"(#loc9)) +#loc93 = loc("xindex"(#loc10)) +#loc94 = loc("xindex"(#loc11)) +#loc95 = loc("r0_index"(#loc12)) +#loc96 = loc("r0_index"(#loc13)) +#loc97 = loc("x0"(#loc14)) +#loc98 = loc("x1"(#loc15)) +#loc99 = loc("tmp0"(#loc16)) +#loc100 = loc("tmp0"(#loc17)) +#loc101 = loc("tmp0"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp2"(#loc20)) +#loc104 = loc("tmp4"(#loc21)) +#loc105 = loc("flip"(#loc22)) +#loc107 = loc("flip"(#loc24)) +#loc108 = loc("flip"(#loc25)) +#loc109 = loc("flip"(#loc26)) +#loc110 = loc("y"(#loc27)) +#loc111 = loc("left_mask"(#loc29)) +#loc112 = loc("ileft"(#loc30)) +#loc114 = loc("ileft"(#loc34)) +#loc115 = loc("ileft"(#loc35)) +#loc116 = loc("iright"(#loc36)) +#loc118 = loc("iright"(#loc38)) +#loc119 = loc("iright"(#loc39)) +#loc120 = loc("ileft"(#loc40)) +#loc121 = loc("iright"(#loc41)) +#loc122 = loc("y_idx"(#loc42)) +#loc123 = loc("left_idx"(#loc43)) +#loc124 = loc("left_idx"(#loc44)) +#loc125 = loc("input"(#loc45)) +#loc127 = loc("left_idx"(#loc47)) +#loc128 = loc("left_idx"(#loc48)) +#loc129 = loc("right_idx"(#loc49)) +#loc130 = loc("right_idx"(#loc50)) +#loc132 = loc("right_idx"(#loc52)) +#loc133 = loc("right_idx"(#loc53)) +#loc134 = loc("left_idx"(#loc54)) +#loc135 = loc("right_idx"(#loc55)) +#loc136 = loc("cond"(#loc56)) +#loc137 = loc("eq"(#loc57)) +#loc138 = loc("cond"(#loc58)) +#loc139 = loc("cond"(#loc59)) +#loc140 = loc("cond"(#loc60)) +#loc141 = loc("cond"(#loc61)) +#loc142 = loc("cond"(#loc62)) +#loc143 = loc("ret"(#loc63)) +#loc144 = loc("ret"(#loc64)) +#loc145 = loc("ret"(#loc65)) +#loc146 = loc("new_idxs"(#loc66)) +#loc147 = loc("new_idxs"(#loc67)) +#loc148 = loc("new_idxs"(#loc68)) +#loc149 = loc("tmp7"(#loc69)) +#loc151 = loc("tmp11"(#loc71)) +#loc152 = loc("tmp14"(#loc72)) +#loc153 = loc(callsite(#loc105 at #loc106)) +#loc154 = loc(callsite(#loc107 at #loc106)) +#loc155 = loc(callsite(#loc108 at #loc106)) +#loc156 = loc(callsite(#loc109 at #loc106)) +#loc158 = loc("cond"(#loc136)) +#loc159 = loc("eq"(#loc137)) +#loc160 = loc(callsite(#loc31 at #loc150)) +#loc162 = loc(callsite(#loc110 at #loc157)) +#loc163 = loc(callsite(#loc111 at #loc157)) +#loc164 = loc(callsite(#loc112 at #loc157)) +#loc166 = loc(callsite(#loc114 at #loc157)) +#loc167 = loc(callsite(#loc115 at #loc157)) +#loc168 = loc(callsite(#loc116 at #loc157)) +#loc170 = loc(callsite(#loc118 at #loc157)) +#loc171 = loc(callsite(#loc119 at #loc157)) +#loc172 = loc(callsite(#loc120 at #loc157)) +#loc173 = loc(callsite(#loc121 at #loc157)) +#loc174 = loc(callsite(#loc122 at #loc157)) +#loc175 = loc(callsite(#loc123 at #loc157)) +#loc176 = loc(callsite(#loc124 at #loc157)) +#loc178 = loc(callsite(#loc127 at #loc157)) +#loc179 = loc(callsite(#loc128 at #loc157)) +#loc180 = loc(callsite(#loc129 at #loc157)) +#loc181 = loc(callsite(#loc130 at #loc157)) +#loc183 = loc(callsite(#loc132 at #loc157)) +#loc184 = loc(callsite(#loc133 at #loc157)) +#loc185 = loc(callsite(#loc134 at #loc157)) +#loc186 = loc(callsite(#loc135 at #loc157)) +#loc187 = loc(callsite(#loc158 at #loc157)) +#loc188 = loc(callsite(#loc159 at #loc157)) +#loc189 = loc(callsite(#loc138 at #loc157)) +#loc190 = loc(callsite(#loc139 at #loc157)) +#loc191 = loc(callsite(#loc140 at #loc157)) +#loc192 = loc(callsite(#loc141 at #loc157)) +#loc193 = loc(callsite(#loc142 at #loc157)) +#loc194 = loc(callsite(#loc143 at #loc157)) +#loc195 = loc(callsite(#loc144 at #loc157)) +#loc196 = loc(callsite(#loc145 at #loc157)) +#loc197 = loc(callsite(#loc146 at #loc157)) +#loc198 = loc(callsite(#loc147 at #loc157)) +#loc199 = loc(callsite(#loc148 at #loc157)) +#loc200 = loc(callsite(#loc33 at #loc160)) +#loc201 = loc(callsite(#loc31 at #loc165)) +#loc203 = loc(callsite(#loc31 at #loc169)) +#loc205 = loc(callsite(#loc125 at #loc177)) +#loc206 = loc(callsite(#loc31 at #loc177)) +#loc208 = loc(callsite(#loc125 at #loc182)) +#loc209 = loc(callsite(#loc31 at #loc182)) +#loc211 = loc(callsite(#loc33 at #loc201)) +#loc212 = loc(callsite(#loc33 at #loc203)) +#loc213 = loc(callsite(#loc33 at #loc206)) +#loc214 = loc(callsite(#loc33 at #loc209)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2db771001f61a782396106a5ff750ded25f23b08 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..4953675f87fcddfd0cc874471ef0b781a5590fc8 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5559ee464ff9cdfd888024532e0c4dee90745172 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "16ca2c8864609722b876bac87eb220846cb74e2fe4810a46a027c2f3e653aead", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..2f0aaec313d61576853ed7973201bfe4be211c09 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,934 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 2, !dbg !9 + %10 = and i32 %9, 2044, !dbg !9 + %11 = mul i32 %7, 32000, !dbg !10 + %12 = zext nneg i32 %10 to i64, !dbg !11 + br label %13, !dbg !11 + +13: ; preds = %6, %13 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %13 ] + %14 = phi <2 x float> [ zeroinitializer, %6 ], [ %271, %13 ] + %15 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %269, %13 ] + %16 = phi <2 x float> [ zeroinitializer, %6 ], [ %270, %13 ] + %17 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %268, %13 ] + %18 = or disjoint i64 %indvars.iv, %12, !dbg !12 + %19 = icmp samesign ult i64 %18, 32000, !dbg !13 + %20 = trunc nuw nsw i64 %18 to i32, !dbg !14 + %21 = add i32 %11, %20, !dbg !14 + %22 = sext i32 %21 to i64, !dbg !15 + %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !15 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 %19) #6, !dbg !16 + %26 = extractvalue { i32, i32 } %25, 0, !dbg !16 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !16 + %28 = extractvalue { i32, i32 } %25, 1, !dbg !16 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !16 + %30 = fcmp uno <2 x float> %17, zeroinitializer, !dbg !17 + %31 = fcmp uno <2 x float> %15, zeroinitializer, !dbg !17 + %32 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i72 = icmp eq i32 %32, 0, !dbg !21 + %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i74 = icmp eq i32 %33, 0, !dbg !21 + %34 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i78 = icmp eq i32 %35, 0, !dbg !21 + %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i80 = icmp eq i32 %36, 0, !dbg !21 + %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i82 = icmp eq i32 %37, 0, !dbg !21 + %38 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i84 = icmp eq i32 %38, 0, !dbg !21 + %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i88 = icmp eq i32 %40, 0, !dbg !21 + %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i90 = icmp eq i32 %41, 0, !dbg !21 + %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i92 = icmp eq i32 %42, 0, !dbg !21 + %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i94 = icmp eq i32 %43, 0, !dbg !21 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i98 = icmp eq i32 %45, 0, !dbg !21 + %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i100 = icmp eq i32 %46, 0, !dbg !21 + %47 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i102 = icmp eq i32 %47, 0, !dbg !21 + %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i104 = icmp eq i32 %48, 0, !dbg !21 + %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %50 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i108 = icmp eq i32 %50, 0, !dbg !21 + %51 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i110 = icmp eq i32 %51, 0, !dbg !21 + %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i112 = icmp eq i32 %52, 0, !dbg !21 + %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i114 = icmp eq i32 %53, 0, !dbg !21 + %54 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i118 = icmp eq i32 %55, 0, !dbg !21 + %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i120 = icmp eq i32 %56, 0, !dbg !21 + %57 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i122 = icmp eq i32 %57, 0, !dbg !21 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i124 = icmp eq i32 %58, 0, !dbg !21 + %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i128 = icmp eq i32 %60, 0, !dbg !21 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i130 = icmp eq i32 %61, 0, !dbg !21 + %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i132 = icmp eq i32 %62, 0, !dbg !21 + %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i134 = icmp eq i32 %63, 0, !dbg !21 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i138 = icmp eq i32 %65, 0, !dbg !21 + %66 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i140 = icmp eq i32 %66, 0, !dbg !21 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i142 = icmp eq i32 %67, 0, !dbg !21 + %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i144 = icmp eq i32 %68, 0, !dbg !21 + %69 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i148 = icmp eq i32 %70, 0, !dbg !21 + %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i150 = icmp eq i32 %71, 0, !dbg !21 + %72 = fpext <2 x bfloat> %27 to <2 x float>, !dbg !22 + %73 = fcmp ogt <2 x float> %17, %72, !dbg !23 + %74 = or <2 x i1> %30, %73, !dbg !24 + %75 = select <2 x i1> %74, <2 x float> %17, <2 x float> %72, !dbg !25 + %76 = fcmp oeq <2 x float> %75, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop = fsub <2 x float> %17, %75, !dbg !27 + %77 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !27 + %foldExtExtBinop177 = fsub <2 x float> %17, %75, !dbg !27 + %78 = extractelement <2 x float> %foldExtExtBinop177, i64 1, !dbg !27 + %79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %80 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i73 = select i1 %.not.i72, float %80, float %79, !dbg !21 + %81 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i73) #6, !dbg !21 + %82 = tail call float @llvm.nvvm.saturate.f(float %.02.i73) #6, !dbg !21 + %.03.i75 = select i1 %.not1.i74, float %82, float %81, !dbg !21 + %83 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %84 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %86 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i83 = select i1 %.not.i82, float %86, float %85, !dbg !21 + %87 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i83) #6, !dbg !21 + %88 = tail call float @llvm.nvvm.saturate.f(float %.02.i83) #6, !dbg !21 + %.03.i85 = select i1 %.not1.i84, float %88, float %87, !dbg !21 + %89 = insertelement <2 x i32> poison, i32 %34, i64 0, !dbg !21 + %90 = insertelement <2 x i32> %89, i32 %39, i64 1, !dbg !21 + %91 = icmp eq <2 x i32> %90, zeroinitializer, !dbg !21 + %92 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %93 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %94 = insertelement <2 x float> poison, float %84, i64 0, !dbg !21 + %95 = insertelement <2 x float> %94, float %93, i64 1, !dbg !21 + %96 = insertelement <2 x float> poison, float %83, i64 0, !dbg !21 + %97 = insertelement <2 x float> %96, float %92, i64 1, !dbg !21 + %98 = select <2 x i1> %91, <2 x float> %95, <2 x float> %97, !dbg !21 + %99 = extractelement <2 x float> %98, i64 0, !dbg !21 + %100 = fadd float %99, 0xC168000FE0000000, !dbg !21 + %101 = fneg float %100, !dbg !21 + %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %103 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %.0.i79 = select i1 %.not3.i78, float %103, float %102, !dbg !21 + %104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %105 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %.01.i81 = select i1 %.not4.i80, float %105, float %104, !dbg !21 + %106 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i81) #6, !dbg !21 + %107 = extractelement <2 x float> %98, i64 1, !dbg !21 + %108 = fadd float %107, 0xC168000FE0000000, !dbg !21 + %109 = fneg float %108, !dbg !21 + %110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %111 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %.0.i89 = select i1 %.not3.i88, float %111, float %110, !dbg !21 + %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %113 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %.01.i91 = select i1 %.not4.i90, float %113, float %112, !dbg !21 + %114 = bitcast <2 x float> %98 to <2 x i32>, !dbg !21 + %115 = shl <2 x i32> %114, splat (i32 23), !dbg !21 + %116 = bitcast <2 x i32> %115 to <2 x float>, !dbg !21 + %117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i91) #6, !dbg !21 + %118 = insertelement <2 x float> poison, float %106, i64 0, !dbg !21 + %119 = insertelement <2 x float> %118, float %117, i64 1, !dbg !21 + %120 = fmul <2 x float> %119, %116, !dbg !21 + %121 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %120, !dbg !28 + %foldExtExtBinop179 = fsub <2 x float> %72, %75, !dbg !29 + %122 = extractelement <2 x float> %foldExtExtBinop179, i64 0, !dbg !29 + %foldExtExtBinop181 = fsub <2 x float> %72, %75, !dbg !29 + %123 = extractelement <2 x float> %foldExtExtBinop181, i64 1, !dbg !29 + %124 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %125 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i113 = select i1 %.not.i112, float %125, float %124, !dbg !21 + %126 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i113) #6, !dbg !21 + %127 = tail call float @llvm.nvvm.saturate.f(float %.02.i113) #6, !dbg !21 + %.03.i115 = select i1 %.not1.i114, float %127, float %126, !dbg !21 + %128 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %129 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %131 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i123 = select i1 %.not.i122, float %131, float %130, !dbg !21 + %132 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i123) #6, !dbg !21 + %133 = tail call float @llvm.nvvm.saturate.f(float %.02.i123) #6, !dbg !21 + %.03.i125 = select i1 %.not1.i124, float %133, float %132, !dbg !21 + %134 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !21 + %135 = insertelement <2 x i32> %134, i32 %59, i64 1, !dbg !21 + %136 = icmp eq <2 x i32> %135, zeroinitializer, !dbg !21 + %137 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %138 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %139 = insertelement <2 x float> poison, float %129, i64 0, !dbg !21 + %140 = insertelement <2 x float> %139, float %138, i64 1, !dbg !21 + %141 = insertelement <2 x float> poison, float %128, i64 0, !dbg !21 + %142 = insertelement <2 x float> %141, float %137, i64 1, !dbg !21 + %143 = select <2 x i1> %136, <2 x float> %140, <2 x float> %142, !dbg !21 + %144 = extractelement <2 x float> %143, i64 0, !dbg !21 + %145 = fadd float %144, 0xC168000FE0000000, !dbg !21 + %146 = fneg float %145, !dbg !21 + %147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %148 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %.0.i119 = select i1 %.not3.i118, float %148, float %147, !dbg !21 + %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %150 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %.01.i121 = select i1 %.not4.i120, float %150, float %149, !dbg !21 + %151 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i121) #6, !dbg !21 + %152 = extractelement <2 x float> %143, i64 1, !dbg !21 + %153 = fadd float %152, 0xC168000FE0000000, !dbg !21 + %154 = fneg float %153, !dbg !21 + %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %156 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %.0.i129 = select i1 %.not3.i128, float %156, float %155, !dbg !21 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %.01.i131 = select i1 %.not4.i130, float %158, float %157, !dbg !21 + %159 = bitcast <2 x float> %143 to <2 x i32>, !dbg !21 + %160 = shl <2 x i32> %159, splat (i32 23), !dbg !21 + %161 = bitcast <2 x i32> %160 to <2 x float>, !dbg !21 + %162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i131) #6, !dbg !21 + %163 = insertelement <2 x float> poison, float %151, i64 0, !dbg !21 + %164 = insertelement <2 x float> %163, float %162, i64 1, !dbg !21 + %165 = fmul <2 x float> %164, %161, !dbg !21 + %166 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %165, !dbg !30 + %167 = fmul <2 x float> %16, %121, !dbg !31 + %168 = fadd <2 x float> %167, %166, !dbg !32 + %169 = fpext <2 x bfloat> %29 to <2 x float>, !dbg !22 + %170 = fcmp ogt <2 x float> %15, %169, !dbg !23 + %171 = or <2 x i1> %31, %170, !dbg !24 + %172 = select <2 x i1> %171, <2 x float> %15, <2 x float> %169, !dbg !25 + %173 = fcmp oeq <2 x float> %172, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop183 = fsub <2 x float> %15, %172, !dbg !27 + %174 = extractelement <2 x float> %foldExtExtBinop183, i64 0, !dbg !27 + %foldExtExtBinop185 = fsub <2 x float> %15, %172, !dbg !27 + %175 = extractelement <2 x float> %foldExtExtBinop185, i64 1, !dbg !27 + %176 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %177 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i93 = select i1 %.not.i92, float %177, float %176, !dbg !21 + %178 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i93) #6, !dbg !21 + %179 = tail call float @llvm.nvvm.saturate.f(float %.02.i93) #6, !dbg !21 + %.03.i95 = select i1 %.not1.i94, float %179, float %178, !dbg !21 + %180 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %181 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %182 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %183 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i103 = select i1 %.not.i102, float %183, float %182, !dbg !21 + %184 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i103) #6, !dbg !21 + %185 = tail call float @llvm.nvvm.saturate.f(float %.02.i103) #6, !dbg !21 + %.03.i105 = select i1 %.not1.i104, float %185, float %184, !dbg !21 + %186 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !21 + %187 = insertelement <2 x i32> %186, i32 %49, i64 1, !dbg !21 + %188 = icmp eq <2 x i32> %187, zeroinitializer, !dbg !21 + %189 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %190 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %191 = insertelement <2 x float> poison, float %181, i64 0, !dbg !21 + %192 = insertelement <2 x float> %191, float %190, i64 1, !dbg !21 + %193 = insertelement <2 x float> poison, float %180, i64 0, !dbg !21 + %194 = insertelement <2 x float> %193, float %189, i64 1, !dbg !21 + %195 = select <2 x i1> %188, <2 x float> %192, <2 x float> %194, !dbg !21 + %196 = extractelement <2 x float> %195, i64 0, !dbg !21 + %197 = fadd float %196, 0xC168000FE0000000, !dbg !21 + %198 = fneg float %197, !dbg !21 + %199 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %200 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %.0.i99 = select i1 %.not3.i98, float %200, float %199, !dbg !21 + %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %202 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %.01.i101 = select i1 %.not4.i100, float %202, float %201, !dbg !21 + %203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i101) #6, !dbg !21 + %204 = extractelement <2 x float> %195, i64 1, !dbg !21 + %205 = fadd float %204, 0xC168000FE0000000, !dbg !21 + %206 = fneg float %205, !dbg !21 + %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %208 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %.0.i109 = select i1 %.not3.i108, float %208, float %207, !dbg !21 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %.01.i111 = select i1 %.not4.i110, float %210, float %209, !dbg !21 + %211 = bitcast <2 x float> %195 to <2 x i32>, !dbg !21 + %212 = shl <2 x i32> %211, splat (i32 23), !dbg !21 + %213 = bitcast <2 x i32> %212 to <2 x float>, !dbg !21 + %214 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i111) #6, !dbg !21 + %215 = insertelement <2 x float> poison, float %203, i64 0, !dbg !21 + %216 = insertelement <2 x float> %215, float %214, i64 1, !dbg !21 + %217 = fmul <2 x float> %216, %213, !dbg !21 + %218 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %217, !dbg !28 + %foldExtExtBinop187 = fsub <2 x float> %169, %172, !dbg !29 + %219 = extractelement <2 x float> %foldExtExtBinop187, i64 0, !dbg !29 + %foldExtExtBinop189 = fsub <2 x float> %169, %172, !dbg !29 + %220 = extractelement <2 x float> %foldExtExtBinop189, i64 1, !dbg !29 + %221 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %222 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i133 = select i1 %.not.i132, float %222, float %221, !dbg !21 + %223 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i133) #6, !dbg !21 + %224 = tail call float @llvm.nvvm.saturate.f(float %.02.i133) #6, !dbg !21 + %.03.i135 = select i1 %.not1.i134, float %224, float %223, !dbg !21 + %225 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %226 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %228 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i143 = select i1 %.not.i142, float %228, float %227, !dbg !21 + %229 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i143) #6, !dbg !21 + %230 = tail call float @llvm.nvvm.saturate.f(float %.02.i143) #6, !dbg !21 + %.03.i145 = select i1 %.not1.i144, float %230, float %229, !dbg !21 + %231 = insertelement <2 x i32> poison, i32 %64, i64 0, !dbg !21 + %232 = insertelement <2 x i32> %231, i32 %69, i64 1, !dbg !21 + %233 = icmp eq <2 x i32> %232, zeroinitializer, !dbg !21 + %234 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %235 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %236 = insertelement <2 x float> poison, float %226, i64 0, !dbg !21 + %237 = insertelement <2 x float> %236, float %235, i64 1, !dbg !21 + %238 = insertelement <2 x float> poison, float %225, i64 0, !dbg !21 + %239 = insertelement <2 x float> %238, float %234, i64 1, !dbg !21 + %240 = select <2 x i1> %233, <2 x float> %237, <2 x float> %239, !dbg !21 + %241 = extractelement <2 x float> %240, i64 0, !dbg !21 + %242 = fadd float %241, 0xC168000FE0000000, !dbg !21 + %243 = fneg float %242, !dbg !21 + %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %245 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %.0.i139 = select i1 %.not3.i138, float %245, float %244, !dbg !21 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %.01.i141 = select i1 %.not4.i140, float %247, float %246, !dbg !21 + %248 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i141) #6, !dbg !21 + %249 = extractelement <2 x float> %240, i64 1, !dbg !21 + %250 = fadd float %249, 0xC168000FE0000000, !dbg !21 + %251 = fneg float %250, !dbg !21 + %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %253 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %.0.i149 = select i1 %.not3.i148, float %253, float %252, !dbg !21 + %254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %255 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %.01.i151 = select i1 %.not4.i150, float %255, float %254, !dbg !21 + %256 = bitcast <2 x float> %240 to <2 x i32>, !dbg !21 + %257 = shl <2 x i32> %256, splat (i32 23), !dbg !21 + %258 = bitcast <2 x i32> %257 to <2 x float>, !dbg !21 + %259 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i151) #6, !dbg !21 + %260 = insertelement <2 x float> poison, float %248, i64 0, !dbg !21 + %261 = insertelement <2 x float> %260, float %259, i64 1, !dbg !21 + %262 = fmul <2 x float> %261, %258, !dbg !21 + %263 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %262, !dbg !30 + %264 = fmul <2 x float> %14, %218, !dbg !31 + %265 = fadd <2 x float> %264, %263, !dbg !32 + %266 = insertelement <2 x i1> poison, i1 %19, i64 0, !dbg !33 + %267 = shufflevector <2 x i1> %266, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %268 = select <2 x i1> %267, <2 x float> %75, <2 x float> %17, !dbg !33 + %269 = select <2 x i1> %267, <2 x float> %172, <2 x float> %15, !dbg !33 + %270 = select <2 x i1> %267, <2 x float> %168, <2 x float> %16, !dbg !34 + %271 = select <2 x i1> %267, <2 x float> %265, <2 x float> %14, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !11 + %272 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !11 + br i1 %272, label %13, label %273, !dbg !11 + +273: ; preds = %13 + %274 = and i32 %8, 31, !dbg !9 + %275 = lshr i32 %8, 5, !dbg !9 + %276 = extractelement <2 x float> %268, i64 0, !dbg !35 + %277 = extractelement <2 x float> %268, i64 1, !dbg !35 + %278 = fcmp ogt float %276, %277, !dbg !35 + %279 = fcmp uno float %276, 0.000000e+00, !dbg !37 + %280 = or i1 %278, %279, !dbg !38 + %281 = select i1 %280, float %276, float %277, !dbg !39 + %282 = extractelement <2 x float> %269, i64 0, !dbg !35 + %283 = fcmp ogt float %281, %282, !dbg !35 + %284 = fcmp uno float %281, 0.000000e+00, !dbg !37 + %285 = or i1 %283, %284, !dbg !38 + %286 = select i1 %285, float %281, float %282, !dbg !39 + %287 = extractelement <2 x float> %269, i64 1, !dbg !35 + %288 = fcmp ogt float %286, %287, !dbg !35 + %289 = fcmp uno float %286, 0.000000e+00, !dbg !37 + %290 = or i1 %288, %289, !dbg !38 + %291 = select i1 %290, float %286, float %287, !dbg !39 + %292 = bitcast float %291 to i32, !dbg !40 + %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !40 + %294 = bitcast i32 %293 to float, !dbg !40 + %295 = fcmp ogt float %291, %294, !dbg !35 + %296 = fcmp uno float %291, 0.000000e+00, !dbg !37 + %297 = or i1 %296, %295, !dbg !38 + %298 = select i1 %297, float %291, float %294, !dbg !39 + %299 = bitcast float %298 to i32, !dbg !40 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !40 + %301 = bitcast i32 %300 to float, !dbg !40 + %302 = fcmp ogt float %298, %301, !dbg !35 + %303 = fcmp uno float %298, 0.000000e+00, !dbg !37 + %304 = or i1 %302, %303, !dbg !38 + %305 = select i1 %304, float %298, float %301, !dbg !39 + %306 = bitcast float %305 to i32, !dbg !40 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 4, i32 31), !dbg !40 + %308 = bitcast i32 %307 to float, !dbg !40 + %309 = fcmp ogt float %305, %308, !dbg !35 + %310 = fcmp uno float %305, 0.000000e+00, !dbg !37 + %311 = or i1 %309, %310, !dbg !38 + %312 = select i1 %311, float %305, float %308, !dbg !39 + %313 = bitcast float %312 to i32, !dbg !40 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 2, i32 31), !dbg !40 + %315 = bitcast i32 %314 to float, !dbg !40 + %316 = fcmp ogt float %312, %315, !dbg !35 + %317 = fcmp uno float %312, 0.000000e+00, !dbg !37 + %318 = or i1 %316, %317, !dbg !38 + %319 = select i1 %318, float %312, float %315, !dbg !39 + %320 = bitcast float %319 to i32, !dbg !40 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !40 + %322 = bitcast i32 %321 to float, !dbg !40 + %323 = fcmp ogt float %319, %322, !dbg !35 + %324 = fcmp uno float %319, 0.000000e+00, !dbg !37 + %325 = or i1 %323, %324, !dbg !38 + %326 = and i32 %275, 15, !dbg !40 + %327 = icmp eq i32 %274, 0, !dbg !40 + %328 = getelementptr float, ptr addrspace(3) @global_smem, i32 %326, !dbg !40 + %329 = select i1 %325, i32 %320, i32 %321, !dbg !39 + %330 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %330, i1 %327) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %331 = icmp samesign ult i32 %8, 16, !dbg !40 + %332 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !40 + %333 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !40 + %334 = bitcast i32 %333 to float, !dbg !40 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 8, i32 31), !dbg !40 + %336 = bitcast i32 %335 to float, !dbg !40 + %337 = fcmp ogt float %334, %336, !dbg !35 + %338 = fcmp uno float %334, 0.000000e+00, !dbg !37 + %339 = or i1 %338, %337, !dbg !38 + %340 = select i1 %339, float %334, float %336, !dbg !39 + %341 = bitcast float %340 to i32, !dbg !40 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %341, i32 4, i32 31), !dbg !40 + %343 = bitcast i32 %342 to float, !dbg !40 + %344 = fcmp ogt float %340, %343, !dbg !35 + %345 = fcmp uno float %340, 0.000000e+00, !dbg !37 + %346 = or i1 %344, %345, !dbg !38 + %347 = select i1 %346, float %340, float %343, !dbg !39 + %348 = bitcast float %347 to i32, !dbg !40 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !40 + %350 = bitcast i32 %349 to float, !dbg !40 + %351 = fcmp ogt float %347, %350, !dbg !35 + %352 = fcmp uno float %347, 0.000000e+00, !dbg !37 + %353 = or i1 %351, %352, !dbg !38 + %354 = select i1 %353, float %347, float %350, !dbg !39 + %355 = bitcast float %354 to i32, !dbg !40 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %355, i32 1, i32 31), !dbg !40 + %357 = bitcast i32 %356 to float, !dbg !40 + %358 = fcmp ogt float %354, %357, !dbg !35 + %359 = fcmp uno float %354, 0.000000e+00, !dbg !37 + %360 = or i1 %358, %359, !dbg !38 + %361 = icmp eq i32 %8, 0, !dbg !40 + %362 = select i1 %360, i32 %355, i32 %356, !dbg !39 + %363 = insertelement <1 x i32> poison, i32 %362, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %363, i1 %361) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %364 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !40 + %365 = fcmp oeq float %364, 0xFFF0000000000000, !dbg !41 + %366 = fsub float %276, %364, !dbg !42 + %367 = fsub float %277, %364, !dbg !42 + %368 = fsub float %282, %364, !dbg !42 + %369 = fsub float %287, %364, !dbg !42 + %370 = select i1 %365, float 0.000000e+00, float %366, !dbg !43 + %371 = select i1 %365, float 0.000000e+00, float %367, !dbg !43 + %372 = select i1 %365, float 0.000000e+00, float %368, !dbg !43 + %373 = select i1 %365, float 0.000000e+00, float %369, !dbg !43 + %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %374, 0, !dbg !44 + %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %376 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i = select i1 %.not.i, float %376, float %375, !dbg !44 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %377, 0, !dbg !44 + %378 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !44 + %379 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !44 + %.03.i = select i1 %.not1.i, float %379, float %378, !dbg !44 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %381 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %382 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %383, 0, !dbg !44 + %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %384, 0, !dbg !44 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i2 = icmp eq i32 %385, 0, !dbg !44 + %386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %387 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i3 = select i1 %.not.i2, float %387, float %386, !dbg !44 + %388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i4 = icmp eq i32 %388, 0, !dbg !44 + %389 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i3) #6, !dbg !44 + %390 = tail call float @llvm.nvvm.saturate.f(float %.02.i3) #6, !dbg !44 + %.03.i5 = select i1 %.not1.i4, float %390, float %389, !dbg !44 + %391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %392 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %393 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i8 = icmp eq i32 %394, 0, !dbg !44 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i10 = icmp eq i32 %395, 0, !dbg !44 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i12 = icmp eq i32 %396, 0, !dbg !44 + %397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %398 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i13 = select i1 %.not.i12, float %398, float %397, !dbg !44 + %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i14 = icmp eq i32 %399, 0, !dbg !44 + %400 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i13) #6, !dbg !44 + %401 = tail call float @llvm.nvvm.saturate.f(float %.02.i13) #6, !dbg !44 + %.03.i15 = select i1 %.not1.i14, float %401, float %400, !dbg !44 + %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %403 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %404 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i18 = icmp eq i32 %405, 0, !dbg !44 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i20 = icmp eq i32 %406, 0, !dbg !44 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i22 = icmp eq i32 %407, 0, !dbg !44 + %408 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %409 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i23 = select i1 %.not.i22, float %409, float %408, !dbg !44 + %410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i24 = icmp eq i32 %410, 0, !dbg !44 + %411 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i23) #6, !dbg !44 + %412 = tail call float @llvm.nvvm.saturate.f(float %.02.i23) #6, !dbg !44 + %.03.i25 = select i1 %.not1.i24, float %412, float %411, !dbg !44 + %413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %414 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %415 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i28 = icmp eq i32 %416, 0, !dbg !44 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i30 = icmp eq i32 %417, 0, !dbg !44 + %418 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !44 + %419 = insertelement <2 x i32> %418, i32 %391, i64 1, !dbg !44 + %420 = icmp eq <2 x i32> %419, zeroinitializer, !dbg !44 + %421 = insertelement <2 x float> poison, float %382, i64 0, !dbg !44 + %422 = insertelement <2 x float> %421, float %393, i64 1, !dbg !44 + %423 = insertelement <2 x float> poison, float %381, i64 0, !dbg !44 + %424 = insertelement <2 x float> %423, float %392, i64 1, !dbg !44 + %425 = select <2 x i1> %420, <2 x float> %422, <2 x float> %424, !dbg !44 + %426 = extractelement <2 x float> %425, i64 0, !dbg !44 + %427 = fadd float %426, 0xC168000FE0000000, !dbg !44 + %428 = fneg float %427, !dbg !44 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %.0.i = select i1 %.not3.i, float %430, float %429, !dbg !44 + %431 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %432 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %.01.i = select i1 %.not4.i, float %432, float %431, !dbg !44 + %433 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !44 + %434 = extractelement <2 x float> %425, i64 1, !dbg !44 + %435 = fadd float %434, 0xC168000FE0000000, !dbg !44 + %436 = fneg float %435, !dbg !44 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %.0.i9 = select i1 %.not3.i8, float %438, float %437, !dbg !44 + %439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %440 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %.01.i11 = select i1 %.not4.i10, float %440, float %439, !dbg !44 + %441 = bitcast <2 x float> %425 to <2 x i32>, !dbg !44 + %442 = shl <2 x i32> %441, splat (i32 23), !dbg !44 + %443 = bitcast <2 x i32> %442 to <2 x float>, !dbg !44 + %444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i11) #6, !dbg !44 + %445 = insertelement <2 x float> poison, float %433, i64 0, !dbg !44 + %446 = insertelement <2 x float> %445, float %444, i64 1, !dbg !44 + %447 = fmul <2 x float> %446, %443, !dbg !44 + %448 = fmul <2 x float> %270, %447, !dbg !45 + %449 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !44 + %450 = insertelement <2 x i32> %449, i32 %413, i64 1, !dbg !44 + %451 = icmp eq <2 x i32> %450, zeroinitializer, !dbg !44 + %452 = insertelement <2 x float> poison, float %404, i64 0, !dbg !44 + %453 = insertelement <2 x float> %452, float %415, i64 1, !dbg !44 + %454 = insertelement <2 x float> poison, float %403, i64 0, !dbg !44 + %455 = insertelement <2 x float> %454, float %414, i64 1, !dbg !44 + %456 = select <2 x i1> %451, <2 x float> %453, <2 x float> %455, !dbg !44 + %457 = extractelement <2 x float> %456, i64 0, !dbg !44 + %458 = fadd float %457, 0xC168000FE0000000, !dbg !44 + %459 = fneg float %458, !dbg !44 + %460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %461 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %.0.i19 = select i1 %.not3.i18, float %461, float %460, !dbg !44 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %.01.i21 = select i1 %.not4.i20, float %463, float %462, !dbg !44 + %464 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i21) #6, !dbg !44 + %465 = extractelement <2 x float> %456, i64 1, !dbg !44 + %466 = fadd float %465, 0xC168000FE0000000, !dbg !44 + %467 = fneg float %466, !dbg !44 + %468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %469 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %.0.i29 = select i1 %.not3.i28, float %469, float %468, !dbg !44 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %.01.i31 = select i1 %.not4.i30, float %471, float %470, !dbg !44 + %472 = bitcast <2 x float> %456 to <2 x i32>, !dbg !44 + %473 = shl <2 x i32> %472, splat (i32 23), !dbg !44 + %474 = bitcast <2 x i32> %473 to <2 x float>, !dbg !44 + %475 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i31) #6, !dbg !44 + %476 = insertelement <2 x float> poison, float %464, i64 0, !dbg !44 + %477 = insertelement <2 x float> %476, float %475, i64 1, !dbg !44 + %478 = fmul <2 x float> %477, %474, !dbg !44 + %479 = fmul <2 x float> %271, %478, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %shift = shufflevector <2 x float> %448, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop191 = fadd <2 x float> %448, %shift, !dbg !49 + %foldExtExtBinop193 = fadd <2 x float> %foldExtExtBinop191, %479, !dbg !49 + %shift195 = shufflevector <2 x float> %479, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop196 = fadd <2 x float> %foldExtExtBinop193, %shift195, !dbg !49 + %480 = extractelement <2 x float> %foldExtExtBinop196, i64 0, !dbg !49 + %481 = bitcast float %480 to i32, !dbg !46 + %482 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %481, i32 16, i32 31), !dbg !46 + %483 = bitcast i32 %482 to float, !dbg !46 + %484 = fadd float %480, %483, !dbg !49 + %485 = bitcast float %484 to i32, !dbg !46 + %486 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %485, i32 8, i32 31), !dbg !46 + %487 = bitcast i32 %486 to float, !dbg !46 + %488 = fadd float %484, %487, !dbg !49 + %489 = bitcast float %488 to i32, !dbg !46 + %490 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 4, i32 31), !dbg !46 + %491 = bitcast i32 %490 to float, !dbg !46 + %492 = fadd float %488, %491, !dbg !49 + %493 = bitcast float %492 to i32, !dbg !46 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 2, i32 31), !dbg !46 + %495 = bitcast i32 %494 to float, !dbg !46 + %496 = fadd float %492, %495, !dbg !49 + %497 = bitcast float %496 to i32, !dbg !46 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 1, i32 31), !dbg !46 + %499 = bitcast i32 %498 to float, !dbg !46 + %500 = fadd float %496, %499, !dbg !49 + %501 = bitcast float %500 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %501, i1 %327) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %502 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !46 + %503 = bitcast i32 %502 to float, !dbg !46 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %502, i32 8, i32 31), !dbg !46 + %505 = bitcast i32 %504 to float, !dbg !46 + %506 = fadd float %503, %505, !dbg !49 + %507 = bitcast float %506 to i32, !dbg !46 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 4, i32 31), !dbg !46 + %509 = bitcast i32 %508 to float, !dbg !46 + %510 = fadd float %506, %509, !dbg !49 + %511 = bitcast float %510 to i32, !dbg !46 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 2, i32 31), !dbg !46 + %513 = bitcast i32 %512 to float, !dbg !46 + %514 = fadd float %510, %513, !dbg !49 + %515 = bitcast float %514 to i32, !dbg !46 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !46 + %517 = bitcast i32 %516 to float, !dbg !46 + %518 = fadd float %514, %517, !dbg !49 + %519 = bitcast float %518 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %519, i1 %361) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %520 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !46 + br label %521, !dbg !50 + +521: ; preds = %273, %521 + %indvars.iv160 = phi i64 [ 0, %273 ], [ %indvars.iv.next161, %521 ] + %522 = or disjoint i64 %indvars.iv160, %12, !dbg !51 + %523 = icmp samesign ult i64 %522, 32000, !dbg !52 + %524 = trunc nuw nsw i64 %522 to i32, !dbg !53 + %525 = add i32 %11, %524, !dbg !53 + %526 = sext i32 %525 to i64, !dbg !54 + %527 = getelementptr bfloat, ptr addrspace(1) %0, i64 %526, !dbg !54 + %528 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %529 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %527, i64 %528, i1 %523) #6, !dbg !55 + %530 = extractvalue { i32, i32 } %529, 0, !dbg !55 + %531 = bitcast i32 %530 to <2 x bfloat>, !dbg !55 + %532 = extractvalue { i32, i32 } %529, 1, !dbg !55 + %533 = bitcast i32 %532 to <2 x bfloat>, !dbg !55 + %534 = extractelement <2 x bfloat> %531, i64 0, !dbg !55 + %535 = extractelement <2 x bfloat> %531, i64 1, !dbg !55 + %536 = extractelement <2 x bfloat> %533, i64 0, !dbg !55 + %537 = extractelement <2 x bfloat> %533, i64 1, !dbg !55 + %538 = fpext bfloat %534 to float, !dbg !56 + %539 = fpext bfloat %535 to float, !dbg !56 + %540 = fpext bfloat %536 to float, !dbg !56 + %541 = fpext bfloat %537 to float, !dbg !56 + %542 = fsub float %538, %364, !dbg !57 + %543 = fsub float %539, %364, !dbg !57 + %544 = fsub float %540, %364, !dbg !57 + %545 = fsub float %541, %364, !dbg !57 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i32 = icmp eq i32 %546, 0, !dbg !58 + %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %548 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i33 = select i1 %.not.i32, float %548, float %547, !dbg !58 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i34 = icmp eq i32 %549, 0, !dbg !58 + %550 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i33) #6, !dbg !58 + %551 = tail call float @llvm.nvvm.saturate.f(float %.02.i33) #6, !dbg !58 + %.03.i35 = select i1 %.not1.i34, float %551, float %550, !dbg !58 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i36 = icmp eq i32 %552, 0, !dbg !58 + %553 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %554 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i37 = select i1 %.not2.i36, float %554, float %553, !dbg !58 + %555 = fadd float %.04.i37, 0xC168000FE0000000, !dbg !58 + %556 = fneg float %555, !dbg !58 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i38 = icmp eq i32 %557, 0, !dbg !58 + %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %559 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %.0.i39 = select i1 %.not3.i38, float %559, float %558, !dbg !58 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i40 = icmp eq i32 %560, 0, !dbg !58 + %561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %562 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %.01.i41 = select i1 %.not4.i40, float %562, float %561, !dbg !58 + %563 = bitcast float %.04.i37 to i32, !dbg !58 + %564 = shl i32 %563, 23, !dbg !58 + %565 = bitcast i32 %564 to float, !dbg !58 + %566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i41) #6, !dbg !58 + %567 = fmul float %566, %565, !dbg !58 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i42 = icmp eq i32 %568, 0, !dbg !58 + %569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %570 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i43 = select i1 %.not.i42, float %570, float %569, !dbg !58 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i44 = icmp eq i32 %571, 0, !dbg !58 + %572 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i43) #6, !dbg !58 + %573 = tail call float @llvm.nvvm.saturate.f(float %.02.i43) #6, !dbg !58 + %.03.i45 = select i1 %.not1.i44, float %573, float %572, !dbg !58 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i46 = icmp eq i32 %574, 0, !dbg !58 + %575 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %576 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i47 = select i1 %.not2.i46, float %576, float %575, !dbg !58 + %577 = fadd float %.04.i47, 0xC168000FE0000000, !dbg !58 + %578 = fneg float %577, !dbg !58 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i48 = icmp eq i32 %579, 0, !dbg !58 + %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %581 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %.0.i49 = select i1 %.not3.i48, float %581, float %580, !dbg !58 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i50 = icmp eq i32 %582, 0, !dbg !58 + %583 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %584 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %.01.i51 = select i1 %.not4.i50, float %584, float %583, !dbg !58 + %585 = bitcast float %.04.i47 to i32, !dbg !58 + %586 = shl i32 %585, 23, !dbg !58 + %587 = bitcast i32 %586 to float, !dbg !58 + %588 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i51) #6, !dbg !58 + %589 = fmul float %588, %587, !dbg !58 + %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i52 = icmp eq i32 %590, 0, !dbg !58 + %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %592 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i53 = select i1 %.not.i52, float %592, float %591, !dbg !58 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i54 = icmp eq i32 %593, 0, !dbg !58 + %594 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i53) #6, !dbg !58 + %595 = tail call float @llvm.nvvm.saturate.f(float %.02.i53) #6, !dbg !58 + %.03.i55 = select i1 %.not1.i54, float %595, float %594, !dbg !58 + %596 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i56 = icmp eq i32 %596, 0, !dbg !58 + %597 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %598 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i57 = select i1 %.not2.i56, float %598, float %597, !dbg !58 + %599 = fadd float %.04.i57, 0xC168000FE0000000, !dbg !58 + %600 = fneg float %599, !dbg !58 + %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i58 = icmp eq i32 %601, 0, !dbg !58 + %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %603 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %.0.i59 = select i1 %.not3.i58, float %603, float %602, !dbg !58 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i60 = icmp eq i32 %604, 0, !dbg !58 + %605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %606 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %.01.i61 = select i1 %.not4.i60, float %606, float %605, !dbg !58 + %607 = bitcast float %.04.i57 to i32, !dbg !58 + %608 = shl i32 %607, 23, !dbg !58 + %609 = bitcast i32 %608 to float, !dbg !58 + %610 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i61) #6, !dbg !58 + %611 = fmul float %610, %609, !dbg !58 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i62 = icmp eq i32 %612, 0, !dbg !58 + %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %614 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i63 = select i1 %.not.i62, float %614, float %613, !dbg !58 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i64 = icmp eq i32 %615, 0, !dbg !58 + %616 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i63) #6, !dbg !58 + %617 = tail call float @llvm.nvvm.saturate.f(float %.02.i63) #6, !dbg !58 + %.03.i65 = select i1 %.not1.i64, float %617, float %616, !dbg !58 + %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i66 = icmp eq i32 %618, 0, !dbg !58 + %619 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %620 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i67 = select i1 %.not2.i66, float %620, float %619, !dbg !58 + %621 = fadd float %.04.i67, 0xC168000FE0000000, !dbg !58 + %622 = fneg float %621, !dbg !58 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i68 = icmp eq i32 %623, 0, !dbg !58 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %.0.i69 = select i1 %.not3.i68, float %625, float %624, !dbg !58 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i70 = icmp eq i32 %626, 0, !dbg !58 + %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %628 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %.01.i71 = select i1 %.not4.i70, float %628, float %627, !dbg !58 + %629 = bitcast float %.04.i67 to i32, !dbg !58 + %630 = shl i32 %629, 23, !dbg !58 + %631 = bitcast i32 %630 to float, !dbg !58 + %632 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i71) #6, !dbg !58 + %633 = fmul float %632, %631, !dbg !58 + %634 = tail call float @llvm.nvvm.div.full(float %567, float %520), !dbg !59 + %635 = tail call float @llvm.nvvm.div.full(float %589, float %520), !dbg !59 + %636 = tail call float @llvm.nvvm.div.full(float %611, float %520), !dbg !59 + %637 = tail call float @llvm.nvvm.div.full(float %633, float %520), !dbg !59 + %638 = getelementptr float, ptr addrspace(1) %1, i64 %526, !dbg !60 + %639 = bitcast float %634 to i32, !dbg !61 + %640 = bitcast float %635 to i32, !dbg !61 + %641 = bitcast float %636 to i32, !dbg !61 + %642 = bitcast float %637 to i32, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %639, i32 %640, i32 %641, i32 %642, ptr addrspace(1) %638, i1 %523) #6, !dbg !61 + %indvars.iv.next161 = add nuw nsw i64 %indvars.iv160, 2048, !dbg !50 + %643 = icmp samesign ult i64 %indvars.iv160, 29952, !dbg !50 + br i1 %643, label %521, label %644, !dbg !50 + +644: ; preds = %521 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 37, column: 47, scope: !5) +!11 = !DILocation(line: 31, column: 40, scope: !5) +!12 = !DILocation(line: 32, column: 31, scope: !5) +!13 = !DILocation(line: 33, column: 29, scope: !5) +!14 = !DILocation(line: 37, column: 41, scope: !5) +!15 = !DILocation(line: 37, column: 34, scope: !5) +!16 = !DILocation(line: 37, column: 52, scope: !5) +!17 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 40, scope: !5) +!21 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 37, column: 105, scope: !5) +!23 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 196, column: 19, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 196, column: 53, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 196, column: 39, scope: !18, inlinedAt: !20) +!29 = !DILocation(line: 199, column: 53, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 199, column: 39, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 205, column: 24, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 205, column: 36, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 45, column: 54, scope: !5) +!34 = !DILocation(line: 46, column: 54, scope: !5) +!35 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !36) +!36 = !DILocation(line: 49, column: 33, scope: !5) +!37 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !36) +!38 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !36) +!39 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !36) +!40 = !DILocation(line: 123, column: 29, scope: !18, inlinedAt: !36) +!41 = !DILocation(line: 180, column: 40, scope: !18, inlinedAt: !36) +!42 = !DILocation(line: 180, column: 68, scope: !18, inlinedAt: !36) +!43 = !DILocation(line: 180, column: 58, scope: !18, inlinedAt: !36) +!44 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !36) +!45 = !DILocation(line: 181, column: 31, scope: !18, inlinedAt: !36) +!46 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !36) +!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !36) +!50 = !DILocation(line: 52, column: 40, scope: !5) +!51 = !DILocation(line: 53, column: 31, scope: !5) +!52 = !DILocation(line: 54, column: 29, scope: !5) +!53 = !DILocation(line: 58, column: 41, scope: !5) +!54 = !DILocation(line: 58, column: 34, scope: !5) +!55 = !DILocation(line: 58, column: 52, scope: !5) +!56 = !DILocation(line: 58, column: 106, scope: !5) +!57 = !DILocation(line: 60, column: 22, scope: !5) +!58 = !DILocation(line: 61, column: 29, scope: !5) +!59 = !DILocation(line: 62, column: 23, scope: !5) +!60 = !DILocation(line: 63, column: 29, scope: !5) +!61 = !DILocation(line: 63, column: 53, scope: !5) +!62 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..036a0e66eb6dde09dcb994b52ebdc62ff43d855a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,921 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<49>; + .reg .b16 %rs<9>; + .reg .b32 %r<354>; + .reg .b64 %rd<68>; + .loc 1 18 0 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd15, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 26 37 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:26:37 + mov.u32 %r1, %tid.x; + shl.b32 %r5, %r1, 2; + and.b32 %r6, %r5, 2044; + .loc 1 31 40 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:31:40 + cvt.u64.u32 %rd1, %r6; + mad.lo.s32 %r7, %r4, 32000, %r6; + cvt.u64.u32 %rd2, %r7; + mov.b32 %r8, 0fFF800000; + mov.b64 %rd64, {%r8, %r8}; + mov.b32 %r9, 0f00000000; + mov.b64 %rd63, {%r9, %r9}; + mov.b64 %rd62, 0; + mov.b64 %rd65, %rd63; + mov.b64 %rd66, %rd64; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 33 29 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:33:29 + add.s64 %rd23, %rd1, %rd62; + setp.lt.u64 %p1, %rd23, 32000; + .loc 1 37 34 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:37:34 + add.s64 %rd24, %rd2, %rd62; + cvt.u32.u64 %r14, %rd24; + mad.wide.s32 %rd21, %r14, 2, %rd15; + .loc 1 37 52 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:37:52 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r12, 0; + // begin inline asm + mov.u32 %r10, %r12; + mov.u32 %r11, %r12; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd21 + 0 ], %rd20; + // end inline asm +$L__tmp1: + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + mov.b64 {%r15, %r16}, %rd66; + setp.nan.f32 %p2, %r15, %r15; + setp.nan.f32 %p3, %r16, %r16; + mov.b64 {%r17, %r18}, %rd64; + setp.nan.f32 %p4, %r17, %r17; + setp.nan.f32 %p5, %r18, %r18; +$L__tmp2: + .loc 1 37 105 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:37:105 + mov.b32 {%rs1, %rs2}, %r10; + cvt.f32.bf16 %r19, %rs2; + cvt.f32.bf16 %r20, %rs1; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + setp.gt.f32 %p6, %r15, %r20; + setp.gt.f32 %p7, %r16, %r19; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r21, %r16, %r19, %p7; + selp.f32 %r22, %r16, %r21, %p3; + selp.f32 %r23, %r15, %r20, %p6; + selp.f32 %r24, %r15, %r23, %p2; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + setp.eq.f32 %p8, %r24, 0fFF800000; + setp.eq.f32 %p9, %r22, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + sub.f32 %r25, %r16, %r22; + sub.f32 %r26, %r15, %r24; + mov.b32 %r27, 0f3F000000; + mov.b32 %r28, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + fma.rn.ftz.f32 %r29, %r26, %r28, %r27; + cvt.ftz.sat.f32.f32 %r30, %r29; + mov.b32 %r31, 0f4B400001; + mov.b32 %r32, 0f437C0000; + fma.rm.ftz.f32 %r33, %r30, %r32, %r31; + fma.rn.ftz.f32 %r34, %r25, %r28, %r27; + cvt.ftz.sat.f32.f32 %r35, %r34; + fma.rm.ftz.f32 %r36, %r35, %r32, %r31; + mov.b64 %rd25, {%r33, %r36}; + cvt.u32.u64 %r37, %rd25; + add.f32 %r38, %r33, 0fCB40007F; + neg.f32 %r39, %r38; + mov.b32 %r40, 0f3FB8AA3B; + fma.rn.ftz.f32 %r41, %r26, %r40, %r39; + mov.b32 %r42, 0f32A57060; + fma.rn.ftz.f32 %r43, %r26, %r42, %r41; + ex2.approx.ftz.f32 %r44, %r43; + add.f32 %r45, %r36, 0fCB40007F; + neg.f32 %r46, %r45; + fma.rn.ftz.f32 %r47, %r25, %r40, %r46; + fma.rn.ftz.f32 %r48, %r25, %r42, %r47; + shl.b64 %rd26, %rd25, 23; + and.b64 %rd27, %rd26, -36028797018963968; + shl.b32 %r49, %r37, 23; + cvt.u64.u32 %rd28, %r49; + or.b64 %rd29, %rd28, %rd27; + ex2.approx.ftz.f32 %r50, %r48; + mov.b64 {%r51, %r52}, %rd29; + mul.f32 %r53, %r44, %r51; + mul.f32 %r54, %r50, %r52; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r55, 0f3F800000, %r54, %p9; + selp.f32 %r56, 0f3F800000, %r53, %p8; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + sub.f32 %r57, %r19, %r22; + sub.f32 %r58, %r20, %r24; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + fma.rn.ftz.f32 %r59, %r58, %r28, %r27; + cvt.ftz.sat.f32.f32 %r60, %r59; + fma.rm.ftz.f32 %r61, %r60, %r32, %r31; + fma.rn.ftz.f32 %r62, %r57, %r28, %r27; + cvt.ftz.sat.f32.f32 %r63, %r62; + fma.rm.ftz.f32 %r64, %r63, %r32, %r31; + mov.b64 %rd30, {%r61, %r64}; + cvt.u32.u64 %r65, %rd30; + add.f32 %r66, %r61, 0fCB40007F; + neg.f32 %r67, %r66; + fma.rn.ftz.f32 %r68, %r58, %r40, %r67; + fma.rn.ftz.f32 %r69, %r58, %r42, %r68; + ex2.approx.ftz.f32 %r70, %r69; + add.f32 %r71, %r64, 0fCB40007F; + neg.f32 %r72, %r71; + fma.rn.ftz.f32 %r73, %r57, %r40, %r72; + fma.rn.ftz.f32 %r74, %r57, %r42, %r73; + shl.b64 %rd31, %rd30, 23; + and.b64 %rd32, %rd31, -36028797018963968; + shl.b32 %r75, %r65, 23; + cvt.u64.u32 %rd33, %r75; + or.b64 %rd34, %rd33, %rd32; + ex2.approx.ftz.f32 %r76, %r74; + mov.b64 {%r77, %r78}, %rd34; + mul.f32 %r79, %r70, %r77; + mul.f32 %r80, %r76, %r78; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r81, 0f3F800000, %r80, %p9; + selp.f32 %r82, 0f3F800000, %r79, %p8; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + mov.b64 {%r83, %r84}, %rd65; + fma.rn.f32 %r85, %r83, %r56, %r82; + fma.rn.f32 %r86, %r84, %r55, %r81; +$L__tmp4: + .loc 1 37 105 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:37:105 + mov.b32 {%rs3, %rs4}, %r11; + cvt.f32.bf16 %r87, %rs4; + cvt.f32.bf16 %r88, %rs3; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + setp.gt.f32 %p10, %r17, %r88; + setp.gt.f32 %p11, %r18, %r87; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r89, %r18, %r87, %p11; + selp.f32 %r90, %r18, %r89, %p5; + selp.f32 %r91, %r17, %r88, %p10; + selp.f32 %r92, %r17, %r91, %p4; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + setp.eq.f32 %p12, %r92, 0fFF800000; + setp.eq.f32 %p13, %r90, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + sub.f32 %r93, %r18, %r90; + sub.f32 %r94, %r17, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + fma.rn.ftz.f32 %r95, %r94, %r28, %r27; + cvt.ftz.sat.f32.f32 %r96, %r95; + fma.rm.ftz.f32 %r97, %r96, %r32, %r31; + fma.rn.ftz.f32 %r98, %r93, %r28, %r27; + cvt.ftz.sat.f32.f32 %r99, %r98; + fma.rm.ftz.f32 %r100, %r99, %r32, %r31; + mov.b64 %rd35, {%r97, %r100}; + cvt.u32.u64 %r101, %rd35; + add.f32 %r102, %r97, 0fCB40007F; + neg.f32 %r103, %r102; + fma.rn.ftz.f32 %r104, %r94, %r40, %r103; + fma.rn.ftz.f32 %r105, %r94, %r42, %r104; + ex2.approx.ftz.f32 %r106, %r105; + add.f32 %r107, %r100, 0fCB40007F; + neg.f32 %r108, %r107; + fma.rn.ftz.f32 %r109, %r93, %r40, %r108; + fma.rn.ftz.f32 %r110, %r93, %r42, %r109; + shl.b64 %rd36, %rd35, 23; + and.b64 %rd37, %rd36, -36028797018963968; + shl.b32 %r111, %r101, 23; + cvt.u64.u32 %rd38, %r111; + or.b64 %rd39, %rd38, %rd37; + ex2.approx.ftz.f32 %r112, %r110; + mov.b64 {%r113, %r114}, %rd39; + mul.f32 %r115, %r106, %r113; + mul.f32 %r116, %r112, %r114; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r117, 0f3F800000, %r116, %p13; + selp.f32 %r118, 0f3F800000, %r115, %p12; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + sub.f32 %r119, %r87, %r90; + sub.f32 %r120, %r88, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + fma.rn.ftz.f32 %r121, %r120, %r28, %r27; + cvt.ftz.sat.f32.f32 %r122, %r121; + fma.rm.ftz.f32 %r123, %r122, %r32, %r31; + fma.rn.ftz.f32 %r124, %r119, %r28, %r27; + cvt.ftz.sat.f32.f32 %r125, %r124; + fma.rm.ftz.f32 %r126, %r125, %r32, %r31; + mov.b64 %rd40, {%r123, %r126}; + cvt.u32.u64 %r127, %rd40; + add.f32 %r128, %r123, 0fCB40007F; + neg.f32 %r129, %r128; + fma.rn.ftz.f32 %r130, %r120, %r40, %r129; + fma.rn.ftz.f32 %r131, %r120, %r42, %r130; + ex2.approx.ftz.f32 %r132, %r131; + add.f32 %r133, %r126, 0fCB40007F; + neg.f32 %r134, %r133; + fma.rn.ftz.f32 %r135, %r119, %r40, %r134; + fma.rn.ftz.f32 %r136, %r119, %r42, %r135; + shl.b64 %rd41, %rd40, 23; + and.b64 %rd42, %rd41, -36028797018963968; + shl.b32 %r137, %r127, 23; + cvt.u64.u32 %rd43, %r137; + or.b64 %rd44, %rd43, %rd42; + ex2.approx.ftz.f32 %r138, %r136; + mov.b64 {%r139, %r140}, %rd44; + mul.f32 %r141, %r132, %r139; + mul.f32 %r142, %r138, %r140; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + selp.f32 %r143, 0f3F800000, %r142, %p13; + selp.f32 %r144, 0f3F800000, %r141, %p12; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:42:40 ] + mov.b64 {%r145, %r146}, %rd63; + fma.rn.f32 %r147, %r145, %r118, %r144; + fma.rn.f32 %r148, %r146, %r117, %r143; +$L__tmp6: + .loc 1 45 54 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:45:54 + selp.f32 %r149, %r22, %r16, %p1; + selp.f32 %r150, %r24, %r15, %p1; + mov.b64 %rd66, {%r150, %r149}; + selp.f32 %r151, %r90, %r18, %p1; + selp.f32 %r152, %r92, %r17, %p1; + mov.b64 %rd64, {%r152, %r151}; + .loc 1 46 54 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:46:54 + selp.f32 %r153, %r86, %r84, %p1; + selp.f32 %r154, %r85, %r83, %p1; + mov.b64 %rd65, {%r154, %r153}; + selp.f32 %r155, %r148, %r146, %p1; + selp.f32 %r156, %r147, %r145, %p1; + mov.b64 %rd63, {%r156, %r155}; + .loc 1 31 40 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:31:40 + add.s64 %rd12, %rd62, 2048; + setp.lt.u64 %p14, %rd62, 29952; + mov.b64 %rd62, %rd12; + @%p14 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:26:37 + and.b32 %r169, %r1, 31; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + mov.b64 {%r170, %r171}, %rd66; + setp.gt.f32 %p21, %r170, %r171; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p22, %r170, %r170; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r172, %r170, %r171, %p22; + selp.f32 %r173, %r170, %r172, %p21; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + mov.b64 {%r174, %r175}, %rd64; + setp.gt.f32 %p23, %r173, %r174; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p24, %r173, %r173; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r176, %r173, %r174, %p24; + selp.f32 %r177, %r173, %r176, %p23; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p25, %r177, %r175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p26, %r177, %r177; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r178, %r177, %r175, %p26; + selp.f32 %r179, %r177, %r178, %p25; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r180, %r179, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p27, %r179, %r180; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p28, %r179, %r179; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r181, %r179, %r180, %p27; + selp.f32 %r182, %r179, %r181, %p28; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r183, %r182, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p29, %r182, %r183; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p30, %r182, %r182; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r184, %r182, %r183, %p30; + selp.f32 %r185, %r182, %r184, %p29; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r186, %r185, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p31, %r185, %r186; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p32, %r185, %r185; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r187, %r185, %r186, %p32; + selp.f32 %r188, %r185, %r187, %p31; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p33, %r188, %r189; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p34, %r188, %r188; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r190, %r188, %r189, %p34; + selp.f32 %r191, %r188, %r190, %p33; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p35, %r191, %r192; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p36, %r191, %r191; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.eq.b32 %p15, %r169, 0; + shr.u32 %r193, %r1, 3; + and.b32 %r194, %r193, 60; + mov.b32 %r195, global_smem; + add.s32 %r157, %r195, %r194; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.b32 %r196, %r191, %r192, %p36; + selp.b32 %r158, %r191, %r196, %p35; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r158; + // end inline asm + bar.sync 0; + setp.lt.u32 %p16, %r1, 16; + add.s32 %r160, %r195, %r5; + // begin inline asm + @%p16 ld.shared.b32 %r159, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r198, %r159, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p37, %r159, %r198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p38, %r159, %r159; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r199, %r159, %r198, %p37; + selp.f32 %r200, %r159, %r199, %p38; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r201, %r200, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p39, %r200, %r201; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p40, %r200, %r200; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r202, %r200, %r201, %p40; + selp.f32 %r203, %r200, %r202, %p39; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r204, %r203, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p41, %r203, %r204; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p42, %r203, %r203; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r205, %r203, %r204, %p42; + selp.f32 %r206, %r203, %r205, %p41; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r207, %r206, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.gt.f32 %p43, %r206, %r207; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.nan.f32 %p44, %r206, %r206; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.eq.b32 %p17, %r1, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.b32 %r208, %r206, %r207, %p44; + selp.b32 %r162, %r206, %r208, %p43; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r162; + // end inline asm + bar.sync 0; + ld.shared.b32 %r2, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + setp.eq.f32 %p45, %r2, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + sub.f32 %r209, %r170, %r2; + sub.f32 %r210, %r171, %r2; + sub.f32 %r211, %r174, %r2; + sub.f32 %r212, %r175, %r2; + .loc 2 180 58 // triton_helpers.py:180:58 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + selp.f32 %r213, 0f00000000, %r209, %p45; + selp.f32 %r214, 0f00000000, %r210, %p45; + selp.f32 %r215, 0f00000000, %r211, %p45; + selp.f32 %r216, 0f00000000, %r212, %p45; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + fma.rn.ftz.f32 %r219, %r213, %r28, %r27; + cvt.ftz.sat.f32.f32 %r220, %r219; + fma.rm.ftz.f32 %r223, %r220, %r32, %r31; + fma.rn.ftz.f32 %r224, %r214, %r28, %r27; + cvt.ftz.sat.f32.f32 %r225, %r224; + fma.rm.ftz.f32 %r226, %r225, %r32, %r31; + fma.rn.ftz.f32 %r227, %r215, %r28, %r27; + cvt.ftz.sat.f32.f32 %r228, %r227; + fma.rm.ftz.f32 %r229, %r228, %r32, %r31; + fma.rn.ftz.f32 %r230, %r216, %r28, %r27; + cvt.ftz.sat.f32.f32 %r231, %r230; + fma.rm.ftz.f32 %r232, %r231, %r32, %r31; + mov.b64 %rd46, {%r223, %r226}; + cvt.u32.u64 %r233, %rd46; + add.f32 %r234, %r223, 0fCB40007F; + neg.f32 %r235, %r234; + fma.rn.ftz.f32 %r237, %r213, %r40, %r235; + fma.rn.ftz.f32 %r239, %r213, %r42, %r237; + ex2.approx.ftz.f32 %r240, %r239; + add.f32 %r241, %r226, 0fCB40007F; + neg.f32 %r242, %r241; + fma.rn.ftz.f32 %r243, %r214, %r40, %r242; + fma.rn.ftz.f32 %r244, %r214, %r42, %r243; + shl.b64 %rd47, %rd46, 23; + and.b64 %rd48, %rd47, -36028797018963968; + shl.b32 %r245, %r233, 23; + cvt.u64.u32 %rd49, %r245; + or.b64 %rd50, %rd49, %rd48; + ex2.approx.ftz.f32 %r246, %r244; + mov.b64 {%r247, %r248}, %rd50; + mul.f32 %r249, %r240, %r247; + mul.f32 %r250, %r246, %r248; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + mov.b64 {%r251, %r252}, %rd65; + mul.f32 %r253, %r252, %r250; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + mov.b64 %rd51, {%r229, %r232}; + cvt.u32.u64 %r254, %rd51; + add.f32 %r255, %r229, 0fCB40007F; + neg.f32 %r256, %r255; + fma.rn.ftz.f32 %r257, %r215, %r40, %r256; + fma.rn.ftz.f32 %r258, %r215, %r42, %r257; + ex2.approx.ftz.f32 %r259, %r258; + add.f32 %r260, %r232, 0fCB40007F; + neg.f32 %r261, %r260; + fma.rn.ftz.f32 %r262, %r216, %r40, %r261; + fma.rn.ftz.f32 %r263, %r216, %r42, %r262; + shl.b64 %rd52, %rd51, 23; + and.b64 %rd53, %rd52, -36028797018963968; + shl.b32 %r264, %r254, 23; + cvt.u64.u32 %rd54, %r264; + or.b64 %rd55, %rd54, %rd53; + ex2.approx.ftz.f32 %r265, %r263; + mov.b64 {%r266, %r267}, %rd55; + mul.f32 %r268, %r265, %r267; + mul.f32 %r269, %r259, %r266; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + mov.b64 {%r270, %r271}, %rd63; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + fma.rn.f32 %r272, %r251, %r249, %r253; + fma.rn.f32 %r273, %r270, %r269, %r272; + fma.rn.f32 %r274, %r271, %r268, %r273; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r275, %r274, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r276, %r274, %r275; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r277, %r276, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r278, %r276, %r277; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r279, %r278, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r280, %r278, %r279; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r282, %r280, %r281; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r164, %r282, %r283; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r164; + // end inline asm + bar.sync 0; + // begin inline asm + @%p16 ld.shared.b32 %r165, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r284, %r165, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r285, %r165, %r284; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r286, %r285, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r287, %r285, %r286; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r288, %r287, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r289, %r287, %r288; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + shfl.sync.bfly.b32 %r290, %r289, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + add.f32 %r168, %r289, %r290; + .loc 3 291 36 // standard.py:291:36 @[ cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r168; + // end inline asm + bar.sync 0; + mov.b64 %rd67, 0; + ld.shared.b32 %r3, [global_smem]; +$L__tmp8: +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 54 29 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:54:29 + add.s64 %rd60, %rd1, %rd67; + setp.lt.u64 %p46, %rd60, 32000; + .loc 1 58 34 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:58:34 + add.s64 %rd61, %rd2, %rd67; + cvt.u32.u64 %r299, %rd61; + mad.wide.s32 %rd57, %r299, 2, %rd15; + .loc 1 58 52 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:58:52 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd56, 1.0; + // end inline asm + mov.b32 %r293, 0; + // begin inline asm + mov.u32 %r291, %r293; + mov.u32 %r292, %r293; + @%p46 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r291, %r292 }, [ %rd57 + 0 ], %rd56; + // end inline asm + mov.b32 {%rs5, %rs6}, %r291; + mov.b32 {%rs7, %rs8}, %r292; + .loc 1 58 106 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:58:106 + cvt.f32.bf16 %r300, %rs5; + cvt.f32.bf16 %r301, %rs6; + cvt.f32.bf16 %r302, %rs7; + cvt.f32.bf16 %r303, %rs8; + .loc 1 60 22 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:60:22 + sub.f32 %r304, %r300, %r2; + sub.f32 %r305, %r301, %r2; + sub.f32 %r306, %r302, %r2; + sub.f32 %r307, %r303, %r2; + mov.b32 %r308, 0f3F000000; + mov.b32 %r309, 0f3BBB989D; + .loc 1 61 29 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:61:29 + fma.rn.ftz.f32 %r310, %r304, %r309, %r308; + cvt.ftz.sat.f32.f32 %r311, %r310; + mov.b32 %r312, 0f4B400001; + mov.b32 %r313, 0f437C0000; + fma.rm.ftz.f32 %r314, %r311, %r313, %r312; + add.f32 %r315, %r314, 0fCB40007F; + neg.f32 %r316, %r315; + mov.b32 %r317, 0f3FB8AA3B; + fma.rn.ftz.f32 %r318, %r304, %r317, %r316; + mov.b32 %r319, 0f32A57060; + fma.rn.ftz.f32 %r320, %r304, %r319, %r318; + shl.b32 %r321, %r314, 23; + ex2.approx.ftz.f32 %r322, %r320; + mul.f32 %r323, %r322, %r321; + fma.rn.ftz.f32 %r324, %r305, %r309, %r308; + cvt.ftz.sat.f32.f32 %r325, %r324; + fma.rm.ftz.f32 %r326, %r325, %r313, %r312; + add.f32 %r327, %r326, 0fCB40007F; + neg.f32 %r328, %r327; + fma.rn.ftz.f32 %r329, %r305, %r317, %r328; + fma.rn.ftz.f32 %r330, %r305, %r319, %r329; + shl.b32 %r331, %r326, 23; + ex2.approx.ftz.f32 %r332, %r330; + mul.f32 %r333, %r332, %r331; + fma.rn.ftz.f32 %r334, %r306, %r309, %r308; + cvt.ftz.sat.f32.f32 %r335, %r334; + fma.rm.ftz.f32 %r336, %r335, %r313, %r312; + add.f32 %r337, %r336, 0fCB40007F; + neg.f32 %r338, %r337; + fma.rn.ftz.f32 %r339, %r306, %r317, %r338; + fma.rn.ftz.f32 %r340, %r306, %r319, %r339; + shl.b32 %r341, %r336, 23; + ex2.approx.ftz.f32 %r342, %r340; + mul.f32 %r343, %r342, %r341; + fma.rn.ftz.f32 %r344, %r307, %r309, %r308; + cvt.ftz.sat.f32.f32 %r345, %r344; + fma.rm.ftz.f32 %r346, %r345, %r313, %r312; + add.f32 %r347, %r346, 0fCB40007F; + neg.f32 %r348, %r347; + fma.rn.ftz.f32 %r349, %r307, %r317, %r348; + fma.rn.ftz.f32 %r350, %r307, %r319, %r349; + shl.b32 %r351, %r346, 23; + ex2.approx.ftz.f32 %r352, %r350; + mul.f32 %r353, %r352, %r351; + .loc 1 62 23 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:62:23 + div.full.f32 %r295, %r323, %r3; + div.full.f32 %r296, %r333, %r3; + div.full.f32 %r297, %r343, %r3; + div.full.f32 %r298, %r353, %r3; + .loc 1 63 29 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:63:29 + mad.wide.s32 %rd59, %r299, 4, %rd16; + .loc 1 63 53 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:63:53 + // begin inline asm + @%p46 st.global.v4.b32 [ %rd59 + 0 ], { %r295, %r296, %r297, %r298 }; + // end inline asm + .loc 1 52 40 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:52:40 + add.s64 %rd14, %rd67, 2048; + setp.lt.u64 %p48, %rd67, 29952; + mov.b64 %rd67, %rd14; + @%p48 bra $L__BB0_3; +// %bb.4: + .loc 1 52 4 // cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py:52:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 118 +.b8 120 +.b8 116 +.b8 98 +.b8 120 +.b8 105 +.b8 99 +.b8 105 +.b8 99 +.b8 52 +.b8 111 +.b8 109 +.b8 98 +.b8 109 +.b8 50 +.b8 100 +.b8 50 +.b8 102 +.b8 108 +.b8 99 +.b8 122 +.b8 120 +.b8 121 +.b8 100 +.b8 107 +.b8 111 +.b8 108 +.b8 50 +.b8 108 +.b8 50 +.b8 102 +.b8 102 +.b8 114 +.b8 52 +.b8 113 +.b8 116 +.b8 111 +.b8 116 +.b8 51 +.b8 114 +.b8 114 +.b8 122 +.b8 110 +.b8 106 +.b8 122 +.b8 55 +.b8 98 +.b8 104 +.b8 110 +.b8 103 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 118 +.b8 120 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..82e70a918bb3e5972f0bb2c3a3aaad6faa12d193 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc119) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c2048_i32_13 = arith.constant 2048 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c2048_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x2048xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x2048xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x2048xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x2048x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc47) + tt.return %0 : tensor<1x2048xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x2048xf32> loc("rhs_max"(#loc48))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%lhs_max, %rhs_max) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x2048xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x2048xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x2048xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x2048xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc61) + %1 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%a: tensor<1x2048xf32> loc("a"(#loc62)), %b: tensor<1x2048xf32> loc("b"(#loc62))) -> tensor<1x2048xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x2048xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a) : (tensor<1x2048xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x2048xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x2048xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x2048xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x2048xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x2048xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc69) + tt.return %2 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %3 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc76))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc78) + tt.return %4 : tensor<1x2048xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc80) + tt.return %5 : tensor<1x2048xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%x: tensor<1x2048xf32> loc("x"(#loc81))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc82) + tt.return %0 : tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1 : tensor<1x2048xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x2048xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x2048xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%a: tensor<1x2048xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a9eafea17b87a37dd3b34cf7a0b6716493af7901 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,226 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":49:33) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("out_ptr2"(#loc)) +#loc59 = loc("xnumel"(#loc)) +#loc60 = loc("r0_numel"(#loc)) +#loc86 = loc("out_max"(#loc32)) +#loc93 = loc("out_sum"(#loc41)) +#loc118 = loc(callsite(#loc86 at #loc33)) +#loc125 = loc(callsite(#loc93 at #loc33)) +#loc133 = loc(callsite(#loc1 at #loc118)) +#loc136 = loc(callsite(#loc1 at #loc125)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc61) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc62) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc62) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc63) + %tmp0_6 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc104) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc65) + %_tmp3_sum:2 = scf.for %_tmp3_sum_14 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%arg5 = %cst_4, %arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_sum_14 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_index_15 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp0_16 = arith.addi %r0_index_15, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc64) + %tmp0_17 = tt.addptr %tmp0_7, %tmp0_16 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc69) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc70) + %mask = arith.cmpf ogt, %arg5, %tmp0_19 : tensor<1x2048xf32, #blocked> loc(#loc126) + %mask_20 = arith.cmpf une, %arg5, %arg5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc128) + %out_max_22 = arith.select %mask_21, %arg5, %tmp0_19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc129) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_4 : tensor<1x2048xf32, #blocked> loc(#loc109) + %lhs_scale_23 = arith.subf %arg5, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc110) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc130) + %lhs_scale_25 = arith.select %lhs_scale, %cst_2, %lhs_scale_24 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc112) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc113) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc131) + %rhs_scale_27 = arith.select %lhs_scale, %cst_2, %rhs_scale_26 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc115) + %out_sum_28 = arith.mulf %arg6, %lhs_scale_25 : tensor<1x2048xf32, #blocked> loc(#loc116) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32, #blocked> loc(#loc117) + %_tmp3_max = arith.select %r0_mask, %out_max_22, %arg5 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc84) + %_tmp3_sum_30 = arith.select %r0_mask, %out_sum_29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc85) + scf.yield %_tmp3_max, %_tmp3_sum_30 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc30) + } loc(#loc105) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_14: f32 loc(callsite(#loc1 at #loc118)), %out_max_15: f32 loc(callsite(#loc1 at #loc118))): + %mask = arith.cmpf ogt, %out_max_14, %out_max_15 : f32 loc(#loc137) + %mask_16 = arith.cmpf une, %out_max_14, %out_max_14 : f32 loc(#loc138) + %mask_17 = arith.ori %mask, %mask_16 : i1 loc(#loc139) + %out_max_18 = arith.select %mask_17, %out_max_14, %out_max_15 : f32 loc(#loc140) + tt.reduce.return %out_max_18 : f32 loc(#loc132) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc132) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc119) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_1 : tensor<1x1xf32, #blocked> loc(#loc120) + %delta_8 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_9 = arith.subf %_tmp3_sum#0, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_10 = tt.broadcast %delta : tensor<1x1xi1, #blocked> -> tensor<1x2048xi1, #blocked> loc(#loc122) + %delta_11 = arith.select %delta_10, %cst_3, %delta_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc122) + %out_sum = tt.extern_elementwise %delta_11 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc134) + %out_sum_12 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32, #blocked> loc(#loc124) + %out_sum_13 = "tt.reduce"(%out_sum_12) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_14: f32 loc(callsite(#loc1 at #loc125)), %out_sum_15: f32 loc(callsite(#loc1 at #loc125))): + %out_sum_16 = arith.addf %out_sum_14, %out_sum_15 : f32 loc(#loc141) + tt.reduce.return %out_sum_16 : f32 loc(#loc135) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc135) + %tmp4 = tt.expand_dims %out_sum_13 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc94) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc95) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc45) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_index_14 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst : tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp5 = arith.addi %r0_index_14, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc98) + %tmp5_15 = tt.addptr %tmp0_7, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc99) + %tmp5_16 = tt.load %tmp5_15, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc100) + %tmp5_17 = arith.extf %tmp5_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc101) + %tmp7 = arith.subf %tmp5_17, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc102) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc103) + %tmp9_18 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32, #blocked> loc(#loc95) + %1 = tt.addptr %0, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc45) + tt.store %1, %tmp9_18, %r0_mask : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + } loc(#loc46) + tt.return loc(#loc56) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:105) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":42:40) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:8) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":51:16) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":62:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":53:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":54:29) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:53) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:4) +#loc61 = loc("xoffset"(#loc2)) +#loc62 = loc("r0_base"(#loc3)) +#loc63 = loc("tmp0"(#loc4)) +#loc64 = loc("tmp0"(#loc5)) +#loc65 = loc("tmp0"(#loc6)) +#loc66 = loc("_tmp3_max"(#loc7)) +#loc67 = loc("r0_index"(#loc8)) +#loc68 = loc("r0_mask"(#loc9)) +#loc69 = loc("tmp0"(#loc10)) +#loc70 = loc("tmp0"(#loc11)) +#loc71 = loc("mask"(#loc12)) +#loc72 = loc("out_max"(#loc13)) +#loc73 = loc("mask"(#loc15)) +#loc74 = loc("mask"(#loc16)) +#loc75 = loc("lhs_scale"(#loc18)) +#loc76 = loc("lhs_scale"(#loc19)) +#loc77 = loc("lhs_scale"(#loc21)) +#loc78 = loc("lhs_scale"(#loc22)) +#loc79 = loc("rhs_scale"(#loc23)) +#loc80 = loc("rhs_scale"(#loc24)) +#loc81 = loc("rhs_scale"(#loc25)) +#loc82 = loc("out_sum"(#loc26)) +#loc83 = loc("out_sum"(#loc27)) +#loc84 = loc("_tmp3_max"(#loc28)) +#loc85 = loc("_tmp3_sum"(#loc29)) +#loc87 = loc("out_max_keepdim"(#loc34)) +#loc88 = loc("delta"(#loc35)) +#loc89 = loc("delta"(#loc36)) +#loc90 = loc("delta"(#loc37)) +#loc91 = loc("out_sum"(#loc38)) +#loc92 = loc("out_sum"(#loc39)) +#loc94 = loc("tmp4"(#loc43)) +#loc95 = loc("tmp9"(#loc44)) +#loc96 = loc("r0_index"(#loc47)) +#loc97 = loc("r0_mask"(#loc48)) +#loc98 = loc("tmp5"(#loc49)) +#loc99 = loc("tmp5"(#loc50)) +#loc100 = loc("tmp5"(#loc51)) +#loc101 = loc("tmp5"(#loc52)) +#loc102 = loc("tmp7"(#loc53)) +#loc103 = loc("tmp8"(#loc54)) +#loc104 = loc(fused[#loc64, #loc63]) +#loc105 = loc("_tmp3_sum"(#loc66)) +#loc106 = loc("mask"(#loc71)) +#loc107 = loc(callsite(#loc72 at #loc14)) +#loc108 = loc("mask"(#loc74)) +#loc109 = loc(callsite(#loc75 at #loc14)) +#loc110 = loc(callsite(#loc76 at #loc14)) +#loc111 = loc(callsite(#loc77 at #loc14)) +#loc112 = loc(callsite(#loc78 at #loc14)) +#loc113 = loc(callsite(#loc79 at #loc14)) +#loc114 = loc(callsite(#loc80 at #loc14)) +#loc115 = loc(callsite(#loc81 at #loc14)) +#loc116 = loc(callsite(#loc82 at #loc14)) +#loc117 = loc(callsite(#loc83 at #loc14)) +#loc119 = loc(callsite(#loc87 at #loc33)) +#loc120 = loc(callsite(#loc88 at #loc33)) +#loc121 = loc(callsite(#loc89 at #loc33)) +#loc122 = loc(callsite(#loc90 at #loc33)) +#loc123 = loc(callsite(#loc91 at #loc33)) +#loc124 = loc(callsite(#loc92 at #loc33)) +#loc126 = loc(callsite(#loc106 at #loc107)) +#loc127 = loc(callsite(#loc73 at #loc107)) +#loc128 = loc(callsite(#loc108 at #loc107)) +#loc129 = loc(callsite(#loc17 at #loc107)) +#loc130 = loc(callsite(#loc20 at #loc111)) +#loc131 = loc(callsite(#loc20 at #loc114)) +#loc132 = loc(callsite(#loc31 at #loc118)) +#loc134 = loc(callsite(#loc20 at #loc123)) +#loc135 = loc(callsite(#loc40 at #loc125)) +#loc137 = loc(callsite(#loc106 at #loc132)) +#loc138 = loc(callsite(#loc73 at #loc132)) +#loc139 = loc(callsite(#loc108 at #loc132)) +#loc140 = loc(callsite(#loc17 at #loc132)) +#loc141 = loc(callsite(#loc42 at #loc135)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3743ecfe503522122f494a0b4e9160e10e07c234 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,233 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":18:0) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":49:33) +#loc3 = loc(unknown) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("out_ptr2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +#loc62 = loc("r0_numel"(#loc)) +#loc90 = loc("out_max"(#loc35)) +#loc96 = loc("out_sum"(#loc42)) +#loc123 = loc(callsite(#loc90 at #loc2)) +#loc129 = loc(callsite(#loc96 at #loc2)) +#loc138 = loc(callsite(#loc3 at #loc123)) +#loc141 = loc(callsite(#loc3 at #loc129)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc108) + %cst = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc3) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_2 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc3) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc65) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc66) + %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68) + %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x2048xi32> loc(#loc69) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc70) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc110) + %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x2048xi32> loc(#loc71) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc72) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc72) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc73) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_19 : tensor<1x2048xf32> loc(#loc131) + %mask_20 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x2048xf32> loc(#loc132) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1> loc(#loc133) + %out_max_22 = arith.select %mask_21, %_tmp3_max, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc134) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_3 : tensor<1x2048xf32> loc(#loc114) + %lhs_scale_23 = arith.subf %_tmp3_max, %out_max_22 : tensor<1x2048xf32> loc(#loc115) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc135) + %lhs_scale_25 = arith.select %lhs_scale, %cst, %lhs_scale_24 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc117) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32> loc(#loc118) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc136) + %rhs_scale_27 = arith.select %lhs_scale, %cst, %rhs_scale_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc120) + %out_sum_28 = arith.mulf %_tmp3_sum_12, %lhs_scale_25 : tensor<1x2048xf32> loc(#loc121) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32> loc(#loc122) + %_tmp3_max_30 = arith.select %r0_mask, %out_max_22, %_tmp3_max : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc88) + %_tmp3_sum_31 = arith.select %r0_mask, %out_sum_29, %_tmp3_sum_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc89) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc33) + } loc(#loc109) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc123)), %out_max_13: f32 loc(callsite(#loc3 at #loc123))): + %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc142) + %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc143) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc144) + %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc145) + tt.reduce.return %out_max_16 : f32 loc(#loc137) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc137) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc124) + %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc108) + %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc125) + %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x2048xf32> loc(#loc125) + %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc126) + %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc126) + %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc139) + %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32> loc(#loc128) + %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc129)), %out_sum_13: f32 loc(callsite(#loc3 at #loc129))): + %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc146) + tt.reduce.return %out_sum_14 : f32 loc(#loc140) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc140) + %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc98) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc98) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc99) + %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc100) + %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x2048xi32> loc(#loc130) + %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x2048xi32> loc(#loc101) + %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc102) + %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc102) + %tmp5_17 = tt.load %tmp5_16, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp5_18 = arith.extf %tmp5_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104) + %tmp7 = arith.subf %tmp5_18, %delta_6 : tensor<1x2048xf32> loc(#loc105) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc106) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc107) + %tmp9_19 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc107) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc56) + %1 = tt.addptr %0, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc56) + tt.store %1, %tmp9_19, %r0_mask : tensor<1x2048x!tt.ptr> loc(#loc57) + } loc(#loc45) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":37:105) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":42:40) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":45:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":46:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":51:16) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:40) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":53:31) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":54:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":62:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":63:53) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vx/cvxtbxicic4ombm2d2flczxydkol2l2ffr4qtot3rrznjz7bhngf.py":52:4) +#loc63 = loc("delta"(#loc1)) +#loc64 = loc("xoffset"(#loc4)) +#loc65 = loc("r0_base"(#loc5)) +#loc66 = loc("r0_base"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("tmp0"(#loc12)) +#loc73 = loc("tmp0"(#loc13)) +#loc74 = loc("tmp0"(#loc14)) +#loc75 = loc("mask"(#loc15)) +#loc76 = loc("out_max"(#loc16)) +#loc77 = loc("mask"(#loc18)) +#loc78 = loc("mask"(#loc19)) +#loc79 = loc("lhs_scale"(#loc21)) +#loc80 = loc("lhs_scale"(#loc22)) +#loc81 = loc("lhs_scale"(#loc24)) +#loc82 = loc("lhs_scale"(#loc25)) +#loc83 = loc("rhs_scale"(#loc26)) +#loc84 = loc("rhs_scale"(#loc27)) +#loc85 = loc("rhs_scale"(#loc28)) +#loc86 = loc("out_sum"(#loc29)) +#loc87 = loc("out_sum"(#loc30)) +#loc88 = loc("_tmp3_max"(#loc31)) +#loc89 = loc("_tmp3_sum"(#loc32)) +#loc91 = loc("out_max_keepdim"(#loc36)) +#loc92 = loc("delta"(#loc37)) +#loc93 = loc("delta"(#loc38)) +#loc94 = loc("out_sum"(#loc39)) +#loc95 = loc("out_sum"(#loc40)) +#loc97 = loc("tmp4"(#loc44)) +#loc98 = loc("r0_index"(#loc46)) +#loc99 = loc("r0_mask"(#loc47)) +#loc100 = loc("tmp5"(#loc48)) +#loc101 = loc("tmp5"(#loc49)) +#loc102 = loc("tmp5"(#loc50)) +#loc103 = loc("tmp5"(#loc51)) +#loc104 = loc("tmp5"(#loc52)) +#loc105 = loc("tmp7"(#loc53)) +#loc106 = loc("tmp8"(#loc54)) +#loc107 = loc("tmp9"(#loc55)) +#loc108 = loc(callsite(#loc63 at #loc2)) +#loc109 = loc("_tmp3_sum"(#loc67)) +#loc110 = loc(fused[#loc71, #loc70]) +#loc111 = loc("mask"(#loc75)) +#loc112 = loc(callsite(#loc76 at #loc17)) +#loc113 = loc("mask"(#loc78)) +#loc114 = loc(callsite(#loc79 at #loc17)) +#loc115 = loc(callsite(#loc80 at #loc17)) +#loc116 = loc(callsite(#loc81 at #loc17)) +#loc117 = loc(callsite(#loc82 at #loc17)) +#loc118 = loc(callsite(#loc83 at #loc17)) +#loc119 = loc(callsite(#loc84 at #loc17)) +#loc120 = loc(callsite(#loc85 at #loc17)) +#loc121 = loc(callsite(#loc86 at #loc17)) +#loc122 = loc(callsite(#loc87 at #loc17)) +#loc124 = loc(callsite(#loc91 at #loc2)) +#loc125 = loc(callsite(#loc92 at #loc2)) +#loc126 = loc(callsite(#loc93 at #loc2)) +#loc127 = loc(callsite(#loc94 at #loc2)) +#loc128 = loc(callsite(#loc95 at #loc2)) +#loc130 = loc(fused[#loc101, #loc100]) +#loc131 = loc(callsite(#loc111 at #loc112)) +#loc132 = loc(callsite(#loc77 at #loc112)) +#loc133 = loc(callsite(#loc113 at #loc112)) +#loc134 = loc(callsite(#loc20 at #loc112)) +#loc135 = loc(callsite(#loc23 at #loc116)) +#loc136 = loc(callsite(#loc23 at #loc119)) +#loc137 = loc(callsite(#loc34 at #loc123)) +#loc139 = loc(callsite(#loc23 at #loc127)) +#loc140 = loc(callsite(#loc41 at #loc129)) +#loc142 = loc(callsite(#loc111 at #loc137)) +#loc143 = loc(callsite(#loc77 at #loc137)) +#loc144 = loc(callsite(#loc113 at #loc137)) +#loc145 = loc(callsite(#loc20 at #loc137)) +#loc146 = loc(callsite(#loc43 at #loc140)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..02bb19fec991e5b4fa4ec9bf1cdfdda32948fb03 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..49ec0223b183e5c38c93332fa91924a71d40dd43 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..681b97131d893e6cbd7b2284882a5f397f41490b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"hash": "193d794b0b97c3631a5ca3b76996d9137ceae2e83b0d9eaa3f7ef16cdd7504c7", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..40d5f24cc68ce22c6aa1a55278f132d6243e0e42 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,318 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %15 = icmp slt i32 %14, %9, !dbg !8 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %17 = and i32 %16, 384, !dbg !9 + %18 = zext nneg i32 %14 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !10 + %19 = sdiv i64 %18, %.frozen, !dbg !10 + %20 = srem i64 %19, %4, !dbg !11 + %21 = mul i64 %19, %.frozen, !dbg !12 + %.decomposed = sub i64 %18, %21, !dbg !12 + %22 = sdiv i64 %18, %7, !dbg !13 + %23 = shl nsw i64 %20, 7, !dbg !14 + %24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16 + %26 = and i32 %16, 127 + %27 = zext nneg i32 %26 to i64 + %28 = or disjoint i64 %24, %27 + %29 = icmp slt i64 %28, %6 + %30 = icmp sge i64 %28, %8 + %31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0) + %32 = sub nsw i64 %.decomposed, %20 + %33 = shl nsw i64 %32, 7 + %34 = zext nneg i32 %17 to i64, !dbg !17 + %35 = zext nneg i32 %26 to i64, !dbg !17 + %36 = zext nneg i32 %16 to i64, !dbg !17 + %37 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !18 + %38 = shufflevector <2 x i1> %37, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !18 + %39 = insertelement <2 x i1> poison, i1 %29, i64 0, !dbg !19 + %40 = shufflevector <2 x i1> %39, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !19 + %41 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !20 + %42 = shufflevector <2 x i64> %41, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !20 + %43 = insertelement <2 x i64> poison, i64 %5, i64 0, !dbg !21 + %44 = shufflevector <2 x i64> %43, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !21 + %45 = insertelement <2 x i64> poison, i64 %28, i64 0, !dbg !22 + %46 = shufflevector <2 x i64> %45, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22 + %47 = insertelement <2 x i1> poison, i1 %30, i64 0, !dbg !23 + %48 = shufflevector <2 x i1> %47, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %49 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !24 + %50 = shufflevector <2 x i64> %49, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !24 + %51 = insertelement <2 x i64> poison, i64 %8, i64 0, !dbg !25 + %52 = shufflevector <2 x i64> %51, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !25 + br label %53, !dbg !17 + +53: ; preds = %13, %53 + %indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ] + %54 = phi <2 x i64> [ zeroinitializer, %13 ], [ %113, %53 ] + %55 = or disjoint i64 %indvars.iv, %34, !dbg !26 + %56 = or disjoint i64 %indvars.iv, %36, !dbg !26 + %57 = lshr exact i64 %55, 7, !dbg !27 + %58 = lshr i64 %56, 7, !dbg !27 + %59 = trunc nuw nsw i64 %58 to i32, !dbg !27 + %60 = or i32 %59, 4, !dbg !27 + %61 = zext nneg i32 %60 to i64, !dbg !20 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %63 = sub nsw i64 %35, %57, !dbg !29 + %64 = sub nsw i32 %26, %60, !dbg !29 + %65 = sext i32 %64 to i64, !dbg !30 + %66 = insertelement <2 x i64> poison, i64 %57, i64 0, !dbg !20 + %67 = insertelement <2 x i64> %66, i64 %61, i64 1, !dbg !20 + %68 = or disjoint <2 x i64> %42, %67, !dbg !20 + %69 = icmp slt <2 x i64> %68, %44, !dbg !21 + %70 = and <2 x i1> %40, %69, !dbg !19 + %71 = icmp sge <2 x i64> %68, %46, !dbg !22 + %72 = extractelement <2 x i1> %70, i64 0, !dbg !31 + %73 = and i1 %15, %72, !dbg !31 + %74 = extractelement <2 x i1> %70, i64 1, !dbg !31 + %75 = and i1 %15, %74, !dbg !31 + %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %62, i1 %73) #5, !dbg !28 + %77 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %77, i1 %75) #5, !dbg !28 + %79 = insertelement <2 x i64> poison, i64 %76, i64 0, !dbg !32 + %80 = insertelement <2 x i64> %79, i64 %78, i64 1, !dbg !32 + %81 = icmp slt <2 x i64> %46, %80, !dbg !32 + %82 = icmp slt <2 x i64> %68, %80, !dbg !33 + %83 = and <2 x i1> %81, %82, !dbg !34 + %84 = and <2 x i1> %71, %83, !dbg !35 + %85 = srem i64 %28, %8, !dbg !36 + %.not = icmp eq i64 %85, 0, !dbg !37 + %86 = select i1 %.not, i64 0, i64 %31, !dbg !38 + %87 = add nsw i64 %86, %85, !dbg !38 + %88 = insertelement <2 x i64> poison, i64 %87, i64 0, !dbg !39 + %89 = shufflevector <2 x i64> %88, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !39 + %90 = icmp slt <2 x i64> %89, %80, !dbg !39 + %91 = insertelement <2 x i64> poison, i64 %63, i64 0, !dbg !24 + %92 = insertelement <2 x i64> %91, i64 %65, i64 1, !dbg !24 + %93 = add nsw <2 x i64> %50, %92, !dbg !24 + %94 = srem <2 x i64> %93, %52, !dbg !25 + %95 = icmp ne <2 x i64> %94, zeroinitializer, !dbg !40 + %96 = extractelement <2 x i64> %94, i64 0, !dbg !41 + %97 = xor i64 %96, %8, !dbg !41 + %98 = extractelement <2 x i64> %94, i64 1, !dbg !41 + %99 = xor i64 %98, %8, !dbg !41 + %100 = insertelement <2 x i64> poison, i64 %97, i64 0, !dbg !41 + %101 = insertelement <2 x i64> %100, i64 %99, i64 1, !dbg !41 + %102 = icmp slt <2 x i64> %101, zeroinitializer, !dbg !41 + %103 = and <2 x i1> %95, %102, !dbg !42 + %104 = select <2 x i1> %103, <2 x i64> %52, <2 x i64> zeroinitializer, !dbg !43 + %105 = sub <2 x i64> zeroinitializer, %104, !dbg !44 + %106 = icmp eq <2 x i64> %94, %105, !dbg !44 + %107 = and <2 x i1> %90, %106, !dbg !23 + %108 = and <2 x i1> %48, %107, !dbg !23 + %109 = or <2 x i1> %84, %108, !dbg !45 + %110 = select <2 x i1> %38, <2 x i1> %70, <2 x i1> zeroinitializer, !dbg !18 + %111 = select <2 x i1> %110, <2 x i1> %109, <2 x i1> zeroinitializer, !dbg !18 + %112 = zext <2 x i1> %111 to <2 x i64>, !dbg !18 + %113 = add <2 x i64> %54, %112, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1024, !dbg !17 + %114 = icmp samesign ult i64 %indvars.iv, 15360, !dbg !17 + br i1 %114, label %53, label %115, !dbg !17 + +115: ; preds = %53 + %116 = and i32 %16, 31, !dbg !9 + %117 = lshr i32 %16, 5, !dbg !9 + %shift = shufflevector <2 x i64> %113, <2 x i64> poison, <2 x i32> , !dbg !46 + %foldExtExtBinop = add <2 x i64> %113, %shift, !dbg !46 + %118 = extractelement <2 x i64> %foldExtExtBinop, i64 0, !dbg !46 + %119 = bitcast <2 x i64> %foldExtExtBinop to <4 x i32>, !dbg !50 + %120 = extractelement <4 x i32> %119, i64 1, !dbg !50 + %121 = trunc i64 %118 to i32, !dbg !50 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !50 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !50 + %124 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !50 + %125 = insertelement <2 x i32> %124, i32 %123, i64 1, !dbg !50 + %126 = bitcast <2 x i32> %125 to i64, !dbg !50 + %127 = add i64 %118, %126, !dbg !46 + %extelt.offset1 = lshr i64 %127, 32, !dbg !50 + %128 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50 + %129 = trunc i64 %127 to i32, !dbg !50 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !50 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !50 + %132 = insertelement <2 x i32> poison, i32 %130, i64 0, !dbg !50 + %133 = insertelement <2 x i32> %132, i32 %131, i64 1, !dbg !50 + %134 = bitcast <2 x i32> %133 to i64, !dbg !50 + %135 = add i64 %127, %134, !dbg !46 + %extelt.offset2 = lshr i64 %135, 32, !dbg !50 + %136 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50 + %137 = trunc i64 %135 to i32, !dbg !50 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 4, i32 31), !dbg !50 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !50 + %140 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !50 + %141 = insertelement <2 x i32> %140, i32 %139, i64 1, !dbg !50 + %142 = bitcast <2 x i32> %141 to i64, !dbg !50 + %143 = add i64 %135, %142, !dbg !46 + %extelt.offset3 = lshr i64 %143, 32, !dbg !50 + %144 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50 + %145 = trunc i64 %143 to i32, !dbg !50 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 2, i32 31), !dbg !50 + %147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 2, i32 31), !dbg !50 + %148 = insertelement <2 x i32> poison, i32 %146, i64 0, !dbg !50 + %149 = insertelement <2 x i32> %148, i32 %147, i64 1, !dbg !50 + %150 = bitcast <2 x i32> %149 to i64, !dbg !50 + %151 = add i64 %143, %150, !dbg !46 + %extelt.offset4 = lshr i64 %151, 32, !dbg !50 + %152 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50 + %153 = trunc i64 %151 to i32, !dbg !50 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !50 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !50 + %156 = insertelement <2 x i32> poison, i32 %154, i64 0, !dbg !50 + %157 = insertelement <2 x i32> %156, i32 %155, i64 1, !dbg !50 + %158 = bitcast <2 x i32> %157 to i64, !dbg !50 + %159 = add i64 %151, %158, !dbg !46 + %160 = and i32 %117, 15, !dbg !50 + %161 = icmp eq i32 %116, 0, !dbg !50 + %162 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %160, !dbg !50 + %163 = insertelement <1 x i64> poison, i64 %159, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %162, <1 x i64> %163, i1 %161) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %164 = icmp samesign ult i32 %16, 16, !dbg !50 + %165 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50 + %166 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %165, i1 %164) #5, !dbg !50 + %extelt.offset5 = lshr i64 %166, 32, !dbg !50 + %167 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50 + %168 = trunc i64 %166 to i32, !dbg !50 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !50 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !50 + %171 = insertelement <2 x i32> poison, i32 %169, i64 0, !dbg !50 + %172 = insertelement <2 x i32> %171, i32 %170, i64 1, !dbg !50 + %173 = bitcast <2 x i32> %172 to i64, !dbg !50 + %174 = add i64 %166, %173, !dbg !46 + %extelt.offset6 = lshr i64 %174, 32, !dbg !50 + %175 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50 + %176 = trunc i64 %174 to i32, !dbg !50 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 4, i32 31), !dbg !50 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !50 + %179 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !50 + %180 = insertelement <2 x i32> %179, i32 %178, i64 1, !dbg !50 + %181 = bitcast <2 x i32> %180 to i64, !dbg !50 + %182 = add i64 %174, %181, !dbg !46 + %extelt.offset7 = lshr i64 %182, 32, !dbg !50 + %183 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50 + %184 = trunc i64 %182 to i32, !dbg !50 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 2, i32 31), !dbg !50 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 2, i32 31), !dbg !50 + %187 = insertelement <2 x i32> poison, i32 %185, i64 0, !dbg !50 + %188 = insertelement <2 x i32> %187, i32 %186, i64 1, !dbg !50 + %189 = bitcast <2 x i32> %188 to i64, !dbg !50 + %190 = add i64 %182, %189, !dbg !46 + %extelt.offset8 = lshr i64 %190, 32, !dbg !50 + %191 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50 + %192 = trunc i64 %190 to i32, !dbg !50 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !50 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 1, i32 31), !dbg !50 + %195 = insertelement <2 x i32> poison, i32 %193, i64 0, !dbg !50 + %196 = insertelement <2 x i32> %195, i32 %194, i64 1, !dbg !50 + %197 = bitcast <2 x i32> %196 to i64, !dbg !50 + %198 = add i64 %190, %197, !dbg !46 + %199 = icmp eq i32 %16, 0, !dbg !50 + %200 = insertelement <1 x i64> poison, i64 %198, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %165, <1 x i64> %200, i1 %199) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %201 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50 + %202 = add i64 %201, -1, !dbg !51 + %203 = icmp ult i64 %202, 16383, !dbg !51 + %204 = zext i1 %203 to i32, !dbg !52 + %205 = icmp eq i64 %201, 16384, !dbg !53 + %206 = zext i1 %205 to i32, !dbg !52 + %207 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54 + %208 = and i32 %16, 511, !dbg !55 + %209 = icmp eq i32 %208, 0, !dbg !55 + %210 = and i1 %209, %15, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %204, ptr addrspace(1) %207, i1 %210) #5, !dbg !55 + %211 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %206, ptr addrspace(1) %211, i1 %210) #5, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.smin.i64(i64, i64) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 21, scope: !4) +!11 = !DILocation(line: 27, column: 28, scope: !4) +!12 = !DILocation(line: 28, column: 19, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 49, column: 35, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 86, column: 50, scope: !4) +!19 = !DILocation(line: 45, column: 22, scope: !4) +!20 = !DILocation(line: 39, column: 22, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 48, column: 23, scope: !4) +!23 = !DILocation(line: 79, column: 24, scope: !4) +!24 = !DILocation(line: 69, column: 51, scope: !4) +!25 = !DILocation(line: 70, column: 25, scope: !4) +!26 = !DILocation(line: 33, column: 31, scope: !4) +!27 = !DILocation(line: 37, column: 27, scope: !4) +!28 = !DILocation(line: 49, column: 77, scope: !4) +!29 = !DILocation(line: 69, column: 24, scope: !4) +!30 = !DILocation(line: 69, column: 38, scope: !4) +!31 = !DILocation(line: 49, column: 94, scope: !4) +!32 = !DILocation(line: 50, column: 23, scope: !4) +!33 = !DILocation(line: 51, column: 23, scope: !4) +!34 = !DILocation(line: 52, column: 24, scope: !4) +!35 = !DILocation(line: 53, column: 23, scope: !4) +!36 = !DILocation(line: 58, column: 24, scope: !4) +!37 = !DILocation(line: 60, column: 25, scope: !4) +!38 = !DILocation(line: 66, column: 39, scope: !4) +!39 = !DILocation(line: 67, column: 24, scope: !4) +!40 = !DILocation(line: 71, column: 25, scope: !4) +!41 = !DILocation(line: 73, column: 25, scope: !4) +!42 = !DILocation(line: 74, column: 24, scope: !4) +!43 = !DILocation(line: 76, column: 39, scope: !4) +!44 = !DILocation(line: 78, column: 25, scope: !4) +!45 = !DILocation(line: 80, column: 24, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49) +!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 87, column: 27, scope: !4) +!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49) +!51 = !DILocation(line: 92, column: 20, scope: !4) +!52 = !DILocation(line: 0, scope: !4) +!53 = !DILocation(line: 95, column: 21, scope: !4) +!54 = !DILocation(line: 98, column: 25, scope: !4) +!55 = !DILocation(line: 98, column: 37, scope: !4) +!56 = !DILocation(line: 99, column: 25, scope: !4) +!57 = !DILocation(line: 99, column: 37, scope: !4) +!58 = !DILocation(line: 99, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e78a1ff16f65ef4323ebe0e2c63eeb092cf79dc8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,736 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12 +) +.reqntid 512 +{ + .reg .pred %p<53>; + .reg .b32 %r<76>; + .reg .b64 %rd<162>; + .loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:18:0 + +// %bb.0: + ld.param.b64 %rd47, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; +$L__tmp0: + .loc 1 22 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:22:28 + mov.u32 %r7, %ctaid.x; + .loc 1 27 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:21 + cvt.u64.u32 %rd1, %r7; + ld.param.b64 %rd52, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + and.b64 %rd53, %rd52, -4294967296; + setp.ne.b64 %p11, %rd53, 0; + cvt.u32.u64 %r74, %rd1; + @%p11 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd153, %rd1, %rd52; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r8, %rd52; + div.u32 %r10, %r74, %r8; + cvt.u64.u32 %rd153, %r10; +$L__BB0_3: + .loc 1 0 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:21 + ld.param.b64 %rd50, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7]; + .loc 1 27 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:27:28 + or.b64 %rd54, %rd153, %rd47; + and.b64 %rd55, %rd54, -4294967296; + setp.ne.b64 %p12, %rd55, 0; + @%p12 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd154, %rd153, %rd47; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r11, %rd47; + cvt.u32.u64 %r12, %rd153; + rem.u32 %r13, %r12, %r11; + cvt.u64.u32 %rd154, %r13; +$L__BB0_6: + .loc 1 0 28 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:28 + ld.param.b32 %r6, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9]; + ld.param.b64 %rd51, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8]; + ld.param.b64 %rd49, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd44, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + mov.u32 %r1, %tid.x; + .loc 1 28 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:28:19 + mul.lo.s64 %rd56, %rd153, %rd52; + sub.s64 %rd9, %rd1, %rd56; + .loc 1 29 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:29:19 + and.b64 %rd57, %rd50, -4294967296; + setp.ne.b64 %p13, %rd57, 0; + @%p13 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd155, %rd1, %rd50; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r14, %rd50; + div.u32 %r16, %r74, %r14; + cvt.u64.u32 %rd155, %r16; +$L__BB0_9: + .loc 1 0 19 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0:19 + ld.param.b64 %rd48, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; + ld.param.b64 %rd46, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd45, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:24:21 + setp.lt.s32 %p1, %r74, %r6; + .loc 1 39 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:26 + shl.b64 %rd16, %rd154, 7; + .loc 1 42 26 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:42:26 + shl.b64 %rd61, %rd9, 7; + .loc 1 49 35 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:35 + shl.b64 %rd62, %rd155, 3; + add.s64 %rd70, %rd44, %rd62; + and.b32 %r2, %r1, 127; + cvt.u64.u32 %rd63, %r2; + or.b64 %rd20, %rd61, %rd63; + setp.lt.s64 %p3, %rd20, %rd49; + setp.ge.s64 %p5, %rd20, %rd51; + min.s64 %rd15, %rd51, 0; + sub.s64 %rd64, %rd9, %rd154; + shl.b64 %rd22, %rd64, 7; + .loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40 + cvt.u64.u32 %rd65, %r1; + shr.u64 %rd66, %rd65, 7; + cvt.u32.u64 %r75, %rd66; + shr.u32 %r18, %r1, 7; + cvt.u64.u32 %rd67, %r18; + and.b64 %rd157, %rd67, 3; + sub.s64 %rd156, %rd63, %rd157; + mov.b64 %rd159, 0; + mov.b64 %rd158, -1024; + mov.b64 %rd160, %rd159; + bra.uni $L__BB0_10; +$L__BB0_12: // in Loop: Header=BB0_10 Depth=1 + .loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24 + rem.s64 %rd161, %rd20, %rd51; +$L__BB0_13: // in Loop: Header=BB0_10 Depth=1 + .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0 + sub.s32 %r21, %r2, %r20; + cvt.s64.s32 %rd33, %r21; + .loc 1 60 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:60:25 + setp.eq.b64 %p24, %rd161, 0; + .loc 1 66 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:66:39 + selp.b64 %rd83, 0, %rd15, %p24; + add.s64 %rd84, %rd83, %rd161; + .loc 1 67 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:67:24 + setp.lt.s64 %p25, %rd84, %rd69; + setp.lt.s64 %p26, %rd84, %rd73; + .loc 1 69 51 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:69:51 + add.s64 %rd85, %rd22, %rd33; + add.s64 %rd86, %rd22, %rd156; + .loc 1 70 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:70:25 + rem.s64 %rd87, %rd86, %rd51; + rem.s64 %rd88, %rd85, %rd51; + .loc 1 71 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:71:25 + setp.ne.b64 %p27, %rd88, 0; + setp.ne.b64 %p28, %rd87, 0; + .loc 1 73 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:73:25 + xor.b64 %rd89, %rd87, %rd51; + xor.b64 %rd90, %rd88, %rd51; + .loc 1 76 39 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:76:39 + shr.s64 %rd91, %rd89, 63; + and.b64 %rd92, %rd91, %rd51; + selp.b64 %rd93, %rd92, 0, %p28; + shr.s64 %rd94, %rd90, 63; + and.b64 %rd95, %rd94, %rd51; + selp.b64 %rd96, %rd95, 0, %p27; + .loc 1 78 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:78:25 + neg.s64 %rd97, %rd96; + neg.s64 %rd98, %rd93; + setp.eq.b64 %p29, %rd87, %rd98; + setp.eq.b64 %p30, %rd88, %rd97; + .loc 1 79 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:79:24 + and.pred %p31, %p26, %p30; + and.pred %p33, %p25, %p29; + and.pred %p35, %p5, %p33; + and.pred %p36, %p5, %p31; + .loc 1 80 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:80:24 + or.pred %p37, %p10, %p36; + or.pred %p38, %p9, %p35; + .loc 1 86 50 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:86:50 + and.pred %p41, %p14, %p38; + and.pred %p42, %p15, %p37; + selp.b64 %rd99, 1, 0, %p42; + selp.b64 %rd100, 1, 0, %p41; + add.s64 %rd159, %rd159, %rd100; + add.s64 %rd160, %rd160, %rd99; + .loc 1 32 40 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:32:40 + add.s64 %rd158, %rd158, 1024; + add.s32 %r75, %r75, 8; + add.s64 %rd157, %rd157, 8; + add.s64 %rd156, %rd156, -8; + setp.lt.u64 %p43, %rd158, 15360; + @%p43 bra $L__BB0_10; + bra.uni $L__BB0_14; +$L__BB0_10: // =>This Inner Loop Header: Depth=1 + .loc 1 37 27 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:37:27 + or.b32 %r20, %r75, 4; + .loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22 + cvt.u64.u32 %rd76, %r20; + .loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + .loc 1 39 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:39:22 + or.b64 %rd77, %rd16, %rd76; + or.b64 %rd78, %rd16, %rd157; + .loc 1 41 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:41:22 + setp.lt.s64 %p17, %rd78, %rd48; + setp.lt.s64 %p18, %rd77, %rd48; + .loc 1 45 22 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:45:22 + and.pred %p8, %p3, %p18; + and.pred %p7, %p3, %p17; + .loc 1 48 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:48:23 + setp.ge.s64 %p19, %rd77, %rd20; + setp.ge.s64 %p20, %rd78, %rd20; + .loc 1 49 94 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:94 + and.pred %p14, %p1, %p7; + and.pred %p15, %p1, %p8; + .loc 1 49 77 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:49:77 + // begin inline asm + mov.u64 %rd69, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd69 }, [ %rd70 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd73 }, [ %rd70 + 0 ], %rd72; + // end inline asm + .loc 1 52 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:52:24 + max.s64 %rd79, %rd20, %rd77; + setp.lt.s64 %p21, %rd79, %rd73; + max.s64 %rd80, %rd20, %rd78; + setp.lt.s64 %p22, %rd80, %rd69; + .loc 1 53 23 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:53:23 + and.pred %p9, %p20, %p22; + and.pred %p10, %p19, %p21; + .loc 1 58 24 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:58:24 + or.b64 %rd81, %rd20, %rd51; + and.b64 %rd82, %rd81, -4294967296; + setp.ne.b64 %p23, %rd82, 0; + @%p23 bra $L__BB0_12; +// %bb.11: // in Loop: Header=BB0_10 Depth=1 + cvt.u32.u64 %r22, %rd51; + cvt.u32.u64 %r23, %rd20; + rem.u32 %r24, %r23, %r22; + cvt.u64.u32 %rd161, %r24; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 25 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:25:37 + and.b32 %r31, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd106, %rd159, %rd160; + mov.b64 {_, %r32}, %rd106; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + cvt.u32.u64 %r33, %rd106; + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 16, 31, -1; + cvt.u64.u32 %rd107, %r34; + cvt.u64.u32 %rd108, %r35; + shl.b64 %rd109, %rd108, 32; + or.b64 %rd110, %rd107, %rd109; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd111, %rd106, %rd110; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r36}, %rd111; + cvt.u32.u64 %r37, %rd111; + shfl.sync.bfly.b32 %r38, %r37, 8, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 8, 31, -1; + cvt.u64.u32 %rd112, %r38; + cvt.u64.u32 %rd113, %r39; + shl.b64 %rd114, %rd113, 32; + or.b64 %rd115, %rd112, %rd114; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd116, %rd111, %rd115; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r40}, %rd116; + cvt.u32.u64 %r41, %rd116; + shfl.sync.bfly.b32 %r42, %r41, 4, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 4, 31, -1; + cvt.u64.u32 %rd117, %r42; + cvt.u64.u32 %rd118, %r43; + shl.b64 %rd119, %rd118, 32; + or.b64 %rd120, %rd117, %rd119; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd121, %rd116, %rd120; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r44}, %rd121; + cvt.u32.u64 %r45, %rd121; + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + shfl.sync.bfly.b32 %r47, %r44, 2, 31, -1; + cvt.u64.u32 %rd122, %r46; + cvt.u64.u32 %rd123, %r47; + shl.b64 %rd124, %rd123, 32; + or.b64 %rd125, %rd122, %rd124; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd126, %rd121, %rd125; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r48}, %rd126; + cvt.u32.u64 %r49, %rd126; + shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1; + shfl.sync.bfly.b32 %r51, %r48, 1, 31, -1; + cvt.u64.u32 %rd127, %r50; + cvt.u64.u32 %rd128, %r51; + shl.b64 %rd129, %rd128, 32; + or.b64 %rd130, %rd127, %rd129; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd101, %rd126, %rd130; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + setp.eq.b32 %p44, %r31, 0; + shr.u32 %r52, %r1, 2; + and.b32 %r53, %r52, 120; + mov.b32 %r54, global_smem; + add.s32 %r25, %r54, %r53; + // begin inline asm + @%p44 st.shared.b64 [ %r25 + 0 ], %rd101; + // end inline asm + bar.sync 0; + setp.lt.u32 %p45, %r1, 16; + shl.b32 %r55, %r1, 3; + add.s32 %r26, %r54, %r55; + // begin inline asm + @%p45 ld.shared.b64 %rd102, [ %r26 + 0 ]; + // end inline asm + mov.b64 {_, %r56}, %rd102; + cvt.u32.u64 %r57, %rd102; + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + shfl.sync.bfly.b32 %r59, %r56, 8, 31, -1; + cvt.u64.u32 %rd131, %r58; + cvt.u64.u32 %rd132, %r59; + shl.b64 %rd133, %rd132, 32; + or.b64 %rd134, %rd131, %rd133; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd135, %rd102, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r60}, %rd135; + cvt.u32.u64 %r61, %rd135; + shfl.sync.bfly.b32 %r62, %r61, 4, 31, -1; + shfl.sync.bfly.b32 %r63, %r60, 4, 31, -1; + cvt.u64.u32 %rd136, %r62; + cvt.u64.u32 %rd137, %r63; + shl.b64 %rd138, %rd137, 32; + or.b64 %rd139, %rd136, %rd138; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd140, %rd135, %rd139; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r64}, %rd140; + cvt.u32.u64 %r65, %rd140; + shfl.sync.bfly.b32 %r66, %r65, 2, 31, -1; + shfl.sync.bfly.b32 %r67, %r64, 2, 31, -1; + cvt.u64.u32 %rd141, %r66; + cvt.u64.u32 %rd142, %r67; + shl.b64 %rd143, %rd142, 32; + or.b64 %rd144, %rd141, %rd143; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd145, %rd140, %rd144; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + mov.b64 {_, %r68}, %rd145; + cvt.u32.u64 %r69, %rd145; + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + shfl.sync.bfly.b32 %r71, %r68, 1, 31, -1; + cvt.u64.u32 %rd146, %r70; + cvt.u64.u32 %rd147, %r71; + shl.b64 %rd148, %rd147, 32; + or.b64 %rd149, %rd146, %rd148; + .loc 2 261 15 // standard.py:261:15 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + add.s64 %rd103, %rd145, %rd149; + .loc 2 291 36 // standard.py:291:36 @[ cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:87:27 ] + setp.eq.b32 %p46, %r1, 0; + // begin inline asm + @%p46 st.shared.b64 [ %r26 + 0 ], %rd103; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd150, [global_smem]; +$L__tmp2: + .loc 1 92 20 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:92:20 + add.s64 %rd151, %rd150, -1; + setp.lt.u64 %p50, %rd151, 16383; + .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0 + selp.b32 %r28, 1, 0, %p50; + .loc 1 95 21 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:95:21 + setp.eq.b64 %p51, %rd150, 16384; + .loc 1 0 0 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:0 + selp.b32 %r29, 1, 0, %p51; + .loc 1 98 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:25 + shl.b64 %rd152, %rd1, 2; + add.s64 %rd104, %rd45, %rd152; + .loc 1 98 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:98:37 + and.b32 %r72, %r1, 511; + setp.eq.b32 %p52, %r72, 0; + and.pred %p47, %p52, %p1; + // begin inline asm + @%p47 st.global.b32 [ %rd104 + 0 ], { %r28 }; + // end inline asm + .loc 1 99 25 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:25 + add.s64 %rd105, %rd46, %rd152; + .loc 1 99 37 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:37 + // begin inline asm + @%p47 st.global.b32 [ %rd105 + 0 ], { %r29 }; + // end inline asm + .loc 1 99 4 // cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py:99:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 118 +.b8 112 +.b8 55 +.b8 120 +.b8 97 +.b8 110 +.b8 55 +.b8 55 +.b8 116 +.b8 102 +.b8 114 +.b8 55 +.b8 113 +.b8 121 +.b8 116 +.b8 102 +.b8 107 +.b8 112 +.b8 54 +.b8 115 +.b8 106 +.b8 114 +.b8 103 +.b8 107 +.b8 100 +.b8 54 +.b8 104 +.b8 118 +.b8 114 +.b8 117 +.b8 105 +.b8 97 +.b8 113 +.b8 102 +.b8 122 +.b8 107 +.b8 101 +.b8 105 +.b8 98 +.b8 116 +.b8 108 +.b8 53 +.b8 114 +.b8 116 +.b8 97 +.b8 103 +.b8 115 +.b8 99 +.b8 110 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 97 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 87 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..8f1aa036d0008c928dc36c307b07dee094a23070 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,418 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc99 = loc(unknown) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc106 = loc("in_ptr0"(#loc)) +#loc107 = loc("out_ptr1"(#loc)) +#loc108 = loc("out_ptr2"(#loc)) +#loc109 = loc("ks0"(#loc)) +#loc110 = loc("ks1"(#loc)) +#loc111 = loc("ks2"(#loc)) +#loc112 = loc("ks3"(#loc)) +#loc113 = loc("ks4"(#loc)) +#loc114 = loc("ks5"(#loc)) +#loc115 = loc("xnumel"(#loc)) +#loc116 = loc("r0_numel"(#loc)) +#loc207 = loc("input"(#loc97)) +#loc208 = loc("a"(#loc102)) +#loc209 = loc("b"(#loc102)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc117) + %xoffset = tt.get_program_id x : i32 loc(#loc118) + %xoffset_1 = arith.constant 1 : i32 loc(#loc119) + %xoffset_2 = arith.constant 1 : i32 loc(#loc119) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc124) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc125) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126) + %x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126) + %x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126) + %x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127) + %x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128) + %x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128) + %x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129) + %x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129) + %x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129) + %_tmp46 = arith.constant 0 : i64 loc(#loc130) + %_tmp46_17 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc130) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc132) + %r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x1024xi32> loc(#loc132) + %r0_mask = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc133) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x1024xi32> loc(#loc133) + %r0_4 = arith.constant 128 : i32 loc(#loc134) + %r0_4_25 = arith.constant 128 : i32 loc(#loc134) + %r0_4_26 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc134) + %r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x1024xi32> loc(#loc134) + %r0_3 = arith.constant 128 : i32 loc(#loc135) + %r0_3_28 = arith.constant 128 : i32 loc(#loc135) + %r0_3_29 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc135) + %r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x1024xi32> loc(#loc135) + %tmp0 = arith.constant 128 : i32 loc(#loc136) + %tmp0_31 = arith.constant 128 : i64 loc(#loc136) + %tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136) + %tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136) + %tmp0_34 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc137) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc137) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x1024xi64> loc(#loc137) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc138) + %tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x1024xi64> loc(#loc138) + %tmp3 = arith.constant 128 : i32 loc(#loc139) + %tmp3_38 = arith.constant 128 : i64 loc(#loc139) + %tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139) + %tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139) + %tmp3_41 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc140) + %tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc140) + %tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x1024xi64> loc(#loc140) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc141) + %tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x1024xi64> loc(#loc141) + %tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x1024xi1> loc(#loc142) + %tmp7 = arith.constant 128 : i32 loc(#loc143) + %tmp7_45 = arith.constant 128 : i64 loc(#loc143) + %tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143) + %tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143) + %tmp7_48 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc144) + %tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc144) + %tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x1024xi64> loc(#loc144) + %tmp8 = arith.constant 128 : i32 loc(#loc145) + %tmp8_51 = arith.constant 128 : i64 loc(#loc145) + %tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145) + %tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145) + %tmp8_54 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc146) + %tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc146) + %tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x1024xi64> loc(#loc146) + %tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x1024xi64> loc(#loc147) + %tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc148) + %tmp10_57 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc149) + %tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> loc(#loc149) + %tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x1024xi1> loc(#loc150) + %tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc151) + %tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x1024xi1> loc(#loc151) + %tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152) + %tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> loc(#loc152) + %tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x1024xf32> to tensor<1x1024xi64> loc(#loc152) + %tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc152) + %tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x1024xi64> loc(#loc153) + %tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x1024xi64> loc(#loc154) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc155) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc156) + %tmp15 = arith.constant false loc(#loc157) + %tmp15_66 = arith.constant dense : tensor<1x1xi1> loc(#loc157) + %tmp16 = arith.constant dense : tensor<1x1024xi1> loc(#loc158) + %tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x1024xi1> loc(#loc158) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc159) + %tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc160) + %tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc161) + %tmp20 = arith.constant 0 : i32 loc(#loc162) + %tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162) + %tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163) + %tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc163) + %tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x1024xi64> loc(#loc163) + %tmp22 = arith.constant 0 : i32 loc(#loc164) + %tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164) + %tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x1024xi64> loc(#loc164) + %tmp23 = arith.constant 0 : i32 loc(#loc165) + %tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165) + %tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x1024xi64> loc(#loc165) + %tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x1024xi64> loc(#loc165) + %tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x1024xi1> loc(#loc166) + %tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x1024xi1> loc(#loc167) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc168) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc169) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x1024xi64> loc(#loc170) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc171) + %tmp30 = arith.constant -1 : i32 loc(#loc172) + %tmp30_77 = arith.constant -1 : i32 loc(#loc172) + %tmp30_78 = arith.constant dense<-1> : tensor<1x1024xi32> loc(#loc172) + %tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x1024xi32> loc(#loc172) + %tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x1024xi32> loc(#loc173) + %tmp30_81 = arith.constant -128 : i32 loc(#loc174) + %tmp30_82 = arith.constant -128 : i64 loc(#loc174) + %tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174) + %tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174) + %tmp30_85 = arith.extsi %tmp30_80 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc175) + %tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc175) + %tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x1024xi64> loc(#loc175) + %tmp30_88 = arith.constant 128 : i32 loc(#loc176) + %tmp30_89 = arith.constant 128 : i64 loc(#loc176) + %tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176) + %tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176) + %tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc177) + %tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x1024xi64> loc(#loc177) + %tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x1024xi64> loc(#loc178) + %tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179) + %tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc179) + %tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x1024xi64> loc(#loc179) + %tmp33 = arith.constant 0 : i32 loc(#loc180) + %tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180) + %tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x1024xi64> loc(#loc180) + %tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x1024xi64> loc(#loc180) + %tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x1024xi1> loc(#loc181) + %tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x1024xi1> loc(#loc182) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc183) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc184) + %tmp38 = arith.constant 0 : i64 loc(#loc185) + %tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185) + %tmp39 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc186) + %tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x1024xi64> loc(#loc186) + %tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x1024xi1> loc(#loc187) + %tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x1024xi1> loc(#loc188) + %tmp42 = arith.constant false loc(#loc189) + %tmp42_101 = arith.constant dense : tensor<1x1024xi1> loc(#loc189) + %tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc190) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc191) + %tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x1024xi64> loc(#loc192) + %_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc193) + %_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x1024xi1> loc(#loc193) + %_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc194) + scf.yield %_tmp46_104 : tensor<1x1024xi64> loc(#loc79) + } loc(#loc131) + %tmp46 = tt.call @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc195) + %tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196) + %tmp48 = arith.constant 0 : i64 loc(#loc197) + %tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197) + %tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198) + %tmp50 = arith.constant 16384 : i64 loc(#loc199) + %tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199) + %tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201) + %tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202) + %tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203) + %tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204) + %tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205) + %tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc92) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc92) + tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc93) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc94) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc94) + tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc95) + tt.return loc(#loc96) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x1024xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98) + tt.reduce.return %2 : i64 loc(#loc98) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc98) + tt.return %0 : tensor<1xi64> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc101) + tt.return %1 : tensor<1xi64> loc(#loc101) + } loc(#loc97) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc103) + tt.return %0 : i64 loc(#loc104) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc105) + tt.return %1 : i64 loc(#loc105) + } loc(#loc102) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:26) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":46:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":47:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:55) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":54:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":55:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":59:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":77:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":81:44) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":88:31) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc117 = loc("r0_numel"(#loc1)) +#loc118 = loc("xoffset"(#loc2)) +#loc119 = loc("xoffset"(#loc3)) +#loc120 = loc("xindex"(#loc4)) +#loc121 = loc("xindex"(#loc5)) +#loc122 = loc("xindex"(#loc6)) +#loc123 = loc("xmask"(#loc7)) +#loc124 = loc("r0_base"(#loc8)) +#loc125 = loc("r0_base"(#loc9)) +#loc126 = loc("x1"(#loc10)) +#loc127 = loc("x1"(#loc11)) +#loc128 = loc("x0"(#loc12)) +#loc129 = loc("x2"(#loc13)) +#loc130 = loc("_tmp46"(#loc14)) +#loc131 = loc("_tmp46"(#loc15)) +#loc132 = loc("r0_index"(#loc16)) +#loc133 = loc("r0_mask"(#loc17)) +#loc134 = loc("r0_4"(#loc18)) +#loc135 = loc("r0_3"(#loc19)) +#loc136 = loc("tmp0"(#loc20)) +#loc137 = loc("tmp0"(#loc21)) +#loc138 = loc("tmp2"(#loc22)) +#loc139 = loc("tmp3"(#loc23)) +#loc140 = loc("tmp3"(#loc24)) +#loc141 = loc("tmp5"(#loc25)) +#loc142 = loc("tmp6"(#loc26)) +#loc143 = loc("tmp7"(#loc27)) +#loc144 = loc("tmp7"(#loc28)) +#loc145 = loc("tmp8"(#loc29)) +#loc146 = loc("tmp8"(#loc30)) +#loc147 = loc("tmp9"(#loc31)) +#loc148 = loc("tmp10"(#loc32)) +#loc149 = loc("tmp10"(#loc33)) +#loc150 = loc("tmp10"(#loc34)) +#loc151 = loc("tmp10"(#loc35)) +#loc152 = loc("tmp10"(#loc36)) +#loc153 = loc("tmp11"(#loc37)) +#loc154 = loc("tmp12"(#loc38)) +#loc155 = loc("tmp13"(#loc39)) +#loc156 = loc("tmp14"(#loc40)) +#loc157 = loc("tmp15"(#loc41)) +#loc158 = loc("tmp16"(#loc42)) +#loc159 = loc("tmp17"(#loc43)) +#loc160 = loc("tmp18"(#loc44)) +#loc161 = loc("tmp19"(#loc45)) +#loc162 = loc("tmp20"(#loc46)) +#loc163 = loc("tmp21"(#loc47)) +#loc164 = loc("tmp22"(#loc48)) +#loc165 = loc("tmp23"(#loc49)) +#loc166 = loc("tmp24"(#loc50)) +#loc167 = loc("tmp25"(#loc51)) +#loc168 = loc("tmp26"(#loc52)) +#loc169 = loc("tmp27"(#loc53)) +#loc170 = loc("tmp28"(#loc54)) +#loc171 = loc("tmp29"(#loc55)) +#loc172 = loc("tmp30"(#loc56)) +#loc173 = loc("tmp30"(#loc57)) +#loc174 = loc("tmp30"(#loc58)) +#loc175 = loc("tmp30"(#loc59)) +#loc176 = loc("tmp30"(#loc60)) +#loc177 = loc("tmp30"(#loc61)) +#loc178 = loc("tmp31"(#loc62)) +#loc179 = loc("tmp32"(#loc63)) +#loc180 = loc("tmp33"(#loc64)) +#loc181 = loc("tmp34"(#loc65)) +#loc182 = loc("tmp35"(#loc66)) +#loc183 = loc("tmp36"(#loc67)) +#loc184 = loc("tmp37"(#loc68)) +#loc185 = loc("tmp38"(#loc69)) +#loc186 = loc("tmp39"(#loc70)) +#loc187 = loc("tmp40"(#loc71)) +#loc188 = loc("tmp41"(#loc72)) +#loc189 = loc("tmp42"(#loc73)) +#loc190 = loc("tmp43"(#loc74)) +#loc191 = loc("tmp44"(#loc75)) +#loc192 = loc("tmp47"(#loc76)) +#loc193 = loc("_tmp46"(#loc77)) +#loc194 = loc("_tmp46"(#loc78)) +#loc195 = loc("tmp46"(#loc80)) +#loc196 = loc("tmp46"(#loc81)) +#loc197 = loc("tmp48"(#loc82)) +#loc198 = loc("tmp49"(#loc83)) +#loc199 = loc("tmp50"(#loc84)) +#loc200 = loc("tmp51"(#loc85)) +#loc201 = loc("tmp52"(#loc86)) +#loc202 = loc("tmp53"(#loc87)) +#loc203 = loc("tmp54"(#loc88)) +#loc204 = loc("tmp55"(#loc89)) +#loc205 = loc("tmp56"(#loc90)) +#loc206 = loc("tmp57"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f29d409fa9c537fe0d2648d2430dcea548b52c6e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,280 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0) +#loc1 = loc(unknown) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27) +#loc79 = loc("in_ptr0"(#loc)) +#loc80 = loc("out_ptr1"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("ks0"(#loc)) +#loc83 = loc("ks1"(#loc)) +#loc84 = loc("ks2"(#loc)) +#loc85 = loc("ks3"(#loc)) +#loc86 = loc("ks4"(#loc)) +#loc87 = loc("ks5"(#loc)) +#loc88 = loc("xnumel"(#loc)) +#loc89 = loc("r0_numel"(#loc)) +#loc149 = loc("tmp46"(#loc63)) +#loc164 = loc(callsite(#loc1 at #loc149)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x1024xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x1024xi1, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1024xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked> loc(#loc92) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97) + %tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc159) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100) + %tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc160) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc103) + %tmp10_10 = tt.splat %xmask : i1 -> tensor<1x1024xi1, #blocked> loc(#loc161) + %tmp10_11 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc106) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107) + %tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1, #blocked> loc(#loc107) + %tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108) + %tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc162) + %_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x1024xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc112) + %r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc113) + %r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc114) + %tmp0_17 = arith.extsi %r0_4 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3_20 = arith.extsi %r0_3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x1024xi1, #blocked> loc(#loc115) + %tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x1024xi64, #blocked> loc(#loc116) + %tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1, #blocked> loc(#loc117) + %tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc104) + %tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1, #blocked> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1, #blocked> loc(#loc121) + %tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc122) + %tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc123) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc124) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc125) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc126) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1, #blocked> loc(#loc127) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc128) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc129) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc130) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1, #blocked> loc(#loc131) + %tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32, #blocked> loc(#loc132) + %tmp30_27 = arith.extsi %tmp30_26 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc133) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc134) + %tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc135) + %tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc136) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc137) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1, #blocked> loc(#loc138) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc139) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc140) + %tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc141) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1, #blocked> loc(#loc142) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1, #blocked> loc(#loc143) + %tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi1, #blocked> loc(#loc144) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc145) + %tmp47 = arith.addi %arg12, %tmp44 : tensor<1x1024xi64, #blocked> loc(#loc146) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc147) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc148) + scf.yield %_tmp46_31 : tensor<1x1024xi64, #blocked> loc(#loc61) + } loc(#loc110) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))): + %tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167) + tt.reduce.return %tmp46_17 : i64 loc(#loc163) + }) : (tensor<1x1024xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163) + %tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150) + %tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151) + %tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165) + %tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc74) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75) + tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc76) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.return loc(#loc78) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xmask"(#loc3)) +#loc92 = loc("r0_base"(#loc4)) +#loc93 = loc("x1"(#loc5)) +#loc94 = loc("x1"(#loc6)) +#loc95 = loc("x0"(#loc7)) +#loc96 = loc("x2"(#loc8)) +#loc97 = loc("tmp0"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp2"(#loc11)) +#loc100 = loc("tmp3"(#loc12)) +#loc101 = loc("tmp3"(#loc13)) +#loc102 = loc("tmp5"(#loc14)) +#loc103 = loc("tmp10"(#loc15)) +#loc104 = loc("tmp10"(#loc16)) +#loc105 = loc("tmp10"(#loc17)) +#loc106 = loc("tmp17"(#loc18)) +#loc107 = loc("tmp23"(#loc19)) +#loc108 = loc("tmp30"(#loc20)) +#loc109 = loc("tmp30"(#loc21)) +#loc110 = loc("_tmp46"(#loc22)) +#loc111 = loc("r0_index"(#loc23)) +#loc112 = loc("r0_mask"(#loc24)) +#loc113 = loc("r0_4"(#loc25)) +#loc114 = loc("r0_3"(#loc26)) +#loc115 = loc("tmp6"(#loc27)) +#loc116 = loc("tmp9"(#loc28)) +#loc117 = loc("tmp10"(#loc29)) +#loc118 = loc("tmp11"(#loc30)) +#loc119 = loc("tmp12"(#loc31)) +#loc120 = loc("tmp13"(#loc32)) +#loc121 = loc("tmp14"(#loc33)) +#loc122 = loc("tmp18"(#loc34)) +#loc123 = loc("tmp19"(#loc35)) +#loc124 = loc("tmp21"(#loc36)) +#loc125 = loc("tmp22"(#loc37)) +#loc126 = loc("tmp24"(#loc38)) +#loc127 = loc("tmp25"(#loc39)) +#loc128 = loc("tmp26"(#loc40)) +#loc129 = loc("tmp27"(#loc41)) +#loc130 = loc("tmp28"(#loc42)) +#loc131 = loc("tmp29"(#loc43)) +#loc132 = loc("tmp30"(#loc44)) +#loc133 = loc("tmp30"(#loc45)) +#loc134 = loc("tmp31"(#loc46)) +#loc135 = loc("tmp32"(#loc47)) +#loc136 = loc("tmp33"(#loc48)) +#loc137 = loc("tmp34"(#loc49)) +#loc138 = loc("tmp35"(#loc50)) +#loc139 = loc("tmp36"(#loc51)) +#loc140 = loc("tmp37"(#loc52)) +#loc141 = loc("tmp39"(#loc53)) +#loc142 = loc("tmp40"(#loc54)) +#loc143 = loc("tmp41"(#loc55)) +#loc144 = loc("tmp43"(#loc56)) +#loc145 = loc("tmp44"(#loc57)) +#loc146 = loc("tmp47"(#loc58)) +#loc147 = loc("_tmp46"(#loc59)) +#loc148 = loc("_tmp46"(#loc60)) +#loc150 = loc("tmp46"(#loc65)) +#loc151 = loc("tmp49"(#loc66)) +#loc152 = loc("tmp51"(#loc67)) +#loc153 = loc("tmp52"(#loc68)) +#loc154 = loc("tmp54"(#loc69)) +#loc155 = loc("tmp53"(#loc70)) +#loc156 = loc("tmp55"(#loc71)) +#loc157 = loc("tmp57"(#loc72)) +#loc158 = loc("tmp56"(#loc73)) +#loc159 = loc(fused[#loc98, #loc97]) +#loc160 = loc(fused[#loc101, #loc100]) +#loc161 = loc(fused[#loc104, #loc91]) +#loc162 = loc(fused[#loc109, #loc108]) +#loc163 = loc(callsite(#loc62 at #loc149)) +#loc165 = loc(fused[#loc154, #loc155]) +#loc166 = loc(fused[#loc157, #loc158]) +#loc167 = loc(callsite(#loc64 at #loc163)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2817e7859c7add47e4de941bc6b522c5243baa23 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":18:0) +#loc1 = loc(unknown) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:27) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("out_ptr1"(#loc)) +#loc83 = loc("out_ptr2"(#loc)) +#loc84 = loc("ks0"(#loc)) +#loc85 = loc("ks1"(#loc)) +#loc86 = loc("ks2"(#loc)) +#loc87 = loc("ks3"(#loc)) +#loc88 = loc("ks4"(#loc)) +#loc89 = loc("ks5"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc153 = loc("tmp46"(#loc65)) +#loc168 = loc(callsite(#loc1 at #loc153)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense : tensor<1x1024xi1> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc93) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94) + %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc95) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc96) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100) + %_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc102) + %r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32> loc(#loc102) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x1024xi32> loc(#loc103) + %r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc104) + %r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc105) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106) + %tmp0_11 = arith.extsi %r0_4 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc107) + %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64> loc(#loc163) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x1024xi64> loc(#loc107) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc108) + %tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x1024xi64> loc(#loc108) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109) + %tmp3_15 = arith.extsi %r0_3 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc110) + %tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x1024xi64> loc(#loc110) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc111) + %tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x1024xi64> loc(#loc111) + %tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x1024xi1> loc(#loc112) + %tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x1024xi64> loc(#loc113) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc114) + %tmp10_19 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc114) + %tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1> loc(#loc115) + %tmp10_21 = tt.splat %xmask : i1 -> tensor<1x1024xi1> loc(#loc165) + %tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x1024xi1> loc(#loc116) + %tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc117) + %tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x1024xi64> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x1024xi64> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc121) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc122) + %tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc123) + %tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc124) + %tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc125) + %tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc126) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127) + %tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1> loc(#loc127) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x1024xi1> loc(#loc128) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1> loc(#loc129) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc130) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc131) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x1024xi64> loc(#loc132) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc133) + %tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32> loc(#loc134) + %tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135) + %tmp30_26 = arith.extsi %tmp30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc136) + %tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x1024xi64> loc(#loc166) + %tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x1024xi64> loc(#loc136) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x1024xi64> loc(#loc137) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64> loc(#loc138) + %tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc139) + %tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc140) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x1024xi1> loc(#loc141) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1> loc(#loc142) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc143) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc144) + %tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x1024xi64> loc(#loc145) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1> loc(#loc146) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1> loc(#loc147) + %tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc148) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc149) + %tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x1024xi64> loc(#loc150) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x1024xi1> loc(#loc151) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc152) + scf.yield %_tmp46_31 : tensor<1x1024xi64> loc(#loc63) + } loc(#loc101) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))): + %tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171) + tt.reduce.return %tmp46_11 : i64 loc(#loc167) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc167) + %tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154) + %tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155) + %tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169) + %tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc76) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc76) + tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc77) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc78) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc78) + tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":32:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":90:35) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":24:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":28:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":44:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":45:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":48:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:94) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":49:77) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":50:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":51:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":52:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":53:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":56:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":57:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":58:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":60:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":61:92) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":62:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:38) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":69:51) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":70:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":71:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":72:92) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":73:25) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":74:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":75:24) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":76:39) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":78:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":79:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":80:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":82:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":83:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":85:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:50) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":86:8) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":87:30) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":89:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":91:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":92:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":94:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":93:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":95:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":97:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":96:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":98:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:25) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:37) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/av/cavp7xan77tfr7qytfkp6sjrgkd6hvruiaqfzkeibtl5rtagscng.py":99:4) +#loc92 = loc("tmp50"(#loc3)) +#loc93 = loc("xoffset"(#loc4)) +#loc94 = loc("xmask"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("r0_base"(#loc7)) +#loc97 = loc("x1"(#loc8)) +#loc98 = loc("x1"(#loc9)) +#loc99 = loc("x0"(#loc10)) +#loc100 = loc("x2"(#loc11)) +#loc101 = loc("_tmp46"(#loc2)) +#loc102 = loc("r0_index"(#loc12)) +#loc103 = loc("r0_mask"(#loc13)) +#loc104 = loc("r0_4"(#loc14)) +#loc105 = loc("r0_3"(#loc15)) +#loc106 = loc("tmp0"(#loc16)) +#loc107 = loc("tmp0"(#loc17)) +#loc108 = loc("tmp2"(#loc18)) +#loc109 = loc("tmp3"(#loc19)) +#loc110 = loc("tmp3"(#loc20)) +#loc111 = loc("tmp5"(#loc21)) +#loc112 = loc("tmp6"(#loc22)) +#loc113 = loc("tmp9"(#loc23)) +#loc114 = loc("tmp10"(#loc24)) +#loc115 = loc("tmp10"(#loc25)) +#loc116 = loc("tmp10"(#loc26)) +#loc117 = loc("tmp10"(#loc27)) +#loc118 = loc("tmp11"(#loc28)) +#loc119 = loc("tmp12"(#loc29)) +#loc120 = loc("tmp13"(#loc30)) +#loc121 = loc("tmp14"(#loc31)) +#loc122 = loc("tmp17"(#loc32)) +#loc123 = loc("tmp18"(#loc33)) +#loc124 = loc("tmp19"(#loc34)) +#loc125 = loc("tmp21"(#loc35)) +#loc126 = loc("tmp22"(#loc36)) +#loc127 = loc("tmp23"(#loc37)) +#loc128 = loc("tmp24"(#loc38)) +#loc129 = loc("tmp25"(#loc39)) +#loc130 = loc("tmp26"(#loc40)) +#loc131 = loc("tmp27"(#loc41)) +#loc132 = loc("tmp28"(#loc42)) +#loc133 = loc("tmp29"(#loc43)) +#loc134 = loc("tmp30"(#loc44)) +#loc135 = loc("tmp30"(#loc45)) +#loc136 = loc("tmp30"(#loc46)) +#loc137 = loc("tmp30"(#loc47)) +#loc138 = loc("tmp31"(#loc48)) +#loc139 = loc("tmp32"(#loc49)) +#loc140 = loc("tmp33"(#loc50)) +#loc141 = loc("tmp34"(#loc51)) +#loc142 = loc("tmp35"(#loc52)) +#loc143 = loc("tmp36"(#loc53)) +#loc144 = loc("tmp37"(#loc54)) +#loc145 = loc("tmp39"(#loc55)) +#loc146 = loc("tmp40"(#loc56)) +#loc147 = loc("tmp41"(#loc57)) +#loc148 = loc("tmp43"(#loc58)) +#loc149 = loc("tmp44"(#loc59)) +#loc150 = loc("tmp47"(#loc60)) +#loc151 = loc("_tmp46"(#loc61)) +#loc152 = loc("_tmp46"(#loc62)) +#loc154 = loc("tmp46"(#loc67)) +#loc155 = loc("tmp49"(#loc68)) +#loc156 = loc("tmp51"(#loc69)) +#loc157 = loc("tmp52"(#loc70)) +#loc158 = loc("tmp54"(#loc71)) +#loc159 = loc("tmp53"(#loc72)) +#loc160 = loc("tmp55"(#loc73)) +#loc161 = loc("tmp57"(#loc74)) +#loc162 = loc("tmp56"(#loc75)) +#loc163 = loc(fused[#loc107, #loc106]) +#loc164 = loc(fused[#loc110, #loc109]) +#loc165 = loc(fused[#loc116, #loc94]) +#loc166 = loc(fused[#loc136, #loc135]) +#loc167 = loc(callsite(#loc64 at #loc153)) +#loc169 = loc(fused[#loc158, #loc159]) +#loc170 = loc(fused[#loc161, #loc162]) +#loc171 = loc(callsite(#loc66 at #loc167)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3b73c93788fad7b33c16ae1f42835ba33c8aec81 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..22fe158cc69cf8b6ee796f5447ecfa97fb57329f Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d148fe77f7ed5dfed5ec9d0346e7b78ee43e0163 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "20789ed347536015de365783d5f81371fbd0587b607eff6d47e815134479247e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..d34b77c004bbc63d8a56255f40cf30cf754413ae --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,393 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = icmp samesign ult i32 %8, 32, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 15, !dbg !9 + %12 = and i32 %8, 15, !dbg !10 + %13 = lshr i32 %8, 4, !dbg !11 + %14 = mul nuw nsw i32 %11, 17, !dbg !12 + %15 = add nuw nsw i32 %14, %12, !dbg !13 + %16 = mul i32 %13, 272, !dbg !14 + %17 = add i32 %15, %16, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %22 = lshr i32 %10, 1, !dbg !18 + %.lobit = and i32 %22, 1, !dbg !18 + %23 = and i32 %10, 1, !dbg !18 + %24 = lshr i32 %10, 2, !dbg !18 + %.lobit1 = and i32 %24, 1, !dbg !18 + %25 = lshr i32 %10, 3, !dbg !18 + %.lobit2 = and i32 %25, 1, !dbg !18 + %26 = xor i32 %23, 1, !dbg !22 + %27 = xor i32 %.lobit, 1, !dbg !22 + %28 = xor i32 %.lobit1, 1, !dbg !22 + %29 = xor i32 %.lobit2, 1, !dbg !22 + %30 = mul nuw nsw i32 %20, %26, !dbg !23 + %31 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %30, i32 1, i32 31), !dbg !24 + %32 = add i32 %31, %30, !dbg !27 + %33 = mul nuw nsw i32 %20, %23, !dbg !28 + %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 1, i32 31), !dbg !24 + %35 = add i32 %34, %33, !dbg !27 + %36 = mul nuw nsw i32 %26, %11, !dbg !29 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 1, i32 31), !dbg !24 + %38 = add i32 %37, %36, !dbg !27 + %39 = mul nuw nsw i32 %11, %23, !dbg !30 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !24 + %41 = add i32 %40, %39, !dbg !27 + %42 = trunc i32 %22 to i1, !dbg !31 + %43 = icmp sge i32 %32, %35, !dbg !31 + %44 = icmp ne i32 %32, %35, !dbg !31 + %45 = icmp sle i32 %38, %41, !dbg !31 + %46 = or i1 %44, %45, !dbg !31 + %47 = and i1 %43, %46, !dbg !31 + %.not = xor i1 %47, %42, !dbg !31 + %48 = xor i32 %35, %32, !dbg !32 + %49 = select i1 %.not, i32 0, i32 %48, !dbg !33 + %50 = xor i32 %49, %20, !dbg !34 + %51 = xor i32 %41, %38, !dbg !35 + %52 = select i1 %.not, i32 0, i32 %51, !dbg !36 + %53 = xor i32 %52, %11, !dbg !37 + %54 = mul nuw nsw i32 %50, %27, !dbg !23 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 2, i32 31), !dbg !24 + %56 = add i32 %54, %55, !dbg !27 + %57 = mul nuw nsw i32 %50, %.lobit, !dbg !28 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !24 + %59 = add i32 %57, %58, !dbg !27 + %60 = mul nuw nsw i32 %53, %27, !dbg !29 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !24 + %62 = add i32 %60, %61, !dbg !27 + %63 = mul nuw nsw i32 %53, %.lobit, !dbg !30 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !24 + %65 = add i32 %63, %64, !dbg !27 + %66 = trunc i32 %24 to i1, !dbg !31 + %67 = icmp sge i32 %56, %59, !dbg !31 + %68 = icmp ne i32 %56, %59, !dbg !31 + %69 = icmp sle i32 %62, %65, !dbg !31 + %70 = or i1 %68, %69, !dbg !31 + %71 = and i1 %67, %70, !dbg !31 + %.not3 = xor i1 %71, %66, !dbg !31 + %72 = xor i32 %56, %59, !dbg !32 + %73 = select i1 %.not3, i32 0, i32 %72, !dbg !33 + %74 = xor i32 %73, %50, !dbg !34 + %75 = xor i32 %62, %65, !dbg !35 + %76 = select i1 %.not3, i32 0, i32 %75, !dbg !36 + %77 = xor i32 %76, %53, !dbg !37 + %78 = mul nuw nsw i32 %74, %26, !dbg !23 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !24 + %80 = add i32 %78, %79, !dbg !27 + %81 = mul nuw nsw i32 %74, %23, !dbg !28 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 1, i32 31), !dbg !24 + %83 = add i32 %81, %82, !dbg !27 + %84 = mul nuw nsw i32 %77, %26, !dbg !29 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !24 + %86 = add i32 %84, %85, !dbg !27 + %87 = mul nuw nsw i32 %77, %23, !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !24 + %89 = add i32 %87, %88, !dbg !27 + %90 = icmp sge i32 %80, %83, !dbg !31 + %91 = icmp ne i32 %80, %83, !dbg !31 + %92 = icmp sle i32 %86, %89, !dbg !31 + %93 = or i1 %91, %92, !dbg !31 + %94 = and i1 %90, %93, !dbg !31 + %.not4 = xor i1 %94, %66, !dbg !31 + %95 = xor i32 %80, %83, !dbg !32 + %96 = select i1 %.not4, i32 0, i32 %95, !dbg !33 + %97 = xor i32 %96, %74, !dbg !34 + %98 = xor i32 %86, %89, !dbg !35 + %99 = select i1 %.not4, i32 0, i32 %98, !dbg !36 + %100 = xor i32 %99, %77, !dbg !37 + %101 = mul nuw nsw i32 %97, %28, !dbg !23 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 4, i32 31), !dbg !24 + %103 = add i32 %101, %102, !dbg !27 + %104 = mul nuw nsw i32 %97, %.lobit1, !dbg !28 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !24 + %106 = add i32 %104, %105, !dbg !27 + %107 = mul nuw nsw i32 %100, %28, !dbg !29 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !24 + %109 = add i32 %107, %108, !dbg !27 + %110 = mul nuw nsw i32 %100, %.lobit1, !dbg !30 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !24 + %112 = add i32 %110, %111, !dbg !27 + %113 = trunc i32 %25 to i1, !dbg !31 + %114 = icmp sge i32 %103, %106, !dbg !31 + %115 = icmp ne i32 %103, %106, !dbg !31 + %116 = icmp sle i32 %109, %112, !dbg !31 + %117 = or i1 %115, %116, !dbg !31 + %118 = and i1 %114, %117, !dbg !31 + %.not5 = xor i1 %118, %113, !dbg !31 + %119 = xor i32 %103, %106, !dbg !32 + %120 = select i1 %.not5, i32 0, i32 %119, !dbg !33 + %121 = xor i32 %120, %97, !dbg !34 + %122 = xor i32 %109, %112, !dbg !35 + %123 = select i1 %.not5, i32 0, i32 %122, !dbg !36 + %124 = xor i32 %123, %100, !dbg !37 + %125 = mul nuw nsw i32 %121, %27, !dbg !23 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 2, i32 31), !dbg !24 + %127 = add i32 %125, %126, !dbg !27 + %128 = mul nuw nsw i32 %121, %.lobit, !dbg !28 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !24 + %130 = add i32 %128, %129, !dbg !27 + %131 = mul nuw nsw i32 %124, %27, !dbg !29 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !24 + %133 = add i32 %131, %132, !dbg !27 + %134 = mul nuw nsw i32 %124, %.lobit, !dbg !30 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !24 + %136 = add i32 %134, %135, !dbg !27 + %137 = icmp sge i32 %127, %130, !dbg !31 + %138 = icmp ne i32 %127, %130, !dbg !31 + %139 = icmp sle i32 %133, %136, !dbg !31 + %140 = or i1 %138, %139, !dbg !31 + %141 = and i1 %137, %140, !dbg !31 + %.not6 = xor i1 %141, %113, !dbg !31 + %142 = xor i32 %127, %130, !dbg !32 + %143 = select i1 %.not6, i32 0, i32 %142, !dbg !33 + %144 = xor i32 %143, %121, !dbg !34 + %145 = xor i32 %133, %136, !dbg !35 + %146 = select i1 %.not6, i32 0, i32 %145, !dbg !36 + %147 = xor i32 %146, %124, !dbg !37 + %148 = mul nuw nsw i32 %144, %26, !dbg !23 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 1, i32 31), !dbg !24 + %150 = add i32 %148, %149, !dbg !27 + %151 = mul nuw nsw i32 %144, %23, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 1, i32 31), !dbg !24 + %153 = add i32 %151, %152, !dbg !27 + %154 = mul nuw nsw i32 %147, %26, !dbg !29 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !24 + %156 = add i32 %154, %155, !dbg !27 + %157 = mul nuw nsw i32 %147, %23, !dbg !30 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !24 + %159 = add i32 %157, %158, !dbg !27 + %160 = icmp sge i32 %150, %153, !dbg !31 + %161 = icmp ne i32 %150, %153, !dbg !31 + %162 = icmp sle i32 %156, %159, !dbg !31 + %163 = or i1 %161, %162, !dbg !31 + %164 = and i1 %160, %163, !dbg !31 + %.not7 = xor i1 %164, %113, !dbg !31 + %165 = xor i32 %150, %153, !dbg !32 + %166 = select i1 %.not7, i32 0, i32 %165, !dbg !33 + %167 = xor i32 %166, %144, !dbg !34 + %168 = xor i32 %156, %159, !dbg !35 + %169 = select i1 %.not7, i32 0, i32 %168, !dbg !36 + %170 = xor i32 %169, %147, !dbg !37 + %171 = mul nuw nsw i32 %167, %29, !dbg !23 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 8, i32 31), !dbg !24 + %173 = add i32 %171, %172, !dbg !27 + %174 = mul nuw nsw i32 %167, %.lobit2, !dbg !28 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 8, i32 31), !dbg !24 + %176 = add i32 %174, %175, !dbg !27 + %177 = mul nuw nsw i32 %170, %29, !dbg !29 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !24 + %179 = add i32 %177, %178, !dbg !27 + %180 = mul nuw nsw i32 %170, %.lobit2, !dbg !30 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !24 + %182 = add i32 %180, %181, !dbg !27 + %183 = icmp slt i32 %173, %176, !dbg !38 + %184 = icmp eq i32 %173, %176, !dbg !39 + %185 = icmp sgt i32 %179, %182, !dbg !40 + %186 = and i1 %184, %185, !dbg !41 + %187 = or i1 %183, %186, !dbg !42 + %188 = xor i32 %173, %176, !dbg !32 + %189 = select i1 %187, i32 %188, i32 0, !dbg !33 + %190 = xor i32 %189, %167, !dbg !34 + %191 = xor i32 %179, %182, !dbg !35 + %192 = select i1 %187, i32 %191, i32 0, !dbg !36 + %193 = xor i32 %192, %170, !dbg !37 + %194 = mul nuw nsw i32 %190, %28, !dbg !23 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !24 + %196 = add i32 %194, %195, !dbg !27 + %197 = mul nuw nsw i32 %190, %.lobit1, !dbg !28 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !24 + %199 = add i32 %197, %198, !dbg !27 + %200 = mul nuw nsw i32 %193, %28, !dbg !29 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !24 + %202 = add i32 %200, %201, !dbg !27 + %203 = mul nuw nsw i32 %193, %.lobit1, !dbg !30 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !24 + %205 = add i32 %203, %204, !dbg !27 + %206 = icmp slt i32 %196, %199, !dbg !38 + %207 = icmp eq i32 %196, %199, !dbg !39 + %208 = icmp sgt i32 %202, %205, !dbg !40 + %209 = and i1 %207, %208, !dbg !41 + %210 = or i1 %206, %209, !dbg !42 + %211 = xor i32 %196, %199, !dbg !32 + %212 = select i1 %210, i32 %211, i32 0, !dbg !33 + %213 = xor i32 %212, %190, !dbg !34 + %214 = xor i32 %202, %205, !dbg !35 + %215 = select i1 %210, i32 %214, i32 0, !dbg !36 + %216 = xor i32 %215, %193, !dbg !37 + %217 = mul nuw nsw i32 %213, %27, !dbg !23 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 2, i32 31), !dbg !24 + %219 = add i32 %217, %218, !dbg !27 + %220 = mul nuw nsw i32 %213, %.lobit, !dbg !28 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !24 + %222 = add i32 %220, %221, !dbg !27 + %223 = mul nuw nsw i32 %216, %27, !dbg !29 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !24 + %225 = add i32 %223, %224, !dbg !27 + %226 = mul nuw nsw i32 %216, %.lobit, !dbg !30 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !24 + %228 = add i32 %226, %227, !dbg !27 + %229 = icmp slt i32 %219, %222, !dbg !38 + %230 = icmp eq i32 %219, %222, !dbg !39 + %231 = icmp sgt i32 %225, %228, !dbg !40 + %232 = and i1 %230, %231, !dbg !41 + %233 = or i1 %229, %232, !dbg !42 + %234 = xor i32 %219, %222, !dbg !32 + %235 = select i1 %233, i32 %234, i32 0, !dbg !33 + %236 = xor i32 %235, %213, !dbg !34 + %237 = xor i32 %225, %228, !dbg !35 + %238 = select i1 %233, i32 %237, i32 0, !dbg !36 + %239 = xor i32 %238, %216, !dbg !37 + %240 = mul nuw nsw i32 %236, %26, !dbg !23 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !24 + %242 = add i32 %240, %241, !dbg !27 + %243 = mul nuw nsw i32 %236, %23, !dbg !28 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !24 + %245 = add i32 %243, %244, !dbg !27 + %246 = mul nuw nsw i32 %239, %26, !dbg !29 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !24 + %248 = add i32 %246, %247, !dbg !27 + %249 = mul nuw nsw i32 %239, %23, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !24 + %251 = add i32 %249, %250, !dbg !27 + %252 = icmp slt i32 %242, %245, !dbg !38 + %253 = icmp eq i32 %242, %245, !dbg !39 + %254 = icmp sgt i32 %248, %251, !dbg !40 + %255 = and i1 %253, %254, !dbg !41 + %256 = or i1 %252, %255, !dbg !42 + %257 = xor i32 %248, %251, !dbg !35 + %258 = select i1 %256, i32 %257, i32 0, !dbg !36 + %259 = xor i32 %258, %239, !dbg !37 + %narrow = select i1 %9, i32 %21, i32 0, !dbg !43 + %260 = sext i32 %narrow to i64, !dbg !43 + %261 = ashr i32 %narrow, 31, !dbg !44 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %narrow, i32 8, i32 31), !dbg !44 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !44 + %264 = insertelement <2 x i32> poison, i32 %262, i64 0, !dbg !44 + %265 = insertelement <2 x i32> %264, i32 %263, i64 1, !dbg !44 + %266 = bitcast <2 x i32> %265 to i64, !dbg !44 + %267 = add i64 %266, %260, !dbg !46 + %extelt.offset = lshr i64 %267, 32, !dbg !44 + %268 = trunc nuw i64 %extelt.offset to i32, !dbg !44 + %269 = trunc i64 %267 to i32, !dbg !44 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !44 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !44 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !44 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !44 + %274 = bitcast <2 x i32> %273 to i64, !dbg !44 + %275 = add i64 %267, %274, !dbg !46 + %extelt.offset8 = lshr i64 %275, 32, !dbg !44 + %276 = trunc nuw i64 %extelt.offset8 to i32, !dbg !44 + %277 = trunc i64 %275 to i32, !dbg !44 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 2, i32 31), !dbg !44 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !44 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !44 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !44 + %282 = bitcast <2 x i32> %281 to i64, !dbg !44 + %283 = add i64 %275, %282, !dbg !46 + %extelt.offset9 = lshr i64 %283, 32, !dbg !44 + %284 = trunc nuw i64 %extelt.offset9 to i32, !dbg !44 + %285 = trunc i64 %283 to i32, !dbg !44 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 1, i32 31), !dbg !44 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 1, i32 31), !dbg !44 + %288 = insertelement <2 x i32> poison, i32 %286, i64 0, !dbg !44 + %289 = insertelement <2 x i32> %288, i32 %287, i64 1, !dbg !44 + %290 = bitcast <2 x i32> %289 to i64, !dbg !44 + %291 = add i64 %283, %290, !dbg !46 + %292 = trunc i64 %291 to i32, !dbg !47 + %293 = shl i32 %8, 4, !dbg !48 + %294 = or disjoint i32 %11, %293, !dbg !49 + %295 = sext i32 %294 to i64, !dbg !50 + %296 = getelementptr i32, ptr addrspace(1) %1, i64 %295, !dbg !50 + %297 = and i32 %10, 48, !dbg !51 + %298 = icmp eq i32 %297, 0, !dbg !51 + %299 = and i1 %9, %298, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %259, ptr addrspace(1) %296, i1 %299) #3, !dbg !51 + %300 = zext nneg i32 %8 to i64, !dbg !52 + %301 = getelementptr i32, ptr addrspace(1) %2, i64 %300, !dbg !52 + %302 = and i32 %10, 63, !dbg !53 + %303 = icmp eq i32 %302, 0, !dbg !53 + %304 = and i1 %9, %303, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %292, ptr addrspace(1) %301, i1 %304) #3, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 21, scope: !4) +!9 = !DILocation(line: 27, column: 38, scope: !4) +!10 = !DILocation(line: 33, column: 19, scope: !4) +!11 = !DILocation(line: 34, column: 19, scope: !4) +!12 = !DILocation(line: 36, column: 38, scope: !4) +!13 = !DILocation(line: 36, column: 35, scope: !4) +!14 = !DILocation(line: 36, column: 49, scope: !4) +!15 = !DILocation(line: 36, column: 45, scope: !4) +!16 = !DILocation(line: 36, column: 30, scope: !4) +!17 = !DILocation(line: 36, column: 54, scope: !4) +!18 = !DILocation(line: 627, column: 44, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 41, column: 67, scope: !4) +!22 = !DILocation(line: 537, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 538, column: 40, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !21) +!25 = distinct !DILexicalBlockFile(scope: !4, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!27 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !21) +!28 = !DILocation(line: 539, column: 41, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 548, column: 23, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 551, column: 23, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 599, column: 28, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 600, column: 38, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 600, column: 46, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 600, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 601, column: 48, scope: !19, inlinedAt: !21) +!36 = !DILocation(line: 601, column: 59, scope: !19, inlinedAt: !21) +!37 = !DILocation(line: 601, column: 22, scope: !19, inlinedAt: !21) +!38 = !DILocation(line: 574, column: 22, scope: !19, inlinedAt: !21) +!39 = !DILocation(line: 591, column: 21, scope: !19, inlinedAt: !21) +!40 = !DILocation(line: 594, column: 40, scope: !19, inlinedAt: !21) +!41 = !DILocation(line: 594, column: 29, scope: !19, inlinedAt: !21) +!42 = !DILocation(line: 594, column: 23, scope: !19, inlinedAt: !21) +!43 = !DILocation(line: 44, column: 34, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !45) +!45 = !DILocation(line: 45, column: 26, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !45) +!47 = !DILocation(line: 48, column: 21, scope: !4) +!48 = !DILocation(line: 49, column: 35, scope: !4) +!49 = !DILocation(line: 49, column: 32, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 47, scope: !4) +!52 = !DILocation(line: 50, column: 25, scope: !4) +!53 = !DILocation(line: 50, column: 37, scope: !4) +!54 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3f1b67f349c815ec48e94bc35715d1311405f2ad --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,863 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<64>; + .reg .b32 %r<224>; + .reg .b64 %rd<26>; + .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:18:0 + +// %bb.0: + ld.param.b64 %rd5, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:24:28 + mov.u32 %r5, %ctaid.x; + .loc 1 26 21 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:26:21 + setp.lt.u32 %p1, %r5, 32; + ld.param.b64 %rd7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 27 38 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:27:38 + mov.u32 %r6, %tid.x; + and.b32 %r7, %r6, 15; + .loc 1 33 19 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:33:19 + and.b32 %r8, %r5, 15; + .loc 1 34 19 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:34:19 + shr.u32 %r9, %r5, 4; + .loc 1 36 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:35 + mad.lo.s32 %r10, %r7, 17, %r8; + .loc 1 36 45 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:45 + mad.lo.s32 %r11, %r9, 272, %r10; + .loc 1 36 30 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:30 + mad.wide.s32 %rd1, %r11, 4, %rd5; + .loc 1 36 54 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shr.u32 %r12, %r6, 1; + bfe.u32 %r13, %r6, 1, 1; + and.b32 %r14, %r6, 1; + shr.u32 %r15, %r6, 2; + bfe.u32 %r16, %r6, 2, 1; + shr.u32 %r17, %r6, 3; + bfe.u32 %r18, %r6, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r19, %r14, 1; + xor.b32 %r20, %r13, 1; + xor.b32 %r21, %r16, 1; + xor.b32 %r22, %r18, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r23, %r1, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r24, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r25, %r24, %r23; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r26, %r1, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r27, %r26, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r28, %r27, %r26; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r29, %r19, %r7; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r31, %r30, %r29; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r32, %r7, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r34, %r33, %r32; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.b32 %r35, %r12, 1; + setp.ne.b32 %p5, %r35, 0; + setp.ge.s32 %p6, %r25, %r28; + setp.ne.b32 %p7, %r25, %r28; + setp.le.s32 %p8, %r31, %r34; + or.pred %p9, %p7, %p8; + and.pred %p10, %p6, %p9; + xor.pred %p11, %p10, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r36, %r28, %r25; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r37, 0, %r36, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r38, %r37, %r1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r39, %r34, %r31; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r40, 0, %r39, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r41, %r40, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r42, %r38, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r43, %r42, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r44, %r42, %r43; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r45, %r38, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r47, %r45, %r46; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r48, %r41, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r49, %r48, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r50, %r48, %r49; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r51, %r41, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r52, %r51, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r53, %r51, %r52; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.b32 %r54, %r15, 1; + setp.ne.b32 %p12, %r54, 0; + setp.ge.s32 %p13, %r44, %r47; + setp.ne.b32 %p14, %r44, %r47; + setp.le.s32 %p15, %r50, %r53; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r55, %r44, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r56, 0, %r55, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r57, %r56, %r38; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r58, %r50, %r53; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r59, 0, %r58, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r60, %r59, %r41; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r61, %r57, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r63, %r61, %r62; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r64, %r57, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r66, %r64, %r65; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r67, %r60, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r68, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r69, %r67, %r68; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r70, %r60, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r72, %r70, %r71; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.ge.s32 %p19, %r63, %r66; + setp.ne.b32 %p20, %r63, %r66; + setp.le.s32 %p21, %r69, %r72; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r73, %r63, %r66; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r74, 0, %r73, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r75, %r74, %r57; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r76, %r69, %r72; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r77, 0, %r76, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r78, %r77, %r60; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r79, %r75, %r21; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r80, %r79, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r81, %r79, %r80; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r82, %r75, %r16; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r84, %r82, %r83; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r85, %r78, %r21; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r86, %r85, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r87, %r85, %r86; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r88, %r78, %r16; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r90, %r88, %r89; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.b32 %r91, %r17, 1; + setp.ne.b32 %p25, %r91, 0; + setp.ge.s32 %p26, %r81, %r84; + setp.ne.b32 %p27, %r81, %r84; + setp.le.s32 %p28, %r87, %r90; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r92, %r81, %r84; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r93, 0, %r92, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r94, %r93, %r75; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r95, %r87, %r90; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r96, 0, %r95, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r97, %r96, %r78; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r98, %r94, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r100, %r98, %r99; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r101, %r94, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r103, %r101, %r102; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r104, %r97, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r105, %r104, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r106, %r104, %r105; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r107, %r97, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r108, %r107, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r109, %r107, %r108; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.ge.s32 %p32, %r100, %r103; + setp.ne.b32 %p33, %r100, %r103; + setp.le.s32 %p34, %r106, %r109; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r110, %r100, %r103; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r111, 0, %r110, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r112, %r111, %r94; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r113, %r106, %r109; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r114, 0, %r113, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r115, %r114, %r97; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r116, %r112, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r118, %r116, %r117; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r119, %r112, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r121, %r119, %r120; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r122, %r115, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r124, %r122, %r123; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r125, %r115, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r127, %r125, %r126; + .loc 2 599 28 // triton_helpers.py:599:28 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.ge.s32 %p38, %r118, %r121; + setp.ne.b32 %p39, %r118, %r121; + setp.le.s32 %p40, %r124, %r127; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r128, %r118, %r121; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r129, 0, %r128, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r130, %r129, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r131, %r124, %r127; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r132, 0, %r131, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r133, %r132, %r115; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r134, %r130, %r22; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r136, %r134, %r135; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r137, %r130, %r18; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r138, %r137, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r139, %r137, %r138; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r140, %r133, %r22; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r142, %r140, %r141; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r143, %r133, %r18; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r144, %r143, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r145, %r143, %r144; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p44, %r136, %r139; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p45, %r136, %r139; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p46, %r142, %r145; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r146, %r136, %r139; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r147, %r146, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r148, %r147, %r130; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r149, %r142, %r145; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r150, %r149, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r151, %r150, %r133; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r152, %r148, %r21; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r153, %r152, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r154, %r152, %r153; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r155, %r148, %r16; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r155, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r157, %r155, %r156; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r158, %r151, %r21; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r159, %r158, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r160, %r158, %r159; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r161, %r151, %r16; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r162, %r161, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r163, %r161, %r162; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p49, %r154, %r157; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p50, %r154, %r157; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p51, %r160, %r163; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r164, %r154, %r157; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r165, %r164, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r166, %r165, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r167, %r160, %r163; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r168, %r167, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r169, %r168, %r151; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r170, %r166, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r171, %r170, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r172, %r170, %r171; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r173, %r166, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r174, %r173, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r175, %r173, %r174; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r176, %r169, %r20; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r177, %r176, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r178, %r176, %r177; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r179, %r169, %r13; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r180, %r179, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r181, %r179, %r180; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p54, %r172, %r175; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p55, %r172, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p56, %r178, %r181; + .loc 2 594 29 // triton_helpers.py:594:29 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r182, %r172, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r183, %r182, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r184, %r183, %r166; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r185, %r178, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r186, %r185, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r187, %r186, %r169; + .loc 2 538 40 // triton_helpers.py:538:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r188, %r184, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r189, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r190, %r188, %r189; + .loc 2 539 41 // triton_helpers.py:539:41 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r191, %r184, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r193, %r191, %r192; + .loc 2 548 23 // triton_helpers.py:548:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r194, %r187, %r19; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r195, %r194, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r196, %r194, %r195; + .loc 2 551 23 // triton_helpers.py:551:23 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + mul.lo.s32 %r197, %r187, %r14; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + shfl.sync.bfly.b32 %r198, %r197, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + add.s32 %r199, %r197, %r198; + .loc 2 574 22 // triton_helpers.py:574:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.lt.s32 %p59, %r190, %r193; + .loc 2 591 21 // triton_helpers.py:591:21 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.eq.b32 %p60, %r190, %r193; + .loc 2 594 40 // triton_helpers.py:594:40 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + setp.gt.s32 %p61, %r196, %r199; + .loc 2 601 48 // triton_helpers.py:601:48 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r200, %r196, %r199; + .loc 2 601 59 // triton_helpers.py:601:59 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + selp.b32 %r201, %r200, 0, %p61; + selp.b32 %r202, %r201, 0, %p60; + selp.b32 %r203, %r200, %r202, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:41:67 ] + xor.b32 %r3, %r203, %r187; +$L__tmp2: + .loc 1 44 34 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:44:34 + selp.b32 %r204, %r2, 0, %p1; + cvt.s64.s32 %rd8, %r204; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + shr.s32 %r205, %r204, 31; + shfl.sync.bfly.b32 %r206, %r204, 8, 31, -1; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + cvt.u64.u32 %rd9, %r206; + cvt.u64.u32 %rd10, %r207; + shl.b64 %rd11, %rd10, 32; + or.b64 %rd12, %rd9, %rd11; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd13, %rd12, %rd8; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + mov.b64 {_, %r208}, %rd13; + cvt.u32.u64 %r209, %rd13; + shfl.sync.bfly.b32 %r210, %r209, 4, 31, -1; + shfl.sync.bfly.b32 %r211, %r208, 4, 31, -1; + cvt.u64.u32 %rd14, %r210; + cvt.u64.u32 %rd15, %r211; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + mov.b64 {_, %r212}, %rd18; + cvt.u32.u64 %r213, %rd18; + shfl.sync.bfly.b32 %r214, %r213, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r212, 2, 31, -1; + cvt.u64.u32 %rd19, %r214; + cvt.u64.u32 %rd20, %r215; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd23, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + mov.b64 {_, %r216}, %rd23; + cvt.u32.u64 %r217, %rd23; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + shfl.sync.bfly.b32 %r219, %r216, 1, 31, -1; + cvt.u64.u32 %rd24, %r218; + .loc 3 261 15 // standard.py:261:15 @[ chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:45:26 ] + add.s64 %rd25, %rd23, %rd24; +$L__tmp4: + .loc 1 48 21 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:48:21 + cvt.u32.u64 %r4, %rd25; + .loc 1 49 35 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:35 + shl.b32 %r220, %r5, 4; + .loc 1 49 32 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:32 + or.b32 %r221, %r7, %r220; + .loc 1 49 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:25 + mad.wide.s32 %rd3, %r221, 4, %rd6; + .loc 1 49 47 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:49:47 + and.b32 %r222, %r6, 48; + setp.eq.b32 %p62, %r222, 0; + and.pred %p3, %p1, %p62; + // begin inline asm + @%p3 st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 50 25 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:25 + mad.wide.u32 %rd4, %r5, 4, %rd7; + .loc 1 50 37 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:37 + and.b32 %r223, %r6, 63; + setp.eq.b32 %p63, %r223, 0; + and.pred %p4, %p1, %p63; + // begin inline asm + @%p4 st.global.b32 [ %rd4 + 0 ], { %r4 }; + // end inline asm + .loc 1 50 4 // chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 120 +.b8 110 +.b8 121 +.b8 103 +.b8 112 +.b8 118 +.b8 112 +.b8 109 +.b8 118 +.b8 114 +.b8 50 +.b8 109 +.b8 120 +.b8 50 +.b8 101 +.b8 54 +.b8 109 +.b8 119 +.b8 103 +.b8 100 +.b8 101 +.b8 111 +.b8 106 +.b8 116 +.b8 104 +.b8 114 +.b8 105 +.b8 114 +.b8 110 +.b8 111 +.b8 103 +.b8 55 +.b8 110 +.b8 109 +.b8 113 +.b8 54 +.b8 109 +.b8 99 +.b8 115 +.b8 105 +.b8 51 +.b8 119 +.b8 118 +.b8 101 +.b8 103 +.b8 118 +.b8 105 +.b8 50 +.b8 115 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 120 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..25cc90bdc300b35fee97a382b78dfc708b28e960 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1216 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc87 = loc(unknown) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc139 = loc("in_ptr0"(#loc)) +#loc140 = loc("out_ptr2"(#loc)) +#loc141 = loc("out_ptr3"(#loc)) +#loc142 = loc("xnumel"(#loc)) +#loc143 = loc("r0_numel"(#loc)) +#loc172 = loc("x"(#loc37)) +#loc173 = loc("idxs"(#loc37)) +#loc174 = loc("x"(#loc41)) +#loc175 = loc("idxs"(#loc41)) +#loc180 = loc("x"(#loc49)) +#loc181 = loc("idxs"(#loc49)) +#loc182 = loc("flip"(#loc49)) +#loc238 = loc("input"(#loc112)) +#loc239 = loc("a"(#loc116)) +#loc240 = loc("b"(#loc116)) +#loc242 = loc("x"(#loc121)) +#loc243 = loc("x"(#loc125)) +#loc244 = loc("input"(#loc134)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc144) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc145) + %xoffset = tt.get_program_id x : i32 loc(#loc146) + %xoffset_2 = arith.constant 1 : i32 loc(#loc147) + %xoffset_3 = arith.constant 1 : i32 loc(#loc147) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc147) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc148) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc149) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc150) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc150) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc151) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc151) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc152) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc153) + %r0_offset = arith.constant 0 : i32 loc(#loc154) + %r0_mask = arith.constant true loc(#loc155) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc155) + %x0 = arith.constant 16 : i32 loc(#loc156) + %x0_11 = arith.constant 16 : i32 loc(#loc156) + %x0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc156) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<1x1xi32> loc(#loc156) + %x1 = arith.constant 16 : i32 loc(#loc157) + %x1_14 = arith.constant 16 : i32 loc(#loc157) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc157) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<1x1xi32> loc(#loc157) + %tmp0 = arith.constant 17 : i32 loc(#loc158) + %tmp0_17 = arith.constant 17 : i32 loc(#loc158) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc158) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc158) + %tmp0_20 = tt.broadcast %x0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc159) + %tmp0_21 = arith.addi %tmp0_20, %tmp0_19 : tensor<1x16xi32> loc(#loc159) + %tmp0_22 = arith.constant 272 : i32 loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc160) + %tmp0_24 = arith.constant dense<272> : tensor<1x1xi32> loc(#loc160) + %tmp0_25 = arith.muli %tmp0_24, %x1_16 : tensor<1x1xi32> loc(#loc160) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc161) + %tmp0_27 = arith.addi %tmp0_21, %tmp0_26 : tensor<1x16xi32> loc(#loc161) + %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc162) + %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc162) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp0_31 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc163) + %tmp0_32 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc163) + %tmp0_33 = arith.fptosi %tmp0_32 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc163) + %tmp0_34 = tt.load %tmp0_29, %tmp0_31, %tmp0_33 : tensor<1x16x!tt.ptr> loc(#loc163) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc164) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_34, %tmp2) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc22) + %tmp7 = arith.extsi %tmp0_34 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc165) + %tmp10 = arith.constant 0 : i32 loc(#loc166) + %tmp10_35 = arith.constant 0 : i64 loc(#loc166) + %tmp10_36 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc166) + %tmp10_37 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc166) + %tmp10_38 = arith.select %tmp10_37, %tmp7, %tmp10_36 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc166) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_38) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc167) + %tmp11_39 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc168) + %tmp12 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc169) + %tmp13 = arith.trunci %tmp12 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc170) + %tmp14 = arith.trunci %tmp11_39 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc171) + %c16_i32 = arith.constant 16 : i32 loc(#loc30) + %c16_i32_40 = arith.constant 16 : i32 loc(#loc30) + %cst = arith.constant dense<16> : tensor<1x1xi32> loc(#loc30) + %1 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc30) + %2 = tt.broadcast %1 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc31) + %3 = arith.addi %r0_index_9, %2 : tensor<1x16xi32> loc(#loc31) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc32) + %5 = tt.addptr %4, %3 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc32) + %6 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc33) + tt.store %5, %tmp13, %6 : tensor<1x16x!tt.ptr> loc(#loc33) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc34) + %8 = tt.addptr %7, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc34) + tt.store %8, %tmp14, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc35) + tt.return loc(#loc36) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc37)), %idxs: tensor<1x16xi16> loc("idxs"(#loc37))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc39) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc40) + %5 = ub.poison : tensor<1x16xi32> loc(#loc40) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc40) + } loc(#loc37) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi16> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc48) + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi16> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc197) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc198) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc199) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc199) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc203) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc204) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc204) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc215) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc215) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc216) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc221) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc222) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc223) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc224) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc225) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc226) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc227) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc228) + %cond_38 = arith.constant 0 : i32 loc(#loc229) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc234) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc235) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc236) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc237) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc116)), %b: i32 loc("b"(#loc116))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc117) + tt.return %0 : i32 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc119) + tt.return %1 : i32 loc(#loc119) + } loc(#loc116) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc241) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc121))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc122) + %false = arith.constant false loc(#loc123) + tt.return %false : i1 loc(#loc123) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc124) + tt.return %1 : i1 loc(#loc124) + } loc(#loc121) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc125))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc126) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc127) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc127) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc127) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc127) + tt.return %4 : tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %5 : tensor<1x16xi32> loc(#loc129) + } loc(#loc125) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc131) + %cst = arith.constant dense : tensor<1xi1> loc(#loc131) + tt.return %cst : tensor<1xi1> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc133) + tt.return %0 : tensor<1xi1> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc134))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc135) + tt.return %0 : tensor<1x16xi32> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc137) + tt.return %1 : tensor<1x16xi32> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc131) + tt.return %cst : tensor<1x16xi32> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc133) + tt.return %0 : tensor<1x16xi32> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc134))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc135) + tt.return %0 : tensor<1x16xi16> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc137) + tt.return %1 : tensor<1x16xi16> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc131) + tt.return %cst : tensor<1x16xi16> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc133) + tt.return %0 : tensor<1x16xi16> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc112))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc113) + tt.return %0 : tensor<4x2xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc115) + tt.return %1 : tensor<4x2xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc112))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc113) + tt.return %0 : tensor<2x4xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc115) + tt.return %1 : tensor<2x4xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc245) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + %5 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc112))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc113) + tt.return %0 : tensor<1x8xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc115) + tt.return %1 : tensor<1x8xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc112))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc113) + tt.reduce.return %2 : i64 loc(#loc113) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc113) + tt.return %0 : tensor<1xi64> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc115) + tt.return %1 : tensor<1xi64> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc116)), %b: i64 loc("b"(#loc116))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc117) + tt.return %0 : i64 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc119) + tt.return %1 : i64 loc(#loc119) + } loc(#loc116) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":46:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":47:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc144 = loc("xnumel"(#loc1)) +#loc145 = loc("r0_numel"(#loc2)) +#loc146 = loc("xoffset"(#loc3)) +#loc147 = loc("xoffset"(#loc4)) +#loc148 = loc("xindex"(#loc5)) +#loc149 = loc("xindex"(#loc6)) +#loc150 = loc("xindex"(#loc7)) +#loc151 = loc("xmask"(#loc8)) +#loc152 = loc("r0_index"(#loc9)) +#loc153 = loc("r0_index"(#loc10)) +#loc154 = loc("r0_offset"(#loc11)) +#loc155 = loc("r0_mask"(#loc12)) +#loc156 = loc("x0"(#loc13)) +#loc157 = loc("x1"(#loc14)) +#loc158 = loc("tmp0"(#loc15)) +#loc159 = loc("tmp0"(#loc16)) +#loc160 = loc("tmp0"(#loc17)) +#loc161 = loc("tmp0"(#loc18)) +#loc162 = loc("tmp0"(#loc19)) +#loc163 = loc("tmp0"(#loc20)) +#loc164 = loc("tmp2"(#loc21)) +#loc165 = loc("tmp7"(#loc23)) +#loc166 = loc("tmp10"(#loc24)) +#loc167 = loc("tmp11"(#loc25)) +#loc168 = loc("tmp11"(#loc26)) +#loc169 = loc("tmp12"(#loc27)) +#loc170 = loc("tmp13"(#loc28)) +#loc171 = loc("tmp14"(#loc29)) +#loc176 = loc("flip"(#loc42)) +#loc177 = loc("flip"(#loc43)) +#loc178 = loc("flip"(#loc44)) +#loc179 = loc("flip"(#loc45)) +#loc183 = loc("y"(#loc50)) +#loc184 = loc("right_mask"(#loc51)) +#loc185 = loc("right_mask"(#loc52)) +#loc186 = loc("left_mask"(#loc53)) +#loc187 = loc("ileft"(#loc54)) +#loc188 = loc("ileft"(#loc55)) +#loc189 = loc("ileft"(#loc56)) +#loc190 = loc("ileft"(#loc57)) +#loc191 = loc("iright"(#loc58)) +#loc192 = loc("iright"(#loc59)) +#loc193 = loc("iright"(#loc60)) +#loc194 = loc("iright"(#loc61)) +#loc195 = loc("ileft"(#loc62)) +#loc196 = loc("iright"(#loc63)) +#loc197 = loc("y_idx"(#loc64)) +#loc198 = loc("left_idx"(#loc65)) +#loc199 = loc("left_idx"(#loc66)) +#loc200 = loc("left_idx"(#loc67)) +#loc201 = loc("left_idx"(#loc68)) +#loc202 = loc("left_idx"(#loc69)) +#loc203 = loc("right_idx"(#loc70)) +#loc204 = loc("right_idx"(#loc71)) +#loc205 = loc("right_idx"(#loc72)) +#loc206 = loc("right_idx"(#loc73)) +#loc207 = loc("right_idx"(#loc74)) +#loc208 = loc("left_idx"(#loc75)) +#loc209 = loc("right_idx"(#loc76)) +#loc210 = loc("left_valid_mask"(#loc77)) +#loc211 = loc("right_valid_mask"(#loc78)) +#loc212 = loc("left_isnan"(#loc79)) +#loc213 = loc("right_isnan"(#loc80)) +#loc214 = loc("cond"(#loc81)) +#loc215 = loc("cond"(#loc84)) +#loc216 = loc("cond"(#loc85)) +#loc217 = loc("cond"(#loc86)) +#loc218 = loc("eq"(#loc88)) +#loc219 = loc("eq"(#loc91)) +#loc220 = loc("eq"(#loc92)) +#loc221 = loc("cond"(#loc93)) +#loc222 = loc("cond"(#loc94)) +#loc223 = loc("cond"(#loc95)) +#loc224 = loc("cond"(#loc96)) +#loc225 = loc("cond"(#loc97)) +#loc226 = loc("cond"(#loc98)) +#loc227 = loc("cond"(#loc99)) +#loc228 = loc("cond"(#loc100)) +#loc229 = loc("cond"(#loc101)) +#loc230 = loc("ret"(#loc102)) +#loc231 = loc("ret"(#loc103)) +#loc232 = loc("ret"(#loc104)) +#loc233 = loc("ret"(#loc105)) +#loc234 = loc("new_idxs"(#loc106)) +#loc235 = loc("new_idxs"(#loc107)) +#loc236 = loc("new_idxs"(#loc108)) +#loc237 = loc("new_idxs"(#loc109)) +#loc241 = loc("input"(#loc120)) +#loc245 = loc("flip"(#loc138)) +#loc246 = loc("cond"(#loc214)) +#loc247 = loc("cond"(#loc217)) +#loc248 = loc("eq"(#loc218)) +#loc249 = loc("eq"(#loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..321f2cc57637df4a5b0891524a9bd2090cc5aa57 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,812 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc73 = loc("in_ptr0"(#loc)) +#loc74 = loc("out_ptr2"(#loc)) +#loc75 = loc("out_ptr3"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc91 = loc(callsite(#loc15 at #loc16)) +#loc97 = loc("ileft"(#loc24)) +#loc101 = loc("iright"(#loc29)) +#loc110 = loc("left_idx"(#loc38)) +#loc115 = loc("right_idx"(#loc43)) +#loc135 = loc("tmp11"(#loc63)) +#loc145 = loc(callsite(#loc20 at #loc91)) +#loc149 = loc(callsite(#loc1 at #loc135)) +#loc153 = loc(callsite(#loc97 at #loc145)) +#loc157 = loc(callsite(#loc101 at #loc145)) +#loc165 = loc(callsite(#loc110 at #loc145)) +#loc170 = loc(callsite(#loc115 at #loc145)) +#loc190 = loc(callsite(#loc1 at #loc153)) +#loc192 = loc(callsite(#loc1 at #loc157)) +#loc195 = loc(callsite(#loc1 at #loc165)) +#loc198 = loc(callsite(#loc1 at #loc170)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c272_i32 = arith.constant 272 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc80) + %r0_index_8 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc80) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc80) + %r0_index_10 = tt.expand_dims %r0_index_8 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc80) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc81) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc82) + %tmp0 = arith.muli %r0_index_9, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc83) + %tmp0_11 = arith.muli %r0_index_10, %cst : tensor<1x16xi32, #blocked> loc(#loc83) + %tmp0_12 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc139) + %tmp0_13 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc139) + %tmp0_14 = arith.addi %tmp0_12, %tmp0 : tensor<1x16xi32, #blocked5> loc(#loc84) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_11 : tensor<1x16xi32, #blocked> loc(#loc84) + %tmp0_16 = arith.muli %x1, %c272_i32 : i32 loc(#loc85) + %tmp0_17 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc140) + %tmp0_18 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked> loc(#loc140) + %tmp0_19 = arith.addi %tmp0_14, %tmp0_17 : tensor<1x16xi32, #blocked5> loc(#loc86) + %tmp0_20 = arith.addi %tmp0_15, %tmp0_18 : tensor<1x16xi32, #blocked> loc(#loc86) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc87) + %tmp0_23 = tt.addptr %tmp0_21, %tmp0_19 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc87) + %tmp0_24 = tt.addptr %tmp0_22, %tmp0_20 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc87) + %tmp0_25 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc141) + %tmp0_26 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc141) + %tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_6 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc88) + %tmp0_28 = tt.load %tmp0_24, %tmp0_25, %cst_4 : tensor<1x16x!tt.ptr, #blocked> loc(#loc88) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc89) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc142) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc142) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc142) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc142) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc142) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc142) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc142) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc142) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc142) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc142) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc142) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc142) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc143) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y = tt.reshape %tmp0_27 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %left_mask = arith.subi %cst_5, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc151) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc151) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc151) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc151) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc162) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc163) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc164) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc164) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc168) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc169) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc169) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc196) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_75 = arith.xori %tmp0_27, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_77 = arith.extsi %tmp2 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc187) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc143) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc143) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc150) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc189) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc154) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc155) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc156) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc191) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc158) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc159) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc162) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc164) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc194) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc166) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc167) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc169) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc197) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc171) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc172) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc179) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc187) + %tmp7 = arith.extsi %tmp0_28 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc133) + %tmp10 = arith.select %tmp0_25, %tmp7, %cst_0 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc134) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_396: i64 loc(callsite(#loc1 at #loc135)), %tmp11_397: i64 loc(callsite(#loc1 at #loc135))): + %tmp11_398 = arith.addi %tmp11_396, %tmp11_397 : i64 loc(#loc188) + tt.reduce.return %tmp11_398 : i64 loc(#loc148) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc148) + %tmp11_395 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc136) + %tmp14 = arith.trunci %tmp11_395 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc137) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc66) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc138) + %2 = arith.addi %r0_index_9, %1 : tensor<1x16xi32, #blocked5> loc(#loc67) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc68) + tt.store %4, %new_idxs_394, %tmp0_26 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc69) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc70) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + %7 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc71) + tt.store %6, %tmp14, %7 : tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + tt.return loc(#loc72) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_index"(#loc4)) +#loc81 = loc("x0"(#loc5)) +#loc82 = loc("x1"(#loc6)) +#loc83 = loc("tmp0"(#loc7)) +#loc84 = loc("tmp0"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp2"(#loc13)) +#loc90 = loc("flip"(#loc14)) +#loc92 = loc("flip"(#loc17)) +#loc93 = loc("flip"(#loc18)) +#loc94 = loc("y"(#loc19)) +#loc95 = loc("left_mask"(#loc21)) +#loc96 = loc("ileft"(#loc22)) +#loc98 = loc("ileft"(#loc26)) +#loc99 = loc("ileft"(#loc27)) +#loc100 = loc("iright"(#loc28)) +#loc102 = loc("iright"(#loc30)) +#loc103 = loc("iright"(#loc31)) +#loc104 = loc("ileft"(#loc32)) +#loc105 = loc("iright"(#loc33)) +#loc106 = loc("y_idx"(#loc34)) +#loc107 = loc("left_idx"(#loc35)) +#loc108 = loc("left_idx"(#loc36)) +#loc109 = loc("input"(#loc37)) +#loc111 = loc("left_idx"(#loc39)) +#loc112 = loc("left_idx"(#loc40)) +#loc113 = loc("right_idx"(#loc41)) +#loc114 = loc("right_idx"(#loc42)) +#loc116 = loc("right_idx"(#loc44)) +#loc117 = loc("right_idx"(#loc45)) +#loc118 = loc("left_idx"(#loc46)) +#loc119 = loc("right_idx"(#loc47)) +#loc120 = loc("cond"(#loc48)) +#loc121 = loc("eq"(#loc49)) +#loc122 = loc("cond"(#loc50)) +#loc123 = loc("cond"(#loc51)) +#loc124 = loc("cond"(#loc52)) +#loc125 = loc("cond"(#loc53)) +#loc126 = loc("cond"(#loc54)) +#loc127 = loc("ret"(#loc55)) +#loc128 = loc("ret"(#loc56)) +#loc129 = loc("ret"(#loc57)) +#loc130 = loc("new_idxs"(#loc58)) +#loc131 = loc("new_idxs"(#loc59)) +#loc132 = loc("new_idxs"(#loc60)) +#loc133 = loc("tmp7"(#loc61)) +#loc134 = loc("tmp10"(#loc62)) +#loc136 = loc("tmp11"(#loc64)) +#loc137 = loc("tmp14"(#loc65)) +#loc138 = loc(fused[#loc67, #loc66]) +#loc139 = loc(fused[#loc84, #loc81]) +#loc140 = loc(fused[#loc86, #loc85]) +#loc141 = loc(fused[#loc88, #loc79]) +#loc142 = loc(callsite(#loc90 at #loc91)) +#loc143 = loc(callsite(#loc92 at #loc91)) +#loc144 = loc(callsite(#loc93 at #loc91)) +#loc146 = loc("cond"(#loc120)) +#loc147 = loc("eq"(#loc121)) +#loc148 = loc(callsite(#loc23 at #loc135)) +#loc150 = loc(callsite(#loc94 at #loc145)) +#loc151 = loc(callsite(#loc95 at #loc145)) +#loc152 = loc(callsite(#loc96 at #loc145)) +#loc154 = loc(callsite(#loc98 at #loc145)) +#loc155 = loc(callsite(#loc99 at #loc145)) +#loc156 = loc(callsite(#loc100 at #loc145)) +#loc158 = loc(callsite(#loc102 at #loc145)) +#loc159 = loc(callsite(#loc103 at #loc145)) +#loc160 = loc(callsite(#loc104 at #loc145)) +#loc161 = loc(callsite(#loc105 at #loc145)) +#loc162 = loc(callsite(#loc106 at #loc145)) +#loc163 = loc(callsite(#loc107 at #loc145)) +#loc164 = loc(callsite(#loc108 at #loc145)) +#loc166 = loc(callsite(#loc111 at #loc145)) +#loc167 = loc(callsite(#loc112 at #loc145)) +#loc168 = loc(callsite(#loc113 at #loc145)) +#loc169 = loc(callsite(#loc114 at #loc145)) +#loc171 = loc(callsite(#loc116 at #loc145)) +#loc172 = loc(callsite(#loc117 at #loc145)) +#loc173 = loc(callsite(#loc118 at #loc145)) +#loc174 = loc(callsite(#loc119 at #loc145)) +#loc175 = loc(callsite(#loc146 at #loc145)) +#loc176 = loc(callsite(#loc147 at #loc145)) +#loc177 = loc(callsite(#loc122 at #loc145)) +#loc178 = loc(callsite(#loc123 at #loc145)) +#loc179 = loc(callsite(#loc124 at #loc145)) +#loc180 = loc(callsite(#loc125 at #loc145)) +#loc181 = loc(callsite(#loc126 at #loc145)) +#loc182 = loc(callsite(#loc127 at #loc145)) +#loc183 = loc(callsite(#loc128 at #loc145)) +#loc184 = loc(callsite(#loc129 at #loc145)) +#loc185 = loc(callsite(#loc130 at #loc145)) +#loc186 = loc(callsite(#loc131 at #loc145)) +#loc187 = loc(callsite(#loc132 at #loc145)) +#loc188 = loc(callsite(#loc25 at #loc148)) +#loc189 = loc(callsite(#loc23 at #loc153)) +#loc191 = loc(callsite(#loc23 at #loc157)) +#loc193 = loc(callsite(#loc109 at #loc165)) +#loc194 = loc(callsite(#loc23 at #loc165)) +#loc196 = loc(callsite(#loc109 at #loc170)) +#loc197 = loc(callsite(#loc23 at #loc170)) +#loc199 = loc(callsite(#loc25 at #loc189)) +#loc200 = loc(callsite(#loc25 at #loc191)) +#loc201 = loc(callsite(#loc25 at #loc194)) +#loc202 = loc(callsite(#loc25 at #loc197)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b8f4d18cf4391b6e57f537742c61776a6ee03c2d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,784 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":18:0) +#loc2 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":41:67) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:26) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr2"(#loc)) +#loc77 = loc("out_ptr3"(#loc)) +#loc78 = loc("xnumel"(#loc)) +#loc79 = loc("r0_numel"(#loc)) +#loc96 = loc(callsite(#loc18 at #loc4)) +#loc103 = loc("ileft"(#loc27)) +#loc107 = loc("iright"(#loc32)) +#loc116 = loc("left_idx"(#loc41)) +#loc121 = loc("right_idx"(#loc46)) +#loc140 = loc("tmp11"(#loc65)) +#loc151 = loc(callsite(#loc23 at #loc96)) +#loc155 = loc(callsite(#loc2 at #loc140)) +#loc159 = loc(callsite(#loc103 at #loc151)) +#loc163 = loc(callsite(#loc107 at #loc151)) +#loc171 = loc(callsite(#loc116 at #loc151)) +#loc176 = loc(callsite(#loc121 at #loc151)) +#loc196 = loc(callsite(#loc2 at #loc159)) +#loc198 = loc(callsite(#loc2 at #loc163)) +#loc201 = loc(callsite(#loc2 at #loc171)) +#loc204 = loc(callsite(#loc2 at #loc176)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant 272 : i32 loc(#loc80) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %xmask = arith.constant 32 : i32 loc(#loc81) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc82) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc2) + %tmp10 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc83) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xmask_2 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc81) + %xmask_3 = tt.splat %xmask_2 : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc86) + %r0_index_4 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc87) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc88) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc89) + %tmp0_5 = arith.muli %r0_index_4, %tmp0_1 : tensor<1x16xi32> loc(#loc84) + %tmp0_6 = tt.splat %x0 : i32 -> tensor<1x16xi32> loc(#loc144) + %tmp0_7 = arith.addi %tmp0_6, %tmp0_5 : tensor<1x16xi32> loc(#loc90) + %tmp0_8 = arith.muli %x1, %tmp0 : i32 loc(#loc80) + %tmp0_9 = tt.splat %tmp0_8 : i32 -> tensor<1x16xi32> loc(#loc145) + %tmp0_10 = arith.addi %tmp0_7, %tmp0_9 : tensor<1x16xi32> loc(#loc91) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc92) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc92) + %tmp0_13 = tt.splat %xmask_2 : i1 -> tensor<1x16xi1> loc(#loc146) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 : tensor<1x16x!tt.ptr> loc(#loc93) + %tmp2 = arith.trunci %r0_index_4 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc94) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc147) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc148) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc148) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc149) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc150) + %y = tt.reshape %tmp0_14 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc157) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc158) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc162) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc168) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc169) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc170) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc170) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc199) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc174) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc175) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc175) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc202) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc181) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc182) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc183) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc184) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc185) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc186) + %cond_47 = arith.cmpi ne, %cond_46, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc188) + %ret_48 = arith.select %cond_47, %ret, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_49 = arith.xori %tmp0_14, %ret_48 : tensor<1x16xi32> loc(#loc190) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc191) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_51 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc193) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc193) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc149) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc150) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc158) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc181) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc182) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc183) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc184) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc185) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_85 = arith.cmpi ne, %cond_84, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc188) + %ret_87 = arith.select %cond_85, %ret_86, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc190) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc191) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc193) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc181) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc182) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc183) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc184) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc185) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_121 = arith.cmpi ne, %cond_120, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc188) + %ret_123 = arith.select %cond_121, %ret_122, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc190) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc191) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc193) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc149) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc150) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc158) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc181) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc182) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc183) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc184) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc185) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_160 = arith.cmpi ne, %cond_159, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc188) + %ret_162 = arith.select %cond_160, %ret_161, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc190) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc191) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc193) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc181) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc182) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc183) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc184) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc185) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_196 = arith.cmpi ne, %cond_195, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc188) + %ret_198 = arith.select %cond_196, %ret_197, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc190) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc191) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc193) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc181) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc182) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc183) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc184) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc185) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_232 = arith.cmpi ne, %cond_231, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc188) + %ret_234 = arith.select %cond_232, %ret_233, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc190) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc191) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc193) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc156) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc158) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc158) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc195) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc160) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc161) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc162) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc197) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc164) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc165) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc168) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc170) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc172) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc173) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc175) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc203) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc177) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc178) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc181) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc182) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc183) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc184) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc185) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc188) + %ret_268 = arith.select %cond_266, %ret_267, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc190) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc191) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc193) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc181) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc182) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc183) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc184) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc185) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc188) + %ret_301 = arith.select %cond_299, %ret_300, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc190) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc191) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc193) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc181) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc182) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc183) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc184) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc185) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc188) + %ret_334 = arith.select %cond_332, %ret_333, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc190) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc191) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc193) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc181) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc182) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc183) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc184) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc185) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc191) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc193) + %tmp7 = arith.extsi %tmp0_14 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc139) + %tmp10_369 = arith.select %tmp0_13, %tmp7, %tmp10 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc83) + %tmp11 = "tt.reduce"(%tmp10_369) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_371: i64 loc(callsite(#loc2 at #loc140)), %tmp11_372: i64 loc(callsite(#loc2 at #loc140))): + %tmp11_373 = arith.addi %tmp11_371, %tmp11_372 : i64 loc(#loc194) + tt.reduce.return %tmp11_373 : i64 loc(#loc154) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc154) + %tmp11_370 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc141) + %tmp14 = arith.trunci %tmp11_370 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc142) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc68) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32> loc(#loc143) + %2 = arith.addi %r0_index_4, %1 : tensor<1x16xi32> loc(#loc69) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc70) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc70) + tt.store %4, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc71) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc72) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc72) + tt.store %6, %tmp14, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc73) + tt.return loc(#loc74) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":26:21) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":44:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:38) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":33:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":34:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":38:19) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":45:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":49:47) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hx/chxnygpvpmvr2mx2e6mwgdeojthrirnog7nmq6mcsi3wvegvi2so.py":50:4) +#loc80 = loc("tmp0"(#loc1)) +#loc81 = loc("xmask"(#loc3)) +#loc82 = loc(callsite(#loc2 at #loc4)) +#loc83 = loc("tmp10"(#loc5)) +#loc84 = loc("tmp0"(#loc6)) +#loc85 = loc("xoffset"(#loc7)) +#loc86 = loc("r0_index"(#loc8)) +#loc87 = loc("r0_index"(#loc9)) +#loc88 = loc("x0"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("tmp0"(#loc12)) +#loc91 = loc("tmp0"(#loc13)) +#loc92 = loc("tmp0"(#loc14)) +#loc93 = loc("tmp0"(#loc15)) +#loc94 = loc("tmp2"(#loc16)) +#loc95 = loc("flip"(#loc17)) +#loc97 = loc("flip"(#loc19)) +#loc98 = loc("flip"(#loc20)) +#loc99 = loc("flip"(#loc21)) +#loc100 = loc("y"(#loc22)) +#loc101 = loc("left_mask"(#loc24)) +#loc102 = loc("ileft"(#loc25)) +#loc104 = loc("ileft"(#loc29)) +#loc105 = loc("ileft"(#loc30)) +#loc106 = loc("iright"(#loc31)) +#loc108 = loc("iright"(#loc33)) +#loc109 = loc("iright"(#loc34)) +#loc110 = loc("ileft"(#loc35)) +#loc111 = loc("iright"(#loc36)) +#loc112 = loc("y_idx"(#loc37)) +#loc113 = loc("left_idx"(#loc38)) +#loc114 = loc("left_idx"(#loc39)) +#loc115 = loc("input"(#loc40)) +#loc117 = loc("left_idx"(#loc42)) +#loc118 = loc("left_idx"(#loc43)) +#loc119 = loc("right_idx"(#loc44)) +#loc120 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc47)) +#loc123 = loc("right_idx"(#loc48)) +#loc124 = loc("left_idx"(#loc49)) +#loc125 = loc("right_idx"(#loc50)) +#loc126 = loc("cond"(#loc51)) +#loc127 = loc("eq"(#loc52)) +#loc128 = loc("cond"(#loc53)) +#loc129 = loc("cond"(#loc54)) +#loc130 = loc("cond"(#loc55)) +#loc131 = loc("cond"(#loc56)) +#loc132 = loc("cond"(#loc57)) +#loc133 = loc("ret"(#loc58)) +#loc134 = loc("ret"(#loc59)) +#loc135 = loc("ret"(#loc60)) +#loc136 = loc("new_idxs"(#loc61)) +#loc137 = loc("new_idxs"(#loc62)) +#loc138 = loc("new_idxs"(#loc63)) +#loc139 = loc("tmp7"(#loc64)) +#loc141 = loc("tmp11"(#loc66)) +#loc142 = loc("tmp14"(#loc67)) +#loc143 = loc(fused[#loc69, #loc68]) +#loc144 = loc(fused[#loc90, #loc88]) +#loc145 = loc(fused[#loc91, #loc80]) +#loc146 = loc(fused[#loc93, #loc81]) +#loc147 = loc(callsite(#loc95 at #loc96)) +#loc148 = loc(callsite(#loc97 at #loc96)) +#loc149 = loc(callsite(#loc98 at #loc96)) +#loc150 = loc(callsite(#loc99 at #loc96)) +#loc152 = loc("cond"(#loc126)) +#loc153 = loc("eq"(#loc127)) +#loc154 = loc(callsite(#loc26 at #loc140)) +#loc156 = loc(callsite(#loc100 at #loc151)) +#loc157 = loc(callsite(#loc101 at #loc151)) +#loc158 = loc(callsite(#loc102 at #loc151)) +#loc160 = loc(callsite(#loc104 at #loc151)) +#loc161 = loc(callsite(#loc105 at #loc151)) +#loc162 = loc(callsite(#loc106 at #loc151)) +#loc164 = loc(callsite(#loc108 at #loc151)) +#loc165 = loc(callsite(#loc109 at #loc151)) +#loc166 = loc(callsite(#loc110 at #loc151)) +#loc167 = loc(callsite(#loc111 at #loc151)) +#loc168 = loc(callsite(#loc112 at #loc151)) +#loc169 = loc(callsite(#loc113 at #loc151)) +#loc170 = loc(callsite(#loc114 at #loc151)) +#loc172 = loc(callsite(#loc117 at #loc151)) +#loc173 = loc(callsite(#loc118 at #loc151)) +#loc174 = loc(callsite(#loc119 at #loc151)) +#loc175 = loc(callsite(#loc120 at #loc151)) +#loc177 = loc(callsite(#loc122 at #loc151)) +#loc178 = loc(callsite(#loc123 at #loc151)) +#loc179 = loc(callsite(#loc124 at #loc151)) +#loc180 = loc(callsite(#loc125 at #loc151)) +#loc181 = loc(callsite(#loc152 at #loc151)) +#loc182 = loc(callsite(#loc153 at #loc151)) +#loc183 = loc(callsite(#loc128 at #loc151)) +#loc184 = loc(callsite(#loc129 at #loc151)) +#loc185 = loc(callsite(#loc130 at #loc151)) +#loc186 = loc(callsite(#loc131 at #loc151)) +#loc187 = loc(callsite(#loc132 at #loc151)) +#loc188 = loc(callsite(#loc133 at #loc151)) +#loc189 = loc(callsite(#loc134 at #loc151)) +#loc190 = loc(callsite(#loc135 at #loc151)) +#loc191 = loc(callsite(#loc136 at #loc151)) +#loc192 = loc(callsite(#loc137 at #loc151)) +#loc193 = loc(callsite(#loc138 at #loc151)) +#loc194 = loc(callsite(#loc28 at #loc154)) +#loc195 = loc(callsite(#loc26 at #loc159)) +#loc197 = loc(callsite(#loc26 at #loc163)) +#loc199 = loc(callsite(#loc115 at #loc171)) +#loc200 = loc(callsite(#loc26 at #loc171)) +#loc202 = loc(callsite(#loc115 at #loc176)) +#loc203 = loc(callsite(#loc26 at #loc176)) +#loc205 = loc(callsite(#loc28 at #loc195)) +#loc206 = loc(callsite(#loc28 at #loc197)) +#loc207 = loc(callsite(#loc28 at #loc200)) +#loc208 = loc(callsite(#loc28 at #loc203)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5161e4226818f9fe85f277343282c36115ea9f4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ad3a26b319aa9edadafb8daca7ef22f3f9ab1e2b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx @@ -0,0 +1,1936 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_11 +) +.reqntid 128 +{ + .reg .pred %p<268>; + .reg .b16 %rs<207>; + .reg .b32 %r<174>; + .reg .b64 %rd<682>; + .loc 1 18 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:18:0 + +// %bb.0: + ld.param.b64 %rd143, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:19:28 + mov.u32 %r17, %ctaid.x; + .loc 1 19 33 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:19:33 + shl.b32 %r18, %r17, 10; + .loc 1 20 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:20:36 + mov.u32 %r19, %tid.x; + shl.b32 %r20, %r19, 3; + and.b32 %r21, %r20, 1016; + .loc 1 20 23 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:20:23 + or.b32 %r9, %r21, %r18; + or.b32 %r10, %r9, 1; + or.b32 %r11, %r9, 2; + .loc 1 22 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:22:19 + cvt.s64.s32 %rd2, %r10; + cvt.s64.s32 %rd1, %r9; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd147, %rd1, %rd143; + and.b64 %rd148, %rd147, -4294967296; + setp.ne.b64 %p25, %rd148, 0; + @%p25 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd666, %rd1, %rd143; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r22, %rd143; + cvt.u32.u64 %r23, %rd1; + div.u32 %r24, %r23, %r22; + cvt.u64.u32 %rd666, %r24; +$L__BB0_3: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + or.b32 %r12, %r9, 3; + cvt.s64.s32 %rd3, %r11; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd149, %rd2, %rd143; + and.b64 %rd150, %rd149, -4294967296; + setp.ne.b64 %p26, %rd150, 0; + @%p26 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd667, %rd2, %rd143; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r25, %rd143; + cvt.u32.u64 %r26, %rd2; + div.u32 %r27, %r26, %r25; + cvt.u64.u32 %rd667, %r27; +$L__BB0_6: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + or.b32 %r13, %r9, 4; + cvt.s64.s32 %rd4, %r12; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd151, %rd3, %rd143; + and.b64 %rd152, %rd151, -4294967296; + setp.ne.b64 %p27, %rd152, 0; + @%p27 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd668, %rd3, %rd143; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r28, %rd143; + cvt.u32.u64 %r29, %rd3; + div.u32 %r30, %r29, %r28; + cvt.u64.u32 %rd668, %r30; +$L__BB0_9: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + or.b32 %r14, %r9, 5; + cvt.s64.s32 %rd5, %r13; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd153, %rd4, %rd143; + and.b64 %rd154, %rd153, -4294967296; + setp.ne.b64 %p28, %rd154, 0; + @%p28 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd669, %rd4, %rd143; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r31, %rd143; + cvt.u32.u64 %r32, %rd4; + div.u32 %r33, %r32, %r31; + cvt.u64.u32 %rd669, %r33; +$L__BB0_12: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + or.b32 %r15, %r9, 6; + cvt.s64.s32 %rd6, %r14; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd155, %rd5, %rd143; + and.b64 %rd156, %rd155, -4294967296; + setp.ne.b64 %p29, %rd156, 0; + @%p29 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd670, %rd5, %rd143; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r34, %rd143; + cvt.u32.u64 %r35, %rd5; + div.u32 %r36, %r35, %r34; + cvt.u64.u32 %rd670, %r36; +$L__BB0_15: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + or.b32 %r16, %r9, 7; + cvt.s64.s32 %rd7, %r15; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd157, %rd6, %rd143; + and.b64 %rd158, %rd157, -4294967296; + setp.ne.b64 %p30, %rd158, 0; + @%p30 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd671, %rd6, %rd143; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r37, %rd143; + cvt.u32.u64 %r38, %rd6; + div.u32 %r39, %r38, %r37; + cvt.u64.u32 %rd671, %r39; +$L__BB0_18: + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + cvt.s64.s32 %rd8, %r16; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd159, %rd7, %rd143; + and.b64 %rd160, %rd159, -4294967296; + setp.ne.b64 %p31, %rd160, 0; + @%p31 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd672, %rd7, %rd143; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r40, %rd143; + cvt.u32.u64 %r41, %rd7; + div.u32 %r42, %r41, %r40; + cvt.u64.u32 %rd672, %r42; +$L__BB0_21: + .loc 1 0 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:21 + ld.param.b64 %rd144, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6]; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd161, %rd8, %rd143; + and.b64 %rd162, %rd161, -4294967296; + setp.ne.b64 %p32, %rd162, 0; + @%p32 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd673, %rd8, %rd143; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r43, %rd143; + cvt.u32.u64 %r44, %rd8; + div.u32 %r45, %r44, %r43; + cvt.u64.u32 %rd673, %r45; +$L__BB0_24: + .loc 1 24 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:28 + or.b64 %rd163, %rd666, %rd144; + and.b64 %rd164, %rd163, -4294967296; + setp.ne.b64 %p33, %rd164, 0; + @%p33 bra $L__BB0_26; + bra.uni $L__BB0_25; +$L__BB0_26: + rem.s64 %rd674, %rd666, %rd144; + bra.uni $L__BB0_27; +$L__BB0_25: + cvt.u32.u64 %r46, %rd144; + cvt.u32.u64 %r47, %rd666; + rem.u32 %r48, %r47, %r46; + cvt.u64.u32 %rd674, %r48; +$L__BB0_27: + or.b64 %rd165, %rd667, %rd144; + and.b64 %rd166, %rd165, -4294967296; + setp.ne.b64 %p34, %rd166, 0; + @%p34 bra $L__BB0_29; + bra.uni $L__BB0_28; +$L__BB0_29: + rem.s64 %rd675, %rd667, %rd144; + bra.uni $L__BB0_30; +$L__BB0_28: + cvt.u32.u64 %r49, %rd144; + cvt.u32.u64 %r50, %rd667; + rem.u32 %r51, %r50, %r49; + cvt.u64.u32 %rd675, %r51; +$L__BB0_30: + or.b64 %rd167, %rd668, %rd144; + and.b64 %rd168, %rd167, -4294967296; + setp.ne.b64 %p35, %rd168, 0; + @%p35 bra $L__BB0_32; + bra.uni $L__BB0_31; +$L__BB0_32: + rem.s64 %rd676, %rd668, %rd144; + bra.uni $L__BB0_33; +$L__BB0_31: + cvt.u32.u64 %r52, %rd144; + cvt.u32.u64 %r53, %rd668; + rem.u32 %r54, %r53, %r52; + cvt.u64.u32 %rd676, %r54; +$L__BB0_33: + or.b64 %rd169, %rd669, %rd144; + and.b64 %rd170, %rd169, -4294967296; + setp.ne.b64 %p36, %rd170, 0; + @%p36 bra $L__BB0_35; + bra.uni $L__BB0_34; +$L__BB0_35: + rem.s64 %rd677, %rd669, %rd144; + bra.uni $L__BB0_36; +$L__BB0_34: + cvt.u32.u64 %r55, %rd144; + cvt.u32.u64 %r56, %rd669; + rem.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd677, %r57; +$L__BB0_36: + or.b64 %rd171, %rd670, %rd144; + and.b64 %rd172, %rd171, -4294967296; + setp.ne.b64 %p37, %rd172, 0; + @%p37 bra $L__BB0_38; + bra.uni $L__BB0_37; +$L__BB0_38: + rem.s64 %rd678, %rd670, %rd144; + bra.uni $L__BB0_39; +$L__BB0_37: + cvt.u32.u64 %r58, %rd144; + cvt.u32.u64 %r59, %rd670; + rem.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd678, %r60; +$L__BB0_39: + or.b64 %rd173, %rd671, %rd144; + and.b64 %rd174, %rd173, -4294967296; + setp.ne.b64 %p38, %rd174, 0; + @%p38 bra $L__BB0_41; + bra.uni $L__BB0_40; +$L__BB0_41: + rem.s64 %rd679, %rd671, %rd144; + bra.uni $L__BB0_42; +$L__BB0_40: + cvt.u32.u64 %r61, %rd144; + cvt.u32.u64 %r62, %rd671; + rem.u32 %r63, %r62, %r61; + cvt.u64.u32 %rd679, %r63; +$L__BB0_42: + or.b64 %rd175, %rd672, %rd144; + and.b64 %rd176, %rd175, -4294967296; + setp.ne.b64 %p39, %rd176, 0; + @%p39 bra $L__BB0_44; + bra.uni $L__BB0_43; +$L__BB0_44: + rem.s64 %rd680, %rd672, %rd144; + bra.uni $L__BB0_45; +$L__BB0_43: + cvt.u32.u64 %r64, %rd144; + cvt.u32.u64 %r65, %rd672; + rem.u32 %r66, %r65, %r64; + cvt.u64.u32 %rd680, %r66; +$L__BB0_45: + .loc 1 0 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:28 + ld.param.b64 %rd145, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd139, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd138, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0]; + ld.param.b32 %r8, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9]; + rem.s64 %rd9, %rd1, %rd143; + rem.s64 %rd16, %rd8, %rd143; + rem.s64 %rd15, %rd7, %rd143; + rem.s64 %rd14, %rd6, %rd143; + rem.s64 %rd13, %rd5, %rd143; + rem.s64 %rd12, %rd4, %rd143; + rem.s64 %rd11, %rd3, %rd143; + rem.s64 %rd10, %rd2, %rd143; + .loc 1 24 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:28 + or.b64 %rd177, %rd673, %rd144; + and.b64 %rd178, %rd177, -4294967296; + setp.ne.b64 %p40, %rd178, 0; + @%p40 bra $L__BB0_47; + bra.uni $L__BB0_46; +$L__BB0_47: + rem.s64 %rd681, %rd673, %rd144; + bra.uni $L__BB0_48; +$L__BB0_46: + cvt.u32.u64 %r67, %rd144; + cvt.u32.u64 %r68, %rd673; + rem.u32 %r69, %r68, %r67; + cvt.u64.u32 %rd681, %r69; +$L__BB0_48: + .loc 1 21 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:21:21 + setp.lt.s32 %p8, %r16, %r8; + setp.lt.s32 %p7, %r15, %r8; + setp.lt.s32 %p6, %r14, %r8; + setp.lt.s32 %p5, %r13, %r8; + setp.lt.s32 %p4, %r12, %r8; + setp.lt.s32 %p3, %r11, %r8; + setp.lt.s32 %p2, %r10, %r8; + setp.lt.s32 %p1, %r9, %r8; + .loc 1 25 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:31 + shl.b64 %rd291, %rd1, 1; + add.s64 %rd180, %rd138, %rd291; + shl.b64 %rd292, %rd2, 1; + add.s64 %rd183, %rd138, %rd292; + shl.b64 %rd293, %rd3, 1; + add.s64 %rd186, %rd138, %rd293; + shl.b64 %rd294, %rd4, 1; + add.s64 %rd189, %rd138, %rd294; + shl.b64 %rd295, %rd5, 1; + add.s64 %rd192, %rd138, %rd295; + shl.b64 %rd296, %rd6, 1; + add.s64 %rd195, %rd138, %rd296; + shl.b64 %rd297, %rd7, 1; + add.s64 %rd198, %rd138, %rd297; + shl.b64 %rd298, %rd8, 1; + add.s64 %rd201, %rd138, %rd298; + .loc 1 25 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:36 + // begin inline asm + mov.u64 %rd179, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd179, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs41, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd180 + 0 ], %rd179; + // end inline asm + // begin inline asm + mov.u64 %rd182, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd182, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs42, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs42 }, [ %rd183 + 0 ], %rd182; + // end inline asm + // begin inline asm + mov.u64 %rd185, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd185, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs43, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs43 }, [ %rd186 + 0 ], %rd185; + // end inline asm + // begin inline asm + mov.u64 %rd188, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd188, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs44, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs44 }, [ %rd189 + 0 ], %rd188; + // end inline asm + // begin inline asm + mov.u64 %rd191, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd191, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs45, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd192 + 0 ], %rd191; + // end inline asm + // begin inline asm + mov.u64 %rd194, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd194, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs46, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs46 }, [ %rd195 + 0 ], %rd194; + // end inline asm + // begin inline asm + mov.u64 %rd197, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd197, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs47, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd198 + 0 ], %rd197; + // end inline asm + // begin inline asm + mov.u64 %rd200, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd200, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs48, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs48 }, [ %rd201 + 0 ], %rd200; + // end inline asm + .loc 1 26 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:26:31 + shl.b64 %rd299, %rd674, 3; + add.s64 %rd261, %rd139, %rd299; + shl.b64 %rd300, %rd675, 3; + add.s64 %rd265, %rd139, %rd300; + shl.b64 %rd301, %rd676, 3; + add.s64 %rd269, %rd139, %rd301; + shl.b64 %rd302, %rd677, 3; + add.s64 %rd273, %rd139, %rd302; + shl.b64 %rd303, %rd678, 3; + add.s64 %rd277, %rd139, %rd303; + shl.b64 %rd304, %rd679, 3; + add.s64 %rd281, %rd139, %rd304; + shl.b64 %rd305, %rd680, 3; + add.s64 %rd285, %rd139, %rd305; + shl.b64 %rd306, %rd681, 3; + add.s64 %rd289, %rd139, %rd306; + .loc 1 26 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:26:36 + // begin inline asm + mov.u64 %rd203, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd203, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd204, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd204 }, [ %rd261 + 0 ], %rd203; + // end inline asm + // begin inline asm + mov.u64 %rd207, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd207, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd208, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd208 }, [ %rd265 + 0 ], %rd207; + // end inline asm + // begin inline asm + mov.u64 %rd211, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd211, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd212, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd212 }, [ %rd269 + 0 ], %rd211; + // end inline asm + // begin inline asm + mov.u64 %rd215, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd215, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd216, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd216 }, [ %rd273 + 0 ], %rd215; + // end inline asm + // begin inline asm + mov.u64 %rd219, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd219, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd220, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd220 }, [ %rd277 + 0 ], %rd219; + // end inline asm + // begin inline asm + mov.u64 %rd223, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd223, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd224, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd224 }, [ %rd281 + 0 ], %rd223; + // end inline asm + // begin inline asm + mov.u64 %rd227, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd227, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd228, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd228 }, [ %rd285 + 0 ], %rd227; + // end inline asm + // begin inline asm + mov.u64 %rd231, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd231, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd232, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd232 }, [ %rd289 + 0 ], %rd231; + // end inline asm + .loc 1 28 18 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:28:18 + shr.u64 %rd307, %rd143, 63; + add.s64 %rd308, %rd143, %rd307; + shr.s64 %rd89, %rd308, 1; + .loc 1 30 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:35 + sub.s64 %rd309, %rd1, %rd89; + sub.s64 %rd310, %rd2, %rd89; + sub.s64 %rd311, %rd3, %rd89; + sub.s64 %rd312, %rd4, %rd89; + sub.s64 %rd313, %rd5, %rd89; + sub.s64 %rd314, %rd6, %rd89; + sub.s64 %rd315, %rd7, %rd89; + sub.s64 %rd316, %rd8, %rd89; + .loc 1 30 30 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:30 + shl.b64 %rd317, %rd309, 1; + add.s64 %rd236, %rd138, %rd317; + shl.b64 %rd318, %rd310, 1; + add.s64 %rd239, %rd138, %rd318; + shl.b64 %rd319, %rd311, 1; + add.s64 %rd242, %rd138, %rd319; + shl.b64 %rd320, %rd312, 1; + add.s64 %rd245, %rd138, %rd320; + shl.b64 %rd321, %rd313, 1; + add.s64 %rd248, %rd138, %rd321; + shl.b64 %rd322, %rd314, 1; + add.s64 %rd251, %rd138, %rd322; + shl.b64 %rd323, %rd315, 1; + add.s64 %rd254, %rd138, %rd323; + shl.b64 %rd324, %rd316, 1; + add.s64 %rd257, %rd138, %rd324; + .loc 1 30 53 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:53 + // begin inline asm + mov.u64 %rd235, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd235, 1.0; + // end inline asm + .loc 1 29 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:29:19 + setp.ge.s64 %p73, %rd9, %rd89; + setp.ge.s64 %p74, %rd10, %rd89; + setp.ge.s64 %p75, %rd11, %rd89; + setp.ge.s64 %p76, %rd12, %rd89; + setp.ge.s64 %p77, %rd13, %rd89; + setp.ge.s64 %p78, %rd14, %rd89; + setp.ge.s64 %p79, %rd15, %rd89; + setp.ge.s64 %p80, %rd16, %rd89; + .loc 1 30 60 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:60 + and.pred %p16, %p8, %p80; + and.pred %p15, %p7, %p79; + and.pred %p14, %p6, %p78; + and.pred %p13, %p5, %p77; + and.pred %p12, %p4, %p76; + and.pred %p11, %p3, %p75; + and.pred %p10, %p2, %p74; + and.pred %p9, %p1, %p73; + mov.b16 %rs64, 0; + .loc 1 30 53 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:53 + // begin inline asm + mov.u16 %rs49, %rs64; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd236 + 0 ], %rd235; + // end inline asm + // begin inline asm + mov.u64 %rd238, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd238, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs51, %rs64; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd239 + 0 ], %rd238; + // end inline asm + // begin inline asm + mov.u64 %rd241, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd241, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs53, %rs64; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs53 }, [ %rd242 + 0 ], %rd241; + // end inline asm + // begin inline asm + mov.u64 %rd244, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd244, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs55, %rs64; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs55 }, [ %rd245 + 0 ], %rd244; + // end inline asm + // begin inline asm + mov.u64 %rd247, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd247, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs57, %rs64; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs57 }, [ %rd248 + 0 ], %rd247; + // end inline asm + // begin inline asm + mov.u64 %rd250, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd250, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs59, %rs64; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs59 }, [ %rd251 + 0 ], %rd250; + // end inline asm + // begin inline asm + mov.u64 %rd253, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd253, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs61, %rs64; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs61 }, [ %rd254 + 0 ], %rd253; + // end inline asm + // begin inline asm + mov.u64 %rd256, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd256, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs63, %rs64; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs63 }, [ %rd257 + 0 ], %rd256; + // end inline asm + .loc 1 31 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:31:35 + // begin inline asm + mov.u64 %rd259, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd259, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd260, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd260 }, [ %rd261 + 0 ], %rd259; + // end inline asm + // begin inline asm + mov.u64 %rd263, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd263, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd264, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd264 }, [ %rd265 + 0 ], %rd263; + // end inline asm + // begin inline asm + mov.u64 %rd267, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd267, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd268, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd268 }, [ %rd269 + 0 ], %rd267; + // end inline asm + // begin inline asm + mov.u64 %rd271, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd271, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd272, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd272 }, [ %rd273 + 0 ], %rd271; + // end inline asm + // begin inline asm + mov.u64 %rd275, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd275, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd276, 0x0; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd276 }, [ %rd277 + 0 ], %rd275; + // end inline asm + // begin inline asm + mov.u64 %rd279, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd279, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd280, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd280 }, [ %rd281 + 0 ], %rd279; + // end inline asm + // begin inline asm + mov.u64 %rd283, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd283, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd284, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd284 }, [ %rd285 + 0 ], %rd283; + // end inline asm + // begin inline asm + mov.u64 %rd287, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd287, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd288, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd288 }, [ %rd289 + 0 ], %rd287; + // end inline asm + .loc 1 35 32 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:35:32 + shr.s64 %rd325, %rd268, 63; + and.b64 %rd326, %rd325, %rd145; + shr.s64 %rd327, %rd272, 63; + and.b64 %rd328, %rd327, %rd145; + shr.s64 %rd329, %rd260, 63; + and.b64 %rd330, %rd329, %rd145; + shr.s64 %rd331, %rd264, 63; + and.b64 %rd332, %rd331, %rd145; + shr.s64 %rd333, %rd284, 63; + and.b64 %rd334, %rd333, %rd145; + shr.s64 %rd335, %rd288, 63; + and.b64 %rd336, %rd335, %rd145; + shr.s64 %rd337, %rd276, 63; + and.b64 %rd338, %rd337, %rd145; + shr.s64 %rd339, %rd280, 63; + and.b64 %rd340, %rd339, %rd145; + add.s64 %rd111, %rd340, %rd280; + add.s64 %rd110, %rd338, %rd276; + add.s64 %rd113, %rd336, %rd288; + add.s64 %rd112, %rd334, %rd284; + add.s64 %rd107, %rd332, %rd264; + add.s64 %rd106, %rd330, %rd260; + add.s64 %rd109, %rd328, %rd272; + add.s64 %rd108, %rd326, %rd268; + .loc 1 36 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:28 + setp.lt.s64 %p81, %rd108, 0; + setp.lt.s64 %p82, %rd109, 0; + setp.lt.s64 %p83, %rd106, 0; + setp.lt.s64 %p84, %rd107, 0; + setp.lt.s64 %p85, %rd112, 0; + setp.lt.s64 %p86, %rd113, 0; + setp.lt.s64 %p87, %rd110, 0; + setp.lt.s64 %p88, %rd111, 0; + .loc 1 36 98 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:98 + setp.ge.s64 %p89, %rd108, %rd145; + setp.ge.s64 %p90, %rd109, %rd145; + setp.ge.s64 %p91, %rd106, %rd145; + setp.ge.s64 %p92, %rd107, %rd145; + setp.ge.s64 %p93, %rd112, %rd145; + setp.ge.s64 %p94, %rd113, %rd145; + setp.ge.s64 %p95, %rd110, %rd145; + setp.ge.s64 %p96, %rd111, %rd145; + .loc 1 36 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:64 + or.pred %p97, %p88, %p96; + or.pred %p98, %p87, %p95; + or.pred %p99, %p86, %p94; + or.pred %p100, %p85, %p93; + or.pred %p101, %p84, %p92; + or.pred %p102, %p83, %p91; + or.pred %p103, %p82, %p90; + or.pred %p104, %p81, %p89; + .loc 1 36 106 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:106 + and.pred %p105, %p11, %p104; + selp.b16 %rs65, 1, 0, %p105; + shl.b16 %rs66, %rs65, 2; + and.pred %p106, %p12, %p103; + selp.b16 %rs67, -1, 0, %p106; + shl.b16 %rs68, %rs67, 3; + or.b16 %rs69, %rs68, %rs66; + and.pred %p107, %p9, %p102; + selp.b16 %rs70, 1, 0, %p107; + and.pred %p108, %p10, %p101; + selp.b16 %rs71, -1, 0, %p108; + shl.b16 %rs72, %rs71, 1; + or.b16 %rs73, %rs70, %rs72; + and.b16 %rs74, %rs73, 3; + or.b16 %rs75, %rs74, %rs69; + and.b16 %rs76, %rs75, 15; + and.pred %p109, %p15, %p100; + selp.b16 %rs77, 1, 0, %p109; + shl.b16 %rs78, %rs77, 2; + and.pred %p110, %p16, %p99; + selp.b16 %rs79, -1, 0, %p110; + shl.b16 %rs80, %rs79, 3; + or.b16 %rs81, %rs80, %rs78; + and.pred %p111, %p13, %p98; + selp.b16 %rs82, 1, 0, %p111; + and.pred %p112, %p14, %p97; + selp.b16 %rs83, -1, 0, %p112; + shl.b16 %rs84, %rs83, 1; + or.b16 %rs85, %rs82, %rs84; + and.b16 %rs86, %rs85, 3; + or.b16 %rs87, %rs86, %rs81; + shl.b16 %rs88, %rs87, 4; + or.b16 %rs89, %rs76, %rs88; + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + and.b16 %rs90, %rs89, 255; + setp.eq.b16 %p113, %rs90, 0; + @%p113 bra $L__BB0_50; + bra.uni $L__BB0_49; +$L__BB0_50: + .loc 1 0 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:123 + ld.param.b64 %rd140, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2]; + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + bar.sync 0; + .loc 1 37 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:36 + sub.s64 %rd427, %rd9, %rd89; + sub.s64 %rd428, %rd10, %rd89; + sub.s64 %rd429, %rd11, %rd89; + sub.s64 %rd430, %rd12, %rd89; + sub.s64 %rd431, %rd13, %rd89; + sub.s64 %rd432, %rd14, %rd89; + sub.s64 %rd433, %rd15, %rd89; + sub.s64 %rd434, %rd16, %rd89; + .loc 1 37 58 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:58 + mul.lo.s64 %rd435, %rd106, %rd143; + mul.lo.s64 %rd436, %rd107, %rd143; + mul.lo.s64 %rd437, %rd108, %rd143; + mul.lo.s64 %rd438, %rd109, %rd143; + mul.lo.s64 %rd439, %rd110, %rd143; + mul.lo.s64 %rd440, %rd111, %rd143; + mul.lo.s64 %rd441, %rd112, %rd143; + mul.lo.s64 %rd442, %rd113, %rd143; + .loc 1 37 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:31 + shl.b64 %rd443, %rd427, 1; + add.s64 %rd444, %rd140, %rd443; + shl.b64 %rd445, %rd435, 1; + add.s64 %rd348, %rd444, %rd445; + shl.b64 %rd446, %rd428, 1; + add.s64 %rd447, %rd140, %rd446; + shl.b64 %rd448, %rd436, 1; + add.s64 %rd351, %rd447, %rd448; + shl.b64 %rd449, %rd429, 1; + add.s64 %rd450, %rd140, %rd449; + shl.b64 %rd451, %rd437, 1; + add.s64 %rd354, %rd450, %rd451; + shl.b64 %rd452, %rd430, 1; + add.s64 %rd453, %rd140, %rd452; + shl.b64 %rd454, %rd438, 1; + add.s64 %rd357, %rd453, %rd454; + shl.b64 %rd455, %rd431, 1; + add.s64 %rd456, %rd140, %rd455; + shl.b64 %rd457, %rd439, 1; + add.s64 %rd360, %rd456, %rd457; + shl.b64 %rd458, %rd432, 1; + add.s64 %rd459, %rd140, %rd458; + shl.b64 %rd460, %rd440, 1; + add.s64 %rd363, %rd459, %rd460; + shl.b64 %rd461, %rd433, 1; + add.s64 %rd462, %rd140, %rd461; + shl.b64 %rd463, %rd441, 1; + add.s64 %rd366, %rd462, %rd463; + shl.b64 %rd464, %rd434, 1; + add.s64 %rd465, %rd140, %rd464; + shl.b64 %rd466, %rd442, 1; + add.s64 %rd369, %rd465, %rd466; + .loc 1 37 65 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:65 + // begin inline asm + mov.u64 %rd347, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd347, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs91, %rs64; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd348 + 0 ], %rd347; + // end inline asm + // begin inline asm + mov.u64 %rd350, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd350, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs93, %rs64; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd351 + 0 ], %rd350; + // end inline asm + // begin inline asm + mov.u64 %rd353, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd353, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs95, %rs64; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd354 + 0 ], %rd353; + // end inline asm + // begin inline asm + mov.u64 %rd356, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd356, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs97, %rs64; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd357 + 0 ], %rd356; + // end inline asm + // begin inline asm + mov.u64 %rd359, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd359, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs99, %rs64; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs99 }, [ %rd360 + 0 ], %rd359; + // end inline asm + // begin inline asm + mov.u64 %rd362, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd362, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs101, %rs64; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs101 }, [ %rd363 + 0 ], %rd362; + // end inline asm + // begin inline asm + mov.u64 %rd365, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd365, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs103, %rs64; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs103 }, [ %rd366 + 0 ], %rd365; + // end inline asm + // begin inline asm + mov.u64 %rd368, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd368, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs105, %rs64; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs105 }, [ %rd369 + 0 ], %rd368; + // end inline asm + .loc 1 44 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:44:19 + setp.lt.s64 %p146, %rd9, %rd89; + setp.lt.s64 %p147, %rd10, %rd89; + setp.lt.s64 %p148, %rd11, %rd89; + setp.lt.s64 %p149, %rd12, %rd89; + setp.lt.s64 %p150, %rd13, %rd89; + setp.lt.s64 %p151, %rd14, %rd89; + setp.lt.s64 %p152, %rd15, %rd89; + setp.lt.s64 %p153, %rd16, %rd89; + .loc 1 45 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:37 + add.s64 %rd467, %rd143, %rd1; + add.s64 %rd468, %rd143, %rd2; + add.s64 %rd469, %rd143, %rd3; + add.s64 %rd470, %rd143, %rd4; + add.s64 %rd471, %rd143, %rd5; + add.s64 %rd472, %rd143, %rd6; + add.s64 %rd473, %rd143, %rd7; + add.s64 %rd474, %rd143, %rd8; + .loc 1 45 42 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:42 + sub.s64 %rd475, %rd467, %rd89; + sub.s64 %rd476, %rd468, %rd89; + sub.s64 %rd477, %rd469, %rd89; + sub.s64 %rd478, %rd470, %rd89; + sub.s64 %rd479, %rd471, %rd89; + sub.s64 %rd480, %rd472, %rd89; + sub.s64 %rd481, %rd473, %rd89; + sub.s64 %rd482, %rd474, %rd89; + .loc 1 45 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:31 + shl.b64 %rd483, %rd475, 1; + add.s64 %rd372, %rd138, %rd483; + shl.b64 %rd484, %rd476, 1; + add.s64 %rd375, %rd138, %rd484; + shl.b64 %rd485, %rd477, 1; + add.s64 %rd378, %rd138, %rd485; + shl.b64 %rd486, %rd478, 1; + add.s64 %rd381, %rd138, %rd486; + shl.b64 %rd487, %rd479, 1; + add.s64 %rd384, %rd138, %rd487; + shl.b64 %rd488, %rd480, 1; + add.s64 %rd387, %rd138, %rd488; + shl.b64 %rd489, %rd481, 1; + add.s64 %rd390, %rd138, %rd489; + shl.b64 %rd490, %rd482, 1; + add.s64 %rd393, %rd138, %rd490; + .loc 1 45 60 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:60 + // begin inline asm + mov.u64 %rd371, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd371, 1.0; + // end inline asm + .loc 1 45 68 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:68 + and.pred %p24, %p8, %p153; + and.pred %p23, %p7, %p152; + and.pred %p22, %p6, %p151; + and.pred %p21, %p5, %p150; + and.pred %p20, %p4, %p149; + and.pred %p19, %p3, %p148; + and.pred %p18, %p2, %p147; + and.pred %p17, %p1, %p146; + .loc 1 45 60 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:60 + // begin inline asm + mov.u16 %rs107, %rs64; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs107 }, [ %rd372 + 0 ], %rd371; + // end inline asm + // begin inline asm + mov.u64 %rd374, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd374, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs109, %rs64; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs109 }, [ %rd375 + 0 ], %rd374; + // end inline asm + // begin inline asm + mov.u64 %rd377, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd377, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs111, %rs64; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs111 }, [ %rd378 + 0 ], %rd377; + // end inline asm + // begin inline asm + mov.u64 %rd380, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd380, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs113, %rs64; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs113 }, [ %rd381 + 0 ], %rd380; + // end inline asm + // begin inline asm + mov.u64 %rd383, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd383, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs115, %rs64; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs115 }, [ %rd384 + 0 ], %rd383; + // end inline asm + // begin inline asm + mov.u64 %rd386, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd386, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs117, %rs64; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs117 }, [ %rd387 + 0 ], %rd386; + // end inline asm + // begin inline asm + mov.u64 %rd389, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd389, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs119, %rs64; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs119 }, [ %rd390 + 0 ], %rd389; + // end inline asm + // begin inline asm + mov.u64 %rd392, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd392, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs121, %rs64; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs121 }, [ %rd393 + 0 ], %rd392; + // end inline asm + .loc 1 46 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:46:36 + // begin inline asm + mov.u64 %rd395, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd395, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd396, 0x0; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd396 }, [ %rd261 + 0 ], %rd395; + // end inline asm + // begin inline asm + mov.u64 %rd399, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd399, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd400, 0x0; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd400 }, [ %rd265 + 0 ], %rd399; + // end inline asm + // begin inline asm + mov.u64 %rd403, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd403, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd404, 0x0; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd404 }, [ %rd269 + 0 ], %rd403; + // end inline asm + // begin inline asm + mov.u64 %rd407, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd407, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd408, 0x0; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd408 }, [ %rd273 + 0 ], %rd407; + // end inline asm + // begin inline asm + mov.u64 %rd411, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd411, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd412, 0x0; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd412 }, [ %rd277 + 0 ], %rd411; + // end inline asm + // begin inline asm + mov.u64 %rd415, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd415, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd416, 0x0; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd416 }, [ %rd281 + 0 ], %rd415; + // end inline asm + // begin inline asm + mov.u64 %rd419, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd419, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd420, 0x0; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd420 }, [ %rd285 + 0 ], %rd419; + // end inline asm + // begin inline asm + mov.u64 %rd423, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd423, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd424, 0x0; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd424 }, [ %rd289 + 0 ], %rd423; + // end inline asm + .loc 1 50 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:50:35 + shr.s64 %rd491, %rd404, 63; + and.b64 %rd492, %rd491, %rd145; + shr.s64 %rd493, %rd408, 63; + and.b64 %rd494, %rd493, %rd145; + shr.s64 %rd495, %rd396, 63; + and.b64 %rd496, %rd495, %rd145; + shr.s64 %rd497, %rd400, 63; + and.b64 %rd498, %rd497, %rd145; + shr.s64 %rd499, %rd420, 63; + and.b64 %rd500, %rd499, %rd145; + shr.s64 %rd501, %rd424, 63; + and.b64 %rd502, %rd501, %rd145; + shr.s64 %rd503, %rd412, 63; + and.b64 %rd504, %rd503, %rd145; + shr.s64 %rd505, %rd416, 63; + and.b64 %rd506, %rd505, %rd145; + add.s64 %rd127, %rd506, %rd416; + add.s64 %rd126, %rd504, %rd412; + add.s64 %rd129, %rd502, %rd424; + add.s64 %rd128, %rd500, %rd420; + add.s64 %rd123, %rd498, %rd400; + add.s64 %rd122, %rd496, %rd396; + add.s64 %rd125, %rd494, %rd408; + add.s64 %rd124, %rd492, %rd404; + .loc 1 51 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:28 + setp.lt.s64 %p154, %rd124, 0; + setp.lt.s64 %p155, %rd125, 0; + setp.lt.s64 %p156, %rd122, 0; + setp.lt.s64 %p157, %rd123, 0; + setp.lt.s64 %p158, %rd128, 0; + setp.lt.s64 %p159, %rd129, 0; + setp.lt.s64 %p160, %rd126, 0; + setp.lt.s64 %p161, %rd127, 0; + .loc 1 51 100 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:100 + setp.ge.s64 %p162, %rd124, %rd145; + setp.ge.s64 %p163, %rd125, %rd145; + setp.ge.s64 %p164, %rd122, %rd145; + setp.ge.s64 %p165, %rd123, %rd145; + setp.ge.s64 %p166, %rd128, %rd145; + setp.ge.s64 %p167, %rd129, %rd145; + setp.ge.s64 %p168, %rd126, %rd145; + setp.ge.s64 %p169, %rd127, %rd145; + .loc 1 51 65 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:65 + or.pred %p170, %p161, %p169; + or.pred %p171, %p160, %p168; + or.pred %p172, %p159, %p167; + or.pred %p173, %p158, %p166; + or.pred %p174, %p157, %p165; + or.pred %p175, %p156, %p164; + or.pred %p176, %p155, %p163; + or.pred %p177, %p154, %p162; + .loc 1 51 108 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:108 + and.pred %p178, %p19, %p177; + selp.b16 %rs123, 1, 0, %p178; + shl.b16 %rs124, %rs123, 2; + and.pred %p179, %p20, %p176; + selp.b16 %rs125, -1, 0, %p179; + shl.b16 %rs126, %rs125, 3; + or.b16 %rs127, %rs126, %rs124; + and.pred %p180, %p17, %p175; + selp.b16 %rs128, 1, 0, %p180; + and.pred %p181, %p18, %p174; + selp.b16 %rs129, -1, 0, %p181; + shl.b16 %rs130, %rs129, 1; + or.b16 %rs131, %rs128, %rs130; + and.b16 %rs132, %rs131, 3; + or.b16 %rs133, %rs132, %rs127; + and.b16 %rs134, %rs133, 15; + and.pred %p182, %p23, %p173; + selp.b16 %rs135, 1, 0, %p182; + shl.b16 %rs136, %rs135, 2; + and.pred %p183, %p24, %p172; + selp.b16 %rs137, -1, 0, %p183; + shl.b16 %rs138, %rs137, 3; + or.b16 %rs139, %rs138, %rs136; + and.pred %p184, %p21, %p171; + selp.b16 %rs140, 1, 0, %p184; + and.pred %p185, %p22, %p170; + selp.b16 %rs141, -1, 0, %p185; + shl.b16 %rs142, %rs141, 1; + or.b16 %rs143, %rs140, %rs142; + and.b16 %rs144, %rs143, 3; + or.b16 %rs145, %rs144, %rs139; + shl.b16 %rs146, %rs145, 4; + or.b16 %rs147, %rs134, %rs146; + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + and.b16 %rs148, %rs147, 255; + setp.eq.b16 %p186, %rs148, 0; + @%p186 bra $L__BB0_52; + bra.uni $L__BB0_51; +$L__BB0_52: + .loc 1 0 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:126 + ld.param.b64 %rd146, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8]; + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + bar.sync 0; + .loc 1 52 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:37 + sub.s64 %rd537, %rd143, %rd89; + .loc 1 52 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:64 + mul.lo.s64 %rd538, %rd122, %rd143; + mul.lo.s64 %rd539, %rd123, %rd143; + mul.lo.s64 %rd540, %rd124, %rd143; + mul.lo.s64 %rd541, %rd125, %rd143; + mul.lo.s64 %rd542, %rd126, %rd143; + mul.lo.s64 %rd543, %rd127, %rd143; + mul.lo.s64 %rd544, %rd128, %rd143; + mul.lo.s64 %rd545, %rd129, %rd143; + .loc 1 52 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:31 + shl.b64 %rd546, %rd537, 1; + add.s64 %rd547, %rd140, %rd546; + shl.b64 %rd548, %rd9, 1; + add.s64 %rd549, %rd547, %rd548; + shl.b64 %rd550, %rd538, 1; + add.s64 %rd514, %rd549, %rd550; + shl.b64 %rd551, %rd10, 1; + add.s64 %rd552, %rd547, %rd551; + shl.b64 %rd553, %rd539, 1; + add.s64 %rd517, %rd552, %rd553; + shl.b64 %rd554, %rd11, 1; + add.s64 %rd555, %rd547, %rd554; + shl.b64 %rd556, %rd540, 1; + add.s64 %rd520, %rd555, %rd556; + shl.b64 %rd557, %rd12, 1; + add.s64 %rd558, %rd547, %rd557; + shl.b64 %rd559, %rd541, 1; + add.s64 %rd523, %rd558, %rd559; + shl.b64 %rd560, %rd13, 1; + add.s64 %rd561, %rd547, %rd560; + shl.b64 %rd562, %rd542, 1; + add.s64 %rd526, %rd561, %rd562; + shl.b64 %rd563, %rd14, 1; + add.s64 %rd564, %rd547, %rd563; + shl.b64 %rd565, %rd543, 1; + add.s64 %rd529, %rd564, %rd565; + shl.b64 %rd566, %rd15, 1; + add.s64 %rd567, %rd547, %rd566; + shl.b64 %rd568, %rd544, 1; + add.s64 %rd532, %rd567, %rd568; + shl.b64 %rd569, %rd16, 1; + add.s64 %rd570, %rd547, %rd569; + shl.b64 %rd571, %rd545, 1; + add.s64 %rd535, %rd570, %rd571; + .loc 1 52 72 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:72 + // begin inline asm + mov.u64 %rd513, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd513, 1.0; + // end inline asm + mov.b16 %rs150, 0; + // begin inline asm + mov.u16 %rs149, %rs150; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs149 }, [ %rd514 + 0 ], %rd513; + // end inline asm + // begin inline asm + mov.u64 %rd516, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd516, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs151, %rs150; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs151 }, [ %rd517 + 0 ], %rd516; + // end inline asm + // begin inline asm + mov.u64 %rd519, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd519, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs153, %rs150; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs153 }, [ %rd520 + 0 ], %rd519; + // end inline asm + // begin inline asm + mov.u64 %rd522, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd522, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs155, %rs150; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs155 }, [ %rd523 + 0 ], %rd522; + // end inline asm + // begin inline asm + mov.u64 %rd525, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd525, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs157, %rs150; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs157 }, [ %rd526 + 0 ], %rd525; + // end inline asm + // begin inline asm + mov.u64 %rd528, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd528, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs159, %rs150; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs159 }, [ %rd529 + 0 ], %rd528; + // end inline asm + // begin inline asm + mov.u64 %rd531, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd531, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs161, %rs150; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs161 }, [ %rd532 + 0 ], %rd531; + // end inline asm + // begin inline asm + mov.u64 %rd534, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd534, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs163, %rs150; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs163 }, [ %rd535 + 0 ], %rd534; + // end inline asm + .loc 1 61 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:61:35 + shr.s64 %rd572, %rd212, 63; + and.b64 %rd573, %rd572, %rd146; + shr.s64 %rd574, %rd216, 63; + and.b64 %rd575, %rd574, %rd146; + shr.s64 %rd576, %rd204, 63; + and.b64 %rd577, %rd576, %rd146; + shr.s64 %rd578, %rd208, 63; + and.b64 %rd579, %rd578, %rd146; + shr.s64 %rd580, %rd228, 63; + and.b64 %rd581, %rd580, %rd146; + shr.s64 %rd582, %rd232, 63; + and.b64 %rd583, %rd582, %rd146; + shr.s64 %rd584, %rd220, 63; + and.b64 %rd585, %rd584, %rd146; + shr.s64 %rd586, %rd224, 63; + and.b64 %rd587, %rd586, %rd146; + add.s64 %rd135, %rd587, %rd224; + add.s64 %rd134, %rd585, %rd220; + add.s64 %rd137, %rd583, %rd232; + add.s64 %rd136, %rd581, %rd228; + add.s64 %rd131, %rd579, %rd208; + add.s64 %rd130, %rd577, %rd204; + add.s64 %rd133, %rd575, %rd216; + add.s64 %rd132, %rd573, %rd212; + .loc 1 62 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:28 + setp.lt.s64 %p203, %rd132, 0; + setp.lt.s64 %p204, %rd133, 0; + setp.lt.s64 %p205, %rd130, 0; + setp.lt.s64 %p206, %rd131, 0; + setp.lt.s64 %p207, %rd136, 0; + setp.lt.s64 %p208, %rd137, 0; + setp.lt.s64 %p209, %rd134, 0; + setp.lt.s64 %p210, %rd135, 0; + .loc 1 62 46 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:46 + setp.ge.s64 %p211, %rd132, %rd146; + setp.ge.s64 %p212, %rd133, %rd146; + setp.ge.s64 %p213, %rd130, %rd146; + setp.ge.s64 %p214, %rd131, %rd146; + setp.ge.s64 %p215, %rd136, %rd146; + setp.ge.s64 %p216, %rd137, %rd146; + setp.ge.s64 %p217, %rd134, %rd146; + setp.ge.s64 %p218, %rd135, %rd146; + .loc 1 62 38 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:38 + or.pred %p219, %p210, %p218; + or.pred %p220, %p209, %p217; + or.pred %p221, %p208, %p216; + or.pred %p222, %p207, %p215; + or.pred %p223, %p206, %p214; + or.pred %p224, %p205, %p213; + or.pred %p225, %p204, %p212; + or.pred %p226, %p203, %p211; + .loc 1 62 54 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:54 + and.pred %p227, %p3, %p226; + selp.b16 %rs165, 1, 0, %p227; + shl.b16 %rs166, %rs165, 2; + and.pred %p228, %p4, %p225; + selp.b16 %rs167, -1, 0, %p228; + shl.b16 %rs168, %rs167, 3; + or.b16 %rs169, %rs168, %rs166; + and.pred %p229, %p1, %p224; + selp.b16 %rs170, 1, 0, %p229; + and.pred %p230, %p2, %p223; + selp.b16 %rs171, -1, 0, %p230; + shl.b16 %rs172, %rs171, 1; + or.b16 %rs173, %rs170, %rs172; + and.b16 %rs174, %rs173, 3; + or.b16 %rs175, %rs174, %rs169; + and.b16 %rs176, %rs175, 15; + and.pred %p231, %p7, %p222; + selp.b16 %rs177, 1, 0, %p231; + shl.b16 %rs178, %rs177, 2; + and.pred %p232, %p8, %p221; + selp.b16 %rs179, -1, 0, %p232; + shl.b16 %rs180, %rs179, 3; + or.b16 %rs181, %rs180, %rs178; + and.pred %p233, %p5, %p220; + selp.b16 %rs182, 1, 0, %p233; + and.pred %p234, %p6, %p219; + selp.b16 %rs183, -1, 0, %p234; + shl.b16 %rs184, %rs183, 1; + or.b16 %rs185, %rs182, %rs184; + and.b16 %rs186, %rs185, 3; + or.b16 %rs187, %rs186, %rs181; + shl.b16 %rs188, %rs187, 4; + or.b16 %rs189, %rs176, %rs188; + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + and.b16 %rs190, %rs189, 255; + setp.eq.b16 %p235, %rs190, 0; + @%p235 bra $L__BB0_54; + bra.uni $L__BB0_53; +$L__BB0_54: + .loc 1 0 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:64 + ld.param.b64 %rd142, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd141, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3]; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r70, %rs63; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r71, %rs105; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r72, %r70; + fma.rn.f32 %r73, %r72, %r71, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r74, %r73, 0f00000000, %p80; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r75, %rs121; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r76, %rs163; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r77, %r75, %r76; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r78, %r77, 0f00000000, %p153; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r79, %r74, %r78; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r80, %rs61; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r81, %rs103; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r82, %r80; + fma.rn.f32 %r83, %r82, %r81, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r84, %r83, 0f00000000, %p79; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r85, %rs119; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r86, %rs161; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r87, %r85, %r86; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r88, %r87, 0f00000000, %p152; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r89, %r84, %r88; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r90, %rs59; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r91, %rs101; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r92, %r90; + fma.rn.f32 %r93, %r92, %r91, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r94, %r93, 0f00000000, %p78; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r95, %rs117; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r96, %rs159; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r97, %r95, %r96; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r98, %r97, 0f00000000, %p151; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r99, %r94, %r98; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r100, %rs57; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r101, %rs99; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r102, %r100; + fma.rn.f32 %r103, %r102, %r101, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r104, %r103, 0f00000000, %p77; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r105, %rs115; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r106, %rs157; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r107, %r105, %r106; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r108, %r107, 0f00000000, %p150; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r109, %r104, %r108; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r110, %rs55; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r111, %rs97; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r112, %r110; + fma.rn.f32 %r113, %r112, %r111, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r114, %r113, 0f00000000, %p76; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r115, %rs113; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r116, %rs155; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r117, %r115, %r116; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r118, %r117, 0f00000000, %p149; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r119, %r114, %r118; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r120, %rs53; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r121, %rs95; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r122, %r120; + fma.rn.f32 %r123, %r122, %r121, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r124, %r123, 0f00000000, %p75; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r125, %rs111; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r126, %rs153; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r127, %r125, %r126; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r128, %r127, 0f00000000, %p148; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r129, %r124, %r128; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r130, %rs51; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r131, %rs93; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r132, %r130; + fma.rn.f32 %r133, %r132, %r131, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r134, %r133, 0f00000000, %p74; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r135, %rs109; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r136, %rs151; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r137, %r135, %r136; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r138, %r137, 0f00000000, %p147; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r139, %r134, %r138; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r140, %rs49; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r141, %rs91; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r142, %r140; + fma.rn.f32 %r143, %r142, %r141, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r144, %r143, 0f00000000, %p73; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r145, %rs107; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r146, %rs149; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r147, %r145, %r146; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r148, %r147, 0f00000000, %p146; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r149, %r144, %r148; + .loc 1 25 76 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:76 + cvt.f32.bf16 %r150, %rs48; + cvt.f32.bf16 %r151, %rs47; + cvt.f32.bf16 %r152, %rs46; + cvt.f32.bf16 %r153, %rs45; + cvt.f32.bf16 %r154, %rs44; + cvt.f32.bf16 %r155, %rs43; + cvt.f32.bf16 %r156, %rs42; + cvt.f32.bf16 %r157, %rs41; + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + bar.sync 0; + .loc 1 63 40 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:40 + mul.lo.s64 %rd626, %rd130, %rd143; + mul.lo.s64 %rd627, %rd131, %rd143; + mul.lo.s64 %rd628, %rd132, %rd143; + mul.lo.s64 %rd629, %rd133, %rd143; + mul.lo.s64 %rd630, %rd134, %rd143; + mul.lo.s64 %rd631, %rd135, %rd143; + mul.lo.s64 %rd632, %rd136, %rd143; + mul.lo.s64 %rd633, %rd137, %rd143; + .loc 1 63 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:31 + add.s64 %rd635, %rd141, %rd548; + shl.b64 %rd636, %rd626, 1; + add.s64 %rd595, %rd635, %rd636; + add.s64 %rd638, %rd141, %rd551; + shl.b64 %rd639, %rd627, 1; + add.s64 %rd598, %rd638, %rd639; + add.s64 %rd641, %rd141, %rd554; + shl.b64 %rd642, %rd628, 1; + add.s64 %rd601, %rd641, %rd642; + add.s64 %rd644, %rd141, %rd557; + shl.b64 %rd645, %rd629, 1; + add.s64 %rd604, %rd644, %rd645; + add.s64 %rd647, %rd141, %rd560; + shl.b64 %rd648, %rd630, 1; + add.s64 %rd607, %rd647, %rd648; + add.s64 %rd650, %rd141, %rd563; + shl.b64 %rd651, %rd631, 1; + add.s64 %rd610, %rd650, %rd651; + add.s64 %rd653, %rd141, %rd566; + shl.b64 %rd654, %rd632, 1; + add.s64 %rd613, %rd653, %rd654; + add.s64 %rd656, %rd141, %rd569; + shl.b64 %rd657, %rd633, 1; + add.s64 %rd616, %rd656, %rd657; + .loc 1 63 48 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:48 + // begin inline asm + mov.u64 %rd596, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd596, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs191, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs191 }, [ %rd595 + 0 ], %rd596; + // end inline asm + // begin inline asm + mov.u64 %rd599, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd599, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs192, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs192 }, [ %rd598 + 0 ], %rd599; + // end inline asm + // begin inline asm + mov.u64 %rd602, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd602, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs193, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs193 }, [ %rd601 + 0 ], %rd602; + // end inline asm + // begin inline asm + mov.u64 %rd605, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd605, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs194, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs194 }, [ %rd604 + 0 ], %rd605; + // end inline asm + // begin inline asm + mov.u64 %rd608, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd608, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs195, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs195 }, [ %rd607 + 0 ], %rd608; + // end inline asm + // begin inline asm + mov.u64 %rd611, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd611, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs196, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs196 }, [ %rd610 + 0 ], %rd611; + // end inline asm + // begin inline asm + mov.u64 %rd614, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd614, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs197, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs197 }, [ %rd613 + 0 ], %rd614; + // end inline asm + // begin inline asm + mov.u64 %rd617, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd617, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs198, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs198 }, [ %rd616 + 0 ], %rd617; + // end inline asm + .loc 1 63 88 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:88 + cvt.f32.bf16 %r158, %rs191; + cvt.f32.bf16 %r159, %rs192; + cvt.f32.bf16 %r160, %rs193; + cvt.f32.bf16 %r161, %rs194; + cvt.f32.bf16 %r162, %rs195; + cvt.f32.bf16 %r163, %rs196; + cvt.f32.bf16 %r164, %rs197; + cvt.f32.bf16 %r165, %rs198; + .loc 1 65 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:65:20 + fma.rn.f32 %r166, %r157, %r158, %r149; + fma.rn.f32 %r167, %r156, %r159, %r139; + fma.rn.f32 %r168, %r155, %r160, %r129; + fma.rn.f32 %r169, %r154, %r161, %r119; + fma.rn.f32 %r170, %r153, %r162, %r109; + fma.rn.f32 %r171, %r152, %r163, %r99; + fma.rn.f32 %r172, %r151, %r164, %r89; + fma.rn.f32 %r173, %r150, %r165, %r79; + .loc 1 66 25 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:25 + add.s64 %rd618, %rd142, %rd291; + add.s64 %rd619, %rd142, %rd292; + add.s64 %rd620, %rd142, %rd293; + add.s64 %rd621, %rd142, %rd294; + add.s64 %rd622, %rd142, %rd295; + add.s64 %rd623, %rd142, %rd296; + add.s64 %rd624, %rd142, %rd297; + add.s64 %rd625, %rd142, %rd298; + .loc 1 66 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:37 + cvt.rn.bf16.f32 %rs199, %r166; + cvt.rn.bf16.f32 %rs200, %r167; + cvt.rn.bf16.f32 %rs201, %r168; + cvt.rn.bf16.f32 %rs202, %r169; + cvt.rn.bf16.f32 %rs203, %r170; + cvt.rn.bf16.f32 %rs204, %r171; + cvt.rn.bf16.f32 %rs205, %r172; + cvt.rn.bf16.f32 %rs206, %r173; + // begin inline asm + @%p1 st.global.b16 [ %rd618 + 0 ], { %rs199 }; + // end inline asm + // begin inline asm + @%p2 st.global.b16 [ %rd619 + 0 ], { %rs200 }; + // end inline asm + // begin inline asm + @%p3 st.global.b16 [ %rd620 + 0 ], { %rs201 }; + // end inline asm + // begin inline asm + @%p4 st.global.b16 [ %rd621 + 0 ], { %rs202 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd622 + 0 ], { %rs203 }; + // end inline asm + // begin inline asm + @%p6 st.global.b16 [ %rd623 + 0 ], { %rs204 }; + // end inline asm + // begin inline asm + @%p7 st.global.b16 [ %rd624 + 0 ], { %rs205 }; + // end inline asm + // begin inline asm + @%p8 st.global.b16 [ %rd625 + 0 ], { %rs206 }; + // end inline asm + .loc 1 66 4 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:4 + ret; +$L__BB0_49: + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd341, assertFunc_0; + cvta.global.u64 %rd342, %rd341; + st.param.b64 [param3], %rd342; + mov.b64 %rd343, assertFile_0; + cvta.global.u64 %rd344, %rd343; + st.param.b64 [param1], %rd344; + mov.b64 %rd345, assertMessage_0; + cvta.global.u64 %rd346, %rd345; + st.param.b64 [param0], %rd346; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_51: + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd507, assertFunc_1; + cvta.global.u64 %rd508, %rd507; + st.param.b64 [param3], %rd508; + mov.b64 %rd509, assertFile_1; + cvta.global.u64 %rd510, %rd509; + st.param.b64 [param1], %rd510; + mov.b64 %rd511, assertMessage_1; + cvta.global.u64 %rd512, %rd511; + st.param.b64 [param0], %rd512; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_53: + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd588, assertFunc_2; + cvta.global.u64 %rd589, %rd588; + st.param.b64 [param3], %rd589; + mov.b64 %rd590, assertFile_2; + cvta.global.u64 %rd591, %rd590; + st.param.b64 [param1], %rd591; + mov.b64 %rd592, assertMessage_2; + cvta.global.u64 %rd593, %rd592; + st.param.b64 [param0], %rd593; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 122 +.b8 119 +.b8 55 +.b8 103 +.b8 54 +.b8 121 +.b8 106 +.b8 102 +.b8 108 +.b8 112 +.b8 99 +.b8 116 +.b8 99 +.b8 113 +.b8 107 +.b8 122 +.b8 102 +.b8 53 +.b8 111 +.b8 115 +.b8 113 +.b8 55 +.b8 109 +.b8 53 +.b8 97 +.b8 99 +.b8 114 +.b8 99 +.b8 116 +.b8 97 +.b8 121 +.b8 115 +.b8 97 +.b8 54 +.b8 116 +.b8 104 +.b8 51 +.b8 111 +.b8 120 +.b8 51 +.b8 100 +.b8 101 +.b8 105 +.b8 110 +.b8 120 +.b8 108 +.b8 117 +.b8 121 +.b8 112 +.b8 99 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 122 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f67652c251fee25793d33c4c9bb26f55db49bf53 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<1024xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<1024xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<1024xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<1024xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<1024xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<1024xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<1024xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<1024xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<1024xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<1024xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..654c9965637e4fc87077ea2555747b3cfa5e3ec1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/FWEPGR3FZNND66AYQXX4GQT7TCIT4B6L5XAV2CXTIKJFUO4ZTCVQ/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1024xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1024xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<1024xi32> to tensor<1024xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<1024xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1>, tensor<1024xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<1024xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<1024xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<1024xi1> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<1024xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<1024xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1>, tensor<1024xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<1024xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<1024xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<1024xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<1024xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1>, tensor<1024xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<1024xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<1024xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<1024xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<1024xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f6163759070f943aea989ad407ebba67fde0b222 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source", "triton_red_fused__to_copy_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir", "triton_red_fused__to_copy_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir", "triton_red_fused__to_copy_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir", "triton_red_fused__to_copy_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx", "triton_red_fused__to_copy_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin", "triton_red_fused__to_copy_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d401d805eeab3ba0f71041fc259689a634f95cda Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f737489312fbe2bdc5a3a555c46a66a359a2eef3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"hash": "3de7644d9cd8133c07031313b905c396b634b96b77247524ade62d0919a3570b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..7aed08dbabce9bd4fd8a63ac6dbd30907731f53c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %.fr5 = freeze i32 %4, !dbg !8 + %10 = icmp slt i32 %9, %.fr5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 15, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %14 = icmp sgt i32 %5, 0, !dbg !11 + br i1 %14, label %.lr.ph, label %._crit_edge, !dbg !11 + +.lr.ph: ; preds = %8 + %15 = mul i64 %2, %13, !dbg !10 + %16 = getelementptr i32, ptr addrspace(1) %0, i64 %15 + br i1 %10, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %17 = phi i32 [ %23, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %18 = or disjoint i32 %17, %12, !dbg !12 + %19 = sext i32 %18 to i64, !dbg !13 + %20 = getelementptr i32, ptr addrspace(1) %16, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %20, i64 %21, i1 false) #3, !dbg !15 + %23 = add i32 %17, 16, !dbg !11 + %24 = icmp slt i32 %23, %5, !dbg !11 + br i1 %24, label %.lr.ph.split.us, label %._crit_edge, !dbg !11 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %25 = phi i64 [ %33, %.lr.ph.split ], [ 0, %.lr.ph ] + %26 = phi i32 [ %34, %.lr.ph.split ], [ 0, %.lr.ph ] + %27 = or disjoint i32 %26, %12, !dbg !12 + %28 = icmp slt i32 %27, %5, !dbg !16 + %29 = sext i32 %27 to i64, !dbg !13 + %30 = getelementptr i32, ptr addrspace(1) %16, i64 %29, !dbg !14 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %30, i64 %31, i1 %28) #3, !dbg !15 + %narrow = select i1 %28, i32 %32, i32 0, !dbg !17 + %spec.select = sext i32 %narrow to i64, !dbg !17 + %33 = add i64 %25, %spec.select, !dbg !17 + %34 = add i32 %26, 16, !dbg !11 + %35 = icmp slt i32 %34, %5, !dbg !11 + br i1 %35, label %.lr.ph.split, label %._crit_edge, !dbg !11 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %33, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !18 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !19 + %36 = trunc nuw i64 %extelt.offset to i32, !dbg !19 + %37 = trunc i64 %.lcssa to i32, !dbg !19 + %38 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %37, i32 8, i32 31), !dbg !19 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 8, i32 31), !dbg !19 + %40 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !19 + %41 = insertelement <2 x i32> %40, i32 %39, i64 1, !dbg !19 + %42 = bitcast <2 x i32> %41 to i64, !dbg !19 + %43 = add i64 %.lcssa, %42, !dbg !23 + %extelt.offset2 = lshr i64 %43, 32, !dbg !19 + %44 = trunc nuw i64 %extelt.offset2 to i32, !dbg !19 + %45 = trunc i64 %43 to i32, !dbg !19 + %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 4, i32 31), !dbg !19 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !19 + %48 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !19 + %49 = insertelement <2 x i32> %48, i32 %47, i64 1, !dbg !19 + %50 = bitcast <2 x i32> %49 to i64, !dbg !19 + %51 = add i64 %43, %50, !dbg !23 + %extelt.offset3 = lshr i64 %51, 32, !dbg !19 + %52 = trunc nuw i64 %extelt.offset3 to i32, !dbg !19 + %53 = trunc i64 %51 to i32, !dbg !19 + %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 2, i32 31), !dbg !19 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 2, i32 31), !dbg !19 + %56 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !19 + %57 = insertelement <2 x i32> %56, i32 %55, i64 1, !dbg !19 + %58 = bitcast <2 x i32> %57 to i64, !dbg !19 + %59 = add i64 %51, %58, !dbg !23 + %extelt.offset4 = lshr i64 %59, 32, !dbg !19 + %60 = trunc nuw i64 %extelt.offset4 to i32, !dbg !19 + %61 = trunc i64 %59 to i32, !dbg !19 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !19 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 1, i32 31), !dbg !19 + %64 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !19 + %65 = insertelement <2 x i32> %64, i32 %63, i64 1, !dbg !19 + %66 = bitcast <2 x i32> %65 to i64, !dbg !19 + %67 = add i64 %59, %66, !dbg !23 + %.frozen = freeze i64 %3, !dbg !24 + %68 = sdiv i64 %13, %.frozen, !dbg !24 + %69 = mul i64 %68, %.frozen, !dbg !25 + %.decomposed = sub i64 %13, %69, !dbg !25 + %70 = trunc i64 %67 to i32, !dbg !26 + %71 = icmp slt i64 %3, 2, !dbg !27 + %72 = icmp sgt i64 %3, 1, !dbg !28 + %73 = select i1 %72, i64 %3, i64 0, !dbg !29 + %74 = zext i1 %71 to i64, !dbg !30 + %75 = add i64 %73, %74, !dbg !31 + %76 = mul i64 %68, %75, !dbg !32 + %77 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !33 + %78 = getelementptr i32, ptr addrspace(1) %77, i64 %76, !dbg !33 + %79 = and i32 %11, 63, !dbg !34 + %80 = icmp eq i32 %79, 0, !dbg !34 + %81 = and i1 %80, %10, !dbg !34 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %78, i1 %81) #3, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_sum_2", linkageName: "triton_red_fused__to_copy_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 21, scope: !4) +!9 = !DILocation(line: 24, column: 37, scope: !4) +!10 = !DILocation(line: 34, column: 45, scope: !4) +!11 = !DILocation(line: 28, column: 40, scope: !4) +!12 = !DILocation(line: 29, column: 31, scope: !4) +!13 = !DILocation(line: 34, column: 41, scope: !4) +!14 = !DILocation(line: 34, column: 34, scope: !4) +!15 = !DILocation(line: 34, column: 50, scope: !4) +!16 = !DILocation(line: 30, column: 29, scope: !4) +!17 = !DILocation(line: 38, column: 48, scope: !4) +!18 = !DILocation(line: 27, column: 43, scope: !4) +!19 = !DILocation(line: 291, column: 36, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 39, column: 25, scope: !4) +!23 = !DILocation(line: 261, column: 15, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 41, column: 19, scope: !4) +!25 = !DILocation(line: 40, column: 19, scope: !4) +!26 = !DILocation(line: 42, column: 19, scope: !4) +!27 = !DILocation(line: 43, column: 49, scope: !4) +!28 = !DILocation(line: 43, column: 75, scope: !4) +!29 = !DILocation(line: 43, column: 66, scope: !4) +!30 = !DILocation(line: 43, scope: !4) +!31 = !DILocation(line: 43, column: 57, scope: !4) +!32 = !DILocation(line: 43, column: 34, scope: !4) +!33 = !DILocation(line: 43, column: 25, scope: !4) +!34 = !DILocation(line: 43, column: 88, scope: !4) +!35 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9cd3451d496df9b8fc0707d689e7af11fdaa1581 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx @@ -0,0 +1,448 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_sum_2 // -- Begin function triton_red_fused__to_copy_sum_2 + // @triton_red_fused__to_copy_sum_2 +.visible .entry triton_red_fused__to_copy_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_1, + .param .u64 triton_red_fused__to_copy_sum_2_param_2, + .param .u64 triton_red_fused__to_copy_sum_2_param_3, + .param .u32 triton_red_fused__to_copy_sum_2_param_4, + .param .u32 triton_red_fused__to_copy_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_7 +) +.reqntid 64 +{ + .reg .pred %p<13>; + .reg .b32 %r<45>; + .reg .b64 %rd<60>; + .loc 1 18 0 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_sum_2_param_5]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_sum_2_param_3]; +$L__tmp0: + .loc 1 21 28 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:21:28 + mov.u32 %r9, %ctaid.x; + ld.param.b32 %r10, [triton_red_fused__to_copy_sum_2_param_4]; + .loc 1 24 37 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:24:37 + mov.u32 %r2, %tid.x; + .loc 1 34 45 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:45 + cvt.u64.u32 %rd1, %r9; + .loc 1 28 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:28:40 + setp.lt.s32 %p1, %r8, 1; + mov.b64 %rd58, 0; + cvt.u32.u64 %r42, %rd1; + @%p1 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_sum_2_param_2]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_sum_2_param_0]; + and.b32 %r3, %r2, 15; + .loc 1 23 21 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:23:21 + setp.lt.s32 %p2, %r42, %r10; + .loc 1 34 45 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:45 + mul.lo.s64 %rd16, %rd13, %rd1; + shl.b64 %rd17, %rd16, 2; + add.s64 %rd2, %rd11, %rd17; + @%p2 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 45 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:0:45 + mov.b32 %r44, 0; + mov.b64 %rd58, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 30 29 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:30:29 + add.s32 %r17, %r3, %r44; + setp.lt.s32 %p5, %r17, %r8; + .loc 1 34 34 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:34 + mad.wide.s32 %rd24, %r17, 4, %rd2; + .loc 1 34 50 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:50 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r16, 0x0; + @%p5 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd24 + 0 ], %rd23; + // end inline asm + .loc 1 38 48 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:38:48 + selp.b32 %r18, %r16, 0, %p5; + cvt.s64.s32 %rd26, %r18; + add.s64 %rd58, %rd58, %rd26; + .loc 1 28 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:28:40 + add.s32 %r44, %r44, 16; + setp.lt.s32 %p6, %r44, %r8; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:0:40 + mov.b32 %r43, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 34 41 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:41 + add.s32 %r14, %r3, %r43; + .loc 1 34 34 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:34 + mad.wide.s32 %rd19, %r14, 4, %rd2; + .loc 1 34 50 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:34:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + mov.pred %p3, 0; + // begin inline asm + mov.u32 %r13, 0x0; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r13 }, [ %rd19 + 0 ], %rd18; + // end inline asm + .loc 1 28 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:28:40 + add.s32 %r43, %r43, 16; + setp.lt.s32 %p4, %r43, %r8; + @%p4 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 0 40 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:0:40 + ld.param.b64 %rd12, [triton_red_fused__to_copy_sum_2_param_1]; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + mov.b64 {_, %r19}, %rd58; + cvt.u32.u64 %r20, %rd58; + shfl.sync.bfly.b32 %r21, %r20, 8, 31, -1; + shfl.sync.bfly.b32 %r22, %r19, 8, 31, -1; + cvt.u64.u32 %rd27, %r21; + cvt.u64.u32 %rd28, %r22; + shl.b64 %rd29, %rd28, 32; + or.b64 %rd30, %rd27, %rd29; + .loc 2 261 15 // standard.py:261:15 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + add.s64 %rd31, %rd58, %rd30; + .loc 2 291 36 // standard.py:291:36 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + mov.b64 {_, %r23}, %rd31; + cvt.u32.u64 %r24, %rd31; + shfl.sync.bfly.b32 %r25, %r24, 4, 31, -1; + shfl.sync.bfly.b32 %r26, %r23, 4, 31, -1; + cvt.u64.u32 %rd32, %r25; + cvt.u64.u32 %rd33, %r26; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + add.s64 %rd36, %rd31, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + mov.b64 {_, %r27}, %rd36; + cvt.u32.u64 %r28, %rd36; + shfl.sync.bfly.b32 %r29, %r28, 2, 31, -1; + shfl.sync.bfly.b32 %r30, %r27, 2, 31, -1; + cvt.u64.u32 %rd37, %r29; + cvt.u64.u32 %rd38, %r30; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + mov.b64 {_, %r31}, %rd41; + cvt.u32.u64 %r32, %rd41; + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + shfl.sync.bfly.b32 %r34, %r31, 1, 31, -1; + cvt.u64.u32 %rd42, %r33; + cvt.u64.u32 %rd43, %r34; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:39:25 ] + add.s64 %rd6, %rd41, %rd45; +$L__tmp2: + .loc 1 41 19 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:41:19 + and.b64 %rd46, %rd14, -4294967296; + setp.ne.b64 %p7, %rd46, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd59, %rd1, %rd14; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r35, %rd14; + div.u32 %r37, %r42, %r35; + cvt.u64.u32 %rd59, %r37; +$L__BB0_9: + .loc 1 23 21 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:23:21 + setp.lt.s32 %p9, %r42, %r10; + .loc 1 40 19 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:40:19 + mul.lo.s64 %rd48, %rd59, %rd14; + sub.s64 %rd49, %rd1, %rd48; + .loc 1 42 19 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:42:19 + cvt.u32.u64 %r38, %rd6; + .loc 1 43 49 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:49 + setp.lt.s64 %p10, %rd14, 2; + .loc 1 43 75 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:75 + setp.gt.s64 %p11, %rd14, 1; + .loc 1 43 66 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:66 + selp.b64 %rd50, %rd14, 0, %p11; + .loc 1 43 0 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43 + selp.b64 %rd51, 1, 0, %p10; + .loc 1 43 57 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:57 + add.s64 %rd52, %rd50, %rd51; + .loc 1 43 34 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:34 + mul.lo.s64 %rd53, %rd59, %rd52; + .loc 1 43 25 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:25 + shl.b64 %rd54, %rd49, 2; + add.s64 %rd55, %rd12, %rd54; + shl.b64 %rd56, %rd53, 2; + add.s64 %rd47, %rd55, %rd56; + .loc 1 43 88 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:88 + and.b32 %r40, %r2, 63; + setp.eq.b32 %p12, %r40, 0; + and.pred %p8, %p12, %p9; + // begin inline asm + @%p8 st.global.b32 [ %rd47 + 0 ], { %r38 }; + // end inline asm + .loc 1 43 4 // c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 216 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 99 +.b8 109 +.b8 115 +.b8 113 +.b8 98 +.b8 103 +.b8 107 +.b8 114 +.b8 111 +.b8 102 +.b8 122 +.b8 102 +.b8 105 +.b8 107 +.b8 122 +.b8 114 +.b8 110 +.b8 101 +.b8 104 +.b8 118 +.b8 104 +.b8 112 +.b8 52 +.b8 119 +.b8 120 +.b8 104 +.b8 122 +.b8 101 +.b8 52 +.b8 98 +.b8 108 +.b8 121 +.b8 52 +.b8 99 +.b8 116 +.b8 53 +.b8 101 +.b8 100 +.b8 108 +.b8 103 +.b8 51 +.b8 115 +.b8 121 +.b8 105 +.b8 110 +.b8 121 +.b8 54 +.b8 50 +.b8 54 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 50 +.b8 99 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x22 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xad:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc2:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 39 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..b033df428afab2eea24207961b654b0b60bf082e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source @@ -0,0 +1,180 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc40 = loc(unknown) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc47 = loc("in_ptr0"(#loc)) +#loc48 = loc("out_ptr1"(#loc)) +#loc49 = loc("ks0"(#loc)) +#loc50 = loc("ks1"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc79 = loc("input"(#loc38)) +#loc80 = loc("a"(#loc43)) +#loc81 = loc("b"(#loc43)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_0 = arith.constant 1 : i32 loc(#loc54) + %xoffset_1 = arith.constant 1 : i32 loc(#loc54) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc54) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc55) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc56) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc57) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc57) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc58) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc58) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc60) + %_tmp3 = arith.constant 0 : i64 loc(#loc61) + %_tmp3_8 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc61) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %c16_i32 = arith.constant 16 : i32 loc(#loc10) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc10) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc10) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc10) + %3 = ub.poison : i32 loc(#loc10) + %_tmp3_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_8) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc63) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc63) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc64) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc64) + %tmp0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc65) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x1xi64> loc(#loc65) + %tmp0_23 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc66) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc66) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x16xi64> loc(#loc66) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc67) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc67) + %tmp0_28 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc68) + %tmp0_29 = arith.andi %r0_mask_20, %tmp0_28 : tensor<1x16xi1> loc(#loc68) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc69) + %tmp0_31 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc69) + %tmp0_32 = arith.fptosi %tmp0_31 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc69) + %tmp0_33 = tt.load %tmp0_27, %tmp0_29, %tmp0_32 evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc69) + %tmp1 = arith.extsi %tmp0_33 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<1x16xi64> loc(#loc71) + %_tmp3_34 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc72) + %_tmp3_35 = arith.andi %r0_mask_20, %_tmp3_34 : tensor<1x16xi1> loc(#loc72) + %_tmp3_36 = arith.select %_tmp3_35, %tmp4, %_tmp3_18 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc73) + scf.yield %_tmp3_36 : tensor<1x16xi64> loc(#loc22) + } loc(#loc62) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_9) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc74) + %tmp3_10 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc75) + %x2 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc76) + %x2_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc76) + %x2_12 = arith.remsi %x2, %x2_11 : tensor<1x1xi64> loc(#loc76) + %x3 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc77) + %x3_13 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc77) + %x3_14 = arith.divsi %x3, %x3_13 : tensor<1x1xi64> loc(#loc77) + %tmp5 = arith.trunci %tmp3_10 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc78) + %c1_i32 = arith.constant 1 : i32 loc(#loc28) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc28) + %5 = arith.cmpi sge, %4, %ks1 : i64 loc(#loc28) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc29) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc29) + %6 = arith.extui %5 : i1 to i32 loc(#loc29) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc29) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc30) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc30) + %9 = arith.cmpi sgt, %ks1, %8 : i64 loc(#loc30) + %10 = arith.extui %9 : i1 to i64 loc(#loc31) + %11 = arith.muli %ks1, %10 : i64 loc(#loc31) + %12 = arith.extsi %7 : i32 to i64 loc(#loc32) + %13 = arith.addi %12, %11 : i64 loc(#loc32) + %14 = tt.splat %13 : i64 -> tensor<1x1xi64> loc(#loc33) + %15 = arith.muli %x3_14, %14 : tensor<1x1xi64> loc(#loc33) + %16 = arith.addi %x2_12, %15 : tensor<1x1xi64> loc(#loc34) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc35) + %18 = tt.addptr %17, %16 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc35) + tt.store %18, %tmp5, %xmask_6 : tensor<1x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc38))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc39) + tt.reduce.return %2 : i64 loc(#loc39) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc39) + tt.return %0 : tensor<1xi64> loc(#loc41) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc42) + tt.return %1 : tensor<1xi64> loc(#loc42) + } loc(#loc38) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc43)), %b: i64 loc("b"(#loc43))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc44) + tt.return %0 : i64 loc(#loc45) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc46) + tt.return %1 : i64 loc(#loc46) + } loc(#loc43) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":27:43) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":30:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":35:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":37:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:35) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:48) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:8) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:28) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":40:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":42:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:66) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:57) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:30) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:88) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc53 = loc("xoffset"(#loc1)) +#loc54 = loc("xoffset"(#loc2)) +#loc55 = loc("xindex"(#loc3)) +#loc56 = loc("xindex"(#loc4)) +#loc57 = loc("xindex"(#loc5)) +#loc58 = loc("xmask"(#loc6)) +#loc59 = loc("r0_base"(#loc7)) +#loc60 = loc("r0_base"(#loc8)) +#loc61 = loc("_tmp3"(#loc9)) +#loc62 = loc("_tmp3"(#loc10)) +#loc63 = loc("r0_index"(#loc11)) +#loc64 = loc("r0_mask"(#loc12)) +#loc65 = loc("tmp0"(#loc13)) +#loc66 = loc("tmp0"(#loc14)) +#loc67 = loc("tmp0"(#loc15)) +#loc68 = loc("tmp0"(#loc16)) +#loc69 = loc("tmp0"(#loc17)) +#loc70 = loc("tmp1"(#loc18)) +#loc71 = loc("tmp4"(#loc19)) +#loc72 = loc("_tmp3"(#loc20)) +#loc73 = loc("_tmp3"(#loc21)) +#loc74 = loc("tmp3"(#loc23)) +#loc75 = loc("tmp3"(#loc24)) +#loc76 = loc("x2"(#loc25)) +#loc77 = loc("x3"(#loc26)) +#loc78 = loc("tmp5"(#loc27)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..a5599826af46a79093fc07f808cce00616e698e8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir @@ -0,0 +1,123 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:25) +#loc34 = loc("in_ptr0"(#loc)) +#loc35 = loc("out_ptr1"(#loc)) +#loc36 = loc("ks0"(#loc)) +#loc37 = loc("ks1"(#loc)) +#loc38 = loc("xnumel"(#loc)) +#loc39 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp3"(#loc18)) +#loc63 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc40) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc41) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc42) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc42) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc44) + %tmp0_2 = arith.muli %ks0, %tmp0 : i64 loc(#loc44) + %tmp0_3 = tt.splat %tmp0_2 : i64 -> tensor<1x16xi64, #blocked> loc(#loc60) + %tmp0_4 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc46) + %tmp0_5 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_7 = %cst) -> (tensor<1x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32, #blocked> loc(#loc49) + %r0_index_8 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32, #blocked> loc(#loc49) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0_10 = arith.extsi %r0_index_8 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_3 : tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_12 = tt.addptr %tmp0_4, %tmp0_11 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi64, #blocked> loc(#loc46) + %tmp0_13 = arith.andi %r0_mask_9, %tmp0_5 : tensor<1x16xi1, #blocked> loc(#loc47) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 evictionPolicy = evict_first : tensor<1x16x!tt.ptr, #blocked> loc(#loc50) + %tmp1 = arith.extsi %tmp0_14 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc51) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x16xi64, #blocked> loc(#loc52) + %_tmp3_15 = arith.select %tmp0_13, %tmp4, %_tmp3_7 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc53) + scf.yield %_tmp3_15 : tensor<1x16xi64, #blocked> loc(#loc16) + } loc(#loc48) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 loc(callsite(#loc1 at #loc54)), %tmp3_8: i64 loc(callsite(#loc1 at #loc54))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc64) + tt.reduce.return %tmp3_9 : i64 loc(#loc62) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc62) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc20) + %tmp3_6 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc55) + %x2 = arith.remsi %tmp0, %ks1 : i64 loc(#loc56) + %x3 = arith.divsi %tmp0, %ks1 : i64 loc(#loc57) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc58) + %1 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc25) + %2 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc26) + %3 = arith.extui %2 : i1 to i64 loc(#loc27) + %4 = arith.muli %ks1, %3 : i64 loc(#loc27) + %5 = arith.extui %1 : i1 to i64 loc(#loc59) + %6 = arith.addi %5, %4 : i64 loc(#loc28) + %7 = arith.muli %x3, %6 : i64 loc(#loc30) + %8 = arith.addi %x2, %7 : i64 loc(#loc31) + %9 = tt.addptr %out_ptr1, %8 : !tt.ptr, i64 loc(#loc32) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc20) + tt.store %10, %tmp5, %11 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + tt.return loc(#loc33) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":24:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":30:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":35:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":37:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:88) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:49) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:75) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:66) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:57) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:4) +#loc40 = loc("xoffset"(#loc2)) +#loc41 = loc("xmask"(#loc3)) +#loc42 = loc("r0_base"(#loc4)) +#loc43 = loc("r0_mask"(#loc5)) +#loc44 = loc("tmp0"(#loc6)) +#loc45 = loc("tmp0"(#loc7)) +#loc46 = loc("tmp0"(#loc8)) +#loc47 = loc("tmp0"(#loc9)) +#loc48 = loc("_tmp3"(#loc10)) +#loc49 = loc("r0_index"(#loc11)) +#loc50 = loc("tmp0"(#loc12)) +#loc51 = loc("tmp1"(#loc13)) +#loc52 = loc("tmp4"(#loc14)) +#loc53 = loc("_tmp3"(#loc15)) +#loc55 = loc("tmp3"(#loc21)) +#loc56 = loc("x2"(#loc22)) +#loc57 = loc("x3"(#loc23)) +#loc58 = loc("tmp5"(#loc24)) +#loc59 = loc(fused[#loc28, #loc29]) +#loc60 = loc(fused[#loc45, #loc44]) +#loc61 = loc(fused[#loc47, #loc41]) +#loc62 = loc(callsite(#loc17 at #loc54)) +#loc64 = loc(callsite(#loc19 at #loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1beb7208bbf8c997dfc8366bbe331486a171e394 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir @@ -0,0 +1,125 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("out_ptr1"(#loc)) +#loc38 = loc("ks0"(#loc)) +#loc39 = loc("ks1"(#loc)) +#loc40 = loc("xnumel"(#loc)) +#loc41 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp3"(#loc20)) +#loc67 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc42) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc44) + %xmask_0 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc44) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc45) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc46) + %_tmp3_2 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_5 = %_tmp3) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc48) + %r0_index_6 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32> loc(#loc48) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc49) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x16xi32> loc(#loc49) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %tmp0_8 = arith.muli %ks0, %tmp0 : i64 loc(#loc50) + %tmp0_9 = arith.extsi %r0_index_6 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc51) + %tmp0_10 = tt.splat %tmp0_8 : i64 -> tensor<1x16xi64> loc(#loc64) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x16xi64> loc(#loc51) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc52) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc52) + %tmp0_14 = tt.splat %xmask : i1 -> tensor<1x16xi1> loc(#loc65) + %tmp0_15 = arith.andi %r0_mask_7, %tmp0_14 : tensor<1x16xi1> loc(#loc53) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc54) + %tmp1 = arith.extsi %tmp0_16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc55) + %tmp4 = arith.addi %_tmp3_5, %tmp1 : tensor<1x16xi64> loc(#loc56) + %_tmp3_17 = arith.select %tmp0_15, %tmp4, %_tmp3_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc57) + scf.yield %_tmp3_17 : tensor<1x16xi64> loc(#loc18) + } loc(#loc47) + %tmp3 = "tt.reduce"(%_tmp3_2) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_5: i64 loc(callsite(#loc1 at #loc58)), %tmp3_6: i64 loc(callsite(#loc1 at #loc58))): + %tmp3_7 = arith.addi %tmp3_5, %tmp3_6 : i64 loc(#loc68) + tt.reduce.return %tmp3_7 : i64 loc(#loc66) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc66) + %tmp3_3 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc59) + %x2 = arith.extsi %xoffset : i32 to i64 loc(#loc60) + %x2_4 = arith.remsi %x2, %ks1 : i64 loc(#loc60) + %x3 = arith.divsi %x2, %ks1 : i64 loc(#loc61) + %tmp5 = arith.trunci %tmp3_3 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc62) + %0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc26) + %1 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc27) + %2 = arith.extui %1 : i1 to i64 loc(#loc28) + %3 = arith.muli %ks1, %2 : i64 loc(#loc28) + %4 = arith.extui %0 : i1 to i64 loc(#loc63) + %5 = arith.addi %4, %3 : i64 loc(#loc29) + %6 = arith.muli %x3, %5 : i64 loc(#loc31) + %7 = arith.addi %x2_4, %6 : i64 loc(#loc32) + %8 = tt.addptr %out_ptr1, %7 : !tt.ptr, i64 loc(#loc33) + %9 = tt.splat %8 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc33) + tt.store %9, %tmp5, %xmask_0 : tensor<1x1x!tt.ptr> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":28:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":27:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":23:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":24:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":29:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":30:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:60) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":34:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":35:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":37:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:48) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":38:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":39:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":40:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":41:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":42:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:49) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:75) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:57) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:34) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:88) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2c/c2cmsqbgkrofzfikzrnehvhp4wxhze4bly4ct5edlg3syiny626e.py":43:4) +#loc42 = loc("_tmp3"(#loc3)) +#loc43 = loc("xoffset"(#loc4)) +#loc44 = loc("xmask"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("r0_base"(#loc7)) +#loc47 = loc("_tmp3"(#loc2)) +#loc48 = loc("r0_index"(#loc8)) +#loc49 = loc("r0_mask"(#loc9)) +#loc50 = loc("tmp0"(#loc10)) +#loc51 = loc("tmp0"(#loc11)) +#loc52 = loc("tmp0"(#loc12)) +#loc53 = loc("tmp0"(#loc13)) +#loc54 = loc("tmp0"(#loc14)) +#loc55 = loc("tmp1"(#loc15)) +#loc56 = loc("tmp4"(#loc16)) +#loc57 = loc("_tmp3"(#loc17)) +#loc59 = loc("tmp3"(#loc22)) +#loc60 = loc("x2"(#loc23)) +#loc61 = loc("x3"(#loc24)) +#loc62 = loc("tmp5"(#loc25)) +#loc63 = loc(fused[#loc29, #loc30]) +#loc64 = loc(fused[#loc51, #loc50]) +#loc65 = loc(fused[#loc53, #loc44]) +#loc66 = loc(callsite(#loc19 at #loc58)) +#loc68 = loc(callsite(#loc21 at #loc66)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..e4aeafdaec7c87ddf6dd386618a4a15a80c4fe6b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/__grp__triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_6.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source", "triton_poi_fused__to_copy_6.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir", "triton_poi_fused__to_copy_6.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir", "triton_poi_fused__to_copy_6.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir", "triton_poi_fused__to_copy_6.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx", "triton_poi_fused__to_copy_6.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin", "triton_poi_fused__to_copy_6.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0e27fed79985cb7325b38dd0b2266bbd7e10c2ba Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..29ce4d662ad54d9abe8962c008aade5f52f52936 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"hash": "42bb19ed265ccbd7b1943ef4d4583a35db4f47c232332144f4da3ddc335af286", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_6"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir new file mode 100644 index 0000000000000000000000000000000000000000..724580059cfd0d16edcfe5e6e7cc22ee318b3d3d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.llir @@ -0,0 +1,89 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__to_copy_6(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i64 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 7, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 127, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = icmp slt i32 %13, %5, !dbg !11 + %15 = sext i32 %13 to i64, !dbg !12 + %.frozen = freeze i64 %2, !dbg !13 + %16 = sdiv i64 %15, %.frozen, !dbg !13 + %17 = mul i64 %16, %.frozen, !dbg !12 + %.decomposed = sub i64 %15, %17, !dbg !12 + %18 = srem i64 %16, %3, !dbg !14 + %19 = sdiv i64 %15, %4, !dbg !15 + %20 = insertelement <2 x i64> poison, i64 %3, i64 0, !dbg !16 + %21 = insertelement <2 x i64> %20, i64 %2, i64 1, !dbg !16 + %22 = icmp slt <2 x i64> %21, splat (i64 2), !dbg !16 + %23 = icmp sgt <2 x i64> %21, splat (i64 1), !dbg !17 + %24 = select <2 x i1> %23, <2 x i64> %21, <2 x i64> zeroinitializer, !dbg !18 + %25 = zext <2 x i1> %22 to <2 x i64>, !dbg !19 + %26 = add <2 x i64> %24, %25, !dbg !20 + %27 = extractelement <2 x i64> %26, i64 0, !dbg !21 + %28 = mul i64 %.decomposed, %27, !dbg !22 + %29 = extractelement <2 x i64> %26, i64 1, !dbg !21 + %30 = mul i64 %27, %29, !dbg !21 + %31 = mul i64 %30, %19, !dbg !23 + %32 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !24 + %33 = getelementptr i64, ptr addrspace(1) %32, i64 %28, !dbg !24 + %34 = getelementptr i64, ptr addrspace(1) %33, i64 %31, !dbg !24 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !25 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %34, i64 %35, i1 %14) #2, !dbg !25 + %37 = trunc i64 %36 to i32, !dbg !26 + %38 = mul i64 %18, %29, !dbg !27 + %39 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !28 + %40 = getelementptr i32, ptr addrspace(1) %39, i64 %38, !dbg !28 + %41 = getelementptr i32, ptr addrspace(1) %40, i64 %31, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %37, ptr addrspace(1) %41, i1 %14) #2, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_6", linkageName: "triton_poi_fused__to_copy_6", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 22, column: 19, scope: !4) +!13 = !DILocation(line: 23, column: 21, scope: !4) +!14 = !DILocation(line: 23, column: 28, scope: !4) +!15 = !DILocation(line: 24, column: 19, scope: !4) +!16 = !DILocation(line: 25, column: 54, scope: !4) +!17 = !DILocation(line: 25, column: 80, scope: !4) +!18 = !DILocation(line: 25, column: 71, scope: !4) +!19 = !DILocation(line: 25, scope: !4) +!20 = !DILocation(line: 25, column: 62, scope: !4) +!21 = !DILocation(line: 25, column: 91, scope: !4) +!22 = !DILocation(line: 25, column: 39, scope: !4) +!23 = !DILocation(line: 25, column: 138, scope: !4) +!24 = !DILocation(line: 25, column: 30, scope: !4) +!25 = !DILocation(line: 25, column: 186, scope: !4) +!26 = !DILocation(line: 26, column: 19, scope: !4) +!27 = !DILocation(line: 27, column: 34, scope: !4) +!28 = !DILocation(line: 27, column: 25, scope: !4) +!29 = !DILocation(line: 27, column: 187, scope: !4) +!30 = !DILocation(line: 27, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b7ed77fc2d1a3981292ae87d275def0604d68d96 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ptx @@ -0,0 +1,311 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused__to_copy_6 // -- Begin function triton_poi_fused__to_copy_6 + // @triton_poi_fused__to_copy_6 +.visible .entry triton_poi_fused__to_copy_6( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_1, + .param .u64 triton_poi_fused__to_copy_6_param_2, + .param .u64 triton_poi_fused__to_copy_6_param_3, + .param .u64 triton_poi_fused__to_copy_6_param_4, + .param .u32 triton_poi_fused__to_copy_6_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_6, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_7 +) +.reqntid 128 +{ + .reg .pred %p<10>; + .reg .b32 %r<20>; + .reg .b64 %rd<54>; + .loc 1 18 0 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_poi_fused__to_copy_6_param_3]; + ld.param.b64 %rd15, [triton_poi_fused__to_copy_6_param_2]; +$L__tmp0: + .loc 1 19 28 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:19:33 + shl.b32 %r3, %r2, 7; + .loc 1 20 36 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:20:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 20 23 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:20:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 19 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:22:19 + cvt.s64.s32 %rd1, %r6; + .loc 1 23 21 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:23:21 + or.b64 %rd19, %rd1, %rd15; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p1, %rd20, 0; + cvt.u32.u64 %r19, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd51, %rd1, %rd15; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r7, %rd15; + div.u32 %r9, %r19, %r7; + cvt.u64.u32 %rd51, %r9; +$L__BB0_3: + .loc 1 0 21 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:0:21 + ld.param.b64 %rd17, [triton_poi_fused__to_copy_6_param_4]; + .loc 1 22 19 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:22:19 + mul.lo.s64 %rd21, %rd51, %rd15; + .loc 1 23 28 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:23:28 + or.b64 %rd22, %rd51, %rd16; + and.b64 %rd23, %rd22, -4294967296; + setp.ne.b64 %p2, %rd23, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd52, %rd51, %rd16; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r10, %rd16; + cvt.u32.u64 %r11, %rd51; + rem.u32 %r12, %r11, %r10; + cvt.u64.u32 %rd52, %r12; +$L__BB0_6: + .loc 1 0 28 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:0:28 + ld.param.b32 %r1, [triton_poi_fused__to_copy_6_param_5]; + ld.param.b64 %rd14, [triton_poi_fused__to_copy_6_param_1]; + ld.param.b64 %rd13, [triton_poi_fused__to_copy_6_param_0]; + sub.s64 %rd6, %rd1, %rd21; + .loc 1 24 19 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:24:19 + or.b64 %rd24, %rd1, %rd17; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p3, %rd25, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd53, %rd1, %rd17; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r13, %rd17; + div.u32 %r15, %r19, %r13; + cvt.u64.u32 %rd53, %r15; +$L__BB0_9: + .loc 1 21 21 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:21:21 + setp.lt.s32 %p4, %r19, %r1; + .loc 1 25 54 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:54 + setp.lt.s64 %p6, %rd15, 2; + setp.lt.s64 %p7, %rd16, 2; + .loc 1 25 80 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:80 + setp.gt.s64 %p8, %rd15, 1; + setp.gt.s64 %p9, %rd16, 1; + .loc 1 25 71 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:71 + selp.b64 %rd31, %rd16, 0, %p9; + selp.b64 %rd32, %rd15, 0, %p8; + .loc 1 25 0 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25 + selp.b64 %rd33, 1, 0, %p7; + selp.b64 %rd34, 1, 0, %p6; + .loc 1 25 62 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:62 + add.s64 %rd35, %rd32, %rd34; + add.s64 %rd36, %rd31, %rd33; + .loc 1 25 39 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:39 + mul.lo.s64 %rd37, %rd6, %rd36; + .loc 1 25 91 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:91 + mul.lo.s64 %rd38, %rd36, %rd35; + .loc 1 25 138 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:138 + mul.lo.s64 %rd39, %rd38, %rd53; + .loc 1 25 30 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:30 + shl.b64 %rd40, %rd52, 3; + add.s64 %rd41, %rd13, %rd40; + shl.b64 %rd42, %rd37, 3; + add.s64 %rd43, %rd41, %rd42; + shl.b64 %rd44, %rd39, 3; + add.s64 %rd28, %rd43, %rd44; + .loc 1 25 186 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:25:186 + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd27 }, [ %rd28 + 0 ], %rd29; + // end inline asm + .loc 1 26 19 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:26:19 + cvt.u32.u64 %r16, %rd27; + .loc 1 27 34 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:27:34 + mul.lo.s64 %rd45, %rd52, %rd35; + .loc 1 27 25 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:27:25 + shl.b64 %rd46, %rd6, 2; + add.s64 %rd47, %rd14, %rd46; + shl.b64 %rd48, %rd45, 2; + add.s64 %rd49, %rd47, %rd48; + shl.b64 %rd50, %rd39, 2; + add.s64 %rd30, %rd49, %rd50; + .loc 1 27 187 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:27:187 + // begin inline asm + @%p4 st.global.b32 [ %rd30 + 0 ], { %r16 }; + // end inline asm + .loc 1 27 4 // cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py:27:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 101 +.b8 54 +.b8 55 +.b8 103 +.b8 105 +.b8 54 +.b8 109 +.b8 99 +.b8 117 +.b8 101 +.b8 121 +.b8 50 +.b8 112 +.b8 108 +.b8 98 +.b8 97 +.b8 109 +.b8 53 +.b8 52 +.b8 53 +.b8 111 +.b8 112 +.b8 102 +.b8 102 +.b8 103 +.b8 53 +.b8 51 +.b8 51 +.b8 114 +.b8 118 +.b8 106 +.b8 119 +.b8 111 +.b8 113 +.b8 109 +.b8 118 +.b8 99 +.b8 100 +.b8 53 +.b8 52 +.b8 121 +.b8 108 +.b8 112 +.b8 116 +.b8 100 +.b8 120 +.b8 106 +.b8 102 +.b8 50 +.b8 99 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 97 +.b8 101 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source new file mode 100644 index 0000000000000000000000000000000000000000..2c79fe08af0c37c88a0e64e2e997212ba0d16840 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.source @@ -0,0 +1,226 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":18:0) +#loc56 = loc("in_ptr0"(#loc)) +#loc57 = loc("out_ptr0"(#loc)) +#loc58 = loc("ks0"(#loc)) +#loc59 = loc("ks1"(#loc)) +#loc60 = loc("ks2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xoffset_0 = arith.constant 128 : i32 loc(#loc63) + %xoffset_1 = arith.constant 128 : i32 loc(#loc63) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc63) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc64) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc65) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc65) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc66) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc66) + %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc67) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc67) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc67) + %x1 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc68) + %x1_8 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc68) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<128xi64> loc(#loc68) + %x1_10 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc69) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<128xi64> loc(#loc69) + %x2 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc70) + %x2_12 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc70) + %x2_13 = arith.divsi %x2, %x2_12 : tensor<128xi64> loc(#loc70) + %tmp0 = arith.constant 1 : i32 loc(#loc71) + %tmp0_14 = arith.extsi %tmp0 : i32 to i64 loc(#loc71) + %tmp0_15 = arith.cmpi sge, %tmp0_14, %ks1 : i64 loc(#loc71) + %tmp0_16 = arith.constant 1 : i32 loc(#loc72) + %tmp0_17 = arith.constant 1 : i32 loc(#loc72) + %tmp0_18 = arith.extui %tmp0_15 : i1 to i32 loc(#loc72) + %tmp0_19 = arith.muli %tmp0_17, %tmp0_18 : i32 loc(#loc72) + %tmp0_20 = arith.constant 1 : i32 loc(#loc73) + %tmp0_21 = arith.extsi %tmp0_20 : i32 to i64 loc(#loc73) + %tmp0_22 = arith.cmpi sgt, %ks1, %tmp0_21 : i64 loc(#loc73) + %tmp0_23 = arith.extui %tmp0_22 : i1 to i64 loc(#loc74) + %tmp0_24 = arith.muli %ks1, %tmp0_23 : i64 loc(#loc74) + %tmp0_25 = arith.extsi %tmp0_19 : i32 to i64 loc(#loc75) + %tmp0_26 = arith.addi %tmp0_25, %tmp0_24 : i64 loc(#loc75) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<128xi64> loc(#loc76) + %tmp0_28 = arith.muli %x0_7, %tmp0_27 : tensor<128xi64> loc(#loc76) + %tmp0_29 = arith.addi %x1_11, %tmp0_28 : tensor<128xi64> loc(#loc77) + %tmp0_30 = arith.constant 1 : i32 loc(#loc78) + %tmp0_31 = arith.extsi %tmp0_30 : i32 to i64 loc(#loc78) + %tmp0_32 = arith.cmpi sge, %tmp0_31, %ks0 : i64 loc(#loc78) + %tmp0_33 = arith.constant 1 : i32 loc(#loc79) + %tmp0_34 = arith.constant 1 : i32 loc(#loc79) + %tmp0_35 = arith.extui %tmp0_32 : i1 to i32 loc(#loc79) + %tmp0_36 = arith.muli %tmp0_34, %tmp0_35 : i32 loc(#loc79) + %tmp0_37 = arith.constant 1 : i32 loc(#loc80) + %tmp0_38 = arith.extsi %tmp0_37 : i32 to i64 loc(#loc80) + %tmp0_39 = arith.cmpi sgt, %ks0, %tmp0_38 : i64 loc(#loc80) + %tmp0_40 = arith.extui %tmp0_39 : i1 to i64 loc(#loc81) + %tmp0_41 = arith.muli %ks0, %tmp0_40 : i64 loc(#loc81) + %tmp0_42 = arith.extsi %tmp0_36 : i32 to i64 loc(#loc82) + %tmp0_43 = arith.addi %tmp0_42, %tmp0_41 : i64 loc(#loc82) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<128xi64> loc(#loc83) + %tmp0_45 = arith.muli %x2_13, %tmp0_44 : tensor<128xi64> loc(#loc83) + %tmp0_46 = arith.constant 1 : i32 loc(#loc84) + %tmp0_47 = arith.extsi %tmp0_46 : i32 to i64 loc(#loc84) + %tmp0_48 = arith.cmpi sge, %tmp0_47, %ks1 : i64 loc(#loc84) + %tmp0_49 = arith.constant 1 : i32 loc(#loc85) + %tmp0_50 = arith.constant 1 : i32 loc(#loc85) + %tmp0_51 = arith.extui %tmp0_48 : i1 to i32 loc(#loc85) + %tmp0_52 = arith.muli %tmp0_50, %tmp0_51 : i32 loc(#loc85) + %tmp0_53 = arith.constant 1 : i32 loc(#loc86) + %tmp0_54 = arith.extsi %tmp0_53 : i32 to i64 loc(#loc86) + %tmp0_55 = arith.cmpi sgt, %ks1, %tmp0_54 : i64 loc(#loc86) + %tmp0_56 = arith.extui %tmp0_55 : i1 to i64 loc(#loc87) + %tmp0_57 = arith.muli %ks1, %tmp0_56 : i64 loc(#loc87) + %tmp0_58 = arith.extsi %tmp0_52 : i32 to i64 loc(#loc88) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_57 : i64 loc(#loc88) + %tmp0_60 = tt.splat %tmp0_59 : i64 -> tensor<128xi64> loc(#loc89) + %tmp0_61 = arith.muli %tmp0_45, %tmp0_60 : tensor<128xi64> loc(#loc89) + %tmp0_62 = arith.addi %tmp0_29, %tmp0_61 : tensor<128xi64> loc(#loc90) + %tmp0_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc91) + %tmp0_64 = tt.addptr %tmp0_63, %tmp0_62 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc91) + %tmp0_65 = tt.load %tmp0_64, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc92) + %tmp1 = arith.trunci %tmp0_65 : tensor<128xi64> to tensor<128xi32> loc(#loc93) + %c1_i32 = arith.constant 1 : i32 loc(#loc33) + %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc33) + %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc33) + %c1_i32_66 = arith.constant 1 : i32 loc(#loc34) + %c1_i32_67 = arith.constant 1 : i32 loc(#loc34) + %2 = arith.extui %1 : i1 to i32 loc(#loc34) + %3 = arith.muli %c1_i32_67, %2 : i32 loc(#loc34) + %c1_i32_68 = arith.constant 1 : i32 loc(#loc35) + %4 = arith.extsi %c1_i32_68 : i32 to i64 loc(#loc35) + %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc35) + %6 = arith.extui %5 : i1 to i64 loc(#loc36) + %7 = arith.muli %ks0, %6 : i64 loc(#loc36) + %8 = arith.extsi %3 : i32 to i64 loc(#loc37) + %9 = arith.addi %8, %7 : i64 loc(#loc37) + %10 = tt.splat %9 : i64 -> tensor<128xi64> loc(#loc38) + %11 = arith.muli %x1_11, %10 : tensor<128xi64> loc(#loc38) + %12 = arith.addi %x0_7, %11 : tensor<128xi64> loc(#loc39) + %c1_i32_69 = arith.constant 1 : i32 loc(#loc40) + %13 = arith.extsi %c1_i32_69 : i32 to i64 loc(#loc40) + %14 = arith.cmpi sge, %13, %ks0 : i64 loc(#loc40) + %c1_i32_70 = arith.constant 1 : i32 loc(#loc41) + %c1_i32_71 = arith.constant 1 : i32 loc(#loc41) + %15 = arith.extui %14 : i1 to i32 loc(#loc41) + %16 = arith.muli %c1_i32_71, %15 : i32 loc(#loc41) + %c1_i32_72 = arith.constant 1 : i32 loc(#loc42) + %17 = arith.extsi %c1_i32_72 : i32 to i64 loc(#loc42) + %18 = arith.cmpi sgt, %ks0, %17 : i64 loc(#loc42) + %19 = arith.extui %18 : i1 to i64 loc(#loc43) + %20 = arith.muli %ks0, %19 : i64 loc(#loc43) + %21 = arith.extsi %16 : i32 to i64 loc(#loc44) + %22 = arith.addi %21, %20 : i64 loc(#loc44) + %23 = tt.splat %22 : i64 -> tensor<128xi64> loc(#loc45) + %24 = arith.muli %x2_13, %23 : tensor<128xi64> loc(#loc45) + %c1_i32_73 = arith.constant 1 : i32 loc(#loc46) + %25 = arith.extsi %c1_i32_73 : i32 to i64 loc(#loc46) + %26 = arith.cmpi sge, %25, %ks1 : i64 loc(#loc46) + %c1_i32_74 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_75 = arith.constant 1 : i32 loc(#loc47) + %27 = arith.extui %26 : i1 to i32 loc(#loc47) + %28 = arith.muli %c1_i32_75, %27 : i32 loc(#loc47) + %c1_i32_76 = arith.constant 1 : i32 loc(#loc48) + %29 = arith.extsi %c1_i32_76 : i32 to i64 loc(#loc48) + %30 = arith.cmpi sgt, %ks1, %29 : i64 loc(#loc48) + %31 = arith.extui %30 : i1 to i64 loc(#loc49) + %32 = arith.muli %ks1, %31 : i64 loc(#loc49) + %33 = arith.extsi %28 : i32 to i64 loc(#loc50) + %34 = arith.addi %33, %32 : i64 loc(#loc50) + %35 = tt.splat %34 : i64 -> tensor<128xi64> loc(#loc51) + %36 = arith.muli %24, %35 : tensor<128xi64> loc(#loc51) + %37 = arith.addi %12, %36 : tensor<128xi64> loc(#loc52) + %38 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc53) + %39 = tt.addptr %38, %37 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc53) + tt.store %39, %tmp1, %xmask_5 : tensor<128x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:46) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:39) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:106) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:98) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:91) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:153) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:145) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:179) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:170) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:161) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:138) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:186) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":26:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:75) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:57) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:101) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:127) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:118) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:109) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:86) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:148) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:140) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:174) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:165) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:156) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:133) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:82) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:187) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:4) +#loc62 = loc("xoffset"(#loc1)) +#loc63 = loc("xoffset"(#loc2)) +#loc64 = loc("xindex"(#loc3)) +#loc65 = loc("xindex"(#loc4)) +#loc66 = loc("xmask"(#loc5)) +#loc67 = loc("x0"(#loc6)) +#loc68 = loc("x1"(#loc7)) +#loc69 = loc("x1"(#loc8)) +#loc70 = loc("x2"(#loc9)) +#loc71 = loc("tmp0"(#loc10)) +#loc72 = loc("tmp0"(#loc11)) +#loc73 = loc("tmp0"(#loc12)) +#loc74 = loc("tmp0"(#loc13)) +#loc75 = loc("tmp0"(#loc14)) +#loc76 = loc("tmp0"(#loc15)) +#loc77 = loc("tmp0"(#loc16)) +#loc78 = loc("tmp0"(#loc17)) +#loc79 = loc("tmp0"(#loc18)) +#loc80 = loc("tmp0"(#loc19)) +#loc81 = loc("tmp0"(#loc20)) +#loc82 = loc("tmp0"(#loc21)) +#loc83 = loc("tmp0"(#loc22)) +#loc84 = loc("tmp0"(#loc23)) +#loc85 = loc("tmp0"(#loc24)) +#loc86 = loc("tmp0"(#loc25)) +#loc87 = loc("tmp0"(#loc26)) +#loc88 = loc("tmp0"(#loc27)) +#loc89 = loc("tmp0"(#loc28)) +#loc90 = loc("tmp0"(#loc29)) +#loc91 = loc("tmp0"(#loc30)) +#loc92 = loc("tmp0"(#loc31)) +#loc93 = loc("tmp1"(#loc32)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..37ca127816798f878f9bf84f92e0f3f81149ca7a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttgir @@ -0,0 +1,122 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":18:0) +#loc35 = loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64, #blocked> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64, #blocked> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64, #blocked> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64, #blocked> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<128xi64, #blocked> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64, #blocked> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64, #blocked> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr, #blocked> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64, #blocked> to tensor<128xi32, #blocked> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64, #blocked> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<128xi64, #blocked> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<128xi64, #blocked> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr, #blocked> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:82) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cdc9565e5e8aa4426506d7bde3f1c90601b788e6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/IK5RT3JGLTF5PMMUH32NIWB2GXNU6R6CGIZSCRHU3I65YM226KDA/triton_poi_fused__to_copy_6.ttir @@ -0,0 +1,121 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":18:0) +#loc35 = loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<128xi32> to tensor<128xi64> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<128xi64> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<128xi64> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<128xi64> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<128xi64> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<128xi64> loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<128xi64> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<128xi64> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<128xi64> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<128xi64> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<128xi64> to tensor<128xi32> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<128xi64> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<128xi64> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<128xi64> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<128x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:82) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ae/cae67gi6mcuey2plbam545opffg533rvjwoqmvcd54ylptdxjf2c.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..88fa9d8ea95892a1442ec782935dec7ba3361983 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..cc25a891dfa44da6f4cd490733528acda94de04e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9511149dc32a2d666aeaefa4ec5b43ed6533d3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "571e82f1c43639fab6318c86608c3ec5501b4c57931b7c7476d3dfbfbd6a706d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..394ebbd2df2605ef84806ad02bac831439d61be2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 128, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not = icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg !55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + +119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = !DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) +!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..62c4b042ec82c8807fb15dd92ebd19802fd718b5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 113, 47, 99, 112, 113, 50, 104, 113, 118, 51, 109, 110, 112, 97, 105, 52, 98, 101, 51, 109, 102, 51, 100, 107, 117, 53, 108, 122, 116, 100, 100, 98, 104, 116, 111, 113, 99, 112, 102, 52, 120, 117, 101, 54, 54, 120, 108, 55, 51, 115, 98, 109, 103, 120, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:24:21 + setp.lt.u32 %p3, %r49, 128; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, %r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:34 + mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:24:21 + setp.lt.u32 %p10, %r49, 128; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 %rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm + @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:130 + bar.sync 0; + .loc 1 61 29 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:61:94 + and.pred %p27, %p30, %p18; + // begin inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 113 +.b8 50 +.b8 104 +.b8 113 +.b8 118 +.b8 51 +.b8 109 +.b8 110 +.b8 112 +.b8 97 +.b8 105 +.b8 52 +.b8 98 +.b8 101 +.b8 51 +.b8 109 +.b8 102 +.b8 51 +.b8 100 +.b8 107 +.b8 117 +.b8 53 +.b8 108 +.b8 122 +.b8 116 +.b8 100 +.b8 100 +.b8 98 +.b8 104 +.b8 116 +.b8 111 +.b8 113 +.b8 99 +.b8 112 +.b8 102 +.b8 52 +.b8 120 +.b8 117 +.b8 101 +.b8 54 +.b8 54 +.b8 120 +.b8 108 +.b8 55 +.b8 51 +.b8 115 +.b8 98 +.b8 109 +.b8 103 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..124b2741f0d55f66e636f9dfb0f885aa5144781c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) + %_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":36:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:47) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:95) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..57e554b58a8e4ebe1c2e03017b74863d976851d4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,271 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + %tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":55:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:58) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:94) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..61687c725097b21d296c4f396cede093cafdf0c0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,247 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":18:0) +#loc2 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc2 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 128 : i32 loc(#loc78) + %c128_i64 = arith.constant 128 : i64 loc(#loc2) + %c0_i64 = arith.constant 0 : i64 loc(#loc2) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc2) + %c32_i32 = arith.constant 32 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc2) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc2) + %c127_i64 = arith.constant 127 : i64 loc(#loc2) + %c1_i64 = arith.constant 1 : i64 loc(#loc2) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xmask_3 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %xmask_4 = tt.splat %xmask_3 : i1 -> tensor<1x1xi1> loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_7 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_8 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_10 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_11 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_12 = tt.splat %tmp0_10 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x32xi64> loc(#loc86) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_15 = tt.addptr %tmp0_14, %tmp0_13 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_16 = tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc124) + %tmp0_17 = arith.andi %r0_mask_9, %tmp0_16 : tensor<1x32xi1> loc(#loc88) + %tmp0_18 = tt.load %tmp0_15, %tmp0_17, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_18 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_19 = arith.select %tmp0_17, %tmp4, %_tmp3_7 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_19 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 loc(callsite(#loc2 at #loc93)), %tmp3_8: i64 loc(callsite(#loc2 at #loc93))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc135) + tt.reduce.return %tmp3_9 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_6 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_7 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_9 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_10 = arith.extui %tmp6_9 : i1 to i64 loc(#loc100) + %tmp6_11 = arith.muli %ks0, %tmp6_10 : i64 loc(#loc100) + %tmp6_12 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_13 = arith.addi %tmp6_12, %tmp6_11 : i64 loc(#loc101) + %tmp6_14 = arith.extsi %xoffset : i32 to i64 loc(#loc103) + %tmp6_15 = arith.muli %tmp6_14, %tmp6_13 : i64 loc(#loc103) + %tmp6_16 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_17 = tt.splat %tmp6_15 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_18 = arith.addi %tmp6_16, %tmp6_17 : tensor<1x32xi64> loc(#loc104) + %tmp6_19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_20 = tt.addptr %tmp6_19, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_21 = tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_22 = arith.andi %r0_mask_8, %tmp6_21 : tensor<1x32xi1> loc(#loc106) + %tmp6_23 = tt.load %tmp6_20, %tmp6_22, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_23 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_24 = arith.cmpi slt, %r0_index_7, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_25 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_26 = arith.select %tmp9_24, %tmp11, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_27 = arith.addi %tmp11_26, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_26, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_27, %tmp11_26 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_28 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_29 = arith.select %fixed, %fixed_28, %quot : i64 loc(#loc134) + %4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_29, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_22, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_14 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":24:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":22:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":25:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":25:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:100) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:53) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:58) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pq/cpq2hqv3mnpai4be3mf3dku5lztddbhtoqcpf4xue66xl73sbmgx.py":43:4) +#loc78 = loc("xmask"(#loc1)) +#loc79 = loc("xoffset"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) +#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc78]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, #loc103]) +#loc129 = loc(fused[#loc106, #loc78]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/__grp__triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/__grp__triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..13d8a74f9ce850353affda5b05dee7dbfd835f0a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/__grp__triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.source", "triton_red_fused_argmax_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttir", "triton_red_fused_argmax_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttgir", "triton_red_fused_argmax_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.llir", "triton_red_fused_argmax_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ptx", "triton_red_fused_argmax_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin", "triton_red_fused_argmax_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c7cdc355d732897c9a01c034649d043a7e8b1b93 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..071b37e97f64442f64dccf24618534c25e302837 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"hash": "590e2280885d755accf0b3fbd436c07fecc1e070055bf4c7a9ef3c4566427d6c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..16e7c24ef0f6ae128d5323ba42598c2c00a2d116 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.llir @@ -0,0 +1,433 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %9 = shl nuw nsw i32 %8, 2, !dbg !8 + %10 = and i32 %9, 2044, !dbg !8 + %11 = mul i32 %7, 32000, !dbg !9 + %12 = zext nneg i32 %10 to i64, !dbg !10 + br label %13, !dbg !10 + +13: ; preds = %6, %13 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %13 ] + %14 = phi i32 [ 2147483647, %6 ], [ %100, %13 ] + %15 = phi i32 [ 2147483647, %6 ], [ %101, %13 ] + %16 = phi float [ 0xFFF0000000000000, %6 ], [ %97, %13 ] + %17 = phi float [ 0xFFF0000000000000, %6 ], [ %98, %13 ] + %18 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %96, %13 ] + %19 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %99, %13 ] + %20 = or disjoint i64 %indvars.iv, %12, !dbg !11 + %21 = or disjoint i64 %20, 2, !dbg !11 + %22 = or disjoint i64 %20, 3, !dbg !11 + %23 = icmp samesign ult i64 %20, 32000, !dbg !12 + %24 = trunc nuw nsw i64 %20 to i32, !dbg !13 + %25 = add i32 %11, %24, !dbg !13 + %26 = sext i32 %25 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !14 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %29 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %27, i64 %28, i1 %23) #4, !dbg !15 + %30 = extractvalue { i32, i32 } %29, 0, !dbg !15 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15 + %32 = extractvalue { i32, i32 } %29, 1, !dbg !15 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15 + %34 = extractelement <2 x bfloat> %33, i64 0, !dbg !15 + %35 = extractelement <2 x bfloat> %33, i64 1, !dbg !15 + %36 = fpext bfloat %34 to float, !dbg !16 + %37 = fpext bfloat %35 to float, !dbg !16 + %38 = fcmp ogt float %16, %36, !dbg !17 + %39 = fcmp ogt float %17, %37, !dbg !17 + %40 = fcmp oeq float %16, %36, !dbg !21 + %41 = fcmp oeq float %17, %37, !dbg !21 + %42 = fcmp uno <2 x float> %18, zeroinitializer, !dbg !22 + %43 = fcmp uno float %16, 0.000000e+00, !dbg !22 + %44 = fcmp uno float %17, 0.000000e+00, !dbg !22 + %45 = fcmp uno bfloat %34, 0xR0000, !dbg !23 + %46 = fcmp uno bfloat %35, 0xR0000, !dbg !23 + %47 = xor i1 %45, true, !dbg !24 + %48 = xor i1 %46, true, !dbg !24 + %49 = and i1 %43, %47, !dbg !25 + %50 = and i1 %44, %48, !dbg !25 + %51 = or i1 %38, %49, !dbg !26 + %52 = or i1 %39, %50, !dbg !26 + %53 = and i1 %43, %45, !dbg !27 + %54 = and i1 %44, %46, !dbg !27 + %55 = or i1 %40, %53, !dbg !28 + %56 = or i1 %41, %54, !dbg !28 + %57 = sext <2 x i32> %19 to <2 x i64>, !dbg !29 + %58 = sext i32 %14 to i64, !dbg !29 + %59 = icmp sgt i64 %21, %58, !dbg !29 + %60 = sext i32 %15 to i64, !dbg !29 + %61 = icmp sgt i64 %22, %60, !dbg !29 + %62 = and i1 %59, %55, !dbg !30 + %63 = and i1 %61, %56, !dbg !30 + %64 = or i1 %51, %62, !dbg !31 + %65 = or i1 %52, %63, !dbg !31 + %66 = select i1 %64, float %16, float %36, !dbg !32 + %67 = select i1 %65, float %17, float %37, !dbg !32 + %68 = trunc nuw nsw i64 %20 to i32, !dbg !33 + %69 = or disjoint i32 %68, 1, !dbg !33 + %70 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !16 + %71 = fcmp ogt <2 x float> %18, %70, !dbg !17 + %72 = fcmp oeq <2 x float> %18, %70, !dbg !21 + %73 = fcmp uno <2 x bfloat> %31, zeroinitializer, !dbg !23 + %74 = xor <2 x i1> %73, splat (i1 true), !dbg !24 + %75 = and <2 x i1> %42, %74, !dbg !25 + %76 = or <2 x i1> %71, %75, !dbg !26 + %77 = and <2 x i1> %42, %73, !dbg !27 + %78 = or <2 x i1> %72, %77, !dbg !28 + %79 = insertelement <2 x i64> poison, i64 %20, i64 0, !dbg !29 + %80 = shufflevector <2 x i64> %79, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !29 + %81 = icmp sgt <2 x i64> %80, %57, !dbg !29 + %82 = icmp sge <2 x i64> %80, %57, !dbg !29 + %83 = shufflevector <2 x i1> %81, <2 x i1> %82, <2 x i32> , !dbg !29 + %84 = and <2 x i1> %83, %78, !dbg !30 + %85 = or <2 x i1> %76, %84, !dbg !31 + %86 = select <2 x i1> %85, <2 x float> %18, <2 x float> %70, !dbg !32 + %87 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !33 + %88 = insertelement <2 x i32> %87, i32 %69, i64 1, !dbg !33 + %89 = select <2 x i1> %85, <2 x i32> %19, <2 x i32> %88, !dbg !33 + %90 = trunc nuw nsw i64 %21 to i32, !dbg !33 + %91 = select i1 %64, i32 %14, i32 %90, !dbg !33 + %92 = trunc nuw nsw i64 %22 to i32, !dbg !33 + %93 = select i1 %65, i32 %15, i32 %92, !dbg !33 + %94 = insertelement <2 x i1> poison, i1 %23, i64 0, !dbg !34 + %95 = shufflevector <2 x i1> %94, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !34 + %96 = select <2 x i1> %95, <2 x float> %86, <2 x float> %18, !dbg !34 + %97 = select i1 %23, float %66, float %16, !dbg !34 + %98 = select i1 %23, float %67, float %17, !dbg !34 + %99 = select <2 x i1> %95, <2 x i32> %89, <2 x i32> %19, !dbg !35 + %100 = select i1 %23, i32 %91, i32 %14, !dbg !35 + %101 = select i1 %23, i32 %93, i32 %15, !dbg !35 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !10 + %102 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !10 + br i1 %102, label %13, label %103, !dbg !10 + +103: ; preds = %13 + %104 = and i32 %8, 31, !dbg !8 + %105 = lshr i32 %8, 5, !dbg !8 + %106 = shufflevector <2 x float> %96, <2 x float> poison, <2 x i32> , !dbg !36 + %107 = fcmp ogt <2 x float> %96, %106, !dbg !36 + %108 = fcmp oeq <2 x float> %96, %106, !dbg !36 + %109 = shufflevector <2 x i1> %107, <2 x i1> %108, <2 x i32> , !dbg !36 + %110 = extractelement <2 x float> %96, i64 0, !dbg !38 + %111 = fcmp uno float %110, 0.000000e+00, !dbg !38 + %112 = extractelement <2 x float> %96, i64 1, !dbg !39 + %113 = fcmp uno float %112, 0.000000e+00, !dbg !39 + %114 = xor i1 %113, true, !dbg !40 + %115 = insertelement <2 x i1> poison, i1 %111, i64 0, !dbg !41 + %116 = shufflevector <2 x i1> %115, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41 + %117 = insertelement <2 x i1> poison, i1 %114, i64 0, !dbg !41 + %118 = insertelement <2 x i1> %117, i1 %113, i64 1, !dbg !41 + %119 = and <2 x i1> %116, %118, !dbg !41 + %120 = or <2 x i1> %109, %119, !dbg !42 + %121 = extractelement <2 x i32> %99, i64 0, !dbg !43 + %122 = extractelement <2 x i32> %99, i64 1, !dbg !43 + %123 = icmp slt i32 %121, %122, !dbg !43 + %124 = extractelement <2 x i1> %120, i64 1, !dbg !44 + %125 = and i1 %123, %124, !dbg !44 + %126 = extractelement <2 x i1> %120, i64 0, !dbg !45 + %127 = or i1 %126, %125, !dbg !45 + %128 = select i1 %127, float %110, float %112, !dbg !46 + %129 = select i1 %127, i32 %121, i32 %122, !dbg !47 + %130 = fcmp ogt float %128, %97, !dbg !36 + %131 = fcmp oeq float %128, %97, !dbg !48 + %132 = fcmp uno float %128, 0.000000e+00, !dbg !38 + %133 = fcmp uno float %97, 0.000000e+00, !dbg !39 + %134 = xor i1 %133, true, !dbg !40 + %135 = and i1 %132, %134, !dbg !41 + %136 = or i1 %130, %135, !dbg !42 + %137 = and i1 %133, %132, !dbg !49 + %138 = or i1 %131, %137, !dbg !50 + %139 = icmp slt i32 %129, %100, !dbg !43 + %140 = and i1 %139, %138, !dbg !44 + %141 = or i1 %136, %140, !dbg !45 + %142 = select i1 %141, float %128, float %97, !dbg !46 + %143 = select i1 %141, i32 %129, i32 %100, !dbg !47 + %144 = fcmp ogt float %142, %98, !dbg !36 + %145 = fcmp oeq float %142, %98, !dbg !48 + %146 = fcmp uno float %142, 0.000000e+00, !dbg !38 + %147 = fcmp uno float %98, 0.000000e+00, !dbg !39 + %148 = xor i1 %147, true, !dbg !40 + %149 = and i1 %146, %148, !dbg !41 + %150 = or i1 %144, %149, !dbg !42 + %151 = and i1 %147, %146, !dbg !49 + %152 = or i1 %145, %151, !dbg !50 + %153 = icmp slt i32 %143, %101, !dbg !43 + %154 = and i1 %153, %152, !dbg !44 + %155 = or i1 %150, %154, !dbg !45 + %156 = select i1 %155, float %142, float %98, !dbg !46 + %157 = select i1 %155, i32 %143, i32 %101, !dbg !47 + %158 = bitcast float %156 to i32, !dbg !51 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !51 + %160 = bitcast i32 %159 to float, !dbg !51 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !51 + %162 = fcmp ogt float %156, %160, !dbg !36 + %163 = fcmp oeq float %156, %160, !dbg !48 + %164 = fcmp uno float %156, 0.000000e+00, !dbg !38 + %165 = fcmp uno float %160, 0.000000e+00, !dbg !39 + %166 = xor i1 %165, true, !dbg !40 + %167 = and i1 %164, %166, !dbg !41 + %168 = or i1 %162, %167, !dbg !42 + %169 = and i1 %164, %165, !dbg !49 + %170 = or i1 %163, %169, !dbg !50 + %171 = icmp slt i32 %157, %161, !dbg !43 + %172 = and i1 %171, %170, !dbg !44 + %173 = or i1 %168, %172, !dbg !45 + %174 = select i1 %173, float %156, float %160, !dbg !46 + %175 = select i1 %173, i32 %157, i32 %161, !dbg !47 + %176 = bitcast float %174 to i32, !dbg !51 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !51 + %178 = bitcast i32 %177 to float, !dbg !51 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 8, i32 31), !dbg !51 + %180 = fcmp ogt float %174, %178, !dbg !36 + %181 = fcmp oeq float %174, %178, !dbg !48 + %182 = fcmp uno float %174, 0.000000e+00, !dbg !38 + %183 = fcmp uno float %178, 0.000000e+00, !dbg !39 + %184 = xor i1 %183, true, !dbg !40 + %185 = and i1 %182, %184, !dbg !41 + %186 = or i1 %180, %185, !dbg !42 + %187 = and i1 %183, %182, !dbg !49 + %188 = or i1 %181, %187, !dbg !50 + %189 = icmp slt i32 %175, %179, !dbg !43 + %190 = and i1 %189, %188, !dbg !44 + %191 = or i1 %186, %190, !dbg !45 + %192 = select i1 %191, float %174, float %178, !dbg !46 + %193 = select i1 %191, i32 %175, i32 %179, !dbg !47 + %194 = bitcast float %192 to i32, !dbg !51 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !51 + %196 = bitcast i32 %195 to float, !dbg !51 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 4, i32 31), !dbg !51 + %198 = fcmp ogt float %192, %196, !dbg !36 + %199 = fcmp oeq float %192, %196, !dbg !48 + %200 = fcmp uno float %192, 0.000000e+00, !dbg !38 + %201 = fcmp uno float %196, 0.000000e+00, !dbg !39 + %202 = xor i1 %201, true, !dbg !40 + %203 = and i1 %200, %202, !dbg !41 + %204 = or i1 %198, %203, !dbg !42 + %205 = and i1 %201, %200, !dbg !49 + %206 = or i1 %199, %205, !dbg !50 + %207 = icmp slt i32 %193, %197, !dbg !43 + %208 = and i1 %207, %206, !dbg !44 + %209 = or i1 %204, %208, !dbg !45 + %210 = select i1 %209, float %192, float %196, !dbg !46 + %211 = select i1 %209, i32 %193, i32 %197, !dbg !47 + %212 = bitcast float %210 to i32, !dbg !51 + %213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 2, i32 31), !dbg !51 + %214 = bitcast i32 %213 to float, !dbg !51 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 2, i32 31), !dbg !51 + %216 = fcmp ogt float %210, %214, !dbg !36 + %217 = fcmp oeq float %210, %214, !dbg !48 + %218 = fcmp uno float %210, 0.000000e+00, !dbg !38 + %219 = fcmp uno float %214, 0.000000e+00, !dbg !39 + %220 = xor i1 %219, true, !dbg !40 + %221 = and i1 %218, %220, !dbg !41 + %222 = or i1 %216, %221, !dbg !42 + %223 = and i1 %219, %218, !dbg !49 + %224 = or i1 %217, %223, !dbg !50 + %225 = icmp slt i32 %211, %215, !dbg !43 + %226 = and i1 %225, %224, !dbg !44 + %227 = or i1 %222, %226, !dbg !45 + %228 = select i1 %227, float %210, float %214, !dbg !46 + %229 = select i1 %227, i32 %211, i32 %215, !dbg !47 + %230 = bitcast float %228 to i32, !dbg !51 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !51 + %232 = bitcast i32 %231 to float, !dbg !51 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !51 + %234 = fcmp ogt float %228, %232, !dbg !36 + %235 = fcmp oeq float %228, %232, !dbg !48 + %236 = fcmp uno float %228, 0.000000e+00, !dbg !38 + %237 = fcmp uno float %232, 0.000000e+00, !dbg !39 + %238 = xor i1 %237, true, !dbg !40 + %239 = and i1 %236, %238, !dbg !41 + %240 = or i1 %234, %239, !dbg !42 + %241 = and i1 %237, %236, !dbg !49 + %242 = or i1 %235, %241, !dbg !50 + %243 = icmp slt i32 %229, %233, !dbg !43 + %244 = and i1 %243, %242, !dbg !44 + %245 = or i1 %240, %244, !dbg !45 + %246 = select i1 %245, i32 %229, i32 %233, !dbg !47 + %247 = and i32 %105, 15, !dbg !51 + %248 = icmp eq i32 %104, 0, !dbg !51 + %249 = getelementptr float, ptr addrspace(3) @global_smem, i32 %247, !dbg !51 + %250 = select i1 %245, i32 %230, i32 %231, !dbg !46 + %251 = insertelement <1 x i32> poison, i32 %250, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %249, <1 x i32> %251, i1 %248) #4, !dbg !51 + %252 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %247, !dbg !51 + %253 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %252, <1 x i32> %253, i1 %248) #4, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %254 = icmp samesign ult i32 %8, 16, !dbg !51 + %255 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !51 + %256 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %254) #4, !dbg !51 + %257 = bitcast i32 %256 to float, !dbg !51 + %258 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %8, !dbg !51 + %259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %258, i1 %254) #4, !dbg !51 + %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 8, i32 31), !dbg !51 + %261 = bitcast i32 %260 to float, !dbg !51 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !51 + %263 = fcmp ogt float %257, %261, !dbg !36 + %264 = fcmp oeq float %257, %261, !dbg !48 + %265 = fcmp uno float %257, 0.000000e+00, !dbg !38 + %266 = fcmp uno float %261, 0.000000e+00, !dbg !39 + %267 = xor i1 %266, true, !dbg !40 + %268 = and i1 %265, %267, !dbg !41 + %269 = or i1 %263, %268, !dbg !42 + %270 = and i1 %265, %266, !dbg !49 + %271 = or i1 %264, %270, !dbg !50 + %272 = icmp slt i32 %259, %262, !dbg !43 + %273 = and i1 %272, %271, !dbg !44 + %274 = or i1 %269, %273, !dbg !45 + %275 = select i1 %274, float %257, float %261, !dbg !46 + %276 = select i1 %274, i32 %259, i32 %262, !dbg !47 + %277 = bitcast float %275 to i32, !dbg !51 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 4, i32 31), !dbg !51 + %279 = bitcast i32 %278 to float, !dbg !51 + %280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 4, i32 31), !dbg !51 + %281 = fcmp ogt float %275, %279, !dbg !36 + %282 = fcmp oeq float %275, %279, !dbg !48 + %283 = fcmp uno float %275, 0.000000e+00, !dbg !38 + %284 = fcmp uno float %279, 0.000000e+00, !dbg !39 + %285 = xor i1 %284, true, !dbg !40 + %286 = and i1 %283, %285, !dbg !41 + %287 = or i1 %281, %286, !dbg !42 + %288 = and i1 %284, %283, !dbg !49 + %289 = or i1 %282, %288, !dbg !50 + %290 = icmp slt i32 %276, %280, !dbg !43 + %291 = and i1 %290, %289, !dbg !44 + %292 = or i1 %287, %291, !dbg !45 + %293 = select i1 %292, float %275, float %279, !dbg !46 + %294 = select i1 %292, i32 %276, i32 %280, !dbg !47 + %295 = bitcast float %293 to i32, !dbg !51 + %296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 2, i32 31), !dbg !51 + %297 = bitcast i32 %296 to float, !dbg !51 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !51 + %299 = fcmp ogt float %293, %297, !dbg !36 + %300 = fcmp oeq float %293, %297, !dbg !48 + %301 = fcmp uno float %293, 0.000000e+00, !dbg !38 + %302 = fcmp uno float %297, 0.000000e+00, !dbg !39 + %303 = xor i1 %302, true, !dbg !40 + %304 = and i1 %301, %303, !dbg !41 + %305 = or i1 %299, %304, !dbg !42 + %306 = and i1 %302, %301, !dbg !49 + %307 = or i1 %300, %306, !dbg !50 + %308 = icmp slt i32 %294, %298, !dbg !43 + %309 = and i1 %308, %307, !dbg !44 + %310 = or i1 %305, %309, !dbg !45 + %311 = select i1 %310, float %293, float %297, !dbg !46 + %312 = select i1 %310, i32 %294, i32 %298, !dbg !47 + %313 = bitcast float %311 to i32, !dbg !51 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !51 + %315 = bitcast i32 %314 to float, !dbg !51 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !51 + %317 = fcmp ogt float %311, %315, !dbg !36 + %318 = fcmp oeq float %311, %315, !dbg !48 + %319 = fcmp uno float %311, 0.000000e+00, !dbg !38 + %320 = fcmp uno float %315, 0.000000e+00, !dbg !39 + %321 = xor i1 %320, true, !dbg !40 + %322 = and i1 %319, %321, !dbg !41 + %323 = or i1 %317, %322, !dbg !42 + %324 = and i1 %320, %319, !dbg !49 + %325 = or i1 %318, %324, !dbg !50 + %326 = icmp slt i32 %312, %316, !dbg !43 + %327 = and i1 %326, %325, !dbg !44 + %328 = or i1 %323, %327, !dbg !45 + %329 = select i1 %328, i32 %312, i32 %316, !dbg !47 + %330 = icmp eq i32 %8, 0, !dbg !51 + %331 = select i1 %328, i32 %313, i32 %314, !dbg !46 + %332 = insertelement <1 x i32> poison, i32 %331, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, <1 x i32> %332, i1 %330) #4, !dbg !51 + %333 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %333, i1 %330) #4, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %334 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !51 + %335 = zext nneg i32 %7 to i64, !dbg !52 + %336 = getelementptr i64, ptr addrspace(1) %1, i64 %335, !dbg !52 + %337 = sext i32 %334 to i64, !dbg !53 + %338 = and i32 %8, 511, !dbg !53 + %339 = icmp eq i32 %338, 0, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %337, ptr addrspace(1) %336, i1 %339) #4, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_0", linkageName: "triton_red_fused_argmax_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 37, column: 47, scope: !4) +!10 = !DILocation(line: 31, column: 40, scope: !4) +!11 = !DILocation(line: 32, column: 31, scope: !4) +!12 = !DILocation(line: 33, column: 29, scope: !4) +!13 = !DILocation(line: 37, column: 41, scope: !4) +!14 = !DILocation(line: 37, column: 34, scope: !4) +!15 = !DILocation(line: 37, column: 52, scope: !4) +!16 = !DILocation(line: 37, column: 106, scope: !4) +!17 = !DILocation(line: 144, column: 21, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 40, column: 38, scope: !4) +!21 = !DILocation(line: 145, column: 23, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 147, column: 29, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 148, column: 29, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 149, column: 31, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 149, column: 27, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 149, column: 16, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 151, column: 27, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 151, column: 17, scope: !18, inlinedAt: !20) +!29 = !DILocation(line: 154, column: 31, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 154, column: 21, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 154, column: 12, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 155, column: 35, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 155, column: 69, scope: !18, inlinedAt: !20) +!34 = !DILocation(line: 42, column: 46, scope: !4) +!35 = !DILocation(line: 43, column: 58, scope: !4) +!36 = !DILocation(line: 144, column: 21, scope: !18, inlinedAt: !37) +!37 = !DILocation(line: 44, column: 75, scope: !4) +!38 = !DILocation(line: 147, column: 29, scope: !18, inlinedAt: !37) +!39 = !DILocation(line: 148, column: 29, scope: !18, inlinedAt: !37) +!40 = !DILocation(line: 149, column: 31, scope: !18, inlinedAt: !37) +!41 = !DILocation(line: 149, column: 27, scope: !18, inlinedAt: !37) +!42 = !DILocation(line: 149, column: 16, scope: !18, inlinedAt: !37) +!43 = !DILocation(line: 154, column: 31, scope: !18, inlinedAt: !37) +!44 = !DILocation(line: 154, column: 21, scope: !18, inlinedAt: !37) +!45 = !DILocation(line: 154, column: 12, scope: !18, inlinedAt: !37) +!46 = !DILocation(line: 155, column: 35, scope: !18, inlinedAt: !37) +!47 = !DILocation(line: 155, column: 69, scope: !18, inlinedAt: !37) +!48 = !DILocation(line: 145, column: 23, scope: !18, inlinedAt: !37) +!49 = !DILocation(line: 151, column: 27, scope: !18, inlinedAt: !37) +!50 = !DILocation(line: 151, column: 17, scope: !18, inlinedAt: !37) +!51 = !DILocation(line: 165, column: 42, scope: !18, inlinedAt: !37) +!52 = !DILocation(line: 46, column: 25, scope: !4) +!53 = !DILocation(line: 46, column: 36, scope: !4) +!54 = !DILocation(line: 46, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2f5e1f75c3d1413fcea46a8c479bec22e9eb1547 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ptx @@ -0,0 +1,833 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_0 // -- Begin function triton_red_fused_argmax_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_0 +.visible .entry triton_red_fused_argmax_0( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_1, + .param .u32 triton_red_fused_argmax_0_param_2, + .param .u32 triton_red_fused_argmax_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<202>; + .reg .b16 %rs<5>; + .reg .b32 %r<114>; + .reg .b64 %rd<27>; + .loc 1 18 0 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_0_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 37 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r19, %r2, 2; + and.b32 %r20, %r19, 2044; + .loc 1 31 40 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:31:40 + cvt.u64.u32 %rd1, %r20; + mad.lo.s32 %r21, %r1, 32000, %r20; + cvt.u64.u32 %rd2, %r21; + mov.b32 %r110, 0fFF800000; + mov.b64 %rd26, {%r110, %r110}; + mov.b32 %r108, 2147483647; + mov.b64 %rd25, -2048; + mov.b32 %r109, %r108; + mov.b32 %r111, %r110; + mov.b32 %r112, %r108; + mov.b32 %r113, %r108; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 32 31 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:32:31 + add.s64 %rd14, %rd1, %rd25; + add.s64 %rd15, %rd14, 2048; + add.s64 %rd16, %rd14, 2050; + .loc 1 33 29 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:33:29 + add.s64 %rd17, %rd14, 2051; + setp.lt.u64 %p1, %rd15, 32000; + .loc 1 37 34 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:37:34 + add.s64 %rd18, %rd2, %rd25; + cvt.u32.u64 %r26, %rd18; + add.s32 %r27, %r26, 2048; + mad.wide.s32 %rd12, %r27, 2, %rd7; + .loc 1 37 52 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:37:52 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r24, 0; + // begin inline asm + mov.u32 %r22, %r24; + mov.u32 %r23, %r24; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r22, %r23 }, [ %rd12 + 0 ], %rd11; + // end inline asm + mov.b32 {%rs1, %rs2}, %r23; + .loc 1 37 106 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:37:106 + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.gt.f32 %p2, %r110, %r28; + setp.gt.f32 %p3, %r111, %r29; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.eq.f32 %p4, %r110, %r28; + setp.eq.f32 %p5, %r111, %r29; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + mov.b64 {%r30, %r31}, %rd26; + setp.nan.f32 %p6, %r30, %r30; + setp.nan.f32 %p7, %r31, %r31; + setp.nan.f32 %p8, %r110, %r110; + setp.nan.f32 %p9, %r111, %r111; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.nan.bf16 %p10, %rs1, %rs1; + setp.num.bf16 %p11, %rs1, %rs1; + setp.nan.bf16 %p12, %rs2, %rs2; + setp.num.bf16 %p13, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + cvt.s64.s32 %rd19, %r113; + cvt.s64.s32 %rd20, %r112; + cvt.s64.s32 %rd21, %r108; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r109; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p26, %p16, %p24; + or.pred %p27, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + selp.f32 %r32, %r110, %r28, %p26; + selp.f32 %r33, %r111, %r29, %p27; +$L__tmp2: + .loc 1 37 106 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:37:106 + mov.b32 {%rs3, %rs4}, %r22; + cvt.f32.bf16 %r34, %rs3; + cvt.f32.bf16 %r35, %rs4; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.gt.f32 %p28, %r31, %r35; + setp.gt.f32 %p29, %r30, %r34; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.eq.f32 %p30, %r30, %r34; + setp.eq.f32 %p31, %r31, %r35; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.nan.bf16x2 %p32|%p33, %r22, %r24; + setp.num.bf16x2 %p34|%p35, %r22, %r24; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p36, %p7, %p35; + and.pred %p37, %p6, %p34; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p38, %p29, %p37; + or.pred %p39, %p28, %p36; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p40, %p6, %p32; + and.pred %p41, %p7, %p33; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p42, %p31, %p41; + or.pred %p43, %p30, %p40; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + setp.gt.s64 %p44, %rd15, %rd20; + setp.ge.s64 %p45, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + and.pred %p46, %p44, %p43; + and.pred %p47, %p45, %p42; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + or.pred %p48, %p39, %p47; + or.pred %p49, %p38, %p46; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + selp.f32 %r36, %r30, %r34, %p49; + selp.f32 %r37, %r31, %r35, %p48; + cvt.u32.u64 %r38, %rd15; + cvt.u32.u64 %r39, %rd14; + add.s32 %r40, %r39, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:40:38 ] + selp.b32 %r41, %r112, %r38, %p49; + selp.b32 %r42, %r113, %r40, %p48; + cvt.u32.u64 %r43, %rd16; + selp.b32 %r44, %r108, %r43, %p26; + cvt.u32.u64 %r45, %rd17; + selp.b32 %r46, %r109, %r45, %p27; +$L__tmp4: + .loc 1 42 46 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:42:46 + selp.f32 %r47, %r37, %r31, %p1; + selp.f32 %r48, %r36, %r30, %p1; + mov.b64 %rd26, {%r48, %r47}; + selp.f32 %r110, %r32, %r110, %p1; + selp.f32 %r111, %r33, %r111, %p1; + .loc 1 43 58 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:43:58 + selp.b32 %r113, %r42, %r113, %p1; + selp.b32 %r112, %r41, %r112, %p1; + selp.b32 %r108, %r44, %r108, %p1; + selp.b32 %r109, %r46, %r109, %p1; + .loc 1 31 40 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:31:40 + add.s64 %rd25, %rd25, 2048; + setp.lt.u64 %p50, %rd25, 29952; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:26:37 + and.b32 %r61, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + mov.b64 {%r62, %r63}, %rd26; + setp.gt.f32 %p58, %r62, %r63; + setp.eq.f32 %p59, %r63, %r62; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p60, %r62, %r62; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.num.f32 %p61, %r63, %r63; + setp.nan.f32 %p62, %r63, %r63; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p63, %p60, %p62; + and.pred %p64, %p60, %p61; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p65, %p58, %p64; + or.pred %p66, %p59, %p63; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p67, %r112, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p68, %p67, %p66; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p69, %p65, %p68; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r64, %r62, %r63, %p69; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r65, %r112, %r113, %p69; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p70, %r64, %r110; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p71, %r64, %r110; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p72, %r64, %r64; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p73, %r110, %r110; + setp.num.f32 %p74, %r110, %r110; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p75, %p72, %p74; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p76, %p70, %p75; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p77, %p73, %p72; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p78, %p71, %p77; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p79, %r65, %r108; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p81, %p76, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r66, %r64, %r110, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r67, %r65, %r108, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p82, %r66, %r111; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p83, %r66, %r111; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p84, %r66, %r66; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p85, %r111, %r111; + setp.num.f32 %p86, %r111, %r111; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p91, %r67, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r68, %r66, %r111, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r69, %r67, %r109, %p93; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r70, %r68, 16, 31, -1; + shfl.sync.bfly.b32 %r71, %r69, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p94, %r68, %r70; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p95, %r68, %r70; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p96, %r68, %r68; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p97, %r70, %r70; + setp.num.f32 %p98, %r70, %r70; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p101, %p96, %p97; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p103, %r69, %r71; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r72, %r68, %r70, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r73, %r69, %r71, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r74, %r72, 8, 31, -1; + shfl.sync.bfly.b32 %r75, %r73, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p106, %r72, %r74; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p107, %r72, %r74; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p108, %r72, %r72; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p109, %r74, %r74; + setp.num.f32 %p110, %r74, %r74; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p113, %p109, %p108; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p115, %r73, %r75; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r76, %r72, %r74, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r77, %r73, %r75, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r78, %r76, 4, 31, -1; + shfl.sync.bfly.b32 %r79, %r77, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p118, %r76, %r78; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p119, %r76, %r78; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p120, %r76, %r76; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p121, %r78, %r78; + setp.num.f32 %p122, %r78, %r78; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p127, %r77, %r79; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r80, %r76, %r78, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r81, %r77, %r79, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r82, %r80, 2, 31, -1; + shfl.sync.bfly.b32 %r83, %r81, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p130, %r80, %r82; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p131, %r80, %r82; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p132, %r80, %r80; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p133, %r82, %r82; + setp.num.f32 %p134, %r82, %r82; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p139, %r81, %r83; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r84, %r80, %r82, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r85, %r81, %r83, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r86, %r84, 1, 31, -1; + shfl.sync.bfly.b32 %r87, %r85, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p142, %r84, %r86; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p143, %r84, %r86; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p144, %r84, %r84; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p145, %r86, %r86; + setp.num.f32 %p146, %r86, %r86; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p151, %r85, %r87; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r52, %r85, %r87, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.b32 %p51, %r61, 0; + shr.u32 %r88, %r2, 3; + and.b32 %r89, %r88, 60; + mov.b32 %r90, global_smem; + add.s32 %r49, %r90, %r89; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r50, %r84, %r86, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r49 + 0 ], %r50; + // end inline asm + add.s32 %r91, %r90, 64; + add.s32 %r51, %r91, %r89; + // begin inline asm + @%p51 st.shared.b32 [ %r51 + 0 ], %r52; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r54, %r90, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r53, [ %r54 + 0 ]; + // end inline asm + add.s32 %r56, %r91, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r55, [ %r56 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r93, %r53, 8, 31, -1; + shfl.sync.bfly.b32 %r94, %r55, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p154, %r53, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p155, %r53, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p156, %r53, %r53; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p157, %r93, %r93; + setp.num.f32 %p158, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p161, %p156, %p157; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p163, %r55, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r95, %r53, %r93, %p165; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r96, %r55, %r94, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r97, %r95, 4, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p166, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p167, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p168, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p169, %r97, %r97; + setp.num.f32 %p170, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p173, %p169, %p168; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p175, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r99, %r95, %r97, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r100, %r96, %r98, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r101, %r99, 2, 31, -1; + shfl.sync.bfly.b32 %r102, %r100, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p178, %r99, %r101; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p179, %r99, %r101; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p180, %r99, %r99; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p181, %r101, %r101; + setp.num.f32 %p182, %r101, %r101; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p187, %r100, %r102; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.f32 %r103, %r99, %r101, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r104, %r100, %r102, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + shfl.sync.bfly.b32 %r105, %r103, 1, 31, -1; + shfl.sync.bfly.b32 %r106, %r104, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.gt.f32 %p190, %r103, %r105; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.f32 %p191, %r103, %r105; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p192, %r103, %r103; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.nan.f32 %p193, %r105, %r105; + setp.num.f32 %p194, %r105, %r105; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.lt.s32 %p199, %r104, %r106; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r60, %r104, %r106, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + setp.eq.b32 %p55, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + selp.b32 %r58, %r103, %r105, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:44:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r54 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r56 + 0 ], %r60; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd23, [global_smem+64]; +$L__tmp6: + .loc 1 46 25 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:46:25 + mad.wide.u32 %rd24, %r1, 8, %rd8; + .loc 1 46 36 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:46:36 + and.b32 %r107, %r2, 511; + setp.eq.b32 %p57, %r107, 0; + // begin inline asm + @%p57 st.global.b64 [ %rd24 + 0 ], { %rd23 }; + // end inline asm + .loc 1 46 4 // cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py:46:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 120 +.b8 103 +.b8 97 +.b8 115 +.b8 50 +.b8 106 +.b8 116 +.b8 50 +.b8 101 +.b8 55 +.b8 102 +.b8 114 +.b8 119 +.b8 112 +.b8 109 +.b8 51 +.b8 110 +.b8 100 +.b8 53 +.b8 104 +.b8 55 +.b8 119 +.b8 108 +.b8 122 +.b8 100 +.b8 111 +.b8 50 +.b8 102 +.b8 98 +.b8 50 +.b8 121 +.b8 111 +.b8 110 +.b8 107 +.b8 118 +.b8 107 +.b8 102 +.b8 111 +.b8 97 +.b8 114 +.b8 112 +.b8 121 +.b8 97 +.b8 102 +.b8 105 +.b8 119 +.b8 111 +.b8 115 +.b8 103 +.b8 50 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 120 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.source new file mode 100644 index 0000000000000000000000000000000000000000..19a7cb2b1f76d162da982d96d5d5c7f898aaa6ec --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.source @@ -0,0 +1,291 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":18:0) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc42 = loc(unknown) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc67 = loc("in_ptr0"(#loc)) +#loc68 = loc("out_ptr0"(#loc)) +#loc69 = loc("xnumel"(#loc)) +#loc70 = loc("r0_numel"(#loc)) +#loc94 = loc("a_value"(#loc30)) +#loc95 = loc("a_index"(#loc30)) +#loc96 = loc("b_value"(#loc30)) +#loc97 = loc("b_index"(#loc30)) +#loc110 = loc("x"(#loc50)) +#loc111 = loc("x"(#loc54)) +#loc112 = loc("value"(#loc63)) +#loc113 = loc("index"(#loc63)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc71) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc72) + %xoffset = tt.get_program_id x : i32 loc(#loc73) + %xoffset_2 = arith.constant 1 : i32 loc(#loc74) + %xoffset_3 = arith.constant 1 : i32 loc(#loc74) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc74) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc75) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc76) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc77) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc77) + %xmask = arith.constant true loc(#loc78) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc78) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc79) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc80) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc81) + %_tmp2_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc81) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc82) + %_tmp2_index_11 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc82) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp2_index_12:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_13 = %_tmp2_10, %_tmp2_index_14 = %_tmp2_index_11) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84) + %r0_index_15 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc84) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc85) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x2048xi32> loc(#loc85) + %tmp0 = arith.constant 32000 : i32 loc(#loc86) + %tmp0_17 = arith.constant 32000 : i32 loc(#loc86) + %tmp0_18 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc86) + %tmp0_19 = arith.muli %tmp0_18, %xindex_7 : tensor<1x1xi32> loc(#loc86) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc87) + %tmp0_21 = arith.addi %r0_index_15, %tmp0_20 : tensor<1x2048xi32> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc88) + %tmp0_24 = arith.constant 0.000000e+00 : f32 loc(#loc89) + %tmp0_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc89) + %tmp0_26 = arith.truncf %tmp0_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc89) + %tmp0_27 = tt.load %tmp0_23, %r0_mask_16, %tmp0_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc89) + %tmp0_28 = arith.extf %tmp0_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc90) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_13, %_tmp2_index_14, %tmp0_28, %r0_index_15) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc21) + %_tmp2_29 = arith.select %r0_mask_16, %8#0, %_tmp2_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc91) + %_tmp2_index_30 = arith.select %r0_mask_16, %8#1, %_tmp2_index_14 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc92) + scf.yield %_tmp2_29, %_tmp2_index_30 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc24) + } loc(#loc114) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_12#0, %_tmp2_index_12#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc25) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc93) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %6 = tt.addptr %5, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc28) + tt.store %6, %7 : tensor<1x1x!tt.ptr> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc30)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc30)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc30)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc30))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc115) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc116) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc33) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc100) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc101) + %mask_3 = arith.constant true loc(#loc102) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc102) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc102) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc103) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc117) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc105) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc118) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc118) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc42) + } loc(#loc34) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc107) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc108) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc109) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc46) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc47) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc49) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc49) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc49) + } loc(#loc30) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc50))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc51) + %true = arith.constant true loc(#loc52) + tt.return %true : i1 loc(#loc52) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc53) + tt.return %1 : i1 loc(#loc53) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc54))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc55) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc56) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc56) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc56) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc56) + tt.return %4 : tensor<1x2048xf32> loc(#loc57) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc58) + tt.return %5 : tensor<1x2048xf32> loc(#loc58) + } loc(#loc54) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc60) + %cst = arith.constant dense : tensor<1xi1> loc(#loc60) + tt.return %cst : tensor<1xi1> loc(#loc61) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc62) + tt.return %0 : tensor<1xi1> loc(#loc62) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc63)), %index: tensor<1x2048xi32> loc("index"(#loc63))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc64) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc64) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc64) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc65) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc66) + %2 = ub.poison : tensor<1xi32> loc(#loc66) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc66) + } loc(#loc63) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc30)), %a_index: i32 loc("a_index"(#loc30)), %b_value: f32 loc("b_value"(#loc30)), %b_index: i32 loc("b_index"(#loc30))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc115) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc116) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc33) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc101) + %mask_3 = arith.constant true loc(#loc102) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc102) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc103) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc117) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc105) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc118) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc118) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc42) + } loc(#loc34) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc107) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc108) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc109) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc46) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc47) + tt.return %2, %3 : f32, i32 loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc49) + %5 = ub.poison : i32 loc(#loc49) + tt.return %4, %5 : f32, i32 loc(#loc49) + } loc(#loc30) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc50))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc51) + %true = arith.constant true loc(#loc52) + tt.return %true : i1 loc(#loc52) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc53) + tt.return %1 : i1 loc(#loc53) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc54))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc55) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc56) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc56) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc56) + tt.return %3 : tensor<1xf32> loc(#loc57) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc58) + tt.return %4 : tensor<1xf32> loc(#loc58) + } loc(#loc54) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":29:55) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":30:58) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:106) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":40:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":42:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:58) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":44:75) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":45:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:4) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc71 = loc("xnumel"(#loc1)) +#loc72 = loc("r0_numel"(#loc2)) +#loc73 = loc("xoffset"(#loc3)) +#loc74 = loc("xoffset"(#loc4)) +#loc75 = loc("xindex"(#loc5)) +#loc76 = loc("xindex"(#loc6)) +#loc77 = loc("xindex"(#loc7)) +#loc78 = loc("xmask"(#loc8)) +#loc79 = loc("r0_base"(#loc9)) +#loc80 = loc("r0_base"(#loc10)) +#loc81 = loc("_tmp2"(#loc11)) +#loc82 = loc("_tmp2_index"(#loc12)) +#loc83 = loc("_tmp2"(#loc13)) +#loc84 = loc("r0_index"(#loc14)) +#loc85 = loc("r0_mask"(#loc15)) +#loc86 = loc("tmp0"(#loc16)) +#loc87 = loc("tmp0"(#loc17)) +#loc88 = loc("tmp0"(#loc18)) +#loc89 = loc("tmp0"(#loc19)) +#loc90 = loc("tmp0"(#loc20)) +#loc91 = loc("_tmp2"(#loc22)) +#loc92 = loc("_tmp2_index"(#loc23)) +#loc93 = loc("tmp2"(#loc26)) +#loc98 = loc("mask"(#loc31)) +#loc99 = loc("equal"(#loc32)) +#loc100 = loc("a_isnan"(#loc35)) +#loc101 = loc("b_isnan"(#loc36)) +#loc102 = loc("mask"(#loc37)) +#loc103 = loc("mask"(#loc38)) +#loc104 = loc("mask"(#loc39)) +#loc105 = loc("equal"(#loc40)) +#loc106 = loc("equal"(#loc41)) +#loc107 = loc("mask"(#loc43)) +#loc108 = loc("mask"(#loc44)) +#loc109 = loc("mask"(#loc45)) +#loc114 = loc("_tmp2_index"(#loc83)) +#loc115 = loc("mask"(#loc98)) +#loc116 = loc("equal"(#loc99)) +#loc117 = loc("mask"(#loc104)) +#loc118 = loc("equal"(#loc106)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f40a3b7b62e40f58c3c3627e0b7940c40c4b8014 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttgir @@ -0,0 +1,173 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":18:0) +#loc1 = loc(unknown) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":44:75) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("out_ptr0"(#loc)) +#loc38 = loc("xnumel"(#loc)) +#loc39 = loc("r0_numel"(#loc)) +#loc67 = loc(callsite(#loc1 at #loc31)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc40) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc41) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc41) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc42) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc69) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc44) + %_tmp2_index:2 = scf.for %_tmp2_index_7 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_8 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_index_9 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_9, %cst : tensor<1x2048xi32, #blocked> loc(#loc47) + %tmp0_10 = arith.addi %r0_index_9, %tmp0_5 : tensor<1x2048xi32, #blocked> loc(#loc43) + %tmp0_11 = tt.addptr %tmp0_6, %tmp0_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc44) + %tmp0_12 = tt.load %tmp0_11, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc48) + %tmp0_13 = arith.extf %tmp0_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc49) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc93) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc94) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc73) + %b_isnan = arith.cmpf une, %tmp0_13, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc74) + %mask_14 = arith.xori %b_isnan, %cst_1 : tensor<1x2048xi1, #blocked> loc(#loc75) + %mask_15 = arith.andi %a_isnan, %mask_14 : tensor<1x2048xi1, #blocked> loc(#loc76) + %mask_16 = arith.ori %mask, %mask_15 : tensor<1x2048xi1, #blocked> loc(#loc95) + %equal_17 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc78) + %equal_18 = arith.ori %equal, %equal_17 : tensor<1x2048xi1, #blocked> loc(#loc96) + %mask_19 = arith.cmpi slt, %_tmp2_index_8, %r0_index_9 : tensor<1x2048xi32, #blocked> loc(#loc80) + %mask_20 = arith.andi %equal_18, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc81) + %mask_21 = arith.ori %mask_16, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc82) + %5 = arith.select %mask_21, %_tmp2, %tmp0_13 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc62) + %6 = arith.select %mask_21, %_tmp2_index_8, %r0_index_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc63) + %_tmp2_22 = arith.select %r0_mask, %5, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc64) + %_tmp2_index_23 = arith.select %r0_mask, %6, %_tmp2_index_8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + scf.yield %_tmp2_22, %_tmp2_index_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc29) + } loc(#loc70) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc31)), %arg5: i32 loc(callsite(#loc1 at #loc31)), %arg6: f32 loc(callsite(#loc1 at #loc31)), %arg7: i32 loc(callsite(#loc1 at #loc31))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc97) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc98) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc83) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc84) + %mask_7 = arith.xori %b_isnan, %true : i1 loc(#loc85) + %mask_8 = arith.andi %a_isnan, %mask_7 : i1 loc(#loc86) + %mask_9 = arith.ori %mask, %mask_8 : i1 loc(#loc99) + %equal_10 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc87) + %equal_11 = arith.ori %equal, %equal_10 : i1 loc(#loc100) + %mask_12 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc88) + %mask_13 = arith.andi %equal_11, %mask_12 : i1 loc(#loc89) + %mask_14 = arith.ori %mask_9, %mask_13 : i1 loc(#loc90) + %5 = arith.select %mask_14, %arg4, %arg6 : f32 loc(#loc91) + %6 = arith.select %mask_14, %arg5, %arg7 : i32 loc(#loc92) + tt.reduce.return %5, %6 : f32, i32 loc(#loc66) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc66) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc68) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc33) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc34) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc34) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc34) + tt.store %4, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:106) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":40:38) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":42:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:58) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:8) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":45:20) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:4) +#loc40 = loc("xoffset"(#loc2)) +#loc41 = loc("r0_base"(#loc3)) +#loc42 = loc("tmp0"(#loc4)) +#loc43 = loc("tmp0"(#loc5)) +#loc44 = loc("tmp0"(#loc6)) +#loc45 = loc("_tmp2"(#loc7)) +#loc46 = loc("r0_index"(#loc8)) +#loc47 = loc("r0_mask"(#loc9)) +#loc48 = loc("tmp0"(#loc10)) +#loc49 = loc("tmp0"(#loc11)) +#loc50 = loc("mask"(#loc12)) +#loc51 = loc("equal"(#loc14)) +#loc52 = loc("a_isnan"(#loc15)) +#loc53 = loc("b_isnan"(#loc16)) +#loc54 = loc("mask"(#loc17)) +#loc55 = loc("mask"(#loc18)) +#loc56 = loc("mask"(#loc19)) +#loc57 = loc("equal"(#loc20)) +#loc58 = loc("equal"(#loc21)) +#loc59 = loc("mask"(#loc22)) +#loc60 = loc("mask"(#loc23)) +#loc61 = loc("mask"(#loc24)) +#loc62 = loc(callsite(#loc25 at #loc13)) +#loc63 = loc(callsite(#loc26 at #loc13)) +#loc64 = loc("_tmp2"(#loc27)) +#loc65 = loc("_tmp2_index"(#loc28)) +#loc66 = loc(callsite(#loc30 at #loc31)) +#loc68 = loc("tmp2"(#loc32)) +#loc69 = loc(fused[#loc43, #loc42]) +#loc70 = loc("_tmp2_index"(#loc45)) +#loc71 = loc("mask"(#loc50)) +#loc72 = loc("equal"(#loc51)) +#loc73 = loc(callsite(#loc52 at #loc13)) +#loc74 = loc(callsite(#loc53 at #loc13)) +#loc75 = loc(callsite(#loc54 at #loc13)) +#loc76 = loc(callsite(#loc55 at #loc13)) +#loc77 = loc("mask"(#loc56)) +#loc78 = loc(callsite(#loc57 at #loc13)) +#loc79 = loc("equal"(#loc58)) +#loc80 = loc(callsite(#loc59 at #loc13)) +#loc81 = loc(callsite(#loc60 at #loc13)) +#loc82 = loc(callsite(#loc61 at #loc13)) +#loc83 = loc(callsite(#loc52 at #loc66)) +#loc84 = loc(callsite(#loc53 at #loc66)) +#loc85 = loc(callsite(#loc54 at #loc66)) +#loc86 = loc(callsite(#loc55 at #loc66)) +#loc87 = loc(callsite(#loc57 at #loc66)) +#loc88 = loc(callsite(#loc59 at #loc66)) +#loc89 = loc(callsite(#loc60 at #loc66)) +#loc90 = loc(callsite(#loc61 at #loc66)) +#loc91 = loc(callsite(#loc25 at #loc66)) +#loc92 = loc(callsite(#loc26 at #loc66)) +#loc93 = loc(callsite(#loc71 at #loc13)) +#loc94 = loc(callsite(#loc72 at #loc13)) +#loc95 = loc(callsite(#loc77 at #loc13)) +#loc96 = loc(callsite(#loc79 at #loc13)) +#loc97 = loc(callsite(#loc71 at #loc66)) +#loc98 = loc(callsite(#loc72 at #loc66)) +#loc99 = loc(callsite(#loc77 at #loc66)) +#loc100 = loc(callsite(#loc79 at #loc66)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..70d810484bdbbcccc425b13e713f58b4931fb4df --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/LEHCFAEILV2VVTHQWP55INWAP7WMDYDQAVN7JR5J546EKZSCPVWA/triton_red_fused_argmax_0.ttir @@ -0,0 +1,176 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":44:75) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc43 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc43) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc44) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc45) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc47) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc48) + %_tmp2_index_3:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_4 = %_tmp2, %_tmp2_index_5 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc50) + %r0_index_6 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc50) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst_1 : tensor<1x2048xi32> loc(#loc51) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc52) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc76) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc53) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc54) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc54) + %tmp0_11 = tt.load %tmp0_10, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc55) + %tmp0_12 = arith.extf %tmp0_11 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc56) + %mask = arith.cmpf ogt, %_tmp2_4, %tmp0_12 : tensor<1x2048xf32> loc(#loc99) + %equal = arith.cmpf oeq, %_tmp2_4, %tmp0_12 : tensor<1x2048xf32> loc(#loc100) + %a_isnan = arith.cmpf une, %_tmp2_4, %_tmp2_4 : tensor<1x2048xf32> loc(#loc79) + %b_isnan = arith.cmpf une, %tmp0_12, %tmp0_12 : tensor<1x2048xf32> loc(#loc80) + %mask_13 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc81) + %mask_14 = arith.andi %a_isnan, %mask_13 : tensor<1x2048xi1> loc(#loc82) + %mask_15 = arith.ori %mask, %mask_14 : tensor<1x2048xi1> loc(#loc101) + %equal_16 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc84) + %equal_17 = arith.ori %equal, %equal_16 : tensor<1x2048xi1> loc(#loc102) + %mask_18 = arith.cmpi slt, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi32> loc(#loc86) + %mask_19 = arith.andi %equal_17, %mask_18 : tensor<1x2048xi1> loc(#loc87) + %mask_20 = arith.ori %mask_15, %mask_19 : tensor<1x2048xi1> loc(#loc88) + %4 = arith.select %mask_20, %_tmp2_4, %tmp0_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc69) + %5 = arith.select %mask_20, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc70) + %_tmp2_21 = arith.select %r0_mask, %4, %_tmp2_4 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc71) + %_tmp2_index_22 = arith.select %r0_mask, %5, %_tmp2_index_5 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc72) + scf.yield %_tmp2_21, %_tmp2_index_22 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc33) + } loc(#loc75) + %0:2 = "tt.reduce"(%_tmp2_index_3#0, %_tmp2_index_3#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc103) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc104) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc89) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc90) + %mask_4 = arith.xori %b_isnan, %true : i1 loc(#loc91) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc92) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc105) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc93) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc106) + %mask_9 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc94) + %mask_10 = arith.andi %equal_8, %mask_9 : i1 loc(#loc95) + %mask_11 = arith.ori %mask_6, %mask_10 : i1 loc(#loc96) + %4 = arith.select %mask_11, %arg4, %arg6 : f32 loc(#loc97) + %5 = arith.select %mask_11, %arg5, %arg7 : i32 loc(#loc98) + tt.reduce.return %4, %5 : f32, i32 loc(#loc73) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc73) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc74) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc36) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc36) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc37) + tt.store %2, %3 : tensor<1x1x!tt.ptr> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":31:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":32:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":33:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:47) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:52) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":37:106) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":40:38) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":42:46) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:58) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":43:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":45:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/xg/cxgas2jt2e7frwpm3nd5h7wlzdo2fb2yonkvkfoarpyafiwosg23.py":46:4) +#loc44 = loc("_tmp2_index"(#loc4)) +#loc45 = loc("_tmp2"(#loc5)) +#loc46 = loc("xoffset"(#loc6)) +#loc47 = loc("r0_base"(#loc7)) +#loc48 = loc("r0_base"(#loc8)) +#loc49 = loc("_tmp2"(#loc3)) +#loc50 = loc("r0_index"(#loc9)) +#loc51 = loc("r0_mask"(#loc10)) +#loc52 = loc("tmp0"(#loc11)) +#loc53 = loc("tmp0"(#loc12)) +#loc54 = loc("tmp0"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("mask"(#loc16)) +#loc58 = loc("equal"(#loc18)) +#loc59 = loc("a_isnan"(#loc19)) +#loc60 = loc("b_isnan"(#loc20)) +#loc61 = loc("mask"(#loc21)) +#loc62 = loc("mask"(#loc22)) +#loc63 = loc("mask"(#loc23)) +#loc64 = loc("equal"(#loc24)) +#loc65 = loc("equal"(#loc25)) +#loc66 = loc("mask"(#loc26)) +#loc67 = loc("mask"(#loc27)) +#loc68 = loc("mask"(#loc28)) +#loc69 = loc(callsite(#loc29 at #loc17)) +#loc70 = loc(callsite(#loc30 at #loc17)) +#loc71 = loc("_tmp2"(#loc31)) +#loc72 = loc("_tmp2_index"(#loc32)) +#loc73 = loc(callsite(#loc34 at #loc2)) +#loc74 = loc("tmp2"(#loc35)) +#loc75 = loc("_tmp2_index"(#loc49)) +#loc76 = loc(fused[#loc53, #loc52]) +#loc77 = loc("mask"(#loc57)) +#loc78 = loc("equal"(#loc58)) +#loc79 = loc(callsite(#loc59 at #loc17)) +#loc80 = loc(callsite(#loc60 at #loc17)) +#loc81 = loc(callsite(#loc61 at #loc17)) +#loc82 = loc(callsite(#loc62 at #loc17)) +#loc83 = loc("mask"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc17)) +#loc85 = loc("equal"(#loc65)) +#loc86 = loc(callsite(#loc66 at #loc17)) +#loc87 = loc(callsite(#loc67 at #loc17)) +#loc88 = loc(callsite(#loc68 at #loc17)) +#loc89 = loc(callsite(#loc59 at #loc73)) +#loc90 = loc(callsite(#loc60 at #loc73)) +#loc91 = loc(callsite(#loc61 at #loc73)) +#loc92 = loc(callsite(#loc62 at #loc73)) +#loc93 = loc(callsite(#loc64 at #loc73)) +#loc94 = loc(callsite(#loc66 at #loc73)) +#loc95 = loc(callsite(#loc67 at #loc73)) +#loc96 = loc(callsite(#loc68 at #loc73)) +#loc97 = loc(callsite(#loc29 at #loc73)) +#loc98 = loc(callsite(#loc30 at #loc73)) +#loc99 = loc(callsite(#loc77 at #loc17)) +#loc100 = loc(callsite(#loc78 at #loc17)) +#loc101 = loc(callsite(#loc83 at #loc17)) +#loc102 = loc(callsite(#loc85 at #loc17)) +#loc103 = loc(callsite(#loc77 at #loc73)) +#loc104 = loc(callsite(#loc78 at #loc73)) +#loc105 = loc(callsite(#loc83 at #loc73)) +#loc106 = loc(callsite(#loc85 at #loc73)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0dce87df4b81d001d155f03dd6c8737e9dbd5ba4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e53839dab4f6fb2ee177174b98104b83e4b99420 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..18b770ce0b152438bfb86c764a998fe97f538f56 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"hash": "646ba622f200565dee7f9c59d2d299c463a3d71c6cae1d84111435950aebd579", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..60aaafcc3378fe6276b736aaa5c6fc7f9cf67568 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir @@ -0,0 +1,808 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_2 = internal constant [8 x i8] c"unknown\00" +@assertFile_2 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py\00" +@assertMessage_2 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp36 < ks3\00" +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py\00" +@assertMessage_1 = internal constant [65 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py\00" +@assertMessage_0 = internal constant [64 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #1 !dbg !9 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %14 = shl i32 %13, 10, !dbg !11 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %16 = shl nuw nsw i32 %15, 3, !dbg !12 + %17 = and i32 %16, 1016, !dbg !12 + %18 = insertelement <8 x i32> poison, i32 %9, i64 0, !dbg !13 + %19 = shufflevector <8 x i32> %18, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !13 + %20 = or disjoint i32 %17, %14, !dbg !14 + %21 = or disjoint i32 %20, 1, !dbg !14 + %22 = or disjoint i32 %20, 2, !dbg !14 + %23 = or disjoint i32 %20, 3, !dbg !14 + %24 = or disjoint i32 %20, 4, !dbg !14 + %25 = or disjoint i32 %20, 5, !dbg !14 + %26 = or disjoint i32 %20, 6, !dbg !14 + %27 = or disjoint i32 %20, 7, !dbg !14 + %28 = insertelement <8 x i32> poison, i32 %20, i64 0, !dbg !13 + %29 = insertelement <8 x i32> %28, i32 %21, i64 1, !dbg !13 + %30 = insertelement <8 x i32> %29, i32 %22, i64 2, !dbg !13 + %31 = insertelement <8 x i32> %30, i32 %23, i64 3, !dbg !13 + %32 = insertelement <8 x i32> %31, i32 %24, i64 4, !dbg !13 + %33 = insertelement <8 x i32> %32, i32 %25, i64 5, !dbg !13 + %34 = insertelement <8 x i32> %33, i32 %26, i64 6, !dbg !13 + %35 = insertelement <8 x i32> %34, i32 %27, i64 7, !dbg !13 + %36 = icmp slt <8 x i32> %35, %19, !dbg !13 + %37 = sext <8 x i32> %35 to <8 x i64>, !dbg !15 + %38 = insertelement <8 x i64> poison, i64 %5, i64 0, !dbg !15 + %39 = shufflevector <8 x i64> %38, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !15 + %40 = srem <8 x i64> %37, %39, !dbg !15 + %41 = extractelement <8 x i64> %37, i64 0, !dbg !16 + %42 = sdiv i64 %41, %5, !dbg !17 + %43 = extractelement <8 x i64> %37, i64 1, !dbg !16 + %44 = sdiv i64 %43, %5, !dbg !17 + %45 = extractelement <8 x i64> %37, i64 2, !dbg !16 + %46 = sdiv i64 %45, %5, !dbg !17 + %47 = extractelement <8 x i64> %37, i64 3, !dbg !16 + %48 = sdiv i64 %47, %5, !dbg !17 + %49 = extractelement <8 x i64> %37, i64 4, !dbg !16 + %50 = sdiv i64 %49, %5, !dbg !17 + %51 = extractelement <8 x i64> %37, i64 5, !dbg !16 + %52 = sdiv i64 %51, %5, !dbg !17 + %53 = extractelement <8 x i64> %37, i64 6, !dbg !16 + %54 = sdiv i64 %53, %5, !dbg !17 + %55 = extractelement <8 x i64> %37, i64 7, !dbg !16 + %56 = sdiv i64 %55, %5, !dbg !17 + %57 = srem i64 %42, %6, !dbg !18 + %58 = srem i64 %44, %6, !dbg !18 + %59 = srem i64 %46, %6, !dbg !18 + %60 = srem i64 %48, %6, !dbg !18 + %61 = srem i64 %50, %6, !dbg !18 + %62 = srem i64 %52, %6, !dbg !18 + %63 = srem i64 %54, %6, !dbg !18 + %64 = srem i64 %56, %6, !dbg !18 + %65 = getelementptr bfloat, ptr addrspace(1) %0, i64 %41, !dbg !16 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %43, !dbg !16 + %67 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !16 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %47, !dbg !16 + %69 = getelementptr bfloat, ptr addrspace(1) %0, i64 %49, !dbg !16 + %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %51, !dbg !16 + %71 = getelementptr bfloat, ptr addrspace(1) %0, i64 %53, !dbg !16 + %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %55, !dbg !16 + %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %74 = extractelement <8 x i1> %36, i64 0, !dbg !19 + %75 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %65, i64 %73, i1 %74) #4, !dbg !19 + %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %77 = extractelement <8 x i1> %36, i64 1, !dbg !19 + %78 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %66, i64 %76, i1 %77) #4, !dbg !19 + %79 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %80 = extractelement <8 x i1> %36, i64 2, !dbg !19 + %81 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %67, i64 %79, i1 %80) #4, !dbg !19 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %83 = extractelement <8 x i1> %36, i64 3, !dbg !19 + %84 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %68, i64 %82, i1 %83) #4, !dbg !19 + %85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %86 = extractelement <8 x i1> %36, i64 4, !dbg !19 + %87 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %69, i64 %85, i1 %86) #4, !dbg !19 + %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %89 = extractelement <8 x i1> %36, i64 5, !dbg !19 + %90 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %70, i64 %88, i1 %89) #4, !dbg !19 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %92 = extractelement <8 x i1> %36, i64 6, !dbg !19 + %93 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %71, i64 %91, i1 %92) #4, !dbg !19 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %95 = extractelement <8 x i1> %36, i64 7, !dbg !19 + %96 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %72, i64 %94, i1 %95) #4, !dbg !19 + %97 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !20 + %98 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !20 + %99 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !20 + %100 = getelementptr i64, ptr addrspace(1) %1, i64 %60, !dbg !20 + %101 = getelementptr i64, ptr addrspace(1) %1, i64 %61, !dbg !20 + %102 = getelementptr i64, ptr addrspace(1) %1, i64 %62, !dbg !20 + %103 = getelementptr i64, ptr addrspace(1) %1, i64 %63, !dbg !20 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %64, !dbg !20 + %105 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %105, i1 %74) #4, !dbg !21 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %107, i1 %77) #4, !dbg !21 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %110 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %109, i1 %80) #4, !dbg !21 + %111 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %111, i1 %83) #4, !dbg !21 + %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %114 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %113, i1 %86) #4, !dbg !21 + %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %116 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %115, i1 %89) #4, !dbg !21 + %117 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %118 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %117, i1 %92) #4, !dbg !21 + %119 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %120 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %119, i1 %95) #4, !dbg !21 + %121 = sdiv i64 %5, 2, !dbg !22 + %122 = sub nsw i64 %41, %121, !dbg !23 + %123 = sub nsw i64 %43, %121, !dbg !23 + %124 = sub nsw i64 %45, %121, !dbg !23 + %125 = sub nsw i64 %47, %121, !dbg !23 + %126 = sub nsw i64 %49, %121, !dbg !23 + %127 = sub nsw i64 %51, %121, !dbg !23 + %128 = sub nsw i64 %53, %121, !dbg !23 + %129 = sub nsw i64 %55, %121, !dbg !23 + %130 = getelementptr bfloat, ptr addrspace(1) %0, i64 %122, !dbg !24 + %131 = getelementptr bfloat, ptr addrspace(1) %0, i64 %123, !dbg !24 + %132 = getelementptr bfloat, ptr addrspace(1) %0, i64 %124, !dbg !24 + %133 = getelementptr bfloat, ptr addrspace(1) %0, i64 %125, !dbg !24 + %134 = getelementptr bfloat, ptr addrspace(1) %0, i64 %126, !dbg !24 + %135 = getelementptr bfloat, ptr addrspace(1) %0, i64 %127, !dbg !24 + %136 = getelementptr bfloat, ptr addrspace(1) %0, i64 %128, !dbg !24 + %137 = getelementptr bfloat, ptr addrspace(1) %0, i64 %129, !dbg !24 + %138 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %139 = insertelement <8 x i64> poison, i64 %121, i64 0, !dbg !26 + %140 = shufflevector <8 x i64> %139, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !26 + %141 = icmp sge <8 x i64> %40, %140, !dbg !26 + %142 = and <8 x i1> %36, %141, !dbg !27 + %143 = extractelement <8 x i1> %142, i64 0, !dbg !28 + %144 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %130, i64 %138, i1 %143) #4, !dbg !25 + %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %146 = extractelement <8 x i1> %142, i64 1, !dbg !28 + %147 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %131, i64 %145, i1 %146) #4, !dbg !25 + %148 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %149 = extractelement <8 x i1> %142, i64 2, !dbg !28 + %150 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %132, i64 %148, i1 %149) #4, !dbg !25 + %151 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %152 = extractelement <8 x i1> %142, i64 3, !dbg !28 + %153 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %133, i64 %151, i1 %152) #4, !dbg !25 + %154 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %155 = extractelement <8 x i1> %142, i64 4, !dbg !28 + %156 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %134, i64 %154, i1 %155) #4, !dbg !25 + %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %158 = extractelement <8 x i1> %142, i64 5, !dbg !28 + %159 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %135, i64 %157, i1 %158) #4, !dbg !25 + %160 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %161 = extractelement <8 x i1> %142, i64 6, !dbg !28 + %162 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %136, i64 %160, i1 %161) #4, !dbg !25 + %163 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %164 = extractelement <8 x i1> %142, i64 7, !dbg !28 + %165 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %137, i64 %163, i1 %164) #4, !dbg !25 + %166 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %167 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %166, i1 %143) #4, !dbg !28 + %168 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %169 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %168, i1 %146) #4, !dbg !28 + %170 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %171 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %170, i1 %149) #4, !dbg !28 + %172 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %173 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %172, i1 %152) #4, !dbg !28 + %174 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %175 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %174, i1 %155) #4, !dbg !28 + %176 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %177 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %176, i1 %158) #4, !dbg !28 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %179 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %178, i1 %161) #4, !dbg !28 + %180 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %180, i1 %164) #4, !dbg !28 + %182 = insertelement <8 x i64> poison, i64 %167, i64 0, !dbg !29 + %183 = insertelement <8 x i64> %182, i64 %169, i64 1, !dbg !29 + %184 = insertelement <8 x i64> %183, i64 %171, i64 2, !dbg !29 + %185 = insertelement <8 x i64> %184, i64 %173, i64 3, !dbg !29 + %186 = insertelement <8 x i64> %185, i64 %175, i64 4, !dbg !29 + %187 = insertelement <8 x i64> %186, i64 %177, i64 5, !dbg !29 + %188 = insertelement <8 x i64> %187, i64 %179, i64 6, !dbg !29 + %189 = insertelement <8 x i64> %188, i64 %181, i64 7, !dbg !29 + %190 = icmp slt <8 x i64> %189, zeroinitializer, !dbg !29 + %191 = insertelement <8 x i64> poison, i64 %7, i64 0, !dbg !30 + %192 = shufflevector <8 x i64> %191, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !30 + %193 = select <8 x i1> %190, <8 x i64> %192, <8 x i64> zeroinitializer, !dbg !30 + %194 = add <8 x i64> %193, %189, !dbg !30 + %195 = icmp slt <8 x i64> %194, zeroinitializer, !dbg !31 + %196 = icmp sge <8 x i64> %194, %192, !dbg !32 + %197 = or <8 x i1> %195, %196, !dbg !33 + %198 = and <8 x i1> %142, %197, !dbg !34 + %199 = bitcast <8 x i1> %198 to i8, !dbg !35 + %.not = icmp eq i8 %199, 0, !dbg !35 + br i1 %.not, label %201, label %200, !dbg !35 + +200: ; preds = %12 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 36, ptr nonnull @assertFunc_0, i64 1), !dbg !35 + unreachable, !dbg !35 + +201: ; preds = %12 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !35 + %202 = extractelement <8 x i64> %40, i64 0, !dbg !36 + %203 = sub nsw i64 %202, %121, !dbg !36 + %204 = extractelement <8 x i64> %40, i64 1, !dbg !36 + %205 = sub nsw i64 %204, %121, !dbg !36 + %206 = extractelement <8 x i64> %40, i64 2, !dbg !36 + %207 = sub nsw i64 %206, %121, !dbg !36 + %208 = extractelement <8 x i64> %40, i64 3, !dbg !36 + %209 = sub nsw i64 %208, %121, !dbg !36 + %210 = extractelement <8 x i64> %40, i64 4, !dbg !36 + %211 = sub nsw i64 %210, %121, !dbg !36 + %212 = extractelement <8 x i64> %40, i64 5, !dbg !36 + %213 = sub nsw i64 %212, %121, !dbg !36 + %214 = extractelement <8 x i64> %40, i64 6, !dbg !36 + %215 = sub nsw i64 %214, %121, !dbg !36 + %216 = extractelement <8 x i64> %40, i64 7, !dbg !36 + %217 = sub nsw i64 %216, %121, !dbg !36 + %218 = extractelement <8 x i64> %194, i64 0, !dbg !37 + %219 = mul i64 %218, %5, !dbg !37 + %220 = extractelement <8 x i64> %194, i64 1, !dbg !37 + %221 = mul i64 %220, %5, !dbg !37 + %222 = extractelement <8 x i64> %194, i64 2, !dbg !37 + %223 = mul i64 %222, %5, !dbg !37 + %224 = extractelement <8 x i64> %194, i64 3, !dbg !37 + %225 = mul i64 %224, %5, !dbg !37 + %226 = extractelement <8 x i64> %194, i64 4, !dbg !37 + %227 = mul i64 %226, %5, !dbg !37 + %228 = extractelement <8 x i64> %194, i64 5, !dbg !37 + %229 = mul i64 %228, %5, !dbg !37 + %230 = extractelement <8 x i64> %194, i64 6, !dbg !37 + %231 = mul i64 %230, %5, !dbg !37 + %232 = extractelement <8 x i64> %194, i64 7, !dbg !37 + %233 = mul i64 %232, %5, !dbg !37 + %234 = getelementptr bfloat, ptr addrspace(1) %2, i64 %203, !dbg !38 + %235 = getelementptr bfloat, ptr addrspace(1) %234, i64 %219, !dbg !38 + %236 = getelementptr bfloat, ptr addrspace(1) %2, i64 %205, !dbg !38 + %237 = getelementptr bfloat, ptr addrspace(1) %236, i64 %221, !dbg !38 + %238 = getelementptr bfloat, ptr addrspace(1) %2, i64 %207, !dbg !38 + %239 = getelementptr bfloat, ptr addrspace(1) %238, i64 %223, !dbg !38 + %240 = getelementptr bfloat, ptr addrspace(1) %2, i64 %209, !dbg !38 + %241 = getelementptr bfloat, ptr addrspace(1) %240, i64 %225, !dbg !38 + %242 = getelementptr bfloat, ptr addrspace(1) %2, i64 %211, !dbg !38 + %243 = getelementptr bfloat, ptr addrspace(1) %242, i64 %227, !dbg !38 + %244 = getelementptr bfloat, ptr addrspace(1) %2, i64 %213, !dbg !38 + %245 = getelementptr bfloat, ptr addrspace(1) %244, i64 %229, !dbg !38 + %246 = getelementptr bfloat, ptr addrspace(1) %2, i64 %215, !dbg !38 + %247 = getelementptr bfloat, ptr addrspace(1) %246, i64 %231, !dbg !38 + %248 = getelementptr bfloat, ptr addrspace(1) %2, i64 %217, !dbg !38 + %249 = getelementptr bfloat, ptr addrspace(1) %248, i64 %233, !dbg !38 + %250 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %251 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %250, i1 %143) #4, !dbg !39 + %252 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %253 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %237, i64 %252, i1 %146) #4, !dbg !39 + %254 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %255 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %239, i64 %254, i1 %149) #4, !dbg !39 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %256, i1 %152) #4, !dbg !39 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %243, i64 %258, i1 %155) #4, !dbg !39 + %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %245, i64 %260, i1 %158) #4, !dbg !39 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %262, i1 %161) #4, !dbg !39 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %249, i64 %264, i1 %164) #4, !dbg !39 + %266 = icmp slt <8 x i64> %40, %140, !dbg !40 + %267 = add i64 %5, %41, !dbg !41 + %268 = add i64 %5, %43, !dbg !41 + %269 = add i64 %5, %45, !dbg !41 + %270 = add i64 %5, %47, !dbg !41 + %271 = add i64 %5, %49, !dbg !41 + %272 = add i64 %5, %51, !dbg !41 + %273 = add i64 %5, %53, !dbg !41 + %274 = add i64 %5, %55, !dbg !41 + %275 = sub i64 %267, %121, !dbg !42 + %276 = sub i64 %268, %121, !dbg !42 + %277 = sub i64 %269, %121, !dbg !42 + %278 = sub i64 %270, %121, !dbg !42 + %279 = sub i64 %271, %121, !dbg !42 + %280 = sub i64 %272, %121, !dbg !42 + %281 = sub i64 %273, %121, !dbg !42 + %282 = sub i64 %274, %121, !dbg !42 + %283 = getelementptr bfloat, ptr addrspace(1) %0, i64 %275, !dbg !43 + %284 = getelementptr bfloat, ptr addrspace(1) %0, i64 %276, !dbg !43 + %285 = getelementptr bfloat, ptr addrspace(1) %0, i64 %277, !dbg !43 + %286 = getelementptr bfloat, ptr addrspace(1) %0, i64 %278, !dbg !43 + %287 = getelementptr bfloat, ptr addrspace(1) %0, i64 %279, !dbg !43 + %288 = getelementptr bfloat, ptr addrspace(1) %0, i64 %280, !dbg !43 + %289 = getelementptr bfloat, ptr addrspace(1) %0, i64 %281, !dbg !43 + %290 = getelementptr bfloat, ptr addrspace(1) %0, i64 %282, !dbg !43 + %291 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %292 = and <8 x i1> %36, %266, !dbg !45 + %293 = extractelement <8 x i1> %292, i64 0, !dbg !46 + %294 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %283, i64 %291, i1 %293) #4, !dbg !44 + %295 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %296 = extractelement <8 x i1> %292, i64 1, !dbg !46 + %297 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %295, i1 %296) #4, !dbg !44 + %298 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %299 = extractelement <8 x i1> %292, i64 2, !dbg !46 + %300 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %285, i64 %298, i1 %299) #4, !dbg !44 + %301 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %302 = extractelement <8 x i1> %292, i64 3, !dbg !46 + %303 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %301, i1 %302) #4, !dbg !44 + %304 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %305 = extractelement <8 x i1> %292, i64 4, !dbg !46 + %306 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %287, i64 %304, i1 %305) #4, !dbg !44 + %307 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %308 = extractelement <8 x i1> %292, i64 5, !dbg !46 + %309 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %307, i1 %308) #4, !dbg !44 + %310 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %311 = extractelement <8 x i1> %292, i64 6, !dbg !46 + %312 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %289, i64 %310, i1 %311) #4, !dbg !44 + %313 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %314 = extractelement <8 x i1> %292, i64 7, !dbg !46 + %315 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %313, i1 %314) #4, !dbg !44 + %316 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %317 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %97, i64 %316, i1 %293) #4, !dbg !46 + %318 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %98, i64 %318, i1 %296) #4, !dbg !46 + %320 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %99, i64 %320, i1 %299) #4, !dbg !46 + %322 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %100, i64 %322, i1 %302) #4, !dbg !46 + %324 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %101, i64 %324, i1 %305) #4, !dbg !46 + %326 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %327 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %102, i64 %326, i1 %308) #4, !dbg !46 + %328 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %103, i64 %328, i1 %311) #4, !dbg !46 + %330 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %331 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %330, i1 %314) #4, !dbg !46 + %332 = insertelement <8 x i64> poison, i64 %317, i64 0, !dbg !47 + %333 = insertelement <8 x i64> %332, i64 %319, i64 1, !dbg !47 + %334 = insertelement <8 x i64> %333, i64 %321, i64 2, !dbg !47 + %335 = insertelement <8 x i64> %334, i64 %323, i64 3, !dbg !47 + %336 = insertelement <8 x i64> %335, i64 %325, i64 4, !dbg !47 + %337 = insertelement <8 x i64> %336, i64 %327, i64 5, !dbg !47 + %338 = insertelement <8 x i64> %337, i64 %329, i64 6, !dbg !47 + %339 = insertelement <8 x i64> %338, i64 %331, i64 7, !dbg !47 + %340 = icmp slt <8 x i64> %339, zeroinitializer, !dbg !47 + %341 = select <8 x i1> %340, <8 x i64> %192, <8 x i64> zeroinitializer, !dbg !48 + %342 = add <8 x i64> %341, %339, !dbg !48 + %343 = icmp slt <8 x i64> %342, zeroinitializer, !dbg !49 + %344 = icmp sge <8 x i64> %342, %192, !dbg !50 + %345 = or <8 x i1> %343, %344, !dbg !51 + %346 = and <8 x i1> %292, %345, !dbg !52 + %347 = bitcast <8 x i1> %346 to i8, !dbg !53 + %.not97 = icmp eq i8 %347, 0, !dbg !53 + br i1 %.not97, label %349, label %348, !dbg !53 + +348: ; preds = %201 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 51, ptr nonnull @assertFunc_1, i64 1), !dbg !53 + unreachable, !dbg !53 + +349: ; preds = %201 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %350 = sub i64 %5, %121, !dbg !54 + %351 = extractelement <8 x i64> %342, i64 0, !dbg !55 + %352 = mul i64 %351, %5, !dbg !55 + %353 = extractelement <8 x i64> %342, i64 1, !dbg !55 + %354 = mul i64 %353, %5, !dbg !55 + %355 = extractelement <8 x i64> %342, i64 2, !dbg !55 + %356 = mul i64 %355, %5, !dbg !55 + %357 = extractelement <8 x i64> %342, i64 3, !dbg !55 + %358 = mul i64 %357, %5, !dbg !55 + %359 = extractelement <8 x i64> %342, i64 4, !dbg !55 + %360 = mul i64 %359, %5, !dbg !55 + %361 = extractelement <8 x i64> %342, i64 5, !dbg !55 + %362 = mul i64 %361, %5, !dbg !55 + %363 = extractelement <8 x i64> %342, i64 6, !dbg !55 + %364 = mul i64 %363, %5, !dbg !55 + %365 = extractelement <8 x i64> %342, i64 7, !dbg !55 + %366 = mul i64 %365, %5, !dbg !55 + %367 = getelementptr bfloat, ptr addrspace(1) %2, i64 %350, !dbg !56 + %368 = getelementptr bfloat, ptr addrspace(1) %367, i64 %202, !dbg !56 + %369 = getelementptr bfloat, ptr addrspace(1) %368, i64 %352, !dbg !56 + %370 = getelementptr bfloat, ptr addrspace(1) %367, i64 %204, !dbg !56 + %371 = getelementptr bfloat, ptr addrspace(1) %370, i64 %354, !dbg !56 + %372 = getelementptr bfloat, ptr addrspace(1) %367, i64 %206, !dbg !56 + %373 = getelementptr bfloat, ptr addrspace(1) %372, i64 %356, !dbg !56 + %374 = getelementptr bfloat, ptr addrspace(1) %367, i64 %208, !dbg !56 + %375 = getelementptr bfloat, ptr addrspace(1) %374, i64 %358, !dbg !56 + %376 = getelementptr bfloat, ptr addrspace(1) %367, i64 %210, !dbg !56 + %377 = getelementptr bfloat, ptr addrspace(1) %376, i64 %360, !dbg !56 + %378 = getelementptr bfloat, ptr addrspace(1) %367, i64 %212, !dbg !56 + %379 = getelementptr bfloat, ptr addrspace(1) %378, i64 %362, !dbg !56 + %380 = getelementptr bfloat, ptr addrspace(1) %367, i64 %214, !dbg !56 + %381 = getelementptr bfloat, ptr addrspace(1) %380, i64 %364, !dbg !56 + %382 = getelementptr bfloat, ptr addrspace(1) %367, i64 %216, !dbg !56 + %383 = getelementptr bfloat, ptr addrspace(1) %382, i64 %366, !dbg !56 + %384 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %385 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %369, i64 %384, i1 %293) #4, !dbg !57 + %386 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %387 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %371, i64 %386, i1 %296) #4, !dbg !57 + %388 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %389 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %373, i64 %388, i1 %299) #4, !dbg !57 + %390 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %391 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %375, i64 %390, i1 %302) #4, !dbg !57 + %392 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %393 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %377, i64 %392, i1 %305) #4, !dbg !57 + %394 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %395 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %379, i64 %394, i1 %308) #4, !dbg !57 + %396 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %397 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %381, i64 %396, i1 %311) #4, !dbg !57 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %399 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %383, i64 %398, i1 %314) #4, !dbg !57 + %400 = insertelement <8 x i64> poison, i64 %106, i64 0, !dbg !58 + %401 = insertelement <8 x i64> %400, i64 %108, i64 1, !dbg !58 + %402 = insertelement <8 x i64> %401, i64 %110, i64 2, !dbg !58 + %403 = insertelement <8 x i64> %402, i64 %112, i64 3, !dbg !58 + %404 = insertelement <8 x i64> %403, i64 %114, i64 4, !dbg !58 + %405 = insertelement <8 x i64> %404, i64 %116, i64 5, !dbg !58 + %406 = insertelement <8 x i64> %405, i64 %118, i64 6, !dbg !58 + %407 = insertelement <8 x i64> %406, i64 %120, i64 7, !dbg !58 + %408 = icmp slt <8 x i64> %407, zeroinitializer, !dbg !58 + %409 = insertelement <8 x i64> poison, i64 %8, i64 0, !dbg !59 + %410 = shufflevector <8 x i64> %409, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !59 + %411 = select <8 x i1> %408, <8 x i64> %410, <8 x i64> zeroinitializer, !dbg !59 + %412 = add <8 x i64> %411, %407, !dbg !59 + %413 = icmp slt <8 x i64> %412, zeroinitializer, !dbg !60 + %414 = icmp sge <8 x i64> %412, %410, !dbg !61 + %415 = or <8 x i1> %413, %414, !dbg !62 + %416 = and <8 x i1> %36, %415, !dbg !63 + %417 = bitcast <8 x i1> %416 to i8, !dbg !64 + %.not98 = icmp eq i8 %417, 0, !dbg !64 + br i1 %.not98, label %419, label %418, !dbg !64 + +418: ; preds = %349 + tail call void @__assertfail(ptr nonnull @assertMessage_2, ptr nonnull @assertFile_2, i32 62, ptr nonnull @assertFunc_2, i64 1), !dbg !64 + unreachable, !dbg !64 + +419: ; preds = %349 + %420 = bitcast i16 %165 to bfloat, !dbg !25 + %421 = fpext bfloat %420 to float, !dbg !65 + %422 = bitcast i16 %265 to bfloat, !dbg !39 + %423 = fpext bfloat %422 to float, !dbg !66 + %424 = fmul float %421, %423, !dbg !67 + %425 = fsub float 0.000000e+00, %424, !dbg !68 + %426 = extractelement <8 x i1> %141, i64 7, !dbg !69 + %427 = select i1 %426, float %425, float 0.000000e+00, !dbg !69 + %428 = bitcast i16 %315 to bfloat, !dbg !44 + %429 = fpext bfloat %428 to float, !dbg !70 + %430 = bitcast i16 %399 to bfloat, !dbg !57 + %431 = fpext bfloat %430 to float, !dbg !71 + %432 = fmul float %429, %431, !dbg !72 + %433 = extractelement <8 x i1> %266, i64 7, !dbg !69 + %434 = select i1 %433, float %432, float 0.000000e+00, !dbg !69 + %435 = fadd float %427, %434, !dbg !73 + %436 = bitcast i16 %162 to bfloat, !dbg !25 + %437 = fpext bfloat %436 to float, !dbg !65 + %438 = bitcast i16 %263 to bfloat, !dbg !39 + %439 = fpext bfloat %438 to float, !dbg !66 + %440 = fmul float %437, %439, !dbg !67 + %441 = fsub float 0.000000e+00, %440, !dbg !68 + %442 = extractelement <8 x i1> %141, i64 6, !dbg !69 + %443 = select i1 %442, float %441, float 0.000000e+00, !dbg !69 + %444 = bitcast i16 %312 to bfloat, !dbg !44 + %445 = fpext bfloat %444 to float, !dbg !70 + %446 = bitcast i16 %397 to bfloat, !dbg !57 + %447 = fpext bfloat %446 to float, !dbg !71 + %448 = fmul float %445, %447, !dbg !72 + %449 = extractelement <8 x i1> %266, i64 6, !dbg !69 + %450 = select i1 %449, float %448, float 0.000000e+00, !dbg !69 + %451 = fadd float %443, %450, !dbg !73 + %452 = bitcast i16 %159 to bfloat, !dbg !25 + %453 = fpext bfloat %452 to float, !dbg !65 + %454 = bitcast i16 %261 to bfloat, !dbg !39 + %455 = fpext bfloat %454 to float, !dbg !66 + %456 = fmul float %453, %455, !dbg !67 + %457 = fsub float 0.000000e+00, %456, !dbg !68 + %458 = extractelement <8 x i1> %141, i64 5, !dbg !69 + %459 = select i1 %458, float %457, float 0.000000e+00, !dbg !69 + %460 = bitcast i16 %309 to bfloat, !dbg !44 + %461 = fpext bfloat %460 to float, !dbg !70 + %462 = bitcast i16 %395 to bfloat, !dbg !57 + %463 = fpext bfloat %462 to float, !dbg !71 + %464 = fmul float %461, %463, !dbg !72 + %465 = extractelement <8 x i1> %266, i64 5, !dbg !69 + %466 = select i1 %465, float %464, float 0.000000e+00, !dbg !69 + %467 = fadd float %459, %466, !dbg !73 + %468 = bitcast i16 %156 to bfloat, !dbg !25 + %469 = fpext bfloat %468 to float, !dbg !65 + %470 = bitcast i16 %259 to bfloat, !dbg !39 + %471 = fpext bfloat %470 to float, !dbg !66 + %472 = fmul float %469, %471, !dbg !67 + %473 = fsub float 0.000000e+00, %472, !dbg !68 + %474 = extractelement <8 x i1> %141, i64 4, !dbg !69 + %475 = select i1 %474, float %473, float 0.000000e+00, !dbg !69 + %476 = bitcast i16 %306 to bfloat, !dbg !44 + %477 = fpext bfloat %476 to float, !dbg !70 + %478 = bitcast i16 %393 to bfloat, !dbg !57 + %479 = fpext bfloat %478 to float, !dbg !71 + %480 = fmul float %477, %479, !dbg !72 + %481 = extractelement <8 x i1> %266, i64 4, !dbg !69 + %482 = select i1 %481, float %480, float 0.000000e+00, !dbg !69 + %483 = fadd float %475, %482, !dbg !73 + %484 = bitcast i16 %153 to bfloat, !dbg !25 + %485 = fpext bfloat %484 to float, !dbg !65 + %486 = bitcast i16 %257 to bfloat, !dbg !39 + %487 = fpext bfloat %486 to float, !dbg !66 + %488 = fmul float %485, %487, !dbg !67 + %489 = fsub float 0.000000e+00, %488, !dbg !68 + %490 = extractelement <8 x i1> %141, i64 3, !dbg !69 + %491 = select i1 %490, float %489, float 0.000000e+00, !dbg !69 + %492 = bitcast i16 %303 to bfloat, !dbg !44 + %493 = fpext bfloat %492 to float, !dbg !70 + %494 = bitcast i16 %391 to bfloat, !dbg !57 + %495 = fpext bfloat %494 to float, !dbg !71 + %496 = fmul float %493, %495, !dbg !72 + %497 = extractelement <8 x i1> %266, i64 3, !dbg !69 + %498 = select i1 %497, float %496, float 0.000000e+00, !dbg !69 + %499 = fadd float %491, %498, !dbg !73 + %500 = bitcast i16 %150 to bfloat, !dbg !25 + %501 = fpext bfloat %500 to float, !dbg !65 + %502 = bitcast i16 %255 to bfloat, !dbg !39 + %503 = fpext bfloat %502 to float, !dbg !66 + %504 = fmul float %501, %503, !dbg !67 + %505 = fsub float 0.000000e+00, %504, !dbg !68 + %506 = extractelement <8 x i1> %141, i64 2, !dbg !69 + %507 = select i1 %506, float %505, float 0.000000e+00, !dbg !69 + %508 = bitcast i16 %300 to bfloat, !dbg !44 + %509 = fpext bfloat %508 to float, !dbg !70 + %510 = bitcast i16 %389 to bfloat, !dbg !57 + %511 = fpext bfloat %510 to float, !dbg !71 + %512 = fmul float %509, %511, !dbg !72 + %513 = extractelement <8 x i1> %266, i64 2, !dbg !69 + %514 = select i1 %513, float %512, float 0.000000e+00, !dbg !69 + %515 = fadd float %507, %514, !dbg !73 + %516 = bitcast i16 %147 to bfloat, !dbg !25 + %517 = fpext bfloat %516 to float, !dbg !65 + %518 = bitcast i16 %253 to bfloat, !dbg !39 + %519 = fpext bfloat %518 to float, !dbg !66 + %520 = fmul float %517, %519, !dbg !67 + %521 = fsub float 0.000000e+00, %520, !dbg !68 + %522 = extractelement <8 x i1> %141, i64 1, !dbg !69 + %523 = select i1 %522, float %521, float 0.000000e+00, !dbg !69 + %524 = bitcast i16 %297 to bfloat, !dbg !44 + %525 = fpext bfloat %524 to float, !dbg !70 + %526 = bitcast i16 %387 to bfloat, !dbg !57 + %527 = fpext bfloat %526 to float, !dbg !71 + %528 = fmul float %525, %527, !dbg !72 + %529 = extractelement <8 x i1> %266, i64 1, !dbg !69 + %530 = select i1 %529, float %528, float 0.000000e+00, !dbg !69 + %531 = fadd float %523, %530, !dbg !73 + %532 = bitcast i16 %144 to bfloat, !dbg !25 + %533 = fpext bfloat %532 to float, !dbg !65 + %534 = bitcast i16 %251 to bfloat, !dbg !39 + %535 = fpext bfloat %534 to float, !dbg !66 + %536 = fmul float %533, %535, !dbg !67 + %537 = fsub float 0.000000e+00, %536, !dbg !68 + %538 = extractelement <8 x i1> %141, i64 0, !dbg !69 + %539 = select i1 %538, float %537, float 0.000000e+00, !dbg !69 + %540 = bitcast i16 %294 to bfloat, !dbg !44 + %541 = fpext bfloat %540 to float, !dbg !70 + %542 = bitcast i16 %385 to bfloat, !dbg !57 + %543 = fpext bfloat %542 to float, !dbg !71 + %544 = fmul float %541, %543, !dbg !72 + %545 = extractelement <8 x i1> %266, i64 0, !dbg !69 + %546 = select i1 %545, float %544, float 0.000000e+00, !dbg !69 + %547 = fadd float %539, %546, !dbg !73 + %548 = bitcast i16 %96 to bfloat, !dbg !19 + %549 = fpext bfloat %548 to float, !dbg !74 + %550 = bitcast i16 %93 to bfloat, !dbg !19 + %551 = fpext bfloat %550 to float, !dbg !74 + %552 = bitcast i16 %90 to bfloat, !dbg !19 + %553 = fpext bfloat %552 to float, !dbg !74 + %554 = bitcast i16 %87 to bfloat, !dbg !19 + %555 = fpext bfloat %554 to float, !dbg !74 + %556 = bitcast i16 %84 to bfloat, !dbg !19 + %557 = fpext bfloat %556 to float, !dbg !74 + %558 = bitcast i16 %81 to bfloat, !dbg !19 + %559 = fpext bfloat %558 to float, !dbg !74 + %560 = bitcast i16 %78 to bfloat, !dbg !19 + %561 = fpext bfloat %560 to float, !dbg !74 + %562 = bitcast i16 %75 to bfloat, !dbg !19 + %563 = fpext bfloat %562 to float, !dbg !74 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !64 + %564 = extractelement <8 x i64> %412, i64 0, !dbg !75 + %565 = mul i64 %564, %5, !dbg !75 + %566 = extractelement <8 x i64> %412, i64 1, !dbg !75 + %567 = mul i64 %566, %5, !dbg !75 + %568 = extractelement <8 x i64> %412, i64 2, !dbg !75 + %569 = mul i64 %568, %5, !dbg !75 + %570 = extractelement <8 x i64> %412, i64 3, !dbg !75 + %571 = mul i64 %570, %5, !dbg !75 + %572 = extractelement <8 x i64> %412, i64 4, !dbg !75 + %573 = mul i64 %572, %5, !dbg !75 + %574 = extractelement <8 x i64> %412, i64 5, !dbg !75 + %575 = mul i64 %574, %5, !dbg !75 + %576 = extractelement <8 x i64> %412, i64 6, !dbg !75 + %577 = mul i64 %576, %5, !dbg !75 + %578 = extractelement <8 x i64> %412, i64 7, !dbg !75 + %579 = mul i64 %578, %5, !dbg !75 + %580 = getelementptr bfloat, ptr addrspace(1) %3, i64 %202, !dbg !76 + %581 = getelementptr bfloat, ptr addrspace(1) %580, i64 %565, !dbg !76 + %582 = getelementptr bfloat, ptr addrspace(1) %3, i64 %204, !dbg !76 + %583 = getelementptr bfloat, ptr addrspace(1) %582, i64 %567, !dbg !76 + %584 = getelementptr bfloat, ptr addrspace(1) %3, i64 %206, !dbg !76 + %585 = getelementptr bfloat, ptr addrspace(1) %584, i64 %569, !dbg !76 + %586 = getelementptr bfloat, ptr addrspace(1) %3, i64 %208, !dbg !76 + %587 = getelementptr bfloat, ptr addrspace(1) %586, i64 %571, !dbg !76 + %588 = getelementptr bfloat, ptr addrspace(1) %3, i64 %210, !dbg !76 + %589 = getelementptr bfloat, ptr addrspace(1) %588, i64 %573, !dbg !76 + %590 = getelementptr bfloat, ptr addrspace(1) %3, i64 %212, !dbg !76 + %591 = getelementptr bfloat, ptr addrspace(1) %590, i64 %575, !dbg !76 + %592 = getelementptr bfloat, ptr addrspace(1) %3, i64 %214, !dbg !76 + %593 = getelementptr bfloat, ptr addrspace(1) %592, i64 %577, !dbg !76 + %594 = getelementptr bfloat, ptr addrspace(1) %3, i64 %216, !dbg !76 + %595 = getelementptr bfloat, ptr addrspace(1) %594, i64 %579, !dbg !76 + %596 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %597 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %581, i64 %596, i1 %74) #4, !dbg !77 + %598 = bitcast i16 %597 to bfloat, !dbg !77 + %599 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %600 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %583, i64 %599, i1 %77) #4, !dbg !77 + %601 = bitcast i16 %600 to bfloat, !dbg !77 + %602 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %603 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %585, i64 %602, i1 %80) #4, !dbg !77 + %604 = bitcast i16 %603 to bfloat, !dbg !77 + %605 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %606 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %587, i64 %605, i1 %83) #4, !dbg !77 + %607 = bitcast i16 %606 to bfloat, !dbg !77 + %608 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %609 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %589, i64 %608, i1 %86) #4, !dbg !77 + %610 = bitcast i16 %609 to bfloat, !dbg !77 + %611 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %612 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %591, i64 %611, i1 %89) #4, !dbg !77 + %613 = bitcast i16 %612 to bfloat, !dbg !77 + %614 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %615 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %593, i64 %614, i1 %92) #4, !dbg !77 + %616 = bitcast i16 %615 to bfloat, !dbg !77 + %617 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !77 + %618 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %595, i64 %617, i1 %95) #4, !dbg !77 + %619 = bitcast i16 %618 to bfloat, !dbg !77 + %620 = fpext bfloat %598 to float, !dbg !78 + %621 = fpext bfloat %601 to float, !dbg !78 + %622 = fpext bfloat %604 to float, !dbg !78 + %623 = fpext bfloat %607 to float, !dbg !78 + %624 = fpext bfloat %610 to float, !dbg !78 + %625 = fpext bfloat %613 to float, !dbg !78 + %626 = fpext bfloat %616 to float, !dbg !78 + %627 = fpext bfloat %619 to float, !dbg !78 + %628 = fmul float %563, %620, !dbg !79 + %629 = fmul float %561, %621, !dbg !79 + %630 = fmul float %559, %622, !dbg !79 + %631 = fmul float %557, %623, !dbg !79 + %632 = fmul float %555, %624, !dbg !79 + %633 = fmul float %553, %625, !dbg !79 + %634 = fmul float %551, %626, !dbg !79 + %635 = fmul float %549, %627, !dbg !79 + %636 = fadd float %547, %628, !dbg !80 + %637 = fadd float %531, %629, !dbg !80 + %638 = fadd float %515, %630, !dbg !80 + %639 = fadd float %499, %631, !dbg !80 + %640 = fadd float %483, %632, !dbg !80 + %641 = fadd float %467, %633, !dbg !80 + %642 = fadd float %451, %634, !dbg !80 + %643 = fadd float %435, %635, !dbg !80 + %644 = getelementptr bfloat, ptr addrspace(1) %4, i64 %41, !dbg !81 + %645 = getelementptr bfloat, ptr addrspace(1) %4, i64 %43, !dbg !81 + %646 = getelementptr bfloat, ptr addrspace(1) %4, i64 %45, !dbg !81 + %647 = getelementptr bfloat, ptr addrspace(1) %4, i64 %47, !dbg !81 + %648 = getelementptr bfloat, ptr addrspace(1) %4, i64 %49, !dbg !81 + %649 = getelementptr bfloat, ptr addrspace(1) %4, i64 %51, !dbg !81 + %650 = getelementptr bfloat, ptr addrspace(1) %4, i64 %53, !dbg !81 + %651 = getelementptr bfloat, ptr addrspace(1) %4, i64 %55, !dbg !81 + %652 = fptrunc float %636 to bfloat, !dbg !82 + %653 = fptrunc float %637 to bfloat, !dbg !82 + %654 = fptrunc float %638 to bfloat, !dbg !82 + %655 = fptrunc float %639 to bfloat, !dbg !82 + %656 = fptrunc float %640 to bfloat, !dbg !82 + %657 = fptrunc float %641 to bfloat, !dbg !82 + %658 = fptrunc float %642 to bfloat, !dbg !82 + %659 = fptrunc float %643 to bfloat, !dbg !82 + %660 = bitcast bfloat %652 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %74) #4, !dbg !82 + %661 = bitcast bfloat %653 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %77) #4, !dbg !82 + %662 = bitcast bfloat %654 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %80) #4, !dbg !82 + %663 = bitcast bfloat %655 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %83) #4, !dbg !82 + %664 = bitcast bfloat %656 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %86) #4, !dbg !82 + %665 = bitcast bfloat %657 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %89) #4, !dbg !82 + %666 = bitcast bfloat %658 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %92) #4, !dbg !82 + %667 = bitcast bfloat %659 to i16, !dbg !82 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %95) #4, !dbg !82 + ret void, !dbg !83 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1", linkageName: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 21, column: 21, scope: !9) +!14 = !DILocation(line: 20, column: 23, scope: !9) +!15 = !DILocation(line: 22, column: 19, scope: !9) +!16 = !DILocation(line: 25, column: 31, scope: !9) +!17 = !DILocation(line: 24, column: 21, scope: !9) +!18 = !DILocation(line: 24, column: 28, scope: !9) +!19 = !DILocation(line: 25, column: 36, scope: !9) +!20 = !DILocation(line: 26, column: 31, scope: !9) +!21 = !DILocation(line: 26, column: 36, scope: !9) +!22 = !DILocation(line: 28, column: 18, scope: !9) +!23 = !DILocation(line: 30, column: 35, scope: !9) +!24 = !DILocation(line: 30, column: 30, scope: !9) +!25 = !DILocation(line: 30, column: 53, scope: !9) +!26 = !DILocation(line: 29, column: 19, scope: !9) +!27 = !DILocation(line: 30, column: 60, scope: !9) +!28 = !DILocation(line: 31, column: 35, scope: !9) +!29 = !DILocation(line: 34, column: 18, scope: !9) +!30 = !DILocation(line: 35, column: 32, scope: !9) +!31 = !DILocation(line: 36, column: 28, scope: !9) +!32 = !DILocation(line: 36, column: 98, scope: !9) +!33 = !DILocation(line: 36, column: 64, scope: !9) +!34 = !DILocation(line: 36, column: 106, scope: !9) +!35 = !DILocation(line: 36, column: 123, scope: !9) +!36 = !DILocation(line: 37, column: 36, scope: !9) +!37 = !DILocation(line: 37, column: 58, scope: !9) +!38 = !DILocation(line: 37, column: 31, scope: !9) +!39 = !DILocation(line: 37, column: 65, scope: !9) +!40 = !DILocation(line: 44, column: 19, scope: !9) +!41 = !DILocation(line: 45, column: 37, scope: !9) +!42 = !DILocation(line: 45, column: 42, scope: !9) +!43 = !DILocation(line: 45, column: 31, scope: !9) +!44 = !DILocation(line: 45, column: 60, scope: !9) +!45 = !DILocation(line: 45, column: 68, scope: !9) +!46 = !DILocation(line: 46, column: 36, scope: !9) +!47 = !DILocation(line: 49, column: 20, scope: !9) +!48 = !DILocation(line: 50, column: 35, scope: !9) +!49 = !DILocation(line: 51, column: 28, scope: !9) +!50 = !DILocation(line: 51, column: 100, scope: !9) +!51 = !DILocation(line: 51, column: 65, scope: !9) +!52 = !DILocation(line: 51, column: 108, scope: !9) +!53 = !DILocation(line: 51, column: 126, scope: !9) +!54 = !DILocation(line: 52, column: 37, scope: !9) +!55 = !DILocation(line: 52, column: 64, scope: !9) +!56 = !DILocation(line: 52, column: 31, scope: !9) +!57 = !DILocation(line: 52, column: 72, scope: !9) +!58 = !DILocation(line: 60, column: 20, scope: !9) +!59 = !DILocation(line: 61, column: 35, scope: !9) +!60 = !DILocation(line: 62, column: 28, scope: !9) +!61 = !DILocation(line: 62, column: 46, scope: !9) +!62 = !DILocation(line: 62, column: 38, scope: !9) +!63 = !DILocation(line: 62, column: 54, scope: !9) +!64 = !DILocation(line: 62, column: 64, scope: !9) +!65 = !DILocation(line: 30, column: 111, scope: !9) +!66 = !DILocation(line: 37, column: 123, scope: !9) +!67 = !DILocation(line: 38, column: 19, scope: !9) +!68 = !DILocation(line: 39, column: 13, scope: !9) +!69 = !DILocation(line: 0, scope: !9) +!70 = !DILocation(line: 45, column: 119, scope: !9) +!71 = !DILocation(line: 52, column: 131, scope: !9) +!72 = !DILocation(line: 53, column: 20, scope: !9) +!73 = !DILocation(line: 57, column: 20, scope: !9) +!74 = !DILocation(line: 25, column: 76, scope: !9) +!75 = !DILocation(line: 63, column: 40, scope: !9) +!76 = !DILocation(line: 63, column: 31, scope: !9) +!77 = !DILocation(line: 63, column: 48, scope: !9) +!78 = !DILocation(line: 63, column: 88, scope: !9) +!79 = !DILocation(line: 64, column: 20, scope: !9) +!80 = !DILocation(line: 65, column: 20, scope: !9) +!81 = !DILocation(line: 66, column: 25, scope: !9) +!82 = !DILocation(line: 66, column: 37, scope: !9) +!83 = !DILocation(line: 66, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b6b404795b3b5695a35a969421e1095432c39b4f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx @@ -0,0 +1,1936 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 122, 47, 99, 100, 122, 51, 105, 111, 55, 119, 53, 117, 121, 102, 114, 109, 102, 113, 118, 109, 103, 50, 107, 116, 50, 97, 121, 54, 54, 113, 118, 52, 99, 107, 119, 116, 121, 117, 114, 104, 105, 107, 51, 102, 114, 113, 55, 102, 113, 110, 107, 55, 103, 109, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 122, 47, 99, 100, 122, 51, 105, 111, 55, 119, 53, 117, 121, 102, 114, 109, 102, 113, 118, 109, 103, 50, 107, 116, 50, 97, 121, 54, 54, 113, 118, 52, 99, 107, 119, 116, 121, 117, 114, 104, 105, 107, 51, 102, 114, 113, 55, 102, 113, 110, 107, 55, 103, 109, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 100, 122, 47, 99, 100, 122, 51, 105, 111, 55, 119, 53, 117, 121, 102, 114, 109, 102, 113, 118, 109, 103, 50, 107, 116, 50, 97, 121, 54, 54, 113, 118, 52, 99, 107, 119, 116, 121, 117, 114, 104, 105, 107, 51, 102, 114, 113, 55, 102, 113, 110, 107, 55, 103, 109, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_11 +) +.reqntid 128 +{ + .reg .pred %p<268>; + .reg .b16 %rs<207>; + .reg .b32 %r<174>; + .reg .b64 %rd<682>; + .loc 1 18 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:18:0 + +// %bb.0: + ld.param.b64 %rd143, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_5]; +$L__tmp0: + .loc 1 19 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:19:28 + mov.u32 %r17, %ctaid.x; + .loc 1 19 33 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:19:33 + shl.b32 %r18, %r17, 10; + .loc 1 20 36 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:20:36 + mov.u32 %r19, %tid.x; + shl.b32 %r20, %r19, 3; + and.b32 %r21, %r20, 1016; + .loc 1 20 23 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:20:23 + or.b32 %r9, %r21, %r18; + or.b32 %r10, %r9, 1; + or.b32 %r11, %r9, 2; + .loc 1 22 19 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:22:19 + cvt.s64.s32 %rd2, %r10; + cvt.s64.s32 %rd1, %r9; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd147, %rd1, %rd143; + and.b64 %rd148, %rd147, -4294967296; + setp.ne.b64 %p25, %rd148, 0; + @%p25 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd666, %rd1, %rd143; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r22, %rd143; + cvt.u32.u64 %r23, %rd1; + div.u32 %r24, %r23, %r22; + cvt.u64.u32 %rd666, %r24; +$L__BB0_3: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + or.b32 %r12, %r9, 3; + cvt.s64.s32 %rd3, %r11; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd149, %rd2, %rd143; + and.b64 %rd150, %rd149, -4294967296; + setp.ne.b64 %p26, %rd150, 0; + @%p26 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd667, %rd2, %rd143; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r25, %rd143; + cvt.u32.u64 %r26, %rd2; + div.u32 %r27, %r26, %r25; + cvt.u64.u32 %rd667, %r27; +$L__BB0_6: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + or.b32 %r13, %r9, 4; + cvt.s64.s32 %rd4, %r12; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd151, %rd3, %rd143; + and.b64 %rd152, %rd151, -4294967296; + setp.ne.b64 %p27, %rd152, 0; + @%p27 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd668, %rd3, %rd143; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r28, %rd143; + cvt.u32.u64 %r29, %rd3; + div.u32 %r30, %r29, %r28; + cvt.u64.u32 %rd668, %r30; +$L__BB0_9: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + or.b32 %r14, %r9, 5; + cvt.s64.s32 %rd5, %r13; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd153, %rd4, %rd143; + and.b64 %rd154, %rd153, -4294967296; + setp.ne.b64 %p28, %rd154, 0; + @%p28 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd669, %rd4, %rd143; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r31, %rd143; + cvt.u32.u64 %r32, %rd4; + div.u32 %r33, %r32, %r31; + cvt.u64.u32 %rd669, %r33; +$L__BB0_12: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + or.b32 %r15, %r9, 6; + cvt.s64.s32 %rd6, %r14; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd155, %rd5, %rd143; + and.b64 %rd156, %rd155, -4294967296; + setp.ne.b64 %p29, %rd156, 0; + @%p29 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd670, %rd5, %rd143; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r34, %rd143; + cvt.u32.u64 %r35, %rd5; + div.u32 %r36, %r35, %r34; + cvt.u64.u32 %rd670, %r36; +$L__BB0_15: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + or.b32 %r16, %r9, 7; + cvt.s64.s32 %rd7, %r15; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd157, %rd6, %rd143; + and.b64 %rd158, %rd157, -4294967296; + setp.ne.b64 %p30, %rd158, 0; + @%p30 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd671, %rd6, %rd143; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r37, %rd143; + cvt.u32.u64 %r38, %rd6; + div.u32 %r39, %r38, %r37; + cvt.u64.u32 %rd671, %r39; +$L__BB0_18: + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + cvt.s64.s32 %rd8, %r16; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd159, %rd7, %rd143; + and.b64 %rd160, %rd159, -4294967296; + setp.ne.b64 %p31, %rd160, 0; + @%p31 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd672, %rd7, %rd143; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r40, %rd143; + cvt.u32.u64 %r41, %rd7; + div.u32 %r42, %r41, %r40; + cvt.u64.u32 %rd672, %r42; +$L__BB0_21: + .loc 1 0 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0:21 + ld.param.b64 %rd144, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_6]; + .loc 1 24 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:21 + or.b64 %rd161, %rd8, %rd143; + and.b64 %rd162, %rd161, -4294967296; + setp.ne.b64 %p32, %rd162, 0; + @%p32 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd673, %rd8, %rd143; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r43, %rd143; + cvt.u32.u64 %r44, %rd8; + div.u32 %r45, %r44, %r43; + cvt.u64.u32 %rd673, %r45; +$L__BB0_24: + .loc 1 24 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:28 + or.b64 %rd163, %rd666, %rd144; + and.b64 %rd164, %rd163, -4294967296; + setp.ne.b64 %p33, %rd164, 0; + @%p33 bra $L__BB0_26; + bra.uni $L__BB0_25; +$L__BB0_26: + rem.s64 %rd674, %rd666, %rd144; + bra.uni $L__BB0_27; +$L__BB0_25: + cvt.u32.u64 %r46, %rd144; + cvt.u32.u64 %r47, %rd666; + rem.u32 %r48, %r47, %r46; + cvt.u64.u32 %rd674, %r48; +$L__BB0_27: + or.b64 %rd165, %rd667, %rd144; + and.b64 %rd166, %rd165, -4294967296; + setp.ne.b64 %p34, %rd166, 0; + @%p34 bra $L__BB0_29; + bra.uni $L__BB0_28; +$L__BB0_29: + rem.s64 %rd675, %rd667, %rd144; + bra.uni $L__BB0_30; +$L__BB0_28: + cvt.u32.u64 %r49, %rd144; + cvt.u32.u64 %r50, %rd667; + rem.u32 %r51, %r50, %r49; + cvt.u64.u32 %rd675, %r51; +$L__BB0_30: + or.b64 %rd167, %rd668, %rd144; + and.b64 %rd168, %rd167, -4294967296; + setp.ne.b64 %p35, %rd168, 0; + @%p35 bra $L__BB0_32; + bra.uni $L__BB0_31; +$L__BB0_32: + rem.s64 %rd676, %rd668, %rd144; + bra.uni $L__BB0_33; +$L__BB0_31: + cvt.u32.u64 %r52, %rd144; + cvt.u32.u64 %r53, %rd668; + rem.u32 %r54, %r53, %r52; + cvt.u64.u32 %rd676, %r54; +$L__BB0_33: + or.b64 %rd169, %rd669, %rd144; + and.b64 %rd170, %rd169, -4294967296; + setp.ne.b64 %p36, %rd170, 0; + @%p36 bra $L__BB0_35; + bra.uni $L__BB0_34; +$L__BB0_35: + rem.s64 %rd677, %rd669, %rd144; + bra.uni $L__BB0_36; +$L__BB0_34: + cvt.u32.u64 %r55, %rd144; + cvt.u32.u64 %r56, %rd669; + rem.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd677, %r57; +$L__BB0_36: + or.b64 %rd171, %rd670, %rd144; + and.b64 %rd172, %rd171, -4294967296; + setp.ne.b64 %p37, %rd172, 0; + @%p37 bra $L__BB0_38; + bra.uni $L__BB0_37; +$L__BB0_38: + rem.s64 %rd678, %rd670, %rd144; + bra.uni $L__BB0_39; +$L__BB0_37: + cvt.u32.u64 %r58, %rd144; + cvt.u32.u64 %r59, %rd670; + rem.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd678, %r60; +$L__BB0_39: + or.b64 %rd173, %rd671, %rd144; + and.b64 %rd174, %rd173, -4294967296; + setp.ne.b64 %p38, %rd174, 0; + @%p38 bra $L__BB0_41; + bra.uni $L__BB0_40; +$L__BB0_41: + rem.s64 %rd679, %rd671, %rd144; + bra.uni $L__BB0_42; +$L__BB0_40: + cvt.u32.u64 %r61, %rd144; + cvt.u32.u64 %r62, %rd671; + rem.u32 %r63, %r62, %r61; + cvt.u64.u32 %rd679, %r63; +$L__BB0_42: + or.b64 %rd175, %rd672, %rd144; + and.b64 %rd176, %rd175, -4294967296; + setp.ne.b64 %p39, %rd176, 0; + @%p39 bra $L__BB0_44; + bra.uni $L__BB0_43; +$L__BB0_44: + rem.s64 %rd680, %rd672, %rd144; + bra.uni $L__BB0_45; +$L__BB0_43: + cvt.u32.u64 %r64, %rd144; + cvt.u32.u64 %r65, %rd672; + rem.u32 %r66, %r65, %r64; + cvt.u64.u32 %rd680, %r66; +$L__BB0_45: + .loc 1 0 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0:28 + ld.param.b64 %rd145, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_7]; + ld.param.b64 %rd139, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_1]; + ld.param.b64 %rd138, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_0]; + ld.param.b32 %r8, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_9]; + rem.s64 %rd9, %rd1, %rd143; + rem.s64 %rd16, %rd8, %rd143; + rem.s64 %rd15, %rd7, %rd143; + rem.s64 %rd14, %rd6, %rd143; + rem.s64 %rd13, %rd5, %rd143; + rem.s64 %rd12, %rd4, %rd143; + rem.s64 %rd11, %rd3, %rd143; + rem.s64 %rd10, %rd2, %rd143; + .loc 1 24 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:24:28 + or.b64 %rd177, %rd673, %rd144; + and.b64 %rd178, %rd177, -4294967296; + setp.ne.b64 %p40, %rd178, 0; + @%p40 bra $L__BB0_47; + bra.uni $L__BB0_46; +$L__BB0_47: + rem.s64 %rd681, %rd673, %rd144; + bra.uni $L__BB0_48; +$L__BB0_46: + cvt.u32.u64 %r67, %rd144; + cvt.u32.u64 %r68, %rd673; + rem.u32 %r69, %r68, %r67; + cvt.u64.u32 %rd681, %r69; +$L__BB0_48: + .loc 1 21 21 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:21:21 + setp.lt.s32 %p8, %r16, %r8; + setp.lt.s32 %p7, %r15, %r8; + setp.lt.s32 %p6, %r14, %r8; + setp.lt.s32 %p5, %r13, %r8; + setp.lt.s32 %p4, %r12, %r8; + setp.lt.s32 %p3, %r11, %r8; + setp.lt.s32 %p2, %r10, %r8; + setp.lt.s32 %p1, %r9, %r8; + .loc 1 25 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:25:31 + shl.b64 %rd291, %rd1, 1; + add.s64 %rd180, %rd138, %rd291; + shl.b64 %rd292, %rd2, 1; + add.s64 %rd183, %rd138, %rd292; + shl.b64 %rd293, %rd3, 1; + add.s64 %rd186, %rd138, %rd293; + shl.b64 %rd294, %rd4, 1; + add.s64 %rd189, %rd138, %rd294; + shl.b64 %rd295, %rd5, 1; + add.s64 %rd192, %rd138, %rd295; + shl.b64 %rd296, %rd6, 1; + add.s64 %rd195, %rd138, %rd296; + shl.b64 %rd297, %rd7, 1; + add.s64 %rd198, %rd138, %rd297; + shl.b64 %rd298, %rd8, 1; + add.s64 %rd201, %rd138, %rd298; + .loc 1 25 36 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:25:36 + // begin inline asm + mov.u64 %rd179, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd179, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs41, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs41 }, [ %rd180 + 0 ], %rd179; + // end inline asm + // begin inline asm + mov.u64 %rd182, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd182, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs42, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs42 }, [ %rd183 + 0 ], %rd182; + // end inline asm + // begin inline asm + mov.u64 %rd185, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd185, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs43, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs43 }, [ %rd186 + 0 ], %rd185; + // end inline asm + // begin inline asm + mov.u64 %rd188, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd188, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs44, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs44 }, [ %rd189 + 0 ], %rd188; + // end inline asm + // begin inline asm + mov.u64 %rd191, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd191, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs45, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd192 + 0 ], %rd191; + // end inline asm + // begin inline asm + mov.u64 %rd194, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd194, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs46, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs46 }, [ %rd195 + 0 ], %rd194; + // end inline asm + // begin inline asm + mov.u64 %rd197, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd197, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs47, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd198 + 0 ], %rd197; + // end inline asm + // begin inline asm + mov.u64 %rd200, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd200, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs48, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs48 }, [ %rd201 + 0 ], %rd200; + // end inline asm + .loc 1 26 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:26:31 + shl.b64 %rd299, %rd674, 3; + add.s64 %rd261, %rd139, %rd299; + shl.b64 %rd300, %rd675, 3; + add.s64 %rd265, %rd139, %rd300; + shl.b64 %rd301, %rd676, 3; + add.s64 %rd269, %rd139, %rd301; + shl.b64 %rd302, %rd677, 3; + add.s64 %rd273, %rd139, %rd302; + shl.b64 %rd303, %rd678, 3; + add.s64 %rd277, %rd139, %rd303; + shl.b64 %rd304, %rd679, 3; + add.s64 %rd281, %rd139, %rd304; + shl.b64 %rd305, %rd680, 3; + add.s64 %rd285, %rd139, %rd305; + shl.b64 %rd306, %rd681, 3; + add.s64 %rd289, %rd139, %rd306; + .loc 1 26 36 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:26:36 + // begin inline asm + mov.u64 %rd203, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd203, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd204, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd204 }, [ %rd261 + 0 ], %rd203; + // end inline asm + // begin inline asm + mov.u64 %rd207, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd207, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd208, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd208 }, [ %rd265 + 0 ], %rd207; + // end inline asm + // begin inline asm + mov.u64 %rd211, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd211, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd212, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd212 }, [ %rd269 + 0 ], %rd211; + // end inline asm + // begin inline asm + mov.u64 %rd215, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd215, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd216, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd216 }, [ %rd273 + 0 ], %rd215; + // end inline asm + // begin inline asm + mov.u64 %rd219, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd219, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd220, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd220 }, [ %rd277 + 0 ], %rd219; + // end inline asm + // begin inline asm + mov.u64 %rd223, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd223, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd224, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd224 }, [ %rd281 + 0 ], %rd223; + // end inline asm + // begin inline asm + mov.u64 %rd227, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd227, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd228, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd228 }, [ %rd285 + 0 ], %rd227; + // end inline asm + // begin inline asm + mov.u64 %rd231, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd231, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd232, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd232 }, [ %rd289 + 0 ], %rd231; + // end inline asm + .loc 1 28 18 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:28:18 + shr.u64 %rd307, %rd143, 63; + add.s64 %rd308, %rd143, %rd307; + shr.s64 %rd89, %rd308, 1; + .loc 1 30 35 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:35 + sub.s64 %rd309, %rd1, %rd89; + sub.s64 %rd310, %rd2, %rd89; + sub.s64 %rd311, %rd3, %rd89; + sub.s64 %rd312, %rd4, %rd89; + sub.s64 %rd313, %rd5, %rd89; + sub.s64 %rd314, %rd6, %rd89; + sub.s64 %rd315, %rd7, %rd89; + sub.s64 %rd316, %rd8, %rd89; + .loc 1 30 30 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:30 + shl.b64 %rd317, %rd309, 1; + add.s64 %rd236, %rd138, %rd317; + shl.b64 %rd318, %rd310, 1; + add.s64 %rd239, %rd138, %rd318; + shl.b64 %rd319, %rd311, 1; + add.s64 %rd242, %rd138, %rd319; + shl.b64 %rd320, %rd312, 1; + add.s64 %rd245, %rd138, %rd320; + shl.b64 %rd321, %rd313, 1; + add.s64 %rd248, %rd138, %rd321; + shl.b64 %rd322, %rd314, 1; + add.s64 %rd251, %rd138, %rd322; + shl.b64 %rd323, %rd315, 1; + add.s64 %rd254, %rd138, %rd323; + shl.b64 %rd324, %rd316, 1; + add.s64 %rd257, %rd138, %rd324; + .loc 1 30 53 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:53 + // begin inline asm + mov.u64 %rd235, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd235, 1.0; + // end inline asm + .loc 1 29 19 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:29:19 + setp.ge.s64 %p73, %rd9, %rd89; + setp.ge.s64 %p74, %rd10, %rd89; + setp.ge.s64 %p75, %rd11, %rd89; + setp.ge.s64 %p76, %rd12, %rd89; + setp.ge.s64 %p77, %rd13, %rd89; + setp.ge.s64 %p78, %rd14, %rd89; + setp.ge.s64 %p79, %rd15, %rd89; + setp.ge.s64 %p80, %rd16, %rd89; + .loc 1 30 60 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:60 + and.pred %p16, %p8, %p80; + and.pred %p15, %p7, %p79; + and.pred %p14, %p6, %p78; + and.pred %p13, %p5, %p77; + and.pred %p12, %p4, %p76; + and.pred %p11, %p3, %p75; + and.pred %p10, %p2, %p74; + and.pred %p9, %p1, %p73; + mov.b16 %rs64, 0; + .loc 1 30 53 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:53 + // begin inline asm + mov.u16 %rs49, %rs64; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd236 + 0 ], %rd235; + // end inline asm + // begin inline asm + mov.u64 %rd238, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd238, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs51, %rs64; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd239 + 0 ], %rd238; + // end inline asm + // begin inline asm + mov.u64 %rd241, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd241, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs53, %rs64; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs53 }, [ %rd242 + 0 ], %rd241; + // end inline asm + // begin inline asm + mov.u64 %rd244, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd244, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs55, %rs64; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs55 }, [ %rd245 + 0 ], %rd244; + // end inline asm + // begin inline asm + mov.u64 %rd247, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd247, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs57, %rs64; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs57 }, [ %rd248 + 0 ], %rd247; + // end inline asm + // begin inline asm + mov.u64 %rd250, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd250, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs59, %rs64; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs59 }, [ %rd251 + 0 ], %rd250; + // end inline asm + // begin inline asm + mov.u64 %rd253, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd253, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs61, %rs64; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs61 }, [ %rd254 + 0 ], %rd253; + // end inline asm + // begin inline asm + mov.u64 %rd256, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd256, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs63, %rs64; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs63 }, [ %rd257 + 0 ], %rd256; + // end inline asm + .loc 1 31 35 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:31:35 + // begin inline asm + mov.u64 %rd259, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd259, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd260, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd260 }, [ %rd261 + 0 ], %rd259; + // end inline asm + // begin inline asm + mov.u64 %rd263, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd263, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd264, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd264 }, [ %rd265 + 0 ], %rd263; + // end inline asm + // begin inline asm + mov.u64 %rd267, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd267, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd268, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd268 }, [ %rd269 + 0 ], %rd267; + // end inline asm + // begin inline asm + mov.u64 %rd271, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd271, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd272, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd272 }, [ %rd273 + 0 ], %rd271; + // end inline asm + // begin inline asm + mov.u64 %rd275, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd275, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd276, 0x0; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd276 }, [ %rd277 + 0 ], %rd275; + // end inline asm + // begin inline asm + mov.u64 %rd279, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd279, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd280, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd280 }, [ %rd281 + 0 ], %rd279; + // end inline asm + // begin inline asm + mov.u64 %rd283, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd283, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd284, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd284 }, [ %rd285 + 0 ], %rd283; + // end inline asm + // begin inline asm + mov.u64 %rd287, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd287, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd288, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd288 }, [ %rd289 + 0 ], %rd287; + // end inline asm + .loc 1 35 32 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:35:32 + shr.s64 %rd325, %rd268, 63; + and.b64 %rd326, %rd325, %rd145; + shr.s64 %rd327, %rd272, 63; + and.b64 %rd328, %rd327, %rd145; + shr.s64 %rd329, %rd260, 63; + and.b64 %rd330, %rd329, %rd145; + shr.s64 %rd331, %rd264, 63; + and.b64 %rd332, %rd331, %rd145; + shr.s64 %rd333, %rd284, 63; + and.b64 %rd334, %rd333, %rd145; + shr.s64 %rd335, %rd288, 63; + and.b64 %rd336, %rd335, %rd145; + shr.s64 %rd337, %rd276, 63; + and.b64 %rd338, %rd337, %rd145; + shr.s64 %rd339, %rd280, 63; + and.b64 %rd340, %rd339, %rd145; + add.s64 %rd111, %rd340, %rd280; + add.s64 %rd110, %rd338, %rd276; + add.s64 %rd113, %rd336, %rd288; + add.s64 %rd112, %rd334, %rd284; + add.s64 %rd107, %rd332, %rd264; + add.s64 %rd106, %rd330, %rd260; + add.s64 %rd109, %rd328, %rd272; + add.s64 %rd108, %rd326, %rd268; + .loc 1 36 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:28 + setp.lt.s64 %p81, %rd108, 0; + setp.lt.s64 %p82, %rd109, 0; + setp.lt.s64 %p83, %rd106, 0; + setp.lt.s64 %p84, %rd107, 0; + setp.lt.s64 %p85, %rd112, 0; + setp.lt.s64 %p86, %rd113, 0; + setp.lt.s64 %p87, %rd110, 0; + setp.lt.s64 %p88, %rd111, 0; + .loc 1 36 98 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:98 + setp.ge.s64 %p89, %rd108, %rd145; + setp.ge.s64 %p90, %rd109, %rd145; + setp.ge.s64 %p91, %rd106, %rd145; + setp.ge.s64 %p92, %rd107, %rd145; + setp.ge.s64 %p93, %rd112, %rd145; + setp.ge.s64 %p94, %rd113, %rd145; + setp.ge.s64 %p95, %rd110, %rd145; + setp.ge.s64 %p96, %rd111, %rd145; + .loc 1 36 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:64 + or.pred %p97, %p88, %p96; + or.pred %p98, %p87, %p95; + or.pred %p99, %p86, %p94; + or.pred %p100, %p85, %p93; + or.pred %p101, %p84, %p92; + or.pred %p102, %p83, %p91; + or.pred %p103, %p82, %p90; + or.pred %p104, %p81, %p89; + .loc 1 36 106 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:106 + and.pred %p105, %p11, %p104; + selp.b16 %rs65, 1, 0, %p105; + shl.b16 %rs66, %rs65, 2; + and.pred %p106, %p12, %p103; + selp.b16 %rs67, -1, 0, %p106; + shl.b16 %rs68, %rs67, 3; + or.b16 %rs69, %rs68, %rs66; + and.pred %p107, %p9, %p102; + selp.b16 %rs70, 1, 0, %p107; + and.pred %p108, %p10, %p101; + selp.b16 %rs71, -1, 0, %p108; + shl.b16 %rs72, %rs71, 1; + or.b16 %rs73, %rs70, %rs72; + and.b16 %rs74, %rs73, 3; + or.b16 %rs75, %rs74, %rs69; + and.b16 %rs76, %rs75, 15; + and.pred %p109, %p15, %p100; + selp.b16 %rs77, 1, 0, %p109; + shl.b16 %rs78, %rs77, 2; + and.pred %p110, %p16, %p99; + selp.b16 %rs79, -1, 0, %p110; + shl.b16 %rs80, %rs79, 3; + or.b16 %rs81, %rs80, %rs78; + and.pred %p111, %p13, %p98; + selp.b16 %rs82, 1, 0, %p111; + and.pred %p112, %p14, %p97; + selp.b16 %rs83, -1, 0, %p112; + shl.b16 %rs84, %rs83, 1; + or.b16 %rs85, %rs82, %rs84; + and.b16 %rs86, %rs85, 3; + or.b16 %rs87, %rs86, %rs81; + shl.b16 %rs88, %rs87, 4; + or.b16 %rs89, %rs76, %rs88; + .loc 1 36 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:123 + and.b16 %rs90, %rs89, 255; + setp.eq.b16 %p113, %rs90, 0; + @%p113 bra $L__BB0_50; + bra.uni $L__BB0_49; +$L__BB0_50: + .loc 1 0 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0:123 + ld.param.b64 %rd140, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_2]; + .loc 1 36 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:123 + bar.sync 0; + .loc 1 37 36 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:36 + sub.s64 %rd427, %rd9, %rd89; + sub.s64 %rd428, %rd10, %rd89; + sub.s64 %rd429, %rd11, %rd89; + sub.s64 %rd430, %rd12, %rd89; + sub.s64 %rd431, %rd13, %rd89; + sub.s64 %rd432, %rd14, %rd89; + sub.s64 %rd433, %rd15, %rd89; + sub.s64 %rd434, %rd16, %rd89; + .loc 1 37 58 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:58 + mul.lo.s64 %rd435, %rd106, %rd143; + mul.lo.s64 %rd436, %rd107, %rd143; + mul.lo.s64 %rd437, %rd108, %rd143; + mul.lo.s64 %rd438, %rd109, %rd143; + mul.lo.s64 %rd439, %rd110, %rd143; + mul.lo.s64 %rd440, %rd111, %rd143; + mul.lo.s64 %rd441, %rd112, %rd143; + mul.lo.s64 %rd442, %rd113, %rd143; + .loc 1 37 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:31 + shl.b64 %rd443, %rd427, 1; + add.s64 %rd444, %rd140, %rd443; + shl.b64 %rd445, %rd435, 1; + add.s64 %rd348, %rd444, %rd445; + shl.b64 %rd446, %rd428, 1; + add.s64 %rd447, %rd140, %rd446; + shl.b64 %rd448, %rd436, 1; + add.s64 %rd351, %rd447, %rd448; + shl.b64 %rd449, %rd429, 1; + add.s64 %rd450, %rd140, %rd449; + shl.b64 %rd451, %rd437, 1; + add.s64 %rd354, %rd450, %rd451; + shl.b64 %rd452, %rd430, 1; + add.s64 %rd453, %rd140, %rd452; + shl.b64 %rd454, %rd438, 1; + add.s64 %rd357, %rd453, %rd454; + shl.b64 %rd455, %rd431, 1; + add.s64 %rd456, %rd140, %rd455; + shl.b64 %rd457, %rd439, 1; + add.s64 %rd360, %rd456, %rd457; + shl.b64 %rd458, %rd432, 1; + add.s64 %rd459, %rd140, %rd458; + shl.b64 %rd460, %rd440, 1; + add.s64 %rd363, %rd459, %rd460; + shl.b64 %rd461, %rd433, 1; + add.s64 %rd462, %rd140, %rd461; + shl.b64 %rd463, %rd441, 1; + add.s64 %rd366, %rd462, %rd463; + shl.b64 %rd464, %rd434, 1; + add.s64 %rd465, %rd140, %rd464; + shl.b64 %rd466, %rd442, 1; + add.s64 %rd369, %rd465, %rd466; + .loc 1 37 65 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:65 + // begin inline asm + mov.u64 %rd347, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd347, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs91, %rs64; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd348 + 0 ], %rd347; + // end inline asm + // begin inline asm + mov.u64 %rd350, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd350, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs93, %rs64; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd351 + 0 ], %rd350; + // end inline asm + // begin inline asm + mov.u64 %rd353, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd353, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs95, %rs64; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd354 + 0 ], %rd353; + // end inline asm + // begin inline asm + mov.u64 %rd356, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd356, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs97, %rs64; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd357 + 0 ], %rd356; + // end inline asm + // begin inline asm + mov.u64 %rd359, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd359, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs99, %rs64; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs99 }, [ %rd360 + 0 ], %rd359; + // end inline asm + // begin inline asm + mov.u64 %rd362, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd362, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs101, %rs64; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs101 }, [ %rd363 + 0 ], %rd362; + // end inline asm + // begin inline asm + mov.u64 %rd365, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd365, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs103, %rs64; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs103 }, [ %rd366 + 0 ], %rd365; + // end inline asm + // begin inline asm + mov.u64 %rd368, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd368, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs105, %rs64; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs105 }, [ %rd369 + 0 ], %rd368; + // end inline asm + .loc 1 44 19 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:44:19 + setp.lt.s64 %p146, %rd9, %rd89; + setp.lt.s64 %p147, %rd10, %rd89; + setp.lt.s64 %p148, %rd11, %rd89; + setp.lt.s64 %p149, %rd12, %rd89; + setp.lt.s64 %p150, %rd13, %rd89; + setp.lt.s64 %p151, %rd14, %rd89; + setp.lt.s64 %p152, %rd15, %rd89; + setp.lt.s64 %p153, %rd16, %rd89; + .loc 1 45 37 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:37 + add.s64 %rd467, %rd143, %rd1; + add.s64 %rd468, %rd143, %rd2; + add.s64 %rd469, %rd143, %rd3; + add.s64 %rd470, %rd143, %rd4; + add.s64 %rd471, %rd143, %rd5; + add.s64 %rd472, %rd143, %rd6; + add.s64 %rd473, %rd143, %rd7; + add.s64 %rd474, %rd143, %rd8; + .loc 1 45 42 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:42 + sub.s64 %rd475, %rd467, %rd89; + sub.s64 %rd476, %rd468, %rd89; + sub.s64 %rd477, %rd469, %rd89; + sub.s64 %rd478, %rd470, %rd89; + sub.s64 %rd479, %rd471, %rd89; + sub.s64 %rd480, %rd472, %rd89; + sub.s64 %rd481, %rd473, %rd89; + sub.s64 %rd482, %rd474, %rd89; + .loc 1 45 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:31 + shl.b64 %rd483, %rd475, 1; + add.s64 %rd372, %rd138, %rd483; + shl.b64 %rd484, %rd476, 1; + add.s64 %rd375, %rd138, %rd484; + shl.b64 %rd485, %rd477, 1; + add.s64 %rd378, %rd138, %rd485; + shl.b64 %rd486, %rd478, 1; + add.s64 %rd381, %rd138, %rd486; + shl.b64 %rd487, %rd479, 1; + add.s64 %rd384, %rd138, %rd487; + shl.b64 %rd488, %rd480, 1; + add.s64 %rd387, %rd138, %rd488; + shl.b64 %rd489, %rd481, 1; + add.s64 %rd390, %rd138, %rd489; + shl.b64 %rd490, %rd482, 1; + add.s64 %rd393, %rd138, %rd490; + .loc 1 45 60 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:60 + // begin inline asm + mov.u64 %rd371, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd371, 1.0; + // end inline asm + .loc 1 45 68 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:68 + and.pred %p24, %p8, %p153; + and.pred %p23, %p7, %p152; + and.pred %p22, %p6, %p151; + and.pred %p21, %p5, %p150; + and.pred %p20, %p4, %p149; + and.pred %p19, %p3, %p148; + and.pred %p18, %p2, %p147; + and.pred %p17, %p1, %p146; + .loc 1 45 60 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:60 + // begin inline asm + mov.u16 %rs107, %rs64; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs107 }, [ %rd372 + 0 ], %rd371; + // end inline asm + // begin inline asm + mov.u64 %rd374, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd374, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs109, %rs64; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs109 }, [ %rd375 + 0 ], %rd374; + // end inline asm + // begin inline asm + mov.u64 %rd377, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd377, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs111, %rs64; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs111 }, [ %rd378 + 0 ], %rd377; + // end inline asm + // begin inline asm + mov.u64 %rd380, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd380, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs113, %rs64; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs113 }, [ %rd381 + 0 ], %rd380; + // end inline asm + // begin inline asm + mov.u64 %rd383, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd383, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs115, %rs64; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs115 }, [ %rd384 + 0 ], %rd383; + // end inline asm + // begin inline asm + mov.u64 %rd386, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd386, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs117, %rs64; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs117 }, [ %rd387 + 0 ], %rd386; + // end inline asm + // begin inline asm + mov.u64 %rd389, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd389, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs119, %rs64; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs119 }, [ %rd390 + 0 ], %rd389; + // end inline asm + // begin inline asm + mov.u64 %rd392, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd392, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs121, %rs64; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs121 }, [ %rd393 + 0 ], %rd392; + // end inline asm + .loc 1 46 36 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:46:36 + // begin inline asm + mov.u64 %rd395, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd395, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd396, 0x0; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd396 }, [ %rd261 + 0 ], %rd395; + // end inline asm + // begin inline asm + mov.u64 %rd399, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd399, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd400, 0x0; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd400 }, [ %rd265 + 0 ], %rd399; + // end inline asm + // begin inline asm + mov.u64 %rd403, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd403, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd404, 0x0; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd404 }, [ %rd269 + 0 ], %rd403; + // end inline asm + // begin inline asm + mov.u64 %rd407, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd407, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd408, 0x0; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd408 }, [ %rd273 + 0 ], %rd407; + // end inline asm + // begin inline asm + mov.u64 %rd411, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd411, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd412, 0x0; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd412 }, [ %rd277 + 0 ], %rd411; + // end inline asm + // begin inline asm + mov.u64 %rd415, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd415, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd416, 0x0; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd416 }, [ %rd281 + 0 ], %rd415; + // end inline asm + // begin inline asm + mov.u64 %rd419, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd419, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd420, 0x0; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd420 }, [ %rd285 + 0 ], %rd419; + // end inline asm + // begin inline asm + mov.u64 %rd423, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd423, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd424, 0x0; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd424 }, [ %rd289 + 0 ], %rd423; + // end inline asm + .loc 1 50 35 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:50:35 + shr.s64 %rd491, %rd404, 63; + and.b64 %rd492, %rd491, %rd145; + shr.s64 %rd493, %rd408, 63; + and.b64 %rd494, %rd493, %rd145; + shr.s64 %rd495, %rd396, 63; + and.b64 %rd496, %rd495, %rd145; + shr.s64 %rd497, %rd400, 63; + and.b64 %rd498, %rd497, %rd145; + shr.s64 %rd499, %rd420, 63; + and.b64 %rd500, %rd499, %rd145; + shr.s64 %rd501, %rd424, 63; + and.b64 %rd502, %rd501, %rd145; + shr.s64 %rd503, %rd412, 63; + and.b64 %rd504, %rd503, %rd145; + shr.s64 %rd505, %rd416, 63; + and.b64 %rd506, %rd505, %rd145; + add.s64 %rd127, %rd506, %rd416; + add.s64 %rd126, %rd504, %rd412; + add.s64 %rd129, %rd502, %rd424; + add.s64 %rd128, %rd500, %rd420; + add.s64 %rd123, %rd498, %rd400; + add.s64 %rd122, %rd496, %rd396; + add.s64 %rd125, %rd494, %rd408; + add.s64 %rd124, %rd492, %rd404; + .loc 1 51 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:28 + setp.lt.s64 %p154, %rd124, 0; + setp.lt.s64 %p155, %rd125, 0; + setp.lt.s64 %p156, %rd122, 0; + setp.lt.s64 %p157, %rd123, 0; + setp.lt.s64 %p158, %rd128, 0; + setp.lt.s64 %p159, %rd129, 0; + setp.lt.s64 %p160, %rd126, 0; + setp.lt.s64 %p161, %rd127, 0; + .loc 1 51 100 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:100 + setp.ge.s64 %p162, %rd124, %rd145; + setp.ge.s64 %p163, %rd125, %rd145; + setp.ge.s64 %p164, %rd122, %rd145; + setp.ge.s64 %p165, %rd123, %rd145; + setp.ge.s64 %p166, %rd128, %rd145; + setp.ge.s64 %p167, %rd129, %rd145; + setp.ge.s64 %p168, %rd126, %rd145; + setp.ge.s64 %p169, %rd127, %rd145; + .loc 1 51 65 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:65 + or.pred %p170, %p161, %p169; + or.pred %p171, %p160, %p168; + or.pred %p172, %p159, %p167; + or.pred %p173, %p158, %p166; + or.pred %p174, %p157, %p165; + or.pred %p175, %p156, %p164; + or.pred %p176, %p155, %p163; + or.pred %p177, %p154, %p162; + .loc 1 51 108 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:108 + and.pred %p178, %p19, %p177; + selp.b16 %rs123, 1, 0, %p178; + shl.b16 %rs124, %rs123, 2; + and.pred %p179, %p20, %p176; + selp.b16 %rs125, -1, 0, %p179; + shl.b16 %rs126, %rs125, 3; + or.b16 %rs127, %rs126, %rs124; + and.pred %p180, %p17, %p175; + selp.b16 %rs128, 1, 0, %p180; + and.pred %p181, %p18, %p174; + selp.b16 %rs129, -1, 0, %p181; + shl.b16 %rs130, %rs129, 1; + or.b16 %rs131, %rs128, %rs130; + and.b16 %rs132, %rs131, 3; + or.b16 %rs133, %rs132, %rs127; + and.b16 %rs134, %rs133, 15; + and.pred %p182, %p23, %p173; + selp.b16 %rs135, 1, 0, %p182; + shl.b16 %rs136, %rs135, 2; + and.pred %p183, %p24, %p172; + selp.b16 %rs137, -1, 0, %p183; + shl.b16 %rs138, %rs137, 3; + or.b16 %rs139, %rs138, %rs136; + and.pred %p184, %p21, %p171; + selp.b16 %rs140, 1, 0, %p184; + and.pred %p185, %p22, %p170; + selp.b16 %rs141, -1, 0, %p185; + shl.b16 %rs142, %rs141, 1; + or.b16 %rs143, %rs140, %rs142; + and.b16 %rs144, %rs143, 3; + or.b16 %rs145, %rs144, %rs139; + shl.b16 %rs146, %rs145, 4; + or.b16 %rs147, %rs134, %rs146; + .loc 1 51 126 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:126 + and.b16 %rs148, %rs147, 255; + setp.eq.b16 %p186, %rs148, 0; + @%p186 bra $L__BB0_52; + bra.uni $L__BB0_51; +$L__BB0_52: + .loc 1 0 126 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0:126 + ld.param.b64 %rd146, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_8]; + .loc 1 51 126 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:126 + bar.sync 0; + .loc 1 52 37 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:37 + sub.s64 %rd537, %rd143, %rd89; + .loc 1 52 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:64 + mul.lo.s64 %rd538, %rd122, %rd143; + mul.lo.s64 %rd539, %rd123, %rd143; + mul.lo.s64 %rd540, %rd124, %rd143; + mul.lo.s64 %rd541, %rd125, %rd143; + mul.lo.s64 %rd542, %rd126, %rd143; + mul.lo.s64 %rd543, %rd127, %rd143; + mul.lo.s64 %rd544, %rd128, %rd143; + mul.lo.s64 %rd545, %rd129, %rd143; + .loc 1 52 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:31 + shl.b64 %rd546, %rd537, 1; + add.s64 %rd547, %rd140, %rd546; + shl.b64 %rd548, %rd9, 1; + add.s64 %rd549, %rd547, %rd548; + shl.b64 %rd550, %rd538, 1; + add.s64 %rd514, %rd549, %rd550; + shl.b64 %rd551, %rd10, 1; + add.s64 %rd552, %rd547, %rd551; + shl.b64 %rd553, %rd539, 1; + add.s64 %rd517, %rd552, %rd553; + shl.b64 %rd554, %rd11, 1; + add.s64 %rd555, %rd547, %rd554; + shl.b64 %rd556, %rd540, 1; + add.s64 %rd520, %rd555, %rd556; + shl.b64 %rd557, %rd12, 1; + add.s64 %rd558, %rd547, %rd557; + shl.b64 %rd559, %rd541, 1; + add.s64 %rd523, %rd558, %rd559; + shl.b64 %rd560, %rd13, 1; + add.s64 %rd561, %rd547, %rd560; + shl.b64 %rd562, %rd542, 1; + add.s64 %rd526, %rd561, %rd562; + shl.b64 %rd563, %rd14, 1; + add.s64 %rd564, %rd547, %rd563; + shl.b64 %rd565, %rd543, 1; + add.s64 %rd529, %rd564, %rd565; + shl.b64 %rd566, %rd15, 1; + add.s64 %rd567, %rd547, %rd566; + shl.b64 %rd568, %rd544, 1; + add.s64 %rd532, %rd567, %rd568; + shl.b64 %rd569, %rd16, 1; + add.s64 %rd570, %rd547, %rd569; + shl.b64 %rd571, %rd545, 1; + add.s64 %rd535, %rd570, %rd571; + .loc 1 52 72 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:72 + // begin inline asm + mov.u64 %rd513, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd513, 1.0; + // end inline asm + mov.b16 %rs150, 0; + // begin inline asm + mov.u16 %rs149, %rs150; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs149 }, [ %rd514 + 0 ], %rd513; + // end inline asm + // begin inline asm + mov.u64 %rd516, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd516, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs151, %rs150; + @%p18 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs151 }, [ %rd517 + 0 ], %rd516; + // end inline asm + // begin inline asm + mov.u64 %rd519, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd519, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs153, %rs150; + @%p19 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs153 }, [ %rd520 + 0 ], %rd519; + // end inline asm + // begin inline asm + mov.u64 %rd522, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd522, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs155, %rs150; + @%p20 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs155 }, [ %rd523 + 0 ], %rd522; + // end inline asm + // begin inline asm + mov.u64 %rd525, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd525, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs157, %rs150; + @%p21 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs157 }, [ %rd526 + 0 ], %rd525; + // end inline asm + // begin inline asm + mov.u64 %rd528, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd528, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs159, %rs150; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs159 }, [ %rd529 + 0 ], %rd528; + // end inline asm + // begin inline asm + mov.u64 %rd531, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd531, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs161, %rs150; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs161 }, [ %rd532 + 0 ], %rd531; + // end inline asm + // begin inline asm + mov.u64 %rd534, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd534, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs163, %rs150; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs163 }, [ %rd535 + 0 ], %rd534; + // end inline asm + .loc 1 61 35 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:61:35 + shr.s64 %rd572, %rd212, 63; + and.b64 %rd573, %rd572, %rd146; + shr.s64 %rd574, %rd216, 63; + and.b64 %rd575, %rd574, %rd146; + shr.s64 %rd576, %rd204, 63; + and.b64 %rd577, %rd576, %rd146; + shr.s64 %rd578, %rd208, 63; + and.b64 %rd579, %rd578, %rd146; + shr.s64 %rd580, %rd228, 63; + and.b64 %rd581, %rd580, %rd146; + shr.s64 %rd582, %rd232, 63; + and.b64 %rd583, %rd582, %rd146; + shr.s64 %rd584, %rd220, 63; + and.b64 %rd585, %rd584, %rd146; + shr.s64 %rd586, %rd224, 63; + and.b64 %rd587, %rd586, %rd146; + add.s64 %rd135, %rd587, %rd224; + add.s64 %rd134, %rd585, %rd220; + add.s64 %rd137, %rd583, %rd232; + add.s64 %rd136, %rd581, %rd228; + add.s64 %rd131, %rd579, %rd208; + add.s64 %rd130, %rd577, %rd204; + add.s64 %rd133, %rd575, %rd216; + add.s64 %rd132, %rd573, %rd212; + .loc 1 62 28 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:28 + setp.lt.s64 %p203, %rd132, 0; + setp.lt.s64 %p204, %rd133, 0; + setp.lt.s64 %p205, %rd130, 0; + setp.lt.s64 %p206, %rd131, 0; + setp.lt.s64 %p207, %rd136, 0; + setp.lt.s64 %p208, %rd137, 0; + setp.lt.s64 %p209, %rd134, 0; + setp.lt.s64 %p210, %rd135, 0; + .loc 1 62 46 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:46 + setp.ge.s64 %p211, %rd132, %rd146; + setp.ge.s64 %p212, %rd133, %rd146; + setp.ge.s64 %p213, %rd130, %rd146; + setp.ge.s64 %p214, %rd131, %rd146; + setp.ge.s64 %p215, %rd136, %rd146; + setp.ge.s64 %p216, %rd137, %rd146; + setp.ge.s64 %p217, %rd134, %rd146; + setp.ge.s64 %p218, %rd135, %rd146; + .loc 1 62 38 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:38 + or.pred %p219, %p210, %p218; + or.pred %p220, %p209, %p217; + or.pred %p221, %p208, %p216; + or.pred %p222, %p207, %p215; + or.pred %p223, %p206, %p214; + or.pred %p224, %p205, %p213; + or.pred %p225, %p204, %p212; + or.pred %p226, %p203, %p211; + .loc 1 62 54 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:54 + and.pred %p227, %p3, %p226; + selp.b16 %rs165, 1, 0, %p227; + shl.b16 %rs166, %rs165, 2; + and.pred %p228, %p4, %p225; + selp.b16 %rs167, -1, 0, %p228; + shl.b16 %rs168, %rs167, 3; + or.b16 %rs169, %rs168, %rs166; + and.pred %p229, %p1, %p224; + selp.b16 %rs170, 1, 0, %p229; + and.pred %p230, %p2, %p223; + selp.b16 %rs171, -1, 0, %p230; + shl.b16 %rs172, %rs171, 1; + or.b16 %rs173, %rs170, %rs172; + and.b16 %rs174, %rs173, 3; + or.b16 %rs175, %rs174, %rs169; + and.b16 %rs176, %rs175, 15; + and.pred %p231, %p7, %p222; + selp.b16 %rs177, 1, 0, %p231; + shl.b16 %rs178, %rs177, 2; + and.pred %p232, %p8, %p221; + selp.b16 %rs179, -1, 0, %p232; + shl.b16 %rs180, %rs179, 3; + or.b16 %rs181, %rs180, %rs178; + and.pred %p233, %p5, %p220; + selp.b16 %rs182, 1, 0, %p233; + and.pred %p234, %p6, %p219; + selp.b16 %rs183, -1, 0, %p234; + shl.b16 %rs184, %rs183, 1; + or.b16 %rs185, %rs182, %rs184; + and.b16 %rs186, %rs185, 3; + or.b16 %rs187, %rs186, %rs181; + shl.b16 %rs188, %rs187, 4; + or.b16 %rs189, %rs176, %rs188; + .loc 1 62 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:64 + and.b16 %rs190, %rs189, 255; + setp.eq.b16 %p235, %rs190, 0; + @%p235 bra $L__BB0_54; + bra.uni $L__BB0_53; +$L__BB0_54: + .loc 1 0 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0:64 + ld.param.b64 %rd142, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_4]; + ld.param.b64 %rd141, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_3]; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r70, %rs63; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r71, %rs105; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r72, %r70; + fma.rn.f32 %r73, %r72, %r71, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r74, %r73, 0f00000000, %p80; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r75, %rs121; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r76, %rs163; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r77, %r75, %r76; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r78, %r77, 0f00000000, %p153; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r79, %r74, %r78; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r80, %rs61; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r81, %rs103; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r82, %r80; + fma.rn.f32 %r83, %r82, %r81, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r84, %r83, 0f00000000, %p79; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r85, %rs119; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r86, %rs161; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r87, %r85, %r86; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r88, %r87, 0f00000000, %p152; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r89, %r84, %r88; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r90, %rs59; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r91, %rs101; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r92, %r90; + fma.rn.f32 %r93, %r92, %r91, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r94, %r93, 0f00000000, %p78; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r95, %rs117; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r96, %rs159; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r97, %r95, %r96; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r98, %r97, 0f00000000, %p151; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r99, %r94, %r98; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r100, %rs57; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r101, %rs99; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r102, %r100; + fma.rn.f32 %r103, %r102, %r101, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r104, %r103, 0f00000000, %p77; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r105, %rs115; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r106, %rs157; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r107, %r105, %r106; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r108, %r107, 0f00000000, %p150; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r109, %r104, %r108; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r110, %rs55; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r111, %rs97; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r112, %r110; + fma.rn.f32 %r113, %r112, %r111, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r114, %r113, 0f00000000, %p76; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r115, %rs113; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r116, %rs155; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r117, %r115, %r116; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r118, %r117, 0f00000000, %p149; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r119, %r114, %r118; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r120, %rs53; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r121, %rs95; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r122, %r120; + fma.rn.f32 %r123, %r122, %r121, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r124, %r123, 0f00000000, %p75; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r125, %rs111; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r126, %rs153; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r127, %r125, %r126; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r128, %r127, 0f00000000, %p148; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r129, %r124, %r128; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r130, %rs51; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r131, %rs93; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r132, %r130; + fma.rn.f32 %r133, %r132, %r131, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r134, %r133, 0f00000000, %p74; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r135, %rs109; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r136, %rs151; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r137, %r135, %r136; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r138, %r137, 0f00000000, %p147; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r139, %r134, %r138; + .loc 1 30 111 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:30:111 + cvt.f32.bf16 %r140, %rs49; + .loc 1 37 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:37:123 + cvt.f32.bf16 %r141, %rs91; + .loc 1 39 13 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:39:13 + neg.f32 %r142, %r140; + fma.rn.f32 %r143, %r142, %r141, 0f00000000; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r144, %r143, 0f00000000, %p73; + .loc 1 45 119 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:45:119 + cvt.f32.bf16 %r145, %rs107; + .loc 1 52 131 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:52:131 + cvt.f32.bf16 %r146, %rs149; + .loc 1 53 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:53:20 + mul.f32 %r147, %r145, %r146; + .loc 1 0 0 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:0 + selp.f32 %r148, %r147, 0f00000000, %p146; + .loc 1 57 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:57:20 + add.f32 %r149, %r144, %r148; + .loc 1 25 76 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:25:76 + cvt.f32.bf16 %r150, %rs48; + cvt.f32.bf16 %r151, %rs47; + cvt.f32.bf16 %r152, %rs46; + cvt.f32.bf16 %r153, %rs45; + cvt.f32.bf16 %r154, %rs44; + cvt.f32.bf16 %r155, %rs43; + cvt.f32.bf16 %r156, %rs42; + cvt.f32.bf16 %r157, %rs41; + .loc 1 62 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:64 + bar.sync 0; + .loc 1 63 40 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:63:40 + mul.lo.s64 %rd626, %rd130, %rd143; + mul.lo.s64 %rd627, %rd131, %rd143; + mul.lo.s64 %rd628, %rd132, %rd143; + mul.lo.s64 %rd629, %rd133, %rd143; + mul.lo.s64 %rd630, %rd134, %rd143; + mul.lo.s64 %rd631, %rd135, %rd143; + mul.lo.s64 %rd632, %rd136, %rd143; + mul.lo.s64 %rd633, %rd137, %rd143; + .loc 1 63 31 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:63:31 + add.s64 %rd635, %rd141, %rd548; + shl.b64 %rd636, %rd626, 1; + add.s64 %rd595, %rd635, %rd636; + add.s64 %rd638, %rd141, %rd551; + shl.b64 %rd639, %rd627, 1; + add.s64 %rd598, %rd638, %rd639; + add.s64 %rd641, %rd141, %rd554; + shl.b64 %rd642, %rd628, 1; + add.s64 %rd601, %rd641, %rd642; + add.s64 %rd644, %rd141, %rd557; + shl.b64 %rd645, %rd629, 1; + add.s64 %rd604, %rd644, %rd645; + add.s64 %rd647, %rd141, %rd560; + shl.b64 %rd648, %rd630, 1; + add.s64 %rd607, %rd647, %rd648; + add.s64 %rd650, %rd141, %rd563; + shl.b64 %rd651, %rd631, 1; + add.s64 %rd610, %rd650, %rd651; + add.s64 %rd653, %rd141, %rd566; + shl.b64 %rd654, %rd632, 1; + add.s64 %rd613, %rd653, %rd654; + add.s64 %rd656, %rd141, %rd569; + shl.b64 %rd657, %rd633, 1; + add.s64 %rd616, %rd656, %rd657; + .loc 1 63 48 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:63:48 + // begin inline asm + mov.u64 %rd596, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd596, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs191, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs191 }, [ %rd595 + 0 ], %rd596; + // end inline asm + // begin inline asm + mov.u64 %rd599, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd599, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs192, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs192 }, [ %rd598 + 0 ], %rd599; + // end inline asm + // begin inline asm + mov.u64 %rd602, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd602, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs193, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs193 }, [ %rd601 + 0 ], %rd602; + // end inline asm + // begin inline asm + mov.u64 %rd605, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd605, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs194, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs194 }, [ %rd604 + 0 ], %rd605; + // end inline asm + // begin inline asm + mov.u64 %rd608, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd608, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs195, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs195 }, [ %rd607 + 0 ], %rd608; + // end inline asm + // begin inline asm + mov.u64 %rd611, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd611, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs196, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs196 }, [ %rd610 + 0 ], %rd611; + // end inline asm + // begin inline asm + mov.u64 %rd614, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd614, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs197, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs197 }, [ %rd613 + 0 ], %rd614; + // end inline asm + // begin inline asm + mov.u64 %rd617, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd617, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs198, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs198 }, [ %rd616 + 0 ], %rd617; + // end inline asm + .loc 1 63 88 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:63:88 + cvt.f32.bf16 %r158, %rs191; + cvt.f32.bf16 %r159, %rs192; + cvt.f32.bf16 %r160, %rs193; + cvt.f32.bf16 %r161, %rs194; + cvt.f32.bf16 %r162, %rs195; + cvt.f32.bf16 %r163, %rs196; + cvt.f32.bf16 %r164, %rs197; + cvt.f32.bf16 %r165, %rs198; + .loc 1 65 20 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:65:20 + fma.rn.f32 %r166, %r157, %r158, %r149; + fma.rn.f32 %r167, %r156, %r159, %r139; + fma.rn.f32 %r168, %r155, %r160, %r129; + fma.rn.f32 %r169, %r154, %r161, %r119; + fma.rn.f32 %r170, %r153, %r162, %r109; + fma.rn.f32 %r171, %r152, %r163, %r99; + fma.rn.f32 %r172, %r151, %r164, %r89; + fma.rn.f32 %r173, %r150, %r165, %r79; + .loc 1 66 25 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:66:25 + add.s64 %rd618, %rd142, %rd291; + add.s64 %rd619, %rd142, %rd292; + add.s64 %rd620, %rd142, %rd293; + add.s64 %rd621, %rd142, %rd294; + add.s64 %rd622, %rd142, %rd295; + add.s64 %rd623, %rd142, %rd296; + add.s64 %rd624, %rd142, %rd297; + add.s64 %rd625, %rd142, %rd298; + .loc 1 66 37 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:66:37 + cvt.rn.bf16.f32 %rs199, %r166; + cvt.rn.bf16.f32 %rs200, %r167; + cvt.rn.bf16.f32 %rs201, %r168; + cvt.rn.bf16.f32 %rs202, %r169; + cvt.rn.bf16.f32 %rs203, %r170; + cvt.rn.bf16.f32 %rs204, %r171; + cvt.rn.bf16.f32 %rs205, %r172; + cvt.rn.bf16.f32 %rs206, %r173; + // begin inline asm + @%p1 st.global.b16 [ %rd618 + 0 ], { %rs199 }; + // end inline asm + // begin inline asm + @%p2 st.global.b16 [ %rd619 + 0 ], { %rs200 }; + // end inline asm + // begin inline asm + @%p3 st.global.b16 [ %rd620 + 0 ], { %rs201 }; + // end inline asm + // begin inline asm + @%p4 st.global.b16 [ %rd621 + 0 ], { %rs202 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd622 + 0 ], { %rs203 }; + // end inline asm + // begin inline asm + @%p6 st.global.b16 [ %rd623 + 0 ], { %rs204 }; + // end inline asm + // begin inline asm + @%p7 st.global.b16 [ %rd624 + 0 ], { %rs205 }; + // end inline asm + // begin inline asm + @%p8 st.global.b16 [ %rd625 + 0 ], { %rs206 }; + // end inline asm + .loc 1 66 4 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:66:4 + ret; +$L__BB0_49: + .loc 1 36 123 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:36:123 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd341, assertFunc_0; + cvta.global.u64 %rd342, %rd341; + st.param.b64 [param3], %rd342; + mov.b64 %rd343, assertFile_0; + cvta.global.u64 %rd344, %rd343; + st.param.b64 [param1], %rd344; + mov.b64 %rd345, assertMessage_0; + cvta.global.u64 %rd346, %rd345; + st.param.b64 [param0], %rd346; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_51: + .loc 1 51 126 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd507, assertFunc_1; + cvta.global.u64 %rd508, %rd507; + st.param.b64 [param3], %rd508; + mov.b64 %rd509, assertFile_1; + cvta.global.u64 %rd510, %rd509; + st.param.b64 [param1], %rd510; + mov.b64 %rd511, assertMessage_1; + cvta.global.u64 %rd512, %rd511; + st.param.b64 [param0], %rd512; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_53: + .loc 1 62 64 // cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py:62:64 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd588, assertFunc_2; + cvta.global.u64 %rd589, %rd588; + st.param.b64 [param3], %rd589; + mov.b64 %rd590, assertFile_2; + cvta.global.u64 %rd591, %rd590; + st.param.b64 [param1], %rd591; + mov.b64 %rd592, assertMessage_2; + cvta.global.u64 %rd593, %rd592; + st.param.b64 [param0], %rd593; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 122 +.b8 51 +.b8 105 +.b8 111 +.b8 55 +.b8 119 +.b8 53 +.b8 117 +.b8 121 +.b8 102 +.b8 114 +.b8 109 +.b8 102 +.b8 113 +.b8 118 +.b8 109 +.b8 103 +.b8 50 +.b8 107 +.b8 116 +.b8 50 +.b8 97 +.b8 121 +.b8 54 +.b8 54 +.b8 113 +.b8 118 +.b8 52 +.b8 99 +.b8 107 +.b8 119 +.b8 116 +.b8 121 +.b8 117 +.b8 114 +.b8 104 +.b8 105 +.b8 107 +.b8 51 +.b8 102 +.b8 114 +.b8 113 +.b8 55 +.b8 102 +.b8 113 +.b8 110 +.b8 107 +.b8 55 +.b8 103 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 122 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source new file mode 100644 index 0000000000000000000000000000000000000000..1750a42ce20412be5d3dc753237d994fcb821d7b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source @@ -0,0 +1,419 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":18:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("in_ptr3"(#loc)) +#loc113 = loc("out_ptr0"(#loc)) +#loc114 = loc("ks0"(#loc)) +#loc115 = loc("ks1"(#loc)) +#loc116 = loc("ks2"(#loc)) +#loc117 = loc("ks3"(#loc)) +#loc118 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc119) + %xoffset_0 = arith.constant 1024 : i32 loc(#loc120) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc120) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc121) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc122) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc123) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<1024xi32> loc(#loc123) + %x0 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc124) + %x0_6 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc124) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<1024xi64> loc(#loc124) + %x1 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc125) + %x1_8 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc125) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<1024xi64> loc(#loc125) + %x1_10 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc126) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<1024xi64> loc(#loc126) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc127) + %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc127) + %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc128) + %tmp31_14 = arith.extf %tmp31_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc129) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc130) + %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc130) + %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc131) + %tmp1 = arith.constant 2 : i32 loc(#loc132) + %tmp1_17 = arith.constant 2 : i64 loc(#loc132) + %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132) + %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<1024xi64> loc(#loc133) + %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<1024xi64> loc(#loc133) + %tmp3 = arith.constant 2 : i32 loc(#loc134) + %tmp3_20 = arith.constant 2 : i64 loc(#loc134) + %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134) + %tmp3_22 = arith.constant -1 : i32 loc(#loc135) + %tmp3_23 = arith.constant -1 : i64 loc(#loc135) + %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135) + %tmp3_25 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc136) + %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<1024xi64> loc(#loc136) + %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<1024xi64> loc(#loc136) + %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc137) + %tmp3_29 = tt.addptr %tmp3_28, %tmp3_27 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc137) + %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc138) + %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc139) + %tmp3_33 = arith.truncf %tmp3_32 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc139) + %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc139) + %tmp3_35 = arith.extf %tmp3_34 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc140) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc141) + %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc141) + %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc142) + %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc143) + %tmp4_40 = arith.fptosi %tmp4_39 : tensor<1024xf32> to tensor<1024xi64> loc(#loc143) + %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc143) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc144) + %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<1024xi64> loc(#loc145) + %tmp7 = arith.constant 0 : i32 loc(#loc146) + %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146) + %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<1024xi64> loc(#loc146) + %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<1024xi64> loc(#loc146) + %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<1024xi1>, tensor<1024xi64> loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc30) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30) + %1 = tt.splat %0 : i64 -> tensor<1024xi64> loc(#loc30) + %2 = arith.cmpi sle, %1, %tmp8 : tensor<1024xi64> loc(#loc30) + %3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc31) + %4 = arith.cmpi slt, %tmp8, %3 : tensor<1024xi64> loc(#loc31) + %5 = arith.andi %2, %4 : tensor<1024xi1> loc(#loc32) + %6 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc33) + %true = arith.constant true loc(#loc34) + %cst = arith.constant dense : tensor<1024xi1> loc(#loc34) + %7 = arith.xori %6, %cst : tensor<1024xi1> loc(#loc34) + %8 = arith.ori %5, %7 : tensor<1024xi1> loc(#loc35) + tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc36) + %tmp10 = arith.constant 2 : i32 loc(#loc148) + %tmp10_45 = arith.constant 2 : i64 loc(#loc148) + %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148) + %tmp10_47 = arith.constant -1 : i32 loc(#loc149) + %tmp10_48 = arith.constant -1 : i64 loc(#loc149) + %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149) + %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<1024xi64> loc(#loc150) + %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<1024xi64> loc(#loc150) + %tmp10_52 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc151) + %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<1024xi64> loc(#loc151) + %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<1024xi64> loc(#loc152) + %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc153) + %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc153) + %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<1024xi1> loc(#loc154) + %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc155) + %tmp10_60 = arith.truncf %tmp10_59 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc155) + %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc155) + %tmp10_62 = arith.extf %tmp10_61 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc156) + %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<1024xf32> loc(#loc157) + %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc158) + %tmp12_64 = arith.subf %tmp12_63, %tmp11 : tensor<1024xf32> loc(#loc158) + %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159) + %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc159) + %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<1024xi1>, tensor<1024xf32> loc(#loc160) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp16 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc162) + %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<1024xi1>, tensor<1024xf32> loc(#loc162) + %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<1024xi64> loc(#loc163) + %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<1024xi64> loc(#loc163) + %tmp18 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc164) + %tmp18_68 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc164) + %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<1024xi64> loc(#loc164) + %tmp18_70 = arith.constant 2 : i32 loc(#loc165) + %tmp18_71 = arith.constant 2 : i64 loc(#loc165) + %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165) + %tmp18_73 = arith.constant -1 : i32 loc(#loc166) + %tmp18_74 = arith.constant -1 : i64 loc(#loc166) + %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166) + %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<1024xi64> loc(#loc167) + %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<1024xi64> loc(#loc167) + %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc168) + %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc168) + %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc169) + %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170) + %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc170) + %tmp18_83 = arith.truncf %tmp18_82 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc170) + %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc170) + %tmp18_85 = arith.extf %tmp18_84 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc171) + %tmp19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc172) + %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc172) + %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc173) + %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc174) + %tmp19_90 = arith.fptosi %tmp19_89 : tensor<1024xf32> to tensor<1024xi64> loc(#loc174) + %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc174) + %tmp20 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc175) + %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<1024xi64> loc(#loc176) + %tmp22 = arith.constant 0 : i32 loc(#loc177) + %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177) + %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<1024xi64> loc(#loc177) + %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<1024xi64> loc(#loc177) + %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<1024xi1>, tensor<1024xi64> loc(#loc178) + %c0_i32_95 = arith.constant 0 : i32 loc(#loc68) + %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68) + %10 = tt.splat %9 : i64 -> tensor<1024xi64> loc(#loc68) + %11 = arith.cmpi sle, %10, %tmp23 : tensor<1024xi64> loc(#loc68) + %12 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc69) + %13 = arith.cmpi slt, %tmp23, %12 : tensor<1024xi64> loc(#loc69) + %14 = arith.andi %11, %13 : tensor<1024xi1> loc(#loc70) + %15 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc71) + %true_96 = arith.constant true loc(#loc72) + %cst_97 = arith.constant dense : tensor<1024xi1> loc(#loc72) + %16 = arith.xori %15, %cst_97 : tensor<1024xi1> loc(#loc72) + %17 = arith.ori %14, %16 : tensor<1024xi1> loc(#loc73) + tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc74) + %tmp25 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc179) + %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<1024xi64> loc(#loc179) + %tmp25_99 = arith.constant 2 : i32 loc(#loc180) + %tmp25_100 = arith.constant 2 : i64 loc(#loc180) + %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180) + %tmp25_102 = arith.constant -1 : i32 loc(#loc181) + %tmp25_103 = arith.constant -1 : i64 loc(#loc181) + %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181) + %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<1024xi64> loc(#loc182) + %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<1024xi64> loc(#loc182) + %tmp25_107 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc183) + %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<1024xi64> loc(#loc183) + %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<1024xi64> loc(#loc184) + %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc185) + %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc185) + %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<1024xi1> loc(#loc186) + %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187) + %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc187) + %tmp25_115 = arith.truncf %tmp25_114 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc187) + %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc187) + %tmp25_117 = arith.extf %tmp25_116 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc188) + %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<1024xf32> loc(#loc189) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc190) + %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<1024xi1>, tensor<1024xf32> loc(#loc191) + %tmp29 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc192) + %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<1024xi1>, tensor<1024xf32> loc(#loc192) + %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<1024xf32> loc(#loc193) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc194) + %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<1024xi64> loc(#loc194) + %tmp35 = arith.constant 0 : i32 loc(#loc195) + %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195) + %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<1024xi64> loc(#loc195) + %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<1024xi64> loc(#loc195) + %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc196) + %c0_i32_124 = arith.constant 0 : i32 loc(#loc93) + %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93) + %19 = tt.splat %18 : i64 -> tensor<1024xi64> loc(#loc93) + %20 = arith.cmpi sle, %19, %tmp36 : tensor<1024xi64> loc(#loc93) + %21 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc94) + %22 = arith.cmpi slt, %tmp36, %21 : tensor<1024xi64> loc(#loc94) + %23 = arith.andi %20, %22 : tensor<1024xi1> loc(#loc95) + %true_125 = arith.constant true loc(#loc96) + %cst_126 = arith.constant dense : tensor<1024xi1> loc(#loc96) + %24 = arith.xori %xmask_5, %cst_126 : tensor<1024xi1> loc(#loc96) + %25 = arith.ori %23, %24 : tensor<1024xi1> loc(#loc97) + tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1> loc(#loc98) + %tmp38 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc197) + %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<1024xi64> loc(#loc197) + %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<1024xi64> loc(#loc198) + %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc199) + %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc199) + %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc200) + %tmp38_132 = arith.extf %tmp38_131 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc201) + %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<1024xf32> loc(#loc202) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32> loc(#loc203) + %26 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc106) + %27 = tt.addptr %26, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc106) + %28 = arith.truncf %tmp40 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc107) + tt.store %27, %28, %xmask_5 : tensor<1024x!tt.ptr> loc(#loc107) + tt.return loc(#loc108) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:76) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:36) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":28:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":29:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":31:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":31:42) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":31:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":32:32) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":33:18) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":34:18) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":35:32) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:98) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:115) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:108) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:123) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:49) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:54) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:72) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:123) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":38:19) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":39:13) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":40:38) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":41:34) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":42:12) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":43:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":44:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:37) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:55) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:31) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:68) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:60) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:119) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":46:31) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":46:44) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":46:36) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":47:33) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":48:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":49:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":50:35) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:28) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:100) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:65) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:118) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:110) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:108) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:126) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:55) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:48) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:42) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:64) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:60) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:31) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:80) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:72) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:131) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":53:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":54:38) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":55:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":56:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":57:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":59:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":60:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":61:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:28) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:46) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:56) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:54) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:64) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:36) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:31) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:48) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:88) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":64:20) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":65:20) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:4) +#loc119 = loc("xoffset"(#loc1)) +#loc120 = loc("xoffset"(#loc2)) +#loc121 = loc("xindex"(#loc3)) +#loc122 = loc("xindex"(#loc4)) +#loc123 = loc("xmask"(#loc5)) +#loc124 = loc("x0"(#loc6)) +#loc125 = loc("x1"(#loc7)) +#loc126 = loc("x1"(#loc8)) +#loc127 = loc("tmp31"(#loc9)) +#loc128 = loc("tmp31"(#loc10)) +#loc129 = loc("tmp31"(#loc11)) +#loc130 = loc("tmp32"(#loc12)) +#loc131 = loc("tmp32"(#loc13)) +#loc132 = loc("tmp1"(#loc14)) +#loc133 = loc("tmp2"(#loc15)) +#loc134 = loc("tmp3"(#loc16)) +#loc135 = loc("tmp3"(#loc17)) +#loc136 = loc("tmp3"(#loc18)) +#loc137 = loc("tmp3"(#loc19)) +#loc138 = loc("tmp3"(#loc20)) +#loc139 = loc("tmp3"(#loc21)) +#loc140 = loc("tmp3"(#loc22)) +#loc141 = loc("tmp4"(#loc23)) +#loc142 = loc("tmp4"(#loc24)) +#loc143 = loc("tmp4"(#loc25)) +#loc144 = loc("tmp5"(#loc26)) +#loc145 = loc("tmp6"(#loc27)) +#loc146 = loc("tmp7"(#loc28)) +#loc147 = loc("tmp8"(#loc29)) +#loc148 = loc("tmp10"(#loc37)) +#loc149 = loc("tmp10"(#loc38)) +#loc150 = loc("tmp10"(#loc39)) +#loc151 = loc("tmp10"(#loc40)) +#loc152 = loc("tmp10"(#loc41)) +#loc153 = loc("tmp10"(#loc42)) +#loc154 = loc("tmp10"(#loc43)) +#loc155 = loc("tmp10"(#loc44)) +#loc156 = loc("tmp10"(#loc45)) +#loc157 = loc("tmp11"(#loc46)) +#loc158 = loc("tmp12"(#loc47)) +#loc159 = loc("tmp13"(#loc48)) +#loc160 = loc("tmp14"(#loc49)) +#loc161 = loc("tmp15"(#loc50)) +#loc162 = loc("tmp16"(#loc51)) +#loc163 = loc("tmp17"(#loc52)) +#loc164 = loc("tmp18"(#loc53)) +#loc165 = loc("tmp18"(#loc54)) +#loc166 = loc("tmp18"(#loc55)) +#loc167 = loc("tmp18"(#loc56)) +#loc168 = loc("tmp18"(#loc57)) +#loc169 = loc("tmp18"(#loc58)) +#loc170 = loc("tmp18"(#loc59)) +#loc171 = loc("tmp18"(#loc60)) +#loc172 = loc("tmp19"(#loc61)) +#loc173 = loc("tmp19"(#loc62)) +#loc174 = loc("tmp19"(#loc63)) +#loc175 = loc("tmp20"(#loc64)) +#loc176 = loc("tmp21"(#loc65)) +#loc177 = loc("tmp22"(#loc66)) +#loc178 = loc("tmp23"(#loc67)) +#loc179 = loc("tmp25"(#loc75)) +#loc180 = loc("tmp25"(#loc76)) +#loc181 = loc("tmp25"(#loc77)) +#loc182 = loc("tmp25"(#loc78)) +#loc183 = loc("tmp25"(#loc79)) +#loc184 = loc("tmp25"(#loc80)) +#loc185 = loc("tmp25"(#loc81)) +#loc186 = loc("tmp25"(#loc82)) +#loc187 = loc("tmp25"(#loc83)) +#loc188 = loc("tmp25"(#loc84)) +#loc189 = loc("tmp26"(#loc85)) +#loc190 = loc("tmp27"(#loc86)) +#loc191 = loc("tmp28"(#loc87)) +#loc192 = loc("tmp29"(#loc88)) +#loc193 = loc("tmp30"(#loc89)) +#loc194 = loc("tmp34"(#loc90)) +#loc195 = loc("tmp35"(#loc91)) +#loc196 = loc("tmp36"(#loc92)) +#loc197 = loc("tmp38"(#loc99)) +#loc198 = loc("tmp38"(#loc100)) +#loc199 = loc("tmp38"(#loc101)) +#loc200 = loc("tmp38"(#loc102)) +#loc201 = loc("tmp38"(#loc103)) +#loc202 = loc("tmp39"(#loc104)) +#loc203 = loc("tmp40"(#loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7839586e2afb80c3c7f47353476734dce00b573e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<1024xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<1024xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<1024xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<1024xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<1024xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<1024xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<1024xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<1024xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<1024xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<1024xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..681d6924901ef0d832df555047349729423f1442 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1024xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1024xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<1024xi32> to tensor<1024xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<1024xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1>, tensor<1024xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<1024xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<1024xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<1024xi1> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<1024xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<1024xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1>, tensor<1024xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<1024xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<1024xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<1024xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1>, tensor<1024xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<1024xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1>, tensor<1024xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<1024xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<1024xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<1024xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<1024xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dz/cdz3io7w5uyfrmfqvmg2kt2ay66qv4ckwtyurhik3frq7fqnk7gm.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/OAFWLXJYPJRG2R7MPJQB4YL2BEBKAOS7GFDMESWPZ4YQI32MKLHA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/OAFWLXJYPJRG2R7MPJQB4YL2BEBKAOS7GFDMESWPZ4YQI32MKLHA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6017a4c8c8cfbcf28b9dc68f96effd44346b761 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/OAFWLXJYPJRG2R7MPJQB4YL2BEBKAOS7GFDMESWPZ4YQI32MKLHA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"hash": "700b65dd387a626d47ec7a601e617a0902a03a5f3146c24acfcf31046f4c52ce", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..6552d7d783ac4ff2fac17af7c4b1b7244344e3a8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/__grp__log_softmax_backward_kernel.json @@ -0,0 +1 @@ +{"child_paths": {"log_softmax_backward_kernel.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source", "log_softmax_backward_kernel.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir", "log_softmax_backward_kernel.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir", "log_softmax_backward_kernel.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir", "log_softmax_backward_kernel.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx", "log_softmax_backward_kernel.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin", "log_softmax_backward_kernel.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin new file mode 100644 index 0000000000000000000000000000000000000000..aba8281374f0cc56c16c368ea11488f641cde1fb Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..66a0758e7b9c6e3d7165485f360bc678675a9d10 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json @@ -0,0 +1 @@ +{"hash": "815544e5ace0796cfb02bac4b9dec6a2a110a987d2e79f99b0e8bb667807ed7c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "log_softmax_backward_kernel"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir new file mode 100644 index 0000000000000000000000000000000000000000..b1a6c62618b2e35b24ed837485a66d2bfc116e35 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.llir @@ -0,0 +1,913 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @log_softmax_backward_kernel(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) %4, ptr addrspace(1) %5, float %6, ptr addrspace(1) %7, ptr addrspace(1) %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #0 !dbg !4 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %14 = zext nneg i32 %13 to i64, !dbg !8 + %15 = sext i32 %1 to i64, !dbg !9 + %16 = mul nsw i64 %15, %14, !dbg !9 + %17 = getelementptr bfloat, ptr addrspace(1) %0, i64 %16, !dbg !10 + %18 = sext i32 %3 to i64, !dbg !11 + %19 = mul nsw i64 %18, %14, !dbg !11 + %20 = getelementptr float, ptr addrspace(1) %2, i64 %19, !dbg !12 + %21 = getelementptr i1, ptr addrspace(1) %4, i64 %14, !dbg !13 + %22 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b8 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %21) #5, !dbg !14 + %.not = icmp eq i8 %22, 0, !dbg !14 + br i1 %.not, label %23, label %46, !dbg !15 + +23: ; preds = %12 + %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !16 + %25 = shl nuw nsw i32 %24, 3, !dbg !16 + %26 = icmp sgt i32 %9, 0, !dbg !17 + br i1 %26, label %.lr.ph6, label %.loopexit, !dbg !17 + +.lr.ph6: ; preds = %23, %.lr.ph6 + %27 = phi i32 [ %44, %.lr.ph6 ], [ 0, %23 ] + %28 = or disjoint i32 %27, %25, !dbg !18 + %29 = or disjoint i32 %28, 8192, !dbg !18 + %30 = or disjoint i32 %28, 16384, !dbg !18 + %31 = or disjoint i32 %28, 24576, !dbg !18 + %32 = icmp slt i32 %28, %9, !dbg !19 + %33 = icmp slt i32 %29, %9, !dbg !19 + %34 = icmp slt i32 %30, %9, !dbg !19 + %35 = icmp slt i32 %31, %9, !dbg !19 + %36 = sext i32 %28 to i64, !dbg !20 + %37 = getelementptr bfloat, ptr addrspace(1) %17, i64 %36, !dbg !20 + %38 = sext i32 %29 to i64, !dbg !20 + %39 = getelementptr bfloat, ptr addrspace(1) %17, i64 %38, !dbg !20 + %40 = sext i32 %30 to i64, !dbg !20 + %41 = getelementptr bfloat, ptr addrspace(1) %17, i64 %40, !dbg !20 + %42 = sext i32 %31 to i64, !dbg !20 + %43 = getelementptr bfloat, ptr addrspace(1) %17, i64 %42, !dbg !20 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %37, i1 %32) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i1 %33) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %41, i1 %34) #5, !dbg !21 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %43, i1 %35) #5, !dbg !21 + %44 = add i32 %27, 32768, !dbg !17 + %45 = icmp slt i32 %44, %9, !dbg !17 + br i1 %45, label %.lr.ph6, label %.loopexit, !dbg !17 + +.loopexit: ; preds = %.lr.ph5, %.lr.ph6, %46, %23 + ret void, !dbg !22 + +46: ; preds = %12 + %47 = getelementptr float, ptr addrspace(1) %7, i64 %14, !dbg !23 + %48 = getelementptr float, ptr addrspace(1) %8, i64 %14, !dbg !24 + %49 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %47) #5, !dbg !25 + %50 = bitcast i32 %49 to float, !dbg !25 + %51 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %48) #5, !dbg !26 + %52 = bitcast i32 %51 to float, !dbg !26 + %53 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %5) #5, !dbg !27 + %54 = bitcast i32 %53 to float, !dbg !27 + %55 = fmul float %6, %54, !dbg !28 + %56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !29 + %57 = shl nuw nsw i32 %56, 3, !dbg !29 + %58 = or disjoint i32 %57, 4, !dbg !29 + %59 = or disjoint i32 %57, 8192, !dbg !29 + %60 = or disjoint i32 %57, 8196, !dbg !29 + %61 = or disjoint i32 %57, 16384, !dbg !29 + %62 = or disjoint i32 %57, 16388, !dbg !29 + %63 = or disjoint i32 %57, 24576, !dbg !29 + %64 = or disjoint i32 %57, 24580, !dbg !29 + %65 = icmp sgt i32 %9, 0, !dbg !30 + br i1 %65, label %.lr.ph, label %.loopexit, !dbg !30 + +.lr.ph: ; preds = %46 + %66 = lshr i32 %56, 5, !dbg !29 + %67 = and i32 %56, 31, !dbg !29 + %68 = icmp eq i32 %67, 0 + %69 = getelementptr float, ptr addrspace(3) @global_smem, i32 %66 + %70 = icmp samesign ult i32 %56, 32 + %71 = getelementptr float, ptr addrspace(3) @global_smem, i32 %56 + %72 = icmp eq i32 %56, 0 + br label %73, !dbg !30 + +73: ; preds = %.lr.ph, %73 + %74 = phi float [ 0.000000e+00, %.lr.ph ], [ %308, %73 ] + %75 = phi i32 [ 0, %.lr.ph ], [ %309, %73 ] + %76 = or disjoint i32 %75, %57, !dbg !31 + %77 = or disjoint i32 %75, %58, !dbg !31 + %78 = or disjoint i32 %75, %59, !dbg !31 + %79 = or disjoint i32 %75, %60, !dbg !31 + %80 = or disjoint i32 %75, %61, !dbg !31 + %81 = or disjoint i32 %75, %62, !dbg !31 + %82 = or disjoint i32 %75, %63, !dbg !31 + %83 = or disjoint i32 %75, %64, !dbg !31 + %84 = icmp slt i32 %76, %9, !dbg !32 + %85 = icmp slt i32 %78, %9, !dbg !32 + %86 = icmp slt i32 %80, %9, !dbg !32 + %87 = icmp slt i32 %82, %9, !dbg !32 + %88 = sext i32 %76 to i64, !dbg !33 + %89 = getelementptr float, ptr addrspace(1) %20, i64 %88, !dbg !33 + %90 = sext i32 %77 to i64, !dbg !33 + %91 = getelementptr float, ptr addrspace(1) %20, i64 %90, !dbg !33 + %92 = sext i32 %78 to i64, !dbg !33 + %93 = getelementptr float, ptr addrspace(1) %20, i64 %92, !dbg !33 + %94 = sext i32 %79 to i64, !dbg !33 + %95 = getelementptr float, ptr addrspace(1) %20, i64 %94, !dbg !33 + %96 = sext i32 %80 to i64, !dbg !33 + %97 = getelementptr float, ptr addrspace(1) %20, i64 %96, !dbg !33 + %98 = sext i32 %81 to i64, !dbg !33 + %99 = getelementptr float, ptr addrspace(1) %20, i64 %98, !dbg !33 + %100 = sext i32 %82 to i64, !dbg !33 + %101 = getelementptr float, ptr addrspace(1) %20, i64 %100, !dbg !33 + %102 = sext i32 %83 to i64, !dbg !33 + %103 = getelementptr float, ptr addrspace(1) %20, i64 %102, !dbg !33 + %104 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %89, i1 %84) #5, !dbg !34 + %105 = extractvalue { i32, i32, i32, i32 } %104, 0, !dbg !34 + %106 = extractvalue { i32, i32, i32, i32 } %104, 1, !dbg !34 + %107 = extractvalue { i32, i32, i32, i32 } %104, 2, !dbg !34 + %108 = extractvalue { i32, i32, i32, i32 } %104, 3, !dbg !34 + %109 = bitcast i32 %105 to float, !dbg !34 + %110 = bitcast i32 %106 to float, !dbg !34 + %111 = bitcast i32 %107 to float, !dbg !34 + %112 = bitcast i32 %108 to float, !dbg !34 + %113 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %91, i1 %84) #5, !dbg !34 + %114 = extractvalue { i32, i32, i32, i32 } %113, 0, !dbg !34 + %115 = extractvalue { i32, i32, i32, i32 } %113, 1, !dbg !34 + %116 = extractvalue { i32, i32, i32, i32 } %113, 2, !dbg !34 + %117 = extractvalue { i32, i32, i32, i32 } %113, 3, !dbg !34 + %118 = bitcast i32 %114 to float, !dbg !34 + %119 = bitcast i32 %115 to float, !dbg !34 + %120 = bitcast i32 %116 to float, !dbg !34 + %121 = bitcast i32 %117 to float, !dbg !34 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %93, i1 %85) #5, !dbg !34 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !34 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !34 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !34 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !34 + %127 = bitcast i32 %123 to float, !dbg !34 + %128 = bitcast i32 %124 to float, !dbg !34 + %129 = bitcast i32 %125 to float, !dbg !34 + %130 = bitcast i32 %126 to float, !dbg !34 + %131 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %95, i1 %85) #5, !dbg !34 + %132 = extractvalue { i32, i32, i32, i32 } %131, 0, !dbg !34 + %133 = extractvalue { i32, i32, i32, i32 } %131, 1, !dbg !34 + %134 = extractvalue { i32, i32, i32, i32 } %131, 2, !dbg !34 + %135 = extractvalue { i32, i32, i32, i32 } %131, 3, !dbg !34 + %136 = bitcast i32 %132 to float, !dbg !34 + %137 = bitcast i32 %133 to float, !dbg !34 + %138 = bitcast i32 %134 to float, !dbg !34 + %139 = bitcast i32 %135 to float, !dbg !34 + %140 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %97, i1 %86) #5, !dbg !34 + %141 = extractvalue { i32, i32, i32, i32 } %140, 0, !dbg !34 + %142 = extractvalue { i32, i32, i32, i32 } %140, 1, !dbg !34 + %143 = extractvalue { i32, i32, i32, i32 } %140, 2, !dbg !34 + %144 = extractvalue { i32, i32, i32, i32 } %140, 3, !dbg !34 + %145 = bitcast i32 %141 to float, !dbg !34 + %146 = bitcast i32 %142 to float, !dbg !34 + %147 = bitcast i32 %143 to float, !dbg !34 + %148 = bitcast i32 %144 to float, !dbg !34 + %149 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %99, i1 %86) #5, !dbg !34 + %150 = extractvalue { i32, i32, i32, i32 } %149, 0, !dbg !34 + %151 = extractvalue { i32, i32, i32, i32 } %149, 1, !dbg !34 + %152 = extractvalue { i32, i32, i32, i32 } %149, 2, !dbg !34 + %153 = extractvalue { i32, i32, i32, i32 } %149, 3, !dbg !34 + %154 = bitcast i32 %150 to float, !dbg !34 + %155 = bitcast i32 %151 to float, !dbg !34 + %156 = bitcast i32 %152 to float, !dbg !34 + %157 = bitcast i32 %153 to float, !dbg !34 + %158 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %101, i1 %87) #5, !dbg !34 + %159 = extractvalue { i32, i32, i32, i32 } %158, 0, !dbg !34 + %160 = extractvalue { i32, i32, i32, i32 } %158, 1, !dbg !34 + %161 = extractvalue { i32, i32, i32, i32 } %158, 2, !dbg !34 + %162 = extractvalue { i32, i32, i32, i32 } %158, 3, !dbg !34 + %163 = bitcast i32 %159 to float, !dbg !34 + %164 = bitcast i32 %160 to float, !dbg !34 + %165 = bitcast i32 %161 to float, !dbg !34 + %166 = bitcast i32 %162 to float, !dbg !34 + %167 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %103, i1 %87) #5, !dbg !34 + %168 = extractvalue { i32, i32, i32, i32 } %167, 0, !dbg !34 + %169 = extractvalue { i32, i32, i32, i32 } %167, 1, !dbg !34 + %170 = extractvalue { i32, i32, i32, i32 } %167, 2, !dbg !34 + %171 = extractvalue { i32, i32, i32, i32 } %167, 3, !dbg !34 + %172 = bitcast i32 %168 to float, !dbg !34 + %173 = bitcast i32 %169 to float, !dbg !34 + %174 = bitcast i32 %170 to float, !dbg !34 + %175 = bitcast i32 %171 to float, !dbg !34 + %176 = fmul float %55, %109, !dbg !35 + %177 = fmul float %55, %110, !dbg !35 + %178 = fmul float %55, %111, !dbg !35 + %179 = fmul float %55, %112, !dbg !35 + %180 = fmul float %55, %118, !dbg !35 + %181 = fmul float %55, %119, !dbg !35 + %182 = fmul float %55, %120, !dbg !35 + %183 = fmul float %55, %121, !dbg !35 + %184 = fmul float %55, %127, !dbg !35 + %185 = fmul float %55, %128, !dbg !35 + %186 = fmul float %55, %129, !dbg !35 + %187 = fmul float %55, %130, !dbg !35 + %188 = fmul float %55, %136, !dbg !35 + %189 = fmul float %55, %137, !dbg !35 + %190 = fmul float %55, %138, !dbg !35 + %191 = fmul float %55, %139, !dbg !35 + %192 = fmul float %55, %145, !dbg !35 + %193 = fmul float %55, %146, !dbg !35 + %194 = fmul float %55, %147, !dbg !35 + %195 = fmul float %55, %148, !dbg !35 + %196 = fmul float %55, %154, !dbg !35 + %197 = fmul float %55, %155, !dbg !35 + %198 = fmul float %55, %156, !dbg !35 + %199 = fmul float %55, %157, !dbg !35 + %200 = fmul float %55, %163, !dbg !35 + %201 = fmul float %55, %164, !dbg !35 + %202 = fmul float %55, %165, !dbg !35 + %203 = fmul float %55, %166, !dbg !35 + %204 = fmul float %55, %172, !dbg !35 + %205 = fmul float %55, %173, !dbg !35 + %206 = fmul float %55, %174, !dbg !35 + %207 = fmul float %55, %175, !dbg !35 + %208 = select i1 %85, float %184, float 0.000000e+00, !dbg !36 + %209 = select i1 %85, float %185, float 0.000000e+00, !dbg !36 + %210 = select i1 %85, float %186, float 0.000000e+00, !dbg !36 + %211 = select i1 %85, float %187, float 0.000000e+00, !dbg !36 + %212 = select i1 %85, float %188, float 0.000000e+00, !dbg !36 + %213 = select i1 %85, float %189, float 0.000000e+00, !dbg !36 + %214 = select i1 %85, float %190, float 0.000000e+00, !dbg !36 + %215 = select i1 %85, float %191, float 0.000000e+00, !dbg !36 + %216 = select i1 %86, float %192, float 0.000000e+00, !dbg !36 + %217 = select i1 %86, float %193, float 0.000000e+00, !dbg !36 + %218 = select i1 %86, float %194, float 0.000000e+00, !dbg !36 + %219 = select i1 %86, float %195, float 0.000000e+00, !dbg !36 + %220 = select i1 %86, float %196, float 0.000000e+00, !dbg !36 + %221 = select i1 %86, float %197, float 0.000000e+00, !dbg !36 + %222 = select i1 %86, float %198, float 0.000000e+00, !dbg !36 + %223 = select i1 %86, float %199, float 0.000000e+00, !dbg !36 + %224 = select i1 %87, float %200, float 0.000000e+00, !dbg !36 + %225 = select i1 %87, float %201, float 0.000000e+00, !dbg !36 + %226 = select i1 %87, float %202, float 0.000000e+00, !dbg !36 + %227 = select i1 %87, float %203, float 0.000000e+00, !dbg !36 + %228 = select i1 %87, float %204, float 0.000000e+00, !dbg !36 + %229 = select i1 %87, float %205, float 0.000000e+00, !dbg !36 + %230 = select i1 %87, float %206, float 0.000000e+00, !dbg !36 + %231 = select i1 %87, float %207, float 0.000000e+00, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %232 = fadd float %176, %177, !dbg !41 + %233 = fadd float %178, %232, !dbg !41 + %234 = fadd float %179, %233, !dbg !41 + %235 = fadd float %180, %234, !dbg !41 + %236 = fadd float %181, %235, !dbg !41 + %237 = fadd float %182, %236, !dbg !41 + %238 = fadd float %183, %237, !dbg !41 + %239 = select i1 %84, float %238, float 0.000000e+00, !dbg !41 + %240 = fadd float %208, %239, !dbg !41 + %241 = fadd float %209, %240, !dbg !41 + %242 = fadd float %210, %241, !dbg !41 + %243 = fadd float %211, %242, !dbg !41 + %244 = fadd float %212, %243, !dbg !41 + %245 = fadd float %213, %244, !dbg !41 + %246 = fadd float %214, %245, !dbg !41 + %247 = fadd float %215, %246, !dbg !41 + %248 = fadd float %216, %247, !dbg !41 + %249 = fadd float %217, %248, !dbg !41 + %250 = fadd float %218, %249, !dbg !41 + %251 = fadd float %219, %250, !dbg !41 + %252 = fadd float %220, %251, !dbg !41 + %253 = fadd float %221, %252, !dbg !41 + %254 = fadd float %222, %253, !dbg !41 + %255 = fadd float %223, %254, !dbg !41 + %256 = fadd float %224, %255, !dbg !41 + %257 = fadd float %225, %256, !dbg !41 + %258 = fadd float %226, %257, !dbg !41 + %259 = fadd float %227, %258, !dbg !41 + %260 = fadd float %228, %259, !dbg !41 + %261 = fadd float %229, %260, !dbg !41 + %262 = fadd float %230, %261, !dbg !41 + %263 = fadd float %231, %262, !dbg !41 + %264 = bitcast float %263 to i32, !dbg !37 + %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 16, i32 31), !dbg !37 + %266 = bitcast i32 %265 to float, !dbg !37 + %267 = fadd float %263, %266, !dbg !41 + %268 = bitcast float %267 to i32, !dbg !37 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !37 + %270 = bitcast i32 %269 to float, !dbg !37 + %271 = fadd float %267, %270, !dbg !41 + %272 = bitcast float %271 to i32, !dbg !37 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 4, i32 31), !dbg !37 + %274 = bitcast i32 %273 to float, !dbg !37 + %275 = fadd float %271, %274, !dbg !41 + %276 = bitcast float %275 to i32, !dbg !37 + %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !37 + %278 = bitcast i32 %277 to float, !dbg !37 + %279 = fadd float %275, %278, !dbg !41 + %280 = bitcast float %279 to i32, !dbg !37 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 1, i32 31), !dbg !37 + %282 = bitcast i32 %281 to float, !dbg !37 + %283 = fadd float %279, %282, !dbg !41 + %284 = bitcast float %283 to <1 x i32>, !dbg !37 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, <1 x i32> %284, i1 %68) #5, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %285 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %71, i1 %70) #5, !dbg !37 + %286 = bitcast i32 %285 to float, !dbg !37 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !37 + %288 = bitcast i32 %287 to float, !dbg !37 + %289 = fadd float %286, %288, !dbg !41 + %290 = bitcast float %289 to i32, !dbg !37 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 8, i32 31), !dbg !37 + %292 = bitcast i32 %291 to float, !dbg !37 + %293 = fadd float %289, %292, !dbg !41 + %294 = bitcast float %293 to i32, !dbg !37 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 4, i32 31), !dbg !37 + %296 = bitcast i32 %295 to float, !dbg !37 + %297 = fadd float %293, %296, !dbg !41 + %298 = bitcast float %297 to i32, !dbg !37 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !37 + %300 = bitcast i32 %299 to float, !dbg !37 + %301 = fadd float %297, %300, !dbg !41 + %302 = bitcast float %301 to i32, !dbg !37 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 1, i32 31), !dbg !37 + %304 = bitcast i32 %303 to float, !dbg !37 + %305 = fadd float %301, %304, !dbg !41 + %306 = bitcast float %305 to <1 x i32>, !dbg !37 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %71, <1 x i32> %306, i1 %72) #5, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %307 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !37 + %308 = fadd float %74, %307, !dbg !42 + %309 = add i32 %75, 32768, !dbg !30 + %310 = icmp slt i32 %309, %9, !dbg !30 + br i1 %310, label %73, label %.lr.ph5.preheader, !dbg !30 + +.lr.ph5.preheader: ; preds = %73 + %311 = insertelement <2 x float> poison, float %308, i64 0 + %312 = shufflevector <2 x float> %311, <2 x float> poison, <2 x i32> zeroinitializer + %313 = insertelement <2 x float> poison, float %55, i64 0 + %314 = shufflevector <2 x float> %313, <2 x float> poison, <2 x i32> zeroinitializer + br label %.lr.ph5, !dbg !43 + +.lr.ph5: ; preds = %.lr.ph5.preheader, %.lr.ph5 + %315 = phi i32 [ %792, %.lr.ph5 ], [ 0, %.lr.ph5.preheader ] + %316 = or disjoint i32 %315, %57, !dbg !44 + %317 = or disjoint i32 %315, %58, !dbg !44 + %318 = or disjoint i32 %315, %59, !dbg !44 + %319 = or disjoint i32 %315, %60, !dbg !44 + %320 = or disjoint i32 %315, %61, !dbg !44 + %321 = or disjoint i32 %315, %62, !dbg !44 + %322 = or disjoint i32 %315, %63, !dbg !44 + %323 = or disjoint i32 %315, %64, !dbg !44 + %324 = icmp slt i32 %316, %9, !dbg !45 + %325 = icmp slt i32 %318, %9, !dbg !45 + %326 = icmp slt i32 %320, %9, !dbg !45 + %327 = icmp slt i32 %322, %9, !dbg !45 + %328 = sext i32 %316 to i64, !dbg !46 + %329 = getelementptr bfloat, ptr addrspace(1) %17, i64 %328, !dbg !46 + %330 = sext i32 %318 to i64, !dbg !46 + %331 = getelementptr bfloat, ptr addrspace(1) %17, i64 %330, !dbg !46 + %332 = sext i32 %320 to i64, !dbg !46 + %333 = getelementptr bfloat, ptr addrspace(1) %17, i64 %332, !dbg !46 + %334 = sext i32 %322 to i64, !dbg !46 + %335 = getelementptr bfloat, ptr addrspace(1) %17, i64 %334, !dbg !46 + %336 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %329, i1 %324) #5, !dbg !47 + %337 = extractvalue { i32, i32, i32, i32 } %336, 0, !dbg !47 + %338 = bitcast i32 %337 to <2 x bfloat>, !dbg !47 + %339 = extractvalue { i32, i32, i32, i32 } %336, 1, !dbg !47 + %340 = bitcast i32 %339 to <2 x bfloat>, !dbg !47 + %341 = extractvalue { i32, i32, i32, i32 } %336, 2, !dbg !47 + %342 = bitcast i32 %341 to <2 x bfloat>, !dbg !47 + %343 = extractvalue { i32, i32, i32, i32 } %336, 3, !dbg !47 + %344 = bitcast i32 %343 to <2 x bfloat>, !dbg !47 + %345 = extractelement <2 x bfloat> %338, i64 0, !dbg !47 + %346 = extractelement <2 x bfloat> %338, i64 1, !dbg !47 + %347 = extractelement <2 x bfloat> %340, i64 0, !dbg !47 + %348 = extractelement <2 x bfloat> %340, i64 1, !dbg !47 + %349 = extractelement <2 x bfloat> %342, i64 0, !dbg !47 + %350 = extractelement <2 x bfloat> %342, i64 1, !dbg !47 + %351 = extractelement <2 x bfloat> %344, i64 0, !dbg !47 + %352 = extractelement <2 x bfloat> %344, i64 1, !dbg !47 + %353 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %331, i1 %325) #5, !dbg !47 + %354 = extractvalue { i32, i32, i32, i32 } %353, 0, !dbg !47 + %355 = bitcast i32 %354 to <2 x bfloat>, !dbg !47 + %356 = extractvalue { i32, i32, i32, i32 } %353, 1, !dbg !47 + %357 = bitcast i32 %356 to <2 x bfloat>, !dbg !47 + %358 = extractvalue { i32, i32, i32, i32 } %353, 2, !dbg !47 + %359 = bitcast i32 %358 to <2 x bfloat>, !dbg !47 + %360 = extractvalue { i32, i32, i32, i32 } %353, 3, !dbg !47 + %361 = bitcast i32 %360 to <2 x bfloat>, !dbg !47 + %362 = extractelement <2 x bfloat> %355, i64 0, !dbg !47 + %363 = extractelement <2 x bfloat> %355, i64 1, !dbg !47 + %364 = extractelement <2 x bfloat> %357, i64 0, !dbg !47 + %365 = extractelement <2 x bfloat> %357, i64 1, !dbg !47 + %366 = extractelement <2 x bfloat> %359, i64 0, !dbg !47 + %367 = extractelement <2 x bfloat> %359, i64 1, !dbg !47 + %368 = extractelement <2 x bfloat> %361, i64 0, !dbg !47 + %369 = extractelement <2 x bfloat> %361, i64 1, !dbg !47 + %370 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %333, i1 %326) #5, !dbg !47 + %371 = extractvalue { i32, i32, i32, i32 } %370, 0, !dbg !47 + %372 = bitcast i32 %371 to <2 x bfloat>, !dbg !47 + %373 = extractvalue { i32, i32, i32, i32 } %370, 1, !dbg !47 + %374 = bitcast i32 %373 to <2 x bfloat>, !dbg !47 + %375 = extractvalue { i32, i32, i32, i32 } %370, 2, !dbg !47 + %376 = bitcast i32 %375 to <2 x bfloat>, !dbg !47 + %377 = extractvalue { i32, i32, i32, i32 } %370, 3, !dbg !47 + %378 = bitcast i32 %377 to <2 x bfloat>, !dbg !47 + %379 = extractelement <2 x bfloat> %372, i64 0, !dbg !47 + %380 = extractelement <2 x bfloat> %372, i64 1, !dbg !47 + %381 = extractelement <2 x bfloat> %374, i64 0, !dbg !47 + %382 = extractelement <2 x bfloat> %374, i64 1, !dbg !47 + %383 = extractelement <2 x bfloat> %376, i64 0, !dbg !47 + %384 = extractelement <2 x bfloat> %376, i64 1, !dbg !47 + %385 = extractelement <2 x bfloat> %378, i64 0, !dbg !47 + %386 = extractelement <2 x bfloat> %378, i64 1, !dbg !47 + %387 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %335, i1 %327) #5, !dbg !47 + %388 = extractvalue { i32, i32, i32, i32 } %387, 0, !dbg !47 + %389 = bitcast i32 %388 to <2 x bfloat>, !dbg !47 + %390 = extractvalue { i32, i32, i32, i32 } %387, 1, !dbg !47 + %391 = bitcast i32 %390 to <2 x bfloat>, !dbg !47 + %392 = extractvalue { i32, i32, i32, i32 } %387, 2, !dbg !47 + %393 = bitcast i32 %392 to <2 x bfloat>, !dbg !47 + %394 = extractvalue { i32, i32, i32, i32 } %387, 3, !dbg !47 + %395 = bitcast i32 %394 to <2 x bfloat>, !dbg !47 + %396 = extractelement <2 x bfloat> %389, i64 0, !dbg !47 + %397 = extractelement <2 x bfloat> %389, i64 1, !dbg !47 + %398 = extractelement <2 x bfloat> %391, i64 0, !dbg !47 + %399 = extractelement <2 x bfloat> %391, i64 1, !dbg !47 + %400 = extractelement <2 x bfloat> %393, i64 0, !dbg !47 + %401 = extractelement <2 x bfloat> %393, i64 1, !dbg !47 + %402 = extractelement <2 x bfloat> %395, i64 0, !dbg !47 + %403 = extractelement <2 x bfloat> %395, i64 1, !dbg !47 + %404 = fpext bfloat %345 to float, !dbg !48 + %405 = fpext bfloat %346 to float, !dbg !48 + %406 = fpext bfloat %347 to float, !dbg !48 + %407 = fpext bfloat %348 to float, !dbg !48 + %408 = fpext bfloat %349 to float, !dbg !48 + %409 = fpext bfloat %350 to float, !dbg !48 + %410 = fpext bfloat %351 to float, !dbg !48 + %411 = fpext bfloat %352 to float, !dbg !48 + %412 = fpext bfloat %362 to float, !dbg !48 + %413 = fpext bfloat %363 to float, !dbg !48 + %414 = fpext bfloat %364 to float, !dbg !48 + %415 = fpext bfloat %365 to float, !dbg !48 + %416 = fpext bfloat %366 to float, !dbg !48 + %417 = fpext bfloat %367 to float, !dbg !48 + %418 = fpext bfloat %368 to float, !dbg !48 + %419 = fpext bfloat %369 to float, !dbg !48 + %420 = fpext bfloat %379 to float, !dbg !48 + %421 = fpext bfloat %380 to float, !dbg !48 + %422 = fpext bfloat %381 to float, !dbg !48 + %423 = fpext bfloat %382 to float, !dbg !48 + %424 = fpext bfloat %383 to float, !dbg !48 + %425 = fpext bfloat %384 to float, !dbg !48 + %426 = fpext bfloat %385 to float, !dbg !48 + %427 = fpext bfloat %386 to float, !dbg !48 + %428 = fpext bfloat %396 to float, !dbg !48 + %429 = fpext bfloat %397 to float, !dbg !48 + %430 = fpext bfloat %398 to float, !dbg !48 + %431 = fpext bfloat %399 to float, !dbg !48 + %432 = fpext bfloat %400 to float, !dbg !48 + %433 = fpext bfloat %401 to float, !dbg !48 + %434 = fpext bfloat %402 to float, !dbg !48 + %435 = fpext bfloat %403 to float, !dbg !48 + %436 = getelementptr float, ptr addrspace(1) %20, i64 %328, !dbg !49 + %437 = sext i32 %317 to i64, !dbg !49 + %438 = getelementptr float, ptr addrspace(1) %20, i64 %437, !dbg !49 + %439 = getelementptr float, ptr addrspace(1) %20, i64 %330, !dbg !49 + %440 = sext i32 %319 to i64, !dbg !49 + %441 = getelementptr float, ptr addrspace(1) %20, i64 %440, !dbg !49 + %442 = getelementptr float, ptr addrspace(1) %20, i64 %332, !dbg !49 + %443 = sext i32 %321 to i64, !dbg !49 + %444 = getelementptr float, ptr addrspace(1) %20, i64 %443, !dbg !49 + %445 = getelementptr float, ptr addrspace(1) %20, i64 %334, !dbg !49 + %446 = sext i32 %323 to i64, !dbg !49 + %447 = getelementptr float, ptr addrspace(1) %20, i64 %446, !dbg !49 + %448 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %436, i1 %324) #5, !dbg !50 + %449 = extractvalue { i32, i32, i32, i32 } %448, 0, !dbg !50 + %450 = extractvalue { i32, i32, i32, i32 } %448, 1, !dbg !50 + %451 = extractvalue { i32, i32, i32, i32 } %448, 2, !dbg !50 + %452 = extractvalue { i32, i32, i32, i32 } %448, 3, !dbg !50 + %453 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %438, i1 %324) #5, !dbg !50 + %454 = extractvalue { i32, i32, i32, i32 } %453, 0, !dbg !50 + %455 = extractvalue { i32, i32, i32, i32 } %453, 1, !dbg !50 + %456 = extractvalue { i32, i32, i32, i32 } %453, 2, !dbg !50 + %457 = extractvalue { i32, i32, i32, i32 } %453, 3, !dbg !50 + %458 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %439, i1 %325) #5, !dbg !50 + %459 = extractvalue { i32, i32, i32, i32 } %458, 0, !dbg !50 + %460 = extractvalue { i32, i32, i32, i32 } %458, 1, !dbg !50 + %461 = extractvalue { i32, i32, i32, i32 } %458, 2, !dbg !50 + %462 = extractvalue { i32, i32, i32, i32 } %458, 3, !dbg !50 + %463 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %441, i1 %325) #5, !dbg !50 + %464 = extractvalue { i32, i32, i32, i32 } %463, 0, !dbg !50 + %465 = extractvalue { i32, i32, i32, i32 } %463, 1, !dbg !50 + %466 = extractvalue { i32, i32, i32, i32 } %463, 2, !dbg !50 + %467 = extractvalue { i32, i32, i32, i32 } %463, 3, !dbg !50 + %468 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %442, i1 %326) #5, !dbg !50 + %469 = extractvalue { i32, i32, i32, i32 } %468, 0, !dbg !50 + %470 = extractvalue { i32, i32, i32, i32 } %468, 1, !dbg !50 + %471 = extractvalue { i32, i32, i32, i32 } %468, 2, !dbg !50 + %472 = extractvalue { i32, i32, i32, i32 } %468, 3, !dbg !50 + %473 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %444, i1 %326) #5, !dbg !50 + %474 = extractvalue { i32, i32, i32, i32 } %473, 0, !dbg !50 + %475 = extractvalue { i32, i32, i32, i32 } %473, 1, !dbg !50 + %476 = extractvalue { i32, i32, i32, i32 } %473, 2, !dbg !50 + %477 = extractvalue { i32, i32, i32, i32 } %473, 3, !dbg !50 + %478 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %445, i1 %327) #5, !dbg !50 + %479 = extractvalue { i32, i32, i32, i32 } %478, 0, !dbg !50 + %480 = extractvalue { i32, i32, i32, i32 } %478, 1, !dbg !50 + %481 = extractvalue { i32, i32, i32, i32 } %478, 2, !dbg !50 + %482 = extractvalue { i32, i32, i32, i32 } %478, 3, !dbg !50 + %483 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %447, i1 %327) #5, !dbg !50 + %484 = extractvalue { i32, i32, i32, i32 } %483, 0, !dbg !50 + %485 = extractvalue { i32, i32, i32, i32 } %483, 1, !dbg !50 + %486 = extractvalue { i32, i32, i32, i32 } %483, 2, !dbg !50 + %487 = extractvalue { i32, i32, i32, i32 } %483, 3, !dbg !50 + %488 = fsub float %404, %50, !dbg !51 + %489 = fsub float %405, %50, !dbg !51 + %490 = fsub float %406, %50, !dbg !51 + %491 = fsub float %407, %50, !dbg !51 + %492 = fsub float %408, %50, !dbg !51 + %493 = fsub float %409, %50, !dbg !51 + %494 = fsub float %410, %50, !dbg !51 + %495 = fsub float %411, %50, !dbg !51 + %496 = fsub float %412, %50, !dbg !51 + %497 = fsub float %413, %50, !dbg !51 + %498 = fsub float %414, %50, !dbg !51 + %499 = fsub float %415, %50, !dbg !51 + %500 = fsub float %416, %50, !dbg !51 + %501 = fsub float %417, %50, !dbg !51 + %502 = fsub float %418, %50, !dbg !51 + %503 = fsub float %419, %50, !dbg !51 + %504 = fsub float %420, %50, !dbg !51 + %505 = fsub float %421, %50, !dbg !51 + %506 = fsub float %422, %50, !dbg !51 + %507 = fsub float %423, %50, !dbg !51 + %508 = fsub float %424, %50, !dbg !51 + %509 = fsub float %425, %50, !dbg !51 + %510 = fsub float %426, %50, !dbg !51 + %511 = fsub float %427, %50, !dbg !51 + %512 = fsub float %428, %50, !dbg !51 + %513 = fsub float %429, %50, !dbg !51 + %514 = fsub float %430, %50, !dbg !51 + %515 = fsub float %431, %50, !dbg !51 + %516 = fsub float %432, %50, !dbg !51 + %517 = fsub float %433, %50, !dbg !51 + %518 = fsub float %434, %50, !dbg !51 + %519 = fsub float %435, %50, !dbg !51 + %520 = fmul float %488, 0x3FF7154760000000, !dbg !52 + %521 = tail call float @llvm.nvvm.ex2.approx.f(float %520), !dbg !52 + %522 = fmul float %489, 0x3FF7154760000000, !dbg !52 + %523 = tail call float @llvm.nvvm.ex2.approx.f(float %522), !dbg !52 + %524 = fmul float %490, 0x3FF7154760000000, !dbg !52 + %525 = tail call float @llvm.nvvm.ex2.approx.f(float %524), !dbg !52 + %526 = fmul float %491, 0x3FF7154760000000, !dbg !52 + %527 = tail call float @llvm.nvvm.ex2.approx.f(float %526), !dbg !52 + %528 = fmul float %492, 0x3FF7154760000000, !dbg !52 + %529 = tail call float @llvm.nvvm.ex2.approx.f(float %528), !dbg !52 + %530 = fmul float %493, 0x3FF7154760000000, !dbg !52 + %531 = tail call float @llvm.nvvm.ex2.approx.f(float %530), !dbg !52 + %532 = fmul float %494, 0x3FF7154760000000, !dbg !52 + %533 = tail call float @llvm.nvvm.ex2.approx.f(float %532), !dbg !52 + %534 = fmul float %495, 0x3FF7154760000000, !dbg !52 + %535 = tail call float @llvm.nvvm.ex2.approx.f(float %534), !dbg !52 + %536 = fmul float %496, 0x3FF7154760000000, !dbg !52 + %537 = tail call float @llvm.nvvm.ex2.approx.f(float %536), !dbg !52 + %538 = fmul float %497, 0x3FF7154760000000, !dbg !52 + %539 = tail call float @llvm.nvvm.ex2.approx.f(float %538), !dbg !52 + %540 = fmul float %498, 0x3FF7154760000000, !dbg !52 + %541 = tail call float @llvm.nvvm.ex2.approx.f(float %540), !dbg !52 + %542 = fmul float %499, 0x3FF7154760000000, !dbg !52 + %543 = tail call float @llvm.nvvm.ex2.approx.f(float %542), !dbg !52 + %544 = fmul float %500, 0x3FF7154760000000, !dbg !52 + %545 = tail call float @llvm.nvvm.ex2.approx.f(float %544), !dbg !52 + %546 = fmul float %501, 0x3FF7154760000000, !dbg !52 + %547 = tail call float @llvm.nvvm.ex2.approx.f(float %546), !dbg !52 + %548 = fmul float %502, 0x3FF7154760000000, !dbg !52 + %549 = tail call float @llvm.nvvm.ex2.approx.f(float %548), !dbg !52 + %550 = fmul float %503, 0x3FF7154760000000, !dbg !52 + %551 = tail call float @llvm.nvvm.ex2.approx.f(float %550), !dbg !52 + %552 = fmul float %504, 0x3FF7154760000000, !dbg !52 + %553 = tail call float @llvm.nvvm.ex2.approx.f(float %552), !dbg !52 + %554 = fmul float %505, 0x3FF7154760000000, !dbg !52 + %555 = tail call float @llvm.nvvm.ex2.approx.f(float %554), !dbg !52 + %556 = fmul float %506, 0x3FF7154760000000, !dbg !52 + %557 = tail call float @llvm.nvvm.ex2.approx.f(float %556), !dbg !52 + %558 = fmul float %507, 0x3FF7154760000000, !dbg !52 + %559 = tail call float @llvm.nvvm.ex2.approx.f(float %558), !dbg !52 + %560 = fmul float %508, 0x3FF7154760000000, !dbg !52 + %561 = tail call float @llvm.nvvm.ex2.approx.f(float %560), !dbg !52 + %562 = fmul float %509, 0x3FF7154760000000, !dbg !52 + %563 = tail call float @llvm.nvvm.ex2.approx.f(float %562), !dbg !52 + %564 = fmul float %510, 0x3FF7154760000000, !dbg !52 + %565 = tail call float @llvm.nvvm.ex2.approx.f(float %564), !dbg !52 + %566 = fmul float %511, 0x3FF7154760000000, !dbg !52 + %567 = tail call float @llvm.nvvm.ex2.approx.f(float %566), !dbg !52 + %568 = fmul float %512, 0x3FF7154760000000, !dbg !52 + %569 = tail call float @llvm.nvvm.ex2.approx.f(float %568), !dbg !52 + %570 = fmul float %513, 0x3FF7154760000000, !dbg !52 + %571 = tail call float @llvm.nvvm.ex2.approx.f(float %570), !dbg !52 + %572 = fmul float %514, 0x3FF7154760000000, !dbg !52 + %573 = tail call float @llvm.nvvm.ex2.approx.f(float %572), !dbg !52 + %574 = fmul float %515, 0x3FF7154760000000, !dbg !52 + %575 = tail call float @llvm.nvvm.ex2.approx.f(float %574), !dbg !52 + %576 = fmul float %516, 0x3FF7154760000000, !dbg !52 + %577 = tail call float @llvm.nvvm.ex2.approx.f(float %576), !dbg !52 + %578 = fmul float %517, 0x3FF7154760000000, !dbg !52 + %579 = tail call float @llvm.nvvm.ex2.approx.f(float %578), !dbg !52 + %580 = fmul float %518, 0x3FF7154760000000, !dbg !52 + %581 = tail call float @llvm.nvvm.ex2.approx.f(float %580), !dbg !52 + %582 = fmul float %519, 0x3FF7154760000000, !dbg !52 + %583 = tail call float @llvm.nvvm.ex2.approx.f(float %582), !dbg !52 + %584 = tail call float @llvm.nvvm.div.full(float %521, float %52), !dbg !53 + %585 = tail call float @llvm.nvvm.div.full(float %523, float %52), !dbg !53 + %586 = tail call float @llvm.nvvm.div.full(float %525, float %52), !dbg !53 + %587 = tail call float @llvm.nvvm.div.full(float %527, float %52), !dbg !53 + %588 = tail call float @llvm.nvvm.div.full(float %529, float %52), !dbg !53 + %589 = tail call float @llvm.nvvm.div.full(float %531, float %52), !dbg !53 + %590 = tail call float @llvm.nvvm.div.full(float %533, float %52), !dbg !53 + %591 = tail call float @llvm.nvvm.div.full(float %535, float %52), !dbg !53 + %592 = tail call float @llvm.nvvm.div.full(float %537, float %52), !dbg !53 + %593 = tail call float @llvm.nvvm.div.full(float %539, float %52), !dbg !53 + %594 = tail call float @llvm.nvvm.div.full(float %541, float %52), !dbg !53 + %595 = tail call float @llvm.nvvm.div.full(float %543, float %52), !dbg !53 + %596 = tail call float @llvm.nvvm.div.full(float %545, float %52), !dbg !53 + %597 = tail call float @llvm.nvvm.div.full(float %547, float %52), !dbg !53 + %598 = tail call float @llvm.nvvm.div.full(float %549, float %52), !dbg !53 + %599 = tail call float @llvm.nvvm.div.full(float %551, float %52), !dbg !53 + %600 = tail call float @llvm.nvvm.div.full(float %553, float %52), !dbg !53 + %601 = tail call float @llvm.nvvm.div.full(float %555, float %52), !dbg !53 + %602 = tail call float @llvm.nvvm.div.full(float %557, float %52), !dbg !53 + %603 = tail call float @llvm.nvvm.div.full(float %559, float %52), !dbg !53 + %604 = tail call float @llvm.nvvm.div.full(float %561, float %52), !dbg !53 + %605 = tail call float @llvm.nvvm.div.full(float %563, float %52), !dbg !53 + %606 = tail call float @llvm.nvvm.div.full(float %565, float %52), !dbg !53 + %607 = tail call float @llvm.nvvm.div.full(float %567, float %52), !dbg !53 + %608 = tail call float @llvm.nvvm.div.full(float %569, float %52), !dbg !53 + %609 = tail call float @llvm.nvvm.div.full(float %571, float %52), !dbg !53 + %610 = tail call float @llvm.nvvm.div.full(float %573, float %52), !dbg !53 + %611 = tail call float @llvm.nvvm.div.full(float %575, float %52), !dbg !53 + %612 = tail call float @llvm.nvvm.div.full(float %577, float %52), !dbg !53 + %613 = tail call float @llvm.nvvm.div.full(float %579, float %52), !dbg !53 + %614 = tail call float @llvm.nvvm.div.full(float %581, float %52), !dbg !53 + %615 = tail call float @llvm.nvvm.div.full(float %583, float %52), !dbg !53 + %616 = insertelement <2 x i32> poison, i32 %449, i64 0, !dbg !50 + %617 = insertelement <2 x i32> %616, i32 %450, i64 1, !dbg !50 + %618 = bitcast <2 x i32> %617 to <2 x float>, !dbg !50 + %619 = insertelement <2 x float> poison, float %584, i64 0, !dbg !54 + %620 = insertelement <2 x float> %619, float %585, i64 1, !dbg !54 + %621 = fmul <2 x float> %312, %620, !dbg !54 + %622 = fmul <2 x float> %314, %618, !dbg !55 + %623 = fsub <2 x float> %621, %622, !dbg !56 + %624 = fadd <2 x float> %623, zeroinitializer, !dbg !56 + %625 = fptrunc <2 x float> %624 to <2 x bfloat>, !dbg !57 + %626 = insertelement <2 x i32> poison, i32 %451, i64 0, !dbg !50 + %627 = insertelement <2 x i32> %626, i32 %452, i64 1, !dbg !50 + %628 = bitcast <2 x i32> %627 to <2 x float>, !dbg !50 + %629 = insertelement <2 x float> poison, float %586, i64 0, !dbg !54 + %630 = insertelement <2 x float> %629, float %587, i64 1, !dbg !54 + %631 = fmul <2 x float> %312, %630, !dbg !54 + %632 = fmul <2 x float> %314, %628, !dbg !55 + %633 = fsub <2 x float> %631, %632, !dbg !56 + %634 = fadd <2 x float> %633, zeroinitializer, !dbg !56 + %635 = fptrunc <2 x float> %634 to <2 x bfloat>, !dbg !57 + %636 = insertelement <2 x i32> poison, i32 %454, i64 0, !dbg !50 + %637 = insertelement <2 x i32> %636, i32 %455, i64 1, !dbg !50 + %638 = bitcast <2 x i32> %637 to <2 x float>, !dbg !50 + %639 = insertelement <2 x float> poison, float %588, i64 0, !dbg !54 + %640 = insertelement <2 x float> %639, float %589, i64 1, !dbg !54 + %641 = fmul <2 x float> %312, %640, !dbg !54 + %642 = fmul <2 x float> %314, %638, !dbg !55 + %643 = fsub <2 x float> %641, %642, !dbg !56 + %644 = fadd <2 x float> %643, zeroinitializer, !dbg !56 + %645 = fptrunc <2 x float> %644 to <2 x bfloat>, !dbg !57 + %646 = insertelement <2 x i32> poison, i32 %456, i64 0, !dbg !50 + %647 = insertelement <2 x i32> %646, i32 %457, i64 1, !dbg !50 + %648 = bitcast <2 x i32> %647 to <2 x float>, !dbg !50 + %649 = insertelement <2 x float> poison, float %590, i64 0, !dbg !54 + %650 = insertelement <2 x float> %649, float %591, i64 1, !dbg !54 + %651 = fmul <2 x float> %312, %650, !dbg !54 + %652 = fmul <2 x float> %314, %648, !dbg !55 + %653 = fsub <2 x float> %651, %652, !dbg !56 + %654 = fadd <2 x float> %653, zeroinitializer, !dbg !56 + %655 = fptrunc <2 x float> %654 to <2 x bfloat>, !dbg !57 + %656 = insertelement <2 x i32> poison, i32 %459, i64 0, !dbg !50 + %657 = insertelement <2 x i32> %656, i32 %460, i64 1, !dbg !50 + %658 = bitcast <2 x i32> %657 to <2 x float>, !dbg !50 + %659 = insertelement <2 x float> poison, float %592, i64 0, !dbg !54 + %660 = insertelement <2 x float> %659, float %593, i64 1, !dbg !54 + %661 = fmul <2 x float> %312, %660, !dbg !54 + %662 = fmul <2 x float> %314, %658, !dbg !55 + %663 = fsub <2 x float> %661, %662, !dbg !56 + %664 = fadd <2 x float> %663, zeroinitializer, !dbg !56 + %665 = fptrunc <2 x float> %664 to <2 x bfloat>, !dbg !57 + %666 = insertelement <2 x i32> poison, i32 %461, i64 0, !dbg !50 + %667 = insertelement <2 x i32> %666, i32 %462, i64 1, !dbg !50 + %668 = bitcast <2 x i32> %667 to <2 x float>, !dbg !50 + %669 = insertelement <2 x float> poison, float %594, i64 0, !dbg !54 + %670 = insertelement <2 x float> %669, float %595, i64 1, !dbg !54 + %671 = fmul <2 x float> %312, %670, !dbg !54 + %672 = fmul <2 x float> %314, %668, !dbg !55 + %673 = fsub <2 x float> %671, %672, !dbg !56 + %674 = fadd <2 x float> %673, zeroinitializer, !dbg !56 + %675 = fptrunc <2 x float> %674 to <2 x bfloat>, !dbg !57 + %676 = insertelement <2 x i32> poison, i32 %464, i64 0, !dbg !50 + %677 = insertelement <2 x i32> %676, i32 %465, i64 1, !dbg !50 + %678 = bitcast <2 x i32> %677 to <2 x float>, !dbg !50 + %679 = insertelement <2 x float> poison, float %596, i64 0, !dbg !54 + %680 = insertelement <2 x float> %679, float %597, i64 1, !dbg !54 + %681 = fmul <2 x float> %312, %680, !dbg !54 + %682 = fmul <2 x float> %314, %678, !dbg !55 + %683 = fsub <2 x float> %681, %682, !dbg !56 + %684 = fadd <2 x float> %683, zeroinitializer, !dbg !56 + %685 = fptrunc <2 x float> %684 to <2 x bfloat>, !dbg !57 + %686 = insertelement <2 x i32> poison, i32 %466, i64 0, !dbg !50 + %687 = insertelement <2 x i32> %686, i32 %467, i64 1, !dbg !50 + %688 = bitcast <2 x i32> %687 to <2 x float>, !dbg !50 + %689 = insertelement <2 x float> poison, float %598, i64 0, !dbg !54 + %690 = insertelement <2 x float> %689, float %599, i64 1, !dbg !54 + %691 = fmul <2 x float> %312, %690, !dbg !54 + %692 = fmul <2 x float> %314, %688, !dbg !55 + %693 = fsub <2 x float> %691, %692, !dbg !56 + %694 = fadd <2 x float> %693, zeroinitializer, !dbg !56 + %695 = fptrunc <2 x float> %694 to <2 x bfloat>, !dbg !57 + %696 = insertelement <2 x i32> poison, i32 %469, i64 0, !dbg !50 + %697 = insertelement <2 x i32> %696, i32 %470, i64 1, !dbg !50 + %698 = bitcast <2 x i32> %697 to <2 x float>, !dbg !50 + %699 = insertelement <2 x float> poison, float %600, i64 0, !dbg !54 + %700 = insertelement <2 x float> %699, float %601, i64 1, !dbg !54 + %701 = fmul <2 x float> %312, %700, !dbg !54 + %702 = fmul <2 x float> %314, %698, !dbg !55 + %703 = fsub <2 x float> %701, %702, !dbg !56 + %704 = fadd <2 x float> %703, zeroinitializer, !dbg !56 + %705 = fptrunc <2 x float> %704 to <2 x bfloat>, !dbg !57 + %706 = insertelement <2 x i32> poison, i32 %471, i64 0, !dbg !50 + %707 = insertelement <2 x i32> %706, i32 %472, i64 1, !dbg !50 + %708 = bitcast <2 x i32> %707 to <2 x float>, !dbg !50 + %709 = insertelement <2 x float> poison, float %602, i64 0, !dbg !54 + %710 = insertelement <2 x float> %709, float %603, i64 1, !dbg !54 + %711 = fmul <2 x float> %312, %710, !dbg !54 + %712 = fmul <2 x float> %314, %708, !dbg !55 + %713 = fsub <2 x float> %711, %712, !dbg !56 + %714 = fadd <2 x float> %713, zeroinitializer, !dbg !56 + %715 = fptrunc <2 x float> %714 to <2 x bfloat>, !dbg !57 + %716 = insertelement <2 x i32> poison, i32 %474, i64 0, !dbg !50 + %717 = insertelement <2 x i32> %716, i32 %475, i64 1, !dbg !50 + %718 = bitcast <2 x i32> %717 to <2 x float>, !dbg !50 + %719 = insertelement <2 x float> poison, float %604, i64 0, !dbg !54 + %720 = insertelement <2 x float> %719, float %605, i64 1, !dbg !54 + %721 = fmul <2 x float> %312, %720, !dbg !54 + %722 = fmul <2 x float> %314, %718, !dbg !55 + %723 = fsub <2 x float> %721, %722, !dbg !56 + %724 = fadd <2 x float> %723, zeroinitializer, !dbg !56 + %725 = fptrunc <2 x float> %724 to <2 x bfloat>, !dbg !57 + %726 = insertelement <2 x i32> poison, i32 %476, i64 0, !dbg !50 + %727 = insertelement <2 x i32> %726, i32 %477, i64 1, !dbg !50 + %728 = bitcast <2 x i32> %727 to <2 x float>, !dbg !50 + %729 = insertelement <2 x float> poison, float %606, i64 0, !dbg !54 + %730 = insertelement <2 x float> %729, float %607, i64 1, !dbg !54 + %731 = fmul <2 x float> %312, %730, !dbg !54 + %732 = fmul <2 x float> %314, %728, !dbg !55 + %733 = fsub <2 x float> %731, %732, !dbg !56 + %734 = fadd <2 x float> %733, zeroinitializer, !dbg !56 + %735 = fptrunc <2 x float> %734 to <2 x bfloat>, !dbg !57 + %736 = insertelement <2 x i32> poison, i32 %479, i64 0, !dbg !50 + %737 = insertelement <2 x i32> %736, i32 %480, i64 1, !dbg !50 + %738 = bitcast <2 x i32> %737 to <2 x float>, !dbg !50 + %739 = insertelement <2 x float> poison, float %608, i64 0, !dbg !54 + %740 = insertelement <2 x float> %739, float %609, i64 1, !dbg !54 + %741 = fmul <2 x float> %312, %740, !dbg !54 + %742 = fmul <2 x float> %314, %738, !dbg !55 + %743 = fsub <2 x float> %741, %742, !dbg !56 + %744 = fadd <2 x float> %743, zeroinitializer, !dbg !56 + %745 = fptrunc <2 x float> %744 to <2 x bfloat>, !dbg !57 + %746 = insertelement <2 x i32> poison, i32 %481, i64 0, !dbg !50 + %747 = insertelement <2 x i32> %746, i32 %482, i64 1, !dbg !50 + %748 = bitcast <2 x i32> %747 to <2 x float>, !dbg !50 + %749 = insertelement <2 x float> poison, float %610, i64 0, !dbg !54 + %750 = insertelement <2 x float> %749, float %611, i64 1, !dbg !54 + %751 = fmul <2 x float> %312, %750, !dbg !54 + %752 = fmul <2 x float> %314, %748, !dbg !55 + %753 = fsub <2 x float> %751, %752, !dbg !56 + %754 = fadd <2 x float> %753, zeroinitializer, !dbg !56 + %755 = fptrunc <2 x float> %754 to <2 x bfloat>, !dbg !57 + %756 = insertelement <2 x i32> poison, i32 %484, i64 0, !dbg !50 + %757 = insertelement <2 x i32> %756, i32 %485, i64 1, !dbg !50 + %758 = bitcast <2 x i32> %757 to <2 x float>, !dbg !50 + %759 = insertelement <2 x float> poison, float %612, i64 0, !dbg !54 + %760 = insertelement <2 x float> %759, float %613, i64 1, !dbg !54 + %761 = fmul <2 x float> %312, %760, !dbg !54 + %762 = fmul <2 x float> %314, %758, !dbg !55 + %763 = fsub <2 x float> %761, %762, !dbg !56 + %764 = fadd <2 x float> %763, zeroinitializer, !dbg !56 + %765 = fptrunc <2 x float> %764 to <2 x bfloat>, !dbg !57 + %766 = insertelement <2 x i32> poison, i32 %486, i64 0, !dbg !50 + %767 = insertelement <2 x i32> %766, i32 %487, i64 1, !dbg !50 + %768 = bitcast <2 x i32> %767 to <2 x float>, !dbg !50 + %769 = insertelement <2 x float> poison, float %614, i64 0, !dbg !54 + %770 = insertelement <2 x float> %769, float %615, i64 1, !dbg !54 + %771 = fmul <2 x float> %312, %770, !dbg !54 + %772 = fmul <2 x float> %314, %768, !dbg !55 + %773 = fsub <2 x float> %771, %772, !dbg !56 + %774 = fadd <2 x float> %773, zeroinitializer, !dbg !56 + %775 = fptrunc <2 x float> %774 to <2 x bfloat>, !dbg !57 + %776 = bitcast <2 x bfloat> %625 to i32, !dbg !57 + %777 = bitcast <2 x bfloat> %635 to i32, !dbg !57 + %778 = bitcast <2 x bfloat> %645 to i32, !dbg !57 + %779 = bitcast <2 x bfloat> %655 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %776, i32 %777, i32 %778, i32 %779, ptr addrspace(1) %329, i1 %324) #5, !dbg !57 + %780 = bitcast <2 x bfloat> %665 to i32, !dbg !57 + %781 = bitcast <2 x bfloat> %675 to i32, !dbg !57 + %782 = bitcast <2 x bfloat> %685 to i32, !dbg !57 + %783 = bitcast <2 x bfloat> %695 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %780, i32 %781, i32 %782, i32 %783, ptr addrspace(1) %331, i1 %325) #5, !dbg !57 + %784 = bitcast <2 x bfloat> %705 to i32, !dbg !57 + %785 = bitcast <2 x bfloat> %715 to i32, !dbg !57 + %786 = bitcast <2 x bfloat> %725 to i32, !dbg !57 + %787 = bitcast <2 x bfloat> %735 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %784, i32 %785, i32 %786, i32 %787, ptr addrspace(1) %333, i1 %326) #5, !dbg !57 + %788 = bitcast <2 x bfloat> %745 to i32, !dbg !57 + %789 = bitcast <2 x bfloat> %755 to i32, !dbg !57 + %790 = bitcast <2 x bfloat> %765 to i32, !dbg !57 + %791 = bitcast <2 x bfloat> %775 to i32, !dbg !57 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %788, i32 %789, i32 %790, i32 %791, ptr addrspace(1) %335, i1 %327) #5, !dbg !57 + %792 = add i32 %315, 32768, !dbg !43 + %793 = icmp slt i32 %792, %9, !dbg !43 + br i1 %793, label %.lr.ph5, label %.loopexit, !dbg !43 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="1024" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "loss.py", directory: "/workspace/hanrui/SpecForge-ext/specforge/core") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "log_softmax_backward_kernel", linkageName: "log_softmax_backward_kernel", scope: !1, file: !1, line: 114, type: !5, scopeLine: 114, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 127, column: 31, scope: !4) +!8 = !DILocation(line: 127, column: 37, scope: !4) +!9 = !DILocation(line: 128, column: 31, scope: !4) +!10 = !DILocation(line: 128, column: 18, scope: !4) +!11 = !DILocation(line: 129, column: 31, scope: !4) +!12 = !DILocation(line: 129, column: 18, scope: !4) +!13 = !DILocation(line: 130, column: 25, scope: !4) +!14 = !DILocation(line: 132, column: 28, scope: !4) +!15 = !DILocation(line: 133, column: 24, scope: !4) +!16 = !DILocation(line: 135, column: 39, scope: !4) +!17 = !DILocation(line: 134, column: 34, scope: !4) +!18 = !DILocation(line: 135, column: 26, scope: !4) +!19 = !DILocation(line: 136, column: 29, scope: !4) +!20 = !DILocation(line: 137, column: 34, scope: !4) +!21 = !DILocation(line: 137, column: 43, scope: !4) +!22 = !DILocation(line: 138, column: 8, scope: !4) +!23 = !DILocation(line: 140, column: 13, scope: !4) +!24 = !DILocation(line: 141, column: 13, scope: !4) +!25 = !DILocation(line: 142, column: 16, scope: !4) +!26 = !DILocation(line: 143, column: 16, scope: !4) +!27 = !DILocation(line: 144, column: 26, scope: !4) +!28 = !DILocation(line: 145, column: 32, scope: !4) +!29 = !DILocation(line: 150, column: 35, scope: !4) +!30 = !DILocation(line: 149, column: 30, scope: !4) +!31 = !DILocation(line: 150, column: 22, scope: !4) +!32 = !DILocation(line: 151, column: 25, scope: !4) +!33 = !DILocation(line: 152, column: 44, scope: !4) +!34 = !DILocation(line: 152, column: 31, scope: !4) +!35 = !DILocation(line: 155, column: 64, scope: !4) +!36 = !DILocation(line: 155, column: 77, scope: !4) +!37 = !DILocation(line: 291, column: 36, scope: !38, inlinedAt: !40) +!38 = distinct !DILexicalBlockFile(scope: !4, file: !39, discriminator: 0) +!39 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!40 = !DILocation(line: 155, column: 34, scope: !4) +!41 = !DILocation(line: 261, column: 15, scope: !38, inlinedAt: !40) +!42 = !DILocation(line: 155, column: 27, scope: !4) +!43 = !DILocation(line: 158, column: 30, scope: !4) +!44 = !DILocation(line: 159, column: 22, scope: !4) +!45 = !DILocation(line: 160, column: 25, scope: !4) +!46 = !DILocation(line: 161, column: 44, scope: !4) +!47 = !DILocation(line: 161, column: 31, scope: !4) +!48 = !DILocation(line: 162, column: 12, scope: !4) +!49 = !DILocation(line: 164, column: 44, scope: !4) +!50 = !DILocation(line: 164, column: 31, scope: !4) +!51 = !DILocation(line: 167, column: 45, scope: !4) +!52 = !DILocation(line: 167, column: 30, scope: !4) +!53 = !DILocation(line: 167, column: 50, scope: !4) +!54 = !DILocation(line: 168, column: 41, scope: !4) +!55 = !DILocation(line: 169, column: 38, scope: !4) +!56 = !DILocation(line: 169, column: 23, scope: !4) +!57 = !DILocation(line: 170, column: 39, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a901c983491862323e4953ef991691b692d488a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ptx @@ -0,0 +1,1182 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl log_softmax_backward_kernel // -- Begin function log_softmax_backward_kernel +.extern .shared .align 16 .b8 global_smem[]; + // @log_softmax_backward_kernel +.visible .entry log_softmax_backward_kernel( + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_0, + .param .u32 log_softmax_backward_kernel_param_1, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_2, + .param .u32 log_softmax_backward_kernel_param_3, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_4, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_5, + .param .f32 log_softmax_backward_kernel_param_6, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_7, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_8, + .param .u32 log_softmax_backward_kernel_param_9, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_10, + .param .u64 .ptr .global .align 1 log_softmax_backward_kernel_param_11 +) +.reqntid 1024 +{ + .reg .pred %p<38>; + .reg .b16 %rs<35>; + .reg .b32 %r<671>; + .reg .b64 %rd<126>; + .loc 1 114 0 // loss.py:114:0 +$L__func_begin0: + .loc 1 114 0 // loss.py:114:0 + +// %bb.0: + ld.param.b32 %r19, [log_softmax_backward_kernel_param_9]; + ld.param.b64 %rd11, [log_softmax_backward_kernel_param_0]; + ld.param.s32 %rd12, [log_softmax_backward_kernel_param_1]; +$L__tmp0: + .loc 1 127 31 // loss.py:127:31 + mov.u32 %r20, %ctaid.x; + .loc 1 127 37 // loss.py:127:37 + cvt.u64.u32 %rd1, %r20; + .loc 1 128 31 // loss.py:128:31 + mul.lo.s64 %rd15, %rd12, %rd1; + ld.param.b64 %rd16, [log_softmax_backward_kernel_param_4]; + .loc 1 128 18 // loss.py:128:18 + shl.b64 %rd17, %rd15, 1; + add.s64 %rd2, %rd11, %rd17; + .loc 1 130 25 // loss.py:130:25 + add.s64 %rd10, %rd16, %rd1; + .loc 1 132 28 // loss.py:132:28 + // begin inline asm + mov.u16 %rs1, 0x0; + ld.global.b8 { %rs1 }, [ %rd10 + 0 ]; + // end inline asm + and.b16 %rs2, %rs1, 255; + setp.ne.b16 %p1, %rs2, 0; + .loc 1 133 24 // loss.py:133:24 + @%p1 bra $L__BB0_5; + bra.uni $L__BB0_1; +$L__BB0_5: + .loc 1 0 24 // loss.py:0:24 + ld.param.b64 %rd9, [log_softmax_backward_kernel_param_8]; + ld.param.b64 %rd8, [log_softmax_backward_kernel_param_7]; + ld.param.b64 %rd7, [log_softmax_backward_kernel_param_5]; + .loc 1 140 13 // loss.py:140:13 + shl.b64 %rd23, %rd1, 2; + add.s64 %rd20, %rd8, %rd23; + .loc 1 141 13 // loss.py:141:13 + add.s64 %rd21, %rd9, %rd23; + .loc 1 142 16 // loss.py:142:16 + // begin inline asm + mov.u32 %r21, 0x0; + ld.global.b32 { %r21 }, [ %rd20 + 0 ]; + // end inline asm + .loc 1 143 16 // loss.py:143:16 + // begin inline asm + mov.u32 %r22, 0x0; + ld.global.b32 { %r22 }, [ %rd21 + 0 ]; + // end inline asm + .loc 1 144 26 // loss.py:144:26 + // begin inline asm + mov.u32 %r23, 0x0; + ld.global.b32 { %r23 }, [ %rd7 + 0 ]; + // end inline asm + .loc 1 149 30 // loss.py:149:30 + setp.lt.s32 %p2, %r19, 1; + @%p2 bra $L__BB0_4; +// %bb.6: // %.lr.ph + .loc 1 0 30 // loss.py:0:30 + ld.param.b32 %r18, [log_softmax_backward_kernel_param_6]; + ld.param.b64 %rd13, [log_softmax_backward_kernel_param_2]; + ld.param.s32 %rd14, [log_softmax_backward_kernel_param_3]; + mul.lo.s64 %rd18, %rd14, %rd1; + shl.b64 %rd19, %rd18, 2; + add.s64 %rd3, %rd13, %rd19; + mul.f32 %r6, %r18, %r23; + .loc 1 150 35 // loss.py:150:35 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 3; + and.b32 %r9, %r7, 31; + shr.u32 %r26, %r7, 3; + and.b32 %r27, %r26, 124; + mov.b32 %r28, global_smem; + add.s32 %r94, %r28, %r27; + shl.b32 %r29, %r7, 2; + add.s32 %r97, %r28, %r29; + mov.b32 %r34, 0; + mov.b32 %r668, 0f00000000; + setp.eq.b32 %p13, %r7, 0; + setp.lt.u32 %p12, %r7, 32; + setp.eq.b32 %p11, %r9, 0; + cvt.u64.u32 %rd4, %r8; + mov.b32 %r669, %r34; +$L__BB0_7: // =>This Inner Loop Header: Depth=1 + .loc 1 150 22 // loss.py:150:22 + add.s32 %r100, %r8, %r669; + add.s32 %r101, %r100, 8192; + add.s32 %r102, %r100, 16384; + .loc 1 151 25 // loss.py:151:25 + add.s32 %r103, %r100, 24576; + setp.lt.s32 %p3, %r100, %r19; + setp.lt.s32 %p5, %r101, %r19; + setp.lt.s32 %p7, %r102, %r19; + setp.lt.s32 %p9, %r103, %r19; + .loc 1 152 44 // loss.py:152:44 + mad.wide.s32 %rd24, %r100, 4, %rd3; + cvt.s64.s32 %rd32, %r669; + add.s64 %rd33, %rd32, %rd4; + shl.b64 %rd34, %rd33, 2; + add.s64 %rd35, %rd3, %rd34; + add.s64 %rd25, %rd35, 16; + add.s64 %rd26, %rd35, 32768; + add.s64 %rd27, %rd35, 32784; + add.s64 %rd28, %rd35, 65536; + add.s64 %rd29, %rd35, 65552; + add.s64 %rd30, %rd35, 98304; + add.s64 %rd31, %rd35, 98320; + .loc 1 152 31 // loss.py:152:31 + // begin inline asm + mov.u32 %r30, %r34; + mov.u32 %r31, %r34; + mov.u32 %r32, %r34; + mov.u32 %r33, %r34; + @%p3 ld.global.v4.b32 { %r30, %r31, %r32, %r33 }, [ %rd24 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r38, %r34; + mov.u32 %r39, %r34; + mov.u32 %r40, %r34; + mov.u32 %r41, %r34; + @%p3 ld.global.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd25 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r46, %r34; + mov.u32 %r47, %r34; + mov.u32 %r48, %r34; + mov.u32 %r49, %r34; + @%p5 ld.global.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd26 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r54, %r34; + mov.u32 %r55, %r34; + mov.u32 %r56, %r34; + mov.u32 %r57, %r34; + @%p5 ld.global.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd27 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r62, %r34; + mov.u32 %r63, %r34; + mov.u32 %r64, %r34; + mov.u32 %r65, %r34; + @%p7 ld.global.v4.b32 { %r62, %r63, %r64, %r65 }, [ %rd28 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r70, %r34; + mov.u32 %r71, %r34; + mov.u32 %r72, %r34; + mov.u32 %r73, %r34; + @%p7 ld.global.v4.b32 { %r70, %r71, %r72, %r73 }, [ %rd29 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r78, %r34; + mov.u32 %r79, %r34; + mov.u32 %r80, %r34; + mov.u32 %r81, %r34; + @%p9 ld.global.v4.b32 { %r78, %r79, %r80, %r81 }, [ %rd30 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r86, %r34; + mov.u32 %r87, %r34; + mov.u32 %r88, %r34; + mov.u32 %r89, %r34; + @%p9 ld.global.v4.b32 { %r86, %r87, %r88, %r89 }, [ %rd31 + 0 ]; + // end inline asm + .loc 1 155 64 // loss.py:155:64 + mul.f32 %r104, %r6, %r31; + mul.f32 %r105, %r6, %r46; + mul.f32 %r106, %r6, %r47; + mul.f32 %r107, %r6, %r48; + mul.f32 %r108, %r6, %r49; + mul.f32 %r109, %r6, %r54; + mul.f32 %r110, %r6, %r55; + mul.f32 %r111, %r6, %r56; + mul.f32 %r112, %r6, %r57; + mul.f32 %r113, %r6, %r62; + mul.f32 %r114, %r6, %r63; + mul.f32 %r115, %r6, %r64; + mul.f32 %r116, %r6, %r65; + mul.f32 %r117, %r6, %r70; + mul.f32 %r118, %r6, %r71; + mul.f32 %r119, %r6, %r72; + mul.f32 %r120, %r6, %r73; + mul.f32 %r121, %r6, %r78; + mul.f32 %r122, %r6, %r79; + mul.f32 %r123, %r6, %r80; + mul.f32 %r124, %r6, %r81; + mul.f32 %r125, %r6, %r86; + mul.f32 %r126, %r6, %r87; + mul.f32 %r127, %r6, %r88; + mul.f32 %r128, %r6, %r89; + .loc 1 155 77 // loss.py:155:77 + selp.f32 %r129, %r105, 0f00000000, %p5; + selp.f32 %r130, %r106, 0f00000000, %p5; + selp.f32 %r131, %r107, 0f00000000, %p5; + selp.f32 %r132, %r108, 0f00000000, %p5; + selp.f32 %r133, %r109, 0f00000000, %p5; + selp.f32 %r134, %r110, 0f00000000, %p5; + selp.f32 %r135, %r111, 0f00000000, %p5; + selp.f32 %r136, %r112, 0f00000000, %p5; + selp.f32 %r137, %r113, 0f00000000, %p7; + selp.f32 %r138, %r114, 0f00000000, %p7; + selp.f32 %r139, %r115, 0f00000000, %p7; + selp.f32 %r140, %r116, 0f00000000, %p7; + selp.f32 %r141, %r117, 0f00000000, %p7; + selp.f32 %r142, %r118, 0f00000000, %p7; + selp.f32 %r143, %r119, 0f00000000, %p7; + selp.f32 %r144, %r120, 0f00000000, %p7; + selp.f32 %r145, %r121, 0f00000000, %p9; + selp.f32 %r146, %r122, 0f00000000, %p9; + selp.f32 %r147, %r123, 0f00000000, %p9; + selp.f32 %r148, %r124, 0f00000000, %p9; + selp.f32 %r149, %r125, 0f00000000, %p9; + selp.f32 %r150, %r126, 0f00000000, %p9; + selp.f32 %r151, %r127, 0f00000000, %p9; + selp.f32 %r152, %r128, 0f00000000, %p9; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + fma.rn.f32 %r153, %r6, %r30, %r104; + fma.rn.f32 %r154, %r6, %r32, %r153; + fma.rn.f32 %r155, %r6, %r33, %r154; + fma.rn.f32 %r156, %r6, %r38, %r155; + fma.rn.f32 %r157, %r6, %r39, %r156; + fma.rn.f32 %r158, %r6, %r40, %r157; + fma.rn.f32 %r159, %r6, %r41, %r158; + selp.f32 %r160, %r159, 0f00000000, %p3; + add.f32 %r161, %r129, %r160; + add.f32 %r162, %r130, %r161; + add.f32 %r163, %r131, %r162; + add.f32 %r164, %r132, %r163; + add.f32 %r165, %r133, %r164; + add.f32 %r166, %r134, %r165; + add.f32 %r167, %r135, %r166; + add.f32 %r168, %r136, %r167; + add.f32 %r169, %r137, %r168; + add.f32 %r170, %r138, %r169; + add.f32 %r171, %r139, %r170; + add.f32 %r172, %r140, %r171; + add.f32 %r173, %r141, %r172; + add.f32 %r174, %r142, %r173; + add.f32 %r175, %r143, %r174; + add.f32 %r176, %r144, %r175; + add.f32 %r177, %r145, %r176; + add.f32 %r178, %r146, %r177; + add.f32 %r179, %r147, %r178; + add.f32 %r180, %r148, %r179; + add.f32 %r181, %r149, %r180; + add.f32 %r182, %r150, %r181; + add.f32 %r183, %r151, %r182; + add.f32 %r184, %r152, %r183; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r185, %r184, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r186, %r184, %r185; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r187, %r186, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r188, %r186, %r187; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r189, %r188, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r190, %r188, %r189; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r191, %r190, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r192, %r190, %r191; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r193, %r192, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r95, %r192, %r193; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + // begin inline asm + @%p11 st.shared.b32 [ %r94 + 0 ], %r95; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r96, [ %r97 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r194, %r96, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r195, %r96, %r194; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r196, %r195, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r197, %r195, %r196; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r198, %r197, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r199, %r197, %r198; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r200, %r199, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r201, %r199, %r200; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + shfl.sync.bfly.b32 %r202, %r201, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:155:34 ] + add.f32 %r99, %r201, %r202; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:155:34 ] + // begin inline asm + @%p13 st.shared.b32 [ %r97 + 0 ], %r99; + // end inline asm + bar.sync 0; + ld.shared.b32 %r203, [global_smem]; +$L__tmp2: + .loc 1 155 27 // loss.py:155:27 + add.f32 %r668, %r668, %r203; + .loc 1 149 30 // loss.py:149:30 + add.s32 %r669, %r669, 32768; + setp.lt.s32 %p14, %r669, %r19; + @%p14 bra $L__BB0_7; +// %bb.8: // %.lr.ph5.preheader + .loc 1 0 30 // loss.py:0:30 + mov.b64 %rd5, {%r668, %r668}; + mov.b64 %rd6, {%r6, %r6}; + mov.b32 %r209, 0; + mov.b32 %r670, %r209; +$L__BB0_9: // %.lr.ph5 + // =>This Inner Loop Header: Depth=1 + .loc 1 159 22 // loss.py:159:22 + add.s32 %r317, %r8, %r670; + add.s32 %r318, %r317, 8192; + add.s32 %r319, %r317, 16384; + .loc 1 160 25 // loss.py:160:25 + add.s32 %r320, %r317, 24576; + setp.lt.s32 %p15, %r317, %r19; + setp.lt.s32 %p16, %r318, %r19; + setp.lt.s32 %p17, %r319, %r19; + setp.lt.s32 %p18, %r320, %r19; + .loc 1 161 44 // loss.py:161:44 + mad.wide.s32 %rd36, %r317, 2, %rd2; + cvt.s64.s32 %rd52, %r670; + add.s64 %rd53, %rd52, %rd4; + shl.b64 %rd54, %rd53, 1; + add.s64 %rd55, %rd2, %rd54; + add.s64 %rd37, %rd55, 16384; + add.s64 %rd38, %rd55, 32768; + add.s64 %rd39, %rd55, 49152; + .loc 1 161 31 // loss.py:161:31 + // begin inline asm + mov.u32 %r205, %r209; + mov.u32 %r206, %r209; + mov.u32 %r207, %r209; + mov.u32 %r208, %r209; + @%p15 ld.global.v4.b32 { %r205, %r206, %r207, %r208 }, [ %rd36 + 0 ]; + // end inline asm + mov.b32 {%rs3, %rs4}, %r205; + mov.b32 {%rs5, %rs6}, %r206; + mov.b32 {%rs7, %rs8}, %r207; + mov.b32 {%rs9, %rs10}, %r208; + // begin inline asm + mov.u32 %r213, %r209; + mov.u32 %r214, %r209; + mov.u32 %r215, %r209; + mov.u32 %r216, %r209; + @%p16 ld.global.v4.b32 { %r213, %r214, %r215, %r216 }, [ %rd37 + 0 ]; + // end inline asm + mov.b32 {%rs11, %rs12}, %r213; + mov.b32 {%rs13, %rs14}, %r214; + mov.b32 {%rs15, %rs16}, %r215; + mov.b32 {%rs17, %rs18}, %r216; + // begin inline asm + mov.u32 %r221, %r209; + mov.u32 %r222, %r209; + mov.u32 %r223, %r209; + mov.u32 %r224, %r209; + @%p17 ld.global.v4.b32 { %r221, %r222, %r223, %r224 }, [ %rd38 + 0 ]; + // end inline asm + mov.b32 {%rs19, %rs20}, %r221; + mov.b32 {%rs21, %rs22}, %r222; + mov.b32 {%rs23, %rs24}, %r223; + mov.b32 {%rs25, %rs26}, %r224; + // begin inline asm + mov.u32 %r229, %r209; + mov.u32 %r230, %r209; + mov.u32 %r231, %r209; + mov.u32 %r232, %r209; + @%p18 ld.global.v4.b32 { %r229, %r230, %r231, %r232 }, [ %rd39 + 0 ]; + // end inline asm + mov.b32 {%rs27, %rs28}, %r229; + mov.b32 {%rs29, %rs30}, %r230; + mov.b32 {%rs31, %rs32}, %r231; + mov.b32 {%rs33, %rs34}, %r232; + .loc 1 162 12 // loss.py:162:12 + cvt.f32.bf16 %r321, %rs3; + cvt.f32.bf16 %r322, %rs4; + cvt.f32.bf16 %r323, %rs5; + cvt.f32.bf16 %r324, %rs6; + cvt.f32.bf16 %r325, %rs7; + cvt.f32.bf16 %r326, %rs8; + cvt.f32.bf16 %r327, %rs9; + cvt.f32.bf16 %r328, %rs10; + cvt.f32.bf16 %r329, %rs11; + cvt.f32.bf16 %r330, %rs12; + cvt.f32.bf16 %r331, %rs13; + cvt.f32.bf16 %r332, %rs14; + cvt.f32.bf16 %r333, %rs15; + cvt.f32.bf16 %r334, %rs16; + cvt.f32.bf16 %r335, %rs17; + cvt.f32.bf16 %r336, %rs18; + cvt.f32.bf16 %r337, %rs19; + cvt.f32.bf16 %r338, %rs20; + cvt.f32.bf16 %r339, %rs21; + cvt.f32.bf16 %r340, %rs22; + cvt.f32.bf16 %r341, %rs23; + cvt.f32.bf16 %r342, %rs24; + cvt.f32.bf16 %r343, %rs25; + cvt.f32.bf16 %r344, %rs26; + cvt.f32.bf16 %r345, %rs27; + cvt.f32.bf16 %r346, %rs28; + cvt.f32.bf16 %r347, %rs29; + cvt.f32.bf16 %r348, %rs30; + cvt.f32.bf16 %r349, %rs31; + cvt.f32.bf16 %r350, %rs32; + cvt.f32.bf16 %r351, %rs33; + cvt.f32.bf16 %r352, %rs34; + .loc 1 164 44 // loss.py:164:44 + mad.wide.s32 %rd40, %r317, 4, %rd3; + shl.b64 %rd56, %rd53, 2; + add.s64 %rd57, %rd3, %rd56; + add.s64 %rd41, %rd57, 16; + add.s64 %rd42, %rd57, 32768; + add.s64 %rd43, %rd57, 32784; + add.s64 %rd44, %rd57, 65536; + add.s64 %rd45, %rd57, 65552; + add.s64 %rd46, %rd57, 98304; + add.s64 %rd47, %rd57, 98320; + .loc 1 164 31 // loss.py:164:31 + // begin inline asm + mov.u32 %r237, %r209; + mov.u32 %r238, %r209; + mov.u32 %r239, %r209; + mov.u32 %r240, %r209; + @%p15 ld.global.v4.b32 { %r237, %r238, %r239, %r240 }, [ %rd40 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r245, %r209; + mov.u32 %r246, %r209; + mov.u32 %r247, %r209; + mov.u32 %r248, %r209; + @%p15 ld.global.v4.b32 { %r245, %r246, %r247, %r248 }, [ %rd41 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r253, %r209; + mov.u32 %r254, %r209; + mov.u32 %r255, %r209; + mov.u32 %r256, %r209; + @%p16 ld.global.v4.b32 { %r253, %r254, %r255, %r256 }, [ %rd42 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r261, %r209; + mov.u32 %r262, %r209; + mov.u32 %r263, %r209; + mov.u32 %r264, %r209; + @%p16 ld.global.v4.b32 { %r261, %r262, %r263, %r264 }, [ %rd43 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r269, %r209; + mov.u32 %r270, %r209; + mov.u32 %r271, %r209; + mov.u32 %r272, %r209; + @%p17 ld.global.v4.b32 { %r269, %r270, %r271, %r272 }, [ %rd44 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r277, %r209; + mov.u32 %r278, %r209; + mov.u32 %r279, %r209; + mov.u32 %r280, %r209; + @%p17 ld.global.v4.b32 { %r277, %r278, %r279, %r280 }, [ %rd45 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r285, %r209; + mov.u32 %r286, %r209; + mov.u32 %r287, %r209; + mov.u32 %r288, %r209; + @%p18 ld.global.v4.b32 { %r285, %r286, %r287, %r288 }, [ %rd46 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r293, %r209; + mov.u32 %r294, %r209; + mov.u32 %r295, %r209; + mov.u32 %r296, %r209; + @%p18 ld.global.v4.b32 { %r293, %r294, %r295, %r296 }, [ %rd47 + 0 ]; + // end inline asm + .loc 1 167 45 // loss.py:167:45 + sub.f32 %r353, %r321, %r21; + sub.f32 %r354, %r322, %r21; + sub.f32 %r355, %r323, %r21; + sub.f32 %r356, %r324, %r21; + sub.f32 %r357, %r325, %r21; + sub.f32 %r358, %r326, %r21; + sub.f32 %r359, %r327, %r21; + sub.f32 %r360, %r328, %r21; + sub.f32 %r361, %r329, %r21; + sub.f32 %r362, %r330, %r21; + sub.f32 %r363, %r331, %r21; + sub.f32 %r364, %r332, %r21; + sub.f32 %r365, %r333, %r21; + sub.f32 %r366, %r334, %r21; + sub.f32 %r367, %r335, %r21; + sub.f32 %r368, %r336, %r21; + sub.f32 %r369, %r337, %r21; + sub.f32 %r370, %r338, %r21; + sub.f32 %r371, %r339, %r21; + sub.f32 %r372, %r340, %r21; + sub.f32 %r373, %r341, %r21; + sub.f32 %r374, %r342, %r21; + sub.f32 %r375, %r343, %r21; + sub.f32 %r376, %r344, %r21; + sub.f32 %r377, %r345, %r21; + sub.f32 %r378, %r346, %r21; + sub.f32 %r379, %r347, %r21; + sub.f32 %r380, %r348, %r21; + sub.f32 %r381, %r349, %r21; + sub.f32 %r382, %r350, %r21; + sub.f32 %r383, %r351, %r21; + sub.f32 %r384, %r352, %r21; + .loc 1 167 30 // loss.py:167:30 + mul.f32 %r385, %r353, 0f3FB8AA3B; + ex2.approx.f32 %r386, %r385; + mul.f32 %r387, %r354, 0f3FB8AA3B; + ex2.approx.f32 %r388, %r387; + mul.f32 %r389, %r355, 0f3FB8AA3B; + ex2.approx.f32 %r390, %r389; + mul.f32 %r391, %r356, 0f3FB8AA3B; + ex2.approx.f32 %r392, %r391; + mul.f32 %r393, %r357, 0f3FB8AA3B; + ex2.approx.f32 %r394, %r393; + mul.f32 %r395, %r358, 0f3FB8AA3B; + ex2.approx.f32 %r396, %r395; + mul.f32 %r397, %r359, 0f3FB8AA3B; + ex2.approx.f32 %r398, %r397; + mul.f32 %r399, %r360, 0f3FB8AA3B; + ex2.approx.f32 %r400, %r399; + mul.f32 %r401, %r361, 0f3FB8AA3B; + ex2.approx.f32 %r402, %r401; + mul.f32 %r403, %r362, 0f3FB8AA3B; + ex2.approx.f32 %r404, %r403; + mul.f32 %r405, %r363, 0f3FB8AA3B; + ex2.approx.f32 %r406, %r405; + mul.f32 %r407, %r364, 0f3FB8AA3B; + ex2.approx.f32 %r408, %r407; + mul.f32 %r409, %r365, 0f3FB8AA3B; + ex2.approx.f32 %r410, %r409; + mul.f32 %r411, %r366, 0f3FB8AA3B; + ex2.approx.f32 %r412, %r411; + mul.f32 %r413, %r367, 0f3FB8AA3B; + ex2.approx.f32 %r414, %r413; + mul.f32 %r415, %r368, 0f3FB8AA3B; + ex2.approx.f32 %r416, %r415; + mul.f32 %r417, %r369, 0f3FB8AA3B; + ex2.approx.f32 %r418, %r417; + mul.f32 %r419, %r370, 0f3FB8AA3B; + ex2.approx.f32 %r420, %r419; + mul.f32 %r421, %r371, 0f3FB8AA3B; + ex2.approx.f32 %r422, %r421; + mul.f32 %r423, %r372, 0f3FB8AA3B; + ex2.approx.f32 %r424, %r423; + mul.f32 %r425, %r373, 0f3FB8AA3B; + ex2.approx.f32 %r426, %r425; + mul.f32 %r427, %r374, 0f3FB8AA3B; + ex2.approx.f32 %r428, %r427; + mul.f32 %r429, %r375, 0f3FB8AA3B; + ex2.approx.f32 %r430, %r429; + mul.f32 %r431, %r376, 0f3FB8AA3B; + ex2.approx.f32 %r432, %r431; + mul.f32 %r433, %r377, 0f3FB8AA3B; + ex2.approx.f32 %r434, %r433; + mul.f32 %r435, %r378, 0f3FB8AA3B; + ex2.approx.f32 %r436, %r435; + mul.f32 %r437, %r379, 0f3FB8AA3B; + ex2.approx.f32 %r438, %r437; + mul.f32 %r439, %r380, 0f3FB8AA3B; + ex2.approx.f32 %r440, %r439; + mul.f32 %r441, %r381, 0f3FB8AA3B; + ex2.approx.f32 %r442, %r441; + mul.f32 %r443, %r382, 0f3FB8AA3B; + ex2.approx.f32 %r444, %r443; + mul.f32 %r445, %r383, 0f3FB8AA3B; + ex2.approx.f32 %r446, %r445; + mul.f32 %r447, %r384, 0f3FB8AA3B; + ex2.approx.f32 %r448, %r447; + .loc 1 167 50 // loss.py:167:50 + div.full.f32 %r449, %r386, %r22; + div.full.f32 %r450, %r388, %r22; + div.full.f32 %r451, %r390, %r22; + div.full.f32 %r452, %r392, %r22; + div.full.f32 %r453, %r394, %r22; + div.full.f32 %r454, %r396, %r22; + div.full.f32 %r455, %r398, %r22; + div.full.f32 %r456, %r400, %r22; + div.full.f32 %r457, %r402, %r22; + div.full.f32 %r458, %r404, %r22; + div.full.f32 %r459, %r406, %r22; + div.full.f32 %r460, %r408, %r22; + div.full.f32 %r461, %r410, %r22; + div.full.f32 %r462, %r412, %r22; + div.full.f32 %r463, %r414, %r22; + div.full.f32 %r464, %r416, %r22; + div.full.f32 %r465, %r418, %r22; + div.full.f32 %r466, %r420, %r22; + div.full.f32 %r467, %r422, %r22; + div.full.f32 %r468, %r424, %r22; + div.full.f32 %r469, %r426, %r22; + div.full.f32 %r470, %r428, %r22; + div.full.f32 %r471, %r430, %r22; + div.full.f32 %r472, %r432, %r22; + div.full.f32 %r473, %r434, %r22; + div.full.f32 %r474, %r436, %r22; + div.full.f32 %r475, %r438, %r22; + div.full.f32 %r476, %r440, %r22; + div.full.f32 %r477, %r442, %r22; + div.full.f32 %r478, %r444, %r22; + div.full.f32 %r479, %r446, %r22; + div.full.f32 %r480, %r448, %r22; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd58, %r237; + cvt.u64.u32 %rd59, %r238; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r481, %r482}, %rd61; + mov.b64 {%r483, %r484}, %rd6; + mul.f32 %r485, %r483, %r481; + mul.f32 %r486, %r484, %r482; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r487, %r486; + mov.b64 {%r488, %r489}, %rd5; + fma.rn.f32 %r490, %r489, %r450, %r487; + neg.f32 %r491, %r485; + fma.rn.f32 %r492, %r488, %r449, %r491; + add.f32 %r493, %r492, 0f00000000; + add.f32 %r494, %r490, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r301, %r494, %r493; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd62, %r239; + cvt.u64.u32 %rd63, %r240; + shl.b64 %rd64, %rd63, 32; + or.b64 %rd65, %rd62, %rd64; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r495, %r496}, %rd65; + mul.f32 %r497, %r483, %r495; + mul.f32 %r498, %r484, %r496; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r499, %r498; + fma.rn.f32 %r500, %r489, %r452, %r499; + neg.f32 %r501, %r497; + fma.rn.f32 %r502, %r488, %r451, %r501; + add.f32 %r503, %r502, 0f00000000; + add.f32 %r504, %r500, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r302, %r504, %r503; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd66, %r245; + cvt.u64.u32 %rd67, %r246; + shl.b64 %rd68, %rd67, 32; + or.b64 %rd69, %rd66, %rd68; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r505, %r506}, %rd69; + mul.f32 %r507, %r483, %r505; + mul.f32 %r508, %r484, %r506; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r509, %r508; + fma.rn.f32 %r510, %r489, %r454, %r509; + neg.f32 %r511, %r507; + fma.rn.f32 %r512, %r488, %r453, %r511; + add.f32 %r513, %r512, 0f00000000; + add.f32 %r514, %r510, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r303, %r514, %r513; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd70, %r247; + cvt.u64.u32 %rd71, %r248; + shl.b64 %rd72, %rd71, 32; + or.b64 %rd73, %rd70, %rd72; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r515, %r516}, %rd73; + mul.f32 %r517, %r483, %r515; + mul.f32 %r518, %r484, %r516; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r519, %r518; + fma.rn.f32 %r520, %r489, %r456, %r519; + neg.f32 %r521, %r517; + fma.rn.f32 %r522, %r488, %r455, %r521; + add.f32 %r523, %r522, 0f00000000; + add.f32 %r524, %r520, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r304, %r524, %r523; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd74, %r253; + cvt.u64.u32 %rd75, %r254; + shl.b64 %rd76, %rd75, 32; + or.b64 %rd77, %rd74, %rd76; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r525, %r526}, %rd77; + mul.f32 %r527, %r483, %r525; + mul.f32 %r528, %r484, %r526; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r529, %r528; + fma.rn.f32 %r530, %r489, %r458, %r529; + neg.f32 %r531, %r527; + fma.rn.f32 %r532, %r488, %r457, %r531; + add.f32 %r533, %r532, 0f00000000; + add.f32 %r534, %r530, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r305, %r534, %r533; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd78, %r255; + cvt.u64.u32 %rd79, %r256; + shl.b64 %rd80, %rd79, 32; + or.b64 %rd81, %rd78, %rd80; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r535, %r536}, %rd81; + mul.f32 %r537, %r483, %r535; + mul.f32 %r538, %r484, %r536; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r539, %r538; + fma.rn.f32 %r540, %r489, %r460, %r539; + neg.f32 %r541, %r537; + fma.rn.f32 %r542, %r488, %r459, %r541; + add.f32 %r543, %r542, 0f00000000; + add.f32 %r544, %r540, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r306, %r544, %r543; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd82, %r261; + cvt.u64.u32 %rd83, %r262; + shl.b64 %rd84, %rd83, 32; + or.b64 %rd85, %rd82, %rd84; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r545, %r546}, %rd85; + mul.f32 %r547, %r483, %r545; + mul.f32 %r548, %r484, %r546; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r549, %r548; + fma.rn.f32 %r550, %r489, %r462, %r549; + neg.f32 %r551, %r547; + fma.rn.f32 %r552, %r488, %r461, %r551; + add.f32 %r553, %r552, 0f00000000; + add.f32 %r554, %r550, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r307, %r554, %r553; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd86, %r263; + cvt.u64.u32 %rd87, %r264; + shl.b64 %rd88, %rd87, 32; + or.b64 %rd89, %rd86, %rd88; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r555, %r556}, %rd89; + mul.f32 %r557, %r483, %r555; + mul.f32 %r558, %r484, %r556; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r559, %r558; + fma.rn.f32 %r560, %r489, %r464, %r559; + neg.f32 %r561, %r557; + fma.rn.f32 %r562, %r488, %r463, %r561; + add.f32 %r563, %r562, 0f00000000; + add.f32 %r564, %r560, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r308, %r564, %r563; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd90, %r269; + cvt.u64.u32 %rd91, %r270; + shl.b64 %rd92, %rd91, 32; + or.b64 %rd93, %rd90, %rd92; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r565, %r566}, %rd93; + mul.f32 %r567, %r483, %r565; + mul.f32 %r568, %r484, %r566; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r569, %r568; + fma.rn.f32 %r570, %r489, %r466, %r569; + neg.f32 %r571, %r567; + fma.rn.f32 %r572, %r488, %r465, %r571; + add.f32 %r573, %r572, 0f00000000; + add.f32 %r574, %r570, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r309, %r574, %r573; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd94, %r271; + cvt.u64.u32 %rd95, %r272; + shl.b64 %rd96, %rd95, 32; + or.b64 %rd97, %rd94, %rd96; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r575, %r576}, %rd97; + mul.f32 %r577, %r483, %r575; + mul.f32 %r578, %r484, %r576; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r579, %r578; + fma.rn.f32 %r580, %r489, %r468, %r579; + neg.f32 %r581, %r577; + fma.rn.f32 %r582, %r488, %r467, %r581; + add.f32 %r583, %r582, 0f00000000; + add.f32 %r584, %r580, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r310, %r584, %r583; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd98, %r277; + cvt.u64.u32 %rd99, %r278; + shl.b64 %rd100, %rd99, 32; + or.b64 %rd101, %rd98, %rd100; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r585, %r586}, %rd101; + mul.f32 %r587, %r483, %r585; + mul.f32 %r588, %r484, %r586; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r589, %r588; + fma.rn.f32 %r590, %r489, %r470, %r589; + neg.f32 %r591, %r587; + fma.rn.f32 %r592, %r488, %r469, %r591; + add.f32 %r593, %r592, 0f00000000; + add.f32 %r594, %r590, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r311, %r594, %r593; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd102, %r279; + cvt.u64.u32 %rd103, %r280; + shl.b64 %rd104, %rd103, 32; + or.b64 %rd105, %rd102, %rd104; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r595, %r596}, %rd105; + mul.f32 %r597, %r483, %r595; + mul.f32 %r598, %r484, %r596; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r599, %r598; + fma.rn.f32 %r600, %r489, %r472, %r599; + neg.f32 %r601, %r597; + fma.rn.f32 %r602, %r488, %r471, %r601; + add.f32 %r603, %r602, 0f00000000; + add.f32 %r604, %r600, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r312, %r604, %r603; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd106, %r285; + cvt.u64.u32 %rd107, %r286; + shl.b64 %rd108, %rd107, 32; + or.b64 %rd109, %rd106, %rd108; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r605, %r606}, %rd109; + mul.f32 %r607, %r483, %r605; + mul.f32 %r608, %r484, %r606; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r609, %r608; + fma.rn.f32 %r610, %r489, %r474, %r609; + neg.f32 %r611, %r607; + fma.rn.f32 %r612, %r488, %r473, %r611; + add.f32 %r613, %r612, 0f00000000; + add.f32 %r614, %r610, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r313, %r614, %r613; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd110, %r287; + cvt.u64.u32 %rd111, %r288; + shl.b64 %rd112, %rd111, 32; + or.b64 %rd113, %rd110, %rd112; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r615, %r616}, %rd113; + mul.f32 %r617, %r483, %r615; + mul.f32 %r618, %r484, %r616; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r619, %r618; + fma.rn.f32 %r620, %r489, %r476, %r619; + neg.f32 %r621, %r617; + fma.rn.f32 %r622, %r488, %r475, %r621; + add.f32 %r623, %r622, 0f00000000; + add.f32 %r624, %r620, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r314, %r624, %r623; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd114, %r293; + cvt.u64.u32 %rd115, %r294; + shl.b64 %rd116, %rd115, 32; + or.b64 %rd117, %rd114, %rd116; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r625, %r626}, %rd117; + mul.f32 %r627, %r483, %r625; + mul.f32 %r628, %r484, %r626; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r629, %r628; + fma.rn.f32 %r630, %r489, %r478, %r629; + neg.f32 %r631, %r627; + fma.rn.f32 %r632, %r488, %r477, %r631; + add.f32 %r633, %r632, 0f00000000; + add.f32 %r634, %r630, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r315, %r634, %r633; + .loc 1 164 31 // loss.py:164:31 + cvt.u64.u32 %rd118, %r295; + cvt.u64.u32 %rd119, %r296; + shl.b64 %rd120, %rd119, 32; + or.b64 %rd121, %rd118, %rd120; + .loc 1 169 38 // loss.py:169:38 + mov.b64 {%r635, %r636}, %rd121; + mul.f32 %r637, %r483, %r635; + mul.f32 %r638, %r484, %r636; + .loc 1 169 23 // loss.py:169:23 + neg.f32 %r639, %r638; + fma.rn.f32 %r640, %r489, %r480, %r639; + neg.f32 %r641, %r637; + fma.rn.f32 %r642, %r488, %r479, %r641; + add.f32 %r643, %r642, 0f00000000; + add.f32 %r644, %r640, 0f00000000; + .loc 1 170 39 // loss.py:170:39 + cvt.rn.bf16x2.f32 %r316, %r644, %r643; + // begin inline asm + @%p15 st.global.v4.b32 [ %rd36 + 0 ], { %r301, %r302, %r303, %r304 }; + // end inline asm + // begin inline asm + @%p16 st.global.v4.b32 [ %rd37 + 0 ], { %r305, %r306, %r307, %r308 }; + // end inline asm + // begin inline asm + @%p17 st.global.v4.b32 [ %rd38 + 0 ], { %r309, %r310, %r311, %r312 }; + // end inline asm + // begin inline asm + @%p18 st.global.v4.b32 [ %rd39 + 0 ], { %r313, %r314, %r315, %r316 }; + // end inline asm + .loc 1 158 30 // loss.py:158:30 + add.s32 %r670, %r670, 32768; + setp.lt.s32 %p31, %r670, %r19; + @%p31 bra $L__BB0_9; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 134 34 // loss.py:134:34 + setp.lt.s32 %p32, %r19, 1; + @%p32 bra $L__BB0_4; +// %bb.2: // %.lr.ph6.preheader + .loc 1 0 34 // loss.py:0:34 + mov.u32 %r645, %tid.x; + shl.b32 %r1, %r645, 3; + mov.b32 %r647, 0; + mov.b32 %r667, %r647; +$L__BB0_3: // %.lr.ph6 + // =>This Inner Loop Header: Depth=1 + .loc 1 135 26 // loss.py:135:26 + add.s32 %r663, %r1, %r667; + add.s32 %r664, %r663, 8192; + add.s32 %r665, %r663, 16384; + .loc 1 136 29 // loss.py:136:29 + add.s32 %r666, %r663, 24576; + setp.lt.s32 %p33, %r663, %r19; + setp.lt.s32 %p34, %r664, %r19; + setp.lt.s32 %p35, %r665, %r19; + setp.lt.s32 %p36, %r666, %r19; + .loc 1 137 34 // loss.py:137:34 + mad.wide.s32 %rd122, %r663, 2, %rd2; + add.s64 %rd123, %rd122, 16384; + add.s64 %rd124, %rd122, 32768; + add.s64 %rd125, %rd122, 49152; + .loc 1 137 43 // loss.py:137:43 + // begin inline asm + @%p33 st.global.v4.b32 [ %rd122 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p34 st.global.v4.b32 [ %rd123 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p35 st.global.v4.b32 [ %rd124 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + // begin inline asm + @%p36 st.global.v4.b32 [ %rd125 + 0 ], { %r647, %r647, %r647, %r647 }; + // end inline asm + .loc 1 134 34 // loss.py:134:34 + add.s32 %r667, %r667, 32768; + setp.lt.s32 %p37, %r667, %r19; + @%p37 bra $L__BB0_3; +$L__BB0_4: // %.loopexit + .loc 1 138 8 // loss.py:138:8 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/specforge/core/loss.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 153 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x92 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 108 // DW_AT_name +.b8 111 +.b8 115 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 115 +.b8 112 +.b8 101 +.b8 99 +.b8 102 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 114 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x50:0x1e DW_TAG_subprogram +.b8 108 // DW_AT_name +.b8 111 +.b8 103 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 98 +.b8 97 +.b8 99 +.b8 107 +.b8 119 +.b8 97 +.b8 114 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x6e:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 80 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x83:0x18 DW_TAG_inlined_subroutine +.b32 80 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 155 // DW_AT_call_line +.b8 34 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source new file mode 100644 index 0000000000000000000000000000000000000000..a4cd6994ae4e49ea89eadbe375c5bf395e5b9573 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source @@ -0,0 +1,292 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc17 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc63 = loc("logits_ptr"(#loc)) +#loc64 = loc("logits_stride"(#loc)) +#loc65 = loc("target_ptr"(#loc)) +#loc66 = loc("target_stride"(#loc)) +#loc67 = loc("position_mask_ptr"(#loc)) +#loc68 = loc("grad_output_ptr"(#loc)) +#loc69 = loc("scaling_factor"(#loc)) +#loc70 = loc("m_ptr"(#loc)) +#loc71 = loc("d_ptr"(#loc)) +#loc72 = loc("n_cols"(#loc)) +#loc116 = loc("input"(#loc55)) +#loc117 = loc("a"(#loc59)) +#loc118 = loc("b"(#loc59)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %program_id = tt.get_program_id x : i32 loc(#loc73) + %program_id_0 = arith.extsi %program_id : i32 to i64 loc(#loc74) + %logits_ptr_1 = arith.extsi %logits_stride : i32 to i64 loc(#loc75) + %logits_ptr_2 = arith.muli %program_id_0, %logits_ptr_1 : i64 loc(#loc75) + %logits_ptr_3 = tt.addptr %logits_ptr, %logits_ptr_2 : !tt.ptr, i64 loc(#loc76) + %target_ptr_4 = arith.extsi %target_stride : i32 to i64 loc(#loc77) + %target_ptr_5 = arith.muli %program_id_0, %target_ptr_4 : i64 loc(#loc77) + %target_ptr_6 = tt.addptr %target_ptr, %target_ptr_5 : !tt.ptr, i64 loc(#loc78) + %position_mask_ptr_7 = tt.addptr %position_mask_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc79) + %position_mask = tt.bitcast %position_mask_ptr_7 : !tt.ptr -> !tt.ptr loc(#loc80) + %position_mask_8 = tt.load %position_mask : !tt.ptr loc(#loc80) + %position_mask_9 = arith.constant 0 : i8 loc(#loc80) + %position_mask_10 = arith.cmpi ne, %position_mask_8, %position_mask_9 : i8 loc(#loc80) + %c0_i32 = arith.constant 0 : i32 loc(#loc9) + %0 = arith.extui %position_mask_10 : i1 to i32 loc(#loc9) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc9) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc9) + ^bb1: // pred: ^bb0 + %c0_i32_11 = arith.constant 0 : i32 loc(#loc10) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc10) + %2 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc10) + %3 = arith.bitcast %n_cols : i32 to i32 loc(#loc10) + %4 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc10) + %5 = ub.poison : i32 loc(#loc10) + scf.for %i = %2 to %3 step %4 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc81) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc82) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc82) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc82) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc82) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc82) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc82) + %offsets_28 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc82) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc82) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc82) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc82) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc83) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc83) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc14) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc14) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc15) + %cst_33 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc15) + %16 = arith.truncf %cst_33 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc15) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc15) + } loc(#loc10) + tt.return loc(#loc16) + ^bb2: // pred: ^bb0 + cf.br ^bb4 loc(#loc17) + ^bb3: // no predecessors + cf.br ^bb4 loc(#loc17) + ^bb4: // 2 preds: ^bb2, ^bb3 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc84) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc85) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc86) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc87) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc88) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc89) + %target_grad_sum = arith.constant 0.000000e+00 : f32 loc(#loc90) + %c0_i32_15 = arith.constant 0 : i32 loc(#loc25) + %c32768_i32_16 = arith.constant 32768 : i32 loc(#loc25) + %6 = arith.bitcast %c0_i32_15 : i32 to i32 loc(#loc25) + %7 = arith.bitcast %n_cols : i32 to i32 loc(#loc25) + %8 = arith.bitcast %c32768_i32_16 : i32 to i32 loc(#loc25) + %9 = ub.poison : i32 loc(#loc25) + %target_grad_sum_17 = scf.for %i = %6 to %7 step %8 iter_args(%target_grad_sum_20 = %target_grad_sum) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_21 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_22 = arith.extsi %offsets_21 : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_23 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_24 = arith.addi %offsets_22, %offsets_23 : tensor<32768xi64> loc(#loc93) + %offsets_25 = arith.constant 2147483647 : i64 loc(#loc93) + %offsets_26 = arith.constant -2147483648 : i64 loc(#loc93) + %offsets_27 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc93) + %offsets_28 = arith.cmpi sle, %offsets_24, %offsets_27 : tensor<32768xi64> loc(#loc93) + %offsets_29 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc93) + %offsets_30 = arith.cmpi sge, %offsets_24, %offsets_29 : tensor<32768xi64> loc(#loc93) + %offsets_31 = arith.andi %offsets_28, %offsets_30 : tensor<32768xi1> loc(#loc93) + %offsets_32 = arith.addi %offsets_21, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_33 = arith.cmpi slt, %offsets_32, %mask : tensor<32768xi32> loc(#loc94) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %target_block_34 = tt.addptr %target_block, %offsets_32 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %target_block_35 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %target_block_36 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc96) + %target_block_37 = tt.load %target_block_34, %mask_33, %target_block_36 : tensor<32768x!tt.ptr> loc(#loc96) + %target_grad_sum_38 = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc97) + %target_grad_sum_39 = arith.mulf %target_block_37, %target_grad_sum_38 : tensor<32768xf32> loc(#loc97) + %target_grad_sum_40 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_41 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_42 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc98) + %target_grad_sum_43 = arith.select %mask_33, %target_grad_sum_39, %target_grad_sum_42 : tensor<32768xi1>, tensor<32768xf32> loc(#loc98) + %target_grad_sum_44 = tt.call @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%target_grad_sum_43) : (tensor<32768xf32>) -> f32 loc(#loc99) + %target_grad_sum_45 = arith.addf %target_grad_sum_20, %target_grad_sum_44 : f32 loc(#loc100) + scf.yield %target_grad_sum_45 : f32 loc(#loc35) + } loc(#loc91) + %c0_i32_18 = arith.constant 0 : i32 loc(#loc36) + %c32768_i32_19 = arith.constant 32768 : i32 loc(#loc36) + %10 = arith.bitcast %c0_i32_18 : i32 to i32 loc(#loc36) + %11 = arith.bitcast %n_cols : i32 to i32 loc(#loc36) + %12 = arith.bitcast %c32768_i32_19 : i32 to i32 loc(#loc36) + %13 = ub.poison : i32 loc(#loc36) + scf.for %i = %10 to %11 step %12 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc101) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc102) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc102) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc102) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc102) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc102) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc102) + %offsets_28 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc102) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc102) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc102) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc102) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc103) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc103) + %logits_block = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc104) + %logits_block_33 = tt.addptr %logits_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc104) + %logits_block_34 = arith.constant 0.000000e+00 : f32 loc(#loc105) + %logits_block_35 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc105) + %logits_block_36 = arith.truncf %logits_block_35 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc105) + %logits_block_37 = tt.load %logits_block_33, %mask_32, %logits_block_36 : tensor<32768x!tt.ptr> loc(#loc105) + %logits_block_38 = arith.extf %logits_block_37 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc106) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc107) + %target_block_39 = tt.addptr %target_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc107) + %target_block_40 = arith.constant 0.000000e+00 : f32 loc(#loc108) + %target_block_41 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc108) + %target_block_42 = tt.load %target_block_39, %mask_32, %target_block_41 : tensor<32768x!tt.ptr> loc(#loc108) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc109) + %softmax_prob_43 = arith.subf %logits_block_38, %softmax_prob : tensor<32768xf32> loc(#loc109) + %softmax_prob_44 = math.exp %softmax_prob_43 : tensor<32768xf32> loc(#loc110) + %softmax_prob_45 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc111) + %softmax_prob_46 = arith.divf %softmax_prob_44, %softmax_prob_45 : tensor<32768xf32> loc(#loc111) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32> loc(#loc112) + %normalized_grad_47 = arith.mulf %softmax_prob_46, %normalized_grad : tensor<32768xf32> loc(#loc112) + %grad_block = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc113) + %grad_block_48 = arith.mulf %target_block_42, %grad_block : tensor<32768xf32> loc(#loc113) + %grad_block_49 = arith.subf %grad_block_48, %normalized_grad_47 : tensor<32768xf32> loc(#loc114) + %grad_block_50 = arith.constant 0.000000e+00 : f32 loc(#loc115) + %grad_block_51 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc115) + %grad_block_52 = arith.subf %grad_block_51, %grad_block_49 : tensor<32768xf32> loc(#loc115) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc52) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc52) + %16 = arith.truncf %grad_block_52 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc36) + tt.return loc(#loc54) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32768xf32> loc("input"(#loc55))) -> f32 attributes {noinline = false} { + %0 = tt.reshape %input allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc56) + %1 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %3 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc56) + tt.reduce.return %3 : f32 loc(#loc56) + }) : (tensor<32768xf32>) -> f32 loc(#loc56) + tt.return %1 : f32 loc(#loc57) + ^bb1: // no predecessors + %2 = ub.poison : f32 loc(#loc58) + tt.return %2 : f32 loc(#loc58) + } loc(#loc55) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc59)), %b: f32 loc("b"(#loc59))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc60) + tt.return %0 : f32 loc(#loc61) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc62) + tt.return %1 : f32 loc(#loc62) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":148:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc73 = loc("program_id"(#loc1)) +#loc74 = loc("program_id"(#loc2)) +#loc75 = loc("logits_ptr"(#loc3)) +#loc76 = loc("logits_ptr"(#loc4)) +#loc77 = loc("target_ptr"(#loc5)) +#loc78 = loc("target_ptr"(#loc6)) +#loc79 = loc("position_mask_ptr"(#loc7)) +#loc80 = loc("position_mask"(#loc8)) +#loc81 = loc("offsets"(#loc11)) +#loc82 = loc("offsets"(#loc12)) +#loc83 = loc("mask"(#loc13)) +#loc84 = loc("m_ptr"(#loc18)) +#loc85 = loc("d_ptr"(#loc19)) +#loc86 = loc("m"(#loc20)) +#loc87 = loc("d"(#loc21)) +#loc88 = loc("grad_output"(#loc22)) +#loc89 = loc("grad_output"(#loc23)) +#loc90 = loc("target_grad_sum"(#loc24)) +#loc91 = loc("target_grad_sum"(#loc25)) +#loc92 = loc("offsets"(#loc26)) +#loc93 = loc("offsets"(#loc27)) +#loc94 = loc("mask"(#loc28)) +#loc95 = loc("target_block"(#loc29)) +#loc96 = loc("target_block"(#loc30)) +#loc97 = loc("target_grad_sum"(#loc31)) +#loc98 = loc("target_grad_sum"(#loc32)) +#loc99 = loc("target_grad_sum"(#loc33)) +#loc100 = loc("target_grad_sum"(#loc34)) +#loc101 = loc("offsets"(#loc37)) +#loc102 = loc("offsets"(#loc38)) +#loc103 = loc("mask"(#loc39)) +#loc104 = loc("logits_block"(#loc40)) +#loc105 = loc("logits_block"(#loc41)) +#loc106 = loc("logits_block"(#loc42)) +#loc107 = loc("target_block"(#loc43)) +#loc108 = loc("target_block"(#loc44)) +#loc109 = loc("softmax_prob"(#loc45)) +#loc110 = loc("softmax_prob"(#loc46)) +#loc111 = loc("softmax_prob"(#loc47)) +#loc112 = loc("normalized_grad"(#loc48)) +#loc113 = loc("grad_block"(#loc49)) +#loc114 = loc("grad_block"(#loc50)) +#loc115 = loc("grad_block"(#loc51)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ddf881e117876196c4d5005c7f734e00878bc4e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir @@ -0,0 +1,199 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc54 = loc("logits_ptr"(#loc)) +#loc55 = loc("logits_stride"(#loc)) +#loc56 = loc("target_ptr"(#loc)) +#loc57 = loc("target_stride"(#loc)) +#loc58 = loc("position_mask_ptr"(#loc)) +#loc59 = loc("grad_output_ptr"(#loc)) +#loc60 = loc("scaling_factor"(#loc)) +#loc61 = loc("m_ptr"(#loc)) +#loc62 = loc("d_ptr"(#loc)) +#loc63 = loc("n_cols"(#loc)) +#loc89 = loc("target_grad_sum"(#loc33)) +#loc106 = loc(callsite(#loc1 at #loc89)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xf32, #blocked> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c0_i8 = arith.constant 0 : i8 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32768xbf16, #blocked> loc(#loc1) + %program_id = tt.get_program_id x : i32 loc(#loc64) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc65) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc66) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc66) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc67) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc68) + %target_ptr_7 = arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc68) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc69) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc70) + %position_mask = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc71) + %position_mask_10 = tt.load %position_mask : !tt.ptr loc(#loc71) + %position_mask_11 = arith.cmpi ne, %position_mask_10, %c0_i8 : i8 loc(#loc71) + %0 = arith.extui %position_mask_11 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc72) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc73) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc13) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc74) + %offsets_20 = arith.addi %offsets_19, %offsets : tensor<32768xi32, #blocked> loc(#loc74) + %mask_21 = arith.cmpi slt, %offsets_20, %mask : tensor<32768xi32, #blocked> loc(#loc73) + %3 = tt.addptr %2, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc13) + tt.store %3, %cst_1, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc16) + } loc(#loc14) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc75) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc77) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc78) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc79) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc80) + %offsets_15 = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc81) + %mask_16 = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc82) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc83) + %target_grad_sum = tt.splat %grad_output_14 : f32 -> tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_17 = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_19 = %cst_0) -> (f32) : i32 { + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc86) + %offsets_21 = arith.addi %offsets_20, %offsets_15 : tensor<32768xi32, #blocked> loc(#loc86) + %mask_22 = arith.cmpi slt, %offsets_21, %mask_16 : tensor<32768xi32, #blocked> loc(#loc82) + %target_block_23 = tt.addptr %target_block, %offsets_21 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc83) + %target_block_24 = tt.load %target_block_23, %mask_22, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc87) + %target_grad_sum_25 = arith.mulf %target_block_24, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_26 = arith.select %mask_22, %target_grad_sum_25, %cst : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc88) + %target_grad_sum_27 = tt.reshape %target_grad_sum_26 allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc105) + %target_grad_sum_28 = "tt.reduce"(%target_grad_sum_27) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_30: f32 loc(callsite(#loc1 at #loc89)), %target_grad_sum_31: f32 loc(callsite(#loc1 at #loc89))): + %target_grad_sum_32 = arith.addf %target_grad_sum_30, %target_grad_sum_31 : f32 loc(#loc107) + tt.reduce.return %target_grad_sum_32 : f32 loc(#loc105) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc105) + %target_grad_sum_29 = arith.addf %target_grad_sum_19, %target_grad_sum_28 : f32 loc(#loc90) + scf.yield %target_grad_sum_29 : f32 loc(#loc36) + } loc(#loc85) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc91) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_18 = tt.splat %d : f32 -> tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32, #blocked> loc(#loc94) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc95) + %offsets_20 = arith.addi %offsets_19, %offsets_15 : tensor<32768xi32, #blocked> loc(#loc95) + %mask_21 = arith.cmpi slt, %offsets_20, %mask_16 : tensor<32768xi32, #blocked> loc(#loc96) + %logits_block_22 = tt.addptr %logits_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc91) + %logits_block_23 = tt.load %logits_block_22, %mask_21, %cst_1 : tensor<32768x!tt.ptr, #blocked> loc(#loc97) + %logits_block_24 = arith.extf %logits_block_23 : tensor<32768xbf16, #blocked> to tensor<32768xf32, #blocked> loc(#loc98) + %target_block_25 = tt.addptr %target_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc99) + %target_block_26 = tt.load %target_block_25, %mask_21, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc100) + %softmax_prob_27 = arith.subf %logits_block_24, %softmax_prob : tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_28 = math.exp %softmax_prob_27 : tensor<32768xf32, #blocked> loc(#loc101) + %softmax_prob_29 = arith.divf %softmax_prob_28, %softmax_prob_18 : tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad_30 = arith.mulf %softmax_prob_29, %normalized_grad : tensor<32768xf32, #blocked> loc(#loc94) + %grad_block = arith.mulf %target_block_26, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc102) + %grad_block_31 = arith.subf %grad_block, %normalized_grad_30 : tensor<32768xf32, #blocked> loc(#loc103) + %grad_block_32 = arith.subf %cst, %grad_block_31 : tensor<32768xf32, #blocked> loc(#loc104) + %3 = arith.truncf %grad_block_32 : tensor<32768xf32, #blocked> to tensor<32768xbf16, #blocked> loc(#loc52) + tt.store %logits_block_22, %3, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc52) + } loc(#loc41) + tt.return loc(#loc53) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc64 = loc("program_id"(#loc2)) +#loc65 = loc("program_id"(#loc3)) +#loc66 = loc("logits_ptr"(#loc4)) +#loc67 = loc("logits_ptr"(#loc5)) +#loc68 = loc("target_ptr"(#loc6)) +#loc69 = loc("target_ptr"(#loc7)) +#loc70 = loc("position_mask_ptr"(#loc8)) +#loc71 = loc("position_mask"(#loc9)) +#loc72 = loc("offsets"(#loc11)) +#loc73 = loc("mask"(#loc12)) +#loc74 = loc("offsets"(#loc15)) +#loc75 = loc("m_ptr"(#loc18)) +#loc76 = loc("d_ptr"(#loc19)) +#loc77 = loc("m"(#loc20)) +#loc78 = loc("d"(#loc21)) +#loc79 = loc("grad_output"(#loc22)) +#loc80 = loc("grad_output"(#loc23)) +#loc81 = loc("offsets"(#loc24)) +#loc82 = loc("mask"(#loc25)) +#loc83 = loc("target_block"(#loc26)) +#loc84 = loc("target_grad_sum"(#loc27)) +#loc85 = loc("target_grad_sum"(#loc28)) +#loc86 = loc("offsets"(#loc29)) +#loc87 = loc("target_block"(#loc30)) +#loc88 = loc("target_grad_sum"(#loc31)) +#loc90 = loc("target_grad_sum"(#loc35)) +#loc91 = loc("logits_block"(#loc37)) +#loc92 = loc("softmax_prob"(#loc38)) +#loc93 = loc("softmax_prob"(#loc39)) +#loc94 = loc("normalized_grad"(#loc40)) +#loc95 = loc("offsets"(#loc42)) +#loc96 = loc("mask"(#loc43)) +#loc97 = loc("logits_block"(#loc44)) +#loc98 = loc("logits_block"(#loc45)) +#loc99 = loc("target_block"(#loc46)) +#loc100 = loc("target_block"(#loc47)) +#loc101 = loc("softmax_prob"(#loc48)) +#loc102 = loc("grad_block"(#loc49)) +#loc103 = loc("grad_block"(#loc50)) +#loc104 = loc("grad_block"(#loc51)) +#loc105 = loc(callsite(#loc32 at #loc89)) +#loc107 = loc(callsite(#loc34 at #loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b5bc4459f3fc35c5d5c780663109ce8a4c361d3d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir @@ -0,0 +1,203 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc55 = loc("logits_ptr"(#loc)) +#loc56 = loc("logits_stride"(#loc)) +#loc57 = loc("target_ptr"(#loc)) +#loc58 = loc("target_stride"(#loc)) +#loc59 = loc("position_mask_ptr"(#loc)) +#loc60 = loc("grad_output_ptr"(#loc)) +#loc61 = loc("scaling_factor"(#loc)) +#loc62 = loc("m_ptr"(#loc)) +#loc63 = loc("d_ptr"(#loc)) +#loc64 = loc("n_cols"(#loc)) +#loc90 = loc("target_grad_sum"(#loc33)) +#loc108 = loc(callsite(#loc1 at #loc90)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xbf16> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %position_mask = arith.constant 0 : i8 loc(#loc65) + %program_id = tt.get_program_id x : i32 loc(#loc66) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc67) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc68) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc68) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc69) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc70) + %target_ptr_7 = arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc70) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc71) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc72) + %position_mask_10 = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc65) + %position_mask_11 = tt.load %position_mask_10 : !tt.ptr loc(#loc65) + %position_mask_12 = arith.cmpi ne, %position_mask_11, %position_mask : i8 loc(#loc65) + %0 = arith.extui %position_mask_12 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc73) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc74) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc74) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc75) + %mask_18 = arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc75) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc15) + %3 = tt.addptr %2, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc15) + tt.store %3, %cst, %mask_18 : tensor<32768x!tt.ptr> loc(#loc16) + } loc(#loc11) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_13 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %d_ptr_14 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc77) + %m = tt.load %m_ptr_13 : !tt.ptr loc(#loc78) + %d = tt.load %d_ptr_14 : !tt.ptr loc(#loc79) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc80) + %grad_output_15 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc81) + %target_grad_sum = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_16 = %cst_1) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc83) + %offsets_17 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc84) + %offsets_18 = arith.addi %offsets_17, %offsets : tensor<32768xi32> loc(#loc84) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc85) + %mask_19 = arith.cmpi slt, %offsets_18, %mask : tensor<32768xi32> loc(#loc85) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc86) + %target_block_20 = tt.addptr %target_block, %offsets_18 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc86) + %target_block_21 = tt.load %target_block_20, %mask_19, %cst_0 : tensor<32768x!tt.ptr> loc(#loc87) + %target_grad_sum_22 = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc88) + %target_grad_sum_23 = arith.mulf %target_block_21, %target_grad_sum_22 : tensor<32768xf32> loc(#loc88) + %target_grad_sum_24 = arith.select %mask_19, %target_grad_sum_23, %cst_0 : tensor<32768xi1>, tensor<32768xf32> loc(#loc89) + %target_grad_sum_25 = tt.reshape %target_grad_sum_24 allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc107) + %target_grad_sum_26 = "tt.reduce"(%target_grad_sum_25) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_28: f32 loc(callsite(#loc1 at #loc90)), %target_grad_sum_29: f32 loc(callsite(#loc1 at #loc90))): + %target_grad_sum_30 = arith.addf %target_grad_sum_28, %target_grad_sum_29 : f32 loc(#loc109) + tt.reduce.return %target_grad_sum_30 : f32 loc(#loc107) + }) : (tensor<32768xf32>) -> f32 loc(#loc107) + %target_grad_sum_27 = arith.addf %target_grad_sum_16, %target_grad_sum_26 : f32 loc(#loc91) + scf.yield %target_grad_sum_27 : f32 loc(#loc36) + } loc(#loc82) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_18 = arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc94) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %logits_block_19 = tt.addptr %logits_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %logits_block_20 = tt.load %logits_block_19, %mask_18, %cst : tensor<32768x!tt.ptr> loc(#loc96) + %logits_block_21 = arith.extf %logits_block_20 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc97) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc98) + %target_block_22 = tt.addptr %target_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc98) + %target_block_23 = tt.load %target_block_22, %mask_18, %cst_0 : tensor<32768x!tt.ptr> loc(#loc99) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc100) + %softmax_prob_24 = arith.subf %logits_block_21, %softmax_prob : tensor<32768xf32> loc(#loc100) + %softmax_prob_25 = math.exp %softmax_prob_24 : tensor<32768xf32> loc(#loc101) + %softmax_prob_26 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc102) + %softmax_prob_27 = arith.divf %softmax_prob_25, %softmax_prob_26 : tensor<32768xf32> loc(#loc102) + %normalized_grad = tt.splat %target_grad_sum : f32 -> tensor<32768xf32> loc(#loc103) + %normalized_grad_28 = arith.mulf %softmax_prob_27, %normalized_grad : tensor<32768xf32> loc(#loc103) + %grad_block = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc104) + %grad_block_29 = arith.mulf %target_block_23, %grad_block : tensor<32768xf32> loc(#loc104) + %grad_block_30 = arith.subf %grad_block_29, %normalized_grad_28 : tensor<32768xf32> loc(#loc105) + %grad_block_31 = arith.subf %cst_0, %grad_block_30 : tensor<32768xf32> loc(#loc106) + %2 = arith.truncf %grad_block_31 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %logits_block_19, %2, %mask_18 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc37) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc65 = loc("position_mask"(#loc2)) +#loc66 = loc("program_id"(#loc3)) +#loc67 = loc("program_id"(#loc4)) +#loc68 = loc("logits_ptr"(#loc5)) +#loc69 = loc("logits_ptr"(#loc6)) +#loc70 = loc("target_ptr"(#loc7)) +#loc71 = loc("target_ptr"(#loc8)) +#loc72 = loc("position_mask_ptr"(#loc9)) +#loc73 = loc("offsets"(#loc12)) +#loc74 = loc("offsets"(#loc13)) +#loc75 = loc("mask"(#loc14)) +#loc76 = loc("m_ptr"(#loc18)) +#loc77 = loc("d_ptr"(#loc19)) +#loc78 = loc("m"(#loc20)) +#loc79 = loc("d"(#loc21)) +#loc80 = loc("grad_output"(#loc22)) +#loc81 = loc("grad_output"(#loc23)) +#loc82 = loc("target_grad_sum"(#loc24)) +#loc83 = loc("offsets"(#loc25)) +#loc84 = loc("offsets"(#loc26)) +#loc85 = loc("mask"(#loc27)) +#loc86 = loc("target_block"(#loc28)) +#loc87 = loc("target_block"(#loc29)) +#loc88 = loc("target_grad_sum"(#loc30)) +#loc89 = loc("target_grad_sum"(#loc31)) +#loc91 = loc("target_grad_sum"(#loc35)) +#loc92 = loc("offsets"(#loc38)) +#loc93 = loc("offsets"(#loc39)) +#loc94 = loc("mask"(#loc40)) +#loc95 = loc("logits_block"(#loc41)) +#loc96 = loc("logits_block"(#loc42)) +#loc97 = loc("logits_block"(#loc43)) +#loc98 = loc("target_block"(#loc44)) +#loc99 = loc("target_block"(#loc45)) +#loc100 = loc("softmax_prob"(#loc46)) +#loc101 = loc("softmax_prob"(#loc47)) +#loc102 = loc("softmax_prob"(#loc48)) +#loc103 = loc("normalized_grad"(#loc49)) +#loc104 = loc("grad_block"(#loc50)) +#loc105 = loc("grad_block"(#loc51)) +#loc106 = loc("grad_block"(#loc52)) +#loc107 = loc(callsite(#loc32 at #loc90)) +#loc109 = loc(callsite(#loc34 at #loc107)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..95e852e855fbf223551b1db1c983e8bb6524359b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..4db09f187f871b8e6051ad0d25995fbc4ab0363c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..662886a077c39e0c593a49265f74272afae8679c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "8ac911daa2a533d701fdd1240df5c99a179eb64893d2fcd04f63beccf895f168", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..4d3e7ed3c003af7158293be2a2ebe4bd6ee27412 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir @@ -0,0 +1,215 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 504, !dbg !9 + %12 = lshr exact i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !11 + %15 = and i32 %14, 56, !dbg !11 + %16 = sdiv i32 %13, 2048, !dbg !12 + %17 = mul i32 %16, 2048, !dbg !13 + %.decomposed = sub i32 %13, %17, !dbg !13 + %18 = srem i32 %16, 32, !dbg !14 + %19 = sdiv i32 %13, 65536, !dbg !15 + %20 = shl nsw i32 %18, 7, !dbg !16 + %21 = shl nsw i32 %.decomposed, 12, !dbg !17 + %22 = shl i32 %19, 23, !dbg !18 + %23 = shl i32 %13, 7, !dbg !19 + %24 = add i32 %22, %21 + %25 = add i32 %24, %20 + %26 = zext nneg i32 %15 to i64, !dbg !20 + %27 = sext i32 %23 to i64, !dbg !20 + %28 = or disjoint i32 %25, %15, !dbg !21 + %29 = sext i32 %28 to i64, !dbg !22 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !22 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !23 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !23 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !23 + %35 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !23 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !23 + %37 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !23 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !23 + %39 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !23 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23 + %41 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !24 + %42 = getelementptr bfloat, ptr addrspace(1) %41, i64 %27, !dbg !24 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %42, i64 %43, i1 true) #4, !dbg !25 + %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !25 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !25 + %47 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !25 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !25 + %49 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !25 + %50 = bitcast i32 %49 to <2 x bfloat>, !dbg !25 + %51 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !25 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !25 + %53 = or disjoint i64 %26, 64, !dbg !26 + %54 = trunc nuw nsw i64 %53 to i32, !dbg !21 + %55 = or disjoint i32 %25, %54, !dbg !21 + %56 = sext i32 %55 to i64, !dbg !22 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !22 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #4, !dbg !23 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !23 + %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !23 + %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !23 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !23 + %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !23 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !23 + %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !23 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !23 + %68 = getelementptr bfloat, ptr addrspace(1) %1, i64 %53, !dbg !24 + %69 = getelementptr bfloat, ptr addrspace(1) %68, i64 %27, !dbg !24 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %69, i64 %70, i1 true) #4, !dbg !25 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !25 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !25 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !25 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !25 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !25 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !25 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !25 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !25 + %80 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !27 + %81 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !28 + %82 = fmul <2 x float> %80, %81, !dbg !29 + %83 = fadd <2 x float> %82, zeroinitializer, !dbg !30 + %84 = fpext <2 x bfloat> %61 to <2 x float>, !dbg !27 + %85 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !28 + %86 = fmul <2 x float> %84, %85, !dbg !29 + %87 = fadd <2 x float> %83, %86, !dbg !30 + %88 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !27 + %89 = fpext <2 x bfloat> %48 to <2 x float>, !dbg !28 + %90 = fmul <2 x float> %88, %89, !dbg !29 + %91 = fadd <2 x float> %90, zeroinitializer, !dbg !30 + %92 = fpext <2 x bfloat> %63 to <2 x float>, !dbg !27 + %93 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !28 + %94 = fmul <2 x float> %92, %93, !dbg !29 + %95 = fadd <2 x float> %91, %94, !dbg !30 + %96 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !27 + %97 = fpext <2 x bfloat> %50 to <2 x float>, !dbg !28 + %98 = fmul <2 x float> %96, %97, !dbg !29 + %99 = fadd <2 x float> %98, zeroinitializer, !dbg !30 + %100 = fpext <2 x bfloat> %65 to <2 x float>, !dbg !27 + %101 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !28 + %102 = fmul <2 x float> %100, %101, !dbg !29 + %103 = fadd <2 x float> %99, %102, !dbg !30 + %104 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !27 + %105 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !28 + %106 = fmul <2 x float> %104, %105, !dbg !29 + %107 = fadd <2 x float> %106, zeroinitializer, !dbg !30 + %108 = fpext <2 x bfloat> %67 to <2 x float>, !dbg !27 + %109 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !28 + %110 = fmul <2 x float> %108, %109, !dbg !29 + %111 = fadd <2 x float> %107, %110, !dbg !30 + %112 = and i32 %10, 63, !dbg !9 + %113 = or disjoint i32 %9, %112, !dbg !10 + %shift = shufflevector <2 x float> %87, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop = fadd <2 x float> %87, %shift, !dbg !31 + %foldExtExtBinop9 = fadd <2 x float> %95, %foldExtExtBinop, !dbg !31 + %shift11 = shufflevector <2 x float> %95, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !31 + %foldExtExtBinop14 = fadd <2 x float> %103, %foldExtExtBinop12, !dbg !31 + %shift16 = shufflevector <2 x float> %103, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !31 + %foldExtExtBinop19 = fadd <2 x float> %111, %foldExtExtBinop17, !dbg !31 + %shift21 = shufflevector <2 x float> %111, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !31 + %114 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !31 + %115 = bitcast float %114 to i32, !dbg !35 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 4, i32 31), !dbg !35 + %117 = bitcast i32 %116 to float, !dbg !35 + %118 = fadd float %114, %117, !dbg !31 + %119 = bitcast float %118 to i32, !dbg !35 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !35 + %121 = bitcast i32 %120 to float, !dbg !35 + %122 = fadd float %118, %121, !dbg !31 + %123 = bitcast float %122 to i32, !dbg !35 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !35 + %125 = bitcast i32 %124 to float, !dbg !35 + %126 = fadd float %122, %125, !dbg !31 + %127 = lshr exact i32 %11, 1, !dbg !36 + %128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %127, !dbg !36 + store float %126, ptr addrspace(3) %128, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %129 = shl nuw nsw i32 %112, 2, !dbg !36 + %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129, !dbg !36 + %131 = load i32, ptr addrspace(3) %130, align 4, !dbg !36 + %132 = sext i32 %113 to i64, !dbg !37 + %133 = getelementptr float, ptr addrspace(1) %2, i64 %132, !dbg !37 + %134 = and i32 %10, 448, !dbg !38 + %135 = icmp eq i32 %134, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %131, ptr addrspace(1) %133, i1 %135) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 34, scope: !4) +!25 = !DILocation(line: 40, column: 50, scope: !4) +!26 = !DILocation(line: 34, column: 31, scope: !4) +!27 = !DILocation(line: 39, column: 127, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = !DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f9bd122a43e026d9367fb58ba11e53cb215e2662 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,511 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 512 +{ + .reg .pred %p<6>; + .reg .b16 %rs<33>; + .reg .b32 %r<131>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:23:28 + mov.u32 %r34, %ctaid.x; + .loc 1 23 33 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:23:33 + shl.b32 %r35, %r34, 6; + ld.param.b64 %rd16, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:24:44 + mov.u32 %r36, %tid.x; + and.b32 %r37, %r36, 504; + bfe.u32 %r38, %r36, 3, 6; + .loc 1 24 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:24:23 + or.b32 %r39, %r38, %r35; + .loc 1 26 37 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:26:37 + shl.b32 %r40, %r36, 3; + and.b32 %r41, %r40, 56; + .loc 1 29 21 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:29:21 + bfe.s32 %r42, %r34, 25, 1; + shr.u32 %r43, %r42, 21; + add.s32 %r44, %r39, %r43; + shr.s32 %r45, %r44, 11; + .loc 1 28 19 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:28:19 + and.b32 %r46, %r44, 1046528; + sub.s32 %r47, %r39, %r46; + .loc 1 29 29 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:29:29 + shr.u32 %r48, %r45, 27; + add.s32 %r49, %r45, %r48; + and.b32 %r50, %r49, 33554400; + sub.s32 %r51, %r45, %r50; + .loc 1 30 19 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:30:19 + shr.u32 %r52, %r42, 16; + add.s32 %r53, %r39, %r52; + .loc 1 39 45 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:45 + shl.b32 %r54, %r51, 7; + .loc 1 39 55 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:55 + shl.b32 %r55, %r47, 12; + .loc 1 39 68 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:68 + shl.b32 %r56, %r53, 7; + and.b32 %r57, %r56, -8388608; + .loc 1 40 45 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:45 + shl.b32 %r58, %r39, 7; + add.s32 %r59, %r57, %r55; + add.s32 %r60, %r59, %r54; + .loc 1 33 40 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:33:40 + cvt.u64.u32 %rd17, %r41; + .loc 1 39 60 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:60 + or.b32 %r61, %r60, %r41; + .loc 1 39 34 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:34 + mad.wide.s32 %rd2, %r61, 2, %rd14; + .loc 1 39 73 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 34 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:34 + mad.wide.u32 %rd18, %r41, 2, %rd15; + mad.wide.s32 %rd5, %r58, 2, %rd18; + .loc 1 40 50 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 34 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:34 + cvt.s64.s32 %rd19, %r60; + or.b64 %rd20, %rd19, %rd17; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd14, %rd21; + add.s64 %rd8, %rd22, 128; + .loc 1 39 73 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:73 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 40 34 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:34 + add.s64 %rd11, %rd5, 128; + .loc 1 40 50 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:50 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r62, %rs2; + cvt.f32.bf16 %r63, %rs1; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs3, %rs4}, %r9; + cvt.f32.bf16 %r64, %rs4; + cvt.f32.bf16 %r65, %rs3; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r66, %r63, %r65, 0f00000000; + fma.rn.f32 %r67, %r62, %r64, 0f00000000; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs5, %rs6}, %r17; + cvt.f32.bf16 %r68, %rs5; + cvt.f32.bf16 %r69, %rs6; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs7, %rs8}, %r25; + cvt.f32.bf16 %r70, %rs7; + cvt.f32.bf16 %r71, %rs8; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r72, %r69, %r71, %r67; + fma.rn.f32 %r73, %r68, %r70, %r66; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs9, %rs10}, %r2; + cvt.f32.bf16 %r74, %rs10; + cvt.f32.bf16 %r75, %rs9; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r76, %rs12; + cvt.f32.bf16 %r77, %rs11; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r78, %r75, %r77, 0f00000000; + fma.rn.f32 %r79, %r74, %r76, 0f00000000; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs13, %rs14}, %r18; + cvt.f32.bf16 %r80, %rs13; + cvt.f32.bf16 %r81, %rs14; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r82, %rs15; + cvt.f32.bf16 %r83, %rs16; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs17, %rs18}, %r3; + cvt.f32.bf16 %r86, %rs18; + cvt.f32.bf16 %r87, %rs17; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r88, %rs20; + cvt.f32.bf16 %r89, %rs19; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r90, %r87, %r89, 0f00000000; + fma.rn.f32 %r91, %r86, %r88, 0f00000000; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs21, %rs22}, %r19; + cvt.f32.bf16 %r92, %rs21; + cvt.f32.bf16 %r93, %rs22; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs23, %rs24}, %r27; + cvt.f32.bf16 %r94, %rs23; + cvt.f32.bf16 %r95, %rs24; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r96, %r93, %r95, %r91; + fma.rn.f32 %r97, %r92, %r94, %r90; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs25, %rs26}, %r4; + cvt.f32.bf16 %r98, %rs26; + cvt.f32.bf16 %r99, %rs25; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs27, %rs28}, %r12; + cvt.f32.bf16 %r100, %rs28; + cvt.f32.bf16 %r101, %rs27; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r102, %r99, %r101, 0f00000000; + fma.rn.f32 %r103, %r98, %r100, 0f00000000; + .loc 1 39 127 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:39:127 + mov.b32 {%rs29, %rs30}, %r20; + cvt.f32.bf16 %r104, %rs29; + cvt.f32.bf16 %r105, %rs30; + .loc 1 40 104 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:40:104 + mov.b32 {%rs31, %rs32}, %r28; + cvt.f32.bf16 %r106, %rs31; + cvt.f32.bf16 %r107, %rs32; + .loc 1 43 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:43:23 + fma.rn.f32 %r108, %r105, %r107, %r103; + fma.rn.f32 %r109, %r104, %r106, %r102; + .loc 1 24 44 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:24:44 + and.b32 %r110, %r36, 63; + .loc 1 24 23 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:24:23 + or.b32 %r111, %r35, %r110; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + add.f32 %r112, %r73, %r72; + add.f32 %r113, %r85, %r112; + add.f32 %r114, %r84, %r113; + add.f32 %r115, %r97, %r114; + add.f32 %r116, %r96, %r115; + add.f32 %r117, %r109, %r116; + add.f32 %r118, %r108, %r117; + .loc 2 291 36 // standard.py:291:36 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + shfl.sync.bfly.b32 %r119, %r118, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + add.f32 %r120, %r118, %r119; + .loc 2 291 36 // standard.py:291:36 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + shfl.sync.bfly.b32 %r121, %r120, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + add.f32 %r122, %r120, %r121; + .loc 2 291 36 // standard.py:291:36 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:25 ] + add.f32 %r124, %r122, %r123; +$L__tmp2: + .loc 1 45 28 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:45:28 + shr.u32 %r125, %r37, 1; + mov.b32 %r126, global_smem; + add.s32 %r127, %r126, %r125; + st.shared.b32 [%r127], %r124; + bar.sync 0; + shl.b32 %r128, %r110, 2; + add.s32 %r129, %r126, %r128; + ld.shared.b32 %r33, [%r129]; + .loc 1 49 25 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:49:25 + mad.wide.s32 %rd13, %r111, 4, %rd16; + .loc 1 49 36 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:49:36 + and.b32 %r130, %r36, 448; + setp.eq.b32 %p5, %r130, 0; + // begin inline asm + @%p5 st.global.b32 [ %rd13 + 0 ], { %r33 }; + // end inline asm + .loc 1 49 4 // cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 118 +.b8 112 +.b8 102 +.b8 52 +.b8 52 +.b8 112 +.b8 54 +.b8 100 +.b8 53 +.b8 54 +.b8 108 +.b8 97 +.b8 100 +.b8 109 +.b8 114 +.b8 113 +.b8 102 +.b8 55 +.b8 102 +.b8 118 +.b8 117 +.b8 99 +.b8 104 +.b8 101 +.b8 50 +.b8 109 +.b8 122 +.b8 114 +.b8 102 +.b8 101 +.b8 52 +.b8 122 +.b8 97 +.b8 97 +.b8 111 +.b8 107 +.b8 54 +.b8 112 +.b8 109 +.b8 103 +.b8 99 +.b8 117 +.b8 98 +.b8 52 +.b8 114 +.b8 114 +.b8 104 +.b8 106 +.b8 53 +.b8 114 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 118 +.b8 112 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..81d776f3849cede479bbffa1205a8b3b133ae26d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc65) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c64_i32 = arith.constant 64 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x64xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x64xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x64xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x64xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x64xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x64xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x64xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x64xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..25454499f04d13e7dc5f6e69d318a75bffaa9320 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,154 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg6 = %cst_7) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x64xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x64xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x64xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %arg6, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x64xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d5398655f3247fc00d485fed86d19f841fdb8b8b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,148 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("in_ptr1"(#loc)) +#loc42 = loc("out_ptr1"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc75 = loc("tmp4"(#loc34)) +#loc78 = loc(callsite(#loc1 at #loc75)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc45) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc46) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc48) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc49) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc50) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc51) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc51) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc52) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc53) + %x1_10 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc54) + %x1_11 = arith.remsi %x1_10, %x1 : tensor<64x1xi32> loc(#loc46) + %x2_12 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_14 = %cst_4) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc56) + %r0_index_15 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc56) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst_3 : tensor<1x64xi32> loc(#loc57) + %tmp0 = arith.muli %x1_11, %cst_2 : tensor<64x1xi32> loc(#loc58) + %tmp0_16 = tt.broadcast %r0_index_15 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<64x64xi32> loc(#loc59) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc60) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc61) + %tmp0_21 = arith.addi %tmp0_18, %tmp0_20 : tensor<64x64xi32> loc(#loc61) + %tmp0_22 = arith.muli %x2_12, %cst_0 : tensor<64x1xi32> loc(#loc62) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<64x64xi32> loc(#loc63) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc64) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc64) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc65) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc65) + %tmp0_29 = arith.extf %tmp0_28 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc66) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc67) + %tmp1_30 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc68) + %tmp1_31 = arith.addi %tmp0_16, %tmp1_30 : tensor<64x64xi32> loc(#loc68) + %tmp1_32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc69) + %tmp1_33 = tt.addptr %tmp1_32, %tmp1_31 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc69) + %tmp1_34 = tt.load %tmp1_33, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc70) + %tmp1_35 = arith.extf %tmp1_34 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc71) + %tmp2 = arith.mulf %tmp0_29, %tmp1_35 : tensor<64x64xf32> loc(#loc72) + %tmp5 = arith.addf %_tmp4_14, %tmp2 : tensor<64x64xf32> loc(#loc73) + %_tmp4_36 = arith.select %tmp0_27, %tmp5, %_tmp4_14 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc74) + scf.yield %_tmp4_36 : tensor<64x64xf32> loc(#loc32) + } loc(#loc55) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_14: f32 loc(callsite(#loc1 at #loc75)), %tmp4_15: f32 loc(callsite(#loc1 at #loc75))): + %tmp4_16 = arith.addf %tmp4_14, %tmp4_15 : f32 loc(#loc79) + tt.reduce.return %tmp4_16 : f32 loc(#loc77) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc77) + %tmp4_13 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc76) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc37) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc37) + tt.store %1, %tmp4_13 : tensor<64x1x!tt.ptr> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":34:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":35:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:55) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:68) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":39:127) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:45) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:41) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:50) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":40:104) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":41:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":43:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":44:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":45:28) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/vp/cvpf44p6d56ladmrqf7fvuche2mzrfe4zaaok6pmgcub4rrhj5rm.py":49:4) +#loc45 = loc("x2"(#loc3)) +#loc46 = loc("x1"(#loc4)) +#loc47 = loc("xoffset"(#loc5)) +#loc48 = loc("xoffset"(#loc6)) +#loc49 = loc("xindex"(#loc7)) +#loc50 = loc("xindex"(#loc8)) +#loc51 = loc("xindex"(#loc9)) +#loc52 = loc("r0_base"(#loc10)) +#loc53 = loc("x0"(#loc11)) +#loc54 = loc("x1"(#loc12)) +#loc55 = loc("_tmp4"(#loc2)) +#loc56 = loc("r0_index"(#loc13)) +#loc57 = loc("r0_mask"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("tmp0"(#loc18)) +#loc62 = loc("tmp0"(#loc19)) +#loc63 = loc("tmp0"(#loc20)) +#loc64 = loc("tmp0"(#loc21)) +#loc65 = loc("tmp0"(#loc22)) +#loc66 = loc("tmp0"(#loc23)) +#loc67 = loc("tmp1"(#loc24)) +#loc68 = loc("tmp1"(#loc25)) +#loc69 = loc("tmp1"(#loc26)) +#loc70 = loc("tmp1"(#loc27)) +#loc71 = loc("tmp1"(#loc28)) +#loc72 = loc("tmp2"(#loc29)) +#loc73 = loc("tmp5"(#loc30)) +#loc74 = loc("_tmp4"(#loc31)) +#loc76 = loc("tmp4"(#loc36)) +#loc77 = loc(callsite(#loc33 at #loc75)) +#loc79 = loc(callsite(#loc35 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/__grp__triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/__grp__triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c52ea23d9ab341077b556dab6328877b284f9982 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/__grp__triton_tem_fused_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc49a0630a23f76cda1d29bbd70ab68067d8c603 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.json @@ -0,0 +1 @@ +{"hash": "9dd9b101fc43acc8c11e35dcf9663ee9444abdcd47bbdbadcaf8bd2d718b0f91", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..9c825ee9cae758e80fb479fed6031202b309261d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.llir @@ -0,0 +1,5227 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !5 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %17 = and i32 %15, 1, !dbg !11 + %18 = shl i32 %15, 23, !dbg !12 + %19 = shl nuw nsw i32 %16, 7, !dbg !13 + %20 = or disjoint i32 %18, %19, !dbg !14 + %21 = shl nuw nsw i32 %17, 21, !dbg !15 + %22 = shl nuw i32 %16, 16, !dbg !16 + %23 = and i32 %22, -262144, !dbg !16 + %24 = add i32 %21, %23, !dbg !17 + %25 = sext i32 %20 to i64, !dbg !18 + %26 = getelementptr bfloat, ptr addrspace(1) %0, i64 %25, !dbg !18 + %27 = sext i32 %24 to i64, !dbg !19 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !19 + %29 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !20 + %30 = shl nuw nsw i32 %17, 4, !dbg !21 + %31 = add nuw i32 %30, %14, !dbg !22 + %32 = shl nuw nsw i32 %17, 8, !dbg !23 + %33 = shl i32 %14, 4, !dbg !24 + %34 = add i32 %32, %33, !dbg !25 + %35 = shl i32 %14, 7, !dbg !26 + %36 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !27 + %37 = lshr i32 %36, 5, !dbg !27 + %38 = and i32 %36, 240, !dbg !27 + %39 = lshr exact i32 %38, 4, !dbg !27 + %40 = or disjoint i32 %39, 16, !dbg !27 + %41 = or disjoint i32 %39, 32, !dbg !27 + %42 = or disjoint i32 %39, 48, !dbg !27 + %43 = lshr i32 %36, 1, !dbg !27 + %44 = and i32 %43, 112, !dbg !27 + %45 = lshr i32 %36, 2, !dbg !27 + %46 = and i32 %45, 7, !dbg !27 + %47 = or disjoint i32 %44, %46, !dbg !27 + %48 = or disjoint i32 %39, %35, !dbg !28 + %49 = or disjoint i32 %40, %35, !dbg !28 + %50 = or disjoint i32 %41, %35, !dbg !28 + %51 = or disjoint i32 %42, %35, !dbg !28 + %52 = or disjoint i32 %48, 64, !dbg !28 + %53 = or disjoint i32 %48, 80, !dbg !28 + %54 = or disjoint i32 %48, 96, !dbg !28 + %55 = or disjoint i32 %48, 112, !dbg !28 + %56 = or disjoint i32 %47, %35, !dbg !28 + %57 = or disjoint i32 %56, 8, !dbg !28 + %58 = shl i32 %48, 12, !dbg !29 + %59 = shl i32 %49, 12, !dbg !29 + %60 = shl i32 %50, 12, !dbg !29 + %61 = shl i32 %51, 12, !dbg !29 + %62 = shl i32 %52, 12, !dbg !29 + %63 = shl i32 %53, 12, !dbg !29 + %64 = shl i32 %54, 12, !dbg !29 + %65 = shl i32 %55, 12, !dbg !29 + %66 = sext i32 %58 to i64, !dbg !32 + %67 = getelementptr bfloat, ptr addrspace(1) %26, i64 %66, !dbg !32 + %68 = sext i32 %59 to i64, !dbg !32 + %69 = getelementptr bfloat, ptr addrspace(1) %26, i64 %68, !dbg !32 + %70 = sext i32 %60 to i64, !dbg !32 + %71 = getelementptr bfloat, ptr addrspace(1) %26, i64 %70, !dbg !32 + %72 = sext i32 %61 to i64, !dbg !32 + %73 = getelementptr bfloat, ptr addrspace(1) %26, i64 %72, !dbg !32 + %74 = sext i32 %62 to i64, !dbg !32 + %75 = getelementptr bfloat, ptr addrspace(1) %26, i64 %74, !dbg !32 + %76 = sext i32 %63 to i64, !dbg !32 + %77 = getelementptr bfloat, ptr addrspace(1) %26, i64 %76, !dbg !32 + %78 = sext i32 %64 to i64, !dbg !32 + %79 = getelementptr bfloat, ptr addrspace(1) %26, i64 %78, !dbg !32 + %80 = sext i32 %65 to i64, !dbg !32 + %81 = getelementptr bfloat, ptr addrspace(1) %26, i64 %80, !dbg !32 + %82 = shl nuw nsw i32 %36, 3, !dbg !33 + %83 = and i32 %82, 120, !dbg !33 + %84 = zext nneg i32 %83 to i64, !dbg !34 + %85 = getelementptr bfloat, ptr addrspace(1) %67, i64 %84, !dbg !34 + %86 = getelementptr bfloat, ptr addrspace(1) %69, i64 %84, !dbg !34 + %87 = getelementptr bfloat, ptr addrspace(1) %71, i64 %84, !dbg !34 + %88 = getelementptr bfloat, ptr addrspace(1) %73, i64 %84, !dbg !34 + %89 = getelementptr bfloat, ptr addrspace(1) %75, i64 %84, !dbg !34 + %90 = getelementptr bfloat, ptr addrspace(1) %77, i64 %84, !dbg !34 + %91 = getelementptr bfloat, ptr addrspace(1) %79, i64 %84, !dbg !34 + %92 = getelementptr bfloat, ptr addrspace(1) %81, i64 %84, !dbg !34 + %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %85) #2, !dbg !35 + %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !35 + %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !35 + %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !35 + %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !35 + %98 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %86) #2, !dbg !35 + %99 = extractvalue { i32, i32, i32, i32 } %98, 0, !dbg !35 + %100 = extractvalue { i32, i32, i32, i32 } %98, 1, !dbg !35 + %101 = extractvalue { i32, i32, i32, i32 } %98, 2, !dbg !35 + %102 = extractvalue { i32, i32, i32, i32 } %98, 3, !dbg !35 + %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %87) #2, !dbg !35 + %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !35 + %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !35 + %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !35 + %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !35 + %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %88) #2, !dbg !35 + %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !35 + %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !35 + %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !35 + %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !35 + %113 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %89) #2, !dbg !35 + %114 = extractvalue { i32, i32, i32, i32 } %113, 0, !dbg !35 + %115 = extractvalue { i32, i32, i32, i32 } %113, 1, !dbg !35 + %116 = extractvalue { i32, i32, i32, i32 } %113, 2, !dbg !35 + %117 = extractvalue { i32, i32, i32, i32 } %113, 3, !dbg !35 + %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %90) #2, !dbg !35 + %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !35 + %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !35 + %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !35 + %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !35 + %123 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %91) #2, !dbg !35 + %124 = extractvalue { i32, i32, i32, i32 } %123, 0, !dbg !35 + %125 = extractvalue { i32, i32, i32, i32 } %123, 1, !dbg !35 + %126 = extractvalue { i32, i32, i32, i32 } %123, 2, !dbg !35 + %127 = extractvalue { i32, i32, i32, i32 } %123, 3, !dbg !35 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %92) #2, !dbg !35 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !35 + %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !35 + %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !35 + %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !35 + %133 = and i32 %36, 7, !dbg !35 + %134 = shl nuw nsw i32 %133, 4, !dbg !35 + %135 = shl nuw nsw i32 %38, 3, !dbg !35 + %136 = and i32 %36, 112, !dbg !35 + %137 = and i32 %36, 8, !dbg !35 + %138 = shl nuw nsw i32 %137, 11, !dbg !35 + %139 = or disjoint i32 %134, %135, !dbg !35 + %140 = xor i32 %139, %136, !dbg !35 + %141 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %138, !dbg !35 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) %141, i32 %140, !dbg !35 + %143 = insertelement <4 x i32> poison, i32 %94, i64 0, !dbg !35 + %144 = insertelement <4 x i32> %143, i32 %95, i64 1, !dbg !35 + %145 = insertelement <4 x i32> %144, i32 %96, i64 2, !dbg !35 + %146 = insertelement <4 x i32> %145, i32 %97, i64 3, !dbg !35 + store <4 x i32> %146, ptr addrspace(3) %142, align 16, !dbg !35 + %147 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 2048, !dbg !35 + %148 = insertelement <4 x i32> poison, i32 %99, i64 0, !dbg !35 + %149 = insertelement <4 x i32> %148, i32 %100, i64 1, !dbg !35 + %150 = insertelement <4 x i32> %149, i32 %101, i64 2, !dbg !35 + %151 = insertelement <4 x i32> %150, i32 %102, i64 3, !dbg !35 + store <4 x i32> %151, ptr addrspace(3) %147, align 16, !dbg !35 + %152 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 4096, !dbg !35 + %153 = insertelement <4 x i32> poison, i32 %104, i64 0, !dbg !35 + %154 = insertelement <4 x i32> %153, i32 %105, i64 1, !dbg !35 + %155 = insertelement <4 x i32> %154, i32 %106, i64 2, !dbg !35 + %156 = insertelement <4 x i32> %155, i32 %107, i64 3, !dbg !35 + store <4 x i32> %156, ptr addrspace(3) %152, align 16, !dbg !35 + %157 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 6144, !dbg !35 + %158 = insertelement <4 x i32> poison, i32 %109, i64 0, !dbg !35 + %159 = insertelement <4 x i32> %158, i32 %110, i64 1, !dbg !35 + %160 = insertelement <4 x i32> %159, i32 %111, i64 2, !dbg !35 + %161 = insertelement <4 x i32> %160, i32 %112, i64 3, !dbg !35 + store <4 x i32> %161, ptr addrspace(3) %157, align 16, !dbg !35 + %162 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 8192, !dbg !35 + %163 = insertelement <4 x i32> poison, i32 %114, i64 0, !dbg !35 + %164 = insertelement <4 x i32> %163, i32 %115, i64 1, !dbg !35 + %165 = insertelement <4 x i32> %164, i32 %116, i64 2, !dbg !35 + %166 = insertelement <4 x i32> %165, i32 %117, i64 3, !dbg !35 + store <4 x i32> %166, ptr addrspace(3) %162, align 16, !dbg !35 + %167 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 10240, !dbg !35 + %168 = insertelement <4 x i32> poison, i32 %119, i64 0, !dbg !35 + %169 = insertelement <4 x i32> %168, i32 %120, i64 1, !dbg !35 + %170 = insertelement <4 x i32> %169, i32 %121, i64 2, !dbg !35 + %171 = insertelement <4 x i32> %170, i32 %122, i64 3, !dbg !35 + store <4 x i32> %171, ptr addrspace(3) %167, align 16, !dbg !35 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 12288, !dbg !35 + %173 = insertelement <4 x i32> poison, i32 %124, i64 0, !dbg !35 + %174 = insertelement <4 x i32> %173, i32 %125, i64 1, !dbg !35 + %175 = insertelement <4 x i32> %174, i32 %126, i64 2, !dbg !35 + %176 = insertelement <4 x i32> %175, i32 %127, i64 3, !dbg !35 + store <4 x i32> %176, ptr addrspace(3) %172, align 16, !dbg !35 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 14336, !dbg !35 + %178 = insertelement <4 x i32> poison, i32 %129, i64 0, !dbg !35 + %179 = insertelement <4 x i32> %178, i32 %130, i64 1, !dbg !35 + %180 = insertelement <4 x i32> %179, i32 %131, i64 2, !dbg !35 + %181 = insertelement <4 x i32> %180, i32 %132, i64 3, !dbg !35 + store <4 x i32> %181, ptr addrspace(3) %177, align 16, !dbg !35 + %182 = sext i32 %34 to i64, !dbg !36 + %183 = getelementptr i32, ptr addrspace(1) %6, i64 %182, !dbg !36 + %184 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %183) #2, !dbg !37 + %185 = shl i32 %184, 7, !dbg !38 + %186 = sext i32 %31 to i64, !dbg !39 + %187 = getelementptr i32, ptr addrspace(1) %5, i64 %186, !dbg !39 + %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %187) #2, !dbg !40 + %189 = shl i32 %188, 1, !dbg !41 + %190 = and i32 %36, 3, !dbg !42 + %191 = zext nneg i32 %15 to i64, !dbg !43 + %192 = getelementptr i64, ptr addrspace(1) %9, i64 %191, !dbg !43 + %193 = icmp sgt i32 %189, 0, !dbg !45 + %194 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %192, i1 %193) #2, !dbg !46 + %195 = sext i32 %56 to i64, !dbg !47 + %196 = sext i32 %57 to i64, !dbg !47 + %197 = icmp sgt i64 %194, %195, !dbg !48 + %198 = icmp sgt i64 %194, %196, !dbg !48 + %199 = or disjoint i32 %185, %39, !dbg !49 + %200 = or disjoint i32 %185, %40, !dbg !49 + %201 = or disjoint i32 %185, %41, !dbg !49 + %202 = or disjoint i32 %185, %42, !dbg !49 + %203 = shl i32 %199, 7, !dbg !50 + %204 = shl i32 %200, 7, !dbg !50 + %205 = shl i32 %201, 7, !dbg !50 + %206 = shl i32 %202, 7, !dbg !50 + %207 = sext i32 %203 to i64, !dbg !51 + %208 = getelementptr bfloat, ptr addrspace(1) %28, i64 %207, !dbg !51 + %209 = sext i32 %204 to i64, !dbg !51 + %210 = getelementptr bfloat, ptr addrspace(1) %28, i64 %209, !dbg !51 + %211 = sext i32 %205 to i64, !dbg !51 + %212 = getelementptr bfloat, ptr addrspace(1) %28, i64 %211, !dbg !51 + %213 = sext i32 %206 to i64, !dbg !51 + %214 = getelementptr bfloat, ptr addrspace(1) %28, i64 %213, !dbg !51 + %215 = getelementptr bfloat, ptr addrspace(1) %208, i64 %84, !dbg !52 + %216 = getelementptr bfloat, ptr addrspace(1) %210, i64 %84, !dbg !52 + %217 = getelementptr bfloat, ptr addrspace(1) %212, i64 %84, !dbg !52 + %218 = getelementptr bfloat, ptr addrspace(1) %214, i64 %84, !dbg !52 + %219 = shl nuw nsw i32 %137, 10, !dbg !53 + %220 = or disjoint i32 %140, %219, !dbg !53 + %221 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %220, !dbg !53 + %222 = select i1 %193, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %221, ptr addrspace(1) %215, i32 %222) #2, !dbg !53 + %223 = or disjoint i32 %220, 2048, !dbg !53 + %224 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %224, ptr addrspace(1) %216, i32 %222) #2, !dbg !53 + %225 = or disjoint i32 %220, 4096, !dbg !53 + %226 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %226, ptr addrspace(1) %217, i32 %222) #2, !dbg !53 + %227 = or disjoint i32 %220, 6144, !dbg !53 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %228, ptr addrspace(1) %218, i32 %222) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %229 = getelementptr bfloat, ptr addrspace(1) %29, i64 %207, !dbg !51 + %230 = getelementptr bfloat, ptr addrspace(1) %29, i64 %209, !dbg !51 + %231 = getelementptr bfloat, ptr addrspace(1) %29, i64 %211, !dbg !51 + %232 = getelementptr bfloat, ptr addrspace(1) %29, i64 %213, !dbg !51 + %233 = getelementptr bfloat, ptr addrspace(1) %229, i64 %84, !dbg !52 + %234 = getelementptr bfloat, ptr addrspace(1) %230, i64 %84, !dbg !52 + %235 = getelementptr bfloat, ptr addrspace(1) %231, i64 %84, !dbg !52 + %236 = getelementptr bfloat, ptr addrspace(1) %232, i64 %84, !dbg !52 + %237 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %237, ptr addrspace(1) %233, i32 %222) #2, !dbg !53 + %238 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %238, ptr addrspace(1) %234, i32 %222) #2, !dbg !53 + %239 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %239, ptr addrspace(1) %235, i32 %222) #2, !dbg !53 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %240, ptr addrspace(1) %236, i32 %222) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %241 = icmp sgt i32 %189, 1, !dbg !45 + %242 = or disjoint i32 %185, 64, !dbg !54 + %243 = or disjoint i32 %242, %39, !dbg !49 + %244 = or disjoint i32 %242, %40, !dbg !49 + %245 = or disjoint i32 %242, %41, !dbg !49 + %246 = or disjoint i32 %242, %42, !dbg !49 + %247 = shl i32 %243, 7, !dbg !50 + %248 = shl i32 %244, 7, !dbg !50 + %249 = shl i32 %245, 7, !dbg !50 + %250 = shl i32 %246, 7, !dbg !50 + %251 = sext i32 %247 to i64, !dbg !51 + %252 = getelementptr bfloat, ptr addrspace(1) %28, i64 %251, !dbg !51 + %253 = sext i32 %248 to i64, !dbg !51 + %254 = getelementptr bfloat, ptr addrspace(1) %28, i64 %253, !dbg !51 + %255 = sext i32 %249 to i64, !dbg !51 + %256 = getelementptr bfloat, ptr addrspace(1) %28, i64 %255, !dbg !51 + %257 = sext i32 %250 to i64, !dbg !51 + %258 = getelementptr bfloat, ptr addrspace(1) %28, i64 %257, !dbg !51 + %259 = getelementptr bfloat, ptr addrspace(1) %252, i64 %84, !dbg !52 + %260 = getelementptr bfloat, ptr addrspace(1) %254, i64 %84, !dbg !52 + %261 = getelementptr bfloat, ptr addrspace(1) %256, i64 %84, !dbg !52 + %262 = getelementptr bfloat, ptr addrspace(1) %258, i64 %84, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %263 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %220, !dbg !53 + %264 = select i1 %241, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %263, ptr addrspace(1) %259, i32 %264) #2, !dbg !53 + %265 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %260, i32 %264) #2, !dbg !53 + %266 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %266, ptr addrspace(1) %261, i32 %264) #2, !dbg !53 + %267 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %262, i32 %264) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %268 = getelementptr bfloat, ptr addrspace(1) %29, i64 %251, !dbg !51 + %269 = getelementptr bfloat, ptr addrspace(1) %29, i64 %253, !dbg !51 + %270 = getelementptr bfloat, ptr addrspace(1) %29, i64 %255, !dbg !51 + %271 = getelementptr bfloat, ptr addrspace(1) %29, i64 %257, !dbg !51 + %272 = getelementptr bfloat, ptr addrspace(1) %268, i64 %84, !dbg !52 + %273 = getelementptr bfloat, ptr addrspace(1) %269, i64 %84, !dbg !52 + %274 = getelementptr bfloat, ptr addrspace(1) %270, i64 %84, !dbg !52 + %275 = getelementptr bfloat, ptr addrspace(1) %271, i64 %84, !dbg !52 + %276 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %276, ptr addrspace(1) %272, i32 %264) #2, !dbg !53 + %277 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %273, i32 %264) #2, !dbg !53 + %278 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %274, i32 %264) #2, !dbg !53 + %279 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %279, ptr addrspace(1) %275, i32 %264) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !55 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %28, i64 %84, !dbg !45 + %invariant.gep335 = getelementptr bfloat, ptr addrspace(1) %29, i64 %84, !dbg !45 + br i1 %193, label %.lr.ph, label %._crit_edge, !dbg !45 + +.lr.ph: ; preds = %13 + %280 = tail call i32 @llvm.umin.i32(i32 %189, i32 32), !dbg !56 + %281 = shl nuw nsw i32 %190, 1, !dbg !42 + %282 = or disjoint i32 %185, %281, !dbg !42 + %283 = insertelement <8 x i32> poison, i32 %282, i64 0, !dbg !57 + %284 = add nsw i32 %280, -2 + %285 = add nsw i32 %280, -1 + %286 = shufflevector <8 x i32> %283, <8 x i32> poison, <16 x i32> + %287 = insertelement <16 x i32> %286, i32 %282, i64 8 + %288 = insertelement <16 x i32> %287, i32 %282, i64 9 + %289 = insertelement <16 x i32> %288, i32 %282, i64 10 + %290 = insertelement <16 x i32> %289, i32 %282, i64 11 + %291 = insertelement <16 x i32> %290, i32 %282, i64 12 + %292 = insertelement <16 x i32> %291, i32 %282, i64 13 + %293 = insertelement <16 x i32> %292, i32 %282, i64 14 + %294 = or disjoint <16 x i32> %293, + %295 = insertelement <16 x i32> %294, i32 %282, i64 15 + %296 = insertelement <32 x i32> poison, i32 %57, i64 0, !dbg !58 + %297 = insertelement <32 x i32> %296, i32 %56, i64 1, !dbg !58 + %298 = shufflevector <32 x i32> %297, <32 x i32> poison, <32 x i32> , !dbg !58 + %299 = insertelement <8 x i64> poison, i64 %194, i64 0, !dbg !59 + %300 = shufflevector <8 x i64> %299, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !59 + br label %301, !dbg !45 + +301: ; preds = %.lr.ph, %__nv_exp2f.exit291 + %302 = phi i32 [ 64, %.lr.ph ], [ %1899, %__nv_exp2f.exit291 ] + %303 = phi i32 [ -1, %.lr.ph ], [ %380, %__nv_exp2f.exit291 ] + %304 = phi i32 [ 1, %.lr.ph ], [ %1903, %__nv_exp2f.exit291 ] + %305 = phi i32 [ 64, %.lr.ph ], [ %1900, %__nv_exp2f.exit291 ] + %306 = phi float [ 0.000000e+00, %.lr.ph ], [ %1466, %__nv_exp2f.exit291 ] + %307 = phi float [ 0.000000e+00, %.lr.ph ], [ %1467, %__nv_exp2f.exit291 ] + %308 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %__nv_exp2f.exit291 ] + %309 = phi float [ 0.000000e+00, %.lr.ph ], [ %1814, %__nv_exp2f.exit291 ] + %310 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %__nv_exp2f.exit291 ] + %311 = phi float [ 0.000000e+00, %.lr.ph ], [ %1816, %__nv_exp2f.exit291 ] + %312 = phi float [ 0.000000e+00, %.lr.ph ], [ %1817, %__nv_exp2f.exit291 ] + %313 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %__nv_exp2f.exit291 ] + %314 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %__nv_exp2f.exit291 ] + %315 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %__nv_exp2f.exit291 ] + %316 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %__nv_exp2f.exit291 ] + %317 = phi float [ 0.000000e+00, %.lr.ph ], [ %1822, %__nv_exp2f.exit291 ] + %318 = phi float [ 0.000000e+00, %.lr.ph ], [ %1823, %__nv_exp2f.exit291 ] + %319 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %__nv_exp2f.exit291 ] + %320 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %__nv_exp2f.exit291 ] + %321 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %__nv_exp2f.exit291 ] + %322 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %__nv_exp2f.exit291 ] + %323 = phi float [ 0.000000e+00, %.lr.ph ], [ %1828, %__nv_exp2f.exit291 ] + %324 = phi float [ 0.000000e+00, %.lr.ph ], [ %1829, %__nv_exp2f.exit291 ] + %325 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %__nv_exp2f.exit291 ] + %326 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %__nv_exp2f.exit291 ] + %327 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %__nv_exp2f.exit291 ] + %328 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %__nv_exp2f.exit291 ] + %329 = phi float [ 0.000000e+00, %.lr.ph ], [ %1834, %__nv_exp2f.exit291 ] + %330 = phi float [ 0.000000e+00, %.lr.ph ], [ %1835, %__nv_exp2f.exit291 ] + %331 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %__nv_exp2f.exit291 ] + %332 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %__nv_exp2f.exit291 ] + %333 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %__nv_exp2f.exit291 ] + %334 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %__nv_exp2f.exit291 ] + %335 = phi float [ 0.000000e+00, %.lr.ph ], [ %1840, %__nv_exp2f.exit291 ] + %336 = phi float [ 0.000000e+00, %.lr.ph ], [ %1841, %__nv_exp2f.exit291 ] + %337 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %__nv_exp2f.exit291 ] + %338 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %__nv_exp2f.exit291 ] + %339 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %__nv_exp2f.exit291 ] + %340 = phi float [ 0.000000e+00, %.lr.ph ], [ %1845, %__nv_exp2f.exit291 ] + %341 = phi float [ 0.000000e+00, %.lr.ph ], [ %1846, %__nv_exp2f.exit291 ] + %342 = phi float [ 0.000000e+00, %.lr.ph ], [ %1847, %__nv_exp2f.exit291 ] + %343 = phi float [ 0.000000e+00, %.lr.ph ], [ %1848, %__nv_exp2f.exit291 ] + %344 = phi float [ 0.000000e+00, %.lr.ph ], [ %1849, %__nv_exp2f.exit291 ] + %345 = phi float [ 0.000000e+00, %.lr.ph ], [ %1850, %__nv_exp2f.exit291 ] + %346 = phi float [ 0.000000e+00, %.lr.ph ], [ %1851, %__nv_exp2f.exit291 ] + %347 = phi float [ 0.000000e+00, %.lr.ph ], [ %1852, %__nv_exp2f.exit291 ] + %348 = phi float [ 0.000000e+00, %.lr.ph ], [ %1853, %__nv_exp2f.exit291 ] + %349 = phi float [ 0.000000e+00, %.lr.ph ], [ %1854, %__nv_exp2f.exit291 ] + %350 = phi float [ 0.000000e+00, %.lr.ph ], [ %1855, %__nv_exp2f.exit291 ] + %351 = phi float [ 0.000000e+00, %.lr.ph ], [ %1856, %__nv_exp2f.exit291 ] + %352 = phi float [ 0.000000e+00, %.lr.ph ], [ %1857, %__nv_exp2f.exit291 ] + %353 = phi float [ 0.000000e+00, %.lr.ph ], [ %1858, %__nv_exp2f.exit291 ] + %354 = phi float [ 0.000000e+00, %.lr.ph ], [ %1859, %__nv_exp2f.exit291 ] + %355 = phi float [ 0.000000e+00, %.lr.ph ], [ %1860, %__nv_exp2f.exit291 ] + %356 = phi float [ 0.000000e+00, %.lr.ph ], [ %1861, %__nv_exp2f.exit291 ] + %357 = phi float [ 0.000000e+00, %.lr.ph ], [ %1862, %__nv_exp2f.exit291 ] + %358 = phi float [ 0.000000e+00, %.lr.ph ], [ %1863, %__nv_exp2f.exit291 ] + %359 = phi float [ 0.000000e+00, %.lr.ph ], [ %1864, %__nv_exp2f.exit291 ] + %360 = phi float [ 0.000000e+00, %.lr.ph ], [ %1865, %__nv_exp2f.exit291 ] + %361 = phi float [ 0.000000e+00, %.lr.ph ], [ %1866, %__nv_exp2f.exit291 ] + %362 = phi float [ 0.000000e+00, %.lr.ph ], [ %1867, %__nv_exp2f.exit291 ] + %363 = phi float [ 0.000000e+00, %.lr.ph ], [ %1868, %__nv_exp2f.exit291 ] + %364 = phi float [ 0.000000e+00, %.lr.ph ], [ %1869, %__nv_exp2f.exit291 ] + %365 = phi float [ 0.000000e+00, %.lr.ph ], [ %1870, %__nv_exp2f.exit291 ] + %366 = phi float [ 0.000000e+00, %.lr.ph ], [ %1871, %__nv_exp2f.exit291 ] + %367 = phi float [ 0.000000e+00, %.lr.ph ], [ %1872, %__nv_exp2f.exit291 ] + %368 = phi float [ 0.000000e+00, %.lr.ph ], [ %1873, %__nv_exp2f.exit291 ] + %369 = phi float [ 0.000000e+00, %.lr.ph ], [ %1874, %__nv_exp2f.exit291 ] + %370 = phi float [ 0.000000e+00, %.lr.ph ], [ %1875, %__nv_exp2f.exit291 ] + %371 = phi float [ 0.000000e+00, %.lr.ph ], [ %1876, %__nv_exp2f.exit291 ] + %372 = phi i32 [ 0, %.lr.ph ], [ %1880, %__nv_exp2f.exit291 ] + %373 = phi <2 x float> [ splat (float 0xFFF0000000000000), %.lr.ph ], [ %1205, %__nv_exp2f.exit291 ] + %374 = phi <16 x i32> [ %295, %.lr.ph ], [ %1879, %__nv_exp2f.exit291 ] + %375 = shufflevector <16 x i32> %374, <16 x i32> poison, <32 x i32> + %376 = icmp slt i32 %372, %284, !dbg !45 + %377 = icmp slt i32 %372, %285, !dbg !45 + %378 = add i32 %303, 1, !dbg !45 + %379 = icmp sgt i32 %378, 2, !dbg !45 + %380 = select i1 %379, i32 0, i32 %378, !dbg !45 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %381 = shl i32 %380, 13, !dbg !53 + %382 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %381, !dbg !53 + %383 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %37, i32 0, i32 31), !dbg !55 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !55 + %384 = shl i32 %383, 11, !dbg !55 + %385 = and i32 %384, 8192, !dbg !55 + %386 = add i32 %385, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !55 + %387 = lshr exact i32 %386, 4, !dbg !55 + %388 = and i32 %387, 16383, !dbg !55 + %389 = zext nneg i32 %388 to i64, !dbg !55 + %390 = or disjoint i64 %389, 4611686293372403712, !dbg !55 + %391 = ptrtoint ptr addrspace(3) %382 to i32, !dbg !55 + %392 = lshr exact i32 %391, 4, !dbg !55 + %393 = and i32 %392, 16383, !dbg !55 + %394 = zext nneg i32 %393 to i64, !dbg !55 + %395 = or disjoint i64 %394, 4611686293338849280, !dbg !55 + %396 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %390, i64 %395) #2, !dbg !55 + %397 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !55 + %398 = lshr exact i32 %397, 4, !dbg !55 + %399 = and i32 %398, 16383, !dbg !55 + %400 = zext nneg i32 %399 to i64, !dbg !55 + %401 = or disjoint i64 %400, 4611686293372403712, !dbg !55 + %402 = add i32 %391, 32, !dbg !55 + %403 = lshr exact i32 %402, 4, !dbg !55 + %404 = and i32 %403, 16383, !dbg !55 + %405 = zext nneg i32 %404 to i64, !dbg !55 + %406 = or disjoint i64 %405, 4611686293338849280, !dbg !55 + %407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 0, !dbg !55 + %408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 1, !dbg !55 + %409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 2, !dbg !55 + %410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 3, !dbg !55 + %411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 4, !dbg !55 + %412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 5, !dbg !55 + %413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 6, !dbg !55 + %414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 7, !dbg !55 + %415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 8, !dbg !55 + %416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 9, !dbg !55 + %417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 10, !dbg !55 + %418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 11, !dbg !55 + %419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 12, !dbg !55 + %420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 13, !dbg !55 + %421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 14, !dbg !55 + %422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 15, !dbg !55 + %423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 16, !dbg !55 + %424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 17, !dbg !55 + %425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 18, !dbg !55 + %426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 19, !dbg !55 + %427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 20, !dbg !55 + %428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 21, !dbg !55 + %429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 22, !dbg !55 + %430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 23, !dbg !55 + %431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 24, !dbg !55 + %432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 25, !dbg !55 + %433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 26, !dbg !55 + %434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 27, !dbg !55 + %435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 28, !dbg !55 + %436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 29, !dbg !55 + %437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 30, !dbg !55 + %438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 31, !dbg !55 + %439 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %407, float %408, float %409, float %410, float %411, float %412, float %413, float %414, float %415, float %416, float %417, float %418, float %419, float %420, float %421, float %422, float %423, float %424, float %425, float %426, float %427, float %428, float %429, float %430, float %431, float %432, float %433, float %434, float %435, float %436, float %437, float %438, i64 %401, i64 %406, i1 true) #2, !dbg !55 + %440 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !55 + %441 = lshr exact i32 %440, 4, !dbg !55 + %442 = and i32 %441, 16383, !dbg !55 + %443 = zext nneg i32 %442 to i64, !dbg !55 + %444 = or disjoint i64 %443, 4611686293372403712, !dbg !55 + %445 = add i32 %391, 64, !dbg !55 + %446 = lshr exact i32 %445, 4, !dbg !55 + %447 = and i32 %446, 16383, !dbg !55 + %448 = zext nneg i32 %447 to i64, !dbg !55 + %449 = or disjoint i64 %448, 4611686293338849280, !dbg !55 + %450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 0, !dbg !55 + %451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 1, !dbg !55 + %452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 2, !dbg !55 + %453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 3, !dbg !55 + %454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 4, !dbg !55 + %455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 5, !dbg !55 + %456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 6, !dbg !55 + %457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 7, !dbg !55 + %458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 8, !dbg !55 + %459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 9, !dbg !55 + %460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 10, !dbg !55 + %461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 11, !dbg !55 + %462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 12, !dbg !55 + %463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 13, !dbg !55 + %464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 14, !dbg !55 + %465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 15, !dbg !55 + %466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 16, !dbg !55 + %467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 17, !dbg !55 + %468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 18, !dbg !55 + %469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 19, !dbg !55 + %470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 20, !dbg !55 + %471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 21, !dbg !55 + %472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 22, !dbg !55 + %473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 23, !dbg !55 + %474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 24, !dbg !55 + %475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 25, !dbg !55 + %476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 26, !dbg !55 + %477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 27, !dbg !55 + %478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 28, !dbg !55 + %479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 29, !dbg !55 + %480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 30, !dbg !55 + %481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 31, !dbg !55 + %482 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %450, float %451, float %452, float %453, float %454, float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, i64 %444, i64 %449, i1 true) #2, !dbg !55 + %483 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !55 + %484 = lshr exact i32 %483, 4, !dbg !55 + %485 = and i32 %484, 16383, !dbg !55 + %486 = zext nneg i32 %485 to i64, !dbg !55 + %487 = or disjoint i64 %486, 4611686293372403712, !dbg !55 + %488 = add i32 %391, 96, !dbg !55 + %489 = lshr exact i32 %488, 4, !dbg !55 + %490 = and i32 %489, 16383, !dbg !55 + %491 = zext nneg i32 %490 to i64, !dbg !55 + %492 = or disjoint i64 %491, 4611686293338849280, !dbg !55 + %493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 0, !dbg !55 + %494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 1, !dbg !55 + %495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 2, !dbg !55 + %496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 3, !dbg !55 + %497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 4, !dbg !55 + %498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 5, !dbg !55 + %499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 6, !dbg !55 + %500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 7, !dbg !55 + %501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 8, !dbg !55 + %502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 9, !dbg !55 + %503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 10, !dbg !55 + %504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 11, !dbg !55 + %505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 12, !dbg !55 + %506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 13, !dbg !55 + %507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 14, !dbg !55 + %508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 15, !dbg !55 + %509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 16, !dbg !55 + %510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 17, !dbg !55 + %511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 18, !dbg !55 + %512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 19, !dbg !55 + %513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 20, !dbg !55 + %514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 21, !dbg !55 + %515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 22, !dbg !55 + %516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 23, !dbg !55 + %517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 24, !dbg !55 + %518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 25, !dbg !55 + %519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 26, !dbg !55 + %520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 27, !dbg !55 + %521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 28, !dbg !55 + %522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 29, !dbg !55 + %523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 30, !dbg !55 + %524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 31, !dbg !55 + %525 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %493, float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, i64 %487, i64 %492, i1 true) #2, !dbg !55 + %526 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !55 + %527 = lshr exact i32 %526, 4, !dbg !55 + %528 = and i32 %527, 16383, !dbg !55 + %529 = zext nneg i32 %528 to i64, !dbg !55 + %530 = or disjoint i64 %529, 4611686293372403712, !dbg !55 + %531 = add i32 %391, 8192, !dbg !55 + %532 = lshr exact i32 %531, 4, !dbg !55 + %533 = and i32 %532, 16383, !dbg !55 + %534 = zext nneg i32 %533 to i64, !dbg !55 + %535 = or disjoint i64 %534, 4611686293338849280, !dbg !55 + %536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 0, !dbg !55 + %537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 1, !dbg !55 + %538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 2, !dbg !55 + %539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 3, !dbg !55 + %540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 4, !dbg !55 + %541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 5, !dbg !55 + %542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 6, !dbg !55 + %543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 7, !dbg !55 + %544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 8, !dbg !55 + %545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 9, !dbg !55 + %546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 10, !dbg !55 + %547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 11, !dbg !55 + %548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 12, !dbg !55 + %549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 13, !dbg !55 + %550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 14, !dbg !55 + %551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 15, !dbg !55 + %552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 16, !dbg !55 + %553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 17, !dbg !55 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 18, !dbg !55 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 19, !dbg !55 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 20, !dbg !55 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 21, !dbg !55 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 22, !dbg !55 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 23, !dbg !55 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 24, !dbg !55 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 25, !dbg !55 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 26, !dbg !55 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 27, !dbg !55 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 28, !dbg !55 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 29, !dbg !55 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 30, !dbg !55 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 31, !dbg !55 + %568 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %536, float %537, float %538, float %539, float %540, float %541, float %542, float %543, float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, i64 %530, i64 %535, i1 true) #2, !dbg !55 + %569 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !55 + %570 = lshr exact i32 %569, 4, !dbg !55 + %571 = and i32 %570, 16383, !dbg !55 + %572 = zext nneg i32 %571 to i64, !dbg !55 + %573 = or disjoint i64 %572, 4611686293372403712, !dbg !55 + %574 = add i32 %391, 8224, !dbg !55 + %575 = lshr exact i32 %574, 4, !dbg !55 + %576 = and i32 %575, 16383, !dbg !55 + %577 = zext nneg i32 %576 to i64, !dbg !55 + %578 = or disjoint i64 %577, 4611686293338849280, !dbg !55 + %579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 0, !dbg !55 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 1, !dbg !55 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 2, !dbg !55 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 3, !dbg !55 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 4, !dbg !55 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 5, !dbg !55 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 6, !dbg !55 + %586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 7, !dbg !55 + %587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 8, !dbg !55 + %588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 9, !dbg !55 + %589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 10, !dbg !55 + %590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 11, !dbg !55 + %591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 12, !dbg !55 + %592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 13, !dbg !55 + %593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 14, !dbg !55 + %594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 15, !dbg !55 + %595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 16, !dbg !55 + %596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 17, !dbg !55 + %597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 18, !dbg !55 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 19, !dbg !55 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 20, !dbg !55 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 21, !dbg !55 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 22, !dbg !55 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 23, !dbg !55 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 24, !dbg !55 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 25, !dbg !55 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 26, !dbg !55 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 27, !dbg !55 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 28, !dbg !55 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 29, !dbg !55 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 30, !dbg !55 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 31, !dbg !55 + %611 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %579, float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, i64 %573, i64 %578, i1 true) #2, !dbg !55 + %612 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !55 + %613 = lshr exact i32 %612, 4, !dbg !55 + %614 = and i32 %613, 16383, !dbg !55 + %615 = zext nneg i32 %614 to i64, !dbg !55 + %616 = or disjoint i64 %615, 4611686293372403712, !dbg !55 + %617 = add i32 %391, 8256, !dbg !55 + %618 = lshr exact i32 %617, 4, !dbg !55 + %619 = and i32 %618, 16383, !dbg !55 + %620 = zext nneg i32 %619 to i64, !dbg !55 + %621 = or disjoint i64 %620, 4611686293338849280, !dbg !55 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 0, !dbg !55 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 1, !dbg !55 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 2, !dbg !55 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 3, !dbg !55 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 4, !dbg !55 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 5, !dbg !55 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 6, !dbg !55 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 7, !dbg !55 + %630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 8, !dbg !55 + %631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 9, !dbg !55 + %632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 10, !dbg !55 + %633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 11, !dbg !55 + %634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 12, !dbg !55 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 13, !dbg !55 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 14, !dbg !55 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 15, !dbg !55 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 16, !dbg !55 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 17, !dbg !55 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 18, !dbg !55 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 19, !dbg !55 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 20, !dbg !55 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 21, !dbg !55 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 22, !dbg !55 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 23, !dbg !55 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 24, !dbg !55 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 25, !dbg !55 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 26, !dbg !55 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 27, !dbg !55 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 28, !dbg !55 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 29, !dbg !55 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 30, !dbg !55 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 31, !dbg !55 + %654 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %622, float %623, float %624, float %625, float %626, float %627, float %628, float %629, float %630, float %631, float %632, float %633, float %634, float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, i64 %616, i64 %621, i1 true) #2, !dbg !55 + %655 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !55 + %656 = lshr exact i32 %655, 4, !dbg !55 + %657 = and i32 %656, 16383, !dbg !55 + %658 = zext nneg i32 %657 to i64, !dbg !55 + %659 = or disjoint i64 %658, 4611686293372403712, !dbg !55 + %660 = add i32 %391, 8288, !dbg !55 + %661 = lshr exact i32 %660, 4, !dbg !55 + %662 = and i32 %661, 16383, !dbg !55 + %663 = zext nneg i32 %662 to i64, !dbg !55 + %664 = or disjoint i64 %663, 4611686293338849280, !dbg !55 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 0, !dbg !55 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 1, !dbg !55 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 2, !dbg !55 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 3, !dbg !55 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 4, !dbg !55 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 5, !dbg !55 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 6, !dbg !55 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 7, !dbg !55 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 8, !dbg !55 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 9, !dbg !55 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 10, !dbg !55 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 11, !dbg !55 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 12, !dbg !55 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 13, !dbg !55 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 14, !dbg !55 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 15, !dbg !55 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 16, !dbg !55 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 17, !dbg !55 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 18, !dbg !55 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 19, !dbg !55 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 20, !dbg !55 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 21, !dbg !55 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 22, !dbg !55 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 23, !dbg !55 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 24, !dbg !55 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 25, !dbg !55 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 26, !dbg !55 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 27, !dbg !55 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 28, !dbg !55 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 29, !dbg !55 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 30, !dbg !55 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 31, !dbg !55 + %697 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %665, float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, i64 %659, i64 %664, i1 true) #2, !dbg !55 + %698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 0, !dbg !55 + %699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 1, !dbg !55 + %700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 2, !dbg !55 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 3, !dbg !55 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 4, !dbg !55 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 5, !dbg !55 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 6, !dbg !55 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 7, !dbg !55 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 8, !dbg !55 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 9, !dbg !55 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 10, !dbg !55 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 11, !dbg !55 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 12, !dbg !55 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 13, !dbg !55 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 14, !dbg !55 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 15, !dbg !55 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 16, !dbg !55 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 17, !dbg !55 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 18, !dbg !55 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 19, !dbg !55 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 20, !dbg !55 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 21, !dbg !55 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 22, !dbg !55 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 23, !dbg !55 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 24, !dbg !55 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 25, !dbg !55 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 26, !dbg !55 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 27, !dbg !55 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 28, !dbg !55 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 29, !dbg !55 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 30, !dbg !55 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 31, !dbg !55 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !55 + %730 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %698, float %699, float %700, float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %382, i32 0, i32 0, float %308, float %309, float %310, float %311, float %312, float %313, float %314, float %315, float %316, float %317, float %318, float %319, float %320, float %321, float %322, float %323, float %324, float %325, float %326, float %327, float %328, float %329, float %330, float %331, float %332, float %333, float %334, float %335, float %336, float %337, float %338, float %339, float %340, float %341, float %342, float %343, float %344, float %345, float %346, float %347, float %348, float %349, float %350, float %351, float %352, float %353, float %354, float %355, float %356, float %357, float %358, float %359, float %360, float %361, float %362, float %363, float %364, float %365, float %366, float %367, float %368, float %369, float %370, float %371) #2, !dbg !55 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 0, !dbg !55 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 1, !dbg !55 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 2, !dbg !55 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 3, !dbg !55 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 4, !dbg !55 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 5, !dbg !55 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 6, !dbg !55 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 7, !dbg !55 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 8, !dbg !55 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 9, !dbg !55 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 10, !dbg !55 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 11, !dbg !55 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 12, !dbg !55 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 13, !dbg !55 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 14, !dbg !55 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 15, !dbg !55 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 16, !dbg !55 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 17, !dbg !55 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 18, !dbg !55 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 19, !dbg !55 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 20, !dbg !55 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 21, !dbg !55 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 22, !dbg !55 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 23, !dbg !55 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 24, !dbg !55 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 25, !dbg !55 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 26, !dbg !55 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 27, !dbg !55 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 28, !dbg !55 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 29, !dbg !55 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 30, !dbg !55 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 31, !dbg !55 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 38, !dbg !55 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 39, !dbg !55 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 40, !dbg !55 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 41, !dbg !55 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 42, !dbg !55 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 43, !dbg !55 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 44, !dbg !55 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 45, !dbg !55 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 46, !dbg !55 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 47, !dbg !55 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 48, !dbg !55 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 49, !dbg !55 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 50, !dbg !55 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 51, !dbg !55 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 52, !dbg !55 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 53, !dbg !55 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 54, !dbg !55 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 55, !dbg !55 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 56, !dbg !55 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 57, !dbg !55 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 58, !dbg !55 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 59, !dbg !55 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 60, !dbg !55 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 61, !dbg !55 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 62, !dbg !55 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 63, !dbg !55 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 64, !dbg !55 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 65, !dbg !55 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 66, !dbg !55 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 67, !dbg !55 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 68, !dbg !55 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 69, !dbg !55 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 70, !dbg !55 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 71, !dbg !55 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 72, !dbg !55 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 73, !dbg !55 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 74, !dbg !55 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 75, !dbg !55 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 76, !dbg !55 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 77, !dbg !55 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 78, !dbg !55 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 79, !dbg !55 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 80, !dbg !55 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 81, !dbg !55 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 82, !dbg !55 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 83, !dbg !55 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 84, !dbg !55 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 85, !dbg !55 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 86, !dbg !55 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 87, !dbg !55 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 88, !dbg !55 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 89, !dbg !55 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 90, !dbg !55 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 91, !dbg !55 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 92, !dbg !55 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 93, !dbg !55 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 94, !dbg !55 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 95, !dbg !55 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 96, !dbg !55 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 97, !dbg !55 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 98, !dbg !55 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 99, !dbg !55 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 100, !dbg !55 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 101, !dbg !55 + %827 = fmul float %731, 0x3FB6A09E60000000, !dbg !60 + %828 = fmul float %732, 0x3FB6A09E60000000, !dbg !60 + %829 = fmul float %733, 0x3FB6A09E60000000, !dbg !60 + %830 = fmul float %734, 0x3FB6A09E60000000, !dbg !60 + %831 = fmul float %735, 0x3FB6A09E60000000, !dbg !60 + %832 = fmul float %736, 0x3FB6A09E60000000, !dbg !60 + %833 = fmul float %737, 0x3FB6A09E60000000, !dbg !60 + %834 = fmul float %738, 0x3FB6A09E60000000, !dbg !60 + %835 = fmul float %739, 0x3FB6A09E60000000, !dbg !60 + %836 = fmul float %740, 0x3FB6A09E60000000, !dbg !60 + %837 = fmul float %741, 0x3FB6A09E60000000, !dbg !60 + %838 = fmul float %742, 0x3FB6A09E60000000, !dbg !60 + %839 = fmul float %743, 0x3FB6A09E60000000, !dbg !60 + %840 = fmul float %744, 0x3FB6A09E60000000, !dbg !60 + %841 = fmul float %745, 0x3FB6A09E60000000, !dbg !60 + %842 = fmul float %746, 0x3FB6A09E60000000, !dbg !60 + %843 = fmul float %747, 0x3FB6A09E60000000, !dbg !60 + %844 = fmul float %748, 0x3FB6A09E60000000, !dbg !60 + %845 = fmul float %749, 0x3FB6A09E60000000, !dbg !60 + %846 = fmul float %750, 0x3FB6A09E60000000, !dbg !60 + %847 = fmul float %751, 0x3FB6A09E60000000, !dbg !60 + %848 = fmul float %752, 0x3FB6A09E60000000, !dbg !60 + %849 = fmul float %753, 0x3FB6A09E60000000, !dbg !60 + %850 = fmul float %754, 0x3FB6A09E60000000, !dbg !60 + %851 = fmul float %755, 0x3FB6A09E60000000, !dbg !60 + %852 = fmul float %756, 0x3FB6A09E60000000, !dbg !60 + %853 = fmul float %757, 0x3FB6A09E60000000, !dbg !60 + %854 = fmul float %758, 0x3FB6A09E60000000, !dbg !60 + %855 = fmul float %759, 0x3FB6A09E60000000, !dbg !60 + %856 = fmul float %760, 0x3FB6A09E60000000, !dbg !60 + %857 = fmul float %761, 0x3FB6A09E60000000, !dbg !60 + %858 = fmul float %762, 0x3FB6A09E60000000, !dbg !60 + %859 = extractelement <16 x i32> %374, i64 15, !dbg !61 + %860 = icmp sge i32 %56, %859, !dbg !61 + %861 = extractelement <16 x i32> %374, i64 14, !dbg !62 + %862 = icmp sge i32 %56, %861, !dbg !61 + %863 = icmp sge i32 %57, %859, !dbg !61 + %864 = icmp sge i32 %57, %861, !dbg !61 + %865 = extractelement <16 x i32> %374, i64 13, !dbg !61 + %866 = icmp sge i32 %56, %865, !dbg !61 + %867 = extractelement <16 x i32> %374, i64 12, !dbg !62 + %868 = icmp sge i32 %56, %867, !dbg !61 + %869 = icmp sge i32 %57, %865, !dbg !61 + %870 = icmp sge i32 %57, %867, !dbg !61 + %871 = extractelement <16 x i32> %374, i64 11, !dbg !61 + %872 = icmp sge i32 %56, %871, !dbg !61 + %873 = extractelement <16 x i32> %374, i64 10, !dbg !62 + %874 = icmp sge i32 %56, %873, !dbg !61 + %875 = icmp sge i32 %57, %871, !dbg !61 + %876 = icmp sge i32 %57, %873, !dbg !61 + %877 = extractelement <16 x i32> %374, i64 9, !dbg !61 + %878 = icmp sge i32 %56, %877, !dbg !61 + %879 = extractelement <16 x i32> %374, i64 8, !dbg !62 + %880 = icmp sge i32 %56, %879, !dbg !61 + %881 = icmp sge i32 %57, %877, !dbg !61 + %882 = icmp sge i32 %57, %879, !dbg !61 + %883 = extractelement <16 x i32> %374, i64 7, !dbg !61 + %884 = icmp sge i32 %56, %883, !dbg !61 + %885 = extractelement <16 x i32> %374, i64 6, !dbg !62 + %886 = icmp sge i32 %56, %885, !dbg !61 + %887 = icmp sge i32 %57, %883, !dbg !61 + %888 = icmp sge i32 %57, %885, !dbg !61 + %889 = extractelement <16 x i32> %374, i64 5, !dbg !61 + %890 = icmp sge i32 %56, %889, !dbg !61 + %891 = extractelement <16 x i32> %374, i64 4, !dbg !62 + %892 = icmp sge i32 %56, %891, !dbg !61 + %893 = icmp sge i32 %57, %889, !dbg !61 + %894 = icmp sge i32 %57, %891, !dbg !61 + %895 = extractelement <16 x i32> %374, i64 3, !dbg !61 + %896 = icmp sge i32 %56, %895, !dbg !61 + %897 = extractelement <16 x i32> %374, i64 2, !dbg !62 + %898 = icmp sge i32 %56, %897, !dbg !61 + %899 = icmp sge i32 %57, %895, !dbg !61 + %900 = icmp sge i32 %57, %897, !dbg !61 + %901 = extractelement <16 x i32> %374, i64 1, !dbg !61 + %902 = icmp sge i32 %56, %901, !dbg !61 + %903 = extractelement <16 x i32> %374, i64 0, !dbg !62 + %904 = icmp sge i32 %56, %903, !dbg !61 + %905 = icmp sge i32 %57, %901, !dbg !61 + %906 = icmp sge i32 %57, %903, !dbg !61 + %907 = and i1 %197, %860, !dbg !63 + %908 = and i1 %197, %862, !dbg !63 + %909 = and i1 %198, %863, !dbg !63 + %910 = and i1 %198, %864, !dbg !63 + %911 = and i1 %197, %866, !dbg !63 + %912 = and i1 %197, %868, !dbg !63 + %913 = and i1 %198, %869, !dbg !63 + %914 = and i1 %198, %870, !dbg !63 + %915 = and i1 %197, %872, !dbg !63 + %916 = and i1 %197, %874, !dbg !63 + %917 = and i1 %198, %875, !dbg !63 + %918 = and i1 %198, %876, !dbg !63 + %919 = and i1 %197, %878, !dbg !63 + %920 = and i1 %197, %880, !dbg !63 + %921 = and i1 %198, %881, !dbg !63 + %922 = and i1 %198, %882, !dbg !63 + %923 = and i1 %197, %884, !dbg !63 + %924 = and i1 %197, %886, !dbg !63 + %925 = and i1 %198, %887, !dbg !63 + %926 = and i1 %198, %888, !dbg !63 + %927 = and i1 %197, %890, !dbg !63 + %928 = and i1 %197, %892, !dbg !63 + %929 = and i1 %198, %893, !dbg !63 + %930 = and i1 %198, %894, !dbg !63 + %931 = and i1 %197, %896, !dbg !63 + %932 = and i1 %197, %898, !dbg !63 + %933 = and i1 %198, %899, !dbg !63 + %934 = and i1 %198, %900, !dbg !63 + %935 = and i1 %197, %902, !dbg !63 + %936 = and i1 %197, %904, !dbg !63 + %937 = and i1 %198, %905, !dbg !63 + %938 = and i1 %198, %906, !dbg !63 + %939 = icmp sgt i32 %859, 2047, !dbg !64 + %940 = icmp sgt i32 %865, 2047, !dbg !64 + %941 = icmp sgt i32 %871, 2047, !dbg !64 + %942 = icmp sgt i32 %877, 2047, !dbg !64 + %943 = icmp sgt i32 %883, 2047, !dbg !64 + %944 = icmp sgt i32 %889, 2047, !dbg !64 + %945 = icmp sgt i32 %895, 2047, !dbg !64 + %946 = icmp sgt i32 %901, 2047, !dbg !64 + %947 = shufflevector <16 x i32> %374, <16 x i32> poison, <8 x i32> , !dbg !62 + %948 = srem <8 x i32> %947, splat (i32 2048), !dbg !62 + %949 = icmp ne <8 x i32> %948, zeroinitializer, !dbg !65 + %950 = shufflevector <16 x i32> %374, <16 x i32> poison, <8 x i32> , !dbg !66 + %951 = and <8 x i32> %950, splat (i32 -2147481601), !dbg !66 + %952 = icmp ugt <8 x i32> %951, splat (i32 -2147483648), !dbg !66 + %953 = trunc <8 x i32> %950 to <8 x i16>, !dbg !67 + %954 = and <8 x i16> %953, splat (i16 2047), !dbg !67 + %955 = zext nneg <8 x i16> %954 to <8 x i64>, !dbg !59 + %956 = icmp sgt <8 x i64> %300, %955, !dbg !59 + %957 = and <8 x i1> %952, %949, !dbg !68 + %958 = add nsw <8 x i32> %948, splat (i32 2048), !dbg !69 + %959 = select <8 x i1> %957, <8 x i32> %958, <8 x i32> %948, !dbg !70 + %960 = sext <8 x i32> %959 to <8 x i64>, !dbg !59 + %961 = icmp sgt <8 x i64> %300, %960, !dbg !59 + %962 = extractelement <8 x i1> %956, i64 7, !dbg !71 + %963 = and i1 %939, %962, !dbg !71 + %964 = extractelement <8 x i1> %961, i64 7, !dbg !71 + %965 = and i1 %939, %964, !dbg !71 + %966 = extractelement <8 x i1> %956, i64 6, !dbg !71 + %967 = and i1 %940, %966, !dbg !71 + %968 = extractelement <8 x i1> %961, i64 6, !dbg !71 + %969 = and i1 %940, %968, !dbg !71 + %970 = extractelement <8 x i1> %956, i64 5, !dbg !71 + %971 = and i1 %941, %970, !dbg !71 + %972 = extractelement <8 x i1> %961, i64 5, !dbg !71 + %973 = and i1 %941, %972, !dbg !71 + %974 = extractelement <8 x i1> %956, i64 4, !dbg !71 + %975 = and i1 %942, %974, !dbg !71 + %976 = extractelement <8 x i1> %961, i64 4, !dbg !71 + %977 = and i1 %942, %976, !dbg !71 + %978 = extractelement <8 x i1> %956, i64 3, !dbg !71 + %979 = and i1 %943, %978, !dbg !71 + %980 = extractelement <8 x i1> %961, i64 3, !dbg !71 + %981 = and i1 %943, %980, !dbg !71 + %982 = extractelement <8 x i1> %956, i64 2, !dbg !71 + %983 = and i1 %944, %982, !dbg !71 + %984 = extractelement <8 x i1> %961, i64 2, !dbg !71 + %985 = and i1 %944, %984, !dbg !71 + %986 = extractelement <8 x i1> %956, i64 1, !dbg !71 + %987 = and i1 %945, %986, !dbg !71 + %988 = extractelement <8 x i1> %961, i64 1, !dbg !71 + %989 = and i1 %945, %988, !dbg !71 + %990 = extractelement <8 x i1> %956, i64 0, !dbg !71 + %991 = and i1 %946, %990, !dbg !71 + %992 = extractelement <8 x i1> %961, i64 0, !dbg !71 + %993 = and i1 %946, %992, !dbg !71 + %994 = sub <32 x i32> %375, %298, !dbg !58 + %995 = and <32 x i32> %994, splat (i32 2047), !dbg !72 + %996 = icmp eq <32 x i32> %995, zeroinitializer, !dbg !73 + %997 = extractelement <32 x i1> %996, i64 31, !dbg !74 + %998 = and i1 %997, %963, !dbg !74 + %999 = extractelement <32 x i1> %996, i64 30, !dbg !74 + %1000 = and i1 %999, %965, !dbg !74 + %1001 = extractelement <32 x i1> %996, i64 29, !dbg !74 + %1002 = and i1 %1001, %963, !dbg !74 + %1003 = extractelement <32 x i1> %996, i64 28, !dbg !74 + %1004 = and i1 %1003, %965, !dbg !74 + %1005 = extractelement <32 x i1> %996, i64 27, !dbg !74 + %1006 = and i1 %1005, %967, !dbg !74 + %1007 = extractelement <32 x i1> %996, i64 26, !dbg !74 + %1008 = and i1 %1007, %969, !dbg !74 + %1009 = extractelement <32 x i1> %996, i64 25, !dbg !74 + %1010 = and i1 %1009, %967, !dbg !74 + %1011 = extractelement <32 x i1> %996, i64 24, !dbg !74 + %1012 = and i1 %1011, %969, !dbg !74 + %1013 = extractelement <32 x i1> %996, i64 23, !dbg !74 + %1014 = and i1 %1013, %971, !dbg !74 + %1015 = extractelement <32 x i1> %996, i64 22, !dbg !74 + %1016 = and i1 %1015, %973, !dbg !74 + %1017 = extractelement <32 x i1> %996, i64 21, !dbg !74 + %1018 = and i1 %1017, %971, !dbg !74 + %1019 = extractelement <32 x i1> %996, i64 20, !dbg !74 + %1020 = and i1 %1019, %973, !dbg !74 + %1021 = extractelement <32 x i1> %996, i64 19, !dbg !74 + %1022 = and i1 %1021, %975, !dbg !74 + %1023 = extractelement <32 x i1> %996, i64 18, !dbg !74 + %1024 = and i1 %1023, %977, !dbg !74 + %1025 = extractelement <32 x i1> %996, i64 17, !dbg !74 + %1026 = and i1 %1025, %975, !dbg !74 + %1027 = extractelement <32 x i1> %996, i64 16, !dbg !74 + %1028 = and i1 %1027, %977, !dbg !74 + %1029 = extractelement <32 x i1> %996, i64 15, !dbg !74 + %1030 = and i1 %1029, %979, !dbg !74 + %1031 = extractelement <32 x i1> %996, i64 14, !dbg !74 + %1032 = and i1 %1031, %981, !dbg !74 + %1033 = extractelement <32 x i1> %996, i64 13, !dbg !74 + %1034 = and i1 %1033, %979, !dbg !74 + %1035 = extractelement <32 x i1> %996, i64 12, !dbg !74 + %1036 = and i1 %1035, %981, !dbg !74 + %1037 = extractelement <32 x i1> %996, i64 11, !dbg !74 + %1038 = and i1 %1037, %983, !dbg !74 + %1039 = extractelement <32 x i1> %996, i64 10, !dbg !74 + %1040 = and i1 %1039, %985, !dbg !74 + %1041 = extractelement <32 x i1> %996, i64 9, !dbg !74 + %1042 = and i1 %1041, %983, !dbg !74 + %1043 = extractelement <32 x i1> %996, i64 8, !dbg !74 + %1044 = and i1 %1043, %985, !dbg !74 + %1045 = extractelement <32 x i1> %996, i64 7, !dbg !74 + %1046 = and i1 %1045, %987, !dbg !74 + %1047 = extractelement <32 x i1> %996, i64 6, !dbg !74 + %1048 = and i1 %1047, %989, !dbg !74 + %1049 = extractelement <32 x i1> %996, i64 5, !dbg !74 + %1050 = and i1 %1049, %987, !dbg !74 + %1051 = extractelement <32 x i1> %996, i64 4, !dbg !74 + %1052 = and i1 %1051, %989, !dbg !74 + %1053 = extractelement <32 x i1> %996, i64 3, !dbg !74 + %1054 = and i1 %1053, %991, !dbg !74 + %1055 = extractelement <32 x i1> %996, i64 2, !dbg !74 + %1056 = and i1 %1055, %993, !dbg !74 + %1057 = extractelement <32 x i1> %996, i64 1, !dbg !74 + %1058 = and i1 %1057, %991, !dbg !74 + %1059 = extractelement <32 x i1> %996, i64 0, !dbg !74 + %1060 = and i1 %1059, %993, !dbg !74 + %1061 = or i1 %907, %998, !dbg !75 + %1062 = or i1 %908, %1000, !dbg !75 + %1063 = or i1 %909, %1002, !dbg !75 + %1064 = or i1 %910, %1004, !dbg !75 + %1065 = or i1 %911, %1006, !dbg !75 + %1066 = or i1 %912, %1008, !dbg !75 + %1067 = or i1 %913, %1010, !dbg !75 + %1068 = or i1 %914, %1012, !dbg !75 + %1069 = or i1 %915, %1014, !dbg !75 + %1070 = or i1 %916, %1016, !dbg !75 + %1071 = or i1 %917, %1018, !dbg !75 + %1072 = or i1 %918, %1020, !dbg !75 + %1073 = or i1 %919, %1022, !dbg !75 + %1074 = or i1 %920, %1024, !dbg !75 + %1075 = or i1 %921, %1026, !dbg !75 + %1076 = or i1 %922, %1028, !dbg !75 + %1077 = or i1 %923, %1030, !dbg !75 + %1078 = or i1 %924, %1032, !dbg !75 + %1079 = or i1 %925, %1034, !dbg !75 + %1080 = or i1 %926, %1036, !dbg !75 + %1081 = or i1 %927, %1038, !dbg !75 + %1082 = or i1 %928, %1040, !dbg !75 + %1083 = or i1 %929, %1042, !dbg !75 + %1084 = or i1 %930, %1044, !dbg !75 + %1085 = or i1 %931, %1046, !dbg !75 + %1086 = or i1 %932, %1048, !dbg !75 + %1087 = or i1 %933, %1050, !dbg !75 + %1088 = or i1 %934, %1052, !dbg !75 + %1089 = or i1 %935, %1054, !dbg !75 + %1090 = or i1 %936, %1056, !dbg !75 + %1091 = or i1 %937, %1058, !dbg !75 + %1092 = or i1 %938, %1060, !dbg !75 + %1093 = fmul float %827, 0x3FF7154760000000, !dbg !76 + %1094 = select i1 %1061, float %1093, float 0xFFF0000000000000, !dbg !77 + %1095 = fmul float %828, 0x3FF7154760000000, !dbg !76 + %1096 = select i1 %1062, float %1095, float 0xFFF0000000000000, !dbg !77 + %1097 = fmul float %829, 0x3FF7154760000000, !dbg !76 + %1098 = select i1 %1063, float %1097, float 0xFFF0000000000000, !dbg !77 + %1099 = fmul float %830, 0x3FF7154760000000, !dbg !76 + %1100 = select i1 %1064, float %1099, float 0xFFF0000000000000, !dbg !77 + %1101 = fmul float %831, 0x3FF7154760000000, !dbg !76 + %1102 = select i1 %1065, float %1101, float 0xFFF0000000000000, !dbg !77 + %1103 = fmul float %832, 0x3FF7154760000000, !dbg !76 + %1104 = select i1 %1066, float %1103, float 0xFFF0000000000000, !dbg !77 + %1105 = fmul float %833, 0x3FF7154760000000, !dbg !76 + %1106 = select i1 %1067, float %1105, float 0xFFF0000000000000, !dbg !77 + %1107 = fmul float %834, 0x3FF7154760000000, !dbg !76 + %1108 = select i1 %1068, float %1107, float 0xFFF0000000000000, !dbg !77 + %1109 = fmul float %835, 0x3FF7154760000000, !dbg !76 + %1110 = select i1 %1069, float %1109, float 0xFFF0000000000000, !dbg !77 + %1111 = fmul float %836, 0x3FF7154760000000, !dbg !76 + %1112 = select i1 %1070, float %1111, float 0xFFF0000000000000, !dbg !77 + %1113 = fmul float %837, 0x3FF7154760000000, !dbg !76 + %1114 = select i1 %1071, float %1113, float 0xFFF0000000000000, !dbg !77 + %1115 = fmul float %838, 0x3FF7154760000000, !dbg !76 + %1116 = select i1 %1072, float %1115, float 0xFFF0000000000000, !dbg !77 + %1117 = fmul float %839, 0x3FF7154760000000, !dbg !76 + %1118 = select i1 %1073, float %1117, float 0xFFF0000000000000, !dbg !77 + %1119 = fmul float %840, 0x3FF7154760000000, !dbg !76 + %1120 = select i1 %1074, float %1119, float 0xFFF0000000000000, !dbg !77 + %1121 = fmul float %841, 0x3FF7154760000000, !dbg !76 + %1122 = select i1 %1075, float %1121, float 0xFFF0000000000000, !dbg !77 + %1123 = fmul float %842, 0x3FF7154760000000, !dbg !76 + %1124 = select i1 %1076, float %1123, float 0xFFF0000000000000, !dbg !77 + %1125 = fmul float %843, 0x3FF7154760000000, !dbg !76 + %1126 = select i1 %1077, float %1125, float 0xFFF0000000000000, !dbg !77 + %1127 = fmul float %844, 0x3FF7154760000000, !dbg !76 + %1128 = select i1 %1078, float %1127, float 0xFFF0000000000000, !dbg !77 + %1129 = fmul float %845, 0x3FF7154760000000, !dbg !76 + %1130 = select i1 %1079, float %1129, float 0xFFF0000000000000, !dbg !77 + %1131 = fmul float %846, 0x3FF7154760000000, !dbg !76 + %1132 = select i1 %1080, float %1131, float 0xFFF0000000000000, !dbg !77 + %1133 = fmul float %847, 0x3FF7154760000000, !dbg !76 + %1134 = select i1 %1081, float %1133, float 0xFFF0000000000000, !dbg !77 + %1135 = fmul float %848, 0x3FF7154760000000, !dbg !76 + %1136 = select i1 %1082, float %1135, float 0xFFF0000000000000, !dbg !77 + %1137 = fmul float %849, 0x3FF7154760000000, !dbg !76 + %1138 = select i1 %1083, float %1137, float 0xFFF0000000000000, !dbg !77 + %1139 = fmul float %850, 0x3FF7154760000000, !dbg !76 + %1140 = select i1 %1084, float %1139, float 0xFFF0000000000000, !dbg !77 + %1141 = fmul float %851, 0x3FF7154760000000, !dbg !76 + %1142 = select i1 %1085, float %1141, float 0xFFF0000000000000, !dbg !77 + %1143 = fmul float %852, 0x3FF7154760000000, !dbg !76 + %1144 = select i1 %1086, float %1143, float 0xFFF0000000000000, !dbg !77 + %1145 = fmul float %853, 0x3FF7154760000000, !dbg !76 + %1146 = select i1 %1087, float %1145, float 0xFFF0000000000000, !dbg !77 + %1147 = fmul float %854, 0x3FF7154760000000, !dbg !76 + %1148 = select i1 %1088, float %1147, float 0xFFF0000000000000, !dbg !77 + %1149 = fmul float %855, 0x3FF7154760000000, !dbg !76 + %1150 = select i1 %1089, float %1149, float 0xFFF0000000000000, !dbg !77 + %1151 = fmul float %856, 0x3FF7154760000000, !dbg !76 + %1152 = select i1 %1090, float %1151, float 0xFFF0000000000000, !dbg !77 + %1153 = fmul float %857, 0x3FF7154760000000, !dbg !76 + %1154 = select i1 %1091, float %1153, float 0xFFF0000000000000, !dbg !77 + %1155 = fmul float %858, 0x3FF7154760000000, !dbg !76 + %1156 = select i1 %1092, float %1155, float 0xFFF0000000000000, !dbg !77 + %1157 = tail call float @llvm.maxnum.f32(float %1094, float %1096), !dbg !78 + %1158 = tail call float @llvm.maxnum.f32(float %1098, float %1100), !dbg !78 + %1159 = tail call float @llvm.maxnum.f32(float %1157, float %1102), !dbg !78 + %1160 = tail call float @llvm.maxnum.f32(float %1159, float %1104), !dbg !78 + %1161 = tail call float @llvm.maxnum.f32(float %1158, float %1106), !dbg !78 + %1162 = tail call float @llvm.maxnum.f32(float %1161, float %1108), !dbg !78 + %1163 = tail call float @llvm.maxnum.f32(float %1160, float %1110), !dbg !78 + %1164 = tail call float @llvm.maxnum.f32(float %1163, float %1112), !dbg !78 + %1165 = tail call float @llvm.maxnum.f32(float %1162, float %1114), !dbg !78 + %1166 = tail call float @llvm.maxnum.f32(float %1165, float %1116), !dbg !78 + %1167 = tail call float @llvm.maxnum.f32(float %1164, float %1118), !dbg !78 + %1168 = tail call float @llvm.maxnum.f32(float %1167, float %1120), !dbg !78 + %1169 = tail call float @llvm.maxnum.f32(float %1166, float %1122), !dbg !78 + %1170 = tail call float @llvm.maxnum.f32(float %1169, float %1124), !dbg !78 + %1171 = tail call float @llvm.maxnum.f32(float %1168, float %1126), !dbg !78 + %1172 = tail call float @llvm.maxnum.f32(float %1171, float %1128), !dbg !78 + %1173 = tail call float @llvm.maxnum.f32(float %1170, float %1130), !dbg !78 + %1174 = tail call float @llvm.maxnum.f32(float %1173, float %1132), !dbg !78 + %1175 = tail call float @llvm.maxnum.f32(float %1172, float %1134), !dbg !78 + %1176 = tail call float @llvm.maxnum.f32(float %1175, float %1136), !dbg !78 + %1177 = tail call float @llvm.maxnum.f32(float %1174, float %1138), !dbg !78 + %1178 = tail call float @llvm.maxnum.f32(float %1177, float %1140), !dbg !78 + %1179 = tail call float @llvm.maxnum.f32(float %1176, float %1142), !dbg !78 + %1180 = tail call float @llvm.maxnum.f32(float %1179, float %1144), !dbg !78 + %1181 = tail call float @llvm.maxnum.f32(float %1178, float %1146), !dbg !78 + %1182 = tail call float @llvm.maxnum.f32(float %1181, float %1148), !dbg !78 + %1183 = tail call float @llvm.maxnum.f32(float %1180, float %1150), !dbg !78 + %1184 = tail call float @llvm.maxnum.f32(float %1183, float %1152), !dbg !78 + %1185 = tail call float @llvm.maxnum.f32(float %1182, float %1154), !dbg !78 + %1186 = tail call float @llvm.maxnum.f32(float %1185, float %1156), !dbg !78 + %1187 = bitcast float %1184 to i32, !dbg !81 + %1188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1187, i32 2, i32 31), !dbg !81 + %1189 = bitcast i32 %1188 to float, !dbg !81 + %1190 = bitcast float %1186 to i32, !dbg !81 + %1191 = tail call float @llvm.maxnum.f32(float %1184, float %1189), !dbg !78 + %1192 = bitcast float %1191 to i32, !dbg !81 + %1193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1192, i32 1, i32 31), !dbg !81 + %1194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1190, i32 2, i32 31), !dbg !81 + %1195 = bitcast i32 %1194 to float, !dbg !81 + %1196 = tail call float @llvm.maxnum.f32(float %1186, float %1195), !dbg !78 + %1197 = bitcast float %1196 to i32, !dbg !81 + %1198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1197, i32 1, i32 31), !dbg !81 + %1199 = insertelement <2 x i32> poison, i32 %1193, i64 0, !dbg !81 + %1200 = insertelement <2 x i32> %1199, i32 %1198, i64 1, !dbg !81 + %1201 = bitcast <2 x i32> %1200 to <2 x float>, !dbg !81 + %1202 = insertelement <2 x float> poison, float %1191, i64 0, !dbg !78 + %1203 = insertelement <2 x float> %1202, float %1196, i64 1, !dbg !78 + %1204 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %1203, <2 x float> %1201), !dbg !78 + %1205 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %373, <2 x float> %1204), !dbg !82 + %1206 = extractelement <2 x float> %1205, i64 0, !dbg !83 + %1207 = fcmp oeq float %1206, 0xFFF0000000000000, !dbg !84 + %1208 = extractelement <2 x float> %1205, i64 1, !dbg !83 + %1209 = fcmp oeq float %1208, 0xFFF0000000000000, !dbg !84 + %1210 = select i1 %1207, float 0.000000e+00, float %1206, !dbg !83 + %1211 = select i1 %1209, float 0.000000e+00, float %1208, !dbg !83 + %1212 = extractelement <2 x float> %373, i64 0, !dbg !85 + %1213 = fsub float %1212, %1210, !dbg !85 + %1214 = extractelement <2 x float> %373, i64 1, !dbg !85 + %1215 = fsub float %1214, %1211, !dbg !85 + %1216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !86 + %.not.i190 = icmp eq i32 %1216, 0, !dbg !86 + br i1 %.not.i190, label %1219, label %1217, !dbg !86 + +1217: ; preds = %301 + %1218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1213) #2, !dbg !86 + br label %__nv_exp2f.exit192, !dbg !86 + +1219: ; preds = %301 + %1220 = tail call float @llvm.nvvm.ex2.approx.f(float %1213) #2, !dbg !86 + br label %__nv_exp2f.exit192, !dbg !86 + +__nv_exp2f.exit192: ; preds = %1217, %1219 + %.0.i191 = phi float [ %1218, %1217 ], [ %1220, %1219 ], !dbg !86 + %1221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !86 + %.not.i193 = icmp eq i32 %1221, 0, !dbg !86 + br i1 %.not.i193, label %1224, label %1222, !dbg !86 + +1222: ; preds = %__nv_exp2f.exit192 + %1223 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1215) #2, !dbg !86 + br label %__nv_exp2f.exit195, !dbg !86 + +1224: ; preds = %__nv_exp2f.exit192 + %1225 = tail call float @llvm.nvvm.ex2.approx.f(float %1215) #2, !dbg !86 + br label %__nv_exp2f.exit195, !dbg !86 + +__nv_exp2f.exit195: ; preds = %1222, %1224 + %.0.i194 = phi float [ %1223, %1222 ], [ %1225, %1224 ], !dbg !86 + %1226 = fsub float %1094, %1210, !dbg !87 + %1227 = fsub float %1096, %1210, !dbg !87 + %1228 = fsub float %1098, %1211, !dbg !87 + %1229 = fsub float %1100, %1211, !dbg !87 + %1230 = fsub float %1102, %1210, !dbg !87 + %1231 = fsub float %1104, %1210, !dbg !87 + %1232 = fsub float %1106, %1211, !dbg !87 + %1233 = fsub float %1108, %1211, !dbg !87 + %1234 = fsub float %1110, %1210, !dbg !87 + %1235 = fsub float %1112, %1210, !dbg !87 + %1236 = fsub float %1114, %1211, !dbg !87 + %1237 = fsub float %1116, %1211, !dbg !87 + %1238 = fsub float %1118, %1210, !dbg !87 + %1239 = fsub float %1120, %1210, !dbg !87 + %1240 = fsub float %1122, %1211, !dbg !87 + %1241 = fsub float %1124, %1211, !dbg !87 + %1242 = fsub float %1126, %1210, !dbg !87 + %1243 = fsub float %1128, %1210, !dbg !87 + %1244 = fsub float %1130, %1211, !dbg !87 + %1245 = fsub float %1132, %1211, !dbg !87 + %1246 = fsub float %1134, %1210, !dbg !87 + %1247 = fsub float %1136, %1210, !dbg !87 + %1248 = fsub float %1138, %1211, !dbg !87 + %1249 = fsub float %1140, %1211, !dbg !87 + %1250 = fsub float %1142, %1210, !dbg !87 + %1251 = fsub float %1144, %1210, !dbg !87 + %1252 = fsub float %1146, %1211, !dbg !87 + %1253 = fsub float %1148, %1211, !dbg !87 + %1254 = fsub float %1150, %1210, !dbg !87 + %1255 = fsub float %1152, %1210, !dbg !87 + %1256 = fsub float %1154, %1211, !dbg !87 + %1257 = fsub float %1156, %1211, !dbg !87 + %1258 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i196 = icmp eq i32 %1258, 0, !dbg !88 + br i1 %.not.i196, label %1261, label %1259, !dbg !88 + +1259: ; preds = %__nv_exp2f.exit195 + %1260 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1226) #2, !dbg !88 + br label %__nv_exp2f.exit198, !dbg !88 + +1261: ; preds = %__nv_exp2f.exit195 + %1262 = tail call float @llvm.nvvm.ex2.approx.f(float %1226) #2, !dbg !88 + br label %__nv_exp2f.exit198, !dbg !88 + +__nv_exp2f.exit198: ; preds = %1259, %1261 + %.0.i197 = phi float [ %1260, %1259 ], [ %1262, %1261 ], !dbg !88 + %1263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i199 = icmp eq i32 %1263, 0, !dbg !88 + br i1 %.not.i199, label %1266, label %1264, !dbg !88 + +1264: ; preds = %__nv_exp2f.exit198 + %1265 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1227) #2, !dbg !88 + br label %__nv_exp2f.exit201, !dbg !88 + +1266: ; preds = %__nv_exp2f.exit198 + %1267 = tail call float @llvm.nvvm.ex2.approx.f(float %1227) #2, !dbg !88 + br label %__nv_exp2f.exit201, !dbg !88 + +__nv_exp2f.exit201: ; preds = %1264, %1266 + %.0.i200 = phi float [ %1265, %1264 ], [ %1267, %1266 ], !dbg !88 + %1268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i202 = icmp eq i32 %1268, 0, !dbg !88 + br i1 %.not.i202, label %1271, label %1269, !dbg !88 + +1269: ; preds = %__nv_exp2f.exit201 + %1270 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1228) #2, !dbg !88 + br label %__nv_exp2f.exit204, !dbg !88 + +1271: ; preds = %__nv_exp2f.exit201 + %1272 = tail call float @llvm.nvvm.ex2.approx.f(float %1228) #2, !dbg !88 + br label %__nv_exp2f.exit204, !dbg !88 + +__nv_exp2f.exit204: ; preds = %1269, %1271 + %.0.i203 = phi float [ %1270, %1269 ], [ %1272, %1271 ], !dbg !88 + %1273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i205 = icmp eq i32 %1273, 0, !dbg !88 + br i1 %.not.i205, label %1276, label %1274, !dbg !88 + +1274: ; preds = %__nv_exp2f.exit204 + %1275 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1229) #2, !dbg !88 + br label %__nv_exp2f.exit207, !dbg !88 + +1276: ; preds = %__nv_exp2f.exit204 + %1277 = tail call float @llvm.nvvm.ex2.approx.f(float %1229) #2, !dbg !88 + br label %__nv_exp2f.exit207, !dbg !88 + +__nv_exp2f.exit207: ; preds = %1274, %1276 + %.0.i206 = phi float [ %1275, %1274 ], [ %1277, %1276 ], !dbg !88 + %1278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i208 = icmp eq i32 %1278, 0, !dbg !88 + br i1 %.not.i208, label %1281, label %1279, !dbg !88 + +1279: ; preds = %__nv_exp2f.exit207 + %1280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1230) #2, !dbg !88 + br label %__nv_exp2f.exit210, !dbg !88 + +1281: ; preds = %__nv_exp2f.exit207 + %1282 = tail call float @llvm.nvvm.ex2.approx.f(float %1230) #2, !dbg !88 + br label %__nv_exp2f.exit210, !dbg !88 + +__nv_exp2f.exit210: ; preds = %1279, %1281 + %.0.i209 = phi float [ %1280, %1279 ], [ %1282, %1281 ], !dbg !88 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i211 = icmp eq i32 %1283, 0, !dbg !88 + br i1 %.not.i211, label %1286, label %1284, !dbg !88 + +1284: ; preds = %__nv_exp2f.exit210 + %1285 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1231) #2, !dbg !88 + br label %__nv_exp2f.exit213, !dbg !88 + +1286: ; preds = %__nv_exp2f.exit210 + %1287 = tail call float @llvm.nvvm.ex2.approx.f(float %1231) #2, !dbg !88 + br label %__nv_exp2f.exit213, !dbg !88 + +__nv_exp2f.exit213: ; preds = %1284, %1286 + %.0.i212 = phi float [ %1285, %1284 ], [ %1287, %1286 ], !dbg !88 + %1288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i214 = icmp eq i32 %1288, 0, !dbg !88 + br i1 %.not.i214, label %1291, label %1289, !dbg !88 + +1289: ; preds = %__nv_exp2f.exit213 + %1290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1232) #2, !dbg !88 + br label %__nv_exp2f.exit216, !dbg !88 + +1291: ; preds = %__nv_exp2f.exit213 + %1292 = tail call float @llvm.nvvm.ex2.approx.f(float %1232) #2, !dbg !88 + br label %__nv_exp2f.exit216, !dbg !88 + +__nv_exp2f.exit216: ; preds = %1289, %1291 + %.0.i215 = phi float [ %1290, %1289 ], [ %1292, %1291 ], !dbg !88 + %1293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i217 = icmp eq i32 %1293, 0, !dbg !88 + br i1 %.not.i217, label %1296, label %1294, !dbg !88 + +1294: ; preds = %__nv_exp2f.exit216 + %1295 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1233) #2, !dbg !88 + br label %__nv_exp2f.exit219, !dbg !88 + +1296: ; preds = %__nv_exp2f.exit216 + %1297 = tail call float @llvm.nvvm.ex2.approx.f(float %1233) #2, !dbg !88 + br label %__nv_exp2f.exit219, !dbg !88 + +__nv_exp2f.exit219: ; preds = %1294, %1296 + %.0.i218 = phi float [ %1295, %1294 ], [ %1297, %1296 ], !dbg !88 + %1298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i220 = icmp eq i32 %1298, 0, !dbg !88 + br i1 %.not.i220, label %1301, label %1299, !dbg !88 + +1299: ; preds = %__nv_exp2f.exit219 + %1300 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1234) #2, !dbg !88 + br label %__nv_exp2f.exit222, !dbg !88 + +1301: ; preds = %__nv_exp2f.exit219 + %1302 = tail call float @llvm.nvvm.ex2.approx.f(float %1234) #2, !dbg !88 + br label %__nv_exp2f.exit222, !dbg !88 + +__nv_exp2f.exit222: ; preds = %1299, %1301 + %.0.i221 = phi float [ %1300, %1299 ], [ %1302, %1301 ], !dbg !88 + %1303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i223 = icmp eq i32 %1303, 0, !dbg !88 + br i1 %.not.i223, label %1306, label %1304, !dbg !88 + +1304: ; preds = %__nv_exp2f.exit222 + %1305 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1235) #2, !dbg !88 + br label %__nv_exp2f.exit225, !dbg !88 + +1306: ; preds = %__nv_exp2f.exit222 + %1307 = tail call float @llvm.nvvm.ex2.approx.f(float %1235) #2, !dbg !88 + br label %__nv_exp2f.exit225, !dbg !88 + +__nv_exp2f.exit225: ; preds = %1304, %1306 + %.0.i224 = phi float [ %1305, %1304 ], [ %1307, %1306 ], !dbg !88 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i226 = icmp eq i32 %1308, 0, !dbg !88 + br i1 %.not.i226, label %1311, label %1309, !dbg !88 + +1309: ; preds = %__nv_exp2f.exit225 + %1310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1236) #2, !dbg !88 + br label %__nv_exp2f.exit228, !dbg !88 + +1311: ; preds = %__nv_exp2f.exit225 + %1312 = tail call float @llvm.nvvm.ex2.approx.f(float %1236) #2, !dbg !88 + br label %__nv_exp2f.exit228, !dbg !88 + +__nv_exp2f.exit228: ; preds = %1309, %1311 + %.0.i227 = phi float [ %1310, %1309 ], [ %1312, %1311 ], !dbg !88 + %1313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i229 = icmp eq i32 %1313, 0, !dbg !88 + br i1 %.not.i229, label %1316, label %1314, !dbg !88 + +1314: ; preds = %__nv_exp2f.exit228 + %1315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1237) #2, !dbg !88 + br label %__nv_exp2f.exit231, !dbg !88 + +1316: ; preds = %__nv_exp2f.exit228 + %1317 = tail call float @llvm.nvvm.ex2.approx.f(float %1237) #2, !dbg !88 + br label %__nv_exp2f.exit231, !dbg !88 + +__nv_exp2f.exit231: ; preds = %1314, %1316 + %.0.i230 = phi float [ %1315, %1314 ], [ %1317, %1316 ], !dbg !88 + %1318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i232 = icmp eq i32 %1318, 0, !dbg !88 + br i1 %.not.i232, label %1321, label %1319, !dbg !88 + +1319: ; preds = %__nv_exp2f.exit231 + %1320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1238) #2, !dbg !88 + br label %__nv_exp2f.exit234, !dbg !88 + +1321: ; preds = %__nv_exp2f.exit231 + %1322 = tail call float @llvm.nvvm.ex2.approx.f(float %1238) #2, !dbg !88 + br label %__nv_exp2f.exit234, !dbg !88 + +__nv_exp2f.exit234: ; preds = %1319, %1321 + %.0.i233 = phi float [ %1320, %1319 ], [ %1322, %1321 ], !dbg !88 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i235 = icmp eq i32 %1323, 0, !dbg !88 + br i1 %.not.i235, label %1326, label %1324, !dbg !88 + +1324: ; preds = %__nv_exp2f.exit234 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1239) #2, !dbg !88 + br label %__nv_exp2f.exit237, !dbg !88 + +1326: ; preds = %__nv_exp2f.exit234 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1239) #2, !dbg !88 + br label %__nv_exp2f.exit237, !dbg !88 + +__nv_exp2f.exit237: ; preds = %1324, %1326 + %.0.i236 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !88 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i238 = icmp eq i32 %1328, 0, !dbg !88 + br i1 %.not.i238, label %1331, label %1329, !dbg !88 + +1329: ; preds = %__nv_exp2f.exit237 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1240) #2, !dbg !88 + br label %__nv_exp2f.exit240, !dbg !88 + +1331: ; preds = %__nv_exp2f.exit237 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1240) #2, !dbg !88 + br label %__nv_exp2f.exit240, !dbg !88 + +__nv_exp2f.exit240: ; preds = %1329, %1331 + %.0.i239 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !88 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i241 = icmp eq i32 %1333, 0, !dbg !88 + br i1 %.not.i241, label %1336, label %1334, !dbg !88 + +1334: ; preds = %__nv_exp2f.exit240 + %1335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1241) #2, !dbg !88 + br label %__nv_exp2f.exit243, !dbg !88 + +1336: ; preds = %__nv_exp2f.exit240 + %1337 = tail call float @llvm.nvvm.ex2.approx.f(float %1241) #2, !dbg !88 + br label %__nv_exp2f.exit243, !dbg !88 + +__nv_exp2f.exit243: ; preds = %1334, %1336 + %.0.i242 = phi float [ %1335, %1334 ], [ %1337, %1336 ], !dbg !88 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i244 = icmp eq i32 %1338, 0, !dbg !88 + br i1 %.not.i244, label %1341, label %1339, !dbg !88 + +1339: ; preds = %__nv_exp2f.exit243 + %1340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1242) #2, !dbg !88 + br label %__nv_exp2f.exit246, !dbg !88 + +1341: ; preds = %__nv_exp2f.exit243 + %1342 = tail call float @llvm.nvvm.ex2.approx.f(float %1242) #2, !dbg !88 + br label %__nv_exp2f.exit246, !dbg !88 + +__nv_exp2f.exit246: ; preds = %1339, %1341 + %.0.i245 = phi float [ %1340, %1339 ], [ %1342, %1341 ], !dbg !88 + %1343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i247 = icmp eq i32 %1343, 0, !dbg !88 + br i1 %.not.i247, label %1346, label %1344, !dbg !88 + +1344: ; preds = %__nv_exp2f.exit246 + %1345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1243) #2, !dbg !88 + br label %__nv_exp2f.exit249, !dbg !88 + +1346: ; preds = %__nv_exp2f.exit246 + %1347 = tail call float @llvm.nvvm.ex2.approx.f(float %1243) #2, !dbg !88 + br label %__nv_exp2f.exit249, !dbg !88 + +__nv_exp2f.exit249: ; preds = %1344, %1346 + %.0.i248 = phi float [ %1345, %1344 ], [ %1347, %1346 ], !dbg !88 + %1348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i250 = icmp eq i32 %1348, 0, !dbg !88 + br i1 %.not.i250, label %1351, label %1349, !dbg !88 + +1349: ; preds = %__nv_exp2f.exit249 + %1350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1244) #2, !dbg !88 + br label %__nv_exp2f.exit252, !dbg !88 + +1351: ; preds = %__nv_exp2f.exit249 + %1352 = tail call float @llvm.nvvm.ex2.approx.f(float %1244) #2, !dbg !88 + br label %__nv_exp2f.exit252, !dbg !88 + +__nv_exp2f.exit252: ; preds = %1349, %1351 + %.0.i251 = phi float [ %1350, %1349 ], [ %1352, %1351 ], !dbg !88 + %1353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i253 = icmp eq i32 %1353, 0, !dbg !88 + br i1 %.not.i253, label %1356, label %1354, !dbg !88 + +1354: ; preds = %__nv_exp2f.exit252 + %1355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1245) #2, !dbg !88 + br label %__nv_exp2f.exit255, !dbg !88 + +1356: ; preds = %__nv_exp2f.exit252 + %1357 = tail call float @llvm.nvvm.ex2.approx.f(float %1245) #2, !dbg !88 + br label %__nv_exp2f.exit255, !dbg !88 + +__nv_exp2f.exit255: ; preds = %1354, %1356 + %.0.i254 = phi float [ %1355, %1354 ], [ %1357, %1356 ], !dbg !88 + %1358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i256 = icmp eq i32 %1358, 0, !dbg !88 + br i1 %.not.i256, label %1361, label %1359, !dbg !88 + +1359: ; preds = %__nv_exp2f.exit255 + %1360 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1246) #2, !dbg !88 + br label %__nv_exp2f.exit258, !dbg !88 + +1361: ; preds = %__nv_exp2f.exit255 + %1362 = tail call float @llvm.nvvm.ex2.approx.f(float %1246) #2, !dbg !88 + br label %__nv_exp2f.exit258, !dbg !88 + +__nv_exp2f.exit258: ; preds = %1359, %1361 + %.0.i257 = phi float [ %1360, %1359 ], [ %1362, %1361 ], !dbg !88 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i259 = icmp eq i32 %1363, 0, !dbg !88 + br i1 %.not.i259, label %1366, label %1364, !dbg !88 + +1364: ; preds = %__nv_exp2f.exit258 + %1365 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1247) #2, !dbg !88 + br label %__nv_exp2f.exit261, !dbg !88 + +1366: ; preds = %__nv_exp2f.exit258 + %1367 = tail call float @llvm.nvvm.ex2.approx.f(float %1247) #2, !dbg !88 + br label %__nv_exp2f.exit261, !dbg !88 + +__nv_exp2f.exit261: ; preds = %1364, %1366 + %.0.i260 = phi float [ %1365, %1364 ], [ %1367, %1366 ], !dbg !88 + %1368 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i262 = icmp eq i32 %1368, 0, !dbg !88 + br i1 %.not.i262, label %1371, label %1369, !dbg !88 + +1369: ; preds = %__nv_exp2f.exit261 + %1370 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1248) #2, !dbg !88 + br label %__nv_exp2f.exit264, !dbg !88 + +1371: ; preds = %__nv_exp2f.exit261 + %1372 = tail call float @llvm.nvvm.ex2.approx.f(float %1248) #2, !dbg !88 + br label %__nv_exp2f.exit264, !dbg !88 + +__nv_exp2f.exit264: ; preds = %1369, %1371 + %.0.i263 = phi float [ %1370, %1369 ], [ %1372, %1371 ], !dbg !88 + %1373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i265 = icmp eq i32 %1373, 0, !dbg !88 + br i1 %.not.i265, label %1376, label %1374, !dbg !88 + +1374: ; preds = %__nv_exp2f.exit264 + %1375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1249) #2, !dbg !88 + br label %__nv_exp2f.exit267, !dbg !88 + +1376: ; preds = %__nv_exp2f.exit264 + %1377 = tail call float @llvm.nvvm.ex2.approx.f(float %1249) #2, !dbg !88 + br label %__nv_exp2f.exit267, !dbg !88 + +__nv_exp2f.exit267: ; preds = %1374, %1376 + %.0.i266 = phi float [ %1375, %1374 ], [ %1377, %1376 ], !dbg !88 + %1378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i268 = icmp eq i32 %1378, 0, !dbg !88 + br i1 %.not.i268, label %1381, label %1379, !dbg !88 + +1379: ; preds = %__nv_exp2f.exit267 + %1380 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1250) #2, !dbg !88 + br label %__nv_exp2f.exit270, !dbg !88 + +1381: ; preds = %__nv_exp2f.exit267 + %1382 = tail call float @llvm.nvvm.ex2.approx.f(float %1250) #2, !dbg !88 + br label %__nv_exp2f.exit270, !dbg !88 + +__nv_exp2f.exit270: ; preds = %1379, %1381 + %.0.i269 = phi float [ %1380, %1379 ], [ %1382, %1381 ], !dbg !88 + %1383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i271 = icmp eq i32 %1383, 0, !dbg !88 + br i1 %.not.i271, label %1386, label %1384, !dbg !88 + +1384: ; preds = %__nv_exp2f.exit270 + %1385 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1251) #2, !dbg !88 + br label %__nv_exp2f.exit273, !dbg !88 + +1386: ; preds = %__nv_exp2f.exit270 + %1387 = tail call float @llvm.nvvm.ex2.approx.f(float %1251) #2, !dbg !88 + br label %__nv_exp2f.exit273, !dbg !88 + +__nv_exp2f.exit273: ; preds = %1384, %1386 + %.0.i272 = phi float [ %1385, %1384 ], [ %1387, %1386 ], !dbg !88 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i274 = icmp eq i32 %1388, 0, !dbg !88 + br i1 %.not.i274, label %1391, label %1389, !dbg !88 + +1389: ; preds = %__nv_exp2f.exit273 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1252) #2, !dbg !88 + br label %__nv_exp2f.exit276, !dbg !88 + +1391: ; preds = %__nv_exp2f.exit273 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1252) #2, !dbg !88 + br label %__nv_exp2f.exit276, !dbg !88 + +__nv_exp2f.exit276: ; preds = %1389, %1391 + %.0.i275 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !88 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i277 = icmp eq i32 %1393, 0, !dbg !88 + br i1 %.not.i277, label %1396, label %1394, !dbg !88 + +1394: ; preds = %__nv_exp2f.exit276 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1253) #2, !dbg !88 + br label %__nv_exp2f.exit279, !dbg !88 + +1396: ; preds = %__nv_exp2f.exit276 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1253) #2, !dbg !88 + br label %__nv_exp2f.exit279, !dbg !88 + +__nv_exp2f.exit279: ; preds = %1394, %1396 + %.0.i278 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !88 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i280 = icmp eq i32 %1398, 0, !dbg !88 + br i1 %.not.i280, label %1401, label %1399, !dbg !88 + +1399: ; preds = %__nv_exp2f.exit279 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1254) #2, !dbg !88 + br label %__nv_exp2f.exit282, !dbg !88 + +1401: ; preds = %__nv_exp2f.exit279 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1254) #2, !dbg !88 + br label %__nv_exp2f.exit282, !dbg !88 + +__nv_exp2f.exit282: ; preds = %1399, %1401 + %.0.i281 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !88 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i283 = icmp eq i32 %1403, 0, !dbg !88 + br i1 %.not.i283, label %1406, label %1404, !dbg !88 + +1404: ; preds = %__nv_exp2f.exit282 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1255) #2, !dbg !88 + br label %__nv_exp2f.exit285, !dbg !88 + +1406: ; preds = %__nv_exp2f.exit282 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1255) #2, !dbg !88 + br label %__nv_exp2f.exit285, !dbg !88 + +__nv_exp2f.exit285: ; preds = %1404, %1406 + %.0.i284 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !88 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i286 = icmp eq i32 %1408, 0, !dbg !88 + br i1 %.not.i286, label %1411, label %1409, !dbg !88 + +1409: ; preds = %__nv_exp2f.exit285 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1256) #2, !dbg !88 + br label %__nv_exp2f.exit288, !dbg !88 + +1411: ; preds = %__nv_exp2f.exit285 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1256) #2, !dbg !88 + br label %__nv_exp2f.exit288, !dbg !88 + +__nv_exp2f.exit288: ; preds = %1409, %1411 + %.0.i287 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !88 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i289 = icmp eq i32 %1413, 0, !dbg !88 + br i1 %.not.i289, label %1416, label %1414, !dbg !88 + +1414: ; preds = %__nv_exp2f.exit288 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1257) #2, !dbg !88 + br label %__nv_exp2f.exit291, !dbg !88 + +1416: ; preds = %__nv_exp2f.exit288 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1257) #2, !dbg !88 + br label %__nv_exp2f.exit291, !dbg !88 + +__nv_exp2f.exit291: ; preds = %1414, %1416 + %.0.i290 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !88 + %1418 = fmul float %306, %.0.i191, !dbg !89 + %1419 = fmul float %307, %.0.i194, !dbg !89 + %1420 = fadd float %.0.i197, %.0.i200, !dbg !90 + %1421 = fadd float %.0.i203, %.0.i206, !dbg !90 + %1422 = fadd float %1420, %.0.i209, !dbg !90 + %1423 = fadd float %1422, %.0.i212, !dbg !90 + %1424 = fadd float %1421, %.0.i215, !dbg !90 + %1425 = fadd float %1424, %.0.i218, !dbg !90 + %1426 = fadd float %1423, %.0.i221, !dbg !90 + %1427 = fadd float %1426, %.0.i224, !dbg !90 + %1428 = fadd float %1425, %.0.i227, !dbg !90 + %1429 = fadd float %1428, %.0.i230, !dbg !90 + %1430 = fadd float %1427, %.0.i233, !dbg !90 + %1431 = fadd float %1430, %.0.i236, !dbg !90 + %1432 = fadd float %1429, %.0.i239, !dbg !90 + %1433 = fadd float %1432, %.0.i242, !dbg !90 + %1434 = fadd float %1431, %.0.i245, !dbg !90 + %1435 = fadd float %1434, %.0.i248, !dbg !90 + %1436 = fadd float %1433, %.0.i251, !dbg !90 + %1437 = fadd float %1436, %.0.i254, !dbg !90 + %1438 = fadd float %1435, %.0.i257, !dbg !90 + %1439 = fadd float %1438, %.0.i260, !dbg !90 + %1440 = fadd float %1437, %.0.i263, !dbg !90 + %1441 = fadd float %1440, %.0.i266, !dbg !90 + %1442 = fadd float %1439, %.0.i269, !dbg !90 + %1443 = fadd float %1442, %.0.i272, !dbg !90 + %1444 = fadd float %1441, %.0.i275, !dbg !90 + %1445 = fadd float %1444, %.0.i278, !dbg !90 + %1446 = fadd float %1443, %.0.i281, !dbg !90 + %1447 = fadd float %1446, %.0.i284, !dbg !90 + %1448 = fadd float %1445, %.0.i287, !dbg !90 + %1449 = fadd float %1448, %.0.i290, !dbg !90 + %1450 = bitcast float %1447 to i32, !dbg !91 + %1451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1450, i32 2, i32 31), !dbg !91 + %1452 = bitcast i32 %1451 to float, !dbg !91 + %1453 = fadd float %1447, %1452, !dbg !90 + %1454 = bitcast float %1453 to i32, !dbg !91 + %1455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1454, i32 1, i32 31), !dbg !91 + %1456 = bitcast i32 %1455 to float, !dbg !91 + %1457 = fadd float %1453, %1456, !dbg !90 + %1458 = bitcast float %1449 to i32, !dbg !91 + %1459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1458, i32 2, i32 31), !dbg !91 + %1460 = bitcast i32 %1459 to float, !dbg !91 + %1461 = fadd float %1449, %1460, !dbg !90 + %1462 = bitcast float %1461 to i32, !dbg !91 + %1463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1462, i32 1, i32 31), !dbg !91 + %1464 = bitcast i32 %1463 to float, !dbg !91 + %1465 = fadd float %1461, %1464, !dbg !90 + %1466 = fadd float %1418, %1457, !dbg !92 + %1467 = fadd float %1419, %1465, !dbg !92 + %1468 = fmul float %763, %.0.i191, !dbg !93 + %1469 = fmul float %764, %.0.i191, !dbg !93 + %1470 = fmul float %765, %.0.i194, !dbg !93 + %1471 = fmul float %766, %.0.i194, !dbg !93 + %1472 = fmul float %767, %.0.i191, !dbg !93 + %1473 = fmul float %768, %.0.i191, !dbg !93 + %1474 = fmul float %769, %.0.i194, !dbg !93 + %1475 = fmul float %770, %.0.i194, !dbg !93 + %1476 = fmul float %771, %.0.i191, !dbg !93 + %1477 = fmul float %772, %.0.i191, !dbg !93 + %1478 = fmul float %773, %.0.i194, !dbg !93 + %1479 = fmul float %774, %.0.i194, !dbg !93 + %1480 = fmul float %775, %.0.i191, !dbg !93 + %1481 = fmul float %776, %.0.i191, !dbg !93 + %1482 = fmul float %777, %.0.i194, !dbg !93 + %1483 = fmul float %778, %.0.i194, !dbg !93 + %1484 = fmul float %779, %.0.i191, !dbg !93 + %1485 = fmul float %780, %.0.i191, !dbg !93 + %1486 = fmul float %781, %.0.i194, !dbg !93 + %1487 = fmul float %782, %.0.i194, !dbg !93 + %1488 = fmul float %783, %.0.i191, !dbg !93 + %1489 = fmul float %784, %.0.i191, !dbg !93 + %1490 = fmul float %785, %.0.i194, !dbg !93 + %1491 = fmul float %786, %.0.i194, !dbg !93 + %1492 = fmul float %787, %.0.i191, !dbg !93 + %1493 = fmul float %788, %.0.i191, !dbg !93 + %1494 = fmul float %789, %.0.i194, !dbg !93 + %1495 = fmul float %790, %.0.i194, !dbg !93 + %1496 = fmul float %791, %.0.i191, !dbg !93 + %1497 = fmul float %792, %.0.i191, !dbg !93 + %1498 = fmul float %793, %.0.i194, !dbg !93 + %1499 = fmul float %794, %.0.i194, !dbg !93 + %1500 = fmul float %795, %.0.i191, !dbg !93 + %1501 = fmul float %796, %.0.i191, !dbg !93 + %1502 = fmul float %797, %.0.i194, !dbg !93 + %1503 = fmul float %798, %.0.i194, !dbg !93 + %1504 = fmul float %799, %.0.i191, !dbg !93 + %1505 = fmul float %800, %.0.i191, !dbg !93 + %1506 = fmul float %801, %.0.i194, !dbg !93 + %1507 = fmul float %802, %.0.i194, !dbg !93 + %1508 = fmul float %803, %.0.i191, !dbg !93 + %1509 = fmul float %804, %.0.i191, !dbg !93 + %1510 = fmul float %805, %.0.i194, !dbg !93 + %1511 = fmul float %806, %.0.i194, !dbg !93 + %1512 = fmul float %807, %.0.i191, !dbg !93 + %1513 = fmul float %808, %.0.i191, !dbg !93 + %1514 = fmul float %809, %.0.i194, !dbg !93 + %1515 = fmul float %810, %.0.i194, !dbg !93 + %1516 = fmul float %811, %.0.i191, !dbg !93 + %1517 = fmul float %812, %.0.i191, !dbg !93 + %1518 = fmul float %813, %.0.i194, !dbg !93 + %1519 = fmul float %814, %.0.i194, !dbg !93 + %1520 = fmul float %815, %.0.i191, !dbg !93 + %1521 = fmul float %816, %.0.i191, !dbg !93 + %1522 = fmul float %817, %.0.i194, !dbg !93 + %1523 = fmul float %818, %.0.i194, !dbg !93 + %1524 = fmul float %819, %.0.i191, !dbg !93 + %1525 = fmul float %820, %.0.i191, !dbg !93 + %1526 = fmul float %821, %.0.i194, !dbg !93 + %1527 = fmul float %822, %.0.i194, !dbg !93 + %1528 = fmul float %823, %.0.i191, !dbg !93 + %1529 = fmul float %824, %.0.i191, !dbg !93 + %1530 = fmul float %825, %.0.i194, !dbg !93 + %1531 = fmul float %826, %.0.i194, !dbg !93 + %1532 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %381, !dbg !53 + %1533 = insertelement <2 x float> poison, float %.0.i197, i64 0, !dbg !94 + %1534 = insertelement <2 x float> %1533, float %.0.i200, i64 1, !dbg !94 + %1535 = fptrunc <2 x float> %1534 to <2 x bfloat>, !dbg !94 + %1536 = insertelement <2 x float> poison, float %.0.i203, i64 0, !dbg !94 + %1537 = insertelement <2 x float> %1536, float %.0.i206, i64 1, !dbg !94 + %1538 = fptrunc <2 x float> %1537 to <2 x bfloat>, !dbg !94 + %1539 = insertelement <2 x float> poison, float %.0.i209, i64 0, !dbg !94 + %1540 = insertelement <2 x float> %1539, float %.0.i212, i64 1, !dbg !94 + %1541 = fptrunc <2 x float> %1540 to <2 x bfloat>, !dbg !94 + %1542 = insertelement <2 x float> poison, float %.0.i215, i64 0, !dbg !94 + %1543 = insertelement <2 x float> %1542, float %.0.i218, i64 1, !dbg !94 + %1544 = fptrunc <2 x float> %1543 to <2 x bfloat>, !dbg !94 + %1545 = insertelement <2 x float> poison, float %.0.i221, i64 0, !dbg !94 + %1546 = insertelement <2 x float> %1545, float %.0.i224, i64 1, !dbg !94 + %1547 = fptrunc <2 x float> %1546 to <2 x bfloat>, !dbg !94 + %1548 = insertelement <2 x float> poison, float %.0.i227, i64 0, !dbg !94 + %1549 = insertelement <2 x float> %1548, float %.0.i230, i64 1, !dbg !94 + %1550 = fptrunc <2 x float> %1549 to <2 x bfloat>, !dbg !94 + %1551 = insertelement <2 x float> poison, float %.0.i233, i64 0, !dbg !94 + %1552 = insertelement <2 x float> %1551, float %.0.i236, i64 1, !dbg !94 + %1553 = fptrunc <2 x float> %1552 to <2 x bfloat>, !dbg !94 + %1554 = insertelement <2 x float> poison, float %.0.i239, i64 0, !dbg !94 + %1555 = insertelement <2 x float> %1554, float %.0.i242, i64 1, !dbg !94 + %1556 = fptrunc <2 x float> %1555 to <2 x bfloat>, !dbg !94 + %1557 = insertelement <2 x float> poison, float %.0.i245, i64 0, !dbg !94 + %1558 = insertelement <2 x float> %1557, float %.0.i248, i64 1, !dbg !94 + %1559 = fptrunc <2 x float> %1558 to <2 x bfloat>, !dbg !94 + %1560 = insertelement <2 x float> poison, float %.0.i251, i64 0, !dbg !94 + %1561 = insertelement <2 x float> %1560, float %.0.i254, i64 1, !dbg !94 + %1562 = fptrunc <2 x float> %1561 to <2 x bfloat>, !dbg !94 + %1563 = insertelement <2 x float> poison, float %.0.i257, i64 0, !dbg !94 + %1564 = insertelement <2 x float> %1563, float %.0.i260, i64 1, !dbg !94 + %1565 = fptrunc <2 x float> %1564 to <2 x bfloat>, !dbg !94 + %1566 = insertelement <2 x float> poison, float %.0.i263, i64 0, !dbg !94 + %1567 = insertelement <2 x float> %1566, float %.0.i266, i64 1, !dbg !94 + %1568 = fptrunc <2 x float> %1567 to <2 x bfloat>, !dbg !94 + %1569 = insertelement <2 x float> poison, float %.0.i269, i64 0, !dbg !94 + %1570 = insertelement <2 x float> %1569, float %.0.i272, i64 1, !dbg !94 + %1571 = fptrunc <2 x float> %1570 to <2 x bfloat>, !dbg !94 + %1572 = insertelement <2 x float> poison, float %.0.i275, i64 0, !dbg !94 + %1573 = insertelement <2 x float> %1572, float %.0.i278, i64 1, !dbg !94 + %1574 = fptrunc <2 x float> %1573 to <2 x bfloat>, !dbg !94 + %1575 = insertelement <2 x float> poison, float %.0.i281, i64 0, !dbg !94 + %1576 = insertelement <2 x float> %1575, float %.0.i284, i64 1, !dbg !94 + %1577 = fptrunc <2 x float> %1576 to <2 x bfloat>, !dbg !94 + %1578 = insertelement <2 x float> poison, float %.0.i287, i64 0, !dbg !94 + %1579 = insertelement <2 x float> %1578, float %.0.i290, i64 1, !dbg !94 + %1580 = fptrunc <2 x float> %1579 to <2 x bfloat>, !dbg !94 + %1581 = bitcast <2 x bfloat> %1535 to i32, !dbg !95 + %1582 = bitcast <2 x bfloat> %1538 to i32, !dbg !95 + %1583 = bitcast <2 x bfloat> %1541 to i32, !dbg !95 + %1584 = bitcast <2 x bfloat> %1544 to i32, !dbg !95 + %1585 = bitcast <2 x bfloat> %1547 to i32, !dbg !95 + %1586 = bitcast <2 x bfloat> %1550 to i32, !dbg !95 + %1587 = bitcast <2 x bfloat> %1553 to i32, !dbg !95 + %1588 = bitcast <2 x bfloat> %1556 to i32, !dbg !95 + %1589 = bitcast <2 x bfloat> %1559 to i32, !dbg !95 + %1590 = bitcast <2 x bfloat> %1562 to i32, !dbg !95 + %1591 = bitcast <2 x bfloat> %1565 to i32, !dbg !95 + %1592 = bitcast <2 x bfloat> %1568 to i32, !dbg !95 + %1593 = bitcast <2 x bfloat> %1571 to i32, !dbg !95 + %1594 = bitcast <2 x bfloat> %1574 to i32, !dbg !95 + %1595 = bitcast <2 x bfloat> %1577 to i32, !dbg !95 + %1596 = bitcast <2 x bfloat> %1580 to i32, !dbg !95 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !95 + %1597 = ptrtoint ptr addrspace(3) %1532 to i32, !dbg !95 + %1598 = lshr exact i32 %1597, 4, !dbg !95 + %1599 = and i32 %1598, 16383, !dbg !95 + %1600 = zext nneg i32 %1599 to i64, !dbg !95 + %1601 = or disjoint i64 %1600, 4611686293338849280, !dbg !95 + %1602 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1468, float %1469, float %1470, float %1471, float %1472, float %1473, float %1474, float %1475, float %1476, float %1477, float %1478, float %1479, float %1480, float %1481, float %1482, float %1483, float %1484, float %1485, float %1486, float %1487, float %1488, float %1489, float %1490, float %1491, float %1492, float %1493, float %1494, float %1495, float %1496, float %1497, float %1498, float %1499, float %1500, float %1501, float %1502, float %1503, float %1504, float %1505, float %1506, float %1507, float %1508, float %1509, float %1510, float %1511, float %1512, float %1513, float %1514, float %1515, float %1516, float %1517, float %1518, float %1519, float %1520, float %1521, float %1522, float %1523, float %1524, float %1525, float %1526, float %1527, float %1528, float %1529, float %1530, float %1531, i32 %1581, i32 %1582, i32 %1583, i32 %1584, i64 %1601, i1 true) #2, !dbg !95 + %1603 = add i32 %1597, 2048, !dbg !95 + %1604 = lshr exact i32 %1603, 4, !dbg !95 + %1605 = and i32 %1604, 16383, !dbg !95 + %1606 = zext nneg i32 %1605 to i64, !dbg !95 + %1607 = or disjoint i64 %1606, 4611686293338849280, !dbg !95 + %1608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 0, !dbg !95 + %1609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 1, !dbg !95 + %1610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 2, !dbg !95 + %1611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 3, !dbg !95 + %1612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 4, !dbg !95 + %1613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 5, !dbg !95 + %1614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 6, !dbg !95 + %1615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 7, !dbg !95 + %1616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 8, !dbg !95 + %1617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 9, !dbg !95 + %1618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 10, !dbg !95 + %1619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 11, !dbg !95 + %1620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 12, !dbg !95 + %1621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 13, !dbg !95 + %1622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 14, !dbg !95 + %1623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 15, !dbg !95 + %1624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 16, !dbg !95 + %1625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 17, !dbg !95 + %1626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 18, !dbg !95 + %1627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 19, !dbg !95 + %1628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 20, !dbg !95 + %1629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 21, !dbg !95 + %1630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 22, !dbg !95 + %1631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 23, !dbg !95 + %1632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 24, !dbg !95 + %1633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 25, !dbg !95 + %1634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 26, !dbg !95 + %1635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 27, !dbg !95 + %1636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 28, !dbg !95 + %1637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 29, !dbg !95 + %1638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 30, !dbg !95 + %1639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 31, !dbg !95 + %1640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 32, !dbg !95 + %1641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 33, !dbg !95 + %1642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 34, !dbg !95 + %1643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 35, !dbg !95 + %1644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 36, !dbg !95 + %1645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 37, !dbg !95 + %1646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 38, !dbg !95 + %1647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 39, !dbg !95 + %1648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 40, !dbg !95 + %1649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 41, !dbg !95 + %1650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 42, !dbg !95 + %1651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 43, !dbg !95 + %1652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 44, !dbg !95 + %1653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 45, !dbg !95 + %1654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 46, !dbg !95 + %1655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 47, !dbg !95 + %1656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 48, !dbg !95 + %1657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 49, !dbg !95 + %1658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 50, !dbg !95 + %1659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 51, !dbg !95 + %1660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 52, !dbg !95 + %1661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 53, !dbg !95 + %1662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 54, !dbg !95 + %1663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 55, !dbg !95 + %1664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 56, !dbg !95 + %1665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 57, !dbg !95 + %1666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 58, !dbg !95 + %1667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 59, !dbg !95 + %1668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 60, !dbg !95 + %1669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 61, !dbg !95 + %1670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 62, !dbg !95 + %1671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 63, !dbg !95 + %1672 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, float %1628, float %1629, float %1630, float %1631, float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, float %1662, float %1663, float %1664, float %1665, float %1666, float %1667, float %1668, float %1669, float %1670, float %1671, i32 %1585, i32 %1586, i32 %1587, i32 %1588, i64 %1607, i1 true) #2, !dbg !95 + %1673 = add i32 %1597, 4096, !dbg !95 + %1674 = lshr exact i32 %1673, 4, !dbg !95 + %1675 = and i32 %1674, 16383, !dbg !95 + %1676 = zext nneg i32 %1675 to i64, !dbg !95 + %1677 = or disjoint i64 %1676, 4611686293338849280, !dbg !95 + %1678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 0, !dbg !95 + %1679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 1, !dbg !95 + %1680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 2, !dbg !95 + %1681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 3, !dbg !95 + %1682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 4, !dbg !95 + %1683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 5, !dbg !95 + %1684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 6, !dbg !95 + %1685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 7, !dbg !95 + %1686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 8, !dbg !95 + %1687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 9, !dbg !95 + %1688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 10, !dbg !95 + %1689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 11, !dbg !95 + %1690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 12, !dbg !95 + %1691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 13, !dbg !95 + %1692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 14, !dbg !95 + %1693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 15, !dbg !95 + %1694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 16, !dbg !95 + %1695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 17, !dbg !95 + %1696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 18, !dbg !95 + %1697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 19, !dbg !95 + %1698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 20, !dbg !95 + %1699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 21, !dbg !95 + %1700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 22, !dbg !95 + %1701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 23, !dbg !95 + %1702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 24, !dbg !95 + %1703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 25, !dbg !95 + %1704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 26, !dbg !95 + %1705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 27, !dbg !95 + %1706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 28, !dbg !95 + %1707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 29, !dbg !95 + %1708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 30, !dbg !95 + %1709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 31, !dbg !95 + %1710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 32, !dbg !95 + %1711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 33, !dbg !95 + %1712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 34, !dbg !95 + %1713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 35, !dbg !95 + %1714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 36, !dbg !95 + %1715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 37, !dbg !95 + %1716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 38, !dbg !95 + %1717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 39, !dbg !95 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 40, !dbg !95 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 41, !dbg !95 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 42, !dbg !95 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 43, !dbg !95 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 44, !dbg !95 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 45, !dbg !95 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 46, !dbg !95 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 47, !dbg !95 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 48, !dbg !95 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 49, !dbg !95 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 50, !dbg !95 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 51, !dbg !95 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 52, !dbg !95 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 53, !dbg !95 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 54, !dbg !95 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 55, !dbg !95 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 56, !dbg !95 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 57, !dbg !95 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 58, !dbg !95 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 59, !dbg !95 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 60, !dbg !95 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 61, !dbg !95 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 62, !dbg !95 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 63, !dbg !95 + %1742 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1678, float %1679, float %1680, float %1681, float %1682, float %1683, float %1684, float %1685, float %1686, float %1687, float %1688, float %1689, float %1690, float %1691, float %1692, float %1693, float %1694, float %1695, float %1696, float %1697, float %1698, float %1699, float %1700, float %1701, float %1702, float %1703, float %1704, float %1705, float %1706, float %1707, float %1708, float %1709, float %1710, float %1711, float %1712, float %1713, float %1714, float %1715, float %1716, float %1717, float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, i32 %1589, i32 %1590, i32 %1591, i32 %1592, i64 %1677, i1 true) #2, !dbg !95 + %1743 = add i32 %1597, 6144, !dbg !95 + %1744 = lshr exact i32 %1743, 4, !dbg !95 + %1745 = and i32 %1744, 16383, !dbg !95 + %1746 = zext nneg i32 %1745 to i64, !dbg !95 + %1747 = or disjoint i64 %1746, 4611686293338849280, !dbg !95 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 0, !dbg !95 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 1, !dbg !95 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 2, !dbg !95 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 3, !dbg !95 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 4, !dbg !95 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 5, !dbg !95 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 6, !dbg !95 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 7, !dbg !95 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 8, !dbg !95 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 9, !dbg !95 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 10, !dbg !95 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 11, !dbg !95 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 12, !dbg !95 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 13, !dbg !95 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 14, !dbg !95 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 15, !dbg !95 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 16, !dbg !95 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 17, !dbg !95 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 18, !dbg !95 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 19, !dbg !95 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 20, !dbg !95 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 21, !dbg !95 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 22, !dbg !95 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 23, !dbg !95 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 24, !dbg !95 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 25, !dbg !95 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 26, !dbg !95 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 27, !dbg !95 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 28, !dbg !95 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 29, !dbg !95 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 30, !dbg !95 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 31, !dbg !95 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 32, !dbg !95 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 33, !dbg !95 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 34, !dbg !95 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 35, !dbg !95 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 36, !dbg !95 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 37, !dbg !95 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 38, !dbg !95 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 39, !dbg !95 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 40, !dbg !95 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 41, !dbg !95 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 42, !dbg !95 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 43, !dbg !95 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 44, !dbg !95 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 45, !dbg !95 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 46, !dbg !95 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 47, !dbg !95 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 48, !dbg !95 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 49, !dbg !95 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 50, !dbg !95 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 51, !dbg !95 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 52, !dbg !95 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 53, !dbg !95 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 54, !dbg !95 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 55, !dbg !95 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 56, !dbg !95 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 57, !dbg !95 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 58, !dbg !95 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 59, !dbg !95 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 60, !dbg !95 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 61, !dbg !95 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 62, !dbg !95 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 63, !dbg !95 + %1812 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, float %1800, float %1801, float %1802, float %1803, float %1804, float %1805, float %1806, float %1807, float %1808, float %1809, float %1810, float %1811, i32 %1593, i32 %1594, i32 %1595, i32 %1596, i64 %1747, i1 true) #2, !dbg !95 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 0, !dbg !95 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 1, !dbg !95 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 2, !dbg !95 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 3, !dbg !95 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 4, !dbg !95 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 5, !dbg !95 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 6, !dbg !95 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 7, !dbg !95 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 8, !dbg !95 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 9, !dbg !95 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 10, !dbg !95 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 11, !dbg !95 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 12, !dbg !95 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 13, !dbg !95 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 14, !dbg !95 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 15, !dbg !95 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 16, !dbg !95 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 17, !dbg !95 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 18, !dbg !95 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 19, !dbg !95 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 20, !dbg !95 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 21, !dbg !95 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 22, !dbg !95 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 23, !dbg !95 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 24, !dbg !95 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 25, !dbg !95 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 26, !dbg !95 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 27, !dbg !95 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 28, !dbg !95 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 29, !dbg !95 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 30, !dbg !95 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 31, !dbg !95 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 32, !dbg !95 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 33, !dbg !95 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 34, !dbg !95 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 35, !dbg !95 + %1849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 36, !dbg !95 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 37, !dbg !95 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 38, !dbg !95 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 39, !dbg !95 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 40, !dbg !95 + %1854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 41, !dbg !95 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 42, !dbg !95 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 43, !dbg !95 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 44, !dbg !95 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 45, !dbg !95 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 46, !dbg !95 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 47, !dbg !95 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 48, !dbg !95 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 49, !dbg !95 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 50, !dbg !95 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 51, !dbg !95 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 52, !dbg !95 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 53, !dbg !95 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 54, !dbg !95 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 55, !dbg !95 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 56, !dbg !95 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 57, !dbg !95 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 58, !dbg !95 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 59, !dbg !95 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 60, !dbg !95 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 61, !dbg !95 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 62, !dbg !95 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 63, !dbg !95 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !95 + %1877 = insertelement <16 x i32> poison, i32 %302, i64 0, !dbg !96 + %1878 = shufflevector <16 x i32> %1877, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !96 + %1879 = add <16 x i32> %1878, %374, !dbg !96 + %1880 = add nuw nsw i32 %372, 1, !dbg !45 + %1881 = lshr i32 %1880, 1, !dbg !97 + %1882 = zext nneg i32 %1881 to i64, !dbg !98 + %1883 = getelementptr i32, ptr addrspace(1) %183, i64 %1882, !dbg !98 + %1884 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !99 + %1885 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1883, i64 %1884, i1 %377) #2, !dbg !99 + %1886 = add nuw nsw i32 %1881, 1, !dbg !100 + %1887 = icmp slt i32 %1886, %188, !dbg !101 + %1888 = getelementptr i8, ptr addrspace(1) %1883, i64 4, !dbg !102 + %1889 = and i1 %377, %1887, !dbg !45 + %1890 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !103 + %1891 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1888, i64 %1890, i1 %1889) #2, !dbg !103 + %1892 = and i32 %372, 1, !dbg !104 + %1893 = sub i32 %1891, %1885, !dbg !105 + %1894 = shl i32 %1893, 7, !dbg !106 + %1895 = add i32 %1894, -64, !dbg !107 + %1896 = xor i32 %1892, 1, !dbg !108 + %1897 = mul nuw nsw i32 %1895, %1896, !dbg !108 + %1898 = shl nuw nsw i32 %1892, 6, !dbg !109 + %1899 = add i32 %1897, %1898, !dbg !110 + %1900 = add i32 %1899, %305, !dbg !111 + %1901 = add i32 %304, 1, !dbg !45 + %1902 = icmp sgt i32 %1901, 2, !dbg !45 + %1903 = select i1 %1902, i32 0, i32 %1901, !dbg !45 + %1904 = add i32 %1900, %185, !dbg !54 + %1905 = add i32 %1904, %39, !dbg !49 + %1906 = add i32 %1904, %40, !dbg !49 + %1907 = add i32 %1904, %41, !dbg !49 + %1908 = add i32 %1904, %42, !dbg !49 + %1909 = shl i32 %1905, 7, !dbg !50 + %1910 = shl i32 %1906, 7, !dbg !50 + %1911 = shl i32 %1907, 7, !dbg !50 + %1912 = shl i32 %1908, 7, !dbg !50 + %1913 = sext i32 %1909 to i64, !dbg !51 + %1914 = sext i32 %1910 to i64, !dbg !51 + %1915 = sext i32 %1911 to i64, !dbg !51 + %1916 = sext i32 %1912 to i64, !dbg !51 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1913, !dbg !52 + %gep330 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1914, !dbg !52 + %gep332 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1915, !dbg !52 + %gep334 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1916, !dbg !52 + %1917 = shl i32 %1903, 13, !dbg !53 + %1918 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %1917, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1919 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %220, !dbg !53 + %1920 = select i1 %376, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %1919, ptr addrspace(1) %gep, i32 %1920) #2, !dbg !53 + %1921 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1921, ptr addrspace(1) %gep330, i32 %1920) #2, !dbg !53 + %1922 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1922, ptr addrspace(1) %gep332, i32 %1920) #2, !dbg !53 + %1923 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1923, ptr addrspace(1) %gep334, i32 %1920) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %gep336 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1913, !dbg !52 + %gep338 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1914, !dbg !52 + %gep340 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1915, !dbg !52 + %gep342 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1916, !dbg !52 + %1924 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %1917, !dbg !53 + %1925 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %1925, ptr addrspace(1) %gep336, i32 %1920) #2, !dbg !53 + %1926 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1926, ptr addrspace(1) %gep338, i32 %1920) #2, !dbg !53 + %1927 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1927, ptr addrspace(1) %gep340, i32 %1920) #2, !dbg !53 + %1928 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1928, ptr addrspace(1) %gep342, i32 %1920) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %exitcond.not = icmp eq i32 %1880, %280, !dbg !45 + br i1 %exitcond.not, label %._crit_edge, label %301, !dbg !45 + +._crit_edge: ; preds = %__nv_exp2f.exit291, %13 + %1929 = phi float [ 0.000000e+00, %13 ], [ %1813, %__nv_exp2f.exit291 ], !dbg !112 + %1930 = phi float [ 0.000000e+00, %13 ], [ %1814, %__nv_exp2f.exit291 ], !dbg !112 + %1931 = phi float [ 0.000000e+00, %13 ], [ %1815, %__nv_exp2f.exit291 ], !dbg !112 + %1932 = phi float [ 0.000000e+00, %13 ], [ %1816, %__nv_exp2f.exit291 ], !dbg !112 + %1933 = phi float [ 0.000000e+00, %13 ], [ %1817, %__nv_exp2f.exit291 ], !dbg !112 + %1934 = phi float [ 0.000000e+00, %13 ], [ %1818, %__nv_exp2f.exit291 ], !dbg !112 + %1935 = phi float [ 0.000000e+00, %13 ], [ %1819, %__nv_exp2f.exit291 ], !dbg !112 + %1936 = phi float [ 0.000000e+00, %13 ], [ %1820, %__nv_exp2f.exit291 ], !dbg !112 + %1937 = phi float [ 0.000000e+00, %13 ], [ %1821, %__nv_exp2f.exit291 ], !dbg !112 + %1938 = phi float [ 0.000000e+00, %13 ], [ %1822, %__nv_exp2f.exit291 ], !dbg !112 + %1939 = phi float [ 0.000000e+00, %13 ], [ %1823, %__nv_exp2f.exit291 ], !dbg !112 + %1940 = phi float [ 0.000000e+00, %13 ], [ %1824, %__nv_exp2f.exit291 ], !dbg !112 + %1941 = phi float [ 0.000000e+00, %13 ], [ %1825, %__nv_exp2f.exit291 ], !dbg !112 + %1942 = phi float [ 0.000000e+00, %13 ], [ %1826, %__nv_exp2f.exit291 ], !dbg !112 + %1943 = phi float [ 0.000000e+00, %13 ], [ %1827, %__nv_exp2f.exit291 ], !dbg !112 + %1944 = phi float [ 0.000000e+00, %13 ], [ %1828, %__nv_exp2f.exit291 ], !dbg !112 + %1945 = phi float [ 0.000000e+00, %13 ], [ %1829, %__nv_exp2f.exit291 ], !dbg !112 + %1946 = phi float [ 0.000000e+00, %13 ], [ %1830, %__nv_exp2f.exit291 ], !dbg !112 + %1947 = phi float [ 0.000000e+00, %13 ], [ %1831, %__nv_exp2f.exit291 ], !dbg !112 + %1948 = phi float [ 0.000000e+00, %13 ], [ %1832, %__nv_exp2f.exit291 ], !dbg !112 + %1949 = phi float [ 0.000000e+00, %13 ], [ %1833, %__nv_exp2f.exit291 ], !dbg !112 + %1950 = phi float [ 0.000000e+00, %13 ], [ %1834, %__nv_exp2f.exit291 ], !dbg !112 + %1951 = phi float [ 0.000000e+00, %13 ], [ %1835, %__nv_exp2f.exit291 ], !dbg !112 + %1952 = phi float [ 0.000000e+00, %13 ], [ %1836, %__nv_exp2f.exit291 ], !dbg !112 + %1953 = phi float [ 0.000000e+00, %13 ], [ %1837, %__nv_exp2f.exit291 ], !dbg !112 + %1954 = phi float [ 0.000000e+00, %13 ], [ %1838, %__nv_exp2f.exit291 ], !dbg !112 + %1955 = phi float [ 0.000000e+00, %13 ], [ %1839, %__nv_exp2f.exit291 ], !dbg !112 + %1956 = phi float [ 0.000000e+00, %13 ], [ %1840, %__nv_exp2f.exit291 ], !dbg !112 + %1957 = phi float [ 0.000000e+00, %13 ], [ %1841, %__nv_exp2f.exit291 ], !dbg !112 + %1958 = phi float [ 0.000000e+00, %13 ], [ %1842, %__nv_exp2f.exit291 ], !dbg !112 + %1959 = phi float [ 0.000000e+00, %13 ], [ %1843, %__nv_exp2f.exit291 ], !dbg !112 + %1960 = phi float [ 0.000000e+00, %13 ], [ %1844, %__nv_exp2f.exit291 ], !dbg !112 + %1961 = phi float [ 0.000000e+00, %13 ], [ %1845, %__nv_exp2f.exit291 ], !dbg !112 + %1962 = phi float [ 0.000000e+00, %13 ], [ %1846, %__nv_exp2f.exit291 ], !dbg !112 + %1963 = phi float [ 0.000000e+00, %13 ], [ %1847, %__nv_exp2f.exit291 ], !dbg !112 + %1964 = phi float [ 0.000000e+00, %13 ], [ %1848, %__nv_exp2f.exit291 ], !dbg !112 + %1965 = phi float [ 0.000000e+00, %13 ], [ %1849, %__nv_exp2f.exit291 ], !dbg !112 + %1966 = phi float [ 0.000000e+00, %13 ], [ %1850, %__nv_exp2f.exit291 ], !dbg !112 + %1967 = phi float [ 0.000000e+00, %13 ], [ %1851, %__nv_exp2f.exit291 ], !dbg !112 + %1968 = phi float [ 0.000000e+00, %13 ], [ %1852, %__nv_exp2f.exit291 ], !dbg !112 + %1969 = phi float [ 0.000000e+00, %13 ], [ %1853, %__nv_exp2f.exit291 ], !dbg !112 + %1970 = phi float [ 0.000000e+00, %13 ], [ %1854, %__nv_exp2f.exit291 ], !dbg !112 + %1971 = phi float [ 0.000000e+00, %13 ], [ %1855, %__nv_exp2f.exit291 ], !dbg !112 + %1972 = phi float [ 0.000000e+00, %13 ], [ %1856, %__nv_exp2f.exit291 ], !dbg !112 + %1973 = phi float [ 0.000000e+00, %13 ], [ %1857, %__nv_exp2f.exit291 ], !dbg !112 + %1974 = phi float [ 0.000000e+00, %13 ], [ %1858, %__nv_exp2f.exit291 ], !dbg !112 + %1975 = phi float [ 0.000000e+00, %13 ], [ %1859, %__nv_exp2f.exit291 ], !dbg !112 + %1976 = phi float [ 0.000000e+00, %13 ], [ %1860, %__nv_exp2f.exit291 ], !dbg !112 + %1977 = phi float [ 0.000000e+00, %13 ], [ %1861, %__nv_exp2f.exit291 ], !dbg !112 + %1978 = phi float [ 0.000000e+00, %13 ], [ %1862, %__nv_exp2f.exit291 ], !dbg !112 + %1979 = phi float [ 0.000000e+00, %13 ], [ %1863, %__nv_exp2f.exit291 ], !dbg !112 + %1980 = phi float [ 0.000000e+00, %13 ], [ %1864, %__nv_exp2f.exit291 ], !dbg !112 + %1981 = phi float [ 0.000000e+00, %13 ], [ %1865, %__nv_exp2f.exit291 ], !dbg !112 + %1982 = phi float [ 0.000000e+00, %13 ], [ %1866, %__nv_exp2f.exit291 ], !dbg !112 + %1983 = phi float [ 0.000000e+00, %13 ], [ %1867, %__nv_exp2f.exit291 ], !dbg !112 + %1984 = phi float [ 0.000000e+00, %13 ], [ %1868, %__nv_exp2f.exit291 ], !dbg !112 + %1985 = phi float [ 0.000000e+00, %13 ], [ %1869, %__nv_exp2f.exit291 ], !dbg !112 + %1986 = phi float [ 0.000000e+00, %13 ], [ %1870, %__nv_exp2f.exit291 ], !dbg !112 + %1987 = phi float [ 0.000000e+00, %13 ], [ %1871, %__nv_exp2f.exit291 ], !dbg !112 + %1988 = phi float [ 0.000000e+00, %13 ], [ %1872, %__nv_exp2f.exit291 ], !dbg !112 + %1989 = phi float [ 0.000000e+00, %13 ], [ %1873, %__nv_exp2f.exit291 ], !dbg !112 + %1990 = phi float [ 0.000000e+00, %13 ], [ %1874, %__nv_exp2f.exit291 ], !dbg !112 + %1991 = phi float [ 0.000000e+00, %13 ], [ %1875, %__nv_exp2f.exit291 ], !dbg !112 + %1992 = phi float [ 0.000000e+00, %13 ], [ %1876, %__nv_exp2f.exit291 ], !dbg !112 + %1993 = phi float [ 0.000000e+00, %13 ], [ %1466, %__nv_exp2f.exit291 ] + %1994 = phi float [ 0.000000e+00, %13 ], [ %1467, %__nv_exp2f.exit291 ] + %1995 = phi <2 x float> [ splat (float 0xFFF0000000000000), %13 ], [ %1205, %__nv_exp2f.exit291 ] + %1996 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %1929, float %1930, float %1931, float %1932, float %1933, float %1934, float %1935, float %1936, float %1937, float %1938, float %1939, float %1940, float %1941, float %1942, float %1943, float %1944, float %1945, float %1946, float %1947, float %1948, float %1949, float %1950, float %1951, float %1952, float %1953, float %1954, float %1955, float %1956, float %1957, float %1958, float %1959, float %1960, float %1961, float %1962, float %1963, float %1964, float %1965, float %1966, float %1967, float %1968, float %1969, float %1970, float %1971, float %1972, float %1973, float %1974, float %1975, float %1976, float %1977, float %1978, float %1979, float %1980, float %1981, float %1982, float %1983, float %1984, float %1985, float %1986, float %1987, float %1988, float %1989, float %1990, float %1991, float %1992) #2, !dbg !45 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %1997 = getelementptr i32, ptr addrspace(1) %8, i64 %182, !dbg !113 + %1998 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %1997) #2, !dbg !114 + %1999 = shl i32 %1998, 7, !dbg !115 + %2000 = getelementptr i32, ptr addrspace(1) %7, i64 %186, !dbg !116 + %2001 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2000) #2, !dbg !117 + %2002 = shl i32 %2001, 1, !dbg !118 + %2003 = icmp sgt i32 %2002, 0, !dbg !119 + %2004 = or disjoint i32 %1999, %39, !dbg !121 + %2005 = or disjoint i32 %1999, %40, !dbg !121 + %2006 = or disjoint i32 %1999, %41, !dbg !121 + %2007 = or disjoint i32 %1999, %42, !dbg !121 + %2008 = shl i32 %2004, 7, !dbg !122 + %2009 = shl i32 %2005, 7, !dbg !122 + %2010 = shl i32 %2006, 7, !dbg !122 + %2011 = shl i32 %2007, 7, !dbg !122 + %2012 = sext i32 %2008 to i64, !dbg !123 + %2013 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2012, !dbg !123 + %2014 = sext i32 %2009 to i64, !dbg !123 + %2015 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2014, !dbg !123 + %2016 = sext i32 %2010 to i64, !dbg !123 + %2017 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2016, !dbg !123 + %2018 = sext i32 %2011 to i64, !dbg !123 + %2019 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2018, !dbg !123 + %2020 = getelementptr bfloat, ptr addrspace(1) %2013, i64 %84, !dbg !124 + %2021 = getelementptr bfloat, ptr addrspace(1) %2015, i64 %84, !dbg !124 + %2022 = getelementptr bfloat, ptr addrspace(1) %2017, i64 %84, !dbg !124 + %2023 = getelementptr bfloat, ptr addrspace(1) %2019, i64 %84, !dbg !124 + %2024 = select i1 %2003, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %221, ptr addrspace(1) %2020, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %224, ptr addrspace(1) %2021, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %226, ptr addrspace(1) %2022, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %228, ptr addrspace(1) %2023, i32 %2024) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2025 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2012, !dbg !123 + %2026 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2014, !dbg !123 + %2027 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2016, !dbg !123 + %2028 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2018, !dbg !123 + %2029 = getelementptr bfloat, ptr addrspace(1) %2025, i64 %84, !dbg !124 + %2030 = getelementptr bfloat, ptr addrspace(1) %2026, i64 %84, !dbg !124 + %2031 = getelementptr bfloat, ptr addrspace(1) %2027, i64 %84, !dbg !124 + %2032 = getelementptr bfloat, ptr addrspace(1) %2028, i64 %84, !dbg !124 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %237, ptr addrspace(1) %2029, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %238, ptr addrspace(1) %2030, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %239, ptr addrspace(1) %2031, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %240, ptr addrspace(1) %2032, i32 %2024) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2033 = icmp sgt i32 %2002, 1, !dbg !119 + %2034 = or disjoint i32 %1999, 64, !dbg !126 + %2035 = or disjoint i32 %2034, %39, !dbg !121 + %2036 = or disjoint i32 %2034, %40, !dbg !121 + %2037 = or disjoint i32 %2034, %41, !dbg !121 + %2038 = or disjoint i32 %2034, %42, !dbg !121 + %2039 = shl i32 %2035, 7, !dbg !122 + %2040 = shl i32 %2036, 7, !dbg !122 + %2041 = shl i32 %2037, 7, !dbg !122 + %2042 = shl i32 %2038, 7, !dbg !122 + %2043 = sext i32 %2039 to i64, !dbg !123 + %2044 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2043, !dbg !123 + %2045 = sext i32 %2040 to i64, !dbg !123 + %2046 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2045, !dbg !123 + %2047 = sext i32 %2041 to i64, !dbg !123 + %2048 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2047, !dbg !123 + %2049 = sext i32 %2042 to i64, !dbg !123 + %2050 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2049, !dbg !123 + %2051 = getelementptr bfloat, ptr addrspace(1) %2044, i64 %84, !dbg !124 + %2052 = getelementptr bfloat, ptr addrspace(1) %2046, i64 %84, !dbg !124 + %2053 = getelementptr bfloat, ptr addrspace(1) %2048, i64 %84, !dbg !124 + %2054 = getelementptr bfloat, ptr addrspace(1) %2050, i64 %84, !dbg !124 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %2055 = select i1 %2033, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %263, ptr addrspace(1) %2051, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %2052, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %266, ptr addrspace(1) %2053, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %2054, i32 %2055) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2056 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2043, !dbg !123 + %2057 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2045, !dbg !123 + %2058 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2047, !dbg !123 + %2059 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2049, !dbg !123 + %2060 = getelementptr bfloat, ptr addrspace(1) %2056, i64 %84, !dbg !124 + %2061 = getelementptr bfloat, ptr addrspace(1) %2057, i64 %84, !dbg !124 + %2062 = getelementptr bfloat, ptr addrspace(1) %2058, i64 %84, !dbg !124 + %2063 = getelementptr bfloat, ptr addrspace(1) %2059, i64 %84, !dbg !124 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %276, ptr addrspace(1) %2060, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %2061, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %2062, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %279, ptr addrspace(1) %2063, i32 %2055) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !127 + %2064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 0, !dbg !119 + %2065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 1, !dbg !119 + %2066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 2, !dbg !119 + %2067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 3, !dbg !119 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 4, !dbg !119 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 5, !dbg !119 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 6, !dbg !119 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 7, !dbg !119 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 8, !dbg !119 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 9, !dbg !119 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 10, !dbg !119 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 11, !dbg !119 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 12, !dbg !119 + %2077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 13, !dbg !119 + %2078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 14, !dbg !119 + %2079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 15, !dbg !119 + %2080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 16, !dbg !119 + %2081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 17, !dbg !119 + %2082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 18, !dbg !119 + %2083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 19, !dbg !119 + %2084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 20, !dbg !119 + %2085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 21, !dbg !119 + %2086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 22, !dbg !119 + %2087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 23, !dbg !119 + %2088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 24, !dbg !119 + %2089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 25, !dbg !119 + %2090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 26, !dbg !119 + %2091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 27, !dbg !119 + %2092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 28, !dbg !119 + %2093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 29, !dbg !119 + %2094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 30, !dbg !119 + %2095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 31, !dbg !119 + %2096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 32, !dbg !119 + %2097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 33, !dbg !119 + %2098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 34, !dbg !119 + %2099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 35, !dbg !119 + %2100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 36, !dbg !119 + %2101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 37, !dbg !119 + %2102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 38, !dbg !119 + %2103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 39, !dbg !119 + %2104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 40, !dbg !119 + %2105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 41, !dbg !119 + %2106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 42, !dbg !119 + %2107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 43, !dbg !119 + %2108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 44, !dbg !119 + %2109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 45, !dbg !119 + %2110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 46, !dbg !119 + %2111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 47, !dbg !119 + %2112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 48, !dbg !119 + %2113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 49, !dbg !119 + %2114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 50, !dbg !119 + %2115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 51, !dbg !119 + %2116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 52, !dbg !119 + %2117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 53, !dbg !119 + %2118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 54, !dbg !119 + %2119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 55, !dbg !119 + %2120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 56, !dbg !119 + %2121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 57, !dbg !119 + %2122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 58, !dbg !119 + %2123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 59, !dbg !119 + %2124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 60, !dbg !119 + %2125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 61, !dbg !119 + %2126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 62, !dbg !119 + %2127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 63, !dbg !119 + br i1 %2003, label %.lr.ph378, label %._crit_edge379, !dbg !119 + +.lr.ph378: ; preds = %._crit_edge + %2128 = tail call i32 @llvm.umin.i32(i32 %2002, i32 32), !dbg !128 + %2129 = add nsw i32 %2128, -2 + %2130 = add nsw i32 %2128, -1 + %2131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 0, !dbg !119 + %2132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 1, !dbg !119 + %2133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 2, !dbg !119 + %2134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 3, !dbg !119 + %2135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 4, !dbg !119 + %2136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 5, !dbg !119 + %2137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 6, !dbg !119 + %2138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 7, !dbg !119 + %2139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 8, !dbg !119 + %2140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 9, !dbg !119 + %2141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 10, !dbg !119 + %2142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 11, !dbg !119 + %2143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 12, !dbg !119 + %2144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 13, !dbg !119 + %2145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 14, !dbg !119 + %2146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 15, !dbg !119 + %2147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 16, !dbg !119 + %2148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 17, !dbg !119 + %2149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 18, !dbg !119 + %2150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 19, !dbg !119 + %2151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 20, !dbg !119 + %2152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 21, !dbg !119 + %2153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 22, !dbg !119 + %2154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 23, !dbg !119 + %2155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 24, !dbg !119 + %2156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 25, !dbg !119 + %2157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 26, !dbg !119 + %2158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 27, !dbg !119 + %2159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 28, !dbg !119 + %2160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 29, !dbg !119 + %2161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 30, !dbg !119 + %2162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 31, !dbg !119 + %2163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 32, !dbg !119 + %2164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 33, !dbg !119 + %2165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 34, !dbg !119 + %2166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 35, !dbg !119 + %2167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 36, !dbg !119 + %2168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 37, !dbg !119 + %2169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 38, !dbg !119 + %2170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 39, !dbg !119 + %2171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 40, !dbg !119 + %2172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 41, !dbg !119 + %2173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 42, !dbg !119 + %2174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 43, !dbg !119 + %2175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 44, !dbg !119 + %2176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 45, !dbg !119 + %2177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 46, !dbg !119 + %2178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 47, !dbg !119 + %2179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 48, !dbg !119 + %2180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 49, !dbg !119 + %2181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 50, !dbg !119 + %2182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 51, !dbg !119 + %2183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 52, !dbg !119 + %2184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 53, !dbg !119 + %2185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 54, !dbg !119 + %2186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 55, !dbg !119 + %2187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 56, !dbg !119 + %2188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 57, !dbg !119 + %2189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 58, !dbg !119 + %2190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 59, !dbg !119 + %2191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 60, !dbg !119 + %2192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 61, !dbg !119 + %2193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 62, !dbg !119 + %2194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 63, !dbg !119 + br label %2195, !dbg !119 + +2195: ; preds = %.lr.ph378, %__nv_exp2f.exit189 + %2196 = phi i32 [ -1, %.lr.ph378 ], [ %2205, %__nv_exp2f.exit189 ] + %2197 = phi i32 [ 1, %.lr.ph378 ], [ %3459, %__nv_exp2f.exit189 ] + %2198 = phi i32 [ 64, %.lr.ph378 ], [ %3456, %__nv_exp2f.exit189 ] + %.pn = phi float [ %2131, %.lr.ph378 ], [ %3372, %__nv_exp2f.exit189 ] + %.pn397 = phi float [ %2132, %.lr.ph378 ], [ %3373, %__nv_exp2f.exit189 ] + %.pn398 = phi float [ %2133, %.lr.ph378 ], [ %3374, %__nv_exp2f.exit189 ] + %.pn399 = phi float [ %2134, %.lr.ph378 ], [ %3375, %__nv_exp2f.exit189 ] + %.pn400 = phi float [ %2135, %.lr.ph378 ], [ %3376, %__nv_exp2f.exit189 ] + %.pn401 = phi float [ %2136, %.lr.ph378 ], [ %3377, %__nv_exp2f.exit189 ] + %.pn402 = phi float [ %2137, %.lr.ph378 ], [ %3378, %__nv_exp2f.exit189 ] + %.pn403 = phi float [ %2138, %.lr.ph378 ], [ %3379, %__nv_exp2f.exit189 ] + %.pn404 = phi float [ %2139, %.lr.ph378 ], [ %3380, %__nv_exp2f.exit189 ] + %.pn405 = phi float [ %2140, %.lr.ph378 ], [ %3381, %__nv_exp2f.exit189 ] + %.pn406 = phi float [ %2141, %.lr.ph378 ], [ %3382, %__nv_exp2f.exit189 ] + %.pn407 = phi float [ %2142, %.lr.ph378 ], [ %3383, %__nv_exp2f.exit189 ] + %.pn408 = phi float [ %2143, %.lr.ph378 ], [ %3384, %__nv_exp2f.exit189 ] + %.pn409 = phi float [ %2144, %.lr.ph378 ], [ %3385, %__nv_exp2f.exit189 ] + %.pn410 = phi float [ %2145, %.lr.ph378 ], [ %3386, %__nv_exp2f.exit189 ] + %.pn411 = phi float [ %2146, %.lr.ph378 ], [ %3387, %__nv_exp2f.exit189 ] + %.pn412 = phi float [ %2147, %.lr.ph378 ], [ %3388, %__nv_exp2f.exit189 ] + %.pn413 = phi float [ %2148, %.lr.ph378 ], [ %3389, %__nv_exp2f.exit189 ] + %.pn414 = phi float [ %2149, %.lr.ph378 ], [ %3390, %__nv_exp2f.exit189 ] + %.pn415 = phi float [ %2150, %.lr.ph378 ], [ %3391, %__nv_exp2f.exit189 ] + %.pn416 = phi float [ %2151, %.lr.ph378 ], [ %3392, %__nv_exp2f.exit189 ] + %.pn417 = phi float [ %2152, %.lr.ph378 ], [ %3393, %__nv_exp2f.exit189 ] + %.pn418 = phi float [ %2153, %.lr.ph378 ], [ %3394, %__nv_exp2f.exit189 ] + %.pn419 = phi float [ %2154, %.lr.ph378 ], [ %3395, %__nv_exp2f.exit189 ] + %.pn420 = phi float [ %2155, %.lr.ph378 ], [ %3396, %__nv_exp2f.exit189 ] + %.pn421 = phi float [ %2156, %.lr.ph378 ], [ %3397, %__nv_exp2f.exit189 ] + %.pn422 = phi float [ %2157, %.lr.ph378 ], [ %3398, %__nv_exp2f.exit189 ] + %.pn423 = phi float [ %2158, %.lr.ph378 ], [ %3399, %__nv_exp2f.exit189 ] + %.pn424 = phi float [ %2159, %.lr.ph378 ], [ %3400, %__nv_exp2f.exit189 ] + %.pn425 = phi float [ %2160, %.lr.ph378 ], [ %3401, %__nv_exp2f.exit189 ] + %.pn426 = phi float [ %2161, %.lr.ph378 ], [ %3402, %__nv_exp2f.exit189 ] + %.pn427 = phi float [ %2162, %.lr.ph378 ], [ %3403, %__nv_exp2f.exit189 ] + %.pn428 = phi float [ %2163, %.lr.ph378 ], [ %3404, %__nv_exp2f.exit189 ] + %.pn429 = phi float [ %2164, %.lr.ph378 ], [ %3405, %__nv_exp2f.exit189 ] + %.pn430 = phi float [ %2165, %.lr.ph378 ], [ %3406, %__nv_exp2f.exit189 ] + %.pn431 = phi float [ %2166, %.lr.ph378 ], [ %3407, %__nv_exp2f.exit189 ] + %.pn432 = phi float [ %2167, %.lr.ph378 ], [ %3408, %__nv_exp2f.exit189 ] + %.pn433 = phi float [ %2168, %.lr.ph378 ], [ %3409, %__nv_exp2f.exit189 ] + %.pn434 = phi float [ %2169, %.lr.ph378 ], [ %3410, %__nv_exp2f.exit189 ] + %.pn435 = phi float [ %2170, %.lr.ph378 ], [ %3411, %__nv_exp2f.exit189 ] + %.pn436 = phi float [ %2171, %.lr.ph378 ], [ %3412, %__nv_exp2f.exit189 ] + %.pn437 = phi float [ %2172, %.lr.ph378 ], [ %3413, %__nv_exp2f.exit189 ] + %.pn438 = phi float [ %2173, %.lr.ph378 ], [ %3414, %__nv_exp2f.exit189 ] + %.pn439 = phi float [ %2174, %.lr.ph378 ], [ %3415, %__nv_exp2f.exit189 ] + %.pn440 = phi float [ %2175, %.lr.ph378 ], [ %3416, %__nv_exp2f.exit189 ] + %.pn441 = phi float [ %2176, %.lr.ph378 ], [ %3417, %__nv_exp2f.exit189 ] + %.pn442 = phi float [ %2177, %.lr.ph378 ], [ %3418, %__nv_exp2f.exit189 ] + %.pn443 = phi float [ %2178, %.lr.ph378 ], [ %3419, %__nv_exp2f.exit189 ] + %.pn444 = phi float [ %2179, %.lr.ph378 ], [ %3420, %__nv_exp2f.exit189 ] + %.pn445 = phi float [ %2180, %.lr.ph378 ], [ %3421, %__nv_exp2f.exit189 ] + %.pn446 = phi float [ %2181, %.lr.ph378 ], [ %3422, %__nv_exp2f.exit189 ] + %.pn447 = phi float [ %2182, %.lr.ph378 ], [ %3423, %__nv_exp2f.exit189 ] + %.pn448 = phi float [ %2183, %.lr.ph378 ], [ %3424, %__nv_exp2f.exit189 ] + %.pn449 = phi float [ %2184, %.lr.ph378 ], [ %3425, %__nv_exp2f.exit189 ] + %.pn450 = phi float [ %2185, %.lr.ph378 ], [ %3426, %__nv_exp2f.exit189 ] + %.pn451 = phi float [ %2186, %.lr.ph378 ], [ %3427, %__nv_exp2f.exit189 ] + %.pn452 = phi float [ %2187, %.lr.ph378 ], [ %3428, %__nv_exp2f.exit189 ] + %.pn453 = phi float [ %2188, %.lr.ph378 ], [ %3429, %__nv_exp2f.exit189 ] + %.pn454 = phi float [ %2189, %.lr.ph378 ], [ %3430, %__nv_exp2f.exit189 ] + %.pn455 = phi float [ %2190, %.lr.ph378 ], [ %3431, %__nv_exp2f.exit189 ] + %.pn456 = phi float [ %2191, %.lr.ph378 ], [ %3432, %__nv_exp2f.exit189 ] + %.pn457 = phi float [ %2192, %.lr.ph378 ], [ %3433, %__nv_exp2f.exit189 ] + %.pn458 = phi float [ %2193, %.lr.ph378 ], [ %3434, %__nv_exp2f.exit189 ] + %.pn459 = phi float [ %2194, %.lr.ph378 ], [ %3435, %__nv_exp2f.exit189 ] + %2199 = phi i32 [ 0, %.lr.ph378 ], [ %3436, %__nv_exp2f.exit189 ] + %.pn464 = phi float [ %1993, %.lr.ph378 ], [ %3025, %__nv_exp2f.exit189 ] + %.pn462 = phi float [ %1994, %.lr.ph378 ], [ %3026, %__nv_exp2f.exit189 ] + %2200 = phi <2 x float> [ %1995, %.lr.ph378 ], [ %2764, %__nv_exp2f.exit189 ] + %2201 = icmp slt i32 %2199, %2129, !dbg !119 + %2202 = icmp slt i32 %2199, %2130, !dbg !119 + %2203 = add i32 %2196, 1, !dbg !119 + %2204 = icmp sgt i32 %2203, 2, !dbg !119 + %2205 = select i1 %2204, i32 0, i32 %2203, !dbg !119 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !125 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %2206 = shl i32 %2205, 13, !dbg !125 + %2207 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2206, !dbg !125 + %2208 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %37, i32 0, i32 31), !dbg !127 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !127 + %2209 = shl i32 %2208, 11, !dbg !127 + %2210 = and i32 %2209, 8192, !dbg !127 + %2211 = add i32 %2210, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !127 + %2212 = lshr exact i32 %2211, 4, !dbg !127 + %2213 = and i32 %2212, 16383, !dbg !127 + %2214 = zext nneg i32 %2213 to i64, !dbg !127 + %2215 = or disjoint i64 %2214, 4611686293372403712, !dbg !127 + %2216 = ptrtoint ptr addrspace(3) %2207 to i32, !dbg !127 + %2217 = lshr exact i32 %2216, 4, !dbg !127 + %2218 = and i32 %2217, 16383, !dbg !127 + %2219 = zext nneg i32 %2218 to i64, !dbg !127 + %2220 = or disjoint i64 %2219, 4611686293338849280, !dbg !127 + %2221 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2215, i64 %2220) #2, !dbg !127 + %2222 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !127 + %2223 = lshr exact i32 %2222, 4, !dbg !127 + %2224 = and i32 %2223, 16383, !dbg !127 + %2225 = zext nneg i32 %2224 to i64, !dbg !127 + %2226 = or disjoint i64 %2225, 4611686293372403712, !dbg !127 + %2227 = add i32 %2216, 32, !dbg !127 + %2228 = lshr exact i32 %2227, 4, !dbg !127 + %2229 = and i32 %2228, 16383, !dbg !127 + %2230 = zext nneg i32 %2229 to i64, !dbg !127 + %2231 = or disjoint i64 %2230, 4611686293338849280, !dbg !127 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 0, !dbg !127 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 1, !dbg !127 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 2, !dbg !127 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 3, !dbg !127 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 4, !dbg !127 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 5, !dbg !127 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 6, !dbg !127 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 7, !dbg !127 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 8, !dbg !127 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 9, !dbg !127 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 10, !dbg !127 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 11, !dbg !127 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 12, !dbg !127 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 13, !dbg !127 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 14, !dbg !127 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 15, !dbg !127 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 16, !dbg !127 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 17, !dbg !127 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 18, !dbg !127 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 19, !dbg !127 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 20, !dbg !127 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 21, !dbg !127 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 22, !dbg !127 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 23, !dbg !127 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 24, !dbg !127 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 25, !dbg !127 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 26, !dbg !127 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 27, !dbg !127 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 28, !dbg !127 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 29, !dbg !127 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 30, !dbg !127 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 31, !dbg !127 + %2264 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2232, float %2233, float %2234, float %2235, float %2236, float %2237, float %2238, float %2239, float %2240, float %2241, float %2242, float %2243, float %2244, float %2245, float %2246, float %2247, float %2248, float %2249, float %2250, float %2251, float %2252, float %2253, float %2254, float %2255, float %2256, float %2257, float %2258, float %2259, float %2260, float %2261, float %2262, float %2263, i64 %2226, i64 %2231, i1 true) #2, !dbg !127 + %2265 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !127 + %2266 = lshr exact i32 %2265, 4, !dbg !127 + %2267 = and i32 %2266, 16383, !dbg !127 + %2268 = zext nneg i32 %2267 to i64, !dbg !127 + %2269 = or disjoint i64 %2268, 4611686293372403712, !dbg !127 + %2270 = add i32 %2216, 64, !dbg !127 + %2271 = lshr exact i32 %2270, 4, !dbg !127 + %2272 = and i32 %2271, 16383, !dbg !127 + %2273 = zext nneg i32 %2272 to i64, !dbg !127 + %2274 = or disjoint i64 %2273, 4611686293338849280, !dbg !127 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 0, !dbg !127 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 1, !dbg !127 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 2, !dbg !127 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 3, !dbg !127 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 4, !dbg !127 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 5, !dbg !127 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 6, !dbg !127 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 7, !dbg !127 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 8, !dbg !127 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 9, !dbg !127 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 10, !dbg !127 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 11, !dbg !127 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 12, !dbg !127 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 13, !dbg !127 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 14, !dbg !127 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 15, !dbg !127 + %2291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 16, !dbg !127 + %2292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 17, !dbg !127 + %2293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 18, !dbg !127 + %2294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 19, !dbg !127 + %2295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 20, !dbg !127 + %2296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 21, !dbg !127 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 22, !dbg !127 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 23, !dbg !127 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 24, !dbg !127 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 25, !dbg !127 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 26, !dbg !127 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 27, !dbg !127 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 28, !dbg !127 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 29, !dbg !127 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 30, !dbg !127 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 31, !dbg !127 + %2307 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2275, float %2276, float %2277, float %2278, float %2279, float %2280, float %2281, float %2282, float %2283, float %2284, float %2285, float %2286, float %2287, float %2288, float %2289, float %2290, float %2291, float %2292, float %2293, float %2294, float %2295, float %2296, float %2297, float %2298, float %2299, float %2300, float %2301, float %2302, float %2303, float %2304, float %2305, float %2306, i64 %2269, i64 %2274, i1 true) #2, !dbg !127 + %2308 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !127 + %2309 = lshr exact i32 %2308, 4, !dbg !127 + %2310 = and i32 %2309, 16383, !dbg !127 + %2311 = zext nneg i32 %2310 to i64, !dbg !127 + %2312 = or disjoint i64 %2311, 4611686293372403712, !dbg !127 + %2313 = add i32 %2216, 96, !dbg !127 + %2314 = lshr exact i32 %2313, 4, !dbg !127 + %2315 = and i32 %2314, 16383, !dbg !127 + %2316 = zext nneg i32 %2315 to i64, !dbg !127 + %2317 = or disjoint i64 %2316, 4611686293338849280, !dbg !127 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 0, !dbg !127 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 1, !dbg !127 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 2, !dbg !127 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 3, !dbg !127 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 4, !dbg !127 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 5, !dbg !127 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 6, !dbg !127 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 7, !dbg !127 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 8, !dbg !127 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 9, !dbg !127 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 10, !dbg !127 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 11, !dbg !127 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 12, !dbg !127 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 13, !dbg !127 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 14, !dbg !127 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 15, !dbg !127 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 16, !dbg !127 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 17, !dbg !127 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 18, !dbg !127 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 19, !dbg !127 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 20, !dbg !127 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 21, !dbg !127 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 22, !dbg !127 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 23, !dbg !127 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 24, !dbg !127 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 25, !dbg !127 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 26, !dbg !127 + %2345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 27, !dbg !127 + %2346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 28, !dbg !127 + %2347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 29, !dbg !127 + %2348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 30, !dbg !127 + %2349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 31, !dbg !127 + %2350 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, i64 %2312, i64 %2317, i1 true) #2, !dbg !127 + %2351 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !127 + %2352 = lshr exact i32 %2351, 4, !dbg !127 + %2353 = and i32 %2352, 16383, !dbg !127 + %2354 = zext nneg i32 %2353 to i64, !dbg !127 + %2355 = or disjoint i64 %2354, 4611686293372403712, !dbg !127 + %2356 = add i32 %2216, 8192, !dbg !127 + %2357 = lshr exact i32 %2356, 4, !dbg !127 + %2358 = and i32 %2357, 16383, !dbg !127 + %2359 = zext nneg i32 %2358 to i64, !dbg !127 + %2360 = or disjoint i64 %2359, 4611686293338849280, !dbg !127 + %2361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 0, !dbg !127 + %2362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 1, !dbg !127 + %2363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 2, !dbg !127 + %2364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 3, !dbg !127 + %2365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 4, !dbg !127 + %2366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 5, !dbg !127 + %2367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 6, !dbg !127 + %2368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 7, !dbg !127 + %2369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 8, !dbg !127 + %2370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 9, !dbg !127 + %2371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 10, !dbg !127 + %2372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 11, !dbg !127 + %2373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 12, !dbg !127 + %2374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 13, !dbg !127 + %2375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 14, !dbg !127 + %2376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 15, !dbg !127 + %2377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 16, !dbg !127 + %2378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 17, !dbg !127 + %2379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 18, !dbg !127 + %2380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 19, !dbg !127 + %2381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 20, !dbg !127 + %2382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 21, !dbg !127 + %2383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 22, !dbg !127 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 23, !dbg !127 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 24, !dbg !127 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 25, !dbg !127 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 26, !dbg !127 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 27, !dbg !127 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 28, !dbg !127 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 29, !dbg !127 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 30, !dbg !127 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 31, !dbg !127 + %2393 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2361, float %2362, float %2363, float %2364, float %2365, float %2366, float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378, float %2379, float %2380, float %2381, float %2382, float %2383, float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, i64 %2355, i64 %2360, i1 true) #2, !dbg !127 + %2394 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !127 + %2395 = lshr exact i32 %2394, 4, !dbg !127 + %2396 = and i32 %2395, 16383, !dbg !127 + %2397 = zext nneg i32 %2396 to i64, !dbg !127 + %2398 = or disjoint i64 %2397, 4611686293372403712, !dbg !127 + %2399 = add i32 %2216, 8224, !dbg !127 + %2400 = lshr exact i32 %2399, 4, !dbg !127 + %2401 = and i32 %2400, 16383, !dbg !127 + %2402 = zext nneg i32 %2401 to i64, !dbg !127 + %2403 = or disjoint i64 %2402, 4611686293338849280, !dbg !127 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 0, !dbg !127 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 1, !dbg !127 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 2, !dbg !127 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 3, !dbg !127 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 4, !dbg !127 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 5, !dbg !127 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 6, !dbg !127 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 7, !dbg !127 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 8, !dbg !127 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 9, !dbg !127 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 10, !dbg !127 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 11, !dbg !127 + %2416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 12, !dbg !127 + %2417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 13, !dbg !127 + %2418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 14, !dbg !127 + %2419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 15, !dbg !127 + %2420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 16, !dbg !127 + %2421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 17, !dbg !127 + %2422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 18, !dbg !127 + %2423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 19, !dbg !127 + %2424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 20, !dbg !127 + %2425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 21, !dbg !127 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 22, !dbg !127 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 23, !dbg !127 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 24, !dbg !127 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 25, !dbg !127 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 26, !dbg !127 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 27, !dbg !127 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 28, !dbg !127 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 29, !dbg !127 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 30, !dbg !127 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 31, !dbg !127 + %2436 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418, float %2419, float %2420, float %2421, float %2422, float %2423, float %2424, float %2425, float %2426, float %2427, float %2428, float %2429, float %2430, float %2431, float %2432, float %2433, float %2434, float %2435, i64 %2398, i64 %2403, i1 true) #2, !dbg !127 + %2437 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !127 + %2438 = lshr exact i32 %2437, 4, !dbg !127 + %2439 = and i32 %2438, 16383, !dbg !127 + %2440 = zext nneg i32 %2439 to i64, !dbg !127 + %2441 = or disjoint i64 %2440, 4611686293372403712, !dbg !127 + %2442 = add i32 %2216, 8256, !dbg !127 + %2443 = lshr exact i32 %2442, 4, !dbg !127 + %2444 = and i32 %2443, 16383, !dbg !127 + %2445 = zext nneg i32 %2444 to i64, !dbg !127 + %2446 = or disjoint i64 %2445, 4611686293338849280, !dbg !127 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 0, !dbg !127 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 1, !dbg !127 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 2, !dbg !127 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 3, !dbg !127 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 4, !dbg !127 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 5, !dbg !127 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 6, !dbg !127 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 7, !dbg !127 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 8, !dbg !127 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 9, !dbg !127 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 10, !dbg !127 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 11, !dbg !127 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 12, !dbg !127 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 13, !dbg !127 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 14, !dbg !127 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 15, !dbg !127 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 16, !dbg !127 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 17, !dbg !127 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 18, !dbg !127 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 19, !dbg !127 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 20, !dbg !127 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 21, !dbg !127 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 22, !dbg !127 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 23, !dbg !127 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 24, !dbg !127 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 25, !dbg !127 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 26, !dbg !127 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 27, !dbg !127 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 28, !dbg !127 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 29, !dbg !127 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 30, !dbg !127 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 31, !dbg !127 + %2479 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2447, float %2448, float %2449, float %2450, float %2451, float %2452, float %2453, float %2454, float %2455, float %2456, float %2457, float %2458, float %2459, float %2460, float %2461, float %2462, float %2463, float %2464, float %2465, float %2466, float %2467, float %2468, float %2469, float %2470, float %2471, float %2472, float %2473, float %2474, float %2475, float %2476, float %2477, float %2478, i64 %2441, i64 %2446, i1 true) #2, !dbg !127 + %2480 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !127 + %2481 = lshr exact i32 %2480, 4, !dbg !127 + %2482 = and i32 %2481, 16383, !dbg !127 + %2483 = zext nneg i32 %2482 to i64, !dbg !127 + %2484 = or disjoint i64 %2483, 4611686293372403712, !dbg !127 + %2485 = add i32 %2216, 8288, !dbg !127 + %2486 = lshr exact i32 %2485, 4, !dbg !127 + %2487 = and i32 %2486, 16383, !dbg !127 + %2488 = zext nneg i32 %2487 to i64, !dbg !127 + %2489 = or disjoint i64 %2488, 4611686293338849280, !dbg !127 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 0, !dbg !127 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 1, !dbg !127 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 2, !dbg !127 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 3, !dbg !127 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 4, !dbg !127 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 5, !dbg !127 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 6, !dbg !127 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 7, !dbg !127 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 8, !dbg !127 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 9, !dbg !127 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 10, !dbg !127 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 11, !dbg !127 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 12, !dbg !127 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 13, !dbg !127 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 14, !dbg !127 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 15, !dbg !127 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 16, !dbg !127 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 17, !dbg !127 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 18, !dbg !127 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 19, !dbg !127 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 20, !dbg !127 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 21, !dbg !127 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 22, !dbg !127 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 23, !dbg !127 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 24, !dbg !127 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 25, !dbg !127 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 26, !dbg !127 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 27, !dbg !127 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 28, !dbg !127 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 29, !dbg !127 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 30, !dbg !127 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 31, !dbg !127 + %2522 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2490, float %2491, float %2492, float %2493, float %2494, float %2495, float %2496, float %2497, float %2498, float %2499, float %2500, float %2501, float %2502, float %2503, float %2504, float %2505, float %2506, float %2507, float %2508, float %2509, float %2510, float %2511, float %2512, float %2513, float %2514, float %2515, float %2516, float %2517, float %2518, float %2519, float %2520, float %2521, i64 %2484, i64 %2489, i1 true) #2, !dbg !127 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 0, !dbg !127 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 1, !dbg !127 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 2, !dbg !127 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 3, !dbg !127 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 4, !dbg !127 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 5, !dbg !127 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 6, !dbg !127 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 7, !dbg !127 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 8, !dbg !127 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 9, !dbg !127 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 10, !dbg !127 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 11, !dbg !127 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 12, !dbg !127 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 13, !dbg !127 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 14, !dbg !127 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 15, !dbg !127 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 16, !dbg !127 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 17, !dbg !127 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 18, !dbg !127 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 19, !dbg !127 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 20, !dbg !127 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 21, !dbg !127 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 22, !dbg !127 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 23, !dbg !127 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 24, !dbg !127 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 25, !dbg !127 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 26, !dbg !127 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 27, !dbg !127 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 28, !dbg !127 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 29, !dbg !127 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 30, !dbg !127 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 31, !dbg !127 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !127 + %2555 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %2523, float %2524, float %2525, float %2526, float %2527, float %2528, float %2529, float %2530, float %2531, float %2532, float %2533, float %2534, float %2535, float %2536, float %2537, float %2538, float %2539, float %2540, float %2541, float %2542, float %2543, float %2544, float %2545, float %2546, float %2547, float %2548, float %2549, float %2550, float %2551, float %2552, float %2553, float %2554, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2207, i32 0, i32 0, float %.pn, float %.pn397, float %.pn398, float %.pn399, float %.pn400, float %.pn401, float %.pn402, float %.pn403, float %.pn404, float %.pn405, float %.pn406, float %.pn407, float %.pn408, float %.pn409, float %.pn410, float %.pn411, float %.pn412, float %.pn413, float %.pn414, float %.pn415, float %.pn416, float %.pn417, float %.pn418, float %.pn419, float %.pn420, float %.pn421, float %.pn422, float %.pn423, float %.pn424, float %.pn425, float %.pn426, float %.pn427, float %.pn428, float %.pn429, float %.pn430, float %.pn431, float %.pn432, float %.pn433, float %.pn434, float %.pn435, float %.pn436, float %.pn437, float %.pn438, float %.pn439, float %.pn440, float %.pn441, float %.pn442, float %.pn443, float %.pn444, float %.pn445, float %.pn446, float %.pn447, float %.pn448, float %.pn449, float %.pn450, float %.pn451, float %.pn452, float %.pn453, float %.pn454, float %.pn455, float %.pn456, float %.pn457, float %.pn458, float %.pn459) #2, !dbg !127 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 0, !dbg !127 + %2557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 1, !dbg !127 + %2558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 2, !dbg !127 + %2559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 3, !dbg !127 + %2560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 4, !dbg !127 + %2561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 5, !dbg !127 + %2562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 6, !dbg !127 + %2563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 7, !dbg !127 + %2564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 8, !dbg !127 + %2565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 9, !dbg !127 + %2566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 10, !dbg !127 + %2567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 11, !dbg !127 + %2568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 12, !dbg !127 + %2569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 13, !dbg !127 + %2570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 14, !dbg !127 + %2571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 15, !dbg !127 + %2572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 16, !dbg !127 + %2573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 17, !dbg !127 + %2574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 18, !dbg !127 + %2575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 19, !dbg !127 + %2576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 20, !dbg !127 + %2577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 21, !dbg !127 + %2578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 22, !dbg !127 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 23, !dbg !127 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 24, !dbg !127 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 25, !dbg !127 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 26, !dbg !127 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 27, !dbg !127 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 28, !dbg !127 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 29, !dbg !127 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 30, !dbg !127 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 31, !dbg !127 + %2588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 38, !dbg !127 + %2589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 39, !dbg !127 + %2590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 40, !dbg !127 + %2591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 41, !dbg !127 + %2592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 42, !dbg !127 + %2593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 43, !dbg !127 + %2594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 44, !dbg !127 + %2595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 45, !dbg !127 + %2596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 46, !dbg !127 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 47, !dbg !127 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 48, !dbg !127 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 49, !dbg !127 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 50, !dbg !127 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 51, !dbg !127 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 52, !dbg !127 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 53, !dbg !127 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 54, !dbg !127 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 55, !dbg !127 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 56, !dbg !127 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 57, !dbg !127 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 58, !dbg !127 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 59, !dbg !127 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 60, !dbg !127 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 61, !dbg !127 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 62, !dbg !127 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 63, !dbg !127 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 64, !dbg !127 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 65, !dbg !127 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 66, !dbg !127 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 67, !dbg !127 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 68, !dbg !127 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 69, !dbg !127 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 70, !dbg !127 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 71, !dbg !127 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 72, !dbg !127 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 73, !dbg !127 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 74, !dbg !127 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 75, !dbg !127 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 76, !dbg !127 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 77, !dbg !127 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 78, !dbg !127 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 79, !dbg !127 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 80, !dbg !127 + %2631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 81, !dbg !127 + %2632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 82, !dbg !127 + %2633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 83, !dbg !127 + %2634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 84, !dbg !127 + %2635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 85, !dbg !127 + %2636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 86, !dbg !127 + %2637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 87, !dbg !127 + %2638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 88, !dbg !127 + %2639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 89, !dbg !127 + %2640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 90, !dbg !127 + %2641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 91, !dbg !127 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 92, !dbg !127 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 93, !dbg !127 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 94, !dbg !127 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 95, !dbg !127 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 96, !dbg !127 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 97, !dbg !127 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 98, !dbg !127 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 99, !dbg !127 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 100, !dbg !127 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 101, !dbg !127 + %2652 = fmul float %2556, 0x3FB6A09E60000000, !dbg !129 + %2653 = fmul float %2557, 0x3FB6A09E60000000, !dbg !129 + %2654 = fmul float %2558, 0x3FB6A09E60000000, !dbg !129 + %2655 = fmul float %2559, 0x3FB6A09E60000000, !dbg !129 + %2656 = fmul float %2560, 0x3FB6A09E60000000, !dbg !129 + %2657 = fmul float %2561, 0x3FB6A09E60000000, !dbg !129 + %2658 = fmul float %2562, 0x3FB6A09E60000000, !dbg !129 + %2659 = fmul float %2563, 0x3FB6A09E60000000, !dbg !129 + %2660 = fmul float %2564, 0x3FB6A09E60000000, !dbg !129 + %2661 = fmul float %2565, 0x3FB6A09E60000000, !dbg !129 + %2662 = fmul float %2566, 0x3FB6A09E60000000, !dbg !129 + %2663 = fmul float %2567, 0x3FB6A09E60000000, !dbg !129 + %2664 = fmul float %2568, 0x3FB6A09E60000000, !dbg !129 + %2665 = fmul float %2569, 0x3FB6A09E60000000, !dbg !129 + %2666 = fmul float %2570, 0x3FB6A09E60000000, !dbg !129 + %2667 = fmul float %2571, 0x3FB6A09E60000000, !dbg !129 + %2668 = fmul float %2572, 0x3FB6A09E60000000, !dbg !129 + %2669 = fmul float %2573, 0x3FB6A09E60000000, !dbg !129 + %2670 = fmul float %2574, 0x3FB6A09E60000000, !dbg !129 + %2671 = fmul float %2575, 0x3FB6A09E60000000, !dbg !129 + %2672 = fmul float %2576, 0x3FB6A09E60000000, !dbg !129 + %2673 = fmul float %2577, 0x3FB6A09E60000000, !dbg !129 + %2674 = fmul float %2578, 0x3FB6A09E60000000, !dbg !129 + %2675 = fmul float %2579, 0x3FB6A09E60000000, !dbg !129 + %2676 = fmul float %2580, 0x3FB6A09E60000000, !dbg !129 + %2677 = fmul float %2581, 0x3FB6A09E60000000, !dbg !129 + %2678 = fmul float %2582, 0x3FB6A09E60000000, !dbg !129 + %2679 = fmul float %2583, 0x3FB6A09E60000000, !dbg !129 + %2680 = fmul float %2584, 0x3FB6A09E60000000, !dbg !129 + %2681 = fmul float %2585, 0x3FB6A09E60000000, !dbg !129 + %2682 = fmul float %2586, 0x3FB6A09E60000000, !dbg !129 + %2683 = fmul float %2587, 0x3FB6A09E60000000, !dbg !129 + %2684 = fmul float %2652, 0x3FF7154760000000, !dbg !130 + %2685 = fmul float %2653, 0x3FF7154760000000, !dbg !130 + %2686 = fmul float %2654, 0x3FF7154760000000, !dbg !130 + %2687 = fmul float %2655, 0x3FF7154760000000, !dbg !130 + %2688 = fmul float %2656, 0x3FF7154760000000, !dbg !130 + %2689 = fmul float %2657, 0x3FF7154760000000, !dbg !130 + %2690 = fmul float %2658, 0x3FF7154760000000, !dbg !130 + %2691 = fmul float %2659, 0x3FF7154760000000, !dbg !130 + %2692 = fmul float %2660, 0x3FF7154760000000, !dbg !130 + %2693 = fmul float %2661, 0x3FF7154760000000, !dbg !130 + %2694 = fmul float %2662, 0x3FF7154760000000, !dbg !130 + %2695 = fmul float %2663, 0x3FF7154760000000, !dbg !130 + %2696 = fmul float %2664, 0x3FF7154760000000, !dbg !130 + %2697 = fmul float %2665, 0x3FF7154760000000, !dbg !130 + %2698 = fmul float %2666, 0x3FF7154760000000, !dbg !130 + %2699 = fmul float %2667, 0x3FF7154760000000, !dbg !130 + %2700 = fmul float %2668, 0x3FF7154760000000, !dbg !130 + %2701 = fmul float %2669, 0x3FF7154760000000, !dbg !130 + %2702 = fmul float %2670, 0x3FF7154760000000, !dbg !130 + %2703 = fmul float %2671, 0x3FF7154760000000, !dbg !130 + %2704 = fmul float %2672, 0x3FF7154760000000, !dbg !130 + %2705 = fmul float %2673, 0x3FF7154760000000, !dbg !130 + %2706 = fmul float %2674, 0x3FF7154760000000, !dbg !130 + %2707 = fmul float %2675, 0x3FF7154760000000, !dbg !130 + %2708 = fmul float %2676, 0x3FF7154760000000, !dbg !130 + %2709 = fmul float %2677, 0x3FF7154760000000, !dbg !130 + %2710 = fmul float %2678, 0x3FF7154760000000, !dbg !130 + %2711 = fmul float %2679, 0x3FF7154760000000, !dbg !130 + %2712 = fmul float %2680, 0x3FF7154760000000, !dbg !130 + %2713 = fmul float %2681, 0x3FF7154760000000, !dbg !130 + %2714 = fmul float %2682, 0x3FF7154760000000, !dbg !130 + %2715 = fmul float %2683, 0x3FF7154760000000, !dbg !130 + %2716 = tail call float @llvm.maxnum.f32(float %2684, float %2685), !dbg !131 + %2717 = tail call float @llvm.maxnum.f32(float %2686, float %2687), !dbg !131 + %2718 = tail call float @llvm.maxnum.f32(float %2716, float %2688), !dbg !131 + %2719 = tail call float @llvm.maxnum.f32(float %2718, float %2689), !dbg !131 + %2720 = tail call float @llvm.maxnum.f32(float %2717, float %2690), !dbg !131 + %2721 = tail call float @llvm.maxnum.f32(float %2720, float %2691), !dbg !131 + %2722 = tail call float @llvm.maxnum.f32(float %2719, float %2692), !dbg !131 + %2723 = tail call float @llvm.maxnum.f32(float %2722, float %2693), !dbg !131 + %2724 = tail call float @llvm.maxnum.f32(float %2721, float %2694), !dbg !131 + %2725 = tail call float @llvm.maxnum.f32(float %2724, float %2695), !dbg !131 + %2726 = tail call float @llvm.maxnum.f32(float %2723, float %2696), !dbg !131 + %2727 = tail call float @llvm.maxnum.f32(float %2726, float %2697), !dbg !131 + %2728 = tail call float @llvm.maxnum.f32(float %2725, float %2698), !dbg !131 + %2729 = tail call float @llvm.maxnum.f32(float %2728, float %2699), !dbg !131 + %2730 = tail call float @llvm.maxnum.f32(float %2727, float %2700), !dbg !131 + %2731 = tail call float @llvm.maxnum.f32(float %2730, float %2701), !dbg !131 + %2732 = tail call float @llvm.maxnum.f32(float %2729, float %2702), !dbg !131 + %2733 = tail call float @llvm.maxnum.f32(float %2732, float %2703), !dbg !131 + %2734 = tail call float @llvm.maxnum.f32(float %2731, float %2704), !dbg !131 + %2735 = tail call float @llvm.maxnum.f32(float %2734, float %2705), !dbg !131 + %2736 = tail call float @llvm.maxnum.f32(float %2733, float %2706), !dbg !131 + %2737 = tail call float @llvm.maxnum.f32(float %2736, float %2707), !dbg !131 + %2738 = tail call float @llvm.maxnum.f32(float %2735, float %2708), !dbg !131 + %2739 = tail call float @llvm.maxnum.f32(float %2738, float %2709), !dbg !131 + %2740 = tail call float @llvm.maxnum.f32(float %2737, float %2710), !dbg !131 + %2741 = tail call float @llvm.maxnum.f32(float %2740, float %2711), !dbg !131 + %2742 = tail call float @llvm.maxnum.f32(float %2739, float %2712), !dbg !131 + %2743 = tail call float @llvm.maxnum.f32(float %2742, float %2713), !dbg !131 + %2744 = tail call float @llvm.maxnum.f32(float %2741, float %2714), !dbg !131 + %2745 = tail call float @llvm.maxnum.f32(float %2744, float %2715), !dbg !131 + %2746 = bitcast float %2743 to i32, !dbg !132 + %2747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2746, i32 2, i32 31), !dbg !132 + %2748 = bitcast i32 %2747 to float, !dbg !132 + %2749 = bitcast float %2745 to i32, !dbg !132 + %2750 = tail call float @llvm.maxnum.f32(float %2743, float %2748), !dbg !131 + %2751 = bitcast float %2750 to i32, !dbg !132 + %2752 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2751, i32 1, i32 31), !dbg !132 + %2753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2749, i32 2, i32 31), !dbg !132 + %2754 = bitcast i32 %2753 to float, !dbg !132 + %2755 = tail call float @llvm.maxnum.f32(float %2745, float %2754), !dbg !131 + %2756 = bitcast float %2755 to i32, !dbg !132 + %2757 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2756, i32 1, i32 31), !dbg !132 + %2758 = insertelement <2 x i32> poison, i32 %2752, i64 0, !dbg !132 + %2759 = insertelement <2 x i32> %2758, i32 %2757, i64 1, !dbg !132 + %2760 = bitcast <2 x i32> %2759 to <2 x float>, !dbg !132 + %2761 = insertelement <2 x float> poison, float %2750, i64 0, !dbg !131 + %2762 = insertelement <2 x float> %2761, float %2755, i64 1, !dbg !131 + %2763 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2762, <2 x float> %2760), !dbg !131 + %2764 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2200, <2 x float> %2763), !dbg !133 + %2765 = extractelement <2 x float> %2764, i64 0, !dbg !134 + %2766 = fcmp oeq float %2765, 0xFFF0000000000000, !dbg !135 + %2767 = extractelement <2 x float> %2764, i64 1, !dbg !134 + %2768 = fcmp oeq float %2767, 0xFFF0000000000000, !dbg !135 + %2769 = select i1 %2766, float 0.000000e+00, float %2765, !dbg !134 + %2770 = select i1 %2768, float 0.000000e+00, float %2767, !dbg !134 + %2771 = extractelement <2 x float> %2200, i64 0, !dbg !136 + %2772 = fsub float %2771, %2769, !dbg !136 + %2773 = extractelement <2 x float> %2200, i64 1, !dbg !136 + %2774 = fsub float %2773, %2770, !dbg !136 + %2775 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !137 + %.not.i = icmp eq i32 %2775, 0, !dbg !137 + br i1 %.not.i, label %2778, label %2776, !dbg !137 + +2776: ; preds = %2195 + %2777 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2772) #2, !dbg !137 + br label %__nv_exp2f.exit, !dbg !137 + +2778: ; preds = %2195 + %2779 = tail call float @llvm.nvvm.ex2.approx.f(float %2772) #2, !dbg !137 + br label %__nv_exp2f.exit, !dbg !137 + +__nv_exp2f.exit: ; preds = %2776, %2778 + %.0.i = phi float [ %2777, %2776 ], [ %2779, %2778 ], !dbg !137 + %2780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !137 + %.not.i91 = icmp eq i32 %2780, 0, !dbg !137 + br i1 %.not.i91, label %2783, label %2781, !dbg !137 + +2781: ; preds = %__nv_exp2f.exit + %2782 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2774) #2, !dbg !137 + br label %__nv_exp2f.exit93, !dbg !137 + +2783: ; preds = %__nv_exp2f.exit + %2784 = tail call float @llvm.nvvm.ex2.approx.f(float %2774) #2, !dbg !137 + br label %__nv_exp2f.exit93, !dbg !137 + +__nv_exp2f.exit93: ; preds = %2781, %2783 + %.0.i92 = phi float [ %2782, %2781 ], [ %2784, %2783 ], !dbg !137 + %2785 = fsub float %2684, %2769, !dbg !138 + %2786 = fsub float %2685, %2769, !dbg !138 + %2787 = fsub float %2686, %2770, !dbg !138 + %2788 = fsub float %2687, %2770, !dbg !138 + %2789 = fsub float %2688, %2769, !dbg !138 + %2790 = fsub float %2689, %2769, !dbg !138 + %2791 = fsub float %2690, %2770, !dbg !138 + %2792 = fsub float %2691, %2770, !dbg !138 + %2793 = fsub float %2692, %2769, !dbg !138 + %2794 = fsub float %2693, %2769, !dbg !138 + %2795 = fsub float %2694, %2770, !dbg !138 + %2796 = fsub float %2695, %2770, !dbg !138 + %2797 = fsub float %2696, %2769, !dbg !138 + %2798 = fsub float %2697, %2769, !dbg !138 + %2799 = fsub float %2698, %2770, !dbg !138 + %2800 = fsub float %2699, %2770, !dbg !138 + %2801 = fsub float %2700, %2769, !dbg !138 + %2802 = fsub float %2701, %2769, !dbg !138 + %2803 = fsub float %2702, %2770, !dbg !138 + %2804 = fsub float %2703, %2770, !dbg !138 + %2805 = fsub float %2704, %2769, !dbg !138 + %2806 = fsub float %2705, %2769, !dbg !138 + %2807 = fsub float %2706, %2770, !dbg !138 + %2808 = fsub float %2707, %2770, !dbg !138 + %2809 = fsub float %2708, %2769, !dbg !138 + %2810 = fsub float %2709, %2769, !dbg !138 + %2811 = fsub float %2710, %2770, !dbg !138 + %2812 = fsub float %2711, %2770, !dbg !138 + %2813 = fsub float %2712, %2769, !dbg !138 + %2814 = fsub float %2713, %2769, !dbg !138 + %2815 = fsub float %2714, %2770, !dbg !138 + %2816 = fsub float %2715, %2770, !dbg !138 + %2817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i94 = icmp eq i32 %2817, 0, !dbg !139 + br i1 %.not.i94, label %2820, label %2818, !dbg !139 + +2818: ; preds = %__nv_exp2f.exit93 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2785) #2, !dbg !139 + br label %__nv_exp2f.exit96, !dbg !139 + +2820: ; preds = %__nv_exp2f.exit93 + %2821 = tail call float @llvm.nvvm.ex2.approx.f(float %2785) #2, !dbg !139 + br label %__nv_exp2f.exit96, !dbg !139 + +__nv_exp2f.exit96: ; preds = %2818, %2820 + %.0.i95 = phi float [ %2819, %2818 ], [ %2821, %2820 ], !dbg !139 + %2822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i97 = icmp eq i32 %2822, 0, !dbg !139 + br i1 %.not.i97, label %2825, label %2823, !dbg !139 + +2823: ; preds = %__nv_exp2f.exit96 + %2824 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2786) #2, !dbg !139 + br label %__nv_exp2f.exit99, !dbg !139 + +2825: ; preds = %__nv_exp2f.exit96 + %2826 = tail call float @llvm.nvvm.ex2.approx.f(float %2786) #2, !dbg !139 + br label %__nv_exp2f.exit99, !dbg !139 + +__nv_exp2f.exit99: ; preds = %2823, %2825 + %.0.i98 = phi float [ %2824, %2823 ], [ %2826, %2825 ], !dbg !139 + %2827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i100 = icmp eq i32 %2827, 0, !dbg !139 + br i1 %.not.i100, label %2830, label %2828, !dbg !139 + +2828: ; preds = %__nv_exp2f.exit99 + %2829 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2787) #2, !dbg !139 + br label %__nv_exp2f.exit102, !dbg !139 + +2830: ; preds = %__nv_exp2f.exit99 + %2831 = tail call float @llvm.nvvm.ex2.approx.f(float %2787) #2, !dbg !139 + br label %__nv_exp2f.exit102, !dbg !139 + +__nv_exp2f.exit102: ; preds = %2828, %2830 + %.0.i101 = phi float [ %2829, %2828 ], [ %2831, %2830 ], !dbg !139 + %2832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i103 = icmp eq i32 %2832, 0, !dbg !139 + br i1 %.not.i103, label %2835, label %2833, !dbg !139 + +2833: ; preds = %__nv_exp2f.exit102 + %2834 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2788) #2, !dbg !139 + br label %__nv_exp2f.exit105, !dbg !139 + +2835: ; preds = %__nv_exp2f.exit102 + %2836 = tail call float @llvm.nvvm.ex2.approx.f(float %2788) #2, !dbg !139 + br label %__nv_exp2f.exit105, !dbg !139 + +__nv_exp2f.exit105: ; preds = %2833, %2835 + %.0.i104 = phi float [ %2834, %2833 ], [ %2836, %2835 ], !dbg !139 + %2837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i106 = icmp eq i32 %2837, 0, !dbg !139 + br i1 %.not.i106, label %2840, label %2838, !dbg !139 + +2838: ; preds = %__nv_exp2f.exit105 + %2839 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2789) #2, !dbg !139 + br label %__nv_exp2f.exit108, !dbg !139 + +2840: ; preds = %__nv_exp2f.exit105 + %2841 = tail call float @llvm.nvvm.ex2.approx.f(float %2789) #2, !dbg !139 + br label %__nv_exp2f.exit108, !dbg !139 + +__nv_exp2f.exit108: ; preds = %2838, %2840 + %.0.i107 = phi float [ %2839, %2838 ], [ %2841, %2840 ], !dbg !139 + %2842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i109 = icmp eq i32 %2842, 0, !dbg !139 + br i1 %.not.i109, label %2845, label %2843, !dbg !139 + +2843: ; preds = %__nv_exp2f.exit108 + %2844 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2790) #2, !dbg !139 + br label %__nv_exp2f.exit111, !dbg !139 + +2845: ; preds = %__nv_exp2f.exit108 + %2846 = tail call float @llvm.nvvm.ex2.approx.f(float %2790) #2, !dbg !139 + br label %__nv_exp2f.exit111, !dbg !139 + +__nv_exp2f.exit111: ; preds = %2843, %2845 + %.0.i110 = phi float [ %2844, %2843 ], [ %2846, %2845 ], !dbg !139 + %2847 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i112 = icmp eq i32 %2847, 0, !dbg !139 + br i1 %.not.i112, label %2850, label %2848, !dbg !139 + +2848: ; preds = %__nv_exp2f.exit111 + %2849 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2791) #2, !dbg !139 + br label %__nv_exp2f.exit114, !dbg !139 + +2850: ; preds = %__nv_exp2f.exit111 + %2851 = tail call float @llvm.nvvm.ex2.approx.f(float %2791) #2, !dbg !139 + br label %__nv_exp2f.exit114, !dbg !139 + +__nv_exp2f.exit114: ; preds = %2848, %2850 + %.0.i113 = phi float [ %2849, %2848 ], [ %2851, %2850 ], !dbg !139 + %2852 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i115 = icmp eq i32 %2852, 0, !dbg !139 + br i1 %.not.i115, label %2855, label %2853, !dbg !139 + +2853: ; preds = %__nv_exp2f.exit114 + %2854 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2792) #2, !dbg !139 + br label %__nv_exp2f.exit117, !dbg !139 + +2855: ; preds = %__nv_exp2f.exit114 + %2856 = tail call float @llvm.nvvm.ex2.approx.f(float %2792) #2, !dbg !139 + br label %__nv_exp2f.exit117, !dbg !139 + +__nv_exp2f.exit117: ; preds = %2853, %2855 + %.0.i116 = phi float [ %2854, %2853 ], [ %2856, %2855 ], !dbg !139 + %2857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i118 = icmp eq i32 %2857, 0, !dbg !139 + br i1 %.not.i118, label %2860, label %2858, !dbg !139 + +2858: ; preds = %__nv_exp2f.exit117 + %2859 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2793) #2, !dbg !139 + br label %__nv_exp2f.exit120, !dbg !139 + +2860: ; preds = %__nv_exp2f.exit117 + %2861 = tail call float @llvm.nvvm.ex2.approx.f(float %2793) #2, !dbg !139 + br label %__nv_exp2f.exit120, !dbg !139 + +__nv_exp2f.exit120: ; preds = %2858, %2860 + %.0.i119 = phi float [ %2859, %2858 ], [ %2861, %2860 ], !dbg !139 + %2862 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i121 = icmp eq i32 %2862, 0, !dbg !139 + br i1 %.not.i121, label %2865, label %2863, !dbg !139 + +2863: ; preds = %__nv_exp2f.exit120 + %2864 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2794) #2, !dbg !139 + br label %__nv_exp2f.exit123, !dbg !139 + +2865: ; preds = %__nv_exp2f.exit120 + %2866 = tail call float @llvm.nvvm.ex2.approx.f(float %2794) #2, !dbg !139 + br label %__nv_exp2f.exit123, !dbg !139 + +__nv_exp2f.exit123: ; preds = %2863, %2865 + %.0.i122 = phi float [ %2864, %2863 ], [ %2866, %2865 ], !dbg !139 + %2867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i124 = icmp eq i32 %2867, 0, !dbg !139 + br i1 %.not.i124, label %2870, label %2868, !dbg !139 + +2868: ; preds = %__nv_exp2f.exit123 + %2869 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2795) #2, !dbg !139 + br label %__nv_exp2f.exit126, !dbg !139 + +2870: ; preds = %__nv_exp2f.exit123 + %2871 = tail call float @llvm.nvvm.ex2.approx.f(float %2795) #2, !dbg !139 + br label %__nv_exp2f.exit126, !dbg !139 + +__nv_exp2f.exit126: ; preds = %2868, %2870 + %.0.i125 = phi float [ %2869, %2868 ], [ %2871, %2870 ], !dbg !139 + %2872 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i127 = icmp eq i32 %2872, 0, !dbg !139 + br i1 %.not.i127, label %2875, label %2873, !dbg !139 + +2873: ; preds = %__nv_exp2f.exit126 + %2874 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2796) #2, !dbg !139 + br label %__nv_exp2f.exit129, !dbg !139 + +2875: ; preds = %__nv_exp2f.exit126 + %2876 = tail call float @llvm.nvvm.ex2.approx.f(float %2796) #2, !dbg !139 + br label %__nv_exp2f.exit129, !dbg !139 + +__nv_exp2f.exit129: ; preds = %2873, %2875 + %.0.i128 = phi float [ %2874, %2873 ], [ %2876, %2875 ], !dbg !139 + %2877 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i130 = icmp eq i32 %2877, 0, !dbg !139 + br i1 %.not.i130, label %2880, label %2878, !dbg !139 + +2878: ; preds = %__nv_exp2f.exit129 + %2879 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2797) #2, !dbg !139 + br label %__nv_exp2f.exit132, !dbg !139 + +2880: ; preds = %__nv_exp2f.exit129 + %2881 = tail call float @llvm.nvvm.ex2.approx.f(float %2797) #2, !dbg !139 + br label %__nv_exp2f.exit132, !dbg !139 + +__nv_exp2f.exit132: ; preds = %2878, %2880 + %.0.i131 = phi float [ %2879, %2878 ], [ %2881, %2880 ], !dbg !139 + %2882 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i133 = icmp eq i32 %2882, 0, !dbg !139 + br i1 %.not.i133, label %2885, label %2883, !dbg !139 + +2883: ; preds = %__nv_exp2f.exit132 + %2884 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2798) #2, !dbg !139 + br label %__nv_exp2f.exit135, !dbg !139 + +2885: ; preds = %__nv_exp2f.exit132 + %2886 = tail call float @llvm.nvvm.ex2.approx.f(float %2798) #2, !dbg !139 + br label %__nv_exp2f.exit135, !dbg !139 + +__nv_exp2f.exit135: ; preds = %2883, %2885 + %.0.i134 = phi float [ %2884, %2883 ], [ %2886, %2885 ], !dbg !139 + %2887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i136 = icmp eq i32 %2887, 0, !dbg !139 + br i1 %.not.i136, label %2890, label %2888, !dbg !139 + +2888: ; preds = %__nv_exp2f.exit135 + %2889 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2799) #2, !dbg !139 + br label %__nv_exp2f.exit138, !dbg !139 + +2890: ; preds = %__nv_exp2f.exit135 + %2891 = tail call float @llvm.nvvm.ex2.approx.f(float %2799) #2, !dbg !139 + br label %__nv_exp2f.exit138, !dbg !139 + +__nv_exp2f.exit138: ; preds = %2888, %2890 + %.0.i137 = phi float [ %2889, %2888 ], [ %2891, %2890 ], !dbg !139 + %2892 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i139 = icmp eq i32 %2892, 0, !dbg !139 + br i1 %.not.i139, label %2895, label %2893, !dbg !139 + +2893: ; preds = %__nv_exp2f.exit138 + %2894 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2800) #2, !dbg !139 + br label %__nv_exp2f.exit141, !dbg !139 + +2895: ; preds = %__nv_exp2f.exit138 + %2896 = tail call float @llvm.nvvm.ex2.approx.f(float %2800) #2, !dbg !139 + br label %__nv_exp2f.exit141, !dbg !139 + +__nv_exp2f.exit141: ; preds = %2893, %2895 + %.0.i140 = phi float [ %2894, %2893 ], [ %2896, %2895 ], !dbg !139 + %2897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i142 = icmp eq i32 %2897, 0, !dbg !139 + br i1 %.not.i142, label %2900, label %2898, !dbg !139 + +2898: ; preds = %__nv_exp2f.exit141 + %2899 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2801) #2, !dbg !139 + br label %__nv_exp2f.exit144, !dbg !139 + +2900: ; preds = %__nv_exp2f.exit141 + %2901 = tail call float @llvm.nvvm.ex2.approx.f(float %2801) #2, !dbg !139 + br label %__nv_exp2f.exit144, !dbg !139 + +__nv_exp2f.exit144: ; preds = %2898, %2900 + %.0.i143 = phi float [ %2899, %2898 ], [ %2901, %2900 ], !dbg !139 + %2902 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i145 = icmp eq i32 %2902, 0, !dbg !139 + br i1 %.not.i145, label %2905, label %2903, !dbg !139 + +2903: ; preds = %__nv_exp2f.exit144 + %2904 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2802) #2, !dbg !139 + br label %__nv_exp2f.exit147, !dbg !139 + +2905: ; preds = %__nv_exp2f.exit144 + %2906 = tail call float @llvm.nvvm.ex2.approx.f(float %2802) #2, !dbg !139 + br label %__nv_exp2f.exit147, !dbg !139 + +__nv_exp2f.exit147: ; preds = %2903, %2905 + %.0.i146 = phi float [ %2904, %2903 ], [ %2906, %2905 ], !dbg !139 + %2907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i148 = icmp eq i32 %2907, 0, !dbg !139 + br i1 %.not.i148, label %2910, label %2908, !dbg !139 + +2908: ; preds = %__nv_exp2f.exit147 + %2909 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2803) #2, !dbg !139 + br label %__nv_exp2f.exit150, !dbg !139 + +2910: ; preds = %__nv_exp2f.exit147 + %2911 = tail call float @llvm.nvvm.ex2.approx.f(float %2803) #2, !dbg !139 + br label %__nv_exp2f.exit150, !dbg !139 + +__nv_exp2f.exit150: ; preds = %2908, %2910 + %.0.i149 = phi float [ %2909, %2908 ], [ %2911, %2910 ], !dbg !139 + %2912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i151 = icmp eq i32 %2912, 0, !dbg !139 + br i1 %.not.i151, label %2915, label %2913, !dbg !139 + +2913: ; preds = %__nv_exp2f.exit150 + %2914 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2804) #2, !dbg !139 + br label %__nv_exp2f.exit153, !dbg !139 + +2915: ; preds = %__nv_exp2f.exit150 + %2916 = tail call float @llvm.nvvm.ex2.approx.f(float %2804) #2, !dbg !139 + br label %__nv_exp2f.exit153, !dbg !139 + +__nv_exp2f.exit153: ; preds = %2913, %2915 + %.0.i152 = phi float [ %2914, %2913 ], [ %2916, %2915 ], !dbg !139 + %2917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i154 = icmp eq i32 %2917, 0, !dbg !139 + br i1 %.not.i154, label %2920, label %2918, !dbg !139 + +2918: ; preds = %__nv_exp2f.exit153 + %2919 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2805) #2, !dbg !139 + br label %__nv_exp2f.exit156, !dbg !139 + +2920: ; preds = %__nv_exp2f.exit153 + %2921 = tail call float @llvm.nvvm.ex2.approx.f(float %2805) #2, !dbg !139 + br label %__nv_exp2f.exit156, !dbg !139 + +__nv_exp2f.exit156: ; preds = %2918, %2920 + %.0.i155 = phi float [ %2919, %2918 ], [ %2921, %2920 ], !dbg !139 + %2922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i157 = icmp eq i32 %2922, 0, !dbg !139 + br i1 %.not.i157, label %2925, label %2923, !dbg !139 + +2923: ; preds = %__nv_exp2f.exit156 + %2924 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2806) #2, !dbg !139 + br label %__nv_exp2f.exit159, !dbg !139 + +2925: ; preds = %__nv_exp2f.exit156 + %2926 = tail call float @llvm.nvvm.ex2.approx.f(float %2806) #2, !dbg !139 + br label %__nv_exp2f.exit159, !dbg !139 + +__nv_exp2f.exit159: ; preds = %2923, %2925 + %.0.i158 = phi float [ %2924, %2923 ], [ %2926, %2925 ], !dbg !139 + %2927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i160 = icmp eq i32 %2927, 0, !dbg !139 + br i1 %.not.i160, label %2930, label %2928, !dbg !139 + +2928: ; preds = %__nv_exp2f.exit159 + %2929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2807) #2, !dbg !139 + br label %__nv_exp2f.exit162, !dbg !139 + +2930: ; preds = %__nv_exp2f.exit159 + %2931 = tail call float @llvm.nvvm.ex2.approx.f(float %2807) #2, !dbg !139 + br label %__nv_exp2f.exit162, !dbg !139 + +__nv_exp2f.exit162: ; preds = %2928, %2930 + %.0.i161 = phi float [ %2929, %2928 ], [ %2931, %2930 ], !dbg !139 + %2932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i163 = icmp eq i32 %2932, 0, !dbg !139 + br i1 %.not.i163, label %2935, label %2933, !dbg !139 + +2933: ; preds = %__nv_exp2f.exit162 + %2934 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2808) #2, !dbg !139 + br label %__nv_exp2f.exit165, !dbg !139 + +2935: ; preds = %__nv_exp2f.exit162 + %2936 = tail call float @llvm.nvvm.ex2.approx.f(float %2808) #2, !dbg !139 + br label %__nv_exp2f.exit165, !dbg !139 + +__nv_exp2f.exit165: ; preds = %2933, %2935 + %.0.i164 = phi float [ %2934, %2933 ], [ %2936, %2935 ], !dbg !139 + %2937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i166 = icmp eq i32 %2937, 0, !dbg !139 + br i1 %.not.i166, label %2940, label %2938, !dbg !139 + +2938: ; preds = %__nv_exp2f.exit165 + %2939 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2809) #2, !dbg !139 + br label %__nv_exp2f.exit168, !dbg !139 + +2940: ; preds = %__nv_exp2f.exit165 + %2941 = tail call float @llvm.nvvm.ex2.approx.f(float %2809) #2, !dbg !139 + br label %__nv_exp2f.exit168, !dbg !139 + +__nv_exp2f.exit168: ; preds = %2938, %2940 + %.0.i167 = phi float [ %2939, %2938 ], [ %2941, %2940 ], !dbg !139 + %2942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i169 = icmp eq i32 %2942, 0, !dbg !139 + br i1 %.not.i169, label %2945, label %2943, !dbg !139 + +2943: ; preds = %__nv_exp2f.exit168 + %2944 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2810) #2, !dbg !139 + br label %__nv_exp2f.exit171, !dbg !139 + +2945: ; preds = %__nv_exp2f.exit168 + %2946 = tail call float @llvm.nvvm.ex2.approx.f(float %2810) #2, !dbg !139 + br label %__nv_exp2f.exit171, !dbg !139 + +__nv_exp2f.exit171: ; preds = %2943, %2945 + %.0.i170 = phi float [ %2944, %2943 ], [ %2946, %2945 ], !dbg !139 + %2947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i172 = icmp eq i32 %2947, 0, !dbg !139 + br i1 %.not.i172, label %2950, label %2948, !dbg !139 + +2948: ; preds = %__nv_exp2f.exit171 + %2949 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2811) #2, !dbg !139 + br label %__nv_exp2f.exit174, !dbg !139 + +2950: ; preds = %__nv_exp2f.exit171 + %2951 = tail call float @llvm.nvvm.ex2.approx.f(float %2811) #2, !dbg !139 + br label %__nv_exp2f.exit174, !dbg !139 + +__nv_exp2f.exit174: ; preds = %2948, %2950 + %.0.i173 = phi float [ %2949, %2948 ], [ %2951, %2950 ], !dbg !139 + %2952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i175 = icmp eq i32 %2952, 0, !dbg !139 + br i1 %.not.i175, label %2955, label %2953, !dbg !139 + +2953: ; preds = %__nv_exp2f.exit174 + %2954 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2812) #2, !dbg !139 + br label %__nv_exp2f.exit177, !dbg !139 + +2955: ; preds = %__nv_exp2f.exit174 + %2956 = tail call float @llvm.nvvm.ex2.approx.f(float %2812) #2, !dbg !139 + br label %__nv_exp2f.exit177, !dbg !139 + +__nv_exp2f.exit177: ; preds = %2953, %2955 + %.0.i176 = phi float [ %2954, %2953 ], [ %2956, %2955 ], !dbg !139 + %2957 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i178 = icmp eq i32 %2957, 0, !dbg !139 + br i1 %.not.i178, label %2960, label %2958, !dbg !139 + +2958: ; preds = %__nv_exp2f.exit177 + %2959 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2813) #2, !dbg !139 + br label %__nv_exp2f.exit180, !dbg !139 + +2960: ; preds = %__nv_exp2f.exit177 + %2961 = tail call float @llvm.nvvm.ex2.approx.f(float %2813) #2, !dbg !139 + br label %__nv_exp2f.exit180, !dbg !139 + +__nv_exp2f.exit180: ; preds = %2958, %2960 + %.0.i179 = phi float [ %2959, %2958 ], [ %2961, %2960 ], !dbg !139 + %2962 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i181 = icmp eq i32 %2962, 0, !dbg !139 + br i1 %.not.i181, label %2965, label %2963, !dbg !139 + +2963: ; preds = %__nv_exp2f.exit180 + %2964 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2814) #2, !dbg !139 + br label %__nv_exp2f.exit183, !dbg !139 + +2965: ; preds = %__nv_exp2f.exit180 + %2966 = tail call float @llvm.nvvm.ex2.approx.f(float %2814) #2, !dbg !139 + br label %__nv_exp2f.exit183, !dbg !139 + +__nv_exp2f.exit183: ; preds = %2963, %2965 + %.0.i182 = phi float [ %2964, %2963 ], [ %2966, %2965 ], !dbg !139 + %2967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i184 = icmp eq i32 %2967, 0, !dbg !139 + br i1 %.not.i184, label %2970, label %2968, !dbg !139 + +2968: ; preds = %__nv_exp2f.exit183 + %2969 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2815) #2, !dbg !139 + br label %__nv_exp2f.exit186, !dbg !139 + +2970: ; preds = %__nv_exp2f.exit183 + %2971 = tail call float @llvm.nvvm.ex2.approx.f(float %2815) #2, !dbg !139 + br label %__nv_exp2f.exit186, !dbg !139 + +__nv_exp2f.exit186: ; preds = %2968, %2970 + %.0.i185 = phi float [ %2969, %2968 ], [ %2971, %2970 ], !dbg !139 + %2972 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i187 = icmp eq i32 %2972, 0, !dbg !139 + br i1 %.not.i187, label %2975, label %2973, !dbg !139 + +2973: ; preds = %__nv_exp2f.exit186 + %2974 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2816) #2, !dbg !139 + br label %__nv_exp2f.exit189, !dbg !139 + +2975: ; preds = %__nv_exp2f.exit186 + %2976 = tail call float @llvm.nvvm.ex2.approx.f(float %2816) #2, !dbg !139 + br label %__nv_exp2f.exit189, !dbg !139 + +__nv_exp2f.exit189: ; preds = %2973, %2975 + %.0.i188 = phi float [ %2974, %2973 ], [ %2976, %2975 ], !dbg !139 + %2977 = fmul float %.pn464, %.0.i, !dbg !140 + %2978 = fmul float %.pn462, %.0.i92, !dbg !140 + %2979 = fadd float %.0.i95, %.0.i98, !dbg !141 + %2980 = fadd float %.0.i101, %.0.i104, !dbg !141 + %2981 = fadd float %2979, %.0.i107, !dbg !141 + %2982 = fadd float %2981, %.0.i110, !dbg !141 + %2983 = fadd float %2980, %.0.i113, !dbg !141 + %2984 = fadd float %2983, %.0.i116, !dbg !141 + %2985 = fadd float %2982, %.0.i119, !dbg !141 + %2986 = fadd float %2985, %.0.i122, !dbg !141 + %2987 = fadd float %2984, %.0.i125, !dbg !141 + %2988 = fadd float %2987, %.0.i128, !dbg !141 + %2989 = fadd float %2986, %.0.i131, !dbg !141 + %2990 = fadd float %2989, %.0.i134, !dbg !141 + %2991 = fadd float %2988, %.0.i137, !dbg !141 + %2992 = fadd float %2991, %.0.i140, !dbg !141 + %2993 = fadd float %2990, %.0.i143, !dbg !141 + %2994 = fadd float %2993, %.0.i146, !dbg !141 + %2995 = fadd float %2992, %.0.i149, !dbg !141 + %2996 = fadd float %2995, %.0.i152, !dbg !141 + %2997 = fadd float %2994, %.0.i155, !dbg !141 + %2998 = fadd float %2997, %.0.i158, !dbg !141 + %2999 = fadd float %2996, %.0.i161, !dbg !141 + %3000 = fadd float %2999, %.0.i164, !dbg !141 + %3001 = fadd float %2998, %.0.i167, !dbg !141 + %3002 = fadd float %3001, %.0.i170, !dbg !141 + %3003 = fadd float %3000, %.0.i173, !dbg !141 + %3004 = fadd float %3003, %.0.i176, !dbg !141 + %3005 = fadd float %3002, %.0.i179, !dbg !141 + %3006 = fadd float %3005, %.0.i182, !dbg !141 + %3007 = fadd float %3004, %.0.i185, !dbg !141 + %3008 = fadd float %3007, %.0.i188, !dbg !141 + %3009 = bitcast float %3006 to i32, !dbg !142 + %3010 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3009, i32 2, i32 31), !dbg !142 + %3011 = bitcast i32 %3010 to float, !dbg !142 + %3012 = fadd float %3006, %3011, !dbg !141 + %3013 = bitcast float %3012 to i32, !dbg !142 + %3014 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3013, i32 1, i32 31), !dbg !142 + %3015 = bitcast i32 %3014 to float, !dbg !142 + %3016 = fadd float %3012, %3015, !dbg !141 + %3017 = bitcast float %3008 to i32, !dbg !142 + %3018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3017, i32 2, i32 31), !dbg !142 + %3019 = bitcast i32 %3018 to float, !dbg !142 + %3020 = fadd float %3008, %3019, !dbg !141 + %3021 = bitcast float %3020 to i32, !dbg !142 + %3022 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3021, i32 1, i32 31), !dbg !142 + %3023 = bitcast i32 %3022 to float, !dbg !142 + %3024 = fadd float %3020, %3023, !dbg !141 + %3025 = fadd float %2977, %3016, !dbg !143 + %3026 = fadd float %2978, %3024, !dbg !143 + %3027 = fmul float %2588, %.0.i, !dbg !144 + %3028 = fmul float %2589, %.0.i, !dbg !144 + %3029 = fmul float %2590, %.0.i92, !dbg !144 + %3030 = fmul float %2591, %.0.i92, !dbg !144 + %3031 = fmul float %2592, %.0.i, !dbg !144 + %3032 = fmul float %2593, %.0.i, !dbg !144 + %3033 = fmul float %2594, %.0.i92, !dbg !144 + %3034 = fmul float %2595, %.0.i92, !dbg !144 + %3035 = fmul float %2596, %.0.i, !dbg !144 + %3036 = fmul float %2597, %.0.i, !dbg !144 + %3037 = fmul float %2598, %.0.i92, !dbg !144 + %3038 = fmul float %2599, %.0.i92, !dbg !144 + %3039 = fmul float %2600, %.0.i, !dbg !144 + %3040 = fmul float %2601, %.0.i, !dbg !144 + %3041 = fmul float %2602, %.0.i92, !dbg !144 + %3042 = fmul float %2603, %.0.i92, !dbg !144 + %3043 = fmul float %2604, %.0.i, !dbg !144 + %3044 = fmul float %2605, %.0.i, !dbg !144 + %3045 = fmul float %2606, %.0.i92, !dbg !144 + %3046 = fmul float %2607, %.0.i92, !dbg !144 + %3047 = fmul float %2608, %.0.i, !dbg !144 + %3048 = fmul float %2609, %.0.i, !dbg !144 + %3049 = fmul float %2610, %.0.i92, !dbg !144 + %3050 = fmul float %2611, %.0.i92, !dbg !144 + %3051 = fmul float %2612, %.0.i, !dbg !144 + %3052 = fmul float %2613, %.0.i, !dbg !144 + %3053 = fmul float %2614, %.0.i92, !dbg !144 + %3054 = fmul float %2615, %.0.i92, !dbg !144 + %3055 = fmul float %2616, %.0.i, !dbg !144 + %3056 = fmul float %2617, %.0.i, !dbg !144 + %3057 = fmul float %2618, %.0.i92, !dbg !144 + %3058 = fmul float %2619, %.0.i92, !dbg !144 + %3059 = fmul float %2620, %.0.i, !dbg !144 + %3060 = fmul float %2621, %.0.i, !dbg !144 + %3061 = fmul float %2622, %.0.i92, !dbg !144 + %3062 = fmul float %2623, %.0.i92, !dbg !144 + %3063 = fmul float %2624, %.0.i, !dbg !144 + %3064 = fmul float %2625, %.0.i, !dbg !144 + %3065 = fmul float %2626, %.0.i92, !dbg !144 + %3066 = fmul float %2627, %.0.i92, !dbg !144 + %3067 = fmul float %2628, %.0.i, !dbg !144 + %3068 = fmul float %2629, %.0.i, !dbg !144 + %3069 = fmul float %2630, %.0.i92, !dbg !144 + %3070 = fmul float %2631, %.0.i92, !dbg !144 + %3071 = fmul float %2632, %.0.i, !dbg !144 + %3072 = fmul float %2633, %.0.i, !dbg !144 + %3073 = fmul float %2634, %.0.i92, !dbg !144 + %3074 = fmul float %2635, %.0.i92, !dbg !144 + %3075 = fmul float %2636, %.0.i, !dbg !144 + %3076 = fmul float %2637, %.0.i, !dbg !144 + %3077 = fmul float %2638, %.0.i92, !dbg !144 + %3078 = fmul float %2639, %.0.i92, !dbg !144 + %3079 = fmul float %2640, %.0.i, !dbg !144 + %3080 = fmul float %2641, %.0.i, !dbg !144 + %3081 = fmul float %2642, %.0.i92, !dbg !144 + %3082 = fmul float %2643, %.0.i92, !dbg !144 + %3083 = fmul float %2644, %.0.i, !dbg !144 + %3084 = fmul float %2645, %.0.i, !dbg !144 + %3085 = fmul float %2646, %.0.i92, !dbg !144 + %3086 = fmul float %2647, %.0.i92, !dbg !144 + %3087 = fmul float %2648, %.0.i, !dbg !144 + %3088 = fmul float %2649, %.0.i, !dbg !144 + %3089 = fmul float %2650, %.0.i92, !dbg !144 + %3090 = fmul float %2651, %.0.i92, !dbg !144 + %3091 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2206, !dbg !125 + %3092 = insertelement <2 x float> poison, float %.0.i95, i64 0, !dbg !145 + %3093 = insertelement <2 x float> %3092, float %.0.i98, i64 1, !dbg !145 + %3094 = fptrunc <2 x float> %3093 to <2 x bfloat>, !dbg !145 + %3095 = insertelement <2 x float> poison, float %.0.i101, i64 0, !dbg !145 + %3096 = insertelement <2 x float> %3095, float %.0.i104, i64 1, !dbg !145 + %3097 = fptrunc <2 x float> %3096 to <2 x bfloat>, !dbg !145 + %3098 = insertelement <2 x float> poison, float %.0.i107, i64 0, !dbg !145 + %3099 = insertelement <2 x float> %3098, float %.0.i110, i64 1, !dbg !145 + %3100 = fptrunc <2 x float> %3099 to <2 x bfloat>, !dbg !145 + %3101 = insertelement <2 x float> poison, float %.0.i113, i64 0, !dbg !145 + %3102 = insertelement <2 x float> %3101, float %.0.i116, i64 1, !dbg !145 + %3103 = fptrunc <2 x float> %3102 to <2 x bfloat>, !dbg !145 + %3104 = insertelement <2 x float> poison, float %.0.i119, i64 0, !dbg !145 + %3105 = insertelement <2 x float> %3104, float %.0.i122, i64 1, !dbg !145 + %3106 = fptrunc <2 x float> %3105 to <2 x bfloat>, !dbg !145 + %3107 = insertelement <2 x float> poison, float %.0.i125, i64 0, !dbg !145 + %3108 = insertelement <2 x float> %3107, float %.0.i128, i64 1, !dbg !145 + %3109 = fptrunc <2 x float> %3108 to <2 x bfloat>, !dbg !145 + %3110 = insertelement <2 x float> poison, float %.0.i131, i64 0, !dbg !145 + %3111 = insertelement <2 x float> %3110, float %.0.i134, i64 1, !dbg !145 + %3112 = fptrunc <2 x float> %3111 to <2 x bfloat>, !dbg !145 + %3113 = insertelement <2 x float> poison, float %.0.i137, i64 0, !dbg !145 + %3114 = insertelement <2 x float> %3113, float %.0.i140, i64 1, !dbg !145 + %3115 = fptrunc <2 x float> %3114 to <2 x bfloat>, !dbg !145 + %3116 = insertelement <2 x float> poison, float %.0.i143, i64 0, !dbg !145 + %3117 = insertelement <2 x float> %3116, float %.0.i146, i64 1, !dbg !145 + %3118 = fptrunc <2 x float> %3117 to <2 x bfloat>, !dbg !145 + %3119 = insertelement <2 x float> poison, float %.0.i149, i64 0, !dbg !145 + %3120 = insertelement <2 x float> %3119, float %.0.i152, i64 1, !dbg !145 + %3121 = fptrunc <2 x float> %3120 to <2 x bfloat>, !dbg !145 + %3122 = insertelement <2 x float> poison, float %.0.i155, i64 0, !dbg !145 + %3123 = insertelement <2 x float> %3122, float %.0.i158, i64 1, !dbg !145 + %3124 = fptrunc <2 x float> %3123 to <2 x bfloat>, !dbg !145 + %3125 = insertelement <2 x float> poison, float %.0.i161, i64 0, !dbg !145 + %3126 = insertelement <2 x float> %3125, float %.0.i164, i64 1, !dbg !145 + %3127 = fptrunc <2 x float> %3126 to <2 x bfloat>, !dbg !145 + %3128 = insertelement <2 x float> poison, float %.0.i167, i64 0, !dbg !145 + %3129 = insertelement <2 x float> %3128, float %.0.i170, i64 1, !dbg !145 + %3130 = fptrunc <2 x float> %3129 to <2 x bfloat>, !dbg !145 + %3131 = insertelement <2 x float> poison, float %.0.i173, i64 0, !dbg !145 + %3132 = insertelement <2 x float> %3131, float %.0.i176, i64 1, !dbg !145 + %3133 = fptrunc <2 x float> %3132 to <2 x bfloat>, !dbg !145 + %3134 = insertelement <2 x float> poison, float %.0.i179, i64 0, !dbg !145 + %3135 = insertelement <2 x float> %3134, float %.0.i182, i64 1, !dbg !145 + %3136 = fptrunc <2 x float> %3135 to <2 x bfloat>, !dbg !145 + %3137 = insertelement <2 x float> poison, float %.0.i185, i64 0, !dbg !145 + %3138 = insertelement <2 x float> %3137, float %.0.i188, i64 1, !dbg !145 + %3139 = fptrunc <2 x float> %3138 to <2 x bfloat>, !dbg !145 + %3140 = bitcast <2 x bfloat> %3094 to i32, !dbg !146 + %3141 = bitcast <2 x bfloat> %3097 to i32, !dbg !146 + %3142 = bitcast <2 x bfloat> %3100 to i32, !dbg !146 + %3143 = bitcast <2 x bfloat> %3103 to i32, !dbg !146 + %3144 = bitcast <2 x bfloat> %3106 to i32, !dbg !146 + %3145 = bitcast <2 x bfloat> %3109 to i32, !dbg !146 + %3146 = bitcast <2 x bfloat> %3112 to i32, !dbg !146 + %3147 = bitcast <2 x bfloat> %3115 to i32, !dbg !146 + %3148 = bitcast <2 x bfloat> %3118 to i32, !dbg !146 + %3149 = bitcast <2 x bfloat> %3121 to i32, !dbg !146 + %3150 = bitcast <2 x bfloat> %3124 to i32, !dbg !146 + %3151 = bitcast <2 x bfloat> %3127 to i32, !dbg !146 + %3152 = bitcast <2 x bfloat> %3130 to i32, !dbg !146 + %3153 = bitcast <2 x bfloat> %3133 to i32, !dbg !146 + %3154 = bitcast <2 x bfloat> %3136 to i32, !dbg !146 + %3155 = bitcast <2 x bfloat> %3139 to i32, !dbg !146 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !146 + %3156 = ptrtoint ptr addrspace(3) %3091 to i32, !dbg !146 + %3157 = lshr exact i32 %3156, 4, !dbg !146 + %3158 = and i32 %3157, 16383, !dbg !146 + %3159 = zext nneg i32 %3158 to i64, !dbg !146 + %3160 = or disjoint i64 %3159, 4611686293338849280, !dbg !146 + %3161 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3027, float %3028, float %3029, float %3030, float %3031, float %3032, float %3033, float %3034, float %3035, float %3036, float %3037, float %3038, float %3039, float %3040, float %3041, float %3042, float %3043, float %3044, float %3045, float %3046, float %3047, float %3048, float %3049, float %3050, float %3051, float %3052, float %3053, float %3054, float %3055, float %3056, float %3057, float %3058, float %3059, float %3060, float %3061, float %3062, float %3063, float %3064, float %3065, float %3066, float %3067, float %3068, float %3069, float %3070, float %3071, float %3072, float %3073, float %3074, float %3075, float %3076, float %3077, float %3078, float %3079, float %3080, float %3081, float %3082, float %3083, float %3084, float %3085, float %3086, float %3087, float %3088, float %3089, float %3090, i32 %3140, i32 %3141, i32 %3142, i32 %3143, i64 %3160, i1 true) #2, !dbg !146 + %3162 = add i32 %3156, 2048, !dbg !146 + %3163 = lshr exact i32 %3162, 4, !dbg !146 + %3164 = and i32 %3163, 16383, !dbg !146 + %3165 = zext nneg i32 %3164 to i64, !dbg !146 + %3166 = or disjoint i64 %3165, 4611686293338849280, !dbg !146 + %3167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 0, !dbg !146 + %3168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 1, !dbg !146 + %3169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 2, !dbg !146 + %3170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 3, !dbg !146 + %3171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 4, !dbg !146 + %3172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 5, !dbg !146 + %3173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 6, !dbg !146 + %3174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 7, !dbg !146 + %3175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 8, !dbg !146 + %3176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 9, !dbg !146 + %3177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 10, !dbg !146 + %3178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 11, !dbg !146 + %3179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 12, !dbg !146 + %3180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 13, !dbg !146 + %3181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 14, !dbg !146 + %3182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 15, !dbg !146 + %3183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 16, !dbg !146 + %3184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 17, !dbg !146 + %3185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 18, !dbg !146 + %3186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 19, !dbg !146 + %3187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 20, !dbg !146 + %3188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 21, !dbg !146 + %3189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 22, !dbg !146 + %3190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 23, !dbg !146 + %3191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 24, !dbg !146 + %3192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 25, !dbg !146 + %3193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 26, !dbg !146 + %3194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 27, !dbg !146 + %3195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 28, !dbg !146 + %3196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 29, !dbg !146 + %3197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 30, !dbg !146 + %3198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 31, !dbg !146 + %3199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 32, !dbg !146 + %3200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 33, !dbg !146 + %3201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 34, !dbg !146 + %3202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 35, !dbg !146 + %3203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 36, !dbg !146 + %3204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 37, !dbg !146 + %3205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 38, !dbg !146 + %3206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 39, !dbg !146 + %3207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 40, !dbg !146 + %3208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 41, !dbg !146 + %3209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 42, !dbg !146 + %3210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 43, !dbg !146 + %3211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 44, !dbg !146 + %3212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 45, !dbg !146 + %3213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 46, !dbg !146 + %3214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 47, !dbg !146 + %3215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 48, !dbg !146 + %3216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 49, !dbg !146 + %3217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 50, !dbg !146 + %3218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 51, !dbg !146 + %3219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 52, !dbg !146 + %3220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 53, !dbg !146 + %3221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 54, !dbg !146 + %3222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 55, !dbg !146 + %3223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 56, !dbg !146 + %3224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 57, !dbg !146 + %3225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 58, !dbg !146 + %3226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 59, !dbg !146 + %3227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 60, !dbg !146 + %3228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 61, !dbg !146 + %3229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 62, !dbg !146 + %3230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 63, !dbg !146 + %3231 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3167, float %3168, float %3169, float %3170, float %3171, float %3172, float %3173, float %3174, float %3175, float %3176, float %3177, float %3178, float %3179, float %3180, float %3181, float %3182, float %3183, float %3184, float %3185, float %3186, float %3187, float %3188, float %3189, float %3190, float %3191, float %3192, float %3193, float %3194, float %3195, float %3196, float %3197, float %3198, float %3199, float %3200, float %3201, float %3202, float %3203, float %3204, float %3205, float %3206, float %3207, float %3208, float %3209, float %3210, float %3211, float %3212, float %3213, float %3214, float %3215, float %3216, float %3217, float %3218, float %3219, float %3220, float %3221, float %3222, float %3223, float %3224, float %3225, float %3226, float %3227, float %3228, float %3229, float %3230, i32 %3144, i32 %3145, i32 %3146, i32 %3147, i64 %3166, i1 true) #2, !dbg !146 + %3232 = add i32 %3156, 4096, !dbg !146 + %3233 = lshr exact i32 %3232, 4, !dbg !146 + %3234 = and i32 %3233, 16383, !dbg !146 + %3235 = zext nneg i32 %3234 to i64, !dbg !146 + %3236 = or disjoint i64 %3235, 4611686293338849280, !dbg !146 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 0, !dbg !146 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 1, !dbg !146 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 2, !dbg !146 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 3, !dbg !146 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 4, !dbg !146 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 5, !dbg !146 + %3243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 6, !dbg !146 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 7, !dbg !146 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 8, !dbg !146 + %3246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 9, !dbg !146 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 10, !dbg !146 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 11, !dbg !146 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 12, !dbg !146 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 13, !dbg !146 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 14, !dbg !146 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 15, !dbg !146 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 16, !dbg !146 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 17, !dbg !146 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 18, !dbg !146 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 19, !dbg !146 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 20, !dbg !146 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 21, !dbg !146 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 22, !dbg !146 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 23, !dbg !146 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 24, !dbg !146 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 25, !dbg !146 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 26, !dbg !146 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 27, !dbg !146 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 28, !dbg !146 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 29, !dbg !146 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 30, !dbg !146 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 31, !dbg !146 + %3269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 32, !dbg !146 + %3270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 33, !dbg !146 + %3271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 34, !dbg !146 + %3272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 35, !dbg !146 + %3273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 36, !dbg !146 + %3274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 37, !dbg !146 + %3275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 38, !dbg !146 + %3276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 39, !dbg !146 + %3277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 40, !dbg !146 + %3278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 41, !dbg !146 + %3279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 42, !dbg !146 + %3280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 43, !dbg !146 + %3281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 44, !dbg !146 + %3282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 45, !dbg !146 + %3283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 46, !dbg !146 + %3284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 47, !dbg !146 + %3285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 48, !dbg !146 + %3286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 49, !dbg !146 + %3287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 50, !dbg !146 + %3288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 51, !dbg !146 + %3289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 52, !dbg !146 + %3290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 53, !dbg !146 + %3291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 54, !dbg !146 + %3292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 55, !dbg !146 + %3293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 56, !dbg !146 + %3294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 57, !dbg !146 + %3295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 58, !dbg !146 + %3296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 59, !dbg !146 + %3297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 60, !dbg !146 + %3298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 61, !dbg !146 + %3299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 62, !dbg !146 + %3300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 63, !dbg !146 + %3301 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, float %3269, float %3270, float %3271, float %3272, float %3273, float %3274, float %3275, float %3276, float %3277, float %3278, float %3279, float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, i32 %3148, i32 %3149, i32 %3150, i32 %3151, i64 %3236, i1 true) #2, !dbg !146 + %3302 = add i32 %3156, 6144, !dbg !146 + %3303 = lshr exact i32 %3302, 4, !dbg !146 + %3304 = and i32 %3303, 16383, !dbg !146 + %3305 = zext nneg i32 %3304 to i64, !dbg !146 + %3306 = or disjoint i64 %3305, 4611686293338849280, !dbg !146 + %3307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 0, !dbg !146 + %3308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 1, !dbg !146 + %3309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 2, !dbg !146 + %3310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 3, !dbg !146 + %3311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 4, !dbg !146 + %3312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 5, !dbg !146 + %3313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 6, !dbg !146 + %3314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 7, !dbg !146 + %3315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 8, !dbg !146 + %3316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 9, !dbg !146 + %3317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 10, !dbg !146 + %3318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 11, !dbg !146 + %3319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 12, !dbg !146 + %3320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 13, !dbg !146 + %3321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 14, !dbg !146 + %3322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 15, !dbg !146 + %3323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 16, !dbg !146 + %3324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 17, !dbg !146 + %3325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 18, !dbg !146 + %3326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 19, !dbg !146 + %3327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 20, !dbg !146 + %3328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 21, !dbg !146 + %3329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 22, !dbg !146 + %3330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 23, !dbg !146 + %3331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 24, !dbg !146 + %3332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 25, !dbg !146 + %3333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 26, !dbg !146 + %3334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 27, !dbg !146 + %3335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 28, !dbg !146 + %3336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 29, !dbg !146 + %3337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 30, !dbg !146 + %3338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 31, !dbg !146 + %3339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 32, !dbg !146 + %3340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 33, !dbg !146 + %3341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 34, !dbg !146 + %3342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 35, !dbg !146 + %3343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 36, !dbg !146 + %3344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 37, !dbg !146 + %3345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 38, !dbg !146 + %3346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 39, !dbg !146 + %3347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 40, !dbg !146 + %3348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 41, !dbg !146 + %3349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 42, !dbg !146 + %3350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 43, !dbg !146 + %3351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 44, !dbg !146 + %3352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 45, !dbg !146 + %3353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 46, !dbg !146 + %3354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 47, !dbg !146 + %3355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 48, !dbg !146 + %3356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 49, !dbg !146 + %3357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 50, !dbg !146 + %3358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 51, !dbg !146 + %3359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 52, !dbg !146 + %3360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 53, !dbg !146 + %3361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 54, !dbg !146 + %3362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 55, !dbg !146 + %3363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 56, !dbg !146 + %3364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 57, !dbg !146 + %3365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 58, !dbg !146 + %3366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 59, !dbg !146 + %3367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 60, !dbg !146 + %3368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 61, !dbg !146 + %3369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 62, !dbg !146 + %3370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 63, !dbg !146 + %3371 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3307, float %3308, float %3309, float %3310, float %3311, float %3312, float %3313, float %3314, float %3315, float %3316, float %3317, float %3318, float %3319, float %3320, float %3321, float %3322, float %3323, float %3324, float %3325, float %3326, float %3327, float %3328, float %3329, float %3330, float %3331, float %3332, float %3333, float %3334, float %3335, float %3336, float %3337, float %3338, float %3339, float %3340, float %3341, float %3342, float %3343, float %3344, float %3345, float %3346, float %3347, float %3348, float %3349, float %3350, float %3351, float %3352, float %3353, float %3354, float %3355, float %3356, float %3357, float %3358, float %3359, float %3360, float %3361, float %3362, float %3363, float %3364, float %3365, float %3366, float %3367, float %3368, float %3369, float %3370, i32 %3152, i32 %3153, i32 %3154, i32 %3155, i64 %3306, i1 true) #2, !dbg !146 + %3372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 0, !dbg !146 + %3373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 1, !dbg !146 + %3374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 2, !dbg !146 + %3375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 3, !dbg !146 + %3376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 4, !dbg !146 + %3377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 5, !dbg !146 + %3378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 6, !dbg !146 + %3379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 7, !dbg !146 + %3380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 8, !dbg !146 + %3381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 9, !dbg !146 + %3382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 10, !dbg !146 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 11, !dbg !146 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 12, !dbg !146 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 13, !dbg !146 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 14, !dbg !146 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 15, !dbg !146 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 16, !dbg !146 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 17, !dbg !146 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 18, !dbg !146 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 19, !dbg !146 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 20, !dbg !146 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 21, !dbg !146 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 22, !dbg !146 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 23, !dbg !146 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 24, !dbg !146 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 25, !dbg !146 + %3398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 26, !dbg !146 + %3399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 27, !dbg !146 + %3400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 28, !dbg !146 + %3401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 29, !dbg !146 + %3402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 30, !dbg !146 + %3403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 31, !dbg !146 + %3404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 32, !dbg !146 + %3405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 33, !dbg !146 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 34, !dbg !146 + %3407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 35, !dbg !146 + %3408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 36, !dbg !146 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 37, !dbg !146 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 38, !dbg !146 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 39, !dbg !146 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 40, !dbg !146 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 41, !dbg !146 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 42, !dbg !146 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 43, !dbg !146 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 44, !dbg !146 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 45, !dbg !146 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 46, !dbg !146 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 47, !dbg !146 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 48, !dbg !146 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 49, !dbg !146 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 50, !dbg !146 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 51, !dbg !146 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 52, !dbg !146 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 53, !dbg !146 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 54, !dbg !146 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 55, !dbg !146 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 56, !dbg !146 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 57, !dbg !146 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 58, !dbg !146 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 59, !dbg !146 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 60, !dbg !146 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 61, !dbg !146 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 62, !dbg !146 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 63, !dbg !146 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !146 + %3436 = add nuw nsw i32 %2199, 1, !dbg !119 + %3437 = lshr i32 %3436, 1, !dbg !147 + %3438 = zext nneg i32 %3437 to i64, !dbg !148 + %3439 = getelementptr i32, ptr addrspace(1) %1997, i64 %3438, !dbg !148 + %3440 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !149 + %3441 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3439, i64 %3440, i1 %2202) #2, !dbg !149 + %3442 = add nuw nsw i32 %3437, 1, !dbg !150 + %3443 = icmp slt i32 %3442, %2001, !dbg !151 + %3444 = getelementptr i8, ptr addrspace(1) %3439, i64 4, !dbg !152 + %3445 = and i1 %2202, %3443, !dbg !119 + %3446 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !153 + %3447 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3444, i64 %3446, i1 %3445) #2, !dbg !153 + %3448 = and i32 %2199, 1, !dbg !154 + %3449 = sub i32 %3447, %3441, !dbg !155 + %3450 = shl i32 %3449, 7, !dbg !156 + %3451 = add i32 %3450, -64, !dbg !157 + %3452 = xor i32 %3448, 1, !dbg !158 + %3453 = mul nuw nsw i32 %3451, %3452, !dbg !158 + %3454 = shl nuw nsw i32 %3448, 6, !dbg !159 + %3455 = add i32 %2198, %3454, !dbg !160 + %3456 = add i32 %3455, %3453, !dbg !161 + %3457 = add i32 %2197, 1, !dbg !119 + %3458 = icmp sgt i32 %3457, 2, !dbg !119 + %3459 = select i1 %3458, i32 0, i32 %3457, !dbg !119 + %3460 = add i32 %3456, %1999, !dbg !126 + %3461 = add i32 %3460, %39, !dbg !121 + %3462 = add i32 %3460, %40, !dbg !121 + %3463 = add i32 %3460, %41, !dbg !121 + %3464 = add i32 %3460, %42, !dbg !121 + %3465 = shl i32 %3461, 7, !dbg !122 + %3466 = shl i32 %3462, 7, !dbg !122 + %3467 = shl i32 %3463, 7, !dbg !122 + %3468 = shl i32 %3464, 7, !dbg !122 + %3469 = sext i32 %3465 to i64, !dbg !123 + %3470 = sext i32 %3466 to i64, !dbg !123 + %3471 = sext i32 %3467 to i64, !dbg !123 + %3472 = sext i32 %3468 to i64, !dbg !123 + %gep362 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3469, !dbg !124 + %gep364 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3470, !dbg !124 + %gep366 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3471, !dbg !124 + %gep368 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3472, !dbg !124 + %3473 = shl i32 %3459, 13, !dbg !125 + %3474 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %3473, !dbg !125 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %3475 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %220, !dbg !125 + %3476 = select i1 %2201, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3475, ptr addrspace(1) %gep362, i32 %3476) #2, !dbg !125 + %3477 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %223, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3477, ptr addrspace(1) %gep364, i32 %3476) #2, !dbg !125 + %3478 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %225, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3478, ptr addrspace(1) %gep366, i32 %3476) #2, !dbg !125 + %3479 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %227, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3479, ptr addrspace(1) %gep368, i32 %3476) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %gep370 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3469, !dbg !124 + %gep372 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3470, !dbg !124 + %gep374 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3471, !dbg !124 + %gep376 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3472, !dbg !124 + %3480 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %3473, !dbg !125 + %3481 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %220, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3481, ptr addrspace(1) %gep370, i32 %3476) #2, !dbg !125 + %3482 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %223, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3482, ptr addrspace(1) %gep372, i32 %3476) #2, !dbg !125 + %3483 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %225, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3483, ptr addrspace(1) %gep374, i32 %3476) #2, !dbg !125 + %3484 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %227, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3484, ptr addrspace(1) %gep376, i32 %3476) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %exitcond390.not = icmp eq i32 %3436, %2128, !dbg !119 + br i1 %exitcond390.not, label %._crit_edge379, label %2195, !dbg !119 + +._crit_edge379: ; preds = %__nv_exp2f.exit189, %._crit_edge + %3485 = phi float [ %2064, %._crit_edge ], [ %3372, %__nv_exp2f.exit189 ], !dbg !44 + %3486 = phi float [ %2065, %._crit_edge ], [ %3373, %__nv_exp2f.exit189 ], !dbg !44 + %3487 = phi float [ %2066, %._crit_edge ], [ %3374, %__nv_exp2f.exit189 ], !dbg !44 + %3488 = phi float [ %2067, %._crit_edge ], [ %3375, %__nv_exp2f.exit189 ], !dbg !44 + %3489 = phi float [ %2068, %._crit_edge ], [ %3376, %__nv_exp2f.exit189 ], !dbg !44 + %3490 = phi float [ %2069, %._crit_edge ], [ %3377, %__nv_exp2f.exit189 ], !dbg !44 + %3491 = phi float [ %2070, %._crit_edge ], [ %3378, %__nv_exp2f.exit189 ], !dbg !44 + %3492 = phi float [ %2071, %._crit_edge ], [ %3379, %__nv_exp2f.exit189 ], !dbg !44 + %3493 = phi float [ %2072, %._crit_edge ], [ %3380, %__nv_exp2f.exit189 ], !dbg !44 + %3494 = phi float [ %2073, %._crit_edge ], [ %3381, %__nv_exp2f.exit189 ], !dbg !44 + %3495 = phi float [ %2074, %._crit_edge ], [ %3382, %__nv_exp2f.exit189 ], !dbg !44 + %3496 = phi float [ %2075, %._crit_edge ], [ %3383, %__nv_exp2f.exit189 ], !dbg !44 + %3497 = phi float [ %2076, %._crit_edge ], [ %3384, %__nv_exp2f.exit189 ], !dbg !44 + %3498 = phi float [ %2077, %._crit_edge ], [ %3385, %__nv_exp2f.exit189 ], !dbg !44 + %3499 = phi float [ %2078, %._crit_edge ], [ %3386, %__nv_exp2f.exit189 ], !dbg !44 + %3500 = phi float [ %2079, %._crit_edge ], [ %3387, %__nv_exp2f.exit189 ], !dbg !44 + %3501 = phi float [ %2080, %._crit_edge ], [ %3388, %__nv_exp2f.exit189 ], !dbg !44 + %3502 = phi float [ %2081, %._crit_edge ], [ %3389, %__nv_exp2f.exit189 ], !dbg !44 + %3503 = phi float [ %2082, %._crit_edge ], [ %3390, %__nv_exp2f.exit189 ], !dbg !44 + %3504 = phi float [ %2083, %._crit_edge ], [ %3391, %__nv_exp2f.exit189 ], !dbg !44 + %3505 = phi float [ %2084, %._crit_edge ], [ %3392, %__nv_exp2f.exit189 ], !dbg !44 + %3506 = phi float [ %2085, %._crit_edge ], [ %3393, %__nv_exp2f.exit189 ], !dbg !44 + %3507 = phi float [ %2086, %._crit_edge ], [ %3394, %__nv_exp2f.exit189 ], !dbg !44 + %3508 = phi float [ %2087, %._crit_edge ], [ %3395, %__nv_exp2f.exit189 ], !dbg !44 + %3509 = phi float [ %2088, %._crit_edge ], [ %3396, %__nv_exp2f.exit189 ], !dbg !44 + %3510 = phi float [ %2089, %._crit_edge ], [ %3397, %__nv_exp2f.exit189 ], !dbg !44 + %3511 = phi float [ %2090, %._crit_edge ], [ %3398, %__nv_exp2f.exit189 ], !dbg !44 + %3512 = phi float [ %2091, %._crit_edge ], [ %3399, %__nv_exp2f.exit189 ], !dbg !44 + %3513 = phi float [ %2092, %._crit_edge ], [ %3400, %__nv_exp2f.exit189 ], !dbg !44 + %3514 = phi float [ %2093, %._crit_edge ], [ %3401, %__nv_exp2f.exit189 ], !dbg !44 + %3515 = phi float [ %2094, %._crit_edge ], [ %3402, %__nv_exp2f.exit189 ], !dbg !44 + %3516 = phi float [ %2095, %._crit_edge ], [ %3403, %__nv_exp2f.exit189 ], !dbg !44 + %3517 = phi float [ %2096, %._crit_edge ], [ %3404, %__nv_exp2f.exit189 ], !dbg !44 + %3518 = phi float [ %2097, %._crit_edge ], [ %3405, %__nv_exp2f.exit189 ], !dbg !44 + %3519 = phi float [ %2098, %._crit_edge ], [ %3406, %__nv_exp2f.exit189 ], !dbg !44 + %3520 = phi float [ %2099, %._crit_edge ], [ %3407, %__nv_exp2f.exit189 ], !dbg !44 + %3521 = phi float [ %2100, %._crit_edge ], [ %3408, %__nv_exp2f.exit189 ], !dbg !44 + %3522 = phi float [ %2101, %._crit_edge ], [ %3409, %__nv_exp2f.exit189 ], !dbg !44 + %3523 = phi float [ %2102, %._crit_edge ], [ %3410, %__nv_exp2f.exit189 ], !dbg !44 + %3524 = phi float [ %2103, %._crit_edge ], [ %3411, %__nv_exp2f.exit189 ], !dbg !44 + %3525 = phi float [ %2104, %._crit_edge ], [ %3412, %__nv_exp2f.exit189 ], !dbg !44 + %3526 = phi float [ %2105, %._crit_edge ], [ %3413, %__nv_exp2f.exit189 ], !dbg !44 + %3527 = phi float [ %2106, %._crit_edge ], [ %3414, %__nv_exp2f.exit189 ], !dbg !44 + %3528 = phi float [ %2107, %._crit_edge ], [ %3415, %__nv_exp2f.exit189 ], !dbg !44 + %3529 = phi float [ %2108, %._crit_edge ], [ %3416, %__nv_exp2f.exit189 ], !dbg !44 + %3530 = phi float [ %2109, %._crit_edge ], [ %3417, %__nv_exp2f.exit189 ], !dbg !44 + %3531 = phi float [ %2110, %._crit_edge ], [ %3418, %__nv_exp2f.exit189 ], !dbg !44 + %3532 = phi float [ %2111, %._crit_edge ], [ %3419, %__nv_exp2f.exit189 ], !dbg !44 + %3533 = phi float [ %2112, %._crit_edge ], [ %3420, %__nv_exp2f.exit189 ], !dbg !44 + %3534 = phi float [ %2113, %._crit_edge ], [ %3421, %__nv_exp2f.exit189 ], !dbg !44 + %3535 = phi float [ %2114, %._crit_edge ], [ %3422, %__nv_exp2f.exit189 ], !dbg !44 + %3536 = phi float [ %2115, %._crit_edge ], [ %3423, %__nv_exp2f.exit189 ], !dbg !44 + %3537 = phi float [ %2116, %._crit_edge ], [ %3424, %__nv_exp2f.exit189 ], !dbg !44 + %3538 = phi float [ %2117, %._crit_edge ], [ %3425, %__nv_exp2f.exit189 ], !dbg !44 + %3539 = phi float [ %2118, %._crit_edge ], [ %3426, %__nv_exp2f.exit189 ], !dbg !44 + %3540 = phi float [ %2119, %._crit_edge ], [ %3427, %__nv_exp2f.exit189 ], !dbg !44 + %3541 = phi float [ %2120, %._crit_edge ], [ %3428, %__nv_exp2f.exit189 ], !dbg !44 + %3542 = phi float [ %2121, %._crit_edge ], [ %3429, %__nv_exp2f.exit189 ], !dbg !44 + %3543 = phi float [ %2122, %._crit_edge ], [ %3430, %__nv_exp2f.exit189 ], !dbg !44 + %3544 = phi float [ %2123, %._crit_edge ], [ %3431, %__nv_exp2f.exit189 ], !dbg !44 + %3545 = phi float [ %2124, %._crit_edge ], [ %3432, %__nv_exp2f.exit189 ], !dbg !44 + %3546 = phi float [ %2125, %._crit_edge ], [ %3433, %__nv_exp2f.exit189 ], !dbg !44 + %3547 = phi float [ %2126, %._crit_edge ], [ %3434, %__nv_exp2f.exit189 ], !dbg !44 + %3548 = phi float [ %2127, %._crit_edge ], [ %3435, %__nv_exp2f.exit189 ], !dbg !44 + %3549 = phi float [ %1993, %._crit_edge ], [ %3025, %__nv_exp2f.exit189 ], !dbg !44 + %3550 = phi float [ %1994, %._crit_edge ], [ %3026, %__nv_exp2f.exit189 ], !dbg !44 + %3551 = phi <2 x float> [ %1995, %._crit_edge ], [ %2764, %__nv_exp2f.exit189 ], !dbg !44 + %3552 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, float %3493, float %3494, float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548) #2, !dbg !119 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %3553 = fcmp oeq float %3549, 0.000000e+00, !dbg !162 + %3554 = fcmp oeq float %3550, 0.000000e+00, !dbg !162 + %3555 = select i1 %3553, float 1.000000e+00, float %3549, !dbg !163 + %3556 = select i1 %3554, float 1.000000e+00, float %3550, !dbg !163 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 0, !dbg !164 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 1, !dbg !164 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 2, !dbg !164 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 3, !dbg !164 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 4, !dbg !164 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 5, !dbg !164 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 6, !dbg !164 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 7, !dbg !164 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 8, !dbg !164 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 9, !dbg !164 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 10, !dbg !164 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 11, !dbg !164 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 12, !dbg !164 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 13, !dbg !164 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 14, !dbg !164 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 15, !dbg !164 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 16, !dbg !164 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 17, !dbg !164 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 18, !dbg !164 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 19, !dbg !164 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 20, !dbg !164 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 21, !dbg !164 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 22, !dbg !164 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 23, !dbg !164 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 24, !dbg !164 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 25, !dbg !164 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 26, !dbg !164 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 27, !dbg !164 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 28, !dbg !164 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 29, !dbg !164 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 30, !dbg !164 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 31, !dbg !164 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 32, !dbg !164 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 33, !dbg !164 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 34, !dbg !164 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 35, !dbg !164 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 36, !dbg !164 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 37, !dbg !164 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 38, !dbg !164 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 39, !dbg !164 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 40, !dbg !164 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 41, !dbg !164 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 42, !dbg !164 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 43, !dbg !164 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 44, !dbg !164 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 45, !dbg !164 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 46, !dbg !164 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 47, !dbg !164 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 48, !dbg !164 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 49, !dbg !164 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 50, !dbg !164 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 51, !dbg !164 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 52, !dbg !164 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 53, !dbg !164 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 54, !dbg !164 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 55, !dbg !164 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 56, !dbg !164 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 57, !dbg !164 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 58, !dbg !164 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 59, !dbg !164 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 60, !dbg !164 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 61, !dbg !164 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 62, !dbg !164 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 63, !dbg !164 + %3621 = tail call float @llvm.nvvm.div.full(float %3557, float %3555), !dbg !164 + %3622 = tail call float @llvm.nvvm.div.full(float %3558, float %3555), !dbg !164 + %3623 = tail call float @llvm.nvvm.div.full(float %3559, float %3556), !dbg !164 + %3624 = tail call float @llvm.nvvm.div.full(float %3560, float %3556), !dbg !164 + %3625 = tail call float @llvm.nvvm.div.full(float %3561, float %3555), !dbg !164 + %3626 = tail call float @llvm.nvvm.div.full(float %3562, float %3555), !dbg !164 + %3627 = tail call float @llvm.nvvm.div.full(float %3563, float %3556), !dbg !164 + %3628 = tail call float @llvm.nvvm.div.full(float %3564, float %3556), !dbg !164 + %3629 = tail call float @llvm.nvvm.div.full(float %3565, float %3555), !dbg !164 + %3630 = tail call float @llvm.nvvm.div.full(float %3566, float %3555), !dbg !164 + %3631 = tail call float @llvm.nvvm.div.full(float %3567, float %3556), !dbg !164 + %3632 = tail call float @llvm.nvvm.div.full(float %3568, float %3556), !dbg !164 + %3633 = tail call float @llvm.nvvm.div.full(float %3569, float %3555), !dbg !164 + %3634 = tail call float @llvm.nvvm.div.full(float %3570, float %3555), !dbg !164 + %3635 = tail call float @llvm.nvvm.div.full(float %3571, float %3556), !dbg !164 + %3636 = tail call float @llvm.nvvm.div.full(float %3572, float %3556), !dbg !164 + %3637 = tail call float @llvm.nvvm.div.full(float %3573, float %3555), !dbg !164 + %3638 = tail call float @llvm.nvvm.div.full(float %3574, float %3555), !dbg !164 + %3639 = tail call float @llvm.nvvm.div.full(float %3575, float %3556), !dbg !164 + %3640 = tail call float @llvm.nvvm.div.full(float %3576, float %3556), !dbg !164 + %3641 = tail call float @llvm.nvvm.div.full(float %3577, float %3555), !dbg !164 + %3642 = tail call float @llvm.nvvm.div.full(float %3578, float %3555), !dbg !164 + %3643 = tail call float @llvm.nvvm.div.full(float %3579, float %3556), !dbg !164 + %3644 = tail call float @llvm.nvvm.div.full(float %3580, float %3556), !dbg !164 + %3645 = tail call float @llvm.nvvm.div.full(float %3581, float %3555), !dbg !164 + %3646 = tail call float @llvm.nvvm.div.full(float %3582, float %3555), !dbg !164 + %3647 = tail call float @llvm.nvvm.div.full(float %3583, float %3556), !dbg !164 + %3648 = tail call float @llvm.nvvm.div.full(float %3584, float %3556), !dbg !164 + %3649 = tail call float @llvm.nvvm.div.full(float %3585, float %3555), !dbg !164 + %3650 = tail call float @llvm.nvvm.div.full(float %3586, float %3555), !dbg !164 + %3651 = tail call float @llvm.nvvm.div.full(float %3587, float %3556), !dbg !164 + %3652 = tail call float @llvm.nvvm.div.full(float %3588, float %3556), !dbg !164 + %3653 = tail call float @llvm.nvvm.div.full(float %3589, float %3555), !dbg !164 + %3654 = tail call float @llvm.nvvm.div.full(float %3590, float %3555), !dbg !164 + %3655 = tail call float @llvm.nvvm.div.full(float %3591, float %3556), !dbg !164 + %3656 = tail call float @llvm.nvvm.div.full(float %3592, float %3556), !dbg !164 + %3657 = tail call float @llvm.nvvm.div.full(float %3593, float %3555), !dbg !164 + %3658 = tail call float @llvm.nvvm.div.full(float %3594, float %3555), !dbg !164 + %3659 = tail call float @llvm.nvvm.div.full(float %3595, float %3556), !dbg !164 + %3660 = tail call float @llvm.nvvm.div.full(float %3596, float %3556), !dbg !164 + %3661 = tail call float @llvm.nvvm.div.full(float %3597, float %3555), !dbg !164 + %3662 = tail call float @llvm.nvvm.div.full(float %3598, float %3555), !dbg !164 + %3663 = tail call float @llvm.nvvm.div.full(float %3599, float %3556), !dbg !164 + %3664 = tail call float @llvm.nvvm.div.full(float %3600, float %3556), !dbg !164 + %3665 = tail call float @llvm.nvvm.div.full(float %3601, float %3555), !dbg !164 + %3666 = tail call float @llvm.nvvm.div.full(float %3602, float %3555), !dbg !164 + %3667 = tail call float @llvm.nvvm.div.full(float %3603, float %3556), !dbg !164 + %3668 = tail call float @llvm.nvvm.div.full(float %3604, float %3556), !dbg !164 + %3669 = tail call float @llvm.nvvm.div.full(float %3605, float %3555), !dbg !164 + %3670 = tail call float @llvm.nvvm.div.full(float %3606, float %3555), !dbg !164 + %3671 = tail call float @llvm.nvvm.div.full(float %3607, float %3556), !dbg !164 + %3672 = tail call float @llvm.nvvm.div.full(float %3608, float %3556), !dbg !164 + %3673 = tail call float @llvm.nvvm.div.full(float %3609, float %3555), !dbg !164 + %3674 = tail call float @llvm.nvvm.div.full(float %3610, float %3555), !dbg !164 + %3675 = tail call float @llvm.nvvm.div.full(float %3611, float %3556), !dbg !164 + %3676 = tail call float @llvm.nvvm.div.full(float %3612, float %3556), !dbg !164 + %3677 = tail call float @llvm.nvvm.div.full(float %3613, float %3555), !dbg !164 + %3678 = tail call float @llvm.nvvm.div.full(float %3614, float %3555), !dbg !164 + %3679 = tail call float @llvm.nvvm.div.full(float %3615, float %3556), !dbg !164 + %3680 = tail call float @llvm.nvvm.div.full(float %3616, float %3556), !dbg !164 + %3681 = tail call float @llvm.nvvm.div.full(float %3617, float %3555), !dbg !164 + %3682 = tail call float @llvm.nvvm.div.full(float %3618, float %3555), !dbg !164 + %3683 = tail call float @llvm.nvvm.div.full(float %3619, float %3556), !dbg !164 + %3684 = tail call float @llvm.nvvm.div.full(float %3620, float %3556), !dbg !164 + %3685 = icmp slt i32 %48, 2048, !dbg !165 + %3686 = icmp slt i32 %49, 2048, !dbg !165 + %3687 = icmp slt i32 %50, 2048, !dbg !165 + %3688 = icmp slt i32 %51, 2048, !dbg !165 + %3689 = icmp slt i32 %52, 2048, !dbg !165 + %3690 = icmp slt i32 %53, 2048, !dbg !165 + %3691 = icmp slt i32 %54, 2048, !dbg !165 + %3692 = icmp slt i32 %55, 2048, !dbg !165 + %3693 = or disjoint i32 %83, %19, !dbg !166 + %3694 = or disjoint i32 %3693, %18, !dbg !167 + %3695 = add i32 %3694, %58, !dbg !168 + %3696 = add i32 %3694, %59, !dbg !168 + %3697 = add i32 %3694, %60, !dbg !168 + %3698 = add i32 %3694, %61, !dbg !168 + %3699 = add i32 %3694, %62, !dbg !168 + %3700 = add i32 %3694, %63, !dbg !168 + %3701 = add i32 %3694, %64, !dbg !168 + %3702 = add i32 %3694, %65, !dbg !168 + %3703 = sext i32 %3695 to i64, !dbg !169 + %3704 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3703, !dbg !169 + %3705 = sext i32 %3696 to i64, !dbg !169 + %3706 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3705, !dbg !169 + %3707 = sext i32 %3697 to i64, !dbg !169 + %3708 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3707, !dbg !169 + %3709 = sext i32 %3698 to i64, !dbg !169 + %3710 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3709, !dbg !169 + %3711 = sext i32 %3699 to i64, !dbg !169 + %3712 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3711, !dbg !169 + %3713 = sext i32 %3700 to i64, !dbg !169 + %3714 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3713, !dbg !169 + %3715 = sext i32 %3701 to i64, !dbg !169 + %3716 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3715, !dbg !169 + %3717 = sext i32 %3702 to i64, !dbg !169 + %3718 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3717, !dbg !169 + %3719 = insertelement <2 x float> poison, float %3621, i64 0, !dbg !170 + %3720 = insertelement <2 x float> %3719, float %3622, i64 1, !dbg !170 + %3721 = fptrunc <2 x float> %3720 to <2 x bfloat>, !dbg !170 + %3722 = insertelement <2 x float> poison, float %3623, i64 0, !dbg !170 + %3723 = insertelement <2 x float> %3722, float %3624, i64 1, !dbg !170 + %3724 = fptrunc <2 x float> %3723 to <2 x bfloat>, !dbg !170 + %3725 = insertelement <2 x float> poison, float %3625, i64 0, !dbg !170 + %3726 = insertelement <2 x float> %3725, float %3626, i64 1, !dbg !170 + %3727 = fptrunc <2 x float> %3726 to <2 x bfloat>, !dbg !170 + %3728 = insertelement <2 x float> poison, float %3627, i64 0, !dbg !170 + %3729 = insertelement <2 x float> %3728, float %3628, i64 1, !dbg !170 + %3730 = fptrunc <2 x float> %3729 to <2 x bfloat>, !dbg !170 + %3731 = insertelement <2 x float> poison, float %3629, i64 0, !dbg !170 + %3732 = insertelement <2 x float> %3731, float %3630, i64 1, !dbg !170 + %3733 = fptrunc <2 x float> %3732 to <2 x bfloat>, !dbg !170 + %3734 = insertelement <2 x float> poison, float %3631, i64 0, !dbg !170 + %3735 = insertelement <2 x float> %3734, float %3632, i64 1, !dbg !170 + %3736 = fptrunc <2 x float> %3735 to <2 x bfloat>, !dbg !170 + %3737 = insertelement <2 x float> poison, float %3633, i64 0, !dbg !170 + %3738 = insertelement <2 x float> %3737, float %3634, i64 1, !dbg !170 + %3739 = fptrunc <2 x float> %3738 to <2 x bfloat>, !dbg !170 + %3740 = insertelement <2 x float> poison, float %3635, i64 0, !dbg !170 + %3741 = insertelement <2 x float> %3740, float %3636, i64 1, !dbg !170 + %3742 = fptrunc <2 x float> %3741 to <2 x bfloat>, !dbg !170 + %3743 = insertelement <2 x float> poison, float %3637, i64 0, !dbg !170 + %3744 = insertelement <2 x float> %3743, float %3638, i64 1, !dbg !170 + %3745 = fptrunc <2 x float> %3744 to <2 x bfloat>, !dbg !170 + %3746 = insertelement <2 x float> poison, float %3639, i64 0, !dbg !170 + %3747 = insertelement <2 x float> %3746, float %3640, i64 1, !dbg !170 + %3748 = fptrunc <2 x float> %3747 to <2 x bfloat>, !dbg !170 + %3749 = insertelement <2 x float> poison, float %3641, i64 0, !dbg !170 + %3750 = insertelement <2 x float> %3749, float %3642, i64 1, !dbg !170 + %3751 = fptrunc <2 x float> %3750 to <2 x bfloat>, !dbg !170 + %3752 = insertelement <2 x float> poison, float %3643, i64 0, !dbg !170 + %3753 = insertelement <2 x float> %3752, float %3644, i64 1, !dbg !170 + %3754 = fptrunc <2 x float> %3753 to <2 x bfloat>, !dbg !170 + %3755 = insertelement <2 x float> poison, float %3645, i64 0, !dbg !170 + %3756 = insertelement <2 x float> %3755, float %3646, i64 1, !dbg !170 + %3757 = fptrunc <2 x float> %3756 to <2 x bfloat>, !dbg !170 + %3758 = insertelement <2 x float> poison, float %3647, i64 0, !dbg !170 + %3759 = insertelement <2 x float> %3758, float %3648, i64 1, !dbg !170 + %3760 = fptrunc <2 x float> %3759 to <2 x bfloat>, !dbg !170 + %3761 = insertelement <2 x float> poison, float %3649, i64 0, !dbg !170 + %3762 = insertelement <2 x float> %3761, float %3650, i64 1, !dbg !170 + %3763 = fptrunc <2 x float> %3762 to <2 x bfloat>, !dbg !170 + %3764 = insertelement <2 x float> poison, float %3651, i64 0, !dbg !170 + %3765 = insertelement <2 x float> %3764, float %3652, i64 1, !dbg !170 + %3766 = fptrunc <2 x float> %3765 to <2 x bfloat>, !dbg !170 + %3767 = insertelement <2 x float> poison, float %3653, i64 0, !dbg !170 + %3768 = insertelement <2 x float> %3767, float %3654, i64 1, !dbg !170 + %3769 = fptrunc <2 x float> %3768 to <2 x bfloat>, !dbg !170 + %3770 = insertelement <2 x float> poison, float %3655, i64 0, !dbg !170 + %3771 = insertelement <2 x float> %3770, float %3656, i64 1, !dbg !170 + %3772 = fptrunc <2 x float> %3771 to <2 x bfloat>, !dbg !170 + %3773 = insertelement <2 x float> poison, float %3657, i64 0, !dbg !170 + %3774 = insertelement <2 x float> %3773, float %3658, i64 1, !dbg !170 + %3775 = fptrunc <2 x float> %3774 to <2 x bfloat>, !dbg !170 + %3776 = insertelement <2 x float> poison, float %3659, i64 0, !dbg !170 + %3777 = insertelement <2 x float> %3776, float %3660, i64 1, !dbg !170 + %3778 = fptrunc <2 x float> %3777 to <2 x bfloat>, !dbg !170 + %3779 = insertelement <2 x float> poison, float %3661, i64 0, !dbg !170 + %3780 = insertelement <2 x float> %3779, float %3662, i64 1, !dbg !170 + %3781 = fptrunc <2 x float> %3780 to <2 x bfloat>, !dbg !170 + %3782 = insertelement <2 x float> poison, float %3663, i64 0, !dbg !170 + %3783 = insertelement <2 x float> %3782, float %3664, i64 1, !dbg !170 + %3784 = fptrunc <2 x float> %3783 to <2 x bfloat>, !dbg !170 + %3785 = insertelement <2 x float> poison, float %3665, i64 0, !dbg !170 + %3786 = insertelement <2 x float> %3785, float %3666, i64 1, !dbg !170 + %3787 = fptrunc <2 x float> %3786 to <2 x bfloat>, !dbg !170 + %3788 = insertelement <2 x float> poison, float %3667, i64 0, !dbg !170 + %3789 = insertelement <2 x float> %3788, float %3668, i64 1, !dbg !170 + %3790 = fptrunc <2 x float> %3789 to <2 x bfloat>, !dbg !170 + %3791 = insertelement <2 x float> poison, float %3669, i64 0, !dbg !170 + %3792 = insertelement <2 x float> %3791, float %3670, i64 1, !dbg !170 + %3793 = fptrunc <2 x float> %3792 to <2 x bfloat>, !dbg !170 + %3794 = insertelement <2 x float> poison, float %3671, i64 0, !dbg !170 + %3795 = insertelement <2 x float> %3794, float %3672, i64 1, !dbg !170 + %3796 = fptrunc <2 x float> %3795 to <2 x bfloat>, !dbg !170 + %3797 = insertelement <2 x float> poison, float %3673, i64 0, !dbg !170 + %3798 = insertelement <2 x float> %3797, float %3674, i64 1, !dbg !170 + %3799 = fptrunc <2 x float> %3798 to <2 x bfloat>, !dbg !170 + %3800 = insertelement <2 x float> poison, float %3675, i64 0, !dbg !170 + %3801 = insertelement <2 x float> %3800, float %3676, i64 1, !dbg !170 + %3802 = fptrunc <2 x float> %3801 to <2 x bfloat>, !dbg !170 + %3803 = insertelement <2 x float> poison, float %3677, i64 0, !dbg !170 + %3804 = insertelement <2 x float> %3803, float %3678, i64 1, !dbg !170 + %3805 = fptrunc <2 x float> %3804 to <2 x bfloat>, !dbg !170 + %3806 = insertelement <2 x float> poison, float %3679, i64 0, !dbg !170 + %3807 = insertelement <2 x float> %3806, float %3680, i64 1, !dbg !170 + %3808 = fptrunc <2 x float> %3807 to <2 x bfloat>, !dbg !170 + %3809 = insertelement <2 x float> poison, float %3681, i64 0, !dbg !170 + %3810 = insertelement <2 x float> %3809, float %3682, i64 1, !dbg !170 + %3811 = fptrunc <2 x float> %3810 to <2 x bfloat>, !dbg !170 + %3812 = insertelement <2 x float> poison, float %3683, i64 0, !dbg !170 + %3813 = insertelement <2 x float> %3812, float %3684, i64 1, !dbg !170 + %3814 = fptrunc <2 x float> %3813 to <2 x bfloat>, !dbg !170 + %3815 = shl nuw nsw i32 %190, 13, !dbg !170 + %3816 = shl nuw nsw i32 %36, 5, !dbg !170 + %3817 = and i32 %3816, 7264, !dbg !170 + %3818 = and i32 %36, 24, !dbg !170 + %3819 = shl nuw nsw i32 %3818, 4, !dbg !170 + %3820 = shl nuw nsw i32 %36, 2, !dbg !170 + %3821 = and i32 %3820, 16, !dbg !170 + %3822 = or disjoint i32 %3815, %3821, !dbg !170 + %3823 = or disjoint i32 %3817, %3819, !dbg !170 + %3824 = or disjoint i32 %3822, %3823, !dbg !170 + %3825 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3824, !dbg !170 + %3826 = bitcast <2 x bfloat> %3721 to i32, !dbg !170 + %3827 = bitcast <2 x bfloat> %3727 to i32, !dbg !170 + %3828 = bitcast <2 x bfloat> %3733 to i32, !dbg !170 + %3829 = bitcast <2 x bfloat> %3739 to i32, !dbg !170 + %3830 = insertelement <4 x i32> poison, i32 %3826, i64 0, !dbg !170 + %3831 = insertelement <4 x i32> %3830, i32 %3827, i64 1, !dbg !170 + %3832 = insertelement <4 x i32> %3831, i32 %3828, i64 2, !dbg !170 + %3833 = insertelement <4 x i32> %3832, i32 %3829, i64 3, !dbg !170 + store <4 x i32> %3833, ptr addrspace(3) %3825, align 16, !dbg !170 + %3834 = getelementptr inbounds nuw i8, ptr addrspace(3) %3825, i32 512, !dbg !170 + %3835 = bitcast <2 x bfloat> %3724 to i32, !dbg !170 + %3836 = bitcast <2 x bfloat> %3730 to i32, !dbg !170 + %3837 = bitcast <2 x bfloat> %3736 to i32, !dbg !170 + %3838 = bitcast <2 x bfloat> %3742 to i32, !dbg !170 + %3839 = insertelement <4 x i32> poison, i32 %3835, i64 0, !dbg !170 + %3840 = insertelement <4 x i32> %3839, i32 %3836, i64 1, !dbg !170 + %3841 = insertelement <4 x i32> %3840, i32 %3837, i64 2, !dbg !170 + %3842 = insertelement <4 x i32> %3841, i32 %3838, i64 3, !dbg !170 + store <4 x i32> %3842, ptr addrspace(3) %3834, align 16, !dbg !170 + %3843 = xor i32 %3824, 32, !dbg !170 + %3844 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3843, !dbg !170 + %3845 = bitcast <2 x bfloat> %3745 to i32, !dbg !170 + %3846 = bitcast <2 x bfloat> %3751 to i32, !dbg !170 + %3847 = bitcast <2 x bfloat> %3757 to i32, !dbg !170 + %3848 = bitcast <2 x bfloat> %3763 to i32, !dbg !170 + %3849 = insertelement <4 x i32> poison, i32 %3845, i64 0, !dbg !170 + %3850 = insertelement <4 x i32> %3849, i32 %3846, i64 1, !dbg !170 + %3851 = insertelement <4 x i32> %3850, i32 %3847, i64 2, !dbg !170 + %3852 = insertelement <4 x i32> %3851, i32 %3848, i64 3, !dbg !170 + store <4 x i32> %3852, ptr addrspace(3) %3844, align 16, !dbg !170 + %3853 = getelementptr inbounds nuw i8, ptr addrspace(3) %3844, i32 512, !dbg !170 + %3854 = bitcast <2 x bfloat> %3748 to i32, !dbg !170 + %3855 = bitcast <2 x bfloat> %3754 to i32, !dbg !170 + %3856 = bitcast <2 x bfloat> %3760 to i32, !dbg !170 + %3857 = bitcast <2 x bfloat> %3766 to i32, !dbg !170 + %3858 = insertelement <4 x i32> poison, i32 %3854, i64 0, !dbg !170 + %3859 = insertelement <4 x i32> %3858, i32 %3855, i64 1, !dbg !170 + %3860 = insertelement <4 x i32> %3859, i32 %3856, i64 2, !dbg !170 + %3861 = insertelement <4 x i32> %3860, i32 %3857, i64 3, !dbg !170 + store <4 x i32> %3861, ptr addrspace(3) %3853, align 16, !dbg !170 + %3862 = xor i32 %3824, 64, !dbg !170 + %3863 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3862, !dbg !170 + %3864 = bitcast <2 x bfloat> %3769 to i32, !dbg !170 + %3865 = bitcast <2 x bfloat> %3775 to i32, !dbg !170 + %3866 = bitcast <2 x bfloat> %3781 to i32, !dbg !170 + %3867 = bitcast <2 x bfloat> %3787 to i32, !dbg !170 + %3868 = insertelement <4 x i32> poison, i32 %3864, i64 0, !dbg !170 + %3869 = insertelement <4 x i32> %3868, i32 %3865, i64 1, !dbg !170 + %3870 = insertelement <4 x i32> %3869, i32 %3866, i64 2, !dbg !170 + %3871 = insertelement <4 x i32> %3870, i32 %3867, i64 3, !dbg !170 + store <4 x i32> %3871, ptr addrspace(3) %3863, align 16, !dbg !170 + %3872 = getelementptr inbounds nuw i8, ptr addrspace(3) %3863, i32 512, !dbg !170 + %3873 = bitcast <2 x bfloat> %3772 to i32, !dbg !170 + %3874 = bitcast <2 x bfloat> %3778 to i32, !dbg !170 + %3875 = bitcast <2 x bfloat> %3784 to i32, !dbg !170 + %3876 = bitcast <2 x bfloat> %3790 to i32, !dbg !170 + %3877 = insertelement <4 x i32> poison, i32 %3873, i64 0, !dbg !170 + %3878 = insertelement <4 x i32> %3877, i32 %3874, i64 1, !dbg !170 + %3879 = insertelement <4 x i32> %3878, i32 %3875, i64 2, !dbg !170 + %3880 = insertelement <4 x i32> %3879, i32 %3876, i64 3, !dbg !170 + store <4 x i32> %3880, ptr addrspace(3) %3872, align 16, !dbg !170 + %3881 = xor i32 %3824, 96, !dbg !170 + %3882 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3881, !dbg !170 + %3883 = bitcast <2 x bfloat> %3793 to i32, !dbg !170 + %3884 = bitcast <2 x bfloat> %3799 to i32, !dbg !170 + %3885 = bitcast <2 x bfloat> %3805 to i32, !dbg !170 + %3886 = bitcast <2 x bfloat> %3811 to i32, !dbg !170 + %3887 = insertelement <4 x i32> poison, i32 %3883, i64 0, !dbg !170 + %3888 = insertelement <4 x i32> %3887, i32 %3884, i64 1, !dbg !170 + %3889 = insertelement <4 x i32> %3888, i32 %3885, i64 2, !dbg !170 + %3890 = insertelement <4 x i32> %3889, i32 %3886, i64 3, !dbg !170 + store <4 x i32> %3890, ptr addrspace(3) %3882, align 16, !dbg !170 + %3891 = getelementptr inbounds nuw i8, ptr addrspace(3) %3882, i32 512, !dbg !170 + %3892 = bitcast <2 x bfloat> %3796 to i32, !dbg !170 + %3893 = bitcast <2 x bfloat> %3802 to i32, !dbg !170 + %3894 = bitcast <2 x bfloat> %3808 to i32, !dbg !170 + %3895 = bitcast <2 x bfloat> %3814 to i32, !dbg !170 + %3896 = insertelement <4 x i32> poison, i32 %3892, i64 0, !dbg !170 + %3897 = insertelement <4 x i32> %3896, i32 %3893, i64 1, !dbg !170 + %3898 = insertelement <4 x i32> %3897, i32 %3894, i64 2, !dbg !170 + %3899 = insertelement <4 x i32> %3898, i32 %3895, i64 3, !dbg !170 + store <4 x i32> %3899, ptr addrspace(3) %3891, align 16, !dbg !170 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !170 + %3900 = shl nuw nsw i32 %3818, 10, !dbg !170 + %3901 = shl nuw nsw i32 %190, 5, !dbg !170 + %3902 = and i32 %36, 252, !dbg !170 + %3903 = shl nuw nsw i32 %3902, 2, !dbg !170 + %3904 = or disjoint i32 %3900, %3901, !dbg !170 + %3905 = xor i32 %3904, %3903, !dbg !170 + %3906 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3905, !dbg !170 + %3907 = ptrtoint ptr addrspace(3) %3906 to i32, !dbg !170 + %3908 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3907) #2, !dbg !170 + %3909 = extractvalue { i32, i32, i32, i32 } %3908, 0, !dbg !170 + %3910 = extractvalue { i32, i32, i32, i32 } %3908, 1, !dbg !170 + %3911 = extractvalue { i32, i32, i32, i32 } %3908, 2, !dbg !170 + %3912 = extractvalue { i32, i32, i32, i32 } %3908, 3, !dbg !170 + %3913 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 1024, !dbg !170 + %3914 = ptrtoint ptr addrspace(3) %3913 to i32, !dbg !170 + %3915 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3914) #2, !dbg !170 + %3916 = extractvalue { i32, i32, i32, i32 } %3915, 0, !dbg !170 + %3917 = extractvalue { i32, i32, i32, i32 } %3915, 1, !dbg !170 + %3918 = extractvalue { i32, i32, i32, i32 } %3915, 2, !dbg !170 + %3919 = extractvalue { i32, i32, i32, i32 } %3915, 3, !dbg !170 + %3920 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 2048, !dbg !170 + %3921 = ptrtoint ptr addrspace(3) %3920 to i32, !dbg !170 + %3922 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3921) #2, !dbg !170 + %3923 = extractvalue { i32, i32, i32, i32 } %3922, 0, !dbg !170 + %3924 = extractvalue { i32, i32, i32, i32 } %3922, 1, !dbg !170 + %3925 = extractvalue { i32, i32, i32, i32 } %3922, 2, !dbg !170 + %3926 = extractvalue { i32, i32, i32, i32 } %3922, 3, !dbg !170 + %3927 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 3072, !dbg !170 + %3928 = ptrtoint ptr addrspace(3) %3927 to i32, !dbg !170 + %3929 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3928) #2, !dbg !170 + %3930 = extractvalue { i32, i32, i32, i32 } %3929, 0, !dbg !170 + %3931 = extractvalue { i32, i32, i32, i32 } %3929, 1, !dbg !170 + %3932 = extractvalue { i32, i32, i32, i32 } %3929, 2, !dbg !170 + %3933 = extractvalue { i32, i32, i32, i32 } %3929, 3, !dbg !170 + %3934 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 4096, !dbg !170 + %3935 = ptrtoint ptr addrspace(3) %3934 to i32, !dbg !170 + %3936 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3935) #2, !dbg !170 + %3937 = extractvalue { i32, i32, i32, i32 } %3936, 0, !dbg !170 + %3938 = extractvalue { i32, i32, i32, i32 } %3936, 1, !dbg !170 + %3939 = extractvalue { i32, i32, i32, i32 } %3936, 2, !dbg !170 + %3940 = extractvalue { i32, i32, i32, i32 } %3936, 3, !dbg !170 + %3941 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 5120, !dbg !170 + %3942 = ptrtoint ptr addrspace(3) %3941 to i32, !dbg !170 + %3943 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3942) #2, !dbg !170 + %3944 = extractvalue { i32, i32, i32, i32 } %3943, 0, !dbg !170 + %3945 = extractvalue { i32, i32, i32, i32 } %3943, 1, !dbg !170 + %3946 = extractvalue { i32, i32, i32, i32 } %3943, 2, !dbg !170 + %3947 = extractvalue { i32, i32, i32, i32 } %3943, 3, !dbg !170 + %3948 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 6144, !dbg !170 + %3949 = ptrtoint ptr addrspace(3) %3948 to i32, !dbg !170 + %3950 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3949) #2, !dbg !170 + %3951 = extractvalue { i32, i32, i32, i32 } %3950, 0, !dbg !170 + %3952 = extractvalue { i32, i32, i32, i32 } %3950, 1, !dbg !170 + %3953 = extractvalue { i32, i32, i32, i32 } %3950, 2, !dbg !170 + %3954 = extractvalue { i32, i32, i32, i32 } %3950, 3, !dbg !170 + %3955 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 7168, !dbg !170 + %3956 = ptrtoint ptr addrspace(3) %3955 to i32, !dbg !170 + %3957 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3956) #2, !dbg !170 + %3958 = extractvalue { i32, i32, i32, i32 } %3957, 0, !dbg !170 + %3959 = extractvalue { i32, i32, i32, i32 } %3957, 1, !dbg !170 + %3960 = extractvalue { i32, i32, i32, i32 } %3957, 2, !dbg !170 + %3961 = extractvalue { i32, i32, i32, i32 } %3957, 3, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3909, i32 %3910, i32 %3911, i32 %3912, ptr addrspace(1) %3704, i1 %3685) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3916, i32 %3917, i32 %3918, i32 %3919, ptr addrspace(1) %3706, i1 %3686) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3923, i32 %3924, i32 %3925, i32 %3926, ptr addrspace(1) %3708, i1 %3687) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3930, i32 %3931, i32 %3932, i32 %3933, ptr addrspace(1) %3710, i1 %3688) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3937, i32 %3938, i32 %3939, i32 %3940, ptr addrspace(1) %3712, i1 %3689) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3944, i32 %3945, i32 %3946, i32 %3947, ptr addrspace(1) %3714, i1 %3690) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3951, i32 %3952, i32 %3953, i32 %3954, ptr addrspace(1) %3716, i1 %3691) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3958, i32 %3959, i32 %3960, i32 %3961, ptr addrspace(1) %3718, i1 %3692) #2, !dbg !170 + %3962 = fcmp olt float %3555, 0x3810000000000000, !dbg !171 + %3963 = fmul float %3555, 0x4160000000000000, !dbg !171 + %.02.i = select i1 %3962, float %3963, float %3555, !dbg !171 + %i.i.0.i = select i1 %3962, float -2.300000e+01, float 0.000000e+00, !dbg !171 + %3964 = bitcast float %.02.i to i32, !dbg !171 + %3965 = add i32 %3964, -1060439283, !dbg !171 + %3966 = and i32 %3965, -8388608, !dbg !171 + %3967 = sub i32 %3964, %3966, !dbg !171 + %3968 = bitcast i32 %3967 to float, !dbg !171 + %3969 = sitofp i32 %3966 to float, !dbg !171 + %3970 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not.i292 = icmp eq i32 %3970, 0, !dbg !171 + %3971 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3969, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !171 + %3972 = tail call float @llvm.nvvm.fma.rn.f(float %3969, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !171 + %.08.i = select i1 %.not.i292, float %3972, float %3971, !dbg !171 + %3973 = fadd float %3968, -1.000000e+00, !dbg !171 + %3974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not1.i = icmp eq i32 %3974, 0, !dbg !171 + %3975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %3973, float 0xBFC58FE600000000) #2, !dbg !171 + %3976 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %3973, float 0xBFC58FE600000000) #2, !dbg !171 + %.010.i = select i1 %.not1.i, float %3976, float %3975, !dbg !171 + %3977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not2.i = icmp eq i32 %3977, 0, !dbg !171 + %3978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %3973, float 0x3FC5F9E540000000) #2, !dbg !171 + %3979 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %3973, float 0x3FC5F9E540000000) #2, !dbg !171 + %.011.i = select i1 %.not2.i, float %3979, float %3978, !dbg !171 + %3980 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not3.i = icmp eq i32 %3980, 0, !dbg !171 + %3981 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %3973, float 0xBFC6E9C860000000) #2, !dbg !171 + %3982 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %3973, float 0xBFC6E9C860000000) #2, !dbg !171 + %.012.i = select i1 %.not3.i, float %3982, float %3981, !dbg !171 + %3983 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not4.i = icmp eq i32 %3983, 0, !dbg !171 + %3984 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %3973, float 0x3FCA417E80000000) #2, !dbg !171 + %3985 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %3973, float 0x3FCA417E80000000) #2, !dbg !171 + %.09.i = select i1 %.not4.i, float %3985, float %3984, !dbg !171 + %3986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not5.i = icmp eq i32 %3986, 0, !dbg !171 + %3987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %3973, float 0xBFCEC79160000000) #2, !dbg !171 + %3988 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %3973, float 0xBFCEC79160000000) #2, !dbg !171 + %.05.i = select i1 %.not5.i, float %3988, float %3987, !dbg !171 + %3989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not6.i = icmp eq i32 %3989, 0, !dbg !171 + %3990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %3973, float 0x3FD277F320000000) #2, !dbg !171 + %3991 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %3973, float 0x3FD277F320000000) #2, !dbg !171 + %.01.i = select i1 %.not6.i, float %3991, float %3990, !dbg !171 + %3992 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not7.i = icmp eq i32 %3992, 0, !dbg !171 + %3993 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %3973, float 0xBFD7154920000000) #2, !dbg !171 + %3994 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %3973, float 0xBFD7154920000000) #2, !dbg !171 + %.0.i293 = select i1 %.not7.i, float %3994, float %3993, !dbg !171 + %3995 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not8.i = icmp eq i32 %3995, 0, !dbg !171 + %3996 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i293, float %3973, float 0x3FDEC70940000000) #2, !dbg !171 + %3997 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i293, float %3973, float 0x3FDEC70940000000) #2, !dbg !171 + %.07.i = select i1 %.not8.i, float %3997, float %3996, !dbg !171 + %3998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not9.i = icmp eq i32 %3998, 0, !dbg !171 + %3999 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %3973, float 0xBFE7154760000000) #2, !dbg !171 + %4000 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %3973, float 0xBFE7154760000000) #2, !dbg !171 + %.06.i = select i1 %.not9.i, float %4000, float %3999, !dbg !171 + %4001 = fmul float %3973, %.06.i, !dbg !171 + %4002 = fmul float %3973, %4001, !dbg !171 + %4003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not10.i = icmp eq i32 %4003, 0, !dbg !171 + %4004 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3973, float 0x3FF7154760000000, float %4002) #2, !dbg !171 + %4005 = tail call float @llvm.nvvm.fma.rn.f(float %3973, float 0x3FF7154760000000, float %4002) #2, !dbg !171 + %.04.i = select i1 %.not10.i, float %4005, float %4004, !dbg !171 + %4006 = fadd float %.08.i, %.04.i, !dbg !171 + %4007 = icmp ugt i32 %3964, 2139095039, !dbg !171 + br i1 %4007, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !171 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge379 + %4008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not11.i = icmp eq i32 %4008, 0, !dbg !171 + %4009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %4010 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %.03.i = select i1 %.not11.i, float %4010, float %4009, !dbg !171 + br label %__nv_log2f.exit, !dbg !171 + +__nv_log2f.exit: ; preds = %._crit_edge379, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %4006, %._crit_edge379 ], !dbg !171 + %4011 = fcmp olt float %3556, 0x3810000000000000, !dbg !171 + %4012 = fmul float %3556, 0x4160000000000000, !dbg !171 + %.02.i294 = select i1 %4011, float %4012, float %3556, !dbg !171 + %i.i.0.i295 = select i1 %4011, float -2.300000e+01, float 0.000000e+00, !dbg !171 + %4013 = bitcast float %.02.i294 to i32, !dbg !171 + %4014 = add i32 %4013, -1060439283, !dbg !171 + %4015 = and i32 %4014, -8388608, !dbg !171 + %4016 = sub i32 %4013, %4015, !dbg !171 + %4017 = bitcast i32 %4016 to float, !dbg !171 + %4018 = sitofp i32 %4015 to float, !dbg !171 + %4019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not.i296 = icmp eq i32 %4019, 0, !dbg !171 + %4020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4018, float 0x3E80000000000000, float %i.i.0.i295) #2, !dbg !171 + %4021 = tail call float @llvm.nvvm.fma.rn.f(float %4018, float 0x3E80000000000000, float %i.i.0.i295) #2, !dbg !171 + %.08.i297 = select i1 %.not.i296, float %4021, float %4020, !dbg !171 + %4022 = fadd float %4017, -1.000000e+00, !dbg !171 + %4023 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not1.i298 = icmp eq i32 %4023, 0, !dbg !171 + %4024 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4022, float 0xBFC58FE600000000) #2, !dbg !171 + %4025 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4022, float 0xBFC58FE600000000) #2, !dbg !171 + %.010.i299 = select i1 %.not1.i298, float %4025, float %4024, !dbg !171 + %4026 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not2.i300 = icmp eq i32 %4026, 0, !dbg !171 + %4027 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i299, float %4022, float 0x3FC5F9E540000000) #2, !dbg !171 + %4028 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i299, float %4022, float 0x3FC5F9E540000000) #2, !dbg !171 + %.011.i301 = select i1 %.not2.i300, float %4028, float %4027, !dbg !171 + %4029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not3.i302 = icmp eq i32 %4029, 0, !dbg !171 + %4030 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i301, float %4022, float 0xBFC6E9C860000000) #2, !dbg !171 + %4031 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i301, float %4022, float 0xBFC6E9C860000000) #2, !dbg !171 + %.012.i303 = select i1 %.not3.i302, float %4031, float %4030, !dbg !171 + %4032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not4.i304 = icmp eq i32 %4032, 0, !dbg !171 + %4033 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i303, float %4022, float 0x3FCA417E80000000) #2, !dbg !171 + %4034 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i303, float %4022, float 0x3FCA417E80000000) #2, !dbg !171 + %.09.i305 = select i1 %.not4.i304, float %4034, float %4033, !dbg !171 + %4035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not5.i306 = icmp eq i32 %4035, 0, !dbg !171 + %4036 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i305, float %4022, float 0xBFCEC79160000000) #2, !dbg !171 + %4037 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i305, float %4022, float 0xBFCEC79160000000) #2, !dbg !171 + %.05.i307 = select i1 %.not5.i306, float %4037, float %4036, !dbg !171 + %4038 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not6.i308 = icmp eq i32 %4038, 0, !dbg !171 + %4039 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i307, float %4022, float 0x3FD277F320000000) #2, !dbg !171 + %4040 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i307, float %4022, float 0x3FD277F320000000) #2, !dbg !171 + %.01.i309 = select i1 %.not6.i308, float %4040, float %4039, !dbg !171 + %4041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not7.i310 = icmp eq i32 %4041, 0, !dbg !171 + %4042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i309, float %4022, float 0xBFD7154920000000) #2, !dbg !171 + %4043 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i309, float %4022, float 0xBFD7154920000000) #2, !dbg !171 + %.0.i311 = select i1 %.not7.i310, float %4043, float %4042, !dbg !171 + %4044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not8.i312 = icmp eq i32 %4044, 0, !dbg !171 + %4045 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i311, float %4022, float 0x3FDEC70940000000) #2, !dbg !171 + %4046 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i311, float %4022, float 0x3FDEC70940000000) #2, !dbg !171 + %.07.i313 = select i1 %.not8.i312, float %4046, float %4045, !dbg !171 + %4047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not9.i314 = icmp eq i32 %4047, 0, !dbg !171 + %4048 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i313, float %4022, float 0xBFE7154760000000) #2, !dbg !171 + %4049 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i313, float %4022, float 0xBFE7154760000000) #2, !dbg !171 + %.06.i315 = select i1 %.not9.i314, float %4049, float %4048, !dbg !171 + %4050 = fmul float %4022, %.06.i315, !dbg !171 + %4051 = fmul float %4022, %4050, !dbg !171 + %4052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not10.i316 = icmp eq i32 %4052, 0, !dbg !171 + %4053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4022, float 0x3FF7154760000000, float %4051) #2, !dbg !171 + %4054 = tail call float @llvm.nvvm.fma.rn.f(float %4022, float 0x3FF7154760000000, float %4051) #2, !dbg !171 + %.04.i317 = select i1 %.not10.i316, float %4054, float %4053, !dbg !171 + %4055 = fadd float %.08.i297, %.04.i317, !dbg !171 + %4056 = icmp ugt i32 %4013, 2139095039, !dbg !171 + br i1 %4056, label %__nv_fmaf_rn.exit.i.i320, label %__nv_log2f.exit323, !dbg !171 + +__nv_fmaf_rn.exit.i.i320: ; preds = %__nv_log2f.exit + %4057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not11.i321 = icmp eq i32 %4057, 0, !dbg !171 + %4058 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i294, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %4059 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i294, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %.03.i322 = select i1 %.not11.i321, float %4059, float %4058, !dbg !171 + br label %__nv_log2f.exit323, !dbg !171 + +__nv_log2f.exit323: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i320 + %r.i.0.i318 = phi float [ %.03.i322, %__nv_fmaf_rn.exit.i.i320 ], [ %4055, %__nv_log2f.exit ], !dbg !171 + %4060 = insertelement <2 x float> poison, float %.02.i, i64 0, !dbg !171 + %4061 = insertelement <2 x float> %4060, float %.02.i294, i64 1, !dbg !171 + %4062 = fcmp oeq <2 x float> %4061, zeroinitializer, !dbg !171 + %4063 = shl nuw i32 %15, 16, !dbg !172 + %4064 = shl nuw nsw i32 %16, 11, !dbg !172 + %4065 = add i32 %4063, %4064, !dbg !172 + %4066 = sext i32 %4065 to i64, !dbg !173 + %4067 = getelementptr float, ptr addrspace(1) %3, i64 %4066, !dbg !173 + %4068 = and i32 %36, 127, !dbg !27 + %4069 = or disjoint i32 %35, %4068, !dbg !28 + %4070 = sext i32 %4069 to i64, !dbg !174 + %4071 = getelementptr float, ptr addrspace(1) %4067, i64 %4070, !dbg !174 + %4072 = insertelement <2 x float> poison, float %r.i.0.i, i64 0, !dbg !171 + %4073 = insertelement <2 x float> %4072, float %r.i.0.i318, i64 1, !dbg !171 + %4074 = select <2 x i1> %4062, <2 x float> splat (float 0xFFF0000000000000), <2 x float> %4073, !dbg !171 + %4075 = fadd <2 x float> %3551, %4074, !dbg !175 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4076 = shl nuw nsw i32 %3902, 1, !dbg !176 + %4077 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4076, !dbg !176 + store <2 x float> %4075, ptr addrspace(3) %4077, align 8, !dbg !176 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4078 = shl nuw nsw i32 %133, 3, !dbg !176 + %4079 = shl nuw nsw i32 %136, 2, !dbg !176 + %4080 = lshr exact i32 %137, 1, !dbg !176 + %4081 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4078, !dbg !176 + %4082 = getelementptr inbounds nuw i8, ptr addrspace(3) %4081, i32 %4079, !dbg !176 + %4083 = getelementptr inbounds nuw i8, ptr addrspace(3) %4082, i32 %4080, !dbg !176 + %4084 = load i32, ptr addrspace(3) %4083, align 4, !dbg !176 + %4085 = and i32 %36, 128, !dbg !176 + %4086 = icmp eq i32 %4085, 0, !dbg !176 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %4084, ptr addrspace(1) %4071, i1 %4086) #2, !dbg !176 + ret void, !dbg !177 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #6 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.umin.i32(i32, i32) #8 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #6 = { convergent nounwind } +attributes #7 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_0", linkageName: "triton_tem_fused_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 97, column: 28, scope: !5) +!9 = !DILocation(line: 98, column: 27, scope: !5) +!10 = !DILocation(line: 99, column: 27, scope: !5) +!11 = !DILocation(line: 103, column: 23, scope: !5) +!12 = !DILocation(line: 107, column: 24, scope: !5) +!13 = !DILocation(line: 107, column: 45, scope: !5) +!14 = !DILocation(line: 107, column: 36, scope: !5) +!15 = !DILocation(line: 108, column: 25, scope: !5) +!16 = !DILocation(line: 108, column: 47, scope: !5) +!17 = !DILocation(line: 108, column: 37, scope: !5) +!18 = !DILocation(line: 111, column: 12, scope: !5) +!19 = !DILocation(line: 112, column: 12, scope: !5) +!20 = !DILocation(line: 113, column: 12, scope: !5) +!21 = !DILocation(line: 142, column: 51, scope: !5) +!22 = !DILocation(line: 142, column: 74, scope: !5) +!23 = !DILocation(line: 143, column: 46, scope: !5) +!24 = !DILocation(line: 143, column: 97, scope: !5) +!25 = !DILocation(line: 143, column: 64, scope: !5) +!26 = !DILocation(line: 144, column: 23, scope: !5) +!27 = !DILocation(line: 144, column: 46, scope: !5) +!28 = !DILocation(line: 144, column: 33, scope: !5) +!29 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !31) +!30 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!31 = !DILocation(line: 146, column: 101, scope: !5) +!32 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !31) +!33 = !DILocation(line: 284, column: 56, scope: !30, inlinedAt: !31) +!34 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !31) +!35 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !31) +!36 = !DILocation(line: 151, column: 26, scope: !5) +!37 = !DILocation(line: 152, column: 23, scope: !5) +!38 = !DILocation(line: 152, column: 37, scope: !5) +!39 = !DILocation(line: 153, column: 42, scope: !5) +!40 = !DILocation(line: 153, column: 28, scope: !5) +!41 = !DILocation(line: 154, column: 45, scope: !5) +!42 = !DILocation(line: 159, column: 37, scope: !5) +!43 = !DILocation(line: 376, column: 33, scope: !30, inlinedAt: !44) +!44 = !DILocation(line: 172, column: 41, scope: !5) +!45 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !44) +!46 = !DILocation(line: 376, column: 23, scope: !30, inlinedAt: !44) +!47 = !DILocation(line: 378, column: 23, scope: !30, inlinedAt: !44) +!48 = !DILocation(line: 379, column: 23, scope: !30, inlinedAt: !44) +!49 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !44) +!50 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !44) +!51 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !44) +!52 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !44) +!53 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !44) +!54 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !44) +!55 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !44) +!56 = !DILocation(line: 154, column: 65, scope: !5) +!57 = !DILocation(line: 159, column: 24, scope: !5) +!58 = !DILocation(line: 397, column: 23, scope: !30, inlinedAt: !44) +!59 = !DILocation(line: 395, column: 24, scope: !30, inlinedAt: !44) +!60 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !44) +!61 = !DILocation(line: 373, column: 23, scope: !30, inlinedAt: !44) +!62 = !DILocation(line: 385, column: 24, scope: !30, inlinedAt: !44) +!63 = !DILocation(line: 381, column: 23, scope: !30, inlinedAt: !44) +!64 = !DILocation(line: 384, column: 24, scope: !30, inlinedAt: !44) +!65 = !DILocation(line: 387, column: 25, scope: !30, inlinedAt: !44) +!66 = !DILocation(line: 388, column: 92, scope: !30, inlinedAt: !44) +!67 = !DILocation(line: 394, column: 25, scope: !30, inlinedAt: !44) +!68 = !DILocation(line: 391, column: 24, scope: !30, inlinedAt: !44) +!69 = !DILocation(line: 392, column: 24, scope: !30, inlinedAt: !44) +!70 = !DILocation(line: 393, column: 39, scope: !30, inlinedAt: !44) +!71 = !DILocation(line: 396, column: 24, scope: !30, inlinedAt: !44) +!72 = !DILocation(line: 404, column: 39, scope: !30, inlinedAt: !44) +!73 = !DILocation(line: 405, column: 25, scope: !30, inlinedAt: !44) +!74 = !DILocation(line: 406, column: 24, scope: !30, inlinedAt: !44) +!75 = !DILocation(line: 407, column: 24, scope: !30, inlinedAt: !44) +!76 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !44) +!77 = !DILocation(line: 414, column: 69, scope: !30, inlinedAt: !44) +!78 = !DILocation(line: 168, column: 27, scope: !79, inlinedAt: !44) +!79 = distinct !DILexicalBlockFile(scope: !5, file: !80, discriminator: 0) +!80 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!81 = !DILocation(line: 189, column: 40, scope: !79, inlinedAt: !44) +!82 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !44) +!83 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !44) +!84 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !44) +!85 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !44) +!86 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !44) +!87 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !44) +!88 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !44) +!89 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !44) +!90 = !DILocation(line: 261, column: 15, scope: !79, inlinedAt: !44) +!91 = !DILocation(line: 291, column: 36, scope: !79, inlinedAt: !44) +!92 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !44) +!93 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !44) +!94 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !44) +!95 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !44) +!96 = !DILocation(line: 548, column: 26, scope: !30, inlinedAt: !44) +!97 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !44) +!98 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !44) +!99 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !44) +!100 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !44) +!101 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !44) +!102 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !44) +!103 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !44) +!104 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !44) +!105 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !44) +!106 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !44) +!107 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !44) +!108 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !44) +!109 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !44) +!110 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !44) +!111 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !44) +!112 = !DILocation(line: 136, column: 19, scope: !5) +!113 = !DILocation(line: 181, column: 35, scope: !5) +!114 = !DILocation(line: 182, column: 27, scope: !5) +!115 = !DILocation(line: 182, column: 41, scope: !5) +!116 = !DILocation(line: 183, column: 51, scope: !5) +!117 = !DILocation(line: 183, column: 32, scope: !5) +!118 = !DILocation(line: 184, column: 49, scope: !5) +!119 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !120) +!120 = !DILocation(line: 198, column: 45, scope: !5) +!121 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !120) +!122 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !120) +!123 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !120) +!124 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !120) +!125 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !120) +!126 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !120) +!127 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !120) +!128 = !DILocation(line: 184, column: 69, scope: !5) +!129 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !120) +!130 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !120) +!131 = !DILocation(line: 168, column: 27, scope: !79, inlinedAt: !120) +!132 = !DILocation(line: 189, column: 40, scope: !79, inlinedAt: !120) +!133 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !120) +!134 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !120) +!135 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !120) +!136 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !120) +!137 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !120) +!138 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !120) +!139 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !120) +!140 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !120) +!141 = !DILocation(line: 261, column: 15, scope: !79, inlinedAt: !120) +!142 = !DILocation(line: 291, column: 36, scope: !79, inlinedAt: !120) +!143 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !120) +!144 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !120) +!145 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !120) +!146 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !120) +!147 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !120) +!148 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !120) +!149 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !120) +!150 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !120) +!151 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !120) +!152 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !120) +!153 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !120) +!154 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !120) +!155 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !120) +!156 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !120) +!157 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !120) +!158 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !120) +!159 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !120) +!160 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !120) +!161 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !120) +!162 = !DILocation(line: 206, column: 26, scope: !5) +!163 = !DILocation(line: 206, column: 34, scope: !5) +!164 = !DILocation(line: 208, column: 16, scope: !5) +!165 = !DILocation(line: 214, column: 20, scope: !5) +!166 = !DILocation(line: 218, column: 49, scope: !5) +!167 = !DILocation(line: 218, column: 62, scope: !5) +!168 = !DILocation(line: 218, column: 75, scope: !5) +!169 = !DILocation(line: 218, column: 25, scope: !5) +!170 = !DILocation(line: 218, column: 109, scope: !5) +!171 = !DILocation(line: 223, column: 33, scope: !5) +!172 = !DILocation(line: 222, column: 32, scope: !5) +!173 = !DILocation(line: 222, column: 23, scope: !5) +!174 = !DILocation(line: 222, column: 40, scope: !5) +!175 = !DILocation(line: 223, column: 20, scope: !5) +!176 = !DILocation(line: 225, column: 29, scope: !5) +!177 = !DILocation(line: 229, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1299c2da052cc853d5cc8f25f0e32d2fc6bb2264 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ptx @@ -0,0 +1,3160 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_0 // -- Begin function triton_tem_fused_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_0 +.visible .entry triton_tem_fused_0( + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_12 +) +.reqntid 256 +{ + .reg .pred %p<201>; + .reg .b16 %rs<62>; + .reg .b32 %r<4978>; + .reg .b64 %rd<285>; + .loc 1 18 0 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:18:0 + +// %bb.0: + ld.param.b64 %rd29, [triton_tem_fused_0_param_8]; + ld.param.b64 %rd28, [triton_tem_fused_0_param_7]; + ld.param.b64 %rd60, [triton_tem_fused_0_param_0]; +$L__tmp0: + .loc 1 97 28 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:97:28 + mov.u32 %r696, %ctaid.x; + ld.param.b64 %rd61, [triton_tem_fused_0_param_1]; + .loc 1 98 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:98:27 + mov.u32 %r1, %ctaid.y; + ld.param.b64 %rd62, [triton_tem_fused_0_param_2]; + .loc 1 99 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:99:27 + mov.u32 %r2, %ctaid.z; + .loc 1 103 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:103:23 + and.b32 %r697, %r1, 1; + .loc 1 107 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:107:24 + shl.b32 %r3, %r1, 23; + ld.param.b64 %rd63, [triton_tem_fused_0_param_5]; + .loc 1 107 45 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:107:45 + shl.b32 %r4, %r2, 7; + ld.param.b64 %rd64, [triton_tem_fused_0_param_6]; + .loc 1 107 36 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:107:36 + or.b32 %r698, %r3, %r4; + .loc 1 108 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:108:25 + shl.b32 %r699, %r697, 21; + .loc 1 108 47 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:108:47 + shl.b32 %r700, %r2, 16; + ld.param.b64 %rd65, [triton_tem_fused_0_param_9]; + and.b32 %r701, %r700, -262144; + .loc 1 108 37 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:108:37 + add.s32 %r702, %r699, %r701; + .loc 1 111 12 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:111:12 + mad.wide.s32 %rd66, %r698, 2, %rd60; + .loc 1 112 12 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:112:12 + mul.wide.s32 %rd67, %r702, 2; + add.s64 %rd1, %rd61, %rd67; + .loc 1 113 12 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:113:12 + add.s64 %rd2, %rd62, %rd67; + .loc 1 142 51 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:142:51 + shl.b32 %r703, %r697, 4; + .loc 1 142 74 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:142:74 + add.s32 %r704, %r703, %r696; + .loc 1 143 46 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:143:46 + shl.b32 %r705, %r697, 8; + .loc 1 143 97 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:143:97 + shl.b32 %r706, %r696, 4; + .loc 1 143 64 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:143:64 + add.s32 %r707, %r705, %r706; + .loc 1 144 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:144:23 + shl.b32 %r5, %r696, 7; + .loc 1 144 46 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:144:46 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r708, %r6, 240; + bfe.u32 %r8, %r6, 4, 4; + or.b32 %r9, %r8, 16; + or.b32 %r10, %r8, 32; + or.b32 %r11, %r8, 48; + .loc 1 144 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:144:33 + or.b32 %r12, %r8, %r5; + or.b32 %r13, %r9, %r5; + or.b32 %r14, %r10, %r5; + or.b32 %r15, %r11, %r5; + or.b32 %r16, %r12, 64; + or.b32 %r17, %r12, 80; + or.b32 %r18, %r12, 96; + or.b32 %r19, %r12, 112; +$L__tmp1: + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:146:101 ] + shl.b32 %r20, %r12, 12; + shl.b32 %r21, %r13, 12; + shl.b32 %r22, %r14, 12; + shl.b32 %r23, %r15, 12; + shl.b32 %r24, %r16, 12; + shl.b32 %r25, %r17, 12; + shl.b32 %r26, %r18, 12; + shl.b32 %r27, %r19, 12; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:146:101 ] + mad.wide.s32 %rd68, %r20, 2, %rd66; + mad.wide.s32 %rd69, %r21, 2, %rd66; + mad.wide.s32 %rd70, %r22, 2, %rd66; + mad.wide.s32 %rd71, %r23, 2, %rd66; + mad.wide.s32 %rd72, %r24, 2, %rd66; + mad.wide.s32 %rd73, %r25, 2, %rd66; + mad.wide.s32 %rd74, %r26, 2, %rd66; + mad.wide.s32 %rd75, %r27, 2, %rd66; + .loc 1 284 56 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:56 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:146:101 ] + shl.b32 %r715, %r6, 3; + and.b32 %r716, %r715, 120; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:146:101 ] + cvt.u64.u32 %rd3, %r716; + mul.wide.u32 %rd76, %r716, 2; + add.s64 %rd31, %rd68, %rd76; + add.s64 %rd32, %rd69, %rd76; + add.s64 %rd33, %rd70, %rd76; + add.s64 %rd34, %rd71, %rd76; + add.s64 %rd35, %rd72, %rd76; + add.s64 %rd36, %rd73, %rd76; + add.s64 %rd37, %rd74, %rd76; + add.s64 %rd38, %rd75, %rd76; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:146:101 ] + // begin inline asm + mov.u32 %r629, 0x0; + mov.u32 %r630, 0x0; + mov.u32 %r631, 0x0; + mov.u32 %r632, 0x0; + ld.global.v4.b32 { %r629, %r630, %r631, %r632 }, [ %rd31 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r633, 0x0; + mov.u32 %r634, 0x0; + mov.u32 %r635, 0x0; + mov.u32 %r636, 0x0; + ld.global.v4.b32 { %r633, %r634, %r635, %r636 }, [ %rd32 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r637, 0x0; + mov.u32 %r638, 0x0; + mov.u32 %r639, 0x0; + mov.u32 %r640, 0x0; + ld.global.v4.b32 { %r637, %r638, %r639, %r640 }, [ %rd33 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r641, 0x0; + mov.u32 %r642, 0x0; + mov.u32 %r643, 0x0; + mov.u32 %r644, 0x0; + ld.global.v4.b32 { %r641, %r642, %r643, %r644 }, [ %rd34 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r645, 0x0; + mov.u32 %r646, 0x0; + mov.u32 %r647, 0x0; + mov.u32 %r648, 0x0; + ld.global.v4.b32 { %r645, %r646, %r647, %r648 }, [ %rd35 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r649, 0x0; + mov.u32 %r650, 0x0; + mov.u32 %r651, 0x0; + mov.u32 %r652, 0x0; + ld.global.v4.b32 { %r649, %r650, %r651, %r652 }, [ %rd36 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r653, 0x0; + mov.u32 %r654, 0x0; + mov.u32 %r655, 0x0; + mov.u32 %r656, 0x0; + ld.global.v4.b32 { %r653, %r654, %r655, %r656 }, [ %rd37 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r657, 0x0; + mov.u32 %r658, 0x0; + mov.u32 %r659, 0x0; + mov.u32 %r660, 0x0; + ld.global.v4.b32 { %r657, %r658, %r659, %r660 }, [ %rd38 + 0 ]; + // end inline asm + and.b32 %r28, %r6, 7; + shl.b32 %r717, %r28, 4; + shl.b32 %r718, %r708, 3; + and.b32 %r29, %r6, 112; + and.b32 %r30, %r6, 8; + shl.b32 %r719, %r30, 11; + or.b32 %r720, %r717, %r718; + xor.b32 %r721, %r720, %r29; + mov.b32 %r722, global_smem; + add.s32 %r723, %r722, %r719; + add.s32 %r724, %r723, %r721; + st.shared.v4.b32 [%r724+98304], {%r629, %r630, %r631, %r632}; + st.shared.v4.b32 [%r724+100352], {%r633, %r634, %r635, %r636}; + st.shared.v4.b32 [%r724+102400], {%r637, %r638, %r639, %r640}; + st.shared.v4.b32 [%r724+104448], {%r641, %r642, %r643, %r644}; + st.shared.v4.b32 [%r724+106496], {%r645, %r646, %r647, %r648}; + st.shared.v4.b32 [%r724+108544], {%r649, %r650, %r651, %r652}; + st.shared.v4.b32 [%r724+110592], {%r653, %r654, %r655, %r656}; + st.shared.v4.b32 [%r724+112640], {%r657, %r658, %r659, %r660}; +$L__tmp2: + .loc 1 151 26 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:151:26 + cvt.s64.s32 %rd4, %r707; + mad.wide.s32 %rd39, %r707, 4, %rd64; + .loc 1 152 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:152:23 + // begin inline asm + mov.u32 %r661, 0x0; + ld.global.b32 { %r661 }, [ %rd39 + 0 ]; + // end inline asm + .loc 1 152 37 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:152:37 + shl.b32 %r31, %r661, 7; + .loc 1 153 42 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:153:42 + cvt.s64.s32 %rd6, %r704; + mad.wide.s32 %rd40, %r704, 4, %rd63; + .loc 1 153 28 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:153:28 + // begin inline asm + mov.u32 %r662, 0x0; + ld.global.b32 { %r662 }, [ %rd40 + 0 ]; + // end inline asm + .loc 1 154 45 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:154:45 + shl.b32 %r33, %r662, 1; + .loc 1 159 37 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:159:37 + and.b32 %r34, %r6, 3; +$L__tmp3: + .loc 1 376 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:376:33 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mad.wide.u32 %rd42, %r1, 8, %rd65; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.lt.s32 %p2, %r33, 1; + setp.gt.s32 %p1, %r33, 0; + .loc 1 376 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:376:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + // begin inline asm + mov.u64 %rd7, 0x0; + @%p1 ld.global.b64 { %rd7 }, [ %rd42 + 0 ]; + // end inline asm + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + or.b32 %r725, %r31, %r8; + or.b32 %r726, %r31, %r9; + or.b32 %r727, %r31, %r10; + or.b32 %r728, %r31, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r729, %r725, 7; + shl.b32 %r730, %r726, 7; + shl.b32 %r731, %r727, 7; + shl.b32 %r732, %r728, 7; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.wide.s32 %rd77, %r729, 2; + mul.wide.s32 %rd78, %r730, 2; + mul.wide.s32 %rd79, %r731, 2; + mul.wide.s32 %rd80, %r732, 2; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd10, %rd1, %rd76; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd43, %rd10, %rd77; + add.s64 %rd44, %rd10, %rd78; + add.s64 %rd45, %rd10, %rd79; + add.s64 %rd46, %rd10, %rd80; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r733, %r30, 10; + or.b32 %r35, %r721, %r733; + add.s32 %r2699, %r722, %r35; + selp.b32 %r664, 16, 0, %p1; + // begin inline asm + cp.async.cg.shared.global [ %r2699 + 0 ], [ %rd43 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2701, %r2699, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r2701 + 0 ], [ %rd44 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2703, %r2699, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r2703 + 0 ], [ %rd45 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2705, %r2699, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r2705 + 0 ], [ %rd46 + 0 ], 0x10, %r664; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd11, %rd2, %rd76; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd47, %rd11, %rd77; + add.s64 %rd48, %rd11, %rd78; + add.s64 %rd49, %rd11, %rd79; + add.s64 %rd50, %rd11, %rd80; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r671, %r2699, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r671 + 0 ], [ %rd47 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r673, %r2699, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r673 + 0 ], [ %rd48 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r675, %r2699, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r675 + 0 ], [ %rd49 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r677, %r2699, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r677 + 0 ], [ %rd50 + 0 ], 0x10, %r664; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.gt.s32 %p3, %r33, 1; + .loc 1 342 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:342:32 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + or.b32 %r734, %r31, 64; + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + or.b32 %r735, %r734, %r8; + or.b32 %r736, %r734, %r9; + or.b32 %r737, %r734, %r10; + or.b32 %r738, %r734, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r739, %r735, 7; + shl.b32 %r740, %r736, 7; + shl.b32 %r741, %r737, 7; + shl.b32 %r742, %r738, 7; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.wide.s32 %rd81, %r739, 2; + mul.wide.s32 %rd82, %r740, 2; + mul.wide.s32 %rd83, %r741, 2; + mul.wide.s32 %rd84, %r742, 2; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd51, %rd10, %rd81; + add.s64 %rd52, %rd10, %rd82; + add.s64 %rd53, %rd10, %rd83; + add.s64 %rd54, %rd10, %rd84; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + bar.sync 0; + add.s32 %r679, %r2699, 16384; + selp.b32 %r680, 16, 0, %p3; + // begin inline asm + cp.async.cg.shared.global [ %r679 + 0 ], [ %rd51 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r681, %r2699, 18432; + // begin inline asm + cp.async.cg.shared.global [ %r681 + 0 ], [ %rd52 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r683, %r2699, 20480; + // begin inline asm + cp.async.cg.shared.global [ %r683 + 0 ], [ %rd53 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r685, %r2699, 22528; + // begin inline asm + cp.async.cg.shared.global [ %r685 + 0 ], [ %rd54 + 0 ], 0x10, %r680; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd55, %rd11, %rd81; + add.s64 %rd56, %rd11, %rd82; + add.s64 %rd57, %rd11, %rd83; + add.s64 %rd58, %rd11, %rd84; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r687, %r2699, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r687 + 0 ], [ %rd55 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r689, %r2699, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r689 + 0 ], [ %rd56 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r691, %r2699, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r691 + 0 ], [ %rd57 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r693, %r2699, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r693 + 0 ], [ %rd58 + 0 ], 0x10, %r680; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:351:19 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r743, 0fFF800000; + mov.b64 %rd284, {%r743, %r743}; + mov.b32 %r1567, 0f00000000; + mov.b32 %r1568, %r1567; + mov.b32 %r1569, %r1567; + mov.b32 %r1570, %r1567; + mov.b32 %r1571, %r1567; + mov.b32 %r1572, %r1567; + mov.b32 %r1573, %r1567; + mov.b32 %r1574, %r1567; + mov.b32 %r1575, %r1567; + mov.b32 %r1576, %r1567; + mov.b32 %r1577, %r1567; + mov.b32 %r1578, %r1567; + mov.b32 %r1579, %r1567; + mov.b32 %r1580, %r1567; + mov.b32 %r1581, %r1567; + mov.b32 %r1582, %r1567; + mov.b32 %r1583, %r1567; + mov.b32 %r1584, %r1567; + mov.b32 %r1585, %r1567; + mov.b32 %r1586, %r1567; + mov.b32 %r1587, %r1567; + mov.b32 %r1588, %r1567; + mov.b32 %r1589, %r1567; + mov.b32 %r1590, %r1567; + mov.b32 %r1591, %r1567; + mov.b32 %r1592, %r1567; + mov.b32 %r1593, %r1567; + mov.b32 %r1594, %r1567; + mov.b32 %r1595, %r1567; + mov.b32 %r1596, %r1567; + mov.b32 %r1597, %r1567; + mov.b32 %r1598, %r1567; + mov.b32 %r1599, %r1567; + mov.b32 %r1600, %r1567; + mov.b32 %r1601, %r1567; + mov.b32 %r1602, %r1567; + mov.b32 %r1603, %r1567; + mov.b32 %r1604, %r1567; + mov.b32 %r1605, %r1567; + mov.b32 %r1606, %r1567; + mov.b32 %r1607, %r1567; + mov.b32 %r1608, %r1567; + mov.b32 %r1609, %r1567; + mov.b32 %r1610, %r1567; + mov.b32 %r1611, %r1567; + mov.b32 %r1612, %r1567; + mov.b32 %r1613, %r1567; + mov.b32 %r1614, %r1567; + mov.b32 %r1615, %r1567; + mov.b32 %r1616, %r1567; + mov.b32 %r1617, %r1567; + mov.b32 %r1618, %r1567; + mov.b32 %r1619, %r1567; + mov.b32 %r1620, %r1567; + mov.b32 %r1621, %r1567; + mov.b32 %r1622, %r1567; + mov.b32 %r1623, %r1567; + mov.b32 %r1624, %r1567; + mov.b32 %r1625, %r1567; + mov.b32 %r1626, %r1567; + mov.b32 %r1627, %r1567; + mov.b32 %r1628, %r1567; + mov.b32 %r1629, %r1567; + mov.b32 %r1630, %r1567; + mov.b32 %r4974, %r1567; + mov.b32 %r4975, %r1567; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + @%p2 bra $L__BB0_3; +$L__tmp4: +// %bb.1: // %.lr.ph + .loc 1 0 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:0:40 + shr.u32 %r709, %r6, 1; + and.b32 %r710, %r709, 112; + bfe.u32 %r711, %r6, 2, 3; + or.b32 %r712, %r710, %r711; + or.b32 %r713, %r712, %r5; + or.b32 %r714, %r713, 8; + cvt.s64.s32 %rd8, %r713; + cvt.s64.s32 %rd9, %r714; + cvt.u32.u64 %r71, %rd9; + cvt.u32.u64 %r73, %rd8; + .loc 1 154 65 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:154:65 + min.u32 %r52, %r33, 32; + .loc 1 159 37 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:159:37 + shl.b32 %r749, %r34, 1; + or.b32 %r4773, %r31, %r749; + add.s32 %r53, %r52, -2; + add.s32 %r54, %r52, -1; + or.b32 %r4772, %r4773, 1; + or.b32 %r4771, %r4773, 8; + or.b32 %r4770, %r4773, 9; + or.b32 %r4769, %r4773, 16; + or.b32 %r4768, %r4773, 17; + or.b32 %r4767, %r4773, 24; + or.b32 %r4766, %r4773, 25; + or.b32 %r4765, %r4773, 32; + or.b32 %r4764, %r4773, 33; + or.b32 %r4763, %r4773, 40; + or.b32 %r4762, %r4773, 41; + or.b32 %r4761, %r4773, 48; + or.b32 %r4760, %r4773, 49; + or.b32 %r4759, %r4773, 56; + or.b32 %r4758, %r4773, 57; + mov.b64 %rd284, {%r743, %r743}; + mov.b32 %r1366, 0; + mov.b32 %r4974, 0f00000000; + mov.b32 %r4689, 1; + mov.b32 %r4688, -1; + mov.b32 %r4687, 64; + setp.gt.s64 %p17, %rd7, %rd9; + setp.gt.s64 %p18, %rd7, %rd8; + mov.b32 %r4690, %r4687; + mov.b32 %r4975, %r4974; + mov.b32 %r1567, %r4974; + mov.b32 %r1568, %r4974; + mov.b32 %r1569, %r4974; + mov.b32 %r1570, %r4974; + mov.b32 %r1571, %r4974; + mov.b32 %r1572, %r4974; + mov.b32 %r1573, %r4974; + mov.b32 %r1574, %r4974; + mov.b32 %r1575, %r4974; + mov.b32 %r1576, %r4974; + mov.b32 %r1577, %r4974; + mov.b32 %r1578, %r4974; + mov.b32 %r1579, %r4974; + mov.b32 %r1580, %r4974; + mov.b32 %r1581, %r4974; + mov.b32 %r1582, %r4974; + mov.b32 %r1583, %r4974; + mov.b32 %r1584, %r4974; + mov.b32 %r1585, %r4974; + mov.b32 %r1586, %r4974; + mov.b32 %r1587, %r4974; + mov.b32 %r1588, %r4974; + mov.b32 %r1589, %r4974; + mov.b32 %r1590, %r4974; + mov.b32 %r1591, %r4974; + mov.b32 %r1592, %r4974; + mov.b32 %r1593, %r4974; + mov.b32 %r1594, %r4974; + mov.b32 %r1595, %r4974; + mov.b32 %r1596, %r4974; + mov.b32 %r1597, %r4974; + mov.b32 %r1598, %r4974; + mov.b32 %r1599, %r4974; + mov.b32 %r1600, %r4974; + mov.b32 %r1601, %r4974; + mov.b32 %r1602, %r4974; + mov.b32 %r1603, %r4974; + mov.b32 %r1604, %r4974; + mov.b32 %r1605, %r4974; + mov.b32 %r1606, %r4974; + mov.b32 %r1607, %r4974; + mov.b32 %r1608, %r4974; + mov.b32 %r1609, %r4974; + mov.b32 %r1610, %r4974; + mov.b32 %r1611, %r4974; + mov.b32 %r1612, %r4974; + mov.b32 %r1613, %r4974; + mov.b32 %r1614, %r4974; + mov.b32 %r1615, %r4974; + mov.b32 %r1616, %r4974; + mov.b32 %r1617, %r4974; + mov.b32 %r1618, %r4974; + mov.b32 %r1619, %r4974; + mov.b32 %r1620, %r4974; + mov.b32 %r1621, %r4974; + mov.b32 %r1622, %r4974; + mov.b32 %r1623, %r4974; + mov.b32 %r1624, %r4974; + mov.b32 %r1625, %r4974; + mov.b32 %r1626, %r4974; + mov.b32 %r1627, %r4974; + mov.b32 %r1628, %r4974; + mov.b32 %r1629, %r4974; + mov.b32 %r1630, %r4974; + mov.b32 %r4757, %r1366; +$L__BB0_2: // %__nv_exp2f.exit192 + // =>This Inner Loop Header: Depth=1 +$L__tmp5: + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.lt.s32 %p19, %r4757, %r53; + setp.lt.s32 %p15, %r4757, %r54; + add.s32 %r1983, %r4688, 1; + setp.gt.s32 %p20, %r1983, 2; + selp.b32 %r4688, 0, %r1983, %p20; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r1984, %r4688, 14; + add.s32 %r1266, %r722, %r1984; + .loc 1 351 19 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:351:19 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.idx.b32 %r1986, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r1987, %r1986, 11; + and.b32 %r1988, %r1987, 8192; + add.s32 %r1263, %r722, 98304; + add.s32 %r1989, %r1988, %r1263; + bfe.u32 %r1990, %r1989, 4, 14; + cvt.u64.u32 %rd120, %r1990; + or.b64 %rd86, %rd120, 4611686293372403712; + bfe.u32 %r1991, %r1266, 4, 14; + cvt.u64.u32 %rd121, %r1991; + or.b64 %rd87, %rd121, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd86, %rd87, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r1992, %r1989, 32; + bfe.u32 %r1993, %r1992, 4, 14; + cvt.u64.u32 %rd122, %r1993; + or.b64 %rd88, %rd122, 4611686293372403712; + add.s32 %r1994, %r1266, 32; + bfe.u32 %r1995, %r1994, 4, 14; + cvt.u64.u32 %rd123, %r1995; + or.b64 %rd89, %rd123, 4611686293338849280; + mov.pred %p4, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd88, %rd89, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r1996, %r1989, 64; + bfe.u32 %r1997, %r1996, 4, 14; + cvt.u64.u32 %rd124, %r1997; + or.b64 %rd90, %rd124, 4611686293372403712; + add.s32 %r1998, %r1266, 64; + bfe.u32 %r1999, %r1998, 4, 14; + cvt.u64.u32 %rd125, %r1999; + or.b64 %rd91, %rd125, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd90, %rd91, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2000, %r1989, 96; + bfe.u32 %r2001, %r2000, 4, 14; + cvt.u64.u32 %rd126, %r2001; + or.b64 %rd92, %rd126, 4611686293372403712; + add.s32 %r2002, %r1266, 96; + bfe.u32 %r2003, %r2002, 4, 14; + cvt.u64.u32 %rd127, %r2003; + or.b64 %rd93, %rd127, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd92, %rd93, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2004, %r1989, 16384; + bfe.u32 %r2005, %r2004, 4, 14; + cvt.u64.u32 %rd128, %r2005; + or.b64 %rd94, %rd128, 4611686293372403712; + add.s32 %r2006, %r1266, 8192; + bfe.u32 %r2007, %r2006, 4, 14; + cvt.u64.u32 %rd129, %r2007; + or.b64 %rd95, %rd129, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd94, %rd95, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2008, %r1989, 16416; + bfe.u32 %r2009, %r2008, 4, 14; + cvt.u64.u32 %rd130, %r2009; + or.b64 %rd96, %rd130, 4611686293372403712; + add.s32 %r2010, %r1266, 8224; + bfe.u32 %r2011, %r2010, 4, 14; + cvt.u64.u32 %rd131, %r2011; + or.b64 %rd97, %rd131, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd96, %rd97, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2012, %r1989, 16448; + bfe.u32 %r2013, %r2012, 4, 14; + cvt.u64.u32 %rd132, %r2013; + or.b64 %rd98, %rd132, 4611686293372403712; + add.s32 %r2014, %r1266, 8256; + bfe.u32 %r2015, %r2014, 4, 14; + cvt.u64.u32 %rd133, %r2015; + or.b64 %rd99, %rd133, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd98, %rd99, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2016, %r1989, 16480; + bfe.u32 %r2017, %r2016, 4, 14; + cvt.u64.u32 %rd134, %r2017; + or.b64 %rd100, %rd134, 4611686293372403712; + add.s32 %r2018, %r1266, 8288; + bfe.u32 %r2019, %r2018, 4, 14; + cvt.u64.u32 %rd135, %r2019; + or.b64 %rd101, %rd135, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd100, %rd101, %p4, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r1264, %r1366; + mov.b32 %r1265, %r1366; + mov.b32 %r1267, %r1366; + mov.b32 %r1268, %r1366; + // begin inline asm + // wait for regs: %r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878,%r1263,%r1264,%r1265,%r1266,%r1267,%r1268,%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:353:14 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2020, %r847, 0f3DB504F3; + mul.f32 %r2021, %r848, 0f3DB504F3; + mul.f32 %r2022, %r849, 0f3DB504F3; + mul.f32 %r2023, %r850, 0f3DB504F3; + mul.f32 %r2024, %r851, 0f3DB504F3; + mul.f32 %r2025, %r852, 0f3DB504F3; + mul.f32 %r2026, %r853, 0f3DB504F3; + mul.f32 %r2027, %r854, 0f3DB504F3; + mul.f32 %r2028, %r855, 0f3DB504F3; + mul.f32 %r2029, %r856, 0f3DB504F3; + mul.f32 %r2030, %r857, 0f3DB504F3; + mul.f32 %r2031, %r858, 0f3DB504F3; + mul.f32 %r2032, %r859, 0f3DB504F3; + mul.f32 %r2033, %r860, 0f3DB504F3; + mul.f32 %r2034, %r861, 0f3DB504F3; + mul.f32 %r2035, %r862, 0f3DB504F3; + mul.f32 %r2036, %r863, 0f3DB504F3; + mul.f32 %r2037, %r864, 0f3DB504F3; + mul.f32 %r2038, %r865, 0f3DB504F3; + mul.f32 %r2039, %r866, 0f3DB504F3; + mul.f32 %r2040, %r867, 0f3DB504F3; + mul.f32 %r2041, %r868, 0f3DB504F3; + mul.f32 %r2042, %r869, 0f3DB504F3; + mul.f32 %r2043, %r870, 0f3DB504F3; + mul.f32 %r2044, %r871, 0f3DB504F3; + mul.f32 %r2045, %r872, 0f3DB504F3; + mul.f32 %r2046, %r873, 0f3DB504F3; + mul.f32 %r2047, %r874, 0f3DB504F3; + mul.f32 %r2048, %r875, 0f3DB504F3; + mul.f32 %r2049, %r876, 0f3DB504F3; + mul.f32 %r2050, %r877, 0f3DB504F3; + mul.f32 %r2051, %r878, 0f3DB504F3; + .loc 1 373 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:373:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.ge.s32 %p21, %r73, %r4773; + setp.ge.s32 %p22, %r73, %r4772; + setp.ge.s32 %p23, %r71, %r4773; + setp.ge.s32 %p24, %r71, %r4772; + setp.ge.s32 %p25, %r73, %r4771; + setp.ge.s32 %p26, %r73, %r4770; + setp.ge.s32 %p27, %r71, %r4771; + setp.ge.s32 %p28, %r71, %r4770; + setp.ge.s32 %p29, %r73, %r4769; + setp.ge.s32 %p30, %r73, %r4768; + setp.ge.s32 %p31, %r71, %r4769; + setp.ge.s32 %p32, %r71, %r4768; + setp.ge.s32 %p33, %r73, %r4767; + setp.ge.s32 %p34, %r73, %r4766; + setp.ge.s32 %p35, %r71, %r4767; + setp.ge.s32 %p36, %r71, %r4766; + setp.ge.s32 %p37, %r73, %r4765; + setp.ge.s32 %p38, %r73, %r4764; + setp.ge.s32 %p39, %r71, %r4765; + setp.ge.s32 %p40, %r71, %r4764; + setp.ge.s32 %p41, %r73, %r4763; + setp.ge.s32 %p42, %r73, %r4762; + setp.ge.s32 %p43, %r71, %r4763; + setp.ge.s32 %p44, %r71, %r4762; + setp.ge.s32 %p45, %r73, %r4761; + setp.ge.s32 %p46, %r73, %r4760; + setp.ge.s32 %p47, %r71, %r4761; + setp.ge.s32 %p48, %r71, %r4760; + setp.ge.s32 %p49, %r73, %r4759; + setp.ge.s32 %p50, %r73, %r4758; + setp.ge.s32 %p51, %r71, %r4759; + setp.ge.s32 %p52, %r71, %r4758; + .loc 1 384 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:384:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.gt.s32 %p53, %r4773, 2047; + setp.gt.s32 %p54, %r4771, 2047; + setp.gt.s32 %p55, %r4769, 2047; + setp.gt.s32 %p56, %r4767, 2047; + setp.gt.s32 %p57, %r4765, 2047; + setp.gt.s32 %p58, %r4763, 2047; + setp.gt.s32 %p59, %r4761, 2047; + setp.gt.s32 %p60, %r4759, 2047; + .loc 1 385 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:385:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shr.s32 %r2052, %r4758, 31; + shr.u32 %r2053, %r2052, 21; + add.s32 %r2054, %r4758, %r2053; + and.b32 %r2055, %r2054, -2048; + sub.s32 %r2056, %r4758, %r2055; + shr.s32 %r2057, %r4760, 31; + shr.u32 %r2058, %r2057, 21; + add.s32 %r2059, %r4760, %r2058; + and.b32 %r2060, %r2059, -2048; + sub.s32 %r2061, %r4760, %r2060; + shr.s32 %r2062, %r4762, 31; + shr.u32 %r2063, %r2062, 21; + add.s32 %r2064, %r4762, %r2063; + and.b32 %r2065, %r2064, -2048; + sub.s32 %r2066, %r4762, %r2065; + shr.s32 %r2067, %r4764, 31; + shr.u32 %r2068, %r2067, 21; + add.s32 %r2069, %r4764, %r2068; + and.b32 %r2070, %r2069, -2048; + sub.s32 %r2071, %r4764, %r2070; + shr.s32 %r2072, %r4766, 31; + shr.u32 %r2073, %r2072, 21; + add.s32 %r2074, %r4766, %r2073; + and.b32 %r2075, %r2074, -2048; + sub.s32 %r2076, %r4766, %r2075; + shr.s32 %r2077, %r4768, 31; + shr.u32 %r2078, %r2077, 21; + add.s32 %r2079, %r4768, %r2078; + and.b32 %r2080, %r2079, -2048; + sub.s32 %r2081, %r4768, %r2080; + shr.s32 %r2082, %r4770, 31; + shr.u32 %r2083, %r2082, 21; + add.s32 %r2084, %r4770, %r2083; + and.b32 %r2085, %r2084, -2048; + sub.s32 %r2086, %r4770, %r2085; + shr.s32 %r2087, %r4772, 31; + shr.u32 %r2088, %r2087, 21; + add.s32 %r2089, %r4772, %r2088; + and.b32 %r2090, %r2089, -2048; + sub.s32 %r2091, %r4772, %r2090; + .loc 1 387 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:387:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.ne.b32 %p61, %r2091, 0; + setp.ne.b32 %p62, %r2086, 0; + setp.ne.b32 %p63, %r2081, 0; + setp.ne.b32 %p64, %r2076, 0; + setp.ne.b32 %p65, %r2071, 0; + setp.ne.b32 %p66, %r2066, 0; + setp.ne.b32 %p67, %r2061, 0; + setp.ne.b32 %p68, %r2056, 0; + .loc 1 388 92 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:388:92 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + and.b32 %r2092, %r4759, -2147481601; + and.b32 %r2093, %r4761, -2147481601; + and.b32 %r2094, %r4763, -2147481601; + and.b32 %r2095, %r4765, -2147481601; + and.b32 %r2096, %r4767, -2147481601; + and.b32 %r2097, %r4769, -2147481601; + and.b32 %r2098, %r4771, -2147481601; + and.b32 %r2099, %r4773, -2147481601; + setp.gt.u32 %p69, %r2099, -2147483648; + setp.gt.u32 %p70, %r2098, -2147483648; + setp.gt.u32 %p71, %r2097, -2147483648; + setp.gt.u32 %p72, %r2096, -2147483648; + setp.gt.u32 %p73, %r2095, -2147483648; + setp.gt.u32 %p74, %r2094, -2147483648; + setp.gt.u32 %p75, %r2093, -2147483648; + setp.gt.u32 %p76, %r2092, -2147483648; + .loc 1 394 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:394:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + prmt.b32 %r2100, %r4771, %r4773, 0x5410U; + prmt.b32 %r2101, %r4767, %r4769, 0x5410U; + prmt.b32 %r2102, %r4763, %r4765, 0x5410U; + prmt.b32 %r2103, %r4759, %r4761, 0x5410U; + and.b32 %r2104, %r2103, 134154239; + and.b32 %r2105, %r2102, 134154239; + and.b32 %r2106, %r2101, 134154239; + and.b32 %r2107, %r2100, 134154239; + .loc 1 395 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:395:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mov.b32 {%rs1, %rs2}, %r2107; + cvt.u64.u16 %rd136, %rs2; + cvt.u64.u16 %rd137, %rs1; + mov.b32 {%rs3, %rs4}, %r2106; + cvt.u64.u16 %rd138, %rs4; + cvt.u64.u16 %rd139, %rs3; + mov.b32 {%rs5, %rs6}, %r2105; + cvt.u64.u16 %rd140, %rs6; + cvt.u64.u16 %rd141, %rs5; + mov.b32 {%rs7, %rs8}, %r2104; + cvt.u64.u16 %rd142, %rs8; + cvt.u64.u16 %rd143, %rs7; + setp.gt.s64 %p77, %rd7, %rd143; + setp.gt.s64 %p78, %rd7, %rd142; + setp.gt.s64 %p79, %rd7, %rd141; + setp.gt.s64 %p80, %rd7, %rd140; + setp.gt.s64 %p81, %rd7, %rd139; + setp.gt.s64 %p82, %rd7, %rd138; + setp.gt.s64 %p83, %rd7, %rd137; + setp.gt.s64 %p84, %rd7, %rd136; + .loc 1 392 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:392:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2108, %r2091, 2048; + add.s32 %r2109, %r2086, 2048; + add.s32 %r2110, %r2081, 2048; + add.s32 %r2111, %r2076, 2048; + add.s32 %r2112, %r2071, 2048; + add.s32 %r2113, %r2066, 2048; + add.s32 %r2114, %r2061, 2048; + add.s32 %r2115, %r2056, 2048; + .loc 1 393 39 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:393:39 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.b32 %r2116, %r2115, %r2056, %p68; + selp.b32 %r2117, %r2116, %r2056, %p76; + selp.b32 %r2118, %r2114, %r2061, %p67; + selp.b32 %r2119, %r2118, %r2061, %p75; + selp.b32 %r2120, %r2113, %r2066, %p66; + selp.b32 %r2121, %r2120, %r2066, %p74; + selp.b32 %r2122, %r2112, %r2071, %p65; + selp.b32 %r2123, %r2122, %r2071, %p73; + selp.b32 %r2124, %r2111, %r2076, %p64; + selp.b32 %r2125, %r2124, %r2076, %p72; + selp.b32 %r2126, %r2110, %r2081, %p63; + selp.b32 %r2127, %r2126, %r2081, %p71; + selp.b32 %r2128, %r2109, %r2086, %p62; + selp.b32 %r2129, %r2128, %r2086, %p70; + selp.b32 %r2130, %r2108, %r2091, %p61; + selp.b32 %r2131, %r2130, %r2091, %p69; + .loc 1 395 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:395:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + cvt.s64.s32 %rd144, %r2131; + cvt.s64.s32 %rd145, %r2129; + cvt.s64.s32 %rd146, %r2127; + cvt.s64.s32 %rd147, %r2125; + cvt.s64.s32 %rd148, %r2123; + cvt.s64.s32 %rd149, %r2121; + cvt.s64.s32 %rd150, %r2119; + cvt.s64.s32 %rd151, %r2117; + setp.gt.s64 %p85, %rd7, %rd151; + setp.gt.s64 %p86, %rd7, %rd150; + setp.gt.s64 %p87, %rd7, %rd149; + setp.gt.s64 %p88, %rd7, %rd148; + setp.gt.s64 %p89, %rd7, %rd147; + setp.gt.s64 %p90, %rd7, %rd146; + setp.gt.s64 %p91, %rd7, %rd145; + setp.gt.s64 %p92, %rd7, %rd144; + .loc 1 396 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:396:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + and.pred %p93, %p53, %p84; + and.pred %p94, %p53, %p92; + and.pred %p95, %p54, %p83; + and.pred %p96, %p54, %p91; + and.pred %p97, %p55, %p82; + and.pred %p98, %p55, %p90; + and.pred %p99, %p56, %p81; + and.pred %p100, %p56, %p89; + and.pred %p101, %p57, %p80; + and.pred %p102, %p57, %p88; + and.pred %p103, %p58, %p79; + and.pred %p104, %p58, %p87; + and.pred %p105, %p59, %p78; + and.pred %p106, %p59, %p86; + and.pred %p107, %p60, %p77; + and.pred %p108, %p60, %p85; + .loc 1 397 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:397:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + sub.s32 %r2132, %r4762, %r73; + sub.s32 %r2133, %r4763, %r73; + sub.s32 %r2134, %r4762, %r71; + sub.s32 %r2135, %r4763, %r71; + sub.s32 %r2136, %r4764, %r73; + sub.s32 %r2137, %r4765, %r73; + sub.s32 %r2138, %r4764, %r71; + sub.s32 %r2139, %r4765, %r71; + sub.s32 %r2140, %r4758, %r73; + sub.s32 %r2141, %r4759, %r73; + sub.s32 %r2142, %r4758, %r71; + sub.s32 %r2143, %r4759, %r71; + sub.s32 %r2144, %r4760, %r73; + sub.s32 %r2145, %r4761, %r73; + sub.s32 %r2146, %r4760, %r71; + sub.s32 %r2147, %r4761, %r71; + sub.s32 %r2148, %r4766, %r71; + sub.s32 %r2149, %r4767, %r71; + sub.s32 %r2150, %r4766, %r73; + sub.s32 %r2151, %r4767, %r73; + sub.s32 %r2152, %r4768, %r71; + sub.s32 %r2153, %r4769, %r71; + sub.s32 %r2154, %r4768, %r73; + sub.s32 %r2155, %r4769, %r73; + sub.s32 %r2156, %r4770, %r71; + sub.s32 %r2157, %r4771, %r71; + sub.s32 %r2158, %r4770, %r73; + sub.s32 %r2159, %r4771, %r73; + sub.s32 %r2160, %r4772, %r71; + sub.s32 %r2161, %r4773, %r71; + sub.s32 %r2162, %r4772, %r73; + sub.s32 %r2163, %r4773, %r73; + .loc 1 404 39 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:404:39 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + and.b32 %r2164, %r2163, 2047; + and.b32 %r2165, %r2162, 2047; + and.b32 %r2166, %r2161, 2047; + and.b32 %r2167, %r2160, 2047; + and.b32 %r2168, %r2159, 2047; + and.b32 %r2169, %r2158, 2047; + and.b32 %r2170, %r2157, 2047; + and.b32 %r2171, %r2156, 2047; + and.b32 %r2172, %r2155, 2047; + and.b32 %r2173, %r2154, 2047; + and.b32 %r2174, %r2153, 2047; + and.b32 %r2175, %r2152, 2047; + and.b32 %r2176, %r2151, 2047; + and.b32 %r2177, %r2150, 2047; + and.b32 %r2178, %r2149, 2047; + and.b32 %r2179, %r2148, 2047; + and.b32 %r2180, %r2147, 2047; + and.b32 %r2181, %r2146, 2047; + and.b32 %r2182, %r2145, 2047; + and.b32 %r2183, %r2144, 2047; + and.b32 %r2184, %r2143, 2047; + and.b32 %r2185, %r2142, 2047; + and.b32 %r2186, %r2141, 2047; + and.b32 %r2187, %r2140, 2047; + and.b32 %r2188, %r2139, 2047; + and.b32 %r2189, %r2138, 2047; + and.b32 %r2190, %r2137, 2047; + and.b32 %r2191, %r2136, 2047; + and.b32 %r2192, %r2135, 2047; + and.b32 %r2193, %r2134, 2047; + and.b32 %r2194, %r2133, 2047; + and.b32 %r2195, %r2132, 2047; + .loc 1 405 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:405:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.eq.b32 %p109, %r2195, 0; + selp.b16 %rs9, 1, 0, %p109; + shl.b16 %rs10, %rs9, 2; + setp.eq.b32 %p110, %r2194, 0; + selp.b16 %rs11, -1, 0, %p110; + shl.b16 %rs12, %rs11, 3; + or.b16 %rs13, %rs12, %rs10; + setp.eq.b32 %p111, %r2193, 0; + selp.b16 %rs14, 1, 0, %p111; + setp.eq.b32 %p112, %r2192, 0; + selp.b16 %rs15, -1, 0, %p112; + shl.b16 %rs16, %rs15, 1; + or.b16 %rs17, %rs14, %rs16; + and.b16 %rs18, %rs17, 3; + or.b16 %rs19, %rs18, %rs13; + and.b16 %rs20, %rs19, 15; + shl.b16 %rs21, %rs20, 8; + setp.eq.b32 %p113, %r2191, 0; + selp.b16 %rs22, 1, 0, %p113; + shl.b16 %rs23, %rs22, 2; + setp.eq.b32 %p114, %r2190, 0; + selp.b16 %rs24, -1, 0, %p114; + shl.b16 %rs25, %rs24, 3; + or.b16 %rs26, %rs25, %rs23; + setp.eq.b32 %p115, %r2189, 0; + selp.b16 %rs27, 1, 0, %p115; + setp.eq.b32 %p116, %r2188, 0; + selp.b16 %rs28, -1, 0, %p116; + shl.b16 %rs29, %rs28, 1; + or.b16 %rs30, %rs27, %rs29; + and.b16 %rs31, %rs30, 3; + or.b16 %rs32, %rs31, %rs26; + shl.b16 %rs33, %rs32, 12; + or.b16 %rs34, %rs33, %rs21; + setp.eq.b32 %p117, %r2187, 0; + selp.b16 %rs35, 1, 0, %p117; + shl.b16 %rs36, %rs35, 2; + setp.eq.b32 %p118, %r2186, 0; + selp.b16 %rs37, -1, 0, %p118; + shl.b16 %rs38, %rs37, 3; + or.b16 %rs39, %rs38, %rs36; + setp.eq.b32 %p119, %r2185, 0; + selp.b16 %rs40, 1, 0, %p119; + setp.eq.b32 %p120, %r2184, 0; + selp.b16 %rs41, -1, 0, %p120; + shl.b16 %rs42, %rs41, 1; + or.b16 %rs43, %rs40, %rs42; + and.b16 %rs44, %rs43, 3; + or.b16 %rs45, %rs44, %rs39; + and.b16 %rs46, %rs45, 15; + setp.eq.b32 %p121, %r2183, 0; + selp.b16 %rs47, 1, 0, %p121; + shl.b16 %rs48, %rs47, 2; + setp.eq.b32 %p122, %r2182, 0; + selp.b16 %rs49, -1, 0, %p122; + shl.b16 %rs50, %rs49, 3; + or.b16 %rs51, %rs50, %rs48; + setp.eq.b32 %p123, %r2181, 0; + selp.b16 %rs52, 1, 0, %p123; + setp.eq.b32 %p124, %r2180, 0; + selp.b16 %rs53, -1, 0, %p124; + shl.b16 %rs54, %rs53, 1; + or.b16 %rs55, %rs52, %rs54; + and.b16 %rs56, %rs55, 3; + or.b16 %rs57, %rs56, %rs51; + shl.b16 %rs58, %rs57, 4; + or.b16 %rs59, %rs46, %rs58; + and.b16 %rs60, %rs59, 255; + or.b16 %rs61, %rs60, %rs34; + cvt.u32.u16 %r2196, %rs61; + setp.eq.b32 %p125, %r2179, 0; + setp.eq.b32 %p126, %r2178, 0; + setp.eq.b32 %p127, %r2177, 0; + setp.eq.b32 %p128, %r2176, 0; + setp.eq.b32 %p129, %r2175, 0; + setp.eq.b32 %p130, %r2174, 0; + setp.eq.b32 %p131, %r2173, 0; + setp.eq.b32 %p132, %r2172, 0; + setp.eq.b32 %p133, %r2171, 0; + setp.eq.b32 %p134, %r2170, 0; + setp.eq.b32 %p135, %r2169, 0; + setp.eq.b32 %p136, %r2168, 0; + setp.eq.b32 %p137, %r2167, 0; + setp.eq.b32 %p138, %r2166, 0; + setp.eq.b32 %p139, %r2165, 0; + setp.eq.b32 %p140, %r2164, 0; + .loc 1 406 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:406:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shr.u32 %r2197, %r2196, 15; + and.b32 %r2198, %r2197, 1; + setp.ne.b32 %p141, %r2198, 0; + shr.u32 %r2199, %r2196, 14; + and.b32 %r2200, %r2199, 1; + setp.ne.b32 %p142, %r2200, 0; + shr.u32 %r2201, %r2196, 13; + and.b32 %r2202, %r2201, 1; + setp.ne.b32 %p143, %r2202, 0; + shr.u32 %r2203, %r2196, 12; + and.b32 %r2204, %r2203, 1; + setp.ne.b32 %p144, %r2204, 0; + shr.u32 %r2205, %r2196, 11; + and.b32 %r2206, %r2205, 1; + setp.ne.b32 %p145, %r2206, 0; + shr.u32 %r2207, %r2196, 10; + and.b32 %r2208, %r2207, 1; + setp.ne.b32 %p146, %r2208, 0; + shr.u32 %r2209, %r2196, 9; + and.b32 %r2210, %r2209, 1; + setp.ne.b32 %p147, %r2210, 0; + shr.u32 %r2211, %r2196, 8; + and.b32 %r2212, %r2211, 1; + setp.ne.b32 %p148, %r2212, 0; + shr.u32 %r2213, %r2196, 7; + and.b32 %r2214, %r2213, 1; + setp.ne.b32 %p149, %r2214, 0; + shr.u32 %r2215, %r2196, 6; + and.b32 %r2216, %r2215, 1; + setp.ne.b32 %p150, %r2216, 0; + shr.u32 %r2217, %r2196, 5; + and.b32 %r2218, %r2217, 1; + setp.ne.b32 %p151, %r2218, 0; + shr.u32 %r2219, %r2196, 4; + and.b32 %r2220, %r2219, 1; + setp.ne.b32 %p152, %r2220, 0; + shr.u32 %r2221, %r2196, 3; + and.b32 %r2222, %r2221, 1; + setp.ne.b32 %p153, %r2222, 0; + shr.u32 %r2223, %r2196, 2; + and.b32 %r2224, %r2223, 1; + setp.ne.b32 %p154, %r2224, 0; + shr.u32 %r2225, %r2196, 1; + and.b32 %r2226, %r2225, 1; + setp.ne.b32 %p155, %r2226, 0; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2227, %r2020, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2228, %r2227, 0fFF800000, %p93; + selp.f32 %r2229, %r2228, 0fFF800000, %p140; + selp.f32 %r2230, %r2227, %r2229, %p21; + selp.f32 %r2231, %r2230, %r2229, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2232, %r2021, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2233, %r2232, 0fFF800000, %p94; + selp.f32 %r2234, %r2233, 0fFF800000, %p139; + selp.f32 %r2235, %r2232, %r2234, %p22; + selp.f32 %r2236, %r2235, %r2234, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2237, %r2022, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2238, %r2237, 0fFF800000, %p93; + selp.f32 %r2239, %r2238, 0fFF800000, %p138; + selp.f32 %r2240, %r2237, %r2239, %p23; + selp.f32 %r2241, %r2240, %r2239, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2242, %r2023, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2243, %r2242, 0fFF800000, %p94; + selp.f32 %r2244, %r2243, 0fFF800000, %p137; + selp.f32 %r2245, %r2242, %r2244, %p24; + selp.f32 %r2246, %r2245, %r2244, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2247, %r2024, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2248, %r2247, 0fFF800000, %p95; + selp.f32 %r2249, %r2248, 0fFF800000, %p136; + selp.f32 %r2250, %r2247, %r2249, %p25; + selp.f32 %r2251, %r2250, %r2249, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2252, %r2025, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2253, %r2252, 0fFF800000, %p96; + selp.f32 %r2254, %r2253, 0fFF800000, %p135; + selp.f32 %r2255, %r2252, %r2254, %p26; + selp.f32 %r2256, %r2255, %r2254, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2257, %r2026, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2258, %r2257, 0fFF800000, %p95; + selp.f32 %r2259, %r2258, 0fFF800000, %p134; + selp.f32 %r2260, %r2257, %r2259, %p27; + selp.f32 %r2261, %r2260, %r2259, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2262, %r2027, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2263, %r2262, 0fFF800000, %p96; + selp.f32 %r2264, %r2263, 0fFF800000, %p133; + selp.f32 %r2265, %r2262, %r2264, %p28; + selp.f32 %r2266, %r2265, %r2264, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2267, %r2028, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2268, %r2267, 0fFF800000, %p97; + selp.f32 %r2269, %r2268, 0fFF800000, %p132; + selp.f32 %r2270, %r2267, %r2269, %p29; + selp.f32 %r2271, %r2270, %r2269, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2272, %r2029, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2273, %r2272, 0fFF800000, %p98; + selp.f32 %r2274, %r2273, 0fFF800000, %p131; + selp.f32 %r2275, %r2272, %r2274, %p30; + selp.f32 %r2276, %r2275, %r2274, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2277, %r2030, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2278, %r2277, 0fFF800000, %p97; + selp.f32 %r2279, %r2278, 0fFF800000, %p130; + selp.f32 %r2280, %r2277, %r2279, %p31; + selp.f32 %r2281, %r2280, %r2279, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2282, %r2031, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2283, %r2282, 0fFF800000, %p98; + selp.f32 %r2284, %r2283, 0fFF800000, %p129; + selp.f32 %r2285, %r2282, %r2284, %p32; + selp.f32 %r2286, %r2285, %r2284, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2287, %r2032, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2288, %r2287, 0fFF800000, %p99; + selp.f32 %r2289, %r2288, 0fFF800000, %p128; + selp.f32 %r2290, %r2287, %r2289, %p33; + selp.f32 %r2291, %r2290, %r2289, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2292, %r2033, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2293, %r2292, 0fFF800000, %p100; + selp.f32 %r2294, %r2293, 0fFF800000, %p127; + selp.f32 %r2295, %r2292, %r2294, %p34; + selp.f32 %r2296, %r2295, %r2294, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2297, %r2034, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2298, %r2297, 0fFF800000, %p99; + selp.f32 %r2299, %r2298, 0fFF800000, %p126; + selp.f32 %r2300, %r2297, %r2299, %p35; + selp.f32 %r2301, %r2300, %r2299, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2302, %r2035, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2303, %r2302, 0fFF800000, %p100; + selp.f32 %r2304, %r2303, 0fFF800000, %p125; + selp.f32 %r2305, %r2302, %r2304, %p36; + selp.f32 %r2306, %r2305, %r2304, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2307, %r2036, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2308, %r2307, 0fFF800000, %p101; + selp.f32 %r2309, %r2308, 0fFF800000, %p141; + selp.f32 %r2310, %r2307, %r2309, %p37; + selp.f32 %r2311, %r2310, %r2309, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2312, %r2037, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2313, %r2312, 0fFF800000, %p102; + selp.f32 %r2314, %r2313, 0fFF800000, %p142; + selp.f32 %r2315, %r2312, %r2314, %p38; + selp.f32 %r2316, %r2315, %r2314, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2317, %r2038, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2318, %r2317, 0fFF800000, %p101; + selp.f32 %r2319, %r2318, 0fFF800000, %p143; + selp.f32 %r2320, %r2317, %r2319, %p39; + selp.f32 %r2321, %r2320, %r2319, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2322, %r2039, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2323, %r2322, 0fFF800000, %p102; + selp.f32 %r2324, %r2323, 0fFF800000, %p144; + selp.f32 %r2325, %r2322, %r2324, %p40; + selp.f32 %r2326, %r2325, %r2324, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2327, %r2040, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2328, %r2327, 0fFF800000, %p103; + selp.f32 %r2329, %r2328, 0fFF800000, %p145; + selp.f32 %r2330, %r2327, %r2329, %p41; + selp.f32 %r2331, %r2330, %r2329, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2332, %r2041, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2333, %r2332, 0fFF800000, %p104; + selp.f32 %r2334, %r2333, 0fFF800000, %p146; + selp.f32 %r2335, %r2332, %r2334, %p42; + selp.f32 %r2336, %r2335, %r2334, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2337, %r2042, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2338, %r2337, 0fFF800000, %p103; + selp.f32 %r2339, %r2338, 0fFF800000, %p147; + selp.f32 %r2340, %r2337, %r2339, %p43; + selp.f32 %r2341, %r2340, %r2339, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2342, %r2043, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2343, %r2342, 0fFF800000, %p104; + selp.f32 %r2344, %r2343, 0fFF800000, %p148; + selp.f32 %r2345, %r2342, %r2344, %p44; + selp.f32 %r2346, %r2345, %r2344, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2347, %r2044, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2348, %r2347, 0fFF800000, %p105; + selp.f32 %r2349, %r2348, 0fFF800000, %p149; + selp.f32 %r2350, %r2347, %r2349, %p45; + selp.f32 %r2351, %r2350, %r2349, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2352, %r2045, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2353, %r2352, 0fFF800000, %p106; + selp.f32 %r2354, %r2353, 0fFF800000, %p150; + selp.f32 %r2355, %r2352, %r2354, %p46; + selp.f32 %r2356, %r2355, %r2354, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2357, %r2046, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2358, %r2357, 0fFF800000, %p105; + selp.f32 %r2359, %r2358, 0fFF800000, %p151; + selp.f32 %r2360, %r2357, %r2359, %p47; + selp.f32 %r2361, %r2360, %r2359, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2362, %r2047, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2363, %r2362, 0fFF800000, %p106; + selp.f32 %r2364, %r2363, 0fFF800000, %p152; + selp.f32 %r2365, %r2362, %r2364, %p48; + selp.f32 %r2366, %r2365, %r2364, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2367, %r2048, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2368, %r2367, 0fFF800000, %p107; + selp.f32 %r2369, %r2368, 0fFF800000, %p153; + selp.f32 %r2370, %r2367, %r2369, %p49; + selp.f32 %r2371, %r2370, %r2369, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2372, %r2049, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2373, %r2372, 0fFF800000, %p108; + selp.f32 %r2374, %r2373, 0fFF800000, %p154; + selp.f32 %r2375, %r2372, %r2374, %p50; + selp.f32 %r2376, %r2375, %r2374, %p18; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2377, %r2050, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2378, %r2377, 0fFF800000, %p107; + selp.f32 %r2379, %r2378, 0fFF800000, %p155; + selp.f32 %r2380, %r2377, %r2379, %p51; + selp.f32 %r2381, %r2380, %r2379, %p17; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r2382, %r2051, 0f3FB8AA3B; + .loc 1 414 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:414:69 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2383, %r2382, 0fFF800000, %p108; + selp.f32 %r2384, %r2383, 0fFF800000, %p119; + selp.f32 %r2385, %r2382, %r2384, %p52; + selp.f32 %r2386, %r2385, %r2384, %p17; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + max.f32 %r2387, %r2231, %r2236; + max.f32 %r2388, %r2241, %r2246; + max.f32 %r2389, %r2387, %r2251; + max.f32 %r2390, %r2389, %r2256; + max.f32 %r2391, %r2388, %r2261; + max.f32 %r2392, %r2391, %r2266; + max.f32 %r2393, %r2390, %r2271; + max.f32 %r2394, %r2393, %r2276; + max.f32 %r2395, %r2392, %r2281; + max.f32 %r2396, %r2395, %r2286; + max.f32 %r2397, %r2394, %r2291; + max.f32 %r2398, %r2397, %r2296; + max.f32 %r2399, %r2396, %r2301; + max.f32 %r2400, %r2399, %r2306; + max.f32 %r2401, %r2398, %r2311; + max.f32 %r2402, %r2401, %r2316; + max.f32 %r2403, %r2400, %r2321; + max.f32 %r2404, %r2403, %r2326; + max.f32 %r2405, %r2402, %r2331; + max.f32 %r2406, %r2405, %r2336; + max.f32 %r2407, %r2404, %r2341; + max.f32 %r2408, %r2407, %r2346; + max.f32 %r2409, %r2406, %r2351; + max.f32 %r2410, %r2409, %r2356; + max.f32 %r2411, %r2408, %r2361; + max.f32 %r2412, %r2411, %r2366; + max.f32 %r2413, %r2410, %r2371; + max.f32 %r2414, %r2413, %r2376; + max.f32 %r2415, %r2412, %r2381; + max.f32 %r2416, %r2415, %r2386; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2417, %r2414, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + max.f32 %r2418, %r2414, %r2417; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2419, %r2418, 1, 31, -1; + shfl.sync.bfly.b32 %r2420, %r2416, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + max.f32 %r2421, %r2416, %r2420; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2422, %r2421, 1, 31, -1; + cvt.u64.u32 %rd152, %r2419; + cvt.u64.u32 %rd153, %r2422; + shl.b64 %rd154, %rd153, 32; + or.b64 %rd155, %rd152, %rd154; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mov.b64 {%r2423, %r2424}, %rd155; + max.f32 %r2425, %r2418, %r2423; + max.f32 %r2426, %r2421, %r2424; + .loc 1 421 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:421:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mov.b64 {%r2427, %r2428}, %rd284; + max.f32 %r2429, %r2428, %r2426; + max.f32 %r2430, %r2427, %r2425; + mov.b64 %rd284, {%r2430, %r2429}; + .loc 1 423 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:423:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.eq.f32 %p156, %r2430, 0fFF800000; + setp.eq.f32 %p157, %r2429, 0fFF800000; + .loc 1 424 51 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:424:51 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + selp.f32 %r2431, 0f00000000, %r2430, %p156; + selp.f32 %r2432, 0f00000000, %r2429, %p157; + .loc 1 428 31 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:428:31 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + sub.f32 %r2433, %r2427, %r2431; + sub.f32 %r2434, %r2428, %r2432; + .loc 1 428 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:428:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + ex2.approx.ftz.f32 %r2435, %r2433; + ex2.approx.ftz.f32 %r2436, %r2434; + .loc 1 429 39 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:429:39 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + sub.f32 %r2437, %r2231, %r2431; + sub.f32 %r2438, %r2236, %r2431; + sub.f32 %r2439, %r2241, %r2432; + sub.f32 %r2440, %r2246, %r2432; + sub.f32 %r2441, %r2251, %r2431; + sub.f32 %r2442, %r2256, %r2431; + sub.f32 %r2443, %r2261, %r2432; + sub.f32 %r2444, %r2266, %r2432; + sub.f32 %r2445, %r2271, %r2431; + sub.f32 %r2446, %r2276, %r2431; + sub.f32 %r2447, %r2281, %r2432; + sub.f32 %r2448, %r2286, %r2432; + sub.f32 %r2449, %r2291, %r2431; + sub.f32 %r2450, %r2296, %r2431; + sub.f32 %r2451, %r2301, %r2432; + sub.f32 %r2452, %r2306, %r2432; + sub.f32 %r2453, %r2311, %r2431; + sub.f32 %r2454, %r2316, %r2431; + sub.f32 %r2455, %r2321, %r2432; + sub.f32 %r2456, %r2326, %r2432; + sub.f32 %r2457, %r2331, %r2431; + sub.f32 %r2458, %r2336, %r2431; + sub.f32 %r2459, %r2341, %r2432; + sub.f32 %r2460, %r2346, %r2432; + sub.f32 %r2461, %r2351, %r2431; + sub.f32 %r2462, %r2356, %r2431; + sub.f32 %r2463, %r2361, %r2432; + sub.f32 %r2464, %r2366, %r2432; + sub.f32 %r2465, %r2371, %r2431; + sub.f32 %r2466, %r2376, %r2431; + sub.f32 %r2467, %r2381, %r2432; + sub.f32 %r2468, %r2386, %r2432; + .loc 1 429 21 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:429:21 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + ex2.approx.ftz.f32 %r2469, %r2437; + ex2.approx.ftz.f32 %r2470, %r2438; + ex2.approx.ftz.f32 %r2471, %r2439; + ex2.approx.ftz.f32 %r2472, %r2440; + ex2.approx.ftz.f32 %r2473, %r2441; + ex2.approx.ftz.f32 %r2474, %r2442; + ex2.approx.ftz.f32 %r2475, %r2443; + ex2.approx.ftz.f32 %r2476, %r2444; + ex2.approx.ftz.f32 %r2477, %r2445; + ex2.approx.ftz.f32 %r2478, %r2446; + ex2.approx.ftz.f32 %r2479, %r2447; + ex2.approx.ftz.f32 %r2480, %r2448; + ex2.approx.ftz.f32 %r2481, %r2449; + ex2.approx.ftz.f32 %r2482, %r2450; + ex2.approx.ftz.f32 %r2483, %r2451; + ex2.approx.ftz.f32 %r2484, %r2452; + ex2.approx.ftz.f32 %r2485, %r2453; + ex2.approx.ftz.f32 %r2486, %r2454; + ex2.approx.ftz.f32 %r2487, %r2455; + ex2.approx.ftz.f32 %r2488, %r2456; + ex2.approx.ftz.f32 %r2489, %r2457; + ex2.approx.ftz.f32 %r2490, %r2458; + ex2.approx.ftz.f32 %r2491, %r2459; + ex2.approx.ftz.f32 %r2492, %r2460; + ex2.approx.ftz.f32 %r2493, %r2461; + ex2.approx.ftz.f32 %r2494, %r2462; + ex2.approx.ftz.f32 %r2495, %r2463; + ex2.approx.ftz.f32 %r2496, %r2464; + ex2.approx.ftz.f32 %r2497, %r2465; + ex2.approx.ftz.f32 %r2498, %r2466; + ex2.approx.ftz.f32 %r2499, %r2467; + ex2.approx.ftz.f32 %r2500, %r2468; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.f32 %r2501, %r2469, %r2470; + add.f32 %r2502, %r2471, %r2472; + add.f32 %r2503, %r2501, %r2473; + add.f32 %r2504, %r2503, %r2474; + add.f32 %r2505, %r2502, %r2475; + add.f32 %r2506, %r2505, %r2476; + add.f32 %r2507, %r2504, %r2477; + add.f32 %r2508, %r2507, %r2478; + add.f32 %r2509, %r2506, %r2479; + add.f32 %r2510, %r2509, %r2480; + add.f32 %r2511, %r2508, %r2481; + add.f32 %r2512, %r2511, %r2482; + add.f32 %r2513, %r2510, %r2483; + add.f32 %r2514, %r2513, %r2484; + add.f32 %r2515, %r2512, %r2485; + add.f32 %r2516, %r2515, %r2486; + add.f32 %r2517, %r2514, %r2487; + add.f32 %r2518, %r2517, %r2488; + add.f32 %r2519, %r2516, %r2489; + add.f32 %r2520, %r2519, %r2490; + add.f32 %r2521, %r2518, %r2491; + add.f32 %r2522, %r2521, %r2492; + add.f32 %r2523, %r2520, %r2493; + add.f32 %r2524, %r2523, %r2494; + add.f32 %r2525, %r2522, %r2495; + add.f32 %r2526, %r2525, %r2496; + add.f32 %r2527, %r2524, %r2497; + add.f32 %r2528, %r2527, %r2498; + add.f32 %r2529, %r2526, %r2499; + add.f32 %r2530, %r2529, %r2500; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2531, %r2528, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.f32 %r2532, %r2528, %r2531; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2533, %r2532, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.f32 %r2534, %r2532, %r2533; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2535, %r2530, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.f32 %r2536, %r2530, %r2535; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shfl.sync.bfly.b32 %r2537, %r2536, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.f32 %r2538, %r2536, %r2537; + .loc 1 434 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:434:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + fma.rn.f32 %r4974, %r4974, %r2435, %r2534; + fma.rn.f32 %r4975, %r4975, %r2436, %r2538; + .loc 1 436 16 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:436:16 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.f32 %r1567, %r1567, %r2435; + mul.f32 %r1568, %r1568, %r2435; + mul.f32 %r1569, %r1569, %r2436; + mul.f32 %r1570, %r1570, %r2436; + mul.f32 %r1571, %r1571, %r2435; + mul.f32 %r1572, %r1572, %r2435; + mul.f32 %r1573, %r1573, %r2436; + mul.f32 %r1574, %r1574, %r2436; + mul.f32 %r1575, %r1575, %r2435; + mul.f32 %r1576, %r1576, %r2435; + mul.f32 %r1577, %r1577, %r2436; + mul.f32 %r1578, %r1578, %r2436; + mul.f32 %r1579, %r1579, %r2435; + mul.f32 %r1580, %r1580, %r2435; + mul.f32 %r1581, %r1581, %r2436; + mul.f32 %r1582, %r1582, %r2436; + mul.f32 %r1583, %r1583, %r2435; + mul.f32 %r1584, %r1584, %r2435; + mul.f32 %r1585, %r1585, %r2436; + mul.f32 %r1586, %r1586, %r2436; + mul.f32 %r1587, %r1587, %r2435; + mul.f32 %r1588, %r1588, %r2435; + mul.f32 %r1589, %r1589, %r2436; + mul.f32 %r1590, %r1590, %r2436; + mul.f32 %r1591, %r1591, %r2435; + mul.f32 %r1592, %r1592, %r2435; + mul.f32 %r1593, %r1593, %r2436; + mul.f32 %r1594, %r1594, %r2436; + mul.f32 %r1595, %r1595, %r2435; + mul.f32 %r1596, %r1596, %r2435; + mul.f32 %r1597, %r1597, %r2436; + mul.f32 %r1598, %r1598, %r2436; + mul.f32 %r1599, %r1599, %r2435; + mul.f32 %r1600, %r1600, %r2435; + mul.f32 %r1601, %r1601, %r2436; + mul.f32 %r1602, %r1602, %r2436; + mul.f32 %r1603, %r1603, %r2435; + mul.f32 %r1604, %r1604, %r2435; + mul.f32 %r1605, %r1605, %r2436; + mul.f32 %r1606, %r1606, %r2436; + mul.f32 %r1607, %r1607, %r2435; + mul.f32 %r1608, %r1608, %r2435; + mul.f32 %r1609, %r1609, %r2436; + mul.f32 %r1610, %r1610, %r2436; + mul.f32 %r1611, %r1611, %r2435; + mul.f32 %r1612, %r1612, %r2435; + mul.f32 %r1613, %r1613, %r2436; + mul.f32 %r1614, %r1614, %r2436; + mul.f32 %r1615, %r1615, %r2435; + mul.f32 %r1616, %r1616, %r2435; + mul.f32 %r1617, %r1617, %r2436; + mul.f32 %r1618, %r1618, %r2436; + mul.f32 %r1619, %r1619, %r2435; + mul.f32 %r1620, %r1620, %r2435; + mul.f32 %r1621, %r1621, %r2436; + mul.f32 %r1622, %r1622, %r2436; + mul.f32 %r1623, %r1623, %r2435; + mul.f32 %r1624, %r1624, %r2435; + mul.f32 %r1625, %r1625, %r2436; + mul.f32 %r1626, %r1626, %r2436; + mul.f32 %r1627, %r1627, %r2435; + mul.f32 %r1628, %r1628, %r2435; + mul.f32 %r1629, %r1629, %r2436; + mul.f32 %r1630, %r1630, %r2436; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2539, %r722, 49152; + add.s32 %r2540, %r2539, %r1984; + .loc 1 440 22 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:440:22 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + cvt.rn.bf16x2.f32 %r1563, %r2470, %r2469; + cvt.rn.bf16x2.f32 %r1564, %r2472, %r2471; + cvt.rn.bf16x2.f32 %r1565, %r2474, %r2473; + cvt.rn.bf16x2.f32 %r1566, %r2476, %r2475; + cvt.rn.bf16x2.f32 %r1695, %r2478, %r2477; + cvt.rn.bf16x2.f32 %r1696, %r2480, %r2479; + cvt.rn.bf16x2.f32 %r1697, %r2482, %r2481; + cvt.rn.bf16x2.f32 %r1698, %r2484, %r2483; + cvt.rn.bf16x2.f32 %r1827, %r2486, %r2485; + cvt.rn.bf16x2.f32 %r1828, %r2488, %r2487; + cvt.rn.bf16x2.f32 %r1829, %r2490, %r2489; + cvt.rn.bf16x2.f32 %r1830, %r2492, %r2491; + cvt.rn.bf16x2.f32 %r1959, %r2494, %r2493; + cvt.rn.bf16x2.f32 %r1960, %r2496, %r2495; + cvt.rn.bf16x2.f32 %r1961, %r2498, %r2497; + cvt.rn.bf16x2.f32 %r1962, %r2500, %r2499; + .loc 1 440 44 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:440:44 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + wgmma.fence.sync.aligned; + bfe.u32 %r2541, %r2540, 4, 14; + cvt.u64.u32 %rd156, %r2541; + or.b64 %rd102, %rd156, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1563,%r1564,%r1565,%r1566}, %rd102, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2542, %r2540, 2048; + bfe.u32 %r2543, %r2542, 4, 14; + cvt.u64.u32 %rd157, %r2543; + or.b64 %rd103, %rd157, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1695,%r1696,%r1697,%r1698}, %rd103, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2544, %r2540, 4096; + bfe.u32 %r2545, %r2544, 4, 14; + cvt.u64.u32 %rd158, %r2545; + or.b64 %rd104, %rd158, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1827,%r1828,%r1829,%r1830}, %rd104, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2546, %r2540, 6144; + bfe.u32 %r2547, %r2546, 4, 14; + cvt.u64.u32 %rd159, %r2547; + or.b64 %rd105, %rd159, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1959,%r1960,%r1961,%r1962}, %rd105, %p4, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:548:26 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r4758, %r4687, %r4758; + add.s32 %r4759, %r4687, %r4759; + add.s32 %r4760, %r4687, %r4760; + add.s32 %r4761, %r4687, %r4761; + add.s32 %r4762, %r4687, %r4762; + add.s32 %r4763, %r4687, %r4763; + add.s32 %r4764, %r4687, %r4764; + add.s32 %r4765, %r4687, %r4765; + add.s32 %r4766, %r4687, %r4766; + add.s32 %r4767, %r4687, %r4767; + add.s32 %r4768, %r4687, %r4768; + add.s32 %r4769, %r4687, %r4769; + add.s32 %r4770, %r4687, %r4770; + add.s32 %r4771, %r4687, %r4771; + add.s32 %r4772, %r4687, %r4772; + add.s32 %r4773, %r4687, %r4773; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r273, %r4757, 1; + .loc 1 247 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:247:33 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shr.u32 %r2548, %r273, 1; + .loc 1 248 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:248:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mad.wide.u32 %rd107, %r2548, 4, %rd39; + .loc 1 248 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:248:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + // begin inline asm + mov.u64 %rd106, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd106, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1963, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1963 }, [ %rd107 + 0 ], %rd106; + // end inline asm + .loc 1 249 109 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:109 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2549, %r2548, 1; + .loc 1 249 113 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:113 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.lt.s32 %p158, %r2549, %r662; + .loc 1 249 55 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:55 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd110, %rd107, 4; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + and.pred %p16, %p15, %p158; + .loc 1 249 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + // begin inline asm + mov.u64 %rd109, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd109, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1964, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1964 }, [ %rd110 + 0 ], %rd109; + // end inline asm + .loc 1 250 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:250:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + and.b32 %r2550, %r4757, 1; + .loc 1 251 34 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:34 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + sub.s32 %r2551, %r1964, %r1963; + .loc 1 251 48 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:48 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r2552, %r2551, 7; + .loc 1 251 63 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:63 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2553, %r2552, -64; + .loc 1 252 29 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:29 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + xor.b32 %r2554, %r2550, 1; + .loc 1 252 61 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:61 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r2555, %r2550, 6; + .loc 1 252 42 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:42 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mad.lo.s32 %r4687, %r2553, %r2554, %r2555; + .loc 1 549 21 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:549:21 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r4690, %r4687, %r4690; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2556, %r4689, 1; + setp.gt.s32 %p159, %r2556, 2; + selp.b32 %r4689, 0, %r2556, %p159; + .loc 1 342 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:342:32 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2557, %r4690, %r31; + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2558, %r2557, %r8; + add.s32 %r2559, %r2557, %r9; + add.s32 %r2560, %r2557, %r10; + add.s32 %r2561, %r2557, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r2562, %r2558, 7; + shl.b32 %r2563, %r2559, 7; + shl.b32 %r2564, %r2560, 7; + shl.b32 %r2565, %r2561, 7; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + mul.wide.s32 %rd160, %r2562, 2; + add.s64 %rd112, %rd10, %rd160; + mul.wide.s32 %rd161, %r2563, 2; + add.s64 %rd113, %rd10, %rd161; + mul.wide.s32 %rd162, %r2564, 2; + add.s64 %rd114, %rd10, %rd162; + mul.wide.s32 %rd163, %r2565, 2; + add.s64 %rd115, %rd10, %rd163; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + shl.b32 %r2566, %r4689, 14; + add.s32 %r2567, %r722, %r2566; + bar.sync 0; + add.s32 %r1965, %r2567, %r35; + selp.b32 %r1966, 16, 0, %p19; + // begin inline asm + cp.async.cg.shared.global [ %r1965 + 0 ], [ %rd112 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1967, %r1965, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r1967 + 0 ], [ %rd113 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1969, %r1965, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r1969 + 0 ], [ %rd114 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1971, %r1965, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r1971 + 0 ], [ %rd115 + 0 ], 0x10, %r1966; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s64 %rd116, %rd11, %rd160; + add.s64 %rd117, %rd11, %rd161; + add.s64 %rd118, %rd11, %rd162; + add.s64 %rd119, %rd11, %rd163; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + add.s32 %r2568, %r2539, %r2566; + add.s32 %r1973, %r2568, %r35; + // begin inline asm + cp.async.cg.shared.global [ %r1973 + 0 ], [ %rd116 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1975, %r1973, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r1975 + 0 ], [ %rd117 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1977, %r1973, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r1977 + 0 ], [ %rd118 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1979, %r1973, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r1979 + 0 ], [ %rd119 + 0 ], 0x10, %r1966; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + setp.ne.b32 %p160, %r52, %r273; + mov.b32 %r4757, %r273; + @%p160 bra $L__BB0_2; +$L__BB0_3: // %._crit_edge + .loc 1 0 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:0:40 + ld.param.b64 %rd30, [triton_tem_fused_0_param_10]; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:172:41 ] + // begin inline asm + // wait for regs: %r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp6: + .loc 1 181 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:181:35 + shl.b64 %rd182, %rd4, 2; + add.s64 %rd164, %rd29, %rd182; + .loc 1 182 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:182:27 + // begin inline asm + mov.u32 %r2697, 0x0; + ld.global.b32 { %r2697 }, [ %rd164 + 0 ]; + // end inline asm + .loc 1 182 41 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:182:41 + shl.b32 %r343, %r2697, 7; + .loc 1 183 51 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:183:51 + shl.b64 %rd183, %rd6, 2; + add.s64 %rd165, %rd28, %rd183; + .loc 1 183 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:183:32 + // begin inline asm + mov.u32 %r2698, 0x0; + ld.global.b32 { %r2698 }, [ %rd165 + 0 ]; + // end inline asm + .loc 1 184 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:184:49 + shl.b32 %r345, %r2698, 1; +$L__tmp7: + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.lt.s32 %p161, %r345, 1; + setp.gt.s32 %p162, %r345, 0; + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + or.b32 %r2731, %r343, %r8; + or.b32 %r2732, %r343, %r9; + or.b32 %r2733, %r343, %r10; + or.b32 %r2734, %r343, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r2735, %r2731, 7; + shl.b32 %r2736, %r2732, 7; + shl.b32 %r2737, %r2733, 7; + shl.b32 %r2738, %r2734, 7; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.wide.s32 %rd184, %r2735, 2; + add.s64 %rd185, %rd1, %rd184; + mul.wide.s32 %rd186, %r2736, 2; + add.s64 %rd187, %rd1, %rd186; + mul.wide.s32 %rd188, %r2737, 2; + add.s64 %rd189, %rd1, %rd188; + mul.wide.s32 %rd190, %r2738, 2; + add.s64 %rd191, %rd1, %rd190; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b64 %rd192, %rd3, 1; + add.s64 %rd166, %rd185, %rd192; + add.s64 %rd167, %rd187, %rd192; + add.s64 %rd168, %rd189, %rd192; + add.s64 %rd169, %rd191, %rd192; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + selp.b32 %r2700, 16, 0, %p162; + // begin inline asm + cp.async.cg.shared.global [ %r2699 + 0 ], [ %rd166 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2701 + 0 ], [ %rd167 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2703 + 0 ], [ %rd168 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2705 + 0 ], [ %rd169 + 0 ], 0x10, %r2700; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd193, %rd2, %rd184; + add.s64 %rd194, %rd2, %rd186; + add.s64 %rd195, %rd2, %rd188; + add.s64 %rd196, %rd2, %rd190; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd170, %rd193, %rd192; + add.s64 %rd171, %rd194, %rd192; + add.s64 %rd172, %rd195, %rd192; + add.s64 %rd173, %rd196, %rd192; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r671 + 0 ], [ %rd170 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r673 + 0 ], [ %rd171 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r675 + 0 ], [ %rd172 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r677 + 0 ], [ %rd173 + 0 ], 0x10, %r2700; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.gt.s32 %p163, %r345, 1; + .loc 1 342 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:342:32 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + or.b32 %r2739, %r343, 64; + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + or.b32 %r2740, %r2739, %r8; + or.b32 %r2741, %r2739, %r9; + or.b32 %r2742, %r2739, %r10; + or.b32 %r2743, %r2739, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r2744, %r2740, 7; + shl.b32 %r2745, %r2741, 7; + shl.b32 %r2746, %r2742, 7; + shl.b32 %r2747, %r2743, 7; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.wide.s32 %rd197, %r2744, 2; + add.s64 %rd198, %rd1, %rd197; + mul.wide.s32 %rd199, %r2745, 2; + add.s64 %rd200, %rd1, %rd199; + mul.wide.s32 %rd201, %r2746, 2; + add.s64 %rd202, %rd1, %rd201; + mul.wide.s32 %rd203, %r2747, 2; + add.s64 %rd204, %rd1, %rd203; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd174, %rd198, %rd192; + add.s64 %rd175, %rd200, %rd192; + add.s64 %rd176, %rd202, %rd192; + add.s64 %rd177, %rd204, %rd192; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + bar.sync 0; + selp.b32 %r2716, 16, 0, %p163; + // begin inline asm + cp.async.cg.shared.global [ %r679 + 0 ], [ %rd174 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r681 + 0 ], [ %rd175 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r683 + 0 ], [ %rd176 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r685 + 0 ], [ %rd177 + 0 ], 0x10, %r2716; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:20 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd205, %rd2, %rd197; + add.s64 %rd206, %rd2, %rd199; + add.s64 %rd207, %rd2, %rd201; + add.s64 %rd208, %rd2, %rd203; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd178, %rd205, %rd192; + add.s64 %rd179, %rd206, %rd192; + add.s64 %rd180, %rd207, %rd192; + add.s64 %rd181, %rd208, %rd192; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r687 + 0 ], [ %rd178 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r689 + 0 ], [ %rd179 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r691 + 0 ], [ %rd180 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r693 + 0 ], [ %rd181 + 0 ], 0x10, %r2716; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:351:19 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + @%p161 bra $L__BB0_6; +$L__tmp8: +// %bb.4: // %.lr.ph378 + .loc 1 184 69 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:184:69 + min.u32 %r410, %r345, 32; + add.s32 %r411, %r410, -2; + add.s32 %r412, %r410, -1; + mov.b32 %r3367, 0; + mov.b32 %r4842, 64; + mov.b32 %r4841, 1; + mov.b32 %r4840, -1; + mov.b32 %r4907, %r3367; +$L__BB0_5: // %__nv_exp2f.exit + // =>This Inner Loop Header: Depth=1 +$L__tmp9: + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.lt.s32 %p177, %r4907, %r411; + setp.lt.s32 %p175, %r4907, %r412; + add.s32 %r3982, %r4840, 1; + setp.gt.s32 %p178, %r3982, 2; + selp.b32 %r4840, 0, %r3982, %p178; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r3983, %r4840, 14; + add.s32 %r3267, %r722, %r3983; + .loc 1 351 19 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:351:19 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.idx.b32 %r3985, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r3986, %r3985, 11; + and.b32 %r3987, %r3986, 8192; + add.s32 %r3264, %r722, 98304; + add.s32 %r3988, %r3987, %r3264; + bfe.u32 %r3989, %r3988, 4, 14; + cvt.u64.u32 %rd243, %r3989; + or.b64 %rd209, %rd243, 4611686293372403712; + bfe.u32 %r3990, %r3267, 4, 14; + cvt.u64.u32 %rd244, %r3990; + or.b64 %rd210, %rd244, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd209, %rd210, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r3991, %r3988, 32; + bfe.u32 %r3992, %r3991, 4, 14; + cvt.u64.u32 %rd245, %r3992; + or.b64 %rd211, %rd245, 4611686293372403712; + add.s32 %r3993, %r3267, 32; + bfe.u32 %r3994, %r3993, 4, 14; + cvt.u64.u32 %rd246, %r3994; + or.b64 %rd212, %rd246, 4611686293338849280; + mov.pred %p164, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd211, %rd212, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r3995, %r3988, 64; + bfe.u32 %r3996, %r3995, 4, 14; + cvt.u64.u32 %rd247, %r3996; + or.b64 %rd213, %rd247, 4611686293372403712; + add.s32 %r3997, %r3267, 64; + bfe.u32 %r3998, %r3997, 4, 14; + cvt.u64.u32 %rd248, %r3998; + or.b64 %rd214, %rd248, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd213, %rd214, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r3999, %r3988, 96; + bfe.u32 %r4000, %r3999, 4, 14; + cvt.u64.u32 %rd249, %r4000; + or.b64 %rd215, %rd249, 4611686293372403712; + add.s32 %r4001, %r3267, 96; + bfe.u32 %r4002, %r4001, 4, 14; + cvt.u64.u32 %rd250, %r4002; + or.b64 %rd216, %rd250, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd215, %rd216, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4003, %r3988, 16384; + bfe.u32 %r4004, %r4003, 4, 14; + cvt.u64.u32 %rd251, %r4004; + or.b64 %rd217, %rd251, 4611686293372403712; + add.s32 %r4005, %r3267, 8192; + bfe.u32 %r4006, %r4005, 4, 14; + cvt.u64.u32 %rd252, %r4006; + or.b64 %rd218, %rd252, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd217, %rd218, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4007, %r3988, 16416; + bfe.u32 %r4008, %r4007, 4, 14; + cvt.u64.u32 %rd253, %r4008; + or.b64 %rd219, %rd253, 4611686293372403712; + add.s32 %r4009, %r3267, 8224; + bfe.u32 %r4010, %r4009, 4, 14; + cvt.u64.u32 %rd254, %r4010; + or.b64 %rd220, %rd254, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd219, %rd220, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4011, %r3988, 16448; + bfe.u32 %r4012, %r4011, 4, 14; + cvt.u64.u32 %rd255, %r4012; + or.b64 %rd221, %rd255, 4611686293372403712; + add.s32 %r4013, %r3267, 8256; + bfe.u32 %r4014, %r4013, 4, 14; + cvt.u64.u32 %rd256, %r4014; + or.b64 %rd222, %rd256, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd221, %rd222, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4015, %r3988, 16480; + bfe.u32 %r4016, %r4015, 4, 14; + cvt.u64.u32 %rd257, %r4016; + or.b64 %rd223, %rd257, 4611686293372403712; + add.s32 %r4017, %r3267, 8288; + bfe.u32 %r4018, %r4017, 4, 14; + cvt.u64.u32 %rd258, %r4018; + or.b64 %rd224, %rd258, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd223, %rd224, %p164, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3265, %r3367; + mov.b32 %r3266, %r3367; + mov.b32 %r3268, %r3367; + mov.b32 %r3269, %r3367; + // begin inline asm + // wait for regs: %r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879,%r3264,%r3265,%r3266,%r3267,%r3268,%r3269,%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:353:14 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.f32 %r4019, %r2848, 0f3DB504F3; + mul.f32 %r4020, %r2849, 0f3DB504F3; + mul.f32 %r4021, %r2850, 0f3DB504F3; + mul.f32 %r4022, %r2851, 0f3DB504F3; + mul.f32 %r4023, %r2852, 0f3DB504F3; + mul.f32 %r4024, %r2853, 0f3DB504F3; + mul.f32 %r4025, %r2854, 0f3DB504F3; + mul.f32 %r4026, %r2855, 0f3DB504F3; + mul.f32 %r4027, %r2856, 0f3DB504F3; + mul.f32 %r4028, %r2857, 0f3DB504F3; + mul.f32 %r4029, %r2858, 0f3DB504F3; + mul.f32 %r4030, %r2859, 0f3DB504F3; + mul.f32 %r4031, %r2860, 0f3DB504F3; + mul.f32 %r4032, %r2861, 0f3DB504F3; + mul.f32 %r4033, %r2862, 0f3DB504F3; + mul.f32 %r4034, %r2863, 0f3DB504F3; + mul.f32 %r4035, %r2864, 0f3DB504F3; + mul.f32 %r4036, %r2865, 0f3DB504F3; + mul.f32 %r4037, %r2866, 0f3DB504F3; + mul.f32 %r4038, %r2867, 0f3DB504F3; + mul.f32 %r4039, %r2868, 0f3DB504F3; + mul.f32 %r4040, %r2869, 0f3DB504F3; + mul.f32 %r4041, %r2870, 0f3DB504F3; + mul.f32 %r4042, %r2871, 0f3DB504F3; + mul.f32 %r4043, %r2872, 0f3DB504F3; + mul.f32 %r4044, %r2873, 0f3DB504F3; + mul.f32 %r4045, %r2874, 0f3DB504F3; + mul.f32 %r4046, %r2875, 0f3DB504F3; + mul.f32 %r4047, %r2876, 0f3DB504F3; + mul.f32 %r4048, %r2877, 0f3DB504F3; + mul.f32 %r4049, %r2878, 0f3DB504F3; + mul.f32 %r4050, %r2879, 0f3DB504F3; + .loc 1 417 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:417:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.f32 %r4051, %r4019, 0f3FB8AA3B; + mul.f32 %r4052, %r4020, 0f3FB8AA3B; + mul.f32 %r4053, %r4021, 0f3FB8AA3B; + mul.f32 %r4054, %r4022, 0f3FB8AA3B; + mul.f32 %r4055, %r4023, 0f3FB8AA3B; + mul.f32 %r4056, %r4024, 0f3FB8AA3B; + mul.f32 %r4057, %r4025, 0f3FB8AA3B; + mul.f32 %r4058, %r4026, 0f3FB8AA3B; + mul.f32 %r4059, %r4027, 0f3FB8AA3B; + mul.f32 %r4060, %r4028, 0f3FB8AA3B; + mul.f32 %r4061, %r4029, 0f3FB8AA3B; + mul.f32 %r4062, %r4030, 0f3FB8AA3B; + mul.f32 %r4063, %r4031, 0f3FB8AA3B; + mul.f32 %r4064, %r4032, 0f3FB8AA3B; + mul.f32 %r4065, %r4033, 0f3FB8AA3B; + mul.f32 %r4066, %r4034, 0f3FB8AA3B; + mul.f32 %r4067, %r4035, 0f3FB8AA3B; + mul.f32 %r4068, %r4036, 0f3FB8AA3B; + mul.f32 %r4069, %r4037, 0f3FB8AA3B; + mul.f32 %r4070, %r4038, 0f3FB8AA3B; + mul.f32 %r4071, %r4039, 0f3FB8AA3B; + mul.f32 %r4072, %r4040, 0f3FB8AA3B; + mul.f32 %r4073, %r4041, 0f3FB8AA3B; + mul.f32 %r4074, %r4042, 0f3FB8AA3B; + mul.f32 %r4075, %r4043, 0f3FB8AA3B; + mul.f32 %r4076, %r4044, 0f3FB8AA3B; + mul.f32 %r4077, %r4045, 0f3FB8AA3B; + mul.f32 %r4078, %r4046, 0f3FB8AA3B; + mul.f32 %r4079, %r4047, 0f3FB8AA3B; + mul.f32 %r4080, %r4048, 0f3FB8AA3B; + mul.f32 %r4081, %r4049, 0f3FB8AA3B; + mul.f32 %r4082, %r4050, 0f3FB8AA3B; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + max.f32 %r4083, %r4051, %r4052; + max.f32 %r4084, %r4053, %r4054; + max.f32 %r4085, %r4083, %r4055; + max.f32 %r4086, %r4085, %r4056; + max.f32 %r4087, %r4084, %r4057; + max.f32 %r4088, %r4087, %r4058; + max.f32 %r4089, %r4086, %r4059; + max.f32 %r4090, %r4089, %r4060; + max.f32 %r4091, %r4088, %r4061; + max.f32 %r4092, %r4091, %r4062; + max.f32 %r4093, %r4090, %r4063; + max.f32 %r4094, %r4093, %r4064; + max.f32 %r4095, %r4092, %r4065; + max.f32 %r4096, %r4095, %r4066; + max.f32 %r4097, %r4094, %r4067; + max.f32 %r4098, %r4097, %r4068; + max.f32 %r4099, %r4096, %r4069; + max.f32 %r4100, %r4099, %r4070; + max.f32 %r4101, %r4098, %r4071; + max.f32 %r4102, %r4101, %r4072; + max.f32 %r4103, %r4100, %r4073; + max.f32 %r4104, %r4103, %r4074; + max.f32 %r4105, %r4102, %r4075; + max.f32 %r4106, %r4105, %r4076; + max.f32 %r4107, %r4104, %r4077; + max.f32 %r4108, %r4107, %r4078; + max.f32 %r4109, %r4106, %r4079; + max.f32 %r4110, %r4109, %r4080; + max.f32 %r4111, %r4108, %r4081; + max.f32 %r4112, %r4111, %r4082; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4113, %r4110, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + max.f32 %r4114, %r4110, %r4113; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4115, %r4114, 1, 31, -1; + shfl.sync.bfly.b32 %r4116, %r4112, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + max.f32 %r4117, %r4112, %r4116; + .loc 2 189 40 // standard.py:189:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4118, %r4117, 1, 31, -1; + cvt.u64.u32 %rd259, %r4115; + cvt.u64.u32 %rd260, %r4118; + shl.b64 %rd261, %rd260, 32; + or.b64 %rd262, %rd259, %rd261; + .loc 2 168 27 // standard.py:168:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mov.b64 {%r4119, %r4120}, %rd262; + max.f32 %r4121, %r4114, %r4119; + max.f32 %r4122, %r4117, %r4120; + .loc 1 421 27 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:421:27 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mov.b64 {%r4123, %r4124}, %rd284; + max.f32 %r4125, %r4124, %r4122; + max.f32 %r4126, %r4123, %r4121; + mov.b64 %rd284, {%r4126, %r4125}; + .loc 1 423 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:423:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.eq.f32 %p179, %r4126, 0fFF800000; + setp.eq.f32 %p180, %r4125, 0fFF800000; + .loc 1 424 51 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:424:51 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + selp.f32 %r4127, 0f00000000, %r4126, %p179; + selp.f32 %r4128, 0f00000000, %r4125, %p180; + .loc 1 428 31 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:428:31 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + sub.f32 %r4129, %r4123, %r4127; + sub.f32 %r4130, %r4124, %r4128; + .loc 1 428 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:428:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + ex2.approx.ftz.f32 %r4131, %r4129; + ex2.approx.ftz.f32 %r4132, %r4130; + .loc 1 429 39 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:429:39 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + neg.f32 %r4133, %r4127; + fma.rn.f32 %r4134, %r4019, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4135, %r4020, 0f3FB8AA3B, %r4133; + neg.f32 %r4136, %r4128; + fma.rn.f32 %r4137, %r4021, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4138, %r4022, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4139, %r4023, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4140, %r4024, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4141, %r4025, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4142, %r4026, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4143, %r4027, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4144, %r4028, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4145, %r4029, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4146, %r4030, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4147, %r4031, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4148, %r4032, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4149, %r4033, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4150, %r4034, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4151, %r4035, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4152, %r4036, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4153, %r4037, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4154, %r4038, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4155, %r4039, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4156, %r4040, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4157, %r4041, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4158, %r4042, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4159, %r4043, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4160, %r4044, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4161, %r4045, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4162, %r4046, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4163, %r4047, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4164, %r4048, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4165, %r4049, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4166, %r4050, 0f3FB8AA3B, %r4136; + .loc 1 429 21 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:429:21 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + ex2.approx.ftz.f32 %r4167, %r4134; + ex2.approx.ftz.f32 %r4168, %r4135; + ex2.approx.ftz.f32 %r4169, %r4137; + ex2.approx.ftz.f32 %r4170, %r4138; + ex2.approx.ftz.f32 %r4171, %r4139; + ex2.approx.ftz.f32 %r4172, %r4140; + ex2.approx.ftz.f32 %r4173, %r4141; + ex2.approx.ftz.f32 %r4174, %r4142; + ex2.approx.ftz.f32 %r4175, %r4143; + ex2.approx.ftz.f32 %r4176, %r4144; + ex2.approx.ftz.f32 %r4177, %r4145; + ex2.approx.ftz.f32 %r4178, %r4146; + ex2.approx.ftz.f32 %r4179, %r4147; + ex2.approx.ftz.f32 %r4180, %r4148; + ex2.approx.ftz.f32 %r4181, %r4149; + ex2.approx.ftz.f32 %r4182, %r4150; + ex2.approx.ftz.f32 %r4183, %r4151; + ex2.approx.ftz.f32 %r4184, %r4152; + ex2.approx.ftz.f32 %r4185, %r4153; + ex2.approx.ftz.f32 %r4186, %r4154; + ex2.approx.ftz.f32 %r4187, %r4155; + ex2.approx.ftz.f32 %r4188, %r4156; + ex2.approx.ftz.f32 %r4189, %r4157; + ex2.approx.ftz.f32 %r4190, %r4158; + ex2.approx.ftz.f32 %r4191, %r4159; + ex2.approx.ftz.f32 %r4192, %r4160; + ex2.approx.ftz.f32 %r4193, %r4161; + ex2.approx.ftz.f32 %r4194, %r4162; + ex2.approx.ftz.f32 %r4195, %r4163; + ex2.approx.ftz.f32 %r4196, %r4164; + ex2.approx.ftz.f32 %r4197, %r4165; + ex2.approx.ftz.f32 %r4198, %r4166; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.f32 %r4199, %r4167, %r4168; + add.f32 %r4200, %r4169, %r4170; + add.f32 %r4201, %r4199, %r4171; + add.f32 %r4202, %r4201, %r4172; + add.f32 %r4203, %r4200, %r4173; + add.f32 %r4204, %r4203, %r4174; + add.f32 %r4205, %r4202, %r4175; + add.f32 %r4206, %r4205, %r4176; + add.f32 %r4207, %r4204, %r4177; + add.f32 %r4208, %r4207, %r4178; + add.f32 %r4209, %r4206, %r4179; + add.f32 %r4210, %r4209, %r4180; + add.f32 %r4211, %r4208, %r4181; + add.f32 %r4212, %r4211, %r4182; + add.f32 %r4213, %r4210, %r4183; + add.f32 %r4214, %r4213, %r4184; + add.f32 %r4215, %r4212, %r4185; + add.f32 %r4216, %r4215, %r4186; + add.f32 %r4217, %r4214, %r4187; + add.f32 %r4218, %r4217, %r4188; + add.f32 %r4219, %r4216, %r4189; + add.f32 %r4220, %r4219, %r4190; + add.f32 %r4221, %r4218, %r4191; + add.f32 %r4222, %r4221, %r4192; + add.f32 %r4223, %r4220, %r4193; + add.f32 %r4224, %r4223, %r4194; + add.f32 %r4225, %r4222, %r4195; + add.f32 %r4226, %r4225, %r4196; + add.f32 %r4227, %r4224, %r4197; + add.f32 %r4228, %r4227, %r4198; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4229, %r4226, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.f32 %r4230, %r4226, %r4229; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4231, %r4230, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.f32 %r4232, %r4230, %r4231; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4233, %r4228, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.f32 %r4234, %r4228, %r4233; + .loc 2 291 36 // standard.py:291:36 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shfl.sync.bfly.b32 %r4235, %r4234, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.f32 %r4236, %r4234, %r4235; + .loc 1 434 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:434:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + fma.rn.f32 %r4974, %r4974, %r4131, %r4232; + fma.rn.f32 %r4975, %r4975, %r4132, %r4236; + .loc 1 436 16 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:436:16 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.f32 %r1567, %r1567, %r4131; + mul.f32 %r1568, %r1568, %r4131; + mul.f32 %r1569, %r1569, %r4132; + mul.f32 %r1570, %r1570, %r4132; + mul.f32 %r1571, %r1571, %r4131; + mul.f32 %r1572, %r1572, %r4131; + mul.f32 %r1573, %r1573, %r4132; + mul.f32 %r1574, %r1574, %r4132; + mul.f32 %r1575, %r1575, %r4131; + mul.f32 %r1576, %r1576, %r4131; + mul.f32 %r1577, %r1577, %r4132; + mul.f32 %r1578, %r1578, %r4132; + mul.f32 %r1579, %r1579, %r4131; + mul.f32 %r1580, %r1580, %r4131; + mul.f32 %r1581, %r1581, %r4132; + mul.f32 %r1582, %r1582, %r4132; + mul.f32 %r1583, %r1583, %r4131; + mul.f32 %r1584, %r1584, %r4131; + mul.f32 %r1585, %r1585, %r4132; + mul.f32 %r1586, %r1586, %r4132; + mul.f32 %r1587, %r1587, %r4131; + mul.f32 %r1588, %r1588, %r4131; + mul.f32 %r1589, %r1589, %r4132; + mul.f32 %r1590, %r1590, %r4132; + mul.f32 %r1591, %r1591, %r4131; + mul.f32 %r1592, %r1592, %r4131; + mul.f32 %r1593, %r1593, %r4132; + mul.f32 %r1594, %r1594, %r4132; + mul.f32 %r1595, %r1595, %r4131; + mul.f32 %r1596, %r1596, %r4131; + mul.f32 %r1597, %r1597, %r4132; + mul.f32 %r1598, %r1598, %r4132; + mul.f32 %r1599, %r1599, %r4131; + mul.f32 %r1600, %r1600, %r4131; + mul.f32 %r1601, %r1601, %r4132; + mul.f32 %r1602, %r1602, %r4132; + mul.f32 %r1603, %r1603, %r4131; + mul.f32 %r1604, %r1604, %r4131; + mul.f32 %r1605, %r1605, %r4132; + mul.f32 %r1606, %r1606, %r4132; + mul.f32 %r1607, %r1607, %r4131; + mul.f32 %r1608, %r1608, %r4131; + mul.f32 %r1609, %r1609, %r4132; + mul.f32 %r1610, %r1610, %r4132; + mul.f32 %r1611, %r1611, %r4131; + mul.f32 %r1612, %r1612, %r4131; + mul.f32 %r1613, %r1613, %r4132; + mul.f32 %r1614, %r1614, %r4132; + mul.f32 %r1615, %r1615, %r4131; + mul.f32 %r1616, %r1616, %r4131; + mul.f32 %r1617, %r1617, %r4132; + mul.f32 %r1618, %r1618, %r4132; + mul.f32 %r1619, %r1619, %r4131; + mul.f32 %r1620, %r1620, %r4131; + mul.f32 %r1621, %r1621, %r4132; + mul.f32 %r1622, %r1622, %r4132; + mul.f32 %r1623, %r1623, %r4131; + mul.f32 %r1624, %r1624, %r4131; + mul.f32 %r1625, %r1625, %r4132; + mul.f32 %r1626, %r1626, %r4132; + mul.f32 %r1627, %r1627, %r4131; + mul.f32 %r1628, %r1628, %r4131; + mul.f32 %r1629, %r1629, %r4132; + mul.f32 %r1630, %r1630, %r4132; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4237, %r722, 49152; + add.s32 %r4238, %r4237, %r3983; + .loc 1 440 22 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:440:22 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + cvt.rn.bf16x2.f32 %r3564, %r4168, %r4167; + cvt.rn.bf16x2.f32 %r3565, %r4170, %r4169; + cvt.rn.bf16x2.f32 %r3566, %r4172, %r4171; + cvt.rn.bf16x2.f32 %r3567, %r4174, %r4173; + cvt.rn.bf16x2.f32 %r3696, %r4176, %r4175; + cvt.rn.bf16x2.f32 %r3697, %r4178, %r4177; + cvt.rn.bf16x2.f32 %r3698, %r4180, %r4179; + cvt.rn.bf16x2.f32 %r3699, %r4182, %r4181; + cvt.rn.bf16x2.f32 %r3828, %r4184, %r4183; + cvt.rn.bf16x2.f32 %r3829, %r4186, %r4185; + cvt.rn.bf16x2.f32 %r3830, %r4188, %r4187; + cvt.rn.bf16x2.f32 %r3831, %r4190, %r4189; + cvt.rn.bf16x2.f32 %r3960, %r4192, %r4191; + cvt.rn.bf16x2.f32 %r3961, %r4194, %r4193; + cvt.rn.bf16x2.f32 %r3962, %r4196, %r4195; + cvt.rn.bf16x2.f32 %r3963, %r4198, %r4197; + .loc 1 440 44 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:440:44 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4239, %r4238, 4, 14; + cvt.u64.u32 %rd263, %r4239; + or.b64 %rd225, %rd263, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3564,%r3565,%r3566,%r3567}, %rd225, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4240, %r4238, 2048; + bfe.u32 %r4241, %r4240, 4, 14; + cvt.u64.u32 %rd264, %r4241; + or.b64 %rd226, %rd264, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3696,%r3697,%r3698,%r3699}, %rd226, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4242, %r4238, 4096; + bfe.u32 %r4243, %r4242, 4, 14; + cvt.u64.u32 %rd265, %r4243; + or.b64 %rd227, %rd265, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3828,%r3829,%r3830,%r3831}, %rd227, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4244, %r4238, 6144; + bfe.u32 %r4245, %r4244, 4, 14; + cvt.u64.u32 %rd266, %r4245; + or.b64 %rd228, %rd266, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3960,%r3961,%r3962,%r3963}, %rd228, %p164, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r550, %r4907, 1; + .loc 1 247 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:247:33 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shr.u32 %r4246, %r550, 1; + .loc 1 248 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:248:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mad.wide.u32 %rd230, %r4246, 4, %rd164; + .loc 1 248 24 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:248:24 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + mov.u64 %rd229, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd229, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3964, 0x0; + @%p175 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3964 }, [ %rd230 + 0 ], %rd229; + // end inline asm + .loc 1 249 109 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:109 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4247, %r4246, 1; + .loc 1 249 113 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:113 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.lt.s32 %p181, %r4247, %r2698; + .loc 1 249 55 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:55 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd233, %rd230, 4; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + and.pred %p176, %p175, %p181; + .loc 1 249 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:249:25 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + mov.u64 %rd232, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd232, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3965, 0x0; + @%p176 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3965 }, [ %rd233 + 0 ], %rd232; + // end inline asm + .loc 1 250 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:250:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + and.b32 %r4248, %r4907, 1; + .loc 1 251 34 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:34 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + sub.s32 %r4249, %r3965, %r3964; + .loc 1 251 48 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:48 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r4250, %r4249, 7; + .loc 1 251 63 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:251:63 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4251, %r4250, -64; + .loc 1 252 29 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:29 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + xor.b32 %r4252, %r4248, 1; + .loc 1 252 61 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:61 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r4253, %r4248, 6; + .loc 1 252 42 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:252:42 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4254, %r4842, %r4253; + .loc 1 549 21 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:549:21 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mad.lo.s32 %r4842, %r4251, %r4252, %r4254; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4255, %r4841, 1; + setp.gt.s32 %p182, %r4255, 2; + selp.b32 %r4841, 0, %r4255, %p182; + .loc 1 342 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:342:32 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4256, %r4842, %r343; + .loc 1 346 35 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:346:35 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4257, %r4256, %r8; + add.s32 %r4258, %r4256, %r9; + add.s32 %r4259, %r4256, %r10; + add.s32 %r4260, %r4256, %r11; + .loc 1 284 38 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:38 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r4261, %r4257, 7; + shl.b32 %r4262, %r4258, 7; + shl.b32 %r4263, %r4259, 7; + shl.b32 %r4264, %r4260, 7; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + mul.wide.s32 %rd267, %r4261, 2; + add.s64 %rd235, %rd10, %rd267; + mul.wide.s32 %rd268, %r4262, 2; + add.s64 %rd236, %rd10, %rd268; + mul.wide.s32 %rd269, %r4263, 2; + add.s64 %rd237, %rd10, %rd269; + mul.wide.s32 %rd270, %r4264, 2; + add.s64 %rd238, %rd10, %rd270; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + shl.b32 %r4265, %r4841, 14; + add.s32 %r4266, %r722, %r4265; + bar.sync 0; + add.s32 %r3966, %r4266, %r35; + selp.b32 %r3967, 16, 0, %p177; + // begin inline asm + cp.async.cg.shared.global [ %r3966 + 0 ], [ %rd235 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3968, %r3966, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r3968 + 0 ], [ %rd236 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3970, %r3966, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r3970 + 0 ], [ %rd237 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3972, %r3966, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r3972 + 0 ], [ %rd238 + 0 ], 0x10, %r3967; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:284:49 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s64 %rd239, %rd11, %rd267; + add.s64 %rd240, %rd11, %rd268; + add.s64 %rd241, %rd11, %rd269; + add.s64 %rd242, %rd11, %rd270; + .loc 1 294 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:294:23 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + add.s32 %r4267, %r4237, %r4265; + add.s32 %r3974, %r4267, %r35; + // begin inline asm + cp.async.cg.shared.global [ %r3974 + 0 ], [ %rd239 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3976, %r3974, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r3976 + 0 ], [ %rd240 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3978, %r3974, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r3978 + 0 ], [ %rd241 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3980, %r3974, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r3980 + 0 ], [ %rd242 + 0 ], 0x10, %r3967; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + setp.ne.b32 %p183, %r410, %r550; + mov.b32 %r4907, %r550; + @%p183 bra $L__BB0_5; +$L__BB0_6: // %._crit_edge379 + .loc 1 0 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:0:40 + cvt.u32.u64 %r4468, %rd3; + .loc 1 502 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:502:40 @[ cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:198:45 ] + // begin inline asm + // wait for regs: %r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp10: + .loc 1 206 26 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:206:26 + setp.eq.f32 %p192, %r4974, 0f00000000; + setp.eq.f32 %p193, %r4975, 0f00000000; + .loc 1 206 34 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:206:34 + selp.f32 %r4469, 0f3F800000, %r4974, %p192; + selp.f32 %r619, 0f3F800000, %r4975, %p193; + .loc 1 208 16 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:208:16 + div.full.f32 %r4470, %r1567, %r4469; + div.full.f32 %r4471, %r1568, %r4469; + div.full.f32 %r4472, %r1569, %r619; + div.full.f32 %r4473, %r1570, %r619; + div.full.f32 %r4474, %r1571, %r4469; + div.full.f32 %r4475, %r1572, %r4469; + div.full.f32 %r4476, %r1573, %r619; + div.full.f32 %r4477, %r1574, %r619; + div.full.f32 %r4478, %r1575, %r4469; + div.full.f32 %r4479, %r1576, %r4469; + div.full.f32 %r4480, %r1577, %r619; + div.full.f32 %r4481, %r1578, %r619; + div.full.f32 %r4482, %r1579, %r4469; + div.full.f32 %r4483, %r1580, %r4469; + div.full.f32 %r4484, %r1581, %r619; + div.full.f32 %r4485, %r1582, %r619; + div.full.f32 %r4486, %r1583, %r4469; + div.full.f32 %r4487, %r1584, %r4469; + div.full.f32 %r4488, %r1585, %r619; + div.full.f32 %r4489, %r1586, %r619; + div.full.f32 %r4490, %r1587, %r4469; + div.full.f32 %r4491, %r1588, %r4469; + div.full.f32 %r4492, %r1589, %r619; + div.full.f32 %r4493, %r1590, %r619; + div.full.f32 %r4494, %r1591, %r4469; + div.full.f32 %r4495, %r1592, %r4469; + div.full.f32 %r4496, %r1593, %r619; + div.full.f32 %r4497, %r1594, %r619; + div.full.f32 %r4498, %r1595, %r4469; + div.full.f32 %r4499, %r1596, %r4469; + div.full.f32 %r4500, %r1597, %r619; + div.full.f32 %r4501, %r1598, %r619; + div.full.f32 %r4502, %r1599, %r4469; + div.full.f32 %r4503, %r1600, %r4469; + div.full.f32 %r4504, %r1601, %r619; + div.full.f32 %r4505, %r1602, %r619; + div.full.f32 %r4506, %r1603, %r4469; + div.full.f32 %r4507, %r1604, %r4469; + div.full.f32 %r4508, %r1605, %r619; + div.full.f32 %r4509, %r1606, %r619; + div.full.f32 %r4510, %r1607, %r4469; + div.full.f32 %r4511, %r1608, %r4469; + div.full.f32 %r4512, %r1609, %r619; + div.full.f32 %r4513, %r1610, %r619; + div.full.f32 %r4514, %r1611, %r4469; + div.full.f32 %r4515, %r1612, %r4469; + div.full.f32 %r4516, %r1613, %r619; + div.full.f32 %r4517, %r1614, %r619; + div.full.f32 %r4518, %r1615, %r4469; + div.full.f32 %r4519, %r1616, %r4469; + div.full.f32 %r4520, %r1617, %r619; + div.full.f32 %r4521, %r1618, %r619; + div.full.f32 %r4522, %r1619, %r4469; + div.full.f32 %r4523, %r1620, %r4469; + div.full.f32 %r4524, %r1621, %r619; + div.full.f32 %r4525, %r1622, %r619; + div.full.f32 %r4526, %r1623, %r4469; + div.full.f32 %r4527, %r1624, %r4469; + div.full.f32 %r4528, %r1625, %r619; + div.full.f32 %r4529, %r1626, %r619; + div.full.f32 %r4530, %r1627, %r4469; + div.full.f32 %r4531, %r1628, %r4469; + div.full.f32 %r4532, %r1629, %r619; + div.full.f32 %r4533, %r1630, %r619; + .loc 1 214 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:214:20 + setp.lt.s32 %p184, %r12, 2048; + setp.lt.s32 %p185, %r13, 2048; + setp.lt.s32 %p186, %r14, 2048; + setp.lt.s32 %p187, %r15, 2048; + setp.lt.s32 %p188, %r16, 2048; + setp.lt.s32 %p189, %r17, 2048; + setp.lt.s32 %p190, %r18, 2048; + setp.lt.s32 %p191, %r19, 2048; + .loc 1 218 49 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:218:49 + or.b32 %r4534, %r4468, %r4; + .loc 1 218 62 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:218:62 + or.b32 %r4535, %r4534, %r3; + .loc 1 218 75 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:218:75 + add.s32 %r4536, %r4535, %r20; + add.s32 %r4537, %r4535, %r21; + add.s32 %r4538, %r4535, %r22; + add.s32 %r4539, %r4535, %r23; + add.s32 %r4540, %r4535, %r24; + add.s32 %r4541, %r4535, %r25; + add.s32 %r4542, %r4535, %r26; + add.s32 %r4543, %r4535, %r27; + .loc 1 218 25 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:218:25 + mad.wide.s32 %rd271, %r4536, 2, %rd30; + mad.wide.s32 %rd272, %r4537, 2, %rd30; + mad.wide.s32 %rd273, %r4538, 2, %rd30; + mad.wide.s32 %rd274, %r4539, 2, %rd30; + mad.wide.s32 %rd275, %r4540, 2, %rd30; + mad.wide.s32 %rd276, %r4541, 2, %rd30; + mad.wide.s32 %rd277, %r4542, 2, %rd30; + mad.wide.s32 %rd278, %r4543, 2, %rd30; + .loc 1 218 109 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:218:109 + cvt.rn.bf16x2.f32 %r4544, %r4471, %r4470; + cvt.rn.bf16x2.f32 %r4545, %r4473, %r4472; + cvt.rn.bf16x2.f32 %r4546, %r4475, %r4474; + cvt.rn.bf16x2.f32 %r4547, %r4477, %r4476; + cvt.rn.bf16x2.f32 %r4548, %r4479, %r4478; + cvt.rn.bf16x2.f32 %r4549, %r4481, %r4480; + cvt.rn.bf16x2.f32 %r4550, %r4483, %r4482; + cvt.rn.bf16x2.f32 %r4551, %r4485, %r4484; + cvt.rn.bf16x2.f32 %r4552, %r4487, %r4486; + cvt.rn.bf16x2.f32 %r4553, %r4489, %r4488; + cvt.rn.bf16x2.f32 %r4554, %r4491, %r4490; + cvt.rn.bf16x2.f32 %r4555, %r4493, %r4492; + cvt.rn.bf16x2.f32 %r4556, %r4495, %r4494; + cvt.rn.bf16x2.f32 %r4557, %r4497, %r4496; + cvt.rn.bf16x2.f32 %r4558, %r4499, %r4498; + cvt.rn.bf16x2.f32 %r4559, %r4501, %r4500; + cvt.rn.bf16x2.f32 %r4560, %r4503, %r4502; + cvt.rn.bf16x2.f32 %r4561, %r4505, %r4504; + cvt.rn.bf16x2.f32 %r4562, %r4507, %r4506; + cvt.rn.bf16x2.f32 %r4563, %r4509, %r4508; + cvt.rn.bf16x2.f32 %r4564, %r4511, %r4510; + cvt.rn.bf16x2.f32 %r4565, %r4513, %r4512; + cvt.rn.bf16x2.f32 %r4566, %r4515, %r4514; + cvt.rn.bf16x2.f32 %r4567, %r4517, %r4516; + cvt.rn.bf16x2.f32 %r4568, %r4519, %r4518; + cvt.rn.bf16x2.f32 %r4569, %r4521, %r4520; + cvt.rn.bf16x2.f32 %r4570, %r4523, %r4522; + cvt.rn.bf16x2.f32 %r4571, %r4525, %r4524; + cvt.rn.bf16x2.f32 %r4572, %r4527, %r4526; + cvt.rn.bf16x2.f32 %r4573, %r4529, %r4528; + cvt.rn.bf16x2.f32 %r4574, %r4531, %r4530; + cvt.rn.bf16x2.f32 %r4575, %r4533, %r4532; + shl.b32 %r4576, %r34, 13; + shl.b32 %r4577, %r6, 5; + and.b32 %r4578, %r4577, 7264; + and.b32 %r4579, %r6, 24; + shl.b32 %r4580, %r4579, 4; + shl.b32 %r4581, %r6, 2; + and.b32 %r4582, %r4581, 16; + or.b32 %r4583, %r4576, %r4582; + or.b32 %r4584, %r4578, %r4580; + or.b32 %r4585, %r4583, %r4584; + add.s32 %r4587, %r722, %r4585; + st.shared.v4.b32 [%r4587], {%r4544, %r4546, %r4548, %r4550}; + st.shared.v4.b32 [%r4587+512], {%r4545, %r4547, %r4549, %r4551}; + xor.b32 %r4588, %r4585, 32; + add.s32 %r4589, %r722, %r4588; + st.shared.v4.b32 [%r4589], {%r4552, %r4554, %r4556, %r4558}; + st.shared.v4.b32 [%r4589+512], {%r4553, %r4555, %r4557, %r4559}; + xor.b32 %r4590, %r4585, 64; + add.s32 %r4591, %r722, %r4590; + st.shared.v4.b32 [%r4591], {%r4560, %r4562, %r4564, %r4566}; + st.shared.v4.b32 [%r4591+512], {%r4561, %r4563, %r4565, %r4567}; + xor.b32 %r4592, %r4585, 96; + add.s32 %r4593, %r722, %r4592; + st.shared.v4.b32 [%r4593], {%r4568, %r4570, %r4572, %r4574}; + st.shared.v4.b32 [%r4593+512], {%r4569, %r4571, %r4573, %r4575}; + bar.sync 0; + shl.b32 %r4594, %r4579, 10; + shl.b32 %r4595, %r34, 5; + and.b32 %r620, %r6, 252; + shl.b32 %r4596, %r620, 2; + or.b32 %r4597, %r4594, %r4595; + xor.b32 %r4598, %r4597, %r4596; + add.s32 %r4400, %r722, %r4598; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4396, %r4397, %r4398, %r4399}, [%r4400]; + // end inline asm + add.s32 %r4405, %r4400, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4401, %r4402, %r4403, %r4404}, [%r4405]; + // end inline asm + add.s32 %r4410, %r4400, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4406, %r4407, %r4408, %r4409}, [%r4410]; + // end inline asm + add.s32 %r4415, %r4400, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4411, %r4412, %r4413, %r4414}, [%r4415]; + // end inline asm + add.s32 %r4420, %r4400, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4416, %r4417, %r4418, %r4419}, [%r4420]; + // end inline asm + add.s32 %r4425, %r4400, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4421, %r4422, %r4423, %r4424}, [%r4425]; + // end inline asm + add.s32 %r4430, %r4400, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4426, %r4427, %r4428, %r4429}, [%r4430]; + // end inline asm + add.s32 %r4435, %r4400, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4431, %r4432, %r4433, %r4434}, [%r4435]; + // end inline asm + // begin inline asm + @%p184 st.global.v4.b32 [ %rd271 + 0 ], { %r4396, %r4397, %r4398, %r4399 }; + // end inline asm + // begin inline asm + @%p185 st.global.v4.b32 [ %rd272 + 0 ], { %r4401, %r4402, %r4403, %r4404 }; + // end inline asm + // begin inline asm + @%p186 st.global.v4.b32 [ %rd273 + 0 ], { %r4406, %r4407, %r4408, %r4409 }; + // end inline asm + // begin inline asm + @%p187 st.global.v4.b32 [ %rd274 + 0 ], { %r4411, %r4412, %r4413, %r4414 }; + // end inline asm + // begin inline asm + @%p188 st.global.v4.b32 [ %rd275 + 0 ], { %r4416, %r4417, %r4418, %r4419 }; + // end inline asm + // begin inline asm + @%p189 st.global.v4.b32 [ %rd276 + 0 ], { %r4421, %r4422, %r4423, %r4424 }; + // end inline asm + // begin inline asm + @%p190 st.global.v4.b32 [ %rd277 + 0 ], { %r4426, %r4427, %r4428, %r4429 }; + // end inline asm + // begin inline asm + @%p191 st.global.v4.b32 [ %rd278 + 0 ], { %r4431, %r4432, %r4433, %r4434 }; + // end inline asm + .loc 1 223 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:223:33 + setp.lt.f32 %p194, %r4469, 0f00800000; + mul.f32 %r4599, %r4469, 0f4B000000; + selp.f32 %r621, %r4599, %r4469, %p194; + selp.f32 %r4600, 0fC1B80000, 0f00000000, %p194; + add.s32 %r4601, %r621, -1060439283; + and.b32 %r4602, %r4601, -8388608; + sub.s32 %r4603, %r621, %r4602; + cvt.rn.f32.s32 %r4604, %r4602; + mov.b32 %r4605, 0f34000000; + fma.rn.ftz.f32 %r4606, %r4604, %r4605, %r4600; + add.f32 %r4607, %r4603, 0fBF800000; + mov.b32 %r4608, 0fBE2C7F30; + mov.b32 %r4609, 0f3DC6B27F; + fma.rn.ftz.f32 %r4610, %r4609, %r4607, %r4608; + mov.b32 %r4611, 0f3E2FCF2A; + fma.rn.ftz.f32 %r4612, %r4610, %r4607, %r4611; + mov.b32 %r4613, 0fBE374E43; + fma.rn.ftz.f32 %r4614, %r4612, %r4607, %r4613; + mov.b32 %r4615, 0f3E520BF4; + fma.rn.ftz.f32 %r4616, %r4614, %r4607, %r4615; + mov.b32 %r4617, 0fBE763C8B; + fma.rn.ftz.f32 %r4618, %r4616, %r4607, %r4617; + mov.b32 %r4619, 0f3E93BF99; + fma.rn.ftz.f32 %r4620, %r4618, %r4607, %r4619; + mov.b32 %r4621, 0fBEB8AA49; + fma.rn.ftz.f32 %r4622, %r4620, %r4607, %r4621; + mov.b32 %r4623, 0f3EF6384A; + fma.rn.ftz.f32 %r4624, %r4622, %r4607, %r4623; + mov.b32 %r4625, 0fBF38AA3B; + fma.rn.ftz.f32 %r4626, %r4624, %r4607, %r4625; + mul.f32 %r4627, %r4607, %r4626; + mul.f32 %r4628, %r4607, %r4627; + mov.b32 %r4629, 0f3FB8AA3B; + fma.rn.ftz.f32 %r4630, %r4607, %r4629, %r4628; + add.f32 %r4976, %r4606, %r4630; + setp.lt.u32 %p195, %r621, 2139095040; + mov.b32 %r4631, 0f7F800000; + @%p195 bra $L__BB0_8; +// %bb.7: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:0:33 + fma.rn.ftz.f32 %r4976, %r621, %r4631, %r4631; +$L__BB0_8: // %__nv_log2f.exit + ld.param.b64 %rd27, [triton_tem_fused_0_param_3]; + .loc 1 223 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:223:33 + setp.lt.f32 %p196, %r619, 0f00800000; + mul.f32 %r4632, %r619, 0f4B000000; + selp.f32 %r625, %r4632, %r619, %p196; + selp.f32 %r4633, 0fC1B80000, 0f00000000, %p196; + add.s32 %r4634, %r625, -1060439283; + and.b32 %r4635, %r4634, -8388608; + sub.s32 %r4636, %r625, %r4635; + cvt.rn.f32.s32 %r4637, %r4635; + fma.rn.ftz.f32 %r4639, %r4637, %r4605, %r4633; + add.f32 %r4640, %r4636, 0fBF800000; + fma.rn.ftz.f32 %r4643, %r4609, %r4640, %r4608; + fma.rn.ftz.f32 %r4645, %r4643, %r4640, %r4611; + fma.rn.ftz.f32 %r4647, %r4645, %r4640, %r4613; + fma.rn.ftz.f32 %r4649, %r4647, %r4640, %r4615; + fma.rn.ftz.f32 %r4651, %r4649, %r4640, %r4617; + fma.rn.ftz.f32 %r4653, %r4651, %r4640, %r4619; + fma.rn.ftz.f32 %r4655, %r4653, %r4640, %r4621; + fma.rn.ftz.f32 %r4657, %r4655, %r4640, %r4623; + fma.rn.ftz.f32 %r4659, %r4657, %r4640, %r4625; + mul.f32 %r4660, %r4640, %r4659; + mul.f32 %r4661, %r4640, %r4660; + fma.rn.ftz.f32 %r4663, %r4640, %r4629, %r4661; + add.f32 %r4977, %r4639, %r4663; + setp.lt.u32 %p197, %r625, 2139095040; + @%p197 bra $L__BB0_10; +// %bb.9: // %__nv_fmaf_rn.exit.i.i320 + .loc 1 0 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:0:33 + fma.rn.ftz.f32 %r4977, %r625, %r4631, %r4631; +$L__BB0_10: // %__nv_log2f.exit323 + .loc 1 223 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:223:33 + setp.eq.f32 %p199, %r625, 0f00000000; + setp.eq.f32 %p200, %r621, 0f00000000; + .loc 1 222 32 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:222:32 + shl.b32 %r4666, %r1, 16; + shl.b32 %r4667, %r2, 11; + add.s32 %r4668, %r4666, %r4667; + .loc 1 222 23 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:222:23 + mad.wide.s32 %rd280, %r4668, 4, %rd27; + .loc 1 144 46 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:144:46 + and.b32 %r4669, %r6, 127; + .loc 1 144 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:144:33 + or.b32 %r4670, %r5, %r4669; + .loc 1 222 40 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:222:40 + mad.wide.s32 %rd279, %r4670, 4, %rd280; + .loc 1 223 33 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:223:33 + selp.f32 %r4671, 0fFF800000, %r4976, %p200; + selp.f32 %r4672, 0fFF800000, %r4977, %p199; + .loc 1 223 20 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:223:20 + mov.b64 {%r4673, %r4674}, %rd284; + add.f32 %r4675, %r4674, %r4672; + add.f32 %r4676, %r4673, %r4671; + .loc 1 225 29 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:225:29 + bar.sync 0; + shl.b32 %r4677, %r620, 1; + add.s32 %r4679, %r722, %r4677; + st.shared.v2.b32 [%r4679], {%r4676, %r4675}; + bar.sync 0; + shl.b32 %r4680, %r28, 3; + shl.b32 %r4681, %r29, 2; + shr.u32 %r4682, %r30, 1; + add.s32 %r4683, %r722, %r4680; + add.s32 %r4684, %r4683, %r4681; + add.s32 %r4685, %r4684, %r4682; + ld.shared.b32 %r4665, [%r4685]; + and.b32 %r4686, %r6, 128; + setp.eq.b32 %p198, %r4686, 0; + // begin inline asm + @%p198 st.global.b32 [ %rd279 + 0 ], { %r4665 }; + // end inline asm + .loc 1 229 4 // cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py:229:4 + ret; +$L__tmp11: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 251 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xf4 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 50 +.b8 117 +.b8 50 +.b8 108 +.b8 97 +.b8 97 +.b8 119 +.b8 117 +.b8 98 +.b8 118 +.b8 101 +.b8 102 +.b8 55 +.b8 116 +.b8 53 +.b8 114 +.b8 116 +.b8 118 +.b8 122 +.b8 97 +.b8 120 +.b8 54 +.b8 122 +.b8 101 +.b8 98 +.b8 111 +.b8 114 +.b8 100 +.b8 108 +.b8 113 +.b8 108 +.b8 106 +.b8 117 +.b8 121 +.b8 51 +.b8 100 +.b8 104 +.b8 53 +.b8 121 +.b8 97 +.b8 119 +.b8 121 +.b8 122 +.b8 117 +.b8 108 +.b8 108 +.b8 105 +.b8 114 +.b8 112 +.b8 97 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 106 +.b8 50 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x15 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa0:0x5e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 146 // DW_AT_call_line +.b8 101 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xcd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 172 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xe5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 198 // DW_AT_call_line +.b8 45 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.source new file mode 100644 index 0000000000000000000000000000000000000000..b766b55fc01a54fec435e89591ec7dce374db051 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.source @@ -0,0 +1,1132 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":18:0) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":271:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":448:0) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":299:0) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":256:0) +#loc220 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc222 = loc(unknown) +#loc225 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc229 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":240:0) +#loc258 = loc("arg_Q"(#loc)) +#loc259 = loc("arg_K"(#loc)) +#loc260 = loc("arg_V"(#loc)) +#loc261 = loc("arg_LSE"(#loc)) +#loc262 = loc("arg_MAX"(#loc)) +#loc263 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc264 = loc("arg_KV_IDX"(#loc)) +#loc265 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc266 = loc("arg_FULL_KV_IDX"(#loc)) +#loc267 = loc("in_ptr9"(#loc)) +#loc268 = loc("out_ptr0"(#loc)) +#loc367 = loc("ptr"(#loc122)) +#loc368 = loc("offs_m"(#loc122)) +#loc369 = loc("offs_n"(#loc122)) +#loc370 = loc("stride_m"(#loc122)) +#loc371 = loc("stride_n"(#loc122)) +#loc372 = loc("M_LEN"(#loc122)) +#loc379 = loc("x"(#loc132)) +#loc380 = loc("arg_Q"(#loc138)) +#loc381 = loc("arg_K"(#loc138)) +#loc382 = loc("arg_V"(#loc138)) +#loc383 = loc("arg_LSE"(#loc138)) +#loc384 = loc("arg_MAX"(#loc138)) +#loc385 = loc("arg_KV_NUM_BLKS"(#loc138)) +#loc386 = loc("arg_KV_IDX"(#loc138)) +#loc387 = loc("arg_FULL_KV_NUM_BLKS"(#loc138)) +#loc388 = loc("arg_FULL_KV_IDX"(#loc138)) +#loc389 = loc("in_ptr9"(#loc138)) +#loc390 = loc("out_ptr0"(#loc138)) +#loc391 = loc("q"(#loc138)) +#loc392 = loc("K"(#loc138)) +#loc393 = loc("V"(#loc138)) +#loc394 = loc("Q_LEN"(#loc138)) +#loc395 = loc("KV_LEN"(#loc138)) +#loc396 = loc("acc"(#loc138)) +#loc397 = loc("l_i"(#loc138)) +#loc398 = loc("m_i"(#loc138)) +#loc399 = loc("off_z"(#loc138)) +#loc400 = loc("off_h"(#loc138)) +#loc401 = loc("offs_m"(#loc138)) +#loc402 = loc("offs_n"(#loc138)) +#loc403 = loc("kv_start"(#loc138)) +#loc404 = loc("kv_indices"(#loc138)) +#loc405 = loc("kv_num_blocks"(#loc138)) +#loc406 = loc("block_n_end"(#loc138)) +#loc407 = loc("stride_kk"(#loc138)) +#loc408 = loc("stride_kn"(#loc138)) +#loc409 = loc("stride_vn"(#loc138)) +#loc410 = loc("stride_vk"(#loc138)) +#loc416 = loc("arg_Q"(#loc148)) +#loc417 = loc("arg_K"(#loc148)) +#loc418 = loc("arg_V"(#loc148)) +#loc419 = loc("arg_LSE"(#loc148)) +#loc420 = loc("arg_MAX"(#loc148)) +#loc421 = loc("arg_KV_NUM_BLKS"(#loc148)) +#loc422 = loc("arg_KV_IDX"(#loc148)) +#loc423 = loc("arg_FULL_KV_NUM_BLKS"(#loc148)) +#loc424 = loc("arg_FULL_KV_IDX"(#loc148)) +#loc425 = loc("in_ptr9"(#loc148)) +#loc426 = loc("out_ptr0"(#loc148)) +#loc427 = loc("q"(#loc148)) +#loc428 = loc("K"(#loc148)) +#loc429 = loc("V"(#loc148)) +#loc430 = loc("Q_LEN"(#loc148)) +#loc431 = loc("KV_LEN"(#loc148)) +#loc432 = loc("acc"(#loc148)) +#loc433 = loc("l_i"(#loc148)) +#loc434 = loc("m_i"(#loc148)) +#loc435 = loc("off_z"(#loc148)) +#loc436 = loc("off_h"(#loc148)) +#loc437 = loc("offs_m"(#loc148)) +#loc438 = loc("offs_n"(#loc148)) +#loc439 = loc("kv_start"(#loc148)) +#loc440 = loc("kv_offset"(#loc148)) +#loc441 = loc("stride_kk"(#loc148)) +#loc442 = loc("stride_kn"(#loc148)) +#loc443 = loc("stride_vn"(#loc148)) +#loc444 = loc("stride_vk"(#loc148)) +#loc511 = loc("indices"(#loc217)) +#loc512 = loc("input"(#loc220)) +#loc513 = loc("a"(#loc225)) +#loc514 = loc("b"(#loc225)) +#loc515 = loc("input"(#loc229)) +#loc516 = loc("a"(#loc233)) +#loc517 = loc("b"(#loc233)) +#loc518 = loc("loop_iter"(#loc237)) +#loc519 = loc("col_indices"(#loc237)) +#loc520 = loc("total_blocks"(#loc237)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc2) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc2) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc2) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc2) + %c2097152_i32_2 = arith.constant 2097152 : i32 loc(#loc3) + %c262144_i32_3 = arith.constant 262144 : i32 loc(#loc3) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc3) + %c1_i32_5 = arith.constant 1 : i32 loc(#loc3) + %ZQ = arith.constant 2 : i32 loc(#loc269) + %HQ = arith.constant 32 : i32 loc(#loc270) + %Q_LEN = arith.constant 2048 : i32 loc(#loc271) + %ZKV = arith.constant 2 : i32 loc(#loc272) + %KV_LEN = arith.constant 2048 : i32 loc(#loc273) + %q_start = tt.get_program_id x : i32 loc(#loc274) + %off_zq = tt.get_program_id y : i32 loc(#loc275) + %off_hq = tt.get_program_id z : i32 loc(#loc276) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc277) + %off_hkv = arith.constant 4 : i32 loc(#loc278) + %off_hkv_6 = arith.constant 4 : i32 loc(#loc278) + %off_hkv_7 = arith.divsi %off_hq, %off_hkv_6 : i32 loc(#loc278) + %off_g = arith.constant 4 : i32 loc(#loc279) + %off_g_8 = arith.constant 4 : i32 loc(#loc279) + %off_g_9 = arith.remsi %off_hq, %off_g_8 : i32 loc(#loc279) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc280) + %q_offset_10 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc281) + %q_offset_11 = arith.addi %q_offset, %q_offset_10 : i32 loc(#loc282) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc283) + %k_offset_12 = arith.muli %off_hkv_7, %c262144_i32 : i32 loc(#loc284) + %k_offset_13 = arith.addi %k_offset, %k_offset_12 : i32 loc(#loc285) + %v_offset = arith.muli %off_zkv, %c2097152_i32_2 : i32 loc(#loc286) + %v_offset_14 = arith.muli %off_hkv_7, %c262144_i32_3 : i32 loc(#loc287) + %v_offset_15 = arith.addi %v_offset, %v_offset_14 : i32 loc(#loc288) + %Q = tt.addptr %arg_Q, %q_offset_11 : !tt.ptr, i32 loc(#loc289) + %K = tt.addptr %arg_K, %k_offset_13 : !tt.ptr, i32 loc(#loc290) + %V = tt.addptr %arg_V, %v_offset_15 : !tt.ptr, i32 loc(#loc291) + %SPARSE_Z = arith.constant 2 : i32 loc(#loc292) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc293) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc294) + %sparse_idx_hq = arith.remsi %off_hq, %SPARSE_HQ : i32 loc(#loc295) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc296) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc297) + %stride_kv_idx_m = arith.constant 16 : i32 loc(#loc298) + %m_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc299) + %m_i_16 = arith.constant 0x7F800000 : f32 loc(#loc300) + %m_i_17 = arith.constant 0x7F800000 : f32 loc(#loc300) + %m_i_18 = arith.constant dense<0x7F800000> : tensor<128xf32> loc(#loc300) + %m_i_19 = arith.subf %m_i, %m_i_18 : tensor<128xf32> loc(#loc300) + %l_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc301) + %acc = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc302) + %offs_m = arith.constant 128 : i32 loc(#loc303) + %offs_m_20 = arith.constant 128 : i32 loc(#loc303) + %offs_m_21 = arith.muli %q_start, %offs_m_20 : i32 loc(#loc303) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc304) + %offs_m_23 = tt.splat %offs_m_21 : i32 -> tensor<128xi32> loc(#loc305) + %offs_m_24 = arith.addi %offs_m_23, %offs_m_22 : tensor<128xi32> loc(#loc305) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc306) + %sparse_hz_offset_25 = arith.addi %sparse_hz_offset, %sparse_idx_hq : i32 loc(#loc307) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_25, %stride_kv_num_blks_h : i32 loc(#loc308) + %sparse_kv_num_blks_offset_26 = arith.constant 1 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_27 = arith.constant 1 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_28 = arith.divsi %q_start, %sparse_kv_num_blks_offset_27 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_29 = arith.addi %sparse_kv_num_blks_offset, %sparse_kv_num_blks_offset_28 : i32 loc(#loc310) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_25, %stride_kv_idx_h : i32 loc(#loc311) + %sparse_kv_idx_offset_30 = arith.constant 1 : i32 loc(#loc312) + %sparse_kv_idx_offset_31 = arith.constant 1 : i32 loc(#loc312) + %sparse_kv_idx_offset_32 = arith.divsi %q_start, %sparse_kv_idx_offset_31 : i32 loc(#loc312) + %sparse_kv_idx_offset_33 = arith.muli %sparse_kv_idx_offset_32, %stride_kv_idx_m : i32 loc(#loc313) + %sparse_kv_idx_offset_34 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_33 : i32 loc(#loc314) + %offs_m_35 = arith.constant 128 : i32 loc(#loc315) + %offs_m_36 = arith.constant 128 : i32 loc(#loc315) + %offs_m_37 = arith.muli %q_start, %offs_m_36 : i32 loc(#loc315) + %offs_m_38 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc316) + %offs_m_39 = tt.splat %offs_m_37 : i32 -> tensor<128xi32> loc(#loc317) + %offs_m_40 = arith.addi %offs_m_39, %offs_m_38 : tensor<128xi32> loc(#loc317) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc318) + %q = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q, %offs_m_40, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc319) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc320) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc321) + %kv_start_41 = arith.constant 128 : i32 loc(#loc322) + %kv_start_42 = arith.constant 128 : i32 loc(#loc322) + %kv_start_43 = arith.muli %kv_start, %kv_start_42 : i32 loc(#loc322) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_29 : !tt.ptr, i32 loc(#loc323) + %kv_num_blocks_44 = tt.load %kv_num_blocks : !tt.ptr loc(#loc324) + %block_n_end = arith.constant 2 : i32 loc(#loc325) + %block_n_end_45 = arith.constant 2 : i32 loc(#loc325) + %block_n_end_46 = arith.muli %kv_num_blocks_44, %block_n_end_45 : i32 loc(#loc325) + %block_n_end_47 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc326) + %block_n_end_48 = arith.constant 1 : i32 loc(#loc327) + %block_n_end_49 = arith.maxsi %block_n_end_47, %block_n_end_48 : i32 loc(#loc327) + %block_n_end_50 = arith.minsi %block_n_end_46, %block_n_end_49 : i32 loc(#loc328) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc329) + %offs_n_51 = tt.splat %kv_start_43 : i32 -> tensor<64xi32> loc(#loc330) + %offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32> loc(#loc330) + %0 = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc66) + %1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %2:3 = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc, %l_i, %m_i_19, %off_zq, %off_hq, %0, %1, %kv_start_43, %kv_indices, %kv_num_blocks_44, %block_n_end_50, %c1_i32_1, %c128_i32_0, %c128_i32_4, %c1_i32_5) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc68) + %kv_indices_53 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc331) + %kv_start_54 = tt.load %kv_indices_53 : !tt.ptr loc(#loc332) + %kv_start_55 = arith.constant 128 : i32 loc(#loc333) + %kv_start_56 = arith.constant 128 : i32 loc(#loc333) + %kv_start_57 = arith.muli %kv_start_54, %kv_start_56 : i32 loc(#loc333) + %kv_num_blocks_58 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_29 : !tt.ptr, i32 loc(#loc334) + %kv_num_blocks_59 = tt.load %kv_num_blocks_58 : !tt.ptr loc(#loc335) + %block_n_end_60 = arith.constant 2 : i32 loc(#loc336) + %block_n_end_61 = arith.constant 2 : i32 loc(#loc336) + %block_n_end_62 = arith.muli %kv_num_blocks_59, %block_n_end_61 : i32 loc(#loc336) + %block_n_end_63 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc337) + %block_n_end_64 = arith.constant 1 : i32 loc(#loc338) + %block_n_end_65 = arith.maxsi %block_n_end_63, %block_n_end_64 : i32 loc(#loc338) + %block_n_end_66 = arith.minsi %block_n_end_62, %block_n_end_65 : i32 loc(#loc339) + %offs_n_67 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc340) + %offs_n_68 = tt.splat %kv_start_57 : i32 -> tensor<64xi32> loc(#loc341) + %offs_n_69 = arith.addi %offs_n_68, %offs_n_67 : tensor<64xi32> loc(#loc341) + %3 = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc80) + %4 = tt.expand_dims %offs_n_69 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc81) + %5:3 = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %2#0, %2#1, %2#2, %off_zq, %off_hq, %3, %4, %kv_start_57, %kv_indices_53, %kv_num_blocks_59, %block_n_end_66, %c1_i32_1, %c128_i32_0, %c128_i32_4, %c1_i32_5) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc82) + %l_i_70 = arith.constant 0.000000e+00 : f32 loc(#loc342) + %l_i_71 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc342) + %l_i_72 = arith.cmpf oeq, %5#1, %l_i_71 : tensor<128xf32> loc(#loc342) + %l_i_73 = arith.constant 1 : i32 loc(#loc343) + %l_i_74 = arith.constant 1.000000e+00 : f32 loc(#loc343) + %l_i_75 = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc343) + %l_i_76 = arith.select %l_i_72, %l_i_75, %5#1 : tensor<128xi1>, tensor<128xf32> loc(#loc343) + %acc_77 = tt.expand_dims %l_i_76 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc344) + %acc_78 = tt.broadcast %acc_77 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc345) + %acc_79 = arith.divf %5#0, %acc_78 : tensor<128x128xf32> loc(#loc345) + %idx_zq = tt.get_program_id y : i32 loc(#loc346) + %idx_hq = tt.get_program_id z : i32 loc(#loc347) + %idx_m = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc348) + %idx_d = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc349) + %idx_d_80 = tt.expand_dims %idx_d {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc350) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc351) + %mask_81 = arith.cmpi slt, %idx_m, %mask : tensor<128x1xi32> loc(#loc351) + %mask_82 = arith.constant 128 : i32 loc(#loc352) + %mask_83 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc352) + %mask_84 = arith.cmpi slt, %idx_d_80, %mask_83 : tensor<1x128xi32> loc(#loc352) + %mask_85 = tt.broadcast %mask_81 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc353) + %mask_86 = tt.broadcast %mask_84 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc353) + %mask_87 = arith.andi %mask_85, %mask_86 : tensor<128x128xi1> loc(#loc353) + %xindex = arith.constant 128 : i32 loc(#loc354) + %xindex_88 = arith.constant 128 : i32 loc(#loc354) + %xindex_89 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc354) + %xindex_90 = arith.muli %xindex_89, %idx_m : tensor<128x1xi32> loc(#loc354) + %xindex_91 = tt.broadcast %idx_d_80 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc355) + %xindex_92 = tt.broadcast %xindex_90 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc355) + %xindex_93 = arith.addi %xindex_91, %xindex_92 : tensor<128x128xi32> loc(#loc355) + %xindex_94 = arith.constant 262144 : i32 loc(#loc356) + %xindex_95 = arith.constant 262144 : i32 loc(#loc356) + %xindex_96 = arith.muli %xindex_95, %idx_hq : i32 loc(#loc356) + %xindex_97 = tt.splat %xindex_96 : i32 -> tensor<128x128xi32> loc(#loc357) + %xindex_98 = arith.addi %xindex_93, %xindex_97 : tensor<128x128xi32> loc(#loc357) + %xindex_99 = arith.constant 8388608 : i32 loc(#loc358) + %xindex_100 = arith.constant 8388608 : i32 loc(#loc358) + %xindex_101 = arith.muli %xindex_100, %idx_zq : i32 loc(#loc358) + %xindex_102 = tt.splat %xindex_101 : i32 -> tensor<128x128xi32> loc(#loc359) + %xindex_103 = arith.addi %xindex_98, %xindex_102 : tensor<128x128xi32> loc(#loc359) + %c128_i32_104 = arith.constant 128 : i32 loc(#loc101) + %c128_i32_105 = arith.constant 128 : i32 loc(#loc101) + %6 = arith.muli %c128_i32_105, %idx_hq : i32 loc(#loc101) + %7 = tt.splat %6 : i32 -> tensor<1x128xi32> loc(#loc102) + %8 = arith.addi %idx_d_80, %7 : tensor<1x128xi32> loc(#loc102) + %c4096_i32_106 = arith.constant 4096 : i32 loc(#loc103) + %c4096_i32_107 = arith.constant 4096 : i32 loc(#loc103) + %cst = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc103) + %9 = arith.muli %cst, %idx_m : tensor<128x1xi32> loc(#loc103) + %10 = tt.broadcast %8 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc104) + %11 = tt.broadcast %9 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc104) + %12 = arith.addi %10, %11 : tensor<128x128xi32> loc(#loc104) + %c8388608_i32_108 = arith.constant 8388608 : i32 loc(#loc105) + %c8388608_i32_109 = arith.constant 8388608 : i32 loc(#loc105) + %13 = arith.muli %c8388608_i32_109, %idx_zq : i32 loc(#loc105) + %14 = tt.splat %13 : i32 -> tensor<128x128xi32> loc(#loc106) + %15 = arith.addi %12, %14 : tensor<128x128xi32> loc(#loc106) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc107) + %17 = tt.addptr %16, %15 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc107) + %18 = arith.truncf %acc_79 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc108) + tt.store %17, %18, %mask_87 : tensor<128x128x!tt.ptr> loc(#loc108) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc360) + %off_hz_110 = arith.addi %off_hz, %off_hq : i32 loc(#loc361) + %l_ptrs = arith.muli %off_hz_110, %Q_LEN : i32 loc(#loc362) + %l_ptrs_111 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc363) + %l_ptrs_112 = tt.splat %l_ptrs_111 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc364) + %l_ptrs_113 = tt.addptr %l_ptrs_112, %offs_m_40 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc364) + %lse = math.log2 %l_i_76 : tensor<128xf32> loc(#loc365) + %lse_114 = arith.addf %5#2, %lse : tensor<128xf32> loc(#loc366) + tt.store %l_ptrs_113, %lse_114 : tensor<128x!tt.ptr> loc(#loc116) + tt.return loc(#loc117) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc119) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc119) + tt.return %cst_0 : tensor<128xf32> loc(#loc120) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128xf32> loc(#loc121) + tt.return %0 : tensor<128xf32> loc(#loc121) + } loc(#loc118) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc119) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc119) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc120) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc121) + tt.return %0 : tensor<128x128xf32> loc(#loc121) + } loc(#loc118) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc122)), %offs_m: tensor<128xi32> loc("offs_m"(#loc122)), %offs_n: tensor<128xi32> loc("offs_n"(#loc122)), %stride_m: i32 loc("stride_m"(#loc122)), %stride_n: i32 loc("stride_n"(#loc122)), %M_LEN: i32 loc("M_LEN"(#loc122))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc373) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc374) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc374) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc375) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc375) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc376) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc377) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc377) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc378) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc378) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc378) + %0 = tt.load %ptr_10 : tensor<128x128x!tt.ptr> loc(#loc129) + tt.return %0 : tensor<128x128xbf16> loc(#loc130) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x128xbf16> loc(#loc131) + tt.return %1 : tensor<128x128xbf16> loc(#loc131) + } loc(#loc122) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc132))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc133) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc133) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc133) + %c1_i32 = arith.constant 1 : i32 loc(#loc134) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc134) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc134) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc135) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc135) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc135) + tt.return %2 : i32 loc(#loc136) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc137) + tt.return %3 : i32 loc(#loc137) + } loc(#loc132) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc138)), %arg_K: !tt.ptr loc("arg_K"(#loc138)), %arg_V: !tt.ptr loc("arg_V"(#loc138)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc138)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc138)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc138)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc138)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc138)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc138)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc138)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc138)), %q: tensor<128x128xbf16> loc("q"(#loc138)), %K: !tt.ptr loc("K"(#loc138)), %V: !tt.ptr loc("V"(#loc138)), %Q_LEN: i32 loc("Q_LEN"(#loc138)), %KV_LEN: i32 loc("KV_LEN"(#loc138)), %acc: tensor<128x128xf32> loc("acc"(#loc138)), %l_i: tensor<128xf32> loc("l_i"(#loc138)), %m_i: tensor<128xf32> loc("m_i"(#loc138)), %off_z: i32 loc("off_z"(#loc138)), %off_h: i32 loc("off_h"(#loc138)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc138)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc138)), %kv_start: i32 loc("kv_start"(#loc138)), %kv_indices: !tt.ptr loc("kv_indices"(#loc138)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc138)), %block_n_end: i32 loc("block_n_end"(#loc138)), %stride_kk: i32 loc("stride_kk"(#loc138)), %stride_kn: i32 loc("stride_kn"(#loc138)), %stride_vn: i32 loc("stride_vn"(#loc138)), %stride_vk: i32 loc("stride_vk"(#loc138))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc411) + %c0_i32 = arith.constant 0 : i32 loc(#loc140) + %c1_i32 = arith.constant 1 : i32 loc(#loc140) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc140) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc140) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc140) + %3 = ub.poison : i32 loc(#loc140) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_False__(34,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc141) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc413) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc414) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc414) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc415) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc145) + } loc(#loc542) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc146) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc147) + %5 = ub.poison : tensor<128xf32> loc(#loc147) + %6 = ub.poison : tensor<128xf32> loc(#loc147) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc147) + } loc(#loc138) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_False__(34,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc148)), %arg_K: !tt.ptr loc("arg_K"(#loc148)), %arg_V: !tt.ptr loc("arg_V"(#loc148)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc148)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc148)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc148)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc148)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc148)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc148)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc148)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc148)), %q: tensor<128x128xbf16> loc("q"(#loc148)), %K: !tt.ptr loc("K"(#loc148)), %V: !tt.ptr loc("V"(#loc148)), %Q_LEN: i32 loc("Q_LEN"(#loc148)), %KV_LEN: i32 loc("KV_LEN"(#loc148)), %acc: tensor<128x128xf32> loc("acc"(#loc148)), %l_i: tensor<128xf32> loc("l_i"(#loc148)), %m_i: tensor<128xf32> loc("m_i"(#loc148)), %off_z: i32 loc("off_z"(#loc148)), %off_h: i32 loc("off_h"(#loc148)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc148)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc148)), %kv_start: i32 loc("kv_start"(#loc148)), %kv_offset: i32 loc("kv_offset"(#loc148)), %stride_kk: i32 loc("stride_kk"(#loc148)), %stride_kn: i32 loc("stride_kn"(#loc148)), %stride_vn: i32 loc("stride_vn"(#loc148)), %stride_vk: i32 loc("stride_vk"(#loc148))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc445) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc446) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc447) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc448) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc448) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc449) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc450) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc451) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc451) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc451) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc452) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc452) + %m = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%offs_m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc453) + %n = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%offs_n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc454) + %tmp1 = arith.constant false loc(#loc455) + %tmp1_9 = arith.constant dense : tensor<1xi1> loc(#loc455) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc456) + %tmp4_10 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc456) + %tmp4_11 = arith.cmpi sge, %tmp4, %tmp4_10 : tensor<128x64xi32> loc(#loc456) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc457) + %tmp7 = tt.addptr %in_ptr9, %off_z : !tt.ptr, i32 loc(#loc458) + %tmp7_12 = tt.load %tmp7 : !tt.ptr loc(#loc459) + %tmp8 = tt.splat %tmp7_12 : i64 -> tensor<1x64xi64> loc(#loc460) + %tmp8_13 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc460) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc461) + %tmp10 = tt.splat %tmp7_12 : i64 -> tensor<128x1xi64> loc(#loc462) + %tmp10_14 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc462) + %tmp11 = tt.broadcast %tmp8_13 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc463) + %tmp11_15 = tt.broadcast %tmp10_14 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc463) + %tmp11_16 = arith.andi %tmp11, %tmp11_15 : tensor<128x64xi1> loc(#loc463) + %tmp12 = arith.andi %tmp4_11, %tmp11_16 : tensor<128x64xi1> loc(#loc464) + %tmp13 = tt.expand_dims %tmp1_9 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc465) + %tmp13_17 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc465) + %tmp13_18 = arith.ori %tmp13_17, %tmp12 : tensor<128x64xi1> loc(#loc465) + %tmp14 = arith.constant 2048 : i32 loc(#loc466) + %tmp14_19 = arith.constant dense<2048> : tensor<1xi32> loc(#loc466) + %tmp15 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc467) + %tmp15_20 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc467) + %tmp15_21 = arith.cmpi sge, %n, %tmp15_20 : tensor<1x64xi32> loc(#loc467) + %tmp16 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc468) + %tmp16_22 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc468) + %tmp16_23 = arith.remsi %n, %tmp16_22 : tensor<1x64xi32> loc(#loc468) + %tmp17 = arith.constant 0 : i32 loc(#loc469) + %tmp17_24 = arith.constant dense<0> : tensor<1xi32> loc(#loc469) + %tmp18 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc470) + %tmp18_25 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc470) + %tmp18_26 = arith.cmpi ne, %tmp16_23, %tmp18_25 : tensor<1x64xi32> loc(#loc470) + %tmp19 = arith.constant 0 : i32 loc(#loc471) + %tmp19_27 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc471) + %tmp19_28 = arith.cmpi slt, %tmp16_23, %tmp19_27 : tensor<1x64xi32> loc(#loc471) + %tmp20 = arith.constant 0 : i32 loc(#loc472) + %tmp20_29 = arith.constant dense<0> : tensor<1xi32> loc(#loc472) + %tmp20_30 = arith.cmpi slt, %tmp14_19, %tmp20_29 : tensor<1xi32> loc(#loc472) + %tmp21 = tt.expand_dims %tmp20_30 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc473) + %tmp21_31 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc473) + %tmp21_32 = arith.cmpi ne, %tmp19_28, %tmp21_31 : tensor<1x64xi1> loc(#loc473) + %tmp22 = arith.andi %tmp18_26, %tmp21_32 : tensor<1x64xi1> loc(#loc474) + %tmp23 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc475) + %tmp23_33 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc475) + %tmp23_34 = arith.addi %tmp16_23, %tmp23_33 : tensor<1x64xi32> loc(#loc475) + %tmp24 = arith.select %tmp22, %tmp23_34, %tmp16_23 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc476) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc477) + %tmp26 = tt.splat %tmp7_12 : i64 -> tensor<1x64xi64> loc(#loc478) + %tmp26_35 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc478) + %tmp27 = arith.andi %tmp15_21, %tmp26_35 : tensor<1x64xi1> loc(#loc479) + %tmp28 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc480) + %tmp28_36 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc480) + %tmp28_37 = arith.subi %tmp28, %tmp28_36 : tensor<128x64xi32> loc(#loc480) + %tmp29 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc481) + %tmp29_38 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc481) + %tmp29_39 = arith.remsi %tmp28_37, %tmp29_38 : tensor<128x64xi32> loc(#loc481) + %tmp30 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc482) + %tmp30_40 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc482) + %tmp30_41 = arith.cmpi ne, %tmp29_39, %tmp30_40 : tensor<128x64xi32> loc(#loc482) + %tmp31 = arith.constant 0 : i32 loc(#loc483) + %tmp31_42 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc483) + %tmp31_43 = arith.cmpi slt, %tmp29_39, %tmp31_42 : tensor<128x64xi32> loc(#loc483) + %tmp32 = tt.expand_dims %tmp20_30 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc484) + %tmp32_44 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc484) + %tmp32_45 = arith.cmpi ne, %tmp31_43, %tmp32_44 : tensor<128x64xi1> loc(#loc484) + %tmp33 = arith.andi %tmp30_41, %tmp32_45 : tensor<128x64xi1> loc(#loc485) + %tmp34 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc486) + %tmp34_46 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc486) + %tmp34_47 = arith.addi %tmp29_39, %tmp34_46 : tensor<128x64xi32> loc(#loc486) + %tmp35 = arith.select %tmp33, %tmp34_47, %tmp29_39 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc487) + %tmp36 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc488) + %tmp36_48 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc488) + %tmp36_49 = arith.cmpi eq, %tmp35, %tmp36_48 : tensor<128x64xi32> loc(#loc488) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc489) + %tmp37_50 = arith.andi %tmp37, %tmp36_49 : tensor<128x64xi1> loc(#loc489) + %tmp38 = arith.ori %tmp13_18, %tmp37_50 : tensor<128x64xi1> loc(#loc490) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc491) + %post_mod_scores_51 = arith.constant 0xFF800000 : f32 loc(#loc491) + %post_mod_scores_52 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc491) + %post_mod_scores_53 = arith.select %tmp38, %qk_8, %post_mod_scores_52 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc491) + %post_mod_scores_54 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_55 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_56 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc492) + %post_mod_scores_57 = arith.mulf %post_mod_scores_53, %post_mod_scores_56 : tensor<128x64xf32> loc(#loc492) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_57) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc493) + %m_ij_58 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc494) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc495) + %masked_out_rows_59 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc495) + %masked_out_rows_60 = arith.cmpf oeq, %m_ij_58, %masked_out_rows_59 : tensor<128xf32> loc(#loc495) + %m_ij_masked = arith.constant 0 : i32 loc(#loc496) + %m_ij_masked_61 = arith.constant 0.000000e+00 : f32 loc(#loc496) + %m_ij_masked_62 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc496) + %m_ij_masked_63 = arith.select %masked_out_rows_60, %m_ij_masked_62, %m_ij_58 : tensor<128xi1>, tensor<128xf32> loc(#loc496) + %alpha = arith.subf %m_i, %m_ij_masked_63 : tensor<128xf32> loc(#loc497) + %alpha_64 = math.exp2 %alpha : tensor<128xf32> loc(#loc498) + %p = tt.expand_dims %m_ij_masked_63 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc499) + %p_65 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc500) + %p_66 = arith.subf %post_mod_scores_57, %p_65 : tensor<128x64xf32> loc(#loc500) + %p_67 = math.exp2 %p_66 : tensor<128x64xf32> loc(#loc501) + %l_i_68 = arith.mulf %l_i, %alpha_64 : tensor<128xf32> loc(#loc502) + %l_i_69 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_67) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc503) + %l_i_70 = arith.addf %l_i_68, %l_i_69 : tensor<128xf32> loc(#loc504) + %acc_71 = tt.expand_dims %alpha_64 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc505) + %acc_72 = tt.broadcast %acc_71 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc506) + %acc_73 = arith.mulf %acc, %acc_72 : tensor<128x128xf32> loc(#loc506) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc507) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc508) + %acc_74 = arith.truncf %p_67 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc509) + %acc_75 = arith.constant 0.000000e+00 : f32 loc(#loc510) + %acc_76 = tt.dot %acc_74, %v, %acc_73, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc510) + tt.return %acc_76, %l_i_70, %m_ij_58 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc215) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc216) + %1 = ub.poison : tensor<128xf32> loc(#loc216) + %2 = ub.poison : tensor<128xf32> loc(#loc216) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc216) + } loc(#loc148) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc122)), %offs_m: tensor<64xi32> loc("offs_m"(#loc122)), %offs_n: tensor<128xi32> loc("offs_n"(#loc122)), %stride_m: i32 loc("stride_m"(#loc122)), %stride_n: i32 loc("stride_n"(#loc122)), %M_LEN: i32 loc("M_LEN"(#loc122))) -> tensor<64x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc373) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<64x1xi32> loc(#loc374) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<64x1xi32> loc(#loc374) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc375) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc375) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc376) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc377) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc377) + %ptr_8 = tt.broadcast %ptr_4 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc378) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc378) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc378) + %0 = tt.load %ptr_10 : tensor<64x128x!tt.ptr> loc(#loc129) + tt.return %0 : tensor<64x128xbf16> loc(#loc130) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x128xbf16> loc(#loc131) + tt.return %1 : tensor<64x128xbf16> loc(#loc131) + } loc(#loc122) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%indices: tensor<128x1xi32> loc("indices"(#loc217))) -> tensor<128x1xi32> attributes {noinline = false} { + tt.return %indices : tensor<128x1xi32> loc(#loc218) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x1xi32> loc(#loc219) + tt.return %0 : tensor<128x1xi32> loc(#loc219) + } loc(#loc217) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%indices: tensor<1x64xi32> loc("indices"(#loc217))) -> tensor<1x64xi32> attributes {noinline = false} { + tt.return %indices : tensor<1x64xi32> loc(#loc218) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x64xi32> loc(#loc219) + tt.return %0 : tensor<1x64xi32> loc(#loc219) + } loc(#loc217) + tt.func private @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<128x64xf32> loc("input"(#loc220))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc221) + tt.reduce.return %2 : f32 loc(#loc221) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc221) + tt.return %0 : tensor<128xf32> loc(#loc223) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc224) + tt.return %1 : tensor<128xf32> loc(#loc224) + } loc(#loc220) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc225)), %b: f32 loc("b"(#loc225))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc226) + tt.return %0 : f32 loc(#loc227) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc228) + tt.return %1 : f32 loc(#loc228) + } loc(#loc225) + tt.func private @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x64xf32> loc("input"(#loc229))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc230) + tt.reduce.return %2 : f32 loc(#loc230) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc230) + tt.return %0 : tensor<128xf32> loc(#loc231) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc232) + tt.return %1 : tensor<128xf32> loc(#loc232) + } loc(#loc229) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc233)), %b: f32 loc("b"(#loc233))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc234) + tt.return %0 : f32 loc(#loc235) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc236) + tt.return %1 : f32 loc(#loc236) + } loc(#loc233) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc237)), %col_indices: !tt.ptr loc("col_indices"(#loc237)), %total_blocks: i32 loc("total_blocks"(#loc237))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc521) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc521) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc521) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc522) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc523) + %next_block = arith.constant 1 : i32 loc(#loc524) + %next_block_3 = arith.constant 1 : i32 loc(#loc524) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc524) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc525) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc526) + %next_block_7 = arith.constant 1 : i32 loc(#loc527) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc527) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc528) + %needs_jump = arith.constant 1 : i32 loc(#loc529) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc529) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc529) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc530) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc530) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc530) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc531) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc531) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc532) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc533) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc533) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc533) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc534) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc534) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc534) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc535) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc535) + %offset_24 = arith.constant 1 : i32 loc(#loc536) + %offset_25 = arith.constant 1 : i32 loc(#loc536) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc536) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc536) + %offset_28 = arith.constant 64 : i32 loc(#loc537) + %offset_29 = arith.constant 64 : i32 loc(#loc537) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc537) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc538) + tt.return %offset_31 : i32 loc(#loc256) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc257) + tt.return %0 : i32 loc(#loc257) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc138)), %arg_K: !tt.ptr loc("arg_K"(#loc138)), %arg_V: !tt.ptr loc("arg_V"(#loc138)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc138)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc138)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc138)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc138)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc138)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc138)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc138)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc138)), %q: tensor<128x128xbf16> loc("q"(#loc138)), %K: !tt.ptr loc("K"(#loc138)), %V: !tt.ptr loc("V"(#loc138)), %Q_LEN: i32 loc("Q_LEN"(#loc138)), %KV_LEN: i32 loc("KV_LEN"(#loc138)), %acc: tensor<128x128xf32> loc("acc"(#loc138)), %l_i: tensor<128xf32> loc("l_i"(#loc138)), %m_i: tensor<128xf32> loc("m_i"(#loc138)), %off_z: i32 loc("off_z"(#loc138)), %off_h: i32 loc("off_h"(#loc138)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc138)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc138)), %kv_start: i32 loc("kv_start"(#loc138)), %kv_indices: !tt.ptr loc("kv_indices"(#loc138)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc138)), %block_n_end: i32 loc("block_n_end"(#loc138)), %stride_kk: i32 loc("stride_kk"(#loc138)), %stride_kn: i32 loc("stride_kn"(#loc138)), %stride_vn: i32 loc("stride_vn"(#loc138)), %stride_vk: i32 loc("stride_vk"(#loc138))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc411) + %c0_i32 = arith.constant 0 : i32 loc(#loc140) + %c1_i32 = arith.constant 1 : i32 loc(#loc140) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc140) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc140) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc140) + %3 = ub.poison : i32 loc(#loc140) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_True__(34,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc141) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc413) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc414) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc414) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc415) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc145) + } loc(#loc542) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc146) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc147) + %5 = ub.poison : tensor<128xf32> loc(#loc147) + %6 = ub.poison : tensor<128xf32> loc(#loc147) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc147) + } loc(#loc138) + tt.func private @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_True__(34,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc148)), %arg_K: !tt.ptr loc("arg_K"(#loc148)), %arg_V: !tt.ptr loc("arg_V"(#loc148)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc148)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc148)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc148)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc148)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc148)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc148)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc148)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc148)), %q: tensor<128x128xbf16> loc("q"(#loc148)), %K: !tt.ptr loc("K"(#loc148)), %V: !tt.ptr loc("V"(#loc148)), %Q_LEN: i32 loc("Q_LEN"(#loc148)), %KV_LEN: i32 loc("KV_LEN"(#loc148)), %acc: tensor<128x128xf32> loc("acc"(#loc148)), %l_i: tensor<128xf32> loc("l_i"(#loc148)), %m_i: tensor<128xf32> loc("m_i"(#loc148)), %off_z: i32 loc("off_z"(#loc148)), %off_h: i32 loc("off_h"(#loc148)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc148)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc148)), %kv_start: i32 loc("kv_start"(#loc148)), %kv_offset: i32 loc("kv_offset"(#loc148)), %stride_kk: i32 loc("stride_kk"(#loc148)), %stride_kn: i32 loc("stride_kn"(#loc148)), %stride_vn: i32 loc("stride_vn"(#loc148)), %stride_vk: i32 loc("stride_vk"(#loc148))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc445) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc446) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc447) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc448) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc448) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc449) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc450) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc451) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc451) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc451) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc452) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc452) + %m = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%offs_m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc453) + %n = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%offs_n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc454) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_9 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_10 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc492) + %post_mod_scores_11 = arith.mulf %qk_8, %post_mod_scores_10 : tensor<128x64xf32> loc(#loc492) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_11) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc493) + %m_ij_12 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc494) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc495) + %masked_out_rows_13 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc495) + %masked_out_rows_14 = arith.cmpf oeq, %m_ij_12, %masked_out_rows_13 : tensor<128xf32> loc(#loc495) + %m_ij_masked = arith.constant 0 : i32 loc(#loc496) + %m_ij_masked_15 = arith.constant 0.000000e+00 : f32 loc(#loc496) + %m_ij_masked_16 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc496) + %m_ij_masked_17 = arith.select %masked_out_rows_14, %m_ij_masked_16, %m_ij_12 : tensor<128xi1>, tensor<128xf32> loc(#loc496) + %alpha = arith.subf %m_i, %m_ij_masked_17 : tensor<128xf32> loc(#loc497) + %alpha_18 = math.exp2 %alpha : tensor<128xf32> loc(#loc498) + %p = tt.expand_dims %m_ij_masked_17 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc499) + %p_19 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc500) + %p_20 = arith.subf %post_mod_scores_11, %p_19 : tensor<128x64xf32> loc(#loc500) + %p_21 = math.exp2 %p_20 : tensor<128x64xf32> loc(#loc501) + %l_i_22 = arith.mulf %l_i, %alpha_18 : tensor<128xf32> loc(#loc502) + %l_i_23 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_21) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc503) + %l_i_24 = arith.addf %l_i_22, %l_i_23 : tensor<128xf32> loc(#loc504) + %acc_25 = tt.expand_dims %alpha_18 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc505) + %acc_26 = tt.broadcast %acc_25 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc506) + %acc_27 = arith.mulf %acc, %acc_26 : tensor<128x128xf32> loc(#loc506) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc507) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc508) + %acc_28 = arith.truncf %p_21 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc509) + %acc_29 = arith.constant 0.000000e+00 : f32 loc(#loc510) + %acc_30 = tt.dot %acc_28, %v, %acc_27, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc510) + tt.return %acc_30, %l_i_24, %m_ij_12 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc215) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc216) + %1 = ub.poison : tensor<128xf32> loc(#loc216) + %2 = ub.poison : tensor<128xf32> loc(#loc216) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc216) + } loc(#loc148) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":85:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":86:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":87:49) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":89:9) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":90:9) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":91:12) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":92:10) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":93:13) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":97:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":98:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":99:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":103:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":104:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":105:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:24) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:36) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:25) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":109:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":109:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":109:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":111:12) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":112:12) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":113:12) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":120:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":121:16) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":123:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":124:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":129:27) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":130:22) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":131:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":134:19) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":134:50) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":135:19) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":136:19) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":138:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":138:46) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":138:33) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":141:38) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":141:50) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:51) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:85) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:74) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:46) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:76) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:97) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:64) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:33) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":145:26) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":146:101) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":151:26) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:37) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:42) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:28) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:45) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:92) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:102) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:65) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":167:31) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":167:48) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":172:41) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":181:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:27) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:41) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:51) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:32) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:96) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:106) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:69) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":186:41) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":186:28) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":193:35) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":193:52) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":198:45) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:26) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:34) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:16) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":209:27) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":210:27) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":211:19) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":212:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":212:45) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:20) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:38) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:30) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:25) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:21) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:40) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:33) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:57) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":217:49) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:53) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:49) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:67) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:62) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:83) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:75) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:109) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:26) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:31) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:32) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:23) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:40) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:33) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:20) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":225:29) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":229:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:27) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:38) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:20) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:56) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:67) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:49) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":294:23) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":294:15) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":287:4) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":499:16) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":502:40) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":518:16) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":545:63) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":548:26) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":549:21) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":549:8) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":552:11) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":552:4) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":342:32) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":345:26) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":346:48) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":346:35) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":347:107) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":349:17) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":351:19) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":353:14) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":358:36) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":359:36) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":370:35) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":373:23) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":374:23) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:33) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:23) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":377:22) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":378:23) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":379:23) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":380:23) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":381:23) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":382:23) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":383:35) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":384:24) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":385:24) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":386:32) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":387:25) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":388:92) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":389:92) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":390:25) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":391:24) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":392:24) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":393:39) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":394:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":395:24) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":396:24) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":397:23) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":398:25) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":399:25) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":400:92) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":401:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":402:24) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":403:24) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":404:39) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":405:25) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":406:24) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":407:24) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":414:69) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":417:27) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:51) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:27) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":423:35) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":424:51) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:31) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:25) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:51) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:39) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:21) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:16) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:34) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:24) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:22) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:16) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":438:26) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":439:107) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:22) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:44) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":445:11) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":445:4) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":257:11) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":257:4) +#loc221 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc223 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc224 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc226 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc227 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc228 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc232 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":247:33) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:38) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:24) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:109) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:113) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:39) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:55) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:25) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:30) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:35) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:60) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:34) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:48) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:63) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:29) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:47) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:61) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:42) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":253:11) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":253:4) +#loc269 = loc("ZQ"(#loc4)) +#loc270 = loc("HQ"(#loc5)) +#loc271 = loc("Q_LEN"(#loc6)) +#loc272 = loc("ZKV"(#loc7)) +#loc273 = loc("KV_LEN"(#loc8)) +#loc274 = loc("q_start"(#loc9)) +#loc275 = loc("off_zq"(#loc10)) +#loc276 = loc("off_hq"(#loc11)) +#loc277 = loc("off_zkv"(#loc12)) +#loc278 = loc("off_hkv"(#loc13)) +#loc279 = loc("off_g"(#loc14)) +#loc280 = loc("q_offset"(#loc15)) +#loc281 = loc("q_offset"(#loc16)) +#loc282 = loc("q_offset"(#loc17)) +#loc283 = loc("k_offset"(#loc18)) +#loc284 = loc("k_offset"(#loc19)) +#loc285 = loc("k_offset"(#loc20)) +#loc286 = loc("v_offset"(#loc21)) +#loc287 = loc("v_offset"(#loc22)) +#loc288 = loc("v_offset"(#loc23)) +#loc289 = loc("Q"(#loc24)) +#loc290 = loc("K"(#loc25)) +#loc291 = loc("V"(#loc26)) +#loc292 = loc("SPARSE_Z"(#loc27)) +#loc293 = loc("SPARSE_HQ"(#loc28)) +#loc294 = loc("sparse_idx_z"(#loc29)) +#loc295 = loc("sparse_idx_hq"(#loc30)) +#loc296 = loc("stride_kv_num_blks_h"(#loc31)) +#loc297 = loc("stride_kv_idx_h"(#loc32)) +#loc298 = loc("stride_kv_idx_m"(#loc33)) +#loc299 = loc("m_i"(#loc34)) +#loc300 = loc("m_i"(#loc35)) +#loc301 = loc("l_i"(#loc36)) +#loc302 = loc("acc"(#loc37)) +#loc303 = loc("offs_m"(#loc38)) +#loc304 = loc("offs_m"(#loc39)) +#loc305 = loc("offs_m"(#loc40)) +#loc306 = loc("sparse_hz_offset"(#loc41)) +#loc307 = loc("sparse_hz_offset"(#loc42)) +#loc308 = loc("sparse_kv_num_blks_offset"(#loc43)) +#loc309 = loc("sparse_kv_num_blks_offset"(#loc44)) +#loc310 = loc("sparse_kv_num_blks_offset"(#loc45)) +#loc311 = loc("sparse_kv_idx_offset"(#loc46)) +#loc312 = loc("sparse_kv_idx_offset"(#loc47)) +#loc313 = loc("sparse_kv_idx_offset"(#loc48)) +#loc314 = loc("sparse_kv_idx_offset"(#loc49)) +#loc315 = loc("offs_m"(#loc50)) +#loc316 = loc("offs_m"(#loc51)) +#loc317 = loc("offs_m"(#loc52)) +#loc318 = loc("offs_k"(#loc53)) +#loc319 = loc("q"(#loc54)) +#loc320 = loc("kv_indices"(#loc55)) +#loc321 = loc("kv_start"(#loc56)) +#loc322 = loc("kv_start"(#loc57)) +#loc323 = loc("kv_num_blocks"(#loc58)) +#loc324 = loc("kv_num_blocks"(#loc59)) +#loc325 = loc("block_n_end"(#loc60)) +#loc326 = loc("block_n_end"(#loc61)) +#loc327 = loc("block_n_end"(#loc62)) +#loc328 = loc("block_n_end"(#loc63)) +#loc329 = loc("offs_n"(#loc64)) +#loc330 = loc("offs_n"(#loc65)) +#loc331 = loc("kv_indices"(#loc69)) +#loc332 = loc("kv_start"(#loc70)) +#loc333 = loc("kv_start"(#loc71)) +#loc334 = loc("kv_num_blocks"(#loc72)) +#loc335 = loc("kv_num_blocks"(#loc73)) +#loc336 = loc("block_n_end"(#loc74)) +#loc337 = loc("block_n_end"(#loc75)) +#loc338 = loc("block_n_end"(#loc76)) +#loc339 = loc("block_n_end"(#loc77)) +#loc340 = loc("offs_n"(#loc78)) +#loc341 = loc("offs_n"(#loc79)) +#loc342 = loc("l_i"(#loc83)) +#loc343 = loc("l_i"(#loc84)) +#loc344 = loc("acc"(#loc85)) +#loc345 = loc("acc"(#loc86)) +#loc346 = loc("idx_zq"(#loc87)) +#loc347 = loc("idx_hq"(#loc88)) +#loc348 = loc("idx_m"(#loc89)) +#loc349 = loc("idx_d"(#loc90)) +#loc350 = loc("idx_d"(#loc91)) +#loc351 = loc("mask"(#loc92)) +#loc352 = loc("mask"(#loc93)) +#loc353 = loc("mask"(#loc94)) +#loc354 = loc("xindex"(#loc95)) +#loc355 = loc("xindex"(#loc96)) +#loc356 = loc("xindex"(#loc97)) +#loc357 = loc("xindex"(#loc98)) +#loc358 = loc("xindex"(#loc99)) +#loc359 = loc("xindex"(#loc100)) +#loc360 = loc("off_hz"(#loc109)) +#loc361 = loc("off_hz"(#loc110)) +#loc362 = loc("l_ptrs"(#loc111)) +#loc363 = loc("l_ptrs"(#loc112)) +#loc364 = loc("l_ptrs"(#loc113)) +#loc365 = loc("lse"(#loc114)) +#loc366 = loc("lse"(#loc115)) +#loc373 = loc("ptr"(#loc123)) +#loc374 = loc("ptr"(#loc124)) +#loc375 = loc("ptr"(#loc125)) +#loc376 = loc("ptr"(#loc126)) +#loc377 = loc("ptr"(#loc127)) +#loc378 = loc("ptr"(#loc128)) +#loc411 = loc("kv_offset"(#loc139)) +#loc412 = loc("acc"(#loc140)) +#loc413 = loc("offset"(#loc142)) +#loc414 = loc("offs_n"(#loc143)) +#loc415 = loc("kv_offset"(#loc144)) +#loc445 = loc("kv_base_offset"(#loc149)) +#loc446 = loc("offs_k"(#loc150)) +#loc447 = loc("offs_n_load"(#loc151)) +#loc448 = loc("offs_n_load"(#loc152)) +#loc449 = loc("k"(#loc153)) +#loc450 = loc("k"(#loc154)) +#loc451 = loc("qk"(#loc155)) +#loc452 = loc("qk"(#loc156)) +#loc453 = loc("m"(#loc157)) +#loc454 = loc("n"(#loc158)) +#loc455 = loc("tmp1"(#loc159)) +#loc456 = loc("tmp4"(#loc160)) +#loc457 = loc("tmp5"(#loc161)) +#loc458 = loc("tmp7"(#loc162)) +#loc459 = loc("tmp7"(#loc163)) +#loc460 = loc("tmp8"(#loc164)) +#loc461 = loc("tmp9"(#loc165)) +#loc462 = loc("tmp10"(#loc166)) +#loc463 = loc("tmp11"(#loc167)) +#loc464 = loc("tmp12"(#loc168)) +#loc465 = loc("tmp13"(#loc169)) +#loc466 = loc("tmp14"(#loc170)) +#loc467 = loc("tmp15"(#loc171)) +#loc468 = loc("tmp16"(#loc172)) +#loc469 = loc("tmp17"(#loc173)) +#loc470 = loc("tmp18"(#loc174)) +#loc471 = loc("tmp19"(#loc175)) +#loc472 = loc("tmp20"(#loc176)) +#loc473 = loc("tmp21"(#loc177)) +#loc474 = loc("tmp22"(#loc178)) +#loc475 = loc("tmp23"(#loc179)) +#loc476 = loc("tmp24"(#loc180)) +#loc477 = loc("tmp25"(#loc181)) +#loc478 = loc("tmp26"(#loc182)) +#loc479 = loc("tmp27"(#loc183)) +#loc480 = loc("tmp28"(#loc184)) +#loc481 = loc("tmp29"(#loc185)) +#loc482 = loc("tmp30"(#loc186)) +#loc483 = loc("tmp31"(#loc187)) +#loc484 = loc("tmp32"(#loc188)) +#loc485 = loc("tmp33"(#loc189)) +#loc486 = loc("tmp34"(#loc190)) +#loc487 = loc("tmp35"(#loc191)) +#loc488 = loc("tmp36"(#loc192)) +#loc489 = loc("tmp37"(#loc193)) +#loc490 = loc("tmp38"(#loc194)) +#loc491 = loc("post_mod_scores"(#loc195)) +#loc492 = loc("post_mod_scores"(#loc196)) +#loc493 = loc("m_ij"(#loc197)) +#loc494 = loc("m_ij"(#loc198)) +#loc495 = loc("masked_out_rows"(#loc199)) +#loc496 = loc("m_ij_masked"(#loc200)) +#loc497 = loc("alpha"(#loc201)) +#loc498 = loc("alpha"(#loc202)) +#loc499 = loc("p"(#loc203)) +#loc500 = loc("p"(#loc204)) +#loc501 = loc("p"(#loc205)) +#loc502 = loc("l_i"(#loc206)) +#loc503 = loc("l_i"(#loc207)) +#loc504 = loc("l_i"(#loc208)) +#loc505 = loc("acc"(#loc209)) +#loc506 = loc("acc"(#loc210)) +#loc507 = loc("offs_v"(#loc211)) +#loc508 = loc("v"(#loc212)) +#loc509 = loc("acc"(#loc213)) +#loc510 = loc("acc"(#loc214)) +#loc521 = loc("cur_block_idx"(#loc238)) +#loc522 = loc("cur_block"(#loc239)) +#loc523 = loc("cur_block"(#loc240)) +#loc524 = loc("next_block"(#loc241)) +#loc525 = loc("next_block"(#loc242)) +#loc526 = loc("next_block"(#loc243)) +#loc527 = loc("next_block"(#loc244)) +#loc528 = loc("next_block"(#loc245)) +#loc529 = loc("needs_jump"(#loc246)) +#loc530 = loc("needs_jump"(#loc247)) +#loc531 = loc("needs_jump"(#loc248)) +#loc532 = loc("jump_to_block"(#loc249)) +#loc533 = loc("jump_to_block"(#loc250)) +#loc534 = loc("jump_to_block"(#loc251)) +#loc535 = loc("offset"(#loc252)) +#loc536 = loc("offset"(#loc253)) +#loc537 = loc("offset"(#loc254)) +#loc538 = loc("offset"(#loc255)) +#loc539 = loc("l_i"(#loc412)) +#loc540 = loc("m_i"(#loc539)) +#loc541 = loc("offs_n"(#loc540)) +#loc542 = loc("kv_offset"(#loc541)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..485f57000d5b45889b2509d62fe8a6ac976d262c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttgir @@ -0,0 +1,896 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":18:0) +#loc1 = loc(unknown) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":518:16) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":172:41) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:51) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:34) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":198:45) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#smem = #ttg.shared_memory +#loc151 = loc("arg_Q"(#loc)) +#loc152 = loc("arg_K"(#loc)) +#loc153 = loc("arg_V"(#loc)) +#loc154 = loc("arg_LSE"(#loc)) +#loc155 = loc("arg_MAX"(#loc)) +#loc156 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc157 = loc("arg_KV_IDX"(#loc)) +#loc158 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc159 = loc("arg_FULL_KV_IDX"(#loc)) +#loc160 = loc("in_ptr9"(#loc)) +#loc161 = loc("out_ptr0"(#loc)) +#loc200 = loc(callsite(#loc42 at #loc43)) +#loc239 = loc("m_ij"(#loc83)) +#loc249 = loc("l_i"(#loc95)) +#loc282 = loc(callsite(#loc42 at #loc129)) +#loc342 = loc(callsite(#loc239 at #loc200)) +#loc352 = loc(callsite(#loc249 at #loc200)) +#loc369 = loc(callsite(#loc239 at #loc282)) +#loc379 = loc(callsite(#loc249 at #loc282)) +#loc397 = loc(callsite(#loc1 at #loc342)) +#loc399 = loc(callsite(#loc1 at #loc352)) +#loc426 = loc(callsite(#loc1 at #loc369)) +#loc428 = loc(callsite(#loc1 at #loc379)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_4 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_9 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_13 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_14 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %q_start = tt.get_program_id x : i32 loc(#loc162) + %off_zq = tt.get_program_id y : i32 loc(#loc163) + %off_hq = tt.get_program_id z : i32 loc(#loc164) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc165) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc166) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc167) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc168) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc169) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc170) + %k_offset_17 = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc171) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc172) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc173) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc174) + %V = tt.addptr %arg_V, %k_offset_18 : !tt.ptr, i32 loc(#loc175) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc176) + %sparse_kv_num_blks_offset_19 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc177) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc178) + %sparse_kv_idx_offset_20 = arith.muli %q_start, %c16_i32 : i32 loc(#loc179) + %sparse_kv_idx_offset_21 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_20 : i32 loc(#loc180) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc181) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc182) + %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc182) + %offs_m_24 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc182) + %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc183) + %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc183) + %offs_m_27 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc183) + %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc183) + %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc183) + %offs_m_30 = arith.addi %offs_m_27, %offs_m_24 : tensor<128xi32, #blocked1> loc(#loc183) + %ptr = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297) + %ptr_31 = tt.expand_dims %offs_m_29 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297) + %ptr_32 = arith.muli %ptr, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc298) + %ptr_33 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc299) + %ptr_34 = tt.addptr %ptr_33, %ptr_32 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299) + %ptr_35 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300) + %ptr_36 = tt.expand_dims %ptr_35 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300) + %ptr_37 = tt.broadcast %ptr_34 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc301) + %ptr_38 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301) + %ptr_39 = tt.addptr %ptr_37, %ptr_38 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301) + %q = tt.load %ptr_39 : tensor<128x128x!tt.ptr, #blocked> loc(#loc302) + %q_40 = ttg.local_alloc %q : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc302) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc190) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc191) + %kv_start_41 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc192) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc193) + %kv_num_blocks_42 = tt.load %kv_num_blocks : !tt.ptr loc(#loc194) + %block_n_end = arith.muli %kv_num_blocks_42, %c2_i32 : i32 loc(#loc195) + %block_n_end_43 = arith.minsi %block_n_end, %c32_i32 : i32 loc(#loc196) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197) + %offs_n_44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197) + %offs_n_45 = tt.splat %kv_start_41 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198) + %offs_n_46 = arith.addi %offs_n_45, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198) + %0 = tt.expand_dims %offs_n_46 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc40) + %ptr_47 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc387) + %ptr_48 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc388) + %tmp4 = tt.broadcast %ptr_31 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc304) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc305) + %kv_offset = arith.cmpi sgt, %block_n_end_43, %c0_i32 : i32 loc(#loc452) + %tmp7_49 = tt.load %tmp7, %kv_offset : !tt.ptr loc(#loc307) + %tmp8 = tt.splat %tmp7_49 : i64 -> tensor<1x64xi64, #mma> loc(#loc308) + %tmp9 = arith.extsi %ptr_31 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc309) + %tmp10 = tt.splat %tmp7_49 : i64 -> tensor<128x1xi64, #mma> loc(#loc310) + %tmp10_50 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc310) + %tmp11 = tt.broadcast %tmp10_50 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311) + %ptr_51 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc390) + %k = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc391) + %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc392) + %offs_n_load = tt.splat %kv_start_41 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_52 = arith.addi %offs_n_load, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_53 = tt.expand_dims %offs_n_load_52 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_54 = arith.muli %ptr_53, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_55 = tt.addptr %ptr_47, %ptr_54 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_56 = tt.broadcast %ptr_55 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_57 = tt.addptr %ptr_56, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_58 = ttg.memdesc_index %k[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_59 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_60 = ttg.async_copy_global_to_local %ptr_57, %k_58 mask %kv_offset_59 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_61 = ttg.async_commit_group tokens %k_60 loc(#loc391) + %ptr_62 = tt.addptr %ptr_51, %ptr_54 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_63 = tt.broadcast %ptr_62 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_64 = tt.addptr %ptr_63, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_65 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_66 = ttg.async_copy_global_to_local %ptr_64, %v_65 mask %kv_offset_59 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_67 = ttg.async_commit_group tokens %v_66 loc(#loc392) + %kv_offset_68 = arith.cmpi sgt, %block_n_end_43, %c1_i32 : i32 loc(#loc452) + %kv_base_offset = arith.addi %kv_start_41, %c64_i32 : i32 loc(#loc314) + %offs_n_load_69 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_70 = arith.addi %offs_n_load_69, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_72 = arith.muli %ptr_71, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_73 = tt.addptr %ptr_47, %ptr_72 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_74 = tt.broadcast %ptr_73 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_75 = tt.addptr %ptr_74, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_76 = ttg.memdesc_index %k[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_77 = tt.splat %kv_offset_68 : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_78 = ttg.async_copy_global_to_local %ptr_75, %k_76 mask %kv_offset_77 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_79 = ttg.async_commit_group tokens %k_78 loc(#loc391) + %ptr_80 = tt.addptr %ptr_51, %ptr_72 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_81 = tt.broadcast %ptr_80 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_82 = tt.addptr %ptr_81, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_83 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_84 = ttg.async_copy_global_to_local %ptr_82, %v_83 mask %kv_offset_77 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_85 = ttg.async_commit_group tokens %v_84 loc(#loc392) + ttng.fence_async_shared {bCluster = false} loc(#loc315) + %kv_offset_86:12 = scf.for %kv_offset_151 = %c0_i32 to %block_n_end_43 step %c1_i32 iter_args(%acc_152 = %cst_10, %arg13 = %cst_8, %arg14 = %cst_14, %arg15 = %c64_i32, %arg16 = %0, %arg17 = %c1_i32, %arg18 = %c-1_i32, %k_153 = %k_61, %k_154 = %k_79, %v_155 = %v_67, %v_156 = %v_85, %arg23 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_157 = arith.subi %block_n_end_43, %c2_i32 : i32 loc(#loc452) + %kv_offset_158 = arith.cmpi slt, %kv_offset_151, %kv_offset_157 : i32 loc(#loc452) + %kv_offset_159 = arith.subi %block_n_end_43, %c1_i32 : i32 loc(#loc452) + %kv_offset_160 = arith.cmpi slt, %kv_offset_151, %kv_offset_159 : i32 loc(#loc452) + %kv_offset_161 = arith.addi %arg18, %c1_i32 : i32 loc(#loc452) + %kv_offset_162 = arith.cmpi sge, %kv_offset_161, %c3_i32 : i32 loc(#loc452) + %kv_offset_163 = arith.select %kv_offset_162, %c0_i32, %kv_offset_161 : i32 loc(#loc452) + %k_164 = ttg.async_wait %k_153, %v_155 {num = 2 : i32} loc(#loc391) + %k_165 = ttg.memdesc_index %k[%kv_offset_163] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_166 = ttg.memdesc_trans %k_165 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc316) + %qk = ttng.warp_group_dot %q_40, %k_166, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc315) + %qk_167:4 = ttng.warp_group_dot_wait %qk, %q_40, %k_166, %acc_152 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc315) + %qk_168 = arith.mulf %qk_167#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc317) + %tmp4_169 = tt.broadcast %arg16 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc304) + %tmp4_170 = arith.cmpi sge, %tmp4, %tmp4_169 : tensor<128x64xi32, #mma> loc(#loc304) + %tmp5 = arith.extsi %arg16 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc318) + %tmp8_171 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc308) + %tmp11_172 = tt.broadcast %tmp8_171 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311) + %tmp11_173 = arith.andi %tmp11_172, %tmp11 : tensor<128x64xi1, #mma> loc(#loc311) + %tmp12 = arith.andi %tmp4_170, %tmp11_173 : tensor<128x64xi1, #mma> loc(#loc319) + %tmp15 = arith.cmpi sge, %arg16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc320) + %tmp16 = arith.remsi %arg16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc321) + %tmp18 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc322) + %tmp19 = arith.cmpi slt, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc323) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc324) + %tmp23 = arith.addi %tmp16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc325) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc326) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc327) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc328) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc329) + %tmp28 = arith.subi %tmp4_169, %tmp4 : tensor<128x64xi32, #mma> loc(#loc330) + %tmp29 = arith.remsi %tmp28, %cst_1 : tensor<128x64xi32, #mma> loc(#loc331) + %tmp30 = arith.cmpi ne, %tmp29, %cst_0 : tensor<128x64xi32, #mma> loc(#loc332) + %tmp31 = arith.cmpi slt, %tmp29, %cst_0 : tensor<128x64xi32, #mma> loc(#loc333) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc334) + %tmp34 = arith.addi %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc335) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc336) + %tmp36 = arith.cmpi eq, %tmp35, %cst_0 : tensor<128x64xi32, #mma> loc(#loc337) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc338) + %tmp37_174 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc338) + %tmp38 = arith.ori %tmp12, %tmp37_174 : tensor<128x64xi1, #mma> loc(#loc339) + %post_mod_scores = arith.select %tmp38, %qk_168, %cst_12 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc340) + %post_mod_scores_175 = arith.mulf %post_mod_scores, %cst_13 : tensor<128x64xf32, #mma> loc(#loc341) + %m_ij = "tt.reduce"(%post_mod_scores_175) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_230: f32 loc(callsite(#loc1 at #loc342)), %m_ij_231: f32 loc(callsite(#loc1 at #loc342))): + %m_ij_232 = arith.maxnumf %m_ij_230, %m_ij_231 : f32 loc(#loc447) + tt.reduce.return %m_ij_232 : f32 loc(#loc396) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc396) + %m_ij_176 = arith.maxnumf %arg14, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc343) + %masked_out_rows = arith.cmpf oeq, %m_ij_176, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc344) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_176 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc345) + %alpha = arith.subf %arg14, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346) + %alpha_177 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc348) + %p_178 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc349) + %p_179 = arith.subf %post_mod_scores_175, %p_178 : tensor<128x64xf32, #mma> loc(#loc349) + %p_180 = math.exp2 %p_179 : tensor<128x64xf32, #mma> loc(#loc350) + %l_i_181 = arith.mulf %arg13, %alpha_177 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc351) + %l_i_182 = "tt.reduce"(%p_180) <{axis = 1 : i32}> ({ + ^bb0(%l_i_230: f32 loc(callsite(#loc1 at #loc352)), %l_i_231: f32 loc(callsite(#loc1 at #loc352))): + %l_i_232 = arith.addf %l_i_230, %l_i_231 : f32 loc(#loc448) + tt.reduce.return %l_i_232 : f32 loc(#loc398) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc398) + %l_i_183 = arith.addf %l_i_181, %l_i_182 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc353) + %acc_184 = tt.expand_dims %alpha_177 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc354) + %acc_185 = ttg.convert_layout %acc_184 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc355) + %acc_186 = tt.broadcast %acc_185 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc355) + %acc_187 = arith.mulf %qk_167#3, %acc_186 : tensor<128x128xf32, #mma1> loc(#loc355) + %v_188 = ttg.memdesc_index %v[%kv_offset_163] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %acc_189 = arith.truncf %p_180 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc356) + %acc_190 = ttg.convert_layout %acc_189 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc356) + %acc_191 = ttng.warp_group_dot %acc_190, %v_188, %acc_187 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc357) + %offs_n_192 = tt.splat %arg23 : i32 -> tensor<1x64xi32, #mma> loc(#loc358) + %offs_n_193 = arith.addi %arg16, %offs_n_192 : tensor<1x64xi32, #mma> loc(#loc358) + %kv_offset_194 = arith.addi %kv_offset_151, %c1_i32 : i32 loc(#loc452) + %cur_block_idx = arith.divsi %kv_offset_194, %c2_i32 : i32 loc(#loc400) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc401) + %cur_block_195 = tt.load %cur_block, %kv_offset_160 evictionPolicy = evict_last : !tt.ptr loc(#loc402) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc403) + %next_block_196 = arith.cmpi slt, %next_block, %kv_num_blocks_42 : i32 loc(#loc404) + %next_block_197 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc405) + %kv_offset_198 = arith.andi %kv_offset_160, %next_block_196 : i1 loc(#loc452) + %next_block_199 = tt.load %next_block_197, %kv_offset_198 evictionPolicy = evict_last : !tt.ptr loc(#loc406) + %needs_jump = arith.addi %kv_offset_151, %c2_i32 : i32 loc(#loc407) + %needs_jump_200 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc408) + %needs_jump_201 = arith.cmpi eq, %needs_jump_200, %c0_i32 : i32 loc(#loc409) + %jump_to_block = arith.subi %next_block_199, %cur_block_195 : i32 loc(#loc410) + %jump_to_block_202 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc411) + %jump_to_block_203 = arith.subi %jump_to_block_202, %c64_i32 : i32 loc(#loc412) + %offset = arith.extui %needs_jump_201 : i1 to i32 loc(#loc413) + %offset_204 = arith.muli %jump_to_block_203, %offset : i32 loc(#loc413) + %offset_205 = arith.subi %c1_i32, %offset : i32 loc(#loc414) + %offset_206 = arith.muli %offset_205, %c64_i32 : i32 loc(#loc415) + %offset_207 = arith.addi %offset_204, %offset_206 : i32 loc(#loc416) + %kv_offset_208 = arith.addi %arg15, %offset_207 : i32 loc(#loc360) + %kv_offset_209 = arith.addi %arg17, %c1_i32 : i32 loc(#loc452) + %kv_offset_210 = arith.cmpi sge, %kv_offset_209, %c3_i32 : i32 loc(#loc452) + %kv_offset_211 = arith.select %kv_offset_210, %c0_i32, %kv_offset_209 : i32 loc(#loc452) + %kv_base_offset_212 = arith.addi %kv_start_41, %kv_offset_208 : i32 loc(#loc314) + %offs_n_load_213 = tt.splat %kv_base_offset_212 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_214 = arith.addi %offs_n_load_213, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_215 = tt.expand_dims %offs_n_load_214 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_216 = arith.muli %ptr_215, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_217 = tt.addptr %ptr_47, %ptr_216 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_218 = tt.broadcast %ptr_217 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_219 = tt.addptr %ptr_218, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_220 = ttg.memdesc_index %k[%kv_offset_211] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_221 = tt.splat %kv_offset_158 : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_222 = ttg.async_copy_global_to_local %ptr_219, %k_220 mask %kv_offset_221 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_223 = ttg.async_commit_group tokens %k_222 loc(#loc391) + %ptr_224 = tt.addptr %ptr_51, %ptr_216 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_225 = tt.broadcast %ptr_224 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_226 = tt.addptr %ptr_225, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_227 = ttg.memdesc_index %v[%kv_offset_211] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_228 = ttg.async_copy_global_to_local %ptr_226, %v_227 mask %kv_offset_221 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_229 = ttg.async_commit_group tokens %v_228 loc(#loc392) + scf.yield %acc_191, %l_i_183, %m_ij_176, %kv_offset_208, %offs_n_193, %kv_offset_211, %kv_offset_163, %k_154, %k_223, %v_156, %v_229, %offset_207 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc452) + } loc(#loc452) + %kv_offset_87 = ttng.warp_group_dot_wait %kv_offset_86#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc452) + %kv_offset_88 = ttg.async_wait {num = 0 : i32} loc(#loc452) + ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc452) + ttg.local_dealloc %k : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc452) + %kv_indices_89 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc275) + %kv_start_90 = tt.load %kv_indices_89 : !tt.ptr loc(#loc276) + %kv_start_91 = arith.muli %kv_start_90, %c128_i32 : i32 loc(#loc277) + %kv_num_blocks_92 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc278) + %kv_num_blocks_93 = tt.load %kv_num_blocks_92 : !tt.ptr loc(#loc279) + %block_n_end_94 = arith.muli %kv_num_blocks_93, %c2_i32 : i32 loc(#loc280) + %block_n_end_95 = arith.minsi %block_n_end_94, %c32_i32 : i32 loc(#loc281) + %k_96 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc417) + %v_97 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc418) + %kv_offset_98 = arith.cmpi sgt, %block_n_end_95, %c0_i32 : i32 loc(#loc453) + %offs_n_load_99 = tt.splat %kv_start_91 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_100 = arith.addi %offs_n_load_99, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_101 = tt.expand_dims %offs_n_load_100 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_102 = arith.muli %ptr_101, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_103 = tt.addptr %ptr_47, %ptr_102 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_104 = tt.broadcast %ptr_103 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_105 = tt.addptr %ptr_104, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_106 = ttg.memdesc_index %k_96[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_107 = tt.splat %kv_offset_98 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_108 = ttg.async_copy_global_to_local %ptr_105, %k_106 mask %kv_offset_107 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_109 = ttg.async_commit_group tokens %k_108 loc(#loc417) + %ptr_110 = tt.addptr %ptr_51, %ptr_102 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_111 = tt.broadcast %ptr_110 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_112 = tt.addptr %ptr_111, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_113 = ttg.memdesc_index %v_97[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_114 = ttg.async_copy_global_to_local %ptr_112, %v_113 mask %kv_offset_107 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_115 = ttg.async_commit_group tokens %v_114 loc(#loc418) + %kv_offset_116 = arith.cmpi sgt, %block_n_end_95, %c1_i32 : i32 loc(#loc453) + %kv_base_offset_117 = arith.addi %kv_start_91, %c64_i32 : i32 loc(#loc364) + %offs_n_load_118 = tt.splat %kv_base_offset_117 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_119 = arith.addi %offs_n_load_118, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_120 = tt.expand_dims %offs_n_load_119 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_121 = arith.muli %ptr_120, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_122 = tt.addptr %ptr_47, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_123 = tt.broadcast %ptr_122 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_124 = tt.addptr %ptr_123, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_125 = ttg.memdesc_index %k_96[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_126 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_127 = ttg.async_copy_global_to_local %ptr_124, %k_125 mask %kv_offset_126 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_128 = ttg.async_commit_group tokens %k_127 loc(#loc417) + %ptr_129 = tt.addptr %ptr_51, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_130 = tt.broadcast %ptr_129 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_131 = tt.addptr %ptr_130, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_132 = ttg.memdesc_index %v_97[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_133 = ttg.async_copy_global_to_local %ptr_131, %v_132 mask %kv_offset_126 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_134 = ttg.async_commit_group tokens %v_133 loc(#loc418) + ttng.fence_async_shared {bCluster = false} loc(#loc365) + %kv_offset_135:10 = scf.for %kv_offset_151 = %c0_i32 to %block_n_end_95 step %c1_i32 iter_args(%kv_offset_152 = %kv_offset_87, %kv_offset_153 = %kv_offset_86#1, %kv_offset_154 = %kv_offset_86#2, %arg15 = %c64_i32, %arg16 = %c1_i32, %arg17 = %c-1_i32, %k_155 = %k_109, %k_156 = %k_128, %v_157 = %v_115, %v_158 = %v_134) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %kv_offset_159 = arith.subi %block_n_end_95, %c2_i32 : i32 loc(#loc453) + %kv_offset_160 = arith.cmpi slt, %kv_offset_151, %kv_offset_159 : i32 loc(#loc453) + %kv_offset_161 = arith.subi %block_n_end_95, %c1_i32 : i32 loc(#loc453) + %kv_offset_162 = arith.cmpi slt, %kv_offset_151, %kv_offset_161 : i32 loc(#loc453) + %kv_offset_163 = arith.addi %arg17, %c1_i32 : i32 loc(#loc453) + %kv_offset_164 = arith.cmpi sge, %kv_offset_163, %c3_i32 : i32 loc(#loc453) + %kv_offset_165 = arith.select %kv_offset_164, %c0_i32, %kv_offset_163 : i32 loc(#loc453) + %k_166 = ttg.async_wait %k_155, %v_157 {num = 2 : i32} loc(#loc417) + %k_167 = ttg.memdesc_index %k_96[%kv_offset_165] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_168 = ttg.memdesc_trans %k_167 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc366) + %qk = ttng.warp_group_dot %q_40, %k_168, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc365) + %qk_169:4 = ttng.warp_group_dot_wait %qk, %q_40, %k_168, %kv_offset_152 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc365) + %qk_170 = arith.mulf %qk_169#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc367) + %post_mod_scores = arith.mulf %qk_170, %cst_13 : tensor<128x64xf32, #mma> loc(#loc368) + %m_ij = "tt.reduce"(%post_mod_scores) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_223: f32 loc(callsite(#loc1 at #loc369)), %m_ij_224: f32 loc(callsite(#loc1 at #loc369))): + %m_ij_225 = arith.maxnumf %m_ij_223, %m_ij_224 : f32 loc(#loc449) + tt.reduce.return %m_ij_225 : f32 loc(#loc425) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc425) + %m_ij_171 = arith.maxnumf %kv_offset_154, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc370) + %masked_out_rows = arith.cmpf oeq, %m_ij_171, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc371) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_171 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc372) + %alpha = arith.subf %kv_offset_154, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc373) + %alpha_172 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc374) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc375) + %p_173 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc376) + %p_174 = arith.subf %post_mod_scores, %p_173 : tensor<128x64xf32, #mma> loc(#loc376) + %p_175 = math.exp2 %p_174 : tensor<128x64xf32, #mma> loc(#loc377) + %l_i_176 = arith.mulf %kv_offset_153, %alpha_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378) + %l_i_177 = "tt.reduce"(%p_175) <{axis = 1 : i32}> ({ + ^bb0(%l_i_223: f32 loc(callsite(#loc1 at #loc379)), %l_i_224: f32 loc(callsite(#loc1 at #loc379))): + %l_i_225 = arith.addf %l_i_223, %l_i_224 : f32 loc(#loc450) + tt.reduce.return %l_i_225 : f32 loc(#loc427) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc427) + %l_i_178 = arith.addf %l_i_176, %l_i_177 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc380) + %acc_179 = tt.expand_dims %alpha_172 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc381) + %acc_180 = ttg.convert_layout %acc_179 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc382) + %acc_181 = tt.broadcast %acc_180 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc382) + %acc_182 = arith.mulf %qk_169#3, %acc_181 : tensor<128x128xf32, #mma1> loc(#loc382) + %v_183 = ttg.memdesc_index %v_97[%kv_offset_165] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %acc_184 = arith.truncf %p_175 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc383) + %acc_185 = ttg.convert_layout %acc_184 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc383) + %acc_186 = ttng.warp_group_dot %acc_185, %v_183, %acc_182 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc384) + %kv_offset_187 = arith.addi %kv_offset_151, %c1_i32 : i32 loc(#loc453) + %cur_block_idx = arith.divsi %kv_offset_187, %c2_i32 : i32 loc(#loc429) + %cur_block = tt.addptr %kv_indices_89, %cur_block_idx : !tt.ptr, i32 loc(#loc430) + %cur_block_188 = tt.load %cur_block, %kv_offset_162 evictionPolicy = evict_last : !tt.ptr loc(#loc431) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc432) + %next_block_189 = arith.cmpi slt, %next_block, %kv_num_blocks_93 : i32 loc(#loc433) + %next_block_190 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc434) + %kv_offset_191 = arith.andi %kv_offset_162, %next_block_189 : i1 loc(#loc453) + %next_block_192 = tt.load %next_block_190, %kv_offset_191 evictionPolicy = evict_last : !tt.ptr loc(#loc435) + %needs_jump = arith.addi %kv_offset_151, %c2_i32 : i32 loc(#loc436) + %needs_jump_193 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc437) + %needs_jump_194 = arith.cmpi eq, %needs_jump_193, %c0_i32 : i32 loc(#loc438) + %jump_to_block = arith.subi %next_block_192, %cur_block_188 : i32 loc(#loc439) + %jump_to_block_195 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc440) + %jump_to_block_196 = arith.subi %jump_to_block_195, %c64_i32 : i32 loc(#loc441) + %offset = arith.extui %needs_jump_194 : i1 to i32 loc(#loc442) + %offset_197 = arith.muli %jump_to_block_196, %offset : i32 loc(#loc442) + %offset_198 = arith.subi %c1_i32, %offset : i32 loc(#loc443) + %offset_199 = arith.muli %offset_198, %c64_i32 : i32 loc(#loc444) + %offset_200 = arith.addi %offset_197, %offset_199 : i32 loc(#loc445) + %kv_offset_201 = arith.addi %arg15, %offset_200 : i32 loc(#loc386) + %kv_offset_202 = arith.addi %arg16, %c1_i32 : i32 loc(#loc453) + %kv_offset_203 = arith.cmpi sge, %kv_offset_202, %c3_i32 : i32 loc(#loc453) + %kv_offset_204 = arith.select %kv_offset_203, %c0_i32, %kv_offset_202 : i32 loc(#loc453) + %kv_base_offset_205 = arith.addi %kv_start_91, %kv_offset_201 : i32 loc(#loc364) + %offs_n_load_206 = tt.splat %kv_base_offset_205 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_207 = arith.addi %offs_n_load_206, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_208 = tt.expand_dims %offs_n_load_207 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_209 = arith.muli %ptr_208, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_210 = tt.addptr %ptr_47, %ptr_209 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_211 = tt.broadcast %ptr_210 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_212 = tt.addptr %ptr_211, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_213 = ttg.memdesc_index %k_96[%kv_offset_204] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_214 = tt.splat %kv_offset_160 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_215 = ttg.async_copy_global_to_local %ptr_212, %k_213 mask %kv_offset_214 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_216 = ttg.async_commit_group tokens %k_215 loc(#loc417) + %ptr_217 = tt.addptr %ptr_51, %ptr_209 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_218 = tt.broadcast %ptr_217 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_219 = tt.addptr %ptr_218, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_220 = ttg.memdesc_index %v_97[%kv_offset_204] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_221 = ttg.async_copy_global_to_local %ptr_219, %v_220 mask %kv_offset_214 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_222 = ttg.async_commit_group tokens %v_221 loc(#loc418) + scf.yield %acc_186, %l_i_178, %m_ij_171, %kv_offset_201, %kv_offset_204, %kv_offset_165, %k_156, %k_216, %v_158, %v_222 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc453) + } loc(#loc453) + %kv_offset_136 = ttng.warp_group_dot_wait %kv_offset_135#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc453) + %kv_offset_137 = ttg.async_wait {num = 0 : i32} loc(#loc453) + ttg.local_dealloc %v_97 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc453) + ttg.local_dealloc %k_96 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc453) + %l_i = arith.cmpf oeq, %kv_offset_135#1, %cst_8 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc283) + %l_i_138 = arith.select %l_i, %cst_9, %kv_offset_135#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284) + %acc = tt.expand_dims %l_i_138 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc285) + %acc_139 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc286) + %acc_140 = tt.broadcast %acc_139 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc286) + %acc_141 = arith.divf %kv_offset_136, %acc_140 : tensor<128x128xf32, #mma1> loc(#loc286) + %mask = arith.cmpi slt, %ptr, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc287) + %mask_142 = arith.cmpi slt, %ptr_36, %cst_6 : tensor<1x128xi32, #blocked> loc(#loc288) + %mask_143 = tt.broadcast %mask : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289) + %mask_144 = tt.broadcast %mask_142 : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289) + %mask_145 = arith.andi %mask_143, %mask_144 : tensor<128x128xi1, #blocked> loc(#loc289) + %1 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32, #blocked> loc(#loc137) + %2 = arith.addi %ptr_36, %1 : tensor<1x128xi32, #blocked> loc(#loc137) + %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc138) + %4 = tt.broadcast %ptr_32 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc138) + %5 = arith.addi %3, %4 : tensor<128x128xi32, #blocked> loc(#loc138) + %6 = tt.splat %q_offset : i32 -> tensor<128x128xi32, #blocked> loc(#loc139) + %7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139) + %8 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc140) + %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140) + %10 = arith.truncf %acc_141 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141) + %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141) + tt.store %9, %11, %mask_145 : tensor<128x128x!tt.ptr, #blocked> loc(#loc141) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290) + %off_hz_146 = arith.addi %off_hz, %off_hq : i32 loc(#loc291) + %l_ptrs = arith.muli %off_hz_146, %c2048_i32 : i32 loc(#loc292) + %l_ptrs_147 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc293) + %l_ptrs_148 = tt.splat %l_ptrs_147 : !tt.ptr -> tensor<128x!tt.ptr, #blocked1> loc(#loc294) + %l_ptrs_149 = tt.addptr %l_ptrs_148, %offs_m_30 : tensor<128x!tt.ptr, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294) + %lse = math.log2 %l_i_138 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295) + %lse_150 = arith.addf %kv_offset_135#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296) + %12 = ttg.convert_layout %lse_150 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc149) + tt.store %l_ptrs_149, %12 : tensor<128x!tt.ptr, #blocked1> loc(#loc149) + tt.return loc(#loc150) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":97:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":98:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":99:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":103:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":104:24) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:24) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:25) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:47) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":111:12) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":112:12) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":113:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:51) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:74) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:46) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:97) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:64) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:33) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:27) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":146:101) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:38) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:56) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:49) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":294:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":151:26) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:23) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:37) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:45) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:65) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:37) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":167:48) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":347:107) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":373:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:33) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":502:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:23) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":377:22) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":378:23) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":379:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":380:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":439:107) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":346:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":342:32) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":351:19) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":349:17) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":353:14) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":374:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":381:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":384:24) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":385:24) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":387:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":388:92) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":391:24) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":392:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":393:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":394:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":395:24) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":396:24) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":397:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":398:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":399:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":400:92) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":402:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":403:24) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":404:39) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":405:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":406:24) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":407:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":414:69) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":417:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:27) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":423:35) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":424:51) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:31) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:25) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:51) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:39) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:21) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:16) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:24) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:22) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:16) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:22) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:44) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":548:26) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":247:33) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":545:63) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:38) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:24) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:109) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:113) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:55) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:25) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:30) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:35) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:60) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:34) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:48) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:63) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:29) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:47) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:61) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:42) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":549:21) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":181:35) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:27) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:41) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:51) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:32) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:49) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:69) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:26) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:34) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:20) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:16) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:20) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:38) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:30) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:49) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:62) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:75) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:25) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:109) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:26) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:31) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:32) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:23) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:40) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:33) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:20) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":225:29) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":229:4) +#loc162 = loc("q_start"(#loc2)) +#loc163 = loc("off_zq"(#loc3)) +#loc164 = loc("off_hq"(#loc4)) +#loc165 = loc("off_zkv"(#loc5)) +#loc166 = loc("off_hkv"(#loc6)) +#loc167 = loc("q_offset"(#loc7)) +#loc168 = loc("q_offset"(#loc8)) +#loc169 = loc("q_offset"(#loc9)) +#loc170 = loc("k_offset"(#loc10)) +#loc171 = loc("k_offset"(#loc11)) +#loc172 = loc("k_offset"(#loc12)) +#loc173 = loc("Q"(#loc13)) +#loc174 = loc("K"(#loc14)) +#loc175 = loc("V"(#loc15)) +#loc176 = loc("sparse_kv_num_blks_offset"(#loc16)) +#loc177 = loc("sparse_kv_num_blks_offset"(#loc17)) +#loc178 = loc("sparse_kv_idx_offset"(#loc18)) +#loc179 = loc("sparse_kv_idx_offset"(#loc19)) +#loc180 = loc("sparse_kv_idx_offset"(#loc20)) +#loc181 = loc("offs_m"(#loc21)) +#loc182 = loc("offs_m"(#loc22)) +#loc183 = loc("offs_m"(#loc23)) +#loc184 = loc("ptr"(#loc24)) +#loc185 = loc("q"(#loc25)) +#loc186 = loc("ptr"(#loc26)) +#loc187 = loc("ptr"(#loc27)) +#loc188 = loc("ptr"(#loc28)) +#loc189 = loc("ptr"(#loc29)) +#loc190 = loc("kv_indices"(#loc31)) +#loc191 = loc("kv_start"(#loc32)) +#loc192 = loc("kv_start"(#loc33)) +#loc193 = loc("kv_num_blocks"(#loc34)) +#loc194 = loc("kv_num_blocks"(#loc35)) +#loc195 = loc("block_n_end"(#loc36)) +#loc196 = loc("block_n_end"(#loc37)) +#loc197 = loc("offs_n"(#loc38)) +#loc198 = loc("offs_n"(#loc39)) +#loc199 = loc("k"(#loc41)) +#loc201 = loc("tmp4"(#loc44)) +#loc202 = loc("tmp7"(#loc45)) +#loc203 = loc("acc"(#loc46)) +#loc204 = loc("tmp7"(#loc47)) +#loc205 = loc("tmp8"(#loc48)) +#loc206 = loc("tmp9"(#loc49)) +#loc207 = loc("tmp10"(#loc50)) +#loc208 = loc("tmp11"(#loc51)) +#loc209 = loc("v"(#loc52)) +#loc210 = loc("offs_n_load"(#loc53)) +#loc211 = loc("kv_base_offset"(#loc54)) +#loc212 = loc("qk"(#loc55)) +#loc213 = loc("k"(#loc56)) +#loc214 = loc("qk"(#loc57)) +#loc215 = loc("tmp5"(#loc58)) +#loc216 = loc("tmp12"(#loc59)) +#loc217 = loc("tmp15"(#loc60)) +#loc218 = loc("tmp16"(#loc61)) +#loc219 = loc("tmp18"(#loc62)) +#loc220 = loc("tmp19"(#loc63)) +#loc221 = loc("tmp22"(#loc64)) +#loc222 = loc("tmp23"(#loc65)) +#loc223 = loc("tmp24"(#loc66)) +#loc224 = loc("tmp25"(#loc67)) +#loc225 = loc("tmp26"(#loc68)) +#loc226 = loc("tmp27"(#loc69)) +#loc227 = loc("tmp28"(#loc70)) +#loc228 = loc("tmp29"(#loc71)) +#loc229 = loc("tmp30"(#loc72)) +#loc230 = loc("tmp31"(#loc73)) +#loc231 = loc("tmp33"(#loc74)) +#loc232 = loc("tmp34"(#loc75)) +#loc233 = loc("tmp35"(#loc76)) +#loc234 = loc("tmp36"(#loc77)) +#loc235 = loc("tmp37"(#loc78)) +#loc236 = loc("tmp38"(#loc79)) +#loc237 = loc("post_mod_scores"(#loc80)) +#loc238 = loc("post_mod_scores"(#loc81)) +#loc240 = loc("m_ij"(#loc85)) +#loc241 = loc("masked_out_rows"(#loc86)) +#loc242 = loc("m_ij_masked"(#loc87)) +#loc243 = loc("alpha"(#loc88)) +#loc244 = loc("alpha"(#loc89)) +#loc245 = loc("p"(#loc90)) +#loc246 = loc("p"(#loc91)) +#loc247 = loc("p"(#loc92)) +#loc248 = loc("l_i"(#loc93)) +#loc250 = loc("l_i"(#loc97)) +#loc251 = loc("acc"(#loc98)) +#loc252 = loc("acc"(#loc99)) +#loc253 = loc("acc"(#loc100)) +#loc254 = loc("acc"(#loc101)) +#loc255 = loc("offs_n"(#loc102)) +#loc256 = loc("cur_block_idx"(#loc103)) +#loc257 = loc("offset"(#loc104)) +#loc258 = loc("cur_block"(#loc105)) +#loc259 = loc("cur_block"(#loc106)) +#loc260 = loc("next_block"(#loc107)) +#loc261 = loc("next_block"(#loc108)) +#loc262 = loc("next_block"(#loc109)) +#loc263 = loc("next_block"(#loc110)) +#loc264 = loc("needs_jump"(#loc111)) +#loc265 = loc("needs_jump"(#loc112)) +#loc266 = loc("needs_jump"(#loc113)) +#loc267 = loc("jump_to_block"(#loc114)) +#loc268 = loc("jump_to_block"(#loc115)) +#loc269 = loc("jump_to_block"(#loc116)) +#loc270 = loc("offset"(#loc117)) +#loc271 = loc("offset"(#loc118)) +#loc272 = loc("offset"(#loc119)) +#loc273 = loc("offset"(#loc120)) +#loc274 = loc("kv_offset"(#loc121)) +#loc275 = loc("kv_indices"(#loc122)) +#loc276 = loc("kv_start"(#loc123)) +#loc277 = loc("kv_start"(#loc124)) +#loc278 = loc("kv_num_blocks"(#loc125)) +#loc279 = loc("kv_num_blocks"(#loc126)) +#loc280 = loc("block_n_end"(#loc127)) +#loc281 = loc("block_n_end"(#loc128)) +#loc283 = loc("l_i"(#loc130)) +#loc284 = loc("l_i"(#loc131)) +#loc285 = loc("acc"(#loc132)) +#loc286 = loc("acc"(#loc133)) +#loc287 = loc("mask"(#loc134)) +#loc288 = loc("mask"(#loc135)) +#loc289 = loc("mask"(#loc136)) +#loc290 = loc("off_hz"(#loc142)) +#loc291 = loc("off_hz"(#loc143)) +#loc292 = loc("l_ptrs"(#loc144)) +#loc293 = loc("l_ptrs"(#loc145)) +#loc294 = loc("l_ptrs"(#loc146)) +#loc295 = loc("lse"(#loc147)) +#loc296 = loc("lse"(#loc148)) +#loc297 = loc(callsite(#loc184 at #loc185)) +#loc298 = loc(callsite(#loc186 at #loc185)) +#loc299 = loc(callsite(#loc187 at #loc185)) +#loc300 = loc(callsite(#loc188 at #loc185)) +#loc301 = loc(callsite(#loc189 at #loc185)) +#loc302 = loc(callsite(#loc30 at #loc185)) +#loc303 = loc(callsite(#loc199 at #loc200)) +#loc304 = loc(callsite(#loc201 at #loc200)) +#loc305 = loc(callsite(#loc202 at #loc200)) +#loc306 = loc("l_i"(#loc203)) +#loc307 = loc(callsite(#loc204 at #loc200)) +#loc308 = loc(callsite(#loc205 at #loc200)) +#loc309 = loc(callsite(#loc206 at #loc200)) +#loc310 = loc(callsite(#loc207 at #loc200)) +#loc311 = loc(callsite(#loc208 at #loc200)) +#loc312 = loc(callsite(#loc209 at #loc200)) +#loc313 = loc(callsite(#loc210 at #loc200)) +#loc314 = loc(callsite(#loc211 at #loc200)) +#loc315 = loc(callsite(#loc212 at #loc200)) +#loc316 = loc(callsite(#loc213 at #loc200)) +#loc317 = loc(callsite(#loc214 at #loc200)) +#loc318 = loc(callsite(#loc215 at #loc200)) +#loc319 = loc(callsite(#loc216 at #loc200)) +#loc320 = loc(callsite(#loc217 at #loc200)) +#loc321 = loc(callsite(#loc218 at #loc200)) +#loc322 = loc(callsite(#loc219 at #loc200)) +#loc323 = loc(callsite(#loc220 at #loc200)) +#loc324 = loc(callsite(#loc221 at #loc200)) +#loc325 = loc(callsite(#loc222 at #loc200)) +#loc326 = loc(callsite(#loc223 at #loc200)) +#loc327 = loc(callsite(#loc224 at #loc200)) +#loc328 = loc(callsite(#loc225 at #loc200)) +#loc329 = loc(callsite(#loc226 at #loc200)) +#loc330 = loc(callsite(#loc227 at #loc200)) +#loc331 = loc(callsite(#loc228 at #loc200)) +#loc332 = loc(callsite(#loc229 at #loc200)) +#loc333 = loc(callsite(#loc230 at #loc200)) +#loc334 = loc(callsite(#loc231 at #loc200)) +#loc335 = loc(callsite(#loc232 at #loc200)) +#loc336 = loc(callsite(#loc233 at #loc200)) +#loc337 = loc(callsite(#loc234 at #loc200)) +#loc338 = loc(callsite(#loc235 at #loc200)) +#loc339 = loc(callsite(#loc236 at #loc200)) +#loc340 = loc(callsite(#loc237 at #loc200)) +#loc341 = loc(callsite(#loc238 at #loc200)) +#loc343 = loc(callsite(#loc240 at #loc200)) +#loc344 = loc(callsite(#loc241 at #loc200)) +#loc345 = loc(callsite(#loc242 at #loc200)) +#loc346 = loc(callsite(#loc243 at #loc200)) +#loc347 = loc(callsite(#loc244 at #loc200)) +#loc348 = loc(callsite(#loc245 at #loc200)) +#loc349 = loc(callsite(#loc246 at #loc200)) +#loc350 = loc(callsite(#loc247 at #loc200)) +#loc351 = loc(callsite(#loc248 at #loc200)) +#loc353 = loc(callsite(#loc250 at #loc200)) +#loc354 = loc(callsite(#loc251 at #loc200)) +#loc355 = loc(callsite(#loc252 at #loc200)) +#loc356 = loc(callsite(#loc253 at #loc200)) +#loc357 = loc(callsite(#loc254 at #loc200)) +#loc358 = loc(callsite(#loc255 at #loc43)) +#loc359 = loc(callsite(#loc257 at #loc43)) +#loc360 = loc(callsite(#loc274 at #loc43)) +#loc361 = loc(callsite(#loc199 at #loc282)) +#loc362 = loc(callsite(#loc209 at #loc282)) +#loc363 = loc(callsite(#loc210 at #loc282)) +#loc364 = loc(callsite(#loc211 at #loc282)) +#loc365 = loc(callsite(#loc212 at #loc282)) +#loc366 = loc(callsite(#loc213 at #loc282)) +#loc367 = loc(callsite(#loc214 at #loc282)) +#loc368 = loc(callsite(#loc238 at #loc282)) +#loc370 = loc(callsite(#loc240 at #loc282)) +#loc371 = loc(callsite(#loc241 at #loc282)) +#loc372 = loc(callsite(#loc242 at #loc282)) +#loc373 = loc(callsite(#loc243 at #loc282)) +#loc374 = loc(callsite(#loc244 at #loc282)) +#loc375 = loc(callsite(#loc245 at #loc282)) +#loc376 = loc(callsite(#loc246 at #loc282)) +#loc377 = loc(callsite(#loc247 at #loc282)) +#loc378 = loc(callsite(#loc248 at #loc282)) +#loc380 = loc(callsite(#loc250 at #loc282)) +#loc381 = loc(callsite(#loc251 at #loc282)) +#loc382 = loc(callsite(#loc252 at #loc282)) +#loc383 = loc(callsite(#loc253 at #loc282)) +#loc384 = loc(callsite(#loc254 at #loc282)) +#loc385 = loc(callsite(#loc257 at #loc129)) +#loc386 = loc(callsite(#loc274 at #loc129)) +#loc387 = loc(callsite(#loc187 at #loc303)) +#loc388 = loc(callsite(#loc189 at #loc303)) +#loc389 = loc("m_i"(#loc306)) +#loc390 = loc(callsite(#loc187 at #loc312)) +#loc391 = loc(callsite(#loc30 at #loc303)) +#loc392 = loc(callsite(#loc30 at #loc312)) +#loc393 = loc(callsite(#loc184 at #loc303)) +#loc394 = loc(callsite(#loc186 at #loc303)) +#loc395 = loc(callsite(#loc189 at #loc312)) +#loc396 = loc(callsite(#loc82 at #loc342)) +#loc398 = loc(callsite(#loc94 at #loc352)) +#loc400 = loc(callsite(#loc256 at #loc359)) +#loc401 = loc(callsite(#loc258 at #loc359)) +#loc402 = loc(callsite(#loc259 at #loc359)) +#loc403 = loc(callsite(#loc260 at #loc359)) +#loc404 = loc(callsite(#loc261 at #loc359)) +#loc405 = loc(callsite(#loc262 at #loc359)) +#loc406 = loc(callsite(#loc263 at #loc359)) +#loc407 = loc(callsite(#loc264 at #loc359)) +#loc408 = loc(callsite(#loc265 at #loc359)) +#loc409 = loc(callsite(#loc266 at #loc359)) +#loc410 = loc(callsite(#loc267 at #loc359)) +#loc411 = loc(callsite(#loc268 at #loc359)) +#loc412 = loc(callsite(#loc269 at #loc359)) +#loc413 = loc(callsite(#loc270 at #loc359)) +#loc414 = loc(callsite(#loc271 at #loc359)) +#loc415 = loc(callsite(#loc272 at #loc359)) +#loc416 = loc(callsite(#loc273 at #loc359)) +#loc417 = loc(callsite(#loc30 at #loc361)) +#loc418 = loc(callsite(#loc30 at #loc362)) +#loc419 = loc(callsite(#loc184 at #loc361)) +#loc420 = loc(callsite(#loc186 at #loc361)) +#loc421 = loc(callsite(#loc187 at #loc361)) +#loc422 = loc(callsite(#loc189 at #loc361)) +#loc423 = loc(callsite(#loc187 at #loc362)) +#loc424 = loc(callsite(#loc189 at #loc362)) +#loc425 = loc(callsite(#loc82 at #loc369)) +#loc427 = loc(callsite(#loc94 at #loc379)) +#loc429 = loc(callsite(#loc256 at #loc385)) +#loc430 = loc(callsite(#loc258 at #loc385)) +#loc431 = loc(callsite(#loc259 at #loc385)) +#loc432 = loc(callsite(#loc260 at #loc385)) +#loc433 = loc(callsite(#loc261 at #loc385)) +#loc434 = loc(callsite(#loc262 at #loc385)) +#loc435 = loc(callsite(#loc263 at #loc385)) +#loc436 = loc(callsite(#loc264 at #loc385)) +#loc437 = loc(callsite(#loc265 at #loc385)) +#loc438 = loc(callsite(#loc266 at #loc385)) +#loc439 = loc(callsite(#loc267 at #loc385)) +#loc440 = loc(callsite(#loc268 at #loc385)) +#loc441 = loc(callsite(#loc269 at #loc385)) +#loc442 = loc(callsite(#loc270 at #loc385)) +#loc443 = loc(callsite(#loc271 at #loc385)) +#loc444 = loc(callsite(#loc272 at #loc385)) +#loc445 = loc(callsite(#loc273 at #loc385)) +#loc446 = loc("offs_n"(#loc389)) +#loc447 = loc(callsite(#loc84 at #loc396)) +#loc448 = loc(callsite(#loc96 at #loc398)) +#loc449 = loc(callsite(#loc84 at #loc425)) +#loc450 = loc(callsite(#loc96 at #loc427)) +#loc451 = loc("kv_offset"(#loc446)) +#loc452 = loc(callsite(#loc451 at #loc43)) +#loc453 = loc(callsite(#loc451 at #loc129)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d45a577538c6a5c22c95943e6081513589525294 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/TXM3CAP4IOWMRQI6GXOPSZR65FCEVPONI655XLOK7C6S24MLB6IQ/triton_tem_fused_0.ttir @@ -0,0 +1,762 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":172:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":518:16) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:51) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:34) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":198:45) +#loc157 = loc("arg_Q"(#loc)) +#loc158 = loc("arg_K"(#loc)) +#loc159 = loc("arg_V"(#loc)) +#loc160 = loc("arg_LSE"(#loc)) +#loc161 = loc("arg_MAX"(#loc)) +#loc162 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc163 = loc("arg_KV_IDX"(#loc)) +#loc164 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc165 = loc("arg_FULL_KV_IDX"(#loc)) +#loc166 = loc("in_ptr9"(#loc)) +#loc167 = loc("out_ptr0"(#loc)) +#loc213 = loc(callsite(#loc50 at #loc2)) +#loc250 = loc("m_ij"(#loc88)) +#loc260 = loc("l_i"(#loc100)) +#loc296 = loc(callsite(#loc50 at #loc138)) +#loc354 = loc(callsite(#loc250 at #loc213)) +#loc364 = loc(callsite(#loc260 at #loc213)) +#loc381 = loc(callsite(#loc250 at #loc296)) +#loc391 = loc(callsite(#loc260 at #loc296)) +#loc408 = loc(callsite(#loc1 at #loc354)) +#loc410 = loc(callsite(#loc1 at #loc364)) +#loc437 = loc(callsite(#loc1 at #loc381)) +#loc439 = loc(callsite(#loc1 at #loc391)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc168) + %cst_1 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc168) + %cst_2 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc168) + %cst_5 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc168) + %cst_6 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc168) + %cst_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc309) + %cst_9 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc170) + %mask_10 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc171) + %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc172) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %q_start = tt.get_program_id x : i32 loc(#loc174) + %off_zq = tt.get_program_id y : i32 loc(#loc175) + %off_hq = tt.get_program_id z : i32 loc(#loc176) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc177) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc178) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc179) + %q_offset_12 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc180) + %q_offset_13 = arith.addi %q_offset, %q_offset_12 : i32 loc(#loc181) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc182) + %k_offset_14 = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc183) + %k_offset_15 = arith.addi %k_offset, %k_offset_14 : i32 loc(#loc184) + %Q = tt.addptr %arg_Q, %q_offset_13 : !tt.ptr, i32 loc(#loc185) + %K = tt.addptr %arg_K, %k_offset_15 : !tt.ptr, i32 loc(#loc186) + %V = tt.addptr %arg_V, %k_offset_15 : !tt.ptr, i32 loc(#loc187) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc188) + %sparse_kv_num_blks_offset_16 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc189) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc190) + %sparse_kv_idx_offset_17 = arith.muli %q_start, %c16_i32 : i32 loc(#loc191) + %sparse_kv_idx_offset_18 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_17 : i32 loc(#loc192) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc193) + %offs_m_19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc194) + %offs_m_20 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc195) + %offs_m_21 = arith.addi %offs_m_20, %offs_m_19 : tensor<128xi32> loc(#loc195) + %ptr = tt.expand_dims %offs_m_21 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc310) + %ptr_22 = arith.muli %ptr, %cst_9 : tensor<128x1xi32> loc(#loc311) + %ptr_23 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc312) + %ptr_24 = tt.addptr %ptr_23, %ptr_22 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc312) + %ptr_25 = tt.expand_dims %offs_m_19 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc313) + %ptr_26 = tt.broadcast %ptr_24 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc314) + %ptr_27 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc314) + %ptr_28 = tt.addptr %ptr_26, %ptr_27 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc314) + %q = tt.load %ptr_28 : tensor<128x128x!tt.ptr> loc(#loc315) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_18 : !tt.ptr, i32 loc(#loc202) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc203) + %kv_start_29 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc204) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_16 : !tt.ptr, i32 loc(#loc205) + %kv_num_blocks_30 = tt.load %kv_num_blocks : !tt.ptr loc(#loc206) + %block_n_end = arith.muli %kv_num_blocks_30, %c2_i32 : i32 loc(#loc207) + %block_n_end_31 = arith.minsi %block_n_end, %c32_i32 : i32 loc(#loc208) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc209) + %offs_n_32 = tt.splat %kv_start_29 : i32 -> tensor<64xi32> loc(#loc210) + %offs_n_33 = arith.addi %offs_n_32, %offs_n : tensor<64xi32> loc(#loc210) + %0 = tt.expand_dims %offs_n_33 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc47) + %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_31 step %c1_i32 iter_args(%acc_59 = %acc, %l_i_60 = %cst_11, %m_i = %cst_2, %offs_n_61 = %0, %kv_offset_62 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_29, %kv_offset_62 : i32 loc(#loc317) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc402) + %ptr_65 = arith.muli %ptr_64, %cst : tensor<64x1xi32> loc(#loc403) + %ptr_66 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc404) + %ptr_67 = tt.addptr %ptr_66, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc404) + %ptr_68 = tt.broadcast %ptr_67 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc405) + %ptr_69 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc405) + %ptr_70 = tt.addptr %ptr_68, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc405) + %k = tt.load %ptr_70 : tensor<64x128x!tt.ptr> loc(#loc406) + %k_71 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320) + %qk = tt.dot %q, %k_71, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321) + %qk_72 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc322) + %tmp4 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc323) + %tmp4_73 = tt.broadcast %offs_n_61 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc323) + %tmp4_74 = arith.cmpi sge, %tmp4, %tmp4_73 : tensor<128x64xi32> loc(#loc323) + %tmp5 = arith.extsi %offs_n_61 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc324) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc325) + %tmp7_75 = tt.load %tmp7 : !tt.ptr loc(#loc326) + %tmp8 = tt.splat %tmp7_75 : i64 -> tensor<1x64xi64> loc(#loc327) + %tmp8_76 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc327) + %tmp9 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc328) + %tmp10 = tt.splat %tmp7_75 : i64 -> tensor<128x1xi64> loc(#loc329) + %tmp10_77 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc329) + %tmp11 = tt.broadcast %tmp8_76 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc330) + %tmp11_78 = tt.broadcast %tmp10_77 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc330) + %tmp11_79 = arith.andi %tmp11, %tmp11_78 : tensor<128x64xi1> loc(#loc330) + %tmp12 = arith.andi %tmp4_74, %tmp11_79 : tensor<128x64xi1> loc(#loc331) + %tmp15 = arith.cmpi sge, %offs_n_61, %cst_1 : tensor<1x64xi32> loc(#loc332) + %tmp16 = arith.remsi %offs_n_61, %cst_1 : tensor<1x64xi32> loc(#loc333) + %tmp18 = arith.cmpi ne, %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc334) + %tmp19 = arith.cmpi slt, %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc335) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc336) + %tmp23 = arith.addi %tmp16, %cst_1 : tensor<1x64xi32> loc(#loc337) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc338) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc340) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc341) + %tmp28 = arith.subi %tmp4_73, %tmp4 : tensor<128x64xi32> loc(#loc342) + %tmp29 = arith.remsi %tmp28, %cst_0 : tensor<128x64xi32> loc(#loc343) + %tmp30 = arith.cmpi ne, %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc344) + %tmp31 = arith.cmpi slt, %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc345) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc346) + %tmp34 = arith.addi %tmp29, %cst_0 : tensor<128x64xi32> loc(#loc347) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc348) + %tmp36 = arith.cmpi eq, %tmp35, %cst_5 : tensor<128x64xi32> loc(#loc349) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc350) + %tmp37_80 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc350) + %tmp38 = arith.ori %tmp12, %tmp37_80 : tensor<128x64xi1> loc(#loc351) + %post_mod_scores = arith.select %tmp38, %qk_72, %cst_4 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352) + %post_mod_scores_81 = arith.mulf %post_mod_scores, %cst_3 : tensor<128x64xf32> loc(#loc353) + %m_ij = "tt.reduce"(%post_mod_scores_81) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_114: f32 loc(callsite(#loc1 at #loc354)), %m_ij_115: f32 loc(callsite(#loc1 at #loc354))): + %m_ij_116 = arith.maxnumf %m_ij_114, %m_ij_115 : f32 loc(#loc461) + tt.reduce.return %m_ij_116 : f32 loc(#loc407) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc407) + %m_ij_82 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355) + %masked_out_rows = arith.cmpf oeq, %m_ij_82, %cst_2 : tensor<128xf32> loc(#loc356) + %m_ij_masked = arith.select %masked_out_rows, %cst_11, %m_ij_82 : tensor<128xi1>, tensor<128xf32> loc(#loc357) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358) + %alpha_83 = math.exp2 %alpha : tensor<128xf32> loc(#loc359) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360) + %p_84 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361) + %p_85 = arith.subf %post_mod_scores_81, %p_84 : tensor<128x64xf32> loc(#loc361) + %p_86 = math.exp2 %p_85 : tensor<128x64xf32> loc(#loc362) + %l_i_87 = arith.mulf %l_i_60, %alpha_83 : tensor<128xf32> loc(#loc363) + %l_i_88 = "tt.reduce"(%p_86) <{axis = 1 : i32}> ({ + ^bb0(%l_i_114: f32 loc(callsite(#loc1 at #loc364)), %l_i_115: f32 loc(callsite(#loc1 at #loc364))): + %l_i_116 = arith.addf %l_i_114, %l_i_115 : f32 loc(#loc462) + tt.reduce.return %l_i_116 : f32 loc(#loc409) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc409) + %l_i_89 = arith.addf %l_i_87, %l_i_88 : tensor<128xf32> loc(#loc365) + %acc_90 = tt.expand_dims %alpha_83 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366) + %acc_91 = tt.broadcast %acc_90 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367) + %acc_92 = arith.mulf %acc_59, %acc_91 : tensor<128x128xf32> loc(#loc367) + %ptr_93 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc411) + %ptr_94 = tt.addptr %ptr_93, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc411) + %ptr_95 = tt.broadcast %ptr_94 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc412) + %ptr_96 = tt.addptr %ptr_95, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc412) + %v = tt.load %ptr_96 : tensor<64x128x!tt.ptr> loc(#loc413) + %acc_97 = arith.truncf %p_86 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369) + %acc_98 = tt.dot %acc_97, %v, %acc_92, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc414) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc415) + %cur_block_99 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc416) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc417) + %next_block_100 = arith.cmpi slt, %next_block, %kv_num_blocks_30 : i32 loc(#loc418) + %next_block_101 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc419) + %next_block_102 = tt.load %next_block_101, %next_block_100 evictionPolicy = evict_last : !tt.ptr loc(#loc420) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc421) + %needs_jump_103 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc422) + %needs_jump_104 = arith.cmpi eq, %needs_jump_103, %c0_i32 : i32 loc(#loc423) + %jump_to_block = arith.subi %next_block_102, %cur_block_99 : i32 loc(#loc424) + %jump_to_block_105 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc425) + %jump_to_block_106 = arith.subi %jump_to_block_105, %c64_i32 : i32 loc(#loc426) + %offset = arith.extui %needs_jump_104 : i1 to i32 loc(#loc427) + %offset_107 = arith.muli %jump_to_block_106, %offset : i32 loc(#loc427) + %offset_108 = arith.subi %c1_i32, %offset : i32 loc(#loc428) + %offset_109 = arith.muli %offset_108, %c64_i32 : i32 loc(#loc429) + %offset_110 = arith.addi %offset_107, %offset_109 : i32 loc(#loc430) + %offs_n_111 = tt.splat %offset_110 : i32 -> tensor<1x64xi32> loc(#loc372) + %offs_n_112 = arith.addi %offs_n_61, %offs_n_111 : tensor<1x64xi32> loc(#loc372) + %kv_offset_113 = arith.addi %kv_offset_62, %offset_110 : i32 loc(#loc373) + scf.yield %acc_98, %l_i_89, %m_ij_82, %offs_n_112, %kv_offset_113 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc287) + } loc(#loc466) + %kv_indices_34 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_18 : !tt.ptr, i32 loc(#loc288) + %kv_start_35 = tt.load %kv_indices_34 : !tt.ptr loc(#loc289) + %kv_start_36 = arith.muli %kv_start_35, %c128_i32 : i32 loc(#loc290) + %kv_num_blocks_37 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_16 : !tt.ptr, i32 loc(#loc291) + %kv_num_blocks_38 = tt.load %kv_num_blocks_37 : !tt.ptr loc(#loc292) + %block_n_end_39 = arith.muli %kv_num_blocks_38, %c2_i32 : i32 loc(#loc293) + %block_n_end_40 = arith.minsi %block_n_end_39, %c32_i32 : i32 loc(#loc294) + %offs_n_41 = tt.splat %kv_start_36 : i32 -> tensor<64xi32> loc(#loc295) + %offs_n_42 = arith.addi %offs_n_41, %offs_n : tensor<64xi32> loc(#loc295) + %1 = tt.expand_dims %offs_n_42 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc137) + %kv_offset_43:5 = scf.for %start_n = %c0_i32 to %block_n_end_40 step %c1_i32 iter_args(%acc_59 = %kv_offset#0, %l_i_60 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_61 = %1, %kv_offset_62 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_36, %kv_offset_62 : i32 loc(#loc374) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc431) + %ptr_65 = arith.muli %ptr_64, %cst : tensor<64x1xi32> loc(#loc432) + %ptr_66 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc433) + %ptr_67 = tt.addptr %ptr_66, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc433) + %ptr_68 = tt.broadcast %ptr_67 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc434) + %ptr_69 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc434) + %ptr_70 = tt.addptr %ptr_68, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc434) + %k = tt.load %ptr_70 : tensor<64x128x!tt.ptr> loc(#loc435) + %k_71 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377) + %qk = tt.dot %q, %k_71, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378) + %qk_72 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc379) + %post_mod_scores = arith.mulf %qk_72, %cst_3 : tensor<128x64xf32> loc(#loc380) + %m_ij = "tt.reduce"(%post_mod_scores) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_105: f32 loc(callsite(#loc1 at #loc381)), %m_ij_106: f32 loc(callsite(#loc1 at #loc381))): + %m_ij_107 = arith.maxnumf %m_ij_105, %m_ij_106 : f32 loc(#loc463) + tt.reduce.return %m_ij_107 : f32 loc(#loc436) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc436) + %m_ij_73 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc382) + %masked_out_rows = arith.cmpf oeq, %m_ij_73, %cst_2 : tensor<128xf32> loc(#loc383) + %m_ij_masked = arith.select %masked_out_rows, %cst_11, %m_ij_73 : tensor<128xi1>, tensor<128xf32> loc(#loc384) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc385) + %alpha_74 = math.exp2 %alpha : tensor<128xf32> loc(#loc386) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc387) + %p_75 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc388) + %p_76 = arith.subf %post_mod_scores, %p_75 : tensor<128x64xf32> loc(#loc388) + %p_77 = math.exp2 %p_76 : tensor<128x64xf32> loc(#loc389) + %l_i_78 = arith.mulf %l_i_60, %alpha_74 : tensor<128xf32> loc(#loc390) + %l_i_79 = "tt.reduce"(%p_77) <{axis = 1 : i32}> ({ + ^bb0(%l_i_105: f32 loc(callsite(#loc1 at #loc391)), %l_i_106: f32 loc(callsite(#loc1 at #loc391))): + %l_i_107 = arith.addf %l_i_105, %l_i_106 : f32 loc(#loc464) + tt.reduce.return %l_i_107 : f32 loc(#loc438) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc438) + %l_i_80 = arith.addf %l_i_78, %l_i_79 : tensor<128xf32> loc(#loc392) + %acc_81 = tt.expand_dims %alpha_74 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc393) + %acc_82 = tt.broadcast %acc_81 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc394) + %acc_83 = arith.mulf %acc_59, %acc_82 : tensor<128x128xf32> loc(#loc394) + %ptr_84 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc440) + %ptr_85 = tt.addptr %ptr_84, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc440) + %ptr_86 = tt.broadcast %ptr_85 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc441) + %ptr_87 = tt.addptr %ptr_86, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc441) + %v = tt.load %ptr_87 : tensor<64x128x!tt.ptr> loc(#loc442) + %acc_88 = arith.truncf %p_77 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc396) + %acc_89 = tt.dot %acc_88, %v, %acc_83, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc397) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc443) + %cur_block = tt.addptr %kv_indices_34, %cur_block_idx : !tt.ptr, i32 loc(#loc444) + %cur_block_90 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc445) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc446) + %next_block_91 = arith.cmpi slt, %next_block, %kv_num_blocks_38 : i32 loc(#loc447) + %next_block_92 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc448) + %next_block_93 = tt.load %next_block_92, %next_block_91 evictionPolicy = evict_last : !tt.ptr loc(#loc449) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc450) + %needs_jump_94 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc451) + %needs_jump_95 = arith.cmpi eq, %needs_jump_94, %c0_i32 : i32 loc(#loc452) + %jump_to_block = arith.subi %next_block_93, %cur_block_90 : i32 loc(#loc453) + %jump_to_block_96 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc454) + %jump_to_block_97 = arith.subi %jump_to_block_96, %c64_i32 : i32 loc(#loc455) + %offset = arith.extui %needs_jump_95 : i1 to i32 loc(#loc456) + %offset_98 = arith.muli %jump_to_block_97, %offset : i32 loc(#loc456) + %offset_99 = arith.subi %c1_i32, %offset : i32 loc(#loc457) + %offset_100 = arith.muli %offset_99, %c64_i32 : i32 loc(#loc458) + %offset_101 = arith.addi %offset_98, %offset_100 : i32 loc(#loc459) + %offs_n_102 = tt.splat %offset_101 : i32 -> tensor<1x64xi32> loc(#loc399) + %offs_n_103 = arith.addi %offs_n_61, %offs_n_102 : tensor<1x64xi32> loc(#loc399) + %kv_offset_104 = arith.addi %kv_offset_62, %offset_101 : i32 loc(#loc400) + scf.yield %acc_89, %l_i_80, %m_ij_73, %offs_n_103, %kv_offset_104 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc297) + } loc(#loc467) + %l_i_44 = arith.cmpf oeq, %kv_offset_43#1, %cst_11 : tensor<128xf32> loc(#loc298) + %l_i_45 = arith.select %l_i_44, %l_i, %kv_offset_43#1 : tensor<128xi1>, tensor<128xf32> loc(#loc172) + %acc_46 = tt.expand_dims %l_i_45 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc299) + %acc_47 = tt.broadcast %acc_46 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc300) + %acc_48 = arith.divf %kv_offset_43#0, %acc_47 : tensor<128x128xf32> loc(#loc300) + %mask_49 = arith.cmpi slt, %ptr, %mask_10 : tensor<128x1xi32> loc(#loc171) + %mask_50 = arith.cmpi slt, %ptr_25, %mask : tensor<1x128xi32> loc(#loc170) + %mask_51 = tt.broadcast %mask_49 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc301) + %mask_52 = tt.broadcast %mask_50 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc301) + %mask_53 = arith.andi %mask_51, %mask_52 : tensor<128x128xi1> loc(#loc301) + %2 = tt.splat %q_offset_12 : i32 -> tensor<1x128xi32> loc(#loc143) + %3 = arith.addi %ptr_25, %2 : tensor<1x128xi32> loc(#loc143) + %4 = tt.broadcast %3 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc144) + %5 = tt.broadcast %ptr_22 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc144) + %6 = arith.addi %4, %5 : tensor<128x128xi32> loc(#loc144) + %7 = tt.splat %q_offset : i32 -> tensor<128x128xi32> loc(#loc145) + %8 = arith.addi %6, %7 : tensor<128x128xi32> loc(#loc145) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc146) + %10 = tt.addptr %9, %8 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc146) + %11 = arith.truncf %acc_48 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc147) + tt.store %10, %11, %mask_53 : tensor<128x128x!tt.ptr> loc(#loc147) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc302) + %off_hz_54 = arith.addi %off_hz, %off_hq : i32 loc(#loc303) + %l_ptrs = arith.muli %off_hz_54, %c2048_i32 : i32 loc(#loc304) + %l_ptrs_55 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc305) + %l_ptrs_56 = tt.splat %l_ptrs_55 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc306) + %l_ptrs_57 = tt.addptr %l_ptrs_56, %offs_m_21 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc306) + %lse = math.log2 %l_i_45 : tensor<128xf32> loc(#loc307) + %lse_58 = arith.addf %kv_offset_43#2, %lse : tensor<128xf32> loc(#loc308) + tt.store %l_ptrs_57, %lse_58 : tensor<128x!tt.ptr> loc(#loc155) + tt.return loc(#loc156) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":136:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:20) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":130:22) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":97:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":98:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":99:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":103:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":104:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:24) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":107:36) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:25) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":108:37) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":111:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":112:12) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":113:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:51) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":142:74) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:97) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":143:64) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:46) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":144:33) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:27) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":146:101) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:38) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:20) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:56) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":284:49) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":294:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":151:26) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:23) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":152:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":153:28) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:45) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":154:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":159:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":167:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":502:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":342:32) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":346:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":347:107) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":349:17) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":351:19) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":353:14) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":373:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":374:23) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:33) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":376:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":377:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":378:23) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":379:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":380:23) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":381:23) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":384:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":385:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":387:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":388:92) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":391:24) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":392:24) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":393:39) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":394:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":395:24) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":396:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":397:23) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":398:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":399:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":400:92) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":402:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":403:24) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":404:39) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":405:25) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":406:24) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":407:24) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":414:69) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":417:27) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":421:27) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":423:35) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":424:51) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:31) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":428:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:51) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:39) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":429:21) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:16) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":434:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:22) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":436:16) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":439:107) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:22) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":440:44) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":247:33) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":545:63) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:38) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":248:24) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:109) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:113) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:55) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":249:25) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:30) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:35) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":250:60) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:34) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:48) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":251:63) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:29) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:47) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:61) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":252:42) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":548:26) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":549:21) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":549:8) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":181:35) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:27) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":182:41) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:51) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":183:32) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:49) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":184:69) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":186:28) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":193:52) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":206:26) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:20) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":208:16) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":214:30) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:49) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:62) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:75) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:25) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":218:109) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:26) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":221:31) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:32) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:23) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":222:40) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:33) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":223:20) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":225:29) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2u2laawubvef7t5rtvzax6zebordlqljuy3dh5yawyzullirpa.py":229:4) +#loc168 = loc(callsite(#loc1 at #loc2)) +#loc169 = loc("acc"(#loc4)) +#loc170 = loc("mask"(#loc5)) +#loc171 = loc("mask"(#loc6)) +#loc172 = loc("l_i"(#loc7)) +#loc173 = loc("stride_kv_idx_h"(#loc8)) +#loc174 = loc("q_start"(#loc9)) +#loc175 = loc("off_zq"(#loc10)) +#loc176 = loc("off_hq"(#loc11)) +#loc177 = loc("off_zkv"(#loc12)) +#loc178 = loc("off_hkv"(#loc13)) +#loc179 = loc("q_offset"(#loc14)) +#loc180 = loc("q_offset"(#loc15)) +#loc181 = loc("q_offset"(#loc16)) +#loc182 = loc("k_offset"(#loc17)) +#loc183 = loc("k_offset"(#loc18)) +#loc184 = loc("k_offset"(#loc19)) +#loc185 = loc("Q"(#loc20)) +#loc186 = loc("K"(#loc21)) +#loc187 = loc("V"(#loc22)) +#loc188 = loc("sparse_kv_num_blks_offset"(#loc23)) +#loc189 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc190 = loc("sparse_kv_idx_offset"(#loc25)) +#loc191 = loc("sparse_kv_idx_offset"(#loc26)) +#loc192 = loc("sparse_kv_idx_offset"(#loc27)) +#loc193 = loc("offs_m"(#loc28)) +#loc194 = loc("offs_m"(#loc29)) +#loc195 = loc("offs_m"(#loc30)) +#loc196 = loc("ptr"(#loc31)) +#loc197 = loc("q"(#loc32)) +#loc198 = loc("ptr"(#loc33)) +#loc199 = loc("ptr"(#loc34)) +#loc200 = loc("ptr"(#loc35)) +#loc201 = loc("ptr"(#loc36)) +#loc202 = loc("kv_indices"(#loc38)) +#loc203 = loc("kv_start"(#loc39)) +#loc204 = loc("kv_start"(#loc40)) +#loc205 = loc("kv_num_blocks"(#loc41)) +#loc206 = loc("kv_num_blocks"(#loc42)) +#loc207 = loc("block_n_end"(#loc43)) +#loc208 = loc("block_n_end"(#loc44)) +#loc209 = loc("offs_n"(#loc45)) +#loc210 = loc("offs_n"(#loc46)) +#loc211 = loc("acc"(#loc48)) +#loc212 = loc("kv_base_offset"(#loc49)) +#loc214 = loc("offs_n_load"(#loc51)) +#loc215 = loc("k"(#loc52)) +#loc216 = loc("k"(#loc53)) +#loc217 = loc("qk"(#loc54)) +#loc218 = loc("qk"(#loc55)) +#loc219 = loc("tmp4"(#loc56)) +#loc220 = loc("tmp5"(#loc57)) +#loc221 = loc("tmp7"(#loc58)) +#loc222 = loc("tmp7"(#loc59)) +#loc223 = loc("tmp8"(#loc60)) +#loc224 = loc("tmp9"(#loc61)) +#loc225 = loc("tmp10"(#loc62)) +#loc226 = loc("tmp11"(#loc63)) +#loc227 = loc("tmp12"(#loc64)) +#loc228 = loc("tmp15"(#loc65)) +#loc229 = loc("tmp16"(#loc66)) +#loc230 = loc("tmp18"(#loc67)) +#loc231 = loc("tmp19"(#loc68)) +#loc232 = loc("tmp22"(#loc69)) +#loc233 = loc("tmp23"(#loc70)) +#loc234 = loc("tmp24"(#loc71)) +#loc235 = loc("tmp25"(#loc72)) +#loc236 = loc("tmp26"(#loc73)) +#loc237 = loc("tmp27"(#loc74)) +#loc238 = loc("tmp28"(#loc75)) +#loc239 = loc("tmp29"(#loc76)) +#loc240 = loc("tmp30"(#loc77)) +#loc241 = loc("tmp31"(#loc78)) +#loc242 = loc("tmp33"(#loc79)) +#loc243 = loc("tmp34"(#loc80)) +#loc244 = loc("tmp35"(#loc81)) +#loc245 = loc("tmp36"(#loc82)) +#loc246 = loc("tmp37"(#loc83)) +#loc247 = loc("tmp38"(#loc84)) +#loc248 = loc("post_mod_scores"(#loc85)) +#loc249 = loc("post_mod_scores"(#loc86)) +#loc251 = loc("m_ij"(#loc90)) +#loc252 = loc("masked_out_rows"(#loc91)) +#loc253 = loc("m_ij_masked"(#loc92)) +#loc254 = loc("alpha"(#loc93)) +#loc255 = loc("alpha"(#loc94)) +#loc256 = loc("p"(#loc95)) +#loc257 = loc("p"(#loc96)) +#loc258 = loc("p"(#loc97)) +#loc259 = loc("l_i"(#loc98)) +#loc261 = loc("l_i"(#loc102)) +#loc262 = loc("acc"(#loc103)) +#loc263 = loc("acc"(#loc104)) +#loc264 = loc("v"(#loc105)) +#loc265 = loc("acc"(#loc106)) +#loc266 = loc("acc"(#loc107)) +#loc267 = loc("cur_block_idx"(#loc108)) +#loc268 = loc("offset"(#loc109)) +#loc269 = loc("cur_block"(#loc110)) +#loc270 = loc("cur_block"(#loc111)) +#loc271 = loc("next_block"(#loc112)) +#loc272 = loc("next_block"(#loc113)) +#loc273 = loc("next_block"(#loc114)) +#loc274 = loc("next_block"(#loc115)) +#loc275 = loc("needs_jump"(#loc116)) +#loc276 = loc("needs_jump"(#loc117)) +#loc277 = loc("needs_jump"(#loc118)) +#loc278 = loc("jump_to_block"(#loc119)) +#loc279 = loc("jump_to_block"(#loc120)) +#loc280 = loc("jump_to_block"(#loc121)) +#loc281 = loc("offset"(#loc122)) +#loc282 = loc("offset"(#loc123)) +#loc283 = loc("offset"(#loc124)) +#loc284 = loc("offset"(#loc125)) +#loc285 = loc("offs_n"(#loc126)) +#loc286 = loc("kv_offset"(#loc127)) +#loc287 = loc(callsite(#loc128 at #loc2)) +#loc288 = loc("kv_indices"(#loc129)) +#loc289 = loc("kv_start"(#loc130)) +#loc290 = loc("kv_start"(#loc131)) +#loc291 = loc("kv_num_blocks"(#loc132)) +#loc292 = loc("kv_num_blocks"(#loc133)) +#loc293 = loc("block_n_end"(#loc134)) +#loc294 = loc("block_n_end"(#loc135)) +#loc295 = loc("offs_n"(#loc136)) +#loc297 = loc(callsite(#loc128 at #loc138)) +#loc298 = loc("l_i"(#loc139)) +#loc299 = loc("acc"(#loc140)) +#loc300 = loc("acc"(#loc141)) +#loc301 = loc("mask"(#loc142)) +#loc302 = loc("off_hz"(#loc148)) +#loc303 = loc("off_hz"(#loc149)) +#loc304 = loc("l_ptrs"(#loc150)) +#loc305 = loc("l_ptrs"(#loc151)) +#loc306 = loc("l_ptrs"(#loc152)) +#loc307 = loc("lse"(#loc153)) +#loc308 = loc("lse"(#loc154)) +#loc309 = loc(callsite(#loc3 at #loc169)) +#loc310 = loc(callsite(#loc196 at #loc197)) +#loc311 = loc(callsite(#loc198 at #loc197)) +#loc312 = loc(callsite(#loc199 at #loc197)) +#loc313 = loc(callsite(#loc200 at #loc197)) +#loc314 = loc(callsite(#loc201 at #loc197)) +#loc315 = loc(callsite(#loc37 at #loc197)) +#loc316 = loc("l_i"(#loc211)) +#loc317 = loc(callsite(#loc212 at #loc213)) +#loc318 = loc(callsite(#loc214 at #loc213)) +#loc319 = loc(callsite(#loc215 at #loc213)) +#loc320 = loc(callsite(#loc216 at #loc213)) +#loc321 = loc(callsite(#loc217 at #loc213)) +#loc322 = loc(callsite(#loc218 at #loc213)) +#loc323 = loc(callsite(#loc219 at #loc213)) +#loc324 = loc(callsite(#loc220 at #loc213)) +#loc325 = loc(callsite(#loc221 at #loc213)) +#loc326 = loc(callsite(#loc222 at #loc213)) +#loc327 = loc(callsite(#loc223 at #loc213)) +#loc328 = loc(callsite(#loc224 at #loc213)) +#loc329 = loc(callsite(#loc225 at #loc213)) +#loc330 = loc(callsite(#loc226 at #loc213)) +#loc331 = loc(callsite(#loc227 at #loc213)) +#loc332 = loc(callsite(#loc228 at #loc213)) +#loc333 = loc(callsite(#loc229 at #loc213)) +#loc334 = loc(callsite(#loc230 at #loc213)) +#loc335 = loc(callsite(#loc231 at #loc213)) +#loc336 = loc(callsite(#loc232 at #loc213)) +#loc337 = loc(callsite(#loc233 at #loc213)) +#loc338 = loc(callsite(#loc234 at #loc213)) +#loc339 = loc(callsite(#loc235 at #loc213)) +#loc340 = loc(callsite(#loc236 at #loc213)) +#loc341 = loc(callsite(#loc237 at #loc213)) +#loc342 = loc(callsite(#loc238 at #loc213)) +#loc343 = loc(callsite(#loc239 at #loc213)) +#loc344 = loc(callsite(#loc240 at #loc213)) +#loc345 = loc(callsite(#loc241 at #loc213)) +#loc346 = loc(callsite(#loc242 at #loc213)) +#loc347 = loc(callsite(#loc243 at #loc213)) +#loc348 = loc(callsite(#loc244 at #loc213)) +#loc349 = loc(callsite(#loc245 at #loc213)) +#loc350 = loc(callsite(#loc246 at #loc213)) +#loc351 = loc(callsite(#loc247 at #loc213)) +#loc352 = loc(callsite(#loc248 at #loc213)) +#loc353 = loc(callsite(#loc249 at #loc213)) +#loc355 = loc(callsite(#loc251 at #loc213)) +#loc356 = loc(callsite(#loc252 at #loc213)) +#loc357 = loc(callsite(#loc253 at #loc213)) +#loc358 = loc(callsite(#loc254 at #loc213)) +#loc359 = loc(callsite(#loc255 at #loc213)) +#loc360 = loc(callsite(#loc256 at #loc213)) +#loc361 = loc(callsite(#loc257 at #loc213)) +#loc362 = loc(callsite(#loc258 at #loc213)) +#loc363 = loc(callsite(#loc259 at #loc213)) +#loc365 = loc(callsite(#loc261 at #loc213)) +#loc366 = loc(callsite(#loc262 at #loc213)) +#loc367 = loc(callsite(#loc263 at #loc213)) +#loc368 = loc(callsite(#loc264 at #loc213)) +#loc369 = loc(callsite(#loc265 at #loc213)) +#loc370 = loc(callsite(#loc266 at #loc213)) +#loc371 = loc(callsite(#loc268 at #loc2)) +#loc372 = loc(callsite(#loc285 at #loc2)) +#loc373 = loc(callsite(#loc286 at #loc2)) +#loc374 = loc(callsite(#loc212 at #loc296)) +#loc375 = loc(callsite(#loc214 at #loc296)) +#loc376 = loc(callsite(#loc215 at #loc296)) +#loc377 = loc(callsite(#loc216 at #loc296)) +#loc378 = loc(callsite(#loc217 at #loc296)) +#loc379 = loc(callsite(#loc218 at #loc296)) +#loc380 = loc(callsite(#loc249 at #loc296)) +#loc382 = loc(callsite(#loc251 at #loc296)) +#loc383 = loc(callsite(#loc252 at #loc296)) +#loc384 = loc(callsite(#loc253 at #loc296)) +#loc385 = loc(callsite(#loc254 at #loc296)) +#loc386 = loc(callsite(#loc255 at #loc296)) +#loc387 = loc(callsite(#loc256 at #loc296)) +#loc388 = loc(callsite(#loc257 at #loc296)) +#loc389 = loc(callsite(#loc258 at #loc296)) +#loc390 = loc(callsite(#loc259 at #loc296)) +#loc392 = loc(callsite(#loc261 at #loc296)) +#loc393 = loc(callsite(#loc262 at #loc296)) +#loc394 = loc(callsite(#loc263 at #loc296)) +#loc395 = loc(callsite(#loc264 at #loc296)) +#loc396 = loc(callsite(#loc265 at #loc296)) +#loc397 = loc(callsite(#loc266 at #loc296)) +#loc398 = loc(callsite(#loc268 at #loc138)) +#loc399 = loc(callsite(#loc285 at #loc138)) +#loc400 = loc(callsite(#loc286 at #loc138)) +#loc401 = loc("m_i"(#loc316)) +#loc402 = loc(callsite(#loc196 at #loc319)) +#loc403 = loc(callsite(#loc198 at #loc319)) +#loc404 = loc(callsite(#loc199 at #loc319)) +#loc405 = loc(callsite(#loc201 at #loc319)) +#loc406 = loc(callsite(#loc37 at #loc319)) +#loc407 = loc(callsite(#loc87 at #loc354)) +#loc409 = loc(callsite(#loc99 at #loc364)) +#loc411 = loc(callsite(#loc199 at #loc368)) +#loc412 = loc(callsite(#loc201 at #loc368)) +#loc413 = loc(callsite(#loc37 at #loc368)) +#loc414 = loc(callsite(#loc267 at #loc371)) +#loc415 = loc(callsite(#loc269 at #loc371)) +#loc416 = loc(callsite(#loc270 at #loc371)) +#loc417 = loc(callsite(#loc271 at #loc371)) +#loc418 = loc(callsite(#loc272 at #loc371)) +#loc419 = loc(callsite(#loc273 at #loc371)) +#loc420 = loc(callsite(#loc274 at #loc371)) +#loc421 = loc(callsite(#loc275 at #loc371)) +#loc422 = loc(callsite(#loc276 at #loc371)) +#loc423 = loc(callsite(#loc277 at #loc371)) +#loc424 = loc(callsite(#loc278 at #loc371)) +#loc425 = loc(callsite(#loc279 at #loc371)) +#loc426 = loc(callsite(#loc280 at #loc371)) +#loc427 = loc(callsite(#loc281 at #loc371)) +#loc428 = loc(callsite(#loc282 at #loc371)) +#loc429 = loc(callsite(#loc283 at #loc371)) +#loc430 = loc(callsite(#loc284 at #loc371)) +#loc431 = loc(callsite(#loc196 at #loc376)) +#loc432 = loc(callsite(#loc198 at #loc376)) +#loc433 = loc(callsite(#loc199 at #loc376)) +#loc434 = loc(callsite(#loc201 at #loc376)) +#loc435 = loc(callsite(#loc37 at #loc376)) +#loc436 = loc(callsite(#loc87 at #loc381)) +#loc438 = loc(callsite(#loc99 at #loc391)) +#loc440 = loc(callsite(#loc199 at #loc395)) +#loc441 = loc(callsite(#loc201 at #loc395)) +#loc442 = loc(callsite(#loc37 at #loc395)) +#loc443 = loc(callsite(#loc267 at #loc398)) +#loc444 = loc(callsite(#loc269 at #loc398)) +#loc445 = loc(callsite(#loc270 at #loc398)) +#loc446 = loc(callsite(#loc271 at #loc398)) +#loc447 = loc(callsite(#loc272 at #loc398)) +#loc448 = loc(callsite(#loc273 at #loc398)) +#loc449 = loc(callsite(#loc274 at #loc398)) +#loc450 = loc(callsite(#loc275 at #loc398)) +#loc451 = loc(callsite(#loc276 at #loc398)) +#loc452 = loc(callsite(#loc277 at #loc398)) +#loc453 = loc(callsite(#loc278 at #loc398)) +#loc454 = loc(callsite(#loc279 at #loc398)) +#loc455 = loc(callsite(#loc280 at #loc398)) +#loc456 = loc(callsite(#loc281 at #loc398)) +#loc457 = loc(callsite(#loc282 at #loc398)) +#loc458 = loc(callsite(#loc283 at #loc398)) +#loc459 = loc(callsite(#loc284 at #loc398)) +#loc460 = loc("offs_n"(#loc401)) +#loc461 = loc(callsite(#loc89 at #loc407)) +#loc462 = loc(callsite(#loc101 at #loc409)) +#loc463 = loc(callsite(#loc89 at #loc436)) +#loc464 = loc(callsite(#loc101 at #loc438)) +#loc465 = loc("kv_offset"(#loc460)) +#loc466 = loc(callsite(#loc465 at #loc2)) +#loc467 = loc(callsite(#loc465 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..44630ba42d7b57f849996d7796eb8a894ebe733e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2e6054bfdf9a0f762057f6cb059136385cb60407 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e61b1473136112cdb2c008c73e5ff893eb6238e4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "a86b11120b5411cbf0d8b8f8917a84c667a2fd685341aa47866d51603d9e328a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..7d654efb4192b6111ed22e6bf0438860ef696bae --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir @@ -0,0 +1,450 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp slt i32 %9, %4, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 511, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %2, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = mul i64 %14, %3, !dbg !12 + %.idx = mul nuw nsw i64 %.decomposed, 128000 + %17 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %17, i64 %16, !dbg !13 + %18 = zext nneg i32 %12 to i64, !dbg !13 + %19 = insertelement <2 x i1> poison, i1 %10, i64 0, !dbg !14 + %20 = shufflevector <2 x i1> %19, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !15 + br label %21, !dbg !13 + +21: ; preds = %8, %21 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %21 ] + %22 = phi i32 [ 2147483647, %8 ], [ %106, %21 ] + %23 = phi i32 [ 2147483647, %8 ], [ %107, %21 ] + %24 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %104, %21 ] + %25 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %108, %21 ] + %26 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %105, %21 ] + %27 = or disjoint i64 %indvars.iv, %18, !dbg !16 + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !16 + %29 = or disjoint i32 %11, %28, !dbg !16 + %30 = or i32 %29, 512, !dbg !16 + %31 = or disjoint i64 %27, 1024, !dbg !16 + %32 = or i32 %29, 1536, !dbg !16 + %33 = icmp samesign ult i32 %30, 32000, !dbg !17 + %34 = icmp samesign ult i64 %31, 32000, !dbg !17 + %35 = icmp samesign ult i32 %32, 32000, !dbg !17 + %36 = zext nneg i32 %30 to i64, !dbg !18 + %37 = zext nneg i32 %32 to i64, !dbg !18 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %27, !dbg !19 + %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19 + %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !19 + %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19 + %38 = and i1 %10, %33, !dbg !15 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %39, i1 %10) #4, !dbg !20 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %42 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep3, i64 %41, i1 %38) #4, !dbg !20 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %44 = fcmp uno <2 x float> %24, zeroinitializer, !dbg !21 + %45 = fcmp uno <2 x float> %26, zeroinitializer, !dbg !21 + %46 = sext i32 %22 to i64, !dbg !25 + %47 = icmp sgt i64 %27, %46, !dbg !25 + %48 = icmp slt i32 %23, %30, !dbg !25 + %49 = extractelement <2 x i32> %25, i64 0, !dbg !25 + %50 = sext i32 %49 to i64, !dbg !25 + %51 = icmp sgt i64 %31, %50, !dbg !25 + %52 = extractelement <2 x i32> %25, i64 1, !dbg !25 + %53 = icmp slt i32 %52, %32, !dbg !25 + %54 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !20 + %55 = insertelement <2 x i32> %54, i32 %42, i64 1, !dbg !20 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !20 + %57 = fcmp ogt <2 x float> %24, %56, !dbg !26 + %58 = fcmp oeq <2 x float> %24, %56, !dbg !27 + %59 = fcmp uno <2 x float> %56, zeroinitializer, !dbg !28 + %60 = xor <2 x i1> %59, splat (i1 true), !dbg !29 + %61 = and <2 x i1> %44, %60, !dbg !30 + %62 = or <2 x i1> %57, %61, !dbg !31 + %63 = and <2 x i1> %44, %59, !dbg !32 + %64 = or <2 x i1> %58, %63, !dbg !33 + %65 = insertelement <2 x i1> poison, i1 %47, i64 0, !dbg !34 + %66 = insertelement <2 x i1> %65, i1 %48, i64 1, !dbg !34 + %67 = and <2 x i1> %66, %64, !dbg !34 + %68 = or <2 x i1> %62, %67, !dbg !35 + %69 = select <2 x i1> %68, <2 x float> %24, <2 x float> %56, !dbg !36 + %70 = trunc nuw nsw i64 %27 to i32, !dbg !37 + %71 = extractelement <2 x i1> %68, i64 0, !dbg !37 + %72 = select i1 %71, i32 %22, i32 %70, !dbg !37 + %73 = extractelement <2 x i1> %68, i64 1, !dbg !37 + %74 = select i1 %73, i32 %23, i32 %30, !dbg !37 + %75 = trunc nuw nsw i64 %31 to i32, !dbg !37 + %76 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !15 + %77 = insertelement <2 x i1> %76, i1 %35, i64 1, !dbg !15 + %78 = and <2 x i1> %20, %77, !dbg !15 + %79 = extractelement <2 x i1> %78, i64 0, !dbg !20 + %80 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep5, i64 %43, i1 %79) #4, !dbg !20 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %82 = extractelement <2 x i1> %78, i64 1, !dbg !20 + %83 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep7, i64 %81, i1 %82) #4, !dbg !20 + %84 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !20 + %85 = insertelement <2 x i32> %84, i32 %83, i64 1, !dbg !20 + %86 = bitcast <2 x i32> %85 to <2 x float>, !dbg !20 + %87 = fcmp ogt <2 x float> %26, %86, !dbg !26 + %88 = fcmp oeq <2 x float> %26, %86, !dbg !27 + %89 = fcmp uno <2 x float> %86, zeroinitializer, !dbg !28 + %90 = xor <2 x i1> %89, splat (i1 true), !dbg !29 + %91 = and <2 x i1> %45, %90, !dbg !30 + %92 = or <2 x i1> %87, %91, !dbg !31 + %93 = and <2 x i1> %45, %89, !dbg !32 + %94 = or <2 x i1> %88, %93, !dbg !33 + %95 = insertelement <2 x i1> poison, i1 %51, i64 0, !dbg !34 + %96 = insertelement <2 x i1> %95, i1 %53, i64 1, !dbg !34 + %97 = and <2 x i1> %96, %94, !dbg !34 + %98 = or <2 x i1> %92, %97, !dbg !35 + %99 = select <2 x i1> %98, <2 x float> %26, <2 x float> %86, !dbg !36 + %100 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !37 + %101 = insertelement <2 x i32> %100, i32 %32, i64 1, !dbg !37 + %102 = select <2 x i1> %98, <2 x i32> %25, <2 x i32> %101, !dbg !37 + %103 = insertelement <2 x i1> %19, i1 %38, i64 1, !dbg !14 + %104 = select <2 x i1> %103, <2 x float> %69, <2 x float> %24, !dbg !14 + %105 = select <2 x i1> %78, <2 x float> %99, <2 x float> %26, !dbg !14 + %106 = select i1 %10, i32 %72, i32 %22, !dbg !38 + %107 = select i1 %38, i32 %74, i32 %23, !dbg !38 + %108 = select <2 x i1> %78, <2 x i32> %102, <2 x i32> %25, !dbg !38 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %109 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !13 + br i1 %109, label %21, label %110, !dbg !13 + +110: ; preds = %21 + %111 = and i32 %11, 31, !dbg !9 + %112 = lshr i32 %11, 5, !dbg !9 + %113 = shufflevector <2 x float> %104, <2 x float> poison, <2 x i32> , !dbg !39 + %114 = fcmp ogt <2 x float> %104, %113, !dbg !39 + %115 = fcmp oeq <2 x float> %104, %113, !dbg !39 + %116 = shufflevector <2 x i1> %114, <2 x i1> %115, <2 x i32> , !dbg !39 + %117 = extractelement <2 x float> %104, i64 0, !dbg !41 + %118 = fcmp uno float %117, 0.000000e+00, !dbg !41 + %119 = extractelement <2 x float> %104, i64 1, !dbg !42 + %120 = fcmp uno float %119, 0.000000e+00, !dbg !42 + %121 = xor i1 %120, true, !dbg !43 + %122 = insertelement <2 x i1> poison, i1 %118, i64 0, !dbg !44 + %123 = shufflevector <2 x i1> %122, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %124 = insertelement <2 x i1> poison, i1 %121, i64 0, !dbg !44 + %125 = insertelement <2 x i1> %124, i1 %120, i64 1, !dbg !44 + %126 = and <2 x i1> %123, %125, !dbg !44 + %127 = or <2 x i1> %116, %126, !dbg !45 + %128 = icmp slt i32 %106, %107, !dbg !46 + %129 = extractelement <2 x i1> %127, i64 1, !dbg !47 + %130 = and i1 %128, %129, !dbg !47 + %131 = extractelement <2 x i1> %127, i64 0, !dbg !48 + %132 = or i1 %131, %130, !dbg !48 + %133 = select i1 %132, float %117, float %119, !dbg !49 + %134 = select i1 %132, i32 %106, i32 %107, !dbg !50 + %135 = extractelement <2 x float> %105, i64 0, !dbg !39 + %136 = fcmp ogt float %133, %135, !dbg !39 + %137 = fcmp oeq float %133, %135, !dbg !51 + %138 = fcmp uno float %133, 0.000000e+00, !dbg !41 + %139 = fcmp uno float %135, 0.000000e+00, !dbg !42 + %140 = xor i1 %139, true, !dbg !43 + %141 = and i1 %138, %140, !dbg !44 + %142 = or i1 %136, %141, !dbg !45 + %143 = and i1 %139, %138, !dbg !52 + %144 = or i1 %137, %143, !dbg !53 + %145 = extractelement <2 x i32> %108, i64 0, !dbg !46 + %146 = icmp slt i32 %134, %145, !dbg !46 + %147 = and i1 %146, %144, !dbg !47 + %148 = or i1 %142, %147, !dbg !48 + %149 = select i1 %148, float %133, float %135, !dbg !49 + %150 = select i1 %148, i32 %134, i32 %145, !dbg !50 + %151 = extractelement <2 x float> %105, i64 1, !dbg !39 + %152 = fcmp ogt float %149, %151, !dbg !39 + %153 = fcmp oeq float %149, %151, !dbg !51 + %154 = fcmp uno float %149, 0.000000e+00, !dbg !41 + %155 = fcmp uno float %151, 0.000000e+00, !dbg !42 + %156 = xor i1 %155, true, !dbg !43 + %157 = and i1 %154, %156, !dbg !44 + %158 = or i1 %152, %157, !dbg !45 + %159 = and i1 %155, %154, !dbg !52 + %160 = or i1 %153, %159, !dbg !53 + %161 = extractelement <2 x i32> %108, i64 1, !dbg !46 + %162 = icmp slt i32 %150, %161, !dbg !46 + %163 = and i1 %162, %160, !dbg !47 + %164 = or i1 %158, %163, !dbg !48 + %165 = select i1 %164, float %149, float %151, !dbg !49 + %166 = select i1 %164, i32 %150, i32 %161, !dbg !50 + %167 = bitcast float %165 to i32, !dbg !54 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 16, i32 31), !dbg !54 + %169 = bitcast i32 %168 to float, !dbg !54 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 16, i32 31), !dbg !54 + %171 = fcmp ogt float %165, %169, !dbg !39 + %172 = fcmp oeq float %165, %169, !dbg !51 + %173 = fcmp uno float %165, 0.000000e+00, !dbg !41 + %174 = fcmp uno float %169, 0.000000e+00, !dbg !42 + %175 = xor i1 %174, true, !dbg !43 + %176 = and i1 %173, %175, !dbg !44 + %177 = or i1 %171, %176, !dbg !45 + %178 = and i1 %173, %174, !dbg !52 + %179 = or i1 %172, %178, !dbg !53 + %180 = icmp slt i32 %166, %170, !dbg !46 + %181 = and i1 %180, %179, !dbg !47 + %182 = or i1 %177, %181, !dbg !48 + %183 = select i1 %182, float %165, float %169, !dbg !49 + %184 = select i1 %182, i32 %166, i32 %170, !dbg !50 + %185 = bitcast float %183 to i32, !dbg !54 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !54 + %187 = bitcast i32 %186 to float, !dbg !54 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 8, i32 31), !dbg !54 + %189 = fcmp ogt float %183, %187, !dbg !39 + %190 = fcmp oeq float %183, %187, !dbg !51 + %191 = fcmp uno float %183, 0.000000e+00, !dbg !41 + %192 = fcmp uno float %187, 0.000000e+00, !dbg !42 + %193 = xor i1 %192, true, !dbg !43 + %194 = and i1 %191, %193, !dbg !44 + %195 = or i1 %189, %194, !dbg !45 + %196 = and i1 %192, %191, !dbg !52 + %197 = or i1 %190, %196, !dbg !53 + %198 = icmp slt i32 %184, %188, !dbg !46 + %199 = and i1 %198, %197, !dbg !47 + %200 = or i1 %195, %199, !dbg !48 + %201 = select i1 %200, float %183, float %187, !dbg !49 + %202 = select i1 %200, i32 %184, i32 %188, !dbg !50 + %203 = bitcast float %201 to i32, !dbg !54 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !54 + %205 = bitcast i32 %204 to float, !dbg !54 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !54 + %207 = fcmp ogt float %201, %205, !dbg !39 + %208 = fcmp oeq float %201, %205, !dbg !51 + %209 = fcmp uno float %201, 0.000000e+00, !dbg !41 + %210 = fcmp uno float %205, 0.000000e+00, !dbg !42 + %211 = xor i1 %210, true, !dbg !43 + %212 = and i1 %209, %211, !dbg !44 + %213 = or i1 %207, %212, !dbg !45 + %214 = and i1 %210, %209, !dbg !52 + %215 = or i1 %208, %214, !dbg !53 + %216 = icmp slt i32 %202, %206, !dbg !46 + %217 = and i1 %216, %215, !dbg !47 + %218 = or i1 %213, %217, !dbg !48 + %219 = select i1 %218, float %201, float %205, !dbg !49 + %220 = select i1 %218, i32 %202, i32 %206, !dbg !50 + %221 = bitcast float %219 to i32, !dbg !54 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !54 + %223 = bitcast i32 %222 to float, !dbg !54 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !54 + %225 = fcmp ogt float %219, %223, !dbg !39 + %226 = fcmp oeq float %219, %223, !dbg !51 + %227 = fcmp uno float %219, 0.000000e+00, !dbg !41 + %228 = fcmp uno float %223, 0.000000e+00, !dbg !42 + %229 = xor i1 %228, true, !dbg !43 + %230 = and i1 %227, %229, !dbg !44 + %231 = or i1 %225, %230, !dbg !45 + %232 = and i1 %228, %227, !dbg !52 + %233 = or i1 %226, %232, !dbg !53 + %234 = icmp slt i32 %220, %224, !dbg !46 + %235 = and i1 %234, %233, !dbg !47 + %236 = or i1 %231, %235, !dbg !48 + %237 = select i1 %236, float %219, float %223, !dbg !49 + %238 = select i1 %236, i32 %220, i32 %224, !dbg !50 + %239 = bitcast float %237 to i32, !dbg !54 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !54 + %241 = bitcast i32 %240 to float, !dbg !54 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !54 + %243 = fcmp ogt float %237, %241, !dbg !39 + %244 = fcmp oeq float %237, %241, !dbg !51 + %245 = fcmp uno float %237, 0.000000e+00, !dbg !41 + %246 = fcmp uno float %241, 0.000000e+00, !dbg !42 + %247 = xor i1 %246, true, !dbg !43 + %248 = and i1 %245, %247, !dbg !44 + %249 = or i1 %243, %248, !dbg !45 + %250 = and i1 %246, %245, !dbg !52 + %251 = or i1 %244, %250, !dbg !53 + %252 = icmp slt i32 %238, %242, !dbg !46 + %253 = and i1 %252, %251, !dbg !47 + %254 = or i1 %249, %253, !dbg !48 + %255 = select i1 %254, i32 %238, i32 %242, !dbg !50 + %256 = and i32 %112, 15, !dbg !54 + %257 = icmp eq i32 %111, 0, !dbg !54 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %256, !dbg !54 + %259 = select i1 %254, i32 %239, i32 %240, !dbg !49 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %260, i1 %257) #4, !dbg !54 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %256, !dbg !54 + %262 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %262, i1 %257) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %263 = icmp samesign ult i32 %11, 16, !dbg !54 + %264 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !54 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %264, i1 %263) #4, !dbg !54 + %266 = bitcast i32 %265 to float, !dbg !54 + %267 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !54 + %268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 %263) #4, !dbg !54 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !54 + %270 = bitcast i32 %269 to float, !dbg !54 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !54 + %272 = fcmp ogt float %266, %270, !dbg !39 + %273 = fcmp oeq float %266, %270, !dbg !51 + %274 = fcmp uno float %266, 0.000000e+00, !dbg !41 + %275 = fcmp uno float %270, 0.000000e+00, !dbg !42 + %276 = xor i1 %275, true, !dbg !43 + %277 = and i1 %274, %276, !dbg !44 + %278 = or i1 %272, %277, !dbg !45 + %279 = and i1 %274, %275, !dbg !52 + %280 = or i1 %273, %279, !dbg !53 + %281 = icmp slt i32 %268, %271, !dbg !46 + %282 = and i1 %281, %280, !dbg !47 + %283 = or i1 %278, %282, !dbg !48 + %284 = select i1 %283, float %266, float %270, !dbg !49 + %285 = select i1 %283, i32 %268, i32 %271, !dbg !50 + %286 = bitcast float %284 to i32, !dbg !54 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 4, i32 31), !dbg !54 + %288 = bitcast i32 %287 to float, !dbg !54 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !54 + %290 = fcmp ogt float %284, %288, !dbg !39 + %291 = fcmp oeq float %284, %288, !dbg !51 + %292 = fcmp uno float %284, 0.000000e+00, !dbg !41 + %293 = fcmp uno float %288, 0.000000e+00, !dbg !42 + %294 = xor i1 %293, true, !dbg !43 + %295 = and i1 %292, %294, !dbg !44 + %296 = or i1 %290, %295, !dbg !45 + %297 = and i1 %293, %292, !dbg !52 + %298 = or i1 %291, %297, !dbg !53 + %299 = icmp slt i32 %285, %289, !dbg !46 + %300 = and i1 %299, %298, !dbg !47 + %301 = or i1 %296, %300, !dbg !48 + %302 = select i1 %301, float %284, float %288, !dbg !49 + %303 = select i1 %301, i32 %285, i32 %289, !dbg !50 + %304 = bitcast float %302 to i32, !dbg !54 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !54 + %306 = bitcast i32 %305 to float, !dbg !54 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !54 + %308 = fcmp ogt float %302, %306, !dbg !39 + %309 = fcmp oeq float %302, %306, !dbg !51 + %310 = fcmp uno float %302, 0.000000e+00, !dbg !41 + %311 = fcmp uno float %306, 0.000000e+00, !dbg !42 + %312 = xor i1 %311, true, !dbg !43 + %313 = and i1 %310, %312, !dbg !44 + %314 = or i1 %308, %313, !dbg !45 + %315 = and i1 %311, %310, !dbg !52 + %316 = or i1 %309, %315, !dbg !53 + %317 = icmp slt i32 %303, %307, !dbg !46 + %318 = and i1 %317, %316, !dbg !47 + %319 = or i1 %314, %318, !dbg !48 + %320 = select i1 %319, float %302, float %306, !dbg !49 + %321 = select i1 %319, i32 %303, i32 %307, !dbg !50 + %322 = bitcast float %320 to i32, !dbg !54 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !54 + %324 = bitcast i32 %323 to float, !dbg !54 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 1, i32 31), !dbg !54 + %326 = fcmp ogt float %320, %324, !dbg !39 + %327 = fcmp oeq float %320, %324, !dbg !51 + %328 = fcmp uno float %320, 0.000000e+00, !dbg !41 + %329 = fcmp uno float %324, 0.000000e+00, !dbg !42 + %330 = xor i1 %329, true, !dbg !43 + %331 = and i1 %328, %330, !dbg !44 + %332 = or i1 %326, %331, !dbg !45 + %333 = and i1 %329, %328, !dbg !52 + %334 = or i1 %327, %333, !dbg !53 + %335 = icmp slt i32 %321, %325, !dbg !46 + %336 = and i1 %335, %334, !dbg !47 + %337 = or i1 %332, %336, !dbg !48 + %338 = select i1 %337, i32 %321, i32 %325, !dbg !50 + %339 = icmp eq i32 %11, 0, !dbg !54 + %340 = select i1 %337, i32 %322, i32 %323, !dbg !49 + %341 = insertelement <1 x i32> poison, i32 %340, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %264, <1 x i32> %341, i1 %339) #4, !dbg !54 + %342 = insertelement <1 x i32> poison, i32 %338, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %342, i1 %339) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %343 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !54 + %344 = getelementptr i64, ptr addrspace(1) %1, i64 %13, !dbg !55 + %345 = sext i32 %343 to i64, !dbg !56 + %346 = icmp eq i32 %12, 0, !dbg !56 + %347 = and i1 %346, %10, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %345, ptr addrspace(1) %344, i1 %347) #4, !dbg !56 + ret void, !dbg !57 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 19, scope: !4) +!12 = !DILocation(line: 38, column: 56, scope: !4) +!13 = !DILocation(line: 32, column: 40, scope: !4) +!14 = !DILocation(line: 43, column: 54, scope: !4) +!15 = !DILocation(line: 38, column: 71, scope: !4) +!16 = !DILocation(line: 33, column: 31, scope: !4) +!17 = !DILocation(line: 34, column: 29, scope: !4) +!18 = !DILocation(line: 38, column: 41, scope: !4) +!19 = !DILocation(line: 38, column: 34, scope: !4) +!20 = !DILocation(line: 38, column: 61, scope: !4) +!21 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!24 = !DILocation(line: 41, column: 38, scope: !4) +!25 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !24) +!27 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !24) +!28 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !24) +!29 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !24) +!30 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !24) +!31 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !24) +!32 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !24) +!33 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !24) +!34 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !24) +!35 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !24) +!37 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !24) +!38 = !DILocation(line: 44, column: 66, scope: !4) +!39 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !40) +!40 = !DILocation(line: 45, column: 75, scope: !4) +!41 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !40) +!42 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !40) +!43 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !40) +!44 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !40) +!45 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !40) +!46 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !40) +!47 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !40) +!48 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !40) +!49 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !40) +!50 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !40) +!51 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !40) +!52 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !40) +!53 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !40) +!54 = !DILocation(line: 165, column: 42, scope: !22, inlinedAt: !40) +!55 = !DILocation(line: 47, column: 25, scope: !4) +!56 = !DILocation(line: 47, column: 36, scope: !4) +!57 = !DILocation(line: 47, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c3ad494a3386d615cfd524ceb268c217df62d6e4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,915 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<215>; + .reg .b32 %r<123>; + .reg .b64 %rd<70>; + .loc 1 18 0 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 22 28 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:22:28 + mov.u32 %r11, %ctaid.x; + ld.param.b64 %rd21, [triton_red_fused_argmax_1_param_2]; + .loc 1 25 37 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:25:37 + mov.u32 %r12, %tid.x; + and.b32 %r1, %r12, 511; + .loc 1 27 19 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:27:19 + cvt.u64.u32 %rd2, %r11; + .loc 1 28 19 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:28:19 + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p5, %rd22, 0; + cvt.u32.u64 %r118, %rd2; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd65, %rd2, %rd21; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd21; + div.u32 %r15, %r118, %r13; + cvt.u64.u32 %rd65, %r15; +$L__BB0_3: + .loc 1 0 19 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:0:19 + ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_1]; + cvt.u64.u32 %rd1, %r12; + .loc 1 24 21 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:24:21 + setp.lt.s32 %p3, %r118, %r10; + .loc 1 27 19 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:27:19 + mul.lo.s64 %rd25, %rd65, %rd21; + sub.s64 %rd26, %rd2, %rd25; + .loc 1 38 56 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:56 + mul.lo.s64 %rd27, %rd65, %rd20; + mad.lo.s64 %rd28, %rd26, 128000, %rd18; + .loc 1 32 40 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:32:40 + shl.b64 %rd29, %rd27, 2; + add.s64 %rd7, %rd28, %rd29; + cvt.u64.u32 %rd8, %r1; + shl.b64 %rd30, %rd20, 2; + mul.lo.s64 %rd31, %rd21, 128000; + sub.s64 %rd32, %rd30, %rd31; + mul.lo.s64 %rd33, %rd65, %rd32; + mad.lo.s64 %rd34, %rd2, 128000, %rd33; + mad.wide.u32 %rd35, %r1, 4, %rd34; + add.s64 %rd66, %rd18, %rd35; + mov.b32 %r20, 0fFF800000; + mov.b64 %rd68, {%r20, %r20}; + mov.b32 %r119, 2147483647; + mov.b64 %rd67, -2048; + mov.b32 %r120, %r119; + mov.b32 %r121, %r119; + mov.b32 %r122, %r119; + mov.b64 %rd69, %rd68; +$L__BB0_4: // =>This Inner Loop Header: Depth=1 + .loc 1 33 31 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:33:31 + add.s64 %rd48, %rd8, %rd67; + add.s64 %rd49, %rd48, 2048; + add.s64 %rd50, %rd1, %rd67; + cvt.u32.u64 %r30, %rd50; + add.s32 %r31, %r30, 2048; + or.b32 %r32, %r31, 512; + add.s64 %rd51, %rd48, 3072; + or.b32 %r33, %r31, 1536; + .loc 1 34 29 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:34:29 + setp.lt.u32 %p10, %r32, 32000; + setp.lt.u64 %p11, %rd51, 32000; + setp.lt.u32 %p12, %r33, 32000; + .loc 1 38 34 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:34 + mad.wide.u32 %rd40, %r32, 4, %rd7; + add.s64 %rd43, %rd66, 4096; + mad.wide.u32 %rd46, %r33, 4, %rd7; + .loc 1 38 71 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:71 + and.pred %p7, %p3, %p10; + .loc 1 38 61 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:61 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + mov.b32 %r22, 0; + // begin inline asm + mov.u32 %r21, %r22; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r21 }, [ %rd66 + 0 ], %rd36; + // end inline asm + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r22; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b32 { %r23 }, [ %rd40 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + mov.b64 {%r34, %r35}, %rd68; + setp.nan.f32 %p13, %r34, %r34; + setp.nan.f32 %p14, %r35, %r35; + mov.b64 {%r36, %r37}, %rd69; + setp.nan.f32 %p15, %r36, %r36; + setp.nan.f32 %p16, %r37, %r37; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + cvt.s64.s32 %rd52, %r119; + setp.gt.s64 %p17, %rd49, %rd52; + setp.lt.s32 %p18, %r120, %r32; + cvt.s64.s32 %rd53, %r121; + setp.gt.s64 %p19, %rd51, %rd53; + setp.lt.s32 %p20, %r122, %r33; +$L__tmp2: + .loc 1 38 61 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:61 + cvt.u64.u32 %rd54, %r23; + shl.b64 %rd55, %rd54, 32; + cvt.u64.u32 %rd56, %r21; + or.b64 %rd57, %rd56, %rd55; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + mov.b64 {%r38, %r39}, %rd57; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.gt.f32 %p21, %r35, %r39; + setp.gt.f32 %p22, %r34, %r38; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.eq.f32 %p23, %r34, %r38; + setp.eq.f32 %p24, %r35, %r39; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.nan.f32 %p25, %r39, %r39; + setp.nan.f32 %p26, %r38, %r38; + setp.num.f32 %p27, %r38, %r38; + setp.num.f32 %p28, %r39, %r39; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p29, %p14, %p28; + and.pred %p30, %p13, %p27; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p33, %p13, %p26; + and.pred %p34, %p14, %p25; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p35, %p24, %p34; + or.pred %p36, %p23, %p33; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p37, %p17, %p36; + and.pred %p38, %p18, %p35; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p39, %p32, %p38; + or.pred %p40, %p31, %p37; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + selp.f32 %r40, %r34, %r38, %p40; + selp.f32 %r41, %r35, %r39, %p39; + cvt.u32.u64 %r42, %rd49; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + selp.b32 %r43, %r119, %r42, %p40; + selp.b32 %r44, %r120, %r32, %p39; +$L__tmp4: + .loc 1 38 71 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:71 + and.pred %p9, %p3, %p12; + and.pred %p8, %p3, %p11; + .loc 1 38 61 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:38:61 + // begin inline asm + mov.u32 %r25, %r22; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b32 { %r25 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, %r22; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd46 + 0 ], %rd45; + // end inline asm + cvt.u64.u32 %rd58, %r27; + shl.b64 %rd59, %rd58, 32; + cvt.u64.u32 %rd60, %r25; + or.b64 %rd61, %rd60, %rd59; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + mov.b64 {%r45, %r46}, %rd61; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.gt.f32 %p41, %r37, %r46; + setp.gt.f32 %p42, %r36, %r45; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.eq.f32 %p43, %r36, %r45; + setp.eq.f32 %p44, %r37, %r46; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + setp.nan.f32 %p45, %r46, %r46; + setp.nan.f32 %p46, %r45, %r45; + setp.num.f32 %p47, %r45, %r45; + setp.num.f32 %p48, %r46, %r46; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p49, %p16, %p48; + and.pred %p50, %p15, %p47; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p51, %p42, %p50; + or.pred %p52, %p41, %p49; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p53, %p15, %p46; + and.pred %p54, %p16, %p45; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p55, %p44, %p54; + or.pred %p56, %p43, %p53; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + and.pred %p57, %p19, %p56; + and.pred %p58, %p20, %p55; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + or.pred %p59, %p52, %p58; + or.pred %p60, %p51, %p57; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + selp.f32 %r47, %r36, %r45, %p60; + selp.f32 %r48, %r37, %r46, %p59; + cvt.u32.u64 %r49, %rd51; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:41:38 ] + selp.b32 %r50, %r121, %r49, %p60; + selp.b32 %r51, %r122, %r33, %p59; +$L__tmp6: + .loc 1 43 54 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:43:54 + selp.f32 %r52, %r41, %r35, %p7; + selp.f32 %r53, %r40, %r34, %p3; + mov.b64 %rd68, {%r53, %r52}; + selp.f32 %r54, %r48, %r37, %p9; + selp.f32 %r55, %r47, %r36, %p8; + mov.b64 %rd69, {%r55, %r54}; + .loc 1 44 66 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:44:66 + selp.b32 %r119, %r43, %r119, %p3; + selp.b32 %r120, %r44, %r120, %p7; + selp.b32 %r122, %r51, %r122, %p9; + selp.b32 %r121, %r50, %r121, %p8; + .loc 1 32 40 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:32:40 + add.s64 %rd67, %rd67, 2048; + add.s64 %rd66, %rd66, 8192; + setp.lt.u64 %p61, %rd67, 29952; + @%p61 bra $L__BB0_4; +// %bb.5: + .loc 1 0 40 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:0:40 + cvt.u32.u64 %r68, %rd1; + .loc 1 25 37 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:25:37 + and.b32 %r70, %r68, 31; +$L__tmp7: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + mov.b64 {%r71, %r72}, %rd68; + setp.gt.f32 %p70, %r71, %r72; + setp.eq.f32 %p71, %r72, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p72, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.num.f32 %p73, %r72, %r72; + setp.nan.f32 %p74, %r72, %r72; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p72, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p71, %p75; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p79, %r119, %r120; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p81, %p77, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r73, %r71, %r72, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r74, %r119, %r120, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + mov.b64 {%r75, %r76}, %rd69; + setp.gt.f32 %p82, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p83, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p84, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p85, %r75, %r75; + setp.num.f32 %p86, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p91, %r74, %r121; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r77, %r73, %r75, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r78, %r74, %r121, %p93; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p94, %r77, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p95, %r77, %r76; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p96, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p97, %r76, %r76; + setp.num.f32 %p98, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p101, %p97, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p103, %r78, %r122; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r79, %r77, %r76, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r80, %r78, %r122, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r81, %r79, 16, 31, -1; + shfl.sync.bfly.b32 %r82, %r80, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p106, %r79, %r81; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p107, %r79, %r81; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p108, %r79, %r79; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p109, %r81, %r81; + setp.num.f32 %p110, %r81, %r81; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p113, %p108, %p109; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p115, %r80, %r82; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r83, %r79, %r81, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r84, %r80, %r82, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r85, %r83, 8, 31, -1; + shfl.sync.bfly.b32 %r86, %r84, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p118, %r83, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p119, %r83, %r85; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p120, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p121, %r85, %r85; + setp.num.f32 %p122, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p127, %r84, %r86; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r87, %r83, %r85, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r88, %r84, %r86, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r89, %r87, 4, 31, -1; + shfl.sync.bfly.b32 %r90, %r88, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p130, %r87, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p131, %r87, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p132, %r87, %r87; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p133, %r89, %r89; + setp.num.f32 %p134, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p139, %r88, %r90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r91, %r87, %r89, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r92, %r88, %r90, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r93, %r91, 2, 31, -1; + shfl.sync.bfly.b32 %r94, %r92, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p142, %r91, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p143, %r91, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p144, %r91, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p145, %r93, %r93; + setp.num.f32 %p146, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p151, %r92, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r95, %r91, %r93, %p153; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r96, %r92, %r94, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r97, %r95, 1, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p154, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p155, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p156, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p157, %r97, %r97; + setp.num.f32 %p158, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p161, %p157, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p163, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r59, %r96, %r98, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.b32 %p62, %r70, 0; + shr.u32 %r99, %r68, 3; + and.b32 %r100, %r99, 60; + mov.b32 %r101, global_smem; + add.s32 %r56, %r101, %r100; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r57, %r95, %r97, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + // begin inline asm + @%p62 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + add.s32 %r102, %r101, 64; + add.s32 %r58, %r102, %r100; + // begin inline asm + @%p62 st.shared.b32 [ %r58 + 0 ], %r59; + // end inline asm + bar.sync 0; + setp.lt.u32 %p64, %r68, 16; + shl.b32 %r103, %r68, 2; + add.s32 %r61, %r101, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + add.s32 %r63, %r102, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r62, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r104, %r60, 8, 31, -1; + shfl.sync.bfly.b32 %r105, %r62, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p166, %r60, %r104; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p167, %r60, %r104; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p168, %r60, %r60; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p169, %r104, %r104; + setp.num.f32 %p170, %r104, %r104; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p173, %p168, %p169; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p175, %r62, %r105; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r106, %r60, %r104, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r107, %r62, %r105, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r108, %r106, 4, 31, -1; + shfl.sync.bfly.b32 %r109, %r107, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p178, %r106, %r108; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p179, %r106, %r108; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p180, %r106, %r106; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p181, %r108, %r108; + setp.num.f32 %p182, %r108, %r108; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p187, %r107, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r110, %r106, %r108, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r111, %r107, %r109, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r112, %r110, 2, 31, -1; + shfl.sync.bfly.b32 %r113, %r111, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p190, %r110, %r112; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p191, %r110, %r112; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p192, %r110, %r110; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p193, %r112, %r112; + setp.num.f32 %p194, %r112, %r112; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p199, %r111, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.f32 %r114, %r110, %r112, %p201; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r115, %r111, %r113, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + shfl.sync.bfly.b32 %r116, %r114, 1, 31, -1; + shfl.sync.bfly.b32 %r117, %r115, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.gt.f32 %p202, %r114, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.f32 %p203, %r114, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p204, %r114, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.nan.f32 %p205, %r116, %r116; + setp.num.f32 %p206, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p207, %p204, %p206; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p208, %p202, %p207; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p209, %p205, %p204; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p210, %p203, %p209; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.lt.s32 %p211, %r115, %r117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + and.pred %p212, %p211, %p210; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + or.pred %p213, %p208, %p212; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r67, %r115, %r117, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + setp.eq.b32 %p66, %r68, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + selp.b32 %r65, %r114, %r116, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:45:75 ] + // begin inline asm + @%p66 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + // begin inline asm + @%p66 st.shared.b32 [ %r63 + 0 ], %r67; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd62, [global_smem+64]; +$L__tmp8: + .loc 1 47 25 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:47:25 + shl.b64 %rd64, %rd2, 3; + add.s64 %rd63, %rd19, %rd64; + .loc 1 47 36 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:47:36 + setp.eq.b32 %p214, %r1, 0; + and.pred %p68, %p214, %p3; + // begin inline asm + @%p68 st.global.b64 [ %rd63 + 0 ], { %rd62 }; + // end inline asm + .loc 1 47 4 // cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py:47:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 97 +.b8 102 +.b8 101 +.b8 51 +.b8 100 +.b8 115 +.b8 117 +.b8 101 +.b8 108 +.b8 99 +.b8 108 +.b8 111 +.b8 101 +.b8 109 +.b8 119 +.b8 117 +.b8 53 +.b8 106 +.b8 100 +.b8 105 +.b8 107 +.b8 112 +.b8 55 +.b8 108 +.b8 113 +.b8 97 +.b8 110 +.b8 111 +.b8 53 +.b8 113 +.b8 120 +.b8 118 +.b8 55 +.b8 105 +.b8 97 +.b8 121 +.b8 104 +.b8 116 +.b8 109 +.b8 53 +.b8 120 +.b8 103 +.b8 106 +.b8 105 +.b8 50 +.b8 120 +.b8 118 +.b8 114 +.b8 52 +.b8 107 +.b8 54 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 97 +.b8 102 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..a709c715ae03e07e81f4d085eb4f6d4939f0fbd7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source @@ -0,0 +1,317 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 1 : i32 loc(#loc80) + %xoffset_2 = arith.constant 1 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x2048xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<1x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<1x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc95) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x2048xi64> loc(#loc95) + %tmp0_26 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc96) + %tmp0_27 = arith.muli %tmp0_26, %x1_12 : tensor<1x1xi64> loc(#loc96) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc97) + %tmp0_29 = arith.addi %tmp0_25, %tmp0_28 : tensor<1x2048xi64> loc(#loc97) + %tmp0_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp0_31 = tt.addptr %tmp0_30, %tmp0_29 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc98) + %tmp0_32 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc99) + %tmp0_33 = arith.andi %r0_mask_19, %tmp0_32 : tensor<1x2048xi1> loc(#loc99) + %tmp0_34 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_35 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc100) + %tmp0_36 = tt.load %tmp0_31, %tmp0_33, %tmp0_35 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_16, %_tmp2_index_17, %tmp0_36, %r0_index_18) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc101) + %_tmp2_38 = arith.andi %r0_mask_19, %_tmp2_37 : tensor<1x2048xi1> loc(#loc101) + %_tmp2_39 = arith.select %_tmp2_38, %8#0, %_tmp2_16 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %_tmp2_index_40 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_41 = arith.andi %r0_mask_19, %_tmp2_index_40 : tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_42 = arith.select %_tmp2_index_41, %8#1, %_tmp2_index_17 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc104) + scf.yield %_tmp2_39, %_tmp2_index_42 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc35)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc35)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc35)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc35))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc114) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc114) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc115) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc129) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc117) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc130) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc52) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc54) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc54) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc59))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc61) + tt.return %4 : tensor<1x2048xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc63) + tt.return %5 : tensor<1x2048xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc68)), %index: tensor<1x2048xi32> loc("index"(#loc68))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc71) + %2 = ub.poison : tensor<1xi32> loc(#loc71) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":32:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..22253622e6955f45bd5ef1f6068055304be27fb0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":18:0) +#loc1 = loc(unknown) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":45:75) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("out_ptr0"(#loc)) +#loc43 = loc("ks0"(#loc)) +#loc44 = loc("ks1"(#loc)) +#loc45 = loc("xnumel"(#loc)) +#loc46 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc36)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc48) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc49) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc49) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %x0_5 = arith.remsi %x0, %ks0 : i64 loc(#loc50) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc51) + %tmp0 = arith.muli %x0_5, %c32000_i64 : i64 loc(#loc52) + %tmp0_6 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc81) + %tmp0_7 = arith.muli %ks1, %x1 : i64 loc(#loc54) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc56) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc83) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst, %_tmp2_index_11 = %cst_0) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc60) + %tmp0_13 = arith.extsi %r0_index_12 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_14 = arith.addi %tmp0_13, %tmp0_6 : tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc55) + %tmp0_16 = tt.addptr %tmp0_9, %tmp0_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.andi %r0_mask, %tmp0_10 : tensor<1x2048xi1, #blocked> loc(#loc57) + %tmp0_18 = tt.load %tmp0_16, %tmp0_17, %cst_2 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc107) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc108) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc87) + %b_isnan = arith.cmpf une, %tmp0_18, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc88) + %mask_19 = arith.xori %b_isnan, %cst_3 : tensor<1x2048xi1, #blocked> loc(#loc89) + %mask_20 = arith.andi %a_isnan, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc90) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc109) + %equal_22 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc92) + %equal_23 = arith.ori %equal, %equal_22 : tensor<1x2048xi1, #blocked> loc(#loc110) + %mask_24 = arith.cmpi slt, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi32, #blocked> loc(#loc94) + %mask_25 = arith.andi %equal_23, %mask_24 : tensor<1x2048xi1, #blocked> loc(#loc95) + %mask_26 = arith.ori %mask_21, %mask_25 : tensor<1x2048xi1, #blocked> loc(#loc96) + %6 = arith.select %mask_26, %_tmp2, %tmp0_18 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc74) + %7 = arith.select %mask_26, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc75) + %_tmp2_27 = arith.select %tmp0_17, %6, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc76) + %_tmp2_index_28 = arith.select %tmp0_17, %7, %_tmp2_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_27, %_tmp2_index_28 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc34) + } loc(#loc84) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc36)), %arg7: i32 loc(callsite(#loc1 at #loc36)), %arg8: f32 loc(callsite(#loc1 at #loc36)), %arg9: i32 loc(callsite(#loc1 at #loc36))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc111) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc112) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc98) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc99) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc100) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc113) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc101) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc114) + %mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc102) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc103) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc104) + %6 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc105) + %7 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc106) + tt.reduce.return %6, %7 : f32, i32 loc(#loc78) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc38) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc39) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc39) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + %5 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc39) + tt.store %4, %3, %5 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":28:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:56) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:71) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":32:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":33:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":34:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:61) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":41:38) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":43:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:66) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:8) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":46:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:4) +#loc47 = loc("xoffset"(#loc2)) +#loc48 = loc("xmask"(#loc3)) +#loc49 = loc("r0_base"(#loc4)) +#loc50 = loc("x0"(#loc5)) +#loc51 = loc("x1"(#loc6)) +#loc52 = loc("tmp0"(#loc7)) +#loc53 = loc("tmp0"(#loc8)) +#loc54 = loc("tmp0"(#loc9)) +#loc55 = loc("tmp0"(#loc10)) +#loc56 = loc("tmp0"(#loc11)) +#loc57 = loc("tmp0"(#loc12)) +#loc58 = loc("_tmp2"(#loc13)) +#loc59 = loc("r0_index"(#loc14)) +#loc60 = loc("r0_mask"(#loc15)) +#loc61 = loc("tmp0"(#loc16)) +#loc62 = loc("mask"(#loc17)) +#loc63 = loc("equal"(#loc19)) +#loc64 = loc("a_isnan"(#loc20)) +#loc65 = loc("b_isnan"(#loc21)) +#loc66 = loc("mask"(#loc22)) +#loc67 = loc("mask"(#loc23)) +#loc68 = loc("mask"(#loc24)) +#loc69 = loc("equal"(#loc25)) +#loc70 = loc("equal"(#loc26)) +#loc71 = loc("mask"(#loc27)) +#loc72 = loc("mask"(#loc28)) +#loc73 = loc("mask"(#loc29)) +#loc74 = loc(callsite(#loc30 at #loc18)) +#loc75 = loc(callsite(#loc31 at #loc18)) +#loc76 = loc("_tmp2"(#loc32)) +#loc77 = loc("_tmp2_index"(#loc33)) +#loc78 = loc(callsite(#loc35 at #loc36)) +#loc80 = loc("tmp2"(#loc37)) +#loc81 = loc(fused[#loc53, #loc52]) +#loc82 = loc(fused[#loc55, #loc54]) +#loc83 = loc(fused[#loc57, #loc48]) +#loc84 = loc("_tmp2_index"(#loc58)) +#loc85 = loc("mask"(#loc62)) +#loc86 = loc("equal"(#loc63)) +#loc87 = loc(callsite(#loc64 at #loc18)) +#loc88 = loc(callsite(#loc65 at #loc18)) +#loc89 = loc(callsite(#loc66 at #loc18)) +#loc90 = loc(callsite(#loc67 at #loc18)) +#loc91 = loc("mask"(#loc68)) +#loc92 = loc(callsite(#loc69 at #loc18)) +#loc93 = loc("equal"(#loc70)) +#loc94 = loc(callsite(#loc71 at #loc18)) +#loc95 = loc(callsite(#loc72 at #loc18)) +#loc96 = loc(callsite(#loc73 at #loc18)) +#loc97 = loc(callsite(#loc64 at #loc78)) +#loc98 = loc(callsite(#loc65 at #loc78)) +#loc99 = loc(callsite(#loc66 at #loc78)) +#loc100 = loc(callsite(#loc67 at #loc78)) +#loc101 = loc(callsite(#loc69 at #loc78)) +#loc102 = loc(callsite(#loc71 at #loc78)) +#loc103 = loc(callsite(#loc72 at #loc78)) +#loc104 = loc(callsite(#loc73 at #loc78)) +#loc105 = loc(callsite(#loc30 at #loc78)) +#loc106 = loc(callsite(#loc31 at #loc78)) +#loc107 = loc(callsite(#loc85 at #loc18)) +#loc108 = loc(callsite(#loc86 at #loc18)) +#loc109 = loc(callsite(#loc91 at #loc18)) +#loc110 = loc(callsite(#loc93 at #loc18)) +#loc111 = loc(callsite(#loc85 at #loc78)) +#loc112 = loc(callsite(#loc86 at #loc78)) +#loc113 = loc(callsite(#loc91 at #loc78)) +#loc114 = loc(callsite(#loc93 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..4b9a8d290accfaba87a5cb31b5fe2705a9852a6f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,201 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc52) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc54) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc54) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc55) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc56) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc57) + %x0_4 = arith.remsi %x0, %ks0 : i64 loc(#loc57) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc58) + %_tmp2_index_5:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_6 = %_tmp2, %_tmp2_index_7 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc60) + %r0_index_8 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_1 : tensor<1x2048xi32> loc(#loc61) + %tmp0 = arith.muli %x0_4, %c32000_i64 : i64 loc(#loc62) + %tmp0_9 = arith.extsi %r0_index_8 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc63) + %tmp0_10 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc88) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x2048xi64> loc(#loc63) + %tmp0_12 = arith.muli %ks1, %x1 : i64 loc(#loc64) + %tmp0_13 = tt.splat %tmp0_12 : i64 -> tensor<1x2048xi64> loc(#loc89) + %tmp0_14 = arith.addi %tmp0_11, %tmp0_13 : tensor<1x2048xi64> loc(#loc65) + %tmp0_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc66) + %tmp0_16 = tt.addptr %tmp0_15, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc66) + %tmp0_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc90) + %tmp0_18 = arith.andi %r0_mask, %tmp0_17 : tensor<1x2048xi1> loc(#loc67) + %tmp0_19 = tt.load %tmp0_16, %tmp0_18, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc68) + %mask = arith.cmpf ogt, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc113) + %equal = arith.cmpf oeq, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc114) + %a_isnan = arith.cmpf une, %_tmp2_6, %_tmp2_6 : tensor<1x2048xf32> loc(#loc93) + %b_isnan = arith.cmpf une, %tmp0_19, %tmp0_19 : tensor<1x2048xf32> loc(#loc94) + %mask_20 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc95) + %mask_21 = arith.andi %a_isnan, %mask_20 : tensor<1x2048xi1> loc(#loc96) + %mask_22 = arith.ori %mask, %mask_21 : tensor<1x2048xi1> loc(#loc115) + %equal_23 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : tensor<1x2048xi1> loc(#loc116) + %mask_25 = arith.cmpi slt, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi32> loc(#loc100) + %mask_26 = arith.andi %equal_24, %mask_25 : tensor<1x2048xi1> loc(#loc101) + %mask_27 = arith.ori %mask_22, %mask_26 : tensor<1x2048xi1> loc(#loc102) + %4 = arith.select %mask_27, %_tmp2_6, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc81) + %5 = arith.select %mask_27, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc82) + %_tmp2_28 = arith.select %tmp0_18, %4, %_tmp2_6 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc83) + %_tmp2_index_29 = arith.select %tmp0_18, %5, %_tmp2_index_7 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc84) + scf.yield %_tmp2_28, %_tmp2_index_29 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc38) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index_5#0, %_tmp2_index_5#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc117) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc118) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc104) + %mask_6 = arith.xori %b_isnan, %true : i1 loc(#loc105) + %mask_7 = arith.andi %a_isnan, %mask_6 : i1 loc(#loc106) + %mask_8 = arith.ori %mask, %mask_7 : i1 loc(#loc119) + %equal_9 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc107) + %equal_10 = arith.ori %equal, %equal_9 : i1 loc(#loc120) + %mask_11 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc108) + %mask_12 = arith.andi %equal_10, %mask_11 : i1 loc(#loc109) + %mask_13 = arith.ori %mask_8, %mask_12 : i1 loc(#loc110) + %4 = arith.select %mask_13, %arg6, %arg8 : f32 loc(#loc111) + %5 = arith.select %mask_13, %arg7, %arg9 : i32 loc(#loc112) + tt.reduce.return %4, %5 : f32, i32 loc(#loc85) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc85) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc86) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc41) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc42) + tt.store %2, %3, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:56) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:52) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:71) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":38:61) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":41:38) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":43:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:66) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":44:8) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/af/cafe3dsuelcloemwu5jdikp7lqano5qxv7iayhtm5xgji2xvr4k6.py":47:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xmask"(#loc7)) +#loc55 = loc("r0_base"(#loc8)) +#loc56 = loc("r0_base"(#loc9)) +#loc57 = loc("x0"(#loc10)) +#loc58 = loc("x1"(#loc11)) +#loc59 = loc("_tmp2"(#loc3)) +#loc60 = loc("r0_index"(#loc12)) +#loc61 = loc("r0_mask"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("tmp0"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("mask"(#loc21)) +#loc70 = loc("equal"(#loc23)) +#loc71 = loc("a_isnan"(#loc24)) +#loc72 = loc("b_isnan"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("equal"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc(callsite(#loc34 at #loc22)) +#loc82 = loc(callsite(#loc35 at #loc22)) +#loc83 = loc("_tmp2"(#loc36)) +#loc84 = loc("_tmp2_index"(#loc37)) +#loc85 = loc(callsite(#loc39 at #loc2)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc59)) +#loc88 = loc(fused[#loc63, #loc62]) +#loc89 = loc(fused[#loc65, #loc64]) +#loc90 = loc(fused[#loc67, #loc54]) +#loc91 = loc("mask"(#loc69)) +#loc92 = loc("equal"(#loc70)) +#loc93 = loc(callsite(#loc71 at #loc22)) +#loc94 = loc(callsite(#loc72 at #loc22)) +#loc95 = loc(callsite(#loc73 at #loc22)) +#loc96 = loc(callsite(#loc74 at #loc22)) +#loc97 = loc("mask"(#loc75)) +#loc98 = loc(callsite(#loc76 at #loc22)) +#loc99 = loc("equal"(#loc77)) +#loc100 = loc(callsite(#loc78 at #loc22)) +#loc101 = loc(callsite(#loc79 at #loc22)) +#loc102 = loc(callsite(#loc80 at #loc22)) +#loc103 = loc(callsite(#loc71 at #loc85)) +#loc104 = loc(callsite(#loc72 at #loc85)) +#loc105 = loc(callsite(#loc73 at #loc85)) +#loc106 = loc(callsite(#loc74 at #loc85)) +#loc107 = loc(callsite(#loc76 at #loc85)) +#loc108 = loc(callsite(#loc78 at #loc85)) +#loc109 = loc(callsite(#loc79 at #loc85)) +#loc110 = loc(callsite(#loc80 at #loc85)) +#loc111 = loc(callsite(#loc34 at #loc85)) +#loc112 = loc(callsite(#loc35 at #loc85)) +#loc113 = loc(callsite(#loc91 at #loc22)) +#loc114 = loc(callsite(#loc92 at #loc22)) +#loc115 = loc(callsite(#loc97 at #loc22)) +#loc116 = loc(callsite(#loc99 at #loc22)) +#loc117 = loc(callsite(#loc91 at #loc85)) +#loc118 = loc(callsite(#loc92 at #loc85)) +#loc119 = loc(callsite(#loc97 at #loc85)) +#loc120 = loc(callsite(#loc99 at #loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fbfd7bf7923edd7772f8062d938ebcedbaca2e0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..360437ff8969db8ae2aad5c2e318b0ba0c4345b7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8f23c2417ceaa38d431ce5344048336932539b5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "bc63a85e7982b83906634d26dbbbc5216e8d21115485d36241b2e6befa4a472c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..166ab553fca13f8101cdb7a38dcaa01420829b1b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir @@ -0,0 +1,346 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_2 = internal constant [8 x i8] c"unknown\00" +@assertFile_2 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py\00" +@assertMessage_2 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp36 < ks3\00" +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py\00" +@assertMessage_1 = internal constant [65 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py\00" +@assertMessage_0 = internal constant [64 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #1 !dbg !9 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %14 = shl i32 %13, 9, !dbg !11 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %16 = shl nuw nsw i32 %15, 1, !dbg !12 + %17 = and i32 %16, 510, !dbg !12 + %18 = or disjoint i32 %17, %14, !dbg !13 + %19 = or disjoint i32 %18, 1, !dbg !13 + %20 = icmp slt i32 %18, %9, !dbg !14 + %21 = icmp slt i32 %19, %9, !dbg !14 + %22 = sext i32 %18 to i64, !dbg !15 + %23 = sext i32 %19 to i64, !dbg !15 + %.frozen = freeze i64 %5, !dbg !16 + %24 = sdiv i64 %22, %.frozen, !dbg !16 + %25 = mul i64 %24, %.frozen, !dbg !15 + %.decomposed = sub i64 %22, %25, !dbg !15 + %.frozen25 = freeze i64 %5, !dbg !16 + %26 = sdiv i64 %23, %.frozen25, !dbg !16 + %27 = mul i64 %26, %.frozen25, !dbg !15 + %.decomposed26 = sub i64 %23, %27, !dbg !15 + %28 = srem i64 %24, %6, !dbg !17 + %29 = srem i64 %26, %6, !dbg !17 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18 + %31 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %33 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %30, i64 %32, i1 %20) #4, !dbg !19 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %35 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %31, i64 %34, i1 %21) #4, !dbg !19 + %36 = getelementptr i64, ptr addrspace(1) %1, i64 %28, !dbg !20 + %37 = getelementptr i64, ptr addrspace(1) %1, i64 %29, !dbg !20 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %38, i1 %20) #4, !dbg !21 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %40, i1 %21) #4, !dbg !21 + %42 = sdiv i64 %5, 2, !dbg !22 + %43 = icmp sge i64 %.decomposed, %42, !dbg !23 + %44 = icmp sge i64 %.decomposed26, %42, !dbg !23 + %45 = sub nsw i64 %22, %42, !dbg !24 + %46 = sub nsw i64 %23, %42, !dbg !24 + %47 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !25 + %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !25 + %49 = and i1 %20, %43, !dbg !26 + %50 = and i1 %21, %44, !dbg !26 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %52 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %47, i64 %51, i1 %49) #4, !dbg !27 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %54 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %48, i64 %53, i1 %50) #4, !dbg !27 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %55, i1 %49) #4, !dbg !28 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %57, i1 %50) #4, !dbg !28 + %59 = icmp slt i64 %56, 0, !dbg !29 + %60 = icmp slt i64 %58, 0, !dbg !29 + %61 = select i1 %59, i64 %7, i64 0, !dbg !30 + %62 = add i64 %61, %56, !dbg !30 + %63 = select i1 %60, i64 %7, i64 0, !dbg !30 + %64 = add i64 %63, %58, !dbg !30 + %65 = icmp slt i64 %62, 0, !dbg !31 + %66 = icmp slt i64 %64, 0, !dbg !31 + %67 = icmp sge i64 %62, %7, !dbg !32 + %68 = icmp sge i64 %64, %7, !dbg !32 + %.not4 = or i1 %65, %67, !dbg !33 + %.not8 = or i1 %66, %68, !dbg !34 + %.not1 = and i1 %49, %.not4, !dbg !35 + %.not5 = and i1 %50, %.not8, !dbg !36 + %69 = or i1 %.not1, %.not5, !dbg !36 + br i1 %69, label %70, label %71, !dbg !36 + +70: ; preds = %12 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 36, ptr nonnull @assertFunc_0, i64 1), !dbg !36 + unreachable, !dbg !36 + +71: ; preds = %12 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %72 = sub nsw i64 %.decomposed, %42, !dbg !37 + %73 = sub nsw i64 %.decomposed26, %42, !dbg !37 + %74 = mul i64 %62, %5, !dbg !38 + %75 = mul i64 %64, %5, !dbg !38 + %76 = getelementptr bfloat, ptr addrspace(1) %2, i64 %72, !dbg !39 + %77 = getelementptr bfloat, ptr addrspace(1) %76, i64 %74, !dbg !39 + %78 = getelementptr bfloat, ptr addrspace(1) %2, i64 %73, !dbg !39 + %79 = getelementptr bfloat, ptr addrspace(1) %78, i64 %75, !dbg !39 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %77, i64 %80, i1 %49) #4, !dbg !40 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %79, i64 %82, i1 %50) #4, !dbg !40 + %84 = icmp slt i64 %.decomposed, %42, !dbg !41 + %85 = icmp slt i64 %.decomposed26, %42, !dbg !41 + %86 = add i64 %5, %22, !dbg !42 + %87 = add i64 %5, %23, !dbg !42 + %88 = sub i64 %86, %42, !dbg !43 + %89 = sub i64 %87, %42, !dbg !43 + %90 = getelementptr bfloat, ptr addrspace(1) %0, i64 %88, !dbg !44 + %91 = getelementptr bfloat, ptr addrspace(1) %0, i64 %89, !dbg !44 + %92 = and i1 %20, %84, !dbg !45 + %93 = and i1 %21, %85, !dbg !45 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %90, i64 %94, i1 %92) #4, !dbg !46 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %97 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %96, i1 %93) #4, !dbg !46 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %98, i1 %92) #4, !dbg !47 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %100, i1 %93) #4, !dbg !47 + %102 = icmp slt i64 %99, 0, !dbg !48 + %103 = icmp slt i64 %101, 0, !dbg !48 + %104 = select i1 %102, i64 %7, i64 0, !dbg !49 + %105 = add i64 %104, %99, !dbg !49 + %106 = select i1 %103, i64 %7, i64 0, !dbg !49 + %107 = add i64 %106, %101, !dbg !49 + %108 = icmp slt i64 %105, 0, !dbg !50 + %109 = icmp slt i64 %107, 0, !dbg !50 + %110 = icmp sge i64 %105, %7, !dbg !51 + %111 = icmp sge i64 %107, %7, !dbg !51 + %.not12 = or i1 %108, %110, !dbg !52 + %.not16 = or i1 %109, %111, !dbg !53 + %.not9 = and i1 %92, %.not12, !dbg !54 + %.not13 = and i1 %93, %.not16, !dbg !55 + %112 = or i1 %.not9, %.not13, !dbg !55 + br i1 %112, label %113, label %114, !dbg !55 + +113: ; preds = %71 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 51, ptr nonnull @assertFunc_1, i64 1), !dbg !55 + unreachable, !dbg !55 + +114: ; preds = %71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %115 = sub i64 %5, %42, !dbg !56 + %116 = mul i64 %105, %5, !dbg !57 + %117 = mul i64 %107, %5, !dbg !57 + %118 = getelementptr bfloat, ptr addrspace(1) %2, i64 %115, !dbg !58 + %119 = getelementptr bfloat, ptr addrspace(1) %118, i64 %.decomposed, !dbg !58 + %120 = getelementptr bfloat, ptr addrspace(1) %119, i64 %116, !dbg !58 + %121 = getelementptr bfloat, ptr addrspace(1) %118, i64 %.decomposed26, !dbg !58 + %122 = getelementptr bfloat, ptr addrspace(1) %121, i64 %117, !dbg !58 + %123 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %124 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %120, i64 %123, i1 %92) #4, !dbg !59 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %126 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %122, i64 %125, i1 %93) #4, !dbg !59 + %127 = icmp slt i64 %39, 0, !dbg !60 + %128 = icmp slt i64 %41, 0, !dbg !60 + %129 = select i1 %127, i64 %8, i64 0, !dbg !61 + %130 = add i64 %129, %39, !dbg !61 + %131 = select i1 %128, i64 %8, i64 0, !dbg !61 + %132 = add i64 %131, %41, !dbg !61 + %133 = icmp slt i64 %130, 0, !dbg !62 + %134 = icmp slt i64 %132, 0, !dbg !62 + %135 = icmp sge i64 %130, %8, !dbg !63 + %136 = icmp sge i64 %132, %8, !dbg !63 + %.not20 = or i1 %133, %135, !dbg !64 + %.not24 = or i1 %134, %136, !dbg !65 + %.not17 = and i1 %20, %.not20, !dbg !66 + %.not21 = and i1 %21, %.not24, !dbg !67 + %137 = or i1 %.not17, %.not21, !dbg !67 + br i1 %137, label %138, label %139, !dbg !67 + +138: ; preds = %114 + tail call void @__assertfail(ptr nonnull @assertMessage_2, ptr nonnull @assertFile_2, i32 62, ptr nonnull @assertFunc_2, i64 1), !dbg !67 + unreachable, !dbg !67 + +139: ; preds = %114 + %140 = bitcast i16 %54 to bfloat, !dbg !27 + %141 = fpext bfloat %140 to float, !dbg !68 + %142 = bitcast i16 %83 to bfloat, !dbg !40 + %143 = fpext bfloat %142 to float, !dbg !69 + %144 = fmul float %141, %143, !dbg !70 + %145 = fsub float 0.000000e+00, %144, !dbg !71 + %146 = select i1 %44, float %145, float 0.000000e+00, !dbg !72 + %147 = bitcast i16 %97 to bfloat, !dbg !46 + %148 = fpext bfloat %147 to float, !dbg !73 + %149 = bitcast i16 %126 to bfloat, !dbg !59 + %150 = fpext bfloat %149 to float, !dbg !74 + %151 = fmul float %148, %150, !dbg !75 + %152 = select i1 %85, float %151, float 0.000000e+00, !dbg !72 + %153 = fadd float %146, %152, !dbg !76 + %154 = bitcast i16 %52 to bfloat, !dbg !27 + %155 = fpext bfloat %154 to float, !dbg !68 + %156 = bitcast i16 %81 to bfloat, !dbg !40 + %157 = fpext bfloat %156 to float, !dbg !69 + %158 = fmul float %155, %157, !dbg !70 + %159 = fsub float 0.000000e+00, %158, !dbg !71 + %160 = select i1 %43, float %159, float 0.000000e+00, !dbg !72 + %161 = bitcast i16 %95 to bfloat, !dbg !46 + %162 = fpext bfloat %161 to float, !dbg !73 + %163 = bitcast i16 %124 to bfloat, !dbg !59 + %164 = fpext bfloat %163 to float, !dbg !74 + %165 = fmul float %162, %164, !dbg !75 + %166 = select i1 %84, float %165, float 0.000000e+00, !dbg !72 + %167 = fadd float %160, %166, !dbg !76 + %168 = bitcast i16 %35 to bfloat, !dbg !19 + %169 = fpext bfloat %168 to float, !dbg !77 + %170 = bitcast i16 %33 to bfloat, !dbg !19 + %171 = fpext bfloat %170 to float, !dbg !77 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !67 + %172 = mul i64 %130, %5, !dbg !78 + %173 = mul i64 %132, %5, !dbg !78 + %174 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed, !dbg !79 + %175 = getelementptr bfloat, ptr addrspace(1) %174, i64 %172, !dbg !79 + %176 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed26, !dbg !79 + %177 = getelementptr bfloat, ptr addrspace(1) %176, i64 %173, !dbg !79 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %179 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %175, i64 %178, i1 %20) #4, !dbg !80 + %180 = bitcast i16 %179 to bfloat, !dbg !80 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %182 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %177, i64 %181, i1 %21) #4, !dbg !80 + %183 = bitcast i16 %182 to bfloat, !dbg !80 + %184 = fpext bfloat %180 to float, !dbg !81 + %185 = fpext bfloat %183 to float, !dbg !81 + %186 = fmul float %171, %184, !dbg !82 + %187 = fmul float %169, %185, !dbg !82 + %188 = fadd float %167, %186, !dbg !83 + %189 = fadd float %153, %187, !dbg !83 + %190 = getelementptr bfloat, ptr addrspace(1) %4, i64 %22, !dbg !84 + %191 = getelementptr bfloat, ptr addrspace(1) %4, i64 %23, !dbg !84 + %192 = fptrunc float %188 to bfloat, !dbg !85 + %193 = fptrunc float %189 to bfloat, !dbg !85 + %194 = bitcast bfloat %192 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %194, ptr addrspace(1) %190, i1 %20) #4, !dbg !85 + %195 = bitcast bfloat %193 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %195, ptr addrspace(1) %191, i1 %21) #4, !dbg !85 + ret void, !dbg !86 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 22, column: 19, scope: !9) +!16 = !DILocation(line: 24, column: 21, scope: !9) +!17 = !DILocation(line: 24, column: 28, scope: !9) +!18 = !DILocation(line: 25, column: 31, scope: !9) +!19 = !DILocation(line: 25, column: 36, scope: !9) +!20 = !DILocation(line: 26, column: 31, scope: !9) +!21 = !DILocation(line: 26, column: 36, scope: !9) +!22 = !DILocation(line: 28, column: 18, scope: !9) +!23 = !DILocation(line: 29, column: 19, scope: !9) +!24 = !DILocation(line: 30, column: 35, scope: !9) +!25 = !DILocation(line: 30, column: 30, scope: !9) +!26 = !DILocation(line: 30, column: 60, scope: !9) +!27 = !DILocation(line: 30, column: 53, scope: !9) +!28 = !DILocation(line: 31, column: 35, scope: !9) +!29 = !DILocation(line: 34, column: 18, scope: !9) +!30 = !DILocation(line: 35, column: 32, scope: !9) +!31 = !DILocation(line: 36, column: 28, scope: !9) +!32 = !DILocation(line: 36, column: 98, scope: !9) +!33 = !DILocation(line: 36, column: 64, scope: !9) +!34 = !DILocation(line: 36, column: 108, scope: !9) +!35 = !DILocation(line: 36, column: 106, scope: !9) +!36 = !DILocation(line: 36, column: 123, scope: !9) +!37 = !DILocation(line: 37, column: 36, scope: !9) +!38 = !DILocation(line: 37, column: 58, scope: !9) +!39 = !DILocation(line: 37, column: 31, scope: !9) +!40 = !DILocation(line: 37, column: 65, scope: !9) +!41 = !DILocation(line: 44, column: 19, scope: !9) +!42 = !DILocation(line: 45, column: 37, scope: !9) +!43 = !DILocation(line: 45, column: 42, scope: !9) +!44 = !DILocation(line: 45, column: 31, scope: !9) +!45 = !DILocation(line: 45, column: 68, scope: !9) +!46 = !DILocation(line: 45, column: 60, scope: !9) +!47 = !DILocation(line: 46, column: 36, scope: !9) +!48 = !DILocation(line: 49, column: 20, scope: !9) +!49 = !DILocation(line: 50, column: 35, scope: !9) +!50 = !DILocation(line: 51, column: 28, scope: !9) +!51 = !DILocation(line: 51, column: 100, scope: !9) +!52 = !DILocation(line: 51, column: 65, scope: !9) +!53 = !DILocation(line: 51, column: 110, scope: !9) +!54 = !DILocation(line: 51, column: 108, scope: !9) +!55 = !DILocation(line: 51, column: 126, scope: !9) +!56 = !DILocation(line: 52, column: 37, scope: !9) +!57 = !DILocation(line: 52, column: 64, scope: !9) +!58 = !DILocation(line: 52, column: 31, scope: !9) +!59 = !DILocation(line: 52, column: 72, scope: !9) +!60 = !DILocation(line: 60, column: 20, scope: !9) +!61 = !DILocation(line: 61, column: 35, scope: !9) +!62 = !DILocation(line: 62, column: 28, scope: !9) +!63 = !DILocation(line: 62, column: 46, scope: !9) +!64 = !DILocation(line: 62, column: 38, scope: !9) +!65 = !DILocation(line: 62, column: 56, scope: !9) +!66 = !DILocation(line: 62, column: 54, scope: !9) +!67 = !DILocation(line: 62, column: 64, scope: !9) +!68 = !DILocation(line: 30, column: 111, scope: !9) +!69 = !DILocation(line: 37, column: 123, scope: !9) +!70 = !DILocation(line: 38, column: 19, scope: !9) +!71 = !DILocation(line: 39, column: 13, scope: !9) +!72 = !DILocation(line: 0, scope: !9) +!73 = !DILocation(line: 45, column: 119, scope: !9) +!74 = !DILocation(line: 52, column: 131, scope: !9) +!75 = !DILocation(line: 53, column: 20, scope: !9) +!76 = !DILocation(line: 57, column: 20, scope: !9) +!77 = !DILocation(line: 25, column: 76, scope: !9) +!78 = !DILocation(line: 63, column: 40, scope: !9) +!79 = !DILocation(line: 63, column: 31, scope: !9) +!80 = !DILocation(line: 63, column: 48, scope: !9) +!81 = !DILocation(line: 63, column: 88, scope: !9) +!82 = !DILocation(line: 64, column: 20, scope: !9) +!83 = !DILocation(line: 65, column: 20, scope: !9) +!84 = !DILocation(line: 66, column: 25, scope: !9) +!85 = !DILocation(line: 66, column: 37, scope: !9) +!86 = !DILocation(line: 66, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3b011d8d23258a96442c8ceae566d26c10f3cf5e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx @@ -0,0 +1,759 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 119, 55, 103, 54, 121, 106, 102, 108, 112, 99, 116, 99, 113, 107, 122, 102, 53, 111, 115, 113, 55, 109, 53, 97, 99, 114, 99, 116, 97, 121, 115, 97, 54, 116, 104, 51, 111, 120, 51, 100, 101, 105, 110, 120, 108, 117, 121, 112, 99, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_11 +) +.reqntid 256 +{ + .reg .pred %p<71>; + .reg .b16 %rs<33>; + .reg .b32 %r<57>; + .reg .b64 %rd<189>; + .loc 1 18 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:18:0 + +// %bb.0: + ld.param.b64 %rd40, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 22 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:22:19 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd41, %rd1, %rd40; + and.b64 %rd42, %rd41, -4294967296; + setp.ne.b64 %p5, %rd42, 0; + cvt.u32.u64 %r56, %rd1; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd185, %rd1, %rd40; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd40; + div.u32 %r11, %r56, %r9; + cvt.u64.u32 %rd185, %r11; +$L__BB0_3: + .loc 1 0 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:21 + ld.param.b64 %rd37, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6]; + .loc 1 24 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:21 + or.b64 %rd44, %rd2, %rd40; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p6, %rd45, 0; + cvt.u32.u64 %r55, %rd2; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd186, %rd2, %rd40; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd40; + div.u32 %r14, %r55, %r12; + cvt.u64.u32 %rd186, %r14; +$L__BB0_6: + .loc 1 22 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:22:19 + mul.lo.s64 %rd43, %rd185, %rd40; + mul.lo.s64 %rd11, %rd186, %rd40; + .loc 1 24 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:28 + or.b64 %rd46, %rd185, %rd37; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p7, %rd47, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd187, %rd185, %rd37; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd37; + cvt.u32.u64 %r16, %rd185; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd187, %r17; +$L__BB0_9: + .loc 1 0 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:28 + ld.param.b32 %r1, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd38, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd32, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd31, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0]; + sub.s64 %rd7, %rd1, %rd43; + sub.s64 %rd12, %rd2, %rd11; + .loc 1 24 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:24:28 + or.b64 %rd48, %rd186, %rd37; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p8, %rd49, 0; + @%p8 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd188, %rd186, %rd37; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd37; + cvt.u32.u64 %r19, %rd186; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd188, %r20; +$L__BB0_12: + .loc 1 21 21 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:21:21 + setp.lt.s32 %p12, %r55, %r1; + setp.lt.s32 %p11, %r56, %r1; + .loc 1 25 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:31 + shl.b64 %rd78, %rd1, 1; + add.s64 %rd51, %rd31, %rd78; + add.s64 %rd54, %rd51, 2; + .loc 1 25 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:36 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 26 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:26:31 + shl.b64 %rd79, %rd187, 3; + add.s64 %rd72, %rd32, %rd79; + shl.b64 %rd80, %rd188, 3; + add.s64 %rd76, %rd32, %rd80; + .loc 1 26 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:26:36 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd72 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd76 + 0 ], %rd60; + // end inline asm + .loc 1 28 18 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:28:18 + shr.u64 %rd81, %rd40, 63; + add.s64 %rd82, %rd40, %rd81; + shr.s64 %rd23, %rd82, 1; + .loc 1 29 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:29:19 + setp.ge.s64 %p17, %rd7, %rd23; + setp.ge.s64 %p18, %rd12, %rd23; + .loc 1 30 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:35 + sub.s64 %rd83, %rd1, %rd23; + .loc 1 30 30 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:30 + shl.b64 %rd84, %rd83, 1; + add.s64 %rd65, %rd31, %rd84; + add.s64 %rd68, %rd65, 2; + .loc 1 30 60 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:60 + and.pred %p15, %p11, %p17; + and.pred %p16, %p12, %p18; + .loc 1 30 53 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:53 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + mov.b16 %rs16, 0; + // begin inline asm + mov.u16 %rs13, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd68 + 0 ], %rd67; + // end inline asm + .loc 1 31 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:31:35 + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd71 }, [ %rd72 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd75 }, [ %rd76 + 0 ], %rd74; + // end inline asm + .loc 1 35 32 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:35:32 + shr.s64 %rd85, %rd71, 63; + and.b64 %rd86, %rd85, %rd38; + add.s64 %rd24, %rd86, %rd71; + shr.s64 %rd87, %rd75, 63; + and.b64 %rd88, %rd87, %rd38; + add.s64 %rd25, %rd88, %rd75; + .loc 1 36 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:28 + setp.lt.s64 %p19, %rd24, 0; + setp.lt.s64 %p20, %rd25, 0; + .loc 1 36 98 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:98 + setp.ge.s64 %p21, %rd24, %rd38; + setp.ge.s64 %p22, %rd25, %rd38; + .loc 1 36 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:64 + or.pred %p23, %p19, %p21; + .loc 1 36 108 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:108 + or.pred %p24, %p20, %p22; + .loc 1 36 106 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:106 + and.pred %p25, %p15, %p23; + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + and.pred %p26, %p16, %p24; + or.pred %p27, %p25, %p26; + not.pred %p28, %p27; + @%p28 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 0 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:123 + ld.param.b64 %rd33, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2]; + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + bar.sync 0; + .loc 1 37 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:36 + sub.s64 %rd109, %rd7, %rd23; + .loc 1 37 58 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:58 + mul.lo.s64 %rd110, %rd24, %rd40; + mul.lo.s64 %rd111, %rd25, %rd40; + .loc 1 37 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:31 + shl.b64 %rd112, %rd109, 1; + add.s64 %rd113, %rd33, %rd112; + shl.b64 %rd114, %rd110, 1; + add.s64 %rd90, %rd113, %rd114; + sub.s64 %rd26, %rd1, %rd11; + sub.s64 %rd115, %rd26, %rd23; + shl.b64 %rd116, %rd115, 1; + add.s64 %rd117, %rd33, %rd116; + shl.b64 %rd118, %rd111, 1; + add.s64 %rd119, %rd117, %rd118; + add.s64 %rd93, %rd119, 2; + .loc 1 37 65 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:65 + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd90 + 0 ], %rd89; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd93 + 0 ], %rd92; + // end inline asm + .loc 1 44 19 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:44:19 + setp.lt.s64 %p37, %rd7, %rd23; + setp.lt.s64 %p38, %rd12, %rd23; + .loc 1 45 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:37 + add.s64 %rd120, %rd40, %rd1; + .loc 1 45 42 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:42 + sub.s64 %rd121, %rd120, %rd23; + .loc 1 45 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:31 + shl.b64 %rd122, %rd121, 1; + add.s64 %rd96, %rd31, %rd122; + add.s64 %rd99, %rd96, 2; + .loc 1 45 68 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:68 + and.pred %p33, %p11, %p37; + and.pred %p34, %p12, %p38; + .loc 1 45 60 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:60 + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs16; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs16; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd99 + 0 ], %rd98; + // end inline asm + .loc 1 46 36 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:46:36 + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd102 }, [ %rd72 + 0 ], %rd101; + // end inline asm + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd106, 0x0; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd106 }, [ %rd76 + 0 ], %rd105; + // end inline asm + .loc 1 50 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:50:35 + shr.s64 %rd123, %rd102, 63; + and.b64 %rd124, %rd123, %rd38; + add.s64 %rd27, %rd124, %rd102; + shr.s64 %rd125, %rd106, 63; + and.b64 %rd126, %rd125, %rd38; + add.s64 %rd28, %rd126, %rd106; + .loc 1 51 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:28 + setp.lt.s64 %p39, %rd27, 0; + setp.lt.s64 %p40, %rd28, 0; + .loc 1 51 100 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:100 + setp.ge.s64 %p41, %rd27, %rd38; + setp.ge.s64 %p42, %rd28, %rd38; + .loc 1 51 65 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:65 + or.pred %p43, %p39, %p41; + .loc 1 51 110 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:110 + or.pred %p44, %p40, %p42; + .loc 1 51 108 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:108 + and.pred %p45, %p33, %p43; + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + and.pred %p46, %p34, %p44; + or.pred %p47, %p45, %p46; + not.pred %p48, %p47; + @%p48 bra $L__BB0_16; + bra.uni $L__BB0_15; +$L__BB0_16: + .loc 1 0 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:126 + ld.param.b64 %rd39, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8]; + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + bar.sync 0; + .loc 1 52 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:37 + sub.s64 %rd133, %rd40, %rd23; + .loc 1 52 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:64 + mul.lo.s64 %rd134, %rd27, %rd40; + mul.lo.s64 %rd135, %rd28, %rd40; + .loc 1 52 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:31 + shl.b64 %rd136, %rd133, 1; + add.s64 %rd137, %rd33, %rd136; + shl.b64 %rd138, %rd7, 1; + add.s64 %rd139, %rd137, %rd138; + shl.b64 %rd140, %rd134, 1; + add.s64 %rd128, %rd139, %rd140; + shl.b64 %rd141, %rd26, 1; + add.s64 %rd142, %rd137, %rd141; + shl.b64 %rd143, %rd135, 1; + add.s64 %rd144, %rd142, %rd143; + add.s64 %rd131, %rd144, 2; + .loc 1 52 72 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:72 + // begin inline asm + mov.u64 %rd127, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd127, 1.0; + // end inline asm + mov.b16 %rs26, 0; + // begin inline asm + mov.u16 %rs25, %rs26; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd128 + 0 ], %rd127; + // end inline asm + // begin inline asm + mov.u64 %rd130, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs26; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd131 + 0 ], %rd130; + // end inline asm + .loc 1 61 35 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:61:35 + shr.s64 %rd145, %rd57, 63; + and.b64 %rd146, %rd145, %rd39; + add.s64 %rd29, %rd146, %rd57; + shr.s64 %rd147, %rd61, 63; + and.b64 %rd148, %rd147, %rd39; + add.s64 %rd30, %rd148, %rd61; + .loc 1 62 28 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:28 + setp.lt.s64 %p53, %rd29, 0; + setp.lt.s64 %p54, %rd30, 0; + .loc 1 62 46 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:46 + setp.ge.s64 %p55, %rd29, %rd39; + setp.ge.s64 %p56, %rd30, %rd39; + .loc 1 62 38 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:38 + or.pred %p57, %p53, %p55; + .loc 1 62 56 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:56 + or.pred %p58, %p54, %p56; + .loc 1 62 54 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:54 + and.pred %p59, %p11, %p57; + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + and.pred %p60, %p12, %p58; + or.pred %p61, %p59, %p60; + not.pred %p62, %p61; + @%p62 bra $L__BB0_18; + bra.uni $L__BB0_17; +$L__BB0_18: + .loc 1 0 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0:64 + ld.param.b64 %rd35, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd34, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3]; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r29, %rs15; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r30, %rs19; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r31, %r29; + fma.rn.f32 %r32, %r31, %r30, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r33, %r32, 0f00000000, %p18; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r34, %rs23; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r35, %rs27; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r36, %r34, %r35; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r37, %r36, 0f00000000, %p38; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r38, %r33, %r37; + .loc 1 30 111 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:30:111 + cvt.f32.bf16 %r39, %rs13; + .loc 1 37 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:37:123 + cvt.f32.bf16 %r40, %rs17; + .loc 1 39 13 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:39:13 + neg.f32 %r41, %r39; + fma.rn.f32 %r42, %r41, %r40, 0f00000000; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r43, %r42, 0f00000000, %p17; + .loc 1 45 119 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:45:119 + cvt.f32.bf16 %r44, %rs21; + .loc 1 52 131 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:52:131 + cvt.f32.bf16 %r45, %rs25; + .loc 1 53 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:53:20 + mul.f32 %r46, %r44, %r45; + .loc 1 0 0 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:0 + selp.f32 %r47, %r46, 0f00000000, %p37; + .loc 1 57 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:57:20 + add.f32 %r48, %r43, %r47; + .loc 1 25 76 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:25:76 + cvt.f32.bf16 %r49, %rs12; + cvt.f32.bf16 %r50, %rs11; + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + bar.sync 0; + .loc 1 63 40 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:40 + mul.lo.s64 %rd157, %rd29, %rd40; + mul.lo.s64 %rd158, %rd30, %rd40; + .loc 1 63 31 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:31 + add.s64 %rd160, %rd34, %rd138; + shl.b64 %rd161, %rd157, 1; + add.s64 %rd150, %rd160, %rd161; + add.s64 %rd163, %rd34, %rd141; + shl.b64 %rd164, %rd158, 1; + add.s64 %rd165, %rd163, %rd164; + add.s64 %rd153, %rd165, 2; + .loc 1 63 48 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:48 + // begin inline asm + mov.u64 %rd151, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd151, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd150 + 0 ], %rd151; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd153 + 0 ], %rd154; + // end inline asm + .loc 1 63 88 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:63:88 + cvt.f32.bf16 %r51, %rs29; + cvt.f32.bf16 %r52, %rs30; + .loc 1 65 20 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:65:20 + fma.rn.f32 %r53, %r50, %r51, %r48; + fma.rn.f32 %r54, %r49, %r52, %r38; + .loc 1 66 25 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:25 + add.s64 %rd155, %rd35, %rd78; + add.s64 %rd156, %rd155, 2; + .loc 1 66 37 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:37 + cvt.rn.bf16.f32 %rs31, %r53; + cvt.rn.bf16.f32 %rs32, %r54; + // begin inline asm + @%p11 st.global.b16 [ %rd155 + 0 ], { %rs31 }; + // end inline asm + // begin inline asm + @%p12 st.global.b16 [ %rd156 + 0 ], { %rs32 }; + // end inline asm + .loc 1 66 4 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:66:4 + ret; +$L__BB0_13: + .loc 1 36 123 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:36:123 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd179, assertFunc_0; + cvta.global.u64 %rd180, %rd179; + st.param.b64 [param3], %rd180; + mov.b64 %rd181, assertFile_0; + cvta.global.u64 %rd182, %rd181; + st.param.b64 [param1], %rd182; + mov.b64 %rd183, assertMessage_0; + cvta.global.u64 %rd184, %rd183; + st.param.b64 [param0], %rd184; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__BB0_15: + .loc 1 51 126 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd173, assertFunc_1; + cvta.global.u64 %rd174, %rd173; + st.param.b64 [param3], %rd174; + mov.b64 %rd175, assertFile_1; + cvta.global.u64 %rd176, %rd175; + st.param.b64 [param1], %rd176; + mov.b64 %rd177, assertMessage_1; + cvta.global.u64 %rd178, %rd177; + st.param.b64 [param0], %rd178; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_17: + .loc 1 62 64 // cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py:62:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd167, assertFunc_2; + cvta.global.u64 %rd168, %rd167; + st.param.b64 [param3], %rd168; + mov.b64 %rd169, assertFile_2; + cvta.global.u64 %rd170, %rd169; + st.param.b64 [param1], %rd170; + mov.b64 %rd171, assertMessage_2; + cvta.global.u64 %rd172, %rd171; + st.param.b64 [param0], %rd172; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 122 +.b8 119 +.b8 55 +.b8 103 +.b8 54 +.b8 121 +.b8 106 +.b8 102 +.b8 108 +.b8 112 +.b8 99 +.b8 116 +.b8 99 +.b8 113 +.b8 107 +.b8 122 +.b8 102 +.b8 53 +.b8 111 +.b8 115 +.b8 113 +.b8 55 +.b8 109 +.b8 53 +.b8 97 +.b8 99 +.b8 114 +.b8 99 +.b8 116 +.b8 97 +.b8 121 +.b8 115 +.b8 97 +.b8 54 +.b8 116 +.b8 104 +.b8 51 +.b8 111 +.b8 120 +.b8 51 +.b8 100 +.b8 101 +.b8 105 +.b8 110 +.b8 120 +.b8 108 +.b8 117 +.b8 121 +.b8 112 +.b8 99 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 122 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..1fc3ed13bda82f5a5b4116558052d8c4690fbcdd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source @@ -0,0 +1,419 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":18:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("in_ptr3"(#loc)) +#loc113 = loc("out_ptr0"(#loc)) +#loc114 = loc("ks0"(#loc)) +#loc115 = loc("ks1"(#loc)) +#loc116 = loc("ks2"(#loc)) +#loc117 = loc("ks3"(#loc)) +#loc118 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc119) + %xoffset_0 = arith.constant 512 : i32 loc(#loc120) + %xoffset_1 = arith.constant 512 : i32 loc(#loc120) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc121) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc122) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc123) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc123) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc124) + %x0_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc124) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc124) + %x1 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc125) + %x1_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc125) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<512xi64> loc(#loc125) + %x1_10 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc126) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<512xi64> loc(#loc126) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc127) + %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc127) + %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc128) + %tmp31_14 = arith.extf %tmp31_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc129) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp1 = arith.constant 2 : i32 loc(#loc132) + %tmp1_17 = arith.constant 2 : i64 loc(#loc132) + %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132) + %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc133) + %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<512xi64> loc(#loc133) + %tmp3 = arith.constant 2 : i32 loc(#loc134) + %tmp3_20 = arith.constant 2 : i64 loc(#loc134) + %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134) + %tmp3_22 = arith.constant -1 : i32 loc(#loc135) + %tmp3_23 = arith.constant -1 : i64 loc(#loc135) + %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135) + %tmp3_25 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc136) + %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<512xi64> loc(#loc136) + %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<512xi64> loc(#loc136) + %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp3_29 = tt.addptr %tmp3_28, %tmp3_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp3_33 = arith.truncf %tmp3_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp3_35 = arith.extf %tmp3_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc141) + %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc141) + %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc142) + %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc143) + %tmp4_40 = arith.fptosi %tmp4_39 : tensor<512xf32> to tensor<512xi64> loc(#loc143) + %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc143) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc144) + %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp7 = arith.constant 0 : i32 loc(#loc146) + %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146) + %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<512xi64> loc(#loc146) + %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<512xi64> loc(#loc146) + %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc30) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc30) + %2 = arith.cmpi sle, %1, %tmp8 : tensor<512xi64> loc(#loc30) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc31) + %4 = arith.cmpi slt, %tmp8, %3 : tensor<512xi64> loc(#loc31) + %5 = arith.andi %2, %4 : tensor<512xi1> loc(#loc32) + %6 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc33) + %true = arith.constant true loc(#loc34) + %cst = arith.constant dense : tensor<512xi1> loc(#loc34) + %7 = arith.xori %6, %cst : tensor<512xi1> loc(#loc34) + %8 = arith.ori %5, %7 : tensor<512xi1> loc(#loc35) + tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc36) + %tmp10 = arith.constant 2 : i32 loc(#loc148) + %tmp10_45 = arith.constant 2 : i64 loc(#loc148) + %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148) + %tmp10_47 = arith.constant -1 : i32 loc(#loc149) + %tmp10_48 = arith.constant -1 : i64 loc(#loc149) + %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149) + %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<512xi64> loc(#loc150) + %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<512xi64> loc(#loc150) + %tmp10_52 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc151) + %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<512xi64> loc(#loc151) + %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<512xi64> loc(#loc152) + %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc153) + %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc153) + %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc154) + %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc155) + %tmp10_60 = arith.truncf %tmp10_59 : tensor<512xf32> to tensor<512xbf16> loc(#loc155) + %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc155) + %tmp10_62 = arith.extf %tmp10_61 : tensor<512xbf16> to tensor<512xf32> loc(#loc156) + %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<512xf32> loc(#loc157) + %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158) + %tmp12_64 = arith.subf %tmp12_63, %tmp11 : tensor<512xf32> loc(#loc158) + %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159) + %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc159) + %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<512xi1>, tensor<512xf32> loc(#loc160) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp16 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc162) + %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<512xi1>, tensor<512xf32> loc(#loc162) + %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc163) + %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<512xi64> loc(#loc163) + %tmp18 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc164) + %tmp18_68 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc164) + %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<512xi64> loc(#loc164) + %tmp18_70 = arith.constant 2 : i32 loc(#loc165) + %tmp18_71 = arith.constant 2 : i64 loc(#loc165) + %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165) + %tmp18_73 = arith.constant -1 : i32 loc(#loc166) + %tmp18_74 = arith.constant -1 : i64 loc(#loc166) + %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166) + %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<512xi64> loc(#loc167) + %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<512xi64> loc(#loc167) + %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc168) + %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc168) + %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc169) + %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170) + %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc170) + %tmp18_83 = arith.truncf %tmp18_82 : tensor<512xf32> to tensor<512xbf16> loc(#loc170) + %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc170) + %tmp18_85 = arith.extf %tmp18_84 : tensor<512xbf16> to tensor<512xf32> loc(#loc171) + %tmp19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc172) + %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc172) + %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc173) + %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc174) + %tmp19_90 = arith.fptosi %tmp19_89 : tensor<512xf32> to tensor<512xi64> loc(#loc174) + %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc174) + %tmp20 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc175) + %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<512xi64> loc(#loc176) + %tmp22 = arith.constant 0 : i32 loc(#loc177) + %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177) + %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<512xi64> loc(#loc177) + %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<512xi64> loc(#loc177) + %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<512xi1>, tensor<512xi64> loc(#loc178) + %c0_i32_95 = arith.constant 0 : i32 loc(#loc68) + %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68) + %10 = tt.splat %9 : i64 -> tensor<512xi64> loc(#loc68) + %11 = arith.cmpi sle, %10, %tmp23 : tensor<512xi64> loc(#loc68) + %12 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc69) + %13 = arith.cmpi slt, %tmp23, %12 : tensor<512xi64> loc(#loc69) + %14 = arith.andi %11, %13 : tensor<512xi1> loc(#loc70) + %15 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc71) + %true_96 = arith.constant true loc(#loc72) + %cst_97 = arith.constant dense : tensor<512xi1> loc(#loc72) + %16 = arith.xori %15, %cst_97 : tensor<512xi1> loc(#loc72) + %17 = arith.ori %14, %16 : tensor<512xi1> loc(#loc73) + tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc74) + %tmp25 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc179) + %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<512xi64> loc(#loc179) + %tmp25_99 = arith.constant 2 : i32 loc(#loc180) + %tmp25_100 = arith.constant 2 : i64 loc(#loc180) + %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180) + %tmp25_102 = arith.constant -1 : i32 loc(#loc181) + %tmp25_103 = arith.constant -1 : i64 loc(#loc181) + %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181) + %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<512xi64> loc(#loc182) + %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<512xi64> loc(#loc182) + %tmp25_107 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc183) + %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<512xi64> loc(#loc183) + %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<512xi64> loc(#loc184) + %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc185) + %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc185) + %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc186) + %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187) + %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc187) + %tmp25_115 = arith.truncf %tmp25_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc187) + %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc187) + %tmp25_117 = arith.extf %tmp25_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc188) + %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<512xf32> loc(#loc189) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc190) + %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<512xi1>, tensor<512xf32> loc(#loc191) + %tmp29 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc192) + %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<512xi1>, tensor<512xf32> loc(#loc192) + %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<512xf32> loc(#loc193) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc194) + %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<512xi64> loc(#loc194) + %tmp35 = arith.constant 0 : i32 loc(#loc195) + %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195) + %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<512xi64> loc(#loc195) + %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<512xi64> loc(#loc195) + %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<512xi1>, tensor<512xi64> loc(#loc196) + %c0_i32_124 = arith.constant 0 : i32 loc(#loc93) + %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93) + %19 = tt.splat %18 : i64 -> tensor<512xi64> loc(#loc93) + %20 = arith.cmpi sle, %19, %tmp36 : tensor<512xi64> loc(#loc93) + %21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc94) + %22 = arith.cmpi slt, %tmp36, %21 : tensor<512xi64> loc(#loc94) + %23 = arith.andi %20, %22 : tensor<512xi1> loc(#loc95) + %true_125 = arith.constant true loc(#loc96) + %cst_126 = arith.constant dense : tensor<512xi1> loc(#loc96) + %24 = arith.xori %xmask_5, %cst_126 : tensor<512xi1> loc(#loc96) + %25 = arith.ori %23, %24 : tensor<512xi1> loc(#loc97) + tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc98) + %tmp38 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc197) + %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<512xi64> loc(#loc197) + %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<512xi64> loc(#loc198) + %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc199) + %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc199) + %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc200) + %tmp38_132 = arith.extf %tmp38_131 : tensor<512xbf16> to tensor<512xf32> loc(#loc201) + %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<512xf32> loc(#loc202) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc203) + %26 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc106) + %27 = tt.addptr %26, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc106) + %28 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc107) + tt.store %27, %28, %xmask_5 : tensor<512x!tt.ptr> loc(#loc107) + tt.return loc(#loc108) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:76) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:36) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":28:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":29:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:42) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":32:32) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":33:18) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":34:18) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":35:32) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:98) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:115) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:108) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:123) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:49) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:54) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:72) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:123) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":38:19) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":39:13) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":40:38) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":41:34) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":42:12) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":43:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":44:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:37) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:55) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:31) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:68) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:60) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:119) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:31) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:44) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:36) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":47:33) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":48:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":49:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":50:35) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:28) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:100) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:65) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:118) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:110) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:108) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:126) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:55) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:48) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:42) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:64) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:60) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:31) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:80) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:72) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:131) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":53:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":54:38) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":55:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":56:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":57:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":59:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":60:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":61:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:28) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:46) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:56) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:54) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:64) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:36) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:31) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:48) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:88) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":64:20) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":65:20) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:4) +#loc119 = loc("xoffset"(#loc1)) +#loc120 = loc("xoffset"(#loc2)) +#loc121 = loc("xindex"(#loc3)) +#loc122 = loc("xindex"(#loc4)) +#loc123 = loc("xmask"(#loc5)) +#loc124 = loc("x0"(#loc6)) +#loc125 = loc("x1"(#loc7)) +#loc126 = loc("x1"(#loc8)) +#loc127 = loc("tmp31"(#loc9)) +#loc128 = loc("tmp31"(#loc10)) +#loc129 = loc("tmp31"(#loc11)) +#loc130 = loc("tmp32"(#loc12)) +#loc131 = loc("tmp32"(#loc13)) +#loc132 = loc("tmp1"(#loc14)) +#loc133 = loc("tmp2"(#loc15)) +#loc134 = loc("tmp3"(#loc16)) +#loc135 = loc("tmp3"(#loc17)) +#loc136 = loc("tmp3"(#loc18)) +#loc137 = loc("tmp3"(#loc19)) +#loc138 = loc("tmp3"(#loc20)) +#loc139 = loc("tmp3"(#loc21)) +#loc140 = loc("tmp3"(#loc22)) +#loc141 = loc("tmp4"(#loc23)) +#loc142 = loc("tmp4"(#loc24)) +#loc143 = loc("tmp4"(#loc25)) +#loc144 = loc("tmp5"(#loc26)) +#loc145 = loc("tmp6"(#loc27)) +#loc146 = loc("tmp7"(#loc28)) +#loc147 = loc("tmp8"(#loc29)) +#loc148 = loc("tmp10"(#loc37)) +#loc149 = loc("tmp10"(#loc38)) +#loc150 = loc("tmp10"(#loc39)) +#loc151 = loc("tmp10"(#loc40)) +#loc152 = loc("tmp10"(#loc41)) +#loc153 = loc("tmp10"(#loc42)) +#loc154 = loc("tmp10"(#loc43)) +#loc155 = loc("tmp10"(#loc44)) +#loc156 = loc("tmp10"(#loc45)) +#loc157 = loc("tmp11"(#loc46)) +#loc158 = loc("tmp12"(#loc47)) +#loc159 = loc("tmp13"(#loc48)) +#loc160 = loc("tmp14"(#loc49)) +#loc161 = loc("tmp15"(#loc50)) +#loc162 = loc("tmp16"(#loc51)) +#loc163 = loc("tmp17"(#loc52)) +#loc164 = loc("tmp18"(#loc53)) +#loc165 = loc("tmp18"(#loc54)) +#loc166 = loc("tmp18"(#loc55)) +#loc167 = loc("tmp18"(#loc56)) +#loc168 = loc("tmp18"(#loc57)) +#loc169 = loc("tmp18"(#loc58)) +#loc170 = loc("tmp18"(#loc59)) +#loc171 = loc("tmp18"(#loc60)) +#loc172 = loc("tmp19"(#loc61)) +#loc173 = loc("tmp19"(#loc62)) +#loc174 = loc("tmp19"(#loc63)) +#loc175 = loc("tmp20"(#loc64)) +#loc176 = loc("tmp21"(#loc65)) +#loc177 = loc("tmp22"(#loc66)) +#loc178 = loc("tmp23"(#loc67)) +#loc179 = loc("tmp25"(#loc75)) +#loc180 = loc("tmp25"(#loc76)) +#loc181 = loc("tmp25"(#loc77)) +#loc182 = loc("tmp25"(#loc78)) +#loc183 = loc("tmp25"(#loc79)) +#loc184 = loc("tmp25"(#loc80)) +#loc185 = loc("tmp25"(#loc81)) +#loc186 = loc("tmp25"(#loc82)) +#loc187 = loc("tmp25"(#loc83)) +#loc188 = loc("tmp25"(#loc84)) +#loc189 = loc("tmp26"(#loc85)) +#loc190 = loc("tmp27"(#loc86)) +#loc191 = loc("tmp28"(#loc87)) +#loc192 = loc("tmp29"(#loc88)) +#loc193 = loc("tmp30"(#loc89)) +#loc194 = loc("tmp34"(#loc90)) +#loc195 = loc("tmp35"(#loc91)) +#loc196 = loc("tmp36"(#loc92)) +#loc197 = loc("tmp38"(#loc99)) +#loc198 = loc("tmp38"(#loc100)) +#loc199 = loc("tmp38"(#loc101)) +#loc200 = loc("tmp38"(#loc102)) +#loc201 = loc("tmp38"(#loc103)) +#loc202 = loc("tmp39"(#loc104)) +#loc203 = loc("tmp40"(#loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fd7335255e4c1c24af78edf80e42792a9ac7ca8e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<512xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<512xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<512xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<512xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<512xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<512xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f1dfef3ba819d3459ed99028d6501da35078df8d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<512xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<512xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1>, tensor<512xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<512xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<512xi1> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<512xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<512xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<512xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1>, tensor<512xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<512xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<512xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpzw7g6yjflpctcqkzf5osq7m5acrctaysa6th3ox3deinxluypc.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9eedf0d865923218f06b8acc370d8b39028b60b1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..200ab1ea58eb7416b11bb11bb97c73d7a7488966 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4995b3ee05ac1e475b8a4ae8eb6ffc9ffc1abd9b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "c1c151c2ec256bf97239dc73a11ec7285ba76d002f69baae4ffe81038065d86d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..1a65a64a39fdd7d44d56cc259faea553f90f0070 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.llir @@ -0,0 +1,435 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %9 = shl nuw nsw i32 %8, 2, !dbg !8 + %10 = and i32 %9, 2044, !dbg !8 + %11 = lshr i32 %7, 11, !dbg !9 + %12 = mul i32 %7, 32000 + %13 = mul i32 %11, 224000 + %14 = add i32 %13, %12 + %15 = zext nneg i32 %10 to i64, !dbg !10 + br label %16, !dbg !10 + +16: ; preds = %6, %16 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %16 ] + %17 = phi i32 [ 2147483647, %6 ], [ %103, %16 ] + %18 = phi i32 [ 2147483647, %6 ], [ %104, %16 ] + %19 = phi float [ 0xFFF0000000000000, %6 ], [ %100, %16 ] + %20 = phi float [ 0xFFF0000000000000, %6 ], [ %101, %16 ] + %21 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %99, %16 ] + %22 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %102, %16 ] + %23 = or disjoint i64 %indvars.iv, %15, !dbg !11 + %24 = or disjoint i64 %23, 2, !dbg !11 + %25 = or disjoint i64 %23, 3, !dbg !11 + %26 = icmp samesign ult i64 %23, 32000, !dbg !12 + %27 = trunc nuw nsw i64 %23 to i32, !dbg !13 + %28 = add i32 %14, %27, !dbg !13 + %29 = sext i32 %28 to i64, !dbg !14 + %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !14 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 %26) #4, !dbg !15 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !15 + %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !15 + %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !15 + %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !15 + %37 = bitcast i32 %35 to float, !dbg !15 + %38 = bitcast i32 %36 to float, !dbg !15 + %39 = fcmp ogt float %19, %37, !dbg !16 + %40 = fcmp ogt float %20, %38, !dbg !16 + %41 = fcmp oeq float %19, %37, !dbg !20 + %42 = fcmp oeq float %20, %38, !dbg !20 + %43 = fcmp uno <2 x float> %21, zeroinitializer, !dbg !21 + %44 = fcmp uno float %19, 0.000000e+00, !dbg !21 + %45 = fcmp uno float %20, 0.000000e+00, !dbg !21 + %46 = fcmp uno float %37, 0.000000e+00, !dbg !22 + %47 = fcmp uno float %38, 0.000000e+00, !dbg !22 + %48 = xor i1 %46, true, !dbg !23 + %49 = xor i1 %47, true, !dbg !23 + %50 = and i1 %44, %48, !dbg !24 + %51 = and i1 %45, %49, !dbg !24 + %52 = or i1 %39, %50, !dbg !25 + %53 = or i1 %40, %51, !dbg !25 + %54 = and i1 %44, %46, !dbg !26 + %55 = and i1 %45, %47, !dbg !26 + %56 = or i1 %41, %54, !dbg !27 + %57 = or i1 %42, %55, !dbg !27 + %58 = sext <2 x i32> %22 to <2 x i64>, !dbg !28 + %59 = sext i32 %17 to i64, !dbg !28 + %60 = icmp sgt i64 %24, %59, !dbg !28 + %61 = sext i32 %18 to i64, !dbg !28 + %62 = icmp sgt i64 %25, %61, !dbg !28 + %63 = and i1 %60, %56, !dbg !29 + %64 = and i1 %62, %57, !dbg !29 + %65 = or i1 %52, %63, !dbg !30 + %66 = or i1 %53, %64, !dbg !30 + %67 = select i1 %65, float %19, float %37, !dbg !31 + %68 = select i1 %66, float %20, float %38, !dbg !31 + %69 = trunc nuw nsw i64 %23 to i32, !dbg !32 + %70 = or disjoint i32 %69, 1, !dbg !32 + %71 = insertelement <2 x i32> poison, i32 %33, i64 0, !dbg !15 + %72 = insertelement <2 x i32> %71, i32 %34, i64 1, !dbg !15 + %73 = bitcast <2 x i32> %72 to <2 x float>, !dbg !15 + %74 = fcmp ogt <2 x float> %21, %73, !dbg !16 + %75 = fcmp oeq <2 x float> %21, %73, !dbg !20 + %76 = fcmp uno <2 x float> %73, zeroinitializer, !dbg !22 + %77 = xor <2 x i1> %76, splat (i1 true), !dbg !23 + %78 = and <2 x i1> %43, %77, !dbg !24 + %79 = or <2 x i1> %74, %78, !dbg !25 + %80 = and <2 x i1> %43, %76, !dbg !26 + %81 = or <2 x i1> %75, %80, !dbg !27 + %82 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !28 + %83 = shufflevector <2 x i64> %82, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !28 + %84 = icmp sgt <2 x i64> %83, %58, !dbg !28 + %85 = icmp sge <2 x i64> %83, %58, !dbg !28 + %86 = shufflevector <2 x i1> %84, <2 x i1> %85, <2 x i32> , !dbg !28 + %87 = and <2 x i1> %86, %81, !dbg !29 + %88 = or <2 x i1> %79, %87, !dbg !30 + %89 = select <2 x i1> %88, <2 x float> %21, <2 x float> %73, !dbg !31 + %90 = insertelement <2 x i32> poison, i32 %27, i64 0, !dbg !32 + %91 = insertelement <2 x i32> %90, i32 %70, i64 1, !dbg !32 + %92 = select <2 x i1> %88, <2 x i32> %22, <2 x i32> %91, !dbg !32 + %93 = trunc nuw nsw i64 %24 to i32, !dbg !32 + %94 = select i1 %65, i32 %17, i32 %93, !dbg !32 + %95 = trunc nuw nsw i64 %25 to i32, !dbg !32 + %96 = select i1 %66, i32 %18, i32 %95, !dbg !32 + %97 = insertelement <2 x i1> poison, i1 %26, i64 0, !dbg !33 + %98 = shufflevector <2 x i1> %97, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %99 = select <2 x i1> %98, <2 x float> %89, <2 x float> %21, !dbg !33 + %100 = select i1 %26, float %67, float %19, !dbg !33 + %101 = select i1 %26, float %68, float %20, !dbg !33 + %102 = select <2 x i1> %98, <2 x i32> %92, <2 x i32> %22, !dbg !34 + %103 = select i1 %26, i32 %94, i32 %17, !dbg !34 + %104 = select i1 %26, i32 %96, i32 %18, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !10 + %105 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !10 + br i1 %105, label %16, label %106, !dbg !10 + +106: ; preds = %16 + %107 = and i32 %8, 31, !dbg !8 + %108 = lshr i32 %8, 5, !dbg !8 + %109 = shufflevector <2 x float> %99, <2 x float> poison, <2 x i32> , !dbg !35 + %110 = fcmp ogt <2 x float> %99, %109, !dbg !35 + %111 = fcmp oeq <2 x float> %99, %109, !dbg !35 + %112 = shufflevector <2 x i1> %110, <2 x i1> %111, <2 x i32> , !dbg !35 + %113 = extractelement <2 x float> %99, i64 0, !dbg !37 + %114 = fcmp uno float %113, 0.000000e+00, !dbg !37 + %115 = extractelement <2 x float> %99, i64 1, !dbg !38 + %116 = fcmp uno float %115, 0.000000e+00, !dbg !38 + %117 = xor i1 %116, true, !dbg !39 + %118 = insertelement <2 x i1> poison, i1 %114, i64 0, !dbg !40 + %119 = shufflevector <2 x i1> %118, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %120 = insertelement <2 x i1> poison, i1 %117, i64 0, !dbg !40 + %121 = insertelement <2 x i1> %120, i1 %116, i64 1, !dbg !40 + %122 = and <2 x i1> %119, %121, !dbg !40 + %123 = or <2 x i1> %112, %122, !dbg !41 + %124 = extractelement <2 x i32> %102, i64 0, !dbg !42 + %125 = extractelement <2 x i32> %102, i64 1, !dbg !42 + %126 = icmp slt i32 %124, %125, !dbg !42 + %127 = extractelement <2 x i1> %123, i64 1, !dbg !43 + %128 = and i1 %126, %127, !dbg !43 + %129 = extractelement <2 x i1> %123, i64 0, !dbg !44 + %130 = or i1 %129, %128, !dbg !44 + %131 = select i1 %130, float %113, float %115, !dbg !45 + %132 = select i1 %130, i32 %124, i32 %125, !dbg !46 + %133 = fcmp ogt float %131, %100, !dbg !35 + %134 = fcmp oeq float %131, %100, !dbg !47 + %135 = fcmp uno float %131, 0.000000e+00, !dbg !37 + %136 = fcmp uno float %100, 0.000000e+00, !dbg !38 + %137 = xor i1 %136, true, !dbg !39 + %138 = and i1 %135, %137, !dbg !40 + %139 = or i1 %133, %138, !dbg !41 + %140 = and i1 %136, %135, !dbg !48 + %141 = or i1 %134, %140, !dbg !49 + %142 = icmp slt i32 %132, %103, !dbg !42 + %143 = and i1 %142, %141, !dbg !43 + %144 = or i1 %139, %143, !dbg !44 + %145 = select i1 %144, float %131, float %100, !dbg !45 + %146 = select i1 %144, i32 %132, i32 %103, !dbg !46 + %147 = fcmp ogt float %145, %101, !dbg !35 + %148 = fcmp oeq float %145, %101, !dbg !47 + %149 = fcmp uno float %145, 0.000000e+00, !dbg !37 + %150 = fcmp uno float %101, 0.000000e+00, !dbg !38 + %151 = xor i1 %150, true, !dbg !39 + %152 = and i1 %149, %151, !dbg !40 + %153 = or i1 %147, %152, !dbg !41 + %154 = and i1 %150, %149, !dbg !48 + %155 = or i1 %148, %154, !dbg !49 + %156 = icmp slt i32 %146, %104, !dbg !42 + %157 = and i1 %156, %155, !dbg !43 + %158 = or i1 %153, %157, !dbg !44 + %159 = select i1 %158, float %145, float %101, !dbg !45 + %160 = select i1 %158, i32 %146, i32 %104, !dbg !46 + %161 = bitcast float %159 to i32, !dbg !50 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 16, i32 31), !dbg !50 + %163 = bitcast i32 %162 to float, !dbg !50 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 16, i32 31), !dbg !50 + %165 = fcmp ogt float %159, %163, !dbg !35 + %166 = fcmp oeq float %159, %163, !dbg !47 + %167 = fcmp uno float %159, 0.000000e+00, !dbg !37 + %168 = fcmp uno float %163, 0.000000e+00, !dbg !38 + %169 = xor i1 %168, true, !dbg !39 + %170 = and i1 %167, %169, !dbg !40 + %171 = or i1 %165, %170, !dbg !41 + %172 = and i1 %167, %168, !dbg !48 + %173 = or i1 %166, %172, !dbg !49 + %174 = icmp slt i32 %160, %164, !dbg !42 + %175 = and i1 %174, %173, !dbg !43 + %176 = or i1 %171, %175, !dbg !44 + %177 = select i1 %176, float %159, float %163, !dbg !45 + %178 = select i1 %176, i32 %160, i32 %164, !dbg !46 + %179 = bitcast float %177 to i32, !dbg !50 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !50 + %181 = bitcast i32 %180 to float, !dbg !50 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 8, i32 31), !dbg !50 + %183 = fcmp ogt float %177, %181, !dbg !35 + %184 = fcmp oeq float %177, %181, !dbg !47 + %185 = fcmp uno float %177, 0.000000e+00, !dbg !37 + %186 = fcmp uno float %181, 0.000000e+00, !dbg !38 + %187 = xor i1 %186, true, !dbg !39 + %188 = and i1 %185, %187, !dbg !40 + %189 = or i1 %183, %188, !dbg !41 + %190 = and i1 %186, %185, !dbg !48 + %191 = or i1 %184, %190, !dbg !49 + %192 = icmp slt i32 %178, %182, !dbg !42 + %193 = and i1 %192, %191, !dbg !43 + %194 = or i1 %189, %193, !dbg !44 + %195 = select i1 %194, float %177, float %181, !dbg !45 + %196 = select i1 %194, i32 %178, i32 %182, !dbg !46 + %197 = bitcast float %195 to i32, !dbg !50 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !50 + %199 = bitcast i32 %198 to float, !dbg !50 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 4, i32 31), !dbg !50 + %201 = fcmp ogt float %195, %199, !dbg !35 + %202 = fcmp oeq float %195, %199, !dbg !47 + %203 = fcmp uno float %195, 0.000000e+00, !dbg !37 + %204 = fcmp uno float %199, 0.000000e+00, !dbg !38 + %205 = xor i1 %204, true, !dbg !39 + %206 = and i1 %203, %205, !dbg !40 + %207 = or i1 %201, %206, !dbg !41 + %208 = and i1 %204, %203, !dbg !48 + %209 = or i1 %202, %208, !dbg !49 + %210 = icmp slt i32 %196, %200, !dbg !42 + %211 = and i1 %210, %209, !dbg !43 + %212 = or i1 %207, %211, !dbg !44 + %213 = select i1 %212, float %195, float %199, !dbg !45 + %214 = select i1 %212, i32 %196, i32 %200, !dbg !46 + %215 = bitcast float %213 to i32, !dbg !50 + %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !50 + %217 = bitcast i32 %216 to float, !dbg !50 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 2, i32 31), !dbg !50 + %219 = fcmp ogt float %213, %217, !dbg !35 + %220 = fcmp oeq float %213, %217, !dbg !47 + %221 = fcmp uno float %213, 0.000000e+00, !dbg !37 + %222 = fcmp uno float %217, 0.000000e+00, !dbg !38 + %223 = xor i1 %222, true, !dbg !39 + %224 = and i1 %221, %223, !dbg !40 + %225 = or i1 %219, %224, !dbg !41 + %226 = and i1 %222, %221, !dbg !48 + %227 = or i1 %220, %226, !dbg !49 + %228 = icmp slt i32 %214, %218, !dbg !42 + %229 = and i1 %228, %227, !dbg !43 + %230 = or i1 %225, %229, !dbg !44 + %231 = select i1 %230, float %213, float %217, !dbg !45 + %232 = select i1 %230, i32 %214, i32 %218, !dbg !46 + %233 = bitcast float %231 to i32, !dbg !50 + %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !50 + %235 = bitcast i32 %234 to float, !dbg !50 + %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !50 + %237 = fcmp ogt float %231, %235, !dbg !35 + %238 = fcmp oeq float %231, %235, !dbg !47 + %239 = fcmp uno float %231, 0.000000e+00, !dbg !37 + %240 = fcmp uno float %235, 0.000000e+00, !dbg !38 + %241 = xor i1 %240, true, !dbg !39 + %242 = and i1 %239, %241, !dbg !40 + %243 = or i1 %237, %242, !dbg !41 + %244 = and i1 %240, %239, !dbg !48 + %245 = or i1 %238, %244, !dbg !49 + %246 = icmp slt i32 %232, %236, !dbg !42 + %247 = and i1 %246, %245, !dbg !43 + %248 = or i1 %243, %247, !dbg !44 + %249 = select i1 %248, i32 %232, i32 %236, !dbg !46 + %250 = and i32 %108, 15, !dbg !50 + %251 = icmp eq i32 %107, 0, !dbg !50 + %252 = getelementptr float, ptr addrspace(3) @global_smem, i32 %250, !dbg !50 + %253 = select i1 %248, i32 %233, i32 %234, !dbg !45 + %254 = insertelement <1 x i32> poison, i32 %253, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %252, <1 x i32> %254, i1 %251) #4, !dbg !50 + %255 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %250, !dbg !50 + %256 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, <1 x i32> %256, i1 %251) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %257 = icmp samesign ult i32 %8, 16, !dbg !50 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !50 + %259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %258, i1 %257) #4, !dbg !50 + %260 = bitcast i32 %259 to float, !dbg !50 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %8, !dbg !50 + %262 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %261, i1 %257) #4, !dbg !50 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !50 + %264 = bitcast i32 %263 to float, !dbg !50 + %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %262, i32 8, i32 31), !dbg !50 + %266 = fcmp ogt float %260, %264, !dbg !35 + %267 = fcmp oeq float %260, %264, !dbg !47 + %268 = fcmp uno float %260, 0.000000e+00, !dbg !37 + %269 = fcmp uno float %264, 0.000000e+00, !dbg !38 + %270 = xor i1 %269, true, !dbg !39 + %271 = and i1 %268, %270, !dbg !40 + %272 = or i1 %266, %271, !dbg !41 + %273 = and i1 %268, %269, !dbg !48 + %274 = or i1 %267, %273, !dbg !49 + %275 = icmp slt i32 %262, %265, !dbg !42 + %276 = and i1 %275, %274, !dbg !43 + %277 = or i1 %272, %276, !dbg !44 + %278 = select i1 %277, float %260, float %264, !dbg !45 + %279 = select i1 %277, i32 %262, i32 %265, !dbg !46 + %280 = bitcast float %278 to i32, !dbg !50 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 4, i32 31), !dbg !50 + %282 = bitcast i32 %281 to float, !dbg !50 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 4, i32 31), !dbg !50 + %284 = fcmp ogt float %278, %282, !dbg !35 + %285 = fcmp oeq float %278, %282, !dbg !47 + %286 = fcmp uno float %278, 0.000000e+00, !dbg !37 + %287 = fcmp uno float %282, 0.000000e+00, !dbg !38 + %288 = xor i1 %287, true, !dbg !39 + %289 = and i1 %286, %288, !dbg !40 + %290 = or i1 %284, %289, !dbg !41 + %291 = and i1 %287, %286, !dbg !48 + %292 = or i1 %285, %291, !dbg !49 + %293 = icmp slt i32 %279, %283, !dbg !42 + %294 = and i1 %293, %292, !dbg !43 + %295 = or i1 %290, %294, !dbg !44 + %296 = select i1 %295, float %278, float %282, !dbg !45 + %297 = select i1 %295, i32 %279, i32 %283, !dbg !46 + %298 = bitcast float %296 to i32, !dbg !50 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !50 + %300 = bitcast i32 %299 to float, !dbg !50 + %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 2, i32 31), !dbg !50 + %302 = fcmp ogt float %296, %300, !dbg !35 + %303 = fcmp oeq float %296, %300, !dbg !47 + %304 = fcmp uno float %296, 0.000000e+00, !dbg !37 + %305 = fcmp uno float %300, 0.000000e+00, !dbg !38 + %306 = xor i1 %305, true, !dbg !39 + %307 = and i1 %304, %306, !dbg !40 + %308 = or i1 %302, %307, !dbg !41 + %309 = and i1 %305, %304, !dbg !48 + %310 = or i1 %303, %309, !dbg !49 + %311 = icmp slt i32 %297, %301, !dbg !42 + %312 = and i1 %311, %310, !dbg !43 + %313 = or i1 %308, %312, !dbg !44 + %314 = select i1 %313, float %296, float %300, !dbg !45 + %315 = select i1 %313, i32 %297, i32 %301, !dbg !46 + %316 = bitcast float %314 to i32, !dbg !50 + %317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 1, i32 31), !dbg !50 + %318 = bitcast i32 %317 to float, !dbg !50 + %319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 1, i32 31), !dbg !50 + %320 = fcmp ogt float %314, %318, !dbg !35 + %321 = fcmp oeq float %314, %318, !dbg !47 + %322 = fcmp uno float %314, 0.000000e+00, !dbg !37 + %323 = fcmp uno float %318, 0.000000e+00, !dbg !38 + %324 = xor i1 %323, true, !dbg !39 + %325 = and i1 %322, %324, !dbg !40 + %326 = or i1 %320, %325, !dbg !41 + %327 = and i1 %323, %322, !dbg !48 + %328 = or i1 %321, %327, !dbg !49 + %329 = icmp slt i32 %315, %319, !dbg !42 + %330 = and i1 %329, %328, !dbg !43 + %331 = or i1 %326, %330, !dbg !44 + %332 = select i1 %331, i32 %315, i32 %319, !dbg !46 + %333 = icmp eq i32 %8, 0, !dbg !50 + %334 = select i1 %331, i32 %316, i32 %317, !dbg !45 + %335 = insertelement <1 x i32> poison, i32 %334, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %335, i1 %333) #4, !dbg !50 + %336 = insertelement <1 x i32> poison, i32 %332, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %336, i1 %333) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %337 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !50 + %338 = zext nneg i32 %7 to i64, !dbg !51 + %339 = getelementptr i64, ptr addrspace(1) %1, i64 %338, !dbg !51 + %340 = sext i32 %337 to i64, !dbg !52 + %341 = and i32 %8, 511, !dbg !52 + %342 = icmp eq i32 %341, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %340, ptr addrspace(1) %339, i1 %342) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 29, column: 19, scope: !4) +!10 = !DILocation(line: 33, column: 40, scope: !4) +!11 = !DILocation(line: 34, column: 31, scope: !4) +!12 = !DILocation(line: 35, column: 29, scope: !4) +!13 = !DILocation(line: 39, column: 52, scope: !4) +!14 = !DILocation(line: 39, column: 34, scope: !4) +!15 = !DILocation(line: 39, column: 66, scope: !4) +!16 = !DILocation(line: 144, column: 21, scope: !17, inlinedAt: !19) +!17 = distinct !DILexicalBlockFile(scope: !4, file: !18, discriminator: 0) +!18 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!19 = !DILocation(line: 42, column: 38, scope: !4) +!20 = !DILocation(line: 145, column: 23, scope: !17, inlinedAt: !19) +!21 = !DILocation(line: 147, column: 29, scope: !17, inlinedAt: !19) +!22 = !DILocation(line: 148, column: 29, scope: !17, inlinedAt: !19) +!23 = !DILocation(line: 149, column: 31, scope: !17, inlinedAt: !19) +!24 = !DILocation(line: 149, column: 27, scope: !17, inlinedAt: !19) +!25 = !DILocation(line: 149, column: 16, scope: !17, inlinedAt: !19) +!26 = !DILocation(line: 151, column: 27, scope: !17, inlinedAt: !19) +!27 = !DILocation(line: 151, column: 17, scope: !17, inlinedAt: !19) +!28 = !DILocation(line: 154, column: 31, scope: !17, inlinedAt: !19) +!29 = !DILocation(line: 154, column: 21, scope: !17, inlinedAt: !19) +!30 = !DILocation(line: 154, column: 12, scope: !17, inlinedAt: !19) +!31 = !DILocation(line: 155, column: 35, scope: !17, inlinedAt: !19) +!32 = !DILocation(line: 155, column: 69, scope: !17, inlinedAt: !19) +!33 = !DILocation(line: 44, column: 46, scope: !4) +!34 = !DILocation(line: 45, column: 58, scope: !4) +!35 = !DILocation(line: 144, column: 21, scope: !17, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 147, column: 29, scope: !17, inlinedAt: !36) +!38 = !DILocation(line: 148, column: 29, scope: !17, inlinedAt: !36) +!39 = !DILocation(line: 149, column: 31, scope: !17, inlinedAt: !36) +!40 = !DILocation(line: 149, column: 27, scope: !17, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 16, scope: !17, inlinedAt: !36) +!42 = !DILocation(line: 154, column: 31, scope: !17, inlinedAt: !36) +!43 = !DILocation(line: 154, column: 21, scope: !17, inlinedAt: !36) +!44 = !DILocation(line: 154, column: 12, scope: !17, inlinedAt: !36) +!45 = !DILocation(line: 155, column: 35, scope: !17, inlinedAt: !36) +!46 = !DILocation(line: 155, column: 69, scope: !17, inlinedAt: !36) +!47 = !DILocation(line: 145, column: 23, scope: !17, inlinedAt: !36) +!48 = !DILocation(line: 151, column: 27, scope: !17, inlinedAt: !36) +!49 = !DILocation(line: 151, column: 17, scope: !17, inlinedAt: !36) +!50 = !DILocation(line: 165, column: 42, scope: !17, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ca1087737ae3568976e8e7e1e1e6fc95a84ddf80 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ptx @@ -0,0 +1,839 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<202>; + .reg .b32 %r<119>; + .reg .b64 %rd<31>; + .loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 37 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r19, %r2, 2; + and.b32 %r20, %r19, 2044; + .loc 1 29 19 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:29:19 + shr.u32 %r21, %r1, 11; + mul.lo.s32 %r22, %r1, 32000; + mad.lo.s32 %r23, %r21, 224000, %r22; + .loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40 + cvt.u64.u32 %rd1, %r20; + add.s32 %r24, %r23, %r20; + cvt.u64.u32 %rd2, %r24; + mov.b32 %r115, 0fFF800000; + mov.b64 %rd30, {%r115, %r115}; + mov.b32 %r113, 2147483647; + mov.b64 %rd29, -2048; + mov.b32 %r114, %r113; + mov.b32 %r116, %r115; + mov.b32 %r117, %r113; + mov.b32 %r118, %r113; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 34 31 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:34:31 + add.s64 %rd14, %rd1, %rd29; + add.s64 %rd15, %rd14, 2048; + add.s64 %rd16, %rd14, 2050; + .loc 1 35 29 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:35:29 + add.s64 %rd17, %rd14, 2051; + setp.lt.u64 %p1, %rd15, 32000; + .loc 1 39 34 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:34 + add.s64 %rd18, %rd2, %rd29; + cvt.u32.u64 %r33, %rd18; + add.s32 %r34, %r33, 2048; + mad.wide.s32 %rd12, %r34, 4, %rd7; + .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r29, 0; + // begin inline asm + mov.u32 %r25, %r29; + mov.u32 %r26, %r29; + mov.u32 %r27, %r29; + mov.u32 %r28, %r29; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd12 + 0 ], %rd11; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.gt.f32 %p2, %r115, %r27; + setp.gt.f32 %p3, %r116, %r28; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.eq.f32 %p4, %r115, %r27; + setp.eq.f32 %p5, %r116, %r28; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + mov.b64 {%r35, %r36}, %rd30; + setp.nan.f32 %p6, %r35, %r35; + setp.nan.f32 %p7, %r36, %r36; + setp.nan.f32 %p8, %r115, %r115; + setp.nan.f32 %p9, %r116, %r116; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.nan.f32 %p10, %r27, %r27; + setp.num.f32 %p11, %r27, %r27; + setp.nan.f32 %p12, %r28, %r28; + setp.num.f32 %p13, %r28, %r28; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + cvt.s64.s32 %rd19, %r118; + cvt.s64.s32 %rd20, %r117; + cvt.s64.s32 %rd21, %r113; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r114; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p26, %p16, %p24; + or.pred %p27, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + selp.f32 %r37, %r115, %r27, %p26; + selp.f32 %r38, %r116, %r28, %p27; +$L__tmp2: + .loc 1 39 66 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:39:66 + cvt.u64.u32 %rd23, %r25; + cvt.u64.u32 %rd24, %r26; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + mov.b64 {%r39, %r40}, %rd26; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.gt.f32 %p28, %r36, %r40; + setp.gt.f32 %p29, %r35, %r39; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.eq.f32 %p30, %r35, %r39; + setp.eq.f32 %p31, %r36, %r40; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.nan.f32 %p32, %r40, %r40; + setp.nan.f32 %p33, %r39, %r39; + setp.num.f32 %p34, %r39, %r39; + setp.num.f32 %p35, %r40, %r40; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p36, %p7, %p35; + and.pred %p37, %p6, %p34; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p38, %p29, %p37; + or.pred %p39, %p28, %p36; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p40, %p6, %p33; + and.pred %p41, %p7, %p32; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p42, %p31, %p41; + or.pred %p43, %p30, %p40; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + setp.gt.s64 %p44, %rd15, %rd20; + setp.ge.s64 %p45, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + and.pred %p46, %p44, %p43; + and.pred %p47, %p45, %p42; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + or.pred %p48, %p39, %p47; + or.pred %p49, %p38, %p46; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + selp.f32 %r41, %r35, %r39, %p49; + selp.f32 %r42, %r36, %r40, %p48; + cvt.u32.u64 %r43, %rd15; + cvt.u32.u64 %r44, %rd14; + add.s32 %r45, %r44, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:42:38 ] + selp.b32 %r46, %r117, %r43, %p49; + selp.b32 %r47, %r118, %r45, %p48; + cvt.u32.u64 %r48, %rd16; + selp.b32 %r49, %r113, %r48, %p26; + cvt.u32.u64 %r50, %rd17; + selp.b32 %r51, %r114, %r50, %p27; +$L__tmp4: + .loc 1 44 46 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:44:46 + selp.f32 %r52, %r42, %r36, %p1; + selp.f32 %r53, %r41, %r35, %p1; + mov.b64 %rd30, {%r53, %r52}; + selp.f32 %r115, %r37, %r115, %p1; + selp.f32 %r116, %r38, %r116, %p1; + .loc 1 45 58 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:45:58 + selp.b32 %r118, %r47, %r118, %p1; + selp.b32 %r117, %r46, %r117, %p1; + selp.b32 %r113, %r49, %r113, %p1; + selp.b32 %r114, %r51, %r114, %p1; + .loc 1 33 40 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:33:40 + add.s64 %rd29, %rd29, 2048; + setp.lt.u64 %p50, %rd29, 29952; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:26:37 + and.b32 %r66, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + mov.b64 {%r67, %r68}, %rd30; + setp.gt.f32 %p58, %r67, %r68; + setp.eq.f32 %p59, %r68, %r67; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p60, %r67, %r67; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.num.f32 %p61, %r68, %r68; + setp.nan.f32 %p62, %r68, %r68; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p63, %p60, %p62; + and.pred %p64, %p60, %p61; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p65, %p58, %p64; + or.pred %p66, %p59, %p63; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p67, %r117, %r118; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p68, %p67, %p66; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p69, %p65, %p68; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r69, %r67, %r68, %p69; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r70, %r117, %r118, %p69; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p70, %r69, %r115; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p71, %r69, %r115; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p72, %r69, %r69; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p73, %r115, %r115; + setp.num.f32 %p74, %r115, %r115; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p75, %p72, %p74; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p76, %p70, %p75; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p77, %p73, %p72; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p78, %p71, %p77; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p79, %r70, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p81, %p76, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r71, %r69, %r115, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r72, %r70, %r113, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p82, %r71, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p83, %r71, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p84, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p85, %r116, %r116; + setp.num.f32 %p86, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p91, %r72, %r114; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r73, %r71, %r116, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r74, %r72, %r114, %p93; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r75, %r73, 16, 31, -1; + shfl.sync.bfly.b32 %r76, %r74, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p94, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p95, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p96, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p97, %r75, %r75; + setp.num.f32 %p98, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p101, %p96, %p97; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p103, %r74, %r76; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r77, %r73, %r75, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r78, %r74, %r76, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r79, %r77, 8, 31, -1; + shfl.sync.bfly.b32 %r80, %r78, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p106, %r77, %r79; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p107, %r77, %r79; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p108, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p109, %r79, %r79; + setp.num.f32 %p110, %r79, %r79; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p113, %p109, %p108; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p115, %r78, %r80; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r81, %r77, %r79, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r82, %r78, %r80, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r83, %r81, 4, 31, -1; + shfl.sync.bfly.b32 %r84, %r82, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p118, %r81, %r83; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p119, %r81, %r83; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p120, %r81, %r81; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p121, %r83, %r83; + setp.num.f32 %p122, %r83, %r83; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p127, %r82, %r84; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r85, %r81, %r83, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r86, %r82, %r84, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r87, %r85, 2, 31, -1; + shfl.sync.bfly.b32 %r88, %r86, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p130, %r85, %r87; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p131, %r85, %r87; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p132, %r85, %r85; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p133, %r87, %r87; + setp.num.f32 %p134, %r87, %r87; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p139, %r86, %r88; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r89, %r85, %r87, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r90, %r86, %r88, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r91, %r89, 1, 31, -1; + shfl.sync.bfly.b32 %r92, %r90, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p142, %r89, %r91; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p143, %r89, %r91; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p144, %r89, %r89; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p145, %r91, %r91; + setp.num.f32 %p146, %r91, %r91; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p151, %r90, %r92; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r57, %r90, %r92, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.b32 %p51, %r66, 0; + shr.u32 %r93, %r2, 3; + and.b32 %r94, %r93, 60; + mov.b32 %r95, global_smem; + add.s32 %r54, %r95, %r94; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r55, %r89, %r91, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r54 + 0 ], %r55; + // end inline asm + add.s32 %r96, %r95, 64; + add.s32 %r56, %r96, %r94; + // begin inline asm + @%p51 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r59, %r95, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r58, [ %r59 + 0 ]; + // end inline asm + add.s32 %r61, %r96, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r98, %r58, 8, 31, -1; + shfl.sync.bfly.b32 %r99, %r60, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p154, %r58, %r98; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p155, %r58, %r98; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p156, %r58, %r58; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p157, %r98, %r98; + setp.num.f32 %p158, %r98, %r98; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p161, %p156, %p157; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p163, %r60, %r99; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r100, %r58, %r98, %p165; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r101, %r60, %r99, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r102, %r100, 4, 31, -1; + shfl.sync.bfly.b32 %r103, %r101, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p166, %r100, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p167, %r100, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p168, %r100, %r100; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p169, %r102, %r102; + setp.num.f32 %p170, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p173, %p169, %p168; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p175, %r101, %r103; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r104, %r100, %r102, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r105, %r101, %r103, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r106, %r104, 2, 31, -1; + shfl.sync.bfly.b32 %r107, %r105, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p178, %r104, %r106; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p179, %r104, %r106; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p180, %r104, %r104; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p181, %r106, %r106; + setp.num.f32 %p182, %r106, %r106; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p187, %r105, %r107; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.f32 %r108, %r104, %r106, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r109, %r105, %r107, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + shfl.sync.bfly.b32 %r110, %r108, 1, 31, -1; + shfl.sync.bfly.b32 %r111, %r109, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.gt.f32 %p190, %r108, %r110; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.f32 %p191, %r108, %r110; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p192, %r108, %r108; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.nan.f32 %p193, %r110, %r110; + setp.num.f32 %p194, %r110, %r110; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.lt.s32 %p199, %r109, %r111; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r65, %r109, %r111, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + setp.eq.b32 %p55, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + selp.b32 %r63, %r108, %r110, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:46:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r59 + 0 ], %r63; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd27, [global_smem+64]; +$L__tmp6: + .loc 1 48 25 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:25 + mad.wide.u32 %rd28, %r1, 8, %rd8; + .loc 1 48 36 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:36 + and.b32 %r112, %r2, 511; + setp.eq.b32 %p57, %r112, 0; + // begin inline asm + @%p57 st.global.b64 [ %rd28 + 0 ], { %rd27 }; + // end inline asm + .loc 1 48 4 // cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py:48:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 102 +.b8 118 +.b8 109 +.b8 54 +.b8 53 +.b8 53 +.b8 106 +.b8 53 +.b8 99 +.b8 109 +.b8 52 +.b8 53 +.b8 50 +.b8 52 +.b8 103 +.b8 109 +.b8 100 +.b8 121 +.b8 104 +.b8 114 +.b8 55 +.b8 121 +.b8 108 +.b8 105 +.b8 54 +.b8 100 +.b8 102 +.b8 102 +.b8 112 +.b8 97 +.b8 107 +.b8 121 +.b8 115 +.b8 117 +.b8 121 +.b8 99 +.b8 117 +.b8 111 +.b8 122 +.b8 107 +.b8 113 +.b8 109 +.b8 121 +.b8 117 +.b8 97 +.b8 111 +.b8 110 +.b8 107 +.b8 119 +.b8 98 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 102 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..5a08a74af8795320722ba313e732a65d4cfa2ca2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source @@ -0,0 +1,309 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 1 : i32 loc(#loc77) + %xoffset_3 = arith.constant 1 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc81) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<1x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<1x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x2048xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<1x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc92) + %tmp0_27 = arith.addi %r0_index_21, %tmp0_26 : tensor<1x2048xi32> loc(#loc92) + %tmp0_28 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant dense<65760000> : tensor<1x1xi32> loc(#loc93) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<1x1xi32> loc(#loc93) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc94) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<1x2048xi32> loc(#loc94) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc95) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc95) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc96) + %tmp0_38 = tt.load %tmp0_35, %r0_mask_22, %tmp0_37 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_19, %_tmp2_index_20, %tmp0_38, %r0_index_21) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_39 = arith.select %r0_mask_22, %8#0, %_tmp2_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc97) + %_tmp2_index_40 = arith.select %r0_mask_22, %8#1, %_tmp2_index_20 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc98) + scf.yield %_tmp2_39, %_tmp2_index_40 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<1x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc33)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc33)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc33)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc33))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc108) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc108) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc109) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc123) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc111) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc124) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc50) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc52) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc52) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc57))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc59) + tt.return %4 : tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %5 : tensor<1x2048xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc66)), %index: tensor<1x2048xi32> loc("index"(#loc66))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc69) + %2 = ub.poison : tensor<1xi32> loc(#loc69) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..45aaa31c20de17b0b388765a1549f6c9484489a1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,185 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc73 = loc(callsite(#loc1 at #loc34)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c65760000_i32 = arith.constant 65760000 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc44) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc44) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc45) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc46) + %tmp0 = arith.muli %x0, %c32000_i32 : i32 loc(#loc47) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc75) + %tmp0_6 = arith.muli %x1, %c65760000_i32 : i32 loc(#loc49) + %tmp0_7 = tt.splat %tmp0_6 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc76) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc51) + %_tmp2_index:2 = scf.for %_tmp2_index_9 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_10 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_9 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_index_11 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc54) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_5 : tensor<1x2048xi32, #blocked> loc(#loc48) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc50) + %tmp0_14 = tt.addptr %tmp0_8, %tmp0_13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc51) + %tmp0_15 = tt.load %tmp0_14, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc100) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc101) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc80) + %b_isnan = arith.cmpf une, %tmp0_15, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc81) + %mask_16 = arith.xori %b_isnan, %cst_1 : tensor<1x2048xi1, #blocked> loc(#loc82) + %mask_17 = arith.andi %a_isnan, %mask_16 : tensor<1x2048xi1, #blocked> loc(#loc83) + %mask_18 = arith.ori %mask, %mask_17 : tensor<1x2048xi1, #blocked> loc(#loc102) + %equal_19 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc85) + %equal_20 = arith.ori %equal, %equal_19 : tensor<1x2048xi1, #blocked> loc(#loc103) + %mask_21 = arith.cmpi slt, %_tmp2_index_10, %r0_index_11 : tensor<1x2048xi32, #blocked> loc(#loc87) + %mask_22 = arith.andi %equal_20, %mask_21 : tensor<1x2048xi1, #blocked> loc(#loc88) + %mask_23 = arith.ori %mask_18, %mask_22 : tensor<1x2048xi1, #blocked> loc(#loc89) + %5 = arith.select %mask_23, %_tmp2, %tmp0_15 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc68) + %6 = arith.select %mask_23, %_tmp2_index_10, %r0_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc69) + %_tmp2_24 = arith.select %r0_mask, %5, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc70) + %_tmp2_index_25 = arith.select %r0_mask, %6, %_tmp2_index_10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc71) + scf.yield %_tmp2_24, %_tmp2_index_25 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc32) + } loc(#loc77) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc34)), %arg5: i32 loc(callsite(#loc1 at #loc34)), %arg6: f32 loc(callsite(#loc1 at #loc34)), %arg7: i32 loc(callsite(#loc1 at #loc34))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc104) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc105) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc90) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc91) + %mask_9 = arith.xori %b_isnan, %true : i1 loc(#loc92) + %mask_10 = arith.andi %a_isnan, %mask_9 : i1 loc(#loc93) + %mask_11 = arith.ori %mask, %mask_10 : i1 loc(#loc106) + %equal_12 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc94) + %equal_13 = arith.ori %equal, %equal_12 : i1 loc(#loc107) + %mask_14 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc95) + %mask_15 = arith.andi %equal_13, %mask_14 : i1 loc(#loc96) + %mask_16 = arith.ori %mask_11, %mask_15 : i1 loc(#loc97) + %5 = arith.select %mask_16, %arg4, %arg6 : f32 loc(#loc98) + %6 = arith.select %mask_16, %arg5, %arg7 : i32 loc(#loc99) + tt.reduce.return %5, %6 : f32, i32 loc(#loc72) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc72) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc74) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc36) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc37) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc37) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc37) + tt.store %4, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4) +#loc43 = loc("xoffset"(#loc2)) +#loc44 = loc("r0_base"(#loc3)) +#loc45 = loc("x0"(#loc4)) +#loc46 = loc("x1"(#loc5)) +#loc47 = loc("tmp0"(#loc6)) +#loc48 = loc("tmp0"(#loc7)) +#loc49 = loc("tmp0"(#loc8)) +#loc50 = loc("tmp0"(#loc9)) +#loc51 = loc("tmp0"(#loc10)) +#loc52 = loc("_tmp2"(#loc11)) +#loc53 = loc("r0_index"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("mask"(#loc15)) +#loc57 = loc("equal"(#loc17)) +#loc58 = loc("a_isnan"(#loc18)) +#loc59 = loc("b_isnan"(#loc19)) +#loc60 = loc("mask"(#loc20)) +#loc61 = loc("mask"(#loc21)) +#loc62 = loc("mask"(#loc22)) +#loc63 = loc("equal"(#loc23)) +#loc64 = loc("equal"(#loc24)) +#loc65 = loc("mask"(#loc25)) +#loc66 = loc("mask"(#loc26)) +#loc67 = loc("mask"(#loc27)) +#loc68 = loc(callsite(#loc28 at #loc16)) +#loc69 = loc(callsite(#loc29 at #loc16)) +#loc70 = loc("_tmp2"(#loc30)) +#loc71 = loc("_tmp2_index"(#loc31)) +#loc72 = loc(callsite(#loc33 at #loc34)) +#loc74 = loc("tmp2"(#loc35)) +#loc75 = loc(fused[#loc48, #loc47]) +#loc76 = loc(fused[#loc50, #loc49]) +#loc77 = loc("_tmp2_index"(#loc52)) +#loc78 = loc("mask"(#loc56)) +#loc79 = loc("equal"(#loc57)) +#loc80 = loc(callsite(#loc58 at #loc16)) +#loc81 = loc(callsite(#loc59 at #loc16)) +#loc82 = loc(callsite(#loc60 at #loc16)) +#loc83 = loc(callsite(#loc61 at #loc16)) +#loc84 = loc("mask"(#loc62)) +#loc85 = loc(callsite(#loc63 at #loc16)) +#loc86 = loc("equal"(#loc64)) +#loc87 = loc(callsite(#loc65 at #loc16)) +#loc88 = loc(callsite(#loc66 at #loc16)) +#loc89 = loc(callsite(#loc67 at #loc16)) +#loc90 = loc(callsite(#loc58 at #loc72)) +#loc91 = loc(callsite(#loc59 at #loc72)) +#loc92 = loc(callsite(#loc60 at #loc72)) +#loc93 = loc(callsite(#loc61 at #loc72)) +#loc94 = loc(callsite(#loc63 at #loc72)) +#loc95 = loc(callsite(#loc65 at #loc72)) +#loc96 = loc(callsite(#loc66 at #loc72)) +#loc97 = loc(callsite(#loc67 at #loc72)) +#loc98 = loc(callsite(#loc28 at #loc72)) +#loc99 = loc(callsite(#loc29 at #loc72)) +#loc100 = loc(callsite(#loc78 at #loc16)) +#loc101 = loc(callsite(#loc79 at #loc16)) +#loc102 = loc(callsite(#loc84 at #loc16)) +#loc103 = loc(callsite(#loc86 at #loc16)) +#loc104 = loc(callsite(#loc78 at #loc72)) +#loc105 = loc(callsite(#loc79 at #loc72)) +#loc106 = loc(callsite(#loc84 at #loc72)) +#loc107 = loc(callsite(#loc86 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0a8d6122c8c47ac3da046e24aaf91aa98e547778 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttir @@ -0,0 +1,188 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc46 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c65760000_i32 = arith.constant 65760000 : i32 loc(#loc1) + %true = arith.constant true loc(#loc46) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc47) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc48) + %xoffset = tt.get_program_id x : i32 loc(#loc49) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc50) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc51) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc52) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc53) + %_tmp2_index_3:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_4 = %_tmp2, %_tmp2_index_5 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc55) + %r0_index_6 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc55) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst_1 : tensor<1x2048xi32> loc(#loc56) + %tmp0 = arith.muli %x0, %c32000_i32 : i32 loc(#loc57) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc82) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc58) + %tmp0_9 = arith.muli %x1, %c65760000_i32 : i32 loc(#loc59) + %tmp0_10 = tt.splat %tmp0_9 : i32 -> tensor<1x2048xi32> loc(#loc83) + %tmp0_11 = arith.addi %tmp0_8, %tmp0_10 : tensor<1x2048xi32> loc(#loc60) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc61) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc61) + %tmp0_14 = tt.load %tmp0_13, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc62) + %mask = arith.cmpf ogt, %_tmp2_4, %tmp0_14 : tensor<1x2048xf32> loc(#loc106) + %equal = arith.cmpf oeq, %_tmp2_4, %tmp0_14 : tensor<1x2048xf32> loc(#loc107) + %a_isnan = arith.cmpf une, %_tmp2_4, %_tmp2_4 : tensor<1x2048xf32> loc(#loc86) + %b_isnan = arith.cmpf une, %tmp0_14, %tmp0_14 : tensor<1x2048xf32> loc(#loc87) + %mask_15 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc88) + %mask_16 = arith.andi %a_isnan, %mask_15 : tensor<1x2048xi1> loc(#loc89) + %mask_17 = arith.ori %mask, %mask_16 : tensor<1x2048xi1> loc(#loc108) + %equal_18 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc91) + %equal_19 = arith.ori %equal, %equal_18 : tensor<1x2048xi1> loc(#loc109) + %mask_20 = arith.cmpi slt, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi32> loc(#loc93) + %mask_21 = arith.andi %equal_19, %mask_20 : tensor<1x2048xi1> loc(#loc94) + %mask_22 = arith.ori %mask_17, %mask_21 : tensor<1x2048xi1> loc(#loc95) + %4 = arith.select %mask_22, %_tmp2_4, %tmp0_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc75) + %5 = arith.select %mask_22, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc76) + %_tmp2_23 = arith.select %r0_mask, %4, %_tmp2_4 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc77) + %_tmp2_index_24 = arith.select %r0_mask, %5, %_tmp2_index_5 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc78) + scf.yield %_tmp2_23, %_tmp2_index_24 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc36) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index_3#0, %_tmp2_index_3#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc110) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc111) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc96) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %mask_4 = arith.xori %b_isnan, %true : i1 loc(#loc98) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc99) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc112) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc100) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc113) + %mask_9 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc101) + %mask_10 = arith.andi %equal_8, %mask_9 : i1 loc(#loc102) + %mask_11 = arith.ori %mask_6, %mask_10 : i1 loc(#loc103) + %4 = arith.select %mask_11, %arg4, %arg6 : f32 loc(#loc104) + %5 = arith.select %mask_11, %arg5, %arg7 : i32 loc(#loc105) + tt.reduce.return %4, %5 : f32, i32 loc(#loc79) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc79) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc39) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc39) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc40) + tt.store %2, %3 : tensor<1x1x!tt.ptr> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":29:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":34:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":35:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:47) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:61) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:52) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":39:66) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":42:38) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":44:46) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":45:8) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fv/cfvm655j5cm4524gmdyhr7yli6dffpakysuycuozkqmyuaonkwbg.py":48:4) +#loc47 = loc("_tmp2_index"(#loc4)) +#loc48 = loc("_tmp2"(#loc5)) +#loc49 = loc("xoffset"(#loc6)) +#loc50 = loc("r0_base"(#loc7)) +#loc51 = loc("r0_base"(#loc8)) +#loc52 = loc("x0"(#loc9)) +#loc53 = loc("x1"(#loc10)) +#loc54 = loc("_tmp2"(#loc3)) +#loc55 = loc("r0_index"(#loc11)) +#loc56 = loc("r0_mask"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("tmp0"(#loc18)) +#loc63 = loc("mask"(#loc19)) +#loc64 = loc("equal"(#loc21)) +#loc65 = loc("a_isnan"(#loc22)) +#loc66 = loc("b_isnan"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("mask"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("equal"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc("mask"(#loc31)) +#loc75 = loc(callsite(#loc32 at #loc20)) +#loc76 = loc(callsite(#loc33 at #loc20)) +#loc77 = loc("_tmp2"(#loc34)) +#loc78 = loc("_tmp2_index"(#loc35)) +#loc79 = loc(callsite(#loc37 at #loc2)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc54)) +#loc82 = loc(fused[#loc58, #loc57]) +#loc83 = loc(fused[#loc60, #loc59]) +#loc84 = loc("mask"(#loc63)) +#loc85 = loc("equal"(#loc64)) +#loc86 = loc(callsite(#loc65 at #loc20)) +#loc87 = loc(callsite(#loc66 at #loc20)) +#loc88 = loc(callsite(#loc67 at #loc20)) +#loc89 = loc(callsite(#loc68 at #loc20)) +#loc90 = loc("mask"(#loc69)) +#loc91 = loc(callsite(#loc70 at #loc20)) +#loc92 = loc("equal"(#loc71)) +#loc93 = loc(callsite(#loc72 at #loc20)) +#loc94 = loc(callsite(#loc73 at #loc20)) +#loc95 = loc(callsite(#loc74 at #loc20)) +#loc96 = loc(callsite(#loc65 at #loc79)) +#loc97 = loc(callsite(#loc66 at #loc79)) +#loc98 = loc(callsite(#loc67 at #loc79)) +#loc99 = loc(callsite(#loc68 at #loc79)) +#loc100 = loc(callsite(#loc70 at #loc79)) +#loc101 = loc(callsite(#loc72 at #loc79)) +#loc102 = loc(callsite(#loc73 at #loc79)) +#loc103 = loc(callsite(#loc74 at #loc79)) +#loc104 = loc(callsite(#loc32 at #loc79)) +#loc105 = loc(callsite(#loc33 at #loc79)) +#loc106 = loc(callsite(#loc84 at #loc20)) +#loc107 = loc(callsite(#loc85 at #loc20)) +#loc108 = loc(callsite(#loc90 at #loc20)) +#loc109 = loc(callsite(#loc92 at #loc20)) +#loc110 = loc(callsite(#loc84 at #loc79)) +#loc111 = loc(callsite(#loc85 at #loc79)) +#loc112 = loc(callsite(#loc90 at #loc79)) +#loc113 = loc(callsite(#loc92 at #loc79)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7f59941b0b9cc9487e3614759f0247c387fa32a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e5ef5cc9db850c70655df7ed2ddac9158b6be579 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..fd2b5ae798a4a3399e9b39908b86a15745588ae2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,781 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 3, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 24, !dbg !9 + %12 = lshr i32 %10, 5, !dbg !9 + %13 = and i32 %10, 7, !dbg !9 + %14 = lshr i32 %10, 3, !dbg !9 + %15 = and i32 %14, 7, !dbg !9 + %16 = or disjoint i32 %9, %13, !dbg !10 + %17 = or disjoint i32 %15, %9, !dbg !10 + %18 = icmp slt i32 %16, 128, !dbg !11 + %19 = icmp slt i32 %17, 128, !dbg !11 + %20 = or disjoint i32 %15, 8, !dbg !12 + %21 = shl nuw nsw i32 %13, 1, !dbg !12 + %22 = sdiv i32 %16, 16, !dbg !13 + %23 = mul nuw nsw i32 %15, 17, !dbg !14 + %24 = mul nuw nsw i32 %20, 17, !dbg !14 + %25 = shl i32 %22, 8, !dbg !15 + %26 = add i32 %25, %16, !dbg !15 + %27 = add i32 %26, %23, !dbg !16 + %28 = add i32 %26, %24, !dbg !16 + %29 = sext i32 %27 to i64, !dbg !17 + %30 = getelementptr i32, ptr addrspace(1) %0, i64 %29, !dbg !17 + %31 = sext i32 %28 to i64, !dbg !17 + %32 = getelementptr i32, ptr addrspace(1) %0, i64 %31, !dbg !17 + %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %30, i1 %18) #4, !dbg !18 + %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %32, i1 %18) #4, !dbg !18 + %35 = lshr i32 %10, 4, !dbg !19 + %.lobit = and i32 %35, 1, !dbg !19 + %36 = and i32 %10, 8, !dbg !19 + %.not = icmp eq i32 %36, 0, !dbg !19 + %.lobit1 = lshr exact i32 %36, 3, !dbg !19 + %37 = and i32 %10, 32, !dbg !19 + %.not3 = icmp eq i32 %37, 0, !dbg !19 + %.lobit2 = lshr exact i32 %37, 5, !dbg !19 + %38 = xor i32 %.lobit1, 1, !dbg !23 + %39 = xor i32 %.lobit, 1, !dbg !23 + %40 = xor i32 %.lobit2, 1, !dbg !23 + %41 = mul nuw nsw i32 %33, %38, !dbg !24 + %42 = mul nuw nsw i32 %34, %38, !dbg !24 + %43 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 8, i32 31), !dbg !25 + %44 = add i32 %43, %41, !dbg !28 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %42, i32 8, i32 31), !dbg !25 + %46 = add i32 %45, %42, !dbg !28 + %47 = mul nuw nsw i32 %33, %.lobit1, !dbg !29 + %48 = mul nuw nsw i32 %34, %.lobit1, !dbg !29 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !25 + %50 = add i32 %49, %47, !dbg !28 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !25 + %52 = add i32 %51, %48, !dbg !28 + %53 = mul nuw nsw i32 %38, %15, !dbg !30 + %54 = mul nuw nsw i32 %20, %38, !dbg !30 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 8, i32 31), !dbg !25 + %56 = add i32 %55, %53, !dbg !28 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 8, i32 31), !dbg !25 + %58 = add i32 %57, %54, !dbg !28 + %59 = mul nuw nsw i32 %15, %.lobit1, !dbg !31 + %60 = mul nuw nsw i32 %20, %.lobit1, !dbg !31 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !25 + %62 = add i32 %61, %59, !dbg !28 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !25 + %64 = add i32 %63, %60, !dbg !28 + %65 = trunc i32 %35 to i1, !dbg !32 + %66 = icmp sge i32 %44, %50, !dbg !32 + %67 = icmp ne i32 %44, %50, !dbg !32 + %68 = icmp sle i32 %56, %62, !dbg !32 + %69 = or i1 %67, %68, !dbg !32 + %70 = and i1 %66, %69, !dbg !32 + %.not4 = xor i1 %70, %65, !dbg !32 + %71 = icmp sge i32 %46, %52, !dbg !32 + %72 = icmp ne i32 %46, %52, !dbg !32 + %73 = icmp sle i32 %58, %64, !dbg !32 + %74 = or i1 %72, %73, !dbg !32 + %75 = and i1 %71, %74, !dbg !32 + %.not5 = xor i1 %75, %65, !dbg !32 + %76 = xor i32 %50, %44, !dbg !33 + %77 = xor i32 %52, %46, !dbg !33 + %78 = select i1 %.not4, i32 0, i32 %76, !dbg !34 + %79 = select i1 %.not5, i32 0, i32 %77, !dbg !34 + %80 = xor i32 %78, %33, !dbg !35 + %81 = xor i32 %79, %34, !dbg !35 + %82 = xor i32 %62, %56, !dbg !36 + %83 = xor i32 %64, %58, !dbg !36 + %84 = select i1 %.not4, i32 0, i32 %82, !dbg !37 + %85 = select i1 %.not5, i32 0, i32 %83, !dbg !37 + %86 = xor i32 %84, %15, !dbg !38 + %87 = xor i32 %85, %20, !dbg !38 + %88 = mul nuw nsw i32 %80, %39, !dbg !24 + %89 = mul nuw nsw i32 %81, %39, !dbg !24 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !25 + %91 = add i32 %88, %90, !dbg !28 + %92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 16, i32 31), !dbg !25 + %93 = add i32 %89, %92, !dbg !28 + %94 = mul nuw nsw i32 %80, %.lobit, !dbg !29 + %95 = mul nuw nsw i32 %81, %.lobit, !dbg !29 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !25 + %97 = add i32 %94, %96, !dbg !28 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 16, i32 31), !dbg !25 + %99 = add i32 %95, %98, !dbg !28 + %100 = mul nuw nsw i32 %86, %39, !dbg !30 + %101 = mul nuw nsw i32 %87, %39, !dbg !30 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 16, i32 31), !dbg !25 + %103 = add i32 %100, %102, !dbg !28 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !25 + %105 = add i32 %101, %104, !dbg !28 + %106 = mul nuw nsw i32 %86, %.lobit, !dbg !31 + %107 = mul nuw nsw i32 %87, %.lobit, !dbg !31 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !25 + %109 = add i32 %106, %108, !dbg !28 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !25 + %111 = add i32 %107, %110, !dbg !28 + %112 = icmp slt i32 %91, %97, !dbg !39 + %113 = icmp slt i32 %93, %99, !dbg !39 + %114 = icmp eq i32 %91, %97, !dbg !40 + %115 = icmp eq i32 %93, %99, !dbg !40 + %116 = icmp sgt i32 %103, %109, !dbg !41 + %117 = icmp sgt i32 %105, %111, !dbg !41 + %118 = and i1 %114, %116, !dbg !42 + %119 = and i1 %115, %117, !dbg !42 + %120 = or i1 %112, %118, !dbg !43 + %121 = or i1 %113, %119, !dbg !43 + %122 = zext i1 %120 to i32, !dbg !44 + %123 = zext i1 %121 to i32, !dbg !44 + %.not6 = icmp eq i32 %.lobit2, %122, !dbg !32 + %.not7 = icmp eq i32 %.lobit2, %123, !dbg !32 + %124 = xor i32 %91, %97, !dbg !33 + %125 = xor i32 %93, %99, !dbg !33 + %126 = select i1 %.not6, i32 0, i32 %124, !dbg !34 + %127 = select i1 %.not7, i32 0, i32 %125, !dbg !34 + %128 = xor i32 %126, %80, !dbg !35 + %129 = xor i32 %127, %81, !dbg !35 + %130 = xor i32 %103, %109, !dbg !36 + %131 = xor i32 %105, %111, !dbg !36 + %132 = select i1 %.not6, i32 0, i32 %130, !dbg !37 + %133 = select i1 %.not7, i32 0, i32 %131, !dbg !37 + %134 = xor i32 %132, %86, !dbg !38 + %135 = xor i32 %133, %87, !dbg !38 + %136 = mul nuw nsw i32 %128, %38, !dbg !24 + %137 = mul nuw nsw i32 %129, %38, !dbg !24 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !25 + %139 = add i32 %136, %138, !dbg !28 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 8, i32 31), !dbg !25 + %141 = add i32 %137, %140, !dbg !28 + %142 = mul nuw nsw i32 %128, %.lobit1, !dbg !29 + %143 = mul nuw nsw i32 %129, %.lobit1, !dbg !29 + %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !25 + %145 = add i32 %142, %144, !dbg !28 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !25 + %147 = add i32 %143, %146, !dbg !28 + %148 = mul nuw nsw i32 %134, %38, !dbg !30 + %149 = mul nuw nsw i32 %135, %38, !dbg !30 + %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !25 + %151 = add i32 %148, %150, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !25 + %153 = add i32 %149, %152, !dbg !28 + %154 = mul nuw nsw i32 %134, %.lobit1, !dbg !31 + %155 = mul nuw nsw i32 %135, %.lobit1, !dbg !31 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !25 + %157 = add i32 %154, %156, !dbg !28 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 8, i32 31), !dbg !25 + %159 = add i32 %155, %158, !dbg !28 + %160 = icmp slt i32 %139, %145, !dbg !39 + %161 = icmp slt i32 %141, %147, !dbg !39 + %162 = icmp eq i32 %139, %145, !dbg !40 + %163 = icmp eq i32 %141, %147, !dbg !40 + %164 = icmp sgt i32 %151, %157, !dbg !41 + %165 = icmp sgt i32 %153, %159, !dbg !41 + %166 = and i1 %162, %164, !dbg !42 + %167 = and i1 %163, %165, !dbg !42 + %168 = or i1 %160, %166, !dbg !43 + %169 = or i1 %161, %167, !dbg !43 + %170 = zext i1 %168 to i32, !dbg !44 + %171 = zext i1 %169 to i32, !dbg !44 + %.not8 = icmp eq i32 %.lobit2, %170, !dbg !32 + %.not9 = icmp eq i32 %.lobit2, %171, !dbg !32 + %172 = xor i32 %139, %145, !dbg !33 + %173 = xor i32 %141, %147, !dbg !33 + %174 = select i1 %.not8, i32 0, i32 %172, !dbg !34 + %175 = select i1 %.not9, i32 0, i32 %173, !dbg !34 + %176 = xor i32 %174, %128, !dbg !35 + %177 = xor i32 %175, %129, !dbg !35 + %178 = xor i32 %151, %157, !dbg !36 + %179 = xor i32 %153, %159, !dbg !36 + %180 = select i1 %.not8, i32 0, i32 %178, !dbg !37 + %181 = select i1 %.not9, i32 0, i32 %179, !dbg !37 + %182 = xor i32 %180, %134, !dbg !38 + %183 = xor i32 %181, %135, !dbg !38 + %184 = mul nuw nsw i32 %176, %40, !dbg !24 + %185 = mul nuw nsw i32 %177, %40, !dbg !24 + %186 = and i32 %12, 1, !dbg !25 + %187 = shl nuw nsw i32 %10, 1, !dbg !25 + %188 = and i32 %187, 48, !dbg !25 + %189 = or disjoint i32 %188, %21, !dbg !25 + %.idx = shl nuw nsw i32 %189, 3, !dbg !25 + %190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25 + %191 = getelementptr i32, ptr addrspace(3) %190, i32 %186, !dbg !25 + %192 = insertelement <1 x i32> poison, i32 %184, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #4, !dbg !25 + %193 = getelementptr i8, ptr addrspace(3) %190, i32 8, !dbg !25 + %194 = getelementptr i32, ptr addrspace(3) %193, i32 %186, !dbg !25 + %195 = insertelement <1 x i32> poison, i32 %185, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %195, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %196 = icmp samesign ult i32 %10, 128, !dbg !25 + %197 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25 + %198 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !25 + %200 = add i32 %199, %198, !dbg !28 + %201 = and i32 %10, 897, !dbg !25 + %202 = icmp eq i32 %201, 0, !dbg !25 + %203 = insertelement <1 x i32> poison, i32 %200, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %203, i1 %202) #4, !dbg !25 + %204 = getelementptr i8, ptr addrspace(3) %197, i32 256, !dbg !25 + %205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !25 + %207 = add i32 %206, %205, !dbg !28 + %208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %208, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %209 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %210 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %211 = mul nuw nsw i32 %176, %.lobit2, !dbg !29 + %212 = mul nuw nsw i32 %177, %.lobit2, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %213 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %213, i1 true) #4, !dbg !25 + %214 = insertelement <1 x i32> poison, i32 %212, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %214, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %215 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !25 + %217 = add i32 %216, %215, !dbg !28 + %218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %218, i1 %202) #4, !dbg !25 + %219 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !25 + %221 = add i32 %220, %219, !dbg !28 + %222 = insertelement <1 x i32> poison, i32 %221, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %222, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %223 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %224 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %225 = mul nuw nsw i32 %182, %40, !dbg !30 + %226 = mul nuw nsw i32 %183, %40, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %227 = insertelement <1 x i32> poison, i32 %225, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %227, i1 true) #4, !dbg !25 + %228 = insertelement <1 x i32> poison, i32 %226, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %228, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %229 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !25 + %231 = add i32 %230, %229, !dbg !28 + %232 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %232, i1 %202) #4, !dbg !25 + %233 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !25 + %235 = add i32 %234, %233, !dbg !28 + %236 = insertelement <1 x i32> poison, i32 %235, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %236, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %237 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %238 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %239 = mul nuw nsw i32 %182, %.lobit2, !dbg !31 + %240 = mul nuw nsw i32 %183, %.lobit2, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %241 = insertelement <1 x i32> poison, i32 %239, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %241, i1 true) #4, !dbg !25 + %242 = insertelement <1 x i32> poison, i32 %240, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %242, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !25 + %245 = add i32 %244, %243, !dbg !28 + %246 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %246, i1 %202) #4, !dbg !25 + %247 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !25 + %249 = add i32 %248, %247, !dbg !28 + %250 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %250, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %251 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %252 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %253 = icmp slt i32 %209, %223, !dbg !39 + %254 = icmp sge i32 %210, %224, !dbg !39 + %255 = icmp eq i32 %209, %223, !dbg !40 + %256 = icmp ne i32 %210, %224, !dbg !40 + %257 = icmp sgt i32 %237, %251, !dbg !41 + %258 = icmp sle i32 %238, %252, !dbg !41 + %259 = and i1 %255, %257, !dbg !42 + %.not15 = or i1 %256, %258, !dbg !43 + %260 = or i1 %253, %259, !dbg !43 + %.not12 = and i1 %254, %.not15, !dbg !44 + %261 = xor i32 %223, %209, !dbg !33 + %262 = xor i32 %224, %210, !dbg !33 + %263 = select i1 %260, i32 %261, i32 0, !dbg !34 + %264 = select i1 %.not12, i32 %262, i32 0, !dbg !34 + %265 = xor i32 %263, %176, !dbg !35 + %266 = xor i32 %264, %177, !dbg !35 + %267 = xor i32 %251, %237, !dbg !36 + %268 = xor i32 %252, %238, !dbg !36 + %269 = select i1 %260, i32 %267, i32 0, !dbg !37 + %270 = select i1 %.not12, i32 %268, i32 0, !dbg !37 + %271 = xor i32 %269, %182, !dbg !38 + %272 = xor i32 %270, %183, !dbg !38 + %273 = mul nuw nsw i32 %265, %39, !dbg !24 + %274 = mul nuw nsw i32 %266, %39, !dbg !24 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 16, i32 31), !dbg !25 + %276 = add i32 %273, %275, !dbg !28 + %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 16, i32 31), !dbg !25 + %278 = add i32 %274, %277, !dbg !28 + %279 = mul nuw nsw i32 %265, %.lobit, !dbg !29 + %280 = mul nuw nsw i32 %266, %.lobit, !dbg !29 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 16, i32 31), !dbg !25 + %282 = add i32 %279, %281, !dbg !28 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 16, i32 31), !dbg !25 + %284 = add i32 %280, %283, !dbg !28 + %285 = mul nuw nsw i32 %271, %39, !dbg !30 + %286 = mul nuw nsw i32 %272, %39, !dbg !30 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !25 + %288 = add i32 %285, %287, !dbg !28 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 16, i32 31), !dbg !25 + %290 = add i32 %286, %289, !dbg !28 + %291 = mul nuw nsw i32 %271, %.lobit, !dbg !31 + %292 = mul nuw nsw i32 %272, %.lobit, !dbg !31 + %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 16, i32 31), !dbg !25 + %294 = add i32 %293, %291, !dbg !28 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !25 + %296 = add i32 %295, %292, !dbg !28 + %297 = icmp slt i32 %276, %282, !dbg !39 + %298 = icmp sge i32 %278, %284, !dbg !39 + %299 = icmp eq i32 %276, %282, !dbg !40 + %300 = icmp ne i32 %278, %284, !dbg !40 + %301 = icmp sgt i32 %288, %294, !dbg !41 + %302 = icmp sle i32 %290, %296, !dbg !41 + %303 = and i1 %299, %301, !dbg !42 + %.not21 = or i1 %300, %302, !dbg !43 + %304 = or i1 %297, %303, !dbg !43 + %.not18 = and i1 %298, %.not21, !dbg !44 + %305 = xor i32 %276, %282, !dbg !33 + %306 = xor i32 %278, %284, !dbg !33 + %307 = select i1 %304, i32 %305, i32 0, !dbg !34 + %308 = select i1 %.not18, i32 %306, i32 0, !dbg !34 + %309 = xor i32 %307, %265, !dbg !35 + %310 = xor i32 %308, %266, !dbg !35 + %311 = xor i32 %294, %288, !dbg !36 + %312 = xor i32 %296, %290, !dbg !36 + %313 = select i1 %304, i32 %311, i32 0, !dbg !37 + %314 = select i1 %.not18, i32 %312, i32 0, !dbg !37 + %315 = xor i32 %313, %271, !dbg !38 + %316 = xor i32 %314, %272, !dbg !38 + %317 = mul nuw nsw i32 %309, %38, !dbg !24 + %318 = mul nuw nsw i32 %310, %38, !dbg !24 + %319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 8, i32 31), !dbg !25 + %320 = add i32 %317, %319, !dbg !28 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %318, i32 8, i32 31), !dbg !25 + %322 = add i32 %318, %321, !dbg !28 + %323 = mul nuw nsw i32 %309, %.lobit1, !dbg !29 + %324 = mul nuw nsw i32 %310, %.lobit1, !dbg !29 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 8, i32 31), !dbg !25 + %326 = add i32 %323, %325, !dbg !28 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 8, i32 31), !dbg !25 + %328 = add i32 %324, %327, !dbg !28 + %329 = mul nuw nsw i32 %315, %38, !dbg !30 + %330 = mul nuw nsw i32 %316, %38, !dbg !30 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 8, i32 31), !dbg !25 + %332 = add i32 %329, %331, !dbg !28 + %333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 8, i32 31), !dbg !25 + %334 = add i32 %330, %333, !dbg !28 + %335 = mul nuw nsw i32 %315, %.lobit1, !dbg !31 + %336 = mul nuw nsw i32 %316, %.lobit1, !dbg !31 + %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 8, i32 31), !dbg !25 + %338 = add i32 %337, %335, !dbg !28 + %339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 8, i32 31), !dbg !25 + %340 = add i32 %339, %336, !dbg !28 + %341 = icmp slt i32 %320, %326, !dbg !39 + %342 = icmp sge i32 %322, %328, !dbg !39 + %343 = icmp eq i32 %320, %326, !dbg !40 + %344 = icmp ne i32 %322, %328, !dbg !40 + %345 = icmp sgt i32 %332, %338, !dbg !41 + %346 = icmp sle i32 %334, %340, !dbg !41 + %347 = and i1 %343, %345, !dbg !42 + %.not27 = or i1 %344, %346, !dbg !43 + %348 = or i1 %341, %347, !dbg !43 + %.not24 = and i1 %342, %.not27, !dbg !44 + %349 = xor i32 %320, %326, !dbg !33 + %350 = xor i32 %322, %328, !dbg !33 + %351 = select i1 %348, i32 %349, i32 0, !dbg !34 + %352 = select i1 %.not24, i32 %350, i32 0, !dbg !34 + %353 = xor i32 %351, %309, !dbg !35 + %354 = xor i32 %352, %310, !dbg !35 + %355 = xor i32 %338, %332, !dbg !36 + %356 = xor i32 %340, %334, !dbg !36 + %357 = select i1 %348, i32 %355, i32 0, !dbg !37 + %358 = select i1 %.not24, i32 %356, i32 0, !dbg !37 + %359 = xor i32 %357, %315, !dbg !38 + %360 = xor i32 %358, %316, !dbg !38 + %361 = icmp slt i32 %353, %354, !dbg !39 + %362 = icmp eq i32 %353, %354, !dbg !40 + %363 = icmp sgt i32 %359, %360, !dbg !41 + %364 = and i1 %362, %363, !dbg !42 + %365 = or i1 %361, %364, !dbg !43 + %366 = xor i32 %354, %353, !dbg !33 + %367 = select i1 %365, i32 %366, i32 0, !dbg !34 + %368 = xor i32 %367, %353, !dbg !35 + %369 = xor i32 %367, %354, !dbg !35 + %370 = xor i32 %360, %359, !dbg !36 + %371 = select i1 %365, i32 %370, i32 0, !dbg !37 + %372 = xor i32 %371, %359, !dbg !38 + %373 = xor i32 %371, %360, !dbg !38 + %374 = mul nuw nsw i32 %368, %40, !dbg !24 + %375 = mul nuw nsw i32 %369, %40, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %376 = insertelement <1 x i32> poison, i32 %374, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %376, i1 true) #4, !dbg !25 + %377 = insertelement <1 x i32> poison, i32 %375, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %377, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %378 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 1, i32 31), !dbg !25 + %380 = add i32 %379, %378, !dbg !28 + %381 = insertelement <1 x i32> poison, i32 %380, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %381, i1 %202) #4, !dbg !25 + %382 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !25 + %384 = add i32 %383, %382, !dbg !28 + %385 = insertelement <1 x i32> poison, i32 %384, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %385, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %386 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %387 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %388 = mul nuw nsw i32 %368, %.lobit2, !dbg !29 + %389 = mul nuw nsw i32 %369, %.lobit2, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %390 = insertelement <1 x i32> poison, i32 %388, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %390, i1 true) #4, !dbg !25 + %391 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %391, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %392 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %393 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %392, i32 1, i32 31), !dbg !25 + %394 = add i32 %393, %392, !dbg !28 + %395 = insertelement <1 x i32> poison, i32 %394, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %395, i1 %202) #4, !dbg !25 + %396 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 1, i32 31), !dbg !25 + %398 = add i32 %397, %396, !dbg !28 + %399 = insertelement <1 x i32> poison, i32 %398, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %399, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %400 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %401 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %402 = mul nuw nsw i32 %372, %40, !dbg !30 + %403 = mul nuw nsw i32 %373, %40, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %404 = insertelement <1 x i32> poison, i32 %402, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %404, i1 true) #4, !dbg !25 + %405 = insertelement <1 x i32> poison, i32 %403, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %405, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %406 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 1, i32 31), !dbg !25 + %408 = add i32 %407, %406, !dbg !28 + %409 = insertelement <1 x i32> poison, i32 %408, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %409, i1 %202) #4, !dbg !25 + %410 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %411 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %410, i32 1, i32 31), !dbg !25 + %412 = add i32 %411, %410, !dbg !28 + %413 = insertelement <1 x i32> poison, i32 %412, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %413, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %414 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %415 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %416 = mul nuw nsw i32 %372, %.lobit2, !dbg !31 + %417 = mul nuw nsw i32 %373, %.lobit2, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %418 = insertelement <1 x i32> poison, i32 %416, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %418, i1 true) #4, !dbg !25 + %419 = insertelement <1 x i32> poison, i32 %417, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %419, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %420 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 1, i32 31), !dbg !25 + %422 = add i32 %421, %420, !dbg !28 + %423 = insertelement <1 x i32> poison, i32 %422, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %423, i1 %202) #4, !dbg !25 + %424 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !25 + %426 = add i32 %425, %424, !dbg !28 + %427 = insertelement <1 x i32> poison, i32 %426, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %427, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %428 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %429 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %430 = icmp slt i32 %386, %400, !dbg !39 + %431 = icmp slt i32 %387, %401, !dbg !39 + %432 = icmp eq i32 %386, %400, !dbg !40 + %433 = icmp eq i32 %387, %401, !dbg !40 + %434 = icmp sgt i32 %414, %428, !dbg !41 + %435 = icmp sgt i32 %415, %429, !dbg !41 + %436 = and i1 %432, %434, !dbg !42 + %437 = and i1 %433, %435, !dbg !42 + %438 = or i1 %430, %436, !dbg !43 + %439 = or i1 %431, %437, !dbg !43 + %440 = xor i32 %400, %386, !dbg !33 + %441 = xor i32 %401, %387, !dbg !33 + %442 = select i1 %438, i32 %440, i32 0, !dbg !34 + %443 = select i1 %439, i32 %441, i32 0, !dbg !34 + %444 = xor i32 %442, %368, !dbg !35 + %445 = xor i32 %443, %369, !dbg !35 + %446 = xor i32 %428, %414, !dbg !36 + %447 = xor i32 %429, %415, !dbg !36 + %448 = select i1 %438, i32 %446, i32 0, !dbg !37 + %449 = select i1 %439, i32 %447, i32 0, !dbg !37 + %450 = xor i32 %448, %372, !dbg !38 + %451 = xor i32 %449, %373, !dbg !38 + %452 = mul nuw nsw i32 %444, %39, !dbg !24 + %453 = mul nuw nsw i32 %445, %39, !dbg !24 + %454 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 16, i32 31), !dbg !25 + %455 = add i32 %452, %454, !dbg !28 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %453, i32 16, i32 31), !dbg !25 + %457 = add i32 %453, %456, !dbg !28 + %458 = mul nuw nsw i32 %444, %.lobit, !dbg !29 + %459 = mul nuw nsw i32 %445, %.lobit, !dbg !29 + %460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 16, i32 31), !dbg !25 + %461 = add i32 %458, %460, !dbg !28 + %462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 16, i32 31), !dbg !25 + %463 = add i32 %459, %462, !dbg !28 + %464 = mul nuw nsw i32 %450, %39, !dbg !30 + %465 = mul nuw nsw i32 %451, %39, !dbg !30 + %466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 16, i32 31), !dbg !25 + %467 = add i32 %464, %466, !dbg !28 + %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %465, i32 16, i32 31), !dbg !25 + %469 = add i32 %465, %468, !dbg !28 + %470 = mul nuw nsw i32 %450, %.lobit, !dbg !31 + %471 = mul nuw nsw i32 %451, %.lobit, !dbg !31 + %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 16, i32 31), !dbg !25 + %473 = add i32 %472, %470, !dbg !28 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !25 + %475 = add i32 %474, %471, !dbg !28 + %476 = icmp slt i32 %455, %461, !dbg !39 + %477 = icmp slt i32 %457, %463, !dbg !39 + %478 = icmp eq i32 %455, %461, !dbg !40 + %479 = icmp eq i32 %457, %463, !dbg !40 + %480 = icmp sgt i32 %467, %473, !dbg !41 + %481 = icmp sgt i32 %469, %475, !dbg !41 + %482 = and i1 %478, %480, !dbg !42 + %483 = and i1 %479, %481, !dbg !42 + %484 = or i1 %476, %482, !dbg !43 + %485 = or i1 %477, %483, !dbg !43 + %486 = xor i32 %455, %461, !dbg !33 + %487 = xor i32 %457, %463, !dbg !33 + %488 = select i1 %484, i32 %486, i32 0, !dbg !34 + %489 = select i1 %485, i32 %487, i32 0, !dbg !34 + %490 = xor i32 %488, %444, !dbg !35 + %491 = xor i32 %489, %445, !dbg !35 + %492 = xor i32 %473, %467, !dbg !36 + %493 = xor i32 %475, %469, !dbg !36 + %494 = select i1 %484, i32 %492, i32 0, !dbg !37 + %495 = select i1 %485, i32 %493, i32 0, !dbg !37 + %496 = xor i32 %494, %450, !dbg !38 + %497 = xor i32 %495, %451, !dbg !38 + %498 = mul nuw nsw i32 %490, %38, !dbg !24 + %499 = mul nuw nsw i32 %491, %38, !dbg !24 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 8, i32 31), !dbg !25 + %501 = add i32 %498, %500, !dbg !28 + %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 8, i32 31), !dbg !25 + %503 = add i32 %499, %502, !dbg !28 + %504 = mul nuw nsw i32 %490, %.lobit1, !dbg !29 + %505 = mul nuw nsw i32 %491, %.lobit1, !dbg !29 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 8, i32 31), !dbg !25 + %507 = add i32 %504, %506, !dbg !28 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %505, i32 8, i32 31), !dbg !25 + %509 = add i32 %505, %508, !dbg !28 + %510 = mul nuw nsw i32 %496, %38, !dbg !30 + %511 = mul nuw nsw i32 %497, %38, !dbg !30 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %510, i32 8, i32 31), !dbg !25 + %513 = add i32 %510, %512, !dbg !28 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 8, i32 31), !dbg !25 + %515 = add i32 %511, %514, !dbg !28 + %516 = mul nuw nsw i32 %496, %.lobit1, !dbg !31 + %517 = mul nuw nsw i32 %497, %.lobit1, !dbg !31 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 8, i32 31), !dbg !25 + %519 = add i32 %518, %516, !dbg !28 + %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !25 + %521 = add i32 %520, %517, !dbg !28 + %522 = icmp slt i32 %501, %507, !dbg !39 + %523 = icmp slt i32 %503, %509, !dbg !39 + %524 = icmp eq i32 %501, %507, !dbg !40 + %525 = icmp eq i32 %503, %509, !dbg !40 + %526 = icmp sgt i32 %513, %519, !dbg !41 + %527 = icmp sgt i32 %515, %521, !dbg !41 + %528 = and i1 %524, %526, !dbg !42 + %529 = and i1 %525, %527, !dbg !42 + %530 = or i1 %522, %528, !dbg !43 + %531 = or i1 %523, %529, !dbg !43 + %532 = xor i32 %519, %513, !dbg !36 + %533 = xor i32 %521, %515, !dbg !36 + %534 = select i1 %530, i32 %532, i32 0, !dbg !37 + %535 = select i1 %531, i32 %533, i32 0, !dbg !37 + %536 = xor i32 %534, %496, !dbg !38 + %537 = xor i32 %535, %497, !dbg !38 + %narrow = select i1 %18, i32 %33, i32 0, !dbg !45 + %538 = sext i32 %narrow to i64, !dbg !45 + %narrow28 = select i1 %18, i32 %34, i32 0, !dbg !45 + %539 = sext i32 %narrow28 to i64, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %540 = add nsw i64 %539, %538, !dbg !48 + %extelt.offset = lshr i64 %540, 32, !dbg !46 + %541 = trunc nuw i64 %extelt.offset to i32, !dbg !46 + %542 = trunc i64 %540 to i32, !dbg !46 + %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 16, i32 31), !dbg !46 + %544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 16, i32 31), !dbg !46 + %545 = insertelement <2 x i32> poison, i32 %543, i64 0, !dbg !46 + %546 = insertelement <2 x i32> %545, i32 %544, i64 1, !dbg !46 + %547 = bitcast <2 x i32> %546 to i64, !dbg !46 + %548 = add i64 %540, %547, !dbg !48 + %extelt.offset29 = lshr i64 %548, 32, !dbg !46 + %549 = trunc nuw i64 %extelt.offset29 to i32, !dbg !46 + %550 = trunc i64 %548 to i32, !dbg !46 + %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 8, i32 31), !dbg !46 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 8, i32 31), !dbg !46 + %553 = insertelement <2 x i32> poison, i32 %551, i64 0, !dbg !46 + %554 = insertelement <2 x i32> %553, i32 %552, i64 1, !dbg !46 + %555 = bitcast <2 x i32> %554 to i64, !dbg !46 + %556 = add i64 %548, %555, !dbg !48 + %557 = icmp eq i32 %11, 0, !dbg !46 + %558 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %21, !dbg !46 + %559 = getelementptr i64, ptr addrspace(3) %558, i32 %186, !dbg !46 + %560 = insertelement <1 x i64> poison, i64 %556, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %559, <1 x i64> %560, i1 %557) #4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %561 = icmp samesign ult i32 %10, 16, !dbg !46 + %562 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46 + %563 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %562, i1 %561) #4, !dbg !46 + %extelt.offset30 = lshr i64 %563, 32, !dbg !46 + %564 = trunc nuw i64 %extelt.offset30 to i32, !dbg !46 + %565 = trunc i64 %563 to i32, !dbg !46 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !46 + %567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 1, i32 31), !dbg !46 + %568 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !46 + %569 = insertelement <2 x i32> %568, i32 %567, i64 1, !dbg !46 + %570 = bitcast <2 x i32> %569 to i64, !dbg !46 + %571 = add i64 %563, %570, !dbg !48 + %572 = and i32 %10, 1009, !dbg !46 + %573 = icmp eq i32 %572, 0, !dbg !46 + %574 = insertelement <1 x i64> poison, i64 %571, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %562, <1 x i64> %574, i1 %573) #4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %575 = load i64, ptr addrspace(3) %558, align 16, !dbg !46 + %576 = trunc i64 %575 to i32, !dbg !49 + %577 = shl i32 %17, 4, !dbg !50 + %578 = or disjoint i32 %577, %21, !dbg !51 + %579 = sext i32 %578 to i64, !dbg !52 + %580 = getelementptr i32, ptr addrspace(1) %1, i64 %579, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %581 = and i32 %10, 3, !dbg !53 + %582 = shl nuw nsw i32 %581, 3, !dbg !53 + %583 = and i32 %187, 96, !dbg !53 + %584 = and i32 %10, 4, !dbg !53 + %585 = icmp eq i32 %584, 0, !dbg !53 + %586 = select i1 %585, i32 0, i32 192, !dbg !53 + %587 = select i1 %.not, i32 0, i32 260, !dbg !53 + %588 = or disjoint i32 %582, %583, !dbg !53 + %589 = xor i32 %588, %586, !dbg !53 + %590 = or disjoint i32 %589, %587, !dbg !53 + %591 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %590, !dbg !53 + %592 = insertelement <1 x i32> poison, i32 %536, i64 0, !dbg !53 + store <1 x i32> %592, ptr addrspace(3) %591, align 4, !dbg !53 + %593 = xor i32 %590, 4, !dbg !53 + %594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !53 + %595 = insertelement <1 x i32> poison, i32 %537, i64 0, !dbg !53 + store <1 x i32> %595, ptr addrspace(3) %594, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %596 = shl nuw nsw i32 %581, 5, !dbg !53 + %597 = and i32 %10, 28, !dbg !53 + %598 = select i1 %.not3, i32 0, i32 192, !dbg !53 + %599 = or disjoint i32 %596, %597, !dbg !53 + %600 = xor i32 %599, %598, !dbg !53 + %601 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %600, !dbg !53 + %602 = load i32, ptr addrspace(3) %601, align 4, !dbg !53 + %603 = xor i32 %600, 260, !dbg !53 + %604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !53 + %605 = load i32, ptr addrspace(3) %604, align 4, !dbg !53 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %602, i32 %605, ptr addrspace(1) %580, i1 %19) #4, !dbg !53 + %606 = sext i32 %16 to i64, !dbg !54 + %607 = getelementptr i32, ptr addrspace(1) %2, i64 %606, !dbg !54 + %608 = and i32 %10, 56, !dbg !55 + %609 = icmp eq i32 %608, 0, !dbg !55 + %610 = and i1 %609, %18, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %576, ptr addrspace(1) %607, i1 %610) #4, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 33, scope: !4) +!9 = !DILocation(line: 25, column: 44, scope: !4) +!10 = !DILocation(line: 25, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 38, scope: !4) +!13 = !DILocation(line: 34, column: 19, scope: !4) +!14 = !DILocation(line: 36, column: 38, scope: !4) +!15 = !DILocation(line: 36, column: 35, scope: !4) +!16 = !DILocation(line: 36, column: 45, scope: !4) +!17 = !DILocation(line: 36, column: 30, scope: !4) +!18 = !DILocation(line: 36, column: 54, scope: !4) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 67, scope: !4) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22) +!45 = !DILocation(line: 44, column: 34, scope: !4) +!46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47) +!47 = !DILocation(line: 45, column: 26, scope: !4) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47) +!49 = !DILocation(line: 48, column: 21, scope: !4) +!50 = !DILocation(line: 49, column: 35, scope: !4) +!51 = !DILocation(line: 49, column: 32, scope: !4) +!52 = !DILocation(line: 49, column: 25, scope: !4) +!53 = !DILocation(line: 49, column: 47, scope: !4) +!54 = !DILocation(line: 50, column: 25, scope: !4) +!55 = !DILocation(line: 50, column: 37, scope: !4) +!56 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..667bb9876d0bc5e0cfe0a547effb8a4a4bcb406d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,1410 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<155>; + .reg .b32 %r<504>; + .reg .b64 %rd<27>; + .loc 1 18 0 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:24:28 + mov.u32 %r105, %ctaid.x; + .loc 1 24 33 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:24:33 + shl.b32 %r106, %r105, 3; + ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 25 44 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:25:44 + mov.u32 %r107, %tid.x; + and.b32 %r108, %r107, 24; + and.b32 %r109, %r107, 7; + bfe.u32 %r110, %r107, 3, 3; + .loc 1 25 23 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:25:23 + or.b32 %r111, %r106, %r109; + or.b32 %r112, %r110, %r106; + .loc 1 26 21 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:26:21 + setp.lt.s32 %p1, %r111, 128; + setp.lt.s32 %p54, %r112, 128; + .loc 1 27 38 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:27:38 + or.b32 %r113, %r110, 8; + shl.b32 %r114, %r109, 1; + .loc 1 34 19 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:34:19 + bfe.s32 %r115, %r105, 28, 1; + shr.u32 %r116, %r115, 28; + add.s32 %r117, %r111, %r116; + .loc 1 36 35 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:36:35 + shl.b32 %r118, %r117, 4; + and.b32 %r119, %r118, -256; + add.s32 %r120, %r119, %r111; + .loc 1 36 45 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:36:45 + mad.lo.s32 %r121, %r110, 17, %r120; + add.s32 %r122, %r121, 136; + .loc 1 36 30 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:36:30 + mad.wide.s32 %rd1, %r121, 4, %rd8; + mad.wide.s32 %rd2, %r122, 4, %rd8; + .loc 1 36 54 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shr.u32 %r123, %r107, 4; + bfe.u32 %r124, %r107, 4, 1; + bfe.s32 %r125, %r107, 3, 1; + and.b32 %r126, %r107, 8; + bfe.u32 %r127, %r107, 3, 1; + bfe.s32 %r128, %r107, 5, 1; + bfe.u32 %r129, %r107, 5, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r130, %r127, 1; + xor.b32 %r131, %r124, 1; + xor.b32 %r132, %r129, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r133, %r1, %r130; + mul.lo.s32 %r134, %r2, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r133, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r136, %r135, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r137, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r138, %r137, %r134; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r139, %r1, %r127; + mul.lo.s32 %r140, %r2, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r139, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r142, %r141, %r139; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r143, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r144, %r143, %r140; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r145, %r130, %r110; + shl.b32 %r146, %r130, 3; + or.b32 %r147, %r145, %r146; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r148, %r145, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r149, %r148, %r145; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r150, %r147, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r151, %r150, %r147; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r152, %r110, %r127; + or.b32 %r153, %r152, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r154, %r152, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r155, %r154, %r152; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r153, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r157, %r156, %r153; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.b32 %r158, %r123, 1; + setp.ne.b32 %p56, %r158, 0; + setp.ge.s32 %p57, %r136, %r142; + setp.ne.b32 %p58, %r136, %r142; + setp.le.s32 %p59, %r149, %r155; + or.pred %p60, %p58, %p59; + and.pred %p61, %p57, %p60; + xor.pred %p62, %p61, %p56; + setp.ge.s32 %p63, %r138, %r144; + setp.ne.b32 %p64, %r138, %r144; + setp.le.s32 %p65, %r151, %r157; + or.pred %p66, %p64, %p65; + and.pred %p67, %p63, %p66; + xor.pred %p68, %p67, %p56; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r159, %r142, %r136; + xor.b32 %r160, %r144, %r138; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r161, 0, %r159, %p62; + selp.b32 %r162, 0, %r160, %p68; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r163, %r161, %r1; + xor.b32 %r164, %r162, %r2; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r165, %r155, %r149; + xor.b32 %r166, %r157, %r151; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r167, 0, %r165, %p62; + selp.b32 %r168, 0, %r166, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r169, %r167, %r110; + xor.b32 %r170, %r168, %r113; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r171, %r163, %r131; + mul.lo.s32 %r172, %r164, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r173, %r171, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r174, %r171, %r173; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r175, %r172, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r176, %r172, %r175; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r177, %r163, %r124; + mul.lo.s32 %r178, %r164, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r179, %r177, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r180, %r177, %r179; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r181, %r178, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r182, %r178, %r181; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r183, %r169, %r131; + mul.lo.s32 %r184, %r170, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r185, %r183, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r186, %r183, %r185; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r187, %r184, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r188, %r184, %r187; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r189, %r169, %r124; + mul.lo.s32 %r190, %r170, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r191, %r189, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r192, %r189, %r191; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r193, %r190, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r194, %r190, %r193; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p69, %r174, %r180; + setp.lt.s32 %p70, %r176, %r182; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p71, %r174, %r180; + setp.eq.b32 %p72, %r176, %r182; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p73, %r186, %r192; + setp.gt.s32 %p74, %r188, %r194; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p75, %p71, %p73; + and.pred %p76, %p72, %p74; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p77, %p69, %p75; + or.pred %p78, %p70, %p76; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r195, 1, 0, %p77; + selp.b32 %r196, 1, 0, %p78; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p79, %r129, %r195; + setp.eq.b32 %p80, %r129, %r196; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r197, %r174, %r180; + xor.b32 %r198, %r176, %r182; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r199, 0, %r197, %p79; + selp.b32 %r200, 0, %r198, %p80; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r201, %r199, %r163; + xor.b32 %r202, %r200, %r164; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r203, %r186, %r192; + xor.b32 %r204, %r188, %r194; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r205, 0, %r203, %p79; + selp.b32 %r206, 0, %r204, %p80; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r207, %r205, %r169; + xor.b32 %r208, %r206, %r170; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r209, %r201, %r130; + mul.lo.s32 %r210, %r202, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r211, %r209, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r212, %r209, %r211; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r213, %r210, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r214, %r210, %r213; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r215, %r201, %r127; + mul.lo.s32 %r216, %r202, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r217, %r215, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r218, %r215, %r217; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r219, %r216, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r220, %r216, %r219; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r221, %r207, %r130; + mul.lo.s32 %r222, %r208, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r223, %r221, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r224, %r221, %r223; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r225, %r222, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r226, %r222, %r225; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r227, %r207, %r127; + mul.lo.s32 %r228, %r208, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r229, %r227, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r230, %r227, %r229; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r231, %r228, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r232, %r228, %r231; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p81, %r212, %r218; + setp.lt.s32 %p82, %r214, %r220; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p83, %r212, %r218; + setp.eq.b32 %p84, %r214, %r220; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p85, %r224, %r230; + setp.gt.s32 %p86, %r226, %r232; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p87, %p83, %p85; + and.pred %p88, %p84, %p86; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p89, %p81, %p87; + or.pred %p90, %p82, %p88; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r233, 1, 0, %p89; + selp.b32 %r234, 1, 0, %p90; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p91, %r129, %r233; + setp.eq.b32 %p92, %r129, %r234; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r235, %r212, %r218; + xor.b32 %r236, %r214, %r220; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r237, 0, %r235, %p91; + selp.b32 %r238, 0, %r236, %p92; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r239, %r237, %r201; + xor.b32 %r240, %r238, %r202; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r241, %r224, %r230; + xor.b32 %r242, %r226, %r232; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r243, 0, %r241, %p91; + selp.b32 %r244, 0, %r242, %p92; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r245, %r243, %r207; + xor.b32 %r246, %r244, %r208; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r4, %r239, %r132; + mul.lo.s32 %r6, %r240, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shl.b32 %r247, %r107, 1; + and.b32 %r248, %r247, 48; + or.b32 %r249, %r248, %r114; + shl.b32 %r250, %r249, 3; + mov.b32 %r251, global_smem; + add.s32 %r252, %r251, %r250; + shl.b32 %r253, %r129, 2; + add.s32 %r3, %r252, %r253; + mov.pred %p3, -1; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r4; + // end inline asm + add.s32 %r5, %r3, 8; + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + bar.sync 0; + setp.lt.u32 %p5, %r107, 128; + shl.b32 %r254, %r107, 2; + add.s32 %r8, %r251, %r254; + // begin inline asm + @%p5 ld.shared.b32 %r7, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r255, %r7, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r10, %r255, %r7; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.b32 %r256, %r107, 897; + setp.eq.b32 %p6, %r256, 0; + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r10; + // end inline asm + add.s32 %r12, %r8, 256; + // begin inline asm + @%p5 ld.shared.b32 %r11, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r257, %r11, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r14, %r257, %r11; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r14; + // end inline asm + bar.sync 0; + ld.shared.b32 %r258, [%r252]; + ld.shared.b32 %r259, [%r252+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r16, %r239, %r129; + mul.lo.s32 %r18, %r240, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r16; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r18; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r19, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r260, %r19, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r22, %r260, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r22; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r23, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r261, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r26, %r261, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r26; + // end inline asm + bar.sync 0; + ld.shared.b32 %r262, [%r252]; + ld.shared.b32 %r263, [%r252+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r28, %r245, %r132; + mul.lo.s32 %r30, %r246, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r28; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r30; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r31, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r264, %r31, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r34, %r264, %r31; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r35, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r265, %r35, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r38, %r265, %r35; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r38; + // end inline asm + bar.sync 0; + ld.shared.b32 %r266, [%r252]; + ld.shared.b32 %r267, [%r252+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r40, %r245, %r129; + mul.lo.s32 %r42, %r246, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r42; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r43, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r268, %r43, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r46, %r268, %r43; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r46; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r47, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r269, %r47, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r50, %r269, %r47; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r50; + // end inline asm + bar.sync 0; + ld.shared.b32 %r270, [%r252]; + ld.shared.b32 %r271, [%r252+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p93, %r258, %r262; + setp.ge.s32 %p94, %r259, %r263; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p95, %r258, %r262; + setp.ne.b32 %p96, %r259, %r263; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p97, %r266, %r270; + setp.le.s32 %p98, %r267, %r271; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p99, %p95, %p97; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p100, %p96, %p98; + or.pred %p101, %p93, %p99; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p102, %p94, %p100; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r272, %r262, %r258; + xor.b32 %r273, %r263, %r259; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r274, %r272, 0, %p101; + selp.b32 %r275, %r273, 0, %p102; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r276, %r274, %r239; + xor.b32 %r277, %r275, %r240; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r278, %r270, %r266; + xor.b32 %r279, %r271, %r267; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r280, %r278, 0, %p101; + selp.b32 %r281, %r279, 0, %p102; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r282, %r280, %r245; + xor.b32 %r283, %r281, %r246; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r284, %r276, %r131; + mul.lo.s32 %r285, %r277, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r286, %r284, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r287, %r284, %r286; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r288, %r285, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r289, %r285, %r288; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r290, %r276, %r124; + mul.lo.s32 %r291, %r277, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r292, %r290, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r293, %r290, %r292; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r294, %r291, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r295, %r291, %r294; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r296, %r282, %r131; + mul.lo.s32 %r297, %r283, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r298, %r296, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r299, %r296, %r298; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r300, %r297, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r301, %r297, %r300; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r302, %r282, %r124; + mul.lo.s32 %r303, %r283, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r304, %r302, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r305, %r304, %r302; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r306, %r303, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r307, %r306, %r303; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p103, %r287, %r293; + setp.ge.s32 %p104, %r289, %r295; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p105, %r287, %r293; + setp.ne.b32 %p106, %r289, %r295; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p107, %r299, %r305; + setp.le.s32 %p108, %r301, %r307; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p109, %p105, %p107; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p110, %p106, %p108; + or.pred %p111, %p103, %p109; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p112, %p104, %p110; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r308, %r287, %r293; + xor.b32 %r309, %r289, %r295; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r310, %r308, 0, %p111; + selp.b32 %r311, %r309, 0, %p112; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r312, %r310, %r276; + xor.b32 %r313, %r311, %r277; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r314, %r305, %r299; + xor.b32 %r315, %r307, %r301; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r316, %r314, 0, %p111; + selp.b32 %r317, %r315, 0, %p112; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r318, %r316, %r282; + xor.b32 %r319, %r317, %r283; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r320, %r312, %r130; + mul.lo.s32 %r321, %r313, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r322, %r320, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r323, %r320, %r322; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r324, %r321, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r325, %r321, %r324; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r326, %r312, %r127; + mul.lo.s32 %r327, %r313, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r328, %r326, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r329, %r326, %r328; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r330, %r327, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r331, %r327, %r330; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r332, %r318, %r130; + mul.lo.s32 %r333, %r319, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r334, %r332, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r335, %r332, %r334; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r336, %r333, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r337, %r333, %r336; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r338, %r318, %r127; + mul.lo.s32 %r339, %r319, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r340, %r338, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r341, %r340, %r338; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r342, %r339, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r343, %r342, %r339; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p113, %r323, %r329; + setp.ge.s32 %p114, %r325, %r331; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p115, %r323, %r329; + setp.ne.b32 %p116, %r325, %r331; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p117, %r335, %r341; + setp.le.s32 %p118, %r337, %r343; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p119, %p115, %p117; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p120, %p116, %p118; + or.pred %p121, %p113, %p119; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p122, %p114, %p120; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r344, %r323, %r329; + xor.b32 %r345, %r325, %r331; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r346, %r344, 0, %p121; + selp.b32 %r347, %r345, 0, %p122; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r348, %r346, %r312; + xor.b32 %r349, %r347, %r313; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r350, %r341, %r335; + xor.b32 %r351, %r343, %r337; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r352, %r350, 0, %p121; + selp.b32 %r353, %r351, 0, %p122; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r354, %r352, %r318; + xor.b32 %r355, %r353, %r319; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p123, %r348, %r349; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p124, %r348, %r349; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p125, %r354, %r355; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p126, %p124, %p125; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p127, %p123, %p126; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r356, %r349, %r348; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r357, %r356, 0, %p127; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r358, %r357, %r348; + xor.b32 %r359, %r357, %r349; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r360, %r355, %r354; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r361, %r360, 0, %p127; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r362, %r361, %r354; + xor.b32 %r363, %r361, %r355; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r52, %r358, %r132; + mul.lo.s32 %r54, %r359, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r52; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r54; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r55, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r364, %r55, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r58, %r364, %r55; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r59, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r365, %r59, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r62, %r365, %r59; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r62; + // end inline asm + bar.sync 0; + ld.shared.b32 %r366, [%r252]; + ld.shared.b32 %r367, [%r252+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r64, %r358, %r129; + mul.lo.s32 %r66, %r359, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r66; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r67, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r368, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r70, %r368, %r67; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r70; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r71, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r369, %r71, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r74, %r369, %r71; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r74; + // end inline asm + bar.sync 0; + ld.shared.b32 %r370, [%r252]; + ld.shared.b32 %r371, [%r252+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r76, %r362, %r132; + mul.lo.s32 %r78, %r363, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r76; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r78; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r79, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r372, %r79, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r82, %r372, %r79; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r82; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r83, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r373, %r83, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r86, %r373, %r83; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r86; + // end inline asm + bar.sync 0; + ld.shared.b32 %r374, [%r252]; + ld.shared.b32 %r375, [%r252+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r88, %r362, %r129; + mul.lo.s32 %r90, %r363, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r90; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r91, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r376, %r91, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r94, %r376, %r91; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r94; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r95, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r377, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r98, %r377, %r95; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r98; + // end inline asm + bar.sync 0; + ld.shared.b32 %r378, [%r252]; + ld.shared.b32 %r379, [%r252+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p128, %r366, %r370; + setp.lt.s32 %p129, %r367, %r371; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p130, %r366, %r370; + setp.eq.b32 %p131, %r367, %r371; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p132, %r374, %r378; + setp.gt.s32 %p133, %r375, %r379; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p134, %p130, %p132; + and.pred %p135, %p131, %p133; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p136, %p128, %p134; + or.pred %p137, %p129, %p135; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r380, %r370, %r366; + xor.b32 %r381, %r371, %r367; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r382, %r380, 0, %p136; + selp.b32 %r383, %r381, 0, %p137; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r384, %r382, %r358; + xor.b32 %r385, %r383, %r359; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r386, %r378, %r374; + xor.b32 %r387, %r379, %r375; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r388, %r386, 0, %p136; + selp.b32 %r389, %r387, 0, %p137; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r390, %r388, %r362; + xor.b32 %r391, %r389, %r363; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r392, %r384, %r131; + mul.lo.s32 %r393, %r385, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r394, %r392, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r395, %r392, %r394; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r396, %r393, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r397, %r393, %r396; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r398, %r384, %r124; + mul.lo.s32 %r399, %r385, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r400, %r398, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r401, %r398, %r400; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r402, %r399, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r403, %r399, %r402; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r404, %r390, %r131; + mul.lo.s32 %r405, %r391, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r406, %r404, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r407, %r404, %r406; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r408, %r405, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r409, %r405, %r408; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r410, %r390, %r124; + mul.lo.s32 %r411, %r391, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r412, %r410, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r413, %r412, %r410; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r414, %r411, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r415, %r414, %r411; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p138, %r395, %r401; + setp.lt.s32 %p139, %r397, %r403; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p140, %r395, %r401; + setp.eq.b32 %p141, %r397, %r403; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p142, %r407, %r413; + setp.gt.s32 %p143, %r409, %r415; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + and.pred %p144, %p140, %p142; + and.pred %p145, %p141, %p143; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + or.pred %p146, %p138, %p144; + or.pred %p147, %p139, %p145; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r416, %r395, %r401; + xor.b32 %r417, %r397, %r403; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r418, %r416, 0, %p146; + selp.b32 %r419, %r417, 0, %p147; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r420, %r418, %r384; + xor.b32 %r421, %r419, %r385; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r422, %r413, %r407; + xor.b32 %r423, %r415, %r409; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r424, %r422, 0, %p146; + selp.b32 %r425, %r423, 0, %p147; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r426, %r424, %r390; + xor.b32 %r427, %r425, %r391; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r428, %r420, %r130; + mul.lo.s32 %r429, %r421, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r430, %r428, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r431, %r428, %r430; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r432, %r429, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r433, %r429, %r432; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r434, %r420, %r127; + mul.lo.s32 %r435, %r421, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r436, %r434, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r437, %r434, %r436; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r438, %r435, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r439, %r435, %r438; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r440, %r426, %r130; + mul.lo.s32 %r441, %r427, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r442, %r440, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r443, %r440, %r442; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r444, %r441, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r445, %r441, %r444; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + mul.lo.s32 %r446, %r426, %r127; + mul.lo.s32 %r447, %r427, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r448, %r446, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r449, %r448, %r446; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + shfl.sync.bfly.b32 %r450, %r447, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + add.s32 %r451, %r450, %r447; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.lt.s32 %p148, %r431, %r437; + setp.lt.s32 %p149, %r433, %r439; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.eq.b32 %p150, %r431, %r437; + setp.eq.b32 %p151, %r433, %r439; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + setp.gt.s32 %p152, %r443, %r449; + setp.gt.s32 %p153, %r445, %r451; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r452, %r449, %r443; + xor.b32 %r453, %r451, %r445; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + selp.b32 %r454, %r452, 0, %p152; + selp.b32 %r455, %r454, 0, %p150; + selp.b32 %r456, %r452, %r455, %p148; + selp.b32 %r457, %r453, 0, %p153; + selp.b32 %r458, %r457, 0, %p151; + selp.b32 %r459, %r453, %r458, %p149; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:41:67 ] + xor.b32 %r460, %r456, %r426; + xor.b32 %r461, %r459, %r427; +$L__tmp2: + .loc 1 44 34 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:44:34 + selp.b32 %r462, %r1, 0, %p1; + cvt.s64.s32 %rd11, %r462; + selp.b32 %r463, %r2, 0, %p1; + cvt.s64.s32 %rd12, %r463; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + add.s64 %rd13, %rd12, %rd11; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + mov.b64 {_, %r464}, %rd13; + cvt.u32.u64 %r465, %rd13; + shfl.sync.bfly.b32 %r466, %r465, 16, 31, -1; + shfl.sync.bfly.b32 %r467, %r464, 16, 31, -1; + cvt.u64.u32 %rd14, %r466; + cvt.u64.u32 %rd15, %r467; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + mov.b64 {_, %r468}, %rd18; + cvt.u32.u64 %r469, %rd18; + shfl.sync.bfly.b32 %r470, %r469, 8, 31, -1; + shfl.sync.bfly.b32 %r471, %r468, 8, 31, -1; + cvt.u64.u32 %rd19, %r470; + cvt.u64.u32 %rd20, %r471; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + add.s64 %rd3, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + setp.eq.b32 %p51, %r108, 0; + shl.b32 %r472, %r109, 4; + add.s32 %r473, %r251, %r472; + shl.b32 %r474, %r129, 3; + add.s32 %r99, %r473, %r474; + // begin inline asm + @%p51 st.shared.b64 [ %r99 + 0 ], %rd3; + // end inline asm + bar.sync 0; + setp.lt.u32 %p52, %r107, 16; + shl.b32 %r475, %r107, 3; + add.s32 %r100, %r251, %r475; + // begin inline asm + @%p52 ld.shared.b64 %rd4, [ %r100 + 0 ]; + // end inline asm + mov.b64 {_, %r476}, %rd4; + cvt.u32.u64 %r477, %rd4; + shfl.sync.bfly.b32 %r478, %r477, 1, 31, -1; + shfl.sync.bfly.b32 %r479, %r476, 1, 31, -1; + cvt.u64.u32 %rd23, %r478; + cvt.u64.u32 %rd24, %r479; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; + .loc 3 261 15 // standard.py:261:15 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + add.s64 %rd5, %rd4, %rd26; + .loc 3 291 36 // standard.py:291:36 @[ cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:45:26 ] + and.b32 %r480, %r107, 1009; + setp.eq.b32 %p53, %r480, 0; + // begin inline asm + @%p53 st.shared.b64 [ %r100 + 0 ], %rd5; + // end inline asm + bar.sync 0; + ld.shared.b32 %r104, [%r473]; +$L__tmp4: + .loc 1 49 35 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:49:35 + shl.b32 %r481, %r112, 4; + .loc 1 49 32 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:49:32 + or.b32 %r482, %r481, %r114; + .loc 1 49 25 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:49:25 + mad.wide.s32 %rd6, %r482, 4, %rd9; + .loc 1 49 47 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:49:47 + bar.sync 0; + and.b32 %r483, %r107, 3; + shl.b32 %r484, %r483, 3; + and.b32 %r485, %r247, 96; + bfe.s32 %r486, %r107, 2, 1; + and.b32 %r487, %r486, 192; + and.b32 %r488, %r125, 260; + or.b32 %r489, %r484, %r485; + xor.b32 %r490, %r489, %r487; + or.b32 %r491, %r490, %r488; + add.s32 %r492, %r251, %r491; + st.shared.b32 [%r492], %r460; + xor.b32 %r493, %r491, 4; + add.s32 %r494, %r251, %r493; + st.shared.b32 [%r494], %r461; + bar.sync 0; + shl.b32 %r495, %r483, 5; + and.b32 %r496, %r107, 28; + and.b32 %r497, %r128, 192; + or.b32 %r498, %r495, %r496; + xor.b32 %r499, %r498, %r497; + add.s32 %r500, %r251, %r499; + ld.shared.b32 %r102, [%r500]; + xor.b32 %r501, %r499, 4; + add.s32 %r502, %r251, %r501; + ld.shared.b32 %r103, [%r502+256]; + // begin inline asm + @%p54 st.global.v2.b32 [ %rd6 + 0 ], { %r102, %r103 }; + // end inline asm + .loc 1 50 25 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:50:25 + mad.wide.s32 %rd7, %r111, 4, %rd10; + .loc 1 50 37 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:50:37 + and.b32 %r503, %r107, 56; + setp.eq.b32 %p154, %r503, 0; + and.pred %p55, %p154, %p1; + // begin inline asm + @%p55 st.global.b32 [ %rd7 + 0 ], { %r104 }; + // end inline asm + .loc 1 50 4 // cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 106 +.b8 50 +.b8 110 +.b8 55 +.b8 119 +.b8 121 +.b8 54 +.b8 102 +.b8 113 +.b8 120 +.b8 120 +.b8 109 +.b8 119 +.b8 100 +.b8 111 +.b8 55 +.b8 99 +.b8 98 +.b8 119 +.b8 109 +.b8 98 +.b8 111 +.b8 117 +.b8 98 +.b8 101 +.b8 121 +.b8 121 +.b8 120 +.b8 108 +.b8 97 +.b8 104 +.b8 97 +.b8 112 +.b8 97 +.b8 103 +.b8 117 +.b8 102 +.b8 53 +.b8 113 +.b8 104 +.b8 111 +.b8 97 +.b8 112 +.b8 105 +.b8 122 +.b8 55 +.b8 53 +.b8 103 +.b8 106 +.b8 122 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 106 +.b8 50 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..fb148dcbfe8fdae0c33351238664be08add8c69c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/3/ZAQGY32WOMI65RBGOJKKAVWGBVH2WVLE74EK5Q5HPTVNPXZT6EXQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 8 : i32 loc(#loc148) + %xoffset_3 = arith.constant 8 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc151) + %xmask = arith.constant dense<128> : tensor<8x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<8x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + %r0_mask_10 = arith.constant dense : tensor<8x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<8x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<8x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<8x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<8x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<8x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<8x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<8x16xf32> to tensor<8x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : tensor<8x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<8x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<8x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc33) + %6 = tt.addptr %5, %4 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<8x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc38)), %idxs: tensor<8x16xi16> loc("idxs"(#loc38))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc41) + %5 = ub.poison : tensor<8x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi16> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc49) + %2 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi16> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc223) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc113))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc114) + tt.return %0 : tensor<64x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc116) + tt.return %1 : tensor<64x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc113))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc114) + tt.return %0 : tensor<64x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc116) + tt.return %1 : tensor<64x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc126))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc128) + tt.return %4 : tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %5 : tensor<8x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc135))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc136) + tt.return %0 : tensor<8x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc138) + tt.return %1 : tensor<8x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc132) + tt.return %cst : tensor<8x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc134) + tt.return %0 : tensor<8x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc135))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc136) + tt.return %0 : tensor<8x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc138) + tt.return %1 : tensor<8x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc132) + tt.return %cst : tensor<8x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc134) + tt.return %0 : tensor<8x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc49) + %3 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc113))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc114) + tt.return %0 : tensor<32x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc116) + tt.return %1 : tensor<32x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc49) + %4 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc113))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc114) + tt.return %0 : tensor<16x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc116) + tt.return %1 : tensor<16x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc49) + %5 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc113))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc114) + tt.return %0 : tensor<8x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc116) + tt.return %1 : tensor<8x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc113))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc114) + tt.return %0 : tensor<8xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc116) + tt.return %1 : tensor<8xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":46:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/j2/cj2n7wy6fqxxmwdo7cbwmboubeyyxlahapaguf5qhoapiz75gjze.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..bccb2a94e591d92768e278b70cba8db4ad1e2fa4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,204 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 31, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = icmp slt i32 %13, %4, !dbg !11 + %15 = lshr i32 %11, 5, !dbg !12 + %16 = and i32 %15, 3, !dbg !12 + %17 = sext i32 %13 to i64, !dbg !13 + %.frozen = freeze i64 %2, !dbg !14 + %18 = sdiv i64 %17, %.frozen, !dbg !14 + %19 = mul i64 %18, %.frozen, !dbg !13 + %.decomposed = sub i64 %17, %19, !dbg !13 + %20 = icmp sgt i32 %5, 0, !dbg !15 + br i1 %20, label %.lr.ph, label %._crit_edge, !dbg !15 + +.lr.ph: ; preds = %8 + %21 = mul i64 %3, %2, !dbg !16 + %22 = mul i64 %21, %18, !dbg !17 + %23 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %23, i64 %22, !dbg !15 + %24 = insertelement <4 x i1> poison, i1 %14, i64 0, !dbg !18 + %25 = shufflevector <4 x i1> %24, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18 + %26 = insertelement <4 x i32> poison, i32 %5, i64 0, !dbg !19 + %27 = shufflevector <4 x i32> %26, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !19 + br label %28, !dbg !15 + +28: ; preds = %.lr.ph, %28 + %29 = phi i32 [ 0, %.lr.ph ], [ %68, %28 ] + %30 = phi <4 x i64> [ zeroinitializer, %.lr.ph ], [ %67, %28 ] + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %32 = or disjoint i32 %29, %16, !dbg !21 + %33 = or disjoint i32 %32, 4, !dbg !21 + %34 = or disjoint i32 %32, 8, !dbg !21 + %35 = or disjoint i32 %32, 12, !dbg !21 + %36 = insertelement <4 x i32> poison, i32 %32, i64 0, !dbg !19 + %37 = insertelement <4 x i32> %36, i32 %33, i64 1, !dbg !19 + %38 = insertelement <4 x i32> %37, i32 %34, i64 2, !dbg !19 + %39 = insertelement <4 x i32> %38, i32 %35, i64 3, !dbg !19 + %40 = icmp slt <4 x i32> %39, %27, !dbg !19 + %41 = sext i32 %32 to i64, !dbg !22 + %42 = sext i32 %33 to i64, !dbg !22 + %43 = sext i32 %34 to i64, !dbg !22 + %44 = sext i32 %35 to i64, !dbg !22 + %45 = mul i64 %2, %41, !dbg !22 + %46 = mul i64 %2, %42, !dbg !22 + %47 = mul i64 %2, %43, !dbg !22 + %48 = mul i64 %2, %44, !dbg !22 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %45, !dbg !23 + %gep4 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %46, !dbg !23 + %gep6 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %47, !dbg !23 + %gep8 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %48, !dbg !23 + %49 = and <4 x i1> %25, %40, !dbg !18 + %50 = extractelement <4 x i1> %49, i64 0, !dbg !20 + %51 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %31, i1 %50) #5, !dbg !20 + %52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %53 = extractelement <4 x i1> %49, i64 1, !dbg !20 + %54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep4, i64 %52, i1 %53) #5, !dbg !20 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %56 = extractelement <4 x i1> %49, i64 2, !dbg !20 + %57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep6, i64 %55, i1 %56) #5, !dbg !20 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %59 = extractelement <4 x i1> %49, i64 3, !dbg !20 + %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep8, i64 %58, i1 %59) #5, !dbg !20 + %61 = insertelement <4 x i32> poison, i32 %51, i64 0, !dbg !24 + %62 = insertelement <4 x i32> %61, i32 %54, i64 1, !dbg !24 + %63 = insertelement <4 x i32> %62, i32 %57, i64 2, !dbg !24 + %64 = insertelement <4 x i32> %63, i32 %60, i64 3, !dbg !24 + %65 = sext <4 x i32> %64 to <4 x i64>, !dbg !24 + %66 = select <4 x i1> %49, <4 x i64> %65, <4 x i64> zeroinitializer, !dbg !25 + %67 = add <4 x i64> %66, %30, !dbg !25 + %68 = add i32 %29, 16, !dbg !15 + %69 = icmp slt i32 %68, %5, !dbg !15 + br i1 %69, label %28, label %._crit_edge.loopexit, !dbg !15 + +._crit_edge.loopexit: ; preds = %28 + %70 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %67), !dbg !26 + br label %._crit_edge, !dbg !26 + +._crit_edge: ; preds = %._crit_edge.loopexit, %8 + %71 = phi i64 [ 0, %8 ], [ %70, %._crit_edge.loopexit ], !dbg !26 + %.idx = shl nuw nsw i32 %12, 5, !dbg !30 + %72 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !30 + %73 = getelementptr i64, ptr addrspace(3) %72, i32 %16, !dbg !30 + %74 = insertelement <1 x i64> poison, i64 %71, i64 0, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %73, <1 x i64> %74, i1 true) #5, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %75 = icmp samesign ult i32 %11, 128, !dbg !30 + %76 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !30 + %77 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %76, i1 %75) #5, !dbg !30 + %extelt.offset = lshr i64 %77, 32, !dbg !30 + %78 = trunc nuw i64 %extelt.offset to i32, !dbg !30 + %79 = trunc i64 %77 to i32, !dbg !30 + %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 2, i32 31), !dbg !30 + %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 2, i32 31), !dbg !30 + %82 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !30 + %83 = insertelement <2 x i32> %82, i32 %81, i64 1, !dbg !30 + %84 = bitcast <2 x i32> %83 to i64, !dbg !30 + %85 = add i64 %77, %84, !dbg !26 + %extelt.offset2 = lshr i64 %85, 32, !dbg !30 + %86 = trunc nuw i64 %extelt.offset2 to i32, !dbg !30 + %87 = trunc i64 %85 to i32, !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !30 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !30 + %90 = insertelement <2 x i32> poison, i32 %88, i64 0, !dbg !30 + %91 = insertelement <2 x i32> %90, i32 %89, i64 1, !dbg !30 + %92 = bitcast <2 x i32> %91 to i64, !dbg !30 + %93 = add i64 %85, %92, !dbg !26 + %94 = and i32 %11, 899, !dbg !30 + %95 = icmp eq i32 %94, 0, !dbg !30 + %96 = insertelement <1 x i64> poison, i64 %93, i64 0, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %76, <1 x i64> %96, i1 %95) #5, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %97 = load i64, ptr addrspace(3) %72, align 16, !dbg !30 + %98 = trunc i64 %97 to i32, !dbg !31 + %99 = icmp slt i64 %2, 2, !dbg !32 + %100 = icmp sgt i64 %2, 1, !dbg !33 + %101 = select i1 %100, i64 %2, i64 0, !dbg !34 + %102 = zext i1 %99 to i64, !dbg !35 + %103 = add i64 %101, %102, !dbg !36 + %104 = mul i64 %18, %103, !dbg !37 + %105 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !38 + %106 = getelementptr i32, ptr addrspace(1) %105, i64 %104, !dbg !38 + %107 = and i32 %11, 96, !dbg !39 + %108 = icmp eq i32 %107, 0, !dbg !39 + %109 = and i1 %108, %14, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %98, ptr addrspace(1) %106, i1 %109) #5, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 22, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 37, scope: !4) +!13 = !DILocation(line: 26, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 19, scope: !4) +!15 = !DILocation(line: 30, column: 40, scope: !4) +!16 = !DILocation(line: 36, column: 54, scope: !4) +!17 = !DILocation(line: 36, column: 58, scope: !4) +!18 = !DILocation(line: 36, column: 73, scope: !4) +!19 = !DILocation(line: 32, column: 29, scope: !4) +!20 = !DILocation(line: 36, column: 63, scope: !4) +!21 = !DILocation(line: 31, column: 31, scope: !4) +!22 = !DILocation(line: 36, column: 43, scope: !4) +!23 = !DILocation(line: 36, column: 34, scope: !4) +!24 = !DILocation(line: 37, column: 23, scope: !4) +!25 = !DILocation(line: 40, column: 48, scope: !4) +!26 = !DILocation(line: 261, column: 15, scope: !27, inlinedAt: !29) +!27 = distinct !DILexicalBlockFile(scope: !4, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!29 = !DILocation(line: 41, column: 25, scope: !4) +!30 = !DILocation(line: 291, column: 36, scope: !27, inlinedAt: !29) +!31 = !DILocation(line: 42, column: 19, scope: !4) +!32 = !DILocation(line: 43, column: 49, scope: !4) +!33 = !DILocation(line: 43, column: 75, scope: !4) +!34 = !DILocation(line: 43, column: 66, scope: !4) +!35 = !DILocation(line: 43, scope: !4) +!36 = !DILocation(line: 43, column: 57, scope: !4) +!37 = !DILocation(line: 43, column: 34, scope: !4) +!38 = !DILocation(line: 43, column: 25, scope: !4) +!39 = !DILocation(line: 43, column: 88, scope: !4) +!40 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source b/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source new file mode 100644 index 0000000000000000000000000000000000000000..a34e714f9fd4c16526aaac100a70631f76a4c26d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2HBOMUT44J5WFCUWYGRFAAS3HGVNDHLHT7HCSXUCAOIKU6XGJNTA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source @@ -0,0 +1,193 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc43 = loc(unknown) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr1"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc85 = loc("input"(#loc41)) +#loc86 = loc("a"(#loc46)) +#loc87 = loc("b"(#loc46)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_0 = arith.constant 32 : i32 loc(#loc57) + %xoffset_1 = arith.constant 32 : i32 loc(#loc57) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc58) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc59) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<32x1xi32> loc(#loc60) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<32x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc61) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<32x1xi32> loc(#loc61) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63) + %x0 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc64) + %x0_8 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc64) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc64) + %x1 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc65) + %x1_10 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc65) + %x1_11 = arith.divsi %x1, %x1_10 : tensor<32x1xi64> loc(#loc65) + %_tmp3 = arith.constant 0 : i64 loc(#loc66) + %_tmp3_12 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc66) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c16_i32 = arith.constant 16 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<32x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69) + %tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70) + %tmp0_23 = tt.broadcast %x0_9 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc71) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc71) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<32x16xi64> loc(#loc71) + %tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<32x1xi64> loc(#loc73) + %tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<32x1xi64> loc(#loc73) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc74) + %tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<32x16xi64> loc(#loc74) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc75) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<32x16x!tt.ptr>, tensor<32x16xi64> loc(#loc75) + %tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc76) + %tmp0_34 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc76) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<32x16xi1> loc(#loc76) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc77) + %tmp0_38 = arith.fptosi %tmp0_37 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc77) + %tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<32x16x!tt.ptr> loc(#loc77) + %tmp1 = arith.extsi %tmp0_39 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc78) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x16xi64> loc(#loc79) + %_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<32x16xi1> loc(#loc80) + %_tmp3_41 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc80) + %_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<32x16xi1> loc(#loc80) + %_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc81) + scf.yield %_tmp3_43 : tensor<32x16xi64> loc(#loc27) + } loc(#loc67) + %tmp3 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc82) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc83) + %tmp5 = arith.trunci %tmp3_14 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc84) + %c1_i32 = arith.constant 1 : i32 loc(#loc31) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31) + %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc32) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc32) + %6 = arith.extui %5 : i1 to i32 loc(#loc32) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc33) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33) + %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33) + %10 = arith.extui %9 : i1 to i64 loc(#loc34) + %11 = arith.muli %ks0, %10 : i64 loc(#loc34) + %12 = arith.extsi %7 : i32 to i64 loc(#loc35) + %13 = arith.addi %12, %11 : i64 loc(#loc35) + %14 = tt.splat %13 : i64 -> tensor<32x1xi64> loc(#loc36) + %15 = arith.muli %x1_11, %14 : tensor<32x1xi64> loc(#loc36) + %16 = arith.addi %x0_9, %15 : tensor<32x1xi64> loc(#loc37) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc38) + %18 = tt.addptr %17, %16 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> loc(#loc38) + tt.store %18, %tmp5, %xmask_6 : tensor<32x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc41))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42) + tt.reduce.return %2 : i64 loc(#loc42) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc42) + tt.return %0 : tensor<32xi64> loc(#loc44) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc45) + tt.return %1 : tensor<32xi64> loc(#loc45) + } loc(#loc41) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc47) + tt.return %0 : i64 loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc49) + tt.return %1 : i64 loc(#loc49) + } loc(#loc46) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc56 = loc("xoffset"(#loc1)) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("xindex"(#loc3)) +#loc59 = loc("xindex"(#loc4)) +#loc60 = loc("xindex"(#loc5)) +#loc61 = loc("xmask"(#loc6)) +#loc62 = loc("r0_base"(#loc7)) +#loc63 = loc("r0_base"(#loc8)) +#loc64 = loc("x0"(#loc9)) +#loc65 = loc("x1"(#loc10)) +#loc66 = loc("_tmp3"(#loc11)) +#loc67 = loc("_tmp3"(#loc12)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp0"(#loc20)) +#loc76 = loc("tmp0"(#loc21)) +#loc77 = loc("tmp0"(#loc22)) +#loc78 = loc("tmp1"(#loc23)) +#loc79 = loc("tmp4"(#loc24)) +#loc80 = loc("_tmp3"(#loc25)) +#loc81 = loc("_tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc28)) +#loc83 = loc("tmp3"(#loc29)) +#loc84 = loc("tmp5"(#loc30)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f0ad76e2fe0ae8902e218af3fe0ca55bd368a674 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/__grp__triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec46f0a05da6190cc6b4a5a1ddd0c1ee3355af39 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"hash": "d27b4d62c7c8c987ce65c6725faf9fd300a6f1f86eddda35855b227ed6dece60", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..a5b40b32ffca1e27cd8f1baa858ac61976e66bfc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.llir @@ -0,0 +1,12657 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_zeros_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, ptr addrspace(1) readnone captures(none) %18, ptr addrspace(1) readnone captures(none) %19) local_unnamed_addr #0 !dbg !5 { + %21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %22 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %23 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %24 = and i32 %22, 7, !dbg !11 + %25 = shl i32 %23, 18, !dbg !12 + %26 = shl nuw nsw i32 %24, 21, !dbg !13 + %27 = add i32 %26, %25, !dbg !14 + %28 = sext i32 %27 to i64, !dbg !15 + %29 = shl i32 %22, 21, !dbg !16 + %30 = add i32 %25, %29, !dbg !17 + %31 = sext i32 %30 to i64, !dbg !18 + %32 = getelementptr bfloat, ptr addrspace(1) %1, i64 %28, !dbg !19 + %33 = getelementptr bfloat, ptr addrspace(1) %2, i64 %28, !dbg !20 + %34 = getelementptr bfloat, ptr addrspace(1) %7, i64 %31, !dbg !21 + %35 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !22 + %36 = lshr i32 %35, 5, !dbg !22 + %37 = and i32 %35, 240, !dbg !22 + %38 = lshr exact i32 %37, 4, !dbg !22 + %39 = or disjoint i32 %38, 16, !dbg !22 + %40 = or disjoint i32 %38, 32, !dbg !22 + %41 = or disjoint i32 %38, 48, !dbg !22 + %42 = or disjoint i32 %38, 64, !dbg !22 + %43 = or disjoint i32 %38, 80, !dbg !22 + %44 = or disjoint i32 %38, 96, !dbg !22 + %45 = or disjoint i32 %38, 112, !dbg !22 + %46 = lshr i32 %35, 1, !dbg !22 + %47 = and i32 %46, 112, !dbg !22 + %48 = lshr i32 %35, 2, !dbg !22 + %49 = and i32 %48, 7, !dbg !22 + %50 = or disjoint i32 %47, %49, !dbg !22 + %51 = or disjoint i32 %50, 8, !dbg !22 + %52 = icmp samesign ugt i32 %21, 15, !dbg !23 + br i1 %52, label %53, label %4460, !dbg !24 + +53: ; preds = %20 + %54 = add nsw i32 %21, -16, !dbg !25 + %55 = lshr i32 %54, 4, !dbg !26 + %56 = shl nuw nsw i32 %23, 2, !dbg !27 + %57 = add nuw nsw i32 %55, %56, !dbg !28 + %58 = and i32 %21, 15, !dbg !29 + %59 = shl nuw nsw i32 %24, 4, !dbg !30 + %60 = or disjoint i32 %59, %58, !dbg !31 + %61 = shl nuw nsw i32 %24, 8, !dbg !32 + %62 = shl nuw nsw i32 %58, 4, !dbg !33 + %63 = or disjoint i32 %61, %62, !dbg !34 + %64 = shl i32 %57, 7, !dbg !35 + %65 = shl i32 %22, 23, !dbg !36 + %66 = add i32 %64, %65, !dbg !37 + %67 = sext i32 %66 to i64, !dbg !38 + %68 = shl i32 %57, 18, !dbg !39 + %69 = add i32 %68, %65, !dbg !40 + %70 = sext i32 %69 to i64, !dbg !41 + %71 = shl nuw i32 %22, 16, !dbg !42 + %72 = shl i32 %57, 11, !dbg !42 + %73 = add i32 %72, %71, !dbg !42 + %74 = sext i32 %73 to i64, !dbg !43 + %75 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !44 + %76 = getelementptr bfloat, ptr addrspace(1) %5, i64 %70, !dbg !45 + %77 = getelementptr bfloat, ptr addrspace(1) %6, i64 %67, !dbg !46 + %78 = getelementptr float, ptr addrspace(1) %3, i64 %74, !dbg !47 + %79 = getelementptr float, ptr addrspace(1) %4, i64 %74, !dbg !48 + %80 = shl nuw nsw i32 %58, 7, !dbg !49 + %81 = or disjoint i32 %80, %38, !dbg !50 + %82 = or disjoint i32 %39, %80, !dbg !50 + %83 = or disjoint i32 %40, %80, !dbg !50 + %84 = or disjoint i32 %41, %80, !dbg !50 + %85 = or disjoint i32 %42, %80, !dbg !50 + %86 = or disjoint i32 %43, %80, !dbg !50 + %87 = or disjoint i32 %44, %80, !dbg !50 + %88 = or disjoint i32 %45, %80, !dbg !50 + %89 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !50 + %90 = insertelement <2 x i32> %89, i32 %50, i64 1, !dbg !50 + %91 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !50 + %92 = shufflevector <2 x i32> %90, <2 x i32> poison, <32 x i32> , !dbg !50 + %93 = shufflevector <2 x i32> %91, <2 x i32> poison, <32 x i32> zeroinitializer, !dbg !50 + %94 = or disjoint <32 x i32> %92, %93, !dbg !50 + %95 = shl nuw nsw i32 %81, 12, !dbg !51 + %96 = shl nuw nsw i32 %82, 12, !dbg !51 + %97 = shl nuw nsw i32 %83, 12, !dbg !51 + %98 = shl nuw nsw i32 %84, 12, !dbg !51 + %99 = shl nuw nsw i32 %85, 12, !dbg !51 + %100 = shl nuw nsw i32 %86, 12, !dbg !51 + %101 = shl nuw nsw i32 %87, 12, !dbg !51 + %102 = shl nuw nsw i32 %88, 12, !dbg !51 + %103 = zext nneg i32 %95 to i64, !dbg !54 + %104 = getelementptr bfloat, ptr addrspace(1) %75, i64 %103, !dbg !54 + %105 = zext nneg i32 %96 to i64, !dbg !54 + %106 = getelementptr bfloat, ptr addrspace(1) %75, i64 %105, !dbg !54 + %107 = zext nneg i32 %97 to i64, !dbg !54 + %108 = getelementptr bfloat, ptr addrspace(1) %75, i64 %107, !dbg !54 + %109 = zext nneg i32 %98 to i64, !dbg !54 + %110 = getelementptr bfloat, ptr addrspace(1) %75, i64 %109, !dbg !54 + %111 = zext nneg i32 %99 to i64, !dbg !54 + %112 = getelementptr bfloat, ptr addrspace(1) %75, i64 %111, !dbg !54 + %113 = zext nneg i32 %100 to i64, !dbg !54 + %114 = getelementptr bfloat, ptr addrspace(1) %75, i64 %113, !dbg !54 + %115 = zext nneg i32 %101 to i64, !dbg !54 + %116 = getelementptr bfloat, ptr addrspace(1) %75, i64 %115, !dbg !54 + %117 = zext nneg i32 %102 to i64, !dbg !54 + %118 = getelementptr bfloat, ptr addrspace(1) %75, i64 %117, !dbg !54 + %119 = shl nuw nsw i32 %35, 3, !dbg !55 + %120 = and i32 %119, 120, !dbg !55 + %121 = zext nneg i32 %120 to i64, !dbg !56 + %122 = getelementptr bfloat, ptr addrspace(1) %104, i64 %121, !dbg !56 + %123 = getelementptr bfloat, ptr addrspace(1) %106, i64 %121, !dbg !56 + %124 = getelementptr bfloat, ptr addrspace(1) %108, i64 %121, !dbg !56 + %125 = getelementptr bfloat, ptr addrspace(1) %110, i64 %121, !dbg !56 + %126 = getelementptr bfloat, ptr addrspace(1) %112, i64 %121, !dbg !56 + %127 = getelementptr bfloat, ptr addrspace(1) %114, i64 %121, !dbg !56 + %128 = getelementptr bfloat, ptr addrspace(1) %116, i64 %121, !dbg !56 + %129 = getelementptr bfloat, ptr addrspace(1) %118, i64 %121, !dbg !56 + %130 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %122) #3, !dbg !57 + %131 = extractvalue { i32, i32, i32, i32 } %130, 0, !dbg !57 + %132 = extractvalue { i32, i32, i32, i32 } %130, 1, !dbg !57 + %133 = extractvalue { i32, i32, i32, i32 } %130, 2, !dbg !57 + %134 = extractvalue { i32, i32, i32, i32 } %130, 3, !dbg !57 + %135 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %123) #3, !dbg !57 + %136 = extractvalue { i32, i32, i32, i32 } %135, 0, !dbg !57 + %137 = extractvalue { i32, i32, i32, i32 } %135, 1, !dbg !57 + %138 = extractvalue { i32, i32, i32, i32 } %135, 2, !dbg !57 + %139 = extractvalue { i32, i32, i32, i32 } %135, 3, !dbg !57 + %140 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %124) #3, !dbg !57 + %141 = extractvalue { i32, i32, i32, i32 } %140, 0, !dbg !57 + %142 = extractvalue { i32, i32, i32, i32 } %140, 1, !dbg !57 + %143 = extractvalue { i32, i32, i32, i32 } %140, 2, !dbg !57 + %144 = extractvalue { i32, i32, i32, i32 } %140, 3, !dbg !57 + %145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %125) #3, !dbg !57 + %146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !57 + %147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !57 + %148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !57 + %149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !57 + %150 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %126) #3, !dbg !57 + %151 = extractvalue { i32, i32, i32, i32 } %150, 0, !dbg !57 + %152 = extractvalue { i32, i32, i32, i32 } %150, 1, !dbg !57 + %153 = extractvalue { i32, i32, i32, i32 } %150, 2, !dbg !57 + %154 = extractvalue { i32, i32, i32, i32 } %150, 3, !dbg !57 + %155 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %127) #3, !dbg !57 + %156 = extractvalue { i32, i32, i32, i32 } %155, 0, !dbg !57 + %157 = extractvalue { i32, i32, i32, i32 } %155, 1, !dbg !57 + %158 = extractvalue { i32, i32, i32, i32 } %155, 2, !dbg !57 + %159 = extractvalue { i32, i32, i32, i32 } %155, 3, !dbg !57 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %128) #3, !dbg !57 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !57 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !57 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !57 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !57 + %165 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %129) #3, !dbg !57 + %166 = extractvalue { i32, i32, i32, i32 } %165, 0, !dbg !57 + %167 = extractvalue { i32, i32, i32, i32 } %165, 1, !dbg !57 + %168 = extractvalue { i32, i32, i32, i32 } %165, 2, !dbg !57 + %169 = extractvalue { i32, i32, i32, i32 } %165, 3, !dbg !57 + %170 = shl nuw nsw i32 %35, 4, !dbg !57 + %171 = and i32 %170, 112, !dbg !57 + %172 = shl nuw nsw i32 %37, 3, !dbg !57 + %173 = and i32 %35, 112, !dbg !57 + %174 = and i32 %35, 8, !dbg !57 + %175 = shl nuw nsw i32 %174, 11, !dbg !57 + %176 = or disjoint i32 %171, %172, !dbg !57 + %177 = xor i32 %176, %173, !dbg !57 + %178 = or disjoint i32 %177, %175, !dbg !57 + %179 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %178, !dbg !57 + %180 = insertelement <4 x i32> poison, i32 %131, i64 0, !dbg !57 + %181 = insertelement <4 x i32> %180, i32 %132, i64 1, !dbg !57 + %182 = insertelement <4 x i32> %181, i32 %133, i64 2, !dbg !57 + %183 = insertelement <4 x i32> %182, i32 %134, i64 3, !dbg !57 + store <4 x i32> %183, ptr addrspace(3) %179, align 16, !dbg !57 + %184 = or disjoint i32 %178, 2048, !dbg !57 + %185 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %184, !dbg !57 + %186 = insertelement <4 x i32> poison, i32 %136, i64 0, !dbg !57 + %187 = insertelement <4 x i32> %186, i32 %137, i64 1, !dbg !57 + %188 = insertelement <4 x i32> %187, i32 %138, i64 2, !dbg !57 + %189 = insertelement <4 x i32> %188, i32 %139, i64 3, !dbg !57 + store <4 x i32> %189, ptr addrspace(3) %185, align 16, !dbg !57 + %190 = or disjoint i32 %178, 4096, !dbg !57 + %191 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %190, !dbg !57 + %192 = insertelement <4 x i32> poison, i32 %141, i64 0, !dbg !57 + %193 = insertelement <4 x i32> %192, i32 %142, i64 1, !dbg !57 + %194 = insertelement <4 x i32> %193, i32 %143, i64 2, !dbg !57 + %195 = insertelement <4 x i32> %194, i32 %144, i64 3, !dbg !57 + store <4 x i32> %195, ptr addrspace(3) %191, align 16, !dbg !57 + %196 = or disjoint i32 %178, 6144, !dbg !57 + %197 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %196, !dbg !57 + %198 = insertelement <4 x i32> poison, i32 %146, i64 0, !dbg !57 + %199 = insertelement <4 x i32> %198, i32 %147, i64 1, !dbg !57 + %200 = insertelement <4 x i32> %199, i32 %148, i64 2, !dbg !57 + %201 = insertelement <4 x i32> %200, i32 %149, i64 3, !dbg !57 + store <4 x i32> %201, ptr addrspace(3) %197, align 16, !dbg !57 + %202 = or disjoint i32 %178, 8192, !dbg !57 + %203 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %202, !dbg !57 + %204 = insertelement <4 x i32> poison, i32 %151, i64 0, !dbg !57 + %205 = insertelement <4 x i32> %204, i32 %152, i64 1, !dbg !57 + %206 = insertelement <4 x i32> %205, i32 %153, i64 2, !dbg !57 + %207 = insertelement <4 x i32> %206, i32 %154, i64 3, !dbg !57 + store <4 x i32> %207, ptr addrspace(3) %203, align 16, !dbg !57 + %208 = or disjoint i32 %178, 10240, !dbg !57 + %209 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %208, !dbg !57 + %210 = insertelement <4 x i32> poison, i32 %156, i64 0, !dbg !57 + %211 = insertelement <4 x i32> %210, i32 %157, i64 1, !dbg !57 + %212 = insertelement <4 x i32> %211, i32 %158, i64 2, !dbg !57 + %213 = insertelement <4 x i32> %212, i32 %159, i64 3, !dbg !57 + store <4 x i32> %213, ptr addrspace(3) %209, align 16, !dbg !57 + %214 = or disjoint i32 %178, 12288, !dbg !57 + %215 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %214, !dbg !57 + %216 = insertelement <4 x i32> poison, i32 %161, i64 0, !dbg !57 + %217 = insertelement <4 x i32> %216, i32 %162, i64 1, !dbg !57 + %218 = insertelement <4 x i32> %217, i32 %163, i64 2, !dbg !57 + %219 = insertelement <4 x i32> %218, i32 %164, i64 3, !dbg !57 + store <4 x i32> %219, ptr addrspace(3) %215, align 16, !dbg !57 + %220 = or disjoint i32 %178, 14336, !dbg !57 + %221 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %220, !dbg !57 + %222 = insertelement <4 x i32> poison, i32 %166, i64 0, !dbg !57 + %223 = insertelement <4 x i32> %222, i32 %167, i64 1, !dbg !57 + %224 = insertelement <4 x i32> %223, i32 %168, i64 2, !dbg !57 + %225 = insertelement <4 x i32> %224, i32 %169, i64 3, !dbg !57 + store <4 x i32> %225, ptr addrspace(3) %221, align 16, !dbg !57 + %226 = shl nuw nsw i32 %81, 7, !dbg !58 + %227 = shl nuw nsw i32 %82, 7, !dbg !58 + %228 = shl nuw nsw i32 %83, 7, !dbg !58 + %229 = shl nuw nsw i32 %84, 7, !dbg !58 + %230 = shl nuw nsw i32 %85, 7, !dbg !58 + %231 = shl nuw nsw i32 %86, 7, !dbg !58 + %232 = shl nuw nsw i32 %87, 7, !dbg !58 + %233 = shl nuw nsw i32 %88, 7, !dbg !58 + %234 = zext nneg i32 %226 to i64, !dbg !60 + %235 = getelementptr bfloat, ptr addrspace(1) %76, i64 %234, !dbg !60 + %236 = zext nneg i32 %227 to i64, !dbg !60 + %237 = getelementptr bfloat, ptr addrspace(1) %76, i64 %236, !dbg !60 + %238 = zext nneg i32 %228 to i64, !dbg !60 + %239 = getelementptr bfloat, ptr addrspace(1) %76, i64 %238, !dbg !60 + %240 = zext nneg i32 %229 to i64, !dbg !60 + %241 = getelementptr bfloat, ptr addrspace(1) %76, i64 %240, !dbg !60 + %242 = zext nneg i32 %230 to i64, !dbg !60 + %243 = getelementptr bfloat, ptr addrspace(1) %76, i64 %242, !dbg !60 + %244 = zext nneg i32 %231 to i64, !dbg !60 + %245 = getelementptr bfloat, ptr addrspace(1) %76, i64 %244, !dbg !60 + %246 = zext nneg i32 %232 to i64, !dbg !60 + %247 = getelementptr bfloat, ptr addrspace(1) %76, i64 %246, !dbg !60 + %248 = zext nneg i32 %233 to i64, !dbg !60 + %249 = getelementptr bfloat, ptr addrspace(1) %76, i64 %248, !dbg !60 + %250 = getelementptr bfloat, ptr addrspace(1) %235, i64 %121, !dbg !61 + %251 = getelementptr bfloat, ptr addrspace(1) %237, i64 %121, !dbg !61 + %252 = getelementptr bfloat, ptr addrspace(1) %239, i64 %121, !dbg !61 + %253 = getelementptr bfloat, ptr addrspace(1) %241, i64 %121, !dbg !61 + %254 = getelementptr bfloat, ptr addrspace(1) %243, i64 %121, !dbg !61 + %255 = getelementptr bfloat, ptr addrspace(1) %245, i64 %121, !dbg !61 + %256 = getelementptr bfloat, ptr addrspace(1) %247, i64 %121, !dbg !61 + %257 = getelementptr bfloat, ptr addrspace(1) %249, i64 %121, !dbg !61 + %258 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %250) #3, !dbg !62 + %259 = extractvalue { i32, i32, i32, i32 } %258, 0, !dbg !62 + %260 = extractvalue { i32, i32, i32, i32 } %258, 1, !dbg !62 + %261 = extractvalue { i32, i32, i32, i32 } %258, 2, !dbg !62 + %262 = extractvalue { i32, i32, i32, i32 } %258, 3, !dbg !62 + %263 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %251) #3, !dbg !62 + %264 = extractvalue { i32, i32, i32, i32 } %263, 0, !dbg !62 + %265 = extractvalue { i32, i32, i32, i32 } %263, 1, !dbg !62 + %266 = extractvalue { i32, i32, i32, i32 } %263, 2, !dbg !62 + %267 = extractvalue { i32, i32, i32, i32 } %263, 3, !dbg !62 + %268 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %252) #3, !dbg !62 + %269 = extractvalue { i32, i32, i32, i32 } %268, 0, !dbg !62 + %270 = extractvalue { i32, i32, i32, i32 } %268, 1, !dbg !62 + %271 = extractvalue { i32, i32, i32, i32 } %268, 2, !dbg !62 + %272 = extractvalue { i32, i32, i32, i32 } %268, 3, !dbg !62 + %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %253) #3, !dbg !62 + %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !62 + %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !62 + %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !62 + %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !62 + %278 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %254) #3, !dbg !62 + %279 = extractvalue { i32, i32, i32, i32 } %278, 0, !dbg !62 + %280 = extractvalue { i32, i32, i32, i32 } %278, 1, !dbg !62 + %281 = extractvalue { i32, i32, i32, i32 } %278, 2, !dbg !62 + %282 = extractvalue { i32, i32, i32, i32 } %278, 3, !dbg !62 + %283 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %255) #3, !dbg !62 + %284 = extractvalue { i32, i32, i32, i32 } %283, 0, !dbg !62 + %285 = extractvalue { i32, i32, i32, i32 } %283, 1, !dbg !62 + %286 = extractvalue { i32, i32, i32, i32 } %283, 2, !dbg !62 + %287 = extractvalue { i32, i32, i32, i32 } %283, 3, !dbg !62 + %288 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %256) #3, !dbg !62 + %289 = extractvalue { i32, i32, i32, i32 } %288, 0, !dbg !62 + %290 = extractvalue { i32, i32, i32, i32 } %288, 1, !dbg !62 + %291 = extractvalue { i32, i32, i32, i32 } %288, 2, !dbg !62 + %292 = extractvalue { i32, i32, i32, i32 } %288, 3, !dbg !62 + %293 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %257) #3, !dbg !62 + %294 = extractvalue { i32, i32, i32, i32 } %293, 0, !dbg !62 + %295 = extractvalue { i32, i32, i32, i32 } %293, 1, !dbg !62 + %296 = extractvalue { i32, i32, i32, i32 } %293, 2, !dbg !62 + %297 = extractvalue { i32, i32, i32, i32 } %293, 3, !dbg !62 + %298 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %178, !dbg !62 + %299 = insertelement <4 x i32> poison, i32 %259, i64 0, !dbg !62 + %300 = insertelement <4 x i32> %299, i32 %260, i64 1, !dbg !62 + %301 = insertelement <4 x i32> %300, i32 %261, i64 2, !dbg !62 + %302 = insertelement <4 x i32> %301, i32 %262, i64 3, !dbg !62 + store <4 x i32> %302, ptr addrspace(3) %298, align 16, !dbg !62 + %303 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %184, !dbg !62 + %304 = insertelement <4 x i32> poison, i32 %264, i64 0, !dbg !62 + %305 = insertelement <4 x i32> %304, i32 %265, i64 1, !dbg !62 + %306 = insertelement <4 x i32> %305, i32 %266, i64 2, !dbg !62 + %307 = insertelement <4 x i32> %306, i32 %267, i64 3, !dbg !62 + store <4 x i32> %307, ptr addrspace(3) %303, align 16, !dbg !62 + %308 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %190, !dbg !62 + %309 = insertelement <4 x i32> poison, i32 %269, i64 0, !dbg !62 + %310 = insertelement <4 x i32> %309, i32 %270, i64 1, !dbg !62 + %311 = insertelement <4 x i32> %310, i32 %271, i64 2, !dbg !62 + %312 = insertelement <4 x i32> %311, i32 %272, i64 3, !dbg !62 + store <4 x i32> %312, ptr addrspace(3) %308, align 16, !dbg !62 + %313 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %196, !dbg !62 + %314 = insertelement <4 x i32> poison, i32 %274, i64 0, !dbg !62 + %315 = insertelement <4 x i32> %314, i32 %275, i64 1, !dbg !62 + %316 = insertelement <4 x i32> %315, i32 %276, i64 2, !dbg !62 + %317 = insertelement <4 x i32> %316, i32 %277, i64 3, !dbg !62 + store <4 x i32> %317, ptr addrspace(3) %313, align 16, !dbg !62 + %318 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %202, !dbg !62 + %319 = insertelement <4 x i32> poison, i32 %279, i64 0, !dbg !62 + %320 = insertelement <4 x i32> %319, i32 %280, i64 1, !dbg !62 + %321 = insertelement <4 x i32> %320, i32 %281, i64 2, !dbg !62 + %322 = insertelement <4 x i32> %321, i32 %282, i64 3, !dbg !62 + store <4 x i32> %322, ptr addrspace(3) %318, align 16, !dbg !62 + %323 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %208, !dbg !62 + %324 = insertelement <4 x i32> poison, i32 %284, i64 0, !dbg !62 + %325 = insertelement <4 x i32> %324, i32 %285, i64 1, !dbg !62 + %326 = insertelement <4 x i32> %325, i32 %286, i64 2, !dbg !62 + %327 = insertelement <4 x i32> %326, i32 %287, i64 3, !dbg !62 + store <4 x i32> %327, ptr addrspace(3) %323, align 16, !dbg !62 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %214, !dbg !62 + %329 = insertelement <4 x i32> poison, i32 %289, i64 0, !dbg !62 + %330 = insertelement <4 x i32> %329, i32 %290, i64 1, !dbg !62 + %331 = insertelement <4 x i32> %330, i32 %291, i64 2, !dbg !62 + %332 = insertelement <4 x i32> %331, i32 %292, i64 3, !dbg !62 + store <4 x i32> %332, ptr addrspace(3) %328, align 16, !dbg !62 + %333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %220, !dbg !62 + %334 = insertelement <4 x i32> poison, i32 %294, i64 0, !dbg !62 + %335 = insertelement <4 x i32> %334, i32 %295, i64 1, !dbg !62 + %336 = insertelement <4 x i32> %335, i32 %296, i64 2, !dbg !62 + %337 = insertelement <4 x i32> %336, i32 %297, i64 3, !dbg !62 + store <4 x i32> %337, ptr addrspace(3) %333, align 16, !dbg !62 + %338 = extractelement <32 x i32> %94, i64 2, !dbg !63 + %339 = zext nneg i32 %338 to i64, !dbg !63 + %340 = getelementptr float, ptr addrspace(1) %79, i64 %339, !dbg !63 + %341 = extractelement <32 x i32> %94, i64 0, !dbg !63 + %342 = zext nneg i32 %341 to i64, !dbg !63 + %343 = getelementptr float, ptr addrspace(1) %79, i64 %342, !dbg !63 + %344 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %340) #3, !dbg !64 + %345 = bitcast i32 %344 to float, !dbg !64 + %346 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %343) #3, !dbg !64 + %347 = bitcast i32 %346 to float, !dbg !64 + %348 = getelementptr float, ptr addrspace(1) %78, i64 %339, !dbg !65 + %349 = getelementptr float, ptr addrspace(1) %78, i64 %342, !dbg !65 + %350 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %348) #3, !dbg !66 + %351 = bitcast i32 %350 to float, !dbg !66 + %352 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %349) #3, !dbg !66 + %353 = bitcast i32 %352 to float, !dbg !66 + %354 = fcmp oeq float %351, 0xFFF0000000000000, !dbg !67 + %355 = fcmp oeq float %353, 0xFFF0000000000000, !dbg !67 + %356 = select i1 %354, float 0.000000e+00, float %351, !dbg !68 + %357 = select i1 %355, float 0.000000e+00, float %353, !dbg !68 + %358 = zext nneg i32 %63 to i64, !dbg !69 + %359 = getelementptr i32, ptr addrspace(1) %9, i64 %358, !dbg !69 + %360 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %359) #3, !dbg !70 + %361 = shl i32 %360, 7, !dbg !71 + %362 = zext nneg i32 %60 to i64, !dbg !72 + %363 = getelementptr i32, ptr addrspace(1) %8, i64 %362, !dbg !72 + %364 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %363) #3, !dbg !73 + %365 = and i32 %35, 3, !dbg !74 + %366 = or disjoint i32 %361, %38, !dbg !75 + %367 = or disjoint i32 %361, %39, !dbg !75 + %368 = or disjoint i32 %361, %40, !dbg !75 + %369 = or disjoint i32 %361, %41, !dbg !75 + %370 = shl i32 %366, 7, !dbg !76 + %371 = shl i32 %367, 7, !dbg !76 + %372 = shl i32 %368, 7, !dbg !76 + %373 = shl i32 %369, 7, !dbg !76 + %374 = sext i32 %370 to i64, !dbg !78 + %375 = getelementptr bfloat, ptr addrspace(1) %32, i64 %374, !dbg !78 + %376 = sext i32 %371 to i64, !dbg !78 + %377 = getelementptr bfloat, ptr addrspace(1) %32, i64 %376, !dbg !78 + %378 = sext i32 %372 to i64, !dbg !78 + %379 = getelementptr bfloat, ptr addrspace(1) %32, i64 %378, !dbg !78 + %380 = sext i32 %373 to i64, !dbg !78 + %381 = getelementptr bfloat, ptr addrspace(1) %32, i64 %380, !dbg !78 + %382 = getelementptr bfloat, ptr addrspace(1) %375, i64 %121, !dbg !79 + %383 = getelementptr bfloat, ptr addrspace(1) %377, i64 %121, !dbg !79 + %384 = getelementptr bfloat, ptr addrspace(1) %379, i64 %121, !dbg !79 + %385 = getelementptr bfloat, ptr addrspace(1) %381, i64 %121, !dbg !79 + %386 = getelementptr bfloat, ptr addrspace(1) %33, i64 %374, !dbg !80 + %387 = getelementptr bfloat, ptr addrspace(1) %33, i64 %376, !dbg !80 + %388 = getelementptr bfloat, ptr addrspace(1) %33, i64 %378, !dbg !80 + %389 = getelementptr bfloat, ptr addrspace(1) %33, i64 %380, !dbg !80 + %390 = getelementptr bfloat, ptr addrspace(1) %386, i64 %121, !dbg !81 + %391 = getelementptr bfloat, ptr addrspace(1) %387, i64 %121, !dbg !81 + %392 = getelementptr bfloat, ptr addrspace(1) %388, i64 %121, !dbg !81 + %393 = getelementptr bfloat, ptr addrspace(1) %389, i64 %121, !dbg !81 + %394 = shl i32 %364, 1, !dbg !82 + %395 = zext nneg i32 %22 to i64, !dbg !83 + %396 = getelementptr i64, ptr addrspace(1) %16, i64 %395, !dbg !83 + %397 = icmp sgt i32 %394, 0, !dbg !84 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %396, i1 %397) #3, !dbg !85 + %399 = icmp sgt i64 %398, %339, !dbg !86 + %400 = icmp sgt i64 %398, %342, !dbg !86 + %401 = shl nuw nsw i32 %174, 10, !dbg !87 + %402 = or disjoint i32 %177, %401, !dbg !87 + %403 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %402, !dbg !87 + %404 = select i1 %397, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %403, ptr addrspace(1) %382, i32 %404) #3, !dbg !87 + %405 = or disjoint i32 %402, 2048, !dbg !87 + %406 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %406, ptr addrspace(1) %383, i32 %404) #3, !dbg !87 + %407 = or disjoint i32 %402, 4096, !dbg !87 + %408 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %408, ptr addrspace(1) %384, i32 %404) #3, !dbg !87 + %409 = or disjoint i32 %402, 6144, !dbg !87 + %410 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %410, ptr addrspace(1) %385, i32 %404) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %411 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %411, ptr addrspace(1) %390, i32 %404) #3, !dbg !87 + %412 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %412, ptr addrspace(1) %391, i32 %404) #3, !dbg !87 + %413 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %413, ptr addrspace(1) %392, i32 %404) #3, !dbg !87 + %414 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %414, ptr addrspace(1) %393, i32 %404) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %415 = icmp sgt i32 %394, 1, !dbg !84 + %416 = getelementptr i8, ptr addrspace(1) %382, i64 16384, !dbg !88 + %417 = getelementptr i8, ptr addrspace(1) %383, i64 16384, !dbg !88 + %418 = getelementptr i8, ptr addrspace(1) %384, i64 16384, !dbg !88 + %419 = getelementptr i8, ptr addrspace(1) %385, i64 16384, !dbg !88 + %420 = getelementptr i8, ptr addrspace(1) %390, i64 16384, !dbg !89 + %421 = getelementptr i8, ptr addrspace(1) %391, i64 16384, !dbg !89 + %422 = getelementptr i8, ptr addrspace(1) %392, i64 16384, !dbg !89 + %423 = getelementptr i8, ptr addrspace(1) %393, i64 16384, !dbg !89 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %424 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %402, !dbg !87 + %425 = select i1 %415, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %424, ptr addrspace(1) %416, i32 %425) #3, !dbg !87 + %426 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %417, i32 %425) #3, !dbg !87 + %427 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %427, ptr addrspace(1) %418, i32 %425) #3, !dbg !87 + %428 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %428, ptr addrspace(1) %419, i32 %425) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %429 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %429, ptr addrspace(1) %420, i32 %425) #3, !dbg !87 + %430 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %430, ptr addrspace(1) %421, i32 %425) #3, !dbg !87 + %431 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %431, ptr addrspace(1) %422, i32 %425) #3, !dbg !87 + %432 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %423, i32 %425) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !90 + br i1 %397, label %.lr.ph1846, label %._crit_edge1847, !dbg !84 + +.lr.ph1846: ; preds = %53 + %433 = tail call i32 @llvm.umin.i32(i32 %394, i32 32), !dbg !91 + %434 = shl nuw nsw i32 %365, 1, !dbg !74 + %435 = or disjoint i32 %361, %434, !dbg !74 + %436 = insertelement <8 x i32> poison, i32 %435, i64 0, !dbg !75 + %437 = add nsw i32 %433, -2 + %438 = add nsw i32 %433, -1 + %439 = shufflevector <8 x i32> %436, <8 x i32> poison, <16 x i32> + %440 = insertelement <16 x i32> %439, i32 %435, i64 8 + %441 = insertelement <16 x i32> %440, i32 %435, i64 9 + %442 = insertelement <16 x i32> %441, i32 %435, i64 10 + %443 = insertelement <16 x i32> %442, i32 %435, i64 11 + %444 = insertelement <16 x i32> %443, i32 %435, i64 12 + %445 = insertelement <16 x i32> %444, i32 %435, i64 13 + %446 = insertelement <16 x i32> %445, i32 %435, i64 14 + %447 = or disjoint <16 x i32> %446, + %448 = insertelement <16 x i32> %447, i32 %435, i64 15 + %449 = insertelement <8 x i64> poison, i64 %398, i64 0, !dbg !92 + %450 = shufflevector <8 x i64> %449, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !92 + br label %451, !dbg !84 + +451: ; preds = %.lr.ph1846, %__nv_exp2f.exit1507 + %452 = phi i32 [ 64, %.lr.ph1846 ], [ %2289, %__nv_exp2f.exit1507 ] + %453 = phi i32 [ -1, %.lr.ph1846 ], [ %526, %__nv_exp2f.exit1507 ] + %454 = phi i32 [ 1, %.lr.ph1846 ], [ %2302, %__nv_exp2f.exit1507 ] + %.pn9181828 = phi ptr addrspace(1) [ %423, %.lr.ph1846 ], [ %2299, %__nv_exp2f.exit1507 ] + %.pn9341827 = phi ptr addrspace(1) [ %422, %.lr.ph1846 ], [ %2298, %__nv_exp2f.exit1507 ] + %.pn9501826 = phi ptr addrspace(1) [ %421, %.lr.ph1846 ], [ %2297, %__nv_exp2f.exit1507 ] + %.pn9661825 = phi ptr addrspace(1) [ %420, %.lr.ph1846 ], [ %2296, %__nv_exp2f.exit1507 ] + %.pn8541824 = phi ptr addrspace(1) [ %419, %.lr.ph1846 ], [ %2295, %__nv_exp2f.exit1507 ] + %.pn8701823 = phi ptr addrspace(1) [ %418, %.lr.ph1846 ], [ %2294, %__nv_exp2f.exit1507 ] + %.pn8861822 = phi ptr addrspace(1) [ %417, %.lr.ph1846 ], [ %2293, %__nv_exp2f.exit1507 ] + %.pn9021821 = phi ptr addrspace(1) [ %416, %.lr.ph1846 ], [ %2292, %__nv_exp2f.exit1507 ] + %455 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2203, %__nv_exp2f.exit1507 ] + %456 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2204, %__nv_exp2f.exit1507 ] + %457 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2205, %__nv_exp2f.exit1507 ] + %458 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2206, %__nv_exp2f.exit1507 ] + %459 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2207, %__nv_exp2f.exit1507 ] + %460 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2208, %__nv_exp2f.exit1507 ] + %461 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2209, %__nv_exp2f.exit1507 ] + %462 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2210, %__nv_exp2f.exit1507 ] + %463 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2211, %__nv_exp2f.exit1507 ] + %464 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2212, %__nv_exp2f.exit1507 ] + %465 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2213, %__nv_exp2f.exit1507 ] + %466 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2214, %__nv_exp2f.exit1507 ] + %467 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2215, %__nv_exp2f.exit1507 ] + %468 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2216, %__nv_exp2f.exit1507 ] + %469 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2217, %__nv_exp2f.exit1507 ] + %470 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2218, %__nv_exp2f.exit1507 ] + %471 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2219, %__nv_exp2f.exit1507 ] + %472 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2220, %__nv_exp2f.exit1507 ] + %473 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2221, %__nv_exp2f.exit1507 ] + %474 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2222, %__nv_exp2f.exit1507 ] + %475 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2223, %__nv_exp2f.exit1507 ] + %476 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2224, %__nv_exp2f.exit1507 ] + %477 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2225, %__nv_exp2f.exit1507 ] + %478 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2226, %__nv_exp2f.exit1507 ] + %479 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2227, %__nv_exp2f.exit1507 ] + %480 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2228, %__nv_exp2f.exit1507 ] + %481 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2229, %__nv_exp2f.exit1507 ] + %482 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2230, %__nv_exp2f.exit1507 ] + %483 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2231, %__nv_exp2f.exit1507 ] + %484 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2232, %__nv_exp2f.exit1507 ] + %485 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2233, %__nv_exp2f.exit1507 ] + %486 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2234, %__nv_exp2f.exit1507 ] + %487 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2235, %__nv_exp2f.exit1507 ] + %488 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2236, %__nv_exp2f.exit1507 ] + %489 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2237, %__nv_exp2f.exit1507 ] + %490 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2238, %__nv_exp2f.exit1507 ] + %491 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2239, %__nv_exp2f.exit1507 ] + %492 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2240, %__nv_exp2f.exit1507 ] + %493 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2241, %__nv_exp2f.exit1507 ] + %494 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2242, %__nv_exp2f.exit1507 ] + %495 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2243, %__nv_exp2f.exit1507 ] + %496 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2244, %__nv_exp2f.exit1507 ] + %497 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2245, %__nv_exp2f.exit1507 ] + %498 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2246, %__nv_exp2f.exit1507 ] + %499 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2247, %__nv_exp2f.exit1507 ] + %500 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2248, %__nv_exp2f.exit1507 ] + %501 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2249, %__nv_exp2f.exit1507 ] + %502 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2250, %__nv_exp2f.exit1507 ] + %503 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2251, %__nv_exp2f.exit1507 ] + %504 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2252, %__nv_exp2f.exit1507 ] + %505 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2253, %__nv_exp2f.exit1507 ] + %506 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2254, %__nv_exp2f.exit1507 ] + %507 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2255, %__nv_exp2f.exit1507 ] + %508 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2256, %__nv_exp2f.exit1507 ] + %509 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2257, %__nv_exp2f.exit1507 ] + %510 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2258, %__nv_exp2f.exit1507 ] + %511 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2259, %__nv_exp2f.exit1507 ] + %512 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2260, %__nv_exp2f.exit1507 ] + %513 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2261, %__nv_exp2f.exit1507 ] + %514 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2262, %__nv_exp2f.exit1507 ] + %515 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2263, %__nv_exp2f.exit1507 ] + %516 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2264, %__nv_exp2f.exit1507 ] + %517 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2265, %__nv_exp2f.exit1507 ] + %518 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2266, %__nv_exp2f.exit1507 ] + %519 = phi i32 [ 0, %.lr.ph1846 ], [ %2270, %__nv_exp2f.exit1507 ] + %520 = phi <16 x i32> [ %448, %.lr.ph1846 ], [ %2269, %__nv_exp2f.exit1507 ] + %521 = shufflevector <16 x i32> %520, <16 x i32> poison, <32 x i32> + %522 = icmp slt i32 %519, %437, !dbg !84 + %523 = icmp slt i32 %519, %438, !dbg !84 + %524 = add i32 %453, 1, !dbg !84 + %525 = icmp sgt i32 %524, 2, !dbg !84 + %526 = select i1 %525, i32 0, i32 %524, !dbg !84 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !87 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %527 = shl i32 %526, 13, !dbg !87 + %528 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %527, !dbg !87 + %529 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !90 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !90 + %530 = shl i32 %529, 11, !dbg !90 + %531 = and i32 %530, 8192, !dbg !90 + %532 = add i32 %531, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %533 = lshr exact i32 %532, 4, !dbg !90 + %534 = and i32 %533, 16383, !dbg !90 + %535 = zext nneg i32 %534 to i64, !dbg !90 + %536 = or disjoint i64 %535, 4611686293372403712, !dbg !90 + %537 = ptrtoint ptr addrspace(3) %528 to i32, !dbg !90 + %538 = lshr exact i32 %537, 4, !dbg !90 + %539 = and i32 %538, 16383, !dbg !90 + %540 = zext nneg i32 %539 to i64, !dbg !90 + %541 = or disjoint i64 %540, 4611686293338849280, !dbg !90 + %542 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %536, i64 %541) #3, !dbg !90 + %543 = or disjoint i32 %531, 32, !dbg !90 + %544 = add i32 %543, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %545 = lshr exact i32 %544, 4, !dbg !90 + %546 = and i32 %545, 16383, !dbg !90 + %547 = zext nneg i32 %546 to i64, !dbg !90 + %548 = or disjoint i64 %547, 4611686293372403712, !dbg !90 + %549 = add i32 %537, 32, !dbg !90 + %550 = lshr exact i32 %549, 4, !dbg !90 + %551 = and i32 %550, 16383, !dbg !90 + %552 = zext nneg i32 %551 to i64, !dbg !90 + %553 = or disjoint i64 %552, 4611686293338849280, !dbg !90 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 0, !dbg !90 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 1, !dbg !90 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 2, !dbg !90 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 3, !dbg !90 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 4, !dbg !90 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 5, !dbg !90 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 6, !dbg !90 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 7, !dbg !90 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 8, !dbg !90 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 9, !dbg !90 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 10, !dbg !90 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 11, !dbg !90 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 12, !dbg !90 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 13, !dbg !90 + %568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 14, !dbg !90 + %569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 15, !dbg !90 + %570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 16, !dbg !90 + %571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 17, !dbg !90 + %572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 18, !dbg !90 + %573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 19, !dbg !90 + %574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 20, !dbg !90 + %575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 21, !dbg !90 + %576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 22, !dbg !90 + %577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 23, !dbg !90 + %578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 24, !dbg !90 + %579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 25, !dbg !90 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 26, !dbg !90 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 27, !dbg !90 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 28, !dbg !90 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 29, !dbg !90 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 30, !dbg !90 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 31, !dbg !90 + %586 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, float %573, float %574, float %575, float %576, float %577, float %578, float %579, float %580, float %581, float %582, float %583, float %584, float %585, i64 %548, i64 %553, i1 true) #3, !dbg !90 + %587 = or disjoint i32 %531, 64, !dbg !90 + %588 = add i32 %587, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %589 = lshr exact i32 %588, 4, !dbg !90 + %590 = and i32 %589, 16383, !dbg !90 + %591 = zext nneg i32 %590 to i64, !dbg !90 + %592 = or disjoint i64 %591, 4611686293372403712, !dbg !90 + %593 = add i32 %537, 64, !dbg !90 + %594 = lshr exact i32 %593, 4, !dbg !90 + %595 = and i32 %594, 16383, !dbg !90 + %596 = zext nneg i32 %595 to i64, !dbg !90 + %597 = or disjoint i64 %596, 4611686293338849280, !dbg !90 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 0, !dbg !90 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 1, !dbg !90 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 2, !dbg !90 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 3, !dbg !90 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 4, !dbg !90 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 5, !dbg !90 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 6, !dbg !90 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 7, !dbg !90 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 8, !dbg !90 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 9, !dbg !90 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 10, !dbg !90 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 11, !dbg !90 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 12, !dbg !90 + %611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 13, !dbg !90 + %612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 14, !dbg !90 + %613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 15, !dbg !90 + %614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 16, !dbg !90 + %615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 17, !dbg !90 + %616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 18, !dbg !90 + %617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 19, !dbg !90 + %618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 20, !dbg !90 + %619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 21, !dbg !90 + %620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 22, !dbg !90 + %621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 23, !dbg !90 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 24, !dbg !90 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 25, !dbg !90 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 26, !dbg !90 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 27, !dbg !90 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 28, !dbg !90 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 29, !dbg !90 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 30, !dbg !90 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 31, !dbg !90 + %630 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, float %611, float %612, float %613, float %614, float %615, float %616, float %617, float %618, float %619, float %620, float %621, float %622, float %623, float %624, float %625, float %626, float %627, float %628, float %629, i64 %592, i64 %597, i1 true) #3, !dbg !90 + %631 = or disjoint i32 %531, 96, !dbg !90 + %632 = add i32 %631, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %633 = lshr exact i32 %632, 4, !dbg !90 + %634 = and i32 %633, 16383, !dbg !90 + %635 = zext nneg i32 %634 to i64, !dbg !90 + %636 = or disjoint i64 %635, 4611686293372403712, !dbg !90 + %637 = add i32 %537, 96, !dbg !90 + %638 = lshr exact i32 %637, 4, !dbg !90 + %639 = and i32 %638, 16383, !dbg !90 + %640 = zext nneg i32 %639 to i64, !dbg !90 + %641 = or disjoint i64 %640, 4611686293338849280, !dbg !90 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 0, !dbg !90 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 1, !dbg !90 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 2, !dbg !90 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 3, !dbg !90 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 4, !dbg !90 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 5, !dbg !90 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 6, !dbg !90 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 7, !dbg !90 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 8, !dbg !90 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 9, !dbg !90 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 10, !dbg !90 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 11, !dbg !90 + %654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 12, !dbg !90 + %655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 13, !dbg !90 + %656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 14, !dbg !90 + %657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 15, !dbg !90 + %658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 16, !dbg !90 + %659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 17, !dbg !90 + %660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 18, !dbg !90 + %661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 19, !dbg !90 + %662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 20, !dbg !90 + %663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 21, !dbg !90 + %664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 22, !dbg !90 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 23, !dbg !90 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 24, !dbg !90 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 25, !dbg !90 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 26, !dbg !90 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 27, !dbg !90 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 28, !dbg !90 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 29, !dbg !90 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 30, !dbg !90 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 31, !dbg !90 + %674 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, float %654, float %655, float %656, float %657, float %658, float %659, float %660, float %661, float %662, float %663, float %664, float %665, float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, i64 %636, i64 %641, i1 true) #3, !dbg !90 + %675 = or disjoint i32 %531, 16384, !dbg !90 + %676 = add i32 %675, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %677 = lshr exact i32 %676, 4, !dbg !90 + %678 = and i32 %677, 16383, !dbg !90 + %679 = zext nneg i32 %678 to i64, !dbg !90 + %680 = or disjoint i64 %679, 4611686293372403712, !dbg !90 + %681 = add i32 %537, 8192, !dbg !90 + %682 = lshr exact i32 %681, 4, !dbg !90 + %683 = and i32 %682, 16383, !dbg !90 + %684 = zext nneg i32 %683 to i64, !dbg !90 + %685 = or disjoint i64 %684, 4611686293338849280, !dbg !90 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 0, !dbg !90 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 1, !dbg !90 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 2, !dbg !90 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 3, !dbg !90 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 4, !dbg !90 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 5, !dbg !90 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 6, !dbg !90 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 7, !dbg !90 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 8, !dbg !90 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 9, !dbg !90 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 10, !dbg !90 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 11, !dbg !90 + %698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 12, !dbg !90 + %699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 13, !dbg !90 + %700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 14, !dbg !90 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 15, !dbg !90 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 16, !dbg !90 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 17, !dbg !90 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 18, !dbg !90 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 19, !dbg !90 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 20, !dbg !90 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 21, !dbg !90 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 22, !dbg !90 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 23, !dbg !90 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 24, !dbg !90 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 25, !dbg !90 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 26, !dbg !90 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 27, !dbg !90 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 28, !dbg !90 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 29, !dbg !90 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 30, !dbg !90 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 31, !dbg !90 + %718 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, float %698, float %699, float %700, float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, i64 %680, i64 %685, i1 true) #3, !dbg !90 + %719 = or disjoint i32 %531, 16416, !dbg !90 + %720 = add i32 %719, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %721 = lshr exact i32 %720, 4, !dbg !90 + %722 = and i32 %721, 16383, !dbg !90 + %723 = zext nneg i32 %722 to i64, !dbg !90 + %724 = or disjoint i64 %723, 4611686293372403712, !dbg !90 + %725 = add i32 %537, 8224, !dbg !90 + %726 = lshr exact i32 %725, 4, !dbg !90 + %727 = and i32 %726, 16383, !dbg !90 + %728 = zext nneg i32 %727 to i64, !dbg !90 + %729 = or disjoint i64 %728, 4611686293338849280, !dbg !90 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 0, !dbg !90 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 1, !dbg !90 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 2, !dbg !90 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 3, !dbg !90 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 4, !dbg !90 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 5, !dbg !90 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 6, !dbg !90 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 7, !dbg !90 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 8, !dbg !90 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 9, !dbg !90 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 10, !dbg !90 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 11, !dbg !90 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 12, !dbg !90 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 13, !dbg !90 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 14, !dbg !90 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 15, !dbg !90 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 16, !dbg !90 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 17, !dbg !90 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 18, !dbg !90 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 19, !dbg !90 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 20, !dbg !90 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 21, !dbg !90 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 22, !dbg !90 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 23, !dbg !90 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 24, !dbg !90 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 25, !dbg !90 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 26, !dbg !90 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 27, !dbg !90 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 28, !dbg !90 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 29, !dbg !90 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 30, !dbg !90 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 31, !dbg !90 + %762 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, float %741, float %742, float %743, float %744, float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, i64 %724, i64 %729, i1 true) #3, !dbg !90 + %763 = or disjoint i32 %531, 16448, !dbg !90 + %764 = add i32 %763, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %765 = lshr exact i32 %764, 4, !dbg !90 + %766 = and i32 %765, 16383, !dbg !90 + %767 = zext nneg i32 %766 to i64, !dbg !90 + %768 = or disjoint i64 %767, 4611686293372403712, !dbg !90 + %769 = add i32 %537, 8256, !dbg !90 + %770 = lshr exact i32 %769, 4, !dbg !90 + %771 = and i32 %770, 16383, !dbg !90 + %772 = zext nneg i32 %771 to i64, !dbg !90 + %773 = or disjoint i64 %772, 4611686293338849280, !dbg !90 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 0, !dbg !90 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 1, !dbg !90 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 2, !dbg !90 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 3, !dbg !90 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 4, !dbg !90 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 5, !dbg !90 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 6, !dbg !90 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 7, !dbg !90 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 8, !dbg !90 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 9, !dbg !90 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 10, !dbg !90 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 11, !dbg !90 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 12, !dbg !90 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 13, !dbg !90 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 14, !dbg !90 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 15, !dbg !90 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 16, !dbg !90 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 17, !dbg !90 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 18, !dbg !90 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 19, !dbg !90 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 20, !dbg !90 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 21, !dbg !90 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 22, !dbg !90 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 23, !dbg !90 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 24, !dbg !90 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 25, !dbg !90 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 26, !dbg !90 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 27, !dbg !90 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 28, !dbg !90 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 29, !dbg !90 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 30, !dbg !90 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 31, !dbg !90 + %806 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %774, float %775, float %776, float %777, float %778, float %779, float %780, float %781, float %782, float %783, float %784, float %785, float %786, float %787, float %788, float %789, float %790, float %791, float %792, float %793, float %794, float %795, float %796, float %797, float %798, float %799, float %800, float %801, float %802, float %803, float %804, float %805, i64 %768, i64 %773, i1 true) #3, !dbg !90 + %807 = or disjoint i32 %531, 16480, !dbg !90 + %808 = add i32 %807, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %809 = lshr exact i32 %808, 4, !dbg !90 + %810 = and i32 %809, 16383, !dbg !90 + %811 = zext nneg i32 %810 to i64, !dbg !90 + %812 = or disjoint i64 %811, 4611686293372403712, !dbg !90 + %813 = add i32 %537, 8288, !dbg !90 + %814 = lshr exact i32 %813, 4, !dbg !90 + %815 = and i32 %814, 16383, !dbg !90 + %816 = zext nneg i32 %815 to i64, !dbg !90 + %817 = or disjoint i64 %816, 4611686293338849280, !dbg !90 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 0, !dbg !90 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 1, !dbg !90 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 2, !dbg !90 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 3, !dbg !90 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 4, !dbg !90 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 5, !dbg !90 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 6, !dbg !90 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 7, !dbg !90 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 8, !dbg !90 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 9, !dbg !90 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 10, !dbg !90 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 11, !dbg !90 + %830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 12, !dbg !90 + %831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 13, !dbg !90 + %832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 14, !dbg !90 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 15, !dbg !90 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 16, !dbg !90 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 17, !dbg !90 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 18, !dbg !90 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 19, !dbg !90 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 20, !dbg !90 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 21, !dbg !90 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 22, !dbg !90 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 23, !dbg !90 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 24, !dbg !90 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 25, !dbg !90 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 26, !dbg !90 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 27, !dbg !90 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 28, !dbg !90 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 29, !dbg !90 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 30, !dbg !90 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 31, !dbg !90 + %850 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %818, float %819, float %820, float %821, float %822, float %823, float %824, float %825, float %826, float %827, float %828, float %829, float %830, float %831, float %832, float %833, float %834, float %835, float %836, float %837, float %838, float %839, float %840, float %841, float %842, float %843, float %844, float %845, float %846, float %847, float %848, float %849, i64 %812, i64 %817, i1 true) #3, !dbg !90 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 0, !dbg !90 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 1, !dbg !90 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 2, !dbg !90 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 3, !dbg !90 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 4, !dbg !90 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 5, !dbg !90 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 6, !dbg !90 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 7, !dbg !90 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 8, !dbg !90 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 9, !dbg !90 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 10, !dbg !90 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 11, !dbg !90 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 12, !dbg !90 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 13, !dbg !90 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 14, !dbg !90 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 15, !dbg !90 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 16, !dbg !90 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 17, !dbg !90 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 18, !dbg !90 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 19, !dbg !90 + %871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 20, !dbg !90 + %872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 21, !dbg !90 + %873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 22, !dbg !90 + %874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 23, !dbg !90 + %875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 24, !dbg !90 + %876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 25, !dbg !90 + %877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 26, !dbg !90 + %878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 27, !dbg !90 + %879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 28, !dbg !90 + %880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 29, !dbg !90 + %881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 30, !dbg !90 + %882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 31, !dbg !90 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !90 + %883 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %851, float %852, float %853, float %854, float %855, float %856, float %857, float %858, float %859, float %860, float %861, float %862, float %863, float %864, float %865, float %866, float %867, float %868, float %869, float %870, float %871, float %872, float %873, float %874, float %875, float %876, float %877, float %878, float %879, float %880, float %881, float %882, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %528, i32 0, i32 0) #3, !dbg !90 + %884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 0, !dbg !90 + %885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 1, !dbg !90 + %886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 2, !dbg !90 + %887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 3, !dbg !90 + %888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 4, !dbg !90 + %889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 5, !dbg !90 + %890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 6, !dbg !90 + %891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 7, !dbg !90 + %892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 8, !dbg !90 + %893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 9, !dbg !90 + %894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 10, !dbg !90 + %895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 11, !dbg !90 + %896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 12, !dbg !90 + %897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 13, !dbg !90 + %898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 14, !dbg !90 + %899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 15, !dbg !90 + %900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 16, !dbg !90 + %901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 17, !dbg !90 + %902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 18, !dbg !90 + %903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 19, !dbg !90 + %904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 20, !dbg !90 + %905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 21, !dbg !90 + %906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 22, !dbg !90 + %907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 23, !dbg !90 + %908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 24, !dbg !90 + %909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 25, !dbg !90 + %910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 26, !dbg !90 + %911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 27, !dbg !90 + %912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 28, !dbg !90 + %913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 29, !dbg !90 + %914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 30, !dbg !90 + %915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 31, !dbg !90 + %916 = fmul float %884, 0x3FB6A09E60000000, !dbg !93 + %917 = fmul float %885, 0x3FB6A09E60000000, !dbg !93 + %918 = fmul float %886, 0x3FB6A09E60000000, !dbg !93 + %919 = fmul float %887, 0x3FB6A09E60000000, !dbg !93 + %920 = fmul float %888, 0x3FB6A09E60000000, !dbg !93 + %921 = fmul float %889, 0x3FB6A09E60000000, !dbg !93 + %922 = fmul float %890, 0x3FB6A09E60000000, !dbg !93 + %923 = fmul float %891, 0x3FB6A09E60000000, !dbg !93 + %924 = fmul float %892, 0x3FB6A09E60000000, !dbg !93 + %925 = fmul float %893, 0x3FB6A09E60000000, !dbg !93 + %926 = fmul float %894, 0x3FB6A09E60000000, !dbg !93 + %927 = fmul float %895, 0x3FB6A09E60000000, !dbg !93 + %928 = fmul float %896, 0x3FB6A09E60000000, !dbg !93 + %929 = fmul float %897, 0x3FB6A09E60000000, !dbg !93 + %930 = fmul float %898, 0x3FB6A09E60000000, !dbg !93 + %931 = fmul float %899, 0x3FB6A09E60000000, !dbg !93 + %932 = fmul float %900, 0x3FB6A09E60000000, !dbg !93 + %933 = fmul float %901, 0x3FB6A09E60000000, !dbg !93 + %934 = fmul float %902, 0x3FB6A09E60000000, !dbg !93 + %935 = fmul float %903, 0x3FB6A09E60000000, !dbg !93 + %936 = fmul float %904, 0x3FB6A09E60000000, !dbg !93 + %937 = fmul float %905, 0x3FB6A09E60000000, !dbg !93 + %938 = fmul float %906, 0x3FB6A09E60000000, !dbg !93 + %939 = fmul float %907, 0x3FB6A09E60000000, !dbg !93 + %940 = fmul float %908, 0x3FB6A09E60000000, !dbg !93 + %941 = fmul float %909, 0x3FB6A09E60000000, !dbg !93 + %942 = fmul float %910, 0x3FB6A09E60000000, !dbg !93 + %943 = fmul float %911, 0x3FB6A09E60000000, !dbg !93 + %944 = fmul float %912, 0x3FB6A09E60000000, !dbg !93 + %945 = fmul float %913, 0x3FB6A09E60000000, !dbg !93 + %946 = fmul float %914, 0x3FB6A09E60000000, !dbg !93 + %947 = fmul float %915, 0x3FB6A09E60000000, !dbg !93 + %948 = extractelement <16 x i32> %520, i64 15, !dbg !94 + %949 = icmp sge i32 %338, %948, !dbg !94 + %950 = extractelement <16 x i32> %520, i64 14, !dbg !95 + %951 = icmp sge i32 %338, %950, !dbg !94 + %952 = icmp sge i32 %341, %948, !dbg !94 + %953 = icmp sge i32 %341, %950, !dbg !94 + %954 = extractelement <16 x i32> %520, i64 13, !dbg !94 + %955 = icmp sge i32 %338, %954, !dbg !94 + %956 = extractelement <16 x i32> %520, i64 12, !dbg !95 + %957 = icmp sge i32 %338, %956, !dbg !94 + %958 = icmp sge i32 %341, %954, !dbg !94 + %959 = icmp sge i32 %341, %956, !dbg !94 + %960 = extractelement <16 x i32> %520, i64 11, !dbg !94 + %961 = icmp sge i32 %338, %960, !dbg !94 + %962 = extractelement <16 x i32> %520, i64 10, !dbg !95 + %963 = icmp sge i32 %338, %962, !dbg !94 + %964 = icmp sge i32 %341, %960, !dbg !94 + %965 = icmp sge i32 %341, %962, !dbg !94 + %966 = extractelement <16 x i32> %520, i64 9, !dbg !94 + %967 = icmp sge i32 %338, %966, !dbg !94 + %968 = extractelement <16 x i32> %520, i64 8, !dbg !95 + %969 = icmp sge i32 %338, %968, !dbg !94 + %970 = icmp sge i32 %341, %966, !dbg !94 + %971 = icmp sge i32 %341, %968, !dbg !94 + %972 = extractelement <16 x i32> %520, i64 7, !dbg !94 + %973 = icmp sge i32 %338, %972, !dbg !94 + %974 = extractelement <16 x i32> %520, i64 6, !dbg !95 + %975 = icmp sge i32 %338, %974, !dbg !94 + %976 = icmp sge i32 %341, %972, !dbg !94 + %977 = icmp sge i32 %341, %974, !dbg !94 + %978 = extractelement <16 x i32> %520, i64 5, !dbg !94 + %979 = icmp sge i32 %338, %978, !dbg !94 + %980 = extractelement <16 x i32> %520, i64 4, !dbg !95 + %981 = icmp sge i32 %338, %980, !dbg !94 + %982 = icmp sge i32 %341, %978, !dbg !94 + %983 = icmp sge i32 %341, %980, !dbg !94 + %984 = extractelement <16 x i32> %520, i64 3, !dbg !94 + %985 = icmp sge i32 %338, %984, !dbg !94 + %986 = extractelement <16 x i32> %520, i64 2, !dbg !95 + %987 = icmp sge i32 %338, %986, !dbg !94 + %988 = icmp sge i32 %341, %984, !dbg !94 + %989 = icmp sge i32 %341, %986, !dbg !94 + %990 = extractelement <16 x i32> %520, i64 1, !dbg !94 + %991 = icmp sge i32 %338, %990, !dbg !94 + %992 = extractelement <16 x i32> %520, i64 0, !dbg !95 + %993 = icmp sge i32 %338, %992, !dbg !94 + %994 = icmp sge i32 %341, %990, !dbg !94 + %995 = icmp sge i32 %341, %992, !dbg !94 + %996 = and i1 %399, %949, !dbg !96 + %997 = and i1 %399, %951, !dbg !96 + %998 = and i1 %400, %952, !dbg !96 + %999 = and i1 %400, %953, !dbg !96 + %1000 = and i1 %399, %955, !dbg !96 + %1001 = and i1 %399, %957, !dbg !96 + %1002 = and i1 %400, %958, !dbg !96 + %1003 = and i1 %400, %959, !dbg !96 + %1004 = and i1 %399, %961, !dbg !96 + %1005 = and i1 %399, %963, !dbg !96 + %1006 = and i1 %400, %964, !dbg !96 + %1007 = and i1 %400, %965, !dbg !96 + %1008 = and i1 %399, %967, !dbg !96 + %1009 = and i1 %399, %969, !dbg !96 + %1010 = and i1 %400, %970, !dbg !96 + %1011 = and i1 %400, %971, !dbg !96 + %1012 = and i1 %399, %973, !dbg !96 + %1013 = and i1 %399, %975, !dbg !96 + %1014 = and i1 %400, %976, !dbg !96 + %1015 = and i1 %400, %977, !dbg !96 + %1016 = and i1 %399, %979, !dbg !96 + %1017 = and i1 %399, %981, !dbg !96 + %1018 = and i1 %400, %982, !dbg !96 + %1019 = and i1 %400, %983, !dbg !96 + %1020 = and i1 %399, %985, !dbg !96 + %1021 = and i1 %399, %987, !dbg !96 + %1022 = and i1 %400, %988, !dbg !96 + %1023 = and i1 %400, %989, !dbg !96 + %1024 = and i1 %399, %991, !dbg !96 + %1025 = and i1 %399, %993, !dbg !96 + %1026 = and i1 %400, %994, !dbg !96 + %1027 = and i1 %400, %995, !dbg !96 + %1028 = icmp sgt i32 %948, 2047, !dbg !97 + %1029 = icmp sgt i32 %954, 2047, !dbg !97 + %1030 = icmp sgt i32 %960, 2047, !dbg !97 + %1031 = icmp sgt i32 %966, 2047, !dbg !97 + %1032 = icmp sgt i32 %972, 2047, !dbg !97 + %1033 = icmp sgt i32 %978, 2047, !dbg !97 + %1034 = icmp sgt i32 %984, 2047, !dbg !97 + %1035 = icmp sgt i32 %990, 2047, !dbg !97 + %1036 = shufflevector <16 x i32> %520, <16 x i32> poison, <8 x i32> , !dbg !95 + %1037 = srem <8 x i32> %1036, splat (i32 2048), !dbg !95 + %1038 = icmp ne <8 x i32> %1037, zeroinitializer, !dbg !98 + %1039 = shufflevector <16 x i32> %520, <16 x i32> poison, <8 x i32> , !dbg !99 + %1040 = and <8 x i32> %1039, splat (i32 -2147481601), !dbg !99 + %1041 = icmp ugt <8 x i32> %1040, splat (i32 -2147483648), !dbg !99 + %1042 = trunc <8 x i32> %1039 to <8 x i16>, !dbg !100 + %1043 = and <8 x i16> %1042, splat (i16 2047), !dbg !100 + %1044 = zext nneg <8 x i16> %1043 to <8 x i64>, !dbg !92 + %1045 = icmp sgt <8 x i64> %450, %1044, !dbg !92 + %1046 = and <8 x i1> %1041, %1038, !dbg !101 + %1047 = add nsw <8 x i32> %1037, splat (i32 2048), !dbg !102 + %1048 = select <8 x i1> %1046, <8 x i32> %1047, <8 x i32> %1037, !dbg !103 + %1049 = sext <8 x i32> %1048 to <8 x i64>, !dbg !92 + %1050 = icmp sgt <8 x i64> %450, %1049, !dbg !92 + %1051 = extractelement <8 x i1> %1045, i64 7, !dbg !104 + %1052 = and i1 %1028, %1051, !dbg !104 + %1053 = extractelement <8 x i1> %1050, i64 7, !dbg !104 + %1054 = and i1 %1028, %1053, !dbg !104 + %1055 = extractelement <8 x i1> %1045, i64 6, !dbg !104 + %1056 = and i1 %1029, %1055, !dbg !104 + %1057 = extractelement <8 x i1> %1050, i64 6, !dbg !104 + %1058 = and i1 %1029, %1057, !dbg !104 + %1059 = extractelement <8 x i1> %1045, i64 5, !dbg !104 + %1060 = and i1 %1030, %1059, !dbg !104 + %1061 = extractelement <8 x i1> %1050, i64 5, !dbg !104 + %1062 = and i1 %1030, %1061, !dbg !104 + %1063 = extractelement <8 x i1> %1045, i64 4, !dbg !104 + %1064 = and i1 %1031, %1063, !dbg !104 + %1065 = extractelement <8 x i1> %1050, i64 4, !dbg !104 + %1066 = and i1 %1031, %1065, !dbg !104 + %1067 = extractelement <8 x i1> %1045, i64 3, !dbg !104 + %1068 = and i1 %1032, %1067, !dbg !104 + %1069 = extractelement <8 x i1> %1050, i64 3, !dbg !104 + %1070 = and i1 %1032, %1069, !dbg !104 + %1071 = extractelement <8 x i1> %1045, i64 2, !dbg !104 + %1072 = and i1 %1033, %1071, !dbg !104 + %1073 = extractelement <8 x i1> %1050, i64 2, !dbg !104 + %1074 = and i1 %1033, %1073, !dbg !104 + %1075 = extractelement <8 x i1> %1045, i64 1, !dbg !104 + %1076 = and i1 %1034, %1075, !dbg !104 + %1077 = extractelement <8 x i1> %1050, i64 1, !dbg !104 + %1078 = and i1 %1034, %1077, !dbg !104 + %1079 = extractelement <8 x i1> %1045, i64 0, !dbg !104 + %1080 = and i1 %1035, %1079, !dbg !104 + %1081 = extractelement <8 x i1> %1050, i64 0, !dbg !104 + %1082 = and i1 %1035, %1081, !dbg !104 + %1083 = sub <32 x i32> %521, %94, !dbg !105 + %1084 = and <32 x i32> %1083, splat (i32 2047), !dbg !106 + %1085 = icmp eq <32 x i32> %1084, zeroinitializer, !dbg !107 + %1086 = extractelement <32 x i1> %1085, i64 31, !dbg !108 + %1087 = and i1 %1086, %1052, !dbg !108 + %1088 = extractelement <32 x i1> %1085, i64 30, !dbg !108 + %1089 = and i1 %1088, %1054, !dbg !108 + %1090 = extractelement <32 x i1> %1085, i64 29, !dbg !108 + %1091 = and i1 %1090, %1052, !dbg !108 + %1092 = extractelement <32 x i1> %1085, i64 28, !dbg !108 + %1093 = and i1 %1092, %1054, !dbg !108 + %1094 = extractelement <32 x i1> %1085, i64 27, !dbg !108 + %1095 = and i1 %1094, %1056, !dbg !108 + %1096 = extractelement <32 x i1> %1085, i64 26, !dbg !108 + %1097 = and i1 %1096, %1058, !dbg !108 + %1098 = extractelement <32 x i1> %1085, i64 25, !dbg !108 + %1099 = and i1 %1098, %1056, !dbg !108 + %1100 = extractelement <32 x i1> %1085, i64 24, !dbg !108 + %1101 = and i1 %1100, %1058, !dbg !108 + %1102 = extractelement <32 x i1> %1085, i64 23, !dbg !108 + %1103 = and i1 %1102, %1060, !dbg !108 + %1104 = extractelement <32 x i1> %1085, i64 22, !dbg !108 + %1105 = and i1 %1104, %1062, !dbg !108 + %1106 = extractelement <32 x i1> %1085, i64 21, !dbg !108 + %1107 = and i1 %1106, %1060, !dbg !108 + %1108 = extractelement <32 x i1> %1085, i64 20, !dbg !108 + %1109 = and i1 %1108, %1062, !dbg !108 + %1110 = extractelement <32 x i1> %1085, i64 19, !dbg !108 + %1111 = and i1 %1110, %1064, !dbg !108 + %1112 = extractelement <32 x i1> %1085, i64 18, !dbg !108 + %1113 = and i1 %1112, %1066, !dbg !108 + %1114 = extractelement <32 x i1> %1085, i64 17, !dbg !108 + %1115 = and i1 %1114, %1064, !dbg !108 + %1116 = extractelement <32 x i1> %1085, i64 16, !dbg !108 + %1117 = and i1 %1116, %1066, !dbg !108 + %1118 = extractelement <32 x i1> %1085, i64 15, !dbg !108 + %1119 = and i1 %1118, %1068, !dbg !108 + %1120 = extractelement <32 x i1> %1085, i64 14, !dbg !108 + %1121 = and i1 %1120, %1070, !dbg !108 + %1122 = extractelement <32 x i1> %1085, i64 13, !dbg !108 + %1123 = and i1 %1122, %1068, !dbg !108 + %1124 = extractelement <32 x i1> %1085, i64 12, !dbg !108 + %1125 = and i1 %1124, %1070, !dbg !108 + %1126 = extractelement <32 x i1> %1085, i64 11, !dbg !108 + %1127 = and i1 %1126, %1072, !dbg !108 + %1128 = extractelement <32 x i1> %1085, i64 10, !dbg !108 + %1129 = and i1 %1128, %1074, !dbg !108 + %1130 = extractelement <32 x i1> %1085, i64 9, !dbg !108 + %1131 = and i1 %1130, %1072, !dbg !108 + %1132 = extractelement <32 x i1> %1085, i64 8, !dbg !108 + %1133 = and i1 %1132, %1074, !dbg !108 + %1134 = extractelement <32 x i1> %1085, i64 7, !dbg !108 + %1135 = and i1 %1134, %1076, !dbg !108 + %1136 = extractelement <32 x i1> %1085, i64 6, !dbg !108 + %1137 = and i1 %1136, %1078, !dbg !108 + %1138 = extractelement <32 x i1> %1085, i64 5, !dbg !108 + %1139 = and i1 %1138, %1076, !dbg !108 + %1140 = extractelement <32 x i1> %1085, i64 4, !dbg !108 + %1141 = and i1 %1140, %1078, !dbg !108 + %1142 = extractelement <32 x i1> %1085, i64 3, !dbg !108 + %1143 = and i1 %1142, %1080, !dbg !108 + %1144 = extractelement <32 x i1> %1085, i64 2, !dbg !108 + %1145 = and i1 %1144, %1082, !dbg !108 + %1146 = extractelement <32 x i1> %1085, i64 1, !dbg !108 + %1147 = and i1 %1146, %1080, !dbg !108 + %1148 = extractelement <32 x i1> %1085, i64 0, !dbg !108 + %1149 = and i1 %1148, %1082, !dbg !108 + %1150 = or i1 %996, %1087, !dbg !109 + %1151 = or i1 %997, %1089, !dbg !109 + %1152 = or i1 %998, %1091, !dbg !109 + %1153 = or i1 %999, %1093, !dbg !109 + %1154 = or i1 %1000, %1095, !dbg !109 + %1155 = or i1 %1001, %1097, !dbg !109 + %1156 = or i1 %1002, %1099, !dbg !109 + %1157 = or i1 %1003, %1101, !dbg !109 + %1158 = or i1 %1004, %1103, !dbg !109 + %1159 = or i1 %1005, %1105, !dbg !109 + %1160 = or i1 %1006, %1107, !dbg !109 + %1161 = or i1 %1007, %1109, !dbg !109 + %1162 = or i1 %1008, %1111, !dbg !109 + %1163 = or i1 %1009, %1113, !dbg !109 + %1164 = or i1 %1010, %1115, !dbg !109 + %1165 = or i1 %1011, %1117, !dbg !109 + %1166 = or i1 %1012, %1119, !dbg !109 + %1167 = or i1 %1013, %1121, !dbg !109 + %1168 = or i1 %1014, %1123, !dbg !109 + %1169 = or i1 %1015, %1125, !dbg !109 + %1170 = or i1 %1016, %1127, !dbg !109 + %1171 = or i1 %1017, %1129, !dbg !109 + %1172 = or i1 %1018, %1131, !dbg !109 + %1173 = or i1 %1019, %1133, !dbg !109 + %1174 = or i1 %1020, %1135, !dbg !109 + %1175 = or i1 %1021, %1137, !dbg !109 + %1176 = or i1 %1022, %1139, !dbg !109 + %1177 = or i1 %1023, %1141, !dbg !109 + %1178 = or i1 %1024, %1143, !dbg !109 + %1179 = or i1 %1025, %1145, !dbg !109 + %1180 = or i1 %1026, %1147, !dbg !109 + %1181 = or i1 %1027, %1149, !dbg !109 + %1182 = fmul float %916, 0x3FF7154760000000, !dbg !110 + %1183 = select i1 %1150, float %1182, float 0xFFF0000000000000, !dbg !111 + %1184 = fmul float %917, 0x3FF7154760000000, !dbg !110 + %1185 = select i1 %1151, float %1184, float 0xFFF0000000000000, !dbg !111 + %1186 = fmul float %918, 0x3FF7154760000000, !dbg !110 + %1187 = select i1 %1152, float %1186, float 0xFFF0000000000000, !dbg !111 + %1188 = fmul float %919, 0x3FF7154760000000, !dbg !110 + %1189 = select i1 %1153, float %1188, float 0xFFF0000000000000, !dbg !111 + %1190 = fmul float %920, 0x3FF7154760000000, !dbg !110 + %1191 = select i1 %1154, float %1190, float 0xFFF0000000000000, !dbg !111 + %1192 = fmul float %921, 0x3FF7154760000000, !dbg !110 + %1193 = select i1 %1155, float %1192, float 0xFFF0000000000000, !dbg !111 + %1194 = fmul float %922, 0x3FF7154760000000, !dbg !110 + %1195 = select i1 %1156, float %1194, float 0xFFF0000000000000, !dbg !111 + %1196 = fmul float %923, 0x3FF7154760000000, !dbg !110 + %1197 = select i1 %1157, float %1196, float 0xFFF0000000000000, !dbg !111 + %1198 = fmul float %924, 0x3FF7154760000000, !dbg !110 + %1199 = select i1 %1158, float %1198, float 0xFFF0000000000000, !dbg !111 + %1200 = fmul float %925, 0x3FF7154760000000, !dbg !110 + %1201 = select i1 %1159, float %1200, float 0xFFF0000000000000, !dbg !111 + %1202 = fmul float %926, 0x3FF7154760000000, !dbg !110 + %1203 = select i1 %1160, float %1202, float 0xFFF0000000000000, !dbg !111 + %1204 = fmul float %927, 0x3FF7154760000000, !dbg !110 + %1205 = select i1 %1161, float %1204, float 0xFFF0000000000000, !dbg !111 + %1206 = fmul float %928, 0x3FF7154760000000, !dbg !110 + %1207 = select i1 %1162, float %1206, float 0xFFF0000000000000, !dbg !111 + %1208 = fmul float %929, 0x3FF7154760000000, !dbg !110 + %1209 = select i1 %1163, float %1208, float 0xFFF0000000000000, !dbg !111 + %1210 = fmul float %930, 0x3FF7154760000000, !dbg !110 + %1211 = select i1 %1164, float %1210, float 0xFFF0000000000000, !dbg !111 + %1212 = fmul float %931, 0x3FF7154760000000, !dbg !110 + %1213 = select i1 %1165, float %1212, float 0xFFF0000000000000, !dbg !111 + %1214 = fmul float %932, 0x3FF7154760000000, !dbg !110 + %1215 = select i1 %1166, float %1214, float 0xFFF0000000000000, !dbg !111 + %1216 = fmul float %933, 0x3FF7154760000000, !dbg !110 + %1217 = select i1 %1167, float %1216, float 0xFFF0000000000000, !dbg !111 + %1218 = fmul float %934, 0x3FF7154760000000, !dbg !110 + %1219 = select i1 %1168, float %1218, float 0xFFF0000000000000, !dbg !111 + %1220 = fmul float %935, 0x3FF7154760000000, !dbg !110 + %1221 = select i1 %1169, float %1220, float 0xFFF0000000000000, !dbg !111 + %1222 = fmul float %936, 0x3FF7154760000000, !dbg !110 + %1223 = select i1 %1170, float %1222, float 0xFFF0000000000000, !dbg !111 + %1224 = fmul float %937, 0x3FF7154760000000, !dbg !110 + %1225 = select i1 %1171, float %1224, float 0xFFF0000000000000, !dbg !111 + %1226 = fmul float %938, 0x3FF7154760000000, !dbg !110 + %1227 = select i1 %1172, float %1226, float 0xFFF0000000000000, !dbg !111 + %1228 = fmul float %939, 0x3FF7154760000000, !dbg !110 + %1229 = select i1 %1173, float %1228, float 0xFFF0000000000000, !dbg !111 + %1230 = fmul float %940, 0x3FF7154760000000, !dbg !110 + %1231 = select i1 %1174, float %1230, float 0xFFF0000000000000, !dbg !111 + %1232 = fmul float %941, 0x3FF7154760000000, !dbg !110 + %1233 = select i1 %1175, float %1232, float 0xFFF0000000000000, !dbg !111 + %1234 = fmul float %942, 0x3FF7154760000000, !dbg !110 + %1235 = select i1 %1176, float %1234, float 0xFFF0000000000000, !dbg !111 + %1236 = fmul float %943, 0x3FF7154760000000, !dbg !110 + %1237 = select i1 %1177, float %1236, float 0xFFF0000000000000, !dbg !111 + %1238 = fmul float %944, 0x3FF7154760000000, !dbg !110 + %1239 = select i1 %1178, float %1238, float 0xFFF0000000000000, !dbg !111 + %1240 = fmul float %945, 0x3FF7154760000000, !dbg !110 + %1241 = select i1 %1179, float %1240, float 0xFFF0000000000000, !dbg !111 + %1242 = fmul float %946, 0x3FF7154760000000, !dbg !110 + %1243 = select i1 %1180, float %1242, float 0xFFF0000000000000, !dbg !111 + %1244 = fmul float %947, 0x3FF7154760000000, !dbg !110 + %1245 = select i1 %1181, float %1244, float 0xFFF0000000000000, !dbg !111 + %1246 = fsub float %1183, %356, !dbg !112 + %1247 = fsub float %1185, %356, !dbg !112 + %1248 = fsub float %1187, %357, !dbg !112 + %1249 = fsub float %1189, %357, !dbg !112 + %1250 = fsub float %1191, %356, !dbg !112 + %1251 = fsub float %1193, %356, !dbg !112 + %1252 = fsub float %1195, %357, !dbg !112 + %1253 = fsub float %1197, %357, !dbg !112 + %1254 = fsub float %1199, %356, !dbg !112 + %1255 = fsub float %1201, %356, !dbg !112 + %1256 = fsub float %1203, %357, !dbg !112 + %1257 = fsub float %1205, %357, !dbg !112 + %1258 = fsub float %1207, %356, !dbg !112 + %1259 = fsub float %1209, %356, !dbg !112 + %1260 = fsub float %1211, %357, !dbg !112 + %1261 = fsub float %1213, %357, !dbg !112 + %1262 = fsub float %1215, %356, !dbg !112 + %1263 = fsub float %1217, %356, !dbg !112 + %1264 = fsub float %1219, %357, !dbg !112 + %1265 = fsub float %1221, %357, !dbg !112 + %1266 = fsub float %1223, %356, !dbg !112 + %1267 = fsub float %1225, %356, !dbg !112 + %1268 = fsub float %1227, %357, !dbg !112 + %1269 = fsub float %1229, %357, !dbg !112 + %1270 = fsub float %1231, %356, !dbg !112 + %1271 = fsub float %1233, %356, !dbg !112 + %1272 = fsub float %1235, %357, !dbg !112 + %1273 = fsub float %1237, %357, !dbg !112 + %1274 = fsub float %1239, %356, !dbg !112 + %1275 = fsub float %1241, %356, !dbg !112 + %1276 = fsub float %1243, %357, !dbg !112 + %1277 = fsub float %1245, %357, !dbg !112 + %1278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1412 = icmp eq i32 %1278, 0, !dbg !113 + br i1 %.not.i1412, label %1281, label %1279, !dbg !113 + +1279: ; preds = %451 + %1280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1246) #3, !dbg !113 + br label %__nv_exp2f.exit1414, !dbg !113 + +1281: ; preds = %451 + %1282 = tail call float @llvm.nvvm.ex2.approx.f(float %1246) #3, !dbg !113 + br label %__nv_exp2f.exit1414, !dbg !113 + +__nv_exp2f.exit1414: ; preds = %1279, %1281 + %.0.i1413 = phi float [ %1280, %1279 ], [ %1282, %1281 ], !dbg !113 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1415 = icmp eq i32 %1283, 0, !dbg !113 + br i1 %.not.i1415, label %1286, label %1284, !dbg !113 + +1284: ; preds = %__nv_exp2f.exit1414 + %1285 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1247) #3, !dbg !113 + br label %__nv_exp2f.exit1417, !dbg !113 + +1286: ; preds = %__nv_exp2f.exit1414 + %1287 = tail call float @llvm.nvvm.ex2.approx.f(float %1247) #3, !dbg !113 + br label %__nv_exp2f.exit1417, !dbg !113 + +__nv_exp2f.exit1417: ; preds = %1284, %1286 + %.0.i1416 = phi float [ %1285, %1284 ], [ %1287, %1286 ], !dbg !113 + %1288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1418 = icmp eq i32 %1288, 0, !dbg !113 + br i1 %.not.i1418, label %1291, label %1289, !dbg !113 + +1289: ; preds = %__nv_exp2f.exit1417 + %1290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1248) #3, !dbg !113 + br label %__nv_exp2f.exit1420, !dbg !113 + +1291: ; preds = %__nv_exp2f.exit1417 + %1292 = tail call float @llvm.nvvm.ex2.approx.f(float %1248) #3, !dbg !113 + br label %__nv_exp2f.exit1420, !dbg !113 + +__nv_exp2f.exit1420: ; preds = %1289, %1291 + %.0.i1419 = phi float [ %1290, %1289 ], [ %1292, %1291 ], !dbg !113 + %1293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1421 = icmp eq i32 %1293, 0, !dbg !113 + br i1 %.not.i1421, label %1296, label %1294, !dbg !113 + +1294: ; preds = %__nv_exp2f.exit1420 + %1295 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1249) #3, !dbg !113 + br label %__nv_exp2f.exit1423, !dbg !113 + +1296: ; preds = %__nv_exp2f.exit1420 + %1297 = tail call float @llvm.nvvm.ex2.approx.f(float %1249) #3, !dbg !113 + br label %__nv_exp2f.exit1423, !dbg !113 + +__nv_exp2f.exit1423: ; preds = %1294, %1296 + %.0.i1422 = phi float [ %1295, %1294 ], [ %1297, %1296 ], !dbg !113 + %1298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1424 = icmp eq i32 %1298, 0, !dbg !113 + br i1 %.not.i1424, label %1301, label %1299, !dbg !113 + +1299: ; preds = %__nv_exp2f.exit1423 + %1300 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1250) #3, !dbg !113 + br label %__nv_exp2f.exit1426, !dbg !113 + +1301: ; preds = %__nv_exp2f.exit1423 + %1302 = tail call float @llvm.nvvm.ex2.approx.f(float %1250) #3, !dbg !113 + br label %__nv_exp2f.exit1426, !dbg !113 + +__nv_exp2f.exit1426: ; preds = %1299, %1301 + %.0.i1425 = phi float [ %1300, %1299 ], [ %1302, %1301 ], !dbg !113 + %1303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1427 = icmp eq i32 %1303, 0, !dbg !113 + br i1 %.not.i1427, label %1306, label %1304, !dbg !113 + +1304: ; preds = %__nv_exp2f.exit1426 + %1305 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1251) #3, !dbg !113 + br label %__nv_exp2f.exit1429, !dbg !113 + +1306: ; preds = %__nv_exp2f.exit1426 + %1307 = tail call float @llvm.nvvm.ex2.approx.f(float %1251) #3, !dbg !113 + br label %__nv_exp2f.exit1429, !dbg !113 + +__nv_exp2f.exit1429: ; preds = %1304, %1306 + %.0.i1428 = phi float [ %1305, %1304 ], [ %1307, %1306 ], !dbg !113 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1430 = icmp eq i32 %1308, 0, !dbg !113 + br i1 %.not.i1430, label %1311, label %1309, !dbg !113 + +1309: ; preds = %__nv_exp2f.exit1429 + %1310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1252) #3, !dbg !113 + br label %__nv_exp2f.exit1432, !dbg !113 + +1311: ; preds = %__nv_exp2f.exit1429 + %1312 = tail call float @llvm.nvvm.ex2.approx.f(float %1252) #3, !dbg !113 + br label %__nv_exp2f.exit1432, !dbg !113 + +__nv_exp2f.exit1432: ; preds = %1309, %1311 + %.0.i1431 = phi float [ %1310, %1309 ], [ %1312, %1311 ], !dbg !113 + %1313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1433 = icmp eq i32 %1313, 0, !dbg !113 + br i1 %.not.i1433, label %1316, label %1314, !dbg !113 + +1314: ; preds = %__nv_exp2f.exit1432 + %1315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1253) #3, !dbg !113 + br label %__nv_exp2f.exit1435, !dbg !113 + +1316: ; preds = %__nv_exp2f.exit1432 + %1317 = tail call float @llvm.nvvm.ex2.approx.f(float %1253) #3, !dbg !113 + br label %__nv_exp2f.exit1435, !dbg !113 + +__nv_exp2f.exit1435: ; preds = %1314, %1316 + %.0.i1434 = phi float [ %1315, %1314 ], [ %1317, %1316 ], !dbg !113 + %1318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1436 = icmp eq i32 %1318, 0, !dbg !113 + br i1 %.not.i1436, label %1321, label %1319, !dbg !113 + +1319: ; preds = %__nv_exp2f.exit1435 + %1320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1254) #3, !dbg !113 + br label %__nv_exp2f.exit1438, !dbg !113 + +1321: ; preds = %__nv_exp2f.exit1435 + %1322 = tail call float @llvm.nvvm.ex2.approx.f(float %1254) #3, !dbg !113 + br label %__nv_exp2f.exit1438, !dbg !113 + +__nv_exp2f.exit1438: ; preds = %1319, %1321 + %.0.i1437 = phi float [ %1320, %1319 ], [ %1322, %1321 ], !dbg !113 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1439 = icmp eq i32 %1323, 0, !dbg !113 + br i1 %.not.i1439, label %1326, label %1324, !dbg !113 + +1324: ; preds = %__nv_exp2f.exit1438 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1255) #3, !dbg !113 + br label %__nv_exp2f.exit1441, !dbg !113 + +1326: ; preds = %__nv_exp2f.exit1438 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1255) #3, !dbg !113 + br label %__nv_exp2f.exit1441, !dbg !113 + +__nv_exp2f.exit1441: ; preds = %1324, %1326 + %.0.i1440 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !113 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1442 = icmp eq i32 %1328, 0, !dbg !113 + br i1 %.not.i1442, label %1331, label %1329, !dbg !113 + +1329: ; preds = %__nv_exp2f.exit1441 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1256) #3, !dbg !113 + br label %__nv_exp2f.exit1444, !dbg !113 + +1331: ; preds = %__nv_exp2f.exit1441 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1256) #3, !dbg !113 + br label %__nv_exp2f.exit1444, !dbg !113 + +__nv_exp2f.exit1444: ; preds = %1329, %1331 + %.0.i1443 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !113 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1445 = icmp eq i32 %1333, 0, !dbg !113 + br i1 %.not.i1445, label %1336, label %1334, !dbg !113 + +1334: ; preds = %__nv_exp2f.exit1444 + %1335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1257) #3, !dbg !113 + br label %__nv_exp2f.exit1447, !dbg !113 + +1336: ; preds = %__nv_exp2f.exit1444 + %1337 = tail call float @llvm.nvvm.ex2.approx.f(float %1257) #3, !dbg !113 + br label %__nv_exp2f.exit1447, !dbg !113 + +__nv_exp2f.exit1447: ; preds = %1334, %1336 + %.0.i1446 = phi float [ %1335, %1334 ], [ %1337, %1336 ], !dbg !113 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1448 = icmp eq i32 %1338, 0, !dbg !113 + br i1 %.not.i1448, label %1341, label %1339, !dbg !113 + +1339: ; preds = %__nv_exp2f.exit1447 + %1340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1258) #3, !dbg !113 + br label %__nv_exp2f.exit1450, !dbg !113 + +1341: ; preds = %__nv_exp2f.exit1447 + %1342 = tail call float @llvm.nvvm.ex2.approx.f(float %1258) #3, !dbg !113 + br label %__nv_exp2f.exit1450, !dbg !113 + +__nv_exp2f.exit1450: ; preds = %1339, %1341 + %.0.i1449 = phi float [ %1340, %1339 ], [ %1342, %1341 ], !dbg !113 + %1343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1451 = icmp eq i32 %1343, 0, !dbg !113 + br i1 %.not.i1451, label %1346, label %1344, !dbg !113 + +1344: ; preds = %__nv_exp2f.exit1450 + %1345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1259) #3, !dbg !113 + br label %__nv_exp2f.exit1453, !dbg !113 + +1346: ; preds = %__nv_exp2f.exit1450 + %1347 = tail call float @llvm.nvvm.ex2.approx.f(float %1259) #3, !dbg !113 + br label %__nv_exp2f.exit1453, !dbg !113 + +__nv_exp2f.exit1453: ; preds = %1344, %1346 + %.0.i1452 = phi float [ %1345, %1344 ], [ %1347, %1346 ], !dbg !113 + %1348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1454 = icmp eq i32 %1348, 0, !dbg !113 + br i1 %.not.i1454, label %1351, label %1349, !dbg !113 + +1349: ; preds = %__nv_exp2f.exit1453 + %1350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1260) #3, !dbg !113 + br label %__nv_exp2f.exit1456, !dbg !113 + +1351: ; preds = %__nv_exp2f.exit1453 + %1352 = tail call float @llvm.nvvm.ex2.approx.f(float %1260) #3, !dbg !113 + br label %__nv_exp2f.exit1456, !dbg !113 + +__nv_exp2f.exit1456: ; preds = %1349, %1351 + %.0.i1455 = phi float [ %1350, %1349 ], [ %1352, %1351 ], !dbg !113 + %1353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1457 = icmp eq i32 %1353, 0, !dbg !113 + br i1 %.not.i1457, label %1356, label %1354, !dbg !113 + +1354: ; preds = %__nv_exp2f.exit1456 + %1355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1261) #3, !dbg !113 + br label %__nv_exp2f.exit1459, !dbg !113 + +1356: ; preds = %__nv_exp2f.exit1456 + %1357 = tail call float @llvm.nvvm.ex2.approx.f(float %1261) #3, !dbg !113 + br label %__nv_exp2f.exit1459, !dbg !113 + +__nv_exp2f.exit1459: ; preds = %1354, %1356 + %.0.i1458 = phi float [ %1355, %1354 ], [ %1357, %1356 ], !dbg !113 + %1358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1460 = icmp eq i32 %1358, 0, !dbg !113 + br i1 %.not.i1460, label %1361, label %1359, !dbg !113 + +1359: ; preds = %__nv_exp2f.exit1459 + %1360 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1262) #3, !dbg !113 + br label %__nv_exp2f.exit1462, !dbg !113 + +1361: ; preds = %__nv_exp2f.exit1459 + %1362 = tail call float @llvm.nvvm.ex2.approx.f(float %1262) #3, !dbg !113 + br label %__nv_exp2f.exit1462, !dbg !113 + +__nv_exp2f.exit1462: ; preds = %1359, %1361 + %.0.i1461 = phi float [ %1360, %1359 ], [ %1362, %1361 ], !dbg !113 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1463 = icmp eq i32 %1363, 0, !dbg !113 + br i1 %.not.i1463, label %1366, label %1364, !dbg !113 + +1364: ; preds = %__nv_exp2f.exit1462 + %1365 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1263) #3, !dbg !113 + br label %__nv_exp2f.exit1465, !dbg !113 + +1366: ; preds = %__nv_exp2f.exit1462 + %1367 = tail call float @llvm.nvvm.ex2.approx.f(float %1263) #3, !dbg !113 + br label %__nv_exp2f.exit1465, !dbg !113 + +__nv_exp2f.exit1465: ; preds = %1364, %1366 + %.0.i1464 = phi float [ %1365, %1364 ], [ %1367, %1366 ], !dbg !113 + %1368 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1466 = icmp eq i32 %1368, 0, !dbg !113 + br i1 %.not.i1466, label %1371, label %1369, !dbg !113 + +1369: ; preds = %__nv_exp2f.exit1465 + %1370 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1264) #3, !dbg !113 + br label %__nv_exp2f.exit1468, !dbg !113 + +1371: ; preds = %__nv_exp2f.exit1465 + %1372 = tail call float @llvm.nvvm.ex2.approx.f(float %1264) #3, !dbg !113 + br label %__nv_exp2f.exit1468, !dbg !113 + +__nv_exp2f.exit1468: ; preds = %1369, %1371 + %.0.i1467 = phi float [ %1370, %1369 ], [ %1372, %1371 ], !dbg !113 + %1373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1469 = icmp eq i32 %1373, 0, !dbg !113 + br i1 %.not.i1469, label %1376, label %1374, !dbg !113 + +1374: ; preds = %__nv_exp2f.exit1468 + %1375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1265) #3, !dbg !113 + br label %__nv_exp2f.exit1471, !dbg !113 + +1376: ; preds = %__nv_exp2f.exit1468 + %1377 = tail call float @llvm.nvvm.ex2.approx.f(float %1265) #3, !dbg !113 + br label %__nv_exp2f.exit1471, !dbg !113 + +__nv_exp2f.exit1471: ; preds = %1374, %1376 + %.0.i1470 = phi float [ %1375, %1374 ], [ %1377, %1376 ], !dbg !113 + %1378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1472 = icmp eq i32 %1378, 0, !dbg !113 + br i1 %.not.i1472, label %1381, label %1379, !dbg !113 + +1379: ; preds = %__nv_exp2f.exit1471 + %1380 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1266) #3, !dbg !113 + br label %__nv_exp2f.exit1474, !dbg !113 + +1381: ; preds = %__nv_exp2f.exit1471 + %1382 = tail call float @llvm.nvvm.ex2.approx.f(float %1266) #3, !dbg !113 + br label %__nv_exp2f.exit1474, !dbg !113 + +__nv_exp2f.exit1474: ; preds = %1379, %1381 + %.0.i1473 = phi float [ %1380, %1379 ], [ %1382, %1381 ], !dbg !113 + %1383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1475 = icmp eq i32 %1383, 0, !dbg !113 + br i1 %.not.i1475, label %1386, label %1384, !dbg !113 + +1384: ; preds = %__nv_exp2f.exit1474 + %1385 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1267) #3, !dbg !113 + br label %__nv_exp2f.exit1477, !dbg !113 + +1386: ; preds = %__nv_exp2f.exit1474 + %1387 = tail call float @llvm.nvvm.ex2.approx.f(float %1267) #3, !dbg !113 + br label %__nv_exp2f.exit1477, !dbg !113 + +__nv_exp2f.exit1477: ; preds = %1384, %1386 + %.0.i1476 = phi float [ %1385, %1384 ], [ %1387, %1386 ], !dbg !113 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1478 = icmp eq i32 %1388, 0, !dbg !113 + br i1 %.not.i1478, label %1391, label %1389, !dbg !113 + +1389: ; preds = %__nv_exp2f.exit1477 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1268) #3, !dbg !113 + br label %__nv_exp2f.exit1480, !dbg !113 + +1391: ; preds = %__nv_exp2f.exit1477 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1268) #3, !dbg !113 + br label %__nv_exp2f.exit1480, !dbg !113 + +__nv_exp2f.exit1480: ; preds = %1389, %1391 + %.0.i1479 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !113 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1481 = icmp eq i32 %1393, 0, !dbg !113 + br i1 %.not.i1481, label %1396, label %1394, !dbg !113 + +1394: ; preds = %__nv_exp2f.exit1480 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1269) #3, !dbg !113 + br label %__nv_exp2f.exit1483, !dbg !113 + +1396: ; preds = %__nv_exp2f.exit1480 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1269) #3, !dbg !113 + br label %__nv_exp2f.exit1483, !dbg !113 + +__nv_exp2f.exit1483: ; preds = %1394, %1396 + %.0.i1482 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !113 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1484 = icmp eq i32 %1398, 0, !dbg !113 + br i1 %.not.i1484, label %1401, label %1399, !dbg !113 + +1399: ; preds = %__nv_exp2f.exit1483 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1270) #3, !dbg !113 + br label %__nv_exp2f.exit1486, !dbg !113 + +1401: ; preds = %__nv_exp2f.exit1483 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1270) #3, !dbg !113 + br label %__nv_exp2f.exit1486, !dbg !113 + +__nv_exp2f.exit1486: ; preds = %1399, %1401 + %.0.i1485 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !113 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1487 = icmp eq i32 %1403, 0, !dbg !113 + br i1 %.not.i1487, label %1406, label %1404, !dbg !113 + +1404: ; preds = %__nv_exp2f.exit1486 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1271) #3, !dbg !113 + br label %__nv_exp2f.exit1489, !dbg !113 + +1406: ; preds = %__nv_exp2f.exit1486 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1271) #3, !dbg !113 + br label %__nv_exp2f.exit1489, !dbg !113 + +__nv_exp2f.exit1489: ; preds = %1404, %1406 + %.0.i1488 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !113 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1490 = icmp eq i32 %1408, 0, !dbg !113 + br i1 %.not.i1490, label %1411, label %1409, !dbg !113 + +1409: ; preds = %__nv_exp2f.exit1489 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1272) #3, !dbg !113 + br label %__nv_exp2f.exit1492, !dbg !113 + +1411: ; preds = %__nv_exp2f.exit1489 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1272) #3, !dbg !113 + br label %__nv_exp2f.exit1492, !dbg !113 + +__nv_exp2f.exit1492: ; preds = %1409, %1411 + %.0.i1491 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !113 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1493 = icmp eq i32 %1413, 0, !dbg !113 + br i1 %.not.i1493, label %1416, label %1414, !dbg !113 + +1414: ; preds = %__nv_exp2f.exit1492 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1273) #3, !dbg !113 + br label %__nv_exp2f.exit1495, !dbg !113 + +1416: ; preds = %__nv_exp2f.exit1492 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1273) #3, !dbg !113 + br label %__nv_exp2f.exit1495, !dbg !113 + +__nv_exp2f.exit1495: ; preds = %1414, %1416 + %.0.i1494 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !113 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1496 = icmp eq i32 %1418, 0, !dbg !113 + br i1 %.not.i1496, label %1421, label %1419, !dbg !113 + +1419: ; preds = %__nv_exp2f.exit1495 + %1420 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1274) #3, !dbg !113 + br label %__nv_exp2f.exit1498, !dbg !113 + +1421: ; preds = %__nv_exp2f.exit1495 + %1422 = tail call float @llvm.nvvm.ex2.approx.f(float %1274) #3, !dbg !113 + br label %__nv_exp2f.exit1498, !dbg !113 + +__nv_exp2f.exit1498: ; preds = %1419, %1421 + %.0.i1497 = phi float [ %1420, %1419 ], [ %1422, %1421 ], !dbg !113 + %1423 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1499 = icmp eq i32 %1423, 0, !dbg !113 + br i1 %.not.i1499, label %1426, label %1424, !dbg !113 + +1424: ; preds = %__nv_exp2f.exit1498 + %1425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1275) #3, !dbg !113 + br label %__nv_exp2f.exit1501, !dbg !113 + +1426: ; preds = %__nv_exp2f.exit1498 + %1427 = tail call float @llvm.nvvm.ex2.approx.f(float %1275) #3, !dbg !113 + br label %__nv_exp2f.exit1501, !dbg !113 + +__nv_exp2f.exit1501: ; preds = %1424, %1426 + %.0.i1500 = phi float [ %1425, %1424 ], [ %1427, %1426 ], !dbg !113 + %1428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1502 = icmp eq i32 %1428, 0, !dbg !113 + br i1 %.not.i1502, label %1431, label %1429, !dbg !113 + +1429: ; preds = %__nv_exp2f.exit1501 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1276) #3, !dbg !113 + br label %__nv_exp2f.exit1504, !dbg !113 + +1431: ; preds = %__nv_exp2f.exit1501 + %1432 = tail call float @llvm.nvvm.ex2.approx.f(float %1276) #3, !dbg !113 + br label %__nv_exp2f.exit1504, !dbg !113 + +__nv_exp2f.exit1504: ; preds = %1429, %1431 + %.0.i1503 = phi float [ %1430, %1429 ], [ %1432, %1431 ], !dbg !113 + %1433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1505 = icmp eq i32 %1433, 0, !dbg !113 + br i1 %.not.i1505, label %1436, label %1434, !dbg !113 + +1434: ; preds = %__nv_exp2f.exit1504 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1277) #3, !dbg !113 + br label %__nv_exp2f.exit1507, !dbg !113 + +1436: ; preds = %__nv_exp2f.exit1504 + %1437 = tail call float @llvm.nvvm.ex2.approx.f(float %1277) #3, !dbg !113 + br label %__nv_exp2f.exit1507, !dbg !113 + +__nv_exp2f.exit1507: ; preds = %1434, %1436 + %.0.i1506 = phi float [ %1435, %1434 ], [ %1437, %1436 ], !dbg !113 + %1438 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %527, !dbg !87 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !114 + %1439 = add i32 %531, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1440 = lshr exact i32 %1439, 4, !dbg !114 + %1441 = and i32 %1440, 16383, !dbg !114 + %1442 = zext nneg i32 %1441 to i64, !dbg !114 + %1443 = or disjoint i64 %1442, 4611686293372403712, !dbg !114 + %1444 = ptrtoint ptr addrspace(3) %1438 to i32, !dbg !114 + %1445 = lshr exact i32 %1444, 4, !dbg !114 + %1446 = and i32 %1445, 16383, !dbg !114 + %1447 = zext nneg i32 %1446 to i64, !dbg !114 + %1448 = or disjoint i64 %1447, 4611686293338849280, !dbg !114 + %1449 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %1443, i64 %1448) #3, !dbg !114 + %1450 = add i32 %543, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1451 = lshr exact i32 %1450, 4, !dbg !114 + %1452 = and i32 %1451, 16383, !dbg !114 + %1453 = zext nneg i32 %1452 to i64, !dbg !114 + %1454 = or disjoint i64 %1453, 4611686293372403712, !dbg !114 + %1455 = add i32 %1444, 32, !dbg !114 + %1456 = lshr exact i32 %1455, 4, !dbg !114 + %1457 = and i32 %1456, 16383, !dbg !114 + %1458 = zext nneg i32 %1457 to i64, !dbg !114 + %1459 = or disjoint i64 %1458, 4611686293338849280, !dbg !114 + %1460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 0, !dbg !114 + %1461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 1, !dbg !114 + %1462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 2, !dbg !114 + %1463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 3, !dbg !114 + %1464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 4, !dbg !114 + %1465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 5, !dbg !114 + %1466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 6, !dbg !114 + %1467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 7, !dbg !114 + %1468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 8, !dbg !114 + %1469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 9, !dbg !114 + %1470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 10, !dbg !114 + %1471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 11, !dbg !114 + %1472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 12, !dbg !114 + %1473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 13, !dbg !114 + %1474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 14, !dbg !114 + %1475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 15, !dbg !114 + %1476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 16, !dbg !114 + %1477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 17, !dbg !114 + %1478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 18, !dbg !114 + %1479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 19, !dbg !114 + %1480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 20, !dbg !114 + %1481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 21, !dbg !114 + %1482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 22, !dbg !114 + %1483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 23, !dbg !114 + %1484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 24, !dbg !114 + %1485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 25, !dbg !114 + %1486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 26, !dbg !114 + %1487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 27, !dbg !114 + %1488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 28, !dbg !114 + %1489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 29, !dbg !114 + %1490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 30, !dbg !114 + %1491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 31, !dbg !114 + %1492 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1460, float %1461, float %1462, float %1463, float %1464, float %1465, float %1466, float %1467, float %1468, float %1469, float %1470, float %1471, float %1472, float %1473, float %1474, float %1475, float %1476, float %1477, float %1478, float %1479, float %1480, float %1481, float %1482, float %1483, float %1484, float %1485, float %1486, float %1487, float %1488, float %1489, float %1490, float %1491, i64 %1454, i64 %1459, i1 true) #3, !dbg !114 + %1493 = add i32 %587, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1494 = lshr exact i32 %1493, 4, !dbg !114 + %1495 = and i32 %1494, 16383, !dbg !114 + %1496 = zext nneg i32 %1495 to i64, !dbg !114 + %1497 = or disjoint i64 %1496, 4611686293372403712, !dbg !114 + %1498 = add i32 %1444, 64, !dbg !114 + %1499 = lshr exact i32 %1498, 4, !dbg !114 + %1500 = and i32 %1499, 16383, !dbg !114 + %1501 = zext nneg i32 %1500 to i64, !dbg !114 + %1502 = or disjoint i64 %1501, 4611686293338849280, !dbg !114 + %1503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 0, !dbg !114 + %1504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 1, !dbg !114 + %1505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 2, !dbg !114 + %1506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 3, !dbg !114 + %1507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 4, !dbg !114 + %1508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 5, !dbg !114 + %1509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 6, !dbg !114 + %1510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 7, !dbg !114 + %1511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 8, !dbg !114 + %1512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 9, !dbg !114 + %1513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 10, !dbg !114 + %1514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 11, !dbg !114 + %1515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 12, !dbg !114 + %1516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 13, !dbg !114 + %1517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 14, !dbg !114 + %1518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 15, !dbg !114 + %1519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 16, !dbg !114 + %1520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 17, !dbg !114 + %1521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 18, !dbg !114 + %1522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 19, !dbg !114 + %1523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 20, !dbg !114 + %1524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 21, !dbg !114 + %1525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 22, !dbg !114 + %1526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 23, !dbg !114 + %1527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 24, !dbg !114 + %1528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 25, !dbg !114 + %1529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 26, !dbg !114 + %1530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 27, !dbg !114 + %1531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 28, !dbg !114 + %1532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 29, !dbg !114 + %1533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 30, !dbg !114 + %1534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 31, !dbg !114 + %1535 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1503, float %1504, float %1505, float %1506, float %1507, float %1508, float %1509, float %1510, float %1511, float %1512, float %1513, float %1514, float %1515, float %1516, float %1517, float %1518, float %1519, float %1520, float %1521, float %1522, float %1523, float %1524, float %1525, float %1526, float %1527, float %1528, float %1529, float %1530, float %1531, float %1532, float %1533, float %1534, i64 %1497, i64 %1502, i1 true) #3, !dbg !114 + %1536 = add i32 %631, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1537 = lshr exact i32 %1536, 4, !dbg !114 + %1538 = and i32 %1537, 16383, !dbg !114 + %1539 = zext nneg i32 %1538 to i64, !dbg !114 + %1540 = or disjoint i64 %1539, 4611686293372403712, !dbg !114 + %1541 = add i32 %1444, 96, !dbg !114 + %1542 = lshr exact i32 %1541, 4, !dbg !114 + %1543 = and i32 %1542, 16383, !dbg !114 + %1544 = zext nneg i32 %1543 to i64, !dbg !114 + %1545 = or disjoint i64 %1544, 4611686293338849280, !dbg !114 + %1546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 0, !dbg !114 + %1547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 1, !dbg !114 + %1548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 2, !dbg !114 + %1549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 3, !dbg !114 + %1550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 4, !dbg !114 + %1551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 5, !dbg !114 + %1552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 6, !dbg !114 + %1553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 7, !dbg !114 + %1554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 8, !dbg !114 + %1555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 9, !dbg !114 + %1556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 10, !dbg !114 + %1557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 11, !dbg !114 + %1558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 12, !dbg !114 + %1559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 13, !dbg !114 + %1560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 14, !dbg !114 + %1561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 15, !dbg !114 + %1562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 16, !dbg !114 + %1563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 17, !dbg !114 + %1564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 18, !dbg !114 + %1565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 19, !dbg !114 + %1566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 20, !dbg !114 + %1567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 21, !dbg !114 + %1568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 22, !dbg !114 + %1569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 23, !dbg !114 + %1570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 24, !dbg !114 + %1571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 25, !dbg !114 + %1572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 26, !dbg !114 + %1573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 27, !dbg !114 + %1574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 28, !dbg !114 + %1575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 29, !dbg !114 + %1576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 30, !dbg !114 + %1577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 31, !dbg !114 + %1578 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1546, float %1547, float %1548, float %1549, float %1550, float %1551, float %1552, float %1553, float %1554, float %1555, float %1556, float %1557, float %1558, float %1559, float %1560, float %1561, float %1562, float %1563, float %1564, float %1565, float %1566, float %1567, float %1568, float %1569, float %1570, float %1571, float %1572, float %1573, float %1574, float %1575, float %1576, float %1577, i64 %1540, i64 %1545, i1 true) #3, !dbg !114 + %1579 = add i32 %675, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1580 = lshr exact i32 %1579, 4, !dbg !114 + %1581 = and i32 %1580, 16383, !dbg !114 + %1582 = zext nneg i32 %1581 to i64, !dbg !114 + %1583 = or disjoint i64 %1582, 4611686293372403712, !dbg !114 + %1584 = add i32 %1444, 8192, !dbg !114 + %1585 = lshr exact i32 %1584, 4, !dbg !114 + %1586 = and i32 %1585, 16383, !dbg !114 + %1587 = zext nneg i32 %1586 to i64, !dbg !114 + %1588 = or disjoint i64 %1587, 4611686293338849280, !dbg !114 + %1589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 0, !dbg !114 + %1590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 1, !dbg !114 + %1591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 2, !dbg !114 + %1592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 3, !dbg !114 + %1593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 4, !dbg !114 + %1594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 5, !dbg !114 + %1595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 6, !dbg !114 + %1596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 7, !dbg !114 + %1597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 8, !dbg !114 + %1598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 9, !dbg !114 + %1599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 10, !dbg !114 + %1600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 11, !dbg !114 + %1601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 12, !dbg !114 + %1602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 13, !dbg !114 + %1603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 14, !dbg !114 + %1604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 15, !dbg !114 + %1605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 16, !dbg !114 + %1606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 17, !dbg !114 + %1607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 18, !dbg !114 + %1608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 19, !dbg !114 + %1609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 20, !dbg !114 + %1610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 21, !dbg !114 + %1611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 22, !dbg !114 + %1612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 23, !dbg !114 + %1613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 24, !dbg !114 + %1614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 25, !dbg !114 + %1615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 26, !dbg !114 + %1616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 27, !dbg !114 + %1617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 28, !dbg !114 + %1618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 29, !dbg !114 + %1619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 30, !dbg !114 + %1620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 31, !dbg !114 + %1621 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1589, float %1590, float %1591, float %1592, float %1593, float %1594, float %1595, float %1596, float %1597, float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, i64 %1583, i64 %1588, i1 true) #3, !dbg !114 + %1622 = add i32 %719, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1623 = lshr exact i32 %1622, 4, !dbg !114 + %1624 = and i32 %1623, 16383, !dbg !114 + %1625 = zext nneg i32 %1624 to i64, !dbg !114 + %1626 = or disjoint i64 %1625, 4611686293372403712, !dbg !114 + %1627 = add i32 %1444, 8224, !dbg !114 + %1628 = lshr exact i32 %1627, 4, !dbg !114 + %1629 = and i32 %1628, 16383, !dbg !114 + %1630 = zext nneg i32 %1629 to i64, !dbg !114 + %1631 = or disjoint i64 %1630, 4611686293338849280, !dbg !114 + %1632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 0, !dbg !114 + %1633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 1, !dbg !114 + %1634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 2, !dbg !114 + %1635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 3, !dbg !114 + %1636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 4, !dbg !114 + %1637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 5, !dbg !114 + %1638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 6, !dbg !114 + %1639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 7, !dbg !114 + %1640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 8, !dbg !114 + %1641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 9, !dbg !114 + %1642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 10, !dbg !114 + %1643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 11, !dbg !114 + %1644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 12, !dbg !114 + %1645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 13, !dbg !114 + %1646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 14, !dbg !114 + %1647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 15, !dbg !114 + %1648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 16, !dbg !114 + %1649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 17, !dbg !114 + %1650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 18, !dbg !114 + %1651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 19, !dbg !114 + %1652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 20, !dbg !114 + %1653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 21, !dbg !114 + %1654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 22, !dbg !114 + %1655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 23, !dbg !114 + %1656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 24, !dbg !114 + %1657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 25, !dbg !114 + %1658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 26, !dbg !114 + %1659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 27, !dbg !114 + %1660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 28, !dbg !114 + %1661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 29, !dbg !114 + %1662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 30, !dbg !114 + %1663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 31, !dbg !114 + %1664 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, float %1662, float %1663, i64 %1626, i64 %1631, i1 true) #3, !dbg !114 + %1665 = add i32 %763, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1666 = lshr exact i32 %1665, 4, !dbg !114 + %1667 = and i32 %1666, 16383, !dbg !114 + %1668 = zext nneg i32 %1667 to i64, !dbg !114 + %1669 = or disjoint i64 %1668, 4611686293372403712, !dbg !114 + %1670 = add i32 %1444, 8256, !dbg !114 + %1671 = lshr exact i32 %1670, 4, !dbg !114 + %1672 = and i32 %1671, 16383, !dbg !114 + %1673 = zext nneg i32 %1672 to i64, !dbg !114 + %1674 = or disjoint i64 %1673, 4611686293338849280, !dbg !114 + %1675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 0, !dbg !114 + %1676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 1, !dbg !114 + %1677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 2, !dbg !114 + %1678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 3, !dbg !114 + %1679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 4, !dbg !114 + %1680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 5, !dbg !114 + %1681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 6, !dbg !114 + %1682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 7, !dbg !114 + %1683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 8, !dbg !114 + %1684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 9, !dbg !114 + %1685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 10, !dbg !114 + %1686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 11, !dbg !114 + %1687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 12, !dbg !114 + %1688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 13, !dbg !114 + %1689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 14, !dbg !114 + %1690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 15, !dbg !114 + %1691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 16, !dbg !114 + %1692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 17, !dbg !114 + %1693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 18, !dbg !114 + %1694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 19, !dbg !114 + %1695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 20, !dbg !114 + %1696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 21, !dbg !114 + %1697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 22, !dbg !114 + %1698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 23, !dbg !114 + %1699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 24, !dbg !114 + %1700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 25, !dbg !114 + %1701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 26, !dbg !114 + %1702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 27, !dbg !114 + %1703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 28, !dbg !114 + %1704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 29, !dbg !114 + %1705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 30, !dbg !114 + %1706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 31, !dbg !114 + %1707 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1675, float %1676, float %1677, float %1678, float %1679, float %1680, float %1681, float %1682, float %1683, float %1684, float %1685, float %1686, float %1687, float %1688, float %1689, float %1690, float %1691, float %1692, float %1693, float %1694, float %1695, float %1696, float %1697, float %1698, float %1699, float %1700, float %1701, float %1702, float %1703, float %1704, float %1705, float %1706, i64 %1669, i64 %1674, i1 true) #3, !dbg !114 + %1708 = add i32 %807, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1709 = lshr exact i32 %1708, 4, !dbg !114 + %1710 = and i32 %1709, 16383, !dbg !114 + %1711 = zext nneg i32 %1710 to i64, !dbg !114 + %1712 = or disjoint i64 %1711, 4611686293372403712, !dbg !114 + %1713 = add i32 %1444, 8288, !dbg !114 + %1714 = lshr exact i32 %1713, 4, !dbg !114 + %1715 = and i32 %1714, 16383, !dbg !114 + %1716 = zext nneg i32 %1715 to i64, !dbg !114 + %1717 = or disjoint i64 %1716, 4611686293338849280, !dbg !114 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 0, !dbg !114 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 1, !dbg !114 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 2, !dbg !114 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 3, !dbg !114 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 4, !dbg !114 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 5, !dbg !114 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 6, !dbg !114 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 7, !dbg !114 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 8, !dbg !114 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 9, !dbg !114 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 10, !dbg !114 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 11, !dbg !114 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 12, !dbg !114 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 13, !dbg !114 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 14, !dbg !114 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 15, !dbg !114 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 16, !dbg !114 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 17, !dbg !114 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 18, !dbg !114 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 19, !dbg !114 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 20, !dbg !114 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 21, !dbg !114 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 22, !dbg !114 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 23, !dbg !114 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 24, !dbg !114 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 25, !dbg !114 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 26, !dbg !114 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 27, !dbg !114 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 28, !dbg !114 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 29, !dbg !114 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 30, !dbg !114 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 31, !dbg !114 + %1750 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, i64 %1712, i64 %1717, i1 true) #3, !dbg !114 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 0, !dbg !114 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 1, !dbg !114 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 2, !dbg !114 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 3, !dbg !114 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 4, !dbg !114 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 5, !dbg !114 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 6, !dbg !114 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 7, !dbg !114 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 8, !dbg !114 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 9, !dbg !114 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 10, !dbg !114 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 11, !dbg !114 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 12, !dbg !114 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 13, !dbg !114 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 14, !dbg !114 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 15, !dbg !114 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 16, !dbg !114 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 17, !dbg !114 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 18, !dbg !114 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 19, !dbg !114 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 20, !dbg !114 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 21, !dbg !114 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 22, !dbg !114 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 23, !dbg !114 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 24, !dbg !114 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 25, !dbg !114 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 26, !dbg !114 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 27, !dbg !114 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 28, !dbg !114 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 29, !dbg !114 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 30, !dbg !114 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 31, !dbg !114 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !114 + %1783 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %1438, i32 0, i32 0) #3, !dbg !114 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 0, !dbg !114 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 1, !dbg !114 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 2, !dbg !114 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 3, !dbg !114 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 4, !dbg !114 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 5, !dbg !114 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 6, !dbg !114 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 7, !dbg !114 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 8, !dbg !114 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 9, !dbg !114 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 10, !dbg !114 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 11, !dbg !114 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 12, !dbg !114 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 13, !dbg !114 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 14, !dbg !114 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 15, !dbg !114 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 16, !dbg !114 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 17, !dbg !114 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 18, !dbg !114 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 19, !dbg !114 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 20, !dbg !114 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 21, !dbg !114 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 22, !dbg !114 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 23, !dbg !114 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 24, !dbg !114 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 25, !dbg !114 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 26, !dbg !114 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 27, !dbg !114 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 28, !dbg !114 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 29, !dbg !114 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 30, !dbg !114 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 31, !dbg !114 + %1816 = fsub float %1784, %345, !dbg !115 + %1817 = fsub float %1785, %345, !dbg !115 + %1818 = fsub float %1786, %347, !dbg !115 + %1819 = fsub float %1787, %347, !dbg !115 + %1820 = fsub float %1788, %345, !dbg !115 + %1821 = fsub float %1789, %345, !dbg !115 + %1822 = fsub float %1790, %347, !dbg !115 + %1823 = fsub float %1791, %347, !dbg !115 + %1824 = fsub float %1792, %345, !dbg !115 + %1825 = fsub float %1793, %345, !dbg !115 + %1826 = fsub float %1794, %347, !dbg !115 + %1827 = fsub float %1795, %347, !dbg !115 + %1828 = fsub float %1796, %345, !dbg !115 + %1829 = fsub float %1797, %345, !dbg !115 + %1830 = fsub float %1798, %347, !dbg !115 + %1831 = fsub float %1799, %347, !dbg !115 + %1832 = fsub float %1800, %345, !dbg !115 + %1833 = fsub float %1801, %345, !dbg !115 + %1834 = fsub float %1802, %347, !dbg !115 + %1835 = fsub float %1803, %347, !dbg !115 + %1836 = fsub float %1804, %345, !dbg !115 + %1837 = fsub float %1805, %345, !dbg !115 + %1838 = fsub float %1806, %347, !dbg !115 + %1839 = fsub float %1807, %347, !dbg !115 + %1840 = fsub float %1808, %345, !dbg !115 + %1841 = fsub float %1809, %345, !dbg !115 + %1842 = fsub float %1810, %347, !dbg !115 + %1843 = fsub float %1811, %347, !dbg !115 + %1844 = fsub float %1812, %345, !dbg !115 + %1845 = fsub float %1813, %345, !dbg !115 + %1846 = fsub float %1814, %347, !dbg !115 + %1847 = fsub float %1815, %347, !dbg !115 + %1848 = fmul float %.0.i1413, %1816, !dbg !116 + %1849 = fmul float %.0.i1416, %1817, !dbg !116 + %1850 = fmul float %.0.i1419, %1818, !dbg !116 + %1851 = fmul float %.0.i1422, %1819, !dbg !116 + %1852 = fmul float %.0.i1425, %1820, !dbg !116 + %1853 = fmul float %.0.i1428, %1821, !dbg !116 + %1854 = fmul float %.0.i1431, %1822, !dbg !116 + %1855 = fmul float %.0.i1434, %1823, !dbg !116 + %1856 = fmul float %.0.i1437, %1824, !dbg !116 + %1857 = fmul float %.0.i1440, %1825, !dbg !116 + %1858 = fmul float %.0.i1443, %1826, !dbg !116 + %1859 = fmul float %.0.i1446, %1827, !dbg !116 + %1860 = fmul float %.0.i1449, %1828, !dbg !116 + %1861 = fmul float %.0.i1452, %1829, !dbg !116 + %1862 = fmul float %.0.i1455, %1830, !dbg !116 + %1863 = fmul float %.0.i1458, %1831, !dbg !116 + %1864 = fmul float %.0.i1461, %1832, !dbg !116 + %1865 = fmul float %.0.i1464, %1833, !dbg !116 + %1866 = fmul float %.0.i1467, %1834, !dbg !116 + %1867 = fmul float %.0.i1470, %1835, !dbg !116 + %1868 = fmul float %.0.i1473, %1836, !dbg !116 + %1869 = fmul float %.0.i1476, %1837, !dbg !116 + %1870 = fmul float %.0.i1479, %1838, !dbg !116 + %1871 = fmul float %.0.i1482, %1839, !dbg !116 + %1872 = fmul float %.0.i1485, %1840, !dbg !116 + %1873 = fmul float %.0.i1488, %1841, !dbg !116 + %1874 = fmul float %.0.i1491, %1842, !dbg !116 + %1875 = fmul float %.0.i1494, %1843, !dbg !116 + %1876 = fmul float %.0.i1497, %1844, !dbg !116 + %1877 = fmul float %.0.i1500, %1845, !dbg !116 + %1878 = fmul float %.0.i1503, %1846, !dbg !116 + %1879 = fmul float %.0.i1506, %1847, !dbg !116 + %1880 = fptrunc float %1848 to bfloat, !dbg !117 + %1881 = select i1 %1150, bfloat %1880, bfloat 0xR0000, !dbg !118 + %1882 = fptrunc float %1849 to bfloat, !dbg !117 + %1883 = select i1 %1151, bfloat %1882, bfloat 0xR0000, !dbg !118 + %1884 = fptrunc float %1850 to bfloat, !dbg !117 + %1885 = select i1 %1152, bfloat %1884, bfloat 0xR0000, !dbg !118 + %1886 = fptrunc float %1851 to bfloat, !dbg !117 + %1887 = select i1 %1153, bfloat %1886, bfloat 0xR0000, !dbg !118 + %1888 = fptrunc float %1852 to bfloat, !dbg !117 + %1889 = select i1 %1154, bfloat %1888, bfloat 0xR0000, !dbg !118 + %1890 = fptrunc float %1853 to bfloat, !dbg !117 + %1891 = select i1 %1155, bfloat %1890, bfloat 0xR0000, !dbg !118 + %1892 = fptrunc float %1854 to bfloat, !dbg !117 + %1893 = select i1 %1156, bfloat %1892, bfloat 0xR0000, !dbg !118 + %1894 = fptrunc float %1855 to bfloat, !dbg !117 + %1895 = select i1 %1157, bfloat %1894, bfloat 0xR0000, !dbg !118 + %1896 = fptrunc float %1856 to bfloat, !dbg !117 + %1897 = select i1 %1158, bfloat %1896, bfloat 0xR0000, !dbg !118 + %1898 = fptrunc float %1857 to bfloat, !dbg !117 + %1899 = select i1 %1159, bfloat %1898, bfloat 0xR0000, !dbg !118 + %1900 = fptrunc float %1858 to bfloat, !dbg !117 + %1901 = select i1 %1160, bfloat %1900, bfloat 0xR0000, !dbg !118 + %1902 = fptrunc float %1859 to bfloat, !dbg !117 + %1903 = select i1 %1161, bfloat %1902, bfloat 0xR0000, !dbg !118 + %1904 = fptrunc float %1860 to bfloat, !dbg !117 + %1905 = select i1 %1162, bfloat %1904, bfloat 0xR0000, !dbg !118 + %1906 = fptrunc float %1861 to bfloat, !dbg !117 + %1907 = select i1 %1163, bfloat %1906, bfloat 0xR0000, !dbg !118 + %1908 = fptrunc float %1862 to bfloat, !dbg !117 + %1909 = select i1 %1164, bfloat %1908, bfloat 0xR0000, !dbg !118 + %1910 = fptrunc float %1863 to bfloat, !dbg !117 + %1911 = select i1 %1165, bfloat %1910, bfloat 0xR0000, !dbg !118 + %1912 = fptrunc float %1864 to bfloat, !dbg !117 + %1913 = select i1 %1166, bfloat %1912, bfloat 0xR0000, !dbg !118 + %1914 = fptrunc float %1865 to bfloat, !dbg !117 + %1915 = select i1 %1167, bfloat %1914, bfloat 0xR0000, !dbg !118 + %1916 = fptrunc float %1866 to bfloat, !dbg !117 + %1917 = select i1 %1168, bfloat %1916, bfloat 0xR0000, !dbg !118 + %1918 = fptrunc float %1867 to bfloat, !dbg !117 + %1919 = select i1 %1169, bfloat %1918, bfloat 0xR0000, !dbg !118 + %1920 = fptrunc float %1868 to bfloat, !dbg !117 + %1921 = select i1 %1170, bfloat %1920, bfloat 0xR0000, !dbg !118 + %1922 = fptrunc float %1869 to bfloat, !dbg !117 + %1923 = select i1 %1171, bfloat %1922, bfloat 0xR0000, !dbg !118 + %1924 = fptrunc float %1870 to bfloat, !dbg !117 + %1925 = select i1 %1172, bfloat %1924, bfloat 0xR0000, !dbg !118 + %1926 = fptrunc float %1871 to bfloat, !dbg !117 + %1927 = select i1 %1173, bfloat %1926, bfloat 0xR0000, !dbg !118 + %1928 = fptrunc float %1872 to bfloat, !dbg !117 + %1929 = select i1 %1174, bfloat %1928, bfloat 0xR0000, !dbg !118 + %1930 = fptrunc float %1873 to bfloat, !dbg !117 + %1931 = select i1 %1175, bfloat %1930, bfloat 0xR0000, !dbg !118 + %1932 = fptrunc float %1874 to bfloat, !dbg !117 + %1933 = select i1 %1176, bfloat %1932, bfloat 0xR0000, !dbg !118 + %1934 = fptrunc float %1875 to bfloat, !dbg !117 + %1935 = select i1 %1177, bfloat %1934, bfloat 0xR0000, !dbg !118 + %1936 = fptrunc float %1876 to bfloat, !dbg !117 + %1937 = select i1 %1178, bfloat %1936, bfloat 0xR0000, !dbg !118 + %1938 = fptrunc float %1877 to bfloat, !dbg !117 + %1939 = select i1 %1179, bfloat %1938, bfloat 0xR0000, !dbg !118 + %1940 = fptrunc float %1878 to bfloat, !dbg !117 + %1941 = select i1 %1180, bfloat %1940, bfloat 0xR0000, !dbg !118 + %1942 = fptrunc float %1879 to bfloat, !dbg !117 + %1943 = select i1 %1181, bfloat %1942, bfloat 0xR0000, !dbg !118 + %1944 = insertelement <2 x bfloat> poison, bfloat %1881, i64 0, !dbg !119 + %1945 = insertelement <2 x bfloat> %1944, bfloat %1883, i64 1, !dbg !119 + %1946 = bitcast <2 x bfloat> %1945 to i32, !dbg !119 + %1947 = insertelement <2 x bfloat> poison, bfloat %1885, i64 0, !dbg !119 + %1948 = insertelement <2 x bfloat> %1947, bfloat %1887, i64 1, !dbg !119 + %1949 = bitcast <2 x bfloat> %1948 to i32, !dbg !119 + %1950 = insertelement <2 x bfloat> poison, bfloat %1889, i64 0, !dbg !119 + %1951 = insertelement <2 x bfloat> %1950, bfloat %1891, i64 1, !dbg !119 + %1952 = bitcast <2 x bfloat> %1951 to i32, !dbg !119 + %1953 = insertelement <2 x bfloat> poison, bfloat %1893, i64 0, !dbg !119 + %1954 = insertelement <2 x bfloat> %1953, bfloat %1895, i64 1, !dbg !119 + %1955 = bitcast <2 x bfloat> %1954 to i32, !dbg !119 + %1956 = insertelement <2 x bfloat> poison, bfloat %1897, i64 0, !dbg !119 + %1957 = insertelement <2 x bfloat> %1956, bfloat %1899, i64 1, !dbg !119 + %1958 = bitcast <2 x bfloat> %1957 to i32, !dbg !119 + %1959 = insertelement <2 x bfloat> poison, bfloat %1901, i64 0, !dbg !119 + %1960 = insertelement <2 x bfloat> %1959, bfloat %1903, i64 1, !dbg !119 + %1961 = bitcast <2 x bfloat> %1960 to i32, !dbg !119 + %1962 = insertelement <2 x bfloat> poison, bfloat %1905, i64 0, !dbg !119 + %1963 = insertelement <2 x bfloat> %1962, bfloat %1907, i64 1, !dbg !119 + %1964 = bitcast <2 x bfloat> %1963 to i32, !dbg !119 + %1965 = insertelement <2 x bfloat> poison, bfloat %1909, i64 0, !dbg !119 + %1966 = insertelement <2 x bfloat> %1965, bfloat %1911, i64 1, !dbg !119 + %1967 = bitcast <2 x bfloat> %1966 to i32, !dbg !119 + %1968 = insertelement <2 x bfloat> poison, bfloat %1913, i64 0, !dbg !119 + %1969 = insertelement <2 x bfloat> %1968, bfloat %1915, i64 1, !dbg !119 + %1970 = bitcast <2 x bfloat> %1969 to i32, !dbg !119 + %1971 = insertelement <2 x bfloat> poison, bfloat %1917, i64 0, !dbg !119 + %1972 = insertelement <2 x bfloat> %1971, bfloat %1919, i64 1, !dbg !119 + %1973 = bitcast <2 x bfloat> %1972 to i32, !dbg !119 + %1974 = insertelement <2 x bfloat> poison, bfloat %1921, i64 0, !dbg !119 + %1975 = insertelement <2 x bfloat> %1974, bfloat %1923, i64 1, !dbg !119 + %1976 = bitcast <2 x bfloat> %1975 to i32, !dbg !119 + %1977 = insertelement <2 x bfloat> poison, bfloat %1925, i64 0, !dbg !119 + %1978 = insertelement <2 x bfloat> %1977, bfloat %1927, i64 1, !dbg !119 + %1979 = bitcast <2 x bfloat> %1978 to i32, !dbg !119 + %1980 = insertelement <2 x bfloat> poison, bfloat %1929, i64 0, !dbg !119 + %1981 = insertelement <2 x bfloat> %1980, bfloat %1931, i64 1, !dbg !119 + %1982 = bitcast <2 x bfloat> %1981 to i32, !dbg !119 + %1983 = insertelement <2 x bfloat> poison, bfloat %1933, i64 0, !dbg !119 + %1984 = insertelement <2 x bfloat> %1983, bfloat %1935, i64 1, !dbg !119 + %1985 = bitcast <2 x bfloat> %1984 to i32, !dbg !119 + %1986 = insertelement <2 x bfloat> poison, bfloat %1937, i64 0, !dbg !119 + %1987 = insertelement <2 x bfloat> %1986, bfloat %1939, i64 1, !dbg !119 + %1988 = bitcast <2 x bfloat> %1987 to i32, !dbg !119 + %1989 = insertelement <2 x bfloat> poison, bfloat %1941, i64 0, !dbg !119 + %1990 = insertelement <2 x bfloat> %1989, bfloat %1943, i64 1, !dbg !119 + %1991 = bitcast <2 x bfloat> %1990 to i32, !dbg !119 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !119 + %1992 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, float %482, float %483, float %484, float %485, float %486, float %487, float %488, float %489, float %490, float %491, float %492, float %493, float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, i32 %1946, i32 %1949, i32 %1952, i32 %1955, i64 %541, i1 true) #3, !dbg !119 + %1993 = add i32 %537, 2048, !dbg !119 + %1994 = lshr exact i32 %1993, 4, !dbg !119 + %1995 = and i32 %1994, 16383, !dbg !119 + %1996 = zext nneg i32 %1995 to i64, !dbg !119 + %1997 = or disjoint i64 %1996, 4611686293338849280, !dbg !119 + %1998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 0, !dbg !119 + %1999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 1, !dbg !119 + %2000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 2, !dbg !119 + %2001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 3, !dbg !119 + %2002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 4, !dbg !119 + %2003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 5, !dbg !119 + %2004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 6, !dbg !119 + %2005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 7, !dbg !119 + %2006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 8, !dbg !119 + %2007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 9, !dbg !119 + %2008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 10, !dbg !119 + %2009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 11, !dbg !119 + %2010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 12, !dbg !119 + %2011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 13, !dbg !119 + %2012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 14, !dbg !119 + %2013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 15, !dbg !119 + %2014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 16, !dbg !119 + %2015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 17, !dbg !119 + %2016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 18, !dbg !119 + %2017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 19, !dbg !119 + %2018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 20, !dbg !119 + %2019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 21, !dbg !119 + %2020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 22, !dbg !119 + %2021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 23, !dbg !119 + %2022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 24, !dbg !119 + %2023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 25, !dbg !119 + %2024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 26, !dbg !119 + %2025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 27, !dbg !119 + %2026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 28, !dbg !119 + %2027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 29, !dbg !119 + %2028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 30, !dbg !119 + %2029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 31, !dbg !119 + %2030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 32, !dbg !119 + %2031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 33, !dbg !119 + %2032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 34, !dbg !119 + %2033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 35, !dbg !119 + %2034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 36, !dbg !119 + %2035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 37, !dbg !119 + %2036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 38, !dbg !119 + %2037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 39, !dbg !119 + %2038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 40, !dbg !119 + %2039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 41, !dbg !119 + %2040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 42, !dbg !119 + %2041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 43, !dbg !119 + %2042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 44, !dbg !119 + %2043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 45, !dbg !119 + %2044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 46, !dbg !119 + %2045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 47, !dbg !119 + %2046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 48, !dbg !119 + %2047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 49, !dbg !119 + %2048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 50, !dbg !119 + %2049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 51, !dbg !119 + %2050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 52, !dbg !119 + %2051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 53, !dbg !119 + %2052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 54, !dbg !119 + %2053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 55, !dbg !119 + %2054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 56, !dbg !119 + %2055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 57, !dbg !119 + %2056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 58, !dbg !119 + %2057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 59, !dbg !119 + %2058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 60, !dbg !119 + %2059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 61, !dbg !119 + %2060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 62, !dbg !119 + %2061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 63, !dbg !119 + %2062 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1998, float %1999, float %2000, float %2001, float %2002, float %2003, float %2004, float %2005, float %2006, float %2007, float %2008, float %2009, float %2010, float %2011, float %2012, float %2013, float %2014, float %2015, float %2016, float %2017, float %2018, float %2019, float %2020, float %2021, float %2022, float %2023, float %2024, float %2025, float %2026, float %2027, float %2028, float %2029, float %2030, float %2031, float %2032, float %2033, float %2034, float %2035, float %2036, float %2037, float %2038, float %2039, float %2040, float %2041, float %2042, float %2043, float %2044, float %2045, float %2046, float %2047, float %2048, float %2049, float %2050, float %2051, float %2052, float %2053, float %2054, float %2055, float %2056, float %2057, float %2058, float %2059, float %2060, float %2061, i32 %1958, i32 %1961, i32 %1964, i32 %1967, i64 %1997, i1 true) #3, !dbg !119 + %2063 = add i32 %537, 4096, !dbg !119 + %2064 = lshr exact i32 %2063, 4, !dbg !119 + %2065 = and i32 %2064, 16383, !dbg !119 + %2066 = zext nneg i32 %2065 to i64, !dbg !119 + %2067 = or disjoint i64 %2066, 4611686293338849280, !dbg !119 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 0, !dbg !119 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 1, !dbg !119 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 2, !dbg !119 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 3, !dbg !119 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 4, !dbg !119 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 5, !dbg !119 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 6, !dbg !119 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 7, !dbg !119 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 8, !dbg !119 + %2077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 9, !dbg !119 + %2078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 10, !dbg !119 + %2079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 11, !dbg !119 + %2080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 12, !dbg !119 + %2081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 13, !dbg !119 + %2082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 14, !dbg !119 + %2083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 15, !dbg !119 + %2084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 16, !dbg !119 + %2085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 17, !dbg !119 + %2086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 18, !dbg !119 + %2087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 19, !dbg !119 + %2088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 20, !dbg !119 + %2089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 21, !dbg !119 + %2090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 22, !dbg !119 + %2091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 23, !dbg !119 + %2092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 24, !dbg !119 + %2093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 25, !dbg !119 + %2094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 26, !dbg !119 + %2095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 27, !dbg !119 + %2096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 28, !dbg !119 + %2097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 29, !dbg !119 + %2098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 30, !dbg !119 + %2099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 31, !dbg !119 + %2100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 32, !dbg !119 + %2101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 33, !dbg !119 + %2102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 34, !dbg !119 + %2103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 35, !dbg !119 + %2104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 36, !dbg !119 + %2105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 37, !dbg !119 + %2106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 38, !dbg !119 + %2107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 39, !dbg !119 + %2108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 40, !dbg !119 + %2109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 41, !dbg !119 + %2110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 42, !dbg !119 + %2111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 43, !dbg !119 + %2112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 44, !dbg !119 + %2113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 45, !dbg !119 + %2114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 46, !dbg !119 + %2115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 47, !dbg !119 + %2116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 48, !dbg !119 + %2117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 49, !dbg !119 + %2118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 50, !dbg !119 + %2119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 51, !dbg !119 + %2120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 52, !dbg !119 + %2121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 53, !dbg !119 + %2122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 54, !dbg !119 + %2123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 55, !dbg !119 + %2124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 56, !dbg !119 + %2125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 57, !dbg !119 + %2126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 58, !dbg !119 + %2127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 59, !dbg !119 + %2128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 60, !dbg !119 + %2129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 61, !dbg !119 + %2130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 62, !dbg !119 + %2131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 63, !dbg !119 + %2132 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2068, float %2069, float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, float %2080, float %2081, float %2082, float %2083, float %2084, float %2085, float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110, float %2111, float %2112, float %2113, float %2114, float %2115, float %2116, float %2117, float %2118, float %2119, float %2120, float %2121, float %2122, float %2123, float %2124, float %2125, float %2126, float %2127, float %2128, float %2129, float %2130, float %2131, i32 %1970, i32 %1973, i32 %1976, i32 %1979, i64 %2067, i1 true) #3, !dbg !119 + %2133 = add i32 %537, 6144, !dbg !119 + %2134 = lshr exact i32 %2133, 4, !dbg !119 + %2135 = and i32 %2134, 16383, !dbg !119 + %2136 = zext nneg i32 %2135 to i64, !dbg !119 + %2137 = or disjoint i64 %2136, 4611686293338849280, !dbg !119 + %2138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 0, !dbg !119 + %2139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 1, !dbg !119 + %2140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 2, !dbg !119 + %2141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 3, !dbg !119 + %2142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 4, !dbg !119 + %2143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 5, !dbg !119 + %2144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 6, !dbg !119 + %2145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 7, !dbg !119 + %2146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 8, !dbg !119 + %2147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 9, !dbg !119 + %2148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 10, !dbg !119 + %2149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 11, !dbg !119 + %2150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 12, !dbg !119 + %2151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 13, !dbg !119 + %2152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 14, !dbg !119 + %2153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 15, !dbg !119 + %2154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 16, !dbg !119 + %2155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 17, !dbg !119 + %2156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 18, !dbg !119 + %2157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 19, !dbg !119 + %2158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 20, !dbg !119 + %2159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 21, !dbg !119 + %2160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 22, !dbg !119 + %2161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 23, !dbg !119 + %2162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 24, !dbg !119 + %2163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 25, !dbg !119 + %2164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 26, !dbg !119 + %2165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 27, !dbg !119 + %2166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 28, !dbg !119 + %2167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 29, !dbg !119 + %2168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 30, !dbg !119 + %2169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 31, !dbg !119 + %2170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 32, !dbg !119 + %2171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 33, !dbg !119 + %2172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 34, !dbg !119 + %2173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 35, !dbg !119 + %2174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 36, !dbg !119 + %2175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 37, !dbg !119 + %2176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 38, !dbg !119 + %2177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 39, !dbg !119 + %2178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 40, !dbg !119 + %2179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 41, !dbg !119 + %2180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 42, !dbg !119 + %2181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 43, !dbg !119 + %2182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 44, !dbg !119 + %2183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 45, !dbg !119 + %2184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 46, !dbg !119 + %2185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 47, !dbg !119 + %2186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 48, !dbg !119 + %2187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 49, !dbg !119 + %2188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 50, !dbg !119 + %2189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 51, !dbg !119 + %2190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 52, !dbg !119 + %2191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 53, !dbg !119 + %2192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 54, !dbg !119 + %2193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 55, !dbg !119 + %2194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 56, !dbg !119 + %2195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 57, !dbg !119 + %2196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 58, !dbg !119 + %2197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 59, !dbg !119 + %2198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 60, !dbg !119 + %2199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 61, !dbg !119 + %2200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 62, !dbg !119 + %2201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 63, !dbg !119 + %2202 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2138, float %2139, float %2140, float %2141, float %2142, float %2143, float %2144, float %2145, float %2146, float %2147, float %2148, float %2149, float %2150, float %2151, float %2152, float %2153, float %2154, float %2155, float %2156, float %2157, float %2158, float %2159, float %2160, float %2161, float %2162, float %2163, float %2164, float %2165, float %2166, float %2167, float %2168, float %2169, float %2170, float %2171, float %2172, float %2173, float %2174, float %2175, float %2176, float %2177, float %2178, float %2179, float %2180, float %2181, float %2182, float %2183, float %2184, float %2185, float %2186, float %2187, float %2188, float %2189, float %2190, float %2191, float %2192, float %2193, float %2194, float %2195, float %2196, float %2197, float %2198, float %2199, float %2200, float %2201, i32 %1982, i32 %1985, i32 %1988, i32 %1991, i64 %2137, i1 true) #3, !dbg !119 + %2203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 0, !dbg !119 + %2204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 1, !dbg !119 + %2205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 2, !dbg !119 + %2206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 3, !dbg !119 + %2207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 4, !dbg !119 + %2208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 5, !dbg !119 + %2209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 6, !dbg !119 + %2210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 7, !dbg !119 + %2211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 8, !dbg !119 + %2212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 9, !dbg !119 + %2213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 10, !dbg !119 + %2214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 11, !dbg !119 + %2215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 12, !dbg !119 + %2216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 13, !dbg !119 + %2217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 14, !dbg !119 + %2218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 15, !dbg !119 + %2219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 16, !dbg !119 + %2220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 17, !dbg !119 + %2221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 18, !dbg !119 + %2222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 19, !dbg !119 + %2223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 20, !dbg !119 + %2224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 21, !dbg !119 + %2225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 22, !dbg !119 + %2226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 23, !dbg !119 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 24, !dbg !119 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 25, !dbg !119 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 26, !dbg !119 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 27, !dbg !119 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 28, !dbg !119 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 29, !dbg !119 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 30, !dbg !119 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 31, !dbg !119 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 32, !dbg !119 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 33, !dbg !119 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 34, !dbg !119 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 35, !dbg !119 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 36, !dbg !119 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 37, !dbg !119 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 38, !dbg !119 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 39, !dbg !119 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 40, !dbg !119 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 41, !dbg !119 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 42, !dbg !119 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 43, !dbg !119 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 44, !dbg !119 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 45, !dbg !119 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 46, !dbg !119 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 47, !dbg !119 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 48, !dbg !119 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 49, !dbg !119 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 50, !dbg !119 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 51, !dbg !119 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 52, !dbg !119 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 53, !dbg !119 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 54, !dbg !119 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 55, !dbg !119 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 56, !dbg !119 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 57, !dbg !119 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 58, !dbg !119 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 59, !dbg !119 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 60, !dbg !119 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 61, !dbg !119 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 62, !dbg !119 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 63, !dbg !119 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !119 + %2267 = insertelement <16 x i32> poison, i32 %452, i64 0, !dbg !120 + %2268 = shufflevector <16 x i32> %2267, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !120 + %2269 = add <16 x i32> %2268, %520, !dbg !120 + %2270 = add nuw nsw i32 %519, 1, !dbg !84 + %2271 = lshr i32 %2270, 1, !dbg !121 + %2272 = zext nneg i32 %2271 to i64, !dbg !122 + %2273 = getelementptr i32, ptr addrspace(1) %359, i64 %2272, !dbg !122 + %2274 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !123 + %2275 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2273, i64 %2274, i1 %523) #3, !dbg !123 + %2276 = add nuw nsw i32 %2271, 1, !dbg !124 + %2277 = icmp slt i32 %2276, %364, !dbg !125 + %2278 = getelementptr i8, ptr addrspace(1) %2273, i64 4, !dbg !126 + %2279 = and i1 %523, %2277, !dbg !84 + %2280 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !127 + %2281 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2278, i64 %2280, i1 %2279) #3, !dbg !127 + %2282 = and i32 %519, 1, !dbg !128 + %2283 = sub i32 %2281, %2275, !dbg !129 + %2284 = shl i32 %2283, 7, !dbg !130 + %2285 = add i32 %2284, -64, !dbg !131 + %2286 = xor i32 %2282, 1, !dbg !132 + %2287 = mul nuw nsw i32 %2285, %2286, !dbg !132 + %2288 = shl nuw nsw i32 %2282, 6, !dbg !133 + %2289 = add i32 %2287, %2288, !dbg !134 + %2290 = shl i32 %2289, 7, !dbg !135 + %2291 = sext i32 %2290 to i64, !dbg !88 + %2292 = getelementptr bfloat, ptr addrspace(1) %.pn9021821, i64 %2291, !dbg !88 + %2293 = getelementptr bfloat, ptr addrspace(1) %.pn8861822, i64 %2291, !dbg !88 + %2294 = getelementptr bfloat, ptr addrspace(1) %.pn8701823, i64 %2291, !dbg !88 + %2295 = getelementptr bfloat, ptr addrspace(1) %.pn8541824, i64 %2291, !dbg !88 + %2296 = getelementptr bfloat, ptr addrspace(1) %.pn9661825, i64 %2291, !dbg !89 + %2297 = getelementptr bfloat, ptr addrspace(1) %.pn9501826, i64 %2291, !dbg !89 + %2298 = getelementptr bfloat, ptr addrspace(1) %.pn9341827, i64 %2291, !dbg !89 + %2299 = getelementptr bfloat, ptr addrspace(1) %.pn9181828, i64 %2291, !dbg !89 + %2300 = add i32 %454, 1, !dbg !84 + %2301 = icmp sgt i32 %2300, 2, !dbg !84 + %2302 = select i1 %2301, i32 0, i32 %2300, !dbg !84 + %2303 = shl i32 %2302, 13, !dbg !87 + %2304 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2303, !dbg !87 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %2305 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %402, !dbg !87 + %2306 = select i1 %522, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2305, ptr addrspace(1) %2292, i32 %2306) #3, !dbg !87 + %2307 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2307, ptr addrspace(1) %2293, i32 %2306) #3, !dbg !87 + %2308 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2308, ptr addrspace(1) %2294, i32 %2306) #3, !dbg !87 + %2309 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2309, ptr addrspace(1) %2295, i32 %2306) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %2310 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2303, !dbg !87 + %2311 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2311, ptr addrspace(1) %2296, i32 %2306) #3, !dbg !87 + %2312 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2312, ptr addrspace(1) %2297, i32 %2306) #3, !dbg !87 + %2313 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2313, ptr addrspace(1) %2298, i32 %2306) #3, !dbg !87 + %2314 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2314, ptr addrspace(1) %2299, i32 %2306) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %exitcond2125.not = icmp eq i32 %2270, %433, !dbg !84 + br i1 %exitcond2125.not, label %._crit_edge1847, label %451, !dbg !84 + +._crit_edge1847: ; preds = %__nv_exp2f.exit1507, %53 + %2315 = phi float [ 0.000000e+00, %53 ], [ %2203, %__nv_exp2f.exit1507 ] + %2316 = phi float [ 0.000000e+00, %53 ], [ %2204, %__nv_exp2f.exit1507 ] + %2317 = phi float [ 0.000000e+00, %53 ], [ %2205, %__nv_exp2f.exit1507 ] + %2318 = phi float [ 0.000000e+00, %53 ], [ %2206, %__nv_exp2f.exit1507 ] + %2319 = phi float [ 0.000000e+00, %53 ], [ %2207, %__nv_exp2f.exit1507 ] + %2320 = phi float [ 0.000000e+00, %53 ], [ %2208, %__nv_exp2f.exit1507 ] + %2321 = phi float [ 0.000000e+00, %53 ], [ %2209, %__nv_exp2f.exit1507 ] + %2322 = phi float [ 0.000000e+00, %53 ], [ %2210, %__nv_exp2f.exit1507 ] + %2323 = phi float [ 0.000000e+00, %53 ], [ %2211, %__nv_exp2f.exit1507 ] + %2324 = phi float [ 0.000000e+00, %53 ], [ %2212, %__nv_exp2f.exit1507 ] + %2325 = phi float [ 0.000000e+00, %53 ], [ %2213, %__nv_exp2f.exit1507 ] + %2326 = phi float [ 0.000000e+00, %53 ], [ %2214, %__nv_exp2f.exit1507 ] + %2327 = phi float [ 0.000000e+00, %53 ], [ %2215, %__nv_exp2f.exit1507 ] + %2328 = phi float [ 0.000000e+00, %53 ], [ %2216, %__nv_exp2f.exit1507 ] + %2329 = phi float [ 0.000000e+00, %53 ], [ %2217, %__nv_exp2f.exit1507 ] + %2330 = phi float [ 0.000000e+00, %53 ], [ %2218, %__nv_exp2f.exit1507 ] + %2331 = phi float [ 0.000000e+00, %53 ], [ %2219, %__nv_exp2f.exit1507 ] + %2332 = phi float [ 0.000000e+00, %53 ], [ %2220, %__nv_exp2f.exit1507 ] + %2333 = phi float [ 0.000000e+00, %53 ], [ %2221, %__nv_exp2f.exit1507 ] + %2334 = phi float [ 0.000000e+00, %53 ], [ %2222, %__nv_exp2f.exit1507 ] + %2335 = phi float [ 0.000000e+00, %53 ], [ %2223, %__nv_exp2f.exit1507 ] + %2336 = phi float [ 0.000000e+00, %53 ], [ %2224, %__nv_exp2f.exit1507 ] + %2337 = phi float [ 0.000000e+00, %53 ], [ %2225, %__nv_exp2f.exit1507 ] + %2338 = phi float [ 0.000000e+00, %53 ], [ %2226, %__nv_exp2f.exit1507 ] + %2339 = phi float [ 0.000000e+00, %53 ], [ %2227, %__nv_exp2f.exit1507 ] + %2340 = phi float [ 0.000000e+00, %53 ], [ %2228, %__nv_exp2f.exit1507 ] + %2341 = phi float [ 0.000000e+00, %53 ], [ %2229, %__nv_exp2f.exit1507 ] + %2342 = phi float [ 0.000000e+00, %53 ], [ %2230, %__nv_exp2f.exit1507 ] + %2343 = phi float [ 0.000000e+00, %53 ], [ %2231, %__nv_exp2f.exit1507 ] + %2344 = phi float [ 0.000000e+00, %53 ], [ %2232, %__nv_exp2f.exit1507 ] + %2345 = phi float [ 0.000000e+00, %53 ], [ %2233, %__nv_exp2f.exit1507 ] + %2346 = phi float [ 0.000000e+00, %53 ], [ %2234, %__nv_exp2f.exit1507 ] + %2347 = phi float [ 0.000000e+00, %53 ], [ %2235, %__nv_exp2f.exit1507 ] + %2348 = phi float [ 0.000000e+00, %53 ], [ %2236, %__nv_exp2f.exit1507 ] + %2349 = phi float [ 0.000000e+00, %53 ], [ %2237, %__nv_exp2f.exit1507 ] + %2350 = phi float [ 0.000000e+00, %53 ], [ %2238, %__nv_exp2f.exit1507 ] + %2351 = phi float [ 0.000000e+00, %53 ], [ %2239, %__nv_exp2f.exit1507 ] + %2352 = phi float [ 0.000000e+00, %53 ], [ %2240, %__nv_exp2f.exit1507 ] + %2353 = phi float [ 0.000000e+00, %53 ], [ %2241, %__nv_exp2f.exit1507 ] + %2354 = phi float [ 0.000000e+00, %53 ], [ %2242, %__nv_exp2f.exit1507 ] + %2355 = phi float [ 0.000000e+00, %53 ], [ %2243, %__nv_exp2f.exit1507 ] + %2356 = phi float [ 0.000000e+00, %53 ], [ %2244, %__nv_exp2f.exit1507 ] + %2357 = phi float [ 0.000000e+00, %53 ], [ %2245, %__nv_exp2f.exit1507 ] + %2358 = phi float [ 0.000000e+00, %53 ], [ %2246, %__nv_exp2f.exit1507 ] + %2359 = phi float [ 0.000000e+00, %53 ], [ %2247, %__nv_exp2f.exit1507 ] + %2360 = phi float [ 0.000000e+00, %53 ], [ %2248, %__nv_exp2f.exit1507 ] + %2361 = phi float [ 0.000000e+00, %53 ], [ %2249, %__nv_exp2f.exit1507 ] + %2362 = phi float [ 0.000000e+00, %53 ], [ %2250, %__nv_exp2f.exit1507 ] + %2363 = phi float [ 0.000000e+00, %53 ], [ %2251, %__nv_exp2f.exit1507 ] + %2364 = phi float [ 0.000000e+00, %53 ], [ %2252, %__nv_exp2f.exit1507 ] + %2365 = phi float [ 0.000000e+00, %53 ], [ %2253, %__nv_exp2f.exit1507 ] + %2366 = phi float [ 0.000000e+00, %53 ], [ %2254, %__nv_exp2f.exit1507 ] + %2367 = phi float [ 0.000000e+00, %53 ], [ %2255, %__nv_exp2f.exit1507 ] + %2368 = phi float [ 0.000000e+00, %53 ], [ %2256, %__nv_exp2f.exit1507 ] + %2369 = phi float [ 0.000000e+00, %53 ], [ %2257, %__nv_exp2f.exit1507 ] + %2370 = phi float [ 0.000000e+00, %53 ], [ %2258, %__nv_exp2f.exit1507 ] + %2371 = phi float [ 0.000000e+00, %53 ], [ %2259, %__nv_exp2f.exit1507 ] + %2372 = phi float [ 0.000000e+00, %53 ], [ %2260, %__nv_exp2f.exit1507 ] + %2373 = phi float [ 0.000000e+00, %53 ], [ %2261, %__nv_exp2f.exit1507 ] + %2374 = phi float [ 0.000000e+00, %53 ], [ %2262, %__nv_exp2f.exit1507 ] + %2375 = phi float [ 0.000000e+00, %53 ], [ %2263, %__nv_exp2f.exit1507 ] + %2376 = phi float [ 0.000000e+00, %53 ], [ %2264, %__nv_exp2f.exit1507 ] + %2377 = phi float [ 0.000000e+00, %53 ], [ %2265, %__nv_exp2f.exit1507 ] + %2378 = phi float [ 0.000000e+00, %53 ], [ %2266, %__nv_exp2f.exit1507 ] + %2379 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2315, float %2316, float %2317, float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, float %2350, float %2351, float %2352, float %2353, float %2354, float %2355, float %2356, float %2357, float %2358, float %2359, float %2360, float %2361, float %2362, float %2363, float %2364, float %2365, float %2366, float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378) #3, !dbg !84 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !84 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !84 + %2380 = getelementptr i32, ptr addrspace(1) %13, i64 %358, !dbg !136 + %2381 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2380) #3, !dbg !137 + %2382 = shl i32 %2381, 7, !dbg !138 + %2383 = getelementptr i32, ptr addrspace(1) %12, i64 %362, !dbg !139 + %2384 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2383) #3, !dbg !140 + %2385 = or disjoint i32 %2382, %38, !dbg !141 + %2386 = or disjoint i32 %2382, %39, !dbg !141 + %2387 = or disjoint i32 %2382, %40, !dbg !141 + %2388 = or disjoint i32 %2382, %41, !dbg !141 + %2389 = shl i32 %2385, 7, !dbg !142 + %2390 = shl i32 %2386, 7, !dbg !142 + %2391 = shl i32 %2387, 7, !dbg !142 + %2392 = shl i32 %2388, 7, !dbg !142 + %2393 = sext i32 %2389 to i64, !dbg !144 + %2394 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2393, !dbg !144 + %2395 = sext i32 %2390 to i64, !dbg !144 + %2396 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2395, !dbg !144 + %2397 = sext i32 %2391 to i64, !dbg !144 + %2398 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2397, !dbg !144 + %2399 = sext i32 %2392 to i64, !dbg !144 + %2400 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2399, !dbg !144 + %2401 = getelementptr bfloat, ptr addrspace(1) %2394, i64 %121, !dbg !145 + %2402 = getelementptr bfloat, ptr addrspace(1) %2396, i64 %121, !dbg !145 + %2403 = getelementptr bfloat, ptr addrspace(1) %2398, i64 %121, !dbg !145 + %2404 = getelementptr bfloat, ptr addrspace(1) %2400, i64 %121, !dbg !145 + %2405 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2393, !dbg !146 + %2406 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2395, !dbg !146 + %2407 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2397, !dbg !146 + %2408 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2399, !dbg !146 + %2409 = getelementptr bfloat, ptr addrspace(1) %2405, i64 %121, !dbg !147 + %2410 = getelementptr bfloat, ptr addrspace(1) %2406, i64 %121, !dbg !147 + %2411 = getelementptr bfloat, ptr addrspace(1) %2407, i64 %121, !dbg !147 + %2412 = getelementptr bfloat, ptr addrspace(1) %2408, i64 %121, !dbg !147 + %2413 = shl i32 %2384, 1, !dbg !148 + %2414 = icmp sgt i32 %2413, 0, !dbg !149 + %2415 = select i1 %2414, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %403, ptr addrspace(1) %2401, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %406, ptr addrspace(1) %2402, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %408, ptr addrspace(1) %2403, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %410, ptr addrspace(1) %2404, i32 %2415) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %411, ptr addrspace(1) %2409, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %412, ptr addrspace(1) %2410, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %413, ptr addrspace(1) %2411, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %414, ptr addrspace(1) %2412, i32 %2415) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %2416 = icmp sgt i32 %2413, 1, !dbg !149 + %2417 = getelementptr i8, ptr addrspace(1) %2401, i64 16384, !dbg !151 + %2418 = getelementptr i8, ptr addrspace(1) %2402, i64 16384, !dbg !151 + %2419 = getelementptr i8, ptr addrspace(1) %2403, i64 16384, !dbg !151 + %2420 = getelementptr i8, ptr addrspace(1) %2404, i64 16384, !dbg !151 + %2421 = getelementptr i8, ptr addrspace(1) %2409, i64 16384, !dbg !152 + %2422 = getelementptr i8, ptr addrspace(1) %2410, i64 16384, !dbg !152 + %2423 = getelementptr i8, ptr addrspace(1) %2411, i64 16384, !dbg !152 + %2424 = getelementptr i8, ptr addrspace(1) %2412, i64 16384, !dbg !152 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %2425 = select i1 %2416, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %424, ptr addrspace(1) %2417, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %2418, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %427, ptr addrspace(1) %2419, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %428, ptr addrspace(1) %2420, i32 %2425) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %429, ptr addrspace(1) %2421, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %430, ptr addrspace(1) %2422, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %431, ptr addrspace(1) %2423, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %2424, i32 %2425) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !153 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 0, !dbg !149 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 1, !dbg !149 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 2, !dbg !149 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 3, !dbg !149 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 4, !dbg !149 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 5, !dbg !149 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 6, !dbg !149 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 7, !dbg !149 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 8, !dbg !149 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 9, !dbg !149 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 10, !dbg !149 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 11, !dbg !149 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 12, !dbg !149 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 13, !dbg !149 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 14, !dbg !149 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 15, !dbg !149 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 16, !dbg !149 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 17, !dbg !149 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 18, !dbg !149 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 19, !dbg !149 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 20, !dbg !149 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 21, !dbg !149 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 22, !dbg !149 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 23, !dbg !149 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 24, !dbg !149 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 25, !dbg !149 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 26, !dbg !149 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 27, !dbg !149 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 28, !dbg !149 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 29, !dbg !149 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 30, !dbg !149 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 31, !dbg !149 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 32, !dbg !149 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 33, !dbg !149 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 34, !dbg !149 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 35, !dbg !149 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 36, !dbg !149 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 37, !dbg !149 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 38, !dbg !149 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 39, !dbg !149 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 40, !dbg !149 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 41, !dbg !149 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 42, !dbg !149 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 43, !dbg !149 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 44, !dbg !149 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 45, !dbg !149 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 46, !dbg !149 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 47, !dbg !149 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 48, !dbg !149 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 49, !dbg !149 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 50, !dbg !149 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 51, !dbg !149 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 52, !dbg !149 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 53, !dbg !149 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 54, !dbg !149 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 55, !dbg !149 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 56, !dbg !149 + %2483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 57, !dbg !149 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 58, !dbg !149 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 59, !dbg !149 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 60, !dbg !149 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 61, !dbg !149 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 62, !dbg !149 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 63, !dbg !149 + br i1 %2414, label %.lr.ph1858, label %._crit_edge1859, !dbg !149 + +.lr.ph1858: ; preds = %._crit_edge1847 + %2490 = tail call i32 @llvm.umin.i32(i32 %2413, i32 32), !dbg !154 + %2491 = add nsw i32 %2490, -2 + %2492 = add nsw i32 %2490, -1 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 0, !dbg !149 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 1, !dbg !149 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 2, !dbg !149 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 3, !dbg !149 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 4, !dbg !149 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 5, !dbg !149 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 6, !dbg !149 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 7, !dbg !149 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 8, !dbg !149 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 9, !dbg !149 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 10, !dbg !149 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 11, !dbg !149 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 12, !dbg !149 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 13, !dbg !149 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 14, !dbg !149 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 15, !dbg !149 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 16, !dbg !149 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 17, !dbg !149 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 18, !dbg !149 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 19, !dbg !149 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 20, !dbg !149 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 21, !dbg !149 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 22, !dbg !149 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 23, !dbg !149 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 24, !dbg !149 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 25, !dbg !149 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 26, !dbg !149 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 27, !dbg !149 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 28, !dbg !149 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 29, !dbg !149 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 30, !dbg !149 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 31, !dbg !149 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 32, !dbg !149 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 33, !dbg !149 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 34, !dbg !149 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 35, !dbg !149 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 36, !dbg !149 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 37, !dbg !149 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 38, !dbg !149 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 39, !dbg !149 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 40, !dbg !149 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 41, !dbg !149 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 42, !dbg !149 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 43, !dbg !149 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 44, !dbg !149 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 45, !dbg !149 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 46, !dbg !149 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 47, !dbg !149 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 48, !dbg !149 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 49, !dbg !149 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 50, !dbg !149 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 51, !dbg !149 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 52, !dbg !149 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 53, !dbg !149 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 54, !dbg !149 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 55, !dbg !149 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 56, !dbg !149 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 57, !dbg !149 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 58, !dbg !149 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 59, !dbg !149 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 60, !dbg !149 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 61, !dbg !149 + %2555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 62, !dbg !149 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 63, !dbg !149 + %2557 = insertelement <2 x float> poison, float %347, i64 0, !dbg !155 + %2558 = shufflevector <2 x float> %2557, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !155 + %2559 = insertelement <2 x float> poison, float %345, i64 0, !dbg !155 + %2560 = shufflevector <2 x float> %2559, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !155 + br label %2561, !dbg !149 + +2561: ; preds = %.lr.ph1858, %__nv_exp2f.exit1411 + %2562 = phi i32 [ -1, %.lr.ph1858 ], [ %2569, %__nv_exp2f.exit1411 ] + %2563 = phi i32 [ 1, %.lr.ph1858 ], [ %4028, %__nv_exp2f.exit1411 ] + %.pn10781856 = phi ptr addrspace(1) [ %2424, %.lr.ph1858 ], [ %4025, %__nv_exp2f.exit1411 ] + %.pn10941855 = phi ptr addrspace(1) [ %2423, %.lr.ph1858 ], [ %4024, %__nv_exp2f.exit1411 ] + %.pn11101854 = phi ptr addrspace(1) [ %2422, %.lr.ph1858 ], [ %4023, %__nv_exp2f.exit1411 ] + %.pn11261853 = phi ptr addrspace(1) [ %2421, %.lr.ph1858 ], [ %4022, %__nv_exp2f.exit1411 ] + %.pn10141852 = phi ptr addrspace(1) [ %2420, %.lr.ph1858 ], [ %4021, %__nv_exp2f.exit1411 ] + %.pn10301851 = phi ptr addrspace(1) [ %2419, %.lr.ph1858 ], [ %4020, %__nv_exp2f.exit1411 ] + %.pn10461850 = phi ptr addrspace(1) [ %2418, %.lr.ph1858 ], [ %4019, %__nv_exp2f.exit1411 ] + %.pn10621849 = phi ptr addrspace(1) [ %2417, %.lr.ph1858 ], [ %4018, %__nv_exp2f.exit1411 ] + %.pn = phi float [ %2493, %.lr.ph1858 ], [ %3932, %__nv_exp2f.exit1411 ] + %.pn2389 = phi float [ %2494, %.lr.ph1858 ], [ %3933, %__nv_exp2f.exit1411 ] + %.pn2390 = phi float [ %2495, %.lr.ph1858 ], [ %3934, %__nv_exp2f.exit1411 ] + %.pn2391 = phi float [ %2496, %.lr.ph1858 ], [ %3935, %__nv_exp2f.exit1411 ] + %.pn2392 = phi float [ %2497, %.lr.ph1858 ], [ %3936, %__nv_exp2f.exit1411 ] + %.pn2393 = phi float [ %2498, %.lr.ph1858 ], [ %3937, %__nv_exp2f.exit1411 ] + %.pn2394 = phi float [ %2499, %.lr.ph1858 ], [ %3938, %__nv_exp2f.exit1411 ] + %.pn2395 = phi float [ %2500, %.lr.ph1858 ], [ %3939, %__nv_exp2f.exit1411 ] + %.pn2396 = phi float [ %2501, %.lr.ph1858 ], [ %3940, %__nv_exp2f.exit1411 ] + %.pn2397 = phi float [ %2502, %.lr.ph1858 ], [ %3941, %__nv_exp2f.exit1411 ] + %.pn2398 = phi float [ %2503, %.lr.ph1858 ], [ %3942, %__nv_exp2f.exit1411 ] + %.pn2399 = phi float [ %2504, %.lr.ph1858 ], [ %3943, %__nv_exp2f.exit1411 ] + %.pn2400 = phi float [ %2505, %.lr.ph1858 ], [ %3944, %__nv_exp2f.exit1411 ] + %.pn2401 = phi float [ %2506, %.lr.ph1858 ], [ %3945, %__nv_exp2f.exit1411 ] + %.pn2402 = phi float [ %2507, %.lr.ph1858 ], [ %3946, %__nv_exp2f.exit1411 ] + %.pn2403 = phi float [ %2508, %.lr.ph1858 ], [ %3947, %__nv_exp2f.exit1411 ] + %.pn2404 = phi float [ %2509, %.lr.ph1858 ], [ %3948, %__nv_exp2f.exit1411 ] + %.pn2405 = phi float [ %2510, %.lr.ph1858 ], [ %3949, %__nv_exp2f.exit1411 ] + %.pn2406 = phi float [ %2511, %.lr.ph1858 ], [ %3950, %__nv_exp2f.exit1411 ] + %.pn2407 = phi float [ %2512, %.lr.ph1858 ], [ %3951, %__nv_exp2f.exit1411 ] + %.pn2408 = phi float [ %2513, %.lr.ph1858 ], [ %3952, %__nv_exp2f.exit1411 ] + %.pn2409 = phi float [ %2514, %.lr.ph1858 ], [ %3953, %__nv_exp2f.exit1411 ] + %.pn2410 = phi float [ %2515, %.lr.ph1858 ], [ %3954, %__nv_exp2f.exit1411 ] + %.pn2411 = phi float [ %2516, %.lr.ph1858 ], [ %3955, %__nv_exp2f.exit1411 ] + %.pn2412 = phi float [ %2517, %.lr.ph1858 ], [ %3956, %__nv_exp2f.exit1411 ] + %.pn2413 = phi float [ %2518, %.lr.ph1858 ], [ %3957, %__nv_exp2f.exit1411 ] + %.pn2414 = phi float [ %2519, %.lr.ph1858 ], [ %3958, %__nv_exp2f.exit1411 ] + %.pn2415 = phi float [ %2520, %.lr.ph1858 ], [ %3959, %__nv_exp2f.exit1411 ] + %.pn2416 = phi float [ %2521, %.lr.ph1858 ], [ %3960, %__nv_exp2f.exit1411 ] + %.pn2417 = phi float [ %2522, %.lr.ph1858 ], [ %3961, %__nv_exp2f.exit1411 ] + %.pn2418 = phi float [ %2523, %.lr.ph1858 ], [ %3962, %__nv_exp2f.exit1411 ] + %.pn2419 = phi float [ %2524, %.lr.ph1858 ], [ %3963, %__nv_exp2f.exit1411 ] + %.pn2420 = phi float [ %2525, %.lr.ph1858 ], [ %3964, %__nv_exp2f.exit1411 ] + %.pn2421 = phi float [ %2526, %.lr.ph1858 ], [ %3965, %__nv_exp2f.exit1411 ] + %.pn2422 = phi float [ %2527, %.lr.ph1858 ], [ %3966, %__nv_exp2f.exit1411 ] + %.pn2423 = phi float [ %2528, %.lr.ph1858 ], [ %3967, %__nv_exp2f.exit1411 ] + %.pn2424 = phi float [ %2529, %.lr.ph1858 ], [ %3968, %__nv_exp2f.exit1411 ] + %.pn2425 = phi float [ %2530, %.lr.ph1858 ], [ %3969, %__nv_exp2f.exit1411 ] + %.pn2426 = phi float [ %2531, %.lr.ph1858 ], [ %3970, %__nv_exp2f.exit1411 ] + %.pn2427 = phi float [ %2532, %.lr.ph1858 ], [ %3971, %__nv_exp2f.exit1411 ] + %.pn2428 = phi float [ %2533, %.lr.ph1858 ], [ %3972, %__nv_exp2f.exit1411 ] + %.pn2429 = phi float [ %2534, %.lr.ph1858 ], [ %3973, %__nv_exp2f.exit1411 ] + %.pn2430 = phi float [ %2535, %.lr.ph1858 ], [ %3974, %__nv_exp2f.exit1411 ] + %.pn2431 = phi float [ %2536, %.lr.ph1858 ], [ %3975, %__nv_exp2f.exit1411 ] + %.pn2432 = phi float [ %2537, %.lr.ph1858 ], [ %3976, %__nv_exp2f.exit1411 ] + %.pn2433 = phi float [ %2538, %.lr.ph1858 ], [ %3977, %__nv_exp2f.exit1411 ] + %.pn2434 = phi float [ %2539, %.lr.ph1858 ], [ %3978, %__nv_exp2f.exit1411 ] + %.pn2435 = phi float [ %2540, %.lr.ph1858 ], [ %3979, %__nv_exp2f.exit1411 ] + %.pn2436 = phi float [ %2541, %.lr.ph1858 ], [ %3980, %__nv_exp2f.exit1411 ] + %.pn2437 = phi float [ %2542, %.lr.ph1858 ], [ %3981, %__nv_exp2f.exit1411 ] + %.pn2438 = phi float [ %2543, %.lr.ph1858 ], [ %3982, %__nv_exp2f.exit1411 ] + %.pn2439 = phi float [ %2544, %.lr.ph1858 ], [ %3983, %__nv_exp2f.exit1411 ] + %.pn2440 = phi float [ %2545, %.lr.ph1858 ], [ %3984, %__nv_exp2f.exit1411 ] + %.pn2441 = phi float [ %2546, %.lr.ph1858 ], [ %3985, %__nv_exp2f.exit1411 ] + %.pn2442 = phi float [ %2547, %.lr.ph1858 ], [ %3986, %__nv_exp2f.exit1411 ] + %.pn2443 = phi float [ %2548, %.lr.ph1858 ], [ %3987, %__nv_exp2f.exit1411 ] + %.pn2444 = phi float [ %2549, %.lr.ph1858 ], [ %3988, %__nv_exp2f.exit1411 ] + %.pn2445 = phi float [ %2550, %.lr.ph1858 ], [ %3989, %__nv_exp2f.exit1411 ] + %.pn2446 = phi float [ %2551, %.lr.ph1858 ], [ %3990, %__nv_exp2f.exit1411 ] + %.pn2447 = phi float [ %2552, %.lr.ph1858 ], [ %3991, %__nv_exp2f.exit1411 ] + %.pn2448 = phi float [ %2553, %.lr.ph1858 ], [ %3992, %__nv_exp2f.exit1411 ] + %.pn2449 = phi float [ %2554, %.lr.ph1858 ], [ %3993, %__nv_exp2f.exit1411 ] + %.pn2450 = phi float [ %2555, %.lr.ph1858 ], [ %3994, %__nv_exp2f.exit1411 ] + %.pn2451 = phi float [ %2556, %.lr.ph1858 ], [ %3995, %__nv_exp2f.exit1411 ] + %2564 = phi i32 [ 0, %.lr.ph1858 ], [ %3996, %__nv_exp2f.exit1411 ] + %2565 = icmp slt i32 %2564, %2491, !dbg !149 + %2566 = icmp slt i32 %2564, %2492, !dbg !149 + %2567 = add i32 %2562, 1, !dbg !149 + %2568 = icmp sgt i32 %2567, 2, !dbg !149 + %2569 = select i1 %2568, i32 0, i32 %2567, !dbg !149 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !150 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %2570 = shl i32 %2569, 13, !dbg !150 + %2571 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2570, !dbg !150 + %2572 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !153 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !153 + %2573 = shl i32 %2572, 11, !dbg !153 + %2574 = and i32 %2573, 8192, !dbg !153 + %2575 = add i32 %2574, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2576 = lshr exact i32 %2575, 4, !dbg !153 + %2577 = and i32 %2576, 16383, !dbg !153 + %2578 = zext nneg i32 %2577 to i64, !dbg !153 + %2579 = or disjoint i64 %2578, 4611686293372403712, !dbg !153 + %2580 = ptrtoint ptr addrspace(3) %2571 to i32, !dbg !153 + %2581 = lshr exact i32 %2580, 4, !dbg !153 + %2582 = and i32 %2581, 16383, !dbg !153 + %2583 = zext nneg i32 %2582 to i64, !dbg !153 + %2584 = or disjoint i64 %2583, 4611686293338849280, !dbg !153 + %2585 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2579, i64 %2584) #3, !dbg !153 + %2586 = or disjoint i32 %2574, 32, !dbg !153 + %2587 = add i32 %2586, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2588 = lshr exact i32 %2587, 4, !dbg !153 + %2589 = and i32 %2588, 16383, !dbg !153 + %2590 = zext nneg i32 %2589 to i64, !dbg !153 + %2591 = or disjoint i64 %2590, 4611686293372403712, !dbg !153 + %2592 = add i32 %2580, 32, !dbg !153 + %2593 = lshr exact i32 %2592, 4, !dbg !153 + %2594 = and i32 %2593, 16383, !dbg !153 + %2595 = zext nneg i32 %2594 to i64, !dbg !153 + %2596 = or disjoint i64 %2595, 4611686293338849280, !dbg !153 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 0, !dbg !153 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 1, !dbg !153 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 2, !dbg !153 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 3, !dbg !153 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 4, !dbg !153 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 5, !dbg !153 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 6, !dbg !153 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 7, !dbg !153 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 8, !dbg !153 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 9, !dbg !153 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 10, !dbg !153 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 11, !dbg !153 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 12, !dbg !153 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 13, !dbg !153 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 14, !dbg !153 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 15, !dbg !153 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 16, !dbg !153 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 17, !dbg !153 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 18, !dbg !153 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 19, !dbg !153 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 20, !dbg !153 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 21, !dbg !153 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 22, !dbg !153 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 23, !dbg !153 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 24, !dbg !153 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 25, !dbg !153 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 26, !dbg !153 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 27, !dbg !153 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 28, !dbg !153 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 29, !dbg !153 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 30, !dbg !153 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 31, !dbg !153 + %2629 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2597, float %2598, float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, i64 %2591, i64 %2596, i1 true) #3, !dbg !153 + %2630 = or disjoint i32 %2574, 64, !dbg !153 + %2631 = add i32 %2630, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2632 = lshr exact i32 %2631, 4, !dbg !153 + %2633 = and i32 %2632, 16383, !dbg !153 + %2634 = zext nneg i32 %2633 to i64, !dbg !153 + %2635 = or disjoint i64 %2634, 4611686293372403712, !dbg !153 + %2636 = add i32 %2580, 64, !dbg !153 + %2637 = lshr exact i32 %2636, 4, !dbg !153 + %2638 = and i32 %2637, 16383, !dbg !153 + %2639 = zext nneg i32 %2638 to i64, !dbg !153 + %2640 = or disjoint i64 %2639, 4611686293338849280, !dbg !153 + %2641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 0, !dbg !153 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 1, !dbg !153 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 2, !dbg !153 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 3, !dbg !153 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 4, !dbg !153 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 5, !dbg !153 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 6, !dbg !153 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 7, !dbg !153 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 8, !dbg !153 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 9, !dbg !153 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 10, !dbg !153 + %2652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 11, !dbg !153 + %2653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 12, !dbg !153 + %2654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 13, !dbg !153 + %2655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 14, !dbg !153 + %2656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 15, !dbg !153 + %2657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 16, !dbg !153 + %2658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 17, !dbg !153 + %2659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 18, !dbg !153 + %2660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 19, !dbg !153 + %2661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 20, !dbg !153 + %2662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 21, !dbg !153 + %2663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 22, !dbg !153 + %2664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 23, !dbg !153 + %2665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 24, !dbg !153 + %2666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 25, !dbg !153 + %2667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 26, !dbg !153 + %2668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 27, !dbg !153 + %2669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 28, !dbg !153 + %2670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 29, !dbg !153 + %2671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 30, !dbg !153 + %2672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 31, !dbg !153 + %2673 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2641, float %2642, float %2643, float %2644, float %2645, float %2646, float %2647, float %2648, float %2649, float %2650, float %2651, float %2652, float %2653, float %2654, float %2655, float %2656, float %2657, float %2658, float %2659, float %2660, float %2661, float %2662, float %2663, float %2664, float %2665, float %2666, float %2667, float %2668, float %2669, float %2670, float %2671, float %2672, i64 %2635, i64 %2640, i1 true) #3, !dbg !153 + %2674 = or disjoint i32 %2574, 96, !dbg !153 + %2675 = add i32 %2674, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2676 = lshr exact i32 %2675, 4, !dbg !153 + %2677 = and i32 %2676, 16383, !dbg !153 + %2678 = zext nneg i32 %2677 to i64, !dbg !153 + %2679 = or disjoint i64 %2678, 4611686293372403712, !dbg !153 + %2680 = add i32 %2580, 96, !dbg !153 + %2681 = lshr exact i32 %2680, 4, !dbg !153 + %2682 = and i32 %2681, 16383, !dbg !153 + %2683 = zext nneg i32 %2682 to i64, !dbg !153 + %2684 = or disjoint i64 %2683, 4611686293338849280, !dbg !153 + %2685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 0, !dbg !153 + %2686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 1, !dbg !153 + %2687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 2, !dbg !153 + %2688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 3, !dbg !153 + %2689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 4, !dbg !153 + %2690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 5, !dbg !153 + %2691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 6, !dbg !153 + %2692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 7, !dbg !153 + %2693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 8, !dbg !153 + %2694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 9, !dbg !153 + %2695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 10, !dbg !153 + %2696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 11, !dbg !153 + %2697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 12, !dbg !153 + %2698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 13, !dbg !153 + %2699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 14, !dbg !153 + %2700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 15, !dbg !153 + %2701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 16, !dbg !153 + %2702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 17, !dbg !153 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 18, !dbg !153 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 19, !dbg !153 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 20, !dbg !153 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 21, !dbg !153 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 22, !dbg !153 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 23, !dbg !153 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 24, !dbg !153 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 25, !dbg !153 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 26, !dbg !153 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 27, !dbg !153 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 28, !dbg !153 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 29, !dbg !153 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 30, !dbg !153 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 31, !dbg !153 + %2717 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2685, float %2686, float %2687, float %2688, float %2689, float %2690, float %2691, float %2692, float %2693, float %2694, float %2695, float %2696, float %2697, float %2698, float %2699, float %2700, float %2701, float %2702, float %2703, float %2704, float %2705, float %2706, float %2707, float %2708, float %2709, float %2710, float %2711, float %2712, float %2713, float %2714, float %2715, float %2716, i64 %2679, i64 %2684, i1 true) #3, !dbg !153 + %2718 = or disjoint i32 %2574, 16384, !dbg !153 + %2719 = add i32 %2718, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2720 = lshr exact i32 %2719, 4, !dbg !153 + %2721 = and i32 %2720, 16383, !dbg !153 + %2722 = zext nneg i32 %2721 to i64, !dbg !153 + %2723 = or disjoint i64 %2722, 4611686293372403712, !dbg !153 + %2724 = add i32 %2580, 8192, !dbg !153 + %2725 = lshr exact i32 %2724, 4, !dbg !153 + %2726 = and i32 %2725, 16383, !dbg !153 + %2727 = zext nneg i32 %2726 to i64, !dbg !153 + %2728 = or disjoint i64 %2727, 4611686293338849280, !dbg !153 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 0, !dbg !153 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 1, !dbg !153 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 2, !dbg !153 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 3, !dbg !153 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 4, !dbg !153 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 5, !dbg !153 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 6, !dbg !153 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 7, !dbg !153 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 8, !dbg !153 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 9, !dbg !153 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 10, !dbg !153 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 11, !dbg !153 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 12, !dbg !153 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 13, !dbg !153 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 14, !dbg !153 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 15, !dbg !153 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 16, !dbg !153 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 17, !dbg !153 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 18, !dbg !153 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 19, !dbg !153 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 20, !dbg !153 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 21, !dbg !153 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 22, !dbg !153 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 23, !dbg !153 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 24, !dbg !153 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 25, !dbg !153 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 26, !dbg !153 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 27, !dbg !153 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 28, !dbg !153 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 29, !dbg !153 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 30, !dbg !153 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 31, !dbg !153 + %2761 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2729, float %2730, float %2731, float %2732, float %2733, float %2734, float %2735, float %2736, float %2737, float %2738, float %2739, float %2740, float %2741, float %2742, float %2743, float %2744, float %2745, float %2746, float %2747, float %2748, float %2749, float %2750, float %2751, float %2752, float %2753, float %2754, float %2755, float %2756, float %2757, float %2758, float %2759, float %2760, i64 %2723, i64 %2728, i1 true) #3, !dbg !153 + %2762 = or disjoint i32 %2574, 16416, !dbg !153 + %2763 = add i32 %2762, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2764 = lshr exact i32 %2763, 4, !dbg !153 + %2765 = and i32 %2764, 16383, !dbg !153 + %2766 = zext nneg i32 %2765 to i64, !dbg !153 + %2767 = or disjoint i64 %2766, 4611686293372403712, !dbg !153 + %2768 = add i32 %2580, 8224, !dbg !153 + %2769 = lshr exact i32 %2768, 4, !dbg !153 + %2770 = and i32 %2769, 16383, !dbg !153 + %2771 = zext nneg i32 %2770 to i64, !dbg !153 + %2772 = or disjoint i64 %2771, 4611686293338849280, !dbg !153 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 0, !dbg !153 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 1, !dbg !153 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 2, !dbg !153 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 3, !dbg !153 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 4, !dbg !153 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 5, !dbg !153 + %2779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 6, !dbg !153 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 7, !dbg !153 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 8, !dbg !153 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 9, !dbg !153 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 10, !dbg !153 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 11, !dbg !153 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 12, !dbg !153 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 13, !dbg !153 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 14, !dbg !153 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 15, !dbg !153 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 16, !dbg !153 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 17, !dbg !153 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 18, !dbg !153 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 19, !dbg !153 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 20, !dbg !153 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 21, !dbg !153 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 22, !dbg !153 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 23, !dbg !153 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 24, !dbg !153 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 25, !dbg !153 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 26, !dbg !153 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 27, !dbg !153 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 28, !dbg !153 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 29, !dbg !153 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 30, !dbg !153 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 31, !dbg !153 + %2805 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2773, float %2774, float %2775, float %2776, float %2777, float %2778, float %2779, float %2780, float %2781, float %2782, float %2783, float %2784, float %2785, float %2786, float %2787, float %2788, float %2789, float %2790, float %2791, float %2792, float %2793, float %2794, float %2795, float %2796, float %2797, float %2798, float %2799, float %2800, float %2801, float %2802, float %2803, float %2804, i64 %2767, i64 %2772, i1 true) #3, !dbg !153 + %2806 = or disjoint i32 %2574, 16448, !dbg !153 + %2807 = add i32 %2806, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2808 = lshr exact i32 %2807, 4, !dbg !153 + %2809 = and i32 %2808, 16383, !dbg !153 + %2810 = zext nneg i32 %2809 to i64, !dbg !153 + %2811 = or disjoint i64 %2810, 4611686293372403712, !dbg !153 + %2812 = add i32 %2580, 8256, !dbg !153 + %2813 = lshr exact i32 %2812, 4, !dbg !153 + %2814 = and i32 %2813, 16383, !dbg !153 + %2815 = zext nneg i32 %2814 to i64, !dbg !153 + %2816 = or disjoint i64 %2815, 4611686293338849280, !dbg !153 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 0, !dbg !153 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 1, !dbg !153 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 2, !dbg !153 + %2820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 3, !dbg !153 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 4, !dbg !153 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 5, !dbg !153 + %2823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 6, !dbg !153 + %2824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 7, !dbg !153 + %2825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 8, !dbg !153 + %2826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 9, !dbg !153 + %2827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 10, !dbg !153 + %2828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 11, !dbg !153 + %2829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 12, !dbg !153 + %2830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 13, !dbg !153 + %2831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 14, !dbg !153 + %2832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 15, !dbg !153 + %2833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 16, !dbg !153 + %2834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 17, !dbg !153 + %2835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 18, !dbg !153 + %2836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 19, !dbg !153 + %2837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 20, !dbg !153 + %2838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 21, !dbg !153 + %2839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 22, !dbg !153 + %2840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 23, !dbg !153 + %2841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 24, !dbg !153 + %2842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 25, !dbg !153 + %2843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 26, !dbg !153 + %2844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 27, !dbg !153 + %2845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 28, !dbg !153 + %2846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 29, !dbg !153 + %2847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 30, !dbg !153 + %2848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 31, !dbg !153 + %2849 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2817, float %2818, float %2819, float %2820, float %2821, float %2822, float %2823, float %2824, float %2825, float %2826, float %2827, float %2828, float %2829, float %2830, float %2831, float %2832, float %2833, float %2834, float %2835, float %2836, float %2837, float %2838, float %2839, float %2840, float %2841, float %2842, float %2843, float %2844, float %2845, float %2846, float %2847, float %2848, i64 %2811, i64 %2816, i1 true) #3, !dbg !153 + %2850 = or disjoint i32 %2574, 16480, !dbg !153 + %2851 = add i32 %2850, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2852 = lshr exact i32 %2851, 4, !dbg !153 + %2853 = and i32 %2852, 16383, !dbg !153 + %2854 = zext nneg i32 %2853 to i64, !dbg !153 + %2855 = or disjoint i64 %2854, 4611686293372403712, !dbg !153 + %2856 = add i32 %2580, 8288, !dbg !153 + %2857 = lshr exact i32 %2856, 4, !dbg !153 + %2858 = and i32 %2857, 16383, !dbg !153 + %2859 = zext nneg i32 %2858 to i64, !dbg !153 + %2860 = or disjoint i64 %2859, 4611686293338849280, !dbg !153 + %2861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 0, !dbg !153 + %2862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 1, !dbg !153 + %2863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 2, !dbg !153 + %2864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 3, !dbg !153 + %2865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 4, !dbg !153 + %2866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 5, !dbg !153 + %2867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 6, !dbg !153 + %2868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 7, !dbg !153 + %2869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 8, !dbg !153 + %2870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 9, !dbg !153 + %2871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 10, !dbg !153 + %2872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 11, !dbg !153 + %2873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 12, !dbg !153 + %2874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 13, !dbg !153 + %2875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 14, !dbg !153 + %2876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 15, !dbg !153 + %2877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 16, !dbg !153 + %2878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 17, !dbg !153 + %2879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 18, !dbg !153 + %2880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 19, !dbg !153 + %2881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 20, !dbg !153 + %2882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 21, !dbg !153 + %2883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 22, !dbg !153 + %2884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 23, !dbg !153 + %2885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 24, !dbg !153 + %2886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 25, !dbg !153 + %2887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 26, !dbg !153 + %2888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 27, !dbg !153 + %2889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 28, !dbg !153 + %2890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 29, !dbg !153 + %2891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 30, !dbg !153 + %2892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 31, !dbg !153 + %2893 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2861, float %2862, float %2863, float %2864, float %2865, float %2866, float %2867, float %2868, float %2869, float %2870, float %2871, float %2872, float %2873, float %2874, float %2875, float %2876, float %2877, float %2878, float %2879, float %2880, float %2881, float %2882, float %2883, float %2884, float %2885, float %2886, float %2887, float %2888, float %2889, float %2890, float %2891, float %2892, i64 %2855, i64 %2860, i1 true) #3, !dbg !153 + %2894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 0, !dbg !153 + %2895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 1, !dbg !153 + %2896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 2, !dbg !153 + %2897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 3, !dbg !153 + %2898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 4, !dbg !153 + %2899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 5, !dbg !153 + %2900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 6, !dbg !153 + %2901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 7, !dbg !153 + %2902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 8, !dbg !153 + %2903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 9, !dbg !153 + %2904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 10, !dbg !153 + %2905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 11, !dbg !153 + %2906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 12, !dbg !153 + %2907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 13, !dbg !153 + %2908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 14, !dbg !153 + %2909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 15, !dbg !153 + %2910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 16, !dbg !153 + %2911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 17, !dbg !153 + %2912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 18, !dbg !153 + %2913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 19, !dbg !153 + %2914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 20, !dbg !153 + %2915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 21, !dbg !153 + %2916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 22, !dbg !153 + %2917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 23, !dbg !153 + %2918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 24, !dbg !153 + %2919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 25, !dbg !153 + %2920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 26, !dbg !153 + %2921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 27, !dbg !153 + %2922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 28, !dbg !153 + %2923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 29, !dbg !153 + %2924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 30, !dbg !153 + %2925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 31, !dbg !153 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !153 + %2926 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %2894, float %2895, float %2896, float %2897, float %2898, float %2899, float %2900, float %2901, float %2902, float %2903, float %2904, float %2905, float %2906, float %2907, float %2908, float %2909, float %2910, float %2911, float %2912, float %2913, float %2914, float %2915, float %2916, float %2917, float %2918, float %2919, float %2920, float %2921, float %2922, float %2923, float %2924, float %2925, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2571, i32 0, i32 0) #3, !dbg !153 + %2927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 0, !dbg !153 + %2928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 1, !dbg !153 + %2929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 2, !dbg !153 + %2930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 3, !dbg !153 + %2931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 4, !dbg !153 + %2932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 5, !dbg !153 + %2933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 6, !dbg !153 + %2934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 7, !dbg !153 + %2935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 8, !dbg !153 + %2936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 9, !dbg !153 + %2937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 10, !dbg !153 + %2938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 11, !dbg !153 + %2939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 12, !dbg !153 + %2940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 13, !dbg !153 + %2941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 14, !dbg !153 + %2942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 15, !dbg !153 + %2943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 16, !dbg !153 + %2944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 17, !dbg !153 + %2945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 18, !dbg !153 + %2946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 19, !dbg !153 + %2947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 20, !dbg !153 + %2948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 21, !dbg !153 + %2949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 22, !dbg !153 + %2950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 23, !dbg !153 + %2951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 24, !dbg !153 + %2952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 25, !dbg !153 + %2953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 26, !dbg !153 + %2954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 27, !dbg !153 + %2955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 28, !dbg !153 + %2956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 29, !dbg !153 + %2957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 30, !dbg !153 + %2958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 31, !dbg !153 + %2959 = fmul float %2927, 0x3FB6A09E60000000, !dbg !156 + %2960 = fmul float %2928, 0x3FB6A09E60000000, !dbg !156 + %2961 = fmul float %2929, 0x3FB6A09E60000000, !dbg !156 + %2962 = fmul float %2930, 0x3FB6A09E60000000, !dbg !156 + %2963 = fmul float %2931, 0x3FB6A09E60000000, !dbg !156 + %2964 = fmul float %2932, 0x3FB6A09E60000000, !dbg !156 + %2965 = fmul float %2933, 0x3FB6A09E60000000, !dbg !156 + %2966 = fmul float %2934, 0x3FB6A09E60000000, !dbg !156 + %2967 = fmul float %2935, 0x3FB6A09E60000000, !dbg !156 + %2968 = fmul float %2936, 0x3FB6A09E60000000, !dbg !156 + %2969 = fmul float %2937, 0x3FB6A09E60000000, !dbg !156 + %2970 = fmul float %2938, 0x3FB6A09E60000000, !dbg !156 + %2971 = fmul float %2939, 0x3FB6A09E60000000, !dbg !156 + %2972 = fmul float %2940, 0x3FB6A09E60000000, !dbg !156 + %2973 = fmul float %2941, 0x3FB6A09E60000000, !dbg !156 + %2974 = fmul float %2942, 0x3FB6A09E60000000, !dbg !156 + %2975 = fmul float %2943, 0x3FB6A09E60000000, !dbg !156 + %2976 = fmul float %2944, 0x3FB6A09E60000000, !dbg !156 + %2977 = fmul float %2945, 0x3FB6A09E60000000, !dbg !156 + %2978 = fmul float %2946, 0x3FB6A09E60000000, !dbg !156 + %2979 = fmul float %2947, 0x3FB6A09E60000000, !dbg !156 + %2980 = fmul float %2948, 0x3FB6A09E60000000, !dbg !156 + %2981 = fmul float %2949, 0x3FB6A09E60000000, !dbg !156 + %2982 = fmul float %2950, 0x3FB6A09E60000000, !dbg !156 + %2983 = fmul float %2951, 0x3FB6A09E60000000, !dbg !156 + %2984 = fmul float %2952, 0x3FB6A09E60000000, !dbg !156 + %2985 = fmul float %2953, 0x3FB6A09E60000000, !dbg !156 + %2986 = fmul float %2954, 0x3FB6A09E60000000, !dbg !156 + %2987 = fmul float %2955, 0x3FB6A09E60000000, !dbg !156 + %2988 = fmul float %2956, 0x3FB6A09E60000000, !dbg !156 + %2989 = fmul float %2957, 0x3FB6A09E60000000, !dbg !156 + %2990 = fmul float %2958, 0x3FB6A09E60000000, !dbg !156 + %2991 = fmul float %2959, 0x3FF7154760000000, !dbg !157 + %2992 = fmul float %2960, 0x3FF7154760000000, !dbg !157 + %2993 = fmul float %2961, 0x3FF7154760000000, !dbg !157 + %2994 = fmul float %2962, 0x3FF7154760000000, !dbg !157 + %2995 = fmul float %2963, 0x3FF7154760000000, !dbg !157 + %2996 = fmul float %2964, 0x3FF7154760000000, !dbg !157 + %2997 = fmul float %2965, 0x3FF7154760000000, !dbg !157 + %2998 = fmul float %2966, 0x3FF7154760000000, !dbg !157 + %2999 = fmul float %2967, 0x3FF7154760000000, !dbg !157 + %3000 = fmul float %2968, 0x3FF7154760000000, !dbg !157 + %3001 = fmul float %2969, 0x3FF7154760000000, !dbg !157 + %3002 = fmul float %2970, 0x3FF7154760000000, !dbg !157 + %3003 = fmul float %2971, 0x3FF7154760000000, !dbg !157 + %3004 = fmul float %2972, 0x3FF7154760000000, !dbg !157 + %3005 = fmul float %2973, 0x3FF7154760000000, !dbg !157 + %3006 = fmul float %2974, 0x3FF7154760000000, !dbg !157 + %3007 = fmul float %2975, 0x3FF7154760000000, !dbg !157 + %3008 = fmul float %2976, 0x3FF7154760000000, !dbg !157 + %3009 = fmul float %2977, 0x3FF7154760000000, !dbg !157 + %3010 = fmul float %2978, 0x3FF7154760000000, !dbg !157 + %3011 = fmul float %2979, 0x3FF7154760000000, !dbg !157 + %3012 = fmul float %2980, 0x3FF7154760000000, !dbg !157 + %3013 = fmul float %2981, 0x3FF7154760000000, !dbg !157 + %3014 = fmul float %2982, 0x3FF7154760000000, !dbg !157 + %3015 = fmul float %2983, 0x3FF7154760000000, !dbg !157 + %3016 = fmul float %2984, 0x3FF7154760000000, !dbg !157 + %3017 = fmul float %2985, 0x3FF7154760000000, !dbg !157 + %3018 = fmul float %2986, 0x3FF7154760000000, !dbg !157 + %3019 = fmul float %2987, 0x3FF7154760000000, !dbg !157 + %3020 = fmul float %2988, 0x3FF7154760000000, !dbg !157 + %3021 = fmul float %2989, 0x3FF7154760000000, !dbg !157 + %3022 = fmul float %2990, 0x3FF7154760000000, !dbg !157 + %3023 = fsub float %2991, %356, !dbg !158 + %3024 = fsub float %2992, %356, !dbg !158 + %3025 = fsub float %2993, %357, !dbg !158 + %3026 = fsub float %2994, %357, !dbg !158 + %3027 = fsub float %2995, %356, !dbg !158 + %3028 = fsub float %2996, %356, !dbg !158 + %3029 = fsub float %2997, %357, !dbg !158 + %3030 = fsub float %2998, %357, !dbg !158 + %3031 = fsub float %2999, %356, !dbg !158 + %3032 = fsub float %3000, %356, !dbg !158 + %3033 = fsub float %3001, %357, !dbg !158 + %3034 = fsub float %3002, %357, !dbg !158 + %3035 = fsub float %3003, %356, !dbg !158 + %3036 = fsub float %3004, %356, !dbg !158 + %3037 = fsub float %3005, %357, !dbg !158 + %3038 = fsub float %3006, %357, !dbg !158 + %3039 = fsub float %3007, %356, !dbg !158 + %3040 = fsub float %3008, %356, !dbg !158 + %3041 = fsub float %3009, %357, !dbg !158 + %3042 = fsub float %3010, %357, !dbg !158 + %3043 = fsub float %3011, %356, !dbg !158 + %3044 = fsub float %3012, %356, !dbg !158 + %3045 = fsub float %3013, %357, !dbg !158 + %3046 = fsub float %3014, %357, !dbg !158 + %3047 = fsub float %3015, %356, !dbg !158 + %3048 = fsub float %3016, %356, !dbg !158 + %3049 = fsub float %3017, %357, !dbg !158 + %3050 = fsub float %3018, %357, !dbg !158 + %3051 = fsub float %3019, %356, !dbg !158 + %3052 = fsub float %3020, %356, !dbg !158 + %3053 = fsub float %3021, %357, !dbg !158 + %3054 = fsub float %3022, %357, !dbg !158 + %3055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1316 = icmp eq i32 %3055, 0, !dbg !159 + br i1 %.not.i1316, label %3058, label %3056, !dbg !159 + +3056: ; preds = %2561 + %3057 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3023) #3, !dbg !159 + br label %__nv_exp2f.exit1318, !dbg !159 + +3058: ; preds = %2561 + %3059 = tail call float @llvm.nvvm.ex2.approx.f(float %3023) #3, !dbg !159 + br label %__nv_exp2f.exit1318, !dbg !159 + +__nv_exp2f.exit1318: ; preds = %3056, %3058 + %.0.i1317 = phi float [ %3057, %3056 ], [ %3059, %3058 ], !dbg !159 + %3060 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1319 = icmp eq i32 %3060, 0, !dbg !159 + br i1 %.not.i1319, label %3063, label %3061, !dbg !159 + +3061: ; preds = %__nv_exp2f.exit1318 + %3062 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3024) #3, !dbg !159 + br label %__nv_exp2f.exit1321, !dbg !159 + +3063: ; preds = %__nv_exp2f.exit1318 + %3064 = tail call float @llvm.nvvm.ex2.approx.f(float %3024) #3, !dbg !159 + br label %__nv_exp2f.exit1321, !dbg !159 + +__nv_exp2f.exit1321: ; preds = %3061, %3063 + %.0.i1320 = phi float [ %3062, %3061 ], [ %3064, %3063 ], !dbg !159 + %3065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1322 = icmp eq i32 %3065, 0, !dbg !159 + br i1 %.not.i1322, label %3068, label %3066, !dbg !159 + +3066: ; preds = %__nv_exp2f.exit1321 + %3067 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3025) #3, !dbg !159 + br label %__nv_exp2f.exit1324, !dbg !159 + +3068: ; preds = %__nv_exp2f.exit1321 + %3069 = tail call float @llvm.nvvm.ex2.approx.f(float %3025) #3, !dbg !159 + br label %__nv_exp2f.exit1324, !dbg !159 + +__nv_exp2f.exit1324: ; preds = %3066, %3068 + %.0.i1323 = phi float [ %3067, %3066 ], [ %3069, %3068 ], !dbg !159 + %3070 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1325 = icmp eq i32 %3070, 0, !dbg !159 + br i1 %.not.i1325, label %3073, label %3071, !dbg !159 + +3071: ; preds = %__nv_exp2f.exit1324 + %3072 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3026) #3, !dbg !159 + br label %__nv_exp2f.exit1327, !dbg !159 + +3073: ; preds = %__nv_exp2f.exit1324 + %3074 = tail call float @llvm.nvvm.ex2.approx.f(float %3026) #3, !dbg !159 + br label %__nv_exp2f.exit1327, !dbg !159 + +__nv_exp2f.exit1327: ; preds = %3071, %3073 + %.0.i1326 = phi float [ %3072, %3071 ], [ %3074, %3073 ], !dbg !159 + %3075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1328 = icmp eq i32 %3075, 0, !dbg !159 + br i1 %.not.i1328, label %3078, label %3076, !dbg !159 + +3076: ; preds = %__nv_exp2f.exit1327 + %3077 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3027) #3, !dbg !159 + br label %__nv_exp2f.exit1330, !dbg !159 + +3078: ; preds = %__nv_exp2f.exit1327 + %3079 = tail call float @llvm.nvvm.ex2.approx.f(float %3027) #3, !dbg !159 + br label %__nv_exp2f.exit1330, !dbg !159 + +__nv_exp2f.exit1330: ; preds = %3076, %3078 + %.0.i1329 = phi float [ %3077, %3076 ], [ %3079, %3078 ], !dbg !159 + %3080 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1331 = icmp eq i32 %3080, 0, !dbg !159 + br i1 %.not.i1331, label %3083, label %3081, !dbg !159 + +3081: ; preds = %__nv_exp2f.exit1330 + %3082 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3028) #3, !dbg !159 + br label %__nv_exp2f.exit1333, !dbg !159 + +3083: ; preds = %__nv_exp2f.exit1330 + %3084 = tail call float @llvm.nvvm.ex2.approx.f(float %3028) #3, !dbg !159 + br label %__nv_exp2f.exit1333, !dbg !159 + +__nv_exp2f.exit1333: ; preds = %3081, %3083 + %.0.i1332 = phi float [ %3082, %3081 ], [ %3084, %3083 ], !dbg !159 + %3085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1334 = icmp eq i32 %3085, 0, !dbg !159 + br i1 %.not.i1334, label %3088, label %3086, !dbg !159 + +3086: ; preds = %__nv_exp2f.exit1333 + %3087 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3029) #3, !dbg !159 + br label %__nv_exp2f.exit1336, !dbg !159 + +3088: ; preds = %__nv_exp2f.exit1333 + %3089 = tail call float @llvm.nvvm.ex2.approx.f(float %3029) #3, !dbg !159 + br label %__nv_exp2f.exit1336, !dbg !159 + +__nv_exp2f.exit1336: ; preds = %3086, %3088 + %.0.i1335 = phi float [ %3087, %3086 ], [ %3089, %3088 ], !dbg !159 + %3090 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1337 = icmp eq i32 %3090, 0, !dbg !159 + br i1 %.not.i1337, label %3093, label %3091, !dbg !159 + +3091: ; preds = %__nv_exp2f.exit1336 + %3092 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3030) #3, !dbg !159 + br label %__nv_exp2f.exit1339, !dbg !159 + +3093: ; preds = %__nv_exp2f.exit1336 + %3094 = tail call float @llvm.nvvm.ex2.approx.f(float %3030) #3, !dbg !159 + br label %__nv_exp2f.exit1339, !dbg !159 + +__nv_exp2f.exit1339: ; preds = %3091, %3093 + %.0.i1338 = phi float [ %3092, %3091 ], [ %3094, %3093 ], !dbg !159 + %3095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1340 = icmp eq i32 %3095, 0, !dbg !159 + br i1 %.not.i1340, label %3098, label %3096, !dbg !159 + +3096: ; preds = %__nv_exp2f.exit1339 + %3097 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3031) #3, !dbg !159 + br label %__nv_exp2f.exit1342, !dbg !159 + +3098: ; preds = %__nv_exp2f.exit1339 + %3099 = tail call float @llvm.nvvm.ex2.approx.f(float %3031) #3, !dbg !159 + br label %__nv_exp2f.exit1342, !dbg !159 + +__nv_exp2f.exit1342: ; preds = %3096, %3098 + %.0.i1341 = phi float [ %3097, %3096 ], [ %3099, %3098 ], !dbg !159 + %3100 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1343 = icmp eq i32 %3100, 0, !dbg !159 + br i1 %.not.i1343, label %3103, label %3101, !dbg !159 + +3101: ; preds = %__nv_exp2f.exit1342 + %3102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3032) #3, !dbg !159 + br label %__nv_exp2f.exit1345, !dbg !159 + +3103: ; preds = %__nv_exp2f.exit1342 + %3104 = tail call float @llvm.nvvm.ex2.approx.f(float %3032) #3, !dbg !159 + br label %__nv_exp2f.exit1345, !dbg !159 + +__nv_exp2f.exit1345: ; preds = %3101, %3103 + %.0.i1344 = phi float [ %3102, %3101 ], [ %3104, %3103 ], !dbg !159 + %3105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1346 = icmp eq i32 %3105, 0, !dbg !159 + br i1 %.not.i1346, label %3108, label %3106, !dbg !159 + +3106: ; preds = %__nv_exp2f.exit1345 + %3107 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3033) #3, !dbg !159 + br label %__nv_exp2f.exit1348, !dbg !159 + +3108: ; preds = %__nv_exp2f.exit1345 + %3109 = tail call float @llvm.nvvm.ex2.approx.f(float %3033) #3, !dbg !159 + br label %__nv_exp2f.exit1348, !dbg !159 + +__nv_exp2f.exit1348: ; preds = %3106, %3108 + %.0.i1347 = phi float [ %3107, %3106 ], [ %3109, %3108 ], !dbg !159 + %3110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1349 = icmp eq i32 %3110, 0, !dbg !159 + br i1 %.not.i1349, label %3113, label %3111, !dbg !159 + +3111: ; preds = %__nv_exp2f.exit1348 + %3112 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3034) #3, !dbg !159 + br label %__nv_exp2f.exit1351, !dbg !159 + +3113: ; preds = %__nv_exp2f.exit1348 + %3114 = tail call float @llvm.nvvm.ex2.approx.f(float %3034) #3, !dbg !159 + br label %__nv_exp2f.exit1351, !dbg !159 + +__nv_exp2f.exit1351: ; preds = %3111, %3113 + %.0.i1350 = phi float [ %3112, %3111 ], [ %3114, %3113 ], !dbg !159 + %3115 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1352 = icmp eq i32 %3115, 0, !dbg !159 + br i1 %.not.i1352, label %3118, label %3116, !dbg !159 + +3116: ; preds = %__nv_exp2f.exit1351 + %3117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3035) #3, !dbg !159 + br label %__nv_exp2f.exit1354, !dbg !159 + +3118: ; preds = %__nv_exp2f.exit1351 + %3119 = tail call float @llvm.nvvm.ex2.approx.f(float %3035) #3, !dbg !159 + br label %__nv_exp2f.exit1354, !dbg !159 + +__nv_exp2f.exit1354: ; preds = %3116, %3118 + %.0.i1353 = phi float [ %3117, %3116 ], [ %3119, %3118 ], !dbg !159 + %3120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1355 = icmp eq i32 %3120, 0, !dbg !159 + br i1 %.not.i1355, label %3123, label %3121, !dbg !159 + +3121: ; preds = %__nv_exp2f.exit1354 + %3122 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3036) #3, !dbg !159 + br label %__nv_exp2f.exit1357, !dbg !159 + +3123: ; preds = %__nv_exp2f.exit1354 + %3124 = tail call float @llvm.nvvm.ex2.approx.f(float %3036) #3, !dbg !159 + br label %__nv_exp2f.exit1357, !dbg !159 + +__nv_exp2f.exit1357: ; preds = %3121, %3123 + %.0.i1356 = phi float [ %3122, %3121 ], [ %3124, %3123 ], !dbg !159 + %3125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1358 = icmp eq i32 %3125, 0, !dbg !159 + br i1 %.not.i1358, label %3128, label %3126, !dbg !159 + +3126: ; preds = %__nv_exp2f.exit1357 + %3127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3037) #3, !dbg !159 + br label %__nv_exp2f.exit1360, !dbg !159 + +3128: ; preds = %__nv_exp2f.exit1357 + %3129 = tail call float @llvm.nvvm.ex2.approx.f(float %3037) #3, !dbg !159 + br label %__nv_exp2f.exit1360, !dbg !159 + +__nv_exp2f.exit1360: ; preds = %3126, %3128 + %.0.i1359 = phi float [ %3127, %3126 ], [ %3129, %3128 ], !dbg !159 + %3130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1361 = icmp eq i32 %3130, 0, !dbg !159 + br i1 %.not.i1361, label %3133, label %3131, !dbg !159 + +3131: ; preds = %__nv_exp2f.exit1360 + %3132 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3038) #3, !dbg !159 + br label %__nv_exp2f.exit1363, !dbg !159 + +3133: ; preds = %__nv_exp2f.exit1360 + %3134 = tail call float @llvm.nvvm.ex2.approx.f(float %3038) #3, !dbg !159 + br label %__nv_exp2f.exit1363, !dbg !159 + +__nv_exp2f.exit1363: ; preds = %3131, %3133 + %.0.i1362 = phi float [ %3132, %3131 ], [ %3134, %3133 ], !dbg !159 + %3135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1364 = icmp eq i32 %3135, 0, !dbg !159 + br i1 %.not.i1364, label %3138, label %3136, !dbg !159 + +3136: ; preds = %__nv_exp2f.exit1363 + %3137 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3039) #3, !dbg !159 + br label %__nv_exp2f.exit1366, !dbg !159 + +3138: ; preds = %__nv_exp2f.exit1363 + %3139 = tail call float @llvm.nvvm.ex2.approx.f(float %3039) #3, !dbg !159 + br label %__nv_exp2f.exit1366, !dbg !159 + +__nv_exp2f.exit1366: ; preds = %3136, %3138 + %.0.i1365 = phi float [ %3137, %3136 ], [ %3139, %3138 ], !dbg !159 + %3140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1367 = icmp eq i32 %3140, 0, !dbg !159 + br i1 %.not.i1367, label %3143, label %3141, !dbg !159 + +3141: ; preds = %__nv_exp2f.exit1366 + %3142 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3040) #3, !dbg !159 + br label %__nv_exp2f.exit1369, !dbg !159 + +3143: ; preds = %__nv_exp2f.exit1366 + %3144 = tail call float @llvm.nvvm.ex2.approx.f(float %3040) #3, !dbg !159 + br label %__nv_exp2f.exit1369, !dbg !159 + +__nv_exp2f.exit1369: ; preds = %3141, %3143 + %.0.i1368 = phi float [ %3142, %3141 ], [ %3144, %3143 ], !dbg !159 + %3145 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1370 = icmp eq i32 %3145, 0, !dbg !159 + br i1 %.not.i1370, label %3148, label %3146, !dbg !159 + +3146: ; preds = %__nv_exp2f.exit1369 + %3147 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3041) #3, !dbg !159 + br label %__nv_exp2f.exit1372, !dbg !159 + +3148: ; preds = %__nv_exp2f.exit1369 + %3149 = tail call float @llvm.nvvm.ex2.approx.f(float %3041) #3, !dbg !159 + br label %__nv_exp2f.exit1372, !dbg !159 + +__nv_exp2f.exit1372: ; preds = %3146, %3148 + %.0.i1371 = phi float [ %3147, %3146 ], [ %3149, %3148 ], !dbg !159 + %3150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1373 = icmp eq i32 %3150, 0, !dbg !159 + br i1 %.not.i1373, label %3153, label %3151, !dbg !159 + +3151: ; preds = %__nv_exp2f.exit1372 + %3152 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3042) #3, !dbg !159 + br label %__nv_exp2f.exit1375, !dbg !159 + +3153: ; preds = %__nv_exp2f.exit1372 + %3154 = tail call float @llvm.nvvm.ex2.approx.f(float %3042) #3, !dbg !159 + br label %__nv_exp2f.exit1375, !dbg !159 + +__nv_exp2f.exit1375: ; preds = %3151, %3153 + %.0.i1374 = phi float [ %3152, %3151 ], [ %3154, %3153 ], !dbg !159 + %3155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1376 = icmp eq i32 %3155, 0, !dbg !159 + br i1 %.not.i1376, label %3158, label %3156, !dbg !159 + +3156: ; preds = %__nv_exp2f.exit1375 + %3157 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3043) #3, !dbg !159 + br label %__nv_exp2f.exit1378, !dbg !159 + +3158: ; preds = %__nv_exp2f.exit1375 + %3159 = tail call float @llvm.nvvm.ex2.approx.f(float %3043) #3, !dbg !159 + br label %__nv_exp2f.exit1378, !dbg !159 + +__nv_exp2f.exit1378: ; preds = %3156, %3158 + %.0.i1377 = phi float [ %3157, %3156 ], [ %3159, %3158 ], !dbg !159 + %3160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1379 = icmp eq i32 %3160, 0, !dbg !159 + br i1 %.not.i1379, label %3163, label %3161, !dbg !159 + +3161: ; preds = %__nv_exp2f.exit1378 + %3162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3044) #3, !dbg !159 + br label %__nv_exp2f.exit1381, !dbg !159 + +3163: ; preds = %__nv_exp2f.exit1378 + %3164 = tail call float @llvm.nvvm.ex2.approx.f(float %3044) #3, !dbg !159 + br label %__nv_exp2f.exit1381, !dbg !159 + +__nv_exp2f.exit1381: ; preds = %3161, %3163 + %.0.i1380 = phi float [ %3162, %3161 ], [ %3164, %3163 ], !dbg !159 + %3165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1382 = icmp eq i32 %3165, 0, !dbg !159 + br i1 %.not.i1382, label %3168, label %3166, !dbg !159 + +3166: ; preds = %__nv_exp2f.exit1381 + %3167 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3045) #3, !dbg !159 + br label %__nv_exp2f.exit1384, !dbg !159 + +3168: ; preds = %__nv_exp2f.exit1381 + %3169 = tail call float @llvm.nvvm.ex2.approx.f(float %3045) #3, !dbg !159 + br label %__nv_exp2f.exit1384, !dbg !159 + +__nv_exp2f.exit1384: ; preds = %3166, %3168 + %.0.i1383 = phi float [ %3167, %3166 ], [ %3169, %3168 ], !dbg !159 + %3170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1385 = icmp eq i32 %3170, 0, !dbg !159 + br i1 %.not.i1385, label %3173, label %3171, !dbg !159 + +3171: ; preds = %__nv_exp2f.exit1384 + %3172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3046) #3, !dbg !159 + br label %__nv_exp2f.exit1387, !dbg !159 + +3173: ; preds = %__nv_exp2f.exit1384 + %3174 = tail call float @llvm.nvvm.ex2.approx.f(float %3046) #3, !dbg !159 + br label %__nv_exp2f.exit1387, !dbg !159 + +__nv_exp2f.exit1387: ; preds = %3171, %3173 + %.0.i1386 = phi float [ %3172, %3171 ], [ %3174, %3173 ], !dbg !159 + %3175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1388 = icmp eq i32 %3175, 0, !dbg !159 + br i1 %.not.i1388, label %3178, label %3176, !dbg !159 + +3176: ; preds = %__nv_exp2f.exit1387 + %3177 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3047) #3, !dbg !159 + br label %__nv_exp2f.exit1390, !dbg !159 + +3178: ; preds = %__nv_exp2f.exit1387 + %3179 = tail call float @llvm.nvvm.ex2.approx.f(float %3047) #3, !dbg !159 + br label %__nv_exp2f.exit1390, !dbg !159 + +__nv_exp2f.exit1390: ; preds = %3176, %3178 + %.0.i1389 = phi float [ %3177, %3176 ], [ %3179, %3178 ], !dbg !159 + %3180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1391 = icmp eq i32 %3180, 0, !dbg !159 + br i1 %.not.i1391, label %3183, label %3181, !dbg !159 + +3181: ; preds = %__nv_exp2f.exit1390 + %3182 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3048) #3, !dbg !159 + br label %__nv_exp2f.exit1393, !dbg !159 + +3183: ; preds = %__nv_exp2f.exit1390 + %3184 = tail call float @llvm.nvvm.ex2.approx.f(float %3048) #3, !dbg !159 + br label %__nv_exp2f.exit1393, !dbg !159 + +__nv_exp2f.exit1393: ; preds = %3181, %3183 + %.0.i1392 = phi float [ %3182, %3181 ], [ %3184, %3183 ], !dbg !159 + %3185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1394 = icmp eq i32 %3185, 0, !dbg !159 + br i1 %.not.i1394, label %3188, label %3186, !dbg !159 + +3186: ; preds = %__nv_exp2f.exit1393 + %3187 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3049) #3, !dbg !159 + br label %__nv_exp2f.exit1396, !dbg !159 + +3188: ; preds = %__nv_exp2f.exit1393 + %3189 = tail call float @llvm.nvvm.ex2.approx.f(float %3049) #3, !dbg !159 + br label %__nv_exp2f.exit1396, !dbg !159 + +__nv_exp2f.exit1396: ; preds = %3186, %3188 + %.0.i1395 = phi float [ %3187, %3186 ], [ %3189, %3188 ], !dbg !159 + %3190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1397 = icmp eq i32 %3190, 0, !dbg !159 + br i1 %.not.i1397, label %3193, label %3191, !dbg !159 + +3191: ; preds = %__nv_exp2f.exit1396 + %3192 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3050) #3, !dbg !159 + br label %__nv_exp2f.exit1399, !dbg !159 + +3193: ; preds = %__nv_exp2f.exit1396 + %3194 = tail call float @llvm.nvvm.ex2.approx.f(float %3050) #3, !dbg !159 + br label %__nv_exp2f.exit1399, !dbg !159 + +__nv_exp2f.exit1399: ; preds = %3191, %3193 + %.0.i1398 = phi float [ %3192, %3191 ], [ %3194, %3193 ], !dbg !159 + %3195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1400 = icmp eq i32 %3195, 0, !dbg !159 + br i1 %.not.i1400, label %3198, label %3196, !dbg !159 + +3196: ; preds = %__nv_exp2f.exit1399 + %3197 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3051) #3, !dbg !159 + br label %__nv_exp2f.exit1402, !dbg !159 + +3198: ; preds = %__nv_exp2f.exit1399 + %3199 = tail call float @llvm.nvvm.ex2.approx.f(float %3051) #3, !dbg !159 + br label %__nv_exp2f.exit1402, !dbg !159 + +__nv_exp2f.exit1402: ; preds = %3196, %3198 + %.0.i1401 = phi float [ %3197, %3196 ], [ %3199, %3198 ], !dbg !159 + %3200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1403 = icmp eq i32 %3200, 0, !dbg !159 + br i1 %.not.i1403, label %3203, label %3201, !dbg !159 + +3201: ; preds = %__nv_exp2f.exit1402 + %3202 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3052) #3, !dbg !159 + br label %__nv_exp2f.exit1405, !dbg !159 + +3203: ; preds = %__nv_exp2f.exit1402 + %3204 = tail call float @llvm.nvvm.ex2.approx.f(float %3052) #3, !dbg !159 + br label %__nv_exp2f.exit1405, !dbg !159 + +__nv_exp2f.exit1405: ; preds = %3201, %3203 + %.0.i1404 = phi float [ %3202, %3201 ], [ %3204, %3203 ], !dbg !159 + %3205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1406 = icmp eq i32 %3205, 0, !dbg !159 + br i1 %.not.i1406, label %3208, label %3206, !dbg !159 + +3206: ; preds = %__nv_exp2f.exit1405 + %3207 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3053) #3, !dbg !159 + br label %__nv_exp2f.exit1408, !dbg !159 + +3208: ; preds = %__nv_exp2f.exit1405 + %3209 = tail call float @llvm.nvvm.ex2.approx.f(float %3053) #3, !dbg !159 + br label %__nv_exp2f.exit1408, !dbg !159 + +__nv_exp2f.exit1408: ; preds = %3206, %3208 + %.0.i1407 = phi float [ %3207, %3206 ], [ %3209, %3208 ], !dbg !159 + %3210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1409 = icmp eq i32 %3210, 0, !dbg !159 + br i1 %.not.i1409, label %3213, label %3211, !dbg !159 + +3211: ; preds = %__nv_exp2f.exit1408 + %3212 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3054) #3, !dbg !159 + br label %__nv_exp2f.exit1411, !dbg !159 + +3213: ; preds = %__nv_exp2f.exit1408 + %3214 = tail call float @llvm.nvvm.ex2.approx.f(float %3054) #3, !dbg !159 + br label %__nv_exp2f.exit1411, !dbg !159 + +__nv_exp2f.exit1411: ; preds = %3211, %3213 + %.0.i1410 = phi float [ %3212, %3211 ], [ %3214, %3213 ], !dbg !159 + %3215 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2570, !dbg !150 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !160 + %3216 = add i32 %2574, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3217 = lshr exact i32 %3216, 4, !dbg !160 + %3218 = and i32 %3217, 16383, !dbg !160 + %3219 = zext nneg i32 %3218 to i64, !dbg !160 + %3220 = or disjoint i64 %3219, 4611686293372403712, !dbg !160 + %3221 = ptrtoint ptr addrspace(3) %3215 to i32, !dbg !160 + %3222 = lshr exact i32 %3221, 4, !dbg !160 + %3223 = and i32 %3222, 16383, !dbg !160 + %3224 = zext nneg i32 %3223 to i64, !dbg !160 + %3225 = or disjoint i64 %3224, 4611686293338849280, !dbg !160 + %3226 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %3220, i64 %3225) #3, !dbg !160 + %3227 = add i32 %2586, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3228 = lshr exact i32 %3227, 4, !dbg !160 + %3229 = and i32 %3228, 16383, !dbg !160 + %3230 = zext nneg i32 %3229 to i64, !dbg !160 + %3231 = or disjoint i64 %3230, 4611686293372403712, !dbg !160 + %3232 = add i32 %3221, 32, !dbg !160 + %3233 = lshr exact i32 %3232, 4, !dbg !160 + %3234 = and i32 %3233, 16383, !dbg !160 + %3235 = zext nneg i32 %3234 to i64, !dbg !160 + %3236 = or disjoint i64 %3235, 4611686293338849280, !dbg !160 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 0, !dbg !160 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 1, !dbg !160 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 2, !dbg !160 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 3, !dbg !160 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 4, !dbg !160 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 5, !dbg !160 + %3243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 6, !dbg !160 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 7, !dbg !160 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 8, !dbg !160 + %3246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 9, !dbg !160 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 10, !dbg !160 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 11, !dbg !160 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 12, !dbg !160 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 13, !dbg !160 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 14, !dbg !160 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 15, !dbg !160 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 16, !dbg !160 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 17, !dbg !160 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 18, !dbg !160 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 19, !dbg !160 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 20, !dbg !160 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 21, !dbg !160 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 22, !dbg !160 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 23, !dbg !160 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 24, !dbg !160 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 25, !dbg !160 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 26, !dbg !160 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 27, !dbg !160 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 28, !dbg !160 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 29, !dbg !160 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 30, !dbg !160 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 31, !dbg !160 + %3269 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, i64 %3231, i64 %3236, i1 true) #3, !dbg !160 + %3270 = add i32 %2630, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3271 = lshr exact i32 %3270, 4, !dbg !160 + %3272 = and i32 %3271, 16383, !dbg !160 + %3273 = zext nneg i32 %3272 to i64, !dbg !160 + %3274 = or disjoint i64 %3273, 4611686293372403712, !dbg !160 + %3275 = add i32 %3221, 64, !dbg !160 + %3276 = lshr exact i32 %3275, 4, !dbg !160 + %3277 = and i32 %3276, 16383, !dbg !160 + %3278 = zext nneg i32 %3277 to i64, !dbg !160 + %3279 = or disjoint i64 %3278, 4611686293338849280, !dbg !160 + %3280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 0, !dbg !160 + %3281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 1, !dbg !160 + %3282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 2, !dbg !160 + %3283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 3, !dbg !160 + %3284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 4, !dbg !160 + %3285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 5, !dbg !160 + %3286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 6, !dbg !160 + %3287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 7, !dbg !160 + %3288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 8, !dbg !160 + %3289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 9, !dbg !160 + %3290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 10, !dbg !160 + %3291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 11, !dbg !160 + %3292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 12, !dbg !160 + %3293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 13, !dbg !160 + %3294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 14, !dbg !160 + %3295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 15, !dbg !160 + %3296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 16, !dbg !160 + %3297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 17, !dbg !160 + %3298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 18, !dbg !160 + %3299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 19, !dbg !160 + %3300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 20, !dbg !160 + %3301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 21, !dbg !160 + %3302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 22, !dbg !160 + %3303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 23, !dbg !160 + %3304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 24, !dbg !160 + %3305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 25, !dbg !160 + %3306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 26, !dbg !160 + %3307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 27, !dbg !160 + %3308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 28, !dbg !160 + %3309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 29, !dbg !160 + %3310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 30, !dbg !160 + %3311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 31, !dbg !160 + %3312 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, float %3301, float %3302, float %3303, float %3304, float %3305, float %3306, float %3307, float %3308, float %3309, float %3310, float %3311, i64 %3274, i64 %3279, i1 true) #3, !dbg !160 + %3313 = add i32 %2674, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3314 = lshr exact i32 %3313, 4, !dbg !160 + %3315 = and i32 %3314, 16383, !dbg !160 + %3316 = zext nneg i32 %3315 to i64, !dbg !160 + %3317 = or disjoint i64 %3316, 4611686293372403712, !dbg !160 + %3318 = add i32 %3221, 96, !dbg !160 + %3319 = lshr exact i32 %3318, 4, !dbg !160 + %3320 = and i32 %3319, 16383, !dbg !160 + %3321 = zext nneg i32 %3320 to i64, !dbg !160 + %3322 = or disjoint i64 %3321, 4611686293338849280, !dbg !160 + %3323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 0, !dbg !160 + %3324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 1, !dbg !160 + %3325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 2, !dbg !160 + %3326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 3, !dbg !160 + %3327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 4, !dbg !160 + %3328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 5, !dbg !160 + %3329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 6, !dbg !160 + %3330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 7, !dbg !160 + %3331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 8, !dbg !160 + %3332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 9, !dbg !160 + %3333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 10, !dbg !160 + %3334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 11, !dbg !160 + %3335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 12, !dbg !160 + %3336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 13, !dbg !160 + %3337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 14, !dbg !160 + %3338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 15, !dbg !160 + %3339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 16, !dbg !160 + %3340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 17, !dbg !160 + %3341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 18, !dbg !160 + %3342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 19, !dbg !160 + %3343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 20, !dbg !160 + %3344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 21, !dbg !160 + %3345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 22, !dbg !160 + %3346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 23, !dbg !160 + %3347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 24, !dbg !160 + %3348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 25, !dbg !160 + %3349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 26, !dbg !160 + %3350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 27, !dbg !160 + %3351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 28, !dbg !160 + %3352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 29, !dbg !160 + %3353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 30, !dbg !160 + %3354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 31, !dbg !160 + %3355 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3323, float %3324, float %3325, float %3326, float %3327, float %3328, float %3329, float %3330, float %3331, float %3332, float %3333, float %3334, float %3335, float %3336, float %3337, float %3338, float %3339, float %3340, float %3341, float %3342, float %3343, float %3344, float %3345, float %3346, float %3347, float %3348, float %3349, float %3350, float %3351, float %3352, float %3353, float %3354, i64 %3317, i64 %3322, i1 true) #3, !dbg !160 + %3356 = add i32 %2718, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3357 = lshr exact i32 %3356, 4, !dbg !160 + %3358 = and i32 %3357, 16383, !dbg !160 + %3359 = zext nneg i32 %3358 to i64, !dbg !160 + %3360 = or disjoint i64 %3359, 4611686293372403712, !dbg !160 + %3361 = add i32 %3221, 8192, !dbg !160 + %3362 = lshr exact i32 %3361, 4, !dbg !160 + %3363 = and i32 %3362, 16383, !dbg !160 + %3364 = zext nneg i32 %3363 to i64, !dbg !160 + %3365 = or disjoint i64 %3364, 4611686293338849280, !dbg !160 + %3366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 0, !dbg !160 + %3367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 1, !dbg !160 + %3368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 2, !dbg !160 + %3369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 3, !dbg !160 + %3370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 4, !dbg !160 + %3371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 5, !dbg !160 + %3372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 6, !dbg !160 + %3373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 7, !dbg !160 + %3374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 8, !dbg !160 + %3375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 9, !dbg !160 + %3376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 10, !dbg !160 + %3377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 11, !dbg !160 + %3378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 12, !dbg !160 + %3379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 13, !dbg !160 + %3380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 14, !dbg !160 + %3381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 15, !dbg !160 + %3382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 16, !dbg !160 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 17, !dbg !160 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 18, !dbg !160 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 19, !dbg !160 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 20, !dbg !160 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 21, !dbg !160 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 22, !dbg !160 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 23, !dbg !160 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 24, !dbg !160 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 25, !dbg !160 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 26, !dbg !160 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 27, !dbg !160 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 28, !dbg !160 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 29, !dbg !160 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 30, !dbg !160 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 31, !dbg !160 + %3398 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3366, float %3367, float %3368, float %3369, float %3370, float %3371, float %3372, float %3373, float %3374, float %3375, float %3376, float %3377, float %3378, float %3379, float %3380, float %3381, float %3382, float %3383, float %3384, float %3385, float %3386, float %3387, float %3388, float %3389, float %3390, float %3391, float %3392, float %3393, float %3394, float %3395, float %3396, float %3397, i64 %3360, i64 %3365, i1 true) #3, !dbg !160 + %3399 = add i32 %2762, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3400 = lshr exact i32 %3399, 4, !dbg !160 + %3401 = and i32 %3400, 16383, !dbg !160 + %3402 = zext nneg i32 %3401 to i64, !dbg !160 + %3403 = or disjoint i64 %3402, 4611686293372403712, !dbg !160 + %3404 = add i32 %3221, 8224, !dbg !160 + %3405 = lshr exact i32 %3404, 4, !dbg !160 + %3406 = and i32 %3405, 16383, !dbg !160 + %3407 = zext nneg i32 %3406 to i64, !dbg !160 + %3408 = or disjoint i64 %3407, 4611686293338849280, !dbg !160 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 0, !dbg !160 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 1, !dbg !160 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 2, !dbg !160 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 3, !dbg !160 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 4, !dbg !160 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 5, !dbg !160 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 6, !dbg !160 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 7, !dbg !160 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 8, !dbg !160 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 9, !dbg !160 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 10, !dbg !160 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 11, !dbg !160 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 12, !dbg !160 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 13, !dbg !160 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 14, !dbg !160 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 15, !dbg !160 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 16, !dbg !160 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 17, !dbg !160 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 18, !dbg !160 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 19, !dbg !160 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 20, !dbg !160 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 21, !dbg !160 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 22, !dbg !160 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 23, !dbg !160 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 24, !dbg !160 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 25, !dbg !160 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 26, !dbg !160 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 27, !dbg !160 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 28, !dbg !160 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 29, !dbg !160 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 30, !dbg !160 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 31, !dbg !160 + %3441 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3409, float %3410, float %3411, float %3412, float %3413, float %3414, float %3415, float %3416, float %3417, float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, i64 %3403, i64 %3408, i1 true) #3, !dbg !160 + %3442 = add i32 %2806, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3443 = lshr exact i32 %3442, 4, !dbg !160 + %3444 = and i32 %3443, 16383, !dbg !160 + %3445 = zext nneg i32 %3444 to i64, !dbg !160 + %3446 = or disjoint i64 %3445, 4611686293372403712, !dbg !160 + %3447 = add i32 %3221, 8256, !dbg !160 + %3448 = lshr exact i32 %3447, 4, !dbg !160 + %3449 = and i32 %3448, 16383, !dbg !160 + %3450 = zext nneg i32 %3449 to i64, !dbg !160 + %3451 = or disjoint i64 %3450, 4611686293338849280, !dbg !160 + %3452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 0, !dbg !160 + %3453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 1, !dbg !160 + %3454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 2, !dbg !160 + %3455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 3, !dbg !160 + %3456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 4, !dbg !160 + %3457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 5, !dbg !160 + %3458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 6, !dbg !160 + %3459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 7, !dbg !160 + %3460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 8, !dbg !160 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 9, !dbg !160 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 10, !dbg !160 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 11, !dbg !160 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 12, !dbg !160 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 13, !dbg !160 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 14, !dbg !160 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 15, !dbg !160 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 16, !dbg !160 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 17, !dbg !160 + %3470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 18, !dbg !160 + %3471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 19, !dbg !160 + %3472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 20, !dbg !160 + %3473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 21, !dbg !160 + %3474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 22, !dbg !160 + %3475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 23, !dbg !160 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 24, !dbg !160 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 25, !dbg !160 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 26, !dbg !160 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 27, !dbg !160 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 28, !dbg !160 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 29, !dbg !160 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 30, !dbg !160 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 31, !dbg !160 + %3484 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3452, float %3453, float %3454, float %3455, float %3456, float %3457, float %3458, float %3459, float %3460, float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, float %3470, float %3471, float %3472, float %3473, float %3474, float %3475, float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, i64 %3446, i64 %3451, i1 true) #3, !dbg !160 + %3485 = add i32 %2850, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3486 = lshr exact i32 %3485, 4, !dbg !160 + %3487 = and i32 %3486, 16383, !dbg !160 + %3488 = zext nneg i32 %3487 to i64, !dbg !160 + %3489 = or disjoint i64 %3488, 4611686293372403712, !dbg !160 + %3490 = add i32 %3221, 8288, !dbg !160 + %3491 = lshr exact i32 %3490, 4, !dbg !160 + %3492 = and i32 %3491, 16383, !dbg !160 + %3493 = zext nneg i32 %3492 to i64, !dbg !160 + %3494 = or disjoint i64 %3493, 4611686293338849280, !dbg !160 + %3495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 0, !dbg !160 + %3496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 1, !dbg !160 + %3497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 2, !dbg !160 + %3498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 3, !dbg !160 + %3499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 4, !dbg !160 + %3500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 5, !dbg !160 + %3501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 6, !dbg !160 + %3502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 7, !dbg !160 + %3503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 8, !dbg !160 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 9, !dbg !160 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 10, !dbg !160 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 11, !dbg !160 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 12, !dbg !160 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 13, !dbg !160 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 14, !dbg !160 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 15, !dbg !160 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 16, !dbg !160 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 17, !dbg !160 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 18, !dbg !160 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 19, !dbg !160 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 20, !dbg !160 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 21, !dbg !160 + %3517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 22, !dbg !160 + %3518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 23, !dbg !160 + %3519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 24, !dbg !160 + %3520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 25, !dbg !160 + %3521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 26, !dbg !160 + %3522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 27, !dbg !160 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 28, !dbg !160 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 29, !dbg !160 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 30, !dbg !160 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 31, !dbg !160 + %3527 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, i64 %3489, i64 %3494, i1 true) #3, !dbg !160 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 0, !dbg !160 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 1, !dbg !160 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 2, !dbg !160 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 3, !dbg !160 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 4, !dbg !160 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 5, !dbg !160 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 6, !dbg !160 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 7, !dbg !160 + %3536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 8, !dbg !160 + %3537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 9, !dbg !160 + %3538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 10, !dbg !160 + %3539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 11, !dbg !160 + %3540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 12, !dbg !160 + %3541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 13, !dbg !160 + %3542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 14, !dbg !160 + %3543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 15, !dbg !160 + %3544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 16, !dbg !160 + %3545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 17, !dbg !160 + %3546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 18, !dbg !160 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 19, !dbg !160 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 20, !dbg !160 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 21, !dbg !160 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 22, !dbg !160 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 23, !dbg !160 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 24, !dbg !160 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 25, !dbg !160 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 26, !dbg !160 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 27, !dbg !160 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 28, !dbg !160 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 29, !dbg !160 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 30, !dbg !160 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 31, !dbg !160 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !160 + %3560 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %3215, i32 0, i32 0) #3, !dbg !160 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 0, !dbg !160 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 1, !dbg !160 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 2, !dbg !160 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 3, !dbg !160 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 4, !dbg !160 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 5, !dbg !160 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 6, !dbg !160 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 7, !dbg !160 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 8, !dbg !160 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 9, !dbg !160 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 10, !dbg !160 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 11, !dbg !160 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 12, !dbg !160 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 13, !dbg !160 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 14, !dbg !160 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 15, !dbg !160 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 16, !dbg !160 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 17, !dbg !160 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 18, !dbg !160 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 19, !dbg !160 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 20, !dbg !160 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 21, !dbg !160 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 22, !dbg !160 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 23, !dbg !160 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 24, !dbg !160 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 25, !dbg !160 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 26, !dbg !160 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 27, !dbg !160 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 28, !dbg !160 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 29, !dbg !160 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 30, !dbg !160 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 31, !dbg !160 + %3593 = insertelement <2 x float> poison, float %3561, i64 0, !dbg !155 + %3594 = insertelement <2 x float> %3593, float %3562, i64 1, !dbg !155 + %3595 = fsub <2 x float> %3594, %2560, !dbg !155 + %3596 = insertelement <2 x float> poison, float %.0.i1317, i64 0, !dbg !161 + %3597 = insertelement <2 x float> %3596, float %.0.i1320, i64 1, !dbg !161 + %3598 = fmul <2 x float> %3597, %3595, !dbg !161 + %3599 = fptrunc <2 x float> %3598 to <2 x bfloat>, !dbg !162 + %3600 = insertelement <2 x float> poison, float %3563, i64 0, !dbg !155 + %3601 = insertelement <2 x float> %3600, float %3564, i64 1, !dbg !155 + %3602 = fsub <2 x float> %3601, %2558, !dbg !155 + %3603 = insertelement <2 x float> poison, float %.0.i1323, i64 0, !dbg !161 + %3604 = insertelement <2 x float> %3603, float %.0.i1326, i64 1, !dbg !161 + %3605 = fmul <2 x float> %3604, %3602, !dbg !161 + %3606 = fptrunc <2 x float> %3605 to <2 x bfloat>, !dbg !162 + %3607 = insertelement <2 x float> poison, float %3565, i64 0, !dbg !155 + %3608 = insertelement <2 x float> %3607, float %3566, i64 1, !dbg !155 + %3609 = fsub <2 x float> %3608, %2560, !dbg !155 + %3610 = insertelement <2 x float> poison, float %.0.i1329, i64 0, !dbg !161 + %3611 = insertelement <2 x float> %3610, float %.0.i1332, i64 1, !dbg !161 + %3612 = fmul <2 x float> %3611, %3609, !dbg !161 + %3613 = fptrunc <2 x float> %3612 to <2 x bfloat>, !dbg !162 + %3614 = insertelement <2 x float> poison, float %3567, i64 0, !dbg !155 + %3615 = insertelement <2 x float> %3614, float %3568, i64 1, !dbg !155 + %3616 = fsub <2 x float> %3615, %2558, !dbg !155 + %3617 = insertelement <2 x float> poison, float %.0.i1335, i64 0, !dbg !161 + %3618 = insertelement <2 x float> %3617, float %.0.i1338, i64 1, !dbg !161 + %3619 = fmul <2 x float> %3618, %3616, !dbg !161 + %3620 = fptrunc <2 x float> %3619 to <2 x bfloat>, !dbg !162 + %3621 = insertelement <2 x float> poison, float %3569, i64 0, !dbg !155 + %3622 = insertelement <2 x float> %3621, float %3570, i64 1, !dbg !155 + %3623 = fsub <2 x float> %3622, %2560, !dbg !155 + %3624 = insertelement <2 x float> poison, float %.0.i1341, i64 0, !dbg !161 + %3625 = insertelement <2 x float> %3624, float %.0.i1344, i64 1, !dbg !161 + %3626 = fmul <2 x float> %3625, %3623, !dbg !161 + %3627 = fptrunc <2 x float> %3626 to <2 x bfloat>, !dbg !162 + %3628 = insertelement <2 x float> poison, float %3571, i64 0, !dbg !155 + %3629 = insertelement <2 x float> %3628, float %3572, i64 1, !dbg !155 + %3630 = fsub <2 x float> %3629, %2558, !dbg !155 + %3631 = insertelement <2 x float> poison, float %.0.i1347, i64 0, !dbg !161 + %3632 = insertelement <2 x float> %3631, float %.0.i1350, i64 1, !dbg !161 + %3633 = fmul <2 x float> %3632, %3630, !dbg !161 + %3634 = fptrunc <2 x float> %3633 to <2 x bfloat>, !dbg !162 + %3635 = insertelement <2 x float> poison, float %3573, i64 0, !dbg !155 + %3636 = insertelement <2 x float> %3635, float %3574, i64 1, !dbg !155 + %3637 = fsub <2 x float> %3636, %2560, !dbg !155 + %3638 = insertelement <2 x float> poison, float %.0.i1353, i64 0, !dbg !161 + %3639 = insertelement <2 x float> %3638, float %.0.i1356, i64 1, !dbg !161 + %3640 = fmul <2 x float> %3639, %3637, !dbg !161 + %3641 = fptrunc <2 x float> %3640 to <2 x bfloat>, !dbg !162 + %3642 = insertelement <2 x float> poison, float %3575, i64 0, !dbg !155 + %3643 = insertelement <2 x float> %3642, float %3576, i64 1, !dbg !155 + %3644 = fsub <2 x float> %3643, %2558, !dbg !155 + %3645 = insertelement <2 x float> poison, float %.0.i1359, i64 0, !dbg !161 + %3646 = insertelement <2 x float> %3645, float %.0.i1362, i64 1, !dbg !161 + %3647 = fmul <2 x float> %3646, %3644, !dbg !161 + %3648 = fptrunc <2 x float> %3647 to <2 x bfloat>, !dbg !162 + %3649 = insertelement <2 x float> poison, float %3577, i64 0, !dbg !155 + %3650 = insertelement <2 x float> %3649, float %3578, i64 1, !dbg !155 + %3651 = fsub <2 x float> %3650, %2560, !dbg !155 + %3652 = insertelement <2 x float> poison, float %.0.i1365, i64 0, !dbg !161 + %3653 = insertelement <2 x float> %3652, float %.0.i1368, i64 1, !dbg !161 + %3654 = fmul <2 x float> %3653, %3651, !dbg !161 + %3655 = fptrunc <2 x float> %3654 to <2 x bfloat>, !dbg !162 + %3656 = insertelement <2 x float> poison, float %3579, i64 0, !dbg !155 + %3657 = insertelement <2 x float> %3656, float %3580, i64 1, !dbg !155 + %3658 = fsub <2 x float> %3657, %2558, !dbg !155 + %3659 = insertelement <2 x float> poison, float %.0.i1371, i64 0, !dbg !161 + %3660 = insertelement <2 x float> %3659, float %.0.i1374, i64 1, !dbg !161 + %3661 = fmul <2 x float> %3660, %3658, !dbg !161 + %3662 = fptrunc <2 x float> %3661 to <2 x bfloat>, !dbg !162 + %3663 = insertelement <2 x float> poison, float %3581, i64 0, !dbg !155 + %3664 = insertelement <2 x float> %3663, float %3582, i64 1, !dbg !155 + %3665 = fsub <2 x float> %3664, %2560, !dbg !155 + %3666 = insertelement <2 x float> poison, float %.0.i1377, i64 0, !dbg !161 + %3667 = insertelement <2 x float> %3666, float %.0.i1380, i64 1, !dbg !161 + %3668 = fmul <2 x float> %3667, %3665, !dbg !161 + %3669 = fptrunc <2 x float> %3668 to <2 x bfloat>, !dbg !162 + %3670 = insertelement <2 x float> poison, float %3583, i64 0, !dbg !155 + %3671 = insertelement <2 x float> %3670, float %3584, i64 1, !dbg !155 + %3672 = fsub <2 x float> %3671, %2558, !dbg !155 + %3673 = insertelement <2 x float> poison, float %.0.i1383, i64 0, !dbg !161 + %3674 = insertelement <2 x float> %3673, float %.0.i1386, i64 1, !dbg !161 + %3675 = fmul <2 x float> %3674, %3672, !dbg !161 + %3676 = fptrunc <2 x float> %3675 to <2 x bfloat>, !dbg !162 + %3677 = insertelement <2 x float> poison, float %3585, i64 0, !dbg !155 + %3678 = insertelement <2 x float> %3677, float %3586, i64 1, !dbg !155 + %3679 = fsub <2 x float> %3678, %2560, !dbg !155 + %3680 = insertelement <2 x float> poison, float %.0.i1389, i64 0, !dbg !161 + %3681 = insertelement <2 x float> %3680, float %.0.i1392, i64 1, !dbg !161 + %3682 = fmul <2 x float> %3681, %3679, !dbg !161 + %3683 = fptrunc <2 x float> %3682 to <2 x bfloat>, !dbg !162 + %3684 = insertelement <2 x float> poison, float %3587, i64 0, !dbg !155 + %3685 = insertelement <2 x float> %3684, float %3588, i64 1, !dbg !155 + %3686 = fsub <2 x float> %3685, %2558, !dbg !155 + %3687 = insertelement <2 x float> poison, float %.0.i1395, i64 0, !dbg !161 + %3688 = insertelement <2 x float> %3687, float %.0.i1398, i64 1, !dbg !161 + %3689 = fmul <2 x float> %3688, %3686, !dbg !161 + %3690 = fptrunc <2 x float> %3689 to <2 x bfloat>, !dbg !162 + %3691 = insertelement <2 x float> poison, float %3589, i64 0, !dbg !155 + %3692 = insertelement <2 x float> %3691, float %3590, i64 1, !dbg !155 + %3693 = fsub <2 x float> %3692, %2560, !dbg !155 + %3694 = insertelement <2 x float> poison, float %.0.i1401, i64 0, !dbg !161 + %3695 = insertelement <2 x float> %3694, float %.0.i1404, i64 1, !dbg !161 + %3696 = fmul <2 x float> %3695, %3693, !dbg !161 + %3697 = fptrunc <2 x float> %3696 to <2 x bfloat>, !dbg !162 + %3698 = insertelement <2 x float> poison, float %3591, i64 0, !dbg !155 + %3699 = insertelement <2 x float> %3698, float %3592, i64 1, !dbg !155 + %3700 = fsub <2 x float> %3699, %2558, !dbg !155 + %3701 = insertelement <2 x float> poison, float %.0.i1407, i64 0, !dbg !161 + %3702 = insertelement <2 x float> %3701, float %.0.i1410, i64 1, !dbg !161 + %3703 = fmul <2 x float> %3702, %3700, !dbg !161 + %3704 = fptrunc <2 x float> %3703 to <2 x bfloat>, !dbg !162 + %3705 = bitcast <2 x bfloat> %3599 to i32, !dbg !163 + %3706 = bitcast <2 x bfloat> %3606 to i32, !dbg !163 + %3707 = bitcast <2 x bfloat> %3613 to i32, !dbg !163 + %3708 = bitcast <2 x bfloat> %3620 to i32, !dbg !163 + %3709 = bitcast <2 x bfloat> %3627 to i32, !dbg !163 + %3710 = bitcast <2 x bfloat> %3634 to i32, !dbg !163 + %3711 = bitcast <2 x bfloat> %3641 to i32, !dbg !163 + %3712 = bitcast <2 x bfloat> %3648 to i32, !dbg !163 + %3713 = bitcast <2 x bfloat> %3655 to i32, !dbg !163 + %3714 = bitcast <2 x bfloat> %3662 to i32, !dbg !163 + %3715 = bitcast <2 x bfloat> %3669 to i32, !dbg !163 + %3716 = bitcast <2 x bfloat> %3676 to i32, !dbg !163 + %3717 = bitcast <2 x bfloat> %3683 to i32, !dbg !163 + %3718 = bitcast <2 x bfloat> %3690 to i32, !dbg !163 + %3719 = bitcast <2 x bfloat> %3697 to i32, !dbg !163 + %3720 = bitcast <2 x bfloat> %3704 to i32, !dbg !163 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !163 + %3721 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn, float %.pn2389, float %.pn2390, float %.pn2391, float %.pn2392, float %.pn2393, float %.pn2394, float %.pn2395, float %.pn2396, float %.pn2397, float %.pn2398, float %.pn2399, float %.pn2400, float %.pn2401, float %.pn2402, float %.pn2403, float %.pn2404, float %.pn2405, float %.pn2406, float %.pn2407, float %.pn2408, float %.pn2409, float %.pn2410, float %.pn2411, float %.pn2412, float %.pn2413, float %.pn2414, float %.pn2415, float %.pn2416, float %.pn2417, float %.pn2418, float %.pn2419, float %.pn2420, float %.pn2421, float %.pn2422, float %.pn2423, float %.pn2424, float %.pn2425, float %.pn2426, float %.pn2427, float %.pn2428, float %.pn2429, float %.pn2430, float %.pn2431, float %.pn2432, float %.pn2433, float %.pn2434, float %.pn2435, float %.pn2436, float %.pn2437, float %.pn2438, float %.pn2439, float %.pn2440, float %.pn2441, float %.pn2442, float %.pn2443, float %.pn2444, float %.pn2445, float %.pn2446, float %.pn2447, float %.pn2448, float %.pn2449, float %.pn2450, float %.pn2451, i32 %3705, i32 %3706, i32 %3707, i32 %3708, i64 %2584, i1 true) #3, !dbg !163 + %3722 = add i32 %2580, 2048, !dbg !163 + %3723 = lshr exact i32 %3722, 4, !dbg !163 + %3724 = and i32 %3723, 16383, !dbg !163 + %3725 = zext nneg i32 %3724 to i64, !dbg !163 + %3726 = or disjoint i64 %3725, 4611686293338849280, !dbg !163 + %3727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 0, !dbg !163 + %3728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 1, !dbg !163 + %3729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 2, !dbg !163 + %3730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 3, !dbg !163 + %3731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 4, !dbg !163 + %3732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 5, !dbg !163 + %3733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 6, !dbg !163 + %3734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 7, !dbg !163 + %3735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 8, !dbg !163 + %3736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 9, !dbg !163 + %3737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 10, !dbg !163 + %3738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 11, !dbg !163 + %3739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 12, !dbg !163 + %3740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 13, !dbg !163 + %3741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 14, !dbg !163 + %3742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 15, !dbg !163 + %3743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 16, !dbg !163 + %3744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 17, !dbg !163 + %3745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 18, !dbg !163 + %3746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 19, !dbg !163 + %3747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 20, !dbg !163 + %3748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 21, !dbg !163 + %3749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 22, !dbg !163 + %3750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 23, !dbg !163 + %3751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 24, !dbg !163 + %3752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 25, !dbg !163 + %3753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 26, !dbg !163 + %3754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 27, !dbg !163 + %3755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 28, !dbg !163 + %3756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 29, !dbg !163 + %3757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 30, !dbg !163 + %3758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 31, !dbg !163 + %3759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 32, !dbg !163 + %3760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 33, !dbg !163 + %3761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 34, !dbg !163 + %3762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 35, !dbg !163 + %3763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 36, !dbg !163 + %3764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 37, !dbg !163 + %3765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 38, !dbg !163 + %3766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 39, !dbg !163 + %3767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 40, !dbg !163 + %3768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 41, !dbg !163 + %3769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 42, !dbg !163 + %3770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 43, !dbg !163 + %3771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 44, !dbg !163 + %3772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 45, !dbg !163 + %3773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 46, !dbg !163 + %3774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 47, !dbg !163 + %3775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 48, !dbg !163 + %3776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 49, !dbg !163 + %3777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 50, !dbg !163 + %3778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 51, !dbg !163 + %3779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 52, !dbg !163 + %3780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 53, !dbg !163 + %3781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 54, !dbg !163 + %3782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 55, !dbg !163 + %3783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 56, !dbg !163 + %3784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 57, !dbg !163 + %3785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 58, !dbg !163 + %3786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 59, !dbg !163 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 60, !dbg !163 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 61, !dbg !163 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 62, !dbg !163 + %3790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 63, !dbg !163 + %3791 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, float %3747, float %3748, float %3749, float %3750, float %3751, float %3752, float %3753, float %3754, float %3755, float %3756, float %3757, float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778, float %3779, float %3780, float %3781, float %3782, float %3783, float %3784, float %3785, float %3786, float %3787, float %3788, float %3789, float %3790, i32 %3709, i32 %3710, i32 %3711, i32 %3712, i64 %3726, i1 true) #3, !dbg !163 + %3792 = add i32 %2580, 4096, !dbg !163 + %3793 = lshr exact i32 %3792, 4, !dbg !163 + %3794 = and i32 %3793, 16383, !dbg !163 + %3795 = zext nneg i32 %3794 to i64, !dbg !163 + %3796 = or disjoint i64 %3795, 4611686293338849280, !dbg !163 + %3797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 0, !dbg !163 + %3798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 1, !dbg !163 + %3799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 2, !dbg !163 + %3800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 3, !dbg !163 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 4, !dbg !163 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 5, !dbg !163 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 6, !dbg !163 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 7, !dbg !163 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 8, !dbg !163 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 9, !dbg !163 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 10, !dbg !163 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 11, !dbg !163 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 12, !dbg !163 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 13, !dbg !163 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 14, !dbg !163 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 15, !dbg !163 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 16, !dbg !163 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 17, !dbg !163 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 18, !dbg !163 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 19, !dbg !163 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 20, !dbg !163 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 21, !dbg !163 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 22, !dbg !163 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 23, !dbg !163 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 24, !dbg !163 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 25, !dbg !163 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 26, !dbg !163 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 27, !dbg !163 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 28, !dbg !163 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 29, !dbg !163 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 30, !dbg !163 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 31, !dbg !163 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 32, !dbg !163 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 33, !dbg !163 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 34, !dbg !163 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 35, !dbg !163 + %3833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 36, !dbg !163 + %3834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 37, !dbg !163 + %3835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 38, !dbg !163 + %3836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 39, !dbg !163 + %3837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 40, !dbg !163 + %3838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 41, !dbg !163 + %3839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 42, !dbg !163 + %3840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 43, !dbg !163 + %3841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 44, !dbg !163 + %3842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 45, !dbg !163 + %3843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 46, !dbg !163 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 47, !dbg !163 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 48, !dbg !163 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 49, !dbg !163 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 50, !dbg !163 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 51, !dbg !163 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 52, !dbg !163 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 53, !dbg !163 + %3851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 54, !dbg !163 + %3852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 55, !dbg !163 + %3853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 56, !dbg !163 + %3854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 57, !dbg !163 + %3855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 58, !dbg !163 + %3856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 59, !dbg !163 + %3857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 60, !dbg !163 + %3858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 61, !dbg !163 + %3859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 62, !dbg !163 + %3860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 63, !dbg !163 + %3861 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3797, float %3798, float %3799, float %3800, float %3801, float %3802, float %3803, float %3804, float %3805, float %3806, float %3807, float %3808, float %3809, float %3810, float %3811, float %3812, float %3813, float %3814, float %3815, float %3816, float %3817, float %3818, float %3819, float %3820, float %3821, float %3822, float %3823, float %3824, float %3825, float %3826, float %3827, float %3828, float %3829, float %3830, float %3831, float %3832, float %3833, float %3834, float %3835, float %3836, float %3837, float %3838, float %3839, float %3840, float %3841, float %3842, float %3843, float %3844, float %3845, float %3846, float %3847, float %3848, float %3849, float %3850, float %3851, float %3852, float %3853, float %3854, float %3855, float %3856, float %3857, float %3858, float %3859, float %3860, i32 %3713, i32 %3714, i32 %3715, i32 %3716, i64 %3796, i1 true) #3, !dbg !163 + %3862 = add i32 %2580, 6144, !dbg !163 + %3863 = lshr exact i32 %3862, 4, !dbg !163 + %3864 = and i32 %3863, 16383, !dbg !163 + %3865 = zext nneg i32 %3864 to i64, !dbg !163 + %3866 = or disjoint i64 %3865, 4611686293338849280, !dbg !163 + %3867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 0, !dbg !163 + %3868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 1, !dbg !163 + %3869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 2, !dbg !163 + %3870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 3, !dbg !163 + %3871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 4, !dbg !163 + %3872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 5, !dbg !163 + %3873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 6, !dbg !163 + %3874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 7, !dbg !163 + %3875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 8, !dbg !163 + %3876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 9, !dbg !163 + %3877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 10, !dbg !163 + %3878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 11, !dbg !163 + %3879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 12, !dbg !163 + %3880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 13, !dbg !163 + %3881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 14, !dbg !163 + %3882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 15, !dbg !163 + %3883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 16, !dbg !163 + %3884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 17, !dbg !163 + %3885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 18, !dbg !163 + %3886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 19, !dbg !163 + %3887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 20, !dbg !163 + %3888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 21, !dbg !163 + %3889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 22, !dbg !163 + %3890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 23, !dbg !163 + %3891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 24, !dbg !163 + %3892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 25, !dbg !163 + %3893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 26, !dbg !163 + %3894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 27, !dbg !163 + %3895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 28, !dbg !163 + %3896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 29, !dbg !163 + %3897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 30, !dbg !163 + %3898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 31, !dbg !163 + %3899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 32, !dbg !163 + %3900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 33, !dbg !163 + %3901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 34, !dbg !163 + %3902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 35, !dbg !163 + %3903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 36, !dbg !163 + %3904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 37, !dbg !163 + %3905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 38, !dbg !163 + %3906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 39, !dbg !163 + %3907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 40, !dbg !163 + %3908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 41, !dbg !163 + %3909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 42, !dbg !163 + %3910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 43, !dbg !163 + %3911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 44, !dbg !163 + %3912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 45, !dbg !163 + %3913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 46, !dbg !163 + %3914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 47, !dbg !163 + %3915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 48, !dbg !163 + %3916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 49, !dbg !163 + %3917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 50, !dbg !163 + %3918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 51, !dbg !163 + %3919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 52, !dbg !163 + %3920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 53, !dbg !163 + %3921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 54, !dbg !163 + %3922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 55, !dbg !163 + %3923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 56, !dbg !163 + %3924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 57, !dbg !163 + %3925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 58, !dbg !163 + %3926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 59, !dbg !163 + %3927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 60, !dbg !163 + %3928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 61, !dbg !163 + %3929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 62, !dbg !163 + %3930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 63, !dbg !163 + %3931 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3867, float %3868, float %3869, float %3870, float %3871, float %3872, float %3873, float %3874, float %3875, float %3876, float %3877, float %3878, float %3879, float %3880, float %3881, float %3882, float %3883, float %3884, float %3885, float %3886, float %3887, float %3888, float %3889, float %3890, float %3891, float %3892, float %3893, float %3894, float %3895, float %3896, float %3897, float %3898, float %3899, float %3900, float %3901, float %3902, float %3903, float %3904, float %3905, float %3906, float %3907, float %3908, float %3909, float %3910, float %3911, float %3912, float %3913, float %3914, float %3915, float %3916, float %3917, float %3918, float %3919, float %3920, float %3921, float %3922, float %3923, float %3924, float %3925, float %3926, float %3927, float %3928, float %3929, float %3930, i32 %3717, i32 %3718, i32 %3719, i32 %3720, i64 %3866, i1 true) #3, !dbg !163 + %3932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 0, !dbg !163 + %3933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 1, !dbg !163 + %3934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 2, !dbg !163 + %3935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 3, !dbg !163 + %3936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 4, !dbg !163 + %3937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 5, !dbg !163 + %3938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 6, !dbg !163 + %3939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 7, !dbg !163 + %3940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 8, !dbg !163 + %3941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 9, !dbg !163 + %3942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 10, !dbg !163 + %3943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 11, !dbg !163 + %3944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 12, !dbg !163 + %3945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 13, !dbg !163 + %3946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 14, !dbg !163 + %3947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 15, !dbg !163 + %3948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 16, !dbg !163 + %3949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 17, !dbg !163 + %3950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 18, !dbg !163 + %3951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 19, !dbg !163 + %3952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 20, !dbg !163 + %3953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 21, !dbg !163 + %3954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 22, !dbg !163 + %3955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 23, !dbg !163 + %3956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 24, !dbg !163 + %3957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 25, !dbg !163 + %3958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 26, !dbg !163 + %3959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 27, !dbg !163 + %3960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 28, !dbg !163 + %3961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 29, !dbg !163 + %3962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 30, !dbg !163 + %3963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 31, !dbg !163 + %3964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 32, !dbg !163 + %3965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 33, !dbg !163 + %3966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 34, !dbg !163 + %3967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 35, !dbg !163 + %3968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 36, !dbg !163 + %3969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 37, !dbg !163 + %3970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 38, !dbg !163 + %3971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 39, !dbg !163 + %3972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 40, !dbg !163 + %3973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 41, !dbg !163 + %3974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 42, !dbg !163 + %3975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 43, !dbg !163 + %3976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 44, !dbg !163 + %3977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 45, !dbg !163 + %3978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 46, !dbg !163 + %3979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 47, !dbg !163 + %3980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 48, !dbg !163 + %3981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 49, !dbg !163 + %3982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 50, !dbg !163 + %3983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 51, !dbg !163 + %3984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 52, !dbg !163 + %3985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 53, !dbg !163 + %3986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 54, !dbg !163 + %3987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 55, !dbg !163 + %3988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 56, !dbg !163 + %3989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 57, !dbg !163 + %3990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 58, !dbg !163 + %3991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 59, !dbg !163 + %3992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 60, !dbg !163 + %3993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 61, !dbg !163 + %3994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 62, !dbg !163 + %3995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 63, !dbg !163 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !163 + %3996 = add nuw nsw i32 %2564, 1, !dbg !149 + %3997 = lshr i32 %3996, 1, !dbg !164 + %3998 = zext nneg i32 %3997 to i64, !dbg !165 + %3999 = getelementptr i32, ptr addrspace(1) %2380, i64 %3998, !dbg !165 + %4000 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !166 + %4001 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3999, i64 %4000, i1 %2566) #3, !dbg !166 + %4002 = add nuw nsw i32 %3997, 1, !dbg !167 + %4003 = icmp slt i32 %4002, %2384, !dbg !168 + %4004 = getelementptr i8, ptr addrspace(1) %3999, i64 4, !dbg !169 + %4005 = and i1 %2566, %4003, !dbg !149 + %4006 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !170 + %4007 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4004, i64 %4006, i1 %4005) #3, !dbg !170 + %4008 = and i32 %2564, 1, !dbg !171 + %4009 = sub i32 %4007, %4001, !dbg !172 + %4010 = shl i32 %4009, 7, !dbg !173 + %4011 = add i32 %4010, 33554368, !dbg !174 + %4012 = shl nuw nsw i32 %4008, 13, !dbg !175 + %4013 = shl nuw nsw i32 %4008, 7, !dbg !176 + %4014 = xor i32 %4013, 128, !dbg !176 + %4015 = mul i32 %4014, %4011, !dbg !175 + %4016 = add i32 %4015, %4012, !dbg !175 + %4017 = sext i32 %4016 to i64, !dbg !151 + %4018 = getelementptr bfloat, ptr addrspace(1) %.pn10621849, i64 %4017, !dbg !151 + %4019 = getelementptr bfloat, ptr addrspace(1) %.pn10461850, i64 %4017, !dbg !151 + %4020 = getelementptr bfloat, ptr addrspace(1) %.pn10301851, i64 %4017, !dbg !151 + %4021 = getelementptr bfloat, ptr addrspace(1) %.pn10141852, i64 %4017, !dbg !151 + %4022 = getelementptr bfloat, ptr addrspace(1) %.pn11261853, i64 %4017, !dbg !152 + %4023 = getelementptr bfloat, ptr addrspace(1) %.pn11101854, i64 %4017, !dbg !152 + %4024 = getelementptr bfloat, ptr addrspace(1) %.pn10941855, i64 %4017, !dbg !152 + %4025 = getelementptr bfloat, ptr addrspace(1) %.pn10781856, i64 %4017, !dbg !152 + %4026 = add i32 %2563, 1, !dbg !149 + %4027 = icmp sgt i32 %4026, 2, !dbg !149 + %4028 = select i1 %4027, i32 0, i32 %4026, !dbg !149 + %4029 = shl i32 %4028, 13, !dbg !150 + %4030 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %4029, !dbg !150 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %4031 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %402, !dbg !150 + %4032 = select i1 %2565, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4031, ptr addrspace(1) %4018, i32 %4032) #3, !dbg !150 + %4033 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %405, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4033, ptr addrspace(1) %4019, i32 %4032) #3, !dbg !150 + %4034 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %407, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4034, ptr addrspace(1) %4020, i32 %4032) #3, !dbg !150 + %4035 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %409, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4035, ptr addrspace(1) %4021, i32 %4032) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %4036 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4029, !dbg !150 + %4037 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %402, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4037, ptr addrspace(1) %4022, i32 %4032) #3, !dbg !150 + %4038 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %405, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4038, ptr addrspace(1) %4023, i32 %4032) #3, !dbg !150 + %4039 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %407, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4039, ptr addrspace(1) %4024, i32 %4032) #3, !dbg !150 + %4040 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %409, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4040, ptr addrspace(1) %4025, i32 %4032) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %exitcond2127.not = icmp eq i32 %3996, %2490, !dbg !149 + br i1 %exitcond2127.not, label %._crit_edge1859, label %2561, !dbg !149 + +._crit_edge1859: ; preds = %__nv_exp2f.exit1411, %._crit_edge1847 + %4041 = phi float [ %2426, %._crit_edge1847 ], [ %3932, %__nv_exp2f.exit1411 ], !dbg !77 + %4042 = phi float [ %2427, %._crit_edge1847 ], [ %3933, %__nv_exp2f.exit1411 ], !dbg !77 + %4043 = phi float [ %2428, %._crit_edge1847 ], [ %3934, %__nv_exp2f.exit1411 ], !dbg !77 + %4044 = phi float [ %2429, %._crit_edge1847 ], [ %3935, %__nv_exp2f.exit1411 ], !dbg !77 + %4045 = phi float [ %2430, %._crit_edge1847 ], [ %3936, %__nv_exp2f.exit1411 ], !dbg !77 + %4046 = phi float [ %2431, %._crit_edge1847 ], [ %3937, %__nv_exp2f.exit1411 ], !dbg !77 + %4047 = phi float [ %2432, %._crit_edge1847 ], [ %3938, %__nv_exp2f.exit1411 ], !dbg !77 + %4048 = phi float [ %2433, %._crit_edge1847 ], [ %3939, %__nv_exp2f.exit1411 ], !dbg !77 + %4049 = phi float [ %2434, %._crit_edge1847 ], [ %3940, %__nv_exp2f.exit1411 ], !dbg !77 + %4050 = phi float [ %2435, %._crit_edge1847 ], [ %3941, %__nv_exp2f.exit1411 ], !dbg !77 + %4051 = phi float [ %2436, %._crit_edge1847 ], [ %3942, %__nv_exp2f.exit1411 ], !dbg !77 + %4052 = phi float [ %2437, %._crit_edge1847 ], [ %3943, %__nv_exp2f.exit1411 ], !dbg !77 + %4053 = phi float [ %2438, %._crit_edge1847 ], [ %3944, %__nv_exp2f.exit1411 ], !dbg !77 + %4054 = phi float [ %2439, %._crit_edge1847 ], [ %3945, %__nv_exp2f.exit1411 ], !dbg !77 + %4055 = phi float [ %2440, %._crit_edge1847 ], [ %3946, %__nv_exp2f.exit1411 ], !dbg !77 + %4056 = phi float [ %2441, %._crit_edge1847 ], [ %3947, %__nv_exp2f.exit1411 ], !dbg !77 + %4057 = phi float [ %2442, %._crit_edge1847 ], [ %3948, %__nv_exp2f.exit1411 ], !dbg !77 + %4058 = phi float [ %2443, %._crit_edge1847 ], [ %3949, %__nv_exp2f.exit1411 ], !dbg !77 + %4059 = phi float [ %2444, %._crit_edge1847 ], [ %3950, %__nv_exp2f.exit1411 ], !dbg !77 + %4060 = phi float [ %2445, %._crit_edge1847 ], [ %3951, %__nv_exp2f.exit1411 ], !dbg !77 + %4061 = phi float [ %2446, %._crit_edge1847 ], [ %3952, %__nv_exp2f.exit1411 ], !dbg !77 + %4062 = phi float [ %2447, %._crit_edge1847 ], [ %3953, %__nv_exp2f.exit1411 ], !dbg !77 + %4063 = phi float [ %2448, %._crit_edge1847 ], [ %3954, %__nv_exp2f.exit1411 ], !dbg !77 + %4064 = phi float [ %2449, %._crit_edge1847 ], [ %3955, %__nv_exp2f.exit1411 ], !dbg !77 + %4065 = phi float [ %2450, %._crit_edge1847 ], [ %3956, %__nv_exp2f.exit1411 ], !dbg !77 + %4066 = phi float [ %2451, %._crit_edge1847 ], [ %3957, %__nv_exp2f.exit1411 ], !dbg !77 + %4067 = phi float [ %2452, %._crit_edge1847 ], [ %3958, %__nv_exp2f.exit1411 ], !dbg !77 + %4068 = phi float [ %2453, %._crit_edge1847 ], [ %3959, %__nv_exp2f.exit1411 ], !dbg !77 + %4069 = phi float [ %2454, %._crit_edge1847 ], [ %3960, %__nv_exp2f.exit1411 ], !dbg !77 + %4070 = phi float [ %2455, %._crit_edge1847 ], [ %3961, %__nv_exp2f.exit1411 ], !dbg !77 + %4071 = phi float [ %2456, %._crit_edge1847 ], [ %3962, %__nv_exp2f.exit1411 ], !dbg !77 + %4072 = phi float [ %2457, %._crit_edge1847 ], [ %3963, %__nv_exp2f.exit1411 ], !dbg !77 + %4073 = phi float [ %2458, %._crit_edge1847 ], [ %3964, %__nv_exp2f.exit1411 ], !dbg !77 + %4074 = phi float [ %2459, %._crit_edge1847 ], [ %3965, %__nv_exp2f.exit1411 ], !dbg !77 + %4075 = phi float [ %2460, %._crit_edge1847 ], [ %3966, %__nv_exp2f.exit1411 ], !dbg !77 + %4076 = phi float [ %2461, %._crit_edge1847 ], [ %3967, %__nv_exp2f.exit1411 ], !dbg !77 + %4077 = phi float [ %2462, %._crit_edge1847 ], [ %3968, %__nv_exp2f.exit1411 ], !dbg !77 + %4078 = phi float [ %2463, %._crit_edge1847 ], [ %3969, %__nv_exp2f.exit1411 ], !dbg !77 + %4079 = phi float [ %2464, %._crit_edge1847 ], [ %3970, %__nv_exp2f.exit1411 ], !dbg !77 + %4080 = phi float [ %2465, %._crit_edge1847 ], [ %3971, %__nv_exp2f.exit1411 ], !dbg !77 + %4081 = phi float [ %2466, %._crit_edge1847 ], [ %3972, %__nv_exp2f.exit1411 ], !dbg !77 + %4082 = phi float [ %2467, %._crit_edge1847 ], [ %3973, %__nv_exp2f.exit1411 ], !dbg !77 + %4083 = phi float [ %2468, %._crit_edge1847 ], [ %3974, %__nv_exp2f.exit1411 ], !dbg !77 + %4084 = phi float [ %2469, %._crit_edge1847 ], [ %3975, %__nv_exp2f.exit1411 ], !dbg !77 + %4085 = phi float [ %2470, %._crit_edge1847 ], [ %3976, %__nv_exp2f.exit1411 ], !dbg !77 + %4086 = phi float [ %2471, %._crit_edge1847 ], [ %3977, %__nv_exp2f.exit1411 ], !dbg !77 + %4087 = phi float [ %2472, %._crit_edge1847 ], [ %3978, %__nv_exp2f.exit1411 ], !dbg !77 + %4088 = phi float [ %2473, %._crit_edge1847 ], [ %3979, %__nv_exp2f.exit1411 ], !dbg !77 + %4089 = phi float [ %2474, %._crit_edge1847 ], [ %3980, %__nv_exp2f.exit1411 ], !dbg !77 + %4090 = phi float [ %2475, %._crit_edge1847 ], [ %3981, %__nv_exp2f.exit1411 ], !dbg !77 + %4091 = phi float [ %2476, %._crit_edge1847 ], [ %3982, %__nv_exp2f.exit1411 ], !dbg !77 + %4092 = phi float [ %2477, %._crit_edge1847 ], [ %3983, %__nv_exp2f.exit1411 ], !dbg !77 + %4093 = phi float [ %2478, %._crit_edge1847 ], [ %3984, %__nv_exp2f.exit1411 ], !dbg !77 + %4094 = phi float [ %2479, %._crit_edge1847 ], [ %3985, %__nv_exp2f.exit1411 ], !dbg !77 + %4095 = phi float [ %2480, %._crit_edge1847 ], [ %3986, %__nv_exp2f.exit1411 ], !dbg !77 + %4096 = phi float [ %2481, %._crit_edge1847 ], [ %3987, %__nv_exp2f.exit1411 ], !dbg !77 + %4097 = phi float [ %2482, %._crit_edge1847 ], [ %3988, %__nv_exp2f.exit1411 ], !dbg !77 + %4098 = phi float [ %2483, %._crit_edge1847 ], [ %3989, %__nv_exp2f.exit1411 ], !dbg !77 + %4099 = phi float [ %2484, %._crit_edge1847 ], [ %3990, %__nv_exp2f.exit1411 ], !dbg !77 + %4100 = phi float [ %2485, %._crit_edge1847 ], [ %3991, %__nv_exp2f.exit1411 ], !dbg !77 + %4101 = phi float [ %2486, %._crit_edge1847 ], [ %3992, %__nv_exp2f.exit1411 ], !dbg !77 + %4102 = phi float [ %2487, %._crit_edge1847 ], [ %3993, %__nv_exp2f.exit1411 ], !dbg !77 + %4103 = phi float [ %2488, %._crit_edge1847 ], [ %3994, %__nv_exp2f.exit1411 ], !dbg !77 + %4104 = phi float [ %2489, %._crit_edge1847 ], [ %3995, %__nv_exp2f.exit1411 ], !dbg !77 + %4105 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %4041, float %4042, float %4043, float %4044, float %4045, float %4046, float %4047, float %4048, float %4049, float %4050, float %4051, float %4052, float %4053, float %4054, float %4055, float %4056, float %4057, float %4058, float %4059, float %4060, float %4061, float %4062, float %4063, float %4064, float %4065, float %4066, float %4067, float %4068, float %4069, float %4070, float %4071, float %4072, float %4073, float %4074, float %4075, float %4076, float %4077, float %4078, float %4079, float %4080, float %4081, float %4082, float %4083, float %4084, float %4085, float %4086, float %4087, float %4088, float %4089, float %4090, float %4091, float %4092, float %4093, float %4094, float %4095, float %4096, float %4097, float %4098, float %4099, float %4100, float %4101, float %4102, float %4103, float %4104) #3, !dbg !149 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !149 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !149 + %4106 = getelementptr bfloat, ptr addrspace(1) %77, i64 %103, !dbg !177 + %4107 = getelementptr bfloat, ptr addrspace(1) %77, i64 %105, !dbg !177 + %4108 = getelementptr bfloat, ptr addrspace(1) %77, i64 %107, !dbg !177 + %4109 = getelementptr bfloat, ptr addrspace(1) %77, i64 %109, !dbg !177 + %4110 = getelementptr bfloat, ptr addrspace(1) %77, i64 %111, !dbg !177 + %4111 = getelementptr bfloat, ptr addrspace(1) %77, i64 %113, !dbg !177 + %4112 = getelementptr bfloat, ptr addrspace(1) %77, i64 %115, !dbg !177 + %4113 = getelementptr bfloat, ptr addrspace(1) %77, i64 %117, !dbg !177 + %4114 = getelementptr bfloat, ptr addrspace(1) %4106, i64 %121, !dbg !178 + %4115 = getelementptr bfloat, ptr addrspace(1) %4107, i64 %121, !dbg !178 + %4116 = getelementptr bfloat, ptr addrspace(1) %4108, i64 %121, !dbg !178 + %4117 = getelementptr bfloat, ptr addrspace(1) %4109, i64 %121, !dbg !178 + %4118 = getelementptr bfloat, ptr addrspace(1) %4110, i64 %121, !dbg !178 + %4119 = getelementptr bfloat, ptr addrspace(1) %4111, i64 %121, !dbg !178 + %4120 = getelementptr bfloat, ptr addrspace(1) %4112, i64 %121, !dbg !178 + %4121 = getelementptr bfloat, ptr addrspace(1) %4113, i64 %121, !dbg !178 + %4122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 0, !dbg !179 + %4123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 1, !dbg !179 + %4124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 2, !dbg !179 + %4125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 3, !dbg !179 + %4126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 4, !dbg !179 + %4127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 5, !dbg !179 + %4128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 6, !dbg !179 + %4129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 7, !dbg !179 + %4130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 8, !dbg !179 + %4131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 9, !dbg !179 + %4132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 10, !dbg !179 + %4133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 11, !dbg !179 + %4134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 12, !dbg !179 + %4135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 13, !dbg !179 + %4136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 14, !dbg !179 + %4137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 15, !dbg !179 + %4138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 16, !dbg !179 + %4139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 17, !dbg !179 + %4140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 18, !dbg !179 + %4141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 19, !dbg !179 + %4142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 20, !dbg !179 + %4143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 21, !dbg !179 + %4144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 22, !dbg !179 + %4145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 23, !dbg !179 + %4146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 24, !dbg !179 + %4147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 25, !dbg !179 + %4148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 26, !dbg !179 + %4149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 27, !dbg !179 + %4150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 28, !dbg !179 + %4151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 29, !dbg !179 + %4152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 30, !dbg !179 + %4153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 31, !dbg !179 + %4154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 32, !dbg !179 + %4155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 33, !dbg !179 + %4156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 34, !dbg !179 + %4157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 35, !dbg !179 + %4158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 36, !dbg !179 + %4159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 37, !dbg !179 + %4160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 38, !dbg !179 + %4161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 39, !dbg !179 + %4162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 40, !dbg !179 + %4163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 41, !dbg !179 + %4164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 42, !dbg !179 + %4165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 43, !dbg !179 + %4166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 44, !dbg !179 + %4167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 45, !dbg !179 + %4168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 46, !dbg !179 + %4169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 47, !dbg !179 + %4170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 48, !dbg !179 + %4171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 49, !dbg !179 + %4172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 50, !dbg !179 + %4173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 51, !dbg !179 + %4174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 52, !dbg !179 + %4175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 53, !dbg !179 + %4176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 54, !dbg !179 + %4177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 55, !dbg !179 + %4178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 56, !dbg !179 + %4179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 57, !dbg !179 + %4180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 58, !dbg !179 + %4181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 59, !dbg !179 + %4182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 60, !dbg !179 + %4183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 61, !dbg !179 + %4184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 62, !dbg !179 + %4185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 63, !dbg !179 + %4186 = insertelement <2 x float> poison, float %4122, i64 0, !dbg !179 + %4187 = insertelement <2 x float> %4186, float %4123, i64 1, !dbg !179 + %4188 = fmul <2 x float> %4187, splat (float 0x3FB6A09E60000000), !dbg !179 + %4189 = fptrunc <2 x float> %4188 to <2 x bfloat>, !dbg !180 + %4190 = insertelement <2 x float> poison, float %4124, i64 0, !dbg !179 + %4191 = insertelement <2 x float> %4190, float %4125, i64 1, !dbg !179 + %4192 = fmul <2 x float> %4191, splat (float 0x3FB6A09E60000000), !dbg !179 + %4193 = fptrunc <2 x float> %4192 to <2 x bfloat>, !dbg !180 + %4194 = insertelement <2 x float> poison, float %4126, i64 0, !dbg !179 + %4195 = insertelement <2 x float> %4194, float %4127, i64 1, !dbg !179 + %4196 = fmul <2 x float> %4195, splat (float 0x3FB6A09E60000000), !dbg !179 + %4197 = fptrunc <2 x float> %4196 to <2 x bfloat>, !dbg !180 + %4198 = insertelement <2 x float> poison, float %4128, i64 0, !dbg !179 + %4199 = insertelement <2 x float> %4198, float %4129, i64 1, !dbg !179 + %4200 = fmul <2 x float> %4199, splat (float 0x3FB6A09E60000000), !dbg !179 + %4201 = fptrunc <2 x float> %4200 to <2 x bfloat>, !dbg !180 + %4202 = insertelement <2 x float> poison, float %4130, i64 0, !dbg !179 + %4203 = insertelement <2 x float> %4202, float %4131, i64 1, !dbg !179 + %4204 = fmul <2 x float> %4203, splat (float 0x3FB6A09E60000000), !dbg !179 + %4205 = fptrunc <2 x float> %4204 to <2 x bfloat>, !dbg !180 + %4206 = insertelement <2 x float> poison, float %4132, i64 0, !dbg !179 + %4207 = insertelement <2 x float> %4206, float %4133, i64 1, !dbg !179 + %4208 = fmul <2 x float> %4207, splat (float 0x3FB6A09E60000000), !dbg !179 + %4209 = fptrunc <2 x float> %4208 to <2 x bfloat>, !dbg !180 + %4210 = insertelement <2 x float> poison, float %4134, i64 0, !dbg !179 + %4211 = insertelement <2 x float> %4210, float %4135, i64 1, !dbg !179 + %4212 = fmul <2 x float> %4211, splat (float 0x3FB6A09E60000000), !dbg !179 + %4213 = fptrunc <2 x float> %4212 to <2 x bfloat>, !dbg !180 + %4214 = insertelement <2 x float> poison, float %4136, i64 0, !dbg !179 + %4215 = insertelement <2 x float> %4214, float %4137, i64 1, !dbg !179 + %4216 = fmul <2 x float> %4215, splat (float 0x3FB6A09E60000000), !dbg !179 + %4217 = fptrunc <2 x float> %4216 to <2 x bfloat>, !dbg !180 + %4218 = insertelement <2 x float> poison, float %4138, i64 0, !dbg !179 + %4219 = insertelement <2 x float> %4218, float %4139, i64 1, !dbg !179 + %4220 = fmul <2 x float> %4219, splat (float 0x3FB6A09E60000000), !dbg !179 + %4221 = fptrunc <2 x float> %4220 to <2 x bfloat>, !dbg !180 + %4222 = insertelement <2 x float> poison, float %4140, i64 0, !dbg !179 + %4223 = insertelement <2 x float> %4222, float %4141, i64 1, !dbg !179 + %4224 = fmul <2 x float> %4223, splat (float 0x3FB6A09E60000000), !dbg !179 + %4225 = fptrunc <2 x float> %4224 to <2 x bfloat>, !dbg !180 + %4226 = insertelement <2 x float> poison, float %4142, i64 0, !dbg !179 + %4227 = insertelement <2 x float> %4226, float %4143, i64 1, !dbg !179 + %4228 = fmul <2 x float> %4227, splat (float 0x3FB6A09E60000000), !dbg !179 + %4229 = fptrunc <2 x float> %4228 to <2 x bfloat>, !dbg !180 + %4230 = insertelement <2 x float> poison, float %4144, i64 0, !dbg !179 + %4231 = insertelement <2 x float> %4230, float %4145, i64 1, !dbg !179 + %4232 = fmul <2 x float> %4231, splat (float 0x3FB6A09E60000000), !dbg !179 + %4233 = fptrunc <2 x float> %4232 to <2 x bfloat>, !dbg !180 + %4234 = insertelement <2 x float> poison, float %4146, i64 0, !dbg !179 + %4235 = insertelement <2 x float> %4234, float %4147, i64 1, !dbg !179 + %4236 = fmul <2 x float> %4235, splat (float 0x3FB6A09E60000000), !dbg !179 + %4237 = fptrunc <2 x float> %4236 to <2 x bfloat>, !dbg !180 + %4238 = insertelement <2 x float> poison, float %4148, i64 0, !dbg !179 + %4239 = insertelement <2 x float> %4238, float %4149, i64 1, !dbg !179 + %4240 = fmul <2 x float> %4239, splat (float 0x3FB6A09E60000000), !dbg !179 + %4241 = fptrunc <2 x float> %4240 to <2 x bfloat>, !dbg !180 + %4242 = insertelement <2 x float> poison, float %4150, i64 0, !dbg !179 + %4243 = insertelement <2 x float> %4242, float %4151, i64 1, !dbg !179 + %4244 = fmul <2 x float> %4243, splat (float 0x3FB6A09E60000000), !dbg !179 + %4245 = fptrunc <2 x float> %4244 to <2 x bfloat>, !dbg !180 + %4246 = insertelement <2 x float> poison, float %4152, i64 0, !dbg !179 + %4247 = insertelement <2 x float> %4246, float %4153, i64 1, !dbg !179 + %4248 = fmul <2 x float> %4247, splat (float 0x3FB6A09E60000000), !dbg !179 + %4249 = fptrunc <2 x float> %4248 to <2 x bfloat>, !dbg !180 + %4250 = insertelement <2 x float> poison, float %4154, i64 0, !dbg !179 + %4251 = insertelement <2 x float> %4250, float %4155, i64 1, !dbg !179 + %4252 = fmul <2 x float> %4251, splat (float 0x3FB6A09E60000000), !dbg !179 + %4253 = fptrunc <2 x float> %4252 to <2 x bfloat>, !dbg !180 + %4254 = insertelement <2 x float> poison, float %4156, i64 0, !dbg !179 + %4255 = insertelement <2 x float> %4254, float %4157, i64 1, !dbg !179 + %4256 = fmul <2 x float> %4255, splat (float 0x3FB6A09E60000000), !dbg !179 + %4257 = fptrunc <2 x float> %4256 to <2 x bfloat>, !dbg !180 + %4258 = insertelement <2 x float> poison, float %4158, i64 0, !dbg !179 + %4259 = insertelement <2 x float> %4258, float %4159, i64 1, !dbg !179 + %4260 = fmul <2 x float> %4259, splat (float 0x3FB6A09E60000000), !dbg !179 + %4261 = fptrunc <2 x float> %4260 to <2 x bfloat>, !dbg !180 + %4262 = insertelement <2 x float> poison, float %4160, i64 0, !dbg !179 + %4263 = insertelement <2 x float> %4262, float %4161, i64 1, !dbg !179 + %4264 = fmul <2 x float> %4263, splat (float 0x3FB6A09E60000000), !dbg !179 + %4265 = fptrunc <2 x float> %4264 to <2 x bfloat>, !dbg !180 + %4266 = insertelement <2 x float> poison, float %4162, i64 0, !dbg !179 + %4267 = insertelement <2 x float> %4266, float %4163, i64 1, !dbg !179 + %4268 = fmul <2 x float> %4267, splat (float 0x3FB6A09E60000000), !dbg !179 + %4269 = fptrunc <2 x float> %4268 to <2 x bfloat>, !dbg !180 + %4270 = insertelement <2 x float> poison, float %4164, i64 0, !dbg !179 + %4271 = insertelement <2 x float> %4270, float %4165, i64 1, !dbg !179 + %4272 = fmul <2 x float> %4271, splat (float 0x3FB6A09E60000000), !dbg !179 + %4273 = fptrunc <2 x float> %4272 to <2 x bfloat>, !dbg !180 + %4274 = insertelement <2 x float> poison, float %4166, i64 0, !dbg !179 + %4275 = insertelement <2 x float> %4274, float %4167, i64 1, !dbg !179 + %4276 = fmul <2 x float> %4275, splat (float 0x3FB6A09E60000000), !dbg !179 + %4277 = fptrunc <2 x float> %4276 to <2 x bfloat>, !dbg !180 + %4278 = insertelement <2 x float> poison, float %4168, i64 0, !dbg !179 + %4279 = insertelement <2 x float> %4278, float %4169, i64 1, !dbg !179 + %4280 = fmul <2 x float> %4279, splat (float 0x3FB6A09E60000000), !dbg !179 + %4281 = fptrunc <2 x float> %4280 to <2 x bfloat>, !dbg !180 + %4282 = insertelement <2 x float> poison, float %4170, i64 0, !dbg !179 + %4283 = insertelement <2 x float> %4282, float %4171, i64 1, !dbg !179 + %4284 = fmul <2 x float> %4283, splat (float 0x3FB6A09E60000000), !dbg !179 + %4285 = fptrunc <2 x float> %4284 to <2 x bfloat>, !dbg !180 + %4286 = insertelement <2 x float> poison, float %4172, i64 0, !dbg !179 + %4287 = insertelement <2 x float> %4286, float %4173, i64 1, !dbg !179 + %4288 = fmul <2 x float> %4287, splat (float 0x3FB6A09E60000000), !dbg !179 + %4289 = fptrunc <2 x float> %4288 to <2 x bfloat>, !dbg !180 + %4290 = insertelement <2 x float> poison, float %4174, i64 0, !dbg !179 + %4291 = insertelement <2 x float> %4290, float %4175, i64 1, !dbg !179 + %4292 = fmul <2 x float> %4291, splat (float 0x3FB6A09E60000000), !dbg !179 + %4293 = fptrunc <2 x float> %4292 to <2 x bfloat>, !dbg !180 + %4294 = insertelement <2 x float> poison, float %4176, i64 0, !dbg !179 + %4295 = insertelement <2 x float> %4294, float %4177, i64 1, !dbg !179 + %4296 = fmul <2 x float> %4295, splat (float 0x3FB6A09E60000000), !dbg !179 + %4297 = fptrunc <2 x float> %4296 to <2 x bfloat>, !dbg !180 + %4298 = insertelement <2 x float> poison, float %4178, i64 0, !dbg !179 + %4299 = insertelement <2 x float> %4298, float %4179, i64 1, !dbg !179 + %4300 = fmul <2 x float> %4299, splat (float 0x3FB6A09E60000000), !dbg !179 + %4301 = fptrunc <2 x float> %4300 to <2 x bfloat>, !dbg !180 + %4302 = insertelement <2 x float> poison, float %4180, i64 0, !dbg !179 + %4303 = insertelement <2 x float> %4302, float %4181, i64 1, !dbg !179 + %4304 = fmul <2 x float> %4303, splat (float 0x3FB6A09E60000000), !dbg !179 + %4305 = fptrunc <2 x float> %4304 to <2 x bfloat>, !dbg !180 + %4306 = insertelement <2 x float> poison, float %4182, i64 0, !dbg !179 + %4307 = insertelement <2 x float> %4306, float %4183, i64 1, !dbg !179 + %4308 = fmul <2 x float> %4307, splat (float 0x3FB6A09E60000000), !dbg !179 + %4309 = fptrunc <2 x float> %4308 to <2 x bfloat>, !dbg !180 + %4310 = insertelement <2 x float> poison, float %4184, i64 0, !dbg !179 + %4311 = insertelement <2 x float> %4310, float %4185, i64 1, !dbg !179 + %4312 = fmul <2 x float> %4311, splat (float 0x3FB6A09E60000000), !dbg !179 + %4313 = fptrunc <2 x float> %4312 to <2 x bfloat>, !dbg !180 + %4314 = shl nuw nsw i32 %365, 13, !dbg !180 + %4315 = shl nuw nsw i32 %35, 5, !dbg !180 + %4316 = and i32 %4315, 7264, !dbg !180 + %4317 = and i32 %35, 24, !dbg !180 + %4318 = shl nuw nsw i32 %4317, 4, !dbg !180 + %4319 = shl nuw nsw i32 %35, 2, !dbg !180 + %4320 = and i32 %4319, 16, !dbg !180 + %4321 = or disjoint i32 %4314, %4320, !dbg !180 + %4322 = or disjoint i32 %4316, %4318, !dbg !180 + %4323 = or disjoint i32 %4321, %4322, !dbg !180 + %4324 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4323, !dbg !180 + %4325 = bitcast <2 x bfloat> %4189 to i32, !dbg !180 + %4326 = bitcast <2 x bfloat> %4197 to i32, !dbg !180 + %4327 = bitcast <2 x bfloat> %4205 to i32, !dbg !180 + %4328 = bitcast <2 x bfloat> %4213 to i32, !dbg !180 + %4329 = insertelement <4 x i32> poison, i32 %4325, i64 0, !dbg !180 + %4330 = insertelement <4 x i32> %4329, i32 %4326, i64 1, !dbg !180 + %4331 = insertelement <4 x i32> %4330, i32 %4327, i64 2, !dbg !180 + %4332 = insertelement <4 x i32> %4331, i32 %4328, i64 3, !dbg !180 + store <4 x i32> %4332, ptr addrspace(3) %4324, align 16, !dbg !180 + %4333 = getelementptr inbounds nuw i8, ptr addrspace(3) %4324, i32 512, !dbg !180 + %4334 = bitcast <2 x bfloat> %4193 to i32, !dbg !180 + %4335 = bitcast <2 x bfloat> %4201 to i32, !dbg !180 + %4336 = bitcast <2 x bfloat> %4209 to i32, !dbg !180 + %4337 = bitcast <2 x bfloat> %4217 to i32, !dbg !180 + %4338 = insertelement <4 x i32> poison, i32 %4334, i64 0, !dbg !180 + %4339 = insertelement <4 x i32> %4338, i32 %4335, i64 1, !dbg !180 + %4340 = insertelement <4 x i32> %4339, i32 %4336, i64 2, !dbg !180 + %4341 = insertelement <4 x i32> %4340, i32 %4337, i64 3, !dbg !180 + store <4 x i32> %4341, ptr addrspace(3) %4333, align 16, !dbg !180 + %4342 = xor i32 %4323, 32, !dbg !180 + %4343 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4342, !dbg !180 + %4344 = bitcast <2 x bfloat> %4221 to i32, !dbg !180 + %4345 = bitcast <2 x bfloat> %4229 to i32, !dbg !180 + %4346 = bitcast <2 x bfloat> %4237 to i32, !dbg !180 + %4347 = bitcast <2 x bfloat> %4245 to i32, !dbg !180 + %4348 = insertelement <4 x i32> poison, i32 %4344, i64 0, !dbg !180 + %4349 = insertelement <4 x i32> %4348, i32 %4345, i64 1, !dbg !180 + %4350 = insertelement <4 x i32> %4349, i32 %4346, i64 2, !dbg !180 + %4351 = insertelement <4 x i32> %4350, i32 %4347, i64 3, !dbg !180 + store <4 x i32> %4351, ptr addrspace(3) %4343, align 16, !dbg !180 + %4352 = getelementptr inbounds nuw i8, ptr addrspace(3) %4343, i32 512, !dbg !180 + %4353 = bitcast <2 x bfloat> %4225 to i32, !dbg !180 + %4354 = bitcast <2 x bfloat> %4233 to i32, !dbg !180 + %4355 = bitcast <2 x bfloat> %4241 to i32, !dbg !180 + %4356 = bitcast <2 x bfloat> %4249 to i32, !dbg !180 + %4357 = insertelement <4 x i32> poison, i32 %4353, i64 0, !dbg !180 + %4358 = insertelement <4 x i32> %4357, i32 %4354, i64 1, !dbg !180 + %4359 = insertelement <4 x i32> %4358, i32 %4355, i64 2, !dbg !180 + %4360 = insertelement <4 x i32> %4359, i32 %4356, i64 3, !dbg !180 + store <4 x i32> %4360, ptr addrspace(3) %4352, align 16, !dbg !180 + %4361 = xor i32 %4323, 64, !dbg !180 + %4362 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4361, !dbg !180 + %4363 = bitcast <2 x bfloat> %4253 to i32, !dbg !180 + %4364 = bitcast <2 x bfloat> %4261 to i32, !dbg !180 + %4365 = bitcast <2 x bfloat> %4269 to i32, !dbg !180 + %4366 = bitcast <2 x bfloat> %4277 to i32, !dbg !180 + %4367 = insertelement <4 x i32> poison, i32 %4363, i64 0, !dbg !180 + %4368 = insertelement <4 x i32> %4367, i32 %4364, i64 1, !dbg !180 + %4369 = insertelement <4 x i32> %4368, i32 %4365, i64 2, !dbg !180 + %4370 = insertelement <4 x i32> %4369, i32 %4366, i64 3, !dbg !180 + store <4 x i32> %4370, ptr addrspace(3) %4362, align 16, !dbg !180 + %4371 = getelementptr inbounds nuw i8, ptr addrspace(3) %4362, i32 512, !dbg !180 + %4372 = bitcast <2 x bfloat> %4257 to i32, !dbg !180 + %4373 = bitcast <2 x bfloat> %4265 to i32, !dbg !180 + %4374 = bitcast <2 x bfloat> %4273 to i32, !dbg !180 + %4375 = bitcast <2 x bfloat> %4281 to i32, !dbg !180 + %4376 = insertelement <4 x i32> poison, i32 %4372, i64 0, !dbg !180 + %4377 = insertelement <4 x i32> %4376, i32 %4373, i64 1, !dbg !180 + %4378 = insertelement <4 x i32> %4377, i32 %4374, i64 2, !dbg !180 + %4379 = insertelement <4 x i32> %4378, i32 %4375, i64 3, !dbg !180 + store <4 x i32> %4379, ptr addrspace(3) %4371, align 16, !dbg !180 + %4380 = xor i32 %4323, 96, !dbg !180 + %4381 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4380, !dbg !180 + %4382 = bitcast <2 x bfloat> %4285 to i32, !dbg !180 + %4383 = bitcast <2 x bfloat> %4293 to i32, !dbg !180 + %4384 = bitcast <2 x bfloat> %4301 to i32, !dbg !180 + %4385 = bitcast <2 x bfloat> %4309 to i32, !dbg !180 + %4386 = insertelement <4 x i32> poison, i32 %4382, i64 0, !dbg !180 + %4387 = insertelement <4 x i32> %4386, i32 %4383, i64 1, !dbg !180 + %4388 = insertelement <4 x i32> %4387, i32 %4384, i64 2, !dbg !180 + %4389 = insertelement <4 x i32> %4388, i32 %4385, i64 3, !dbg !180 + store <4 x i32> %4389, ptr addrspace(3) %4381, align 16, !dbg !180 + %4390 = getelementptr inbounds nuw i8, ptr addrspace(3) %4381, i32 512, !dbg !180 + %4391 = bitcast <2 x bfloat> %4289 to i32, !dbg !180 + %4392 = bitcast <2 x bfloat> %4297 to i32, !dbg !180 + %4393 = bitcast <2 x bfloat> %4305 to i32, !dbg !180 + %4394 = bitcast <2 x bfloat> %4313 to i32, !dbg !180 + %4395 = insertelement <4 x i32> poison, i32 %4391, i64 0, !dbg !180 + %4396 = insertelement <4 x i32> %4395, i32 %4392, i64 1, !dbg !180 + %4397 = insertelement <4 x i32> %4396, i32 %4393, i64 2, !dbg !180 + %4398 = insertelement <4 x i32> %4397, i32 %4394, i64 3, !dbg !180 + store <4 x i32> %4398, ptr addrspace(3) %4390, align 16, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !180 + %4399 = shl nuw nsw i32 %4317, 10, !dbg !180 + %4400 = shl nuw nsw i32 %365, 5, !dbg !180 + %4401 = and i32 %4319, 1008, !dbg !180 + %4402 = or disjoint i32 %4399, %4400, !dbg !180 + %4403 = xor i32 %4402, %4401, !dbg !180 + %4404 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4403, !dbg !180 + %4405 = ptrtoint ptr addrspace(3) %4404 to i32, !dbg !180 + %4406 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4405) #3, !dbg !180 + %4407 = extractvalue { i32, i32, i32, i32 } %4406, 0, !dbg !180 + %4408 = extractvalue { i32, i32, i32, i32 } %4406, 1, !dbg !180 + %4409 = extractvalue { i32, i32, i32, i32 } %4406, 2, !dbg !180 + %4410 = extractvalue { i32, i32, i32, i32 } %4406, 3, !dbg !180 + %4411 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 1024, !dbg !180 + %4412 = ptrtoint ptr addrspace(3) %4411 to i32, !dbg !180 + %4413 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4412) #3, !dbg !180 + %4414 = extractvalue { i32, i32, i32, i32 } %4413, 0, !dbg !180 + %4415 = extractvalue { i32, i32, i32, i32 } %4413, 1, !dbg !180 + %4416 = extractvalue { i32, i32, i32, i32 } %4413, 2, !dbg !180 + %4417 = extractvalue { i32, i32, i32, i32 } %4413, 3, !dbg !180 + %4418 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 2048, !dbg !180 + %4419 = ptrtoint ptr addrspace(3) %4418 to i32, !dbg !180 + %4420 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4419) #3, !dbg !180 + %4421 = extractvalue { i32, i32, i32, i32 } %4420, 0, !dbg !180 + %4422 = extractvalue { i32, i32, i32, i32 } %4420, 1, !dbg !180 + %4423 = extractvalue { i32, i32, i32, i32 } %4420, 2, !dbg !180 + %4424 = extractvalue { i32, i32, i32, i32 } %4420, 3, !dbg !180 + %4425 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 3072, !dbg !180 + %4426 = ptrtoint ptr addrspace(3) %4425 to i32, !dbg !180 + %4427 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4426) #3, !dbg !180 + %4428 = extractvalue { i32, i32, i32, i32 } %4427, 0, !dbg !180 + %4429 = extractvalue { i32, i32, i32, i32 } %4427, 1, !dbg !180 + %4430 = extractvalue { i32, i32, i32, i32 } %4427, 2, !dbg !180 + %4431 = extractvalue { i32, i32, i32, i32 } %4427, 3, !dbg !180 + %4432 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 4096, !dbg !180 + %4433 = ptrtoint ptr addrspace(3) %4432 to i32, !dbg !180 + %4434 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4433) #3, !dbg !180 + %4435 = extractvalue { i32, i32, i32, i32 } %4434, 0, !dbg !180 + %4436 = extractvalue { i32, i32, i32, i32 } %4434, 1, !dbg !180 + %4437 = extractvalue { i32, i32, i32, i32 } %4434, 2, !dbg !180 + %4438 = extractvalue { i32, i32, i32, i32 } %4434, 3, !dbg !180 + %4439 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 5120, !dbg !180 + %4440 = ptrtoint ptr addrspace(3) %4439 to i32, !dbg !180 + %4441 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4440) #3, !dbg !180 + %4442 = extractvalue { i32, i32, i32, i32 } %4441, 0, !dbg !180 + %4443 = extractvalue { i32, i32, i32, i32 } %4441, 1, !dbg !180 + %4444 = extractvalue { i32, i32, i32, i32 } %4441, 2, !dbg !180 + %4445 = extractvalue { i32, i32, i32, i32 } %4441, 3, !dbg !180 + %4446 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 6144, !dbg !180 + %4447 = ptrtoint ptr addrspace(3) %4446 to i32, !dbg !180 + %4448 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4447) #3, !dbg !180 + %4449 = extractvalue { i32, i32, i32, i32 } %4448, 0, !dbg !180 + %4450 = extractvalue { i32, i32, i32, i32 } %4448, 1, !dbg !180 + %4451 = extractvalue { i32, i32, i32, i32 } %4448, 2, !dbg !180 + %4452 = extractvalue { i32, i32, i32, i32 } %4448, 3, !dbg !180 + %4453 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 7168, !dbg !180 + %4454 = ptrtoint ptr addrspace(3) %4453 to i32, !dbg !180 + %4455 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4454) #3, !dbg !180 + %4456 = extractvalue { i32, i32, i32, i32 } %4455, 0, !dbg !180 + %4457 = extractvalue { i32, i32, i32, i32 } %4455, 1, !dbg !180 + %4458 = extractvalue { i32, i32, i32, i32 } %4455, 2, !dbg !180 + %4459 = extractvalue { i32, i32, i32, i32 } %4455, 3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4407, i32 %4408, i32 %4409, i32 %4410, ptr addrspace(1) %4114) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4414, i32 %4415, i32 %4416, i32 %4417, ptr addrspace(1) %4115) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4421, i32 %4422, i32 %4423, i32 %4424, ptr addrspace(1) %4116) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4428, i32 %4429, i32 %4430, i32 %4431, ptr addrspace(1) %4117) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4435, i32 %4436, i32 %4437, i32 %4438, ptr addrspace(1) %4118) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4442, i32 %4443, i32 %4444, i32 %4445, ptr addrspace(1) %4119) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4449, i32 %4450, i32 %4451, i32 %4452, ptr addrspace(1) %4120) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4456, i32 %4457, i32 %4458, i32 %4459, ptr addrspace(1) %4121) #3, !dbg !180 + br label %10331, !dbg !24 + +4460: ; preds = %20 + %4461 = shl nuw nsw i32 %21, 7, !dbg !181 + %4462 = or disjoint i32 %38, %4461, !dbg !182 + %4463 = or disjoint i32 %39, %4461, !dbg !182 + %4464 = or disjoint i32 %40, %4461, !dbg !182 + %4465 = or disjoint i32 %41, %4461, !dbg !182 + %4466 = or disjoint i32 %42, %4461, !dbg !182 + %4467 = or disjoint i32 %43, %4461, !dbg !182 + %4468 = or disjoint i32 %44, %4461, !dbg !182 + %4469 = or disjoint i32 %45, %4461, !dbg !182 + %4470 = or disjoint i32 %50, %4461, !dbg !182 + %4471 = or disjoint i32 %51, %4461, !dbg !182 + %4472 = shl nuw nsw i32 %4462, 7, !dbg !183 + %4473 = shl nuw nsw i32 %4463, 7, !dbg !183 + %4474 = shl nuw nsw i32 %4464, 7, !dbg !183 + %4475 = shl nuw nsw i32 %4465, 7, !dbg !183 + %4476 = shl nuw nsw i32 %4466, 7, !dbg !183 + %4477 = shl nuw nsw i32 %4467, 7, !dbg !183 + %4478 = shl nuw nsw i32 %4468, 7, !dbg !183 + %4479 = shl nuw nsw i32 %4469, 7, !dbg !183 + %4480 = zext nneg i32 %4472 to i64, !dbg !185 + %4481 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4480, !dbg !185 + %4482 = zext nneg i32 %4473 to i64, !dbg !185 + %4483 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4482, !dbg !185 + %4484 = zext nneg i32 %4474 to i64, !dbg !185 + %4485 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4484, !dbg !185 + %4486 = zext nneg i32 %4475 to i64, !dbg !185 + %4487 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4486, !dbg !185 + %4488 = zext nneg i32 %4476 to i64, !dbg !185 + %4489 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4488, !dbg !185 + %4490 = zext nneg i32 %4477 to i64, !dbg !185 + %4491 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4490, !dbg !185 + %4492 = zext nneg i32 %4478 to i64, !dbg !185 + %4493 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4492, !dbg !185 + %4494 = zext nneg i32 %4479 to i64, !dbg !185 + %4495 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4494, !dbg !185 + %4496 = shl nuw nsw i32 %35, 3, !dbg !186 + %4497 = and i32 %4496, 120, !dbg !186 + %4498 = zext nneg i32 %4497 to i64, !dbg !187 + %4499 = getelementptr bfloat, ptr addrspace(1) %4481, i64 %4498, !dbg !187 + %4500 = getelementptr bfloat, ptr addrspace(1) %4483, i64 %4498, !dbg !187 + %4501 = getelementptr bfloat, ptr addrspace(1) %4485, i64 %4498, !dbg !187 + %4502 = getelementptr bfloat, ptr addrspace(1) %4487, i64 %4498, !dbg !187 + %4503 = getelementptr bfloat, ptr addrspace(1) %4489, i64 %4498, !dbg !187 + %4504 = getelementptr bfloat, ptr addrspace(1) %4491, i64 %4498, !dbg !187 + %4505 = getelementptr bfloat, ptr addrspace(1) %4493, i64 %4498, !dbg !187 + %4506 = getelementptr bfloat, ptr addrspace(1) %4495, i64 %4498, !dbg !187 + %4507 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4499) #3, !dbg !188 + %4508 = extractvalue { i32, i32, i32, i32 } %4507, 0, !dbg !188 + %4509 = extractvalue { i32, i32, i32, i32 } %4507, 1, !dbg !188 + %4510 = extractvalue { i32, i32, i32, i32 } %4507, 2, !dbg !188 + %4511 = extractvalue { i32, i32, i32, i32 } %4507, 3, !dbg !188 + %4512 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4500) #3, !dbg !188 + %4513 = extractvalue { i32, i32, i32, i32 } %4512, 0, !dbg !188 + %4514 = extractvalue { i32, i32, i32, i32 } %4512, 1, !dbg !188 + %4515 = extractvalue { i32, i32, i32, i32 } %4512, 2, !dbg !188 + %4516 = extractvalue { i32, i32, i32, i32 } %4512, 3, !dbg !188 + %4517 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4501) #3, !dbg !188 + %4518 = extractvalue { i32, i32, i32, i32 } %4517, 0, !dbg !188 + %4519 = extractvalue { i32, i32, i32, i32 } %4517, 1, !dbg !188 + %4520 = extractvalue { i32, i32, i32, i32 } %4517, 2, !dbg !188 + %4521 = extractvalue { i32, i32, i32, i32 } %4517, 3, !dbg !188 + %4522 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4502) #3, !dbg !188 + %4523 = extractvalue { i32, i32, i32, i32 } %4522, 0, !dbg !188 + %4524 = extractvalue { i32, i32, i32, i32 } %4522, 1, !dbg !188 + %4525 = extractvalue { i32, i32, i32, i32 } %4522, 2, !dbg !188 + %4526 = extractvalue { i32, i32, i32, i32 } %4522, 3, !dbg !188 + %4527 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4503) #3, !dbg !188 + %4528 = extractvalue { i32, i32, i32, i32 } %4527, 0, !dbg !188 + %4529 = extractvalue { i32, i32, i32, i32 } %4527, 1, !dbg !188 + %4530 = extractvalue { i32, i32, i32, i32 } %4527, 2, !dbg !188 + %4531 = extractvalue { i32, i32, i32, i32 } %4527, 3, !dbg !188 + %4532 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4504) #3, !dbg !188 + %4533 = extractvalue { i32, i32, i32, i32 } %4532, 0, !dbg !188 + %4534 = extractvalue { i32, i32, i32, i32 } %4532, 1, !dbg !188 + %4535 = extractvalue { i32, i32, i32, i32 } %4532, 2, !dbg !188 + %4536 = extractvalue { i32, i32, i32, i32 } %4532, 3, !dbg !188 + %4537 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4505) #3, !dbg !188 + %4538 = extractvalue { i32, i32, i32, i32 } %4537, 0, !dbg !188 + %4539 = extractvalue { i32, i32, i32, i32 } %4537, 1, !dbg !188 + %4540 = extractvalue { i32, i32, i32, i32 } %4537, 2, !dbg !188 + %4541 = extractvalue { i32, i32, i32, i32 } %4537, 3, !dbg !188 + %4542 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4506) #3, !dbg !188 + %4543 = extractvalue { i32, i32, i32, i32 } %4542, 0, !dbg !188 + %4544 = extractvalue { i32, i32, i32, i32 } %4542, 1, !dbg !188 + %4545 = extractvalue { i32, i32, i32, i32 } %4542, 2, !dbg !188 + %4546 = extractvalue { i32, i32, i32, i32 } %4542, 3, !dbg !188 + %4547 = shl nuw nsw i32 %35, 4, !dbg !188 + %4548 = and i32 %4547, 112, !dbg !188 + %4549 = shl nuw nsw i32 %37, 3, !dbg !188 + %4550 = and i32 %35, 112, !dbg !188 + %4551 = and i32 %35, 8, !dbg !188 + %4552 = shl nuw nsw i32 %4551, 11, !dbg !188 + %4553 = or disjoint i32 %4548, %4549, !dbg !188 + %4554 = xor i32 %4553, %4550, !dbg !188 + %4555 = or disjoint i32 %4554, %4552, !dbg !188 + %4556 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4555, !dbg !188 + %4557 = insertelement <4 x i32> poison, i32 %4508, i64 0, !dbg !188 + %4558 = insertelement <4 x i32> %4557, i32 %4509, i64 1, !dbg !188 + %4559 = insertelement <4 x i32> %4558, i32 %4510, i64 2, !dbg !188 + %4560 = insertelement <4 x i32> %4559, i32 %4511, i64 3, !dbg !188 + store <4 x i32> %4560, ptr addrspace(3) %4556, align 16, !dbg !188 + %4561 = or disjoint i32 %4555, 2048, !dbg !188 + %4562 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4561, !dbg !188 + %4563 = insertelement <4 x i32> poison, i32 %4513, i64 0, !dbg !188 + %4564 = insertelement <4 x i32> %4563, i32 %4514, i64 1, !dbg !188 + %4565 = insertelement <4 x i32> %4564, i32 %4515, i64 2, !dbg !188 + %4566 = insertelement <4 x i32> %4565, i32 %4516, i64 3, !dbg !188 + store <4 x i32> %4566, ptr addrspace(3) %4562, align 16, !dbg !188 + %4567 = or disjoint i32 %4555, 4096, !dbg !188 + %4568 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4567, !dbg !188 + %4569 = insertelement <4 x i32> poison, i32 %4518, i64 0, !dbg !188 + %4570 = insertelement <4 x i32> %4569, i32 %4519, i64 1, !dbg !188 + %4571 = insertelement <4 x i32> %4570, i32 %4520, i64 2, !dbg !188 + %4572 = insertelement <4 x i32> %4571, i32 %4521, i64 3, !dbg !188 + store <4 x i32> %4572, ptr addrspace(3) %4568, align 16, !dbg !188 + %4573 = or disjoint i32 %4555, 6144, !dbg !188 + %4574 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4573, !dbg !188 + %4575 = insertelement <4 x i32> poison, i32 %4523, i64 0, !dbg !188 + %4576 = insertelement <4 x i32> %4575, i32 %4524, i64 1, !dbg !188 + %4577 = insertelement <4 x i32> %4576, i32 %4525, i64 2, !dbg !188 + %4578 = insertelement <4 x i32> %4577, i32 %4526, i64 3, !dbg !188 + store <4 x i32> %4578, ptr addrspace(3) %4574, align 16, !dbg !188 + %4579 = or disjoint i32 %4555, 8192, !dbg !188 + %4580 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4579, !dbg !188 + %4581 = insertelement <4 x i32> poison, i32 %4528, i64 0, !dbg !188 + %4582 = insertelement <4 x i32> %4581, i32 %4529, i64 1, !dbg !188 + %4583 = insertelement <4 x i32> %4582, i32 %4530, i64 2, !dbg !188 + %4584 = insertelement <4 x i32> %4583, i32 %4531, i64 3, !dbg !188 + store <4 x i32> %4584, ptr addrspace(3) %4580, align 16, !dbg !188 + %4585 = or disjoint i32 %4555, 10240, !dbg !188 + %4586 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4585, !dbg !188 + %4587 = insertelement <4 x i32> poison, i32 %4533, i64 0, !dbg !188 + %4588 = insertelement <4 x i32> %4587, i32 %4534, i64 1, !dbg !188 + %4589 = insertelement <4 x i32> %4588, i32 %4535, i64 2, !dbg !188 + %4590 = insertelement <4 x i32> %4589, i32 %4536, i64 3, !dbg !188 + store <4 x i32> %4590, ptr addrspace(3) %4586, align 16, !dbg !188 + %4591 = or disjoint i32 %4555, 12288, !dbg !188 + %4592 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4591, !dbg !188 + %4593 = insertelement <4 x i32> poison, i32 %4538, i64 0, !dbg !188 + %4594 = insertelement <4 x i32> %4593, i32 %4539, i64 1, !dbg !188 + %4595 = insertelement <4 x i32> %4594, i32 %4540, i64 2, !dbg !188 + %4596 = insertelement <4 x i32> %4595, i32 %4541, i64 3, !dbg !188 + store <4 x i32> %4596, ptr addrspace(3) %4592, align 16, !dbg !188 + %4597 = or disjoint i32 %4555, 14336, !dbg !188 + %4598 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4597, !dbg !188 + %4599 = insertelement <4 x i32> poison, i32 %4543, i64 0, !dbg !188 + %4600 = insertelement <4 x i32> %4599, i32 %4544, i64 1, !dbg !188 + %4601 = insertelement <4 x i32> %4600, i32 %4545, i64 2, !dbg !188 + %4602 = insertelement <4 x i32> %4601, i32 %4546, i64 3, !dbg !188 + store <4 x i32> %4602, ptr addrspace(3) %4598, align 16, !dbg !188 + %4603 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4480, !dbg !189 + %4604 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4482, !dbg !189 + %4605 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4484, !dbg !189 + %4606 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4486, !dbg !189 + %4607 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4488, !dbg !189 + %4608 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4490, !dbg !189 + %4609 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4492, !dbg !189 + %4610 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4494, !dbg !189 + %4611 = getelementptr bfloat, ptr addrspace(1) %4603, i64 %4498, !dbg !191 + %4612 = getelementptr bfloat, ptr addrspace(1) %4604, i64 %4498, !dbg !191 + %4613 = getelementptr bfloat, ptr addrspace(1) %4605, i64 %4498, !dbg !191 + %4614 = getelementptr bfloat, ptr addrspace(1) %4606, i64 %4498, !dbg !191 + %4615 = getelementptr bfloat, ptr addrspace(1) %4607, i64 %4498, !dbg !191 + %4616 = getelementptr bfloat, ptr addrspace(1) %4608, i64 %4498, !dbg !191 + %4617 = getelementptr bfloat, ptr addrspace(1) %4609, i64 %4498, !dbg !191 + %4618 = getelementptr bfloat, ptr addrspace(1) %4610, i64 %4498, !dbg !191 + %4619 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4611) #3, !dbg !192 + %4620 = extractvalue { i32, i32, i32, i32 } %4619, 0, !dbg !192 + %4621 = extractvalue { i32, i32, i32, i32 } %4619, 1, !dbg !192 + %4622 = extractvalue { i32, i32, i32, i32 } %4619, 2, !dbg !192 + %4623 = extractvalue { i32, i32, i32, i32 } %4619, 3, !dbg !192 + %4624 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4612) #3, !dbg !192 + %4625 = extractvalue { i32, i32, i32, i32 } %4624, 0, !dbg !192 + %4626 = extractvalue { i32, i32, i32, i32 } %4624, 1, !dbg !192 + %4627 = extractvalue { i32, i32, i32, i32 } %4624, 2, !dbg !192 + %4628 = extractvalue { i32, i32, i32, i32 } %4624, 3, !dbg !192 + %4629 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4613) #3, !dbg !192 + %4630 = extractvalue { i32, i32, i32, i32 } %4629, 0, !dbg !192 + %4631 = extractvalue { i32, i32, i32, i32 } %4629, 1, !dbg !192 + %4632 = extractvalue { i32, i32, i32, i32 } %4629, 2, !dbg !192 + %4633 = extractvalue { i32, i32, i32, i32 } %4629, 3, !dbg !192 + %4634 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4614) #3, !dbg !192 + %4635 = extractvalue { i32, i32, i32, i32 } %4634, 0, !dbg !192 + %4636 = extractvalue { i32, i32, i32, i32 } %4634, 1, !dbg !192 + %4637 = extractvalue { i32, i32, i32, i32 } %4634, 2, !dbg !192 + %4638 = extractvalue { i32, i32, i32, i32 } %4634, 3, !dbg !192 + %4639 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4615) #3, !dbg !192 + %4640 = extractvalue { i32, i32, i32, i32 } %4639, 0, !dbg !192 + %4641 = extractvalue { i32, i32, i32, i32 } %4639, 1, !dbg !192 + %4642 = extractvalue { i32, i32, i32, i32 } %4639, 2, !dbg !192 + %4643 = extractvalue { i32, i32, i32, i32 } %4639, 3, !dbg !192 + %4644 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4616) #3, !dbg !192 + %4645 = extractvalue { i32, i32, i32, i32 } %4644, 0, !dbg !192 + %4646 = extractvalue { i32, i32, i32, i32 } %4644, 1, !dbg !192 + %4647 = extractvalue { i32, i32, i32, i32 } %4644, 2, !dbg !192 + %4648 = extractvalue { i32, i32, i32, i32 } %4644, 3, !dbg !192 + %4649 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4617) #3, !dbg !192 + %4650 = extractvalue { i32, i32, i32, i32 } %4649, 0, !dbg !192 + %4651 = extractvalue { i32, i32, i32, i32 } %4649, 1, !dbg !192 + %4652 = extractvalue { i32, i32, i32, i32 } %4649, 2, !dbg !192 + %4653 = extractvalue { i32, i32, i32, i32 } %4649, 3, !dbg !192 + %4654 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4618) #3, !dbg !192 + %4655 = extractvalue { i32, i32, i32, i32 } %4654, 0, !dbg !192 + %4656 = extractvalue { i32, i32, i32, i32 } %4654, 1, !dbg !192 + %4657 = extractvalue { i32, i32, i32, i32 } %4654, 2, !dbg !192 + %4658 = extractvalue { i32, i32, i32, i32 } %4654, 3, !dbg !192 + %4659 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4555, !dbg !192 + %4660 = insertelement <4 x i32> poison, i32 %4620, i64 0, !dbg !192 + %4661 = insertelement <4 x i32> %4660, i32 %4621, i64 1, !dbg !192 + %4662 = insertelement <4 x i32> %4661, i32 %4622, i64 2, !dbg !192 + %4663 = insertelement <4 x i32> %4662, i32 %4623, i64 3, !dbg !192 + store <4 x i32> %4663, ptr addrspace(3) %4659, align 16, !dbg !192 + %4664 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4561, !dbg !192 + %4665 = insertelement <4 x i32> poison, i32 %4625, i64 0, !dbg !192 + %4666 = insertelement <4 x i32> %4665, i32 %4626, i64 1, !dbg !192 + %4667 = insertelement <4 x i32> %4666, i32 %4627, i64 2, !dbg !192 + %4668 = insertelement <4 x i32> %4667, i32 %4628, i64 3, !dbg !192 + store <4 x i32> %4668, ptr addrspace(3) %4664, align 16, !dbg !192 + %4669 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4567, !dbg !192 + %4670 = insertelement <4 x i32> poison, i32 %4630, i64 0, !dbg !192 + %4671 = insertelement <4 x i32> %4670, i32 %4631, i64 1, !dbg !192 + %4672 = insertelement <4 x i32> %4671, i32 %4632, i64 2, !dbg !192 + %4673 = insertelement <4 x i32> %4672, i32 %4633, i64 3, !dbg !192 + store <4 x i32> %4673, ptr addrspace(3) %4669, align 16, !dbg !192 + %4674 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4573, !dbg !192 + %4675 = insertelement <4 x i32> poison, i32 %4635, i64 0, !dbg !192 + %4676 = insertelement <4 x i32> %4675, i32 %4636, i64 1, !dbg !192 + %4677 = insertelement <4 x i32> %4676, i32 %4637, i64 2, !dbg !192 + %4678 = insertelement <4 x i32> %4677, i32 %4638, i64 3, !dbg !192 + store <4 x i32> %4678, ptr addrspace(3) %4674, align 16, !dbg !192 + %4679 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4579, !dbg !192 + %4680 = insertelement <4 x i32> poison, i32 %4640, i64 0, !dbg !192 + %4681 = insertelement <4 x i32> %4680, i32 %4641, i64 1, !dbg !192 + %4682 = insertelement <4 x i32> %4681, i32 %4642, i64 2, !dbg !192 + %4683 = insertelement <4 x i32> %4682, i32 %4643, i64 3, !dbg !192 + store <4 x i32> %4683, ptr addrspace(3) %4679, align 16, !dbg !192 + %4684 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4585, !dbg !192 + %4685 = insertelement <4 x i32> poison, i32 %4645, i64 0, !dbg !192 + %4686 = insertelement <4 x i32> %4685, i32 %4646, i64 1, !dbg !192 + %4687 = insertelement <4 x i32> %4686, i32 %4647, i64 2, !dbg !192 + %4688 = insertelement <4 x i32> %4687, i32 %4648, i64 3, !dbg !192 + store <4 x i32> %4688, ptr addrspace(3) %4684, align 16, !dbg !192 + %4689 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4591, !dbg !192 + %4690 = insertelement <4 x i32> poison, i32 %4650, i64 0, !dbg !192 + %4691 = insertelement <4 x i32> %4690, i32 %4651, i64 1, !dbg !192 + %4692 = insertelement <4 x i32> %4691, i32 %4652, i64 2, !dbg !192 + %4693 = insertelement <4 x i32> %4692, i32 %4653, i64 3, !dbg !192 + store <4 x i32> %4693, ptr addrspace(3) %4689, align 16, !dbg !192 + %4694 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4597, !dbg !192 + %4695 = insertelement <4 x i32> poison, i32 %4655, i64 0, !dbg !192 + %4696 = insertelement <4 x i32> %4695, i32 %4656, i64 1, !dbg !192 + %4697 = insertelement <4 x i32> %4696, i32 %4657, i64 2, !dbg !192 + %4698 = insertelement <4 x i32> %4697, i32 %4658, i64 3, !dbg !192 + store <4 x i32> %4698, ptr addrspace(3) %4694, align 16, !dbg !192 + %4699 = shl nuw nsw i32 %23, 2, !dbg !193 + %4700 = shl i32 %22, 23, !dbg !194 + %4701 = shl nuw nsw i32 %24, 4, !dbg !195 + %4702 = or disjoint i32 %4701, %21, !dbg !196 + %4703 = shl nuw nsw i32 %24, 8, !dbg !197 + %4704 = shl nuw nsw i32 %21, 4, !dbg !198 + %4705 = or disjoint i32 %4703, %4704, !dbg !199 + %4706 = zext nneg i32 %4705 to i64, !dbg !200 + %4707 = getelementptr i32, ptr addrspace(1) %11, i64 %4706, !dbg !200 + %4708 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4707, i1 true) #3, !dbg !201 + %4709 = shl i32 %4708, 7, !dbg !202 + %4710 = zext nneg i32 %4702 to i64, !dbg !203 + %4711 = getelementptr i32, ptr addrspace(1) %10, i64 %4710, !dbg !203 + %4712 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4711, i1 true) #3, !dbg !204 + %4713 = and i32 %35, 3, !dbg !205 + %4714 = shl nuw nsw i32 %4713, 1, !dbg !205 + %4715 = or disjoint i32 %4714, 8, !dbg !205 + %4716 = or disjoint i32 %4714, 16, !dbg !205 + %4717 = or disjoint i32 %4714, 24, !dbg !205 + %4718 = or disjoint i32 %4714, 32, !dbg !205 + %4719 = or disjoint i32 %4714, 40, !dbg !205 + %4720 = or disjoint i32 %4714, 48, !dbg !205 + %4721 = or disjoint i32 %4714, 56, !dbg !205 + %4722 = or disjoint i32 %4709, %38, !dbg !206 + %4723 = or disjoint i32 %4709, %39, !dbg !206 + %4724 = or disjoint i32 %4709, %40, !dbg !206 + %4725 = or disjoint i32 %4709, %41, !dbg !206 + %4726 = or disjoint i32 %4709, %4714, !dbg !206 + %4727 = or disjoint i32 %4726, 1, !dbg !206 + %4728 = or disjoint i32 %4709, %4715, !dbg !206 + %4729 = or disjoint i32 %4726, 9, !dbg !206 + %4730 = or disjoint i32 %4709, %4716, !dbg !206 + %4731 = or disjoint i32 %4726, 17, !dbg !206 + %4732 = or disjoint i32 %4709, %4717, !dbg !206 + %4733 = or disjoint i32 %4726, 25, !dbg !206 + %4734 = or disjoint i32 %4709, %4718, !dbg !206 + %4735 = or disjoint i32 %4726, 33, !dbg !206 + %4736 = or disjoint i32 %4709, %4719, !dbg !206 + %4737 = or disjoint i32 %4726, 41, !dbg !206 + %4738 = or disjoint i32 %4709, %4720, !dbg !206 + %4739 = or disjoint i32 %4726, 49, !dbg !206 + %4740 = or disjoint i32 %4709, %4721, !dbg !206 + %4741 = or disjoint i32 %4726, 57, !dbg !206 + %4742 = shl i32 %4722, 12, !dbg !207 + %4743 = shl i32 %4723, 12, !dbg !207 + %4744 = shl i32 %4724, 12, !dbg !207 + %4745 = shl i32 %4725, 12, !dbg !207 + %4746 = shl i32 %4722, 7, !dbg !209 + %4747 = shl i32 %4723, 7, !dbg !209 + %4748 = shl i32 %4724, 7, !dbg !209 + %4749 = shl i32 %4725, 7, !dbg !209 + %4750 = shl i32 %4712, 1, !dbg !210 + %4751 = tail call i32 @llvm.smin.i32(i32 %4750, i32 32), !dbg !211 + %4752 = zext nneg i32 %22 to i64, !dbg !212 + %4753 = getelementptr i64, ptr addrspace(1) %16, i64 %4752, !dbg !212 + %4754 = icmp sgt i32 %4750, 0, !dbg !213 + %4755 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %4753, i1 %4754) #3, !dbg !214 + %4756 = getelementptr i32, ptr addrspace(1) %15, i64 %4706, !dbg !215 + %4757 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4756, i1 true) #3, !dbg !216 + %4758 = shl i32 %4757, 7, !dbg !217 + %4759 = getelementptr i32, ptr addrspace(1) %14, i64 %4710, !dbg !218 + %4760 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4759, i1 true) #3, !dbg !219 + %4761 = or disjoint i32 %4758, %38, !dbg !220 + %4762 = or disjoint i32 %4758, %39, !dbg !220 + %4763 = or disjoint i32 %4758, %40, !dbg !220 + %4764 = or disjoint i32 %4758, %41, !dbg !220 + %4765 = or disjoint i32 %4758, %4714, !dbg !220 + %4766 = or disjoint i32 %4758, %4715, !dbg !220 + %4767 = or disjoint i32 %4758, %4716, !dbg !220 + %4768 = or disjoint i32 %4758, %4717, !dbg !220 + %4769 = or disjoint i32 %4758, %4718, !dbg !220 + %4770 = or disjoint i32 %4758, %4719, !dbg !220 + %4771 = or disjoint i32 %4758, %4720, !dbg !220 + %4772 = or disjoint i32 %4758, %4721, !dbg !220 + %4773 = shl i32 %4761, 12, !dbg !221 + %4774 = shl i32 %4762, 12, !dbg !221 + %4775 = shl i32 %4763, 12, !dbg !221 + %4776 = shl i32 %4764, 12, !dbg !221 + %4777 = shl i32 %4761, 7, !dbg !223 + %4778 = shl i32 %4762, 7, !dbg !223 + %4779 = shl i32 %4763, 7, !dbg !223 + %4780 = shl i32 %4764, 7, !dbg !223 + %4781 = shl i32 %4760, 1, !dbg !224 + %4782 = tail call i32 @llvm.smin.i32(i32 %4781, i32 32), !dbg !225 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !226 + %4783 = shl nuw i32 %22, 16 + %4784 = sext i32 %4742 to i64 + %4785 = sext i32 %4743 to i64 + %4786 = sext i32 %4744 to i64 + %4787 = sext i32 %4745 to i64 + %4788 = sext i32 %4746 to i64 + %4789 = sext i32 %4747 to i64 + %4790 = sext i32 %4748 to i64 + %4791 = sext i32 %4749 to i64 + %4792 = shl nuw nsw i32 %4551, 10 + %4793 = or disjoint i32 %4554, %4792 + %4794 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4793 + %4795 = select i1 %4754, i32 16, i32 0 + %4796 = or disjoint i32 %4793, 2048 + %4797 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4796 + %4798 = or disjoint i32 %4793, 4096 + %4799 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4798 + %4800 = or disjoint i32 %4793, 6144 + %4801 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4800 + %4802 = sext i32 %4726 to i64 + %4803 = sext i32 %4728 to i64 + %4804 = sext i32 %4730 to i64 + %4805 = sext i32 %4732 to i64 + %4806 = sext i32 %4734 to i64 + %4807 = sext i32 %4736 to i64 + %4808 = sext i32 %4738 to i64 + %4809 = sext i32 %4740 to i64 + %4810 = and i32 %35, 252 + %4811 = icmp eq i32 %4810, 0 + %4812 = shl nuw nsw i32 %4713, 3 + %4813 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4812 + %4814 = select i1 %4754, i32 8, i32 0 + %4815 = or disjoint i32 %4812, 32 + %4816 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4815 + %4817 = or disjoint i32 %4812, 64 + %4818 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4817 + %4819 = or disjoint i32 %4812, 96 + %4820 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4819 + %4821 = or disjoint i32 %4812, 128 + %4822 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4821 + %4823 = or disjoint i32 %4812, 160 + %4824 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4823 + %4825 = or disjoint i32 %4812, 192 + %4826 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4825 + %4827 = or disjoint i32 %4812, 224 + %4828 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4827 + %4829 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4793 + %4830 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4796 + %4831 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4798 + %4832 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4800 + %4833 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4812 + %4834 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4815 + %4835 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4817 + %4836 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4819 + %4837 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4821 + %4838 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4823 + %4839 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4825 + %4840 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4827 + %4841 = icmp sgt i32 %4750, 1 + %4842 = or disjoint i32 %4726, 64 + %4843 = or disjoint i32 %4728, 64 + %4844 = or disjoint i32 %4730, 64 + %4845 = or disjoint i32 %4732, 64 + %4846 = or disjoint i32 %4734, 64 + %4847 = or disjoint i32 %4736, 64 + %4848 = or disjoint i32 %4738, 64 + %4849 = or disjoint i32 %4740, 64 + %4850 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4793 + %4851 = select i1 %4841, i32 16, i32 0 + %4852 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4796 + %4853 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4798 + %4854 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4800 + %4855 = sext i32 %4842 to i64 + %4856 = sext i32 %4843 to i64 + %4857 = sext i32 %4844 to i64 + %4858 = sext i32 %4845 to i64 + %4859 = sext i32 %4846 to i64 + %4860 = sext i32 %4847 to i64 + %4861 = sext i32 %4848 to i64 + %4862 = sext i32 %4849 to i64 + %4863 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4812 + %4864 = select i1 %4841, i32 8, i32 0 + %4865 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4815 + %4866 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4817 + %4867 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4819 + %4868 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4821 + %4869 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4823 + %4870 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4825 + %4871 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4827 + %4872 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4793 + %4873 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4796 + %4874 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4798 + %4875 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4800 + %4876 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4812 + %4877 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4815 + %4878 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4817 + %4879 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4819 + %4880 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4821 + %4881 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4823 + %4882 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4825 + %4883 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4827 + %4884 = add i32 %4751, -2 + %4885 = add nsw i32 %4751, -1 + %4886 = sext i32 %4773 to i64 + %4887 = sext i32 %4774 to i64 + %4888 = sext i32 %4775 to i64 + %4889 = sext i32 %4776 to i64 + %4890 = sext i32 %4777 to i64 + %4891 = sext i32 %4778 to i64 + %4892 = sext i32 %4779 to i64 + %4893 = sext i32 %4780 to i64 + %4894 = icmp sgt i32 %4781, 0 + %4895 = select i1 %4894, i32 16, i32 0 + %4896 = sext i32 %4765 to i64 + %4897 = sext i32 %4766 to i64 + %4898 = sext i32 %4767 to i64 + %4899 = sext i32 %4768 to i64 + %4900 = sext i32 %4769 to i64 + %4901 = sext i32 %4770 to i64 + %4902 = sext i32 %4771 to i64 + %4903 = sext i32 %4772 to i64 + %4904 = select i1 %4894, i32 8, i32 0 + %4905 = icmp sgt i32 %4781, 1 + %4906 = or disjoint i32 %4765, 64 + %4907 = or disjoint i32 %4766, 64 + %4908 = or disjoint i32 %4767, 64 + %4909 = or disjoint i32 %4768, 64 + %4910 = or disjoint i32 %4769, 64 + %4911 = or disjoint i32 %4770, 64 + %4912 = or disjoint i32 %4771, 64 + %4913 = or disjoint i32 %4772, 64 + %4914 = select i1 %4905, i32 16, i32 0 + %4915 = sext i32 %4906 to i64 + %4916 = sext i32 %4907 to i64 + %4917 = sext i32 %4908 to i64 + %4918 = sext i32 %4909 to i64 + %4919 = sext i32 %4910 to i64 + %4920 = sext i32 %4911 to i64 + %4921 = sext i32 %4912 to i64 + %4922 = sext i32 %4913 to i64 + %4923 = select i1 %4905, i32 8, i32 0 + %4924 = add i32 %4782, -2 + %4925 = add nsw i32 %4782, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %4751, i32 1), !dbg !227 + %smax2121 = tail call i32 @llvm.smax.i32(i32 %4782, i32 1), !dbg !227 + %4926 = zext nneg i32 %4699 to i64, !dbg !227 + %4927 = insertelement <16 x i32> poison, i32 %4741, i64 0 + %4928 = insertelement <16 x i32> %4927, i32 %4740, i64 1 + %4929 = insertelement <16 x i32> %4928, i32 %4739, i64 2 + %4930 = insertelement <16 x i32> %4929, i32 %4738, i64 3 + %4931 = insertelement <16 x i32> %4930, i32 %4737, i64 4 + %4932 = insertelement <16 x i32> %4931, i32 %4736, i64 5 + %4933 = insertelement <16 x i32> %4932, i32 %4735, i64 6 + %4934 = insertelement <16 x i32> %4933, i32 %4734, i64 7 + %4935 = insertelement <16 x i32> %4934, i32 %4733, i64 8 + %4936 = insertelement <16 x i32> %4935, i32 %4732, i64 9 + %4937 = insertelement <16 x i32> %4936, i32 %4731, i64 10 + %4938 = insertelement <16 x i32> %4937, i32 %4730, i64 11 + %4939 = insertelement <16 x i32> %4938, i32 %4729, i64 12 + %4940 = insertelement <16 x i32> %4939, i32 %4728, i64 13 + %4941 = insertelement <16 x i32> %4940, i32 %4727, i64 14 + %4942 = insertelement <16 x i32> %4941, i32 %4726, i64 15 + %4943 = insertelement <16 x i64> poison, i64 %4755, i64 0 + %4944 = shufflevector <16 x i64> %4943, <16 x i64> poison, <16 x i32> zeroinitializer + br label %4945, !dbg !227 + +4945: ; preds = %4460, %._crit_edge1692 + %indvars.iv = phi i64 [ 0, %4460 ], [ %indvars.iv.next, %._crit_edge1692 ] + %4946 = phi float [ 0.000000e+00, %4460 ], [ %9744, %._crit_edge1692 ] + %4947 = phi float [ 0.000000e+00, %4460 ], [ %9745, %._crit_edge1692 ] + %4948 = phi float [ 0.000000e+00, %4460 ], [ %9746, %._crit_edge1692 ] + %4949 = phi float [ 0.000000e+00, %4460 ], [ %9747, %._crit_edge1692 ] + %4950 = phi float [ 0.000000e+00, %4460 ], [ %9748, %._crit_edge1692 ] + %4951 = phi float [ 0.000000e+00, %4460 ], [ %9749, %._crit_edge1692 ] + %4952 = phi float [ 0.000000e+00, %4460 ], [ %9750, %._crit_edge1692 ] + %4953 = phi float [ 0.000000e+00, %4460 ], [ %9751, %._crit_edge1692 ] + %4954 = phi float [ 0.000000e+00, %4460 ], [ %9752, %._crit_edge1692 ] + %4955 = phi float [ 0.000000e+00, %4460 ], [ %9753, %._crit_edge1692 ] + %4956 = phi float [ 0.000000e+00, %4460 ], [ %9754, %._crit_edge1692 ] + %4957 = phi float [ 0.000000e+00, %4460 ], [ %9755, %._crit_edge1692 ] + %4958 = phi float [ 0.000000e+00, %4460 ], [ %9756, %._crit_edge1692 ] + %4959 = phi float [ 0.000000e+00, %4460 ], [ %9757, %._crit_edge1692 ] + %4960 = phi float [ 0.000000e+00, %4460 ], [ %9758, %._crit_edge1692 ] + %4961 = phi float [ 0.000000e+00, %4460 ], [ %9759, %._crit_edge1692 ] + %4962 = phi float [ 0.000000e+00, %4460 ], [ %9760, %._crit_edge1692 ] + %4963 = phi float [ 0.000000e+00, %4460 ], [ %9761, %._crit_edge1692 ] + %4964 = phi float [ 0.000000e+00, %4460 ], [ %9762, %._crit_edge1692 ] + %4965 = phi float [ 0.000000e+00, %4460 ], [ %9763, %._crit_edge1692 ] + %4966 = phi float [ 0.000000e+00, %4460 ], [ %9764, %._crit_edge1692 ] + %4967 = phi float [ 0.000000e+00, %4460 ], [ %9765, %._crit_edge1692 ] + %4968 = phi float [ 0.000000e+00, %4460 ], [ %9766, %._crit_edge1692 ] + %4969 = phi float [ 0.000000e+00, %4460 ], [ %9767, %._crit_edge1692 ] + %4970 = phi float [ 0.000000e+00, %4460 ], [ %9768, %._crit_edge1692 ] + %4971 = phi float [ 0.000000e+00, %4460 ], [ %9769, %._crit_edge1692 ] + %4972 = phi float [ 0.000000e+00, %4460 ], [ %9770, %._crit_edge1692 ] + %4973 = phi float [ 0.000000e+00, %4460 ], [ %9771, %._crit_edge1692 ] + %4974 = phi float [ 0.000000e+00, %4460 ], [ %9772, %._crit_edge1692 ] + %4975 = phi float [ 0.000000e+00, %4460 ], [ %9773, %._crit_edge1692 ] + %4976 = phi float [ 0.000000e+00, %4460 ], [ %9774, %._crit_edge1692 ] + %4977 = phi float [ 0.000000e+00, %4460 ], [ %9775, %._crit_edge1692 ] + %4978 = phi float [ 0.000000e+00, %4460 ], [ %9776, %._crit_edge1692 ] + %4979 = phi float [ 0.000000e+00, %4460 ], [ %9777, %._crit_edge1692 ] + %4980 = phi float [ 0.000000e+00, %4460 ], [ %9778, %._crit_edge1692 ] + %4981 = phi float [ 0.000000e+00, %4460 ], [ %9779, %._crit_edge1692 ] + %4982 = phi float [ 0.000000e+00, %4460 ], [ %9780, %._crit_edge1692 ] + %4983 = phi float [ 0.000000e+00, %4460 ], [ %9781, %._crit_edge1692 ] + %4984 = phi float [ 0.000000e+00, %4460 ], [ %9782, %._crit_edge1692 ] + %4985 = phi float [ 0.000000e+00, %4460 ], [ %9783, %._crit_edge1692 ] + %4986 = phi float [ 0.000000e+00, %4460 ], [ %9784, %._crit_edge1692 ] + %4987 = phi float [ 0.000000e+00, %4460 ], [ %9785, %._crit_edge1692 ] + %4988 = phi float [ 0.000000e+00, %4460 ], [ %9786, %._crit_edge1692 ] + %4989 = phi float [ 0.000000e+00, %4460 ], [ %9787, %._crit_edge1692 ] + %4990 = phi float [ 0.000000e+00, %4460 ], [ %9788, %._crit_edge1692 ] + %4991 = phi float [ 0.000000e+00, %4460 ], [ %9789, %._crit_edge1692 ] + %4992 = phi float [ 0.000000e+00, %4460 ], [ %9790, %._crit_edge1692 ] + %4993 = phi float [ 0.000000e+00, %4460 ], [ %9791, %._crit_edge1692 ] + %4994 = phi float [ 0.000000e+00, %4460 ], [ %9792, %._crit_edge1692 ] + %4995 = phi float [ 0.000000e+00, %4460 ], [ %9793, %._crit_edge1692 ] + %4996 = phi float [ 0.000000e+00, %4460 ], [ %9794, %._crit_edge1692 ] + %4997 = phi float [ 0.000000e+00, %4460 ], [ %9795, %._crit_edge1692 ] + %4998 = phi float [ 0.000000e+00, %4460 ], [ %9796, %._crit_edge1692 ] + %4999 = phi float [ 0.000000e+00, %4460 ], [ %9797, %._crit_edge1692 ] + %5000 = phi float [ 0.000000e+00, %4460 ], [ %9798, %._crit_edge1692 ] + %5001 = phi float [ 0.000000e+00, %4460 ], [ %9799, %._crit_edge1692 ] + %5002 = phi float [ 0.000000e+00, %4460 ], [ %9800, %._crit_edge1692 ] + %5003 = phi float [ 0.000000e+00, %4460 ], [ %9801, %._crit_edge1692 ] + %5004 = phi float [ 0.000000e+00, %4460 ], [ %9802, %._crit_edge1692 ] + %5005 = phi float [ 0.000000e+00, %4460 ], [ %9803, %._crit_edge1692 ] + %5006 = phi float [ 0.000000e+00, %4460 ], [ %9804, %._crit_edge1692 ] + %5007 = phi float [ 0.000000e+00, %4460 ], [ %9805, %._crit_edge1692 ] + %5008 = phi float [ 0.000000e+00, %4460 ], [ %9806, %._crit_edge1692 ] + %5009 = phi float [ 0.000000e+00, %4460 ], [ %9807, %._crit_edge1692 ] + %5010 = phi float [ 0.000000e+00, %4460 ], [ %9680, %._crit_edge1692 ] + %5011 = phi float [ 0.000000e+00, %4460 ], [ %9681, %._crit_edge1692 ] + %5012 = phi float [ 0.000000e+00, %4460 ], [ %9682, %._crit_edge1692 ] + %5013 = phi float [ 0.000000e+00, %4460 ], [ %9683, %._crit_edge1692 ] + %5014 = phi float [ 0.000000e+00, %4460 ], [ %9684, %._crit_edge1692 ] + %5015 = phi float [ 0.000000e+00, %4460 ], [ %9685, %._crit_edge1692 ] + %5016 = phi float [ 0.000000e+00, %4460 ], [ %9686, %._crit_edge1692 ] + %5017 = phi float [ 0.000000e+00, %4460 ], [ %9687, %._crit_edge1692 ] + %5018 = phi float [ 0.000000e+00, %4460 ], [ %9688, %._crit_edge1692 ] + %5019 = phi float [ 0.000000e+00, %4460 ], [ %9689, %._crit_edge1692 ] + %5020 = phi float [ 0.000000e+00, %4460 ], [ %9690, %._crit_edge1692 ] + %5021 = phi float [ 0.000000e+00, %4460 ], [ %9691, %._crit_edge1692 ] + %5022 = phi float [ 0.000000e+00, %4460 ], [ %9692, %._crit_edge1692 ] + %5023 = phi float [ 0.000000e+00, %4460 ], [ %9693, %._crit_edge1692 ] + %5024 = phi float [ 0.000000e+00, %4460 ], [ %9694, %._crit_edge1692 ] + %5025 = phi float [ 0.000000e+00, %4460 ], [ %9695, %._crit_edge1692 ] + %5026 = phi float [ 0.000000e+00, %4460 ], [ %9696, %._crit_edge1692 ] + %5027 = phi float [ 0.000000e+00, %4460 ], [ %9697, %._crit_edge1692 ] + %5028 = phi float [ 0.000000e+00, %4460 ], [ %9698, %._crit_edge1692 ] + %5029 = phi float [ 0.000000e+00, %4460 ], [ %9699, %._crit_edge1692 ] + %5030 = phi float [ 0.000000e+00, %4460 ], [ %9700, %._crit_edge1692 ] + %5031 = phi float [ 0.000000e+00, %4460 ], [ %9701, %._crit_edge1692 ] + %5032 = phi float [ 0.000000e+00, %4460 ], [ %9702, %._crit_edge1692 ] + %5033 = phi float [ 0.000000e+00, %4460 ], [ %9703, %._crit_edge1692 ] + %5034 = phi float [ 0.000000e+00, %4460 ], [ %9704, %._crit_edge1692 ] + %5035 = phi float [ 0.000000e+00, %4460 ], [ %9705, %._crit_edge1692 ] + %5036 = phi float [ 0.000000e+00, %4460 ], [ %9706, %._crit_edge1692 ] + %5037 = phi float [ 0.000000e+00, %4460 ], [ %9707, %._crit_edge1692 ] + %5038 = phi float [ 0.000000e+00, %4460 ], [ %9708, %._crit_edge1692 ] + %5039 = phi float [ 0.000000e+00, %4460 ], [ %9709, %._crit_edge1692 ] + %5040 = phi float [ 0.000000e+00, %4460 ], [ %9710, %._crit_edge1692 ] + %5041 = phi float [ 0.000000e+00, %4460 ], [ %9711, %._crit_edge1692 ] + %5042 = phi float [ 0.000000e+00, %4460 ], [ %9712, %._crit_edge1692 ] + %5043 = phi float [ 0.000000e+00, %4460 ], [ %9713, %._crit_edge1692 ] + %5044 = phi float [ 0.000000e+00, %4460 ], [ %9714, %._crit_edge1692 ] + %5045 = phi float [ 0.000000e+00, %4460 ], [ %9715, %._crit_edge1692 ] + %5046 = phi float [ 0.000000e+00, %4460 ], [ %9716, %._crit_edge1692 ] + %5047 = phi float [ 0.000000e+00, %4460 ], [ %9717, %._crit_edge1692 ] + %5048 = phi float [ 0.000000e+00, %4460 ], [ %9718, %._crit_edge1692 ] + %5049 = phi float [ 0.000000e+00, %4460 ], [ %9719, %._crit_edge1692 ] + %5050 = phi float [ 0.000000e+00, %4460 ], [ %9720, %._crit_edge1692 ] + %5051 = phi float [ 0.000000e+00, %4460 ], [ %9721, %._crit_edge1692 ] + %5052 = phi float [ 0.000000e+00, %4460 ], [ %9722, %._crit_edge1692 ] + %5053 = phi float [ 0.000000e+00, %4460 ], [ %9723, %._crit_edge1692 ] + %5054 = phi float [ 0.000000e+00, %4460 ], [ %9724, %._crit_edge1692 ] + %5055 = phi float [ 0.000000e+00, %4460 ], [ %9725, %._crit_edge1692 ] + %5056 = phi float [ 0.000000e+00, %4460 ], [ %9726, %._crit_edge1692 ] + %5057 = phi float [ 0.000000e+00, %4460 ], [ %9727, %._crit_edge1692 ] + %5058 = phi float [ 0.000000e+00, %4460 ], [ %9728, %._crit_edge1692 ] + %5059 = phi float [ 0.000000e+00, %4460 ], [ %9729, %._crit_edge1692 ] + %5060 = phi float [ 0.000000e+00, %4460 ], [ %9730, %._crit_edge1692 ] + %5061 = phi float [ 0.000000e+00, %4460 ], [ %9731, %._crit_edge1692 ] + %5062 = phi float [ 0.000000e+00, %4460 ], [ %9732, %._crit_edge1692 ] + %5063 = phi float [ 0.000000e+00, %4460 ], [ %9733, %._crit_edge1692 ] + %5064 = phi float [ 0.000000e+00, %4460 ], [ %9734, %._crit_edge1692 ] + %5065 = phi float [ 0.000000e+00, %4460 ], [ %9735, %._crit_edge1692 ] + %5066 = phi float [ 0.000000e+00, %4460 ], [ %9736, %._crit_edge1692 ] + %5067 = phi float [ 0.000000e+00, %4460 ], [ %9737, %._crit_edge1692 ] + %5068 = phi float [ 0.000000e+00, %4460 ], [ %9738, %._crit_edge1692 ] + %5069 = phi float [ 0.000000e+00, %4460 ], [ %9739, %._crit_edge1692 ] + %5070 = phi float [ 0.000000e+00, %4460 ], [ %9740, %._crit_edge1692 ] + %5071 = phi float [ 0.000000e+00, %4460 ], [ %9741, %._crit_edge1692 ] + %5072 = phi float [ 0.000000e+00, %4460 ], [ %9742, %._crit_edge1692 ] + %5073 = phi float [ 0.000000e+00, %4460 ], [ %9743, %._crit_edge1692 ] + %5074 = add nuw nsw i64 %indvars.iv, %4926, !dbg !228 + %.tr = trunc i64 %5074 to i32, !dbg !229 + %5075 = shl i32 %.tr, 7, !dbg !229 + %5076 = add i32 %5075, %4700, !dbg !229 + %5077 = sext i32 %5076 to i64, !dbg !230 + %5078 = trunc nuw nsw i64 %5074 to i32, !dbg !231 + %5079 = shl i32 %5078, 18, !dbg !231 + %5080 = add i32 %5079, %4700, !dbg !232 + %5081 = sext i32 %5080 to i64, !dbg !233 + %.tr2128 = trunc i64 %5074 to i32, !dbg !234 + %5082 = shl i32 %.tr2128, 11, !dbg !234 + %5083 = add i32 %5082, %4783, !dbg !234 + %5084 = sext i32 %5083 to i64, !dbg !235 + %5085 = getelementptr bfloat, ptr addrspace(1) %0, i64 %5077, !dbg !236 + %5086 = getelementptr bfloat, ptr addrspace(1) %5, i64 %5081, !dbg !237 + %5087 = getelementptr float, ptr addrspace(1) %3, i64 %5084, !dbg !238 + %5088 = getelementptr float, ptr addrspace(1) %4, i64 %5084, !dbg !239 + %5089 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4784, !dbg !240 + %5090 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4785, !dbg !240 + %5091 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4786, !dbg !240 + %5092 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4787, !dbg !240 + %5093 = getelementptr bfloat, ptr addrspace(1) %5089, i64 %4498, !dbg !241 + %5094 = getelementptr bfloat, ptr addrspace(1) %5090, i64 %4498, !dbg !241 + %5095 = getelementptr bfloat, ptr addrspace(1) %5091, i64 %4498, !dbg !241 + %5096 = getelementptr bfloat, ptr addrspace(1) %5092, i64 %4498, !dbg !241 + %5097 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4788, !dbg !242 + %5098 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4789, !dbg !242 + %5099 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4790, !dbg !242 + %5100 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4791, !dbg !242 + %5101 = getelementptr bfloat, ptr addrspace(1) %5097, i64 %4498, !dbg !243 + %5102 = getelementptr bfloat, ptr addrspace(1) %5098, i64 %4498, !dbg !243 + %5103 = getelementptr bfloat, ptr addrspace(1) %5099, i64 %4498, !dbg !243 + %5104 = getelementptr bfloat, ptr addrspace(1) %5100, i64 %4498, !dbg !243 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4794, ptr addrspace(1) %5093, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4797, ptr addrspace(1) %5094, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4799, ptr addrspace(1) %5095, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4801, ptr addrspace(1) %5096, i32 %4795) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5105 = getelementptr float, ptr addrspace(1) %5087, i64 %4802, !dbg !245 + %5106 = getelementptr float, ptr addrspace(1) %5087, i64 %4803, !dbg !245 + %5107 = getelementptr float, ptr addrspace(1) %5087, i64 %4804, !dbg !245 + %5108 = getelementptr float, ptr addrspace(1) %5087, i64 %4805, !dbg !245 + %5109 = getelementptr float, ptr addrspace(1) %5087, i64 %4806, !dbg !245 + %5110 = getelementptr float, ptr addrspace(1) %5087, i64 %4807, !dbg !245 + %5111 = getelementptr float, ptr addrspace(1) %5087, i64 %4808, !dbg !245 + %5112 = getelementptr float, ptr addrspace(1) %5087, i64 %4809, !dbg !245 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4813, ptr addrspace(1) %5105, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4816, ptr addrspace(1) %5106, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4818, ptr addrspace(1) %5107, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4820, ptr addrspace(1) %5108, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4822, ptr addrspace(1) %5109, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4824, ptr addrspace(1) %5110, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4826, ptr addrspace(1) %5111, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4828, ptr addrspace(1) %5112, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4829, ptr addrspace(1) %5101, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4830, ptr addrspace(1) %5102, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4831, ptr addrspace(1) %5103, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4832, ptr addrspace(1) %5104, i32 %4795) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5113 = getelementptr float, ptr addrspace(1) %5088, i64 %4802, !dbg !247 + %5114 = getelementptr float, ptr addrspace(1) %5088, i64 %4803, !dbg !247 + %5115 = getelementptr float, ptr addrspace(1) %5088, i64 %4804, !dbg !247 + %5116 = getelementptr float, ptr addrspace(1) %5088, i64 %4805, !dbg !247 + %5117 = getelementptr float, ptr addrspace(1) %5088, i64 %4806, !dbg !247 + %5118 = getelementptr float, ptr addrspace(1) %5088, i64 %4807, !dbg !247 + %5119 = getelementptr float, ptr addrspace(1) %5088, i64 %4808, !dbg !247 + %5120 = getelementptr float, ptr addrspace(1) %5088, i64 %4809, !dbg !247 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4833, ptr addrspace(1) %5113, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4834, ptr addrspace(1) %5114, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4835, ptr addrspace(1) %5115, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4836, ptr addrspace(1) %5116, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4837, ptr addrspace(1) %5117, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4838, ptr addrspace(1) %5118, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4839, ptr addrspace(1) %5119, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4840, ptr addrspace(1) %5120, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + %5121 = getelementptr i8, ptr addrspace(1) %5093, i64 524288, !dbg !249 + %5122 = getelementptr i8, ptr addrspace(1) %5094, i64 524288, !dbg !249 + %5123 = getelementptr i8, ptr addrspace(1) %5095, i64 524288, !dbg !249 + %5124 = getelementptr i8, ptr addrspace(1) %5096, i64 524288, !dbg !249 + %5125 = getelementptr i8, ptr addrspace(1) %5101, i64 16384, !dbg !250 + %5126 = getelementptr i8, ptr addrspace(1) %5102, i64 16384, !dbg !250 + %5127 = getelementptr i8, ptr addrspace(1) %5103, i64 16384, !dbg !250 + %5128 = getelementptr i8, ptr addrspace(1) %5104, i64 16384, !dbg !250 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4850, ptr addrspace(1) %5121, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4852, ptr addrspace(1) %5122, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4853, ptr addrspace(1) %5123, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4854, ptr addrspace(1) %5124, i32 %4851) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5129 = getelementptr float, ptr addrspace(1) %5087, i64 %4855, !dbg !245 + %5130 = getelementptr float, ptr addrspace(1) %5087, i64 %4856, !dbg !245 + %5131 = getelementptr float, ptr addrspace(1) %5087, i64 %4857, !dbg !245 + %5132 = getelementptr float, ptr addrspace(1) %5087, i64 %4858, !dbg !245 + %5133 = getelementptr float, ptr addrspace(1) %5087, i64 %4859, !dbg !245 + %5134 = getelementptr float, ptr addrspace(1) %5087, i64 %4860, !dbg !245 + %5135 = getelementptr float, ptr addrspace(1) %5087, i64 %4861, !dbg !245 + %5136 = getelementptr float, ptr addrspace(1) %5087, i64 %4862, !dbg !245 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4863, ptr addrspace(1) %5129, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4865, ptr addrspace(1) %5130, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4866, ptr addrspace(1) %5131, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4867, ptr addrspace(1) %5132, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4868, ptr addrspace(1) %5133, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4869, ptr addrspace(1) %5134, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4870, ptr addrspace(1) %5135, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4871, ptr addrspace(1) %5136, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4872, ptr addrspace(1) %5125, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4873, ptr addrspace(1) %5126, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4874, ptr addrspace(1) %5127, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4875, ptr addrspace(1) %5128, i32 %4851) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5137 = getelementptr float, ptr addrspace(1) %5088, i64 %4855, !dbg !247 + %5138 = getelementptr float, ptr addrspace(1) %5088, i64 %4856, !dbg !247 + %5139 = getelementptr float, ptr addrspace(1) %5088, i64 %4857, !dbg !247 + %5140 = getelementptr float, ptr addrspace(1) %5088, i64 %4858, !dbg !247 + %5141 = getelementptr float, ptr addrspace(1) %5088, i64 %4859, !dbg !247 + %5142 = getelementptr float, ptr addrspace(1) %5088, i64 %4860, !dbg !247 + %5143 = getelementptr float, ptr addrspace(1) %5088, i64 %4861, !dbg !247 + %5144 = getelementptr float, ptr addrspace(1) %5088, i64 %4862, !dbg !247 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4876, ptr addrspace(1) %5137, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4877, ptr addrspace(1) %5138, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4878, ptr addrspace(1) %5139, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4879, ptr addrspace(1) %5140, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4880, ptr addrspace(1) %5141, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4881, ptr addrspace(1) %5142, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4882, ptr addrspace(1) %5143, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4883, ptr addrspace(1) %5144, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + br i1 %4754, label %.lr.ph, label %._crit_edge, !dbg !213 + +.lr.ph: ; preds = %4945, %__nv_exp2f.exit1315 + %5145 = phi i32 [ %7352, %__nv_exp2f.exit1315 ], [ 64, %4945 ] + %5146 = phi i32 [ %5284, %__nv_exp2f.exit1315 ], [ -1, %4945 ] + %5147 = phi i32 [ %7375, %__nv_exp2f.exit1315 ], [ 1, %4945 ] + %5148 = phi i32 [ %5287, %__nv_exp2f.exit1315 ], [ -1, %4945 ] + %5149 = phi i32 [ %7378, %__nv_exp2f.exit1315 ], [ 1, %4945 ] + %.pn1881528 = phi i32 [ %7372, %__nv_exp2f.exit1315 ], [ %4849, %4945 ] + %.pn1921527 = phi i32 [ %7371, %__nv_exp2f.exit1315 ], [ %4848, %4945 ] + %.pn1961526 = phi i32 [ %7370, %__nv_exp2f.exit1315 ], [ %4847, %4945 ] + %.pn2001525 = phi i32 [ %7369, %__nv_exp2f.exit1315 ], [ %4846, %4945 ] + %.pn2041524 = phi i32 [ %7368, %__nv_exp2f.exit1315 ], [ %4845, %4945 ] + %.pn2081523 = phi i32 [ %7367, %__nv_exp2f.exit1315 ], [ %4844, %4945 ] + %.pn2121522 = phi i32 [ %7366, %__nv_exp2f.exit1315 ], [ %4843, %4945 ] + %.pn2161521 = phi i32 [ %7365, %__nv_exp2f.exit1315 ], [ %4842, %4945 ] + %.pn1361520 = phi ptr addrspace(1) [ %7364, %__nv_exp2f.exit1315 ], [ %5128, %4945 ] + %.pn1521519 = phi ptr addrspace(1) [ %7363, %__nv_exp2f.exit1315 ], [ %5127, %4945 ] + %.pn1681518 = phi ptr addrspace(1) [ %7362, %__nv_exp2f.exit1315 ], [ %5126, %4945 ] + %.pn1841517 = phi ptr addrspace(1) [ %7361, %__nv_exp2f.exit1315 ], [ %5125, %4945 ] + %.pn721516 = phi ptr addrspace(1) [ %7358, %__nv_exp2f.exit1315 ], [ %5124, %4945 ] + %.pn881515 = phi ptr addrspace(1) [ %7357, %__nv_exp2f.exit1315 ], [ %5123, %4945 ] + %.pn1041514 = phi ptr addrspace(1) [ %7356, %__nv_exp2f.exit1315 ], [ %5122, %4945 ] + %.pn1201513 = phi ptr addrspace(1) [ %7355, %__nv_exp2f.exit1315 ], [ %5121, %4945 ] + %5150 = phi float [ %6410, %__nv_exp2f.exit1315 ], [ %5010, %4945 ] + %5151 = phi float [ %6411, %__nv_exp2f.exit1315 ], [ %5011, %4945 ] + %5152 = phi float [ %6412, %__nv_exp2f.exit1315 ], [ %5012, %4945 ] + %5153 = phi float [ %6413, %__nv_exp2f.exit1315 ], [ %5013, %4945 ] + %5154 = phi float [ %6414, %__nv_exp2f.exit1315 ], [ %5014, %4945 ] + %5155 = phi float [ %6415, %__nv_exp2f.exit1315 ], [ %5015, %4945 ] + %5156 = phi float [ %6416, %__nv_exp2f.exit1315 ], [ %5016, %4945 ] + %5157 = phi float [ %6417, %__nv_exp2f.exit1315 ], [ %5017, %4945 ] + %5158 = phi float [ %6418, %__nv_exp2f.exit1315 ], [ %5018, %4945 ] + %5159 = phi float [ %6419, %__nv_exp2f.exit1315 ], [ %5019, %4945 ] + %5160 = phi float [ %6420, %__nv_exp2f.exit1315 ], [ %5020, %4945 ] + %5161 = phi float [ %6421, %__nv_exp2f.exit1315 ], [ %5021, %4945 ] + %5162 = phi float [ %6422, %__nv_exp2f.exit1315 ], [ %5022, %4945 ] + %5163 = phi float [ %6423, %__nv_exp2f.exit1315 ], [ %5023, %4945 ] + %5164 = phi float [ %6424, %__nv_exp2f.exit1315 ], [ %5024, %4945 ] + %5165 = phi float [ %6425, %__nv_exp2f.exit1315 ], [ %5025, %4945 ] + %5166 = phi float [ %6426, %__nv_exp2f.exit1315 ], [ %5026, %4945 ] + %5167 = phi float [ %6427, %__nv_exp2f.exit1315 ], [ %5027, %4945 ] + %5168 = phi float [ %6428, %__nv_exp2f.exit1315 ], [ %5028, %4945 ] + %5169 = phi float [ %6429, %__nv_exp2f.exit1315 ], [ %5029, %4945 ] + %5170 = phi float [ %6430, %__nv_exp2f.exit1315 ], [ %5030, %4945 ] + %5171 = phi float [ %6431, %__nv_exp2f.exit1315 ], [ %5031, %4945 ] + %5172 = phi float [ %6432, %__nv_exp2f.exit1315 ], [ %5032, %4945 ] + %5173 = phi float [ %6433, %__nv_exp2f.exit1315 ], [ %5033, %4945 ] + %5174 = phi float [ %6434, %__nv_exp2f.exit1315 ], [ %5034, %4945 ] + %5175 = phi float [ %6435, %__nv_exp2f.exit1315 ], [ %5035, %4945 ] + %5176 = phi float [ %6436, %__nv_exp2f.exit1315 ], [ %5036, %4945 ] + %5177 = phi float [ %6437, %__nv_exp2f.exit1315 ], [ %5037, %4945 ] + %5178 = phi float [ %6438, %__nv_exp2f.exit1315 ], [ %5038, %4945 ] + %5179 = phi float [ %6439, %__nv_exp2f.exit1315 ], [ %5039, %4945 ] + %5180 = phi float [ %6440, %__nv_exp2f.exit1315 ], [ %5040, %4945 ] + %5181 = phi float [ %6441, %__nv_exp2f.exit1315 ], [ %5041, %4945 ] + %5182 = phi float [ %6442, %__nv_exp2f.exit1315 ], [ %5042, %4945 ] + %5183 = phi float [ %6443, %__nv_exp2f.exit1315 ], [ %5043, %4945 ] + %5184 = phi float [ %6444, %__nv_exp2f.exit1315 ], [ %5044, %4945 ] + %5185 = phi float [ %6445, %__nv_exp2f.exit1315 ], [ %5045, %4945 ] + %5186 = phi float [ %6446, %__nv_exp2f.exit1315 ], [ %5046, %4945 ] + %5187 = phi float [ %6447, %__nv_exp2f.exit1315 ], [ %5047, %4945 ] + %5188 = phi float [ %6448, %__nv_exp2f.exit1315 ], [ %5048, %4945 ] + %5189 = phi float [ %6449, %__nv_exp2f.exit1315 ], [ %5049, %4945 ] + %5190 = phi float [ %6450, %__nv_exp2f.exit1315 ], [ %5050, %4945 ] + %5191 = phi float [ %6451, %__nv_exp2f.exit1315 ], [ %5051, %4945 ] + %5192 = phi float [ %6452, %__nv_exp2f.exit1315 ], [ %5052, %4945 ] + %5193 = phi float [ %6453, %__nv_exp2f.exit1315 ], [ %5053, %4945 ] + %5194 = phi float [ %6454, %__nv_exp2f.exit1315 ], [ %5054, %4945 ] + %5195 = phi float [ %6455, %__nv_exp2f.exit1315 ], [ %5055, %4945 ] + %5196 = phi float [ %6456, %__nv_exp2f.exit1315 ], [ %5056, %4945 ] + %5197 = phi float [ %6457, %__nv_exp2f.exit1315 ], [ %5057, %4945 ] + %5198 = phi float [ %6458, %__nv_exp2f.exit1315 ], [ %5058, %4945 ] + %5199 = phi float [ %6459, %__nv_exp2f.exit1315 ], [ %5059, %4945 ] + %5200 = phi float [ %6460, %__nv_exp2f.exit1315 ], [ %5060, %4945 ] + %5201 = phi float [ %6461, %__nv_exp2f.exit1315 ], [ %5061, %4945 ] + %5202 = phi float [ %6462, %__nv_exp2f.exit1315 ], [ %5062, %4945 ] + %5203 = phi float [ %6463, %__nv_exp2f.exit1315 ], [ %5063, %4945 ] + %5204 = phi float [ %6464, %__nv_exp2f.exit1315 ], [ %5064, %4945 ] + %5205 = phi float [ %6465, %__nv_exp2f.exit1315 ], [ %5065, %4945 ] + %5206 = phi float [ %6466, %__nv_exp2f.exit1315 ], [ %5066, %4945 ] + %5207 = phi float [ %6467, %__nv_exp2f.exit1315 ], [ %5067, %4945 ] + %5208 = phi float [ %6468, %__nv_exp2f.exit1315 ], [ %5068, %4945 ] + %5209 = phi float [ %6469, %__nv_exp2f.exit1315 ], [ %5069, %4945 ] + %5210 = phi float [ %6470, %__nv_exp2f.exit1315 ], [ %5070, %4945 ] + %5211 = phi float [ %6471, %__nv_exp2f.exit1315 ], [ %5071, %4945 ] + %5212 = phi float [ %6472, %__nv_exp2f.exit1315 ], [ %5072, %4945 ] + %5213 = phi float [ %6473, %__nv_exp2f.exit1315 ], [ %5073, %4945 ] + %5214 = phi float [ %7266, %__nv_exp2f.exit1315 ], [ %4946, %4945 ] + %5215 = phi float [ %7267, %__nv_exp2f.exit1315 ], [ %4947, %4945 ] + %5216 = phi float [ %7268, %__nv_exp2f.exit1315 ], [ %4948, %4945 ] + %5217 = phi float [ %7269, %__nv_exp2f.exit1315 ], [ %4949, %4945 ] + %5218 = phi float [ %7270, %__nv_exp2f.exit1315 ], [ %4950, %4945 ] + %5219 = phi float [ %7271, %__nv_exp2f.exit1315 ], [ %4951, %4945 ] + %5220 = phi float [ %7272, %__nv_exp2f.exit1315 ], [ %4952, %4945 ] + %5221 = phi float [ %7273, %__nv_exp2f.exit1315 ], [ %4953, %4945 ] + %5222 = phi float [ %7274, %__nv_exp2f.exit1315 ], [ %4954, %4945 ] + %5223 = phi float [ %7275, %__nv_exp2f.exit1315 ], [ %4955, %4945 ] + %5224 = phi float [ %7276, %__nv_exp2f.exit1315 ], [ %4956, %4945 ] + %5225 = phi float [ %7277, %__nv_exp2f.exit1315 ], [ %4957, %4945 ] + %5226 = phi float [ %7278, %__nv_exp2f.exit1315 ], [ %4958, %4945 ] + %5227 = phi float [ %7279, %__nv_exp2f.exit1315 ], [ %4959, %4945 ] + %5228 = phi float [ %7280, %__nv_exp2f.exit1315 ], [ %4960, %4945 ] + %5229 = phi float [ %7281, %__nv_exp2f.exit1315 ], [ %4961, %4945 ] + %5230 = phi float [ %7282, %__nv_exp2f.exit1315 ], [ %4962, %4945 ] + %5231 = phi float [ %7283, %__nv_exp2f.exit1315 ], [ %4963, %4945 ] + %5232 = phi float [ %7284, %__nv_exp2f.exit1315 ], [ %4964, %4945 ] + %5233 = phi float [ %7285, %__nv_exp2f.exit1315 ], [ %4965, %4945 ] + %5234 = phi float [ %7286, %__nv_exp2f.exit1315 ], [ %4966, %4945 ] + %5235 = phi float [ %7287, %__nv_exp2f.exit1315 ], [ %4967, %4945 ] + %5236 = phi float [ %7288, %__nv_exp2f.exit1315 ], [ %4968, %4945 ] + %5237 = phi float [ %7289, %__nv_exp2f.exit1315 ], [ %4969, %4945 ] + %5238 = phi float [ %7290, %__nv_exp2f.exit1315 ], [ %4970, %4945 ] + %5239 = phi float [ %7291, %__nv_exp2f.exit1315 ], [ %4971, %4945 ] + %5240 = phi float [ %7292, %__nv_exp2f.exit1315 ], [ %4972, %4945 ] + %5241 = phi float [ %7293, %__nv_exp2f.exit1315 ], [ %4973, %4945 ] + %5242 = phi float [ %7294, %__nv_exp2f.exit1315 ], [ %4974, %4945 ] + %5243 = phi float [ %7295, %__nv_exp2f.exit1315 ], [ %4975, %4945 ] + %5244 = phi float [ %7296, %__nv_exp2f.exit1315 ], [ %4976, %4945 ] + %5245 = phi float [ %7297, %__nv_exp2f.exit1315 ], [ %4977, %4945 ] + %5246 = phi float [ %7298, %__nv_exp2f.exit1315 ], [ %4978, %4945 ] + %5247 = phi float [ %7299, %__nv_exp2f.exit1315 ], [ %4979, %4945 ] + %5248 = phi float [ %7300, %__nv_exp2f.exit1315 ], [ %4980, %4945 ] + %5249 = phi float [ %7301, %__nv_exp2f.exit1315 ], [ %4981, %4945 ] + %5250 = phi float [ %7302, %__nv_exp2f.exit1315 ], [ %4982, %4945 ] + %5251 = phi float [ %7303, %__nv_exp2f.exit1315 ], [ %4983, %4945 ] + %5252 = phi float [ %7304, %__nv_exp2f.exit1315 ], [ %4984, %4945 ] + %5253 = phi float [ %7305, %__nv_exp2f.exit1315 ], [ %4985, %4945 ] + %5254 = phi float [ %7306, %__nv_exp2f.exit1315 ], [ %4986, %4945 ] + %5255 = phi float [ %7307, %__nv_exp2f.exit1315 ], [ %4987, %4945 ] + %5256 = phi float [ %7308, %__nv_exp2f.exit1315 ], [ %4988, %4945 ] + %5257 = phi float [ %7309, %__nv_exp2f.exit1315 ], [ %4989, %4945 ] + %5258 = phi float [ %7310, %__nv_exp2f.exit1315 ], [ %4990, %4945 ] + %5259 = phi float [ %7311, %__nv_exp2f.exit1315 ], [ %4991, %4945 ] + %5260 = phi float [ %7312, %__nv_exp2f.exit1315 ], [ %4992, %4945 ] + %5261 = phi float [ %7313, %__nv_exp2f.exit1315 ], [ %4993, %4945 ] + %5262 = phi float [ %7314, %__nv_exp2f.exit1315 ], [ %4994, %4945 ] + %5263 = phi float [ %7315, %__nv_exp2f.exit1315 ], [ %4995, %4945 ] + %5264 = phi float [ %7316, %__nv_exp2f.exit1315 ], [ %4996, %4945 ] + %5265 = phi float [ %7317, %__nv_exp2f.exit1315 ], [ %4997, %4945 ] + %5266 = phi float [ %7318, %__nv_exp2f.exit1315 ], [ %4998, %4945 ] + %5267 = phi float [ %7319, %__nv_exp2f.exit1315 ], [ %4999, %4945 ] + %5268 = phi float [ %7320, %__nv_exp2f.exit1315 ], [ %5000, %4945 ] + %5269 = phi float [ %7321, %__nv_exp2f.exit1315 ], [ %5001, %4945 ] + %5270 = phi float [ %7322, %__nv_exp2f.exit1315 ], [ %5002, %4945 ] + %5271 = phi float [ %7323, %__nv_exp2f.exit1315 ], [ %5003, %4945 ] + %5272 = phi float [ %7324, %__nv_exp2f.exit1315 ], [ %5004, %4945 ] + %5273 = phi float [ %7325, %__nv_exp2f.exit1315 ], [ %5005, %4945 ] + %5274 = phi float [ %7326, %__nv_exp2f.exit1315 ], [ %5006, %4945 ] + %5275 = phi float [ %7327, %__nv_exp2f.exit1315 ], [ %5007, %4945 ] + %5276 = phi float [ %7328, %__nv_exp2f.exit1315 ], [ %5008, %4945 ] + %5277 = phi float [ %7329, %__nv_exp2f.exit1315 ], [ %5009, %4945 ] + %5278 = phi i32 [ %7333, %__nv_exp2f.exit1315 ], [ 0, %4945 ] + %5279 = phi <16 x i32> [ %7332, %__nv_exp2f.exit1315 ], [ %4942, %4945 ] + %5280 = icmp slt i32 %5278, %4884, !dbg !213 + %5281 = icmp slt i32 %5278, %4885, !dbg !213 + %5282 = add i32 %5146, 1, !dbg !213 + %5283 = icmp sgt i32 %5282, 1, !dbg !213 + %5284 = select i1 %5283, i32 0, i32 %5282, !dbg !213 + %5285 = add i32 %5148, 1, !dbg !213 + %5286 = icmp sgt i32 %5285, 2, !dbg !213 + %5287 = select i1 %5286, i32 0, i32 %5285, !dbg !213 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !244 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + %5288 = shl i32 %5287, 13, !dbg !244 + %5289 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %5288, !dbg !244 + %5290 = shl i32 %5284, 6, !dbg !246 + %5291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5290, !dbg !246 + %5292 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4812, !dbg !246 + %5293 = load float, ptr addrspace(3) %5292, align 8, !dbg !246 + %5294 = getelementptr inbounds nuw i8, ptr addrspace(3) %5292, i32 4, !dbg !246 + %5295 = load float, ptr addrspace(3) %5294, align 4, !dbg !246 + %5296 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4815, !dbg !246 + %5297 = load float, ptr addrspace(3) %5296, align 8, !dbg !246 + %5298 = getelementptr inbounds nuw i8, ptr addrspace(3) %5296, i32 4, !dbg !246 + %5299 = load float, ptr addrspace(3) %5298, align 4, !dbg !246 + %5300 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4817, !dbg !246 + %5301 = load float, ptr addrspace(3) %5300, align 8, !dbg !246 + %5302 = getelementptr inbounds nuw i8, ptr addrspace(3) %5300, i32 4, !dbg !246 + %5303 = load float, ptr addrspace(3) %5302, align 4, !dbg !246 + %5304 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4819, !dbg !246 + %5305 = load float, ptr addrspace(3) %5304, align 8, !dbg !246 + %5306 = getelementptr inbounds nuw i8, ptr addrspace(3) %5304, i32 4, !dbg !246 + %5307 = load float, ptr addrspace(3) %5306, align 4, !dbg !246 + %5308 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4821, !dbg !246 + %5309 = load float, ptr addrspace(3) %5308, align 8, !dbg !246 + %5310 = getelementptr inbounds nuw i8, ptr addrspace(3) %5308, i32 4, !dbg !246 + %5311 = load float, ptr addrspace(3) %5310, align 4, !dbg !246 + %5312 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4823, !dbg !246 + %5313 = load float, ptr addrspace(3) %5312, align 8, !dbg !246 + %5314 = getelementptr inbounds nuw i8, ptr addrspace(3) %5312, i32 4, !dbg !246 + %5315 = load float, ptr addrspace(3) %5314, align 4, !dbg !246 + %5316 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4825, !dbg !246 + %5317 = load float, ptr addrspace(3) %5316, align 8, !dbg !246 + %5318 = getelementptr inbounds nuw i8, ptr addrspace(3) %5316, i32 4, !dbg !246 + %5319 = load float, ptr addrspace(3) %5318, align 4, !dbg !246 + %5320 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4827, !dbg !246 + %5321 = load float, ptr addrspace(3) %5320, align 8, !dbg !246 + %5322 = getelementptr inbounds nuw i8, ptr addrspace(3) %5320, i32 4, !dbg !246 + %5323 = load float, ptr addrspace(3) %5322, align 4, !dbg !246 + %5324 = fcmp oeq float %5293, 0xFFF0000000000000, !dbg !251 + %5325 = fcmp oeq float %5295, 0xFFF0000000000000, !dbg !251 + %5326 = fcmp oeq float %5297, 0xFFF0000000000000, !dbg !251 + %5327 = fcmp oeq float %5299, 0xFFF0000000000000, !dbg !251 + %5328 = fcmp oeq float %5301, 0xFFF0000000000000, !dbg !251 + %5329 = fcmp oeq float %5303, 0xFFF0000000000000, !dbg !251 + %5330 = fcmp oeq float %5305, 0xFFF0000000000000, !dbg !251 + %5331 = fcmp oeq float %5307, 0xFFF0000000000000, !dbg !251 + %5332 = fcmp oeq float %5309, 0xFFF0000000000000, !dbg !251 + %5333 = fcmp oeq float %5311, 0xFFF0000000000000, !dbg !251 + %5334 = fcmp oeq float %5313, 0xFFF0000000000000, !dbg !251 + %5335 = fcmp oeq float %5315, 0xFFF0000000000000, !dbg !251 + %5336 = fcmp oeq float %5317, 0xFFF0000000000000, !dbg !251 + %5337 = fcmp oeq float %5319, 0xFFF0000000000000, !dbg !251 + %5338 = fcmp oeq float %5321, 0xFFF0000000000000, !dbg !251 + %5339 = fcmp oeq float %5323, 0xFFF0000000000000, !dbg !251 + %5340 = select i1 %5324, float 0.000000e+00, float %5293, !dbg !252 + %5341 = select i1 %5325, float 0.000000e+00, float %5295, !dbg !252 + %5342 = select i1 %5326, float 0.000000e+00, float %5297, !dbg !252 + %5343 = select i1 %5327, float 0.000000e+00, float %5299, !dbg !252 + %5344 = select i1 %5328, float 0.000000e+00, float %5301, !dbg !252 + %5345 = select i1 %5329, float 0.000000e+00, float %5303, !dbg !252 + %5346 = select i1 %5330, float 0.000000e+00, float %5305, !dbg !252 + %5347 = select i1 %5331, float 0.000000e+00, float %5307, !dbg !252 + %5348 = select i1 %5332, float 0.000000e+00, float %5309, !dbg !252 + %5349 = select i1 %5333, float 0.000000e+00, float %5311, !dbg !252 + %5350 = select i1 %5334, float 0.000000e+00, float %5313, !dbg !252 + %5351 = select i1 %5335, float 0.000000e+00, float %5315, !dbg !252 + %5352 = select i1 %5336, float 0.000000e+00, float %5317, !dbg !252 + %5353 = select i1 %5337, float 0.000000e+00, float %5319, !dbg !252 + %5354 = select i1 %5338, float 0.000000e+00, float %5321, !dbg !252 + %5355 = select i1 %5339, float 0.000000e+00, float %5323, !dbg !252 + %5356 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !226 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !226 + %5357 = shl i32 %5356, 11, !dbg !226 + %5358 = and i32 %5357, 8192, !dbg !226 + %5359 = add i32 %5358, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5360 = lshr exact i32 %5359, 4, !dbg !226 + %5361 = and i32 %5360, 16383, !dbg !226 + %5362 = zext nneg i32 %5361 to i64, !dbg !226 + %5363 = or disjoint i64 %5362, 4611686293372403712, !dbg !226 + %5364 = ptrtoint ptr addrspace(3) %5289 to i32, !dbg !226 + %5365 = lshr exact i32 %5364, 4, !dbg !226 + %5366 = and i32 %5365, 16383, !dbg !226 + %5367 = zext nneg i32 %5366 to i64, !dbg !226 + %5368 = or disjoint i64 %5367, 4611686293338849280, !dbg !226 + %5369 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %5363, i64 %5368) #3, !dbg !226 + %5370 = or disjoint i32 %5358, 32, !dbg !226 + %5371 = add i32 %5370, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5372 = lshr exact i32 %5371, 4, !dbg !226 + %5373 = and i32 %5372, 16383, !dbg !226 + %5374 = zext nneg i32 %5373 to i64, !dbg !226 + %5375 = or disjoint i64 %5374, 4611686293372403712, !dbg !226 + %5376 = add i32 %5364, 32, !dbg !226 + %5377 = lshr exact i32 %5376, 4, !dbg !226 + %5378 = and i32 %5377, 16383, !dbg !226 + %5379 = zext nneg i32 %5378 to i64, !dbg !226 + %5380 = or disjoint i64 %5379, 4611686293338849280, !dbg !226 + %5381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 0, !dbg !226 + %5382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 1, !dbg !226 + %5383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 2, !dbg !226 + %5384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 3, !dbg !226 + %5385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 4, !dbg !226 + %5386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 5, !dbg !226 + %5387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 6, !dbg !226 + %5388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 7, !dbg !226 + %5389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 8, !dbg !226 + %5390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 9, !dbg !226 + %5391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 10, !dbg !226 + %5392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 11, !dbg !226 + %5393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 12, !dbg !226 + %5394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 13, !dbg !226 + %5395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 14, !dbg !226 + %5396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 15, !dbg !226 + %5397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 16, !dbg !226 + %5398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 17, !dbg !226 + %5399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 18, !dbg !226 + %5400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 19, !dbg !226 + %5401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 20, !dbg !226 + %5402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 21, !dbg !226 + %5403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 22, !dbg !226 + %5404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 23, !dbg !226 + %5405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 24, !dbg !226 + %5406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 25, !dbg !226 + %5407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 26, !dbg !226 + %5408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 27, !dbg !226 + %5409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 28, !dbg !226 + %5410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 29, !dbg !226 + %5411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 30, !dbg !226 + %5412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 31, !dbg !226 + %5413 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5381, float %5382, float %5383, float %5384, float %5385, float %5386, float %5387, float %5388, float %5389, float %5390, float %5391, float %5392, float %5393, float %5394, float %5395, float %5396, float %5397, float %5398, float %5399, float %5400, float %5401, float %5402, float %5403, float %5404, float %5405, float %5406, float %5407, float %5408, float %5409, float %5410, float %5411, float %5412, i64 %5375, i64 %5380, i1 true) #3, !dbg !226 + %5414 = or disjoint i32 %5358, 64, !dbg !226 + %5415 = add i32 %5414, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5416 = lshr exact i32 %5415, 4, !dbg !226 + %5417 = and i32 %5416, 16383, !dbg !226 + %5418 = zext nneg i32 %5417 to i64, !dbg !226 + %5419 = or disjoint i64 %5418, 4611686293372403712, !dbg !226 + %5420 = add i32 %5364, 64, !dbg !226 + %5421 = lshr exact i32 %5420, 4, !dbg !226 + %5422 = and i32 %5421, 16383, !dbg !226 + %5423 = zext nneg i32 %5422 to i64, !dbg !226 + %5424 = or disjoint i64 %5423, 4611686293338849280, !dbg !226 + %5425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 0, !dbg !226 + %5426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 1, !dbg !226 + %5427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 2, !dbg !226 + %5428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 3, !dbg !226 + %5429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 4, !dbg !226 + %5430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 5, !dbg !226 + %5431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 6, !dbg !226 + %5432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 7, !dbg !226 + %5433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 8, !dbg !226 + %5434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 9, !dbg !226 + %5435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 10, !dbg !226 + %5436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 11, !dbg !226 + %5437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 12, !dbg !226 + %5438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 13, !dbg !226 + %5439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 14, !dbg !226 + %5440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 15, !dbg !226 + %5441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 16, !dbg !226 + %5442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 17, !dbg !226 + %5443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 18, !dbg !226 + %5444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 19, !dbg !226 + %5445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 20, !dbg !226 + %5446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 21, !dbg !226 + %5447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 22, !dbg !226 + %5448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 23, !dbg !226 + %5449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 24, !dbg !226 + %5450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 25, !dbg !226 + %5451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 26, !dbg !226 + %5452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 27, !dbg !226 + %5453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 28, !dbg !226 + %5454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 29, !dbg !226 + %5455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 30, !dbg !226 + %5456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 31, !dbg !226 + %5457 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5425, float %5426, float %5427, float %5428, float %5429, float %5430, float %5431, float %5432, float %5433, float %5434, float %5435, float %5436, float %5437, float %5438, float %5439, float %5440, float %5441, float %5442, float %5443, float %5444, float %5445, float %5446, float %5447, float %5448, float %5449, float %5450, float %5451, float %5452, float %5453, float %5454, float %5455, float %5456, i64 %5419, i64 %5424, i1 true) #3, !dbg !226 + %5458 = or disjoint i32 %5358, 96, !dbg !226 + %5459 = add i32 %5458, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5460 = lshr exact i32 %5459, 4, !dbg !226 + %5461 = and i32 %5460, 16383, !dbg !226 + %5462 = zext nneg i32 %5461 to i64, !dbg !226 + %5463 = or disjoint i64 %5462, 4611686293372403712, !dbg !226 + %5464 = add i32 %5364, 96, !dbg !226 + %5465 = lshr exact i32 %5464, 4, !dbg !226 + %5466 = and i32 %5465, 16383, !dbg !226 + %5467 = zext nneg i32 %5466 to i64, !dbg !226 + %5468 = or disjoint i64 %5467, 4611686293338849280, !dbg !226 + %5469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 0, !dbg !226 + %5470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 1, !dbg !226 + %5471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 2, !dbg !226 + %5472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 3, !dbg !226 + %5473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 4, !dbg !226 + %5474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 5, !dbg !226 + %5475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 6, !dbg !226 + %5476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 7, !dbg !226 + %5477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 8, !dbg !226 + %5478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 9, !dbg !226 + %5479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 10, !dbg !226 + %5480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 11, !dbg !226 + %5481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 12, !dbg !226 + %5482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 13, !dbg !226 + %5483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 14, !dbg !226 + %5484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 15, !dbg !226 + %5485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 16, !dbg !226 + %5486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 17, !dbg !226 + %5487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 18, !dbg !226 + %5488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 19, !dbg !226 + %5489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 20, !dbg !226 + %5490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 21, !dbg !226 + %5491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 22, !dbg !226 + %5492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 23, !dbg !226 + %5493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 24, !dbg !226 + %5494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 25, !dbg !226 + %5495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 26, !dbg !226 + %5496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 27, !dbg !226 + %5497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 28, !dbg !226 + %5498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 29, !dbg !226 + %5499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 30, !dbg !226 + %5500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 31, !dbg !226 + %5501 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5469, float %5470, float %5471, float %5472, float %5473, float %5474, float %5475, float %5476, float %5477, float %5478, float %5479, float %5480, float %5481, float %5482, float %5483, float %5484, float %5485, float %5486, float %5487, float %5488, float %5489, float %5490, float %5491, float %5492, float %5493, float %5494, float %5495, float %5496, float %5497, float %5498, float %5499, float %5500, i64 %5463, i64 %5468, i1 true) #3, !dbg !226 + %5502 = or disjoint i32 %5358, 16384, !dbg !226 + %5503 = add i32 %5502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5504 = lshr exact i32 %5503, 4, !dbg !226 + %5505 = and i32 %5504, 16383, !dbg !226 + %5506 = zext nneg i32 %5505 to i64, !dbg !226 + %5507 = or disjoint i64 %5506, 4611686293372403712, !dbg !226 + %5508 = add i32 %5364, 8192, !dbg !226 + %5509 = lshr exact i32 %5508, 4, !dbg !226 + %5510 = and i32 %5509, 16383, !dbg !226 + %5511 = zext nneg i32 %5510 to i64, !dbg !226 + %5512 = or disjoint i64 %5511, 4611686293338849280, !dbg !226 + %5513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 0, !dbg !226 + %5514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 1, !dbg !226 + %5515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 2, !dbg !226 + %5516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 3, !dbg !226 + %5517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 4, !dbg !226 + %5518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 5, !dbg !226 + %5519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 6, !dbg !226 + %5520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 7, !dbg !226 + %5521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 8, !dbg !226 + %5522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 9, !dbg !226 + %5523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 10, !dbg !226 + %5524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 11, !dbg !226 + %5525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 12, !dbg !226 + %5526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 13, !dbg !226 + %5527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 14, !dbg !226 + %5528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 15, !dbg !226 + %5529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 16, !dbg !226 + %5530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 17, !dbg !226 + %5531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 18, !dbg !226 + %5532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 19, !dbg !226 + %5533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 20, !dbg !226 + %5534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 21, !dbg !226 + %5535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 22, !dbg !226 + %5536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 23, !dbg !226 + %5537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 24, !dbg !226 + %5538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 25, !dbg !226 + %5539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 26, !dbg !226 + %5540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 27, !dbg !226 + %5541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 28, !dbg !226 + %5542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 29, !dbg !226 + %5543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 30, !dbg !226 + %5544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 31, !dbg !226 + %5545 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5513, float %5514, float %5515, float %5516, float %5517, float %5518, float %5519, float %5520, float %5521, float %5522, float %5523, float %5524, float %5525, float %5526, float %5527, float %5528, float %5529, float %5530, float %5531, float %5532, float %5533, float %5534, float %5535, float %5536, float %5537, float %5538, float %5539, float %5540, float %5541, float %5542, float %5543, float %5544, i64 %5507, i64 %5512, i1 true) #3, !dbg !226 + %5546 = or disjoint i32 %5358, 16416, !dbg !226 + %5547 = add i32 %5546, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5548 = lshr exact i32 %5547, 4, !dbg !226 + %5549 = and i32 %5548, 16383, !dbg !226 + %5550 = zext nneg i32 %5549 to i64, !dbg !226 + %5551 = or disjoint i64 %5550, 4611686293372403712, !dbg !226 + %5552 = add i32 %5364, 8224, !dbg !226 + %5553 = lshr exact i32 %5552, 4, !dbg !226 + %5554 = and i32 %5553, 16383, !dbg !226 + %5555 = zext nneg i32 %5554 to i64, !dbg !226 + %5556 = or disjoint i64 %5555, 4611686293338849280, !dbg !226 + %5557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 0, !dbg !226 + %5558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 1, !dbg !226 + %5559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 2, !dbg !226 + %5560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 3, !dbg !226 + %5561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 4, !dbg !226 + %5562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 5, !dbg !226 + %5563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 6, !dbg !226 + %5564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 7, !dbg !226 + %5565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 8, !dbg !226 + %5566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 9, !dbg !226 + %5567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 10, !dbg !226 + %5568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 11, !dbg !226 + %5569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 12, !dbg !226 + %5570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 13, !dbg !226 + %5571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 14, !dbg !226 + %5572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 15, !dbg !226 + %5573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 16, !dbg !226 + %5574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 17, !dbg !226 + %5575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 18, !dbg !226 + %5576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 19, !dbg !226 + %5577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 20, !dbg !226 + %5578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 21, !dbg !226 + %5579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 22, !dbg !226 + %5580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 23, !dbg !226 + %5581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 24, !dbg !226 + %5582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 25, !dbg !226 + %5583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 26, !dbg !226 + %5584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 27, !dbg !226 + %5585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 28, !dbg !226 + %5586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 29, !dbg !226 + %5587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 30, !dbg !226 + %5588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 31, !dbg !226 + %5589 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5557, float %5558, float %5559, float %5560, float %5561, float %5562, float %5563, float %5564, float %5565, float %5566, float %5567, float %5568, float %5569, float %5570, float %5571, float %5572, float %5573, float %5574, float %5575, float %5576, float %5577, float %5578, float %5579, float %5580, float %5581, float %5582, float %5583, float %5584, float %5585, float %5586, float %5587, float %5588, i64 %5551, i64 %5556, i1 true) #3, !dbg !226 + %5590 = or disjoint i32 %5358, 16448, !dbg !226 + %5591 = add i32 %5590, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5592 = lshr exact i32 %5591, 4, !dbg !226 + %5593 = and i32 %5592, 16383, !dbg !226 + %5594 = zext nneg i32 %5593 to i64, !dbg !226 + %5595 = or disjoint i64 %5594, 4611686293372403712, !dbg !226 + %5596 = add i32 %5364, 8256, !dbg !226 + %5597 = lshr exact i32 %5596, 4, !dbg !226 + %5598 = and i32 %5597, 16383, !dbg !226 + %5599 = zext nneg i32 %5598 to i64, !dbg !226 + %5600 = or disjoint i64 %5599, 4611686293338849280, !dbg !226 + %5601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 0, !dbg !226 + %5602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 1, !dbg !226 + %5603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 2, !dbg !226 + %5604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 3, !dbg !226 + %5605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 4, !dbg !226 + %5606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 5, !dbg !226 + %5607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 6, !dbg !226 + %5608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 7, !dbg !226 + %5609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 8, !dbg !226 + %5610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 9, !dbg !226 + %5611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 10, !dbg !226 + %5612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 11, !dbg !226 + %5613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 12, !dbg !226 + %5614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 13, !dbg !226 + %5615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 14, !dbg !226 + %5616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 15, !dbg !226 + %5617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 16, !dbg !226 + %5618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 17, !dbg !226 + %5619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 18, !dbg !226 + %5620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 19, !dbg !226 + %5621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 20, !dbg !226 + %5622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 21, !dbg !226 + %5623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 22, !dbg !226 + %5624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 23, !dbg !226 + %5625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 24, !dbg !226 + %5626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 25, !dbg !226 + %5627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 26, !dbg !226 + %5628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 27, !dbg !226 + %5629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 28, !dbg !226 + %5630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 29, !dbg !226 + %5631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 30, !dbg !226 + %5632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 31, !dbg !226 + %5633 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5601, float %5602, float %5603, float %5604, float %5605, float %5606, float %5607, float %5608, float %5609, float %5610, float %5611, float %5612, float %5613, float %5614, float %5615, float %5616, float %5617, float %5618, float %5619, float %5620, float %5621, float %5622, float %5623, float %5624, float %5625, float %5626, float %5627, float %5628, float %5629, float %5630, float %5631, float %5632, i64 %5595, i64 %5600, i1 true) #3, !dbg !226 + %5634 = or disjoint i32 %5358, 16480, !dbg !226 + %5635 = add i32 %5634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5636 = lshr exact i32 %5635, 4, !dbg !226 + %5637 = and i32 %5636, 16383, !dbg !226 + %5638 = zext nneg i32 %5637 to i64, !dbg !226 + %5639 = or disjoint i64 %5638, 4611686293372403712, !dbg !226 + %5640 = add i32 %5364, 8288, !dbg !226 + %5641 = lshr exact i32 %5640, 4, !dbg !226 + %5642 = and i32 %5641, 16383, !dbg !226 + %5643 = zext nneg i32 %5642 to i64, !dbg !226 + %5644 = or disjoint i64 %5643, 4611686293338849280, !dbg !226 + %5645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 0, !dbg !226 + %5646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 1, !dbg !226 + %5647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 2, !dbg !226 + %5648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 3, !dbg !226 + %5649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 4, !dbg !226 + %5650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 5, !dbg !226 + %5651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 6, !dbg !226 + %5652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 7, !dbg !226 + %5653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 8, !dbg !226 + %5654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 9, !dbg !226 + %5655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 10, !dbg !226 + %5656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 11, !dbg !226 + %5657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 12, !dbg !226 + %5658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 13, !dbg !226 + %5659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 14, !dbg !226 + %5660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 15, !dbg !226 + %5661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 16, !dbg !226 + %5662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 17, !dbg !226 + %5663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 18, !dbg !226 + %5664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 19, !dbg !226 + %5665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 20, !dbg !226 + %5666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 21, !dbg !226 + %5667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 22, !dbg !226 + %5668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 23, !dbg !226 + %5669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 24, !dbg !226 + %5670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 25, !dbg !226 + %5671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 26, !dbg !226 + %5672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 27, !dbg !226 + %5673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 28, !dbg !226 + %5674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 29, !dbg !226 + %5675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 30, !dbg !226 + %5676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 31, !dbg !226 + %5677 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5645, float %5646, float %5647, float %5648, float %5649, float %5650, float %5651, float %5652, float %5653, float %5654, float %5655, float %5656, float %5657, float %5658, float %5659, float %5660, float %5661, float %5662, float %5663, float %5664, float %5665, float %5666, float %5667, float %5668, float %5669, float %5670, float %5671, float %5672, float %5673, float %5674, float %5675, float %5676, i64 %5639, i64 %5644, i1 true) #3, !dbg !226 + %5678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 0, !dbg !226 + %5679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 1, !dbg !226 + %5680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 2, !dbg !226 + %5681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 3, !dbg !226 + %5682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 4, !dbg !226 + %5683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 5, !dbg !226 + %5684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 6, !dbg !226 + %5685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 7, !dbg !226 + %5686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 8, !dbg !226 + %5687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 9, !dbg !226 + %5688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 10, !dbg !226 + %5689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 11, !dbg !226 + %5690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 12, !dbg !226 + %5691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 13, !dbg !226 + %5692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 14, !dbg !226 + %5693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 15, !dbg !226 + %5694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 16, !dbg !226 + %5695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 17, !dbg !226 + %5696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 18, !dbg !226 + %5697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 19, !dbg !226 + %5698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 20, !dbg !226 + %5699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 21, !dbg !226 + %5700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 22, !dbg !226 + %5701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 23, !dbg !226 + %5702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 24, !dbg !226 + %5703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 25, !dbg !226 + %5704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 26, !dbg !226 + %5705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 27, !dbg !226 + %5706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 28, !dbg !226 + %5707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 29, !dbg !226 + %5708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 30, !dbg !226 + %5709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 31, !dbg !226 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !226 + %5710 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %5678, float %5679, float %5680, float %5681, float %5682, float %5683, float %5684, float %5685, float %5686, float %5687, float %5688, float %5689, float %5690, float %5691, float %5692, float %5693, float %5694, float %5695, float %5696, float %5697, float %5698, float %5699, float %5700, float %5701, float %5702, float %5703, float %5704, float %5705, float %5706, float %5707, float %5708, float %5709, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %5289, i32 0, i32 0) #3, !dbg !226 + %5711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 0, !dbg !226 + %5712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 1, !dbg !226 + %5713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 2, !dbg !226 + %5714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 3, !dbg !226 + %5715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 4, !dbg !226 + %5716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 5, !dbg !226 + %5717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 6, !dbg !226 + %5718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 7, !dbg !226 + %5719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 8, !dbg !226 + %5720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 9, !dbg !226 + %5721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 10, !dbg !226 + %5722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 11, !dbg !226 + %5723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 12, !dbg !226 + %5724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 13, !dbg !226 + %5725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 14, !dbg !226 + %5726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 15, !dbg !226 + %5727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 16, !dbg !226 + %5728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 17, !dbg !226 + %5729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 18, !dbg !226 + %5730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 19, !dbg !226 + %5731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 20, !dbg !226 + %5732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 21, !dbg !226 + %5733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 22, !dbg !226 + %5734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 23, !dbg !226 + %5735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 24, !dbg !226 + %5736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 25, !dbg !226 + %5737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 26, !dbg !226 + %5738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 27, !dbg !226 + %5739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 28, !dbg !226 + %5740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 29, !dbg !226 + %5741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 30, !dbg !226 + %5742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 31, !dbg !226 + %5743 = fmul float %5711, 0x3FB6A09E60000000, !dbg !253 + %5744 = fmul float %5712, 0x3FB6A09E60000000, !dbg !253 + %5745 = fmul float %5713, 0x3FB6A09E60000000, !dbg !253 + %5746 = fmul float %5714, 0x3FB6A09E60000000, !dbg !253 + %5747 = fmul float %5715, 0x3FB6A09E60000000, !dbg !253 + %5748 = fmul float %5716, 0x3FB6A09E60000000, !dbg !253 + %5749 = fmul float %5717, 0x3FB6A09E60000000, !dbg !253 + %5750 = fmul float %5718, 0x3FB6A09E60000000, !dbg !253 + %5751 = fmul float %5719, 0x3FB6A09E60000000, !dbg !253 + %5752 = fmul float %5720, 0x3FB6A09E60000000, !dbg !253 + %5753 = fmul float %5721, 0x3FB6A09E60000000, !dbg !253 + %5754 = fmul float %5722, 0x3FB6A09E60000000, !dbg !253 + %5755 = fmul float %5723, 0x3FB6A09E60000000, !dbg !253 + %5756 = fmul float %5724, 0x3FB6A09E60000000, !dbg !253 + %5757 = fmul float %5725, 0x3FB6A09E60000000, !dbg !253 + %5758 = fmul float %5726, 0x3FB6A09E60000000, !dbg !253 + %5759 = fmul float %5727, 0x3FB6A09E60000000, !dbg !253 + %5760 = fmul float %5728, 0x3FB6A09E60000000, !dbg !253 + %5761 = fmul float %5729, 0x3FB6A09E60000000, !dbg !253 + %5762 = fmul float %5730, 0x3FB6A09E60000000, !dbg !253 + %5763 = fmul float %5731, 0x3FB6A09E60000000, !dbg !253 + %5764 = fmul float %5732, 0x3FB6A09E60000000, !dbg !253 + %5765 = fmul float %5733, 0x3FB6A09E60000000, !dbg !253 + %5766 = fmul float %5734, 0x3FB6A09E60000000, !dbg !253 + %5767 = fmul float %5735, 0x3FB6A09E60000000, !dbg !253 + %5768 = fmul float %5736, 0x3FB6A09E60000000, !dbg !253 + %5769 = fmul float %5737, 0x3FB6A09E60000000, !dbg !253 + %5770 = fmul float %5738, 0x3FB6A09E60000000, !dbg !253 + %5771 = fmul float %5739, 0x3FB6A09E60000000, !dbg !253 + %5772 = fmul float %5740, 0x3FB6A09E60000000, !dbg !253 + %5773 = fmul float %5741, 0x3FB6A09E60000000, !dbg !253 + %5774 = fmul float %5742, 0x3FB6A09E60000000, !dbg !253 + %5775 = extractelement <16 x i32> %5279, i64 15, !dbg !254 + %5776 = icmp sge i32 %5775, %4470, !dbg !254 + %5777 = extractelement <16 x i32> %5279, i64 14, !dbg !254 + %5778 = icmp sge i32 %5777, %4470, !dbg !254 + %5779 = icmp sge i32 %5775, %4471, !dbg !254 + %5780 = icmp sge i32 %5777, %4471, !dbg !254 + %5781 = extractelement <16 x i32> %5279, i64 13, !dbg !254 + %5782 = icmp sge i32 %5781, %4470, !dbg !254 + %5783 = extractelement <16 x i32> %5279, i64 12, !dbg !254 + %5784 = icmp sge i32 %5783, %4470, !dbg !254 + %5785 = icmp sge i32 %5781, %4471, !dbg !254 + %5786 = icmp sge i32 %5783, %4471, !dbg !254 + %5787 = extractelement <16 x i32> %5279, i64 11, !dbg !254 + %5788 = icmp sge i32 %5787, %4470, !dbg !254 + %5789 = extractelement <16 x i32> %5279, i64 10, !dbg !254 + %5790 = icmp sge i32 %5789, %4470, !dbg !254 + %5791 = icmp sge i32 %5787, %4471, !dbg !254 + %5792 = icmp sge i32 %5789, %4471, !dbg !254 + %5793 = extractelement <16 x i32> %5279, i64 9, !dbg !254 + %5794 = icmp sge i32 %5793, %4470, !dbg !254 + %5795 = extractelement <16 x i32> %5279, i64 8, !dbg !254 + %5796 = icmp sge i32 %5795, %4470, !dbg !254 + %5797 = icmp sge i32 %5793, %4471, !dbg !254 + %5798 = icmp sge i32 %5795, %4471, !dbg !254 + %5799 = extractelement <16 x i32> %5279, i64 7, !dbg !254 + %5800 = icmp sge i32 %5799, %4470, !dbg !254 + %5801 = extractelement <16 x i32> %5279, i64 6, !dbg !254 + %5802 = icmp sge i32 %5801, %4470, !dbg !254 + %5803 = icmp sge i32 %5799, %4471, !dbg !254 + %5804 = icmp sge i32 %5801, %4471, !dbg !254 + %5805 = extractelement <16 x i32> %5279, i64 5, !dbg !254 + %5806 = icmp sge i32 %5805, %4470, !dbg !254 + %5807 = extractelement <16 x i32> %5279, i64 4, !dbg !254 + %5808 = icmp sge i32 %5807, %4470, !dbg !254 + %5809 = icmp sge i32 %5805, %4471, !dbg !254 + %5810 = icmp sge i32 %5807, %4471, !dbg !254 + %5811 = extractelement <16 x i32> %5279, i64 3, !dbg !254 + %5812 = icmp sge i32 %5811, %4470, !dbg !254 + %5813 = extractelement <16 x i32> %5279, i64 2, !dbg !254 + %5814 = icmp sge i32 %5813, %4470, !dbg !254 + %5815 = icmp sge i32 %5811, %4471, !dbg !254 + %5816 = icmp sge i32 %5813, %4471, !dbg !254 + %5817 = extractelement <16 x i32> %5279, i64 1, !dbg !254 + %5818 = icmp sge i32 %5817, %4470, !dbg !254 + %5819 = extractelement <16 x i32> %5279, i64 0, !dbg !254 + %5820 = icmp sge i32 %5819, %4470, !dbg !254 + %5821 = icmp sge i32 %5817, %4471, !dbg !254 + %5822 = icmp sge i32 %5819, %4471, !dbg !254 + %5823 = sext <16 x i32> %5279 to <16 x i64>, !dbg !255 + %5824 = icmp sgt <16 x i64> %4944, %5823, !dbg !255 + %5825 = extractelement <16 x i1> %5824, i64 15, !dbg !256 + %5826 = and i1 %5776, %5825, !dbg !256 + %5827 = extractelement <16 x i1> %5824, i64 14, !dbg !256 + %5828 = and i1 %5778, %5827, !dbg !256 + %5829 = and i1 %5779, %5825, !dbg !256 + %5830 = and i1 %5780, %5827, !dbg !256 + %5831 = extractelement <16 x i1> %5824, i64 13, !dbg !256 + %5832 = and i1 %5782, %5831, !dbg !256 + %5833 = extractelement <16 x i1> %5824, i64 12, !dbg !256 + %5834 = and i1 %5784, %5833, !dbg !256 + %5835 = and i1 %5785, %5831, !dbg !256 + %5836 = and i1 %5786, %5833, !dbg !256 + %5837 = extractelement <16 x i1> %5824, i64 11, !dbg !256 + %5838 = and i1 %5788, %5837, !dbg !256 + %5839 = extractelement <16 x i1> %5824, i64 10, !dbg !256 + %5840 = and i1 %5790, %5839, !dbg !256 + %5841 = and i1 %5791, %5837, !dbg !256 + %5842 = and i1 %5792, %5839, !dbg !256 + %5843 = extractelement <16 x i1> %5824, i64 9, !dbg !256 + %5844 = and i1 %5794, %5843, !dbg !256 + %5845 = extractelement <16 x i1> %5824, i64 8, !dbg !256 + %5846 = and i1 %5796, %5845, !dbg !256 + %5847 = and i1 %5797, %5843, !dbg !256 + %5848 = and i1 %5798, %5845, !dbg !256 + %5849 = extractelement <16 x i1> %5824, i64 7, !dbg !256 + %5850 = and i1 %5800, %5849, !dbg !256 + %5851 = extractelement <16 x i1> %5824, i64 6, !dbg !256 + %5852 = and i1 %5802, %5851, !dbg !256 + %5853 = and i1 %5803, %5849, !dbg !256 + %5854 = and i1 %5804, %5851, !dbg !256 + %5855 = extractelement <16 x i1> %5824, i64 5, !dbg !256 + %5856 = and i1 %5806, %5855, !dbg !256 + %5857 = extractelement <16 x i1> %5824, i64 4, !dbg !256 + %5858 = and i1 %5808, %5857, !dbg !256 + %5859 = and i1 %5809, %5855, !dbg !256 + %5860 = and i1 %5810, %5857, !dbg !256 + %5861 = extractelement <16 x i1> %5824, i64 3, !dbg !256 + %5862 = and i1 %5812, %5861, !dbg !256 + %5863 = extractelement <16 x i1> %5824, i64 2, !dbg !256 + %5864 = and i1 %5814, %5863, !dbg !256 + %5865 = and i1 %5815, %5861, !dbg !256 + %5866 = and i1 %5816, %5863, !dbg !256 + %5867 = extractelement <16 x i1> %5824, i64 1, !dbg !256 + %5868 = and i1 %5818, %5867, !dbg !256 + %5869 = extractelement <16 x i1> %5824, i64 0, !dbg !256 + %5870 = and i1 %5820, %5869, !dbg !256 + %5871 = and i1 %5821, %5867, !dbg !256 + %5872 = and i1 %5822, %5869, !dbg !256 + %5873 = fmul float %5743, 0x3FF7154760000000, !dbg !257 + %5874 = select i1 %5826, float %5873, float 0xFFF0000000000000, !dbg !258 + %5875 = fmul float %5744, 0x3FF7154760000000, !dbg !257 + %5876 = select i1 %5828, float %5875, float 0xFFF0000000000000, !dbg !258 + %5877 = fmul float %5745, 0x3FF7154760000000, !dbg !257 + %5878 = select i1 %5829, float %5877, float 0xFFF0000000000000, !dbg !258 + %5879 = fmul float %5746, 0x3FF7154760000000, !dbg !257 + %5880 = select i1 %5830, float %5879, float 0xFFF0000000000000, !dbg !258 + %5881 = fmul float %5747, 0x3FF7154760000000, !dbg !257 + %5882 = select i1 %5832, float %5881, float 0xFFF0000000000000, !dbg !258 + %5883 = fmul float %5748, 0x3FF7154760000000, !dbg !257 + %5884 = select i1 %5834, float %5883, float 0xFFF0000000000000, !dbg !258 + %5885 = fmul float %5749, 0x3FF7154760000000, !dbg !257 + %5886 = select i1 %5835, float %5885, float 0xFFF0000000000000, !dbg !258 + %5887 = fmul float %5750, 0x3FF7154760000000, !dbg !257 + %5888 = select i1 %5836, float %5887, float 0xFFF0000000000000, !dbg !258 + %5889 = fmul float %5751, 0x3FF7154760000000, !dbg !257 + %5890 = select i1 %5838, float %5889, float 0xFFF0000000000000, !dbg !258 + %5891 = fmul float %5752, 0x3FF7154760000000, !dbg !257 + %5892 = select i1 %5840, float %5891, float 0xFFF0000000000000, !dbg !258 + %5893 = fmul float %5753, 0x3FF7154760000000, !dbg !257 + %5894 = select i1 %5841, float %5893, float 0xFFF0000000000000, !dbg !258 + %5895 = fmul float %5754, 0x3FF7154760000000, !dbg !257 + %5896 = select i1 %5842, float %5895, float 0xFFF0000000000000, !dbg !258 + %5897 = fmul float %5755, 0x3FF7154760000000, !dbg !257 + %5898 = select i1 %5844, float %5897, float 0xFFF0000000000000, !dbg !258 + %5899 = fmul float %5756, 0x3FF7154760000000, !dbg !257 + %5900 = select i1 %5846, float %5899, float 0xFFF0000000000000, !dbg !258 + %5901 = fmul float %5757, 0x3FF7154760000000, !dbg !257 + %5902 = select i1 %5847, float %5901, float 0xFFF0000000000000, !dbg !258 + %5903 = fmul float %5758, 0x3FF7154760000000, !dbg !257 + %5904 = select i1 %5848, float %5903, float 0xFFF0000000000000, !dbg !258 + %5905 = fmul float %5759, 0x3FF7154760000000, !dbg !257 + %5906 = select i1 %5850, float %5905, float 0xFFF0000000000000, !dbg !258 + %5907 = fmul float %5760, 0x3FF7154760000000, !dbg !257 + %5908 = select i1 %5852, float %5907, float 0xFFF0000000000000, !dbg !258 + %5909 = fmul float %5761, 0x3FF7154760000000, !dbg !257 + %5910 = select i1 %5853, float %5909, float 0xFFF0000000000000, !dbg !258 + %5911 = fmul float %5762, 0x3FF7154760000000, !dbg !257 + %5912 = select i1 %5854, float %5911, float 0xFFF0000000000000, !dbg !258 + %5913 = fmul float %5763, 0x3FF7154760000000, !dbg !257 + %5914 = select i1 %5856, float %5913, float 0xFFF0000000000000, !dbg !258 + %5915 = fmul float %5764, 0x3FF7154760000000, !dbg !257 + %5916 = select i1 %5858, float %5915, float 0xFFF0000000000000, !dbg !258 + %5917 = fmul float %5765, 0x3FF7154760000000, !dbg !257 + %5918 = select i1 %5859, float %5917, float 0xFFF0000000000000, !dbg !258 + %5919 = fmul float %5766, 0x3FF7154760000000, !dbg !257 + %5920 = select i1 %5860, float %5919, float 0xFFF0000000000000, !dbg !258 + %5921 = fmul float %5767, 0x3FF7154760000000, !dbg !257 + %5922 = select i1 %5862, float %5921, float 0xFFF0000000000000, !dbg !258 + %5923 = fmul float %5768, 0x3FF7154760000000, !dbg !257 + %5924 = select i1 %5864, float %5923, float 0xFFF0000000000000, !dbg !258 + %5925 = fmul float %5769, 0x3FF7154760000000, !dbg !257 + %5926 = select i1 %5865, float %5925, float 0xFFF0000000000000, !dbg !258 + %5927 = fmul float %5770, 0x3FF7154760000000, !dbg !257 + %5928 = select i1 %5866, float %5927, float 0xFFF0000000000000, !dbg !258 + %5929 = fmul float %5771, 0x3FF7154760000000, !dbg !257 + %5930 = select i1 %5868, float %5929, float 0xFFF0000000000000, !dbg !258 + %5931 = fmul float %5772, 0x3FF7154760000000, !dbg !257 + %5932 = select i1 %5870, float %5931, float 0xFFF0000000000000, !dbg !258 + %5933 = fmul float %5773, 0x3FF7154760000000, !dbg !257 + %5934 = select i1 %5871, float %5933, float 0xFFF0000000000000, !dbg !258 + %5935 = fmul float %5774, 0x3FF7154760000000, !dbg !257 + %5936 = select i1 %5872, float %5935, float 0xFFF0000000000000, !dbg !258 + %5937 = fsub float %5874, %5340, !dbg !259 + %5938 = fsub float %5876, %5341, !dbg !259 + %5939 = fsub float %5878, %5340, !dbg !259 + %5940 = fsub float %5880, %5341, !dbg !259 + %5941 = fsub float %5882, %5342, !dbg !259 + %5942 = fsub float %5884, %5343, !dbg !259 + %5943 = fsub float %5886, %5342, !dbg !259 + %5944 = fsub float %5888, %5343, !dbg !259 + %5945 = fsub float %5890, %5344, !dbg !259 + %5946 = fsub float %5892, %5345, !dbg !259 + %5947 = fsub float %5894, %5344, !dbg !259 + %5948 = fsub float %5896, %5345, !dbg !259 + %5949 = fsub float %5898, %5346, !dbg !259 + %5950 = fsub float %5900, %5347, !dbg !259 + %5951 = fsub float %5902, %5346, !dbg !259 + %5952 = fsub float %5904, %5347, !dbg !259 + %5953 = fsub float %5906, %5348, !dbg !259 + %5954 = fsub float %5908, %5349, !dbg !259 + %5955 = fsub float %5910, %5348, !dbg !259 + %5956 = fsub float %5912, %5349, !dbg !259 + %5957 = fsub float %5914, %5350, !dbg !259 + %5958 = fsub float %5916, %5351, !dbg !259 + %5959 = fsub float %5918, %5350, !dbg !259 + %5960 = fsub float %5920, %5351, !dbg !259 + %5961 = fsub float %5922, %5352, !dbg !259 + %5962 = fsub float %5924, %5353, !dbg !259 + %5963 = fsub float %5926, %5352, !dbg !259 + %5964 = fsub float %5928, %5353, !dbg !259 + %5965 = fsub float %5930, %5354, !dbg !259 + %5966 = fsub float %5932, %5355, !dbg !259 + %5967 = fsub float %5934, %5354, !dbg !259 + %5968 = fsub float %5936, %5355, !dbg !259 + %5969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1220 = icmp eq i32 %5969, 0, !dbg !260 + br i1 %.not.i1220, label %5972, label %5970, !dbg !260 + +5970: ; preds = %.lr.ph + %5971 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5937) #3, !dbg !260 + br label %__nv_exp2f.exit1222, !dbg !260 + +5972: ; preds = %.lr.ph + %5973 = tail call float @llvm.nvvm.ex2.approx.f(float %5937) #3, !dbg !260 + br label %__nv_exp2f.exit1222, !dbg !260 + +__nv_exp2f.exit1222: ; preds = %5970, %5972 + %.0.i1221 = phi float [ %5971, %5970 ], [ %5973, %5972 ], !dbg !260 + %5974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1223 = icmp eq i32 %5974, 0, !dbg !260 + br i1 %.not.i1223, label %5977, label %5975, !dbg !260 + +5975: ; preds = %__nv_exp2f.exit1222 + %5976 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5938) #3, !dbg !260 + br label %__nv_exp2f.exit1225, !dbg !260 + +5977: ; preds = %__nv_exp2f.exit1222 + %5978 = tail call float @llvm.nvvm.ex2.approx.f(float %5938) #3, !dbg !260 + br label %__nv_exp2f.exit1225, !dbg !260 + +__nv_exp2f.exit1225: ; preds = %5975, %5977 + %.0.i1224 = phi float [ %5976, %5975 ], [ %5978, %5977 ], !dbg !260 + %5979 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1226 = icmp eq i32 %5979, 0, !dbg !260 + br i1 %.not.i1226, label %5982, label %5980, !dbg !260 + +5980: ; preds = %__nv_exp2f.exit1225 + %5981 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5939) #3, !dbg !260 + br label %__nv_exp2f.exit1228, !dbg !260 + +5982: ; preds = %__nv_exp2f.exit1225 + %5983 = tail call float @llvm.nvvm.ex2.approx.f(float %5939) #3, !dbg !260 + br label %__nv_exp2f.exit1228, !dbg !260 + +__nv_exp2f.exit1228: ; preds = %5980, %5982 + %.0.i1227 = phi float [ %5981, %5980 ], [ %5983, %5982 ], !dbg !260 + %5984 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1229 = icmp eq i32 %5984, 0, !dbg !260 + br i1 %.not.i1229, label %5987, label %5985, !dbg !260 + +5985: ; preds = %__nv_exp2f.exit1228 + %5986 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5940) #3, !dbg !260 + br label %__nv_exp2f.exit1231, !dbg !260 + +5987: ; preds = %__nv_exp2f.exit1228 + %5988 = tail call float @llvm.nvvm.ex2.approx.f(float %5940) #3, !dbg !260 + br label %__nv_exp2f.exit1231, !dbg !260 + +__nv_exp2f.exit1231: ; preds = %5985, %5987 + %.0.i1230 = phi float [ %5986, %5985 ], [ %5988, %5987 ], !dbg !260 + %5989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1232 = icmp eq i32 %5989, 0, !dbg !260 + br i1 %.not.i1232, label %5992, label %5990, !dbg !260 + +5990: ; preds = %__nv_exp2f.exit1231 + %5991 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5941) #3, !dbg !260 + br label %__nv_exp2f.exit1234, !dbg !260 + +5992: ; preds = %__nv_exp2f.exit1231 + %5993 = tail call float @llvm.nvvm.ex2.approx.f(float %5941) #3, !dbg !260 + br label %__nv_exp2f.exit1234, !dbg !260 + +__nv_exp2f.exit1234: ; preds = %5990, %5992 + %.0.i1233 = phi float [ %5991, %5990 ], [ %5993, %5992 ], !dbg !260 + %5994 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1235 = icmp eq i32 %5994, 0, !dbg !260 + br i1 %.not.i1235, label %5997, label %5995, !dbg !260 + +5995: ; preds = %__nv_exp2f.exit1234 + %5996 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5942) #3, !dbg !260 + br label %__nv_exp2f.exit1237, !dbg !260 + +5997: ; preds = %__nv_exp2f.exit1234 + %5998 = tail call float @llvm.nvvm.ex2.approx.f(float %5942) #3, !dbg !260 + br label %__nv_exp2f.exit1237, !dbg !260 + +__nv_exp2f.exit1237: ; preds = %5995, %5997 + %.0.i1236 = phi float [ %5996, %5995 ], [ %5998, %5997 ], !dbg !260 + %5999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1238 = icmp eq i32 %5999, 0, !dbg !260 + br i1 %.not.i1238, label %6002, label %6000, !dbg !260 + +6000: ; preds = %__nv_exp2f.exit1237 + %6001 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5943) #3, !dbg !260 + br label %__nv_exp2f.exit1240, !dbg !260 + +6002: ; preds = %__nv_exp2f.exit1237 + %6003 = tail call float @llvm.nvvm.ex2.approx.f(float %5943) #3, !dbg !260 + br label %__nv_exp2f.exit1240, !dbg !260 + +__nv_exp2f.exit1240: ; preds = %6000, %6002 + %.0.i1239 = phi float [ %6001, %6000 ], [ %6003, %6002 ], !dbg !260 + %6004 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1241 = icmp eq i32 %6004, 0, !dbg !260 + br i1 %.not.i1241, label %6007, label %6005, !dbg !260 + +6005: ; preds = %__nv_exp2f.exit1240 + %6006 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5944) #3, !dbg !260 + br label %__nv_exp2f.exit1243, !dbg !260 + +6007: ; preds = %__nv_exp2f.exit1240 + %6008 = tail call float @llvm.nvvm.ex2.approx.f(float %5944) #3, !dbg !260 + br label %__nv_exp2f.exit1243, !dbg !260 + +__nv_exp2f.exit1243: ; preds = %6005, %6007 + %.0.i1242 = phi float [ %6006, %6005 ], [ %6008, %6007 ], !dbg !260 + %6009 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1244 = icmp eq i32 %6009, 0, !dbg !260 + br i1 %.not.i1244, label %6012, label %6010, !dbg !260 + +6010: ; preds = %__nv_exp2f.exit1243 + %6011 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5945) #3, !dbg !260 + br label %__nv_exp2f.exit1246, !dbg !260 + +6012: ; preds = %__nv_exp2f.exit1243 + %6013 = tail call float @llvm.nvvm.ex2.approx.f(float %5945) #3, !dbg !260 + br label %__nv_exp2f.exit1246, !dbg !260 + +__nv_exp2f.exit1246: ; preds = %6010, %6012 + %.0.i1245 = phi float [ %6011, %6010 ], [ %6013, %6012 ], !dbg !260 + %6014 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1247 = icmp eq i32 %6014, 0, !dbg !260 + br i1 %.not.i1247, label %6017, label %6015, !dbg !260 + +6015: ; preds = %__nv_exp2f.exit1246 + %6016 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5946) #3, !dbg !260 + br label %__nv_exp2f.exit1249, !dbg !260 + +6017: ; preds = %__nv_exp2f.exit1246 + %6018 = tail call float @llvm.nvvm.ex2.approx.f(float %5946) #3, !dbg !260 + br label %__nv_exp2f.exit1249, !dbg !260 + +__nv_exp2f.exit1249: ; preds = %6015, %6017 + %.0.i1248 = phi float [ %6016, %6015 ], [ %6018, %6017 ], !dbg !260 + %6019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1250 = icmp eq i32 %6019, 0, !dbg !260 + br i1 %.not.i1250, label %6022, label %6020, !dbg !260 + +6020: ; preds = %__nv_exp2f.exit1249 + %6021 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5947) #3, !dbg !260 + br label %__nv_exp2f.exit1252, !dbg !260 + +6022: ; preds = %__nv_exp2f.exit1249 + %6023 = tail call float @llvm.nvvm.ex2.approx.f(float %5947) #3, !dbg !260 + br label %__nv_exp2f.exit1252, !dbg !260 + +__nv_exp2f.exit1252: ; preds = %6020, %6022 + %.0.i1251 = phi float [ %6021, %6020 ], [ %6023, %6022 ], !dbg !260 + %6024 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1253 = icmp eq i32 %6024, 0, !dbg !260 + br i1 %.not.i1253, label %6027, label %6025, !dbg !260 + +6025: ; preds = %__nv_exp2f.exit1252 + %6026 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5948) #3, !dbg !260 + br label %__nv_exp2f.exit1255, !dbg !260 + +6027: ; preds = %__nv_exp2f.exit1252 + %6028 = tail call float @llvm.nvvm.ex2.approx.f(float %5948) #3, !dbg !260 + br label %__nv_exp2f.exit1255, !dbg !260 + +__nv_exp2f.exit1255: ; preds = %6025, %6027 + %.0.i1254 = phi float [ %6026, %6025 ], [ %6028, %6027 ], !dbg !260 + %6029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1256 = icmp eq i32 %6029, 0, !dbg !260 + br i1 %.not.i1256, label %6032, label %6030, !dbg !260 + +6030: ; preds = %__nv_exp2f.exit1255 + %6031 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5949) #3, !dbg !260 + br label %__nv_exp2f.exit1258, !dbg !260 + +6032: ; preds = %__nv_exp2f.exit1255 + %6033 = tail call float @llvm.nvvm.ex2.approx.f(float %5949) #3, !dbg !260 + br label %__nv_exp2f.exit1258, !dbg !260 + +__nv_exp2f.exit1258: ; preds = %6030, %6032 + %.0.i1257 = phi float [ %6031, %6030 ], [ %6033, %6032 ], !dbg !260 + %6034 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1259 = icmp eq i32 %6034, 0, !dbg !260 + br i1 %.not.i1259, label %6037, label %6035, !dbg !260 + +6035: ; preds = %__nv_exp2f.exit1258 + %6036 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5950) #3, !dbg !260 + br label %__nv_exp2f.exit1261, !dbg !260 + +6037: ; preds = %__nv_exp2f.exit1258 + %6038 = tail call float @llvm.nvvm.ex2.approx.f(float %5950) #3, !dbg !260 + br label %__nv_exp2f.exit1261, !dbg !260 + +__nv_exp2f.exit1261: ; preds = %6035, %6037 + %.0.i1260 = phi float [ %6036, %6035 ], [ %6038, %6037 ], !dbg !260 + %6039 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1262 = icmp eq i32 %6039, 0, !dbg !260 + br i1 %.not.i1262, label %6042, label %6040, !dbg !260 + +6040: ; preds = %__nv_exp2f.exit1261 + %6041 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5951) #3, !dbg !260 + br label %__nv_exp2f.exit1264, !dbg !260 + +6042: ; preds = %__nv_exp2f.exit1261 + %6043 = tail call float @llvm.nvvm.ex2.approx.f(float %5951) #3, !dbg !260 + br label %__nv_exp2f.exit1264, !dbg !260 + +__nv_exp2f.exit1264: ; preds = %6040, %6042 + %.0.i1263 = phi float [ %6041, %6040 ], [ %6043, %6042 ], !dbg !260 + %6044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1265 = icmp eq i32 %6044, 0, !dbg !260 + br i1 %.not.i1265, label %6047, label %6045, !dbg !260 + +6045: ; preds = %__nv_exp2f.exit1264 + %6046 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5952) #3, !dbg !260 + br label %__nv_exp2f.exit1267, !dbg !260 + +6047: ; preds = %__nv_exp2f.exit1264 + %6048 = tail call float @llvm.nvvm.ex2.approx.f(float %5952) #3, !dbg !260 + br label %__nv_exp2f.exit1267, !dbg !260 + +__nv_exp2f.exit1267: ; preds = %6045, %6047 + %.0.i1266 = phi float [ %6046, %6045 ], [ %6048, %6047 ], !dbg !260 + %6049 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1268 = icmp eq i32 %6049, 0, !dbg !260 + br i1 %.not.i1268, label %6052, label %6050, !dbg !260 + +6050: ; preds = %__nv_exp2f.exit1267 + %6051 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5953) #3, !dbg !260 + br label %__nv_exp2f.exit1270, !dbg !260 + +6052: ; preds = %__nv_exp2f.exit1267 + %6053 = tail call float @llvm.nvvm.ex2.approx.f(float %5953) #3, !dbg !260 + br label %__nv_exp2f.exit1270, !dbg !260 + +__nv_exp2f.exit1270: ; preds = %6050, %6052 + %.0.i1269 = phi float [ %6051, %6050 ], [ %6053, %6052 ], !dbg !260 + %6054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1271 = icmp eq i32 %6054, 0, !dbg !260 + br i1 %.not.i1271, label %6057, label %6055, !dbg !260 + +6055: ; preds = %__nv_exp2f.exit1270 + %6056 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5954) #3, !dbg !260 + br label %__nv_exp2f.exit1273, !dbg !260 + +6057: ; preds = %__nv_exp2f.exit1270 + %6058 = tail call float @llvm.nvvm.ex2.approx.f(float %5954) #3, !dbg !260 + br label %__nv_exp2f.exit1273, !dbg !260 + +__nv_exp2f.exit1273: ; preds = %6055, %6057 + %.0.i1272 = phi float [ %6056, %6055 ], [ %6058, %6057 ], !dbg !260 + %6059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1274 = icmp eq i32 %6059, 0, !dbg !260 + br i1 %.not.i1274, label %6062, label %6060, !dbg !260 + +6060: ; preds = %__nv_exp2f.exit1273 + %6061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5955) #3, !dbg !260 + br label %__nv_exp2f.exit1276, !dbg !260 + +6062: ; preds = %__nv_exp2f.exit1273 + %6063 = tail call float @llvm.nvvm.ex2.approx.f(float %5955) #3, !dbg !260 + br label %__nv_exp2f.exit1276, !dbg !260 + +__nv_exp2f.exit1276: ; preds = %6060, %6062 + %.0.i1275 = phi float [ %6061, %6060 ], [ %6063, %6062 ], !dbg !260 + %6064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1277 = icmp eq i32 %6064, 0, !dbg !260 + br i1 %.not.i1277, label %6067, label %6065, !dbg !260 + +6065: ; preds = %__nv_exp2f.exit1276 + %6066 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5956) #3, !dbg !260 + br label %__nv_exp2f.exit1279, !dbg !260 + +6067: ; preds = %__nv_exp2f.exit1276 + %6068 = tail call float @llvm.nvvm.ex2.approx.f(float %5956) #3, !dbg !260 + br label %__nv_exp2f.exit1279, !dbg !260 + +__nv_exp2f.exit1279: ; preds = %6065, %6067 + %.0.i1278 = phi float [ %6066, %6065 ], [ %6068, %6067 ], !dbg !260 + %6069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1280 = icmp eq i32 %6069, 0, !dbg !260 + br i1 %.not.i1280, label %6072, label %6070, !dbg !260 + +6070: ; preds = %__nv_exp2f.exit1279 + %6071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5957) #3, !dbg !260 + br label %__nv_exp2f.exit1282, !dbg !260 + +6072: ; preds = %__nv_exp2f.exit1279 + %6073 = tail call float @llvm.nvvm.ex2.approx.f(float %5957) #3, !dbg !260 + br label %__nv_exp2f.exit1282, !dbg !260 + +__nv_exp2f.exit1282: ; preds = %6070, %6072 + %.0.i1281 = phi float [ %6071, %6070 ], [ %6073, %6072 ], !dbg !260 + %6074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1283 = icmp eq i32 %6074, 0, !dbg !260 + br i1 %.not.i1283, label %6077, label %6075, !dbg !260 + +6075: ; preds = %__nv_exp2f.exit1282 + %6076 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5958) #3, !dbg !260 + br label %__nv_exp2f.exit1285, !dbg !260 + +6077: ; preds = %__nv_exp2f.exit1282 + %6078 = tail call float @llvm.nvvm.ex2.approx.f(float %5958) #3, !dbg !260 + br label %__nv_exp2f.exit1285, !dbg !260 + +__nv_exp2f.exit1285: ; preds = %6075, %6077 + %.0.i1284 = phi float [ %6076, %6075 ], [ %6078, %6077 ], !dbg !260 + %6079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1286 = icmp eq i32 %6079, 0, !dbg !260 + br i1 %.not.i1286, label %6082, label %6080, !dbg !260 + +6080: ; preds = %__nv_exp2f.exit1285 + %6081 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5959) #3, !dbg !260 + br label %__nv_exp2f.exit1288, !dbg !260 + +6082: ; preds = %__nv_exp2f.exit1285 + %6083 = tail call float @llvm.nvvm.ex2.approx.f(float %5959) #3, !dbg !260 + br label %__nv_exp2f.exit1288, !dbg !260 + +__nv_exp2f.exit1288: ; preds = %6080, %6082 + %.0.i1287 = phi float [ %6081, %6080 ], [ %6083, %6082 ], !dbg !260 + %6084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1289 = icmp eq i32 %6084, 0, !dbg !260 + br i1 %.not.i1289, label %6087, label %6085, !dbg !260 + +6085: ; preds = %__nv_exp2f.exit1288 + %6086 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5960) #3, !dbg !260 + br label %__nv_exp2f.exit1291, !dbg !260 + +6087: ; preds = %__nv_exp2f.exit1288 + %6088 = tail call float @llvm.nvvm.ex2.approx.f(float %5960) #3, !dbg !260 + br label %__nv_exp2f.exit1291, !dbg !260 + +__nv_exp2f.exit1291: ; preds = %6085, %6087 + %.0.i1290 = phi float [ %6086, %6085 ], [ %6088, %6087 ], !dbg !260 + %6089 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1292 = icmp eq i32 %6089, 0, !dbg !260 + br i1 %.not.i1292, label %6092, label %6090, !dbg !260 + +6090: ; preds = %__nv_exp2f.exit1291 + %6091 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5961) #3, !dbg !260 + br label %__nv_exp2f.exit1294, !dbg !260 + +6092: ; preds = %__nv_exp2f.exit1291 + %6093 = tail call float @llvm.nvvm.ex2.approx.f(float %5961) #3, !dbg !260 + br label %__nv_exp2f.exit1294, !dbg !260 + +__nv_exp2f.exit1294: ; preds = %6090, %6092 + %.0.i1293 = phi float [ %6091, %6090 ], [ %6093, %6092 ], !dbg !260 + %6094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1295 = icmp eq i32 %6094, 0, !dbg !260 + br i1 %.not.i1295, label %6097, label %6095, !dbg !260 + +6095: ; preds = %__nv_exp2f.exit1294 + %6096 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5962) #3, !dbg !260 + br label %__nv_exp2f.exit1297, !dbg !260 + +6097: ; preds = %__nv_exp2f.exit1294 + %6098 = tail call float @llvm.nvvm.ex2.approx.f(float %5962) #3, !dbg !260 + br label %__nv_exp2f.exit1297, !dbg !260 + +__nv_exp2f.exit1297: ; preds = %6095, %6097 + %.0.i1296 = phi float [ %6096, %6095 ], [ %6098, %6097 ], !dbg !260 + %6099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1298 = icmp eq i32 %6099, 0, !dbg !260 + br i1 %.not.i1298, label %6102, label %6100, !dbg !260 + +6100: ; preds = %__nv_exp2f.exit1297 + %6101 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5963) #3, !dbg !260 + br label %__nv_exp2f.exit1300, !dbg !260 + +6102: ; preds = %__nv_exp2f.exit1297 + %6103 = tail call float @llvm.nvvm.ex2.approx.f(float %5963) #3, !dbg !260 + br label %__nv_exp2f.exit1300, !dbg !260 + +__nv_exp2f.exit1300: ; preds = %6100, %6102 + %.0.i1299 = phi float [ %6101, %6100 ], [ %6103, %6102 ], !dbg !260 + %6104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1301 = icmp eq i32 %6104, 0, !dbg !260 + br i1 %.not.i1301, label %6107, label %6105, !dbg !260 + +6105: ; preds = %__nv_exp2f.exit1300 + %6106 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5964) #3, !dbg !260 + br label %__nv_exp2f.exit1303, !dbg !260 + +6107: ; preds = %__nv_exp2f.exit1300 + %6108 = tail call float @llvm.nvvm.ex2.approx.f(float %5964) #3, !dbg !260 + br label %__nv_exp2f.exit1303, !dbg !260 + +__nv_exp2f.exit1303: ; preds = %6105, %6107 + %.0.i1302 = phi float [ %6106, %6105 ], [ %6108, %6107 ], !dbg !260 + %6109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1304 = icmp eq i32 %6109, 0, !dbg !260 + br i1 %.not.i1304, label %6112, label %6110, !dbg !260 + +6110: ; preds = %__nv_exp2f.exit1303 + %6111 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5965) #3, !dbg !260 + br label %__nv_exp2f.exit1306, !dbg !260 + +6112: ; preds = %__nv_exp2f.exit1303 + %6113 = tail call float @llvm.nvvm.ex2.approx.f(float %5965) #3, !dbg !260 + br label %__nv_exp2f.exit1306, !dbg !260 + +__nv_exp2f.exit1306: ; preds = %6110, %6112 + %.0.i1305 = phi float [ %6111, %6110 ], [ %6113, %6112 ], !dbg !260 + %6114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1307 = icmp eq i32 %6114, 0, !dbg !260 + br i1 %.not.i1307, label %6117, label %6115, !dbg !260 + +6115: ; preds = %__nv_exp2f.exit1306 + %6116 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5966) #3, !dbg !260 + br label %__nv_exp2f.exit1309, !dbg !260 + +6117: ; preds = %__nv_exp2f.exit1306 + %6118 = tail call float @llvm.nvvm.ex2.approx.f(float %5966) #3, !dbg !260 + br label %__nv_exp2f.exit1309, !dbg !260 + +__nv_exp2f.exit1309: ; preds = %6115, %6117 + %.0.i1308 = phi float [ %6116, %6115 ], [ %6118, %6117 ], !dbg !260 + %6119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1310 = icmp eq i32 %6119, 0, !dbg !260 + br i1 %.not.i1310, label %6122, label %6120, !dbg !260 + +6120: ; preds = %__nv_exp2f.exit1309 + %6121 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5967) #3, !dbg !260 + br label %__nv_exp2f.exit1312, !dbg !260 + +6122: ; preds = %__nv_exp2f.exit1309 + %6123 = tail call float @llvm.nvvm.ex2.approx.f(float %5967) #3, !dbg !260 + br label %__nv_exp2f.exit1312, !dbg !260 + +__nv_exp2f.exit1312: ; preds = %6120, %6122 + %.0.i1311 = phi float [ %6121, %6120 ], [ %6123, %6122 ], !dbg !260 + %6124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1313 = icmp eq i32 %6124, 0, !dbg !260 + br i1 %.not.i1313, label %6127, label %6125, !dbg !260 + +6125: ; preds = %__nv_exp2f.exit1312 + %6126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5968) #3, !dbg !260 + br label %__nv_exp2f.exit1315, !dbg !260 + +6127: ; preds = %__nv_exp2f.exit1312 + %6128 = tail call float @llvm.nvvm.ex2.approx.f(float %5968) #3, !dbg !260 + br label %__nv_exp2f.exit1315, !dbg !260 + +__nv_exp2f.exit1315: ; preds = %6125, %6127 + %.0.i1314 = phi float [ %6126, %6125 ], [ %6128, %6127 ], !dbg !260 + %6129 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5288, !dbg !244 + %6130 = insertelement <2 x float> poison, float %.0.i1221, i64 0, !dbg !261 + %6131 = insertelement <2 x float> %6130, float %.0.i1224, i64 1, !dbg !261 + %6132 = fptrunc <2 x float> %6131 to <2 x bfloat>, !dbg !261 + %6133 = insertelement <2 x float> poison, float %.0.i1227, i64 0, !dbg !261 + %6134 = insertelement <2 x float> %6133, float %.0.i1230, i64 1, !dbg !261 + %6135 = fptrunc <2 x float> %6134 to <2 x bfloat>, !dbg !261 + %6136 = insertelement <2 x float> poison, float %.0.i1233, i64 0, !dbg !261 + %6137 = insertelement <2 x float> %6136, float %.0.i1236, i64 1, !dbg !261 + %6138 = fptrunc <2 x float> %6137 to <2 x bfloat>, !dbg !261 + %6139 = insertelement <2 x float> poison, float %.0.i1239, i64 0, !dbg !261 + %6140 = insertelement <2 x float> %6139, float %.0.i1242, i64 1, !dbg !261 + %6141 = fptrunc <2 x float> %6140 to <2 x bfloat>, !dbg !261 + %6142 = insertelement <2 x float> poison, float %.0.i1245, i64 0, !dbg !261 + %6143 = insertelement <2 x float> %6142, float %.0.i1248, i64 1, !dbg !261 + %6144 = fptrunc <2 x float> %6143 to <2 x bfloat>, !dbg !261 + %6145 = insertelement <2 x float> poison, float %.0.i1251, i64 0, !dbg !261 + %6146 = insertelement <2 x float> %6145, float %.0.i1254, i64 1, !dbg !261 + %6147 = fptrunc <2 x float> %6146 to <2 x bfloat>, !dbg !261 + %6148 = insertelement <2 x float> poison, float %.0.i1257, i64 0, !dbg !261 + %6149 = insertelement <2 x float> %6148, float %.0.i1260, i64 1, !dbg !261 + %6150 = fptrunc <2 x float> %6149 to <2 x bfloat>, !dbg !261 + %6151 = insertelement <2 x float> poison, float %.0.i1263, i64 0, !dbg !261 + %6152 = insertelement <2 x float> %6151, float %.0.i1266, i64 1, !dbg !261 + %6153 = fptrunc <2 x float> %6152 to <2 x bfloat>, !dbg !261 + %6154 = insertelement <2 x float> poison, float %.0.i1269, i64 0, !dbg !261 + %6155 = insertelement <2 x float> %6154, float %.0.i1272, i64 1, !dbg !261 + %6156 = fptrunc <2 x float> %6155 to <2 x bfloat>, !dbg !261 + %6157 = insertelement <2 x float> poison, float %.0.i1275, i64 0, !dbg !261 + %6158 = insertelement <2 x float> %6157, float %.0.i1278, i64 1, !dbg !261 + %6159 = fptrunc <2 x float> %6158 to <2 x bfloat>, !dbg !261 + %6160 = insertelement <2 x float> poison, float %.0.i1281, i64 0, !dbg !261 + %6161 = insertelement <2 x float> %6160, float %.0.i1284, i64 1, !dbg !261 + %6162 = fptrunc <2 x float> %6161 to <2 x bfloat>, !dbg !261 + %6163 = insertelement <2 x float> poison, float %.0.i1287, i64 0, !dbg !261 + %6164 = insertelement <2 x float> %6163, float %.0.i1290, i64 1, !dbg !261 + %6165 = fptrunc <2 x float> %6164 to <2 x bfloat>, !dbg !261 + %6166 = insertelement <2 x float> poison, float %.0.i1293, i64 0, !dbg !261 + %6167 = insertelement <2 x float> %6166, float %.0.i1296, i64 1, !dbg !261 + %6168 = fptrunc <2 x float> %6167 to <2 x bfloat>, !dbg !261 + %6169 = insertelement <2 x float> poison, float %.0.i1299, i64 0, !dbg !261 + %6170 = insertelement <2 x float> %6169, float %.0.i1302, i64 1, !dbg !261 + %6171 = fptrunc <2 x float> %6170 to <2 x bfloat>, !dbg !261 + %6172 = insertelement <2 x float> poison, float %.0.i1305, i64 0, !dbg !261 + %6173 = insertelement <2 x float> %6172, float %.0.i1308, i64 1, !dbg !261 + %6174 = fptrunc <2 x float> %6173 to <2 x bfloat>, !dbg !261 + %6175 = insertelement <2 x float> poison, float %.0.i1311, i64 0, !dbg !261 + %6176 = insertelement <2 x float> %6175, float %.0.i1314, i64 1, !dbg !261 + %6177 = fptrunc <2 x float> %6176 to <2 x bfloat>, !dbg !261 + %6178 = bitcast <2 x bfloat> %6132 to i32, !dbg !262 + %6179 = bitcast <2 x bfloat> %6135 to i32, !dbg !262 + %6180 = bitcast <2 x bfloat> %6138 to i32, !dbg !262 + %6181 = bitcast <2 x bfloat> %6141 to i32, !dbg !262 + %6182 = bitcast <2 x bfloat> %6144 to i32, !dbg !262 + %6183 = bitcast <2 x bfloat> %6147 to i32, !dbg !262 + %6184 = bitcast <2 x bfloat> %6150 to i32, !dbg !262 + %6185 = bitcast <2 x bfloat> %6153 to i32, !dbg !262 + %6186 = bitcast <2 x bfloat> %6156 to i32, !dbg !262 + %6187 = bitcast <2 x bfloat> %6159 to i32, !dbg !262 + %6188 = bitcast <2 x bfloat> %6162 to i32, !dbg !262 + %6189 = bitcast <2 x bfloat> %6165 to i32, !dbg !262 + %6190 = bitcast <2 x bfloat> %6168 to i32, !dbg !262 + %6191 = bitcast <2 x bfloat> %6171 to i32, !dbg !262 + %6192 = bitcast <2 x bfloat> %6174 to i32, !dbg !262 + %6193 = bitcast <2 x bfloat> %6177 to i32, !dbg !262 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !262 + %6194 = ptrtoint ptr addrspace(3) %6129 to i32, !dbg !262 + %6195 = lshr exact i32 %6194, 4, !dbg !262 + %6196 = and i32 %6195, 16383, !dbg !262 + %6197 = zext nneg i32 %6196 to i64, !dbg !262 + %6198 = or disjoint i64 %6197, 4611686293338849280, !dbg !262 + %6199 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5150, float %5151, float %5152, float %5153, float %5154, float %5155, float %5156, float %5157, float %5158, float %5159, float %5160, float %5161, float %5162, float %5163, float %5164, float %5165, float %5166, float %5167, float %5168, float %5169, float %5170, float %5171, float %5172, float %5173, float %5174, float %5175, float %5176, float %5177, float %5178, float %5179, float %5180, float %5181, float %5182, float %5183, float %5184, float %5185, float %5186, float %5187, float %5188, float %5189, float %5190, float %5191, float %5192, float %5193, float %5194, float %5195, float %5196, float %5197, float %5198, float %5199, float %5200, float %5201, float %5202, float %5203, float %5204, float %5205, float %5206, float %5207, float %5208, float %5209, float %5210, float %5211, float %5212, float %5213, i32 %6178, i32 %6179, i32 %6180, i32 %6181, i64 %6198, i1 true) #3, !dbg !262 + %6200 = add i32 %6194, 2048, !dbg !262 + %6201 = lshr exact i32 %6200, 4, !dbg !262 + %6202 = and i32 %6201, 16383, !dbg !262 + %6203 = zext nneg i32 %6202 to i64, !dbg !262 + %6204 = or disjoint i64 %6203, 4611686293338849280, !dbg !262 + %6205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 0, !dbg !262 + %6206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 1, !dbg !262 + %6207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 2, !dbg !262 + %6208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 3, !dbg !262 + %6209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 4, !dbg !262 + %6210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 5, !dbg !262 + %6211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 6, !dbg !262 + %6212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 7, !dbg !262 + %6213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 8, !dbg !262 + %6214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 9, !dbg !262 + %6215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 10, !dbg !262 + %6216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 11, !dbg !262 + %6217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 12, !dbg !262 + %6218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 13, !dbg !262 + %6219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 14, !dbg !262 + %6220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 15, !dbg !262 + %6221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 16, !dbg !262 + %6222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 17, !dbg !262 + %6223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 18, !dbg !262 + %6224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 19, !dbg !262 + %6225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 20, !dbg !262 + %6226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 21, !dbg !262 + %6227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 22, !dbg !262 + %6228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 23, !dbg !262 + %6229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 24, !dbg !262 + %6230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 25, !dbg !262 + %6231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 26, !dbg !262 + %6232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 27, !dbg !262 + %6233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 28, !dbg !262 + %6234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 29, !dbg !262 + %6235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 30, !dbg !262 + %6236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 31, !dbg !262 + %6237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 32, !dbg !262 + %6238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 33, !dbg !262 + %6239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 34, !dbg !262 + %6240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 35, !dbg !262 + %6241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 36, !dbg !262 + %6242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 37, !dbg !262 + %6243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 38, !dbg !262 + %6244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 39, !dbg !262 + %6245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 40, !dbg !262 + %6246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 41, !dbg !262 + %6247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 42, !dbg !262 + %6248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 43, !dbg !262 + %6249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 44, !dbg !262 + %6250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 45, !dbg !262 + %6251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 46, !dbg !262 + %6252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 47, !dbg !262 + %6253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 48, !dbg !262 + %6254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 49, !dbg !262 + %6255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 50, !dbg !262 + %6256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 51, !dbg !262 + %6257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 52, !dbg !262 + %6258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 53, !dbg !262 + %6259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 54, !dbg !262 + %6260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 55, !dbg !262 + %6261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 56, !dbg !262 + %6262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 57, !dbg !262 + %6263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 58, !dbg !262 + %6264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 59, !dbg !262 + %6265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 60, !dbg !262 + %6266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 61, !dbg !262 + %6267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 62, !dbg !262 + %6268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 63, !dbg !262 + %6269 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6205, float %6206, float %6207, float %6208, float %6209, float %6210, float %6211, float %6212, float %6213, float %6214, float %6215, float %6216, float %6217, float %6218, float %6219, float %6220, float %6221, float %6222, float %6223, float %6224, float %6225, float %6226, float %6227, float %6228, float %6229, float %6230, float %6231, float %6232, float %6233, float %6234, float %6235, float %6236, float %6237, float %6238, float %6239, float %6240, float %6241, float %6242, float %6243, float %6244, float %6245, float %6246, float %6247, float %6248, float %6249, float %6250, float %6251, float %6252, float %6253, float %6254, float %6255, float %6256, float %6257, float %6258, float %6259, float %6260, float %6261, float %6262, float %6263, float %6264, float %6265, float %6266, float %6267, float %6268, i32 %6182, i32 %6183, i32 %6184, i32 %6185, i64 %6204, i1 true) #3, !dbg !262 + %6270 = add i32 %6194, 4096, !dbg !262 + %6271 = lshr exact i32 %6270, 4, !dbg !262 + %6272 = and i32 %6271, 16383, !dbg !262 + %6273 = zext nneg i32 %6272 to i64, !dbg !262 + %6274 = or disjoint i64 %6273, 4611686293338849280, !dbg !262 + %6275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 0, !dbg !262 + %6276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 1, !dbg !262 + %6277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 2, !dbg !262 + %6278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 3, !dbg !262 + %6279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 4, !dbg !262 + %6280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 5, !dbg !262 + %6281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 6, !dbg !262 + %6282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 7, !dbg !262 + %6283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 8, !dbg !262 + %6284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 9, !dbg !262 + %6285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 10, !dbg !262 + %6286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 11, !dbg !262 + %6287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 12, !dbg !262 + %6288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 13, !dbg !262 + %6289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 14, !dbg !262 + %6290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 15, !dbg !262 + %6291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 16, !dbg !262 + %6292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 17, !dbg !262 + %6293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 18, !dbg !262 + %6294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 19, !dbg !262 + %6295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 20, !dbg !262 + %6296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 21, !dbg !262 + %6297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 22, !dbg !262 + %6298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 23, !dbg !262 + %6299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 24, !dbg !262 + %6300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 25, !dbg !262 + %6301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 26, !dbg !262 + %6302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 27, !dbg !262 + %6303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 28, !dbg !262 + %6304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 29, !dbg !262 + %6305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 30, !dbg !262 + %6306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 31, !dbg !262 + %6307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 32, !dbg !262 + %6308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 33, !dbg !262 + %6309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 34, !dbg !262 + %6310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 35, !dbg !262 + %6311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 36, !dbg !262 + %6312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 37, !dbg !262 + %6313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 38, !dbg !262 + %6314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 39, !dbg !262 + %6315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 40, !dbg !262 + %6316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 41, !dbg !262 + %6317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 42, !dbg !262 + %6318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 43, !dbg !262 + %6319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 44, !dbg !262 + %6320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 45, !dbg !262 + %6321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 46, !dbg !262 + %6322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 47, !dbg !262 + %6323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 48, !dbg !262 + %6324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 49, !dbg !262 + %6325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 50, !dbg !262 + %6326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 51, !dbg !262 + %6327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 52, !dbg !262 + %6328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 53, !dbg !262 + %6329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 54, !dbg !262 + %6330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 55, !dbg !262 + %6331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 56, !dbg !262 + %6332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 57, !dbg !262 + %6333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 58, !dbg !262 + %6334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 59, !dbg !262 + %6335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 60, !dbg !262 + %6336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 61, !dbg !262 + %6337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 62, !dbg !262 + %6338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 63, !dbg !262 + %6339 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6275, float %6276, float %6277, float %6278, float %6279, float %6280, float %6281, float %6282, float %6283, float %6284, float %6285, float %6286, float %6287, float %6288, float %6289, float %6290, float %6291, float %6292, float %6293, float %6294, float %6295, float %6296, float %6297, float %6298, float %6299, float %6300, float %6301, float %6302, float %6303, float %6304, float %6305, float %6306, float %6307, float %6308, float %6309, float %6310, float %6311, float %6312, float %6313, float %6314, float %6315, float %6316, float %6317, float %6318, float %6319, float %6320, float %6321, float %6322, float %6323, float %6324, float %6325, float %6326, float %6327, float %6328, float %6329, float %6330, float %6331, float %6332, float %6333, float %6334, float %6335, float %6336, float %6337, float %6338, i32 %6186, i32 %6187, i32 %6188, i32 %6189, i64 %6274, i1 true) #3, !dbg !262 + %6340 = add i32 %6194, 6144, !dbg !262 + %6341 = lshr exact i32 %6340, 4, !dbg !262 + %6342 = and i32 %6341, 16383, !dbg !262 + %6343 = zext nneg i32 %6342 to i64, !dbg !262 + %6344 = or disjoint i64 %6343, 4611686293338849280, !dbg !262 + %6345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 0, !dbg !262 + %6346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 1, !dbg !262 + %6347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 2, !dbg !262 + %6348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 3, !dbg !262 + %6349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 4, !dbg !262 + %6350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 5, !dbg !262 + %6351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 6, !dbg !262 + %6352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 7, !dbg !262 + %6353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 8, !dbg !262 + %6354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 9, !dbg !262 + %6355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 10, !dbg !262 + %6356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 11, !dbg !262 + %6357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 12, !dbg !262 + %6358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 13, !dbg !262 + %6359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 14, !dbg !262 + %6360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 15, !dbg !262 + %6361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 16, !dbg !262 + %6362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 17, !dbg !262 + %6363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 18, !dbg !262 + %6364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 19, !dbg !262 + %6365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 20, !dbg !262 + %6366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 21, !dbg !262 + %6367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 22, !dbg !262 + %6368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 23, !dbg !262 + %6369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 24, !dbg !262 + %6370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 25, !dbg !262 + %6371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 26, !dbg !262 + %6372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 27, !dbg !262 + %6373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 28, !dbg !262 + %6374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 29, !dbg !262 + %6375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 30, !dbg !262 + %6376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 31, !dbg !262 + %6377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 32, !dbg !262 + %6378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 33, !dbg !262 + %6379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 34, !dbg !262 + %6380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 35, !dbg !262 + %6381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 36, !dbg !262 + %6382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 37, !dbg !262 + %6383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 38, !dbg !262 + %6384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 39, !dbg !262 + %6385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 40, !dbg !262 + %6386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 41, !dbg !262 + %6387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 42, !dbg !262 + %6388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 43, !dbg !262 + %6389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 44, !dbg !262 + %6390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 45, !dbg !262 + %6391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 46, !dbg !262 + %6392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 47, !dbg !262 + %6393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 48, !dbg !262 + %6394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 49, !dbg !262 + %6395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 50, !dbg !262 + %6396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 51, !dbg !262 + %6397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 52, !dbg !262 + %6398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 53, !dbg !262 + %6399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 54, !dbg !262 + %6400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 55, !dbg !262 + %6401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 56, !dbg !262 + %6402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 57, !dbg !262 + %6403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 58, !dbg !262 + %6404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 59, !dbg !262 + %6405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 60, !dbg !262 + %6406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 61, !dbg !262 + %6407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 62, !dbg !262 + %6408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 63, !dbg !262 + %6409 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6345, float %6346, float %6347, float %6348, float %6349, float %6350, float %6351, float %6352, float %6353, float %6354, float %6355, float %6356, float %6357, float %6358, float %6359, float %6360, float %6361, float %6362, float %6363, float %6364, float %6365, float %6366, float %6367, float %6368, float %6369, float %6370, float %6371, float %6372, float %6373, float %6374, float %6375, float %6376, float %6377, float %6378, float %6379, float %6380, float %6381, float %6382, float %6383, float %6384, float %6385, float %6386, float %6387, float %6388, float %6389, float %6390, float %6391, float %6392, float %6393, float %6394, float %6395, float %6396, float %6397, float %6398, float %6399, float %6400, float %6401, float %6402, float %6403, float %6404, float %6405, float %6406, float %6407, float %6408, i32 %6190, i32 %6191, i32 %6192, i32 %6193, i64 %6344, i1 true) #3, !dbg !262 + %6410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 0, !dbg !262 + %6411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 1, !dbg !262 + %6412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 2, !dbg !262 + %6413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 3, !dbg !262 + %6414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 4, !dbg !262 + %6415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 5, !dbg !262 + %6416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 6, !dbg !262 + %6417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 7, !dbg !262 + %6418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 8, !dbg !262 + %6419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 9, !dbg !262 + %6420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 10, !dbg !262 + %6421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 11, !dbg !262 + %6422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 12, !dbg !262 + %6423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 13, !dbg !262 + %6424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 14, !dbg !262 + %6425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 15, !dbg !262 + %6426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 16, !dbg !262 + %6427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 17, !dbg !262 + %6428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 18, !dbg !262 + %6429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 19, !dbg !262 + %6430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 20, !dbg !262 + %6431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 21, !dbg !262 + %6432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 22, !dbg !262 + %6433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 23, !dbg !262 + %6434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 24, !dbg !262 + %6435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 25, !dbg !262 + %6436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 26, !dbg !262 + %6437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 27, !dbg !262 + %6438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 28, !dbg !262 + %6439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 29, !dbg !262 + %6440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 30, !dbg !262 + %6441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 31, !dbg !262 + %6442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 32, !dbg !262 + %6443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 33, !dbg !262 + %6444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 34, !dbg !262 + %6445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 35, !dbg !262 + %6446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 36, !dbg !262 + %6447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 37, !dbg !262 + %6448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 38, !dbg !262 + %6449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 39, !dbg !262 + %6450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 40, !dbg !262 + %6451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 41, !dbg !262 + %6452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 42, !dbg !262 + %6453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 43, !dbg !262 + %6454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 44, !dbg !262 + %6455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 45, !dbg !262 + %6456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 46, !dbg !262 + %6457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 47, !dbg !262 + %6458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 48, !dbg !262 + %6459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 49, !dbg !262 + %6460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 50, !dbg !262 + %6461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 51, !dbg !262 + %6462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 52, !dbg !262 + %6463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 53, !dbg !262 + %6464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 54, !dbg !262 + %6465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 55, !dbg !262 + %6466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 56, !dbg !262 + %6467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 57, !dbg !262 + %6468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 58, !dbg !262 + %6469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 59, !dbg !262 + %6470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 60, !dbg !262 + %6471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 61, !dbg !262 + %6472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 62, !dbg !262 + %6473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 63, !dbg !262 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !262 + %6474 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5290, !dbg !248 + %6475 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4812, !dbg !248 + %6476 = load float, ptr addrspace(3) %6475, align 8, !dbg !248 + %6477 = getelementptr inbounds nuw i8, ptr addrspace(3) %6475, i32 4, !dbg !248 + %6478 = load float, ptr addrspace(3) %6477, align 4, !dbg !248 + %6479 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4815, !dbg !248 + %6480 = load float, ptr addrspace(3) %6479, align 8, !dbg !248 + %6481 = getelementptr inbounds nuw i8, ptr addrspace(3) %6479, i32 4, !dbg !248 + %6482 = load float, ptr addrspace(3) %6481, align 4, !dbg !248 + %6483 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4817, !dbg !248 + %6484 = load float, ptr addrspace(3) %6483, align 8, !dbg !248 + %6485 = getelementptr inbounds nuw i8, ptr addrspace(3) %6483, i32 4, !dbg !248 + %6486 = load float, ptr addrspace(3) %6485, align 4, !dbg !248 + %6487 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4819, !dbg !248 + %6488 = load float, ptr addrspace(3) %6487, align 8, !dbg !248 + %6489 = getelementptr inbounds nuw i8, ptr addrspace(3) %6487, i32 4, !dbg !248 + %6490 = load float, ptr addrspace(3) %6489, align 4, !dbg !248 + %6491 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4821, !dbg !248 + %6492 = load float, ptr addrspace(3) %6491, align 8, !dbg !248 + %6493 = getelementptr inbounds nuw i8, ptr addrspace(3) %6491, i32 4, !dbg !248 + %6494 = load float, ptr addrspace(3) %6493, align 4, !dbg !248 + %6495 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4823, !dbg !248 + %6496 = load float, ptr addrspace(3) %6495, align 8, !dbg !248 + %6497 = getelementptr inbounds nuw i8, ptr addrspace(3) %6495, i32 4, !dbg !248 + %6498 = load float, ptr addrspace(3) %6497, align 4, !dbg !248 + %6499 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4825, !dbg !248 + %6500 = load float, ptr addrspace(3) %6499, align 8, !dbg !248 + %6501 = getelementptr inbounds nuw i8, ptr addrspace(3) %6499, i32 4, !dbg !248 + %6502 = load float, ptr addrspace(3) %6501, align 4, !dbg !248 + %6503 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4827, !dbg !248 + %6504 = load float, ptr addrspace(3) %6503, align 8, !dbg !248 + %6505 = getelementptr inbounds nuw i8, ptr addrspace(3) %6503, i32 4, !dbg !248 + %6506 = load float, ptr addrspace(3) %6505, align 4, !dbg !248 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !263 + %6507 = add i32 %5358, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6508 = lshr exact i32 %6507, 4, !dbg !263 + %6509 = and i32 %6508, 16383, !dbg !263 + %6510 = zext nneg i32 %6509 to i64, !dbg !263 + %6511 = or disjoint i64 %6510, 4611686293372403712, !dbg !263 + %6512 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %6511, i64 %6198) #3, !dbg !263 + %6513 = add i32 %5370, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6514 = lshr exact i32 %6513, 4, !dbg !263 + %6515 = and i32 %6514, 16383, !dbg !263 + %6516 = zext nneg i32 %6515 to i64, !dbg !263 + %6517 = or disjoint i64 %6516, 4611686293372403712, !dbg !263 + %6518 = add i32 %6194, 32, !dbg !263 + %6519 = lshr exact i32 %6518, 4, !dbg !263 + %6520 = and i32 %6519, 16383, !dbg !263 + %6521 = zext nneg i32 %6520 to i64, !dbg !263 + %6522 = or disjoint i64 %6521, 4611686293338849280, !dbg !263 + %6523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 0, !dbg !263 + %6524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 1, !dbg !263 + %6525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 2, !dbg !263 + %6526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 3, !dbg !263 + %6527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 4, !dbg !263 + %6528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 5, !dbg !263 + %6529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 6, !dbg !263 + %6530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 7, !dbg !263 + %6531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 8, !dbg !263 + %6532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 9, !dbg !263 + %6533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 10, !dbg !263 + %6534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 11, !dbg !263 + %6535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 12, !dbg !263 + %6536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 13, !dbg !263 + %6537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 14, !dbg !263 + %6538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 15, !dbg !263 + %6539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 16, !dbg !263 + %6540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 17, !dbg !263 + %6541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 18, !dbg !263 + %6542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 19, !dbg !263 + %6543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 20, !dbg !263 + %6544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 21, !dbg !263 + %6545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 22, !dbg !263 + %6546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 23, !dbg !263 + %6547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 24, !dbg !263 + %6548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 25, !dbg !263 + %6549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 26, !dbg !263 + %6550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 27, !dbg !263 + %6551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 28, !dbg !263 + %6552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 29, !dbg !263 + %6553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 30, !dbg !263 + %6554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 31, !dbg !263 + %6555 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6523, float %6524, float %6525, float %6526, float %6527, float %6528, float %6529, float %6530, float %6531, float %6532, float %6533, float %6534, float %6535, float %6536, float %6537, float %6538, float %6539, float %6540, float %6541, float %6542, float %6543, float %6544, float %6545, float %6546, float %6547, float %6548, float %6549, float %6550, float %6551, float %6552, float %6553, float %6554, i64 %6517, i64 %6522, i1 true) #3, !dbg !263 + %6556 = add i32 %5414, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6557 = lshr exact i32 %6556, 4, !dbg !263 + %6558 = and i32 %6557, 16383, !dbg !263 + %6559 = zext nneg i32 %6558 to i64, !dbg !263 + %6560 = or disjoint i64 %6559, 4611686293372403712, !dbg !263 + %6561 = add i32 %6194, 64, !dbg !263 + %6562 = lshr exact i32 %6561, 4, !dbg !263 + %6563 = and i32 %6562, 16383, !dbg !263 + %6564 = zext nneg i32 %6563 to i64, !dbg !263 + %6565 = or disjoint i64 %6564, 4611686293338849280, !dbg !263 + %6566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 0, !dbg !263 + %6567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 1, !dbg !263 + %6568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 2, !dbg !263 + %6569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 3, !dbg !263 + %6570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 4, !dbg !263 + %6571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 5, !dbg !263 + %6572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 6, !dbg !263 + %6573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 7, !dbg !263 + %6574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 8, !dbg !263 + %6575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 9, !dbg !263 + %6576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 10, !dbg !263 + %6577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 11, !dbg !263 + %6578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 12, !dbg !263 + %6579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 13, !dbg !263 + %6580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 14, !dbg !263 + %6581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 15, !dbg !263 + %6582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 16, !dbg !263 + %6583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 17, !dbg !263 + %6584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 18, !dbg !263 + %6585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 19, !dbg !263 + %6586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 20, !dbg !263 + %6587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 21, !dbg !263 + %6588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 22, !dbg !263 + %6589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 23, !dbg !263 + %6590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 24, !dbg !263 + %6591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 25, !dbg !263 + %6592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 26, !dbg !263 + %6593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 27, !dbg !263 + %6594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 28, !dbg !263 + %6595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 29, !dbg !263 + %6596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 30, !dbg !263 + %6597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 31, !dbg !263 + %6598 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6566, float %6567, float %6568, float %6569, float %6570, float %6571, float %6572, float %6573, float %6574, float %6575, float %6576, float %6577, float %6578, float %6579, float %6580, float %6581, float %6582, float %6583, float %6584, float %6585, float %6586, float %6587, float %6588, float %6589, float %6590, float %6591, float %6592, float %6593, float %6594, float %6595, float %6596, float %6597, i64 %6560, i64 %6565, i1 true) #3, !dbg !263 + %6599 = add i32 %5458, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6600 = lshr exact i32 %6599, 4, !dbg !263 + %6601 = and i32 %6600, 16383, !dbg !263 + %6602 = zext nneg i32 %6601 to i64, !dbg !263 + %6603 = or disjoint i64 %6602, 4611686293372403712, !dbg !263 + %6604 = add i32 %6194, 96, !dbg !263 + %6605 = lshr exact i32 %6604, 4, !dbg !263 + %6606 = and i32 %6605, 16383, !dbg !263 + %6607 = zext nneg i32 %6606 to i64, !dbg !263 + %6608 = or disjoint i64 %6607, 4611686293338849280, !dbg !263 + %6609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 0, !dbg !263 + %6610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 1, !dbg !263 + %6611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 2, !dbg !263 + %6612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 3, !dbg !263 + %6613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 4, !dbg !263 + %6614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 5, !dbg !263 + %6615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 6, !dbg !263 + %6616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 7, !dbg !263 + %6617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 8, !dbg !263 + %6618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 9, !dbg !263 + %6619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 10, !dbg !263 + %6620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 11, !dbg !263 + %6621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 12, !dbg !263 + %6622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 13, !dbg !263 + %6623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 14, !dbg !263 + %6624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 15, !dbg !263 + %6625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 16, !dbg !263 + %6626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 17, !dbg !263 + %6627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 18, !dbg !263 + %6628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 19, !dbg !263 + %6629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 20, !dbg !263 + %6630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 21, !dbg !263 + %6631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 22, !dbg !263 + %6632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 23, !dbg !263 + %6633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 24, !dbg !263 + %6634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 25, !dbg !263 + %6635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 26, !dbg !263 + %6636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 27, !dbg !263 + %6637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 28, !dbg !263 + %6638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 29, !dbg !263 + %6639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 30, !dbg !263 + %6640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 31, !dbg !263 + %6641 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6609, float %6610, float %6611, float %6612, float %6613, float %6614, float %6615, float %6616, float %6617, float %6618, float %6619, float %6620, float %6621, float %6622, float %6623, float %6624, float %6625, float %6626, float %6627, float %6628, float %6629, float %6630, float %6631, float %6632, float %6633, float %6634, float %6635, float %6636, float %6637, float %6638, float %6639, float %6640, i64 %6603, i64 %6608, i1 true) #3, !dbg !263 + %6642 = add i32 %5502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6643 = lshr exact i32 %6642, 4, !dbg !263 + %6644 = and i32 %6643, 16383, !dbg !263 + %6645 = zext nneg i32 %6644 to i64, !dbg !263 + %6646 = or disjoint i64 %6645, 4611686293372403712, !dbg !263 + %6647 = add i32 %6194, 8192, !dbg !263 + %6648 = lshr exact i32 %6647, 4, !dbg !263 + %6649 = and i32 %6648, 16383, !dbg !263 + %6650 = zext nneg i32 %6649 to i64, !dbg !263 + %6651 = or disjoint i64 %6650, 4611686293338849280, !dbg !263 + %6652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 0, !dbg !263 + %6653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 1, !dbg !263 + %6654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 2, !dbg !263 + %6655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 3, !dbg !263 + %6656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 4, !dbg !263 + %6657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 5, !dbg !263 + %6658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 6, !dbg !263 + %6659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 7, !dbg !263 + %6660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 8, !dbg !263 + %6661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 9, !dbg !263 + %6662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 10, !dbg !263 + %6663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 11, !dbg !263 + %6664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 12, !dbg !263 + %6665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 13, !dbg !263 + %6666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 14, !dbg !263 + %6667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 15, !dbg !263 + %6668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 16, !dbg !263 + %6669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 17, !dbg !263 + %6670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 18, !dbg !263 + %6671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 19, !dbg !263 + %6672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 20, !dbg !263 + %6673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 21, !dbg !263 + %6674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 22, !dbg !263 + %6675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 23, !dbg !263 + %6676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 24, !dbg !263 + %6677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 25, !dbg !263 + %6678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 26, !dbg !263 + %6679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 27, !dbg !263 + %6680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 28, !dbg !263 + %6681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 29, !dbg !263 + %6682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 30, !dbg !263 + %6683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 31, !dbg !263 + %6684 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6652, float %6653, float %6654, float %6655, float %6656, float %6657, float %6658, float %6659, float %6660, float %6661, float %6662, float %6663, float %6664, float %6665, float %6666, float %6667, float %6668, float %6669, float %6670, float %6671, float %6672, float %6673, float %6674, float %6675, float %6676, float %6677, float %6678, float %6679, float %6680, float %6681, float %6682, float %6683, i64 %6646, i64 %6651, i1 true) #3, !dbg !263 + %6685 = add i32 %5546, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6686 = lshr exact i32 %6685, 4, !dbg !263 + %6687 = and i32 %6686, 16383, !dbg !263 + %6688 = zext nneg i32 %6687 to i64, !dbg !263 + %6689 = or disjoint i64 %6688, 4611686293372403712, !dbg !263 + %6690 = add i32 %6194, 8224, !dbg !263 + %6691 = lshr exact i32 %6690, 4, !dbg !263 + %6692 = and i32 %6691, 16383, !dbg !263 + %6693 = zext nneg i32 %6692 to i64, !dbg !263 + %6694 = or disjoint i64 %6693, 4611686293338849280, !dbg !263 + %6695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 0, !dbg !263 + %6696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 1, !dbg !263 + %6697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 2, !dbg !263 + %6698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 3, !dbg !263 + %6699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 4, !dbg !263 + %6700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 5, !dbg !263 + %6701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 6, !dbg !263 + %6702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 7, !dbg !263 + %6703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 8, !dbg !263 + %6704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 9, !dbg !263 + %6705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 10, !dbg !263 + %6706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 11, !dbg !263 + %6707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 12, !dbg !263 + %6708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 13, !dbg !263 + %6709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 14, !dbg !263 + %6710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 15, !dbg !263 + %6711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 16, !dbg !263 + %6712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 17, !dbg !263 + %6713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 18, !dbg !263 + %6714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 19, !dbg !263 + %6715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 20, !dbg !263 + %6716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 21, !dbg !263 + %6717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 22, !dbg !263 + %6718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 23, !dbg !263 + %6719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 24, !dbg !263 + %6720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 25, !dbg !263 + %6721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 26, !dbg !263 + %6722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 27, !dbg !263 + %6723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 28, !dbg !263 + %6724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 29, !dbg !263 + %6725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 30, !dbg !263 + %6726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 31, !dbg !263 + %6727 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6695, float %6696, float %6697, float %6698, float %6699, float %6700, float %6701, float %6702, float %6703, float %6704, float %6705, float %6706, float %6707, float %6708, float %6709, float %6710, float %6711, float %6712, float %6713, float %6714, float %6715, float %6716, float %6717, float %6718, float %6719, float %6720, float %6721, float %6722, float %6723, float %6724, float %6725, float %6726, i64 %6689, i64 %6694, i1 true) #3, !dbg !263 + %6728 = add i32 %5590, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6729 = lshr exact i32 %6728, 4, !dbg !263 + %6730 = and i32 %6729, 16383, !dbg !263 + %6731 = zext nneg i32 %6730 to i64, !dbg !263 + %6732 = or disjoint i64 %6731, 4611686293372403712, !dbg !263 + %6733 = add i32 %6194, 8256, !dbg !263 + %6734 = lshr exact i32 %6733, 4, !dbg !263 + %6735 = and i32 %6734, 16383, !dbg !263 + %6736 = zext nneg i32 %6735 to i64, !dbg !263 + %6737 = or disjoint i64 %6736, 4611686293338849280, !dbg !263 + %6738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 0, !dbg !263 + %6739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 1, !dbg !263 + %6740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 2, !dbg !263 + %6741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 3, !dbg !263 + %6742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 4, !dbg !263 + %6743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 5, !dbg !263 + %6744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 6, !dbg !263 + %6745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 7, !dbg !263 + %6746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 8, !dbg !263 + %6747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 9, !dbg !263 + %6748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 10, !dbg !263 + %6749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 11, !dbg !263 + %6750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 12, !dbg !263 + %6751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 13, !dbg !263 + %6752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 14, !dbg !263 + %6753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 15, !dbg !263 + %6754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 16, !dbg !263 + %6755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 17, !dbg !263 + %6756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 18, !dbg !263 + %6757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 19, !dbg !263 + %6758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 20, !dbg !263 + %6759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 21, !dbg !263 + %6760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 22, !dbg !263 + %6761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 23, !dbg !263 + %6762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 24, !dbg !263 + %6763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 25, !dbg !263 + %6764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 26, !dbg !263 + %6765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 27, !dbg !263 + %6766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 28, !dbg !263 + %6767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 29, !dbg !263 + %6768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 30, !dbg !263 + %6769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 31, !dbg !263 + %6770 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6738, float %6739, float %6740, float %6741, float %6742, float %6743, float %6744, float %6745, float %6746, float %6747, float %6748, float %6749, float %6750, float %6751, float %6752, float %6753, float %6754, float %6755, float %6756, float %6757, float %6758, float %6759, float %6760, float %6761, float %6762, float %6763, float %6764, float %6765, float %6766, float %6767, float %6768, float %6769, i64 %6732, i64 %6737, i1 true) #3, !dbg !263 + %6771 = add i32 %5634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6772 = lshr exact i32 %6771, 4, !dbg !263 + %6773 = and i32 %6772, 16383, !dbg !263 + %6774 = zext nneg i32 %6773 to i64, !dbg !263 + %6775 = or disjoint i64 %6774, 4611686293372403712, !dbg !263 + %6776 = add i32 %6194, 8288, !dbg !263 + %6777 = lshr exact i32 %6776, 4, !dbg !263 + %6778 = and i32 %6777, 16383, !dbg !263 + %6779 = zext nneg i32 %6778 to i64, !dbg !263 + %6780 = or disjoint i64 %6779, 4611686293338849280, !dbg !263 + %6781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 0, !dbg !263 + %6782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 1, !dbg !263 + %6783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 2, !dbg !263 + %6784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 3, !dbg !263 + %6785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 4, !dbg !263 + %6786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 5, !dbg !263 + %6787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 6, !dbg !263 + %6788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 7, !dbg !263 + %6789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 8, !dbg !263 + %6790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 9, !dbg !263 + %6791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 10, !dbg !263 + %6792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 11, !dbg !263 + %6793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 12, !dbg !263 + %6794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 13, !dbg !263 + %6795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 14, !dbg !263 + %6796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 15, !dbg !263 + %6797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 16, !dbg !263 + %6798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 17, !dbg !263 + %6799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 18, !dbg !263 + %6800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 19, !dbg !263 + %6801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 20, !dbg !263 + %6802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 21, !dbg !263 + %6803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 22, !dbg !263 + %6804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 23, !dbg !263 + %6805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 24, !dbg !263 + %6806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 25, !dbg !263 + %6807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 26, !dbg !263 + %6808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 27, !dbg !263 + %6809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 28, !dbg !263 + %6810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 29, !dbg !263 + %6811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 30, !dbg !263 + %6812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 31, !dbg !263 + %6813 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6781, float %6782, float %6783, float %6784, float %6785, float %6786, float %6787, float %6788, float %6789, float %6790, float %6791, float %6792, float %6793, float %6794, float %6795, float %6796, float %6797, float %6798, float %6799, float %6800, float %6801, float %6802, float %6803, float %6804, float %6805, float %6806, float %6807, float %6808, float %6809, float %6810, float %6811, float %6812, i64 %6775, i64 %6780, i1 true) #3, !dbg !263 + %6814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 0, !dbg !263 + %6815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 1, !dbg !263 + %6816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 2, !dbg !263 + %6817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 3, !dbg !263 + %6818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 4, !dbg !263 + %6819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 5, !dbg !263 + %6820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 6, !dbg !263 + %6821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 7, !dbg !263 + %6822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 8, !dbg !263 + %6823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 9, !dbg !263 + %6824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 10, !dbg !263 + %6825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 11, !dbg !263 + %6826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 12, !dbg !263 + %6827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 13, !dbg !263 + %6828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 14, !dbg !263 + %6829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 15, !dbg !263 + %6830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 16, !dbg !263 + %6831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 17, !dbg !263 + %6832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 18, !dbg !263 + %6833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 19, !dbg !263 + %6834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 20, !dbg !263 + %6835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 21, !dbg !263 + %6836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 22, !dbg !263 + %6837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 23, !dbg !263 + %6838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 24, !dbg !263 + %6839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 25, !dbg !263 + %6840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 26, !dbg !263 + %6841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 27, !dbg !263 + %6842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 28, !dbg !263 + %6843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 29, !dbg !263 + %6844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 30, !dbg !263 + %6845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 31, !dbg !263 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !263 + %6846 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %6814, float %6815, float %6816, float %6817, float %6818, float %6819, float %6820, float %6821, float %6822, float %6823, float %6824, float %6825, float %6826, float %6827, float %6828, float %6829, float %6830, float %6831, float %6832, float %6833, float %6834, float %6835, float %6836, float %6837, float %6838, float %6839, float %6840, float %6841, float %6842, float %6843, float %6844, float %6845, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %6129, i32 0, i32 0) #3, !dbg !263 + %6847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 0, !dbg !263 + %6848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 1, !dbg !263 + %6849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 2, !dbg !263 + %6850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 3, !dbg !263 + %6851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 4, !dbg !263 + %6852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 5, !dbg !263 + %6853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 6, !dbg !263 + %6854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 7, !dbg !263 + %6855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 8, !dbg !263 + %6856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 9, !dbg !263 + %6857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 10, !dbg !263 + %6858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 11, !dbg !263 + %6859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 12, !dbg !263 + %6860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 13, !dbg !263 + %6861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 14, !dbg !263 + %6862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 15, !dbg !263 + %6863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 16, !dbg !263 + %6864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 17, !dbg !263 + %6865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 18, !dbg !263 + %6866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 19, !dbg !263 + %6867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 20, !dbg !263 + %6868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 21, !dbg !263 + %6869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 22, !dbg !263 + %6870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 23, !dbg !263 + %6871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 24, !dbg !263 + %6872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 25, !dbg !263 + %6873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 26, !dbg !263 + %6874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 27, !dbg !263 + %6875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 28, !dbg !263 + %6876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 29, !dbg !263 + %6877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 30, !dbg !263 + %6878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 31, !dbg !263 + %6879 = fsub float %6847, %6476, !dbg !264 + %6880 = fsub float %6848, %6478, !dbg !264 + %6881 = fsub float %6849, %6476, !dbg !264 + %6882 = fsub float %6850, %6478, !dbg !264 + %6883 = fsub float %6851, %6480, !dbg !264 + %6884 = fsub float %6852, %6482, !dbg !264 + %6885 = fsub float %6853, %6480, !dbg !264 + %6886 = fsub float %6854, %6482, !dbg !264 + %6887 = fsub float %6855, %6484, !dbg !264 + %6888 = fsub float %6856, %6486, !dbg !264 + %6889 = fsub float %6857, %6484, !dbg !264 + %6890 = fsub float %6858, %6486, !dbg !264 + %6891 = fsub float %6859, %6488, !dbg !264 + %6892 = fsub float %6860, %6490, !dbg !264 + %6893 = fsub float %6861, %6488, !dbg !264 + %6894 = fsub float %6862, %6490, !dbg !264 + %6895 = fsub float %6863, %6492, !dbg !264 + %6896 = fsub float %6864, %6494, !dbg !264 + %6897 = fsub float %6865, %6492, !dbg !264 + %6898 = fsub float %6866, %6494, !dbg !264 + %6899 = fsub float %6867, %6496, !dbg !264 + %6900 = fsub float %6868, %6498, !dbg !264 + %6901 = fsub float %6869, %6496, !dbg !264 + %6902 = fsub float %6870, %6498, !dbg !264 + %6903 = fsub float %6871, %6500, !dbg !264 + %6904 = fsub float %6872, %6502, !dbg !264 + %6905 = fsub float %6873, %6500, !dbg !264 + %6906 = fsub float %6874, %6502, !dbg !264 + %6907 = fsub float %6875, %6504, !dbg !264 + %6908 = fsub float %6876, %6506, !dbg !264 + %6909 = fsub float %6877, %6504, !dbg !264 + %6910 = fsub float %6878, %6506, !dbg !264 + %6911 = fmul float %.0.i1221, %6879, !dbg !265 + %6912 = fmul float %.0.i1224, %6880, !dbg !265 + %6913 = fmul float %.0.i1227, %6881, !dbg !265 + %6914 = fmul float %.0.i1230, %6882, !dbg !265 + %6915 = fmul float %.0.i1233, %6883, !dbg !265 + %6916 = fmul float %.0.i1236, %6884, !dbg !265 + %6917 = fmul float %.0.i1239, %6885, !dbg !265 + %6918 = fmul float %.0.i1242, %6886, !dbg !265 + %6919 = fmul float %.0.i1245, %6887, !dbg !265 + %6920 = fmul float %.0.i1248, %6888, !dbg !265 + %6921 = fmul float %.0.i1251, %6889, !dbg !265 + %6922 = fmul float %.0.i1254, %6890, !dbg !265 + %6923 = fmul float %.0.i1257, %6891, !dbg !265 + %6924 = fmul float %.0.i1260, %6892, !dbg !265 + %6925 = fmul float %.0.i1263, %6893, !dbg !265 + %6926 = fmul float %.0.i1266, %6894, !dbg !265 + %6927 = fmul float %.0.i1269, %6895, !dbg !265 + %6928 = fmul float %.0.i1272, %6896, !dbg !265 + %6929 = fmul float %.0.i1275, %6897, !dbg !265 + %6930 = fmul float %.0.i1278, %6898, !dbg !265 + %6931 = fmul float %.0.i1281, %6899, !dbg !265 + %6932 = fmul float %.0.i1284, %6900, !dbg !265 + %6933 = fmul float %.0.i1287, %6901, !dbg !265 + %6934 = fmul float %.0.i1290, %6902, !dbg !265 + %6935 = fmul float %.0.i1293, %6903, !dbg !265 + %6936 = fmul float %.0.i1296, %6904, !dbg !265 + %6937 = fmul float %.0.i1299, %6905, !dbg !265 + %6938 = fmul float %.0.i1302, %6906, !dbg !265 + %6939 = fmul float %.0.i1305, %6907, !dbg !265 + %6940 = fmul float %.0.i1308, %6908, !dbg !265 + %6941 = fmul float %.0.i1311, %6909, !dbg !265 + %6942 = fmul float %.0.i1314, %6910, !dbg !265 + %6943 = fptrunc float %6911 to bfloat, !dbg !266 + %6944 = select i1 %5826, bfloat %6943, bfloat 0xR0000, !dbg !267 + %6945 = fptrunc float %6912 to bfloat, !dbg !266 + %6946 = select i1 %5828, bfloat %6945, bfloat 0xR0000, !dbg !267 + %6947 = fptrunc float %6913 to bfloat, !dbg !266 + %6948 = select i1 %5829, bfloat %6947, bfloat 0xR0000, !dbg !267 + %6949 = fptrunc float %6914 to bfloat, !dbg !266 + %6950 = select i1 %5830, bfloat %6949, bfloat 0xR0000, !dbg !267 + %6951 = fptrunc float %6915 to bfloat, !dbg !266 + %6952 = select i1 %5832, bfloat %6951, bfloat 0xR0000, !dbg !267 + %6953 = fptrunc float %6916 to bfloat, !dbg !266 + %6954 = select i1 %5834, bfloat %6953, bfloat 0xR0000, !dbg !267 + %6955 = fptrunc float %6917 to bfloat, !dbg !266 + %6956 = select i1 %5835, bfloat %6955, bfloat 0xR0000, !dbg !267 + %6957 = fptrunc float %6918 to bfloat, !dbg !266 + %6958 = select i1 %5836, bfloat %6957, bfloat 0xR0000, !dbg !267 + %6959 = fptrunc float %6919 to bfloat, !dbg !266 + %6960 = select i1 %5838, bfloat %6959, bfloat 0xR0000, !dbg !267 + %6961 = fptrunc float %6920 to bfloat, !dbg !266 + %6962 = select i1 %5840, bfloat %6961, bfloat 0xR0000, !dbg !267 + %6963 = fptrunc float %6921 to bfloat, !dbg !266 + %6964 = select i1 %5841, bfloat %6963, bfloat 0xR0000, !dbg !267 + %6965 = fptrunc float %6922 to bfloat, !dbg !266 + %6966 = select i1 %5842, bfloat %6965, bfloat 0xR0000, !dbg !267 + %6967 = fptrunc float %6923 to bfloat, !dbg !266 + %6968 = select i1 %5844, bfloat %6967, bfloat 0xR0000, !dbg !267 + %6969 = fptrunc float %6924 to bfloat, !dbg !266 + %6970 = select i1 %5846, bfloat %6969, bfloat 0xR0000, !dbg !267 + %6971 = fptrunc float %6925 to bfloat, !dbg !266 + %6972 = select i1 %5847, bfloat %6971, bfloat 0xR0000, !dbg !267 + %6973 = fptrunc float %6926 to bfloat, !dbg !266 + %6974 = select i1 %5848, bfloat %6973, bfloat 0xR0000, !dbg !267 + %6975 = fptrunc float %6927 to bfloat, !dbg !266 + %6976 = select i1 %5850, bfloat %6975, bfloat 0xR0000, !dbg !267 + %6977 = fptrunc float %6928 to bfloat, !dbg !266 + %6978 = select i1 %5852, bfloat %6977, bfloat 0xR0000, !dbg !267 + %6979 = fptrunc float %6929 to bfloat, !dbg !266 + %6980 = select i1 %5853, bfloat %6979, bfloat 0xR0000, !dbg !267 + %6981 = fptrunc float %6930 to bfloat, !dbg !266 + %6982 = select i1 %5854, bfloat %6981, bfloat 0xR0000, !dbg !267 + %6983 = fptrunc float %6931 to bfloat, !dbg !266 + %6984 = select i1 %5856, bfloat %6983, bfloat 0xR0000, !dbg !267 + %6985 = fptrunc float %6932 to bfloat, !dbg !266 + %6986 = select i1 %5858, bfloat %6985, bfloat 0xR0000, !dbg !267 + %6987 = fptrunc float %6933 to bfloat, !dbg !266 + %6988 = select i1 %5859, bfloat %6987, bfloat 0xR0000, !dbg !267 + %6989 = fptrunc float %6934 to bfloat, !dbg !266 + %6990 = select i1 %5860, bfloat %6989, bfloat 0xR0000, !dbg !267 + %6991 = fptrunc float %6935 to bfloat, !dbg !266 + %6992 = select i1 %5862, bfloat %6991, bfloat 0xR0000, !dbg !267 + %6993 = fptrunc float %6936 to bfloat, !dbg !266 + %6994 = select i1 %5864, bfloat %6993, bfloat 0xR0000, !dbg !267 + %6995 = fptrunc float %6937 to bfloat, !dbg !266 + %6996 = select i1 %5865, bfloat %6995, bfloat 0xR0000, !dbg !267 + %6997 = fptrunc float %6938 to bfloat, !dbg !266 + %6998 = select i1 %5866, bfloat %6997, bfloat 0xR0000, !dbg !267 + %6999 = fptrunc float %6939 to bfloat, !dbg !266 + %7000 = select i1 %5868, bfloat %6999, bfloat 0xR0000, !dbg !267 + %7001 = fptrunc float %6940 to bfloat, !dbg !266 + %7002 = select i1 %5870, bfloat %7001, bfloat 0xR0000, !dbg !267 + %7003 = fptrunc float %6941 to bfloat, !dbg !266 + %7004 = select i1 %5871, bfloat %7003, bfloat 0xR0000, !dbg !267 + %7005 = fptrunc float %6942 to bfloat, !dbg !266 + %7006 = select i1 %5872, bfloat %7005, bfloat 0xR0000, !dbg !267 + %7007 = insertelement <2 x bfloat> poison, bfloat %6944, i64 0, !dbg !268 + %7008 = insertelement <2 x bfloat> %7007, bfloat %6946, i64 1, !dbg !268 + %7009 = bitcast <2 x bfloat> %7008 to i32, !dbg !268 + %7010 = insertelement <2 x bfloat> poison, bfloat %6948, i64 0, !dbg !268 + %7011 = insertelement <2 x bfloat> %7010, bfloat %6950, i64 1, !dbg !268 + %7012 = bitcast <2 x bfloat> %7011 to i32, !dbg !268 + %7013 = insertelement <2 x bfloat> poison, bfloat %6952, i64 0, !dbg !268 + %7014 = insertelement <2 x bfloat> %7013, bfloat %6954, i64 1, !dbg !268 + %7015 = bitcast <2 x bfloat> %7014 to i32, !dbg !268 + %7016 = insertelement <2 x bfloat> poison, bfloat %6956, i64 0, !dbg !268 + %7017 = insertelement <2 x bfloat> %7016, bfloat %6958, i64 1, !dbg !268 + %7018 = bitcast <2 x bfloat> %7017 to i32, !dbg !268 + %7019 = insertelement <2 x bfloat> poison, bfloat %6960, i64 0, !dbg !268 + %7020 = insertelement <2 x bfloat> %7019, bfloat %6962, i64 1, !dbg !268 + %7021 = bitcast <2 x bfloat> %7020 to i32, !dbg !268 + %7022 = insertelement <2 x bfloat> poison, bfloat %6964, i64 0, !dbg !268 + %7023 = insertelement <2 x bfloat> %7022, bfloat %6966, i64 1, !dbg !268 + %7024 = bitcast <2 x bfloat> %7023 to i32, !dbg !268 + %7025 = insertelement <2 x bfloat> poison, bfloat %6968, i64 0, !dbg !268 + %7026 = insertelement <2 x bfloat> %7025, bfloat %6970, i64 1, !dbg !268 + %7027 = bitcast <2 x bfloat> %7026 to i32, !dbg !268 + %7028 = insertelement <2 x bfloat> poison, bfloat %6972, i64 0, !dbg !268 + %7029 = insertelement <2 x bfloat> %7028, bfloat %6974, i64 1, !dbg !268 + %7030 = bitcast <2 x bfloat> %7029 to i32, !dbg !268 + %7031 = insertelement <2 x bfloat> poison, bfloat %6976, i64 0, !dbg !268 + %7032 = insertelement <2 x bfloat> %7031, bfloat %6978, i64 1, !dbg !268 + %7033 = bitcast <2 x bfloat> %7032 to i32, !dbg !268 + %7034 = insertelement <2 x bfloat> poison, bfloat %6980, i64 0, !dbg !268 + %7035 = insertelement <2 x bfloat> %7034, bfloat %6982, i64 1, !dbg !268 + %7036 = bitcast <2 x bfloat> %7035 to i32, !dbg !268 + %7037 = insertelement <2 x bfloat> poison, bfloat %6984, i64 0, !dbg !268 + %7038 = insertelement <2 x bfloat> %7037, bfloat %6986, i64 1, !dbg !268 + %7039 = bitcast <2 x bfloat> %7038 to i32, !dbg !268 + %7040 = insertelement <2 x bfloat> poison, bfloat %6988, i64 0, !dbg !268 + %7041 = insertelement <2 x bfloat> %7040, bfloat %6990, i64 1, !dbg !268 + %7042 = bitcast <2 x bfloat> %7041 to i32, !dbg !268 + %7043 = insertelement <2 x bfloat> poison, bfloat %6992, i64 0, !dbg !268 + %7044 = insertelement <2 x bfloat> %7043, bfloat %6994, i64 1, !dbg !268 + %7045 = bitcast <2 x bfloat> %7044 to i32, !dbg !268 + %7046 = insertelement <2 x bfloat> poison, bfloat %6996, i64 0, !dbg !268 + %7047 = insertelement <2 x bfloat> %7046, bfloat %6998, i64 1, !dbg !268 + %7048 = bitcast <2 x bfloat> %7047 to i32, !dbg !268 + %7049 = insertelement <2 x bfloat> poison, bfloat %7000, i64 0, !dbg !268 + %7050 = insertelement <2 x bfloat> %7049, bfloat %7002, i64 1, !dbg !268 + %7051 = bitcast <2 x bfloat> %7050 to i32, !dbg !268 + %7052 = insertelement <2 x bfloat> poison, bfloat %7004, i64 0, !dbg !268 + %7053 = insertelement <2 x bfloat> %7052, bfloat %7006, i64 1, !dbg !268 + %7054 = bitcast <2 x bfloat> %7053 to i32, !dbg !268 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !268 + %7055 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5214, float %5215, float %5216, float %5217, float %5218, float %5219, float %5220, float %5221, float %5222, float %5223, float %5224, float %5225, float %5226, float %5227, float %5228, float %5229, float %5230, float %5231, float %5232, float %5233, float %5234, float %5235, float %5236, float %5237, float %5238, float %5239, float %5240, float %5241, float %5242, float %5243, float %5244, float %5245, float %5246, float %5247, float %5248, float %5249, float %5250, float %5251, float %5252, float %5253, float %5254, float %5255, float %5256, float %5257, float %5258, float %5259, float %5260, float %5261, float %5262, float %5263, float %5264, float %5265, float %5266, float %5267, float %5268, float %5269, float %5270, float %5271, float %5272, float %5273, float %5274, float %5275, float %5276, float %5277, i32 %7009, i32 %7012, i32 %7015, i32 %7018, i64 %5368, i1 true) #3, !dbg !268 + %7056 = add i32 %5364, 2048, !dbg !268 + %7057 = lshr exact i32 %7056, 4, !dbg !268 + %7058 = and i32 %7057, 16383, !dbg !268 + %7059 = zext nneg i32 %7058 to i64, !dbg !268 + %7060 = or disjoint i64 %7059, 4611686293338849280, !dbg !268 + %7061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 0, !dbg !268 + %7062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 1, !dbg !268 + %7063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 2, !dbg !268 + %7064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 3, !dbg !268 + %7065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 4, !dbg !268 + %7066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 5, !dbg !268 + %7067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 6, !dbg !268 + %7068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 7, !dbg !268 + %7069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 8, !dbg !268 + %7070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 9, !dbg !268 + %7071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 10, !dbg !268 + %7072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 11, !dbg !268 + %7073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 12, !dbg !268 + %7074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 13, !dbg !268 + %7075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 14, !dbg !268 + %7076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 15, !dbg !268 + %7077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 16, !dbg !268 + %7078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 17, !dbg !268 + %7079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 18, !dbg !268 + %7080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 19, !dbg !268 + %7081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 20, !dbg !268 + %7082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 21, !dbg !268 + %7083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 22, !dbg !268 + %7084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 23, !dbg !268 + %7085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 24, !dbg !268 + %7086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 25, !dbg !268 + %7087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 26, !dbg !268 + %7088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 27, !dbg !268 + %7089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 28, !dbg !268 + %7090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 29, !dbg !268 + %7091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 30, !dbg !268 + %7092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 31, !dbg !268 + %7093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 32, !dbg !268 + %7094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 33, !dbg !268 + %7095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 34, !dbg !268 + %7096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 35, !dbg !268 + %7097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 36, !dbg !268 + %7098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 37, !dbg !268 + %7099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 38, !dbg !268 + %7100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 39, !dbg !268 + %7101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 40, !dbg !268 + %7102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 41, !dbg !268 + %7103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 42, !dbg !268 + %7104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 43, !dbg !268 + %7105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 44, !dbg !268 + %7106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 45, !dbg !268 + %7107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 46, !dbg !268 + %7108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 47, !dbg !268 + %7109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 48, !dbg !268 + %7110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 49, !dbg !268 + %7111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 50, !dbg !268 + %7112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 51, !dbg !268 + %7113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 52, !dbg !268 + %7114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 53, !dbg !268 + %7115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 54, !dbg !268 + %7116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 55, !dbg !268 + %7117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 56, !dbg !268 + %7118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 57, !dbg !268 + %7119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 58, !dbg !268 + %7120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 59, !dbg !268 + %7121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 60, !dbg !268 + %7122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 61, !dbg !268 + %7123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 62, !dbg !268 + %7124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 63, !dbg !268 + %7125 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7061, float %7062, float %7063, float %7064, float %7065, float %7066, float %7067, float %7068, float %7069, float %7070, float %7071, float %7072, float %7073, float %7074, float %7075, float %7076, float %7077, float %7078, float %7079, float %7080, float %7081, float %7082, float %7083, float %7084, float %7085, float %7086, float %7087, float %7088, float %7089, float %7090, float %7091, float %7092, float %7093, float %7094, float %7095, float %7096, float %7097, float %7098, float %7099, float %7100, float %7101, float %7102, float %7103, float %7104, float %7105, float %7106, float %7107, float %7108, float %7109, float %7110, float %7111, float %7112, float %7113, float %7114, float %7115, float %7116, float %7117, float %7118, float %7119, float %7120, float %7121, float %7122, float %7123, float %7124, i32 %7021, i32 %7024, i32 %7027, i32 %7030, i64 %7060, i1 true) #3, !dbg !268 + %7126 = add i32 %5364, 4096, !dbg !268 + %7127 = lshr exact i32 %7126, 4, !dbg !268 + %7128 = and i32 %7127, 16383, !dbg !268 + %7129 = zext nneg i32 %7128 to i64, !dbg !268 + %7130 = or disjoint i64 %7129, 4611686293338849280, !dbg !268 + %7131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 0, !dbg !268 + %7132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 1, !dbg !268 + %7133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 2, !dbg !268 + %7134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 3, !dbg !268 + %7135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 4, !dbg !268 + %7136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 5, !dbg !268 + %7137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 6, !dbg !268 + %7138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 7, !dbg !268 + %7139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 8, !dbg !268 + %7140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 9, !dbg !268 + %7141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 10, !dbg !268 + %7142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 11, !dbg !268 + %7143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 12, !dbg !268 + %7144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 13, !dbg !268 + %7145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 14, !dbg !268 + %7146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 15, !dbg !268 + %7147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 16, !dbg !268 + %7148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 17, !dbg !268 + %7149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 18, !dbg !268 + %7150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 19, !dbg !268 + %7151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 20, !dbg !268 + %7152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 21, !dbg !268 + %7153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 22, !dbg !268 + %7154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 23, !dbg !268 + %7155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 24, !dbg !268 + %7156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 25, !dbg !268 + %7157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 26, !dbg !268 + %7158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 27, !dbg !268 + %7159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 28, !dbg !268 + %7160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 29, !dbg !268 + %7161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 30, !dbg !268 + %7162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 31, !dbg !268 + %7163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 32, !dbg !268 + %7164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 33, !dbg !268 + %7165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 34, !dbg !268 + %7166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 35, !dbg !268 + %7167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 36, !dbg !268 + %7168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 37, !dbg !268 + %7169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 38, !dbg !268 + %7170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 39, !dbg !268 + %7171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 40, !dbg !268 + %7172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 41, !dbg !268 + %7173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 42, !dbg !268 + %7174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 43, !dbg !268 + %7175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 44, !dbg !268 + %7176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 45, !dbg !268 + %7177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 46, !dbg !268 + %7178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 47, !dbg !268 + %7179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 48, !dbg !268 + %7180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 49, !dbg !268 + %7181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 50, !dbg !268 + %7182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 51, !dbg !268 + %7183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 52, !dbg !268 + %7184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 53, !dbg !268 + %7185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 54, !dbg !268 + %7186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 55, !dbg !268 + %7187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 56, !dbg !268 + %7188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 57, !dbg !268 + %7189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 58, !dbg !268 + %7190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 59, !dbg !268 + %7191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 60, !dbg !268 + %7192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 61, !dbg !268 + %7193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 62, !dbg !268 + %7194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 63, !dbg !268 + %7195 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7131, float %7132, float %7133, float %7134, float %7135, float %7136, float %7137, float %7138, float %7139, float %7140, float %7141, float %7142, float %7143, float %7144, float %7145, float %7146, float %7147, float %7148, float %7149, float %7150, float %7151, float %7152, float %7153, float %7154, float %7155, float %7156, float %7157, float %7158, float %7159, float %7160, float %7161, float %7162, float %7163, float %7164, float %7165, float %7166, float %7167, float %7168, float %7169, float %7170, float %7171, float %7172, float %7173, float %7174, float %7175, float %7176, float %7177, float %7178, float %7179, float %7180, float %7181, float %7182, float %7183, float %7184, float %7185, float %7186, float %7187, float %7188, float %7189, float %7190, float %7191, float %7192, float %7193, float %7194, i32 %7033, i32 %7036, i32 %7039, i32 %7042, i64 %7130, i1 true) #3, !dbg !268 + %7196 = add i32 %5364, 6144, !dbg !268 + %7197 = lshr exact i32 %7196, 4, !dbg !268 + %7198 = and i32 %7197, 16383, !dbg !268 + %7199 = zext nneg i32 %7198 to i64, !dbg !268 + %7200 = or disjoint i64 %7199, 4611686293338849280, !dbg !268 + %7201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 0, !dbg !268 + %7202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 1, !dbg !268 + %7203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 2, !dbg !268 + %7204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 3, !dbg !268 + %7205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 4, !dbg !268 + %7206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 5, !dbg !268 + %7207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 6, !dbg !268 + %7208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 7, !dbg !268 + %7209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 8, !dbg !268 + %7210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 9, !dbg !268 + %7211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 10, !dbg !268 + %7212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 11, !dbg !268 + %7213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 12, !dbg !268 + %7214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 13, !dbg !268 + %7215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 14, !dbg !268 + %7216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 15, !dbg !268 + %7217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 16, !dbg !268 + %7218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 17, !dbg !268 + %7219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 18, !dbg !268 + %7220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 19, !dbg !268 + %7221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 20, !dbg !268 + %7222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 21, !dbg !268 + %7223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 22, !dbg !268 + %7224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 23, !dbg !268 + %7225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 24, !dbg !268 + %7226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 25, !dbg !268 + %7227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 26, !dbg !268 + %7228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 27, !dbg !268 + %7229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 28, !dbg !268 + %7230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 29, !dbg !268 + %7231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 30, !dbg !268 + %7232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 31, !dbg !268 + %7233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 32, !dbg !268 + %7234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 33, !dbg !268 + %7235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 34, !dbg !268 + %7236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 35, !dbg !268 + %7237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 36, !dbg !268 + %7238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 37, !dbg !268 + %7239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 38, !dbg !268 + %7240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 39, !dbg !268 + %7241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 40, !dbg !268 + %7242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 41, !dbg !268 + %7243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 42, !dbg !268 + %7244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 43, !dbg !268 + %7245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 44, !dbg !268 + %7246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 45, !dbg !268 + %7247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 46, !dbg !268 + %7248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 47, !dbg !268 + %7249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 48, !dbg !268 + %7250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 49, !dbg !268 + %7251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 50, !dbg !268 + %7252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 51, !dbg !268 + %7253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 52, !dbg !268 + %7254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 53, !dbg !268 + %7255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 54, !dbg !268 + %7256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 55, !dbg !268 + %7257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 56, !dbg !268 + %7258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 57, !dbg !268 + %7259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 58, !dbg !268 + %7260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 59, !dbg !268 + %7261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 60, !dbg !268 + %7262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 61, !dbg !268 + %7263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 62, !dbg !268 + %7264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 63, !dbg !268 + %7265 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7201, float %7202, float %7203, float %7204, float %7205, float %7206, float %7207, float %7208, float %7209, float %7210, float %7211, float %7212, float %7213, float %7214, float %7215, float %7216, float %7217, float %7218, float %7219, float %7220, float %7221, float %7222, float %7223, float %7224, float %7225, float %7226, float %7227, float %7228, float %7229, float %7230, float %7231, float %7232, float %7233, float %7234, float %7235, float %7236, float %7237, float %7238, float %7239, float %7240, float %7241, float %7242, float %7243, float %7244, float %7245, float %7246, float %7247, float %7248, float %7249, float %7250, float %7251, float %7252, float %7253, float %7254, float %7255, float %7256, float %7257, float %7258, float %7259, float %7260, float %7261, float %7262, float %7263, float %7264, i32 %7045, i32 %7048, i32 %7051, i32 %7054, i64 %7200, i1 true) #3, !dbg !268 + %7266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 0, !dbg !268 + %7267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 1, !dbg !268 + %7268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 2, !dbg !268 + %7269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 3, !dbg !268 + %7270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 4, !dbg !268 + %7271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 5, !dbg !268 + %7272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 6, !dbg !268 + %7273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 7, !dbg !268 + %7274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 8, !dbg !268 + %7275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 9, !dbg !268 + %7276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 10, !dbg !268 + %7277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 11, !dbg !268 + %7278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 12, !dbg !268 + %7279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 13, !dbg !268 + %7280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 14, !dbg !268 + %7281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 15, !dbg !268 + %7282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 16, !dbg !268 + %7283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 17, !dbg !268 + %7284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 18, !dbg !268 + %7285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 19, !dbg !268 + %7286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 20, !dbg !268 + %7287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 21, !dbg !268 + %7288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 22, !dbg !268 + %7289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 23, !dbg !268 + %7290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 24, !dbg !268 + %7291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 25, !dbg !268 + %7292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 26, !dbg !268 + %7293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 27, !dbg !268 + %7294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 28, !dbg !268 + %7295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 29, !dbg !268 + %7296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 30, !dbg !268 + %7297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 31, !dbg !268 + %7298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 32, !dbg !268 + %7299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 33, !dbg !268 + %7300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 34, !dbg !268 + %7301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 35, !dbg !268 + %7302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 36, !dbg !268 + %7303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 37, !dbg !268 + %7304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 38, !dbg !268 + %7305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 39, !dbg !268 + %7306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 40, !dbg !268 + %7307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 41, !dbg !268 + %7308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 42, !dbg !268 + %7309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 43, !dbg !268 + %7310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 44, !dbg !268 + %7311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 45, !dbg !268 + %7312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 46, !dbg !268 + %7313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 47, !dbg !268 + %7314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 48, !dbg !268 + %7315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 49, !dbg !268 + %7316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 50, !dbg !268 + %7317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 51, !dbg !268 + %7318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 52, !dbg !268 + %7319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 53, !dbg !268 + %7320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 54, !dbg !268 + %7321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 55, !dbg !268 + %7322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 56, !dbg !268 + %7323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 57, !dbg !268 + %7324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 58, !dbg !268 + %7325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 59, !dbg !268 + %7326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 60, !dbg !268 + %7327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 61, !dbg !268 + %7328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 62, !dbg !268 + %7329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 63, !dbg !268 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !268 + %7330 = insertelement <16 x i32> poison, i32 %5145, i64 0, !dbg !269 + %7331 = shufflevector <16 x i32> %7330, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !269 + %7332 = add <16 x i32> %5279, %7331, !dbg !269 + %7333 = add nuw nsw i32 %5278, 1, !dbg !213 + %7334 = lshr i32 %7333, 1, !dbg !270 + %7335 = zext nneg i32 %7334 to i64, !dbg !271 + %7336 = getelementptr i32, ptr addrspace(1) %4707, i64 %7335, !dbg !271 + %7337 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !272 + %7338 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7336, i64 %7337, i1 %5281) #3, !dbg !272 + %7339 = add nuw nsw i32 %7334, 1, !dbg !273 + %7340 = icmp slt i32 %7339, %4712, !dbg !274 + %7341 = getelementptr i8, ptr addrspace(1) %7336, i64 4, !dbg !275 + %7342 = and i1 %5281, %7340, !dbg !213 + %7343 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !276 + %7344 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7341, i64 %7343, i1 %7342) #3, !dbg !276 + %7345 = and i32 %5278, 1, !dbg !277 + %7346 = sub i32 %7344, %7338, !dbg !278 + %7347 = shl i32 %7346, 7, !dbg !279 + %7348 = add i32 %7347, -64, !dbg !280 + %7349 = xor i32 %7345, 1, !dbg !281 + %7350 = mul nuw nsw i32 %7348, %7349, !dbg !281 + %7351 = shl nuw nsw i32 %7345, 6, !dbg !282 + %7352 = add i32 %7350, %7351, !dbg !283 + %7353 = shl i32 %7352, 12, !dbg !284 + %7354 = sext i32 %7353 to i64, !dbg !249 + %7355 = getelementptr bfloat, ptr addrspace(1) %.pn1201513, i64 %7354, !dbg !249 + %7356 = getelementptr bfloat, ptr addrspace(1) %.pn1041514, i64 %7354, !dbg !249 + %7357 = getelementptr bfloat, ptr addrspace(1) %.pn881515, i64 %7354, !dbg !249 + %7358 = getelementptr bfloat, ptr addrspace(1) %.pn721516, i64 %7354, !dbg !249 + %7359 = shl i32 %7352, 7, !dbg !285 + %7360 = sext i32 %7359 to i64, !dbg !250 + %7361 = getelementptr bfloat, ptr addrspace(1) %.pn1841517, i64 %7360, !dbg !250 + %7362 = getelementptr bfloat, ptr addrspace(1) %.pn1681518, i64 %7360, !dbg !250 + %7363 = getelementptr bfloat, ptr addrspace(1) %.pn1521519, i64 %7360, !dbg !250 + %7364 = getelementptr bfloat, ptr addrspace(1) %.pn1361520, i64 %7360, !dbg !250 + %7365 = add i32 %7352, %.pn2161521, !dbg !269 + %7366 = add i32 %7352, %.pn2121522, !dbg !269 + %7367 = add i32 %7352, %.pn2081523, !dbg !269 + %7368 = add i32 %7352, %.pn2041524, !dbg !269 + %7369 = add i32 %7352, %.pn2001525, !dbg !269 + %7370 = add i32 %7352, %.pn1961526, !dbg !269 + %7371 = add i32 %7352, %.pn1921527, !dbg !269 + %7372 = add i32 %7352, %.pn1881528, !dbg !269 + %7373 = add i32 %5147, 1, !dbg !213 + %7374 = icmp sgt i32 %7373, 1, !dbg !213 + %7375 = select i1 %7374, i32 0, i32 %7373, !dbg !213 + %7376 = add i32 %5149, 1, !dbg !213 + %7377 = icmp sgt i32 %7376, 2, !dbg !213 + %7378 = select i1 %7377, i32 0, i32 %7376, !dbg !213 + %7379 = shl i32 %7378, 13, !dbg !244 + %7380 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %7379, !dbg !244 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + %7381 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4793, !dbg !244 + %7382 = select i1 %5280, i32 16, i32 0, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %7381, ptr addrspace(1) %7355, i32 %7382) #3, !dbg !244 + %7383 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4796, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7383, ptr addrspace(1) %7356, i32 %7382) #3, !dbg !244 + %7384 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4798, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7384, ptr addrspace(1) %7357, i32 %7382) #3, !dbg !244 + %7385 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4800, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7385, ptr addrspace(1) %7358, i32 %7382) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %7386 = sext i32 %7365 to i64, !dbg !245 + %7387 = getelementptr float, ptr addrspace(1) %5087, i64 %7386, !dbg !245 + %7388 = sext i32 %7366 to i64, !dbg !245 + %7389 = getelementptr float, ptr addrspace(1) %5087, i64 %7388, !dbg !245 + %7390 = sext i32 %7367 to i64, !dbg !245 + %7391 = getelementptr float, ptr addrspace(1) %5087, i64 %7390, !dbg !245 + %7392 = sext i32 %7368 to i64, !dbg !245 + %7393 = getelementptr float, ptr addrspace(1) %5087, i64 %7392, !dbg !245 + %7394 = sext i32 %7369 to i64, !dbg !245 + %7395 = getelementptr float, ptr addrspace(1) %5087, i64 %7394, !dbg !245 + %7396 = sext i32 %7370 to i64, !dbg !245 + %7397 = getelementptr float, ptr addrspace(1) %5087, i64 %7396, !dbg !245 + %7398 = sext i32 %7371 to i64, !dbg !245 + %7399 = getelementptr float, ptr addrspace(1) %5087, i64 %7398, !dbg !245 + %7400 = sext i32 %7372 to i64, !dbg !245 + %7401 = getelementptr float, ptr addrspace(1) %5087, i64 %7400, !dbg !245 + %7402 = shl i32 %7375, 6, !dbg !246 + %7403 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %7402, !dbg !246 + %7404 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4812, !dbg !246 + %7405 = select i1 %5280, i32 8, i32 0, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %7404, ptr addrspace(1) %7387, i32 %7405, i1 %4811) #3, !dbg !246 + %7406 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4815, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7406, ptr addrspace(1) %7389, i32 %7405, i1 %4811) #3, !dbg !246 + %7407 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4817, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7407, ptr addrspace(1) %7391, i32 %7405, i1 %4811) #3, !dbg !246 + %7408 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4819, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7408, ptr addrspace(1) %7393, i32 %7405, i1 %4811) #3, !dbg !246 + %7409 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4821, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7409, ptr addrspace(1) %7395, i32 %7405, i1 %4811) #3, !dbg !246 + %7410 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4823, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7410, ptr addrspace(1) %7397, i32 %7405, i1 %4811) #3, !dbg !246 + %7411 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4825, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7411, ptr addrspace(1) %7399, i32 %7405, i1 %4811) #3, !dbg !246 + %7412 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4827, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7412, ptr addrspace(1) %7401, i32 %7405, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + %7413 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %7379, !dbg !244 + %7414 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4793, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %7414, ptr addrspace(1) %7361, i32 %7382) #3, !dbg !244 + %7415 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4796, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7415, ptr addrspace(1) %7362, i32 %7382) #3, !dbg !244 + %7416 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4798, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7416, ptr addrspace(1) %7363, i32 %7382) #3, !dbg !244 + %7417 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4800, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7417, ptr addrspace(1) %7364, i32 %7382) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %7418 = getelementptr float, ptr addrspace(1) %5088, i64 %7386, !dbg !247 + %7419 = getelementptr float, ptr addrspace(1) %5088, i64 %7388, !dbg !247 + %7420 = getelementptr float, ptr addrspace(1) %5088, i64 %7390, !dbg !247 + %7421 = getelementptr float, ptr addrspace(1) %5088, i64 %7392, !dbg !247 + %7422 = getelementptr float, ptr addrspace(1) %5088, i64 %7394, !dbg !247 + %7423 = getelementptr float, ptr addrspace(1) %5088, i64 %7396, !dbg !247 + %7424 = getelementptr float, ptr addrspace(1) %5088, i64 %7398, !dbg !247 + %7425 = getelementptr float, ptr addrspace(1) %5088, i64 %7400, !dbg !247 + %7426 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %7402, !dbg !248 + %7427 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4812, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %7427, ptr addrspace(1) %7418, i32 %7405, i1 %4811) #3, !dbg !248 + %7428 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4815, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7428, ptr addrspace(1) %7419, i32 %7405, i1 %4811) #3, !dbg !248 + %7429 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4817, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7429, ptr addrspace(1) %7420, i32 %7405, i1 %4811) #3, !dbg !248 + %7430 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4819, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7430, ptr addrspace(1) %7421, i32 %7405, i1 %4811) #3, !dbg !248 + %7431 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4821, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7431, ptr addrspace(1) %7422, i32 %7405, i1 %4811) #3, !dbg !248 + %7432 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4823, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7432, ptr addrspace(1) %7423, i32 %7405, i1 %4811) #3, !dbg !248 + %7433 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4825, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7433, ptr addrspace(1) %7424, i32 %7405, i1 %4811) #3, !dbg !248 + %7434 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4827, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7434, ptr addrspace(1) %7425, i32 %7405, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + %exitcond.not = icmp eq i32 %7333, %smax, !dbg !213 + br i1 %exitcond.not, label %._crit_edge, label %.lr.ph, !dbg !213 + +._crit_edge: ; preds = %__nv_exp2f.exit1315, %4945 + %7435 = phi float [ %4946, %4945 ], [ %7266, %__nv_exp2f.exit1315 ] + %7436 = phi float [ %4947, %4945 ], [ %7267, %__nv_exp2f.exit1315 ] + %7437 = phi float [ %4948, %4945 ], [ %7268, %__nv_exp2f.exit1315 ] + %7438 = phi float [ %4949, %4945 ], [ %7269, %__nv_exp2f.exit1315 ] + %7439 = phi float [ %4950, %4945 ], [ %7270, %__nv_exp2f.exit1315 ] + %7440 = phi float [ %4951, %4945 ], [ %7271, %__nv_exp2f.exit1315 ] + %7441 = phi float [ %4952, %4945 ], [ %7272, %__nv_exp2f.exit1315 ] + %7442 = phi float [ %4953, %4945 ], [ %7273, %__nv_exp2f.exit1315 ] + %7443 = phi float [ %4954, %4945 ], [ %7274, %__nv_exp2f.exit1315 ] + %7444 = phi float [ %4955, %4945 ], [ %7275, %__nv_exp2f.exit1315 ] + %7445 = phi float [ %4956, %4945 ], [ %7276, %__nv_exp2f.exit1315 ] + %7446 = phi float [ %4957, %4945 ], [ %7277, %__nv_exp2f.exit1315 ] + %7447 = phi float [ %4958, %4945 ], [ %7278, %__nv_exp2f.exit1315 ] + %7448 = phi float [ %4959, %4945 ], [ %7279, %__nv_exp2f.exit1315 ] + %7449 = phi float [ %4960, %4945 ], [ %7280, %__nv_exp2f.exit1315 ] + %7450 = phi float [ %4961, %4945 ], [ %7281, %__nv_exp2f.exit1315 ] + %7451 = phi float [ %4962, %4945 ], [ %7282, %__nv_exp2f.exit1315 ] + %7452 = phi float [ %4963, %4945 ], [ %7283, %__nv_exp2f.exit1315 ] + %7453 = phi float [ %4964, %4945 ], [ %7284, %__nv_exp2f.exit1315 ] + %7454 = phi float [ %4965, %4945 ], [ %7285, %__nv_exp2f.exit1315 ] + %7455 = phi float [ %4966, %4945 ], [ %7286, %__nv_exp2f.exit1315 ] + %7456 = phi float [ %4967, %4945 ], [ %7287, %__nv_exp2f.exit1315 ] + %7457 = phi float [ %4968, %4945 ], [ %7288, %__nv_exp2f.exit1315 ] + %7458 = phi float [ %4969, %4945 ], [ %7289, %__nv_exp2f.exit1315 ] + %7459 = phi float [ %4970, %4945 ], [ %7290, %__nv_exp2f.exit1315 ] + %7460 = phi float [ %4971, %4945 ], [ %7291, %__nv_exp2f.exit1315 ] + %7461 = phi float [ %4972, %4945 ], [ %7292, %__nv_exp2f.exit1315 ] + %7462 = phi float [ %4973, %4945 ], [ %7293, %__nv_exp2f.exit1315 ] + %7463 = phi float [ %4974, %4945 ], [ %7294, %__nv_exp2f.exit1315 ] + %7464 = phi float [ %4975, %4945 ], [ %7295, %__nv_exp2f.exit1315 ] + %7465 = phi float [ %4976, %4945 ], [ %7296, %__nv_exp2f.exit1315 ] + %7466 = phi float [ %4977, %4945 ], [ %7297, %__nv_exp2f.exit1315 ] + %7467 = phi float [ %4978, %4945 ], [ %7298, %__nv_exp2f.exit1315 ] + %7468 = phi float [ %4979, %4945 ], [ %7299, %__nv_exp2f.exit1315 ] + %7469 = phi float [ %4980, %4945 ], [ %7300, %__nv_exp2f.exit1315 ] + %7470 = phi float [ %4981, %4945 ], [ %7301, %__nv_exp2f.exit1315 ] + %7471 = phi float [ %4982, %4945 ], [ %7302, %__nv_exp2f.exit1315 ] + %7472 = phi float [ %4983, %4945 ], [ %7303, %__nv_exp2f.exit1315 ] + %7473 = phi float [ %4984, %4945 ], [ %7304, %__nv_exp2f.exit1315 ] + %7474 = phi float [ %4985, %4945 ], [ %7305, %__nv_exp2f.exit1315 ] + %7475 = phi float [ %4986, %4945 ], [ %7306, %__nv_exp2f.exit1315 ] + %7476 = phi float [ %4987, %4945 ], [ %7307, %__nv_exp2f.exit1315 ] + %7477 = phi float [ %4988, %4945 ], [ %7308, %__nv_exp2f.exit1315 ] + %7478 = phi float [ %4989, %4945 ], [ %7309, %__nv_exp2f.exit1315 ] + %7479 = phi float [ %4990, %4945 ], [ %7310, %__nv_exp2f.exit1315 ] + %7480 = phi float [ %4991, %4945 ], [ %7311, %__nv_exp2f.exit1315 ] + %7481 = phi float [ %4992, %4945 ], [ %7312, %__nv_exp2f.exit1315 ] + %7482 = phi float [ %4993, %4945 ], [ %7313, %__nv_exp2f.exit1315 ] + %7483 = phi float [ %4994, %4945 ], [ %7314, %__nv_exp2f.exit1315 ] + %7484 = phi float [ %4995, %4945 ], [ %7315, %__nv_exp2f.exit1315 ] + %7485 = phi float [ %4996, %4945 ], [ %7316, %__nv_exp2f.exit1315 ] + %7486 = phi float [ %4997, %4945 ], [ %7317, %__nv_exp2f.exit1315 ] + %7487 = phi float [ %4998, %4945 ], [ %7318, %__nv_exp2f.exit1315 ] + %7488 = phi float [ %4999, %4945 ], [ %7319, %__nv_exp2f.exit1315 ] + %7489 = phi float [ %5000, %4945 ], [ %7320, %__nv_exp2f.exit1315 ] + %7490 = phi float [ %5001, %4945 ], [ %7321, %__nv_exp2f.exit1315 ] + %7491 = phi float [ %5002, %4945 ], [ %7322, %__nv_exp2f.exit1315 ] + %7492 = phi float [ %5003, %4945 ], [ %7323, %__nv_exp2f.exit1315 ] + %7493 = phi float [ %5004, %4945 ], [ %7324, %__nv_exp2f.exit1315 ] + %7494 = phi float [ %5005, %4945 ], [ %7325, %__nv_exp2f.exit1315 ] + %7495 = phi float [ %5006, %4945 ], [ %7326, %__nv_exp2f.exit1315 ] + %7496 = phi float [ %5007, %4945 ], [ %7327, %__nv_exp2f.exit1315 ] + %7497 = phi float [ %5008, %4945 ], [ %7328, %__nv_exp2f.exit1315 ] + %7498 = phi float [ %5009, %4945 ], [ %7329, %__nv_exp2f.exit1315 ] + %7499 = phi float [ %5010, %4945 ], [ %6410, %__nv_exp2f.exit1315 ] + %7500 = phi float [ %5011, %4945 ], [ %6411, %__nv_exp2f.exit1315 ] + %7501 = phi float [ %5012, %4945 ], [ %6412, %__nv_exp2f.exit1315 ] + %7502 = phi float [ %5013, %4945 ], [ %6413, %__nv_exp2f.exit1315 ] + %7503 = phi float [ %5014, %4945 ], [ %6414, %__nv_exp2f.exit1315 ] + %7504 = phi float [ %5015, %4945 ], [ %6415, %__nv_exp2f.exit1315 ] + %7505 = phi float [ %5016, %4945 ], [ %6416, %__nv_exp2f.exit1315 ] + %7506 = phi float [ %5017, %4945 ], [ %6417, %__nv_exp2f.exit1315 ] + %7507 = phi float [ %5018, %4945 ], [ %6418, %__nv_exp2f.exit1315 ] + %7508 = phi float [ %5019, %4945 ], [ %6419, %__nv_exp2f.exit1315 ] + %7509 = phi float [ %5020, %4945 ], [ %6420, %__nv_exp2f.exit1315 ] + %7510 = phi float [ %5021, %4945 ], [ %6421, %__nv_exp2f.exit1315 ] + %7511 = phi float [ %5022, %4945 ], [ %6422, %__nv_exp2f.exit1315 ] + %7512 = phi float [ %5023, %4945 ], [ %6423, %__nv_exp2f.exit1315 ] + %7513 = phi float [ %5024, %4945 ], [ %6424, %__nv_exp2f.exit1315 ] + %7514 = phi float [ %5025, %4945 ], [ %6425, %__nv_exp2f.exit1315 ] + %7515 = phi float [ %5026, %4945 ], [ %6426, %__nv_exp2f.exit1315 ] + %7516 = phi float [ %5027, %4945 ], [ %6427, %__nv_exp2f.exit1315 ] + %7517 = phi float [ %5028, %4945 ], [ %6428, %__nv_exp2f.exit1315 ] + %7518 = phi float [ %5029, %4945 ], [ %6429, %__nv_exp2f.exit1315 ] + %7519 = phi float [ %5030, %4945 ], [ %6430, %__nv_exp2f.exit1315 ] + %7520 = phi float [ %5031, %4945 ], [ %6431, %__nv_exp2f.exit1315 ] + %7521 = phi float [ %5032, %4945 ], [ %6432, %__nv_exp2f.exit1315 ] + %7522 = phi float [ %5033, %4945 ], [ %6433, %__nv_exp2f.exit1315 ] + %7523 = phi float [ %5034, %4945 ], [ %6434, %__nv_exp2f.exit1315 ] + %7524 = phi float [ %5035, %4945 ], [ %6435, %__nv_exp2f.exit1315 ] + %7525 = phi float [ %5036, %4945 ], [ %6436, %__nv_exp2f.exit1315 ] + %7526 = phi float [ %5037, %4945 ], [ %6437, %__nv_exp2f.exit1315 ] + %7527 = phi float [ %5038, %4945 ], [ %6438, %__nv_exp2f.exit1315 ] + %7528 = phi float [ %5039, %4945 ], [ %6439, %__nv_exp2f.exit1315 ] + %7529 = phi float [ %5040, %4945 ], [ %6440, %__nv_exp2f.exit1315 ] + %7530 = phi float [ %5041, %4945 ], [ %6441, %__nv_exp2f.exit1315 ] + %7531 = phi float [ %5042, %4945 ], [ %6442, %__nv_exp2f.exit1315 ] + %7532 = phi float [ %5043, %4945 ], [ %6443, %__nv_exp2f.exit1315 ] + %7533 = phi float [ %5044, %4945 ], [ %6444, %__nv_exp2f.exit1315 ] + %7534 = phi float [ %5045, %4945 ], [ %6445, %__nv_exp2f.exit1315 ] + %7535 = phi float [ %5046, %4945 ], [ %6446, %__nv_exp2f.exit1315 ] + %7536 = phi float [ %5047, %4945 ], [ %6447, %__nv_exp2f.exit1315 ] + %7537 = phi float [ %5048, %4945 ], [ %6448, %__nv_exp2f.exit1315 ] + %7538 = phi float [ %5049, %4945 ], [ %6449, %__nv_exp2f.exit1315 ] + %7539 = phi float [ %5050, %4945 ], [ %6450, %__nv_exp2f.exit1315 ] + %7540 = phi float [ %5051, %4945 ], [ %6451, %__nv_exp2f.exit1315 ] + %7541 = phi float [ %5052, %4945 ], [ %6452, %__nv_exp2f.exit1315 ] + %7542 = phi float [ %5053, %4945 ], [ %6453, %__nv_exp2f.exit1315 ] + %7543 = phi float [ %5054, %4945 ], [ %6454, %__nv_exp2f.exit1315 ] + %7544 = phi float [ %5055, %4945 ], [ %6455, %__nv_exp2f.exit1315 ] + %7545 = phi float [ %5056, %4945 ], [ %6456, %__nv_exp2f.exit1315 ] + %7546 = phi float [ %5057, %4945 ], [ %6457, %__nv_exp2f.exit1315 ] + %7547 = phi float [ %5058, %4945 ], [ %6458, %__nv_exp2f.exit1315 ] + %7548 = phi float [ %5059, %4945 ], [ %6459, %__nv_exp2f.exit1315 ] + %7549 = phi float [ %5060, %4945 ], [ %6460, %__nv_exp2f.exit1315 ] + %7550 = phi float [ %5061, %4945 ], [ %6461, %__nv_exp2f.exit1315 ] + %7551 = phi float [ %5062, %4945 ], [ %6462, %__nv_exp2f.exit1315 ] + %7552 = phi float [ %5063, %4945 ], [ %6463, %__nv_exp2f.exit1315 ] + %7553 = phi float [ %5064, %4945 ], [ %6464, %__nv_exp2f.exit1315 ] + %7554 = phi float [ %5065, %4945 ], [ %6465, %__nv_exp2f.exit1315 ] + %7555 = phi float [ %5066, %4945 ], [ %6466, %__nv_exp2f.exit1315 ] + %7556 = phi float [ %5067, %4945 ], [ %6467, %__nv_exp2f.exit1315 ] + %7557 = phi float [ %5068, %4945 ], [ %6468, %__nv_exp2f.exit1315 ] + %7558 = phi float [ %5069, %4945 ], [ %6469, %__nv_exp2f.exit1315 ] + %7559 = phi float [ %5070, %4945 ], [ %6470, %__nv_exp2f.exit1315 ] + %7560 = phi float [ %5071, %4945 ], [ %6471, %__nv_exp2f.exit1315 ] + %7561 = phi float [ %5072, %4945 ], [ %6472, %__nv_exp2f.exit1315 ] + %7562 = phi float [ %5073, %4945 ], [ %6473, %__nv_exp2f.exit1315 ] + %7563 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %7499, float %7500, float %7501, float %7502, float %7503, float %7504, float %7505, float %7506, float %7507, float %7508, float %7509, float %7510, float %7511, float %7512, float %7513, float %7514, float %7515, float %7516, float %7517, float %7518, float %7519, float %7520, float %7521, float %7522, float %7523, float %7524, float %7525, float %7526, float %7527, float %7528, float %7529, float %7530, float %7531, float %7532, float %7533, float %7534, float %7535, float %7536, float %7537, float %7538, float %7539, float %7540, float %7541, float %7542, float %7543, float %7544, float %7545, float %7546, float %7547, float %7548, float %7549, float %7550, float %7551, float %7552, float %7553, float %7554, float %7555, float %7556, float %7557, float %7558, float %7559, float %7560, float %7561, float %7562, float %7435, float %7436, float %7437, float %7438, float %7439, float %7440, float %7441, float %7442, float %7443, float %7444, float %7445, float %7446, float %7447, float %7448, float %7449, float %7450, float %7451, float %7452, float %7453, float %7454, float %7455, float %7456, float %7457, float %7458, float %7459, float %7460, float %7461, float %7462, float %7463, float %7464, float %7465, float %7466, float %7467, float %7468, float %7469, float %7470, float %7471, float %7472, float %7473, float %7474, float %7475, float %7476, float %7477, float %7478, float %7479, float %7480, float %7481, float %7482, float %7483, float %7484, float %7485, float %7486, float %7487, float %7488, float %7489, float %7490, float %7491, float %7492, float %7493, float %7494, float %7495, float %7496, float %7497, float %7498) #3, !dbg !213 + %7564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 0, !dbg !213 + %7565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 1, !dbg !213 + %7566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 2, !dbg !213 + %7567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 3, !dbg !213 + %7568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 4, !dbg !213 + %7569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 5, !dbg !213 + %7570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 6, !dbg !213 + %7571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 7, !dbg !213 + %7572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 8, !dbg !213 + %7573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 9, !dbg !213 + %7574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 10, !dbg !213 + %7575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 11, !dbg !213 + %7576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 12, !dbg !213 + %7577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 13, !dbg !213 + %7578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 14, !dbg !213 + %7579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 15, !dbg !213 + %7580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 16, !dbg !213 + %7581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 17, !dbg !213 + %7582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 18, !dbg !213 + %7583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 19, !dbg !213 + %7584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 20, !dbg !213 + %7585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 21, !dbg !213 + %7586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 22, !dbg !213 + %7587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 23, !dbg !213 + %7588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 24, !dbg !213 + %7589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 25, !dbg !213 + %7590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 26, !dbg !213 + %7591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 27, !dbg !213 + %7592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 28, !dbg !213 + %7593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 29, !dbg !213 + %7594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 30, !dbg !213 + %7595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 31, !dbg !213 + %7596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 32, !dbg !213 + %7597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 33, !dbg !213 + %7598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 34, !dbg !213 + %7599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 35, !dbg !213 + %7600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 36, !dbg !213 + %7601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 37, !dbg !213 + %7602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 38, !dbg !213 + %7603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 39, !dbg !213 + %7604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 40, !dbg !213 + %7605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 41, !dbg !213 + %7606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 42, !dbg !213 + %7607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 43, !dbg !213 + %7608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 44, !dbg !213 + %7609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 45, !dbg !213 + %7610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 46, !dbg !213 + %7611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 47, !dbg !213 + %7612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 48, !dbg !213 + %7613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 49, !dbg !213 + %7614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 50, !dbg !213 + %7615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 51, !dbg !213 + %7616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 52, !dbg !213 + %7617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 53, !dbg !213 + %7618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 54, !dbg !213 + %7619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 55, !dbg !213 + %7620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 56, !dbg !213 + %7621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 57, !dbg !213 + %7622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 58, !dbg !213 + %7623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 59, !dbg !213 + %7624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 60, !dbg !213 + %7625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 61, !dbg !213 + %7626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 62, !dbg !213 + %7627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 63, !dbg !213 + %7628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 64, !dbg !213 + %7629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 65, !dbg !213 + %7630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 66, !dbg !213 + %7631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 67, !dbg !213 + %7632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 68, !dbg !213 + %7633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 69, !dbg !213 + %7634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 70, !dbg !213 + %7635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 71, !dbg !213 + %7636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 72, !dbg !213 + %7637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 73, !dbg !213 + %7638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 74, !dbg !213 + %7639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 75, !dbg !213 + %7640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 76, !dbg !213 + %7641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 77, !dbg !213 + %7642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 78, !dbg !213 + %7643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 79, !dbg !213 + %7644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 80, !dbg !213 + %7645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 81, !dbg !213 + %7646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 82, !dbg !213 + %7647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 83, !dbg !213 + %7648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 84, !dbg !213 + %7649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 85, !dbg !213 + %7650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 86, !dbg !213 + %7651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 87, !dbg !213 + %7652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 88, !dbg !213 + %7653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 89, !dbg !213 + %7654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 90, !dbg !213 + %7655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 91, !dbg !213 + %7656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 92, !dbg !213 + %7657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 93, !dbg !213 + %7658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 94, !dbg !213 + %7659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 95, !dbg !213 + %7660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 96, !dbg !213 + %7661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 97, !dbg !213 + %7662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 98, !dbg !213 + %7663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 99, !dbg !213 + %7664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 100, !dbg !213 + %7665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 101, !dbg !213 + %7666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 102, !dbg !213 + %7667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 103, !dbg !213 + %7668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 104, !dbg !213 + %7669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 105, !dbg !213 + %7670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 106, !dbg !213 + %7671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 107, !dbg !213 + %7672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 108, !dbg !213 + %7673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 109, !dbg !213 + %7674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 110, !dbg !213 + %7675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 111, !dbg !213 + %7676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 112, !dbg !213 + %7677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 113, !dbg !213 + %7678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 114, !dbg !213 + %7679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 115, !dbg !213 + %7680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 116, !dbg !213 + %7681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 117, !dbg !213 + %7682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 118, !dbg !213 + %7683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 119, !dbg !213 + %7684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 120, !dbg !213 + %7685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 121, !dbg !213 + %7686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 122, !dbg !213 + %7687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 123, !dbg !213 + %7688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 124, !dbg !213 + %7689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 125, !dbg !213 + %7690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 126, !dbg !213 + %7691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 127, !dbg !213 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !213 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !213 + %7692 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4886, !dbg !286 + %7693 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4887, !dbg !286 + %7694 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4888, !dbg !286 + %7695 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4889, !dbg !286 + %7696 = getelementptr bfloat, ptr addrspace(1) %7692, i64 %4498, !dbg !287 + %7697 = getelementptr bfloat, ptr addrspace(1) %7693, i64 %4498, !dbg !287 + %7698 = getelementptr bfloat, ptr addrspace(1) %7694, i64 %4498, !dbg !287 + %7699 = getelementptr bfloat, ptr addrspace(1) %7695, i64 %4498, !dbg !287 + %7700 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4890, !dbg !288 + %7701 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4891, !dbg !288 + %7702 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4892, !dbg !288 + %7703 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4893, !dbg !288 + %7704 = getelementptr bfloat, ptr addrspace(1) %7700, i64 %4498, !dbg !289 + %7705 = getelementptr bfloat, ptr addrspace(1) %7701, i64 %4498, !dbg !289 + %7706 = getelementptr bfloat, ptr addrspace(1) %7702, i64 %4498, !dbg !289 + %7707 = getelementptr bfloat, ptr addrspace(1) %7703, i64 %4498, !dbg !289 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4794, ptr addrspace(1) %7696, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4797, ptr addrspace(1) %7697, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4799, ptr addrspace(1) %7698, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4801, ptr addrspace(1) %7699, i32 %4895) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7708 = getelementptr float, ptr addrspace(1) %5087, i64 %4896, !dbg !291 + %7709 = getelementptr float, ptr addrspace(1) %5087, i64 %4897, !dbg !291 + %7710 = getelementptr float, ptr addrspace(1) %5087, i64 %4898, !dbg !291 + %7711 = getelementptr float, ptr addrspace(1) %5087, i64 %4899, !dbg !291 + %7712 = getelementptr float, ptr addrspace(1) %5087, i64 %4900, !dbg !291 + %7713 = getelementptr float, ptr addrspace(1) %5087, i64 %4901, !dbg !291 + %7714 = getelementptr float, ptr addrspace(1) %5087, i64 %4902, !dbg !291 + %7715 = getelementptr float, ptr addrspace(1) %5087, i64 %4903, !dbg !291 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4813, ptr addrspace(1) %7708, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4816, ptr addrspace(1) %7709, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4818, ptr addrspace(1) %7710, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4820, ptr addrspace(1) %7711, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4822, ptr addrspace(1) %7712, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4824, ptr addrspace(1) %7713, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4826, ptr addrspace(1) %7714, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4828, ptr addrspace(1) %7715, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4829, ptr addrspace(1) %7704, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4830, ptr addrspace(1) %7705, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4831, ptr addrspace(1) %7706, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4832, ptr addrspace(1) %7707, i32 %4895) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7716 = getelementptr float, ptr addrspace(1) %5088, i64 %4896, !dbg !293 + %7717 = getelementptr float, ptr addrspace(1) %5088, i64 %4897, !dbg !293 + %7718 = getelementptr float, ptr addrspace(1) %5088, i64 %4898, !dbg !293 + %7719 = getelementptr float, ptr addrspace(1) %5088, i64 %4899, !dbg !293 + %7720 = getelementptr float, ptr addrspace(1) %5088, i64 %4900, !dbg !293 + %7721 = getelementptr float, ptr addrspace(1) %5088, i64 %4901, !dbg !293 + %7722 = getelementptr float, ptr addrspace(1) %5088, i64 %4902, !dbg !293 + %7723 = getelementptr float, ptr addrspace(1) %5088, i64 %4903, !dbg !293 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4833, ptr addrspace(1) %7716, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4834, ptr addrspace(1) %7717, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4835, ptr addrspace(1) %7718, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4836, ptr addrspace(1) %7719, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4837, ptr addrspace(1) %7720, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4838, ptr addrspace(1) %7721, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4839, ptr addrspace(1) %7722, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4840, ptr addrspace(1) %7723, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + %7724 = getelementptr i8, ptr addrspace(1) %7696, i64 524288, !dbg !295 + %7725 = getelementptr i8, ptr addrspace(1) %7697, i64 524288, !dbg !295 + %7726 = getelementptr i8, ptr addrspace(1) %7698, i64 524288, !dbg !295 + %7727 = getelementptr i8, ptr addrspace(1) %7699, i64 524288, !dbg !295 + %7728 = getelementptr i8, ptr addrspace(1) %7704, i64 16384, !dbg !296 + %7729 = getelementptr i8, ptr addrspace(1) %7705, i64 16384, !dbg !296 + %7730 = getelementptr i8, ptr addrspace(1) %7706, i64 16384, !dbg !296 + %7731 = getelementptr i8, ptr addrspace(1) %7707, i64 16384, !dbg !296 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4850, ptr addrspace(1) %7724, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4852, ptr addrspace(1) %7725, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4853, ptr addrspace(1) %7726, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4854, ptr addrspace(1) %7727, i32 %4914) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7732 = getelementptr float, ptr addrspace(1) %5087, i64 %4915, !dbg !291 + %7733 = getelementptr float, ptr addrspace(1) %5087, i64 %4916, !dbg !291 + %7734 = getelementptr float, ptr addrspace(1) %5087, i64 %4917, !dbg !291 + %7735 = getelementptr float, ptr addrspace(1) %5087, i64 %4918, !dbg !291 + %7736 = getelementptr float, ptr addrspace(1) %5087, i64 %4919, !dbg !291 + %7737 = getelementptr float, ptr addrspace(1) %5087, i64 %4920, !dbg !291 + %7738 = getelementptr float, ptr addrspace(1) %5087, i64 %4921, !dbg !291 + %7739 = getelementptr float, ptr addrspace(1) %5087, i64 %4922, !dbg !291 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4863, ptr addrspace(1) %7732, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4865, ptr addrspace(1) %7733, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4866, ptr addrspace(1) %7734, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4867, ptr addrspace(1) %7735, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4868, ptr addrspace(1) %7736, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4869, ptr addrspace(1) %7737, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4870, ptr addrspace(1) %7738, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4871, ptr addrspace(1) %7739, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4872, ptr addrspace(1) %7728, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4873, ptr addrspace(1) %7729, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4874, ptr addrspace(1) %7730, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4875, ptr addrspace(1) %7731, i32 %4914) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7740 = getelementptr float, ptr addrspace(1) %5088, i64 %4915, !dbg !293 + %7741 = getelementptr float, ptr addrspace(1) %5088, i64 %4916, !dbg !293 + %7742 = getelementptr float, ptr addrspace(1) %5088, i64 %4917, !dbg !293 + %7743 = getelementptr float, ptr addrspace(1) %5088, i64 %4918, !dbg !293 + %7744 = getelementptr float, ptr addrspace(1) %5088, i64 %4919, !dbg !293 + %7745 = getelementptr float, ptr addrspace(1) %5088, i64 %4920, !dbg !293 + %7746 = getelementptr float, ptr addrspace(1) %5088, i64 %4921, !dbg !293 + %7747 = getelementptr float, ptr addrspace(1) %5088, i64 %4922, !dbg !293 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4876, ptr addrspace(1) %7740, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4877, ptr addrspace(1) %7741, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4878, ptr addrspace(1) %7742, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4879, ptr addrspace(1) %7743, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4880, ptr addrspace(1) %7744, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4881, ptr addrspace(1) %7745, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4882, ptr addrspace(1) %7746, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4883, ptr addrspace(1) %7747, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + br i1 %4894, label %.lr.ph1691, label %._crit_edge1692, !dbg !297 + +.lr.ph1691: ; preds = %._crit_edge, %__nv_exp2f.exit1219 + %7748 = phi i32 [ %7757, %__nv_exp2f.exit1219 ], [ -1, %._crit_edge ] + %7749 = phi i32 [ %9619, %__nv_exp2f.exit1219 ], [ 1, %._crit_edge ] + %7750 = phi i32 [ %7760, %__nv_exp2f.exit1219 ], [ -1, %._crit_edge ] + %7751 = phi i32 [ %9622, %__nv_exp2f.exit1219 ], [ 1, %._crit_edge ] + %.pn6361689 = phi i32 [ %9616, %__nv_exp2f.exit1219 ], [ %4913, %._crit_edge ] + %.pn6401688 = phi i32 [ %9615, %__nv_exp2f.exit1219 ], [ %4912, %._crit_edge ] + %.pn6441687 = phi i32 [ %9614, %__nv_exp2f.exit1219 ], [ %4911, %._crit_edge ] + %.pn6481686 = phi i32 [ %9613, %__nv_exp2f.exit1219 ], [ %4910, %._crit_edge ] + %.pn6521685 = phi i32 [ %9612, %__nv_exp2f.exit1219 ], [ %4909, %._crit_edge ] + %.pn6561684 = phi i32 [ %9611, %__nv_exp2f.exit1219 ], [ %4908, %._crit_edge ] + %.pn6601683 = phi i32 [ %9610, %__nv_exp2f.exit1219 ], [ %4907, %._crit_edge ] + %.pn6641682 = phi i32 [ %9609, %__nv_exp2f.exit1219 ], [ %4906, %._crit_edge ] + %.pn5841681 = phi ptr addrspace(1) [ %9608, %__nv_exp2f.exit1219 ], [ %7731, %._crit_edge ] + %.pn6001680 = phi ptr addrspace(1) [ %9607, %__nv_exp2f.exit1219 ], [ %7730, %._crit_edge ] + %.pn6161679 = phi ptr addrspace(1) [ %9606, %__nv_exp2f.exit1219 ], [ %7729, %._crit_edge ] + %.pn6321678 = phi ptr addrspace(1) [ %9605, %__nv_exp2f.exit1219 ], [ %7728, %._crit_edge ] + %.pn5201677 = phi ptr addrspace(1) [ %9602, %__nv_exp2f.exit1219 ], [ %7727, %._crit_edge ] + %.pn5361676 = phi ptr addrspace(1) [ %9601, %__nv_exp2f.exit1219 ], [ %7726, %._crit_edge ] + %.pn5521675 = phi ptr addrspace(1) [ %9600, %__nv_exp2f.exit1219 ], [ %7725, %._crit_edge ] + %.pn5681674 = phi ptr addrspace(1) [ %9599, %__nv_exp2f.exit1219 ], [ %7724, %._crit_edge ] + %.pn3781673 = phi float [ %8816, %__nv_exp2f.exit1219 ], [ %7627, %._crit_edge ] + %.pn3801672 = phi float [ %8815, %__nv_exp2f.exit1219 ], [ %7626, %._crit_edge ] + %.pn3821671 = phi float [ %8814, %__nv_exp2f.exit1219 ], [ %7625, %._crit_edge ] + %.pn3841670 = phi float [ %8813, %__nv_exp2f.exit1219 ], [ %7624, %._crit_edge ] + %.pn3861669 = phi float [ %8812, %__nv_exp2f.exit1219 ], [ %7623, %._crit_edge ] + %.pn3881668 = phi float [ %8811, %__nv_exp2f.exit1219 ], [ %7622, %._crit_edge ] + %.pn3901667 = phi float [ %8810, %__nv_exp2f.exit1219 ], [ %7621, %._crit_edge ] + %.pn3921666 = phi float [ %8809, %__nv_exp2f.exit1219 ], [ %7620, %._crit_edge ] + %.pn3941665 = phi float [ %8808, %__nv_exp2f.exit1219 ], [ %7619, %._crit_edge ] + %.pn3961664 = phi float [ %8807, %__nv_exp2f.exit1219 ], [ %7618, %._crit_edge ] + %.pn3981663 = phi float [ %8806, %__nv_exp2f.exit1219 ], [ %7617, %._crit_edge ] + %.pn4001662 = phi float [ %8805, %__nv_exp2f.exit1219 ], [ %7616, %._crit_edge ] + %.pn4021661 = phi float [ %8804, %__nv_exp2f.exit1219 ], [ %7615, %._crit_edge ] + %.pn4041660 = phi float [ %8803, %__nv_exp2f.exit1219 ], [ %7614, %._crit_edge ] + %.pn4061659 = phi float [ %8802, %__nv_exp2f.exit1219 ], [ %7613, %._crit_edge ] + %.pn4081658 = phi float [ %8801, %__nv_exp2f.exit1219 ], [ %7612, %._crit_edge ] + %.pn4101657 = phi float [ %8800, %__nv_exp2f.exit1219 ], [ %7611, %._crit_edge ] + %.pn4121656 = phi float [ %8799, %__nv_exp2f.exit1219 ], [ %7610, %._crit_edge ] + %.pn4141655 = phi float [ %8798, %__nv_exp2f.exit1219 ], [ %7609, %._crit_edge ] + %.pn4161654 = phi float [ %8797, %__nv_exp2f.exit1219 ], [ %7608, %._crit_edge ] + %.pn4181653 = phi float [ %8796, %__nv_exp2f.exit1219 ], [ %7607, %._crit_edge ] + %.pn4201652 = phi float [ %8795, %__nv_exp2f.exit1219 ], [ %7606, %._crit_edge ] + %.pn4221651 = phi float [ %8794, %__nv_exp2f.exit1219 ], [ %7605, %._crit_edge ] + %.pn4241650 = phi float [ %8793, %__nv_exp2f.exit1219 ], [ %7604, %._crit_edge ] + %.pn4261649 = phi float [ %8792, %__nv_exp2f.exit1219 ], [ %7603, %._crit_edge ] + %.pn4281648 = phi float [ %8791, %__nv_exp2f.exit1219 ], [ %7602, %._crit_edge ] + %.pn4301647 = phi float [ %8790, %__nv_exp2f.exit1219 ], [ %7601, %._crit_edge ] + %.pn4321646 = phi float [ %8789, %__nv_exp2f.exit1219 ], [ %7600, %._crit_edge ] + %.pn4341645 = phi float [ %8788, %__nv_exp2f.exit1219 ], [ %7599, %._crit_edge ] + %.pn4361644 = phi float [ %8787, %__nv_exp2f.exit1219 ], [ %7598, %._crit_edge ] + %.pn4381643 = phi float [ %8786, %__nv_exp2f.exit1219 ], [ %7597, %._crit_edge ] + %.pn4401642 = phi float [ %8785, %__nv_exp2f.exit1219 ], [ %7596, %._crit_edge ] + %.pn4421641 = phi float [ %8784, %__nv_exp2f.exit1219 ], [ %7595, %._crit_edge ] + %.pn4441640 = phi float [ %8783, %__nv_exp2f.exit1219 ], [ %7594, %._crit_edge ] + %.pn4461639 = phi float [ %8782, %__nv_exp2f.exit1219 ], [ %7593, %._crit_edge ] + %.pn4481638 = phi float [ %8781, %__nv_exp2f.exit1219 ], [ %7592, %._crit_edge ] + %.pn4501637 = phi float [ %8780, %__nv_exp2f.exit1219 ], [ %7591, %._crit_edge ] + %.pn4521636 = phi float [ %8779, %__nv_exp2f.exit1219 ], [ %7590, %._crit_edge ] + %.pn4541635 = phi float [ %8778, %__nv_exp2f.exit1219 ], [ %7589, %._crit_edge ] + %.pn4561634 = phi float [ %8777, %__nv_exp2f.exit1219 ], [ %7588, %._crit_edge ] + %.pn4581633 = phi float [ %8776, %__nv_exp2f.exit1219 ], [ %7587, %._crit_edge ] + %.pn4601632 = phi float [ %8775, %__nv_exp2f.exit1219 ], [ %7586, %._crit_edge ] + %.pn4621631 = phi float [ %8774, %__nv_exp2f.exit1219 ], [ %7585, %._crit_edge ] + %.pn4641630 = phi float [ %8773, %__nv_exp2f.exit1219 ], [ %7584, %._crit_edge ] + %.pn4661629 = phi float [ %8772, %__nv_exp2f.exit1219 ], [ %7583, %._crit_edge ] + %.pn4681628 = phi float [ %8771, %__nv_exp2f.exit1219 ], [ %7582, %._crit_edge ] + %.pn4701627 = phi float [ %8770, %__nv_exp2f.exit1219 ], [ %7581, %._crit_edge ] + %.pn4721626 = phi float [ %8769, %__nv_exp2f.exit1219 ], [ %7580, %._crit_edge ] + %.pn4741625 = phi float [ %8768, %__nv_exp2f.exit1219 ], [ %7579, %._crit_edge ] + %.pn4761624 = phi float [ %8767, %__nv_exp2f.exit1219 ], [ %7578, %._crit_edge ] + %.pn4781623 = phi float [ %8766, %__nv_exp2f.exit1219 ], [ %7577, %._crit_edge ] + %.pn4801622 = phi float [ %8765, %__nv_exp2f.exit1219 ], [ %7576, %._crit_edge ] + %.pn4821621 = phi float [ %8764, %__nv_exp2f.exit1219 ], [ %7575, %._crit_edge ] + %.pn4841620 = phi float [ %8763, %__nv_exp2f.exit1219 ], [ %7574, %._crit_edge ] + %.pn4861619 = phi float [ %8762, %__nv_exp2f.exit1219 ], [ %7573, %._crit_edge ] + %.pn4881618 = phi float [ %8761, %__nv_exp2f.exit1219 ], [ %7572, %._crit_edge ] + %.pn4901617 = phi float [ %8760, %__nv_exp2f.exit1219 ], [ %7571, %._crit_edge ] + %.pn4921616 = phi float [ %8759, %__nv_exp2f.exit1219 ], [ %7570, %._crit_edge ] + %.pn4941615 = phi float [ %8758, %__nv_exp2f.exit1219 ], [ %7569, %._crit_edge ] + %.pn4961614 = phi float [ %8757, %__nv_exp2f.exit1219 ], [ %7568, %._crit_edge ] + %.pn4981613 = phi float [ %8756, %__nv_exp2f.exit1219 ], [ %7567, %._crit_edge ] + %.pn5001612 = phi float [ %8755, %__nv_exp2f.exit1219 ], [ %7566, %._crit_edge ] + %.pn5021611 = phi float [ %8754, %__nv_exp2f.exit1219 ], [ %7565, %._crit_edge ] + %.pn5041610 = phi float [ %8753, %__nv_exp2f.exit1219 ], [ %7564, %._crit_edge ] + %.pn2501609 = phi float [ %9576, %__nv_exp2f.exit1219 ], [ %7691, %._crit_edge ] + %.pn2521608 = phi float [ %9575, %__nv_exp2f.exit1219 ], [ %7690, %._crit_edge ] + %.pn2541607 = phi float [ %9574, %__nv_exp2f.exit1219 ], [ %7689, %._crit_edge ] + %.pn2561606 = phi float [ %9573, %__nv_exp2f.exit1219 ], [ %7688, %._crit_edge ] + %.pn2581605 = phi float [ %9572, %__nv_exp2f.exit1219 ], [ %7687, %._crit_edge ] + %.pn2601604 = phi float [ %9571, %__nv_exp2f.exit1219 ], [ %7686, %._crit_edge ] + %.pn2621603 = phi float [ %9570, %__nv_exp2f.exit1219 ], [ %7685, %._crit_edge ] + %.pn2641602 = phi float [ %9569, %__nv_exp2f.exit1219 ], [ %7684, %._crit_edge ] + %.pn2661601 = phi float [ %9568, %__nv_exp2f.exit1219 ], [ %7683, %._crit_edge ] + %.pn2681600 = phi float [ %9567, %__nv_exp2f.exit1219 ], [ %7682, %._crit_edge ] + %.pn2701599 = phi float [ %9566, %__nv_exp2f.exit1219 ], [ %7681, %._crit_edge ] + %.pn2721598 = phi float [ %9565, %__nv_exp2f.exit1219 ], [ %7680, %._crit_edge ] + %.pn2741597 = phi float [ %9564, %__nv_exp2f.exit1219 ], [ %7679, %._crit_edge ] + %.pn2761596 = phi float [ %9563, %__nv_exp2f.exit1219 ], [ %7678, %._crit_edge ] + %.pn2781595 = phi float [ %9562, %__nv_exp2f.exit1219 ], [ %7677, %._crit_edge ] + %.pn2801594 = phi float [ %9561, %__nv_exp2f.exit1219 ], [ %7676, %._crit_edge ] + %.pn2821593 = phi float [ %9560, %__nv_exp2f.exit1219 ], [ %7675, %._crit_edge ] + %.pn2841592 = phi float [ %9559, %__nv_exp2f.exit1219 ], [ %7674, %._crit_edge ] + %.pn2861591 = phi float [ %9558, %__nv_exp2f.exit1219 ], [ %7673, %._crit_edge ] + %.pn2881590 = phi float [ %9557, %__nv_exp2f.exit1219 ], [ %7672, %._crit_edge ] + %.pn2901589 = phi float [ %9556, %__nv_exp2f.exit1219 ], [ %7671, %._crit_edge ] + %.pn2921588 = phi float [ %9555, %__nv_exp2f.exit1219 ], [ %7670, %._crit_edge ] + %.pn2941587 = phi float [ %9554, %__nv_exp2f.exit1219 ], [ %7669, %._crit_edge ] + %.pn2961586 = phi float [ %9553, %__nv_exp2f.exit1219 ], [ %7668, %._crit_edge ] + %.pn2981585 = phi float [ %9552, %__nv_exp2f.exit1219 ], [ %7667, %._crit_edge ] + %.pn3001584 = phi float [ %9551, %__nv_exp2f.exit1219 ], [ %7666, %._crit_edge ] + %.pn3021583 = phi float [ %9550, %__nv_exp2f.exit1219 ], [ %7665, %._crit_edge ] + %.pn3041582 = phi float [ %9549, %__nv_exp2f.exit1219 ], [ %7664, %._crit_edge ] + %.pn3061581 = phi float [ %9548, %__nv_exp2f.exit1219 ], [ %7663, %._crit_edge ] + %.pn3081580 = phi float [ %9547, %__nv_exp2f.exit1219 ], [ %7662, %._crit_edge ] + %.pn3101579 = phi float [ %9546, %__nv_exp2f.exit1219 ], [ %7661, %._crit_edge ] + %.pn3121578 = phi float [ %9545, %__nv_exp2f.exit1219 ], [ %7660, %._crit_edge ] + %.pn3141577 = phi float [ %9544, %__nv_exp2f.exit1219 ], [ %7659, %._crit_edge ] + %.pn3161576 = phi float [ %9543, %__nv_exp2f.exit1219 ], [ %7658, %._crit_edge ] + %.pn3181575 = phi float [ %9542, %__nv_exp2f.exit1219 ], [ %7657, %._crit_edge ] + %.pn3201574 = phi float [ %9541, %__nv_exp2f.exit1219 ], [ %7656, %._crit_edge ] + %.pn3221573 = phi float [ %9540, %__nv_exp2f.exit1219 ], [ %7655, %._crit_edge ] + %.pn3241572 = phi float [ %9539, %__nv_exp2f.exit1219 ], [ %7654, %._crit_edge ] + %.pn3261571 = phi float [ %9538, %__nv_exp2f.exit1219 ], [ %7653, %._crit_edge ] + %.pn3281570 = phi float [ %9537, %__nv_exp2f.exit1219 ], [ %7652, %._crit_edge ] + %.pn3301569 = phi float [ %9536, %__nv_exp2f.exit1219 ], [ %7651, %._crit_edge ] + %.pn3321568 = phi float [ %9535, %__nv_exp2f.exit1219 ], [ %7650, %._crit_edge ] + %.pn3341567 = phi float [ %9534, %__nv_exp2f.exit1219 ], [ %7649, %._crit_edge ] + %.pn3361566 = phi float [ %9533, %__nv_exp2f.exit1219 ], [ %7648, %._crit_edge ] + %.pn3381565 = phi float [ %9532, %__nv_exp2f.exit1219 ], [ %7647, %._crit_edge ] + %.pn3401564 = phi float [ %9531, %__nv_exp2f.exit1219 ], [ %7646, %._crit_edge ] + %.pn3421563 = phi float [ %9530, %__nv_exp2f.exit1219 ], [ %7645, %._crit_edge ] + %.pn3441562 = phi float [ %9529, %__nv_exp2f.exit1219 ], [ %7644, %._crit_edge ] + %.pn3461561 = phi float [ %9528, %__nv_exp2f.exit1219 ], [ %7643, %._crit_edge ] + %.pn3481560 = phi float [ %9527, %__nv_exp2f.exit1219 ], [ %7642, %._crit_edge ] + %.pn3501559 = phi float [ %9526, %__nv_exp2f.exit1219 ], [ %7641, %._crit_edge ] + %.pn3521558 = phi float [ %9525, %__nv_exp2f.exit1219 ], [ %7640, %._crit_edge ] + %.pn3541557 = phi float [ %9524, %__nv_exp2f.exit1219 ], [ %7639, %._crit_edge ] + %.pn3561556 = phi float [ %9523, %__nv_exp2f.exit1219 ], [ %7638, %._crit_edge ] + %.pn3581555 = phi float [ %9522, %__nv_exp2f.exit1219 ], [ %7637, %._crit_edge ] + %.pn3601554 = phi float [ %9521, %__nv_exp2f.exit1219 ], [ %7636, %._crit_edge ] + %.pn3621553 = phi float [ %9520, %__nv_exp2f.exit1219 ], [ %7635, %._crit_edge ] + %.pn3641552 = phi float [ %9519, %__nv_exp2f.exit1219 ], [ %7634, %._crit_edge ] + %.pn3661551 = phi float [ %9518, %__nv_exp2f.exit1219 ], [ %7633, %._crit_edge ] + %.pn3681550 = phi float [ %9517, %__nv_exp2f.exit1219 ], [ %7632, %._crit_edge ] + %.pn3701549 = phi float [ %9516, %__nv_exp2f.exit1219 ], [ %7631, %._crit_edge ] + %.pn3721548 = phi float [ %9515, %__nv_exp2f.exit1219 ], [ %7630, %._crit_edge ] + %.pn3741547 = phi float [ %9514, %__nv_exp2f.exit1219 ], [ %7629, %._crit_edge ] + %.pn3761546 = phi float [ %9513, %__nv_exp2f.exit1219 ], [ %7628, %._crit_edge ] + %7752 = phi i32 [ %9577, %__nv_exp2f.exit1219 ], [ 0, %._crit_edge ] + %7753 = icmp slt i32 %7752, %4924, !dbg !297 + %7754 = icmp slt i32 %7752, %4925, !dbg !297 + %7755 = add i32 %7748, 1, !dbg !297 + %7756 = icmp sgt i32 %7755, 1, !dbg !297 + %7757 = select i1 %7756, i32 0, i32 %7755, !dbg !297 + %7758 = add i32 %7750, 1, !dbg !297 + %7759 = icmp sgt i32 %7758, 2, !dbg !297 + %7760 = select i1 %7759, i32 0, i32 %7758, !dbg !297 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !290 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + %7761 = shl i32 %7760, 13, !dbg !290 + %7762 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %7761, !dbg !290 + %7763 = shl i32 %7757, 6, !dbg !292 + %7764 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %7763, !dbg !292 + %7765 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4812, !dbg !292 + %7766 = load float, ptr addrspace(3) %7765, align 8, !dbg !292 + %7767 = getelementptr inbounds nuw i8, ptr addrspace(3) %7765, i32 4, !dbg !292 + %7768 = load float, ptr addrspace(3) %7767, align 4, !dbg !292 + %7769 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4815, !dbg !292 + %7770 = load float, ptr addrspace(3) %7769, align 8, !dbg !292 + %7771 = getelementptr inbounds nuw i8, ptr addrspace(3) %7769, i32 4, !dbg !292 + %7772 = load float, ptr addrspace(3) %7771, align 4, !dbg !292 + %7773 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4817, !dbg !292 + %7774 = load float, ptr addrspace(3) %7773, align 8, !dbg !292 + %7775 = getelementptr inbounds nuw i8, ptr addrspace(3) %7773, i32 4, !dbg !292 + %7776 = load float, ptr addrspace(3) %7775, align 4, !dbg !292 + %7777 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4819, !dbg !292 + %7778 = load float, ptr addrspace(3) %7777, align 8, !dbg !292 + %7779 = getelementptr inbounds nuw i8, ptr addrspace(3) %7777, i32 4, !dbg !292 + %7780 = load float, ptr addrspace(3) %7779, align 4, !dbg !292 + %7781 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4821, !dbg !292 + %7782 = load float, ptr addrspace(3) %7781, align 8, !dbg !292 + %7783 = getelementptr inbounds nuw i8, ptr addrspace(3) %7781, i32 4, !dbg !292 + %7784 = load float, ptr addrspace(3) %7783, align 4, !dbg !292 + %7785 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4823, !dbg !292 + %7786 = load float, ptr addrspace(3) %7785, align 8, !dbg !292 + %7787 = getelementptr inbounds nuw i8, ptr addrspace(3) %7785, i32 4, !dbg !292 + %7788 = load float, ptr addrspace(3) %7787, align 4, !dbg !292 + %7789 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4825, !dbg !292 + %7790 = load float, ptr addrspace(3) %7789, align 8, !dbg !292 + %7791 = getelementptr inbounds nuw i8, ptr addrspace(3) %7789, i32 4, !dbg !292 + %7792 = load float, ptr addrspace(3) %7791, align 4, !dbg !292 + %7793 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4827, !dbg !292 + %7794 = load float, ptr addrspace(3) %7793, align 8, !dbg !292 + %7795 = getelementptr inbounds nuw i8, ptr addrspace(3) %7793, i32 4, !dbg !292 + %7796 = load float, ptr addrspace(3) %7795, align 4, !dbg !292 + %7797 = fcmp oeq float %7766, 0xFFF0000000000000, !dbg !298 + %7798 = fcmp oeq float %7768, 0xFFF0000000000000, !dbg !298 + %7799 = fcmp oeq float %7770, 0xFFF0000000000000, !dbg !298 + %7800 = fcmp oeq float %7772, 0xFFF0000000000000, !dbg !298 + %7801 = fcmp oeq float %7774, 0xFFF0000000000000, !dbg !298 + %7802 = fcmp oeq float %7776, 0xFFF0000000000000, !dbg !298 + %7803 = fcmp oeq float %7778, 0xFFF0000000000000, !dbg !298 + %7804 = fcmp oeq float %7780, 0xFFF0000000000000, !dbg !298 + %7805 = fcmp oeq float %7782, 0xFFF0000000000000, !dbg !298 + %7806 = fcmp oeq float %7784, 0xFFF0000000000000, !dbg !298 + %7807 = fcmp oeq float %7786, 0xFFF0000000000000, !dbg !298 + %7808 = fcmp oeq float %7788, 0xFFF0000000000000, !dbg !298 + %7809 = fcmp oeq float %7790, 0xFFF0000000000000, !dbg !298 + %7810 = fcmp oeq float %7792, 0xFFF0000000000000, !dbg !298 + %7811 = fcmp oeq float %7794, 0xFFF0000000000000, !dbg !298 + %7812 = fcmp oeq float %7796, 0xFFF0000000000000, !dbg !298 + %7813 = select i1 %7797, float 0.000000e+00, float %7766, !dbg !299 + %7814 = select i1 %7798, float 0.000000e+00, float %7768, !dbg !299 + %7815 = select i1 %7799, float 0.000000e+00, float %7770, !dbg !299 + %7816 = select i1 %7800, float 0.000000e+00, float %7772, !dbg !299 + %7817 = select i1 %7801, float 0.000000e+00, float %7774, !dbg !299 + %7818 = select i1 %7802, float 0.000000e+00, float %7776, !dbg !299 + %7819 = select i1 %7803, float 0.000000e+00, float %7778, !dbg !299 + %7820 = select i1 %7804, float 0.000000e+00, float %7780, !dbg !299 + %7821 = select i1 %7805, float 0.000000e+00, float %7782, !dbg !299 + %7822 = select i1 %7806, float 0.000000e+00, float %7784, !dbg !299 + %7823 = select i1 %7807, float 0.000000e+00, float %7786, !dbg !299 + %7824 = select i1 %7808, float 0.000000e+00, float %7788, !dbg !299 + %7825 = select i1 %7809, float 0.000000e+00, float %7790, !dbg !299 + %7826 = select i1 %7810, float 0.000000e+00, float %7792, !dbg !299 + %7827 = select i1 %7811, float 0.000000e+00, float %7794, !dbg !299 + %7828 = select i1 %7812, float 0.000000e+00, float %7796, !dbg !299 + %7829 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !300 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !300 + %7830 = shl i32 %7829, 11, !dbg !300 + %7831 = and i32 %7830, 8192, !dbg !300 + %7832 = add i32 %7831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7833 = lshr exact i32 %7832, 4, !dbg !300 + %7834 = and i32 %7833, 16383, !dbg !300 + %7835 = zext nneg i32 %7834 to i64, !dbg !300 + %7836 = or disjoint i64 %7835, 4611686293372403712, !dbg !300 + %7837 = ptrtoint ptr addrspace(3) %7762 to i32, !dbg !300 + %7838 = lshr exact i32 %7837, 4, !dbg !300 + %7839 = and i32 %7838, 16383, !dbg !300 + %7840 = zext nneg i32 %7839 to i64, !dbg !300 + %7841 = or disjoint i64 %7840, 4611686293338849280, !dbg !300 + %7842 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %7836, i64 %7841) #3, !dbg !300 + %7843 = or disjoint i32 %7831, 32, !dbg !300 + %7844 = add i32 %7843, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7845 = lshr exact i32 %7844, 4, !dbg !300 + %7846 = and i32 %7845, 16383, !dbg !300 + %7847 = zext nneg i32 %7846 to i64, !dbg !300 + %7848 = or disjoint i64 %7847, 4611686293372403712, !dbg !300 + %7849 = add i32 %7837, 32, !dbg !300 + %7850 = lshr exact i32 %7849, 4, !dbg !300 + %7851 = and i32 %7850, 16383, !dbg !300 + %7852 = zext nneg i32 %7851 to i64, !dbg !300 + %7853 = or disjoint i64 %7852, 4611686293338849280, !dbg !300 + %7854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 0, !dbg !300 + %7855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 1, !dbg !300 + %7856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 2, !dbg !300 + %7857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 3, !dbg !300 + %7858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 4, !dbg !300 + %7859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 5, !dbg !300 + %7860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 6, !dbg !300 + %7861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 7, !dbg !300 + %7862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 8, !dbg !300 + %7863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 9, !dbg !300 + %7864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 10, !dbg !300 + %7865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 11, !dbg !300 + %7866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 12, !dbg !300 + %7867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 13, !dbg !300 + %7868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 14, !dbg !300 + %7869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 15, !dbg !300 + %7870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 16, !dbg !300 + %7871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 17, !dbg !300 + %7872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 18, !dbg !300 + %7873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 19, !dbg !300 + %7874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 20, !dbg !300 + %7875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 21, !dbg !300 + %7876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 22, !dbg !300 + %7877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 23, !dbg !300 + %7878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 24, !dbg !300 + %7879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 25, !dbg !300 + %7880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 26, !dbg !300 + %7881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 27, !dbg !300 + %7882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 28, !dbg !300 + %7883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 29, !dbg !300 + %7884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 30, !dbg !300 + %7885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 31, !dbg !300 + %7886 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7854, float %7855, float %7856, float %7857, float %7858, float %7859, float %7860, float %7861, float %7862, float %7863, float %7864, float %7865, float %7866, float %7867, float %7868, float %7869, float %7870, float %7871, float %7872, float %7873, float %7874, float %7875, float %7876, float %7877, float %7878, float %7879, float %7880, float %7881, float %7882, float %7883, float %7884, float %7885, i64 %7848, i64 %7853, i1 true) #3, !dbg !300 + %7887 = or disjoint i32 %7831, 64, !dbg !300 + %7888 = add i32 %7887, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7889 = lshr exact i32 %7888, 4, !dbg !300 + %7890 = and i32 %7889, 16383, !dbg !300 + %7891 = zext nneg i32 %7890 to i64, !dbg !300 + %7892 = or disjoint i64 %7891, 4611686293372403712, !dbg !300 + %7893 = add i32 %7837, 64, !dbg !300 + %7894 = lshr exact i32 %7893, 4, !dbg !300 + %7895 = and i32 %7894, 16383, !dbg !300 + %7896 = zext nneg i32 %7895 to i64, !dbg !300 + %7897 = or disjoint i64 %7896, 4611686293338849280, !dbg !300 + %7898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 0, !dbg !300 + %7899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 1, !dbg !300 + %7900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 2, !dbg !300 + %7901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 3, !dbg !300 + %7902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 4, !dbg !300 + %7903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 5, !dbg !300 + %7904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 6, !dbg !300 + %7905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 7, !dbg !300 + %7906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 8, !dbg !300 + %7907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 9, !dbg !300 + %7908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 10, !dbg !300 + %7909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 11, !dbg !300 + %7910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 12, !dbg !300 + %7911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 13, !dbg !300 + %7912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 14, !dbg !300 + %7913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 15, !dbg !300 + %7914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 16, !dbg !300 + %7915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 17, !dbg !300 + %7916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 18, !dbg !300 + %7917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 19, !dbg !300 + %7918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 20, !dbg !300 + %7919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 21, !dbg !300 + %7920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 22, !dbg !300 + %7921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 23, !dbg !300 + %7922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 24, !dbg !300 + %7923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 25, !dbg !300 + %7924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 26, !dbg !300 + %7925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 27, !dbg !300 + %7926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 28, !dbg !300 + %7927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 29, !dbg !300 + %7928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 30, !dbg !300 + %7929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 31, !dbg !300 + %7930 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7898, float %7899, float %7900, float %7901, float %7902, float %7903, float %7904, float %7905, float %7906, float %7907, float %7908, float %7909, float %7910, float %7911, float %7912, float %7913, float %7914, float %7915, float %7916, float %7917, float %7918, float %7919, float %7920, float %7921, float %7922, float %7923, float %7924, float %7925, float %7926, float %7927, float %7928, float %7929, i64 %7892, i64 %7897, i1 true) #3, !dbg !300 + %7931 = or disjoint i32 %7831, 96, !dbg !300 + %7932 = add i32 %7931, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7933 = lshr exact i32 %7932, 4, !dbg !300 + %7934 = and i32 %7933, 16383, !dbg !300 + %7935 = zext nneg i32 %7934 to i64, !dbg !300 + %7936 = or disjoint i64 %7935, 4611686293372403712, !dbg !300 + %7937 = add i32 %7837, 96, !dbg !300 + %7938 = lshr exact i32 %7937, 4, !dbg !300 + %7939 = and i32 %7938, 16383, !dbg !300 + %7940 = zext nneg i32 %7939 to i64, !dbg !300 + %7941 = or disjoint i64 %7940, 4611686293338849280, !dbg !300 + %7942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 0, !dbg !300 + %7943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 1, !dbg !300 + %7944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 2, !dbg !300 + %7945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 3, !dbg !300 + %7946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 4, !dbg !300 + %7947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 5, !dbg !300 + %7948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 6, !dbg !300 + %7949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 7, !dbg !300 + %7950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 8, !dbg !300 + %7951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 9, !dbg !300 + %7952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 10, !dbg !300 + %7953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 11, !dbg !300 + %7954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 12, !dbg !300 + %7955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 13, !dbg !300 + %7956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 14, !dbg !300 + %7957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 15, !dbg !300 + %7958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 16, !dbg !300 + %7959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 17, !dbg !300 + %7960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 18, !dbg !300 + %7961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 19, !dbg !300 + %7962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 20, !dbg !300 + %7963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 21, !dbg !300 + %7964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 22, !dbg !300 + %7965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 23, !dbg !300 + %7966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 24, !dbg !300 + %7967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 25, !dbg !300 + %7968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 26, !dbg !300 + %7969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 27, !dbg !300 + %7970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 28, !dbg !300 + %7971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 29, !dbg !300 + %7972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 30, !dbg !300 + %7973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 31, !dbg !300 + %7974 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7942, float %7943, float %7944, float %7945, float %7946, float %7947, float %7948, float %7949, float %7950, float %7951, float %7952, float %7953, float %7954, float %7955, float %7956, float %7957, float %7958, float %7959, float %7960, float %7961, float %7962, float %7963, float %7964, float %7965, float %7966, float %7967, float %7968, float %7969, float %7970, float %7971, float %7972, float %7973, i64 %7936, i64 %7941, i1 true) #3, !dbg !300 + %7975 = or disjoint i32 %7831, 16384, !dbg !300 + %7976 = add i32 %7975, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7977 = lshr exact i32 %7976, 4, !dbg !300 + %7978 = and i32 %7977, 16383, !dbg !300 + %7979 = zext nneg i32 %7978 to i64, !dbg !300 + %7980 = or disjoint i64 %7979, 4611686293372403712, !dbg !300 + %7981 = add i32 %7837, 8192, !dbg !300 + %7982 = lshr exact i32 %7981, 4, !dbg !300 + %7983 = and i32 %7982, 16383, !dbg !300 + %7984 = zext nneg i32 %7983 to i64, !dbg !300 + %7985 = or disjoint i64 %7984, 4611686293338849280, !dbg !300 + %7986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 0, !dbg !300 + %7987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 1, !dbg !300 + %7988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 2, !dbg !300 + %7989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 3, !dbg !300 + %7990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 4, !dbg !300 + %7991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 5, !dbg !300 + %7992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 6, !dbg !300 + %7993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 7, !dbg !300 + %7994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 8, !dbg !300 + %7995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 9, !dbg !300 + %7996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 10, !dbg !300 + %7997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 11, !dbg !300 + %7998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 12, !dbg !300 + %7999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 13, !dbg !300 + %8000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 14, !dbg !300 + %8001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 15, !dbg !300 + %8002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 16, !dbg !300 + %8003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 17, !dbg !300 + %8004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 18, !dbg !300 + %8005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 19, !dbg !300 + %8006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 20, !dbg !300 + %8007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 21, !dbg !300 + %8008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 22, !dbg !300 + %8009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 23, !dbg !300 + %8010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 24, !dbg !300 + %8011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 25, !dbg !300 + %8012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 26, !dbg !300 + %8013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 27, !dbg !300 + %8014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 28, !dbg !300 + %8015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 29, !dbg !300 + %8016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 30, !dbg !300 + %8017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 31, !dbg !300 + %8018 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7986, float %7987, float %7988, float %7989, float %7990, float %7991, float %7992, float %7993, float %7994, float %7995, float %7996, float %7997, float %7998, float %7999, float %8000, float %8001, float %8002, float %8003, float %8004, float %8005, float %8006, float %8007, float %8008, float %8009, float %8010, float %8011, float %8012, float %8013, float %8014, float %8015, float %8016, float %8017, i64 %7980, i64 %7985, i1 true) #3, !dbg !300 + %8019 = or disjoint i32 %7831, 16416, !dbg !300 + %8020 = add i32 %8019, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8021 = lshr exact i32 %8020, 4, !dbg !300 + %8022 = and i32 %8021, 16383, !dbg !300 + %8023 = zext nneg i32 %8022 to i64, !dbg !300 + %8024 = or disjoint i64 %8023, 4611686293372403712, !dbg !300 + %8025 = add i32 %7837, 8224, !dbg !300 + %8026 = lshr exact i32 %8025, 4, !dbg !300 + %8027 = and i32 %8026, 16383, !dbg !300 + %8028 = zext nneg i32 %8027 to i64, !dbg !300 + %8029 = or disjoint i64 %8028, 4611686293338849280, !dbg !300 + %8030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 0, !dbg !300 + %8031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 1, !dbg !300 + %8032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 2, !dbg !300 + %8033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 3, !dbg !300 + %8034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 4, !dbg !300 + %8035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 5, !dbg !300 + %8036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 6, !dbg !300 + %8037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 7, !dbg !300 + %8038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 8, !dbg !300 + %8039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 9, !dbg !300 + %8040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 10, !dbg !300 + %8041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 11, !dbg !300 + %8042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 12, !dbg !300 + %8043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 13, !dbg !300 + %8044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 14, !dbg !300 + %8045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 15, !dbg !300 + %8046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 16, !dbg !300 + %8047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 17, !dbg !300 + %8048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 18, !dbg !300 + %8049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 19, !dbg !300 + %8050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 20, !dbg !300 + %8051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 21, !dbg !300 + %8052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 22, !dbg !300 + %8053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 23, !dbg !300 + %8054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 24, !dbg !300 + %8055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 25, !dbg !300 + %8056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 26, !dbg !300 + %8057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 27, !dbg !300 + %8058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 28, !dbg !300 + %8059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 29, !dbg !300 + %8060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 30, !dbg !300 + %8061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 31, !dbg !300 + %8062 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8030, float %8031, float %8032, float %8033, float %8034, float %8035, float %8036, float %8037, float %8038, float %8039, float %8040, float %8041, float %8042, float %8043, float %8044, float %8045, float %8046, float %8047, float %8048, float %8049, float %8050, float %8051, float %8052, float %8053, float %8054, float %8055, float %8056, float %8057, float %8058, float %8059, float %8060, float %8061, i64 %8024, i64 %8029, i1 true) #3, !dbg !300 + %8063 = or disjoint i32 %7831, 16448, !dbg !300 + %8064 = add i32 %8063, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8065 = lshr exact i32 %8064, 4, !dbg !300 + %8066 = and i32 %8065, 16383, !dbg !300 + %8067 = zext nneg i32 %8066 to i64, !dbg !300 + %8068 = or disjoint i64 %8067, 4611686293372403712, !dbg !300 + %8069 = add i32 %7837, 8256, !dbg !300 + %8070 = lshr exact i32 %8069, 4, !dbg !300 + %8071 = and i32 %8070, 16383, !dbg !300 + %8072 = zext nneg i32 %8071 to i64, !dbg !300 + %8073 = or disjoint i64 %8072, 4611686293338849280, !dbg !300 + %8074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 0, !dbg !300 + %8075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 1, !dbg !300 + %8076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 2, !dbg !300 + %8077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 3, !dbg !300 + %8078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 4, !dbg !300 + %8079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 5, !dbg !300 + %8080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 6, !dbg !300 + %8081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 7, !dbg !300 + %8082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 8, !dbg !300 + %8083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 9, !dbg !300 + %8084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 10, !dbg !300 + %8085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 11, !dbg !300 + %8086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 12, !dbg !300 + %8087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 13, !dbg !300 + %8088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 14, !dbg !300 + %8089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 15, !dbg !300 + %8090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 16, !dbg !300 + %8091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 17, !dbg !300 + %8092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 18, !dbg !300 + %8093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 19, !dbg !300 + %8094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 20, !dbg !300 + %8095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 21, !dbg !300 + %8096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 22, !dbg !300 + %8097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 23, !dbg !300 + %8098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 24, !dbg !300 + %8099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 25, !dbg !300 + %8100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 26, !dbg !300 + %8101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 27, !dbg !300 + %8102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 28, !dbg !300 + %8103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 29, !dbg !300 + %8104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 30, !dbg !300 + %8105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 31, !dbg !300 + %8106 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8074, float %8075, float %8076, float %8077, float %8078, float %8079, float %8080, float %8081, float %8082, float %8083, float %8084, float %8085, float %8086, float %8087, float %8088, float %8089, float %8090, float %8091, float %8092, float %8093, float %8094, float %8095, float %8096, float %8097, float %8098, float %8099, float %8100, float %8101, float %8102, float %8103, float %8104, float %8105, i64 %8068, i64 %8073, i1 true) #3, !dbg !300 + %8107 = or disjoint i32 %7831, 16480, !dbg !300 + %8108 = add i32 %8107, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8109 = lshr exact i32 %8108, 4, !dbg !300 + %8110 = and i32 %8109, 16383, !dbg !300 + %8111 = zext nneg i32 %8110 to i64, !dbg !300 + %8112 = or disjoint i64 %8111, 4611686293372403712, !dbg !300 + %8113 = add i32 %7837, 8288, !dbg !300 + %8114 = lshr exact i32 %8113, 4, !dbg !300 + %8115 = and i32 %8114, 16383, !dbg !300 + %8116 = zext nneg i32 %8115 to i64, !dbg !300 + %8117 = or disjoint i64 %8116, 4611686293338849280, !dbg !300 + %8118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 0, !dbg !300 + %8119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 1, !dbg !300 + %8120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 2, !dbg !300 + %8121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 3, !dbg !300 + %8122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 4, !dbg !300 + %8123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 5, !dbg !300 + %8124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 6, !dbg !300 + %8125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 7, !dbg !300 + %8126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 8, !dbg !300 + %8127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 9, !dbg !300 + %8128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 10, !dbg !300 + %8129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 11, !dbg !300 + %8130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 12, !dbg !300 + %8131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 13, !dbg !300 + %8132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 14, !dbg !300 + %8133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 15, !dbg !300 + %8134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 16, !dbg !300 + %8135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 17, !dbg !300 + %8136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 18, !dbg !300 + %8137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 19, !dbg !300 + %8138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 20, !dbg !300 + %8139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 21, !dbg !300 + %8140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 22, !dbg !300 + %8141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 23, !dbg !300 + %8142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 24, !dbg !300 + %8143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 25, !dbg !300 + %8144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 26, !dbg !300 + %8145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 27, !dbg !300 + %8146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 28, !dbg !300 + %8147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 29, !dbg !300 + %8148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 30, !dbg !300 + %8149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 31, !dbg !300 + %8150 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8118, float %8119, float %8120, float %8121, float %8122, float %8123, float %8124, float %8125, float %8126, float %8127, float %8128, float %8129, float %8130, float %8131, float %8132, float %8133, float %8134, float %8135, float %8136, float %8137, float %8138, float %8139, float %8140, float %8141, float %8142, float %8143, float %8144, float %8145, float %8146, float %8147, float %8148, float %8149, i64 %8112, i64 %8117, i1 true) #3, !dbg !300 + %8151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 0, !dbg !300 + %8152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 1, !dbg !300 + %8153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 2, !dbg !300 + %8154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 3, !dbg !300 + %8155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 4, !dbg !300 + %8156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 5, !dbg !300 + %8157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 6, !dbg !300 + %8158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 7, !dbg !300 + %8159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 8, !dbg !300 + %8160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 9, !dbg !300 + %8161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 10, !dbg !300 + %8162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 11, !dbg !300 + %8163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 12, !dbg !300 + %8164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 13, !dbg !300 + %8165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 14, !dbg !300 + %8166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 15, !dbg !300 + %8167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 16, !dbg !300 + %8168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 17, !dbg !300 + %8169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 18, !dbg !300 + %8170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 19, !dbg !300 + %8171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 20, !dbg !300 + %8172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 21, !dbg !300 + %8173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 22, !dbg !300 + %8174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 23, !dbg !300 + %8175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 24, !dbg !300 + %8176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 25, !dbg !300 + %8177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 26, !dbg !300 + %8178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 27, !dbg !300 + %8179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 28, !dbg !300 + %8180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 29, !dbg !300 + %8181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 30, !dbg !300 + %8182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 31, !dbg !300 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !300 + %8183 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %8151, float %8152, float %8153, float %8154, float %8155, float %8156, float %8157, float %8158, float %8159, float %8160, float %8161, float %8162, float %8163, float %8164, float %8165, float %8166, float %8167, float %8168, float %8169, float %8170, float %8171, float %8172, float %8173, float %8174, float %8175, float %8176, float %8177, float %8178, float %8179, float %8180, float %8181, float %8182, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %7762, i32 0, i32 0) #3, !dbg !300 + %8184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 0, !dbg !300 + %8185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 1, !dbg !300 + %8186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 2, !dbg !300 + %8187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 3, !dbg !300 + %8188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 4, !dbg !300 + %8189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 5, !dbg !300 + %8190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 6, !dbg !300 + %8191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 7, !dbg !300 + %8192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 8, !dbg !300 + %8193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 9, !dbg !300 + %8194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 10, !dbg !300 + %8195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 11, !dbg !300 + %8196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 12, !dbg !300 + %8197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 13, !dbg !300 + %8198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 14, !dbg !300 + %8199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 15, !dbg !300 + %8200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 16, !dbg !300 + %8201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 17, !dbg !300 + %8202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 18, !dbg !300 + %8203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 19, !dbg !300 + %8204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 20, !dbg !300 + %8205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 21, !dbg !300 + %8206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 22, !dbg !300 + %8207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 23, !dbg !300 + %8208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 24, !dbg !300 + %8209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 25, !dbg !300 + %8210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 26, !dbg !300 + %8211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 27, !dbg !300 + %8212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 28, !dbg !300 + %8213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 29, !dbg !300 + %8214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 30, !dbg !300 + %8215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 31, !dbg !300 + %8216 = fmul float %8184, 0x3FB6A09E60000000, !dbg !301 + %8217 = fmul float %8185, 0x3FB6A09E60000000, !dbg !301 + %8218 = fmul float %8186, 0x3FB6A09E60000000, !dbg !301 + %8219 = fmul float %8187, 0x3FB6A09E60000000, !dbg !301 + %8220 = fmul float %8188, 0x3FB6A09E60000000, !dbg !301 + %8221 = fmul float %8189, 0x3FB6A09E60000000, !dbg !301 + %8222 = fmul float %8190, 0x3FB6A09E60000000, !dbg !301 + %8223 = fmul float %8191, 0x3FB6A09E60000000, !dbg !301 + %8224 = fmul float %8192, 0x3FB6A09E60000000, !dbg !301 + %8225 = fmul float %8193, 0x3FB6A09E60000000, !dbg !301 + %8226 = fmul float %8194, 0x3FB6A09E60000000, !dbg !301 + %8227 = fmul float %8195, 0x3FB6A09E60000000, !dbg !301 + %8228 = fmul float %8196, 0x3FB6A09E60000000, !dbg !301 + %8229 = fmul float %8197, 0x3FB6A09E60000000, !dbg !301 + %8230 = fmul float %8198, 0x3FB6A09E60000000, !dbg !301 + %8231 = fmul float %8199, 0x3FB6A09E60000000, !dbg !301 + %8232 = fmul float %8200, 0x3FB6A09E60000000, !dbg !301 + %8233 = fmul float %8201, 0x3FB6A09E60000000, !dbg !301 + %8234 = fmul float %8202, 0x3FB6A09E60000000, !dbg !301 + %8235 = fmul float %8203, 0x3FB6A09E60000000, !dbg !301 + %8236 = fmul float %8204, 0x3FB6A09E60000000, !dbg !301 + %8237 = fmul float %8205, 0x3FB6A09E60000000, !dbg !301 + %8238 = fmul float %8206, 0x3FB6A09E60000000, !dbg !301 + %8239 = fmul float %8207, 0x3FB6A09E60000000, !dbg !301 + %8240 = fmul float %8208, 0x3FB6A09E60000000, !dbg !301 + %8241 = fmul float %8209, 0x3FB6A09E60000000, !dbg !301 + %8242 = fmul float %8210, 0x3FB6A09E60000000, !dbg !301 + %8243 = fmul float %8211, 0x3FB6A09E60000000, !dbg !301 + %8244 = fmul float %8212, 0x3FB6A09E60000000, !dbg !301 + %8245 = fmul float %8213, 0x3FB6A09E60000000, !dbg !301 + %8246 = fmul float %8214, 0x3FB6A09E60000000, !dbg !301 + %8247 = fmul float %8215, 0x3FB6A09E60000000, !dbg !301 + %8248 = fmul float %8216, 0x3FF7154760000000, !dbg !302 + %8249 = fmul float %8217, 0x3FF7154760000000, !dbg !302 + %8250 = fmul float %8218, 0x3FF7154760000000, !dbg !302 + %8251 = fmul float %8219, 0x3FF7154760000000, !dbg !302 + %8252 = fmul float %8220, 0x3FF7154760000000, !dbg !302 + %8253 = fmul float %8221, 0x3FF7154760000000, !dbg !302 + %8254 = fmul float %8222, 0x3FF7154760000000, !dbg !302 + %8255 = fmul float %8223, 0x3FF7154760000000, !dbg !302 + %8256 = fmul float %8224, 0x3FF7154760000000, !dbg !302 + %8257 = fmul float %8225, 0x3FF7154760000000, !dbg !302 + %8258 = fmul float %8226, 0x3FF7154760000000, !dbg !302 + %8259 = fmul float %8227, 0x3FF7154760000000, !dbg !302 + %8260 = fmul float %8228, 0x3FF7154760000000, !dbg !302 + %8261 = fmul float %8229, 0x3FF7154760000000, !dbg !302 + %8262 = fmul float %8230, 0x3FF7154760000000, !dbg !302 + %8263 = fmul float %8231, 0x3FF7154760000000, !dbg !302 + %8264 = fmul float %8232, 0x3FF7154760000000, !dbg !302 + %8265 = fmul float %8233, 0x3FF7154760000000, !dbg !302 + %8266 = fmul float %8234, 0x3FF7154760000000, !dbg !302 + %8267 = fmul float %8235, 0x3FF7154760000000, !dbg !302 + %8268 = fmul float %8236, 0x3FF7154760000000, !dbg !302 + %8269 = fmul float %8237, 0x3FF7154760000000, !dbg !302 + %8270 = fmul float %8238, 0x3FF7154760000000, !dbg !302 + %8271 = fmul float %8239, 0x3FF7154760000000, !dbg !302 + %8272 = fmul float %8240, 0x3FF7154760000000, !dbg !302 + %8273 = fmul float %8241, 0x3FF7154760000000, !dbg !302 + %8274 = fmul float %8242, 0x3FF7154760000000, !dbg !302 + %8275 = fmul float %8243, 0x3FF7154760000000, !dbg !302 + %8276 = fmul float %8244, 0x3FF7154760000000, !dbg !302 + %8277 = fmul float %8245, 0x3FF7154760000000, !dbg !302 + %8278 = fmul float %8246, 0x3FF7154760000000, !dbg !302 + %8279 = fmul float %8247, 0x3FF7154760000000, !dbg !302 + %8280 = fsub float %8248, %7813, !dbg !303 + %8281 = fsub float %8249, %7814, !dbg !303 + %8282 = fsub float %8250, %7813, !dbg !303 + %8283 = fsub float %8251, %7814, !dbg !303 + %8284 = fsub float %8252, %7815, !dbg !303 + %8285 = fsub float %8253, %7816, !dbg !303 + %8286 = fsub float %8254, %7815, !dbg !303 + %8287 = fsub float %8255, %7816, !dbg !303 + %8288 = fsub float %8256, %7817, !dbg !303 + %8289 = fsub float %8257, %7818, !dbg !303 + %8290 = fsub float %8258, %7817, !dbg !303 + %8291 = fsub float %8259, %7818, !dbg !303 + %8292 = fsub float %8260, %7819, !dbg !303 + %8293 = fsub float %8261, %7820, !dbg !303 + %8294 = fsub float %8262, %7819, !dbg !303 + %8295 = fsub float %8263, %7820, !dbg !303 + %8296 = fsub float %8264, %7821, !dbg !303 + %8297 = fsub float %8265, %7822, !dbg !303 + %8298 = fsub float %8266, %7821, !dbg !303 + %8299 = fsub float %8267, %7822, !dbg !303 + %8300 = fsub float %8268, %7823, !dbg !303 + %8301 = fsub float %8269, %7824, !dbg !303 + %8302 = fsub float %8270, %7823, !dbg !303 + %8303 = fsub float %8271, %7824, !dbg !303 + %8304 = fsub float %8272, %7825, !dbg !303 + %8305 = fsub float %8273, %7826, !dbg !303 + %8306 = fsub float %8274, %7825, !dbg !303 + %8307 = fsub float %8275, %7826, !dbg !303 + %8308 = fsub float %8276, %7827, !dbg !303 + %8309 = fsub float %8277, %7828, !dbg !303 + %8310 = fsub float %8278, %7827, !dbg !303 + %8311 = fsub float %8279, %7828, !dbg !303 + %8312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i = icmp eq i32 %8312, 0, !dbg !304 + br i1 %.not.i, label %8315, label %8313, !dbg !304 + +8313: ; preds = %.lr.ph1691 + %8314 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8280) #3, !dbg !304 + br label %__nv_exp2f.exit, !dbg !304 + +8315: ; preds = %.lr.ph1691 + %8316 = tail call float @llvm.nvvm.ex2.approx.f(float %8280) #3, !dbg !304 + br label %__nv_exp2f.exit, !dbg !304 + +__nv_exp2f.exit: ; preds = %8313, %8315 + %.0.i = phi float [ %8314, %8313 ], [ %8316, %8315 ], !dbg !304 + %8317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1127 = icmp eq i32 %8317, 0, !dbg !304 + br i1 %.not.i1127, label %8320, label %8318, !dbg !304 + +8318: ; preds = %__nv_exp2f.exit + %8319 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8281) #3, !dbg !304 + br label %__nv_exp2f.exit1129, !dbg !304 + +8320: ; preds = %__nv_exp2f.exit + %8321 = tail call float @llvm.nvvm.ex2.approx.f(float %8281) #3, !dbg !304 + br label %__nv_exp2f.exit1129, !dbg !304 + +__nv_exp2f.exit1129: ; preds = %8318, %8320 + %.0.i1128 = phi float [ %8319, %8318 ], [ %8321, %8320 ], !dbg !304 + %8322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1130 = icmp eq i32 %8322, 0, !dbg !304 + br i1 %.not.i1130, label %8325, label %8323, !dbg !304 + +8323: ; preds = %__nv_exp2f.exit1129 + %8324 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8282) #3, !dbg !304 + br label %__nv_exp2f.exit1132, !dbg !304 + +8325: ; preds = %__nv_exp2f.exit1129 + %8326 = tail call float @llvm.nvvm.ex2.approx.f(float %8282) #3, !dbg !304 + br label %__nv_exp2f.exit1132, !dbg !304 + +__nv_exp2f.exit1132: ; preds = %8323, %8325 + %.0.i1131 = phi float [ %8324, %8323 ], [ %8326, %8325 ], !dbg !304 + %8327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1133 = icmp eq i32 %8327, 0, !dbg !304 + br i1 %.not.i1133, label %8330, label %8328, !dbg !304 + +8328: ; preds = %__nv_exp2f.exit1132 + %8329 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8283) #3, !dbg !304 + br label %__nv_exp2f.exit1135, !dbg !304 + +8330: ; preds = %__nv_exp2f.exit1132 + %8331 = tail call float @llvm.nvvm.ex2.approx.f(float %8283) #3, !dbg !304 + br label %__nv_exp2f.exit1135, !dbg !304 + +__nv_exp2f.exit1135: ; preds = %8328, %8330 + %.0.i1134 = phi float [ %8329, %8328 ], [ %8331, %8330 ], !dbg !304 + %8332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1136 = icmp eq i32 %8332, 0, !dbg !304 + br i1 %.not.i1136, label %8335, label %8333, !dbg !304 + +8333: ; preds = %__nv_exp2f.exit1135 + %8334 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8284) #3, !dbg !304 + br label %__nv_exp2f.exit1138, !dbg !304 + +8335: ; preds = %__nv_exp2f.exit1135 + %8336 = tail call float @llvm.nvvm.ex2.approx.f(float %8284) #3, !dbg !304 + br label %__nv_exp2f.exit1138, !dbg !304 + +__nv_exp2f.exit1138: ; preds = %8333, %8335 + %.0.i1137 = phi float [ %8334, %8333 ], [ %8336, %8335 ], !dbg !304 + %8337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1139 = icmp eq i32 %8337, 0, !dbg !304 + br i1 %.not.i1139, label %8340, label %8338, !dbg !304 + +8338: ; preds = %__nv_exp2f.exit1138 + %8339 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8285) #3, !dbg !304 + br label %__nv_exp2f.exit1141, !dbg !304 + +8340: ; preds = %__nv_exp2f.exit1138 + %8341 = tail call float @llvm.nvvm.ex2.approx.f(float %8285) #3, !dbg !304 + br label %__nv_exp2f.exit1141, !dbg !304 + +__nv_exp2f.exit1141: ; preds = %8338, %8340 + %.0.i1140 = phi float [ %8339, %8338 ], [ %8341, %8340 ], !dbg !304 + %8342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1142 = icmp eq i32 %8342, 0, !dbg !304 + br i1 %.not.i1142, label %8345, label %8343, !dbg !304 + +8343: ; preds = %__nv_exp2f.exit1141 + %8344 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8286) #3, !dbg !304 + br label %__nv_exp2f.exit1144, !dbg !304 + +8345: ; preds = %__nv_exp2f.exit1141 + %8346 = tail call float @llvm.nvvm.ex2.approx.f(float %8286) #3, !dbg !304 + br label %__nv_exp2f.exit1144, !dbg !304 + +__nv_exp2f.exit1144: ; preds = %8343, %8345 + %.0.i1143 = phi float [ %8344, %8343 ], [ %8346, %8345 ], !dbg !304 + %8347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1145 = icmp eq i32 %8347, 0, !dbg !304 + br i1 %.not.i1145, label %8350, label %8348, !dbg !304 + +8348: ; preds = %__nv_exp2f.exit1144 + %8349 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8287) #3, !dbg !304 + br label %__nv_exp2f.exit1147, !dbg !304 + +8350: ; preds = %__nv_exp2f.exit1144 + %8351 = tail call float @llvm.nvvm.ex2.approx.f(float %8287) #3, !dbg !304 + br label %__nv_exp2f.exit1147, !dbg !304 + +__nv_exp2f.exit1147: ; preds = %8348, %8350 + %.0.i1146 = phi float [ %8349, %8348 ], [ %8351, %8350 ], !dbg !304 + %8352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1148 = icmp eq i32 %8352, 0, !dbg !304 + br i1 %.not.i1148, label %8355, label %8353, !dbg !304 + +8353: ; preds = %__nv_exp2f.exit1147 + %8354 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8288) #3, !dbg !304 + br label %__nv_exp2f.exit1150, !dbg !304 + +8355: ; preds = %__nv_exp2f.exit1147 + %8356 = tail call float @llvm.nvvm.ex2.approx.f(float %8288) #3, !dbg !304 + br label %__nv_exp2f.exit1150, !dbg !304 + +__nv_exp2f.exit1150: ; preds = %8353, %8355 + %.0.i1149 = phi float [ %8354, %8353 ], [ %8356, %8355 ], !dbg !304 + %8357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1151 = icmp eq i32 %8357, 0, !dbg !304 + br i1 %.not.i1151, label %8360, label %8358, !dbg !304 + +8358: ; preds = %__nv_exp2f.exit1150 + %8359 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8289) #3, !dbg !304 + br label %__nv_exp2f.exit1153, !dbg !304 + +8360: ; preds = %__nv_exp2f.exit1150 + %8361 = tail call float @llvm.nvvm.ex2.approx.f(float %8289) #3, !dbg !304 + br label %__nv_exp2f.exit1153, !dbg !304 + +__nv_exp2f.exit1153: ; preds = %8358, %8360 + %.0.i1152 = phi float [ %8359, %8358 ], [ %8361, %8360 ], !dbg !304 + %8362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1154 = icmp eq i32 %8362, 0, !dbg !304 + br i1 %.not.i1154, label %8365, label %8363, !dbg !304 + +8363: ; preds = %__nv_exp2f.exit1153 + %8364 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8290) #3, !dbg !304 + br label %__nv_exp2f.exit1156, !dbg !304 + +8365: ; preds = %__nv_exp2f.exit1153 + %8366 = tail call float @llvm.nvvm.ex2.approx.f(float %8290) #3, !dbg !304 + br label %__nv_exp2f.exit1156, !dbg !304 + +__nv_exp2f.exit1156: ; preds = %8363, %8365 + %.0.i1155 = phi float [ %8364, %8363 ], [ %8366, %8365 ], !dbg !304 + %8367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1157 = icmp eq i32 %8367, 0, !dbg !304 + br i1 %.not.i1157, label %8370, label %8368, !dbg !304 + +8368: ; preds = %__nv_exp2f.exit1156 + %8369 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8291) #3, !dbg !304 + br label %__nv_exp2f.exit1159, !dbg !304 + +8370: ; preds = %__nv_exp2f.exit1156 + %8371 = tail call float @llvm.nvvm.ex2.approx.f(float %8291) #3, !dbg !304 + br label %__nv_exp2f.exit1159, !dbg !304 + +__nv_exp2f.exit1159: ; preds = %8368, %8370 + %.0.i1158 = phi float [ %8369, %8368 ], [ %8371, %8370 ], !dbg !304 + %8372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1160 = icmp eq i32 %8372, 0, !dbg !304 + br i1 %.not.i1160, label %8375, label %8373, !dbg !304 + +8373: ; preds = %__nv_exp2f.exit1159 + %8374 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8292) #3, !dbg !304 + br label %__nv_exp2f.exit1162, !dbg !304 + +8375: ; preds = %__nv_exp2f.exit1159 + %8376 = tail call float @llvm.nvvm.ex2.approx.f(float %8292) #3, !dbg !304 + br label %__nv_exp2f.exit1162, !dbg !304 + +__nv_exp2f.exit1162: ; preds = %8373, %8375 + %.0.i1161 = phi float [ %8374, %8373 ], [ %8376, %8375 ], !dbg !304 + %8377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1163 = icmp eq i32 %8377, 0, !dbg !304 + br i1 %.not.i1163, label %8380, label %8378, !dbg !304 + +8378: ; preds = %__nv_exp2f.exit1162 + %8379 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8293) #3, !dbg !304 + br label %__nv_exp2f.exit1165, !dbg !304 + +8380: ; preds = %__nv_exp2f.exit1162 + %8381 = tail call float @llvm.nvvm.ex2.approx.f(float %8293) #3, !dbg !304 + br label %__nv_exp2f.exit1165, !dbg !304 + +__nv_exp2f.exit1165: ; preds = %8378, %8380 + %.0.i1164 = phi float [ %8379, %8378 ], [ %8381, %8380 ], !dbg !304 + %8382 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1166 = icmp eq i32 %8382, 0, !dbg !304 + br i1 %.not.i1166, label %8385, label %8383, !dbg !304 + +8383: ; preds = %__nv_exp2f.exit1165 + %8384 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8294) #3, !dbg !304 + br label %__nv_exp2f.exit1168, !dbg !304 + +8385: ; preds = %__nv_exp2f.exit1165 + %8386 = tail call float @llvm.nvvm.ex2.approx.f(float %8294) #3, !dbg !304 + br label %__nv_exp2f.exit1168, !dbg !304 + +__nv_exp2f.exit1168: ; preds = %8383, %8385 + %.0.i1167 = phi float [ %8384, %8383 ], [ %8386, %8385 ], !dbg !304 + %8387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1169 = icmp eq i32 %8387, 0, !dbg !304 + br i1 %.not.i1169, label %8390, label %8388, !dbg !304 + +8388: ; preds = %__nv_exp2f.exit1168 + %8389 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8295) #3, !dbg !304 + br label %__nv_exp2f.exit1171, !dbg !304 + +8390: ; preds = %__nv_exp2f.exit1168 + %8391 = tail call float @llvm.nvvm.ex2.approx.f(float %8295) #3, !dbg !304 + br label %__nv_exp2f.exit1171, !dbg !304 + +__nv_exp2f.exit1171: ; preds = %8388, %8390 + %.0.i1170 = phi float [ %8389, %8388 ], [ %8391, %8390 ], !dbg !304 + %8392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1172 = icmp eq i32 %8392, 0, !dbg !304 + br i1 %.not.i1172, label %8395, label %8393, !dbg !304 + +8393: ; preds = %__nv_exp2f.exit1171 + %8394 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8296) #3, !dbg !304 + br label %__nv_exp2f.exit1174, !dbg !304 + +8395: ; preds = %__nv_exp2f.exit1171 + %8396 = tail call float @llvm.nvvm.ex2.approx.f(float %8296) #3, !dbg !304 + br label %__nv_exp2f.exit1174, !dbg !304 + +__nv_exp2f.exit1174: ; preds = %8393, %8395 + %.0.i1173 = phi float [ %8394, %8393 ], [ %8396, %8395 ], !dbg !304 + %8397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1175 = icmp eq i32 %8397, 0, !dbg !304 + br i1 %.not.i1175, label %8400, label %8398, !dbg !304 + +8398: ; preds = %__nv_exp2f.exit1174 + %8399 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8297) #3, !dbg !304 + br label %__nv_exp2f.exit1177, !dbg !304 + +8400: ; preds = %__nv_exp2f.exit1174 + %8401 = tail call float @llvm.nvvm.ex2.approx.f(float %8297) #3, !dbg !304 + br label %__nv_exp2f.exit1177, !dbg !304 + +__nv_exp2f.exit1177: ; preds = %8398, %8400 + %.0.i1176 = phi float [ %8399, %8398 ], [ %8401, %8400 ], !dbg !304 + %8402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1178 = icmp eq i32 %8402, 0, !dbg !304 + br i1 %.not.i1178, label %8405, label %8403, !dbg !304 + +8403: ; preds = %__nv_exp2f.exit1177 + %8404 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8298) #3, !dbg !304 + br label %__nv_exp2f.exit1180, !dbg !304 + +8405: ; preds = %__nv_exp2f.exit1177 + %8406 = tail call float @llvm.nvvm.ex2.approx.f(float %8298) #3, !dbg !304 + br label %__nv_exp2f.exit1180, !dbg !304 + +__nv_exp2f.exit1180: ; preds = %8403, %8405 + %.0.i1179 = phi float [ %8404, %8403 ], [ %8406, %8405 ], !dbg !304 + %8407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1181 = icmp eq i32 %8407, 0, !dbg !304 + br i1 %.not.i1181, label %8410, label %8408, !dbg !304 + +8408: ; preds = %__nv_exp2f.exit1180 + %8409 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8299) #3, !dbg !304 + br label %__nv_exp2f.exit1183, !dbg !304 + +8410: ; preds = %__nv_exp2f.exit1180 + %8411 = tail call float @llvm.nvvm.ex2.approx.f(float %8299) #3, !dbg !304 + br label %__nv_exp2f.exit1183, !dbg !304 + +__nv_exp2f.exit1183: ; preds = %8408, %8410 + %.0.i1182 = phi float [ %8409, %8408 ], [ %8411, %8410 ], !dbg !304 + %8412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1184 = icmp eq i32 %8412, 0, !dbg !304 + br i1 %.not.i1184, label %8415, label %8413, !dbg !304 + +8413: ; preds = %__nv_exp2f.exit1183 + %8414 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8300) #3, !dbg !304 + br label %__nv_exp2f.exit1186, !dbg !304 + +8415: ; preds = %__nv_exp2f.exit1183 + %8416 = tail call float @llvm.nvvm.ex2.approx.f(float %8300) #3, !dbg !304 + br label %__nv_exp2f.exit1186, !dbg !304 + +__nv_exp2f.exit1186: ; preds = %8413, %8415 + %.0.i1185 = phi float [ %8414, %8413 ], [ %8416, %8415 ], !dbg !304 + %8417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1187 = icmp eq i32 %8417, 0, !dbg !304 + br i1 %.not.i1187, label %8420, label %8418, !dbg !304 + +8418: ; preds = %__nv_exp2f.exit1186 + %8419 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8301) #3, !dbg !304 + br label %__nv_exp2f.exit1189, !dbg !304 + +8420: ; preds = %__nv_exp2f.exit1186 + %8421 = tail call float @llvm.nvvm.ex2.approx.f(float %8301) #3, !dbg !304 + br label %__nv_exp2f.exit1189, !dbg !304 + +__nv_exp2f.exit1189: ; preds = %8418, %8420 + %.0.i1188 = phi float [ %8419, %8418 ], [ %8421, %8420 ], !dbg !304 + %8422 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1190 = icmp eq i32 %8422, 0, !dbg !304 + br i1 %.not.i1190, label %8425, label %8423, !dbg !304 + +8423: ; preds = %__nv_exp2f.exit1189 + %8424 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8302) #3, !dbg !304 + br label %__nv_exp2f.exit1192, !dbg !304 + +8425: ; preds = %__nv_exp2f.exit1189 + %8426 = tail call float @llvm.nvvm.ex2.approx.f(float %8302) #3, !dbg !304 + br label %__nv_exp2f.exit1192, !dbg !304 + +__nv_exp2f.exit1192: ; preds = %8423, %8425 + %.0.i1191 = phi float [ %8424, %8423 ], [ %8426, %8425 ], !dbg !304 + %8427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1193 = icmp eq i32 %8427, 0, !dbg !304 + br i1 %.not.i1193, label %8430, label %8428, !dbg !304 + +8428: ; preds = %__nv_exp2f.exit1192 + %8429 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8303) #3, !dbg !304 + br label %__nv_exp2f.exit1195, !dbg !304 + +8430: ; preds = %__nv_exp2f.exit1192 + %8431 = tail call float @llvm.nvvm.ex2.approx.f(float %8303) #3, !dbg !304 + br label %__nv_exp2f.exit1195, !dbg !304 + +__nv_exp2f.exit1195: ; preds = %8428, %8430 + %.0.i1194 = phi float [ %8429, %8428 ], [ %8431, %8430 ], !dbg !304 + %8432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1196 = icmp eq i32 %8432, 0, !dbg !304 + br i1 %.not.i1196, label %8435, label %8433, !dbg !304 + +8433: ; preds = %__nv_exp2f.exit1195 + %8434 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8304) #3, !dbg !304 + br label %__nv_exp2f.exit1198, !dbg !304 + +8435: ; preds = %__nv_exp2f.exit1195 + %8436 = tail call float @llvm.nvvm.ex2.approx.f(float %8304) #3, !dbg !304 + br label %__nv_exp2f.exit1198, !dbg !304 + +__nv_exp2f.exit1198: ; preds = %8433, %8435 + %.0.i1197 = phi float [ %8434, %8433 ], [ %8436, %8435 ], !dbg !304 + %8437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1199 = icmp eq i32 %8437, 0, !dbg !304 + br i1 %.not.i1199, label %8440, label %8438, !dbg !304 + +8438: ; preds = %__nv_exp2f.exit1198 + %8439 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8305) #3, !dbg !304 + br label %__nv_exp2f.exit1201, !dbg !304 + +8440: ; preds = %__nv_exp2f.exit1198 + %8441 = tail call float @llvm.nvvm.ex2.approx.f(float %8305) #3, !dbg !304 + br label %__nv_exp2f.exit1201, !dbg !304 + +__nv_exp2f.exit1201: ; preds = %8438, %8440 + %.0.i1200 = phi float [ %8439, %8438 ], [ %8441, %8440 ], !dbg !304 + %8442 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1202 = icmp eq i32 %8442, 0, !dbg !304 + br i1 %.not.i1202, label %8445, label %8443, !dbg !304 + +8443: ; preds = %__nv_exp2f.exit1201 + %8444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8306) #3, !dbg !304 + br label %__nv_exp2f.exit1204, !dbg !304 + +8445: ; preds = %__nv_exp2f.exit1201 + %8446 = tail call float @llvm.nvvm.ex2.approx.f(float %8306) #3, !dbg !304 + br label %__nv_exp2f.exit1204, !dbg !304 + +__nv_exp2f.exit1204: ; preds = %8443, %8445 + %.0.i1203 = phi float [ %8444, %8443 ], [ %8446, %8445 ], !dbg !304 + %8447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1205 = icmp eq i32 %8447, 0, !dbg !304 + br i1 %.not.i1205, label %8450, label %8448, !dbg !304 + +8448: ; preds = %__nv_exp2f.exit1204 + %8449 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8307) #3, !dbg !304 + br label %__nv_exp2f.exit1207, !dbg !304 + +8450: ; preds = %__nv_exp2f.exit1204 + %8451 = tail call float @llvm.nvvm.ex2.approx.f(float %8307) #3, !dbg !304 + br label %__nv_exp2f.exit1207, !dbg !304 + +__nv_exp2f.exit1207: ; preds = %8448, %8450 + %.0.i1206 = phi float [ %8449, %8448 ], [ %8451, %8450 ], !dbg !304 + %8452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1208 = icmp eq i32 %8452, 0, !dbg !304 + br i1 %.not.i1208, label %8455, label %8453, !dbg !304 + +8453: ; preds = %__nv_exp2f.exit1207 + %8454 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8308) #3, !dbg !304 + br label %__nv_exp2f.exit1210, !dbg !304 + +8455: ; preds = %__nv_exp2f.exit1207 + %8456 = tail call float @llvm.nvvm.ex2.approx.f(float %8308) #3, !dbg !304 + br label %__nv_exp2f.exit1210, !dbg !304 + +__nv_exp2f.exit1210: ; preds = %8453, %8455 + %.0.i1209 = phi float [ %8454, %8453 ], [ %8456, %8455 ], !dbg !304 + %8457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1211 = icmp eq i32 %8457, 0, !dbg !304 + br i1 %.not.i1211, label %8460, label %8458, !dbg !304 + +8458: ; preds = %__nv_exp2f.exit1210 + %8459 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8309) #3, !dbg !304 + br label %__nv_exp2f.exit1213, !dbg !304 + +8460: ; preds = %__nv_exp2f.exit1210 + %8461 = tail call float @llvm.nvvm.ex2.approx.f(float %8309) #3, !dbg !304 + br label %__nv_exp2f.exit1213, !dbg !304 + +__nv_exp2f.exit1213: ; preds = %8458, %8460 + %.0.i1212 = phi float [ %8459, %8458 ], [ %8461, %8460 ], !dbg !304 + %8462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1214 = icmp eq i32 %8462, 0, !dbg !304 + br i1 %.not.i1214, label %8465, label %8463, !dbg !304 + +8463: ; preds = %__nv_exp2f.exit1213 + %8464 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8310) #3, !dbg !304 + br label %__nv_exp2f.exit1216, !dbg !304 + +8465: ; preds = %__nv_exp2f.exit1213 + %8466 = tail call float @llvm.nvvm.ex2.approx.f(float %8310) #3, !dbg !304 + br label %__nv_exp2f.exit1216, !dbg !304 + +__nv_exp2f.exit1216: ; preds = %8463, %8465 + %.0.i1215 = phi float [ %8464, %8463 ], [ %8466, %8465 ], !dbg !304 + %8467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1217 = icmp eq i32 %8467, 0, !dbg !304 + br i1 %.not.i1217, label %8470, label %8468, !dbg !304 + +8468: ; preds = %__nv_exp2f.exit1216 + %8469 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8311) #3, !dbg !304 + br label %__nv_exp2f.exit1219, !dbg !304 + +8470: ; preds = %__nv_exp2f.exit1216 + %8471 = tail call float @llvm.nvvm.ex2.approx.f(float %8311) #3, !dbg !304 + br label %__nv_exp2f.exit1219, !dbg !304 + +__nv_exp2f.exit1219: ; preds = %8468, %8470 + %.0.i1218 = phi float [ %8469, %8468 ], [ %8471, %8470 ], !dbg !304 + %8472 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %7761, !dbg !290 + %8473 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !305 + %8474 = insertelement <2 x float> %8473, float %.0.i1128, i64 1, !dbg !305 + %8475 = fptrunc <2 x float> %8474 to <2 x bfloat>, !dbg !305 + %8476 = insertelement <2 x float> poison, float %.0.i1131, i64 0, !dbg !305 + %8477 = insertelement <2 x float> %8476, float %.0.i1134, i64 1, !dbg !305 + %8478 = fptrunc <2 x float> %8477 to <2 x bfloat>, !dbg !305 + %8479 = insertelement <2 x float> poison, float %.0.i1137, i64 0, !dbg !305 + %8480 = insertelement <2 x float> %8479, float %.0.i1140, i64 1, !dbg !305 + %8481 = fptrunc <2 x float> %8480 to <2 x bfloat>, !dbg !305 + %8482 = insertelement <2 x float> poison, float %.0.i1143, i64 0, !dbg !305 + %8483 = insertelement <2 x float> %8482, float %.0.i1146, i64 1, !dbg !305 + %8484 = fptrunc <2 x float> %8483 to <2 x bfloat>, !dbg !305 + %8485 = insertelement <2 x float> poison, float %.0.i1149, i64 0, !dbg !305 + %8486 = insertelement <2 x float> %8485, float %.0.i1152, i64 1, !dbg !305 + %8487 = fptrunc <2 x float> %8486 to <2 x bfloat>, !dbg !305 + %8488 = insertelement <2 x float> poison, float %.0.i1155, i64 0, !dbg !305 + %8489 = insertelement <2 x float> %8488, float %.0.i1158, i64 1, !dbg !305 + %8490 = fptrunc <2 x float> %8489 to <2 x bfloat>, !dbg !305 + %8491 = insertelement <2 x float> poison, float %.0.i1161, i64 0, !dbg !305 + %8492 = insertelement <2 x float> %8491, float %.0.i1164, i64 1, !dbg !305 + %8493 = fptrunc <2 x float> %8492 to <2 x bfloat>, !dbg !305 + %8494 = insertelement <2 x float> poison, float %.0.i1167, i64 0, !dbg !305 + %8495 = insertelement <2 x float> %8494, float %.0.i1170, i64 1, !dbg !305 + %8496 = fptrunc <2 x float> %8495 to <2 x bfloat>, !dbg !305 + %8497 = insertelement <2 x float> poison, float %.0.i1173, i64 0, !dbg !305 + %8498 = insertelement <2 x float> %8497, float %.0.i1176, i64 1, !dbg !305 + %8499 = fptrunc <2 x float> %8498 to <2 x bfloat>, !dbg !305 + %8500 = insertelement <2 x float> poison, float %.0.i1179, i64 0, !dbg !305 + %8501 = insertelement <2 x float> %8500, float %.0.i1182, i64 1, !dbg !305 + %8502 = fptrunc <2 x float> %8501 to <2 x bfloat>, !dbg !305 + %8503 = insertelement <2 x float> poison, float %.0.i1185, i64 0, !dbg !305 + %8504 = insertelement <2 x float> %8503, float %.0.i1188, i64 1, !dbg !305 + %8505 = fptrunc <2 x float> %8504 to <2 x bfloat>, !dbg !305 + %8506 = insertelement <2 x float> poison, float %.0.i1191, i64 0, !dbg !305 + %8507 = insertelement <2 x float> %8506, float %.0.i1194, i64 1, !dbg !305 + %8508 = fptrunc <2 x float> %8507 to <2 x bfloat>, !dbg !305 + %8509 = insertelement <2 x float> poison, float %.0.i1197, i64 0, !dbg !305 + %8510 = insertelement <2 x float> %8509, float %.0.i1200, i64 1, !dbg !305 + %8511 = fptrunc <2 x float> %8510 to <2 x bfloat>, !dbg !305 + %8512 = insertelement <2 x float> poison, float %.0.i1203, i64 0, !dbg !305 + %8513 = insertelement <2 x float> %8512, float %.0.i1206, i64 1, !dbg !305 + %8514 = fptrunc <2 x float> %8513 to <2 x bfloat>, !dbg !305 + %8515 = insertelement <2 x float> poison, float %.0.i1209, i64 0, !dbg !305 + %8516 = insertelement <2 x float> %8515, float %.0.i1212, i64 1, !dbg !305 + %8517 = fptrunc <2 x float> %8516 to <2 x bfloat>, !dbg !305 + %8518 = insertelement <2 x float> poison, float %.0.i1215, i64 0, !dbg !305 + %8519 = insertelement <2 x float> %8518, float %.0.i1218, i64 1, !dbg !305 + %8520 = fptrunc <2 x float> %8519 to <2 x bfloat>, !dbg !305 + %8521 = bitcast <2 x bfloat> %8475 to i32, !dbg !306 + %8522 = bitcast <2 x bfloat> %8478 to i32, !dbg !306 + %8523 = bitcast <2 x bfloat> %8481 to i32, !dbg !306 + %8524 = bitcast <2 x bfloat> %8484 to i32, !dbg !306 + %8525 = bitcast <2 x bfloat> %8487 to i32, !dbg !306 + %8526 = bitcast <2 x bfloat> %8490 to i32, !dbg !306 + %8527 = bitcast <2 x bfloat> %8493 to i32, !dbg !306 + %8528 = bitcast <2 x bfloat> %8496 to i32, !dbg !306 + %8529 = bitcast <2 x bfloat> %8499 to i32, !dbg !306 + %8530 = bitcast <2 x bfloat> %8502 to i32, !dbg !306 + %8531 = bitcast <2 x bfloat> %8505 to i32, !dbg !306 + %8532 = bitcast <2 x bfloat> %8508 to i32, !dbg !306 + %8533 = bitcast <2 x bfloat> %8511 to i32, !dbg !306 + %8534 = bitcast <2 x bfloat> %8514 to i32, !dbg !306 + %8535 = bitcast <2 x bfloat> %8517 to i32, !dbg !306 + %8536 = bitcast <2 x bfloat> %8520 to i32, !dbg !306 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !306 + %8537 = ptrtoint ptr addrspace(3) %8472 to i32, !dbg !306 + %8538 = lshr exact i32 %8537, 4, !dbg !306 + %8539 = and i32 %8538, 16383, !dbg !306 + %8540 = zext nneg i32 %8539 to i64, !dbg !306 + %8541 = or disjoint i64 %8540, 4611686293338849280, !dbg !306 + %8542 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn5041610, float %.pn5021611, float %.pn5001612, float %.pn4981613, float %.pn4961614, float %.pn4941615, float %.pn4921616, float %.pn4901617, float %.pn4881618, float %.pn4861619, float %.pn4841620, float %.pn4821621, float %.pn4801622, float %.pn4781623, float %.pn4761624, float %.pn4741625, float %.pn4721626, float %.pn4701627, float %.pn4681628, float %.pn4661629, float %.pn4641630, float %.pn4621631, float %.pn4601632, float %.pn4581633, float %.pn4561634, float %.pn4541635, float %.pn4521636, float %.pn4501637, float %.pn4481638, float %.pn4461639, float %.pn4441640, float %.pn4421641, float %.pn4401642, float %.pn4381643, float %.pn4361644, float %.pn4341645, float %.pn4321646, float %.pn4301647, float %.pn4281648, float %.pn4261649, float %.pn4241650, float %.pn4221651, float %.pn4201652, float %.pn4181653, float %.pn4161654, float %.pn4141655, float %.pn4121656, float %.pn4101657, float %.pn4081658, float %.pn4061659, float %.pn4041660, float %.pn4021661, float %.pn4001662, float %.pn3981663, float %.pn3961664, float %.pn3941665, float %.pn3921666, float %.pn3901667, float %.pn3881668, float %.pn3861669, float %.pn3841670, float %.pn3821671, float %.pn3801672, float %.pn3781673, i32 %8521, i32 %8522, i32 %8523, i32 %8524, i64 %8541, i1 true) #3, !dbg !306 + %8543 = add i32 %8537, 2048, !dbg !306 + %8544 = lshr exact i32 %8543, 4, !dbg !306 + %8545 = and i32 %8544, 16383, !dbg !306 + %8546 = zext nneg i32 %8545 to i64, !dbg !306 + %8547 = or disjoint i64 %8546, 4611686293338849280, !dbg !306 + %8548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 0, !dbg !306 + %8549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 1, !dbg !306 + %8550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 2, !dbg !306 + %8551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 3, !dbg !306 + %8552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 4, !dbg !306 + %8553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 5, !dbg !306 + %8554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 6, !dbg !306 + %8555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 7, !dbg !306 + %8556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 8, !dbg !306 + %8557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 9, !dbg !306 + %8558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 10, !dbg !306 + %8559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 11, !dbg !306 + %8560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 12, !dbg !306 + %8561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 13, !dbg !306 + %8562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 14, !dbg !306 + %8563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 15, !dbg !306 + %8564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 16, !dbg !306 + %8565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 17, !dbg !306 + %8566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 18, !dbg !306 + %8567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 19, !dbg !306 + %8568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 20, !dbg !306 + %8569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 21, !dbg !306 + %8570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 22, !dbg !306 + %8571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 23, !dbg !306 + %8572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 24, !dbg !306 + %8573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 25, !dbg !306 + %8574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 26, !dbg !306 + %8575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 27, !dbg !306 + %8576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 28, !dbg !306 + %8577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 29, !dbg !306 + %8578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 30, !dbg !306 + %8579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 31, !dbg !306 + %8580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 32, !dbg !306 + %8581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 33, !dbg !306 + %8582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 34, !dbg !306 + %8583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 35, !dbg !306 + %8584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 36, !dbg !306 + %8585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 37, !dbg !306 + %8586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 38, !dbg !306 + %8587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 39, !dbg !306 + %8588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 40, !dbg !306 + %8589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 41, !dbg !306 + %8590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 42, !dbg !306 + %8591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 43, !dbg !306 + %8592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 44, !dbg !306 + %8593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 45, !dbg !306 + %8594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 46, !dbg !306 + %8595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 47, !dbg !306 + %8596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 48, !dbg !306 + %8597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 49, !dbg !306 + %8598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 50, !dbg !306 + %8599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 51, !dbg !306 + %8600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 52, !dbg !306 + %8601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 53, !dbg !306 + %8602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 54, !dbg !306 + %8603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 55, !dbg !306 + %8604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 56, !dbg !306 + %8605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 57, !dbg !306 + %8606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 58, !dbg !306 + %8607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 59, !dbg !306 + %8608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 60, !dbg !306 + %8609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 61, !dbg !306 + %8610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 62, !dbg !306 + %8611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 63, !dbg !306 + %8612 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8548, float %8549, float %8550, float %8551, float %8552, float %8553, float %8554, float %8555, float %8556, float %8557, float %8558, float %8559, float %8560, float %8561, float %8562, float %8563, float %8564, float %8565, float %8566, float %8567, float %8568, float %8569, float %8570, float %8571, float %8572, float %8573, float %8574, float %8575, float %8576, float %8577, float %8578, float %8579, float %8580, float %8581, float %8582, float %8583, float %8584, float %8585, float %8586, float %8587, float %8588, float %8589, float %8590, float %8591, float %8592, float %8593, float %8594, float %8595, float %8596, float %8597, float %8598, float %8599, float %8600, float %8601, float %8602, float %8603, float %8604, float %8605, float %8606, float %8607, float %8608, float %8609, float %8610, float %8611, i32 %8525, i32 %8526, i32 %8527, i32 %8528, i64 %8547, i1 true) #3, !dbg !306 + %8613 = add i32 %8537, 4096, !dbg !306 + %8614 = lshr exact i32 %8613, 4, !dbg !306 + %8615 = and i32 %8614, 16383, !dbg !306 + %8616 = zext nneg i32 %8615 to i64, !dbg !306 + %8617 = or disjoint i64 %8616, 4611686293338849280, !dbg !306 + %8618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 0, !dbg !306 + %8619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 1, !dbg !306 + %8620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 2, !dbg !306 + %8621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 3, !dbg !306 + %8622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 4, !dbg !306 + %8623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 5, !dbg !306 + %8624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 6, !dbg !306 + %8625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 7, !dbg !306 + %8626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 8, !dbg !306 + %8627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 9, !dbg !306 + %8628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 10, !dbg !306 + %8629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 11, !dbg !306 + %8630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 12, !dbg !306 + %8631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 13, !dbg !306 + %8632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 14, !dbg !306 + %8633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 15, !dbg !306 + %8634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 16, !dbg !306 + %8635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 17, !dbg !306 + %8636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 18, !dbg !306 + %8637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 19, !dbg !306 + %8638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 20, !dbg !306 + %8639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 21, !dbg !306 + %8640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 22, !dbg !306 + %8641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 23, !dbg !306 + %8642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 24, !dbg !306 + %8643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 25, !dbg !306 + %8644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 26, !dbg !306 + %8645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 27, !dbg !306 + %8646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 28, !dbg !306 + %8647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 29, !dbg !306 + %8648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 30, !dbg !306 + %8649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 31, !dbg !306 + %8650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 32, !dbg !306 + %8651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 33, !dbg !306 + %8652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 34, !dbg !306 + %8653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 35, !dbg !306 + %8654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 36, !dbg !306 + %8655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 37, !dbg !306 + %8656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 38, !dbg !306 + %8657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 39, !dbg !306 + %8658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 40, !dbg !306 + %8659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 41, !dbg !306 + %8660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 42, !dbg !306 + %8661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 43, !dbg !306 + %8662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 44, !dbg !306 + %8663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 45, !dbg !306 + %8664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 46, !dbg !306 + %8665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 47, !dbg !306 + %8666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 48, !dbg !306 + %8667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 49, !dbg !306 + %8668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 50, !dbg !306 + %8669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 51, !dbg !306 + %8670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 52, !dbg !306 + %8671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 53, !dbg !306 + %8672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 54, !dbg !306 + %8673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 55, !dbg !306 + %8674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 56, !dbg !306 + %8675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 57, !dbg !306 + %8676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 58, !dbg !306 + %8677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 59, !dbg !306 + %8678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 60, !dbg !306 + %8679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 61, !dbg !306 + %8680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 62, !dbg !306 + %8681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 63, !dbg !306 + %8682 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8618, float %8619, float %8620, float %8621, float %8622, float %8623, float %8624, float %8625, float %8626, float %8627, float %8628, float %8629, float %8630, float %8631, float %8632, float %8633, float %8634, float %8635, float %8636, float %8637, float %8638, float %8639, float %8640, float %8641, float %8642, float %8643, float %8644, float %8645, float %8646, float %8647, float %8648, float %8649, float %8650, float %8651, float %8652, float %8653, float %8654, float %8655, float %8656, float %8657, float %8658, float %8659, float %8660, float %8661, float %8662, float %8663, float %8664, float %8665, float %8666, float %8667, float %8668, float %8669, float %8670, float %8671, float %8672, float %8673, float %8674, float %8675, float %8676, float %8677, float %8678, float %8679, float %8680, float %8681, i32 %8529, i32 %8530, i32 %8531, i32 %8532, i64 %8617, i1 true) #3, !dbg !306 + %8683 = add i32 %8537, 6144, !dbg !306 + %8684 = lshr exact i32 %8683, 4, !dbg !306 + %8685 = and i32 %8684, 16383, !dbg !306 + %8686 = zext nneg i32 %8685 to i64, !dbg !306 + %8687 = or disjoint i64 %8686, 4611686293338849280, !dbg !306 + %8688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 0, !dbg !306 + %8689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 1, !dbg !306 + %8690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 2, !dbg !306 + %8691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 3, !dbg !306 + %8692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 4, !dbg !306 + %8693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 5, !dbg !306 + %8694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 6, !dbg !306 + %8695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 7, !dbg !306 + %8696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 8, !dbg !306 + %8697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 9, !dbg !306 + %8698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 10, !dbg !306 + %8699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 11, !dbg !306 + %8700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 12, !dbg !306 + %8701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 13, !dbg !306 + %8702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 14, !dbg !306 + %8703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 15, !dbg !306 + %8704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 16, !dbg !306 + %8705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 17, !dbg !306 + %8706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 18, !dbg !306 + %8707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 19, !dbg !306 + %8708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 20, !dbg !306 + %8709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 21, !dbg !306 + %8710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 22, !dbg !306 + %8711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 23, !dbg !306 + %8712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 24, !dbg !306 + %8713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 25, !dbg !306 + %8714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 26, !dbg !306 + %8715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 27, !dbg !306 + %8716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 28, !dbg !306 + %8717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 29, !dbg !306 + %8718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 30, !dbg !306 + %8719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 31, !dbg !306 + %8720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 32, !dbg !306 + %8721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 33, !dbg !306 + %8722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 34, !dbg !306 + %8723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 35, !dbg !306 + %8724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 36, !dbg !306 + %8725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 37, !dbg !306 + %8726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 38, !dbg !306 + %8727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 39, !dbg !306 + %8728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 40, !dbg !306 + %8729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 41, !dbg !306 + %8730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 42, !dbg !306 + %8731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 43, !dbg !306 + %8732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 44, !dbg !306 + %8733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 45, !dbg !306 + %8734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 46, !dbg !306 + %8735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 47, !dbg !306 + %8736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 48, !dbg !306 + %8737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 49, !dbg !306 + %8738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 50, !dbg !306 + %8739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 51, !dbg !306 + %8740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 52, !dbg !306 + %8741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 53, !dbg !306 + %8742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 54, !dbg !306 + %8743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 55, !dbg !306 + %8744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 56, !dbg !306 + %8745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 57, !dbg !306 + %8746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 58, !dbg !306 + %8747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 59, !dbg !306 + %8748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 60, !dbg !306 + %8749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 61, !dbg !306 + %8750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 62, !dbg !306 + %8751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 63, !dbg !306 + %8752 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8688, float %8689, float %8690, float %8691, float %8692, float %8693, float %8694, float %8695, float %8696, float %8697, float %8698, float %8699, float %8700, float %8701, float %8702, float %8703, float %8704, float %8705, float %8706, float %8707, float %8708, float %8709, float %8710, float %8711, float %8712, float %8713, float %8714, float %8715, float %8716, float %8717, float %8718, float %8719, float %8720, float %8721, float %8722, float %8723, float %8724, float %8725, float %8726, float %8727, float %8728, float %8729, float %8730, float %8731, float %8732, float %8733, float %8734, float %8735, float %8736, float %8737, float %8738, float %8739, float %8740, float %8741, float %8742, float %8743, float %8744, float %8745, float %8746, float %8747, float %8748, float %8749, float %8750, float %8751, i32 %8533, i32 %8534, i32 %8535, i32 %8536, i64 %8687, i1 true) #3, !dbg !306 + %8753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 0, !dbg !306 + %8754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 1, !dbg !306 + %8755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 2, !dbg !306 + %8756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 3, !dbg !306 + %8757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 4, !dbg !306 + %8758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 5, !dbg !306 + %8759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 6, !dbg !306 + %8760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 7, !dbg !306 + %8761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 8, !dbg !306 + %8762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 9, !dbg !306 + %8763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 10, !dbg !306 + %8764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 11, !dbg !306 + %8765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 12, !dbg !306 + %8766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 13, !dbg !306 + %8767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 14, !dbg !306 + %8768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 15, !dbg !306 + %8769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 16, !dbg !306 + %8770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 17, !dbg !306 + %8771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 18, !dbg !306 + %8772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 19, !dbg !306 + %8773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 20, !dbg !306 + %8774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 21, !dbg !306 + %8775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 22, !dbg !306 + %8776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 23, !dbg !306 + %8777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 24, !dbg !306 + %8778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 25, !dbg !306 + %8779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 26, !dbg !306 + %8780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 27, !dbg !306 + %8781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 28, !dbg !306 + %8782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 29, !dbg !306 + %8783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 30, !dbg !306 + %8784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 31, !dbg !306 + %8785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 32, !dbg !306 + %8786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 33, !dbg !306 + %8787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 34, !dbg !306 + %8788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 35, !dbg !306 + %8789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 36, !dbg !306 + %8790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 37, !dbg !306 + %8791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 38, !dbg !306 + %8792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 39, !dbg !306 + %8793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 40, !dbg !306 + %8794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 41, !dbg !306 + %8795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 42, !dbg !306 + %8796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 43, !dbg !306 + %8797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 44, !dbg !306 + %8798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 45, !dbg !306 + %8799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 46, !dbg !306 + %8800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 47, !dbg !306 + %8801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 48, !dbg !306 + %8802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 49, !dbg !306 + %8803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 50, !dbg !306 + %8804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 51, !dbg !306 + %8805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 52, !dbg !306 + %8806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 53, !dbg !306 + %8807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 54, !dbg !306 + %8808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 55, !dbg !306 + %8809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 56, !dbg !306 + %8810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 57, !dbg !306 + %8811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 58, !dbg !306 + %8812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 59, !dbg !306 + %8813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 60, !dbg !306 + %8814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 61, !dbg !306 + %8815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 62, !dbg !306 + %8816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 63, !dbg !306 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !306 + %8817 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %7763, !dbg !294 + %8818 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4812, !dbg !294 + %8819 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4815, !dbg !294 + %8820 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4817, !dbg !294 + %8821 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4819, !dbg !294 + %8822 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4821, !dbg !294 + %8823 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4823, !dbg !294 + %8824 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4825, !dbg !294 + %8825 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4827, !dbg !294 + %8826 = add i32 %7831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8827 = lshr exact i32 %8826, 4, !dbg !307 + %8828 = and i32 %8827, 16383, !dbg !307 + %8829 = zext nneg i32 %8828 to i64, !dbg !307 + %8830 = or disjoint i64 %8829, 4611686293372403712, !dbg !307 + %8831 = add i32 %7843, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8832 = lshr exact i32 %8831, 4, !dbg !307 + %8833 = and i32 %8832, 16383, !dbg !307 + %8834 = zext nneg i32 %8833 to i64, !dbg !307 + %8835 = or disjoint i64 %8834, 4611686293372403712, !dbg !307 + %8836 = add i32 %8537, 32, !dbg !307 + %8837 = lshr exact i32 %8836, 4, !dbg !307 + %8838 = and i32 %8837, 16383, !dbg !307 + %8839 = zext nneg i32 %8838 to i64, !dbg !307 + %8840 = or disjoint i64 %8839, 4611686293338849280, !dbg !307 + %8841 = add i32 %7887, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8842 = lshr exact i32 %8841, 4, !dbg !307 + %8843 = and i32 %8842, 16383, !dbg !307 + %8844 = zext nneg i32 %8843 to i64, !dbg !307 + %8845 = or disjoint i64 %8844, 4611686293372403712, !dbg !307 + %8846 = add i32 %8537, 64, !dbg !307 + %8847 = lshr exact i32 %8846, 4, !dbg !307 + %8848 = and i32 %8847, 16383, !dbg !307 + %8849 = zext nneg i32 %8848 to i64, !dbg !307 + %8850 = or disjoint i64 %8849, 4611686293338849280, !dbg !307 + %8851 = add i32 %7931, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8852 = lshr exact i32 %8851, 4, !dbg !307 + %8853 = and i32 %8852, 16383, !dbg !307 + %8854 = zext nneg i32 %8853 to i64, !dbg !307 + %8855 = or disjoint i64 %8854, 4611686293372403712, !dbg !307 + %8856 = add i32 %8537, 96, !dbg !307 + %8857 = lshr exact i32 %8856, 4, !dbg !307 + %8858 = and i32 %8857, 16383, !dbg !307 + %8859 = zext nneg i32 %8858 to i64, !dbg !307 + %8860 = or disjoint i64 %8859, 4611686293338849280, !dbg !307 + %8861 = add i32 %7975, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8862 = lshr exact i32 %8861, 4, !dbg !307 + %8863 = and i32 %8862, 16383, !dbg !307 + %8864 = zext nneg i32 %8863 to i64, !dbg !307 + %8865 = or disjoint i64 %8864, 4611686293372403712, !dbg !307 + %8866 = add i32 %8537, 8192, !dbg !307 + %8867 = lshr exact i32 %8866, 4, !dbg !307 + %8868 = and i32 %8867, 16383, !dbg !307 + %8869 = zext nneg i32 %8868 to i64, !dbg !307 + %8870 = or disjoint i64 %8869, 4611686293338849280, !dbg !307 + %8871 = add i32 %8019, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8872 = lshr exact i32 %8871, 4, !dbg !307 + %8873 = and i32 %8872, 16383, !dbg !307 + %8874 = zext nneg i32 %8873 to i64, !dbg !307 + %8875 = or disjoint i64 %8874, 4611686293372403712, !dbg !307 + %8876 = add i32 %8537, 8224, !dbg !307 + %8877 = lshr exact i32 %8876, 4, !dbg !307 + %8878 = and i32 %8877, 16383, !dbg !307 + %8879 = zext nneg i32 %8878 to i64, !dbg !307 + %8880 = or disjoint i64 %8879, 4611686293338849280, !dbg !307 + %8881 = add i32 %8063, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8882 = lshr exact i32 %8881, 4, !dbg !307 + %8883 = and i32 %8882, 16383, !dbg !307 + %8884 = zext nneg i32 %8883 to i64, !dbg !307 + %8885 = or disjoint i64 %8884, 4611686293372403712, !dbg !307 + %8886 = add i32 %8537, 8256, !dbg !307 + %8887 = lshr exact i32 %8886, 4, !dbg !307 + %8888 = and i32 %8887, 16383, !dbg !307 + %8889 = zext nneg i32 %8888 to i64, !dbg !307 + %8890 = or disjoint i64 %8889, 4611686293338849280, !dbg !307 + %8891 = add i32 %8107, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8892 = lshr exact i32 %8891, 4, !dbg !307 + %8893 = and i32 %8892, 16383, !dbg !307 + %8894 = zext nneg i32 %8893 to i64, !dbg !307 + %8895 = or disjoint i64 %8894, 4611686293372403712, !dbg !307 + %8896 = add i32 %8537, 8288, !dbg !307 + %8897 = lshr exact i32 %8896, 4, !dbg !307 + %8898 = and i32 %8897, 16383, !dbg !307 + %8899 = zext nneg i32 %8898 to i64, !dbg !307 + %8900 = or disjoint i64 %8899, 4611686293338849280, !dbg !307 + %8901 = load <2 x float>, ptr addrspace(3) %8825, align 8, !dbg !294 + %8902 = load <2 x float>, ptr addrspace(3) %8824, align 8, !dbg !294 + %8903 = load <2 x float>, ptr addrspace(3) %8823, align 8, !dbg !294 + %8904 = load <2 x float>, ptr addrspace(3) %8822, align 8, !dbg !294 + %8905 = load <2 x float>, ptr addrspace(3) %8821, align 8, !dbg !294 + %8906 = load <2 x float>, ptr addrspace(3) %8820, align 8, !dbg !294 + %8907 = load <2 x float>, ptr addrspace(3) %8819, align 8, !dbg !294 + %8908 = load <2 x float>, ptr addrspace(3) %8818, align 8, !dbg !294 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !307 + %8909 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %8830, i64 %8541) #3, !dbg !307 + %8910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 0, !dbg !307 + %8911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 1, !dbg !307 + %8912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 2, !dbg !307 + %8913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 3, !dbg !307 + %8914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 4, !dbg !307 + %8915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 5, !dbg !307 + %8916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 6, !dbg !307 + %8917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 7, !dbg !307 + %8918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 8, !dbg !307 + %8919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 9, !dbg !307 + %8920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 10, !dbg !307 + %8921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 11, !dbg !307 + %8922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 12, !dbg !307 + %8923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 13, !dbg !307 + %8924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 14, !dbg !307 + %8925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 15, !dbg !307 + %8926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 16, !dbg !307 + %8927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 17, !dbg !307 + %8928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 18, !dbg !307 + %8929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 19, !dbg !307 + %8930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 20, !dbg !307 + %8931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 21, !dbg !307 + %8932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 22, !dbg !307 + %8933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 23, !dbg !307 + %8934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 24, !dbg !307 + %8935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 25, !dbg !307 + %8936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 26, !dbg !307 + %8937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 27, !dbg !307 + %8938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 28, !dbg !307 + %8939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 29, !dbg !307 + %8940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 30, !dbg !307 + %8941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 31, !dbg !307 + %8942 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8910, float %8911, float %8912, float %8913, float %8914, float %8915, float %8916, float %8917, float %8918, float %8919, float %8920, float %8921, float %8922, float %8923, float %8924, float %8925, float %8926, float %8927, float %8928, float %8929, float %8930, float %8931, float %8932, float %8933, float %8934, float %8935, float %8936, float %8937, float %8938, float %8939, float %8940, float %8941, i64 %8835, i64 %8840, i1 true) #3, !dbg !307 + %8943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 0, !dbg !307 + %8944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 1, !dbg !307 + %8945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 2, !dbg !307 + %8946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 3, !dbg !307 + %8947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 4, !dbg !307 + %8948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 5, !dbg !307 + %8949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 6, !dbg !307 + %8950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 7, !dbg !307 + %8951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 8, !dbg !307 + %8952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 9, !dbg !307 + %8953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 10, !dbg !307 + %8954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 11, !dbg !307 + %8955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 12, !dbg !307 + %8956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 13, !dbg !307 + %8957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 14, !dbg !307 + %8958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 15, !dbg !307 + %8959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 16, !dbg !307 + %8960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 17, !dbg !307 + %8961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 18, !dbg !307 + %8962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 19, !dbg !307 + %8963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 20, !dbg !307 + %8964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 21, !dbg !307 + %8965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 22, !dbg !307 + %8966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 23, !dbg !307 + %8967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 24, !dbg !307 + %8968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 25, !dbg !307 + %8969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 26, !dbg !307 + %8970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 27, !dbg !307 + %8971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 28, !dbg !307 + %8972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 29, !dbg !307 + %8973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 30, !dbg !307 + %8974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 31, !dbg !307 + %8975 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8943, float %8944, float %8945, float %8946, float %8947, float %8948, float %8949, float %8950, float %8951, float %8952, float %8953, float %8954, float %8955, float %8956, float %8957, float %8958, float %8959, float %8960, float %8961, float %8962, float %8963, float %8964, float %8965, float %8966, float %8967, float %8968, float %8969, float %8970, float %8971, float %8972, float %8973, float %8974, i64 %8845, i64 %8850, i1 true) #3, !dbg !307 + %8976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 0, !dbg !307 + %8977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 1, !dbg !307 + %8978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 2, !dbg !307 + %8979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 3, !dbg !307 + %8980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 4, !dbg !307 + %8981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 5, !dbg !307 + %8982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 6, !dbg !307 + %8983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 7, !dbg !307 + %8984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 8, !dbg !307 + %8985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 9, !dbg !307 + %8986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 10, !dbg !307 + %8987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 11, !dbg !307 + %8988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 12, !dbg !307 + %8989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 13, !dbg !307 + %8990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 14, !dbg !307 + %8991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 15, !dbg !307 + %8992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 16, !dbg !307 + %8993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 17, !dbg !307 + %8994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 18, !dbg !307 + %8995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 19, !dbg !307 + %8996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 20, !dbg !307 + %8997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 21, !dbg !307 + %8998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 22, !dbg !307 + %8999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 23, !dbg !307 + %9000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 24, !dbg !307 + %9001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 25, !dbg !307 + %9002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 26, !dbg !307 + %9003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 27, !dbg !307 + %9004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 28, !dbg !307 + %9005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 29, !dbg !307 + %9006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 30, !dbg !307 + %9007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 31, !dbg !307 + %9008 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8976, float %8977, float %8978, float %8979, float %8980, float %8981, float %8982, float %8983, float %8984, float %8985, float %8986, float %8987, float %8988, float %8989, float %8990, float %8991, float %8992, float %8993, float %8994, float %8995, float %8996, float %8997, float %8998, float %8999, float %9000, float %9001, float %9002, float %9003, float %9004, float %9005, float %9006, float %9007, i64 %8855, i64 %8860, i1 true) #3, !dbg !307 + %9009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 0, !dbg !307 + %9010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 1, !dbg !307 + %9011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 2, !dbg !307 + %9012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 3, !dbg !307 + %9013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 4, !dbg !307 + %9014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 5, !dbg !307 + %9015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 6, !dbg !307 + %9016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 7, !dbg !307 + %9017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 8, !dbg !307 + %9018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 9, !dbg !307 + %9019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 10, !dbg !307 + %9020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 11, !dbg !307 + %9021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 12, !dbg !307 + %9022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 13, !dbg !307 + %9023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 14, !dbg !307 + %9024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 15, !dbg !307 + %9025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 16, !dbg !307 + %9026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 17, !dbg !307 + %9027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 18, !dbg !307 + %9028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 19, !dbg !307 + %9029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 20, !dbg !307 + %9030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 21, !dbg !307 + %9031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 22, !dbg !307 + %9032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 23, !dbg !307 + %9033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 24, !dbg !307 + %9034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 25, !dbg !307 + %9035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 26, !dbg !307 + %9036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 27, !dbg !307 + %9037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 28, !dbg !307 + %9038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 29, !dbg !307 + %9039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 30, !dbg !307 + %9040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 31, !dbg !307 + %9041 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9009, float %9010, float %9011, float %9012, float %9013, float %9014, float %9015, float %9016, float %9017, float %9018, float %9019, float %9020, float %9021, float %9022, float %9023, float %9024, float %9025, float %9026, float %9027, float %9028, float %9029, float %9030, float %9031, float %9032, float %9033, float %9034, float %9035, float %9036, float %9037, float %9038, float %9039, float %9040, i64 %8865, i64 %8870, i1 true) #3, !dbg !307 + %9042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 0, !dbg !307 + %9043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 1, !dbg !307 + %9044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 2, !dbg !307 + %9045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 3, !dbg !307 + %9046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 4, !dbg !307 + %9047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 5, !dbg !307 + %9048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 6, !dbg !307 + %9049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 7, !dbg !307 + %9050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 8, !dbg !307 + %9051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 9, !dbg !307 + %9052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 10, !dbg !307 + %9053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 11, !dbg !307 + %9054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 12, !dbg !307 + %9055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 13, !dbg !307 + %9056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 14, !dbg !307 + %9057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 15, !dbg !307 + %9058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 16, !dbg !307 + %9059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 17, !dbg !307 + %9060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 18, !dbg !307 + %9061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 19, !dbg !307 + %9062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 20, !dbg !307 + %9063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 21, !dbg !307 + %9064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 22, !dbg !307 + %9065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 23, !dbg !307 + %9066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 24, !dbg !307 + %9067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 25, !dbg !307 + %9068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 26, !dbg !307 + %9069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 27, !dbg !307 + %9070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 28, !dbg !307 + %9071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 29, !dbg !307 + %9072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 30, !dbg !307 + %9073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 31, !dbg !307 + %9074 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9042, float %9043, float %9044, float %9045, float %9046, float %9047, float %9048, float %9049, float %9050, float %9051, float %9052, float %9053, float %9054, float %9055, float %9056, float %9057, float %9058, float %9059, float %9060, float %9061, float %9062, float %9063, float %9064, float %9065, float %9066, float %9067, float %9068, float %9069, float %9070, float %9071, float %9072, float %9073, i64 %8875, i64 %8880, i1 true) #3, !dbg !307 + %9075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 0, !dbg !307 + %9076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 1, !dbg !307 + %9077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 2, !dbg !307 + %9078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 3, !dbg !307 + %9079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 4, !dbg !307 + %9080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 5, !dbg !307 + %9081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 6, !dbg !307 + %9082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 7, !dbg !307 + %9083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 8, !dbg !307 + %9084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 9, !dbg !307 + %9085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 10, !dbg !307 + %9086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 11, !dbg !307 + %9087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 12, !dbg !307 + %9088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 13, !dbg !307 + %9089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 14, !dbg !307 + %9090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 15, !dbg !307 + %9091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 16, !dbg !307 + %9092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 17, !dbg !307 + %9093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 18, !dbg !307 + %9094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 19, !dbg !307 + %9095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 20, !dbg !307 + %9096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 21, !dbg !307 + %9097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 22, !dbg !307 + %9098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 23, !dbg !307 + %9099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 24, !dbg !307 + %9100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 25, !dbg !307 + %9101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 26, !dbg !307 + %9102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 27, !dbg !307 + %9103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 28, !dbg !307 + %9104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 29, !dbg !307 + %9105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 30, !dbg !307 + %9106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 31, !dbg !307 + %9107 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9075, float %9076, float %9077, float %9078, float %9079, float %9080, float %9081, float %9082, float %9083, float %9084, float %9085, float %9086, float %9087, float %9088, float %9089, float %9090, float %9091, float %9092, float %9093, float %9094, float %9095, float %9096, float %9097, float %9098, float %9099, float %9100, float %9101, float %9102, float %9103, float %9104, float %9105, float %9106, i64 %8885, i64 %8890, i1 true) #3, !dbg !307 + %9108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 0, !dbg !307 + %9109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 1, !dbg !307 + %9110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 2, !dbg !307 + %9111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 3, !dbg !307 + %9112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 4, !dbg !307 + %9113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 5, !dbg !307 + %9114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 6, !dbg !307 + %9115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 7, !dbg !307 + %9116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 8, !dbg !307 + %9117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 9, !dbg !307 + %9118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 10, !dbg !307 + %9119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 11, !dbg !307 + %9120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 12, !dbg !307 + %9121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 13, !dbg !307 + %9122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 14, !dbg !307 + %9123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 15, !dbg !307 + %9124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 16, !dbg !307 + %9125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 17, !dbg !307 + %9126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 18, !dbg !307 + %9127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 19, !dbg !307 + %9128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 20, !dbg !307 + %9129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 21, !dbg !307 + %9130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 22, !dbg !307 + %9131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 23, !dbg !307 + %9132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 24, !dbg !307 + %9133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 25, !dbg !307 + %9134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 26, !dbg !307 + %9135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 27, !dbg !307 + %9136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 28, !dbg !307 + %9137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 29, !dbg !307 + %9138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 30, !dbg !307 + %9139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 31, !dbg !307 + %9140 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9108, float %9109, float %9110, float %9111, float %9112, float %9113, float %9114, float %9115, float %9116, float %9117, float %9118, float %9119, float %9120, float %9121, float %9122, float %9123, float %9124, float %9125, float %9126, float %9127, float %9128, float %9129, float %9130, float %9131, float %9132, float %9133, float %9134, float %9135, float %9136, float %9137, float %9138, float %9139, i64 %8895, i64 %8900, i1 true) #3, !dbg !307 + %9141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 0, !dbg !307 + %9142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 1, !dbg !307 + %9143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 2, !dbg !307 + %9144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 3, !dbg !307 + %9145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 4, !dbg !307 + %9146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 5, !dbg !307 + %9147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 6, !dbg !307 + %9148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 7, !dbg !307 + %9149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 8, !dbg !307 + %9150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 9, !dbg !307 + %9151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 10, !dbg !307 + %9152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 11, !dbg !307 + %9153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 12, !dbg !307 + %9154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 13, !dbg !307 + %9155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 14, !dbg !307 + %9156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 15, !dbg !307 + %9157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 16, !dbg !307 + %9158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 17, !dbg !307 + %9159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 18, !dbg !307 + %9160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 19, !dbg !307 + %9161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 20, !dbg !307 + %9162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 21, !dbg !307 + %9163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 22, !dbg !307 + %9164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 23, !dbg !307 + %9165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 24, !dbg !307 + %9166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 25, !dbg !307 + %9167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 26, !dbg !307 + %9168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 27, !dbg !307 + %9169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 28, !dbg !307 + %9170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 29, !dbg !307 + %9171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 30, !dbg !307 + %9172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 31, !dbg !307 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !307 + %9173 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %9141, float %9142, float %9143, float %9144, float %9145, float %9146, float %9147, float %9148, float %9149, float %9150, float %9151, float %9152, float %9153, float %9154, float %9155, float %9156, float %9157, float %9158, float %9159, float %9160, float %9161, float %9162, float %9163, float %9164, float %9165, float %9166, float %9167, float %9168, float %9169, float %9170, float %9171, float %9172, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %8472, i32 0, i32 0) #3, !dbg !307 + %9174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 0, !dbg !307 + %9175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 1, !dbg !307 + %9176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 2, !dbg !307 + %9177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 3, !dbg !307 + %9178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 4, !dbg !307 + %9179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 5, !dbg !307 + %9180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 6, !dbg !307 + %9181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 7, !dbg !307 + %9182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 8, !dbg !307 + %9183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 9, !dbg !307 + %9184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 10, !dbg !307 + %9185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 11, !dbg !307 + %9186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 12, !dbg !307 + %9187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 13, !dbg !307 + %9188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 14, !dbg !307 + %9189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 15, !dbg !307 + %9190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 16, !dbg !307 + %9191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 17, !dbg !307 + %9192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 18, !dbg !307 + %9193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 19, !dbg !307 + %9194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 20, !dbg !307 + %9195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 21, !dbg !307 + %9196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 22, !dbg !307 + %9197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 23, !dbg !307 + %9198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 24, !dbg !307 + %9199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 25, !dbg !307 + %9200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 26, !dbg !307 + %9201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 27, !dbg !307 + %9202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 28, !dbg !307 + %9203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 29, !dbg !307 + %9204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 30, !dbg !307 + %9205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 31, !dbg !307 + %9206 = insertelement <2 x float> poison, float %9176, i64 0, !dbg !308 + %9207 = insertelement <2 x float> %9206, float %9177, i64 1, !dbg !308 + %9208 = fsub <2 x float> %9207, %8908, !dbg !308 + %9209 = insertelement <2 x float> poison, float %9180, i64 0, !dbg !308 + %9210 = insertelement <2 x float> %9209, float %9181, i64 1, !dbg !308 + %9211 = fsub <2 x float> %9210, %8907, !dbg !308 + %9212 = insertelement <2 x float> poison, float %9184, i64 0, !dbg !308 + %9213 = insertelement <2 x float> %9212, float %9185, i64 1, !dbg !308 + %9214 = fsub <2 x float> %9213, %8906, !dbg !308 + %9215 = insertelement <2 x float> poison, float %9188, i64 0, !dbg !308 + %9216 = insertelement <2 x float> %9215, float %9189, i64 1, !dbg !308 + %9217 = fsub <2 x float> %9216, %8905, !dbg !308 + %9218 = insertelement <2 x float> poison, float %9192, i64 0, !dbg !308 + %9219 = insertelement <2 x float> %9218, float %9193, i64 1, !dbg !308 + %9220 = fsub <2 x float> %9219, %8904, !dbg !308 + %9221 = insertelement <2 x float> poison, float %9196, i64 0, !dbg !308 + %9222 = insertelement <2 x float> %9221, float %9197, i64 1, !dbg !308 + %9223 = fsub <2 x float> %9222, %8903, !dbg !308 + %9224 = insertelement <2 x float> poison, float %9200, i64 0, !dbg !308 + %9225 = insertelement <2 x float> %9224, float %9201, i64 1, !dbg !308 + %9226 = fsub <2 x float> %9225, %8902, !dbg !308 + %9227 = insertelement <2 x float> poison, float %9204, i64 0, !dbg !308 + %9228 = insertelement <2 x float> %9227, float %9205, i64 1, !dbg !308 + %9229 = fsub <2 x float> %9228, %8901, !dbg !308 + %9230 = fmul <2 x float> %8477, %9208, !dbg !309 + %9231 = fmul <2 x float> %8483, %9211, !dbg !309 + %9232 = fmul <2 x float> %8489, %9214, !dbg !309 + %9233 = fmul <2 x float> %8495, %9217, !dbg !309 + %9234 = fmul <2 x float> %8501, %9220, !dbg !309 + %9235 = fmul <2 x float> %8507, %9223, !dbg !309 + %9236 = fmul <2 x float> %8513, %9226, !dbg !309 + %9237 = fmul <2 x float> %8519, %9229, !dbg !309 + %9238 = insertelement <2 x float> poison, float %9174, i64 0, !dbg !308 + %9239 = insertelement <2 x float> %9238, float %9175, i64 1, !dbg !308 + %9240 = fsub <2 x float> %9239, %8908, !dbg !308 + %9241 = fmul <2 x float> %8474, %9240, !dbg !309 + %9242 = fptrunc <2 x float> %9241 to <2 x bfloat>, !dbg !310 + %9243 = fptrunc <2 x float> %9230 to <2 x bfloat>, !dbg !310 + %9244 = insertelement <2 x float> poison, float %9178, i64 0, !dbg !308 + %9245 = insertelement <2 x float> %9244, float %9179, i64 1, !dbg !308 + %9246 = fsub <2 x float> %9245, %8907, !dbg !308 + %9247 = fmul <2 x float> %8480, %9246, !dbg !309 + %9248 = fptrunc <2 x float> %9247 to <2 x bfloat>, !dbg !310 + %9249 = fptrunc <2 x float> %9231 to <2 x bfloat>, !dbg !310 + %9250 = insertelement <2 x float> poison, float %9182, i64 0, !dbg !308 + %9251 = insertelement <2 x float> %9250, float %9183, i64 1, !dbg !308 + %9252 = fsub <2 x float> %9251, %8906, !dbg !308 + %9253 = fmul <2 x float> %8486, %9252, !dbg !309 + %9254 = fptrunc <2 x float> %9253 to <2 x bfloat>, !dbg !310 + %9255 = fptrunc <2 x float> %9232 to <2 x bfloat>, !dbg !310 + %9256 = insertelement <2 x float> poison, float %9186, i64 0, !dbg !308 + %9257 = insertelement <2 x float> %9256, float %9187, i64 1, !dbg !308 + %9258 = fsub <2 x float> %9257, %8905, !dbg !308 + %9259 = fmul <2 x float> %8492, %9258, !dbg !309 + %9260 = fptrunc <2 x float> %9259 to <2 x bfloat>, !dbg !310 + %9261 = fptrunc <2 x float> %9233 to <2 x bfloat>, !dbg !310 + %9262 = insertelement <2 x float> poison, float %9190, i64 0, !dbg !308 + %9263 = insertelement <2 x float> %9262, float %9191, i64 1, !dbg !308 + %9264 = fsub <2 x float> %9263, %8904, !dbg !308 + %9265 = fmul <2 x float> %8498, %9264, !dbg !309 + %9266 = fptrunc <2 x float> %9265 to <2 x bfloat>, !dbg !310 + %9267 = fptrunc <2 x float> %9234 to <2 x bfloat>, !dbg !310 + %9268 = insertelement <2 x float> poison, float %9194, i64 0, !dbg !308 + %9269 = insertelement <2 x float> %9268, float %9195, i64 1, !dbg !308 + %9270 = fsub <2 x float> %9269, %8903, !dbg !308 + %9271 = fmul <2 x float> %8504, %9270, !dbg !309 + %9272 = fptrunc <2 x float> %9271 to <2 x bfloat>, !dbg !310 + %9273 = fptrunc <2 x float> %9235 to <2 x bfloat>, !dbg !310 + %9274 = insertelement <2 x float> poison, float %9198, i64 0, !dbg !308 + %9275 = insertelement <2 x float> %9274, float %9199, i64 1, !dbg !308 + %9276 = fsub <2 x float> %9275, %8902, !dbg !308 + %9277 = fmul <2 x float> %8510, %9276, !dbg !309 + %9278 = fptrunc <2 x float> %9277 to <2 x bfloat>, !dbg !310 + %9279 = fptrunc <2 x float> %9236 to <2 x bfloat>, !dbg !310 + %9280 = insertelement <2 x float> poison, float %9202, i64 0, !dbg !308 + %9281 = insertelement <2 x float> %9280, float %9203, i64 1, !dbg !308 + %9282 = fsub <2 x float> %9281, %8901, !dbg !308 + %9283 = fmul <2 x float> %8516, %9282, !dbg !309 + %9284 = fptrunc <2 x float> %9283 to <2 x bfloat>, !dbg !310 + %9285 = fptrunc <2 x float> %9237 to <2 x bfloat>, !dbg !310 + %9286 = bitcast <2 x bfloat> %9242 to i32, !dbg !311 + %9287 = bitcast <2 x bfloat> %9243 to i32, !dbg !311 + %9288 = bitcast <2 x bfloat> %9248 to i32, !dbg !311 + %9289 = bitcast <2 x bfloat> %9249 to i32, !dbg !311 + %9290 = bitcast <2 x bfloat> %9254 to i32, !dbg !311 + %9291 = bitcast <2 x bfloat> %9255 to i32, !dbg !311 + %9292 = bitcast <2 x bfloat> %9260 to i32, !dbg !311 + %9293 = bitcast <2 x bfloat> %9261 to i32, !dbg !311 + %9294 = bitcast <2 x bfloat> %9266 to i32, !dbg !311 + %9295 = bitcast <2 x bfloat> %9267 to i32, !dbg !311 + %9296 = bitcast <2 x bfloat> %9272 to i32, !dbg !311 + %9297 = bitcast <2 x bfloat> %9273 to i32, !dbg !311 + %9298 = bitcast <2 x bfloat> %9278 to i32, !dbg !311 + %9299 = bitcast <2 x bfloat> %9279 to i32, !dbg !311 + %9300 = bitcast <2 x bfloat> %9284 to i32, !dbg !311 + %9301 = bitcast <2 x bfloat> %9285 to i32, !dbg !311 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !311 + %9302 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn3761546, float %.pn3741547, float %.pn3721548, float %.pn3701549, float %.pn3681550, float %.pn3661551, float %.pn3641552, float %.pn3621553, float %.pn3601554, float %.pn3581555, float %.pn3561556, float %.pn3541557, float %.pn3521558, float %.pn3501559, float %.pn3481560, float %.pn3461561, float %.pn3441562, float %.pn3421563, float %.pn3401564, float %.pn3381565, float %.pn3361566, float %.pn3341567, float %.pn3321568, float %.pn3301569, float %.pn3281570, float %.pn3261571, float %.pn3241572, float %.pn3221573, float %.pn3201574, float %.pn3181575, float %.pn3161576, float %.pn3141577, float %.pn3121578, float %.pn3101579, float %.pn3081580, float %.pn3061581, float %.pn3041582, float %.pn3021583, float %.pn3001584, float %.pn2981585, float %.pn2961586, float %.pn2941587, float %.pn2921588, float %.pn2901589, float %.pn2881590, float %.pn2861591, float %.pn2841592, float %.pn2821593, float %.pn2801594, float %.pn2781595, float %.pn2761596, float %.pn2741597, float %.pn2721598, float %.pn2701599, float %.pn2681600, float %.pn2661601, float %.pn2641602, float %.pn2621603, float %.pn2601604, float %.pn2581605, float %.pn2561606, float %.pn2541607, float %.pn2521608, float %.pn2501609, i32 %9286, i32 %9287, i32 %9288, i32 %9289, i64 %7841, i1 true) #3, !dbg !311 + %9303 = add i32 %7837, 2048, !dbg !311 + %9304 = lshr exact i32 %9303, 4, !dbg !311 + %9305 = and i32 %9304, 16383, !dbg !311 + %9306 = zext nneg i32 %9305 to i64, !dbg !311 + %9307 = or disjoint i64 %9306, 4611686293338849280, !dbg !311 + %9308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 0, !dbg !311 + %9309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 1, !dbg !311 + %9310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 2, !dbg !311 + %9311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 3, !dbg !311 + %9312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 4, !dbg !311 + %9313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 5, !dbg !311 + %9314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 6, !dbg !311 + %9315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 7, !dbg !311 + %9316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 8, !dbg !311 + %9317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 9, !dbg !311 + %9318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 10, !dbg !311 + %9319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 11, !dbg !311 + %9320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 12, !dbg !311 + %9321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 13, !dbg !311 + %9322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 14, !dbg !311 + %9323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 15, !dbg !311 + %9324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 16, !dbg !311 + %9325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 17, !dbg !311 + %9326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 18, !dbg !311 + %9327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 19, !dbg !311 + %9328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 20, !dbg !311 + %9329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 21, !dbg !311 + %9330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 22, !dbg !311 + %9331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 23, !dbg !311 + %9332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 24, !dbg !311 + %9333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 25, !dbg !311 + %9334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 26, !dbg !311 + %9335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 27, !dbg !311 + %9336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 28, !dbg !311 + %9337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 29, !dbg !311 + %9338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 30, !dbg !311 + %9339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 31, !dbg !311 + %9340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 32, !dbg !311 + %9341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 33, !dbg !311 + %9342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 34, !dbg !311 + %9343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 35, !dbg !311 + %9344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 36, !dbg !311 + %9345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 37, !dbg !311 + %9346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 38, !dbg !311 + %9347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 39, !dbg !311 + %9348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 40, !dbg !311 + %9349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 41, !dbg !311 + %9350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 42, !dbg !311 + %9351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 43, !dbg !311 + %9352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 44, !dbg !311 + %9353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 45, !dbg !311 + %9354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 46, !dbg !311 + %9355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 47, !dbg !311 + %9356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 48, !dbg !311 + %9357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 49, !dbg !311 + %9358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 50, !dbg !311 + %9359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 51, !dbg !311 + %9360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 52, !dbg !311 + %9361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 53, !dbg !311 + %9362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 54, !dbg !311 + %9363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 55, !dbg !311 + %9364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 56, !dbg !311 + %9365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 57, !dbg !311 + %9366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 58, !dbg !311 + %9367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 59, !dbg !311 + %9368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 60, !dbg !311 + %9369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 61, !dbg !311 + %9370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 62, !dbg !311 + %9371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 63, !dbg !311 + %9372 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9308, float %9309, float %9310, float %9311, float %9312, float %9313, float %9314, float %9315, float %9316, float %9317, float %9318, float %9319, float %9320, float %9321, float %9322, float %9323, float %9324, float %9325, float %9326, float %9327, float %9328, float %9329, float %9330, float %9331, float %9332, float %9333, float %9334, float %9335, float %9336, float %9337, float %9338, float %9339, float %9340, float %9341, float %9342, float %9343, float %9344, float %9345, float %9346, float %9347, float %9348, float %9349, float %9350, float %9351, float %9352, float %9353, float %9354, float %9355, float %9356, float %9357, float %9358, float %9359, float %9360, float %9361, float %9362, float %9363, float %9364, float %9365, float %9366, float %9367, float %9368, float %9369, float %9370, float %9371, i32 %9290, i32 %9291, i32 %9292, i32 %9293, i64 %9307, i1 true) #3, !dbg !311 + %9373 = add i32 %7837, 4096, !dbg !311 + %9374 = lshr exact i32 %9373, 4, !dbg !311 + %9375 = and i32 %9374, 16383, !dbg !311 + %9376 = zext nneg i32 %9375 to i64, !dbg !311 + %9377 = or disjoint i64 %9376, 4611686293338849280, !dbg !311 + %9378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 0, !dbg !311 + %9379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 1, !dbg !311 + %9380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 2, !dbg !311 + %9381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 3, !dbg !311 + %9382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 4, !dbg !311 + %9383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 5, !dbg !311 + %9384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 6, !dbg !311 + %9385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 7, !dbg !311 + %9386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 8, !dbg !311 + %9387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 9, !dbg !311 + %9388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 10, !dbg !311 + %9389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 11, !dbg !311 + %9390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 12, !dbg !311 + %9391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 13, !dbg !311 + %9392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 14, !dbg !311 + %9393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 15, !dbg !311 + %9394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 16, !dbg !311 + %9395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 17, !dbg !311 + %9396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 18, !dbg !311 + %9397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 19, !dbg !311 + %9398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 20, !dbg !311 + %9399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 21, !dbg !311 + %9400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 22, !dbg !311 + %9401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 23, !dbg !311 + %9402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 24, !dbg !311 + %9403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 25, !dbg !311 + %9404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 26, !dbg !311 + %9405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 27, !dbg !311 + %9406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 28, !dbg !311 + %9407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 29, !dbg !311 + %9408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 30, !dbg !311 + %9409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 31, !dbg !311 + %9410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 32, !dbg !311 + %9411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 33, !dbg !311 + %9412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 34, !dbg !311 + %9413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 35, !dbg !311 + %9414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 36, !dbg !311 + %9415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 37, !dbg !311 + %9416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 38, !dbg !311 + %9417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 39, !dbg !311 + %9418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 40, !dbg !311 + %9419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 41, !dbg !311 + %9420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 42, !dbg !311 + %9421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 43, !dbg !311 + %9422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 44, !dbg !311 + %9423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 45, !dbg !311 + %9424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 46, !dbg !311 + %9425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 47, !dbg !311 + %9426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 48, !dbg !311 + %9427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 49, !dbg !311 + %9428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 50, !dbg !311 + %9429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 51, !dbg !311 + %9430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 52, !dbg !311 + %9431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 53, !dbg !311 + %9432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 54, !dbg !311 + %9433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 55, !dbg !311 + %9434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 56, !dbg !311 + %9435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 57, !dbg !311 + %9436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 58, !dbg !311 + %9437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 59, !dbg !311 + %9438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 60, !dbg !311 + %9439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 61, !dbg !311 + %9440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 62, !dbg !311 + %9441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 63, !dbg !311 + %9442 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9378, float %9379, float %9380, float %9381, float %9382, float %9383, float %9384, float %9385, float %9386, float %9387, float %9388, float %9389, float %9390, float %9391, float %9392, float %9393, float %9394, float %9395, float %9396, float %9397, float %9398, float %9399, float %9400, float %9401, float %9402, float %9403, float %9404, float %9405, float %9406, float %9407, float %9408, float %9409, float %9410, float %9411, float %9412, float %9413, float %9414, float %9415, float %9416, float %9417, float %9418, float %9419, float %9420, float %9421, float %9422, float %9423, float %9424, float %9425, float %9426, float %9427, float %9428, float %9429, float %9430, float %9431, float %9432, float %9433, float %9434, float %9435, float %9436, float %9437, float %9438, float %9439, float %9440, float %9441, i32 %9294, i32 %9295, i32 %9296, i32 %9297, i64 %9377, i1 true) #3, !dbg !311 + %9443 = add i32 %7837, 6144, !dbg !311 + %9444 = lshr exact i32 %9443, 4, !dbg !311 + %9445 = and i32 %9444, 16383, !dbg !311 + %9446 = zext nneg i32 %9445 to i64, !dbg !311 + %9447 = or disjoint i64 %9446, 4611686293338849280, !dbg !311 + %9448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 0, !dbg !311 + %9449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 1, !dbg !311 + %9450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 2, !dbg !311 + %9451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 3, !dbg !311 + %9452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 4, !dbg !311 + %9453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 5, !dbg !311 + %9454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 6, !dbg !311 + %9455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 7, !dbg !311 + %9456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 8, !dbg !311 + %9457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 9, !dbg !311 + %9458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 10, !dbg !311 + %9459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 11, !dbg !311 + %9460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 12, !dbg !311 + %9461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 13, !dbg !311 + %9462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 14, !dbg !311 + %9463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 15, !dbg !311 + %9464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 16, !dbg !311 + %9465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 17, !dbg !311 + %9466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 18, !dbg !311 + %9467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 19, !dbg !311 + %9468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 20, !dbg !311 + %9469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 21, !dbg !311 + %9470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 22, !dbg !311 + %9471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 23, !dbg !311 + %9472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 24, !dbg !311 + %9473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 25, !dbg !311 + %9474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 26, !dbg !311 + %9475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 27, !dbg !311 + %9476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 28, !dbg !311 + %9477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 29, !dbg !311 + %9478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 30, !dbg !311 + %9479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 31, !dbg !311 + %9480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 32, !dbg !311 + %9481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 33, !dbg !311 + %9482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 34, !dbg !311 + %9483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 35, !dbg !311 + %9484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 36, !dbg !311 + %9485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 37, !dbg !311 + %9486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 38, !dbg !311 + %9487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 39, !dbg !311 + %9488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 40, !dbg !311 + %9489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 41, !dbg !311 + %9490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 42, !dbg !311 + %9491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 43, !dbg !311 + %9492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 44, !dbg !311 + %9493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 45, !dbg !311 + %9494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 46, !dbg !311 + %9495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 47, !dbg !311 + %9496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 48, !dbg !311 + %9497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 49, !dbg !311 + %9498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 50, !dbg !311 + %9499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 51, !dbg !311 + %9500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 52, !dbg !311 + %9501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 53, !dbg !311 + %9502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 54, !dbg !311 + %9503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 55, !dbg !311 + %9504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 56, !dbg !311 + %9505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 57, !dbg !311 + %9506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 58, !dbg !311 + %9507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 59, !dbg !311 + %9508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 60, !dbg !311 + %9509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 61, !dbg !311 + %9510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 62, !dbg !311 + %9511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 63, !dbg !311 + %9512 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9448, float %9449, float %9450, float %9451, float %9452, float %9453, float %9454, float %9455, float %9456, float %9457, float %9458, float %9459, float %9460, float %9461, float %9462, float %9463, float %9464, float %9465, float %9466, float %9467, float %9468, float %9469, float %9470, float %9471, float %9472, float %9473, float %9474, float %9475, float %9476, float %9477, float %9478, float %9479, float %9480, float %9481, float %9482, float %9483, float %9484, float %9485, float %9486, float %9487, float %9488, float %9489, float %9490, float %9491, float %9492, float %9493, float %9494, float %9495, float %9496, float %9497, float %9498, float %9499, float %9500, float %9501, float %9502, float %9503, float %9504, float %9505, float %9506, float %9507, float %9508, float %9509, float %9510, float %9511, i32 %9298, i32 %9299, i32 %9300, i32 %9301, i64 %9447, i1 true) #3, !dbg !311 + %9513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 0, !dbg !311 + %9514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 1, !dbg !311 + %9515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 2, !dbg !311 + %9516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 3, !dbg !311 + %9517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 4, !dbg !311 + %9518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 5, !dbg !311 + %9519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 6, !dbg !311 + %9520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 7, !dbg !311 + %9521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 8, !dbg !311 + %9522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 9, !dbg !311 + %9523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 10, !dbg !311 + %9524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 11, !dbg !311 + %9525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 12, !dbg !311 + %9526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 13, !dbg !311 + %9527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 14, !dbg !311 + %9528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 15, !dbg !311 + %9529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 16, !dbg !311 + %9530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 17, !dbg !311 + %9531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 18, !dbg !311 + %9532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 19, !dbg !311 + %9533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 20, !dbg !311 + %9534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 21, !dbg !311 + %9535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 22, !dbg !311 + %9536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 23, !dbg !311 + %9537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 24, !dbg !311 + %9538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 25, !dbg !311 + %9539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 26, !dbg !311 + %9540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 27, !dbg !311 + %9541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 28, !dbg !311 + %9542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 29, !dbg !311 + %9543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 30, !dbg !311 + %9544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 31, !dbg !311 + %9545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 32, !dbg !311 + %9546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 33, !dbg !311 + %9547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 34, !dbg !311 + %9548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 35, !dbg !311 + %9549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 36, !dbg !311 + %9550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 37, !dbg !311 + %9551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 38, !dbg !311 + %9552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 39, !dbg !311 + %9553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 40, !dbg !311 + %9554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 41, !dbg !311 + %9555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 42, !dbg !311 + %9556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 43, !dbg !311 + %9557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 44, !dbg !311 + %9558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 45, !dbg !311 + %9559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 46, !dbg !311 + %9560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 47, !dbg !311 + %9561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 48, !dbg !311 + %9562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 49, !dbg !311 + %9563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 50, !dbg !311 + %9564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 51, !dbg !311 + %9565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 52, !dbg !311 + %9566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 53, !dbg !311 + %9567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 54, !dbg !311 + %9568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 55, !dbg !311 + %9569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 56, !dbg !311 + %9570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 57, !dbg !311 + %9571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 58, !dbg !311 + %9572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 59, !dbg !311 + %9573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 60, !dbg !311 + %9574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 61, !dbg !311 + %9575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 62, !dbg !311 + %9576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 63, !dbg !311 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !311 + %9577 = add nuw nsw i32 %7752, 1, !dbg !297 + %9578 = lshr i32 %9577, 1, !dbg !312 + %9579 = zext nneg i32 %9578 to i64, !dbg !313 + %9580 = getelementptr i32, ptr addrspace(1) %4756, i64 %9579, !dbg !313 + %9581 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !314 + %9582 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %9580, i64 %9581, i1 %7754) #3, !dbg !314 + %9583 = add nuw nsw i32 %9578, 1, !dbg !315 + %9584 = icmp slt i32 %9583, %4760, !dbg !316 + %9585 = getelementptr i8, ptr addrspace(1) %9580, i64 4, !dbg !317 + %9586 = and i1 %7754, %9584, !dbg !297 + %9587 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !318 + %9588 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %9585, i64 %9587, i1 %9586) #3, !dbg !318 + %9589 = and i32 %7752, 1, !dbg !319 + %9590 = sub i32 %9588, %9582, !dbg !320 + %9591 = shl i32 %9590, 7, !dbg !321 + %9592 = add i32 %9591, -64, !dbg !322 + %9593 = xor i32 %9589, 1, !dbg !323 + %9594 = mul nuw nsw i32 %9592, %9593, !dbg !323 + %9595 = shl nuw nsw i32 %9589, 6, !dbg !324 + %9596 = add i32 %9594, %9595, !dbg !325 + %9597 = shl i32 %9596, 12, !dbg !326 + %9598 = sext i32 %9597 to i64, !dbg !295 + %9599 = getelementptr bfloat, ptr addrspace(1) %.pn5681674, i64 %9598, !dbg !295 + %9600 = getelementptr bfloat, ptr addrspace(1) %.pn5521675, i64 %9598, !dbg !295 + %9601 = getelementptr bfloat, ptr addrspace(1) %.pn5361676, i64 %9598, !dbg !295 + %9602 = getelementptr bfloat, ptr addrspace(1) %.pn5201677, i64 %9598, !dbg !295 + %9603 = shl i32 %9596, 7, !dbg !327 + %9604 = sext i32 %9603 to i64, !dbg !296 + %9605 = getelementptr bfloat, ptr addrspace(1) %.pn6321678, i64 %9604, !dbg !296 + %9606 = getelementptr bfloat, ptr addrspace(1) %.pn6161679, i64 %9604, !dbg !296 + %9607 = getelementptr bfloat, ptr addrspace(1) %.pn6001680, i64 %9604, !dbg !296 + %9608 = getelementptr bfloat, ptr addrspace(1) %.pn5841681, i64 %9604, !dbg !296 + %9609 = add i32 %9596, %.pn6641682, !dbg !328 + %9610 = add i32 %9596, %.pn6601683, !dbg !328 + %9611 = add i32 %9596, %.pn6561684, !dbg !328 + %9612 = add i32 %9596, %.pn6521685, !dbg !328 + %9613 = add i32 %9596, %.pn6481686, !dbg !328 + %9614 = add i32 %9596, %.pn6441687, !dbg !328 + %9615 = add i32 %9596, %.pn6401688, !dbg !328 + %9616 = add i32 %9596, %.pn6361689, !dbg !328 + %9617 = add i32 %7749, 1, !dbg !297 + %9618 = icmp sgt i32 %9617, 1, !dbg !297 + %9619 = select i1 %9618, i32 0, i32 %9617, !dbg !297 + %9620 = add i32 %7751, 1, !dbg !297 + %9621 = icmp sgt i32 %9620, 2, !dbg !297 + %9622 = select i1 %9621, i32 0, i32 %9620, !dbg !297 + %9623 = shl i32 %9622, 13, !dbg !290 + %9624 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %9623, !dbg !290 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + %9625 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4793, !dbg !290 + %9626 = select i1 %7753, i32 16, i32 0, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %9625, ptr addrspace(1) %9599, i32 %9626) #3, !dbg !290 + %9627 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4796, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9627, ptr addrspace(1) %9600, i32 %9626) #3, !dbg !290 + %9628 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4798, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9628, ptr addrspace(1) %9601, i32 %9626) #3, !dbg !290 + %9629 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4800, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9629, ptr addrspace(1) %9602, i32 %9626) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %9630 = sext i32 %9609 to i64, !dbg !291 + %9631 = getelementptr float, ptr addrspace(1) %5087, i64 %9630, !dbg !291 + %9632 = sext i32 %9610 to i64, !dbg !291 + %9633 = getelementptr float, ptr addrspace(1) %5087, i64 %9632, !dbg !291 + %9634 = sext i32 %9611 to i64, !dbg !291 + %9635 = getelementptr float, ptr addrspace(1) %5087, i64 %9634, !dbg !291 + %9636 = sext i32 %9612 to i64, !dbg !291 + %9637 = getelementptr float, ptr addrspace(1) %5087, i64 %9636, !dbg !291 + %9638 = sext i32 %9613 to i64, !dbg !291 + %9639 = getelementptr float, ptr addrspace(1) %5087, i64 %9638, !dbg !291 + %9640 = sext i32 %9614 to i64, !dbg !291 + %9641 = getelementptr float, ptr addrspace(1) %5087, i64 %9640, !dbg !291 + %9642 = sext i32 %9615 to i64, !dbg !291 + %9643 = getelementptr float, ptr addrspace(1) %5087, i64 %9642, !dbg !291 + %9644 = sext i32 %9616 to i64, !dbg !291 + %9645 = getelementptr float, ptr addrspace(1) %5087, i64 %9644, !dbg !291 + %9646 = shl i32 %9619, 6, !dbg !292 + %9647 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %9646, !dbg !292 + %9648 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4812, !dbg !292 + %9649 = select i1 %7753, i32 8, i32 0, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %9648, ptr addrspace(1) %9631, i32 %9649, i1 %4811) #3, !dbg !292 + %9650 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4815, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9650, ptr addrspace(1) %9633, i32 %9649, i1 %4811) #3, !dbg !292 + %9651 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4817, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9651, ptr addrspace(1) %9635, i32 %9649, i1 %4811) #3, !dbg !292 + %9652 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4819, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9652, ptr addrspace(1) %9637, i32 %9649, i1 %4811) #3, !dbg !292 + %9653 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4821, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9653, ptr addrspace(1) %9639, i32 %9649, i1 %4811) #3, !dbg !292 + %9654 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4823, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9654, ptr addrspace(1) %9641, i32 %9649, i1 %4811) #3, !dbg !292 + %9655 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4825, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9655, ptr addrspace(1) %9643, i32 %9649, i1 %4811) #3, !dbg !292 + %9656 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4827, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9656, ptr addrspace(1) %9645, i32 %9649, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + %9657 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %9623, !dbg !290 + %9658 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4793, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %9658, ptr addrspace(1) %9605, i32 %9626) #3, !dbg !290 + %9659 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4796, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9659, ptr addrspace(1) %9606, i32 %9626) #3, !dbg !290 + %9660 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4798, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9660, ptr addrspace(1) %9607, i32 %9626) #3, !dbg !290 + %9661 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4800, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9661, ptr addrspace(1) %9608, i32 %9626) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %9662 = getelementptr float, ptr addrspace(1) %5088, i64 %9630, !dbg !293 + %9663 = getelementptr float, ptr addrspace(1) %5088, i64 %9632, !dbg !293 + %9664 = getelementptr float, ptr addrspace(1) %5088, i64 %9634, !dbg !293 + %9665 = getelementptr float, ptr addrspace(1) %5088, i64 %9636, !dbg !293 + %9666 = getelementptr float, ptr addrspace(1) %5088, i64 %9638, !dbg !293 + %9667 = getelementptr float, ptr addrspace(1) %5088, i64 %9640, !dbg !293 + %9668 = getelementptr float, ptr addrspace(1) %5088, i64 %9642, !dbg !293 + %9669 = getelementptr float, ptr addrspace(1) %5088, i64 %9644, !dbg !293 + %9670 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %9646, !dbg !294 + %9671 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4812, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %9671, ptr addrspace(1) %9662, i32 %9649, i1 %4811) #3, !dbg !294 + %9672 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4815, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9672, ptr addrspace(1) %9663, i32 %9649, i1 %4811) #3, !dbg !294 + %9673 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4817, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9673, ptr addrspace(1) %9664, i32 %9649, i1 %4811) #3, !dbg !294 + %9674 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4819, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9674, ptr addrspace(1) %9665, i32 %9649, i1 %4811) #3, !dbg !294 + %9675 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4821, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9675, ptr addrspace(1) %9666, i32 %9649, i1 %4811) #3, !dbg !294 + %9676 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4823, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9676, ptr addrspace(1) %9667, i32 %9649, i1 %4811) #3, !dbg !294 + %9677 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4825, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9677, ptr addrspace(1) %9668, i32 %9649, i1 %4811) #3, !dbg !294 + %9678 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4827, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9678, ptr addrspace(1) %9669, i32 %9649, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + %exitcond2122.not = icmp eq i32 %9577, %smax2121, !dbg !297 + br i1 %exitcond2122.not, label %._crit_edge1692, label %.lr.ph1691, !dbg !297 + +._crit_edge1692: ; preds = %__nv_exp2f.exit1219, %._crit_edge + %.pn376.lcssa = phi float [ %7628, %._crit_edge ], [ %9513, %__nv_exp2f.exit1219 ] + %.pn374.lcssa = phi float [ %7629, %._crit_edge ], [ %9514, %__nv_exp2f.exit1219 ] + %.pn372.lcssa = phi float [ %7630, %._crit_edge ], [ %9515, %__nv_exp2f.exit1219 ] + %.pn370.lcssa = phi float [ %7631, %._crit_edge ], [ %9516, %__nv_exp2f.exit1219 ] + %.pn368.lcssa = phi float [ %7632, %._crit_edge ], [ %9517, %__nv_exp2f.exit1219 ] + %.pn366.lcssa = phi float [ %7633, %._crit_edge ], [ %9518, %__nv_exp2f.exit1219 ] + %.pn364.lcssa = phi float [ %7634, %._crit_edge ], [ %9519, %__nv_exp2f.exit1219 ] + %.pn362.lcssa = phi float [ %7635, %._crit_edge ], [ %9520, %__nv_exp2f.exit1219 ] + %.pn360.lcssa = phi float [ %7636, %._crit_edge ], [ %9521, %__nv_exp2f.exit1219 ] + %.pn358.lcssa = phi float [ %7637, %._crit_edge ], [ %9522, %__nv_exp2f.exit1219 ] + %.pn356.lcssa = phi float [ %7638, %._crit_edge ], [ %9523, %__nv_exp2f.exit1219 ] + %.pn354.lcssa = phi float [ %7639, %._crit_edge ], [ %9524, %__nv_exp2f.exit1219 ] + %.pn352.lcssa = phi float [ %7640, %._crit_edge ], [ %9525, %__nv_exp2f.exit1219 ] + %.pn350.lcssa = phi float [ %7641, %._crit_edge ], [ %9526, %__nv_exp2f.exit1219 ] + %.pn348.lcssa = phi float [ %7642, %._crit_edge ], [ %9527, %__nv_exp2f.exit1219 ] + %.pn346.lcssa = phi float [ %7643, %._crit_edge ], [ %9528, %__nv_exp2f.exit1219 ] + %.pn344.lcssa = phi float [ %7644, %._crit_edge ], [ %9529, %__nv_exp2f.exit1219 ] + %.pn342.lcssa = phi float [ %7645, %._crit_edge ], [ %9530, %__nv_exp2f.exit1219 ] + %.pn340.lcssa = phi float [ %7646, %._crit_edge ], [ %9531, %__nv_exp2f.exit1219 ] + %.pn338.lcssa = phi float [ %7647, %._crit_edge ], [ %9532, %__nv_exp2f.exit1219 ] + %.pn336.lcssa = phi float [ %7648, %._crit_edge ], [ %9533, %__nv_exp2f.exit1219 ] + %.pn334.lcssa = phi float [ %7649, %._crit_edge ], [ %9534, %__nv_exp2f.exit1219 ] + %.pn332.lcssa = phi float [ %7650, %._crit_edge ], [ %9535, %__nv_exp2f.exit1219 ] + %.pn330.lcssa = phi float [ %7651, %._crit_edge ], [ %9536, %__nv_exp2f.exit1219 ] + %.pn328.lcssa = phi float [ %7652, %._crit_edge ], [ %9537, %__nv_exp2f.exit1219 ] + %.pn326.lcssa = phi float [ %7653, %._crit_edge ], [ %9538, %__nv_exp2f.exit1219 ] + %.pn324.lcssa = phi float [ %7654, %._crit_edge ], [ %9539, %__nv_exp2f.exit1219 ] + %.pn322.lcssa = phi float [ %7655, %._crit_edge ], [ %9540, %__nv_exp2f.exit1219 ] + %.pn320.lcssa = phi float [ %7656, %._crit_edge ], [ %9541, %__nv_exp2f.exit1219 ] + %.pn318.lcssa = phi float [ %7657, %._crit_edge ], [ %9542, %__nv_exp2f.exit1219 ] + %.pn316.lcssa = phi float [ %7658, %._crit_edge ], [ %9543, %__nv_exp2f.exit1219 ] + %.pn314.lcssa = phi float [ %7659, %._crit_edge ], [ %9544, %__nv_exp2f.exit1219 ] + %.pn312.lcssa = phi float [ %7660, %._crit_edge ], [ %9545, %__nv_exp2f.exit1219 ] + %.pn310.lcssa = phi float [ %7661, %._crit_edge ], [ %9546, %__nv_exp2f.exit1219 ] + %.pn308.lcssa = phi float [ %7662, %._crit_edge ], [ %9547, %__nv_exp2f.exit1219 ] + %.pn306.lcssa = phi float [ %7663, %._crit_edge ], [ %9548, %__nv_exp2f.exit1219 ] + %.pn304.lcssa = phi float [ %7664, %._crit_edge ], [ %9549, %__nv_exp2f.exit1219 ] + %.pn302.lcssa = phi float [ %7665, %._crit_edge ], [ %9550, %__nv_exp2f.exit1219 ] + %.pn300.lcssa = phi float [ %7666, %._crit_edge ], [ %9551, %__nv_exp2f.exit1219 ] + %.pn298.lcssa = phi float [ %7667, %._crit_edge ], [ %9552, %__nv_exp2f.exit1219 ] + %.pn296.lcssa = phi float [ %7668, %._crit_edge ], [ %9553, %__nv_exp2f.exit1219 ] + %.pn294.lcssa = phi float [ %7669, %._crit_edge ], [ %9554, %__nv_exp2f.exit1219 ] + %.pn292.lcssa = phi float [ %7670, %._crit_edge ], [ %9555, %__nv_exp2f.exit1219 ] + %.pn290.lcssa = phi float [ %7671, %._crit_edge ], [ %9556, %__nv_exp2f.exit1219 ] + %.pn288.lcssa = phi float [ %7672, %._crit_edge ], [ %9557, %__nv_exp2f.exit1219 ] + %.pn286.lcssa = phi float [ %7673, %._crit_edge ], [ %9558, %__nv_exp2f.exit1219 ] + %.pn284.lcssa = phi float [ %7674, %._crit_edge ], [ %9559, %__nv_exp2f.exit1219 ] + %.pn282.lcssa = phi float [ %7675, %._crit_edge ], [ %9560, %__nv_exp2f.exit1219 ] + %.pn280.lcssa = phi float [ %7676, %._crit_edge ], [ %9561, %__nv_exp2f.exit1219 ] + %.pn278.lcssa = phi float [ %7677, %._crit_edge ], [ %9562, %__nv_exp2f.exit1219 ] + %.pn276.lcssa = phi float [ %7678, %._crit_edge ], [ %9563, %__nv_exp2f.exit1219 ] + %.pn274.lcssa = phi float [ %7679, %._crit_edge ], [ %9564, %__nv_exp2f.exit1219 ] + %.pn272.lcssa = phi float [ %7680, %._crit_edge ], [ %9565, %__nv_exp2f.exit1219 ] + %.pn270.lcssa = phi float [ %7681, %._crit_edge ], [ %9566, %__nv_exp2f.exit1219 ] + %.pn268.lcssa = phi float [ %7682, %._crit_edge ], [ %9567, %__nv_exp2f.exit1219 ] + %.pn266.lcssa = phi float [ %7683, %._crit_edge ], [ %9568, %__nv_exp2f.exit1219 ] + %.pn264.lcssa = phi float [ %7684, %._crit_edge ], [ %9569, %__nv_exp2f.exit1219 ] + %.pn262.lcssa = phi float [ %7685, %._crit_edge ], [ %9570, %__nv_exp2f.exit1219 ] + %.pn260.lcssa = phi float [ %7686, %._crit_edge ], [ %9571, %__nv_exp2f.exit1219 ] + %.pn258.lcssa = phi float [ %7687, %._crit_edge ], [ %9572, %__nv_exp2f.exit1219 ] + %.pn256.lcssa = phi float [ %7688, %._crit_edge ], [ %9573, %__nv_exp2f.exit1219 ] + %.pn254.lcssa = phi float [ %7689, %._crit_edge ], [ %9574, %__nv_exp2f.exit1219 ] + %.pn252.lcssa = phi float [ %7690, %._crit_edge ], [ %9575, %__nv_exp2f.exit1219 ] + %.pn250.lcssa = phi float [ %7691, %._crit_edge ], [ %9576, %__nv_exp2f.exit1219 ] + %.pn504.lcssa = phi float [ %7564, %._crit_edge ], [ %8753, %__nv_exp2f.exit1219 ] + %.pn502.lcssa = phi float [ %7565, %._crit_edge ], [ %8754, %__nv_exp2f.exit1219 ] + %.pn500.lcssa = phi float [ %7566, %._crit_edge ], [ %8755, %__nv_exp2f.exit1219 ] + %.pn498.lcssa = phi float [ %7567, %._crit_edge ], [ %8756, %__nv_exp2f.exit1219 ] + %.pn496.lcssa = phi float [ %7568, %._crit_edge ], [ %8757, %__nv_exp2f.exit1219 ] + %.pn494.lcssa = phi float [ %7569, %._crit_edge ], [ %8758, %__nv_exp2f.exit1219 ] + %.pn492.lcssa = phi float [ %7570, %._crit_edge ], [ %8759, %__nv_exp2f.exit1219 ] + %.pn490.lcssa = phi float [ %7571, %._crit_edge ], [ %8760, %__nv_exp2f.exit1219 ] + %.pn488.lcssa = phi float [ %7572, %._crit_edge ], [ %8761, %__nv_exp2f.exit1219 ] + %.pn486.lcssa = phi float [ %7573, %._crit_edge ], [ %8762, %__nv_exp2f.exit1219 ] + %.pn484.lcssa = phi float [ %7574, %._crit_edge ], [ %8763, %__nv_exp2f.exit1219 ] + %.pn482.lcssa = phi float [ %7575, %._crit_edge ], [ %8764, %__nv_exp2f.exit1219 ] + %.pn480.lcssa = phi float [ %7576, %._crit_edge ], [ %8765, %__nv_exp2f.exit1219 ] + %.pn478.lcssa = phi float [ %7577, %._crit_edge ], [ %8766, %__nv_exp2f.exit1219 ] + %.pn476.lcssa = phi float [ %7578, %._crit_edge ], [ %8767, %__nv_exp2f.exit1219 ] + %.pn474.lcssa = phi float [ %7579, %._crit_edge ], [ %8768, %__nv_exp2f.exit1219 ] + %.pn472.lcssa = phi float [ %7580, %._crit_edge ], [ %8769, %__nv_exp2f.exit1219 ] + %.pn470.lcssa = phi float [ %7581, %._crit_edge ], [ %8770, %__nv_exp2f.exit1219 ] + %.pn468.lcssa = phi float [ %7582, %._crit_edge ], [ %8771, %__nv_exp2f.exit1219 ] + %.pn466.lcssa = phi float [ %7583, %._crit_edge ], [ %8772, %__nv_exp2f.exit1219 ] + %.pn464.lcssa = phi float [ %7584, %._crit_edge ], [ %8773, %__nv_exp2f.exit1219 ] + %.pn462.lcssa = phi float [ %7585, %._crit_edge ], [ %8774, %__nv_exp2f.exit1219 ] + %.pn460.lcssa = phi float [ %7586, %._crit_edge ], [ %8775, %__nv_exp2f.exit1219 ] + %.pn458.lcssa = phi float [ %7587, %._crit_edge ], [ %8776, %__nv_exp2f.exit1219 ] + %.pn456.lcssa = phi float [ %7588, %._crit_edge ], [ %8777, %__nv_exp2f.exit1219 ] + %.pn454.lcssa = phi float [ %7589, %._crit_edge ], [ %8778, %__nv_exp2f.exit1219 ] + %.pn452.lcssa = phi float [ %7590, %._crit_edge ], [ %8779, %__nv_exp2f.exit1219 ] + %.pn450.lcssa = phi float [ %7591, %._crit_edge ], [ %8780, %__nv_exp2f.exit1219 ] + %.pn448.lcssa = phi float [ %7592, %._crit_edge ], [ %8781, %__nv_exp2f.exit1219 ] + %.pn446.lcssa = phi float [ %7593, %._crit_edge ], [ %8782, %__nv_exp2f.exit1219 ] + %.pn444.lcssa = phi float [ %7594, %._crit_edge ], [ %8783, %__nv_exp2f.exit1219 ] + %.pn442.lcssa = phi float [ %7595, %._crit_edge ], [ %8784, %__nv_exp2f.exit1219 ] + %.pn440.lcssa = phi float [ %7596, %._crit_edge ], [ %8785, %__nv_exp2f.exit1219 ] + %.pn438.lcssa = phi float [ %7597, %._crit_edge ], [ %8786, %__nv_exp2f.exit1219 ] + %.pn436.lcssa = phi float [ %7598, %._crit_edge ], [ %8787, %__nv_exp2f.exit1219 ] + %.pn434.lcssa = phi float [ %7599, %._crit_edge ], [ %8788, %__nv_exp2f.exit1219 ] + %.pn432.lcssa = phi float [ %7600, %._crit_edge ], [ %8789, %__nv_exp2f.exit1219 ] + %.pn430.lcssa = phi float [ %7601, %._crit_edge ], [ %8790, %__nv_exp2f.exit1219 ] + %.pn428.lcssa = phi float [ %7602, %._crit_edge ], [ %8791, %__nv_exp2f.exit1219 ] + %.pn426.lcssa = phi float [ %7603, %._crit_edge ], [ %8792, %__nv_exp2f.exit1219 ] + %.pn424.lcssa = phi float [ %7604, %._crit_edge ], [ %8793, %__nv_exp2f.exit1219 ] + %.pn422.lcssa = phi float [ %7605, %._crit_edge ], [ %8794, %__nv_exp2f.exit1219 ] + %.pn420.lcssa = phi float [ %7606, %._crit_edge ], [ %8795, %__nv_exp2f.exit1219 ] + %.pn418.lcssa = phi float [ %7607, %._crit_edge ], [ %8796, %__nv_exp2f.exit1219 ] + %.pn416.lcssa = phi float [ %7608, %._crit_edge ], [ %8797, %__nv_exp2f.exit1219 ] + %.pn414.lcssa = phi float [ %7609, %._crit_edge ], [ %8798, %__nv_exp2f.exit1219 ] + %.pn412.lcssa = phi float [ %7610, %._crit_edge ], [ %8799, %__nv_exp2f.exit1219 ] + %.pn410.lcssa = phi float [ %7611, %._crit_edge ], [ %8800, %__nv_exp2f.exit1219 ] + %.pn408.lcssa = phi float [ %7612, %._crit_edge ], [ %8801, %__nv_exp2f.exit1219 ] + %.pn406.lcssa = phi float [ %7613, %._crit_edge ], [ %8802, %__nv_exp2f.exit1219 ] + %.pn404.lcssa = phi float [ %7614, %._crit_edge ], [ %8803, %__nv_exp2f.exit1219 ] + %.pn402.lcssa = phi float [ %7615, %._crit_edge ], [ %8804, %__nv_exp2f.exit1219 ] + %.pn400.lcssa = phi float [ %7616, %._crit_edge ], [ %8805, %__nv_exp2f.exit1219 ] + %.pn398.lcssa = phi float [ %7617, %._crit_edge ], [ %8806, %__nv_exp2f.exit1219 ] + %.pn396.lcssa = phi float [ %7618, %._crit_edge ], [ %8807, %__nv_exp2f.exit1219 ] + %.pn394.lcssa = phi float [ %7619, %._crit_edge ], [ %8808, %__nv_exp2f.exit1219 ] + %.pn392.lcssa = phi float [ %7620, %._crit_edge ], [ %8809, %__nv_exp2f.exit1219 ] + %.pn390.lcssa = phi float [ %7621, %._crit_edge ], [ %8810, %__nv_exp2f.exit1219 ] + %.pn388.lcssa = phi float [ %7622, %._crit_edge ], [ %8811, %__nv_exp2f.exit1219 ] + %.pn386.lcssa = phi float [ %7623, %._crit_edge ], [ %8812, %__nv_exp2f.exit1219 ] + %.pn384.lcssa = phi float [ %7624, %._crit_edge ], [ %8813, %__nv_exp2f.exit1219 ] + %.pn382.lcssa = phi float [ %7625, %._crit_edge ], [ %8814, %__nv_exp2f.exit1219 ] + %.pn380.lcssa = phi float [ %7626, %._crit_edge ], [ %8815, %__nv_exp2f.exit1219 ] + %.pn378.lcssa = phi float [ %7627, %._crit_edge ], [ %8816, %__nv_exp2f.exit1219 ] + %9679 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %.pn504.lcssa, float %.pn502.lcssa, float %.pn500.lcssa, float %.pn498.lcssa, float %.pn496.lcssa, float %.pn494.lcssa, float %.pn492.lcssa, float %.pn490.lcssa, float %.pn488.lcssa, float %.pn486.lcssa, float %.pn484.lcssa, float %.pn482.lcssa, float %.pn480.lcssa, float %.pn478.lcssa, float %.pn476.lcssa, float %.pn474.lcssa, float %.pn472.lcssa, float %.pn470.lcssa, float %.pn468.lcssa, float %.pn466.lcssa, float %.pn464.lcssa, float %.pn462.lcssa, float %.pn460.lcssa, float %.pn458.lcssa, float %.pn456.lcssa, float %.pn454.lcssa, float %.pn452.lcssa, float %.pn450.lcssa, float %.pn448.lcssa, float %.pn446.lcssa, float %.pn444.lcssa, float %.pn442.lcssa, float %.pn440.lcssa, float %.pn438.lcssa, float %.pn436.lcssa, float %.pn434.lcssa, float %.pn432.lcssa, float %.pn430.lcssa, float %.pn428.lcssa, float %.pn426.lcssa, float %.pn424.lcssa, float %.pn422.lcssa, float %.pn420.lcssa, float %.pn418.lcssa, float %.pn416.lcssa, float %.pn414.lcssa, float %.pn412.lcssa, float %.pn410.lcssa, float %.pn408.lcssa, float %.pn406.lcssa, float %.pn404.lcssa, float %.pn402.lcssa, float %.pn400.lcssa, float %.pn398.lcssa, float %.pn396.lcssa, float %.pn394.lcssa, float %.pn392.lcssa, float %.pn390.lcssa, float %.pn388.lcssa, float %.pn386.lcssa, float %.pn384.lcssa, float %.pn382.lcssa, float %.pn380.lcssa, float %.pn378.lcssa, float %.pn376.lcssa, float %.pn374.lcssa, float %.pn372.lcssa, float %.pn370.lcssa, float %.pn368.lcssa, float %.pn366.lcssa, float %.pn364.lcssa, float %.pn362.lcssa, float %.pn360.lcssa, float %.pn358.lcssa, float %.pn356.lcssa, float %.pn354.lcssa, float %.pn352.lcssa, float %.pn350.lcssa, float %.pn348.lcssa, float %.pn346.lcssa, float %.pn344.lcssa, float %.pn342.lcssa, float %.pn340.lcssa, float %.pn338.lcssa, float %.pn336.lcssa, float %.pn334.lcssa, float %.pn332.lcssa, float %.pn330.lcssa, float %.pn328.lcssa, float %.pn326.lcssa, float %.pn324.lcssa, float %.pn322.lcssa, float %.pn320.lcssa, float %.pn318.lcssa, float %.pn316.lcssa, float %.pn314.lcssa, float %.pn312.lcssa, float %.pn310.lcssa, float %.pn308.lcssa, float %.pn306.lcssa, float %.pn304.lcssa, float %.pn302.lcssa, float %.pn300.lcssa, float %.pn298.lcssa, float %.pn296.lcssa, float %.pn294.lcssa, float %.pn292.lcssa, float %.pn290.lcssa, float %.pn288.lcssa, float %.pn286.lcssa, float %.pn284.lcssa, float %.pn282.lcssa, float %.pn280.lcssa, float %.pn278.lcssa, float %.pn276.lcssa, float %.pn274.lcssa, float %.pn272.lcssa, float %.pn270.lcssa, float %.pn268.lcssa, float %.pn266.lcssa, float %.pn264.lcssa, float %.pn262.lcssa, float %.pn260.lcssa, float %.pn258.lcssa, float %.pn256.lcssa, float %.pn254.lcssa, float %.pn252.lcssa, float %.pn250.lcssa) #3, !dbg !297 + %9680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 0, !dbg !297 + %9681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 1, !dbg !297 + %9682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 2, !dbg !297 + %9683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 3, !dbg !297 + %9684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 4, !dbg !297 + %9685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 5, !dbg !297 + %9686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 6, !dbg !297 + %9687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 7, !dbg !297 + %9688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 8, !dbg !297 + %9689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 9, !dbg !297 + %9690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 10, !dbg !297 + %9691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 11, !dbg !297 + %9692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 12, !dbg !297 + %9693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 13, !dbg !297 + %9694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 14, !dbg !297 + %9695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 15, !dbg !297 + %9696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 16, !dbg !297 + %9697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 17, !dbg !297 + %9698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 18, !dbg !297 + %9699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 19, !dbg !297 + %9700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 20, !dbg !297 + %9701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 21, !dbg !297 + %9702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 22, !dbg !297 + %9703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 23, !dbg !297 + %9704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 24, !dbg !297 + %9705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 25, !dbg !297 + %9706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 26, !dbg !297 + %9707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 27, !dbg !297 + %9708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 28, !dbg !297 + %9709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 29, !dbg !297 + %9710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 30, !dbg !297 + %9711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 31, !dbg !297 + %9712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 32, !dbg !297 + %9713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 33, !dbg !297 + %9714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 34, !dbg !297 + %9715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 35, !dbg !297 + %9716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 36, !dbg !297 + %9717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 37, !dbg !297 + %9718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 38, !dbg !297 + %9719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 39, !dbg !297 + %9720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 40, !dbg !297 + %9721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 41, !dbg !297 + %9722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 42, !dbg !297 + %9723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 43, !dbg !297 + %9724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 44, !dbg !297 + %9725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 45, !dbg !297 + %9726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 46, !dbg !297 + %9727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 47, !dbg !297 + %9728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 48, !dbg !297 + %9729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 49, !dbg !297 + %9730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 50, !dbg !297 + %9731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 51, !dbg !297 + %9732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 52, !dbg !297 + %9733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 53, !dbg !297 + %9734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 54, !dbg !297 + %9735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 55, !dbg !297 + %9736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 56, !dbg !297 + %9737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 57, !dbg !297 + %9738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 58, !dbg !297 + %9739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 59, !dbg !297 + %9740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 60, !dbg !297 + %9741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 61, !dbg !297 + %9742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 62, !dbg !297 + %9743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 63, !dbg !297 + %9744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 64, !dbg !297 + %9745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 65, !dbg !297 + %9746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 66, !dbg !297 + %9747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 67, !dbg !297 + %9748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 68, !dbg !297 + %9749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 69, !dbg !297 + %9750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 70, !dbg !297 + %9751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 71, !dbg !297 + %9752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 72, !dbg !297 + %9753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 73, !dbg !297 + %9754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 74, !dbg !297 + %9755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 75, !dbg !297 + %9756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 76, !dbg !297 + %9757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 77, !dbg !297 + %9758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 78, !dbg !297 + %9759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 79, !dbg !297 + %9760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 80, !dbg !297 + %9761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 81, !dbg !297 + %9762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 82, !dbg !297 + %9763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 83, !dbg !297 + %9764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 84, !dbg !297 + %9765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 85, !dbg !297 + %9766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 86, !dbg !297 + %9767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 87, !dbg !297 + %9768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 88, !dbg !297 + %9769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 89, !dbg !297 + %9770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 90, !dbg !297 + %9771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 91, !dbg !297 + %9772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 92, !dbg !297 + %9773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 93, !dbg !297 + %9774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 94, !dbg !297 + %9775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 95, !dbg !297 + %9776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 96, !dbg !297 + %9777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 97, !dbg !297 + %9778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 98, !dbg !297 + %9779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 99, !dbg !297 + %9780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 100, !dbg !297 + %9781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 101, !dbg !297 + %9782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 102, !dbg !297 + %9783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 103, !dbg !297 + %9784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 104, !dbg !297 + %9785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 105, !dbg !297 + %9786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 106, !dbg !297 + %9787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 107, !dbg !297 + %9788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 108, !dbg !297 + %9789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 109, !dbg !297 + %9790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 110, !dbg !297 + %9791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 111, !dbg !297 + %9792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 112, !dbg !297 + %9793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 113, !dbg !297 + %9794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 114, !dbg !297 + %9795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 115, !dbg !297 + %9796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 116, !dbg !297 + %9797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 117, !dbg !297 + %9798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 118, !dbg !297 + %9799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 119, !dbg !297 + %9800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 120, !dbg !297 + %9801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 121, !dbg !297 + %9802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 122, !dbg !297 + %9803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 123, !dbg !297 + %9804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 124, !dbg !297 + %9805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 125, !dbg !297 + %9806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 126, !dbg !297 + %9807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 127, !dbg !297 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !297 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !297 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !227 + %exitcond2123.not = icmp eq i64 %indvars.iv.next, 4, !dbg !227 + br i1 %exitcond2123.not, label %9808, label %4945, !dbg !227 + +9808: ; preds = %._crit_edge1692 + %9809 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4480, !dbg !329 + %9810 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4482, !dbg !329 + %9811 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4484, !dbg !329 + %9812 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4486, !dbg !329 + %9813 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4488, !dbg !329 + %9814 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4490, !dbg !329 + %9815 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4492, !dbg !329 + %9816 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4494, !dbg !329 + %9817 = getelementptr bfloat, ptr addrspace(1) %9809, i64 %4498, !dbg !330 + %9818 = getelementptr bfloat, ptr addrspace(1) %9810, i64 %4498, !dbg !330 + %9819 = getelementptr bfloat, ptr addrspace(1) %9811, i64 %4498, !dbg !330 + %9820 = getelementptr bfloat, ptr addrspace(1) %9812, i64 %4498, !dbg !330 + %9821 = getelementptr bfloat, ptr addrspace(1) %9813, i64 %4498, !dbg !330 + %9822 = getelementptr bfloat, ptr addrspace(1) %9814, i64 %4498, !dbg !330 + %9823 = getelementptr bfloat, ptr addrspace(1) %9815, i64 %4498, !dbg !330 + %9824 = getelementptr bfloat, ptr addrspace(1) %9816, i64 %4498, !dbg !330 + %9825 = insertelement <2 x float> poison, float %9680, i64 0, !dbg !331 + %9826 = insertelement <2 x float> %9825, float %9681, i64 1, !dbg !331 + %9827 = fptrunc <2 x float> %9826 to <2 x bfloat>, !dbg !331 + %9828 = insertelement <2 x float> poison, float %9682, i64 0, !dbg !331 + %9829 = insertelement <2 x float> %9828, float %9683, i64 1, !dbg !331 + %9830 = fptrunc <2 x float> %9829 to <2 x bfloat>, !dbg !331 + %9831 = insertelement <2 x float> poison, float %9684, i64 0, !dbg !331 + %9832 = insertelement <2 x float> %9831, float %9685, i64 1, !dbg !331 + %9833 = fptrunc <2 x float> %9832 to <2 x bfloat>, !dbg !331 + %9834 = insertelement <2 x float> poison, float %9686, i64 0, !dbg !331 + %9835 = insertelement <2 x float> %9834, float %9687, i64 1, !dbg !331 + %9836 = fptrunc <2 x float> %9835 to <2 x bfloat>, !dbg !331 + %9837 = insertelement <2 x float> poison, float %9688, i64 0, !dbg !331 + %9838 = insertelement <2 x float> %9837, float %9689, i64 1, !dbg !331 + %9839 = fptrunc <2 x float> %9838 to <2 x bfloat>, !dbg !331 + %9840 = insertelement <2 x float> poison, float %9690, i64 0, !dbg !331 + %9841 = insertelement <2 x float> %9840, float %9691, i64 1, !dbg !331 + %9842 = fptrunc <2 x float> %9841 to <2 x bfloat>, !dbg !331 + %9843 = insertelement <2 x float> poison, float %9692, i64 0, !dbg !331 + %9844 = insertelement <2 x float> %9843, float %9693, i64 1, !dbg !331 + %9845 = fptrunc <2 x float> %9844 to <2 x bfloat>, !dbg !331 + %9846 = insertelement <2 x float> poison, float %9694, i64 0, !dbg !331 + %9847 = insertelement <2 x float> %9846, float %9695, i64 1, !dbg !331 + %9848 = fptrunc <2 x float> %9847 to <2 x bfloat>, !dbg !331 + %9849 = insertelement <2 x float> poison, float %9696, i64 0, !dbg !331 + %9850 = insertelement <2 x float> %9849, float %9697, i64 1, !dbg !331 + %9851 = fptrunc <2 x float> %9850 to <2 x bfloat>, !dbg !331 + %9852 = insertelement <2 x float> poison, float %9698, i64 0, !dbg !331 + %9853 = insertelement <2 x float> %9852, float %9699, i64 1, !dbg !331 + %9854 = fptrunc <2 x float> %9853 to <2 x bfloat>, !dbg !331 + %9855 = insertelement <2 x float> poison, float %9700, i64 0, !dbg !331 + %9856 = insertelement <2 x float> %9855, float %9701, i64 1, !dbg !331 + %9857 = fptrunc <2 x float> %9856 to <2 x bfloat>, !dbg !331 + %9858 = insertelement <2 x float> poison, float %9702, i64 0, !dbg !331 + %9859 = insertelement <2 x float> %9858, float %9703, i64 1, !dbg !331 + %9860 = fptrunc <2 x float> %9859 to <2 x bfloat>, !dbg !331 + %9861 = insertelement <2 x float> poison, float %9704, i64 0, !dbg !331 + %9862 = insertelement <2 x float> %9861, float %9705, i64 1, !dbg !331 + %9863 = fptrunc <2 x float> %9862 to <2 x bfloat>, !dbg !331 + %9864 = insertelement <2 x float> poison, float %9706, i64 0, !dbg !331 + %9865 = insertelement <2 x float> %9864, float %9707, i64 1, !dbg !331 + %9866 = fptrunc <2 x float> %9865 to <2 x bfloat>, !dbg !331 + %9867 = insertelement <2 x float> poison, float %9708, i64 0, !dbg !331 + %9868 = insertelement <2 x float> %9867, float %9709, i64 1, !dbg !331 + %9869 = fptrunc <2 x float> %9868 to <2 x bfloat>, !dbg !331 + %9870 = insertelement <2 x float> poison, float %9710, i64 0, !dbg !331 + %9871 = insertelement <2 x float> %9870, float %9711, i64 1, !dbg !331 + %9872 = fptrunc <2 x float> %9871 to <2 x bfloat>, !dbg !331 + %9873 = insertelement <2 x float> poison, float %9712, i64 0, !dbg !331 + %9874 = insertelement <2 x float> %9873, float %9713, i64 1, !dbg !331 + %9875 = fptrunc <2 x float> %9874 to <2 x bfloat>, !dbg !331 + %9876 = insertelement <2 x float> poison, float %9714, i64 0, !dbg !331 + %9877 = insertelement <2 x float> %9876, float %9715, i64 1, !dbg !331 + %9878 = fptrunc <2 x float> %9877 to <2 x bfloat>, !dbg !331 + %9879 = insertelement <2 x float> poison, float %9716, i64 0, !dbg !331 + %9880 = insertelement <2 x float> %9879, float %9717, i64 1, !dbg !331 + %9881 = fptrunc <2 x float> %9880 to <2 x bfloat>, !dbg !331 + %9882 = insertelement <2 x float> poison, float %9718, i64 0, !dbg !331 + %9883 = insertelement <2 x float> %9882, float %9719, i64 1, !dbg !331 + %9884 = fptrunc <2 x float> %9883 to <2 x bfloat>, !dbg !331 + %9885 = insertelement <2 x float> poison, float %9720, i64 0, !dbg !331 + %9886 = insertelement <2 x float> %9885, float %9721, i64 1, !dbg !331 + %9887 = fptrunc <2 x float> %9886 to <2 x bfloat>, !dbg !331 + %9888 = insertelement <2 x float> poison, float %9722, i64 0, !dbg !331 + %9889 = insertelement <2 x float> %9888, float %9723, i64 1, !dbg !331 + %9890 = fptrunc <2 x float> %9889 to <2 x bfloat>, !dbg !331 + %9891 = insertelement <2 x float> poison, float %9724, i64 0, !dbg !331 + %9892 = insertelement <2 x float> %9891, float %9725, i64 1, !dbg !331 + %9893 = fptrunc <2 x float> %9892 to <2 x bfloat>, !dbg !331 + %9894 = insertelement <2 x float> poison, float %9726, i64 0, !dbg !331 + %9895 = insertelement <2 x float> %9894, float %9727, i64 1, !dbg !331 + %9896 = fptrunc <2 x float> %9895 to <2 x bfloat>, !dbg !331 + %9897 = insertelement <2 x float> poison, float %9728, i64 0, !dbg !331 + %9898 = insertelement <2 x float> %9897, float %9729, i64 1, !dbg !331 + %9899 = fptrunc <2 x float> %9898 to <2 x bfloat>, !dbg !331 + %9900 = insertelement <2 x float> poison, float %9730, i64 0, !dbg !331 + %9901 = insertelement <2 x float> %9900, float %9731, i64 1, !dbg !331 + %9902 = fptrunc <2 x float> %9901 to <2 x bfloat>, !dbg !331 + %9903 = insertelement <2 x float> poison, float %9732, i64 0, !dbg !331 + %9904 = insertelement <2 x float> %9903, float %9733, i64 1, !dbg !331 + %9905 = fptrunc <2 x float> %9904 to <2 x bfloat>, !dbg !331 + %9906 = insertelement <2 x float> poison, float %9734, i64 0, !dbg !331 + %9907 = insertelement <2 x float> %9906, float %9735, i64 1, !dbg !331 + %9908 = fptrunc <2 x float> %9907 to <2 x bfloat>, !dbg !331 + %9909 = insertelement <2 x float> poison, float %9736, i64 0, !dbg !331 + %9910 = insertelement <2 x float> %9909, float %9737, i64 1, !dbg !331 + %9911 = fptrunc <2 x float> %9910 to <2 x bfloat>, !dbg !331 + %9912 = insertelement <2 x float> poison, float %9738, i64 0, !dbg !331 + %9913 = insertelement <2 x float> %9912, float %9739, i64 1, !dbg !331 + %9914 = fptrunc <2 x float> %9913 to <2 x bfloat>, !dbg !331 + %9915 = insertelement <2 x float> poison, float %9740, i64 0, !dbg !331 + %9916 = insertelement <2 x float> %9915, float %9741, i64 1, !dbg !331 + %9917 = fptrunc <2 x float> %9916 to <2 x bfloat>, !dbg !331 + %9918 = insertelement <2 x float> poison, float %9742, i64 0, !dbg !331 + %9919 = insertelement <2 x float> %9918, float %9743, i64 1, !dbg !331 + %9920 = fptrunc <2 x float> %9919 to <2 x bfloat>, !dbg !331 + %9921 = shl nuw nsw i32 %4713, 13, !dbg !331 + %9922 = shl nuw nsw i32 %35, 5, !dbg !331 + %9923 = and i32 %9922, 7264, !dbg !331 + %9924 = and i32 %35, 24, !dbg !331 + %9925 = shl nuw nsw i32 %9924, 4, !dbg !331 + %9926 = shl nuw nsw i32 %35, 2, !dbg !331 + %9927 = and i32 %9926, 16, !dbg !331 + %9928 = or disjoint i32 %9921, %9927, !dbg !331 + %9929 = or disjoint i32 %9923, %9925, !dbg !331 + %9930 = or disjoint i32 %9928, %9929, !dbg !331 + %9931 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9930, !dbg !331 + %9932 = bitcast <2 x bfloat> %9827 to i32, !dbg !331 + %9933 = bitcast <2 x bfloat> %9833 to i32, !dbg !331 + %9934 = bitcast <2 x bfloat> %9839 to i32, !dbg !331 + %9935 = bitcast <2 x bfloat> %9845 to i32, !dbg !331 + %9936 = insertelement <4 x i32> poison, i32 %9932, i64 0, !dbg !331 + %9937 = insertelement <4 x i32> %9936, i32 %9933, i64 1, !dbg !331 + %9938 = insertelement <4 x i32> %9937, i32 %9934, i64 2, !dbg !331 + %9939 = insertelement <4 x i32> %9938, i32 %9935, i64 3, !dbg !331 + store <4 x i32> %9939, ptr addrspace(3) %9931, align 16, !dbg !331 + %9940 = getelementptr inbounds nuw i8, ptr addrspace(3) %9931, i32 512, !dbg !331 + %9941 = bitcast <2 x bfloat> %9830 to i32, !dbg !331 + %9942 = bitcast <2 x bfloat> %9836 to i32, !dbg !331 + %9943 = bitcast <2 x bfloat> %9842 to i32, !dbg !331 + %9944 = bitcast <2 x bfloat> %9848 to i32, !dbg !331 + %9945 = insertelement <4 x i32> poison, i32 %9941, i64 0, !dbg !331 + %9946 = insertelement <4 x i32> %9945, i32 %9942, i64 1, !dbg !331 + %9947 = insertelement <4 x i32> %9946, i32 %9943, i64 2, !dbg !331 + %9948 = insertelement <4 x i32> %9947, i32 %9944, i64 3, !dbg !331 + store <4 x i32> %9948, ptr addrspace(3) %9940, align 16, !dbg !331 + %9949 = xor i32 %9930, 32, !dbg !331 + %9950 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9949, !dbg !331 + %9951 = bitcast <2 x bfloat> %9851 to i32, !dbg !331 + %9952 = bitcast <2 x bfloat> %9857 to i32, !dbg !331 + %9953 = bitcast <2 x bfloat> %9863 to i32, !dbg !331 + %9954 = bitcast <2 x bfloat> %9869 to i32, !dbg !331 + %9955 = insertelement <4 x i32> poison, i32 %9951, i64 0, !dbg !331 + %9956 = insertelement <4 x i32> %9955, i32 %9952, i64 1, !dbg !331 + %9957 = insertelement <4 x i32> %9956, i32 %9953, i64 2, !dbg !331 + %9958 = insertelement <4 x i32> %9957, i32 %9954, i64 3, !dbg !331 + store <4 x i32> %9958, ptr addrspace(3) %9950, align 16, !dbg !331 + %9959 = getelementptr inbounds nuw i8, ptr addrspace(3) %9950, i32 512, !dbg !331 + %9960 = bitcast <2 x bfloat> %9854 to i32, !dbg !331 + %9961 = bitcast <2 x bfloat> %9860 to i32, !dbg !331 + %9962 = bitcast <2 x bfloat> %9866 to i32, !dbg !331 + %9963 = bitcast <2 x bfloat> %9872 to i32, !dbg !331 + %9964 = insertelement <4 x i32> poison, i32 %9960, i64 0, !dbg !331 + %9965 = insertelement <4 x i32> %9964, i32 %9961, i64 1, !dbg !331 + %9966 = insertelement <4 x i32> %9965, i32 %9962, i64 2, !dbg !331 + %9967 = insertelement <4 x i32> %9966, i32 %9963, i64 3, !dbg !331 + store <4 x i32> %9967, ptr addrspace(3) %9959, align 16, !dbg !331 + %9968 = xor i32 %9930, 64, !dbg !331 + %9969 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9968, !dbg !331 + %9970 = bitcast <2 x bfloat> %9875 to i32, !dbg !331 + %9971 = bitcast <2 x bfloat> %9881 to i32, !dbg !331 + %9972 = bitcast <2 x bfloat> %9887 to i32, !dbg !331 + %9973 = bitcast <2 x bfloat> %9893 to i32, !dbg !331 + %9974 = insertelement <4 x i32> poison, i32 %9970, i64 0, !dbg !331 + %9975 = insertelement <4 x i32> %9974, i32 %9971, i64 1, !dbg !331 + %9976 = insertelement <4 x i32> %9975, i32 %9972, i64 2, !dbg !331 + %9977 = insertelement <4 x i32> %9976, i32 %9973, i64 3, !dbg !331 + store <4 x i32> %9977, ptr addrspace(3) %9969, align 16, !dbg !331 + %9978 = getelementptr inbounds nuw i8, ptr addrspace(3) %9969, i32 512, !dbg !331 + %9979 = bitcast <2 x bfloat> %9878 to i32, !dbg !331 + %9980 = bitcast <2 x bfloat> %9884 to i32, !dbg !331 + %9981 = bitcast <2 x bfloat> %9890 to i32, !dbg !331 + %9982 = bitcast <2 x bfloat> %9896 to i32, !dbg !331 + %9983 = insertelement <4 x i32> poison, i32 %9979, i64 0, !dbg !331 + %9984 = insertelement <4 x i32> %9983, i32 %9980, i64 1, !dbg !331 + %9985 = insertelement <4 x i32> %9984, i32 %9981, i64 2, !dbg !331 + %9986 = insertelement <4 x i32> %9985, i32 %9982, i64 3, !dbg !331 + store <4 x i32> %9986, ptr addrspace(3) %9978, align 16, !dbg !331 + %9987 = xor i32 %9930, 96, !dbg !331 + %9988 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9987, !dbg !331 + %9989 = bitcast <2 x bfloat> %9899 to i32, !dbg !331 + %9990 = bitcast <2 x bfloat> %9905 to i32, !dbg !331 + %9991 = bitcast <2 x bfloat> %9911 to i32, !dbg !331 + %9992 = bitcast <2 x bfloat> %9917 to i32, !dbg !331 + %9993 = insertelement <4 x i32> poison, i32 %9989, i64 0, !dbg !331 + %9994 = insertelement <4 x i32> %9993, i32 %9990, i64 1, !dbg !331 + %9995 = insertelement <4 x i32> %9994, i32 %9991, i64 2, !dbg !331 + %9996 = insertelement <4 x i32> %9995, i32 %9992, i64 3, !dbg !331 + store <4 x i32> %9996, ptr addrspace(3) %9988, align 16, !dbg !331 + %9997 = getelementptr inbounds nuw i8, ptr addrspace(3) %9988, i32 512, !dbg !331 + %9998 = bitcast <2 x bfloat> %9902 to i32, !dbg !331 + %9999 = bitcast <2 x bfloat> %9908 to i32, !dbg !331 + %10000 = bitcast <2 x bfloat> %9914 to i32, !dbg !331 + %10001 = bitcast <2 x bfloat> %9920 to i32, !dbg !331 + %10002 = insertelement <4 x i32> poison, i32 %9998, i64 0, !dbg !331 + %10003 = insertelement <4 x i32> %10002, i32 %9999, i64 1, !dbg !331 + %10004 = insertelement <4 x i32> %10003, i32 %10000, i64 2, !dbg !331 + %10005 = insertelement <4 x i32> %10004, i32 %10001, i64 3, !dbg !331 + store <4 x i32> %10005, ptr addrspace(3) %9997, align 16, !dbg !331 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !331 + %10006 = shl nuw nsw i32 %9924, 10, !dbg !331 + %10007 = shl nuw nsw i32 %4713, 5, !dbg !331 + %10008 = and i32 %9926, 1008, !dbg !331 + %10009 = or disjoint i32 %10006, %10007, !dbg !331 + %10010 = xor i32 %10009, %10008, !dbg !331 + %10011 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10010, !dbg !331 + %10012 = ptrtoint ptr addrspace(3) %10011 to i32, !dbg !331 + %10013 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10012) #3, !dbg !331 + %10014 = extractvalue { i32, i32, i32, i32 } %10013, 0, !dbg !331 + %10015 = extractvalue { i32, i32, i32, i32 } %10013, 1, !dbg !331 + %10016 = extractvalue { i32, i32, i32, i32 } %10013, 2, !dbg !331 + %10017 = extractvalue { i32, i32, i32, i32 } %10013, 3, !dbg !331 + %10018 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 1024, !dbg !331 + %10019 = ptrtoint ptr addrspace(3) %10018 to i32, !dbg !331 + %10020 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10019) #3, !dbg !331 + %10021 = extractvalue { i32, i32, i32, i32 } %10020, 0, !dbg !331 + %10022 = extractvalue { i32, i32, i32, i32 } %10020, 1, !dbg !331 + %10023 = extractvalue { i32, i32, i32, i32 } %10020, 2, !dbg !331 + %10024 = extractvalue { i32, i32, i32, i32 } %10020, 3, !dbg !331 + %10025 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 2048, !dbg !331 + %10026 = ptrtoint ptr addrspace(3) %10025 to i32, !dbg !331 + %10027 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10026) #3, !dbg !331 + %10028 = extractvalue { i32, i32, i32, i32 } %10027, 0, !dbg !331 + %10029 = extractvalue { i32, i32, i32, i32 } %10027, 1, !dbg !331 + %10030 = extractvalue { i32, i32, i32, i32 } %10027, 2, !dbg !331 + %10031 = extractvalue { i32, i32, i32, i32 } %10027, 3, !dbg !331 + %10032 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 3072, !dbg !331 + %10033 = ptrtoint ptr addrspace(3) %10032 to i32, !dbg !331 + %10034 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10033) #3, !dbg !331 + %10035 = extractvalue { i32, i32, i32, i32 } %10034, 0, !dbg !331 + %10036 = extractvalue { i32, i32, i32, i32 } %10034, 1, !dbg !331 + %10037 = extractvalue { i32, i32, i32, i32 } %10034, 2, !dbg !331 + %10038 = extractvalue { i32, i32, i32, i32 } %10034, 3, !dbg !331 + %10039 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 4096, !dbg !331 + %10040 = ptrtoint ptr addrspace(3) %10039 to i32, !dbg !331 + %10041 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10040) #3, !dbg !331 + %10042 = extractvalue { i32, i32, i32, i32 } %10041, 0, !dbg !331 + %10043 = extractvalue { i32, i32, i32, i32 } %10041, 1, !dbg !331 + %10044 = extractvalue { i32, i32, i32, i32 } %10041, 2, !dbg !331 + %10045 = extractvalue { i32, i32, i32, i32 } %10041, 3, !dbg !331 + %10046 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 5120, !dbg !331 + %10047 = ptrtoint ptr addrspace(3) %10046 to i32, !dbg !331 + %10048 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10047) #3, !dbg !331 + %10049 = extractvalue { i32, i32, i32, i32 } %10048, 0, !dbg !331 + %10050 = extractvalue { i32, i32, i32, i32 } %10048, 1, !dbg !331 + %10051 = extractvalue { i32, i32, i32, i32 } %10048, 2, !dbg !331 + %10052 = extractvalue { i32, i32, i32, i32 } %10048, 3, !dbg !331 + %10053 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 6144, !dbg !331 + %10054 = ptrtoint ptr addrspace(3) %10053 to i32, !dbg !331 + %10055 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10054) #3, !dbg !331 + %10056 = extractvalue { i32, i32, i32, i32 } %10055, 0, !dbg !331 + %10057 = extractvalue { i32, i32, i32, i32 } %10055, 1, !dbg !331 + %10058 = extractvalue { i32, i32, i32, i32 } %10055, 2, !dbg !331 + %10059 = extractvalue { i32, i32, i32, i32 } %10055, 3, !dbg !331 + %10060 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 7168, !dbg !331 + %10061 = ptrtoint ptr addrspace(3) %10060 to i32, !dbg !331 + %10062 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10061) #3, !dbg !331 + %10063 = extractvalue { i32, i32, i32, i32 } %10062, 0, !dbg !331 + %10064 = extractvalue { i32, i32, i32, i32 } %10062, 1, !dbg !331 + %10065 = extractvalue { i32, i32, i32, i32 } %10062, 2, !dbg !331 + %10066 = extractvalue { i32, i32, i32, i32 } %10062, 3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10014, i32 %10015, i32 %10016, i32 %10017, ptr addrspace(1) %9817) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10021, i32 %10022, i32 %10023, i32 %10024, ptr addrspace(1) %9818) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10028, i32 %10029, i32 %10030, i32 %10031, ptr addrspace(1) %9819) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10035, i32 %10036, i32 %10037, i32 %10038, ptr addrspace(1) %9820) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10042, i32 %10043, i32 %10044, i32 %10045, ptr addrspace(1) %9821) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10049, i32 %10050, i32 %10051, i32 %10052, ptr addrspace(1) %9822) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10056, i32 %10057, i32 %10058, i32 %10059, ptr addrspace(1) %9823) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10063, i32 %10064, i32 %10065, i32 %10066, ptr addrspace(1) %9824) #3, !dbg !331 + %10067 = insertelement <2 x float> poison, float %9744, i64 0, !dbg !332 + %10068 = insertelement <2 x float> %10067, float %9745, i64 1, !dbg !332 + %10069 = fmul <2 x float> %10068, splat (float 0x3FB6A09E60000000), !dbg !332 + %10070 = insertelement <2 x float> poison, float %9746, i64 0, !dbg !332 + %10071 = insertelement <2 x float> %10070, float %9747, i64 1, !dbg !332 + %10072 = fmul <2 x float> %10071, splat (float 0x3FB6A09E60000000), !dbg !332 + %10073 = insertelement <2 x float> poison, float %9748, i64 0, !dbg !332 + %10074 = insertelement <2 x float> %10073, float %9749, i64 1, !dbg !332 + %10075 = fmul <2 x float> %10074, splat (float 0x3FB6A09E60000000), !dbg !332 + %10076 = insertelement <2 x float> poison, float %9750, i64 0, !dbg !332 + %10077 = insertelement <2 x float> %10076, float %9751, i64 1, !dbg !332 + %10078 = fmul <2 x float> %10077, splat (float 0x3FB6A09E60000000), !dbg !332 + %10079 = insertelement <2 x float> poison, float %9752, i64 0, !dbg !332 + %10080 = insertelement <2 x float> %10079, float %9753, i64 1, !dbg !332 + %10081 = fmul <2 x float> %10080, splat (float 0x3FB6A09E60000000), !dbg !332 + %10082 = insertelement <2 x float> poison, float %9754, i64 0, !dbg !332 + %10083 = insertelement <2 x float> %10082, float %9755, i64 1, !dbg !332 + %10084 = fmul <2 x float> %10083, splat (float 0x3FB6A09E60000000), !dbg !332 + %10085 = insertelement <2 x float> poison, float %9756, i64 0, !dbg !332 + %10086 = insertelement <2 x float> %10085, float %9757, i64 1, !dbg !332 + %10087 = fmul <2 x float> %10086, splat (float 0x3FB6A09E60000000), !dbg !332 + %10088 = insertelement <2 x float> poison, float %9758, i64 0, !dbg !332 + %10089 = insertelement <2 x float> %10088, float %9759, i64 1, !dbg !332 + %10090 = fmul <2 x float> %10089, splat (float 0x3FB6A09E60000000), !dbg !332 + %10091 = insertelement <2 x float> poison, float %9760, i64 0, !dbg !332 + %10092 = insertelement <2 x float> %10091, float %9761, i64 1, !dbg !332 + %10093 = fmul <2 x float> %10092, splat (float 0x3FB6A09E60000000), !dbg !332 + %10094 = insertelement <2 x float> poison, float %9762, i64 0, !dbg !332 + %10095 = insertelement <2 x float> %10094, float %9763, i64 1, !dbg !332 + %10096 = fmul <2 x float> %10095, splat (float 0x3FB6A09E60000000), !dbg !332 + %10097 = insertelement <2 x float> poison, float %9764, i64 0, !dbg !332 + %10098 = insertelement <2 x float> %10097, float %9765, i64 1, !dbg !332 + %10099 = fmul <2 x float> %10098, splat (float 0x3FB6A09E60000000), !dbg !332 + %10100 = insertelement <2 x float> poison, float %9766, i64 0, !dbg !332 + %10101 = insertelement <2 x float> %10100, float %9767, i64 1, !dbg !332 + %10102 = fmul <2 x float> %10101, splat (float 0x3FB6A09E60000000), !dbg !332 + %10103 = insertelement <2 x float> poison, float %9768, i64 0, !dbg !332 + %10104 = insertelement <2 x float> %10103, float %9769, i64 1, !dbg !332 + %10105 = fmul <2 x float> %10104, splat (float 0x3FB6A09E60000000), !dbg !332 + %10106 = insertelement <2 x float> poison, float %9770, i64 0, !dbg !332 + %10107 = insertelement <2 x float> %10106, float %9771, i64 1, !dbg !332 + %10108 = fmul <2 x float> %10107, splat (float 0x3FB6A09E60000000), !dbg !332 + %10109 = insertelement <2 x float> poison, float %9772, i64 0, !dbg !332 + %10110 = insertelement <2 x float> %10109, float %9773, i64 1, !dbg !332 + %10111 = fmul <2 x float> %10110, splat (float 0x3FB6A09E60000000), !dbg !332 + %10112 = insertelement <2 x float> poison, float %9774, i64 0, !dbg !332 + %10113 = insertelement <2 x float> %10112, float %9775, i64 1, !dbg !332 + %10114 = fmul <2 x float> %10113, splat (float 0x3FB6A09E60000000), !dbg !332 + %10115 = insertelement <2 x float> poison, float %9776, i64 0, !dbg !332 + %10116 = insertelement <2 x float> %10115, float %9777, i64 1, !dbg !332 + %10117 = fmul <2 x float> %10116, splat (float 0x3FB6A09E60000000), !dbg !332 + %10118 = insertelement <2 x float> poison, float %9778, i64 0, !dbg !332 + %10119 = insertelement <2 x float> %10118, float %9779, i64 1, !dbg !332 + %10120 = fmul <2 x float> %10119, splat (float 0x3FB6A09E60000000), !dbg !332 + %10121 = insertelement <2 x float> poison, float %9780, i64 0, !dbg !332 + %10122 = insertelement <2 x float> %10121, float %9781, i64 1, !dbg !332 + %10123 = fmul <2 x float> %10122, splat (float 0x3FB6A09E60000000), !dbg !332 + %10124 = insertelement <2 x float> poison, float %9782, i64 0, !dbg !332 + %10125 = insertelement <2 x float> %10124, float %9783, i64 1, !dbg !332 + %10126 = fmul <2 x float> %10125, splat (float 0x3FB6A09E60000000), !dbg !332 + %10127 = insertelement <2 x float> poison, float %9784, i64 0, !dbg !332 + %10128 = insertelement <2 x float> %10127, float %9785, i64 1, !dbg !332 + %10129 = fmul <2 x float> %10128, splat (float 0x3FB6A09E60000000), !dbg !332 + %10130 = insertelement <2 x float> poison, float %9786, i64 0, !dbg !332 + %10131 = insertelement <2 x float> %10130, float %9787, i64 1, !dbg !332 + %10132 = fmul <2 x float> %10131, splat (float 0x3FB6A09E60000000), !dbg !332 + %10133 = insertelement <2 x float> poison, float %9788, i64 0, !dbg !332 + %10134 = insertelement <2 x float> %10133, float %9789, i64 1, !dbg !332 + %10135 = fmul <2 x float> %10134, splat (float 0x3FB6A09E60000000), !dbg !332 + %10136 = insertelement <2 x float> poison, float %9790, i64 0, !dbg !332 + %10137 = insertelement <2 x float> %10136, float %9791, i64 1, !dbg !332 + %10138 = fmul <2 x float> %10137, splat (float 0x3FB6A09E60000000), !dbg !332 + %10139 = insertelement <2 x float> poison, float %9792, i64 0, !dbg !332 + %10140 = insertelement <2 x float> %10139, float %9793, i64 1, !dbg !332 + %10141 = fmul <2 x float> %10140, splat (float 0x3FB6A09E60000000), !dbg !332 + %10142 = insertelement <2 x float> poison, float %9794, i64 0, !dbg !332 + %10143 = insertelement <2 x float> %10142, float %9795, i64 1, !dbg !332 + %10144 = fmul <2 x float> %10143, splat (float 0x3FB6A09E60000000), !dbg !332 + %10145 = insertelement <2 x float> poison, float %9796, i64 0, !dbg !332 + %10146 = insertelement <2 x float> %10145, float %9797, i64 1, !dbg !332 + %10147 = fmul <2 x float> %10146, splat (float 0x3FB6A09E60000000), !dbg !332 + %10148 = insertelement <2 x float> poison, float %9798, i64 0, !dbg !332 + %10149 = insertelement <2 x float> %10148, float %9799, i64 1, !dbg !332 + %10150 = fmul <2 x float> %10149, splat (float 0x3FB6A09E60000000), !dbg !332 + %10151 = insertelement <2 x float> poison, float %9800, i64 0, !dbg !332 + %10152 = insertelement <2 x float> %10151, float %9801, i64 1, !dbg !332 + %10153 = fmul <2 x float> %10152, splat (float 0x3FB6A09E60000000), !dbg !332 + %10154 = insertelement <2 x float> poison, float %9802, i64 0, !dbg !332 + %10155 = insertelement <2 x float> %10154, float %9803, i64 1, !dbg !332 + %10156 = fmul <2 x float> %10155, splat (float 0x3FB6A09E60000000), !dbg !332 + %10157 = insertelement <2 x float> poison, float %9804, i64 0, !dbg !332 + %10158 = insertelement <2 x float> %10157, float %9805, i64 1, !dbg !332 + %10159 = fmul <2 x float> %10158, splat (float 0x3FB6A09E60000000), !dbg !332 + %10160 = insertelement <2 x float> poison, float %9806, i64 0, !dbg !332 + %10161 = insertelement <2 x float> %10160, float %9807, i64 1, !dbg !332 + %10162 = fmul <2 x float> %10161, splat (float 0x3FB6A09E60000000), !dbg !332 + %10163 = or disjoint i32 %4472, %30, !dbg !333 + %10164 = or disjoint i32 %10163, %4497, !dbg !333 + %10165 = or disjoint i32 %4473, %30, !dbg !333 + %10166 = or disjoint i32 %10165, %4497, !dbg !333 + %10167 = or disjoint i32 %4474, %30, !dbg !333 + %10168 = or disjoint i32 %10167, %4497, !dbg !333 + %10169 = or disjoint i32 %4475, %30, !dbg !333 + %10170 = or disjoint i32 %10169, %4497, !dbg !333 + %10171 = or disjoint i32 %4476, %30, !dbg !333 + %10172 = or disjoint i32 %10171, %4497, !dbg !333 + %10173 = or disjoint i32 %4477, %30, !dbg !333 + %10174 = or disjoint i32 %10173, %4497, !dbg !333 + %10175 = or disjoint i32 %4478, %30, !dbg !333 + %10176 = or disjoint i32 %10175, %4497, !dbg !333 + %10177 = or disjoint i32 %4479, %30, !dbg !333 + %10178 = or disjoint i32 %10177, %4497, !dbg !333 + %10179 = sext i32 %10164 to i64, !dbg !334 + %10180 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10179, !dbg !334 + %10181 = sext i32 %10166 to i64, !dbg !334 + %10182 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10181, !dbg !334 + %10183 = sext i32 %10168 to i64, !dbg !334 + %10184 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10183, !dbg !334 + %10185 = sext i32 %10170 to i64, !dbg !334 + %10186 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10185, !dbg !334 + %10187 = sext i32 %10172 to i64, !dbg !334 + %10188 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10187, !dbg !334 + %10189 = sext i32 %10174 to i64, !dbg !334 + %10190 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10189, !dbg !334 + %10191 = sext i32 %10176 to i64, !dbg !334 + %10192 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10191, !dbg !334 + %10193 = sext i32 %10178 to i64, !dbg !334 + %10194 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10193, !dbg !334 + %10195 = fptrunc <2 x float> %10069 to <2 x bfloat>, !dbg !335 + %10196 = fptrunc <2 x float> %10072 to <2 x bfloat>, !dbg !335 + %10197 = fptrunc <2 x float> %10075 to <2 x bfloat>, !dbg !335 + %10198 = fptrunc <2 x float> %10078 to <2 x bfloat>, !dbg !335 + %10199 = fptrunc <2 x float> %10081 to <2 x bfloat>, !dbg !335 + %10200 = fptrunc <2 x float> %10084 to <2 x bfloat>, !dbg !335 + %10201 = fptrunc <2 x float> %10087 to <2 x bfloat>, !dbg !335 + %10202 = fptrunc <2 x float> %10090 to <2 x bfloat>, !dbg !335 + %10203 = fptrunc <2 x float> %10093 to <2 x bfloat>, !dbg !335 + %10204 = fptrunc <2 x float> %10096 to <2 x bfloat>, !dbg !335 + %10205 = fptrunc <2 x float> %10099 to <2 x bfloat>, !dbg !335 + %10206 = fptrunc <2 x float> %10102 to <2 x bfloat>, !dbg !335 + %10207 = fptrunc <2 x float> %10105 to <2 x bfloat>, !dbg !335 + %10208 = fptrunc <2 x float> %10108 to <2 x bfloat>, !dbg !335 + %10209 = fptrunc <2 x float> %10111 to <2 x bfloat>, !dbg !335 + %10210 = fptrunc <2 x float> %10114 to <2 x bfloat>, !dbg !335 + %10211 = fptrunc <2 x float> %10117 to <2 x bfloat>, !dbg !335 + %10212 = fptrunc <2 x float> %10120 to <2 x bfloat>, !dbg !335 + %10213 = fptrunc <2 x float> %10123 to <2 x bfloat>, !dbg !335 + %10214 = fptrunc <2 x float> %10126 to <2 x bfloat>, !dbg !335 + %10215 = fptrunc <2 x float> %10129 to <2 x bfloat>, !dbg !335 + %10216 = fptrunc <2 x float> %10132 to <2 x bfloat>, !dbg !335 + %10217 = fptrunc <2 x float> %10135 to <2 x bfloat>, !dbg !335 + %10218 = fptrunc <2 x float> %10138 to <2 x bfloat>, !dbg !335 + %10219 = fptrunc <2 x float> %10141 to <2 x bfloat>, !dbg !335 + %10220 = fptrunc <2 x float> %10144 to <2 x bfloat>, !dbg !335 + %10221 = fptrunc <2 x float> %10147 to <2 x bfloat>, !dbg !335 + %10222 = fptrunc <2 x float> %10150 to <2 x bfloat>, !dbg !335 + %10223 = fptrunc <2 x float> %10153 to <2 x bfloat>, !dbg !335 + %10224 = fptrunc <2 x float> %10156 to <2 x bfloat>, !dbg !335 + %10225 = fptrunc <2 x float> %10159 to <2 x bfloat>, !dbg !335 + %10226 = fptrunc <2 x float> %10162 to <2 x bfloat>, !dbg !335 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !335 + %10227 = bitcast <2 x bfloat> %10195 to i32, !dbg !335 + %10228 = bitcast <2 x bfloat> %10197 to i32, !dbg !335 + %10229 = bitcast <2 x bfloat> %10199 to i32, !dbg !335 + %10230 = bitcast <2 x bfloat> %10201 to i32, !dbg !335 + %10231 = insertelement <4 x i32> poison, i32 %10227, i64 0, !dbg !335 + %10232 = insertelement <4 x i32> %10231, i32 %10228, i64 1, !dbg !335 + %10233 = insertelement <4 x i32> %10232, i32 %10229, i64 2, !dbg !335 + %10234 = insertelement <4 x i32> %10233, i32 %10230, i64 3, !dbg !335 + store <4 x i32> %10234, ptr addrspace(3) %9931, align 16, !dbg !335 + %10235 = bitcast <2 x bfloat> %10196 to i32, !dbg !335 + %10236 = bitcast <2 x bfloat> %10198 to i32, !dbg !335 + %10237 = bitcast <2 x bfloat> %10200 to i32, !dbg !335 + %10238 = bitcast <2 x bfloat> %10202 to i32, !dbg !335 + %10239 = insertelement <4 x i32> poison, i32 %10235, i64 0, !dbg !335 + %10240 = insertelement <4 x i32> %10239, i32 %10236, i64 1, !dbg !335 + %10241 = insertelement <4 x i32> %10240, i32 %10237, i64 2, !dbg !335 + %10242 = insertelement <4 x i32> %10241, i32 %10238, i64 3, !dbg !335 + store <4 x i32> %10242, ptr addrspace(3) %9940, align 16, !dbg !335 + %10243 = bitcast <2 x bfloat> %10203 to i32, !dbg !335 + %10244 = bitcast <2 x bfloat> %10205 to i32, !dbg !335 + %10245 = bitcast <2 x bfloat> %10207 to i32, !dbg !335 + %10246 = bitcast <2 x bfloat> %10209 to i32, !dbg !335 + %10247 = insertelement <4 x i32> poison, i32 %10243, i64 0, !dbg !335 + %10248 = insertelement <4 x i32> %10247, i32 %10244, i64 1, !dbg !335 + %10249 = insertelement <4 x i32> %10248, i32 %10245, i64 2, !dbg !335 + %10250 = insertelement <4 x i32> %10249, i32 %10246, i64 3, !dbg !335 + store <4 x i32> %10250, ptr addrspace(3) %9950, align 16, !dbg !335 + %10251 = bitcast <2 x bfloat> %10204 to i32, !dbg !335 + %10252 = bitcast <2 x bfloat> %10206 to i32, !dbg !335 + %10253 = bitcast <2 x bfloat> %10208 to i32, !dbg !335 + %10254 = bitcast <2 x bfloat> %10210 to i32, !dbg !335 + %10255 = insertelement <4 x i32> poison, i32 %10251, i64 0, !dbg !335 + %10256 = insertelement <4 x i32> %10255, i32 %10252, i64 1, !dbg !335 + %10257 = insertelement <4 x i32> %10256, i32 %10253, i64 2, !dbg !335 + %10258 = insertelement <4 x i32> %10257, i32 %10254, i64 3, !dbg !335 + store <4 x i32> %10258, ptr addrspace(3) %9959, align 16, !dbg !335 + %10259 = bitcast <2 x bfloat> %10211 to i32, !dbg !335 + %10260 = bitcast <2 x bfloat> %10213 to i32, !dbg !335 + %10261 = bitcast <2 x bfloat> %10215 to i32, !dbg !335 + %10262 = bitcast <2 x bfloat> %10217 to i32, !dbg !335 + %10263 = insertelement <4 x i32> poison, i32 %10259, i64 0, !dbg !335 + %10264 = insertelement <4 x i32> %10263, i32 %10260, i64 1, !dbg !335 + %10265 = insertelement <4 x i32> %10264, i32 %10261, i64 2, !dbg !335 + %10266 = insertelement <4 x i32> %10265, i32 %10262, i64 3, !dbg !335 + store <4 x i32> %10266, ptr addrspace(3) %9969, align 16, !dbg !335 + %10267 = bitcast <2 x bfloat> %10212 to i32, !dbg !335 + %10268 = bitcast <2 x bfloat> %10214 to i32, !dbg !335 + %10269 = bitcast <2 x bfloat> %10216 to i32, !dbg !335 + %10270 = bitcast <2 x bfloat> %10218 to i32, !dbg !335 + %10271 = insertelement <4 x i32> poison, i32 %10267, i64 0, !dbg !335 + %10272 = insertelement <4 x i32> %10271, i32 %10268, i64 1, !dbg !335 + %10273 = insertelement <4 x i32> %10272, i32 %10269, i64 2, !dbg !335 + %10274 = insertelement <4 x i32> %10273, i32 %10270, i64 3, !dbg !335 + store <4 x i32> %10274, ptr addrspace(3) %9978, align 16, !dbg !335 + %10275 = bitcast <2 x bfloat> %10219 to i32, !dbg !335 + %10276 = bitcast <2 x bfloat> %10221 to i32, !dbg !335 + %10277 = bitcast <2 x bfloat> %10223 to i32, !dbg !335 + %10278 = bitcast <2 x bfloat> %10225 to i32, !dbg !335 + %10279 = insertelement <4 x i32> poison, i32 %10275, i64 0, !dbg !335 + %10280 = insertelement <4 x i32> %10279, i32 %10276, i64 1, !dbg !335 + %10281 = insertelement <4 x i32> %10280, i32 %10277, i64 2, !dbg !335 + %10282 = insertelement <4 x i32> %10281, i32 %10278, i64 3, !dbg !335 + store <4 x i32> %10282, ptr addrspace(3) %9988, align 16, !dbg !335 + %10283 = bitcast <2 x bfloat> %10220 to i32, !dbg !335 + %10284 = bitcast <2 x bfloat> %10222 to i32, !dbg !335 + %10285 = bitcast <2 x bfloat> %10224 to i32, !dbg !335 + %10286 = bitcast <2 x bfloat> %10226 to i32, !dbg !335 + %10287 = insertelement <4 x i32> poison, i32 %10283, i64 0, !dbg !335 + %10288 = insertelement <4 x i32> %10287, i32 %10284, i64 1, !dbg !335 + %10289 = insertelement <4 x i32> %10288, i32 %10285, i64 2, !dbg !335 + %10290 = insertelement <4 x i32> %10289, i32 %10286, i64 3, !dbg !335 + store <4 x i32> %10290, ptr addrspace(3) %9997, align 16, !dbg !335 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !335 + %10291 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10012) #3, !dbg !335 + %10292 = extractvalue { i32, i32, i32, i32 } %10291, 0, !dbg !335 + %10293 = extractvalue { i32, i32, i32, i32 } %10291, 1, !dbg !335 + %10294 = extractvalue { i32, i32, i32, i32 } %10291, 2, !dbg !335 + %10295 = extractvalue { i32, i32, i32, i32 } %10291, 3, !dbg !335 + %10296 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10019) #3, !dbg !335 + %10297 = extractvalue { i32, i32, i32, i32 } %10296, 0, !dbg !335 + %10298 = extractvalue { i32, i32, i32, i32 } %10296, 1, !dbg !335 + %10299 = extractvalue { i32, i32, i32, i32 } %10296, 2, !dbg !335 + %10300 = extractvalue { i32, i32, i32, i32 } %10296, 3, !dbg !335 + %10301 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10026) #3, !dbg !335 + %10302 = extractvalue { i32, i32, i32, i32 } %10301, 0, !dbg !335 + %10303 = extractvalue { i32, i32, i32, i32 } %10301, 1, !dbg !335 + %10304 = extractvalue { i32, i32, i32, i32 } %10301, 2, !dbg !335 + %10305 = extractvalue { i32, i32, i32, i32 } %10301, 3, !dbg !335 + %10306 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10033) #3, !dbg !335 + %10307 = extractvalue { i32, i32, i32, i32 } %10306, 0, !dbg !335 + %10308 = extractvalue { i32, i32, i32, i32 } %10306, 1, !dbg !335 + %10309 = extractvalue { i32, i32, i32, i32 } %10306, 2, !dbg !335 + %10310 = extractvalue { i32, i32, i32, i32 } %10306, 3, !dbg !335 + %10311 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10040) #3, !dbg !335 + %10312 = extractvalue { i32, i32, i32, i32 } %10311, 0, !dbg !335 + %10313 = extractvalue { i32, i32, i32, i32 } %10311, 1, !dbg !335 + %10314 = extractvalue { i32, i32, i32, i32 } %10311, 2, !dbg !335 + %10315 = extractvalue { i32, i32, i32, i32 } %10311, 3, !dbg !335 + %10316 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10047) #3, !dbg !335 + %10317 = extractvalue { i32, i32, i32, i32 } %10316, 0, !dbg !335 + %10318 = extractvalue { i32, i32, i32, i32 } %10316, 1, !dbg !335 + %10319 = extractvalue { i32, i32, i32, i32 } %10316, 2, !dbg !335 + %10320 = extractvalue { i32, i32, i32, i32 } %10316, 3, !dbg !335 + %10321 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10054) #3, !dbg !335 + %10322 = extractvalue { i32, i32, i32, i32 } %10321, 0, !dbg !335 + %10323 = extractvalue { i32, i32, i32, i32 } %10321, 1, !dbg !335 + %10324 = extractvalue { i32, i32, i32, i32 } %10321, 2, !dbg !335 + %10325 = extractvalue { i32, i32, i32, i32 } %10321, 3, !dbg !335 + %10326 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10061) #3, !dbg !335 + %10327 = extractvalue { i32, i32, i32, i32 } %10326, 0, !dbg !335 + %10328 = extractvalue { i32, i32, i32, i32 } %10326, 1, !dbg !335 + %10329 = extractvalue { i32, i32, i32, i32 } %10326, 2, !dbg !335 + %10330 = extractvalue { i32, i32, i32, i32 } %10326, 3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10292, i32 %10293, i32 %10294, i32 %10295, ptr addrspace(1) %10180, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10297, i32 %10298, i32 %10299, i32 %10300, ptr addrspace(1) %10182, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10302, i32 %10303, i32 %10304, i32 %10305, ptr addrspace(1) %10184, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10307, i32 %10308, i32 %10309, i32 %10310, ptr addrspace(1) %10186, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10312, i32 %10313, i32 %10314, i32 %10315, ptr addrspace(1) %10188, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10317, i32 %10318, i32 %10319, i32 %10320, ptr addrspace(1) %10190, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10322, i32 %10323, i32 %10324, i32 %10325, ptr addrspace(1) %10192, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10327, i32 %10328, i32 %10329, i32 %10330, ptr addrspace(1) %10194, i1 true) #3, !dbg !335 + br label %10331, !dbg !24 + +10331: ; preds = %._crit_edge1859, %9808 + ret void, !dbg !336 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #7 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #8 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.umin.i32(i32, i32) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { convergent nounwind } +attributes #6 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_zeros_1", linkageName: "triton_tem_fused_zeros_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 111, column: 24, scope: !5) +!9 = !DILocation(line: 115, column: 27, scope: !5) +!10 = !DILocation(line: 116, column: 28, scope: !5) +!11 = !DILocation(line: 117, column: 23, scope: !5) +!12 = !DILocation(line: 124, column: 25, scope: !5) +!13 = !DILocation(line: 124, column: 47, scope: !5) +!14 = !DILocation(line: 124, column: 35, scope: !5) +!15 = !DILocation(line: 124, column: 59, scope: !5) +!16 = !DILocation(line: 128, column: 50, scope: !5) +!17 = !DILocation(line: 128, column: 37, scope: !5) +!18 = !DILocation(line: 128, column: 61, scope: !5) +!19 = !DILocation(line: 131, column: 9, scope: !5) +!20 = !DILocation(line: 132, column: 9, scope: !5) +!21 = !DILocation(line: 133, column: 10, scope: !5) +!22 = !DILocation(line: 136, column: 26, scope: !5) +!23 = !DILocation(line: 139, column: 14, scope: !5) +!24 = !DILocation(line: 139, column: 7, scope: !5) +!25 = !DILocation(line: 140, column: 24, scope: !5) +!26 = !DILocation(line: 144, column: 29, scope: !5) +!27 = !DILocation(line: 144, column: 54, scope: !5) +!28 = !DILocation(line: 144, column: 44, scope: !5) +!29 = !DILocation(line: 145, column: 35, scope: !5) +!30 = !DILocation(line: 154, column: 55, scope: !5) +!31 = !DILocation(line: 154, column: 78, scope: !5) +!32 = !DILocation(line: 155, column: 50, scope: !5) +!33 = !DILocation(line: 155, column: 83, scope: !5) +!34 = !DILocation(line: 155, column: 68, scope: !5) +!35 = !DILocation(line: 158, column: 30, scope: !5) +!36 = !DILocation(line: 158, column: 52, scope: !5) +!37 = !DILocation(line: 158, column: 40, scope: !5) +!38 = !DILocation(line: 158, column: 63, scope: !5) +!39 = !DILocation(line: 159, column: 32, scope: !5) +!40 = !DILocation(line: 159, column: 42, scope: !5) +!41 = !DILocation(line: 159, column: 66, scope: !5) +!42 = !DILocation(line: 161, column: 46, scope: !5) +!43 = !DILocation(line: 161, column: 56, scope: !5) +!44 = !DILocation(line: 163, column: 17, scope: !5) +!45 = !DILocation(line: 164, column: 19, scope: !5) +!46 = !DILocation(line: 167, column: 19, scope: !5) +!47 = !DILocation(line: 168, column: 21, scope: !5) +!48 = !DILocation(line: 169, column: 25, scope: !5) +!49 = !DILocation(line: 174, column: 36, scope: !5) +!50 = !DILocation(line: 175, column: 29, scope: !5) +!51 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !53) +!52 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!53 = !DILocation(line: 178, column: 107, scope: !5) +!54 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !53) +!55 = !DILocation(line: 825, column: 56, scope: !52, inlinedAt: !53) +!56 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !53) +!57 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !53) +!58 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !59) +!59 = !DILocation(line: 179, column: 111, scope: !5) +!60 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !59) +!61 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !59) +!62 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !59) +!63 = !DILocation(line: 185, column: 34, scope: !5) +!64 = !DILocation(line: 185, column: 25, scope: !5) +!65 = !DILocation(line: 186, column: 33, scope: !5) +!66 = !DILocation(line: 186, column: 26, scope: !5) +!67 = !DILocation(line: 190, column: 30, scope: !5) +!68 = !DILocation(line: 190, column: 50, scope: !5) +!69 = !DILocation(line: 195, column: 30, scope: !5) +!70 = !DILocation(line: 196, column: 27, scope: !5) +!71 = !DILocation(line: 196, column: 41, scope: !5) +!72 = !DILocation(line: 197, column: 53, scope: !5) +!73 = !DILocation(line: 197, column: 39, scope: !5) +!74 = !DILocation(line: 199, column: 42, scope: !5) +!75 = !DILocation(line: 199, column: 29, scope: !5) +!76 = !DILocation(line: 390, column: 37, scope: !52, inlinedAt: !77) +!77 = !DILocation(line: 207, column: 12, scope: !5) +!78 = !DILocation(line: 390, column: 18, scope: !52, inlinedAt: !77) +!79 = !DILocation(line: 390, column: 49, scope: !52, inlinedAt: !77) +!80 = !DILocation(line: 391, column: 18, scope: !52, inlinedAt: !77) +!81 = !DILocation(line: 391, column: 49, scope: !52, inlinedAt: !77) +!82 = !DILocation(line: 395, column: 43, scope: !52, inlinedAt: !77) +!83 = !DILocation(line: 485, column: 34, scope: !52, inlinedAt: !77) +!84 = !DILocation(line: 397, column: 28, scope: !52, inlinedAt: !77) +!85 = !DILocation(line: 485, column: 23, scope: !52, inlinedAt: !77) +!86 = !DILocation(line: 488, column: 23, scope: !52, inlinedAt: !77) +!87 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !77) +!88 = !DILocation(line: 414, column: 19, scope: !52, inlinedAt: !77) +!89 = !DILocation(line: 415, column: 19, scope: !52, inlinedAt: !77) +!90 = !DILocation(line: 459, column: 19, scope: !52, inlinedAt: !77) +!91 = !DILocation(line: 395, column: 63, scope: !52, inlinedAt: !77) +!92 = !DILocation(line: 504, column: 24, scope: !52, inlinedAt: !77) +!93 = !DILocation(line: 461, column: 14, scope: !52, inlinedAt: !77) +!94 = !DILocation(line: 482, column: 23, scope: !52, inlinedAt: !77) +!95 = !DILocation(line: 494, column: 24, scope: !52, inlinedAt: !77) +!96 = !DILocation(line: 490, column: 23, scope: !52, inlinedAt: !77) +!97 = !DILocation(line: 493, column: 24, scope: !52, inlinedAt: !77) +!98 = !DILocation(line: 496, column: 25, scope: !52, inlinedAt: !77) +!99 = !DILocation(line: 497, column: 92, scope: !52, inlinedAt: !77) +!100 = !DILocation(line: 503, column: 25, scope: !52, inlinedAt: !77) +!101 = !DILocation(line: 500, column: 24, scope: !52, inlinedAt: !77) +!102 = !DILocation(line: 501, column: 24, scope: !52, inlinedAt: !77) +!103 = !DILocation(line: 502, column: 39, scope: !52, inlinedAt: !77) +!104 = !DILocation(line: 505, column: 24, scope: !52, inlinedAt: !77) +!105 = !DILocation(line: 506, column: 23, scope: !52, inlinedAt: !77) +!106 = !DILocation(line: 513, column: 39, scope: !52, inlinedAt: !77) +!107 = !DILocation(line: 514, column: 25, scope: !52, inlinedAt: !77) +!108 = !DILocation(line: 515, column: 24, scope: !52, inlinedAt: !77) +!109 = !DILocation(line: 516, column: 24, scope: !52, inlinedAt: !77) +!110 = !DILocation(line: 524, column: 27, scope: !52, inlinedAt: !77) +!111 = !DILocation(line: 521, column: 69, scope: !52, inlinedAt: !77) +!112 = !DILocation(line: 525, column: 39, scope: !52, inlinedAt: !77) +!113 = !DILocation(line: 525, column: 21, scope: !52, inlinedAt: !77) +!114 = !DILocation(line: 530, column: 20, scope: !52, inlinedAt: !77) +!115 = !DILocation(line: 531, column: 19, scope: !52, inlinedAt: !77) +!116 = !DILocation(line: 531, column: 14, scope: !52, inlinedAt: !77) +!117 = !DILocation(line: 551, column: 15, scope: !52, inlinedAt: !77) +!118 = !DILocation(line: 549, column: 43, scope: !52, inlinedAt: !77) +!119 = !DILocation(line: 553, column: 21, scope: !52, inlinedAt: !77) +!120 = !DILocation(line: 417, column: 19, scope: !52, inlinedAt: !77) +!121 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !77) +!122 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !77) +!123 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !77) +!124 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !77) +!125 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !77) +!126 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !77) +!127 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !77) +!128 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !77) +!129 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !77) +!130 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !77) +!131 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !77) +!132 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !77) +!133 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !77) +!134 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !77) +!135 = !DILocation(line: 414, column: 28, scope: !52, inlinedAt: !77) +!136 = !DILocation(line: 214, column: 39, scope: !5) +!137 = !DILocation(line: 215, column: 31, scope: !5) +!138 = !DILocation(line: 215, column: 45, scope: !5) +!139 = !DILocation(line: 216, column: 62, scope: !5) +!140 = !DILocation(line: 216, column: 43, scope: !5) +!141 = !DILocation(line: 218, column: 33, scope: !5) +!142 = !DILocation(line: 390, column: 37, scope: !52, inlinedAt: !143) +!143 = !DILocation(line: 226, column: 16, scope: !5) +!144 = !DILocation(line: 390, column: 18, scope: !52, inlinedAt: !143) +!145 = !DILocation(line: 390, column: 49, scope: !52, inlinedAt: !143) +!146 = !DILocation(line: 391, column: 18, scope: !52, inlinedAt: !143) +!147 = !DILocation(line: 391, column: 49, scope: !52, inlinedAt: !143) +!148 = !DILocation(line: 395, column: 43, scope: !52, inlinedAt: !143) +!149 = !DILocation(line: 397, column: 28, scope: !52, inlinedAt: !143) +!150 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !143) +!151 = !DILocation(line: 414, column: 19, scope: !52, inlinedAt: !143) +!152 = !DILocation(line: 415, column: 19, scope: !52, inlinedAt: !143) +!153 = !DILocation(line: 459, column: 19, scope: !52, inlinedAt: !143) +!154 = !DILocation(line: 395, column: 63, scope: !52, inlinedAt: !143) +!155 = !DILocation(line: 531, column: 19, scope: !52, inlinedAt: !143) +!156 = !DILocation(line: 461, column: 14, scope: !52, inlinedAt: !143) +!157 = !DILocation(line: 524, column: 27, scope: !52, inlinedAt: !143) +!158 = !DILocation(line: 525, column: 39, scope: !52, inlinedAt: !143) +!159 = !DILocation(line: 525, column: 21, scope: !52, inlinedAt: !143) +!160 = !DILocation(line: 530, column: 20, scope: !52, inlinedAt: !143) +!161 = !DILocation(line: 531, column: 14, scope: !52, inlinedAt: !143) +!162 = !DILocation(line: 551, column: 15, scope: !52, inlinedAt: !143) +!163 = !DILocation(line: 553, column: 21, scope: !52, inlinedAt: !143) +!164 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !143) +!165 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !143) +!166 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !143) +!167 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !143) +!168 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !143) +!169 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !143) +!170 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !143) +!171 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !143) +!172 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !143) +!173 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !143) +!174 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !143) +!175 = !DILocation(line: 414, column: 28, scope: !52, inlinedAt: !143) +!176 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !143) +!177 = !DILocation(line: 231, column: 24, scope: !5) +!178 = !DILocation(line: 231, column: 56, scope: !5) +!179 = !DILocation(line: 232, column: 14, scope: !5) +!180 = !DILocation(line: 234, column: 30, scope: !5) +!181 = !DILocation(line: 252, column: 25, scope: !5) +!182 = !DILocation(line: 253, column: 29, scope: !5) +!183 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !184) +!184 = !DILocation(line: 256, column: 107, scope: !5) +!185 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !184) +!186 = !DILocation(line: 825, column: 56, scope: !52, inlinedAt: !184) +!187 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !184) +!188 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !184) +!189 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !190) +!190 = !DILocation(line: 257, column: 107, scope: !5) +!191 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !190) +!192 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !190) +!193 = !DILocation(line: 263, column: 32, scope: !5) +!194 = !DILocation(line: 266, column: 56, scope: !5) +!195 = !DILocation(line: 281, column: 58, scope: !5) +!196 = !DILocation(line: 281, column: 80, scope: !5) +!197 = !DILocation(line: 282, column: 53, scope: !5) +!198 = !DILocation(line: 282, column: 81, scope: !5) +!199 = !DILocation(line: 282, column: 70, scope: !5) +!200 = !DILocation(line: 286, column: 32, scope: !5) +!201 = !DILocation(line: 287, column: 30, scope: !5) +!202 = !DILocation(line: 287, column: 43, scope: !5) +!203 = !DILocation(line: 288, column: 55, scope: !5) +!204 = !DILocation(line: 288, column: 42, scope: !5) +!205 = !DILocation(line: 290, column: 45, scope: !5) +!206 = !DILocation(line: 290, column: 32, scope: !5) +!207 = !DILocation(line: 601, column: 37, scope: !52, inlinedAt: !208) +!208 = !DILocation(line: 298, column: 16, scope: !5) +!209 = !DILocation(line: 602, column: 38, scope: !52, inlinedAt: !208) +!210 = !DILocation(line: 608, column: 42, scope: !52, inlinedAt: !208) +!211 = !DILocation(line: 608, column: 61, scope: !52, inlinedAt: !208) +!212 = !DILocation(line: 701, column: 35, scope: !52, inlinedAt: !208) +!213 = !DILocation(line: 610, column: 28, scope: !52, inlinedAt: !208) +!214 = !DILocation(line: 701, column: 24, scope: !52, inlinedAt: !208) +!215 = !DILocation(line: 306, column: 41, scope: !5) +!216 = !DILocation(line: 307, column: 34, scope: !5) +!217 = !DILocation(line: 307, column: 47, scope: !5) +!218 = !DILocation(line: 308, column: 64, scope: !5) +!219 = !DILocation(line: 308, column: 46, scope: !5) +!220 = !DILocation(line: 310, column: 36, scope: !5) +!221 = !DILocation(line: 601, column: 37, scope: !52, inlinedAt: !222) +!222 = !DILocation(line: 318, column: 20, scope: !5) +!223 = !DILocation(line: 602, column: 38, scope: !52, inlinedAt: !222) +!224 = !DILocation(line: 608, column: 42, scope: !52, inlinedAt: !222) +!225 = !DILocation(line: 608, column: 61, scope: !52, inlinedAt: !222) +!226 = !DILocation(line: 676, column: 20, scope: !52, inlinedAt: !208) +!227 = !DILocation(line: 262, column: 30, scope: !5) +!228 = !DILocation(line: 263, column: 51, scope: !5) +!229 = !DILocation(line: 266, column: 44, scope: !5) +!230 = !DILocation(line: 266, column: 67, scope: !5) +!231 = !DILocation(line: 267, column: 36, scope: !5) +!232 = !DILocation(line: 267, column: 46, scope: !5) +!233 = !DILocation(line: 267, column: 70, scope: !5) +!234 = !DILocation(line: 269, column: 50, scope: !5) +!235 = !DILocation(line: 269, column: 60, scope: !5) +!236 = !DILocation(line: 271, column: 21, scope: !5) +!237 = !DILocation(line: 272, column: 23, scope: !5) +!238 = !DILocation(line: 275, column: 25, scope: !5) +!239 = !DILocation(line: 276, column: 29, scope: !5) +!240 = !DILocation(line: 601, column: 18, scope: !52, inlinedAt: !208) +!241 = !DILocation(line: 601, column: 49, scope: !52, inlinedAt: !208) +!242 = !DILocation(line: 602, column: 19, scope: !52, inlinedAt: !208) +!243 = !DILocation(line: 602, column: 51, scope: !52, inlinedAt: !208) +!244 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !208) +!245 = !DILocation(line: 672, column: 28, scope: !52, inlinedAt: !208) +!246 = !DILocation(line: 672, column: 22, scope: !52, inlinedAt: !208) +!247 = !DILocation(line: 746, column: 29, scope: !52, inlinedAt: !208) +!248 = !DILocation(line: 746, column: 21, scope: !52, inlinedAt: !208) +!249 = !DILocation(line: 626, column: 19, scope: !52, inlinedAt: !208) +!250 = !DILocation(line: 627, column: 19, scope: !52, inlinedAt: !208) +!251 = !DILocation(line: 675, column: 26, scope: !52, inlinedAt: !208) +!252 = !DILocation(line: 675, column: 46, scope: !52, inlinedAt: !208) +!253 = !DILocation(line: 678, column: 15, scope: !52, inlinedAt: !208) +!254 = !DILocation(line: 698, column: 25, scope: !52, inlinedAt: !208) +!255 = !DILocation(line: 704, column: 24, scope: !52, inlinedAt: !208) +!256 = !DILocation(line: 706, column: 24, scope: !52, inlinedAt: !208) +!257 = !DILocation(line: 739, column: 27, scope: !52, inlinedAt: !208) +!258 = !DILocation(line: 736, column: 69, scope: !52, inlinedAt: !208) +!259 = !DILocation(line: 740, column: 40, scope: !52, inlinedAt: !208) +!260 = !DILocation(line: 740, column: 22, scope: !52, inlinedAt: !208) +!261 = !DILocation(line: 744, column: 24, scope: !52, inlinedAt: !208) +!262 = !DILocation(line: 744, column: 43, scope: !52, inlinedAt: !208) +!263 = !DILocation(line: 750, column: 20, scope: !52, inlinedAt: !208) +!264 = !DILocation(line: 751, column: 22, scope: !52, inlinedAt: !208) +!265 = !DILocation(line: 751, column: 16, scope: !52, inlinedAt: !208) +!266 = !DILocation(line: 775, column: 24, scope: !52, inlinedAt: !208) +!267 = !DILocation(line: 773, column: 45, scope: !52, inlinedAt: !208) +!268 = !DILocation(line: 775, column: 43, scope: !52, inlinedAt: !208) +!269 = !DILocation(line: 628, column: 19, scope: !52, inlinedAt: !208) +!270 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !208) +!271 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !208) +!272 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !208) +!273 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !208) +!274 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !208) +!275 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !208) +!276 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !208) +!277 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !208) +!278 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !208) +!279 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !208) +!280 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !208) +!281 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !208) +!282 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !208) +!283 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !208) +!284 = !DILocation(line: 626, column: 28, scope: !52, inlinedAt: !208) +!285 = !DILocation(line: 627, column: 28, scope: !52, inlinedAt: !208) +!286 = !DILocation(line: 601, column: 18, scope: !52, inlinedAt: !222) +!287 = !DILocation(line: 601, column: 49, scope: !52, inlinedAt: !222) +!288 = !DILocation(line: 602, column: 19, scope: !52, inlinedAt: !222) +!289 = !DILocation(line: 602, column: 51, scope: !52, inlinedAt: !222) +!290 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !222) +!291 = !DILocation(line: 672, column: 28, scope: !52, inlinedAt: !222) +!292 = !DILocation(line: 672, column: 22, scope: !52, inlinedAt: !222) +!293 = !DILocation(line: 746, column: 29, scope: !52, inlinedAt: !222) +!294 = !DILocation(line: 746, column: 21, scope: !52, inlinedAt: !222) +!295 = !DILocation(line: 626, column: 19, scope: !52, inlinedAt: !222) +!296 = !DILocation(line: 627, column: 19, scope: !52, inlinedAt: !222) +!297 = !DILocation(line: 610, column: 28, scope: !52, inlinedAt: !222) +!298 = !DILocation(line: 675, column: 26, scope: !52, inlinedAt: !222) +!299 = !DILocation(line: 675, column: 46, scope: !52, inlinedAt: !222) +!300 = !DILocation(line: 676, column: 20, scope: !52, inlinedAt: !222) +!301 = !DILocation(line: 678, column: 15, scope: !52, inlinedAt: !222) +!302 = !DILocation(line: 739, column: 27, scope: !52, inlinedAt: !222) +!303 = !DILocation(line: 740, column: 40, scope: !52, inlinedAt: !222) +!304 = !DILocation(line: 740, column: 22, scope: !52, inlinedAt: !222) +!305 = !DILocation(line: 744, column: 24, scope: !52, inlinedAt: !222) +!306 = !DILocation(line: 744, column: 43, scope: !52, inlinedAt: !222) +!307 = !DILocation(line: 750, column: 20, scope: !52, inlinedAt: !222) +!308 = !DILocation(line: 751, column: 22, scope: !52, inlinedAt: !222) +!309 = !DILocation(line: 751, column: 16, scope: !52, inlinedAt: !222) +!310 = !DILocation(line: 775, column: 24, scope: !52, inlinedAt: !222) +!311 = !DILocation(line: 775, column: 43, scope: !52, inlinedAt: !222) +!312 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !222) +!313 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !222) +!314 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !222) +!315 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !222) +!316 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !222) +!317 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !222) +!318 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !222) +!319 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !222) +!320 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !222) +!321 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !222) +!322 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !222) +!323 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !222) +!324 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !222) +!325 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !222) +!326 = !DILocation(line: 626, column: 28, scope: !52, inlinedAt: !222) +!327 = !DILocation(line: 627, column: 28, scope: !52, inlinedAt: !222) +!328 = !DILocation(line: 628, column: 19, scope: !52, inlinedAt: !222) +!329 = !DILocation(line: 323, column: 23, scope: !5) +!330 = !DILocation(line: 323, column: 55, scope: !5) +!331 = !DILocation(line: 330, column: 30, scope: !5) +!332 = !DILocation(line: 334, column: 14, scope: !5) +!333 = !DILocation(line: 344, column: 58, scope: !5) +!334 = !DILocation(line: 345, column: 29, scope: !5) +!335 = !DILocation(line: 345, column: 69, scope: !5) +!336 = !DILocation(line: 139, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d56b545cf65296335b11033a29920176d7de0272 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ptx @@ -0,0 +1,6891 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_zeros_1 // -- Begin function triton_tem_fused_zeros_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_zeros_1 +.visible .entry triton_tem_fused_zeros_1( + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_14, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_15, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_17, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_18, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_19 +) +.reqntid 256 +{ + .reg .pred %p<582>; + .reg .b16 %rs<190>; + .reg .b32 %r<13996>; + .reg .b64 %rd<1040>; + .loc 1 18 0 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:18:0 + +// %bb.0: + ld.param.b64 %rd193, [triton_tem_fused_zeros_1_param_16]; + ld.param.b64 %rd183, [triton_tem_fused_zeros_1_param_5]; + ld.param.b64 %rd182, [triton_tem_fused_zeros_1_param_4]; + ld.param.b64 %rd181, [triton_tem_fused_zeros_1_param_3]; + ld.param.b64 %rd180, [triton_tem_fused_zeros_1_param_0]; +$L__tmp0: + .loc 1 111 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:111:24 + mov.u32 %r1, %ctaid.x; + ld.param.b64 %rd195, [triton_tem_fused_zeros_1_param_1]; + .loc 1 115 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:115:27 + mov.u32 %r2, %ctaid.y; + ld.param.b64 %rd196, [triton_tem_fused_zeros_1_param_2]; + .loc 1 116 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:116:28 + mov.u32 %r3, %ctaid.z; + .loc 1 117 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:117:23 + and.b32 %r4, %r2, 7; + .loc 1 124 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:124:25 + shl.b32 %r1941, %r3, 18; + .loc 1 124 47 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:124:47 + shl.b32 %r1942, %r4, 21; + .loc 1 124 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:124:35 + add.s32 %r1943, %r1942, %r1941; + .loc 1 131 9 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:131:9 + mul.wide.s32 %rd198, %r1943, 2; + add.s64 %rd1, %rd195, %rd198; + .loc 1 132 9 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:132:9 + add.s64 %rd2, %rd196, %rd198; + .loc 1 136 26 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:136:26 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r8, %r6, 240; + bfe.u32 %r9, %r6, 4, 4; + or.b32 %r10, %r9, 16; + or.b32 %r11, %r9, 32; + or.b32 %r12, %r9, 48; + or.b32 %r13, %r9, 64; + or.b32 %r14, %r9, 80; + or.b32 %r15, %r9, 96; + or.b32 %r16, %r9, 112; + shr.u32 %r1945, %r6, 1; + and.b32 %r1946, %r1945, 112; + bfe.u32 %r1947, %r6, 2, 3; + or.b32 %r17, %r1946, %r1947; + or.b32 %r18, %r17, 8; + .loc 1 139 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:139:14 + setp.lt.u32 %p1, %r1, 16; + .loc 1 139 7 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:139:7 + @%p1 bra $L__BB0_8; +// %bb.1: + .loc 1 0 7 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:7 + ld.param.b64 %rd190, [triton_tem_fused_zeros_1_param_13]; + ld.param.b64 %rd189, [triton_tem_fused_zeros_1_param_12]; + ld.param.b64 %rd186, [triton_tem_fused_zeros_1_param_9]; + ld.param.b64 %rd185, [triton_tem_fused_zeros_1_param_8]; + ld.param.b64 %rd184, [triton_tem_fused_zeros_1_param_6]; + .loc 1 140 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:140:24 + add.s32 %r8397, %r1, -16; + .loc 1 144 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:144:29 + shr.u32 %r8398, %r8397, 4; + .loc 1 144 54 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:144:54 + shl.b32 %r8399, %r3, 2; + .loc 1 144 44 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:144:44 + add.s32 %r8400, %r8398, %r8399; + .loc 1 145 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:145:35 + and.b32 %r8401, %r1, 15; + .loc 1 154 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:154:55 + shl.b32 %r8402, %r4, 4; + .loc 1 154 78 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:154:78 + or.b32 %r8403, %r8402, %r8401; + .loc 1 155 50 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:155:50 + shl.b32 %r8404, %r4, 8; + .loc 1 155 83 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:155:83 + shl.b32 %r8405, %r8401, 4; + .loc 1 155 68 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:155:68 + or.b32 %r8406, %r8404, %r8405; + .loc 1 158 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:158:30 + shl.b32 %r8407, %r8400, 7; + .loc 1 158 52 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:158:52 + shl.b32 %r8408, %r2, 23; + .loc 1 158 40 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:158:40 + add.s32 %r8409, %r8407, %r8408; + .loc 1 159 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:159:42 + mad.lo.s32 %r8410, %r8400, 262016, %r8409; + .loc 1 161 46 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:161:46 + shl.b32 %r8411, %r2, 16; + shl.b32 %r8412, %r8400, 11; + add.s32 %r8413, %r8412, %r8411; + .loc 1 163 17 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:163:17 + mul.wide.s32 %rd724, %r8409, 2; + add.s64 %rd725, %rd180, %rd724; + .loc 1 164 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:164:19 + mad.wide.s32 %rd726, %r8410, 2, %rd183; + .loc 1 168 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:168:21 + mul.wide.s32 %rd727, %r8413, 4; + add.s64 %rd728, %rd181, %rd727; + .loc 1 169 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:169:25 + add.s64 %rd729, %rd182, %rd727; + .loc 1 174 36 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:174:36 + shl.b32 %r8414, %r8401, 7; + .loc 1 175 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:175:29 + or.b32 %r8415, %r8414, %r9; + or.b32 %r8416, %r10, %r8414; + or.b32 %r8417, %r11, %r8414; + or.b32 %r8418, %r12, %r8414; + or.b32 %r8419, %r13, %r8414; + or.b32 %r8420, %r14, %r8414; + or.b32 %r8421, %r15, %r8414; + or.b32 %r8422, %r16, %r8414; + or.b32 %r19, %r18, %r8414; + or.b32 %r21, %r17, %r8414; +$L__tmp1: + .loc 1 825 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:178:107 ] + shl.b32 %r8423, %r8415, 12; + shl.b32 %r8424, %r8416, 12; + shl.b32 %r8425, %r8417, 12; + shl.b32 %r8426, %r8418, 12; + shl.b32 %r8427, %r8419, 12; + shl.b32 %r8428, %r8420, 12; + shl.b32 %r8429, %r8421, 12; + shl.b32 %r8430, %r8422, 12; + .loc 1 825 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:178:107 ] + mad.wide.u32 %rd730, %r8423, 2, %rd725; + mad.wide.u32 %rd731, %r8424, 2, %rd725; + mad.wide.u32 %rd732, %r8425, 2, %rd725; + mad.wide.u32 %rd733, %r8426, 2, %rd725; + mad.wide.u32 %rd734, %r8427, 2, %rd725; + mad.wide.u32 %rd735, %r8428, 2, %rd725; + mad.wide.u32 %rd736, %r8429, 2, %rd725; + mad.wide.u32 %rd737, %r8430, 2, %rd725; + .loc 1 825 56 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:56 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:178:107 ] + shl.b32 %r8431, %r6, 3; + and.b32 %r8432, %r8431, 120; + .loc 1 825 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:178:107 ] + cvt.u64.u32 %rd13, %r8432; + mul.wide.u32 %rd738, %r8432, 2; + add.s64 %rd684, %rd730, %rd738; + add.s64 %rd685, %rd731, %rd738; + add.s64 %rd686, %rd732, %rd738; + add.s64 %rd687, %rd733, %rd738; + add.s64 %rd688, %rd734, %rd738; + add.s64 %rd689, %rd735, %rd738; + add.s64 %rd690, %rd736, %rd738; + add.s64 %rd691, %rd737, %rd738; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:178:107 ] + // begin inline asm + mov.u32 %r8294, 0x0; + mov.u32 %r8295, 0x0; + mov.u32 %r8296, 0x0; + mov.u32 %r8297, 0x0; + ld.global.v4.b32 { %r8294, %r8295, %r8296, %r8297 }, [ %rd684 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8298, 0x0; + mov.u32 %r8299, 0x0; + mov.u32 %r8300, 0x0; + mov.u32 %r8301, 0x0; + ld.global.v4.b32 { %r8298, %r8299, %r8300, %r8301 }, [ %rd685 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8302, 0x0; + mov.u32 %r8303, 0x0; + mov.u32 %r8304, 0x0; + mov.u32 %r8305, 0x0; + ld.global.v4.b32 { %r8302, %r8303, %r8304, %r8305 }, [ %rd686 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8306, 0x0; + mov.u32 %r8307, 0x0; + mov.u32 %r8308, 0x0; + mov.u32 %r8309, 0x0; + ld.global.v4.b32 { %r8306, %r8307, %r8308, %r8309 }, [ %rd687 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8310, 0x0; + mov.u32 %r8311, 0x0; + mov.u32 %r8312, 0x0; + mov.u32 %r8313, 0x0; + ld.global.v4.b32 { %r8310, %r8311, %r8312, %r8313 }, [ %rd688 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8314, 0x0; + mov.u32 %r8315, 0x0; + mov.u32 %r8316, 0x0; + mov.u32 %r8317, 0x0; + ld.global.v4.b32 { %r8314, %r8315, %r8316, %r8317 }, [ %rd689 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8318, 0x0; + mov.u32 %r8319, 0x0; + mov.u32 %r8320, 0x0; + mov.u32 %r8321, 0x0; + ld.global.v4.b32 { %r8318, %r8319, %r8320, %r8321 }, [ %rd690 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8322, 0x0; + mov.u32 %r8323, 0x0; + mov.u32 %r8324, 0x0; + mov.u32 %r8325, 0x0; + ld.global.v4.b32 { %r8322, %r8323, %r8324, %r8325 }, [ %rd691 + 0 ]; + // end inline asm + shl.b32 %r8433, %r6, 4; + and.b32 %r8434, %r8433, 112; + shl.b32 %r8435, %r8, 3; + and.b32 %r8436, %r6, 112; + and.b32 %r8437, %r6, 8; + shl.b32 %r8438, %r8437, 11; + or.b32 %r8439, %r8434, %r8435; + xor.b32 %r8440, %r8439, %r8436; + or.b32 %r8441, %r8440, %r8438; + mov.b32 %r8442, global_smem; + add.s32 %r8443, %r8442, %r8441; + st.shared.v4.b32 [%r8443+98304], {%r8294, %r8295, %r8296, %r8297}; + st.shared.v4.b32 [%r8443+100352], {%r8298, %r8299, %r8300, %r8301}; + st.shared.v4.b32 [%r8443+102400], {%r8302, %r8303, %r8304, %r8305}; + st.shared.v4.b32 [%r8443+104448], {%r8306, %r8307, %r8308, %r8309}; + st.shared.v4.b32 [%r8443+106496], {%r8310, %r8311, %r8312, %r8313}; + st.shared.v4.b32 [%r8443+108544], {%r8314, %r8315, %r8316, %r8317}; + st.shared.v4.b32 [%r8443+110592], {%r8318, %r8319, %r8320, %r8321}; + st.shared.v4.b32 [%r8443+112640], {%r8322, %r8323, %r8324, %r8325}; +$L__tmp2: + .loc 1 825 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:179:111 ] + shl.b32 %r8444, %r8415, 7; + shl.b32 %r8445, %r8416, 7; + shl.b32 %r8446, %r8417, 7; + shl.b32 %r8447, %r8418, 7; + shl.b32 %r8448, %r8419, 7; + shl.b32 %r8449, %r8420, 7; + shl.b32 %r8450, %r8421, 7; + shl.b32 %r8451, %r8422, 7; + .loc 1 825 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:179:111 ] + mad.wide.u32 %rd739, %r8444, 2, %rd726; + mad.wide.u32 %rd740, %r8445, 2, %rd726; + mad.wide.u32 %rd741, %r8446, 2, %rd726; + mad.wide.u32 %rd742, %r8447, 2, %rd726; + mad.wide.u32 %rd743, %r8448, 2, %rd726; + mad.wide.u32 %rd744, %r8449, 2, %rd726; + mad.wide.u32 %rd745, %r8450, 2, %rd726; + mad.wide.u32 %rd746, %r8451, 2, %rd726; + .loc 1 825 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:179:111 ] + add.s64 %rd692, %rd739, %rd738; + add.s64 %rd693, %rd740, %rd738; + add.s64 %rd694, %rd741, %rd738; + add.s64 %rd695, %rd742, %rd738; + add.s64 %rd696, %rd743, %rd738; + add.s64 %rd697, %rd744, %rd738; + add.s64 %rd698, %rd745, %rd738; + add.s64 %rd699, %rd746, %rd738; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:179:111 ] + // begin inline asm + mov.u32 %r8326, 0x0; + mov.u32 %r8327, 0x0; + mov.u32 %r8328, 0x0; + mov.u32 %r8329, 0x0; + ld.global.v4.b32 { %r8326, %r8327, %r8328, %r8329 }, [ %rd692 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8330, 0x0; + mov.u32 %r8331, 0x0; + mov.u32 %r8332, 0x0; + mov.u32 %r8333, 0x0; + ld.global.v4.b32 { %r8330, %r8331, %r8332, %r8333 }, [ %rd693 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8334, 0x0; + mov.u32 %r8335, 0x0; + mov.u32 %r8336, 0x0; + mov.u32 %r8337, 0x0; + ld.global.v4.b32 { %r8334, %r8335, %r8336, %r8337 }, [ %rd694 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8338, 0x0; + mov.u32 %r8339, 0x0; + mov.u32 %r8340, 0x0; + mov.u32 %r8341, 0x0; + ld.global.v4.b32 { %r8338, %r8339, %r8340, %r8341 }, [ %rd695 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8342, 0x0; + mov.u32 %r8343, 0x0; + mov.u32 %r8344, 0x0; + mov.u32 %r8345, 0x0; + ld.global.v4.b32 { %r8342, %r8343, %r8344, %r8345 }, [ %rd696 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8346, 0x0; + mov.u32 %r8347, 0x0; + mov.u32 %r8348, 0x0; + mov.u32 %r8349, 0x0; + ld.global.v4.b32 { %r8346, %r8347, %r8348, %r8349 }, [ %rd697 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8350, 0x0; + mov.u32 %r8351, 0x0; + mov.u32 %r8352, 0x0; + mov.u32 %r8353, 0x0; + ld.global.v4.b32 { %r8350, %r8351, %r8352, %r8353 }, [ %rd698 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8354, 0x0; + mov.u32 %r8355, 0x0; + mov.u32 %r8356, 0x0; + mov.u32 %r8357, 0x0; + ld.global.v4.b32 { %r8354, %r8355, %r8356, %r8357 }, [ %rd699 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r8443+131072], {%r8326, %r8327, %r8328, %r8329}; + st.shared.v4.b32 [%r8443+133120], {%r8330, %r8331, %r8332, %r8333}; + st.shared.v4.b32 [%r8443+135168], {%r8334, %r8335, %r8336, %r8337}; + st.shared.v4.b32 [%r8443+137216], {%r8338, %r8339, %r8340, %r8341}; + st.shared.v4.b32 [%r8443+139264], {%r8342, %r8343, %r8344, %r8345}; + st.shared.v4.b32 [%r8443+141312], {%r8346, %r8347, %r8348, %r8349}; + st.shared.v4.b32 [%r8443+143360], {%r8350, %r8351, %r8352, %r8353}; + st.shared.v4.b32 [%r8443+145408], {%r8354, %r8355, %r8356, %r8357}; +$L__tmp3: + .loc 1 185 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:185:34 + mul.wide.u32 %rd747, %r21, 4; + add.s64 %rd700, %rd729, %rd747; + mul.wide.u32 %rd748, %r19, 4; + add.s64 %rd701, %rd729, %rd748; + .loc 1 185 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:185:25 + // begin inline asm + mov.u32 %r8358, 0x0; + ld.global.b32 { %r8358 }, [ %rd700 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8359, 0x0; + ld.global.b32 { %r8359 }, [ %rd701 + 0 ]; + // end inline asm + .loc 1 186 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:186:33 + add.s64 %rd702, %rd728, %rd747; + add.s64 %rd703, %rd728, %rd748; + .loc 1 186 26 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:186:26 + // begin inline asm + mov.u32 %r8360, 0x0; + ld.global.b32 { %r8360 }, [ %rd702 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8361, 0x0; + ld.global.b32 { %r8361 }, [ %rd703 + 0 ]; + // end inline asm + .loc 1 190 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:190:30 + setp.eq.f32 %p292, %r8360, 0fFF800000; + setp.eq.f32 %p293, %r8361, 0fFF800000; + .loc 1 190 50 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:190:50 + selp.f32 %r53, 0f00000000, %r8360, %p292; + selp.f32 %r54, 0f00000000, %r8361, %p293; + .loc 1 195 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:195:30 + cvt.u64.u32 %rd16, %r8406; + mad.wide.u32 %rd704, %r8406, 4, %rd186; + .loc 1 196 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:196:27 + // begin inline asm + mov.u32 %r8362, 0x0; + ld.global.b32 { %r8362 }, [ %rd704 + 0 ]; + // end inline asm + .loc 1 196 41 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:196:41 + shl.b32 %r55, %r8362, 7; + .loc 1 197 53 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:197:53 + cvt.u64.u32 %rd18, %r8403; + mad.wide.u32 %rd705, %r8403, 4, %rd185; + .loc 1 197 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:197:39 + // begin inline asm + mov.u32 %r8363, 0x0; + ld.global.b32 { %r8363 }, [ %rd705 + 0 ]; + // end inline asm + .loc 1 199 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:199:42 + and.b32 %r57, %r6, 3; + .loc 1 199 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:199:29 + or.b32 %r8452, %r55, %r9; + or.b32 %r8453, %r55, %r10; + or.b32 %r8454, %r55, %r11; + or.b32 %r8455, %r55, %r12; +$L__tmp4: + .loc 1 390 37 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:37 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r8456, %r8452, 7; + shl.b32 %r8457, %r8453, 7; + shl.b32 %r8458, %r8454, 7; + shl.b32 %r8459, %r8455, 7; + .loc 1 390 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.wide.s32 %rd749, %r8456, 2; + add.s64 %rd750, %rd1, %rd749; + mul.wide.s32 %rd751, %r8457, 2; + add.s64 %rd752, %rd1, %rd751; + mul.wide.s32 %rd753, %r8458, 2; + add.s64 %rd754, %rd1, %rd753; + mul.wide.s32 %rd755, %r8459, 2; + add.s64 %rd756, %rd1, %rd755; + .loc 1 390 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd708, %rd750, %rd738; + add.s64 %rd709, %rd752, %rd738; + add.s64 %rd710, %rd754, %rd738; + add.s64 %rd711, %rd756, %rd738; + .loc 1 391 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:391:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd757, %rd2, %rd749; + add.s64 %rd758, %rd2, %rd751; + add.s64 %rd759, %rd2, %rd753; + add.s64 %rd760, %rd2, %rd755; + .loc 1 391 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:391:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd712, %rd757, %rd738; + add.s64 %rd713, %rd758, %rd738; + add.s64 %rd714, %rd759, %rd738; + add.s64 %rd715, %rd760, %rd738; + .loc 1 395 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:395:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r58, %r8363, 1; + .loc 1 485 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:485:34 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mad.wide.u32 %rd707, %r2, 8, %rd193; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.lt.s32 %p294, %r58, 1; + setp.gt.s32 %p291, %r58, 0; + .loc 1 485 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:485:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + // begin inline asm + mov.u64 %rd19, 0x0; + @%p291 ld.global.b64 { %rd19 }, [ %rd707 + 0 ]; + // end inline asm + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r8460, %r8437, 10; + or.b32 %r59, %r8440, %r8460; + add.s32 %r10751, %r8442, %r59; + selp.b32 %r8365, 16, 0, %p291; + // begin inline asm + cp.async.cg.shared.global [ %r10751 + 0 ], [ %rd708 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10753, %r10751, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10753 + 0 ], [ %rd709 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10755, %r10751, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10755 + 0 ], [ %rd710 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10757, %r10751, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10757 + 0 ], [ %rd711 + 0 ], 0x10, %r8365; + // end inline asm + cp.async.commit_group; + add.s32 %r8372, %r10751, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r8372 + 0 ], [ %rd712 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8374, %r10751, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r8374 + 0 ], [ %rd713 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8376, %r10751, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r8376 + 0 ], [ %rd714 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8378, %r10751, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r8378 + 0 ], [ %rd715 + 0 ], 0x10, %r8365; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.gt.s32 %p295, %r58, 1; + .loc 1 414 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd1014, %rd708, 16384; + add.s64 %rd1013, %rd709, 16384; + add.s64 %rd1012, %rd710, 16384; + add.s64 %rd1011, %rd711, 16384; + .loc 1 415 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:415:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd1010, %rd712, 16384; + add.s64 %rd1009, %rd713, 16384; + add.s64 %rd1008, %rd714, 16384; + add.s64 %rd1007, %rd715, 16384; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + bar.sync 0; + add.s32 %r8380, %r10751, 16384; + selp.b32 %r8381, 16, 0, %p295; + // begin inline asm + cp.async.cg.shared.global [ %r8380 + 0 ], [ %rd1014 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8382, %r10751, 18432; + // begin inline asm + cp.async.cg.shared.global [ %r8382 + 0 ], [ %rd1013 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8384, %r10751, 20480; + // begin inline asm + cp.async.cg.shared.global [ %r8384 + 0 ], [ %rd1012 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8386, %r10751, 22528; + // begin inline asm + cp.async.cg.shared.global [ %r8386 + 0 ], [ %rd1011 + 0 ], 0x10, %r8381; + // end inline asm + cp.async.commit_group; + add.s32 %r8388, %r10751, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r8388 + 0 ], [ %rd1010 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8390, %r10751, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r8390 + 0 ], [ %rd1009 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8392, %r10751, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r8392 + 0 ], [ %rd1008 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8394, %r10751, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r8394 + 0 ], [ %rd1007 + 0 ], 0x10, %r8381; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:459:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r13037, 0f00000000; + mov.b32 %r13038, %r13037; + mov.b32 %r13039, %r13037; + mov.b32 %r13040, %r13037; + mov.b32 %r13041, %r13037; + mov.b32 %r13042, %r13037; + mov.b32 %r13043, %r13037; + mov.b32 %r13044, %r13037; + mov.b32 %r13045, %r13037; + mov.b32 %r13046, %r13037; + mov.b32 %r13047, %r13037; + mov.b32 %r13048, %r13037; + mov.b32 %r13049, %r13037; + mov.b32 %r13050, %r13037; + mov.b32 %r13051, %r13037; + mov.b32 %r13052, %r13037; + mov.b32 %r13053, %r13037; + mov.b32 %r13054, %r13037; + mov.b32 %r13055, %r13037; + mov.b32 %r13056, %r13037; + mov.b32 %r13057, %r13037; + mov.b32 %r13058, %r13037; + mov.b32 %r13059, %r13037; + mov.b32 %r13060, %r13037; + mov.b32 %r13061, %r13037; + mov.b32 %r13062, %r13037; + mov.b32 %r13063, %r13037; + mov.b32 %r13064, %r13037; + mov.b32 %r13065, %r13037; + mov.b32 %r13066, %r13037; + mov.b32 %r13067, %r13037; + mov.b32 %r13068, %r13037; + mov.b32 %r13069, %r13037; + mov.b32 %r13070, %r13037; + mov.b32 %r13071, %r13037; + mov.b32 %r13072, %r13037; + mov.b32 %r13073, %r13037; + mov.b32 %r13074, %r13037; + mov.b32 %r13075, %r13037; + mov.b32 %r13076, %r13037; + mov.b32 %r13077, %r13037; + mov.b32 %r13078, %r13037; + mov.b32 %r13079, %r13037; + mov.b32 %r13080, %r13037; + mov.b32 %r13081, %r13037; + mov.b32 %r13082, %r13037; + mov.b32 %r13083, %r13037; + mov.b32 %r13084, %r13037; + mov.b32 %r13085, %r13037; + mov.b32 %r13086, %r13037; + mov.b32 %r13087, %r13037; + mov.b32 %r13088, %r13037; + mov.b32 %r13089, %r13037; + mov.b32 %r13090, %r13037; + mov.b32 %r13091, %r13037; + mov.b32 %r13092, %r13037; + mov.b32 %r13093, %r13037; + mov.b32 %r13094, %r13037; + mov.b32 %r13095, %r13037; + mov.b32 %r13096, %r13037; + mov.b32 %r13097, %r13037; + mov.b32 %r13098, %r13037; + mov.b32 %r13099, %r13037; + mov.b32 %r13100, %r13037; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + @%p294 bra $L__BB0_4; +$L__tmp5: +// %bb.2: // %.lr.ph1846 + .loc 1 0 0 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0 + cvt.u64.u32 %rd14, %r21; + cvt.u64.u32 %rd15, %r19; +$L__tmp6: + .loc 1 395 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:395:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + min.u32 %r76, %r58, 32; +$L__tmp7: + .loc 1 199 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:199:42 + shl.b32 %r8466, %r57, 1; + or.b32 %r13117, %r55, %r8466; + add.s32 %r77, %r76, -2; + add.s32 %r78, %r76, -1; + or.b32 %r13116, %r13117, 1; + or.b32 %r13115, %r13117, 8; + or.b32 %r13114, %r13117, 9; + or.b32 %r13113, %r13117, 16; + or.b32 %r13112, %r13117, 17; + or.b32 %r13111, %r13117, 24; + or.b32 %r13110, %r13117, 25; + or.b32 %r13109, %r13117, 32; + or.b32 %r13108, %r13117, 33; + or.b32 %r13107, %r13117, 40; + or.b32 %r13106, %r13117, 41; + or.b32 %r13105, %r13117, 48; + or.b32 %r13104, %r13117, 49; + or.b32 %r13103, %r13117, 56; + or.b32 %r13102, %r13117, 57; + mov.b32 %r9018, 0; + mov.b32 %r13037, 0f00000000; + mov.b32 %r13036, 1; + mov.b32 %r13035, -1; + mov.b32 %r13034, 64; + setp.gt.s64 %p316, %rd19, %rd15; + setp.gt.s64 %p317, %rd19, %rd14; + mov.b32 %r13038, %r13037; + mov.b32 %r13039, %r13037; + mov.b32 %r13040, %r13037; + mov.b32 %r13041, %r13037; + mov.b32 %r13042, %r13037; + mov.b32 %r13043, %r13037; + mov.b32 %r13044, %r13037; + mov.b32 %r13045, %r13037; + mov.b32 %r13046, %r13037; + mov.b32 %r13047, %r13037; + mov.b32 %r13048, %r13037; + mov.b32 %r13049, %r13037; + mov.b32 %r13050, %r13037; + mov.b32 %r13051, %r13037; + mov.b32 %r13052, %r13037; + mov.b32 %r13053, %r13037; + mov.b32 %r13054, %r13037; + mov.b32 %r13055, %r13037; + mov.b32 %r13056, %r13037; + mov.b32 %r13057, %r13037; + mov.b32 %r13058, %r13037; + mov.b32 %r13059, %r13037; + mov.b32 %r13060, %r13037; + mov.b32 %r13061, %r13037; + mov.b32 %r13062, %r13037; + mov.b32 %r13063, %r13037; + mov.b32 %r13064, %r13037; + mov.b32 %r13065, %r13037; + mov.b32 %r13066, %r13037; + mov.b32 %r13067, %r13037; + mov.b32 %r13068, %r13037; + mov.b32 %r13069, %r13037; + mov.b32 %r13070, %r13037; + mov.b32 %r13071, %r13037; + mov.b32 %r13072, %r13037; + mov.b32 %r13073, %r13037; + mov.b32 %r13074, %r13037; + mov.b32 %r13075, %r13037; + mov.b32 %r13076, %r13037; + mov.b32 %r13077, %r13037; + mov.b32 %r13078, %r13037; + mov.b32 %r13079, %r13037; + mov.b32 %r13080, %r13037; + mov.b32 %r13081, %r13037; + mov.b32 %r13082, %r13037; + mov.b32 %r13083, %r13037; + mov.b32 %r13084, %r13037; + mov.b32 %r13085, %r13037; + mov.b32 %r13086, %r13037; + mov.b32 %r13087, %r13037; + mov.b32 %r13088, %r13037; + mov.b32 %r13089, %r13037; + mov.b32 %r13090, %r13037; + mov.b32 %r13091, %r13037; + mov.b32 %r13092, %r13037; + mov.b32 %r13093, %r13037; + mov.b32 %r13094, %r13037; + mov.b32 %r13095, %r13037; + mov.b32 %r13096, %r13037; + mov.b32 %r13097, %r13037; + mov.b32 %r13098, %r13037; + mov.b32 %r13099, %r13037; + mov.b32 %r13100, %r13037; + mov.b32 %r13101, %r9018; +$L__BB0_3: // %__nv_exp2f.exit1414 + // =>This Inner Loop Header: Depth=1 + .loc 1 0 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:42 + cvt.u32.u64 %r10125, %rd15; + cvt.u32.u64 %r10126, %rd14; +$L__tmp8: + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.lt.s32 %p318, %r13101, %r77; + setp.lt.s32 %p314, %r13101, %r78; + add.s32 %r10127, %r13035, 1; + setp.gt.s32 %p319, %r10127, 2; + selp.b32 %r13035, 0, %r10127, %p319; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r10128, %r13035, 14; + add.s32 %r9020, %r8442, %r10128; + .loc 1 459 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:459:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shfl.sync.idx.b32 %r10130, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r10131, %r10130, 11; + and.b32 %r10132, %r10131, 8192; + add.s32 %r8979, %r8442, 98304; + add.s32 %r10133, %r10132, %r8979; + bfe.u32 %r10134, %r10133, 4, 14; + cvt.u64.u32 %rd811, %r10134; + or.b64 %rd761, %rd811, 4611686293372403712; + bfe.u32 %r10135, %r9020, 4, 14; + cvt.u64.u32 %rd812, %r10135; + or.b64 %rd762, %rd812, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd761, %rd762, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r10136, %r10132, 32; + add.s32 %r10137, %r10136, %r8979; + bfe.u32 %r10138, %r10137, 4, 14; + cvt.u64.u32 %rd813, %r10138; + or.b64 %rd763, %rd813, 4611686293372403712; + add.s32 %r10139, %r9020, 32; + bfe.u32 %r10140, %r10139, 4, 14; + cvt.u64.u32 %rd814, %r10140; + or.b64 %rd764, %rd814, 4611686293338849280; + mov.pred %p296, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd763, %rd764, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10141, %r10132, 64; + add.s32 %r10142, %r10141, %r8979; + bfe.u32 %r10143, %r10142, 4, 14; + cvt.u64.u32 %rd815, %r10143; + or.b64 %rd765, %rd815, 4611686293372403712; + add.s32 %r10144, %r9020, 64; + bfe.u32 %r10145, %r10144, 4, 14; + cvt.u64.u32 %rd816, %r10145; + or.b64 %rd766, %rd816, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd765, %rd766, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10146, %r10132, 96; + add.s32 %r10147, %r10146, %r8979; + bfe.u32 %r10148, %r10147, 4, 14; + cvt.u64.u32 %rd817, %r10148; + or.b64 %rd767, %rd817, 4611686293372403712; + add.s32 %r10149, %r9020, 96; + bfe.u32 %r10150, %r10149, 4, 14; + cvt.u64.u32 %rd818, %r10150; + or.b64 %rd768, %rd818, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd767, %rd768, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10151, %r10132, 16384; + add.s32 %r10152, %r10151, %r8979; + bfe.u32 %r10153, %r10152, 4, 14; + cvt.u64.u32 %rd819, %r10153; + or.b64 %rd769, %rd819, 4611686293372403712; + add.s32 %r10154, %r9020, 8192; + bfe.u32 %r10155, %r10154, 4, 14; + cvt.u64.u32 %rd820, %r10155; + or.b64 %rd770, %rd820, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd769, %rd770, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10156, %r10132, 16416; + add.s32 %r10157, %r10156, %r8979; + bfe.u32 %r10158, %r10157, 4, 14; + cvt.u64.u32 %rd821, %r10158; + or.b64 %rd771, %rd821, 4611686293372403712; + add.s32 %r10159, %r9020, 8224; + bfe.u32 %r10160, %r10159, 4, 14; + cvt.u64.u32 %rd822, %r10160; + or.b64 %rd772, %rd822, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd771, %rd772, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10161, %r10132, 16448; + add.s32 %r10162, %r10161, %r8979; + bfe.u32 %r10163, %r10162, 4, 14; + cvt.u64.u32 %rd823, %r10163; + or.b64 %rd773, %rd823, 4611686293372403712; + add.s32 %r10164, %r9020, 8256; + bfe.u32 %r10165, %r10164, 4, 14; + cvt.u64.u32 %rd824, %r10165; + or.b64 %rd774, %rd824, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd773, %rd774, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10166, %r10132, 16480; + add.s32 %r10167, %r10166, %r8979; + bfe.u32 %r10168, %r10167, 4, 14; + cvt.u64.u32 %rd825, %r10168; + or.b64 %rd775, %rd825, 4611686293372403712; + add.s32 %r10169, %r9020, 8288; + bfe.u32 %r10170, %r10169, 4, 14; + cvt.u64.u32 %rd826, %r10170; + or.b64 %rd776, %rd826, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd775, %rd776, %p296, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r8980, %r9018; + mov.b32 %r8981, %r9018; + mov.b32 %r8983, %r9018; + mov.b32 %r8984, %r9018; + mov.b32 %r8982, %r9020; + // begin inline asm + // wait for regs: %r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594,%r8979,%r8980,%r8981,%r8982,%r8983,%r8984 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:461:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10171, %r8563, 0f3DB504F3; + mul.f32 %r10172, %r8564, 0f3DB504F3; + mul.f32 %r10173, %r8565, 0f3DB504F3; + mul.f32 %r10174, %r8566, 0f3DB504F3; + mul.f32 %r10175, %r8567, 0f3DB504F3; + mul.f32 %r10176, %r8568, 0f3DB504F3; + mul.f32 %r10177, %r8569, 0f3DB504F3; + mul.f32 %r10178, %r8570, 0f3DB504F3; + mul.f32 %r10179, %r8571, 0f3DB504F3; + mul.f32 %r10180, %r8572, 0f3DB504F3; + mul.f32 %r10181, %r8573, 0f3DB504F3; + mul.f32 %r10182, %r8574, 0f3DB504F3; + mul.f32 %r10183, %r8575, 0f3DB504F3; + mul.f32 %r10184, %r8576, 0f3DB504F3; + mul.f32 %r10185, %r8577, 0f3DB504F3; + mul.f32 %r10186, %r8578, 0f3DB504F3; + mul.f32 %r10187, %r8579, 0f3DB504F3; + mul.f32 %r10188, %r8580, 0f3DB504F3; + mul.f32 %r10189, %r8581, 0f3DB504F3; + mul.f32 %r10190, %r8582, 0f3DB504F3; + mul.f32 %r10191, %r8583, 0f3DB504F3; + mul.f32 %r10192, %r8584, 0f3DB504F3; + mul.f32 %r10193, %r8585, 0f3DB504F3; + mul.f32 %r10194, %r8586, 0f3DB504F3; + mul.f32 %r10195, %r8587, 0f3DB504F3; + mul.f32 %r10196, %r8588, 0f3DB504F3; + mul.f32 %r10197, %r8589, 0f3DB504F3; + mul.f32 %r10198, %r8590, 0f3DB504F3; + mul.f32 %r10199, %r8591, 0f3DB504F3; + mul.f32 %r10200, %r8592, 0f3DB504F3; + mul.f32 %r10201, %r8593, 0f3DB504F3; + mul.f32 %r10202, %r8594, 0f3DB504F3; + .loc 1 482 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:482:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.ge.s32 %p320, %r10126, %r13117; + setp.ge.s32 %p321, %r10126, %r13116; + setp.ge.s32 %p322, %r10125, %r13117; + setp.ge.s32 %p323, %r10125, %r13116; + setp.ge.s32 %p324, %r10126, %r13115; + setp.ge.s32 %p325, %r10126, %r13114; + setp.ge.s32 %p326, %r10125, %r13115; + setp.ge.s32 %p327, %r10125, %r13114; + setp.ge.s32 %p328, %r10126, %r13113; + setp.ge.s32 %p329, %r10126, %r13112; + setp.ge.s32 %p330, %r10125, %r13113; + setp.ge.s32 %p331, %r10125, %r13112; + setp.ge.s32 %p332, %r10126, %r13111; + setp.ge.s32 %p333, %r10126, %r13110; + setp.ge.s32 %p334, %r10125, %r13111; + setp.ge.s32 %p335, %r10125, %r13110; + setp.ge.s32 %p336, %r10126, %r13109; + setp.ge.s32 %p337, %r10126, %r13108; + setp.ge.s32 %p338, %r10125, %r13109; + setp.ge.s32 %p339, %r10125, %r13108; + setp.ge.s32 %p340, %r10126, %r13107; + setp.ge.s32 %p341, %r10126, %r13106; + setp.ge.s32 %p342, %r10125, %r13107; + setp.ge.s32 %p343, %r10125, %r13106; + setp.ge.s32 %p344, %r10126, %r13105; + setp.ge.s32 %p345, %r10126, %r13104; + setp.ge.s32 %p346, %r10125, %r13105; + setp.ge.s32 %p347, %r10125, %r13104; + setp.ge.s32 %p348, %r10126, %r13103; + setp.ge.s32 %p349, %r10126, %r13102; + setp.ge.s32 %p350, %r10125, %r13103; + setp.ge.s32 %p351, %r10125, %r13102; + .loc 1 490 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:490:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.pred %p352, %p317, %p320; + and.pred %p353, %p317, %p321; + and.pred %p354, %p316, %p322; + and.pred %p355, %p316, %p323; + and.pred %p356, %p317, %p324; + and.pred %p357, %p317, %p325; + and.pred %p358, %p316, %p326; + and.pred %p359, %p316, %p327; + and.pred %p360, %p317, %p328; + and.pred %p361, %p317, %p329; + and.pred %p362, %p316, %p330; + and.pred %p363, %p316, %p331; + and.pred %p364, %p317, %p332; + and.pred %p365, %p317, %p333; + and.pred %p366, %p316, %p334; + and.pred %p367, %p316, %p335; + and.pred %p368, %p317, %p336; + and.pred %p369, %p317, %p337; + and.pred %p370, %p316, %p338; + and.pred %p371, %p316, %p339; + and.pred %p372, %p317, %p340; + and.pred %p373, %p317, %p341; + and.pred %p374, %p316, %p342; + and.pred %p375, %p316, %p343; + and.pred %p376, %p317, %p344; + and.pred %p377, %p317, %p345; + and.pred %p378, %p316, %p346; + and.pred %p379, %p316, %p347; + and.pred %p380, %p317, %p348; + and.pred %p381, %p317, %p349; + and.pred %p382, %p316, %p350; + and.pred %p383, %p316, %p351; + .loc 1 493 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:493:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.gt.s32 %p384, %r13117, 2047; + setp.gt.s32 %p385, %r13115, 2047; + setp.gt.s32 %p386, %r13113, 2047; + setp.gt.s32 %p387, %r13111, 2047; + setp.gt.s32 %p388, %r13109, 2047; + setp.gt.s32 %p389, %r13107, 2047; + setp.gt.s32 %p390, %r13105, 2047; + setp.gt.s32 %p391, %r13103, 2047; + .loc 1 494 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:494:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shr.s32 %r10203, %r13102, 31; + shr.u32 %r10204, %r10203, 21; + add.s32 %r10205, %r13102, %r10204; + and.b32 %r10206, %r10205, -2048; + sub.s32 %r10207, %r13102, %r10206; + shr.s32 %r10208, %r13104, 31; + shr.u32 %r10209, %r10208, 21; + add.s32 %r10210, %r13104, %r10209; + and.b32 %r10211, %r10210, -2048; + sub.s32 %r10212, %r13104, %r10211; + shr.s32 %r10213, %r13106, 31; + shr.u32 %r10214, %r10213, 21; + add.s32 %r10215, %r13106, %r10214; + and.b32 %r10216, %r10215, -2048; + sub.s32 %r10217, %r13106, %r10216; + shr.s32 %r10218, %r13108, 31; + shr.u32 %r10219, %r10218, 21; + add.s32 %r10220, %r13108, %r10219; + and.b32 %r10221, %r10220, -2048; + sub.s32 %r10222, %r13108, %r10221; + shr.s32 %r10223, %r13110, 31; + shr.u32 %r10224, %r10223, 21; + add.s32 %r10225, %r13110, %r10224; + and.b32 %r10226, %r10225, -2048; + sub.s32 %r10227, %r13110, %r10226; + shr.s32 %r10228, %r13112, 31; + shr.u32 %r10229, %r10228, 21; + add.s32 %r10230, %r13112, %r10229; + and.b32 %r10231, %r10230, -2048; + sub.s32 %r10232, %r13112, %r10231; + shr.s32 %r10233, %r13114, 31; + shr.u32 %r10234, %r10233, 21; + add.s32 %r10235, %r13114, %r10234; + and.b32 %r10236, %r10235, -2048; + sub.s32 %r10237, %r13114, %r10236; + shr.s32 %r10238, %r13116, 31; + shr.u32 %r10239, %r10238, 21; + add.s32 %r10240, %r13116, %r10239; + and.b32 %r10241, %r10240, -2048; + sub.s32 %r10242, %r13116, %r10241; + .loc 1 496 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:496:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.ne.b32 %p392, %r10242, 0; + setp.ne.b32 %p393, %r10237, 0; + setp.ne.b32 %p394, %r10232, 0; + setp.ne.b32 %p395, %r10227, 0; + setp.ne.b32 %p396, %r10222, 0; + setp.ne.b32 %p397, %r10217, 0; + setp.ne.b32 %p398, %r10212, 0; + setp.ne.b32 %p399, %r10207, 0; + .loc 1 497 92 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:497:92 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.b32 %r10243, %r13103, -2147481601; + and.b32 %r10244, %r13105, -2147481601; + and.b32 %r10245, %r13107, -2147481601; + and.b32 %r10246, %r13109, -2147481601; + and.b32 %r10247, %r13111, -2147481601; + and.b32 %r10248, %r13113, -2147481601; + and.b32 %r10249, %r13115, -2147481601; + and.b32 %r10250, %r13117, -2147481601; + setp.gt.u32 %p400, %r10250, -2147483648; + setp.gt.u32 %p401, %r10249, -2147483648; + setp.gt.u32 %p402, %r10248, -2147483648; + setp.gt.u32 %p403, %r10247, -2147483648; + setp.gt.u32 %p404, %r10246, -2147483648; + setp.gt.u32 %p405, %r10245, -2147483648; + setp.gt.u32 %p406, %r10244, -2147483648; + setp.gt.u32 %p407, %r10243, -2147483648; + .loc 1 503 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:503:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + prmt.b32 %r10251, %r13115, %r13117, 0x5410U; + prmt.b32 %r10252, %r13111, %r13113, 0x5410U; + prmt.b32 %r10253, %r13107, %r13109, 0x5410U; + prmt.b32 %r10254, %r13103, %r13105, 0x5410U; + and.b32 %r10255, %r10254, 134154239; + and.b32 %r10256, %r10253, 134154239; + and.b32 %r10257, %r10252, 134154239; + and.b32 %r10258, %r10251, 134154239; + .loc 1 504 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:504:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mov.b32 {%rs65, %rs66}, %r10258; + cvt.u64.u16 %rd827, %rs66; + cvt.u64.u16 %rd828, %rs65; + mov.b32 {%rs67, %rs68}, %r10257; + cvt.u64.u16 %rd829, %rs68; + cvt.u64.u16 %rd830, %rs67; + mov.b32 {%rs69, %rs70}, %r10256; + cvt.u64.u16 %rd831, %rs70; + cvt.u64.u16 %rd832, %rs69; + mov.b32 {%rs71, %rs72}, %r10255; + cvt.u64.u16 %rd833, %rs72; + cvt.u64.u16 %rd834, %rs71; + setp.gt.s64 %p408, %rd19, %rd834; + setp.gt.s64 %p409, %rd19, %rd833; + setp.gt.s64 %p410, %rd19, %rd832; + setp.gt.s64 %p411, %rd19, %rd831; + setp.gt.s64 %p412, %rd19, %rd830; + setp.gt.s64 %p413, %rd19, %rd829; + setp.gt.s64 %p414, %rd19, %rd828; + setp.gt.s64 %p415, %rd19, %rd827; + .loc 1 501 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:501:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r10259, %r10242, 2048; + add.s32 %r10260, %r10237, 2048; + add.s32 %r10261, %r10232, 2048; + add.s32 %r10262, %r10227, 2048; + add.s32 %r10263, %r10222, 2048; + add.s32 %r10264, %r10217, 2048; + add.s32 %r10265, %r10212, 2048; + add.s32 %r10266, %r10207, 2048; + .loc 1 502 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:502:39 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b32 %r10267, %r10266, %r10207, %p399; + selp.b32 %r10268, %r10267, %r10207, %p407; + selp.b32 %r10269, %r10265, %r10212, %p398; + selp.b32 %r10270, %r10269, %r10212, %p406; + selp.b32 %r10271, %r10264, %r10217, %p397; + selp.b32 %r10272, %r10271, %r10217, %p405; + selp.b32 %r10273, %r10263, %r10222, %p396; + selp.b32 %r10274, %r10273, %r10222, %p404; + selp.b32 %r10275, %r10262, %r10227, %p395; + selp.b32 %r10276, %r10275, %r10227, %p403; + selp.b32 %r10277, %r10261, %r10232, %p394; + selp.b32 %r10278, %r10277, %r10232, %p402; + selp.b32 %r10279, %r10260, %r10237, %p393; + selp.b32 %r10280, %r10279, %r10237, %p401; + selp.b32 %r10281, %r10259, %r10242, %p392; + selp.b32 %r10282, %r10281, %r10242, %p400; + .loc 1 504 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:504:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.s64.s32 %rd835, %r10282; + cvt.s64.s32 %rd836, %r10280; + cvt.s64.s32 %rd837, %r10278; + cvt.s64.s32 %rd838, %r10276; + cvt.s64.s32 %rd839, %r10274; + cvt.s64.s32 %rd840, %r10272; + cvt.s64.s32 %rd841, %r10270; + cvt.s64.s32 %rd842, %r10268; + setp.gt.s64 %p416, %rd19, %rd842; + setp.gt.s64 %p417, %rd19, %rd841; + setp.gt.s64 %p418, %rd19, %rd840; + setp.gt.s64 %p419, %rd19, %rd839; + setp.gt.s64 %p420, %rd19, %rd838; + setp.gt.s64 %p421, %rd19, %rd837; + setp.gt.s64 %p422, %rd19, %rd836; + setp.gt.s64 %p423, %rd19, %rd835; + .loc 1 505 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:505:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.pred %p424, %p384, %p415; + and.pred %p425, %p384, %p423; + and.pred %p426, %p385, %p414; + and.pred %p427, %p385, %p422; + and.pred %p428, %p386, %p413; + and.pred %p429, %p386, %p421; + and.pred %p430, %p387, %p412; + and.pred %p431, %p387, %p420; + and.pred %p432, %p388, %p411; + and.pred %p433, %p388, %p419; + and.pred %p434, %p389, %p410; + and.pred %p435, %p389, %p418; + and.pred %p436, %p390, %p409; + and.pred %p437, %p390, %p417; + and.pred %p438, %p391, %p408; + and.pred %p439, %p391, %p416; + .loc 1 506 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:506:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + sub.s32 %r10283, %r13106, %r21; + sub.s32 %r10284, %r13107, %r21; + sub.s32 %r10285, %r13106, %r19; + sub.s32 %r10286, %r13107, %r19; + sub.s32 %r10287, %r13108, %r21; + sub.s32 %r10288, %r13109, %r21; + sub.s32 %r10289, %r13108, %r19; + sub.s32 %r10290, %r13109, %r19; + sub.s32 %r10291, %r13102, %r21; + sub.s32 %r10292, %r13103, %r21; + sub.s32 %r10293, %r13102, %r19; + sub.s32 %r10294, %r13103, %r19; + sub.s32 %r10295, %r13104, %r21; + sub.s32 %r10296, %r13105, %r21; + sub.s32 %r10297, %r13104, %r19; + sub.s32 %r10298, %r13105, %r19; + sub.s32 %r10299, %r13110, %r19; + sub.s32 %r10300, %r13111, %r19; + sub.s32 %r10301, %r13110, %r21; + sub.s32 %r10302, %r13111, %r21; + sub.s32 %r10303, %r13112, %r19; + sub.s32 %r10304, %r13113, %r19; + sub.s32 %r10305, %r13112, %r21; + sub.s32 %r10306, %r13113, %r21; + sub.s32 %r10307, %r13114, %r19; + sub.s32 %r10308, %r13115, %r19; + sub.s32 %r10309, %r13114, %r21; + sub.s32 %r10310, %r13115, %r21; + sub.s32 %r10311, %r13116, %r19; + sub.s32 %r10312, %r13117, %r19; + sub.s32 %r10313, %r13116, %r21; + sub.s32 %r10314, %r13117, %r21; + .loc 1 513 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:513:39 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.b32 %r10315, %r10314, 2047; + and.b32 %r10316, %r10313, 2047; + and.b32 %r10317, %r10312, 2047; + and.b32 %r10318, %r10311, 2047; + and.b32 %r10319, %r10310, 2047; + and.b32 %r10320, %r10309, 2047; + and.b32 %r10321, %r10308, 2047; + and.b32 %r10322, %r10307, 2047; + and.b32 %r10323, %r10306, 2047; + and.b32 %r10324, %r10305, 2047; + and.b32 %r10325, %r10304, 2047; + and.b32 %r10326, %r10303, 2047; + and.b32 %r10327, %r10302, 2047; + and.b32 %r10328, %r10301, 2047; + and.b32 %r10329, %r10300, 2047; + and.b32 %r10330, %r10299, 2047; + and.b32 %r10331, %r10298, 2047; + and.b32 %r10332, %r10297, 2047; + and.b32 %r10333, %r10296, 2047; + and.b32 %r10334, %r10295, 2047; + and.b32 %r10335, %r10294, 2047; + and.b32 %r10336, %r10293, 2047; + and.b32 %r10337, %r10292, 2047; + and.b32 %r10338, %r10291, 2047; + and.b32 %r10339, %r10290, 2047; + and.b32 %r10340, %r10289, 2047; + and.b32 %r10341, %r10288, 2047; + and.b32 %r10342, %r10287, 2047; + and.b32 %r10343, %r10286, 2047; + and.b32 %r10344, %r10285, 2047; + and.b32 %r10345, %r10284, 2047; + and.b32 %r10346, %r10283, 2047; + .loc 1 514 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:514:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.eq.b32 %p440, %r10346, 0; + selp.b16 %rs73, 1, 0, %p440; + shl.b16 %rs74, %rs73, 2; + setp.eq.b32 %p441, %r10345, 0; + selp.b16 %rs75, -1, 0, %p441; + shl.b16 %rs76, %rs75, 3; + or.b16 %rs77, %rs76, %rs74; + setp.eq.b32 %p442, %r10344, 0; + selp.b16 %rs78, 1, 0, %p442; + setp.eq.b32 %p443, %r10343, 0; + selp.b16 %rs79, -1, 0, %p443; + shl.b16 %rs80, %rs79, 1; + or.b16 %rs81, %rs78, %rs80; + and.b16 %rs82, %rs81, 3; + or.b16 %rs83, %rs82, %rs77; + and.b16 %rs84, %rs83, 15; + shl.b16 %rs85, %rs84, 8; + setp.eq.b32 %p444, %r10342, 0; + selp.b16 %rs86, 1, 0, %p444; + shl.b16 %rs87, %rs86, 2; + setp.eq.b32 %p445, %r10341, 0; + selp.b16 %rs88, -1, 0, %p445; + shl.b16 %rs89, %rs88, 3; + or.b16 %rs90, %rs89, %rs87; + setp.eq.b32 %p446, %r10340, 0; + selp.b16 %rs91, 1, 0, %p446; + setp.eq.b32 %p447, %r10339, 0; + selp.b16 %rs92, -1, 0, %p447; + shl.b16 %rs93, %rs92, 1; + or.b16 %rs94, %rs91, %rs93; + and.b16 %rs95, %rs94, 3; + or.b16 %rs96, %rs95, %rs90; + shl.b16 %rs97, %rs96, 12; + or.b16 %rs98, %rs97, %rs85; + setp.eq.b32 %p448, %r10338, 0; + selp.b16 %rs99, 1, 0, %p448; + shl.b16 %rs100, %rs99, 2; + setp.eq.b32 %p449, %r10337, 0; + selp.b16 %rs101, -1, 0, %p449; + shl.b16 %rs102, %rs101, 3; + or.b16 %rs103, %rs102, %rs100; + setp.eq.b32 %p450, %r10336, 0; + selp.b16 %rs104, 1, 0, %p450; + setp.eq.b32 %p451, %r10335, 0; + selp.b16 %rs105, -1, 0, %p451; + shl.b16 %rs106, %rs105, 1; + or.b16 %rs107, %rs104, %rs106; + and.b16 %rs108, %rs107, 3; + or.b16 %rs109, %rs108, %rs103; + and.b16 %rs110, %rs109, 15; + setp.eq.b32 %p452, %r10334, 0; + selp.b16 %rs111, 1, 0, %p452; + shl.b16 %rs112, %rs111, 2; + setp.eq.b32 %p453, %r10333, 0; + selp.b16 %rs113, -1, 0, %p453; + shl.b16 %rs114, %rs113, 3; + or.b16 %rs115, %rs114, %rs112; + setp.eq.b32 %p454, %r10332, 0; + selp.b16 %rs116, 1, 0, %p454; + setp.eq.b32 %p455, %r10331, 0; + selp.b16 %rs117, -1, 0, %p455; + shl.b16 %rs118, %rs117, 1; + or.b16 %rs119, %rs116, %rs118; + and.b16 %rs120, %rs119, 3; + or.b16 %rs121, %rs120, %rs115; + shl.b16 %rs122, %rs121, 4; + or.b16 %rs123, %rs110, %rs122; + and.b16 %rs124, %rs123, 255; + or.b16 %rs125, %rs124, %rs98; + cvt.u32.u16 %r10347, %rs125; + setp.eq.b32 %p456, %r10330, 0; + setp.eq.b32 %p457, %r10329, 0; + setp.eq.b32 %p458, %r10328, 0; + setp.eq.b32 %p459, %r10327, 0; + setp.eq.b32 %p460, %r10326, 0; + setp.eq.b32 %p461, %r10325, 0; + setp.eq.b32 %p462, %r10324, 0; + setp.eq.b32 %p463, %r10323, 0; + setp.eq.b32 %p464, %r10322, 0; + setp.eq.b32 %p465, %r10321, 0; + setp.eq.b32 %p466, %r10320, 0; + setp.eq.b32 %p467, %r10319, 0; + setp.eq.b32 %p468, %r10318, 0; + setp.eq.b32 %p469, %r10317, 0; + setp.eq.b32 %p470, %r10316, 0; + setp.eq.b32 %p471, %r10315, 0; + .loc 1 515 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:515:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.pred %p472, %p471, %p424; + and.pred %p473, %p470, %p425; + and.pred %p474, %p469, %p424; + and.pred %p475, %p468, %p425; + and.pred %p476, %p467, %p426; + and.pred %p477, %p466, %p427; + and.pred %p478, %p465, %p426; + and.pred %p479, %p464, %p427; + and.pred %p480, %p463, %p428; + and.pred %p481, %p462, %p429; + and.pred %p482, %p461, %p428; + and.pred %p483, %p460, %p429; + and.pred %p484, %p459, %p430; + and.pred %p485, %p458, %p431; + and.pred %p486, %p457, %p430; + and.pred %p487, %p456, %p431; + shr.u32 %r10348, %r10347, 15; + and.b32 %r10349, %r10348, 1; + setp.ne.b32 %p488, %r10349, 0; + and.pred %p489, %p488, %p432; + shr.u32 %r10350, %r10347, 14; + and.b32 %r10351, %r10350, 1; + setp.ne.b32 %p490, %r10351, 0; + and.pred %p491, %p490, %p433; + shr.u32 %r10352, %r10347, 13; + and.b32 %r10353, %r10352, 1; + setp.ne.b32 %p492, %r10353, 0; + and.pred %p493, %p492, %p432; + shr.u32 %r10354, %r10347, 12; + and.b32 %r10355, %r10354, 1; + setp.ne.b32 %p494, %r10355, 0; + and.pred %p495, %p494, %p433; + shr.u32 %r10356, %r10347, 11; + and.b32 %r10357, %r10356, 1; + setp.ne.b32 %p496, %r10357, 0; + and.pred %p497, %p496, %p434; + shr.u32 %r10358, %r10347, 10; + and.b32 %r10359, %r10358, 1; + setp.ne.b32 %p498, %r10359, 0; + and.pred %p499, %p498, %p435; + shr.u32 %r10360, %r10347, 9; + and.b32 %r10361, %r10360, 1; + setp.ne.b32 %p500, %r10361, 0; + and.pred %p501, %p500, %p434; + shr.u32 %r10362, %r10347, 8; + and.b32 %r10363, %r10362, 1; + setp.ne.b32 %p502, %r10363, 0; + and.pred %p503, %p502, %p435; + shr.u32 %r10364, %r10347, 7; + and.b32 %r10365, %r10364, 1; + setp.ne.b32 %p504, %r10365, 0; + and.pred %p505, %p504, %p436; + shr.u32 %r10366, %r10347, 6; + and.b32 %r10367, %r10366, 1; + setp.ne.b32 %p506, %r10367, 0; + and.pred %p507, %p506, %p437; + shr.u32 %r10368, %r10347, 5; + and.b32 %r10369, %r10368, 1; + setp.ne.b32 %p508, %r10369, 0; + and.pred %p509, %p508, %p436; + shr.u32 %r10370, %r10347, 4; + and.b32 %r10371, %r10370, 1; + setp.ne.b32 %p510, %r10371, 0; + and.pred %p511, %p510, %p437; + shr.u32 %r10372, %r10347, 3; + and.b32 %r10373, %r10372, 1; + setp.ne.b32 %p512, %r10373, 0; + and.pred %p513, %p512, %p438; + shr.u32 %r10374, %r10347, 2; + and.b32 %r10375, %r10374, 1; + setp.ne.b32 %p514, %r10375, 0; + and.pred %p515, %p514, %p439; + shr.u32 %r10376, %r10347, 1; + and.b32 %r10377, %r10376, 1; + setp.ne.b32 %p516, %r10377, 0; + and.pred %p517, %p516, %p438; + and.pred %p518, %p450, %p439; + .loc 1 516 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:516:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + or.pred %p519, %p352, %p472; + or.pred %p520, %p353, %p473; + or.pred %p521, %p354, %p474; + or.pred %p522, %p355, %p475; + or.pred %p523, %p356, %p476; + or.pred %p524, %p357, %p477; + or.pred %p525, %p358, %p478; + or.pred %p526, %p359, %p479; + or.pred %p527, %p360, %p480; + or.pred %p528, %p361, %p481; + or.pred %p529, %p362, %p482; + or.pred %p530, %p363, %p483; + or.pred %p531, %p364, %p484; + or.pred %p532, %p365, %p485; + or.pred %p533, %p366, %p486; + or.pred %p534, %p367, %p487; + or.pred %p535, %p368, %p489; + or.pred %p536, %p369, %p491; + or.pred %p537, %p370, %p493; + or.pred %p538, %p371, %p495; + or.pred %p539, %p372, %p497; + or.pred %p540, %p373, %p499; + or.pred %p541, %p374, %p501; + or.pred %p542, %p375, %p503; + or.pred %p543, %p376, %p505; + or.pred %p544, %p377, %p507; + or.pred %p545, %p378, %p509; + or.pred %p546, %p379, %p511; + or.pred %p547, %p380, %p513; + or.pred %p548, %p381, %p515; + or.pred %p549, %p382, %p517; + or.pred %p550, %p383, %p518; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10378, %r10171, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10379, %r10378, 0fFF800000, %p519; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10380, %r10172, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10381, %r10380, 0fFF800000, %p520; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10382, %r10173, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10383, %r10382, 0fFF800000, %p521; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10384, %r10174, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10385, %r10384, 0fFF800000, %p522; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10386, %r10175, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10387, %r10386, 0fFF800000, %p523; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10388, %r10176, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10389, %r10388, 0fFF800000, %p524; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10390, %r10177, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10391, %r10390, 0fFF800000, %p525; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10392, %r10178, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10393, %r10392, 0fFF800000, %p526; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10394, %r10179, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10395, %r10394, 0fFF800000, %p527; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10396, %r10180, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10397, %r10396, 0fFF800000, %p528; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10398, %r10181, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10399, %r10398, 0fFF800000, %p529; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10400, %r10182, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10401, %r10400, 0fFF800000, %p530; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10402, %r10183, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10403, %r10402, 0fFF800000, %p531; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10404, %r10184, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10405, %r10404, 0fFF800000, %p532; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10406, %r10185, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10407, %r10406, 0fFF800000, %p533; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10408, %r10186, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10409, %r10408, 0fFF800000, %p534; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10410, %r10187, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10411, %r10410, 0fFF800000, %p535; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10412, %r10188, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10413, %r10412, 0fFF800000, %p536; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10414, %r10189, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10415, %r10414, 0fFF800000, %p537; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10416, %r10190, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10417, %r10416, 0fFF800000, %p538; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10418, %r10191, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10419, %r10418, 0fFF800000, %p539; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10420, %r10192, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10421, %r10420, 0fFF800000, %p540; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10422, %r10193, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10423, %r10422, 0fFF800000, %p541; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10424, %r10194, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10425, %r10424, 0fFF800000, %p542; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10426, %r10195, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10427, %r10426, 0fFF800000, %p543; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10428, %r10196, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10429, %r10428, 0fFF800000, %p544; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10430, %r10197, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10431, %r10430, 0fFF800000, %p545; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10432, %r10198, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10433, %r10432, 0fFF800000, %p546; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10434, %r10199, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10435, %r10434, 0fFF800000, %p547; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10436, %r10200, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10437, %r10436, 0fFF800000, %p548; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10438, %r10201, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10439, %r10438, 0fFF800000, %p549; + .loc 1 524 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:524:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10440, %r10202, 0f3FB8AA3B; + .loc 1 521 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:521:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.f32 %r10441, %r10440, 0fFF800000, %p550; + .loc 1 525 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:525:39 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + sub.f32 %r10442, %r10379, %r53; + sub.f32 %r10443, %r10381, %r53; + sub.f32 %r10444, %r10383, %r54; + sub.f32 %r10445, %r10385, %r54; + sub.f32 %r10446, %r10387, %r53; + sub.f32 %r10447, %r10389, %r53; + sub.f32 %r10448, %r10391, %r54; + sub.f32 %r10449, %r10393, %r54; + sub.f32 %r10450, %r10395, %r53; + sub.f32 %r10451, %r10397, %r53; + sub.f32 %r10452, %r10399, %r54; + sub.f32 %r10453, %r10401, %r54; + sub.f32 %r10454, %r10403, %r53; + sub.f32 %r10455, %r10405, %r53; + sub.f32 %r10456, %r10407, %r54; + sub.f32 %r10457, %r10409, %r54; + sub.f32 %r10458, %r10411, %r53; + sub.f32 %r10459, %r10413, %r53; + sub.f32 %r10460, %r10415, %r54; + sub.f32 %r10461, %r10417, %r54; + sub.f32 %r10462, %r10419, %r53; + sub.f32 %r10463, %r10421, %r53; + sub.f32 %r10464, %r10423, %r54; + sub.f32 %r10465, %r10425, %r54; + sub.f32 %r10466, %r10427, %r53; + sub.f32 %r10467, %r10429, %r53; + sub.f32 %r10468, %r10431, %r54; + sub.f32 %r10469, %r10433, %r54; + sub.f32 %r10470, %r10435, %r53; + sub.f32 %r10471, %r10437, %r53; + sub.f32 %r10472, %r10439, %r54; + sub.f32 %r10473, %r10441, %r54; + .loc 1 525 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:525:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + ex2.approx.ftz.f32 %r10474, %r10442; + ex2.approx.ftz.f32 %r10475, %r10443; + ex2.approx.ftz.f32 %r10476, %r10444; + ex2.approx.ftz.f32 %r10477, %r10445; + ex2.approx.ftz.f32 %r10478, %r10446; + ex2.approx.ftz.f32 %r10479, %r10447; + ex2.approx.ftz.f32 %r10480, %r10448; + ex2.approx.ftz.f32 %r10481, %r10449; + ex2.approx.ftz.f32 %r10482, %r10450; + ex2.approx.ftz.f32 %r10483, %r10451; + ex2.approx.ftz.f32 %r10484, %r10452; + ex2.approx.ftz.f32 %r10485, %r10453; + ex2.approx.ftz.f32 %r10486, %r10454; + ex2.approx.ftz.f32 %r10487, %r10455; + ex2.approx.ftz.f32 %r10488, %r10456; + ex2.approx.ftz.f32 %r10489, %r10457; + ex2.approx.ftz.f32 %r10490, %r10458; + ex2.approx.ftz.f32 %r10491, %r10459; + ex2.approx.ftz.f32 %r10492, %r10460; + ex2.approx.ftz.f32 %r10493, %r10461; + ex2.approx.ftz.f32 %r10494, %r10462; + ex2.approx.ftz.f32 %r10495, %r10463; + ex2.approx.ftz.f32 %r10496, %r10464; + ex2.approx.ftz.f32 %r10497, %r10465; + ex2.approx.ftz.f32 %r10498, %r10466; + ex2.approx.ftz.f32 %r10499, %r10467; + ex2.approx.ftz.f32 %r10500, %r10468; + ex2.approx.ftz.f32 %r10501, %r10469; + ex2.approx.ftz.f32 %r10502, %r10470; + ex2.approx.ftz.f32 %r10503, %r10471; + ex2.approx.ftz.f32 %r10504, %r10472; + ex2.approx.ftz.f32 %r10505, %r10473; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r10506, %r8442, 49152; + add.s32 %r9538, %r10506, %r10128; + .loc 1 530 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:530:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + wgmma.fence.sync.aligned; + add.s32 %r9535, %r8442, 131072; + add.s32 %r10507, %r10132, %r9535; + bfe.u32 %r10508, %r10507, 4, 14; + cvt.u64.u32 %rd843, %r10508; + or.b64 %rd777, %rd843, 4611686293372403712; + bfe.u32 %r10509, %r9538, 4, 14; + cvt.u64.u32 %rd844, %r10509; + or.b64 %rd778, %rd844, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd777, %rd778, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r10510, %r10136, %r9535; + bfe.u32 %r10511, %r10510, 4, 14; + cvt.u64.u32 %rd845, %r10511; + or.b64 %rd779, %rd845, 4611686293372403712; + add.s32 %r10512, %r9538, 32; + bfe.u32 %r10513, %r10512, 4, 14; + cvt.u64.u32 %rd846, %r10513; + or.b64 %rd780, %rd846, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd779, %rd780, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10514, %r10141, %r9535; + bfe.u32 %r10515, %r10514, 4, 14; + cvt.u64.u32 %rd847, %r10515; + or.b64 %rd781, %rd847, 4611686293372403712; + add.s32 %r10516, %r9538, 64; + bfe.u32 %r10517, %r10516, 4, 14; + cvt.u64.u32 %rd848, %r10517; + or.b64 %rd782, %rd848, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd781, %rd782, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10518, %r10146, %r9535; + bfe.u32 %r10519, %r10518, 4, 14; + cvt.u64.u32 %rd849, %r10519; + or.b64 %rd783, %rd849, 4611686293372403712; + add.s32 %r10520, %r9538, 96; + bfe.u32 %r10521, %r10520, 4, 14; + cvt.u64.u32 %rd850, %r10521; + or.b64 %rd784, %rd850, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd783, %rd784, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10522, %r10151, %r9535; + bfe.u32 %r10523, %r10522, 4, 14; + cvt.u64.u32 %rd851, %r10523; + or.b64 %rd785, %rd851, 4611686293372403712; + add.s32 %r10524, %r9538, 8192; + bfe.u32 %r10525, %r10524, 4, 14; + cvt.u64.u32 %rd852, %r10525; + or.b64 %rd786, %rd852, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd785, %rd786, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10526, %r10156, %r9535; + bfe.u32 %r10527, %r10526, 4, 14; + cvt.u64.u32 %rd853, %r10527; + or.b64 %rd787, %rd853, 4611686293372403712; + add.s32 %r10528, %r9538, 8224; + bfe.u32 %r10529, %r10528, 4, 14; + cvt.u64.u32 %rd854, %r10529; + or.b64 %rd788, %rd854, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd787, %rd788, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10530, %r10161, %r9535; + bfe.u32 %r10531, %r10530, 4, 14; + cvt.u64.u32 %rd855, %r10531; + or.b64 %rd789, %rd855, 4611686293372403712; + add.s32 %r10532, %r9538, 8256; + bfe.u32 %r10533, %r10532, 4, 14; + cvt.u64.u32 %rd856, %r10533; + or.b64 %rd790, %rd856, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd789, %rd790, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10534, %r10166, %r9535; + bfe.u32 %r10535, %r10534, 4, 14; + cvt.u64.u32 %rd857, %r10535; + or.b64 %rd791, %rd857, 4611686293372403712; + add.s32 %r10536, %r9538, 8288; + bfe.u32 %r10537, %r10536, 4, 14; + cvt.u64.u32 %rd858, %r10537; + or.b64 %rd792, %rd858, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd791, %rd792, %p296, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r9536, %r9018; + mov.b32 %r9537, %r9018; + mov.b32 %r9539, %r9018; + mov.b32 %r9540, %r9018; + // begin inline asm + // wait for regs: %r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150,%r9535,%r9536,%r9537,%r9538,%r9539,%r9540 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + sub.f32 %r10538, %r9119, %r8358; + sub.f32 %r10539, %r9120, %r8358; + sub.f32 %r10540, %r9121, %r8359; + sub.f32 %r10541, %r9122, %r8359; + sub.f32 %r10542, %r9123, %r8358; + sub.f32 %r10543, %r9124, %r8358; + sub.f32 %r10544, %r9125, %r8359; + sub.f32 %r10545, %r9126, %r8359; + sub.f32 %r10546, %r9127, %r8358; + sub.f32 %r10547, %r9128, %r8358; + sub.f32 %r10548, %r9129, %r8359; + sub.f32 %r10549, %r9130, %r8359; + sub.f32 %r10550, %r9131, %r8358; + sub.f32 %r10551, %r9132, %r8358; + sub.f32 %r10552, %r9133, %r8359; + sub.f32 %r10553, %r9134, %r8359; + sub.f32 %r10554, %r9135, %r8358; + sub.f32 %r10555, %r9136, %r8358; + sub.f32 %r10556, %r9137, %r8359; + sub.f32 %r10557, %r9138, %r8359; + sub.f32 %r10558, %r9139, %r8358; + sub.f32 %r10559, %r9140, %r8358; + sub.f32 %r10560, %r9141, %r8359; + sub.f32 %r10561, %r9142, %r8359; + sub.f32 %r10562, %r9143, %r8358; + sub.f32 %r10563, %r9144, %r8358; + sub.f32 %r10564, %r9145, %r8359; + sub.f32 %r10565, %r9146, %r8359; + sub.f32 %r10566, %r9147, %r8358; + sub.f32 %r10567, %r9148, %r8358; + sub.f32 %r10568, %r9149, %r8359; + sub.f32 %r10569, %r9150, %r8359; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.f32 %r10570, %r10474, %r10538; + mul.f32 %r10571, %r10475, %r10539; + mul.f32 %r10572, %r10476, %r10540; + mul.f32 %r10573, %r10477, %r10541; + mul.f32 %r10574, %r10478, %r10542; + mul.f32 %r10575, %r10479, %r10543; + mul.f32 %r10576, %r10480, %r10544; + mul.f32 %r10577, %r10481, %r10545; + mul.f32 %r10578, %r10482, %r10546; + mul.f32 %r10579, %r10483, %r10547; + mul.f32 %r10580, %r10484, %r10548; + mul.f32 %r10581, %r10485, %r10549; + mul.f32 %r10582, %r10486, %r10550; + mul.f32 %r10583, %r10487, %r10551; + mul.f32 %r10584, %r10488, %r10552; + mul.f32 %r10585, %r10489, %r10553; + mul.f32 %r10586, %r10490, %r10554; + mul.f32 %r10587, %r10491, %r10555; + mul.f32 %r10588, %r10492, %r10556; + mul.f32 %r10589, %r10493, %r10557; + mul.f32 %r10590, %r10494, %r10558; + mul.f32 %r10591, %r10495, %r10559; + mul.f32 %r10592, %r10496, %r10560; + mul.f32 %r10593, %r10497, %r10561; + mul.f32 %r10594, %r10498, %r10562; + mul.f32 %r10595, %r10499, %r10563; + mul.f32 %r10596, %r10500, %r10564; + mul.f32 %r10597, %r10501, %r10565; + mul.f32 %r10598, %r10502, %r10566; + mul.f32 %r10599, %r10503, %r10567; + mul.f32 %r10600, %r10504, %r10568; + mul.f32 %r10601, %r10505, %r10569; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs126, %r10570; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs127, %rs126, 0x0000, %p519; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs128, %r10571; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs129, %rs128, 0x0000, %p520; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs130, %r10572; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs131, %rs130, 0x0000, %p521; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs132, %r10573; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs133, %rs132, 0x0000, %p522; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs134, %r10574; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs135, %rs134, 0x0000, %p523; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs136, %r10575; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs137, %rs136, 0x0000, %p524; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs138, %r10576; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs139, %rs138, 0x0000, %p525; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs140, %r10577; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs141, %rs140, 0x0000, %p526; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs142, %r10578; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs143, %rs142, 0x0000, %p527; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs144, %r10579; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs145, %rs144, 0x0000, %p528; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs146, %r10580; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs147, %rs146, 0x0000, %p529; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs148, %r10581; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs149, %rs148, 0x0000, %p530; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs150, %r10582; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs151, %rs150, 0x0000, %p531; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs152, %r10583; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs153, %rs152, 0x0000, %p532; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs154, %r10584; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs155, %rs154, 0x0000, %p533; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs156, %r10585; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs157, %rs156, 0x0000, %p534; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs158, %r10586; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs159, %rs158, 0x0000, %p535; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs160, %r10587; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs161, %rs160, 0x0000, %p536; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs162, %r10588; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs163, %rs162, 0x0000, %p537; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs164, %r10589; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs165, %rs164, 0x0000, %p538; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs166, %r10590; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs167, %rs166, 0x0000, %p539; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs168, %r10591; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs169, %rs168, 0x0000, %p540; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs170, %r10592; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs171, %rs170, 0x0000, %p541; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs172, %r10593; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs173, %rs172, 0x0000, %p542; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs174, %r10594; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs175, %rs174, 0x0000, %p543; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs176, %r10595; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs177, %rs176, 0x0000, %p544; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs178, %r10596; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs179, %rs178, 0x0000, %p545; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs180, %r10597; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs181, %rs180, 0x0000, %p546; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs182, %r10598; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs183, %rs182, 0x0000, %p547; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs184, %r10599; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs185, %rs184, 0x0000, %p548; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs186, %r10600; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs187, %rs186, 0x0000, %p549; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + cvt.rn.bf16.f32 %rs188, %r10601; + .loc 1 549 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:549:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + selp.b16 %rs189, %rs188, 0x0000, %p550; + .loc 1 553 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:553:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mov.b32 %r9707, {%rs127, %rs129}; + mov.b32 %r9708, {%rs131, %rs133}; + mov.b32 %r9709, {%rs135, %rs137}; + mov.b32 %r9710, {%rs139, %rs141}; + mov.b32 %r9839, {%rs143, %rs145}; + mov.b32 %r9840, {%rs147, %rs149}; + mov.b32 %r9841, {%rs151, %rs153}; + mov.b32 %r9842, {%rs155, %rs157}; + mov.b32 %r9971, {%rs159, %rs161}; + mov.b32 %r9972, {%rs163, %rs165}; + mov.b32 %r9973, {%rs167, %rs169}; + mov.b32 %r9974, {%rs171, %rs173}; + mov.b32 %r10103, {%rs175, %rs177}; + mov.b32 %r10104, {%rs179, %rs181}; + mov.b32 %r10105, {%rs183, %rs185}; + mov.b32 %r10106, {%rs187, %rs189}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9707,%r9708,%r9709,%r9710}, %rd762, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10602, %r9020, 2048; + bfe.u32 %r10603, %r10602, 4, 14; + cvt.u64.u32 %rd859, %r10603; + or.b64 %rd794, %rd859, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9839,%r9840,%r9841,%r9842}, %rd794, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10604, %r9020, 4096; + bfe.u32 %r10605, %r10604, 4, 14; + cvt.u64.u32 %rd860, %r10605; + or.b64 %rd795, %rd860, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9971,%r9972,%r9973,%r9974}, %rd795, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10606, %r9020, 6144; + bfe.u32 %r10607, %r10606, 4, 14; + cvt.u64.u32 %rd861, %r10607; + or.b64 %rd796, %rd861, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r10103,%r10104,%r10105,%r10106}, %rd796, %p296, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:417:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r13102, %r13034, %r13102; + add.s32 %r13103, %r13034, %r13103; + add.s32 %r13104, %r13034, %r13104; + add.s32 %r13105, %r13034, %r13105; + add.s32 %r13106, %r13034, %r13106; + add.s32 %r13107, %r13034, %r13107; + add.s32 %r13108, %r13034, %r13108; + add.s32 %r13109, %r13034, %r13109; + add.s32 %r13110, %r13034, %r13110; + add.s32 %r13111, %r13034, %r13111; + add.s32 %r13112, %r13034, %r13112; + add.s32 %r13113, %r13034, %r13113; + add.s32 %r13114, %r13034, %r13114; + add.s32 %r13115, %r13034, %r13115; + add.s32 %r13116, %r13034, %r13116; + add.s32 %r13117, %r13034, %r13117; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r260, %r13101, 1; + .loc 1 788 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:788:33 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shr.u32 %r10608, %r260, 1; + .loc 1 789 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mad.wide.u32 %rd798, %r10608, 4, %rd704; + .loc 1 789 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + // begin inline asm + mov.u64 %rd797, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd797, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10107, 0x0; + @%p314 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10107 }, [ %rd798 + 0 ], %rd797; + // end inline asm + .loc 1 790 109 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:109 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r10609, %r10608, 1; + .loc 1 790 113 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:113 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.lt.s32 %p551, %r10609, %r8363; + .loc 1 790 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:55 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd801, %rd798, 4; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.pred %p315, %p314, %p551; + .loc 1 790 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + // begin inline asm + mov.u64 %rd800, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd800, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10108, 0x0; + @%p315 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10108 }, [ %rd801 + 0 ], %rd800; + // end inline asm + .loc 1 791 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:791:35 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + and.b32 %r10610, %r13101, 1; + .loc 1 792 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:34 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + sub.s32 %r10611, %r10108, %r10107; + .loc 1 792 48 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:48 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r10612, %r10611, 7; + .loc 1 792 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r10613, %r10612, -64; + .loc 1 793 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + xor.b32 %r10614, %r10610, 1; + .loc 1 793 61 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:61 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r10615, %r10610, 6; + .loc 1 793 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:42 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mad.lo.s32 %r13034, %r10613, %r10614, %r10615; + .loc 1 414 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r10616, %r13034, 7; + .loc 1 414 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + mul.wide.s32 %rd862, %r10616, 2; + add.s64 %rd1014, %rd1014, %rd862; + add.s64 %rd1013, %rd1013, %rd862; + add.s64 %rd1012, %rd1012, %rd862; + add.s64 %rd1011, %rd1011, %rd862; + .loc 1 415 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:415:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s64 %rd1010, %rd1010, %rd862; + add.s64 %rd1009, %rd1009, %rd862; + add.s64 %rd1008, %rd1008, %rd862; + add.s64 %rd1007, %rd1007, %rd862; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + add.s32 %r10617, %r13036, 1; + setp.gt.s32 %p552, %r10617, 2; + selp.b32 %r13036, 0, %r10617, %p552; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + shl.b32 %r10618, %r13036, 14; + add.s32 %r10619, %r8442, %r10618; + bar.sync 0; + add.s32 %r10109, %r10619, %r59; + selp.b32 %r10110, 16, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r10109 + 0 ], [ %rd1014 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10111, %r10109, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10111 + 0 ], [ %rd1013 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10113, %r10109, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10113 + 0 ], [ %rd1012 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10115, %r10109, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10115 + 0 ], [ %rd1011 + 0 ], 0x10, %r10110; + // end inline asm + cp.async.commit_group; + add.s32 %r10620, %r10506, %r10618; + add.s32 %r10117, %r10620, %r59; + // begin inline asm + cp.async.cg.shared.global [ %r10117 + 0 ], [ %rd1010 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10119, %r10117, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10119 + 0 ], [ %rd1009 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10121, %r10117, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10121 + 0 ], [ %rd1008 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10123, %r10117, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10123 + 0 ], [ %rd1007 + 0 ], 0x10, %r10110; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + setp.ne.b32 %p553, %r76, %r260; + mov.b32 %r13101, %r260; + @%p553 bra $L__BB0_3; +$L__tmp9: +$L__BB0_4: // %._crit_edge1847 + .loc 1 0 0 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0 + add.s64 %rd4, %rd184, %rd724; +$L__tmp10: + cvt.u64.u32 %rd5, %r8423; + cvt.u64.u32 %rd6, %r8424; + cvt.u64.u32 %rd7, %r8425; + cvt.u64.u32 %rd8, %r8426; + cvt.u64.u32 %rd9, %r8427; + cvt.u64.u32 %rd10, %r8428; + cvt.u64.u32 %rd11, %r8429; + cvt.u64.u32 %rd12, %r8430; +$L__tmp11: + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:207:12 ] + // begin inline asm + // wait for regs: %r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp12: + .loc 1 214 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:214:39 + shl.b64 %rd881, %rd16, 2; + add.s64 %rd863, %rd190, %rd881; + .loc 1 215 31 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:215:31 + // begin inline asm + mov.u32 %r10749, 0x0; + ld.global.b32 { %r10749 }, [ %rd863 + 0 ]; + // end inline asm + .loc 1 215 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:215:45 + shl.b32 %r10783, %r10749, 7; + .loc 1 216 62 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:216:62 + shl.b64 %rd882, %rd18, 2; + add.s64 %rd864, %rd189, %rd882; + .loc 1 216 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:216:43 + // begin inline asm + mov.u32 %r10750, 0x0; + ld.global.b32 { %r10750 }, [ %rd864 + 0 ]; + // end inline asm + .loc 1 218 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:218:33 + or.b32 %r10784, %r10783, %r9; + or.b32 %r10785, %r10783, %r10; + or.b32 %r10786, %r10783, %r11; + or.b32 %r10787, %r10783, %r12; +$L__tmp13: + .loc 1 390 37 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:37 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r10788, %r10784, 7; + shl.b32 %r10789, %r10785, 7; + shl.b32 %r10790, %r10786, 7; + shl.b32 %r10791, %r10787, 7; + .loc 1 390 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.wide.s32 %rd883, %r10788, 2; + add.s64 %rd884, %rd1, %rd883; + mul.wide.s32 %rd885, %r10789, 2; + add.s64 %rd886, %rd1, %rd885; + mul.wide.s32 %rd887, %r10790, 2; + add.s64 %rd888, %rd1, %rd887; + mul.wide.s32 %rd889, %r10791, 2; + add.s64 %rd890, %rd1, %rd889; + .loc 1 390 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:390:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b64 %rd891, %rd13, 1; + add.s64 %rd865, %rd884, %rd891; + add.s64 %rd866, %rd886, %rd891; + add.s64 %rd867, %rd888, %rd891; + add.s64 %rd868, %rd890, %rd891; + .loc 1 391 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:391:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd892, %rd2, %rd883; + add.s64 %rd893, %rd2, %rd885; + add.s64 %rd894, %rd2, %rd887; + add.s64 %rd895, %rd2, %rd889; + .loc 1 391 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:391:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd869, %rd892, %rd891; + add.s64 %rd870, %rd893, %rd891; + add.s64 %rd871, %rd894, %rd891; + add.s64 %rd872, %rd895, %rd891; + .loc 1 395 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:395:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r328, %r10750, 1; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + setp.lt.s32 %p554, %r328, 1; + setp.gt.s32 %p555, %r328, 0; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + selp.b32 %r10752, 16, 0, %p555; + // begin inline asm + cp.async.cg.shared.global [ %r10751 + 0 ], [ %rd865 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10753 + 0 ], [ %rd866 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10755 + 0 ], [ %rd867 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10757 + 0 ], [ %rd868 + 0 ], 0x10, %r10752; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r8372 + 0 ], [ %rd869 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8374 + 0 ], [ %rd870 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8376 + 0 ], [ %rd871 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8378 + 0 ], [ %rd872 + 0 ], 0x10, %r10752; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + setp.gt.s32 %p556, %r328, 1; + .loc 1 414 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd1022, %rd865, 16384; + add.s64 %rd1021, %rd866, 16384; + add.s64 %rd1020, %rd867, 16384; + add.s64 %rd1019, %rd868, 16384; + .loc 1 415 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:415:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd1018, %rd869, 16384; + add.s64 %rd1017, %rd870, 16384; + add.s64 %rd1016, %rd871, 16384; + add.s64 %rd1015, %rd872, 16384; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + bar.sync 0; + selp.b32 %r10768, 16, 0, %p556; + // begin inline asm + cp.async.cg.shared.global [ %r8380 + 0 ], [ %rd1022 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8382 + 0 ], [ %rd1021 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8384 + 0 ], [ %rd1020 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8386 + 0 ], [ %rd1019 + 0 ], 0x10, %r10768; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r8388 + 0 ], [ %rd1018 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8390 + 0 ], [ %rd1017 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8392 + 0 ], [ %rd1016 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8394 + 0 ], [ %rd1015 + 0 ], 0x10, %r10768; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:459:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + @%p554 bra $L__BB0_7; +// %bb.5: // %.lr.ph1858 + .loc 1 395 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:395:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + min.u32 %r393, %r328, 32; + add.s32 %r394, %r393, -2; + add.s32 %r395, %r393, -1; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mov.b64 %rd61, {%r8359, %r8359}; + mov.b64 %rd62, {%r8358, %r8358}; + mov.b32 %r11346, 0; + mov.b32 %r13183, 1; + mov.b32 %r13182, -1; + mov.b32 %r13248, %r11346; +$L__BB0_6: // %__nv_exp2f.exit1318 + // =>This Inner Loop Header: Depth=1 + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + setp.lt.s32 %p577, %r13248, %r394; + setp.lt.s32 %p575, %r13248, %r395; + add.s32 %r12453, %r13182, 1; + setp.gt.s32 %p578, %r12453, 2; + selp.b32 %r13182, 0, %r12453, %p578; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r12454, %r13182, 14; + add.s32 %r11348, %r8442, %r12454; + .loc 1 459 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:459:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shfl.sync.idx.b32 %r12456, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r12457, %r12456, 11; + and.b32 %r12458, %r12457, 8192; + add.s32 %r11307, %r8442, 98304; + add.s32 %r12459, %r12458, %r11307; + bfe.u32 %r12460, %r12459, 4, 14; + cvt.u64.u32 %rd946, %r12460; + or.b64 %rd896, %rd946, 4611686293372403712; + bfe.u32 %r12461, %r11348, 4, 14; + cvt.u64.u32 %rd947, %r12461; + or.b64 %rd897, %rd947, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd896, %rd897, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r12462, %r12458, 32; + add.s32 %r12463, %r12462, %r11307; + bfe.u32 %r12464, %r12463, 4, 14; + cvt.u64.u32 %rd948, %r12464; + or.b64 %rd898, %rd948, 4611686293372403712; + add.s32 %r12465, %r11348, 32; + bfe.u32 %r12466, %r12465, 4, 14; + cvt.u64.u32 %rd949, %r12466; + or.b64 %rd899, %rd949, 4611686293338849280; + mov.pred %p557, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd898, %rd899, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12467, %r12458, 64; + add.s32 %r12468, %r12467, %r11307; + bfe.u32 %r12469, %r12468, 4, 14; + cvt.u64.u32 %rd950, %r12469; + or.b64 %rd900, %rd950, 4611686293372403712; + add.s32 %r12470, %r11348, 64; + bfe.u32 %r12471, %r12470, 4, 14; + cvt.u64.u32 %rd951, %r12471; + or.b64 %rd901, %rd951, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd900, %rd901, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12472, %r12458, 96; + add.s32 %r12473, %r12472, %r11307; + bfe.u32 %r12474, %r12473, 4, 14; + cvt.u64.u32 %rd952, %r12474; + or.b64 %rd902, %rd952, 4611686293372403712; + add.s32 %r12475, %r11348, 96; + bfe.u32 %r12476, %r12475, 4, 14; + cvt.u64.u32 %rd953, %r12476; + or.b64 %rd903, %rd953, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd902, %rd903, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12477, %r12458, 16384; + add.s32 %r12478, %r12477, %r11307; + bfe.u32 %r12479, %r12478, 4, 14; + cvt.u64.u32 %rd954, %r12479; + or.b64 %rd904, %rd954, 4611686293372403712; + add.s32 %r12480, %r11348, 8192; + bfe.u32 %r12481, %r12480, 4, 14; + cvt.u64.u32 %rd955, %r12481; + or.b64 %rd905, %rd955, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd904, %rd905, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12482, %r12458, 16416; + add.s32 %r12483, %r12482, %r11307; + bfe.u32 %r12484, %r12483, 4, 14; + cvt.u64.u32 %rd956, %r12484; + or.b64 %rd906, %rd956, 4611686293372403712; + add.s32 %r12485, %r11348, 8224; + bfe.u32 %r12486, %r12485, 4, 14; + cvt.u64.u32 %rd957, %r12486; + or.b64 %rd907, %rd957, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd906, %rd907, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12487, %r12458, 16448; + add.s32 %r12488, %r12487, %r11307; + bfe.u32 %r12489, %r12488, 4, 14; + cvt.u64.u32 %rd958, %r12489; + or.b64 %rd908, %rd958, 4611686293372403712; + add.s32 %r12490, %r11348, 8256; + bfe.u32 %r12491, %r12490, 4, 14; + cvt.u64.u32 %rd959, %r12491; + or.b64 %rd909, %rd959, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd908, %rd909, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12492, %r12458, 16480; + add.s32 %r12493, %r12492, %r11307; + bfe.u32 %r12494, %r12493, 4, 14; + cvt.u64.u32 %rd960, %r12494; + or.b64 %rd910, %rd960, 4611686293372403712; + add.s32 %r12495, %r11348, 8288; + bfe.u32 %r12496, %r12495, 4, 14; + cvt.u64.u32 %rd961, %r12496; + or.b64 %rd911, %rd961, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd910, %rd911, %p557, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11308, %r11346; + mov.b32 %r11309, %r11346; + mov.b32 %r11311, %r11346; + mov.b32 %r11312, %r11346; + mov.b32 %r11310, %r11348; + // begin inline asm + // wait for regs: %r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922,%r11307,%r11308,%r11309,%r11310,%r11311,%r11312 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:461:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12497, %r10891, 0f3DB504F3; + mul.f32 %r12498, %r10892, 0f3DB504F3; + mul.f32 %r12499, %r10893, 0f3DB504F3; + mul.f32 %r12500, %r10894, 0f3DB504F3; + mul.f32 %r12501, %r10895, 0f3DB504F3; + mul.f32 %r12502, %r10896, 0f3DB504F3; + mul.f32 %r12503, %r10897, 0f3DB504F3; + mul.f32 %r12504, %r10898, 0f3DB504F3; + mul.f32 %r12505, %r10899, 0f3DB504F3; + mul.f32 %r12506, %r10900, 0f3DB504F3; + mul.f32 %r12507, %r10901, 0f3DB504F3; + mul.f32 %r12508, %r10902, 0f3DB504F3; + mul.f32 %r12509, %r10903, 0f3DB504F3; + mul.f32 %r12510, %r10904, 0f3DB504F3; + mul.f32 %r12511, %r10905, 0f3DB504F3; + mul.f32 %r12512, %r10906, 0f3DB504F3; + mul.f32 %r12513, %r10907, 0f3DB504F3; + mul.f32 %r12514, %r10908, 0f3DB504F3; + mul.f32 %r12515, %r10909, 0f3DB504F3; + mul.f32 %r12516, %r10910, 0f3DB504F3; + mul.f32 %r12517, %r10911, 0f3DB504F3; + mul.f32 %r12518, %r10912, 0f3DB504F3; + mul.f32 %r12519, %r10913, 0f3DB504F3; + mul.f32 %r12520, %r10914, 0f3DB504F3; + mul.f32 %r12521, %r10915, 0f3DB504F3; + mul.f32 %r12522, %r10916, 0f3DB504F3; + mul.f32 %r12523, %r10917, 0f3DB504F3; + mul.f32 %r12524, %r10918, 0f3DB504F3; + mul.f32 %r12525, %r10919, 0f3DB504F3; + mul.f32 %r12526, %r10920, 0f3DB504F3; + mul.f32 %r12527, %r10921, 0f3DB504F3; + mul.f32 %r12528, %r10922, 0f3DB504F3; + .loc 1 525 39 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:525:39 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + neg.f32 %r12529, %r53; + fma.rn.f32 %r12530, %r12497, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12531, %r12498, 0f3FB8AA3B, %r12529; + neg.f32 %r12532, %r54; + fma.rn.f32 %r12533, %r12499, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12534, %r12500, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12535, %r12501, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12536, %r12502, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12537, %r12503, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12538, %r12504, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12539, %r12505, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12540, %r12506, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12541, %r12507, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12542, %r12508, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12543, %r12509, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12544, %r12510, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12545, %r12511, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12546, %r12512, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12547, %r12513, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12548, %r12514, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12549, %r12515, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12550, %r12516, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12551, %r12517, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12552, %r12518, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12553, %r12519, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12554, %r12520, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12555, %r12521, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12556, %r12522, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12557, %r12523, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12558, %r12524, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12559, %r12525, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12560, %r12526, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12561, %r12527, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12562, %r12528, 0f3FB8AA3B, %r12532; + .loc 1 525 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:525:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + ex2.approx.ftz.f32 %r12563, %r12530; + ex2.approx.ftz.f32 %r12564, %r12531; + ex2.approx.ftz.f32 %r12565, %r12533; + ex2.approx.ftz.f32 %r12566, %r12534; + ex2.approx.ftz.f32 %r12567, %r12535; + ex2.approx.ftz.f32 %r12568, %r12536; + ex2.approx.ftz.f32 %r12569, %r12537; + ex2.approx.ftz.f32 %r12570, %r12538; + ex2.approx.ftz.f32 %r12571, %r12539; + ex2.approx.ftz.f32 %r12572, %r12540; + ex2.approx.ftz.f32 %r12573, %r12541; + ex2.approx.ftz.f32 %r12574, %r12542; + ex2.approx.ftz.f32 %r12575, %r12543; + ex2.approx.ftz.f32 %r12576, %r12544; + ex2.approx.ftz.f32 %r12577, %r12545; + ex2.approx.ftz.f32 %r12578, %r12546; + ex2.approx.ftz.f32 %r12579, %r12547; + ex2.approx.ftz.f32 %r12580, %r12548; + ex2.approx.ftz.f32 %r12581, %r12549; + ex2.approx.ftz.f32 %r12582, %r12550; + ex2.approx.ftz.f32 %r12583, %r12551; + ex2.approx.ftz.f32 %r12584, %r12552; + ex2.approx.ftz.f32 %r12585, %r12553; + ex2.approx.ftz.f32 %r12586, %r12554; + ex2.approx.ftz.f32 %r12587, %r12555; + ex2.approx.ftz.f32 %r12588, %r12556; + ex2.approx.ftz.f32 %r12589, %r12557; + ex2.approx.ftz.f32 %r12590, %r12558; + ex2.approx.ftz.f32 %r12591, %r12559; + ex2.approx.ftz.f32 %r12592, %r12560; + ex2.approx.ftz.f32 %r12593, %r12561; + ex2.approx.ftz.f32 %r12594, %r12562; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s32 %r12595, %r8442, 49152; + add.s32 %r11866, %r12595, %r12454; + .loc 1 530 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:530:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + wgmma.fence.sync.aligned; + add.s32 %r11863, %r8442, 131072; + add.s32 %r12596, %r12458, %r11863; + bfe.u32 %r12597, %r12596, 4, 14; + cvt.u64.u32 %rd962, %r12597; + or.b64 %rd912, %rd962, 4611686293372403712; + bfe.u32 %r12598, %r11866, 4, 14; + cvt.u64.u32 %rd963, %r12598; + or.b64 %rd913, %rd963, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd912, %rd913, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r12599, %r12462, %r11863; + bfe.u32 %r12600, %r12599, 4, 14; + cvt.u64.u32 %rd964, %r12600; + or.b64 %rd914, %rd964, 4611686293372403712; + add.s32 %r12601, %r11866, 32; + bfe.u32 %r12602, %r12601, 4, 14; + cvt.u64.u32 %rd965, %r12602; + or.b64 %rd915, %rd965, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd914, %rd915, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12603, %r12467, %r11863; + bfe.u32 %r12604, %r12603, 4, 14; + cvt.u64.u32 %rd966, %r12604; + or.b64 %rd916, %rd966, 4611686293372403712; + add.s32 %r12605, %r11866, 64; + bfe.u32 %r12606, %r12605, 4, 14; + cvt.u64.u32 %rd967, %r12606; + or.b64 %rd917, %rd967, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd916, %rd917, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12607, %r12472, %r11863; + bfe.u32 %r12608, %r12607, 4, 14; + cvt.u64.u32 %rd968, %r12608; + or.b64 %rd918, %rd968, 4611686293372403712; + add.s32 %r12609, %r11866, 96; + bfe.u32 %r12610, %r12609, 4, 14; + cvt.u64.u32 %rd969, %r12610; + or.b64 %rd919, %rd969, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd918, %rd919, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12611, %r12477, %r11863; + bfe.u32 %r12612, %r12611, 4, 14; + cvt.u64.u32 %rd970, %r12612; + or.b64 %rd920, %rd970, 4611686293372403712; + add.s32 %r12613, %r11866, 8192; + bfe.u32 %r12614, %r12613, 4, 14; + cvt.u64.u32 %rd971, %r12614; + or.b64 %rd921, %rd971, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd920, %rd921, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12615, %r12482, %r11863; + bfe.u32 %r12616, %r12615, 4, 14; + cvt.u64.u32 %rd972, %r12616; + or.b64 %rd922, %rd972, 4611686293372403712; + add.s32 %r12617, %r11866, 8224; + bfe.u32 %r12618, %r12617, 4, 14; + cvt.u64.u32 %rd973, %r12618; + or.b64 %rd923, %rd973, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd922, %rd923, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12619, %r12487, %r11863; + bfe.u32 %r12620, %r12619, 4, 14; + cvt.u64.u32 %rd974, %r12620; + or.b64 %rd924, %rd974, 4611686293372403712; + add.s32 %r12621, %r11866, 8256; + bfe.u32 %r12622, %r12621, 4, 14; + cvt.u64.u32 %rd975, %r12622; + or.b64 %rd925, %rd975, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd924, %rd925, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12623, %r12492, %r11863; + bfe.u32 %r12624, %r12623, 4, 14; + cvt.u64.u32 %rd976, %r12624; + or.b64 %rd926, %rd976, 4611686293372403712; + add.s32 %r12625, %r11866, 8288; + bfe.u32 %r12626, %r12625, 4, 14; + cvt.u64.u32 %rd977, %r12626; + or.b64 %rd927, %rd977, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd926, %rd927, %p557, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11864, %r11346; + mov.b32 %r11865, %r11346; + mov.b32 %r11867, %r11346; + mov.b32 %r11868, %r11346; + // begin inline asm + // wait for regs: %r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478,%r11863,%r11864,%r11865,%r11866,%r11867,%r11868 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mov.b64 {%r12627, %r12628}, %rd62; + sub.f32 %r12629, %r11448, %r12628; + sub.f32 %r12630, %r11447, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12631, %r12563, %r12630; + mul.f32 %r12632, %r12564, %r12629; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12035, %r12632, %r12631; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mov.b64 {%r12633, %r12634}, %rd61; + sub.f32 %r12635, %r11450, %r12634; + sub.f32 %r12636, %r11449, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12637, %r12565, %r12636; + mul.f32 %r12638, %r12566, %r12635; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12036, %r12638, %r12637; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12639, %r11452, %r12628; + sub.f32 %r12640, %r11451, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12641, %r12567, %r12640; + mul.f32 %r12642, %r12568, %r12639; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12037, %r12642, %r12641; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12643, %r11454, %r12634; + sub.f32 %r12644, %r11453, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12645, %r12569, %r12644; + mul.f32 %r12646, %r12570, %r12643; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12038, %r12646, %r12645; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12647, %r11456, %r12628; + sub.f32 %r12648, %r11455, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12649, %r12571, %r12648; + mul.f32 %r12650, %r12572, %r12647; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12167, %r12650, %r12649; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12651, %r11458, %r12634; + sub.f32 %r12652, %r11457, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12653, %r12573, %r12652; + mul.f32 %r12654, %r12574, %r12651; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12168, %r12654, %r12653; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12655, %r11460, %r12628; + sub.f32 %r12656, %r11459, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12657, %r12575, %r12656; + mul.f32 %r12658, %r12576, %r12655; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12169, %r12658, %r12657; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12659, %r11462, %r12634; + sub.f32 %r12660, %r11461, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12661, %r12577, %r12660; + mul.f32 %r12662, %r12578, %r12659; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12170, %r12662, %r12661; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12663, %r11464, %r12628; + sub.f32 %r12664, %r11463, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12665, %r12579, %r12664; + mul.f32 %r12666, %r12580, %r12663; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12299, %r12666, %r12665; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12667, %r11466, %r12634; + sub.f32 %r12668, %r11465, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12669, %r12581, %r12668; + mul.f32 %r12670, %r12582, %r12667; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12300, %r12670, %r12669; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12671, %r11468, %r12628; + sub.f32 %r12672, %r11467, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12673, %r12583, %r12672; + mul.f32 %r12674, %r12584, %r12671; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12301, %r12674, %r12673; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12675, %r11470, %r12634; + sub.f32 %r12676, %r11469, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12677, %r12585, %r12676; + mul.f32 %r12678, %r12586, %r12675; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12302, %r12678, %r12677; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12679, %r11472, %r12628; + sub.f32 %r12680, %r11471, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12681, %r12587, %r12680; + mul.f32 %r12682, %r12588, %r12679; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12431, %r12682, %r12681; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12683, %r11474, %r12634; + sub.f32 %r12684, %r11473, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12685, %r12589, %r12684; + mul.f32 %r12686, %r12590, %r12683; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12432, %r12686, %r12685; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12687, %r11476, %r12628; + sub.f32 %r12688, %r11475, %r12627; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12689, %r12591, %r12688; + mul.f32 %r12690, %r12592, %r12687; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12433, %r12690, %r12689; + .loc 1 531 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.f32 %r12691, %r11478, %r12634; + sub.f32 %r12692, %r11477, %r12633; + .loc 1 531 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:531:14 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.f32 %r12693, %r12593, %r12692; + mul.f32 %r12694, %r12594, %r12691; + .loc 1 551 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:551:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + cvt.rn.bf16x2.f32 %r12434, %r12694, %r12693; + .loc 1 553 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:553:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12035,%r12036,%r12037,%r12038}, %rd897, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12695, %r11348, 2048; + bfe.u32 %r12696, %r12695, 4, 14; + cvt.u64.u32 %rd978, %r12696; + or.b64 %rd929, %rd978, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12167,%r12168,%r12169,%r12170}, %rd929, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12697, %r11348, 4096; + bfe.u32 %r12698, %r12697, 4, 14; + cvt.u64.u32 %rd979, %r12698; + or.b64 %rd930, %rd979, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12299,%r12300,%r12301,%r12302}, %rd930, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12699, %r11348, 6144; + bfe.u32 %r12700, %r12699, 4, 14; + cvt.u64.u32 %rd980, %r12700; + or.b64 %rd931, %rd980, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12431,%r12432,%r12433,%r12434}, %rd931, %p557, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s32 %r528, %r13248, 1; + .loc 1 788 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:788:33 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shr.u32 %r12701, %r528, 1; + .loc 1 789 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mad.wide.u32 %rd933, %r12701, 4, %rd863; + .loc 1 789 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + // begin inline asm + mov.u64 %rd932, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd932, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12435, 0x0; + @%p575 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12435 }, [ %rd933 + 0 ], %rd932; + // end inline asm + .loc 1 790 109 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:109 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s32 %r12702, %r12701, 1; + .loc 1 790 113 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:113 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + setp.lt.s32 %p579, %r12702, %r10750; + .loc 1 790 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:55 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd936, %rd933, 4; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + and.pred %p576, %p575, %p579; + .loc 1 790 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + // begin inline asm + mov.u64 %rd935, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd935, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12436, 0x0; + @%p576 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12436 }, [ %rd936 + 0 ], %rd935; + // end inline asm + .loc 1 791 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:791:35 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + and.b32 %r12703, %r13248, 1; + .loc 1 792 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:34 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + sub.s32 %r12704, %r12436, %r12435; + .loc 1 792 48 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:48 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r12705, %r12704, 7; + .loc 1 792 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s32 %r12706, %r12705, 33554368; + .loc 1 414 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r12707, %r12703, 13; + .loc 1 793 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r12708, %r12703, 7; + xor.b32 %r12709, %r12708, 128; + .loc 1 414 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mad.lo.s32 %r12710, %r12709, %r12706, %r12707; + .loc 1 414 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:414:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + mul.wide.s32 %rd981, %r12710, 2; + add.s64 %rd1022, %rd1022, %rd981; + add.s64 %rd1021, %rd1021, %rd981; + add.s64 %rd1020, %rd1020, %rd981; + add.s64 %rd1019, %rd1019, %rd981; + .loc 1 415 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:415:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s64 %rd1018, %rd1018, %rd981; + add.s64 %rd1017, %rd1017, %rd981; + add.s64 %rd1016, %rd1016, %rd981; + add.s64 %rd1015, %rd1015, %rd981; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + add.s32 %r12711, %r13183, 1; + setp.gt.s32 %p580, %r12711, 2; + selp.b32 %r13183, 0, %r12711, %p580; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + shl.b32 %r12712, %r13183, 14; + add.s32 %r12713, %r8442, %r12712; + bar.sync 0; + add.s32 %r12437, %r12713, %r59; + selp.b32 %r12438, 16, 0, %p577; + // begin inline asm + cp.async.cg.shared.global [ %r12437 + 0 ], [ %rd1022 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12439, %r12437, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r12439 + 0 ], [ %rd1021 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12441, %r12437, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r12441 + 0 ], [ %rd1020 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12443, %r12437, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r12443 + 0 ], [ %rd1019 + 0 ], 0x10, %r12438; + // end inline asm + cp.async.commit_group; + add.s32 %r12714, %r12595, %r12712; + add.s32 %r12445, %r12714, %r59; + // begin inline asm + cp.async.cg.shared.global [ %r12445 + 0 ], [ %rd1018 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12447, %r12445, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r12447 + 0 ], [ %rd1017 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12449, %r12445, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r12449 + 0 ], [ %rd1016 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12451, %r12445, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r12451 + 0 ], [ %rd1015 + 0 ], 0x10, %r12438; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:397:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:226:16 ] + setp.ne.b32 %p581, %r393, %r528; + mov.b32 %r13248, %r528; + @%p581 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge1859 + // begin inline asm + // wait for regs: %r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp14: + .loc 1 231 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:231:24 + shl.b64 %rd990, %rd5, 1; + add.s64 %rd991, %rd4, %rd990; + shl.b64 %rd992, %rd6, 1; + add.s64 %rd993, %rd4, %rd992; + shl.b64 %rd994, %rd7, 1; + add.s64 %rd995, %rd4, %rd994; + shl.b64 %rd996, %rd8, 1; + add.s64 %rd997, %rd4, %rd996; + shl.b64 %rd998, %rd9, 1; + add.s64 %rd999, %rd4, %rd998; + shl.b64 %rd1000, %rd10, 1; + add.s64 %rd1001, %rd4, %rd1000; + shl.b64 %rd1002, %rd11, 1; + add.s64 %rd1003, %rd4, %rd1002; + shl.b64 %rd1004, %rd12, 1; + add.s64 %rd1005, %rd4, %rd1004; + .loc 1 231 56 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:231:56 + add.s64 %rd982, %rd991, %rd891; + add.s64 %rd983, %rd993, %rd891; + add.s64 %rd984, %rd995, %rd891; + add.s64 %rd985, %rd997, %rd891; + add.s64 %rd986, %rd999, %rd891; + add.s64 %rd987, %rd1001, %rd891; + add.s64 %rd988, %rd1003, %rd891; + add.s64 %rd989, %rd1005, %rd891; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12915, %r13037, 0f3DB504F3; + mul.f32 %r12916, %r13038, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12917, %r12916, %r12915; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12918, %r13039, 0f3DB504F3; + mul.f32 %r12919, %r13040, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12920, %r12919, %r12918; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12921, %r13041, 0f3DB504F3; + mul.f32 %r12922, %r13042, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12923, %r12922, %r12921; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12924, %r13043, 0f3DB504F3; + mul.f32 %r12925, %r13044, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12926, %r12925, %r12924; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12927, %r13045, 0f3DB504F3; + mul.f32 %r12928, %r13046, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12929, %r12928, %r12927; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12930, %r13047, 0f3DB504F3; + mul.f32 %r12931, %r13048, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12932, %r12931, %r12930; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12933, %r13049, 0f3DB504F3; + mul.f32 %r12934, %r13050, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12935, %r12934, %r12933; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12936, %r13051, 0f3DB504F3; + mul.f32 %r12937, %r13052, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12938, %r12937, %r12936; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12939, %r13053, 0f3DB504F3; + mul.f32 %r12940, %r13054, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12941, %r12940, %r12939; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12942, %r13055, 0f3DB504F3; + mul.f32 %r12943, %r13056, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12944, %r12943, %r12942; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12945, %r13057, 0f3DB504F3; + mul.f32 %r12946, %r13058, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12947, %r12946, %r12945; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12948, %r13059, 0f3DB504F3; + mul.f32 %r12949, %r13060, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12950, %r12949, %r12948; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12951, %r13061, 0f3DB504F3; + mul.f32 %r12952, %r13062, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12953, %r12952, %r12951; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12954, %r13063, 0f3DB504F3; + mul.f32 %r12955, %r13064, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12956, %r12955, %r12954; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12957, %r13065, 0f3DB504F3; + mul.f32 %r12958, %r13066, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12959, %r12958, %r12957; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12960, %r13067, 0f3DB504F3; + mul.f32 %r12961, %r13068, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12962, %r12961, %r12960; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12963, %r13069, 0f3DB504F3; + mul.f32 %r12964, %r13070, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12965, %r12964, %r12963; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12966, %r13071, 0f3DB504F3; + mul.f32 %r12967, %r13072, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12968, %r12967, %r12966; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12969, %r13073, 0f3DB504F3; + mul.f32 %r12970, %r13074, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12971, %r12970, %r12969; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12972, %r13075, 0f3DB504F3; + mul.f32 %r12973, %r13076, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12974, %r12973, %r12972; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12975, %r13077, 0f3DB504F3; + mul.f32 %r12976, %r13078, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12977, %r12976, %r12975; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12978, %r13079, 0f3DB504F3; + mul.f32 %r12979, %r13080, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12980, %r12979, %r12978; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12981, %r13081, 0f3DB504F3; + mul.f32 %r12982, %r13082, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12983, %r12982, %r12981; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12984, %r13083, 0f3DB504F3; + mul.f32 %r12985, %r13084, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12986, %r12985, %r12984; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12987, %r13085, 0f3DB504F3; + mul.f32 %r12988, %r13086, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12989, %r12988, %r12987; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12990, %r13087, 0f3DB504F3; + mul.f32 %r12991, %r13088, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12992, %r12991, %r12990; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12993, %r13089, 0f3DB504F3; + mul.f32 %r12994, %r13090, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12995, %r12994, %r12993; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12996, %r13091, 0f3DB504F3; + mul.f32 %r12997, %r13092, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r12998, %r12997, %r12996; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r12999, %r13093, 0f3DB504F3; + mul.f32 %r13000, %r13094, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r13001, %r13000, %r12999; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r13002, %r13095, 0f3DB504F3; + mul.f32 %r13003, %r13096, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r13004, %r13003, %r13002; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r13005, %r13097, 0f3DB504F3; + mul.f32 %r13006, %r13098, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r13007, %r13006, %r13005; + .loc 1 232 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:232:14 + mul.f32 %r13008, %r13099, 0f3DB504F3; + mul.f32 %r13009, %r13100, 0f3DB504F3; + .loc 1 234 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:234:30 + cvt.rn.bf16x2.f32 %r13010, %r13009, %r13008; + shl.b32 %r13011, %r57, 13; + shl.b32 %r13012, %r6, 5; + and.b32 %r13013, %r13012, 7264; + and.b32 %r13014, %r6, 24; + shl.b32 %r13015, %r13014, 4; + shl.b32 %r13016, %r6, 2; + and.b32 %r13017, %r13016, 16; + or.b32 %r13018, %r13011, %r13017; + or.b32 %r13019, %r13013, %r13015; + or.b32 %r13020, %r13018, %r13019; + add.s32 %r13022, %r8442, %r13020; + st.shared.v4.b32 [%r13022], {%r12917, %r12923, %r12929, %r12935}; + st.shared.v4.b32 [%r13022+512], {%r12920, %r12926, %r12932, %r12938}; + xor.b32 %r13023, %r13020, 32; + add.s32 %r13024, %r8442, %r13023; + st.shared.v4.b32 [%r13024], {%r12941, %r12947, %r12953, %r12959}; + st.shared.v4.b32 [%r13024+512], {%r12944, %r12950, %r12956, %r12962}; + xor.b32 %r13025, %r13020, 64; + add.s32 %r13026, %r8442, %r13025; + st.shared.v4.b32 [%r13026], {%r12965, %r12971, %r12977, %r12983}; + st.shared.v4.b32 [%r13026+512], {%r12968, %r12974, %r12980, %r12986}; + xor.b32 %r13027, %r13020, 96; + add.s32 %r13028, %r8442, %r13027; + st.shared.v4.b32 [%r13028], {%r12989, %r12995, %r13001, %r13007}; + st.shared.v4.b32 [%r13028+512], {%r12992, %r12998, %r13004, %r13010}; + bar.sync 0; + shl.b32 %r13029, %r13014, 10; + shl.b32 %r13030, %r57, 5; + and.b32 %r13031, %r13016, 1008; + or.b32 %r13032, %r13029, %r13030; + xor.b32 %r13033, %r13032, %r13031; + add.s32 %r12847, %r8442, %r13033; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12883, %r12884, %r12885, %r12886}, [%r12847]; + // end inline asm + add.s32 %r12852, %r12847, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12887, %r12888, %r12889, %r12890}, [%r12852]; + // end inline asm + add.s32 %r12857, %r12847, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12891, %r12892, %r12893, %r12894}, [%r12857]; + // end inline asm + add.s32 %r12862, %r12847, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12895, %r12896, %r12897, %r12898}, [%r12862]; + // end inline asm + add.s32 %r12867, %r12847, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12899, %r12900, %r12901, %r12902}, [%r12867]; + // end inline asm + add.s32 %r12872, %r12847, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12903, %r12904, %r12905, %r12906}, [%r12872]; + // end inline asm + add.s32 %r12877, %r12847, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12907, %r12908, %r12909, %r12910}, [%r12877]; + // end inline asm + add.s32 %r12882, %r12847, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12911, %r12912, %r12913, %r12914}, [%r12882]; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd982 + 0 ], { %r12883, %r12884, %r12885, %r12886 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd983 + 0 ], { %r12887, %r12888, %r12889, %r12890 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd984 + 0 ], { %r12891, %r12892, %r12893, %r12894 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd985 + 0 ], { %r12895, %r12896, %r12897, %r12898 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd986 + 0 ], { %r12899, %r12900, %r12901, %r12902 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd987 + 0 ], { %r12903, %r12904, %r12905, %r12906 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd988 + 0 ], { %r12907, %r12908, %r12909, %r12910 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd989 + 0 ], { %r12911, %r12912, %r12913, %r12914 }; + // end inline asm + .loc 1 139 7 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:139:7 + bra.uni $L__BB0_17; +$L__BB0_8: + .loc 1 0 7 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:7 + ld.param.b64 %rd194, [triton_tem_fused_zeros_1_param_17]; + ld.param.b64 %rd192, [triton_tem_fused_zeros_1_param_15]; + ld.param.b64 %rd191, [triton_tem_fused_zeros_1_param_14]; + ld.param.b64 %rd188, [triton_tem_fused_zeros_1_param_11]; + ld.param.b64 %rd187, [triton_tem_fused_zeros_1_param_10]; + ld.param.b64 %rd197, [triton_tem_fused_zeros_1_param_7]; + shl.b32 %r1944, %r2, 21; + add.s32 %r5, %r1941, %r1944; + mad.wide.s32 %rd3, %r5, 2, %rd197; + .loc 1 252 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:252:25 + shl.b32 %r2017, %r1, 7; + .loc 1 253 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:253:29 + or.b32 %r2018, %r9, %r2017; + or.b32 %r2019, %r10, %r2017; + or.b32 %r2020, %r11, %r2017; + or.b32 %r2021, %r12, %r2017; + or.b32 %r2022, %r13, %r2017; + or.b32 %r2023, %r14, %r2017; + or.b32 %r2024, %r15, %r2017; + or.b32 %r2025, %r16, %r2017; + or.b32 %r594, %r17, %r2017; + or.b32 %r595, %r18, %r2017; +$L__tmp15: + .loc 1 825 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:256:107 ] + shl.b32 %r2026, %r2018, 7; + shl.b32 %r2027, %r2019, 7; + shl.b32 %r2028, %r2020, 7; + shl.b32 %r2029, %r2021, 7; + shl.b32 %r2030, %r2022, 7; + shl.b32 %r2031, %r2023, 7; + shl.b32 %r2032, %r2024, 7; + shl.b32 %r2033, %r2025, 7; + .loc 1 825 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:256:107 ] + cvt.u64.u32 %rd79, %r2026; + mul.wide.u32 %rd222, %r2026, 2; + add.s64 %rd223, %rd1, %rd222; + cvt.u64.u32 %rd80, %r2027; + mul.wide.u32 %rd224, %r2027, 2; + add.s64 %rd225, %rd1, %rd224; + cvt.u64.u32 %rd81, %r2028; + mul.wide.u32 %rd226, %r2028, 2; + add.s64 %rd227, %rd1, %rd226; + cvt.u64.u32 %rd82, %r2029; + mul.wide.u32 %rd228, %r2029, 2; + add.s64 %rd229, %rd1, %rd228; + cvt.u64.u32 %rd83, %r2030; + mul.wide.u32 %rd230, %r2030, 2; + add.s64 %rd231, %rd1, %rd230; + cvt.u64.u32 %rd84, %r2031; + mul.wide.u32 %rd232, %r2031, 2; + add.s64 %rd233, %rd1, %rd232; + cvt.u64.u32 %rd85, %r2032; + mul.wide.u32 %rd234, %r2032, 2; + add.s64 %rd235, %rd1, %rd234; + cvt.u64.u32 %rd86, %r2033; + mul.wide.u32 %rd236, %r2033, 2; + add.s64 %rd237, %rd1, %rd236; + .loc 1 825 56 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:56 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:256:107 ] + shl.b32 %r2034, %r6, 3; + and.b32 %r2035, %r2034, 120; + .loc 1 825 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:256:107 ] + cvt.u64.u32 %rd87, %r2035; + mul.wide.u32 %rd238, %r2035, 2; + add.s64 %rd199, %rd223, %rd238; + add.s64 %rd200, %rd225, %rd238; + add.s64 %rd201, %rd227, %rd238; + add.s64 %rd202, %rd229, %rd238; + add.s64 %rd203, %rd231, %rd238; + add.s64 %rd204, %rd233, %rd238; + add.s64 %rd205, %rd235, %rd238; + add.s64 %rd206, %rd237, %rd238; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:256:107 ] + // begin inline asm + mov.u32 %r1948, 0x0; + mov.u32 %r1949, 0x0; + mov.u32 %r1950, 0x0; + mov.u32 %r1951, 0x0; + ld.global.v4.b32 { %r1948, %r1949, %r1950, %r1951 }, [ %rd199 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1952, 0x0; + mov.u32 %r1953, 0x0; + mov.u32 %r1954, 0x0; + mov.u32 %r1955, 0x0; + ld.global.v4.b32 { %r1952, %r1953, %r1954, %r1955 }, [ %rd200 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1956, 0x0; + mov.u32 %r1957, 0x0; + mov.u32 %r1958, 0x0; + mov.u32 %r1959, 0x0; + ld.global.v4.b32 { %r1956, %r1957, %r1958, %r1959 }, [ %rd201 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1960, 0x0; + mov.u32 %r1961, 0x0; + mov.u32 %r1962, 0x0; + mov.u32 %r1963, 0x0; + ld.global.v4.b32 { %r1960, %r1961, %r1962, %r1963 }, [ %rd202 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1964, 0x0; + mov.u32 %r1965, 0x0; + mov.u32 %r1966, 0x0; + mov.u32 %r1967, 0x0; + ld.global.v4.b32 { %r1964, %r1965, %r1966, %r1967 }, [ %rd203 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1968, 0x0; + mov.u32 %r1969, 0x0; + mov.u32 %r1970, 0x0; + mov.u32 %r1971, 0x0; + ld.global.v4.b32 { %r1968, %r1969, %r1970, %r1971 }, [ %rd204 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1972, 0x0; + mov.u32 %r1973, 0x0; + mov.u32 %r1974, 0x0; + mov.u32 %r1975, 0x0; + ld.global.v4.b32 { %r1972, %r1973, %r1974, %r1975 }, [ %rd205 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1976, 0x0; + mov.u32 %r1977, 0x0; + mov.u32 %r1978, 0x0; + mov.u32 %r1979, 0x0; + ld.global.v4.b32 { %r1976, %r1977, %r1978, %r1979 }, [ %rd206 + 0 ]; + // end inline asm + shl.b32 %r2036, %r6, 4; + and.b32 %r2037, %r2036, 112; + shl.b32 %r2038, %r8, 3; + and.b32 %r2039, %r6, 112; + and.b32 %r2040, %r6, 8; + shl.b32 %r2041, %r2040, 11; + or.b32 %r2042, %r2037, %r2038; + xor.b32 %r2043, %r2042, %r2039; + or.b32 %r2044, %r2043, %r2041; + mov.b32 %r2045, global_smem; + add.s32 %r2046, %r2045, %r2044; + st.shared.v4.b32 [%r2046+99328], {%r1948, %r1949, %r1950, %r1951}; + st.shared.v4.b32 [%r2046+101376], {%r1952, %r1953, %r1954, %r1955}; + st.shared.v4.b32 [%r2046+103424], {%r1956, %r1957, %r1958, %r1959}; + st.shared.v4.b32 [%r2046+105472], {%r1960, %r1961, %r1962, %r1963}; + st.shared.v4.b32 [%r2046+107520], {%r1964, %r1965, %r1966, %r1967}; + st.shared.v4.b32 [%r2046+109568], {%r1968, %r1969, %r1970, %r1971}; + st.shared.v4.b32 [%r2046+111616], {%r1972, %r1973, %r1974, %r1975}; + st.shared.v4.b32 [%r2046+113664], {%r1976, %r1977, %r1978, %r1979}; +$L__tmp16: + .loc 1 825 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:257:107 ] + add.s64 %rd239, %rd2, %rd222; + add.s64 %rd240, %rd2, %rd224; + add.s64 %rd241, %rd2, %rd226; + add.s64 %rd242, %rd2, %rd228; + add.s64 %rd243, %rd2, %rd230; + add.s64 %rd244, %rd2, %rd232; + add.s64 %rd245, %rd2, %rd234; + add.s64 %rd246, %rd2, %rd236; + .loc 1 825 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:825:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:257:107 ] + add.s64 %rd207, %rd239, %rd238; + add.s64 %rd208, %rd240, %rd238; + add.s64 %rd209, %rd241, %rd238; + add.s64 %rd210, %rd242, %rd238; + add.s64 %rd211, %rd243, %rd238; + add.s64 %rd212, %rd244, %rd238; + add.s64 %rd213, %rd245, %rd238; + add.s64 %rd214, %rd246, %rd238; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:257:107 ] + // begin inline asm + mov.u32 %r1980, 0x0; + mov.u32 %r1981, 0x0; + mov.u32 %r1982, 0x0; + mov.u32 %r1983, 0x0; + ld.global.v4.b32 { %r1980, %r1981, %r1982, %r1983 }, [ %rd207 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1984, 0x0; + mov.u32 %r1985, 0x0; + mov.u32 %r1986, 0x0; + mov.u32 %r1987, 0x0; + ld.global.v4.b32 { %r1984, %r1985, %r1986, %r1987 }, [ %rd208 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1988, 0x0; + mov.u32 %r1989, 0x0; + mov.u32 %r1990, 0x0; + mov.u32 %r1991, 0x0; + ld.global.v4.b32 { %r1988, %r1989, %r1990, %r1991 }, [ %rd209 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1992, 0x0; + mov.u32 %r1993, 0x0; + mov.u32 %r1994, 0x0; + mov.u32 %r1995, 0x0; + ld.global.v4.b32 { %r1992, %r1993, %r1994, %r1995 }, [ %rd210 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1996, 0x0; + mov.u32 %r1997, 0x0; + mov.u32 %r1998, 0x0; + mov.u32 %r1999, 0x0; + ld.global.v4.b32 { %r1996, %r1997, %r1998, %r1999 }, [ %rd211 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2000, 0x0; + mov.u32 %r2001, 0x0; + mov.u32 %r2002, 0x0; + mov.u32 %r2003, 0x0; + ld.global.v4.b32 { %r2000, %r2001, %r2002, %r2003 }, [ %rd212 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2004, 0x0; + mov.u32 %r2005, 0x0; + mov.u32 %r2006, 0x0; + mov.u32 %r2007, 0x0; + ld.global.v4.b32 { %r2004, %r2005, %r2006, %r2007 }, [ %rd213 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2008, 0x0; + mov.u32 %r2009, 0x0; + mov.u32 %r2010, 0x0; + mov.u32 %r2011, 0x0; + ld.global.v4.b32 { %r2008, %r2009, %r2010, %r2011 }, [ %rd214 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r2046+132096], {%r1980, %r1981, %r1982, %r1983}; + st.shared.v4.b32 [%r2046+134144], {%r1984, %r1985, %r1986, %r1987}; + st.shared.v4.b32 [%r2046+136192], {%r1988, %r1989, %r1990, %r1991}; + st.shared.v4.b32 [%r2046+138240], {%r1992, %r1993, %r1994, %r1995}; + st.shared.v4.b32 [%r2046+140288], {%r1996, %r1997, %r1998, %r1999}; + st.shared.v4.b32 [%r2046+142336], {%r2000, %r2001, %r2002, %r2003}; + st.shared.v4.b32 [%r2046+144384], {%r2004, %r2005, %r2006, %r2007}; + st.shared.v4.b32 [%r2046+146432], {%r2008, %r2009, %r2010, %r2011}; +$L__tmp17: + .loc 1 266 56 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:266:56 + shl.b32 %r596, %r2, 23; + .loc 1 281 58 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:281:58 + shl.b32 %r2047, %r4, 4; + .loc 1 281 80 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:281:80 + or.b32 %r2048, %r2047, %r1; + .loc 1 282 53 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:282:53 + shl.b32 %r2049, %r4, 8; + .loc 1 282 81 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:282:81 + shl.b32 %r2050, %r1, 4; + .loc 1 282 70 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:282:70 + or.b32 %r2051, %r2049, %r2050; + .loc 1 286 32 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:286:32 + mul.wide.u32 %rd247, %r2051, 4; + add.s64 %rd215, %rd188, %rd247; + mov.pred %p2, -1; + .loc 1 287 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:287:30 + // begin inline asm + mov.u32 %r2012, 0x0; + @%p2 ld.global.b32 { %r2012 }, [ %rd215 + 0 ]; + // end inline asm + .loc 1 287 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:287:43 + shl.b32 %r597, %r2012, 7; + .loc 1 288 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:288:55 + mul.wide.u32 %rd248, %r2048, 4; + add.s64 %rd216, %rd187, %rd248; + .loc 1 288 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:288:42 + // begin inline asm + mov.u32 %r2013, 0x0; + @%p2 ld.global.b32 { %r2013 }, [ %rd216 + 0 ]; + // end inline asm + .loc 1 290 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:290:45 + and.b32 %r599, %r6, 3; + shl.b32 %r600, %r599, 1; + or.b32 %r2052, %r600, 8; + or.b32 %r2053, %r600, 16; + or.b32 %r2054, %r600, 24; + or.b32 %r2055, %r600, 32; + or.b32 %r2056, %r600, 40; + or.b32 %r2057, %r600, 48; + or.b32 %r2058, %r600, 56; + .loc 1 290 32 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:290:32 + or.b32 %r2059, %r597, %r9; + or.b32 %r2060, %r597, %r10; + or.b32 %r2061, %r597, %r11; + or.b32 %r2062, %r597, %r12; + or.b32 %r702, %r597, %r600; + or.b32 %r701, %r702, 1; + or.b32 %r700, %r597, %r2052; + or.b32 %r699, %r702, 9; + or.b32 %r698, %r597, %r2053; + or.b32 %r697, %r702, 17; + or.b32 %r696, %r597, %r2054; + or.b32 %r695, %r702, 25; + or.b32 %r694, %r597, %r2055; + or.b32 %r693, %r702, 33; + or.b32 %r692, %r597, %r2056; + or.b32 %r691, %r702, 41; + or.b32 %r690, %r597, %r2057; + or.b32 %r689, %r702, 49; + or.b32 %r688, %r597, %r2058; + or.b32 %r687, %r702, 57; +$L__tmp18: + .loc 1 601 37 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:37 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r2063, %r2059, 12; + shl.b32 %r2064, %r2060, 12; + shl.b32 %r2065, %r2061, 12; + shl.b32 %r2066, %r2062, 12; + .loc 1 602 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r2067, %r2059, 7; + shl.b32 %r2068, %r2060, 7; + shl.b32 %r2069, %r2061, 7; + shl.b32 %r2070, %r2062, 7; + .loc 1 608 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:608:42 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r601, %r2013, 1; + .loc 1 608 61 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:608:61 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + min.s32 %r2071, %r601, 32; + .loc 1 701 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:701:35 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mad.wide.u32 %rd218, %r2, 8, %rd193; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.gt.s32 %p4, %r601, 0; + .loc 1 701 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:701:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + mov.u64 %rd217, 0x0; + @%p4 ld.global.b64 { %rd217 }, [ %rd218 + 0 ]; + // end inline asm +$L__tmp19: + .loc 1 306 41 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:306:41 + add.s64 %rd219, %rd192, %rd247; + .loc 1 307 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:307:34 + // begin inline asm + mov.u32 %r2014, 0x0; + @%p2 ld.global.b32 { %r2014 }, [ %rd219 + 0 ]; + // end inline asm + .loc 1 307 47 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:307:47 + shl.b32 %r602, %r2014, 7; + .loc 1 308 64 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:308:64 + add.s64 %rd220, %rd191, %rd248; + .loc 1 308 46 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:308:46 + // begin inline asm + mov.u32 %r2015, 0x0; + @%p2 ld.global.b32 { %r2015 }, [ %rd220 + 0 ]; + // end inline asm + .loc 1 310 36 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:310:36 + or.b32 %r2072, %r602, %r9; + or.b32 %r2073, %r602, %r10; + or.b32 %r2074, %r602, %r11; + or.b32 %r2075, %r602, %r12; + or.b32 %r2076, %r602, %r600; + or.b32 %r2077, %r602, %r2052; + or.b32 %r2078, %r602, %r2053; + or.b32 %r2079, %r602, %r2054; + or.b32 %r2080, %r602, %r2055; + or.b32 %r2081, %r602, %r2056; + or.b32 %r2082, %r602, %r2057; + or.b32 %r2083, %r602, %r2058; +$L__tmp20: + .loc 1 601 37 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:37 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r2084, %r2072, 12; + shl.b32 %r2085, %r2073, 12; + shl.b32 %r2086, %r2074, 12; + shl.b32 %r2087, %r2075, 12; + .loc 1 602 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r2088, %r2072, 7; + shl.b32 %r2089, %r2073, 7; + shl.b32 %r2090, %r2074, 7; + shl.b32 %r2091, %r2075, 7; + .loc 1 608 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:608:42 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r604, %r2015, 1; + .loc 1 608 61 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:608:61 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + min.s32 %r2092, %r604, 32; +$L__tmp21: + .loc 1 676 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:676:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + shl.b32 %r605, %r2, 16; + cvt.s64.s32 %rd90, %r2063; + cvt.s64.s32 %rd91, %r2064; + cvt.s64.s32 %rd92, %r2065; + cvt.s64.s32 %rd93, %r2066; + cvt.s64.s32 %rd94, %r2067; + cvt.s64.s32 %rd95, %r2068; + cvt.s64.s32 %rd96, %r2069; + cvt.s64.s32 %rd97, %r2070; + shl.b32 %r2093, %r2040, 10; + or.b32 %r606, %r2043, %r2093; + add.s32 %r5062, %r2045, %r606; + selp.b32 %r2096, 16, 0, %p4; + add.s32 %r5064, %r5062, 2048; + add.s32 %r5066, %r5062, 4096; + add.s32 %r5068, %r5062, 6144; + cvt.s64.s32 %rd98, %r702; + and.b32 %r612, %r6, 252; + shl.b32 %r613, %r599, 3; + add.s32 %r2094, %r2045, %r613; + add.s32 %r5070, %r2094, 98304; + selp.b32 %r2104, 8, 0, %p4; + add.s32 %r5072, %r2094, 98336; + add.s32 %r5074, %r2094, 98368; + add.s32 %r5076, %r2094, 98400; + add.s32 %r5078, %r2094, 98432; + add.s32 %r5080, %r2094, 98464; + add.s32 %r5082, %r2094, 98496; + add.s32 %r5084, %r2094, 98528; + add.s32 %r5086, %r5062, 49152; + add.s32 %r5088, %r5062, 51200; + add.s32 %r5090, %r5062, 53248; + add.s32 %r5092, %r5062, 55296; + add.s32 %r5094, %r2094, 98816; + add.s32 %r5096, %r2094, 98848; + add.s32 %r5098, %r2094, 98880; + add.s32 %r5100, %r2094, 98912; + add.s32 %r5102, %r2094, 98944; + add.s32 %r5104, %r2094, 98976; + add.s32 %r5106, %r2094, 99008; + add.s32 %r5108, %r2094, 99040; + setp.gt.s32 %p7, %r601, 1; + or.b32 %r635, %r702, 64; + or.b32 %r636, %r700, 64; + or.b32 %r637, %r698, 64; + or.b32 %r638, %r696, 64; + or.b32 %r639, %r694, 64; + or.b32 %r640, %r692, 64; + or.b32 %r641, %r690, 64; + or.b32 %r642, %r688, 64; + add.s32 %r5110, %r5062, 16384; + selp.b32 %r2144, 16, 0, %p7; + add.s32 %r5112, %r5062, 18432; + add.s32 %r5114, %r5062, 20480; + add.s32 %r5116, %r5062, 22528; + add.s32 %r5118, %r2094, 98560; + selp.b32 %r2152, 8, 0, %p7; + add.s32 %r5120, %r2094, 98592; + add.s32 %r5122, %r2094, 98624; + add.s32 %r5124, %r2094, 98656; + add.s32 %r5126, %r2094, 98688; + add.s32 %r5128, %r2094, 98720; + add.s32 %r5130, %r2094, 98752; + add.s32 %r5132, %r2094, 98784; + add.s32 %r5134, %r5062, 65536; + add.s32 %r5136, %r5062, 67584; + add.s32 %r5138, %r5062, 69632; + add.s32 %r5140, %r5062, 71680; + add.s32 %r5142, %r2094, 99072; + add.s32 %r5144, %r2094, 99104; + add.s32 %r5146, %r2094, 99136; + add.s32 %r5148, %r2094, 99168; + add.s32 %r5150, %r2094, 99200; + add.s32 %r5152, %r2094, 99232; + add.s32 %r5154, %r2094, 99264; + add.s32 %r5156, %r2094, 99296; + add.s32 %r669, %r2071, -2; + add.s32 %r670, %r2071, -1; + cvt.s64.s32 %rd99, %r2084; + cvt.s64.s32 %rd100, %r2085; + cvt.s64.s32 %rd101, %r2086; + cvt.s64.s32 %rd102, %r2087; + cvt.s64.s32 %rd103, %r2088; + cvt.s64.s32 %rd104, %r2089; + cvt.s64.s32 %rd105, %r2090; + cvt.s64.s32 %rd106, %r2091; + setp.gt.s32 %p8, %r604, 0; + selp.b32 %r5063, 16, 0, %p8; + cvt.s64.s32 %rd107, %r2076; + selp.b32 %r5071, 8, 0, %p8; + setp.gt.s32 %p9, %r604, 1; + or.b32 %r673, %r2076, 64; + or.b32 %r674, %r2077, 64; + or.b32 %r675, %r2078, 64; + or.b32 %r676, %r2079, 64; + or.b32 %r677, %r2080, 64; + or.b32 %r678, %r2081, 64; + or.b32 %r679, %r2082, 64; + or.b32 %r680, %r2083, 64; + selp.b32 %r5111, 16, 0, %p9; + selp.b32 %r5119, 8, 0, %p9; + add.s32 %r683, %r2092, -2; + add.s32 %r684, %r2092, -1; +$L__tmp22: + .loc 1 262 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:262:30 + max.s32 %r685, %r2071, 1; + max.s32 %r686, %r2092, 1; + mul.wide.u32 %rd108, %r3, 4; + mov.b32 %r13518, 0f00000000; + mov.b64 %rd1023, 0; + shl.b64 %rd299, %rd90, 1; + shl.b64 %rd301, %rd91, 1; + shl.b64 %rd303, %rd92, 1; + shl.b64 %rd305, %rd93, 1; + shl.b64 %rd308, %rd94, 1; + shl.b64 %rd310, %rd95, 1; + shl.b64 %rd312, %rd96, 1; + shl.b64 %rd314, %rd97, 1; + shl.b64 %rd316, %rd98, 2; + shl.b64 %rd507, %rd99, 1; + shl.b64 %rd509, %rd100, 1; + shl.b64 %rd511, %rd101, 1; + shl.b64 %rd513, %rd102, 1; + shl.b64 %rd516, %rd103, 1; + shl.b64 %rd518, %rd104, 1; + shl.b64 %rd520, %rd105, 1; + shl.b64 %rd522, %rd106, 1; + shl.b64 %rd524, %rd107, 2; + mov.b32 %r13519, %r13518; + mov.b32 %r13520, %r13518; + mov.b32 %r13521, %r13518; + mov.b32 %r13522, %r13518; + mov.b32 %r13523, %r13518; + mov.b32 %r13524, %r13518; + mov.b32 %r13525, %r13518; + mov.b32 %r13526, %r13518; + mov.b32 %r13527, %r13518; + mov.b32 %r13528, %r13518; + mov.b32 %r13529, %r13518; + mov.b32 %r13530, %r13518; + mov.b32 %r13531, %r13518; + mov.b32 %r13532, %r13518; + mov.b32 %r13533, %r13518; + mov.b32 %r13534, %r13518; + mov.b32 %r13535, %r13518; + mov.b32 %r13536, %r13518; + mov.b32 %r13537, %r13518; + mov.b32 %r13538, %r13518; + mov.b32 %r13539, %r13518; + mov.b32 %r13540, %r13518; + mov.b32 %r13541, %r13518; + mov.b32 %r13542, %r13518; + mov.b32 %r13543, %r13518; + mov.b32 %r13544, %r13518; + mov.b32 %r13545, %r13518; + mov.b32 %r13546, %r13518; + mov.b32 %r13547, %r13518; + mov.b32 %r13548, %r13518; + mov.b32 %r13549, %r13518; + mov.b32 %r13550, %r13518; + mov.b32 %r13551, %r13518; + mov.b32 %r13552, %r13518; + mov.b32 %r13553, %r13518; + mov.b32 %r13554, %r13518; + mov.b32 %r13555, %r13518; + mov.b32 %r13556, %r13518; + mov.b32 %r13557, %r13518; + mov.b32 %r13558, %r13518; + mov.b32 %r13559, %r13518; + mov.b32 %r13560, %r13518; + mov.b32 %r13561, %r13518; + mov.b32 %r13562, %r13518; + mov.b32 %r13563, %r13518; + mov.b32 %r13564, %r13518; + mov.b32 %r13565, %r13518; + mov.b32 %r13566, %r13518; + mov.b32 %r13567, %r13518; + mov.b32 %r13568, %r13518; + mov.b32 %r13569, %r13518; + mov.b32 %r13570, %r13518; + mov.b32 %r13571, %r13518; + mov.b32 %r13572, %r13518; + mov.b32 %r13573, %r13518; + mov.b32 %r13574, %r13518; + mov.b32 %r13575, %r13518; + mov.b32 %r13576, %r13518; + mov.b32 %r13577, %r13518; + mov.b32 %r13578, %r13518; + mov.b32 %r13579, %r13518; + mov.b32 %r13580, %r13518; + mov.b32 %r13581, %r13518; + mov.b32 %r13454, %r13518; + mov.b32 %r13455, %r13518; + mov.b32 %r13456, %r13518; + mov.b32 %r13457, %r13518; + mov.b32 %r13458, %r13518; + mov.b32 %r13459, %r13518; + mov.b32 %r13460, %r13518; + mov.b32 %r13461, %r13518; + mov.b32 %r13462, %r13518; + mov.b32 %r13463, %r13518; + mov.b32 %r13464, %r13518; + mov.b32 %r13465, %r13518; + mov.b32 %r13466, %r13518; + mov.b32 %r13467, %r13518; + mov.b32 %r13468, %r13518; + mov.b32 %r13469, %r13518; + mov.b32 %r13470, %r13518; + mov.b32 %r13471, %r13518; + mov.b32 %r13472, %r13518; + mov.b32 %r13473, %r13518; + mov.b32 %r13474, %r13518; + mov.b32 %r13475, %r13518; + mov.b32 %r13476, %r13518; + mov.b32 %r13477, %r13518; + mov.b32 %r13478, %r13518; + mov.b32 %r13479, %r13518; + mov.b32 %r13480, %r13518; + mov.b32 %r13481, %r13518; + mov.b32 %r13482, %r13518; + mov.b32 %r13483, %r13518; + mov.b32 %r13484, %r13518; + mov.b32 %r13485, %r13518; + mov.b32 %r13486, %r13518; + mov.b32 %r13487, %r13518; + mov.b32 %r13488, %r13518; + mov.b32 %r13489, %r13518; + mov.b32 %r13490, %r13518; + mov.b32 %r13491, %r13518; + mov.b32 %r13492, %r13518; + mov.b32 %r13493, %r13518; + mov.b32 %r13494, %r13518; + mov.b32 %r13495, %r13518; + mov.b32 %r13496, %r13518; + mov.b32 %r13497, %r13518; + mov.b32 %r13498, %r13518; + mov.b32 %r13499, %r13518; + mov.b32 %r13500, %r13518; + mov.b32 %r13501, %r13518; + mov.b32 %r13502, %r13518; + mov.b32 %r13503, %r13518; + mov.b32 %r13504, %r13518; + mov.b32 %r13505, %r13518; + mov.b32 %r13506, %r13518; + mov.b32 %r13507, %r13518; + mov.b32 %r13508, %r13518; + mov.b32 %r13509, %r13518; + mov.b32 %r13510, %r13518; + mov.b32 %r13511, %r13518; + mov.b32 %r13512, %r13518; + mov.b32 %r13513, %r13518; + mov.b32 %r13514, %r13518; + mov.b32 %r13515, %r13518; + mov.b32 %r13516, %r13518; + mov.b32 %r13517, %r13518; + bra.uni $L__BB0_9; +$L__BB0_15: // %._crit_edge1692 + // in Loop: Header=BB0_9 Depth=1 +$L__tmp23: + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + // wait for regs: %r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517,%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp24: + .loc 1 262 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:262:30 + add.s64 %rd1023, %rd1023, 1; + setp.ne.b64 %p282, %rd1023, 4; + @%p282 bra $L__BB0_9; + bra.uni $L__BB0_16; +$L__BB0_9: // =>This Loop Header: Depth=1 + // Child Loop BB0_11 Depth 2 + // Child Loop BB0_14 Depth 2 + .loc 1 0 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:30 + setp.eq.b32 %p186, %r612, 0; +$L__tmp25: + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.lt.s32 %p42, %r601, 1; +$L__tmp26: + .loc 1 263 51 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:263:51 + add.s64 %rd297, %rd1023, %rd108; + .loc 1 266 44 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:266:44 + cvt.u32.u64 %r2191, %rd297; + shl.b32 %r2192, %r2191, 7; + add.s32 %r2193, %r2192, %r596; + .loc 1 267 36 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:267:36 + shl.b32 %r2194, %r2191, 18; + .loc 1 267 46 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:267:46 + add.s32 %r2195, %r2194, %r596; + .loc 1 269 50 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:269:50 + shl.b32 %r2196, %r2191, 11; + add.s32 %r2197, %r2196, %r605; + .loc 1 271 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:271:21 + mad.wide.s32 %rd126, %r2193, 2, %rd180; + .loc 1 272 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:272:23 + mad.wide.s32 %rd127, %r2195, 2, %rd183; + .loc 1 275 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:275:25 + mul.wide.s32 %rd298, %r2197, 4; + add.s64 %rd128, %rd181, %rd298; + .loc 1 276 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:276:29 + add.s64 %rd129, %rd182, %rd298; +$L__tmp27: + .loc 1 601 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd300, %rd126, %rd299; + add.s64 %rd302, %rd126, %rd301; + add.s64 %rd304, %rd126, %rd303; + add.s64 %rd306, %rd126, %rd305; + .loc 1 601 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b64 %rd307, %rd87, 1; + add.s64 %rd249, %rd300, %rd307; + add.s64 %rd250, %rd302, %rd307; + add.s64 %rd251, %rd304, %rd307; + add.s64 %rd252, %rd306, %rd307; + .loc 1 602 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd309, %rd127, %rd308; + add.s64 %rd311, %rd127, %rd310; + add.s64 %rd313, %rd127, %rd312; + add.s64 %rd315, %rd127, %rd314; + .loc 1 602 51 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:51 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd261, %rd309, %rd307; + add.s64 %rd262, %rd311, %rd307; + add.s64 %rd263, %rd313, %rd307; + add.s64 %rd264, %rd315, %rd307; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5062 + 0 ], [ %rd249 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5064 + 0 ], [ %rd250 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5066 + 0 ], [ %rd251 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5068 + 0 ], [ %rd252 + 0 ], 0x10, %r2096; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd253, %rd128, %rd316; + cvt.s64.s32 %rd317, %r597; + cvt.u64.u32 %rd130, %r600; + add.s64 %rd318, %rd317, %rd130; + shl.b64 %rd319, %rd318, 2; + add.s64 %rd320, %rd128, %rd319; + add.s64 %rd254, %rd320, 32; + add.s64 %rd255, %rd320, 64; + add.s64 %rd256, %rd320, 96; + add.s64 %rd257, %rd320, 128; + add.s64 %rd258, %rd320, 160; + add.s64 %rd259, %rd320, 192; + add.s64 %rd260, %rd320, 224; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5070 + 0 ], [ %rd253 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5072 + 0 ], [ %rd254 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5074 + 0 ], [ %rd255 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5076 + 0 ], [ %rd256 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5078 + 0 ], [ %rd257 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5080 + 0 ], [ %rd258 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5082 + 0 ], [ %rd259 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5084 + 0 ], [ %rd260 + 0 ], 0x8, %r2104; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5086 + 0 ], [ %rd261 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5088 + 0 ], [ %rd262 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5090 + 0 ], [ %rd263 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5092 + 0 ], [ %rd264 + 0 ], 0x10, %r2096; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd265, %rd129, %rd316; + add.s64 %rd321, %rd129, %rd319; + add.s64 %rd266, %rd321, 32; + add.s64 %rd267, %rd321, 64; + add.s64 %rd268, %rd321, 96; + add.s64 %rd269, %rd321, 128; + add.s64 %rd270, %rd321, 160; + add.s64 %rd271, %rd321, 192; + add.s64 %rd272, %rd321, 224; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5094 + 0 ], [ %rd265 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5096 + 0 ], [ %rd266 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5098 + 0 ], [ %rd267 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5100 + 0 ], [ %rd268 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5102 + 0 ], [ %rd269 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5104 + 0 ], [ %rd270 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5106 + 0 ], [ %rd271 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5108 + 0 ], [ %rd272 + 0 ], 0x8, %r2104; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd1031, %rd249, 524288; + add.s64 %rd1030, %rd250, 524288; + add.s64 %rd1029, %rd251, 524288; + add.s64 %rd1028, %rd252, 524288; + .loc 1 627 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd1027, %rd261, 16384; + add.s64 %rd1026, %rd262, 16384; + add.s64 %rd1025, %rd263, 16384; + add.s64 %rd1024, %rd264, 16384; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r5110 + 0 ], [ %rd1031 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5112 + 0 ], [ %rd1030 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5114 + 0 ], [ %rd1029 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5116 + 0 ], [ %rd1028 + 0 ], 0x10, %r2144; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd277, %rd253, 256; + add.s64 %rd322, %rd319, 256; + add.s64 %rd323, %rd128, %rd322; + add.s64 %rd278, %rd323, 32; + add.s64 %rd279, %rd323, 64; + add.s64 %rd280, %rd323, 96; + add.s64 %rd281, %rd323, 128; + add.s64 %rd282, %rd323, 160; + add.s64 %rd283, %rd323, 192; + add.s64 %rd284, %rd323, 224; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5118 + 0 ], [ %rd277 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5120 + 0 ], [ %rd278 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5122 + 0 ], [ %rd279 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5124 + 0 ], [ %rd280 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5126 + 0 ], [ %rd281 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5128 + 0 ], [ %rd282 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5130 + 0 ], [ %rd283 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5132 + 0 ], [ %rd284 + 0 ], 0x8, %r2152; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5134 + 0 ], [ %rd1027 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5136 + 0 ], [ %rd1026 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5138 + 0 ], [ %rd1025 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5140 + 0 ], [ %rd1024 + 0 ], 0x10, %r2144; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd289, %rd265, 256; + add.s64 %rd324, %rd129, %rd322; + add.s64 %rd290, %rd324, 32; + add.s64 %rd291, %rd324, 64; + add.s64 %rd292, %rd324, 96; + add.s64 %rd293, %rd324, 128; + add.s64 %rd294, %rd324, 160; + add.s64 %rd295, %rd324, 192; + add.s64 %rd296, %rd324, 224; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5142 + 0 ], [ %rd289 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5144 + 0 ], [ %rd290 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5146 + 0 ], [ %rd291 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5148 + 0 ], [ %rd292 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5150 + 0 ], [ %rd293 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5152 + 0 ], [ %rd294 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5154 + 0 ], [ %rd295 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5156 + 0 ], [ %rd296 + 0 ], 0x8, %r2152; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + @%p42 bra $L__BB0_12; +// %bb.10: // %.lr.ph.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:28 + mov.b32 %r2753, 0; + mov.b32 %r13443, 1; + mov.b32 %r13442, -1; + mov.b32 %r13441, 64; + mov.b32 %r13444, %r13442; + mov.b32 %r13445, %r13443; + mov.b32 %r13446, %r642; + mov.b32 %r13447, %r641; + mov.b32 %r13448, %r640; + mov.b32 %r13449, %r639; + mov.b32 %r13450, %r638; + mov.b32 %r13451, %r637; + mov.b32 %r13452, %r636; + mov.b32 %r13453, %r635; + mov.b32 %r13582, %r2753; + mov.b32 %r13583, %r687; + mov.b32 %r13584, %r688; + mov.b32 %r13585, %r689; + mov.b32 %r13586, %r690; + mov.b32 %r13587, %r691; + mov.b32 %r13588, %r692; + mov.b32 %r13589, %r693; + mov.b32 %r13590, %r694; + mov.b32 %r13591, %r695; + mov.b32 %r13592, %r696; + mov.b32 %r13593, %r697; + mov.b32 %r13594, %r698; + mov.b32 %r13595, %r699; + mov.b32 %r13596, %r700; + mov.b32 %r13597, %r701; + mov.b32 %r13598, %r702; +$L__BB0_11: // %.lr.ph + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.lt.s32 %p83, %r13582, %r669; + setp.lt.s32 %p65, %r13582, %r670; + add.s32 %r4420, %r13442, 1; + setp.gt.s32 %p84, %r4420, 1; + selp.b32 %r13442, 0, %r4420, %p84; + add.s32 %r4421, %r13444, 1; + setp.gt.s32 %p85, %r4421, 2; + selp.b32 %r13444, 0, %r4421, %p85; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r4422, %r13444, 14; + add.s32 %r2755, %r2045, %r4422; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4424, %r13442, 8; + add.s32 %r4425, %r2045, 98304; + add.s32 %r4426, %r4425, %r4424; + add.s32 %r4427, %r4426, %r613; + ld.shared.v2.b32 {%r4428, %r4429}, [%r4427]; + ld.shared.v2.b32 {%r4430, %r4431}, [%r4427+32]; + ld.shared.v2.b32 {%r4432, %r4433}, [%r4427+64]; + ld.shared.v2.b32 {%r4434, %r4435}, [%r4427+96]; + ld.shared.v2.b32 {%r4436, %r4437}, [%r4427+128]; + ld.shared.v2.b32 {%r4438, %r4439}, [%r4427+160]; + ld.shared.v2.b32 {%r4440, %r4441}, [%r4427+192]; + ld.shared.v2.b32 {%r4442, %r4443}, [%r4427+224]; + .loc 1 675 26 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:675:26 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.eq.f32 %p86, %r4428, 0fFF800000; + setp.eq.f32 %p87, %r4429, 0fFF800000; + setp.eq.f32 %p88, %r4430, 0fFF800000; + setp.eq.f32 %p89, %r4431, 0fFF800000; + setp.eq.f32 %p90, %r4432, 0fFF800000; + setp.eq.f32 %p91, %r4433, 0fFF800000; + setp.eq.f32 %p92, %r4434, 0fFF800000; + setp.eq.f32 %p93, %r4435, 0fFF800000; + setp.eq.f32 %p94, %r4436, 0fFF800000; + setp.eq.f32 %p95, %r4437, 0fFF800000; + setp.eq.f32 %p96, %r4438, 0fFF800000; + setp.eq.f32 %p97, %r4439, 0fFF800000; + setp.eq.f32 %p98, %r4440, 0fFF800000; + setp.eq.f32 %p99, %r4441, 0fFF800000; + setp.eq.f32 %p100, %r4442, 0fFF800000; + setp.eq.f32 %p101, %r4443, 0fFF800000; + .loc 1 675 46 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:675:46 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4444, 0f00000000, %r4428, %p86; + selp.f32 %r4445, 0f00000000, %r4429, %p87; + selp.f32 %r4446, 0f00000000, %r4430, %p88; + selp.f32 %r4447, 0f00000000, %r4431, %p89; + selp.f32 %r4448, 0f00000000, %r4432, %p90; + selp.f32 %r4449, 0f00000000, %r4433, %p91; + selp.f32 %r4450, 0f00000000, %r4434, %p92; + selp.f32 %r4451, 0f00000000, %r4435, %p93; + selp.f32 %r4452, 0f00000000, %r4436, %p94; + selp.f32 %r4453, 0f00000000, %r4437, %p95; + selp.f32 %r4454, 0f00000000, %r4438, %p96; + selp.f32 %r4455, 0f00000000, %r4439, %p97; + selp.f32 %r4456, 0f00000000, %r4440, %p98; + selp.f32 %r4457, 0f00000000, %r4441, %p99; + selp.f32 %r4458, 0f00000000, %r4442, %p100; + selp.f32 %r4459, 0f00000000, %r4443, %p101; + .loc 1 676 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:676:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shfl.sync.idx.b32 %r4460, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4461, %r4460, 11; + and.b32 %r4462, %r4461, 8192; + add.s32 %r2714, %r2045, 99328; + add.s32 %r4463, %r4462, %r2714; + bfe.u32 %r4464, %r4463, 4, 14; + cvt.u64.u32 %rd395, %r4464; + or.b64 %rd325, %rd395, 4611686293372403712; + bfe.u32 %r4465, %r2755, 4, 14; + cvt.u64.u32 %rd396, %r4465; + or.b64 %rd326, %rd396, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd325, %rd326, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r4466, %r4462, 32; + add.s32 %r4467, %r4466, %r2714; + bfe.u32 %r4468, %r4467, 4, 14; + cvt.u64.u32 %rd397, %r4468; + or.b64 %rd327, %rd397, 4611686293372403712; + add.s32 %r4469, %r2755, 32; + bfe.u32 %r4470, %r4469, 4, 14; + cvt.u64.u32 %rd398, %r4470; + or.b64 %rd328, %rd398, 4611686293338849280; + mov.pred %p43, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd327, %rd328, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4471, %r4462, 64; + add.s32 %r4472, %r4471, %r2714; + bfe.u32 %r4473, %r4472, 4, 14; + cvt.u64.u32 %rd399, %r4473; + or.b64 %rd329, %rd399, 4611686293372403712; + add.s32 %r4474, %r2755, 64; + bfe.u32 %r4475, %r4474, 4, 14; + cvt.u64.u32 %rd400, %r4475; + or.b64 %rd330, %rd400, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd329, %rd330, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4476, %r4462, 96; + add.s32 %r4477, %r4476, %r2714; + bfe.u32 %r4478, %r4477, 4, 14; + cvt.u64.u32 %rd401, %r4478; + or.b64 %rd331, %rd401, 4611686293372403712; + add.s32 %r4479, %r2755, 96; + bfe.u32 %r4480, %r4479, 4, 14; + cvt.u64.u32 %rd402, %r4480; + or.b64 %rd332, %rd402, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd331, %rd332, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4481, %r4462, 16384; + add.s32 %r4482, %r4481, %r2714; + bfe.u32 %r4483, %r4482, 4, 14; + cvt.u64.u32 %rd403, %r4483; + or.b64 %rd333, %rd403, 4611686293372403712; + add.s32 %r4484, %r2755, 8192; + bfe.u32 %r4485, %r4484, 4, 14; + cvt.u64.u32 %rd404, %r4485; + or.b64 %rd334, %rd404, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd333, %rd334, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4486, %r4462, 16416; + add.s32 %r4487, %r4486, %r2714; + bfe.u32 %r4488, %r4487, 4, 14; + cvt.u64.u32 %rd405, %r4488; + or.b64 %rd335, %rd405, 4611686293372403712; + add.s32 %r4489, %r2755, 8224; + bfe.u32 %r4490, %r4489, 4, 14; + cvt.u64.u32 %rd406, %r4490; + or.b64 %rd336, %rd406, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd335, %rd336, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4491, %r4462, 16448; + add.s32 %r4492, %r4491, %r2714; + bfe.u32 %r4493, %r4492, 4, 14; + cvt.u64.u32 %rd407, %r4493; + or.b64 %rd337, %rd407, 4611686293372403712; + add.s32 %r4494, %r2755, 8256; + bfe.u32 %r4495, %r4494, 4, 14; + cvt.u64.u32 %rd408, %r4495; + or.b64 %rd338, %rd408, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd337, %rd338, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4496, %r4462, 16480; + add.s32 %r4497, %r4496, %r2714; + bfe.u32 %r4498, %r4497, 4, 14; + cvt.u64.u32 %rd409, %r4498; + or.b64 %rd339, %rd409, 4611686293372403712; + add.s32 %r4499, %r2755, 8288; + bfe.u32 %r4500, %r4499, 4, 14; + cvt.u64.u32 %rd410, %r4500; + or.b64 %rd340, %rd410, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd339, %rd340, %p43, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r2717, %r2755; + mov.b32 %r2715, %r2753; + mov.b32 %r2716, %r2753; + mov.b32 %r2718, %r2753; + mov.b32 %r2719, %r2753; + // begin inline asm + // wait for regs: %r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329,%r2714,%r2715,%r2716,%r2717,%r2718,%r2719 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:678:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4501, %r2298, 0f3DB504F3; + mul.f32 %r4502, %r2299, 0f3DB504F3; + mul.f32 %r4503, %r2300, 0f3DB504F3; + mul.f32 %r4504, %r2301, 0f3DB504F3; + mul.f32 %r4505, %r2302, 0f3DB504F3; + mul.f32 %r4506, %r2303, 0f3DB504F3; + mul.f32 %r4507, %r2304, 0f3DB504F3; + mul.f32 %r4508, %r2305, 0f3DB504F3; + mul.f32 %r4509, %r2306, 0f3DB504F3; + mul.f32 %r4510, %r2307, 0f3DB504F3; + mul.f32 %r4511, %r2308, 0f3DB504F3; + mul.f32 %r4512, %r2309, 0f3DB504F3; + mul.f32 %r4513, %r2310, 0f3DB504F3; + mul.f32 %r4514, %r2311, 0f3DB504F3; + mul.f32 %r4515, %r2312, 0f3DB504F3; + mul.f32 %r4516, %r2313, 0f3DB504F3; + mul.f32 %r4517, %r2314, 0f3DB504F3; + mul.f32 %r4518, %r2315, 0f3DB504F3; + mul.f32 %r4519, %r2316, 0f3DB504F3; + mul.f32 %r4520, %r2317, 0f3DB504F3; + mul.f32 %r4521, %r2318, 0f3DB504F3; + mul.f32 %r4522, %r2319, 0f3DB504F3; + mul.f32 %r4523, %r2320, 0f3DB504F3; + mul.f32 %r4524, %r2321, 0f3DB504F3; + mul.f32 %r4525, %r2322, 0f3DB504F3; + mul.f32 %r4526, %r2323, 0f3DB504F3; + mul.f32 %r4527, %r2324, 0f3DB504F3; + mul.f32 %r4528, %r2325, 0f3DB504F3; + mul.f32 %r4529, %r2326, 0f3DB504F3; + mul.f32 %r4530, %r2327, 0f3DB504F3; + mul.f32 %r4531, %r2328, 0f3DB504F3; + mul.f32 %r4532, %r2329, 0f3DB504F3; + .loc 1 698 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:698:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.ge.s32 %p102, %r13598, %r594; + setp.ge.s32 %p103, %r13597, %r594; + setp.ge.s32 %p104, %r13598, %r595; + setp.ge.s32 %p105, %r13597, %r595; + setp.ge.s32 %p106, %r13596, %r594; + setp.ge.s32 %p107, %r13595, %r594; + setp.ge.s32 %p108, %r13596, %r595; + setp.ge.s32 %p109, %r13595, %r595; + setp.ge.s32 %p110, %r13594, %r594; + setp.ge.s32 %p111, %r13593, %r594; + setp.ge.s32 %p112, %r13594, %r595; + setp.ge.s32 %p113, %r13593, %r595; + setp.ge.s32 %p114, %r13592, %r594; + setp.ge.s32 %p115, %r13591, %r594; + setp.ge.s32 %p116, %r13592, %r595; + setp.ge.s32 %p117, %r13591, %r595; + setp.ge.s32 %p118, %r13590, %r594; + setp.ge.s32 %p119, %r13589, %r594; + setp.ge.s32 %p120, %r13590, %r595; + setp.ge.s32 %p121, %r13589, %r595; + setp.ge.s32 %p122, %r13588, %r594; + setp.ge.s32 %p123, %r13587, %r594; + setp.ge.s32 %p124, %r13588, %r595; + setp.ge.s32 %p125, %r13587, %r595; + setp.ge.s32 %p126, %r13586, %r594; + setp.ge.s32 %p127, %r13585, %r594; + setp.ge.s32 %p128, %r13586, %r595; + setp.ge.s32 %p129, %r13585, %r595; + setp.ge.s32 %p130, %r13584, %r594; + setp.ge.s32 %p131, %r13583, %r594; + setp.ge.s32 %p132, %r13584, %r595; + setp.ge.s32 %p133, %r13583, %r595; + .loc 1 704 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:704:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.s64.s32 %rd411, %r13598; + cvt.s64.s32 %rd412, %r13597; + cvt.s64.s32 %rd413, %r13596; + cvt.s64.s32 %rd414, %r13595; + cvt.s64.s32 %rd415, %r13594; + cvt.s64.s32 %rd416, %r13593; + cvt.s64.s32 %rd417, %r13592; + cvt.s64.s32 %rd418, %r13591; + cvt.s64.s32 %rd419, %r13590; + cvt.s64.s32 %rd420, %r13589; + cvt.s64.s32 %rd421, %r13588; + cvt.s64.s32 %rd422, %r13587; + cvt.s64.s32 %rd423, %r13586; + cvt.s64.s32 %rd424, %r13585; + cvt.s64.s32 %rd425, %r13584; + cvt.s64.s32 %rd426, %r13583; + setp.gt.s64 %p134, %rd217, %rd426; + setp.gt.s64 %p135, %rd217, %rd425; + setp.gt.s64 %p136, %rd217, %rd424; + setp.gt.s64 %p137, %rd217, %rd423; + setp.gt.s64 %p138, %rd217, %rd422; + setp.gt.s64 %p139, %rd217, %rd421; + setp.gt.s64 %p140, %rd217, %rd420; + setp.gt.s64 %p141, %rd217, %rd419; + setp.gt.s64 %p142, %rd217, %rd418; + setp.gt.s64 %p143, %rd217, %rd417; + setp.gt.s64 %p144, %rd217, %rd416; + setp.gt.s64 %p145, %rd217, %rd415; + setp.gt.s64 %p146, %rd217, %rd414; + setp.gt.s64 %p147, %rd217, %rd413; + setp.gt.s64 %p148, %rd217, %rd412; + setp.gt.s64 %p149, %rd217, %rd411; + .loc 1 706 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:706:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + and.pred %p150, %p102, %p149; + and.pred %p151, %p103, %p148; + and.pred %p152, %p104, %p149; + and.pred %p153, %p105, %p148; + and.pred %p154, %p106, %p147; + and.pred %p155, %p107, %p146; + and.pred %p156, %p108, %p147; + and.pred %p157, %p109, %p146; + and.pred %p158, %p110, %p145; + and.pred %p159, %p111, %p144; + and.pred %p160, %p112, %p145; + and.pred %p161, %p113, %p144; + and.pred %p162, %p114, %p143; + and.pred %p163, %p115, %p142; + and.pred %p164, %p116, %p143; + and.pred %p165, %p117, %p142; + and.pred %p166, %p118, %p141; + and.pred %p167, %p119, %p140; + and.pred %p168, %p120, %p141; + and.pred %p169, %p121, %p140; + and.pred %p170, %p122, %p139; + and.pred %p171, %p123, %p138; + and.pred %p172, %p124, %p139; + and.pred %p173, %p125, %p138; + and.pred %p174, %p126, %p137; + and.pred %p175, %p127, %p136; + and.pred %p176, %p128, %p137; + and.pred %p177, %p129, %p136; + and.pred %p178, %p130, %p135; + and.pred %p179, %p131, %p134; + and.pred %p180, %p132, %p135; + and.pred %p181, %p133, %p134; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4533, %r4501, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4534, %r4533, 0fFF800000, %p150; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4535, %r4502, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4536, %r4535, 0fFF800000, %p151; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4537, %r4503, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4538, %r4537, 0fFF800000, %p152; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4539, %r4504, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4540, %r4539, 0fFF800000, %p153; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4541, %r4505, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4542, %r4541, 0fFF800000, %p154; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4543, %r4506, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4544, %r4543, 0fFF800000, %p155; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4545, %r4507, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4546, %r4545, 0fFF800000, %p156; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4547, %r4508, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4548, %r4547, 0fFF800000, %p157; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4549, %r4509, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4550, %r4549, 0fFF800000, %p158; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4551, %r4510, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4552, %r4551, 0fFF800000, %p159; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4553, %r4511, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4554, %r4553, 0fFF800000, %p160; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4555, %r4512, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4556, %r4555, 0fFF800000, %p161; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4557, %r4513, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4558, %r4557, 0fFF800000, %p162; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4559, %r4514, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4560, %r4559, 0fFF800000, %p163; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4561, %r4515, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4562, %r4561, 0fFF800000, %p164; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4563, %r4516, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4564, %r4563, 0fFF800000, %p165; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4565, %r4517, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4566, %r4565, 0fFF800000, %p166; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4567, %r4518, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4568, %r4567, 0fFF800000, %p167; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4569, %r4519, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4570, %r4569, 0fFF800000, %p168; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4571, %r4520, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4572, %r4571, 0fFF800000, %p169; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4573, %r4521, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4574, %r4573, 0fFF800000, %p170; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4575, %r4522, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4576, %r4575, 0fFF800000, %p171; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4577, %r4523, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4578, %r4577, 0fFF800000, %p172; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4579, %r4524, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4580, %r4579, 0fFF800000, %p173; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4581, %r4525, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4582, %r4581, 0fFF800000, %p174; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4583, %r4526, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4584, %r4583, 0fFF800000, %p175; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4585, %r4527, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4586, %r4585, 0fFF800000, %p176; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4587, %r4528, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4588, %r4587, 0fFF800000, %p177; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4589, %r4529, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4590, %r4589, 0fFF800000, %p178; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4591, %r4530, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4592, %r4591, 0fFF800000, %p179; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4593, %r4531, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4594, %r4593, 0fFF800000, %p180; + .loc 1 739 27 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:739:27 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4595, %r4532, 0f3FB8AA3B; + .loc 1 736 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:736:69 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.f32 %r4596, %r4595, 0fFF800000, %p181; + .loc 1 740 40 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:740:40 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + sub.f32 %r4597, %r4534, %r4444; + sub.f32 %r4598, %r4536, %r4445; + sub.f32 %r4599, %r4538, %r4444; + sub.f32 %r4600, %r4540, %r4445; + sub.f32 %r4601, %r4542, %r4446; + sub.f32 %r4602, %r4544, %r4447; + sub.f32 %r4603, %r4546, %r4446; + sub.f32 %r4604, %r4548, %r4447; + sub.f32 %r4605, %r4550, %r4448; + sub.f32 %r4606, %r4552, %r4449; + sub.f32 %r4607, %r4554, %r4448; + sub.f32 %r4608, %r4556, %r4449; + sub.f32 %r4609, %r4558, %r4450; + sub.f32 %r4610, %r4560, %r4451; + sub.f32 %r4611, %r4562, %r4450; + sub.f32 %r4612, %r4564, %r4451; + sub.f32 %r4613, %r4566, %r4452; + sub.f32 %r4614, %r4568, %r4453; + sub.f32 %r4615, %r4570, %r4452; + sub.f32 %r4616, %r4572, %r4453; + sub.f32 %r4617, %r4574, %r4454; + sub.f32 %r4618, %r4576, %r4455; + sub.f32 %r4619, %r4578, %r4454; + sub.f32 %r4620, %r4580, %r4455; + sub.f32 %r4621, %r4582, %r4456; + sub.f32 %r4622, %r4584, %r4457; + sub.f32 %r4623, %r4586, %r4456; + sub.f32 %r4624, %r4588, %r4457; + sub.f32 %r4625, %r4590, %r4458; + sub.f32 %r4626, %r4592, %r4459; + sub.f32 %r4627, %r4594, %r4458; + sub.f32 %r4628, %r4596, %r4459; + .loc 1 740 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:740:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + ex2.approx.ftz.f32 %r4629, %r4597; + ex2.approx.ftz.f32 %r4630, %r4598; + ex2.approx.ftz.f32 %r4631, %r4599; + ex2.approx.ftz.f32 %r4632, %r4600; + ex2.approx.ftz.f32 %r4633, %r4601; + ex2.approx.ftz.f32 %r4634, %r4602; + ex2.approx.ftz.f32 %r4635, %r4603; + ex2.approx.ftz.f32 %r4636, %r4604; + ex2.approx.ftz.f32 %r4637, %r4605; + ex2.approx.ftz.f32 %r4638, %r4606; + ex2.approx.ftz.f32 %r4639, %r4607; + ex2.approx.ftz.f32 %r4640, %r4608; + ex2.approx.ftz.f32 %r4641, %r4609; + ex2.approx.ftz.f32 %r4642, %r4610; + ex2.approx.ftz.f32 %r4643, %r4611; + ex2.approx.ftz.f32 %r4644, %r4612; + ex2.approx.ftz.f32 %r4645, %r4613; + ex2.approx.ftz.f32 %r4646, %r4614; + ex2.approx.ftz.f32 %r4647, %r4615; + ex2.approx.ftz.f32 %r4648, %r4616; + ex2.approx.ftz.f32 %r4649, %r4617; + ex2.approx.ftz.f32 %r4650, %r4618; + ex2.approx.ftz.f32 %r4651, %r4619; + ex2.approx.ftz.f32 %r4652, %r4620; + ex2.approx.ftz.f32 %r4653, %r4621; + ex2.approx.ftz.f32 %r4654, %r4622; + ex2.approx.ftz.f32 %r4655, %r4623; + ex2.approx.ftz.f32 %r4656, %r4624; + ex2.approx.ftz.f32 %r4657, %r4625; + ex2.approx.ftz.f32 %r4658, %r4626; + ex2.approx.ftz.f32 %r4659, %r4627; + ex2.approx.ftz.f32 %r4660, %r4628; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4661, %r2045, 49152; + add.s32 %r3801, %r4661, %r4422; + .loc 1 744 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:744:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16x2.f32 %r2886, %r4630, %r4629; + cvt.rn.bf16x2.f32 %r2887, %r4632, %r4631; + cvt.rn.bf16x2.f32 %r2888, %r4634, %r4633; + cvt.rn.bf16x2.f32 %r2889, %r4636, %r4635; + cvt.rn.bf16x2.f32 %r3018, %r4638, %r4637; + cvt.rn.bf16x2.f32 %r3019, %r4640, %r4639; + cvt.rn.bf16x2.f32 %r3020, %r4642, %r4641; + cvt.rn.bf16x2.f32 %r3021, %r4644, %r4643; + cvt.rn.bf16x2.f32 %r3150, %r4646, %r4645; + cvt.rn.bf16x2.f32 %r3151, %r4648, %r4647; + cvt.rn.bf16x2.f32 %r3152, %r4650, %r4649; + cvt.rn.bf16x2.f32 %r3153, %r4652, %r4651; + cvt.rn.bf16x2.f32 %r3282, %r4654, %r4653; + cvt.rn.bf16x2.f32 %r3283, %r4656, %r4655; + cvt.rn.bf16x2.f32 %r3284, %r4658, %r4657; + cvt.rn.bf16x2.f32 %r3285, %r4660, %r4659; + .loc 1 744 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:744:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4662, %r3801, 4, 14; + cvt.u64.u32 %rd427, %r4662; + or.b64 %rd341, %rd427, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r2886,%r2887,%r2888,%r2889}, %rd341, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4663, %r3801, 2048; + bfe.u32 %r4664, %r4663, 4, 14; + cvt.u64.u32 %rd428, %r4664; + or.b64 %rd342, %rd428, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3018,%r3019,%r3020,%r3021}, %rd342, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4665, %r3801, 4096; + bfe.u32 %r4666, %r4665, 4, 14; + cvt.u64.u32 %rd429, %r4666; + or.b64 %rd343, %rd429, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3150,%r3151,%r3152,%r3153}, %rd343, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4667, %r3801, 6144; + bfe.u32 %r4668, %r4667, 4, 14; + cvt.u64.u32 %rd430, %r4668; + or.b64 %rd344, %rd430, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3282,%r3283,%r3284,%r3285}, %rd344, %p43, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4669, %r2045, 98816; + add.s32 %r4670, %r4669, %r4424; + add.s32 %r4671, %r4670, %r613; + ld.shared.v2.b32 {%r4672, %r4673}, [%r4671]; + ld.shared.v2.b32 {%r4674, %r4675}, [%r4671+32]; + ld.shared.v2.b32 {%r4676, %r4677}, [%r4671+64]; + ld.shared.v2.b32 {%r4678, %r4679}, [%r4671+96]; + ld.shared.v2.b32 {%r4680, %r4681}, [%r4671+128]; + ld.shared.v2.b32 {%r4682, %r4683}, [%r4671+160]; + ld.shared.v2.b32 {%r4684, %r4685}, [%r4671+192]; + ld.shared.v2.b32 {%r4686, %r4687}, [%r4671+224]; + .loc 1 750 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:750:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + wgmma.fence.sync.aligned; + add.s32 %r3798, %r2045, 132096; + add.s32 %r4688, %r4462, %r3798; + bfe.u32 %r4689, %r4688, 4, 14; + cvt.u64.u32 %rd431, %r4689; + or.b64 %rd345, %rd431, 4611686293372403712; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd345, %rd341, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4690, %r4466, %r3798; + bfe.u32 %r4691, %r4690, 4, 14; + cvt.u64.u32 %rd432, %r4691; + or.b64 %rd347, %rd432, 4611686293372403712; + add.s32 %r4692, %r3801, 32; + bfe.u32 %r4693, %r4692, 4, 14; + cvt.u64.u32 %rd433, %r4693; + or.b64 %rd348, %rd433, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd347, %rd348, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4694, %r4471, %r3798; + bfe.u32 %r4695, %r4694, 4, 14; + cvt.u64.u32 %rd434, %r4695; + or.b64 %rd349, %rd434, 4611686293372403712; + add.s32 %r4696, %r3801, 64; + bfe.u32 %r4697, %r4696, 4, 14; + cvt.u64.u32 %rd435, %r4697; + or.b64 %rd350, %rd435, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd349, %rd350, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4698, %r4476, %r3798; + bfe.u32 %r4699, %r4698, 4, 14; + cvt.u64.u32 %rd436, %r4699; + or.b64 %rd351, %rd436, 4611686293372403712; + add.s32 %r4700, %r3801, 96; + bfe.u32 %r4701, %r4700, 4, 14; + cvt.u64.u32 %rd437, %r4701; + or.b64 %rd352, %rd437, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd351, %rd352, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4702, %r4481, %r3798; + bfe.u32 %r4703, %r4702, 4, 14; + cvt.u64.u32 %rd438, %r4703; + or.b64 %rd353, %rd438, 4611686293372403712; + add.s32 %r4704, %r3801, 8192; + bfe.u32 %r4705, %r4704, 4, 14; + cvt.u64.u32 %rd439, %r4705; + or.b64 %rd354, %rd439, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd353, %rd354, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4706, %r4486, %r3798; + bfe.u32 %r4707, %r4706, 4, 14; + cvt.u64.u32 %rd440, %r4707; + or.b64 %rd355, %rd440, 4611686293372403712; + add.s32 %r4708, %r3801, 8224; + bfe.u32 %r4709, %r4708, 4, 14; + cvt.u64.u32 %rd441, %r4709; + or.b64 %rd356, %rd441, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd355, %rd356, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4710, %r4491, %r3798; + bfe.u32 %r4711, %r4710, 4, 14; + cvt.u64.u32 %rd442, %r4711; + or.b64 %rd357, %rd442, 4611686293372403712; + add.s32 %r4712, %r3801, 8256; + bfe.u32 %r4713, %r4712, 4, 14; + cvt.u64.u32 %rd443, %r4713; + or.b64 %rd358, %rd443, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd357, %rd358, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4714, %r4496, %r3798; + bfe.u32 %r4715, %r4714, 4, 14; + cvt.u64.u32 %rd444, %r4715; + or.b64 %rd359, %rd444, 4611686293372403712; + add.s32 %r4716, %r3801, 8288; + bfe.u32 %r4717, %r4716, 4, 14; + cvt.u64.u32 %rd445, %r4717; + or.b64 %rd360, %rd445, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd359, %rd360, %p43, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3803, %r2753; + mov.b32 %r3799, %r2753; + mov.b32 %r3800, %r2753; + mov.b32 %r3802, %r2753; + // begin inline asm + // wait for regs: %r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413,%r3798,%r3799,%r3800,%r3801,%r3802,%r3803 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + sub.f32 %r4718, %r3382, %r4672; + sub.f32 %r4719, %r3383, %r4673; + sub.f32 %r4720, %r3384, %r4672; + sub.f32 %r4721, %r3385, %r4673; + sub.f32 %r4722, %r3386, %r4674; + sub.f32 %r4723, %r3387, %r4675; + sub.f32 %r4724, %r3388, %r4674; + sub.f32 %r4725, %r3389, %r4675; + sub.f32 %r4726, %r3390, %r4676; + sub.f32 %r4727, %r3391, %r4677; + sub.f32 %r4728, %r3392, %r4676; + sub.f32 %r4729, %r3393, %r4677; + sub.f32 %r4730, %r3394, %r4678; + sub.f32 %r4731, %r3395, %r4679; + sub.f32 %r4732, %r3396, %r4678; + sub.f32 %r4733, %r3397, %r4679; + sub.f32 %r4734, %r3398, %r4680; + sub.f32 %r4735, %r3399, %r4681; + sub.f32 %r4736, %r3400, %r4680; + sub.f32 %r4737, %r3401, %r4681; + sub.f32 %r4738, %r3402, %r4682; + sub.f32 %r4739, %r3403, %r4683; + sub.f32 %r4740, %r3404, %r4682; + sub.f32 %r4741, %r3405, %r4683; + sub.f32 %r4742, %r3406, %r4684; + sub.f32 %r4743, %r3407, %r4685; + sub.f32 %r4744, %r3408, %r4684; + sub.f32 %r4745, %r3409, %r4685; + sub.f32 %r4746, %r3410, %r4686; + sub.f32 %r4747, %r3411, %r4687; + sub.f32 %r4748, %r3412, %r4686; + sub.f32 %r4749, %r3413, %r4687; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.f32 %r4750, %r4629, %r4718; + mul.f32 %r4751, %r4630, %r4719; + mul.f32 %r4752, %r4631, %r4720; + mul.f32 %r4753, %r4632, %r4721; + mul.f32 %r4754, %r4633, %r4722; + mul.f32 %r4755, %r4634, %r4723; + mul.f32 %r4756, %r4635, %r4724; + mul.f32 %r4757, %r4636, %r4725; + mul.f32 %r4758, %r4637, %r4726; + mul.f32 %r4759, %r4638, %r4727; + mul.f32 %r4760, %r4639, %r4728; + mul.f32 %r4761, %r4640, %r4729; + mul.f32 %r4762, %r4641, %r4730; + mul.f32 %r4763, %r4642, %r4731; + mul.f32 %r4764, %r4643, %r4732; + mul.f32 %r4765, %r4644, %r4733; + mul.f32 %r4766, %r4645, %r4734; + mul.f32 %r4767, %r4646, %r4735; + mul.f32 %r4768, %r4647, %r4736; + mul.f32 %r4769, %r4648, %r4737; + mul.f32 %r4770, %r4649, %r4738; + mul.f32 %r4771, %r4650, %r4739; + mul.f32 %r4772, %r4651, %r4740; + mul.f32 %r4773, %r4652, %r4741; + mul.f32 %r4774, %r4653, %r4742; + mul.f32 %r4775, %r4654, %r4743; + mul.f32 %r4776, %r4655, %r4744; + mul.f32 %r4777, %r4656, %r4745; + mul.f32 %r4778, %r4657, %r4746; + mul.f32 %r4779, %r4658, %r4747; + mul.f32 %r4780, %r4659, %r4748; + mul.f32 %r4781, %r4660, %r4749; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs1, %r4750; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs2, %rs1, 0x0000, %p150; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs3, %r4751; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs4, %rs3, 0x0000, %p151; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs5, %r4752; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs6, %rs5, 0x0000, %p152; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs7, %r4753; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs8, %rs7, 0x0000, %p153; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs9, %r4754; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs10, %rs9, 0x0000, %p154; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs11, %r4755; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs12, %rs11, 0x0000, %p155; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs13, %r4756; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs14, %rs13, 0x0000, %p156; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs15, %r4757; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs16, %rs15, 0x0000, %p157; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs17, %r4758; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs18, %rs17, 0x0000, %p158; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs19, %r4759; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs20, %rs19, 0x0000, %p159; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs21, %r4760; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs22, %rs21, 0x0000, %p160; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs23, %r4761; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs24, %rs23, 0x0000, %p161; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs25, %r4762; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs26, %rs25, 0x0000, %p162; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs27, %r4763; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs28, %rs27, 0x0000, %p163; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs29, %r4764; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs30, %rs29, 0x0000, %p164; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs31, %r4765; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs32, %rs31, 0x0000, %p165; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs33, %r4766; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs34, %rs33, 0x0000, %p166; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs35, %r4767; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs36, %rs35, 0x0000, %p167; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs37, %r4768; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs38, %rs37, 0x0000, %p168; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs39, %r4769; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs40, %rs39, 0x0000, %p169; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs41, %r4770; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs42, %rs41, 0x0000, %p170; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs43, %r4771; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs44, %rs43, 0x0000, %p171; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs45, %r4772; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs46, %rs45, 0x0000, %p172; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs47, %r4773; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs48, %rs47, 0x0000, %p173; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs49, %r4774; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs50, %rs49, 0x0000, %p174; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs51, %r4775; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs52, %rs51, 0x0000, %p175; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs53, %r4776; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs54, %rs53, 0x0000, %p176; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs55, %r4777; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs56, %rs55, 0x0000, %p177; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs57, %r4778; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs58, %rs57, 0x0000, %p178; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs59, %r4779; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs60, %rs59, 0x0000, %p179; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs61, %r4780; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs62, %rs61, 0x0000, %p180; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + cvt.rn.bf16.f32 %rs63, %r4781; + .loc 1 773 45 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:773:45 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + selp.b16 %rs64, %rs63, 0x0000, %p181; + .loc 1 775 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mov.b32 %r3970, {%rs2, %rs4}; + mov.b32 %r3971, {%rs6, %rs8}; + mov.b32 %r3972, {%rs10, %rs12}; + mov.b32 %r3973, {%rs14, %rs16}; + mov.b32 %r4102, {%rs18, %rs20}; + mov.b32 %r4103, {%rs22, %rs24}; + mov.b32 %r4104, {%rs26, %rs28}; + mov.b32 %r4105, {%rs30, %rs32}; + mov.b32 %r4234, {%rs34, %rs36}; + mov.b32 %r4235, {%rs38, %rs40}; + mov.b32 %r4236, {%rs42, %rs44}; + mov.b32 %r4237, {%rs46, %rs48}; + mov.b32 %r4366, {%rs50, %rs52}; + mov.b32 %r4367, {%rs54, %rs56}; + mov.b32 %r4368, {%rs58, %rs60}; + mov.b32 %r4369, {%rs62, %rs64}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r3970,%r3971,%r3972,%r3973}, %rd326, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4782, %r2755, 2048; + bfe.u32 %r4783, %r4782, 4, 14; + cvt.u64.u32 %rd446, %r4783; + or.b64 %rd362, %rd446, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4102,%r4103,%r4104,%r4105}, %rd362, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4784, %r2755, 4096; + bfe.u32 %r4785, %r4784, 4, 14; + cvt.u64.u32 %rd447, %r4785; + or.b64 %rd363, %rd447, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4234,%r4235,%r4236,%r4237}, %rd363, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4786, %r2755, 6144; + bfe.u32 %r4787, %r4786, 4, 14; + cvt.u64.u32 %rd448, %r4787; + or.b64 %rd364, %rd448, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4366,%r4367,%r4368,%r4369}, %rd364, %p43, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 628 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:628:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r13583, %r13583, %r13441; + add.s32 %r13584, %r13584, %r13441; + add.s32 %r13585, %r13585, %r13441; + add.s32 %r13586, %r13586, %r13441; + add.s32 %r13587, %r13587, %r13441; + add.s32 %r13588, %r13588, %r13441; + add.s32 %r13589, %r13589, %r13441; + add.s32 %r13590, %r13590, %r13441; + add.s32 %r13591, %r13591, %r13441; + add.s32 %r13592, %r13592, %r13441; + add.s32 %r13593, %r13593, %r13441; + add.s32 %r13594, %r13594, %r13441; + add.s32 %r13595, %r13595, %r13441; + add.s32 %r13596, %r13596, %r13441; + add.s32 %r13597, %r13597, %r13441; + add.s32 %r13598, %r13598, %r13441; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r1135, %r13582, 1; + .loc 1 788 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:788:33 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shr.u32 %r4788, %r1135, 1; + .loc 1 789 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mad.wide.u32 %rd366, %r4788, 4, %rd215; + .loc 1 789 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + mov.u64 %rd365, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd365, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4370, 0x0; + @%p65 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4370 }, [ %rd366 + 0 ], %rd365; + // end inline asm + .loc 1 790 109 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:109 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4789, %r4788, 1; + .loc 1 790 113 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:113 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.lt.s32 %p182, %r4789, %r2013; + .loc 1 790 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:55 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd369, %rd366, 4; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + and.pred %p66, %p65, %p182; + .loc 1 790 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + mov.u64 %rd368, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd368, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4371, 0x0; + @%p66 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4371 }, [ %rd369 + 0 ], %rd368; + // end inline asm + .loc 1 791 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:791:35 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + and.b32 %r4790, %r13582, 1; + .loc 1 792 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:34 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + sub.s32 %r4791, %r4371, %r4370; + .loc 1 792 48 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:48 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4792, %r4791, 7; + .loc 1 792 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4793, %r4792, -64; + .loc 1 793 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + xor.b32 %r4794, %r4790, 1; + .loc 1 793 61 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:61 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4795, %r4790, 6; + .loc 1 793 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:42 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mad.lo.s32 %r13441, %r4793, %r4794, %r4795; + .loc 1 626 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4796, %r13441, 12; + .loc 1 626 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.wide.s32 %rd449, %r4796, 2; + add.s64 %rd1031, %rd1031, %rd449; + add.s64 %rd1030, %rd1030, %rd449; + add.s64 %rd1029, %rd1029, %rd449; + add.s64 %rd1028, %rd1028, %rd449; + .loc 1 627 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4797, %r13441, 7; + .loc 1 627 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.wide.s32 %rd450, %r4797, 2; + add.s64 %rd1027, %rd1027, %rd450; + add.s64 %rd1026, %rd1026, %rd450; + add.s64 %rd1025, %rd1025, %rd450; + add.s64 %rd1024, %rd1024, %rd450; + .loc 1 628 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:628:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r13453, %r13441, %r13453; + add.s32 %r13452, %r13441, %r13452; + add.s32 %r13451, %r13441, %r13451; + add.s32 %r13450, %r13441, %r13450; + add.s32 %r13449, %r13441, %r13449; + add.s32 %r13448, %r13441, %r13448; + add.s32 %r13447, %r13441, %r13447; + add.s32 %r13446, %r13441, %r13446; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4798, %r13443, 1; + setp.gt.s32 %p183, %r4798, 1; + selp.b32 %r13443, 0, %r4798, %p183; + add.s32 %r4799, %r13445, 1; + setp.gt.s32 %p184, %r4799, 2; + selp.b32 %r13445, 0, %r4799, %p184; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4800, %r13445, 14; + add.s32 %r4801, %r2045, %r4800; + bar.sync 0; + add.s32 %r4372, %r4801, %r606; + selp.b32 %r4373, 16, 0, %p83; + // begin inline asm + cp.async.cg.shared.global [ %r4372 + 0 ], [ %rd1031 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4374, %r4372, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4374 + 0 ], [ %rd1030 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4376, %r4372, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4376 + 0 ], [ %rd1029 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4378, %r4372, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4378 + 0 ], [ %rd1028 + 0 ], 0x10, %r4373; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + mul.wide.s32 %rd451, %r13453, 4; + add.s64 %rd375, %rd128, %rd451; + mul.wide.s32 %rd452, %r13452, 4; + add.s64 %rd376, %rd128, %rd452; + mul.wide.s32 %rd453, %r13451, 4; + add.s64 %rd377, %rd128, %rd453; + mul.wide.s32 %rd454, %r13450, 4; + add.s64 %rd378, %rd128, %rd454; + mul.wide.s32 %rd455, %r13449, 4; + add.s64 %rd379, %rd128, %rd455; + mul.wide.s32 %rd456, %r13448, 4; + add.s64 %rd380, %rd128, %rd456; + mul.wide.s32 %rd457, %r13447, 4; + add.s64 %rd381, %rd128, %rd457; + mul.wide.s32 %rd458, %r13446, 4; + add.s64 %rd382, %rd128, %rd458; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + shl.b32 %r4802, %r13443, 8; + add.s32 %r4803, %r4425, %r4802; + add.s32 %r4380, %r4803, %r613; + selp.b32 %r4381, 8, 0, %p83; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4380 + 0 ], [ %rd375 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4382, %r4380, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4382 + 0 ], [ %rd376 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4384, %r4380, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4384 + 0 ], [ %rd377 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4386, %r4380, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4386 + 0 ], [ %rd378 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4388, %r4380, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4388 + 0 ], [ %rd379 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4390, %r4380, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4390 + 0 ], [ %rd380 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4392, %r4380, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4392 + 0 ], [ %rd381 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4394, %r4380, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4394 + 0 ], [ %rd382 + 0 ], 0x8, %r4381; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4804, %r4661, %r4800; + add.s32 %r4396, %r4804, %r606; + // begin inline asm + cp.async.cg.shared.global [ %r4396 + 0 ], [ %rd1027 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4398, %r4396, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4398 + 0 ], [ %rd1026 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4400, %r4396, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4400 + 0 ], [ %rd1025 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4402, %r4396, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4402 + 0 ], [ %rd1024 + 0 ], 0x10, %r4373; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s64 %rd387, %rd129, %rd451; + add.s64 %rd388, %rd129, %rd452; + add.s64 %rd389, %rd129, %rd453; + add.s64 %rd390, %rd129, %rd454; + add.s64 %rd391, %rd129, %rd455; + add.s64 %rd392, %rd129, %rd456; + add.s64 %rd393, %rd129, %rd457; + add.s64 %rd394, %rd129, %rd458; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + add.s32 %r4805, %r4669, %r4802; + add.s32 %r4404, %r4805, %r613; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4404 + 0 ], [ %rd387 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4406, %r4404, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4406 + 0 ], [ %rd388 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4408, %r4404, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4408 + 0 ], [ %rd389 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4410, %r4404, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4410 + 0 ], [ %rd390 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4412, %r4404, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4412 + 0 ], [ %rd391 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4414, %r4404, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4414 + 0 ], [ %rd392 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4416, %r4404, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4416 + 0 ], [ %rd393 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4418, %r4404, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4418 + 0 ], [ %rd394 + 0 ], 0x8, %r4381; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + setp.ne.b32 %p185, %r685, %r1135; + mov.b32 %r13582, %r1135; + @%p185 bra $L__BB0_11; +$L__BB0_12: // %._crit_edge + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:28 + setp.lt.s32 %p218, %r604, 1; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:298:16 ] + // begin inline asm + // wait for regs: %r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517,%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp28: + .loc 1 601 18 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:18 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd508, %rd126, %rd507; + add.s64 %rd510, %rd126, %rd509; + add.s64 %rd512, %rd126, %rd511; + add.s64 %rd514, %rd126, %rd513; + .loc 1 601 49 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:601:49 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd459, %rd508, %rd307; + add.s64 %rd460, %rd510, %rd307; + add.s64 %rd461, %rd512, %rd307; + add.s64 %rd462, %rd514, %rd307; + .loc 1 602 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd517, %rd127, %rd516; + add.s64 %rd519, %rd127, %rd518; + add.s64 %rd521, %rd127, %rd520; + add.s64 %rd523, %rd127, %rd522; + .loc 1 602 51 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:602:51 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd471, %rd517, %rd307; + add.s64 %rd472, %rd519, %rd307; + add.s64 %rd473, %rd521, %rd307; + add.s64 %rd474, %rd523, %rd307; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5062 + 0 ], [ %rd459 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5064 + 0 ], [ %rd460 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5066 + 0 ], [ %rd461 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5068 + 0 ], [ %rd462 + 0 ], 0x10, %r5063; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd463, %rd128, %rd524; + cvt.s64.s32 %rd525, %r602; + add.s64 %rd526, %rd525, %rd130; + shl.b64 %rd527, %rd526, 2; + add.s64 %rd528, %rd128, %rd527; + add.s64 %rd464, %rd528, 32; + add.s64 %rd465, %rd528, 64; + add.s64 %rd466, %rd528, 96; + add.s64 %rd467, %rd528, 128; + add.s64 %rd468, %rd528, 160; + add.s64 %rd469, %rd528, 192; + add.s64 %rd470, %rd528, 224; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5070 + 0 ], [ %rd463 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5072 + 0 ], [ %rd464 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5074 + 0 ], [ %rd465 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5076 + 0 ], [ %rd466 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5078 + 0 ], [ %rd467 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5080 + 0 ], [ %rd468 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5082 + 0 ], [ %rd469 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5084 + 0 ], [ %rd470 + 0 ], 0x8, %r5071; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5086 + 0 ], [ %rd471 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5088 + 0 ], [ %rd472 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5090 + 0 ], [ %rd473 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5092 + 0 ], [ %rd474 + 0 ], 0x10, %r5063; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd475, %rd129, %rd524; + add.s64 %rd529, %rd129, %rd527; + add.s64 %rd476, %rd529, 32; + add.s64 %rd477, %rd529, 64; + add.s64 %rd478, %rd529, 96; + add.s64 %rd479, %rd529, 128; + add.s64 %rd480, %rd529, 160; + add.s64 %rd481, %rd529, 192; + add.s64 %rd482, %rd529, 224; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5094 + 0 ], [ %rd475 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5096 + 0 ], [ %rd476 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5098 + 0 ], [ %rd477 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5100 + 0 ], [ %rd478 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5102 + 0 ], [ %rd479 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5104 + 0 ], [ %rd480 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5106 + 0 ], [ %rd481 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5108 + 0 ], [ %rd482 + 0 ], 0x8, %r5071; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd1039, %rd459, 524288; + add.s64 %rd1038, %rd460, 524288; + add.s64 %rd1037, %rd461, 524288; + add.s64 %rd1036, %rd462, 524288; + .loc 1 627 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd1035, %rd471, 16384; + add.s64 %rd1034, %rd472, 16384; + add.s64 %rd1033, %rd473, 16384; + add.s64 %rd1032, %rd474, 16384; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r5110 + 0 ], [ %rd1039 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5112 + 0 ], [ %rd1038 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5114 + 0 ], [ %rd1037 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5116 + 0 ], [ %rd1036 + 0 ], 0x10, %r5111; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd487, %rd463, 256; + add.s64 %rd530, %rd527, 256; + add.s64 %rd531, %rd128, %rd530; + add.s64 %rd488, %rd531, 32; + add.s64 %rd489, %rd531, 64; + add.s64 %rd490, %rd531, 96; + add.s64 %rd491, %rd531, 128; + add.s64 %rd492, %rd531, 160; + add.s64 %rd493, %rd531, 192; + add.s64 %rd494, %rd531, 224; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5118 + 0 ], [ %rd487 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5120 + 0 ], [ %rd488 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5122 + 0 ], [ %rd489 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5124 + 0 ], [ %rd490 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5126 + 0 ], [ %rd491 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5128 + 0 ], [ %rd492 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5130 + 0 ], [ %rd493 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5132 + 0 ], [ %rd494 + 0 ], 0x8, %r5119; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5134 + 0 ], [ %rd1035 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5136 + 0 ], [ %rd1034 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5138 + 0 ], [ %rd1033 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5140 + 0 ], [ %rd1032 + 0 ], 0x10, %r5111; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd499, %rd475, 256; + add.s64 %rd532, %rd129, %rd530; + add.s64 %rd500, %rd532, 32; + add.s64 %rd501, %rd532, 64; + add.s64 %rd502, %rd532, 96; + add.s64 %rd503, %rd532, 128; + add.s64 %rd504, %rd532, 160; + add.s64 %rd505, %rd532, 192; + add.s64 %rd506, %rd532, 224; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5142 + 0 ], [ %rd499 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5144 + 0 ], [ %rd500 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5146 + 0 ], [ %rd501 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5148 + 0 ], [ %rd502 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5150 + 0 ], [ %rd503 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5152 + 0 ], [ %rd504 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5154 + 0 ], [ %rd505 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5156 + 0 ], [ %rd506 + 0 ], 0x8, %r5119; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + @%p218 bra $L__BB0_15; +// %bb.13: // %.lr.ph1691.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:28 + mov.b32 %r5712, 0; + mov.b32 %r13728, 1; + mov.b32 %r13727, -1; + mov.b32 %r13729, %r13727; + mov.b32 %r13730, %r13728; + mov.b32 %r13731, %r680; + mov.b32 %r13732, %r679; + mov.b32 %r13733, %r678; + mov.b32 %r13734, %r677; + mov.b32 %r13735, %r676; + mov.b32 %r13736, %r675; + mov.b32 %r13737, %r674; + mov.b32 %r13738, %r673; + mov.b32 %r13867, %r5712; +$L__BB0_14: // %.lr.ph1691 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + setp.lt.s32 %p259, %r13867, %r683; + setp.lt.s32 %p241, %r13867, %r684; + add.s32 %r7379, %r13727, 1; + setp.gt.s32 %p260, %r7379, 1; + selp.b32 %r13727, 0, %r7379, %p260; + add.s32 %r7380, %r13729, 1; + setp.gt.s32 %p261, %r7380, 2; + selp.b32 %r13729, 0, %r7380, %p261; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r7381, %r13729, 14; + add.s32 %r5714, %r2045, %r7381; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7383, %r13727, 8; + add.s32 %r7384, %r2045, 98304; + add.s32 %r7385, %r7384, %r7383; + add.s32 %r7386, %r7385, %r613; + ld.shared.v2.b32 {%r7387, %r7388}, [%r7386]; + ld.shared.v2.b32 {%r7389, %r7390}, [%r7386+32]; + ld.shared.v2.b32 {%r7391, %r7392}, [%r7386+64]; + ld.shared.v2.b32 {%r7393, %r7394}, [%r7386+96]; + ld.shared.v2.b32 {%r7395, %r7396}, [%r7386+128]; + ld.shared.v2.b32 {%r7397, %r7398}, [%r7386+160]; + ld.shared.v2.b32 {%r7399, %r7400}, [%r7386+192]; + ld.shared.v2.b32 {%r7401, %r7402}, [%r7386+224]; + .loc 1 675 26 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:675:26 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + setp.eq.f32 %p262, %r7387, 0fFF800000; + setp.eq.f32 %p263, %r7388, 0fFF800000; + setp.eq.f32 %p264, %r7389, 0fFF800000; + setp.eq.f32 %p265, %r7390, 0fFF800000; + setp.eq.f32 %p266, %r7391, 0fFF800000; + setp.eq.f32 %p267, %r7392, 0fFF800000; + setp.eq.f32 %p268, %r7393, 0fFF800000; + setp.eq.f32 %p269, %r7394, 0fFF800000; + setp.eq.f32 %p270, %r7395, 0fFF800000; + setp.eq.f32 %p271, %r7396, 0fFF800000; + setp.eq.f32 %p272, %r7397, 0fFF800000; + setp.eq.f32 %p273, %r7398, 0fFF800000; + setp.eq.f32 %p274, %r7399, 0fFF800000; + setp.eq.f32 %p275, %r7400, 0fFF800000; + setp.eq.f32 %p276, %r7401, 0fFF800000; + setp.eq.f32 %p277, %r7402, 0fFF800000; + .loc 1 675 46 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:675:46 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + selp.f32 %r7403, 0f00000000, %r7387, %p262; + selp.f32 %r7404, 0f00000000, %r7388, %p263; + selp.f32 %r7405, 0f00000000, %r7389, %p264; + selp.f32 %r7406, 0f00000000, %r7390, %p265; + selp.f32 %r7407, 0f00000000, %r7391, %p266; + selp.f32 %r7408, 0f00000000, %r7392, %p267; + selp.f32 %r7409, 0f00000000, %r7393, %p268; + selp.f32 %r7410, 0f00000000, %r7394, %p269; + selp.f32 %r7411, 0f00000000, %r7395, %p270; + selp.f32 %r7412, 0f00000000, %r7396, %p271; + selp.f32 %r7413, 0f00000000, %r7397, %p272; + selp.f32 %r7414, 0f00000000, %r7398, %p273; + selp.f32 %r7415, 0f00000000, %r7399, %p274; + selp.f32 %r7416, 0f00000000, %r7400, %p275; + selp.f32 %r7417, 0f00000000, %r7401, %p276; + selp.f32 %r7418, 0f00000000, %r7402, %p277; + .loc 1 676 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:676:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shfl.sync.idx.b32 %r7419, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r7420, %r7419, 11; + and.b32 %r7421, %r7420, 8192; + add.s32 %r5673, %r2045, 99328; + add.s32 %r7422, %r7421, %r5673; + bfe.u32 %r7423, %r7422, 4, 14; + cvt.u64.u32 %rd603, %r7423; + or.b64 %rd533, %rd603, 4611686293372403712; + bfe.u32 %r7424, %r5714, 4, 14; + cvt.u64.u32 %rd604, %r7424; + or.b64 %rd534, %rd604, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd533, %rd534, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r7425, %r7421, 32; + add.s32 %r7426, %r7425, %r5673; + bfe.u32 %r7427, %r7426, 4, 14; + cvt.u64.u32 %rd605, %r7427; + or.b64 %rd535, %rd605, 4611686293372403712; + add.s32 %r7428, %r5714, 32; + bfe.u32 %r7429, %r7428, 4, 14; + cvt.u64.u32 %rd606, %r7429; + or.b64 %rd536, %rd606, 4611686293338849280; + mov.pred %p219, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd535, %rd536, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7430, %r7421, 64; + add.s32 %r7431, %r7430, %r5673; + bfe.u32 %r7432, %r7431, 4, 14; + cvt.u64.u32 %rd607, %r7432; + or.b64 %rd537, %rd607, 4611686293372403712; + add.s32 %r7433, %r5714, 64; + bfe.u32 %r7434, %r7433, 4, 14; + cvt.u64.u32 %rd608, %r7434; + or.b64 %rd538, %rd608, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd537, %rd538, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7435, %r7421, 96; + add.s32 %r7436, %r7435, %r5673; + bfe.u32 %r7437, %r7436, 4, 14; + cvt.u64.u32 %rd609, %r7437; + or.b64 %rd539, %rd609, 4611686293372403712; + add.s32 %r7438, %r5714, 96; + bfe.u32 %r7439, %r7438, 4, 14; + cvt.u64.u32 %rd610, %r7439; + or.b64 %rd540, %rd610, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd539, %rd540, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7440, %r7421, 16384; + add.s32 %r7441, %r7440, %r5673; + bfe.u32 %r7442, %r7441, 4, 14; + cvt.u64.u32 %rd611, %r7442; + or.b64 %rd541, %rd611, 4611686293372403712; + add.s32 %r7443, %r5714, 8192; + bfe.u32 %r7444, %r7443, 4, 14; + cvt.u64.u32 %rd612, %r7444; + or.b64 %rd542, %rd612, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd541, %rd542, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7445, %r7421, 16416; + add.s32 %r7446, %r7445, %r5673; + bfe.u32 %r7447, %r7446, 4, 14; + cvt.u64.u32 %rd613, %r7447; + or.b64 %rd543, %rd613, 4611686293372403712; + add.s32 %r7448, %r5714, 8224; + bfe.u32 %r7449, %r7448, 4, 14; + cvt.u64.u32 %rd614, %r7449; + or.b64 %rd544, %rd614, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd543, %rd544, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7450, %r7421, 16448; + add.s32 %r7451, %r7450, %r5673; + bfe.u32 %r7452, %r7451, 4, 14; + cvt.u64.u32 %rd615, %r7452; + or.b64 %rd545, %rd615, 4611686293372403712; + add.s32 %r7453, %r5714, 8256; + bfe.u32 %r7454, %r7453, 4, 14; + cvt.u64.u32 %rd616, %r7454; + or.b64 %rd546, %rd616, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd545, %rd546, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7455, %r7421, 16480; + add.s32 %r7456, %r7455, %r5673; + bfe.u32 %r7457, %r7456, 4, 14; + cvt.u64.u32 %rd617, %r7457; + or.b64 %rd547, %rd617, 4611686293372403712; + add.s32 %r7458, %r5714, 8288; + bfe.u32 %r7459, %r7458, 4, 14; + cvt.u64.u32 %rd618, %r7459; + or.b64 %rd548, %rd618, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd547, %rd548, %p219, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5674, %r5712; + mov.b32 %r5675, %r5712; + mov.b32 %r5677, %r5712; + mov.b32 %r5678, %r5712; + mov.b32 %r5676, %r5714; + // begin inline asm + // wait for regs: %r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288,%r5673,%r5674,%r5675,%r5676,%r5677,%r5678 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:678:15 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7460, %r5257, 0f3DB504F3; + mul.f32 %r7461, %r5258, 0f3DB504F3; + mul.f32 %r7462, %r5259, 0f3DB504F3; + mul.f32 %r7463, %r5260, 0f3DB504F3; + mul.f32 %r7464, %r5261, 0f3DB504F3; + mul.f32 %r7465, %r5262, 0f3DB504F3; + mul.f32 %r7466, %r5263, 0f3DB504F3; + mul.f32 %r7467, %r5264, 0f3DB504F3; + mul.f32 %r7468, %r5265, 0f3DB504F3; + mul.f32 %r7469, %r5266, 0f3DB504F3; + mul.f32 %r7470, %r5267, 0f3DB504F3; + mul.f32 %r7471, %r5268, 0f3DB504F3; + mul.f32 %r7472, %r5269, 0f3DB504F3; + mul.f32 %r7473, %r5270, 0f3DB504F3; + mul.f32 %r7474, %r5271, 0f3DB504F3; + mul.f32 %r7475, %r5272, 0f3DB504F3; + mul.f32 %r7476, %r5273, 0f3DB504F3; + mul.f32 %r7477, %r5274, 0f3DB504F3; + mul.f32 %r7478, %r5275, 0f3DB504F3; + mul.f32 %r7479, %r5276, 0f3DB504F3; + mul.f32 %r7480, %r5277, 0f3DB504F3; + mul.f32 %r7481, %r5278, 0f3DB504F3; + mul.f32 %r7482, %r5279, 0f3DB504F3; + mul.f32 %r7483, %r5280, 0f3DB504F3; + mul.f32 %r7484, %r5281, 0f3DB504F3; + mul.f32 %r7485, %r5282, 0f3DB504F3; + mul.f32 %r7486, %r5283, 0f3DB504F3; + mul.f32 %r7487, %r5284, 0f3DB504F3; + mul.f32 %r7488, %r5285, 0f3DB504F3; + mul.f32 %r7489, %r5286, 0f3DB504F3; + mul.f32 %r7490, %r5287, 0f3DB504F3; + mul.f32 %r7491, %r5288, 0f3DB504F3; + .loc 1 740 40 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:740:40 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + neg.f32 %r7492, %r7403; + fma.rn.f32 %r7493, %r7460, 0f3FB8AA3B, %r7492; + neg.f32 %r7494, %r7404; + fma.rn.f32 %r7495, %r7461, 0f3FB8AA3B, %r7494; + fma.rn.f32 %r7496, %r7462, 0f3FB8AA3B, %r7492; + fma.rn.f32 %r7497, %r7463, 0f3FB8AA3B, %r7494; + neg.f32 %r7498, %r7405; + fma.rn.f32 %r7499, %r7464, 0f3FB8AA3B, %r7498; + neg.f32 %r7500, %r7406; + fma.rn.f32 %r7501, %r7465, 0f3FB8AA3B, %r7500; + fma.rn.f32 %r7502, %r7466, 0f3FB8AA3B, %r7498; + fma.rn.f32 %r7503, %r7467, 0f3FB8AA3B, %r7500; + neg.f32 %r7504, %r7407; + fma.rn.f32 %r7505, %r7468, 0f3FB8AA3B, %r7504; + neg.f32 %r7506, %r7408; + fma.rn.f32 %r7507, %r7469, 0f3FB8AA3B, %r7506; + fma.rn.f32 %r7508, %r7470, 0f3FB8AA3B, %r7504; + fma.rn.f32 %r7509, %r7471, 0f3FB8AA3B, %r7506; + neg.f32 %r7510, %r7409; + fma.rn.f32 %r7511, %r7472, 0f3FB8AA3B, %r7510; + neg.f32 %r7512, %r7410; + fma.rn.f32 %r7513, %r7473, 0f3FB8AA3B, %r7512; + fma.rn.f32 %r7514, %r7474, 0f3FB8AA3B, %r7510; + fma.rn.f32 %r7515, %r7475, 0f3FB8AA3B, %r7512; + neg.f32 %r7516, %r7411; + fma.rn.f32 %r7517, %r7476, 0f3FB8AA3B, %r7516; + neg.f32 %r7518, %r7412; + fma.rn.f32 %r7519, %r7477, 0f3FB8AA3B, %r7518; + fma.rn.f32 %r7520, %r7478, 0f3FB8AA3B, %r7516; + fma.rn.f32 %r7521, %r7479, 0f3FB8AA3B, %r7518; + neg.f32 %r7522, %r7413; + fma.rn.f32 %r7523, %r7480, 0f3FB8AA3B, %r7522; + neg.f32 %r7524, %r7414; + fma.rn.f32 %r7525, %r7481, 0f3FB8AA3B, %r7524; + fma.rn.f32 %r7526, %r7482, 0f3FB8AA3B, %r7522; + fma.rn.f32 %r7527, %r7483, 0f3FB8AA3B, %r7524; + neg.f32 %r7528, %r7415; + fma.rn.f32 %r7529, %r7484, 0f3FB8AA3B, %r7528; + neg.f32 %r7530, %r7416; + fma.rn.f32 %r7531, %r7485, 0f3FB8AA3B, %r7530; + fma.rn.f32 %r7532, %r7486, 0f3FB8AA3B, %r7528; + fma.rn.f32 %r7533, %r7487, 0f3FB8AA3B, %r7530; + neg.f32 %r7534, %r7417; + fma.rn.f32 %r7535, %r7488, 0f3FB8AA3B, %r7534; + neg.f32 %r7536, %r7418; + fma.rn.f32 %r7537, %r7489, 0f3FB8AA3B, %r7536; + fma.rn.f32 %r7538, %r7490, 0f3FB8AA3B, %r7534; + fma.rn.f32 %r7539, %r7491, 0f3FB8AA3B, %r7536; + .loc 1 740 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:740:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + ex2.approx.ftz.f32 %r7540, %r7493; + ex2.approx.ftz.f32 %r7541, %r7495; + ex2.approx.ftz.f32 %r7542, %r7496; + ex2.approx.ftz.f32 %r7543, %r7497; + ex2.approx.ftz.f32 %r7544, %r7499; + ex2.approx.ftz.f32 %r7545, %r7501; + ex2.approx.ftz.f32 %r7546, %r7502; + ex2.approx.ftz.f32 %r7547, %r7503; + ex2.approx.ftz.f32 %r7548, %r7505; + ex2.approx.ftz.f32 %r7549, %r7507; + ex2.approx.ftz.f32 %r7550, %r7508; + ex2.approx.ftz.f32 %r7551, %r7509; + ex2.approx.ftz.f32 %r7552, %r7511; + ex2.approx.ftz.f32 %r7553, %r7513; + ex2.approx.ftz.f32 %r7554, %r7514; + ex2.approx.ftz.f32 %r7555, %r7515; + ex2.approx.ftz.f32 %r7556, %r7517; + ex2.approx.ftz.f32 %r7557, %r7519; + ex2.approx.ftz.f32 %r7558, %r7520; + ex2.approx.ftz.f32 %r7559, %r7521; + ex2.approx.ftz.f32 %r7560, %r7523; + ex2.approx.ftz.f32 %r7561, %r7525; + ex2.approx.ftz.f32 %r7562, %r7526; + ex2.approx.ftz.f32 %r7563, %r7527; + ex2.approx.ftz.f32 %r7564, %r7529; + ex2.approx.ftz.f32 %r7565, %r7531; + ex2.approx.ftz.f32 %r7566, %r7532; + ex2.approx.ftz.f32 %r7567, %r7533; + ex2.approx.ftz.f32 %r7568, %r7535; + ex2.approx.ftz.f32 %r7569, %r7537; + ex2.approx.ftz.f32 %r7570, %r7538; + ex2.approx.ftz.f32 %r7571, %r7539; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7572, %r2045, 49152; + add.s32 %r6760, %r7572, %r7381; + .loc 1 744 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:744:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r5845, %r7541, %r7540; + cvt.rn.bf16x2.f32 %r5846, %r7543, %r7542; + cvt.rn.bf16x2.f32 %r5847, %r7545, %r7544; + cvt.rn.bf16x2.f32 %r5848, %r7547, %r7546; + cvt.rn.bf16x2.f32 %r5977, %r7549, %r7548; + cvt.rn.bf16x2.f32 %r5978, %r7551, %r7550; + cvt.rn.bf16x2.f32 %r5979, %r7553, %r7552; + cvt.rn.bf16x2.f32 %r5980, %r7555, %r7554; + cvt.rn.bf16x2.f32 %r6109, %r7557, %r7556; + cvt.rn.bf16x2.f32 %r6110, %r7559, %r7558; + cvt.rn.bf16x2.f32 %r6111, %r7561, %r7560; + cvt.rn.bf16x2.f32 %r6112, %r7563, %r7562; + cvt.rn.bf16x2.f32 %r6241, %r7565, %r7564; + cvt.rn.bf16x2.f32 %r6242, %r7567, %r7566; + cvt.rn.bf16x2.f32 %r6243, %r7569, %r7568; + cvt.rn.bf16x2.f32 %r6244, %r7571, %r7570; + .loc 1 744 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:744:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + wgmma.fence.sync.aligned; + bfe.u32 %r7573, %r6760, 4, 14; + cvt.u64.u32 %rd619, %r7573; + or.b64 %rd549, %rd619, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r5845,%r5846,%r5847,%r5848}, %rd549, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7574, %r6760, 2048; + bfe.u32 %r7575, %r7574, 4, 14; + cvt.u64.u32 %rd620, %r7575; + or.b64 %rd550, %rd620, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r5977,%r5978,%r5979,%r5980}, %rd550, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7576, %r6760, 4096; + bfe.u32 %r7577, %r7576, 4, 14; + cvt.u64.u32 %rd621, %r7577; + or.b64 %rd551, %rd621, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r6109,%r6110,%r6111,%r6112}, %rd551, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7578, %r6760, 6144; + bfe.u32 %r7579, %r7578, 4, 14; + cvt.u64.u32 %rd622, %r7579; + or.b64 %rd552, %rd622, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r6241,%r6242,%r6243,%r6244}, %rd552, %p219, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7580, %r2045, 98816; + add.s32 %r7581, %r7580, %r7383; + add.s32 %r7582, %r7581, %r613; + .loc 1 750 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:750:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r6757, %r2045, 132096; + add.s32 %r7583, %r7421, %r6757; + bfe.u32 %r7584, %r7583, 4, 14; + cvt.u64.u32 %rd623, %r7584; + or.b64 %rd553, %rd623, 4611686293372403712; + add.s32 %r7585, %r7425, %r6757; + bfe.u32 %r7586, %r7585, 4, 14; + cvt.u64.u32 %rd624, %r7586; + or.b64 %rd555, %rd624, 4611686293372403712; + add.s32 %r7587, %r6760, 32; + bfe.u32 %r7588, %r7587, 4, 14; + cvt.u64.u32 %rd625, %r7588; + or.b64 %rd556, %rd625, 4611686293338849280; + add.s32 %r7589, %r7430, %r6757; + bfe.u32 %r7590, %r7589, 4, 14; + cvt.u64.u32 %rd626, %r7590; + or.b64 %rd557, %rd626, 4611686293372403712; + add.s32 %r7591, %r6760, 64; + bfe.u32 %r7592, %r7591, 4, 14; + cvt.u64.u32 %rd627, %r7592; + or.b64 %rd558, %rd627, 4611686293338849280; + add.s32 %r7593, %r7435, %r6757; + bfe.u32 %r7594, %r7593, 4, 14; + cvt.u64.u32 %rd628, %r7594; + or.b64 %rd559, %rd628, 4611686293372403712; + add.s32 %r7595, %r6760, 96; + bfe.u32 %r7596, %r7595, 4, 14; + cvt.u64.u32 %rd629, %r7596; + or.b64 %rd560, %rd629, 4611686293338849280; + add.s32 %r7597, %r7440, %r6757; + bfe.u32 %r7598, %r7597, 4, 14; + cvt.u64.u32 %rd630, %r7598; + or.b64 %rd561, %rd630, 4611686293372403712; + add.s32 %r7599, %r6760, 8192; + bfe.u32 %r7600, %r7599, 4, 14; + cvt.u64.u32 %rd631, %r7600; + or.b64 %rd562, %rd631, 4611686293338849280; + add.s32 %r7601, %r7445, %r6757; + bfe.u32 %r7602, %r7601, 4, 14; + cvt.u64.u32 %rd632, %r7602; + or.b64 %rd563, %rd632, 4611686293372403712; + add.s32 %r7603, %r6760, 8224; + bfe.u32 %r7604, %r7603, 4, 14; + cvt.u64.u32 %rd633, %r7604; + or.b64 %rd564, %rd633, 4611686293338849280; + add.s32 %r7605, %r7450, %r6757; + bfe.u32 %r7606, %r7605, 4, 14; + cvt.u64.u32 %rd634, %r7606; + or.b64 %rd565, %rd634, 4611686293372403712; + add.s32 %r7607, %r6760, 8256; + bfe.u32 %r7608, %r7607, 4, 14; + cvt.u64.u32 %rd635, %r7608; + or.b64 %rd566, %rd635, 4611686293338849280; + add.s32 %r7609, %r7455, %r6757; + bfe.u32 %r7610, %r7609, 4, 14; + cvt.u64.u32 %rd636, %r7610; + or.b64 %rd567, %rd636, 4611686293372403712; + add.s32 %r7611, %r6760, 8288; + bfe.u32 %r7612, %r7611, 4, 14; + cvt.u64.u32 %rd637, %r7612; + or.b64 %rd568, %rd637, 4611686293338849280; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + ld.shared.v2.b32 {%r7613, %r7614}, [%r7582+224]; + ld.shared.v2.b32 {%r7615, %r7616}, [%r7582+192]; + ld.shared.v2.b32 {%r7617, %r7618}, [%r7582+160]; + ld.shared.v2.b32 {%r7619, %r7620}, [%r7582+128]; + ld.shared.v2.b32 {%r7621, %r7622}, [%r7582+96]; + ld.shared.v2.b32 {%r7623, %r7624}, [%r7582+64]; + ld.shared.v2.b32 {%r7625, %r7626}, [%r7582+32]; + ld.shared.v2.b32 {%r7627, %r7628}, [%r7582]; + .loc 1 750 20 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:750:20 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd553, %rd549, 0, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd555, %rd556, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd557, %rd558, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd559, %rd560, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd561, %rd562, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd563, %rd564, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd565, %rd566, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd567, %rd568, %p219, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r6762, %r5712; + mov.b32 %r6758, %r5712; + mov.b32 %r6759, %r5712; + mov.b32 %r6761, %r5712; + // begin inline asm + // wait for regs: %r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372,%r6757,%r6758,%r6759,%r6760,%r6761,%r6762 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7629, %r6344, %r7628; + sub.f32 %r7630, %r6343, %r7627; + sub.f32 %r7631, %r6348, %r7626; + sub.f32 %r7632, %r6347, %r7625; + sub.f32 %r7633, %r6352, %r7624; + sub.f32 %r7634, %r6351, %r7623; + sub.f32 %r7635, %r6356, %r7622; + sub.f32 %r7636, %r6355, %r7621; + sub.f32 %r7637, %r6360, %r7620; + sub.f32 %r7638, %r6359, %r7619; + sub.f32 %r7639, %r6364, %r7618; + sub.f32 %r7640, %r6363, %r7617; + sub.f32 %r7641, %r6368, %r7616; + sub.f32 %r7642, %r6367, %r7615; + sub.f32 %r7643, %r6372, %r7614; + sub.f32 %r7644, %r6371, %r7613; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7645, %r7542, %r7630; + mul.f32 %r7646, %r7543, %r7629; + mul.f32 %r7647, %r7546, %r7632; + mul.f32 %r7648, %r7547, %r7631; + mul.f32 %r7649, %r7550, %r7634; + mul.f32 %r7650, %r7551, %r7633; + mul.f32 %r7651, %r7554, %r7636; + mul.f32 %r7652, %r7555, %r7635; + mul.f32 %r7653, %r7558, %r7638; + mul.f32 %r7654, %r7559, %r7637; + mul.f32 %r7655, %r7562, %r7640; + mul.f32 %r7656, %r7563, %r7639; + mul.f32 %r7657, %r7566, %r7642; + mul.f32 %r7658, %r7567, %r7641; + mul.f32 %r7659, %r7570, %r7644; + mul.f32 %r7660, %r7571, %r7643; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7661, %r6342, %r7628; + sub.f32 %r7662, %r6341, %r7627; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7663, %r7540, %r7662; + mul.f32 %r7664, %r7541, %r7661; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r6929, %r7664, %r7663; + cvt.rn.bf16x2.f32 %r6930, %r7646, %r7645; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7665, %r6346, %r7626; + sub.f32 %r7666, %r6345, %r7625; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7667, %r7544, %r7666; + mul.f32 %r7668, %r7545, %r7665; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r6931, %r7668, %r7667; + cvt.rn.bf16x2.f32 %r6932, %r7648, %r7647; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7669, %r6350, %r7624; + sub.f32 %r7670, %r6349, %r7623; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7671, %r7548, %r7670; + mul.f32 %r7672, %r7549, %r7669; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7061, %r7672, %r7671; + cvt.rn.bf16x2.f32 %r7062, %r7650, %r7649; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7673, %r6354, %r7622; + sub.f32 %r7674, %r6353, %r7621; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7675, %r7552, %r7674; + mul.f32 %r7676, %r7553, %r7673; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7063, %r7676, %r7675; + cvt.rn.bf16x2.f32 %r7064, %r7652, %r7651; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7677, %r6358, %r7620; + sub.f32 %r7678, %r6357, %r7619; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7679, %r7556, %r7678; + mul.f32 %r7680, %r7557, %r7677; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7193, %r7680, %r7679; + cvt.rn.bf16x2.f32 %r7194, %r7654, %r7653; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7681, %r6362, %r7618; + sub.f32 %r7682, %r6361, %r7617; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7683, %r7560, %r7682; + mul.f32 %r7684, %r7561, %r7681; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7195, %r7684, %r7683; + cvt.rn.bf16x2.f32 %r7196, %r7656, %r7655; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7685, %r6366, %r7616; + sub.f32 %r7686, %r6365, %r7615; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7687, %r7564, %r7686; + mul.f32 %r7688, %r7565, %r7685; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7325, %r7688, %r7687; + cvt.rn.bf16x2.f32 %r7326, %r7658, %r7657; + .loc 1 751 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.f32 %r7689, %r6370, %r7614; + sub.f32 %r7690, %r6369, %r7613; + .loc 1 751 16 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:751:16 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.f32 %r7691, %r7568, %r7690; + mul.f32 %r7692, %r7569, %r7689; + .loc 1 775 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + cvt.rn.bf16x2.f32 %r7327, %r7692, %r7691; + cvt.rn.bf16x2.f32 %r7328, %r7660, %r7659; + .loc 1 775 43 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:775:43 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r6929,%r6930,%r6931,%r6932}, %rd534, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7693, %r5714, 2048; + bfe.u32 %r7694, %r7693, 4, 14; + cvt.u64.u32 %rd638, %r7694; + or.b64 %rd570, %rd638, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7061,%r7062,%r7063,%r7064}, %rd570, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7695, %r5714, 4096; + bfe.u32 %r7696, %r7695, 4, 14; + cvt.u64.u32 %rd639, %r7696; + or.b64 %rd571, %rd639, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7193,%r7194,%r7195,%r7196}, %rd571, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7697, %r5714, 6144; + bfe.u32 %r7698, %r7697, 4, 14; + cvt.u64.u32 %rd640, %r7698; + or.b64 %rd572, %rd640, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7325,%r7326,%r7327,%r7328}, %rd572, %p219, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r1674, %r13867, 1; + .loc 1 788 33 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:788:33 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shr.u32 %r7699, %r1674, 1; + .loc 1 789 38 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:38 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mad.wide.u32 %rd574, %r7699, 4, %rd219; + .loc 1 789 24 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:789:24 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + mov.u64 %rd573, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd573, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r7329, 0x0; + @%p241 ld.global.L1::evict_last.L2::cache_hint.b32 { %r7329 }, [ %rd574 + 0 ], %rd573; + // end inline asm + .loc 1 790 109 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:109 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7700, %r7699, 1; + .loc 1 790 113 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:113 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + setp.lt.s32 %p278, %r7700, %r2015; + .loc 1 790 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:55 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd577, %rd574, 4; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + and.pred %p242, %p241, %p278; + .loc 1 790 25 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:790:25 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + // begin inline asm + mov.u64 %rd576, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd576, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r7330, 0x0; + @%p242 ld.global.L1::evict_last.L2::cache_hint.b32 { %r7330 }, [ %rd577 + 0 ], %rd576; + // end inline asm + .loc 1 791 35 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:791:35 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + and.b32 %r7701, %r13867, 1; + .loc 1 792 34 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:34 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + sub.s32 %r7702, %r7330, %r7329; + .loc 1 792 48 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:48 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7703, %r7702, 7; + .loc 1 792 63 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:792:63 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7704, %r7703, -64; + .loc 1 793 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + xor.b32 %r7705, %r7701, 1; + .loc 1 793 61 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:61 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7706, %r7701, 6; + .loc 1 793 42 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:793:42 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mad.lo.s32 %r7707, %r7704, %r7705, %r7706; + .loc 1 626 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7708, %r7707, 12; + .loc 1 626 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:626:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.wide.s32 %rd641, %r7708, 2; + add.s64 %rd1039, %rd1039, %rd641; + add.s64 %rd1038, %rd1038, %rd641; + add.s64 %rd1037, %rd1037, %rd641; + add.s64 %rd1036, %rd1036, %rd641; + .loc 1 627 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7709, %r7707, 7; + .loc 1 627 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:627:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.wide.s32 %rd642, %r7709, 2; + add.s64 %rd1035, %rd1035, %rd642; + add.s64 %rd1034, %rd1034, %rd642; + add.s64 %rd1033, %rd1033, %rd642; + add.s64 %rd1032, %rd1032, %rd642; + .loc 1 628 19 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:628:19 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r13738, %r7707, %r13738; + add.s32 %r13737, %r7707, %r13737; + add.s32 %r13736, %r7707, %r13736; + add.s32 %r13735, %r7707, %r13735; + add.s32 %r13734, %r7707, %r13734; + add.s32 %r13733, %r7707, %r13733; + add.s32 %r13732, %r7707, %r13732; + add.s32 %r13731, %r7707, %r13731; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7710, %r13728, 1; + setp.gt.s32 %p279, %r7710, 1; + selp.b32 %r13728, 0, %r7710, %p279; + add.s32 %r7711, %r13730, 1; + setp.gt.s32 %p280, %r7711, 2; + selp.b32 %r13730, 0, %r7711, %p280; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7712, %r13730, 14; + add.s32 %r7713, %r2045, %r7712; + bar.sync 0; + add.s32 %r7331, %r7713, %r606; + selp.b32 %r7332, 16, 0, %p259; + // begin inline asm + cp.async.cg.shared.global [ %r7331 + 0 ], [ %rd1039 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7333, %r7331, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r7333 + 0 ], [ %rd1038 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7335, %r7331, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r7335 + 0 ], [ %rd1037 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7337, %r7331, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r7337 + 0 ], [ %rd1036 + 0 ], 0x10, %r7332; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + mul.wide.s32 %rd643, %r13738, 4; + add.s64 %rd583, %rd128, %rd643; + mul.wide.s32 %rd644, %r13737, 4; + add.s64 %rd584, %rd128, %rd644; + mul.wide.s32 %rd645, %r13736, 4; + add.s64 %rd585, %rd128, %rd645; + mul.wide.s32 %rd646, %r13735, 4; + add.s64 %rd586, %rd128, %rd646; + mul.wide.s32 %rd647, %r13734, 4; + add.s64 %rd587, %rd128, %rd647; + mul.wide.s32 %rd648, %r13733, 4; + add.s64 %rd588, %rd128, %rd648; + mul.wide.s32 %rd649, %r13732, 4; + add.s64 %rd589, %rd128, %rd649; + mul.wide.s32 %rd650, %r13731, 4; + add.s64 %rd590, %rd128, %rd650; + .loc 1 672 22 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:672:22 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + shl.b32 %r7714, %r13728, 8; + add.s32 %r7715, %r7384, %r7714; + add.s32 %r7339, %r7715, %r613; + selp.b32 %r7340, 8, 0, %p259; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7339 + 0 ], [ %rd583 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7341, %r7339, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7341 + 0 ], [ %rd584 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7343, %r7339, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7343 + 0 ], [ %rd585 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7345, %r7339, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7345 + 0 ], [ %rd586 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7347, %r7339, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7347 + 0 ], [ %rd587 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7349, %r7339, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7349 + 0 ], [ %rd588 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7351, %r7339, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7351 + 0 ], [ %rd589 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7353, %r7339, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7353 + 0 ], [ %rd590 + 0 ], 0x8, %r7340; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:835:23 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7716, %r7572, %r7712; + add.s32 %r7355, %r7716, %r606; + // begin inline asm + cp.async.cg.shared.global [ %r7355 + 0 ], [ %rd1035 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7357, %r7355, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r7357 + 0 ], [ %rd1034 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7359, %r7355, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r7359 + 0 ], [ %rd1033 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7361, %r7355, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r7361 + 0 ], [ %rd1032 + 0 ], 0x10, %r7332; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:29 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s64 %rd595, %rd129, %rd643; + add.s64 %rd596, %rd129, %rd644; + add.s64 %rd597, %rd129, %rd645; + add.s64 %rd598, %rd129, %rd646; + add.s64 %rd599, %rd129, %rd647; + add.s64 %rd600, %rd129, %rd648; + add.s64 %rd601, %rd129, %rd649; + add.s64 %rd602, %rd129, %rd650; + .loc 1 746 21 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:746:21 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + add.s32 %r7717, %r7580, %r7714; + add.s32 %r7363, %r7717, %r613; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7363 + 0 ], [ %rd595 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7365, %r7363, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7365 + 0 ], [ %rd596 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7367, %r7363, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7367 + 0 ], [ %rd597 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7369, %r7363, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7369 + 0 ], [ %rd598 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7371, %r7363, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7371 + 0 ], [ %rd599 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7373, %r7363, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7373 + 0 ], [ %rd600 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7375, %r7363, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7375 + 0 ], [ %rd601 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7377, %r7363, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7377 + 0 ], [ %rd602 + 0 ], 0x8, %r7340; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:610:28 @[ c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:318:20 ] + setp.ne.b32 %p281, %r686, %r1674; + mov.b32 %r13867, %r1674; + @%p281 bra $L__BB0_14; + bra.uni $L__BB0_15; +$L__tmp29: +$L__BB0_16: + .loc 1 0 28 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:0:28 + cvt.u32.u64 %r8118, %rd87; + cvt.u32.u64 %r8119, %rd86; + cvt.u32.u64 %r8120, %rd85; + cvt.u32.u64 %r8121, %rd84; + cvt.u32.u64 %r8122, %rd83; + cvt.u32.u64 %r8123, %rd82; + cvt.u32.u64 %r8124, %rd81; + cvt.u32.u64 %r8125, %rd80; + cvt.u32.u64 %r8126, %rd79; + .loc 1 323 23 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:323:23 + shl.b64 %rd667, %rd79, 1; + add.s64 %rd668, %rd3, %rd667; + shl.b64 %rd669, %rd80, 1; + add.s64 %rd670, %rd3, %rd669; + shl.b64 %rd671, %rd81, 1; + add.s64 %rd672, %rd3, %rd671; + shl.b64 %rd673, %rd82, 1; + add.s64 %rd674, %rd3, %rd673; + shl.b64 %rd675, %rd83, 1; + add.s64 %rd676, %rd3, %rd675; + shl.b64 %rd677, %rd84, 1; + add.s64 %rd678, %rd3, %rd677; + shl.b64 %rd679, %rd85, 1; + add.s64 %rd680, %rd3, %rd679; + shl.b64 %rd681, %rd86, 1; + add.s64 %rd682, %rd3, %rd681; + .loc 1 323 55 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:323:55 + add.s64 %rd651, %rd668, %rd307; + add.s64 %rd652, %rd670, %rd307; + add.s64 %rd653, %rd672, %rd307; + add.s64 %rd654, %rd674, %rd307; + add.s64 %rd655, %rd676, %rd307; + add.s64 %rd656, %rd678, %rd307; + add.s64 %rd657, %rd680, %rd307; + add.s64 %rd658, %rd682, %rd307; + .loc 1 330 30 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:330:30 + cvt.rn.bf16x2.f32 %r8127, %r13455, %r13454; + cvt.rn.bf16x2.f32 %r8128, %r13457, %r13456; + cvt.rn.bf16x2.f32 %r8129, %r13459, %r13458; + cvt.rn.bf16x2.f32 %r8130, %r13461, %r13460; + cvt.rn.bf16x2.f32 %r8131, %r13463, %r13462; + cvt.rn.bf16x2.f32 %r8132, %r13465, %r13464; + cvt.rn.bf16x2.f32 %r8133, %r13467, %r13466; + cvt.rn.bf16x2.f32 %r8134, %r13469, %r13468; + cvt.rn.bf16x2.f32 %r8135, %r13471, %r13470; + cvt.rn.bf16x2.f32 %r8136, %r13473, %r13472; + cvt.rn.bf16x2.f32 %r8137, %r13475, %r13474; + cvt.rn.bf16x2.f32 %r8138, %r13477, %r13476; + cvt.rn.bf16x2.f32 %r8139, %r13479, %r13478; + cvt.rn.bf16x2.f32 %r8140, %r13481, %r13480; + cvt.rn.bf16x2.f32 %r8141, %r13483, %r13482; + cvt.rn.bf16x2.f32 %r8142, %r13485, %r13484; + cvt.rn.bf16x2.f32 %r8143, %r13487, %r13486; + cvt.rn.bf16x2.f32 %r8144, %r13489, %r13488; + cvt.rn.bf16x2.f32 %r8145, %r13491, %r13490; + cvt.rn.bf16x2.f32 %r8146, %r13493, %r13492; + cvt.rn.bf16x2.f32 %r8147, %r13495, %r13494; + cvt.rn.bf16x2.f32 %r8148, %r13497, %r13496; + cvt.rn.bf16x2.f32 %r8149, %r13499, %r13498; + cvt.rn.bf16x2.f32 %r8150, %r13501, %r13500; + cvt.rn.bf16x2.f32 %r8151, %r13503, %r13502; + cvt.rn.bf16x2.f32 %r8152, %r13505, %r13504; + cvt.rn.bf16x2.f32 %r8153, %r13507, %r13506; + cvt.rn.bf16x2.f32 %r8154, %r13509, %r13508; + cvt.rn.bf16x2.f32 %r8155, %r13511, %r13510; + cvt.rn.bf16x2.f32 %r8156, %r13513, %r13512; + cvt.rn.bf16x2.f32 %r8157, %r13515, %r13514; + cvt.rn.bf16x2.f32 %r8158, %r13517, %r13516; + shl.b32 %r8159, %r599, 13; + shl.b32 %r8160, %r6, 5; + and.b32 %r8161, %r8160, 7264; + and.b32 %r8162, %r6, 24; + shl.b32 %r8163, %r8162, 4; + shl.b32 %r8164, %r6, 2; + and.b32 %r8165, %r8164, 16; + or.b32 %r8166, %r8159, %r8165; + or.b32 %r8167, %r8161, %r8163; + or.b32 %r8168, %r8166, %r8167; + add.s32 %r8170, %r2045, %r8168; + st.shared.v4.b32 [%r8170], {%r8127, %r8129, %r8131, %r8133}; + st.shared.v4.b32 [%r8170+512], {%r8128, %r8130, %r8132, %r8134}; + xor.b32 %r8171, %r8168, 32; + add.s32 %r8172, %r2045, %r8171; + st.shared.v4.b32 [%r8172], {%r8135, %r8137, %r8139, %r8141}; + st.shared.v4.b32 [%r8172+512], {%r8136, %r8138, %r8140, %r8142}; + xor.b32 %r8173, %r8168, 64; + add.s32 %r8174, %r2045, %r8173; + st.shared.v4.b32 [%r8174], {%r8143, %r8145, %r8147, %r8149}; + st.shared.v4.b32 [%r8174+512], {%r8144, %r8146, %r8148, %r8150}; + xor.b32 %r8175, %r8168, 96; + add.s32 %r8176, %r2045, %r8175; + st.shared.v4.b32 [%r8176], {%r8151, %r8153, %r8155, %r8157}; + st.shared.v4.b32 [%r8176+512], {%r8152, %r8154, %r8156, %r8158}; + bar.sync 0; + shl.b32 %r8177, %r8162, 10; + shl.b32 %r8178, %r599, 5; + and.b32 %r8179, %r8164, 1008; + or.b32 %r8180, %r8177, %r8178; + xor.b32 %r8181, %r8180, %r8179; + add.s32 %r7978, %r2045, %r8181; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8014, %r8015, %r8016, %r8017}, [%r7978]; + // end inline asm + add.s32 %r7983, %r7978, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8018, %r8019, %r8020, %r8021}, [%r7983]; + // end inline asm + add.s32 %r7988, %r7978, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8022, %r8023, %r8024, %r8025}, [%r7988]; + // end inline asm + add.s32 %r7993, %r7978, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8026, %r8027, %r8028, %r8029}, [%r7993]; + // end inline asm + add.s32 %r7998, %r7978, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8030, %r8031, %r8032, %r8033}, [%r7998]; + // end inline asm + add.s32 %r8003, %r7978, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8034, %r8035, %r8036, %r8037}, [%r8003]; + // end inline asm + add.s32 %r8008, %r7978, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8038, %r8039, %r8040, %r8041}, [%r8008]; + // end inline asm + add.s32 %r8013, %r7978, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8042, %r8043, %r8044, %r8045}, [%r8013]; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd651 + 0 ], { %r8014, %r8015, %r8016, %r8017 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd652 + 0 ], { %r8018, %r8019, %r8020, %r8021 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd653 + 0 ], { %r8022, %r8023, %r8024, %r8025 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd654 + 0 ], { %r8026, %r8027, %r8028, %r8029 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd655 + 0 ], { %r8030, %r8031, %r8032, %r8033 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd656 + 0 ], { %r8034, %r8035, %r8036, %r8037 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd657 + 0 ], { %r8038, %r8039, %r8040, %r8041 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd658 + 0 ], { %r8042, %r8043, %r8044, %r8045 }; + // end inline asm + .loc 1 334 14 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:334:14 + mul.f32 %r8182, %r13518, 0f3DB504F3; + mul.f32 %r8183, %r13519, 0f3DB504F3; + mul.f32 %r8184, %r13520, 0f3DB504F3; + mul.f32 %r8185, %r13521, 0f3DB504F3; + mul.f32 %r8186, %r13522, 0f3DB504F3; + mul.f32 %r8187, %r13523, 0f3DB504F3; + mul.f32 %r8188, %r13524, 0f3DB504F3; + mul.f32 %r8189, %r13525, 0f3DB504F3; + mul.f32 %r8190, %r13526, 0f3DB504F3; + mul.f32 %r8191, %r13527, 0f3DB504F3; + mul.f32 %r8192, %r13528, 0f3DB504F3; + mul.f32 %r8193, %r13529, 0f3DB504F3; + mul.f32 %r8194, %r13530, 0f3DB504F3; + mul.f32 %r8195, %r13531, 0f3DB504F3; + mul.f32 %r8196, %r13532, 0f3DB504F3; + mul.f32 %r8197, %r13533, 0f3DB504F3; + mul.f32 %r8198, %r13534, 0f3DB504F3; + mul.f32 %r8199, %r13535, 0f3DB504F3; + mul.f32 %r8200, %r13536, 0f3DB504F3; + mul.f32 %r8201, %r13537, 0f3DB504F3; + mul.f32 %r8202, %r13538, 0f3DB504F3; + mul.f32 %r8203, %r13539, 0f3DB504F3; + mul.f32 %r8204, %r13540, 0f3DB504F3; + mul.f32 %r8205, %r13541, 0f3DB504F3; + mul.f32 %r8206, %r13542, 0f3DB504F3; + mul.f32 %r8207, %r13543, 0f3DB504F3; + mul.f32 %r8208, %r13544, 0f3DB504F3; + mul.f32 %r8209, %r13545, 0f3DB504F3; + mul.f32 %r8210, %r13546, 0f3DB504F3; + mul.f32 %r8211, %r13547, 0f3DB504F3; + mul.f32 %r8212, %r13548, 0f3DB504F3; + mul.f32 %r8213, %r13549, 0f3DB504F3; + mul.f32 %r8214, %r13550, 0f3DB504F3; + mul.f32 %r8215, %r13551, 0f3DB504F3; + mul.f32 %r8216, %r13552, 0f3DB504F3; + mul.f32 %r8217, %r13553, 0f3DB504F3; + mul.f32 %r8218, %r13554, 0f3DB504F3; + mul.f32 %r8219, %r13555, 0f3DB504F3; + mul.f32 %r8220, %r13556, 0f3DB504F3; + mul.f32 %r8221, %r13557, 0f3DB504F3; + mul.f32 %r8222, %r13558, 0f3DB504F3; + mul.f32 %r8223, %r13559, 0f3DB504F3; + mul.f32 %r8224, %r13560, 0f3DB504F3; + mul.f32 %r8225, %r13561, 0f3DB504F3; + mul.f32 %r8226, %r13562, 0f3DB504F3; + mul.f32 %r8227, %r13563, 0f3DB504F3; + mul.f32 %r8228, %r13564, 0f3DB504F3; + mul.f32 %r8229, %r13565, 0f3DB504F3; + mul.f32 %r8230, %r13566, 0f3DB504F3; + mul.f32 %r8231, %r13567, 0f3DB504F3; + mul.f32 %r8232, %r13568, 0f3DB504F3; + mul.f32 %r8233, %r13569, 0f3DB504F3; + mul.f32 %r8234, %r13570, 0f3DB504F3; + mul.f32 %r8235, %r13571, 0f3DB504F3; + mul.f32 %r8236, %r13572, 0f3DB504F3; + mul.f32 %r8237, %r13573, 0f3DB504F3; + mul.f32 %r8238, %r13574, 0f3DB504F3; + mul.f32 %r8239, %r13575, 0f3DB504F3; + mul.f32 %r8240, %r13576, 0f3DB504F3; + mul.f32 %r8241, %r13577, 0f3DB504F3; + mul.f32 %r8242, %r13578, 0f3DB504F3; + mul.f32 %r8243, %r13579, 0f3DB504F3; + mul.f32 %r8244, %r13580, 0f3DB504F3; + mul.f32 %r8245, %r13581, 0f3DB504F3; + .loc 1 344 58 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:344:58 + or.b32 %r8246, %r8126, %r5; + or.b32 %r8247, %r8246, %r8118; + or.b32 %r8248, %r8125, %r5; + or.b32 %r8249, %r8248, %r8118; + or.b32 %r8250, %r8124, %r5; + or.b32 %r8251, %r8250, %r8118; + or.b32 %r8252, %r8123, %r5; + or.b32 %r8253, %r8252, %r8118; + or.b32 %r8254, %r8122, %r5; + or.b32 %r8255, %r8254, %r8118; + or.b32 %r8256, %r8121, %r5; + or.b32 %r8257, %r8256, %r8118; + or.b32 %r8258, %r8120, %r5; + or.b32 %r8259, %r8258, %r8118; + or.b32 %r8260, %r8119, %r5; + or.b32 %r8261, %r8260, %r8118; + .loc 1 345 29 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:345:29 + mad.wide.s32 %rd659, %r8247, 2, %rd194; + mad.wide.s32 %rd660, %r8249, 2, %rd194; + mad.wide.s32 %rd661, %r8251, 2, %rd194; + mad.wide.s32 %rd662, %r8253, 2, %rd194; + mad.wide.s32 %rd663, %r8255, 2, %rd194; + mad.wide.s32 %rd664, %r8257, 2, %rd194; + mad.wide.s32 %rd665, %r8259, 2, %rd194; + mad.wide.s32 %rd666, %r8261, 2, %rd194; + .loc 1 345 69 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:345:69 + cvt.rn.bf16x2.f32 %r8262, %r8183, %r8182; + cvt.rn.bf16x2.f32 %r8263, %r8185, %r8184; + cvt.rn.bf16x2.f32 %r8264, %r8187, %r8186; + cvt.rn.bf16x2.f32 %r8265, %r8189, %r8188; + cvt.rn.bf16x2.f32 %r8266, %r8191, %r8190; + cvt.rn.bf16x2.f32 %r8267, %r8193, %r8192; + cvt.rn.bf16x2.f32 %r8268, %r8195, %r8194; + cvt.rn.bf16x2.f32 %r8269, %r8197, %r8196; + cvt.rn.bf16x2.f32 %r8270, %r8199, %r8198; + cvt.rn.bf16x2.f32 %r8271, %r8201, %r8200; + cvt.rn.bf16x2.f32 %r8272, %r8203, %r8202; + cvt.rn.bf16x2.f32 %r8273, %r8205, %r8204; + cvt.rn.bf16x2.f32 %r8274, %r8207, %r8206; + cvt.rn.bf16x2.f32 %r8275, %r8209, %r8208; + cvt.rn.bf16x2.f32 %r8276, %r8211, %r8210; + cvt.rn.bf16x2.f32 %r8277, %r8213, %r8212; + cvt.rn.bf16x2.f32 %r8278, %r8215, %r8214; + cvt.rn.bf16x2.f32 %r8279, %r8217, %r8216; + cvt.rn.bf16x2.f32 %r8280, %r8219, %r8218; + cvt.rn.bf16x2.f32 %r8281, %r8221, %r8220; + cvt.rn.bf16x2.f32 %r8282, %r8223, %r8222; + cvt.rn.bf16x2.f32 %r8283, %r8225, %r8224; + cvt.rn.bf16x2.f32 %r8284, %r8227, %r8226; + cvt.rn.bf16x2.f32 %r8285, %r8229, %r8228; + cvt.rn.bf16x2.f32 %r8286, %r8231, %r8230; + cvt.rn.bf16x2.f32 %r8287, %r8233, %r8232; + cvt.rn.bf16x2.f32 %r8288, %r8235, %r8234; + cvt.rn.bf16x2.f32 %r8289, %r8237, %r8236; + cvt.rn.bf16x2.f32 %r8290, %r8239, %r8238; + cvt.rn.bf16x2.f32 %r8291, %r8241, %r8240; + cvt.rn.bf16x2.f32 %r8292, %r8243, %r8242; + cvt.rn.bf16x2.f32 %r8293, %r8245, %r8244; + bar.sync 0; + st.shared.v4.b32 [%r8170], {%r8262, %r8264, %r8266, %r8268}; + st.shared.v4.b32 [%r8170+512], {%r8263, %r8265, %r8267, %r8269}; + st.shared.v4.b32 [%r8172], {%r8270, %r8272, %r8274, %r8276}; + st.shared.v4.b32 [%r8172+512], {%r8271, %r8273, %r8275, %r8277}; + st.shared.v4.b32 [%r8174], {%r8278, %r8280, %r8282, %r8284}; + st.shared.v4.b32 [%r8174+512], {%r8279, %r8281, %r8283, %r8285}; + st.shared.v4.b32 [%r8176], {%r8286, %r8288, %r8290, %r8292}; + st.shared.v4.b32 [%r8176+512], {%r8287, %r8289, %r8291, %r8293}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8086, %r8087, %r8088, %r8089}, [%r7978]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8090, %r8091, %r8092, %r8093}, [%r7983]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8094, %r8095, %r8096, %r8097}, [%r7988]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8098, %r8099, %r8100, %r8101}, [%r7993]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8102, %r8103, %r8104, %r8105}, [%r7998]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8106, %r8107, %r8108, %r8109}, [%r8003]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8110, %r8111, %r8112, %r8113}, [%r8008]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8114, %r8115, %r8116, %r8117}, [%r8013]; + // end inline asm + mov.pred %p283, -1; + // begin inline asm + @%p283 st.global.v4.b32 [ %rd659 + 0 ], { %r8086, %r8087, %r8088, %r8089 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd660 + 0 ], { %r8090, %r8091, %r8092, %r8093 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd661 + 0 ], { %r8094, %r8095, %r8096, %r8097 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd662 + 0 ], { %r8098, %r8099, %r8100, %r8101 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd663 + 0 ], { %r8102, %r8103, %r8104, %r8105 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd664 + 0 ], { %r8106, %r8107, %r8108, %r8109 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd665 + 0 ], { %r8110, %r8111, %r8112, %r8113 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd666 + 0 ], { %r8114, %r8115, %r8116, %r8117 }; + // end inline asm +$L__BB0_17: + .loc 1 139 4 // c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py:139:4 + ret; +$L__tmp30: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 6 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 404 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x18d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 100 +.b8 119 +.b8 54 +.b8 121 +.b8 107 +.b8 120 +.b8 103 +.b8 98 +.b8 119 +.b8 107 +.b8 51 +.b8 103 +.b8 108 +.b8 97 +.b8 99 +.b8 117 +.b8 116 +.b8 120 +.b8 107 +.b8 112 +.b8 122 +.b8 119 +.b8 104 +.b8 97 +.b8 112 +.b8 118 +.b8 114 +.b8 53 +.b8 111 +.b8 115 +.b8 122 +.b8 106 +.b8 101 +.b8 116 +.b8 51 +.b8 110 +.b8 52 +.b8 105 +.b8 52 +.b8 113 +.b8 51 +.b8 115 +.b8 110 +.b8 117 +.b8 109 +.b8 106 +.b8 122 +.b8 109 +.b8 51 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 100 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0xf1 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 178 // DW_AT_call_line +.b8 107 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 179 // DW_AT_call_line +.b8 111 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xeb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 207 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x103:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp13 // DW_AT_low_pc +.b64 $L__tmp14 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 226 // DW_AT_call_line +.b8 16 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x132:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp15 // DW_AT_low_pc +.b64 $L__tmp16 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x14b:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp16 // DW_AT_low_pc +.b64 $L__tmp17 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 1 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x164:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp18 // DW_AT_low_pc +.b64 $L__tmp28 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 1 +.b8 16 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x17d:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp20 // DW_AT_low_pc +.b64 $L__tmp29 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 62 // DW_AT_call_line +.b8 1 +.b8 20 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..2ce464776131844262720f845e0d40fcf4a90044 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.source @@ -0,0 +1,2072 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":18:0) +#loc201 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":812:0) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":348:0) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":423:0) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":797:0) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":781:0) +#loc342 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":559:0) +#loc374 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":634:0) +#loc445 = loc("arg_Q"(#loc)) +#loc446 = loc("arg_K"(#loc)) +#loc447 = loc("arg_V"(#loc)) +#loc448 = loc("arg_LSE"(#loc)) +#loc449 = loc("arg_DELTA"(#loc)) +#loc450 = loc("arg_DO"(#loc)) +#loc451 = loc("arg_DQ"(#loc)) +#loc452 = loc("arg_DV"(#loc)) +#loc453 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc454 = loc("arg_KV_IDX"(#loc)) +#loc455 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc456 = loc("arg_Q_IDX"(#loc)) +#loc457 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc458 = loc("arg_FULL_KV_IDX"(#loc)) +#loc459 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc460 = loc("arg_FULL_Q_IDX"(#loc)) +#loc461 = loc("in_ptr16"(#loc)) +#loc462 = loc("out_ptr0"(#loc)) +#loc647 = loc("x"(#loc201)) +#loc648 = loc("ptr"(#loc211)) +#loc649 = loc("offs_m"(#loc211)) +#loc650 = loc("offs_n"(#loc211)) +#loc651 = loc("stride_m"(#loc211)) +#loc652 = loc("stride_n"(#loc211)) +#loc653 = loc("M_LEN"(#loc211)) +#loc660 = loc("arg_Q"(#loc221)) +#loc661 = loc("arg_K"(#loc221)) +#loc662 = loc("arg_V"(#loc221)) +#loc663 = loc("arg_LSE"(#loc221)) +#loc664 = loc("arg_DELTA"(#loc221)) +#loc665 = loc("arg_DO"(#loc221)) +#loc666 = loc("arg_DQ"(#loc221)) +#loc667 = loc("arg_DV"(#loc221)) +#loc668 = loc("arg_KV_NUM_BLKS"(#loc221)) +#loc669 = loc("arg_KV_IDX"(#loc221)) +#loc670 = loc("arg_Q_NUM_BLKS"(#loc221)) +#loc671 = loc("arg_Q_IDX"(#loc221)) +#loc672 = loc("arg_FULL_KV_NUM_BLKS"(#loc221)) +#loc673 = loc("arg_FULL_KV_IDX"(#loc221)) +#loc674 = loc("arg_FULL_Q_NUM_BLKS"(#loc221)) +#loc675 = loc("arg_FULL_Q_IDX"(#loc221)) +#loc676 = loc("in_ptr16"(#loc221)) +#loc677 = loc("out_ptr0"(#loc221)) +#loc678 = loc("K"(#loc221)) +#loc679 = loc("V"(#loc221)) +#loc680 = loc("dq"(#loc221)) +#loc681 = loc("q"(#loc221)) +#loc682 = loc("do"(#loc221)) +#loc683 = loc("Di"(#loc221)) +#loc684 = loc("lse"(#loc221)) +#loc685 = loc("off_z"(#loc221)) +#loc686 = loc("off_hq"(#loc221)) +#loc687 = loc("offs_m2"(#loc221)) +#loc688 = loc("offs_n2"(#loc221)) +#loc689 = loc("stride_kn"(#loc221)) +#loc690 = loc("stride_kd"(#loc221)) +#loc691 = loc("stride_vn"(#loc221)) +#loc692 = loc("stride_vd"(#loc221)) +#loc693 = loc("kv_indices"(#loc221)) +#loc694 = loc("sparse_kv_num_blocks"(#loc221)) +#loc723 = loc("arg_Q"(#loc253)) +#loc724 = loc("arg_K"(#loc253)) +#loc725 = loc("arg_V"(#loc253)) +#loc726 = loc("arg_LSE"(#loc253)) +#loc727 = loc("arg_DELTA"(#loc253)) +#loc728 = loc("arg_DO"(#loc253)) +#loc729 = loc("arg_DQ"(#loc253)) +#loc730 = loc("arg_DV"(#loc253)) +#loc731 = loc("arg_KV_NUM_BLKS"(#loc253)) +#loc732 = loc("arg_KV_IDX"(#loc253)) +#loc733 = loc("arg_Q_NUM_BLKS"(#loc253)) +#loc734 = loc("arg_Q_IDX"(#loc253)) +#loc735 = loc("arg_FULL_KV_NUM_BLKS"(#loc253)) +#loc736 = loc("arg_FULL_KV_IDX"(#loc253)) +#loc737 = loc("arg_FULL_Q_NUM_BLKS"(#loc253)) +#loc738 = loc("arg_FULL_Q_IDX"(#loc253)) +#loc739 = loc("in_ptr16"(#loc253)) +#loc740 = loc("out_ptr0"(#loc253)) +#loc741 = loc("dq"(#loc253)) +#loc742 = loc("q"(#loc253)) +#loc743 = loc("kT_ptrs"(#loc253)) +#loc744 = loc("vT_ptrs"(#loc253)) +#loc745 = loc("do"(#loc253)) +#loc746 = loc("Di"(#loc253)) +#loc747 = loc("lse"(#loc253)) +#loc748 = loc("Q_LEN"(#loc253)) +#loc749 = loc("KV_LEN"(#loc253)) +#loc750 = loc("off_z"(#loc253)) +#loc751 = loc("off_hq"(#loc253)) +#loc752 = loc("offs_m2"(#loc253)) +#loc753 = loc("offs_n2"(#loc253)) +#loc754 = loc("offs_k"(#loc253)) +#loc755 = loc("offs_v"(#loc253)) +#loc756 = loc("stride_kn"(#loc253)) +#loc757 = loc("stride_kd"(#loc253)) +#loc758 = loc("stride_vn"(#loc253)) +#loc759 = loc("stride_vd"(#loc253)) +#loc760 = loc("kv_indices"(#loc253)) +#loc761 = loc("sparse_kv_num_blocks"(#loc253)) +#loc824 = loc("N_LEN"(#loc211)) +#loc825 = loc("indices"(#loc318)) +#loc826 = loc("loop_iter"(#loc321)) +#loc827 = loc("col_indices"(#loc321)) +#loc828 = loc("total_blocks"(#loc321)) +#loc847 = loc("arg_Q"(#loc342)) +#loc848 = loc("arg_K"(#loc342)) +#loc849 = loc("arg_V"(#loc342)) +#loc850 = loc("arg_LSE"(#loc342)) +#loc851 = loc("arg_DELTA"(#loc342)) +#loc852 = loc("arg_DO"(#loc342)) +#loc853 = loc("arg_DQ"(#loc342)) +#loc854 = loc("arg_DV"(#loc342)) +#loc855 = loc("arg_KV_NUM_BLKS"(#loc342)) +#loc856 = loc("arg_KV_IDX"(#loc342)) +#loc857 = loc("arg_Q_NUM_BLKS"(#loc342)) +#loc858 = loc("arg_Q_IDX"(#loc342)) +#loc859 = loc("arg_FULL_KV_NUM_BLKS"(#loc342)) +#loc860 = loc("arg_FULL_KV_IDX"(#loc342)) +#loc861 = loc("arg_FULL_Q_NUM_BLKS"(#loc342)) +#loc862 = loc("arg_FULL_Q_IDX"(#loc342)) +#loc863 = loc("in_ptr16"(#loc342)) +#loc864 = loc("out_ptr0"(#loc342)) +#loc865 = loc("Q"(#loc342)) +#loc866 = loc("DO"(#loc342)) +#loc867 = loc("DELTA"(#loc342)) +#loc868 = loc("LSE"(#loc342)) +#loc869 = loc("dk"(#loc342)) +#loc870 = loc("dv"(#loc342)) +#loc871 = loc("k"(#loc342)) +#loc872 = loc("v"(#loc342)) +#loc873 = loc("off_z"(#loc342)) +#loc874 = loc("off_hq"(#loc342)) +#loc875 = loc("offs_n1"(#loc342)) +#loc876 = loc("offs_m1"(#loc342)) +#loc877 = loc("stride_qm"(#loc342)) +#loc878 = loc("stride_qd"(#loc342)) +#loc879 = loc("stride_dom"(#loc342)) +#loc880 = loc("stride_dod"(#loc342)) +#loc881 = loc("q_indices"(#loc342)) +#loc882 = loc("sparse_q_num_blocks"(#loc342)) +#loc910 = loc("arg_Q"(#loc374)) +#loc911 = loc("arg_K"(#loc374)) +#loc912 = loc("arg_V"(#loc374)) +#loc913 = loc("arg_LSE"(#loc374)) +#loc914 = loc("arg_DELTA"(#loc374)) +#loc915 = loc("arg_DO"(#loc374)) +#loc916 = loc("arg_DQ"(#loc374)) +#loc917 = loc("arg_DV"(#loc374)) +#loc918 = loc("arg_KV_NUM_BLKS"(#loc374)) +#loc919 = loc("arg_KV_IDX"(#loc374)) +#loc920 = loc("arg_Q_NUM_BLKS"(#loc374)) +#loc921 = loc("arg_Q_IDX"(#loc374)) +#loc922 = loc("arg_FULL_KV_NUM_BLKS"(#loc374)) +#loc923 = loc("arg_FULL_KV_IDX"(#loc374)) +#loc924 = loc("arg_FULL_Q_NUM_BLKS"(#loc374)) +#loc925 = loc("arg_FULL_Q_IDX"(#loc374)) +#loc926 = loc("in_ptr16"(#loc374)) +#loc927 = loc("out_ptr0"(#loc374)) +#loc928 = loc("dk"(#loc374)) +#loc929 = loc("dv"(#loc374)) +#loc930 = loc("qT_ptrs"(#loc374)) +#loc931 = loc("k"(#loc374)) +#loc932 = loc("v"(#loc374)) +#loc933 = loc("do_ptrs"(#loc374)) +#loc934 = loc("DELTA"(#loc374)) +#loc935 = loc("LSE"(#loc374)) +#loc936 = loc("Q_LEN"(#loc374)) +#loc937 = loc("KV_LEN"(#loc374)) +#loc938 = loc("off_z"(#loc374)) +#loc939 = loc("off_hq"(#loc374)) +#loc940 = loc("offs_n1"(#loc374)) +#loc941 = loc("offs_m1"(#loc374)) +#loc942 = loc("offs_k"(#loc374)) +#loc943 = loc("offs_v"(#loc374)) +#loc944 = loc("stride_qm"(#loc374)) +#loc945 = loc("stride_qd"(#loc374)) +#loc946 = loc("stride_dom"(#loc374)) +#loc947 = loc("stride_dod"(#loc374)) +#loc948 = loc("q_indices"(#loc374)) +#loc949 = loc("sparse_q_num_blocks"(#loc374)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc2) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc2) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc2) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc2) + %c2097152_i32_2 = arith.constant 2097152 : i32 loc(#loc3) + %c262144_i32_3 = arith.constant 262144 : i32 loc(#loc3) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc3) + %c1_i32_5 = arith.constant 1 : i32 loc(#loc3) + %c8388608_i32_6 = arith.constant 8388608 : i32 loc(#loc4) + %c262144_i32_7 = arith.constant 262144 : i32 loc(#loc4) + %c128_i32_8 = arith.constant 128 : i32 loc(#loc4) + %c1_i32_9 = arith.constant 1 : i32 loc(#loc4) + %c8388608_i32_10 = arith.constant 8388608 : i32 loc(#loc5) + %c128_i32_11 = arith.constant 128 : i32 loc(#loc5) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc5) + %c1_i32_13 = arith.constant 1 : i32 loc(#loc5) + %c2097152_i32_14 = arith.constant 2097152 : i32 loc(#loc6) + %c262144_i32_15 = arith.constant 262144 : i32 loc(#loc6) + %c128_i32_16 = arith.constant 128 : i32 loc(#loc6) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc6) + %ZQ = arith.constant 8 : i32 loc(#loc463) + %HQ = arith.constant 32 : i32 loc(#loc464) + %HKV = arith.constant 8 : i32 loc(#loc465) + %Q_LEN = arith.constant 2048 : i32 loc(#loc466) + %ZKV = arith.constant 8 : i32 loc(#loc467) + %KV_LEN = arith.constant 2048 : i32 loc(#loc468) + %pid = tt.get_program_id x : i32 loc(#loc469) + %NUM_KV_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%KV_LEN) : (i32) -> i32 loc(#loc470) + %NUM_Q_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%Q_LEN) : (i32) -> i32 loc(#loc471) + %off_zq = tt.get_program_id y : i32 loc(#loc472) + %off_hkv = tt.get_program_id z : i32 loc(#loc473) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc474) + %SPARSE_Z = arith.constant 8 : i32 loc(#loc475) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc476) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc477) + %k_adj = arith.muli %c262144_i32, %off_hkv : i32 loc(#loc478) + %k_adj_18 = arith.muli %c2097152_i32, %off_zkv : i32 loc(#loc479) + %k_adj_19 = arith.addi %k_adj, %k_adj_18 : i32 loc(#loc480) + %k_adj_20 = arith.extsi %k_adj_19 : i32 to i64 loc(#loc481) + %v_adj = arith.muli %c262144_i32_3, %off_hkv : i32 loc(#loc482) + %v_adj_21 = arith.muli %c2097152_i32_2, %off_zkv : i32 loc(#loc483) + %v_adj_22 = arith.addi %v_adj, %v_adj_21 : i32 loc(#loc484) + %v_adj_23 = arith.extsi %v_adj_22 : i32 to i64 loc(#loc485) + %dv_adj = arith.muli %c262144_i32_15, %off_hkv : i32 loc(#loc486) + %dv_adj_24 = arith.muli %c2097152_i32_14, %off_zq : i32 loc(#loc487) + %dv_adj_25 = arith.addi %dv_adj, %dv_adj_24 : i32 loc(#loc488) + %dv_adj_26 = arith.extsi %dv_adj_25 : i32 to i64 loc(#loc489) + %K = tt.addptr %arg_K, %k_adj_20 : !tt.ptr, i64 loc(#loc490) + %V = tt.addptr %arg_V, %v_adj_23 : !tt.ptr, i64 loc(#loc491) + %DV = tt.addptr %arg_DV, %dv_adj_26 : !tt.ptr, i64 loc(#loc492) + %RCP_LN2 = arith.constant 1.44269502 : f32 loc(#loc493) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc494) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc495) + %0 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS : i32 loc(#loc40) + %1:2 = scf.if %0 -> (i32, i32) { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS : i32 loc(#loc496) + %SPARSE_Q_MULTIPLE = arith.constant 1 : i32 loc(#loc1018) + %SPARSE_KV_MULTIPLE = arith.constant 2 : i32 loc(#loc1019) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc499) + %off_hq2_27 = arith.constant 4 : i32 loc(#loc500) + %off_hq2_28 = arith.constant 4 : i32 loc(#loc500) + %off_hq2_29 = arith.muli %off_hkv, %off_hq2_28 : i32 loc(#loc500) + %off_hq2_30 = arith.addi %off_hq2, %off_hq2_29 : i32 loc(#loc501) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc502) + %off_pid_mask = arith.divsi %start_m2_block, %SPARSE_Q_MULTIPLE : i32 loc(#loc503) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc504) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc505) + %stride_kv_idx_m = arith.constant 16 : i32 loc(#loc506) + %sparse_idx_hq2 = arith.remsi %off_hq2_30, %SPARSE_HQ : i32 loc(#loc507) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc508) + %sparse_hz_offset_31 = arith.addi %sparse_hz_offset, %sparse_idx_hq2 : i32 loc(#loc509) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_31, %stride_kv_num_blks_h : i32 loc(#loc510) + %sparse_kv_num_blks_offset_32 = arith.addi %sparse_kv_num_blks_offset, %off_pid_mask : i32 loc(#loc511) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_31, %stride_kv_idx_h : i32 loc(#loc512) + %sparse_kv_idx_offset_33 = arith.muli %off_pid_mask, %stride_kv_idx_m : i32 loc(#loc513) + %sparse_kv_idx_offset_34 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_33 : i32 loc(#loc514) + %q_adj2 = arith.muli %c128_i32, %off_hq2_30 : i32 loc(#loc515) + %q_adj2_35 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc516) + %q_adj2_36 = arith.addi %q_adj2, %q_adj2_35 : i32 loc(#loc517) + %q_adj2_37 = arith.extsi %q_adj2_36 : i32 to i64 loc(#loc518) + %do_adj2 = arith.muli %c262144_i32_7, %off_hq2_30 : i32 loc(#loc519) + %do_adj2_38 = arith.muli %c8388608_i32_6, %off_zq : i32 loc(#loc520) + %do_adj2_39 = arith.addi %do_adj2, %do_adj2_38 : i32 loc(#loc521) + %do_adj2_40 = arith.extsi %do_adj2_39 : i32 to i64 loc(#loc522) + %dq_adj2 = arith.muli %c128_i32_11, %off_hq2_30 : i32 loc(#loc523) + %dq_adj2_41 = arith.muli %c8388608_i32_10, %off_zq : i32 loc(#loc524) + %dq_adj2_42 = arith.addi %dq_adj2, %dq_adj2_41 : i32 loc(#loc525) + %dq_adj2_43 = arith.extsi %dq_adj2_42 : i32 to i64 loc(#loc526) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc527) + %off_chz2_44 = arith.addi %off_chz2, %off_hq2_30 : i32 loc(#loc528) + %off_chz2_45 = arith.muli %off_chz2_44, %Q_LEN : i32 loc(#loc529) + %off_chz2_46 = arith.extsi %off_chz2_45 : i32 to i64 loc(#loc530) + %Q2 = tt.addptr %arg_Q, %q_adj2_37 : !tt.ptr, i64 loc(#loc531) + %DO2 = tt.addptr %arg_DO, %do_adj2_40 : !tt.ptr, i64 loc(#loc532) + %DQ2 = tt.addptr %arg_DQ, %dq_adj2_43 : !tt.ptr, i64 loc(#loc533) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_46 : !tt.ptr, i64 loc(#loc534) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_46 : !tt.ptr, i64 loc(#loc535) + %dq = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc536) + %start_m2 = arith.constant 128 : i32 loc(#loc537) + %start_m2_47 = arith.constant 128 : i32 loc(#loc537) + %start_m2_48 = arith.muli %start_m2_block, %start_m2_47 : i32 loc(#loc537) + %offs_m2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc538) + %offs_m2_49 = tt.splat %start_m2_48 : i32 -> tensor<128xi32> loc(#loc539) + %offs_m2_50 = arith.addi %offs_m2_49, %offs_m2 : tensor<128xi32> loc(#loc539) + %q = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q2, %offs_m2_50, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc540) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%DO2, %offs_m2_50, %offs_v, %c128_i32_8, %c1_i32_9, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc541) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc542) + %Di_51 = tt.addptr %Di, %offs_m2_50 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc542) + %Di_52 = tt.load %Di_51 : tensor<128x!tt.ptr> loc(#loc543) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc544) + %lse_53 = tt.addptr %lse, %offs_m2_50 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc544) + %lse_54 = tt.load %lse_53 : tensor<128x!tt.ptr> loc(#loc545) + %lse_55 = arith.constant 0xFF800000 : f32 loc(#loc546) + %lse_56 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc546) + %lse_57 = arith.cmpf oeq, %lse_54, %lse_56 : tensor<128xf32> loc(#loc546) + %lse_58 = arith.constant 0.000000e+00 : f32 loc(#loc547) + %lse_59 = arith.constant 0.000000e+00 : f32 loc(#loc547) + %lse_60 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc547) + %lse_61 = arith.select %lse_57, %lse_60, %lse_54 : tensor<128xi1>, tensor<128xf32> loc(#loc547) + %lse_62 = tt.expand_dims %lse_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc548) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc549) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc550) + %kv_start_63 = arith.constant 128 : i32 loc(#loc551) + %kv_start_64 = arith.constant 128 : i32 loc(#loc551) + %kv_start_65 = arith.muli %kv_start, %kv_start_64 : i32 loc(#loc551) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_32 : !tt.ptr, i32 loc(#loc552) + %sparse_kv_num_blocks_66 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc553) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc554) + %offs_n2_67 = tt.splat %kv_start_65 : i32 -> tensor<64xi32> loc(#loc555) + %offs_n2_68 = arith.addi %offs_n2_67, %offs_n2 : tensor<64xi32> loc(#loc555) + %dq_69 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %K, %V, %dq, %q, %do, %Di_52, %lse_62, %off_zq, %off_hq2_30, %offs_m2_50, %offs_n2_68, %c128_i32_0, %c1_i32_1, %c128_i32_4, %c1_i32_5, %kv_indices, %sparse_kv_num_blocks_66) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc556) + %kv_indices_70 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc557) + %kv_start_71 = tt.load %kv_indices_70 : !tt.ptr loc(#loc558) + %kv_start_72 = arith.constant 128 : i32 loc(#loc559) + %kv_start_73 = arith.constant 128 : i32 loc(#loc559) + %kv_start_74 = arith.muli %kv_start_71, %kv_start_73 : i32 loc(#loc559) + %sparse_kv_num_blocks_75 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_32 : !tt.ptr, i32 loc(#loc560) + %sparse_kv_num_blocks_76 = tt.load %sparse_kv_num_blocks_75 : !tt.ptr loc(#loc561) + %offs_n2_77 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc562) + %offs_n2_78 = tt.splat %kv_start_74 : i32 -> tensor<64xi32> loc(#loc563) + %offs_n2_79 = arith.addi %offs_n2_78, %offs_n2_77 : tensor<64xi32> loc(#loc563) + %dq_80 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %K, %V, %dq_69, %q, %do, %Di_52, %lse_62, %off_zq, %off_hq2_30, %offs_m2_50, %offs_n2_79, %c128_i32_0, %c1_i32_1, %c128_i32_4, %c1_i32_5, %kv_indices_70, %sparse_kv_num_blocks_76) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc564) + %dq_ptrs = tt.expand_dims %offs_m2_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc565) + %dq_ptrs_81 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc566) + %dq_ptrs_82 = arith.muli %dq_ptrs, %dq_ptrs_81 : tensor<128x1xi32> loc(#loc566) + %dq_ptrs_83 = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc567) + %dq_ptrs_84 = tt.addptr %dq_ptrs_83, %dq_ptrs_82 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc567) + %dq_ptrs_85 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc568) + %dq_ptrs_86 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc569) + %dq_ptrs_87 = arith.muli %dq_ptrs_85, %dq_ptrs_86 : tensor<1x128xi32> loc(#loc569) + %dq_ptrs_88 = tt.broadcast %dq_ptrs_84 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc570) + %dq_ptrs_89 = tt.broadcast %dq_ptrs_87 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc570) + %dq_ptrs_90 = tt.addptr %dq_ptrs_88, %dq_ptrs_89 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc570) + %dq_91 = arith.constant 0.0883883461 : f32 loc(#loc571) + %dq_92 = arith.constant 0.0883883461 : f32 loc(#loc571) + %dq_93 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc571) + %dq_94 = arith.mulf %dq_80, %dq_93 : tensor<128x128xf32> loc(#loc571) + %2 = arith.truncf %dq_94 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc118) + tt.store %dq_ptrs_90, %2 : tensor<128x128x!tt.ptr> loc(#loc118) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc118) + } else { + %SPARSE_Q_MULTIPLE = arith.constant 2 : i32 loc(#loc1020) + %SPARSE_KV_MULTIPLE = arith.constant 1 : i32 loc(#loc1021) + %pid_mask = arith.divsi %pid, %SPARSE_KV_MULTIPLE : i32 loc(#loc574) + %stride_q_num_blks_h = arith.constant 16 : i32 loc(#loc575) + %stride_q_idx_h = arith.constant 256 : i32 loc(#loc576) + %stride_q_idx_n = arith.constant 16 : i32 loc(#loc577) + %dv = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc578) + %dk = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc579) + %start_n1 = arith.constant 128 : i32 loc(#loc580) + %start_n1_27 = arith.constant 128 : i32 loc(#loc580) + %start_n1_28 = arith.muli %pid, %start_n1_27 : i32 loc(#loc580) + %offs_n1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc581) + %offs_n1_29 = tt.splat %start_n1_28 : i32 -> tensor<128xi32> loc(#loc582) + %offs_n1_30 = arith.addi %offs_n1_29, %offs_n1 : tensor<128xi32> loc(#loc582) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n1_30, %offs_k, %c128_i32_0, %c1_i32_1, %KV_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc583) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n1_30, %offs_v, %c128_i32_4, %c1_i32_5, %KV_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc584) + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %c4_i32 = arith.constant 4 : i32 loc(#loc132) + %c1_i32_31 = arith.constant 1 : i32 loc(#loc132) + %2 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc132) + %3 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc132) + %4 = arith.bitcast %c1_i32_31 : i32 to i32 loc(#loc132) + %5 = ub.poison : i32 loc(#loc132) + %dk_32:2 = scf.for %off_g = %2 to %3 step %4 iter_args(%dv_64 = %dv, %dk_65 = %dk) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.constant 4 : i32 loc(#loc586) + %off_hq1_66 = arith.constant 4 : i32 loc(#loc586) + %off_hq1_67 = arith.muli %off_hkv, %off_hq1_66 : i32 loc(#loc586) + %off_hq1_68 = arith.addi %off_hq1_67, %off_g : i32 loc(#loc587) + %q_adj1 = arith.muli %c128_i32, %off_hq1_68 : i32 loc(#loc588) + %q_adj1_69 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc589) + %q_adj1_70 = arith.addi %q_adj1, %q_adj1_69 : i32 loc(#loc590) + %q_adj1_71 = arith.extsi %q_adj1_70 : i32 to i64 loc(#loc591) + %do_adj1 = arith.muli %c262144_i32_7, %off_hq1_68 : i32 loc(#loc592) + %do_adj1_72 = arith.muli %c8388608_i32_6, %off_zq : i32 loc(#loc593) + %do_adj1_73 = arith.addi %do_adj1, %do_adj1_72 : i32 loc(#loc594) + %do_adj1_74 = arith.extsi %do_adj1_73 : i32 to i64 loc(#loc595) + %dq_adj1 = arith.muli %c128_i32_11, %off_hq1_68 : i32 loc(#loc596) + %dq_adj1_75 = arith.muli %c8388608_i32_10, %off_zq : i32 loc(#loc597) + %dq_adj1_76 = arith.addi %dq_adj1, %dq_adj1_75 : i32 loc(#loc598) + %dq_adj1_77 = arith.extsi %dq_adj1_76 : i32 to i64 loc(#loc599) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc600) + %off_chz1_78 = arith.addi %off_chz1, %off_hq1_68 : i32 loc(#loc601) + %off_chz1_79 = arith.muli %off_chz1_78, %Q_LEN : i32 loc(#loc602) + %off_chz1_80 = arith.extsi %off_chz1_79 : i32 to i64 loc(#loc603) + %Q1 = tt.addptr %arg_Q, %q_adj1_71 : !tt.ptr, i64 loc(#loc604) + %DO1 = tt.addptr %arg_DO, %do_adj1_74 : !tt.ptr, i64 loc(#loc605) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_80 : !tt.ptr, i64 loc(#loc606) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_80 : !tt.ptr, i64 loc(#loc607) + %sparse_idx_hq1 = arith.remsi %off_hq1_68, %SPARSE_HQ : i32 loc(#loc608) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc609) + %sparse_hz_offset_81 = arith.addi %sparse_hz_offset, %sparse_idx_hq1 : i32 loc(#loc610) + %sparse_q_num_blks_offset = arith.muli %sparse_hz_offset_81, %stride_q_num_blks_h : i32 loc(#loc611) + %sparse_q_num_blks_offset_82 = arith.addi %sparse_q_num_blks_offset, %pid_mask : i32 loc(#loc612) + %sparse_q_idx_offset = arith.muli %sparse_hz_offset_81, %stride_q_idx_h : i32 loc(#loc613) + %sparse_q_idx_offset_83 = arith.muli %pid_mask, %stride_q_idx_n : i32 loc(#loc614) + %sparse_q_idx_offset_84 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_83 : i32 loc(#loc615) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_84 : !tt.ptr, i32 loc(#loc616) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc617) + %q_start_85 = arith.constant 128 : i32 loc(#loc618) + %q_start_86 = arith.constant 128 : i32 loc(#loc618) + %q_start_87 = arith.muli %q_start, %q_start_86 : i32 loc(#loc618) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_82 : !tt.ptr, i32 loc(#loc619) + %sparse_q_num_blocks_88 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc620) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc621) + %offs_m1_89 = tt.splat %q_start_87 : i32 -> tensor<64xi32> loc(#loc622) + %offs_m1_90 = arith.addi %offs_m1_89, %offs_m1 : tensor<64xi32> loc(#loc622) + %11:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %Q1, %DO1, %DELTA1, %LSE1, %dk_65, %dv_64, %k, %v, %off_zq, %off_hq1_68, %offs_n1_30, %offs_m1_90, %c4096_i32, %c1_i32, %c128_i32_8, %c1_i32_9, %q_indices, %sparse_q_num_blocks_88) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc170) + %q_indices_91 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_84 : !tt.ptr, i32 loc(#loc623) + %q_start_92 = tt.load %q_indices_91 : !tt.ptr loc(#loc624) + %q_start_93 = arith.constant 128 : i32 loc(#loc625) + %q_start_94 = arith.constant 128 : i32 loc(#loc625) + %q_start_95 = arith.muli %q_start_92, %q_start_94 : i32 loc(#loc625) + %sparse_q_num_blocks_96 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_82 : !tt.ptr, i32 loc(#loc626) + %sparse_q_num_blocks_97 = tt.load %sparse_q_num_blocks_96 : !tt.ptr loc(#loc627) + %offs_m1_98 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc628) + %offs_m1_99 = tt.splat %q_start_95 : i32 -> tensor<64xi32> loc(#loc629) + %offs_m1_100 = arith.addi %offs_m1_99, %offs_m1_98 : tensor<64xi32> loc(#loc629) + %12:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %Q1, %DO1, %DELTA1, %LSE1, %11#0, %11#1, %k, %v, %off_zq, %off_hq1_68, %offs_n1_30, %offs_m1_100, %c4096_i32, %c1_i32, %c128_i32_8, %c1_i32_9, %q_indices_91, %sparse_q_num_blocks_97) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc178) + scf.yield %12#1, %12#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc179) + } loc(#loc1022) + %dv_ptrs = tt.expand_dims %offs_n1_30 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc630) + %dv_ptrs_33 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc631) + %dv_ptrs_34 = arith.muli %dv_ptrs, %dv_ptrs_33 : tensor<128x1xi32> loc(#loc631) + %dv_ptrs_35 = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc632) + %dv_ptrs_36 = tt.addptr %dv_ptrs_35, %dv_ptrs_34 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc632) + %dv_ptrs_37 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc633) + %dv_ptrs_38 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc634) + %dv_ptrs_39 = arith.muli %dv_ptrs_37, %dv_ptrs_38 : tensor<1x128xi32> loc(#loc634) + %dv_ptrs_40 = tt.broadcast %dv_ptrs_36 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc635) + %dv_ptrs_41 = tt.broadcast %dv_ptrs_39 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc635) + %dv_ptrs_42 = tt.addptr %dv_ptrs_40, %dv_ptrs_41 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc635) + %index_n = tt.expand_dims %offs_n1_30 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc636) + %index_k = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc637) + %index_v = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc638) + %6 = arith.truncf %dk_32#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc189) + tt.store %dv_ptrs_42, %6 : tensor<128x128x!tt.ptr> loc(#loc189) + %dk_43 = arith.constant 0.0883883461 : f32 loc(#loc639) + %dk_44 = arith.constant 0.0883883461 : f32 loc(#loc639) + %dk_45 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc639) + %dk_46 = arith.mulf %dk_32#1, %dk_45 : tensor<128x128xf32> loc(#loc639) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc640) + %mask_47 = arith.cmpi slt, %index_n, %mask : tensor<128x1xi32> loc(#loc640) + %xindex = arith.constant 128 : i32 loc(#loc641) + %xindex_48 = arith.constant 128 : i32 loc(#loc641) + %xindex_49 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc641) + %xindex_50 = arith.muli %xindex_49, %index_n : tensor<128x1xi32> loc(#loc641) + %xindex_51 = tt.broadcast %index_k : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc642) + %xindex_52 = tt.broadcast %xindex_50 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc642) + %xindex_53 = arith.addi %xindex_51, %xindex_52 : tensor<128x128xi32> loc(#loc642) + %xindex_54 = arith.constant 262144 : i32 loc(#loc643) + %xindex_55 = arith.constant 262144 : i32 loc(#loc643) + %xindex_56 = arith.muli %xindex_55, %off_hkv : i32 loc(#loc643) + %xindex_57 = tt.splat %xindex_56 : i32 -> tensor<128x128xi32> loc(#loc644) + %xindex_58 = arith.addi %xindex_53, %xindex_57 : tensor<128x128xi32> loc(#loc644) + %xindex_59 = arith.constant 2097152 : i32 loc(#loc645) + %xindex_60 = arith.constant 2097152 : i32 loc(#loc645) + %xindex_61 = arith.muli %xindex_60, %off_zq : i32 loc(#loc645) + %xindex_62 = tt.splat %xindex_61 : i32 -> tensor<128x128xi32> loc(#loc646) + %xindex_63 = arith.addi %xindex_58, %xindex_62 : tensor<128x128xi32> loc(#loc646) + %7 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc198) + %8 = tt.addptr %7, %xindex_63 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc198) + %9 = tt.broadcast %mask_47 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc199) + %10 = arith.truncf %dk_46 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc199) + tt.store %8, %10, %9 : tensor<128x128x!tt.ptr> loc(#loc199) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc199) + } loc(#loc41) + tt.return loc(#loc200) + } loc(#loc) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%x: i32 loc("x"(#loc201))) -> i32 attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc202) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc202) + %0 = arith.addi %x, %c128_i32_0 : i32 loc(#loc202) + %c1_i32 = arith.constant 1 : i32 loc(#loc203) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc203) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc203) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc204) + %2 = arith.divsi %1, %c128_i32_3 : i32 loc(#loc204) + tt.return %2 : i32 loc(#loc205) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc206) + tt.return %3 : i32 loc(#loc206) + } loc(#loc201) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc208) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc208) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc209) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc210) + tt.return %0 : tensor<128x128xf32> loc(#loc210) + } loc(#loc207) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc211)), %offs_m: tensor<128xi32> loc("offs_m"(#loc211)), %offs_n: tensor<128xi32> loc("offs_n"(#loc211)), %stride_m: i32 loc("stride_m"(#loc211)), %stride_n: i32 loc("stride_n"(#loc211)), %M_LEN: i32 loc("M_LEN"(#loc211))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc654) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc655) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc655) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc656) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc656) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc657) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc658) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc658) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc659) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc659) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc659) + %0 = tt.load %ptr_10 : tensor<128x128x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<128x128xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x128xbf16> loc(#loc220) + tt.return %1 : tensor<128x128xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc221)), %arg_K: !tt.ptr loc("arg_K"(#loc221)), %arg_V: !tt.ptr loc("arg_V"(#loc221)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc221)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc221)), %arg_DO: !tt.ptr loc("arg_DO"(#loc221)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc221)), %arg_DV: !tt.ptr loc("arg_DV"(#loc221)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc221)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc221)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc221)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc221)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc221)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc221)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc221)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc221)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc221)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc221)), %K: !tt.ptr loc("K"(#loc221)), %V: !tt.ptr loc("V"(#loc221)), %dq: tensor<128x128xf32> loc("dq"(#loc221)), %q: tensor<128x128xbf16> loc("q"(#loc221)), %do: tensor<128x128xbf16> loc("do"(#loc221)), %Di: tensor<128xf32> loc("Di"(#loc221)), %lse: tensor<128x1xf32> loc("lse"(#loc221)), %off_z: i32 loc("off_z"(#loc221)), %off_hq: i32 loc("off_hq"(#loc221)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc221)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc221)), %stride_kn: i32 loc("stride_kn"(#loc221)), %stride_kd: i32 loc("stride_kd"(#loc221)), %stride_vn: i32 loc("stride_vn"(#loc221)), %stride_vd: i32 loc("stride_vd"(#loc221)), %kv_indices: !tt.ptr loc("kv_indices"(#loc221)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc221))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc695) + %KV_LEN = arith.constant 2048 : i32 loc(#loc696) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc697) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc698) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc699) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc700) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc700) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc701) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc701) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc703) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc703) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc704) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc704) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc704) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc705) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc706) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc706) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc707) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc707) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc708) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc709) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc709) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc710) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc710) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %hi = arith.constant 2 : i32 loc(#loc711) + %hi_20 = arith.constant 2 : i32 loc(#loc711) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc711) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc712) + %hi_23 = arith.constant 1 : i32 loc(#loc713) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc713) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc714) + %c0_i32 = arith.constant 0 : i32 loc(#loc242) + %c1_i32 = arith.constant 1 : i32 loc(#loc242) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc242) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc242) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc242) + %3 = ub.poison : i32 loc(#loc242) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc716) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc717) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc718) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc719) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc719) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc720) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc721) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc722) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc250) + } loc(#loc1027) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc251) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc252) + tt.return %4 : tensor<128x128xf32> loc(#loc252) + } loc(#loc221) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc201))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc202) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc202) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc202) + %c1_i32 = arith.constant 1 : i32 loc(#loc203) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc203) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc203) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc204) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc204) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc204) + tt.return %2 : i32 loc(#loc205) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc206) + tt.return %3 : i32 loc(#loc206) + } loc(#loc201) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc253)), %arg_K: !tt.ptr loc("arg_K"(#loc253)), %arg_V: !tt.ptr loc("arg_V"(#loc253)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc253)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc253)), %arg_DO: !tt.ptr loc("arg_DO"(#loc253)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc253)), %arg_DV: !tt.ptr loc("arg_DV"(#loc253)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc253)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc253)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc253)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc253)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc253)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc253)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc253)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc253)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc253)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc253)), %dq: tensor<128x128xf32> loc("dq"(#loc253)), %q: tensor<128x128xbf16> loc("q"(#loc253)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc253)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc253)), %do: tensor<128x128xbf16> loc("do"(#loc253)), %Di: tensor<128xf32> loc("Di"(#loc253)), %lse: tensor<128x1xf32> loc("lse"(#loc253)), %Q_LEN: i32 loc("Q_LEN"(#loc253)), %KV_LEN: i32 loc("KV_LEN"(#loc253)), %off_z: i32 loc("off_z"(#loc253)), %off_hq: i32 loc("off_hq"(#loc253)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc253)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc253)), %offs_k: tensor<128xi32> loc("offs_k"(#loc253)), %offs_v: tensor<128xi32> loc("offs_v"(#loc253)), %stride_kn: i32 loc("stride_kn"(#loc253)), %stride_kd: i32 loc("stride_kd"(#loc253)), %stride_vn: i32 loc("stride_vn"(#loc253)), %stride_vd: i32 loc("stride_vd"(#loc253)), %kv_indices: !tt.ptr loc("kv_indices"(#loc253)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc253))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc762) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc763) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc763) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc763) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc764) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc764) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc765) + %n_6 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc766) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc767) + %m_7 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc768) + %tmp1 = arith.constant false loc(#loc769) + %tmp1_8 = arith.constant dense : tensor<1xi1> loc(#loc769) + %tmp4 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc770) + %tmp4_9 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc770) + %tmp4_10 = arith.cmpi sge, %tmp4, %tmp4_9 : tensor<128x64xi32> loc(#loc770) + %tmp5 = arith.extsi %n_6 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc771) + %tmp7 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc772) + %tmp7_11 = tt.load %tmp7 : !tt.ptr loc(#loc773) + %tmp8 = tt.splat %tmp7_11 : i64 -> tensor<1x64xi64> loc(#loc774) + %tmp8_12 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc774) + %tmp9 = arith.extsi %m_7 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc775) + %tmp10 = tt.splat %tmp7_11 : i64 -> tensor<128x1xi64> loc(#loc776) + %tmp10_13 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc776) + %tmp11 = tt.broadcast %tmp8_12 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc777) + %tmp11_14 = tt.broadcast %tmp10_13 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc777) + %tmp11_15 = arith.andi %tmp11, %tmp11_14 : tensor<128x64xi1> loc(#loc777) + %tmp12 = arith.andi %tmp4_10, %tmp11_15 : tensor<128x64xi1> loc(#loc778) + %tmp13 = tt.expand_dims %tmp1_8 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc779) + %tmp13_16 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc779) + %tmp13_17 = arith.ori %tmp13_16, %tmp12 : tensor<128x64xi1> loc(#loc779) + %tmp14 = arith.constant 2048 : i32 loc(#loc780) + %tmp14_18 = arith.constant dense<2048> : tensor<1xi32> loc(#loc780) + %tmp15 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc781) + %tmp15_19 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc781) + %tmp15_20 = arith.cmpi sge, %n_6, %tmp15_19 : tensor<1x64xi32> loc(#loc781) + %tmp16 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc782) + %tmp16_21 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc782) + %tmp16_22 = arith.remsi %n_6, %tmp16_21 : tensor<1x64xi32> loc(#loc782) + %tmp17 = arith.constant 0 : i32 loc(#loc783) + %tmp17_23 = arith.constant dense<0> : tensor<1xi32> loc(#loc783) + %tmp18 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc784) + %tmp18_24 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc784) + %tmp18_25 = arith.cmpi ne, %tmp16_22, %tmp18_24 : tensor<1x64xi32> loc(#loc784) + %tmp19 = arith.constant 0 : i32 loc(#loc785) + %tmp19_26 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc785) + %tmp19_27 = arith.cmpi slt, %tmp16_22, %tmp19_26 : tensor<1x64xi32> loc(#loc785) + %tmp20 = arith.constant 0 : i32 loc(#loc786) + %tmp20_28 = arith.constant dense<0> : tensor<1xi32> loc(#loc786) + %tmp20_29 = arith.cmpi slt, %tmp14_18, %tmp20_28 : tensor<1xi32> loc(#loc786) + %tmp21 = tt.expand_dims %tmp20_29 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc787) + %tmp21_30 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc787) + %tmp21_31 = arith.cmpi ne, %tmp19_27, %tmp21_30 : tensor<1x64xi1> loc(#loc787) + %tmp22 = arith.andi %tmp18_25, %tmp21_31 : tensor<1x64xi1> loc(#loc788) + %tmp23 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc789) + %tmp23_32 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc789) + %tmp23_33 = arith.addi %tmp16_22, %tmp23_32 : tensor<1x64xi32> loc(#loc789) + %tmp24 = arith.select %tmp22, %tmp23_33, %tmp16_22 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc790) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc791) + %tmp26 = tt.splat %tmp7_11 : i64 -> tensor<1x64xi64> loc(#loc792) + %tmp26_34 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc792) + %tmp27 = arith.andi %tmp15_20, %tmp26_34 : tensor<1x64xi1> loc(#loc793) + %tmp28 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc794) + %tmp28_35 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc794) + %tmp28_36 = arith.subi %tmp28, %tmp28_35 : tensor<128x64xi32> loc(#loc794) + %tmp29 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc795) + %tmp29_37 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc795) + %tmp29_38 = arith.remsi %tmp28_36, %tmp29_37 : tensor<128x64xi32> loc(#loc795) + %tmp30 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc796) + %tmp30_39 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc796) + %tmp30_40 = arith.cmpi ne, %tmp29_38, %tmp30_39 : tensor<128x64xi32> loc(#loc796) + %tmp31 = arith.constant 0 : i32 loc(#loc797) + %tmp31_41 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc797) + %tmp31_42 = arith.cmpi slt, %tmp29_38, %tmp31_41 : tensor<128x64xi32> loc(#loc797) + %tmp32 = tt.expand_dims %tmp20_29 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc798) + %tmp32_43 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc798) + %tmp32_44 = arith.cmpi ne, %tmp31_42, %tmp32_43 : tensor<128x64xi1> loc(#loc798) + %tmp33 = arith.andi %tmp30_40, %tmp32_44 : tensor<128x64xi1> loc(#loc799) + %tmp34 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc800) + %tmp34_45 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc800) + %tmp34_46 = arith.addi %tmp29_38, %tmp34_45 : tensor<128x64xi32> loc(#loc800) + %tmp35 = arith.select %tmp33, %tmp34_46, %tmp29_38 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc801) + %tmp36 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc802) + %tmp36_47 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc802) + %tmp36_48 = arith.cmpi eq, %tmp35, %tmp36_47 : tensor<128x64xi32> loc(#loc802) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc803) + %tmp37_49 = arith.andi %tmp37, %tmp36_48 : tensor<128x64xi1> loc(#loc803) + %tmp38 = arith.ori %tmp13_17, %tmp37_49 : tensor<128x64xi1> loc(#loc804) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc805) + %post_mod_scores_50 = arith.constant 0xFF800000 : f32 loc(#loc805) + %post_mod_scores_51 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc805) + %post_mod_scores_52 = arith.select %tmp38, %qk_5, %post_mod_scores_51 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc805) + %post_mod_scores_53 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_54 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_55 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc806) + %post_mod_scores_56 = arith.mulf %post_mod_scores_52, %post_mod_scores_55 : tensor<128x64xf32> loc(#loc806) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc807) + %p_57 = arith.subf %post_mod_scores_56, %p : tensor<128x64xf32> loc(#loc807) + %p_58 = math.exp2 %p_57 : tensor<128x64xf32> loc(#loc808) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc809) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc810) + %dp_59 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc810) + %dp_60 = tt.dot %do, %vT, %dp_59, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc810) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc811) + %ds_61 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc812) + %ds_62 = arith.subf %dp_60, %ds_61 : tensor<128x64xf32> loc(#loc812) + %ds_63 = arith.mulf %p_58, %ds_62 : tensor<128x64xf32> loc(#loc813) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc814) + %scatter_mask_64 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc815) + %scatter_mask_65 = arith.cmpi slt, %scatter_mask, %scatter_mask_64 : tensor<128x1xi32> loc(#loc815) + %scatter_mask_66 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc816) + %scatter_mask_67 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc817) + %scatter_mask_68 = arith.cmpi slt, %scatter_mask_66, %scatter_mask_67 : tensor<1x64xi32> loc(#loc817) + %scatter_mask_69 = tt.broadcast %scatter_mask_65 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_70 = tt.broadcast %scatter_mask_68 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_71 = arith.andi %scatter_mask_69, %scatter_mask_70 : tensor<128x64xi1> loc(#loc818) + %ds_72 = arith.constant 0.000000e+00 : f32 loc(#loc819) + %ds_73 = arith.constant 0.000000e+00 : f32 loc(#loc819) + %ds_74 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc819) + %ds_75 = arith.select %tmp38, %ds_63, %ds_74 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc819) + %ds_76 = arith.truncf %ds_75 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc820) + %dq_77 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc821) + %dq_78 = arith.constant 0.000000e+00 : f32 loc(#loc822) + %dq_79 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc822) + %dq_80 = tt.dot %ds_76, %dq_77, %dq_79, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc822) + %dq_81 = arith.addf %dq, %dq_80 : tensor<128x128xf32> loc(#loc823) + tt.return %dq_81 : tensor<128x128xf32> loc(#loc316) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc317) + tt.return %0 : tensor<128x128xf32> loc(#loc317) + } loc(#loc253) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%ptr: tensor<128x64x!tt.ptr> loc("ptr"(#loc211)), %offs_m: tensor<128xi32> loc("offs_m"(#loc211)), %offs_n: tensor<64xi32> loc("offs_n"(#loc211)), %N_LEN: i32 loc("N_LEN"(#loc211))) -> tensor<128x64xbf16> attributes {noinline = false} { + %0 = tt.load %ptr : tensor<128x64x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<128x64xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x64xbf16> loc(#loc220) + tt.return %1 : tensor<128x64xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%indices: tensor<1x64xi32> loc("indices"(#loc318))) -> tensor<1x64xi32> attributes {noinline = false} { + tt.return %indices : tensor<1x64xi32> loc(#loc319) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x64xi32> loc(#loc320) + tt.return %0 : tensor<1x64xi32> loc(#loc320) + } loc(#loc318) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%indices: tensor<128x1xi32> loc("indices"(#loc318))) -> tensor<128x1xi32> attributes {noinline = false} { + tt.return %indices : tensor<128x1xi32> loc(#loc319) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x1xi32> loc(#loc320) + tt.return %0 : tensor<128x1xi32> loc(#loc320) + } loc(#loc318) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc321)), %col_indices: !tt.ptr loc("col_indices"(#loc321)), %total_blocks: i32 loc("total_blocks"(#loc321))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc829) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc829) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc829) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc830) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc831) + %next_block = arith.constant 1 : i32 loc(#loc832) + %next_block_3 = arith.constant 1 : i32 loc(#loc832) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc832) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc833) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc834) + %next_block_7 = arith.constant 1 : i32 loc(#loc835) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc835) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc836) + %needs_jump = arith.constant 1 : i32 loc(#loc837) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc837) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc837) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc838) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc838) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc838) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc839) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc839) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc840) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc841) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc841) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc841) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc842) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc842) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc842) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc843) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc843) + %offset_24 = arith.constant 1 : i32 loc(#loc844) + %offset_25 = arith.constant 1 : i32 loc(#loc844) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc844) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc844) + %offset_28 = arith.constant 64 : i32 loc(#loc845) + %offset_29 = arith.constant 64 : i32 loc(#loc845) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc845) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc846) + tt.return %offset_31 : i32 loc(#loc340) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc341) + tt.return %0 : i32 loc(#loc341) + } loc(#loc321) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc221)), %arg_K: !tt.ptr loc("arg_K"(#loc221)), %arg_V: !tt.ptr loc("arg_V"(#loc221)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc221)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc221)), %arg_DO: !tt.ptr loc("arg_DO"(#loc221)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc221)), %arg_DV: !tt.ptr loc("arg_DV"(#loc221)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc221)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc221)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc221)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc221)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc221)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc221)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc221)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc221)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc221)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc221)), %K: !tt.ptr loc("K"(#loc221)), %V: !tt.ptr loc("V"(#loc221)), %dq: tensor<128x128xf32> loc("dq"(#loc221)), %q: tensor<128x128xbf16> loc("q"(#loc221)), %do: tensor<128x128xbf16> loc("do"(#loc221)), %Di: tensor<128xf32> loc("Di"(#loc221)), %lse: tensor<128x1xf32> loc("lse"(#loc221)), %off_z: i32 loc("off_z"(#loc221)), %off_hq: i32 loc("off_hq"(#loc221)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc221)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc221)), %stride_kn: i32 loc("stride_kn"(#loc221)), %stride_kd: i32 loc("stride_kd"(#loc221)), %stride_vn: i32 loc("stride_vn"(#loc221)), %stride_vd: i32 loc("stride_vd"(#loc221)), %kv_indices: !tt.ptr loc("kv_indices"(#loc221)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc221))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc695) + %KV_LEN = arith.constant 2048 : i32 loc(#loc696) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc697) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc698) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc699) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc700) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc700) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc701) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc701) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc703) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc703) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc704) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc704) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc704) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc705) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc706) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc706) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc707) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc707) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc708) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc709) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc709) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc710) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc710) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %hi = arith.constant 2 : i32 loc(#loc711) + %hi_20 = arith.constant 2 : i32 loc(#loc711) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc711) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc712) + %hi_23 = arith.constant 1 : i32 loc(#loc713) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc713) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc714) + %c0_i32 = arith.constant 0 : i32 loc(#loc242) + %c1_i32 = arith.constant 1 : i32 loc(#loc242) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc242) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc242) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc242) + %3 = ub.poison : i32 loc(#loc242) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc716) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc717) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc718) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc719) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc719) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc720) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc721) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc722) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc250) + } loc(#loc1027) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc251) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc252) + tt.return %4 : tensor<128x128xf32> loc(#loc252) + } loc(#loc221) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc253)), %arg_K: !tt.ptr loc("arg_K"(#loc253)), %arg_V: !tt.ptr loc("arg_V"(#loc253)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc253)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc253)), %arg_DO: !tt.ptr loc("arg_DO"(#loc253)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc253)), %arg_DV: !tt.ptr loc("arg_DV"(#loc253)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc253)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc253)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc253)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc253)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc253)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc253)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc253)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc253)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc253)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc253)), %dq: tensor<128x128xf32> loc("dq"(#loc253)), %q: tensor<128x128xbf16> loc("q"(#loc253)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc253)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc253)), %do: tensor<128x128xbf16> loc("do"(#loc253)), %Di: tensor<128xf32> loc("Di"(#loc253)), %lse: tensor<128x1xf32> loc("lse"(#loc253)), %Q_LEN: i32 loc("Q_LEN"(#loc253)), %KV_LEN: i32 loc("KV_LEN"(#loc253)), %off_z: i32 loc("off_z"(#loc253)), %off_hq: i32 loc("off_hq"(#loc253)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc253)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc253)), %offs_k: tensor<128xi32> loc("offs_k"(#loc253)), %offs_v: tensor<128xi32> loc("offs_v"(#loc253)), %stride_kn: i32 loc("stride_kn"(#loc253)), %stride_kd: i32 loc("stride_kd"(#loc253)), %stride_vn: i32 loc("stride_vn"(#loc253)), %stride_vd: i32 loc("stride_vd"(#loc253)), %kv_indices: !tt.ptr loc("kv_indices"(#loc253)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc253))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc762) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc763) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc763) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc763) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc764) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc764) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc765) + %n_6 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc766) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc767) + %m_7 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc768) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_8 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_9 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc806) + %post_mod_scores_10 = arith.mulf %qk_5, %post_mod_scores_9 : tensor<128x64xf32> loc(#loc806) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc807) + %p_11 = arith.subf %post_mod_scores_10, %p : tensor<128x64xf32> loc(#loc807) + %p_12 = math.exp2 %p_11 : tensor<128x64xf32> loc(#loc808) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc809) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc810) + %dp_13 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc810) + %dp_14 = tt.dot %do, %vT, %dp_13, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc810) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc811) + %ds_15 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc812) + %ds_16 = arith.subf %dp_14, %ds_15 : tensor<128x64xf32> loc(#loc812) + %ds_17 = arith.mulf %p_12, %ds_16 : tensor<128x64xf32> loc(#loc813) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc814) + %scatter_mask_18 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc815) + %scatter_mask_19 = arith.cmpi slt, %scatter_mask, %scatter_mask_18 : tensor<128x1xi32> loc(#loc815) + %scatter_mask_20 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc816) + %scatter_mask_21 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc817) + %scatter_mask_22 = arith.cmpi slt, %scatter_mask_20, %scatter_mask_21 : tensor<1x64xi32> loc(#loc817) + %scatter_mask_23 = tt.broadcast %scatter_mask_19 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_24 = tt.broadcast %scatter_mask_22 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_25 = arith.andi %scatter_mask_23, %scatter_mask_24 : tensor<128x64xi1> loc(#loc818) + %ds_26 = arith.truncf %ds_17 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc820) + %dq_27 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc821) + %dq_28 = arith.constant 0.000000e+00 : f32 loc(#loc822) + %dq_29 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc822) + %dq_30 = tt.dot %ds_26, %dq_27, %dq_29, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc822) + %dq_31 = arith.addf %dq, %dq_30 : tensor<128x128xf32> loc(#loc823) + tt.return %dq_31 : tensor<128x128xf32> loc(#loc316) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc317) + tt.return %0 : tensor<128x128xf32> loc(#loc317) + } loc(#loc253) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc342)), %arg_K: !tt.ptr loc("arg_K"(#loc342)), %arg_V: !tt.ptr loc("arg_V"(#loc342)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc342)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc342)), %arg_DO: !tt.ptr loc("arg_DO"(#loc342)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc342)), %arg_DV: !tt.ptr loc("arg_DV"(#loc342)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc342)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc342)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc342)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc342)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc342)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc342)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc342)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc342)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc342)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc342)), %Q: !tt.ptr loc("Q"(#loc342)), %DO: !tt.ptr loc("DO"(#loc342)), %DELTA: !tt.ptr loc("DELTA"(#loc342)), %LSE: !tt.ptr loc("LSE"(#loc342)), %dk: tensor<128x128xf32> loc("dk"(#loc342)), %dv: tensor<128x128xf32> loc("dv"(#loc342)), %k: tensor<128x128xbf16> loc("k"(#loc342)), %v: tensor<128x128xbf16> loc("v"(#loc342)), %off_z: i32 loc("off_z"(#loc342)), %off_hq: i32 loc("off_hq"(#loc342)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc342)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc342)), %stride_qm: i32 loc("stride_qm"(#loc342)), %stride_qd: i32 loc("stride_qd"(#loc342)), %stride_dom: i32 loc("stride_dom"(#loc342)), %stride_dod: i32 loc("stride_dod"(#loc342)), %q_indices: !tt.ptr loc("q_indices"(#loc342)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc342))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc883) + %KV_LEN = arith.constant 2048 : i32 loc(#loc884) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc885) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc886) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc887) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc888) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc888) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc889) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc889) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc890) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc891) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc891) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc892) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc892) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc892) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc893) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc894) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc894) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc895) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc895) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc896) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc897) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc897) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc898) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc898) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc898) + %hi = arith.constant 2 : i32 loc(#loc899) + %hi_20 = arith.constant 2 : i32 loc(#loc899) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc899) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc900) + %hi_23 = arith.constant 1 : i32 loc(#loc901) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc901) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc902) + %c0_i32 = arith.constant 0 : i32 loc(#loc363) + %c1_i32 = arith.constant 1 : i32 loc(#loc363) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc363) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc363) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc363) + %3 = ub.poison : i32 loc(#loc363) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc364) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc904) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc905) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc906) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc906) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc907) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc908) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc908) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc909) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc909) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc371) + } loc(#loc1029) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc372) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc373) + %5 = ub.poison : tensor<128x128xf32> loc(#loc373) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc373) + } loc(#loc342) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc374)), %arg_K: !tt.ptr loc("arg_K"(#loc374)), %arg_V: !tt.ptr loc("arg_V"(#loc374)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc374)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc374)), %arg_DO: !tt.ptr loc("arg_DO"(#loc374)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc374)), %arg_DV: !tt.ptr loc("arg_DV"(#loc374)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc374)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc374)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc374)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc374)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc374)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc374)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc374)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc374)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc374)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc374)), %dk: tensor<128x128xf32> loc("dk"(#loc374)), %dv: tensor<128x128xf32> loc("dv"(#loc374)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc374)), %k: tensor<128x128xbf16> loc("k"(#loc374)), %v: tensor<128x128xbf16> loc("v"(#loc374)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc374)), %DELTA: !tt.ptr loc("DELTA"(#loc374)), %LSE: !tt.ptr loc("LSE"(#loc374)), %Q_LEN: i32 loc("Q_LEN"(#loc374)), %KV_LEN: i32 loc("KV_LEN"(#loc374)), %off_z: i32 loc("off_z"(#loc374)), %off_hq: i32 loc("off_hq"(#loc374)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc374)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc374)), %offs_k: tensor<128xi32> loc("offs_k"(#loc374)), %offs_v: tensor<128xi32> loc("offs_v"(#loc374)), %stride_qm: i32 loc("stride_qm"(#loc374)), %stride_qd: i32 loc("stride_qd"(#loc374)), %stride_dom: i32 loc("stride_dom"(#loc374)), %stride_dod: i32 loc("stride_dod"(#loc374)), %q_indices: !tt.ptr loc("q_indices"(#loc374)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc374))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc950) + %lse = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc951) + %lse_0 = tt.addptr %lse, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc951) + %lse_1 = tt.load %lse_0 : tensor<64x!tt.ptr> loc(#loc952) + %lse_2 = arith.constant 0xFF800000 : f32 loc(#loc953) + %lse_3 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc953) + %lse_4 = arith.cmpf oeq, %lse_1, %lse_3 : tensor<64xf32> loc(#loc953) + %lse_5 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_6 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_7 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc954) + %lse_8 = arith.select %lse_4, %lse_7, %lse_1 : tensor<64xi1>, tensor<64xf32> loc(#loc954) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc955) + %qkT_9 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc955) + %qkT_10 = tt.dot %k, %qT, %qkT_9, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc955) + %qkT_11 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_12 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_13 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc956) + %qkT_14 = arith.mulf %qkT_10, %qkT_13 : tensor<128x64xf32> loc(#loc956) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc957) + %m_15 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%m) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc958) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc959) + %n_16 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%n) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc960) + %tmp41 = arith.constant false loc(#loc961) + %tmp41_17 = arith.constant dense : tensor<1xi1> loc(#loc961) + %tmp44 = tt.broadcast %m_15 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc962) + %tmp44_18 = tt.broadcast %n_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc962) + %tmp44_19 = arith.cmpi sge, %tmp44, %tmp44_18 : tensor<128x64xi32> loc(#loc962) + %tmp45 = arith.extsi %n_16 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc963) + %tmp47 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc964) + %tmp47_20 = tt.load %tmp47 : !tt.ptr loc(#loc965) + %tmp48 = tt.splat %tmp47_20 : i64 -> tensor<128x1xi64> loc(#loc966) + %tmp48_21 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc966) + %tmp49 = arith.extsi %m_15 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc967) + %tmp50 = tt.splat %tmp47_20 : i64 -> tensor<1x64xi64> loc(#loc968) + %tmp50_22 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc968) + %tmp51 = tt.broadcast %tmp48_21 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc969) + %tmp51_23 = tt.broadcast %tmp50_22 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc969) + %tmp51_24 = arith.andi %tmp51, %tmp51_23 : tensor<128x64xi1> loc(#loc969) + %tmp52 = arith.andi %tmp44_19, %tmp51_24 : tensor<128x64xi1> loc(#loc970) + %tmp53 = tt.expand_dims %tmp41_17 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc971) + %tmp53_25 = tt.broadcast %tmp53 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc971) + %tmp53_26 = arith.ori %tmp53_25, %tmp52 : tensor<128x64xi1> loc(#loc971) + %tmp54 = arith.constant 2048 : i32 loc(#loc972) + %tmp54_27 = arith.constant dense<2048> : tensor<1xi32> loc(#loc972) + %tmp55 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc973) + %tmp55_28 = tt.broadcast %tmp55 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc973) + %tmp55_29 = arith.cmpi sge, %n_16, %tmp55_28 : tensor<128x1xi32> loc(#loc973) + %tmp56 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc974) + %tmp56_30 = tt.broadcast %tmp56 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc974) + %tmp56_31 = arith.remsi %n_16, %tmp56_30 : tensor<128x1xi32> loc(#loc974) + %tmp57 = arith.constant 0 : i32 loc(#loc975) + %tmp57_32 = arith.constant dense<0> : tensor<1xi32> loc(#loc975) + %tmp58 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc976) + %tmp58_33 = tt.broadcast %tmp58 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc976) + %tmp58_34 = arith.cmpi ne, %tmp56_31, %tmp58_33 : tensor<128x1xi32> loc(#loc976) + %tmp59 = arith.constant 0 : i32 loc(#loc977) + %tmp59_35 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc977) + %tmp59_36 = arith.cmpi slt, %tmp56_31, %tmp59_35 : tensor<128x1xi32> loc(#loc977) + %tmp60 = arith.constant 0 : i32 loc(#loc978) + %tmp60_37 = arith.constant dense<0> : tensor<1xi32> loc(#loc978) + %tmp60_38 = arith.cmpi slt, %tmp54_27, %tmp60_37 : tensor<1xi32> loc(#loc978) + %tmp61 = tt.expand_dims %tmp60_38 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc979) + %tmp61_39 = tt.broadcast %tmp61 : tensor<1x1xi1> -> tensor<128x1xi1> loc(#loc979) + %tmp61_40 = arith.cmpi ne, %tmp59_36, %tmp61_39 : tensor<128x1xi1> loc(#loc979) + %tmp62 = arith.andi %tmp58_34, %tmp61_40 : tensor<128x1xi1> loc(#loc980) + %tmp63 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc981) + %tmp63_41 = tt.broadcast %tmp63 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc981) + %tmp63_42 = arith.addi %tmp56_31, %tmp63_41 : tensor<128x1xi32> loc(#loc981) + %tmp64 = arith.select %tmp62, %tmp63_42, %tmp56_31 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc982) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc983) + %tmp66 = tt.splat %tmp47_20 : i64 -> tensor<128x1xi64> loc(#loc984) + %tmp66_43 = arith.cmpi slt, %tmp65, %tmp66 : tensor<128x1xi64> loc(#loc984) + %tmp67 = arith.andi %tmp55_29, %tmp66_43 : tensor<128x1xi1> loc(#loc985) + %tmp68 = tt.broadcast %n_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc986) + %tmp68_44 = tt.broadcast %m_15 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc986) + %tmp68_45 = arith.subi %tmp68, %tmp68_44 : tensor<128x64xi32> loc(#loc986) + %tmp69 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc987) + %tmp69_46 = tt.broadcast %tmp69 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc987) + %tmp69_47 = arith.remsi %tmp68_45, %tmp69_46 : tensor<128x64xi32> loc(#loc987) + %tmp70 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc988) + %tmp70_48 = tt.broadcast %tmp70 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc988) + %tmp70_49 = arith.cmpi ne, %tmp69_47, %tmp70_48 : tensor<128x64xi32> loc(#loc988) + %tmp71 = arith.constant 0 : i32 loc(#loc989) + %tmp71_50 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc989) + %tmp71_51 = arith.cmpi slt, %tmp69_47, %tmp71_50 : tensor<128x64xi32> loc(#loc989) + %tmp72 = tt.expand_dims %tmp60_38 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc990) + %tmp72_52 = tt.broadcast %tmp72 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc990) + %tmp72_53 = arith.cmpi ne, %tmp71_51, %tmp72_52 : tensor<128x64xi1> loc(#loc990) + %tmp73 = arith.andi %tmp70_49, %tmp72_53 : tensor<128x64xi1> loc(#loc991) + %tmp74 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc992) + %tmp74_54 = tt.broadcast %tmp74 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc992) + %tmp74_55 = arith.addi %tmp69_47, %tmp74_54 : tensor<128x64xi32> loc(#loc992) + %tmp75 = arith.select %tmp73, %tmp74_55, %tmp69_47 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc993) + %tmp76 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc994) + %tmp76_56 = tt.broadcast %tmp76 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc994) + %tmp76_57 = arith.cmpi eq, %tmp75, %tmp76_56 : tensor<128x64xi32> loc(#loc994) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc995) + %tmp77_58 = arith.andi %tmp77, %tmp76_57 : tensor<128x64xi1> loc(#loc995) + %tmp78 = arith.ori %tmp53_26, %tmp77_58 : tensor<128x64xi1> loc(#loc996) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc997) + %post_mod_scores_59 = arith.constant 0xFF800000 : f32 loc(#loc997) + %post_mod_scores_60 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc997) + %post_mod_scores_61 = arith.select %tmp78, %qkT_14, %post_mod_scores_60 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc997) + %post_mod_scores_62 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_63 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_64 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc998) + %post_mod_scores_65 = arith.mulf %post_mod_scores_61, %post_mod_scores_64 : tensor<128x64xf32> loc(#loc998) + %pT = tt.expand_dims %lse_8 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc999) + %pT_66 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1000) + %pT_67 = arith.subf %post_mod_scores_65, %pT_66 : tensor<128x64xf32> loc(#loc1000) + %pT_68 = math.exp2 %pT_67 : tensor<128x64xf32> loc(#loc1001) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1002) + %dv_69 = arith.truncf %pT_68 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1003) + %dv_70 = arith.constant 0.000000e+00 : f32 loc(#loc1004) + %dv_71 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1004) + %dv_72 = tt.dot %dv_69, %do, %dv_71, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1004) + %dv_73 = arith.addf %dv, %dv_72 : tensor<128x128xf32> loc(#loc1005) + %Di = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1006) + %Di_74 = tt.addptr %Di, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1006) + %Di_75 = tt.load %Di_74 : tensor<64x!tt.ptr> loc(#loc1007) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1008) + %dpT_76 = arith.constant 0.000000e+00 : f32 loc(#loc1009) + %dpT_77 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1009) + %dpT_78 = tt.dot %v, %dpT, %dpT_77, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1009) + %dsT = tt.expand_dims %Di_75 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1010) + %dsT_79 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1011) + %dsT_80 = arith.subf %dpT_78, %dsT_79 : tensor<128x64xf32> loc(#loc1011) + %dsT_81 = arith.mulf %pT_68, %dsT_80 : tensor<128x64xf32> loc(#loc1012) + %dsT_82 = arith.constant 0.000000e+00 : f32 loc(#loc1013) + %dsT_83 = arith.constant 0.000000e+00 : f32 loc(#loc1013) + %dsT_84 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1013) + %dsT_85 = arith.select %tmp78, %dsT_81, %dsT_84 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1013) + %dk_86 = arith.truncf %dsT_85 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1014) + %dk_87 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1015) + %dk_88 = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %dk_89 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1016) + %dk_90 = tt.dot %dk_86, %dk_87, %dk_89, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1016) + %dk_91 = arith.addf %dk, %dk_90 : tensor<128x128xf32> loc(#loc1017) + tt.return %dk_91, %dv_73 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc443) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc444) + %1 = ub.poison : tensor<128x128xf32> loc(#loc444) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc444) + } loc(#loc374) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: tensor<64x128x!tt.ptr> loc("ptr"(#loc211)), %offs_m: tensor<64xi32> loc("offs_m"(#loc211)), %offs_n: tensor<128xi32> loc("offs_n"(#loc211)), %M_LEN: i32 loc("M_LEN"(#loc211))) -> tensor<64x128xbf16> attributes {noinline = false} { + %0 = tt.load %ptr : tensor<64x128x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<64x128xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x128xbf16> loc(#loc220) + tt.return %1 : tensor<64x128xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc342)), %arg_K: !tt.ptr loc("arg_K"(#loc342)), %arg_V: !tt.ptr loc("arg_V"(#loc342)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc342)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc342)), %arg_DO: !tt.ptr loc("arg_DO"(#loc342)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc342)), %arg_DV: !tt.ptr loc("arg_DV"(#loc342)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc342)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc342)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc342)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc342)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc342)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc342)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc342)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc342)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc342)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc342)), %Q: !tt.ptr loc("Q"(#loc342)), %DO: !tt.ptr loc("DO"(#loc342)), %DELTA: !tt.ptr loc("DELTA"(#loc342)), %LSE: !tt.ptr loc("LSE"(#loc342)), %dk: tensor<128x128xf32> loc("dk"(#loc342)), %dv: tensor<128x128xf32> loc("dv"(#loc342)), %k: tensor<128x128xbf16> loc("k"(#loc342)), %v: tensor<128x128xbf16> loc("v"(#loc342)), %off_z: i32 loc("off_z"(#loc342)), %off_hq: i32 loc("off_hq"(#loc342)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc342)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc342)), %stride_qm: i32 loc("stride_qm"(#loc342)), %stride_qd: i32 loc("stride_qd"(#loc342)), %stride_dom: i32 loc("stride_dom"(#loc342)), %stride_dod: i32 loc("stride_dod"(#loc342)), %q_indices: !tt.ptr loc("q_indices"(#loc342)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc342))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc883) + %KV_LEN = arith.constant 2048 : i32 loc(#loc884) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc885) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc886) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc887) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc888) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc888) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc889) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc889) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc890) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc891) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc891) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc892) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc892) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc892) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc893) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc894) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc894) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc895) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc895) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc896) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc897) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc897) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc898) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc898) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc898) + %hi = arith.constant 2 : i32 loc(#loc899) + %hi_20 = arith.constant 2 : i32 loc(#loc899) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc899) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc900) + %hi_23 = arith.constant 1 : i32 loc(#loc901) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc901) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc902) + %c0_i32 = arith.constant 0 : i32 loc(#loc363) + %c1_i32 = arith.constant 1 : i32 loc(#loc363) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc363) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc363) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc363) + %3 = ub.poison : i32 loc(#loc363) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc364) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc904) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc905) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc906) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc906) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc907) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc908) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc908) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc909) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc909) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc371) + } loc(#loc1029) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc372) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc373) + %5 = ub.poison : tensor<128x128xf32> loc(#loc373) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc373) + } loc(#loc342) + tt.func private @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc374)), %arg_K: !tt.ptr loc("arg_K"(#loc374)), %arg_V: !tt.ptr loc("arg_V"(#loc374)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc374)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc374)), %arg_DO: !tt.ptr loc("arg_DO"(#loc374)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc374)), %arg_DV: !tt.ptr loc("arg_DV"(#loc374)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc374)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc374)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc374)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc374)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc374)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc374)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc374)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc374)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc374)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc374)), %dk: tensor<128x128xf32> loc("dk"(#loc374)), %dv: tensor<128x128xf32> loc("dv"(#loc374)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc374)), %k: tensor<128x128xbf16> loc("k"(#loc374)), %v: tensor<128x128xbf16> loc("v"(#loc374)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc374)), %DELTA: !tt.ptr loc("DELTA"(#loc374)), %LSE: !tt.ptr loc("LSE"(#loc374)), %Q_LEN: i32 loc("Q_LEN"(#loc374)), %KV_LEN: i32 loc("KV_LEN"(#loc374)), %off_z: i32 loc("off_z"(#loc374)), %off_hq: i32 loc("off_hq"(#loc374)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc374)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc374)), %offs_k: tensor<128xi32> loc("offs_k"(#loc374)), %offs_v: tensor<128xi32> loc("offs_v"(#loc374)), %stride_qm: i32 loc("stride_qm"(#loc374)), %stride_qd: i32 loc("stride_qd"(#loc374)), %stride_dom: i32 loc("stride_dom"(#loc374)), %stride_dod: i32 loc("stride_dod"(#loc374)), %q_indices: !tt.ptr loc("q_indices"(#loc374)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc374))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc950) + %lse = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc951) + %lse_0 = tt.addptr %lse, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc951) + %lse_1 = tt.load %lse_0 : tensor<64x!tt.ptr> loc(#loc952) + %lse_2 = arith.constant 0xFF800000 : f32 loc(#loc953) + %lse_3 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc953) + %lse_4 = arith.cmpf oeq, %lse_1, %lse_3 : tensor<64xf32> loc(#loc953) + %lse_5 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_6 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_7 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc954) + %lse_8 = arith.select %lse_4, %lse_7, %lse_1 : tensor<64xi1>, tensor<64xf32> loc(#loc954) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc955) + %qkT_9 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc955) + %qkT_10 = tt.dot %k, %qT, %qkT_9, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc955) + %qkT_11 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_12 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_13 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc956) + %qkT_14 = arith.mulf %qkT_10, %qkT_13 : tensor<128x64xf32> loc(#loc956) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc957) + %m_15 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%m) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc958) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc959) + %n_16 = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%n) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc960) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_17 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_18 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc998) + %post_mod_scores_19 = arith.mulf %qkT_14, %post_mod_scores_18 : tensor<128x64xf32> loc(#loc998) + %pT = tt.expand_dims %lse_8 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc999) + %pT_20 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1000) + %pT_21 = arith.subf %post_mod_scores_19, %pT_20 : tensor<128x64xf32> loc(#loc1000) + %pT_22 = math.exp2 %pT_21 : tensor<128x64xf32> loc(#loc1001) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1002) + %dv_23 = arith.truncf %pT_22 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1003) + %dv_24 = arith.constant 0.000000e+00 : f32 loc(#loc1004) + %dv_25 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1004) + %dv_26 = tt.dot %dv_23, %do, %dv_25, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1004) + %dv_27 = arith.addf %dv, %dv_26 : tensor<128x128xf32> loc(#loc1005) + %Di = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1006) + %Di_28 = tt.addptr %Di, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1006) + %Di_29 = tt.load %Di_28 : tensor<64x!tt.ptr> loc(#loc1007) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1008) + %dpT_30 = arith.constant 0.000000e+00 : f32 loc(#loc1009) + %dpT_31 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1009) + %dpT_32 = tt.dot %v, %dpT, %dpT_31, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1009) + %dsT = tt.expand_dims %Di_29 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1010) + %dsT_33 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1011) + %dsT_34 = arith.subf %dpT_32, %dsT_33 : tensor<128x64xf32> loc(#loc1011) + %dsT_35 = arith.mulf %pT_22, %dsT_34 : tensor<128x64xf32> loc(#loc1012) + %dk_36 = arith.truncf %dsT_35 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1014) + %dk_37 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1015) + %dk_38 = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %dk_39 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1016) + %dk_40 = tt.dot %dk_36, %dk_37, %dk_39, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1016) + %dk_41 = arith.addf %dk, %dk_40 : tensor<128x128xf32> loc(#loc1017) + tt.return %dk_41, %dv_27 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc443) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc444) + %1 = ub.poison : tensor<128x128xf32> loc(#loc444) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc444) + } loc(#loc374) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":94:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":95:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":96:49) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":97:53) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":99:53) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":100:53) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":102:9) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":103:9) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":104:10) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":105:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":106:10) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":107:13) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":111:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":112:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":113:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":115:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":116:28) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":117:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":119:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":120:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":122:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:25) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:47) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:59) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":125:25) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":125:47) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":125:35) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":125:59) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:27) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:61) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":131:9) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":132:9) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":133:10) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":135:14) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":136:26) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":137:26) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:14) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:7) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":140:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":142:29) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":143:30) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:54) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:44) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":145:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":146:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":147:31) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":148:26) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":149:26) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":151:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":152:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":152:54) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:55) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:78) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:50) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:83) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:68) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:30) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:52) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:40) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:32) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:55) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:66) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":160:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":160:55) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":160:42) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":160:66) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:30) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:46) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:56) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":163:17) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":164:19) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":167:19) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":168:21) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":169:25) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":172:22) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":174:36) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":175:42) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":175:29) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":178:107) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":179:111) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:34) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:25) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:33) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:26) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:30) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:50) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":191:18) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":195:30) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:27) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:41) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:53) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:39) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:42) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:29) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":207:12) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":214:39) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:31) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:45) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:62) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:43) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":218:46) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":218:33) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":226:16) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:32) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:43) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:24) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:63) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:74) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:56) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":232:14) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":234:30) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":239:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":240:30) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":242:26) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":244:30) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":245:25) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":246:25) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":249:22) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":250:22) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":252:25) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":253:42) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":253:29) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":256:107) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":257:107) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":262:30) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:32) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:51) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:34) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:56) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:44) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:67) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:36) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:59) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:46) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:70) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":268:36) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":268:59) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":268:46) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":268:70) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:34) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:39) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:50) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:60) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":271:21) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":272:23) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":275:25) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":276:29) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":278:39) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":279:46) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":279:58) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:58) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:80) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:53) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:81) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:70) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":286:32) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:30) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:43) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:55) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:42) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:45) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:32) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":298:16) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":306:41) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:34) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:47) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:64) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:46) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":310:49) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":310:36) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":318:20) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":303:12) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:31) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:42) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:23) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:62) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:73) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:55) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":325:26) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":326:25) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":327:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":330:30) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":334:14) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":337:29) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:31) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:27) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:48) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:41) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:66) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:58) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:29) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:69) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:4) +#loc202 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc203 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc204 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc205 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc206 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc207 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc208 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc209 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc210 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:27) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:38) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:20) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:56) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:67) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:49) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":835:23) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":835:15) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":828:4) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":384:12) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":385:13) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":387:26) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":388:26) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:26) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:37) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:18) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:56) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:67) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:49) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:26) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:37) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:18) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:56) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:67) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:49) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:43) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:90) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:101) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:63) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":397:28) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":405:12) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":411:64) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:28) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:19) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":415:28) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":415:19) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":417:19) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":417:8) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":419:11) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":419:4) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":458:105) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":459:19) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":461:14) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":464:36) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":464:46) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":467:36) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":467:46) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":479:35) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":482:23) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":483:23) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:34) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:23) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":486:22) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":487:23) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":488:23) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":489:23) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":490:23) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":491:23) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":492:35) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":493:24) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":494:24) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":495:32) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":496:25) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":497:92) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":498:92) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":499:25) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":500:24) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":501:24) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":502:39) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":503:25) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":504:24) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":505:24) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":506:23) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":507:25) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":508:25) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":509:92) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":510:25) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":511:24) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":512:24) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":513:39) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":514:25) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":515:24) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":516:24) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":521:69) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":524:27) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:39) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:21) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":528:104) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":530:20) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:22) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:19) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:14) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":542:32) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":542:43) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":542:62) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":542:73) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":542:54) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":549:43) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":551:15) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:30) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:21) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:10) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":555:11) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":555:4) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":798:11) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":798:4) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":788:33) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:38) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:24) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:109) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:113) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:39) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:55) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:25) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:30) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:35) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:60) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:34) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:48) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:63) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:29) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:47) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:61) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:42) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":794:11) +#loc341 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":794:4) +#loc343 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":595:12) +#loc344 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":596:13) +#loc345 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":598:26) +#loc346 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":599:26) +#loc347 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:26) +#loc348 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:37) +#loc349 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:18) +#loc350 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:56) +#loc351 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:67) +#loc352 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:49) +#loc353 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:27) +#loc354 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:38) +#loc355 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:19) +#loc356 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:58) +#loc357 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:69) +#loc358 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:51) +#loc359 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:42) +#loc360 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:87) +#loc361 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:98) +#loc362 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:61) +#loc363 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":610:28) +#loc364 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":618:12) +#loc365 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":623:62) +#loc366 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:28) +#loc367 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:19) +#loc368 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:28) +#loc369 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:19) +#loc370 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":628:19) +#loc371 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":628:8) +#loc372 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":630:11) +#loc373 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":630:4) +#loc375 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":669:105) +#loc376 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:28) +#loc377 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:22) +#loc378 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:26) +#loc379 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:46) +#loc380 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":676:20) +#loc381 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":678:15) +#loc382 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":680:36) +#loc383 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":680:46) +#loc384 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":683:36) +#loc385 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":683:46) +#loc386 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":695:36) +#loc387 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":698:25) +#loc388 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":699:25) +#loc389 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:35) +#loc390 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:24) +#loc391 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":702:24) +#loc392 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":703:25) +#loc393 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":704:24) +#loc394 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":705:24) +#loc395 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":706:24) +#loc396 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":707:24) +#loc397 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":708:35) +#loc398 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":709:25) +#loc399 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":710:25) +#loc400 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":711:32) +#loc401 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":712:25) +#loc402 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":713:92) +#loc403 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":714:92) +#loc404 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":715:25) +#loc405 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":716:24) +#loc406 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":717:24) +#loc407 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":718:39) +#loc408 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":719:25) +#loc409 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":720:24) +#loc410 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":721:24) +#loc411 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":722:24) +#loc412 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":723:25) +#loc413 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":724:25) +#loc414 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":725:92) +#loc415 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":726:25) +#loc416 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":727:24) +#loc417 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":728:24) +#loc418 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":729:39) +#loc419 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":730:25) +#loc420 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":731:24) +#loc421 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":732:24) +#loc422 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":736:69) +#loc423 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":739:27) +#loc424 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:44) +#loc425 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:40) +#loc426 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:22) +#loc427 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":741:99) +#loc428 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:24) +#loc429 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:43) +#loc430 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:10) +#loc431 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:29) +#loc432 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:21) +#loc433 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:29) +#loc434 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:20) +#loc435 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:25) +#loc436 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:22) +#loc437 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:16) +#loc438 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":773:45) +#loc439 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:24) +#loc440 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:52) +#loc441 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:43) +#loc442 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:10) +#loc443 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":777:11) +#loc444 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":777:4) +#loc463 = loc("ZQ"(#loc7)) +#loc464 = loc("HQ"(#loc8)) +#loc465 = loc("HKV"(#loc9)) +#loc466 = loc("Q_LEN"(#loc10)) +#loc467 = loc("ZKV"(#loc11)) +#loc468 = loc("KV_LEN"(#loc12)) +#loc469 = loc("pid"(#loc13)) +#loc470 = loc("NUM_KV_BLOCKS"(#loc14)) +#loc471 = loc("NUM_Q_BLOCKS"(#loc15)) +#loc472 = loc("off_zq"(#loc16)) +#loc473 = loc("off_hkv"(#loc17)) +#loc474 = loc("off_zkv"(#loc18)) +#loc475 = loc("SPARSE_Z"(#loc19)) +#loc476 = loc("SPARSE_HQ"(#loc20)) +#loc477 = loc("sparse_idx_z"(#loc21)) +#loc478 = loc("k_adj"(#loc22)) +#loc479 = loc("k_adj"(#loc23)) +#loc480 = loc("k_adj"(#loc24)) +#loc481 = loc("k_adj"(#loc25)) +#loc482 = loc("v_adj"(#loc26)) +#loc483 = loc("v_adj"(#loc27)) +#loc484 = loc("v_adj"(#loc28)) +#loc485 = loc("v_adj"(#loc29)) +#loc486 = loc("dv_adj"(#loc30)) +#loc487 = loc("dv_adj"(#loc31)) +#loc488 = loc("dv_adj"(#loc32)) +#loc489 = loc("dv_adj"(#loc33)) +#loc490 = loc("K"(#loc34)) +#loc491 = loc("V"(#loc35)) +#loc492 = loc("DV"(#loc36)) +#loc493 = loc("RCP_LN2"(#loc37)) +#loc494 = loc("offs_k"(#loc38)) +#loc495 = loc("offs_v"(#loc39)) +#loc496 = loc("off_pid"(#loc42)) +#loc497 = loc("SPARSE_Q_MULTIPLE"(#loc43)) +#loc498 = loc("SPARSE_KV_MULTIPLE"(#loc44)) +#loc499 = loc("off_hq2"(#loc45)) +#loc500 = loc("off_hq2"(#loc46)) +#loc501 = loc("off_hq2"(#loc47)) +#loc502 = loc("start_m2_block"(#loc48)) +#loc503 = loc("off_pid_mask"(#loc49)) +#loc504 = loc("stride_kv_num_blks_h"(#loc50)) +#loc505 = loc("stride_kv_idx_h"(#loc51)) +#loc506 = loc("stride_kv_idx_m"(#loc52)) +#loc507 = loc("sparse_idx_hq2"(#loc53)) +#loc508 = loc("sparse_hz_offset"(#loc54)) +#loc509 = loc("sparse_hz_offset"(#loc55)) +#loc510 = loc("sparse_kv_num_blks_offset"(#loc56)) +#loc511 = loc("sparse_kv_num_blks_offset"(#loc57)) +#loc512 = loc("sparse_kv_idx_offset"(#loc58)) +#loc513 = loc("sparse_kv_idx_offset"(#loc59)) +#loc514 = loc("sparse_kv_idx_offset"(#loc60)) +#loc515 = loc("q_adj2"(#loc61)) +#loc516 = loc("q_adj2"(#loc62)) +#loc517 = loc("q_adj2"(#loc63)) +#loc518 = loc("q_adj2"(#loc64)) +#loc519 = loc("do_adj2"(#loc65)) +#loc520 = loc("do_adj2"(#loc66)) +#loc521 = loc("do_adj2"(#loc67)) +#loc522 = loc("do_adj2"(#loc68)) +#loc523 = loc("dq_adj2"(#loc69)) +#loc524 = loc("dq_adj2"(#loc70)) +#loc525 = loc("dq_adj2"(#loc71)) +#loc526 = loc("dq_adj2"(#loc72)) +#loc527 = loc("off_chz2"(#loc73)) +#loc528 = loc("off_chz2"(#loc74)) +#loc529 = loc("off_chz2"(#loc75)) +#loc530 = loc("off_chz2"(#loc76)) +#loc531 = loc("Q2"(#loc77)) +#loc532 = loc("DO2"(#loc78)) +#loc533 = loc("DQ2"(#loc79)) +#loc534 = loc("LSE2"(#loc80)) +#loc535 = loc("DELTA2"(#loc81)) +#loc536 = loc("dq"(#loc82)) +#loc537 = loc("start_m2"(#loc83)) +#loc538 = loc("offs_m2"(#loc84)) +#loc539 = loc("offs_m2"(#loc85)) +#loc540 = loc("q"(#loc86)) +#loc541 = loc("do"(#loc87)) +#loc542 = loc("Di"(#loc88)) +#loc543 = loc("Di"(#loc89)) +#loc544 = loc("lse"(#loc90)) +#loc545 = loc("lse"(#loc91)) +#loc546 = loc("lse"(#loc92)) +#loc547 = loc("lse"(#loc93)) +#loc548 = loc("lse"(#loc94)) +#loc549 = loc("kv_indices"(#loc95)) +#loc550 = loc("kv_start"(#loc96)) +#loc551 = loc("kv_start"(#loc97)) +#loc552 = loc("sparse_kv_num_blocks"(#loc98)) +#loc553 = loc("sparse_kv_num_blocks"(#loc99)) +#loc554 = loc("offs_n2"(#loc100)) +#loc555 = loc("offs_n2"(#loc101)) +#loc556 = loc("dq"(#loc102)) +#loc557 = loc("kv_indices"(#loc103)) +#loc558 = loc("kv_start"(#loc104)) +#loc559 = loc("kv_start"(#loc105)) +#loc560 = loc("sparse_kv_num_blocks"(#loc106)) +#loc561 = loc("sparse_kv_num_blocks"(#loc107)) +#loc562 = loc("offs_n2"(#loc108)) +#loc563 = loc("offs_n2"(#loc109)) +#loc564 = loc("dq"(#loc110)) +#loc565 = loc("dq_ptrs"(#loc111)) +#loc566 = loc("dq_ptrs"(#loc112)) +#loc567 = loc("dq_ptrs"(#loc113)) +#loc568 = loc("dq_ptrs"(#loc114)) +#loc569 = loc("dq_ptrs"(#loc115)) +#loc570 = loc("dq_ptrs"(#loc116)) +#loc571 = loc("dq"(#loc117)) +#loc572 = loc("SPARSE_Q_MULTIPLE"(#loc119)) +#loc573 = loc("SPARSE_KV_MULTIPLE"(#loc120)) +#loc574 = loc("pid_mask"(#loc121)) +#loc575 = loc("stride_q_num_blks_h"(#loc122)) +#loc576 = loc("stride_q_idx_h"(#loc123)) +#loc577 = loc("stride_q_idx_n"(#loc124)) +#loc578 = loc("dv"(#loc125)) +#loc579 = loc("dk"(#loc126)) +#loc580 = loc("start_n1"(#loc127)) +#loc581 = loc("offs_n1"(#loc128)) +#loc582 = loc("offs_n1"(#loc129)) +#loc583 = loc("k"(#loc130)) +#loc584 = loc("v"(#loc131)) +#loc585 = loc("dv"(#loc132)) +#loc586 = loc("off_hq1"(#loc133)) +#loc587 = loc("off_hq1"(#loc134)) +#loc588 = loc("q_adj1"(#loc135)) +#loc589 = loc("q_adj1"(#loc136)) +#loc590 = loc("q_adj1"(#loc137)) +#loc591 = loc("q_adj1"(#loc138)) +#loc592 = loc("do_adj1"(#loc139)) +#loc593 = loc("do_adj1"(#loc140)) +#loc594 = loc("do_adj1"(#loc141)) +#loc595 = loc("do_adj1"(#loc142)) +#loc596 = loc("dq_adj1"(#loc143)) +#loc597 = loc("dq_adj1"(#loc144)) +#loc598 = loc("dq_adj1"(#loc145)) +#loc599 = loc("dq_adj1"(#loc146)) +#loc600 = loc("off_chz1"(#loc147)) +#loc601 = loc("off_chz1"(#loc148)) +#loc602 = loc("off_chz1"(#loc149)) +#loc603 = loc("off_chz1"(#loc150)) +#loc604 = loc("Q1"(#loc151)) +#loc605 = loc("DO1"(#loc152)) +#loc606 = loc("LSE1"(#loc153)) +#loc607 = loc("DELTA1"(#loc154)) +#loc608 = loc("sparse_idx_hq1"(#loc155)) +#loc609 = loc("sparse_hz_offset"(#loc156)) +#loc610 = loc("sparse_hz_offset"(#loc157)) +#loc611 = loc("sparse_q_num_blks_offset"(#loc158)) +#loc612 = loc("sparse_q_num_blks_offset"(#loc159)) +#loc613 = loc("sparse_q_idx_offset"(#loc160)) +#loc614 = loc("sparse_q_idx_offset"(#loc161)) +#loc615 = loc("sparse_q_idx_offset"(#loc162)) +#loc616 = loc("q_indices"(#loc163)) +#loc617 = loc("q_start"(#loc164)) +#loc618 = loc("q_start"(#loc165)) +#loc619 = loc("sparse_q_num_blocks"(#loc166)) +#loc620 = loc("sparse_q_num_blocks"(#loc167)) +#loc621 = loc("offs_m1"(#loc168)) +#loc622 = loc("offs_m1"(#loc169)) +#loc623 = loc("q_indices"(#loc171)) +#loc624 = loc("q_start"(#loc172)) +#loc625 = loc("q_start"(#loc173)) +#loc626 = loc("sparse_q_num_blocks"(#loc174)) +#loc627 = loc("sparse_q_num_blocks"(#loc175)) +#loc628 = loc("offs_m1"(#loc176)) +#loc629 = loc("offs_m1"(#loc177)) +#loc630 = loc("dv_ptrs"(#loc180)) +#loc631 = loc("dv_ptrs"(#loc181)) +#loc632 = loc("dv_ptrs"(#loc182)) +#loc633 = loc("dv_ptrs"(#loc183)) +#loc634 = loc("dv_ptrs"(#loc184)) +#loc635 = loc("dv_ptrs"(#loc185)) +#loc636 = loc("index_n"(#loc186)) +#loc637 = loc("index_k"(#loc187)) +#loc638 = loc("index_v"(#loc188)) +#loc639 = loc("dk"(#loc190)) +#loc640 = loc("mask"(#loc191)) +#loc641 = loc("xindex"(#loc192)) +#loc642 = loc("xindex"(#loc193)) +#loc643 = loc("xindex"(#loc194)) +#loc644 = loc("xindex"(#loc195)) +#loc645 = loc("xindex"(#loc196)) +#loc646 = loc("xindex"(#loc197)) +#loc654 = loc("ptr"(#loc212)) +#loc655 = loc("ptr"(#loc213)) +#loc656 = loc("ptr"(#loc214)) +#loc657 = loc("ptr"(#loc215)) +#loc658 = loc("ptr"(#loc216)) +#loc659 = loc("ptr"(#loc217)) +#loc695 = loc("Q_LEN"(#loc222)) +#loc696 = loc("KV_LEN"(#loc223)) +#loc697 = loc("offs_k"(#loc224)) +#loc698 = loc("offs_v"(#loc225)) +#loc699 = loc("kT_ptrs"(#loc226)) +#loc700 = loc("kT_ptrs"(#loc227)) +#loc701 = loc("kT_ptrs"(#loc228)) +#loc702 = loc("kT_ptrs"(#loc229)) +#loc703 = loc("kT_ptrs"(#loc230)) +#loc704 = loc("kT_ptrs"(#loc231)) +#loc705 = loc("vT_ptrs"(#loc232)) +#loc706 = loc("vT_ptrs"(#loc233)) +#loc707 = loc("vT_ptrs"(#loc234)) +#loc708 = loc("vT_ptrs"(#loc235)) +#loc709 = loc("vT_ptrs"(#loc236)) +#loc710 = loc("vT_ptrs"(#loc237)) +#loc711 = loc("hi"(#loc238)) +#loc712 = loc("hi"(#loc239)) +#loc713 = loc("hi"(#loc240)) +#loc714 = loc("hi"(#loc241)) +#loc715 = loc("dq"(#loc242)) +#loc716 = loc("dq"(#loc243)) +#loc717 = loc("offset"(#loc244)) +#loc718 = loc("kT_ptrs"(#loc245)) +#loc719 = loc("kT_ptrs"(#loc246)) +#loc720 = loc("vT_ptrs"(#loc247)) +#loc721 = loc("vT_ptrs"(#loc248)) +#loc722 = loc("offs_n2"(#loc249)) +#loc762 = loc("kT"(#loc254)) +#loc763 = loc("qk"(#loc255)) +#loc764 = loc("qk"(#loc256)) +#loc765 = loc("n"(#loc257)) +#loc766 = loc("n"(#loc258)) +#loc767 = loc("m"(#loc259)) +#loc768 = loc("m"(#loc260)) +#loc769 = loc("tmp1"(#loc261)) +#loc770 = loc("tmp4"(#loc262)) +#loc771 = loc("tmp5"(#loc263)) +#loc772 = loc("tmp7"(#loc264)) +#loc773 = loc("tmp7"(#loc265)) +#loc774 = loc("tmp8"(#loc266)) +#loc775 = loc("tmp9"(#loc267)) +#loc776 = loc("tmp10"(#loc268)) +#loc777 = loc("tmp11"(#loc269)) +#loc778 = loc("tmp12"(#loc270)) +#loc779 = loc("tmp13"(#loc271)) +#loc780 = loc("tmp14"(#loc272)) +#loc781 = loc("tmp15"(#loc273)) +#loc782 = loc("tmp16"(#loc274)) +#loc783 = loc("tmp17"(#loc275)) +#loc784 = loc("tmp18"(#loc276)) +#loc785 = loc("tmp19"(#loc277)) +#loc786 = loc("tmp20"(#loc278)) +#loc787 = loc("tmp21"(#loc279)) +#loc788 = loc("tmp22"(#loc280)) +#loc789 = loc("tmp23"(#loc281)) +#loc790 = loc("tmp24"(#loc282)) +#loc791 = loc("tmp25"(#loc283)) +#loc792 = loc("tmp26"(#loc284)) +#loc793 = loc("tmp27"(#loc285)) +#loc794 = loc("tmp28"(#loc286)) +#loc795 = loc("tmp29"(#loc287)) +#loc796 = loc("tmp30"(#loc288)) +#loc797 = loc("tmp31"(#loc289)) +#loc798 = loc("tmp32"(#loc290)) +#loc799 = loc("tmp33"(#loc291)) +#loc800 = loc("tmp34"(#loc292)) +#loc801 = loc("tmp35"(#loc293)) +#loc802 = loc("tmp36"(#loc294)) +#loc803 = loc("tmp37"(#loc295)) +#loc804 = loc("tmp38"(#loc296)) +#loc805 = loc("post_mod_scores"(#loc297)) +#loc806 = loc("post_mod_scores"(#loc298)) +#loc807 = loc("p"(#loc299)) +#loc808 = loc("p"(#loc300)) +#loc809 = loc("vT"(#loc301)) +#loc810 = loc("dp"(#loc302)) +#loc811 = loc("ds"(#loc303)) +#loc812 = loc("ds"(#loc304)) +#loc813 = loc("ds"(#loc305)) +#loc814 = loc("scatter_mask"(#loc306)) +#loc815 = loc("scatter_mask"(#loc307)) +#loc816 = loc("scatter_mask"(#loc308)) +#loc817 = loc("scatter_mask"(#loc309)) +#loc818 = loc("scatter_mask"(#loc310)) +#loc819 = loc("ds"(#loc311)) +#loc820 = loc("ds"(#loc312)) +#loc821 = loc("dq"(#loc313)) +#loc822 = loc("dq"(#loc314)) +#loc823 = loc("dq"(#loc315)) +#loc829 = loc("cur_block_idx"(#loc322)) +#loc830 = loc("cur_block"(#loc323)) +#loc831 = loc("cur_block"(#loc324)) +#loc832 = loc("next_block"(#loc325)) +#loc833 = loc("next_block"(#loc326)) +#loc834 = loc("next_block"(#loc327)) +#loc835 = loc("next_block"(#loc328)) +#loc836 = loc("next_block"(#loc329)) +#loc837 = loc("needs_jump"(#loc330)) +#loc838 = loc("needs_jump"(#loc331)) +#loc839 = loc("needs_jump"(#loc332)) +#loc840 = loc("jump_to_block"(#loc333)) +#loc841 = loc("jump_to_block"(#loc334)) +#loc842 = loc("jump_to_block"(#loc335)) +#loc843 = loc("offset"(#loc336)) +#loc844 = loc("offset"(#loc337)) +#loc845 = loc("offset"(#loc338)) +#loc846 = loc("offset"(#loc339)) +#loc883 = loc("Q_LEN"(#loc343)) +#loc884 = loc("KV_LEN"(#loc344)) +#loc885 = loc("offs_k"(#loc345)) +#loc886 = loc("offs_v"(#loc346)) +#loc887 = loc("qT_ptrs"(#loc347)) +#loc888 = loc("qT_ptrs"(#loc348)) +#loc889 = loc("qT_ptrs"(#loc349)) +#loc890 = loc("qT_ptrs"(#loc350)) +#loc891 = loc("qT_ptrs"(#loc351)) +#loc892 = loc("qT_ptrs"(#loc352)) +#loc893 = loc("do_ptrs"(#loc353)) +#loc894 = loc("do_ptrs"(#loc354)) +#loc895 = loc("do_ptrs"(#loc355)) +#loc896 = loc("do_ptrs"(#loc356)) +#loc897 = loc("do_ptrs"(#loc357)) +#loc898 = loc("do_ptrs"(#loc358)) +#loc899 = loc("hi"(#loc359)) +#loc900 = loc("hi"(#loc360)) +#loc901 = loc("hi"(#loc361)) +#loc902 = loc("hi"(#loc362)) +#loc903 = loc("dk"(#loc363)) +#loc904 = loc("offset"(#loc365)) +#loc905 = loc("qT_ptrs"(#loc366)) +#loc906 = loc("qT_ptrs"(#loc367)) +#loc907 = loc("do_ptrs"(#loc368)) +#loc908 = loc("do_ptrs"(#loc369)) +#loc909 = loc("offs_m1"(#loc370)) +#loc950 = loc("qT"(#loc375)) +#loc951 = loc("lse"(#loc376)) +#loc952 = loc("lse"(#loc377)) +#loc953 = loc("lse"(#loc378)) +#loc954 = loc("lse"(#loc379)) +#loc955 = loc("qkT"(#loc380)) +#loc956 = loc("qkT"(#loc381)) +#loc957 = loc("m"(#loc382)) +#loc958 = loc("m"(#loc383)) +#loc959 = loc("n"(#loc384)) +#loc960 = loc("n"(#loc385)) +#loc961 = loc("tmp41"(#loc386)) +#loc962 = loc("tmp44"(#loc387)) +#loc963 = loc("tmp45"(#loc388)) +#loc964 = loc("tmp47"(#loc389)) +#loc965 = loc("tmp47"(#loc390)) +#loc966 = loc("tmp48"(#loc391)) +#loc967 = loc("tmp49"(#loc392)) +#loc968 = loc("tmp50"(#loc393)) +#loc969 = loc("tmp51"(#loc394)) +#loc970 = loc("tmp52"(#loc395)) +#loc971 = loc("tmp53"(#loc396)) +#loc972 = loc("tmp54"(#loc397)) +#loc973 = loc("tmp55"(#loc398)) +#loc974 = loc("tmp56"(#loc399)) +#loc975 = loc("tmp57"(#loc400)) +#loc976 = loc("tmp58"(#loc401)) +#loc977 = loc("tmp59"(#loc402)) +#loc978 = loc("tmp60"(#loc403)) +#loc979 = loc("tmp61"(#loc404)) +#loc980 = loc("tmp62"(#loc405)) +#loc981 = loc("tmp63"(#loc406)) +#loc982 = loc("tmp64"(#loc407)) +#loc983 = loc("tmp65"(#loc408)) +#loc984 = loc("tmp66"(#loc409)) +#loc985 = loc("tmp67"(#loc410)) +#loc986 = loc("tmp68"(#loc411)) +#loc987 = loc("tmp69"(#loc412)) +#loc988 = loc("tmp70"(#loc413)) +#loc989 = loc("tmp71"(#loc414)) +#loc990 = loc("tmp72"(#loc415)) +#loc991 = loc("tmp73"(#loc416)) +#loc992 = loc("tmp74"(#loc417)) +#loc993 = loc("tmp75"(#loc418)) +#loc994 = loc("tmp76"(#loc419)) +#loc995 = loc("tmp77"(#loc420)) +#loc996 = loc("tmp78"(#loc421)) +#loc997 = loc("post_mod_scores"(#loc422)) +#loc998 = loc("post_mod_scores"(#loc423)) +#loc999 = loc("pT"(#loc424)) +#loc1000 = loc("pT"(#loc425)) +#loc1001 = loc("pT"(#loc426)) +#loc1002 = loc("do"(#loc427)) +#loc1003 = loc("dv"(#loc428)) +#loc1004 = loc("dv"(#loc429)) +#loc1005 = loc("dv"(#loc430)) +#loc1006 = loc("Di"(#loc431)) +#loc1007 = loc("Di"(#loc432)) +#loc1008 = loc("dpT"(#loc433)) +#loc1009 = loc("dpT"(#loc434)) +#loc1010 = loc("dsT"(#loc435)) +#loc1011 = loc("dsT"(#loc436)) +#loc1012 = loc("dsT"(#loc437)) +#loc1013 = loc("dsT"(#loc438)) +#loc1014 = loc("dk"(#loc439)) +#loc1015 = loc("dk"(#loc440)) +#loc1016 = loc("dk"(#loc441)) +#loc1017 = loc("dk"(#loc442)) +#loc1018 = loc("SPARSE_Q_MULTIPLE"(#loc497)) +#loc1019 = loc("SPARSE_KV_MULTIPLE"(#loc498)) +#loc1020 = loc("SPARSE_Q_MULTIPLE"(#loc572)) +#loc1021 = loc("SPARSE_KV_MULTIPLE"(#loc573)) +#loc1022 = loc("dk"(#loc585)) +#loc1023 = loc("offs_n2"(#loc715)) +#loc1024 = loc("dv"(#loc903)) +#loc1025 = loc("kT_ptrs"(#loc1023)) +#loc1026 = loc("offs_m1"(#loc1024)) +#loc1027 = loc("vT_ptrs"(#loc1025)) +#loc1028 = loc("qT_ptrs"(#loc1026)) +#loc1029 = loc("do_ptrs"(#loc1028)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..be3ee631f5d69fe49037d3d3826ea57c471ad484 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttgir @@ -0,0 +1,1750 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":18:0) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> +#smem = #ttg.shared_memory +#loc289 = loc("arg_Q"(#loc)) +#loc290 = loc("arg_K"(#loc)) +#loc291 = loc("arg_V"(#loc)) +#loc292 = loc("arg_LSE"(#loc)) +#loc293 = loc("arg_DELTA"(#loc)) +#loc294 = loc("arg_DO"(#loc)) +#loc295 = loc("arg_DQ"(#loc)) +#loc296 = loc("arg_DV"(#loc)) +#loc297 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc298 = loc("arg_KV_IDX"(#loc)) +#loc299 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc300 = loc("arg_Q_IDX"(#loc)) +#loc301 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc302 = loc("arg_FULL_KV_IDX"(#loc)) +#loc303 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc304 = loc("arg_FULL_Q_IDX"(#loc)) +#loc305 = loc("in_ptr16"(#loc)) +#loc306 = loc("out_ptr0"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.0883883461> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_6 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_7 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_8 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %cst_9 = arith.constant dense<8192> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<262144> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_11 = arith.constant dense<8192> : tensor<64x128xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_13 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_16 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_18 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_20 = arith.constant dense<0.000000e+00> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_21 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_22 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_23 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_24 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %pid = tt.get_program_id x : i32 loc(#loc307) + %off_zq = tt.get_program_id y : i32 loc(#loc308) + %off_hkv = tt.get_program_id z : i32 loc(#loc309) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc310) + %k_adj = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc311) + %k_adj_25 = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc312) + %k_adj_26 = arith.addi %k_adj, %k_adj_25 : i32 loc(#loc313) + %k_adj_27 = arith.extsi %k_adj_26 : i32 to i64 loc(#loc314) + %dv_adj = arith.muli %off_zq, %c2097152_i32 : i32 loc(#loc315) + %dv_adj_28 = arith.addi %k_adj, %dv_adj : i32 loc(#loc316) + %dv_adj_29 = arith.extsi %dv_adj_28 : i32 to i64 loc(#loc317) + %K = tt.addptr %arg_K, %k_adj_27 : !tt.ptr, i64 loc(#loc318) + %V = tt.addptr %arg_V, %k_adj_27 : !tt.ptr, i64 loc(#loc319) + %DV = tt.addptr %arg_DV, %dv_adj_29 : !tt.ptr, i64 loc(#loc320) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc321) + %offs_k_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc321) + %0 = arith.cmpi sge, %pid, %c16_i32 : i32 loc(#loc17) + scf.if %0 { + %off_pid = arith.subi %pid, %c16_i32 : i32 loc(#loc322) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc323) + %off_hq2_31 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc324) + %off_hq2_32 = arith.addi %off_hq2, %off_hq2_31 : i32 loc(#loc325) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc326) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc327) + %sparse_kv_num_blks_offset_33 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc328) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc329) + %sparse_kv_idx_offset_34 = arith.muli %start_m2_block, %c16_i32 : i32 loc(#loc330) + %sparse_kv_idx_offset_35 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_34 : i32 loc(#loc331) + %q_adj2 = arith.muli %off_hq2_32, %c128_i32 : i32 loc(#loc332) + %q_adj2_36 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc333) + %q_adj2_37 = arith.addi %q_adj2, %q_adj2_36 : i32 loc(#loc334) + %q_adj2_38 = arith.extsi %q_adj2_37 : i32 to i64 loc(#loc335) + %do_adj2 = arith.muli %off_hq2_32, %c262144_i32 : i32 loc(#loc336) + %do_adj2_39 = arith.addi %do_adj2, %q_adj2_36 : i32 loc(#loc337) + %do_adj2_40 = arith.extsi %do_adj2_39 : i32 to i64 loc(#loc338) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc339) + %off_chz2_41 = arith.addi %off_chz2, %off_hq2_32 : i32 loc(#loc340) + %off_chz2_42 = arith.muli %off_chz2_41, %c2048_i32 : i32 loc(#loc341) + %off_chz2_43 = arith.extsi %off_chz2_42 : i32 to i64 loc(#loc342) + %Q2 = tt.addptr %arg_Q, %q_adj2_38 : !tt.ptr, i64 loc(#loc343) + %DO2 = tt.addptr %arg_DO, %do_adj2_40 : !tt.ptr, i64 loc(#loc344) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_38 : !tt.ptr, i64 loc(#loc345) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_43 : !tt.ptr, i64 loc(#loc346) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_43 : !tt.ptr, i64 loc(#loc347) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc348) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc349) + %offs_m2_44 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349) + %offs_m2_45 = arith.addi %offs_m2, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc349) + %offs_m2_46 = arith.addi %offs_m2_44, %offs_k_30 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349) + %ptr = tt.expand_dims %offs_m2_45 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc584) + %ptr_47 = tt.expand_dims %offs_m2_46 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc584) + %ptr_48 = arith.muli %ptr, %cst_2 : tensor<128x1xi32, #blocked> loc(#loc585) + %ptr_49 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc586) + %ptr_50 = tt.addptr %ptr_49, %ptr_48 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc586) + %ptr_51 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc587) + %ptr_52 = tt.expand_dims %ptr_51 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc587) + %ptr_53 = tt.broadcast %ptr_50 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc588) + %ptr_54 = tt.broadcast %ptr_52 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc588) + %ptr_55 = tt.addptr %ptr_53, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc588) + %q = tt.load %ptr_55 : tensor<128x128x!tt.ptr, #blocked> loc(#loc589) + %q_56 = ttg.local_alloc %q : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc589) + %ptr_57 = arith.muli %ptr, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc590) + %ptr_58 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc591) + %ptr_59 = tt.addptr %ptr_58, %ptr_57 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc591) + %ptr_60 = tt.broadcast %ptr_59 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc592) + %ptr_61 = tt.addptr %ptr_60, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc592) + %do = tt.load %ptr_61 : tensor<128x128x!tt.ptr, #blocked> loc(#loc593) + %do_62 = ttg.local_alloc %do : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc593) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc357) + %Di_63 = tt.addptr %Di, %offs_m2_46 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc357) + %Di_64 = tt.load %Di_63 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc358) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc359) + %lse_65 = tt.addptr %lse, %offs_m2_46 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc359) + %lse_66 = tt.load %lse_65 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc360) + %lse_67 = arith.cmpf oeq, %lse_66, %cst_24 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc361) + %lse_68 = arith.select %lse_67, %cst_23, %lse_66 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc362) + %lse_69 = tt.expand_dims %lse_68 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc363) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_35 : !tt.ptr, i32 loc(#loc364) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc365) + %kv_start_70 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc366) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_33 : !tt.ptr, i32 loc(#loc367) + %sparse_kv_num_blocks_71 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc368) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc369) + %offs_n2_72 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc369) + %offs_n2_73 = tt.splat %kv_start_70 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc370) + %offs_n2_74 = tt.splat %kv_start_70 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc370) + %offs_n2_75 = arith.addi %offs_n2_73, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc370) + %offs_n2_76 = arith.addi %offs_n2_74, %offs_n2_72 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc370) + %kT_ptrs = tt.expand_dims %offs_n2_75 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc594) + %kT_ptrs_77 = arith.muli %kT_ptrs, %cst_16 : tensor<1x64xi32, #blocked1> loc(#loc595) + %kT_ptrs_78 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc596) + %kT_ptrs_79 = tt.addptr %kT_ptrs_78, %kT_ptrs_77 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc596) + %kT_ptrs_80 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc597) + %kT_ptrs_81 = tt.expand_dims %kT_ptrs_80 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc597) + %kT_ptrs_82 = tt.broadcast %kT_ptrs_79 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc598) + %kT_ptrs_83 = tt.broadcast %kT_ptrs_81 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc598) + %kT_ptrs_84 = tt.addptr %kT_ptrs_82, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc598) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc599) + %vT_ptrs_85 = tt.addptr %vT_ptrs, %kT_ptrs_77 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc599) + %vT_ptrs_86 = tt.broadcast %vT_ptrs_85 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc600) + %vT_ptrs_87 = tt.addptr %vT_ptrs_86, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc600) + %hi = arith.muli %sparse_kv_num_blocks_71, %c2_i32 : i32 loc(#loc601) + %hi_88 = arith.minsi %hi, %c32_i32 : i32 loc(#loc602) + %tmp4 = tt.broadcast %ptr_47 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc747) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc748) + %vT_ptrs_89 = arith.cmpi sgt, %hi_88, %c0_i32 : i32 loc(#loc886) + %tmp7_90 = tt.load %tmp7, %vT_ptrs_89 : !tt.ptr loc(#loc750) + %tmp8 = tt.splat %tmp7_90 : i64 -> tensor<1x64xi64, #mma> loc(#loc751) + %tmp9 = arith.extsi %ptr_47 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc752) + %tmp10 = tt.splat %tmp7_90 : i64 -> tensor<128x1xi64, #mma> loc(#loc753) + %tmp10_91 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc753) + %tmp11 = tt.broadcast %tmp10_91 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc754) + %p = tt.broadcast %lse_69 : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc755) + %ds = tt.expand_dims %Di_64 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc756) + %ds_92 = tt.broadcast %ds : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc757) + %kT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc881) + %vT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc882) + %kT_93 = ttg.memdesc_index %kT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_94 = tt.splat %vT_ptrs_89 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_95 = ttg.async_copy_global_to_local %kT_ptrs_84, %kT_93 mask %vT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_96 = ttg.async_commit_group tokens %kT_95 loc(#loc881) + %vT_97 = ttg.memdesc_index %vT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_98 = ttg.async_copy_global_to_local %vT_ptrs_87, %vT_97 mask %vT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_99 = ttg.async_commit_group tokens %vT_98 loc(#loc882) + %vT_ptrs_100 = arith.cmpi sgt, %hi_88, %c1_i32 : i32 loc(#loc886) + %kT_ptrs_101 = tt.addptr %kT_ptrs_84, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc605) + %vT_ptrs_102 = tt.addptr %vT_ptrs_87, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc606) + %kT_103 = ttg.memdesc_index %kT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_104 = tt.splat %vT_ptrs_100 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_105 = ttg.async_copy_global_to_local %kT_ptrs_101, %kT_103 mask %vT_ptrs_104 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_106 = ttg.async_commit_group tokens %kT_105 loc(#loc881) + %vT_107 = ttg.memdesc_index %vT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_108 = ttg.async_copy_global_to_local %vT_ptrs_102, %vT_107 mask %vT_ptrs_104 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_109 = ttg.async_commit_group tokens %vT_108 loc(#loc882) + ttng.fence_async_shared {bCluster = false} loc(#loc760) + %vT_ptrs_110:11 = scf.for %vT_ptrs_156 = %c0_i32 to %hi_88 step %c1_i32 iter_args(%arg19 = %cst_4, %kT_ptrs_157 = %kT_ptrs_101, %vT_ptrs_158 = %vT_ptrs_102, %offs_n2_159 = %offs_n2_76, %arg23 = %c1_i32, %arg24 = %c-1_i32, %kT_160 = %kT_96, %kT_161 = %kT_106, %vT_162 = %vT_99, %vT_163 = %vT_109, %arg29 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_164 = arith.subi %hi_88, %c2_i32 : i32 loc(#loc886) + %vT_ptrs_165 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_164 : i32 loc(#loc886) + %vT_ptrs_166 = arith.subi %hi_88, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_167 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_166 : i32 loc(#loc886) + %vT_ptrs_168 = arith.addi %arg24, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_169 = arith.cmpi sge, %vT_ptrs_168, %c3_i32 : i32 loc(#loc886) + %vT_ptrs_170 = arith.select %vT_ptrs_169, %c0_i32, %vT_ptrs_168 : i32 loc(#loc886) + %kT_171 = ttg.async_wait %kT_160, %vT_162 {num = 2 : i32} loc(#loc881) + %kT_172 = ttg.memdesc_index %kT[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %dq_173 = ttg.memdesc_trans %kT_172 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc761) + %qk = ttng.warp_group_dot %q_56, %kT_172, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc760) + %qk_174:3 = ttng.warp_group_dot_wait %qk, %q_56, %kT_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc760) + %qk_175 = arith.mulf %qk_174#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc762) + %n = tt.expand_dims %offs_n2_159 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc763) + %tmp4_176 = tt.broadcast %n : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc747) + %tmp4_177 = arith.cmpi sge, %tmp4, %tmp4_176 : tensor<128x64xi32, #mma> loc(#loc747) + %tmp5 = arith.extsi %n : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc764) + %tmp8_178 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc751) + %tmp11_179 = tt.broadcast %tmp8_178 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc754) + %tmp11_180 = arith.andi %tmp11_179, %tmp11 : tensor<128x64xi1, #mma> loc(#loc754) + %tmp12 = arith.andi %tmp4_177, %tmp11_180 : tensor<128x64xi1, #mma> loc(#loc765) + %tmp15 = arith.cmpi sge, %n, %cst_17 : tensor<1x64xi32, #mma> loc(#loc766) + %tmp16 = arith.remsi %n, %cst_17 : tensor<1x64xi32, #mma> loc(#loc767) + %tmp18 = arith.cmpi ne, %tmp16, %cst_22 : tensor<1x64xi32, #mma> loc(#loc768) + %tmp19 = arith.cmpi slt, %tmp16, %cst_22 : tensor<1x64xi32, #mma> loc(#loc769) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc770) + %tmp23 = arith.addi %tmp16, %cst_17 : tensor<1x64xi32, #mma> loc(#loc771) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc772) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc773) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc774) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc775) + %tmp28 = arith.subi %tmp4_176, %tmp4 : tensor<128x64xi32, #mma> loc(#loc776) + %tmp29 = arith.remsi %tmp28, %cst_18 : tensor<128x64xi32, #mma> loc(#loc777) + %tmp30 = arith.cmpi ne, %tmp29, %cst_21 : tensor<128x64xi32, #mma> loc(#loc778) + %tmp31 = arith.cmpi slt, %tmp29, %cst_21 : tensor<128x64xi32, #mma> loc(#loc779) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc780) + %tmp34 = arith.addi %tmp29, %cst_18 : tensor<128x64xi32, #mma> loc(#loc781) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc782) + %tmp36 = arith.cmpi eq, %tmp35, %cst_21 : tensor<128x64xi32, #mma> loc(#loc783) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc784) + %tmp37_181 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc784) + %tmp38 = arith.ori %tmp12, %tmp37_181 : tensor<128x64xi1, #mma> loc(#loc785) + %post_mod_scores = arith.select %tmp38, %qk_175, %cst_7 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc786) + %post_mod_scores_182 = arith.mulf %post_mod_scores, %cst_8 : tensor<128x64xf32, #mma> loc(#loc787) + %p_183 = arith.subf %post_mod_scores_182, %p : tensor<128x64xf32, #mma> loc(#loc755) + %p_184 = math.exp2 %p_183 : tensor<128x64xf32, #mma> loc(#loc788) + %vT_185 = ttg.memdesc_index %vT[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %dp = ttng.warp_group_dot %do_62, %vT_185, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc789) + %dp_186:3 = ttng.warp_group_dot_wait %dp, %do_62, %vT_185 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc789) + %ds_187 = arith.subf %dp_186#0, %ds_92 : tensor<128x64xf32, #mma> loc(#loc757) + %ds_188 = arith.mulf %p_184, %ds_187 : tensor<128x64xf32, #mma> loc(#loc790) + %ds_189 = arith.select %tmp38, %ds_188, %cst_5 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc791) + %ds_190 = arith.truncf %ds_189 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc792) + %ds_191 = ttg.convert_layout %ds_190 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc792) + %dq_192 = ttng.warp_group_dot %ds_191, %dq_173, %arg19 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc793) + %offs_n2_193 = tt.splat %arg29 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc607) + %offs_n2_194 = arith.addi %offs_n2_159, %offs_n2_193 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc607) + %vT_ptrs_195 = arith.addi %vT_ptrs_156, %c1_i32 : i32 loc(#loc886) + %cur_block_idx = arith.divsi %vT_ptrs_195, %c2_i32 : i32 loc(#loc794) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc795) + %cur_block_196 = tt.load %cur_block, %vT_ptrs_167 evictionPolicy = evict_last : !tt.ptr loc(#loc796) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc797) + %next_block_197 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_71 : i32 loc(#loc798) + %next_block_198 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc799) + %vT_ptrs_199 = arith.andi %vT_ptrs_167, %next_block_197 : i1 loc(#loc886) + %next_block_200 = tt.load %next_block_198, %vT_ptrs_199 evictionPolicy = evict_last : !tt.ptr loc(#loc800) + %needs_jump = arith.addi %vT_ptrs_156, %c2_i32 : i32 loc(#loc801) + %needs_jump_201 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc802) + %needs_jump_202 = arith.cmpi eq, %needs_jump_201, %c0_i32 : i32 loc(#loc803) + %jump_to_block = arith.subi %next_block_200, %cur_block_196 : i32 loc(#loc804) + %jump_to_block_203 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc805) + %jump_to_block_204 = arith.subi %jump_to_block_203, %c64_i32 : i32 loc(#loc806) + %offset = arith.extui %needs_jump_202 : i1 to i32 loc(#loc807) + %offset_205 = arith.muli %jump_to_block_204, %offset : i32 loc(#loc807) + %offset_206 = arith.subi %c1_i32, %offset : i32 loc(#loc808) + %offset_207 = arith.muli %offset_206, %c64_i32 : i32 loc(#loc809) + %offset_208 = arith.addi %offset_205, %offset_207 : i32 loc(#loc810) + %kT_ptrs_209 = arith.muli %offset_208, %c128_i32 : i32 loc(#loc609) + %kT_ptrs_210 = tt.splat %kT_ptrs_209 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc605) + %kT_ptrs_211 = tt.addptr %kT_ptrs_157, %kT_ptrs_210 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc605) + %vT_ptrs_212 = tt.addptr %vT_ptrs_158, %kT_ptrs_210 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc606) + %vT_ptrs_213 = arith.addi %arg23, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_214 = arith.cmpi sge, %vT_ptrs_213, %c3_i32 : i32 loc(#loc886) + %vT_ptrs_215 = arith.select %vT_ptrs_214, %c0_i32, %vT_ptrs_213 : i32 loc(#loc886) + %kT_216 = ttg.memdesc_index %kT[%vT_ptrs_215] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_217 = tt.splat %vT_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_218 = ttg.async_copy_global_to_local %kT_ptrs_211, %kT_216 mask %vT_ptrs_217 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_219 = ttg.async_commit_group tokens %kT_218 loc(#loc881) + %vT_220 = ttg.memdesc_index %vT[%vT_ptrs_215] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_221 = ttg.async_copy_global_to_local %vT_ptrs_212, %vT_220 mask %vT_ptrs_217 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_222 = ttg.async_commit_group tokens %vT_221 loc(#loc882) + scf.yield %dq_192, %kT_ptrs_211, %vT_ptrs_212, %offs_n2_194, %vT_ptrs_215, %vT_ptrs_170, %kT_161, %kT_219, %vT_163, %vT_222, %offset_208 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc886) + } loc(#loc886) + %vT_ptrs_111 = ttng.warp_group_dot_wait %vT_ptrs_110#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc886) + %vT_ptrs_112 = ttg.async_wait {num = 0 : i32} loc(#loc886) + ttg.local_dealloc %vT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc886) + ttg.local_dealloc %kT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc886) + %kv_indices_113 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_35 : !tt.ptr, i32 loc(#loc451) + %kv_start_114 = tt.load %kv_indices_113 : !tt.ptr loc(#loc452) + %kv_start_115 = arith.muli %kv_start_114, %c128_i32 : i32 loc(#loc453) + %sparse_kv_num_blocks_116 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_33 : !tt.ptr, i32 loc(#loc454) + %sparse_kv_num_blocks_117 = tt.load %sparse_kv_num_blocks_116 : !tt.ptr loc(#loc455) + %offs_n2_118 = tt.splat %kv_start_115 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc456) + %offs_n2_119 = arith.addi %offs_n2_118, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc456) + %kT_ptrs_120 = tt.expand_dims %offs_n2_119 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc610) + %kT_ptrs_121 = arith.muli %kT_ptrs_120, %cst_16 : tensor<1x64xi32, #blocked1> loc(#loc611) + %kT_ptrs_122 = tt.addptr %kT_ptrs_78, %kT_ptrs_121 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc612) + %kT_ptrs_123 = tt.broadcast %kT_ptrs_122 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc613) + %kT_ptrs_124 = tt.addptr %kT_ptrs_123, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc613) + %vT_ptrs_125 = tt.addptr %vT_ptrs, %kT_ptrs_121 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc614) + %vT_ptrs_126 = tt.broadcast %vT_ptrs_125 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc615) + %vT_ptrs_127 = tt.addptr %vT_ptrs_126, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc615) + %hi_128 = arith.muli %sparse_kv_num_blocks_117, %c2_i32 : i32 loc(#loc616) + %hi_129 = arith.minsi %hi_128, %c32_i32 : i32 loc(#loc617) + %kT_130 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc883) + %vT_131 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc884) + %vT_ptrs_132 = arith.cmpi sgt, %hi_129, %c0_i32 : i32 loc(#loc887) + %kT_133 = ttg.memdesc_index %kT_130[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_134 = tt.splat %vT_ptrs_132 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_135 = ttg.async_copy_global_to_local %kT_ptrs_124, %kT_133 mask %vT_ptrs_134 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_136 = ttg.async_commit_group tokens %kT_135 loc(#loc883) + %vT_137 = ttg.memdesc_index %vT_131[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_138 = ttg.async_copy_global_to_local %vT_ptrs_127, %vT_137 mask %vT_ptrs_134 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_139 = ttg.async_commit_group tokens %vT_138 loc(#loc884) + %vT_ptrs_140 = arith.cmpi sgt, %hi_129, %c1_i32 : i32 loc(#loc887) + %kT_ptrs_141 = tt.addptr %kT_ptrs_124, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc619) + %vT_ptrs_142 = tt.addptr %vT_ptrs_127, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc620) + %kT_143 = ttg.memdesc_index %kT_130[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_144 = tt.splat %vT_ptrs_140 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_145 = ttg.async_copy_global_to_local %kT_ptrs_141, %kT_143 mask %vT_ptrs_144 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_146 = ttg.async_commit_group tokens %kT_145 loc(#loc883) + %vT_147 = ttg.memdesc_index %vT_131[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_148 = ttg.async_copy_global_to_local %vT_ptrs_142, %vT_147 mask %vT_ptrs_144 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_149 = ttg.async_commit_group tokens %vT_148 loc(#loc884) + ttng.fence_async_shared {bCluster = false} loc(#loc813) + %vT_ptrs_150:9 = scf.for %vT_ptrs_156 = %c0_i32 to %hi_129 step %c1_i32 iter_args(%vT_ptrs_157 = %vT_ptrs_111, %kT_ptrs_158 = %kT_ptrs_141, %vT_ptrs_159 = %vT_ptrs_142, %arg22 = %c1_i32, %arg23 = %c-1_i32, %kT_160 = %kT_136, %kT_161 = %kT_146, %vT_162 = %vT_139, %vT_163 = %vT_149) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %vT_ptrs_164 = arith.subi %hi_129, %c2_i32 : i32 loc(#loc887) + %vT_ptrs_165 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_164 : i32 loc(#loc887) + %vT_ptrs_166 = arith.subi %hi_129, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_167 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_166 : i32 loc(#loc887) + %vT_ptrs_168 = arith.addi %arg23, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_169 = arith.cmpi sge, %vT_ptrs_168, %c3_i32 : i32 loc(#loc887) + %vT_ptrs_170 = arith.select %vT_ptrs_169, %c0_i32, %vT_ptrs_168 : i32 loc(#loc887) + %kT_171 = ttg.async_wait %kT_160, %vT_162 {num = 2 : i32} loc(#loc883) + %kT_172 = ttg.memdesc_index %kT_130[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %dq_173 = ttg.memdesc_trans %kT_172 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc814) + %qk = ttng.warp_group_dot %q_56, %kT_172, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc813) + %qk_174:3 = ttng.warp_group_dot_wait %qk, %q_56, %kT_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc813) + %qk_175 = arith.mulf %qk_174#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc815) + %post_mod_scores = arith.mulf %qk_175, %cst_8 : tensor<128x64xf32, #mma> loc(#loc816) + %p_176 = arith.subf %post_mod_scores, %p : tensor<128x64xf32, #mma> loc(#loc817) + %p_177 = math.exp2 %p_176 : tensor<128x64xf32, #mma> loc(#loc818) + %vT_178 = ttg.memdesc_index %vT_131[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %dp = ttng.warp_group_dot %do_62, %vT_178, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc819) + %dp_179:3 = ttng.warp_group_dot_wait %dp, %do_62, %vT_178 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc819) + %ds_180 = arith.subf %dp_179#0, %ds_92 : tensor<128x64xf32, #mma> loc(#loc820) + %ds_181 = arith.mulf %p_177, %ds_180 : tensor<128x64xf32, #mma> loc(#loc821) + %ds_182 = arith.truncf %ds_181 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc822) + %ds_183 = ttg.convert_layout %ds_182 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc822) + %dq_184 = ttng.warp_group_dot %ds_183, %dq_173, %vT_ptrs_157 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc823) + %vT_ptrs_185 = arith.addi %vT_ptrs_156, %c1_i32 : i32 loc(#loc887) + %cur_block_idx = arith.divsi %vT_ptrs_185, %c2_i32 : i32 loc(#loc824) + %cur_block = tt.addptr %kv_indices_113, %cur_block_idx : !tt.ptr, i32 loc(#loc825) + %cur_block_186 = tt.load %cur_block, %vT_ptrs_167 evictionPolicy = evict_last : !tt.ptr loc(#loc826) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc827) + %next_block_187 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_117 : i32 loc(#loc828) + %next_block_188 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc829) + %vT_ptrs_189 = arith.andi %vT_ptrs_167, %next_block_187 : i1 loc(#loc887) + %next_block_190 = tt.load %next_block_188, %vT_ptrs_189 evictionPolicy = evict_last : !tt.ptr loc(#loc830) + %needs_jump = arith.addi %vT_ptrs_156, %c2_i32 : i32 loc(#loc831) + %needs_jump_191 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc832) + %needs_jump_192 = arith.cmpi eq, %needs_jump_191, %c0_i32 : i32 loc(#loc833) + %jump_to_block = arith.subi %next_block_190, %cur_block_186 : i32 loc(#loc834) + %jump_to_block_193 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc835) + %jump_to_block_194 = arith.subi %jump_to_block_193, %c64_i32 : i32 loc(#loc836) + %offset = arith.extui %needs_jump_192 : i1 to i32 loc(#loc837) + %offset_195 = arith.muli %jump_to_block_194, %offset : i32 loc(#loc837) + %offset_196 = arith.subi %c1_i32, %offset : i32 loc(#loc838) + %offset_197 = arith.muli %offset_196, %c64_i32 : i32 loc(#loc839) + %offset_198 = arith.addi %offset_195, %offset_197 : i32 loc(#loc840) + %kT_ptrs_199 = arith.muli %offset_198, %c128_i32 : i32 loc(#loc622) + %kT_ptrs_200 = tt.splat %kT_ptrs_199 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc619) + %kT_ptrs_201 = tt.addptr %kT_ptrs_158, %kT_ptrs_200 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc619) + %vT_ptrs_202 = tt.addptr %vT_ptrs_159, %kT_ptrs_200 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc620) + %vT_ptrs_203 = arith.addi %arg22, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_204 = arith.cmpi sge, %vT_ptrs_203, %c3_i32 : i32 loc(#loc887) + %vT_ptrs_205 = arith.select %vT_ptrs_204, %c0_i32, %vT_ptrs_203 : i32 loc(#loc887) + %kT_206 = ttg.memdesc_index %kT_130[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_207 = tt.splat %vT_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_208 = ttg.async_copy_global_to_local %kT_ptrs_201, %kT_206 mask %vT_ptrs_207 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_209 = ttg.async_commit_group tokens %kT_208 loc(#loc883) + %vT_210 = ttg.memdesc_index %vT_131[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_211 = ttg.async_copy_global_to_local %vT_ptrs_202, %vT_210 mask %vT_ptrs_207 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_212 = ttg.async_commit_group tokens %vT_211 loc(#loc884) + scf.yield %dq_184, %kT_ptrs_201, %vT_ptrs_202, %vT_ptrs_205, %vT_ptrs_170, %kT_161, %kT_209, %vT_163, %vT_212 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc887) + } loc(#loc887) + %vT_ptrs_151 = ttng.warp_group_dot_wait %vT_ptrs_150#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc887) + %vT_ptrs_152 = ttg.async_wait {num = 0 : i32} loc(#loc887) + ttg.local_dealloc %vT_131 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc887) + ttg.local_dealloc %kT_130 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc887) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc458) + %dq_ptrs_153 = tt.addptr %dq_ptrs, %ptr_48 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc458) + %dq_ptrs_154 = tt.broadcast %dq_ptrs_153 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc459) + %dq_ptrs_155 = tt.addptr %dq_ptrs_154, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc459) + %dq = arith.mulf %vT_ptrs_151, %cst_3 : tensor<128x128xf32, #mma1> loc(#loc460) + %1 = arith.truncf %dq : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc159) + %2 = ttg.convert_layout %1 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc159) + tt.store %dq_ptrs_155, %2 : tensor<128x128x!tt.ptr, #blocked> loc(#loc159) + } else { + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc461) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc462) + %offs_n1_31 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc462) + %offs_n1_32 = arith.addi %offs_n1, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc462) + %offs_n1_33 = arith.addi %offs_n1_31, %offs_k_30 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc462) + %ptr = tt.expand_dims %offs_n1_32 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc623) + %ptr_34 = tt.expand_dims %offs_n1_33 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc623) + %ptr_35 = arith.muli %ptr, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc624) + %ptr_36 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc625) + %ptr_37 = tt.addptr %ptr_36, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc625) + %ptr_38 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc626) + %ptr_39 = tt.expand_dims %ptr_38 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc626) + %ptr_40 = tt.broadcast %ptr_37 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc627) + %ptr_41 = tt.broadcast %ptr_39 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc627) + %ptr_42 = tt.addptr %ptr_40, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc627) + %k = tt.load %ptr_42 : tensor<128x128x!tt.ptr, #blocked> loc(#loc628) + %k_43 = ttg.local_alloc %k : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc628) + %ptr_44 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc629) + %ptr_45 = tt.addptr %ptr_44, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc629) + %ptr_46 = tt.broadcast %ptr_45 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc630) + %ptr_47 = tt.addptr %ptr_46, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc630) + %v = tt.load %ptr_47 : tensor<128x128x!tt.ptr, #blocked> loc(#loc631) + %v_48 = ttg.local_alloc %v : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc631) + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc465) + %q_adj1 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc466) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc467) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc468) + %sparse_q_num_blks_offset_49 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc469) + %sparse_q_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc470) + %sparse_q_idx_offset_50 = arith.muli %pid, %c16_i32 : i32 loc(#loc471) + %sparse_q_idx_offset_51 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_50 : i32 loc(#loc472) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_51 : !tt.ptr, i32 loc(#loc473) + %q_start = tt.load %q_indices, %true : !tt.ptr loc(#loc474) + %q_start_52 = arith.muli %q_start, %c128_i32 : i32 loc(#loc475) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_49 : !tt.ptr, i32 loc(#loc476) + %sparse_q_num_blocks_53 = tt.load %sparse_q_num_blocks, %true : !tt.ptr loc(#loc477) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc478) + %offs_m1_54 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc478) + %offs_m1_55 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc478) + %offs_m1_56 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc479) + %offs_m1_57 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc479) + %offs_m1_58 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc479) + %offs_m1_59 = arith.addi %offs_m1_56, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc479) + %offs_m1_60 = arith.addi %offs_m1_57, %offs_m1_54 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc479) + %offs_m1_61 = arith.addi %offs_m1_58, %offs_m1_55 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc479) + %qT_ptrs = tt.expand_dims %offs_m1_59 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc632) + %qT_ptrs_62 = arith.muli %qT_ptrs, %cst_15 : tensor<1x64xi32, #blocked1> loc(#loc633) + %qT_ptrs_63 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc634) + %qT_ptrs_64 = tt.expand_dims %qT_ptrs_63 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc634) + %qT_ptrs_65 = tt.broadcast %qT_ptrs_64 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc635) + %do_ptrs = tt.expand_dims %offs_m1_61 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc636) + %do_ptrs_66 = arith.muli %do_ptrs, %cst_14 : tensor<64x1xi32, #blocked> loc(#loc637) + %do_ptrs_67 = tt.broadcast %ptr_39 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc638) + %hi = arith.muli %sparse_q_num_blocks_53, %c2_i32 : i32 loc(#loc639) + %hi_68 = arith.minsi %hi, %c32_i32 : i32 loc(#loc640) + %tmp44 = tt.broadcast %ptr_34 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc641) + %tmp45 = arith.extsi %ptr_34 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc642) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc643) + %do_ptrs_69 = arith.cmpi sgt, %hi_68, %c0_i32 : i32 loc(#loc889) + %tmp47_70 = tt.load %tmp47, %do_ptrs_69 : !tt.ptr loc(#loc645) + %tmp48 = tt.splat %tmp47_70 : i64 -> tensor<128x1xi64, #mma> loc(#loc646) + %tmp48_71 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64, #mma> loc(#loc646) + %tmp50 = tt.splat %tmp47_70 : i64 -> tensor<1x64xi64, #mma> loc(#loc647) + %tmp51 = tt.broadcast %tmp48_71 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc648) + %tmp55 = arith.cmpi sge, %ptr_34, %cst_0 : tensor<128x1xi32, #mma> loc(#loc649) + %tmp56 = arith.remsi %ptr_34, %cst_0 : tensor<128x1xi32, #mma> loc(#loc650) + %tmp58 = arith.cmpi ne, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc651) + %tmp59 = arith.cmpi slt, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc652) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1, #mma> loc(#loc653) + %tmp63 = arith.addi %tmp56, %cst_0 : tensor<128x1xi32, #mma> loc(#loc654) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc655) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc656) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64, #mma> loc(#loc657) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1, #mma> loc(#loc658) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc659) + %q_indices_72 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_51 : !tt.ptr, i32 loc(#loc509) + %q_start_73 = tt.load %q_indices_72, %true : !tt.ptr loc(#loc510) + %q_start_74 = arith.muli %q_start_73, %c128_i32 : i32 loc(#loc511) + %sparse_q_num_blocks_75 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_49 : !tt.ptr, i32 loc(#loc512) + %sparse_q_num_blocks_76 = tt.load %sparse_q_num_blocks_75, %true : !tt.ptr loc(#loc513) + %offs_m1_77 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc514) + %offs_m1_78 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc514) + %offs_m1_79 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc514) + %offs_m1_80 = arith.addi %offs_m1_77, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc514) + %offs_m1_81 = arith.addi %offs_m1_78, %offs_m1_54 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc514) + %offs_m1_82 = arith.addi %offs_m1_79, %offs_m1_55 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc514) + %qT_ptrs_83 = tt.expand_dims %offs_m1_80 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc660) + %qT_ptrs_84 = arith.muli %qT_ptrs_83, %cst_15 : tensor<1x64xi32, #blocked1> loc(#loc661) + %do_ptrs_85 = tt.expand_dims %offs_m1_82 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc662) + %do_ptrs_86 = arith.muli %do_ptrs_85, %cst_14 : tensor<64x1xi32, #blocked> loc(#loc663) + %hi_87 = arith.muli %sparse_q_num_blocks_76, %c2_i32 : i32 loc(#loc664) + %hi_88 = arith.minsi %hi_87, %c32_i32 : i32 loc(#loc665) + ttng.fence_async_shared {bCluster = false} loc(#loc666) + %dk:2 = scf.for %dk_98 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg19 = %cst_4, %arg20 = %cst_4) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>) : i32 { + %off_hq1_99 = arith.addi %off_hq1, %dk_98 : i32 loc(#loc517) + %q_adj1_100 = arith.muli %off_hq1_99, %c128_i32 : i32 loc(#loc518) + %q_adj1_101 = arith.addi %q_adj1_100, %q_adj1 : i32 loc(#loc519) + %q_adj1_102 = arith.extsi %q_adj1_101 : i32 to i64 loc(#loc520) + %do_adj1 = arith.muli %off_hq1_99, %c262144_i32 : i32 loc(#loc521) + %do_adj1_103 = arith.addi %do_adj1, %q_adj1 : i32 loc(#loc522) + %do_adj1_104 = arith.extsi %do_adj1_103 : i32 to i64 loc(#loc523) + %off_chz1_105 = arith.addi %off_chz1, %off_hq1_99 : i32 loc(#loc524) + %off_chz1_106 = arith.muli %off_chz1_105, %c2048_i32 : i32 loc(#loc525) + %off_chz1_107 = arith.extsi %off_chz1_106 : i32 to i64 loc(#loc526) + %Q1 = tt.addptr %arg_Q, %q_adj1_102 : !tt.ptr, i64 loc(#loc527) + %DO1 = tt.addptr %arg_DO, %do_adj1_104 : !tt.ptr, i64 loc(#loc528) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_107 : !tt.ptr, i64 loc(#loc529) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_107 : !tt.ptr, i64 loc(#loc530) + %qT_ptrs_108 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc668) + %qT_ptrs_109 = tt.addptr %qT_ptrs_108, %qT_ptrs_62 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc668) + %qT_ptrs_110 = tt.broadcast %qT_ptrs_109 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc635) + %qT_ptrs_111 = tt.addptr %qT_ptrs_110, %qT_ptrs_65 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc635) + %do_ptrs_112 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc669) + %do_ptrs_113 = tt.addptr %do_ptrs_112, %do_ptrs_66 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc669) + %do_ptrs_114 = tt.broadcast %do_ptrs_113 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc638) + %do_ptrs_115 = tt.addptr %do_ptrs_114, %do_ptrs_67 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc638) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %qT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc842) + %lse_116 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc673) + %do = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc843) + %Di_117 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc675) + %qT_118 = ttg.memdesc_index %qT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_119 = tt.splat %do_ptrs_69 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_120 = ttg.async_copy_global_to_local %qT_ptrs_111, %qT_118 mask %do_ptrs_119 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_121 = ttg.async_commit_group tokens %qT_120 loc(#loc842) + %lse_122 = tt.addptr %lse, %offs_m1_60 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_123 = ttg.memdesc_index %lse_116[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_124 = tt.splat %do_ptrs_69 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_125 = ttg.async_copy_global_to_local %lse_122, %lse_123 mask %do_ptrs_124 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_126 = ttg.async_commit_group tokens %lse_125 loc(#loc673) + %do_127 = ttg.memdesc_index %do[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_128 = tt.splat %do_ptrs_69 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_129 = ttg.async_copy_global_to_local %do_ptrs_115, %do_127 mask %do_ptrs_128 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_130 = ttg.async_commit_group tokens %do_129 loc(#loc843) + %Di_131 = tt.addptr %Di, %offs_m1_60 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_132 = ttg.memdesc_index %Di_117[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_133 = ttg.async_copy_global_to_local %Di_131, %Di_132 mask %do_ptrs_124 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_134 = ttg.async_commit_group tokens %Di_133 loc(#loc675) + %do_ptrs_135 = arith.cmpi sgt, %hi_68, %c1_i32 : i32 loc(#loc889) + %qT_ptrs_136 = tt.addptr %qT_ptrs_111, %cst_10 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc676) + %do_ptrs_137 = tt.addptr %do_ptrs_115, %cst_11 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc677) + %offs_m1_138 = arith.addi %offs_m1_60, %cst_12 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %qT_139 = ttg.memdesc_index %qT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_140 = tt.splat %do_ptrs_135 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_141 = ttg.async_copy_global_to_local %qT_ptrs_136, %qT_139 mask %do_ptrs_140 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_142 = ttg.async_commit_group tokens %qT_141 loc(#loc842) + %lse_143 = tt.addptr %lse, %offs_m1_138 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_144 = ttg.memdesc_index %lse_116[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_145 = tt.splat %do_ptrs_135 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_146 = ttg.async_copy_global_to_local %lse_143, %lse_144 mask %do_ptrs_145 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_147 = ttg.async_commit_group tokens %lse_146 loc(#loc673) + %do_148 = ttg.memdesc_index %do[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_149 = tt.splat %do_ptrs_135 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_150 = ttg.async_copy_global_to_local %do_ptrs_137, %do_148 mask %do_ptrs_149 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_151 = ttg.async_commit_group tokens %do_150 loc(#loc843) + %Di_152 = tt.addptr %Di, %offs_m1_138 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_153 = ttg.memdesc_index %Di_117[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_154 = ttg.async_copy_global_to_local %Di_152, %Di_153 mask %do_ptrs_145 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_155 = ttg.async_commit_group tokens %Di_154 loc(#loc675) + %do_ptrs_156:19 = scf.for %do_ptrs_211 = %c0_i32 to %hi_68 step %c1_i32 iter_args(%arg22 = %arg20, %arg23 = %arg19, %qT_ptrs_212 = %qT_ptrs_136, %do_ptrs_213 = %do_ptrs_137, %offs_m1_214 = %offs_m1_138, %arg27 = %c1_i32, %arg28 = %c-1_i32, %arg29 = %c1_i32, %arg30 = %c-1_i32, %qT_215 = %qT_121, %qT_216 = %qT_142, %lse_217 = %lse_126, %lse_218 = %lse_147, %do_219 = %do_130, %do_220 = %do_151, %Di_221 = %Di_134, %Di_222 = %Di_155, %arg39 = %c64_i32, %offs_m1_223 = %offs_m1_60) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>) : i32 { + %do_ptrs_224 = arith.subi %hi_68, %c2_i32 : i32 loc(#loc889) + %do_ptrs_225 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_224 : i32 loc(#loc889) + %do_ptrs_226 = arith.subi %hi_68, %c1_i32 : i32 loc(#loc889) + %do_ptrs_227 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_226 : i32 loc(#loc889) + %do_ptrs_228 = arith.addi %arg30, %c1_i32 : i32 loc(#loc889) + %do_ptrs_229 = arith.cmpi sge, %do_ptrs_228, %c2_i32 : i32 loc(#loc889) + %do_ptrs_230 = arith.select %do_ptrs_229, %c0_i32, %do_ptrs_228 : i32 loc(#loc889) + %do_ptrs_231 = arith.addi %arg28, %c1_i32 : i32 loc(#loc889) + %do_ptrs_232 = arith.cmpi sge, %do_ptrs_231, %c3_i32 : i32 loc(#loc889) + %do_ptrs_233 = arith.select %do_ptrs_232, %c0_i32, %do_ptrs_231 : i32 loc(#loc889) + %qT_234 = ttg.async_wait %qT_215, %lse_217, %do_219, %Di_221 {num = 4 : i32} loc(#loc842) + %qT_235 = ttg.memdesc_index %qT[%do_ptrs_233] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %dk_236 = ttg.memdesc_trans %qT_235 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc679) + %lse_237 = ttg.memdesc_index %lse_116[%do_ptrs_230] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_238 = ttg.local_load %lse_237 token %qT_234 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc673) + %lse_239 = arith.cmpf oeq, %lse_238, %cst_19 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc680) + %lse_240 = arith.select %lse_239, %cst_20, %lse_238 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc681) + %qkT = ttng.warp_group_dot %k_43, %qT_235, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc666) + %qkT_241:3 = ttng.warp_group_dot_wait %qkT, %k_43, %qT_235 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc666) + %qkT_242 = arith.mulf %qkT_241#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc682) + %m = tt.expand_dims %offs_m1_223 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc683) + %tmp44_243 = tt.broadcast %m : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc641) + %tmp44_244 = arith.cmpi sge, %tmp44_243, %tmp44 : tensor<128x64xi32, #mma> loc(#loc641) + %tmp49 = arith.extsi %m : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc684) + %tmp50_245 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64, #mma> loc(#loc647) + %tmp51_246 = tt.broadcast %tmp50_245 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc648) + %tmp51_247 = arith.andi %tmp51, %tmp51_246 : tensor<128x64xi1, #mma> loc(#loc648) + %tmp52 = arith.andi %tmp44_244, %tmp51_247 : tensor<128x64xi1, #mma> loc(#loc685) + %tmp68 = arith.subi %tmp44, %tmp44_243 : tensor<128x64xi32, #mma> loc(#loc686) + %tmp69 = arith.remsi %tmp68, %cst_18 : tensor<128x64xi32, #mma> loc(#loc687) + %tmp70 = arith.cmpi ne, %tmp69, %cst_21 : tensor<128x64xi32, #mma> loc(#loc688) + %tmp71 = arith.cmpi slt, %tmp69, %cst_21 : tensor<128x64xi32, #mma> loc(#loc689) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1, #mma> loc(#loc690) + %tmp74 = arith.addi %tmp69, %cst_18 : tensor<128x64xi32, #mma> loc(#loc691) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc692) + %tmp76 = arith.cmpi eq, %tmp75, %cst_21 : tensor<128x64xi32, #mma> loc(#loc693) + %tmp77_248 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1, #mma> loc(#loc659) + %tmp78 = arith.ori %tmp52, %tmp77_248 : tensor<128x64xi1, #mma> loc(#loc694) + %post_mod_scores = arith.select %tmp78, %qkT_242, %cst_7 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc695) + %post_mod_scores_249 = arith.mulf %post_mod_scores, %cst_8 : tensor<128x64xf32, #mma> loc(#loc696) + %pT = tt.expand_dims %lse_240 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc697) + %pT_250 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc698) + %pT_251 = arith.subf %post_mod_scores_249, %pT_250 : tensor<128x64xf32, #mma> loc(#loc698) + %pT_252 = math.exp2 %pT_251 : tensor<128x64xf32, #mma> loc(#loc699) + %do_253 = ttg.memdesc_index %do[%do_ptrs_233] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %dpT = ttg.memdesc_trans %do_253 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc700) + %dv = arith.truncf %pT_252 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc701) + %dv_254 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc701) + %dv_255 = ttng.warp_group_dot %dv_254, %do_253, %arg23 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc702) + %Di_256 = ttg.memdesc_index %Di_117[%do_ptrs_230] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_257 = ttg.local_load %Di_256 token %qT_234 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc675) + %dpT_258 = ttng.warp_group_dot %v_48, %dpT, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc703) + %dpT_259:3 = ttng.warp_group_dot_wait %dpT_258, %v_48, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc703) + %dsT = tt.expand_dims %Di_257 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc704) + %dsT_260 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc705) + %dsT_261 = arith.subf %dpT_259#0, %dsT_260 : tensor<128x64xf32, #mma> loc(#loc705) + %dsT_262 = arith.mulf %pT_252, %dsT_261 : tensor<128x64xf32, #mma> loc(#loc706) + %dsT_263 = arith.select %tmp78, %dsT_262, %cst_5 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc707) + %dk_264 = arith.truncf %dsT_263 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc708) + %dk_265 = ttg.convert_layout %dk_264 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc708) + %dk_266 = ttng.warp_group_dot %dk_265, %dk_236, %arg22 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc709) + %offs_m1_267 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %offs_m1_268 = arith.addi %offs_m1_223, %offs_m1_267 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %do_ptrs_269 = arith.addi %do_ptrs_211, %c1_i32 : i32 loc(#loc889) + %cur_block_idx = arith.divsi %do_ptrs_269, %c2_i32 : i32 loc(#loc844) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc845) + %cur_block_270 = tt.load %cur_block, %do_ptrs_227 evictionPolicy = evict_last : !tt.ptr loc(#loc846) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc847) + %next_block_271 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_53 : i32 loc(#loc848) + %next_block_272 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc849) + %do_ptrs_273 = arith.andi %do_ptrs_227, %next_block_271 : i1 loc(#loc889) + %next_block_274 = tt.load %next_block_272, %do_ptrs_273 evictionPolicy = evict_last : !tt.ptr loc(#loc850) + %needs_jump = arith.addi %do_ptrs_211, %c2_i32 : i32 loc(#loc851) + %needs_jump_275 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc852) + %needs_jump_276 = arith.cmpi eq, %needs_jump_275, %c0_i32 : i32 loc(#loc853) + %jump_to_block = arith.subi %next_block_274, %cur_block_270 : i32 loc(#loc854) + %jump_to_block_277 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc855) + %jump_to_block_278 = arith.subi %jump_to_block_277, %c64_i32 : i32 loc(#loc856) + %offset = arith.extui %needs_jump_276 : i1 to i32 loc(#loc857) + %offset_279 = arith.muli %jump_to_block_278, %offset : i32 loc(#loc857) + %offset_280 = arith.subi %c1_i32, %offset : i32 loc(#loc858) + %offset_281 = arith.muli %offset_280, %c64_i32 : i32 loc(#loc859) + %offset_282 = arith.addi %offset_279, %offset_281 : i32 loc(#loc860) + %qT_ptrs_283 = arith.muli %offset_282, %c4096_i32 : i32 loc(#loc711) + %qT_ptrs_284 = tt.splat %qT_ptrs_283 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc676) + %qT_ptrs_285 = tt.addptr %qT_ptrs_212, %qT_ptrs_284 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc676) + %do_ptrs_286 = arith.muli %offset_282, %c128_i32 : i32 loc(#loc712) + %do_ptrs_287 = tt.splat %do_ptrs_286 : i32 -> tensor<64x128xi32, #blocked> loc(#loc677) + %do_ptrs_288 = tt.addptr %do_ptrs_213, %do_ptrs_287 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc677) + %offs_m1_289 = tt.splat %offset_282 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %offs_m1_290 = arith.addi %offs_m1_214, %offs_m1_289 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %do_ptrs_291 = arith.addi %arg29, %c1_i32 : i32 loc(#loc889) + %do_ptrs_292 = arith.cmpi sge, %do_ptrs_291, %c2_i32 : i32 loc(#loc889) + %do_ptrs_293 = arith.select %do_ptrs_292, %c0_i32, %do_ptrs_291 : i32 loc(#loc889) + %do_ptrs_294 = arith.addi %arg27, %c1_i32 : i32 loc(#loc889) + %do_ptrs_295 = arith.cmpi sge, %do_ptrs_294, %c3_i32 : i32 loc(#loc889) + %do_ptrs_296 = arith.select %do_ptrs_295, %c0_i32, %do_ptrs_294 : i32 loc(#loc889) + %qT_297 = ttg.memdesc_index %qT[%do_ptrs_296] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_298 = tt.splat %do_ptrs_225 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_299 = ttg.async_copy_global_to_local %qT_ptrs_285, %qT_297 mask %do_ptrs_298 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_300 = ttg.async_commit_group tokens %qT_299 loc(#loc842) + %lse_301 = tt.addptr %lse, %offs_m1_290 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_302 = ttg.memdesc_index %lse_116[%do_ptrs_293] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_303 = tt.splat %do_ptrs_225 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_304 = ttg.async_copy_global_to_local %lse_301, %lse_302 mask %do_ptrs_303 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_305 = ttg.async_commit_group tokens %lse_304 loc(#loc673) + %do_306 = ttg.memdesc_index %do[%do_ptrs_296] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_307 = tt.splat %do_ptrs_225 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_308 = ttg.async_copy_global_to_local %do_ptrs_288, %do_306 mask %do_ptrs_307 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_309 = ttg.async_commit_group tokens %do_308 loc(#loc843) + %Di_310 = tt.addptr %Di, %offs_m1_290 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_311 = ttg.memdesc_index %Di_117[%do_ptrs_293] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_312 = ttg.async_copy_global_to_local %Di_310, %Di_311 mask %do_ptrs_303 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_313 = ttg.async_commit_group tokens %Di_312 loc(#loc675) + scf.yield %dk_266, %dv_255, %qT_ptrs_285, %do_ptrs_288, %offs_m1_290, %do_ptrs_296, %do_ptrs_233, %do_ptrs_293, %do_ptrs_230, %qT_216, %qT_300, %lse_218, %lse_305, %do_220, %do_309, %Di_222, %Di_313, %offset_282, %offs_m1_268 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + } loc(#loc889) + %do_ptrs_157:2 = ttng.warp_group_dot_wait %do_ptrs_156#1, %do_ptrs_156#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc889) + %do_ptrs_158 = ttg.async_wait {num = 0 : i32} loc(#loc889) + ttg.local_dealloc %Di_117 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc889) + ttg.local_dealloc %do : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc889) + ttg.local_dealloc %lse_116 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc889) + ttg.local_dealloc %qT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc889) + %qT_ptrs_159 = tt.addptr %qT_ptrs_108, %qT_ptrs_84 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc713) + %qT_ptrs_160 = tt.broadcast %qT_ptrs_159 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc714) + %qT_ptrs_161 = tt.addptr %qT_ptrs_160, %qT_ptrs_65 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc714) + %do_ptrs_162 = tt.addptr %do_ptrs_112, %do_ptrs_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc715) + %do_ptrs_163 = tt.broadcast %do_ptrs_162 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc716) + %do_ptrs_164 = tt.addptr %do_ptrs_163, %do_ptrs_67 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc716) + %qT_165 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc861) + %lse_166 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc718) + %do_167 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc862) + %Di_168 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc720) + %do_ptrs_169 = arith.cmpi sgt, %hi_88, %c0_i32 : i32 loc(#loc890) + %qT_170 = ttg.memdesc_index %qT_165[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_171 = tt.splat %do_ptrs_169 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_172 = ttg.async_copy_global_to_local %qT_ptrs_161, %qT_170 mask %do_ptrs_171 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_173 = ttg.async_commit_group tokens %qT_172 loc(#loc861) + %lse_174 = tt.addptr %lse, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_175 = ttg.memdesc_index %lse_166[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_176 = tt.splat %do_ptrs_169 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_177 = ttg.async_copy_global_to_local %lse_174, %lse_175 mask %do_ptrs_176 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_178 = ttg.async_commit_group tokens %lse_177 loc(#loc718) + %do_179 = ttg.memdesc_index %do_167[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_180 = tt.splat %do_ptrs_169 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_181 = ttg.async_copy_global_to_local %do_ptrs_164, %do_179 mask %do_ptrs_180 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_182 = ttg.async_commit_group tokens %do_181 loc(#loc862) + %Di_183 = tt.addptr %Di, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_184 = ttg.memdesc_index %Di_168[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_185 = ttg.async_copy_global_to_local %Di_183, %Di_184 mask %do_ptrs_176 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_186 = ttg.async_commit_group tokens %Di_185 loc(#loc720) + %do_ptrs_187 = arith.cmpi sgt, %hi_88, %c1_i32 : i32 loc(#loc890) + %qT_ptrs_188 = tt.addptr %qT_ptrs_161, %cst_10 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc723) + %do_ptrs_189 = tt.addptr %do_ptrs_164, %cst_11 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc724) + %offs_m1_190 = arith.addi %offs_m1_81, %cst_12 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %qT_191 = ttg.memdesc_index %qT_165[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_192 = tt.splat %do_ptrs_187 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_193 = ttg.async_copy_global_to_local %qT_ptrs_188, %qT_191 mask %do_ptrs_192 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_194 = ttg.async_commit_group tokens %qT_193 loc(#loc861) + %lse_195 = tt.addptr %lse, %offs_m1_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_196 = ttg.memdesc_index %lse_166[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_197 = tt.splat %do_ptrs_187 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_198 = ttg.async_copy_global_to_local %lse_195, %lse_196 mask %do_ptrs_197 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_199 = ttg.async_commit_group tokens %lse_198 loc(#loc718) + %do_200 = ttg.memdesc_index %do_167[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_201 = tt.splat %do_ptrs_187 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_202 = ttg.async_copy_global_to_local %do_ptrs_189, %do_200 mask %do_ptrs_201 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_203 = ttg.async_commit_group tokens %do_202 loc(#loc862) + %Di_204 = tt.addptr %Di, %offs_m1_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_205 = ttg.memdesc_index %Di_168[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_206 = ttg.async_copy_global_to_local %Di_204, %Di_205 mask %do_ptrs_197 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_207 = ttg.async_commit_group tokens %Di_206 loc(#loc720) + %do_ptrs_208:17 = scf.for %do_ptrs_211 = %c0_i32 to %hi_88 step %c1_i32 iter_args(%do_ptrs_212 = %do_ptrs_157#1, %do_ptrs_213 = %do_ptrs_157#0, %qT_ptrs_214 = %qT_ptrs_188, %do_ptrs_215 = %do_ptrs_189, %offs_m1_216 = %offs_m1_190, %arg27 = %c1_i32, %arg28 = %c-1_i32, %arg29 = %c1_i32, %arg30 = %c-1_i32, %qT_217 = %qT_173, %qT_218 = %qT_194, %lse_219 = %lse_178, %lse_220 = %lse_199, %do_221 = %do_182, %do_222 = %do_203, %Di_223 = %Di_186, %Di_224 = %Di_207) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %do_ptrs_225 = arith.subi %hi_88, %c2_i32 : i32 loc(#loc890) + %do_ptrs_226 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_225 : i32 loc(#loc890) + %do_ptrs_227 = arith.subi %hi_88, %c1_i32 : i32 loc(#loc890) + %do_ptrs_228 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_227 : i32 loc(#loc890) + %do_ptrs_229 = arith.addi %arg30, %c1_i32 : i32 loc(#loc890) + %do_ptrs_230 = arith.cmpi sge, %do_ptrs_229, %c2_i32 : i32 loc(#loc890) + %do_ptrs_231 = arith.select %do_ptrs_230, %c0_i32, %do_ptrs_229 : i32 loc(#loc890) + %do_ptrs_232 = arith.addi %arg28, %c1_i32 : i32 loc(#loc890) + %do_ptrs_233 = arith.cmpi sge, %do_ptrs_232, %c3_i32 : i32 loc(#loc890) + %do_ptrs_234 = arith.select %do_ptrs_233, %c0_i32, %do_ptrs_232 : i32 loc(#loc890) + %qT_235 = ttg.async_wait %qT_217, %lse_219, %do_221, %Di_223 {num = 4 : i32} loc(#loc861) + %qT_236 = ttg.memdesc_index %qT_165[%do_ptrs_234] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %dk_237 = ttg.memdesc_trans %qT_236 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc726) + %lse_238 = ttg.memdesc_index %lse_166[%do_ptrs_231] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_239 = ttg.local_load %lse_238 token %qT_235 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc718) + %lse_240 = arith.cmpf oeq, %lse_239, %cst_19 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc727) + %lse_241 = arith.select %lse_240, %cst_20, %lse_239 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc728) + %qkT = ttng.warp_group_dot %k_43, %qT_236, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc729) + %qkT_242:3 = ttng.warp_group_dot_wait %qkT, %k_43, %qT_236 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc729) + %qkT_243 = arith.mulf %qkT_242#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc730) + %post_mod_scores = arith.mulf %qkT_243, %cst_8 : tensor<128x64xf32, #mma> loc(#loc731) + %pT = tt.expand_dims %lse_241 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc732) + %pT_244 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc733) + %pT_245 = arith.subf %post_mod_scores, %pT_244 : tensor<128x64xf32, #mma> loc(#loc733) + %pT_246 = math.exp2 %pT_245 : tensor<128x64xf32, #mma> loc(#loc734) + %do_247 = ttg.memdesc_index %do_167[%do_ptrs_234] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %dpT = ttg.memdesc_trans %do_247 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc735) + %dv = arith.truncf %pT_246 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc736) + %dv_248 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc736) + %dv_249 = ttng.warp_group_dot %dv_248, %do_247, %do_ptrs_213 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc737) + %Di_250 = ttg.memdesc_index %Di_168[%do_ptrs_231] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_251 = ttg.local_load %Di_250 token %qT_235 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc720) + %dpT_252 = ttng.warp_group_dot %v_48, %dpT, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc738) + %dpT_253:3 = ttng.warp_group_dot_wait %dpT_252, %v_48, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc738) + %dsT = tt.expand_dims %Di_251 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc739) + %dsT_254 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc740) + %dsT_255 = arith.subf %dpT_253#0, %dsT_254 : tensor<128x64xf32, #mma> loc(#loc740) + %dsT_256 = arith.mulf %pT_246, %dsT_255 : tensor<128x64xf32, #mma> loc(#loc741) + %dk_257 = arith.truncf %dsT_256 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc742) + %dk_258 = ttg.convert_layout %dk_257 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc742) + %dk_259 = ttng.warp_group_dot %dk_258, %dk_237, %do_ptrs_212 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc743) + %do_ptrs_260 = arith.addi %do_ptrs_211, %c1_i32 : i32 loc(#loc890) + %cur_block_idx = arith.divsi %do_ptrs_260, %c2_i32 : i32 loc(#loc863) + %cur_block = tt.addptr %q_indices_72, %cur_block_idx : !tt.ptr, i32 loc(#loc864) + %cur_block_261 = tt.load %cur_block, %do_ptrs_228 evictionPolicy = evict_last : !tt.ptr loc(#loc865) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc866) + %next_block_262 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_76 : i32 loc(#loc867) + %next_block_263 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc868) + %do_ptrs_264 = arith.andi %do_ptrs_228, %next_block_262 : i1 loc(#loc890) + %next_block_265 = tt.load %next_block_263, %do_ptrs_264 evictionPolicy = evict_last : !tt.ptr loc(#loc869) + %needs_jump = arith.addi %do_ptrs_211, %c2_i32 : i32 loc(#loc870) + %needs_jump_266 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc871) + %needs_jump_267 = arith.cmpi eq, %needs_jump_266, %c0_i32 : i32 loc(#loc872) + %jump_to_block = arith.subi %next_block_265, %cur_block_261 : i32 loc(#loc873) + %jump_to_block_268 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc874) + %jump_to_block_269 = arith.subi %jump_to_block_268, %c64_i32 : i32 loc(#loc875) + %offset = arith.extui %needs_jump_267 : i1 to i32 loc(#loc876) + %offset_270 = arith.muli %jump_to_block_269, %offset : i32 loc(#loc876) + %offset_271 = arith.subi %c1_i32, %offset : i32 loc(#loc877) + %offset_272 = arith.muli %offset_271, %c64_i32 : i32 loc(#loc878) + %offset_273 = arith.addi %offset_270, %offset_272 : i32 loc(#loc879) + %qT_ptrs_274 = arith.muli %offset_273, %c4096_i32 : i32 loc(#loc745) + %qT_ptrs_275 = tt.splat %qT_ptrs_274 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc723) + %qT_ptrs_276 = tt.addptr %qT_ptrs_214, %qT_ptrs_275 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc723) + %do_ptrs_277 = arith.muli %offset_273, %c128_i32 : i32 loc(#loc746) + %do_ptrs_278 = tt.splat %do_ptrs_277 : i32 -> tensor<64x128xi32, #blocked> loc(#loc724) + %do_ptrs_279 = tt.addptr %do_ptrs_215, %do_ptrs_278 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc724) + %offs_m1_280 = tt.splat %offset_273 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %offs_m1_281 = arith.addi %offs_m1_216, %offs_m1_280 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %do_ptrs_282 = arith.addi %arg29, %c1_i32 : i32 loc(#loc890) + %do_ptrs_283 = arith.cmpi sge, %do_ptrs_282, %c2_i32 : i32 loc(#loc890) + %do_ptrs_284 = arith.select %do_ptrs_283, %c0_i32, %do_ptrs_282 : i32 loc(#loc890) + %do_ptrs_285 = arith.addi %arg27, %c1_i32 : i32 loc(#loc890) + %do_ptrs_286 = arith.cmpi sge, %do_ptrs_285, %c3_i32 : i32 loc(#loc890) + %do_ptrs_287 = arith.select %do_ptrs_286, %c0_i32, %do_ptrs_285 : i32 loc(#loc890) + %qT_288 = ttg.memdesc_index %qT_165[%do_ptrs_287] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_289 = tt.splat %do_ptrs_226 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_290 = ttg.async_copy_global_to_local %qT_ptrs_276, %qT_288 mask %do_ptrs_289 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_291 = ttg.async_commit_group tokens %qT_290 loc(#loc861) + %lse_292 = tt.addptr %lse, %offs_m1_281 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_293 = ttg.memdesc_index %lse_166[%do_ptrs_284] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_294 = tt.splat %do_ptrs_226 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_295 = ttg.async_copy_global_to_local %lse_292, %lse_293 mask %do_ptrs_294 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_296 = ttg.async_commit_group tokens %lse_295 loc(#loc718) + %do_297 = ttg.memdesc_index %do_167[%do_ptrs_287] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_298 = tt.splat %do_ptrs_226 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_299 = ttg.async_copy_global_to_local %do_ptrs_279, %do_297 mask %do_ptrs_298 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_300 = ttg.async_commit_group tokens %do_299 loc(#loc862) + %Di_301 = tt.addptr %Di, %offs_m1_281 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_302 = ttg.memdesc_index %Di_168[%do_ptrs_284] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_303 = ttg.async_copy_global_to_local %Di_301, %Di_302 mask %do_ptrs_294 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_304 = ttg.async_commit_group tokens %Di_303 loc(#loc720) + scf.yield %dk_259, %dv_249, %qT_ptrs_276, %do_ptrs_279, %offs_m1_281, %do_ptrs_287, %do_ptrs_234, %do_ptrs_284, %do_ptrs_231, %qT_218, %qT_291, %lse_220, %lse_296, %do_222, %do_300, %Di_224, %Di_304 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc890) + } loc(#loc890) + %do_ptrs_209:2 = ttng.warp_group_dot_wait %do_ptrs_208#1, %do_ptrs_208#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc890) + %do_ptrs_210 = ttg.async_wait {num = 0 : i32} loc(#loc890) + ttg.local_dealloc %Di_168 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc890) + ttg.local_dealloc %do_167 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc890) + ttg.local_dealloc %lse_166 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc890) + ttg.local_dealloc %qT_165 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc890) + scf.yield %do_ptrs_209#0, %do_ptrs_209#1 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc277) + } loc(#loc667) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc577) + %dv_ptrs_89 = tt.addptr %dv_ptrs, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc577) + %dv_ptrs_90 = tt.broadcast %dv_ptrs_89 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc578) + %dv_ptrs_91 = tt.addptr %dv_ptrs_90, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc578) + %1 = arith.truncf %dk#0 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc280) + %2 = ttg.convert_layout %1 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc280) + tt.store %dv_ptrs_91, %2 : tensor<128x128x!tt.ptr, #blocked> loc(#loc280) + %dk_92 = arith.mulf %dk#1, %cst_3 : tensor<128x128xf32, #mma1> loc(#loc579) + %mask = arith.cmpi slt, %ptr, %cst_13 : tensor<128x1xi32, #blocked> loc(#loc580) + %xindex = tt.broadcast %ptr_35 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc581) + %xindex_93 = arith.addi %ptr_41, %xindex : tensor<128x128xi32, #blocked> loc(#loc581) + %xindex_94 = tt.splat %k_adj : i32 -> tensor<128x128xi32, #blocked> loc(#loc582) + %xindex_95 = arith.addi %xindex_93, %xindex_94 : tensor<128x128xi32, #blocked> loc(#loc582) + %xindex_96 = tt.splat %dv_adj : i32 -> tensor<128x128xi32, #blocked> loc(#loc583) + %xindex_97 = arith.addi %xindex_95, %xindex_96 : tensor<128x128xi32, #blocked> loc(#loc583) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc286) + %4 = tt.addptr %3, %xindex_97 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc286) + %5 = tt.broadcast %mask : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc287) + %6 = arith.truncf %dk_92 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc287) + %7 = ttg.convert_layout %6 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc287) + tt.store %4, %7, %5 : tensor<128x128x!tt.ptr, #blocked> loc(#loc287) + } loc(#loc18) + tt.return loc(#loc288) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":111:24) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":115:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":116:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":117:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:25) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:59) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:50) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":131:9) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":132:9) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":133:10) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":136:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:14) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:7) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":140:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:44) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":145:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:55) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:78) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:83) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:68) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:30) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:63) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:32) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:56) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":163:17) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":164:19) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":167:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":168:21) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":169:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":174:36) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":175:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:27) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":178:107) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:20) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:56) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":835:23) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":179:111) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:34) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:33) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:26) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:30) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":191:18) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":195:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:41) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:53) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:26) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":207:12) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:18) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:56) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:18) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:43) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:63) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":482:23) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":405:12) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:34) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":397:28) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:23) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":486:22) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":487:23) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":488:23) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":489:23) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:39) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:22) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:19) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":458:105) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":528:104) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:19) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":415:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":459:19) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:30) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":461:14) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":464:36) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":483:23) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":490:23) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":493:24) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":494:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":496:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":497:92) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":500:24) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":501:24) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":502:39) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":503:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":504:24) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":505:24) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":506:23) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":507:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":508:25) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":509:92) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":511:24) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":512:24) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":513:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":514:25) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":515:24) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":516:24) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":521:69) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":524:27) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:21) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":530:20) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:14) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":549:43) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":551:15) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:21) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":417:19) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":788:33) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":411:64) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:38) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:24) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:109) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:113) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:55) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:25) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:30) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:35) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:60) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:34) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:48) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:63) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:29) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:47) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:61) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:42) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:28) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":214:39) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:31) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:45) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:62) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:43) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":218:33) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":226:16) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:24) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:56) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":232:14) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":234:30) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":252:25) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":253:29) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":256:107) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":257:107) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:32) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:56) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:34) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:58) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:80) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:53) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:81) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:70) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":286:32) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:30) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:43) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:55) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:42) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:45) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:32) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:26) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":298:16) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:37) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:56) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:49) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:27) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:38) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:51) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:42) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:61) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":698:25) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":618:12) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":699:25) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:35) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":610:28) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:24) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":702:24) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":704:24) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":705:24) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":709:25) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":710:25) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":712:25) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":713:92) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":716:24) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":717:24) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":718:39) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":719:25) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":720:24) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":721:24) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":731:24) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":306:41) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:34) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:47) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:64) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:46) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":310:36) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":318:20) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":676:20) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":262:30) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:51) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:34) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:44) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:67) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:36) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:46) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:70) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:39) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:50) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:60) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":271:21) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":272:23) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":275:25) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":276:29) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:18) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:19) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:28) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:29) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":669:105) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:22) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":741:99) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:21) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:19) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:19) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":628:19) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:52) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:26) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:46) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":678:15) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":680:36) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":703:25) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":706:24) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":722:24) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":723:25) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":724:25) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":725:92) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":727:24) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":728:24) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":729:39) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":730:25) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":732:24) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":736:69) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":739:27) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:44) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:40) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:22) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:29) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:24) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:43) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:20) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:25) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:22) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:16) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":773:45) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:24) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:43) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":623:62) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:28) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:28) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":303:12) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:23) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:55) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":330:30) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":334:14) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":337:29) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:27) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:41) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:58) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:29) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:69) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:4) +#loc307 = loc("pid"(#loc2)) +#loc308 = loc("off_zq"(#loc3)) +#loc309 = loc("off_hkv"(#loc4)) +#loc310 = loc("off_zkv"(#loc5)) +#loc311 = loc("k_adj"(#loc6)) +#loc312 = loc("k_adj"(#loc7)) +#loc313 = loc("k_adj"(#loc8)) +#loc314 = loc("k_adj"(#loc9)) +#loc315 = loc("dv_adj"(#loc10)) +#loc316 = loc("dv_adj"(#loc11)) +#loc317 = loc("dv_adj"(#loc12)) +#loc318 = loc("K"(#loc13)) +#loc319 = loc("V"(#loc14)) +#loc320 = loc("DV"(#loc15)) +#loc321 = loc("offs_k"(#loc16)) +#loc322 = loc("off_pid"(#loc19)) +#loc323 = loc("off_hq2"(#loc20)) +#loc324 = loc("off_hq2"(#loc21)) +#loc325 = loc("off_hq2"(#loc22)) +#loc326 = loc("start_m2_block"(#loc23)) +#loc327 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc328 = loc("sparse_kv_num_blks_offset"(#loc25)) +#loc329 = loc("sparse_kv_idx_offset"(#loc26)) +#loc330 = loc("sparse_kv_idx_offset"(#loc27)) +#loc331 = loc("sparse_kv_idx_offset"(#loc28)) +#loc332 = loc("q_adj2"(#loc29)) +#loc333 = loc("q_adj2"(#loc30)) +#loc334 = loc("q_adj2"(#loc31)) +#loc335 = loc("q_adj2"(#loc32)) +#loc336 = loc("do_adj2"(#loc33)) +#loc337 = loc("do_adj2"(#loc34)) +#loc338 = loc("do_adj2"(#loc35)) +#loc339 = loc("off_chz2"(#loc36)) +#loc340 = loc("off_chz2"(#loc37)) +#loc341 = loc("off_chz2"(#loc38)) +#loc342 = loc("off_chz2"(#loc39)) +#loc343 = loc("Q2"(#loc40)) +#loc344 = loc("DO2"(#loc41)) +#loc345 = loc("DQ2"(#loc42)) +#loc346 = loc("LSE2"(#loc43)) +#loc347 = loc("DELTA2"(#loc44)) +#loc348 = loc("start_m2"(#loc45)) +#loc349 = loc("offs_m2"(#loc46)) +#loc350 = loc("ptr"(#loc47)) +#loc351 = loc("q"(#loc48)) +#loc352 = loc("ptr"(#loc49)) +#loc353 = loc("ptr"(#loc50)) +#loc354 = loc("ptr"(#loc51)) +#loc355 = loc("ptr"(#loc52)) +#loc356 = loc("do"(#loc54)) +#loc357 = loc("Di"(#loc55)) +#loc358 = loc("Di"(#loc56)) +#loc359 = loc("lse"(#loc57)) +#loc360 = loc("lse"(#loc58)) +#loc361 = loc("lse"(#loc59)) +#loc362 = loc("lse"(#loc60)) +#loc363 = loc("lse"(#loc61)) +#loc364 = loc("kv_indices"(#loc62)) +#loc365 = loc("kv_start"(#loc63)) +#loc366 = loc("kv_start"(#loc64)) +#loc367 = loc("sparse_kv_num_blocks"(#loc65)) +#loc368 = loc("sparse_kv_num_blocks"(#loc66)) +#loc369 = loc("offs_n2"(#loc67)) +#loc370 = loc("offs_n2"(#loc68)) +#loc371 = loc("kT_ptrs"(#loc69)) +#loc372 = loc("dq"(#loc70)) +#loc373 = loc("kT_ptrs"(#loc71)) +#loc374 = loc("kT_ptrs"(#loc72)) +#loc375 = loc("kT_ptrs"(#loc73)) +#loc376 = loc("kT_ptrs"(#loc74)) +#loc377 = loc("vT_ptrs"(#loc75)) +#loc378 = loc("vT_ptrs"(#loc76)) +#loc379 = loc("hi"(#loc77)) +#loc380 = loc("hi"(#loc78)) +#loc381 = loc("tmp4"(#loc79)) +#loc382 = loc("dq"(#loc80)) +#loc383 = loc("tmp7"(#loc81)) +#loc384 = loc("dq"(#loc82)) +#loc385 = loc("tmp7"(#loc83)) +#loc386 = loc("tmp8"(#loc84)) +#loc387 = loc("tmp9"(#loc85)) +#loc388 = loc("tmp10"(#loc86)) +#loc389 = loc("tmp11"(#loc87)) +#loc390 = loc("p"(#loc88)) +#loc391 = loc("ds"(#loc89)) +#loc392 = loc("ds"(#loc90)) +#loc393 = loc("kT"(#loc91)) +#loc394 = loc("vT"(#loc92)) +#loc395 = loc("kT_ptrs"(#loc93)) +#loc396 = loc("vT_ptrs"(#loc94)) +#loc397 = loc("qk"(#loc95)) +#loc398 = loc("dq"(#loc96)) +#loc399 = loc("qk"(#loc97)) +#loc400 = loc("n"(#loc98)) +#loc401 = loc("tmp5"(#loc99)) +#loc402 = loc("tmp12"(#loc100)) +#loc403 = loc("tmp15"(#loc101)) +#loc404 = loc("tmp16"(#loc102)) +#loc405 = loc("tmp18"(#loc103)) +#loc406 = loc("tmp19"(#loc104)) +#loc407 = loc("tmp22"(#loc105)) +#loc408 = loc("tmp23"(#loc106)) +#loc409 = loc("tmp24"(#loc107)) +#loc410 = loc("tmp25"(#loc108)) +#loc411 = loc("tmp26"(#loc109)) +#loc412 = loc("tmp27"(#loc110)) +#loc413 = loc("tmp28"(#loc111)) +#loc414 = loc("tmp29"(#loc112)) +#loc415 = loc("tmp30"(#loc113)) +#loc416 = loc("tmp31"(#loc114)) +#loc417 = loc("tmp33"(#loc115)) +#loc418 = loc("tmp34"(#loc116)) +#loc419 = loc("tmp35"(#loc117)) +#loc420 = loc("tmp36"(#loc118)) +#loc421 = loc("tmp37"(#loc119)) +#loc422 = loc("tmp38"(#loc120)) +#loc423 = loc("post_mod_scores"(#loc121)) +#loc424 = loc("post_mod_scores"(#loc122)) +#loc425 = loc("p"(#loc123)) +#loc426 = loc("dp"(#loc124)) +#loc427 = loc("ds"(#loc125)) +#loc428 = loc("ds"(#loc126)) +#loc429 = loc("ds"(#loc127)) +#loc430 = loc("dq"(#loc128)) +#loc431 = loc("offs_n2"(#loc129)) +#loc432 = loc("cur_block_idx"(#loc130)) +#loc433 = loc("offset"(#loc131)) +#loc434 = loc("cur_block"(#loc132)) +#loc435 = loc("cur_block"(#loc133)) +#loc436 = loc("next_block"(#loc134)) +#loc437 = loc("next_block"(#loc135)) +#loc438 = loc("next_block"(#loc136)) +#loc439 = loc("next_block"(#loc137)) +#loc440 = loc("needs_jump"(#loc138)) +#loc441 = loc("needs_jump"(#loc139)) +#loc442 = loc("needs_jump"(#loc140)) +#loc443 = loc("jump_to_block"(#loc141)) +#loc444 = loc("jump_to_block"(#loc142)) +#loc445 = loc("jump_to_block"(#loc143)) +#loc446 = loc("offset"(#loc144)) +#loc447 = loc("offset"(#loc145)) +#loc448 = loc("offset"(#loc146)) +#loc449 = loc("offset"(#loc147)) +#loc450 = loc("kT_ptrs"(#loc148)) +#loc451 = loc("kv_indices"(#loc149)) +#loc452 = loc("kv_start"(#loc150)) +#loc453 = loc("kv_start"(#loc151)) +#loc454 = loc("sparse_kv_num_blocks"(#loc152)) +#loc455 = loc("sparse_kv_num_blocks"(#loc153)) +#loc456 = loc("offs_n2"(#loc154)) +#loc457 = loc("dq"(#loc155)) +#loc458 = loc("dq_ptrs"(#loc156)) +#loc459 = loc("dq_ptrs"(#loc157)) +#loc460 = loc("dq"(#loc158)) +#loc461 = loc("start_n1"(#loc160)) +#loc462 = loc("offs_n1"(#loc161)) +#loc463 = loc("k"(#loc162)) +#loc464 = loc("v"(#loc163)) +#loc465 = loc("off_hq1"(#loc164)) +#loc466 = loc("q_adj1"(#loc165)) +#loc467 = loc("off_chz1"(#loc166)) +#loc468 = loc("sparse_q_num_blks_offset"(#loc167)) +#loc469 = loc("sparse_q_num_blks_offset"(#loc168)) +#loc470 = loc("sparse_q_idx_offset"(#loc169)) +#loc471 = loc("sparse_q_idx_offset"(#loc170)) +#loc472 = loc("sparse_q_idx_offset"(#loc171)) +#loc473 = loc("q_indices"(#loc172)) +#loc474 = loc("q_start"(#loc173)) +#loc475 = loc("q_start"(#loc174)) +#loc476 = loc("sparse_q_num_blocks"(#loc175)) +#loc477 = loc("sparse_q_num_blocks"(#loc176)) +#loc478 = loc("offs_m1"(#loc177)) +#loc479 = loc("offs_m1"(#loc178)) +#loc480 = loc("qT_ptrs"(#loc179)) +#loc481 = loc("qT_ptrs"(#loc181)) +#loc482 = loc("qT_ptrs"(#loc182)) +#loc483 = loc("qT_ptrs"(#loc183)) +#loc484 = loc("do_ptrs"(#loc184)) +#loc485 = loc("do_ptrs"(#loc185)) +#loc486 = loc("do_ptrs"(#loc186)) +#loc487 = loc("hi"(#loc187)) +#loc488 = loc("hi"(#loc188)) +#loc489 = loc("tmp44"(#loc189)) +#loc490 = loc(callsite(#loc190 at #loc180)) +#loc491 = loc("tmp45"(#loc191)) +#loc492 = loc("tmp47"(#loc192)) +#loc493 = loc("dk"(#loc193)) +#loc494 = loc("tmp47"(#loc194)) +#loc495 = loc("tmp48"(#loc195)) +#loc496 = loc("tmp50"(#loc196)) +#loc497 = loc("tmp51"(#loc197)) +#loc498 = loc("tmp55"(#loc198)) +#loc499 = loc("tmp56"(#loc199)) +#loc500 = loc("tmp58"(#loc200)) +#loc501 = loc("tmp59"(#loc201)) +#loc502 = loc("tmp62"(#loc202)) +#loc503 = loc("tmp63"(#loc203)) +#loc504 = loc("tmp64"(#loc204)) +#loc505 = loc("tmp65"(#loc205)) +#loc506 = loc("tmp66"(#loc206)) +#loc507 = loc("tmp67"(#loc207)) +#loc508 = loc("tmp77"(#loc208)) +#loc509 = loc("q_indices"(#loc209)) +#loc510 = loc("q_start"(#loc210)) +#loc511 = loc("q_start"(#loc211)) +#loc512 = loc("sparse_q_num_blocks"(#loc212)) +#loc513 = loc("sparse_q_num_blocks"(#loc213)) +#loc514 = loc("offs_m1"(#loc214)) +#loc515 = loc("qkT"(#loc216)) +#loc516 = loc("dv"(#loc217)) +#loc517 = loc("off_hq1"(#loc218)) +#loc518 = loc("q_adj1"(#loc219)) +#loc519 = loc("q_adj1"(#loc220)) +#loc520 = loc("q_adj1"(#loc221)) +#loc521 = loc("do_adj1"(#loc222)) +#loc522 = loc("do_adj1"(#loc223)) +#loc523 = loc("do_adj1"(#loc224)) +#loc524 = loc("off_chz1"(#loc225)) +#loc525 = loc("off_chz1"(#loc226)) +#loc526 = loc("off_chz1"(#loc227)) +#loc527 = loc("Q1"(#loc228)) +#loc528 = loc("DO1"(#loc229)) +#loc529 = loc("LSE1"(#loc230)) +#loc530 = loc("DELTA1"(#loc231)) +#loc531 = loc("qT_ptrs"(#loc232)) +#loc532 = loc("do_ptrs"(#loc233)) +#loc533 = loc("lse"(#loc234)) +#loc534 = loc("Di"(#loc235)) +#loc535 = loc("qT"(#loc236)) +#loc536 = loc("lse"(#loc237)) +#loc537 = loc("do"(#loc238)) +#loc538 = loc("Di"(#loc239)) +#loc539 = loc("qT_ptrs"(#loc240)) +#loc540 = loc("do_ptrs"(#loc241)) +#loc541 = loc("offs_m1"(#loc242)) +#loc542 = loc("dk"(#loc243)) +#loc543 = loc("lse"(#loc244)) +#loc544 = loc("lse"(#loc245)) +#loc545 = loc("qkT"(#loc246)) +#loc546 = loc("m"(#loc247)) +#loc547 = loc("tmp49"(#loc248)) +#loc548 = loc("tmp52"(#loc249)) +#loc549 = loc("tmp68"(#loc250)) +#loc550 = loc("tmp69"(#loc251)) +#loc551 = loc("tmp70"(#loc252)) +#loc552 = loc("tmp71"(#loc253)) +#loc553 = loc("tmp73"(#loc254)) +#loc554 = loc("tmp74"(#loc255)) +#loc555 = loc("tmp75"(#loc256)) +#loc556 = loc("tmp76"(#loc257)) +#loc557 = loc("tmp78"(#loc258)) +#loc558 = loc("post_mod_scores"(#loc259)) +#loc559 = loc("post_mod_scores"(#loc260)) +#loc560 = loc("pT"(#loc261)) +#loc561 = loc("pT"(#loc262)) +#loc562 = loc("pT"(#loc263)) +#loc563 = loc("dpT"(#loc264)) +#loc564 = loc("dv"(#loc265)) +#loc565 = loc("dv"(#loc266)) +#loc566 = loc("dpT"(#loc267)) +#loc567 = loc("dsT"(#loc268)) +#loc568 = loc("dsT"(#loc269)) +#loc569 = loc("dsT"(#loc270)) +#loc570 = loc("dsT"(#loc271)) +#loc571 = loc("dk"(#loc272)) +#loc572 = loc("dk"(#loc273)) +#loc573 = loc("offset"(#loc274)) +#loc574 = loc("qT_ptrs"(#loc275)) +#loc575 = loc("do_ptrs"(#loc276)) +#loc576 = loc(callsite(#loc190 at #loc215)) +#loc577 = loc("dv_ptrs"(#loc278)) +#loc578 = loc("dv_ptrs"(#loc279)) +#loc579 = loc("dk"(#loc281)) +#loc580 = loc("mask"(#loc282)) +#loc581 = loc("xindex"(#loc283)) +#loc582 = loc("xindex"(#loc284)) +#loc583 = loc("xindex"(#loc285)) +#loc584 = loc(callsite(#loc350 at #loc351)) +#loc585 = loc(callsite(#loc352 at #loc351)) +#loc586 = loc(callsite(#loc353 at #loc351)) +#loc587 = loc(callsite(#loc354 at #loc351)) +#loc588 = loc(callsite(#loc355 at #loc351)) +#loc589 = loc(callsite(#loc53 at #loc351)) +#loc590 = loc(callsite(#loc352 at #loc356)) +#loc591 = loc(callsite(#loc353 at #loc356)) +#loc592 = loc(callsite(#loc355 at #loc356)) +#loc593 = loc(callsite(#loc53 at #loc356)) +#loc594 = loc(callsite(#loc371 at #loc372)) +#loc595 = loc(callsite(#loc373 at #loc372)) +#loc596 = loc(callsite(#loc374 at #loc372)) +#loc597 = loc(callsite(#loc375 at #loc372)) +#loc598 = loc(callsite(#loc376 at #loc372)) +#loc599 = loc(callsite(#loc377 at #loc372)) +#loc600 = loc(callsite(#loc378 at #loc372)) +#loc601 = loc(callsite(#loc379 at #loc372)) +#loc602 = loc(callsite(#loc380 at #loc372)) +#loc603 = loc(callsite(#loc382 at #loc372)) +#loc604 = loc("offs_n2"(#loc384)) +#loc605 = loc(callsite(#loc395 at #loc372)) +#loc606 = loc(callsite(#loc396 at #loc372)) +#loc607 = loc(callsite(#loc431 at #loc372)) +#loc608 = loc(callsite(#loc433 at #loc372)) +#loc609 = loc(callsite(#loc450 at #loc372)) +#loc610 = loc(callsite(#loc371 at #loc457)) +#loc611 = loc(callsite(#loc373 at #loc457)) +#loc612 = loc(callsite(#loc374 at #loc457)) +#loc613 = loc(callsite(#loc376 at #loc457)) +#loc614 = loc(callsite(#loc377 at #loc457)) +#loc615 = loc(callsite(#loc378 at #loc457)) +#loc616 = loc(callsite(#loc379 at #loc457)) +#loc617 = loc(callsite(#loc380 at #loc457)) +#loc618 = loc(callsite(#loc382 at #loc457)) +#loc619 = loc(callsite(#loc395 at #loc457)) +#loc620 = loc(callsite(#loc396 at #loc457)) +#loc621 = loc(callsite(#loc433 at #loc457)) +#loc622 = loc(callsite(#loc450 at #loc457)) +#loc623 = loc(callsite(#loc350 at #loc463)) +#loc624 = loc(callsite(#loc352 at #loc463)) +#loc625 = loc(callsite(#loc353 at #loc463)) +#loc626 = loc(callsite(#loc354 at #loc463)) +#loc627 = loc(callsite(#loc355 at #loc463)) +#loc628 = loc(callsite(#loc53 at #loc463)) +#loc629 = loc(callsite(#loc353 at #loc464)) +#loc630 = loc(callsite(#loc355 at #loc464)) +#loc631 = loc(callsite(#loc53 at #loc464)) +#loc632 = loc(callsite(#loc480 at #loc180)) +#loc633 = loc(callsite(#loc481 at #loc180)) +#loc634 = loc(callsite(#loc482 at #loc180)) +#loc635 = loc(callsite(#loc483 at #loc180)) +#loc636 = loc(callsite(#loc484 at #loc180)) +#loc637 = loc(callsite(#loc485 at #loc180)) +#loc638 = loc(callsite(#loc486 at #loc180)) +#loc639 = loc(callsite(#loc487 at #loc180)) +#loc640 = loc(callsite(#loc488 at #loc180)) +#loc641 = loc(callsite(#loc489 at #loc490)) +#loc642 = loc(callsite(#loc491 at #loc490)) +#loc643 = loc(callsite(#loc492 at #loc490)) +#loc644 = loc("dv"(#loc493)) +#loc645 = loc(callsite(#loc494 at #loc490)) +#loc646 = loc(callsite(#loc495 at #loc490)) +#loc647 = loc(callsite(#loc496 at #loc490)) +#loc648 = loc(callsite(#loc497 at #loc490)) +#loc649 = loc(callsite(#loc498 at #loc490)) +#loc650 = loc(callsite(#loc499 at #loc490)) +#loc651 = loc(callsite(#loc500 at #loc490)) +#loc652 = loc(callsite(#loc501 at #loc490)) +#loc653 = loc(callsite(#loc502 at #loc490)) +#loc654 = loc(callsite(#loc503 at #loc490)) +#loc655 = loc(callsite(#loc504 at #loc490)) +#loc656 = loc(callsite(#loc505 at #loc490)) +#loc657 = loc(callsite(#loc506 at #loc490)) +#loc658 = loc(callsite(#loc507 at #loc490)) +#loc659 = loc(callsite(#loc508 at #loc490)) +#loc660 = loc(callsite(#loc480 at #loc215)) +#loc661 = loc(callsite(#loc481 at #loc215)) +#loc662 = loc(callsite(#loc484 at #loc215)) +#loc663 = loc(callsite(#loc485 at #loc215)) +#loc664 = loc(callsite(#loc487 at #loc215)) +#loc665 = loc(callsite(#loc488 at #loc215)) +#loc666 = loc(callsite(#loc515 at #loc490)) +#loc667 = loc("dk"(#loc516)) +#loc668 = loc(callsite(#loc531 at #loc180)) +#loc669 = loc(callsite(#loc532 at #loc180)) +#loc670 = loc(callsite(#loc533 at #loc490)) +#loc671 = loc(callsite(#loc534 at #loc490)) +#loc672 = loc(callsite(#loc535 at #loc490)) +#loc673 = loc(callsite(#loc536 at #loc490)) +#loc674 = loc(callsite(#loc537 at #loc490)) +#loc675 = loc(callsite(#loc538 at #loc490)) +#loc676 = loc(callsite(#loc539 at #loc180)) +#loc677 = loc(callsite(#loc540 at #loc180)) +#loc678 = loc(callsite(#loc541 at #loc180)) +#loc679 = loc(callsite(#loc542 at #loc490)) +#loc680 = loc(callsite(#loc543 at #loc490)) +#loc681 = loc(callsite(#loc544 at #loc490)) +#loc682 = loc(callsite(#loc545 at #loc490)) +#loc683 = loc(callsite(#loc546 at #loc490)) +#loc684 = loc(callsite(#loc547 at #loc490)) +#loc685 = loc(callsite(#loc548 at #loc490)) +#loc686 = loc(callsite(#loc549 at #loc490)) +#loc687 = loc(callsite(#loc550 at #loc490)) +#loc688 = loc(callsite(#loc551 at #loc490)) +#loc689 = loc(callsite(#loc552 at #loc490)) +#loc690 = loc(callsite(#loc553 at #loc490)) +#loc691 = loc(callsite(#loc554 at #loc490)) +#loc692 = loc(callsite(#loc555 at #loc490)) +#loc693 = loc(callsite(#loc556 at #loc490)) +#loc694 = loc(callsite(#loc557 at #loc490)) +#loc695 = loc(callsite(#loc558 at #loc490)) +#loc696 = loc(callsite(#loc559 at #loc490)) +#loc697 = loc(callsite(#loc560 at #loc490)) +#loc698 = loc(callsite(#loc561 at #loc490)) +#loc699 = loc(callsite(#loc562 at #loc490)) +#loc700 = loc(callsite(#loc563 at #loc490)) +#loc701 = loc(callsite(#loc564 at #loc490)) +#loc702 = loc(callsite(#loc565 at #loc490)) +#loc703 = loc(callsite(#loc566 at #loc490)) +#loc704 = loc(callsite(#loc567 at #loc490)) +#loc705 = loc(callsite(#loc568 at #loc490)) +#loc706 = loc(callsite(#loc569 at #loc490)) +#loc707 = loc(callsite(#loc570 at #loc490)) +#loc708 = loc(callsite(#loc571 at #loc490)) +#loc709 = loc(callsite(#loc572 at #loc490)) +#loc710 = loc(callsite(#loc573 at #loc180)) +#loc711 = loc(callsite(#loc574 at #loc180)) +#loc712 = loc(callsite(#loc575 at #loc180)) +#loc713 = loc(callsite(#loc531 at #loc215)) +#loc714 = loc(callsite(#loc483 at #loc215)) +#loc715 = loc(callsite(#loc532 at #loc215)) +#loc716 = loc(callsite(#loc486 at #loc215)) +#loc717 = loc(callsite(#loc535 at #loc576)) +#loc718 = loc(callsite(#loc536 at #loc576)) +#loc719 = loc(callsite(#loc537 at #loc576)) +#loc720 = loc(callsite(#loc538 at #loc576)) +#loc721 = loc(callsite(#loc533 at #loc576)) +#loc722 = loc(callsite(#loc534 at #loc576)) +#loc723 = loc(callsite(#loc539 at #loc215)) +#loc724 = loc(callsite(#loc540 at #loc215)) +#loc725 = loc(callsite(#loc541 at #loc215)) +#loc726 = loc(callsite(#loc542 at #loc576)) +#loc727 = loc(callsite(#loc543 at #loc576)) +#loc728 = loc(callsite(#loc544 at #loc576)) +#loc729 = loc(callsite(#loc515 at #loc576)) +#loc730 = loc(callsite(#loc545 at #loc576)) +#loc731 = loc(callsite(#loc559 at #loc576)) +#loc732 = loc(callsite(#loc560 at #loc576)) +#loc733 = loc(callsite(#loc561 at #loc576)) +#loc734 = loc(callsite(#loc562 at #loc576)) +#loc735 = loc(callsite(#loc563 at #loc576)) +#loc736 = loc(callsite(#loc564 at #loc576)) +#loc737 = loc(callsite(#loc565 at #loc576)) +#loc738 = loc(callsite(#loc566 at #loc576)) +#loc739 = loc(callsite(#loc567 at #loc576)) +#loc740 = loc(callsite(#loc568 at #loc576)) +#loc741 = loc(callsite(#loc569 at #loc576)) +#loc742 = loc(callsite(#loc571 at #loc576)) +#loc743 = loc(callsite(#loc572 at #loc576)) +#loc744 = loc(callsite(#loc573 at #loc215)) +#loc745 = loc(callsite(#loc574 at #loc215)) +#loc746 = loc(callsite(#loc575 at #loc215)) +#loc747 = loc(callsite(#loc381 at #loc603)) +#loc748 = loc(callsite(#loc383 at #loc603)) +#loc749 = loc("kT_ptrs"(#loc604)) +#loc750 = loc(callsite(#loc385 at #loc603)) +#loc751 = loc(callsite(#loc386 at #loc603)) +#loc752 = loc(callsite(#loc387 at #loc603)) +#loc753 = loc(callsite(#loc388 at #loc603)) +#loc754 = loc(callsite(#loc389 at #loc603)) +#loc755 = loc(callsite(#loc390 at #loc603)) +#loc756 = loc(callsite(#loc391 at #loc603)) +#loc757 = loc(callsite(#loc392 at #loc603)) +#loc758 = loc(callsite(#loc393 at #loc603)) +#loc759 = loc(callsite(#loc394 at #loc603)) +#loc760 = loc(callsite(#loc397 at #loc603)) +#loc761 = loc(callsite(#loc398 at #loc603)) +#loc762 = loc(callsite(#loc399 at #loc603)) +#loc763 = loc(callsite(#loc400 at #loc603)) +#loc764 = loc(callsite(#loc401 at #loc603)) +#loc765 = loc(callsite(#loc402 at #loc603)) +#loc766 = loc(callsite(#loc403 at #loc603)) +#loc767 = loc(callsite(#loc404 at #loc603)) +#loc768 = loc(callsite(#loc405 at #loc603)) +#loc769 = loc(callsite(#loc406 at #loc603)) +#loc770 = loc(callsite(#loc407 at #loc603)) +#loc771 = loc(callsite(#loc408 at #loc603)) +#loc772 = loc(callsite(#loc409 at #loc603)) +#loc773 = loc(callsite(#loc410 at #loc603)) +#loc774 = loc(callsite(#loc411 at #loc603)) +#loc775 = loc(callsite(#loc412 at #loc603)) +#loc776 = loc(callsite(#loc413 at #loc603)) +#loc777 = loc(callsite(#loc414 at #loc603)) +#loc778 = loc(callsite(#loc415 at #loc603)) +#loc779 = loc(callsite(#loc416 at #loc603)) +#loc780 = loc(callsite(#loc417 at #loc603)) +#loc781 = loc(callsite(#loc418 at #loc603)) +#loc782 = loc(callsite(#loc419 at #loc603)) +#loc783 = loc(callsite(#loc420 at #loc603)) +#loc784 = loc(callsite(#loc421 at #loc603)) +#loc785 = loc(callsite(#loc422 at #loc603)) +#loc786 = loc(callsite(#loc423 at #loc603)) +#loc787 = loc(callsite(#loc424 at #loc603)) +#loc788 = loc(callsite(#loc425 at #loc603)) +#loc789 = loc(callsite(#loc426 at #loc603)) +#loc790 = loc(callsite(#loc427 at #loc603)) +#loc791 = loc(callsite(#loc428 at #loc603)) +#loc792 = loc(callsite(#loc429 at #loc603)) +#loc793 = loc(callsite(#loc430 at #loc603)) +#loc794 = loc(callsite(#loc432 at #loc608)) +#loc795 = loc(callsite(#loc434 at #loc608)) +#loc796 = loc(callsite(#loc435 at #loc608)) +#loc797 = loc(callsite(#loc436 at #loc608)) +#loc798 = loc(callsite(#loc437 at #loc608)) +#loc799 = loc(callsite(#loc438 at #loc608)) +#loc800 = loc(callsite(#loc439 at #loc608)) +#loc801 = loc(callsite(#loc440 at #loc608)) +#loc802 = loc(callsite(#loc441 at #loc608)) +#loc803 = loc(callsite(#loc442 at #loc608)) +#loc804 = loc(callsite(#loc443 at #loc608)) +#loc805 = loc(callsite(#loc444 at #loc608)) +#loc806 = loc(callsite(#loc445 at #loc608)) +#loc807 = loc(callsite(#loc446 at #loc608)) +#loc808 = loc(callsite(#loc447 at #loc608)) +#loc809 = loc(callsite(#loc448 at #loc608)) +#loc810 = loc(callsite(#loc449 at #loc608)) +#loc811 = loc(callsite(#loc393 at #loc618)) +#loc812 = loc(callsite(#loc394 at #loc618)) +#loc813 = loc(callsite(#loc397 at #loc618)) +#loc814 = loc(callsite(#loc398 at #loc618)) +#loc815 = loc(callsite(#loc399 at #loc618)) +#loc816 = loc(callsite(#loc424 at #loc618)) +#loc817 = loc(callsite(#loc390 at #loc618)) +#loc818 = loc(callsite(#loc425 at #loc618)) +#loc819 = loc(callsite(#loc426 at #loc618)) +#loc820 = loc(callsite(#loc392 at #loc618)) +#loc821 = loc(callsite(#loc427 at #loc618)) +#loc822 = loc(callsite(#loc429 at #loc618)) +#loc823 = loc(callsite(#loc430 at #loc618)) +#loc824 = loc(callsite(#loc432 at #loc621)) +#loc825 = loc(callsite(#loc434 at #loc621)) +#loc826 = loc(callsite(#loc435 at #loc621)) +#loc827 = loc(callsite(#loc436 at #loc621)) +#loc828 = loc(callsite(#loc437 at #loc621)) +#loc829 = loc(callsite(#loc438 at #loc621)) +#loc830 = loc(callsite(#loc439 at #loc621)) +#loc831 = loc(callsite(#loc440 at #loc621)) +#loc832 = loc(callsite(#loc441 at #loc621)) +#loc833 = loc(callsite(#loc442 at #loc621)) +#loc834 = loc(callsite(#loc443 at #loc621)) +#loc835 = loc(callsite(#loc444 at #loc621)) +#loc836 = loc(callsite(#loc445 at #loc621)) +#loc837 = loc(callsite(#loc446 at #loc621)) +#loc838 = loc(callsite(#loc447 at #loc621)) +#loc839 = loc(callsite(#loc448 at #loc621)) +#loc840 = loc(callsite(#loc449 at #loc621)) +#loc841 = loc("offs_m1"(#loc644)) +#loc842 = loc(callsite(#loc53 at #loc672)) +#loc843 = loc(callsite(#loc53 at #loc674)) +#loc844 = loc(callsite(#loc432 at #loc710)) +#loc845 = loc(callsite(#loc434 at #loc710)) +#loc846 = loc(callsite(#loc435 at #loc710)) +#loc847 = loc(callsite(#loc436 at #loc710)) +#loc848 = loc(callsite(#loc437 at #loc710)) +#loc849 = loc(callsite(#loc438 at #loc710)) +#loc850 = loc(callsite(#loc439 at #loc710)) +#loc851 = loc(callsite(#loc440 at #loc710)) +#loc852 = loc(callsite(#loc441 at #loc710)) +#loc853 = loc(callsite(#loc442 at #loc710)) +#loc854 = loc(callsite(#loc443 at #loc710)) +#loc855 = loc(callsite(#loc444 at #loc710)) +#loc856 = loc(callsite(#loc445 at #loc710)) +#loc857 = loc(callsite(#loc446 at #loc710)) +#loc858 = loc(callsite(#loc447 at #loc710)) +#loc859 = loc(callsite(#loc448 at #loc710)) +#loc860 = loc(callsite(#loc449 at #loc710)) +#loc861 = loc(callsite(#loc53 at #loc717)) +#loc862 = loc(callsite(#loc53 at #loc719)) +#loc863 = loc(callsite(#loc432 at #loc744)) +#loc864 = loc(callsite(#loc434 at #loc744)) +#loc865 = loc(callsite(#loc435 at #loc744)) +#loc866 = loc(callsite(#loc436 at #loc744)) +#loc867 = loc(callsite(#loc437 at #loc744)) +#loc868 = loc(callsite(#loc438 at #loc744)) +#loc869 = loc(callsite(#loc439 at #loc744)) +#loc870 = loc(callsite(#loc440 at #loc744)) +#loc871 = loc(callsite(#loc441 at #loc744)) +#loc872 = loc(callsite(#loc442 at #loc744)) +#loc873 = loc(callsite(#loc443 at #loc744)) +#loc874 = loc(callsite(#loc444 at #loc744)) +#loc875 = loc(callsite(#loc445 at #loc744)) +#loc876 = loc(callsite(#loc446 at #loc744)) +#loc877 = loc(callsite(#loc447 at #loc744)) +#loc878 = loc(callsite(#loc448 at #loc744)) +#loc879 = loc(callsite(#loc449 at #loc744)) +#loc880 = loc("vT_ptrs"(#loc749)) +#loc881 = loc(callsite(#loc53 at #loc758)) +#loc882 = loc(callsite(#loc53 at #loc759)) +#loc883 = loc(callsite(#loc53 at #loc811)) +#loc884 = loc(callsite(#loc53 at #loc812)) +#loc885 = loc("qT_ptrs"(#loc841)) +#loc886 = loc(callsite(#loc880 at #loc372)) +#loc887 = loc(callsite(#loc880 at #loc457)) +#loc888 = loc("do_ptrs"(#loc885)) +#loc889 = loc(callsite(#loc888 at #loc180)) +#loc890 = loc(callsite(#loc888 at #loc215)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c0065f67545f46007f5302ebcf19e40c771a9176 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2J5U2YWHZDEYPTTFYZZF7L472MAKN4PYN3O5UNMFLMRH5VW6ZZQA/triton_tem_fused_zeros_1.ttir @@ -0,0 +1,1441 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":18:0) +#loc291 = loc("arg_Q"(#loc)) +#loc292 = loc("arg_K"(#loc)) +#loc293 = loc("arg_V"(#loc)) +#loc294 = loc("arg_LSE"(#loc)) +#loc295 = loc("arg_DELTA"(#loc)) +#loc296 = loc("arg_DO"(#loc)) +#loc297 = loc("arg_DQ"(#loc)) +#loc298 = loc("arg_DV"(#loc)) +#loc299 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc300 = loc("arg_KV_IDX"(#loc)) +#loc301 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc302 = loc("arg_Q_IDX"(#loc)) +#loc303 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc304 = loc("arg_FULL_KV_IDX"(#loc)) +#loc305 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc306 = loc("arg_FULL_Q_IDX"(#loc)) +#loc307 = loc("in_ptr16"(#loc)) +#loc308 = loc("out_ptr0"(#loc)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc1) + %cst_6 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_8 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_14 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_16 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc1) + %cst_17 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %pid = tt.get_program_id x : i32 loc(#loc309) + %off_zq = tt.get_program_id y : i32 loc(#loc310) + %off_hkv = tt.get_program_id z : i32 loc(#loc311) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc312) + %k_adj = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc313) + %k_adj_20 = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc314) + %k_adj_21 = arith.addi %k_adj, %k_adj_20 : i32 loc(#loc315) + %k_adj_22 = arith.extsi %k_adj_21 : i32 to i64 loc(#loc316) + %dv_adj = arith.muli %off_zq, %c2097152_i32 : i32 loc(#loc317) + %dv_adj_23 = arith.addi %k_adj, %dv_adj : i32 loc(#loc318) + %dv_adj_24 = arith.extsi %dv_adj_23 : i32 to i64 loc(#loc319) + %K = tt.addptr %arg_K, %k_adj_22 : !tt.ptr, i64 loc(#loc320) + %V = tt.addptr %arg_V, %k_adj_22 : !tt.ptr, i64 loc(#loc321) + %DV = tt.addptr %arg_DV, %dv_adj_24 : !tt.ptr, i64 loc(#loc322) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc323) + %0 = arith.cmpi sge, %pid, %c16_i32 : i32 loc(#loc17) + scf.if %0 { + %off_pid = arith.subi %pid, %c16_i32 : i32 loc(#loc324) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc325) + %off_hq2_25 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc326) + %off_hq2_26 = arith.addi %off_hq2, %off_hq2_25 : i32 loc(#loc327) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc328) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc329) + %sparse_kv_num_blks_offset_27 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc330) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc331) + %sparse_kv_idx_offset_28 = arith.muli %start_m2_block, %c16_i32 : i32 loc(#loc332) + %sparse_kv_idx_offset_29 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_28 : i32 loc(#loc333) + %q_adj2 = arith.muli %off_hq2_26, %c128_i32 : i32 loc(#loc334) + %q_adj2_30 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc335) + %q_adj2_31 = arith.addi %q_adj2, %q_adj2_30 : i32 loc(#loc336) + %q_adj2_32 = arith.extsi %q_adj2_31 : i32 to i64 loc(#loc337) + %do_adj2 = arith.muli %off_hq2_26, %c262144_i32 : i32 loc(#loc338) + %do_adj2_33 = arith.addi %do_adj2, %q_adj2_30 : i32 loc(#loc339) + %do_adj2_34 = arith.extsi %do_adj2_33 : i32 to i64 loc(#loc340) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc341) + %off_chz2_35 = arith.addi %off_chz2, %off_hq2_26 : i32 loc(#loc342) + %off_chz2_36 = arith.muli %off_chz2_35, %c2048_i32 : i32 loc(#loc343) + %off_chz2_37 = arith.extsi %off_chz2_36 : i32 to i64 loc(#loc344) + %Q2 = tt.addptr %arg_Q, %q_adj2_32 : !tt.ptr, i64 loc(#loc345) + %DO2 = tt.addptr %arg_DO, %do_adj2_34 : !tt.ptr, i64 loc(#loc346) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_32 : !tt.ptr, i64 loc(#loc347) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_37 : !tt.ptr, i64 loc(#loc348) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_37 : !tt.ptr, i64 loc(#loc349) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc350) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32> loc(#loc351) + %offs_m2_38 = arith.addi %offs_m2, %offs_k : tensor<128xi32> loc(#loc351) + %ptr = tt.expand_dims %offs_m2_38 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc588) + %ptr_39 = arith.muli %ptr, %cst_17 : tensor<128x1xi32> loc(#loc589) + %ptr_40 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc590) + %ptr_41 = tt.addptr %ptr_40, %ptr_39 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc590) + %ptr_42 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc591) + %ptr_43 = tt.broadcast %ptr_41 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc592) + %ptr_44 = tt.broadcast %ptr_42 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc592) + %ptr_45 = tt.addptr %ptr_43, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc592) + %q = tt.load %ptr_45 : tensor<128x128x!tt.ptr> loc(#loc593) + %ptr_46 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc594) + %ptr_47 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc595) + %ptr_48 = tt.addptr %ptr_47, %ptr_46 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc595) + %ptr_49 = tt.broadcast %ptr_48 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc596) + %ptr_50 = tt.addptr %ptr_49, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc596) + %do = tt.load %ptr_50 : tensor<128x128x!tt.ptr> loc(#loc597) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc359) + %Di_51 = tt.addptr %Di, %offs_m2_38 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc359) + %Di_52 = tt.load %Di_51 : tensor<128x!tt.ptr> loc(#loc360) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc361) + %lse_53 = tt.addptr %lse, %offs_m2_38 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc361) + %lse_54 = tt.load %lse_53 : tensor<128x!tt.ptr> loc(#loc362) + %lse_55 = arith.cmpf oeq, %lse_54, %cst_19 : tensor<128xf32> loc(#loc363) + %lse_56 = arith.select %lse_55, %cst_18, %lse_54 : tensor<128xi1>, tensor<128xf32> loc(#loc364) + %lse_57 = tt.expand_dims %lse_56 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc365) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_29 : !tt.ptr, i32 loc(#loc366) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc367) + %kv_start_58 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc368) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_27 : !tt.ptr, i32 loc(#loc369) + %sparse_kv_num_blocks_59 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc370) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc371) + %offs_n2_60 = tt.splat %kv_start_58 : i32 -> tensor<64xi32> loc(#loc372) + %offs_n2_61 = arith.addi %offs_n2_60, %offs_n2 : tensor<64xi32> loc(#loc372) + %kT_ptrs = tt.expand_dims %offs_n2_61 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc598) + %kT_ptrs_62 = arith.muli %kT_ptrs, %cst_1 : tensor<1x64xi32> loc(#loc599) + %kT_ptrs_63 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc600) + %kT_ptrs_64 = tt.addptr %kT_ptrs_63, %kT_ptrs_62 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc600) + %kT_ptrs_65 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc601) + %kT_ptrs_66 = tt.broadcast %kT_ptrs_64 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc602) + %kT_ptrs_67 = tt.broadcast %kT_ptrs_65 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc602) + %kT_ptrs_68 = tt.addptr %kT_ptrs_66, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc602) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc603) + %vT_ptrs_69 = tt.addptr %vT_ptrs, %kT_ptrs_62 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc603) + %vT_ptrs_70 = tt.broadcast %vT_ptrs_69 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc604) + %vT_ptrs_71 = tt.addptr %vT_ptrs_70, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc604) + %hi = arith.muli %sparse_kv_num_blocks_59, %c2_i32 : i32 loc(#loc605) + %hi_72 = arith.minsi %hi, %c32_i32 : i32 loc(#loc606) + %vT_ptrs_73:4 = scf.for %start_n = %c0_i32 to %hi_72 step %c1_i32 iter_args(%dq_95 = %cst_13, %offs_n2_96 = %offs_n2_61, %kT_ptrs_97 = %kT_ptrs_68, %vT_ptrs_98 = %vT_ptrs_71) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.load %kT_ptrs_97 : tensor<128x64x!tt.ptr> loc(#loc889) + %qk = tt.dot %q, %kT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc756) + %qk_99 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc757) + %n = tt.expand_dims %offs_n2_96 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc758) + %tmp4 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %tmp4_100 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc759) + %tmp4_101 = arith.cmpi sge, %tmp4, %tmp4_100 : tensor<128x64xi32> loc(#loc759) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc760) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc761) + %tmp7_102 = tt.load %tmp7 : !tt.ptr loc(#loc762) + %tmp8 = tt.splat %tmp7_102 : i64 -> tensor<1x64xi64> loc(#loc763) + %tmp8_103 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc763) + %tmp9 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc764) + %tmp10 = tt.splat %tmp7_102 : i64 -> tensor<128x1xi64> loc(#loc765) + %tmp10_104 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc765) + %tmp11 = tt.broadcast %tmp8_103 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc766) + %tmp11_105 = tt.broadcast %tmp10_104 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc766) + %tmp11_106 = arith.andi %tmp11, %tmp11_105 : tensor<128x64xi1> loc(#loc766) + %tmp12 = arith.andi %tmp4_101, %tmp11_106 : tensor<128x64xi1> loc(#loc767) + %tmp15 = arith.cmpi sge, %n, %cst_6 : tensor<1x64xi32> loc(#loc768) + %tmp16 = arith.remsi %n, %cst_6 : tensor<1x64xi32> loc(#loc769) + %tmp18 = arith.cmpi ne, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc770) + %tmp19 = arith.cmpi slt, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc771) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc772) + %tmp23 = arith.addi %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc773) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc774) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc775) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc776) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc777) + %tmp28 = arith.subi %tmp4_100, %tmp4 : tensor<128x64xi32> loc(#loc778) + %tmp29 = arith.remsi %tmp28, %cst_5 : tensor<128x64xi32> loc(#loc779) + %tmp30 = arith.cmpi ne, %tmp29, %cst_9 : tensor<128x64xi32> loc(#loc780) + %tmp31 = arith.cmpi slt, %tmp29, %cst_9 : tensor<128x64xi32> loc(#loc781) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc782) + %tmp34 = arith.addi %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc783) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc784) + %tmp36 = arith.cmpi eq, %tmp35, %cst_9 : tensor<128x64xi32> loc(#loc785) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc786) + %tmp37_107 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc786) + %tmp38 = arith.ori %tmp12, %tmp37_107 : tensor<128x64xi1> loc(#loc787) + %post_mod_scores = arith.select %tmp38, %qk_99, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc788) + %post_mod_scores_108 = arith.mulf %post_mod_scores, %cst_7 : tensor<128x64xf32> loc(#loc789) + %p = tt.broadcast %lse_57 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc790) + %p_109 = arith.subf %post_mod_scores_108, %p : tensor<128x64xf32> loc(#loc790) + %p_110 = math.exp2 %p_109 : tensor<128x64xf32> loc(#loc791) + %vT = tt.load %vT_ptrs_98 : tensor<128x64x!tt.ptr> loc(#loc890) + %dp = tt.dot %do, %vT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc793) + %ds = tt.expand_dims %Di_52 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc794) + %ds_111 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc795) + %ds_112 = arith.subf %dp, %ds_111 : tensor<128x64xf32> loc(#loc795) + %ds_113 = arith.mulf %p_110, %ds_112 : tensor<128x64xf32> loc(#loc796) + %ds_114 = arith.select %tmp38, %ds_113, %cst_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc797) + %ds_115 = arith.truncf %ds_114 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc798) + %dq_116 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc799) + %dq_117 = tt.dot %ds_115, %dq_116, %dq_95, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc800) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc801) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc802) + %cur_block_118 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc803) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc804) + %next_block_119 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_59 : i32 loc(#loc805) + %next_block_120 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc806) + %next_block_121 = tt.load %next_block_120, %next_block_119 evictionPolicy = evict_last : !tt.ptr loc(#loc807) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc808) + %needs_jump_122 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc809) + %needs_jump_123 = arith.cmpi eq, %needs_jump_122, %c0_i32 : i32 loc(#loc810) + %jump_to_block = arith.subi %next_block_121, %cur_block_118 : i32 loc(#loc811) + %jump_to_block_124 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc812) + %jump_to_block_125 = arith.subi %jump_to_block_124, %c64_i32 : i32 loc(#loc813) + %offset = arith.extui %needs_jump_123 : i1 to i32 loc(#loc814) + %offset_126 = arith.muli %jump_to_block_125, %offset : i32 loc(#loc814) + %offset_127 = arith.subi %c1_i32, %offset : i32 loc(#loc815) + %offset_128 = arith.muli %offset_127, %c64_i32 : i32 loc(#loc816) + %offset_129 = arith.addi %offset_126, %offset_128 : i32 loc(#loc817) + %kT_ptrs_130 = arith.muli %offset_129, %c128_i32 : i32 loc(#loc610) + %kT_ptrs_131 = tt.splat %kT_ptrs_130 : i32 -> tensor<128x64xi32> loc(#loc611) + %kT_ptrs_132 = tt.addptr %kT_ptrs_97, %kT_ptrs_131 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc611) + %vT_ptrs_133 = tt.addptr %vT_ptrs_98, %kT_ptrs_131 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc612) + %offs_n2_134 = tt.splat %offset_129 : i32 -> tensor<64xi32> loc(#loc613) + %offs_n2_135 = arith.addi %offs_n2_96, %offs_n2_134 : tensor<64xi32> loc(#loc613) + scf.yield %dq_117, %offs_n2_135, %kT_ptrs_132, %vT_ptrs_133 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc614) + } loc(#loc894) + %kv_indices_74 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_29 : !tt.ptr, i32 loc(#loc453) + %kv_start_75 = tt.load %kv_indices_74 : !tt.ptr loc(#loc454) + %kv_start_76 = arith.muli %kv_start_75, %c128_i32 : i32 loc(#loc455) + %sparse_kv_num_blocks_77 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_27 : !tt.ptr, i32 loc(#loc456) + %sparse_kv_num_blocks_78 = tt.load %sparse_kv_num_blocks_77 : !tt.ptr loc(#loc457) + %offs_n2_79 = tt.splat %kv_start_76 : i32 -> tensor<64xi32> loc(#loc458) + %offs_n2_80 = arith.addi %offs_n2_79, %offs_n2 : tensor<64xi32> loc(#loc458) + %kT_ptrs_81 = tt.expand_dims %offs_n2_80 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc615) + %kT_ptrs_82 = arith.muli %kT_ptrs_81, %cst_1 : tensor<1x64xi32> loc(#loc616) + %kT_ptrs_83 = tt.addptr %kT_ptrs_63, %kT_ptrs_82 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc617) + %kT_ptrs_84 = tt.broadcast %kT_ptrs_83 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc618) + %kT_ptrs_85 = tt.addptr %kT_ptrs_84, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc618) + %vT_ptrs_86 = tt.addptr %vT_ptrs, %kT_ptrs_82 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc619) + %vT_ptrs_87 = tt.broadcast %vT_ptrs_86 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc620) + %vT_ptrs_88 = tt.addptr %vT_ptrs_87, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc620) + %hi_89 = arith.muli %sparse_kv_num_blocks_78, %c2_i32 : i32 loc(#loc621) + %hi_90 = arith.minsi %hi_89, %c32_i32 : i32 loc(#loc622) + %vT_ptrs_91:4 = scf.for %start_n = %c0_i32 to %hi_90 step %c1_i32 iter_args(%dq_95 = %vT_ptrs_73#0, %offs_n2_96 = %offs_n2_80, %kT_ptrs_97 = %kT_ptrs_85, %vT_ptrs_98 = %vT_ptrs_88) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.load %kT_ptrs_97 : tensor<128x64x!tt.ptr> loc(#loc891) + %qk = tt.dot %q, %kT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc819) + %qk_99 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc820) + %post_mod_scores = arith.mulf %qk_99, %cst_7 : tensor<128x64xf32> loc(#loc821) + %p = tt.broadcast %lse_57 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc822) + %p_100 = arith.subf %post_mod_scores, %p : tensor<128x64xf32> loc(#loc822) + %p_101 = math.exp2 %p_100 : tensor<128x64xf32> loc(#loc823) + %vT = tt.load %vT_ptrs_98 : tensor<128x64x!tt.ptr> loc(#loc892) + %dp = tt.dot %do, %vT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc825) + %ds = tt.expand_dims %Di_52 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc826) + %ds_102 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc827) + %ds_103 = arith.subf %dp, %ds_102 : tensor<128x64xf32> loc(#loc827) + %ds_104 = arith.mulf %p_101, %ds_103 : tensor<128x64xf32> loc(#loc828) + %ds_105 = arith.truncf %ds_104 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc829) + %dq_106 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc830) + %dq_107 = tt.dot %ds_105, %dq_106, %dq_95, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc831) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc832) + %cur_block = tt.addptr %kv_indices_74, %cur_block_idx : !tt.ptr, i32 loc(#loc833) + %cur_block_108 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc834) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc835) + %next_block_109 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_78 : i32 loc(#loc836) + %next_block_110 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc837) + %next_block_111 = tt.load %next_block_110, %next_block_109 evictionPolicy = evict_last : !tt.ptr loc(#loc838) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc839) + %needs_jump_112 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc840) + %needs_jump_113 = arith.cmpi eq, %needs_jump_112, %c0_i32 : i32 loc(#loc841) + %jump_to_block = arith.subi %next_block_111, %cur_block_108 : i32 loc(#loc842) + %jump_to_block_114 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc843) + %jump_to_block_115 = arith.subi %jump_to_block_114, %c64_i32 : i32 loc(#loc844) + %offset = arith.extui %needs_jump_113 : i1 to i32 loc(#loc845) + %offset_116 = arith.muli %jump_to_block_115, %offset : i32 loc(#loc845) + %offset_117 = arith.subi %c1_i32, %offset : i32 loc(#loc846) + %offset_118 = arith.muli %offset_117, %c64_i32 : i32 loc(#loc847) + %offset_119 = arith.addi %offset_116, %offset_118 : i32 loc(#loc848) + %kT_ptrs_120 = arith.muli %offset_119, %c128_i32 : i32 loc(#loc625) + %kT_ptrs_121 = tt.splat %kT_ptrs_120 : i32 -> tensor<128x64xi32> loc(#loc626) + %kT_ptrs_122 = tt.addptr %kT_ptrs_97, %kT_ptrs_121 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc626) + %vT_ptrs_123 = tt.addptr %vT_ptrs_98, %kT_ptrs_121 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc627) + %offs_n2_124 = tt.splat %offset_119 : i32 -> tensor<64xi32> loc(#loc628) + %offs_n2_125 = arith.addi %offs_n2_96, %offs_n2_124 : tensor<64xi32> loc(#loc628) + scf.yield %dq_107, %offs_n2_125, %kT_ptrs_122, %vT_ptrs_123 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc629) + } loc(#loc895) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc460) + %dq_ptrs_92 = tt.addptr %dq_ptrs, %ptr_39 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc460) + %dq_ptrs_93 = tt.broadcast %dq_ptrs_92 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc461) + %dq_ptrs_94 = tt.addptr %dq_ptrs_93, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc461) + %dq = arith.mulf %vT_ptrs_91#0, %cst_16 : tensor<128x128xf32> loc(#loc462) + %1 = arith.truncf %dq : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc160) + tt.store %dq_ptrs_94, %1 : tensor<128x128x!tt.ptr> loc(#loc160) + } else { + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc463) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32> loc(#loc464) + %offs_n1_25 = arith.addi %offs_n1, %offs_k : tensor<128xi32> loc(#loc464) + %ptr = tt.expand_dims %offs_n1_25 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc630) + %ptr_26 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc631) + %ptr_27 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc632) + %ptr_28 = tt.addptr %ptr_27, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc632) + %ptr_29 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc633) + %ptr_30 = tt.broadcast %ptr_28 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc634) + %ptr_31 = tt.broadcast %ptr_29 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc634) + %ptr_32 = tt.addptr %ptr_30, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc634) + %k = tt.load %ptr_32 : tensor<128x128x!tt.ptr> loc(#loc635) + %ptr_33 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc636) + %ptr_34 = tt.addptr %ptr_33, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc636) + %ptr_35 = tt.broadcast %ptr_34 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc637) + %ptr_36 = tt.addptr %ptr_35, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc637) + %v = tt.load %ptr_36 : tensor<128x128x!tt.ptr> loc(#loc638) + %dk:2 = scf.for %off_g = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%dv = %cst_13, %dk_46 = %cst_13) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc468) + %off_hq1_47 = arith.addi %off_hq1, %off_g : i32 loc(#loc469) + %q_adj1 = arith.muli %off_hq1_47, %c128_i32 : i32 loc(#loc470) + %q_adj1_48 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc471) + %q_adj1_49 = arith.addi %q_adj1, %q_adj1_48 : i32 loc(#loc472) + %q_adj1_50 = arith.extsi %q_adj1_49 : i32 to i64 loc(#loc473) + %do_adj1 = arith.muli %off_hq1_47, %c262144_i32 : i32 loc(#loc474) + %do_adj1_51 = arith.addi %do_adj1, %q_adj1_48 : i32 loc(#loc475) + %do_adj1_52 = arith.extsi %do_adj1_51 : i32 to i64 loc(#loc476) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc477) + %off_chz1_53 = arith.addi %off_chz1, %off_hq1_47 : i32 loc(#loc478) + %off_chz1_54 = arith.muli %off_chz1_53, %c2048_i32 : i32 loc(#loc479) + %off_chz1_55 = arith.extsi %off_chz1_54 : i32 to i64 loc(#loc480) + %Q1 = tt.addptr %arg_Q, %q_adj1_50 : !tt.ptr, i64 loc(#loc481) + %DO1 = tt.addptr %arg_DO, %do_adj1_52 : !tt.ptr, i64 loc(#loc482) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_55 : !tt.ptr, i64 loc(#loc483) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_55 : !tt.ptr, i64 loc(#loc484) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc485) + %sparse_q_num_blks_offset_56 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc486) + %sparse_q_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc487) + %sparse_q_idx_offset_57 = arith.muli %pid, %c16_i32 : i32 loc(#loc488) + %sparse_q_idx_offset_58 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_57 : i32 loc(#loc489) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_58 : !tt.ptr, i32 loc(#loc490) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc491) + %q_start_59 = arith.muli %q_start, %c128_i32 : i32 loc(#loc492) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_56 : !tt.ptr, i32 loc(#loc493) + %sparse_q_num_blocks_60 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc494) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc495) + %offs_m1_61 = tt.splat %q_start_59 : i32 -> tensor<64xi32> loc(#loc496) + %offs_m1_62 = arith.addi %offs_m1_61, %offs_m1 : tensor<64xi32> loc(#loc496) + %qT_ptrs = tt.expand_dims %offs_m1_62 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc640) + %qT_ptrs_63 = arith.muli %qT_ptrs, %cst_0 : tensor<1x64xi32> loc(#loc641) + %qT_ptrs_64 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc642) + %qT_ptrs_65 = tt.addptr %qT_ptrs_64, %qT_ptrs_63 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc642) + %qT_ptrs_66 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc643) + %qT_ptrs_67 = tt.broadcast %qT_ptrs_65 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc644) + %qT_ptrs_68 = tt.broadcast %qT_ptrs_66 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc644) + %qT_ptrs_69 = tt.addptr %qT_ptrs_67, %qT_ptrs_68 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc644) + %do_ptrs = tt.expand_dims %offs_m1_62 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc645) + %do_ptrs_70 = arith.muli %do_ptrs, %cst : tensor<64x1xi32> loc(#loc646) + %do_ptrs_71 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc647) + %do_ptrs_72 = tt.addptr %do_ptrs_71, %do_ptrs_70 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc647) + %do_ptrs_73 = tt.broadcast %do_ptrs_72 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc648) + %do_ptrs_74 = tt.broadcast %ptr_29 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc648) + %do_ptrs_75 = tt.addptr %do_ptrs_73, %do_ptrs_74 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc648) + %hi = arith.muli %sparse_q_num_blocks_60, %c2_i32 : i32 loc(#loc649) + %hi_76 = arith.minsi %hi, %c32_i32 : i32 loc(#loc650) + %do_ptrs_77:5 = scf.for %start_m = %c0_i32 to %hi_76 step %c1_i32 iter_args(%dk_98 = %dk_46, %dv_99 = %dv, %offs_m1_100 = %offs_m1_62, %qT_ptrs_101 = %qT_ptrs_69, %do_ptrs_102 = %do_ptrs_75) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.load %qT_ptrs_101 : tensor<128x64x!tt.ptr> loc(#loc850) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc653) + %lse_103 = tt.addptr %lse, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc653) + %lse_104 = tt.load %lse_103 : tensor<64x!tt.ptr> loc(#loc654) + %lse_105 = arith.cmpf oeq, %lse_104, %cst_4 : tensor<64xf32> loc(#loc655) + %lse_106 = arith.select %lse_105, %cst_3, %lse_104 : tensor<64xi1>, tensor<64xf32> loc(#loc656) + %qkT = tt.dot %k, %qT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc657) + %qkT_107 = arith.mulf %qkT, %cst_11 : tensor<128x64xf32> loc(#loc658) + %m = tt.expand_dims %offs_m1_100 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc659) + %tmp44 = tt.broadcast %m : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc660) + %tmp44_108 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc660) + %tmp44_109 = arith.cmpi sge, %tmp44, %tmp44_108 : tensor<128x64xi32> loc(#loc660) + %tmp45 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc661) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc662) + %tmp47_110 = tt.load %tmp47 : !tt.ptr loc(#loc663) + %tmp48 = tt.splat %tmp47_110 : i64 -> tensor<128x1xi64> loc(#loc664) + %tmp48_111 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc664) + %tmp49 = arith.extsi %m : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc665) + %tmp50 = tt.splat %tmp47_110 : i64 -> tensor<1x64xi64> loc(#loc666) + %tmp50_112 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc666) + %tmp51 = tt.broadcast %tmp48_111 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc667) + %tmp51_113 = tt.broadcast %tmp50_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc667) + %tmp51_114 = arith.andi %tmp51, %tmp51_113 : tensor<128x64xi1> loc(#loc667) + %tmp52 = arith.andi %tmp44_109, %tmp51_114 : tensor<128x64xi1> loc(#loc668) + %tmp55 = arith.cmpi sge, %ptr, %cst_14 : tensor<128x1xi32> loc(#loc669) + %tmp56 = arith.remsi %ptr, %cst_14 : tensor<128x1xi32> loc(#loc670) + %tmp58 = arith.cmpi ne, %tmp56, %cst_2 : tensor<128x1xi32> loc(#loc671) + %tmp59 = arith.cmpi slt, %tmp56, %cst_2 : tensor<128x1xi32> loc(#loc672) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1> loc(#loc673) + %tmp63 = arith.addi %tmp56, %cst_14 : tensor<128x1xi32> loc(#loc674) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc675) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc676) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64> loc(#loc677) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1> loc(#loc678) + %tmp68 = arith.subi %tmp44_108, %tmp44 : tensor<128x64xi32> loc(#loc679) + %tmp69 = arith.remsi %tmp68, %cst_5 : tensor<128x64xi32> loc(#loc680) + %tmp70 = arith.cmpi ne, %tmp69, %cst_9 : tensor<128x64xi32> loc(#loc681) + %tmp71 = arith.cmpi slt, %tmp69, %cst_9 : tensor<128x64xi32> loc(#loc682) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1> loc(#loc683) + %tmp74 = arith.addi %tmp69, %cst_5 : tensor<128x64xi32> loc(#loc684) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc685) + %tmp76 = arith.cmpi eq, %tmp75, %cst_9 : tensor<128x64xi32> loc(#loc686) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc687) + %tmp77_115 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1> loc(#loc687) + %tmp78 = arith.ori %tmp52, %tmp77_115 : tensor<128x64xi1> loc(#loc688) + %post_mod_scores = arith.select %tmp78, %qkT_107, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc689) + %post_mod_scores_116 = arith.mulf %post_mod_scores, %cst_7 : tensor<128x64xf32> loc(#loc690) + %pT = tt.expand_dims %lse_106 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc691) + %pT_117 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc692) + %pT_118 = arith.subf %post_mod_scores_116, %pT_117 : tensor<128x64xf32> loc(#loc692) + %pT_119 = math.exp2 %pT_118 : tensor<128x64xf32> loc(#loc693) + %do = tt.load %do_ptrs_102 : tensor<64x128x!tt.ptr> loc(#loc851) + %dv_120 = arith.truncf %pT_119 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc695) + %dv_121 = tt.dot %dv_120, %do, %dv_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc696) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc697) + %Di_122 = tt.addptr %Di, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc697) + %Di_123 = tt.load %Di_122 : tensor<64x!tt.ptr> loc(#loc698) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc699) + %dpT_124 = tt.dot %v, %dpT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc700) + %dsT = tt.expand_dims %Di_123 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc701) + %dsT_125 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc702) + %dsT_126 = arith.subf %dpT_124, %dsT_125 : tensor<128x64xf32> loc(#loc702) + %dsT_127 = arith.mulf %pT_119, %dsT_126 : tensor<128x64xf32> loc(#loc703) + %dsT_128 = arith.select %tmp78, %dsT_127, %cst_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc704) + %dk_129 = arith.truncf %dsT_128 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc705) + %dk_130 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc706) + %dk_131 = tt.dot %dk_129, %dk_130, %dk_98, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc707) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc852) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc853) + %cur_block_132 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc854) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc855) + %next_block_133 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_60 : i32 loc(#loc856) + %next_block_134 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc857) + %next_block_135 = tt.load %next_block_134, %next_block_133 evictionPolicy = evict_last : !tt.ptr loc(#loc858) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc859) + %needs_jump_136 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc860) + %needs_jump_137 = arith.cmpi eq, %needs_jump_136, %c0_i32 : i32 loc(#loc861) + %jump_to_block = arith.subi %next_block_135, %cur_block_132 : i32 loc(#loc862) + %jump_to_block_138 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc863) + %jump_to_block_139 = arith.subi %jump_to_block_138, %c64_i32 : i32 loc(#loc864) + %offset = arith.extui %needs_jump_137 : i1 to i32 loc(#loc865) + %offset_140 = arith.muli %jump_to_block_139, %offset : i32 loc(#loc865) + %offset_141 = arith.subi %c1_i32, %offset : i32 loc(#loc866) + %offset_142 = arith.muli %offset_141, %c64_i32 : i32 loc(#loc867) + %offset_143 = arith.addi %offset_140, %offset_142 : i32 loc(#loc868) + %qT_ptrs_144 = arith.muli %offset_143, %c4096_i32 : i32 loc(#loc709) + %qT_ptrs_145 = tt.splat %qT_ptrs_144 : i32 -> tensor<128x64xi32> loc(#loc710) + %qT_ptrs_146 = tt.addptr %qT_ptrs_101, %qT_ptrs_145 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %do_ptrs_147 = arith.muli %offset_143, %c128_i32 : i32 loc(#loc711) + %do_ptrs_148 = tt.splat %do_ptrs_147 : i32 -> tensor<64x128xi32> loc(#loc712) + %do_ptrs_149 = tt.addptr %do_ptrs_102, %do_ptrs_148 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc712) + %offs_m1_150 = tt.splat %offset_143 : i32 -> tensor<64xi32> loc(#loc713) + %offs_m1_151 = arith.addi %offs_m1_100, %offs_m1_150 : tensor<64xi32> loc(#loc713) + scf.yield %dk_131, %dv_121, %offs_m1_151, %qT_ptrs_146, %do_ptrs_149 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc572) + } loc(#loc897) + %q_indices_78 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_58 : !tt.ptr, i32 loc(#loc573) + %q_start_79 = tt.load %q_indices_78 : !tt.ptr loc(#loc574) + %q_start_80 = arith.muli %q_start_79, %c128_i32 : i32 loc(#loc575) + %sparse_q_num_blocks_81 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_56 : !tt.ptr, i32 loc(#loc576) + %sparse_q_num_blocks_82 = tt.load %sparse_q_num_blocks_81 : !tt.ptr loc(#loc577) + %offs_m1_83 = tt.splat %q_start_80 : i32 -> tensor<64xi32> loc(#loc578) + %offs_m1_84 = arith.addi %offs_m1_83, %offs_m1 : tensor<64xi32> loc(#loc578) + %qT_ptrs_85 = tt.expand_dims %offs_m1_84 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc714) + %qT_ptrs_86 = arith.muli %qT_ptrs_85, %cst_0 : tensor<1x64xi32> loc(#loc715) + %qT_ptrs_87 = tt.addptr %qT_ptrs_64, %qT_ptrs_86 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc716) + %qT_ptrs_88 = tt.broadcast %qT_ptrs_87 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc717) + %qT_ptrs_89 = tt.addptr %qT_ptrs_88, %qT_ptrs_68 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc717) + %do_ptrs_90 = tt.expand_dims %offs_m1_84 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc718) + %do_ptrs_91 = arith.muli %do_ptrs_90, %cst : tensor<64x1xi32> loc(#loc719) + %do_ptrs_92 = tt.addptr %do_ptrs_71, %do_ptrs_91 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc720) + %do_ptrs_93 = tt.broadcast %do_ptrs_92 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc721) + %do_ptrs_94 = tt.addptr %do_ptrs_93, %do_ptrs_74 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc721) + %hi_95 = arith.muli %sparse_q_num_blocks_82, %c2_i32 : i32 loc(#loc722) + %hi_96 = arith.minsi %hi_95, %c32_i32 : i32 loc(#loc723) + %do_ptrs_97:5 = scf.for %start_m = %c0_i32 to %hi_96 step %c1_i32 iter_args(%dk_98 = %do_ptrs_77#0, %dv_99 = %do_ptrs_77#1, %offs_m1_100 = %offs_m1_84, %qT_ptrs_101 = %qT_ptrs_89, %do_ptrs_102 = %do_ptrs_94) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.load %qT_ptrs_101 : tensor<128x64x!tt.ptr> loc(#loc869) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc725) + %lse_103 = tt.addptr %lse, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc725) + %lse_104 = tt.load %lse_103 : tensor<64x!tt.ptr> loc(#loc726) + %lse_105 = arith.cmpf oeq, %lse_104, %cst_4 : tensor<64xf32> loc(#loc727) + %lse_106 = arith.select %lse_105, %cst_3, %lse_104 : tensor<64xi1>, tensor<64xf32> loc(#loc728) + %qkT = tt.dot %k, %qT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc729) + %qkT_107 = arith.mulf %qkT, %cst_11 : tensor<128x64xf32> loc(#loc730) + %post_mod_scores = arith.mulf %qkT_107, %cst_7 : tensor<128x64xf32> loc(#loc731) + %pT = tt.expand_dims %lse_106 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc732) + %pT_108 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc733) + %pT_109 = arith.subf %post_mod_scores, %pT_108 : tensor<128x64xf32> loc(#loc733) + %pT_110 = math.exp2 %pT_109 : tensor<128x64xf32> loc(#loc734) + %do = tt.load %do_ptrs_102 : tensor<64x128x!tt.ptr> loc(#loc870) + %dv_111 = arith.truncf %pT_110 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc736) + %dv_112 = tt.dot %dv_111, %do, %dv_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc737) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc738) + %Di_113 = tt.addptr %Di, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc738) + %Di_114 = tt.load %Di_113 : tensor<64x!tt.ptr> loc(#loc739) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc740) + %dpT_115 = tt.dot %v, %dpT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc741) + %dsT = tt.expand_dims %Di_114 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc742) + %dsT_116 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc743) + %dsT_117 = arith.subf %dpT_115, %dsT_116 : tensor<128x64xf32> loc(#loc743) + %dsT_118 = arith.mulf %pT_110, %dsT_117 : tensor<128x64xf32> loc(#loc744) + %dk_119 = arith.truncf %dsT_118 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc745) + %dk_120 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc746) + %dk_121 = tt.dot %dk_119, %dk_120, %dk_98, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc747) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc871) + %cur_block = tt.addptr %q_indices_78, %cur_block_idx : !tt.ptr, i32 loc(#loc872) + %cur_block_122 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc873) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc874) + %next_block_123 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_82 : i32 loc(#loc875) + %next_block_124 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc876) + %next_block_125 = tt.load %next_block_124, %next_block_123 evictionPolicy = evict_last : !tt.ptr loc(#loc877) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc878) + %needs_jump_126 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc879) + %needs_jump_127 = arith.cmpi eq, %needs_jump_126, %c0_i32 : i32 loc(#loc880) + %jump_to_block = arith.subi %next_block_125, %cur_block_122 : i32 loc(#loc881) + %jump_to_block_128 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc882) + %jump_to_block_129 = arith.subi %jump_to_block_128, %c64_i32 : i32 loc(#loc883) + %offset = arith.extui %needs_jump_127 : i1 to i32 loc(#loc884) + %offset_130 = arith.muli %jump_to_block_129, %offset : i32 loc(#loc884) + %offset_131 = arith.subi %c1_i32, %offset : i32 loc(#loc885) + %offset_132 = arith.muli %offset_131, %c64_i32 : i32 loc(#loc886) + %offset_133 = arith.addi %offset_130, %offset_132 : i32 loc(#loc887) + %qT_ptrs_134 = arith.muli %offset_133, %c4096_i32 : i32 loc(#loc749) + %qT_ptrs_135 = tt.splat %qT_ptrs_134 : i32 -> tensor<128x64xi32> loc(#loc750) + %qT_ptrs_136 = tt.addptr %qT_ptrs_101, %qT_ptrs_135 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc750) + %do_ptrs_137 = arith.muli %offset_133, %c128_i32 : i32 loc(#loc751) + %do_ptrs_138 = tt.splat %do_ptrs_137 : i32 -> tensor<64x128xi32> loc(#loc752) + %do_ptrs_139 = tt.addptr %do_ptrs_102, %do_ptrs_138 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc752) + %offs_m1_140 = tt.splat %offset_133 : i32 -> tensor<64xi32> loc(#loc753) + %offs_m1_141 = arith.addi %offs_m1_100, %offs_m1_140 : tensor<64xi32> loc(#loc753) + scf.yield %dk_121, %dv_112, %offs_m1_141, %qT_ptrs_136, %do_ptrs_139 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc580) + } loc(#loc898) + scf.yield %do_ptrs_97#1, %do_ptrs_97#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc279) + } loc(#loc639) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc581) + %dv_ptrs_37 = tt.addptr %dv_ptrs, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc581) + %dv_ptrs_38 = tt.broadcast %dv_ptrs_37 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc582) + %dv_ptrs_39 = tt.addptr %dv_ptrs_38, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc582) + %1 = arith.truncf %dk#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc282) + tt.store %dv_ptrs_39, %1 : tensor<128x128x!tt.ptr> loc(#loc282) + %dk_40 = arith.mulf %dk#1, %cst_16 : tensor<128x128xf32> loc(#loc583) + %mask = arith.cmpi slt, %ptr, %cst_14 : tensor<128x1xi32> loc(#loc584) + %xindex = tt.broadcast %ptr_26 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc585) + %xindex_41 = arith.addi %ptr_31, %xindex : tensor<128x128xi32> loc(#loc585) + %xindex_42 = tt.splat %k_adj : i32 -> tensor<128x128xi32> loc(#loc586) + %xindex_43 = arith.addi %xindex_41, %xindex_42 : tensor<128x128xi32> loc(#loc586) + %xindex_44 = tt.splat %dv_adj : i32 -> tensor<128x128xi32> loc(#loc587) + %xindex_45 = arith.addi %xindex_43, %xindex_44 : tensor<128x128xi32> loc(#loc587) + %2 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc288) + %3 = tt.addptr %2, %xindex_45 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc288) + %4 = tt.broadcast %mask : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc289) + %5 = arith.truncf %dk_40 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc289) + tt.store %3, %5, %4 : tensor<128x128x!tt.ptr> loc(#loc289) + } loc(#loc18) + tt.return loc(#loc290) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":111:24) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":115:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":116:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":117:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:25) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":124:59) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:50) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":128:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":131:9) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":132:9) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":133:10) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":136:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:14) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:7) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":140:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":144:44) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":145:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:55) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":154:78) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:83) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":155:68) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:30) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":158:63) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:32) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":159:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":161:56) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":163:17) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":164:19) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":167:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":168:21) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":169:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":174:36) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":175:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:27) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":178:107) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:20) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:56) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":825:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":835:23) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":179:111) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:34) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":185:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:33) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":186:26) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:30) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":190:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":191:18) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":195:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":196:41) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:53) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":197:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":199:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:26) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":207:12) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:18) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:56) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":390:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:18) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":391:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:43) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":395:63) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":397:28) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":458:105) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":405:12) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":459:19) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":461:14) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":464:36) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":482:23) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":483:23) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:34) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":485:23) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":486:22) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":487:23) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":488:23) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":489:23) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":490:23) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":493:24) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":494:24) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":496:25) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":497:92) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":500:24) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":501:24) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":502:39) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":503:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":504:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":505:24) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":506:23) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":507:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":508:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":509:92) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":511:24) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":512:24) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":513:39) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":514:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":515:24) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":516:24) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":521:69) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":524:27) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:39) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":525:21) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":528:104) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":530:20) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:22) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:19) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":531:14) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":549:43) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":551:15) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:30) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":553:21) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":788:33) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":411:64) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:38) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":789:24) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:109) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:113) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:55) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":790:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:30) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:35) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":791:60) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:34) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:48) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":792:63) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:29) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:47) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:61) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":793:42) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:28) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":414:19) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":415:19) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":417:19) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":417:8) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":214:39) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:31) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":215:45) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:62) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":216:43) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":218:33) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":226:16) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:24) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":231:56) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":232:14) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":234:30) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":252:25) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":253:29) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":256:107) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":257:107) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":262:30) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:32) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":263:51) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:34) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:56) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:44) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":266:67) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:36) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:46) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":267:70) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:34) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:39) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:50) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":269:60) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":271:21) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":272:23) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":275:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":276:29) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:58) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":281:80) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:53) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:81) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":282:70) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":286:32) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:30) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":287:43) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:55) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":288:42) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:45) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":290:32) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:26) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":298:16) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:37) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:18) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:56) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":601:49) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:27) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:38) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:19) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":602:51) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:42) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":608:61) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":610:28) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":669:105) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":618:12) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:28) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":672:22) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:26) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":675:46) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":676:20) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":678:15) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":680:36) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":698:25) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":699:25) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:35) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":701:24) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":702:24) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":703:25) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":704:24) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":705:24) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":706:24) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":709:25) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":710:25) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":712:25) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":713:92) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":716:24) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":717:24) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":718:39) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":719:25) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":720:24) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":721:24) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":722:24) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":723:25) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":724:25) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":725:92) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":727:24) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":728:24) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":729:39) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":730:25) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":731:24) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":732:24) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":736:69) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":739:27) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:44) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:40) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":740:22) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":741:99) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:24) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":744:43) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:29) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":746:21) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:29) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":750:20) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:25) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:22) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":751:16) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":773:45) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:24) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:52) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":775:43) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":623:62) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:28) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":626:19) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:28) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":627:19) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":628:19) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":628:8) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":306:41) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:34) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":307:47) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:64) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":308:46) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":310:36) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":318:20) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":303:12) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:23) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":323:55) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":330:30) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":334:14) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":337:29) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:27) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:41) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":344:58) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:29) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":345:69) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4d/c4dw6ykxgbwk3glacutxkpzwhapvr5oszjet3n4i4q3snumjzm3x.py":139:4) +#loc309 = loc("pid"(#loc2)) +#loc310 = loc("off_zq"(#loc3)) +#loc311 = loc("off_hkv"(#loc4)) +#loc312 = loc("off_zkv"(#loc5)) +#loc313 = loc("k_adj"(#loc6)) +#loc314 = loc("k_adj"(#loc7)) +#loc315 = loc("k_adj"(#loc8)) +#loc316 = loc("k_adj"(#loc9)) +#loc317 = loc("dv_adj"(#loc10)) +#loc318 = loc("dv_adj"(#loc11)) +#loc319 = loc("dv_adj"(#loc12)) +#loc320 = loc("K"(#loc13)) +#loc321 = loc("V"(#loc14)) +#loc322 = loc("DV"(#loc15)) +#loc323 = loc("offs_k"(#loc16)) +#loc324 = loc("off_pid"(#loc19)) +#loc325 = loc("off_hq2"(#loc20)) +#loc326 = loc("off_hq2"(#loc21)) +#loc327 = loc("off_hq2"(#loc22)) +#loc328 = loc("start_m2_block"(#loc23)) +#loc329 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc330 = loc("sparse_kv_num_blks_offset"(#loc25)) +#loc331 = loc("sparse_kv_idx_offset"(#loc26)) +#loc332 = loc("sparse_kv_idx_offset"(#loc27)) +#loc333 = loc("sparse_kv_idx_offset"(#loc28)) +#loc334 = loc("q_adj2"(#loc29)) +#loc335 = loc("q_adj2"(#loc30)) +#loc336 = loc("q_adj2"(#loc31)) +#loc337 = loc("q_adj2"(#loc32)) +#loc338 = loc("do_adj2"(#loc33)) +#loc339 = loc("do_adj2"(#loc34)) +#loc340 = loc("do_adj2"(#loc35)) +#loc341 = loc("off_chz2"(#loc36)) +#loc342 = loc("off_chz2"(#loc37)) +#loc343 = loc("off_chz2"(#loc38)) +#loc344 = loc("off_chz2"(#loc39)) +#loc345 = loc("Q2"(#loc40)) +#loc346 = loc("DO2"(#loc41)) +#loc347 = loc("DQ2"(#loc42)) +#loc348 = loc("LSE2"(#loc43)) +#loc349 = loc("DELTA2"(#loc44)) +#loc350 = loc("start_m2"(#loc45)) +#loc351 = loc("offs_m2"(#loc46)) +#loc352 = loc("ptr"(#loc47)) +#loc353 = loc("q"(#loc48)) +#loc354 = loc("ptr"(#loc49)) +#loc355 = loc("ptr"(#loc50)) +#loc356 = loc("ptr"(#loc51)) +#loc357 = loc("ptr"(#loc52)) +#loc358 = loc("do"(#loc54)) +#loc359 = loc("Di"(#loc55)) +#loc360 = loc("Di"(#loc56)) +#loc361 = loc("lse"(#loc57)) +#loc362 = loc("lse"(#loc58)) +#loc363 = loc("lse"(#loc59)) +#loc364 = loc("lse"(#loc60)) +#loc365 = loc("lse"(#loc61)) +#loc366 = loc("kv_indices"(#loc62)) +#loc367 = loc("kv_start"(#loc63)) +#loc368 = loc("kv_start"(#loc64)) +#loc369 = loc("sparse_kv_num_blocks"(#loc65)) +#loc370 = loc("sparse_kv_num_blocks"(#loc66)) +#loc371 = loc("offs_n2"(#loc67)) +#loc372 = loc("offs_n2"(#loc68)) +#loc373 = loc("kT_ptrs"(#loc69)) +#loc374 = loc("dq"(#loc70)) +#loc375 = loc("kT_ptrs"(#loc71)) +#loc376 = loc("kT_ptrs"(#loc72)) +#loc377 = loc("kT_ptrs"(#loc73)) +#loc378 = loc("kT_ptrs"(#loc74)) +#loc379 = loc("vT_ptrs"(#loc75)) +#loc380 = loc("vT_ptrs"(#loc76)) +#loc381 = loc("hi"(#loc77)) +#loc382 = loc("hi"(#loc78)) +#loc383 = loc("dq"(#loc79)) +#loc384 = loc("kT"(#loc80)) +#loc385 = loc("dq"(#loc81)) +#loc386 = loc("qk"(#loc82)) +#loc387 = loc("qk"(#loc83)) +#loc388 = loc("n"(#loc84)) +#loc389 = loc("tmp4"(#loc85)) +#loc390 = loc("tmp5"(#loc86)) +#loc391 = loc("tmp7"(#loc87)) +#loc392 = loc("tmp7"(#loc88)) +#loc393 = loc("tmp8"(#loc89)) +#loc394 = loc("tmp9"(#loc90)) +#loc395 = loc("tmp10"(#loc91)) +#loc396 = loc("tmp11"(#loc92)) +#loc397 = loc("tmp12"(#loc93)) +#loc398 = loc("tmp15"(#loc94)) +#loc399 = loc("tmp16"(#loc95)) +#loc400 = loc("tmp18"(#loc96)) +#loc401 = loc("tmp19"(#loc97)) +#loc402 = loc("tmp22"(#loc98)) +#loc403 = loc("tmp23"(#loc99)) +#loc404 = loc("tmp24"(#loc100)) +#loc405 = loc("tmp25"(#loc101)) +#loc406 = loc("tmp26"(#loc102)) +#loc407 = loc("tmp27"(#loc103)) +#loc408 = loc("tmp28"(#loc104)) +#loc409 = loc("tmp29"(#loc105)) +#loc410 = loc("tmp30"(#loc106)) +#loc411 = loc("tmp31"(#loc107)) +#loc412 = loc("tmp33"(#loc108)) +#loc413 = loc("tmp34"(#loc109)) +#loc414 = loc("tmp35"(#loc110)) +#loc415 = loc("tmp36"(#loc111)) +#loc416 = loc("tmp37"(#loc112)) +#loc417 = loc("tmp38"(#loc113)) +#loc418 = loc("post_mod_scores"(#loc114)) +#loc419 = loc("post_mod_scores"(#loc115)) +#loc420 = loc("p"(#loc116)) +#loc421 = loc("p"(#loc117)) +#loc422 = loc("vT"(#loc118)) +#loc423 = loc("dp"(#loc119)) +#loc424 = loc("ds"(#loc120)) +#loc425 = loc("ds"(#loc121)) +#loc426 = loc("ds"(#loc122)) +#loc427 = loc("ds"(#loc123)) +#loc428 = loc("ds"(#loc124)) +#loc429 = loc("dq"(#loc125)) +#loc430 = loc("dq"(#loc126)) +#loc431 = loc("cur_block_idx"(#loc127)) +#loc432 = loc("offset"(#loc128)) +#loc433 = loc("cur_block"(#loc129)) +#loc434 = loc("cur_block"(#loc130)) +#loc435 = loc("next_block"(#loc131)) +#loc436 = loc("next_block"(#loc132)) +#loc437 = loc("next_block"(#loc133)) +#loc438 = loc("next_block"(#loc134)) +#loc439 = loc("needs_jump"(#loc135)) +#loc440 = loc("needs_jump"(#loc136)) +#loc441 = loc("needs_jump"(#loc137)) +#loc442 = loc("jump_to_block"(#loc138)) +#loc443 = loc("jump_to_block"(#loc139)) +#loc444 = loc("jump_to_block"(#loc140)) +#loc445 = loc("offset"(#loc141)) +#loc446 = loc("offset"(#loc142)) +#loc447 = loc("offset"(#loc143)) +#loc448 = loc("offset"(#loc144)) +#loc449 = loc("kT_ptrs"(#loc145)) +#loc450 = loc("kT_ptrs"(#loc146)) +#loc451 = loc("vT_ptrs"(#loc147)) +#loc452 = loc("offs_n2"(#loc148)) +#loc453 = loc("kv_indices"(#loc150)) +#loc454 = loc("kv_start"(#loc151)) +#loc455 = loc("kv_start"(#loc152)) +#loc456 = loc("sparse_kv_num_blocks"(#loc153)) +#loc457 = loc("sparse_kv_num_blocks"(#loc154)) +#loc458 = loc("offs_n2"(#loc155)) +#loc459 = loc("dq"(#loc156)) +#loc460 = loc("dq_ptrs"(#loc157)) +#loc461 = loc("dq_ptrs"(#loc158)) +#loc462 = loc("dq"(#loc159)) +#loc463 = loc("start_n1"(#loc161)) +#loc464 = loc("offs_n1"(#loc162)) +#loc465 = loc("k"(#loc163)) +#loc466 = loc("v"(#loc164)) +#loc467 = loc("dv"(#loc165)) +#loc468 = loc("off_hq1"(#loc166)) +#loc469 = loc("off_hq1"(#loc167)) +#loc470 = loc("q_adj1"(#loc168)) +#loc471 = loc("q_adj1"(#loc169)) +#loc472 = loc("q_adj1"(#loc170)) +#loc473 = loc("q_adj1"(#loc171)) +#loc474 = loc("do_adj1"(#loc172)) +#loc475 = loc("do_adj1"(#loc173)) +#loc476 = loc("do_adj1"(#loc174)) +#loc477 = loc("off_chz1"(#loc175)) +#loc478 = loc("off_chz1"(#loc176)) +#loc479 = loc("off_chz1"(#loc177)) +#loc480 = loc("off_chz1"(#loc178)) +#loc481 = loc("Q1"(#loc179)) +#loc482 = loc("DO1"(#loc180)) +#loc483 = loc("LSE1"(#loc181)) +#loc484 = loc("DELTA1"(#loc182)) +#loc485 = loc("sparse_q_num_blks_offset"(#loc183)) +#loc486 = loc("sparse_q_num_blks_offset"(#loc184)) +#loc487 = loc("sparse_q_idx_offset"(#loc185)) +#loc488 = loc("sparse_q_idx_offset"(#loc186)) +#loc489 = loc("sparse_q_idx_offset"(#loc187)) +#loc490 = loc("q_indices"(#loc188)) +#loc491 = loc("q_start"(#loc189)) +#loc492 = loc("q_start"(#loc190)) +#loc493 = loc("sparse_q_num_blocks"(#loc191)) +#loc494 = loc("sparse_q_num_blocks"(#loc192)) +#loc495 = loc("offs_m1"(#loc193)) +#loc496 = loc("offs_m1"(#loc194)) +#loc497 = loc("qT_ptrs"(#loc195)) +#loc498 = loc("qT_ptrs"(#loc197)) +#loc499 = loc("qT_ptrs"(#loc198)) +#loc500 = loc("qT_ptrs"(#loc199)) +#loc501 = loc("qT_ptrs"(#loc200)) +#loc502 = loc("do_ptrs"(#loc201)) +#loc503 = loc("do_ptrs"(#loc202)) +#loc504 = loc("do_ptrs"(#loc203)) +#loc505 = loc("do_ptrs"(#loc204)) +#loc506 = loc("hi"(#loc205)) +#loc507 = loc("hi"(#loc206)) +#loc508 = loc("dk"(#loc207)) +#loc509 = loc("qT"(#loc208)) +#loc510 = loc(callsite(#loc209 at #loc196)) +#loc511 = loc("lse"(#loc210)) +#loc512 = loc("lse"(#loc211)) +#loc513 = loc("lse"(#loc212)) +#loc514 = loc("lse"(#loc213)) +#loc515 = loc("qkT"(#loc214)) +#loc516 = loc("qkT"(#loc215)) +#loc517 = loc("m"(#loc216)) +#loc518 = loc("tmp44"(#loc217)) +#loc519 = loc("tmp45"(#loc218)) +#loc520 = loc("tmp47"(#loc219)) +#loc521 = loc("tmp47"(#loc220)) +#loc522 = loc("tmp48"(#loc221)) +#loc523 = loc("tmp49"(#loc222)) +#loc524 = loc("tmp50"(#loc223)) +#loc525 = loc("tmp51"(#loc224)) +#loc526 = loc("tmp52"(#loc225)) +#loc527 = loc("tmp55"(#loc226)) +#loc528 = loc("tmp56"(#loc227)) +#loc529 = loc("tmp58"(#loc228)) +#loc530 = loc("tmp59"(#loc229)) +#loc531 = loc("tmp62"(#loc230)) +#loc532 = loc("tmp63"(#loc231)) +#loc533 = loc("tmp64"(#loc232)) +#loc534 = loc("tmp65"(#loc233)) +#loc535 = loc("tmp66"(#loc234)) +#loc536 = loc("tmp67"(#loc235)) +#loc537 = loc("tmp68"(#loc236)) +#loc538 = loc("tmp69"(#loc237)) +#loc539 = loc("tmp70"(#loc238)) +#loc540 = loc("tmp71"(#loc239)) +#loc541 = loc("tmp73"(#loc240)) +#loc542 = loc("tmp74"(#loc241)) +#loc543 = loc("tmp75"(#loc242)) +#loc544 = loc("tmp76"(#loc243)) +#loc545 = loc("tmp77"(#loc244)) +#loc546 = loc("tmp78"(#loc245)) +#loc547 = loc("post_mod_scores"(#loc246)) +#loc548 = loc("post_mod_scores"(#loc247)) +#loc549 = loc("pT"(#loc248)) +#loc550 = loc("pT"(#loc249)) +#loc551 = loc("pT"(#loc250)) +#loc552 = loc("do"(#loc251)) +#loc553 = loc("dv"(#loc252)) +#loc554 = loc("dv"(#loc253)) +#loc555 = loc("Di"(#loc254)) +#loc556 = loc("Di"(#loc255)) +#loc557 = loc("dpT"(#loc256)) +#loc558 = loc("dpT"(#loc257)) +#loc559 = loc("dsT"(#loc258)) +#loc560 = loc("dsT"(#loc259)) +#loc561 = loc("dsT"(#loc260)) +#loc562 = loc("dsT"(#loc261)) +#loc563 = loc("dk"(#loc262)) +#loc564 = loc("dk"(#loc263)) +#loc565 = loc("dk"(#loc264)) +#loc566 = loc("offset"(#loc265)) +#loc567 = loc("qT_ptrs"(#loc266)) +#loc568 = loc("qT_ptrs"(#loc267)) +#loc569 = loc("do_ptrs"(#loc268)) +#loc570 = loc("do_ptrs"(#loc269)) +#loc571 = loc("offs_m1"(#loc270)) +#loc572 = loc(callsite(#loc271 at #loc196)) +#loc573 = loc("q_indices"(#loc272)) +#loc574 = loc("q_start"(#loc273)) +#loc575 = loc("q_start"(#loc274)) +#loc576 = loc("sparse_q_num_blocks"(#loc275)) +#loc577 = loc("sparse_q_num_blocks"(#loc276)) +#loc578 = loc("offs_m1"(#loc277)) +#loc579 = loc(callsite(#loc209 at #loc278)) +#loc580 = loc(callsite(#loc271 at #loc278)) +#loc581 = loc("dv_ptrs"(#loc280)) +#loc582 = loc("dv_ptrs"(#loc281)) +#loc583 = loc("dk"(#loc283)) +#loc584 = loc("mask"(#loc284)) +#loc585 = loc("xindex"(#loc285)) +#loc586 = loc("xindex"(#loc286)) +#loc587 = loc("xindex"(#loc287)) +#loc588 = loc(callsite(#loc352 at #loc353)) +#loc589 = loc(callsite(#loc354 at #loc353)) +#loc590 = loc(callsite(#loc355 at #loc353)) +#loc591 = loc(callsite(#loc356 at #loc353)) +#loc592 = loc(callsite(#loc357 at #loc353)) +#loc593 = loc(callsite(#loc53 at #loc353)) +#loc594 = loc(callsite(#loc354 at #loc358)) +#loc595 = loc(callsite(#loc355 at #loc358)) +#loc596 = loc(callsite(#loc357 at #loc358)) +#loc597 = loc(callsite(#loc53 at #loc358)) +#loc598 = loc(callsite(#loc373 at #loc374)) +#loc599 = loc(callsite(#loc375 at #loc374)) +#loc600 = loc(callsite(#loc376 at #loc374)) +#loc601 = loc(callsite(#loc377 at #loc374)) +#loc602 = loc(callsite(#loc378 at #loc374)) +#loc603 = loc(callsite(#loc379 at #loc374)) +#loc604 = loc(callsite(#loc380 at #loc374)) +#loc605 = loc(callsite(#loc381 at #loc374)) +#loc606 = loc(callsite(#loc382 at #loc374)) +#loc607 = loc("offs_n2"(#loc383)) +#loc608 = loc(callsite(#loc385 at #loc374)) +#loc609 = loc(callsite(#loc432 at #loc374)) +#loc610 = loc(callsite(#loc449 at #loc374)) +#loc611 = loc(callsite(#loc450 at #loc374)) +#loc612 = loc(callsite(#loc451 at #loc374)) +#loc613 = loc(callsite(#loc452 at #loc374)) +#loc614 = loc(callsite(#loc149 at #loc374)) +#loc615 = loc(callsite(#loc373 at #loc459)) +#loc616 = loc(callsite(#loc375 at #loc459)) +#loc617 = loc(callsite(#loc376 at #loc459)) +#loc618 = loc(callsite(#loc378 at #loc459)) +#loc619 = loc(callsite(#loc379 at #loc459)) +#loc620 = loc(callsite(#loc380 at #loc459)) +#loc621 = loc(callsite(#loc381 at #loc459)) +#loc622 = loc(callsite(#loc382 at #loc459)) +#loc623 = loc(callsite(#loc385 at #loc459)) +#loc624 = loc(callsite(#loc432 at #loc459)) +#loc625 = loc(callsite(#loc449 at #loc459)) +#loc626 = loc(callsite(#loc450 at #loc459)) +#loc627 = loc(callsite(#loc451 at #loc459)) +#loc628 = loc(callsite(#loc452 at #loc459)) +#loc629 = loc(callsite(#loc149 at #loc459)) +#loc630 = loc(callsite(#loc352 at #loc465)) +#loc631 = loc(callsite(#loc354 at #loc465)) +#loc632 = loc(callsite(#loc355 at #loc465)) +#loc633 = loc(callsite(#loc356 at #loc465)) +#loc634 = loc(callsite(#loc357 at #loc465)) +#loc635 = loc(callsite(#loc53 at #loc465)) +#loc636 = loc(callsite(#loc355 at #loc466)) +#loc637 = loc(callsite(#loc357 at #loc466)) +#loc638 = loc(callsite(#loc53 at #loc466)) +#loc639 = loc("dk"(#loc467)) +#loc640 = loc(callsite(#loc497 at #loc196)) +#loc641 = loc(callsite(#loc498 at #loc196)) +#loc642 = loc(callsite(#loc499 at #loc196)) +#loc643 = loc(callsite(#loc500 at #loc196)) +#loc644 = loc(callsite(#loc501 at #loc196)) +#loc645 = loc(callsite(#loc502 at #loc196)) +#loc646 = loc(callsite(#loc503 at #loc196)) +#loc647 = loc(callsite(#loc504 at #loc196)) +#loc648 = loc(callsite(#loc505 at #loc196)) +#loc649 = loc(callsite(#loc506 at #loc196)) +#loc650 = loc(callsite(#loc507 at #loc196)) +#loc651 = loc("dv"(#loc508)) +#loc652 = loc(callsite(#loc509 at #loc510)) +#loc653 = loc(callsite(#loc511 at #loc510)) +#loc654 = loc(callsite(#loc512 at #loc510)) +#loc655 = loc(callsite(#loc513 at #loc510)) +#loc656 = loc(callsite(#loc514 at #loc510)) +#loc657 = loc(callsite(#loc515 at #loc510)) +#loc658 = loc(callsite(#loc516 at #loc510)) +#loc659 = loc(callsite(#loc517 at #loc510)) +#loc660 = loc(callsite(#loc518 at #loc510)) +#loc661 = loc(callsite(#loc519 at #loc510)) +#loc662 = loc(callsite(#loc520 at #loc510)) +#loc663 = loc(callsite(#loc521 at #loc510)) +#loc664 = loc(callsite(#loc522 at #loc510)) +#loc665 = loc(callsite(#loc523 at #loc510)) +#loc666 = loc(callsite(#loc524 at #loc510)) +#loc667 = loc(callsite(#loc525 at #loc510)) +#loc668 = loc(callsite(#loc526 at #loc510)) +#loc669 = loc(callsite(#loc527 at #loc510)) +#loc670 = loc(callsite(#loc528 at #loc510)) +#loc671 = loc(callsite(#loc529 at #loc510)) +#loc672 = loc(callsite(#loc530 at #loc510)) +#loc673 = loc(callsite(#loc531 at #loc510)) +#loc674 = loc(callsite(#loc532 at #loc510)) +#loc675 = loc(callsite(#loc533 at #loc510)) +#loc676 = loc(callsite(#loc534 at #loc510)) +#loc677 = loc(callsite(#loc535 at #loc510)) +#loc678 = loc(callsite(#loc536 at #loc510)) +#loc679 = loc(callsite(#loc537 at #loc510)) +#loc680 = loc(callsite(#loc538 at #loc510)) +#loc681 = loc(callsite(#loc539 at #loc510)) +#loc682 = loc(callsite(#loc540 at #loc510)) +#loc683 = loc(callsite(#loc541 at #loc510)) +#loc684 = loc(callsite(#loc542 at #loc510)) +#loc685 = loc(callsite(#loc543 at #loc510)) +#loc686 = loc(callsite(#loc544 at #loc510)) +#loc687 = loc(callsite(#loc545 at #loc510)) +#loc688 = loc(callsite(#loc546 at #loc510)) +#loc689 = loc(callsite(#loc547 at #loc510)) +#loc690 = loc(callsite(#loc548 at #loc510)) +#loc691 = loc(callsite(#loc549 at #loc510)) +#loc692 = loc(callsite(#loc550 at #loc510)) +#loc693 = loc(callsite(#loc551 at #loc510)) +#loc694 = loc(callsite(#loc552 at #loc510)) +#loc695 = loc(callsite(#loc553 at #loc510)) +#loc696 = loc(callsite(#loc554 at #loc510)) +#loc697 = loc(callsite(#loc555 at #loc510)) +#loc698 = loc(callsite(#loc556 at #loc510)) +#loc699 = loc(callsite(#loc557 at #loc510)) +#loc700 = loc(callsite(#loc558 at #loc510)) +#loc701 = loc(callsite(#loc559 at #loc510)) +#loc702 = loc(callsite(#loc560 at #loc510)) +#loc703 = loc(callsite(#loc561 at #loc510)) +#loc704 = loc(callsite(#loc562 at #loc510)) +#loc705 = loc(callsite(#loc563 at #loc510)) +#loc706 = loc(callsite(#loc564 at #loc510)) +#loc707 = loc(callsite(#loc565 at #loc510)) +#loc708 = loc(callsite(#loc566 at #loc196)) +#loc709 = loc(callsite(#loc567 at #loc196)) +#loc710 = loc(callsite(#loc568 at #loc196)) +#loc711 = loc(callsite(#loc569 at #loc196)) +#loc712 = loc(callsite(#loc570 at #loc196)) +#loc713 = loc(callsite(#loc571 at #loc196)) +#loc714 = loc(callsite(#loc497 at #loc278)) +#loc715 = loc(callsite(#loc498 at #loc278)) +#loc716 = loc(callsite(#loc499 at #loc278)) +#loc717 = loc(callsite(#loc501 at #loc278)) +#loc718 = loc(callsite(#loc502 at #loc278)) +#loc719 = loc(callsite(#loc503 at #loc278)) +#loc720 = loc(callsite(#loc504 at #loc278)) +#loc721 = loc(callsite(#loc505 at #loc278)) +#loc722 = loc(callsite(#loc506 at #loc278)) +#loc723 = loc(callsite(#loc507 at #loc278)) +#loc724 = loc(callsite(#loc509 at #loc579)) +#loc725 = loc(callsite(#loc511 at #loc579)) +#loc726 = loc(callsite(#loc512 at #loc579)) +#loc727 = loc(callsite(#loc513 at #loc579)) +#loc728 = loc(callsite(#loc514 at #loc579)) +#loc729 = loc(callsite(#loc515 at #loc579)) +#loc730 = loc(callsite(#loc516 at #loc579)) +#loc731 = loc(callsite(#loc548 at #loc579)) +#loc732 = loc(callsite(#loc549 at #loc579)) +#loc733 = loc(callsite(#loc550 at #loc579)) +#loc734 = loc(callsite(#loc551 at #loc579)) +#loc735 = loc(callsite(#loc552 at #loc579)) +#loc736 = loc(callsite(#loc553 at #loc579)) +#loc737 = loc(callsite(#loc554 at #loc579)) +#loc738 = loc(callsite(#loc555 at #loc579)) +#loc739 = loc(callsite(#loc556 at #loc579)) +#loc740 = loc(callsite(#loc557 at #loc579)) +#loc741 = loc(callsite(#loc558 at #loc579)) +#loc742 = loc(callsite(#loc559 at #loc579)) +#loc743 = loc(callsite(#loc560 at #loc579)) +#loc744 = loc(callsite(#loc561 at #loc579)) +#loc745 = loc(callsite(#loc563 at #loc579)) +#loc746 = loc(callsite(#loc564 at #loc579)) +#loc747 = loc(callsite(#loc565 at #loc579)) +#loc748 = loc(callsite(#loc566 at #loc278)) +#loc749 = loc(callsite(#loc567 at #loc278)) +#loc750 = loc(callsite(#loc568 at #loc278)) +#loc751 = loc(callsite(#loc569 at #loc278)) +#loc752 = loc(callsite(#loc570 at #loc278)) +#loc753 = loc(callsite(#loc571 at #loc278)) +#loc754 = loc("kT_ptrs"(#loc607)) +#loc755 = loc(callsite(#loc384 at #loc608)) +#loc756 = loc(callsite(#loc386 at #loc608)) +#loc757 = loc(callsite(#loc387 at #loc608)) +#loc758 = loc(callsite(#loc388 at #loc608)) +#loc759 = loc(callsite(#loc389 at #loc608)) +#loc760 = loc(callsite(#loc390 at #loc608)) +#loc761 = loc(callsite(#loc391 at #loc608)) +#loc762 = loc(callsite(#loc392 at #loc608)) +#loc763 = loc(callsite(#loc393 at #loc608)) +#loc764 = loc(callsite(#loc394 at #loc608)) +#loc765 = loc(callsite(#loc395 at #loc608)) +#loc766 = loc(callsite(#loc396 at #loc608)) +#loc767 = loc(callsite(#loc397 at #loc608)) +#loc768 = loc(callsite(#loc398 at #loc608)) +#loc769 = loc(callsite(#loc399 at #loc608)) +#loc770 = loc(callsite(#loc400 at #loc608)) +#loc771 = loc(callsite(#loc401 at #loc608)) +#loc772 = loc(callsite(#loc402 at #loc608)) +#loc773 = loc(callsite(#loc403 at #loc608)) +#loc774 = loc(callsite(#loc404 at #loc608)) +#loc775 = loc(callsite(#loc405 at #loc608)) +#loc776 = loc(callsite(#loc406 at #loc608)) +#loc777 = loc(callsite(#loc407 at #loc608)) +#loc778 = loc(callsite(#loc408 at #loc608)) +#loc779 = loc(callsite(#loc409 at #loc608)) +#loc780 = loc(callsite(#loc410 at #loc608)) +#loc781 = loc(callsite(#loc411 at #loc608)) +#loc782 = loc(callsite(#loc412 at #loc608)) +#loc783 = loc(callsite(#loc413 at #loc608)) +#loc784 = loc(callsite(#loc414 at #loc608)) +#loc785 = loc(callsite(#loc415 at #loc608)) +#loc786 = loc(callsite(#loc416 at #loc608)) +#loc787 = loc(callsite(#loc417 at #loc608)) +#loc788 = loc(callsite(#loc418 at #loc608)) +#loc789 = loc(callsite(#loc419 at #loc608)) +#loc790 = loc(callsite(#loc420 at #loc608)) +#loc791 = loc(callsite(#loc421 at #loc608)) +#loc792 = loc(callsite(#loc422 at #loc608)) +#loc793 = loc(callsite(#loc423 at #loc608)) +#loc794 = loc(callsite(#loc424 at #loc608)) +#loc795 = loc(callsite(#loc425 at #loc608)) +#loc796 = loc(callsite(#loc426 at #loc608)) +#loc797 = loc(callsite(#loc427 at #loc608)) +#loc798 = loc(callsite(#loc428 at #loc608)) +#loc799 = loc(callsite(#loc429 at #loc608)) +#loc800 = loc(callsite(#loc430 at #loc608)) +#loc801 = loc(callsite(#loc431 at #loc609)) +#loc802 = loc(callsite(#loc433 at #loc609)) +#loc803 = loc(callsite(#loc434 at #loc609)) +#loc804 = loc(callsite(#loc435 at #loc609)) +#loc805 = loc(callsite(#loc436 at #loc609)) +#loc806 = loc(callsite(#loc437 at #loc609)) +#loc807 = loc(callsite(#loc438 at #loc609)) +#loc808 = loc(callsite(#loc439 at #loc609)) +#loc809 = loc(callsite(#loc440 at #loc609)) +#loc810 = loc(callsite(#loc441 at #loc609)) +#loc811 = loc(callsite(#loc442 at #loc609)) +#loc812 = loc(callsite(#loc443 at #loc609)) +#loc813 = loc(callsite(#loc444 at #loc609)) +#loc814 = loc(callsite(#loc445 at #loc609)) +#loc815 = loc(callsite(#loc446 at #loc609)) +#loc816 = loc(callsite(#loc447 at #loc609)) +#loc817 = loc(callsite(#loc448 at #loc609)) +#loc818 = loc(callsite(#loc384 at #loc623)) +#loc819 = loc(callsite(#loc386 at #loc623)) +#loc820 = loc(callsite(#loc387 at #loc623)) +#loc821 = loc(callsite(#loc419 at #loc623)) +#loc822 = loc(callsite(#loc420 at #loc623)) +#loc823 = loc(callsite(#loc421 at #loc623)) +#loc824 = loc(callsite(#loc422 at #loc623)) +#loc825 = loc(callsite(#loc423 at #loc623)) +#loc826 = loc(callsite(#loc424 at #loc623)) +#loc827 = loc(callsite(#loc425 at #loc623)) +#loc828 = loc(callsite(#loc426 at #loc623)) +#loc829 = loc(callsite(#loc428 at #loc623)) +#loc830 = loc(callsite(#loc429 at #loc623)) +#loc831 = loc(callsite(#loc430 at #loc623)) +#loc832 = loc(callsite(#loc431 at #loc624)) +#loc833 = loc(callsite(#loc433 at #loc624)) +#loc834 = loc(callsite(#loc434 at #loc624)) +#loc835 = loc(callsite(#loc435 at #loc624)) +#loc836 = loc(callsite(#loc436 at #loc624)) +#loc837 = loc(callsite(#loc437 at #loc624)) +#loc838 = loc(callsite(#loc438 at #loc624)) +#loc839 = loc(callsite(#loc439 at #loc624)) +#loc840 = loc(callsite(#loc440 at #loc624)) +#loc841 = loc(callsite(#loc441 at #loc624)) +#loc842 = loc(callsite(#loc442 at #loc624)) +#loc843 = loc(callsite(#loc443 at #loc624)) +#loc844 = loc(callsite(#loc444 at #loc624)) +#loc845 = loc(callsite(#loc445 at #loc624)) +#loc846 = loc(callsite(#loc446 at #loc624)) +#loc847 = loc(callsite(#loc447 at #loc624)) +#loc848 = loc(callsite(#loc448 at #loc624)) +#loc849 = loc("offs_m1"(#loc651)) +#loc850 = loc(callsite(#loc53 at #loc652)) +#loc851 = loc(callsite(#loc53 at #loc694)) +#loc852 = loc(callsite(#loc431 at #loc708)) +#loc853 = loc(callsite(#loc433 at #loc708)) +#loc854 = loc(callsite(#loc434 at #loc708)) +#loc855 = loc(callsite(#loc435 at #loc708)) +#loc856 = loc(callsite(#loc436 at #loc708)) +#loc857 = loc(callsite(#loc437 at #loc708)) +#loc858 = loc(callsite(#loc438 at #loc708)) +#loc859 = loc(callsite(#loc439 at #loc708)) +#loc860 = loc(callsite(#loc440 at #loc708)) +#loc861 = loc(callsite(#loc441 at #loc708)) +#loc862 = loc(callsite(#loc442 at #loc708)) +#loc863 = loc(callsite(#loc443 at #loc708)) +#loc864 = loc(callsite(#loc444 at #loc708)) +#loc865 = loc(callsite(#loc445 at #loc708)) +#loc866 = loc(callsite(#loc446 at #loc708)) +#loc867 = loc(callsite(#loc447 at #loc708)) +#loc868 = loc(callsite(#loc448 at #loc708)) +#loc869 = loc(callsite(#loc53 at #loc724)) +#loc870 = loc(callsite(#loc53 at #loc735)) +#loc871 = loc(callsite(#loc431 at #loc748)) +#loc872 = loc(callsite(#loc433 at #loc748)) +#loc873 = loc(callsite(#loc434 at #loc748)) +#loc874 = loc(callsite(#loc435 at #loc748)) +#loc875 = loc(callsite(#loc436 at #loc748)) +#loc876 = loc(callsite(#loc437 at #loc748)) +#loc877 = loc(callsite(#loc438 at #loc748)) +#loc878 = loc(callsite(#loc439 at #loc748)) +#loc879 = loc(callsite(#loc440 at #loc748)) +#loc880 = loc(callsite(#loc441 at #loc748)) +#loc881 = loc(callsite(#loc442 at #loc748)) +#loc882 = loc(callsite(#loc443 at #loc748)) +#loc883 = loc(callsite(#loc444 at #loc748)) +#loc884 = loc(callsite(#loc445 at #loc748)) +#loc885 = loc(callsite(#loc446 at #loc748)) +#loc886 = loc(callsite(#loc447 at #loc748)) +#loc887 = loc(callsite(#loc448 at #loc748)) +#loc888 = loc("vT_ptrs"(#loc754)) +#loc889 = loc(callsite(#loc53 at #loc755)) +#loc890 = loc(callsite(#loc53 at #loc792)) +#loc891 = loc(callsite(#loc53 at #loc818)) +#loc892 = loc(callsite(#loc53 at #loc824)) +#loc893 = loc("qT_ptrs"(#loc849)) +#loc894 = loc(callsite(#loc888 at #loc374)) +#loc895 = loc(callsite(#loc888 at #loc459)) +#loc896 = loc("do_ptrs"(#loc893)) +#loc897 = loc(callsite(#loc896 at #loc196)) +#loc898 = loc(callsite(#loc896 at #loc278)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5489951c0903b888b3643cf28155f9ea6dfa426d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a8bdfa1d4c94307a96e01fb06370a4251de9ade9 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3bbb2eb49ee7ab547cf104e08c4a0de0bb4b8eaa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "d4e9ec88be03aebb42041f53f2cbb9bb30afcd0e87a7a62c3ff7f5fe862eba44", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..ef6f12cc97a6e04c66f89b021a86f72c0c657d53 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir @@ -0,0 +1,667 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py\00" +@assertMessage_1 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp25 < ks4\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp5 < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #1 !dbg !9 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %15 = shl i32 %14, 10, !dbg !11 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %17 = shl nuw nsw i32 %16, 3, !dbg !12 + %18 = and i32 %17, 1016, !dbg !12 + %19 = or disjoint i32 %18, %15, !dbg !13 + %20 = or disjoint i32 %19, 1, !dbg !13 + %21 = or disjoint i32 %19, 2, !dbg !13 + %22 = or disjoint i32 %19, 3, !dbg !13 + %23 = or disjoint i32 %19, 4, !dbg !13 + %24 = or disjoint i32 %19, 5, !dbg !13 + %25 = or disjoint i32 %19, 6, !dbg !13 + %26 = or disjoint i32 %19, 7, !dbg !13 + %27 = insertelement <8 x i32> poison, i32 %26, i64 0, !dbg !14 + %28 = insertelement <8 x i32> %27, i32 %25, i64 1, !dbg !14 + %29 = insertelement <8 x i32> %28, i32 %24, i64 2, !dbg !14 + %30 = insertelement <8 x i32> %29, i32 %23, i64 3, !dbg !14 + %31 = insertelement <8 x i32> %30, i32 %22, i64 4, !dbg !14 + %32 = insertelement <8 x i32> %31, i32 %21, i64 5, !dbg !14 + %33 = insertelement <8 x i32> %32, i32 %20, i64 6, !dbg !14 + %34 = insertelement <8 x i32> %33, i32 %19, i64 7, !dbg !14 + %35 = sext <8 x i32> %34 to <8 x i64>, !dbg !14 + %36 = extractelement <8 x i64> %35, i64 7, !dbg !15 + %37 = sdiv i64 %36, %5, !dbg !14 + %38 = extractelement <8 x i64> %35, i64 6, !dbg !15 + %39 = sdiv i64 %38, %5, !dbg !14 + %40 = extractelement <8 x i64> %35, i64 5, !dbg !15 + %41 = sdiv i64 %40, %5, !dbg !14 + %42 = extractelement <8 x i64> %35, i64 4, !dbg !15 + %43 = sdiv i64 %42, %5, !dbg !14 + %44 = extractelement <8 x i64> %35, i64 3, !dbg !15 + %45 = sdiv i64 %44, %5, !dbg !14 + %46 = extractelement <8 x i64> %35, i64 2, !dbg !15 + %47 = sdiv i64 %46, %5, !dbg !14 + %48 = extractelement <8 x i64> %35, i64 1, !dbg !15 + %49 = sdiv i64 %48, %5, !dbg !14 + %50 = extractelement <8 x i64> %35, i64 0, !dbg !15 + %51 = sdiv i64 %50, %5, !dbg !14 + %52 = srem i64 %37, %6, !dbg !16 + %53 = srem i64 %39, %6, !dbg !16 + %54 = srem i64 %41, %6, !dbg !16 + %55 = srem i64 %43, %6, !dbg !16 + %56 = srem i64 %45, %6, !dbg !16 + %57 = srem i64 %47, %6, !dbg !16 + %58 = srem i64 %49, %6, !dbg !16 + %59 = srem i64 %51, %6, !dbg !16 + %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %36, !dbg !15 + %61 = getelementptr bfloat, ptr addrspace(1) %0, i64 %38, !dbg !15 + %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %40, !dbg !15 + %63 = getelementptr bfloat, ptr addrspace(1) %0, i64 %42, !dbg !15 + %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %44, !dbg !15 + %65 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !15 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %48, !dbg !15 + %67 = getelementptr bfloat, ptr addrspace(1) %0, i64 %50, !dbg !15 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %69 = getelementptr i64, ptr addrspace(1) %1, i64 %52, !dbg !18 + %70 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !18 + %71 = getelementptr i64, ptr addrspace(1) %1, i64 %54, !dbg !18 + %72 = getelementptr i64, ptr addrspace(1) %1, i64 %55, !dbg !18 + %73 = getelementptr i64, ptr addrspace(1) %1, i64 %56, !dbg !18 + %74 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !18 + %75 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !18 + %76 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !18 + %77 = insertelement <8 x i32> poison, i32 %19, i64 0, !dbg !19 + %78 = insertelement <8 x i32> %77, i32 %20, i64 1, !dbg !19 + %79 = insertelement <8 x i32> %78, i32 %21, i64 2, !dbg !19 + %80 = insertelement <8 x i32> %79, i32 %22, i64 3, !dbg !19 + %81 = insertelement <8 x i32> %80, i32 %23, i64 4, !dbg !19 + %82 = insertelement <8 x i32> %81, i32 %24, i64 5, !dbg !19 + %83 = insertelement <8 x i32> %82, i32 %25, i64 6, !dbg !19 + %84 = insertelement <8 x i32> %83, i32 %26, i64 7, !dbg !19 + %85 = insertelement <8 x i32> poison, i32 %10, i64 0, !dbg !19 + %86 = shufflevector <8 x i32> %85, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !19 + %87 = icmp slt <8 x i32> %84, %86, !dbg !19 + %88 = extractelement <8 x i1> %87, i64 0, !dbg !17 + %89 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %60, i64 %68, i1 %88) #4, !dbg !17 + %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %91 = extractelement <8 x i1> %87, i64 1, !dbg !17 + %92 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %61, i64 %90, i1 %91) #4, !dbg !17 + %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %94 = extractelement <8 x i1> %87, i64 2, !dbg !17 + %95 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %62, i64 %93, i1 %94) #4, !dbg !17 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %97 = extractelement <8 x i1> %87, i64 3, !dbg !17 + %98 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %63, i64 %96, i1 %97) #4, !dbg !17 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %100 = extractelement <8 x i1> %87, i64 4, !dbg !17 + %101 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %64, i64 %99, i1 %100) #4, !dbg !17 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %103 = extractelement <8 x i1> %87, i64 5, !dbg !17 + %104 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %65, i64 %102, i1 %103) #4, !dbg !17 + %105 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %106 = extractelement <8 x i1> %87, i64 6, !dbg !17 + %107 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %66, i64 %105, i1 %106) #4, !dbg !17 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %109 = extractelement <8 x i1> %87, i64 7, !dbg !17 + %110 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %67, i64 %108, i1 %109) #4, !dbg !17 + %111 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %69, i64 %111, i1 %88) #4, !dbg !20 + %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %114 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %70, i64 %113, i1 %91) #4, !dbg !20 + %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %116 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %71, i64 %115, i1 %94) #4, !dbg !20 + %117 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %118 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %72, i64 %117, i1 %97) #4, !dbg !20 + %119 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %120 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %73, i64 %119, i1 %100) #4, !dbg !20 + %121 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %122 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %74, i64 %121, i1 %103) #4, !dbg !20 + %123 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %124 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %75, i64 %123, i1 %106) #4, !dbg !20 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %126 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %76, i64 %125, i1 %109) #4, !dbg !20 + %127 = insertelement <8 x i64> poison, i64 %112, i64 0, !dbg !21 + %128 = insertelement <8 x i64> %127, i64 %114, i64 1, !dbg !21 + %129 = insertelement <8 x i64> %128, i64 %116, i64 2, !dbg !21 + %130 = insertelement <8 x i64> %129, i64 %118, i64 3, !dbg !21 + %131 = insertelement <8 x i64> %130, i64 %120, i64 4, !dbg !21 + %132 = insertelement <8 x i64> %131, i64 %122, i64 5, !dbg !21 + %133 = insertelement <8 x i64> %132, i64 %124, i64 6, !dbg !21 + %134 = insertelement <8 x i64> %133, i64 %126, i64 7, !dbg !21 + %135 = icmp slt <8 x i64> %134, zeroinitializer, !dbg !21 + %136 = insertelement <8 x i64> poison, i64 %7, i64 0, !dbg !22 + %137 = shufflevector <8 x i64> %136, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !22 + %138 = select <8 x i1> %135, <8 x i64> %137, <8 x i64> zeroinitializer, !dbg !22 + %139 = add <8 x i64> %138, %134, !dbg !22 + %140 = icmp slt <8 x i64> %139, zeroinitializer, !dbg !23 + %141 = icmp sge <8 x i64> %139, %137, !dbg !24 + %142 = or <8 x i1> %140, %141, !dbg !25 + %143 = and <8 x i1> %87, %142, !dbg !26 + %144 = bitcast <8 x i1> %143 to i8, !dbg !27 + %.not = icmp eq i8 %144, 0, !dbg !27 + br i1 %.not, label %146, label %145, !dbg !27 + +145: ; preds = %13 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 32, ptr nonnull @assertFunc_0, i64 1), !dbg !27 + unreachable, !dbg !27 + +146: ; preds = %13 + %147 = insertelement <8 x i64> poison, i64 %8, i64 0, !dbg !28 + %148 = shufflevector <8 x i64> %147, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !28 + %149 = srem <8 x i64> %35, %148, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %150 = extractelement <8 x i64> %139, i64 0, !dbg !29 + %151 = mul i64 %150, %8, !dbg !29 + %152 = extractelement <8 x i64> %139, i64 1, !dbg !29 + %153 = mul i64 %152, %8, !dbg !29 + %154 = extractelement <8 x i64> %139, i64 2, !dbg !29 + %155 = mul i64 %154, %8, !dbg !29 + %156 = extractelement <8 x i64> %139, i64 3, !dbg !29 + %157 = mul i64 %156, %8, !dbg !29 + %158 = extractelement <8 x i64> %139, i64 4, !dbg !29 + %159 = mul i64 %158, %8, !dbg !29 + %160 = extractelement <8 x i64> %139, i64 5, !dbg !29 + %161 = mul i64 %160, %8, !dbg !29 + %162 = extractelement <8 x i64> %139, i64 6, !dbg !29 + %163 = mul i64 %162, %8, !dbg !29 + %164 = extractelement <8 x i64> %139, i64 7, !dbg !29 + %165 = mul i64 %164, %8, !dbg !29 + %166 = extractelement <8 x i64> %149, i64 7, !dbg !30 + %167 = getelementptr bfloat, ptr addrspace(1) %2, i64 %166, !dbg !31 + %168 = getelementptr bfloat, ptr addrspace(1) %167, i64 %151, !dbg !31 + %169 = extractelement <8 x i64> %149, i64 6, !dbg !30 + %170 = getelementptr bfloat, ptr addrspace(1) %2, i64 %169, !dbg !31 + %171 = getelementptr bfloat, ptr addrspace(1) %170, i64 %153, !dbg !31 + %172 = extractelement <8 x i64> %149, i64 5, !dbg !30 + %173 = getelementptr bfloat, ptr addrspace(1) %2, i64 %172, !dbg !31 + %174 = getelementptr bfloat, ptr addrspace(1) %173, i64 %155, !dbg !31 + %175 = extractelement <8 x i64> %149, i64 4, !dbg !30 + %176 = getelementptr bfloat, ptr addrspace(1) %2, i64 %175, !dbg !31 + %177 = getelementptr bfloat, ptr addrspace(1) %176, i64 %157, !dbg !31 + %178 = extractelement <8 x i64> %149, i64 3, !dbg !30 + %179 = getelementptr bfloat, ptr addrspace(1) %2, i64 %178, !dbg !31 + %180 = getelementptr bfloat, ptr addrspace(1) %179, i64 %159, !dbg !31 + %181 = extractelement <8 x i64> %149, i64 2, !dbg !30 + %182 = getelementptr bfloat, ptr addrspace(1) %2, i64 %181, !dbg !31 + %183 = getelementptr bfloat, ptr addrspace(1) %182, i64 %161, !dbg !31 + %184 = extractelement <8 x i64> %149, i64 1, !dbg !30 + %185 = getelementptr bfloat, ptr addrspace(1) %2, i64 %184, !dbg !31 + %186 = getelementptr bfloat, ptr addrspace(1) %185, i64 %163, !dbg !31 + %187 = extractelement <8 x i64> %149, i64 0, !dbg !30 + %188 = getelementptr bfloat, ptr addrspace(1) %2, i64 %187, !dbg !31 + %189 = getelementptr bfloat, ptr addrspace(1) %188, i64 %165, !dbg !31 + %190 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %168, i64 %190, i1 %88) #4, !dbg !32 + %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %193 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %171, i64 %192, i1 %91) #4, !dbg !32 + %194 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %195 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %174, i64 %194, i1 %94) #4, !dbg !32 + %196 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %197 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %177, i64 %196, i1 %97) #4, !dbg !32 + %198 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %199 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %180, i64 %198, i1 %100) #4, !dbg !32 + %200 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %201 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %183, i64 %200, i1 %103) #4, !dbg !32 + %202 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %203 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %186, i64 %202, i1 %106) #4, !dbg !32 + %204 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %205 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %189, i64 %204, i1 %109) #4, !dbg !32 + %206 = sdiv i64 %8, 2, !dbg !33 + %207 = sub i64 %8, %206, !dbg !34 + %208 = icmp slt i64 %166, %207, !dbg !35 + %209 = icmp slt i64 %169, %207, !dbg !35 + %210 = icmp slt i64 %172, %207, !dbg !35 + %211 = icmp slt i64 %175, %207, !dbg !35 + %212 = icmp slt i64 %178, %207, !dbg !35 + %213 = icmp slt i64 %181, %207, !dbg !35 + %214 = icmp slt i64 %184, %207, !dbg !35 + %215 = icmp slt i64 %187, %207, !dbg !35 + %216 = sub nsw i64 %36, %166, !dbg !30 + %217 = sub nsw i64 %38, %169, !dbg !30 + %218 = sub nsw i64 %40, %172, !dbg !30 + %219 = sub nsw i64 %42, %175, !dbg !30 + %220 = sub nsw i64 %44, %178, !dbg !30 + %221 = sub nsw i64 %46, %181, !dbg !30 + %222 = sub nsw i64 %48, %184, !dbg !30 + %223 = sub nsw i64 %50, %187, !dbg !30 + %224 = getelementptr bfloat, ptr addrspace(1) %0, i64 %216, !dbg !36 + %225 = getelementptr bfloat, ptr addrspace(1) %224, i64 %206, !dbg !36 + %226 = getelementptr bfloat, ptr addrspace(1) %225, i64 %166, !dbg !36 + %227 = getelementptr bfloat, ptr addrspace(1) %0, i64 %217, !dbg !36 + %228 = getelementptr bfloat, ptr addrspace(1) %227, i64 %206, !dbg !36 + %229 = getelementptr bfloat, ptr addrspace(1) %228, i64 %169, !dbg !36 + %230 = getelementptr bfloat, ptr addrspace(1) %0, i64 %218, !dbg !36 + %231 = getelementptr bfloat, ptr addrspace(1) %230, i64 %206, !dbg !36 + %232 = getelementptr bfloat, ptr addrspace(1) %231, i64 %172, !dbg !36 + %233 = getelementptr bfloat, ptr addrspace(1) %0, i64 %219, !dbg !36 + %234 = getelementptr bfloat, ptr addrspace(1) %233, i64 %206, !dbg !36 + %235 = getelementptr bfloat, ptr addrspace(1) %234, i64 %175, !dbg !36 + %236 = getelementptr bfloat, ptr addrspace(1) %0, i64 %220, !dbg !36 + %237 = getelementptr bfloat, ptr addrspace(1) %236, i64 %206, !dbg !36 + %238 = getelementptr bfloat, ptr addrspace(1) %237, i64 %178, !dbg !36 + %239 = getelementptr bfloat, ptr addrspace(1) %0, i64 %221, !dbg !36 + %240 = getelementptr bfloat, ptr addrspace(1) %239, i64 %206, !dbg !36 + %241 = getelementptr bfloat, ptr addrspace(1) %240, i64 %181, !dbg !36 + %242 = getelementptr bfloat, ptr addrspace(1) %0, i64 %222, !dbg !36 + %243 = getelementptr bfloat, ptr addrspace(1) %242, i64 %206, !dbg !36 + %244 = getelementptr bfloat, ptr addrspace(1) %243, i64 %184, !dbg !36 + %245 = getelementptr bfloat, ptr addrspace(1) %0, i64 %223, !dbg !36 + %246 = getelementptr bfloat, ptr addrspace(1) %245, i64 %206, !dbg !36 + %247 = getelementptr bfloat, ptr addrspace(1) %246, i64 %187, !dbg !36 + %248 = and i1 %88, %208, !dbg !37 + %249 = and i1 %91, %209, !dbg !37 + %250 = and i1 %94, %210, !dbg !37 + %251 = and i1 %97, %211, !dbg !37 + %252 = and i1 %100, %212, !dbg !37 + %253 = and i1 %103, %213, !dbg !37 + %254 = and i1 %106, %214, !dbg !37 + %255 = and i1 %109, %215, !dbg !37 + %256 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %257 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %226, i64 %256, i1 %248) #4, !dbg !38 + %258 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %259 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %229, i64 %258, i1 %249) #4, !dbg !38 + %260 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %261 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %232, i64 %260, i1 %250) #4, !dbg !38 + %262 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %263 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %235, i64 %262, i1 %251) #4, !dbg !38 + %264 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %265 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %238, i64 %264, i1 %252) #4, !dbg !38 + %266 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %267 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %241, i64 %266, i1 %253) #4, !dbg !38 + %268 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %269 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %244, i64 %268, i1 %254) #4, !dbg !38 + %270 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %271 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %247, i64 %270, i1 %255) #4, !dbg !38 + %272 = insertelement <8 x i64> poison, i64 %207, i64 0, !dbg !39 + %273 = shufflevector <8 x i64> %272, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !39 + %274 = icmp sge <8 x i64> %149, %273, !dbg !39 + %275 = sub i64 %166, %8, !dbg !40 + %276 = sub i64 %169, %8, !dbg !40 + %277 = sub i64 %172, %8, !dbg !40 + %278 = sub i64 %175, %8, !dbg !40 + %279 = sub i64 %178, %8, !dbg !40 + %280 = sub i64 %181, %8, !dbg !40 + %281 = sub i64 %184, %8, !dbg !40 + %282 = sub i64 %187, %8, !dbg !40 + %283 = getelementptr bfloat, ptr addrspace(1) %224, i64 %275, !dbg !41 + %284 = getelementptr bfloat, ptr addrspace(1) %283, i64 %206, !dbg !41 + %285 = getelementptr bfloat, ptr addrspace(1) %227, i64 %276, !dbg !41 + %286 = getelementptr bfloat, ptr addrspace(1) %285, i64 %206, !dbg !41 + %287 = getelementptr bfloat, ptr addrspace(1) %230, i64 %277, !dbg !41 + %288 = getelementptr bfloat, ptr addrspace(1) %287, i64 %206, !dbg !41 + %289 = getelementptr bfloat, ptr addrspace(1) %233, i64 %278, !dbg !41 + %290 = getelementptr bfloat, ptr addrspace(1) %289, i64 %206, !dbg !41 + %291 = getelementptr bfloat, ptr addrspace(1) %236, i64 %279, !dbg !41 + %292 = getelementptr bfloat, ptr addrspace(1) %291, i64 %206, !dbg !41 + %293 = getelementptr bfloat, ptr addrspace(1) %239, i64 %280, !dbg !41 + %294 = getelementptr bfloat, ptr addrspace(1) %293, i64 %206, !dbg !41 + %295 = getelementptr bfloat, ptr addrspace(1) %242, i64 %281, !dbg !41 + %296 = getelementptr bfloat, ptr addrspace(1) %295, i64 %206, !dbg !41 + %297 = getelementptr bfloat, ptr addrspace(1) %245, i64 %282, !dbg !41 + %298 = getelementptr bfloat, ptr addrspace(1) %297, i64 %206, !dbg !41 + %299 = extractelement <8 x i1> %274, i64 7, !dbg !42 + %300 = and i1 %88, %299, !dbg !42 + %301 = extractelement <8 x i1> %274, i64 6, !dbg !42 + %302 = and i1 %91, %301, !dbg !42 + %303 = extractelement <8 x i1> %274, i64 5, !dbg !42 + %304 = and i1 %94, %303, !dbg !42 + %305 = extractelement <8 x i1> %274, i64 4, !dbg !42 + %306 = and i1 %97, %305, !dbg !42 + %307 = extractelement <8 x i1> %274, i64 3, !dbg !42 + %308 = and i1 %100, %307, !dbg !42 + %309 = extractelement <8 x i1> %274, i64 2, !dbg !42 + %310 = and i1 %103, %309, !dbg !42 + %311 = extractelement <8 x i1> %274, i64 1, !dbg !42 + %312 = and i1 %106, %311, !dbg !42 + %313 = extractelement <8 x i1> %274, i64 0, !dbg !42 + %314 = and i1 %109, %313, !dbg !42 + %315 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %316 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %284, i64 %315, i1 %300) #4, !dbg !43 + %317 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %318 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %286, i64 %317, i1 %302) #4, !dbg !43 + %319 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %320 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %288, i64 %319, i1 %304) #4, !dbg !43 + %321 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %322 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %290, i64 %321, i1 %306) #4, !dbg !43 + %323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %324 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %292, i64 %323, i1 %308) #4, !dbg !43 + %325 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %326 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %294, i64 %325, i1 %310) #4, !dbg !43 + %327 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %328 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %296, i64 %327, i1 %312) #4, !dbg !43 + %329 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !43 + %330 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %298, i64 %329, i1 %314) #4, !dbg !43 + %331 = insertelement <8 x i64> poison, i64 %9, i64 0, !dbg !44 + %332 = shufflevector <8 x i64> %331, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !44 + %333 = select <8 x i1> %135, <8 x i64> %332, <8 x i64> zeroinitializer, !dbg !44 + %334 = add <8 x i64> %333, %134, !dbg !44 + %335 = icmp slt <8 x i64> %334, zeroinitializer, !dbg !45 + %336 = icmp sge <8 x i64> %334, %332, !dbg !46 + %337 = or <8 x i1> %335, %336, !dbg !47 + %338 = and <8 x i1> %87, %337, !dbg !48 + %339 = bitcast <8 x i1> %338 to i8, !dbg !49 + %.not87 = icmp eq i8 %339, 0, !dbg !49 + br i1 %.not87, label %341, label %340, !dbg !49 + +340: ; preds = %146 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 52, ptr nonnull @assertFunc_1, i64 1), !dbg !49 + unreachable, !dbg !49 + +341: ; preds = %146 + %342 = bitcast i16 %271 to bfloat, !dbg !38 + %343 = fpext bfloat %342 to float, !dbg !50 + %344 = fsub float 0.000000e+00, %343, !dbg !51 + %345 = bitcast i16 %330 to bfloat, !dbg !43 + %346 = fpext bfloat %345 to float, !dbg !52 + %347 = select i1 %215, float %344, float %346, !dbg !53 + %348 = bitcast i16 %269 to bfloat, !dbg !38 + %349 = fpext bfloat %348 to float, !dbg !50 + %350 = fsub float 0.000000e+00, %349, !dbg !51 + %351 = bitcast i16 %328 to bfloat, !dbg !43 + %352 = fpext bfloat %351 to float, !dbg !52 + %353 = select i1 %214, float %350, float %352, !dbg !53 + %354 = bitcast i16 %267 to bfloat, !dbg !38 + %355 = fpext bfloat %354 to float, !dbg !50 + %356 = fsub float 0.000000e+00, %355, !dbg !51 + %357 = bitcast i16 %326 to bfloat, !dbg !43 + %358 = fpext bfloat %357 to float, !dbg !52 + %359 = select i1 %213, float %356, float %358, !dbg !53 + %360 = bitcast i16 %265 to bfloat, !dbg !38 + %361 = fpext bfloat %360 to float, !dbg !50 + %362 = fsub float 0.000000e+00, %361, !dbg !51 + %363 = bitcast i16 %324 to bfloat, !dbg !43 + %364 = fpext bfloat %363 to float, !dbg !52 + %365 = select i1 %212, float %362, float %364, !dbg !53 + %366 = bitcast i16 %263 to bfloat, !dbg !38 + %367 = fpext bfloat %366 to float, !dbg !50 + %368 = fsub float 0.000000e+00, %367, !dbg !51 + %369 = bitcast i16 %322 to bfloat, !dbg !43 + %370 = fpext bfloat %369 to float, !dbg !52 + %371 = select i1 %211, float %368, float %370, !dbg !53 + %372 = bitcast i16 %261 to bfloat, !dbg !38 + %373 = fpext bfloat %372 to float, !dbg !50 + %374 = fsub float 0.000000e+00, %373, !dbg !51 + %375 = bitcast i16 %320 to bfloat, !dbg !43 + %376 = fpext bfloat %375 to float, !dbg !52 + %377 = select i1 %210, float %374, float %376, !dbg !53 + %378 = bitcast i16 %259 to bfloat, !dbg !38 + %379 = fpext bfloat %378 to float, !dbg !50 + %380 = fsub float 0.000000e+00, %379, !dbg !51 + %381 = bitcast i16 %318 to bfloat, !dbg !43 + %382 = fpext bfloat %381 to float, !dbg !52 + %383 = select i1 %209, float %380, float %382, !dbg !53 + %384 = bitcast i16 %257 to bfloat, !dbg !38 + %385 = fpext bfloat %384 to float, !dbg !50 + %386 = fsub float 0.000000e+00, %385, !dbg !51 + %387 = bitcast i16 %316 to bfloat, !dbg !43 + %388 = fpext bfloat %387 to float, !dbg !52 + %389 = select i1 %208, float %386, float %388, !dbg !53 + %390 = bitcast i16 %110 to bfloat, !dbg !17 + %391 = fpext bfloat %390 to float, !dbg !54 + %392 = bitcast i16 %107 to bfloat, !dbg !17 + %393 = fpext bfloat %392 to float, !dbg !54 + %394 = bitcast i16 %104 to bfloat, !dbg !17 + %395 = fpext bfloat %394 to float, !dbg !54 + %396 = bitcast i16 %101 to bfloat, !dbg !17 + %397 = fpext bfloat %396 to float, !dbg !54 + %398 = bitcast i16 %98 to bfloat, !dbg !17 + %399 = fpext bfloat %398 to float, !dbg !54 + %400 = bitcast i16 %95 to bfloat, !dbg !17 + %401 = fpext bfloat %400 to float, !dbg !54 + %402 = bitcast i16 %92 to bfloat, !dbg !17 + %403 = fpext bfloat %402 to float, !dbg !54 + %404 = bitcast i16 %89 to bfloat, !dbg !17 + %405 = fpext bfloat %404 to float, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %406 = extractelement <8 x i64> %334, i64 0, !dbg !55 + %407 = mul i64 %406, %8, !dbg !55 + %408 = extractelement <8 x i64> %334, i64 1, !dbg !55 + %409 = mul i64 %408, %8, !dbg !55 + %410 = extractelement <8 x i64> %334, i64 2, !dbg !55 + %411 = mul i64 %410, %8, !dbg !55 + %412 = extractelement <8 x i64> %334, i64 3, !dbg !55 + %413 = mul i64 %412, %8, !dbg !55 + %414 = extractelement <8 x i64> %334, i64 4, !dbg !55 + %415 = mul i64 %414, %8, !dbg !55 + %416 = extractelement <8 x i64> %334, i64 5, !dbg !55 + %417 = mul i64 %416, %8, !dbg !55 + %418 = extractelement <8 x i64> %334, i64 6, !dbg !55 + %419 = mul i64 %418, %8, !dbg !55 + %420 = extractelement <8 x i64> %334, i64 7, !dbg !55 + %421 = mul i64 %420, %8, !dbg !55 + %422 = getelementptr bfloat, ptr addrspace(1) %3, i64 %166, !dbg !56 + %423 = getelementptr bfloat, ptr addrspace(1) %422, i64 %407, !dbg !56 + %424 = getelementptr bfloat, ptr addrspace(1) %3, i64 %169, !dbg !56 + %425 = getelementptr bfloat, ptr addrspace(1) %424, i64 %409, !dbg !56 + %426 = getelementptr bfloat, ptr addrspace(1) %3, i64 %172, !dbg !56 + %427 = getelementptr bfloat, ptr addrspace(1) %426, i64 %411, !dbg !56 + %428 = getelementptr bfloat, ptr addrspace(1) %3, i64 %175, !dbg !56 + %429 = getelementptr bfloat, ptr addrspace(1) %428, i64 %413, !dbg !56 + %430 = getelementptr bfloat, ptr addrspace(1) %3, i64 %178, !dbg !56 + %431 = getelementptr bfloat, ptr addrspace(1) %430, i64 %415, !dbg !56 + %432 = getelementptr bfloat, ptr addrspace(1) %3, i64 %181, !dbg !56 + %433 = getelementptr bfloat, ptr addrspace(1) %432, i64 %417, !dbg !56 + %434 = getelementptr bfloat, ptr addrspace(1) %3, i64 %184, !dbg !56 + %435 = getelementptr bfloat, ptr addrspace(1) %434, i64 %419, !dbg !56 + %436 = getelementptr bfloat, ptr addrspace(1) %3, i64 %187, !dbg !56 + %437 = getelementptr bfloat, ptr addrspace(1) %436, i64 %421, !dbg !56 + %438 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %439 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %423, i64 %438, i1 %88) #4, !dbg !57 + %440 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %441 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %425, i64 %440, i1 %91) #4, !dbg !57 + %442 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %443 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %427, i64 %442, i1 %94) #4, !dbg !57 + %444 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %445 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %429, i64 %444, i1 %97) #4, !dbg !57 + %446 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %447 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %431, i64 %446, i1 %100) #4, !dbg !57 + %448 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %449 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %433, i64 %448, i1 %103) #4, !dbg !57 + %450 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %451 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %435, i64 %450, i1 %106) #4, !dbg !57 + %452 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !57 + %453 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %437, i64 %452, i1 %109) #4, !dbg !57 + %454 = insertelement <2 x i16> poison, i16 %191, i64 0, !dbg !32 + %455 = insertelement <2 x i16> %454, i16 %439, i64 1, !dbg !32 + %456 = bitcast <2 x i16> %455 to <2 x bfloat>, !dbg !32 + %457 = fpext <2 x bfloat> %456 to <2 x float>, !dbg !58 + %458 = insertelement <2 x float> poison, float %405, i64 0, !dbg !59 + %459 = insertelement <2 x float> %458, float %389, i64 1, !dbg !59 + %460 = fmul <2 x float> %459, %457, !dbg !59 + %461 = insertelement <2 x i16> poison, i16 %193, i64 0, !dbg !32 + %462 = insertelement <2 x i16> %461, i16 %441, i64 1, !dbg !32 + %463 = bitcast <2 x i16> %462 to <2 x bfloat>, !dbg !32 + %464 = fpext <2 x bfloat> %463 to <2 x float>, !dbg !58 + %465 = insertelement <2 x float> poison, float %403, i64 0, !dbg !59 + %466 = insertelement <2 x float> %465, float %383, i64 1, !dbg !59 + %467 = fmul <2 x float> %466, %464, !dbg !59 + %468 = insertelement <2 x i16> poison, i16 %195, i64 0, !dbg !32 + %469 = insertelement <2 x i16> %468, i16 %443, i64 1, !dbg !32 + %470 = bitcast <2 x i16> %469 to <2 x bfloat>, !dbg !32 + %471 = fpext <2 x bfloat> %470 to <2 x float>, !dbg !58 + %472 = insertelement <2 x float> poison, float %401, i64 0, !dbg !59 + %473 = insertelement <2 x float> %472, float %377, i64 1, !dbg !59 + %474 = fmul <2 x float> %473, %471, !dbg !59 + %475 = insertelement <2 x i16> poison, i16 %197, i64 0, !dbg !32 + %476 = insertelement <2 x i16> %475, i16 %445, i64 1, !dbg !32 + %477 = bitcast <2 x i16> %476 to <2 x bfloat>, !dbg !32 + %478 = fpext <2 x bfloat> %477 to <2 x float>, !dbg !58 + %479 = insertelement <2 x float> poison, float %399, i64 0, !dbg !59 + %480 = insertelement <2 x float> %479, float %371, i64 1, !dbg !59 + %481 = fmul <2 x float> %480, %478, !dbg !59 + %482 = insertelement <2 x i16> poison, i16 %199, i64 0, !dbg !32 + %483 = insertelement <2 x i16> %482, i16 %447, i64 1, !dbg !32 + %484 = bitcast <2 x i16> %483 to <2 x bfloat>, !dbg !32 + %485 = fpext <2 x bfloat> %484 to <2 x float>, !dbg !58 + %486 = insertelement <2 x float> poison, float %397, i64 0, !dbg !59 + %487 = insertelement <2 x float> %486, float %365, i64 1, !dbg !59 + %488 = fmul <2 x float> %487, %485, !dbg !59 + %489 = insertelement <2 x i16> poison, i16 %201, i64 0, !dbg !32 + %490 = insertelement <2 x i16> %489, i16 %449, i64 1, !dbg !32 + %491 = bitcast <2 x i16> %490 to <2 x bfloat>, !dbg !32 + %492 = fpext <2 x bfloat> %491 to <2 x float>, !dbg !58 + %493 = insertelement <2 x float> poison, float %395, i64 0, !dbg !59 + %494 = insertelement <2 x float> %493, float %359, i64 1, !dbg !59 + %495 = fmul <2 x float> %494, %492, !dbg !59 + %496 = insertelement <2 x i16> poison, i16 %203, i64 0, !dbg !32 + %497 = insertelement <2 x i16> %496, i16 %451, i64 1, !dbg !32 + %498 = bitcast <2 x i16> %497 to <2 x bfloat>, !dbg !32 + %499 = fpext <2 x bfloat> %498 to <2 x float>, !dbg !58 + %500 = insertelement <2 x float> poison, float %393, i64 0, !dbg !59 + %501 = insertelement <2 x float> %500, float %353, i64 1, !dbg !59 + %502 = fmul <2 x float> %501, %499, !dbg !59 + %503 = insertelement <2 x i16> poison, i16 %205, i64 0, !dbg !32 + %504 = insertelement <2 x i16> %503, i16 %453, i64 1, !dbg !32 + %505 = bitcast <2 x i16> %504 to <2 x bfloat>, !dbg !32 + %506 = fpext <2 x bfloat> %505 to <2 x float>, !dbg !58 + %507 = insertelement <2 x float> poison, float %391, i64 0, !dbg !59 + %508 = insertelement <2 x float> %507, float %347, i64 1, !dbg !59 + %509 = fmul <2 x float> %508, %506, !dbg !59 + %shift = shufflevector <2 x float> %460, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop = fadd <2 x float> %460, %shift, !dbg !60 + %510 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !60 + %shift66 = shufflevector <2 x float> %467, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop67 = fadd <2 x float> %467, %shift66, !dbg !60 + %511 = extractelement <2 x float> %foldExtExtBinop67, i64 0, !dbg !60 + %shift69 = shufflevector <2 x float> %474, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop70 = fadd <2 x float> %474, %shift69, !dbg !60 + %512 = extractelement <2 x float> %foldExtExtBinop70, i64 0, !dbg !60 + %shift72 = shufflevector <2 x float> %481, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop73 = fadd <2 x float> %481, %shift72, !dbg !60 + %513 = extractelement <2 x float> %foldExtExtBinop73, i64 0, !dbg !60 + %shift75 = shufflevector <2 x float> %488, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop76 = fadd <2 x float> %488, %shift75, !dbg !60 + %514 = extractelement <2 x float> %foldExtExtBinop76, i64 0, !dbg !60 + %shift78 = shufflevector <2 x float> %495, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop79 = fadd <2 x float> %495, %shift78, !dbg !60 + %515 = extractelement <2 x float> %foldExtExtBinop79, i64 0, !dbg !60 + %shift81 = shufflevector <2 x float> %502, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop82 = fadd <2 x float> %502, %shift81, !dbg !60 + %516 = extractelement <2 x float> %foldExtExtBinop82, i64 0, !dbg !60 + %shift84 = shufflevector <2 x float> %509, <2 x float> poison, <2 x i32> , !dbg !60 + %foldExtExtBinop85 = fadd <2 x float> %509, %shift84, !dbg !60 + %517 = extractelement <2 x float> %foldExtExtBinop85, i64 0, !dbg !60 + %518 = getelementptr bfloat, ptr addrspace(1) %4, i64 %36, !dbg !61 + %519 = getelementptr bfloat, ptr addrspace(1) %4, i64 %38, !dbg !61 + %520 = getelementptr bfloat, ptr addrspace(1) %4, i64 %40, !dbg !61 + %521 = getelementptr bfloat, ptr addrspace(1) %4, i64 %42, !dbg !61 + %522 = getelementptr bfloat, ptr addrspace(1) %4, i64 %44, !dbg !61 + %523 = getelementptr bfloat, ptr addrspace(1) %4, i64 %46, !dbg !61 + %524 = getelementptr bfloat, ptr addrspace(1) %4, i64 %48, !dbg !61 + %525 = getelementptr bfloat, ptr addrspace(1) %4, i64 %50, !dbg !61 + %526 = fptrunc float %510 to bfloat, !dbg !62 + %527 = fptrunc float %511 to bfloat, !dbg !62 + %528 = fptrunc float %512 to bfloat, !dbg !62 + %529 = fptrunc float %513 to bfloat, !dbg !62 + %530 = fptrunc float %514 to bfloat, !dbg !62 + %531 = fptrunc float %515 to bfloat, !dbg !62 + %532 = fptrunc float %516 to bfloat, !dbg !62 + %533 = fptrunc float %517 to bfloat, !dbg !62 + %534 = bitcast bfloat %526 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %534, ptr addrspace(1) %518, i1 %88) #4, !dbg !62 + %535 = bitcast bfloat %527 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %535, ptr addrspace(1) %519, i1 %91) #4, !dbg !62 + %536 = bitcast bfloat %528 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %536, ptr addrspace(1) %520, i1 %94) #4, !dbg !62 + %537 = bitcast bfloat %529 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %537, ptr addrspace(1) %521, i1 %97) #4, !dbg !62 + %538 = bitcast bfloat %530 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %538, ptr addrspace(1) %522, i1 %100) #4, !dbg !62 + %539 = bitcast bfloat %531 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %539, ptr addrspace(1) %523, i1 %103) #4, !dbg !62 + %540 = bitcast bfloat %532 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %540, ptr addrspace(1) %524, i1 %106) #4, !dbg !62 + %541 = bitcast bfloat %533 to i16, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %541, ptr addrspace(1) %525, i1 %109) #4, !dbg !62 + ret void, !dbg !63 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 23, column: 21, scope: !9) +!15 = !DILocation(line: 26, column: 30, scope: !9) +!16 = !DILocation(line: 23, column: 28, scope: !9) +!17 = !DILocation(line: 26, column: 35, scope: !9) +!18 = !DILocation(line: 27, column: 30, scope: !9) +!19 = !DILocation(line: 21, column: 21, scope: !9) +!20 = !DILocation(line: 27, column: 35, scope: !9) +!21 = !DILocation(line: 30, column: 18, scope: !9) +!22 = !DILocation(line: 31, column: 32, scope: !9) +!23 = !DILocation(line: 32, column: 28, scope: !9) +!24 = !DILocation(line: 32, column: 44, scope: !9) +!25 = !DILocation(line: 32, column: 37, scope: !9) +!26 = !DILocation(line: 32, column: 52, scope: !9) +!27 = !DILocation(line: 32, column: 62, scope: !9) +!28 = !DILocation(line: 24, column: 19, scope: !9) +!29 = !DILocation(line: 33, column: 39, scope: !9) +!30 = !DILocation(line: 40, column: 35, scope: !9) +!31 = !DILocation(line: 33, column: 30, scope: !9) +!32 = !DILocation(line: 33, column: 46, scope: !9) +!33 = !DILocation(line: 38, column: 31, scope: !9) +!34 = !DILocation(line: 38, column: 18, scope: !9) +!35 = !DILocation(line: 39, column: 19, scope: !9) +!36 = !DILocation(line: 40, column: 31, scope: !9) +!37 = !DILocation(line: 40, column: 68, scope: !9) +!38 = !DILocation(line: 40, column: 60, scope: !9) +!39 = !DILocation(line: 44, column: 20, scope: !9) +!40 = !DILocation(line: 47, column: 47, scope: !9) +!41 = !DILocation(line: 47, column: 31, scope: !9) +!42 = !DILocation(line: 47, column: 81, scope: !9) +!43 = !DILocation(line: 47, column: 73, scope: !9) +!44 = !DILocation(line: 51, column: 34, scope: !9) +!45 = !DILocation(line: 52, column: 28, scope: !9) +!46 = !DILocation(line: 52, column: 46, scope: !9) +!47 = !DILocation(line: 52, column: 38, scope: !9) +!48 = !DILocation(line: 52, column: 54, scope: !9) +!49 = !DILocation(line: 52, column: 64, scope: !9) +!50 = !DILocation(line: 40, column: 119, scope: !9) +!51 = !DILocation(line: 41, column: 13, scope: !9) +!52 = !DILocation(line: 47, column: 132, scope: !9) +!53 = !DILocation(line: 0, scope: !9) +!54 = !DILocation(line: 26, column: 75, scope: !9) +!55 = !DILocation(line: 53, column: 40, scope: !9) +!56 = !DILocation(line: 53, column: 31, scope: !9) +!57 = !DILocation(line: 53, column: 48, scope: !9) +!58 = !DILocation(line: 33, column: 86, scope: !9) +!59 = !DILocation(line: 34, column: 18, scope: !9) +!60 = !DILocation(line: 55, column: 19, scope: !9) +!61 = !DILocation(line: 56, column: 25, scope: !9) +!62 = !DILocation(line: 56, column: 37, scope: !9) +!63 = !DILocation(line: 56, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cc5cb0d129131a9a5be8bd45df7fea3de3abcb6d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx @@ -0,0 +1,1534 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 53, 111, 47, 99, 53, 111, 115, 119, 110, 114, 55, 100, 112, 119, 119, 99, 113, 112, 53, 109, 55, 116, 104, 112, 97, 102, 52, 111, 119, 118, 112, 115, 101, 107, 97, 54, 97, 109, 100, 118, 100, 114, 111, 101, 119, 110, 111, 114, 122, 115, 114, 101, 54, 110, 50, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 50, 53, 32, 60, 32, 107, 115, 52}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 53, 111, 47, 99, 53, 111, 115, 119, 110, 114, 55, 100, 112, 119, 119, 99, 113, 112, 53, 109, 55, 116, 104, 112, 97, 102, 52, 111, 119, 118, 112, 115, 101, 107, 97, 54, 97, 109, 100, 118, 100, 114, 111, 101, 119, 110, 111, 114, 122, 115, 114, 101, 54, 110, 50, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 53, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9, + .param .u32 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_11, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_12 +) +.reqntid 128 +{ + .reg .pred %p<179>; + .reg .b16 %rs<165>; + .reg .b32 %r<160>; + .reg .b64 %rd<500>; + .loc 1 18 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:18:0 + +// %bb.0: + ld.param.b64 %rd103, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:19:28 + mov.u32 %r26, %ctaid.x; + .loc 1 19 33 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:19:33 + shl.b32 %r27, %r26, 10; + .loc 1 20 36 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:20:36 + mov.u32 %r28, %tid.x; + shl.b32 %r29, %r28, 3; + and.b32 %r30, %r29, 1016; + .loc 1 20 23 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:20:23 + or.b32 %r1, %r30, %r27; + or.b32 %r2, %r1, 1; + or.b32 %r3, %r1, 2; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd7, %r2; + cvt.s64.s32 %rd8, %r1; + or.b64 %rd108, %rd8, %rd103; + and.b64 %rd109, %rd108, -4294967296; + setp.ne.b64 %p9, %rd109, 0; + @%p9 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd484, %rd8, %rd103; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r31, %rd103; + cvt.u32.u64 %r32, %rd8; + div.u32 %r33, %r32, %r31; + cvt.u64.u32 %rd484, %r33; +$L__BB0_3: + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + or.b32 %r4, %r1, 3; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd6, %r3; + or.b64 %rd110, %rd7, %rd103; + and.b64 %rd111, %rd110, -4294967296; + setp.ne.b64 %p10, %rd111, 0; + @%p10 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd485, %rd7, %rd103; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r34, %rd103; + cvt.u32.u64 %r35, %rd7; + div.u32 %r36, %r35, %r34; + cvt.u64.u32 %rd485, %r36; +$L__BB0_6: + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + or.b32 %r5, %r1, 4; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd5, %r4; + or.b64 %rd112, %rd6, %rd103; + and.b64 %rd113, %rd112, -4294967296; + setp.ne.b64 %p11, %rd113, 0; + @%p11 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd486, %rd6, %rd103; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r37, %rd103; + cvt.u32.u64 %r38, %rd6; + div.u32 %r39, %r38, %r37; + cvt.u64.u32 %rd486, %r39; +$L__BB0_9: + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + or.b32 %r6, %r1, 5; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd4, %r5; + or.b64 %rd114, %rd5, %rd103; + and.b64 %rd115, %rd114, -4294967296; + setp.ne.b64 %p12, %rd115, 0; + @%p12 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + div.s64 %rd487, %rd5, %rd103; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r40, %rd103; + cvt.u32.u64 %r41, %rd5; + div.u32 %r42, %r41, %r40; + cvt.u64.u32 %rd487, %r42; +$L__BB0_12: + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + or.b32 %r7, %r1, 6; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd3, %r6; + or.b64 %rd116, %rd4, %rd103; + and.b64 %rd117, %rd116, -4294967296; + setp.ne.b64 %p13, %rd117, 0; + @%p13 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd488, %rd4, %rd103; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r43, %rd103; + cvt.u32.u64 %r44, %rd4; + div.u32 %r45, %r44, %r43; + cvt.u64.u32 %rd488, %r45; +$L__BB0_15: + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + or.b32 %r8, %r1, 7; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd2, %r7; + or.b64 %rd118, %rd3, %rd103; + and.b64 %rd119, %rd118, -4294967296; + setp.ne.b64 %p14, %rd119, 0; + @%p14 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd489, %rd3, %rd103; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r46, %rd103; + cvt.u32.u64 %r47, %rd3; + div.u32 %r48, %r47, %r46; + cvt.u64.u32 %rd489, %r48; +$L__BB0_18: + cvt.s64.s32 %rd1, %r8; + or.b64 %rd120, %rd2, %rd103; + and.b64 %rd121, %rd120, -4294967296; + setp.ne.b64 %p15, %rd121, 0; + @%p15 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + div.s64 %rd490, %rd2, %rd103; + bra.uni $L__BB0_21; +$L__BB0_19: + cvt.u32.u64 %r49, %rd103; + cvt.u32.u64 %r50, %rd2; + div.u32 %r51, %r50, %r49; + cvt.u64.u32 %rd490, %r51; +$L__BB0_21: + .loc 1 0 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:21 + ld.param.b64 %rd104, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6]; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + or.b64 %rd122, %rd1, %rd103; + and.b64 %rd123, %rd122, -4294967296; + setp.ne.b64 %p16, %rd123, 0; + @%p16 bra $L__BB0_23; + bra.uni $L__BB0_22; +$L__BB0_23: + div.s64 %rd491, %rd1, %rd103; + bra.uni $L__BB0_24; +$L__BB0_22: + cvt.u32.u64 %r52, %rd103; + cvt.u32.u64 %r53, %rd1; + div.u32 %r54, %r53, %r52; + cvt.u64.u32 %rd491, %r54; +$L__BB0_24: + .loc 1 23 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:28 + or.b64 %rd124, %rd484, %rd104; + and.b64 %rd125, %rd124, -4294967296; + setp.ne.b64 %p17, %rd125, 0; + @%p17 bra $L__BB0_26; + bra.uni $L__BB0_25; +$L__BB0_26: + rem.s64 %rd492, %rd484, %rd104; + bra.uni $L__BB0_27; +$L__BB0_25: + cvt.u32.u64 %r55, %rd104; + cvt.u32.u64 %r56, %rd484; + rem.u32 %r57, %r56, %r55; + cvt.u64.u32 %rd492, %r57; +$L__BB0_27: + or.b64 %rd126, %rd485, %rd104; + and.b64 %rd127, %rd126, -4294967296; + setp.ne.b64 %p18, %rd127, 0; + @%p18 bra $L__BB0_29; + bra.uni $L__BB0_28; +$L__BB0_29: + rem.s64 %rd493, %rd485, %rd104; + bra.uni $L__BB0_30; +$L__BB0_28: + cvt.u32.u64 %r58, %rd104; + cvt.u32.u64 %r59, %rd485; + rem.u32 %r60, %r59, %r58; + cvt.u64.u32 %rd493, %r60; +$L__BB0_30: + or.b64 %rd128, %rd486, %rd104; + and.b64 %rd129, %rd128, -4294967296; + setp.ne.b64 %p19, %rd129, 0; + @%p19 bra $L__BB0_32; + bra.uni $L__BB0_31; +$L__BB0_32: + rem.s64 %rd494, %rd486, %rd104; + bra.uni $L__BB0_33; +$L__BB0_31: + cvt.u32.u64 %r61, %rd104; + cvt.u32.u64 %r62, %rd486; + rem.u32 %r63, %r62, %r61; + cvt.u64.u32 %rd494, %r63; +$L__BB0_33: + or.b64 %rd130, %rd487, %rd104; + and.b64 %rd131, %rd130, -4294967296; + setp.ne.b64 %p20, %rd131, 0; + @%p20 bra $L__BB0_35; + bra.uni $L__BB0_34; +$L__BB0_35: + rem.s64 %rd495, %rd487, %rd104; + bra.uni $L__BB0_36; +$L__BB0_34: + cvt.u32.u64 %r64, %rd104; + cvt.u32.u64 %r65, %rd487; + rem.u32 %r66, %r65, %r64; + cvt.u64.u32 %rd495, %r66; +$L__BB0_36: + or.b64 %rd132, %rd488, %rd104; + and.b64 %rd133, %rd132, -4294967296; + setp.ne.b64 %p21, %rd133, 0; + @%p21 bra $L__BB0_38; + bra.uni $L__BB0_37; +$L__BB0_38: + rem.s64 %rd496, %rd488, %rd104; + bra.uni $L__BB0_39; +$L__BB0_37: + cvt.u32.u64 %r67, %rd104; + cvt.u32.u64 %r68, %rd488; + rem.u32 %r69, %r68, %r67; + cvt.u64.u32 %rd496, %r69; +$L__BB0_39: + or.b64 %rd134, %rd489, %rd104; + and.b64 %rd135, %rd134, -4294967296; + setp.ne.b64 %p22, %rd135, 0; + @%p22 bra $L__BB0_41; + bra.uni $L__BB0_40; +$L__BB0_41: + rem.s64 %rd497, %rd489, %rd104; + bra.uni $L__BB0_42; +$L__BB0_40: + cvt.u32.u64 %r70, %rd104; + cvt.u32.u64 %r71, %rd489; + rem.u32 %r72, %r71, %r70; + cvt.u64.u32 %rd497, %r72; +$L__BB0_42: + or.b64 %rd136, %rd490, %rd104; + and.b64 %rd137, %rd136, -4294967296; + setp.ne.b64 %p23, %rd137, 0; + @%p23 bra $L__BB0_44; + bra.uni $L__BB0_43; +$L__BB0_44: + rem.s64 %rd498, %rd490, %rd104; + bra.uni $L__BB0_45; +$L__BB0_43: + cvt.u32.u64 %r73, %rd104; + cvt.u32.u64 %r74, %rd490; + rem.u32 %r75, %r74, %r73; + cvt.u64.u32 %rd498, %r75; +$L__BB0_45: + .loc 1 0 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:28 + ld.param.b32 %r25, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10]; + ld.param.b64 %rd105, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd99, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd98, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0]; + .loc 1 23 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:28 + or.b64 %rd138, %rd491, %rd104; + and.b64 %rd139, %rd138, -4294967296; + setp.ne.b64 %p24, %rd139, 0; + @%p24 bra $L__BB0_47; + bra.uni $L__BB0_46; +$L__BB0_47: + rem.s64 %rd499, %rd491, %rd104; + bra.uni $L__BB0_48; +$L__BB0_46: + cvt.u32.u64 %r76, %rd104; + cvt.u32.u64 %r77, %rd491; + rem.u32 %r78, %r77, %r76; + cvt.u64.u32 %rd499, %r78; +$L__BB0_48: + .loc 1 26 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:30 + shl.b64 %rd196, %rd8, 1; + add.s64 %rd141, %rd98, %rd196; + shl.b64 %rd197, %rd7, 1; + add.s64 %rd144, %rd98, %rd197; + shl.b64 %rd198, %rd6, 1; + add.s64 %rd147, %rd98, %rd198; + shl.b64 %rd199, %rd5, 1; + add.s64 %rd150, %rd98, %rd199; + shl.b64 %rd200, %rd4, 1; + add.s64 %rd153, %rd98, %rd200; + shl.b64 %rd201, %rd3, 1; + add.s64 %rd156, %rd98, %rd201; + shl.b64 %rd202, %rd2, 1; + add.s64 %rd159, %rd98, %rd202; + shl.b64 %rd203, %rd1, 1; + add.s64 %rd162, %rd98, %rd203; + .loc 1 26 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:35 + // begin inline asm + mov.u64 %rd140, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd140, 1.0; + // end inline asm + .loc 1 27 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:27:30 + shl.b64 %rd204, %rd492, 3; + add.s64 %rd166, %rd99, %rd204; + shl.b64 %rd205, %rd493, 3; + add.s64 %rd170, %rd99, %rd205; + shl.b64 %rd206, %rd494, 3; + add.s64 %rd174, %rd99, %rd206; + shl.b64 %rd207, %rd495, 3; + add.s64 %rd178, %rd99, %rd207; + shl.b64 %rd208, %rd496, 3; + add.s64 %rd182, %rd99, %rd208; + shl.b64 %rd209, %rd497, 3; + add.s64 %rd186, %rd99, %rd209; + shl.b64 %rd210, %rd498, 3; + add.s64 %rd190, %rd99, %rd210; + shl.b64 %rd211, %rd499, 3; + add.s64 %rd194, %rd99, %rd211; + .loc 1 21 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:21:21 + setp.lt.s32 %p8, %r8, %r25; + setp.lt.s32 %p7, %r7, %r25; + setp.lt.s32 %p6, %r6, %r25; + setp.lt.s32 %p5, %r5, %r25; + setp.lt.s32 %p4, %r4, %r25; + setp.lt.s32 %p3, %r3, %r25; + setp.lt.s32 %p2, %r2, %r25; + setp.lt.s32 %p1, %r1, %r25; + .loc 1 26 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:35 + // begin inline asm + mov.u16 %rs33, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs33 }, [ %rd141 + 0 ], %rd140; + // end inline asm + // begin inline asm + mov.u64 %rd143, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd143, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs34, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs34 }, [ %rd144 + 0 ], %rd143; + // end inline asm + // begin inline asm + mov.u64 %rd146, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd146, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs35, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs35 }, [ %rd147 + 0 ], %rd146; + // end inline asm + // begin inline asm + mov.u64 %rd149, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd149, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs36, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs36 }, [ %rd150 + 0 ], %rd149; + // end inline asm + // begin inline asm + mov.u64 %rd152, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd152, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs37, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs37 }, [ %rd153 + 0 ], %rd152; + // end inline asm + // begin inline asm + mov.u64 %rd155, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd155, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs38, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs38 }, [ %rd156 + 0 ], %rd155; + // end inline asm + // begin inline asm + mov.u64 %rd158, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd158, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs39, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs39 }, [ %rd159 + 0 ], %rd158; + // end inline asm + // begin inline asm + mov.u64 %rd161, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd161, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs40, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs40 }, [ %rd162 + 0 ], %rd161; + // end inline asm + .loc 1 27 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:27:35 + // begin inline asm + mov.u64 %rd164, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd164, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd165, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd165 }, [ %rd166 + 0 ], %rd164; + // end inline asm + // begin inline asm + mov.u64 %rd168, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd168, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd169, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd169 }, [ %rd170 + 0 ], %rd168; + // end inline asm + // begin inline asm + mov.u64 %rd172, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd172, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd173, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd173 }, [ %rd174 + 0 ], %rd172; + // end inline asm + // begin inline asm + mov.u64 %rd176, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd176, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd177, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd177 }, [ %rd178 + 0 ], %rd176; + // end inline asm + // begin inline asm + mov.u64 %rd180, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd180, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd181, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd181 }, [ %rd182 + 0 ], %rd180; + // end inline asm + // begin inline asm + mov.u64 %rd184, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd184, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd185, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd185 }, [ %rd186 + 0 ], %rd184; + // end inline asm + // begin inline asm + mov.u64 %rd188, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd188, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd189, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd189 }, [ %rd190 + 0 ], %rd188; + // end inline asm + // begin inline asm + mov.u64 %rd192, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd192, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd193, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd193 }, [ %rd194 + 0 ], %rd192; + // end inline asm + .loc 1 31 32 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:31:32 + shr.s64 %rd212, %rd173, 63; + and.b64 %rd213, %rd212, %rd105; + shr.s64 %rd214, %rd177, 63; + and.b64 %rd215, %rd214, %rd105; + shr.s64 %rd216, %rd165, 63; + and.b64 %rd217, %rd216, %rd105; + shr.s64 %rd218, %rd169, 63; + and.b64 %rd219, %rd218, %rd105; + shr.s64 %rd220, %rd189, 63; + and.b64 %rd221, %rd220, %rd105; + shr.s64 %rd222, %rd193, 63; + and.b64 %rd223, %rd222, %rd105; + shr.s64 %rd224, %rd181, 63; + and.b64 %rd225, %rd224, %rd105; + shr.s64 %rd226, %rd185, 63; + and.b64 %rd227, %rd226, %rd105; + add.s64 %rd78, %rd227, %rd185; + add.s64 %rd77, %rd225, %rd181; + add.s64 %rd80, %rd223, %rd193; + add.s64 %rd79, %rd221, %rd189; + add.s64 %rd74, %rd219, %rd169; + add.s64 %rd73, %rd217, %rd165; + add.s64 %rd76, %rd215, %rd177; + add.s64 %rd75, %rd213, %rd173; + .loc 1 32 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:28 + setp.lt.s64 %p41, %rd75, 0; + setp.lt.s64 %p42, %rd76, 0; + setp.lt.s64 %p43, %rd73, 0; + setp.lt.s64 %p44, %rd74, 0; + setp.lt.s64 %p45, %rd79, 0; + setp.lt.s64 %p46, %rd80, 0; + setp.lt.s64 %p47, %rd77, 0; + setp.lt.s64 %p48, %rd78, 0; + .loc 1 32 44 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:44 + setp.ge.s64 %p49, %rd75, %rd105; + setp.ge.s64 %p50, %rd76, %rd105; + setp.ge.s64 %p51, %rd73, %rd105; + setp.ge.s64 %p52, %rd74, %rd105; + setp.ge.s64 %p53, %rd79, %rd105; + setp.ge.s64 %p54, %rd80, %rd105; + setp.ge.s64 %p55, %rd77, %rd105; + setp.ge.s64 %p56, %rd78, %rd105; + .loc 1 32 37 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:37 + or.pred %p57, %p48, %p56; + or.pred %p58, %p47, %p55; + or.pred %p59, %p46, %p54; + or.pred %p60, %p45, %p53; + or.pred %p61, %p44, %p52; + or.pred %p62, %p43, %p51; + or.pred %p63, %p42, %p50; + or.pred %p64, %p41, %p49; + .loc 1 32 52 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:52 + and.pred %p65, %p3, %p64; + selp.b16 %rs41, 1, 0, %p65; + shl.b16 %rs42, %rs41, 2; + and.pred %p66, %p4, %p63; + selp.b16 %rs43, -1, 0, %p66; + shl.b16 %rs44, %rs43, 3; + or.b16 %rs45, %rs44, %rs42; + and.pred %p67, %p1, %p62; + selp.b16 %rs46, 1, 0, %p67; + and.pred %p68, %p2, %p61; + selp.b16 %rs47, -1, 0, %p68; + shl.b16 %rs48, %rs47, 1; + or.b16 %rs49, %rs46, %rs48; + and.b16 %rs50, %rs49, 3; + or.b16 %rs51, %rs50, %rs45; + and.b16 %rs52, %rs51, 15; + and.pred %p69, %p7, %p60; + selp.b16 %rs53, 1, 0, %p69; + shl.b16 %rs54, %rs53, 2; + and.pred %p70, %p8, %p59; + selp.b16 %rs55, -1, 0, %p70; + shl.b16 %rs56, %rs55, 3; + or.b16 %rs57, %rs56, %rs54; + and.pred %p71, %p5, %p58; + selp.b16 %rs58, 1, 0, %p71; + and.pred %p72, %p6, %p57; + selp.b16 %rs59, -1, 0, %p72; + shl.b16 %rs60, %rs59, 1; + or.b16 %rs61, %rs58, %rs60; + and.b16 %rs62, %rs61, 3; + or.b16 %rs63, %rs62, %rs57; + shl.b16 %rs64, %rs63, 4; + or.b16 %rs65, %rs52, %rs64; + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + and.b16 %rs66, %rs65, 255; + setp.eq.b16 %p73, %rs66, 0; + @%p73 bra $L__BB0_50; + bra.uni $L__BB0_49; +$L__BB0_50: + .loc 1 0 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:62 + ld.param.b64 %rd107, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd106, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8]; + ld.param.b64 %rd100, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2]; + .loc 1 24 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:24:19 + rem.s64 %rd88, %rd1, %rd106; + rem.s64 %rd87, %rd2, %rd106; + rem.s64 %rd86, %rd3, %rd106; + rem.s64 %rd85, %rd4, %rd106; + rem.s64 %rd84, %rd5, %rd106; + rem.s64 %rd83, %rd6, %rd106; + rem.s64 %rd82, %rd7, %rd106; + rem.s64 %rd81, %rd8, %rd106; + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + bar.sync 0; + .loc 1 33 39 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:39 + mul.lo.s64 %rd306, %rd73, %rd106; + mul.lo.s64 %rd307, %rd74, %rd106; + mul.lo.s64 %rd308, %rd75, %rd106; + mul.lo.s64 %rd309, %rd76, %rd106; + mul.lo.s64 %rd310, %rd77, %rd106; + mul.lo.s64 %rd311, %rd78, %rd106; + mul.lo.s64 %rd312, %rd79, %rd106; + mul.lo.s64 %rd313, %rd80, %rd106; + .loc 1 33 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:30 + shl.b64 %rd314, %rd81, 1; + add.s64 %rd315, %rd100, %rd314; + shl.b64 %rd316, %rd306, 1; + add.s64 %rd235, %rd315, %rd316; + shl.b64 %rd317, %rd82, 1; + add.s64 %rd318, %rd100, %rd317; + shl.b64 %rd319, %rd307, 1; + add.s64 %rd238, %rd318, %rd319; + shl.b64 %rd320, %rd83, 1; + add.s64 %rd321, %rd100, %rd320; + shl.b64 %rd322, %rd308, 1; + add.s64 %rd241, %rd321, %rd322; + shl.b64 %rd323, %rd84, 1; + add.s64 %rd324, %rd100, %rd323; + shl.b64 %rd325, %rd309, 1; + add.s64 %rd244, %rd324, %rd325; + shl.b64 %rd326, %rd85, 1; + add.s64 %rd327, %rd100, %rd326; + shl.b64 %rd328, %rd310, 1; + add.s64 %rd247, %rd327, %rd328; + shl.b64 %rd329, %rd86, 1; + add.s64 %rd330, %rd100, %rd329; + shl.b64 %rd331, %rd311, 1; + add.s64 %rd250, %rd330, %rd331; + shl.b64 %rd332, %rd87, 1; + add.s64 %rd333, %rd100, %rd332; + shl.b64 %rd334, %rd312, 1; + add.s64 %rd253, %rd333, %rd334; + shl.b64 %rd335, %rd88, 1; + add.s64 %rd336, %rd100, %rd335; + shl.b64 %rd337, %rd313, 1; + add.s64 %rd256, %rd336, %rd337; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + // begin inline asm + mov.u64 %rd234, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd234, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs67, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs67 }, [ %rd235 + 0 ], %rd234; + // end inline asm + // begin inline asm + mov.u64 %rd237, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd237, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs68, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs68 }, [ %rd238 + 0 ], %rd237; + // end inline asm + // begin inline asm + mov.u64 %rd240, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd240, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs69, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs69 }, [ %rd241 + 0 ], %rd240; + // end inline asm + // begin inline asm + mov.u64 %rd243, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd243, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs70, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs70 }, [ %rd244 + 0 ], %rd243; + // end inline asm + // begin inline asm + mov.u64 %rd246, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd246, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs71, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs71 }, [ %rd247 + 0 ], %rd246; + // end inline asm + // begin inline asm + mov.u64 %rd249, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd249, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs72, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs72 }, [ %rd250 + 0 ], %rd249; + // end inline asm + // begin inline asm + mov.u64 %rd252, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd252, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs73, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs73 }, [ %rd253 + 0 ], %rd252; + // end inline asm + // begin inline asm + mov.u64 %rd255, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd255, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs74, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs74 }, [ %rd256 + 0 ], %rd255; + // end inline asm + .loc 1 38 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:38:31 + shr.u64 %rd338, %rd106, 63; + add.s64 %rd339, %rd106, %rd338; + shr.s64 %rd340, %rd339, 1; + .loc 1 38 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:38:18 + sub.s64 %rd89, %rd106, %rd340; + .loc 1 39 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:39:19 + setp.lt.s64 %p106, %rd81, %rd89; + setp.lt.s64 %p107, %rd82, %rd89; + setp.lt.s64 %p108, %rd83, %rd89; + setp.lt.s64 %p109, %rd84, %rd89; + setp.lt.s64 %p110, %rd85, %rd89; + setp.lt.s64 %p111, %rd86, %rd89; + setp.lt.s64 %p112, %rd87, %rd89; + setp.lt.s64 %p113, %rd88, %rd89; + .loc 1 40 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:35 + sub.s64 %rd341, %rd8, %rd81; + sub.s64 %rd342, %rd7, %rd82; + sub.s64 %rd343, %rd6, %rd83; + sub.s64 %rd344, %rd5, %rd84; + sub.s64 %rd345, %rd4, %rd85; + sub.s64 %rd346, %rd3, %rd86; + sub.s64 %rd347, %rd2, %rd87; + sub.s64 %rd348, %rd1, %rd88; + .loc 1 40 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:31 + shl.b64 %rd349, %rd341, 1; + add.s64 %rd350, %rd98, %rd349; + and.b64 %rd351, %rd339, -2; + add.s64 %rd352, %rd350, %rd351; + add.s64 %rd259, %rd352, %rd314; + shl.b64 %rd353, %rd342, 1; + add.s64 %rd354, %rd98, %rd353; + add.s64 %rd355, %rd354, %rd351; + add.s64 %rd262, %rd355, %rd317; + shl.b64 %rd356, %rd343, 1; + add.s64 %rd357, %rd98, %rd356; + add.s64 %rd358, %rd357, %rd351; + add.s64 %rd265, %rd358, %rd320; + shl.b64 %rd359, %rd344, 1; + add.s64 %rd360, %rd98, %rd359; + add.s64 %rd361, %rd360, %rd351; + add.s64 %rd268, %rd361, %rd323; + shl.b64 %rd362, %rd345, 1; + add.s64 %rd363, %rd98, %rd362; + add.s64 %rd364, %rd363, %rd351; + add.s64 %rd271, %rd364, %rd326; + shl.b64 %rd365, %rd346, 1; + add.s64 %rd366, %rd98, %rd365; + add.s64 %rd367, %rd366, %rd351; + add.s64 %rd274, %rd367, %rd329; + shl.b64 %rd368, %rd347, 1; + add.s64 %rd369, %rd98, %rd368; + add.s64 %rd370, %rd369, %rd351; + add.s64 %rd277, %rd370, %rd332; + shl.b64 %rd371, %rd348, 1; + add.s64 %rd372, %rd98, %rd371; + add.s64 %rd373, %rd372, %rd351; + add.s64 %rd280, %rd373, %rd335; + .loc 1 40 68 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:68 + and.pred %p82, %p1, %p106; + and.pred %p83, %p2, %p107; + and.pred %p84, %p3, %p108; + and.pred %p85, %p4, %p109; + and.pred %p86, %p5, %p110; + and.pred %p87, %p6, %p111; + and.pred %p88, %p7, %p112; + and.pred %p89, %p8, %p113; + .loc 1 40 60 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:60 + // begin inline asm + mov.u64 %rd258, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd258, 1.0; + // end inline asm + mov.b16 %rs76, 0; + // begin inline asm + mov.u16 %rs75, %rs76; + @%p82 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs75 }, [ %rd259 + 0 ], %rd258; + // end inline asm + // begin inline asm + mov.u64 %rd261, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd261, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs77, %rs76; + @%p83 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs77 }, [ %rd262 + 0 ], %rd261; + // end inline asm + // begin inline asm + mov.u64 %rd264, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd264, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs79, %rs76; + @%p84 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs79 }, [ %rd265 + 0 ], %rd264; + // end inline asm + // begin inline asm + mov.u64 %rd267, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd267, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs81, %rs76; + @%p85 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs81 }, [ %rd268 + 0 ], %rd267; + // end inline asm + // begin inline asm + mov.u64 %rd270, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd270, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs83, %rs76; + @%p86 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs83 }, [ %rd271 + 0 ], %rd270; + // end inline asm + // begin inline asm + mov.u64 %rd273, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd273, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs85, %rs76; + @%p87 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs85 }, [ %rd274 + 0 ], %rd273; + // end inline asm + // begin inline asm + mov.u64 %rd276, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd276, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs87, %rs76; + @%p88 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs87 }, [ %rd277 + 0 ], %rd276; + // end inline asm + // begin inline asm + mov.u64 %rd279, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd279, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs89, %rs76; + @%p89 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs89 }, [ %rd280 + 0 ], %rd279; + // end inline asm + .loc 1 44 20 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:44:20 + setp.ge.s64 %p114, %rd88, %rd89; + setp.ge.s64 %p115, %rd87, %rd89; + setp.ge.s64 %p116, %rd86, %rd89; + setp.ge.s64 %p117, %rd85, %rd89; + setp.ge.s64 %p118, %rd84, %rd89; + setp.ge.s64 %p119, %rd83, %rd89; + setp.ge.s64 %p120, %rd82, %rd89; + setp.ge.s64 %p121, %rd81, %rd89; + .loc 1 47 47 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:47 + sub.s64 %rd374, %rd81, %rd106; + sub.s64 %rd375, %rd82, %rd106; + sub.s64 %rd376, %rd83, %rd106; + sub.s64 %rd377, %rd84, %rd106; + sub.s64 %rd378, %rd85, %rd106; + sub.s64 %rd379, %rd86, %rd106; + sub.s64 %rd380, %rd87, %rd106; + sub.s64 %rd381, %rd88, %rd106; + .loc 1 47 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:31 + shl.b64 %rd382, %rd374, 1; + add.s64 %rd283, %rd352, %rd382; + shl.b64 %rd383, %rd375, 1; + add.s64 %rd286, %rd355, %rd383; + shl.b64 %rd384, %rd376, 1; + add.s64 %rd289, %rd358, %rd384; + shl.b64 %rd385, %rd377, 1; + add.s64 %rd292, %rd361, %rd385; + shl.b64 %rd386, %rd378, 1; + add.s64 %rd295, %rd364, %rd386; + shl.b64 %rd387, %rd379, 1; + add.s64 %rd298, %rd367, %rd387; + shl.b64 %rd388, %rd380, 1; + add.s64 %rd301, %rd370, %rd388; + shl.b64 %rd389, %rd381, 1; + add.s64 %rd304, %rd373, %rd389; + .loc 1 47 81 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:81 + and.pred %p90, %p1, %p121; + and.pred %p91, %p2, %p120; + and.pred %p92, %p3, %p119; + and.pred %p93, %p4, %p118; + and.pred %p94, %p5, %p117; + and.pred %p95, %p6, %p116; + and.pred %p96, %p7, %p115; + and.pred %p97, %p8, %p114; + .loc 1 47 73 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:73 + // begin inline asm + mov.u64 %rd282, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd282, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs91, %rs76; + @%p90 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs91 }, [ %rd283 + 0 ], %rd282; + // end inline asm + // begin inline asm + mov.u64 %rd285, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd285, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs93, %rs76; + @%p91 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs93 }, [ %rd286 + 0 ], %rd285; + // end inline asm + // begin inline asm + mov.u64 %rd288, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd288, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs95, %rs76; + @%p92 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs95 }, [ %rd289 + 0 ], %rd288; + // end inline asm + // begin inline asm + mov.u64 %rd291, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd291, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs97, %rs76; + @%p93 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs97 }, [ %rd292 + 0 ], %rd291; + // end inline asm + // begin inline asm + mov.u64 %rd294, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd294, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs99, %rs76; + @%p94 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs99 }, [ %rd295 + 0 ], %rd294; + // end inline asm + // begin inline asm + mov.u64 %rd297, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd297, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs101, %rs76; + @%p95 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs101 }, [ %rd298 + 0 ], %rd297; + // end inline asm + // begin inline asm + mov.u64 %rd300, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd300, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs103, %rs76; + @%p96 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs103 }, [ %rd301 + 0 ], %rd300; + // end inline asm + // begin inline asm + mov.u64 %rd303, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd303, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs105, %rs76; + @%p97 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs105 }, [ %rd304 + 0 ], %rd303; + // end inline asm + .loc 1 51 34 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:51:34 + and.b64 %rd391, %rd212, %rd107; + and.b64 %rd393, %rd214, %rd107; + and.b64 %rd395, %rd216, %rd107; + and.b64 %rd397, %rd218, %rd107; + and.b64 %rd399, %rd220, %rd107; + and.b64 %rd401, %rd222, %rd107; + and.b64 %rd403, %rd224, %rd107; + and.b64 %rd405, %rd226, %rd107; + add.s64 %rd95, %rd405, %rd185; + add.s64 %rd94, %rd403, %rd181; + add.s64 %rd97, %rd401, %rd193; + add.s64 %rd96, %rd399, %rd189; + add.s64 %rd91, %rd397, %rd169; + add.s64 %rd90, %rd395, %rd165; + add.s64 %rd93, %rd393, %rd177; + add.s64 %rd92, %rd391, %rd173; + .loc 1 52 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:28 + setp.lt.s64 %p122, %rd92, 0; + setp.lt.s64 %p123, %rd93, 0; + setp.lt.s64 %p124, %rd90, 0; + setp.lt.s64 %p125, %rd91, 0; + setp.lt.s64 %p126, %rd96, 0; + setp.lt.s64 %p127, %rd97, 0; + setp.lt.s64 %p128, %rd94, 0; + setp.lt.s64 %p129, %rd95, 0; + .loc 1 52 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:46 + setp.ge.s64 %p130, %rd92, %rd107; + setp.ge.s64 %p131, %rd93, %rd107; + setp.ge.s64 %p132, %rd90, %rd107; + setp.ge.s64 %p133, %rd91, %rd107; + setp.ge.s64 %p134, %rd96, %rd107; + setp.ge.s64 %p135, %rd97, %rd107; + setp.ge.s64 %p136, %rd94, %rd107; + setp.ge.s64 %p137, %rd95, %rd107; + .loc 1 52 38 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:38 + or.pred %p138, %p129, %p137; + or.pred %p139, %p128, %p136; + or.pred %p140, %p127, %p135; + or.pred %p141, %p126, %p134; + or.pred %p142, %p125, %p133; + or.pred %p143, %p124, %p132; + or.pred %p144, %p123, %p131; + or.pred %p145, %p122, %p130; + .loc 1 52 54 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:54 + and.pred %p146, %p3, %p145; + selp.b16 %rs107, 1, 0, %p146; + shl.b16 %rs108, %rs107, 2; + and.pred %p147, %p4, %p144; + selp.b16 %rs109, -1, 0, %p147; + shl.b16 %rs110, %rs109, 3; + or.b16 %rs111, %rs110, %rs108; + and.pred %p148, %p1, %p143; + selp.b16 %rs112, 1, 0, %p148; + and.pred %p149, %p2, %p142; + selp.b16 %rs113, -1, 0, %p149; + shl.b16 %rs114, %rs113, 1; + or.b16 %rs115, %rs112, %rs114; + and.b16 %rs116, %rs115, 3; + or.b16 %rs117, %rs116, %rs111; + and.b16 %rs118, %rs117, 15; + and.pred %p150, %p7, %p141; + selp.b16 %rs119, 1, 0, %p150; + shl.b16 %rs120, %rs119, 2; + and.pred %p151, %p8, %p140; + selp.b16 %rs121, -1, 0, %p151; + shl.b16 %rs122, %rs121, 3; + or.b16 %rs123, %rs122, %rs120; + and.pred %p152, %p5, %p139; + selp.b16 %rs124, 1, 0, %p152; + and.pred %p153, %p6, %p138; + selp.b16 %rs125, -1, 0, %p153; + shl.b16 %rs126, %rs125, 1; + or.b16 %rs127, %rs124, %rs126; + and.b16 %rs128, %rs127, 3; + or.b16 %rs129, %rs128, %rs123; + shl.b16 %rs130, %rs129, 4; + or.b16 %rs131, %rs118, %rs130; + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + and.b16 %rs132, %rs131, 255; + setp.eq.b16 %p154, %rs132, 0; + @%p154 bra $L__BB0_52; + bra.uni $L__BB0_51; +$L__BB0_52: + .loc 1 0 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:64 + ld.param.b64 %rd102, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd101, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3]; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r79, %rs89; + mov.b32 %r80, 0f00000000; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r81, %r80, %r79; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r82, %rs105; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r83, %r81, %r82, %p113; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r84, %rs87; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r85, %r80, %r84; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r86, %rs103; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r87, %r85, %r86, %p112; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r88, %rs85; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r89, %r80, %r88; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r90, %rs101; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r91, %r89, %r90, %p111; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r92, %rs83; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r93, %r80, %r92; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r94, %rs99; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r95, %r93, %r94, %p110; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r96, %rs81; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r97, %r80, %r96; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r98, %rs97; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r99, %r97, %r98, %p109; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r100, %rs79; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r101, %r80, %r100; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r102, %rs95; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r103, %r101, %r102, %p108; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r104, %rs77; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r105, %r80, %r104; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r106, %rs93; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r107, %r105, %r106, %p107; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r108, %rs75; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r109, %r80, %r108; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r110, %rs91; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r111, %r109, %r110, %p106; + .loc 1 26 75 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:75 + cvt.f32.bf16 %r112, %rs40; + cvt.f32.bf16 %r113, %rs39; + cvt.f32.bf16 %r114, %rs38; + cvt.f32.bf16 %r115, %rs37; + cvt.f32.bf16 %r116, %rs36; + cvt.f32.bf16 %r117, %rs35; + cvt.f32.bf16 %r118, %rs34; + cvt.f32.bf16 %r119, %rs33; + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + bar.sync 0; + .loc 1 53 40 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:40 + mul.lo.s64 %rd444, %rd90, %rd106; + mul.lo.s64 %rd445, %rd91, %rd106; + mul.lo.s64 %rd446, %rd92, %rd106; + mul.lo.s64 %rd447, %rd93, %rd106; + mul.lo.s64 %rd448, %rd94, %rd106; + mul.lo.s64 %rd449, %rd95, %rd106; + mul.lo.s64 %rd450, %rd96, %rd106; + mul.lo.s64 %rd451, %rd97, %rd106; + .loc 1 53 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:31 + add.s64 %rd453, %rd101, %rd314; + shl.b64 %rd454, %rd444, 1; + add.s64 %rd413, %rd453, %rd454; + add.s64 %rd456, %rd101, %rd317; + shl.b64 %rd457, %rd445, 1; + add.s64 %rd416, %rd456, %rd457; + add.s64 %rd459, %rd101, %rd320; + shl.b64 %rd460, %rd446, 1; + add.s64 %rd419, %rd459, %rd460; + add.s64 %rd462, %rd101, %rd323; + shl.b64 %rd463, %rd447, 1; + add.s64 %rd422, %rd462, %rd463; + add.s64 %rd465, %rd101, %rd326; + shl.b64 %rd466, %rd448, 1; + add.s64 %rd425, %rd465, %rd466; + add.s64 %rd468, %rd101, %rd329; + shl.b64 %rd469, %rd449, 1; + add.s64 %rd428, %rd468, %rd469; + add.s64 %rd471, %rd101, %rd332; + shl.b64 %rd472, %rd450, 1; + add.s64 %rd431, %rd471, %rd472; + add.s64 %rd474, %rd101, %rd335; + shl.b64 %rd475, %rd451, 1; + add.s64 %rd434, %rd474, %rd475; + .loc 1 53 48 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:48 + // begin inline asm + mov.u64 %rd414, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd414, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs133, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs133 }, [ %rd413 + 0 ], %rd414; + // end inline asm + // begin inline asm + mov.u64 %rd417, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd417, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs134, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs134 }, [ %rd416 + 0 ], %rd417; + // end inline asm + // begin inline asm + mov.u64 %rd420, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd420, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs135, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs135 }, [ %rd419 + 0 ], %rd420; + // end inline asm + // begin inline asm + mov.u64 %rd423, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd423, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs136, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs136 }, [ %rd422 + 0 ], %rd423; + // end inline asm + // begin inline asm + mov.u64 %rd426, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd426, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs137, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs137 }, [ %rd425 + 0 ], %rd426; + // end inline asm + // begin inline asm + mov.u64 %rd429, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd429, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs138, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs138 }, [ %rd428 + 0 ], %rd429; + // end inline asm + // begin inline asm + mov.u64 %rd432, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd432, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs139, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs139 }, [ %rd431 + 0 ], %rd432; + // end inline asm + // begin inline asm + mov.u64 %rd435, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd435, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs140, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs140 }, [ %rd434 + 0 ], %rd435; + // end inline asm + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r120, {%rs67, %rs133}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs149, %rs150}, %r120; + cvt.f32.bf16 %r121, %rs149; + cvt.f32.bf16 %r122, %rs150; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r123, %r111, %r122; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r124, {%rs68, %rs134}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs151, %rs152}, %r124; + cvt.f32.bf16 %r125, %rs151; + cvt.f32.bf16 %r126, %rs152; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r127, %r107, %r126; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r128, {%rs69, %rs135}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs153, %rs154}, %r128; + cvt.f32.bf16 %r129, %rs153; + cvt.f32.bf16 %r130, %rs154; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r131, %r103, %r130; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r132, {%rs70, %rs136}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs155, %rs156}, %r132; + cvt.f32.bf16 %r133, %rs155; + cvt.f32.bf16 %r134, %rs156; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r135, %r99, %r134; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r136, {%rs71, %rs137}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs157, %rs158}, %r136; + cvt.f32.bf16 %r137, %rs157; + cvt.f32.bf16 %r138, %rs158; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r139, %r95, %r138; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r140, {%rs72, %rs138}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs159, %rs160}, %r140; + cvt.f32.bf16 %r141, %rs159; + cvt.f32.bf16 %r142, %rs160; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r143, %r91, %r142; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r144, {%rs73, %rs139}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs161, %rs162}, %r144; + cvt.f32.bf16 %r145, %rs161; + cvt.f32.bf16 %r146, %rs162; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r147, %r87, %r146; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r148, {%rs74, %rs140}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs163, %rs164}, %r148; + cvt.f32.bf16 %r149, %rs163; + cvt.f32.bf16 %r150, %rs164; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r151, %r83, %r150; + .loc 1 55 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:55:19 + fma.rn.f32 %r152, %r119, %r121, %r123; + fma.rn.f32 %r153, %r118, %r125, %r127; + fma.rn.f32 %r154, %r117, %r129, %r131; + fma.rn.f32 %r155, %r116, %r133, %r135; + fma.rn.f32 %r156, %r115, %r137, %r139; + fma.rn.f32 %r157, %r114, %r141, %r143; + fma.rn.f32 %r158, %r113, %r145, %r147; + fma.rn.f32 %r159, %r112, %r149, %r151; + .loc 1 56 25 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:25 + add.s64 %rd436, %rd102, %rd196; + add.s64 %rd437, %rd102, %rd197; + add.s64 %rd438, %rd102, %rd198; + add.s64 %rd439, %rd102, %rd199; + add.s64 %rd440, %rd102, %rd200; + add.s64 %rd441, %rd102, %rd201; + add.s64 %rd442, %rd102, %rd202; + add.s64 %rd443, %rd102, %rd203; + .loc 1 56 37 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:37 + cvt.rn.bf16.f32 %rs141, %r152; + cvt.rn.bf16.f32 %rs142, %r153; + cvt.rn.bf16.f32 %rs143, %r154; + cvt.rn.bf16.f32 %rs144, %r155; + cvt.rn.bf16.f32 %rs145, %r156; + cvt.rn.bf16.f32 %rs146, %r157; + cvt.rn.bf16.f32 %rs147, %r158; + cvt.rn.bf16.f32 %rs148, %r159; + // begin inline asm + @%p1 st.global.b16 [ %rd436 + 0 ], { %rs141 }; + // end inline asm + // begin inline asm + @%p2 st.global.b16 [ %rd437 + 0 ], { %rs142 }; + // end inline asm + // begin inline asm + @%p3 st.global.b16 [ %rd438 + 0 ], { %rs143 }; + // end inline asm + // begin inline asm + @%p4 st.global.b16 [ %rd439 + 0 ], { %rs144 }; + // end inline asm + // begin inline asm + @%p5 st.global.b16 [ %rd440 + 0 ], { %rs145 }; + // end inline asm + // begin inline asm + @%p6 st.global.b16 [ %rd441 + 0 ], { %rs146 }; + // end inline asm + // begin inline asm + @%p7 st.global.b16 [ %rd442 + 0 ], { %rs147 }; + // end inline asm + // begin inline asm + @%p8 st.global.b16 [ %rd443 + 0 ], { %rs148 }; + // end inline asm + .loc 1 56 4 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:4 + ret; +$L__BB0_49: + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd228, assertFunc_0; + cvta.global.u64 %rd229, %rd228; + st.param.b64 [param3], %rd229; + mov.b64 %rd230, assertFile_0; + cvta.global.u64 %rd231, %rd230; + st.param.b64 [param1], %rd231; + mov.b64 %rd232, assertMessage_0; + cvta.global.u64 %rd233, %rd232; + st.param.b64 [param0], %rd233; + st.param.b64 [param4], 1; + st.param.b32 [param2], 32; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_51: + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd406, assertFunc_1; + cvta.global.u64 %rd407, %rd406; + st.param.b64 [param3], %rd407; + mov.b64 %rd408, assertFile_1; + cvta.global.u64 %rd409, %rd408; + st.param.b64 [param1], %rd409; + mov.b64 %rd410, assertMessage_1; + cvta.global.u64 %rd411, %rd410; + st.param.b64 [param0], %rd411; + st.param.b64 [param4], 1; + st.param.b32 [param2], 52; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 111 +.b8 115 +.b8 119 +.b8 110 +.b8 114 +.b8 55 +.b8 100 +.b8 112 +.b8 119 +.b8 119 +.b8 99 +.b8 113 +.b8 112 +.b8 53 +.b8 109 +.b8 55 +.b8 116 +.b8 104 +.b8 112 +.b8 97 +.b8 102 +.b8 52 +.b8 111 +.b8 119 +.b8 118 +.b8 112 +.b8 115 +.b8 101 +.b8 107 +.b8 97 +.b8 54 +.b8 97 +.b8 109 +.b8 100 +.b8 118 +.b8 100 +.b8 114 +.b8 111 +.b8 101 +.b8 119 +.b8 110 +.b8 111 +.b8 114 +.b8 122 +.b8 115 +.b8 114 +.b8 101 +.b8 54 +.b8 110 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 111 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..a9f6dad6c25a76fdba76f36d64c0c85614fd4c8c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source @@ -0,0 +1,299 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc78 = loc("in_ptr0"(#loc)) +#loc79 = loc("in_ptr1"(#loc)) +#loc80 = loc("in_ptr2"(#loc)) +#loc81 = loc("in_ptr3"(#loc)) +#loc82 = loc("out_ptr0"(#loc)) +#loc83 = loc("ks0"(#loc)) +#loc84 = loc("ks1"(#loc)) +#loc85 = loc("ks2"(#loc)) +#loc86 = loc("ks3"(#loc)) +#loc87 = loc("ks4"(#loc)) +#loc88 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_0 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_1 = arith.constant 1024 : i32 loc(#loc90) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc90) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc91) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<1024xi32> loc(#loc92) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<1024xi32> loc(#loc92) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc93) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<1024xi32> loc(#loc93) + %x2 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc94) + %x2_6 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc94) + %x2_7 = arith.divsi %x2, %x2_6 : tensor<1024xi64> loc(#loc94) + %x2_8 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc95) + %x2_9 = arith.remsi %x2_7, %x2_8 : tensor<1024xi64> loc(#loc95) + %x0 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc96) + %x0_10 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc96) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<1024xi64> loc(#loc96) + %x5 = arith.extsi %xindex_4 : tensor<1024xi32> to tensor<1024xi64> loc(#loc97) + %x5_12 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc97) + %x5_13 = arith.divsi %x5, %x5_12 : tensor<1024xi64> loc(#loc97) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc98) + %tmp0_14 = tt.addptr %tmp0, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc98) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc99) + %tmp0_16 = arith.extf %tmp0_15 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc100) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc101) + %tmp1_17 = tt.addptr %tmp1, %x2_9 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc101) + %tmp1_18 = tt.load %tmp1_17, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc102) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc103) + %tmp3_19 = arith.addi %tmp1_18, %tmp3 : tensor<1024xi64> loc(#loc103) + %tmp4 = arith.constant 0 : i32 loc(#loc104) + %tmp4_20 = arith.extsi %tmp4 : i32 to i64 loc(#loc104) + %tmp4_21 = tt.splat %tmp4_20 : i64 -> tensor<1024xi64> loc(#loc104) + %tmp4_22 = arith.cmpi slt, %tmp1_18, %tmp4_21 : tensor<1024xi64> loc(#loc104) + %tmp5 = arith.select %tmp4_22, %tmp3_19, %tmp1_18 : tensor<1024xi1>, tensor<1024xi64> loc(#loc105) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc18) + %1 = tt.splat %0 : i64 -> tensor<1024xi64> loc(#loc18) + %2 = arith.cmpi sle, %1, %tmp5 : tensor<1024xi64> loc(#loc18) + %3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc19) + %4 = arith.cmpi slt, %tmp5, %3 : tensor<1024xi64> loc(#loc19) + %5 = arith.andi %2, %4 : tensor<1024xi1> loc(#loc20) + %true = arith.constant true loc(#loc21) + %cst = arith.constant dense : tensor<1024xi1> loc(#loc21) + %6 = arith.xori %xmask_5, %cst : tensor<1024xi1> loc(#loc21) + %7 = arith.ori %5, %6 : tensor<1024xi1> loc(#loc22) + tt.assert %7, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1> loc(#loc23) + %tmp7 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc106) + %tmp7_23 = arith.muli %tmp7, %tmp5 : tensor<1024xi64> loc(#loc106) + %tmp7_24 = arith.addi %x0_11, %tmp7_23 : tensor<1024xi64> loc(#loc107) + %tmp7_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc108) + %tmp7_26 = tt.addptr %tmp7_25, %tmp7_24 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc108) + %tmp7_27 = tt.load %tmp7_26, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc109) + %tmp7_28 = arith.extf %tmp7_27 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc110) + %tmp8 = arith.mulf %tmp0_16, %tmp7_28 : tensor<1024xf32> loc(#loc111) + %tmp10 = arith.constant 0 : i64 loc(#loc112) + %tmp10_29 = arith.constant dense<0> : tensor<1xi64> loc(#loc112) + %tmp11 = arith.constant dense<0> : tensor<1024xi64> loc(#loc113) + %tmp11_30 = arith.cmpi sge, %x0_11, %tmp11 : tensor<1024xi64> loc(#loc113) + %tmp12 = arith.constant 2 : i32 loc(#loc114) + %tmp12_31 = arith.constant 2 : i64 loc(#loc114) + %tmp12_32 = arith.divsi %ks3, %tmp12_31 : i64 loc(#loc114) + %tmp12_33 = arith.constant -1 : i32 loc(#loc115) + %tmp12_34 = arith.constant -1 : i64 loc(#loc115) + %tmp12_35 = arith.muli %tmp12_34, %tmp12_32 : i64 loc(#loc115) + %tmp12_36 = arith.addi %ks3, %tmp12_35 : i64 loc(#loc116) + %tmp13 = tt.splat %tmp12_36 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp13_37 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc117) + %tmp14 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc118) + %tmp14_38 = arith.muli %tmp14, %x5_13 : tensor<1024xi64> loc(#loc118) + %tmp14_39 = arith.constant 2 : i32 loc(#loc119) + %tmp14_40 = arith.constant 2 : i64 loc(#loc119) + %tmp14_41 = arith.divsi %ks3, %tmp14_40 : i64 loc(#loc119) + %tmp14_42 = tt.splat %tmp14_41 : i64 -> tensor<1024xi64> loc(#loc120) + %tmp14_43 = arith.addi %tmp14_38, %tmp14_42 : tensor<1024xi64> loc(#loc120) + %tmp14_44 = arith.addi %tmp14_43, %x0_11 : tensor<1024xi64> loc(#loc121) + %tmp14_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc122) + %tmp14_46 = tt.addptr %tmp14_45, %tmp14_44 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc122) + %tmp14_47 = arith.andi %tmp13_37, %xmask_5 : tensor<1024xi1> loc(#loc123) + %tmp14_48 = arith.constant 0.000000e+00 : f32 loc(#loc124) + %tmp14_49 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc124) + %tmp14_50 = arith.truncf %tmp14_49 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc124) + %tmp14_51 = tt.load %tmp14_46, %tmp14_47, %tmp14_50 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc124) + %tmp14_52 = arith.extf %tmp14_51 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc125) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp15_53 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc126) + %tmp15_54 = arith.subf %tmp15_53, %tmp14_52 : tensor<1024xf32> loc(#loc126) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc127) + %tmp16_55 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc127) + %tmp17 = arith.select %tmp13_37, %tmp15_54, %tmp16_55 : tensor<1024xi1>, tensor<1024xf32> loc(#loc128) + %tmp18 = tt.splat %tmp12_36 : i64 -> tensor<1024xi64> loc(#loc129) + %tmp18_56 = arith.cmpi sge, %x0_11, %tmp18 : tensor<1024xi64> loc(#loc129) + %tmp20 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc130) + %tmp20_57 = arith.cmpi slt, %x0_11, %tmp20 : tensor<1024xi64> loc(#loc130) + %tmp21 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc131) + %tmp21_58 = arith.muli %tmp21, %x5_13 : tensor<1024xi64> loc(#loc131) + %tmp21_59 = arith.constant -1 : i32 loc(#loc132) + %tmp21_60 = arith.constant -1 : i64 loc(#loc132) + %tmp21_61 = arith.muli %tmp21_60, %ks3 : i64 loc(#loc132) + %tmp21_62 = tt.splat %tmp21_61 : i64 -> tensor<1024xi64> loc(#loc133) + %tmp21_63 = arith.addi %x0_11, %tmp21_62 : tensor<1024xi64> loc(#loc133) + %tmp21_64 = arith.constant 2 : i32 loc(#loc134) + %tmp21_65 = arith.constant 2 : i64 loc(#loc134) + %tmp21_66 = arith.divsi %ks3, %tmp21_65 : i64 loc(#loc134) + %tmp21_67 = tt.splat %tmp21_66 : i64 -> tensor<1024xi64> loc(#loc135) + %tmp21_68 = arith.addi %tmp21_63, %tmp21_67 : tensor<1024xi64> loc(#loc135) + %tmp21_69 = arith.addi %tmp21_58, %tmp21_68 : tensor<1024xi64> loc(#loc136) + %tmp21_70 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc137) + %tmp21_71 = tt.addptr %tmp21_70, %tmp21_69 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc137) + %tmp21_72 = arith.andi %tmp18_56, %xmask_5 : tensor<1024xi1> loc(#loc138) + %tmp21_73 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp21_74 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc139) + %tmp21_75 = arith.truncf %tmp21_74 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc139) + %tmp21_76 = tt.load %tmp21_71, %tmp21_72, %tmp21_75 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc139) + %tmp21_77 = arith.extf %tmp21_76 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc140) + %tmp22 = arith.select %tmp13_37, %tmp17, %tmp21_77 : tensor<1024xi1>, tensor<1024xf32> loc(#loc141) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc142) + %tmp24_78 = arith.addi %tmp1_18, %tmp24 : tensor<1024xi64> loc(#loc142) + %tmp25 = arith.select %tmp4_22, %tmp24_78, %tmp1_18 : tensor<1024xi1>, tensor<1024xi64> loc(#loc143) + %c0_i32_79 = arith.constant 0 : i32 loc(#loc62) + %8 = arith.extsi %c0_i32_79 : i32 to i64 loc(#loc62) + %9 = tt.splat %8 : i64 -> tensor<1024xi64> loc(#loc62) + %10 = arith.cmpi sle, %9, %tmp25 : tensor<1024xi64> loc(#loc62) + %11 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc63) + %12 = arith.cmpi slt, %tmp25, %11 : tensor<1024xi64> loc(#loc63) + %13 = arith.andi %10, %12 : tensor<1024xi1> loc(#loc64) + %true_80 = arith.constant true loc(#loc65) + %cst_81 = arith.constant dense : tensor<1024xi1> loc(#loc65) + %14 = arith.xori %xmask_5, %cst_81 : tensor<1024xi1> loc(#loc65) + %15 = arith.ori %13, %14 : tensor<1024xi1> loc(#loc66) + tt.assert %15, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1> loc(#loc67) + %tmp27 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc144) + %tmp27_82 = arith.muli %tmp27, %tmp25 : tensor<1024xi64> loc(#loc144) + %tmp27_83 = arith.addi %x0_11, %tmp27_82 : tensor<1024xi64> loc(#loc145) + %tmp27_84 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc146) + %tmp27_85 = tt.addptr %tmp27_84, %tmp27_83 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc146) + %tmp27_86 = tt.load %tmp27_85, %xmask_5 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc147) + %tmp27_87 = arith.extf %tmp27_86 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc148) + %tmp28 = arith.mulf %tmp22, %tmp27_87 : tensor<1024xf32> loc(#loc149) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32> loc(#loc150) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc75) + %17 = tt.addptr %16, %xindex_4 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc75) + %18 = arith.truncf %tmp29 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc76) + tt.store %17, %18, %xmask_5 : tensor<1024x!tt.ptr> loc(#loc76) + tt.return loc(#loc77) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":37:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:48) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":42:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":46:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:67) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:56) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc89 = loc("xoffset"(#loc1)) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xindex"(#loc3)) +#loc92 = loc("xindex"(#loc4)) +#loc93 = loc("xmask"(#loc5)) +#loc94 = loc("x2"(#loc6)) +#loc95 = loc("x2"(#loc7)) +#loc96 = loc("x0"(#loc8)) +#loc97 = loc("x5"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp0"(#loc11)) +#loc100 = loc("tmp0"(#loc12)) +#loc101 = loc("tmp1"(#loc13)) +#loc102 = loc("tmp1"(#loc14)) +#loc103 = loc("tmp3"(#loc15)) +#loc104 = loc("tmp4"(#loc16)) +#loc105 = loc("tmp5"(#loc17)) +#loc106 = loc("tmp7"(#loc24)) +#loc107 = loc("tmp7"(#loc25)) +#loc108 = loc("tmp7"(#loc26)) +#loc109 = loc("tmp7"(#loc27)) +#loc110 = loc("tmp7"(#loc28)) +#loc111 = loc("tmp8"(#loc29)) +#loc112 = loc("tmp10"(#loc30)) +#loc113 = loc("tmp11"(#loc31)) +#loc114 = loc("tmp12"(#loc32)) +#loc115 = loc("tmp12"(#loc33)) +#loc116 = loc("tmp12"(#loc34)) +#loc117 = loc("tmp13"(#loc35)) +#loc118 = loc("tmp14"(#loc36)) +#loc119 = loc("tmp14"(#loc37)) +#loc120 = loc("tmp14"(#loc38)) +#loc121 = loc("tmp14"(#loc39)) +#loc122 = loc("tmp14"(#loc40)) +#loc123 = loc("tmp14"(#loc41)) +#loc124 = loc("tmp14"(#loc42)) +#loc125 = loc("tmp14"(#loc43)) +#loc126 = loc("tmp15"(#loc44)) +#loc127 = loc("tmp16"(#loc45)) +#loc128 = loc("tmp17"(#loc46)) +#loc129 = loc("tmp18"(#loc47)) +#loc130 = loc("tmp20"(#loc48)) +#loc131 = loc("tmp21"(#loc49)) +#loc132 = loc("tmp21"(#loc50)) +#loc133 = loc("tmp21"(#loc51)) +#loc134 = loc("tmp21"(#loc52)) +#loc135 = loc("tmp21"(#loc53)) +#loc136 = loc("tmp21"(#loc54)) +#loc137 = loc("tmp21"(#loc55)) +#loc138 = loc("tmp21"(#loc56)) +#loc139 = loc("tmp21"(#loc57)) +#loc140 = loc("tmp21"(#loc58)) +#loc141 = loc("tmp22"(#loc59)) +#loc142 = loc("tmp24"(#loc60)) +#loc143 = loc("tmp25"(#loc61)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp27"(#loc71)) +#loc148 = loc("tmp27"(#loc72)) +#loc149 = loc("tmp28"(#loc73)) +#loc150 = loc("tmp29"(#loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f1381591e7c0769812e9bfa1d44213196e5e7004 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,232 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<1024xi1, #blocked> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<1024xi64, #blocked> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<1024xi64, #blocked> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<1024xi64, #blocked> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<1024xi64, #blocked> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<1024xi64, #blocked> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_1 : tensor<1024xi64, #blocked> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_1 : tensor<1024xi64, #blocked> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<1024xi64, #blocked> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc21) + %3 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1, #blocked> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<1024xi64, #blocked> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<1024xi64, #blocked> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<1024xf32, #blocked> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<1024xi64, #blocked> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64, #blocked> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<1024xi64, #blocked> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<1024xi64, #blocked> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<1024xi64, #blocked> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<1024xi64, #blocked> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc113) + %tmp15 = arith.subf %cst_2, %tmp14_31 : tensor<1024xf32, #blocked> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<1024xi64, #blocked> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<1024xi64, #blocked> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<1024xi64, #blocked> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<1024xi64, #blocked> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<1024xi64, #blocked> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64, #blocked> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<1024xi64, #blocked> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<1024xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<1024xi1, #blocked> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1, #blocked> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<1024xi64, #blocked> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<1024xi64, #blocked> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<1024xf32, #blocked> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32, #blocked> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<1024x!tt.ptr, #blocked> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..148bc3fb49fe6bbf6bcde95da01fe754fbef4afe --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/2TU6ZCF6AOXLWQQED5J7FS5ZXMYK7TIOQ6T2MLB767275BROXJCA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir @@ -0,0 +1,231 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1024xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1024xi64> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1024xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %cst_2 = arith.constant dense : tensor<1024xi1> loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<1024xi32> to tensor<1024xi64> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<1024xi64> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<1024xi64> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<1024xi64> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<1024xi64> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<1024xi64> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<1024xi64> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<1024xi64> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<1024xi64> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<1024xi64> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_0 : tensor<1024xi64> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_0 : tensor<1024xi64> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<1024xi64> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<1024xi1> loc(#loc21) + %3 = arith.xori %xmask_6, %cst_2 : tensor<1024xi1> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<1024xi1> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<1024xi1> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<1024xi64> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<1024xi64> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<1024xf32> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<1024xi64> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<1024xi64> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<1024xi64> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<1024xi64> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<1024xi64> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<1024xi1> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc113) + %tmp15 = arith.subf %cst_1, %tmp14_31 : tensor<1024xf32> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<1024xi64> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<1024xi64> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<1024xi64> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<1024xi64> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<1024xi64> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<1024xi1> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<1024xi1>, tensor<1024xf32> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<1024xi64> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<1024xi64> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<1024xi1>, tensor<1024xi64> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_0 : tensor<1024xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<1024xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<1024xi1> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<1024xi1> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<1024xi64> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<1024xi64> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<1024x!tt.ptr>, tensor<1024xi64> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<1024xbf16> to tensor<1024xf32> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<1024xf32> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<1024xf32> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<1024xf32> to tensor<1024xbf16> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<1024x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ac3c32e2fc1fa0aad32c684f0f1314579dc71558 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..32f33a4ed1075bd518c33033d04bab9affdb3dae Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8e4730ce8a5cf4fb55266ccba156bc4aa43c863b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"hash": "db4fe3a24d03312f3dea8e5ee7eb1a7c6d73f10bc6de254a208984a64d7fe777", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..ab078d3e1070e12a70797dd4d547d238d8297b90 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,333 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %15 = icmp slt i32 %14, %9, !dbg !8 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %17 = and i32 %16, 511, !dbg !9 + %18 = zext nneg i32 %14 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !10 + %19 = sdiv i64 %18, %.frozen, !dbg !10 + %20 = srem i64 %19, %4, !dbg !11 + %21 = mul i64 %19, %.frozen, !dbg !12 + %.decomposed = sub i64 %18, %21, !dbg !12 + %22 = sdiv i64 %18, %7, !dbg !13 + %23 = shl nsw i64 %20, 7, !dbg !14 + %24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16 + %26 = and i32 %16, 127 + %27 = zext nneg i32 %26 to i64 + %28 = or disjoint i64 %24, %27 + %29 = icmp slt i64 %28, %6 + %30 = icmp sge i64 %28, %8 + %31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0) + %32 = sub nsw i64 %.decomposed, %20 + %33 = shl nsw i64 %32, 7 + %34 = zext nneg i32 %17 to i64, !dbg !17 + %35 = zext nneg i32 %26 to i64, !dbg !17 + %36 = zext nneg i32 %16 to i64, !dbg !17 + %37 = insertelement <4 x i1> poison, i1 %15, i64 0, !dbg !18 + %38 = shufflevector <4 x i1> %37, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18 + %39 = insertelement <4 x i1> poison, i1 %29, i64 0, !dbg !19 + %40 = shufflevector <4 x i1> %39, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !19 + %41 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !20 + %42 = shufflevector <4 x i64> %41, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !20 + %43 = insertelement <4 x i64> poison, i64 %5, i64 0, !dbg !21 + %44 = shufflevector <4 x i64> %43, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !21 + %45 = insertelement <4 x i64> poison, i64 %28, i64 0, !dbg !22 + %46 = shufflevector <4 x i64> %45, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !22 + %47 = insertelement <4 x i1> poison, i1 %30, i64 0, !dbg !23 + %48 = shufflevector <4 x i1> %47, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !23 + %49 = insertelement <4 x i64> poison, i64 %33, i64 0, !dbg !24 + %50 = shufflevector <4 x i64> %49, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !24 + %51 = insertelement <4 x i64> poison, i64 %8, i64 0, !dbg !25 + %52 = shufflevector <4 x i64> %51, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !25 + br label %53, !dbg !17 + +53: ; preds = %13, %53 + %indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ] + %54 = phi <4 x i64> [ zeroinitializer, %13 ], [ %128, %53 ] + %55 = or disjoint i64 %indvars.iv, %34, !dbg !26 + %56 = or disjoint i64 %indvars.iv, %36, !dbg !26 + %57 = lshr i64 %55, 7, !dbg !27 + %58 = lshr i64 %56, 7, !dbg !27 + %59 = trunc nuw nsw i64 %58 to i32, !dbg !27 + %60 = or i32 %59, 4, !dbg !27 + %61 = or disjoint i64 %57, 8, !dbg !27 + %62 = or i32 %59, 12, !dbg !27 + %63 = zext nneg i32 %60 to i64, !dbg !20 + %64 = zext nneg i32 %62 to i64, !dbg !20 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %66 = sub nsw i64 %35, %57, !dbg !29 + %67 = sub nsw i32 %26, %60, !dbg !29 + %68 = sub nsw i64 %35, %61, !dbg !29 + %69 = sub nsw i32 %26, %62, !dbg !29 + %70 = sext i32 %67 to i64, !dbg !30 + %71 = sext i32 %69 to i64, !dbg !30 + %72 = insertelement <4 x i64> poison, i64 %57, i64 0, !dbg !20 + %73 = insertelement <4 x i64> %72, i64 %63, i64 1, !dbg !20 + %74 = insertelement <4 x i64> %73, i64 %61, i64 2, !dbg !20 + %75 = insertelement <4 x i64> %74, i64 %64, i64 3, !dbg !20 + %76 = or disjoint <4 x i64> %42, %75, !dbg !20 + %77 = icmp slt <4 x i64> %76, %44, !dbg !21 + %78 = and <4 x i1> %40, %77, !dbg !19 + %79 = icmp sge <4 x i64> %76, %46, !dbg !22 + %80 = extractelement <4 x i1> %78, i64 0, !dbg !31 + %81 = and i1 %15, %80, !dbg !31 + %82 = extractelement <4 x i1> %78, i64 1, !dbg !31 + %83 = and i1 %15, %82, !dbg !31 + %84 = extractelement <4 x i1> %78, i64 2, !dbg !31 + %85 = and i1 %15, %84, !dbg !31 + %86 = extractelement <4 x i1> %78, i64 3, !dbg !31 + %87 = and i1 %15, %86, !dbg !31 + %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %65, i1 %81) #5, !dbg !28 + %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %89, i1 %83) #5, !dbg !28 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %91, i1 %85) #5, !dbg !28 + %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %93, i1 %87) #5, !dbg !28 + %95 = insertelement <4 x i64> poison, i64 %88, i64 0, !dbg !32 + %96 = insertelement <4 x i64> %95, i64 %90, i64 1, !dbg !32 + %97 = insertelement <4 x i64> %96, i64 %92, i64 2, !dbg !32 + %98 = insertelement <4 x i64> %97, i64 %94, i64 3, !dbg !32 + %99 = icmp slt <4 x i64> %46, %98, !dbg !32 + %100 = icmp slt <4 x i64> %76, %98, !dbg !33 + %101 = and <4 x i1> %99, %100, !dbg !34 + %102 = and <4 x i1> %79, %101, !dbg !35 + %103 = srem i64 %28, %8, !dbg !36 + %.not = icmp eq i64 %103, 0, !dbg !37 + %104 = select i1 %.not, i64 0, i64 %31, !dbg !38 + %105 = add nsw i64 %104, %103, !dbg !38 + %106 = insertelement <4 x i64> poison, i64 %105, i64 0, !dbg !39 + %107 = shufflevector <4 x i64> %106, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !39 + %108 = icmp slt <4 x i64> %107, %98, !dbg !39 + %109 = insertelement <4 x i64> poison, i64 %66, i64 0, !dbg !24 + %110 = insertelement <4 x i64> %109, i64 %70, i64 1, !dbg !24 + %111 = insertelement <4 x i64> %110, i64 %68, i64 2, !dbg !24 + %112 = insertelement <4 x i64> %111, i64 %71, i64 3, !dbg !24 + %113 = add nsw <4 x i64> %50, %112, !dbg !24 + %114 = srem <4 x i64> %113, %52, !dbg !25 + %115 = icmp ne <4 x i64> %114, zeroinitializer, !dbg !40 + %116 = xor <4 x i64> %114, %52, !dbg !41 + %117 = icmp slt <4 x i64> %116, zeroinitializer, !dbg !41 + %118 = and <4 x i1> %115, %117, !dbg !42 + %119 = select <4 x i1> %118, <4 x i64> %52, <4 x i64> zeroinitializer, !dbg !43 + %120 = sub <4 x i64> zeroinitializer, %119, !dbg !44 + %121 = icmp eq <4 x i64> %114, %120, !dbg !44 + %122 = and <4 x i1> %108, %121, !dbg !23 + %123 = and <4 x i1> %48, %122, !dbg !23 + %124 = or <4 x i1> %102, %123, !dbg !45 + %125 = select <4 x i1> %38, <4 x i1> %78, <4 x i1> zeroinitializer, !dbg !18 + %126 = select <4 x i1> %125, <4 x i1> %124, <4 x i1> zeroinitializer, !dbg !18 + %127 = zext <4 x i1> %126 to <4 x i64>, !dbg !18 + %128 = add <4 x i64> %54, %127, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !17 + %129 = icmp samesign ult i64 %indvars.iv, 14336, !dbg !17 + br i1 %129, label %53, label %130, !dbg !17 + +130: ; preds = %53 + %131 = and i32 %16, 31, !dbg !9 + %132 = lshr i32 %16, 5, !dbg !9 + %133 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %128), !dbg !46 + %extelt.offset = lshr i64 %133, 32, !dbg !50 + %134 = trunc nuw i64 %extelt.offset to i32, !dbg !50 + %135 = trunc i64 %133 to i32, !dbg !50 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 16, i32 31), !dbg !50 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 16, i32 31), !dbg !50 + %138 = insertelement <2 x i32> poison, i32 %136, i64 0, !dbg !50 + %139 = insertelement <2 x i32> %138, i32 %137, i64 1, !dbg !50 + %140 = bitcast <2 x i32> %139 to i64, !dbg !50 + %141 = add i64 %133, %140, !dbg !46 + %extelt.offset1 = lshr i64 %141, 32, !dbg !50 + %142 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50 + %143 = trunc i64 %141 to i32, !dbg !50 + %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !50 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !50 + %146 = insertelement <2 x i32> poison, i32 %144, i64 0, !dbg !50 + %147 = insertelement <2 x i32> %146, i32 %145, i64 1, !dbg !50 + %148 = bitcast <2 x i32> %147 to i64, !dbg !50 + %149 = add i64 %141, %148, !dbg !46 + %extelt.offset2 = lshr i64 %149, 32, !dbg !50 + %150 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50 + %151 = trunc i64 %149 to i32, !dbg !50 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 4, i32 31), !dbg !50 + %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 4, i32 31), !dbg !50 + %154 = insertelement <2 x i32> poison, i32 %152, i64 0, !dbg !50 + %155 = insertelement <2 x i32> %154, i32 %153, i64 1, !dbg !50 + %156 = bitcast <2 x i32> %155 to i64, !dbg !50 + %157 = add i64 %149, %156, !dbg !46 + %extelt.offset3 = lshr i64 %157, 32, !dbg !50 + %158 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50 + %159 = trunc i64 %157 to i32, !dbg !50 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 2, i32 31), !dbg !50 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 2, i32 31), !dbg !50 + %162 = insertelement <2 x i32> poison, i32 %160, i64 0, !dbg !50 + %163 = insertelement <2 x i32> %162, i32 %161, i64 1, !dbg !50 + %164 = bitcast <2 x i32> %163 to i64, !dbg !50 + %165 = add i64 %157, %164, !dbg !46 + %extelt.offset4 = lshr i64 %165, 32, !dbg !50 + %166 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50 + %167 = trunc i64 %165 to i32, !dbg !50 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 1, i32 31), !dbg !50 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 1, i32 31), !dbg !50 + %170 = insertelement <2 x i32> poison, i32 %168, i64 0, !dbg !50 + %171 = insertelement <2 x i32> %170, i32 %169, i64 1, !dbg !50 + %172 = bitcast <2 x i32> %171 to i64, !dbg !50 + %173 = add i64 %165, %172, !dbg !46 + %174 = and i32 %132, 15, !dbg !50 + %175 = icmp eq i32 %131, 0, !dbg !50 + %176 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %174, !dbg !50 + %177 = insertelement <1 x i64> poison, i64 %173, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %176, <1 x i64> %177, i1 %175) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %178 = icmp samesign ult i32 %16, 16, !dbg !50 + %179 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50 + %180 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %179, i1 %178) #5, !dbg !50 + %extelt.offset5 = lshr i64 %180, 32, !dbg !50 + %181 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50 + %182 = trunc i64 %180 to i32, !dbg !50 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 8, i32 31), !dbg !50 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 8, i32 31), !dbg !50 + %185 = insertelement <2 x i32> poison, i32 %183, i64 0, !dbg !50 + %186 = insertelement <2 x i32> %185, i32 %184, i64 1, !dbg !50 + %187 = bitcast <2 x i32> %186 to i64, !dbg !50 + %188 = add i64 %180, %187, !dbg !46 + %extelt.offset6 = lshr i64 %188, 32, !dbg !50 + %189 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50 + %190 = trunc i64 %188 to i32, !dbg !50 + %191 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 4, i32 31), !dbg !50 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 4, i32 31), !dbg !50 + %193 = insertelement <2 x i32> poison, i32 %191, i64 0, !dbg !50 + %194 = insertelement <2 x i32> %193, i32 %192, i64 1, !dbg !50 + %195 = bitcast <2 x i32> %194 to i64, !dbg !50 + %196 = add i64 %188, %195, !dbg !46 + %extelt.offset7 = lshr i64 %196, 32, !dbg !50 + %197 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50 + %198 = trunc i64 %196 to i32, !dbg !50 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 2, i32 31), !dbg !50 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 2, i32 31), !dbg !50 + %201 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !50 + %202 = insertelement <2 x i32> %201, i32 %200, i64 1, !dbg !50 + %203 = bitcast <2 x i32> %202 to i64, !dbg !50 + %204 = add i64 %196, %203, !dbg !46 + %extelt.offset8 = lshr i64 %204, 32, !dbg !50 + %205 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50 + %206 = trunc i64 %204 to i32, !dbg !50 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 1, i32 31), !dbg !50 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !50 + %209 = insertelement <2 x i32> poison, i32 %207, i64 0, !dbg !50 + %210 = insertelement <2 x i32> %209, i32 %208, i64 1, !dbg !50 + %211 = bitcast <2 x i32> %210 to i64, !dbg !50 + %212 = add i64 %204, %211, !dbg !46 + %213 = icmp eq i32 %16, 0, !dbg !50 + %214 = insertelement <1 x i64> poison, i64 %212, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %179, <1 x i64> %214, i1 %213) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %215 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50 + %216 = add i64 %215, -1, !dbg !51 + %217 = icmp ult i64 %216, 16383, !dbg !51 + %218 = zext i1 %217 to i32, !dbg !52 + %219 = icmp eq i64 %215, 16384, !dbg !53 + %220 = zext i1 %219 to i32, !dbg !52 + %221 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54 + %222 = icmp eq i32 %17, 0, !dbg !55 + %223 = and i1 %222, %15, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %218, ptr addrspace(1) %221, i1 %223) #5, !dbg !55 + %224 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %220, ptr addrspace(1) %224, i1 %223) #5, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.smin.i64(i64, i64) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 21, scope: !4) +!11 = !DILocation(line: 27, column: 28, scope: !4) +!12 = !DILocation(line: 28, column: 19, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 49, column: 35, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 86, column: 50, scope: !4) +!19 = !DILocation(line: 45, column: 22, scope: !4) +!20 = !DILocation(line: 39, column: 22, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 48, column: 23, scope: !4) +!23 = !DILocation(line: 79, column: 24, scope: !4) +!24 = !DILocation(line: 69, column: 51, scope: !4) +!25 = !DILocation(line: 70, column: 25, scope: !4) +!26 = !DILocation(line: 33, column: 31, scope: !4) +!27 = !DILocation(line: 37, column: 27, scope: !4) +!28 = !DILocation(line: 49, column: 77, scope: !4) +!29 = !DILocation(line: 69, column: 24, scope: !4) +!30 = !DILocation(line: 69, column: 38, scope: !4) +!31 = !DILocation(line: 49, column: 94, scope: !4) +!32 = !DILocation(line: 50, column: 23, scope: !4) +!33 = !DILocation(line: 51, column: 23, scope: !4) +!34 = !DILocation(line: 52, column: 24, scope: !4) +!35 = !DILocation(line: 53, column: 23, scope: !4) +!36 = !DILocation(line: 58, column: 24, scope: !4) +!37 = !DILocation(line: 60, column: 25, scope: !4) +!38 = !DILocation(line: 66, column: 39, scope: !4) +!39 = !DILocation(line: 67, column: 24, scope: !4) +!40 = !DILocation(line: 71, column: 25, scope: !4) +!41 = !DILocation(line: 73, column: 25, scope: !4) +!42 = !DILocation(line: 74, column: 24, scope: !4) +!43 = !DILocation(line: 76, column: 39, scope: !4) +!44 = !DILocation(line: 78, column: 25, scope: !4) +!45 = !DILocation(line: 80, column: 24, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49) +!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 87, column: 27, scope: !4) +!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49) +!51 = !DILocation(line: 92, column: 20, scope: !4) +!52 = !DILocation(line: 0, scope: !4) +!53 = !DILocation(line: 95, column: 21, scope: !4) +!54 = !DILocation(line: 98, column: 25, scope: !4) +!55 = !DILocation(line: 98, column: 37, scope: !4) +!56 = !DILocation(line: 99, column: 25, scope: !4) +!57 = !DILocation(line: 99, column: 37, scope: !4) +!58 = !DILocation(line: 99, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..91121bc813ad1746fe93e789ab35348df5862483 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,809 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12 +) +.reqntid 512 +{ + .reg .pred %p<89>; + .reg .b32 %r<77>; + .reg .b64 %rd<216>; + .loc 1 18 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:18:0 +$L__func_begin0: + .loc 1 18 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:18:0 + +// %bb.0: + ld.param.b64 %rd65, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; +$L__tmp0: + .loc 1 22 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:22:28 + mov.u32 %r8, %ctaid.x; + ld.param.b64 %rd70, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + .loc 1 27 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:27:21 + cvt.u64.u32 %rd1, %r8; + and.b64 %rd71, %rd70, -4294967296; + setp.ne.b64 %p21, %rd71, 0; + cvt.u32.u64 %r75, %rd1; + @%p21 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd205, %rd1, %rd70; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd70; + div.u32 %r11, %r75, %r9; + cvt.u64.u32 %rd205, %r11; +$L__BB0_3: + .loc 1 0 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:21 + mov.u32 %r1, %tid.x; + ld.param.b64 %rd68, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7]; + .loc 1 27 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:27:28 + or.b64 %rd72, %rd205, %rd65; + and.b64 %rd73, %rd72, -4294967296; + setp.ne.b64 %p22, %rd73, 0; + @%p22 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd206, %rd205, %rd65; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd65; + cvt.u32.u64 %r13, %rd205; + rem.u32 %r14, %r13, %r12; + cvt.u64.u32 %rd206, %r14; +$L__BB0_6: + .loc 1 0 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:28 + ld.param.b32 %r7, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9]; + ld.param.b64 %rd69, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8]; + ld.param.b64 %rd67, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd62, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + and.b32 %r2, %r1, 511; + .loc 1 28 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:28:19 + mul.lo.s64 %rd74, %rd205, %rd70; + sub.s64 %rd9, %rd1, %rd74; + .loc 1 29 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:29:19 + and.b64 %rd75, %rd68, -4294967296; + setp.ne.b64 %p23, %rd75, 0; + @%p23 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd207, %rd1, %rd68; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd68; + div.u32 %r17, %r75, %r15; + cvt.u64.u32 %rd207, %r17; +$L__BB0_9: + .loc 1 0 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:19 + ld.param.b64 %rd66, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; + ld.param.b64 %rd64, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd63, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:24:21 + setp.lt.s32 %p1, %r75, %r7; + .loc 1 39 26 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:26 + shl.b64 %rd16, %rd206, 7; + .loc 1 42 26 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:42:26 + shl.b64 %rd81, %rd9, 7; + .loc 1 49 35 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:35 + shl.b64 %rd82, %rd207, 3; + add.s64 %rd90, %rd62, %rd82; + and.b32 %r3, %r1, 127; + cvt.u64.u32 %rd83, %r3; + or.b64 %rd24, %rd81, %rd83; + setp.lt.s64 %p5, %rd24, %rd67; + setp.ge.s64 %p9, %rd24, %rd69; + min.s64 %rd15, %rd69, 0; + sub.s64 %rd84, %rd9, %rd206; + shl.b64 %rd28, %rd84, 7; + .loc 1 32 40 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:32:40 + cvt.u64.u32 %rd85, %r2; + cvt.u64.u32 %rd86, %r1; + shr.u64 %rd87, %rd86, 7; + cvt.u32.u64 %r76, %rd87; + shr.u64 %rd209, %rd85, 7; + sub.s64 %rd208, %rd83, %rd209; + mov.b64 %rd211, 0; + mov.b64 %rd210, -2048; + mov.b64 %rd212, %rd211; + mov.b64 %rd213, %rd211; + mov.b64 %rd214, %rd211; + bra.uni $L__BB0_10; +$L__BB0_12: // in Loop: Header=BB0_10 Depth=1 + .loc 1 58 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:58:24 + rem.s64 %rd215, %rd24, %rd69; +$L__BB0_13: // in Loop: Header=BB0_10 Depth=1 + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + sub.s32 %r22, %r3, %r20; + add.s64 %rd45, %rd208, -8; + sub.s32 %r23, %r3, %r21; + cvt.s64.s32 %rd46, %r22; + cvt.s64.s32 %rd47, %r23; + .loc 1 60 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:60:25 + setp.eq.b64 %p42, %rd215, 0; + .loc 1 66 39 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:66:39 + selp.b64 %rd117, 0, %rd15, %p42; + add.s64 %rd118, %rd117, %rd215; + .loc 1 67 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:67:24 + setp.lt.s64 %p43, %rd118, %rd89; + setp.lt.s64 %p44, %rd118, %rd93; + setp.lt.s64 %p45, %rd118, %rd97; + setp.lt.s64 %p46, %rd118, %rd101; + .loc 1 69 51 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:69:51 + add.s64 %rd119, %rd28, %rd47; + add.s64 %rd120, %rd28, %rd45; + add.s64 %rd121, %rd28, %rd46; + add.s64 %rd122, %rd28, %rd208; + .loc 1 70 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:70:25 + rem.s64 %rd123, %rd122, %rd69; + rem.s64 %rd124, %rd121, %rd69; + rem.s64 %rd125, %rd120, %rd69; + rem.s64 %rd126, %rd119, %rd69; + .loc 1 71 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:71:25 + setp.ne.b64 %p47, %rd126, 0; + setp.ne.b64 %p48, %rd125, 0; + setp.ne.b64 %p49, %rd124, 0; + setp.ne.b64 %p50, %rd123, 0; + .loc 1 73 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:73:25 + xor.b64 %rd127, %rd126, %rd69; + xor.b64 %rd128, %rd125, %rd69; + xor.b64 %rd129, %rd124, %rd69; + xor.b64 %rd130, %rd123, %rd69; + .loc 1 76 39 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:76:39 + shr.s64 %rd131, %rd130, 63; + and.b64 %rd132, %rd131, %rd69; + selp.b64 %rd133, %rd132, 0, %p50; + shr.s64 %rd134, %rd129, 63; + and.b64 %rd135, %rd134, %rd69; + selp.b64 %rd136, %rd135, 0, %p49; + shr.s64 %rd137, %rd128, 63; + and.b64 %rd138, %rd137, %rd69; + selp.b64 %rd139, %rd138, 0, %p48; + shr.s64 %rd140, %rd127, 63; + and.b64 %rd141, %rd140, %rd69; + selp.b64 %rd142, %rd141, 0, %p47; + .loc 1 78 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:78:25 + neg.s64 %rd143, %rd142; + neg.s64 %rd144, %rd139; + neg.s64 %rd145, %rd136; + neg.s64 %rd146, %rd133; + setp.eq.b64 %p51, %rd123, %rd146; + setp.eq.b64 %p52, %rd124, %rd145; + setp.eq.b64 %p53, %rd125, %rd144; + setp.eq.b64 %p54, %rd126, %rd143; + .loc 1 79 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:79:24 + and.pred %p55, %p46, %p54; + and.pred %p57, %p45, %p53; + and.pred %p59, %p44, %p52; + and.pred %p61, %p43, %p51; + and.pred %p63, %p9, %p61; + and.pred %p64, %p9, %p59; + and.pred %p65, %p9, %p57; + and.pred %p66, %p9, %p55; + .loc 1 80 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:80:24 + or.pred %p67, %p20, %p66; + or.pred %p68, %p19, %p65; + or.pred %p69, %p18, %p64; + or.pred %p70, %p17, %p63; + .loc 1 86 50 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:86:50 + and.pred %p75, %p24, %p70; + and.pred %p76, %p25, %p69; + and.pred %p77, %p26, %p68; + and.pred %p78, %p27, %p67; + selp.b64 %rd147, 1, 0, %p78; + selp.b64 %rd148, 1, 0, %p77; + selp.b64 %rd149, 1, 0, %p76; + selp.b64 %rd150, 1, 0, %p75; + add.s64 %rd211, %rd211, %rd150; + add.s64 %rd212, %rd212, %rd149; + add.s64 %rd213, %rd213, %rd148; + add.s64 %rd214, %rd214, %rd147; + .loc 1 32 40 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:32:40 + add.s64 %rd210, %rd210, 2048; + add.s32 %r76, %r76, 16; + add.s64 %rd209, %rd209, 16; + add.s64 %rd208, %rd208, -16; + setp.lt.u64 %p79, %rd210, 14336; + @%p79 bra $L__BB0_10; + bra.uni $L__BB0_14; +$L__BB0_10: // =>This Inner Loop Header: Depth=1 + .loc 1 37 27 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:37:27 + or.b32 %r20, %r76, 4; + add.s64 %rd104, %rd209, %rd16; + or.b32 %r21, %r76, 12; + .loc 1 39 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:22 + cvt.u64.u32 %rd105, %r20; + cvt.u64.u32 %rd106, %r21; + .loc 1 49 77 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:77 + // begin inline asm + mov.u64 %rd88, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0; + // end inline asm + .loc 1 39 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:22 + or.b64 %rd107, %rd16, %rd106; + add.s64 %rd108, %rd104, 8; + or.b64 %rd109, %rd16, %rd105; + or.b64 %rd110, %rd16, %rd209; + .loc 1 41 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:41:22 + setp.lt.s64 %p29, %rd110, %rd66; + setp.lt.s64 %p30, %rd109, %rd66; + setp.lt.s64 %p31, %rd108, %rd66; + setp.lt.s64 %p32, %rd107, %rd66; + .loc 1 45 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:45:22 + and.pred %p16, %p5, %p32; + and.pred %p15, %p5, %p31; + and.pred %p14, %p5, %p30; + and.pred %p13, %p5, %p29; + .loc 1 48 23 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:48:23 + setp.ge.s64 %p33, %rd107, %rd24; + setp.ge.s64 %p34, %rd108, %rd24; + setp.ge.s64 %p35, %rd109, %rd24; + setp.ge.s64 %p36, %rd110, %rd24; + .loc 1 49 94 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:94 + and.pred %p24, %p1, %p13; + and.pred %p25, %p1, %p14; + and.pred %p26, %p1, %p15; + and.pred %p27, %p1, %p16; + .loc 1 49 77 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:77 + // begin inline asm + mov.u64 %rd89, 0x0; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd89 }, [ %rd90 + 0 ], %rd88; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd93, 0x0; + @%p25 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd93 }, [ %rd90 + 0 ], %rd92; + // end inline asm + // begin inline asm + mov.u64 %rd96, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd97, 0x0; + @%p26 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd97 }, [ %rd90 + 0 ], %rd96; + // end inline asm + // begin inline asm + mov.u64 %rd100, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd100, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd101, 0x0; + @%p27 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd101 }, [ %rd90 + 0 ], %rd100; + // end inline asm + .loc 1 52 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:52:24 + max.s64 %rd111, %rd24, %rd107; + setp.lt.s64 %p37, %rd111, %rd101; + max.s64 %rd112, %rd24, %rd108; + setp.lt.s64 %p38, %rd112, %rd97; + max.s64 %rd113, %rd24, %rd109; + setp.lt.s64 %p39, %rd113, %rd93; + max.s64 %rd114, %rd24, %rd110; + setp.lt.s64 %p40, %rd114, %rd89; + .loc 1 53 23 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:53:23 + and.pred %p17, %p36, %p40; + and.pred %p18, %p35, %p39; + and.pred %p19, %p34, %p38; + and.pred %p20, %p33, %p37; + .loc 1 58 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:58:24 + or.b64 %rd115, %rd24, %rd69; + and.b64 %rd116, %rd115, -4294967296; + setp.ne.b64 %p41, %rd116, 0; + @%p41 bra $L__BB0_12; +// %bb.11: // in Loop: Header=BB0_10 Depth=1 + cvt.u32.u64 %r24, %rd69; + cvt.u32.u64 %r25, %rd24; + rem.u32 %r26, %r25, %r24; + cvt.u64.u32 %rd215, %r26; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 25 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:25:37 + and.b32 %r33, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd156, %rd211, %rd213; + add.s64 %rd157, %rd212, %rd214; + add.s64 %rd158, %rd156, %rd157; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r34}, %rd158; + cvt.u32.u64 %r35, %rd158; + shfl.sync.bfly.b32 %r36, %r35, 16, 31, -1; + shfl.sync.bfly.b32 %r37, %r34, 16, 31, -1; + cvt.u64.u32 %rd159, %r36; + cvt.u64.u32 %rd160, %r37; + shl.b64 %rd161, %rd160, 32; + or.b64 %rd162, %rd159, %rd161; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd163, %rd158, %rd162; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r38}, %rd163; + cvt.u32.u64 %r39, %rd163; + shfl.sync.bfly.b32 %r40, %r39, 8, 31, -1; + shfl.sync.bfly.b32 %r41, %r38, 8, 31, -1; + cvt.u64.u32 %rd164, %r40; + cvt.u64.u32 %rd165, %r41; + shl.b64 %rd166, %rd165, 32; + or.b64 %rd167, %rd164, %rd166; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd168, %rd163, %rd167; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r42}, %rd168; + cvt.u32.u64 %r43, %rd168; + shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1; + shfl.sync.bfly.b32 %r45, %r42, 4, 31, -1; + cvt.u64.u32 %rd169, %r44; + cvt.u64.u32 %rd170, %r45; + shl.b64 %rd171, %rd170, 32; + or.b64 %rd172, %rd169, %rd171; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd173, %rd168, %rd172; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r46}, %rd173; + cvt.u32.u64 %r47, %rd173; + shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1; + shfl.sync.bfly.b32 %r49, %r46, 2, 31, -1; + cvt.u64.u32 %rd174, %r48; + cvt.u64.u32 %rd175, %r49; + shl.b64 %rd176, %rd175, 32; + or.b64 %rd177, %rd174, %rd176; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd178, %rd173, %rd177; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r50}, %rd178; + cvt.u32.u64 %r51, %rd178; + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + shfl.sync.bfly.b32 %r53, %r50, 1, 31, -1; + cvt.u64.u32 %rd179, %r52; + cvt.u64.u32 %rd180, %r53; + shl.b64 %rd181, %rd180, 32; + or.b64 %rd182, %rd179, %rd181; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd151, %rd178, %rd182; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + setp.eq.b32 %p80, %r33, 0; + shr.u32 %r54, %r1, 2; + and.b32 %r55, %r54, 120; + mov.b32 %r56, global_smem; + add.s32 %r27, %r56, %r55; + // begin inline asm + @%p80 st.shared.b64 [ %r27 + 0 ], %rd151; + // end inline asm + bar.sync 0; + setp.lt.u32 %p81, %r1, 16; + shl.b32 %r57, %r1, 3; + add.s32 %r28, %r56, %r57; + // begin inline asm + @%p81 ld.shared.b64 %rd152, [ %r28 + 0 ]; + // end inline asm + mov.b64 {_, %r58}, %rd152; + cvt.u32.u64 %r59, %rd152; + shfl.sync.bfly.b32 %r60, %r59, 8, 31, -1; + shfl.sync.bfly.b32 %r61, %r58, 8, 31, -1; + cvt.u64.u32 %rd183, %r60; + cvt.u64.u32 %rd184, %r61; + shl.b64 %rd185, %rd184, 32; + or.b64 %rd186, %rd183, %rd185; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd187, %rd152, %rd186; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r62}, %rd187; + cvt.u32.u64 %r63, %rd187; + shfl.sync.bfly.b32 %r64, %r63, 4, 31, -1; + shfl.sync.bfly.b32 %r65, %r62, 4, 31, -1; + cvt.u64.u32 %rd188, %r64; + cvt.u64.u32 %rd189, %r65; + shl.b64 %rd190, %rd189, 32; + or.b64 %rd191, %rd188, %rd190; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd192, %rd187, %rd191; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r66}, %rd192; + cvt.u32.u64 %r67, %rd192; + shfl.sync.bfly.b32 %r68, %r67, 2, 31, -1; + shfl.sync.bfly.b32 %r69, %r66, 2, 31, -1; + cvt.u64.u32 %rd193, %r68; + cvt.u64.u32 %rd194, %r69; + shl.b64 %rd195, %rd194, 32; + or.b64 %rd196, %rd193, %rd195; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd197, %rd192, %rd196; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r70}, %rd197; + cvt.u32.u64 %r71, %rd197; + shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1; + shfl.sync.bfly.b32 %r73, %r70, 1, 31, -1; + cvt.u64.u32 %rd198, %r72; + cvt.u64.u32 %rd199, %r73; + shl.b64 %rd200, %rd199, 32; + or.b64 %rd201, %rd198, %rd200; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd153, %rd197, %rd201; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + setp.eq.b32 %p82, %r1, 0; + // begin inline asm + @%p82 st.shared.b64 [ %r28 + 0 ], %rd153; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd202, [global_smem]; +$L__tmp2: + .loc 1 92 20 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:92:20 + add.s64 %rd203, %rd202, -1; + setp.lt.u64 %p86, %rd203, 16383; + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + selp.b32 %r30, 1, 0, %p86; + .loc 1 95 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:95:21 + setp.eq.b64 %p87, %rd202, 16384; + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + selp.b32 %r31, 1, 0, %p87; + .loc 1 98 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:98:25 + shl.b64 %rd204, %rd1, 2; + add.s64 %rd154, %rd63, %rd204; + .loc 1 98 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:98:37 + setp.eq.b32 %p88, %r2, 0; + and.pred %p83, %p88, %p1; + // begin inline asm + @%p83 st.global.b32 [ %rd154 + 0 ], { %r30 }; + // end inline asm + .loc 1 99 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:25 + add.s64 %rd155, %rd64, %rd204; + .loc 1 99 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:37 + // begin inline asm + @%p83 st.global.b32 [ %rd155 + 0 ], { %r31 }; + // end inline asm + .loc 1 99 4 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 113 +.b8 52 +.b8 118 +.b8 112 +.b8 109 +.b8 106 +.b8 121 +.b8 110 +.b8 104 +.b8 115 +.b8 98 +.b8 114 +.b8 51 +.b8 101 +.b8 104 +.b8 55 +.b8 51 +.b8 105 +.b8 55 +.b8 115 +.b8 116 +.b8 106 +.b8 105 +.b8 51 +.b8 110 +.b8 101 +.b8 115 +.b8 114 +.b8 115 +.b8 52 +.b8 107 +.b8 113 +.b8 55 +.b8 110 +.b8 120 +.b8 111 +.b8 113 +.b8 111 +.b8 53 +.b8 53 +.b8 103 +.b8 117 +.b8 120 +.b8 51 +.b8 50 +.b8 50 +.b8 97 +.b8 55 +.b8 113 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 87 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..0028398c5e8168b0875ae8ceb37f9060825a9726 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,418 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc99 = loc(unknown) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc106 = loc("in_ptr0"(#loc)) +#loc107 = loc("out_ptr1"(#loc)) +#loc108 = loc("out_ptr2"(#loc)) +#loc109 = loc("ks0"(#loc)) +#loc110 = loc("ks1"(#loc)) +#loc111 = loc("ks2"(#loc)) +#loc112 = loc("ks3"(#loc)) +#loc113 = loc("ks4"(#loc)) +#loc114 = loc("ks5"(#loc)) +#loc115 = loc("xnumel"(#loc)) +#loc116 = loc("r0_numel"(#loc)) +#loc207 = loc("input"(#loc97)) +#loc208 = loc("a"(#loc102)) +#loc209 = loc("b"(#loc102)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc117) + %xoffset = tt.get_program_id x : i32 loc(#loc118) + %xoffset_1 = arith.constant 1 : i32 loc(#loc119) + %xoffset_2 = arith.constant 1 : i32 loc(#loc119) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc124) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc125) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126) + %x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126) + %x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126) + %x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127) + %x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128) + %x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128) + %x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129) + %x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129) + %x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129) + %_tmp46 = arith.constant 0 : i64 loc(#loc130) + %_tmp46_17 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc130) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc132) + %r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc132) + %r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc133) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x2048xi32> loc(#loc133) + %r0_4 = arith.constant 128 : i32 loc(#loc134) + %r0_4_25 = arith.constant 128 : i32 loc(#loc134) + %r0_4_26 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc134) + %r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x2048xi32> loc(#loc134) + %r0_3 = arith.constant 128 : i32 loc(#loc135) + %r0_3_28 = arith.constant 128 : i32 loc(#loc135) + %r0_3_29 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc135) + %r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x2048xi32> loc(#loc135) + %tmp0 = arith.constant 128 : i32 loc(#loc136) + %tmp0_31 = arith.constant 128 : i64 loc(#loc136) + %tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136) + %tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136) + %tmp0_34 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc137) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc137) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x2048xi64> loc(#loc137) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc138) + %tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x2048xi64> loc(#loc138) + %tmp3 = arith.constant 128 : i32 loc(#loc139) + %tmp3_38 = arith.constant 128 : i64 loc(#loc139) + %tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139) + %tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139) + %tmp3_41 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc140) + %tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc140) + %tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x2048xi64> loc(#loc140) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc141) + %tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x2048xi64> loc(#loc141) + %tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x2048xi1> loc(#loc142) + %tmp7 = arith.constant 128 : i32 loc(#loc143) + %tmp7_45 = arith.constant 128 : i64 loc(#loc143) + %tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143) + %tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143) + %tmp7_48 = arith.extsi %r0_4_27 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc144) + %tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc144) + %tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x2048xi64> loc(#loc144) + %tmp8 = arith.constant 128 : i32 loc(#loc145) + %tmp8_51 = arith.constant 128 : i64 loc(#loc145) + %tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145) + %tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145) + %tmp8_54 = arith.extsi %r0_3_30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc146) + %tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc146) + %tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x2048xi64> loc(#loc146) + %tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x2048xi64> loc(#loc147) + %tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc148) + %tmp10_57 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc149) + %tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc149) + %tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x2048xi1> loc(#loc150) + %tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc151) + %tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x2048xi1> loc(#loc151) + %tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152) + %tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc152) + %tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc152) + %tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc152) + %tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x2048xi64> loc(#loc153) + %tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x2048xi64> loc(#loc154) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc155) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc156) + %tmp15 = arith.constant false loc(#loc157) + %tmp15_66 = arith.constant dense : tensor<1x1xi1> loc(#loc157) + %tmp16 = arith.constant dense : tensor<1x2048xi1> loc(#loc158) + %tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x2048xi1> loc(#loc158) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc159) + %tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc160) + %tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x2048xi64> loc(#loc161) + %tmp20 = arith.constant 0 : i32 loc(#loc162) + %tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162) + %tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163) + %tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc163) + %tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x2048xi64> loc(#loc163) + %tmp22 = arith.constant 0 : i32 loc(#loc164) + %tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164) + %tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x2048xi64> loc(#loc164) + %tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x2048xi64> loc(#loc164) + %tmp23 = arith.constant 0 : i32 loc(#loc165) + %tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165) + %tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x2048xi64> loc(#loc165) + %tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x2048xi64> loc(#loc165) + %tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x2048xi1> loc(#loc166) + %tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x2048xi1> loc(#loc167) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc168) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc169) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x2048xi64> loc(#loc170) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc171) + %tmp30 = arith.constant -1 : i32 loc(#loc172) + %tmp30_77 = arith.constant -1 : i32 loc(#loc172) + %tmp30_78 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc172) + %tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x2048xi32> loc(#loc172) + %tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x2048xi32> loc(#loc173) + %tmp30_81 = arith.constant -128 : i32 loc(#loc174) + %tmp30_82 = arith.constant -128 : i64 loc(#loc174) + %tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174) + %tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174) + %tmp30_85 = arith.extsi %tmp30_80 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc175) + %tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc175) + %tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x2048xi64> loc(#loc175) + %tmp30_88 = arith.constant 128 : i32 loc(#loc176) + %tmp30_89 = arith.constant 128 : i64 loc(#loc176) + %tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176) + %tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176) + %tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc177) + %tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x2048xi64> loc(#loc177) + %tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x2048xi64> loc(#loc178) + %tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179) + %tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc179) + %tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x2048xi64> loc(#loc179) + %tmp33 = arith.constant 0 : i32 loc(#loc180) + %tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180) + %tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x2048xi64> loc(#loc180) + %tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x2048xi64> loc(#loc180) + %tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x2048xi1> loc(#loc181) + %tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x2048xi1> loc(#loc182) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc183) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc184) + %tmp38 = arith.constant 0 : i64 loc(#loc185) + %tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185) + %tmp39 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc186) + %tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x2048xi64> loc(#loc186) + %tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x2048xi1> loc(#loc187) + %tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x2048xi1> loc(#loc188) + %tmp42 = arith.constant false loc(#loc189) + %tmp42_101 = arith.constant dense : tensor<1x2048xi1> loc(#loc189) + %tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc190) + %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc191) + %tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x2048xi64> loc(#loc192) + %_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc193) + %_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x2048xi1> loc(#loc193) + %_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc194) + scf.yield %_tmp46_104 : tensor<1x2048xi64> loc(#loc79) + } loc(#loc131) + %tmp46 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc195) + %tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196) + %tmp48 = arith.constant 0 : i64 loc(#loc197) + %tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197) + %tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198) + %tmp50 = arith.constant 16384 : i64 loc(#loc199) + %tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199) + %tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201) + %tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202) + %tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203) + %tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204) + %tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205) + %tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc92) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc92) + tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc93) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc94) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc94) + tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc95) + tt.return loc(#loc96) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98) + tt.reduce.return %2 : i64 loc(#loc98) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc98) + tt.return %0 : tensor<1xi64> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc101) + tt.return %1 : tensor<1xi64> loc(#loc101) + } loc(#loc97) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc103) + tt.return %0 : i64 loc(#loc104) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc105) + tt.return %1 : i64 loc(#loc105) + } loc(#loc102) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":46:26) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":46:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":47:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":47:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:55) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":54:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":55:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":59:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":77:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":81:44) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":88:31) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":90:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc117 = loc("r0_numel"(#loc1)) +#loc118 = loc("xoffset"(#loc2)) +#loc119 = loc("xoffset"(#loc3)) +#loc120 = loc("xindex"(#loc4)) +#loc121 = loc("xindex"(#loc5)) +#loc122 = loc("xindex"(#loc6)) +#loc123 = loc("xmask"(#loc7)) +#loc124 = loc("r0_base"(#loc8)) +#loc125 = loc("r0_base"(#loc9)) +#loc126 = loc("x1"(#loc10)) +#loc127 = loc("x1"(#loc11)) +#loc128 = loc("x0"(#loc12)) +#loc129 = loc("x2"(#loc13)) +#loc130 = loc("_tmp46"(#loc14)) +#loc131 = loc("_tmp46"(#loc15)) +#loc132 = loc("r0_index"(#loc16)) +#loc133 = loc("r0_mask"(#loc17)) +#loc134 = loc("r0_4"(#loc18)) +#loc135 = loc("r0_3"(#loc19)) +#loc136 = loc("tmp0"(#loc20)) +#loc137 = loc("tmp0"(#loc21)) +#loc138 = loc("tmp2"(#loc22)) +#loc139 = loc("tmp3"(#loc23)) +#loc140 = loc("tmp3"(#loc24)) +#loc141 = loc("tmp5"(#loc25)) +#loc142 = loc("tmp6"(#loc26)) +#loc143 = loc("tmp7"(#loc27)) +#loc144 = loc("tmp7"(#loc28)) +#loc145 = loc("tmp8"(#loc29)) +#loc146 = loc("tmp8"(#loc30)) +#loc147 = loc("tmp9"(#loc31)) +#loc148 = loc("tmp10"(#loc32)) +#loc149 = loc("tmp10"(#loc33)) +#loc150 = loc("tmp10"(#loc34)) +#loc151 = loc("tmp10"(#loc35)) +#loc152 = loc("tmp10"(#loc36)) +#loc153 = loc("tmp11"(#loc37)) +#loc154 = loc("tmp12"(#loc38)) +#loc155 = loc("tmp13"(#loc39)) +#loc156 = loc("tmp14"(#loc40)) +#loc157 = loc("tmp15"(#loc41)) +#loc158 = loc("tmp16"(#loc42)) +#loc159 = loc("tmp17"(#loc43)) +#loc160 = loc("tmp18"(#loc44)) +#loc161 = loc("tmp19"(#loc45)) +#loc162 = loc("tmp20"(#loc46)) +#loc163 = loc("tmp21"(#loc47)) +#loc164 = loc("tmp22"(#loc48)) +#loc165 = loc("tmp23"(#loc49)) +#loc166 = loc("tmp24"(#loc50)) +#loc167 = loc("tmp25"(#loc51)) +#loc168 = loc("tmp26"(#loc52)) +#loc169 = loc("tmp27"(#loc53)) +#loc170 = loc("tmp28"(#loc54)) +#loc171 = loc("tmp29"(#loc55)) +#loc172 = loc("tmp30"(#loc56)) +#loc173 = loc("tmp30"(#loc57)) +#loc174 = loc("tmp30"(#loc58)) +#loc175 = loc("tmp30"(#loc59)) +#loc176 = loc("tmp30"(#loc60)) +#loc177 = loc("tmp30"(#loc61)) +#loc178 = loc("tmp31"(#loc62)) +#loc179 = loc("tmp32"(#loc63)) +#loc180 = loc("tmp33"(#loc64)) +#loc181 = loc("tmp34"(#loc65)) +#loc182 = loc("tmp35"(#loc66)) +#loc183 = loc("tmp36"(#loc67)) +#loc184 = loc("tmp37"(#loc68)) +#loc185 = loc("tmp38"(#loc69)) +#loc186 = loc("tmp39"(#loc70)) +#loc187 = loc("tmp40"(#loc71)) +#loc188 = loc("tmp41"(#loc72)) +#loc189 = loc("tmp42"(#loc73)) +#loc190 = loc("tmp43"(#loc74)) +#loc191 = loc("tmp44"(#loc75)) +#loc192 = loc("tmp47"(#loc76)) +#loc193 = loc("_tmp46"(#loc77)) +#loc194 = loc("_tmp46"(#loc78)) +#loc195 = loc("tmp46"(#loc80)) +#loc196 = loc("tmp46"(#loc81)) +#loc197 = loc("tmp48"(#loc82)) +#loc198 = loc("tmp49"(#loc83)) +#loc199 = loc("tmp50"(#loc84)) +#loc200 = loc("tmp51"(#loc85)) +#loc201 = loc("tmp52"(#loc86)) +#loc202 = loc("tmp53"(#loc87)) +#loc203 = loc("tmp54"(#loc88)) +#loc204 = loc("tmp55"(#loc89)) +#loc205 = loc("tmp56"(#loc90)) +#loc206 = loc("tmp57"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fed92947455b766ba212a482572832c5010be827 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,280 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc1 = loc(unknown) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc79 = loc("in_ptr0"(#loc)) +#loc80 = loc("out_ptr1"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("ks0"(#loc)) +#loc83 = loc("ks1"(#loc)) +#loc84 = loc("ks2"(#loc)) +#loc85 = loc("ks3"(#loc)) +#loc86 = loc("ks4"(#loc)) +#loc87 = loc("ks5"(#loc)) +#loc88 = loc("xnumel"(#loc)) +#loc89 = loc("r0_numel"(#loc)) +#loc149 = loc("tmp46"(#loc63)) +#loc164 = loc(callsite(#loc1 at #loc149)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc92) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97) + %tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc159) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc99) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100) + %tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc160) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc102) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc103) + %tmp10_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc161) + %tmp10_11 = tt.splat %tmp10 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc105) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc106) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107) + %tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1, #blocked> loc(#loc107) + %tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108) + %tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc162) + %_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc111) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc111) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc112) + %r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc113) + %r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc114) + %tmp0_17 = arith.extsi %r0_4 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc98) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc98) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x2048xi64, #blocked> loc(#loc99) + %tmp3_20 = arith.extsi %r0_3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc101) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc101) + %tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc102) + %tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x2048xi1, #blocked> loc(#loc115) + %tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x2048xi64, #blocked> loc(#loc116) + %tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1, #blocked> loc(#loc117) + %tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc104) + %tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc105) + %tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1, #blocked> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1, #blocked> loc(#loc121) + %tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc122) + %tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc123) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc124) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc125) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc126) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1, #blocked> loc(#loc127) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc128) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc129) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x2048xi64, #blocked> loc(#loc130) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1, #blocked> loc(#loc131) + %tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc132) + %tmp30_27 = arith.extsi %tmp30_26 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc109) + %tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x2048xi64, #blocked> loc(#loc109) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x2048xi64, #blocked> loc(#loc133) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc134) + %tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc135) + %tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc136) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x2048xi1, #blocked> loc(#loc137) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1, #blocked> loc(#loc138) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64, #blocked> loc(#loc139) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc140) + %tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x2048xi64, #blocked> loc(#loc141) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1, #blocked> loc(#loc142) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1, #blocked> loc(#loc143) + %tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi1, #blocked> loc(#loc144) + %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc145) + %tmp47 = arith.addi %arg12, %tmp44 : tensor<1x2048xi64, #blocked> loc(#loc146) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x2048xi1, #blocked> loc(#loc147) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc148) + scf.yield %_tmp46_31 : tensor<1x2048xi64, #blocked> loc(#loc61) + } loc(#loc110) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))): + %tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167) + tt.reduce.return %tmp46_17 : i64 loc(#loc163) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163) + %tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150) + %tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151) + %tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165) + %tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc74) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75) + tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc76) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.return loc(#loc78) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xmask"(#loc3)) +#loc92 = loc("r0_base"(#loc4)) +#loc93 = loc("x1"(#loc5)) +#loc94 = loc("x1"(#loc6)) +#loc95 = loc("x0"(#loc7)) +#loc96 = loc("x2"(#loc8)) +#loc97 = loc("tmp0"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp2"(#loc11)) +#loc100 = loc("tmp3"(#loc12)) +#loc101 = loc("tmp3"(#loc13)) +#loc102 = loc("tmp5"(#loc14)) +#loc103 = loc("tmp10"(#loc15)) +#loc104 = loc("tmp10"(#loc16)) +#loc105 = loc("tmp10"(#loc17)) +#loc106 = loc("tmp17"(#loc18)) +#loc107 = loc("tmp23"(#loc19)) +#loc108 = loc("tmp30"(#loc20)) +#loc109 = loc("tmp30"(#loc21)) +#loc110 = loc("_tmp46"(#loc22)) +#loc111 = loc("r0_index"(#loc23)) +#loc112 = loc("r0_mask"(#loc24)) +#loc113 = loc("r0_4"(#loc25)) +#loc114 = loc("r0_3"(#loc26)) +#loc115 = loc("tmp6"(#loc27)) +#loc116 = loc("tmp9"(#loc28)) +#loc117 = loc("tmp10"(#loc29)) +#loc118 = loc("tmp11"(#loc30)) +#loc119 = loc("tmp12"(#loc31)) +#loc120 = loc("tmp13"(#loc32)) +#loc121 = loc("tmp14"(#loc33)) +#loc122 = loc("tmp18"(#loc34)) +#loc123 = loc("tmp19"(#loc35)) +#loc124 = loc("tmp21"(#loc36)) +#loc125 = loc("tmp22"(#loc37)) +#loc126 = loc("tmp24"(#loc38)) +#loc127 = loc("tmp25"(#loc39)) +#loc128 = loc("tmp26"(#loc40)) +#loc129 = loc("tmp27"(#loc41)) +#loc130 = loc("tmp28"(#loc42)) +#loc131 = loc("tmp29"(#loc43)) +#loc132 = loc("tmp30"(#loc44)) +#loc133 = loc("tmp30"(#loc45)) +#loc134 = loc("tmp31"(#loc46)) +#loc135 = loc("tmp32"(#loc47)) +#loc136 = loc("tmp33"(#loc48)) +#loc137 = loc("tmp34"(#loc49)) +#loc138 = loc("tmp35"(#loc50)) +#loc139 = loc("tmp36"(#loc51)) +#loc140 = loc("tmp37"(#loc52)) +#loc141 = loc("tmp39"(#loc53)) +#loc142 = loc("tmp40"(#loc54)) +#loc143 = loc("tmp41"(#loc55)) +#loc144 = loc("tmp43"(#loc56)) +#loc145 = loc("tmp44"(#loc57)) +#loc146 = loc("tmp47"(#loc58)) +#loc147 = loc("_tmp46"(#loc59)) +#loc148 = loc("_tmp46"(#loc60)) +#loc150 = loc("tmp46"(#loc65)) +#loc151 = loc("tmp49"(#loc66)) +#loc152 = loc("tmp51"(#loc67)) +#loc153 = loc("tmp52"(#loc68)) +#loc154 = loc("tmp54"(#loc69)) +#loc155 = loc("tmp53"(#loc70)) +#loc156 = loc("tmp55"(#loc71)) +#loc157 = loc("tmp57"(#loc72)) +#loc158 = loc("tmp56"(#loc73)) +#loc159 = loc(fused[#loc98, #loc97]) +#loc160 = loc(fused[#loc101, #loc100]) +#loc161 = loc(fused[#loc104, #loc91]) +#loc162 = loc(fused[#loc109, #loc108]) +#loc163 = loc(callsite(#loc62 at #loc149)) +#loc165 = loc(fused[#loc154, #loc155]) +#loc166 = loc(fused[#loc157, #loc158]) +#loc167 = loc(callsite(#loc64 at #loc163)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..9248f77fd1cb2bc9e602042a8822168a2909b19d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3NH6HISNAMYS6PPKRZPOP2Y2PRWXH4ILY3PCKSRARGCKMTL7453Q/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc1 = loc(unknown) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("out_ptr1"(#loc)) +#loc83 = loc("out_ptr2"(#loc)) +#loc84 = loc("ks0"(#loc)) +#loc85 = loc("ks1"(#loc)) +#loc86 = loc("ks2"(#loc)) +#loc87 = loc("ks3"(#loc)) +#loc88 = loc("ks4"(#loc)) +#loc89 = loc("ks5"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc153 = loc("tmp46"(#loc65)) +#loc168 = loc(callsite(#loc1 at #loc153)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc93) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94) + %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc95) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc96) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100) + %_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc102) + %r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32> loc(#loc102) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x2048xi32> loc(#loc103) + %r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc104) + %r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x2048xi32> loc(#loc105) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106) + %tmp0_11 = arith.extsi %r0_4 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc107) + %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc163) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x2048xi64> loc(#loc107) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x2048xi64> loc(#loc108) + %tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x2048xi64> loc(#loc108) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109) + %tmp3_15 = arith.extsi %r0_3 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc110) + %tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64> loc(#loc164) + %tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x2048xi64> loc(#loc110) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x2048xi64> loc(#loc111) + %tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x2048xi64> loc(#loc111) + %tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x2048xi1> loc(#loc112) + %tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x2048xi64> loc(#loc113) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc114) + %tmp10_19 = tt.splat %tmp10 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc114) + %tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x2048xi1> loc(#loc115) + %tmp10_21 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc165) + %tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x2048xi1> loc(#loc116) + %tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc117) + %tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x2048xi64> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x2048xi64> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x2048xi1> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x2048xi1> loc(#loc121) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x2048xi64> loc(#loc122) + %tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc123) + %tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x2048xi64> loc(#loc124) + %tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc125) + %tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x2048xi64> loc(#loc126) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127) + %tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x2048xi1> loc(#loc127) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x2048xi1> loc(#loc128) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x2048xi1> loc(#loc129) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x2048xi64> loc(#loc130) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc131) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x2048xi64> loc(#loc132) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x2048xi1> loc(#loc133) + %tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc134) + %tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135) + %tmp30_26 = arith.extsi %tmp30 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc136) + %tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x2048xi64> loc(#loc166) + %tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x2048xi64> loc(#loc136) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x2048xi64> loc(#loc137) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x2048xi64> loc(#loc138) + %tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc139) + %tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x2048xi64> loc(#loc140) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x2048xi1> loc(#loc141) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x2048xi1> loc(#loc142) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x2048xi64> loc(#loc143) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc144) + %tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x2048xi64> loc(#loc145) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x2048xi1> loc(#loc146) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x2048xi1> loc(#loc147) + %tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc148) + %tmp44 = arith.extui %tmp43 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc149) + %tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x2048xi64> loc(#loc150) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x2048xi1> loc(#loc151) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc152) + scf.yield %_tmp46_31 : tensor<1x2048xi64> loc(#loc63) + } loc(#loc101) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))): + %tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171) + tt.reduce.return %tmp46_11 : i64 loc(#loc167) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc167) + %tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154) + %tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155) + %tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169) + %tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc76) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc76) + tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc77) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc78) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc78) + tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":90:35) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc92 = loc("tmp50"(#loc3)) +#loc93 = loc("xoffset"(#loc4)) +#loc94 = loc("xmask"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("r0_base"(#loc7)) +#loc97 = loc("x1"(#loc8)) +#loc98 = loc("x1"(#loc9)) +#loc99 = loc("x0"(#loc10)) +#loc100 = loc("x2"(#loc11)) +#loc101 = loc("_tmp46"(#loc2)) +#loc102 = loc("r0_index"(#loc12)) +#loc103 = loc("r0_mask"(#loc13)) +#loc104 = loc("r0_4"(#loc14)) +#loc105 = loc("r0_3"(#loc15)) +#loc106 = loc("tmp0"(#loc16)) +#loc107 = loc("tmp0"(#loc17)) +#loc108 = loc("tmp2"(#loc18)) +#loc109 = loc("tmp3"(#loc19)) +#loc110 = loc("tmp3"(#loc20)) +#loc111 = loc("tmp5"(#loc21)) +#loc112 = loc("tmp6"(#loc22)) +#loc113 = loc("tmp9"(#loc23)) +#loc114 = loc("tmp10"(#loc24)) +#loc115 = loc("tmp10"(#loc25)) +#loc116 = loc("tmp10"(#loc26)) +#loc117 = loc("tmp10"(#loc27)) +#loc118 = loc("tmp11"(#loc28)) +#loc119 = loc("tmp12"(#loc29)) +#loc120 = loc("tmp13"(#loc30)) +#loc121 = loc("tmp14"(#loc31)) +#loc122 = loc("tmp17"(#loc32)) +#loc123 = loc("tmp18"(#loc33)) +#loc124 = loc("tmp19"(#loc34)) +#loc125 = loc("tmp21"(#loc35)) +#loc126 = loc("tmp22"(#loc36)) +#loc127 = loc("tmp23"(#loc37)) +#loc128 = loc("tmp24"(#loc38)) +#loc129 = loc("tmp25"(#loc39)) +#loc130 = loc("tmp26"(#loc40)) +#loc131 = loc("tmp27"(#loc41)) +#loc132 = loc("tmp28"(#loc42)) +#loc133 = loc("tmp29"(#loc43)) +#loc134 = loc("tmp30"(#loc44)) +#loc135 = loc("tmp30"(#loc45)) +#loc136 = loc("tmp30"(#loc46)) +#loc137 = loc("tmp30"(#loc47)) +#loc138 = loc("tmp31"(#loc48)) +#loc139 = loc("tmp32"(#loc49)) +#loc140 = loc("tmp33"(#loc50)) +#loc141 = loc("tmp34"(#loc51)) +#loc142 = loc("tmp35"(#loc52)) +#loc143 = loc("tmp36"(#loc53)) +#loc144 = loc("tmp37"(#loc54)) +#loc145 = loc("tmp39"(#loc55)) +#loc146 = loc("tmp40"(#loc56)) +#loc147 = loc("tmp41"(#loc57)) +#loc148 = loc("tmp43"(#loc58)) +#loc149 = loc("tmp44"(#loc59)) +#loc150 = loc("tmp47"(#loc60)) +#loc151 = loc("_tmp46"(#loc61)) +#loc152 = loc("_tmp46"(#loc62)) +#loc154 = loc("tmp46"(#loc67)) +#loc155 = loc("tmp49"(#loc68)) +#loc156 = loc("tmp51"(#loc69)) +#loc157 = loc("tmp52"(#loc70)) +#loc158 = loc("tmp54"(#loc71)) +#loc159 = loc("tmp53"(#loc72)) +#loc160 = loc("tmp55"(#loc73)) +#loc161 = loc("tmp57"(#loc74)) +#loc162 = loc("tmp56"(#loc75)) +#loc163 = loc(fused[#loc107, #loc106]) +#loc164 = loc(fused[#loc110, #loc109]) +#loc165 = loc(fused[#loc116, #loc94]) +#loc166 = loc(fused[#loc136, #loc135]) +#loc167 = loc(callsite(#loc64 at #loc153)) +#loc169 = loc(fused[#loc158, #loc159]) +#loc170 = loc(fused[#loc161, #loc162]) +#loc171 = loc(callsite(#loc66 at #loc167)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7fab99f003e0385da6aa22a67e6a4a3d8d50e553 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..93bdbd0c2afd70f9f4226bef76fd21a79f936baa Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..810382602545b441a6f1f488b2c64cc271c5b3f3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "dcf9c3d58e1d47ce5613d5d0ceadd19ad1e3325e6f9b526d3b2abd496024081a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..306e60bb94030e6d29f69c9b6b88ae4e09f9bc34 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,781 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 3, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 24, !dbg !9 + %12 = lshr i32 %10, 5, !dbg !9 + %13 = and i32 %10, 7, !dbg !9 + %14 = lshr i32 %10, 3, !dbg !9 + %15 = and i32 %14, 7, !dbg !9 + %16 = or disjoint i32 %9, %13, !dbg !10 + %17 = or disjoint i32 %15, %9, !dbg !10 + %18 = icmp slt i32 %16, 32, !dbg !11 + %19 = icmp slt i32 %17, 32, !dbg !11 + %20 = or disjoint i32 %15, 8, !dbg !12 + %21 = shl nuw nsw i32 %13, 1, !dbg !12 + %22 = sdiv i32 %16, 16, !dbg !13 + %23 = mul nuw nsw i32 %15, 17, !dbg !14 + %24 = mul nuw nsw i32 %20, 17, !dbg !14 + %25 = shl i32 %22, 8, !dbg !15 + %26 = add i32 %25, %16, !dbg !15 + %27 = add i32 %26, %23, !dbg !16 + %28 = add i32 %26, %24, !dbg !16 + %29 = sext i32 %27 to i64, !dbg !17 + %30 = getelementptr i32, ptr addrspace(1) %0, i64 %29, !dbg !17 + %31 = sext i32 %28 to i64, !dbg !17 + %32 = getelementptr i32, ptr addrspace(1) %0, i64 %31, !dbg !17 + %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %30, i1 %18) #4, !dbg !18 + %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %32, i1 %18) #4, !dbg !18 + %35 = lshr i32 %10, 4, !dbg !19 + %.lobit = and i32 %35, 1, !dbg !19 + %36 = and i32 %10, 8, !dbg !19 + %.not = icmp eq i32 %36, 0, !dbg !19 + %.lobit1 = lshr exact i32 %36, 3, !dbg !19 + %37 = and i32 %10, 32, !dbg !19 + %.not3 = icmp eq i32 %37, 0, !dbg !19 + %.lobit2 = lshr exact i32 %37, 5, !dbg !19 + %38 = xor i32 %.lobit1, 1, !dbg !23 + %39 = xor i32 %.lobit, 1, !dbg !23 + %40 = xor i32 %.lobit2, 1, !dbg !23 + %41 = mul nuw nsw i32 %33, %38, !dbg !24 + %42 = mul nuw nsw i32 %34, %38, !dbg !24 + %43 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 8, i32 31), !dbg !25 + %44 = add i32 %43, %41, !dbg !28 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %42, i32 8, i32 31), !dbg !25 + %46 = add i32 %45, %42, !dbg !28 + %47 = mul nuw nsw i32 %33, %.lobit1, !dbg !29 + %48 = mul nuw nsw i32 %34, %.lobit1, !dbg !29 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !25 + %50 = add i32 %49, %47, !dbg !28 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !25 + %52 = add i32 %51, %48, !dbg !28 + %53 = mul nuw nsw i32 %38, %15, !dbg !30 + %54 = mul nuw nsw i32 %20, %38, !dbg !30 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 8, i32 31), !dbg !25 + %56 = add i32 %55, %53, !dbg !28 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 8, i32 31), !dbg !25 + %58 = add i32 %57, %54, !dbg !28 + %59 = mul nuw nsw i32 %15, %.lobit1, !dbg !31 + %60 = mul nuw nsw i32 %20, %.lobit1, !dbg !31 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !25 + %62 = add i32 %61, %59, !dbg !28 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !25 + %64 = add i32 %63, %60, !dbg !28 + %65 = trunc i32 %35 to i1, !dbg !32 + %66 = icmp sge i32 %44, %50, !dbg !32 + %67 = icmp ne i32 %44, %50, !dbg !32 + %68 = icmp sle i32 %56, %62, !dbg !32 + %69 = or i1 %67, %68, !dbg !32 + %70 = and i1 %66, %69, !dbg !32 + %.not4 = xor i1 %70, %65, !dbg !32 + %71 = icmp sge i32 %46, %52, !dbg !32 + %72 = icmp ne i32 %46, %52, !dbg !32 + %73 = icmp sle i32 %58, %64, !dbg !32 + %74 = or i1 %72, %73, !dbg !32 + %75 = and i1 %71, %74, !dbg !32 + %.not5 = xor i1 %75, %65, !dbg !32 + %76 = xor i32 %50, %44, !dbg !33 + %77 = xor i32 %52, %46, !dbg !33 + %78 = select i1 %.not4, i32 0, i32 %76, !dbg !34 + %79 = select i1 %.not5, i32 0, i32 %77, !dbg !34 + %80 = xor i32 %78, %33, !dbg !35 + %81 = xor i32 %79, %34, !dbg !35 + %82 = xor i32 %62, %56, !dbg !36 + %83 = xor i32 %64, %58, !dbg !36 + %84 = select i1 %.not4, i32 0, i32 %82, !dbg !37 + %85 = select i1 %.not5, i32 0, i32 %83, !dbg !37 + %86 = xor i32 %84, %15, !dbg !38 + %87 = xor i32 %85, %20, !dbg !38 + %88 = mul nuw nsw i32 %80, %39, !dbg !24 + %89 = mul nuw nsw i32 %81, %39, !dbg !24 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 16, i32 31), !dbg !25 + %91 = add i32 %88, %90, !dbg !28 + %92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 16, i32 31), !dbg !25 + %93 = add i32 %89, %92, !dbg !28 + %94 = mul nuw nsw i32 %80, %.lobit, !dbg !29 + %95 = mul nuw nsw i32 %81, %.lobit, !dbg !29 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !25 + %97 = add i32 %94, %96, !dbg !28 + %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 16, i32 31), !dbg !25 + %99 = add i32 %95, %98, !dbg !28 + %100 = mul nuw nsw i32 %86, %39, !dbg !30 + %101 = mul nuw nsw i32 %87, %39, !dbg !30 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 16, i32 31), !dbg !25 + %103 = add i32 %100, %102, !dbg !28 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 16, i32 31), !dbg !25 + %105 = add i32 %101, %104, !dbg !28 + %106 = mul nuw nsw i32 %86, %.lobit, !dbg !31 + %107 = mul nuw nsw i32 %87, %.lobit, !dbg !31 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !25 + %109 = add i32 %106, %108, !dbg !28 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !25 + %111 = add i32 %107, %110, !dbg !28 + %112 = icmp slt i32 %91, %97, !dbg !39 + %113 = icmp slt i32 %93, %99, !dbg !39 + %114 = icmp eq i32 %91, %97, !dbg !40 + %115 = icmp eq i32 %93, %99, !dbg !40 + %116 = icmp sgt i32 %103, %109, !dbg !41 + %117 = icmp sgt i32 %105, %111, !dbg !41 + %118 = and i1 %114, %116, !dbg !42 + %119 = and i1 %115, %117, !dbg !42 + %120 = or i1 %112, %118, !dbg !43 + %121 = or i1 %113, %119, !dbg !43 + %122 = zext i1 %120 to i32, !dbg !44 + %123 = zext i1 %121 to i32, !dbg !44 + %.not6 = icmp eq i32 %.lobit2, %122, !dbg !32 + %.not7 = icmp eq i32 %.lobit2, %123, !dbg !32 + %124 = xor i32 %91, %97, !dbg !33 + %125 = xor i32 %93, %99, !dbg !33 + %126 = select i1 %.not6, i32 0, i32 %124, !dbg !34 + %127 = select i1 %.not7, i32 0, i32 %125, !dbg !34 + %128 = xor i32 %126, %80, !dbg !35 + %129 = xor i32 %127, %81, !dbg !35 + %130 = xor i32 %103, %109, !dbg !36 + %131 = xor i32 %105, %111, !dbg !36 + %132 = select i1 %.not6, i32 0, i32 %130, !dbg !37 + %133 = select i1 %.not7, i32 0, i32 %131, !dbg !37 + %134 = xor i32 %132, %86, !dbg !38 + %135 = xor i32 %133, %87, !dbg !38 + %136 = mul nuw nsw i32 %128, %38, !dbg !24 + %137 = mul nuw nsw i32 %129, %38, !dbg !24 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !25 + %139 = add i32 %136, %138, !dbg !28 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 8, i32 31), !dbg !25 + %141 = add i32 %137, %140, !dbg !28 + %142 = mul nuw nsw i32 %128, %.lobit1, !dbg !29 + %143 = mul nuw nsw i32 %129, %.lobit1, !dbg !29 + %144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 8, i32 31), !dbg !25 + %145 = add i32 %142, %144, !dbg !28 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 8, i32 31), !dbg !25 + %147 = add i32 %143, %146, !dbg !28 + %148 = mul nuw nsw i32 %134, %38, !dbg !30 + %149 = mul nuw nsw i32 %135, %38, !dbg !30 + %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 8, i32 31), !dbg !25 + %151 = add i32 %148, %150, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 8, i32 31), !dbg !25 + %153 = add i32 %149, %152, !dbg !28 + %154 = mul nuw nsw i32 %134, %.lobit1, !dbg !31 + %155 = mul nuw nsw i32 %135, %.lobit1, !dbg !31 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !25 + %157 = add i32 %154, %156, !dbg !28 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 8, i32 31), !dbg !25 + %159 = add i32 %155, %158, !dbg !28 + %160 = icmp slt i32 %139, %145, !dbg !39 + %161 = icmp slt i32 %141, %147, !dbg !39 + %162 = icmp eq i32 %139, %145, !dbg !40 + %163 = icmp eq i32 %141, %147, !dbg !40 + %164 = icmp sgt i32 %151, %157, !dbg !41 + %165 = icmp sgt i32 %153, %159, !dbg !41 + %166 = and i1 %162, %164, !dbg !42 + %167 = and i1 %163, %165, !dbg !42 + %168 = or i1 %160, %166, !dbg !43 + %169 = or i1 %161, %167, !dbg !43 + %170 = zext i1 %168 to i32, !dbg !44 + %171 = zext i1 %169 to i32, !dbg !44 + %.not8 = icmp eq i32 %.lobit2, %170, !dbg !32 + %.not9 = icmp eq i32 %.lobit2, %171, !dbg !32 + %172 = xor i32 %139, %145, !dbg !33 + %173 = xor i32 %141, %147, !dbg !33 + %174 = select i1 %.not8, i32 0, i32 %172, !dbg !34 + %175 = select i1 %.not9, i32 0, i32 %173, !dbg !34 + %176 = xor i32 %174, %128, !dbg !35 + %177 = xor i32 %175, %129, !dbg !35 + %178 = xor i32 %151, %157, !dbg !36 + %179 = xor i32 %153, %159, !dbg !36 + %180 = select i1 %.not8, i32 0, i32 %178, !dbg !37 + %181 = select i1 %.not9, i32 0, i32 %179, !dbg !37 + %182 = xor i32 %180, %134, !dbg !38 + %183 = xor i32 %181, %135, !dbg !38 + %184 = mul nuw nsw i32 %176, %40, !dbg !24 + %185 = mul nuw nsw i32 %177, %40, !dbg !24 + %186 = and i32 %12, 1, !dbg !25 + %187 = shl nuw nsw i32 %10, 1, !dbg !25 + %188 = and i32 %187, 48, !dbg !25 + %189 = or disjoint i32 %188, %21, !dbg !25 + %.idx = shl nuw nsw i32 %189, 3, !dbg !25 + %190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25 + %191 = getelementptr i32, ptr addrspace(3) %190, i32 %186, !dbg !25 + %192 = insertelement <1 x i32> poison, i32 %184, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %192, i1 true) #4, !dbg !25 + %193 = getelementptr i8, ptr addrspace(3) %190, i32 8, !dbg !25 + %194 = getelementptr i32, ptr addrspace(3) %193, i32 %186, !dbg !25 + %195 = insertelement <1 x i32> poison, i32 %185, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %195, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %196 = icmp samesign ult i32 %10, 128, !dbg !25 + %197 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25 + %198 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !25 + %200 = add i32 %199, %198, !dbg !28 + %201 = and i32 %10, 897, !dbg !25 + %202 = icmp eq i32 %201, 0, !dbg !25 + %203 = insertelement <1 x i32> poison, i32 %200, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %203, i1 %202) #4, !dbg !25 + %204 = getelementptr i8, ptr addrspace(3) %197, i32 256, !dbg !25 + %205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !25 + %207 = add i32 %206, %205, !dbg !28 + %208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %208, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %209 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %210 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %211 = mul nuw nsw i32 %176, %.lobit2, !dbg !29 + %212 = mul nuw nsw i32 %177, %.lobit2, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %213 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %213, i1 true) #4, !dbg !25 + %214 = insertelement <1 x i32> poison, i32 %212, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %214, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %215 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !25 + %217 = add i32 %216, %215, !dbg !28 + %218 = insertelement <1 x i32> poison, i32 %217, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %218, i1 %202) #4, !dbg !25 + %219 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !25 + %221 = add i32 %220, %219, !dbg !28 + %222 = insertelement <1 x i32> poison, i32 %221, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %222, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %223 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %224 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %225 = mul nuw nsw i32 %182, %40, !dbg !30 + %226 = mul nuw nsw i32 %183, %40, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %227 = insertelement <1 x i32> poison, i32 %225, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %227, i1 true) #4, !dbg !25 + %228 = insertelement <1 x i32> poison, i32 %226, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %228, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %229 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !25 + %231 = add i32 %230, %229, !dbg !28 + %232 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %232, i1 %202) #4, !dbg !25 + %233 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !25 + %235 = add i32 %234, %233, !dbg !28 + %236 = insertelement <1 x i32> poison, i32 %235, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %236, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %237 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %238 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %239 = mul nuw nsw i32 %182, %.lobit2, !dbg !31 + %240 = mul nuw nsw i32 %183, %.lobit2, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %241 = insertelement <1 x i32> poison, i32 %239, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %241, i1 true) #4, !dbg !25 + %242 = insertelement <1 x i32> poison, i32 %240, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %242, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %243 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !25 + %245 = add i32 %244, %243, !dbg !28 + %246 = insertelement <1 x i32> poison, i32 %245, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %246, i1 %202) #4, !dbg !25 + %247 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !25 + %249 = add i32 %248, %247, !dbg !28 + %250 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %250, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %251 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %252 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %253 = icmp slt i32 %209, %223, !dbg !39 + %254 = icmp sge i32 %210, %224, !dbg !39 + %255 = icmp eq i32 %209, %223, !dbg !40 + %256 = icmp ne i32 %210, %224, !dbg !40 + %257 = icmp sgt i32 %237, %251, !dbg !41 + %258 = icmp sle i32 %238, %252, !dbg !41 + %259 = and i1 %255, %257, !dbg !42 + %.not15 = or i1 %256, %258, !dbg !43 + %260 = or i1 %253, %259, !dbg !43 + %.not12 = and i1 %254, %.not15, !dbg !44 + %261 = xor i32 %223, %209, !dbg !33 + %262 = xor i32 %224, %210, !dbg !33 + %263 = select i1 %260, i32 %261, i32 0, !dbg !34 + %264 = select i1 %.not12, i32 %262, i32 0, !dbg !34 + %265 = xor i32 %263, %176, !dbg !35 + %266 = xor i32 %264, %177, !dbg !35 + %267 = xor i32 %251, %237, !dbg !36 + %268 = xor i32 %252, %238, !dbg !36 + %269 = select i1 %260, i32 %267, i32 0, !dbg !37 + %270 = select i1 %.not12, i32 %268, i32 0, !dbg !37 + %271 = xor i32 %269, %182, !dbg !38 + %272 = xor i32 %270, %183, !dbg !38 + %273 = mul nuw nsw i32 %265, %39, !dbg !24 + %274 = mul nuw nsw i32 %266, %39, !dbg !24 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 16, i32 31), !dbg !25 + %276 = add i32 %273, %275, !dbg !28 + %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 16, i32 31), !dbg !25 + %278 = add i32 %274, %277, !dbg !28 + %279 = mul nuw nsw i32 %265, %.lobit, !dbg !29 + %280 = mul nuw nsw i32 %266, %.lobit, !dbg !29 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 16, i32 31), !dbg !25 + %282 = add i32 %279, %281, !dbg !28 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 16, i32 31), !dbg !25 + %284 = add i32 %280, %283, !dbg !28 + %285 = mul nuw nsw i32 %271, %39, !dbg !30 + %286 = mul nuw nsw i32 %272, %39, !dbg !30 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 16, i32 31), !dbg !25 + %288 = add i32 %285, %287, !dbg !28 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 16, i32 31), !dbg !25 + %290 = add i32 %286, %289, !dbg !28 + %291 = mul nuw nsw i32 %271, %.lobit, !dbg !31 + %292 = mul nuw nsw i32 %272, %.lobit, !dbg !31 + %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 16, i32 31), !dbg !25 + %294 = add i32 %293, %291, !dbg !28 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !25 + %296 = add i32 %295, %292, !dbg !28 + %297 = icmp slt i32 %276, %282, !dbg !39 + %298 = icmp sge i32 %278, %284, !dbg !39 + %299 = icmp eq i32 %276, %282, !dbg !40 + %300 = icmp ne i32 %278, %284, !dbg !40 + %301 = icmp sgt i32 %288, %294, !dbg !41 + %302 = icmp sle i32 %290, %296, !dbg !41 + %303 = and i1 %299, %301, !dbg !42 + %.not21 = or i1 %300, %302, !dbg !43 + %304 = or i1 %297, %303, !dbg !43 + %.not18 = and i1 %298, %.not21, !dbg !44 + %305 = xor i32 %276, %282, !dbg !33 + %306 = xor i32 %278, %284, !dbg !33 + %307 = select i1 %304, i32 %305, i32 0, !dbg !34 + %308 = select i1 %.not18, i32 %306, i32 0, !dbg !34 + %309 = xor i32 %307, %265, !dbg !35 + %310 = xor i32 %308, %266, !dbg !35 + %311 = xor i32 %294, %288, !dbg !36 + %312 = xor i32 %296, %290, !dbg !36 + %313 = select i1 %304, i32 %311, i32 0, !dbg !37 + %314 = select i1 %.not18, i32 %312, i32 0, !dbg !37 + %315 = xor i32 %313, %271, !dbg !38 + %316 = xor i32 %314, %272, !dbg !38 + %317 = mul nuw nsw i32 %309, %38, !dbg !24 + %318 = mul nuw nsw i32 %310, %38, !dbg !24 + %319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 8, i32 31), !dbg !25 + %320 = add i32 %317, %319, !dbg !28 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %318, i32 8, i32 31), !dbg !25 + %322 = add i32 %318, %321, !dbg !28 + %323 = mul nuw nsw i32 %309, %.lobit1, !dbg !29 + %324 = mul nuw nsw i32 %310, %.lobit1, !dbg !29 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 8, i32 31), !dbg !25 + %326 = add i32 %323, %325, !dbg !28 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 8, i32 31), !dbg !25 + %328 = add i32 %324, %327, !dbg !28 + %329 = mul nuw nsw i32 %315, %38, !dbg !30 + %330 = mul nuw nsw i32 %316, %38, !dbg !30 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 8, i32 31), !dbg !25 + %332 = add i32 %329, %331, !dbg !28 + %333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 8, i32 31), !dbg !25 + %334 = add i32 %330, %333, !dbg !28 + %335 = mul nuw nsw i32 %315, %.lobit1, !dbg !31 + %336 = mul nuw nsw i32 %316, %.lobit1, !dbg !31 + %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 8, i32 31), !dbg !25 + %338 = add i32 %337, %335, !dbg !28 + %339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 8, i32 31), !dbg !25 + %340 = add i32 %339, %336, !dbg !28 + %341 = icmp slt i32 %320, %326, !dbg !39 + %342 = icmp sge i32 %322, %328, !dbg !39 + %343 = icmp eq i32 %320, %326, !dbg !40 + %344 = icmp ne i32 %322, %328, !dbg !40 + %345 = icmp sgt i32 %332, %338, !dbg !41 + %346 = icmp sle i32 %334, %340, !dbg !41 + %347 = and i1 %343, %345, !dbg !42 + %.not27 = or i1 %344, %346, !dbg !43 + %348 = or i1 %341, %347, !dbg !43 + %.not24 = and i1 %342, %.not27, !dbg !44 + %349 = xor i32 %320, %326, !dbg !33 + %350 = xor i32 %322, %328, !dbg !33 + %351 = select i1 %348, i32 %349, i32 0, !dbg !34 + %352 = select i1 %.not24, i32 %350, i32 0, !dbg !34 + %353 = xor i32 %351, %309, !dbg !35 + %354 = xor i32 %352, %310, !dbg !35 + %355 = xor i32 %338, %332, !dbg !36 + %356 = xor i32 %340, %334, !dbg !36 + %357 = select i1 %348, i32 %355, i32 0, !dbg !37 + %358 = select i1 %.not24, i32 %356, i32 0, !dbg !37 + %359 = xor i32 %357, %315, !dbg !38 + %360 = xor i32 %358, %316, !dbg !38 + %361 = icmp slt i32 %353, %354, !dbg !39 + %362 = icmp eq i32 %353, %354, !dbg !40 + %363 = icmp sgt i32 %359, %360, !dbg !41 + %364 = and i1 %362, %363, !dbg !42 + %365 = or i1 %361, %364, !dbg !43 + %366 = xor i32 %354, %353, !dbg !33 + %367 = select i1 %365, i32 %366, i32 0, !dbg !34 + %368 = xor i32 %367, %353, !dbg !35 + %369 = xor i32 %367, %354, !dbg !35 + %370 = xor i32 %360, %359, !dbg !36 + %371 = select i1 %365, i32 %370, i32 0, !dbg !37 + %372 = xor i32 %371, %359, !dbg !38 + %373 = xor i32 %371, %360, !dbg !38 + %374 = mul nuw nsw i32 %368, %40, !dbg !24 + %375 = mul nuw nsw i32 %369, %40, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %376 = insertelement <1 x i32> poison, i32 %374, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %376, i1 true) #4, !dbg !25 + %377 = insertelement <1 x i32> poison, i32 %375, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %377, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %378 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 1, i32 31), !dbg !25 + %380 = add i32 %379, %378, !dbg !28 + %381 = insertelement <1 x i32> poison, i32 %380, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %381, i1 %202) #4, !dbg !25 + %382 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !25 + %384 = add i32 %383, %382, !dbg !28 + %385 = insertelement <1 x i32> poison, i32 %384, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %385, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %386 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %387 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %388 = mul nuw nsw i32 %368, %.lobit2, !dbg !29 + %389 = mul nuw nsw i32 %369, %.lobit2, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %390 = insertelement <1 x i32> poison, i32 %388, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %390, i1 true) #4, !dbg !25 + %391 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %391, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %392 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %393 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %392, i32 1, i32 31), !dbg !25 + %394 = add i32 %393, %392, !dbg !28 + %395 = insertelement <1 x i32> poison, i32 %394, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %395, i1 %202) #4, !dbg !25 + %396 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 1, i32 31), !dbg !25 + %398 = add i32 %397, %396, !dbg !28 + %399 = insertelement <1 x i32> poison, i32 %398, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %399, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %400 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %401 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %402 = mul nuw nsw i32 %372, %40, !dbg !30 + %403 = mul nuw nsw i32 %373, %40, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %404 = insertelement <1 x i32> poison, i32 %402, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %404, i1 true) #4, !dbg !25 + %405 = insertelement <1 x i32> poison, i32 %403, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %405, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %406 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 1, i32 31), !dbg !25 + %408 = add i32 %407, %406, !dbg !28 + %409 = insertelement <1 x i32> poison, i32 %408, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %409, i1 %202) #4, !dbg !25 + %410 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %411 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %410, i32 1, i32 31), !dbg !25 + %412 = add i32 %411, %410, !dbg !28 + %413 = insertelement <1 x i32> poison, i32 %412, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %413, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %414 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %415 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %416 = mul nuw nsw i32 %372, %.lobit2, !dbg !31 + %417 = mul nuw nsw i32 %373, %.lobit2, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %418 = insertelement <1 x i32> poison, i32 %416, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %418, i1 true) #4, !dbg !25 + %419 = insertelement <1 x i32> poison, i32 %417, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %419, i1 true) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %420 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %197, i1 %196) #4, !dbg !25 + %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 1, i32 31), !dbg !25 + %422 = add i32 %421, %420, !dbg !28 + %423 = insertelement <1 x i32> poison, i32 %422, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %423, i1 %202) #4, !dbg !25 + %424 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %204, i1 %196) #4, !dbg !25 + %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !25 + %426 = add i32 %425, %424, !dbg !28 + %427 = insertelement <1 x i32> poison, i32 %426, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %204, <1 x i32> %427, i1 %202) #4, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %428 = load i32, ptr addrspace(3) %190, align 16, !dbg !25 + %429 = load i32, ptr addrspace(3) %193, align 8, !dbg !25 + %430 = icmp slt i32 %386, %400, !dbg !39 + %431 = icmp slt i32 %387, %401, !dbg !39 + %432 = icmp eq i32 %386, %400, !dbg !40 + %433 = icmp eq i32 %387, %401, !dbg !40 + %434 = icmp sgt i32 %414, %428, !dbg !41 + %435 = icmp sgt i32 %415, %429, !dbg !41 + %436 = and i1 %432, %434, !dbg !42 + %437 = and i1 %433, %435, !dbg !42 + %438 = or i1 %430, %436, !dbg !43 + %439 = or i1 %431, %437, !dbg !43 + %440 = xor i32 %400, %386, !dbg !33 + %441 = xor i32 %401, %387, !dbg !33 + %442 = select i1 %438, i32 %440, i32 0, !dbg !34 + %443 = select i1 %439, i32 %441, i32 0, !dbg !34 + %444 = xor i32 %442, %368, !dbg !35 + %445 = xor i32 %443, %369, !dbg !35 + %446 = xor i32 %428, %414, !dbg !36 + %447 = xor i32 %429, %415, !dbg !36 + %448 = select i1 %438, i32 %446, i32 0, !dbg !37 + %449 = select i1 %439, i32 %447, i32 0, !dbg !37 + %450 = xor i32 %448, %372, !dbg !38 + %451 = xor i32 %449, %373, !dbg !38 + %452 = mul nuw nsw i32 %444, %39, !dbg !24 + %453 = mul nuw nsw i32 %445, %39, !dbg !24 + %454 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 16, i32 31), !dbg !25 + %455 = add i32 %452, %454, !dbg !28 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %453, i32 16, i32 31), !dbg !25 + %457 = add i32 %453, %456, !dbg !28 + %458 = mul nuw nsw i32 %444, %.lobit, !dbg !29 + %459 = mul nuw nsw i32 %445, %.lobit, !dbg !29 + %460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 16, i32 31), !dbg !25 + %461 = add i32 %458, %460, !dbg !28 + %462 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %459, i32 16, i32 31), !dbg !25 + %463 = add i32 %459, %462, !dbg !28 + %464 = mul nuw nsw i32 %450, %39, !dbg !30 + %465 = mul nuw nsw i32 %451, %39, !dbg !30 + %466 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 16, i32 31), !dbg !25 + %467 = add i32 %464, %466, !dbg !28 + %468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %465, i32 16, i32 31), !dbg !25 + %469 = add i32 %465, %468, !dbg !28 + %470 = mul nuw nsw i32 %450, %.lobit, !dbg !31 + %471 = mul nuw nsw i32 %451, %.lobit, !dbg !31 + %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 16, i32 31), !dbg !25 + %473 = add i32 %472, %470, !dbg !28 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !25 + %475 = add i32 %474, %471, !dbg !28 + %476 = icmp slt i32 %455, %461, !dbg !39 + %477 = icmp slt i32 %457, %463, !dbg !39 + %478 = icmp eq i32 %455, %461, !dbg !40 + %479 = icmp eq i32 %457, %463, !dbg !40 + %480 = icmp sgt i32 %467, %473, !dbg !41 + %481 = icmp sgt i32 %469, %475, !dbg !41 + %482 = and i1 %478, %480, !dbg !42 + %483 = and i1 %479, %481, !dbg !42 + %484 = or i1 %476, %482, !dbg !43 + %485 = or i1 %477, %483, !dbg !43 + %486 = xor i32 %455, %461, !dbg !33 + %487 = xor i32 %457, %463, !dbg !33 + %488 = select i1 %484, i32 %486, i32 0, !dbg !34 + %489 = select i1 %485, i32 %487, i32 0, !dbg !34 + %490 = xor i32 %488, %444, !dbg !35 + %491 = xor i32 %489, %445, !dbg !35 + %492 = xor i32 %473, %467, !dbg !36 + %493 = xor i32 %475, %469, !dbg !36 + %494 = select i1 %484, i32 %492, i32 0, !dbg !37 + %495 = select i1 %485, i32 %493, i32 0, !dbg !37 + %496 = xor i32 %494, %450, !dbg !38 + %497 = xor i32 %495, %451, !dbg !38 + %498 = mul nuw nsw i32 %490, %38, !dbg !24 + %499 = mul nuw nsw i32 %491, %38, !dbg !24 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 8, i32 31), !dbg !25 + %501 = add i32 %498, %500, !dbg !28 + %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 8, i32 31), !dbg !25 + %503 = add i32 %499, %502, !dbg !28 + %504 = mul nuw nsw i32 %490, %.lobit1, !dbg !29 + %505 = mul nuw nsw i32 %491, %.lobit1, !dbg !29 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 8, i32 31), !dbg !25 + %507 = add i32 %504, %506, !dbg !28 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %505, i32 8, i32 31), !dbg !25 + %509 = add i32 %505, %508, !dbg !28 + %510 = mul nuw nsw i32 %496, %38, !dbg !30 + %511 = mul nuw nsw i32 %497, %38, !dbg !30 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %510, i32 8, i32 31), !dbg !25 + %513 = add i32 %510, %512, !dbg !28 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 8, i32 31), !dbg !25 + %515 = add i32 %511, %514, !dbg !28 + %516 = mul nuw nsw i32 %496, %.lobit1, !dbg !31 + %517 = mul nuw nsw i32 %497, %.lobit1, !dbg !31 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %516, i32 8, i32 31), !dbg !25 + %519 = add i32 %518, %516, !dbg !28 + %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !25 + %521 = add i32 %520, %517, !dbg !28 + %522 = icmp slt i32 %501, %507, !dbg !39 + %523 = icmp slt i32 %503, %509, !dbg !39 + %524 = icmp eq i32 %501, %507, !dbg !40 + %525 = icmp eq i32 %503, %509, !dbg !40 + %526 = icmp sgt i32 %513, %519, !dbg !41 + %527 = icmp sgt i32 %515, %521, !dbg !41 + %528 = and i1 %524, %526, !dbg !42 + %529 = and i1 %525, %527, !dbg !42 + %530 = or i1 %522, %528, !dbg !43 + %531 = or i1 %523, %529, !dbg !43 + %532 = xor i32 %519, %513, !dbg !36 + %533 = xor i32 %521, %515, !dbg !36 + %534 = select i1 %530, i32 %532, i32 0, !dbg !37 + %535 = select i1 %531, i32 %533, i32 0, !dbg !37 + %536 = xor i32 %534, %496, !dbg !38 + %537 = xor i32 %535, %497, !dbg !38 + %narrow = select i1 %18, i32 %33, i32 0, !dbg !45 + %538 = sext i32 %narrow to i64, !dbg !45 + %narrow28 = select i1 %18, i32 %34, i32 0, !dbg !45 + %539 = sext i32 %narrow28 to i64, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %540 = add nsw i64 %539, %538, !dbg !48 + %extelt.offset = lshr i64 %540, 32, !dbg !46 + %541 = trunc nuw i64 %extelt.offset to i32, !dbg !46 + %542 = trunc i64 %540 to i32, !dbg !46 + %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 16, i32 31), !dbg !46 + %544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 16, i32 31), !dbg !46 + %545 = insertelement <2 x i32> poison, i32 %543, i64 0, !dbg !46 + %546 = insertelement <2 x i32> %545, i32 %544, i64 1, !dbg !46 + %547 = bitcast <2 x i32> %546 to i64, !dbg !46 + %548 = add i64 %540, %547, !dbg !48 + %extelt.offset29 = lshr i64 %548, 32, !dbg !46 + %549 = trunc nuw i64 %extelt.offset29 to i32, !dbg !46 + %550 = trunc i64 %548 to i32, !dbg !46 + %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 8, i32 31), !dbg !46 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 8, i32 31), !dbg !46 + %553 = insertelement <2 x i32> poison, i32 %551, i64 0, !dbg !46 + %554 = insertelement <2 x i32> %553, i32 %552, i64 1, !dbg !46 + %555 = bitcast <2 x i32> %554 to i64, !dbg !46 + %556 = add i64 %548, %555, !dbg !48 + %557 = icmp eq i32 %11, 0, !dbg !46 + %558 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %21, !dbg !46 + %559 = getelementptr i64, ptr addrspace(3) %558, i32 %186, !dbg !46 + %560 = insertelement <1 x i64> poison, i64 %556, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %559, <1 x i64> %560, i1 %557) #4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %561 = icmp samesign ult i32 %10, 16, !dbg !46 + %562 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46 + %563 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %562, i1 %561) #4, !dbg !46 + %extelt.offset30 = lshr i64 %563, 32, !dbg !46 + %564 = trunc nuw i64 %extelt.offset30 to i32, !dbg !46 + %565 = trunc i64 %563 to i32, !dbg !46 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !46 + %567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 1, i32 31), !dbg !46 + %568 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !46 + %569 = insertelement <2 x i32> %568, i32 %567, i64 1, !dbg !46 + %570 = bitcast <2 x i32> %569 to i64, !dbg !46 + %571 = add i64 %563, %570, !dbg !48 + %572 = and i32 %10, 1009, !dbg !46 + %573 = icmp eq i32 %572, 0, !dbg !46 + %574 = insertelement <1 x i64> poison, i64 %571, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %562, <1 x i64> %574, i1 %573) #4, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %575 = load i64, ptr addrspace(3) %558, align 16, !dbg !46 + %576 = trunc i64 %575 to i32, !dbg !49 + %577 = shl i32 %17, 4, !dbg !50 + %578 = or disjoint i32 %577, %21, !dbg !51 + %579 = sext i32 %578 to i64, !dbg !52 + %580 = getelementptr i32, ptr addrspace(1) %1, i64 %579, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %581 = and i32 %10, 3, !dbg !53 + %582 = shl nuw nsw i32 %581, 3, !dbg !53 + %583 = and i32 %187, 96, !dbg !53 + %584 = and i32 %10, 4, !dbg !53 + %585 = icmp eq i32 %584, 0, !dbg !53 + %586 = select i1 %585, i32 0, i32 192, !dbg !53 + %587 = select i1 %.not, i32 0, i32 260, !dbg !53 + %588 = or disjoint i32 %582, %583, !dbg !53 + %589 = xor i32 %588, %586, !dbg !53 + %590 = or disjoint i32 %589, %587, !dbg !53 + %591 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %590, !dbg !53 + %592 = insertelement <1 x i32> poison, i32 %536, i64 0, !dbg !53 + store <1 x i32> %592, ptr addrspace(3) %591, align 4, !dbg !53 + %593 = xor i32 %590, 4, !dbg !53 + %594 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %593, !dbg !53 + %595 = insertelement <1 x i32> poison, i32 %537, i64 0, !dbg !53 + store <1 x i32> %595, ptr addrspace(3) %594, align 4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %596 = shl nuw nsw i32 %581, 5, !dbg !53 + %597 = and i32 %10, 28, !dbg !53 + %598 = select i1 %.not3, i32 0, i32 192, !dbg !53 + %599 = or disjoint i32 %596, %597, !dbg !53 + %600 = xor i32 %599, %598, !dbg !53 + %601 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %600, !dbg !53 + %602 = load i32, ptr addrspace(3) %601, align 4, !dbg !53 + %603 = xor i32 %600, 260, !dbg !53 + %604 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %603, !dbg !53 + %605 = load i32, ptr addrspace(3) %604, align 4, !dbg !53 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %602, i32 %605, ptr addrspace(1) %580, i1 %19) #4, !dbg !53 + %606 = sext i32 %16 to i64, !dbg !54 + %607 = getelementptr i32, ptr addrspace(1) %2, i64 %606, !dbg !54 + %608 = and i32 %10, 56, !dbg !55 + %609 = icmp eq i32 %608, 0, !dbg !55 + %610 = and i1 %609, %18, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %576, ptr addrspace(1) %607, i1 %610) #4, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 33, scope: !4) +!9 = !DILocation(line: 25, column: 44, scope: !4) +!10 = !DILocation(line: 25, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 38, scope: !4) +!13 = !DILocation(line: 34, column: 19, scope: !4) +!14 = !DILocation(line: 36, column: 38, scope: !4) +!15 = !DILocation(line: 36, column: 35, scope: !4) +!16 = !DILocation(line: 36, column: 45, scope: !4) +!17 = !DILocation(line: 36, column: 30, scope: !4) +!18 = !DILocation(line: 36, column: 54, scope: !4) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 67, scope: !4) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22) +!45 = !DILocation(line: 44, column: 34, scope: !4) +!46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47) +!47 = !DILocation(line: 45, column: 26, scope: !4) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47) +!49 = !DILocation(line: 48, column: 21, scope: !4) +!50 = !DILocation(line: 49, column: 35, scope: !4) +!51 = !DILocation(line: 49, column: 32, scope: !4) +!52 = !DILocation(line: 49, column: 25, scope: !4) +!53 = !DILocation(line: 49, column: 47, scope: !4) +!54 = !DILocation(line: 50, column: 25, scope: !4) +!55 = !DILocation(line: 50, column: 37, scope: !4) +!56 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..57c743db1202ca2405e36856bc5a55b1dbc30fa6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,1410 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<155>; + .reg .b32 %r<504>; + .reg .b64 %rd<27>; + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:24:28 + mov.u32 %r105, %ctaid.x; + .loc 1 24 33 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:24:33 + shl.b32 %r106, %r105, 3; + ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 25 44 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:25:44 + mov.u32 %r107, %tid.x; + and.b32 %r108, %r107, 24; + and.b32 %r109, %r107, 7; + bfe.u32 %r110, %r107, 3, 3; + .loc 1 25 23 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:25:23 + or.b32 %r111, %r106, %r109; + or.b32 %r112, %r110, %r106; + .loc 1 26 21 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:26:21 + setp.lt.s32 %p1, %r111, 32; + setp.lt.s32 %p54, %r112, 32; + .loc 1 27 38 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:27:38 + or.b32 %r113, %r110, 8; + shl.b32 %r114, %r109, 1; + .loc 1 34 19 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:34:19 + bfe.s32 %r115, %r105, 28, 1; + shr.u32 %r116, %r115, 28; + add.s32 %r117, %r111, %r116; + .loc 1 36 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:35 + shl.b32 %r118, %r117, 4; + and.b32 %r119, %r118, -256; + add.s32 %r120, %r119, %r111; + .loc 1 36 45 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:45 + mad.lo.s32 %r121, %r110, 17, %r120; + add.s32 %r122, %r121, 136; + .loc 1 36 30 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:30 + mad.wide.s32 %rd1, %r121, 4, %rd8; + mad.wide.s32 %rd2, %r122, 4, %rd8; + .loc 1 36 54 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shr.u32 %r123, %r107, 4; + bfe.u32 %r124, %r107, 4, 1; + bfe.s32 %r125, %r107, 3, 1; + and.b32 %r126, %r107, 8; + bfe.u32 %r127, %r107, 3, 1; + bfe.s32 %r128, %r107, 5, 1; + bfe.u32 %r129, %r107, 5, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r130, %r127, 1; + xor.b32 %r131, %r124, 1; + xor.b32 %r132, %r129, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r133, %r1, %r130; + mul.lo.s32 %r134, %r2, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r133, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r136, %r135, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r137, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r138, %r137, %r134; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r139, %r1, %r127; + mul.lo.s32 %r140, %r2, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r139, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r142, %r141, %r139; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r143, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r144, %r143, %r140; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r145, %r130, %r110; + shl.b32 %r146, %r130, 3; + or.b32 %r147, %r145, %r146; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r148, %r145, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r149, %r148, %r145; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r150, %r147, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r151, %r150, %r147; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r152, %r110, %r127; + or.b32 %r153, %r152, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r154, %r152, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r155, %r154, %r152; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r153, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r157, %r156, %r153; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r158, %r123, 1; + setp.ne.b32 %p56, %r158, 0; + setp.ge.s32 %p57, %r136, %r142; + setp.ne.b32 %p58, %r136, %r142; + setp.le.s32 %p59, %r149, %r155; + or.pred %p60, %p58, %p59; + and.pred %p61, %p57, %p60; + xor.pred %p62, %p61, %p56; + setp.ge.s32 %p63, %r138, %r144; + setp.ne.b32 %p64, %r138, %r144; + setp.le.s32 %p65, %r151, %r157; + or.pred %p66, %p64, %p65; + and.pred %p67, %p63, %p66; + xor.pred %p68, %p67, %p56; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r159, %r142, %r136; + xor.b32 %r160, %r144, %r138; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r161, 0, %r159, %p62; + selp.b32 %r162, 0, %r160, %p68; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r163, %r161, %r1; + xor.b32 %r164, %r162, %r2; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r165, %r155, %r149; + xor.b32 %r166, %r157, %r151; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r167, 0, %r165, %p62; + selp.b32 %r168, 0, %r166, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r169, %r167, %r110; + xor.b32 %r170, %r168, %r113; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r171, %r163, %r131; + mul.lo.s32 %r172, %r164, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r173, %r171, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r174, %r171, %r173; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r175, %r172, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r176, %r172, %r175; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r177, %r163, %r124; + mul.lo.s32 %r178, %r164, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r179, %r177, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r180, %r177, %r179; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r181, %r178, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r182, %r178, %r181; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r183, %r169, %r131; + mul.lo.s32 %r184, %r170, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r185, %r183, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r186, %r183, %r185; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r187, %r184, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r188, %r184, %r187; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r189, %r169, %r124; + mul.lo.s32 %r190, %r170, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r191, %r189, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r192, %r189, %r191; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r193, %r190, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r194, %r190, %r193; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p69, %r174, %r180; + setp.lt.s32 %p70, %r176, %r182; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p71, %r174, %r180; + setp.eq.b32 %p72, %r176, %r182; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p73, %r186, %r192; + setp.gt.s32 %p74, %r188, %r194; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p75, %p71, %p73; + and.pred %p76, %p72, %p74; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p77, %p69, %p75; + or.pred %p78, %p70, %p76; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r195, 1, 0, %p77; + selp.b32 %r196, 1, 0, %p78; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p79, %r129, %r195; + setp.eq.b32 %p80, %r129, %r196; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r197, %r174, %r180; + xor.b32 %r198, %r176, %r182; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r199, 0, %r197, %p79; + selp.b32 %r200, 0, %r198, %p80; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r201, %r199, %r163; + xor.b32 %r202, %r200, %r164; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r203, %r186, %r192; + xor.b32 %r204, %r188, %r194; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r205, 0, %r203, %p79; + selp.b32 %r206, 0, %r204, %p80; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r207, %r205, %r169; + xor.b32 %r208, %r206, %r170; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r209, %r201, %r130; + mul.lo.s32 %r210, %r202, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r211, %r209, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r212, %r209, %r211; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r213, %r210, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r214, %r210, %r213; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r215, %r201, %r127; + mul.lo.s32 %r216, %r202, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r217, %r215, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r218, %r215, %r217; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r219, %r216, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r220, %r216, %r219; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r221, %r207, %r130; + mul.lo.s32 %r222, %r208, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r223, %r221, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r224, %r221, %r223; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r225, %r222, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r226, %r222, %r225; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r227, %r207, %r127; + mul.lo.s32 %r228, %r208, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r229, %r227, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r230, %r227, %r229; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r231, %r228, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r232, %r228, %r231; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p81, %r212, %r218; + setp.lt.s32 %p82, %r214, %r220; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p83, %r212, %r218; + setp.eq.b32 %p84, %r214, %r220; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p85, %r224, %r230; + setp.gt.s32 %p86, %r226, %r232; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p87, %p83, %p85; + and.pred %p88, %p84, %p86; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p89, %p81, %p87; + or.pred %p90, %p82, %p88; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r233, 1, 0, %p89; + selp.b32 %r234, 1, 0, %p90; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p91, %r129, %r233; + setp.eq.b32 %p92, %r129, %r234; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r235, %r212, %r218; + xor.b32 %r236, %r214, %r220; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r237, 0, %r235, %p91; + selp.b32 %r238, 0, %r236, %p92; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r239, %r237, %r201; + xor.b32 %r240, %r238, %r202; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r241, %r224, %r230; + xor.b32 %r242, %r226, %r232; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r243, 0, %r241, %p91; + selp.b32 %r244, 0, %r242, %p92; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r245, %r243, %r207; + xor.b32 %r246, %r244, %r208; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r4, %r239, %r132; + mul.lo.s32 %r6, %r240, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shl.b32 %r247, %r107, 1; + and.b32 %r248, %r247, 48; + or.b32 %r249, %r248, %r114; + shl.b32 %r250, %r249, 3; + mov.b32 %r251, global_smem; + add.s32 %r252, %r251, %r250; + shl.b32 %r253, %r129, 2; + add.s32 %r3, %r252, %r253; + mov.pred %p3, -1; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r4; + // end inline asm + add.s32 %r5, %r3, 8; + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + bar.sync 0; + setp.lt.u32 %p5, %r107, 128; + shl.b32 %r254, %r107, 2; + add.s32 %r8, %r251, %r254; + // begin inline asm + @%p5 ld.shared.b32 %r7, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r255, %r7, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r10, %r255, %r7; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r256, %r107, 897; + setp.eq.b32 %p6, %r256, 0; + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r10; + // end inline asm + add.s32 %r12, %r8, 256; + // begin inline asm + @%p5 ld.shared.b32 %r11, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r257, %r11, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r14, %r257, %r11; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r14; + // end inline asm + bar.sync 0; + ld.shared.b32 %r258, [%r252]; + ld.shared.b32 %r259, [%r252+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r16, %r239, %r129; + mul.lo.s32 %r18, %r240, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r16; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r18; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r19, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r260, %r19, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r22, %r260, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r22; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r23, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r261, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r26, %r261, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r26; + // end inline asm + bar.sync 0; + ld.shared.b32 %r262, [%r252]; + ld.shared.b32 %r263, [%r252+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r28, %r245, %r132; + mul.lo.s32 %r30, %r246, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r28; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r30; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r31, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r264, %r31, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r34, %r264, %r31; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r35, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r265, %r35, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r38, %r265, %r35; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r38; + // end inline asm + bar.sync 0; + ld.shared.b32 %r266, [%r252]; + ld.shared.b32 %r267, [%r252+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r40, %r245, %r129; + mul.lo.s32 %r42, %r246, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r42; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r43, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r268, %r43, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r46, %r268, %r43; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r46; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r47, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r269, %r47, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r50, %r269, %r47; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r50; + // end inline asm + bar.sync 0; + ld.shared.b32 %r270, [%r252]; + ld.shared.b32 %r271, [%r252+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p93, %r258, %r262; + setp.ge.s32 %p94, %r259, %r263; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p95, %r258, %r262; + setp.ne.b32 %p96, %r259, %r263; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p97, %r266, %r270; + setp.le.s32 %p98, %r267, %r271; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p99, %p95, %p97; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p100, %p96, %p98; + or.pred %p101, %p93, %p99; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p102, %p94, %p100; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r272, %r262, %r258; + xor.b32 %r273, %r263, %r259; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r274, %r272, 0, %p101; + selp.b32 %r275, %r273, 0, %p102; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r276, %r274, %r239; + xor.b32 %r277, %r275, %r240; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r278, %r270, %r266; + xor.b32 %r279, %r271, %r267; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r280, %r278, 0, %p101; + selp.b32 %r281, %r279, 0, %p102; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r282, %r280, %r245; + xor.b32 %r283, %r281, %r246; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r284, %r276, %r131; + mul.lo.s32 %r285, %r277, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r286, %r284, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r287, %r284, %r286; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r288, %r285, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r289, %r285, %r288; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r290, %r276, %r124; + mul.lo.s32 %r291, %r277, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r292, %r290, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r293, %r290, %r292; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r294, %r291, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r295, %r291, %r294; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r296, %r282, %r131; + mul.lo.s32 %r297, %r283, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r298, %r296, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r299, %r296, %r298; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r300, %r297, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r301, %r297, %r300; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r302, %r282, %r124; + mul.lo.s32 %r303, %r283, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r304, %r302, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r305, %r304, %r302; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r306, %r303, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r307, %r306, %r303; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p103, %r287, %r293; + setp.ge.s32 %p104, %r289, %r295; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p105, %r287, %r293; + setp.ne.b32 %p106, %r289, %r295; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p107, %r299, %r305; + setp.le.s32 %p108, %r301, %r307; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p109, %p105, %p107; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p110, %p106, %p108; + or.pred %p111, %p103, %p109; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p112, %p104, %p110; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r308, %r287, %r293; + xor.b32 %r309, %r289, %r295; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r310, %r308, 0, %p111; + selp.b32 %r311, %r309, 0, %p112; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r312, %r310, %r276; + xor.b32 %r313, %r311, %r277; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r314, %r305, %r299; + xor.b32 %r315, %r307, %r301; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r316, %r314, 0, %p111; + selp.b32 %r317, %r315, 0, %p112; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r318, %r316, %r282; + xor.b32 %r319, %r317, %r283; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r320, %r312, %r130; + mul.lo.s32 %r321, %r313, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r322, %r320, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r323, %r320, %r322; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r324, %r321, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r325, %r321, %r324; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r326, %r312, %r127; + mul.lo.s32 %r327, %r313, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r328, %r326, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r329, %r326, %r328; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r330, %r327, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r331, %r327, %r330; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r332, %r318, %r130; + mul.lo.s32 %r333, %r319, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r334, %r332, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r335, %r332, %r334; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r336, %r333, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r337, %r333, %r336; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r338, %r318, %r127; + mul.lo.s32 %r339, %r319, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r340, %r338, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r341, %r340, %r338; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r342, %r339, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r343, %r342, %r339; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p113, %r323, %r329; + setp.ge.s32 %p114, %r325, %r331; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p115, %r323, %r329; + setp.ne.b32 %p116, %r325, %r331; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p117, %r335, %r341; + setp.le.s32 %p118, %r337, %r343; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p119, %p115, %p117; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p120, %p116, %p118; + or.pred %p121, %p113, %p119; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p122, %p114, %p120; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r344, %r323, %r329; + xor.b32 %r345, %r325, %r331; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r346, %r344, 0, %p121; + selp.b32 %r347, %r345, 0, %p122; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r348, %r346, %r312; + xor.b32 %r349, %r347, %r313; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r350, %r341, %r335; + xor.b32 %r351, %r343, %r337; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r352, %r350, 0, %p121; + selp.b32 %r353, %r351, 0, %p122; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r354, %r352, %r318; + xor.b32 %r355, %r353, %r319; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p123, %r348, %r349; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p124, %r348, %r349; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p125, %r354, %r355; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p126, %p124, %p125; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p127, %p123, %p126; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r356, %r349, %r348; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r357, %r356, 0, %p127; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r358, %r357, %r348; + xor.b32 %r359, %r357, %r349; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r360, %r355, %r354; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r361, %r360, 0, %p127; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r362, %r361, %r354; + xor.b32 %r363, %r361, %r355; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r52, %r358, %r132; + mul.lo.s32 %r54, %r359, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r52; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r54; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r55, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r364, %r55, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r58, %r364, %r55; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r59, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r365, %r59, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r62, %r365, %r59; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r62; + // end inline asm + bar.sync 0; + ld.shared.b32 %r366, [%r252]; + ld.shared.b32 %r367, [%r252+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r64, %r358, %r129; + mul.lo.s32 %r66, %r359, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r66; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r67, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r368, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r70, %r368, %r67; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r70; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r71, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r369, %r71, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r74, %r369, %r71; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r74; + // end inline asm + bar.sync 0; + ld.shared.b32 %r370, [%r252]; + ld.shared.b32 %r371, [%r252+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r76, %r362, %r132; + mul.lo.s32 %r78, %r363, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r76; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r78; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r79, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r372, %r79, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r82, %r372, %r79; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r82; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r83, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r373, %r83, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r86, %r373, %r83; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r86; + // end inline asm + bar.sync 0; + ld.shared.b32 %r374, [%r252]; + ld.shared.b32 %r375, [%r252+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r88, %r362, %r129; + mul.lo.s32 %r90, %r363, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p3 st.shared.b32 [ %r3 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r90; + // end inline asm + bar.sync 0; + // begin inline asm + @%p5 ld.shared.b32 %r91, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r376, %r91, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r94, %r376, %r91; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r8 + 0 ], %r94; + // end inline asm + // begin inline asm + @%p5 ld.shared.b32 %r95, [ %r12 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r377, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r98, %r377, %r95; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p6 st.shared.b32 [ %r12 + 0 ], %r98; + // end inline asm + bar.sync 0; + ld.shared.b32 %r378, [%r252]; + ld.shared.b32 %r379, [%r252+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p128, %r366, %r370; + setp.lt.s32 %p129, %r367, %r371; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p130, %r366, %r370; + setp.eq.b32 %p131, %r367, %r371; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p132, %r374, %r378; + setp.gt.s32 %p133, %r375, %r379; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p134, %p130, %p132; + and.pred %p135, %p131, %p133; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p136, %p128, %p134; + or.pred %p137, %p129, %p135; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r380, %r370, %r366; + xor.b32 %r381, %r371, %r367; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r382, %r380, 0, %p136; + selp.b32 %r383, %r381, 0, %p137; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r384, %r382, %r358; + xor.b32 %r385, %r383, %r359; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r386, %r378, %r374; + xor.b32 %r387, %r379, %r375; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r388, %r386, 0, %p136; + selp.b32 %r389, %r387, 0, %p137; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r390, %r388, %r362; + xor.b32 %r391, %r389, %r363; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r392, %r384, %r131; + mul.lo.s32 %r393, %r385, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r394, %r392, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r395, %r392, %r394; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r396, %r393, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r397, %r393, %r396; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r398, %r384, %r124; + mul.lo.s32 %r399, %r385, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r400, %r398, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r401, %r398, %r400; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r402, %r399, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r403, %r399, %r402; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r404, %r390, %r131; + mul.lo.s32 %r405, %r391, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r406, %r404, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r407, %r404, %r406; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r408, %r405, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r409, %r405, %r408; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r410, %r390, %r124; + mul.lo.s32 %r411, %r391, %r124; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r412, %r410, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r413, %r412, %r410; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r414, %r411, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r415, %r414, %r411; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p138, %r395, %r401; + setp.lt.s32 %p139, %r397, %r403; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p140, %r395, %r401; + setp.eq.b32 %p141, %r397, %r403; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p142, %r407, %r413; + setp.gt.s32 %p143, %r409, %r415; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p144, %p140, %p142; + and.pred %p145, %p141, %p143; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p146, %p138, %p144; + or.pred %p147, %p139, %p145; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r416, %r395, %r401; + xor.b32 %r417, %r397, %r403; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r418, %r416, 0, %p146; + selp.b32 %r419, %r417, 0, %p147; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r420, %r418, %r384; + xor.b32 %r421, %r419, %r385; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r422, %r413, %r407; + xor.b32 %r423, %r415, %r409; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r424, %r422, 0, %p146; + selp.b32 %r425, %r423, 0, %p147; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r426, %r424, %r390; + xor.b32 %r427, %r425, %r391; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r428, %r420, %r130; + mul.lo.s32 %r429, %r421, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r430, %r428, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r431, %r428, %r430; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r432, %r429, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r433, %r429, %r432; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r434, %r420, %r127; + mul.lo.s32 %r435, %r421, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r436, %r434, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r437, %r434, %r436; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r438, %r435, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r439, %r435, %r438; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r440, %r426, %r130; + mul.lo.s32 %r441, %r427, %r130; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r442, %r440, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r443, %r440, %r442; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r444, %r441, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r445, %r441, %r444; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r446, %r426, %r127; + mul.lo.s32 %r447, %r427, %r127; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r448, %r446, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r449, %r448, %r446; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r450, %r447, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r451, %r450, %r447; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p148, %r431, %r437; + setp.lt.s32 %p149, %r433, %r439; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p150, %r431, %r437; + setp.eq.b32 %p151, %r433, %r439; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p152, %r443, %r449; + setp.gt.s32 %p153, %r445, %r451; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r452, %r449, %r443; + xor.b32 %r453, %r451, %r445; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r454, %r452, 0, %p152; + selp.b32 %r455, %r454, 0, %p150; + selp.b32 %r456, %r452, %r455, %p148; + selp.b32 %r457, %r453, 0, %p153; + selp.b32 %r458, %r457, 0, %p151; + selp.b32 %r459, %r453, %r458, %p149; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r460, %r456, %r426; + xor.b32 %r461, %r459, %r427; +$L__tmp2: + .loc 1 44 34 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:44:34 + selp.b32 %r462, %r1, 0, %p1; + cvt.s64.s32 %rd11, %r462; + selp.b32 %r463, %r2, 0, %p1; + cvt.s64.s32 %rd12, %r463; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd13, %rd12, %rd11; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r464}, %rd13; + cvt.u32.u64 %r465, %rd13; + shfl.sync.bfly.b32 %r466, %r465, 16, 31, -1; + shfl.sync.bfly.b32 %r467, %r464, 16, 31, -1; + cvt.u64.u32 %rd14, %r466; + cvt.u64.u32 %rd15, %r467; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r468}, %rd18; + cvt.u32.u64 %r469, %rd18; + shfl.sync.bfly.b32 %r470, %r469, 8, 31, -1; + shfl.sync.bfly.b32 %r471, %r468, 8, 31, -1; + cvt.u64.u32 %rd19, %r470; + cvt.u64.u32 %rd20, %r471; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd3, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + setp.eq.b32 %p51, %r108, 0; + shl.b32 %r472, %r109, 4; + add.s32 %r473, %r251, %r472; + shl.b32 %r474, %r129, 3; + add.s32 %r99, %r473, %r474; + // begin inline asm + @%p51 st.shared.b64 [ %r99 + 0 ], %rd3; + // end inline asm + bar.sync 0; + setp.lt.u32 %p52, %r107, 16; + shl.b32 %r475, %r107, 3; + add.s32 %r100, %r251, %r475; + // begin inline asm + @%p52 ld.shared.b64 %rd4, [ %r100 + 0 ]; + // end inline asm + mov.b64 {_, %r476}, %rd4; + cvt.u32.u64 %r477, %rd4; + shfl.sync.bfly.b32 %r478, %r477, 1, 31, -1; + shfl.sync.bfly.b32 %r479, %r476, 1, 31, -1; + cvt.u64.u32 %rd23, %r478; + cvt.u64.u32 %rd24, %r479; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd5, %rd4, %rd26; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + and.b32 %r480, %r107, 1009; + setp.eq.b32 %p53, %r480, 0; + // begin inline asm + @%p53 st.shared.b64 [ %r100 + 0 ], %rd5; + // end inline asm + bar.sync 0; + ld.shared.b32 %r104, [%r473]; +$L__tmp4: + .loc 1 49 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:35 + shl.b32 %r481, %r112, 4; + .loc 1 49 32 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:32 + or.b32 %r482, %r481, %r114; + .loc 1 49 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:25 + mad.wide.s32 %rd6, %r482, 4, %rd9; + .loc 1 49 47 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:47 + bar.sync 0; + and.b32 %r483, %r107, 3; + shl.b32 %r484, %r483, 3; + and.b32 %r485, %r247, 96; + bfe.s32 %r486, %r107, 2, 1; + and.b32 %r487, %r486, 192; + and.b32 %r488, %r125, 260; + or.b32 %r489, %r484, %r485; + xor.b32 %r490, %r489, %r487; + or.b32 %r491, %r490, %r488; + add.s32 %r492, %r251, %r491; + st.shared.b32 [%r492], %r460; + xor.b32 %r493, %r491, 4; + add.s32 %r494, %r251, %r493; + st.shared.b32 [%r494], %r461; + bar.sync 0; + shl.b32 %r495, %r483, 5; + and.b32 %r496, %r107, 28; + and.b32 %r497, %r128, 192; + or.b32 %r498, %r495, %r496; + xor.b32 %r499, %r498, %r497; + add.s32 %r500, %r251, %r499; + ld.shared.b32 %r102, [%r500]; + xor.b32 %r501, %r499, 4; + add.s32 %r502, %r251, %r501; + ld.shared.b32 %r103, [%r502+256]; + // begin inline asm + @%p54 st.global.v2.b32 [ %rd6 + 0 ], { %r102, %r103 }; + // end inline asm + .loc 1 50 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:25 + mad.wide.s32 %rd7, %r111, 4, %rd10; + .loc 1 50 37 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:37 + and.b32 %r503, %r107, 56; + setp.eq.b32 %p154, %r503, 0; + and.pred %p55, %p154, %p1; + // begin inline asm + @%p55 st.global.b32 [ %rd7 + 0 ], { %r104 }; + // end inline asm + .loc 1 50 4 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 55 +.b8 117 +.b8 54 +.b8 114 +.b8 103 +.b8 107 +.b8 111 +.b8 105 +.b8 104 +.b8 119 +.b8 105 +.b8 122 +.b8 117 +.b8 103 +.b8 101 +.b8 54 +.b8 52 +.b8 52 +.b8 109 +.b8 110 +.b8 103 +.b8 50 +.b8 100 +.b8 100 +.b8 103 +.b8 51 +.b8 98 +.b8 113 +.b8 105 +.b8 105 +.b8 120 +.b8 102 +.b8 120 +.b8 101 +.b8 103 +.b8 115 +.b8 109 +.b8 120 +.b8 117 +.b8 103 +.b8 116 +.b8 98 +.b8 108 +.b8 97 +.b8 102 +.b8 99 +.b8 54 +.b8 107 +.b8 122 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..e044aea8b51ff3f2d54c7162fed37a6aa0489c54 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 8 : i32 loc(#loc148) + %xoffset_3 = arith.constant 8 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc151) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<8x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + %r0_mask_10 = arith.constant dense : tensor<8x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<8x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<8x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<8x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<8x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<8x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<8x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<8x16xf32> to tensor<8x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : tensor<8x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<8x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<8x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<8x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc33) + %6 = tt.addptr %5, %4 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<8x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc38)), %idxs: tensor<8x16xi16> loc("idxs"(#loc38))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc41) + %5 = ub.poison : tensor<8x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi16> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc49) + %2 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi16> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc223) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc113))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc114) + tt.return %0 : tensor<64x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc116) + tt.return %1 : tensor<64x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc113))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc114) + tt.return %0 : tensor<64x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc116) + tt.return %1 : tensor<64x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc126))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc128) + tt.return %4 : tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %5 : tensor<8x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc135))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc136) + tt.return %0 : tensor<8x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc138) + tt.return %1 : tensor<8x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc132) + tt.return %cst : tensor<8x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc134) + tt.return %0 : tensor<8x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc135))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc136) + tt.return %0 : tensor<8x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc138) + tt.return %1 : tensor<8x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc132) + tt.return %cst : tensor<8x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc134) + tt.return %0 : tensor<8x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc49) + %3 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc113))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc114) + tt.return %0 : tensor<32x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc116) + tt.return %1 : tensor<32x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc49) + %4 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: tensor<8x16xi32> loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc113))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc114) + tt.return %0 : tensor<16x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc116) + tt.return %1 : tensor<16x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc42)), %idxs: tensor<8x16xi32> loc("idxs"(#loc42))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc49) + %5 = ub.poison : tensor<8x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc113))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc114) + tt.return %0 : tensor<8x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc116) + tt.return %1 : tensor<8x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc50)), %idxs: tensor<8x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc112) + %5 = ub.poison : tensor<8x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc113))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc114) + tt.return %0 : tensor<8xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc116) + tt.return %1 : tensor<8xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":46:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e3ed0b0f4d17a8c02a27a31add53e23e65fc8a23 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,841 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 8]], lane = [[1, 0], [2, 0], [4, 0], [0, 1], [0, 2]], warp = [[0, 4]], block = []}> +#linear1 = #ttg.linear<{register = [[4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [0, 1, 0], [1, 0, 0]], warp = [[2, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [0, 0, 1], [0, 1, 0]], warp = [[1, 0, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 1, 0]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 0, 4]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("out_ptr2"(#loc)) +#loc79 = loc("out_ptr3"(#loc)) +#loc80 = loc("xnumel"(#loc)) +#loc81 = loc("r0_numel"(#loc)) +#loc99 = loc(callsite(#loc19 at #loc20)) +#loc105 = loc("ileft"(#loc28)) +#loc109 = loc("iright"(#loc33)) +#loc118 = loc("left_idx"(#loc42)) +#loc123 = loc("right_idx"(#loc47)) +#loc143 = loc("tmp11"(#loc67)) +#loc149 = loc(callsite(#loc24 at #loc99)) +#loc153 = loc(callsite(#loc1 at #loc143)) +#loc157 = loc(callsite(#loc105 at #loc149)) +#loc161 = loc(callsite(#loc109 at #loc149)) +#loc169 = loc(callsite(#loc118 at #loc149)) +#loc174 = loc(callsite(#loc123 at #loc149)) +#loc194 = loc(callsite(#loc1 at #loc157)) +#loc196 = loc(callsite(#loc1 at #loc161)) +#loc199 = loc(callsite(#loc1 at #loc169)) +#loc202 = loc(callsite(#loc1 at #loc174)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<8x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_1 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16> : tensor<8x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<272> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc82) + %xoffset_12 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc83) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84) + %xindex_13 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc84) + %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc84) + %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked> loc(#loc85) + %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc85) + %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<8x1xi32, #blocked> loc(#loc85) + %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<8x1xi32, #blocked1> loc(#loc85) + %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc86) + %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<8x1xi32, #blocked1> loc(#loc86) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87) + %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87) + %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87) + %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87) + %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87) + %x0 = arith.remsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc88) + %x1 = arith.divsi %xindex_18, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc89) + %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90) + %tmp0_26 = tt.broadcast %x0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91) + %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc91) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x16xi32, #blocked> loc(#loc91) + %tmp0_29 = arith.muli %x1, %cst_6 : tensor<8x1xi32, #blocked> loc(#loc92) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc93) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<8x16xi32, #blocked> loc(#loc93) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc94) + %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc94) + %tmp0_34 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc95) + %tmp0_35 = tt.broadcast %xmask_20 : tensor<8x1xi1, #blocked1> -> tensor<8x16xi1, #blocked1> loc(#loc95) + %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<8x16x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<8x16xi16, #linear> loc(#loc97) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146) + %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146) + %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146) + %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146) + %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146) + %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146) + %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146) + %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146) + %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146) + %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146) + %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146) + %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc147) + %flip_49 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #blocked> loc(#loc148) + %flip_50 = tt.reshape %flip_48 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc148) + %y = tt.reshape %tmp0_36 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #linear1> loc(#loc154) + %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155) + %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155) + %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155) + %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc156) + %ileft_54 = arith.muli %y, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156) + %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158) + %ileft_57 = tt.broadcast %ileft_56 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159) + %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc160) + %iright_58 = arith.muli %y, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160) + %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162) + %iright_61 = tt.broadcast %iright_60 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163) + %ileft_62 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc164) + %ileft_63 = tt.reshape %ileft_57 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_64 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc165) + %iright_65 = tt.reshape %iright_61 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx = tt.reshape %tmp4 : tensor<8x16xi16, #linear> -> tensor<64x2x1xi16, #linear1> loc(#loc166) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc168) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<64x2x1xi16, #linear1> loc(#loc168) + %input = arith.extsi %left_idx_67 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc197) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171) + %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc173) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<64x2x1xi16, #linear1> loc(#loc173) + %input_73 = arith.extsi %right_idx_72 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc200) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc177) + %left_idx_78 = tt.reshape %left_idx_70 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_79 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc178) + %right_idx_80 = tt.reshape %right_idx_76 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc179) + %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc179) + %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc180) + %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<8x16xi32, #blocked> loc(#loc181) + %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_85 = arith.andi %eq, %cond_83 : tensor<8x16xi1, #blocked> loc(#loc182) + %cond_86 = arith.andi %eq_82, %cond_84 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_87 = arith.ori %cond, %cond_85 : tensor<8x16xi1, #blocked> loc(#loc183) + %cond_88 = arith.ori %cond_81, %cond_86 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_89 = arith.extui %cond_87 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc184) + %cond_90 = arith.extui %cond_88 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_91 = arith.xori %cond_89, %flip_49 : tensor<8x16xi32, #blocked> loc(#loc184) + %cond_92 = arith.xori %cond_90, %flip_50 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<8x16xi32, #blocked> loc(#loc185) + %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret = arith.xori %ileft_62, %iright_64 : tensor<8x16xi32, #blocked> loc(#loc186) + %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc187) + %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<8x16xi32, #blocked> loc(#loc188) + %ret_97 = ttg.convert_layout %ret_96 : tensor<8x16xi32, #blocked> -> tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191) + %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<8x16xi32, #linear> loc(#loc191) + %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<8x16xi32, #linear> loc(#loc191) + %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc147) + %flip_103 = tt.reshape %flip_102 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc148) + %y_104 = tt.reshape %ret_96 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #linear2> loc(#loc154) + %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc156) + %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156) + %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158) + %ileft_109 = tt.broadcast %ileft_108 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159) + %iright_110 = arith.muli %y_104, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160) + %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162) + %iright_113 = tt.broadcast %iright_112 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163) + %ileft_114 = tt.reshape %ileft_109 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_115 = tt.reshape %iright_113 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_116 = tt.reshape %new_idxs_101 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166) + %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168) + %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170) + %left_idx_120 = tt.broadcast %left_idx_119 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171) + %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173) + %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175) + %right_idx_124 = tt.broadcast %right_idx_123 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176) + %left_idx_125 = tt.reshape %left_idx_120 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_126 = tt.reshape %right_idx_124 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_130 = arith.andi %eq_128, %cond_129 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_131 = arith.ori %cond_127, %cond_130 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_132 = arith.extui %cond_131 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_133 = arith.xori %cond_132, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_137 = arith.xori %ret_97, %ret_136 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<8x16xi32, #linear> loc(#loc191) + %y_141 = tt.reshape %ret_137 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154) + %ileft_142 = arith.muli %y_141, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156) + %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158) + %ileft_145 = tt.broadcast %ileft_144 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159) + %iright_146 = arith.muli %y_141, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160) + %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162) + %iright_149 = tt.broadcast %iright_148 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163) + %ileft_150 = tt.reshape %ileft_145 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_151 = tt.reshape %iright_149 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_152 = tt.reshape %new_idxs_140 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166) + %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168) + %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170) + %left_idx_156 = tt.broadcast %left_idx_155 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171) + %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173) + %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175) + %right_idx_160 = tt.broadcast %right_idx_159 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176) + %left_idx_161 = tt.reshape %left_idx_156 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_162 = tt.reshape %right_idx_160 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_166 = arith.andi %eq_164, %cond_165 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_167 = arith.ori %cond_163, %cond_166 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_168 = arith.extui %cond_167 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_169 = arith.xori %cond_168, %flip_103 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_173 = arith.xori %ret_137, %ret_172 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<8x16xi32, #linear> loc(#loc191) + %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc147) + %flip_178 = tt.reshape %flip_177 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc148) + %y_179 = tt.reshape %ret_173 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154) + %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc156) + %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156) + %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158) + %ileft_184 = tt.broadcast %ileft_183 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159) + %iright_185 = arith.muli %y_179, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160) + %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162) + %iright_188 = tt.broadcast %iright_187 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163) + %ileft_189 = tt.reshape %ileft_184 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_190 = tt.reshape %iright_188 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_191 = tt.reshape %new_idxs_176 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166) + %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168) + %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170) + %left_idx_195 = tt.broadcast %left_idx_194 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171) + %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173) + %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175) + %right_idx_199 = tt.broadcast %right_idx_198 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176) + %left_idx_200 = tt.reshape %left_idx_195 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_201 = tt.reshape %right_idx_199 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_205 = arith.andi %eq_203, %cond_204 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_206 = arith.ori %cond_202, %cond_205 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_207 = arith.extui %cond_206 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_208 = arith.xori %cond_207, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_212 = arith.xori %ret_173, %ret_211 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<8x16xi32, #linear> loc(#loc191) + %y_216 = tt.reshape %ret_212 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154) + %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156) + %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158) + %ileft_220 = tt.broadcast %ileft_219 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159) + %iright_221 = arith.muli %y_216, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160) + %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162) + %iright_224 = tt.broadcast %iright_223 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163) + %ileft_225 = tt.reshape %ileft_220 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_226 = tt.reshape %iright_224 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_227 = tt.reshape %new_idxs_215 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166) + %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168) + %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170) + %left_idx_231 = tt.broadcast %left_idx_230 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171) + %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173) + %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175) + %right_idx_235 = tt.broadcast %right_idx_234 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176) + %left_idx_236 = tt.reshape %left_idx_231 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_237 = tt.reshape %right_idx_235 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_241 = arith.andi %eq_239, %cond_240 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_242 = arith.ori %cond_238, %cond_241 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_243 = arith.extui %cond_242 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_244 = arith.xori %cond_243, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_248 = arith.xori %ret_212, %ret_247 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<8x16xi32, #linear> loc(#loc191) + %y_252 = tt.reshape %ret_248 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154) + %ileft_253 = arith.muli %y_252, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156) + %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158) + %ileft_256 = tt.broadcast %ileft_255 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159) + %iright_257 = arith.muli %y_252, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160) + %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162) + %iright_260 = tt.broadcast %iright_259 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163) + %ileft_261 = tt.reshape %ileft_256 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_262 = tt.reshape %iright_260 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_263 = tt.reshape %new_idxs_251 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166) + %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168) + %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170) + %left_idx_267 = tt.broadcast %left_idx_266 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171) + %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173) + %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175) + %right_idx_271 = tt.broadcast %right_idx_270 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176) + %left_idx_272 = tt.reshape %left_idx_267 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_273 = tt.reshape %right_idx_271 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_277 = arith.andi %eq_275, %cond_276 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_278 = arith.ori %cond_274, %cond_277 : tensor<8x16xi1, #linear> loc(#loc183) + %cond_279 = arith.extui %cond_278 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc184) + %cond_280 = arith.xori %cond_279, %flip_178 : tensor<8x16xi32, #linear> loc(#loc184) + %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<8x16xi32, #linear> loc(#loc185) + %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_284 = arith.xori %ret_248, %ret_283 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<8x16xi32, #linear> loc(#loc191) + %y_288 = tt.reshape %ret_284 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc154) + %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc156) + %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc156) + %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193) + %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc158) + %ileft_293 = tt.broadcast %ileft_292 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc159) + %iright_294 = arith.muli %y_288, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc160) + %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195) + %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc162) + %iright_297 = tt.broadcast %iright_296 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc163) + %ileft_298 = tt.reshape %ileft_293 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_299 = tt.reshape %iright_297 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_300 = tt.reshape %new_idxs_287 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc166) + %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<8x2x8xi32, #linear4> loc(#loc168) + %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198) + %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc170) + %left_idx_304 = tt.broadcast %left_idx_303 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc171) + %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<8x2x8xi32, #linear4> loc(#loc173) + %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201) + %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc175) + %right_idx_308 = tt.broadcast %right_idx_307 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc176) + %left_idx_309 = tt.reshape %left_idx_304 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_310 = tt.reshape %right_idx_308 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_314 = arith.andi %eq_312, %cond_313 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_315 = arith.ori %cond_311, %cond_314 : tensor<8x16xi1, #linear> loc(#loc183) + %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_318 = arith.xori %ret_284, %ret_317 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<8x16xi32, #linear> loc(#loc191) + %y_322 = tt.reshape %ret_318 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc154) + %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc156) + %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc158) + %ileft_326 = tt.broadcast %ileft_325 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc159) + %iright_327 = arith.muli %y_322, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc160) + %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc162) + %iright_330 = tt.broadcast %iright_329 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc163) + %ileft_331 = tt.reshape %ileft_326 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_332 = tt.reshape %iright_330 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_333 = tt.reshape %new_idxs_321 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc166) + %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<16x2x4xi32, #linear3> loc(#loc168) + %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc170) + %left_idx_337 = tt.broadcast %left_idx_336 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc171) + %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<16x2x4xi32, #linear3> loc(#loc173) + %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc175) + %right_idx_341 = tt.broadcast %right_idx_340 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc176) + %left_idx_342 = tt.reshape %left_idx_337 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_343 = tt.reshape %right_idx_341 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_347 = arith.andi %eq_345, %cond_346 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_348 = arith.ori %cond_344, %cond_347 : tensor<8x16xi1, #linear> loc(#loc183) + %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_351 = arith.xori %ret_318, %ret_350 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<8x16xi32, #linear> loc(#loc191) + %y_355 = tt.reshape %ret_351 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc154) + %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc156) + %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc158) + %ileft_359 = tt.broadcast %ileft_358 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc159) + %iright_360 = arith.muli %y_355, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc160) + %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc162) + %iright_363 = tt.broadcast %iright_362 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc163) + %ileft_364 = tt.reshape %ileft_359 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_365 = tt.reshape %iright_363 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_366 = tt.reshape %new_idxs_354 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc166) + %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<32x2x2xi32, #linear2> loc(#loc168) + %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc170) + %left_idx_370 = tt.broadcast %left_idx_369 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc171) + %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<32x2x2xi32, #linear2> loc(#loc173) + %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc175) + %right_idx_374 = tt.broadcast %right_idx_373 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc176) + %left_idx_375 = tt.reshape %left_idx_370 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_376 = tt.reshape %right_idx_374 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_380 = arith.andi %eq_378, %cond_379 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_381 = arith.ori %cond_377, %cond_380 : tensor<8x16xi1, #linear> loc(#loc183) + %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<8x16xi32, #linear> loc(#loc186) + %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc187) + %ret_384 = arith.xori %ret_351, %ret_383 : tensor<8x16xi32, #linear> loc(#loc188) + %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<8x16xi32, #linear> loc(#loc191) + %y_388 = tt.reshape %ret_384 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc154) + %ileft_389 = arith.muli %y_388, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc156) + %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc158) + %ileft_392 = tt.broadcast %ileft_391 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc159) + %iright_393 = arith.muli %y_388, %iright : tensor<64x2x1xi32, #linear1> loc(#loc160) + %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc162) + %iright_396 = tt.broadcast %iright_395 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc163) + %ileft_397 = tt.reshape %ileft_392 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc164) + %iright_398 = tt.reshape %iright_396 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc165) + %y_idx_399 = tt.reshape %new_idxs_387 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc166) + %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc168) + %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc170) + %left_idx_403 = tt.broadcast %left_idx_402 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc171) + %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<64x2x1xi32, #linear1> loc(#loc173) + %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc175) + %right_idx_407 = tt.broadcast %right_idx_406 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc176) + %left_idx_408 = tt.reshape %left_idx_403 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc177) + %right_idx_409 = tt.reshape %right_idx_407 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc178) + %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc179) + %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<8x16xi32, #linear> loc(#loc180) + %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc181) + %cond_413 = arith.andi %eq_411, %cond_412 : tensor<8x16xi1, #linear> loc(#loc182) + %cond_414 = arith.ori %cond_410, %cond_413 : tensor<8x16xi1, #linear> loc(#loc183) + %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<8x16xi32, #linear> loc(#loc189) + %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc190) + %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<8x16xi32, #linear> loc(#loc191) + %tmp7 = arith.extsi %tmp0_36 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc141) + %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc142) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))): + %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192) + tt.reduce.return %tmp11_421 : i64 loc(#loc152) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152) + %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc144) + %tmp14 = arith.trunci %tmp11_418 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc145) + %0 = arith.muli %xindex_19, %cst_4 : tensor<8x1xi32, #blocked1> loc(#loc70) + %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71) + %2 = tt.broadcast %0 : tensor<8x1xi32, #blocked1> -> tensor<8x16xi32, #blocked1> loc(#loc71) + %3 = arith.addi %1, %2 : tensor<8x16xi32, #blocked1> loc(#loc71) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked1> loc(#loc72) + %5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr, #blocked1>, tensor<8x16xi32, #blocked1> loc(#loc72) + %6 = ttg.convert_layout %new_idxs_417 : tensor<8x16xi32, #linear> -> tensor<8x16xi32, #blocked1> loc(#loc73) + tt.store %5, %6, %tmp0_35 : tensor<8x16x!tt.ptr, #blocked1> loc(#loc73) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc74) + %8 = tt.addptr %7, %xindex_18 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi32, #blocked> loc(#loc74) + tt.store %8, %tmp14, %xmask : tensor<8x1x!tt.ptr, #blocked> loc(#loc75) + tt.return loc(#loc76) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc82 = loc("xoffset"(#loc2)) +#loc83 = loc("xoffset"(#loc3)) +#loc84 = loc("xindex"(#loc4)) +#loc85 = loc("xindex"(#loc5)) +#loc86 = loc("xmask"(#loc6)) +#loc87 = loc("r0_index"(#loc7)) +#loc88 = loc("x0"(#loc8)) +#loc89 = loc("x1"(#loc9)) +#loc90 = loc("tmp0"(#loc10)) +#loc91 = loc("tmp0"(#loc11)) +#loc92 = loc("tmp0"(#loc12)) +#loc93 = loc("tmp0"(#loc13)) +#loc94 = loc("tmp0"(#loc14)) +#loc95 = loc("tmp0"(#loc15)) +#loc96 = loc("tmp2"(#loc16)) +#loc97 = loc("tmp4"(#loc17)) +#loc98 = loc("flip"(#loc18)) +#loc100 = loc("flip"(#loc21)) +#loc101 = loc("flip"(#loc22)) +#loc102 = loc("y"(#loc23)) +#loc103 = loc("left_mask"(#loc25)) +#loc104 = loc("ileft"(#loc26)) +#loc106 = loc("ileft"(#loc30)) +#loc107 = loc("ileft"(#loc31)) +#loc108 = loc("iright"(#loc32)) +#loc110 = loc("iright"(#loc34)) +#loc111 = loc("iright"(#loc35)) +#loc112 = loc("ileft"(#loc36)) +#loc113 = loc("iright"(#loc37)) +#loc114 = loc("y_idx"(#loc38)) +#loc115 = loc("left_idx"(#loc39)) +#loc116 = loc("left_idx"(#loc40)) +#loc117 = loc("input"(#loc41)) +#loc119 = loc("left_idx"(#loc43)) +#loc120 = loc("left_idx"(#loc44)) +#loc121 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc46)) +#loc124 = loc("right_idx"(#loc48)) +#loc125 = loc("right_idx"(#loc49)) +#loc126 = loc("left_idx"(#loc50)) +#loc127 = loc("right_idx"(#loc51)) +#loc128 = loc("cond"(#loc52)) +#loc129 = loc("eq"(#loc53)) +#loc130 = loc("cond"(#loc54)) +#loc131 = loc("cond"(#loc55)) +#loc132 = loc("cond"(#loc56)) +#loc133 = loc("cond"(#loc57)) +#loc134 = loc("cond"(#loc58)) +#loc135 = loc("ret"(#loc59)) +#loc136 = loc("ret"(#loc60)) +#loc137 = loc("ret"(#loc61)) +#loc138 = loc("new_idxs"(#loc62)) +#loc139 = loc("new_idxs"(#loc63)) +#loc140 = loc("new_idxs"(#loc64)) +#loc141 = loc("tmp7"(#loc65)) +#loc142 = loc("tmp10"(#loc66)) +#loc144 = loc("tmp11"(#loc68)) +#loc145 = loc("tmp14"(#loc69)) +#loc146 = loc(callsite(#loc98 at #loc99)) +#loc147 = loc(callsite(#loc100 at #loc99)) +#loc148 = loc(callsite(#loc101 at #loc99)) +#loc150 = loc("cond"(#loc128)) +#loc151 = loc("eq"(#loc129)) +#loc152 = loc(callsite(#loc27 at #loc143)) +#loc154 = loc(callsite(#loc102 at #loc149)) +#loc155 = loc(callsite(#loc103 at #loc149)) +#loc156 = loc(callsite(#loc104 at #loc149)) +#loc158 = loc(callsite(#loc106 at #loc149)) +#loc159 = loc(callsite(#loc107 at #loc149)) +#loc160 = loc(callsite(#loc108 at #loc149)) +#loc162 = loc(callsite(#loc110 at #loc149)) +#loc163 = loc(callsite(#loc111 at #loc149)) +#loc164 = loc(callsite(#loc112 at #loc149)) +#loc165 = loc(callsite(#loc113 at #loc149)) +#loc166 = loc(callsite(#loc114 at #loc149)) +#loc167 = loc(callsite(#loc115 at #loc149)) +#loc168 = loc(callsite(#loc116 at #loc149)) +#loc170 = loc(callsite(#loc119 at #loc149)) +#loc171 = loc(callsite(#loc120 at #loc149)) +#loc172 = loc(callsite(#loc121 at #loc149)) +#loc173 = loc(callsite(#loc122 at #loc149)) +#loc175 = loc(callsite(#loc124 at #loc149)) +#loc176 = loc(callsite(#loc125 at #loc149)) +#loc177 = loc(callsite(#loc126 at #loc149)) +#loc178 = loc(callsite(#loc127 at #loc149)) +#loc179 = loc(callsite(#loc150 at #loc149)) +#loc180 = loc(callsite(#loc151 at #loc149)) +#loc181 = loc(callsite(#loc130 at #loc149)) +#loc182 = loc(callsite(#loc131 at #loc149)) +#loc183 = loc(callsite(#loc132 at #loc149)) +#loc184 = loc(callsite(#loc133 at #loc149)) +#loc185 = loc(callsite(#loc134 at #loc149)) +#loc186 = loc(callsite(#loc135 at #loc149)) +#loc187 = loc(callsite(#loc136 at #loc149)) +#loc188 = loc(callsite(#loc137 at #loc149)) +#loc189 = loc(callsite(#loc138 at #loc149)) +#loc190 = loc(callsite(#loc139 at #loc149)) +#loc191 = loc(callsite(#loc140 at #loc149)) +#loc192 = loc(callsite(#loc29 at #loc152)) +#loc193 = loc(callsite(#loc27 at #loc157)) +#loc195 = loc(callsite(#loc27 at #loc161)) +#loc197 = loc(callsite(#loc117 at #loc169)) +#loc198 = loc(callsite(#loc27 at #loc169)) +#loc200 = loc(callsite(#loc117 at #loc174)) +#loc201 = loc(callsite(#loc27 at #loc174)) +#loc203 = loc(callsite(#loc29 at #loc193)) +#loc204 = loc(callsite(#loc29 at #loc195)) +#loc205 = loc(callsite(#loc29 at #loc198)) +#loc206 = loc(callsite(#loc29 at #loc201)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..062b1eb5cde0fc72eb019723de759fe9004cb05f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3T44HVMODVD44VQT2XIM5LORTLI6GMS6N6NVE3J3FK6USYBEBANA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,799 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("out_ptr3"(#loc)) +#loc83 = loc("xnumel"(#loc)) +#loc84 = loc("r0_numel"(#loc)) +#loc106 = loc(callsite(#loc23 at #loc2)) +#loc113 = loc("ileft"(#loc32)) +#loc117 = loc("iright"(#loc37)) +#loc126 = loc("left_idx"(#loc46)) +#loc131 = loc("right_idx"(#loc51)) +#loc150 = loc("tmp11"(#loc70)) +#loc157 = loc(callsite(#loc28 at #loc106)) +#loc161 = loc(callsite(#loc1 at #loc150)) +#loc165 = loc(callsite(#loc113 at #loc157)) +#loc169 = loc(callsite(#loc117 at #loc157)) +#loc177 = loc(callsite(#loc126 at #loc157)) +#loc182 = loc(callsite(#loc131 at #loc157)) +#loc202 = loc(callsite(#loc1 at #loc165)) +#loc204 = loc(callsite(#loc1 at #loc169)) +#loc207 = loc(callsite(#loc1 at #loc177)) +#loc210 = loc(callsite(#loc1 at #loc182)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85) + %cst_0 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc86) + %tmp0 = arith.constant dense<272> : tensor<8x1xi32> loc(#loc87) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88) + %cst_2 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc89) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc91) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc92) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc93) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc94) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc94) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc89) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96) + %x0 = arith.remsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc97) + %x1 = arith.divsi %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc98) + %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88) + %tmp0_10 = tt.broadcast %x0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc99) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc99) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x16xi32> loc(#loc99) + %tmp0_13 = arith.muli %x1, %tmp0 : tensor<8x1xi32> loc(#loc87) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc100) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x16xi32> loc(#loc100) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc101) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc101) + %tmp0_18 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc102) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<8x16x!tt.ptr> loc(#loc102) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc104) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc155) + %flip_23 = tt.reshape %flip_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc156) + %y = tt.reshape %tmp0_19 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc164) + %ileft_24 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc164) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166) + %ileft_27 = tt.broadcast %ileft_26 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc168) + %iright_28 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc168) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170) + %iright_31 = tt.broadcast %iright_30 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171) + %ileft_32 = tt.reshape %ileft_27 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_33 = tt.reshape %iright_31 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx = tt.reshape %tmp4 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc174) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc176) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<64x2x1xi16> loc(#loc176) + %input = arith.extsi %left_idx_35 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc205) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc181) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<64x2x1xi16> loc(#loc181) + %input_41 = arith.extsi %right_idx_40 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc208) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc187) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc188) + %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc189) + %cond_48 = arith.andi %eq, %cond_47 : tensor<8x16xi1> loc(#loc190) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc191) + %cond_50 = arith.extui %cond_49 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<8x16xi32> loc(#loc192) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret = arith.xori %ileft_32, %iright_33 : tensor<8x16xi32> loc(#loc194) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<8x16xi32> loc(#loc196) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<8x16xi32> loc(#loc197) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199) + %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc199) + %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<8x16xi32> loc(#loc199) + %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc155) + %flip_60 = tt.reshape %flip_59 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc156) + %y_61 = tt.reshape %ret_54 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162) + %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc164) + %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<32x2x2xi32> loc(#loc164) + %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201) + %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166) + %ileft_66 = tt.broadcast %ileft_65 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167) + %iright_67 = arith.muli %y_61, %flip_22 : tensor<32x2x2xi32> loc(#loc168) + %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203) + %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170) + %iright_70 = tt.broadcast %iright_69 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171) + %ileft_71 = tt.reshape %ileft_66 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_72 = tt.reshape %iright_70 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_73 = tt.reshape %new_idxs_58 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174) + %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<32x2x2xi32> loc(#loc176) + %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206) + %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178) + %left_idx_77 = tt.broadcast %left_idx_76 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179) + %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<32x2x2xi32> loc(#loc181) + %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209) + %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183) + %right_idx_81 = tt.broadcast %right_idx_80 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184) + %left_idx_82 = tt.reshape %left_idx_77 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_83 = tt.reshape %right_idx_81 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc187) + %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc188) + %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc189) + %cond_87 = arith.andi %eq_85, %cond_86 : tensor<8x16xi1> loc(#loc190) + %cond_88 = arith.ori %cond_84, %cond_87 : tensor<8x16xi1> loc(#loc191) + %cond_89 = arith.extui %cond_88 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_90 = arith.xori %cond_89, %flip_60 : tensor<8x16xi32> loc(#loc192) + %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<8x16xi32> loc(#loc194) + %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_94 = arith.xori %ret_54, %ret_93 : tensor<8x16xi32> loc(#loc196) + %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<8x16xi32> loc(#loc197) + %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<8x16xi32> loc(#loc199) + %y_98 = tt.reshape %ret_94 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162) + %ileft_99 = arith.muli %y_98, %ileft : tensor<64x2x1xi32> loc(#loc164) + %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201) + %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166) + %ileft_102 = tt.broadcast %ileft_101 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167) + %iright_103 = arith.muli %y_98, %iright : tensor<64x2x1xi32> loc(#loc168) + %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203) + %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170) + %iright_106 = tt.broadcast %iright_105 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171) + %ileft_107 = tt.reshape %ileft_102 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_108 = tt.reshape %iright_106 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_109 = tt.reshape %new_idxs_97 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174) + %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<64x2x1xi32> loc(#loc176) + %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206) + %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178) + %left_idx_113 = tt.broadcast %left_idx_112 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179) + %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<64x2x1xi32> loc(#loc181) + %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209) + %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183) + %right_idx_117 = tt.broadcast %right_idx_116 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184) + %left_idx_118 = tt.reshape %left_idx_113 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_119 = tt.reshape %right_idx_117 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc187) + %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc188) + %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc189) + %cond_123 = arith.andi %eq_121, %cond_122 : tensor<8x16xi1> loc(#loc190) + %cond_124 = arith.ori %cond_120, %cond_123 : tensor<8x16xi1> loc(#loc191) + %cond_125 = arith.extui %cond_124 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_126 = arith.xori %cond_125, %flip_60 : tensor<8x16xi32> loc(#loc192) + %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<8x16xi32> loc(#loc194) + %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_130 = arith.xori %ret_94, %ret_129 : tensor<8x16xi32> loc(#loc196) + %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<8x16xi32> loc(#loc197) + %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<8x16xi32> loc(#loc199) + %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc155) + %flip_135 = tt.reshape %flip_134 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc156) + %y_136 = tt.reshape %ret_130 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162) + %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc164) + %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<16x2x4xi32> loc(#loc164) + %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201) + %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166) + %ileft_141 = tt.broadcast %ileft_140 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167) + %iright_142 = arith.muli %y_136, %flip_59 : tensor<16x2x4xi32> loc(#loc168) + %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203) + %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170) + %iright_145 = tt.broadcast %iright_144 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171) + %ileft_146 = tt.reshape %ileft_141 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_147 = tt.reshape %iright_145 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_148 = tt.reshape %new_idxs_133 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174) + %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<16x2x4xi32> loc(#loc176) + %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206) + %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178) + %left_idx_152 = tt.broadcast %left_idx_151 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179) + %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<16x2x4xi32> loc(#loc181) + %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209) + %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183) + %right_idx_156 = tt.broadcast %right_idx_155 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184) + %left_idx_157 = tt.reshape %left_idx_152 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_158 = tt.reshape %right_idx_156 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc187) + %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc188) + %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc189) + %cond_162 = arith.andi %eq_160, %cond_161 : tensor<8x16xi1> loc(#loc190) + %cond_163 = arith.ori %cond_159, %cond_162 : tensor<8x16xi1> loc(#loc191) + %cond_164 = arith.extui %cond_163 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_165 = arith.xori %cond_164, %flip_135 : tensor<8x16xi32> loc(#loc192) + %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<8x16xi32> loc(#loc194) + %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_169 = arith.xori %ret_130, %ret_168 : tensor<8x16xi32> loc(#loc196) + %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<8x16xi32> loc(#loc197) + %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<8x16xi32> loc(#loc199) + %y_173 = tt.reshape %ret_169 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162) + %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<32x2x2xi32> loc(#loc164) + %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201) + %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166) + %ileft_177 = tt.broadcast %ileft_176 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167) + %iright_178 = arith.muli %y_173, %flip_22 : tensor<32x2x2xi32> loc(#loc168) + %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203) + %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170) + %iright_181 = tt.broadcast %iright_180 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171) + %ileft_182 = tt.reshape %ileft_177 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_183 = tt.reshape %iright_181 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_184 = tt.reshape %new_idxs_172 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174) + %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<32x2x2xi32> loc(#loc176) + %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206) + %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178) + %left_idx_188 = tt.broadcast %left_idx_187 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179) + %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<32x2x2xi32> loc(#loc181) + %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209) + %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183) + %right_idx_192 = tt.broadcast %right_idx_191 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184) + %left_idx_193 = tt.reshape %left_idx_188 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_194 = tt.reshape %right_idx_192 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc187) + %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc188) + %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc189) + %cond_198 = arith.andi %eq_196, %cond_197 : tensor<8x16xi1> loc(#loc190) + %cond_199 = arith.ori %cond_195, %cond_198 : tensor<8x16xi1> loc(#loc191) + %cond_200 = arith.extui %cond_199 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_201 = arith.xori %cond_200, %flip_135 : tensor<8x16xi32> loc(#loc192) + %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<8x16xi32> loc(#loc194) + %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_205 = arith.xori %ret_169, %ret_204 : tensor<8x16xi32> loc(#loc196) + %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<8x16xi32> loc(#loc197) + %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<8x16xi32> loc(#loc199) + %y_209 = tt.reshape %ret_205 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162) + %ileft_210 = arith.muli %y_209, %ileft : tensor<64x2x1xi32> loc(#loc164) + %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201) + %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166) + %ileft_213 = tt.broadcast %ileft_212 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167) + %iright_214 = arith.muli %y_209, %iright : tensor<64x2x1xi32> loc(#loc168) + %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203) + %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170) + %iright_217 = tt.broadcast %iright_216 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171) + %ileft_218 = tt.reshape %ileft_213 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_219 = tt.reshape %iright_217 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_220 = tt.reshape %new_idxs_208 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174) + %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<64x2x1xi32> loc(#loc176) + %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206) + %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178) + %left_idx_224 = tt.broadcast %left_idx_223 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179) + %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<64x2x1xi32> loc(#loc181) + %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209) + %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183) + %right_idx_228 = tt.broadcast %right_idx_227 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184) + %left_idx_229 = tt.reshape %left_idx_224 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_230 = tt.reshape %right_idx_228 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc187) + %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc188) + %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc189) + %cond_234 = arith.andi %eq_232, %cond_233 : tensor<8x16xi1> loc(#loc190) + %cond_235 = arith.ori %cond_231, %cond_234 : tensor<8x16xi1> loc(#loc191) + %cond_236 = arith.extui %cond_235 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc192) + %cond_237 = arith.xori %cond_236, %flip_135 : tensor<8x16xi32> loc(#loc192) + %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<8x16xi32> loc(#loc193) + %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<8x16xi32> loc(#loc194) + %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_241 = arith.xori %ret_205, %ret_240 : tensor<8x16xi32> loc(#loc196) + %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<8x16xi32> loc(#loc197) + %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<8x16xi32> loc(#loc199) + %y_245 = tt.reshape %ret_241 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc162) + %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc164) + %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<8x2x8xi32> loc(#loc164) + %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc201) + %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc166) + %ileft_250 = tt.broadcast %ileft_249 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc167) + %iright_251 = arith.muli %y_245, %flip_134 : tensor<8x2x8xi32> loc(#loc168) + %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc203) + %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc170) + %iright_254 = tt.broadcast %iright_253 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc171) + %ileft_255 = tt.reshape %ileft_250 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_256 = tt.reshape %iright_254 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_257 = tt.reshape %new_idxs_244 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc174) + %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<8x2x8xi32> loc(#loc176) + %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc206) + %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc178) + %left_idx_261 = tt.broadcast %left_idx_260 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc179) + %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<8x2x8xi32> loc(#loc181) + %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc209) + %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc183) + %right_idx_265 = tt.broadcast %right_idx_264 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc184) + %left_idx_266 = tt.reshape %left_idx_261 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_267 = tt.reshape %right_idx_265 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc187) + %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc188) + %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc189) + %cond_271 = arith.andi %eq_269, %cond_270 : tensor<8x16xi1> loc(#loc190) + %cond_272 = arith.ori %cond_268, %cond_271 : tensor<8x16xi1> loc(#loc191) + %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<8x16xi32> loc(#loc194) + %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_275 = arith.xori %ret_241, %ret_274 : tensor<8x16xi32> loc(#loc196) + %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<8x16xi32> loc(#loc197) + %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<8x16xi32> loc(#loc199) + %y_279 = tt.reshape %ret_275 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc162) + %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<16x2x4xi32> loc(#loc164) + %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc201) + %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc166) + %ileft_283 = tt.broadcast %ileft_282 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc167) + %iright_284 = arith.muli %y_279, %flip_59 : tensor<16x2x4xi32> loc(#loc168) + %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc203) + %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc170) + %iright_287 = tt.broadcast %iright_286 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc171) + %ileft_288 = tt.reshape %ileft_283 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_289 = tt.reshape %iright_287 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_290 = tt.reshape %new_idxs_278 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc174) + %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<16x2x4xi32> loc(#loc176) + %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc206) + %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc178) + %left_idx_294 = tt.broadcast %left_idx_293 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc179) + %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<16x2x4xi32> loc(#loc181) + %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc209) + %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc183) + %right_idx_298 = tt.broadcast %right_idx_297 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc184) + %left_idx_299 = tt.reshape %left_idx_294 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_300 = tt.reshape %right_idx_298 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc187) + %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc188) + %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc189) + %cond_304 = arith.andi %eq_302, %cond_303 : tensor<8x16xi1> loc(#loc190) + %cond_305 = arith.ori %cond_301, %cond_304 : tensor<8x16xi1> loc(#loc191) + %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<8x16xi32> loc(#loc194) + %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_308 = arith.xori %ret_275, %ret_307 : tensor<8x16xi32> loc(#loc196) + %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<8x16xi32> loc(#loc197) + %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<8x16xi32> loc(#loc199) + %y_312 = tt.reshape %ret_308 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc162) + %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<32x2x2xi32> loc(#loc164) + %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc201) + %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc166) + %ileft_316 = tt.broadcast %ileft_315 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc167) + %iright_317 = arith.muli %y_312, %flip_22 : tensor<32x2x2xi32> loc(#loc168) + %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc203) + %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc170) + %iright_320 = tt.broadcast %iright_319 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc171) + %ileft_321 = tt.reshape %ileft_316 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_322 = tt.reshape %iright_320 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_323 = tt.reshape %new_idxs_311 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc174) + %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<32x2x2xi32> loc(#loc176) + %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc206) + %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc178) + %left_idx_327 = tt.broadcast %left_idx_326 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc179) + %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<32x2x2xi32> loc(#loc181) + %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc209) + %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc183) + %right_idx_331 = tt.broadcast %right_idx_330 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc184) + %left_idx_332 = tt.reshape %left_idx_327 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_333 = tt.reshape %right_idx_331 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc187) + %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc188) + %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc189) + %cond_337 = arith.andi %eq_335, %cond_336 : tensor<8x16xi1> loc(#loc190) + %cond_338 = arith.ori %cond_334, %cond_337 : tensor<8x16xi1> loc(#loc191) + %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<8x16xi32> loc(#loc194) + %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc195) + %ret_341 = arith.xori %ret_308, %ret_340 : tensor<8x16xi32> loc(#loc196) + %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<8x16xi32> loc(#loc197) + %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<8x16xi32> loc(#loc199) + %y_345 = tt.reshape %ret_341 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc162) + %ileft_346 = arith.muli %y_345, %ileft : tensor<64x2x1xi32> loc(#loc164) + %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc201) + %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc166) + %ileft_349 = tt.broadcast %ileft_348 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc167) + %iright_350 = arith.muli %y_345, %iright : tensor<64x2x1xi32> loc(#loc168) + %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc203) + %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc170) + %iright_353 = tt.broadcast %iright_352 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc171) + %ileft_354 = tt.reshape %ileft_349 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc172) + %iright_355 = tt.reshape %iright_353 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc173) + %y_idx_356 = tt.reshape %new_idxs_344 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc174) + %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<64x2x1xi32> loc(#loc176) + %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc206) + %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc178) + %left_idx_360 = tt.broadcast %left_idx_359 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc179) + %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<64x2x1xi32> loc(#loc181) + %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc209) + %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc183) + %right_idx_364 = tt.broadcast %right_idx_363 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc184) + %left_idx_365 = tt.reshape %left_idx_360 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc185) + %right_idx_366 = tt.reshape %right_idx_364 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc186) + %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc187) + %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<8x16xi32> loc(#loc188) + %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc189) + %cond_370 = arith.andi %eq_368, %cond_369 : tensor<8x16xi1> loc(#loc190) + %cond_371 = arith.ori %cond_367, %cond_370 : tensor<8x16xi1> loc(#loc191) + %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<8x16xi32> loc(#loc197) + %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc198) + %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<8x16xi32> loc(#loc199) + %tmp7 = arith.extsi %tmp0_19 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc149) + %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc86) + %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))): + %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200) + tt.reduce.return %tmp11_379 : i64 loc(#loc160) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc160) + %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc151) + %tmp14 = arith.trunci %tmp11_376 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc152) + %0 = arith.muli %xindex_6, %cst_2 : tensor<8x1xi32> loc(#loc73) + %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc74) + %2 = tt.broadcast %0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc74) + %3 = arith.addi %1, %2 : tensor<8x16xi32> loc(#loc74) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc75) + %5 = tt.addptr %4, %3 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc75) + tt.store %5, %new_idxs_374, %tmp0_18 : tensor<8x16x!tt.ptr> loc(#loc76) + %6 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc77) + %7 = tt.addptr %6, %xindex_6 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc77) + tt.store %7, %tmp14, %xmask_7 : tensor<8x1x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc85 = loc(callsite(#loc1 at #loc2)) +#loc86 = loc("tmp10"(#loc3)) +#loc87 = loc("tmp0"(#loc4)) +#loc88 = loc("tmp0"(#loc5)) +#loc89 = loc("xmask"(#loc6)) +#loc90 = loc("xoffset"(#loc7)) +#loc91 = loc("xoffset"(#loc8)) +#loc92 = loc("xindex"(#loc9)) +#loc93 = loc("xindex"(#loc10)) +#loc94 = loc("xindex"(#loc11)) +#loc95 = loc("r0_index"(#loc12)) +#loc96 = loc("r0_index"(#loc13)) +#loc97 = loc("x0"(#loc14)) +#loc98 = loc("x1"(#loc15)) +#loc99 = loc("tmp0"(#loc16)) +#loc100 = loc("tmp0"(#loc17)) +#loc101 = loc("tmp0"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp2"(#loc20)) +#loc104 = loc("tmp4"(#loc21)) +#loc105 = loc("flip"(#loc22)) +#loc107 = loc("flip"(#loc24)) +#loc108 = loc("flip"(#loc25)) +#loc109 = loc("flip"(#loc26)) +#loc110 = loc("y"(#loc27)) +#loc111 = loc("left_mask"(#loc29)) +#loc112 = loc("ileft"(#loc30)) +#loc114 = loc("ileft"(#loc34)) +#loc115 = loc("ileft"(#loc35)) +#loc116 = loc("iright"(#loc36)) +#loc118 = loc("iright"(#loc38)) +#loc119 = loc("iright"(#loc39)) +#loc120 = loc("ileft"(#loc40)) +#loc121 = loc("iright"(#loc41)) +#loc122 = loc("y_idx"(#loc42)) +#loc123 = loc("left_idx"(#loc43)) +#loc124 = loc("left_idx"(#loc44)) +#loc125 = loc("input"(#loc45)) +#loc127 = loc("left_idx"(#loc47)) +#loc128 = loc("left_idx"(#loc48)) +#loc129 = loc("right_idx"(#loc49)) +#loc130 = loc("right_idx"(#loc50)) +#loc132 = loc("right_idx"(#loc52)) +#loc133 = loc("right_idx"(#loc53)) +#loc134 = loc("left_idx"(#loc54)) +#loc135 = loc("right_idx"(#loc55)) +#loc136 = loc("cond"(#loc56)) +#loc137 = loc("eq"(#loc57)) +#loc138 = loc("cond"(#loc58)) +#loc139 = loc("cond"(#loc59)) +#loc140 = loc("cond"(#loc60)) +#loc141 = loc("cond"(#loc61)) +#loc142 = loc("cond"(#loc62)) +#loc143 = loc("ret"(#loc63)) +#loc144 = loc("ret"(#loc64)) +#loc145 = loc("ret"(#loc65)) +#loc146 = loc("new_idxs"(#loc66)) +#loc147 = loc("new_idxs"(#loc67)) +#loc148 = loc("new_idxs"(#loc68)) +#loc149 = loc("tmp7"(#loc69)) +#loc151 = loc("tmp11"(#loc71)) +#loc152 = loc("tmp14"(#loc72)) +#loc153 = loc(callsite(#loc105 at #loc106)) +#loc154 = loc(callsite(#loc107 at #loc106)) +#loc155 = loc(callsite(#loc108 at #loc106)) +#loc156 = loc(callsite(#loc109 at #loc106)) +#loc158 = loc("cond"(#loc136)) +#loc159 = loc("eq"(#loc137)) +#loc160 = loc(callsite(#loc31 at #loc150)) +#loc162 = loc(callsite(#loc110 at #loc157)) +#loc163 = loc(callsite(#loc111 at #loc157)) +#loc164 = loc(callsite(#loc112 at #loc157)) +#loc166 = loc(callsite(#loc114 at #loc157)) +#loc167 = loc(callsite(#loc115 at #loc157)) +#loc168 = loc(callsite(#loc116 at #loc157)) +#loc170 = loc(callsite(#loc118 at #loc157)) +#loc171 = loc(callsite(#loc119 at #loc157)) +#loc172 = loc(callsite(#loc120 at #loc157)) +#loc173 = loc(callsite(#loc121 at #loc157)) +#loc174 = loc(callsite(#loc122 at #loc157)) +#loc175 = loc(callsite(#loc123 at #loc157)) +#loc176 = loc(callsite(#loc124 at #loc157)) +#loc178 = loc(callsite(#loc127 at #loc157)) +#loc179 = loc(callsite(#loc128 at #loc157)) +#loc180 = loc(callsite(#loc129 at #loc157)) +#loc181 = loc(callsite(#loc130 at #loc157)) +#loc183 = loc(callsite(#loc132 at #loc157)) +#loc184 = loc(callsite(#loc133 at #loc157)) +#loc185 = loc(callsite(#loc134 at #loc157)) +#loc186 = loc(callsite(#loc135 at #loc157)) +#loc187 = loc(callsite(#loc158 at #loc157)) +#loc188 = loc(callsite(#loc159 at #loc157)) +#loc189 = loc(callsite(#loc138 at #loc157)) +#loc190 = loc(callsite(#loc139 at #loc157)) +#loc191 = loc(callsite(#loc140 at #loc157)) +#loc192 = loc(callsite(#loc141 at #loc157)) +#loc193 = loc(callsite(#loc142 at #loc157)) +#loc194 = loc(callsite(#loc143 at #loc157)) +#loc195 = loc(callsite(#loc144 at #loc157)) +#loc196 = loc(callsite(#loc145 at #loc157)) +#loc197 = loc(callsite(#loc146 at #loc157)) +#loc198 = loc(callsite(#loc147 at #loc157)) +#loc199 = loc(callsite(#loc148 at #loc157)) +#loc200 = loc(callsite(#loc33 at #loc160)) +#loc201 = loc(callsite(#loc31 at #loc165)) +#loc203 = loc(callsite(#loc31 at #loc169)) +#loc205 = loc(callsite(#loc125 at #loc177)) +#loc206 = loc(callsite(#loc31 at #loc177)) +#loc208 = loc(callsite(#loc125 at #loc182)) +#loc209 = loc(callsite(#loc31 at #loc182)) +#loc211 = loc(callsite(#loc33 at #loc201)) +#loc212 = loc(callsite(#loc33 at #loc203)) +#loc213 = loc(callsite(#loc33 at #loc206)) +#loc214 = loc(callsite(#loc33 at #loc209)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..80172ad60d41ac09070823ab076db8e4440b1bf3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..eb419d056b04d11f22360fbc206d902222378d1e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..949d73949091e0aa1e18783929511149e2106e7b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "dce51101d74002d90b1fe021a32b8040ce453636ab0b0ab209427e2e72d803f0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..a61ce89ae67fb870abd174d63a5fd5d7e89c7cf9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.llir @@ -0,0 +1,611 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 3, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 384, !dbg !9 + %11 = lshr exact i32 %10, 7, !dbg !9 + %12 = or disjoint i32 %11, 4, !dbg !9 + %13 = or disjoint i32 %11, %8, !dbg !10 + %14 = or disjoint i32 %12, %8, !dbg !10 + %15 = shl nuw nsw i32 %9, 2, !dbg !11 + %16 = and i32 %15, 508, !dbg !11 + %17 = sdiv i32 %13, 2048, !dbg !12 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %13, 32000 + %20 = mul i32 %17, 224000 + %21 = add i32 %20, %19 + %22 = mul i32 %14, 32000 + %23 = mul i32 %18, 224000 + %24 = add i32 %23, %22 + %25 = zext nneg i32 %16 to i64, !dbg !13 + br label %26, !dbg !13 + +26: ; preds = %6, %26 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %26 ] + %27 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %126, %26 ] + %28 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %131, %26 ] + %29 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %127, %26 ] + %30 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %132, %26 ] + %31 = phi <4 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %130, %26 ] + %32 = phi <4 x i32> [ splat (i32 2147483647), %6 ], [ %133, %26 ] + %33 = or disjoint i64 %indvars.iv, %25, !dbg !14 + %34 = icmp samesign ult i64 %33, 32000, !dbg !15 + %35 = trunc nuw nsw i64 %33 to i32, !dbg !16 + %36 = add i32 %21, %35, !dbg !16 + %37 = add i32 %24, %35, !dbg !16 + %38 = sext i32 %36 to i64, !dbg !17 + %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !17 + %40 = sext i32 %37 to i64, !dbg !17 + %41 = getelementptr float, ptr addrspace(1) %0, i64 %40, !dbg !17 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18 + %43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i64 %42, i1 %34) #4, !dbg !18 + %44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18 + %45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18 + %46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18 + %47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18 + %48 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18 + %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %41, i64 %48, i1 %34) #4, !dbg !18 + %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !18 + %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !18 + %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !18 + %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !18 + %54 = fcmp uno <2 x float> %27, zeroinitializer, !dbg !19 + %55 = fcmp uno <4 x float> %31, zeroinitializer, !dbg !19 + %56 = fcmp uno <2 x float> %29, zeroinitializer, !dbg !19 + %57 = sext <2 x i32> %28 to <2 x i64>, !dbg !23 + %58 = sext <2 x i32> %30 to <2 x i64>, !dbg !23 + %59 = trunc nuw nsw i64 %33 to i32, !dbg !24 + %60 = or disjoint i32 %59, 1, !dbg !24 + %61 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !18 + %62 = insertelement <2 x i32> %61, i32 %45, i64 1, !dbg !18 + %63 = bitcast <2 x i32> %62 to <2 x float>, !dbg !18 + %64 = fcmp ogt <2 x float> %27, %63, !dbg !25 + %65 = fcmp oeq <2 x float> %27, %63, !dbg !26 + %66 = fcmp uno <2 x float> %63, zeroinitializer, !dbg !27 + %67 = xor <2 x i1> %66, splat (i1 true), !dbg !28 + %68 = and <2 x i1> %54, %67, !dbg !29 + %69 = or <2 x i1> %64, %68, !dbg !30 + %70 = and <2 x i1> %54, %66, !dbg !31 + %71 = or <2 x i1> %65, %70, !dbg !32 + %72 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !23 + %73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23 + %74 = icmp sgt <2 x i64> %73, %57, !dbg !23 + %75 = icmp sge <2 x i64> %73, %57, !dbg !23 + %76 = shufflevector <2 x i1> %74, <2 x i1> %75, <2 x i32> , !dbg !23 + %77 = and <2 x i1> %76, %71, !dbg !33 + %78 = or <2 x i1> %69, %77, !dbg !34 + %79 = select <2 x i1> %78, <2 x float> %27, <2 x float> %63, !dbg !35 + %80 = insertelement <2 x i32> poison, i32 %35, i64 0, !dbg !24 + %81 = insertelement <2 x i32> %80, i32 %60, i64 1, !dbg !24 + %82 = select <2 x i1> %78, <2 x i32> %28, <2 x i32> %81, !dbg !24 + %83 = insertelement <2 x i32> poison, i32 %50, i64 0, !dbg !18 + %84 = insertelement <2 x i32> %83, i32 %51, i64 1, !dbg !18 + %85 = bitcast <2 x i32> %84 to <2 x float>, !dbg !18 + %86 = fcmp ogt <2 x float> %29, %85, !dbg !25 + %87 = fcmp oeq <2 x float> %29, %85, !dbg !26 + %88 = fcmp uno <2 x float> %85, zeroinitializer, !dbg !27 + %89 = xor <2 x i1> %88, splat (i1 true), !dbg !28 + %90 = and <2 x i1> %56, %89, !dbg !29 + %91 = or <2 x i1> %86, %90, !dbg !30 + %92 = and <2 x i1> %56, %88, !dbg !31 + %93 = or <2 x i1> %87, %92, !dbg !32 + %94 = icmp sgt <2 x i64> %73, %58, !dbg !23 + %95 = icmp sge <2 x i64> %73, %58, !dbg !23 + %96 = shufflevector <2 x i1> %94, <2 x i1> %95, <2 x i32> , !dbg !23 + %97 = and <2 x i1> %96, %93, !dbg !33 + %98 = or <2 x i1> %91, %97, !dbg !34 + %99 = select <2 x i1> %98, <2 x float> %29, <2 x float> %85, !dbg !35 + %100 = select <2 x i1> %98, <2 x i32> %30, <2 x i32> %81, !dbg !24 + %101 = or disjoint <2 x i64> %73, , !dbg !14 + %102 = shufflevector <2 x i64> %101, <2 x i64> poison, <4 x i32> , !dbg !14 + %103 = insertelement <4 x i32> poison, i32 %53, i64 0, !dbg !18 + %104 = insertelement <4 x i32> %103, i32 %52, i64 1, !dbg !18 + %105 = insertelement <4 x i32> %104, i32 %47, i64 2, !dbg !18 + %106 = insertelement <4 x i32> %105, i32 %46, i64 3, !dbg !18 + %107 = bitcast <4 x i32> %106 to <4 x float>, !dbg !18 + %108 = fcmp ogt <4 x float> %31, %107, !dbg !25 + %109 = fcmp oeq <4 x float> %31, %107, !dbg !26 + %110 = fcmp uno <4 x float> %107, zeroinitializer, !dbg !27 + %111 = xor <4 x i1> %110, splat (i1 true), !dbg !28 + %112 = and <4 x i1> %55, %111, !dbg !29 + %113 = or <4 x i1> %108, %112, !dbg !30 + %114 = and <4 x i1> %55, %110, !dbg !31 + %115 = or <4 x i1> %109, %114, !dbg !32 + %116 = sext <4 x i32> %32 to <4 x i64>, !dbg !23 + %117 = icmp sgt <4 x i64> %102, %116, !dbg !23 + %118 = and <4 x i1> %117, %115, !dbg !33 + %119 = or <4 x i1> %113, %118, !dbg !34 + %120 = select <4 x i1> %119, <4 x float> %31, <4 x float> %107, !dbg !35 + %121 = trunc <2 x i64> %101 to <2 x i32>, !dbg !24 + %122 = shufflevector <2 x i32> %121, <2 x i32> poison, <4 x i32> , !dbg !24 + %123 = select <4 x i1> %119, <4 x i32> %32, <4 x i32> %122, !dbg !24 + %124 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !36 + %125 = shufflevector <2 x i1> %124, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !36 + %126 = select <2 x i1> %125, <2 x float> %79, <2 x float> %27, !dbg !36 + %127 = select <2 x i1> %125, <2 x float> %99, <2 x float> %29, !dbg !36 + %128 = insertelement <4 x i1> poison, i1 %34, i64 0, !dbg !36 + %129 = shufflevector <4 x i1> %128, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !36 + %130 = select <4 x i1> %129, <4 x float> %120, <4 x float> %31, !dbg !36 + %131 = select <2 x i1> %125, <2 x i32> %82, <2 x i32> %28, !dbg !37 + %132 = select <2 x i1> %125, <2 x i32> %100, <2 x i32> %30, !dbg !37 + %133 = select <4 x i1> %129, <4 x i32> %123, <4 x i32> %32, !dbg !37 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 512, !dbg !13 + %134 = icmp samesign ult i64 %indvars.iv, 31488, !dbg !13 + br i1 %134, label %26, label %135, !dbg !13 + +135: ; preds = %26 + %136 = and i32 %9, 7, !dbg !9 + %137 = or disjoint i32 %8, %136, !dbg !10 + %138 = and i32 %9, 31, !dbg !9 + %139 = lshr i32 %9, 5, !dbg !9 + %140 = shufflevector <2 x float> %126, <2 x float> poison, <2 x i32> , !dbg !38 + %141 = fcmp ogt <2 x float> %126, %140, !dbg !38 + %142 = fcmp oeq <2 x float> %126, %140, !dbg !38 + %143 = shufflevector <2 x i1> %141, <2 x i1> %142, <2 x i32> , !dbg !38 + %144 = extractelement <2 x float> %126, i64 0, !dbg !40 + %145 = fcmp uno float %144, 0.000000e+00, !dbg !40 + %146 = extractelement <2 x float> %126, i64 1, !dbg !41 + %147 = fcmp uno float %146, 0.000000e+00, !dbg !41 + %148 = xor i1 %147, true, !dbg !42 + %149 = insertelement <2 x i1> poison, i1 %145, i64 0, !dbg !43 + %150 = shufflevector <2 x i1> %149, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43 + %151 = insertelement <2 x i1> poison, i1 %148, i64 0, !dbg !43 + %152 = insertelement <2 x i1> %151, i1 %147, i64 1, !dbg !43 + %153 = and <2 x i1> %150, %152, !dbg !43 + %154 = or <2 x i1> %143, %153, !dbg !44 + %155 = extractelement <2 x i32> %131, i64 0, !dbg !45 + %156 = extractelement <2 x i32> %131, i64 1, !dbg !45 + %157 = icmp slt i32 %155, %156, !dbg !45 + %158 = extractelement <2 x i1> %154, i64 1, !dbg !46 + %159 = and i1 %157, %158, !dbg !46 + %160 = extractelement <2 x i1> %154, i64 0, !dbg !47 + %161 = or i1 %160, %159, !dbg !47 + %162 = select i1 %161, float %144, float %146, !dbg !48 + %163 = select i1 %161, i32 %155, i32 %156, !dbg !49 + %164 = extractelement <4 x float> %130, i64 3, !dbg !38 + %165 = fcmp ogt float %162, %164, !dbg !38 + %166 = fcmp oeq float %162, %164, !dbg !50 + %167 = fcmp uno float %162, 0.000000e+00, !dbg !40 + %168 = fcmp uno <4 x float> %130, zeroinitializer, !dbg !41 + %169 = extractelement <4 x i1> %168, i64 3, !dbg !51 + %170 = xor i1 %169, true, !dbg !42 + %171 = and i1 %167, %170, !dbg !43 + %172 = or i1 %165, %171, !dbg !44 + %173 = and i1 %169, %167, !dbg !51 + %174 = or i1 %166, %173, !dbg !52 + %175 = extractelement <4 x i32> %133, i64 3, !dbg !45 + %176 = icmp slt i32 %163, %175, !dbg !45 + %177 = and i1 %176, %174, !dbg !46 + %178 = or i1 %172, %177, !dbg !47 + %179 = select i1 %178, float %162, float %164, !dbg !48 + %180 = select i1 %178, i32 %163, i32 %175, !dbg !49 + %181 = extractelement <4 x float> %130, i64 2, !dbg !38 + %182 = fcmp ogt float %179, %181, !dbg !38 + %183 = fcmp oeq float %179, %181, !dbg !50 + %184 = fcmp uno float %179, 0.000000e+00, !dbg !40 + %185 = extractelement <4 x i1> %168, i64 2, !dbg !51 + %186 = xor i1 %185, true, !dbg !42 + %187 = and i1 %184, %186, !dbg !43 + %188 = or i1 %182, %187, !dbg !44 + %189 = and i1 %185, %184, !dbg !51 + %190 = or i1 %183, %189, !dbg !52 + %191 = extractelement <4 x i32> %133, i64 2, !dbg !45 + %192 = icmp slt i32 %180, %191, !dbg !45 + %193 = and i1 %192, %190, !dbg !46 + %194 = or i1 %188, %193, !dbg !47 + %195 = select i1 %194, float %179, float %181, !dbg !48 + %196 = select i1 %194, i32 %180, i32 %191, !dbg !49 + %197 = shufflevector <2 x float> %127, <2 x float> poison, <2 x i32> , !dbg !38 + %198 = fcmp ogt <2 x float> %127, %197, !dbg !38 + %199 = fcmp oeq <2 x float> %127, %197, !dbg !38 + %200 = shufflevector <2 x i1> %198, <2 x i1> %199, <2 x i32> , !dbg !38 + %201 = extractelement <2 x float> %127, i64 0, !dbg !40 + %202 = fcmp uno float %201, 0.000000e+00, !dbg !40 + %203 = extractelement <2 x float> %127, i64 1, !dbg !41 + %204 = fcmp uno float %203, 0.000000e+00, !dbg !41 + %205 = xor i1 %204, true, !dbg !42 + %206 = insertelement <2 x i1> poison, i1 %202, i64 0, !dbg !43 + %207 = shufflevector <2 x i1> %206, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43 + %208 = insertelement <2 x i1> poison, i1 %205, i64 0, !dbg !43 + %209 = insertelement <2 x i1> %208, i1 %204, i64 1, !dbg !43 + %210 = and <2 x i1> %207, %209, !dbg !43 + %211 = or <2 x i1> %200, %210, !dbg !44 + %212 = extractelement <2 x i32> %132, i64 0, !dbg !45 + %213 = extractelement <2 x i32> %132, i64 1, !dbg !45 + %214 = icmp slt i32 %212, %213, !dbg !45 + %215 = extractelement <2 x i1> %211, i64 1, !dbg !46 + %216 = and i1 %214, %215, !dbg !46 + %217 = extractelement <2 x i1> %211, i64 0, !dbg !47 + %218 = or i1 %217, %216, !dbg !47 + %219 = select i1 %218, float %201, float %203, !dbg !48 + %220 = select i1 %218, i32 %212, i32 %213, !dbg !49 + %221 = extractelement <4 x float> %130, i64 1, !dbg !38 + %222 = fcmp ogt float %219, %221, !dbg !38 + %223 = fcmp oeq float %219, %221, !dbg !50 + %224 = fcmp uno float %219, 0.000000e+00, !dbg !40 + %225 = extractelement <4 x i1> %168, i64 1, !dbg !51 + %226 = xor i1 %225, true, !dbg !42 + %227 = and i1 %224, %226, !dbg !43 + %228 = or i1 %222, %227, !dbg !44 + %229 = and i1 %225, %224, !dbg !51 + %230 = or i1 %223, %229, !dbg !52 + %231 = extractelement <4 x i32> %133, i64 1, !dbg !45 + %232 = icmp slt i32 %220, %231, !dbg !45 + %233 = and i1 %232, %230, !dbg !46 + %234 = or i1 %228, %233, !dbg !47 + %235 = select i1 %234, float %219, float %221, !dbg !48 + %236 = select i1 %234, i32 %220, i32 %231, !dbg !49 + %237 = extractelement <4 x float> %130, i64 0, !dbg !38 + %238 = fcmp ogt float %235, %237, !dbg !38 + %239 = fcmp oeq float %235, %237, !dbg !50 + %240 = fcmp uno float %235, 0.000000e+00, !dbg !40 + %241 = extractelement <4 x i1> %168, i64 0, !dbg !51 + %242 = xor i1 %241, true, !dbg !42 + %243 = and i1 %240, %242, !dbg !43 + %244 = or i1 %238, %243, !dbg !44 + %245 = and i1 %241, %240, !dbg !51 + %246 = or i1 %239, %245, !dbg !52 + %247 = extractelement <4 x i32> %133, i64 0, !dbg !45 + %248 = icmp slt i32 %236, %247, !dbg !45 + %249 = and i1 %248, %246, !dbg !46 + %250 = or i1 %244, %249, !dbg !47 + %251 = select i1 %250, float %235, float %237, !dbg !48 + %252 = select i1 %250, i32 %236, i32 %247, !dbg !49 + %253 = bitcast float %195 to i32, !dbg !53 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 16, i32 31), !dbg !53 + %255 = bitcast i32 %254 to float, !dbg !53 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !53 + %257 = fcmp ogt float %195, %255, !dbg !38 + %258 = fcmp oeq float %195, %255, !dbg !50 + %259 = fcmp uno float %195, 0.000000e+00, !dbg !40 + %260 = fcmp uno float %255, 0.000000e+00, !dbg !41 + %261 = xor i1 %260, true, !dbg !42 + %262 = and i1 %259, %261, !dbg !43 + %263 = or i1 %257, %262, !dbg !44 + %264 = and i1 %259, %260, !dbg !51 + %265 = or i1 %258, %264, !dbg !52 + %266 = icmp slt i32 %196, %256, !dbg !45 + %267 = and i1 %266, %265, !dbg !46 + %268 = or i1 %263, %267, !dbg !47 + %269 = select i1 %268, float %195, float %255, !dbg !48 + %270 = select i1 %268, i32 %196, i32 %256, !dbg !49 + %271 = bitcast float %269 to i32, !dbg !53 + %272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 8, i32 31), !dbg !53 + %273 = bitcast i32 %272 to float, !dbg !53 + %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %270, i32 8, i32 31), !dbg !53 + %275 = fcmp ogt float %269, %273, !dbg !38 + %276 = fcmp oeq float %269, %273, !dbg !50 + %277 = fcmp uno float %269, 0.000000e+00, !dbg !40 + %278 = fcmp uno float %273, 0.000000e+00, !dbg !41 + %279 = xor i1 %278, true, !dbg !42 + %280 = and i1 %277, %279, !dbg !43 + %281 = or i1 %275, %280, !dbg !44 + %282 = and i1 %278, %277, !dbg !51 + %283 = or i1 %276, %282, !dbg !52 + %284 = icmp slt i32 %270, %274, !dbg !45 + %285 = and i1 %284, %283, !dbg !46 + %286 = or i1 %281, %285, !dbg !47 + %287 = select i1 %286, float %269, float %273, !dbg !48 + %288 = select i1 %286, i32 %270, i32 %274, !dbg !49 + %289 = bitcast float %287 to i32, !dbg !53 + %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 4, i32 31), !dbg !53 + %291 = bitcast i32 %290 to float, !dbg !53 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 4, i32 31), !dbg !53 + %293 = fcmp ogt float %287, %291, !dbg !38 + %294 = fcmp oeq float %287, %291, !dbg !50 + %295 = fcmp uno float %287, 0.000000e+00, !dbg !40 + %296 = fcmp uno float %291, 0.000000e+00, !dbg !41 + %297 = xor i1 %296, true, !dbg !42 + %298 = and i1 %295, %297, !dbg !43 + %299 = or i1 %293, %298, !dbg !44 + %300 = and i1 %296, %295, !dbg !51 + %301 = or i1 %294, %300, !dbg !52 + %302 = icmp slt i32 %288, %292, !dbg !45 + %303 = and i1 %302, %301, !dbg !46 + %304 = or i1 %299, %303, !dbg !47 + %305 = select i1 %304, float %287, float %291, !dbg !48 + %306 = select i1 %304, i32 %288, i32 %292, !dbg !49 + %307 = bitcast float %305 to i32, !dbg !53 + %308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !53 + %309 = bitcast i32 %308 to float, !dbg !53 + %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !53 + %311 = fcmp ogt float %305, %309, !dbg !38 + %312 = fcmp oeq float %305, %309, !dbg !50 + %313 = fcmp uno float %305, 0.000000e+00, !dbg !40 + %314 = fcmp uno float %309, 0.000000e+00, !dbg !41 + %315 = xor i1 %314, true, !dbg !42 + %316 = and i1 %313, %315, !dbg !43 + %317 = or i1 %311, %316, !dbg !44 + %318 = and i1 %314, %313, !dbg !51 + %319 = or i1 %312, %318, !dbg !52 + %320 = icmp slt i32 %306, %310, !dbg !45 + %321 = and i1 %320, %319, !dbg !46 + %322 = or i1 %317, %321, !dbg !47 + %323 = select i1 %322, float %305, float %309, !dbg !48 + %324 = select i1 %322, i32 %306, i32 %310, !dbg !49 + %325 = bitcast float %323 to i32, !dbg !53 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 1, i32 31), !dbg !53 + %327 = bitcast i32 %326 to float, !dbg !53 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 1, i32 31), !dbg !53 + %329 = fcmp ogt float %323, %327, !dbg !38 + %330 = fcmp oeq float %323, %327, !dbg !50 + %331 = fcmp uno float %323, 0.000000e+00, !dbg !40 + %332 = fcmp uno float %327, 0.000000e+00, !dbg !41 + %333 = xor i1 %332, true, !dbg !42 + %334 = and i1 %331, %333, !dbg !43 + %335 = or i1 %329, %334, !dbg !44 + %336 = and i1 %332, %331, !dbg !51 + %337 = or i1 %330, %336, !dbg !52 + %338 = icmp slt i32 %324, %328, !dbg !45 + %339 = and i1 %338, %337, !dbg !46 + %340 = or i1 %335, %339, !dbg !47 + %341 = select i1 %340, i32 %324, i32 %328, !dbg !49 + %342 = bitcast float %251 to i32, !dbg !53 + %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 16, i32 31), !dbg !53 + %344 = bitcast i32 %343 to float, !dbg !53 + %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 16, i32 31), !dbg !53 + %346 = fcmp ogt float %251, %344, !dbg !38 + %347 = fcmp oeq float %251, %344, !dbg !50 + %348 = fcmp uno float %251, 0.000000e+00, !dbg !40 + %349 = fcmp uno float %344, 0.000000e+00, !dbg !41 + %350 = xor i1 %349, true, !dbg !42 + %351 = and i1 %348, %350, !dbg !43 + %352 = or i1 %346, %351, !dbg !44 + %353 = and i1 %348, %349, !dbg !51 + %354 = or i1 %347, %353, !dbg !52 + %355 = icmp slt i32 %252, %345, !dbg !45 + %356 = and i1 %355, %354, !dbg !46 + %357 = or i1 %352, %356, !dbg !47 + %358 = select i1 %357, float %251, float %344, !dbg !48 + %359 = select i1 %357, i32 %252, i32 %345, !dbg !49 + %360 = bitcast float %358 to i32, !dbg !53 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 8, i32 31), !dbg !53 + %362 = bitcast i32 %361 to float, !dbg !53 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 8, i32 31), !dbg !53 + %364 = fcmp ogt float %358, %362, !dbg !38 + %365 = fcmp oeq float %358, %362, !dbg !50 + %366 = fcmp uno float %358, 0.000000e+00, !dbg !40 + %367 = fcmp uno float %362, 0.000000e+00, !dbg !41 + %368 = xor i1 %367, true, !dbg !42 + %369 = and i1 %366, %368, !dbg !43 + %370 = or i1 %364, %369, !dbg !44 + %371 = and i1 %367, %366, !dbg !51 + %372 = or i1 %365, %371, !dbg !52 + %373 = icmp slt i32 %359, %363, !dbg !45 + %374 = and i1 %373, %372, !dbg !46 + %375 = or i1 %370, %374, !dbg !47 + %376 = select i1 %375, float %358, float %362, !dbg !48 + %377 = select i1 %375, i32 %359, i32 %363, !dbg !49 + %378 = bitcast float %376 to i32, !dbg !53 + %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 4, i32 31), !dbg !53 + %380 = bitcast i32 %379 to float, !dbg !53 + %381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 4, i32 31), !dbg !53 + %382 = fcmp ogt float %376, %380, !dbg !38 + %383 = fcmp oeq float %376, %380, !dbg !50 + %384 = fcmp uno float %376, 0.000000e+00, !dbg !40 + %385 = fcmp uno float %380, 0.000000e+00, !dbg !41 + %386 = xor i1 %385, true, !dbg !42 + %387 = and i1 %384, %386, !dbg !43 + %388 = or i1 %382, %387, !dbg !44 + %389 = and i1 %385, %384, !dbg !51 + %390 = or i1 %383, %389, !dbg !52 + %391 = icmp slt i32 %377, %381, !dbg !45 + %392 = and i1 %391, %390, !dbg !46 + %393 = or i1 %388, %392, !dbg !47 + %394 = select i1 %393, float %376, float %380, !dbg !48 + %395 = select i1 %393, i32 %377, i32 %381, !dbg !49 + %396 = bitcast float %394 to i32, !dbg !53 + %397 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %396, i32 2, i32 31), !dbg !53 + %398 = bitcast i32 %397 to float, !dbg !53 + %399 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 2, i32 31), !dbg !53 + %400 = fcmp ogt float %394, %398, !dbg !38 + %401 = fcmp oeq float %394, %398, !dbg !50 + %402 = fcmp uno float %394, 0.000000e+00, !dbg !40 + %403 = fcmp uno float %398, 0.000000e+00, !dbg !41 + %404 = xor i1 %403, true, !dbg !42 + %405 = and i1 %402, %404, !dbg !43 + %406 = or i1 %400, %405, !dbg !44 + %407 = and i1 %403, %402, !dbg !51 + %408 = or i1 %401, %407, !dbg !52 + %409 = icmp slt i32 %395, %399, !dbg !45 + %410 = and i1 %409, %408, !dbg !46 + %411 = or i1 %406, %410, !dbg !47 + %412 = select i1 %411, float %394, float %398, !dbg !48 + %413 = select i1 %411, i32 %395, i32 %399, !dbg !49 + %414 = bitcast float %412 to i32, !dbg !53 + %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !53 + %416 = bitcast i32 %415 to float, !dbg !53 + %417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %413, i32 1, i32 31), !dbg !53 + %418 = fcmp ogt float %412, %416, !dbg !38 + %419 = fcmp oeq float %412, %416, !dbg !50 + %420 = fcmp uno float %412, 0.000000e+00, !dbg !40 + %421 = fcmp uno float %416, 0.000000e+00, !dbg !41 + %422 = xor i1 %421, true, !dbg !42 + %423 = and i1 %420, %422, !dbg !43 + %424 = or i1 %418, %423, !dbg !44 + %425 = and i1 %421, %420, !dbg !51 + %426 = or i1 %419, %425, !dbg !52 + %427 = icmp slt i32 %413, %417, !dbg !45 + %428 = and i1 %427, %426, !dbg !46 + %429 = or i1 %424, %428, !dbg !47 + %430 = select i1 %429, i32 %413, i32 %417, !dbg !49 + %431 = and i32 %139, 3, !dbg !53 + %432 = icmp eq i32 %138, 0, !dbg !53 + %433 = lshr exact i32 %10, 5, !dbg !53 + %434 = or disjoint i32 %433, %431, !dbg !53 + %435 = getelementptr float, ptr addrspace(3) @global_smem, i32 %434, !dbg !53 + %436 = select i1 %340, i32 %325, i32 %326, !dbg !48 + %437 = insertelement <1 x i32> poison, i32 %436, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %435, <1 x i32> %437, i1 %432) #4, !dbg !53 + %438 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %434, !dbg !53 + %439 = insertelement <1 x i32> poison, i32 %341, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %438, <1 x i32> %439, i1 %432) #4, !dbg !53 + %440 = shl nuw nsw i32 %12, 2, !dbg !53 + %441 = or disjoint i32 %440, %431, !dbg !53 + %442 = getelementptr float, ptr addrspace(3) @global_smem, i32 %441, !dbg !53 + %443 = select i1 %429, i32 %414, i32 %415, !dbg !48 + %444 = insertelement <1 x i32> poison, i32 %443, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %442, <1 x i32> %444, i1 %432) #4, !dbg !53 + %445 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %441, !dbg !53 + %446 = insertelement <1 x i32> poison, i32 %430, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %445, <1 x i32> %446, i1 %432) #4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %447 = icmp samesign ult i32 %9, 32, !dbg !53 + %448 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !53 + %449 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %448, i1 %447) #4, !dbg !53 + %450 = bitcast i32 %449 to float, !dbg !53 + %451 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %9, !dbg !53 + %452 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %451, i1 %447) #4, !dbg !53 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !53 + %454 = bitcast i32 %453 to float, !dbg !53 + %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !53 + %456 = fcmp ogt float %450, %454, !dbg !38 + %457 = fcmp oeq float %450, %454, !dbg !50 + %458 = fcmp uno float %450, 0.000000e+00, !dbg !40 + %459 = fcmp uno float %454, 0.000000e+00, !dbg !41 + %460 = xor i1 %459, true, !dbg !42 + %461 = and i1 %458, %460, !dbg !43 + %462 = or i1 %456, %461, !dbg !44 + %463 = and i1 %458, %459, !dbg !51 + %464 = or i1 %457, %463, !dbg !52 + %465 = icmp slt i32 %452, %455, !dbg !45 + %466 = and i1 %465, %464, !dbg !46 + %467 = or i1 %462, %466, !dbg !47 + %468 = select i1 %467, float %450, float %454, !dbg !48 + %469 = select i1 %467, i32 %452, i32 %455, !dbg !49 + %470 = bitcast float %468 to i32, !dbg !53 + %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %470, i32 1, i32 31), !dbg !53 + %472 = bitcast i32 %471 to float, !dbg !53 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !53 + %474 = fcmp ogt float %468, %472, !dbg !38 + %475 = fcmp oeq float %468, %472, !dbg !50 + %476 = fcmp uno float %468, 0.000000e+00, !dbg !40 + %477 = fcmp uno float %472, 0.000000e+00, !dbg !41 + %478 = xor i1 %477, true, !dbg !42 + %479 = and i1 %476, %478, !dbg !43 + %480 = or i1 %474, %479, !dbg !44 + %481 = and i1 %477, %476, !dbg !51 + %482 = or i1 %475, %481, !dbg !52 + %483 = icmp slt i32 %469, %473, !dbg !45 + %484 = and i1 %483, %482, !dbg !46 + %485 = or i1 %480, %484, !dbg !47 + %486 = select i1 %485, i32 %469, i32 %473, !dbg !49 + %487 = and i32 %9, 995, !dbg !53 + %488 = icmp eq i32 %487, 0, !dbg !53 + %489 = select i1 %485, i32 %470, i32 %471, !dbg !48 + %490 = insertelement <1 x i32> poison, i32 %489, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %448, <1 x i32> %490, i1 %488) #4, !dbg !53 + %491 = insertelement <1 x i32> poison, i32 %486, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %451, <1 x i32> %491, i1 %488) #4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %492 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %433, !dbg !53 + %493 = load i32, ptr addrspace(3) %492, align 16, !dbg !53 + %494 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %440, !dbg !53 + %495 = load i32, ptr addrspace(3) %494, align 16, !dbg !53 + %496 = sext i32 %137 to i64, !dbg !54 + %497 = getelementptr i64, ptr addrspace(1) %1, i64 %496, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %498 = lshr exact i32 %10, 4, !dbg !55 + %499 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %498, !dbg !55 + %500 = insertelement <2 x i32> poison, i32 %493, i64 0, !dbg !55 + %501 = insertelement <2 x i32> %500, i32 %495, i64 1, !dbg !55 + store <2 x i32> %501, ptr addrspace(3) %499, align 8, !dbg !55 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %502 = shl nuw nsw i32 %9, 3, !dbg !55 + %503 = and i32 %502, 24, !dbg !55 + %504 = and i32 %9, 4, !dbg !55 + %505 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %503, !dbg !55 + %506 = getelementptr inbounds nuw i8, ptr addrspace(3) %505, i32 %504, !dbg !55 + %507 = load i32, ptr addrspace(3) %506, align 4, !dbg !55 + %508 = sext i32 %507 to i64, !dbg !55 + %509 = and i32 %9, 504, !dbg !55 + %510 = icmp eq i32 %509, 0, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %508, ptr addrspace(1) %497, i1 %510) #4, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 33, column: 40, scope: !4) +!14 = !DILocation(line: 34, column: 31, scope: !4) +!15 = !DILocation(line: 35, column: 29, scope: !4) +!16 = !DILocation(line: 39, column: 52, scope: !4) +!17 = !DILocation(line: 39, column: 34, scope: !4) +!18 = !DILocation(line: 39, column: 66, scope: !4) +!19 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 42, column: 38, scope: !4) +!23 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22) +!26 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22) +!27 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22) +!28 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22) +!29 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 44, column: 46, scope: !4) +!37 = !DILocation(line: 45, column: 58, scope: !4) +!38 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39) +!39 = !DILocation(line: 46, column: 75, scope: !4) +!40 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39) +!41 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39) +!42 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39) +!43 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39) +!44 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39) +!45 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39) +!46 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39) +!47 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39) +!48 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39) +!49 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39) +!50 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39) +!51 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39) +!52 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39) +!53 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39) +!54 = !DILocation(line: 48, column: 25, scope: !4) +!55 = !DILocation(line: 48, column: 36, scope: !4) +!56 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d98e6fa3c42e6145d70e98dc7a8f76a95cad6452 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,1196 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<325>; + .reg .b32 %r<226>; + .reg .b64 %rd<63>; + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd14, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:28 + mov.u32 %r27, %ctaid.x; + .loc 1 23 33 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:33 + shl.b32 %r1, %r27, 3; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 384; + bfe.u32 %r28, %r2, 7, 2; + or.b32 %r4, %r28, 4; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r29, %r28, %r1; + or.b32 %r30, %r4, %r1; + .loc 1 26 37 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:26:37 + shl.b32 %r31, %r2, 2; + and.b32 %r32, %r31, 508; + .loc 1 29 19 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:29:19 + bfe.s32 %r33, %r27, 28, 1; + shr.u32 %r34, %r33, 21; + add.s32 %r35, %r29, %r34; + shr.s32 %r36, %r35, 11; + add.s32 %r37, %r30, %r34; + shr.s32 %r38, %r37, 11; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + cvt.u64.u32 %rd1, %r32; + mul.lo.s32 %r39, %r27, 256000; + mad.lo.s32 %r40, %r38, 224000, %r39; + mul.lo.s32 %r41, %r28, 32000; + add.s32 %r42, %r40, %r41; + add.s32 %r43, %r42, %r32; + cvt.u64.u32 %rd2, %r43; + mad.lo.s32 %r44, %r36, 224000, %r39; + add.s32 %r45, %r44, %r41; + add.s32 %r46, %r45, %r32; + cvt.u64.u32 %rd3, %r46; + mov.b32 %r47, 0fFF800000; + mov.b64 %rd59, {%r47, %r47}; + mov.b32 %r218, 2147483647; + mov.b64 %rd58, -512; + mov.b32 %r219, %r218; + mov.b64 %rd60, %rd59; + mov.b32 %r220, %r218; + mov.b32 %r221, %r218; + mov.b64 %rd61, %rd59; + mov.b64 %rd62, %rd59; + mov.b32 %r222, %r218; + mov.b32 %r223, %r218; + mov.b32 %r224, %r218; + mov.b32 %r225, %r218; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 35 29 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:35:29 + add.s64 %rd26, %rd1, %rd58; + add.s64 %rd27, %rd26, 512; + setp.lt.u64 %p1, %rd27, 32000; + .loc 1 39 52 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:52 + add.s64 %rd28, %rd3, %rd58; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + add.s64 %rd29, %rd2, %rd58; + cvt.u32.u64 %r64, %rd28; + add.s32 %r65, %r64, 512; + mad.wide.s32 %rd21, %r65, 4, %rd14; + cvt.u32.u64 %r66, %rd29; + add.s32 %r67, %r66, 128512; + mad.wide.s32 %rd24, %r67, 4, %rd14; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r52, 0; + // begin inline asm + mov.u32 %r48, %r52; + mov.u32 %r49, %r52; + mov.u32 %r50, %r52; + mov.u32 %r51, %r52; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd21 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r56, %r52; + mov.u32 %r57, %r52; + mov.u32 %r58, %r52; + mov.u32 %r59, %r52; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd24 + 0 ], %rd23; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r68, %r69}, %rd59; + setp.nan.f32 %p3, %r68, %r68; + setp.nan.f32 %p4, %r69, %r69; + mov.b64 {%r70, %r71}, %rd61; + setp.nan.f32 %p5, %r70, %r70; + setp.nan.f32 %p6, %r71, %r71; + mov.b64 {%r72, %r73}, %rd62; + setp.nan.f32 %p7, %r72, %r72; + setp.nan.f32 %p8, %r73, %r73; + mov.b64 {%r74, %r75}, %rd60; + setp.nan.f32 %p9, %r74, %r74; + setp.nan.f32 %p10, %r75, %r75; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd30, %r219; + cvt.s64.s32 %rd31, %r218; + cvt.s64.s32 %rd32, %r221; + cvt.s64.s32 %rd33, %r220; +$L__tmp2: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd34, %r48; + cvt.u64.u32 %rd35, %r49; + shl.b64 %rd36, %rd35, 32; + or.b64 %rd37, %rd34, %rd36; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r76, %r77}, %rd37; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p11, %r69, %r77; + setp.gt.f32 %p12, %r68, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p13, %r68, %r76; + setp.eq.f32 %p14, %r69, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p15, %r77, %r77; + setp.nan.f32 %p16, %r76, %r76; + setp.num.f32 %p17, %r76, %r76; + setp.num.f32 %p18, %r77, %r77; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p19, %p4, %p18; + and.pred %p20, %p3, %p17; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p21, %p12, %p20; + or.pred %p22, %p11, %p19; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p23, %p3, %p16; + and.pred %p24, %p4, %p15; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p25, %p14, %p24; + or.pred %p26, %p13, %p23; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p27, %rd27, %rd31; + setp.ge.s64 %p28, %rd27, %rd30; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p29, %p27, %p26; + and.pred %p30, %p28, %p25; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r78, %r68, %r76, %p32; + selp.f32 %r79, %r69, %r77, %p31; + cvt.u32.u64 %r80, %rd27; + cvt.u32.u64 %r81, %rd26; + add.s32 %r82, %r81, 513; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r83, %r218, %r80, %p32; + selp.b32 %r84, %r219, %r82, %p31; +$L__tmp4: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd38, %r56; + cvt.u64.u32 %rd39, %r57; + shl.b64 %rd40, %rd39, 32; + or.b64 %rd41, %rd38, %rd40; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r85, %r86}, %rd41; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p33, %r75, %r86; + setp.gt.f32 %p34, %r74, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p35, %r74, %r85; + setp.eq.f32 %p36, %r75, %r86; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p37, %r86, %r86; + setp.nan.f32 %p38, %r85, %r85; + setp.num.f32 %p39, %r85, %r85; + setp.num.f32 %p40, %r86, %r86; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p41, %p10, %p40; + and.pred %p42, %p9, %p39; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p43, %p34, %p42; + or.pred %p44, %p33, %p41; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p45, %p9, %p38; + and.pred %p46, %p10, %p37; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p47, %p36, %p46; + or.pred %p48, %p35, %p45; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p49, %rd27, %rd33; + setp.ge.s64 %p50, %rd27, %rd32; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p51, %p49, %p48; + and.pred %p52, %p50, %p47; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p53, %p44, %p52; + or.pred %p54, %p43, %p51; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r87, %r74, %r85, %p54; + selp.f32 %r88, %r75, %r86, %p53; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r89, %r220, %r80, %p54; + selp.b32 %r90, %r221, %r82, %p53; +$L__tmp6: + .loc 1 34 31 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:34:31 + add.s64 %rd42, %rd26, 515; + add.s64 %rd43, %rd26, 514; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd44, %r59; + cvt.u64.u32 %rd45, %r58; + shl.b64 %rd46, %rd45, 32; + or.b64 %rd47, %rd44, %rd46; + cvt.u64.u32 %rd48, %r51; + cvt.u64.u32 %rd49, %r50; + shl.b64 %rd50, %rd49, 32; + or.b64 %rd51, %rd48, %rd50; +$L__tmp7: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r91, %r92}, %rd51; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p55, %r73, %r92; + setp.gt.f32 %p56, %r72, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r93, %r94}, %rd47; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p57, %r71, %r94; + setp.gt.f32 %p58, %r70, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p59, %r70, %r93; + setp.eq.f32 %p60, %r71, %r94; + setp.eq.f32 %p61, %r72, %r91; + setp.eq.f32 %p62, %r73, %r92; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p63, %r92, %r92; + setp.nan.f32 %p64, %r91, %r91; + setp.nan.f32 %p65, %r94, %r94; + setp.nan.f32 %p66, %r93, %r93; + setp.num.f32 %p67, %r93, %r93; + setp.num.f32 %p68, %r94, %r94; + setp.num.f32 %p69, %r91, %r91; + setp.num.f32 %p70, %r92, %r92; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p71, %p8, %p70; + and.pred %p72, %p7, %p69; + and.pred %p73, %p6, %p68; + and.pred %p74, %p5, %p67; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p75, %p58, %p74; + or.pred %p76, %p57, %p73; + or.pred %p77, %p56, %p72; + or.pred %p78, %p55, %p71; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p79, %p5, %p66; + and.pred %p80, %p6, %p65; + and.pred %p81, %p7, %p64; + and.pred %p82, %p8, %p63; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p83, %p62, %p82; + or.pred %p84, %p61, %p81; + or.pred %p85, %p60, %p80; + or.pred %p86, %p59, %p79; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd52, %r222; + cvt.s64.s32 %rd53, %r223; + cvt.s64.s32 %rd54, %r224; + cvt.s64.s32 %rd55, %r225; + setp.gt.s64 %p87, %rd43, %rd55; + setp.gt.s64 %p88, %rd42, %rd54; + setp.gt.s64 %p89, %rd43, %rd53; + setp.gt.s64 %p90, %rd42, %rd52; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p91, %p90, %p86; + and.pred %p92, %p89, %p85; + and.pred %p93, %p88, %p84; + and.pred %p94, %p87, %p83; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p95, %p78, %p94; + or.pred %p96, %p77, %p93; + or.pred %p97, %p76, %p92; + or.pred %p98, %p75, %p91; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r95, %r70, %r93, %p98; + selp.f32 %r96, %r71, %r94, %p97; + selp.f32 %r97, %r72, %r91, %p96; + selp.f32 %r98, %r73, %r92, %p95; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.u32.u64 %r99, %rd43; + cvt.u32.u64 %r100, %rd42; + selp.b32 %r101, %r222, %r100, %p98; + selp.b32 %r102, %r223, %r99, %p97; + selp.b32 %r103, %r224, %r100, %p96; + selp.b32 %r104, %r225, %r99, %p95; +$L__tmp8: + .loc 1 44 46 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:44:46 + selp.f32 %r105, %r79, %r69, %p1; + selp.f32 %r106, %r78, %r68, %p1; + mov.b64 %rd59, {%r106, %r105}; + selp.f32 %r107, %r88, %r75, %p1; + selp.f32 %r108, %r87, %r74, %p1; + mov.b64 %rd60, {%r108, %r107}; + selp.f32 %r109, %r98, %r73, %p1; + selp.f32 %r110, %r97, %r72, %p1; + mov.b64 %rd62, {%r110, %r109}; + selp.f32 %r111, %r96, %r71, %p1; + selp.f32 %r112, %r95, %r70, %p1; + mov.b64 %rd61, {%r112, %r111}; + .loc 1 45 58 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:45:58 + selp.b32 %r219, %r84, %r219, %p1; + selp.b32 %r218, %r83, %r218, %p1; + selp.b32 %r221, %r90, %r221, %p1; + selp.b32 %r220, %r89, %r220, %p1; + selp.b32 %r225, %r104, %r225, %p1; + selp.b32 %r224, %r103, %r224, %p1; + selp.b32 %r223, %r102, %r223, %p1; + selp.b32 %r222, %r101, %r222, %p1; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + add.s64 %rd58, %rd58, 512; + setp.lt.u64 %p99, %rd58, 31488; + @%p99 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r129, %r2, 7; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r130, %r1, %r129; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r131, %r2, 31; +$L__tmp9: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r132, %r133}, %rd59; + setp.gt.f32 %p109, %r132, %r133; + setp.eq.f32 %p110, %r133, %r132; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p111, %r132, %r132; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p112, %r133, %r133; + setp.nan.f32 %p113, %r133, %r133; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p114, %p111, %p113; + and.pred %p115, %p111, %p112; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p116, %p109, %p115; + or.pred %p117, %p110, %p114; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p118, %r218, %r219; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p119, %p118, %p117; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p120, %p116, %p119; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r134, %r132, %r133, %p120; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r135, %r218, %r219, %p120; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r136, %r137}, %rd62; + setp.gt.f32 %p121, %r134, %r137; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p122, %r134, %r137; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p123, %r134, %r134; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r138, %r139}, %rd61; + setp.nan.f32 %p124, %r138, %r138; + setp.num.f32 %p125, %r138, %r138; + setp.nan.f32 %p126, %r139, %r139; + setp.num.f32 %p127, %r139, %r139; + setp.nan.f32 %p128, %r136, %r136; + setp.num.f32 %p129, %r136, %r136; + setp.nan.f32 %p130, %r137, %r137; + setp.num.f32 %p131, %r137, %r137; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p132, %p123, %p131; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p133, %p121, %p132; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p134, %p130, %p123; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p135, %p122, %p134; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p136, %r135, %r225; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p137, %p136, %p135; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p138, %p133, %p137; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r140, %r134, %r137, %p138; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r141, %r135, %r225, %p138; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p139, %r140, %r136; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p140, %r140, %r136; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p141, %r140, %r140; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p142, %p141, %p129; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p143, %p139, %p142; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p144, %p128, %p141; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p145, %p140, %p144; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p146, %r141, %r224; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p147, %p146, %p145; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p148, %p143, %p147; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r142, %r140, %r136, %p148; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r143, %r141, %r224, %p148; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r144, %r145}, %rd60; + setp.gt.f32 %p149, %r144, %r145; + setp.eq.f32 %p150, %r145, %r144; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p151, %r144, %r144; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p152, %r145, %r145; + setp.nan.f32 %p153, %r145, %r145; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p154, %p151, %p153; + and.pred %p155, %p151, %p152; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p156, %p149, %p155; + or.pred %p157, %p150, %p154; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p158, %r220, %r221; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p159, %p158, %p157; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p160, %p156, %p159; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r146, %r144, %r145, %p160; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r147, %r220, %r221, %p160; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p161, %r146, %r139; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p162, %r146, %r139; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p163, %r146, %r146; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p164, %p163, %p127; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p165, %p161, %p164; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p166, %p126, %p163; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p167, %p162, %p166; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p168, %r147, %r223; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p169, %p168, %p167; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p170, %p165, %p169; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r148, %r146, %r139, %p170; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r149, %r147, %r223, %p170; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p171, %r148, %r138; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p172, %r148, %r138; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p173, %r148, %r148; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p174, %p173, %p125; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p175, %p171, %p174; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p176, %p124, %p173; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p177, %p172, %p176; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p178, %r149, %r222; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p179, %p178, %p177; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p180, %p175, %p179; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r150, %r148, %r138, %p180; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r151, %r149, %r222, %p180; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r152, %r142, 16, 31, -1; + shfl.sync.bfly.b32 %r153, %r143, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p181, %r142, %r152; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p182, %r142, %r152; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p183, %r142, %r142; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p184, %r152, %r152; + setp.num.f32 %p185, %r152, %r152; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p186, %p183, %p185; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p187, %p181, %p186; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p188, %p183, %p184; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p189, %p182, %p188; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p190, %r143, %r153; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p191, %p190, %p189; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p192, %p187, %p191; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r154, %r142, %r152, %p192; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r155, %r143, %r153, %p192; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1; + shfl.sync.bfly.b32 %r157, %r155, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p193, %r154, %r156; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p194, %r154, %r156; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p195, %r154, %r154; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p196, %r156, %r156; + setp.num.f32 %p197, %r156, %r156; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p198, %p195, %p197; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p199, %p193, %p198; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p200, %p196, %p195; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p201, %p194, %p200; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p202, %r155, %r157; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p203, %p202, %p201; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p204, %p199, %p203; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r158, %r154, %r156, %p204; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r159, %r155, %r157, %p204; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r160, %r158, 4, 31, -1; + shfl.sync.bfly.b32 %r161, %r159, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p205, %r158, %r160; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p206, %r158, %r160; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p207, %r158, %r158; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p208, %r160, %r160; + setp.num.f32 %p209, %r160, %r160; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p210, %p207, %p209; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p211, %p205, %p210; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p212, %p208, %p207; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p213, %p206, %p212; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p214, %r159, %r161; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p215, %p214, %p213; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p216, %p211, %p215; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r162, %r158, %r160, %p216; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r163, %r159, %r161, %p216; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r164, %r162, 2, 31, -1; + shfl.sync.bfly.b32 %r165, %r163, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p217, %r162, %r164; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p218, %r162, %r164; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p219, %r162, %r162; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p220, %r164, %r164; + setp.num.f32 %p221, %r164, %r164; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p222, %p219, %p221; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p223, %p217, %p222; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p224, %p220, %p219; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p225, %p218, %p224; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p226, %r163, %r165; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p227, %p226, %p225; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p228, %p223, %p227; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r166, %r162, %r164, %p228; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r167, %r163, %r165, %p228; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r168, %r166, 1, 31, -1; + shfl.sync.bfly.b32 %r169, %r167, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p229, %r166, %r168; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p230, %r166, %r168; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p231, %r166, %r166; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p232, %r168, %r168; + setp.num.f32 %p233, %r168, %r168; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p234, %p231, %p233; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p235, %p229, %p234; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p236, %p232, %p231; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p237, %p230, %p236; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p238, %r167, %r169; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p239, %p238, %p237; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p240, %p235, %p239; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r116, %r167, %r169, %p240; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r170, %r150, 16, 31, -1; + shfl.sync.bfly.b32 %r171, %r151, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p241, %r150, %r170; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p242, %r150, %r170; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p243, %r150, %r150; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p244, %r170, %r170; + setp.num.f32 %p245, %r170, %r170; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p246, %p243, %p245; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p247, %p241, %p246; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p248, %p243, %p244; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p249, %p242, %p248; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p250, %r151, %r171; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p251, %p250, %p249; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p252, %p247, %p251; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r172, %r150, %r170, %p252; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r173, %r151, %r171, %p252; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r174, %r172, 8, 31, -1; + shfl.sync.bfly.b32 %r175, %r173, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p253, %r172, %r174; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p254, %r172, %r174; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p255, %r172, %r172; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p256, %r174, %r174; + setp.num.f32 %p257, %r174, %r174; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p258, %p255, %p257; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p259, %p253, %p258; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p260, %p256, %p255; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p261, %p254, %p260; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p262, %r173, %r175; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p263, %p262, %p261; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p264, %p259, %p263; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r176, %r172, %r174, %p264; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r177, %r173, %r175, %p264; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p265, %r176, %r178; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p266, %r176, %r178; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p267, %r176, %r176; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p268, %r178, %r178; + setp.num.f32 %p269, %r178, %r178; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p270, %p267, %p269; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p271, %p265, %p270; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p272, %p268, %p267; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p273, %p266, %p272; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p274, %r177, %r179; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p275, %p274, %p273; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p276, %p271, %p275; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r180, %r176, %r178, %p276; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r181, %r177, %r179, %p276; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r182, %r180, 2, 31, -1; + shfl.sync.bfly.b32 %r183, %r181, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p277, %r180, %r182; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p278, %r180, %r182; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p279, %r180, %r180; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p280, %r182, %r182; + setp.num.f32 %p281, %r182, %r182; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p282, %p279, %p281; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p283, %p277, %p282; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p284, %p280, %p279; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p285, %p278, %p284; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p286, %r181, %r183; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p287, %p286, %p285; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p288, %p283, %p287; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r184, %r180, %r182, %p288; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r185, %r181, %r183, %p288; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r186, %r184, 1, 31, -1; + shfl.sync.bfly.b32 %r187, %r185, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p289, %r184, %r186; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p290, %r184, %r186; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p291, %r184, %r184; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p292, %r186, %r186; + setp.num.f32 %p293, %r186, %r186; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p294, %p291, %p293; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p295, %p289, %p294; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p296, %p292, %p291; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p297, %p290, %p296; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p298, %r185, %r187; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p299, %p298, %p297; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p300, %p295, %p299; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r120, %r185, %r187, %p300; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + bfe.u32 %r188, %r2, 5, 2; + setp.eq.b32 %p100, %r131, 0; + shr.u32 %r189, %r3, 5; + or.b32 %r190, %r189, %r188; + shl.b32 %r191, %r190, 2; + mov.b32 %r192, global_smem; + add.s32 %r113, %r192, %r191; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r114, %r166, %r168, %p240; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + // begin inline asm + @%p100 st.shared.b32 [ %r113 + 0 ], %r114; + // end inline asm + add.s32 %r193, %r192, 128; + add.s32 %r115, %r193, %r191; + // begin inline asm + @%p100 st.shared.b32 [ %r115 + 0 ], %r116; + // end inline asm + shl.b32 %r194, %r188, 2; + shl.b32 %r195, %r4, 4; + or.b32 %r196, %r195, %r194; + add.s32 %r117, %r192, %r196; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r118, %r184, %r186, %p300; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + // begin inline asm + @%p100 st.shared.b32 [ %r117 + 0 ], %r118; + // end inline asm + add.s32 %r119, %r193, %r196; + // begin inline asm + @%p100 st.shared.b32 [ %r119 + 0 ], %r120; + // end inline asm + bar.sync 0; + setp.lt.u32 %p104, %r2, 32; + add.s32 %r122, %r192, %r31; + // begin inline asm + @%p104 ld.shared.b32 %r121, [ %r122 + 0 ]; + // end inline asm + add.s32 %r124, %r193, %r31; + // begin inline asm + @%p104 ld.shared.b32 %r123, [ %r124 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r198, %r121, 2, 31, -1; + shfl.sync.bfly.b32 %r199, %r123, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p301, %r121, %r198; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p302, %r121, %r198; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p303, %r121, %r121; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p304, %r198, %r198; + setp.num.f32 %p305, %r198, %r198; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p306, %p303, %p305; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p307, %p301, %p306; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p308, %p303, %p304; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p309, %p302, %p308; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p310, %r123, %r199; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p311, %p310, %p309; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p312, %p307, %p311; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r200, %r121, %r198, %p312; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r201, %r123, %r199, %p312; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r202, %r200, 1, 31, -1; + shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p313, %r200, %r202; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p314, %r200, %r202; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p315, %r200, %r200; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p316, %r202, %r202; + setp.num.f32 %p317, %r202, %r202; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p318, %p315, %p317; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p319, %p313, %p318; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p320, %p316, %p315; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p321, %p314, %p320; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p322, %r201, %r203; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p323, %p322, %p321; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p324, %p319, %p323; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r128, %r201, %r203, %p324; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.b32 %r204, %r2, 995; + setp.eq.b32 %p106, %r204, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r126, %r200, %r202, %p324; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + // begin inline asm + @%p106 st.shared.b32 [ %r122 + 0 ], %r126; + // end inline asm + // begin inline asm + @%p106 st.shared.b32 [ %r124 + 0 ], %r128; + // end inline asm + bar.sync 0; + shr.u32 %r205, %r3, 3; + add.s32 %r206, %r193, %r205; + ld.shared.b32 %r207, [%r206]; + add.s32 %r208, %r193, %r195; + ld.shared.b32 %r209, [%r208]; +$L__tmp10: + .loc 1 48 25 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:25 + mad.wide.s32 %rd57, %r130, 8, %rd15; + .loc 1 48 36 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:36 + bar.sync 0; + shr.u32 %r210, %r3, 4; + add.s32 %r211, %r192, %r210; + st.shared.v2.b32 [%r211], {%r207, %r209}; + bar.sync 0; + shl.b32 %r212, %r2, 3; + and.b32 %r213, %r212, 24; + and.b32 %r214, %r2, 4; + add.s32 %r215, %r192, %r213; + add.s32 %r216, %r215, %r214; + ld.shared.s32 %rd56, [%r216]; + and.b32 %r217, %r2, 504; + setp.eq.b32 %p108, %r217, 0; + // begin inline asm + @%p108 st.global.b64 [ %rd57 + 0 ], { %rd56 }; + // end inline asm + .loc 1 48 4 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:4 + ret; +$L__tmp11: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 53 +.b8 114 +.b8 111 +.b8 97 +.b8 116 +.b8 115 +.b8 101 +.b8 122 +.b8 111 +.b8 100 +.b8 113 +.b8 102 +.b8 53 +.b8 113 +.b8 112 +.b8 100 +.b8 51 +.b8 50 +.b8 104 +.b8 107 +.b8 115 +.b8 112 +.b8 116 +.b8 52 +.b8 102 +.b8 99 +.b8 102 +.b8 105 +.b8 113 +.b8 111 +.b8 55 +.b8 106 +.b8 111 +.b8 98 +.b8 111 +.b8 109 +.b8 55 +.b8 112 +.b8 122 +.b8 104 +.b8 115 +.b8 102 +.b8 116 +.b8 117 +.b8 107 +.b8 50 +.b8 55 +.b8 103 +.b8 106 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..a21b50a32315e280b320a9e381175a8f9e9aa734 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.source @@ -0,0 +1,315 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 8 : i32 loc(#loc77) + %xoffset_3 = arith.constant 8 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<8x512xi1> loc(#loc81) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c512_i32 = arith.constant 512 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c512_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x512xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<8x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc92) + %tmp0_27 = tt.broadcast %tmp0_25 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x512xi32> loc(#loc92) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_31 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc93) + %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<8x1xi32> loc(#loc93) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<8x512xi32> loc(#loc94) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr> loc(#loc95) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<8x512x!tt.ptr>, tensor<8x512xi32> loc(#loc95) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc96) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc96) + %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<8x512x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<8x512xf32>, tensor<8x512xi32>, tensor<8x512xf32>, tensor<1x512xi32>) -> (tensor<8x512xf32>, tensor<8x512xi32>) loc(#loc24) + %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc97) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc97) + %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc98) + %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc98) + scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<8x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_512S_i32S8_512S_fp32S8_512S_i32S1_512S__(%a_value: tensor<8x512xf32> loc("a_value"(#loc33)), %a_index: tensor<8x512xi32> loc("a_index"(#loc33)), %b_value: tensor<8x512xf32> loc("b_value"(#loc33)), %b_index: tensor<1x512xi32> loc("b_index"(#loc33))) -> (tensor<8x512xf32>, tensor<8x512xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<8x512xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<8x512xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%a_value) : (tensor<8x512xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<8x512xi1>, tensor<8x512xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<8x512xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<8x512xf32> loc(#loc107) + %mask_4 = arith.constant true loc(#loc108) + %mask_5 = arith.constant dense : tensor<8x512xi1> loc(#loc108) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<8x512xi1> loc(#loc108) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<8x512xi1> loc(#loc109) + %mask_8 = arith.ori %mask, %mask_7 : tensor<8x512xi1> loc(#loc123) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc111) + %equal_10 = arith.ori %equal, %equal_9 : tensor<8x512xi1> loc(#loc124) + scf.yield %mask_8, %equal_10 : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<8x512xi1>, tensor<8x512xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc113) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<8x512xi32> loc(#loc113) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<8x512xi1> loc(#loc114) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<8x512xi1> loc(#loc115) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc49) + %3 = tt.broadcast %b_index : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc50) + %4 = arith.select %mask_3, %a_index, %3 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc50) + tt.return %2, %4 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc51) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x512xf32> loc(#loc52) + %6 = ub.poison : tensor<8x512xi32> loc(#loc52) + tt.return %5, %6 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x) : (tensor<8x512xf32>) -> tensor<8x512xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_512S__(%x: tensor<8x512xf32> loc("x"(#loc57))) -> tensor<8x512xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<8x512xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<8x512xf32> loc(#loc59) + tt.return %4 : tensor<8x512xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x512xf32> loc(#loc61) + tt.return %5 : tensor<8x512xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_512S_i32S8_512S__(2,)cconstexpr_1_"(%value: tensor<8x512xf32> loc("value"(#loc66)), %index: tensor<8x512xi32> loc("index"(#loc66))) -> (tensor<8xf32>, tensor<8xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<8xf32>, tensor<8xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc69) + %2 = ub.poison : tensor<8xi32> loc(#loc69) + tt.return %1, %2 : tensor<8xf32>, tensor<8xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9beeeecbbde02006825b5c2dcabaf21036563df5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,203 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc37)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<65760000> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x512xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_4 = arith.constant dense : tensor<8x512xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2147483647> : tensor<8x512xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0xFF800000> : tensor<8x512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_7 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48) + %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc48) + %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc49) + %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked1> loc(#loc49) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50) + %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x512xi32, #blocked> loc(#loc50) + %x0 = arith.remsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc51) + %x1 = arith.divsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc53) + %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54) + %tmp0_17 = arith.muli %x1, %cst : tensor<8x1xi32, #blocked> loc(#loc55) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc56) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr, #blocked> loc(#loc57) + %_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x512xi32, #blocked> loc(#loc59) + %r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x512xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x512xi32, #blocked> loc(#loc60) + %tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x512xi32, #blocked> -> tensor<8x512xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<8x512xi32, #blocked> loc(#loc54) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<8x512xi32, #blocked> loc(#loc56) + %tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<8x512x!tt.ptr, #blocked>, tensor<8x512xi32, #blocked> loc(#loc57) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x512xi1, #blocked> -> tensor<8x512xi1, #blocked> loc(#loc61) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<8x512x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<8x512xf32, #blocked> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<8x512xf32, #blocked> loc(#loc85) + %mask_29 = arith.xori %b_isnan, %cst_4 : tensor<8x512xi1, #blocked> loc(#loc86) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<8x512xi1, #blocked> loc(#loc87) + %mask_31 = arith.ori %mask, %mask_30 : tensor<8x512xi1, #blocked> loc(#loc106) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1, #blocked> loc(#loc89) + %equal_33 = arith.ori %equal, %equal_32 : tensor<8x512xi1, #blocked> loc(#loc107) + %mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi32, #blocked> loc(#loc91) + %mask_35 = arith.andi %equal_33, %mask_34 : tensor<8x512xi1, #blocked> loc(#loc92) + %mask_36 = arith.ori %mask_31, %mask_35 : tensor<8x512xi1, #blocked> loc(#loc93) + %5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc74) + %6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc75) + %_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked> loc(#loc76) + %_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<8x512xi1, #blocked>, tensor<8x512xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked> loc(#loc35) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97) + %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110) + %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111) + %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100) + %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101) + %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102) + %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %5, %6 : f32, i32 loc(#loc78) + }) : (tensor<8x512xf32, #blocked>, tensor<8x512xi32, #blocked>) -> (tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc80) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc39) + %2 = tt.addptr %1, %xindex_14 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc39) + %3 = ttg.convert_layout %tmp2 : tensor<8x1xi32, #blocked> -> tensor<8x1xi32, #blocked1> loc(#loc40) + %4 = arith.extsi %3 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc40) + tt.store %2, %4 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("r0_base"(#loc6)) +#loc51 = loc("x0"(#loc7)) +#loc52 = loc("x1"(#loc8)) +#loc53 = loc("tmp0"(#loc9)) +#loc54 = loc("tmp0"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("_tmp2"(#loc14)) +#loc59 = loc("r0_index"(#loc15)) +#loc60 = loc("r0_mask"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("mask"(#loc18)) +#loc63 = loc("equal"(#loc20)) +#loc64 = loc("a_isnan"(#loc21)) +#loc65 = loc("b_isnan"(#loc22)) +#loc66 = loc("mask"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("equal"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("mask"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc(callsite(#loc31 at #loc19)) +#loc75 = loc(callsite(#loc32 at #loc19)) +#loc76 = loc("_tmp2"(#loc33)) +#loc77 = loc("_tmp2_index"(#loc34)) +#loc78 = loc(callsite(#loc36 at #loc37)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc58)) +#loc82 = loc("mask"(#loc62)) +#loc83 = loc("equal"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc19)) +#loc85 = loc(callsite(#loc65 at #loc19)) +#loc86 = loc(callsite(#loc66 at #loc19)) +#loc87 = loc(callsite(#loc67 at #loc19)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc(callsite(#loc69 at #loc19)) +#loc90 = loc("equal"(#loc70)) +#loc91 = loc(callsite(#loc71 at #loc19)) +#loc92 = loc(callsite(#loc72 at #loc19)) +#loc93 = loc(callsite(#loc73 at #loc19)) +#loc94 = loc(callsite(#loc64 at #loc78)) +#loc95 = loc(callsite(#loc65 at #loc78)) +#loc96 = loc(callsite(#loc66 at #loc78)) +#loc97 = loc(callsite(#loc67 at #loc78)) +#loc98 = loc(callsite(#loc69 at #loc78)) +#loc99 = loc(callsite(#loc71 at #loc78)) +#loc100 = loc(callsite(#loc72 at #loc78)) +#loc101 = loc(callsite(#loc73 at #loc78)) +#loc102 = loc(callsite(#loc31 at #loc78)) +#loc103 = loc(callsite(#loc32 at #loc78)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc78)) +#loc109 = loc(callsite(#loc83 at #loc78)) +#loc110 = loc(callsite(#loc88 at #loc78)) +#loc111 = loc(callsite(#loc90 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2f358c2e4bccf274c2ca673e83c992aa3ef982c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/3TSRCAOXIABNSCY74AQ2GK4AIDHEKNRWVMFQVMQJIJ7C44WYAPYA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,204 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc46 = loc("in_ptr0"(#loc)) +#loc47 = loc("out_ptr0"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<8x512xi1> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x512xf32> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<32000> : tensor<1x512xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<8x512xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<8x512xf32> loc(#loc52) + %cst_4 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc54) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc55) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc56) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32> loc(#loc57) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<8x1xi32> loc(#loc57) + %r0_base = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc58) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<512xi32> -> tensor<1x512xi32> loc(#loc59) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc60) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc61) + %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c512_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<8x512xf32>, tensor<8x512xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x512xi32> loc(#loc63) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x512xi32> loc(#loc63) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x512xi32> loc(#loc64) + %tmp0 = arith.muli %x0, %cst_2 : tensor<8x1xi32> loc(#loc65) + %tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x512xi32> -> tensor<8x512xi32> loc(#loc66) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc66) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x512xi32> loc(#loc66) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32> loc(#loc67) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x512xi32> loc(#loc68) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x512xi32> loc(#loc68) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x512x!tt.ptr> loc(#loc69) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x512x!tt.ptr>, tensor<8x512xi32> loc(#loc69) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x512xi1> -> tensor<8x512xi1> loc(#loc70) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<8x512x!tt.ptr> loc(#loc70) + %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc112) + %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<8x512xf32> loc(#loc113) + %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<8x512xf32> loc(#loc92) + %b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<8x512xf32> loc(#loc93) + %mask_24 = arith.xori %b_isnan, %cst : tensor<8x512xi1> loc(#loc94) + %mask_25 = arith.andi %a_isnan, %mask_24 : tensor<8x512xi1> loc(#loc95) + %mask_26 = arith.ori %mask, %mask_25 : tensor<8x512xi1> loc(#loc114) + %equal_27 = arith.andi %a_isnan, %b_isnan : tensor<8x512xi1> loc(#loc97) + %equal_28 = arith.ori %equal, %equal_27 : tensor<8x512xi1> loc(#loc115) + %mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi32> loc(#loc99) + %mask_30 = arith.andi %equal_28, %mask_29 : tensor<8x512xi1> loc(#loc100) + %mask_31 = arith.ori %mask_26, %mask_30 : tensor<8x512xi1> loc(#loc101) + %4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc83) + %5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc84) + %_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<8x512xi1>, tensor<8x512xf32> loc(#loc85) + %_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<8x512xi1>, tensor<8x512xi32> loc(#loc86) + scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<8x512xf32>, tensor<8x512xi32> loc(#loc40) + } loc(#loc89) + %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119) + %mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109) + %4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110) + %5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111) + tt.reduce.return %4, %5 : f32, i32 loc(#loc87) + }) : (tensor<8x512xf32>, tensor<8x512xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc87) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc88) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc43) + %2 = tt.addptr %1, %xindex_8 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc43) + %3 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc44) + tt.store %2, %3 : tensor<8x1x!tt.ptr> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xoffset"(#loc7)) +#loc55 = loc("xindex"(#loc8)) +#loc56 = loc("xindex"(#loc9)) +#loc57 = loc("xindex"(#loc10)) +#loc58 = loc("r0_base"(#loc11)) +#loc59 = loc("r0_base"(#loc12)) +#loc60 = loc("x0"(#loc13)) +#loc61 = loc("x1"(#loc14)) +#loc62 = loc("_tmp2"(#loc3)) +#loc63 = loc("r0_index"(#loc15)) +#loc64 = loc("r0_mask"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("tmp0"(#loc21)) +#loc70 = loc("tmp0"(#loc22)) +#loc71 = loc("mask"(#loc23)) +#loc72 = loc("equal"(#loc25)) +#loc73 = loc("a_isnan"(#loc26)) +#loc74 = loc("b_isnan"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("mask"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("equal"(#loc31)) +#loc79 = loc("equal"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc("mask"(#loc34)) +#loc82 = loc("mask"(#loc35)) +#loc83 = loc(callsite(#loc36 at #loc24)) +#loc84 = loc(callsite(#loc37 at #loc24)) +#loc85 = loc("_tmp2"(#loc38)) +#loc86 = loc("_tmp2_index"(#loc39)) +#loc87 = loc(callsite(#loc41 at #loc2)) +#loc88 = loc("tmp2"(#loc42)) +#loc89 = loc("_tmp2_index"(#loc62)) +#loc90 = loc("mask"(#loc71)) +#loc91 = loc("equal"(#loc72)) +#loc92 = loc(callsite(#loc73 at #loc24)) +#loc93 = loc(callsite(#loc74 at #loc24)) +#loc94 = loc(callsite(#loc75 at #loc24)) +#loc95 = loc(callsite(#loc76 at #loc24)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc(callsite(#loc78 at #loc24)) +#loc98 = loc("equal"(#loc79)) +#loc99 = loc(callsite(#loc80 at #loc24)) +#loc100 = loc(callsite(#loc81 at #loc24)) +#loc101 = loc(callsite(#loc82 at #loc24)) +#loc102 = loc(callsite(#loc73 at #loc87)) +#loc103 = loc(callsite(#loc74 at #loc87)) +#loc104 = loc(callsite(#loc75 at #loc87)) +#loc105 = loc(callsite(#loc76 at #loc87)) +#loc106 = loc(callsite(#loc78 at #loc87)) +#loc107 = loc(callsite(#loc80 at #loc87)) +#loc108 = loc(callsite(#loc81 at #loc87)) +#loc109 = loc(callsite(#loc82 at #loc87)) +#loc110 = loc(callsite(#loc36 at #loc87)) +#loc111 = loc(callsite(#loc37 at #loc87)) +#loc112 = loc(callsite(#loc90 at #loc24)) +#loc113 = loc(callsite(#loc91 at #loc24)) +#loc114 = loc(callsite(#loc96 at #loc24)) +#loc115 = loc(callsite(#loc98 at #loc24)) +#loc116 = loc(callsite(#loc90 at #loc87)) +#loc117 = loc(callsite(#loc91 at #loc87)) +#loc118 = loc(callsite(#loc96 at #loc87)) +#loc119 = loc(callsite(#loc98 at #loc87)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1d5fb3d758e0632547a612dcff1855652f975dfb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/__grp__triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e477d025951c791d5f8687eb545b4fdc2247f401 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..16ee80253632811c108e5efb438e73951f7bb4b4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"hash": "e69b538dd589c5b9c913d33cdaa69fd2fa4ad6662ded2e7cc313f4666af16fd4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f888ec8e37478ee1486e471985238ec32b549e89 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.llir @@ -0,0 +1,52 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 8, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = shl nuw nsw i32 %7, 1, !dbg !9 + %9 = and i32 %8, 254, !dbg !9 + %10 = or disjoint i32 %9, %6, !dbg !10 + %11 = or disjoint i32 %10, 1, !dbg !10 + %12 = icmp slt i32 %10, %1, !dbg !11 + %13 = icmp slt i32 %11, %1, !dbg !11 + %14 = sext i32 %10 to i64, !dbg !12 + %15 = getelementptr i32, ptr addrspace(1) %0, i64 %14, !dbg !12 + %16 = sext i32 %11 to i64, !dbg !12 + %17 = getelementptr i32, ptr addrspace(1) %0, i64 %16, !dbg !12 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %15, i1 %12) #2, !dbg !13 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %17, i1 %13) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 25, scope: !4) +!13 = !DILocation(line: 24, column: 36, scope: !4) +!14 = !DILocation(line: 24, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b74e2d030cf384d1959210132d7cf6011e69936e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ptx @@ -0,0 +1,214 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0 + // @triton_poi_fused_new_zeros_0 +.visible .entry triton_poi_fused_new_zeros_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0, + .param .u32 triton_poi_fused_new_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3 +) +.reqntid 128 +{ + .reg .pred %p<3>; + .reg .b32 %r<11>; + .reg .b64 %rd<4>; + .loc 1 18 0 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:18:0 + +// %bb.0: + ld.param.b64 %rd3, [triton_poi_fused_new_zeros_0_param_0]; + ld.param.b32 %r3, [triton_poi_fused_new_zeros_0_param_1]; +$L__tmp0: + .loc 1 19 28 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:19:28 + mov.u32 %r4, %ctaid.x; + .loc 1 19 33 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:19:33 + shl.b32 %r5, %r4, 8; + .loc 1 20 36 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:20:36 + mov.u32 %r6, %tid.x; + shl.b32 %r7, %r6, 1; + and.b32 %r8, %r7, 254; + .loc 1 20 23 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:20:23 + or.b32 %r9, %r8, %r5; + or.b32 %r10, %r9, 1; + .loc 1 21 21 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:21:21 + setp.lt.s32 %p1, %r9, %r3; + setp.lt.s32 %p2, %r10, %r3; + .loc 1 24 25 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:25 + mad.wide.s32 %rd1, %r9, 4, %rd3; + add.s64 %rd2, %rd1, 4; + mov.b32 %r1, 0; + .loc 1 24 36 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:36 + // begin inline asm + @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 }; + // end inline asm + // begin inline asm + @%p2 st.global.b32 [ %rd2 + 0 ], { %r1 }; + // end inline asm + .loc 1 24 4 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 106 +.b8 122 +.b8 120 +.b8 122 +.b8 116 +.b8 100 +.b8 120 +.b8 98 +.b8 106 +.b8 118 +.b8 53 +.b8 98 +.b8 50 +.b8 51 +.b8 110 +.b8 102 +.b8 109 +.b8 103 +.b8 122 +.b8 103 +.b8 116 +.b8 105 +.b8 122 +.b8 113 +.b8 112 +.b8 55 +.b8 55 +.b8 104 +.b8 55 +.b8 97 +.b8 101 +.b8 97 +.b8 107 +.b8 53 +.b8 106 +.b8 50 +.b8 106 +.b8 117 +.b8 107 +.b8 109 +.b8 122 +.b8 51 +.b8 114 +.b8 111 +.b8 113 +.b8 101 +.b8 105 +.b8 119 +.b8 51 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 106 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..c66ea928e4716f779a5bceb46a321e24451b52dc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.source @@ -0,0 +1,38 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.constant 256 : i32 loc(#loc13) + %xoffset_1 = arith.constant 256 : i32 loc(#loc13) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc15) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc16) + %tmp0 = arith.constant 0 : i32 loc(#loc17) + %tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc7) + %1 = tt.addptr %0, %xindex_4 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc7) + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_5 : tensor<256x!tt.ptr> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":23:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc1)) +#loc13 = loc("xoffset"(#loc2)) +#loc14 = loc("xindex"(#loc3)) +#loc15 = loc("xindex"(#loc4)) +#loc16 = loc("xmask"(#loc5)) +#loc17 = loc("tmp0"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e54250ffdaef5c758e76de3870787a3f25846c4c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> loc(#loc7) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..94ec2f8dba2f7ffe2a5ea9403bca9a34f4c36326 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/42NVHDOVRHC3TSIT2M6NVJU72L5EVVTGFXWS47GDCP2GM2XRN7KA/triton_poi_fused_new_zeros_0.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc2 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc3)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xindex"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xmask"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3e02e616fd60e3f79460203c1b0495b60612dbb7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52b79a04e302a5ac31258d49754c7843189cd1dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "e06ef592ad45788917e8ca2aaf880ce2d9dc1b17df1f92a71a87d94ada7fc9a8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..603633c555d83170cdf13d0b2d806a75a5169fc6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,2749 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 7, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 252, !dbg !12 + %16 = lshr exact i32 %15, 2, !dbg !12 + %17 = and i32 %14, 127, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = or disjoint i32 %18, 64, !dbg !13 + %20 = icmp slt i32 %18, 128, !dbg !14 + %21 = icmp slt i32 %19, 128, !dbg !14 + %22 = and i32 %14, 3, !dbg !15 + %23 = shl nuw nsw i32 %22, 2, !dbg !15 + %24 = or disjoint i32 %23, 1, !dbg !15 + %25 = or disjoint i32 %23, 2, !dbg !15 + %26 = or disjoint i32 %23, 3, !dbg !15 + %27 = shl i32 %18, 4, !dbg !16 + %28 = shl i32 %19, 4, !dbg !16 + %29 = or disjoint i32 %27, %23, !dbg !17 + %30 = or disjoint i32 %27, %25, !dbg !17 + %31 = or disjoint i32 %28, %23, !dbg !17 + %32 = or disjoint i32 %28, %25, !dbg !17 + %33 = sext i32 %29 to i64, !dbg !18 + %34 = getelementptr i64, ptr addrspace(1) %0, i64 %33, !dbg !18 + %35 = sext i32 %30 to i64, !dbg !18 + %36 = getelementptr i64, ptr addrspace(1) %0, i64 %35, !dbg !18 + %37 = sext i32 %31 to i64, !dbg !18 + %38 = getelementptr i64, ptr addrspace(1) %0, i64 %37, !dbg !18 + %39 = sext i32 %32 to i64, !dbg !18 + %40 = getelementptr i64, ptr addrspace(1) %0, i64 %39, !dbg !18 + %41 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %34, i1 %20) #6, !dbg !19 + %42 = extractvalue { i64, i64 } %41, 0, !dbg !19 + %43 = extractvalue { i64, i64 } %41, 1, !dbg !19 + %44 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %36, i1 %20) #6, !dbg !19 + %45 = extractvalue { i64, i64 } %44, 0, !dbg !19 + %46 = extractvalue { i64, i64 } %44, 1, !dbg !19 + %47 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %38, i1 %21) #6, !dbg !19 + %48 = extractvalue { i64, i64 } %47, 0, !dbg !19 + %49 = extractvalue { i64, i64 } %47, 1, !dbg !19 + %50 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %40, i1 %21) #6, !dbg !19 + %51 = extractvalue { i64, i64 } %50, 0, !dbg !19 + %52 = extractvalue { i64, i64 } %50, 1, !dbg !19 + %53 = and i32 %14, 1, !dbg !20 + %54 = lshr i32 %14, 1, !dbg !20 + %.lobit = and i32 %54, 1, !dbg !20 + %55 = xor i32 %53, 1, !dbg !24 + %56 = xor i32 %.lobit, 1, !dbg !24 + %57 = xor i32 %24, %23, !dbg !25 + %58 = xor i32 %25, %26, !dbg !25 + %59 = trunc i32 %14 to i1, !dbg !26 + %60 = trunc i32 %54 to i1, !dbg !26 + %61 = icmp eq i64 %42, 16384, !dbg !27 + %62 = icmp eq i64 %43, 16384, !dbg !27 + %63 = icmp eq i64 %45, 16384, !dbg !27 + %64 = icmp eq i64 %46, 16384, !dbg !27 + %65 = icmp eq i64 %48, 16384, !dbg !27 + %66 = icmp eq i64 %49, 16384, !dbg !27 + %67 = icmp eq i64 %51, 16384, !dbg !27 + %68 = icmp eq i64 %52, 16384, !dbg !27 + %69 = zext i1 %61 to i32, !dbg !28 + %70 = zext i1 %62 to i32, !dbg !28 + %71 = zext i1 %63 to i32, !dbg !28 + %72 = zext i1 %64 to i32, !dbg !28 + %73 = zext i1 %65 to i32, !dbg !28 + %74 = zext i1 %66 to i32, !dbg !28 + %75 = zext i1 %67 to i32, !dbg !28 + %76 = zext i1 %68 to i32, !dbg !28 + %77 = xor i1 %61, true, !dbg !29 + %78 = and i1 %62, %77, !dbg !29 + %79 = xor i1 %65, true, !dbg !29 + %80 = and i1 %66, %79, !dbg !29 + %.not26 = xor i1 %64, true, !dbg !31 + %81 = or i1 %63, %.not26, !dbg !31 + %.not27 = xor i1 %68, true, !dbg !31 + %82 = or i1 %67, %.not27, !dbg !31 + %83 = xor i32 %69, %70, !dbg !32 + %84 = xor i32 %71, %72, !dbg !32 + %85 = xor i32 %73, %74, !dbg !32 + %86 = xor i32 %75, %76, !dbg !32 + %87 = select i1 %78, i32 %83, i32 0, !dbg !33 + %88 = select i1 %81, i32 %84, i32 0, !dbg !33 + %89 = select i1 %80, i32 %85, i32 0, !dbg !33 + %90 = select i1 %82, i32 %86, i32 0, !dbg !33 + %91 = xor i32 %87, %69, !dbg !34 + %92 = xor i32 %87, %70, !dbg !34 + %93 = xor i32 %88, %71, !dbg !34 + %94 = xor i32 %88, %72, !dbg !34 + %95 = xor i32 %89, %73, !dbg !34 + %96 = xor i32 %89, %74, !dbg !34 + %97 = xor i32 %90, %75, !dbg !34 + %98 = xor i32 %90, %76, !dbg !34 + %99 = select i1 %78, i32 %57, i32 0, !dbg !35 + %100 = select i1 %81, i32 %58, i32 0, !dbg !35 + %101 = select i1 %80, i32 %57, i32 0, !dbg !35 + %102 = select i1 %82, i32 %58, i32 0, !dbg !35 + %103 = xor i32 %99, %23, !dbg !36 + %104 = xor i32 %99, %24, !dbg !36 + %105 = xor i32 %100, %25, !dbg !36 + %106 = xor i32 %100, %26, !dbg !36 + %107 = xor i32 %101, %23, !dbg !36 + %108 = xor i32 %101, %24, !dbg !36 + %109 = xor i32 %102, %25, !dbg !36 + %110 = xor i32 %102, %26, !dbg !36 + %111 = icmp samesign uge i32 %91, %93, !dbg !37 + %112 = icmp ne i32 %91, %93, !dbg !37 + %113 = icmp samesign ule i32 %103, %105, !dbg !37 + %114 = or i1 %113, %112, !dbg !37 + %115 = and i1 %111, %114, !dbg !37 + %.not28 = xor i1 %115, %59, !dbg !37 + %116 = icmp samesign uge i32 %92, %94, !dbg !37 + %117 = icmp ne i32 %92, %94, !dbg !37 + %118 = icmp samesign ule i32 %104, %106, !dbg !37 + %119 = or i1 %118, %117, !dbg !37 + %120 = and i1 %116, %119, !dbg !37 + %.not29 = xor i1 %120, %59, !dbg !37 + %121 = icmp samesign uge i32 %95, %97, !dbg !37 + %122 = icmp ne i32 %95, %97, !dbg !37 + %123 = icmp samesign ule i32 %107, %109, !dbg !37 + %124 = or i1 %123, %122, !dbg !37 + %125 = and i1 %121, %124, !dbg !37 + %.not30 = xor i1 %125, %59, !dbg !37 + %126 = icmp samesign uge i32 %96, %98, !dbg !37 + %127 = icmp ne i32 %96, %98, !dbg !37 + %128 = icmp samesign ule i32 %108, %110, !dbg !37 + %129 = or i1 %128, %127, !dbg !37 + %130 = and i1 %126, %129, !dbg !37 + %.not31 = xor i1 %130, %59, !dbg !37 + %131 = xor i32 %93, %91, !dbg !32 + %132 = xor i32 %94, %92, !dbg !32 + %133 = xor i32 %97, %95, !dbg !32 + %134 = xor i32 %98, %96, !dbg !32 + %135 = select i1 %.not28, i32 0, i32 %131, !dbg !33 + %136 = select i1 %.not29, i32 0, i32 %132, !dbg !33 + %137 = select i1 %.not30, i32 0, i32 %133, !dbg !33 + %138 = select i1 %.not31, i32 0, i32 %134, !dbg !33 + %139 = xor i32 %135, %91, !dbg !34 + %140 = xor i32 %136, %92, !dbg !34 + %141 = xor i32 %135, %93, !dbg !34 + %142 = xor i32 %136, %94, !dbg !34 + %143 = xor i32 %137, %95, !dbg !34 + %144 = xor i32 %138, %96, !dbg !34 + %145 = xor i32 %137, %97, !dbg !34 + %146 = xor i32 %138, %98, !dbg !34 + %147 = xor i32 %105, %103, !dbg !38 + %148 = xor i32 %106, %104, !dbg !38 + %149 = xor i32 %109, %107, !dbg !38 + %150 = xor i32 %110, %108, !dbg !38 + %151 = select i1 %.not28, i32 0, i32 %147, !dbg !35 + %152 = select i1 %.not29, i32 0, i32 %148, !dbg !35 + %153 = select i1 %.not30, i32 0, i32 %149, !dbg !35 + %154 = select i1 %.not31, i32 0, i32 %150, !dbg !35 + %155 = xor i32 %151, %103, !dbg !36 + %156 = xor i32 %152, %104, !dbg !36 + %157 = xor i32 %151, %105, !dbg !36 + %158 = xor i32 %152, %106, !dbg !36 + %159 = xor i32 %153, %107, !dbg !36 + %160 = xor i32 %154, %108, !dbg !36 + %161 = xor i32 %153, %109, !dbg !36 + %162 = xor i32 %154, %110, !dbg !36 + %163 = icmp samesign uge i32 %139, %140, !dbg !37 + %164 = icmp ne i32 %139, %140, !dbg !37 + %165 = icmp samesign ule i32 %155, %156, !dbg !37 + %166 = or i1 %164, %165, !dbg !37 + %167 = and i1 %163, %166, !dbg !37 + %.not32 = xor i1 %167, %59, !dbg !37 + %168 = icmp samesign uge i32 %141, %142, !dbg !37 + %169 = icmp ne i32 %141, %142, !dbg !37 + %170 = icmp samesign ule i32 %157, %158, !dbg !37 + %171 = or i1 %169, %170, !dbg !37 + %172 = and i1 %168, %171, !dbg !37 + %.not33 = xor i1 %172, %59, !dbg !37 + %173 = icmp samesign uge i32 %143, %144, !dbg !37 + %174 = icmp ne i32 %143, %144, !dbg !37 + %175 = icmp samesign ule i32 %159, %160, !dbg !37 + %176 = or i1 %174, %175, !dbg !37 + %177 = and i1 %173, %176, !dbg !37 + %.not34 = xor i1 %177, %59, !dbg !37 + %178 = icmp samesign uge i32 %145, %146, !dbg !37 + %179 = icmp ne i32 %145, %146, !dbg !37 + %180 = icmp samesign ule i32 %161, %162, !dbg !37 + %181 = or i1 %179, %180, !dbg !37 + %182 = and i1 %178, %181, !dbg !37 + %.not35 = xor i1 %182, %59, !dbg !37 + %183 = xor i32 %139, %140, !dbg !32 + %184 = xor i32 %141, %142, !dbg !32 + %185 = xor i32 %143, %144, !dbg !32 + %186 = xor i32 %145, %146, !dbg !32 + %187 = select i1 %.not32, i32 0, i32 %183, !dbg !33 + %188 = select i1 %.not33, i32 0, i32 %184, !dbg !33 + %189 = select i1 %.not34, i32 0, i32 %185, !dbg !33 + %190 = select i1 %.not35, i32 0, i32 %186, !dbg !33 + %191 = xor i32 %187, %139, !dbg !34 + %192 = xor i32 %187, %140, !dbg !34 + %193 = xor i32 %188, %141, !dbg !34 + %194 = xor i32 %188, %142, !dbg !34 + %195 = xor i32 %189, %143, !dbg !34 + %196 = xor i32 %189, %144, !dbg !34 + %197 = xor i32 %190, %145, !dbg !34 + %198 = xor i32 %190, %146, !dbg !34 + %199 = xor i32 %155, %156, !dbg !38 + %200 = xor i32 %157, %158, !dbg !38 + %201 = xor i32 %159, %160, !dbg !38 + %202 = xor i32 %161, %162, !dbg !38 + %203 = select i1 %.not32, i32 0, i32 %199, !dbg !35 + %204 = select i1 %.not33, i32 0, i32 %200, !dbg !35 + %205 = select i1 %.not34, i32 0, i32 %201, !dbg !35 + %206 = select i1 %.not35, i32 0, i32 %202, !dbg !35 + %207 = mul nuw nsw i32 %191, %55, !dbg !39 + %208 = mul nuw nsw i32 %192, %55, !dbg !39 + %209 = mul nuw nsw i32 %193, %55, !dbg !39 + %210 = mul nuw nsw i32 %194, %55, !dbg !39 + %211 = mul nuw nsw i32 %195, %55, !dbg !39 + %212 = mul nuw nsw i32 %196, %55, !dbg !39 + %213 = mul nuw nsw i32 %197, %55, !dbg !39 + %214 = mul nuw nsw i32 %198, %55, !dbg !39 + %215 = mul nuw nsw i32 %191, %53, !dbg !40 + %216 = mul nuw nsw i32 %192, %53, !dbg !40 + %217 = mul nuw nsw i32 %193, %53, !dbg !40 + %218 = mul nuw nsw i32 %194, %53, !dbg !40 + %219 = mul nuw nsw i32 %195, %53, !dbg !40 + %220 = mul nuw nsw i32 %196, %53, !dbg !40 + %221 = mul nuw nsw i32 %197, %53, !dbg !40 + %222 = mul nuw nsw i32 %198, %53, !dbg !40 + %223 = insertelement <4 x i32> poison, i32 %204, i64 0, !dbg !36 + %224 = insertelement <4 x i32> %223, i32 %203, i64 1, !dbg !36 + %225 = shufflevector <4 x i32> %224, <4 x i32> poison, <4 x i32> , !dbg !36 + %226 = insertelement <4 x i32> poison, i32 %155, i64 0, !dbg !36 + %227 = insertelement <4 x i32> %226, i32 %156, i64 1, !dbg !36 + %228 = insertelement <4 x i32> %227, i32 %157, i64 2, !dbg !36 + %229 = insertelement <4 x i32> %228, i32 %158, i64 3, !dbg !36 + %230 = xor <4 x i32> %225, %229, !dbg !36 + %231 = extractelement <4 x i32> %230, i64 0, !dbg !41 + %232 = mul nuw nsw i32 %231, %55, !dbg !42 + %233 = extractelement <4 x i32> %230, i64 1, !dbg !41 + %234 = mul nuw nsw i32 %233, %55, !dbg !42 + %235 = extractelement <4 x i32> %230, i64 2, !dbg !41 + %236 = mul nuw nsw i32 %235, %55, !dbg !42 + %237 = extractelement <4 x i32> %230, i64 3, !dbg !41 + %238 = mul nuw nsw i32 %237, %55, !dbg !42 + %239 = mul nuw nsw i32 %231, %53, !dbg !41 + %240 = mul nuw nsw i32 %233, %53, !dbg !41 + %241 = mul nuw nsw i32 %235, %53, !dbg !41 + %242 = mul nuw nsw i32 %237, %53, !dbg !41 + %243 = insertelement <2 x i1> poison, i1 %60, i64 0, !dbg !37 + %244 = shufflevector <2 x i1> %243, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !37 + %245 = insertelement <4 x i32> poison, i32 %56, i64 0, !dbg !42 + %246 = shufflevector <4 x i32> %245, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !42 + %247 = insertelement <4 x i32> poison, i32 %.lobit, i64 0, !dbg !41 + %248 = shufflevector <4 x i32> %247, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !41 + %249 = insertelement <4 x i32> poison, i32 %55, i64 0, !dbg !39 + %250 = shufflevector <4 x i32> %249, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !39 + %251 = insertelement <4 x i32> poison, i32 %53, i64 0, !dbg !40 + %252 = shufflevector <4 x i32> %251, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !40 + %253 = insertelement <4 x i32> poison, i32 %206, i64 0, !dbg !36 + %254 = insertelement <4 x i32> %253, i32 %205, i64 1, !dbg !36 + %255 = shufflevector <4 x i32> %254, <4 x i32> poison, <4 x i32> , !dbg !36 + %256 = insertelement <4 x i32> poison, i32 %159, i64 0, !dbg !36 + %257 = insertelement <4 x i32> %256, i32 %160, i64 1, !dbg !36 + %258 = insertelement <4 x i32> %257, i32 %161, i64 2, !dbg !36 + %259 = insertelement <4 x i32> %258, i32 %162, i64 3, !dbg !36 + %260 = xor <4 x i32> %255, %259, !dbg !36 + %261 = extractelement <4 x i32> %260, i64 0, !dbg !41 + %262 = mul nuw nsw i32 %261, %55, !dbg !42 + %263 = extractelement <4 x i32> %260, i64 1, !dbg !41 + %264 = mul nuw nsw i32 %263, %55, !dbg !42 + %265 = extractelement <4 x i32> %260, i64 2, !dbg !41 + %266 = mul nuw nsw i32 %265, %55, !dbg !42 + %267 = extractelement <4 x i32> %260, i64 3, !dbg !41 + %268 = mul nuw nsw i32 %267, %55, !dbg !42 + %269 = mul nuw nsw i32 %261, %53, !dbg !41 + %270 = mul nuw nsw i32 %263, %53, !dbg !41 + %271 = mul nuw nsw i32 %265, %53, !dbg !41 + %272 = mul nuw nsw i32 %267, %53, !dbg !41 + %273 = insertelement <4 x i64> poison, i64 %42, i64 0, !dbg !43 + %274 = insertelement <4 x i64> %273, i64 %43, i64 1, !dbg !43 + %275 = insertelement <4 x i64> %274, i64 %45, i64 2, !dbg !43 + %276 = insertelement <4 x i64> %275, i64 %46, i64 3, !dbg !43 + %277 = add <4 x i64> %276, splat (i64 -1), !dbg !43 + %278 = icmp ult <4 x i64> %277, splat (i64 16383), !dbg !43 + %279 = extractelement <4 x i1> %278, i64 0, !dbg !44 + %280 = zext i1 %279 to i32, !dbg !28 + %281 = extractelement <4 x i1> %278, i64 1, !dbg !44 + %282 = zext i1 %281 to i32, !dbg !28 + %283 = extractelement <4 x i1> %278, i64 2, !dbg !45 + %284 = zext i1 %283 to i32, !dbg !28 + %285 = extractelement <4 x i1> %278, i64 3, !dbg !45 + %286 = zext i1 %285 to i32, !dbg !28 + %287 = xor i1 %279, true, !dbg !44 + %288 = and i1 %281, %287, !dbg !44 + %.not = xor i1 %285, true, !dbg !45 + %289 = or i1 %283, %.not, !dbg !45 + %290 = xor i32 %280, %282, !dbg !46 + %291 = xor i32 %284, %286, !dbg !46 + %292 = select i1 %288, i32 %290, i32 0, !dbg !47 + %293 = select i1 %289, i32 %291, i32 0, !dbg !47 + %294 = xor i32 %292, %280, !dbg !48 + %295 = xor i32 %292, %282, !dbg !48 + %296 = xor i32 %293, %284, !dbg !48 + %297 = xor i32 %293, %286, !dbg !48 + %298 = select i1 %288, i32 %57, i32 0, !dbg !49 + %299 = select i1 %289, i32 %58, i32 0, !dbg !49 + %300 = xor i32 %298, %23, !dbg !50 + %301 = xor i32 %298, %24, !dbg !50 + %302 = xor i32 %299, %25, !dbg !50 + %303 = xor i32 %299, %26, !dbg !50 + %304 = insertelement <2 x i32> poison, i32 %294, i64 0, !dbg !26 + %305 = insertelement <2 x i32> %304, i32 %295, i64 1, !dbg !26 + %306 = insertelement <2 x i32> poison, i32 %296, i64 0, !dbg !26 + %307 = insertelement <2 x i32> %306, i32 %297, i64 1, !dbg !26 + %308 = icmp samesign uge <2 x i32> %305, %307, !dbg !26 + %309 = insertelement <2 x i32> %307, i32 %295, i64 1, !dbg !26 + %310 = insertelement <2 x i32> %305, i32 %297, i64 1, !dbg !26 + %311 = icmp ne <2 x i32> %309, %310, !dbg !26 + %312 = insertelement <2 x i32> poison, i32 %300, i64 0, !dbg !26 + %313 = insertelement <2 x i32> %312, i32 %301, i64 1, !dbg !26 + %314 = insertelement <2 x i32> poison, i32 %302, i64 0, !dbg !26 + %315 = insertelement <2 x i32> %314, i32 %303, i64 1, !dbg !26 + %316 = icmp ule <2 x i32> %313, %315, !dbg !26 + %317 = or <2 x i1> %316, %311, !dbg !26 + %318 = and <2 x i1> %308, %317, !dbg !26 + %319 = insertelement <2 x i1> poison, i1 %59, i64 0, !dbg !26 + %320 = shufflevector <2 x i1> %319, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !26 + %321 = xor <2 x i1> %318, %320, !dbg !26 + %322 = xor <2 x i32> %310, %309, !dbg !46 + %323 = select <2 x i1> %321, <2 x i32> zeroinitializer, <2 x i32> %322, !dbg !47 + %324 = extractelement <2 x i32> %323, i64 0, !dbg !48 + %325 = xor i32 %324, %294, !dbg !48 + %326 = extractelement <2 x i32> %323, i64 1, !dbg !48 + %327 = xor i32 %326, %295, !dbg !48 + %328 = xor <2 x i32> %323, %307, !dbg !48 + %329 = xor i32 %302, %300, !dbg !25 + %330 = xor i32 %303, %301, !dbg !25 + %331 = extractelement <2 x i1> %321, i64 0, !dbg !49 + %332 = select i1 %331, i32 0, i32 %329, !dbg !49 + %333 = extractelement <2 x i1> %321, i64 1, !dbg !49 + %334 = select i1 %333, i32 0, i32 %330, !dbg !49 + %335 = xor i32 %332, %300, !dbg !50 + %336 = xor i32 %334, %301, !dbg !50 + %337 = xor i32 %332, %302, !dbg !50 + %338 = xor i32 %334, %303, !dbg !50 + %339 = icmp samesign uge i32 %325, %327, !dbg !26 + %340 = icmp ne i32 %325, %327, !dbg !26 + %341 = icmp samesign ule i32 %335, %336, !dbg !26 + %342 = or i1 %340, %341, !dbg !26 + %343 = and i1 %339, %342, !dbg !26 + %.not6 = xor i1 %343, %59, !dbg !26 + %344 = extractelement <2 x i32> %328, i64 0, !dbg !26 + %345 = extractelement <2 x i32> %328, i64 1, !dbg !26 + %346 = icmp samesign uge i32 %344, %345, !dbg !26 + %347 = icmp ne i32 %344, %345, !dbg !26 + %348 = icmp samesign ule i32 %337, %338, !dbg !26 + %349 = or i1 %347, %348, !dbg !26 + %350 = and i1 %346, %349, !dbg !26 + %.not7 = xor i1 %350, %59, !dbg !26 + %351 = xor i32 %325, %327, !dbg !46 + %352 = xor i32 %344, %345, !dbg !46 + %353 = select i1 %.not6, i32 0, i32 %351, !dbg !47 + %354 = select i1 %.not7, i32 0, i32 %352, !dbg !47 + %355 = xor i32 %353, %325, !dbg !48 + %356 = xor i32 %353, %327, !dbg !48 + %357 = xor i32 %354, %344, !dbg !48 + %358 = xor i32 %354, %345, !dbg !48 + %359 = xor i32 %335, %336, !dbg !25 + %360 = xor i32 %337, %338, !dbg !25 + %361 = select i1 %.not6, i32 0, i32 %359, !dbg !49 + %362 = select i1 %.not7, i32 0, i32 %360, !dbg !49 + %363 = xor i32 %361, %335, !dbg !50 + %364 = xor i32 %361, %336, !dbg !50 + %365 = xor i32 %362, %337, !dbg !50 + %366 = xor i32 %362, %338, !dbg !50 + %367 = mul nuw nsw i32 %355, %55, !dbg !51 + %368 = mul nuw nsw i32 %356, %55, !dbg !51 + %369 = mul nuw nsw i32 %357, %55, !dbg !51 + %370 = mul nuw nsw i32 %358, %55, !dbg !51 + %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !52 + %372 = add i32 %367, %371, !dbg !55 + %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 1, i32 31), !dbg !52 + %374 = add i32 %368, %373, !dbg !55 + %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 1, i32 31), !dbg !52 + %376 = add i32 %369, %375, !dbg !55 + %377 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 1, i32 31), !dbg !52 + %378 = add i32 %370, %377, !dbg !55 + %379 = mul nuw nsw i32 %355, %53, !dbg !56 + %380 = mul nuw nsw i32 %356, %53, !dbg !56 + %381 = mul nuw nsw i32 %357, %53, !dbg !56 + %382 = mul nuw nsw i32 %358, %53, !dbg !56 + %383 = mul nuw nsw i32 %363, %55, !dbg !57 + %384 = mul nuw nsw i32 %364, %55, !dbg !57 + %385 = mul nuw nsw i32 %365, %55, !dbg !57 + %386 = mul nuw nsw i32 %366, %55, !dbg !57 + %387 = mul nuw nsw i32 %363, %53, !dbg !58 + %388 = mul nuw nsw i32 %364, %53, !dbg !58 + %389 = mul nuw nsw i32 %365, %53, !dbg !58 + %390 = mul nuw nsw i32 %366, %53, !dbg !58 + %391 = insertelement <4 x i1> poison, i1 %20, i64 0, !dbg !59 + %392 = shufflevector <4 x i1> %391, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !59 + %393 = select <4 x i1> %392, <4 x i1> %278, <4 x i1> zeroinitializer, !dbg !59 + %394 = insertelement <4 x i64> poison, i64 %48, i64 0, !dbg !43 + %395 = insertelement <4 x i64> %394, i64 %49, i64 1, !dbg !43 + %396 = insertelement <4 x i64> %395, i64 %51, i64 2, !dbg !43 + %397 = insertelement <4 x i64> %396, i64 %52, i64 3, !dbg !43 + %398 = add <4 x i64> %397, splat (i64 -1), !dbg !43 + %399 = icmp ult <4 x i64> %398, splat (i64 16383), !dbg !43 + %400 = extractelement <4 x i1> %399, i64 0, !dbg !44 + %401 = zext i1 %400 to i32, !dbg !28 + %402 = extractelement <4 x i1> %399, i64 1, !dbg !44 + %403 = zext i1 %402 to i32, !dbg !28 + %404 = extractelement <4 x i1> %399, i64 2, !dbg !45 + %405 = zext i1 %404 to i32, !dbg !28 + %406 = extractelement <4 x i1> %399, i64 3, !dbg !45 + %407 = zext i1 %406 to i32, !dbg !28 + %408 = xor i1 %400, true, !dbg !44 + %409 = and i1 %402, %408, !dbg !44 + %.not1 = xor i1 %406, true, !dbg !45 + %410 = or i1 %404, %.not1, !dbg !45 + %411 = xor i32 %401, %403, !dbg !46 + %412 = xor i32 %405, %407, !dbg !46 + %413 = select i1 %409, i32 %411, i32 0, !dbg !47 + %414 = select i1 %410, i32 %412, i32 0, !dbg !47 + %415 = xor i32 %413, %401, !dbg !48 + %416 = xor i32 %413, %403, !dbg !48 + %417 = xor i32 %414, %405, !dbg !48 + %418 = xor i32 %414, %407, !dbg !48 + %419 = select i1 %409, i32 %57, i32 0, !dbg !49 + %420 = select i1 %410, i32 %58, i32 0, !dbg !49 + %421 = xor i32 %419, %23, !dbg !50 + %422 = xor i32 %419, %24, !dbg !50 + %423 = xor i32 %420, %25, !dbg !50 + %424 = xor i32 %420, %26, !dbg !50 + %425 = insertelement <2 x i32> poison, i32 %415, i64 0, !dbg !26 + %426 = insertelement <2 x i32> %425, i32 %416, i64 1, !dbg !26 + %427 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !26 + %428 = insertelement <2 x i32> %427, i32 %418, i64 1, !dbg !26 + %429 = icmp samesign uge <2 x i32> %426, %428, !dbg !26 + %430 = insertelement <2 x i32> %428, i32 %416, i64 1, !dbg !26 + %431 = insertelement <2 x i32> %426, i32 %418, i64 1, !dbg !26 + %432 = icmp ne <2 x i32> %430, %431, !dbg !26 + %433 = insertelement <2 x i32> poison, i32 %421, i64 0, !dbg !26 + %434 = insertelement <2 x i32> %433, i32 %422, i64 1, !dbg !26 + %435 = insertelement <2 x i32> poison, i32 %423, i64 0, !dbg !26 + %436 = insertelement <2 x i32> %435, i32 %424, i64 1, !dbg !26 + %437 = icmp ule <2 x i32> %434, %436, !dbg !26 + %438 = or <2 x i1> %437, %432, !dbg !26 + %439 = and <2 x i1> %429, %438, !dbg !26 + %440 = xor <2 x i1> %439, %320, !dbg !26 + %441 = xor <2 x i32> %431, %430, !dbg !46 + %442 = select <2 x i1> %440, <2 x i32> zeroinitializer, <2 x i32> %441, !dbg !47 + %443 = extractelement <2 x i32> %442, i64 0, !dbg !48 + %444 = xor i32 %443, %415, !dbg !48 + %445 = extractelement <2 x i32> %442, i64 1, !dbg !48 + %446 = xor i32 %445, %416, !dbg !48 + %447 = xor <2 x i32> %442, %428, !dbg !48 + %448 = xor i32 %423, %421, !dbg !25 + %449 = xor i32 %424, %422, !dbg !25 + %450 = extractelement <2 x i1> %440, i64 0, !dbg !49 + %451 = select i1 %450, i32 0, i32 %448, !dbg !49 + %452 = extractelement <2 x i1> %440, i64 1, !dbg !49 + %453 = select i1 %452, i32 0, i32 %449, !dbg !49 + %454 = xor i32 %451, %421, !dbg !50 + %455 = xor i32 %453, %422, !dbg !50 + %456 = xor i32 %451, %423, !dbg !50 + %457 = xor i32 %453, %424, !dbg !50 + %458 = icmp samesign uge i32 %444, %446, !dbg !26 + %459 = icmp ne i32 %444, %446, !dbg !26 + %460 = icmp samesign ule i32 %454, %455, !dbg !26 + %461 = or i1 %459, %460, !dbg !26 + %462 = and i1 %458, %461, !dbg !26 + %.not8 = xor i1 %462, %59, !dbg !26 + %463 = extractelement <2 x i32> %447, i64 0, !dbg !26 + %464 = extractelement <2 x i32> %447, i64 1, !dbg !26 + %465 = icmp samesign uge i32 %463, %464, !dbg !26 + %466 = icmp ne i32 %463, %464, !dbg !26 + %467 = icmp samesign ule i32 %456, %457, !dbg !26 + %468 = or i1 %466, %467, !dbg !26 + %469 = and i1 %465, %468, !dbg !26 + %.not9 = xor i1 %469, %59, !dbg !26 + %470 = xor i32 %444, %446, !dbg !46 + %471 = xor i32 %463, %464, !dbg !46 + %472 = select i1 %.not8, i32 0, i32 %470, !dbg !47 + %473 = select i1 %.not9, i32 0, i32 %471, !dbg !47 + %474 = xor i32 %472, %444, !dbg !48 + %475 = xor i32 %472, %446, !dbg !48 + %476 = xor i32 %473, %463, !dbg !48 + %477 = xor i32 %473, %464, !dbg !48 + %478 = xor i32 %454, %455, !dbg !25 + %479 = xor i32 %456, %457, !dbg !25 + %480 = select i1 %.not8, i32 0, i32 %478, !dbg !49 + %481 = select i1 %.not9, i32 0, i32 %479, !dbg !49 + %482 = xor i32 %480, %454, !dbg !50 + %483 = xor i32 %480, %455, !dbg !50 + %484 = xor i32 %481, %456, !dbg !50 + %485 = xor i32 %481, %457, !dbg !50 + %486 = mul nuw nsw i32 %474, %55, !dbg !51 + %487 = mul nuw nsw i32 %475, %55, !dbg !51 + %488 = mul nuw nsw i32 %476, %55, !dbg !51 + %489 = mul nuw nsw i32 %477, %55, !dbg !51 + %490 = mul nuw nsw i32 %474, %53, !dbg !56 + %491 = mul nuw nsw i32 %475, %53, !dbg !56 + %492 = mul nuw nsw i32 %476, %53, !dbg !56 + %493 = mul nuw nsw i32 %477, %53, !dbg !56 + %494 = mul nuw nsw i32 %482, %55, !dbg !57 + %495 = mul nuw nsw i32 %483, %55, !dbg !57 + %496 = mul nuw nsw i32 %484, %55, !dbg !57 + %497 = mul nuw nsw i32 %485, %55, !dbg !57 + %498 = mul nuw nsw i32 %482, %53, !dbg !58 + %499 = mul nuw nsw i32 %483, %53, !dbg !58 + %500 = mul nuw nsw i32 %484, %53, !dbg !58 + %501 = mul nuw nsw i32 %485, %53, !dbg !58 + %502 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %486, i32 1, i32 31), !dbg !52 + %503 = add i32 %486, %502, !dbg !55 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !52 + %505 = add i32 %487, %504, !dbg !55 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 1, i32 31), !dbg !52 + %507 = add i32 %488, %506, !dbg !55 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 1, i32 31), !dbg !52 + %509 = add i32 %489, %508, !dbg !55 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 1, i32 31), !dbg !52 + %511 = add i32 %379, %510, !dbg !55 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %380, i32 1, i32 31), !dbg !52 + %513 = add i32 %380, %512, !dbg !55 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %381, i32 1, i32 31), !dbg !52 + %515 = add i32 %381, %514, !dbg !55 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !52 + %517 = add i32 %382, %516, !dbg !55 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %490, i32 1, i32 31), !dbg !52 + %519 = add i32 %490, %518, !dbg !55 + %520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 1, i32 31), !dbg !52 + %521 = add i32 %491, %520, !dbg !55 + %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 1, i32 31), !dbg !52 + %523 = add i32 %492, %522, !dbg !55 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 1, i32 31), !dbg !52 + %525 = add i32 %493, %524, !dbg !55 + %526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !52 + %527 = add i32 %383, %526, !dbg !55 + %528 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %384, i32 1, i32 31), !dbg !52 + %529 = add i32 %384, %528, !dbg !55 + %530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 1, i32 31), !dbg !52 + %531 = add i32 %530, %385, !dbg !55 + %532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 1, i32 31), !dbg !52 + %533 = add i32 %532, %386, !dbg !55 + %534 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %494, i32 1, i32 31), !dbg !52 + %535 = add i32 %534, %494, !dbg !55 + %536 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 1, i32 31), !dbg !52 + %537 = add i32 %536, %495, !dbg !55 + %538 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 1, i32 31), !dbg !52 + %539 = add i32 %538, %496, !dbg !55 + %540 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 1, i32 31), !dbg !52 + %541 = add i32 %540, %497, !dbg !55 + %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 1, i32 31), !dbg !52 + %543 = add i32 %542, %387, !dbg !55 + %544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %388, i32 1, i32 31), !dbg !52 + %545 = add i32 %544, %388, !dbg !55 + %546 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %389, i32 1, i32 31), !dbg !52 + %547 = add i32 %546, %389, !dbg !55 + %548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 1, i32 31), !dbg !52 + %549 = add i32 %548, %390, !dbg !55 + %550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 1, i32 31), !dbg !52 + %551 = add i32 %550, %498, !dbg !55 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 1, i32 31), !dbg !52 + %553 = add i32 %552, %499, !dbg !55 + %554 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 1, i32 31), !dbg !52 + %555 = add i32 %554, %500, !dbg !55 + %556 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %501, i32 1, i32 31), !dbg !52 + %557 = add i32 %556, %501, !dbg !55 + %558 = icmp sge i32 %372, %511, !dbg !26 + %559 = icmp ne i32 %372, %511, !dbg !26 + %560 = icmp sle i32 %527, %543, !dbg !26 + %561 = or i1 %559, %560, !dbg !26 + %562 = and i1 %558, %561, !dbg !26 + %.not10 = xor i1 %562, %60, !dbg !26 + %563 = icmp sge i32 %374, %513, !dbg !26 + %564 = icmp ne i32 %374, %513, !dbg !26 + %565 = icmp sle i32 %529, %545, !dbg !26 + %566 = or i1 %564, %565, !dbg !26 + %567 = and i1 %563, %566, !dbg !26 + %.not11 = xor i1 %567, %60, !dbg !26 + %568 = icmp sge i32 %376, %515, !dbg !26 + %569 = icmp ne i32 %376, %515, !dbg !26 + %570 = icmp sle i32 %531, %547, !dbg !26 + %571 = or i1 %569, %570, !dbg !26 + %572 = and i1 %568, %571, !dbg !26 + %.not12 = xor i1 %572, %60, !dbg !26 + %573 = icmp sge i32 %378, %517, !dbg !26 + %574 = icmp ne i32 %378, %517, !dbg !26 + %575 = icmp sle i32 %533, %549, !dbg !26 + %576 = or i1 %574, %575, !dbg !26 + %577 = and i1 %573, %576, !dbg !26 + %.not13 = xor i1 %577, %60, !dbg !26 + %578 = icmp sge i32 %503, %519, !dbg !26 + %579 = icmp ne i32 %503, %519, !dbg !26 + %580 = icmp sle i32 %535, %551, !dbg !26 + %581 = or i1 %579, %580, !dbg !26 + %582 = and i1 %578, %581, !dbg !26 + %.not14 = xor i1 %582, %60, !dbg !26 + %583 = icmp sge i32 %505, %521, !dbg !26 + %584 = icmp ne i32 %505, %521, !dbg !26 + %585 = icmp sle i32 %537, %553, !dbg !26 + %586 = or i1 %584, %585, !dbg !26 + %587 = and i1 %583, %586, !dbg !26 + %.not15 = xor i1 %587, %60, !dbg !26 + %588 = icmp sge i32 %507, %523, !dbg !26 + %589 = icmp ne i32 %507, %523, !dbg !26 + %590 = icmp sle i32 %539, %555, !dbg !26 + %591 = or i1 %589, %590, !dbg !26 + %592 = and i1 %588, %591, !dbg !26 + %.not16 = xor i1 %592, %60, !dbg !26 + %593 = icmp sge i32 %509, %525, !dbg !26 + %594 = icmp ne i32 %509, %525, !dbg !26 + %595 = icmp sle i32 %541, %557, !dbg !26 + %596 = or i1 %594, %595, !dbg !26 + %597 = and i1 %593, %596, !dbg !26 + %.not17 = xor i1 %597, %60, !dbg !26 + %598 = xor i32 %372, %511, !dbg !46 + %599 = xor i32 %374, %513, !dbg !46 + %600 = xor i32 %376, %515, !dbg !46 + %601 = xor i32 %378, %517, !dbg !46 + %602 = xor i32 %503, %519, !dbg !46 + %603 = xor i32 %505, %521, !dbg !46 + %604 = xor i32 %507, %523, !dbg !46 + %605 = xor i32 %509, %525, !dbg !46 + %606 = select i1 %.not10, i32 0, i32 %598, !dbg !47 + %607 = select i1 %.not11, i32 0, i32 %599, !dbg !47 + %608 = select i1 %.not12, i32 0, i32 %600, !dbg !47 + %609 = select i1 %.not13, i32 0, i32 %601, !dbg !47 + %610 = select i1 %.not14, i32 0, i32 %602, !dbg !47 + %611 = select i1 %.not15, i32 0, i32 %603, !dbg !47 + %612 = select i1 %.not16, i32 0, i32 %604, !dbg !47 + %613 = select i1 %.not17, i32 0, i32 %605, !dbg !47 + %614 = xor i32 %606, %355, !dbg !48 + %615 = xor i32 %607, %356, !dbg !48 + %616 = xor i32 %608, %357, !dbg !48 + %617 = xor i32 %609, %358, !dbg !48 + %618 = xor i32 %610, %474, !dbg !48 + %619 = xor i32 %611, %475, !dbg !48 + %620 = xor i32 %612, %476, !dbg !48 + %621 = xor i32 %613, %477, !dbg !48 + %622 = xor i32 %543, %527, !dbg !25 + %623 = xor i32 %545, %529, !dbg !25 + %624 = xor i32 %547, %531, !dbg !25 + %625 = xor i32 %549, %533, !dbg !25 + %626 = xor i32 %551, %535, !dbg !25 + %627 = xor i32 %553, %537, !dbg !25 + %628 = xor i32 %555, %539, !dbg !25 + %629 = xor i32 %557, %541, !dbg !25 + %630 = select i1 %.not10, i32 0, i32 %622, !dbg !49 + %631 = select i1 %.not11, i32 0, i32 %623, !dbg !49 + %632 = select i1 %.not12, i32 0, i32 %624, !dbg !49 + %633 = select i1 %.not13, i32 0, i32 %625, !dbg !49 + %634 = select i1 %.not14, i32 0, i32 %626, !dbg !49 + %635 = select i1 %.not15, i32 0, i32 %627, !dbg !49 + %636 = select i1 %.not16, i32 0, i32 %628, !dbg !49 + %637 = select i1 %.not17, i32 0, i32 %629, !dbg !49 + %638 = xor i32 %630, %363, !dbg !50 + %639 = xor i32 %631, %364, !dbg !50 + %640 = xor i32 %632, %365, !dbg !50 + %641 = xor i32 %633, %366, !dbg !50 + %642 = xor i32 %634, %482, !dbg !50 + %643 = xor i32 %635, %483, !dbg !50 + %644 = xor i32 %636, %484, !dbg !50 + %645 = xor i32 %637, %485, !dbg !50 + %646 = icmp sge i32 %614, %616, !dbg !26 + %647 = icmp ne i32 %614, %616, !dbg !26 + %648 = icmp sle i32 %638, %640, !dbg !26 + %649 = or i1 %647, %648, !dbg !26 + %650 = and i1 %646, %649, !dbg !26 + %.not18 = xor i1 %650, %60, !dbg !26 + %651 = icmp sge i32 %615, %617, !dbg !26 + %652 = icmp ne i32 %615, %617, !dbg !26 + %653 = icmp sle i32 %639, %641, !dbg !26 + %654 = or i1 %652, %653, !dbg !26 + %655 = and i1 %651, %654, !dbg !26 + %.not19 = xor i1 %655, %60, !dbg !26 + %656 = icmp sge i32 %618, %620, !dbg !26 + %657 = icmp ne i32 %618, %620, !dbg !26 + %658 = icmp sle i32 %642, %644, !dbg !26 + %659 = or i1 %657, %658, !dbg !26 + %660 = and i1 %656, %659, !dbg !26 + %.not20 = xor i1 %660, %60, !dbg !26 + %661 = icmp sge i32 %619, %621, !dbg !26 + %662 = icmp ne i32 %619, %621, !dbg !26 + %663 = icmp sle i32 %643, %645, !dbg !26 + %664 = or i1 %662, %663, !dbg !26 + %665 = and i1 %661, %664, !dbg !26 + %.not21 = xor i1 %665, %60, !dbg !26 + %666 = xor i32 %616, %614, !dbg !46 + %667 = xor i32 %617, %615, !dbg !46 + %668 = xor i32 %620, %618, !dbg !46 + %669 = xor i32 %621, %619, !dbg !46 + %670 = select i1 %.not18, i32 0, i32 %666, !dbg !47 + %671 = select i1 %.not19, i32 0, i32 %667, !dbg !47 + %672 = select i1 %.not20, i32 0, i32 %668, !dbg !47 + %673 = select i1 %.not21, i32 0, i32 %669, !dbg !47 + %674 = xor i32 %670, %614, !dbg !48 + %675 = xor i32 %671, %615, !dbg !48 + %676 = xor i32 %670, %616, !dbg !48 + %677 = xor i32 %671, %617, !dbg !48 + %678 = xor i32 %672, %618, !dbg !48 + %679 = xor i32 %673, %619, !dbg !48 + %680 = xor i32 %672, %620, !dbg !48 + %681 = xor i32 %673, %621, !dbg !48 + %682 = xor i32 %640, %638, !dbg !25 + %683 = xor i32 %641, %639, !dbg !25 + %684 = xor i32 %644, %642, !dbg !25 + %685 = xor i32 %645, %643, !dbg !25 + %686 = select i1 %.not18, i32 0, i32 %682, !dbg !49 + %687 = select i1 %.not19, i32 0, i32 %683, !dbg !49 + %688 = select i1 %.not20, i32 0, i32 %684, !dbg !49 + %689 = select i1 %.not21, i32 0, i32 %685, !dbg !49 + %690 = xor i32 %686, %638, !dbg !50 + %691 = xor i32 %687, %639, !dbg !50 + %692 = xor i32 %686, %640, !dbg !50 + %693 = xor i32 %687, %641, !dbg !50 + %694 = xor i32 %688, %642, !dbg !50 + %695 = xor i32 %689, %643, !dbg !50 + %696 = xor i32 %688, %644, !dbg !50 + %697 = xor i32 %689, %645, !dbg !50 + %698 = icmp sge i32 %674, %675, !dbg !26 + %699 = icmp ne i32 %674, %675, !dbg !26 + %700 = icmp sle i32 %690, %691, !dbg !26 + %701 = or i1 %699, %700, !dbg !26 + %702 = and i1 %698, %701, !dbg !26 + %.not22 = xor i1 %702, %60, !dbg !26 + %703 = icmp sge i32 %676, %677, !dbg !26 + %704 = icmp ne i32 %676, %677, !dbg !26 + %705 = icmp sle i32 %692, %693, !dbg !26 + %706 = or i1 %704, %705, !dbg !26 + %707 = and i1 %703, %706, !dbg !26 + %.not23 = xor i1 %707, %60, !dbg !26 + %708 = icmp sge i32 %678, %679, !dbg !26 + %709 = icmp ne i32 %678, %679, !dbg !26 + %710 = icmp sle i32 %694, %695, !dbg !26 + %711 = or i1 %709, %710, !dbg !26 + %712 = and i1 %708, %711, !dbg !26 + %.not24 = xor i1 %712, %60, !dbg !26 + %713 = icmp sge i32 %680, %681, !dbg !26 + %714 = icmp ne i32 %680, %681, !dbg !26 + %715 = icmp sle i32 %696, %697, !dbg !26 + %716 = or i1 %714, %715, !dbg !26 + %717 = and i1 %713, %716, !dbg !26 + %.not25 = xor i1 %717, %60, !dbg !26 + %718 = xor i32 %675, %674, !dbg !46 + %719 = xor i32 %677, %676, !dbg !46 + %720 = xor i32 %679, %678, !dbg !46 + %721 = xor i32 %681, %680, !dbg !46 + %722 = select i1 %.not22, i32 0, i32 %718, !dbg !47 + %723 = select i1 %.not23, i32 0, i32 %719, !dbg !47 + %724 = select i1 %.not24, i32 0, i32 %720, !dbg !47 + %725 = select i1 %.not25, i32 0, i32 %721, !dbg !47 + %726 = xor i32 %722, %674, !dbg !48 + %727 = xor i32 %722, %675, !dbg !48 + %728 = xor i32 %723, %676, !dbg !48 + %729 = xor i32 %723, %677, !dbg !48 + %730 = xor i32 %724, %678, !dbg !48 + %731 = xor i32 %724, %679, !dbg !48 + %732 = xor i32 %725, %680, !dbg !48 + %733 = xor i32 %725, %681, !dbg !48 + %734 = xor i32 %691, %690, !dbg !25 + %735 = xor i32 %693, %692, !dbg !25 + %736 = xor i32 %695, %694, !dbg !25 + %737 = xor i32 %697, %696, !dbg !25 + %738 = select i1 %.not22, i32 0, i32 %734, !dbg !49 + %739 = select i1 %.not23, i32 0, i32 %735, !dbg !49 + %740 = select i1 %.not24, i32 0, i32 %736, !dbg !49 + %741 = select i1 %.not25, i32 0, i32 %737, !dbg !49 + %742 = xor i32 %738, %690, !dbg !50 + %743 = xor i32 %738, %691, !dbg !50 + %744 = xor i32 %739, %692, !dbg !50 + %745 = xor i32 %739, %693, !dbg !50 + %746 = xor i32 %740, %694, !dbg !50 + %747 = xor i32 %740, %695, !dbg !50 + %748 = xor i32 %741, %696, !dbg !50 + %749 = xor i32 %741, %697, !dbg !50 + %750 = mul nuw nsw i32 %726, %56, !dbg !51 + %751 = mul nuw nsw i32 %727, %56, !dbg !51 + %752 = mul nuw nsw i32 %728, %56, !dbg !51 + %753 = mul nuw nsw i32 %729, %56, !dbg !51 + %754 = mul nuw nsw i32 %730, %56, !dbg !51 + %755 = mul nuw nsw i32 %731, %56, !dbg !51 + %756 = mul nuw nsw i32 %732, %56, !dbg !51 + %757 = mul nuw nsw i32 %733, %56, !dbg !51 + %758 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 2, i32 31), !dbg !52 + %759 = add i32 %750, %758, !dbg !55 + %760 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %751, i32 2, i32 31), !dbg !52 + %761 = add i32 %751, %760, !dbg !55 + %762 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %752, i32 2, i32 31), !dbg !52 + %763 = add i32 %752, %762, !dbg !55 + %764 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %753, i32 2, i32 31), !dbg !52 + %765 = add i32 %753, %764, !dbg !55 + %766 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %754, i32 2, i32 31), !dbg !52 + %767 = add i32 %754, %766, !dbg !55 + %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %755, i32 2, i32 31), !dbg !52 + %769 = add i32 %755, %768, !dbg !55 + %770 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %756, i32 2, i32 31), !dbg !52 + %771 = add i32 %756, %770, !dbg !55 + %772 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %757, i32 2, i32 31), !dbg !52 + %773 = add i32 %757, %772, !dbg !55 + %774 = mul nuw nsw i32 %726, %.lobit, !dbg !56 + %775 = mul nuw nsw i32 %727, %.lobit, !dbg !56 + %776 = mul nuw nsw i32 %728, %.lobit, !dbg !56 + %777 = mul nuw nsw i32 %729, %.lobit, !dbg !56 + %778 = mul nuw nsw i32 %730, %.lobit, !dbg !56 + %779 = mul nuw nsw i32 %731, %.lobit, !dbg !56 + %780 = mul nuw nsw i32 %732, %.lobit, !dbg !56 + %781 = mul nuw nsw i32 %733, %.lobit, !dbg !56 + %782 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %774, i32 2, i32 31), !dbg !52 + %783 = add i32 %774, %782, !dbg !55 + %784 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %775, i32 2, i32 31), !dbg !52 + %785 = add i32 %775, %784, !dbg !55 + %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %776, i32 2, i32 31), !dbg !52 + %787 = add i32 %776, %786, !dbg !55 + %788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %777, i32 2, i32 31), !dbg !52 + %789 = add i32 %777, %788, !dbg !55 + %790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 2, i32 31), !dbg !52 + %791 = add i32 %778, %790, !dbg !55 + %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %779, i32 2, i32 31), !dbg !52 + %793 = add i32 %779, %792, !dbg !55 + %794 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %780, i32 2, i32 31), !dbg !52 + %795 = add i32 %780, %794, !dbg !55 + %796 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %781, i32 2, i32 31), !dbg !52 + %797 = add i32 %781, %796, !dbg !55 + %798 = mul nuw nsw i32 %742, %56, !dbg !57 + %799 = mul nuw nsw i32 %743, %56, !dbg !57 + %800 = mul nuw nsw i32 %744, %56, !dbg !57 + %801 = mul nuw nsw i32 %745, %56, !dbg !57 + %802 = mul nuw nsw i32 %746, %56, !dbg !57 + %803 = mul nuw nsw i32 %747, %56, !dbg !57 + %804 = mul nuw nsw i32 %748, %56, !dbg !57 + %805 = mul nuw nsw i32 %749, %56, !dbg !57 + %806 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %798, i32 2, i32 31), !dbg !52 + %807 = add i32 %806, %798, !dbg !55 + %808 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %799, i32 2, i32 31), !dbg !52 + %809 = add i32 %808, %799, !dbg !55 + %810 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %800, i32 2, i32 31), !dbg !52 + %811 = add i32 %810, %800, !dbg !55 + %812 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %801, i32 2, i32 31), !dbg !52 + %813 = add i32 %812, %801, !dbg !55 + %814 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %802, i32 2, i32 31), !dbg !52 + %815 = add i32 %814, %802, !dbg !55 + %816 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %803, i32 2, i32 31), !dbg !52 + %817 = add i32 %816, %803, !dbg !55 + %818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 2, i32 31), !dbg !52 + %819 = add i32 %818, %804, !dbg !55 + %820 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %805, i32 2, i32 31), !dbg !52 + %821 = add i32 %820, %805, !dbg !55 + %822 = mul nuw nsw i32 %742, %.lobit, !dbg !58 + %823 = mul nuw nsw i32 %743, %.lobit, !dbg !58 + %824 = mul nuw nsw i32 %744, %.lobit, !dbg !58 + %825 = mul nuw nsw i32 %745, %.lobit, !dbg !58 + %826 = mul nuw nsw i32 %746, %.lobit, !dbg !58 + %827 = mul nuw nsw i32 %747, %.lobit, !dbg !58 + %828 = mul nuw nsw i32 %748, %.lobit, !dbg !58 + %829 = mul nuw nsw i32 %749, %.lobit, !dbg !58 + %830 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 2, i32 31), !dbg !52 + %831 = add i32 %830, %822, !dbg !55 + %832 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 2, i32 31), !dbg !52 + %833 = add i32 %832, %823, !dbg !55 + %834 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %824, i32 2, i32 31), !dbg !52 + %835 = add i32 %834, %824, !dbg !55 + %836 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %825, i32 2, i32 31), !dbg !52 + %837 = add i32 %836, %825, !dbg !55 + %838 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %826, i32 2, i32 31), !dbg !52 + %839 = add i32 %838, %826, !dbg !55 + %840 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %827, i32 2, i32 31), !dbg !52 + %841 = add i32 %840, %827, !dbg !55 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %828, i32 2, i32 31), !dbg !52 + %843 = add i32 %842, %828, !dbg !55 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %829, i32 2, i32 31), !dbg !52 + %845 = add i32 %844, %829, !dbg !55 + %846 = icmp slt i32 %759, %783, !dbg !44 + %847 = icmp slt i32 %761, %785, !dbg !44 + %848 = icmp slt i32 %763, %787, !dbg !44 + %849 = icmp slt i32 %765, %789, !dbg !44 + %850 = icmp slt i32 %767, %791, !dbg !44 + %851 = icmp slt i32 %769, %793, !dbg !44 + %852 = icmp slt i32 %771, %795, !dbg !44 + %853 = icmp slt i32 %773, %797, !dbg !44 + %854 = icmp eq i32 %759, %783, !dbg !60 + %855 = icmp eq i32 %761, %785, !dbg !60 + %856 = icmp eq i32 %763, %787, !dbg !60 + %857 = icmp eq i32 %765, %789, !dbg !60 + %858 = icmp eq i32 %767, %791, !dbg !60 + %859 = icmp eq i32 %769, %793, !dbg !60 + %860 = icmp eq i32 %771, %795, !dbg !60 + %861 = icmp eq i32 %773, %797, !dbg !60 + %862 = icmp sgt i32 %807, %831, !dbg !61 + %863 = icmp sgt i32 %809, %833, !dbg !61 + %864 = icmp sgt i32 %811, %835, !dbg !61 + %865 = icmp sgt i32 %813, %837, !dbg !61 + %866 = icmp sgt i32 %815, %839, !dbg !61 + %867 = icmp sgt i32 %817, %841, !dbg !61 + %868 = icmp sgt i32 %819, %843, !dbg !61 + %869 = icmp sgt i32 %821, %845, !dbg !61 + %870 = and i1 %854, %862, !dbg !62 + %871 = and i1 %855, %863, !dbg !62 + %872 = and i1 %856, %864, !dbg !62 + %873 = and i1 %857, %865, !dbg !62 + %874 = and i1 %858, %866, !dbg !62 + %875 = and i1 %859, %867, !dbg !62 + %876 = and i1 %860, %868, !dbg !62 + %877 = and i1 %861, %869, !dbg !62 + %878 = or i1 %846, %870, !dbg !63 + %879 = or i1 %847, %871, !dbg !63 + %880 = or i1 %848, %872, !dbg !63 + %881 = or i1 %849, %873, !dbg !63 + %882 = or i1 %850, %874, !dbg !63 + %883 = or i1 %851, %875, !dbg !63 + %884 = or i1 %852, %876, !dbg !63 + %885 = or i1 %853, %877, !dbg !63 + %886 = xor i32 %759, %783, !dbg !46 + %887 = xor i32 %761, %785, !dbg !46 + %888 = xor i32 %763, %787, !dbg !46 + %889 = xor i32 %765, %789, !dbg !46 + %890 = xor i32 %767, %791, !dbg !46 + %891 = xor i32 %769, %793, !dbg !46 + %892 = xor i32 %771, %795, !dbg !46 + %893 = xor i32 %773, %797, !dbg !46 + %894 = select i1 %878, i32 %886, i32 0, !dbg !47 + %895 = select i1 %879, i32 %887, i32 0, !dbg !47 + %896 = select i1 %880, i32 %888, i32 0, !dbg !47 + %897 = select i1 %881, i32 %889, i32 0, !dbg !47 + %898 = select i1 %882, i32 %890, i32 0, !dbg !47 + %899 = select i1 %883, i32 %891, i32 0, !dbg !47 + %900 = select i1 %884, i32 %892, i32 0, !dbg !47 + %901 = select i1 %885, i32 %893, i32 0, !dbg !47 + %902 = xor i32 %894, %726, !dbg !48 + %903 = xor i32 %895, %727, !dbg !48 + %904 = xor i32 %896, %728, !dbg !48 + %905 = xor i32 %897, %729, !dbg !48 + %906 = xor i32 %898, %730, !dbg !48 + %907 = xor i32 %899, %731, !dbg !48 + %908 = xor i32 %900, %732, !dbg !48 + %909 = xor i32 %901, %733, !dbg !48 + %910 = xor i32 %831, %807, !dbg !25 + %911 = xor i32 %833, %809, !dbg !25 + %912 = xor i32 %835, %811, !dbg !25 + %913 = xor i32 %837, %813, !dbg !25 + %914 = xor i32 %839, %815, !dbg !25 + %915 = xor i32 %841, %817, !dbg !25 + %916 = xor i32 %843, %819, !dbg !25 + %917 = xor i32 %845, %821, !dbg !25 + %918 = select i1 %878, i32 %910, i32 0, !dbg !49 + %919 = select i1 %879, i32 %911, i32 0, !dbg !49 + %920 = select i1 %880, i32 %912, i32 0, !dbg !49 + %921 = select i1 %881, i32 %913, i32 0, !dbg !49 + %922 = select i1 %882, i32 %914, i32 0, !dbg !49 + %923 = select i1 %883, i32 %915, i32 0, !dbg !49 + %924 = select i1 %884, i32 %916, i32 0, !dbg !49 + %925 = select i1 %885, i32 %917, i32 0, !dbg !49 + %926 = xor i32 %918, %742, !dbg !50 + %927 = xor i32 %919, %743, !dbg !50 + %928 = xor i32 %920, %744, !dbg !50 + %929 = xor i32 %921, %745, !dbg !50 + %930 = xor i32 %922, %746, !dbg !50 + %931 = xor i32 %923, %747, !dbg !50 + %932 = xor i32 %924, %748, !dbg !50 + %933 = xor i32 %925, %749, !dbg !50 + %934 = mul nuw nsw i32 %902, %55, !dbg !51 + %935 = mul nuw nsw i32 %903, %55, !dbg !51 + %936 = mul nuw nsw i32 %904, %55, !dbg !51 + %937 = mul nuw nsw i32 %905, %55, !dbg !51 + %938 = mul nuw nsw i32 %906, %55, !dbg !51 + %939 = mul nuw nsw i32 %907, %55, !dbg !51 + %940 = mul nuw nsw i32 %908, %55, !dbg !51 + %941 = mul nuw nsw i32 %909, %55, !dbg !51 + %942 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %934, i32 1, i32 31), !dbg !52 + %943 = add i32 %942, %934, !dbg !55 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !52 + %945 = add i32 %944, %935, !dbg !55 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %936, i32 1, i32 31), !dbg !52 + %947 = add i32 %946, %936, !dbg !55 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %937, i32 1, i32 31), !dbg !52 + %949 = add i32 %948, %937, !dbg !55 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %938, i32 1, i32 31), !dbg !52 + %951 = add i32 %950, %938, !dbg !55 + %952 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %939, i32 1, i32 31), !dbg !52 + %953 = add i32 %952, %939, !dbg !55 + %954 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %940, i32 1, i32 31), !dbg !52 + %955 = add i32 %954, %940, !dbg !55 + %956 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %941, i32 1, i32 31), !dbg !52 + %957 = add i32 %956, %941, !dbg !55 + %958 = mul nuw nsw i32 %902, %53, !dbg !56 + %959 = mul nuw nsw i32 %903, %53, !dbg !56 + %960 = mul nuw nsw i32 %904, %53, !dbg !56 + %961 = mul nuw nsw i32 %905, %53, !dbg !56 + %962 = mul nuw nsw i32 %906, %53, !dbg !56 + %963 = mul nuw nsw i32 %907, %53, !dbg !56 + %964 = mul nuw nsw i32 %908, %53, !dbg !56 + %965 = mul nuw nsw i32 %909, %53, !dbg !56 + %966 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %958, i32 1, i32 31), !dbg !52 + %967 = add i32 %966, %958, !dbg !55 + %968 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !52 + %969 = add i32 %968, %959, !dbg !55 + %970 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %960, i32 1, i32 31), !dbg !52 + %971 = add i32 %970, %960, !dbg !55 + %972 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !52 + %973 = add i32 %972, %961, !dbg !55 + %974 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %962, i32 1, i32 31), !dbg !52 + %975 = add i32 %974, %962, !dbg !55 + %976 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !52 + %977 = add i32 %976, %963, !dbg !55 + %978 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %964, i32 1, i32 31), !dbg !52 + %979 = add i32 %978, %964, !dbg !55 + %980 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %965, i32 1, i32 31), !dbg !52 + %981 = add i32 %980, %965, !dbg !55 + %982 = mul nuw nsw i32 %926, %55, !dbg !57 + %983 = mul nuw nsw i32 %927, %55, !dbg !57 + %984 = mul nuw nsw i32 %928, %55, !dbg !57 + %985 = mul nuw nsw i32 %929, %55, !dbg !57 + %986 = mul nuw nsw i32 %930, %55, !dbg !57 + %987 = mul nuw nsw i32 %931, %55, !dbg !57 + %988 = mul nuw nsw i32 %932, %55, !dbg !57 + %989 = mul nuw nsw i32 %933, %55, !dbg !57 + %990 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %982, i32 1, i32 31), !dbg !52 + %991 = add i32 %990, %982, !dbg !55 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %983, i32 1, i32 31), !dbg !52 + %993 = add i32 %992, %983, !dbg !55 + %994 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %984, i32 1, i32 31), !dbg !52 + %995 = add i32 %994, %984, !dbg !55 + %996 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %985, i32 1, i32 31), !dbg !52 + %997 = add i32 %996, %985, !dbg !55 + %998 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %986, i32 1, i32 31), !dbg !52 + %999 = add i32 %998, %986, !dbg !55 + %1000 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !52 + %1001 = add i32 %1000, %987, !dbg !55 + %1002 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %988, i32 1, i32 31), !dbg !52 + %1003 = add i32 %1002, %988, !dbg !55 + %1004 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %989, i32 1, i32 31), !dbg !52 + %1005 = add i32 %1004, %989, !dbg !55 + %1006 = mul nuw nsw i32 %926, %53, !dbg !58 + %1007 = mul nuw nsw i32 %927, %53, !dbg !58 + %1008 = mul nuw nsw i32 %928, %53, !dbg !58 + %1009 = mul nuw nsw i32 %929, %53, !dbg !58 + %1010 = mul nuw nsw i32 %930, %53, !dbg !58 + %1011 = mul nuw nsw i32 %931, %53, !dbg !58 + %1012 = mul nuw nsw i32 %932, %53, !dbg !58 + %1013 = mul nuw nsw i32 %933, %53, !dbg !58 + %1014 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1006, i32 1, i32 31), !dbg !52 + %1015 = add i32 %1014, %1006, !dbg !55 + %1016 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1007, i32 1, i32 31), !dbg !52 + %1017 = add i32 %1016, %1007, !dbg !55 + %1018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1008, i32 1, i32 31), !dbg !52 + %1019 = add i32 %1018, %1008, !dbg !55 + %1020 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1009, i32 1, i32 31), !dbg !52 + %1021 = add i32 %1020, %1009, !dbg !55 + %1022 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1010, i32 1, i32 31), !dbg !52 + %1023 = add i32 %1022, %1010, !dbg !55 + %1024 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1011, i32 1, i32 31), !dbg !52 + %1025 = add i32 %1024, %1011, !dbg !55 + %1026 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1012, i32 1, i32 31), !dbg !52 + %1027 = add i32 %1026, %1012, !dbg !55 + %1028 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1013, i32 1, i32 31), !dbg !52 + %1029 = add i32 %1028, %1013, !dbg !55 + %1030 = icmp slt i32 %943, %967, !dbg !44 + %1031 = icmp slt i32 %945, %969, !dbg !44 + %1032 = icmp slt i32 %947, %971, !dbg !44 + %1033 = icmp slt i32 %949, %973, !dbg !44 + %1034 = icmp slt i32 %951, %975, !dbg !44 + %1035 = icmp slt i32 %953, %977, !dbg !44 + %1036 = icmp slt i32 %955, %979, !dbg !44 + %1037 = icmp slt i32 %957, %981, !dbg !44 + %1038 = icmp eq i32 %943, %967, !dbg !60 + %1039 = icmp eq i32 %945, %969, !dbg !60 + %1040 = icmp eq i32 %947, %971, !dbg !60 + %1041 = icmp eq i32 %949, %973, !dbg !60 + %1042 = icmp eq i32 %951, %975, !dbg !60 + %1043 = icmp eq i32 %953, %977, !dbg !60 + %1044 = icmp eq i32 %955, %979, !dbg !60 + %1045 = icmp eq i32 %957, %981, !dbg !60 + %1046 = icmp sgt i32 %991, %1015, !dbg !61 + %1047 = icmp sgt i32 %993, %1017, !dbg !61 + %1048 = icmp sgt i32 %995, %1019, !dbg !61 + %1049 = icmp sgt i32 %997, %1021, !dbg !61 + %1050 = icmp sgt i32 %999, %1023, !dbg !61 + %1051 = icmp sgt i32 %1001, %1025, !dbg !61 + %1052 = icmp sgt i32 %1003, %1027, !dbg !61 + %1053 = icmp sgt i32 %1005, %1029, !dbg !61 + %1054 = and i1 %1038, %1046, !dbg !62 + %1055 = and i1 %1039, %1047, !dbg !62 + %1056 = and i1 %1040, %1048, !dbg !62 + %1057 = and i1 %1041, %1049, !dbg !62 + %1058 = and i1 %1042, %1050, !dbg !62 + %1059 = and i1 %1043, %1051, !dbg !62 + %1060 = and i1 %1044, %1052, !dbg !62 + %1061 = and i1 %1045, %1053, !dbg !62 + %1062 = or i1 %1030, %1054, !dbg !63 + %1063 = or i1 %1031, %1055, !dbg !63 + %1064 = or i1 %1032, %1056, !dbg !63 + %1065 = or i1 %1033, %1057, !dbg !63 + %1066 = or i1 %1034, %1058, !dbg !63 + %1067 = or i1 %1035, %1059, !dbg !63 + %1068 = or i1 %1036, %1060, !dbg !63 + %1069 = or i1 %1037, %1061, !dbg !63 + %1070 = xor i32 %967, %943, !dbg !46 + %1071 = xor i32 %969, %945, !dbg !46 + %1072 = xor i32 %971, %947, !dbg !46 + %1073 = xor i32 %973, %949, !dbg !46 + %1074 = xor i32 %975, %951, !dbg !46 + %1075 = xor i32 %977, %953, !dbg !46 + %1076 = xor i32 %979, %955, !dbg !46 + %1077 = xor i32 %981, %957, !dbg !46 + %1078 = select i1 %1062, i32 %1070, i32 0, !dbg !47 + %1079 = select i1 %1063, i32 %1071, i32 0, !dbg !47 + %1080 = select i1 %1064, i32 %1072, i32 0, !dbg !47 + %1081 = select i1 %1065, i32 %1073, i32 0, !dbg !47 + %1082 = select i1 %1066, i32 %1074, i32 0, !dbg !47 + %1083 = select i1 %1067, i32 %1075, i32 0, !dbg !47 + %1084 = select i1 %1068, i32 %1076, i32 0, !dbg !47 + %1085 = select i1 %1069, i32 %1077, i32 0, !dbg !47 + %1086 = xor i32 %1078, %902, !dbg !48 + %1087 = xor i32 %1079, %903, !dbg !48 + %1088 = xor i32 %1080, %904, !dbg !48 + %1089 = xor i32 %1081, %905, !dbg !48 + %1090 = xor i32 %1082, %906, !dbg !48 + %1091 = xor i32 %1083, %907, !dbg !48 + %1092 = xor i32 %1084, %908, !dbg !48 + %1093 = xor i32 %1085, %909, !dbg !48 + %1094 = xor i32 %1015, %991, !dbg !25 + %1095 = xor i32 %1017, %993, !dbg !25 + %1096 = xor i32 %1019, %995, !dbg !25 + %1097 = xor i32 %1021, %997, !dbg !25 + %1098 = xor i32 %1023, %999, !dbg !25 + %1099 = xor i32 %1025, %1001, !dbg !25 + %1100 = xor i32 %1027, %1003, !dbg !25 + %1101 = xor i32 %1029, %1005, !dbg !25 + %1102 = select i1 %1062, i32 %1094, i32 0, !dbg !49 + %1103 = select i1 %1063, i32 %1095, i32 0, !dbg !49 + %1104 = select i1 %1064, i32 %1096, i32 0, !dbg !49 + %1105 = select i1 %1065, i32 %1097, i32 0, !dbg !49 + %1106 = select i1 %1066, i32 %1098, i32 0, !dbg !49 + %1107 = select i1 %1067, i32 %1099, i32 0, !dbg !49 + %1108 = select i1 %1068, i32 %1100, i32 0, !dbg !49 + %1109 = select i1 %1069, i32 %1101, i32 0, !dbg !49 + %1110 = xor i32 %1102, %926, !dbg !50 + %1111 = xor i32 %1103, %927, !dbg !50 + %1112 = xor i32 %1104, %928, !dbg !50 + %1113 = xor i32 %1105, %929, !dbg !50 + %1114 = xor i32 %1106, %930, !dbg !50 + %1115 = xor i32 %1107, %931, !dbg !50 + %1116 = xor i32 %1108, %932, !dbg !50 + %1117 = xor i32 %1109, %933, !dbg !50 + %1118 = icmp slt i32 %1086, %1088, !dbg !44 + %1119 = icmp slt i32 %1087, %1089, !dbg !44 + %1120 = icmp slt i32 %1090, %1092, !dbg !44 + %1121 = icmp slt i32 %1091, %1093, !dbg !44 + %1122 = icmp eq i32 %1086, %1088, !dbg !60 + %1123 = icmp eq i32 %1087, %1089, !dbg !60 + %1124 = icmp eq i32 %1090, %1092, !dbg !60 + %1125 = icmp eq i32 %1091, %1093, !dbg !60 + %1126 = icmp sgt i32 %1110, %1112, !dbg !61 + %1127 = icmp sgt i32 %1111, %1113, !dbg !61 + %1128 = icmp sgt i32 %1114, %1116, !dbg !61 + %1129 = icmp sgt i32 %1115, %1117, !dbg !61 + %1130 = and i1 %1122, %1126, !dbg !62 + %1131 = and i1 %1123, %1127, !dbg !62 + %1132 = and i1 %1124, %1128, !dbg !62 + %1133 = and i1 %1125, %1129, !dbg !62 + %1134 = or i1 %1118, %1130, !dbg !63 + %1135 = or i1 %1119, %1131, !dbg !63 + %1136 = or i1 %1120, %1132, !dbg !63 + %1137 = or i1 %1121, %1133, !dbg !63 + %1138 = xor i32 %1088, %1086, !dbg !46 + %1139 = xor i32 %1089, %1087, !dbg !46 + %1140 = xor i32 %1092, %1090, !dbg !46 + %1141 = xor i32 %1093, %1091, !dbg !46 + %1142 = select i1 %1134, i32 %1138, i32 0, !dbg !47 + %1143 = select i1 %1135, i32 %1139, i32 0, !dbg !47 + %1144 = select i1 %1136, i32 %1140, i32 0, !dbg !47 + %1145 = select i1 %1137, i32 %1141, i32 0, !dbg !47 + %1146 = xor i32 %1142, %1086, !dbg !48 + %1147 = xor i32 %1143, %1087, !dbg !48 + %1148 = xor i32 %1142, %1088, !dbg !48 + %1149 = xor i32 %1143, %1089, !dbg !48 + %1150 = xor i32 %1144, %1090, !dbg !48 + %1151 = xor i32 %1145, %1091, !dbg !48 + %1152 = xor i32 %1144, %1092, !dbg !48 + %1153 = xor i32 %1145, %1093, !dbg !48 + %1154 = xor i32 %1112, %1110, !dbg !25 + %1155 = xor i32 %1113, %1111, !dbg !25 + %1156 = xor i32 %1116, %1114, !dbg !25 + %1157 = xor i32 %1117, %1115, !dbg !25 + %1158 = select i1 %1134, i32 %1154, i32 0, !dbg !49 + %1159 = select i1 %1135, i32 %1155, i32 0, !dbg !49 + %1160 = select i1 %1136, i32 %1156, i32 0, !dbg !49 + %1161 = select i1 %1137, i32 %1157, i32 0, !dbg !49 + %1162 = xor i32 %1158, %1110, !dbg !50 + %1163 = xor i32 %1159, %1111, !dbg !50 + %1164 = xor i32 %1158, %1112, !dbg !50 + %1165 = xor i32 %1159, %1113, !dbg !50 + %1166 = xor i32 %1160, %1114, !dbg !50 + %1167 = xor i32 %1161, %1115, !dbg !50 + %1168 = xor i32 %1160, %1116, !dbg !50 + %1169 = xor i32 %1161, %1117, !dbg !50 + %1170 = insertelement <2 x i32> poison, i32 %1146, i64 0, !dbg !44 + %1171 = insertelement <2 x i32> %1170, i32 %1148, i64 1, !dbg !44 + %1172 = insertelement <2 x i32> poison, i32 %1147, i64 0, !dbg !44 + %1173 = insertelement <2 x i32> %1172, i32 %1149, i64 1, !dbg !44 + %1174 = icmp slt <2 x i32> %1171, %1173, !dbg !44 + %1175 = insertelement <2 x i32> poison, i32 %1150, i64 0, !dbg !44 + %1176 = insertelement <2 x i32> %1175, i32 %1152, i64 1, !dbg !44 + %1177 = insertelement <2 x i32> poison, i32 %1151, i64 0, !dbg !44 + %1178 = insertelement <2 x i32> %1177, i32 %1153, i64 1, !dbg !44 + %1179 = icmp slt <2 x i32> %1176, %1178, !dbg !44 + %1180 = insertelement <2 x i32> %1172, i32 %1148, i64 1, !dbg !60 + %1181 = insertelement <2 x i32> %1170, i32 %1149, i64 1, !dbg !60 + %1182 = icmp eq <2 x i32> %1180, %1181, !dbg !60 + %1183 = insertelement <2 x i32> poison, i32 %1162, i64 0, !dbg !61 + %1184 = insertelement <2 x i32> %1183, i32 %1164, i64 1, !dbg !61 + %1185 = insertelement <2 x i32> poison, i32 %1163, i64 0, !dbg !61 + %1186 = insertelement <2 x i32> %1185, i32 %1165, i64 1, !dbg !61 + %1187 = icmp sgt <2 x i32> %1184, %1186, !dbg !61 + %1188 = and <2 x i1> %1182, %1187, !dbg !62 + %1189 = insertelement <2 x i32> %1177, i32 %1152, i64 1, !dbg !60 + %1190 = insertelement <2 x i32> %1175, i32 %1153, i64 1, !dbg !60 + %1191 = icmp eq <2 x i32> %1189, %1190, !dbg !60 + %1192 = insertelement <2 x i32> poison, i32 %1166, i64 0, !dbg !61 + %1193 = insertelement <2 x i32> %1192, i32 %1168, i64 1, !dbg !61 + %1194 = insertelement <2 x i32> poison, i32 %1167, i64 0, !dbg !61 + %1195 = insertelement <2 x i32> %1194, i32 %1169, i64 1, !dbg !61 + %1196 = icmp sgt <2 x i32> %1193, %1195, !dbg !61 + %1197 = and <2 x i1> %1191, %1196, !dbg !62 + %1198 = or <2 x i1> %1174, %1188, !dbg !63 + %1199 = insertelement <2 x i32> %1183, i32 %1165, i64 1, !dbg !25 + %1200 = insertelement <2 x i32> %1185, i32 %1164, i64 1, !dbg !25 + %1201 = xor <2 x i32> %1199, %1200, !dbg !25 + %1202 = select <2 x i1> %1198, <2 x i32> %1201, <2 x i32> zeroinitializer, !dbg !49 + %1203 = shufflevector <2 x i32> %1202, <2 x i32> poison, <4 x i32> , !dbg !49 + %1204 = insertelement <4 x i32> poison, i32 %1162, i64 0, !dbg !50 + %1205 = insertelement <4 x i32> %1204, i32 %1163, i64 1, !dbg !50 + %1206 = insertelement <4 x i32> %1205, i32 %1164, i64 2, !dbg !50 + %1207 = insertelement <4 x i32> %1206, i32 %1165, i64 3, !dbg !50 + %1208 = xor <4 x i32> %1203, %1207, !dbg !50 + %1209 = or <2 x i1> %1179, %1197, !dbg !63 + %1210 = insertelement <2 x i32> %1192, i32 %1169, i64 1, !dbg !25 + %1211 = insertelement <2 x i32> %1194, i32 %1168, i64 1, !dbg !25 + %1212 = xor <2 x i32> %1210, %1211, !dbg !25 + %1213 = select <2 x i1> %1209, <2 x i32> %1212, <2 x i32> zeroinitializer, !dbg !49 + %1214 = shufflevector <2 x i32> %1213, <2 x i32> poison, <4 x i32> , !dbg !49 + %1215 = insertelement <4 x i32> poison, i32 %1166, i64 0, !dbg !50 + %1216 = insertelement <4 x i32> %1215, i32 %1167, i64 1, !dbg !50 + %1217 = insertelement <4 x i32> %1216, i32 %1168, i64 2, !dbg !50 + %1218 = insertelement <4 x i32> %1217, i32 %1169, i64 3, !dbg !50 + %1219 = xor <4 x i32> %1214, %1218, !dbg !50 + %1220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %207, i32 1, i32 31), !dbg !64 + %1221 = add i32 %1220, %207, !dbg !65 + %1222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 1, i32 31), !dbg !64 + %1223 = add i32 %1222, %208, !dbg !65 + %1224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !64 + %1225 = add i32 %1224, %209, !dbg !65 + %1226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !64 + %1227 = add i32 %1226, %210, !dbg !65 + %1228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 1, i32 31), !dbg !64 + %1229 = add i32 %1228, %211, !dbg !65 + %1230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !64 + %1231 = add i32 %1230, %212, !dbg !65 + %1232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !64 + %1233 = add i32 %1232, %213, !dbg !65 + %1234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !64 + %1235 = add i32 %1234, %214, !dbg !65 + %1236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 1, i32 31), !dbg !64 + %1237 = add i32 %1236, %215, !dbg !65 + %1238 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !64 + %1239 = add i32 %1238, %216, !dbg !65 + %1240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !64 + %1241 = add i32 %1240, %217, !dbg !65 + %1242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 1, i32 31), !dbg !64 + %1243 = add i32 %1242, %218, !dbg !65 + %1244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !64 + %1245 = add i32 %1244, %219, !dbg !65 + %1246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 1, i32 31), !dbg !64 + %1247 = add i32 %1246, %220, !dbg !65 + %1248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !64 + %1249 = add i32 %1248, %221, !dbg !65 + %1250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !64 + %1251 = add i32 %1250, %222, !dbg !65 + %1252 = icmp sge i32 %1221, %1237, !dbg !37 + %1253 = icmp ne i32 %1221, %1237, !dbg !37 + %1254 = icmp sge i32 %1223, %1239, !dbg !37 + %1255 = icmp ne i32 %1223, %1239, !dbg !37 + %1256 = icmp sge i32 %1225, %1241, !dbg !37 + %1257 = icmp ne i32 %1225, %1241, !dbg !37 + %1258 = icmp sge i32 %1227, %1243, !dbg !37 + %1259 = icmp ne i32 %1227, %1243, !dbg !37 + %1260 = icmp sge i32 %1229, %1245, !dbg !37 + %1261 = icmp ne i32 %1229, %1245, !dbg !37 + %1262 = icmp sge i32 %1231, %1247, !dbg !37 + %1263 = icmp ne i32 %1231, %1247, !dbg !37 + %1264 = icmp sge i32 %1233, %1249, !dbg !37 + %1265 = icmp ne i32 %1233, %1249, !dbg !37 + %1266 = icmp sge i32 %1235, %1251, !dbg !37 + %1267 = icmp ne i32 %1235, %1251, !dbg !37 + %1268 = xor i32 %1237, %1221, !dbg !32 + %1269 = xor i32 %1239, %1223, !dbg !32 + %1270 = xor i32 %1241, %1225, !dbg !32 + %1271 = xor i32 %1243, %1227, !dbg !32 + %1272 = xor i32 %1245, %1229, !dbg !32 + %1273 = xor i32 %1247, %1231, !dbg !32 + %1274 = xor i32 %1249, %1233, !dbg !32 + %1275 = xor i32 %1251, %1235, !dbg !32 + %1276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !64 + %1277 = add i32 %1276, %232, !dbg !65 + %1278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 1, i32 31), !dbg !64 + %1279 = add i32 %1278, %234, !dbg !65 + %1280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !64 + %1281 = add i32 %1280, %236, !dbg !65 + %1282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !64 + %1283 = add i32 %1282, %238, !dbg !65 + %1284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %262, i32 1, i32 31), !dbg !64 + %1285 = add i32 %1284, %262, !dbg !65 + %1286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 1, i32 31), !dbg !64 + %1287 = add i32 %1286, %264, !dbg !65 + %1288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 1, i32 31), !dbg !64 + %1289 = add i32 %1288, %266, !dbg !65 + %1290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 1, i32 31), !dbg !64 + %1291 = add i32 %1290, %268, !dbg !65 + %1292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !64 + %1293 = add i32 %1292, %239, !dbg !65 + %1294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !64 + %1295 = add i32 %1294, %240, !dbg !65 + %1296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %241, i32 1, i32 31), !dbg !64 + %1297 = add i32 %1296, %241, !dbg !65 + %1298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 1, i32 31), !dbg !64 + %1299 = add i32 %1298, %242, !dbg !65 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !64 + %1301 = add i32 %1300, %269, !dbg !65 + %1302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %270, i32 1, i32 31), !dbg !64 + %1303 = add i32 %1302, %270, !dbg !65 + %1304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %271, i32 1, i32 31), !dbg !64 + %1305 = add i32 %1304, %271, !dbg !65 + %1306 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !64 + %1307 = add i32 %1306, %272, !dbg !65 + %1308 = icmp sle i32 %1277, %1293, !dbg !37 + %1309 = or i1 %1253, %1308, !dbg !37 + %1310 = and i1 %1252, %1309, !dbg !37 + %.not36 = xor i1 %1310, %60, !dbg !37 + %1311 = icmp sle i32 %1279, %1295, !dbg !37 + %1312 = or i1 %1255, %1311, !dbg !37 + %1313 = and i1 %1254, %1312, !dbg !37 + %.not37 = xor i1 %1313, %60, !dbg !37 + %1314 = icmp sle i32 %1281, %1297, !dbg !37 + %1315 = or i1 %1257, %1314, !dbg !37 + %1316 = and i1 %1256, %1315, !dbg !37 + %.not38 = xor i1 %1316, %60, !dbg !37 + %1317 = icmp sle i32 %1283, %1299, !dbg !37 + %1318 = or i1 %1259, %1317, !dbg !37 + %1319 = and i1 %1258, %1318, !dbg !37 + %.not39 = xor i1 %1319, %60, !dbg !37 + %1320 = icmp sle i32 %1285, %1301, !dbg !37 + %1321 = or i1 %1261, %1320, !dbg !37 + %1322 = and i1 %1260, %1321, !dbg !37 + %.not40 = xor i1 %1322, %60, !dbg !37 + %1323 = icmp sle i32 %1287, %1303, !dbg !37 + %1324 = or i1 %1263, %1323, !dbg !37 + %1325 = and i1 %1262, %1324, !dbg !37 + %.not41 = xor i1 %1325, %60, !dbg !37 + %1326 = icmp sle i32 %1289, %1305, !dbg !37 + %1327 = or i1 %1265, %1326, !dbg !37 + %1328 = and i1 %1264, %1327, !dbg !37 + %.not42 = xor i1 %1328, %60, !dbg !37 + %1329 = icmp sle i32 %1291, %1307, !dbg !37 + %1330 = or i1 %1267, %1329, !dbg !37 + %1331 = and i1 %1266, %1330, !dbg !37 + %.not43 = xor i1 %1331, %60, !dbg !37 + %1332 = select i1 %.not36, i32 0, i32 %1268, !dbg !33 + %1333 = select i1 %.not37, i32 0, i32 %1269, !dbg !33 + %1334 = select i1 %.not38, i32 0, i32 %1270, !dbg !33 + %1335 = select i1 %.not39, i32 0, i32 %1271, !dbg !33 + %1336 = select i1 %.not40, i32 0, i32 %1272, !dbg !33 + %1337 = select i1 %.not41, i32 0, i32 %1273, !dbg !33 + %1338 = select i1 %.not42, i32 0, i32 %1274, !dbg !33 + %1339 = select i1 %.not43, i32 0, i32 %1275, !dbg !33 + %1340 = xor i32 %1332, %191, !dbg !34 + %1341 = xor i32 %1333, %192, !dbg !34 + %1342 = xor i32 %1334, %193, !dbg !34 + %1343 = xor i32 %1335, %194, !dbg !34 + %1344 = xor i32 %1336, %195, !dbg !34 + %1345 = xor i32 %1337, %196, !dbg !34 + %1346 = xor i32 %1338, %197, !dbg !34 + %1347 = xor i32 %1339, %198, !dbg !34 + %1348 = xor i32 %1293, %1277, !dbg !38 + %1349 = xor i32 %1295, %1279, !dbg !38 + %1350 = xor i32 %1297, %1281, !dbg !38 + %1351 = xor i32 %1299, %1283, !dbg !38 + %1352 = xor i32 %1301, %1285, !dbg !38 + %1353 = xor i32 %1303, %1287, !dbg !38 + %1354 = xor i32 %1305, %1289, !dbg !38 + %1355 = xor i32 %1307, %1291, !dbg !38 + %1356 = insertelement <4 x i1> poison, i1 %.not36, i64 0, !dbg !35 + %1357 = insertelement <4 x i1> %1356, i1 %.not37, i64 1, !dbg !35 + %1358 = insertelement <4 x i1> %1357, i1 %.not38, i64 2, !dbg !35 + %1359 = insertelement <4 x i1> %1358, i1 %.not39, i64 3, !dbg !35 + %1360 = insertelement <4 x i32> poison, i32 %1348, i64 0, !dbg !35 + %1361 = insertelement <4 x i32> %1360, i32 %1349, i64 1, !dbg !35 + %1362 = insertelement <4 x i32> %1361, i32 %1350, i64 2, !dbg !35 + %1363 = insertelement <4 x i32> %1362, i32 %1351, i64 3, !dbg !35 + %1364 = select <4 x i1> %1359, <4 x i32> zeroinitializer, <4 x i32> %1363, !dbg !35 + %1365 = insertelement <4 x i1> poison, i1 %.not40, i64 0, !dbg !35 + %1366 = insertelement <4 x i1> %1365, i1 %.not41, i64 1, !dbg !35 + %1367 = insertelement <4 x i1> %1366, i1 %.not42, i64 2, !dbg !35 + %1368 = insertelement <4 x i1> %1367, i1 %.not43, i64 3, !dbg !35 + %1369 = insertelement <4 x i32> poison, i32 %1352, i64 0, !dbg !35 + %1370 = insertelement <4 x i32> %1369, i32 %1353, i64 1, !dbg !35 + %1371 = insertelement <4 x i32> %1370, i32 %1354, i64 2, !dbg !35 + %1372 = insertelement <4 x i32> %1371, i32 %1355, i64 3, !dbg !35 + %1373 = select <4 x i1> %1368, <4 x i32> zeroinitializer, <4 x i32> %1372, !dbg !35 + %1374 = xor <4 x i32> %1364, %230, !dbg !36 + %1375 = xor <4 x i32> %1373, %260, !dbg !36 + %1376 = icmp sge i32 %1340, %1342, !dbg !37 + %1377 = icmp ne i32 %1340, %1342, !dbg !37 + %shift = shufflevector <4 x i32> %1374, <4 x i32> poison, <4 x i32> , !dbg !37 + %1378 = icmp sle <4 x i32> %1374, %shift, !dbg !37 + %1379 = extractelement <4 x i1> %1378, i64 0, !dbg !37 + %1380 = or i1 %1377, %1379, !dbg !37 + %1381 = and i1 %1376, %1380, !dbg !37 + %1382 = icmp sge i32 %1341, %1343, !dbg !37 + %1383 = icmp ne i32 %1341, %1343, !dbg !37 + %shift109 = shufflevector <4 x i32> %1374, <4 x i32> poison, <4 x i32> , !dbg !37 + %1384 = icmp sle <4 x i32> %1374, %shift109, !dbg !37 + %1385 = extractelement <4 x i1> %1384, i64 1, !dbg !37 + %1386 = or i1 %1383, %1385, !dbg !37 + %1387 = and i1 %1382, %1386, !dbg !37 + %1388 = insertelement <2 x i1> poison, i1 %1387, i64 0, !dbg !37 + %1389 = insertelement <2 x i1> %1388, i1 %1381, i64 1, !dbg !37 + %1390 = xor <2 x i1> %1389, %244, !dbg !37 + %1391 = icmp sge i32 %1344, %1346, !dbg !37 + %1392 = icmp ne i32 %1344, %1346, !dbg !37 + %shift110 = shufflevector <4 x i32> %1375, <4 x i32> poison, <4 x i32> , !dbg !37 + %1393 = icmp sle <4 x i32> %1375, %shift110, !dbg !37 + %1394 = extractelement <4 x i1> %1393, i64 0, !dbg !37 + %1395 = or i1 %1392, %1394, !dbg !37 + %1396 = and i1 %1391, %1395, !dbg !37 + %1397 = icmp sge i32 %1345, %1347, !dbg !37 + %1398 = icmp ne i32 %1345, %1347, !dbg !37 + %shift111 = shufflevector <4 x i32> %1375, <4 x i32> poison, <4 x i32> , !dbg !37 + %1399 = icmp sle <4 x i32> %1375, %shift111, !dbg !37 + %1400 = extractelement <4 x i1> %1399, i64 1, !dbg !37 + %1401 = or i1 %1398, %1400, !dbg !37 + %1402 = and i1 %1397, %1401, !dbg !37 + %1403 = insertelement <2 x i1> poison, i1 %1402, i64 0, !dbg !37 + %1404 = insertelement <2 x i1> %1403, i1 %1396, i64 1, !dbg !37 + %1405 = xor <2 x i1> %1404, %244, !dbg !37 + %1406 = xor i32 %1342, %1340, !dbg !32 + %1407 = xor i32 %1343, %1341, !dbg !32 + %1408 = xor i32 %1346, %1344, !dbg !32 + %1409 = xor i32 %1347, %1345, !dbg !32 + %1410 = extractelement <2 x i1> %1390, i64 1, !dbg !33 + %1411 = select i1 %1410, i32 0, i32 %1406, !dbg !33 + %1412 = extractelement <2 x i1> %1390, i64 0, !dbg !33 + %1413 = select i1 %1412, i32 0, i32 %1407, !dbg !33 + %1414 = extractelement <2 x i1> %1405, i64 1, !dbg !33 + %1415 = select i1 %1414, i32 0, i32 %1408, !dbg !33 + %1416 = extractelement <2 x i1> %1405, i64 0, !dbg !33 + %1417 = select i1 %1416, i32 0, i32 %1409, !dbg !33 + %1418 = xor i32 %1411, %1340, !dbg !34 + %1419 = xor i32 %1413, %1341, !dbg !34 + %1420 = xor i32 %1411, %1342, !dbg !34 + %1421 = xor i32 %1413, %1343, !dbg !34 + %1422 = xor i32 %1415, %1344, !dbg !34 + %1423 = xor i32 %1417, %1345, !dbg !34 + %1424 = xor i32 %1415, %1346, !dbg !34 + %1425 = xor i32 %1417, %1347, !dbg !34 + %1426 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !38 + %1427 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !38 + %1428 = xor <2 x i32> %1426, %1427, !dbg !38 + %1429 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !38 + %1430 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !38 + %1431 = xor <2 x i32> %1429, %1430, !dbg !38 + %1432 = select <2 x i1> %1390, <2 x i32> zeroinitializer, <2 x i32> %1428, !dbg !35 + %1433 = shufflevector <2 x i32> %1432, <2 x i32> poison, <4 x i32> , !dbg !35 + %1434 = select <2 x i1> %1405, <2 x i32> zeroinitializer, <2 x i32> %1431, !dbg !35 + %1435 = shufflevector <2 x i32> %1434, <2 x i32> poison, <4 x i32> , !dbg !35 + %1436 = shufflevector <2 x i32> %1432, <2 x i32> poison, <2 x i32> , !dbg !36 + %1437 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !36 + %1438 = xor <2 x i32> %1436, %1437, !dbg !36 + %1439 = shufflevector <4 x i32> %1374, <4 x i32> poison, <2 x i32> , !dbg !36 + %1440 = xor <2 x i32> %1432, %1439, !dbg !36 + %1441 = shufflevector <2 x i32> %1434, <2 x i32> poison, <2 x i32> , !dbg !36 + %1442 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !36 + %1443 = xor <2 x i32> %1441, %1442, !dbg !36 + %1444 = shufflevector <4 x i32> %1375, <4 x i32> poison, <2 x i32> , !dbg !36 + %1445 = xor <2 x i32> %1434, %1444, !dbg !36 + %1446 = icmp sge i32 %1418, %1419, !dbg !37 + %1447 = icmp ne i32 %1418, %1419, !dbg !37 + %1448 = extractelement <2 x i32> %1440, i64 1, !dbg !37 + %1449 = extractelement <2 x i32> %1438, i64 1, !dbg !37 + %1450 = icmp sle i32 %1448, %1449, !dbg !37 + %1451 = or i1 %1447, %1450, !dbg !37 + %1452 = and i1 %1446, %1451, !dbg !37 + %.not48 = xor i1 %1452, %60, !dbg !37 + %1453 = icmp sge i32 %1420, %1421, !dbg !37 + %1454 = icmp ne i32 %1420, %1421, !dbg !37 + %1455 = extractelement <2 x i32> %1440, i64 0, !dbg !37 + %1456 = extractelement <2 x i32> %1438, i64 0, !dbg !37 + %1457 = icmp sle i32 %1456, %1455, !dbg !37 + %1458 = or i1 %1454, %1457, !dbg !37 + %1459 = and i1 %1453, %1458, !dbg !37 + %.not49 = xor i1 %1459, %60, !dbg !37 + %1460 = icmp sge i32 %1422, %1423, !dbg !37 + %1461 = icmp ne i32 %1422, %1423, !dbg !37 + %1462 = extractelement <2 x i32> %1445, i64 1, !dbg !37 + %1463 = extractelement <2 x i32> %1443, i64 1, !dbg !37 + %1464 = icmp sle i32 %1462, %1463, !dbg !37 + %1465 = or i1 %1461, %1464, !dbg !37 + %1466 = and i1 %1460, %1465, !dbg !37 + %.not50 = xor i1 %1466, %60, !dbg !37 + %1467 = icmp sge i32 %1424, %1425, !dbg !37 + %1468 = icmp ne i32 %1424, %1425, !dbg !37 + %1469 = extractelement <2 x i32> %1445, i64 0, !dbg !37 + %1470 = extractelement <2 x i32> %1443, i64 0, !dbg !37 + %1471 = icmp sle i32 %1470, %1469, !dbg !37 + %1472 = or i1 %1468, %1471, !dbg !37 + %1473 = and i1 %1467, %1472, !dbg !37 + %.not51 = xor i1 %1473, %60, !dbg !37 + %1474 = xor i32 %1419, %1418, !dbg !32 + %1475 = xor i32 %1421, %1420, !dbg !32 + %1476 = xor i32 %1423, %1422, !dbg !32 + %1477 = xor i32 %1425, %1424, !dbg !32 + %1478 = select i1 %.not48, i32 0, i32 %1474, !dbg !33 + %1479 = select i1 %.not49, i32 0, i32 %1475, !dbg !33 + %1480 = select i1 %.not50, i32 0, i32 %1476, !dbg !33 + %1481 = select i1 %.not51, i32 0, i32 %1477, !dbg !33 + %1482 = insertelement <2 x i32> poison, i32 %1479, i64 0, !dbg !34 + %1483 = insertelement <2 x i32> %1482, i32 %1478, i64 1, !dbg !34 + %1484 = insertelement <2 x i32> poison, i32 %1420, i64 0, !dbg !34 + %1485 = insertelement <2 x i32> %1484, i32 %1418, i64 1, !dbg !34 + %1486 = xor <2 x i32> %1483, %1485, !dbg !34 + %1487 = insertelement <2 x i32> poison, i32 %1421, i64 0, !dbg !34 + %1488 = insertelement <2 x i32> %1487, i32 %1419, i64 1, !dbg !34 + %1489 = xor <2 x i32> %1483, %1488, !dbg !34 + %1490 = insertelement <2 x i32> poison, i32 %1481, i64 0, !dbg !34 + %1491 = insertelement <2 x i32> %1490, i32 %1480, i64 1, !dbg !34 + %1492 = insertelement <2 x i32> poison, i32 %1424, i64 0, !dbg !34 + %1493 = insertelement <2 x i32> %1492, i32 %1422, i64 1, !dbg !34 + %1494 = xor <2 x i32> %1491, %1493, !dbg !34 + %1495 = insertelement <2 x i32> poison, i32 %1425, i64 0, !dbg !34 + %1496 = insertelement <2 x i32> %1495, i32 %1423, i64 1, !dbg !34 + %1497 = xor <2 x i32> %1491, %1496, !dbg !34 + %1498 = xor i32 %1449, %1448, !dbg !38 + %1499 = xor i32 %1455, %1456, !dbg !38 + %1500 = xor i32 %1463, %1462, !dbg !38 + %1501 = xor i32 %1469, %1470, !dbg !38 + %1502 = insertelement <2 x i1> poison, i1 %.not49, i64 0, !dbg !35 + %1503 = insertelement <2 x i1> %1502, i1 %.not48, i64 1, !dbg !35 + %1504 = insertelement <2 x i32> poison, i32 %1499, i64 0, !dbg !35 + %1505 = insertelement <2 x i32> %1504, i32 %1498, i64 1, !dbg !35 + %1506 = select <2 x i1> %1503, <2 x i32> zeroinitializer, <2 x i32> %1505, !dbg !35 + %1507 = insertelement <2 x i1> poison, i1 %.not51, i64 0, !dbg !35 + %1508 = insertelement <2 x i1> %1507, i1 %.not50, i64 1, !dbg !35 + %1509 = insertelement <2 x i32> poison, i32 %1501, i64 0, !dbg !35 + %1510 = insertelement <2 x i32> %1509, i32 %1500, i64 1, !dbg !35 + %1511 = select <2 x i1> %1508, <2 x i32> zeroinitializer, <2 x i32> %1510, !dbg !35 + %1512 = shufflevector <2 x i32> %1506, <2 x i32> poison, <4 x i32> , !dbg !36 + %1513 = xor <4 x i32> %1433, %1512, !dbg !36 + %1514 = xor <2 x i32> %1506, %1438, !dbg !36 + %1515 = xor <2 x i32> %1506, %1440, !dbg !36 + %1516 = shufflevector <2 x i32> %1511, <2 x i32> poison, <4 x i32> , !dbg !36 + %1517 = xor <4 x i32> %1435, %1516, !dbg !36 + %1518 = xor <2 x i32> %1511, %1443, !dbg !36 + %1519 = xor <2 x i32> %1511, %1445, !dbg !36 + %1520 = extractelement <2 x i32> %1486, i64 1, !dbg !40 + %1521 = mul nuw nsw i32 %1520, %56, !dbg !39 + %1522 = extractelement <2 x i32> %1489, i64 1, !dbg !40 + %1523 = mul nuw nsw i32 %1522, %56, !dbg !39 + %1524 = extractelement <2 x i32> %1486, i64 0, !dbg !40 + %1525 = mul nuw nsw i32 %1524, %56, !dbg !39 + %1526 = extractelement <2 x i32> %1489, i64 0, !dbg !40 + %1527 = mul nuw nsw i32 %1526, %56, !dbg !39 + %1528 = extractelement <2 x i32> %1494, i64 1, !dbg !40 + %1529 = mul nuw nsw i32 %1528, %56, !dbg !39 + %1530 = extractelement <2 x i32> %1497, i64 1, !dbg !40 + %1531 = mul nuw nsw i32 %1530, %56, !dbg !39 + %1532 = extractelement <2 x i32> %1494, i64 0, !dbg !40 + %1533 = mul nuw nsw i32 %1532, %56, !dbg !39 + %1534 = extractelement <2 x i32> %1497, i64 0, !dbg !40 + %1535 = mul nuw nsw i32 %1534, %56, !dbg !39 + %1536 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1521, i32 2, i32 31), !dbg !64 + %1537 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1523, i32 2, i32 31), !dbg !64 + %1538 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1525, i32 2, i32 31), !dbg !64 + %1539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1527, i32 2, i32 31), !dbg !64 + %1540 = insertelement <4 x i32> poison, i32 %1521, i64 0, !dbg !65 + %1541 = insertelement <4 x i32> %1540, i32 %1523, i64 1, !dbg !65 + %1542 = insertelement <4 x i32> %1541, i32 %1525, i64 2, !dbg !65 + %1543 = insertelement <4 x i32> %1542, i32 %1527, i64 3, !dbg !65 + %1544 = insertelement <4 x i32> poison, i32 %1536, i64 0, !dbg !65 + %1545 = insertelement <4 x i32> %1544, i32 %1537, i64 1, !dbg !65 + %1546 = insertelement <4 x i32> %1545, i32 %1538, i64 2, !dbg !65 + %1547 = insertelement <4 x i32> %1546, i32 %1539, i64 3, !dbg !65 + %1548 = add <4 x i32> %1543, %1547, !dbg !65 + %1549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1529, i32 2, i32 31), !dbg !64 + %1550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1531, i32 2, i32 31), !dbg !64 + %1551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1533, i32 2, i32 31), !dbg !64 + %1552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1535, i32 2, i32 31), !dbg !64 + %1553 = insertelement <4 x i32> poison, i32 %1529, i64 0, !dbg !65 + %1554 = insertelement <4 x i32> %1553, i32 %1531, i64 1, !dbg !65 + %1555 = insertelement <4 x i32> %1554, i32 %1533, i64 2, !dbg !65 + %1556 = insertelement <4 x i32> %1555, i32 %1535, i64 3, !dbg !65 + %1557 = insertelement <4 x i32> poison, i32 %1549, i64 0, !dbg !65 + %1558 = insertelement <4 x i32> %1557, i32 %1550, i64 1, !dbg !65 + %1559 = insertelement <4 x i32> %1558, i32 %1551, i64 2, !dbg !65 + %1560 = insertelement <4 x i32> %1559, i32 %1552, i64 3, !dbg !65 + %1561 = add <4 x i32> %1556, %1560, !dbg !65 + %1562 = mul nuw nsw i32 %1520, %.lobit, !dbg !40 + %1563 = mul nuw nsw i32 %1522, %.lobit, !dbg !40 + %1564 = mul nuw nsw i32 %1524, %.lobit, !dbg !40 + %1565 = mul nuw nsw i32 %1526, %.lobit, !dbg !40 + %1566 = mul nuw nsw i32 %1528, %.lobit, !dbg !40 + %1567 = mul nuw nsw i32 %1530, %.lobit, !dbg !40 + %1568 = mul nuw nsw i32 %1532, %.lobit, !dbg !40 + %1569 = mul nuw nsw i32 %1534, %.lobit, !dbg !40 + %1570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1562, i32 2, i32 31), !dbg !64 + %1571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1563, i32 2, i32 31), !dbg !64 + %1572 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1564, i32 2, i32 31), !dbg !64 + %1573 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1565, i32 2, i32 31), !dbg !64 + %1574 = insertelement <4 x i32> poison, i32 %1562, i64 0, !dbg !65 + %1575 = insertelement <4 x i32> %1574, i32 %1563, i64 1, !dbg !65 + %1576 = insertelement <4 x i32> %1575, i32 %1564, i64 2, !dbg !65 + %1577 = insertelement <4 x i32> %1576, i32 %1565, i64 3, !dbg !65 + %1578 = insertelement <4 x i32> poison, i32 %1570, i64 0, !dbg !65 + %1579 = insertelement <4 x i32> %1578, i32 %1571, i64 1, !dbg !65 + %1580 = insertelement <4 x i32> %1579, i32 %1572, i64 2, !dbg !65 + %1581 = insertelement <4 x i32> %1580, i32 %1573, i64 3, !dbg !65 + %1582 = add <4 x i32> %1577, %1581, !dbg !65 + %1583 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1566, i32 2, i32 31), !dbg !64 + %1584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1567, i32 2, i32 31), !dbg !64 + %1585 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1568, i32 2, i32 31), !dbg !64 + %1586 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1569, i32 2, i32 31), !dbg !64 + %1587 = insertelement <4 x i32> poison, i32 %1566, i64 0, !dbg !65 + %1588 = insertelement <4 x i32> %1587, i32 %1567, i64 1, !dbg !65 + %1589 = insertelement <4 x i32> %1588, i32 %1568, i64 2, !dbg !65 + %1590 = insertelement <4 x i32> %1589, i32 %1569, i64 3, !dbg !65 + %1591 = insertelement <4 x i32> poison, i32 %1583, i64 0, !dbg !65 + %1592 = insertelement <4 x i32> %1591, i32 %1584, i64 1, !dbg !65 + %1593 = insertelement <4 x i32> %1592, i32 %1585, i64 2, !dbg !65 + %1594 = insertelement <4 x i32> %1593, i32 %1586, i64 3, !dbg !65 + %1595 = add <4 x i32> %1590, %1594, !dbg !65 + %1596 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <4 x i32> , !dbg !42 + %1597 = mul nuw nsw <4 x i32> %1596, %246, !dbg !42 + %1598 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <4 x i32> , !dbg !42 + %1599 = mul nuw nsw <4 x i32> %1598, %246, !dbg !42 + %1600 = extractelement <4 x i32> %1597, i64 0, !dbg !64 + %1601 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1600, i32 2, i32 31), !dbg !64 + %1602 = extractelement <4 x i32> %1597, i64 1, !dbg !64 + %1603 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1602, i32 2, i32 31), !dbg !64 + %1604 = extractelement <4 x i32> %1597, i64 2, !dbg !64 + %1605 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1604, i32 2, i32 31), !dbg !64 + %1606 = extractelement <4 x i32> %1597, i64 3, !dbg !64 + %1607 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1606, i32 2, i32 31), !dbg !64 + %1608 = insertelement <4 x i32> poison, i32 %1601, i64 0, !dbg !65 + %1609 = insertelement <4 x i32> %1608, i32 %1603, i64 1, !dbg !65 + %1610 = insertelement <4 x i32> %1609, i32 %1605, i64 2, !dbg !65 + %1611 = insertelement <4 x i32> %1610, i32 %1607, i64 3, !dbg !65 + %1612 = add <4 x i32> %1611, %1597, !dbg !65 + %1613 = extractelement <4 x i32> %1599, i64 0, !dbg !64 + %1614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1613, i32 2, i32 31), !dbg !64 + %1615 = extractelement <4 x i32> %1599, i64 1, !dbg !64 + %1616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1615, i32 2, i32 31), !dbg !64 + %1617 = extractelement <4 x i32> %1599, i64 2, !dbg !64 + %1618 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1617, i32 2, i32 31), !dbg !64 + %1619 = extractelement <4 x i32> %1599, i64 3, !dbg !64 + %1620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1619, i32 2, i32 31), !dbg !64 + %1621 = insertelement <4 x i32> poison, i32 %1614, i64 0, !dbg !65 + %1622 = insertelement <4 x i32> %1621, i32 %1616, i64 1, !dbg !65 + %1623 = insertelement <4 x i32> %1622, i32 %1618, i64 2, !dbg !65 + %1624 = insertelement <4 x i32> %1623, i32 %1620, i64 3, !dbg !65 + %1625 = add <4 x i32> %1624, %1599, !dbg !65 + %1626 = mul nuw nsw <4 x i32> %1596, %248, !dbg !41 + %1627 = mul nuw nsw <4 x i32> %1598, %248, !dbg !41 + %1628 = extractelement <4 x i32> %1626, i64 0, !dbg !64 + %1629 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1628, i32 2, i32 31), !dbg !64 + %1630 = extractelement <4 x i32> %1626, i64 1, !dbg !64 + %1631 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1630, i32 2, i32 31), !dbg !64 + %1632 = extractelement <4 x i32> %1626, i64 2, !dbg !64 + %1633 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1632, i32 2, i32 31), !dbg !64 + %1634 = extractelement <4 x i32> %1626, i64 3, !dbg !64 + %1635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1634, i32 2, i32 31), !dbg !64 + %1636 = insertelement <4 x i32> poison, i32 %1629, i64 0, !dbg !65 + %1637 = insertelement <4 x i32> %1636, i32 %1631, i64 1, !dbg !65 + %1638 = insertelement <4 x i32> %1637, i32 %1633, i64 2, !dbg !65 + %1639 = insertelement <4 x i32> %1638, i32 %1635, i64 3, !dbg !65 + %1640 = add <4 x i32> %1639, %1626, !dbg !65 + %1641 = extractelement <4 x i32> %1627, i64 0, !dbg !64 + %1642 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1641, i32 2, i32 31), !dbg !64 + %1643 = extractelement <4 x i32> %1627, i64 1, !dbg !64 + %1644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1643, i32 2, i32 31), !dbg !64 + %1645 = extractelement <4 x i32> %1627, i64 2, !dbg !64 + %1646 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1645, i32 2, i32 31), !dbg !64 + %1647 = extractelement <4 x i32> %1627, i64 3, !dbg !64 + %1648 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1647, i32 2, i32 31), !dbg !64 + %1649 = insertelement <4 x i32> poison, i32 %1642, i64 0, !dbg !65 + %1650 = insertelement <4 x i32> %1649, i32 %1644, i64 1, !dbg !65 + %1651 = insertelement <4 x i32> %1650, i32 %1646, i64 2, !dbg !65 + %1652 = insertelement <4 x i32> %1651, i32 %1648, i64 3, !dbg !65 + %1653 = add <4 x i32> %1652, %1627, !dbg !65 + %1654 = icmp slt <4 x i32> %1548, %1582, !dbg !29 + %1655 = icmp slt <4 x i32> %1561, %1595, !dbg !29 + %1656 = icmp eq <4 x i32> %1548, %1582, !dbg !66 + %1657 = icmp eq <4 x i32> %1561, %1595, !dbg !66 + %1658 = icmp sgt <4 x i32> %1612, %1640, !dbg !67 + %1659 = icmp sgt <4 x i32> %1625, %1653, !dbg !67 + %1660 = and <4 x i1> %1656, %1658, !dbg !68 + %1661 = and <4 x i1> %1657, %1659, !dbg !68 + %1662 = or <4 x i1> %1654, %1660, !dbg !69 + %1663 = or <4 x i1> %1654, %1660, !dbg !69 + %1664 = shufflevector <4 x i1> %1663, <4 x i1> poison, <2 x i32> , !dbg !69 + %1665 = or <4 x i1> %1654, %1660, !dbg !69 + %1666 = shufflevector <4 x i1> %1665, <4 x i1> poison, <2 x i32> , !dbg !69 + %1667 = or <4 x i1> %1655, %1661, !dbg !69 + %1668 = or <4 x i1> %1655, %1661, !dbg !69 + %1669 = shufflevector <4 x i1> %1668, <4 x i1> poison, <2 x i32> , !dbg !69 + %1670 = or <4 x i1> %1655, %1661, !dbg !69 + %1671 = shufflevector <4 x i1> %1670, <4 x i1> poison, <2 x i32> , !dbg !69 + %foldExtExtBinop = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop113 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop115 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop117 = xor <4 x i32> %1548, %1582, !dbg !32 + %foldExtExtBinop119 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop121 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop123 = xor <4 x i32> %1561, %1595, !dbg !32 + %foldExtExtBinop125 = xor <4 x i32> %1561, %1595, !dbg !32 + %1672 = shufflevector <2 x i1> %1664, <2 x i1> %1666, <2 x i32> , !dbg !33 + %1673 = shufflevector <4 x i32> %foldExtExtBinop115, <4 x i32> %foldExtExtBinop, <2 x i32> , !dbg !33 + %1674 = select <2 x i1> %1672, <2 x i32> %1673, <2 x i32> zeroinitializer, !dbg !33 + %1675 = shufflevector <2 x i1> %1666, <2 x i1> %1664, <2 x i32> , !dbg !33 + %1676 = shufflevector <4 x i32> %foldExtExtBinop117, <4 x i32> %foldExtExtBinop113, <2 x i32> , !dbg !33 + %1677 = select <2 x i1> %1675, <2 x i32> %1676, <2 x i32> zeroinitializer, !dbg !33 + %1678 = shufflevector <2 x i1> %1669, <2 x i1> %1671, <2 x i32> , !dbg !33 + %1679 = shufflevector <4 x i32> %foldExtExtBinop123, <4 x i32> %foldExtExtBinop119, <2 x i32> , !dbg !33 + %1680 = select <2 x i1> %1678, <2 x i32> %1679, <2 x i32> zeroinitializer, !dbg !33 + %1681 = shufflevector <2 x i1> %1671, <2 x i1> %1669, <2 x i32> , !dbg !33 + %1682 = shufflevector <4 x i32> %foldExtExtBinop125, <4 x i32> %foldExtExtBinop121, <2 x i32> , !dbg !33 + %1683 = select <2 x i1> %1681, <2 x i32> %1682, <2 x i32> zeroinitializer, !dbg !33 + %1684 = shufflevector <2 x i32> %1674, <2 x i32> %1677, <2 x i32> , !dbg !34 + %1685 = shufflevector <2 x i32> %1486, <2 x i32> %1489, <2 x i32> , !dbg !34 + %1686 = xor <2 x i32> %1684, %1685, !dbg !34 + %1687 = xor <2 x i32> %1674, %1486, !dbg !34 + %1688 = xor <2 x i32> %1677, %1489, !dbg !34 + %1689 = shufflevector <2 x i32> %1677, <2 x i32> %1674, <2 x i32> , !dbg !34 + %1690 = shufflevector <2 x i32> %1489, <2 x i32> %1486, <2 x i32> , !dbg !34 + %1691 = xor <2 x i32> %1689, %1690, !dbg !34 + %1692 = shufflevector <2 x i32> %1680, <2 x i32> %1683, <2 x i32> , !dbg !34 + %1693 = shufflevector <2 x i32> %1494, <2 x i32> %1497, <2 x i32> , !dbg !34 + %1694 = xor <2 x i32> %1692, %1693, !dbg !34 + %1695 = xor <2 x i32> %1680, %1494, !dbg !34 + %1696 = xor <2 x i32> %1683, %1497, !dbg !34 + %1697 = shufflevector <2 x i32> %1683, <2 x i32> %1680, <2 x i32> , !dbg !34 + %1698 = shufflevector <2 x i32> %1497, <2 x i32> %1494, <2 x i32> , !dbg !34 + %1699 = xor <2 x i32> %1697, %1698, !dbg !34 + %1700 = xor <4 x i32> %1640, %1612, !dbg !38 + %1701 = xor <4 x i32> %1640, %1612, !dbg !38 + %1702 = shufflevector <4 x i32> %1701, <4 x i32> poison, <2 x i32> , !dbg !38 + %1703 = xor <4 x i32> %1640, %1612, !dbg !38 + %1704 = shufflevector <4 x i32> %1703, <4 x i32> poison, <2 x i32> , !dbg !38 + %1705 = xor <4 x i32> %1653, %1625, !dbg !38 + %1706 = xor <4 x i32> %1653, %1625, !dbg !38 + %1707 = shufflevector <4 x i32> %1706, <4 x i32> poison, <2 x i32> , !dbg !38 + %1708 = xor <4 x i32> %1653, %1625, !dbg !38 + %1709 = shufflevector <4 x i32> %1708, <4 x i32> poison, <2 x i32> , !dbg !38 + %1710 = select <4 x i1> %1662, <4 x i32> %1700, <4 x i32> zeroinitializer, !dbg !35 + %1711 = select <2 x i1> %1664, <2 x i32> %1702, <2 x i32> zeroinitializer, !dbg !35 + %1712 = select <2 x i1> %1666, <2 x i32> %1704, <2 x i32> zeroinitializer, !dbg !35 + %1713 = select <4 x i1> %1667, <4 x i32> %1705, <4 x i32> zeroinitializer, !dbg !35 + %1714 = select <2 x i1> %1669, <2 x i32> %1707, <2 x i32> zeroinitializer, !dbg !35 + %1715 = select <2 x i1> %1671, <2 x i32> %1709, <2 x i32> zeroinitializer, !dbg !35 + %1716 = xor <4 x i32> %1513, %1710, !dbg !36 + %1717 = shufflevector <2 x i32> %1712, <2 x i32> %1711, <2 x i32> , !dbg !36 + %1718 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <2 x i32> , !dbg !36 + %1719 = xor <2 x i32> %1717, %1718, !dbg !36 + %1720 = shufflevector <2 x i32> %1711, <2 x i32> %1712, <2 x i32> , !dbg !36 + %1721 = shufflevector <2 x i32> %1514, <2 x i32> %1515, <2 x i32> , !dbg !36 + %1722 = xor <2 x i32> %1720, %1721, !dbg !36 + %1723 = shufflevector <2 x i32> %1712, <2 x i32> %1711, <2 x i32> , !dbg !36 + %1724 = shufflevector <2 x i32> %1515, <2 x i32> %1514, <2 x i32> , !dbg !36 + %1725 = xor <2 x i32> %1723, %1724, !dbg !36 + %1726 = shufflevector <2 x i32> %1711, <2 x i32> %1712, <2 x i32> , !dbg !36 + %1727 = shufflevector <2 x i32> %1514, <2 x i32> %1515, <2 x i32> , !dbg !36 + %1728 = xor <2 x i32> %1726, %1727, !dbg !36 + %1729 = xor <4 x i32> %1517, %1713, !dbg !36 + %1730 = shufflevector <2 x i32> %1715, <2 x i32> %1714, <2 x i32> , !dbg !36 + %1731 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <2 x i32> , !dbg !36 + %1732 = xor <2 x i32> %1730, %1731, !dbg !36 + %1733 = shufflevector <2 x i32> %1714, <2 x i32> %1715, <2 x i32> , !dbg !36 + %1734 = shufflevector <2 x i32> %1518, <2 x i32> %1519, <2 x i32> , !dbg !36 + %1735 = xor <2 x i32> %1733, %1734, !dbg !36 + %1736 = shufflevector <2 x i32> %1715, <2 x i32> %1714, <2 x i32> , !dbg !36 + %1737 = shufflevector <2 x i32> %1519, <2 x i32> %1518, <2 x i32> , !dbg !36 + %1738 = xor <2 x i32> %1736, %1737, !dbg !36 + %1739 = shufflevector <2 x i32> %1714, <2 x i32> %1715, <2 x i32> , !dbg !36 + %1740 = shufflevector <2 x i32> %1518, <2 x i32> %1519, <2 x i32> , !dbg !36 + %1741 = xor <2 x i32> %1739, %1740, !dbg !36 + %1742 = shufflevector <2 x i32> %1687, <2 x i32> %1688, <4 x i32> , !dbg !39 + %1743 = mul nuw nsw <4 x i32> %1742, %250, !dbg !39 + %1744 = shufflevector <2 x i32> %1695, <2 x i32> %1696, <4 x i32> , !dbg !39 + %1745 = mul nuw nsw <4 x i32> %1744, %250, !dbg !39 + %1746 = extractelement <4 x i32> %1743, i64 0, !dbg !64 + %1747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1746, i32 1, i32 31), !dbg !64 + %1748 = extractelement <4 x i32> %1743, i64 1, !dbg !64 + %1749 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1748, i32 1, i32 31), !dbg !64 + %1750 = extractelement <4 x i32> %1743, i64 2, !dbg !64 + %1751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1750, i32 1, i32 31), !dbg !64 + %1752 = extractelement <4 x i32> %1743, i64 3, !dbg !64 + %1753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1752, i32 1, i32 31), !dbg !64 + %1754 = insertelement <4 x i32> poison, i32 %1747, i64 0, !dbg !65 + %1755 = insertelement <4 x i32> %1754, i32 %1749, i64 1, !dbg !65 + %1756 = insertelement <4 x i32> %1755, i32 %1751, i64 2, !dbg !65 + %1757 = insertelement <4 x i32> %1756, i32 %1753, i64 3, !dbg !65 + %1758 = add <4 x i32> %1757, %1743, !dbg !65 + %1759 = extractelement <4 x i32> %1745, i64 0, !dbg !64 + %1760 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1759, i32 1, i32 31), !dbg !64 + %1761 = extractelement <4 x i32> %1745, i64 1, !dbg !64 + %1762 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1761, i32 1, i32 31), !dbg !64 + %1763 = extractelement <4 x i32> %1745, i64 2, !dbg !64 + %1764 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1763, i32 1, i32 31), !dbg !64 + %1765 = extractelement <4 x i32> %1745, i64 3, !dbg !64 + %1766 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1765, i32 1, i32 31), !dbg !64 + %1767 = insertelement <4 x i32> poison, i32 %1760, i64 0, !dbg !65 + %1768 = insertelement <4 x i32> %1767, i32 %1762, i64 1, !dbg !65 + %1769 = insertelement <4 x i32> %1768, i32 %1764, i64 2, !dbg !65 + %1770 = insertelement <4 x i32> %1769, i32 %1766, i64 3, !dbg !65 + %1771 = add <4 x i32> %1770, %1745, !dbg !65 + %1772 = mul nuw nsw <4 x i32> %1742, %252, !dbg !40 + %1773 = mul nuw nsw <4 x i32> %1744, %252, !dbg !40 + %1774 = extractelement <4 x i32> %1772, i64 0, !dbg !64 + %1775 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1774, i32 1, i32 31), !dbg !64 + %1776 = extractelement <4 x i32> %1772, i64 1, !dbg !64 + %1777 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1776, i32 1, i32 31), !dbg !64 + %1778 = extractelement <4 x i32> %1772, i64 2, !dbg !64 + %1779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1778, i32 1, i32 31), !dbg !64 + %1780 = extractelement <4 x i32> %1772, i64 3, !dbg !64 + %1781 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1780, i32 1, i32 31), !dbg !64 + %1782 = insertelement <4 x i32> poison, i32 %1775, i64 0, !dbg !65 + %1783 = insertelement <4 x i32> %1782, i32 %1777, i64 1, !dbg !65 + %1784 = insertelement <4 x i32> %1783, i32 %1779, i64 2, !dbg !65 + %1785 = insertelement <4 x i32> %1784, i32 %1781, i64 3, !dbg !65 + %1786 = add <4 x i32> %1785, %1772, !dbg !65 + %1787 = extractelement <4 x i32> %1773, i64 0, !dbg !64 + %1788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1787, i32 1, i32 31), !dbg !64 + %1789 = extractelement <4 x i32> %1773, i64 1, !dbg !64 + %1790 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1789, i32 1, i32 31), !dbg !64 + %1791 = extractelement <4 x i32> %1773, i64 2, !dbg !64 + %1792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1791, i32 1, i32 31), !dbg !64 + %1793 = extractelement <4 x i32> %1773, i64 3, !dbg !64 + %1794 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1793, i32 1, i32 31), !dbg !64 + %1795 = insertelement <4 x i32> poison, i32 %1788, i64 0, !dbg !65 + %1796 = insertelement <4 x i32> %1795, i32 %1790, i64 1, !dbg !65 + %1797 = insertelement <4 x i32> %1796, i32 %1792, i64 2, !dbg !65 + %1798 = insertelement <4 x i32> %1797, i32 %1794, i64 3, !dbg !65 + %1799 = add <4 x i32> %1798, %1773, !dbg !65 + %1800 = extractelement <2 x i32> %1728, i64 1, !dbg !41 + %1801 = mul nuw nsw i32 %1800, %55, !dbg !42 + %1802 = extractelement <2 x i32> %1725, i64 1, !dbg !41 + %1803 = mul nuw nsw i32 %1802, %55, !dbg !42 + %1804 = extractelement <2 x i32> %1728, i64 0, !dbg !41 + %1805 = mul nuw nsw i32 %1804, %55, !dbg !42 + %1806 = extractelement <2 x i32> %1725, i64 0, !dbg !41 + %1807 = mul nuw nsw i32 %1806, %55, !dbg !42 + %1808 = extractelement <2 x i32> %1741, i64 1, !dbg !41 + %1809 = mul nuw nsw i32 %1808, %55, !dbg !42 + %1810 = extractelement <2 x i32> %1738, i64 1, !dbg !41 + %1811 = mul nuw nsw i32 %1810, %55, !dbg !42 + %1812 = extractelement <2 x i32> %1741, i64 0, !dbg !41 + %1813 = mul nuw nsw i32 %1812, %55, !dbg !42 + %1814 = extractelement <2 x i32> %1738, i64 0, !dbg !41 + %1815 = mul nuw nsw i32 %1814, %55, !dbg !42 + %1816 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1801, i32 1, i32 31), !dbg !64 + %1817 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1803, i32 1, i32 31), !dbg !64 + %1818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1805, i32 1, i32 31), !dbg !64 + %1819 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1807, i32 1, i32 31), !dbg !64 + %1820 = insertelement <2 x i32> poison, i32 %1818, i64 0, !dbg !65 + %1821 = insertelement <2 x i32> %1820, i32 %1817, i64 1, !dbg !65 + %1822 = insertelement <2 x i32> poison, i32 %1805, i64 0, !dbg !65 + %1823 = insertelement <2 x i32> %1822, i32 %1803, i64 1, !dbg !65 + %1824 = add <2 x i32> %1821, %1823, !dbg !65 + %1825 = insertelement <4 x i32> poison, i32 %1816, i64 0, !dbg !65 + %1826 = insertelement <4 x i32> %1825, i32 %1817, i64 1, !dbg !65 + %1827 = insertelement <4 x i32> %1826, i32 %1818, i64 2, !dbg !65 + %1828 = insertelement <4 x i32> %1827, i32 %1819, i64 3, !dbg !65 + %1829 = insertelement <4 x i32> poison, i32 %1801, i64 0, !dbg !65 + %1830 = insertelement <4 x i32> %1829, i32 %1803, i64 1, !dbg !65 + %1831 = insertelement <4 x i32> %1830, i32 %1805, i64 2, !dbg !65 + %1832 = insertelement <4 x i32> %1831, i32 %1807, i64 3, !dbg !65 + %1833 = add <4 x i32> %1828, %1832, !dbg !65 + %1834 = insertelement <2 x i32> poison, i32 %1819, i64 0, !dbg !65 + %1835 = insertelement <2 x i32> %1834, i32 %1816, i64 1, !dbg !65 + %1836 = insertelement <2 x i32> poison, i32 %1807, i64 0, !dbg !65 + %1837 = insertelement <2 x i32> %1836, i32 %1801, i64 1, !dbg !65 + %1838 = add <2 x i32> %1835, %1837, !dbg !65 + %1839 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1809, i32 1, i32 31), !dbg !64 + %1840 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1811, i32 1, i32 31), !dbg !64 + %1841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1813, i32 1, i32 31), !dbg !64 + %1842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1815, i32 1, i32 31), !dbg !64 + %1843 = insertelement <2 x i32> poison, i32 %1841, i64 0, !dbg !65 + %1844 = insertelement <2 x i32> %1843, i32 %1840, i64 1, !dbg !65 + %1845 = insertelement <2 x i32> poison, i32 %1813, i64 0, !dbg !65 + %1846 = insertelement <2 x i32> %1845, i32 %1811, i64 1, !dbg !65 + %1847 = add <2 x i32> %1844, %1846, !dbg !65 + %1848 = insertelement <4 x i32> poison, i32 %1839, i64 0, !dbg !65 + %1849 = insertelement <4 x i32> %1848, i32 %1840, i64 1, !dbg !65 + %1850 = insertelement <4 x i32> %1849, i32 %1841, i64 2, !dbg !65 + %1851 = insertelement <4 x i32> %1850, i32 %1842, i64 3, !dbg !65 + %1852 = insertelement <4 x i32> poison, i32 %1809, i64 0, !dbg !65 + %1853 = insertelement <4 x i32> %1852, i32 %1811, i64 1, !dbg !65 + %1854 = insertelement <4 x i32> %1853, i32 %1813, i64 2, !dbg !65 + %1855 = insertelement <4 x i32> %1854, i32 %1815, i64 3, !dbg !65 + %1856 = add <4 x i32> %1851, %1855, !dbg !65 + %1857 = insertelement <2 x i32> poison, i32 %1842, i64 0, !dbg !65 + %1858 = insertelement <2 x i32> %1857, i32 %1839, i64 1, !dbg !65 + %1859 = insertelement <2 x i32> poison, i32 %1815, i64 0, !dbg !65 + %1860 = insertelement <2 x i32> %1859, i32 %1809, i64 1, !dbg !65 + %1861 = add <2 x i32> %1858, %1860, !dbg !65 + %1862 = mul nuw nsw i32 %1800, %53, !dbg !41 + %1863 = mul nuw nsw i32 %1802, %53, !dbg !41 + %1864 = mul nuw nsw i32 %1804, %53, !dbg !41 + %1865 = mul nuw nsw i32 %1806, %53, !dbg !41 + %1866 = mul nuw nsw i32 %1808, %53, !dbg !41 + %1867 = mul nuw nsw i32 %1810, %53, !dbg !41 + %1868 = mul nuw nsw i32 %1812, %53, !dbg !41 + %1869 = mul nuw nsw i32 %1814, %53, !dbg !41 + %1870 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1862, i32 1, i32 31), !dbg !64 + %1871 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1863, i32 1, i32 31), !dbg !64 + %1872 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1864, i32 1, i32 31), !dbg !64 + %1873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1865, i32 1, i32 31), !dbg !64 + %1874 = insertelement <2 x i32> poison, i32 %1872, i64 0, !dbg !65 + %1875 = insertelement <2 x i32> %1874, i32 %1871, i64 1, !dbg !65 + %1876 = insertelement <2 x i32> poison, i32 %1864, i64 0, !dbg !65 + %1877 = insertelement <2 x i32> %1876, i32 %1863, i64 1, !dbg !65 + %1878 = add <2 x i32> %1875, %1877, !dbg !65 + %1879 = insertelement <4 x i32> poison, i32 %1870, i64 0, !dbg !65 + %1880 = insertelement <4 x i32> %1879, i32 %1871, i64 1, !dbg !65 + %1881 = insertelement <4 x i32> %1880, i32 %1872, i64 2, !dbg !65 + %1882 = insertelement <4 x i32> %1881, i32 %1873, i64 3, !dbg !65 + %1883 = insertelement <4 x i32> poison, i32 %1862, i64 0, !dbg !65 + %1884 = insertelement <4 x i32> %1883, i32 %1863, i64 1, !dbg !65 + %1885 = insertelement <4 x i32> %1884, i32 %1864, i64 2, !dbg !65 + %1886 = insertelement <4 x i32> %1885, i32 %1865, i64 3, !dbg !65 + %1887 = add <4 x i32> %1882, %1886, !dbg !65 + %1888 = insertelement <2 x i32> poison, i32 %1873, i64 0, !dbg !65 + %1889 = insertelement <2 x i32> %1888, i32 %1870, i64 1, !dbg !65 + %1890 = insertelement <2 x i32> poison, i32 %1865, i64 0, !dbg !65 + %1891 = insertelement <2 x i32> %1890, i32 %1862, i64 1, !dbg !65 + %1892 = add <2 x i32> %1889, %1891, !dbg !65 + %1893 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1866, i32 1, i32 31), !dbg !64 + %1894 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1867, i32 1, i32 31), !dbg !64 + %1895 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1868, i32 1, i32 31), !dbg !64 + %1896 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1869, i32 1, i32 31), !dbg !64 + %1897 = insertelement <2 x i32> poison, i32 %1895, i64 0, !dbg !65 + %1898 = insertelement <2 x i32> %1897, i32 %1894, i64 1, !dbg !65 + %1899 = insertelement <2 x i32> poison, i32 %1868, i64 0, !dbg !65 + %1900 = insertelement <2 x i32> %1899, i32 %1867, i64 1, !dbg !65 + %1901 = add <2 x i32> %1898, %1900, !dbg !65 + %1902 = insertelement <4 x i32> poison, i32 %1893, i64 0, !dbg !65 + %1903 = insertelement <4 x i32> %1902, i32 %1894, i64 1, !dbg !65 + %1904 = insertelement <4 x i32> %1903, i32 %1895, i64 2, !dbg !65 + %1905 = insertelement <4 x i32> %1904, i32 %1896, i64 3, !dbg !65 + %1906 = insertelement <4 x i32> poison, i32 %1866, i64 0, !dbg !65 + %1907 = insertelement <4 x i32> %1906, i32 %1867, i64 1, !dbg !65 + %1908 = insertelement <4 x i32> %1907, i32 %1868, i64 2, !dbg !65 + %1909 = insertelement <4 x i32> %1908, i32 %1869, i64 3, !dbg !65 + %1910 = add <4 x i32> %1905, %1909, !dbg !65 + %1911 = insertelement <2 x i32> poison, i32 %1896, i64 0, !dbg !65 + %1912 = insertelement <2 x i32> %1911, i32 %1893, i64 1, !dbg !65 + %1913 = insertelement <2 x i32> poison, i32 %1869, i64 0, !dbg !65 + %1914 = insertelement <2 x i32> %1913, i32 %1866, i64 1, !dbg !65 + %1915 = add <2 x i32> %1912, %1914, !dbg !65 + %1916 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1917 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1918 = shufflevector <4 x i1> %1917, <4 x i1> poison, <2 x i32> , !dbg !29 + %1919 = icmp slt <4 x i32> %1758, %1786, !dbg !29 + %1920 = shufflevector <4 x i1> %1919, <4 x i1> poison, <2 x i32> , !dbg !29 + %1921 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1922 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1923 = shufflevector <4 x i1> %1922, <4 x i1> poison, <2 x i32> , !dbg !29 + %1924 = icmp slt <4 x i32> %1771, %1799, !dbg !29 + %1925 = shufflevector <4 x i1> %1924, <4 x i1> poison, <2 x i32> , !dbg !29 + %1926 = icmp eq <4 x i32> %1758, %1786, !dbg !66 + %1927 = icmp eq <4 x i32> %1771, %1799, !dbg !66 + %1928 = icmp sgt <4 x i32> %1833, %1887, !dbg !67 + %1929 = icmp sgt <4 x i32> %1856, %1910, !dbg !67 + %1930 = and <4 x i1> %1926, %1928, !dbg !68 + %1931 = and <4 x i1> %1926, %1928, !dbg !68 + %1932 = shufflevector <4 x i1> %1931, <4 x i1> poison, <2 x i32> , !dbg !68 + %1933 = and <4 x i1> %1926, %1928, !dbg !68 + %1934 = shufflevector <4 x i1> %1933, <4 x i1> poison, <2 x i32> , !dbg !68 + %1935 = and <4 x i1> %1927, %1929, !dbg !68 + %1936 = and <4 x i1> %1927, %1929, !dbg !68 + %1937 = shufflevector <4 x i1> %1936, <4 x i1> poison, <2 x i32> , !dbg !68 + %1938 = and <4 x i1> %1927, %1929, !dbg !68 + %1939 = shufflevector <4 x i1> %1938, <4 x i1> poison, <2 x i32> , !dbg !68 + %1940 = or <4 x i1> %1916, %1930, !dbg !69 + %1941 = or <2 x i1> %1918, %1932, !dbg !69 + %1942 = or <2 x i1> %1920, %1934, !dbg !69 + %1943 = shufflevector <2 x i1> %1920, <2 x i1> %1918, <2 x i32> , !dbg !69 + %1944 = shufflevector <2 x i1> %1934, <2 x i1> %1932, <2 x i32> , !dbg !69 + %1945 = or <2 x i1> %1943, %1944, !dbg !69 + %1946 = shufflevector <2 x i1> %1918, <2 x i1> %1920, <2 x i32> , !dbg !69 + %1947 = shufflevector <2 x i1> %1932, <2 x i1> %1934, <2 x i32> , !dbg !69 + %1948 = or <2 x i1> %1946, %1947, !dbg !69 + %1949 = or <4 x i1> %1921, %1935, !dbg !69 + %1950 = or <2 x i1> %1923, %1937, !dbg !69 + %1951 = or <2 x i1> %1925, %1939, !dbg !69 + %1952 = shufflevector <2 x i1> %1925, <2 x i1> %1923, <2 x i32> , !dbg !69 + %1953 = shufflevector <2 x i1> %1939, <2 x i1> %1937, <2 x i32> , !dbg !69 + %1954 = or <2 x i1> %1952, %1953, !dbg !69 + %1955 = shufflevector <2 x i1> %1923, <2 x i1> %1925, <2 x i32> , !dbg !69 + %1956 = shufflevector <2 x i1> %1937, <2 x i1> %1939, <2 x i32> , !dbg !69 + %1957 = or <2 x i1> %1955, %1956, !dbg !69 + %1958 = xor <4 x i32> %1786, %1758, !dbg !32 + %1959 = shufflevector <4 x i32> %1958, <4 x i32> poison, <2 x i32> , !dbg !32 + %1960 = xor <4 x i32> %1786, %1758, !dbg !32 + %1961 = shufflevector <4 x i32> %1960, <4 x i32> poison, <2 x i32> , !dbg !32 + %1962 = xor <4 x i32> %1799, %1771, !dbg !32 + %1963 = shufflevector <4 x i32> %1962, <4 x i32> poison, <2 x i32> , !dbg !32 + %1964 = xor <4 x i32> %1799, %1771, !dbg !32 + %1965 = shufflevector <4 x i32> %1964, <4 x i32> poison, <2 x i32> , !dbg !32 + %1966 = shufflevector <2 x i1> %1948, <2 x i1> %1945, <2 x i32> , !dbg !33 + %1967 = shufflevector <2 x i32> %1959, <2 x i32> %1961, <2 x i32> , !dbg !33 + %1968 = select <2 x i1> %1966, <2 x i32> %1967, <2 x i32> zeroinitializer, !dbg !33 + %1969 = select <2 x i1> %1948, <2 x i32> %1959, <2 x i32> zeroinitializer, !dbg !33 + %1970 = select <2 x i1> %1945, <2 x i32> %1961, <2 x i32> zeroinitializer, !dbg !33 + %1971 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !33 + %1972 = shufflevector <2 x i32> %1961, <2 x i32> %1959, <2 x i32> , !dbg !33 + %1973 = select <2 x i1> %1971, <2 x i32> %1972, <2 x i32> zeroinitializer, !dbg !33 + %1974 = shufflevector <2 x i1> %1957, <2 x i1> %1954, <2 x i32> , !dbg !33 + %1975 = shufflevector <2 x i32> %1963, <2 x i32> %1965, <2 x i32> , !dbg !33 + %1976 = select <2 x i1> %1974, <2 x i32> %1975, <2 x i32> zeroinitializer, !dbg !33 + %1977 = select <2 x i1> %1957, <2 x i32> %1963, <2 x i32> zeroinitializer, !dbg !33 + %1978 = select <2 x i1> %1954, <2 x i32> %1965, <2 x i32> zeroinitializer, !dbg !33 + %1979 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !33 + %1980 = shufflevector <2 x i32> %1965, <2 x i32> %1963, <2 x i32> , !dbg !33 + %1981 = select <2 x i1> %1979, <2 x i32> %1980, <2 x i32> zeroinitializer, !dbg !33 + %1982 = xor <2 x i32> %1968, %1686, !dbg !34 + %1983 = xor <2 x i32> %1969, %1687, !dbg !34 + %1984 = xor <2 x i32> %1970, %1688, !dbg !34 + %1985 = xor <2 x i32> %1973, %1691, !dbg !34 + %1986 = xor <2 x i32> %1976, %1694, !dbg !34 + %1987 = xor <2 x i32> %1977, %1695, !dbg !34 + %1988 = xor <2 x i32> %1978, %1696, !dbg !34 + %1989 = xor <2 x i32> %1981, %1699, !dbg !34 + %1990 = shufflevector <2 x i32> %1892, <2 x i32> %1878, <4 x i32> , !dbg !38 + %1991 = shufflevector <2 x i32> %1838, <2 x i32> %1824, <4 x i32> , !dbg !38 + %1992 = xor <4 x i32> %1990, %1991, !dbg !38 + %1993 = xor <2 x i32> %1878, %1824, !dbg !38 + %1994 = xor <2 x i32> %1892, %1838, !dbg !38 + %1995 = shufflevector <2 x i32> %1915, <2 x i32> %1901, <4 x i32> , !dbg !38 + %1996 = shufflevector <2 x i32> %1861, <2 x i32> %1847, <4 x i32> , !dbg !38 + %1997 = xor <4 x i32> %1995, %1996, !dbg !38 + %1998 = xor <2 x i32> %1901, %1847, !dbg !38 + %1999 = xor <2 x i32> %1915, %1861, !dbg !38 + %2000 = select <4 x i1> %1940, <4 x i32> %1992, <4 x i32> zeroinitializer, !dbg !35 + %2001 = select <2 x i1> %1941, <2 x i32> %1993, <2 x i32> zeroinitializer, !dbg !35 + %2002 = select <2 x i1> %1942, <2 x i32> %1994, <2 x i32> zeroinitializer, !dbg !35 + %2003 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !35 + %2004 = shufflevector <2 x i32> %1993, <2 x i32> %1994, <2 x i32> , !dbg !35 + %2005 = select <2 x i1> %2003, <2 x i32> %2004, <2 x i32> zeroinitializer, !dbg !35 + %2006 = shufflevector <2 x i1> %1945, <2 x i1> %1948, <2 x i32> , !dbg !35 + %2007 = shufflevector <2 x i32> %1994, <2 x i32> %1993, <2 x i32> , !dbg !35 + %2008 = select <2 x i1> %2006, <2 x i32> %2007, <2 x i32> zeroinitializer, !dbg !35 + %2009 = shufflevector <2 x i32> %1994, <2 x i32> %1993, <2 x i32> , !dbg !35 + %2010 = select <2 x i1> %1945, <2 x i32> %2009, <2 x i32> zeroinitializer, !dbg !35 + %2011 = shufflevector <2 x i32> %1993, <2 x i32> %1994, <2 x i32> , !dbg !35 + %2012 = select <2 x i1> %1948, <2 x i32> %2011, <2 x i32> zeroinitializer, !dbg !35 + %2013 = select <4 x i1> %1949, <4 x i32> %1997, <4 x i32> zeroinitializer, !dbg !35 + %2014 = select <2 x i1> %1950, <2 x i32> %1998, <2 x i32> zeroinitializer, !dbg !35 + %2015 = select <2 x i1> %1951, <2 x i32> %1999, <2 x i32> zeroinitializer, !dbg !35 + %2016 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !35 + %2017 = shufflevector <2 x i32> %1998, <2 x i32> %1999, <2 x i32> , !dbg !35 + %2018 = select <2 x i1> %2016, <2 x i32> %2017, <2 x i32> zeroinitializer, !dbg !35 + %2019 = shufflevector <2 x i1> %1954, <2 x i1> %1957, <2 x i32> , !dbg !35 + %2020 = shufflevector <2 x i32> %1999, <2 x i32> %1998, <2 x i32> , !dbg !35 + %2021 = select <2 x i1> %2019, <2 x i32> %2020, <2 x i32> zeroinitializer, !dbg !35 + %2022 = shufflevector <2 x i32> %1999, <2 x i32> %1998, <2 x i32> , !dbg !35 + %2023 = select <2 x i1> %1954, <2 x i32> %2022, <2 x i32> zeroinitializer, !dbg !35 + %2024 = shufflevector <2 x i32> %1998, <2 x i32> %1999, <2 x i32> , !dbg !35 + %2025 = select <2 x i1> %1957, <2 x i32> %2024, <2 x i32> zeroinitializer, !dbg !35 + %2026 = xor <4 x i32> %1716, %2000, !dbg !36 + %2027 = xor <2 x i32> %2005, %1719, !dbg !36 + %2028 = xor <2 x i32> %2008, %1722, !dbg !36 + %2029 = xor <2 x i32> %2010, %1725, !dbg !36 + %2030 = xor <2 x i32> %2012, %1728, !dbg !36 + %2031 = xor <4 x i32> %1729, %2013, !dbg !36 + %2032 = xor <2 x i32> %2018, %1732, !dbg !36 + %2033 = xor <2 x i32> %2021, %1735, !dbg !36 + %2034 = xor <2 x i32> %2023, %1738, !dbg !36 + %2035 = xor <2 x i32> %2025, %1741, !dbg !36 + %2036 = extractelement <2 x i32> %1983, i64 0, !dbg !29 + %2037 = extractelement <2 x i32> %1983, i64 1, !dbg !29 + %2038 = icmp slt i32 %2037, %2036, !dbg !29 + %2039 = extractelement <2 x i32> %1984, i64 0, !dbg !29 + %2040 = extractelement <2 x i32> %1984, i64 1, !dbg !29 + %2041 = icmp slt i32 %2040, %2039, !dbg !29 + %2042 = extractelement <2 x i32> %1987, i64 0, !dbg !29 + %2043 = extractelement <2 x i32> %1987, i64 1, !dbg !29 + %2044 = icmp slt i32 %2043, %2042, !dbg !29 + %2045 = extractelement <2 x i32> %1988, i64 0, !dbg !29 + %2046 = extractelement <2 x i32> %1988, i64 1, !dbg !29 + %2047 = icmp slt i32 %2046, %2045, !dbg !29 + %2048 = icmp eq i32 %2037, %2036, !dbg !66 + %2049 = icmp eq i32 %2040, %2039, !dbg !66 + %2050 = icmp eq i32 %2043, %2042, !dbg !66 + %2051 = icmp eq i32 %2046, %2045, !dbg !66 + %shift127 = shufflevector <2 x i32> %2030, <2 x i32> poison, <2 x i32> , !dbg !67 + %2052 = icmp sgt <2 x i32> %shift127, %2030, !dbg !67 + %2053 = extractelement <2 x i1> %2052, i64 0, !dbg !67 + %shift128 = shufflevector <2 x i32> %2029, <2 x i32> poison, <2 x i32> , !dbg !67 + %2054 = icmp sgt <2 x i32> %shift128, %2029, !dbg !67 + %2055 = extractelement <2 x i1> %2054, i64 0, !dbg !67 + %shift129 = shufflevector <2 x i32> %2035, <2 x i32> poison, <2 x i32> , !dbg !67 + %2056 = icmp sgt <2 x i32> %shift129, %2035, !dbg !67 + %2057 = extractelement <2 x i1> %2056, i64 0, !dbg !67 + %shift130 = shufflevector <2 x i32> %2034, <2 x i32> poison, <2 x i32> , !dbg !67 + %2058 = icmp sgt <2 x i32> %shift130, %2034, !dbg !67 + %2059 = extractelement <2 x i1> %2058, i64 0, !dbg !67 + %2060 = and i1 %2048, %2053, !dbg !68 + %2061 = and i1 %2049, %2055, !dbg !68 + %2062 = and i1 %2050, %2057, !dbg !68 + %2063 = and i1 %2051, %2059, !dbg !68 + %2064 = insertelement <2 x i1> poison, i1 %2038, i64 0, !dbg !69 + %2065 = insertelement <2 x i1> %2064, i1 %2041, i64 1, !dbg !69 + %2066 = insertelement <2 x i1> poison, i1 %2060, i64 0, !dbg !69 + %2067 = insertelement <2 x i1> %2066, i1 %2061, i64 1, !dbg !69 + %2068 = or <2 x i1> %2065, %2067, !dbg !69 + %2069 = insertelement <2 x i1> poison, i1 %2044, i64 0, !dbg !69 + %2070 = insertelement <2 x i1> %2069, i1 %2047, i64 1, !dbg !69 + %2071 = insertelement <2 x i1> poison, i1 %2062, i64 0, !dbg !69 + %2072 = insertelement <2 x i1> %2071, i1 %2063, i64 1, !dbg !69 + %2073 = or <2 x i1> %2070, %2072, !dbg !69 + %2074 = shufflevector <2 x i32> %1983, <2 x i32> %1984, <2 x i32> , !dbg !32 + %2075 = shufflevector <2 x i32> %1983, <2 x i32> %1984, <2 x i32> , !dbg !32 + %2076 = xor <2 x i32> %2074, %2075, !dbg !32 + %2077 = shufflevector <2 x i32> %1987, <2 x i32> %1988, <2 x i32> , !dbg !32 + %2078 = shufflevector <2 x i32> %1987, <2 x i32> %1988, <2 x i32> , !dbg !32 + %2079 = xor <2 x i32> %2077, %2078, !dbg !32 + %2080 = select <2 x i1> %2068, <2 x i32> %2076, <2 x i32> zeroinitializer, !dbg !33 + %2081 = select <2 x i1> %2073, <2 x i32> %2079, <2 x i32> zeroinitializer, !dbg !33 + %2082 = xor <2 x i32> %2080, %1982, !dbg !34 + %2083 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %2084 = xor <2 x i32> %2083, %1983, !dbg !34 + %2085 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> , !dbg !34 + %2086 = shufflevector <2 x i32> %2080, <2 x i32> poison, <2 x i32> , !dbg !34 + %2087 = xor <2 x i32> %2086, %1984, !dbg !34 + %2088 = xor <2 x i32> %2085, %1985, !dbg !34 + %2089 = xor <2 x i32> %2081, %1986, !dbg !34 + %2090 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %2091 = xor <2 x i32> %2090, %1987, !dbg !34 + %2092 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> , !dbg !34 + %2093 = shufflevector <2 x i32> %2081, <2 x i32> poison, <2 x i32> , !dbg !34 + %2094 = xor <2 x i32> %2093, %1988, !dbg !34 + %2095 = xor <2 x i32> %2092, %1989, !dbg !34 + %2096 = xor <2 x i32> %2028, %2027, !dbg !38 + %2097 = xor <2 x i32> %2033, %2032, !dbg !38 + %2098 = shufflevector <2 x i1> %2068, <2 x i1> poison, <2 x i32> , !dbg !35 + %2099 = select <2 x i1> %2098, <2 x i32> %2096, <2 x i32> zeroinitializer, !dbg !35 + %2100 = shufflevector <2 x i1> %2073, <2 x i1> poison, <2 x i32> , !dbg !35 + %2101 = select <2 x i1> %2100, <2 x i32> %2097, <2 x i32> zeroinitializer, !dbg !35 + %2102 = shufflevector <2 x i32> %2099, <2 x i32> poison, <4 x i32> , !dbg !36 + %2103 = xor <4 x i32> %2026, %2102, !dbg !36 + %2104 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> , !dbg !36 + %2105 = xor <2 x i32> %2001, %2104, !dbg !36 + %2106 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !36 + %2107 = xor <2 x i32> %2106, %2029, !dbg !36 + %2108 = shufflevector <2 x i32> %2099, <2 x i32> poison, <2 x i32> , !dbg !36 + %2109 = xor <2 x i32> %2108, %2030, !dbg !36 + %2110 = shufflevector <2 x i32> %2101, <2 x i32> poison, <4 x i32> , !dbg !36 + %2111 = xor <4 x i32> %2031, %2110, !dbg !36 + %2112 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> , !dbg !36 + %2113 = xor <2 x i32> %2014, %2112, !dbg !36 + %2114 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !36 + %2115 = xor <2 x i32> %2114, %2034, !dbg !36 + %2116 = shufflevector <2 x i32> %2101, <2 x i32> poison, <2 x i32> , !dbg !36 + %2117 = xor <2 x i32> %2116, %2035, !dbg !36 + %2118 = icmp slt <2 x i32> %2084, %2087, !dbg !29 + %2119 = icmp slt <2 x i32> %2091, %2094, !dbg !29 + %2120 = icmp eq <2 x i32> %2082, %2088, !dbg !66 + %2121 = icmp eq <2 x i32> %2089, %2095, !dbg !66 + %2122 = icmp sgt <2 x i32> %2109, %2107, !dbg !67 + %2123 = icmp sgt <2 x i32> %2117, %2115, !dbg !67 + %2124 = and <2 x i1> %2120, %2122, !dbg !68 + %2125 = and <2 x i1> %2121, %2123, !dbg !68 + %2126 = or <2 x i1> %2118, %2124, !dbg !69 + %2127 = or <2 x i1> %2119, %2125, !dbg !69 + %2128 = xor <2 x i32> %2105, %2002, !dbg !38 + %2129 = xor <2 x i32> %2128, %1514, !dbg !38 + %2130 = xor <2 x i32> %2129, %1711, !dbg !38 + %2131 = xor <2 x i32> %2130, %1515, !dbg !38 + %2132 = xor <2 x i32> %2131, %1712, !dbg !38 + %2133 = xor <2 x i32> %2132, %2099, !dbg !38 + %2134 = xor <2 x i32> %2113, %2015, !dbg !38 + %2135 = xor <2 x i32> %2134, %1518, !dbg !38 + %2136 = xor <2 x i32> %2135, %1714, !dbg !38 + %2137 = xor <2 x i32> %2136, %1519, !dbg !38 + %2138 = xor <2 x i32> %2137, %1715, !dbg !38 + %2139 = xor <2 x i32> %2138, %2101, !dbg !38 + %2140 = select <2 x i1> %2126, <2 x i32> %2133, <2 x i32> zeroinitializer, !dbg !35 + %2141 = shufflevector <2 x i32> %2140, <2 x i32> poison, <4 x i32> , !dbg !35 + %2142 = select <2 x i1> %2127, <2 x i32> %2139, <2 x i32> zeroinitializer, !dbg !35 + %2143 = shufflevector <2 x i32> %2142, <2 x i32> poison, <4 x i32> , !dbg !35 + %2144 = xor <4 x i32> %2103, %2141, !dbg !36 + %2145 = xor <4 x i32> %2144, %1374, !dbg !36 + %2146 = xor <4 x i32> %2111, %2143, !dbg !36 + %2147 = xor <4 x i32> %2146, %1375, !dbg !36 + %2148 = insertelement <4 x i1> poison, i1 %21, i64 0, !dbg !59 + %2149 = shufflevector <4 x i1> %2148, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !59 + %2150 = select <4 x i1> %2149, <4 x i1> %399, <4 x i1> zeroinitializer, !dbg !59 + %2151 = bitcast <4 x i1> %393 to i4, !dbg !70 + %2152 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2151), !dbg !70 + %2153 = zext nneg i4 %2152 to i64, !dbg !70 + %2154 = bitcast <4 x i1> %2150 to i4, !dbg !70 + %2155 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2154), !dbg !70 + %2156 = zext nneg i4 %2155 to i64, !dbg !70 + %2157 = zext nneg i4 %2152 to i32, !dbg !72 + %2158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2157, i32 2, i32 31), !dbg !72 + %2159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !72 + %2160 = insertelement <2 x i32> poison, i32 %2158, i64 0, !dbg !72 + %2161 = insertelement <2 x i32> %2160, i32 %2159, i64 1, !dbg !72 + %2162 = bitcast <2 x i32> %2161 to i64, !dbg !72 + %2163 = add i64 %2153, %2162, !dbg !70 + %extelt.offset = lshr i64 %2163, 32, !dbg !72 + %2164 = trunc nuw i64 %extelt.offset to i32, !dbg !72 + %2165 = trunc i64 %2163 to i32, !dbg !72 + %2166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2165, i32 1, i32 31), !dbg !72 + %2167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2164, i32 1, i32 31), !dbg !72 + %2168 = insertelement <2 x i32> poison, i32 %2166, i64 0, !dbg !72 + %2169 = insertelement <2 x i32> %2168, i32 %2167, i64 1, !dbg !72 + %2170 = bitcast <2 x i32> %2169 to i64, !dbg !72 + %2171 = add i64 %2163, %2170, !dbg !70 + %2172 = zext nneg i4 %2155 to i32, !dbg !72 + %2173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2172, i32 2, i32 31), !dbg !72 + %2174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !72 + %2175 = insertelement <2 x i32> poison, i32 %2173, i64 0, !dbg !72 + %2176 = insertelement <2 x i32> %2175, i32 %2174, i64 1, !dbg !72 + %2177 = bitcast <2 x i32> %2176 to i64, !dbg !72 + %2178 = add i64 %2156, %2177, !dbg !70 + %extelt.offset60 = lshr i64 %2178, 32, !dbg !72 + %2179 = trunc nuw i64 %extelt.offset60 to i32, !dbg !72 + %2180 = trunc i64 %2178 to i32, !dbg !72 + %2181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2180, i32 1, i32 31), !dbg !72 + %2182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2179, i32 1, i32 31), !dbg !72 + %2183 = insertelement <2 x i32> poison, i32 %2181, i64 0, !dbg !72 + %2184 = insertelement <2 x i32> %2183, i32 %2182, i64 1, !dbg !72 + %2185 = bitcast <2 x i32> %2184 to i64, !dbg !72 + %2186 = add i64 %2178, %2185, !dbg !70 + %2187 = shl nuw nsw i32 %15, 1, !dbg !73 + %2188 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2187, !dbg !73 + %2189 = insertelement <1 x i64> poison, i64 %2171, i64 0, !dbg !73 + store <1 x i64> %2189, ptr addrspace(3) %2188, align 8, !dbg !73 + %2190 = getelementptr inbounds nuw i8, ptr addrspace(3) %2188, i32 512, !dbg !73 + %2191 = insertelement <1 x i64> poison, i64 %2186, i64 0, !dbg !73 + store <1 x i64> %2191, ptr addrspace(3) %2190, align 8, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !73 + %2192 = shl nuw nsw i32 %17, 3, !dbg !73 + %2193 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2192, !dbg !73 + %2194 = load i64, ptr addrspace(3) %2193, align 8, !dbg !73 + %2195 = insertelement <4 x i1> poison, i1 %61, i64 0, !dbg !74 + %2196 = insertelement <4 x i1> %2195, i1 %62, i64 1, !dbg !74 + %2197 = insertelement <4 x i1> %2196, i1 %63, i64 2, !dbg !74 + %2198 = insertelement <4 x i1> %2197, i1 %64, i64 3, !dbg !74 + %2199 = select <4 x i1> %392, <4 x i1> %2198, <4 x i1> zeroinitializer, !dbg !74 + %2200 = insertelement <4 x i1> poison, i1 %65, i64 0, !dbg !74 + %2201 = insertelement <4 x i1> %2200, i1 %66, i64 1, !dbg !74 + %2202 = insertelement <4 x i1> %2201, i1 %67, i64 2, !dbg !74 + %2203 = insertelement <4 x i1> %2202, i1 %68, i64 3, !dbg !74 + %2204 = select <4 x i1> %2149, <4 x i1> %2203, <4 x i1> zeroinitializer, !dbg !74 + %2205 = bitcast <4 x i1> %2199 to i4, !dbg !75 + %2206 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2205), !dbg !75 + %2207 = zext nneg i4 %2206 to i64, !dbg !75 + %2208 = bitcast <4 x i1> %2204 to i4, !dbg !75 + %2209 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2208), !dbg !75 + %2210 = zext nneg i4 %2209 to i64, !dbg !75 + %2211 = zext nneg i4 %2206 to i32, !dbg !77 + %2212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2211, i32 2, i32 31), !dbg !77 + %2213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %2214 = insertelement <2 x i32> poison, i32 %2212, i64 0, !dbg !77 + %2215 = insertelement <2 x i32> %2214, i32 %2213, i64 1, !dbg !77 + %2216 = bitcast <2 x i32> %2215 to i64, !dbg !77 + %2217 = add i64 %2207, %2216, !dbg !75 + %extelt.offset70 = lshr i64 %2217, 32, !dbg !77 + %2218 = trunc nuw i64 %extelt.offset70 to i32, !dbg !77 + %2219 = trunc i64 %2217 to i32, !dbg !77 + %2220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2219, i32 1, i32 31), !dbg !77 + %2221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2218, i32 1, i32 31), !dbg !77 + %2222 = insertelement <2 x i32> poison, i32 %2220, i64 0, !dbg !77 + %2223 = insertelement <2 x i32> %2222, i32 %2221, i64 1, !dbg !77 + %2224 = bitcast <2 x i32> %2223 to i64, !dbg !77 + %2225 = add i64 %2217, %2224, !dbg !75 + %2226 = zext nneg i4 %2209 to i32, !dbg !77 + %2227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2226, i32 2, i32 31), !dbg !77 + %2228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %2229 = insertelement <2 x i32> poison, i32 %2227, i64 0, !dbg !77 + %2230 = insertelement <2 x i32> %2229, i32 %2228, i64 1, !dbg !77 + %2231 = bitcast <2 x i32> %2230 to i64, !dbg !77 + %2232 = add i64 %2210, %2231, !dbg !75 + %extelt.offset72 = lshr i64 %2232, 32, !dbg !77 + %2233 = trunc nuw i64 %extelt.offset72 to i32, !dbg !77 + %2234 = trunc i64 %2232 to i32, !dbg !77 + %2235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2234, i32 1, i32 31), !dbg !77 + %2236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2233, i32 1, i32 31), !dbg !77 + %2237 = insertelement <2 x i32> poison, i32 %2235, i64 0, !dbg !77 + %2238 = insertelement <2 x i32> %2237, i32 %2236, i64 1, !dbg !77 + %2239 = bitcast <2 x i32> %2238 to i64, !dbg !77 + %2240 = add i64 %2232, %2239, !dbg !75 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %2241 = insertelement <1 x i64> poison, i64 %2225, i64 0, !dbg !78 + store <1 x i64> %2241, ptr addrspace(3) %2188, align 8, !dbg !78 + %2242 = insertelement <1 x i64> poison, i64 %2240, i64 0, !dbg !78 + store <1 x i64> %2242, ptr addrspace(3) %2190, align 8, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %2243 = load i64, ptr addrspace(3) %2193, align 8, !dbg !78 + %2244 = trunc i64 %2171 to i32, !dbg !73 + %2245 = trunc i64 %2186 to i32, !dbg !73 + %2246 = insertelement <4 x i32> poison, i32 %23, i64 0, !dbg !79 + %2247 = insertelement <4 x i32> %2246, i32 %24, i64 1, !dbg !79 + %2248 = insertelement <4 x i32> %2247, i32 %25, i64 2, !dbg !79 + %2249 = insertelement <4 x i32> %2248, i32 %26, i64 3, !dbg !79 + %2250 = insertelement <4 x i32> poison, i32 %2244, i64 0, !dbg !79 + %2251 = shufflevector <4 x i32> %2250, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %2252 = icmp slt <4 x i32> %2249, %2251, !dbg !79 + %2253 = select <4 x i1> %2252, <4 x i32> %1208, <4 x i32> splat (i32 16), !dbg !80 + %2254 = add <4 x i32> %2253, splat (i32 17), !dbg !81 + %2255 = insertelement <4 x i32> poison, i32 %2245, i64 0, !dbg !79 + %2256 = shufflevector <4 x i32> %2255, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %2257 = icmp slt <4 x i32> %2249, %2256, !dbg !79 + %2258 = select <4 x i1> %2257, <4 x i32> %1219, <4 x i32> splat (i32 16), !dbg !80 + %2259 = add <4 x i32> %2258, splat (i32 17), !dbg !81 + %2260 = icmp slt <4 x i32> %2253, zeroinitializer, !dbg !82 + %2261 = select <4 x i1> %2260, <4 x i32> %2254, <4 x i32> %2253, !dbg !83 + %2262 = icmp ugt <4 x i32> %2261, splat (i32 16), !dbg !84 + %2263 = icmp slt <4 x i32> %2258, zeroinitializer, !dbg !82 + %2264 = select <4 x i1> %2263, <4 x i32> %2259, <4 x i32> %2258, !dbg !83 + %2265 = icmp ugt <4 x i32> %2264, splat (i32 16), !dbg !84 + %2266 = bitcast <4 x i1> %2262 to i4, !dbg !85 + %2267 = icmp ne i4 %2266, 0, !dbg !85 + %2268 = and i1 %20, %2267, !dbg !85 + %2269 = bitcast <4 x i1> %2265 to i4, !dbg !85 + %2270 = icmp ne i4 %2269, 0, !dbg !85 + %2271 = and i1 %21, %2270, !dbg !85 + %2272 = or i1 %2268, %2271, !dbg !85 + br i1 %2272, label %2273, label %2274, !dbg !85 + +2273: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !85 + unreachable, !dbg !85 + +2274: ; preds = %11 + %2275 = trunc i64 %2240 to i32, !dbg !78 + %2276 = trunc i64 %2225 to i32, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !85 + %2277 = insertelement <4 x i32> poison, i32 %2276, i64 0, !dbg !86 + %2278 = shufflevector <4 x i32> %2277, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %2279 = icmp slt <4 x i32> %2249, %2278, !dbg !86 + %2280 = select <4 x i1> %2279, <4 x i32> %2145, <4 x i32> splat (i32 16), !dbg !87 + %2281 = add <4 x i32> %2280, splat (i32 17), !dbg !88 + %2282 = icmp slt <4 x i32> %2280, zeroinitializer, !dbg !89 + %2283 = select <4 x i1> %2282, <4 x i32> %2281, <4 x i32> %2280, !dbg !90 + %2284 = icmp ugt <4 x i32> %2283, splat (i32 16), !dbg !91 + %2285 = insertelement <4 x i32> poison, i32 %2275, i64 0, !dbg !86 + %2286 = shufflevector <4 x i32> %2285, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %2287 = icmp slt <4 x i32> %2249, %2286, !dbg !86 + %2288 = select <4 x i1> %2287, <4 x i32> %2147, <4 x i32> splat (i32 16), !dbg !87 + %2289 = add <4 x i32> %2288, splat (i32 17), !dbg !88 + %2290 = icmp slt <4 x i32> %2288, zeroinitializer, !dbg !89 + %2291 = select <4 x i1> %2290, <4 x i32> %2289, <4 x i32> %2288, !dbg !90 + %2292 = icmp ugt <4 x i32> %2291, splat (i32 16), !dbg !91 + %2293 = bitcast <4 x i1> %2284 to i4, !dbg !92 + %2294 = icmp ne i4 %2293, 0, !dbg !92 + %2295 = and i1 %20, %2294, !dbg !92 + %2296 = bitcast <4 x i1> %2292 to i4, !dbg !92 + %2297 = icmp ne i4 %2296, 0, !dbg !92 + %2298 = and i1 %21, %2297, !dbg !92 + %2299 = or i1 %2295, %2298, !dbg !92 + br i1 %2299, label %2300, label %2301, !dbg !92 + +2300: ; preds = %2274 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !92 + unreachable, !dbg !92 + +2301: ; preds = %2274 + %2302 = trunc i64 %2243 to i32, !dbg !78 + %2303 = trunc i64 %2194 to i32, !dbg !73 + %2304 = or disjoint i32 %13, %17, !dbg !13 + %2305 = icmp slt i32 %2304, 128, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !92 + %2306 = sext i32 %2304 to i64, !dbg !93 + %2307 = getelementptr i32, ptr addrspace(1) %1, i64 %2306, !dbg !93 + %2308 = and i32 %14, 128, !dbg !94 + %2309 = icmp eq i32 %2308, 0, !dbg !94 + %2310 = and i1 %2309, %2305, !dbg !94 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %2303, ptr addrspace(1) %2307, i1 %2310) #6, !dbg !94 + %2311 = getelementptr i32, ptr addrspace(1) %2, i64 %2306, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %2302, ptr addrspace(1) %2311, i1 %2310) #6, !dbg !96 + %2312 = getelementptr i32, ptr addrspace(1) %3, i64 %33, !dbg !97 + %2313 = getelementptr i32, ptr addrspace(1) %3, i64 %37, !dbg !97 + %2314 = extractelement <4 x i32> %1208, i64 0, !dbg !98 + %2315 = extractelement <4 x i32> %1208, i64 1, !dbg !98 + %2316 = extractelement <4 x i32> %1208, i64 2, !dbg !98 + %2317 = extractelement <4 x i32> %1208, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2314, i32 %2315, i32 %2316, i32 %2317, ptr addrspace(1) %2312, i1 %20) #6, !dbg !98 + %2318 = extractelement <4 x i32> %1219, i64 0, !dbg !98 + %2319 = extractelement <4 x i32> %1219, i64 1, !dbg !98 + %2320 = extractelement <4 x i32> %1219, i64 2, !dbg !98 + %2321 = extractelement <4 x i32> %1219, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2318, i32 %2319, i32 %2320, i32 %2321, ptr addrspace(1) %2313, i1 %21) #6, !dbg !98 + %2322 = mul i32 %18, 17, !dbg !99 + %2323 = mul i32 %19, 17, !dbg !99 + %2324 = extractelement <4 x i32> %2261, i64 0, !dbg !100 + %2325 = add i32 %2324, %2322, !dbg !100 + %2326 = extractelement <4 x i32> %2261, i64 1, !dbg !100 + %2327 = add i32 %2326, %2322, !dbg !100 + %2328 = extractelement <4 x i32> %2261, i64 2, !dbg !100 + %2329 = add i32 %2328, %2322, !dbg !100 + %2330 = extractelement <4 x i32> %2261, i64 3, !dbg !100 + %2331 = add i32 %2330, %2322, !dbg !100 + %2332 = extractelement <4 x i32> %2264, i64 0, !dbg !100 + %2333 = add i32 %2332, %2323, !dbg !100 + %2334 = extractelement <4 x i32> %2264, i64 1, !dbg !100 + %2335 = add i32 %2334, %2323, !dbg !100 + %2336 = extractelement <4 x i32> %2264, i64 2, !dbg !100 + %2337 = add i32 %2336, %2323, !dbg !100 + %2338 = extractelement <4 x i32> %2264, i64 3, !dbg !100 + %2339 = add i32 %2338, %2323, !dbg !100 + %2340 = sext i32 %2325 to i64, !dbg !101 + %2341 = getelementptr i32, ptr addrspace(1) %4, i64 %2340, !dbg !101 + %2342 = sext i32 %2327 to i64, !dbg !101 + %2343 = getelementptr i32, ptr addrspace(1) %4, i64 %2342, !dbg !101 + %2344 = sext i32 %2329 to i64, !dbg !101 + %2345 = getelementptr i32, ptr addrspace(1) %4, i64 %2344, !dbg !101 + %2346 = sext i32 %2331 to i64, !dbg !101 + %2347 = getelementptr i32, ptr addrspace(1) %4, i64 %2346, !dbg !101 + %2348 = sext i32 %2333 to i64, !dbg !101 + %2349 = getelementptr i32, ptr addrspace(1) %4, i64 %2348, !dbg !101 + %2350 = sext i32 %2335 to i64, !dbg !101 + %2351 = getelementptr i32, ptr addrspace(1) %4, i64 %2350, !dbg !101 + %2352 = sext i32 %2337 to i64, !dbg !101 + %2353 = getelementptr i32, ptr addrspace(1) %4, i64 %2352, !dbg !101 + %2354 = sext i32 %2339 to i64, !dbg !101 + %2355 = getelementptr i32, ptr addrspace(1) %4, i64 %2354, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %2356 = ptrtoint ptr addrspace(1) %2341 to i64, !dbg !102 + %2357 = ptrtoint ptr addrspace(1) %2343 to i64, !dbg !102 + %2358 = ptrtoint ptr addrspace(1) %2345 to i64, !dbg !102 + %2359 = ptrtoint ptr addrspace(1) %2347 to i64, !dbg !102 + %2360 = ptrtoint ptr addrspace(1) %2349 to i64, !dbg !102 + %2361 = ptrtoint ptr addrspace(1) %2351 to i64, !dbg !102 + %2362 = ptrtoint ptr addrspace(1) %2353 to i64, !dbg !102 + %2363 = ptrtoint ptr addrspace(1) %2355 to i64, !dbg !102 + %2364 = shl nuw nsw i32 %14, 9, !dbg !102 + %2365 = and i32 %2364, 12288, !dbg !102 + %2366 = shl nuw nsw i32 %22, 5, !dbg !102 + %2367 = shl nuw nsw i32 %15, 2, !dbg !102 + %2368 = or disjoint i32 %2365, %2366, !dbg !102 + %2369 = xor i32 %2368, %2367, !dbg !102 + %2370 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2369, !dbg !102 + %2371 = insertelement <2 x i64> poison, i64 %2356, i64 0, !dbg !102 + %2372 = insertelement <2 x i64> %2371, i64 %2358, i64 1, !dbg !102 + store <2 x i64> %2372, ptr addrspace(3) %2370, align 16, !dbg !102 + %2373 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 2048, !dbg !102 + %2374 = insertelement <2 x i64> poison, i64 %2357, i64 0, !dbg !102 + %2375 = insertelement <2 x i64> %2374, i64 %2359, i64 1, !dbg !102 + store <2 x i64> %2375, ptr addrspace(3) %2373, align 16, !dbg !102 + %2376 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 1024, !dbg !102 + %2377 = insertelement <2 x i64> poison, i64 %2360, i64 0, !dbg !102 + %2378 = insertelement <2 x i64> %2377, i64 %2362, i64 1, !dbg !102 + store <2 x i64> %2378, ptr addrspace(3) %2376, align 16, !dbg !102 + %2379 = getelementptr inbounds nuw i8, ptr addrspace(3) %2370, i32 3072, !dbg !102 + %2380 = insertelement <2 x i64> poison, i64 %2361, i64 0, !dbg !102 + %2381 = insertelement <2 x i64> %2380, i64 %2363, i64 1, !dbg !102 + store <2 x i64> %2381, ptr addrspace(3) %2379, align 16, !dbg !102 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %2382 = shl nuw nsw i32 %14, 11, !dbg !102 + %2383 = and i32 %2382, 12288, !dbg !102 + %2384 = shl nuw nsw i32 %14, 4, !dbg !102 + %2385 = and i32 %2384, 4080, !dbg !102 + %2386 = or disjoint i32 %2383, %2385, !dbg !102 + %2387 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2386, !dbg !102 + %2388 = load i64, ptr addrspace(3) %2387, align 16, !dbg !102 + %2389 = getelementptr inbounds nuw i8, ptr addrspace(3) %2387, i32 8, !dbg !102 + %2390 = load i64, ptr addrspace(3) %2389, align 8, !dbg !102 + %2391 = xor i32 %2386, 32, !dbg !102 + %2392 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2391, !dbg !102 + %2393 = load i64, ptr addrspace(3) %2392, align 16, !dbg !102 + %2394 = getelementptr inbounds nuw i8, ptr addrspace(3) %2392, i32 8, !dbg !102 + %2395 = load i64, ptr addrspace(3) %2394, align 8, !dbg !102 + %2396 = xor i32 %2386, 64, !dbg !102 + %2397 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2396, !dbg !102 + %2398 = load i64, ptr addrspace(3) %2397, align 16, !dbg !102 + %2399 = getelementptr inbounds nuw i8, ptr addrspace(3) %2397, i32 8, !dbg !102 + %2400 = load i64, ptr addrspace(3) %2399, align 8, !dbg !102 + %2401 = xor i32 %2386, 96, !dbg !102 + %2402 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2401, !dbg !102 + %2403 = load i64, ptr addrspace(3) %2402, align 16, !dbg !102 + %2404 = getelementptr inbounds nuw i8, ptr addrspace(3) %2402, i32 8, !dbg !102 + %2405 = load i64, ptr addrspace(3) %2404, align 8, !dbg !102 + %2406 = inttoptr i64 %2388 to ptr addrspace(1), !dbg !102 + %2407 = inttoptr i64 %2390 to ptr addrspace(1), !dbg !102 + %2408 = inttoptr i64 %2393 to ptr addrspace(1), !dbg !102 + %2409 = inttoptr i64 %2395 to ptr addrspace(1), !dbg !102 + %2410 = inttoptr i64 %2398 to ptr addrspace(1), !dbg !102 + %2411 = inttoptr i64 %2400 to ptr addrspace(1), !dbg !102 + %2412 = inttoptr i64 %2403 to ptr addrspace(1), !dbg !102 + %2413 = inttoptr i64 %2405 to ptr addrspace(1), !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2406, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2407, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2408, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2409, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2410, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2411, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2412, i1 %2305) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2413, i1 %2305) #6, !dbg !102 + %2414 = getelementptr i32, ptr addrspace(1) %5, i64 %33, !dbg !103 + %2415 = getelementptr i32, ptr addrspace(1) %5, i64 %37, !dbg !103 + %2416 = extractelement <4 x i32> %2145, i64 0, !dbg !104 + %2417 = extractelement <4 x i32> %2145, i64 1, !dbg !104 + %2418 = extractelement <4 x i32> %2145, i64 2, !dbg !104 + %2419 = extractelement <4 x i32> %2145, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2416, i32 %2417, i32 %2418, i32 %2419, ptr addrspace(1) %2414, i1 %20) #6, !dbg !104 + %2420 = extractelement <4 x i32> %2147, i64 0, !dbg !104 + %2421 = extractelement <4 x i32> %2147, i64 1, !dbg !104 + %2422 = extractelement <4 x i32> %2147, i64 2, !dbg !104 + %2423 = extractelement <4 x i32> %2147, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %2420, i32 %2421, i32 %2422, i32 %2423, ptr addrspace(1) %2415, i1 %21) #6, !dbg !104 + %2424 = extractelement <4 x i32> %2283, i64 0, !dbg !105 + %2425 = add i32 %2424, %2322, !dbg !105 + %2426 = extractelement <4 x i32> %2283, i64 1, !dbg !105 + %2427 = add i32 %2426, %2322, !dbg !105 + %2428 = extractelement <4 x i32> %2283, i64 2, !dbg !105 + %2429 = add i32 %2428, %2322, !dbg !105 + %2430 = extractelement <4 x i32> %2283, i64 3, !dbg !105 + %2431 = add i32 %2430, %2322, !dbg !105 + %2432 = extractelement <4 x i32> %2291, i64 0, !dbg !105 + %2433 = add i32 %2432, %2323, !dbg !105 + %2434 = extractelement <4 x i32> %2291, i64 1, !dbg !105 + %2435 = add i32 %2434, %2323, !dbg !105 + %2436 = extractelement <4 x i32> %2291, i64 2, !dbg !105 + %2437 = add i32 %2436, %2323, !dbg !105 + %2438 = extractelement <4 x i32> %2291, i64 3, !dbg !105 + %2439 = add i32 %2438, %2323, !dbg !105 + %2440 = sext i32 %2425 to i64, !dbg !106 + %2441 = getelementptr i32, ptr addrspace(1) %6, i64 %2440, !dbg !106 + %2442 = sext i32 %2427 to i64, !dbg !106 + %2443 = getelementptr i32, ptr addrspace(1) %6, i64 %2442, !dbg !106 + %2444 = sext i32 %2429 to i64, !dbg !106 + %2445 = getelementptr i32, ptr addrspace(1) %6, i64 %2444, !dbg !106 + %2446 = sext i32 %2431 to i64, !dbg !106 + %2447 = getelementptr i32, ptr addrspace(1) %6, i64 %2446, !dbg !106 + %2448 = sext i32 %2433 to i64, !dbg !106 + %2449 = getelementptr i32, ptr addrspace(1) %6, i64 %2448, !dbg !106 + %2450 = sext i32 %2435 to i64, !dbg !106 + %2451 = getelementptr i32, ptr addrspace(1) %6, i64 %2450, !dbg !106 + %2452 = sext i32 %2437 to i64, !dbg !106 + %2453 = getelementptr i32, ptr addrspace(1) %6, i64 %2452, !dbg !106 + %2454 = sext i32 %2439 to i64, !dbg !106 + %2455 = getelementptr i32, ptr addrspace(1) %6, i64 %2454, !dbg !106 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %2456 = ptrtoint ptr addrspace(1) %2441 to i64, !dbg !107 + %2457 = ptrtoint ptr addrspace(1) %2443 to i64, !dbg !107 + %2458 = ptrtoint ptr addrspace(1) %2445 to i64, !dbg !107 + %2459 = ptrtoint ptr addrspace(1) %2447 to i64, !dbg !107 + %2460 = ptrtoint ptr addrspace(1) %2449 to i64, !dbg !107 + %2461 = ptrtoint ptr addrspace(1) %2451 to i64, !dbg !107 + %2462 = ptrtoint ptr addrspace(1) %2453 to i64, !dbg !107 + %2463 = ptrtoint ptr addrspace(1) %2455 to i64, !dbg !107 + %2464 = insertelement <2 x i64> poison, i64 %2456, i64 0, !dbg !107 + %2465 = insertelement <2 x i64> %2464, i64 %2458, i64 1, !dbg !107 + store <2 x i64> %2465, ptr addrspace(3) %2370, align 16, !dbg !107 + %2466 = insertelement <2 x i64> poison, i64 %2457, i64 0, !dbg !107 + %2467 = insertelement <2 x i64> %2466, i64 %2459, i64 1, !dbg !107 + store <2 x i64> %2467, ptr addrspace(3) %2373, align 16, !dbg !107 + %2468 = insertelement <2 x i64> poison, i64 %2460, i64 0, !dbg !107 + %2469 = insertelement <2 x i64> %2468, i64 %2462, i64 1, !dbg !107 + store <2 x i64> %2469, ptr addrspace(3) %2376, align 16, !dbg !107 + %2470 = insertelement <2 x i64> poison, i64 %2461, i64 0, !dbg !107 + %2471 = insertelement <2 x i64> %2470, i64 %2463, i64 1, !dbg !107 + store <2 x i64> %2471, ptr addrspace(3) %2379, align 16, !dbg !107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %2472 = load i64, ptr addrspace(3) %2387, align 16, !dbg !107 + %2473 = load i64, ptr addrspace(3) %2389, align 8, !dbg !107 + %2474 = load i64, ptr addrspace(3) %2392, align 16, !dbg !107 + %2475 = load i64, ptr addrspace(3) %2394, align 8, !dbg !107 + %2476 = load i64, ptr addrspace(3) %2397, align 16, !dbg !107 + %2477 = load i64, ptr addrspace(3) %2399, align 8, !dbg !107 + %2478 = load i64, ptr addrspace(3) %2402, align 16, !dbg !107 + %2479 = load i64, ptr addrspace(3) %2404, align 8, !dbg !107 + %2480 = inttoptr i64 %2472 to ptr addrspace(1), !dbg !107 + %2481 = inttoptr i64 %2473 to ptr addrspace(1), !dbg !107 + %2482 = inttoptr i64 %2474 to ptr addrspace(1), !dbg !107 + %2483 = inttoptr i64 %2475 to ptr addrspace(1), !dbg !107 + %2484 = inttoptr i64 %2476 to ptr addrspace(1), !dbg !107 + %2485 = inttoptr i64 %2477 to ptr addrspace(1), !dbg !107 + %2486 = inttoptr i64 %2478 to ptr addrspace(1), !dbg !107 + %2487 = inttoptr i64 %2479 to ptr addrspace(1), !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2480, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2481, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2482, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2483, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2484, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2485, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2486, i1 %2305) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %2487, i1 %2305) #6, !dbg !107 + ret void, !dbg !108 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i4 @llvm.ctpop.i4(i4) #5 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 34, column: 37, scope: !9) +!18 = !DILocation(line: 34, column: 30, scope: !9) +!19 = !DILocation(line: 34, column: 45, scope: !9) +!20 = !DILocation(line: 627, column: 44, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !9, file: !22, discriminator: 0) +!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!23 = !DILocation(line: 46, column: 71, scope: !9) +!24 = !DILocation(line: 537, column: 21, scope: !21, inlinedAt: !23) +!25 = !DILocation(line: 601, column: 48, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 599, column: 28, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 47, column: 20, scope: !9) +!28 = !DILocation(line: 0, scope: !9) +!29 = !DILocation(line: 574, column: 22, scope: !21, inlinedAt: !30) +!30 = !DILocation(line: 51, column: 71, scope: !9) +!31 = !DILocation(line: 599, column: 19, scope: !21, inlinedAt: !30) +!32 = !DILocation(line: 600, column: 38, scope: !21, inlinedAt: !30) +!33 = !DILocation(line: 600, column: 46, scope: !21, inlinedAt: !30) +!34 = !DILocation(line: 600, column: 15, scope: !21, inlinedAt: !30) +!35 = !DILocation(line: 601, column: 59, scope: !21, inlinedAt: !30) +!36 = !DILocation(line: 601, column: 22, scope: !21, inlinedAt: !30) +!37 = !DILocation(line: 599, column: 28, scope: !21, inlinedAt: !30) +!38 = !DILocation(line: 601, column: 48, scope: !21, inlinedAt: !30) +!39 = !DILocation(line: 538, column: 40, scope: !21, inlinedAt: !30) +!40 = !DILocation(line: 539, column: 41, scope: !21, inlinedAt: !30) +!41 = !DILocation(line: 551, column: 23, scope: !21, inlinedAt: !30) +!42 = !DILocation(line: 548, column: 23, scope: !21, inlinedAt: !30) +!43 = !DILocation(line: 39, column: 18, scope: !9) +!44 = !DILocation(line: 574, column: 22, scope: !21, inlinedAt: !23) +!45 = !DILocation(line: 599, column: 19, scope: !21, inlinedAt: !23) +!46 = !DILocation(line: 600, column: 38, scope: !21, inlinedAt: !23) +!47 = !DILocation(line: 600, column: 46, scope: !21, inlinedAt: !23) +!48 = !DILocation(line: 600, column: 15, scope: !21, inlinedAt: !23) +!49 = !DILocation(line: 601, column: 59, scope: !21, inlinedAt: !23) +!50 = !DILocation(line: 601, column: 22, scope: !21, inlinedAt: !23) +!51 = !DILocation(line: 538, column: 40, scope: !21, inlinedAt: !23) +!52 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !23) +!53 = distinct !DILexicalBlockFile(scope: !9, file: !54, discriminator: 0) +!54 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!55 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !23) +!56 = !DILocation(line: 539, column: 41, scope: !21, inlinedAt: !23) +!57 = !DILocation(line: 548, column: 23, scope: !21, inlinedAt: !23) +!58 = !DILocation(line: 551, column: 23, scope: !21, inlinedAt: !23) +!59 = !DILocation(line: 54, column: 35, scope: !9) +!60 = !DILocation(line: 591, column: 21, scope: !21, inlinedAt: !23) +!61 = !DILocation(line: 594, column: 40, scope: !21, inlinedAt: !23) +!62 = !DILocation(line: 594, column: 29, scope: !21, inlinedAt: !23) +!63 = !DILocation(line: 594, column: 23, scope: !21, inlinedAt: !23) +!64 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !30) +!65 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !30) +!66 = !DILocation(line: 591, column: 21, scope: !21, inlinedAt: !30) +!67 = !DILocation(line: 594, column: 40, scope: !21, inlinedAt: !30) +!68 = !DILocation(line: 594, column: 29, scope: !21, inlinedAt: !30) +!69 = !DILocation(line: 594, column: 23, scope: !21, inlinedAt: !30) +!70 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !71) +!71 = !DILocation(line: 55, column: 26, scope: !9) +!72 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !71) +!73 = !DILocation(line: 60, column: 21, scope: !9) +!74 = !DILocation(line: 58, column: 35, scope: !9) +!75 = !DILocation(line: 261, column: 15, scope: !53, inlinedAt: !76) +!76 = !DILocation(line: 59, column: 26, scope: !9) +!77 = !DILocation(line: 291, column: 36, scope: !53, inlinedAt: !76) +!78 = !DILocation(line: 61, column: 21, scope: !9) +!79 = !DILocation(line: 64, column: 19, scope: !9) +!80 = !DILocation(line: 66, column: 35, scope: !9) +!81 = !DILocation(line: 68, column: 20, scope: !9) +!82 = !DILocation(line: 69, column: 20, scope: !9) +!83 = !DILocation(line: 70, column: 35, scope: !9) +!84 = !DILocation(line: 71, column: 38, scope: !9) +!85 = !DILocation(line: 71, column: 63, scope: !9) +!86 = !DILocation(line: 75, column: 19, scope: !9) +!87 = !DILocation(line: 76, column: 35, scope: !9) +!88 = !DILocation(line: 77, column: 20, scope: !9) +!89 = !DILocation(line: 78, column: 20, scope: !9) +!90 = !DILocation(line: 79, column: 35, scope: !9) +!91 = !DILocation(line: 80, column: 38, scope: !9) +!92 = !DILocation(line: 80, column: 63, scope: !9) +!93 = !DILocation(line: 81, column: 25, scope: !9) +!94 = !DILocation(line: 81, column: 37, scope: !9) +!95 = !DILocation(line: 82, column: 25, scope: !9) +!96 = !DILocation(line: 82, column: 37, scope: !9) +!97 = !DILocation(line: 83, column: 25, scope: !9) +!98 = !DILocation(line: 83, column: 47, scope: !9) +!99 = !DILocation(line: 84, column: 52, scope: !9) +!100 = !DILocation(line: 84, column: 49, scope: !9) +!101 = !DILocation(line: 84, column: 25, scope: !9) +!102 = !DILocation(line: 84, column: 85, scope: !9) +!103 = !DILocation(line: 85, column: 25, scope: !9) +!104 = !DILocation(line: 85, column: 47, scope: !9) +!105 = !DILocation(line: 86, column: 49, scope: !9) +!106 = !DILocation(line: 86, column: 25, scope: !9) +!107 = !DILocation(line: 86, column: 85, scope: !9) +!108 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d39e7f8496ddfceb7c67b07c49d2aeb307980a04 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,3491 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 256 +{ + .reg .pred %p<652>; + .reg .b16 %rs<97>; + .reg .b32 %r<1554>; + .reg .b64 %rd<138>; + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 + +// %bb.0: + ld.param.b64 %rd25, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:28 + mov.u32 %r44, %ctaid.x; + .loc 1 24 33 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:33 + shl.b32 %r1, %r44, 7; + .loc 1 25 44 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r45, %r2, 2, 6; + and.b32 %r4, %r2, 127; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r5, %r45, %r1; + or.b32 %r6, %r5, 64; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.lt.s32 %p2, %r5, 128; + setp.lt.s32 %p4, %r6, 128; + .loc 1 27 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:27:38 + and.b32 %r7, %r2, 3; + shl.b32 %r24, %r7, 2; + or.b32 %r25, %r24, 1; + or.b32 %r26, %r24, 2; + or.b32 %r27, %r24, 3; + .loc 1 34 40 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:40 + shl.b32 %r46, %r5, 4; + shl.b32 %r47, %r6, 4; + .loc 1 34 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:37 + or.b32 %r48, %r46, %r24; + or.b32 %r49, %r47, %r24; + .loc 1 34 30 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:30 + mad.wide.s32 %rd15, %r48, 8, %rd25; + cvt.s64.s32 %rd26, %r46; + cvt.u64.u32 %rd27, %r24; + or.b64 %rd28, %rd26, %rd27; + shl.b64 %rd29, %rd28, 3; + add.s64 %rd30, %rd25, %rd29; + add.s64 %rd18, %rd30, 16; + mad.wide.s32 %rd21, %r49, 8, %rd25; + cvt.s64.s32 %rd31, %r47; + or.b64 %rd32, %rd31, %rd27; + shl.b64 %rd33, %rd32, 3; + add.s64 %rd34, %rd25, %rd33; + add.s64 %rd24, %rd34, 16; + .loc 1 34 45 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:45 + // begin inline asm + mov.u64 %rd13, 0x0; + mov.u64 %rd14, 0x0; + @%p2 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd16, 0x0; + mov.u64 %rd17, 0x0; + @%p2 ld.global.v2.b64 { %rd16, %rd17 }, [ %rd18 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + mov.u64 %rd20, 0x0; + @%p4 ld.global.v2.b64 { %rd19, %rd20 }, [ %rd21 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + mov.u64 %rd23, 0x0; + @%p4 ld.global.v2.b64 { %rd22, %rd23 }, [ %rd24 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r50, %r2, 1; + shr.u32 %r51, %r2, 1; + bfe.u32 %r52, %r2, 1, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r53, %r50, 1; + xor.b32 %r54, %r52, 1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r55, %r25, %r24; + xor.b32 %r56, %r26, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ne.b32 %p5, %r50, 0; + and.b32 %r57, %r51, 1; + setp.ne.b32 %p6, %r57, 0; +$L__tmp2: + .loc 1 47 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:47:20 + setp.ne.b64 %p7, %rd13, 16384; + setp.eq.b64 %p8, %rd13, 16384; + setp.eq.b64 %p9, %rd14, 16384; + setp.eq.b64 %p10, %rd16, 16384; + setp.ne.b64 %p11, %rd17, 16384; + setp.eq.b64 %p12, %rd17, 16384; + setp.ne.b64 %p13, %rd19, 16384; + setp.eq.b64 %p14, %rd19, 16384; + setp.eq.b64 %p15, %rd20, 16384; + setp.eq.b64 %p16, %rd22, 16384; + setp.ne.b64 %p17, %rd23, 16384; + setp.eq.b64 %p18, %rd23, 16384; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r58, 1, 0, %p8; + selp.b32 %r59, 1, 0, %p9; + selp.b32 %r60, 1, 0, %p10; + selp.b32 %r61, 1, 0, %p12; + selp.b32 %r62, 1, 0, %p14; + selp.b32 %r63, 1, 0, %p15; + selp.b32 %r64, 1, 0, %p16; + selp.b32 %r65, 1, 0, %p18; +$L__tmp3: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p19, %p9, %p7; + and.pred %p20, %p15, %p13; + .loc 2 599 19 // triton_helpers.py:599:19 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p21, %p10, %p11; + or.pred %p22, %p16, %p17; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r66, %r58, %r59; + xor.b32 %r67, %r60, %r61; + xor.b32 %r68, %r62, %r63; + xor.b32 %r69, %r64, %r65; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r70, %r66, 0, %p19; + selp.b32 %r71, %r67, 0, %p21; + selp.b32 %r72, %r68, 0, %p20; + selp.b32 %r73, %r69, 0, %p22; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r74, %r70, %r58; + xor.b32 %r75, %r70, %r59; + xor.b32 %r76, %r71, %r60; + xor.b32 %r77, %r71, %r61; + xor.b32 %r78, %r72, %r62; + xor.b32 %r79, %r72, %r63; + xor.b32 %r80, %r73, %r64; + xor.b32 %r81, %r73, %r65; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r82, %r55, 0, %p19; + selp.b32 %r83, %r56, 0, %p21; + selp.b32 %r84, %r55, 0, %p20; + selp.b32 %r85, %r56, 0, %p22; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r86, %r82, %r24; + xor.b32 %r87, %r82, %r25; + xor.b32 %r88, %r83, %r26; + xor.b32 %r89, %r83, %r27; + xor.b32 %r90, %r84, %r24; + xor.b32 %r91, %r84, %r25; + xor.b32 %r92, %r85, %r26; + xor.b32 %r93, %r85, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.u32 %p23, %r74, %r76; + setp.ne.b32 %p24, %r74, %r76; + setp.le.u32 %p25, %r86, %r88; + or.pred %p26, %p25, %p24; + and.pred %p27, %p23, %p26; + xor.pred %p28, %p27, %p5; + setp.ge.u32 %p29, %r75, %r77; + setp.ne.b32 %p30, %r75, %r77; + setp.le.u32 %p31, %r87, %r89; + or.pred %p32, %p31, %p30; + and.pred %p33, %p29, %p32; + xor.pred %p34, %p33, %p5; + setp.ge.u32 %p35, %r78, %r80; + setp.ne.b32 %p36, %r78, %r80; + setp.le.u32 %p37, %r90, %r92; + or.pred %p38, %p37, %p36; + and.pred %p39, %p35, %p38; + xor.pred %p40, %p39, %p5; + setp.ge.u32 %p41, %r79, %r81; + setp.ne.b32 %p42, %r79, %r81; + setp.le.u32 %p43, %r91, %r93; + or.pred %p44, %p43, %p42; + and.pred %p45, %p41, %p44; + xor.pred %p46, %p45, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r94, %r76, %r74; + xor.b32 %r95, %r77, %r75; + xor.b32 %r96, %r80, %r78; + xor.b32 %r97, %r81, %r79; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r98, 0, %r94, %p28; + selp.b32 %r99, 0, %r95, %p34; + selp.b32 %r100, 0, %r96, %p40; + selp.b32 %r101, 0, %r97, %p46; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r102, %r98, %r74; + xor.b32 %r103, %r99, %r75; + xor.b32 %r104, %r98, %r76; + xor.b32 %r105, %r99, %r77; + xor.b32 %r106, %r100, %r78; + xor.b32 %r107, %r101, %r79; + xor.b32 %r108, %r100, %r80; + xor.b32 %r109, %r101, %r81; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r110, %r88, %r86; + xor.b32 %r111, %r89, %r87; + xor.b32 %r112, %r92, %r90; + xor.b32 %r113, %r93, %r91; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r114, 0, %r110, %p28; + selp.b32 %r115, 0, %r111, %p34; + selp.b32 %r116, 0, %r112, %p40; + selp.b32 %r117, 0, %r113, %p46; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r118, %r114, %r86; + xor.b32 %r119, %r115, %r87; + xor.b32 %r120, %r114, %r88; + xor.b32 %r121, %r115, %r89; + xor.b32 %r122, %r116, %r90; + xor.b32 %r123, %r117, %r91; + xor.b32 %r124, %r116, %r92; + xor.b32 %r125, %r117, %r93; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.u32 %p47, %r102, %r103; + setp.ne.b32 %p48, %r102, %r103; + setp.le.u32 %p49, %r118, %r119; + or.pred %p50, %p48, %p49; + and.pred %p51, %p47, %p50; + xor.pred %p52, %p51, %p5; + setp.ge.u32 %p53, %r104, %r105; + setp.ne.b32 %p54, %r104, %r105; + setp.le.u32 %p55, %r120, %r121; + or.pred %p56, %p54, %p55; + and.pred %p57, %p53, %p56; + xor.pred %p58, %p57, %p5; + setp.ge.u32 %p59, %r106, %r107; + setp.ne.b32 %p60, %r106, %r107; + setp.le.u32 %p61, %r122, %r123; + or.pred %p62, %p60, %p61; + and.pred %p63, %p59, %p62; + xor.pred %p64, %p63, %p5; + setp.ge.u32 %p65, %r108, %r109; + setp.ne.b32 %p66, %r108, %r109; + setp.le.u32 %p67, %r124, %r125; + or.pred %p68, %p66, %p67; + and.pred %p69, %p65, %p68; + xor.pred %p70, %p69, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r126, %r102, %r103; + xor.b32 %r127, %r104, %r105; + xor.b32 %r128, %r106, %r107; + xor.b32 %r129, %r108, %r109; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r130, 0, %r126, %p52; + selp.b32 %r131, 0, %r127, %p58; + selp.b32 %r132, 0, %r128, %p64; + selp.b32 %r133, 0, %r129, %p70; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r134, %r130, %r102; + xor.b32 %r135, %r130, %r103; + xor.b32 %r136, %r131, %r104; + xor.b32 %r137, %r131, %r105; + xor.b32 %r138, %r132, %r106; + xor.b32 %r139, %r132, %r107; + xor.b32 %r140, %r133, %r108; + xor.b32 %r141, %r133, %r109; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r142, %r118, %r119; + xor.b32 %r143, %r120, %r121; + xor.b32 %r144, %r122, %r123; + xor.b32 %r145, %r124, %r125; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r146, 0, %r142, %p52; + selp.b32 %r147, 0, %r143, %p58; + selp.b32 %r148, 0, %r144, %p64; + selp.b32 %r149, 0, %r145, %p70; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r150, %r134, %r53; + mul.lo.s32 %r151, %r135, %r53; + mul.lo.s32 %r152, %r136, %r53; + mul.lo.s32 %r153, %r137, %r53; + mul.lo.s32 %r154, %r138, %r53; + mul.lo.s32 %r155, %r139, %r53; + mul.lo.s32 %r156, %r140, %r53; + mul.lo.s32 %r157, %r141, %r53; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r158, %r134, %r50; + mul.lo.s32 %r159, %r135, %r50; + mul.lo.s32 %r160, %r136, %r50; + mul.lo.s32 %r161, %r137, %r50; + mul.lo.s32 %r162, %r138, %r50; + mul.lo.s32 %r163, %r139, %r50; + mul.lo.s32 %r164, %r140, %r50; + mul.lo.s32 %r165, %r141, %r50; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r166, %r147, %r121; + xor.b32 %r167, %r147, %r120; + xor.b32 %r168, %r146, %r119; + xor.b32 %r169, %r146, %r118; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r170, %r169, %r53; + mul.lo.s32 %r171, %r168, %r53; + mul.lo.s32 %r172, %r167, %r53; + mul.lo.s32 %r173, %r166, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r174, %r169, %r50; + mul.lo.s32 %r175, %r168, %r50; + mul.lo.s32 %r176, %r167, %r50; + mul.lo.s32 %r177, %r166, %r50; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r178, %r149, %r125; + xor.b32 %r179, %r149, %r124; + xor.b32 %r180, %r148, %r123; + xor.b32 %r181, %r148, %r122; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r182, %r181, %r53; + mul.lo.s32 %r183, %r180, %r53; + mul.lo.s32 %r184, %r179, %r53; + mul.lo.s32 %r185, %r178, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r186, %r181, %r50; + mul.lo.s32 %r187, %r180, %r50; + mul.lo.s32 %r188, %r179, %r50; + mul.lo.s32 %r189, %r178, %r50; +$L__tmp4: + .loc 1 39 18 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:39:18 + add.s64 %rd35, %rd14, -1; + add.s64 %rd36, %rd16, -1; + add.s64 %rd37, %rd13, -1; + add.s64 %rd38, %rd17, -1; + setp.gt.u64 %p71, %rd38, 16382; + setp.gt.u64 %p72, %rd37, 16382; + setp.lt.u64 %p73, %rd38, 16383; + setp.lt.u64 %p74, %rd36, 16383; + setp.lt.u64 %p75, %rd35, 16383; + setp.lt.u64 %p76, %rd37, 16383; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r190, 1, 0, %p76; + selp.b32 %r191, 1, 0, %p75; + selp.b32 %r192, 1, 0, %p74; + selp.b32 %r193, 1, 0, %p73; +$L__tmp5: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p77, %p75, %p72; + .loc 2 599 19 // triton_helpers.py:599:19 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p78, %p74, %p71; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r194, %r190, %r191; + xor.b32 %r195, %r192, %r193; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r196, %r194, 0, %p77; + selp.b32 %r197, %r195, 0, %p78; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r198, %r196, %r190; + xor.b32 %r199, %r196, %r191; + xor.b32 %r200, %r197, %r192; + xor.b32 %r201, %r197, %r193; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r202, %r55, 0, %p77; + selp.b32 %r203, %r56, 0, %p78; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r204, %r202, %r24; + xor.b32 %r205, %r202, %r25; + xor.b32 %r206, %r203, %r26; + xor.b32 %r207, %r203, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p79, %r198, %r200; + setp.ge.u32 %p80, %r199, %r201; + setp.ne.b32 %p81, %r199, %r201; + setp.ne.b32 %p82, %r200, %r198; + setp.le.u32 %p83, %r205, %r207; + setp.le.u32 %p84, %r204, %r206; + or.pred %p85, %p84, %p82; + or.pred %p86, %p83, %p81; + and.pred %p87, %p80, %p86; + and.pred %p88, %p79, %p85; + xor.pred %p89, %p88, %p5; + xor.pred %p90, %p87, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r208, %r198, %r200; + xor.b32 %r209, %r201, %r199; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r210, 0, %r209, %p90; + selp.b32 %r211, 0, %r208, %p89; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r212, %r211, %r198; + xor.b32 %r213, %r210, %r199; + xor.b32 %r214, %r210, %r201; + xor.b32 %r215, %r211, %r200; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r216, %r206, %r204; + xor.b32 %r217, %r207, %r205; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r218, 0, %r216, %p89; + selp.b32 %r219, 0, %r217, %p90; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r220, %r218, %r204; + xor.b32 %r221, %r219, %r205; + xor.b32 %r222, %r218, %r206; + xor.b32 %r223, %r219, %r207; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p91, %r212, %r213; + setp.ne.b32 %p92, %r212, %r213; + setp.le.u32 %p93, %r220, %r221; + or.pred %p94, %p92, %p93; + and.pred %p95, %p91, %p94; + xor.pred %p96, %p95, %p5; + setp.ge.u32 %p97, %r215, %r214; + setp.ne.b32 %p98, %r215, %r214; + setp.le.u32 %p99, %r222, %r223; + or.pred %p100, %p98, %p99; + and.pred %p101, %p97, %p100; + xor.pred %p102, %p101, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r224, %r212, %r213; + xor.b32 %r225, %r215, %r214; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r226, 0, %r224, %p96; + selp.b32 %r227, 0, %r225, %p102; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r228, %r226, %r212; + xor.b32 %r229, %r226, %r213; + xor.b32 %r230, %r227, %r215; + xor.b32 %r231, %r227, %r214; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r232, %r220, %r221; + xor.b32 %r233, %r222, %r223; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r234, 0, %r232, %p96; + selp.b32 %r235, 0, %r233, %p102; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r236, %r234, %r220; + xor.b32 %r237, %r234, %r221; + xor.b32 %r238, %r235, %r222; + xor.b32 %r239, %r235, %r223; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r240, %r228, %r53; + mul.lo.s32 %r241, %r229, %r53; + mul.lo.s32 %r242, %r230, %r53; + mul.lo.s32 %r243, %r231, %r53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r244, %r240, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r245, %r240, %r244; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r246, %r241, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r247, %r241, %r246; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r248, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r249, %r242, %r248; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r250, %r243, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r251, %r243, %r250; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r252, %r228, %r50; + mul.lo.s32 %r253, %r229, %r50; + mul.lo.s32 %r254, %r230, %r50; + mul.lo.s32 %r255, %r231, %r50; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r256, %r236, %r53; + mul.lo.s32 %r257, %r237, %r53; + mul.lo.s32 %r258, %r238, %r53; + mul.lo.s32 %r259, %r239, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r260, %r236, %r50; + mul.lo.s32 %r261, %r237, %r50; + mul.lo.s32 %r262, %r238, %r50; + mul.lo.s32 %r263, %r239, %r50; +$L__tmp6: + .loc 1 54 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:54:35 + and.pred %p103, %p2, %p74; + selp.b16 %rs1, 1, 0, %p103; + shl.b16 %rs2, %rs1, 2; + and.pred %p104, %p2, %p73; + selp.b16 %rs3, -1, 0, %p104; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + and.pred %p105, %p2, %p76; + selp.b16 %rs6, 1, 0, %p105; + and.pred %p106, %p2, %p75; + selp.b16 %rs7, -1, 0, %p106; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; + .loc 1 39 18 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:39:18 + add.s64 %rd39, %rd20, -1; + add.s64 %rd40, %rd22, -1; + add.s64 %rd41, %rd19, -1; + add.s64 %rd42, %rd23, -1; + setp.gt.u64 %p107, %rd42, 16382; + setp.gt.u64 %p108, %rd41, 16382; + setp.lt.u64 %p109, %rd42, 16383; + setp.lt.u64 %p110, %rd40, 16383; + setp.lt.u64 %p111, %rd39, 16383; + setp.lt.u64 %p112, %rd41, 16383; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r264, 1, 0, %p112; + selp.b32 %r265, 1, 0, %p111; + selp.b32 %r266, 1, 0, %p110; + selp.b32 %r267, 1, 0, %p109; +$L__tmp7: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p113, %p111, %p108; + .loc 2 599 19 // triton_helpers.py:599:19 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p114, %p110, %p107; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r268, %r264, %r265; + xor.b32 %r269, %r266, %r267; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r270, %r268, 0, %p113; + selp.b32 %r271, %r269, 0, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r272, %r270, %r264; + xor.b32 %r273, %r270, %r265; + xor.b32 %r274, %r271, %r266; + xor.b32 %r275, %r271, %r267; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r276, %r55, 0, %p113; + selp.b32 %r277, %r56, 0, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r278, %r276, %r24; + xor.b32 %r279, %r276, %r25; + xor.b32 %r280, %r277, %r26; + xor.b32 %r281, %r277, %r27; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p115, %r272, %r274; + setp.ge.u32 %p116, %r273, %r275; + setp.ne.b32 %p117, %r273, %r275; + setp.ne.b32 %p118, %r274, %r272; + setp.le.u32 %p119, %r279, %r281; + setp.le.u32 %p120, %r278, %r280; + or.pred %p121, %p120, %p118; + or.pred %p122, %p119, %p117; + and.pred %p123, %p116, %p122; + and.pred %p124, %p115, %p121; + xor.pred %p125, %p124, %p5; + xor.pred %p126, %p123, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r282, %r272, %r274; + xor.b32 %r283, %r275, %r273; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r284, 0, %r283, %p126; + selp.b32 %r285, 0, %r282, %p125; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r286, %r285, %r272; + xor.b32 %r287, %r284, %r273; + xor.b32 %r288, %r284, %r275; + xor.b32 %r289, %r285, %r274; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r290, %r280, %r278; + xor.b32 %r291, %r281, %r279; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r292, 0, %r290, %p125; + selp.b32 %r293, 0, %r291, %p126; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r294, %r292, %r278; + xor.b32 %r295, %r293, %r279; + xor.b32 %r296, %r292, %r280; + xor.b32 %r297, %r293, %r281; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p127, %r286, %r287; + setp.ne.b32 %p128, %r286, %r287; + setp.le.u32 %p129, %r294, %r295; + or.pred %p130, %p128, %p129; + and.pred %p131, %p127, %p130; + xor.pred %p132, %p131, %p5; + setp.ge.u32 %p133, %r289, %r288; + setp.ne.b32 %p134, %r289, %r288; + setp.le.u32 %p135, %r296, %r297; + or.pred %p136, %p134, %p135; + and.pred %p137, %p133, %p136; + xor.pred %p138, %p137, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r298, %r286, %r287; + xor.b32 %r299, %r289, %r288; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r300, 0, %r298, %p132; + selp.b32 %r301, 0, %r299, %p138; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r302, %r300, %r286; + xor.b32 %r303, %r300, %r287; + xor.b32 %r304, %r301, %r289; + xor.b32 %r305, %r301, %r288; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r306, %r294, %r295; + xor.b32 %r307, %r296, %r297; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r308, 0, %r306, %p132; + selp.b32 %r309, 0, %r307, %p138; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r310, %r308, %r294; + xor.b32 %r311, %r308, %r295; + xor.b32 %r312, %r309, %r296; + xor.b32 %r313, %r309, %r297; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r314, %r302, %r53; + mul.lo.s32 %r315, %r303, %r53; + mul.lo.s32 %r316, %r304, %r53; + mul.lo.s32 %r317, %r305, %r53; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r318, %r302, %r50; + mul.lo.s32 %r319, %r303, %r50; + mul.lo.s32 %r320, %r304, %r50; + mul.lo.s32 %r321, %r305, %r50; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r322, %r310, %r53; + mul.lo.s32 %r323, %r311, %r53; + mul.lo.s32 %r324, %r312, %r53; + mul.lo.s32 %r325, %r313, %r53; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r326, %r310, %r50; + mul.lo.s32 %r327, %r311, %r50; + mul.lo.s32 %r328, %r312, %r50; + mul.lo.s32 %r329, %r313, %r50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r330, %r314, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r331, %r314, %r330; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r332, %r315, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r333, %r315, %r332; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r334, %r316, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r335, %r316, %r334; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r336, %r317, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r337, %r317, %r336; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r338, %r252, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r339, %r252, %r338; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r340, %r253, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r341, %r253, %r340; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r342, %r254, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r343, %r254, %r342; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r344, %r255, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r345, %r255, %r344; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r346, %r318, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r347, %r318, %r346; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r348, %r319, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r349, %r319, %r348; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r350, %r320, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r351, %r320, %r350; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r352, %r321, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r353, %r321, %r352; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r354, %r256, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r355, %r256, %r354; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r356, %r257, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r357, %r257, %r356; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r358, %r258, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r359, %r358, %r258; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r360, %r259, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r361, %r360, %r259; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r362, %r322, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r363, %r362, %r322; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r364, %r323, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r365, %r364, %r323; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r366, %r324, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r367, %r366, %r324; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r368, %r325, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r369, %r368, %r325; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r370, %r260, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r371, %r370, %r260; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r372, %r261, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r373, %r372, %r261; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r374, %r262, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r375, %r374, %r262; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r376, %r263, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r377, %r376, %r263; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r378, %r326, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r379, %r378, %r326; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r380, %r327, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r381, %r380, %r327; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r382, %r328, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r383, %r382, %r328; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r384, %r329, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r385, %r384, %r329; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p139, %r245, %r339; + setp.ne.b32 %p140, %r245, %r339; + setp.le.s32 %p141, %r355, %r371; + or.pred %p142, %p140, %p141; + and.pred %p143, %p139, %p142; + xor.pred %p144, %p143, %p6; + setp.ge.s32 %p145, %r247, %r341; + setp.ne.b32 %p146, %r247, %r341; + setp.le.s32 %p147, %r357, %r373; + or.pred %p148, %p146, %p147; + and.pred %p149, %p145, %p148; + xor.pred %p150, %p149, %p6; + setp.ge.s32 %p151, %r249, %r343; + setp.ne.b32 %p152, %r249, %r343; + setp.le.s32 %p153, %r359, %r375; + or.pred %p154, %p152, %p153; + and.pred %p155, %p151, %p154; + xor.pred %p156, %p155, %p6; + setp.ge.s32 %p157, %r251, %r345; + setp.ne.b32 %p158, %r251, %r345; + setp.le.s32 %p159, %r361, %r377; + or.pred %p160, %p158, %p159; + and.pred %p161, %p157, %p160; + xor.pred %p162, %p161, %p6; + setp.ge.s32 %p163, %r331, %r347; + setp.ne.b32 %p164, %r331, %r347; + setp.le.s32 %p165, %r363, %r379; + or.pred %p166, %p164, %p165; + and.pred %p167, %p163, %p166; + xor.pred %p168, %p167, %p6; + setp.ge.s32 %p169, %r333, %r349; + setp.ne.b32 %p170, %r333, %r349; + setp.le.s32 %p171, %r365, %r381; + or.pred %p172, %p170, %p171; + and.pred %p173, %p169, %p172; + xor.pred %p174, %p173, %p6; + setp.ge.s32 %p175, %r335, %r351; + setp.ne.b32 %p176, %r335, %r351; + setp.le.s32 %p177, %r367, %r383; + or.pred %p178, %p176, %p177; + and.pred %p179, %p175, %p178; + xor.pred %p180, %p179, %p6; + setp.ge.s32 %p181, %r337, %r353; + setp.ne.b32 %p182, %r337, %r353; + setp.le.s32 %p183, %r369, %r385; + or.pred %p184, %p182, %p183; + and.pred %p185, %p181, %p184; + xor.pred %p186, %p185, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r386, %r245, %r339; + xor.b32 %r387, %r247, %r341; + xor.b32 %r388, %r249, %r343; + xor.b32 %r389, %r251, %r345; + xor.b32 %r390, %r331, %r347; + xor.b32 %r391, %r333, %r349; + xor.b32 %r392, %r335, %r351; + xor.b32 %r393, %r337, %r353; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r394, 0, %r386, %p144; + selp.b32 %r395, 0, %r387, %p150; + selp.b32 %r396, 0, %r388, %p156; + selp.b32 %r397, 0, %r389, %p162; + selp.b32 %r398, 0, %r390, %p168; + selp.b32 %r399, 0, %r391, %p174; + selp.b32 %r400, 0, %r392, %p180; + selp.b32 %r401, 0, %r393, %p186; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r402, %r394, %r228; + xor.b32 %r403, %r395, %r229; + xor.b32 %r404, %r396, %r230; + xor.b32 %r405, %r397, %r231; + xor.b32 %r406, %r398, %r302; + xor.b32 %r407, %r399, %r303; + xor.b32 %r408, %r400, %r304; + xor.b32 %r409, %r401, %r305; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r410, %r371, %r355; + xor.b32 %r411, %r373, %r357; + xor.b32 %r412, %r375, %r359; + xor.b32 %r413, %r377, %r361; + xor.b32 %r414, %r379, %r363; + xor.b32 %r415, %r381, %r365; + xor.b32 %r416, %r383, %r367; + xor.b32 %r417, %r385, %r369; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r418, 0, %r410, %p144; + selp.b32 %r419, 0, %r411, %p150; + selp.b32 %r420, 0, %r412, %p156; + selp.b32 %r421, 0, %r413, %p162; + selp.b32 %r422, 0, %r414, %p168; + selp.b32 %r423, 0, %r415, %p174; + selp.b32 %r424, 0, %r416, %p180; + selp.b32 %r425, 0, %r417, %p186; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r426, %r418, %r236; + xor.b32 %r427, %r419, %r237; + xor.b32 %r428, %r420, %r238; + xor.b32 %r429, %r421, %r239; + xor.b32 %r430, %r422, %r310; + xor.b32 %r431, %r423, %r311; + xor.b32 %r432, %r424, %r312; + xor.b32 %r433, %r425, %r313; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p187, %r402, %r404; + setp.ne.b32 %p188, %r402, %r404; + setp.le.s32 %p189, %r426, %r428; + or.pred %p190, %p188, %p189; + and.pred %p191, %p187, %p190; + xor.pred %p192, %p191, %p6; + setp.ge.s32 %p193, %r403, %r405; + setp.ne.b32 %p194, %r403, %r405; + setp.le.s32 %p195, %r427, %r429; + or.pred %p196, %p194, %p195; + and.pred %p197, %p193, %p196; + xor.pred %p198, %p197, %p6; + setp.ge.s32 %p199, %r406, %r408; + setp.ne.b32 %p200, %r406, %r408; + setp.le.s32 %p201, %r430, %r432; + or.pred %p202, %p200, %p201; + and.pred %p203, %p199, %p202; + xor.pred %p204, %p203, %p6; + setp.ge.s32 %p205, %r407, %r409; + setp.ne.b32 %p206, %r407, %r409; + setp.le.s32 %p207, %r431, %r433; + or.pred %p208, %p206, %p207; + and.pred %p209, %p205, %p208; + xor.pred %p210, %p209, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r434, %r404, %r402; + xor.b32 %r435, %r405, %r403; + xor.b32 %r436, %r408, %r406; + xor.b32 %r437, %r409, %r407; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r438, 0, %r434, %p192; + selp.b32 %r439, 0, %r435, %p198; + selp.b32 %r440, 0, %r436, %p204; + selp.b32 %r441, 0, %r437, %p210; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r442, %r438, %r402; + xor.b32 %r443, %r439, %r403; + xor.b32 %r444, %r438, %r404; + xor.b32 %r445, %r439, %r405; + xor.b32 %r446, %r440, %r406; + xor.b32 %r447, %r441, %r407; + xor.b32 %r448, %r440, %r408; + xor.b32 %r449, %r441, %r409; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r450, %r428, %r426; + xor.b32 %r451, %r429, %r427; + xor.b32 %r452, %r432, %r430; + xor.b32 %r453, %r433, %r431; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r454, 0, %r450, %p192; + selp.b32 %r455, 0, %r451, %p198; + selp.b32 %r456, 0, %r452, %p204; + selp.b32 %r457, 0, %r453, %p210; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r458, %r454, %r426; + xor.b32 %r459, %r455, %r427; + xor.b32 %r460, %r454, %r428; + xor.b32 %r461, %r455, %r429; + xor.b32 %r462, %r456, %r430; + xor.b32 %r463, %r457, %r431; + xor.b32 %r464, %r456, %r432; + xor.b32 %r465, %r457, %r433; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p211, %r442, %r443; + setp.ne.b32 %p212, %r442, %r443; + setp.le.s32 %p213, %r458, %r459; + or.pred %p214, %p212, %p213; + and.pred %p215, %p211, %p214; + xor.pred %p216, %p215, %p6; + setp.ge.s32 %p217, %r444, %r445; + setp.ne.b32 %p218, %r444, %r445; + setp.le.s32 %p219, %r460, %r461; + or.pred %p220, %p218, %p219; + and.pred %p221, %p217, %p220; + xor.pred %p222, %p221, %p6; + setp.ge.s32 %p223, %r446, %r447; + setp.ne.b32 %p224, %r446, %r447; + setp.le.s32 %p225, %r462, %r463; + or.pred %p226, %p224, %p225; + and.pred %p227, %p223, %p226; + xor.pred %p228, %p227, %p6; + setp.ge.s32 %p229, %r448, %r449; + setp.ne.b32 %p230, %r448, %r449; + setp.le.s32 %p231, %r464, %r465; + or.pred %p232, %p230, %p231; + and.pred %p233, %p229, %p232; + xor.pred %p234, %p233, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r466, %r443, %r442; + xor.b32 %r467, %r445, %r444; + xor.b32 %r468, %r447, %r446; + xor.b32 %r469, %r449, %r448; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r470, 0, %r466, %p216; + selp.b32 %r471, 0, %r467, %p222; + selp.b32 %r472, 0, %r468, %p228; + selp.b32 %r473, 0, %r469, %p234; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r474, %r470, %r442; + xor.b32 %r475, %r470, %r443; + xor.b32 %r476, %r471, %r444; + xor.b32 %r477, %r471, %r445; + xor.b32 %r478, %r472, %r446; + xor.b32 %r479, %r472, %r447; + xor.b32 %r480, %r473, %r448; + xor.b32 %r481, %r473, %r449; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r482, %r459, %r458; + xor.b32 %r483, %r461, %r460; + xor.b32 %r484, %r463, %r462; + xor.b32 %r485, %r465, %r464; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r486, 0, %r482, %p216; + selp.b32 %r487, 0, %r483, %p222; + selp.b32 %r488, 0, %r484, %p228; + selp.b32 %r489, 0, %r485, %p234; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r490, %r486, %r458; + xor.b32 %r491, %r486, %r459; + xor.b32 %r492, %r487, %r460; + xor.b32 %r493, %r487, %r461; + xor.b32 %r494, %r488, %r462; + xor.b32 %r495, %r488, %r463; + xor.b32 %r496, %r489, %r464; + xor.b32 %r497, %r489, %r465; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r498, %r474, %r54; + mul.lo.s32 %r499, %r475, %r54; + mul.lo.s32 %r500, %r476, %r54; + mul.lo.s32 %r501, %r477, %r54; + mul.lo.s32 %r502, %r478, %r54; + mul.lo.s32 %r503, %r479, %r54; + mul.lo.s32 %r504, %r480, %r54; + mul.lo.s32 %r505, %r481, %r54; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r506, %r498, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r507, %r498, %r506; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r508, %r499, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r509, %r499, %r508; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r510, %r500, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r511, %r500, %r510; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r512, %r501, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r513, %r501, %r512; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r514, %r502, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r515, %r502, %r514; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r516, %r503, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r517, %r503, %r516; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r518, %r504, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r519, %r504, %r518; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r520, %r505, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r521, %r505, %r520; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r522, %r474, %r52; + mul.lo.s32 %r523, %r475, %r52; + mul.lo.s32 %r524, %r476, %r52; + mul.lo.s32 %r525, %r477, %r52; + mul.lo.s32 %r526, %r478, %r52; + mul.lo.s32 %r527, %r479, %r52; + mul.lo.s32 %r528, %r480, %r52; + mul.lo.s32 %r529, %r481, %r52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r530, %r522, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r531, %r522, %r530; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r532, %r523, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r533, %r523, %r532; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r534, %r524, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r535, %r524, %r534; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r536, %r525, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r537, %r525, %r536; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r538, %r526, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r539, %r526, %r538; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r540, %r527, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r541, %r527, %r540; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r542, %r528, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r543, %r528, %r542; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r544, %r529, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r545, %r529, %r544; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r546, %r490, %r54; + mul.lo.s32 %r547, %r491, %r54; + mul.lo.s32 %r548, %r492, %r54; + mul.lo.s32 %r549, %r493, %r54; + mul.lo.s32 %r550, %r494, %r54; + mul.lo.s32 %r551, %r495, %r54; + mul.lo.s32 %r552, %r496, %r54; + mul.lo.s32 %r553, %r497, %r54; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r554, %r546, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r555, %r554, %r546; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r556, %r547, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r557, %r556, %r547; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r558, %r548, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r559, %r558, %r548; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r560, %r549, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r561, %r560, %r549; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r562, %r550, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r563, %r562, %r550; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r564, %r551, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r565, %r564, %r551; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r566, %r552, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r567, %r566, %r552; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r568, %r553, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r569, %r568, %r553; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r570, %r490, %r52; + mul.lo.s32 %r571, %r491, %r52; + mul.lo.s32 %r572, %r492, %r52; + mul.lo.s32 %r573, %r493, %r52; + mul.lo.s32 %r574, %r494, %r52; + mul.lo.s32 %r575, %r495, %r52; + mul.lo.s32 %r576, %r496, %r52; + mul.lo.s32 %r577, %r497, %r52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r578, %r570, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r579, %r578, %r570; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r580, %r571, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r581, %r580, %r571; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r582, %r572, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r583, %r582, %r572; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r584, %r573, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r585, %r584, %r573; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r586, %r574, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r587, %r586, %r574; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r588, %r575, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r589, %r588, %r575; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r590, %r576, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r591, %r590, %r576; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r592, %r577, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r593, %r592, %r577; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p235, %r507, %r531; + setp.lt.s32 %p236, %r509, %r533; + setp.lt.s32 %p237, %r511, %r535; + setp.lt.s32 %p238, %r513, %r537; + setp.lt.s32 %p239, %r515, %r539; + setp.lt.s32 %p240, %r517, %r541; + setp.lt.s32 %p241, %r519, %r543; + setp.lt.s32 %p242, %r521, %r545; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p243, %r507, %r531; + setp.eq.b32 %p244, %r509, %r533; + setp.eq.b32 %p245, %r511, %r535; + setp.eq.b32 %p246, %r513, %r537; + setp.eq.b32 %p247, %r515, %r539; + setp.eq.b32 %p248, %r517, %r541; + setp.eq.b32 %p249, %r519, %r543; + setp.eq.b32 %p250, %r521, %r545; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p251, %r555, %r579; + setp.gt.s32 %p252, %r557, %r581; + setp.gt.s32 %p253, %r559, %r583; + setp.gt.s32 %p254, %r561, %r585; + setp.gt.s32 %p255, %r563, %r587; + setp.gt.s32 %p256, %r565, %r589; + setp.gt.s32 %p257, %r567, %r591; + setp.gt.s32 %p258, %r569, %r593; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p259, %p243, %p251; + and.pred %p260, %p244, %p252; + and.pred %p261, %p245, %p253; + and.pred %p262, %p246, %p254; + and.pred %p263, %p247, %p255; + and.pred %p264, %p248, %p256; + and.pred %p265, %p249, %p257; + and.pred %p266, %p250, %p258; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p267, %p235, %p259; + or.pred %p268, %p236, %p260; + or.pred %p269, %p237, %p261; + or.pred %p270, %p238, %p262; + or.pred %p271, %p239, %p263; + or.pred %p272, %p240, %p264; + or.pred %p273, %p241, %p265; + or.pred %p274, %p242, %p266; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r594, %r507, %r531; + xor.b32 %r595, %r509, %r533; + xor.b32 %r596, %r511, %r535; + xor.b32 %r597, %r513, %r537; + xor.b32 %r598, %r515, %r539; + xor.b32 %r599, %r517, %r541; + xor.b32 %r600, %r519, %r543; + xor.b32 %r601, %r521, %r545; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r602, %r594, 0, %p267; + selp.b32 %r603, %r595, 0, %p268; + selp.b32 %r604, %r596, 0, %p269; + selp.b32 %r605, %r597, 0, %p270; + selp.b32 %r606, %r598, 0, %p271; + selp.b32 %r607, %r599, 0, %p272; + selp.b32 %r608, %r600, 0, %p273; + selp.b32 %r609, %r601, 0, %p274; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r610, %r602, %r474; + xor.b32 %r611, %r603, %r475; + xor.b32 %r612, %r604, %r476; + xor.b32 %r613, %r605, %r477; + xor.b32 %r614, %r606, %r478; + xor.b32 %r615, %r607, %r479; + xor.b32 %r616, %r608, %r480; + xor.b32 %r617, %r609, %r481; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r618, %r579, %r555; + xor.b32 %r619, %r581, %r557; + xor.b32 %r620, %r583, %r559; + xor.b32 %r621, %r585, %r561; + xor.b32 %r622, %r587, %r563; + xor.b32 %r623, %r589, %r565; + xor.b32 %r624, %r591, %r567; + xor.b32 %r625, %r593, %r569; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r626, %r618, 0, %p267; + selp.b32 %r627, %r619, 0, %p268; + selp.b32 %r628, %r620, 0, %p269; + selp.b32 %r629, %r621, 0, %p270; + selp.b32 %r630, %r622, 0, %p271; + selp.b32 %r631, %r623, 0, %p272; + selp.b32 %r632, %r624, 0, %p273; + selp.b32 %r633, %r625, 0, %p274; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r634, %r626, %r490; + xor.b32 %r635, %r627, %r491; + xor.b32 %r636, %r628, %r492; + xor.b32 %r637, %r629, %r493; + xor.b32 %r638, %r630, %r494; + xor.b32 %r639, %r631, %r495; + xor.b32 %r640, %r632, %r496; + xor.b32 %r641, %r633, %r497; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r642, %r610, %r53; + mul.lo.s32 %r643, %r611, %r53; + mul.lo.s32 %r644, %r612, %r53; + mul.lo.s32 %r645, %r613, %r53; + mul.lo.s32 %r646, %r614, %r53; + mul.lo.s32 %r647, %r615, %r53; + mul.lo.s32 %r648, %r616, %r53; + mul.lo.s32 %r649, %r617, %r53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r650, %r642, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r651, %r650, %r642; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r652, %r643, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r653, %r652, %r643; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r654, %r644, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r655, %r654, %r644; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r656, %r645, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r657, %r656, %r645; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r658, %r646, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r659, %r658, %r646; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r660, %r647, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r661, %r660, %r647; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r662, %r648, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r663, %r662, %r648; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r664, %r649, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r665, %r664, %r649; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r666, %r610, %r50; + mul.lo.s32 %r667, %r611, %r50; + mul.lo.s32 %r668, %r612, %r50; + mul.lo.s32 %r669, %r613, %r50; + mul.lo.s32 %r670, %r614, %r50; + mul.lo.s32 %r671, %r615, %r50; + mul.lo.s32 %r672, %r616, %r50; + mul.lo.s32 %r673, %r617, %r50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r674, %r666, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r675, %r674, %r666; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r676, %r667, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r677, %r676, %r667; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r678, %r668, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r679, %r678, %r668; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r680, %r669, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r681, %r680, %r669; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r682, %r670, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r683, %r682, %r670; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r684, %r671, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r685, %r684, %r671; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r686, %r672, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r687, %r686, %r672; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r688, %r673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r689, %r688, %r673; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r690, %r634, %r53; + mul.lo.s32 %r691, %r635, %r53; + mul.lo.s32 %r692, %r636, %r53; + mul.lo.s32 %r693, %r637, %r53; + mul.lo.s32 %r694, %r638, %r53; + mul.lo.s32 %r695, %r639, %r53; + mul.lo.s32 %r696, %r640, %r53; + mul.lo.s32 %r697, %r641, %r53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r698, %r690, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r699, %r698, %r690; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r700, %r691, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r701, %r700, %r691; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r702, %r692, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r703, %r702, %r692; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r704, %r693, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r705, %r704, %r693; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r706, %r694, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r707, %r706, %r694; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r708, %r695, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r709, %r708, %r695; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r710, %r696, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r711, %r710, %r696; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r712, %r697, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r713, %r712, %r697; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r714, %r634, %r50; + mul.lo.s32 %r715, %r635, %r50; + mul.lo.s32 %r716, %r636, %r50; + mul.lo.s32 %r717, %r637, %r50; + mul.lo.s32 %r718, %r638, %r50; + mul.lo.s32 %r719, %r639, %r50; + mul.lo.s32 %r720, %r640, %r50; + mul.lo.s32 %r721, %r641, %r50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r722, %r714, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r723, %r722, %r714; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r724, %r715, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r725, %r724, %r715; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r726, %r716, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r727, %r726, %r716; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r728, %r717, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r729, %r728, %r717; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r730, %r718, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r731, %r730, %r718; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r732, %r719, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r733, %r732, %r719; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r734, %r720, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r735, %r734, %r720; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r736, %r721, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r737, %r736, %r721; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p275, %r651, %r675; + setp.lt.s32 %p276, %r653, %r677; + setp.lt.s32 %p277, %r655, %r679; + setp.lt.s32 %p278, %r657, %r681; + setp.lt.s32 %p279, %r659, %r683; + setp.lt.s32 %p280, %r661, %r685; + setp.lt.s32 %p281, %r663, %r687; + setp.lt.s32 %p282, %r665, %r689; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p283, %r651, %r675; + setp.eq.b32 %p284, %r653, %r677; + setp.eq.b32 %p285, %r655, %r679; + setp.eq.b32 %p286, %r657, %r681; + setp.eq.b32 %p287, %r659, %r683; + setp.eq.b32 %p288, %r661, %r685; + setp.eq.b32 %p289, %r663, %r687; + setp.eq.b32 %p290, %r665, %r689; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p291, %r699, %r723; + setp.gt.s32 %p292, %r701, %r725; + setp.gt.s32 %p293, %r703, %r727; + setp.gt.s32 %p294, %r705, %r729; + setp.gt.s32 %p295, %r707, %r731; + setp.gt.s32 %p296, %r709, %r733; + setp.gt.s32 %p297, %r711, %r735; + setp.gt.s32 %p298, %r713, %r737; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p299, %p283, %p291; + and.pred %p300, %p284, %p292; + and.pred %p301, %p285, %p293; + and.pred %p302, %p286, %p294; + and.pred %p303, %p287, %p295; + and.pred %p304, %p288, %p296; + and.pred %p305, %p289, %p297; + and.pred %p306, %p290, %p298; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p307, %p275, %p299; + or.pred %p308, %p276, %p300; + or.pred %p309, %p277, %p301; + or.pred %p310, %p278, %p302; + or.pred %p311, %p279, %p303; + or.pred %p312, %p280, %p304; + or.pred %p313, %p281, %p305; + or.pred %p314, %p282, %p306; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r738, %r675, %r651; + xor.b32 %r739, %r677, %r653; + xor.b32 %r740, %r679, %r655; + xor.b32 %r741, %r681, %r657; + xor.b32 %r742, %r683, %r659; + xor.b32 %r743, %r685, %r661; + xor.b32 %r744, %r687, %r663; + xor.b32 %r745, %r689, %r665; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r746, %r738, 0, %p307; + selp.b32 %r747, %r739, 0, %p308; + selp.b32 %r748, %r740, 0, %p309; + selp.b32 %r749, %r741, 0, %p310; + selp.b32 %r750, %r742, 0, %p311; + selp.b32 %r751, %r743, 0, %p312; + selp.b32 %r752, %r744, 0, %p313; + selp.b32 %r753, %r745, 0, %p314; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r754, %r746, %r610; + xor.b32 %r755, %r747, %r611; + xor.b32 %r756, %r748, %r612; + xor.b32 %r757, %r749, %r613; + xor.b32 %r758, %r750, %r614; + xor.b32 %r759, %r751, %r615; + xor.b32 %r760, %r752, %r616; + xor.b32 %r761, %r753, %r617; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r762, %r723, %r699; + xor.b32 %r763, %r725, %r701; + xor.b32 %r764, %r727, %r703; + xor.b32 %r765, %r729, %r705; + xor.b32 %r766, %r731, %r707; + xor.b32 %r767, %r733, %r709; + xor.b32 %r768, %r735, %r711; + xor.b32 %r769, %r737, %r713; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r770, %r762, 0, %p307; + selp.b32 %r771, %r763, 0, %p308; + selp.b32 %r772, %r764, 0, %p309; + selp.b32 %r773, %r765, 0, %p310; + selp.b32 %r774, %r766, 0, %p311; + selp.b32 %r775, %r767, 0, %p312; + selp.b32 %r776, %r768, 0, %p313; + selp.b32 %r777, %r769, 0, %p314; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r778, %r770, %r634; + xor.b32 %r779, %r771, %r635; + xor.b32 %r780, %r772, %r636; + xor.b32 %r781, %r773, %r637; + xor.b32 %r782, %r774, %r638; + xor.b32 %r783, %r775, %r639; + xor.b32 %r784, %r776, %r640; + xor.b32 %r785, %r777, %r641; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p315, %r754, %r756; + setp.lt.s32 %p316, %r755, %r757; + setp.lt.s32 %p317, %r758, %r760; + setp.lt.s32 %p318, %r759, %r761; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p319, %r754, %r756; + setp.eq.b32 %p320, %r755, %r757; + setp.eq.b32 %p321, %r758, %r760; + setp.eq.b32 %p322, %r759, %r761; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p323, %r778, %r780; + setp.gt.s32 %p324, %r779, %r781; + setp.gt.s32 %p325, %r782, %r784; + setp.gt.s32 %p326, %r783, %r785; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p327, %p319, %p323; + and.pred %p328, %p320, %p324; + and.pred %p329, %p321, %p325; + and.pred %p330, %p322, %p326; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p331, %p315, %p327; + or.pred %p332, %p316, %p328; + or.pred %p333, %p317, %p329; + or.pred %p334, %p318, %p330; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r786, %r756, %r754; + xor.b32 %r787, %r757, %r755; + xor.b32 %r788, %r760, %r758; + xor.b32 %r789, %r761, %r759; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r790, %r786, 0, %p331; + selp.b32 %r791, %r787, 0, %p332; + selp.b32 %r792, %r788, 0, %p333; + selp.b32 %r793, %r789, 0, %p334; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r794, %r790, %r754; + xor.b32 %r795, %r791, %r755; + xor.b32 %r796, %r790, %r756; + xor.b32 %r797, %r791, %r757; + xor.b32 %r798, %r792, %r758; + xor.b32 %r799, %r793, %r759; + xor.b32 %r800, %r792, %r760; + xor.b32 %r801, %r793, %r761; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r802, %r780, %r778; + xor.b32 %r803, %r781, %r779; + xor.b32 %r804, %r784, %r782; + xor.b32 %r805, %r785, %r783; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r806, %r802, 0, %p331; + selp.b32 %r807, %r803, 0, %p332; + selp.b32 %r808, %r804, 0, %p333; + selp.b32 %r809, %r805, 0, %p334; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r810, %r806, %r778; + xor.b32 %r811, %r807, %r779; + xor.b32 %r812, %r806, %r780; + xor.b32 %r813, %r807, %r781; + xor.b32 %r814, %r808, %r782; + xor.b32 %r815, %r809, %r783; + xor.b32 %r816, %r808, %r784; + xor.b32 %r817, %r809, %r785; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p335, %r796, %r797; + setp.lt.s32 %p336, %r794, %r795; + setp.lt.s32 %p337, %r800, %r801; + setp.lt.s32 %p338, %r798, %r799; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p339, %r796, %r797; + setp.eq.b32 %p340, %r795, %r794; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p341, %r812, %r813; + setp.gt.s32 %p342, %r810, %r811; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p343, %r800, %r801; + setp.eq.b32 %p344, %r799, %r798; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p345, %r816, %r817; + setp.gt.s32 %p346, %r814, %r815; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r818, %r813, %r812; + xor.b32 %r819, %r810, %r811; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r820, %r819, 0, %p342; + selp.b32 %r821, %r820, 0, %p340; + selp.b32 %r822, %r819, %r821, %p336; + selp.b32 %r823, %r818, 0, %p341; + selp.b32 %r824, %r823, 0, %p339; + selp.b32 %r825, %r818, %r824, %p335; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r10, %r825, %r812; + xor.b32 %r11, %r825, %r813; + xor.b32 %r8, %r822, %r810; + xor.b32 %r9, %r822, %r811; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r826, %r817, %r816; + xor.b32 %r827, %r814, %r815; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r828, %r827, 0, %p346; + selp.b32 %r829, %r828, 0, %p344; + selp.b32 %r830, %r827, %r829, %p338; + selp.b32 %r831, %r826, 0, %p345; + selp.b32 %r832, %r831, 0, %p343; + selp.b32 %r833, %r826, %r832, %p337; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r14, %r833, %r816; + xor.b32 %r15, %r833, %r817; + xor.b32 %r12, %r830, %r814; + xor.b32 %r13, %r830, %r815; +$L__tmp8: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r834, %r150, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r835, %r834, %r150; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r836, %r151, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r837, %r836, %r151; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r838, %r152, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r839, %r838, %r152; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r840, %r153, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r841, %r840, %r153; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r842, %r154, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r843, %r842, %r154; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r844, %r155, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r845, %r844, %r155; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r846, %r156, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r847, %r846, %r156; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r848, %r157, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r849, %r848, %r157; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r850, %r158, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r851, %r850, %r158; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r852, %r159, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r853, %r852, %r159; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r854, %r160, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r855, %r854, %r160; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r856, %r161, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r857, %r856, %r161; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r858, %r162, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r859, %r858, %r162; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r860, %r163, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r861, %r860, %r163; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r862, %r164, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r863, %r862, %r164; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r864, %r165, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r865, %r864, %r165; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p347, %r835, %r851; + setp.ne.b32 %p348, %r835, %r851; + setp.ge.s32 %p349, %r837, %r853; + setp.ne.b32 %p350, %r837, %r853; + setp.ge.s32 %p351, %r839, %r855; + setp.ne.b32 %p352, %r839, %r855; + setp.ge.s32 %p353, %r841, %r857; + setp.ne.b32 %p354, %r841, %r857; + setp.ge.s32 %p355, %r843, %r859; + setp.ne.b32 %p356, %r843, %r859; + setp.ge.s32 %p357, %r845, %r861; + setp.ne.b32 %p358, %r845, %r861; + setp.ge.s32 %p359, %r847, %r863; + setp.ne.b32 %p360, %r847, %r863; + setp.ge.s32 %p361, %r849, %r865; + setp.ne.b32 %p362, %r849, %r865; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r866, %r851, %r835; + xor.b32 %r867, %r853, %r837; + xor.b32 %r868, %r855, %r839; + xor.b32 %r869, %r857, %r841; + xor.b32 %r870, %r859, %r843; + xor.b32 %r871, %r861, %r845; + xor.b32 %r872, %r863, %r847; + xor.b32 %r873, %r865, %r849; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r874, %r170, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r875, %r874, %r170; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r876, %r171, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r877, %r876, %r171; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r878, %r172, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r879, %r878, %r172; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r880, %r173, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r881, %r880, %r173; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r882, %r182, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r883, %r882, %r182; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r884, %r183, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r885, %r884, %r183; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r886, %r184, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r887, %r886, %r184; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r888, %r185, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r889, %r888, %r185; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r890, %r174, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r891, %r890, %r174; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r892, %r175, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r893, %r892, %r175; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r894, %r176, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r895, %r894, %r176; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r896, %r177, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r897, %r896, %r177; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r898, %r186, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r899, %r898, %r186; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r900, %r187, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r901, %r900, %r187; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r902, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r903, %r902, %r188; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r904, %r189, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r905, %r904, %r189; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.le.s32 %p363, %r875, %r891; + or.pred %p364, %p348, %p363; + and.pred %p365, %p347, %p364; + xor.pred %p366, %p365, %p6; + setp.le.s32 %p367, %r877, %r893; + or.pred %p368, %p350, %p367; + and.pred %p369, %p349, %p368; + xor.pred %p370, %p369, %p6; + setp.le.s32 %p371, %r879, %r895; + or.pred %p372, %p352, %p371; + and.pred %p373, %p351, %p372; + xor.pred %p374, %p373, %p6; + setp.le.s32 %p375, %r881, %r897; + or.pred %p376, %p354, %p375; + and.pred %p377, %p353, %p376; + xor.pred %p378, %p377, %p6; + setp.le.s32 %p379, %r883, %r899; + or.pred %p380, %p356, %p379; + and.pred %p381, %p355, %p380; + xor.pred %p382, %p381, %p6; + setp.le.s32 %p383, %r885, %r901; + or.pred %p384, %p358, %p383; + and.pred %p385, %p357, %p384; + xor.pred %p386, %p385, %p6; + setp.le.s32 %p387, %r887, %r903; + or.pred %p388, %p360, %p387; + and.pred %p389, %p359, %p388; + xor.pred %p390, %p389, %p6; + setp.le.s32 %p391, %r889, %r905; + or.pred %p392, %p362, %p391; + and.pred %p393, %p361, %p392; + xor.pred %p394, %p393, %p6; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r906, 0, %r866, %p366; + selp.b32 %r907, 0, %r867, %p370; + selp.b32 %r908, 0, %r868, %p374; + selp.b32 %r909, 0, %r869, %p378; + selp.b32 %r910, 0, %r870, %p382; + selp.b32 %r911, 0, %r871, %p386; + selp.b32 %r912, 0, %r872, %p390; + selp.b32 %r913, 0, %r873, %p394; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r914, %r906, %r134; + xor.b32 %r915, %r907, %r135; + xor.b32 %r916, %r908, %r136; + xor.b32 %r917, %r909, %r137; + xor.b32 %r918, %r910, %r138; + xor.b32 %r919, %r911, %r139; + xor.b32 %r920, %r912, %r140; + xor.b32 %r921, %r913, %r141; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r922, %r891, %r875; + xor.b32 %r923, %r893, %r877; + xor.b32 %r924, %r895, %r879; + xor.b32 %r925, %r897, %r881; + xor.b32 %r926, %r899, %r883; + xor.b32 %r927, %r901, %r885; + xor.b32 %r928, %r903, %r887; + xor.b32 %r929, %r905, %r889; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r930, 0, %r925, %p378; + selp.b32 %r931, 0, %r923, %p370; + selp.b32 %r932, 0, %r924, %p374; + selp.b32 %r933, 0, %r922, %p366; + selp.b32 %r934, 0, %r929, %p394; + selp.b32 %r935, 0, %r927, %p386; + selp.b32 %r936, 0, %r928, %p390; + selp.b32 %r937, 0, %r926, %p382; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r938, %r933, %r169; + xor.b32 %r939, %r932, %r167; + xor.b32 %r940, %r931, %r168; + xor.b32 %r941, %r930, %r166; + xor.b32 %r942, %r937, %r181; + xor.b32 %r943, %r936, %r179; + xor.b32 %r944, %r935, %r180; + xor.b32 %r945, %r934, %r178; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p395, %r914, %r916; + setp.ne.b32 %p396, %r914, %r916; + setp.le.s32 %p397, %r940, %r941; + setp.le.s32 %p398, %r938, %r939; + or.pred %p399, %p396, %p398; + and.pred %p400, %p395, %p399; + setp.ge.s32 %p401, %r915, %r917; + setp.ne.b32 %p402, %r915, %r917; + or.pred %p403, %p402, %p397; + and.pred %p404, %p401, %p403; + xor.pred %p405, %p404, %p6; + xor.pred %p406, %p400, %p6; + setp.ge.s32 %p407, %r918, %r920; + setp.ne.b32 %p408, %r918, %r920; + setp.le.s32 %p409, %r944, %r945; + setp.le.s32 %p410, %r942, %r943; + or.pred %p411, %p408, %p410; + and.pred %p412, %p407, %p411; + setp.ge.s32 %p413, %r919, %r921; + setp.ne.b32 %p414, %r919, %r921; + or.pred %p415, %p414, %p409; + and.pred %p416, %p413, %p415; + xor.pred %p417, %p416, %p6; + xor.pred %p418, %p412, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r946, %r916, %r914; + xor.b32 %r947, %r917, %r915; + xor.b32 %r948, %r920, %r918; + xor.b32 %r949, %r921, %r919; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r950, 0, %r946, %p406; + selp.b32 %r951, 0, %r947, %p405; + selp.b32 %r952, 0, %r948, %p418; + selp.b32 %r953, 0, %r949, %p417; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r954, %r950, %r914; + xor.b32 %r955, %r951, %r915; + xor.b32 %r956, %r950, %r916; + xor.b32 %r957, %r951, %r917; + xor.b32 %r958, %r952, %r918; + xor.b32 %r959, %r953, %r919; + xor.b32 %r960, %r952, %r920; + xor.b32 %r961, %r953, %r921; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r962, %r939, %r938; + xor.b32 %r963, %r941, %r940; + xor.b32 %r964, %r943, %r942; + xor.b32 %r965, %r945, %r944; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r966, 0, %r963, %p405; + selp.b32 %r967, 0, %r962, %p406; + selp.b32 %r968, 0, %r965, %p417; + selp.b32 %r969, 0, %r964, %p418; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r970, %r967, %r939; + xor.b32 %r971, %r966, %r940; + xor.b32 %r972, %r966, %r941; + xor.b32 %r973, %r967, %r938; + xor.b32 %r974, %r969, %r943; + xor.b32 %r975, %r968, %r944; + xor.b32 %r976, %r968, %r945; + xor.b32 %r977, %r969, %r942; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p419, %r954, %r955; + setp.ne.b32 %p420, %r954, %r955; + setp.le.s32 %p421, %r973, %r971; + or.pred %p422, %p420, %p421; + and.pred %p423, %p419, %p422; + xor.pred %p424, %p423, %p6; + setp.ge.s32 %p425, %r956, %r957; + setp.ne.b32 %p426, %r956, %r957; + setp.le.s32 %p427, %r970, %r972; + or.pred %p428, %p426, %p427; + and.pred %p429, %p425, %p428; + xor.pred %p430, %p429, %p6; + setp.ge.s32 %p431, %r958, %r959; + setp.ne.b32 %p432, %r958, %r959; + setp.le.s32 %p433, %r977, %r975; + or.pred %p434, %p432, %p433; + and.pred %p435, %p431, %p434; + xor.pred %p436, %p435, %p6; + setp.ge.s32 %p437, %r960, %r961; + setp.ne.b32 %p438, %r960, %r961; + setp.le.s32 %p439, %r974, %r976; + or.pred %p440, %p438, %p439; + and.pred %p441, %p437, %p440; + xor.pred %p442, %p441, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r978, %r955, %r954; + xor.b32 %r979, %r957, %r956; + xor.b32 %r980, %r959, %r958; + xor.b32 %r981, %r961, %r960; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r982, 0, %r978, %p424; + selp.b32 %r983, 0, %r979, %p430; + selp.b32 %r984, 0, %r980, %p436; + selp.b32 %r985, 0, %r981, %p442; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r986, %r983, %r956; + xor.b32 %r987, %r982, %r954; + xor.b32 %r988, %r983, %r957; + xor.b32 %r989, %r982, %r955; + xor.b32 %r990, %r985, %r960; + xor.b32 %r991, %r984, %r958; + xor.b32 %r992, %r985, %r961; + xor.b32 %r993, %r984, %r959; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r994, %r971, %r973; + xor.b32 %r995, %r972, %r970; + xor.b32 %r996, %r975, %r977; + xor.b32 %r997, %r976, %r974; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r998, 0, %r995, %p430; + selp.b32 %r999, 0, %r994, %p424; + selp.b32 %r1000, 0, %r997, %p442; + selp.b32 %r1001, 0, %r996, %p436; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r1006, %r999, %r971; + xor.b32 %r1007, %r998, %r970; + xor.b32 %r1008, %r999, %r973; + xor.b32 %r1009, %r998, %r972; + xor.b32 %r1014, %r1001, %r975; + xor.b32 %r1015, %r1000, %r974; + xor.b32 %r1016, %r1001, %r977; + xor.b32 %r1017, %r1000, %r976; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1018, %r987, %r54; + mul.lo.s32 %r1019, %r989, %r54; + mul.lo.s32 %r1020, %r986, %r54; + mul.lo.s32 %r1021, %r988, %r54; + mul.lo.s32 %r1022, %r991, %r54; + mul.lo.s32 %r1023, %r993, %r54; + mul.lo.s32 %r1024, %r990, %r54; + mul.lo.s32 %r1025, %r992, %r54; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1026, %r1018, 2, 31, -1; + shfl.sync.bfly.b32 %r1027, %r1019, 2, 31, -1; + shfl.sync.bfly.b32 %r1028, %r1020, 2, 31, -1; + shfl.sync.bfly.b32 %r1029, %r1021, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1030, %r1019, %r1027; + add.s32 %r1031, %r1021, %r1029; + add.s32 %r1032, %r1018, %r1026; + add.s32 %r1033, %r1020, %r1028; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1034, %r1022, 2, 31, -1; + shfl.sync.bfly.b32 %r1035, %r1023, 2, 31, -1; + shfl.sync.bfly.b32 %r1036, %r1024, 2, 31, -1; + shfl.sync.bfly.b32 %r1037, %r1025, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1038, %r1023, %r1035; + add.s32 %r1039, %r1025, %r1037; + add.s32 %r1040, %r1022, %r1034; + add.s32 %r1041, %r1024, %r1036; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1042, %r987, %r52; + mul.lo.s32 %r1043, %r989, %r52; + mul.lo.s32 %r1044, %r986, %r52; + mul.lo.s32 %r1045, %r988, %r52; + mul.lo.s32 %r1046, %r991, %r52; + mul.lo.s32 %r1047, %r993, %r52; + mul.lo.s32 %r1048, %r990, %r52; + mul.lo.s32 %r1049, %r992, %r52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1050, %r1042, 2, 31, -1; + shfl.sync.bfly.b32 %r1051, %r1043, 2, 31, -1; + shfl.sync.bfly.b32 %r1052, %r1044, 2, 31, -1; + shfl.sync.bfly.b32 %r1053, %r1045, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1054, %r1043, %r1051; + add.s32 %r1055, %r1045, %r1053; + add.s32 %r1056, %r1042, %r1050; + add.s32 %r1057, %r1044, %r1052; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1058, %r1046, 2, 31, -1; + shfl.sync.bfly.b32 %r1059, %r1047, 2, 31, -1; + shfl.sync.bfly.b32 %r1060, %r1048, 2, 31, -1; + shfl.sync.bfly.b32 %r1061, %r1049, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1062, %r1047, %r1059; + add.s32 %r1063, %r1049, %r1061; + add.s32 %r1064, %r1046, %r1058; + add.s32 %r1065, %r1048, %r1060; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1066, %r1009, %r54; + mul.lo.s32 %r1067, %r1007, %r54; + mul.lo.s32 %r1068, %r1006, %r54; + mul.lo.s32 %r1069, %r1008, %r54; + mul.lo.s32 %r1070, %r1017, %r54; + mul.lo.s32 %r1071, %r1015, %r54; + mul.lo.s32 %r1072, %r1014, %r54; + mul.lo.s32 %r1073, %r1016, %r54; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1074, %r1069, 2, 31, -1; + shfl.sync.bfly.b32 %r1075, %r1068, 2, 31, -1; + shfl.sync.bfly.b32 %r1076, %r1067, 2, 31, -1; + shfl.sync.bfly.b32 %r1077, %r1066, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1078, %r1076, %r1067; + add.s32 %r1079, %r1074, %r1069; + add.s32 %r1080, %r1077, %r1066; + add.s32 %r1081, %r1075, %r1068; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1082, %r1073, 2, 31, -1; + shfl.sync.bfly.b32 %r1083, %r1072, 2, 31, -1; + shfl.sync.bfly.b32 %r1084, %r1071, 2, 31, -1; + shfl.sync.bfly.b32 %r1085, %r1070, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1086, %r1084, %r1071; + add.s32 %r1087, %r1082, %r1073; + add.s32 %r1088, %r1085, %r1070; + add.s32 %r1089, %r1083, %r1072; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1090, %r1009, %r52; + mul.lo.s32 %r1091, %r1007, %r52; + mul.lo.s32 %r1092, %r1006, %r52; + mul.lo.s32 %r1093, %r1008, %r52; + mul.lo.s32 %r1094, %r1017, %r52; + mul.lo.s32 %r1095, %r1015, %r52; + mul.lo.s32 %r1096, %r1014, %r52; + mul.lo.s32 %r1097, %r1016, %r52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1098, %r1093, 2, 31, -1; + shfl.sync.bfly.b32 %r1099, %r1092, 2, 31, -1; + shfl.sync.bfly.b32 %r1100, %r1091, 2, 31, -1; + shfl.sync.bfly.b32 %r1101, %r1090, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1102, %r1100, %r1091; + add.s32 %r1103, %r1098, %r1093; + add.s32 %r1104, %r1101, %r1090; + add.s32 %r1105, %r1099, %r1092; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1106, %r1097, 2, 31, -1; + shfl.sync.bfly.b32 %r1107, %r1096, 2, 31, -1; + shfl.sync.bfly.b32 %r1108, %r1095, 2, 31, -1; + shfl.sync.bfly.b32 %r1109, %r1094, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r1110, %r1108, %r1095; + add.s32 %r1111, %r1106, %r1097; + add.s32 %r1112, %r1109, %r1094; + add.s32 %r1113, %r1107, %r1096; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p443, %r1033, %r1057; + setp.lt.s32 %p444, %r1032, %r1056; + setp.lt.s32 %p445, %r1031, %r1055; + setp.lt.s32 %p446, %r1030, %r1054; + setp.lt.s32 %p447, %r1041, %r1065; + setp.lt.s32 %p448, %r1040, %r1064; + setp.lt.s32 %p449, %r1039, %r1063; + setp.lt.s32 %p450, %r1038, %r1062; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p451, %r1030, %r1054; + setp.eq.b32 %p452, %r1031, %r1055; + setp.eq.b32 %p453, %r1032, %r1056; + setp.eq.b32 %p454, %r1033, %r1057; + setp.eq.b32 %p455, %r1038, %r1062; + setp.eq.b32 %p456, %r1039, %r1063; + setp.eq.b32 %p457, %r1040, %r1064; + setp.eq.b32 %p458, %r1041, %r1065; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p459, %r1081, %r1105; + setp.gt.s32 %p460, %r1080, %r1104; + setp.gt.s32 %p461, %r1079, %r1103; + setp.gt.s32 %p462, %r1078, %r1102; + setp.gt.s32 %p463, %r1089, %r1113; + setp.gt.s32 %p464, %r1088, %r1112; + setp.gt.s32 %p465, %r1087, %r1111; + setp.gt.s32 %p466, %r1086, %r1110; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p467, %p454, %p462; + and.pred %p468, %p453, %p461; + and.pred %p469, %p452, %p460; + and.pred %p470, %p451, %p459; + and.pred %p471, %p458, %p466; + and.pred %p472, %p457, %p465; + and.pred %p473, %p456, %p464; + and.pred %p474, %p455, %p463; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p475, %p446, %p470; + or.pred %p476, %p445, %p469; + or.pred %p477, %p444, %p468; + or.pred %p478, %p443, %p467; + or.pred %p479, %p450, %p474; + or.pred %p480, %p449, %p473; + or.pred %p481, %p448, %p472; + or.pred %p482, %p447, %p471; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r1114, %r1030, %r1054; + xor.b32 %r1115, %r1031, %r1055; + xor.b32 %r1116, %r1032, %r1056; + xor.b32 %r1117, %r1033, %r1057; + xor.b32 %r1118, %r1038, %r1062; + xor.b32 %r1119, %r1039, %r1063; + xor.b32 %r1120, %r1040, %r1064; + xor.b32 %r1121, %r1041, %r1065; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r1122, %r1117, 0, %p478; + selp.b32 %r1123, %r1116, 0, %p477; + selp.b32 %r1124, %r1115, 0, %p476; + selp.b32 %r1125, %r1114, 0, %p475; + selp.b32 %r1126, %r1121, 0, %p482; + selp.b32 %r1127, %r1120, 0, %p481; + selp.b32 %r1128, %r1119, 0, %p480; + selp.b32 %r1129, %r1118, 0, %p479; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r1130, %r1125, %r989; + xor.b32 %r1131, %r1123, %r987; + xor.b32 %r1132, %r1122, %r986; + xor.b32 %r1133, %r1124, %r988; + xor.b32 %r1134, %r1129, %r993; + xor.b32 %r1135, %r1127, %r991; + xor.b32 %r1136, %r1126, %r990; + xor.b32 %r1137, %r1128, %r992; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r1138, %r1105, %r1081; + xor.b32 %r1139, %r1102, %r1078; + xor.b32 %r1140, %r1104, %r1080; + xor.b32 %r1141, %r1103, %r1079; + xor.b32 %r1142, %r1113, %r1089; + xor.b32 %r1143, %r1110, %r1086; + xor.b32 %r1144, %r1112, %r1088; + xor.b32 %r1145, %r1111, %r1087; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r1146, %r1141, 0, %p477; + selp.b32 %r1147, %r1140, 0, %p476; + selp.b32 %r1148, %r1139, 0, %p478; + selp.b32 %r1149, %r1138, 0, %p475; + selp.b32 %r1150, %r1145, 0, %p481; + selp.b32 %r1151, %r1144, 0, %p480; + selp.b32 %r1152, %r1143, 0, %p482; + selp.b32 %r1153, %r1142, 0, %p479; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r1158, %r1149, %r1006; + xor.b32 %r1159, %r1146, %r1008; + xor.b32 %r1160, %r1147, %r1009; + xor.b32 %r1161, %r1148, %r1007; + xor.b32 %r1166, %r1153, %r1014; + xor.b32 %r1167, %r1150, %r1016; + xor.b32 %r1168, %r1151, %r1017; + xor.b32 %r1169, %r1152, %r1015; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1170, %r1133, %r53; + mul.lo.s32 %r1171, %r1132, %r53; + mul.lo.s32 %r1172, %r1130, %r53; + mul.lo.s32 %r1173, %r1131, %r53; + mul.lo.s32 %r1174, %r1137, %r53; + mul.lo.s32 %r1175, %r1136, %r53; + mul.lo.s32 %r1176, %r1134, %r53; + mul.lo.s32 %r1177, %r1135, %r53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1178, %r1173, 1, 31, -1; + shfl.sync.bfly.b32 %r1179, %r1172, 1, 31, -1; + shfl.sync.bfly.b32 %r1180, %r1171, 1, 31, -1; + shfl.sync.bfly.b32 %r1181, %r1170, 1, 31, -1; + shfl.sync.bfly.b32 %r1186, %r1177, 1, 31, -1; + shfl.sync.bfly.b32 %r1187, %r1176, 1, 31, -1; + shfl.sync.bfly.b32 %r1188, %r1175, 1, 31, -1; + shfl.sync.bfly.b32 %r1189, %r1174, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1194, %r1133, %r50; + mul.lo.s32 %r1195, %r1132, %r50; + mul.lo.s32 %r1196, %r1130, %r50; + mul.lo.s32 %r1197, %r1131, %r50; + mul.lo.s32 %r1198, %r1137, %r50; + mul.lo.s32 %r1199, %r1136, %r50; + mul.lo.s32 %r1200, %r1134, %r50; + mul.lo.s32 %r1201, %r1135, %r50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1202, %r1197, 1, 31, -1; + shfl.sync.bfly.b32 %r1203, %r1196, 1, 31, -1; + shfl.sync.bfly.b32 %r1204, %r1195, 1, 31, -1; + shfl.sync.bfly.b32 %r1205, %r1194, 1, 31, -1; + shfl.sync.bfly.b32 %r1210, %r1201, 1, 31, -1; + shfl.sync.bfly.b32 %r1211, %r1200, 1, 31, -1; + shfl.sync.bfly.b32 %r1212, %r1199, 1, 31, -1; + shfl.sync.bfly.b32 %r1213, %r1198, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1218, %r1159, %r53; + mul.lo.s32 %r1219, %r1158, %r53; + mul.lo.s32 %r1220, %r1161, %r53; + mul.lo.s32 %r1221, %r1160, %r53; + mul.lo.s32 %r1222, %r1167, %r53; + mul.lo.s32 %r1223, %r1166, %r53; + mul.lo.s32 %r1224, %r1169, %r53; + mul.lo.s32 %r1225, %r1168, %r53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1226, %r1218, 1, 31, -1; + shfl.sync.bfly.b32 %r1227, %r1219, 1, 31, -1; + shfl.sync.bfly.b32 %r1228, %r1220, 1, 31, -1; + shfl.sync.bfly.b32 %r1229, %r1221, 1, 31, -1; + shfl.sync.bfly.b32 %r1234, %r1222, 1, 31, -1; + shfl.sync.bfly.b32 %r1235, %r1223, 1, 31, -1; + shfl.sync.bfly.b32 %r1236, %r1224, 1, 31, -1; + shfl.sync.bfly.b32 %r1237, %r1225, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r1242, %r1159, %r50; + mul.lo.s32 %r1243, %r1158, %r50; + mul.lo.s32 %r1244, %r1161, %r50; + mul.lo.s32 %r1245, %r1160, %r50; + mul.lo.s32 %r1246, %r1167, %r50; + mul.lo.s32 %r1247, %r1166, %r50; + mul.lo.s32 %r1248, %r1169, %r50; + mul.lo.s32 %r1249, %r1168, %r50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r1250, %r1242, 1, 31, -1; + shfl.sync.bfly.b32 %r1251, %r1243, 1, 31, -1; + shfl.sync.bfly.b32 %r1252, %r1244, 1, 31, -1; + shfl.sync.bfly.b32 %r1253, %r1245, 1, 31, -1; + shfl.sync.bfly.b32 %r1258, %r1246, 1, 31, -1; + shfl.sync.bfly.b32 %r1259, %r1247, 1, 31, -1; + shfl.sync.bfly.b32 %r1260, %r1248, 1, 31, -1; + shfl.sync.bfly.b32 %r1261, %r1249, 1, 31, -1; +$L__tmp9: + .loc 1 54 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:54:35 + and.pred %p555, %p4, %p110; + selp.b16 %rs12, 1, 0, %p555; + shl.b16 %rs13, %rs12, 2; + and.pred %p556, %p4, %p109; + selp.b16 %rs14, -1, 0, %p556; + shl.b16 %rs15, %rs14, 3; + or.b16 %rs16, %rs15, %rs13; + and.pred %p557, %p4, %p112; + selp.b16 %rs17, 1, 0, %p557; + and.pred %p558, %p4, %p111; + selp.b16 %rs18, -1, 0, %p558; + shl.b16 %rs19, %rs18, 1; + or.b16 %rs20, %rs17, %rs19; + and.b16 %rs21, %rs20, 3; + or.b16 %rs22, %rs21, %rs16; +$L__tmp10: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + and.b16 %rs23, %rs11, 15; + cvt.u32.u16 %r1406, %rs23; + popc.b32 %r1407, %r1406; + cvt.u64.u32 %rd43, %r1407; + and.b16 %rs24, %rs22, 15; + cvt.u32.u16 %r1408, %rs24; + popc.b32 %r1409, %r1408; + cvt.u64.u32 %rd44, %r1409; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + shfl.sync.bfly.b32 %r1410, %r1407, 2, 31, -1; + mov.b32 %r1411, 0; + shfl.sync.bfly.b32 %r1412, %r1411, 2, 31, -1; + cvt.u64.u32 %rd45, %r1410; + cvt.u64.u32 %rd46, %r1412; + shl.b64 %rd47, %rd46, 32; + or.b64 %rd48, %rd45, %rd47; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd49, %rd43, %rd48; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r1413}, %rd49; + cvt.u32.u64 %r1414, %rd49; + shfl.sync.bfly.b32 %r1415, %r1414, 1, 31, -1; + shfl.sync.bfly.b32 %r1416, %r1413, 1, 31, -1; + cvt.u64.u32 %rd50, %r1415; + cvt.u64.u32 %rd51, %r1416; + shl.b64 %rd52, %rd51, 32; + or.b64 %rd53, %rd50, %rd52; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd54, %rd49, %rd53; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + shfl.sync.bfly.b32 %r1417, %r1409, 2, 31, -1; + shfl.sync.bfly.b32 %r1418, %r1411, 2, 31, -1; + cvt.u64.u32 %rd55, %r1417; + cvt.u64.u32 %rd56, %r1418; + shl.b64 %rd57, %rd56, 32; + or.b64 %rd58, %rd55, %rd57; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd59, %rd44, %rd58; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r1419}, %rd59; + cvt.u32.u64 %r1420, %rd59; + shfl.sync.bfly.b32 %r1421, %r1420, 1, 31, -1; + shfl.sync.bfly.b32 %r1422, %r1419, 1, 31, -1; + cvt.u64.u32 %rd60, %r1421; + cvt.u64.u32 %rd61, %r1422; + shl.b64 %rd62, %rd61, 32; + or.b64 %rd63, %rd60, %rd62; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd64, %rd59, %rd63; +$L__tmp11: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + shl.b32 %r1423, %r3, 1; + mov.b32 %r1424, global_smem; + add.s32 %r1425, %r1424, %r1423; + st.shared.b64 [%r1425], %rd54; + st.shared.b64 [%r1425+512], %rd64; + bar.sync 0; + shl.b32 %r1426, %r4, 3; + add.s32 %r1427, %r1424, %r1426; + ld.shared.b64 %rd3, [%r1427]; + .loc 1 58 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:58:35 + and.pred %p559, %p2, %p10; + selp.b16 %rs25, 1, 0, %p559; + shl.b16 %rs26, %rs25, 2; + and.pred %p560, %p2, %p12; + selp.b16 %rs27, -1, 0, %p560; + shl.b16 %rs28, %rs27, 3; + or.b16 %rs29, %rs28, %rs26; + and.pred %p561, %p2, %p8; + selp.b16 %rs30, 1, 0, %p561; + and.pred %p562, %p2, %p9; + selp.b16 %rs31, -1, 0, %p562; + shl.b16 %rs32, %rs31, 1; + or.b16 %rs33, %rs30, %rs32; + and.b16 %rs34, %rs33, 3; + or.b16 %rs35, %rs34, %rs29; + and.pred %p563, %p4, %p16; + selp.b16 %rs36, 1, 0, %p563; + shl.b16 %rs37, %rs36, 2; + and.pred %p564, %p4, %p18; + selp.b16 %rs38, -1, 0, %p564; + shl.b16 %rs39, %rs38, 3; + or.b16 %rs40, %rs39, %rs37; + and.pred %p565, %p4, %p14; + selp.b16 %rs41, 1, 0, %p565; + and.pred %p566, %p4, %p15; + selp.b16 %rs42, -1, 0, %p566; + shl.b16 %rs43, %rs42, 1; + or.b16 %rs44, %rs41, %rs43; + and.b16 %rs45, %rs44, 3; + or.b16 %rs46, %rs45, %rs40; +$L__tmp12: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + and.b16 %rs47, %rs35, 15; + cvt.u32.u16 %r1428, %rs47; + popc.b32 %r1429, %r1428; + cvt.u64.u32 %rd65, %r1429; + and.b16 %rs48, %rs46, 15; + cvt.u32.u16 %r1430, %rs48; + popc.b32 %r1431, %r1430; + cvt.u64.u32 %rd66, %r1431; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + shfl.sync.bfly.b32 %r1432, %r1429, 2, 31, -1; + shfl.sync.bfly.b32 %r1433, %r1411, 2, 31, -1; + cvt.u64.u32 %rd67, %r1432; + cvt.u64.u32 %rd68, %r1433; + shl.b64 %rd69, %rd68, 32; + or.b64 %rd70, %rd67, %rd69; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd71, %rd65, %rd70; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r1434}, %rd71; + cvt.u32.u64 %r1435, %rd71; + shfl.sync.bfly.b32 %r1436, %r1435, 1, 31, -1; + shfl.sync.bfly.b32 %r1437, %r1434, 1, 31, -1; + cvt.u64.u32 %rd72, %r1436; + cvt.u64.u32 %rd73, %r1437; + shl.b64 %rd74, %rd73, 32; + or.b64 %rd75, %rd72, %rd74; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd4, %rd71, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + shfl.sync.bfly.b32 %r1438, %r1431, 2, 31, -1; + shfl.sync.bfly.b32 %r1439, %r1411, 2, 31, -1; + cvt.u64.u32 %rd76, %r1438; + cvt.u64.u32 %rd77, %r1439; + shl.b64 %rd78, %rd77, 32; + or.b64 %rd79, %rd76, %rd78; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd80, %rd66, %rd79; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r1440}, %rd80; + cvt.u32.u64 %r1441, %rd80; + shfl.sync.bfly.b32 %r1442, %r1441, 1, 31, -1; + shfl.sync.bfly.b32 %r1443, %r1440, 1, 31, -1; + cvt.u64.u32 %rd81, %r1442; + cvt.u64.u32 %rd82, %r1443; + shl.b64 %rd83, %rd82, 32; + or.b64 %rd84, %rd81, %rd83; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd5, %rd80, %rd84; +$L__tmp13: + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + bar.sync 0; + st.shared.b64 [%r1425], %rd4; + st.shared.b64 [%r1425+512], %rd5; + bar.sync 0; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r1444, %rd54; + cvt.u32.u64 %r1445, %rd64; + .loc 1 64 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:64:19 + setp.lt.s32 %p567, %r26, %r1444; + setp.lt.s32 %p568, %r27, %r1444; + setp.lt.s32 %p569, %r24, %r1444; + setp.lt.s32 %p570, %r25, %r1444; + .loc 1 66 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:66:35 + selp.b32 %r1446, %r9, 16, %p570; + selp.b32 %r1447, %r8, 16, %p569; + selp.b32 %r1448, %r11, 16, %p568; + selp.b32 %r1449, %r10, 16, %p567; + .loc 1 68 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:68:20 + add.s32 %r1450, %r1449, 17; + add.s32 %r1451, %r1448, 17; + add.s32 %r1452, %r1447, 17; + add.s32 %r1453, %r1446, 17; + .loc 1 64 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:64:19 + setp.lt.s32 %p571, %r26, %r1445; + setp.lt.s32 %p572, %r27, %r1445; + setp.lt.s32 %p573, %r24, %r1445; + setp.lt.s32 %p574, %r25, %r1445; + .loc 1 66 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:66:35 + selp.b32 %r1454, %r13, 16, %p574; + selp.b32 %r1455, %r12, 16, %p573; + selp.b32 %r1456, %r15, 16, %p572; + selp.b32 %r1457, %r14, 16, %p571; + .loc 1 68 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:68:20 + add.s32 %r1458, %r1457, 17; + add.s32 %r1459, %r1456, 17; + add.s32 %r1460, %r1455, 17; + add.s32 %r1461, %r1454, 17; + .loc 1 69 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:69:20 + setp.lt.s32 %p575, %r1449, 0; + setp.lt.s32 %p576, %r1448, 0; + setp.lt.s32 %p577, %r1447, 0; + setp.lt.s32 %p578, %r1446, 0; + .loc 1 70 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:70:35 + selp.b32 %r29, %r1453, %r1446, %p578; + selp.b32 %r28, %r1452, %r1447, %p577; + selp.b32 %r31, %r1451, %r1448, %p576; + selp.b32 %r30, %r1450, %r1449, %p575; + .loc 1 71 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:38 + setp.gt.u32 %p579, %r30, 16; + selp.b16 %rs49, 1, 0, %p579; + shl.b16 %rs50, %rs49, 2; + setp.gt.u32 %p580, %r31, 16; + selp.b16 %rs51, -1, 0, %p580; + shl.b16 %rs52, %rs51, 3; + or.b16 %rs53, %rs52, %rs50; + setp.gt.u32 %p581, %r28, 16; + selp.b16 %rs54, 1, 0, %p581; + setp.gt.u32 %p582, %r29, 16; + selp.b16 %rs55, -1, 0, %p582; + shl.b16 %rs56, %rs55, 1; + or.b16 %rs57, %rs54, %rs56; + and.b16 %rs58, %rs57, 3; + or.b16 %rs59, %rs58, %rs53; + .loc 1 69 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:69:20 + setp.lt.s32 %p583, %r1457, 0; + setp.lt.s32 %p584, %r1456, 0; + setp.lt.s32 %p585, %r1455, 0; + setp.lt.s32 %p586, %r1454, 0; + .loc 1 70 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:70:35 + selp.b32 %r33, %r1461, %r1454, %p586; + selp.b32 %r32, %r1460, %r1455, %p585; + selp.b32 %r35, %r1459, %r1456, %p584; + selp.b32 %r34, %r1458, %r1457, %p583; + .loc 1 71 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:38 + setp.gt.u32 %p587, %r34, 16; + selp.b16 %rs60, 1, 0, %p587; + shl.b16 %rs61, %rs60, 2; + setp.gt.u32 %p588, %r35, 16; + selp.b16 %rs62, -1, 0, %p588; + shl.b16 %rs63, %rs62, 3; + or.b16 %rs64, %rs63, %rs61; + setp.gt.u32 %p589, %r32, 16; + selp.b16 %rs65, 1, 0, %p589; + setp.gt.u32 %p590, %r33, 16; + selp.b16 %rs66, -1, 0, %p590; + shl.b16 %rs67, %rs66, 1; + or.b16 %rs68, %rs65, %rs67; + and.b16 %rs69, %rs68, 3; + or.b16 %rs70, %rs69, %rs64; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + and.b16 %rs71, %rs59, 15; + setp.ne.b16 %p591, %rs71, 0; + and.pred %p592, %p2, %p591; + and.b16 %rs72, %rs70, 15; + setp.ne.b16 %p593, %rs72, 0; + and.pred %p594, %p4, %p593; + or.pred %p595, %p592, %p594; + not.pred %p596, %p595; + @%p596 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + xor.b32 %r1002, %r967, %r999; + xor.b32 %r1003, %r966, %r998; + xor.b32 %r1004, %r967, %r998; + xor.b32 %r1005, %r966, %r999; + xor.b32 %r1010, %r969, %r1001; + xor.b32 %r1011, %r968, %r1000; + xor.b32 %r1012, %r969, %r1000; + xor.b32 %r1013, %r968, %r1001; + xor.b32 %r1154, %r1005, %r1149; + xor.b32 %r1155, %r1004, %r1148; + xor.b32 %r1156, %r1003, %r1147; + xor.b32 %r1157, %r1002, %r1146; + xor.b32 %r1162, %r1013, %r1153; + xor.b32 %r1163, %r1012, %r1152; + xor.b32 %r1164, %r1011, %r1151; + xor.b32 %r1165, %r1010, %r1150; + add.s32 %r1182, %r1179, %r1172; + add.s32 %r1183, %r1181, %r1170; + add.s32 %r1184, %r1178, %r1173; + add.s32 %r1185, %r1180, %r1171; + add.s32 %r1190, %r1187, %r1176; + add.s32 %r1191, %r1189, %r1174; + add.s32 %r1192, %r1186, %r1177; + add.s32 %r1193, %r1188, %r1175; + add.s32 %r1206, %r1203, %r1196; + add.s32 %r1207, %r1205, %r1194; + add.s32 %r1208, %r1202, %r1197; + add.s32 %r1209, %r1204, %r1195; + add.s32 %r1214, %r1211, %r1200; + add.s32 %r1215, %r1213, %r1198; + add.s32 %r1216, %r1210, %r1201; + add.s32 %r1217, %r1212, %r1199; + add.s32 %r1230, %r1228, %r1220; + add.s32 %r1231, %r1227, %r1219; + add.s32 %r1232, %r1226, %r1218; + add.s32 %r1233, %r1229, %r1221; + add.s32 %r1238, %r1236, %r1224; + add.s32 %r1239, %r1235, %r1223; + add.s32 %r1240, %r1234, %r1222; + add.s32 %r1241, %r1237, %r1225; + add.s32 %r1254, %r1252, %r1244; + add.s32 %r1255, %r1251, %r1243; + add.s32 %r1256, %r1250, %r1242; + add.s32 %r1257, %r1253, %r1245; + add.s32 %r1262, %r1260, %r1248; + add.s32 %r1263, %r1259, %r1247; + add.s32 %r1264, %r1258, %r1246; + add.s32 %r1265, %r1261, %r1249; + setp.lt.s32 %p483, %r1185, %r1209; + setp.lt.s32 %p484, %r1184, %r1208; + setp.lt.s32 %p485, %r1183, %r1207; + setp.lt.s32 %p486, %r1182, %r1206; + setp.lt.s32 %p487, %r1193, %r1217; + setp.lt.s32 %p488, %r1192, %r1216; + setp.lt.s32 %p489, %r1191, %r1215; + setp.lt.s32 %p490, %r1190, %r1214; + setp.eq.b32 %p491, %r1182, %r1206; + setp.eq.b32 %p492, %r1183, %r1207; + setp.eq.b32 %p493, %r1184, %r1208; + setp.eq.b32 %p494, %r1185, %r1209; + setp.eq.b32 %p495, %r1190, %r1214; + setp.eq.b32 %p496, %r1191, %r1215; + setp.eq.b32 %p497, %r1192, %r1216; + setp.eq.b32 %p498, %r1193, %r1217; + setp.gt.s32 %p499, %r1231, %r1255; + setp.gt.s32 %p500, %r1233, %r1257; + setp.gt.s32 %p501, %r1232, %r1256; + setp.gt.s32 %p502, %r1230, %r1254; + setp.gt.s32 %p503, %r1239, %r1263; + setp.gt.s32 %p504, %r1241, %r1265; + setp.gt.s32 %p505, %r1240, %r1264; + setp.gt.s32 %p506, %r1238, %r1262; + and.pred %p507, %p494, %p502; + and.pred %p508, %p493, %p501; + and.pred %p509, %p492, %p500; + and.pred %p510, %p491, %p499; + and.pred %p511, %p498, %p506; + and.pred %p512, %p497, %p505; + and.pred %p513, %p496, %p504; + and.pred %p514, %p495, %p503; + or.pred %p515, %p486, %p510; + or.pred %p516, %p485, %p509; + or.pred %p517, %p484, %p508; + or.pred %p518, %p483, %p507; + or.pred %p519, %p490, %p514; + or.pred %p520, %p489, %p513; + or.pred %p521, %p488, %p512; + or.pred %p522, %p487, %p511; + xor.b32 %r1266, %r1206, %r1182; + xor.b32 %r1267, %r1207, %r1183; + xor.b32 %r1268, %r1208, %r1184; + xor.b32 %r1269, %r1209, %r1185; + xor.b32 %r1270, %r1214, %r1190; + xor.b32 %r1271, %r1215, %r1191; + xor.b32 %r1272, %r1216, %r1192; + xor.b32 %r1273, %r1217, %r1193; + selp.b32 %r1274, %r1269, 0, %p518; + selp.b32 %r1275, %r1268, 0, %p517; + selp.b32 %r1276, %r1267, 0, %p516; + selp.b32 %r1277, %r1266, 0, %p515; + selp.b32 %r1278, %r1273, 0, %p522; + selp.b32 %r1279, %r1272, 0, %p521; + selp.b32 %r1280, %r1271, 0, %p520; + selp.b32 %r1281, %r1270, 0, %p519; + xor.b32 %r1282, %r1274, %r1132; + xor.b32 %r1283, %r1275, %r1131; + xor.b32 %r1284, %r1277, %r1130; + xor.b32 %r1285, %r1276, %r1133; + xor.b32 %r1286, %r1278, %r1136; + xor.b32 %r1287, %r1279, %r1135; + xor.b32 %r1288, %r1281, %r1134; + xor.b32 %r1289, %r1280, %r1137; + xor.b32 %r1290, %r1254, %r1230; + xor.b32 %r1291, %r1255, %r1231; + xor.b32 %r1292, %r1256, %r1232; + xor.b32 %r1293, %r1257, %r1233; + xor.b32 %r1294, %r1262, %r1238; + xor.b32 %r1295, %r1263, %r1239; + xor.b32 %r1296, %r1264, %r1240; + xor.b32 %r1297, %r1265, %r1241; + selp.b32 %r1298, %r1291, 0, %p515; + selp.b32 %r1299, %r1290, 0, %p518; + selp.b32 %r1300, %r1293, 0, %p516; + selp.b32 %r1301, %r1292, 0, %p517; + selp.b32 %r1302, %r1295, 0, %p519; + selp.b32 %r1303, %r1294, 0, %p522; + selp.b32 %r1304, %r1297, 0, %p520; + selp.b32 %r1305, %r1296, 0, %p521; + xor.b32 %r1306, %r1157, %r1301; + xor.b32 %r1307, %r1156, %r1300; + xor.b32 %r1308, %r1301, %r1159; + xor.b32 %r1309, %r1299, %r1161; + xor.b32 %r1310, %r1298, %r1158; + xor.b32 %r1311, %r1300, %r1160; + xor.b32 %r1312, %r1165, %r1305; + xor.b32 %r1313, %r1164, %r1304; + xor.b32 %r1314, %r1305, %r1167; + xor.b32 %r1315, %r1303, %r1169; + xor.b32 %r1316, %r1302, %r1166; + xor.b32 %r1317, %r1304, %r1168; + setp.lt.s32 %p523, %r1283, %r1282; + setp.lt.s32 %p524, %r1284, %r1285; + setp.lt.s32 %p525, %r1287, %r1286; + setp.lt.s32 %p526, %r1288, %r1289; + setp.eq.b32 %p527, %r1283, %r1282; + setp.eq.b32 %p528, %r1284, %r1285; + setp.eq.b32 %p529, %r1287, %r1286; + setp.eq.b32 %p530, %r1288, %r1289; + setp.gt.s32 %p531, %r1308, %r1309; + setp.gt.s32 %p532, %r1310, %r1311; + setp.gt.s32 %p533, %r1314, %r1315; + setp.gt.s32 %p534, %r1316, %r1317; + and.pred %p535, %p527, %p531; + and.pred %p536, %p528, %p532; + and.pred %p537, %p529, %p533; + and.pred %p538, %p530, %p534; + or.pred %p539, %p523, %p535; + or.pred %p540, %p524, %p536; + or.pred %p541, %p525, %p537; + or.pred %p542, %p526, %p538; + xor.b32 %r1318, %r1282, %r1283; + xor.b32 %r1319, %r1285, %r1284; + xor.b32 %r1320, %r1286, %r1287; + xor.b32 %r1321, %r1289, %r1288; + selp.b32 %r1322, %r1319, 0, %p540; + selp.b32 %r1323, %r1318, 0, %p539; + selp.b32 %r1324, %r1321, 0, %p542; + selp.b32 %r1325, %r1320, 0, %p541; + xor.b32 %r1326, %r1323, %r1282; + xor.b32 %r1327, %r1322, %r1284; + xor.b32 %r1328, %r1323, %r1283; + xor.b32 %r1329, %r1322, %r1285; + xor.b32 %r1330, %r1325, %r1286; + xor.b32 %r1331, %r1324, %r1288; + xor.b32 %r1332, %r1325, %r1287; + xor.b32 %r1333, %r1324, %r1289; + xor.b32 %r1334, %r1311, %r1310; + xor.b32 %r1335, %r1309, %r1308; + xor.b32 %r1336, %r1317, %r1316; + xor.b32 %r1337, %r1315, %r1314; + selp.b32 %r1338, %r1335, 0, %p539; + selp.b32 %r1339, %r1334, 0, %p540; + selp.b32 %r1340, %r1337, 0, %p541; + selp.b32 %r1341, %r1336, 0, %p542; + xor.b32 %r1342, %r1307, %r1339; + xor.b32 %r1343, %r1299, %r1338; + xor.b32 %r1344, %r1343, %r1155; + xor.b32 %r1345, %r1298, %r1339; + xor.b32 %r1346, %r1345, %r1154; + xor.b32 %r1347, %r1306, %r1338; + xor.b32 %r1348, %r1339, %r1311; + xor.b32 %r1349, %r1339, %r1310; + xor.b32 %r1350, %r1338, %r1309; + xor.b32 %r1351, %r1338, %r1308; + xor.b32 %r1352, %r1313, %r1341; + xor.b32 %r1353, %r1303, %r1340; + xor.b32 %r1354, %r1353, %r1163; + xor.b32 %r1355, %r1302, %r1341; + xor.b32 %r1356, %r1355, %r1162; + xor.b32 %r1357, %r1312, %r1340; + xor.b32 %r1358, %r1341, %r1317; + xor.b32 %r1359, %r1341, %r1316; + xor.b32 %r1360, %r1340, %r1315; + xor.b32 %r1361, %r1340, %r1314; + setp.lt.s32 %p543, %r1328, %r1327; + setp.lt.s32 %p544, %r1326, %r1329; + setp.lt.s32 %p545, %r1332, %r1331; + setp.lt.s32 %p546, %r1330, %r1333; + setp.eq.b32 %p547, %r1327, %r1328; + setp.eq.b32 %p548, %r1326, %r1329; + setp.eq.b32 %p549, %r1331, %r1332; + setp.eq.b32 %p550, %r1330, %r1333; + setp.gt.s32 %p551, %r1351, %r1349; + setp.gt.s32 %p552, %r1350, %r1348; + setp.gt.s32 %p553, %r1361, %r1359; + setp.gt.s32 %p554, %r1360, %r1358; + xor.b32 %r1362, %r1343, %r1300; + xor.b32 %r1363, %r1345, %r1301; + xor.b32 %r1364, %r1363, %r1006; + xor.b32 %r1365, %r1362, %r1007; + xor.b32 %r1366, %r1365, %r1148; + xor.b32 %r1367, %r1364, %r1149; + xor.b32 %r1368, %r1367, %r1008; + xor.b32 %r1369, %r1366, %r1009; + xor.b32 %r1370, %r1369, %r1147; + xor.b32 %r1371, %r1368, %r1146; + xor.b32 %r1372, %r1371, %r1338; + xor.b32 %r1373, %r1370, %r1339; + xor.b32 %r1374, %r1353, %r1304; + xor.b32 %r1375, %r1355, %r1305; + xor.b32 %r1376, %r1375, %r1014; + xor.b32 %r1377, %r1374, %r1015; + xor.b32 %r1378, %r1377, %r1152; + xor.b32 %r1379, %r1376, %r1153; + xor.b32 %r1380, %r1379, %r1016; + xor.b32 %r1381, %r1378, %r1017; + xor.b32 %r1382, %r1381, %r1151; + xor.b32 %r1383, %r1380, %r1150; + xor.b32 %r1384, %r1383, %r1340; + xor.b32 %r1385, %r1382, %r1341; + selp.b32 %r1386, %r1373, 0, %p552; + selp.b32 %r1387, %r1386, 0, %p548; + selp.b32 %r1388, %r1373, %r1387, %p544; + selp.b32 %r1389, %r1372, 0, %p551; + selp.b32 %r1390, %r1389, 0, %p547; + selp.b32 %r1391, %r1372, %r1390, %p543; + selp.b32 %r1392, %r1385, 0, %p554; + selp.b32 %r1393, %r1392, 0, %p550; + selp.b32 %r1394, %r1385, %r1393, %p546; + selp.b32 %r1395, %r1384, 0, %p553; + selp.b32 %r1396, %r1395, 0, %p549; + selp.b32 %r1397, %r1384, %r1396, %p545; + xor.b32 %r1398, %r1347, %r1391; + xor.b32 %r1399, %r1346, %r1391; + xor.b32 %r1400, %r1344, %r1388; + xor.b32 %r1401, %r1342, %r1388; + xor.b32 %r19, %r1401, %r941; + xor.b32 %r18, %r1400, %r939; + xor.b32 %r17, %r1399, %r940; + xor.b32 %r16, %r1398, %r938; + xor.b32 %r1402, %r1357, %r1397; + xor.b32 %r1403, %r1356, %r1397; + xor.b32 %r1404, %r1354, %r1394; + xor.b32 %r1405, %r1352, %r1394; + xor.b32 %r23, %r1405, %r945; + xor.b32 %r22, %r1404, %r943; + xor.b32 %r21, %r1403, %r944; + xor.b32 %r20, %r1402, %r942; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + ld.shared.b64 %rd6, [%r1427]; + cvt.u32.u64 %r1462, %rd5; + cvt.u32.u64 %r1463, %rd4; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + bar.sync 0; + .loc 1 75 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:75:19 + setp.lt.s32 %p599, %r26, %r1463; + setp.lt.s32 %p600, %r27, %r1463; + setp.lt.s32 %p601, %r24, %r1463; + setp.lt.s32 %p602, %r25, %r1463; + .loc 1 76 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:76:35 + selp.b32 %r1464, %r17, 16, %p602; + selp.b32 %r1465, %r16, 16, %p601; + selp.b32 %r1466, %r19, 16, %p600; + selp.b32 %r1467, %r18, 16, %p599; + .loc 1 77 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:77:20 + add.s32 %r1468, %r1467, 17; + add.s32 %r1469, %r1466, 17; + add.s32 %r1470, %r1465, 17; + add.s32 %r1471, %r1464, 17; + .loc 1 78 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:78:20 + setp.lt.s32 %p603, %r1467, 0; + setp.lt.s32 %p604, %r1466, 0; + setp.lt.s32 %p605, %r1465, 0; + setp.lt.s32 %p606, %r1464, 0; + .loc 1 79 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:79:35 + selp.b32 %r37, %r1471, %r1464, %p606; + selp.b32 %r36, %r1470, %r1465, %p605; + selp.b32 %r39, %r1469, %r1466, %p604; + selp.b32 %r38, %r1468, %r1467, %p603; + .loc 1 80 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:38 + setp.gt.u32 %p607, %r38, 16; + selp.b16 %rs73, 1, 0, %p607; + shl.b16 %rs74, %rs73, 2; + setp.gt.u32 %p608, %r39, 16; + selp.b16 %rs75, -1, 0, %p608; + shl.b16 %rs76, %rs75, 3; + or.b16 %rs77, %rs76, %rs74; + setp.gt.u32 %p609, %r36, 16; + selp.b16 %rs78, 1, 0, %p609; + setp.gt.u32 %p610, %r37, 16; + selp.b16 %rs79, -1, 0, %p610; + shl.b16 %rs80, %rs79, 1; + or.b16 %rs81, %rs78, %rs80; + and.b16 %rs82, %rs81, 3; + or.b16 %rs83, %rs82, %rs77; + .loc 1 75 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:75:19 + setp.lt.s32 %p611, %r26, %r1462; + setp.lt.s32 %p612, %r27, %r1462; + setp.lt.s32 %p613, %r24, %r1462; + setp.lt.s32 %p614, %r25, %r1462; + .loc 1 76 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:76:35 + selp.b32 %r1472, %r21, 16, %p614; + selp.b32 %r1473, %r20, 16, %p613; + selp.b32 %r1474, %r23, 16, %p612; + selp.b32 %r1475, %r22, 16, %p611; + .loc 1 77 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:77:20 + add.s32 %r1476, %r1475, 17; + add.s32 %r1477, %r1474, 17; + add.s32 %r1478, %r1473, 17; + add.s32 %r1479, %r1472, 17; + .loc 1 78 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:78:20 + setp.lt.s32 %p615, %r1475, 0; + setp.lt.s32 %p616, %r1474, 0; + setp.lt.s32 %p617, %r1473, 0; + setp.lt.s32 %p618, %r1472, 0; + .loc 1 79 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:79:35 + selp.b32 %r41, %r1479, %r1472, %p618; + selp.b32 %r40, %r1478, %r1473, %p617; + selp.b32 %r43, %r1477, %r1474, %p616; + selp.b32 %r42, %r1476, %r1475, %p615; + .loc 1 80 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:38 + setp.gt.u32 %p619, %r42, 16; + selp.b16 %rs84, 1, 0, %p619; + shl.b16 %rs85, %rs84, 2; + setp.gt.u32 %p620, %r43, 16; + selp.b16 %rs86, -1, 0, %p620; + shl.b16 %rs87, %rs86, 3; + or.b16 %rs88, %rs87, %rs85; + setp.gt.u32 %p621, %r40, 16; + selp.b16 %rs89, 1, 0, %p621; + setp.gt.u32 %p622, %r41, 16; + selp.b16 %rs90, -1, 0, %p622; + shl.b16 %rs91, %rs90, 1; + or.b16 %rs92, %rs89, %rs91; + and.b16 %rs93, %rs92, 3; + or.b16 %rs94, %rs93, %rs88; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + and.b16 %rs95, %rs83, 15; + setp.ne.b16 %p623, %rs95, 0; + and.pred %p624, %p2, %p623; + and.b16 %rs96, %rs94, 15; + setp.ne.b16 %p625, %rs96, 0; + and.pred %p626, %p4, %p625; + or.pred %p627, %p624, %p626; + not.pred %p628, %p627; + @%p628 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0:63 + ld.param.b64 %rd12, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r48; + cvt.s64.s32 %rd2, %r49; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + cvt.u32.u64 %r1481, %rd6; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r1480, %rd3; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r1514, %r1, %r4; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.lt.s32 %p633, %r1514, 128; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + bar.sync 0; + .loc 1 81 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:25 + mul.wide.s32 %rd107, %r1514, 4; + add.s64 %rd85, %rd7, %rd107; + .loc 1 81 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:37 + and.b32 %r1515, %r2, 128; + setp.eq.b32 %p651, %r1515, 0; + and.pred %p629, %p651, %p633; + // begin inline asm + @%p629 st.global.b32 [ %rd85 + 0 ], { %r1480 }; + // end inline asm + .loc 1 82 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:25 + add.s64 %rd86, %rd8, %rd107; + .loc 1 82 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:37 + // begin inline asm + @%p629 st.global.b32 [ %rd86 + 0 ], { %r1481 }; + // end inline asm + .loc 1 83 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:25 + shl.b64 %rd108, %rd1, 2; + add.s64 %rd87, %rd9, %rd108; + shl.b64 %rd109, %rd2, 2; + add.s64 %rd88, %rd9, %rd109; + .loc 1 83 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd87 + 0 ], { %r8, %r9, %r10, %r11 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd88 + 0 ], { %r12, %r13, %r14, %r15 }; + // end inline asm + .loc 1 84 52 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:52 + mul.lo.s32 %r1516, %r5, 17; + add.s32 %r1517, %r1516, 1088; + .loc 1 84 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:49 + add.s32 %r1518, %r28, %r1516; + add.s32 %r1519, %r29, %r1516; + add.s32 %r1520, %r30, %r1516; + add.s32 %r1521, %r31, %r1516; + add.s32 %r1522, %r32, %r1517; + add.s32 %r1523, %r33, %r1517; + add.s32 %r1524, %r34, %r1517; + add.s32 %r1525, %r35, %r1517; + .loc 1 84 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:25 + mad.wide.s32 %rd110, %r1518, 4, %rd10; + mad.wide.s32 %rd111, %r1519, 4, %rd10; + mad.wide.s32 %rd112, %r1520, 4, %rd10; + mad.wide.s32 %rd113, %r1521, 4, %rd10; + mad.wide.s32 %rd114, %r1522, 4, %rd10; + mad.wide.s32 %rd115, %r1523, 4, %rd10; + mad.wide.s32 %rd116, %r1524, 4, %rd10; + mad.wide.s32 %rd117, %r1525, 4, %rd10; + .loc 1 84 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:85 + bar.sync 0; + shl.b32 %r1526, %r2, 9; + and.b32 %r1527, %r1526, 12288; + shl.b32 %r1528, %r7, 5; + shl.b32 %r1529, %r3, 2; + or.b32 %r1530, %r1527, %r1528; + xor.b32 %r1531, %r1530, %r1529; + add.s32 %r1533, %r1424, %r1531; + st.shared.v2.b64 [%r1533], {%rd110, %rd112}; + st.shared.v2.b64 [%r1533+2048], {%rd111, %rd113}; + st.shared.v2.b64 [%r1533+1024], {%rd114, %rd116}; + st.shared.v2.b64 [%r1533+3072], {%rd115, %rd117}; + bar.sync 0; + shl.b32 %r1534, %r2, 11; + and.b32 %r1535, %r1534, 12288; + shl.b32 %r1536, %r2, 4; + and.b32 %r1537, %r1536, 4080; + or.b32 %r1538, %r1535, %r1537; + add.s32 %r1539, %r1424, %r1538; + ld.shared.v2.b64 {%rd89, %rd90}, [%r1539]; + xor.b32 %r1540, %r1538, 32; + add.s32 %r1541, %r1424, %r1540; + ld.shared.v2.b64 {%rd91, %rd92}, [%r1541]; + xor.b32 %r1542, %r1538, 64; + add.s32 %r1543, %r1424, %r1542; + ld.shared.v2.b64 {%rd93, %rd94}, [%r1543]; + xor.b32 %r1544, %r1538, 96; + add.s32 %r1545, %r1424, %r1544; + ld.shared.v2.b64 {%rd95, %rd96}, [%r1545]; + mov.b32 %r1490, 1; + // begin inline asm + @%p633 st.global.b32 [ %rd89 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd90 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd91 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd92 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd93 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd94 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd95 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd96 + 0 ], { %r1490 }; + // end inline asm + .loc 1 85 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:25 + add.s64 %rd97, %rd11, %rd108; + add.s64 %rd98, %rd11, %rd109; + .loc 1 85 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd97 + 0 ], { %r16, %r17, %r18, %r19 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd98 + 0 ], { %r20, %r21, %r22, %r23 }; + // end inline asm + .loc 1 86 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:49 + add.s32 %r1546, %r36, %r1516; + add.s32 %r1547, %r37, %r1516; + add.s32 %r1548, %r38, %r1516; + add.s32 %r1549, %r39, %r1516; + add.s32 %r1550, %r40, %r1517; + add.s32 %r1551, %r41, %r1517; + add.s32 %r1552, %r42, %r1517; + add.s32 %r1553, %r43, %r1517; + .loc 1 86 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:25 + mad.wide.s32 %rd118, %r1546, 4, %rd12; + mad.wide.s32 %rd119, %r1547, 4, %rd12; + mad.wide.s32 %rd120, %r1548, 4, %rd12; + mad.wide.s32 %rd121, %r1549, 4, %rd12; + mad.wide.s32 %rd122, %r1550, 4, %rd12; + mad.wide.s32 %rd123, %r1551, 4, %rd12; + mad.wide.s32 %rd124, %r1552, 4, %rd12; + mad.wide.s32 %rd125, %r1553, 4, %rd12; + .loc 1 86 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:85 + bar.sync 0; + st.shared.v2.b64 [%r1533], {%rd118, %rd120}; + st.shared.v2.b64 [%r1533+2048], {%rd119, %rd121}; + st.shared.v2.b64 [%r1533+1024], {%rd122, %rd124}; + st.shared.v2.b64 [%r1533+3072], {%rd123, %rd125}; + bar.sync 0; + ld.shared.v2.b64 {%rd99, %rd100}, [%r1539]; + ld.shared.v2.b64 {%rd101, %rd102}, [%r1541]; + ld.shared.v2.b64 {%rd103, %rd104}, [%r1543]; + ld.shared.v2.b64 {%rd105, %rd106}, [%r1545]; + // begin inline asm + @%p633 st.global.b32 [ %rd99 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd100 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd101 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd102 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd103 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd104 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd105 + 0 ], { %r1490 }; + // end inline asm + // begin inline asm + @%p633 st.global.b32 [ %rd106 + 0 ], { %r1490 }; + // end inline asm + .loc 1 86 4 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd132, assertFunc_0; + cvta.global.u64 %rd133, %rd132; + st.param.b64 [param3], %rd133; + mov.b64 %rd134, assertFile_0; + cvta.global.u64 %rd135, %rd134; + st.param.b64 [param1], %rd135; + mov.b64 %rd136, assertMessage_0; + cvta.global.u64 %rd137, %rd136; + st.param.b64 [param0], %rd137; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd126, assertFunc_1; + cvta.global.u64 %rd127, %rd126; + st.param.b64 [param3], %rd127; + mov.b64 %rd128, assertFile_1; + cvta.global.u64 %rd129, %rd128; + st.param.b64 [param1], %rd129; + mov.b64 %rd130, assertMessage_1; + cvta.global.u64 %rd131, %rd130; + st.param.b64 [param0], %rd131; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 107 +.b8 120 +.b8 112 +.b8 119 +.b8 105 +.b8 119 +.b8 104 +.b8 122 +.b8 107 +.b8 118 +.b8 117 +.b8 110 +.b8 55 +.b8 105 +.b8 53 +.b8 100 +.b8 50 +.b8 112 +.b8 101 +.b8 103 +.b8 111 +.b8 102 +.b8 116 +.b8 104 +.b8 121 +.b8 102 +.b8 105 +.b8 106 +.b8 110 +.b8 53 +.b8 119 +.b8 121 +.b8 103 +.b8 119 +.b8 110 +.b8 97 +.b8 101 +.b8 118 +.b8 51 +.b8 116 +.b8 119 +.b8 119 +.b8 108 +.b8 114 +.b8 98 +.b8 117 +.b8 111 +.b8 106 +.b8 113 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 114 +.b8 107 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp9 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..4b60b77e0eb173ea6f5ff4d263ca90dff436d53a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 128 : i32 loc(#loc205) + %xoffset_3 = arith.constant 128 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<128x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<128x1xi32> loc(#loc208) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<128x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<128x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<128x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<128x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<128x16xf32> to tensor<128x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<128x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<128x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<128x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<128x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<128x16xi8> to tensor<128x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<128x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<128x16xi1> to tensor<128x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<128x16xi8> to tensor<128x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<128x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<128x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<128x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<128x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<128x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<128x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<128x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<128x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<128x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<128x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<128x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<128x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<128x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<128x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<128x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<128x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<128x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<128x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<128x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<128x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<128x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<128x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<128x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<128x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<128x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<128x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<128x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<128x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<128x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<128x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<128x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<128x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc91)), %idxs: tensor<128x16xi16> loc("idxs"(#loc91))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc94) + %5 = ub.poison : tensor<128x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi16> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi16>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc102) + %2 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi16> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<1024x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<1024x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<128x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<128x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<128x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<128x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<128x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<128x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<128x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S128_16S__(%idxs) : (tensor<128x16xi16>) -> tensor<128x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi32> loc("input"(#loc166))) -> tensor<1024x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc167) + tt.return %0 : tensor<1024x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc169) + tt.return %1 : tensor<1024x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi16> loc("input"(#loc166))) -> tensor<1024x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc167) + tt.return %0 : tensor<1024x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc169) + tt.return %1 : tensor<1024x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc179))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<128x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<128x16xi32> loc(#loc181) + tt.return %4 : tensor<128x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<128x16xi32> loc(#loc183) + tt.return %5 : tensor<128x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S128_16S__(%input: tensor<128x16xi32> loc("input"(#loc188))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<128x16xi32> loc(#loc189) + tt.return %0 : tensor<128x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc191) + tt.return %1 : tensor<128x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<128x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc185) + tt.return %cst : tensor<128x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi32> loc(#loc187) + tt.return %0 : tensor<128x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S128_16S__(%input: tensor<128x16xi16> loc("input"(#loc188))) -> tensor<128x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<128x16xi16> loc(#loc189) + tt.return %0 : tensor<128x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi16> loc(#loc191) + tt.return %1 : tensor<128x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<128x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<128x16xi16> loc(#loc185) + tt.return %cst : tensor<128x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi16> loc(#loc187) + tt.return %0 : tensor<128x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x16xi32> loc(#loc102) + %3 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<512x2x2xi32> loc("input"(#loc166))) -> tensor<512x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc167) + tt.return %0 : tensor<512x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<512x2xi32> loc(#loc169) + tt.return %1 : tensor<512x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<128x16xi32> loc(#loc102) + %4 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: tensor<128x16xi32> loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x4xi32> loc("input"(#loc166))) -> tensor<256x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc167) + tt.return %0 : tensor<256x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x4xi32> loc(#loc169) + tt.return %1 : tensor<256x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc95)), %idxs: tensor<128x16xi32> loc("idxs"(#loc95))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc102) + %5 = ub.poison : tensor<128x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x8xi32> loc("input"(#loc166))) -> tensor<128x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc167) + tt.return %0 : tensor<128x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x8xi32> loc(#loc169) + tt.return %1 : tensor<128x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc103)), %idxs: tensor<128x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc165) + %5 = ub.poison : tensor<128x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x16xi64> loc("input"(#loc166))) -> tensor<128xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc167) + tt.return %0 : tensor<128xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xi64> loc(#loc169) + tt.return %1 : tensor<128xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e9ccd089770033976d50952ba71bae6cbd7940b4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [8, 2, 2], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [16, 2, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 2, 2], threadsPerWarp = [32, 1, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [2, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [8, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<128x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<128x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<128x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<128x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<128x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<128x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<128x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<128x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<128x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<128x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<128x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<128x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<128x1xi1, #blocked5> -> tensor<128x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<128x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<128x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<128x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<128x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<128x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<128x16xi16, #blocked> -> tensor<1024x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<1024x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<1024x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<1024x2x1xi16, #blocked4> to tensor<1024x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<1024x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<1024x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<1024x2x1xi16, #blocked4> to tensor<1024x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<128x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<128x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<128x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<128x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<128x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<128x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<128x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<128x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<128x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<128x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<128x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<128x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<128x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<128x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<128x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<128x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<128x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<128x16xi32, #blocked> -> tensor<128x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<128x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<128x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x8xi32, #blocked1>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<128x1x8xi32, #blocked1> -> tensor<128x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<128x2x8xi32, #blocked1> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<128x16xi32, #blocked> -> tensor<256x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<256x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<256x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x4xi32, #blocked2>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<256x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<256x1x4xi32, #blocked2> -> tensor<256x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<256x2x4xi32, #blocked2> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<512x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<512x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<512x2x2xi32, #blocked3>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<512x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<512x1x2xi32, #blocked3> -> tensor<512x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<512x2x2xi32, #blocked3> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<128x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<128x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<128x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<128x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<1024x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<1024x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<1024x2x1xi32, #blocked4>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<1024x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<1024x1x1xi32, #blocked4> -> tensor<1024x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<1024x2x1xi32, #blocked4> -> tensor<128x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<128x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<128x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<128x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<128x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<128x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<128x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<128x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<128x16xi1, #blocked> to tensor<128x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<128x16xi1, #blocked> to tensor<128x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<128x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<128x1xi64, #blocked5> to tensor<128x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<128x1xi64, #blocked5> to tensor<128x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<128x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<128x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<128x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<128x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<128x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<128x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<128x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<128x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<128x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<128x1x!tt.ptr, #blocked5>, tensor<128x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<128x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<128x1x!tt.ptr, #blocked5>, tensor<128x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<128x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<128x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<128x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<128x16x!tt.ptr, #blocked> -> tensor<128x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<128x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<128x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<128x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<128x16x!tt.ptr, #blocked> -> tensor<128x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<128x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..61c6fecc01b6669e03c665cbf374952cef744b95 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4BXPLEVNIV4ISF7IZIVK7CAM4LM5YGYX34PZFJY2Q7MUVWT7ZGUA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<128x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<128x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<128x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<128x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<128x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<128x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc129) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<128x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<128x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<128x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<128x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<128x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<128x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<128x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<128x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<128x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<1024x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<1024x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<128x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<128x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<128x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<128x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<128x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<128x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<128x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<128x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<128x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<128x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<128x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<128x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<128x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<128x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<128x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<128x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<128x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<128x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<128x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<128x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<128x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<128x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<128x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<128x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<256x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<256x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<256x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<256x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<128x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<128x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<128x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<128x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<128x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<128x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<128x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<128x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<128x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<128x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<128x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<128x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<128x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<128x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<128x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<128x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<128x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<128x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<128x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<128x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<128x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<128x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<128x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<128x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<128x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<128x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<128x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<128x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<128x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<128x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<128x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<128x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<128x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<128x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<256x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<256x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<256x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<256x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<128x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<128x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<128x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<128x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<128x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<128x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<128x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<512x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<512x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<512x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<512x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<128x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<128x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<128x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<128x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<128x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<128x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<128x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<1024x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<1024x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<1024x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<1024x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<128x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<128x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<128x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<128x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<128x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<128x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<128x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<128x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<128x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<128x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<128x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<128x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<128x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<128x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<128x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<128x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<128x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<128x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<128x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<128x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<128x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<128x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<128x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<128x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<128x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<128x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<128x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<128x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<128x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<128x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<256x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<256x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<256x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<256x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<128x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<128x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<128x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<128x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<128x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<128x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<128x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<128x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<128x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<128x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<128x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<128x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<128x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<128x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<128x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<128x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<128x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<128x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<128x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<128x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<128x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<128x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<128x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<128x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<128x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<128x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<128x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<128x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<128x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<128x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<128x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<128x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<128x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<128x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<256x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<256x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<256x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<256x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<128x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<128x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<128x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<128x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<128x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<128x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<128x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<512x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<512x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<512x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<512x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<128x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<128x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<128x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<128x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<128x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<128x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<128x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<1024x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<1024x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<1024x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<1024x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<128x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<128x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<128x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<128x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<128x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<128x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<128x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<128x16xi1> to tensor<128x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<128x16xi1> to tensor<128x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<128x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<128x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<128x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<128x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<128x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<128x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<128x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<128x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<128x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<128x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<128x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<128x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<128x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<128x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<128x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<128x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<128x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<128x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<128x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<128x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<128x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<128x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<128x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..29c20417186500dd90c02241efc6cb56c2bed3ec --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ac08557acaf51bfae56ccd12f822b960862c3ecb Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4877a375dde396b1e1fea7d3511f7465b0529c04 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "e1b3f1564b04bc7aede55cff09d6f5ffbd51b34448fc5f1195c8448b5e81fed8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..48de9763bd2f25683750aa3f542661b3ae3e617f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.llir @@ -0,0 +1,190 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 2, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 96, !dbg !9 + %14 = lshr exact i32 %13, 5, !dbg !9 + %15 = and i32 %12, 3, !dbg !9 + %16 = or disjoint i32 %14, %11, !dbg !10 + %17 = or disjoint i32 %11, %15, !dbg !10 + %18 = icmp slt i32 %16, %5, !dbg !11 + %19 = icmp slt i32 %17, %5, !dbg !11 + %20 = shl nuw nsw i32 %12, 2, !dbg !12 + %21 = and i32 %20, 124, !dbg !12 + %22 = sext i32 %16 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %23 = sdiv i64 %22, %.frozen, !dbg !14 + %24 = mul i64 %23, %.frozen, !dbg !13 + %.decomposed = sub i64 %22, %24, !dbg !13 + %25 = srem i64 %23, 32, !dbg !15 + %26 = sdiv i64 %22, %4, !dbg !16 + %.not = icmp ne i64 %.decomposed, 0, !dbg !17 + %27 = icmp slt i32 %11, 0, !dbg !21 + %28 = icmp slt i64 %3, 0, !dbg !22 + %29 = xor i1 %27, %28, !dbg !23 + %narrow = select i1 %29, i1 %.not, i1 false, !dbg !24 + %30 = sext i1 %narrow to i64, !dbg !24 + %31 = add nsw i64 %23, %30, !dbg !24 + %32 = zext nneg i32 %21 to i64, !dbg !25 + %33 = shl i64 %3, 12, !dbg !26 + %34 = mul i64 %33, %26, !dbg !27 + %.idx = shl nsw i64 %25, 8, !dbg !28 + %35 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx, !dbg !28 + %36 = getelementptr bfloat, ptr addrspace(1) %35, i64 %32, !dbg !28 + %.idx1 = shl nsw i64 %.decomposed, 13, !dbg !28 + %37 = getelementptr i8, ptr addrspace(1) %36, i64 %.idx1, !dbg !28 + %38 = getelementptr bfloat, ptr addrspace(1) %37, i64 %34, !dbg !28 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !29 + %40 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %38, i64 %39, i1 %18) #4, !dbg !29 + %41 = extractvalue { i32, i32 } %40, 0, !dbg !29 + %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !29 + %43 = extractvalue { i32, i32 } %40, 1, !dbg !29 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !29 + %45 = icmp slt i64 %3, 2, !dbg !30 + %46 = icmp sgt i64 %3, 1, !dbg !31 + %47 = select i1 %46, i64 %3, i64 0, !dbg !32 + %48 = zext i1 %45 to i64, !dbg !33 + %49 = add i64 %47, %48, !dbg !34 + %50 = shl i64 %49, 7, !dbg !35 + %51 = mul i64 %50, %31, !dbg !36 + %.idx2 = shl nsw i64 %.decomposed, 8, !dbg !37 + %52 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2, !dbg !37 + %53 = getelementptr bfloat, ptr addrspace(1) %52, i64 %32, !dbg !37 + %54 = getelementptr bfloat, ptr addrspace(1) %53, i64 %51, !dbg !37 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %56 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %54, i64 %55, i1 %18) #4, !dbg !38 + %57 = extractvalue { i32, i32 } %56, 0, !dbg !38 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !38 + %59 = extractvalue { i32, i32 } %56, 1, !dbg !38 + %60 = bitcast i32 %59 to <2 x bfloat>, !dbg !38 + %61 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !39 + %62 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !40 + %63 = fmul <2 x float> %61, %62, !dbg !41 + %64 = fadd <2 x float> %63, zeroinitializer, !dbg !42 + %65 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !39 + %66 = fpext <2 x bfloat> %60 to <2 x float>, !dbg !40 + %67 = fmul <2 x float> %65, %66, !dbg !41 + %68 = fadd <2 x float> %67, zeroinitializer, !dbg !42 + %shift = shufflevector <2 x float> %64, <2 x float> poison, <2 x i32> , !dbg !43 + %foldExtExtBinop = fadd <2 x float> %64, %shift, !dbg !43 + %foldExtExtBinop4 = fadd <2 x float> %68, %foldExtExtBinop, !dbg !43 + %shift6 = shufflevector <2 x float> %68, <2 x float> poison, <2 x i32> , !dbg !43 + %foldExtExtBinop7 = fadd <2 x float> %shift6, %foldExtExtBinop4, !dbg !43 + %69 = extractelement <2 x float> %foldExtExtBinop7, i64 0, !dbg !43 + %70 = select i1 %18, float %69, float 0.000000e+00, !dbg !43 + %71 = bitcast float %70 to i32, !dbg !47 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 16, i32 31), !dbg !47 + %73 = bitcast i32 %72 to float, !dbg !47 + %74 = fadd float %70, %73, !dbg !43 + %75 = bitcast float %74 to i32, !dbg !47 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 8, i32 31), !dbg !47 + %77 = bitcast i32 %76 to float, !dbg !47 + %78 = fadd float %74, %77, !dbg !43 + %79 = bitcast float %78 to i32, !dbg !47 + %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 4, i32 31), !dbg !47 + %81 = bitcast i32 %80 to float, !dbg !47 + %82 = fadd float %78, %81, !dbg !43 + %83 = bitcast float %82 to i32, !dbg !47 + %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 2, i32 31), !dbg !47 + %85 = bitcast i32 %84 to float, !dbg !47 + %86 = fadd float %82, %85, !dbg !43 + %87 = bitcast float %86 to i32, !dbg !47 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !47 + %89 = bitcast i32 %88 to float, !dbg !47 + %90 = fadd float %86, %89, !dbg !43 + %91 = lshr exact i32 %13, 3, !dbg !48 + %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !48 + store float %90, ptr addrspace(3) %92, align 4, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %93 = shl nuw nsw i32 %15, 2, !dbg !48 + %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !48 + %95 = load i32, ptr addrspace(3) %94, align 4, !dbg !48 + %96 = sext i32 %17 to i64, !dbg !49 + %97 = getelementptr float, ptr addrspace(1) %2, i64 %96, !dbg !49 + %98 = and i32 %12, 124, !dbg !50 + %99 = icmp eq i32 %98, 0, !dbg !50 + %100 = and i1 %99, %19, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %95, ptr addrspace(1) %97, i1 %100) #4, !dbg !50 + ret void, !dbg !51 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 21, scope: !4) +!15 = !DILocation(line: 28, column: 28, scope: !4) +!16 = !DILocation(line: 29, column: 19, scope: !4) +!17 = !DILocation(line: 74, column: 34, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 30, column: 51, scope: !4) +!21 = !DILocation(line: 75, column: 25, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 75, column: 36, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 75, column: 32, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 75, column: 47, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 39, column: 41, scope: !4) +!26 = !DILocation(line: 39, column: 65, scope: !4) +!27 = !DILocation(line: 39, column: 69, scope: !4) +!28 = !DILocation(line: 39, column: 34, scope: !4) +!29 = !DILocation(line: 39, column: 74, scope: !4) +!30 = !DILocation(line: 40, column: 73, scope: !4) +!31 = !DILocation(line: 40, column: 99, scope: !4) +!32 = !DILocation(line: 40, column: 90, scope: !4) +!33 = !DILocation(line: 40, scope: !4) +!34 = !DILocation(line: 40, column: 81, scope: !4) +!35 = !DILocation(line: 40, column: 54, scope: !4) +!36 = !DILocation(line: 40, column: 58, scope: !4) +!37 = !DILocation(line: 40, column: 34, scope: !4) +!38 = !DILocation(line: 40, column: 106, scope: !4) +!39 = !DILocation(line: 39, column: 136, scope: !4) +!40 = !DILocation(line: 40, column: 168, scope: !4) +!41 = !DILocation(line: 41, column: 22, scope: !4) +!42 = !DILocation(line: 43, column: 23, scope: !4) +!43 = !DILocation(line: 261, column: 15, scope: !44, inlinedAt: !46) +!44 = distinct !DILexicalBlockFile(scope: !4, file: !45, discriminator: 0) +!45 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!46 = !DILocation(line: 45, column: 25, scope: !4) +!47 = !DILocation(line: 291, column: 36, scope: !44, inlinedAt: !46) +!48 = !DILocation(line: 45, column: 28, scope: !4) +!49 = !DILocation(line: 49, column: 25, scope: !4) +!50 = !DILocation(line: 49, column: 36, scope: !4) +!51 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d718dce9bdf65a6cc2d3b55cb28234fb4a646db2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,476 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 128 +{ + .reg .pred %p<15>; + .reg .b16 %rs<9>; + .reg .b32 %r<62>; + .reg .b64 %rd<53>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r8, %ctaid.x; + .loc 1 22 33 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:33 + shl.b32 %r1, %r8, 2; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r9, %r2, 5, 2; + and.b32 %r4, %r2, 3; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r10, %r9, %r1; + .loc 1 25 37 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:25:37 + shl.b32 %r11, %r2, 2; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.s64.s32 %rd1, %r10; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + or.b64 %rd17, %rd1, %rd14; + and.b64 %rd18, %rd17, -4294967296; + setp.ne.b64 %p1, %rd18, 0; + cvt.u32.u64 %r61, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd51, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r12, %rd14; + div.u32 %r14, %r61, %r12; + cvt.u64.u32 %rd51, %r14; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r7, [triton_red_fused_zeros_0_param_5]; + ld.param.b64 %rd13, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd11, [triton_red_fused_zeros_0_param_0]; + and.b32 %r3, %r2, 96; + or.b32 %r5, %r1, %r4; + and.b32 %r6, %r11, 124; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd19, %rd51, %rd14; + sub.s64 %rd6, %rd1, %rd19; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + shr.s64 %rd20, %rd51, 63; + shr.u64 %rd21, %rd20, 59; + add.s64 %rd22, %rd51, %rd21; + and.b64 %rd23, %rd22, -32; + sub.s64 %rd7, %rd51, %rd23; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + or.b64 %rd24, %rd1, %rd15; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p2, %rd25, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd52, %rd1, %rd15; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r15, %rd15; + div.u32 %r17, %r61, %r15; + cvt.u64.u32 %rd52, %r17; +$L__BB0_6: + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p6, %r5, %r7; + setp.lt.s32 %p3, %r61, %r7; +$L__tmp1: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p7, %rd6, 0; + .loc 2 75 25 // triton_helpers.py:75:25 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s32 %p8, %r1, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p9, %rd14, 0; + .loc 2 75 32 // triton_helpers.py:75:32 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + xor.pred %p10, %p8, %p9; + .loc 2 75 47 // triton_helpers.py:75:47 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + and.pred %p11, %p10, %p7; + selp.b64 %rd33, -1, 0, %p11; + add.s64 %rd34, %rd51, %rd33; +$L__tmp2: + .loc 1 39 69 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:69 + mul.lo.s64 %rd35, %rd14, %rd52; + .loc 1 39 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:34 + shl.b64 %rd36, %rd7, 8; + add.s64 %rd37, %rd11, %rd36; + mul.wide.u32 %rd38, %r6, 2; + add.s64 %rd39, %rd37, %rd38; + shl.b64 %rd40, %rd6, 13; + add.s64 %rd41, %rd39, %rd40; + shl.b64 %rd42, %rd35, 13; + add.s64 %rd27, %rd41, %rd42; + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd28, 1.0; + // end inline asm + mov.b32 %r20, 0; + // begin inline asm + mov.u32 %r18, %r20; + mov.u32 %r19, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd27 + 0 ], %rd28; + // end inline asm + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p12, %rd14, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p13, %rd14, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd43, %rd14, 0, %p13; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd44, 1, 0, %p12; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd45, %rd43, %rd44; + .loc 1 40 58 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:58 + mul.lo.s64 %rd46, %rd45, %rd34; + .loc 1 40 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:34 + shl.b64 %rd47, %rd6, 8; + add.s64 %rd48, %rd12, %rd47; + add.s64 %rd49, %rd48, %rd38; + shl.b64 %rd50, %rd46, 8; + add.s64 %rd30, %rd49, %rd50; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r20; + mov.u32 %r23, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r22, %r23 }, [ %rd30 + 0 ], %rd31; + // end inline asm + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs1, %rs2}, %r18; + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs3, %rs4}, %r22; + cvt.f32.bf16 %r30, %rs3; + cvt.f32.bf16 %r31, %rs4; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r32, %r29, %r31, 0f00000000; + fma.rn.f32 %r33, %r28, %r30, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs5, %rs6}, %r19; + cvt.f32.bf16 %r34, %rs5; + cvt.f32.bf16 %r35, %rs6; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs7, %rs8}, %r23; + cvt.f32.bf16 %r36, %rs7; + cvt.f32.bf16 %r37, %rs8; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r38, %r35, %r37, 0f00000000; + fma.rn.f32 %r39, %r34, %r36, 0f00000000; +$L__tmp3: + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r40, %r33, %r32; + add.f32 %r41, %r39, %r40; + add.f32 %r42, %r38, %r41; + selp.f32 %r43, %r42, 0f00000000, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r45, %r43, %r44; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r47, %r45, %r46; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r49, %r47, %r48; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r51, %r49, %r50; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r53, %r51, %r52; +$L__tmp4: + .loc 1 45 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:28 + shr.u32 %r54, %r3, 3; + mov.b32 %r55, global_smem; + add.s32 %r56, %r55, %r54; + st.shared.b32 [%r56], %r53; + bar.sync 0; + shl.b32 %r57, %r4, 2; + add.s32 %r58, %r55, %r57; + ld.shared.b32 %r26, [%r58]; + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + mad.wide.s32 %rd32, %r5, 4, %rd13; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r59, %r2, 124; + setp.eq.b32 %p14, %r59, 0; + and.pred %p5, %p14, %p6; + // begin inline asm + @%p5 st.global.b32 [ %rd32 + 0 ], { %r26 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..0347d2c226e2bd59840ccfef62796db70364d778 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.source @@ -0,0 +1,325 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 4 : i32 loc(#loc86) + %xoffset_2 = arith.constant 4 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<4x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<4x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<4x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<4x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<4x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<4x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<4x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<4x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<4x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<4x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<4x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S4_1S_i64__(%xindex_6, %ks0) : (tensor<4x1xi32>, i64) -> tensor<4x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x128xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x128xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<4x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc103) + %tmp0_31 = tt.broadcast %tmp0_28 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc103) + %tmp0_32 = arith.addi %tmp0_30, %tmp0_31 : tensor<4x128xi64> loc(#loc103) + %tmp0_33 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_34 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_35 = arith.constant dense<4096> : tensor<4x1xi64> loc(#loc104) + %tmp0_36 = arith.muli %tmp0_35, %x0_10 : tensor<4x1xi64> loc(#loc104) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc105) + %tmp0_38 = arith.addi %tmp0_32, %tmp0_37 : tensor<4x128xi64> loc(#loc105) + %tmp0_39 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_40 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_41 = arith.muli %tmp0_40, %ks0 : i64 loc(#loc106) + %tmp0_42 = tt.splat %tmp0_41 : i64 -> tensor<4x1xi64> loc(#loc107) + %tmp0_43 = arith.muli %tmp0_42, %x2_18 : tensor<4x1xi64> loc(#loc107) + %tmp0_44 = tt.broadcast %tmp0_43 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc108) + %tmp0_45 = arith.addi %tmp0_38, %tmp0_44 : tensor<4x128xi64> loc(#loc108) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc109) + %tmp0_47 = tt.addptr %tmp0_46, %tmp0_45 : tensor<4x128x!tt.ptr>, tensor<4x128xi64> loc(#loc109) + %tmp0_48 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc110) + %tmp0_49 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc110) + %tmp0_50 = arith.andi %tmp0_48, %tmp0_49 : tensor<4x128xi1> loc(#loc110) + %tmp0_51 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc111) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc111) + %tmp0_54 = tt.load %tmp0_47, %tmp0_50, %tmp0_53 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc111) + %tmp0_55 = arith.extf %tmp0_54 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_56 = arith.constant 128 : i64 loc(#loc113) + %tmp1_57 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc113) + %tmp1_58 = arith.muli %tmp1_57, %x0_10 : tensor<4x1xi64> loc(#loc113) + %tmp1_59 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc114) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc114) + %tmp1_61 = tt.broadcast %tmp1_58 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc114) + %tmp1_62 = arith.addi %tmp1_60, %tmp1_61 : tensor<4x128xi64> loc(#loc114) + %tmp1_63 = arith.constant 128 : i32 loc(#loc115) + %tmp1_64 = arith.constant 128 : i64 loc(#loc115) + %tmp1_65 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc115) + %tmp1_66 = arith.muli %tmp1_65, %x5 : tensor<4x1xi64> loc(#loc115) + %tmp1_67 = arith.constant 1 : i32 loc(#loc116) + %tmp1_68 = arith.extsi %tmp1_67 : i32 to i64 loc(#loc116) + %tmp1_69 = arith.cmpi sge, %tmp1_68, %ks0 : i64 loc(#loc116) + %tmp1_70 = arith.constant 1 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc117) + %tmp1_72 = arith.extui %tmp1_69 : i1 to i32 loc(#loc117) + %tmp1_73 = arith.muli %tmp1_71, %tmp1_72 : i32 loc(#loc117) + %tmp1_74 = arith.constant 1 : i32 loc(#loc118) + %tmp1_75 = arith.extsi %tmp1_74 : i32 to i64 loc(#loc118) + %tmp1_76 = arith.cmpi sgt, %ks0, %tmp1_75 : i64 loc(#loc118) + %tmp1_77 = arith.extui %tmp1_76 : i1 to i64 loc(#loc119) + %tmp1_78 = arith.muli %ks0, %tmp1_77 : i64 loc(#loc119) + %tmp1_79 = arith.extsi %tmp1_73 : i32 to i64 loc(#loc120) + %tmp1_80 = arith.addi %tmp1_79, %tmp1_78 : i64 loc(#loc120) + %tmp1_81 = tt.splat %tmp1_80 : i64 -> tensor<4x1xi64> loc(#loc121) + %tmp1_82 = arith.muli %tmp1_66, %tmp1_81 : tensor<4x1xi64> loc(#loc121) + %tmp1_83 = tt.broadcast %tmp1_82 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc122) + %tmp1_84 = arith.addi %tmp1_62, %tmp1_83 : tensor<4x128xi64> loc(#loc122) + %tmp1_85 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc123) + %tmp1_86 = tt.addptr %tmp1_85, %tmp1_84 : tensor<4x128x!tt.ptr>, tensor<4x128xi64> loc(#loc123) + %tmp1_87 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc124) + %tmp1_88 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc124) + %tmp1_89 = arith.andi %tmp1_87, %tmp1_88 : tensor<4x128xi1> loc(#loc124) + %tmp1_90 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_91 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc125) + %tmp1_92 = arith.truncf %tmp1_91 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc125) + %tmp1_93 = tt.load %tmp1_86, %tmp1_89, %tmp1_92 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc125) + %tmp1_94 = arith.extf %tmp1_93 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_55, %tmp1_94 : tensor<4x128xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<4x128xf32> loc(#loc128) + %_tmp4_95 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc129) + %_tmp4_96 = tt.broadcast %xmask_7 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc129) + %_tmp4_97 = arith.andi %_tmp4_95, %_tmp4_96 : tensor<4x128xi1> loc(#loc129) + %_tmp4_98 = arith.select %_tmp4_97, %tmp5, %_tmp4_23 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc130) + scf.yield %_tmp4_98 : tensor<4x128xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<4x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<4x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<4x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S4_1S_i64__(%a: tensor<4x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<4x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<4x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<4x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<4x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<4x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<4x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<4x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<4x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<4x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<4x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<4x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<4x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<4x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc65) + tt.return %5 : tensor<4x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<4x1xi64> loc(#loc67) + tt.return %6 : tensor<4x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc68))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc69) + tt.return %0 : tensor<4xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc72) + tt.return %1 : tensor<4xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5a0ae1ea1150b567557d90b4d3697530bb6e8016 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,220 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc55 = loc("in_ptr0"(#loc)) +#loc56 = loc("in_ptr1"(#loc)) +#loc57 = loc("out_ptr1"(#loc)) +#loc58 = loc("ks0"(#loc)) +#loc59 = loc("ks1"(#loc)) +#loc60 = loc("xnumel"(#loc)) +#loc61 = loc("r0_numel"(#loc)) +#loc104 = loc("tmp4"(#loc49)) +#loc115 = loc(callsite(#loc1 at #loc104)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<4x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<4x1xi64, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<4x1xi64, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xoffset_8 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc63) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc64) + %xindex_9 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc64) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc64) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc64) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked> loc(#loc65) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc65) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<4x1xi32, #blocked> loc(#loc65) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<4x1xi32, #blocked1> loc(#loc65) + %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32, #blocked> loc(#loc66) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<4x1xi32, #blocked1> loc(#loc66) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<4x1xi32, #blocked> loc(#loc66) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<4x1xi32, #blocked1> loc(#loc66) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc67) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc67) + %x0 = arith.extsi %xindex_14 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked> loc(#loc68) + %x0_20 = tt.splat %ks0 : i64 -> tensor<4x1xi64, #blocked> loc(#loc68) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<4x1xi64, #blocked> loc(#loc68) + %x1 = arith.divsi %x0, %x0_20 : tensor<4x1xi64, #blocked> loc(#loc69) + %x1_22 = arith.remsi %x1, %cst_5 : tensor<4x1xi64, #blocked> loc(#loc70) + %x2 = tt.splat %ks1 : i64 -> tensor<4x1xi64, #blocked> loc(#loc71) + %x2_23 = arith.divsi %x0, %x2 : tensor<4x1xi64, #blocked> loc(#loc71) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<4x1xi64, #blocked> loc(#loc106) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<4x1xi64, #blocked> loc(#loc107) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked> loc(#loc108) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc109) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc110) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<4x1xi1, #blocked> loc(#loc111) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<4x1xi1, #blocked> loc(#loc111) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked> loc(#loc112) + %r0_mask = arith.cmpi slt, %r0_base_19, %cst : tensor<1x128xi32, #blocked> loc(#loc76) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<4x1xi64, #blocked> loc(#loc77) + %tmp0_30 = arith.extsi %r0_base_19 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc78) + %tmp0_31 = tt.broadcast %tmp0_30 : tensor<1x128xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc78) + %tmp0_32 = tt.broadcast %tmp0 : tensor<4x1xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc78) + %tmp0_33 = arith.addi %tmp0_31, %tmp0_32 : tensor<4x128xi64, #blocked> loc(#loc78) + %tmp0_34 = arith.muli %x0_21, %cst_1 : tensor<4x1xi64, #blocked> loc(#loc79) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<4x1xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc80) + %tmp0_36 = arith.addi %tmp0_33, %tmp0_35 : tensor<4x128xi64, #blocked> loc(#loc80) + %tmp0_37 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc81) + %tmp0_38 = tt.splat %tmp0_37 : i64 -> tensor<4x1xi64, #blocked> loc(#loc82) + %tmp0_39 = arith.muli %tmp0_38, %x2_23 : tensor<4x1xi64, #blocked> loc(#loc82) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<4x1xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc83) + %tmp0_41 = arith.addi %tmp0_36, %tmp0_40 : tensor<4x128xi64, #blocked> loc(#loc83) + %tmp0_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc84) + %tmp0_43 = tt.addptr %tmp0_42, %tmp0_41 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi64, #blocked> loc(#loc84) + %tmp0_44 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc85) + %tmp0_45 = tt.broadcast %xmask_17 : tensor<4x1xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc85) + %tmp0_46 = arith.andi %tmp0_44, %tmp0_45 : tensor<4x128xi1, #blocked> loc(#loc85) + %tmp0_47 = tt.load %tmp0_43, %tmp0_46, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc86) + %tmp0_48 = arith.extf %tmp0_47 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<4x1xi64, #blocked> loc(#loc88) + %tmp1_49 = tt.broadcast %tmp1 : tensor<4x1xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc89) + %tmp1_50 = arith.addi %tmp0_31, %tmp1_49 : tensor<4x128xi64, #blocked> loc(#loc89) + %tmp1_51 = arith.muli %x5_29, %cst_0 : tensor<4x1xi64, #blocked> loc(#loc90) + %tmp1_52 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_53 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_54 = arith.extui %tmp1_53 : i1 to i64 loc(#loc93) + %tmp1_55 = arith.muli %ks0, %tmp1_54 : i64 loc(#loc93) + %tmp1_56 = arith.extui %tmp1_52 : i1 to i64 loc(#loc113) + %tmp1_57 = arith.addi %tmp1_56, %tmp1_55 : i64 loc(#loc94) + %tmp1_58 = tt.splat %tmp1_57 : i64 -> tensor<4x1xi64, #blocked> loc(#loc96) + %tmp1_59 = arith.muli %tmp1_51, %tmp1_58 : tensor<4x1xi64, #blocked> loc(#loc96) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<4x1xi64, #blocked> -> tensor<4x128xi64, #blocked> loc(#loc97) + %tmp1_61 = arith.addi %tmp1_50, %tmp1_60 : tensor<4x128xi64, #blocked> loc(#loc97) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc98) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi64, #blocked> loc(#loc98) + %tmp1_64 = tt.load %tmp1_63, %tmp0_46, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc99) + %tmp1_65 = arith.extf %tmp1_64 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc100) + %tmp2 = arith.mulf %tmp0_48, %tmp1_65 : tensor<4x128xf32, #blocked> loc(#loc101) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<4x128xf32, #blocked> loc(#loc102) + %_tmp4 = arith.select %tmp0_46, %tmp5, %cst_7 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc103) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_68: f32 loc(callsite(#loc1 at #loc104)), %tmp4_69: f32 loc(callsite(#loc1 at #loc104))): + %tmp4_70 = arith.addf %tmp4_68, %tmp4_69 : f32 loc(#loc116) + tt.reduce.return %tmp4_70 : f32 loc(#loc114) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc114) + %tmp4_66 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc105) + %tmp4_67 = tt.expand_dims %tmp4_66 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc105) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc52) + %1 = tt.addptr %0, %xindex_15 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc52) + tt.store %1, %tmp4_67, %xmask_18 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc53) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc62 = loc("xoffset"(#loc2)) +#loc63 = loc("xoffset"(#loc3)) +#loc64 = loc("xindex"(#loc4)) +#loc65 = loc("xindex"(#loc5)) +#loc66 = loc("xmask"(#loc6)) +#loc67 = loc("r0_base"(#loc7)) +#loc68 = loc("x0"(#loc8)) +#loc69 = loc("x1"(#loc9)) +#loc70 = loc("x1"(#loc10)) +#loc71 = loc("x2"(#loc11)) +#loc72 = loc("fixed"(#loc12)) +#loc73 = loc("x5"(#loc13)) +#loc74 = loc("fixed"(#loc14)) +#loc75 = loc("fixed"(#loc15)) +#loc76 = loc("r0_mask"(#loc20)) +#loc77 = loc("tmp0"(#loc21)) +#loc78 = loc("tmp0"(#loc22)) +#loc79 = loc("tmp0"(#loc23)) +#loc80 = loc("tmp0"(#loc24)) +#loc81 = loc("tmp0"(#loc25)) +#loc82 = loc("tmp0"(#loc26)) +#loc83 = loc("tmp0"(#loc27)) +#loc84 = loc("tmp0"(#loc28)) +#loc85 = loc("tmp0"(#loc29)) +#loc86 = loc("tmp0"(#loc30)) +#loc87 = loc("tmp0"(#loc31)) +#loc88 = loc("tmp1"(#loc32)) +#loc89 = loc("tmp1"(#loc33)) +#loc90 = loc("tmp1"(#loc34)) +#loc91 = loc("tmp1"(#loc35)) +#loc92 = loc("tmp1"(#loc36)) +#loc93 = loc("tmp1"(#loc37)) +#loc94 = loc("tmp1"(#loc38)) +#loc95 = loc("tmp1"(#loc39)) +#loc96 = loc("tmp1"(#loc40)) +#loc97 = loc("tmp1"(#loc41)) +#loc98 = loc("tmp1"(#loc42)) +#loc99 = loc("tmp1"(#loc43)) +#loc100 = loc("tmp1"(#loc44)) +#loc101 = loc("tmp2"(#loc45)) +#loc102 = loc("tmp5"(#loc46)) +#loc103 = loc("_tmp4"(#loc47)) +#loc105 = loc("tmp4"(#loc51)) +#loc106 = loc(callsite(#loc72 at #loc73)) +#loc107 = loc(callsite(#loc74 at #loc73)) +#loc108 = loc(callsite(#loc75 at #loc73)) +#loc109 = loc(callsite(#loc16 at #loc73)) +#loc110 = loc(callsite(#loc17 at #loc73)) +#loc111 = loc(callsite(#loc18 at #loc73)) +#loc112 = loc(callsite(#loc19 at #loc73)) +#loc113 = loc(fused[#loc94, #loc95]) +#loc114 = loc(callsite(#loc48 at #loc104)) +#loc116 = loc(callsite(#loc50 at #loc114)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..420584eaa8b2ab4a340b7b3ec60c2ae86fa7baa2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4GZ7CVSLAS6HV3PFLT7QTVXV766VDM2EJD6F6EMVZBCIWXUB73MA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,215 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc6 = loc(unknown) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("in_ptr1"(#loc)) +#loc59 = loc("out_ptr1"(#loc)) +#loc60 = loc("ks0"(#loc)) +#loc61 = loc("ks1"(#loc)) +#loc62 = loc("xnumel"(#loc)) +#loc63 = loc("r0_numel"(#loc)) +#loc108 = loc("tmp4"(#loc51)) +#loc119 = loc(callsite(#loc6 at #loc108)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %fixed = arith.constant dense<1> : tensor<4x1xi64> loc(#loc110) + %x5 = arith.constant dense<0> : tensor<4x1xi32> loc(#loc111) + %fixed_0 = arith.constant dense<0> : tensor<4x1xi64> loc(#loc112) + %x5_1 = arith.constant 0 : i64 loc(#loc113) + %c1_i64 = arith.constant 1 : i64 loc(#loc6) + %cst = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc6) + %cst_2 = arith.constant dense<4096> : tensor<4x1xi64> loc(#loc6) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc6) + %cst_3 = arith.constant dense<128> : tensor<4x1xi64> loc(#loc6) + %cst_4 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc6) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc6) + %x1 = arith.constant dense<32> : tensor<4x1xi64> loc(#loc67) + %c4_i32 = arith.constant 4 : i32 loc(#loc6) + %xoffset = tt.get_program_id x : i32 loc(#loc68) + %xoffset_6 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc69) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc70) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc71) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<4x1xi32> loc(#loc72) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<4x1xi32> loc(#loc72) + %xmask = tt.splat %xnumel : i32 -> tensor<4x1xi32> loc(#loc73) + %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<4x1xi32> loc(#loc73) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc74) + %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc75) + %x0 = arith.extsi %xindex_9 : tensor<4x1xi32> to tensor<4x1xi64> loc(#loc76) + %x0_12 = tt.splat %ks0 : i64 -> tensor<4x1xi64> loc(#loc76) + %x0_13 = arith.remsi %x0, %x0_12 : tensor<4x1xi64> loc(#loc76) + %x1_14 = arith.divsi %x0, %x0_12 : tensor<4x1xi64> loc(#loc77) + %x1_15 = arith.remsi %x1_14, %x1 : tensor<4x1xi64> loc(#loc67) + %x2 = tt.splat %ks1 : i64 -> tensor<4x1xi64> loc(#loc78) + %x2_16 = arith.divsi %x0, %x2 : tensor<4x1xi64> loc(#loc78) + %fixed_17 = arith.cmpi ne, %x0_13, %fixed_0 : tensor<4x1xi64> loc(#loc112) + %fixed_18 = arith.subi %x1_14, %fixed : tensor<4x1xi64> loc(#loc110) + %fixed_19 = arith.select %fixed_17, %fixed_18, %x1_14 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc114) + %x5_20 = arith.cmpi slt, %xindex_9, %x5 : tensor<4x1xi32> loc(#loc111) + %x5_21 = arith.cmpi slt, %ks0, %x5_1 : i64 loc(#loc113) + %x5_22 = tt.splat %x5_21 : i1 -> tensor<4x1xi1> loc(#loc115) + %x5_23 = arith.cmpi ne, %x5_20, %x5_22 : tensor<4x1xi1> loc(#loc115) + %x5_24 = arith.select %x5_23, %fixed_19, %x1_14 : tensor<4x1xi1>, tensor<4x1xi64> loc(#loc116) + %r0_mask = arith.cmpi slt, %r0_base_11, %cst_4 : tensor<1x128xi32> loc(#loc80) + %tmp0 = arith.muli %x1_15, %cst_3 : tensor<4x1xi64> loc(#loc81) + %tmp0_25 = arith.extsi %r0_base_11 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc82) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x128xi64> -> tensor<4x128xi64> loc(#loc82) + %tmp0_27 = tt.broadcast %tmp0 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc82) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<4x128xi64> loc(#loc82) + %tmp0_29 = arith.muli %x0_13, %cst_2 : tensor<4x1xi64> loc(#loc83) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc84) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<4x128xi64> loc(#loc84) + %tmp0_32 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc85) + %tmp0_33 = tt.splat %tmp0_32 : i64 -> tensor<4x1xi64> loc(#loc86) + %tmp0_34 = arith.muli %tmp0_33, %x2_16 : tensor<4x1xi64> loc(#loc86) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc87) + %tmp0_36 = arith.addi %tmp0_31, %tmp0_35 : tensor<4x128xi64> loc(#loc87) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc88) + %tmp0_38 = tt.addptr %tmp0_37, %tmp0_36 : tensor<4x128x!tt.ptr>, tensor<4x128xi64> loc(#loc88) + %tmp0_39 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc89) + %tmp0_40 = tt.broadcast %xmask_10 : tensor<4x1xi1> -> tensor<4x128xi1> loc(#loc89) + %tmp0_41 = arith.andi %tmp0_39, %tmp0_40 : tensor<4x128xi1> loc(#loc89) + %tmp0_42 = tt.load %tmp0_38, %tmp0_41, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc90) + %tmp0_43 = arith.extf %tmp0_42 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc91) + %tmp1 = arith.muli %x0_13, %cst_3 : tensor<4x1xi64> loc(#loc92) + %tmp1_44 = tt.broadcast %tmp1 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc93) + %tmp1_45 = arith.addi %tmp0_26, %tmp1_44 : tensor<4x128xi64> loc(#loc93) + %tmp1_46 = arith.muli %x5_24, %cst_3 : tensor<4x1xi64> loc(#loc94) + %tmp1_47 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp1_48 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp1_49 = arith.extui %tmp1_48 : i1 to i64 loc(#loc97) + %tmp1_50 = arith.muli %ks0, %tmp1_49 : i64 loc(#loc97) + %tmp1_51 = arith.extui %tmp1_47 : i1 to i64 loc(#loc117) + %tmp1_52 = arith.addi %tmp1_51, %tmp1_50 : i64 loc(#loc98) + %tmp1_53 = tt.splat %tmp1_52 : i64 -> tensor<4x1xi64> loc(#loc100) + %tmp1_54 = arith.muli %tmp1_46, %tmp1_53 : tensor<4x1xi64> loc(#loc100) + %tmp1_55 = tt.broadcast %tmp1_54 : tensor<4x1xi64> -> tensor<4x128xi64> loc(#loc101) + %tmp1_56 = arith.addi %tmp1_45, %tmp1_55 : tensor<4x128xi64> loc(#loc101) + %tmp1_57 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc102) + %tmp1_58 = tt.addptr %tmp1_57, %tmp1_56 : tensor<4x128x!tt.ptr>, tensor<4x128xi64> loc(#loc102) + %tmp1_59 = tt.load %tmp1_58, %tmp0_41, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc103) + %tmp1_60 = arith.extf %tmp1_59 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc104) + %tmp2 = arith.mulf %tmp0_43, %tmp1_60 : tensor<4x128xf32> loc(#loc105) + %tmp5 = arith.addf %tmp2, %cst_5 : tensor<4x128xf32> loc(#loc106) + %_tmp4 = arith.select %tmp0_41, %tmp5, %cst_5 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc107) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_62: f32 loc(callsite(#loc6 at #loc108)), %tmp4_63: f32 loc(callsite(#loc6 at #loc108))): + %tmp4_64 = arith.addf %tmp4_62, %tmp4_63 : f32 loc(#loc120) + tt.reduce.return %tmp4_64 : f32 loc(#loc118) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc118) + %tmp4_61 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc109) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc54) + %1 = tt.addptr %0, %xindex_9 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc54) + tt.store %1, %tmp4_61, %xmask_10 : tensor<4x1x!tt.ptr> loc(#loc55) + tt.return loc(#loc56) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc4 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc64 = loc("fixed"(#loc1)) +#loc65 = loc("x5"(#loc2)) +#loc66 = loc("fixed"(#loc4)) +#loc67 = loc("x1"(#loc7)) +#loc68 = loc("xoffset"(#loc8)) +#loc69 = loc("xoffset"(#loc9)) +#loc70 = loc("xindex"(#loc10)) +#loc71 = loc("xindex"(#loc11)) +#loc72 = loc("xindex"(#loc12)) +#loc73 = loc("xmask"(#loc13)) +#loc74 = loc("r0_base"(#loc14)) +#loc75 = loc("r0_base"(#loc15)) +#loc76 = loc("x0"(#loc16)) +#loc77 = loc("x1"(#loc17)) +#loc78 = loc("x2"(#loc18)) +#loc79 = loc("fixed"(#loc19)) +#loc80 = loc("r0_mask"(#loc22)) +#loc81 = loc("tmp0"(#loc23)) +#loc82 = loc("tmp0"(#loc24)) +#loc83 = loc("tmp0"(#loc25)) +#loc84 = loc("tmp0"(#loc26)) +#loc85 = loc("tmp0"(#loc27)) +#loc86 = loc("tmp0"(#loc28)) +#loc87 = loc("tmp0"(#loc29)) +#loc88 = loc("tmp0"(#loc30)) +#loc89 = loc("tmp0"(#loc31)) +#loc90 = loc("tmp0"(#loc32)) +#loc91 = loc("tmp0"(#loc33)) +#loc92 = loc("tmp1"(#loc34)) +#loc93 = loc("tmp1"(#loc35)) +#loc94 = loc("tmp1"(#loc36)) +#loc95 = loc("tmp1"(#loc37)) +#loc96 = loc("tmp1"(#loc38)) +#loc97 = loc("tmp1"(#loc39)) +#loc98 = loc("tmp1"(#loc40)) +#loc99 = loc("tmp1"(#loc41)) +#loc100 = loc("tmp1"(#loc42)) +#loc101 = loc("tmp1"(#loc43)) +#loc102 = loc("tmp1"(#loc44)) +#loc103 = loc("tmp1"(#loc45)) +#loc104 = loc("tmp1"(#loc46)) +#loc105 = loc("tmp2"(#loc47)) +#loc106 = loc("tmp5"(#loc48)) +#loc107 = loc("_tmp4"(#loc49)) +#loc109 = loc("tmp4"(#loc53)) +#loc110 = loc(callsite(#loc64 at #loc65)) +#loc111 = loc(callsite(#loc3 at #loc65)) +#loc112 = loc(callsite(#loc66 at #loc65)) +#loc113 = loc(callsite(#loc5 at #loc65)) +#loc114 = loc(callsite(#loc79 at #loc65)) +#loc115 = loc(callsite(#loc20 at #loc65)) +#loc116 = loc(callsite(#loc21 at #loc65)) +#loc117 = loc(fused[#loc98, #loc99]) +#loc118 = loc(callsite(#loc50 at #loc108)) +#loc120 = loc(callsite(#loc52 at #loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ec6deda1ccb0449488c1d84db4a8591867ef24f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e26f85c24f4af797390a4e2a0cb73ea808162ace Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..dd1bba5c31c1a1387faab82c2a18d6a1fb5b95f5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "e386d88e99cc3c317fd2667573738d591a2c1800052620474ace50b4cd34f7ee", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..fcbbccbefb39766a8f0175aab01dacc0d423baf9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 2, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 96, !dbg !9 + %12 = lshr exact i32 %11, 5, !dbg !9 + %13 = and i32 %10, 3, !dbg !9 + %14 = or disjoint i32 %12, %9, !dbg !10 + %15 = or disjoint i32 %9, %13, !dbg !10 + %16 = shl nuw nsw i32 %10, 2, !dbg !11 + %17 = and i32 %16, 124, !dbg !11 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %18, 2048, !dbg !13 + %.decomposed = sub i32 %14, %19, !dbg !13 + %20 = srem i32 %18, 32, !dbg !14 + %21 = sdiv i32 %14, 65536, !dbg !15 + %22 = shl nsw i32 %20, 7, !dbg !16 + %23 = shl nsw i32 %.decomposed, 12, !dbg !17 + %24 = shl i32 %21, 23, !dbg !18 + %25 = or disjoint i32 %23, %17, !dbg !19 + %26 = add i32 %25, %24, !dbg !20 + %27 = add i32 %26, %22, !dbg !21 + %28 = sext i32 %27 to i64, !dbg !22 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !22 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !23 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !23 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !23 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !23 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !23 + %36 = shl i32 %14, 7, !dbg !24 + %37 = or disjoint i32 %36, %17, !dbg !25 + %38 = sext i32 %37 to i64, !dbg !26 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !26 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 true) #4, !dbg !27 + %42 = extractvalue { i32, i32 } %41, 0, !dbg !27 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !27 + %44 = extractvalue { i32, i32 } %41, 1, !dbg !27 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !27 + %46 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !28 + %47 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !29 + %48 = fmul <2 x float> %46, %47, !dbg !30 + %49 = fadd <2 x float> %48, zeroinitializer, !dbg !31 + %50 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !28 + %51 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !29 + %52 = fmul <2 x float> %50, %51, !dbg !30 + %53 = fadd <2 x float> %52, zeroinitializer, !dbg !31 + %shift = shufflevector <2 x float> %49, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop = fadd <2 x float> %49, %shift, !dbg !32 + %foldExtExtBinop2 = fadd <2 x float> %53, %foldExtExtBinop, !dbg !32 + %shift4 = shufflevector <2 x float> %53, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop5 = fadd <2 x float> %shift4, %foldExtExtBinop2, !dbg !32 + %54 = extractelement <2 x float> %foldExtExtBinop5, i64 0, !dbg !32 + %55 = bitcast float %54 to i32, !dbg !36 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 16, i32 31), !dbg !36 + %57 = bitcast i32 %56 to float, !dbg !36 + %58 = fadd float %54, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !36 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !36 + %61 = bitcast i32 %60 to float, !dbg !36 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %62 to i32, !dbg !36 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 4, i32 31), !dbg !36 + %65 = bitcast i32 %64 to float, !dbg !36 + %66 = fadd float %62, %65, !dbg !32 + %67 = bitcast float %66 to i32, !dbg !36 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 2, i32 31), !dbg !36 + %69 = bitcast i32 %68 to float, !dbg !36 + %70 = fadd float %66, %69, !dbg !32 + %71 = bitcast float %70 to i32, !dbg !36 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %73 = bitcast i32 %72 to float, !dbg !36 + %74 = fadd float %70, %73, !dbg !32 + %75 = lshr exact i32 %11, 3, !dbg !37 + %76 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %75, !dbg !37 + store float %74, ptr addrspace(3) %76, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %77 = shl nuw nsw i32 %13, 2, !dbg !37 + %78 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %77, !dbg !37 + %79 = load i32, ptr addrspace(3) %78, align 4, !dbg !37 + %80 = sext i32 %15 to i64, !dbg !38 + %81 = getelementptr float, ptr addrspace(1) %2, i64 %80, !dbg !38 + %82 = and i32 %10, 124, !dbg !39 + %83 = icmp eq i32 %82, 0, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %81, i1 %83) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 39, column: 41, scope: !4) +!20 = !DILocation(line: 39, column: 50, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 45, scope: !4) +!25 = !DILocation(line: 40, column: 41, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 39, column: 127, scope: !4) +!29 = !DILocation(line: 40, column: 104, scope: !4) +!30 = !DILocation(line: 41, column: 22, scope: !4) +!31 = !DILocation(line: 43, column: 23, scope: !4) +!32 = !DILocation(line: 261, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !34, discriminator: 0) +!34 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!35 = !DILocation(line: 45, column: 25, scope: !4) +!36 = !DILocation(line: 291, column: 36, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 45, column: 28, scope: !4) +!38 = !DILocation(line: 49, column: 25, scope: !4) +!39 = !DILocation(line: 49, column: 36, scope: !4) +!40 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..83bda27a6c30e87466dd6f5a4f8c22e88a6693de --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<72>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:33 + shl.b32 %r11, %r10, 2; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + mov.u32 %r12, %tid.x; + and.b32 %r13, %r12, 96; + bfe.u32 %r14, %r12, 5, 2; + and.b32 %r15, %r12, 3; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r16, %r14, %r11; + or.b32 %r17, %r11, %r15; + .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37 + shl.b32 %r18, %r12, 2; + and.b32 %r19, %r18, 124; + .loc 1 29 21 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:21 + bfe.s32 %r20, %r10, 29, 1; + shr.u32 %r21, %r20, 21; + add.s32 %r22, %r16, %r21; + shr.s32 %r23, %r22, 11; + .loc 1 28 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:28:19 + and.b32 %r24, %r22, 1046528; + sub.s32 %r25, %r16, %r24; + .loc 1 29 29 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:29 + shr.u32 %r26, %r23, 27; + add.s32 %r27, %r23, %r26; + and.b32 %r28, %r27, 33554400; + sub.s32 %r29, %r23, %r28; + .loc 1 30 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:30:19 + shr.u32 %r30, %r20, 16; + add.s32 %r31, %r16, %r30; + .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45 + shl.b32 %r32, %r29, 7; + .loc 1 39 55 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:55 + shl.b32 %r33, %r25, 12; + .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68 + shl.b32 %r34, %r31, 7; + and.b32 %r35, %r34, -8388608; + .loc 1 39 41 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:41 + or.b32 %r36, %r33, %r19; + .loc 1 39 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:50 + add.s32 %r37, %r36, %r35; + .loc 1 39 60 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:60 + add.s32 %r38, %r37, %r32; + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + mad.wide.s32 %rd2, %r38, 2, %rd8; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:45 + shl.b32 %r39, %r16, 7; + .loc 1 40 41 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:41 + or.b32 %r40, %r39, %r19; + .loc 1 40 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:34 + mad.wide.s32 %rd5, %r40, 2, %rd9; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, %r3; + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r5, %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r41, %rs1; + cvt.f32.bf16 %r42, %rs2; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r43, %rs3; + cvt.f32.bf16 %r44, %rs4; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r45, %r42, %r44, 0f00000000; + fma.rn.f32 %r46, %r41, %r43, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r47, %rs5; + cvt.f32.bf16 %r48, %rs6; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r49, %rs7; + cvt.f32.bf16 %r50, %rs8; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r51, %r48, %r50, 0f00000000; + fma.rn.f32 %r52, %r47, %r49, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r53, %r46, %r45; + add.f32 %r54, %r52, %r53; + add.f32 %r55, %r51, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r57, %r55, %r56; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r59, %r57, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r61, %r59, %r60; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r63, %r61, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r65, %r63, %r64; +$L__tmp2: + .loc 1 45 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:28 + shr.u32 %r66, %r13, 3; + mov.b32 %r67, global_smem; + add.s32 %r68, %r67, %r66; + st.shared.b32 [%r68], %r65; + bar.sync 0; + shl.b32 %r69, %r15, 2; + add.s32 %r70, %r67, %r69; + ld.shared.b32 %r9, [%r70]; + .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25 + mad.wide.s32 %rd7, %r17, 4, %rd10; + .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36 + and.b32 %r71, %r12, 124; + setp.eq.b32 %p3, %r71, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r9 }; + // end inline asm + .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 103 +.b8 111 +.b8 108 +.b8 53 +.b8 53 +.b8 99 +.b8 116 +.b8 104 +.b8 107 +.b8 52 +.b8 122 +.b8 101 +.b8 118 +.b8 115 +.b8 121 +.b8 51 +.b8 100 +.b8 108 +.b8 113 +.b8 105 +.b8 121 +.b8 122 +.b8 105 +.b8 112 +.b8 101 +.b8 102 +.b8 118 +.b8 55 +.b8 51 +.b8 53 +.b8 103 +.b8 101 +.b8 50 +.b8 119 +.b8 116 +.b8 97 +.b8 100 +.b8 100 +.b8 118 +.b8 107 +.b8 52 +.b8 51 +.b8 54 +.b8 113 +.b8 104 +.b8 116 +.b8 53 +.b8 110 +.b8 111 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..df1eed9494df17893839324c4058cb05d6d64898 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 4 : i32 loc(#loc61) + %xoffset_3 = arith.constant 4 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<4x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<4x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<4x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<4x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<4x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<4x128xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<4x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<4x128xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<4x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<4x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<4x128xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<4x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<4x128xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<4x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<4x128xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<4x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<4x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<4x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<4x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc44))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc45) + tt.return %0 : tensor<4xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc48) + tt.return %1 : tensor<4xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..00f504ff5e4c4af1b5a1b02c326c69b9022fb276 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,142 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("xnumel"(#loc)) +#loc40 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp4"(#loc30)) +#loc71 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_8 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %xindex_9 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc43) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc43) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc43) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc44) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<4x1xi32, #blocked1> loc(#loc44) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc47) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<4x1xi32, #blocked> loc(#loc48) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc49) + %r0_mask = arith.cmpi slt, %r0_base_16, %cst : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc51) + %tmp0_18 = tt.broadcast %r0_base_16 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_19 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_21 = arith.muli %x0, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc53) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.muli %x2, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc55) + %tmp0_25 = tt.broadcast %tmp0_24 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_26 = arith.addi %tmp0_23, %tmp0_25 : tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc57) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc57) + %tmp0_29 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc58) + %tmp0_30 = tt.load %tmp0_28, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc58) + %tmp0_31 = arith.extf %tmp0_30 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc60) + %tmp1_32 = tt.broadcast %tmp1 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_33 = arith.addi %tmp0_18, %tmp1_32 : tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc62) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc62) + %tmp1_36 = tt.load %tmp1_35, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc63) + %tmp1_37 = arith.extf %tmp1_36 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc64) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<4x128xf32, #blocked> loc(#loc65) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<4x128xf32, #blocked> loc(#loc66) + %_tmp4 = arith.select %tmp0_29, %tmp5, %cst_7 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc67) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_40: f32 loc(callsite(#loc1 at #loc68)), %tmp4_41: f32 loc(callsite(#loc1 at #loc68))): + %tmp4_42 = arith.addf %tmp4_40, %tmp4_41 : f32 loc(#loc72) + tt.reduce.return %tmp4_42 : f32 loc(#loc70) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc70) + %tmp4_38 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp4_39 = tt.expand_dims %tmp4_38 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc69) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc33) + %1 = tt.addptr %0, %xindex_15 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc33) + tt.store %1, %tmp4_39 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("r0_mask"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp1"(#loc21)) +#loc61 = loc("tmp1"(#loc22)) +#loc62 = loc("tmp1"(#loc23)) +#loc63 = loc("tmp1"(#loc24)) +#loc64 = loc("tmp1"(#loc25)) +#loc65 = loc("tmp2"(#loc26)) +#loc66 = loc("tmp5"(#loc27)) +#loc67 = loc("_tmp4"(#loc28)) +#loc69 = loc("tmp4"(#loc32)) +#loc70 = loc(callsite(#loc29 at #loc68)) +#loc72 = loc(callsite(#loc31 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c7b5a2ecbd2ffb947ebced4ef967a2597446f8ee --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4ODNRDUZZQ6DC76SMZ2XG44NLENCYGAAAUTCAR2KZZILJTJU67XA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,139 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc32)) +#loc75 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<8388608> : tensor<4x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<4x1xi32> loc(#loc43) + %x1 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc44) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_6 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc47) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc48) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<4x1xi32> loc(#loc49) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<4x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc51) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc52) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc53) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<4x1xi32> loc(#loc44) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<4x1xi32> loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_10, %cst_3 : tensor<1x128xi32> loc(#loc54) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<4x1xi32> loc(#loc55) + %tmp0_14 = tt.broadcast %r0_base_10 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32> loc(#loc56) + %tmp0_17 = arith.muli %x0, %cst_1 : tensor<4x1xi32> loc(#loc57) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc58) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32> loc(#loc58) + %tmp0_20 = arith.muli %x2_13, %cst_0 : tensor<4x1xi32> loc(#loc59) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc60) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<4x128xi32> loc(#loc60) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc61) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc61) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc62) + %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc62) + %tmp0_27 = arith.extf %tmp0_26 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc63) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<4x1xi32> loc(#loc64) + %tmp1_28 = tt.broadcast %tmp1 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65) + %tmp1_29 = arith.addi %tmp0_14, %tmp1_28 : tensor<4x128xi32> loc(#loc65) + %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc66) + %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc66) + %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc67) + %tmp1_33 = arith.extf %tmp1_32 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<4x128xf32> loc(#loc69) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32> loc(#loc70) + %_tmp4 = arith.select %tmp0_25, %tmp5, %cst_4 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_35: f32 loc(callsite(#loc1 at #loc72)), %tmp4_36: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_37 = arith.addf %tmp4_35, %tmp4_36 : f32 loc(#loc76) + tt.reduce.return %tmp4_37 : f32 loc(#loc74) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc74) + %tmp4_34 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_9 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc35) + tt.store %1, %tmp4_34 : tensor<4x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc43 = loc("x2"(#loc2)) +#loc44 = loc("x1"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xoffset"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xindex"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("x0"(#loc11)) +#loc53 = loc("x1"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp0"(#loc21)) +#loc63 = loc("tmp0"(#loc22)) +#loc64 = loc("tmp1"(#loc23)) +#loc65 = loc("tmp1"(#loc24)) +#loc66 = loc("tmp1"(#loc25)) +#loc67 = loc("tmp1"(#loc26)) +#loc68 = loc("tmp1"(#loc27)) +#loc69 = loc("tmp2"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc("_tmp4"(#loc30)) +#loc73 = loc("tmp4"(#loc34)) +#loc74 = loc(callsite(#loc31 at #loc72)) +#loc76 = loc(callsite(#loc33 at #loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/__grp__triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/__grp__triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..098b4d27c1a9561d4762de56ec43ede4e951e6a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/__grp__triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.source", "triton_poi_fused_new_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttir", "triton_poi_fused_new_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttgir", "triton_poi_fused_new_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.llir", "triton_poi_fused_new_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ptx", "triton_poi_fused_new_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.cubin", "triton_poi_fused_new_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d187e76799329e7a0e4c3be8a21476b3873135a1 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c68bf2ca7cd3842e445300ae4647a6b62708cc5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"hash": "e52d86863b53ed033419ead08893f8066d383f827ed32f19d25db032f55fb4c0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..aa72026463206aa044f26989cd880516a7f63fcb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.llir @@ -0,0 +1,47 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_1(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 8, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = shl nuw nsw i32 %7, 1, !dbg !9 + %9 = and i32 %8, 254, !dbg !9 + %10 = or disjoint i32 %9, %6, !dbg !10 + %11 = icmp slt i32 %10, 2176, !dbg !11 + %12 = sext i32 %10 to i64, !dbg !12 + %13 = getelementptr i32, ptr addrspace(1) %0, i64 %12, !dbg !12 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %13, i1 %11) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_1", linkageName: "triton_poi_fused_new_zeros_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 22, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 25, scope: !4) +!13 = !DILocation(line: 25, column: 36, scope: !4) +!14 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7845c41ffbb9e4ed467ee1b32bf9a7d8c89fce01 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ptx @@ -0,0 +1,207 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_1 // -- Begin function triton_poi_fused_new_zeros_1 + // @triton_poi_fused_new_zeros_1 +.visible .entry triton_poi_fused_new_zeros_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_0, + .param .u32 triton_poi_fused_new_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<9>; + .reg .b64 %rd<3>; + .loc 1 18 0 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_1_param_0]; +$L__tmp0: + .loc 1 20 28 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:20:28 + mov.u32 %r3, %ctaid.x; + .loc 1 20 33 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:20:33 + shl.b32 %r4, %r3, 8; + .loc 1 21 36 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:21:36 + mov.u32 %r5, %tid.x; + shl.b32 %r6, %r5, 1; + and.b32 %r7, %r6, 254; + .loc 1 21 23 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:21:23 + or.b32 %r8, %r7, %r4; + .loc 1 22 21 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:22:21 + setp.lt.s32 %p1, %r8, 2176; + .loc 1 25 25 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:25 + mad.wide.s32 %rd1, %r8, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 25 36 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:36 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r1, %r1 }; + // end inline asm + .loc 1 25 4 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 119 +.b8 109 +.b8 52 +.b8 52 +.b8 106 +.b8 100 +.b8 113 +.b8 116 +.b8 111 +.b8 118 +.b8 121 +.b8 112 +.b8 119 +.b8 113 +.b8 107 +.b8 110 +.b8 101 +.b8 118 +.b8 113 +.b8 118 +.b8 122 +.b8 50 +.b8 100 +.b8 50 +.b8 120 +.b8 114 +.b8 97 +.b8 122 +.b8 99 +.b8 101 +.b8 98 +.b8 52 +.b8 99 +.b8 105 +.b8 50 +.b8 101 +.b8 114 +.b8 111 +.b8 111 +.b8 122 +.b8 52 +.b8 116 +.b8 97 +.b8 104 +.b8 108 +.b8 111 +.b8 99 +.b8 118 +.b8 122 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 119 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..5ef1073052fbc4d065abb7639fdeb4c77ab59a04 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.source @@ -0,0 +1,41 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc11 = loc("out_ptr0"(#loc)) +#loc12 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2176 : i32 loc(#loc13) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_1 = arith.constant 256 : i32 loc(#loc15) + %xoffset_2 = arith.constant 256 : i32 loc(#loc15) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc15) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc16) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<256xi32> loc(#loc17) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<256xi32> loc(#loc17) + %xmask = arith.constant dense<2176> : tensor<256xi32> loc(#loc18) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<256xi32> loc(#loc18) + %tmp0 = arith.constant 0 : i32 loc(#loc19) + %tmp0_7 = arith.constant dense<0> : tensor<1xi32> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_5 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc9) + tt.store %1, %cst, %xmask_6 : tensor<256x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc13 = loc("xnumel"(#loc1)) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("xmask"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..acf84b8ceeaee96748ca116737bc5732cefa9566 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2176> : tensor<256xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_1 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14) + %xindex_2 = tt.splat %xoffset_1 : i32 -> tensor<256xi32, #blocked> loc(#loc15) + %xindex_3 = arith.addi %xindex_2, %xindex : tensor<256xi32, #blocked> loc(#loc15) + %xmask = arith.cmpi slt, %xindex_3, %cst : tensor<256xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_3 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> loc(#loc7) + tt.store %1, %cst_0, %xmask : tensor<256x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1cec36983d18bd2f7f21ed2a2d54e65924293905 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/4UWYNBR3KPWQGNAZ5LIIRE7YAZWTQP4CP3JS6GOSLWYDF5K7WTAA/triton_poi_fused_new_zeros_1.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1) + %xmask = arith.constant dense<2176> : tensor<256xi32> loc(#loc12) + %c256_i32 = arith.constant 256 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc13) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc14) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc15) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc16) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc12) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc3 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc12 = loc("xmask"(#loc2)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xoffset"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xindex"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..81d0db8c143ac4155269d4ac506bed778a05f610 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1d3eb48eebba410ad50d3d0437c8d73543f1b579 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e8abbd3431ab6252267ccf96b29f1e8d7e51d027 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"hash": "eabbaefeba1230f93a431f53b7695abb82cf61f468eb1ba0270ec3af2e1de0a4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..39bc35bb4b19539967aa33e4a118ed691b669086 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,190 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl nuw i32 %9, 1, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = lshr i32 %11, 4, !dbg !9 + %.lobit = and i32 %12, 1, !dbg !9 + %13 = and i32 %11, 15, !dbg !10 + %14 = icmp sgt i32 %5, 0, !dbg !11 + br i1 %14, label %.lr.ph, label %._crit_edge, !dbg !11 + +.lr.ph: ; preds = %8 + %15 = mul i64 %3, %2, !dbg !12 + %16 = or disjoint i32 %.lobit, %10, !dbg !13 + %17 = sext i32 %16 to i64, !dbg !14 + %.frozen = freeze i64 %2, !dbg !15 + %18 = sdiv i64 %17, %.frozen, !dbg !15 + %19 = mul i64 %15, %18, !dbg !16 + %20 = mul i64 %18, %.frozen, !dbg !14 + %.decomposed = sub i64 %17, %20, !dbg !14 + %21 = icmp slt i32 %16, %4, !dbg !17 + %22 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %22, i64 %19, !dbg !11 + %.fr = freeze i1 %21 + br i1 %.fr, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %23 = phi i32 [ %29, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %24 = or disjoint i32 %23, %13, !dbg !18 + %25 = sext i32 %24 to i64, !dbg !19 + %26 = mul i64 %2, %25, !dbg !19 + %gep.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %26, !dbg !20 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !21 + %28 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep.us, i64 %27, i1 false) #3, !dbg !21 + %29 = add i32 %23, 16, !dbg !11 + %30 = icmp slt i32 %29, %5, !dbg !11 + br i1 %30, label %.lr.ph.split.us, label %._crit_edge, !dbg !11 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %31 = phi i64 [ %39, %.lr.ph.split ], [ 0, %.lr.ph ] + %32 = phi i32 [ %40, %.lr.ph.split ], [ 0, %.lr.ph ] + %33 = or disjoint i32 %32, %13, !dbg !18 + %34 = icmp slt i32 %33, %5, !dbg !22 + %35 = sext i32 %33 to i64, !dbg !19 + %36 = mul i64 %2, %35, !dbg !19 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !20 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !21 + %38 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %37, i1 %34) #3, !dbg !21 + %narrow = select i1 %34, i32 %38, i32 0, !dbg !23 + %spec.select = sext i32 %narrow to i64, !dbg !23 + %39 = add i64 %31, %spec.select, !dbg !23 + %40 = add i32 %32, 16, !dbg !11 + %41 = icmp slt i32 %40, %5, !dbg !11 + br i1 %41, label %.lr.ph.split, label %._crit_edge, !dbg !11 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %39, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %42 = and i32 %11, 1, !dbg !9 + %43 = or disjoint i32 %10, %42, !dbg !13 + %44 = sext i32 %43 to i64, !dbg !14 + %.frozen16 = freeze i64 %2, !dbg !15 + %45 = sdiv i64 %44, %.frozen16, !dbg !15 + %46 = mul i64 %45, %.frozen16, !dbg !14 + %.decomposed17 = sub i64 %44, %46, !dbg !14 + %47 = icmp slt i32 %43, %4, !dbg !17 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !25 + %48 = trunc nuw i64 %extelt.offset to i32, !dbg !25 + %49 = trunc i64 %.lcssa to i32, !dbg !25 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 8, i32 31), !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !25 + %52 = insertelement <2 x i32> poison, i32 %50, i64 0, !dbg !25 + %53 = insertelement <2 x i32> %52, i32 %51, i64 1, !dbg !25 + %54 = bitcast <2 x i32> %53 to i64, !dbg !25 + %55 = add i64 %.lcssa, %54, !dbg !29 + %extelt.offset2 = lshr i64 %55, 32, !dbg !25 + %56 = trunc nuw i64 %extelt.offset2 to i32, !dbg !25 + %57 = trunc i64 %55 to i32, !dbg !25 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 4, i32 31), !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !25 + %60 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !25 + %61 = insertelement <2 x i32> %60, i32 %59, i64 1, !dbg !25 + %62 = bitcast <2 x i32> %61 to i64, !dbg !25 + %63 = add i64 %55, %62, !dbg !29 + %extelt.offset3 = lshr i64 %63, 32, !dbg !25 + %64 = trunc nuw i64 %extelt.offset3 to i32, !dbg !25 + %65 = trunc i64 %63 to i32, !dbg !25 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 2, i32 31), !dbg !25 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !25 + %68 = insertelement <2 x i32> poison, i32 %66, i64 0, !dbg !25 + %69 = insertelement <2 x i32> %68, i32 %67, i64 1, !dbg !25 + %70 = bitcast <2 x i32> %69 to i64, !dbg !25 + %71 = add i64 %63, %70, !dbg !29 + %extelt.offset4 = lshr i64 %71, 32, !dbg !25 + %72 = trunc nuw i64 %extelt.offset4 to i32, !dbg !25 + %73 = trunc i64 %71 to i32, !dbg !25 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %73, i32 1, i32 31), !dbg !25 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !25 + %76 = insertelement <2 x i32> poison, i32 %74, i64 0, !dbg !25 + %77 = insertelement <2 x i32> %76, i32 %75, i64 1, !dbg !25 + %78 = bitcast <2 x i32> %77 to i64, !dbg !25 + %79 = add i64 %71, %78, !dbg !29 + %80 = and i32 %11, 14, !dbg !30 + %81 = shl nuw nsw i32 %11, 4, !dbg !30 + %82 = and i32 %81, 16, !dbg !30 + %83 = or disjoint i32 %82, %80, !dbg !30 + %84 = or disjoint i32 %83, %.lobit, !dbg !30 + %extelt.offset6 = lshr i64 %79, 32, !dbg !30 + %85 = trunc nuw i64 %extelt.offset6 to i32, !dbg !30 + %86 = trunc i64 %79 to i32, !dbg !30 + %87 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %86, i32 %84, i32 31), !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %85, i32 %84, i32 31), !dbg !30 + %89 = icmp slt i64 %2, 2, !dbg !31 + %90 = icmp sgt i64 %2, 1, !dbg !32 + %91 = select i1 %90, i64 %2, i64 0, !dbg !33 + %92 = zext i1 %89 to i64, !dbg !34 + %93 = add i64 %91, %92, !dbg !35 + %94 = mul i64 %45, %93, !dbg !36 + %95 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed17, !dbg !37 + %96 = getelementptr i32, ptr addrspace(1) %95, i64 %94, !dbg !37 + %97 = and i32 %11, 62, !dbg !38 + %98 = icmp eq i32 %97, 0, !dbg !38 + %99 = and i1 %98, %47, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %87, ptr addrspace(1) %96, i1 %99) #3, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 37, scope: !4) +!11 = !DILocation(line: 30, column: 40, scope: !4) +!12 = !DILocation(line: 36, column: 54, scope: !4) +!13 = !DILocation(line: 22, column: 23, scope: !4) +!14 = !DILocation(line: 26, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 19, scope: !4) +!16 = !DILocation(line: 36, column: 58, scope: !4) +!17 = !DILocation(line: 23, column: 21, scope: !4) +!18 = !DILocation(line: 31, column: 31, scope: !4) +!19 = !DILocation(line: 36, column: 43, scope: !4) +!20 = !DILocation(line: 36, column: 34, scope: !4) +!21 = !DILocation(line: 36, column: 63, scope: !4) +!22 = !DILocation(line: 32, column: 29, scope: !4) +!23 = !DILocation(line: 40, column: 48, scope: !4) +!24 = !DILocation(line: 28, column: 43, scope: !4) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 41, column: 25, scope: !4) +!29 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !28) +!30 = !DILocation(line: 41, column: 28, scope: !4) +!31 = !DILocation(line: 43, column: 49, scope: !4) +!32 = !DILocation(line: 43, column: 75, scope: !4) +!33 = !DILocation(line: 43, column: 66, scope: !4) +!34 = !DILocation(line: 43, scope: !4) +!35 = !DILocation(line: 43, column: 57, scope: !4) +!36 = !DILocation(line: 43, column: 34, scope: !4) +!37 = !DILocation(line: 43, column: 25, scope: !4) +!38 = !DILocation(line: 43, column: 88, scope: !4) +!39 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a5d5351626e54730fe2719b17f2e2c133a54d2d5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx @@ -0,0 +1,528 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5 + // @triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7 +) +.reqntid 64 +{ + .reg .pred %p<14>; + .reg .b32 %r<60>; + .reg .b64 %rd<81>; + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5]; + ld.param.b32 %r9, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4]; + ld.param.b64 %rd19, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2]; +$L__tmp0: + .loc 1 21 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:33 + shl.b32 %r1, %r11, 1; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r3, %r2, 4, 1; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + setp.gt.s32 %p1, %r10, 0; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: // %.lr.ph + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd20, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0]; + .loc 1 36 54 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:54 + mul.lo.s64 %rd2, %rd20, %rd19; + .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23 + or.b32 %r12, %r3, %r1; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.s64.s32 %rd3, %r12; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + or.b64 %rd22, %rd3, %rd19; + and.b64 %rd23, %rd22, -4294967296; + setp.ne.b64 %p2, %rd23, 0; + cvt.u32.u64 %r57, %rd3; + @%p2 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + div.s64 %rd77, %rd3, %rd19; + bra.uni $L__BB0_5; +$L__BB0_1: // %.._crit_edge_crit_edge + .loc 1 0 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:19 + mov.b64 %rd79, 0; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + bra.uni $L__BB0_10; +$L__BB0_3: + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + cvt.u32.u64 %r13, %rd19; + div.u32 %r15, %r57, %r13; + cvt.u64.u32 %rd77, %r15; +$L__BB0_5: + .loc 1 0 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0 + and.b32 %r4, %r2, 15; + .loc 1 36 58 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:58 + mul.lo.s64 %rd24, %rd2, %rd77; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd25, %rd77, %rd19; + sub.s64 %rd26, %rd3, %rd25; + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p3, %r57, %r9; + shl.b64 %rd27, %rd26, 2; + add.s64 %rd28, %rd17, %rd27; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + shl.b64 %rd29, %rd24, 2; + add.s64 %rd8, %rd28, %rd29; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_6; +$L__BB0_8: // %.lr.ph.split.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r59, 0; + mov.b64 %rd79, 0; +$L__BB0_9: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 32 29 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:32:29 + add.s32 %r22, %r4, %r59; + setp.lt.s32 %p6, %r22, %r10; + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + cvt.s64.s32 %rd41, %r22; + mul.lo.s64 %rd42, %rd19, %rd41; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd43, %rd42, 2; + add.s64 %rd39, %rd8, %rd43; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r21, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b32 { %r21 }, [ %rd39 + 0 ], %rd38; + // end inline asm + .loc 1 40 48 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:40:48 + selp.b32 %r23, %r21, 0, %p6; + cvt.s64.s32 %rd44, %r23; + add.s64 %rd79, %rd79, %rd44; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r59, %r59, 16; + setp.lt.s32 %p7, %r59, %r10; + @%p7 bra $L__BB0_9; + bra.uni $L__BB0_10; +$L__BB0_6: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r58, 0; +$L__BB0_7: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + add.s32 %r19, %r4, %r58; + cvt.s64.s32 %rd34, %r19; + mul.lo.s64 %rd35, %rd19, %rd34; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd36, %rd35, 2; + add.s64 %rd31, %rd8, %rd36; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r18, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd31 + 0 ], %rd30; + // end inline asm + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r58, %r58, 16; + setp.lt.s32 %p5, %r58, %r10; + mov.b64 %rd79, 0; + @%p5 bra $L__BB0_7; +$L__BB0_10: // %._crit_edge + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd18, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1]; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + and.b32 %r24, %r2, 1; + .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23 + or.b32 %r25, %r1, %r24; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.s64.s32 %rd13, %r25; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + or.b64 %rd45, %rd13, %rd19; + and.b64 %rd46, %rd45, -4294967296; + setp.ne.b64 %p8, %rd46, 0; + cvt.u32.u64 %r56, %rd13; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_11; +$L__BB0_12: + div.s64 %rd80, %rd13, %rd19; + bra.uni $L__BB0_13; +$L__BB0_11: + cvt.u32.u64 %r26, %rd19; + div.u32 %r28, %r56, %r26; + cvt.u64.u32 %rd80, %r28; +$L__BB0_13: + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd48, %rd80, %rd19; + sub.s64 %rd49, %rd13, %rd48; + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p10, %r56, %r9; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r31}, %rd79; + cvt.u32.u64 %r32, %rd79; + shfl.sync.bfly.b32 %r33, %r32, 8, 31, -1; + shfl.sync.bfly.b32 %r34, %r31, 8, 31, -1; + cvt.u64.u32 %rd50, %r33; + cvt.u64.u32 %rd51, %r34; + shl.b64 %rd52, %rd51, 32; + or.b64 %rd53, %rd50, %rd52; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd54, %rd79, %rd53; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r35}, %rd54; + cvt.u32.u64 %r36, %rd54; + shfl.sync.bfly.b32 %r37, %r36, 4, 31, -1; + shfl.sync.bfly.b32 %r38, %r35, 4, 31, -1; + cvt.u64.u32 %rd55, %r37; + cvt.u64.u32 %rd56, %r38; + shl.b64 %rd57, %rd56, 32; + or.b64 %rd58, %rd55, %rd57; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd59, %rd54, %rd58; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r39}, %rd59; + cvt.u32.u64 %r40, %rd59; + shfl.sync.bfly.b32 %r41, %r40, 2, 31, -1; + shfl.sync.bfly.b32 %r42, %r39, 2, 31, -1; + cvt.u64.u32 %rd60, %r41; + cvt.u64.u32 %rd61, %r42; + shl.b64 %rd62, %rd61, 32; + or.b64 %rd63, %rd60, %rd62; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd64, %rd59, %rd63; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r43}, %rd64; + cvt.u32.u64 %r44, %rd64; + shfl.sync.bfly.b32 %r45, %r44, 1, 31, -1; + shfl.sync.bfly.b32 %r46, %r43, 1, 31, -1; + cvt.u64.u32 %rd65, %r45; + cvt.u64.u32 %rd66, %r46; + shl.b64 %rd67, %rd66, 32; + or.b64 %rd68, %rd65, %rd67; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd69, %rd64, %rd68; +$L__tmp2: + .loc 1 41 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:28 + and.b32 %r47, %r2, 14; + shl.b32 %r48, %r2, 4; + and.b32 %r49, %r48, 16; + or.b32 %r50, %r49, %r47; + or.b32 %r51, %r50, %r3; + mov.b64 {_, %r52}, %rd69; + cvt.u32.u64 %r53, %rd69; + shfl.sync.idx.b32 %r29, %r53, %r51, 31, -1; + shfl.sync.idx.b32 %r54, %r52, %r51, 31, -1; + .loc 1 43 49 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:49 + setp.lt.s64 %p11, %rd19, 2; + .loc 1 43 75 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:75 + setp.gt.s64 %p12, %rd19, 1; + .loc 1 43 66 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:66 + selp.b64 %rd70, %rd19, 0, %p12; + .loc 1 43 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43 + selp.b64 %rd71, 1, 0, %p11; + .loc 1 43 57 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:57 + add.s64 %rd72, %rd70, %rd71; + .loc 1 43 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:34 + mul.lo.s64 %rd73, %rd80, %rd72; + .loc 1 43 25 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:25 + shl.b64 %rd74, %rd49, 2; + add.s64 %rd75, %rd18, %rd74; + shl.b64 %rd76, %rd73, 2; + add.s64 %rd47, %rd75, %rd76; + .loc 1 43 88 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:88 + and.b32 %r55, %r2, 62; + setp.eq.b32 %p13, %r55, 0; + and.pred %p9, %p13, %p10; + // begin inline asm + @%p9 st.global.b32 [ %rd47 + 0 ], { %r29 }; + // end inline asm + .loc 1 43 4 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 238 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 117 +.b8 99 +.b8 102 +.b8 50 +.b8 105 +.b8 116 +.b8 54 +.b8 102 +.b8 119 +.b8 104 +.b8 109 +.b8 114 +.b8 117 +.b8 51 +.b8 50 +.b8 118 +.b8 112 +.b8 121 +.b8 50 +.b8 120 +.b8 109 +.b8 119 +.b8 107 +.b8 107 +.b8 52 +.b8 118 +.b8 103 +.b8 55 +.b8 112 +.b8 112 +.b8 110 +.b8 111 +.b8 120 +.b8 109 +.b8 104 +.b8 110 +.b8 107 +.b8 102 +.b8 54 +.b8 105 +.b8 102 +.b8 112 +.b8 98 +.b8 50 +.b8 104 +.b8 50 +.b8 116 +.b8 106 +.b8 115 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 117 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 53 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source new file mode 100644 index 0000000000000000000000000000000000000000..13447552aacfc3c19c61042b11b9d79948639045 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source @@ -0,0 +1,193 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc43 = loc(unknown) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr1"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc85 = loc("input"(#loc41)) +#loc86 = loc("a"(#loc46)) +#loc87 = loc("b"(#loc46)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_0 = arith.constant 2 : i32 loc(#loc57) + %xoffset_1 = arith.constant 2 : i32 loc(#loc57) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc58) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc59) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<2x1xi32> loc(#loc60) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<2x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<2x1xi32> loc(#loc61) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<2x1xi32> loc(#loc61) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63) + %x0 = arith.extsi %xindex_5 : tensor<2x1xi32> to tensor<2x1xi64> loc(#loc64) + %x0_8 = tt.splat %ks0 : i64 -> tensor<2x1xi64> loc(#loc64) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<2x1xi64> loc(#loc64) + %x1 = arith.extsi %xindex_5 : tensor<2x1xi32> to tensor<2x1xi64> loc(#loc65) + %x1_10 = tt.splat %ks0 : i64 -> tensor<2x1xi64> loc(#loc65) + %x1_11 = arith.divsi %x1, %x1_10 : tensor<2x1xi64> loc(#loc65) + %_tmp3 = arith.constant 0 : i64 loc(#loc66) + %_tmp3_12 = arith.constant dense<0> : tensor<2x16xi64> loc(#loc66) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c16_i32 = arith.constant 16 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<2x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69) + %tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70) + %tmp0_23 = tt.broadcast %x0_9 : tensor<2x1xi64> -> tensor<2x16xi64> loc(#loc71) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x16xi64> -> tensor<2x16xi64> loc(#loc71) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<2x16xi64> loc(#loc71) + %tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<2x1xi64> loc(#loc73) + %tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<2x1xi64> loc(#loc73) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<2x1xi64> -> tensor<2x16xi64> loc(#loc74) + %tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<2x16xi64> loc(#loc74) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x16x!tt.ptr> loc(#loc75) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<2x16x!tt.ptr>, tensor<2x16xi64> loc(#loc75) + %tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<2x16xi1> loc(#loc76) + %tmp0_34 = tt.broadcast %xmask_6 : tensor<2x1xi1> -> tensor<2x16xi1> loc(#loc76) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<2x16xi1> loc(#loc76) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<2x16xf32> loc(#loc77) + %tmp0_38 = arith.fptosi %tmp0_37 : tensor<2x16xf32> to tensor<2x16xi32> loc(#loc77) + %tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<2x16x!tt.ptr> loc(#loc77) + %tmp1 = arith.extsi %tmp0_39 : tensor<2x16xi32> to tensor<2x16xi64> loc(#loc78) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<2x16xi64> loc(#loc79) + %_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<2x16xi1> loc(#loc80) + %_tmp3_41 = tt.broadcast %xmask_6 : tensor<2x1xi1> -> tensor<2x16xi1> loc(#loc80) + %_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<2x16xi1> loc(#loc80) + %_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<2x16xi1>, tensor<2x16xi64> loc(#loc81) + scf.yield %_tmp3_43 : tensor<2x16xi64> loc(#loc27) + } loc(#loc67) + %tmp3 = tt.call @"triton.language.standard.sum__i64S2_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<2x16xi64>) -> tensor<2xi64> loc(#loc82) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<2xi64> -> tensor<2x1xi64> loc(#loc83) + %tmp5 = arith.trunci %tmp3_14 : tensor<2x1xi64> to tensor<2x1xi32> loc(#loc84) + %c1_i32 = arith.constant 1 : i32 loc(#loc31) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31) + %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc32) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc32) + %6 = arith.extui %5 : i1 to i32 loc(#loc32) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc33) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33) + %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33) + %10 = arith.extui %9 : i1 to i64 loc(#loc34) + %11 = arith.muli %ks0, %10 : i64 loc(#loc34) + %12 = arith.extsi %7 : i32 to i64 loc(#loc35) + %13 = arith.addi %12, %11 : i64 loc(#loc35) + %14 = tt.splat %13 : i64 -> tensor<2x1xi64> loc(#loc36) + %15 = arith.muli %x1_11, %14 : tensor<2x1xi64> loc(#loc36) + %16 = arith.addi %x0_9, %15 : tensor<2x1xi64> loc(#loc37) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc38) + %18 = tt.addptr %17, %16 : tensor<2x1x!tt.ptr>, tensor<2x1xi64> loc(#loc38) + tt.store %18, %tmp5, %xmask_6 : tensor<2x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S2_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x16xi64> loc("input"(#loc41))) -> tensor<2xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42) + tt.reduce.return %2 : i64 loc(#loc42) + }) : (tensor<2x16xi64>) -> tensor<2xi64> loc(#loc42) + tt.return %0 : tensor<2xi64> loc(#loc44) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2xi64> loc(#loc45) + tt.return %1 : tensor<2xi64> loc(#loc45) + } loc(#loc41) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc47) + tt.return %0 : i64 loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc49) + tt.return %1 : i64 loc(#loc49) + } loc(#loc46) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc56 = loc("xoffset"(#loc1)) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("xindex"(#loc3)) +#loc59 = loc("xindex"(#loc4)) +#loc60 = loc("xindex"(#loc5)) +#loc61 = loc("xmask"(#loc6)) +#loc62 = loc("r0_base"(#loc7)) +#loc63 = loc("r0_base"(#loc8)) +#loc64 = loc("x0"(#loc9)) +#loc65 = loc("x1"(#loc10)) +#loc66 = loc("_tmp3"(#loc11)) +#loc67 = loc("_tmp3"(#loc12)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp0"(#loc20)) +#loc76 = loc("tmp0"(#loc21)) +#loc77 = loc("tmp0"(#loc22)) +#loc78 = loc("tmp1"(#loc23)) +#loc79 = loc("tmp4"(#loc24)) +#loc80 = loc("_tmp3"(#loc25)) +#loc81 = loc("_tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc28)) +#loc83 = loc("tmp3"(#loc29)) +#loc84 = loc("tmp5"(#loc30)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ba859f7d4aa342a954656fb996b2435d3ac7e0aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir @@ -0,0 +1,159 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("ks0"(#loc)) +#loc43 = loc("ks1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp3"(#loc26)) +#loc73 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<2x16xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<2x16xi64, #blocked> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_1 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2x1xi32, #blocked> loc(#loc48) + %xindex_4 = tt.expand_dims %xindex_2 {axis = 1 : i32} : tensor<2xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi32, #blocked1> loc(#loc48) + %xindex_5 = tt.splat %xoffset_1 : i32 -> tensor<2x1xi32, #blocked> loc(#loc49) + %xindex_6 = tt.splat %xoffset_1 : i32 -> tensor<2x1xi32, #blocked1> loc(#loc49) + %xindex_7 = arith.addi %xindex_5, %xindex_3 : tensor<2x1xi32, #blocked> loc(#loc49) + %xindex_8 = arith.addi %xindex_6, %xindex_4 : tensor<2x1xi32, #blocked1> loc(#loc49) + %xmask = tt.splat %xnumel : i32 -> tensor<2x1xi32, #blocked> loc(#loc50) + %xmask_9 = tt.splat %xnumel : i32 -> tensor<2x1xi32, #blocked1> loc(#loc50) + %xmask_10 = arith.cmpi slt, %xindex_7, %xmask : tensor<2x1xi32, #blocked> loc(#loc50) + %xmask_11 = arith.cmpi slt, %xindex_8, %xmask_9 : tensor<2x1xi32, #blocked1> loc(#loc50) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51) + %r0_base_12 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51) + %x0 = arith.extsi %xindex_7 : tensor<2x1xi32, #blocked> to tensor<2x1xi64, #blocked> loc(#loc52) + %x0_13 = arith.extsi %xindex_8 : tensor<2x1xi32, #blocked1> to tensor<2x1xi64, #blocked1> loc(#loc52) + %x0_14 = tt.splat %ks0 : i64 -> tensor<2x1xi64, #blocked> loc(#loc52) + %x0_15 = tt.splat %ks0 : i64 -> tensor<2x1xi64, #blocked1> loc(#loc52) + %x0_16 = arith.remsi %x0, %x0_14 : tensor<2x1xi64, #blocked> loc(#loc52) + %x0_17 = arith.remsi %x0_13, %x0_15 : tensor<2x1xi64, #blocked1> loc(#loc52) + %x1 = arith.divsi %x0, %x0_14 : tensor<2x1xi64, #blocked> loc(#loc53) + %x1_18 = arith.divsi %x0_13, %x0_15 : tensor<2x1xi64, #blocked1> loc(#loc53) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_19 = tt.broadcast %x0_16 : tensor<2x1xi64, #blocked> -> tensor<2x16xi64, #blocked> loc(#loc56) + %tmp0_20 = arith.muli %ks0, %ks1 : i64 loc(#loc57) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<2x1xi64, #blocked> loc(#loc58) + %tmp0_22 = arith.muli %tmp0_21, %x1 : tensor<2x1xi64, #blocked> loc(#loc58) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<2x1xi64, #blocked> -> tensor<2x16xi64, #blocked> loc(#loc59) + %tmp0_24 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x16x!tt.ptr, #blocked> loc(#loc60) + %tmp0_25 = tt.broadcast %xmask_10 : tensor<2x1xi1, #blocked> -> tensor<2x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_28 = %cst_0) -> (tensor<2x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32, #blocked> loc(#loc63) + %r0_index_29 = arith.addi %r0_index, %r0_base_12 : tensor<1x16xi32, #blocked> loc(#loc63) + %r0_mask_30 = arith.cmpi slt, %r0_index_29, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.extsi %r0_index_29 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_32 = arith.muli %tmp0, %tmp0_31 : tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x16xi64, #blocked> -> tensor<2x16xi64, #blocked> loc(#loc56) + %tmp0_34 = arith.addi %tmp0_19, %tmp0_33 : tensor<2x16xi64, #blocked> loc(#loc56) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_23 : tensor<2x16xi64, #blocked> loc(#loc59) + %tmp0_36 = tt.addptr %tmp0_24, %tmp0_35 : tensor<2x16x!tt.ptr, #blocked>, tensor<2x16xi64, #blocked> loc(#loc60) + %tmp0_37 = tt.broadcast %r0_mask_30 : tensor<1x16xi1, #blocked> -> tensor<2x16xi1, #blocked> loc(#loc61) + %tmp0_38 = arith.andi %tmp0_37, %tmp0_25 : tensor<2x16xi1, #blocked> loc(#loc61) + %tmp0_39 = tt.load %tmp0_36, %tmp0_38, %cst evictionPolicy = evict_last : tensor<2x16x!tt.ptr, #blocked> loc(#loc64) + %tmp1 = arith.extsi %tmp0_39 : tensor<2x16xi32, #blocked> to tensor<2x16xi64, #blocked> loc(#loc65) + %tmp4 = arith.addi %_tmp3_28, %tmp1 : tensor<2x16xi64, #blocked> loc(#loc66) + %_tmp3_40 = arith.select %tmp0_38, %tmp4, %_tmp3_28 : tensor<2x16xi1, #blocked>, tensor<2x16xi64, #blocked> loc(#loc67) + scf.yield %_tmp3_40 : tensor<2x16xi64, #blocked> loc(#loc24) + } loc(#loc62) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_28: i64 loc(callsite(#loc1 at #loc68)), %tmp3_29: i64 loc(callsite(#loc1 at #loc68))): + %tmp3_30 = arith.addi %tmp3_28, %tmp3_29 : i64 loc(#loc74) + tt.reduce.return %tmp3_30 : i64 loc(#loc72) + }) : (tensor<2x16xi64, #blocked>) -> tensor<2xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72) + %tmp3_26 = ttg.convert_layout %tmp3 : tensor<2xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<2xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp3_27 = tt.expand_dims %tmp3_26 {axis = 1 : i32} : tensor<2xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<2x1xi64, #blocked1> loc(#loc69) + %tmp5 = arith.trunci %tmp3_27 : tensor<2x1xi64, #blocked1> to tensor<2x1xi32, #blocked1> loc(#loc70) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31) + %2 = arith.extui %1 : i1 to i64 loc(#loc32) + %3 = arith.muli %ks0, %2 : i64 loc(#loc32) + %4 = arith.extui %0 : i1 to i64 loc(#loc71) + %5 = arith.addi %4, %3 : i64 loc(#loc33) + %6 = tt.splat %5 : i64 -> tensor<2x1xi64, #blocked1> loc(#loc35) + %7 = arith.muli %x1_18, %6 : tensor<2x1xi64, #blocked1> loc(#loc35) + %8 = arith.addi %x0_17, %7 : tensor<2x1xi64, #blocked1> loc(#loc36) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<2x1x!tt.ptr, #blocked1> loc(#loc37) + %10 = tt.addptr %9, %8 : tensor<2x1x!tt.ptr, #blocked1>, tensor<2x1xi64, #blocked1> loc(#loc37) + tt.store %10, %tmp5, %xmask_11 : tensor<2x1x!tt.ptr, #blocked1> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("xmask"(#loc6)) +#loc51 = loc("r0_base"(#loc7)) +#loc52 = loc("x0"(#loc8)) +#loc53 = loc("x1"(#loc9)) +#loc54 = loc("r0_mask"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("_tmp3"(#loc18)) +#loc63 = loc("r0_index"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp1"(#loc21)) +#loc66 = loc("tmp4"(#loc22)) +#loc67 = loc("_tmp3"(#loc23)) +#loc69 = loc("tmp3"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc(fused[#loc33, #loc34]) +#loc72 = loc(callsite(#loc25 at #loc68)) +#loc74 = loc(callsite(#loc27 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7ae97c13f0b943db8eb5c7046c0b99765c967020 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5K5257V2CIYPSOSDD5J3O2K2XOBM6YPUNDVRXIBHB3B26LQ54CSA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir @@ -0,0 +1,152 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc43 = loc("in_ptr0"(#loc)) +#loc44 = loc("out_ptr1"(#loc)) +#loc45 = loc("ks0"(#loc)) +#loc46 = loc("ks1"(#loc)) +#loc47 = loc("xnumel"(#loc)) +#loc48 = loc("r0_numel"(#loc)) +#loc74 = loc("tmp3"(#loc29)) +#loc79 = loc(callsite(#loc1 at #loc74)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<2x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<2x16xi64> loc(#loc49) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_0 = arith.muli %xoffset, %c2_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc52) + %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<2xi32> -> tensor<2x1xi32> loc(#loc53) + %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<2x1xi32> loc(#loc54) + %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<2x1xi32> loc(#loc54) + %xmask = tt.splat %xnumel : i32 -> tensor<2x1xi32> loc(#loc55) + %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<2x1xi32> loc(#loc55) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc56) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc57) + %x0 = arith.extsi %xindex_3 : tensor<2x1xi32> to tensor<2x1xi64> loc(#loc58) + %x0_6 = tt.splat %ks0 : i64 -> tensor<2x1xi64> loc(#loc58) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<2x1xi64> loc(#loc58) + %x1 = arith.divsi %x0, %x0_6 : tensor<2x1xi64> loc(#loc59) + %_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<2x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc61) + %r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x16xi32> loc(#loc61) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc62) + %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32> loc(#loc62) + %tmp0 = arith.extsi %r0_index_11 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63) + %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc63) + %tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x16xi64> loc(#loc63) + %tmp0_15 = tt.broadcast %x0_7 : tensor<2x1xi64> -> tensor<2x16xi64> loc(#loc64) + %tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x16xi64> -> tensor<2x16xi64> loc(#loc64) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<2x16xi64> loc(#loc64) + %tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65) + %tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<2x1xi64> loc(#loc66) + %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<2x1xi64> loc(#loc66) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<2x1xi64> -> tensor<2x16xi64> loc(#loc67) + %tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<2x16xi64> loc(#loc67) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<2x16x!tt.ptr> loc(#loc68) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<2x16x!tt.ptr>, tensor<2x16xi64> loc(#loc68) + %tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x16xi1> -> tensor<2x16xi1> loc(#loc69) + %tmp0_26 = tt.broadcast %xmask_4 : tensor<2x1xi1> -> tensor<2x16xi1> loc(#loc69) + %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<2x16xi1> loc(#loc69) + %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<2x16x!tt.ptr> loc(#loc70) + %tmp1 = arith.extsi %tmp0_28 : tensor<2x16xi32> to tensor<2x16xi64> loc(#loc71) + %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<2x16xi64> loc(#loc72) + %_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<2x16xi1>, tensor<2x16xi64> loc(#loc73) + scf.yield %_tmp3_29 : tensor<2x16xi64> loc(#loc27) + } loc(#loc60) + %tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))): + %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80) + tt.reduce.return %tmp3_12 : i64 loc(#loc78) + }) : (tensor<2x16xi64>) -> tensor<2xi64> loc(#loc78) + %tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<2xi64> -> tensor<2x1xi64> loc(#loc75) + %tmp5 = arith.trunci %tmp3_9 : tensor<2x1xi64> to tensor<2x1xi32> loc(#loc76) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34) + %2 = arith.extui %1 : i1 to i64 loc(#loc35) + %3 = arith.muli %ks0, %2 : i64 loc(#loc35) + %4 = arith.extui %0 : i1 to i64 loc(#loc77) + %5 = arith.addi %4, %3 : i64 loc(#loc36) + %6 = tt.splat %5 : i64 -> tensor<2x1xi64> loc(#loc38) + %7 = arith.muli %x1, %6 : tensor<2x1xi64> loc(#loc38) + %8 = arith.addi %x0_7, %7 : tensor<2x1xi64> loc(#loc39) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<2x1x!tt.ptr> loc(#loc40) + %10 = tt.addptr %9, %8 : tensor<2x1x!tt.ptr>, tensor<2x1xi64> loc(#loc40) + tt.store %10, %tmp5, %xmask_4 : tensor<2x1x!tt.ptr> loc(#loc41) + tt.return loc(#loc42) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc49 = loc("_tmp3"(#loc3)) +#loc50 = loc("xoffset"(#loc4)) +#loc51 = loc("xoffset"(#loc5)) +#loc52 = loc("xindex"(#loc6)) +#loc53 = loc("xindex"(#loc7)) +#loc54 = loc("xindex"(#loc8)) +#loc55 = loc("xmask"(#loc9)) +#loc56 = loc("r0_base"(#loc10)) +#loc57 = loc("r0_base"(#loc11)) +#loc58 = loc("x0"(#loc12)) +#loc59 = loc("x1"(#loc13)) +#loc60 = loc("_tmp3"(#loc2)) +#loc61 = loc("r0_index"(#loc14)) +#loc62 = loc("r0_mask"(#loc15)) +#loc63 = loc("tmp0"(#loc16)) +#loc64 = loc("tmp0"(#loc17)) +#loc65 = loc("tmp0"(#loc18)) +#loc66 = loc("tmp0"(#loc19)) +#loc67 = loc("tmp0"(#loc20)) +#loc68 = loc("tmp0"(#loc21)) +#loc69 = loc("tmp0"(#loc22)) +#loc70 = loc("tmp0"(#loc23)) +#loc71 = loc("tmp1"(#loc24)) +#loc72 = loc("tmp4"(#loc25)) +#loc73 = loc("_tmp3"(#loc26)) +#loc75 = loc("tmp3"(#loc31)) +#loc76 = loc("tmp5"(#loc32)) +#loc77 = loc(fused[#loc36, #loc37]) +#loc78 = loc(callsite(#loc28 at #loc74)) +#loc80 = loc(callsite(#loc30 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/__grp__triton_red_fused__to_copy_mul_sum_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/__grp__triton_red_fused__to_copy_mul_sum_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ce5b20375440c2f43ceb6114b65cc0a54e263963 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/__grp__triton_red_fused__to_copy_mul_sum_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_mul_sum_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.source", "triton_red_fused__to_copy_mul_sum_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttir", "triton_red_fused__to_copy_mul_sum_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttgir", "triton_red_fused__to_copy_mul_sum_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.llir", "triton_red_fused__to_copy_mul_sum_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ptx", "triton_red_fused__to_copy_mul_sum_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.cubin", "triton_red_fused__to_copy_mul_sum_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3ce777919d2afc2e6cf5004d093f8424d47b5858 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.json new file mode 100644 index 0000000000000000000000000000000000000000..33100810c69b69ffadfcd97dcb96327347192605 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.json @@ -0,0 +1 @@ +{"hash": "eab00da2e4595f249937e6537c90d9a940b19c806c3f885fbde47bd85402f17a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_mul_sum_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c39d051b1608d438e7e8c3603b490c06b57842a9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.llir @@ -0,0 +1,256 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_mul_sum_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !4 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %13 = shl i32 %12, 6, !dbg !8 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %15 = and i32 %14, 63, !dbg !9 + %16 = or disjoint i32 %13, %15, !dbg !10 + %17 = icmp slt i32 %16, %7, !dbg !11 + %18 = lshr i32 %14, 6, !dbg !12 + %.lobit = and i32 %18, 1, !dbg !12 + %19 = sext i32 %16 to i64, !dbg !13 + %.frozen = freeze i64 %4, !dbg !13 + %20 = sdiv i64 %19, %.frozen, !dbg !13 + %21 = mul i64 %20, %.frozen, !dbg !14 + %.decomposed = sub i64 %19, %21, !dbg !14 + %22 = mul i64 %6, %5, !dbg !15 + %23 = add i64 %22, 31, !dbg !16 + %24 = sdiv i64 %23, 32, !dbg !17 + %25 = mul i64 %20, %24, !dbg !18 + %26 = icmp sgt i32 %8, 0, !dbg !19 + br i1 %26, label %.lr.ph.preheader, label %._crit_edge, !dbg !19 + +.lr.ph.preheader: ; preds = %11 + %27 = insertelement <4 x i32> poison, i32 %8, i64 0 + %28 = shufflevector <4 x i32> %27, <4 x i32> poison, <4 x i32> zeroinitializer + %29 = insertelement <4 x i64> poison, i64 %25, i64 0 + %30 = shufflevector <4 x i64> %29, <4 x i64> poison, <4 x i32> zeroinitializer + %31 = insertelement <4 x i64> poison, i64 %22, i64 0 + %32 = shufflevector <4 x i64> %31, <4 x i64> poison, <4 x i32> zeroinitializer + %33 = insertelement <4 x i1> poison, i1 %17, i64 0 + %34 = shufflevector <4 x i1> %33, <4 x i1> poison, <4 x i32> zeroinitializer + br label %.lr.ph, !dbg !19 + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %35 = phi i32 [ %132, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %36 = phi <4 x float> [ %131, %.lr.ph ], [ zeroinitializer, %.lr.ph.preheader ] + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %38 = or disjoint i32 %35, %.lobit, !dbg !21 + %39 = or disjoint i32 %38, 2, !dbg !21 + %40 = or disjoint i32 %38, 4, !dbg !21 + %41 = or disjoint i32 %38, 6, !dbg !21 + %42 = insertelement <4 x i32> poison, i32 %38, i64 0, !dbg !22 + %43 = insertelement <4 x i32> %42, i32 %39, i64 1, !dbg !22 + %44 = insertelement <4 x i32> %43, i32 %40, i64 2, !dbg !22 + %45 = insertelement <4 x i32> %44, i32 %41, i64 3, !dbg !22 + %46 = icmp slt <4 x i32> %45, %28, !dbg !22 + %47 = sext <4 x i32> %45 to <4 x i64>, !dbg !23 + %48 = add <4 x i64> %30, %47, !dbg !23 + %49 = icmp slt <4 x i64> %48, %32, !dbg !24 + %50 = extractelement <4 x i64> %48, i64 0, !dbg !25 + %51 = srem i64 %50, %22, !dbg !25 + %52 = extractelement <4 x i64> %48, i64 1, !dbg !25 + %53 = srem i64 %52, %22, !dbg !25 + %54 = extractelement <4 x i64> %48, i64 2, !dbg !25 + %55 = srem i64 %54, %22, !dbg !25 + %56 = extractelement <4 x i64> %48, i64 3, !dbg !25 + %57 = srem i64 %56, %22, !dbg !25 + %58 = mul i64 %51, %4, !dbg !26 + %59 = mul i64 %53, %4, !dbg !26 + %60 = mul i64 %55, %4, !dbg !26 + %61 = mul i64 %57, %4, !dbg !26 + %62 = add i64 %58, %.decomposed, !dbg !27 + %63 = add i64 %59, %.decomposed, !dbg !27 + %64 = add i64 %60, %.decomposed, !dbg !27 + %65 = add i64 %61, %.decomposed, !dbg !27 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %62, !dbg !28 + %67 = getelementptr bfloat, ptr addrspace(1) %0, i64 %63, !dbg !28 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %64, !dbg !28 + %69 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !28 + %foldExtExtBinop = and <4 x i1> %46, %49, !dbg !29 + %70 = extractelement <4 x i1> %foldExtExtBinop, i64 0, !dbg !29 + %foldExtExtBinop8 = and <4 x i1> %46, %49, !dbg !29 + %71 = extractelement <4 x i1> %foldExtExtBinop8, i64 1, !dbg !29 + %foldExtExtBinop10 = and <4 x i1> %46, %49, !dbg !29 + %72 = extractelement <4 x i1> %foldExtExtBinop10, i64 2, !dbg !29 + %foldExtExtBinop12 = and <4 x i1> %46, %49, !dbg !29 + %73 = extractelement <4 x i1> %foldExtExtBinop12, i64 3, !dbg !29 + %74 = and i1 %17, %70, !dbg !30 + %75 = and i1 %17, %71, !dbg !30 + %76 = and i1 %17, %72, !dbg !30 + %77 = and i1 %17, %73, !dbg !30 + %78 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %66, i64 %37, i1 %74) #4, !dbg !20 + %79 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %80 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %67, i64 %79, i1 %75) #4, !dbg !20 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %82 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %68, i64 %81, i1 %76) #4, !dbg !20 + %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %84 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %69, i64 %83, i1 %77) #4, !dbg !20 + %85 = insertelement <4 x i16> poison, i16 %78, i64 0, !dbg !20 + %86 = insertelement <4 x i16> %85, i16 %80, i64 1, !dbg !20 + %87 = insertelement <4 x i16> %86, i16 %82, i64 2, !dbg !20 + %88 = insertelement <4 x i16> %87, i16 %84, i64 3, !dbg !20 + %89 = bitcast <4 x i16> %88 to <4 x bfloat>, !dbg !20 + %90 = fpext <4 x bfloat> %89 to <4 x float>, !dbg !31 + %91 = getelementptr bfloat, ptr addrspace(1) %1, i64 %62, !dbg !32 + %92 = getelementptr bfloat, ptr addrspace(1) %1, i64 %63, !dbg !32 + %93 = getelementptr bfloat, ptr addrspace(1) %1, i64 %64, !dbg !32 + %94 = getelementptr bfloat, ptr addrspace(1) %1, i64 %65, !dbg !32 + %95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %96 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %95, i1 %74) #4, !dbg !33 + %97 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %98 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %92, i64 %97, i1 %75) #4, !dbg !33 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %100 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %93, i64 %99, i1 %76) #4, !dbg !33 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %102 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %94, i64 %101, i1 %77) #4, !dbg !33 + %103 = insertelement <4 x i16> poison, i16 %96, i64 0, !dbg !33 + %104 = insertelement <4 x i16> %103, i16 %98, i64 1, !dbg !33 + %105 = insertelement <4 x i16> %104, i16 %100, i64 2, !dbg !33 + %106 = insertelement <4 x i16> %105, i16 %102, i64 3, !dbg !33 + %107 = bitcast <4 x i16> %106 to <4 x bfloat>, !dbg !33 + %108 = fpext <4 x bfloat> %107 to <4 x float>, !dbg !34 + %109 = getelementptr float, ptr addrspace(1) %2, i64 %51, !dbg !35 + %110 = getelementptr float, ptr addrspace(1) %2, i64 %53, !dbg !35 + %111 = getelementptr float, ptr addrspace(1) %2, i64 %55, !dbg !35 + %112 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !35 + %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %114 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %109, i64 %113, i1 %74) #4, !dbg !36 + %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %116 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %110, i64 %115, i1 %75) #4, !dbg !36 + %117 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %118 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %111, i64 %117, i1 %76) #4, !dbg !36 + %119 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %120 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %112, i64 %119, i1 %77) #4, !dbg !36 + %121 = insertelement <4 x i32> poison, i32 %114, i64 0, !dbg !36 + %122 = insertelement <4 x i32> %121, i32 %116, i64 1, !dbg !36 + %123 = insertelement <4 x i32> %122, i32 %118, i64 2, !dbg !36 + %124 = insertelement <4 x i32> %123, i32 %120, i64 3, !dbg !36 + %125 = bitcast <4 x i32> %124 to <4 x float>, !dbg !36 + %126 = fmul <4 x float> %108, %125, !dbg !37 + %127 = fmul <4 x float> %126, %90, !dbg !38 + %128 = select <4 x i1> %49, <4 x float> %127, <4 x float> zeroinitializer, !dbg !39 + %129 = fadd <4 x float> %36, %128, !dbg !40 + %130 = and <4 x i1> %34, %46, !dbg !41 + %131 = select <4 x i1> %130, <4 x float> %129, <4 x float> %36, !dbg !42 + %132 = add i32 %35, 8, !dbg !19 + %133 = icmp slt i32 %132, %8, !dbg !19 + br i1 %133, label %.lr.ph, label %._crit_edge.loopexit, !dbg !19 + +._crit_edge.loopexit: ; preds = %.lr.ph + %shift = shufflevector <4 x float> %131, <4 x float> poison, <4 x i32> , !dbg !43 + %foldExtExtBinop14 = fadd <4 x float> %131, %shift, !dbg !43 + %shift16 = shufflevector <4 x float> %131, <4 x float> poison, <4 x i32> , !dbg !43 + %foldExtExtBinop17 = fadd <4 x float> %shift16, %foldExtExtBinop14, !dbg !43 + %shift19 = shufflevector <4 x float> %131, <4 x float> poison, <4 x i32> , !dbg !43 + %foldExtExtBinop20 = fadd <4 x float> %shift19, %foldExtExtBinop17, !dbg !43 + %134 = extractelement <4 x float> %foldExtExtBinop20, i64 0, !dbg !43 + %135 = bitcast float %134 to <1 x i32>, !dbg !47 + br label %._crit_edge, !dbg !43 + +._crit_edge: ; preds = %._crit_edge.loopexit, %11 + %136 = phi <1 x i32> [ zeroinitializer, %11 ], [ %135, %._crit_edge.loopexit ], !dbg !43 + %.idx = shl nuw nsw i32 %15, 3, !dbg !47 + %137 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !47 + %138 = getelementptr float, ptr addrspace(3) %137, i32 %.lobit, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %138, <1 x i32> %136, i1 true) #4, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !47 + %139 = icmp samesign ult i32 %14, 128, !dbg !47 + %140 = getelementptr float, ptr addrspace(3) @global_smem, i32 %14, !dbg !47 + %141 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %140, i1 %139) #4, !dbg !47 + %142 = bitcast i32 %141 to float, !dbg !47 + %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 1, i32 31), !dbg !47 + %144 = bitcast i32 %143 to float, !dbg !47 + %145 = fadd float %142, %144, !dbg !43 + %146 = and i32 %14, 897, !dbg !47 + %147 = icmp eq i32 %146, 0, !dbg !47 + %148 = bitcast float %145 to <1 x i32>, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %140, <1 x i32> %148, i1 %147) #4, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !47 + %149 = load i32, ptr addrspace(3) %137, align 8, !dbg !47 + %150 = getelementptr float, ptr addrspace(1) %3, i64 %19, !dbg !48 + %151 = and i32 %14, 64, !dbg !49 + %152 = icmp eq i32 %151, 0, !dbg !49 + %153 = and i1 %152, %17, !dbg !49 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %149, ptr addrspace(1) %150, i1 %153) #4, !dbg !49 + ret void, !dbg !50 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_mul_sum_0", linkageName: "triton_red_fused__to_copy_mul_sum_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 22, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 37, scope: !4) +!13 = !DILocation(line: 26, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 19, scope: !4) +!15 = !DILocation(line: 36, column: 36, scope: !4) +!16 = !DILocation(line: 36, column: 32, scope: !4) +!17 = !DILocation(line: 36, column: 44, scope: !4) +!18 = !DILocation(line: 36, column: 26, scope: !4) +!19 = !DILocation(line: 30, column: 40, scope: !4) +!20 = !DILocation(line: 39, column: 96, scope: !4) +!21 = !DILocation(line: 31, column: 31, scope: !4) +!22 = !DILocation(line: 32, column: 29, scope: !4) +!23 = !DILocation(line: 36, column: 22, scope: !4) +!24 = !DILocation(line: 38, column: 22, scope: !4) +!25 = !DILocation(line: 39, column: 83, scope: !4) +!26 = !DILocation(line: 39, column: 45, scope: !4) +!27 = !DILocation(line: 39, column: 39, scope: !4) +!28 = !DILocation(line: 39, column: 34, scope: !4) +!29 = !DILocation(line: 39, column: 106, scope: !4) +!30 = !DILocation(line: 39, column: 113, scope: !4) +!31 = !DILocation(line: 39, column: 164, scope: !4) +!32 = !DILocation(line: 40, column: 34, scope: !4) +!33 = !DILocation(line: 40, column: 96, scope: !4) +!34 = !DILocation(line: 40, column: 164, scope: !4) +!35 = !DILocation(line: 42, column: 35, scope: !4) +!36 = !DILocation(line: 42, column: 85, scope: !4) +!37 = !DILocation(line: 43, column: 22, scope: !4) +!38 = !DILocation(line: 45, column: 22, scope: !4) +!39 = !DILocation(line: 47, column: 37, scope: !4) +!40 = !DILocation(line: 49, column: 25, scope: !4) +!41 = !DILocation(line: 50, column: 36, scope: !4) +!42 = !DILocation(line: 50, column: 50, scope: !4) +!43 = !DILocation(line: 261, column: 15, scope: !44, inlinedAt: !46) +!44 = distinct !DILexicalBlockFile(scope: !4, file: !45, discriminator: 0) +!45 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!46 = !DILocation(line: 51, column: 27, scope: !4) +!47 = !DILocation(line: 291, column: 36, scope: !44, inlinedAt: !46) +!48 = !DILocation(line: 52, column: 25, scope: !4) +!49 = !DILocation(line: 52, column: 37, scope: !4) +!50 = !DILocation(line: 52, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3cd989c819e7dbad6c55d98fdf4a68e10893db41 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ptx @@ -0,0 +1,688 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_mul_sum_0 // -- Begin function triton_red_fused__to_copy_mul_sum_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_mul_sum_0 +.visible .entry triton_red_fused__to_copy_mul_sum_0( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_3, + .param .u64 triton_red_fused__to_copy_mul_sum_0_param_4, + .param .u64 triton_red_fused__to_copy_mul_sum_0_param_5, + .param .u64 triton_red_fused__to_copy_mul_sum_0_param_6, + .param .u32 triton_red_fused__to_copy_mul_sum_0_param_7, + .param .u32 triton_red_fused__to_copy_mul_sum_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mul_sum_0_param_10 +) +.reqntid 128 +{ + .reg .pred %p<43>; + .reg .b16 %rs<25>; + .reg .b32 %r<119>; + .reg .b64 %rd<137>; + .loc 1 18 0 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:18:0 + +// %bb.0: + ld.param.b32 %r17, [triton_red_fused__to_copy_mul_sum_0_param_8]; + ld.param.b64 %rd46, [triton_red_fused__to_copy_mul_sum_0_param_4]; +$L__tmp0: + .loc 1 21 28 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:21:28 + mov.u32 %r18, %ctaid.x; + .loc 1 21 33 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:21:33 + shl.b32 %r19, %r18, 6; + .loc 1 22 44 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:22:44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 63; + .loc 1 22 23 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:22:23 + or.b32 %r20, %r19, %r2; + .loc 1 26 19 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:26:19 + cvt.s64.s32 %rd1, %r20; + or.b64 %rd50, %rd1, %rd46; + and.b64 %rd51, %rd50, -4294967296; + setp.ne.b64 %p5, %rd51, 0; + cvt.u32.u64 %r116, %rd1; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd130, %rd1, %rd46; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r21, %rd46; + div.u32 %r23, %r116, %r21; + cvt.u64.u32 %rd130, %r23; +$L__BB0_3: + .loc 1 0 19 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:0:19 + ld.param.b32 %r16, [triton_red_fused__to_copy_mul_sum_0_param_7]; + ld.param.b64 %rd45, [triton_red_fused__to_copy_mul_sum_0_param_3]; + bfe.u32 %r3, %r1, 6, 1; + .loc 1 30 40 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:30:40 + setp.lt.s32 %p6, %r17, 1; + mov.b32 %r118, 0; + @%p6 bra $L__BB0_19; +// %bb.4: // %.lr.ph.preheader + .loc 1 0 40 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:0:40 + ld.param.b64 %rd48, [triton_red_fused__to_copy_mul_sum_0_param_6]; + ld.param.b64 %rd47, [triton_red_fused__to_copy_mul_sum_0_param_5]; + ld.param.b64 %rd44, [triton_red_fused__to_copy_mul_sum_0_param_2]; + ld.param.b64 %rd43, [triton_red_fused__to_copy_mul_sum_0_param_1]; + ld.param.b64 %rd42, [triton_red_fused__to_copy_mul_sum_0_param_0]; + mul.lo.s64 %rd52, %rd130, %rd46; + sub.s64 %rd6, %rd1, %rd52; + mul.lo.s64 %rd7, %rd48, %rd47; + add.s64 %rd53, %rd7, 31; + shr.s64 %rd54, %rd53, 63; + shr.u64 %rd55, %rd54, 59; + add.s64 %rd56, %rd53, %rd55; + shr.s64 %rd57, %rd56, 5; + mul.lo.s64 %rd8, %rd130, %rd57; + .loc 1 23 21 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:23:21 + setp.lt.s32 %p1, %r116, %r16; + mov.b32 %r27, 0f00000000; + mov.b64 %rd131, {%r27, %r27}; + mov.b32 %r117, 0; + mov.b64 %rd132, %rd131; + bra.uni $L__BB0_5; +$L__BB0_16: // in Loop: Header=BB0_5 Depth=1 + .loc 1 39 83 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:83 + rem.s64 %rd136, %rd23, %rd7; +$L__BB0_17: // in Loop: Header=BB0_5 Depth=1 + .loc 1 38 22 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:38:22 + setp.lt.s64 %p23, %rd20, %rd7; + setp.lt.s64 %p24, %rd21, %rd7; + setp.lt.s64 %p25, %rd22, %rd7; + setp.lt.s64 %p26, %rd23, %rd7; + .loc 1 32 29 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:32:29 + setp.lt.s32 %p27, %r9, %r17; + setp.lt.s32 %p28, %r10, %r17; + setp.lt.s32 %p29, %r11, %r17; + setp.lt.s32 %p30, %r12, %r17; + .loc 1 39 39 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:39 + mad.lo.s64 %rd108, %rd133, %rd46, %rd6; + mad.lo.s64 %rd109, %rd134, %rd46, %rd6; + mad.lo.s64 %rd110, %rd135, %rd46, %rd6; + mad.lo.s64 %rd111, %rd136, %rd46, %rd6; + .loc 1 39 34 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:34 + shl.b64 %rd112, %rd108, 1; + add.s64 %rd73, %rd42, %rd112; + shl.b64 %rd113, %rd109, 1; + add.s64 %rd76, %rd42, %rd113; + shl.b64 %rd114, %rd110, 1; + add.s64 %rd79, %rd42, %rd114; + shl.b64 %rd115, %rd111, 1; + add.s64 %rd82, %rd42, %rd115; + .loc 1 39 106 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:106 + and.pred %p32, %p30, %p26; + and.pred %p33, %p29, %p25; + and.pred %p34, %p28, %p24; + and.pred %p35, %p27, %p23; + .loc 1 39 113 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:113 + and.pred %p11, %p1, %p35; + and.pred %p12, %p1, %p34; + and.pred %p13, %p1, %p33; + and.pred %p14, %p1, %p32; + mov.b16 %rs2, 0; + .loc 1 39 96 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:96 + // begin inline asm + mov.u16 %rs1, %rs2; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd73 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd76 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd78, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd78, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd79 + 0 ], %rd78; + // end inline asm + // begin inline asm + mov.u64 %rd81, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd81, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd82 + 0 ], %rd81; + // end inline asm + mov.b32 %r49, {%rs5, %rs7}; + mov.b32 %r50, {%rs1, %rs3}; + .loc 1 39 164 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:164 + mov.b32 {%rs17, %rs18}, %r50; + cvt.f32.bf16 %r51, %rs18; + cvt.f32.bf16 %r52, %rs17; + mov.b32 {%rs19, %rs20}, %r49; + cvt.f32.bf16 %r53, %rs20; + cvt.f32.bf16 %r54, %rs19; + .loc 1 40 34 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:40:34 + add.s64 %rd85, %rd43, %rd112; + add.s64 %rd88, %rd43, %rd113; + add.s64 %rd91, %rd43, %rd114; + add.s64 %rd94, %rd43, %rd115; + .loc 1 40 96 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:40:96 + // begin inline asm + mov.u64 %rd84, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd84, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd85 + 0 ], %rd84; + // end inline asm + // begin inline asm + mov.u64 %rd87, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd87, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd88 + 0 ], %rd87; + // end inline asm + // begin inline asm + mov.u64 %rd90, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd90, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd91 + 0 ], %rd90; + // end inline asm + // begin inline asm + mov.u64 %rd93, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd93, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd94 + 0 ], %rd93; + // end inline asm + mov.b32 %r55, {%rs9, %rs11}; + mov.b32 %r56, {%rs13, %rs15}; + .loc 1 40 164 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:40:164 + mov.b32 {%rs21, %rs22}, %r56; + cvt.f32.bf16 %r57, %rs21; + cvt.f32.bf16 %r58, %rs22; + mov.b32 {%rs23, %rs24}, %r55; + cvt.f32.bf16 %r59, %rs23; + cvt.f32.bf16 %r60, %rs24; + .loc 1 42 35 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:42:35 + shl.b64 %rd116, %rd133, 2; + add.s64 %rd97, %rd44, %rd116; + shl.b64 %rd117, %rd134, 2; + add.s64 %rd100, %rd44, %rd117; + shl.b64 %rd118, %rd135, 2; + add.s64 %rd103, %rd44, %rd118; + shl.b64 %rd119, %rd136, 2; + add.s64 %rd106, %rd44, %rd119; + .loc 1 42 85 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:42:85 + // begin inline asm + mov.u64 %rd96, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd96, 1.0; + // end inline asm + mov.b32 %r41, 0; + // begin inline asm + mov.u32 %r40, %r41; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b32 { %r40 }, [ %rd97 + 0 ], %rd96; + // end inline asm + // begin inline asm + mov.u64 %rd99, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd99, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r42, %r41; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b32 { %r42 }, [ %rd100 + 0 ], %rd99; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd102, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r44, %r41; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b32 { %r44 }, [ %rd103 + 0 ], %rd102; + // end inline asm + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r46, %r41; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b32 { %r46 }, [ %rd106 + 0 ], %rd105; + // end inline asm + cvt.u64.u32 %rd120, %r44; + cvt.u64.u32 %rd121, %r46; + shl.b64 %rd122, %rd121, 32; + or.b64 %rd123, %rd120, %rd122; + cvt.u64.u32 %rd124, %r40; + cvt.u64.u32 %rd125, %r42; + shl.b64 %rd126, %rd125, 32; + or.b64 %rd127, %rd124, %rd126; + .loc 1 43 22 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:43:22 + mov.b64 {%r61, %r62}, %rd127; + mul.f32 %r63, %r60, %r62; + mul.f32 %r64, %r59, %r61; + mov.b64 {%r65, %r66}, %rd123; + mul.f32 %r67, %r58, %r66; + mul.f32 %r68, %r57, %r65; + .loc 1 45 22 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:45:22 + mul.f32 %r69, %r68, %r54; + mul.f32 %r70, %r67, %r53; + mul.f32 %r71, %r64, %r52; + mul.f32 %r72, %r63, %r51; + .loc 1 47 37 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:47:37 + selp.f32 %r73, %r72, 0f00000000, %p24; + selp.f32 %r74, %r71, 0f00000000, %p23; + selp.f32 %r75, %r70, 0f00000000, %p26; + selp.f32 %r76, %r69, 0f00000000, %p25; + .loc 1 49 25 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:49:25 + mov.b64 {%r77, %r78}, %rd132; + add.f32 %r79, %r77, %r76; + add.f32 %r80, %r78, %r75; + mov.b64 {%r81, %r82}, %rd131; + add.f32 %r83, %r81, %r74; + add.f32 %r84, %r82, %r73; + .loc 1 50 50 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:50:50 + selp.f32 %r85, %r84, %r82, %p28; + selp.f32 %r86, %r85, %r82, %p1; + selp.f32 %r87, %r83, %r81, %p27; + selp.f32 %r88, %r87, %r81, %p1; + mov.b64 %rd131, {%r88, %r86}; + selp.f32 %r89, %r80, %r78, %p30; + selp.f32 %r90, %r89, %r78, %p1; + selp.f32 %r91, %r79, %r77, %p29; + selp.f32 %r92, %r91, %r77, %p1; + mov.b64 %rd132, {%r92, %r90}; + .loc 1 30 40 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:30:40 + add.s32 %r117, %r117, 8; + setp.lt.s32 %p36, %r117, %r17; + @%p36 bra $L__BB0_5; + bra.uni $L__BB0_18; +$L__BB0_5: // %.lr.ph + // =>This Inner Loop Header: Depth=1 + .loc 1 39 96 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:96 + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + .loc 1 31 31 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:31:31 + add.s32 %r9, %r3, %r117; + add.s32 %r10, %r9, 2; + .loc 1 36 22 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:36:22 + cvt.s64.s32 %rd61, %r9; + cvt.s64.s32 %rd62, %r10; + add.s64 %rd21, %rd8, %rd62; + add.s64 %rd20, %rd8, %rd61; + .loc 1 39 83 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:83 + or.b64 %rd65, %rd20, %rd7; + and.b64 %rd66, %rd65, -4294967296; + setp.ne.b64 %p7, %rd66, 0; + @%p7 bra $L__BB0_7; + bra.uni $L__BB0_6; +$L__BB0_7: // in Loop: Header=BB0_5 Depth=1 + rem.s64 %rd133, %rd20, %rd7; + bra.uni $L__BB0_8; +$L__BB0_6: // in Loop: Header=BB0_5 Depth=1 + cvt.u32.u64 %r28, %rd7; + cvt.u32.u64 %r29, %rd20; + rem.u32 %r30, %r29, %r28; + cvt.u64.u32 %rd133, %r30; +$L__BB0_8: // in Loop: Header=BB0_5 Depth=1 + .loc 1 0 0 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:0 + add.s32 %r11, %r9, 4; + cvt.s64.s32 %rd63, %r11; + add.s64 %rd22, %rd8, %rd63; + .loc 1 39 83 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:83 + or.b64 %rd67, %rd21, %rd7; + and.b64 %rd68, %rd67, -4294967296; + setp.ne.b64 %p8, %rd68, 0; + @%p8 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_5 Depth=1 + rem.s64 %rd134, %rd21, %rd7; + bra.uni $L__BB0_11; +$L__BB0_9: // in Loop: Header=BB0_5 Depth=1 + cvt.u32.u64 %r31, %rd7; + cvt.u32.u64 %r32, %rd21; + rem.u32 %r33, %r32, %r31; + cvt.u64.u32 %rd134, %r33; +$L__BB0_11: // in Loop: Header=BB0_5 Depth=1 + .loc 1 0 0 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:0 + add.s32 %r12, %r9, 6; + cvt.s64.s32 %rd64, %r12; + add.s64 %rd23, %rd8, %rd64; + .loc 1 39 83 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:39:83 + or.b64 %rd69, %rd22, %rd7; + and.b64 %rd70, %rd69, -4294967296; + setp.ne.b64 %p9, %rd70, 0; + @%p9 bra $L__BB0_13; + bra.uni $L__BB0_12; +$L__BB0_13: // in Loop: Header=BB0_5 Depth=1 + rem.s64 %rd135, %rd22, %rd7; + bra.uni $L__BB0_14; +$L__BB0_12: // in Loop: Header=BB0_5 Depth=1 + cvt.u32.u64 %r34, %rd7; + cvt.u32.u64 %r35, %rd22; + rem.u32 %r36, %r35, %r34; + cvt.u64.u32 %rd135, %r36; +$L__BB0_14: // in Loop: Header=BB0_5 Depth=1 + or.b64 %rd71, %rd23, %rd7; + and.b64 %rd72, %rd71, -4294967296; + setp.ne.b64 %p10, %rd72, 0; + @%p10 bra $L__BB0_16; +// %bb.15: // in Loop: Header=BB0_5 Depth=1 + cvt.u32.u64 %r37, %rd7; + cvt.u32.u64 %r38, %rd23; + rem.u32 %r39, %r38, %r37; + cvt.u64.u32 %rd136, %r39; + bra.uni $L__BB0_17; +$L__BB0_18: // %._crit_edge.loopexit +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:51:27 ] + mov.b64 {%r93, %r94}, %rd131; + add.f32 %r95, %r93, %r94; + mov.b64 {%r96, %r97}, %rd132; + add.f32 %r98, %r96, %r95; + add.f32 %r118, %r97, %r98; +$L__tmp2: +$L__BB0_19: // %._crit_edge + .loc 1 23 21 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:23:21 + setp.lt.s32 %p41, %r116, %r16; +$L__tmp3: + .loc 2 291 36 // standard.py:291:36 @[ c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:51:27 ] + shl.b32 %r107, %r2, 3; + mov.b32 %r108, global_smem; + add.s32 %r109, %r108, %r107; + shl.b32 %r110, %r3, 2; + add.s32 %r99, %r109, %r110; + mov.pred %p37, -1; + // begin inline asm + @%p37 st.shared.b32 [ %r99 + 0 ], %r118; + // end inline asm + bar.sync 0; + setp.lt.u32 %p38, %r1, 128; + shl.b32 %r111, %r1, 2; + add.s32 %r102, %r108, %r111; + // begin inline asm + @%p38 ld.shared.b32 %r101, [ %r102 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r112, %r101, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:51:27 ] + add.f32 %r104, %r101, %r112; + .loc 2 291 36 // standard.py:291:36 @[ c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:51:27 ] + and.b32 %r113, %r1, 897; + setp.eq.b32 %p39, %r113, 0; + // begin inline asm + @%p39 st.shared.b32 [ %r102 + 0 ], %r104; + // end inline asm + bar.sync 0; + ld.shared.b32 %r105, [%r109]; +$L__tmp4: + .loc 1 52 25 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:52:25 + shl.b64 %rd129, %rd1, 2; + add.s64 %rd128, %rd45, %rd129; + .loc 1 52 37 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:52:37 + and.b32 %r114, %r1, 64; + setp.eq.b32 %p42, %r114, 0; + and.pred %p40, %p42, %p41; + // begin inline asm + @%p40 st.global.b32 [ %rd128 + 0 ], { %r105 }; + // end inline asm + .loc 1 52 4 // c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py:52:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 220 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd5 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 122 +.b8 54 +.b8 107 +.b8 98 +.b8 104 +.b8 108 +.b8 104 +.b8 110 +.b8 100 +.b8 53 +.b8 53 +.b8 105 +.b8 122 +.b8 51 +.b8 115 +.b8 117 +.b8 120 +.b8 112 +.b8 122 +.b8 99 +.b8 102 +.b8 106 +.b8 104 +.b8 106 +.b8 118 +.b8 55 +.b8 112 +.b8 55 +.b8 105 +.b8 50 +.b8 122 +.b8 101 +.b8 108 +.b8 117 +.b8 50 +.b8 110 +.b8 105 +.b8 116 +.b8 106 +.b8 111 +.b8 101 +.b8 103 +.b8 114 +.b8 119 +.b8 99 +.b8 122 +.b8 98 +.b8 100 +.b8 121 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 55 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x26 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xb1:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.source new file mode 100644 index 0000000000000000000000000000000000000000..fc559b0820957d453dac78e151eb0139dc92a86a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.source @@ -0,0 +1,338 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":18:0) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc76 = loc(unknown) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc83 = loc("in_ptr0"(#loc)) +#loc84 = loc("in_ptr1"(#loc)) +#loc85 = loc("in_ptr2"(#loc)) +#loc86 = loc("out_ptr0"(#loc)) +#loc87 = loc("ks0"(#loc)) +#loc88 = loc("ks1"(#loc)) +#loc89 = loc("ks2"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc161 = loc("input"(#loc74)) +#loc162 = loc("a"(#loc79)) +#loc163 = loc("b"(#loc79)) +module { + tt.func public @triton_red_fused__to_copy_mul_sum_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xoffset_0 = arith.constant 64 : i32 loc(#loc93) + %xoffset_1 = arith.constant 64 : i32 loc(#loc93) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc93) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc94) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc95) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<64x1xi32> loc(#loc96) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<64x1xi32> loc(#loc96) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc97) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<64x1xi32> loc(#loc97) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc98) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc99) + %x1 = arith.extsi %xindex_5 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc100) + %x1_8 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc100) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<64x1xi64> loc(#loc100) + %x0 = arith.extsi %xindex_5 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc101) + %x0_10 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc101) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<64x1xi64> loc(#loc101) + %_tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc102) + %_tmp13_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc102) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c8_i32 = arith.constant 8 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp13_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp13_15 = %_tmp13_12) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc104) + %r0_index_16 = arith.addi %r0_index, %r0_base_7 : tensor<1x8xi32> loc(#loc104) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x8xi32> loc(#loc105) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x8xi32> loc(#loc105) + %tmp0 = arith.muli %ks1, %ks2 : i64 loc(#loc106) + %tmp0_18 = arith.constant 31 : i32 loc(#loc107) + %tmp0_19 = arith.constant 31 : i64 loc(#loc107) + %tmp0_20 = arith.addi %tmp0_19, %tmp0 : i64 loc(#loc107) + %tmp0_21 = arith.constant 32 : i32 loc(#loc108) + %tmp0_22 = arith.constant 32 : i64 loc(#loc108) + %tmp0_23 = arith.divsi %tmp0_20, %tmp0_22 : i64 loc(#loc108) + %tmp0_24 = tt.splat %tmp0_23 : i64 -> tensor<64x1xi64> loc(#loc109) + %tmp0_25 = arith.muli %x1_9, %tmp0_24 : tensor<64x1xi64> loc(#loc109) + %tmp0_26 = arith.extsi %r0_index_16 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc110) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc110) + %tmp0_28 = tt.broadcast %tmp0_25 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc110) + %tmp0_29 = arith.addi %tmp0_27, %tmp0_28 : tensor<64x8xi64> loc(#loc110) + %tmp1 = arith.muli %ks1, %ks2 : i64 loc(#loc111) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<64x8xi64> loc(#loc112) + %tmp2_30 = arith.cmpi slt, %tmp0_29, %tmp2 : tensor<64x8xi64> loc(#loc112) + %tmp3 = arith.muli %ks1, %ks2 : i64 loc(#loc113) + %tmp3_31 = arith.constant 31 : i32 loc(#loc114) + %tmp3_32 = arith.constant 31 : i64 loc(#loc114) + %tmp3_33 = arith.addi %tmp3_32, %tmp3 : i64 loc(#loc114) + %tmp3_34 = arith.constant 32 : i32 loc(#loc115) + %tmp3_35 = arith.constant 32 : i64 loc(#loc115) + %tmp3_36 = arith.divsi %tmp3_33, %tmp3_35 : i64 loc(#loc115) + %tmp3_37 = tt.splat %tmp3_36 : i64 -> tensor<64x1xi64> loc(#loc116) + %tmp3_38 = arith.muli %x1_9, %tmp3_37 : tensor<64x1xi64> loc(#loc116) + %tmp3_39 = arith.extsi %r0_index_16 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc117) + %tmp3_40 = tt.broadcast %tmp3_39 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc117) + %tmp3_41 = tt.broadcast %tmp3_38 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc117) + %tmp3_42 = arith.addi %tmp3_40, %tmp3_41 : tensor<64x8xi64> loc(#loc117) + %tmp3_43 = arith.muli %ks1, %ks2 : i64 loc(#loc118) + %tmp3_44 = tt.splat %tmp3_43 : i64 -> tensor<64x8xi64> loc(#loc119) + %tmp3_45 = arith.remsi %tmp3_42, %tmp3_44 : tensor<64x8xi64> loc(#loc119) + %tmp3_46 = tt.splat %ks0 : i64 -> tensor<64x8xi64> loc(#loc120) + %tmp3_47 = arith.muli %tmp3_46, %tmp3_45 : tensor<64x8xi64> loc(#loc120) + %tmp3_48 = tt.broadcast %x0_11 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc121) + %tmp3_49 = arith.addi %tmp3_48, %tmp3_47 : tensor<64x8xi64> loc(#loc121) + %tmp3_50 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc122) + %tmp3_51 = tt.addptr %tmp3_50, %tmp3_49 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc122) + %tmp3_52 = tt.broadcast %r0_mask_17 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc123) + %tmp3_53 = arith.andi %tmp3_52, %tmp2_30 : tensor<64x8xi1> loc(#loc123) + %tmp3_54 = tt.broadcast %xmask_6 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc124) + %tmp3_55 = arith.andi %tmp3_53, %tmp3_54 : tensor<64x8xi1> loc(#loc124) + %tmp3_56 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp3_57 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc125) + %tmp3_58 = arith.truncf %tmp3_57 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc125) + %tmp3_59 = tt.load %tmp3_51, %tmp3_55, %tmp3_58 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc125) + %tmp3_60 = arith.extf %tmp3_59 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc126) + %tmp4 = arith.muli %ks1, %ks2 : i64 loc(#loc127) + %tmp4_61 = arith.constant 31 : i32 loc(#loc128) + %tmp4_62 = arith.constant 31 : i64 loc(#loc128) + %tmp4_63 = arith.addi %tmp4_62, %tmp4 : i64 loc(#loc128) + %tmp4_64 = arith.constant 32 : i32 loc(#loc129) + %tmp4_65 = arith.constant 32 : i64 loc(#loc129) + %tmp4_66 = arith.divsi %tmp4_63, %tmp4_65 : i64 loc(#loc129) + %tmp4_67 = tt.splat %tmp4_66 : i64 -> tensor<64x1xi64> loc(#loc130) + %tmp4_68 = arith.muli %x1_9, %tmp4_67 : tensor<64x1xi64> loc(#loc130) + %tmp4_69 = arith.extsi %r0_index_16 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc131) + %tmp4_70 = tt.broadcast %tmp4_69 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc131) + %tmp4_71 = tt.broadcast %tmp4_68 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc131) + %tmp4_72 = arith.addi %tmp4_70, %tmp4_71 : tensor<64x8xi64> loc(#loc131) + %tmp4_73 = arith.muli %ks1, %ks2 : i64 loc(#loc132) + %tmp4_74 = tt.splat %tmp4_73 : i64 -> tensor<64x8xi64> loc(#loc133) + %tmp4_75 = arith.remsi %tmp4_72, %tmp4_74 : tensor<64x8xi64> loc(#loc133) + %tmp4_76 = tt.splat %ks0 : i64 -> tensor<64x8xi64> loc(#loc134) + %tmp4_77 = arith.muli %tmp4_76, %tmp4_75 : tensor<64x8xi64> loc(#loc134) + %tmp4_78 = tt.broadcast %x0_11 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc135) + %tmp4_79 = arith.addi %tmp4_78, %tmp4_77 : tensor<64x8xi64> loc(#loc135) + %tmp4_80 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc136) + %tmp4_81 = tt.addptr %tmp4_80, %tmp4_79 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc136) + %tmp4_82 = tt.broadcast %r0_mask_17 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc137) + %tmp4_83 = arith.andi %tmp4_82, %tmp2_30 : tensor<64x8xi1> loc(#loc137) + %tmp4_84 = tt.broadcast %xmask_6 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc138) + %tmp4_85 = arith.andi %tmp4_83, %tmp4_84 : tensor<64x8xi1> loc(#loc138) + %tmp4_86 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp4_87 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc139) + %tmp4_88 = arith.truncf %tmp4_87 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc139) + %tmp4_89 = tt.load %tmp4_81, %tmp4_85, %tmp4_88 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc139) + %tmp4_90 = arith.extf %tmp4_89 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc140) + %tmp6 = arith.muli %ks1, %ks2 : i64 loc(#loc141) + %tmp6_91 = arith.constant 31 : i32 loc(#loc142) + %tmp6_92 = arith.constant 31 : i64 loc(#loc142) + %tmp6_93 = arith.addi %tmp6_92, %tmp6 : i64 loc(#loc142) + %tmp6_94 = arith.constant 32 : i32 loc(#loc143) + %tmp6_95 = arith.constant 32 : i64 loc(#loc143) + %tmp6_96 = arith.divsi %tmp6_93, %tmp6_95 : i64 loc(#loc143) + %tmp6_97 = tt.splat %tmp6_96 : i64 -> tensor<64x1xi64> loc(#loc144) + %tmp6_98 = arith.muli %x1_9, %tmp6_97 : tensor<64x1xi64> loc(#loc144) + %tmp6_99 = arith.extsi %r0_index_16 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc145) + %tmp6_100 = tt.broadcast %tmp6_99 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc145) + %tmp6_101 = tt.broadcast %tmp6_98 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc145) + %tmp6_102 = arith.addi %tmp6_100, %tmp6_101 : tensor<64x8xi64> loc(#loc145) + %tmp6_103 = arith.muli %ks1, %ks2 : i64 loc(#loc146) + %tmp6_104 = tt.splat %tmp6_103 : i64 -> tensor<64x8xi64> loc(#loc147) + %tmp6_105 = arith.remsi %tmp6_102, %tmp6_104 : tensor<64x8xi64> loc(#loc147) + %tmp6_106 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc148) + %tmp6_107 = tt.addptr %tmp6_106, %tmp6_105 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc148) + %tmp6_108 = tt.broadcast %r0_mask_17 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc149) + %tmp6_109 = arith.andi %tmp6_108, %tmp2_30 : tensor<64x8xi1> loc(#loc149) + %tmp6_110 = tt.broadcast %xmask_6 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc150) + %tmp6_111 = arith.andi %tmp6_109, %tmp6_110 : tensor<64x8xi1> loc(#loc150) + %tmp6_112 = arith.constant 0.000000e+00 : f32 loc(#loc151) + %tmp6_113 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc151) + %tmp6_114 = tt.load %tmp6_107, %tmp6_111, %tmp6_113 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc151) + %tmp7 = arith.mulf %tmp4_90, %tmp6_114 : tensor<64x8xf32> loc(#loc152) + %tmp9 = arith.mulf %tmp3_60, %tmp7 : tensor<64x8xf32> loc(#loc153) + %tmp10 = arith.constant 0.000000e+00 : f32 loc(#loc154) + %tmp10_115 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc154) + %tmp11 = arith.select %tmp2_30, %tmp9, %tmp10_115 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc155) + %tmp14 = arith.addf %_tmp13_15, %tmp11 : tensor<64x8xf32> loc(#loc156) + %_tmp13_116 = tt.broadcast %r0_mask_17 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc157) + %_tmp13_117 = tt.broadcast %xmask_6 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc157) + %_tmp13_118 = arith.andi %_tmp13_116, %_tmp13_117 : tensor<64x8xi1> loc(#loc157) + %_tmp13_119 = arith.select %_tmp13_118, %tmp14, %_tmp13_15 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc158) + scf.yield %_tmp13_119 : tensor<64x8xf32> loc(#loc68) + } loc(#loc103) + %tmp13 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp13_13) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc159) + %tmp13_14 = tt.expand_dims %tmp13 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc160) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc71) + %5 = tt.addptr %4, %xindex_5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc71) + tt.store %5, %tmp13_14, %xmask_6 : tensor<64x1x!tt.ptr> loc(#loc72) + tt.return loc(#loc73) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc74))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc75) + tt.reduce.return %2 : f32 loc(#loc75) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc75) + tt.return %0 : tensor<64xf32> loc(#loc77) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc78) + tt.return %1 : tensor<64xf32> loc(#loc78) + } loc(#loc74) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc79)), %b: f32 loc("b"(#loc79))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc80) + tt.return %0 : f32 loc(#loc81) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc82) + tt.return %1 : f32 loc(#loc82) + } loc(#loc79) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":28:44) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:32) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:44) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:26) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:22) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":37:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":38:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:63) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:75) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:57) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:53) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:87) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:83) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:39) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:34) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:113) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:96) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:164) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:67) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:63) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:75) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:57) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:53) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:87) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:83) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:45) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:34) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:106) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:113) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:96) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:164) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:57) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:65) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:47) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:43) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:77) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:73) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:35) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:95) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:102) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:85) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":43:22) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":45:22) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":46:39) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":47:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":49:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:50) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:8) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:27) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:30) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:37) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:4) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc92 = loc("xoffset"(#loc1)) +#loc93 = loc("xoffset"(#loc2)) +#loc94 = loc("xindex"(#loc3)) +#loc95 = loc("xindex"(#loc4)) +#loc96 = loc("xindex"(#loc5)) +#loc97 = loc("xmask"(#loc6)) +#loc98 = loc("r0_base"(#loc7)) +#loc99 = loc("r0_base"(#loc8)) +#loc100 = loc("x1"(#loc9)) +#loc101 = loc("x0"(#loc10)) +#loc102 = loc("_tmp13"(#loc11)) +#loc103 = loc("_tmp13"(#loc12)) +#loc104 = loc("r0_index"(#loc13)) +#loc105 = loc("r0_mask"(#loc14)) +#loc106 = loc("tmp0"(#loc15)) +#loc107 = loc("tmp0"(#loc16)) +#loc108 = loc("tmp0"(#loc17)) +#loc109 = loc("tmp0"(#loc18)) +#loc110 = loc("tmp0"(#loc19)) +#loc111 = loc("tmp1"(#loc20)) +#loc112 = loc("tmp2"(#loc21)) +#loc113 = loc("tmp3"(#loc22)) +#loc114 = loc("tmp3"(#loc23)) +#loc115 = loc("tmp3"(#loc24)) +#loc116 = loc("tmp3"(#loc25)) +#loc117 = loc("tmp3"(#loc26)) +#loc118 = loc("tmp3"(#loc27)) +#loc119 = loc("tmp3"(#loc28)) +#loc120 = loc("tmp3"(#loc29)) +#loc121 = loc("tmp3"(#loc30)) +#loc122 = loc("tmp3"(#loc31)) +#loc123 = loc("tmp3"(#loc32)) +#loc124 = loc("tmp3"(#loc33)) +#loc125 = loc("tmp3"(#loc34)) +#loc126 = loc("tmp3"(#loc35)) +#loc127 = loc("tmp4"(#loc36)) +#loc128 = loc("tmp4"(#loc37)) +#loc129 = loc("tmp4"(#loc38)) +#loc130 = loc("tmp4"(#loc39)) +#loc131 = loc("tmp4"(#loc40)) +#loc132 = loc("tmp4"(#loc41)) +#loc133 = loc("tmp4"(#loc42)) +#loc134 = loc("tmp4"(#loc43)) +#loc135 = loc("tmp4"(#loc44)) +#loc136 = loc("tmp4"(#loc45)) +#loc137 = loc("tmp4"(#loc46)) +#loc138 = loc("tmp4"(#loc47)) +#loc139 = loc("tmp4"(#loc48)) +#loc140 = loc("tmp4"(#loc49)) +#loc141 = loc("tmp6"(#loc50)) +#loc142 = loc("tmp6"(#loc51)) +#loc143 = loc("tmp6"(#loc52)) +#loc144 = loc("tmp6"(#loc53)) +#loc145 = loc("tmp6"(#loc54)) +#loc146 = loc("tmp6"(#loc55)) +#loc147 = loc("tmp6"(#loc56)) +#loc148 = loc("tmp6"(#loc57)) +#loc149 = loc("tmp6"(#loc58)) +#loc150 = loc("tmp6"(#loc59)) +#loc151 = loc("tmp6"(#loc60)) +#loc152 = loc("tmp7"(#loc61)) +#loc153 = loc("tmp9"(#loc62)) +#loc154 = loc("tmp10"(#loc63)) +#loc155 = loc("tmp11"(#loc64)) +#loc156 = loc("tmp14"(#loc65)) +#loc157 = loc("_tmp13"(#loc66)) +#loc158 = loc("_tmp13"(#loc67)) +#loc159 = loc("tmp13"(#loc69)) +#loc160 = loc("tmp13"(#loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9722803325d3f5c2d22d189a671a1743ff50a06b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttgir @@ -0,0 +1,176 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":18:0) +#loc1 = loc(unknown) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:27) +#loc46 = loc("in_ptr0"(#loc)) +#loc47 = loc("in_ptr1"(#loc)) +#loc48 = loc("in_ptr2"(#loc)) +#loc49 = loc("out_ptr0"(#loc)) +#loc50 = loc("ks0"(#loc)) +#loc51 = loc("ks1"(#loc)) +#loc52 = loc("ks2"(#loc)) +#loc53 = loc("xnumel"(#loc)) +#loc54 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp13"(#loc40)) +#loc94 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_mul_sum_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1) + %c31_i64 = arith.constant 31 : i64 loc(#loc1) + %c32_i64 = arith.constant 32 : i64 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc55) + %xoffset_1 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc56) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc57) + %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc57) + %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc58) + %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<64x1xi32, #blocked> loc(#loc58) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc59) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<64x1xi32, #blocked> loc(#loc59) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc60) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc60) + %x1 = arith.extsi %xindex_4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc61) + %x1_7 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc61) + %x1_8 = arith.divsi %x1, %x1_7 : tensor<64x1xi64, #blocked> loc(#loc61) + %x0 = arith.remsi %x1, %x1_7 : tensor<64x1xi64, #blocked> loc(#loc62) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x8xi32, #blocked> loc(#loc63) + %tmp0 = arith.muli %ks1, %ks2 : i64 loc(#loc64) + %tmp0_9 = arith.addi %tmp0, %c31_i64 : i64 loc(#loc65) + %tmp0_10 = arith.divsi %tmp0_9, %c32_i64 : i64 loc(#loc66) + %tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<64x1xi64, #blocked> loc(#loc67) + %tmp0_12 = arith.muli %x1_8, %tmp0_11 : tensor<64x1xi64, #blocked> loc(#loc67) + %tmp0_13 = tt.broadcast %tmp0_12 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc68) + %tmp2 = tt.splat %tmp0 : i64 -> tensor<64x8xi64, #blocked> loc(#loc69) + %tmp3 = tt.splat %ks0 : i64 -> tensor<64x8xi64, #blocked> loc(#loc70) + %tmp3_14 = tt.broadcast %x0 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc71) + %tmp3_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc72) + %tmp3_16 = tt.broadcast %xmask_5 : tensor<64x1xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc73) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc74) + %tmp6 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc75) + %_tmp13 = scf.for %_tmp13_18 = %c0_i32 to %r0_numel step %c8_i32 iter_args(%arg10 = %cst) -> (tensor<64x8xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp13_18 : i32 -> tensor<1x8xi32, #blocked> loc(#loc77) + %r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x8xi32, #blocked> loc(#loc77) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x8xi32, #blocked> loc(#loc63) + %tmp0_21 = arith.extsi %r0_index_19 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> loc(#loc68) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<1x8xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc68) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_13 : tensor<64x8xi64, #blocked> loc(#loc68) + %tmp2_24 = arith.cmpi slt, %tmp0_23, %tmp2 : tensor<64x8xi64, #blocked> loc(#loc69) + %tmp3_25 = arith.remsi %tmp0_23, %tmp2 : tensor<64x8xi64, #blocked> loc(#loc78) + %tmp3_26 = arith.muli %tmp3, %tmp3_25 : tensor<64x8xi64, #blocked> loc(#loc70) + %tmp3_27 = arith.addi %tmp3_14, %tmp3_26 : tensor<64x8xi64, #blocked> loc(#loc71) + %tmp3_28 = tt.addptr %tmp3_15, %tmp3_27 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> loc(#loc72) + %tmp3_29 = tt.broadcast %r0_mask_20 : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc79) + %tmp3_30 = arith.andi %tmp3_29, %tmp2_24 : tensor<64x8xi1, #blocked> loc(#loc79) + %tmp3_31 = arith.andi %tmp3_30, %tmp3_16 : tensor<64x8xi1, #blocked> loc(#loc73) + %tmp3_32 = tt.load %tmp3_28, %tmp3_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc80) + %tmp3_33 = arith.extf %tmp3_32 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc81) + %tmp4_34 = tt.addptr %tmp4, %tmp3_27 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> loc(#loc74) + %tmp4_35 = tt.load %tmp4_34, %tmp3_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc82) + %tmp4_36 = arith.extf %tmp4_35 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc83) + %tmp6_37 = tt.addptr %tmp6, %tmp3_25 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> loc(#loc75) + %tmp6_38 = tt.load %tmp6_37, %tmp3_31, %cst evictionPolicy = evict_last : tensor<64x8x!tt.ptr, #blocked> loc(#loc84) + %tmp7 = arith.mulf %tmp4_36, %tmp6_38 : tensor<64x8xf32, #blocked> loc(#loc85) + %tmp9 = arith.mulf %tmp3_33, %tmp7 : tensor<64x8xf32, #blocked> loc(#loc86) + %tmp11 = arith.select %tmp2_24, %tmp9, %cst : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc87) + %tmp14 = arith.addf %arg10, %tmp11 : tensor<64x8xf32, #blocked> loc(#loc88) + %_tmp13_39 = arith.andi %tmp3_29, %tmp3_16 : tensor<64x8xi1, #blocked> loc(#loc89) + %_tmp13_40 = arith.select %_tmp13_39, %tmp14, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc90) + scf.yield %_tmp13_40 : tensor<64x8xf32, #blocked> loc(#loc38) + } loc(#loc76) + %tmp13 = "tt.reduce"(%_tmp13) <{axis = 1 : i32}> ({ + ^bb0(%tmp13_18: f32 loc(callsite(#loc1 at #loc91)), %tmp13_19: f32 loc(callsite(#loc1 at #loc91))): + %tmp13_20 = arith.addf %tmp13_18, %tmp13_19 : f32 loc(#loc95) + tt.reduce.return %tmp13_20 : f32 loc(#loc93) + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc93) + %tmp13_17 = tt.expand_dims %tmp13 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xf32, #blocked> loc(#loc92) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc43) + %1 = tt.addptr %0, %xindex_4 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc43) + tt.store %1, %tmp13_17, %xmask_5 : tensor<64x1x!tt.ptr, #blocked> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:32) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:44) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:26) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":38:22) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:39) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:113) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:35) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":30:40) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":31:31) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:83) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:106) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:96) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:164) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:96) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:164) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:85) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":43:22) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":45:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":47:37) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:50) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:8) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:30) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:4) +#loc55 = loc("xoffset"(#loc2)) +#loc56 = loc("xoffset"(#loc3)) +#loc57 = loc("xindex"(#loc4)) +#loc58 = loc("xindex"(#loc5)) +#loc59 = loc("xmask"(#loc6)) +#loc60 = loc("r0_base"(#loc7)) +#loc61 = loc("x1"(#loc8)) +#loc62 = loc("x0"(#loc9)) +#loc63 = loc("r0_mask"(#loc10)) +#loc64 = loc("tmp0"(#loc11)) +#loc65 = loc("tmp0"(#loc12)) +#loc66 = loc("tmp0"(#loc13)) +#loc67 = loc("tmp0"(#loc14)) +#loc68 = loc("tmp0"(#loc15)) +#loc69 = loc("tmp2"(#loc16)) +#loc70 = loc("tmp3"(#loc17)) +#loc71 = loc("tmp3"(#loc18)) +#loc72 = loc("tmp3"(#loc19)) +#loc73 = loc("tmp3"(#loc20)) +#loc74 = loc("tmp4"(#loc21)) +#loc75 = loc("tmp6"(#loc22)) +#loc76 = loc("_tmp13"(#loc23)) +#loc77 = loc("r0_index"(#loc24)) +#loc78 = loc("tmp3"(#loc25)) +#loc79 = loc("tmp3"(#loc26)) +#loc80 = loc("tmp3"(#loc27)) +#loc81 = loc("tmp3"(#loc28)) +#loc82 = loc("tmp4"(#loc29)) +#loc83 = loc("tmp4"(#loc30)) +#loc84 = loc("tmp6"(#loc31)) +#loc85 = loc("tmp7"(#loc32)) +#loc86 = loc("tmp9"(#loc33)) +#loc87 = loc("tmp11"(#loc34)) +#loc88 = loc("tmp14"(#loc35)) +#loc89 = loc("_tmp13"(#loc36)) +#loc90 = loc("_tmp13"(#loc37)) +#loc92 = loc("tmp13"(#loc42)) +#loc93 = loc(callsite(#loc39 at #loc91)) +#loc95 = loc(callsite(#loc41 at #loc93)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0ffd867582827d9569f68103eb87bae6df16ede3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5KYA3IXELFPSJGJX4ZJXZEGZVFALDHEANQ7YQX554R55QVAC6F5A/triton_red_fused__to_copy_mul_sum_0.ttir @@ -0,0 +1,179 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":18:0) +#loc1 = loc(unknown) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:27) +#loc48 = loc("in_ptr0"(#loc)) +#loc49 = loc("in_ptr1"(#loc)) +#loc50 = loc("in_ptr2"(#loc)) +#loc51 = loc("out_ptr0"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("ks2"(#loc)) +#loc55 = loc("xnumel"(#loc)) +#loc56 = loc("r0_numel"(#loc)) +#loc95 = loc("tmp13"(#loc42)) +#loc98 = loc(callsite(#loc1 at #loc95)) +module { + tt.func public @triton_red_fused__to_copy_mul_sum_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %c32_i64 = arith.constant 32 : i64 loc(#loc1) + %c31_i64 = arith.constant 31 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc57) + %xoffset_1 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc58) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc59) + %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc60) + %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<64x1xi32> loc(#loc61) + %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<64x1xi32> loc(#loc61) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc62) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<64x1xi32> loc(#loc62) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc63) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc64) + %x1 = arith.extsi %xindex_4 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc65) + %x1_7 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc65) + %x1_8 = arith.divsi %x1, %x1_7 : tensor<64x1xi64> loc(#loc65) + %x0 = arith.remsi %x1, %x1_7 : tensor<64x1xi64> loc(#loc66) + %_tmp13 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c8_i32 iter_args(%_tmp13_10 = %cst_0) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc68) + %r0_index_11 = arith.addi %r0_index, %r0_base_6 : tensor<1x8xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x8xi32> loc(#loc69) + %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x8xi32> loc(#loc69) + %tmp0 = arith.muli %ks1, %ks2 : i64 loc(#loc70) + %tmp0_13 = arith.addi %tmp0, %c31_i64 : i64 loc(#loc71) + %tmp0_14 = arith.divsi %tmp0_13, %c32_i64 : i64 loc(#loc72) + %tmp0_15 = tt.splat %tmp0_14 : i64 -> tensor<64x1xi64> loc(#loc73) + %tmp0_16 = arith.muli %x1_8, %tmp0_15 : tensor<64x1xi64> loc(#loc73) + %tmp0_17 = arith.extsi %r0_index_11 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc74) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc74) + %tmp0_19 = tt.broadcast %tmp0_16 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc74) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<64x8xi64> loc(#loc74) + %tmp2 = tt.splat %tmp0 : i64 -> tensor<64x8xi64> loc(#loc75) + %tmp2_21 = arith.cmpi slt, %tmp0_20, %tmp2 : tensor<64x8xi64> loc(#loc75) + %tmp3 = arith.remsi %tmp0_20, %tmp2 : tensor<64x8xi64> loc(#loc76) + %tmp3_22 = tt.splat %ks0 : i64 -> tensor<64x8xi64> loc(#loc77) + %tmp3_23 = arith.muli %tmp3_22, %tmp3 : tensor<64x8xi64> loc(#loc77) + %tmp3_24 = tt.broadcast %x0 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc78) + %tmp3_25 = arith.addi %tmp3_24, %tmp3_23 : tensor<64x8xi64> loc(#loc78) + %tmp3_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc79) + %tmp3_27 = tt.addptr %tmp3_26, %tmp3_25 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc79) + %tmp3_28 = tt.broadcast %r0_mask_12 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc80) + %tmp3_29 = arith.andi %tmp3_28, %tmp2_21 : tensor<64x8xi1> loc(#loc80) + %tmp3_30 = tt.broadcast %xmask_5 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc81) + %tmp3_31 = arith.andi %tmp3_29, %tmp3_30 : tensor<64x8xi1> loc(#loc81) + %tmp3_32 = tt.load %tmp3_27, %tmp3_31, %cst evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc82) + %tmp3_33 = arith.extf %tmp3_32 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc83) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc84) + %tmp4_34 = tt.addptr %tmp4, %tmp3_25 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc84) + %tmp4_35 = tt.load %tmp4_34, %tmp3_31, %cst evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc85) + %tmp4_36 = arith.extf %tmp4_35 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc86) + %tmp6 = tt.splat %in_ptr2 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc87) + %tmp6_37 = tt.addptr %tmp6, %tmp3 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc87) + %tmp6_38 = tt.load %tmp6_37, %tmp3_31, %cst_0 evictionPolicy = evict_last : tensor<64x8x!tt.ptr> loc(#loc88) + %tmp7 = arith.mulf %tmp4_36, %tmp6_38 : tensor<64x8xf32> loc(#loc89) + %tmp9 = arith.mulf %tmp3_33, %tmp7 : tensor<64x8xf32> loc(#loc90) + %tmp11 = arith.select %tmp2_21, %tmp9, %cst_0 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc91) + %tmp14 = arith.addf %_tmp13_10, %tmp11 : tensor<64x8xf32> loc(#loc92) + %_tmp13_39 = arith.andi %tmp3_28, %tmp3_30 : tensor<64x8xi1> loc(#loc93) + %_tmp13_40 = arith.select %_tmp13_39, %tmp14, %_tmp13_10 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc94) + scf.yield %_tmp13_40 : tensor<64x8xf32> loc(#loc40) + } loc(#loc67) + %tmp13 = "tt.reduce"(%_tmp13) <{axis = 1 : i32}> ({ + ^bb0(%tmp13_10: f32 loc(callsite(#loc1 at #loc95)), %tmp13_11: f32 loc(callsite(#loc1 at #loc95))): + %tmp13_12 = arith.addf %tmp13_10, %tmp13_11 : f32 loc(#loc99) + tt.reduce.return %tmp13_12 : f32 loc(#loc97) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc97) + %tmp13_9 = tt.expand_dims %tmp13 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc96) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc45) + %1 = tt.addptr %0, %xindex_4 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc45) + tt.store %1, %tmp13_9, %xmask_5 : tensor<64x1x!tt.ptr> loc(#loc46) + tt.return loc(#loc47) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":21:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":22:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":24:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":24:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":26:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":27:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:32) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:44) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:26) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":36:22) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":38:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:83) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:45) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:39) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:106) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:113) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:96) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":39:164) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:96) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":40:164) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":42:85) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":43:22) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":45:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":47:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":49:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":50:8) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":51:30) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:25) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:37) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z6kbhlhnd55iz3suxpzcfjhjv7p7i2zelu2nitjoegrwczbdyf.py":52:4) +#loc57 = loc("xoffset"(#loc3)) +#loc58 = loc("xoffset"(#loc4)) +#loc59 = loc("xindex"(#loc5)) +#loc60 = loc("xindex"(#loc6)) +#loc61 = loc("xindex"(#loc7)) +#loc62 = loc("xmask"(#loc8)) +#loc63 = loc("r0_base"(#loc9)) +#loc64 = loc("r0_base"(#loc10)) +#loc65 = loc("x1"(#loc11)) +#loc66 = loc("x0"(#loc12)) +#loc67 = loc("_tmp13"(#loc2)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp2"(#loc20)) +#loc76 = loc("tmp3"(#loc21)) +#loc77 = loc("tmp3"(#loc22)) +#loc78 = loc("tmp3"(#loc23)) +#loc79 = loc("tmp3"(#loc24)) +#loc80 = loc("tmp3"(#loc25)) +#loc81 = loc("tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc27)) +#loc83 = loc("tmp3"(#loc28)) +#loc84 = loc("tmp4"(#loc29)) +#loc85 = loc("tmp4"(#loc30)) +#loc86 = loc("tmp4"(#loc31)) +#loc87 = loc("tmp6"(#loc32)) +#loc88 = loc("tmp6"(#loc33)) +#loc89 = loc("tmp7"(#loc34)) +#loc90 = loc("tmp9"(#loc35)) +#loc91 = loc("tmp11"(#loc36)) +#loc92 = loc("tmp14"(#loc37)) +#loc93 = loc("_tmp13"(#loc38)) +#loc94 = loc("_tmp13"(#loc39)) +#loc96 = loc("tmp13"(#loc44)) +#loc97 = loc(callsite(#loc41 at #loc95)) +#loc99 = loc(callsite(#loc43 at #loc97)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..af96ff96e5109ee9acc80668cc39f62bb46889c0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e5a2833ebf50c1cae63a3e11830b07f25d4fedb7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f63e3783c40a4f8173585ceb6a84f45083fc2ae5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "ebfdec6a3c082a052340cf1ce53a4289947fc953ecb0b9bf91b98341040940f3", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..4ed274a01febde45b27a4342e664fd33025f1c5b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,1110 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 3, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 56, !dbg !12 + %16 = lshr exact i32 %15, 3, !dbg !12 + %17 = and i32 %14, 7, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = icmp slt i32 %18, 32, !dbg !14 + %20 = shl nuw nsw i32 %17, 1, !dbg !15 + %21 = or disjoint i32 %20, 1, !dbg !15 + %22 = shl i32 %18, 4, !dbg !16 + %23 = or disjoint i32 %22, %20, !dbg !17 + %24 = sext i32 %23 to i64, !dbg !18 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !18 + %26 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %25, i1 %19) #5, !dbg !19 + %27 = extractvalue { i64, i64 } %26, 0, !dbg !19 + %28 = extractvalue { i64, i64 } %26, 1, !dbg !19 + %29 = add i64 %27, -1, !dbg !20 + %30 = icmp ult i64 %29, 16383, !dbg !20 + %31 = add i64 %28, -1, !dbg !20 + %32 = icmp ult i64 %31, 16383, !dbg !20 + %33 = zext i1 %30 to i32, !dbg !21 + %34 = zext i1 %32 to i32, !dbg !21 + %35 = and i32 %14, 1, !dbg !22 + %36 = lshr i32 %14, 1, !dbg !22 + %.lobit = and i32 %36, 1, !dbg !22 + %37 = lshr i32 %14, 2, !dbg !22 + %.lobit1 = and i32 %37, 1, !dbg !22 + %38 = xor i32 %35, 1, !dbg !26 + %39 = xor i32 %.lobit, 1, !dbg !26 + %40 = xor i32 %.lobit1, 1, !dbg !26 + %41 = xor i1 %30, true, !dbg !27 + %42 = and i1 %32, %41, !dbg !27 + %43 = trunc i32 %14 to i1, !dbg !28 + %44 = xor i1 %42, %43, !dbg !28 + %45 = xor i32 %33, %34, !dbg !29 + %46 = select i1 %44, i32 %45, i32 0, !dbg !30 + %47 = xor i32 %46, %33, !dbg !31 + %48 = xor i32 %46, %34, !dbg !31 + %49 = xor i32 %21, %20, !dbg !32 + %50 = select i1 %44, i32 %49, i32 0, !dbg !33 + %51 = xor i32 %50, %20, !dbg !34 + %52 = xor i32 %50, %21, !dbg !34 + %53 = mul nuw nsw i32 %47, %38, !dbg !35 + %54 = mul nuw nsw i32 %48, %38, !dbg !35 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 1, i32 31), !dbg !36 + %56 = add i32 %53, %55, !dbg !39 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 1, i32 31), !dbg !36 + %58 = add i32 %54, %57, !dbg !39 + %59 = mul nuw nsw i32 %47, %35, !dbg !40 + %60 = mul nuw nsw i32 %48, %35, !dbg !40 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !36 + %62 = add i32 %59, %61, !dbg !39 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 1, i32 31), !dbg !36 + %64 = add i32 %60, %63, !dbg !39 + %65 = mul nuw nsw i32 %51, %38, !dbg !41 + %66 = mul nuw nsw i32 %52, %38, !dbg !41 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 1, i32 31), !dbg !36 + %68 = add i32 %65, %67, !dbg !39 + %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 1, i32 31), !dbg !36 + %70 = add i32 %66, %69, !dbg !39 + %71 = mul nuw nsw i32 %51, %35, !dbg !42 + %72 = mul nuw nsw i32 %52, %35, !dbg !42 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %74 = add i32 %71, %73, !dbg !39 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !36 + %76 = add i32 %72, %75, !dbg !39 + %77 = trunc i32 %36 to i1, !dbg !28 + %78 = insertelement <2 x i32> poison, i32 %56, i64 0, !dbg !28 + %79 = insertelement <2 x i32> %78, i32 %58, i64 1, !dbg !28 + %80 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !28 + %81 = insertelement <2 x i32> %80, i32 %64, i64 1, !dbg !28 + %82 = icmp sge <2 x i32> %79, %81, !dbg !28 + %83 = icmp ne <2 x i32> %79, %81, !dbg !28 + %84 = insertelement <2 x i32> poison, i32 %68, i64 0, !dbg !28 + %85 = insertelement <2 x i32> %84, i32 %70, i64 1, !dbg !28 + %86 = insertelement <2 x i32> poison, i32 %74, i64 0, !dbg !28 + %87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !28 + %88 = icmp sle <2 x i32> %85, %87, !dbg !28 + %89 = or <2 x i1> %83, %88, !dbg !28 + %90 = and <2 x i1> %82, %89, !dbg !28 + %91 = insertelement <2 x i1> poison, i1 %77, i64 0, !dbg !28 + %92 = shufflevector <2 x i1> %91, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !28 + %93 = xor <2 x i1> %90, %92, !dbg !28 + %94 = xor <2 x i32> %79, %81, !dbg !29 + %95 = select <2 x i1> %93, <2 x i32> zeroinitializer, <2 x i32> %94, !dbg !30 + %96 = insertelement <2 x i32> poison, i32 %47, i64 0, !dbg !31 + %97 = insertelement <2 x i32> %96, i32 %48, i64 1, !dbg !31 + %98 = xor <2 x i32> %95, %97, !dbg !31 + %99 = xor <2 x i32> %85, %87, !dbg !32 + %100 = select <2 x i1> %93, <2 x i32> zeroinitializer, <2 x i32> %99, !dbg !33 + %101 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !34 + %102 = insertelement <2 x i32> %101, i32 %52, i64 1, !dbg !34 + %103 = xor <2 x i32> %100, %102, !dbg !34 + %104 = extractelement <2 x i32> %98, i64 0, !dbg !28 + %105 = extractelement <2 x i32> %98, i64 1, !dbg !28 + %106 = icmp sge i32 %104, %105, !dbg !28 + %107 = icmp ne i32 %104, %105, !dbg !28 + %108 = extractelement <2 x i32> %103, i64 0, !dbg !28 + %109 = extractelement <2 x i32> %103, i64 1, !dbg !28 + %110 = icmp sle i32 %108, %109, !dbg !28 + %111 = or i1 %107, %110, !dbg !28 + %112 = and i1 %106, %111, !dbg !28 + %.not3 = xor i1 %112, %77, !dbg !28 + %113 = xor i32 %104, %105, !dbg !29 + %114 = select i1 %.not3, i32 0, i32 %113, !dbg !30 + %115 = xor i32 %108, %109, !dbg !32 + %116 = select i1 %.not3, i32 0, i32 %115, !dbg !33 + %117 = xor i32 %116, %108, !dbg !34 + %118 = xor i32 %116, %109, !dbg !34 + %119 = mul nuw nsw i32 %117, %39, !dbg !41 + %120 = mul nuw nsw i32 %118, %39, !dbg !41 + %121 = mul nuw nsw i32 %117, %.lobit, !dbg !42 + %122 = mul nuw nsw i32 %118, %.lobit, !dbg !42 + %123 = trunc i32 %37 to i1, !dbg !28 + %124 = insertelement <2 x i32> poison, i32 %114, i64 0, !dbg !31 + %125 = shufflevector <2 x i32> %124, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !31 + %126 = xor <2 x i32> %125, %98, !dbg !31 + %127 = extractelement <2 x i32> %126, i64 0, !dbg !40 + %128 = mul nuw nsw i32 %127, %39, !dbg !35 + %129 = extractelement <2 x i32> %126, i64 1, !dbg !40 + %130 = mul nuw nsw i32 %129, %39, !dbg !35 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !36 + %132 = add i32 %128, %131, !dbg !39 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !36 + %134 = add i32 %130, %133, !dbg !39 + %135 = mul nuw nsw i32 %127, %.lobit, !dbg !40 + %136 = mul nuw nsw i32 %129, %.lobit, !dbg !40 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 2, i32 31), !dbg !36 + %138 = add i32 %135, %137, !dbg !39 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 2, i32 31), !dbg !36 + %140 = add i32 %136, %139, !dbg !39 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !36 + %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !36 + %143 = insertelement <2 x i32> poison, i32 %119, i64 0, !dbg !39 + %144 = insertelement <2 x i32> %143, i32 %120, i64 1, !dbg !39 + %145 = insertelement <2 x i32> poison, i32 %141, i64 0, !dbg !39 + %146 = insertelement <2 x i32> %145, i32 %142, i64 1, !dbg !39 + %147 = add <2 x i32> %144, %146, !dbg !39 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 2, i32 31), !dbg !36 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !36 + %150 = insertelement <2 x i32> poison, i32 %121, i64 0, !dbg !39 + %151 = insertelement <2 x i32> %150, i32 %122, i64 1, !dbg !39 + %152 = insertelement <2 x i32> poison, i32 %148, i64 0, !dbg !39 + %153 = insertelement <2 x i32> %152, i32 %149, i64 1, !dbg !39 + %154 = add <2 x i32> %151, %153, !dbg !39 + %155 = insertelement <2 x i32> poison, i32 %132, i64 0, !dbg !28 + %156 = insertelement <2 x i32> %155, i32 %134, i64 1, !dbg !28 + %157 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !28 + %158 = insertelement <2 x i32> %157, i32 %140, i64 1, !dbg !28 + %159 = icmp sge <2 x i32> %156, %158, !dbg !28 + %160 = icmp ne <2 x i32> %156, %158, !dbg !28 + %161 = xor <2 x i32> %156, %158, !dbg !29 + %162 = icmp sle <2 x i32> %147, %154, !dbg !28 + %163 = or <2 x i1> %160, %162, !dbg !28 + %164 = and <2 x i1> %159, %163, !dbg !28 + %165 = insertelement <2 x i1> poison, i1 %123, i64 0, !dbg !28 + %166 = shufflevector <2 x i1> %165, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !28 + %167 = xor <2 x i1> %164, %166, !dbg !28 + %168 = select <2 x i1> %167, <2 x i32> zeroinitializer, <2 x i32> %161, !dbg !30 + %169 = xor <2 x i32> %168, %126, !dbg !31 + %170 = xor <2 x i32> %147, %154, !dbg !32 + %171 = select <2 x i1> %167, <2 x i32> zeroinitializer, <2 x i32> %170, !dbg !33 + %172 = insertelement <2 x i32> poison, i32 %117, i64 0, !dbg !34 + %173 = insertelement <2 x i32> %172, i32 %118, i64 1, !dbg !34 + %174 = xor <2 x i32> %171, %173, !dbg !34 + %175 = extractelement <2 x i32> %169, i64 0, !dbg !40 + %176 = mul nuw nsw i32 %175, %38, !dbg !35 + %177 = extractelement <2 x i32> %169, i64 1, !dbg !40 + %178 = mul nuw nsw i32 %177, %38, !dbg !35 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 1, i32 31), !dbg !36 + %180 = add i32 %176, %179, !dbg !39 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !36 + %182 = add i32 %178, %181, !dbg !39 + %183 = mul nuw nsw i32 %175, %35, !dbg !40 + %184 = mul nuw nsw i32 %177, %35, !dbg !40 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 1, i32 31), !dbg !36 + %186 = add i32 %183, %185, !dbg !39 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 1, i32 31), !dbg !36 + %188 = add i32 %184, %187, !dbg !39 + %189 = extractelement <2 x i32> %174, i64 0, !dbg !42 + %190 = mul nuw nsw i32 %189, %38, !dbg !41 + %191 = extractelement <2 x i32> %174, i64 1, !dbg !42 + %192 = mul nuw nsw i32 %191, %38, !dbg !41 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 1, i32 31), !dbg !36 + %194 = add i32 %190, %193, !dbg !39 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !36 + %196 = add i32 %192, %195, !dbg !39 + %197 = mul nuw nsw i32 %189, %35, !dbg !42 + %198 = mul nuw nsw i32 %191, %35, !dbg !42 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 1, i32 31), !dbg !36 + %200 = add i32 %197, %199, !dbg !39 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !36 + %202 = add i32 %198, %201, !dbg !39 + %203 = insertelement <2 x i32> poison, i32 %180, i64 0, !dbg !28 + %204 = insertelement <2 x i32> %203, i32 %182, i64 1, !dbg !28 + %205 = insertelement <2 x i32> poison, i32 %186, i64 0, !dbg !28 + %206 = insertelement <2 x i32> %205, i32 %188, i64 1, !dbg !28 + %207 = icmp sge <2 x i32> %204, %206, !dbg !28 + %208 = icmp ne <2 x i32> %204, %206, !dbg !28 + %209 = insertelement <2 x i32> poison, i32 %194, i64 0, !dbg !28 + %210 = insertelement <2 x i32> %209, i32 %196, i64 1, !dbg !28 + %211 = insertelement <2 x i32> poison, i32 %200, i64 0, !dbg !28 + %212 = insertelement <2 x i32> %211, i32 %202, i64 1, !dbg !28 + %213 = icmp sle <2 x i32> %210, %212, !dbg !28 + %214 = or <2 x i1> %208, %213, !dbg !28 + %215 = and <2 x i1> %207, %214, !dbg !28 + %216 = xor <2 x i1> %215, %166, !dbg !28 + %217 = xor <2 x i32> %204, %206, !dbg !29 + %218 = select <2 x i1> %216, <2 x i32> zeroinitializer, <2 x i32> %217, !dbg !30 + %219 = xor <2 x i32> %218, %169, !dbg !31 + %220 = xor <2 x i32> %210, %212, !dbg !32 + %221 = select <2 x i1> %216, <2 x i32> zeroinitializer, <2 x i32> %220, !dbg !33 + %222 = xor <2 x i32> %221, %174, !dbg !34 + %223 = extractelement <2 x i32> %219, i64 0, !dbg !28 + %224 = extractelement <2 x i32> %219, i64 1, !dbg !28 + %225 = icmp sge i32 %223, %224, !dbg !28 + %226 = icmp ne i32 %223, %224, !dbg !28 + %227 = extractelement <2 x i32> %222, i64 0, !dbg !28 + %228 = extractelement <2 x i32> %222, i64 1, !dbg !28 + %229 = icmp sle i32 %227, %228, !dbg !28 + %230 = or i1 %226, %229, !dbg !28 + %231 = and i1 %225, %230, !dbg !28 + %.not8 = xor i1 %231, %123, !dbg !28 + %232 = xor i32 %223, %224, !dbg !29 + %233 = select i1 %.not8, i32 0, i32 %232, !dbg !30 + %234 = insertelement <2 x i32> poison, i32 %233, i64 0, !dbg !31 + %235 = shufflevector <2 x i32> %234, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !31 + %236 = xor <2 x i32> %235, %219, !dbg !31 + %237 = xor i32 %227, %228, !dbg !32 + %238 = select i1 %.not8, i32 0, i32 %237, !dbg !33 + %239 = insertelement <2 x i32> poison, i32 %238, i64 0, !dbg !34 + %240 = shufflevector <2 x i32> %239, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %241 = xor <2 x i32> %240, %222, !dbg !34 + %242 = extractelement <2 x i32> %236, i64 0, !dbg !40 + %243 = mul nuw nsw i32 %242, %40, !dbg !35 + %244 = extractelement <2 x i32> %236, i64 1, !dbg !40 + %245 = mul nuw nsw i32 %244, %40, !dbg !35 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 4, i32 31), !dbg !36 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 4, i32 31), !dbg !36 + %248 = insertelement <2 x i32> poison, i32 %243, i64 0, !dbg !39 + %249 = insertelement <2 x i32> %248, i32 %245, i64 1, !dbg !39 + %250 = insertelement <2 x i32> poison, i32 %246, i64 0, !dbg !39 + %251 = insertelement <2 x i32> %250, i32 %247, i64 1, !dbg !39 + %252 = add <2 x i32> %249, %251, !dbg !39 + %253 = mul nuw nsw i32 %242, %.lobit1, !dbg !40 + %254 = mul nuw nsw i32 %244, %.lobit1, !dbg !40 + %255 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 4, i32 31), !dbg !36 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %254, i32 4, i32 31), !dbg !36 + %257 = insertelement <2 x i32> poison, i32 %253, i64 0, !dbg !39 + %258 = insertelement <2 x i32> %257, i32 %254, i64 1, !dbg !39 + %259 = insertelement <2 x i32> poison, i32 %255, i64 0, !dbg !39 + %260 = insertelement <2 x i32> %259, i32 %256, i64 1, !dbg !39 + %261 = add <2 x i32> %258, %260, !dbg !39 + %262 = extractelement <2 x i32> %241, i64 0, !dbg !42 + %263 = mul nuw nsw i32 %262, %40, !dbg !41 + %264 = extractelement <2 x i32> %241, i64 1, !dbg !42 + %265 = mul nuw nsw i32 %264, %40, !dbg !41 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 4, i32 31), !dbg !36 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 4, i32 31), !dbg !36 + %268 = insertelement <2 x i32> poison, i32 %263, i64 0, !dbg !39 + %269 = insertelement <2 x i32> %268, i32 %265, i64 1, !dbg !39 + %270 = insertelement <2 x i32> poison, i32 %266, i64 0, !dbg !39 + %271 = insertelement <2 x i32> %270, i32 %267, i64 1, !dbg !39 + %272 = add <2 x i32> %269, %271, !dbg !39 + %273 = mul nuw nsw i32 %262, %.lobit1, !dbg !42 + %274 = mul nuw nsw i32 %264, %.lobit1, !dbg !42 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 4, i32 31), !dbg !36 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 4, i32 31), !dbg !36 + %277 = insertelement <2 x i32> poison, i32 %273, i64 0, !dbg !39 + %278 = insertelement <2 x i32> %277, i32 %274, i64 1, !dbg !39 + %279 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !39 + %280 = insertelement <2 x i32> %279, i32 %276, i64 1, !dbg !39 + %281 = add <2 x i32> %278, %280, !dbg !39 + %282 = icmp slt <2 x i32> %252, %261, !dbg !27 + %283 = icmp eq <2 x i32> %252, %261, !dbg !43 + %284 = icmp sgt <2 x i32> %272, %281, !dbg !44 + %285 = and <2 x i1> %283, %284, !dbg !45 + %286 = or <2 x i1> %282, %285, !dbg !46 + %287 = xor <2 x i32> %252, %261, !dbg !29 + %288 = select <2 x i1> %286, <2 x i32> %287, <2 x i32> zeroinitializer, !dbg !30 + %289 = xor <2 x i32> %288, %236, !dbg !31 + %290 = xor <2 x i32> %272, %281, !dbg !32 + %291 = select <2 x i1> %286, <2 x i32> %290, <2 x i32> zeroinitializer, !dbg !33 + %292 = xor <2 x i32> %291, %241, !dbg !34 + %293 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !35 + %294 = shufflevector <2 x i32> %293, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %295 = mul nuw nsw <2 x i32> %289, %294, !dbg !35 + %296 = extractelement <2 x i32> %295, i64 0, !dbg !36 + %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !36 + %298 = extractelement <2 x i32> %295, i64 1, !dbg !36 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !36 + %300 = insertelement <2 x i32> poison, i32 %297, i64 0, !dbg !39 + %301 = insertelement <2 x i32> %300, i32 %299, i64 1, !dbg !39 + %302 = add <2 x i32> %295, %301, !dbg !39 + %303 = insertelement <2 x i32> poison, i32 %.lobit, i64 0, !dbg !40 + %304 = shufflevector <2 x i32> %303, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !40 + %305 = mul nuw nsw <2 x i32> %289, %304, !dbg !40 + %306 = extractelement <2 x i32> %305, i64 0, !dbg !36 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !36 + %308 = extractelement <2 x i32> %305, i64 1, !dbg !36 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !36 + %310 = insertelement <2 x i32> poison, i32 %307, i64 0, !dbg !39 + %311 = insertelement <2 x i32> %310, i32 %309, i64 1, !dbg !39 + %312 = add <2 x i32> %305, %311, !dbg !39 + %313 = extractelement <2 x i32> %292, i64 0, !dbg !42 + %314 = mul nuw nsw i32 %313, %39, !dbg !41 + %315 = extractelement <2 x i32> %292, i64 1, !dbg !42 + %316 = mul nuw nsw i32 %315, %39, !dbg !41 + %317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 2, i32 31), !dbg !36 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 2, i32 31), !dbg !36 + %319 = insertelement <2 x i32> poison, i32 %314, i64 0, !dbg !39 + %320 = insertelement <2 x i32> %319, i32 %316, i64 1, !dbg !39 + %321 = insertelement <2 x i32> poison, i32 %317, i64 0, !dbg !39 + %322 = insertelement <2 x i32> %321, i32 %318, i64 1, !dbg !39 + %323 = add <2 x i32> %320, %322, !dbg !39 + %324 = mul nuw nsw i32 %313, %.lobit, !dbg !42 + %325 = mul nuw nsw i32 %315, %.lobit, !dbg !42 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 2, i32 31), !dbg !36 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 2, i32 31), !dbg !36 + %328 = insertelement <2 x i32> poison, i32 %324, i64 0, !dbg !39 + %329 = insertelement <2 x i32> %328, i32 %325, i64 1, !dbg !39 + %330 = insertelement <2 x i32> poison, i32 %326, i64 0, !dbg !39 + %331 = insertelement <2 x i32> %330, i32 %327, i64 1, !dbg !39 + %332 = add <2 x i32> %329, %331, !dbg !39 + %333 = icmp slt <2 x i32> %302, %312, !dbg !27 + %334 = icmp eq <2 x i32> %302, %312, !dbg !43 + %335 = icmp sgt <2 x i32> %323, %332, !dbg !44 + %336 = and <2 x i1> %334, %335, !dbg !45 + %337 = or <2 x i1> %333, %336, !dbg !46 + %338 = xor <2 x i32> %302, %312, !dbg !29 + %339 = select <2 x i1> %337, <2 x i32> %338, <2 x i32> zeroinitializer, !dbg !30 + %340 = xor <2 x i32> %339, %289, !dbg !31 + %341 = xor <2 x i32> %323, %332, !dbg !32 + %342 = select <2 x i1> %337, <2 x i32> %341, <2 x i32> zeroinitializer, !dbg !33 + %343 = xor <2 x i32> %342, %292, !dbg !34 + %344 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !35 + %345 = shufflevector <2 x i32> %344, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %346 = mul nuw nsw <2 x i32> %340, %345, !dbg !35 + %347 = extractelement <2 x i32> %346, i64 0, !dbg !36 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %347, i32 1, i32 31), !dbg !36 + %349 = extractelement <2 x i32> %346, i64 1, !dbg !36 + %350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 1, i32 31), !dbg !36 + %351 = insertelement <2 x i32> poison, i32 %348, i64 0, !dbg !39 + %352 = insertelement <2 x i32> %351, i32 %350, i64 1, !dbg !39 + %353 = add <2 x i32> %346, %352, !dbg !39 + %354 = insertelement <2 x i32> poison, i32 %35, i64 0, !dbg !40 + %355 = shufflevector <2 x i32> %354, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !40 + %356 = mul nuw nsw <2 x i32> %340, %355, !dbg !40 + %357 = extractelement <2 x i32> %356, i64 0, !dbg !36 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 1, i32 31), !dbg !36 + %359 = extractelement <2 x i32> %356, i64 1, !dbg !36 + %360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 1, i32 31), !dbg !36 + %361 = insertelement <2 x i32> poison, i32 %358, i64 0, !dbg !39 + %362 = insertelement <2 x i32> %361, i32 %360, i64 1, !dbg !39 + %363 = add <2 x i32> %356, %362, !dbg !39 + %364 = mul nuw nsw <2 x i32> %343, %345, !dbg !41 + %365 = extractelement <2 x i32> %364, i64 0, !dbg !36 + %366 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %365, i32 1, i32 31), !dbg !36 + %367 = extractelement <2 x i32> %364, i64 1, !dbg !36 + %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !36 + %369 = insertelement <2 x i32> poison, i32 %366, i64 0, !dbg !39 + %370 = insertelement <2 x i32> %369, i32 %368, i64 1, !dbg !39 + %371 = add <2 x i32> %364, %370, !dbg !39 + %372 = mul nuw nsw <2 x i32> %343, %355, !dbg !42 + %373 = extractelement <2 x i32> %372, i64 0, !dbg !36 + %374 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %373, i32 1, i32 31), !dbg !36 + %375 = extractelement <2 x i32> %372, i64 1, !dbg !36 + %376 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %375, i32 1, i32 31), !dbg !36 + %377 = insertelement <2 x i32> poison, i32 %374, i64 0, !dbg !39 + %378 = insertelement <2 x i32> %377, i32 %376, i64 1, !dbg !39 + %379 = add <2 x i32> %372, %378, !dbg !39 + %380 = icmp slt <2 x i32> %353, %363, !dbg !27 + %381 = icmp eq <2 x i32> %353, %363, !dbg !43 + %382 = icmp sgt <2 x i32> %371, %379, !dbg !44 + %383 = and <2 x i1> %381, %382, !dbg !45 + %384 = or <2 x i1> %380, %383, !dbg !46 + %385 = xor <2 x i32> %353, %363, !dbg !29 + %386 = select <2 x i1> %384, <2 x i32> %385, <2 x i32> zeroinitializer, !dbg !30 + %387 = xor <2 x i32> %386, %340, !dbg !31 + %388 = xor <2 x i32> %371, %379, !dbg !32 + %389 = select <2 x i1> %384, <2 x i32> %388, <2 x i32> zeroinitializer, !dbg !33 + %390 = xor <2 x i32> %389, %343, !dbg !34 + %391 = extractelement <2 x i32> %387, i64 0, !dbg !27 + %392 = extractelement <2 x i32> %387, i64 1, !dbg !27 + %393 = icmp slt i32 %391, %392, !dbg !27 + %394 = icmp eq i32 %391, %392, !dbg !43 + %395 = extractelement <2 x i32> %390, i64 0, !dbg !44 + %396 = extractelement <2 x i32> %390, i64 1, !dbg !44 + %397 = icmp sgt i32 %395, %396, !dbg !44 + %398 = and i1 %394, %397, !dbg !45 + %399 = or i1 %393, %398, !dbg !46 + %400 = xor i32 %395, %396, !dbg !32 + %401 = select i1 %399, i32 %400, i32 0, !dbg !33 + %402 = xor i32 %401, %395, !dbg !34 + %403 = xor i32 %401, %396, !dbg !34 + %404 = icmp eq i64 %27, 16384, !dbg !47 + %405 = icmp eq i64 %28, 16384, !dbg !47 + %406 = zext i1 %404 to i32, !dbg !21 + %407 = zext i1 %405 to i32, !dbg !21 + %408 = xor i1 %404, true, !dbg !48 + %409 = and i1 %405, %408, !dbg !48 + %410 = xor i1 %409, %43, !dbg !50 + %411 = xor i32 %406, %407, !dbg !51 + %412 = select i1 %410, i32 %411, i32 0, !dbg !52 + %413 = xor i32 %412, %406, !dbg !53 + %414 = xor i32 %412, %407, !dbg !53 + %415 = select i1 %410, i32 %49, i32 0, !dbg !54 + %416 = xor i32 %415, %20, !dbg !55 + %417 = xor i32 %415, %21, !dbg !55 + %418 = mul nuw nsw i32 %413, %38, !dbg !56 + %419 = mul nuw nsw i32 %414, %38, !dbg !56 + %420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %418, i32 1, i32 31), !dbg !57 + %421 = add i32 %420, %418, !dbg !58 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %419, i32 1, i32 31), !dbg !57 + %423 = add i32 %422, %419, !dbg !58 + %424 = mul nuw nsw i32 %413, %35, !dbg !59 + %425 = mul nuw nsw i32 %414, %35, !dbg !59 + %426 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !57 + %427 = add i32 %426, %424, !dbg !58 + %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %425, i32 1, i32 31), !dbg !57 + %429 = add i32 %428, %425, !dbg !58 + %430 = mul nuw nsw i32 %416, %38, !dbg !60 + %431 = mul nuw nsw i32 %417, %38, !dbg !60 + %432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %430, i32 1, i32 31), !dbg !57 + %433 = add i32 %432, %430, !dbg !58 + %434 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %431, i32 1, i32 31), !dbg !57 + %435 = add i32 %434, %431, !dbg !58 + %436 = mul nuw nsw i32 %416, %35, !dbg !61 + %437 = mul nuw nsw i32 %417, %35, !dbg !61 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %436, i32 1, i32 31), !dbg !57 + %439 = add i32 %438, %436, !dbg !58 + %440 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 1, i32 31), !dbg !57 + %441 = add i32 %440, %437, !dbg !58 + %442 = insertelement <2 x i32> poison, i32 %421, i64 0, !dbg !50 + %443 = insertelement <2 x i32> %442, i32 %423, i64 1, !dbg !50 + %444 = insertelement <2 x i32> poison, i32 %427, i64 0, !dbg !50 + %445 = insertelement <2 x i32> %444, i32 %429, i64 1, !dbg !50 + %446 = icmp sge <2 x i32> %443, %445, !dbg !50 + %447 = icmp ne <2 x i32> %443, %445, !dbg !50 + %448 = insertelement <2 x i32> poison, i32 %433, i64 0, !dbg !50 + %449 = insertelement <2 x i32> %448, i32 %435, i64 1, !dbg !50 + %450 = insertelement <2 x i32> poison, i32 %439, i64 0, !dbg !50 + %451 = insertelement <2 x i32> %450, i32 %441, i64 1, !dbg !50 + %452 = icmp sle <2 x i32> %449, %451, !dbg !50 + %453 = or <2 x i1> %447, %452, !dbg !50 + %454 = and <2 x i1> %446, %453, !dbg !50 + %455 = xor <2 x i1> %454, %92, !dbg !50 + %456 = insertelement <2 x i32> %451, i32 %429, i64 1, !dbg !51 + %457 = insertelement <2 x i32> %449, i32 %423, i64 1, !dbg !51 + %458 = xor <2 x i32> %456, %457, !dbg !51 + %459 = insertelement <2 x i32> poison, i32 %441, i64 0, !dbg !51 + %460 = insertelement <2 x i32> %459, i32 %427, i64 1, !dbg !51 + %461 = insertelement <2 x i32> poison, i32 %435, i64 0, !dbg !51 + %462 = insertelement <2 x i32> %461, i32 %421, i64 1, !dbg !51 + %463 = xor <2 x i32> %460, %462, !dbg !51 + %464 = select <2 x i1> %455, <2 x i32> zeroinitializer, <2 x i32> %458, !dbg !52 + %465 = shufflevector <2 x i1> %455, <2 x i1> poison, <2 x i32> , !dbg !52 + %466 = select <2 x i1> %465, <2 x i32> zeroinitializer, <2 x i32> %463, !dbg !52 + %467 = insertelement <2 x i32> poison, i32 %416, i64 0, !dbg !53 + %468 = insertelement <2 x i32> %467, i32 %414, i64 1, !dbg !53 + %469 = xor <2 x i32> %464, %468, !dbg !53 + %470 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !53 + %471 = insertelement <2 x i32> %470, i32 %413, i64 1, !dbg !53 + %472 = xor <2 x i32> %466, %471, !dbg !53 + %473 = extractelement <2 x i32> %469, i64 1, !dbg !50 + %474 = extractelement <2 x i32> %472, i64 1, !dbg !50 + %475 = icmp sge i32 %474, %473, !dbg !50 + %476 = icmp ne <2 x i32> %469, %472, !dbg !50 + %477 = icmp sle <2 x i32> %469, %472, !dbg !50 + %shift = shufflevector <2 x i1> %476, <2 x i1> poison, <2 x i32> , !dbg !50 + %foldExtExtBinop = or <2 x i1> %shift, %477, !dbg !50 + %478 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !50 + %479 = and i1 %475, %478, !dbg !50 + %.not12 = xor i1 %479, %77, !dbg !50 + %480 = xor i32 %473, %474, !dbg !51 + %481 = select i1 %.not12, i32 0, i32 %480, !dbg !52 + %482 = xor i32 %481, %474, !dbg !53 + %483 = xor i32 %481, %473, !dbg !53 + %484 = extractelement <2 x i32> %469, i64 0, !dbg !62 + %485 = extractelement <2 x i32> %472, i64 0, !dbg !62 + %486 = xor i32 %485, %484, !dbg !62 + %487 = select i1 %.not12, i32 0, i32 %486, !dbg !54 + %488 = xor i32 %487, %484, !dbg !55 + %489 = xor i32 %487, %485, !dbg !55 + %490 = mul nuw nsw i32 %482, %39, !dbg !56 + %491 = mul nuw nsw i32 %483, %39, !dbg !56 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %490, i32 2, i32 31), !dbg !57 + %493 = add i32 %490, %492, !dbg !58 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57 + %495 = add i32 %491, %494, !dbg !58 + %496 = mul nuw nsw i32 %482, %.lobit, !dbg !59 + %497 = mul nuw nsw i32 %483, %.lobit, !dbg !59 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 2, i32 31), !dbg !57 + %499 = add i32 %496, %498, !dbg !58 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 2, i32 31), !dbg !57 + %501 = add i32 %497, %500, !dbg !58 + %502 = mul nuw nsw i32 %488, %39, !dbg !60 + %503 = mul nuw nsw i32 %489, %39, !dbg !60 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %502, i32 2, i32 31), !dbg !57 + %505 = add i32 %502, %504, !dbg !58 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %503, i32 2, i32 31), !dbg !57 + %507 = add i32 %503, %506, !dbg !58 + %508 = mul nuw nsw i32 %488, %.lobit, !dbg !61 + %509 = mul nuw nsw i32 %489, %.lobit, !dbg !61 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %508, i32 2, i32 31), !dbg !57 + %511 = add i32 %508, %510, !dbg !58 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 2, i32 31), !dbg !57 + %513 = add i32 %509, %512, !dbg !58 + %514 = insertelement <2 x i32> poison, i32 %493, i64 0, !dbg !50 + %515 = insertelement <2 x i32> %514, i32 %495, i64 1, !dbg !50 + %516 = insertelement <2 x i32> poison, i32 %499, i64 0, !dbg !50 + %517 = insertelement <2 x i32> %516, i32 %501, i64 1, !dbg !50 + %518 = icmp sge <2 x i32> %515, %517, !dbg !50 + %519 = icmp ne <2 x i32> %515, %517, !dbg !50 + %520 = xor i32 %493, %499, !dbg !51 + %521 = xor i32 %495, %501, !dbg !51 + %522 = insertelement <2 x i32> poison, i32 %505, i64 0, !dbg !50 + %523 = insertelement <2 x i32> %522, i32 %507, i64 1, !dbg !50 + %524 = insertelement <2 x i32> poison, i32 %511, i64 0, !dbg !50 + %525 = insertelement <2 x i32> %524, i32 %513, i64 1, !dbg !50 + %526 = icmp sle <2 x i32> %523, %525, !dbg !50 + %527 = or <2 x i1> %519, %526, !dbg !50 + %528 = and <2 x i1> %518, %527, !dbg !50 + %529 = xor <2 x i1> %528, %166, !dbg !50 + %530 = extractelement <2 x i1> %529, i64 0, !dbg !52 + %531 = select i1 %530, i32 0, i32 %520, !dbg !52 + %532 = extractelement <2 x i1> %529, i64 1, !dbg !52 + %533 = select i1 %532, i32 0, i32 %521, !dbg !52 + %534 = xor i32 %531, %482, !dbg !53 + %535 = xor i32 %533, %483, !dbg !53 + %536 = xor <2 x i32> %523, %525, !dbg !62 + %537 = select <2 x i1> %529, <2 x i32> zeroinitializer, <2 x i32> %536, !dbg !54 + %538 = insertelement <2 x i32> poison, i32 %488, i64 0, !dbg !55 + %539 = insertelement <2 x i32> %538, i32 %489, i64 1, !dbg !55 + %540 = xor <2 x i32> %537, %539, !dbg !55 + %541 = mul nuw nsw i32 %534, %38, !dbg !56 + %542 = mul nuw nsw i32 %535, %38, !dbg !56 + %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 1, i32 31), !dbg !57 + %544 = add i32 %541, %543, !dbg !58 + %545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 1, i32 31), !dbg !57 + %546 = add i32 %542, %545, !dbg !58 + %547 = mul nuw nsw i32 %534, %35, !dbg !59 + %548 = mul nuw nsw i32 %535, %35, !dbg !59 + %549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %547, i32 1, i32 31), !dbg !57 + %550 = add i32 %547, %549, !dbg !58 + %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %548, i32 1, i32 31), !dbg !57 + %552 = add i32 %548, %551, !dbg !58 + %553 = extractelement <2 x i32> %540, i64 0, !dbg !61 + %554 = mul nuw nsw i32 %553, %38, !dbg !60 + %555 = extractelement <2 x i32> %540, i64 1, !dbg !61 + %556 = mul nuw nsw i32 %555, %38, !dbg !60 + %557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %554, i32 1, i32 31), !dbg !57 + %558 = add i32 %554, %557, !dbg !58 + %559 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %556, i32 1, i32 31), !dbg !57 + %560 = add i32 %556, %559, !dbg !58 + %561 = mul nuw nsw i32 %553, %35, !dbg !61 + %562 = mul nuw nsw i32 %555, %35, !dbg !61 + %563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %561, i32 1, i32 31), !dbg !57 + %564 = add i32 %561, %563, !dbg !58 + %565 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 1, i32 31), !dbg !57 + %566 = add i32 %562, %565, !dbg !58 + %567 = insertelement <2 x i32> poison, i32 %544, i64 0, !dbg !50 + %568 = insertelement <2 x i32> %567, i32 %546, i64 1, !dbg !50 + %569 = insertelement <2 x i32> poison, i32 %550, i64 0, !dbg !50 + %570 = insertelement <2 x i32> %569, i32 %552, i64 1, !dbg !50 + %571 = icmp sge <2 x i32> %568, %570, !dbg !50 + %572 = icmp ne <2 x i32> %568, %570, !dbg !50 + %573 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !50 + %574 = insertelement <2 x i32> %573, i32 %560, i64 1, !dbg !50 + %575 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !50 + %576 = insertelement <2 x i32> %575, i32 %566, i64 1, !dbg !50 + %577 = icmp sle <2 x i32> %574, %576, !dbg !50 + %578 = or <2 x i1> %572, %577, !dbg !50 + %579 = and <2 x i1> %571, %578, !dbg !50 + %580 = xor <2 x i1> %579, %166, !dbg !50 + %581 = xor <2 x i32> %568, %570, !dbg !51 + %582 = select <2 x i1> %580, <2 x i32> zeroinitializer, <2 x i32> %581, !dbg !52 + %583 = insertelement <2 x i32> poison, i32 %534, i64 0, !dbg !53 + %584 = insertelement <2 x i32> %583, i32 %535, i64 1, !dbg !53 + %585 = xor <2 x i32> %582, %584, !dbg !53 + %586 = xor <2 x i32> %574, %576, !dbg !62 + %587 = select <2 x i1> %580, <2 x i32> zeroinitializer, <2 x i32> %586, !dbg !54 + %588 = xor <2 x i32> %587, %540, !dbg !55 + %589 = extractelement <2 x i32> %585, i64 0, !dbg !50 + %590 = extractelement <2 x i32> %585, i64 1, !dbg !50 + %591 = icmp sge i32 %589, %590, !dbg !50 + %592 = icmp ne i32 %589, %590, !dbg !50 + %593 = extractelement <2 x i32> %588, i64 0, !dbg !50 + %594 = extractelement <2 x i32> %588, i64 1, !dbg !50 + %595 = icmp sle i32 %593, %594, !dbg !50 + %596 = or i1 %592, %595, !dbg !50 + %597 = and i1 %591, %596, !dbg !50 + %.not17 = xor i1 %597, %123, !dbg !50 + %598 = xor i32 %589, %590, !dbg !51 + %599 = select i1 %.not17, i32 0, i32 %598, !dbg !52 + %600 = xor i32 %599, %589, !dbg !53 + %601 = xor i32 %599, %590, !dbg !53 + %602 = xor i32 %593, %594, !dbg !62 + %603 = select i1 %.not17, i32 0, i32 %602, !dbg !54 + %604 = xor i32 %603, %593, !dbg !55 + %605 = xor i32 %603, %594, !dbg !55 + %606 = mul nuw nsw i32 %600, %40, !dbg !56 + %607 = mul nuw nsw i32 %601, %40, !dbg !56 + %608 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %606, i32 4, i32 31), !dbg !57 + %609 = add i32 %606, %608, !dbg !58 + %610 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %607, i32 4, i32 31), !dbg !57 + %611 = add i32 %607, %610, !dbg !58 + %612 = mul nuw nsw i32 %600, %.lobit1, !dbg !59 + %613 = mul nuw nsw i32 %601, %.lobit1, !dbg !59 + %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %612, i32 4, i32 31), !dbg !57 + %615 = add i32 %612, %614, !dbg !58 + %616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !57 + %617 = add i32 %613, %616, !dbg !58 + %618 = mul nuw nsw i32 %604, %40, !dbg !60 + %619 = mul nuw nsw i32 %605, %40, !dbg !60 + %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %618, i32 4, i32 31), !dbg !57 + %621 = add i32 %618, %620, !dbg !58 + %622 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !57 + %623 = add i32 %619, %622, !dbg !58 + %624 = mul nuw nsw i32 %604, %.lobit1, !dbg !61 + %625 = mul nuw nsw i32 %605, %.lobit1, !dbg !61 + %626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %624, i32 4, i32 31), !dbg !57 + %627 = add i32 %624, %626, !dbg !58 + %628 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 4, i32 31), !dbg !57 + %629 = add i32 %625, %628, !dbg !58 + %630 = insertelement <2 x i32> poison, i32 %609, i64 0, !dbg !48 + %631 = insertelement <2 x i32> %630, i32 %611, i64 1, !dbg !48 + %632 = insertelement <2 x i32> poison, i32 %615, i64 0, !dbg !48 + %633 = insertelement <2 x i32> %632, i32 %617, i64 1, !dbg !48 + %634 = icmp slt <2 x i32> %631, %633, !dbg !48 + %635 = icmp eq <2 x i32> %631, %633, !dbg !63 + %636 = insertelement <2 x i32> poison, i32 %621, i64 0, !dbg !64 + %637 = insertelement <2 x i32> %636, i32 %623, i64 1, !dbg !64 + %638 = insertelement <2 x i32> poison, i32 %627, i64 0, !dbg !64 + %639 = insertelement <2 x i32> %638, i32 %629, i64 1, !dbg !64 + %640 = icmp sgt <2 x i32> %637, %639, !dbg !64 + %641 = and <2 x i1> %635, %640, !dbg !65 + %642 = or <2 x i1> %634, %641, !dbg !66 + %643 = xor <2 x i32> %631, %633, !dbg !51 + %644 = select <2 x i1> %642, <2 x i32> %643, <2 x i32> zeroinitializer, !dbg !52 + %645 = insertelement <2 x i32> poison, i32 %600, i64 0, !dbg !53 + %646 = insertelement <2 x i32> %645, i32 %601, i64 1, !dbg !53 + %647 = xor <2 x i32> %644, %646, !dbg !53 + %648 = xor i32 %621, %627, !dbg !62 + %649 = xor i32 %623, %629, !dbg !62 + %650 = extractelement <2 x i1> %642, i64 0, !dbg !54 + %651 = select i1 %650, i32 %648, i32 0, !dbg !54 + %652 = extractelement <2 x i1> %642, i64 1, !dbg !54 + %653 = select i1 %652, i32 %649, i32 0, !dbg !54 + %654 = xor i32 %651, %604, !dbg !55 + %655 = xor i32 %653, %605, !dbg !55 + %656 = extractelement <2 x i32> %647, i64 0, !dbg !59 + %657 = mul nuw nsw i32 %656, %39, !dbg !56 + %658 = extractelement <2 x i32> %647, i64 1, !dbg !59 + %659 = mul nuw nsw i32 %658, %39, !dbg !56 + %660 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %657, i32 2, i32 31), !dbg !57 + %661 = add i32 %657, %660, !dbg !58 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %659, i32 2, i32 31), !dbg !57 + %663 = add i32 %659, %662, !dbg !58 + %664 = mul nuw nsw i32 %656, %.lobit, !dbg !59 + %665 = mul nuw nsw i32 %658, %.lobit, !dbg !59 + %666 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %664, i32 2, i32 31), !dbg !57 + %667 = add i32 %664, %666, !dbg !58 + %668 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %665, i32 2, i32 31), !dbg !57 + %669 = add i32 %665, %668, !dbg !58 + %670 = mul nuw nsw i32 %654, %39, !dbg !60 + %671 = mul nuw nsw i32 %655, %39, !dbg !60 + %672 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %670, i32 2, i32 31), !dbg !57 + %673 = add i32 %670, %672, !dbg !58 + %674 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %671, i32 2, i32 31), !dbg !57 + %675 = add i32 %671, %674, !dbg !58 + %676 = mul nuw nsw i32 %654, %.lobit, !dbg !61 + %677 = mul nuw nsw i32 %655, %.lobit, !dbg !61 + %678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 2, i32 31), !dbg !57 + %679 = add i32 %676, %678, !dbg !58 + %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 2, i32 31), !dbg !57 + %681 = add i32 %677, %680, !dbg !58 + %682 = insertelement <2 x i32> poison, i32 %661, i64 0, !dbg !48 + %683 = insertelement <2 x i32> %682, i32 %663, i64 1, !dbg !48 + %684 = insertelement <2 x i32> poison, i32 %667, i64 0, !dbg !48 + %685 = insertelement <2 x i32> %684, i32 %669, i64 1, !dbg !48 + %686 = icmp slt <2 x i32> %683, %685, !dbg !48 + %687 = icmp eq <2 x i32> %683, %685, !dbg !63 + %688 = insertelement <2 x i32> poison, i32 %673, i64 0, !dbg !64 + %689 = insertelement <2 x i32> %688, i32 %675, i64 1, !dbg !64 + %690 = insertelement <2 x i32> poison, i32 %679, i64 0, !dbg !64 + %691 = insertelement <2 x i32> %690, i32 %681, i64 1, !dbg !64 + %692 = icmp sgt <2 x i32> %689, %691, !dbg !64 + %693 = and <2 x i1> %687, %692, !dbg !65 + %694 = or <2 x i1> %686, %693, !dbg !66 + %695 = xor <2 x i32> %683, %685, !dbg !51 + %696 = select <2 x i1> %694, <2 x i32> %695, <2 x i32> zeroinitializer, !dbg !52 + %697 = xor <2 x i32> %696, %647, !dbg !53 + %698 = xor i32 %673, %679, !dbg !62 + %699 = xor i32 %675, %681, !dbg !62 + %700 = extractelement <2 x i1> %694, i64 0, !dbg !54 + %701 = select i1 %700, i32 %698, i32 0, !dbg !54 + %702 = extractelement <2 x i1> %694, i64 1, !dbg !54 + %703 = select i1 %702, i32 %699, i32 0, !dbg !54 + %704 = xor i32 %701, %654, !dbg !55 + %705 = xor i32 %703, %655, !dbg !55 + %706 = extractelement <2 x i32> %697, i64 0, !dbg !59 + %707 = mul nuw nsw i32 %706, %38, !dbg !56 + %708 = extractelement <2 x i32> %697, i64 1, !dbg !59 + %709 = mul nuw nsw i32 %708, %38, !dbg !56 + %710 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 1, i32 31), !dbg !57 + %711 = add i32 %707, %710, !dbg !58 + %712 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %709, i32 1, i32 31), !dbg !57 + %713 = add i32 %709, %712, !dbg !58 + %714 = mul nuw nsw i32 %706, %35, !dbg !59 + %715 = mul nuw nsw i32 %708, %35, !dbg !59 + %716 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 1, i32 31), !dbg !57 + %717 = add i32 %714, %716, !dbg !58 + %718 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %715, i32 1, i32 31), !dbg !57 + %719 = add i32 %715, %718, !dbg !58 + %720 = mul nuw nsw i32 %704, %38, !dbg !60 + %721 = mul nuw nsw i32 %705, %38, !dbg !60 + %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %720, i32 1, i32 31), !dbg !57 + %723 = add i32 %720, %722, !dbg !58 + %724 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !57 + %725 = add i32 %721, %724, !dbg !58 + %726 = mul nuw nsw i32 %704, %35, !dbg !61 + %727 = mul nuw nsw i32 %705, %35, !dbg !61 + %728 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %726, i32 1, i32 31), !dbg !57 + %729 = add i32 %726, %728, !dbg !58 + %730 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %727, i32 1, i32 31), !dbg !57 + %731 = add i32 %727, %730, !dbg !58 + %732 = insertelement <2 x i32> poison, i32 %711, i64 0, !dbg !48 + %733 = insertelement <2 x i32> %732, i32 %713, i64 1, !dbg !48 + %734 = insertelement <2 x i32> poison, i32 %717, i64 0, !dbg !48 + %735 = insertelement <2 x i32> %734, i32 %719, i64 1, !dbg !48 + %736 = icmp slt <2 x i32> %733, %735, !dbg !48 + %737 = icmp eq <2 x i32> %733, %735, !dbg !63 + %738 = insertelement <2 x i32> poison, i32 %723, i64 0, !dbg !64 + %739 = insertelement <2 x i32> %738, i32 %725, i64 1, !dbg !64 + %740 = insertelement <2 x i32> poison, i32 %729, i64 0, !dbg !64 + %741 = insertelement <2 x i32> %740, i32 %731, i64 1, !dbg !64 + %742 = icmp sgt <2 x i32> %739, %741, !dbg !64 + %743 = and <2 x i1> %737, %742, !dbg !65 + %744 = or <2 x i1> %736, %743, !dbg !66 + %745 = xor <2 x i32> %733, %735, !dbg !51 + %746 = select <2 x i1> %744, <2 x i32> %745, <2 x i32> zeroinitializer, !dbg !52 + %747 = xor <2 x i32> %746, %697, !dbg !53 + %748 = xor i32 %723, %729, !dbg !62 + %749 = xor i32 %725, %731, !dbg !62 + %750 = extractelement <2 x i1> %744, i64 0, !dbg !54 + %751 = select i1 %750, i32 %748, i32 0, !dbg !54 + %752 = extractelement <2 x i1> %744, i64 1, !dbg !54 + %753 = select i1 %752, i32 %749, i32 0, !dbg !54 + %754 = xor i32 %751, %704, !dbg !55 + %755 = xor i32 %753, %705, !dbg !55 + %756 = extractelement <2 x i32> %747, i64 0, !dbg !48 + %757 = extractelement <2 x i32> %747, i64 1, !dbg !48 + %758 = icmp slt i32 %756, %757, !dbg !48 + %759 = icmp eq i32 %756, %757, !dbg !63 + %760 = icmp sgt i32 %754, %755, !dbg !64 + %761 = and i1 %759, %760, !dbg !65 + %762 = or i1 %758, %761, !dbg !66 + %763 = xor i32 %754, %755, !dbg !62 + %764 = select i1 %762, i32 %763, i32 0, !dbg !54 + %765 = xor i32 %764, %754, !dbg !55 + %766 = xor i32 %764, %755, !dbg !55 + %narrow = select i1 %19, i1 %30, i1 false, !dbg !67 + %767 = zext i1 %narrow to i64, !dbg !67 + %narrow18 = select i1 %19, i1 %32, i1 false, !dbg !67 + %768 = zext i1 %narrow18 to i64, !dbg !67 + %769 = add nuw nsw i64 %767, %768, !dbg !68 + %770 = trunc nuw nsw i64 %769 to i32, !dbg !70 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 4, i32 31), !dbg !70 + %772 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 4, i32 31), !dbg !70 + %773 = insertelement <2 x i32> poison, i32 %771, i64 0, !dbg !70 + %774 = insertelement <2 x i32> %773, i32 %772, i64 1, !dbg !70 + %775 = bitcast <2 x i32> %774 to i64, !dbg !70 + %776 = add i64 %769, %775, !dbg !68 + %extelt.offset = lshr i64 %776, 32, !dbg !70 + %777 = trunc nuw i64 %extelt.offset to i32, !dbg !70 + %778 = trunc i64 %776 to i32, !dbg !70 + %779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 2, i32 31), !dbg !70 + %780 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %777, i32 2, i32 31), !dbg !70 + %781 = insertelement <2 x i32> poison, i32 %779, i64 0, !dbg !70 + %782 = insertelement <2 x i32> %781, i32 %780, i64 1, !dbg !70 + %783 = bitcast <2 x i32> %782 to i64, !dbg !70 + %784 = add i64 %776, %783, !dbg !68 + %extelt.offset19 = lshr i64 %784, 32, !dbg !70 + %785 = trunc nuw i64 %extelt.offset19 to i32, !dbg !70 + %786 = trunc i64 %784 to i32, !dbg !70 + %787 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %786, i32 1, i32 31), !dbg !70 + %788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !70 + %789 = insertelement <2 x i32> poison, i32 %787, i64 0, !dbg !70 + %790 = insertelement <2 x i32> %789, i32 %788, i64 1, !dbg !70 + %791 = bitcast <2 x i32> %790 to i64, !dbg !70 + %792 = add i64 %784, %791, !dbg !68 + %793 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %15, !dbg !71 + %794 = insertelement <1 x i64> poison, i64 %792, i64 0, !dbg !71 + store <1 x i64> %794, ptr addrspace(3) %793, align 8, !dbg !71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !71 + %795 = shl nuw nsw i32 %17, 3, !dbg !71 + %796 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %795, !dbg !71 + %797 = load i64, ptr addrspace(3) %796, align 8, !dbg !71 + %narrow20 = select i1 %19, i1 %404, i1 false, !dbg !72 + %798 = zext i1 %narrow20 to i64, !dbg !72 + %narrow21 = select i1 %19, i1 %405, i1 false, !dbg !72 + %799 = zext i1 %narrow21 to i64, !dbg !72 + %800 = add nuw nsw i64 %798, %799, !dbg !73 + %801 = trunc nuw nsw i64 %800 to i32, !dbg !75 + %802 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %801, i32 4, i32 31), !dbg !75 + %803 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 4, i32 31), !dbg !75 + %804 = insertelement <2 x i32> poison, i32 %802, i64 0, !dbg !75 + %805 = insertelement <2 x i32> %804, i32 %803, i64 1, !dbg !75 + %806 = bitcast <2 x i32> %805 to i64, !dbg !75 + %807 = add i64 %800, %806, !dbg !73 + %extelt.offset23 = lshr i64 %807, 32, !dbg !75 + %808 = trunc nuw i64 %extelt.offset23 to i32, !dbg !75 + %809 = trunc i64 %807 to i32, !dbg !75 + %810 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %809, i32 2, i32 31), !dbg !75 + %811 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %808, i32 2, i32 31), !dbg !75 + %812 = insertelement <2 x i32> poison, i32 %810, i64 0, !dbg !75 + %813 = insertelement <2 x i32> %812, i32 %811, i64 1, !dbg !75 + %814 = bitcast <2 x i32> %813 to i64, !dbg !75 + %815 = add i64 %807, %814, !dbg !73 + %extelt.offset24 = lshr i64 %815, 32, !dbg !75 + %816 = trunc nuw i64 %extelt.offset24 to i32, !dbg !75 + %817 = trunc i64 %815 to i32, !dbg !75 + %818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %817, i32 1, i32 31), !dbg !75 + %819 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %816, i32 1, i32 31), !dbg !75 + %820 = insertelement <2 x i32> poison, i32 %818, i64 0, !dbg !75 + %821 = insertelement <2 x i32> %820, i32 %819, i64 1, !dbg !75 + %822 = bitcast <2 x i32> %821 to i64, !dbg !75 + %823 = add i64 %815, %822, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !76 + %824 = insertelement <1 x i64> poison, i64 %823, i64 0, !dbg !76 + store <1 x i64> %824, ptr addrspace(3) %793, align 8, !dbg !76 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !76 + %825 = load i64, ptr addrspace(3) %796, align 8, !dbg !76 + %826 = trunc i64 %792 to i32, !dbg !71 + %827 = icmp slt i32 %20, %826, !dbg !77 + %828 = icmp slt i32 %21, %826, !dbg !77 + %829 = select i1 %827, i32 %402, i32 16, !dbg !78 + %830 = select i1 %828, i32 %403, i32 16, !dbg !78 + %831 = add i32 %829, 17, !dbg !79 + %832 = add i32 %830, 17, !dbg !79 + %833 = icmp slt i32 %829, 0, !dbg !80 + %834 = icmp slt i32 %830, 0, !dbg !80 + %835 = select i1 %833, i32 %831, i32 %829, !dbg !81 + %836 = select i1 %834, i32 %832, i32 %830, !dbg !81 + %837 = icmp ugt i32 %835, 16, !dbg !82 + %838 = icmp ugt i32 %836, 16, !dbg !82 + %.not2629 = or i1 %837, %838, !dbg !83 + %839 = and i1 %19, %.not2629, !dbg !83 + br i1 %839, label %840, label %841, !dbg !83 + +840: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !83 + unreachable, !dbg !83 + +841: ; preds = %11 + %842 = trunc i64 %823 to i32, !dbg !76 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !83 + %843 = icmp slt i32 %20, %842, !dbg !84 + %844 = icmp slt i32 %21, %842, !dbg !84 + %845 = select i1 %843, i32 %765, i32 16, !dbg !85 + %846 = select i1 %844, i32 %766, i32 16, !dbg !85 + %847 = add i32 %845, 17, !dbg !86 + %848 = add i32 %846, 17, !dbg !86 + %849 = icmp slt i32 %845, 0, !dbg !87 + %850 = icmp slt i32 %846, 0, !dbg !87 + %851 = select i1 %849, i32 %847, i32 %845, !dbg !88 + %852 = select i1 %850, i32 %848, i32 %846, !dbg !88 + %853 = icmp ugt i32 %851, 16, !dbg !89 + %854 = icmp ugt i32 %852, 16, !dbg !89 + %.not3134 = or i1 %853, %854, !dbg !90 + %855 = and i1 %19, %.not3134, !dbg !90 + br i1 %855, label %856, label %857, !dbg !90 + +856: ; preds = %841 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !90 + unreachable, !dbg !90 + +857: ; preds = %841 + %858 = trunc i64 %825 to i32, !dbg !76 + %859 = trunc i64 %797 to i32, !dbg !71 + %860 = or disjoint i32 %13, %17, !dbg !13 + %861 = icmp slt i32 %860, 32, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + %862 = sext i32 %860 to i64, !dbg !91 + %863 = getelementptr i32, ptr addrspace(1) %1, i64 %862, !dbg !91 + %864 = icmp eq i32 %15, 0, !dbg !92 + %865 = and i1 %864, %861, !dbg !92 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %859, ptr addrspace(1) %863, i1 %865) #5, !dbg !92 + %866 = getelementptr i32, ptr addrspace(1) %2, i64 %862, !dbg !93 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %858, ptr addrspace(1) %866, i1 %865) #5, !dbg !94 + %867 = getelementptr i32, ptr addrspace(1) %3, i64 %24, !dbg !95 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %402, i32 %403, ptr addrspace(1) %867, i1 %19) #5, !dbg !96 + %868 = mul i32 %18, 17, !dbg !97 + %869 = add i32 %835, %868, !dbg !98 + %870 = add i32 %836, %868, !dbg !98 + %871 = sext i32 %869 to i64, !dbg !99 + %872 = getelementptr i32, ptr addrspace(1) %4, i64 %871, !dbg !99 + %873 = sext i32 %870 to i64, !dbg !99 + %874 = getelementptr i32, ptr addrspace(1) %4, i64 %873, !dbg !99 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %875 = ptrtoint ptr addrspace(1) %872 to i64, !dbg !100 + %876 = ptrtoint ptr addrspace(1) %874 to i64, !dbg !100 + %877 = shl nuw nsw i32 %14, 5, !dbg !100 + %878 = and i32 %877, 96, !dbg !100 + %879 = and i32 %14, 48, !dbg !100 + %880 = shl nuw nsw i32 %879, 3, !dbg !100 + %881 = shl nuw nsw i32 %14, 1, !dbg !100 + %882 = and i32 %881, 120, !dbg !100 + %883 = or disjoint i32 %878, %880, !dbg !100 + %884 = xor i32 %883, %882, !dbg !100 + %885 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %884, !dbg !100 + %886 = insertelement <1 x i64> poison, i64 %875, i64 0, !dbg !100 + store <1 x i64> %886, ptr addrspace(3) %885, align 8, !dbg !100 + %887 = xor i32 %884, 520, !dbg !100 + %888 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %887, !dbg !100 + %889 = insertelement <1 x i64> poison, i64 %876, i64 0, !dbg !100 + store <1 x i64> %889, ptr addrspace(3) %888, align 8, !dbg !100 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %890 = shl nuw nsw i32 %14, 6, !dbg !100 + %891 = and i32 %890, 384, !dbg !100 + %892 = shl nuw nsw i32 %17, 4, !dbg !100 + %893 = shl nuw nsw i32 %879, 1, !dbg !100 + %894 = and i32 %14, 8, !dbg !100 + %895 = icmp eq i32 %894, 0, !dbg !100 + %896 = select i1 %895, i32 0, i32 520, !dbg !100 + %897 = or disjoint i32 %891, %892, !dbg !100 + %898 = xor i32 %897, %893, !dbg !100 + %899 = or disjoint i32 %898, %896, !dbg !100 + %900 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %899, !dbg !100 + %901 = load i64, ptr addrspace(3) %900, align 8, !dbg !100 + %902 = xor i32 %899, 8, !dbg !100 + %903 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %902, !dbg !100 + %904 = load i64, ptr addrspace(3) %903, align 8, !dbg !100 + %905 = inttoptr i64 %901 to ptr addrspace(1), !dbg !100 + %906 = inttoptr i64 %904 to ptr addrspace(1), !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %905, i1 %861) #5, !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %906, i1 %861) #5, !dbg !100 + %907 = getelementptr i32, ptr addrspace(1) %5, i64 %24, !dbg !101 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %765, i32 %766, ptr addrspace(1) %907, i1 %19) #5, !dbg !102 + %908 = add i32 %851, %868, !dbg !103 + %909 = add i32 %852, %868, !dbg !103 + %910 = sext i32 %908 to i64, !dbg !104 + %911 = getelementptr i32, ptr addrspace(1) %6, i64 %910, !dbg !104 + %912 = sext i32 %909 to i64, !dbg !104 + %913 = getelementptr i32, ptr addrspace(1) %6, i64 %912, !dbg !104 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !105 + %914 = ptrtoint ptr addrspace(1) %911 to i64, !dbg !105 + %915 = ptrtoint ptr addrspace(1) %913 to i64, !dbg !105 + %916 = insertelement <1 x i64> poison, i64 %914, i64 0, !dbg !105 + store <1 x i64> %916, ptr addrspace(3) %885, align 8, !dbg !105 + %917 = insertelement <1 x i64> poison, i64 %915, i64 0, !dbg !105 + store <1 x i64> %917, ptr addrspace(3) %888, align 8, !dbg !105 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !105 + %918 = load i64, ptr addrspace(3) %900, align 8, !dbg !105 + %919 = load i64, ptr addrspace(3) %903, align 8, !dbg !105 + %920 = inttoptr i64 %918 to ptr addrspace(1), !dbg !105 + %921 = inttoptr i64 %919 to ptr addrspace(1), !dbg !105 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %920, i1 %861) #5, !dbg !105 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %921, i1 %861) #5, !dbg !105 + ret void, !dbg !106 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 34, column: 37, scope: !9) +!18 = !DILocation(line: 34, column: 30, scope: !9) +!19 = !DILocation(line: 34, column: 45, scope: !9) +!20 = !DILocation(line: 39, column: 18, scope: !9) +!21 = !DILocation(line: 0, scope: !9) +!22 = !DILocation(line: 627, column: 44, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 46, column: 71, scope: !9) +!26 = !DILocation(line: 537, column: 21, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !25) +!28 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !25) +!29 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !25) +!30 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !25) +!31 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !25) +!32 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !25) +!33 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !25) +!34 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !25) +!35 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !25) +!36 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !25) +!37 = distinct !DILexicalBlockFile(scope: !9, file: !38, discriminator: 0) +!38 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!39 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !25) +!40 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !25) +!41 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !25) +!42 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !25) +!43 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !25) +!44 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !25) +!45 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !25) +!46 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !25) +!47 = !DILocation(line: 47, column: 20, scope: !9) +!48 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !49) +!49 = !DILocation(line: 51, column: 71, scope: !9) +!50 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !49) +!51 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !49) +!52 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !49) +!53 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !49) +!54 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !49) +!55 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !49) +!56 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !49) +!57 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !49) +!58 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !49) +!59 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !49) +!60 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !49) +!61 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !49) +!62 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !49) +!63 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !49) +!64 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !49) +!65 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !49) +!66 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !49) +!67 = !DILocation(line: 54, column: 35, scope: !9) +!68 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !69) +!69 = !DILocation(line: 55, column: 26, scope: !9) +!70 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !69) +!71 = !DILocation(line: 60, column: 21, scope: !9) +!72 = !DILocation(line: 58, column: 35, scope: !9) +!73 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !74) +!74 = !DILocation(line: 59, column: 26, scope: !9) +!75 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !74) +!76 = !DILocation(line: 61, column: 21, scope: !9) +!77 = !DILocation(line: 64, column: 19, scope: !9) +!78 = !DILocation(line: 66, column: 35, scope: !9) +!79 = !DILocation(line: 68, column: 20, scope: !9) +!80 = !DILocation(line: 69, column: 20, scope: !9) +!81 = !DILocation(line: 70, column: 35, scope: !9) +!82 = !DILocation(line: 71, column: 38, scope: !9) +!83 = !DILocation(line: 71, column: 63, scope: !9) +!84 = !DILocation(line: 75, column: 19, scope: !9) +!85 = !DILocation(line: 76, column: 35, scope: !9) +!86 = !DILocation(line: 77, column: 20, scope: !9) +!87 = !DILocation(line: 78, column: 20, scope: !9) +!88 = !DILocation(line: 79, column: 35, scope: !9) +!89 = !DILocation(line: 80, column: 38, scope: !9) +!90 = !DILocation(line: 80, column: 63, scope: !9) +!91 = !DILocation(line: 81, column: 25, scope: !9) +!92 = !DILocation(line: 81, column: 37, scope: !9) +!93 = !DILocation(line: 82, column: 25, scope: !9) +!94 = !DILocation(line: 82, column: 37, scope: !9) +!95 = !DILocation(line: 83, column: 25, scope: !9) +!96 = !DILocation(line: 83, column: 47, scope: !9) +!97 = !DILocation(line: 84, column: 52, scope: !9) +!98 = !DILocation(line: 84, column: 49, scope: !9) +!99 = !DILocation(line: 84, column: 25, scope: !9) +!100 = !DILocation(line: 84, column: 85, scope: !9) +!101 = !DILocation(line: 85, column: 25, scope: !9) +!102 = !DILocation(line: 85, column: 47, scope: !9) +!103 = !DILocation(line: 86, column: 49, scope: !9) +!104 = !DILocation(line: 86, column: 25, scope: !9) +!105 = !DILocation(line: 86, column: 85, scope: !9) +!106 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1809805ffed6322ede31b178e4bbc77893effc96 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1774 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<204>; + .reg .b32 %r<600>; + .reg .b64 %rd<78>; + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:24:28 + mov.u32 %r16, %ctaid.x; + .loc 1 24 33 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:24:33 + shl.b32 %r1, %r16, 3; + .loc 1 25 44 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 56; + bfe.u32 %r17, %r2, 3, 3; + and.b32 %r4, %r2, 7; + .loc 1 25 23 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:23 + or.b32 %r5, %r17, %r1; + .loc 1 26 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:26:21 + setp.gt.s32 %p2, %r5, 31; + setp.lt.s32 %p1, %r5, 32; + .loc 1 27 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:27:38 + shl.b32 %r6, %r4, 1; + or.b32 %r7, %r6, 1; + .loc 1 34 40 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:40 + shl.b32 %r18, %r5, 4; + .loc 1 34 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:37 + or.b32 %r19, %r18, %r6; + .loc 1 34 30 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:30 + mad.wide.s32 %rd13, %r19, 8, %rd14; + .loc 1 34 45 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p1 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + .loc 1 39 18 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:39:18 + add.s64 %rd15, %rd11, -1; + setp.gt.u64 %p3, %rd15, 16382; + setp.lt.u64 %p4, %rd15, 16383; + add.s64 %rd16, %rd12, -1; + setp.lt.u64 %p5, %rd16, 16383; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r20, 1, 0, %p4; + selp.b32 %r21, 1, 0, %p5; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r22, %r2, 1; + shr.u32 %r23, %r2, 1; + bfe.u32 %r24, %r2, 1, 1; + shr.u32 %r25, %r2, 2; + bfe.u32 %r26, %r2, 2, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r27, %r22, 1; + xor.b32 %r28, %r24, 1; + xor.b32 %r29, %r26, 1; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p6, %p5, %p3; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ne.b32 %p7, %r22, 0; + xor.pred %p8, %p6, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r30, %r20, %r21; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r31, %r30, 0, %p8; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r32, %r31, %r20; + xor.b32 %r33, %r31, %r21; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r34, %r7, %r6; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r35, %r34, 0, %p8; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r36, %r35, %r6; + xor.b32 %r37, %r35, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r38, %r32, %r27; + mul.lo.s32 %r39, %r33, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r40, %r38, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r41, %r38, %r40; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r42, %r39, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r43, %r39, %r42; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r44, %r32, %r22; + mul.lo.s32 %r45, %r33, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r46, %r44, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r47, %r44, %r46; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r45, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r49, %r45, %r48; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r50, %r36, %r27; + mul.lo.s32 %r51, %r37, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r52, %r50, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r53, %r50, %r52; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r55, %r51, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r56, %r36, %r22; + mul.lo.s32 %r57, %r37, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r58, %r56, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r59, %r56, %r58; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r60, %r57, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r61, %r57, %r60; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r62, %r23, 1; + setp.ne.b32 %p9, %r62, 0; + setp.ge.s32 %p10, %r41, %r47; + setp.ge.s32 %p11, %r43, %r49; + setp.ne.b32 %p12, %r43, %r49; + setp.ne.b32 %p13, %r41, %r47; + setp.le.s32 %p14, %r55, %r61; + setp.le.s32 %p15, %r53, %r59; + or.pred %p16, %p13, %p15; + or.pred %p17, %p12, %p14; + and.pred %p18, %p11, %p17; + and.pred %p19, %p10, %p16; + xor.pred %p20, %p19, %p9; + xor.pred %p21, %p18, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r63, %r41, %r47; + xor.b32 %r64, %r43, %r49; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r65, 0, %r64, %p21; + selp.b32 %r66, 0, %r63, %p20; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r67, %r66, %r32; + xor.b32 %r68, %r65, %r33; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r69, %r53, %r59; + xor.b32 %r70, %r55, %r61; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r71, 0, %r70, %p21; + selp.b32 %r72, 0, %r69, %p20; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r73, %r72, %r36; + xor.b32 %r74, %r71, %r37; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p22, %r67, %r68; + setp.ne.b32 %p23, %r67, %r68; + setp.le.s32 %p24, %r73, %r74; + or.pred %p25, %p23, %p24; + and.pred %p26, %p22, %p25; + xor.pred %p27, %p26, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r75, %r67, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r76, 0, %r75, %p27; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r77, %r73, %r74; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r78, 0, %r77, %p27; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r79, %r78, %r73; + xor.b32 %r80, %r78, %r74; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r81, %r79, %r28; + mul.lo.s32 %r82, %r80, %r28; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r83, %r79, %r24; + mul.lo.s32 %r84, %r80, %r24; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r85, %r25, 1; + setp.ne.b32 %p28, %r85, 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r86, %r76, %r68; + xor.b32 %r87, %r76, %r67; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r88, %r87, %r28; + mul.lo.s32 %r89, %r86, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r90, %r88, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r91, %r88, %r90; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r92, %r89, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r93, %r89, %r92; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r94, %r87, %r24; + mul.lo.s32 %r95, %r86, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r96, %r94, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r97, %r94, %r96; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r95, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r99, %r95, %r98; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r81, 2, 31, -1; + shfl.sync.bfly.b32 %r101, %r82, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r102, %r82, %r101; + add.s32 %r103, %r81, %r100; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r83, 2, 31, -1; + shfl.sync.bfly.b32 %r105, %r84, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r106, %r84, %r105; + add.s32 %r107, %r83, %r104; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p29, %r93, %r99; + setp.ge.s32 %p30, %r91, %r97; + setp.ne.b32 %p31, %r91, %r97; + setp.ne.b32 %p32, %r93, %r99; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r108, %r93, %r99; + xor.b32 %r109, %r91, %r97; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.le.s32 %p33, %r103, %r107; + setp.le.s32 %p34, %r102, %r106; + or.pred %p35, %p32, %p34; + or.pred %p36, %p31, %p33; + and.pred %p37, %p30, %p36; + and.pred %p38, %p29, %p35; + xor.pred %p39, %p38, %p28; + xor.pred %p40, %p37, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r110, 0, %r109, %p40; + selp.b32 %r111, 0, %r108, %p39; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r112, %r111, %r86; + xor.b32 %r113, %r110, %r87; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r114, %r102, %r106; + xor.b32 %r115, %r103, %r107; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r116, 0, %r115, %p40; + selp.b32 %r117, 0, %r114, %p39; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r118, %r117, %r80; + xor.b32 %r119, %r116, %r79; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r120, %r113, %r27; + mul.lo.s32 %r121, %r112, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r123, %r120, %r122; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r125, %r121, %r124; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r126, %r113, %r22; + mul.lo.s32 %r127, %r112, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r126, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r129, %r126, %r128; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r130, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r131, %r127, %r130; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r132, %r119, %r27; + mul.lo.s32 %r133, %r118, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r135, %r132, %r134; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r137, %r133, %r136; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r138, %r119, %r22; + mul.lo.s32 %r139, %r118, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r138, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r141, %r138, %r140; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r142, %r139, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r143, %r139, %r142; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p41, %r123, %r129; + setp.ge.s32 %p42, %r125, %r131; + setp.ne.b32 %p43, %r125, %r131; + setp.ne.b32 %p44, %r123, %r129; + setp.le.s32 %p45, %r137, %r143; + setp.le.s32 %p46, %r135, %r141; + or.pred %p47, %p44, %p46; + or.pred %p48, %p43, %p45; + and.pred %p49, %p42, %p48; + and.pred %p50, %p41, %p47; + xor.pred %p51, %p50, %p28; + xor.pred %p52, %p49, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r144, %r123, %r129; + xor.b32 %r145, %r125, %r131; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r146, 0, %r145, %p52; + selp.b32 %r147, 0, %r144, %p51; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r148, %r147, %r113; + xor.b32 %r149, %r146, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r150, %r135, %r141; + xor.b32 %r151, %r137, %r143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r152, 0, %r151, %p52; + selp.b32 %r153, 0, %r150, %p51; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r154, %r153, %r119; + xor.b32 %r155, %r152, %r118; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p53, %r148, %r149; + setp.ne.b32 %p54, %r148, %r149; + setp.le.s32 %p55, %r154, %r155; + or.pred %p56, %p54, %p55; + and.pred %p57, %p53, %p56; + xor.pred %p58, %p57, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r156, %r148, %r149; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r157, 0, %r156, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r158, %r157, %r149; + xor.b32 %r159, %r157, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r160, %r154, %r155; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r161, 0, %r160, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r162, %r161, %r155; + xor.b32 %r163, %r161, %r154; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r164, %r159, %r29; + mul.lo.s32 %r165, %r158, %r29; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r166, %r164, 4, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r168, %r164, %r166; + add.s32 %r169, %r165, %r167; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r170, %r159, %r26; + mul.lo.s32 %r171, %r158, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r172, %r170, 4, 31, -1; + shfl.sync.bfly.b32 %r173, %r171, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r174, %r170, %r172; + add.s32 %r175, %r171, %r173; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r176, %r163, %r29; + mul.lo.s32 %r177, %r162, %r29; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r180, %r177, %r179; + add.s32 %r181, %r176, %r178; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r182, %r163, %r26; + mul.lo.s32 %r183, %r162, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r184, %r182, 4, 31, -1; + shfl.sync.bfly.b32 %r185, %r183, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r186, %r183, %r185; + add.s32 %r187, %r182, %r184; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p59, %r169, %r175; + setp.lt.s32 %p60, %r168, %r174; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p61, %r168, %r174; + setp.eq.b32 %p62, %r169, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p63, %r181, %r187; + setp.gt.s32 %p64, %r180, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p65, %p62, %p64; + and.pred %p66, %p61, %p63; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p67, %p60, %p66; + or.pred %p68, %p59, %p65; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r188, %r168, %r174; + xor.b32 %r189, %r169, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r190, %r189, 0, %p68; + selp.b32 %r191, %r188, 0, %p67; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r192, %r191, %r159; + xor.b32 %r193, %r190, %r158; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r194, %r180, %r186; + xor.b32 %r195, %r181, %r187; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r196, %r195, 0, %p67; + selp.b32 %r197, %r194, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r198, %r197, %r162; + xor.b32 %r199, %r196, %r163; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r200, %r193, %r28; + mul.lo.s32 %r201, %r192, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r202, %r201, 2, 31, -1; + shfl.sync.bfly.b32 %r203, %r200, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r204, %r201, %r202; + add.s32 %r205, %r200, %r203; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r206, %r193, %r24; + mul.lo.s32 %r207, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1; + shfl.sync.bfly.b32 %r209, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r210, %r207, %r208; + add.s32 %r211, %r206, %r209; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r212, %r199, %r28; + mul.lo.s32 %r213, %r198, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r212, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r213, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r216, %r213, %r215; + add.s32 %r217, %r212, %r214; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r218, %r199, %r24; + mul.lo.s32 %r219, %r198, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r220, %r218, 2, 31, -1; + shfl.sync.bfly.b32 %r221, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r222, %r219, %r221; + add.s32 %r223, %r218, %r220; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p69, %r205, %r211; + setp.lt.s32 %p70, %r204, %r210; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p71, %r204, %r210; + setp.eq.b32 %p72, %r205, %r211; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p73, %r217, %r223; + setp.gt.s32 %p74, %r216, %r222; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p71, %p73; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p69, %p75; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r224, %r204, %r210; + xor.b32 %r225, %r205, %r211; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r226, %r225, 0, %p78; + selp.b32 %r227, %r224, 0, %p77; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r228, %r227, %r192; + xor.b32 %r229, %r226, %r193; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r230, %r217, %r223; + xor.b32 %r231, %r216, %r222; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r232, %r231, 0, %p78; + selp.b32 %r233, %r230, 0, %p77; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r234, %r233, %r199; + xor.b32 %r235, %r232, %r198; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r236, %r229, %r27; + mul.lo.s32 %r237, %r228, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r237, 1, 31, -1; + shfl.sync.bfly.b32 %r239, %r236, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r240, %r237, %r238; + add.s32 %r241, %r236, %r239; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r242, %r229, %r22; + mul.lo.s32 %r243, %r228, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r244, %r243, 1, 31, -1; + shfl.sync.bfly.b32 %r245, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r246, %r243, %r244; + add.s32 %r247, %r242, %r245; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r248, %r235, %r27; + mul.lo.s32 %r249, %r234, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r250, %r249, 1, 31, -1; + shfl.sync.bfly.b32 %r251, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r252, %r248, %r251; + add.s32 %r253, %r249, %r250; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r254, %r235, %r22; + mul.lo.s32 %r255, %r234, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r256, %r255, 1, 31, -1; + shfl.sync.bfly.b32 %r257, %r254, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r258, %r254, %r257; + add.s32 %r259, %r255, %r256; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p79, %r241, %r247; + setp.lt.s32 %p80, %r240, %r246; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p81, %r240, %r246; + setp.eq.b32 %p82, %r241, %r247; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p83, %r253, %r259; + setp.gt.s32 %p84, %r252, %r258; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p85, %p82, %p84; + and.pred %p86, %p81, %p83; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p87, %p80, %p86; + or.pred %p88, %p79, %p85; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r260, %r240, %r246; + xor.b32 %r261, %r241, %r247; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r262, %r261, 0, %p88; + selp.b32 %r263, %r260, 0, %p87; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r264, %r263, %r228; + xor.b32 %r265, %r262, %r229; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r266, %r253, %r259; + xor.b32 %r267, %r252, %r258; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r268, %r267, 0, %p88; + selp.b32 %r269, %r266, 0, %p87; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r270, %r269, %r234; + xor.b32 %r271, %r268, %r235; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p89, %r264, %r265; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p90, %r264, %r265; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p91, %r270, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r272, %r270, %r271; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r273, %r272, 0, %p91; + selp.b32 %r274, %r273, 0, %p90; + selp.b32 %r275, %r272, %r274, %p89; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r8, %r275, %r270; + xor.b32 %r9, %r275, %r271; +$L__tmp2: + .loc 1 47 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:47:20 + setp.ne.b64 %p92, %rd11, 16384; + setp.eq.b64 %p93, %rd11, 16384; + setp.eq.b64 %p94, %rd12, 16384; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r276, 1, 0, %p93; + selp.b32 %r277, 1, 0, %p94; +$L__tmp3: + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p95, %p94, %p92; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.pred %p96, %p95, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r278, %r276, %r277; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r279, %r278, 0, %p96; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r280, %r279, %r276; + xor.b32 %r281, %r279, %r277; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r282, %r34, 0, %p96; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r283, %r282, %r6; + xor.b32 %r284, %r282, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r285, %r280, %r27; + mul.lo.s32 %r286, %r281, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r287, %r285, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r288, %r287, %r285; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r289, %r286, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r290, %r289, %r286; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r291, %r280, %r22; + mul.lo.s32 %r292, %r281, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r293, %r291, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r294, %r293, %r291; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r295, %r292, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r296, %r295, %r292; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r297, %r283, %r27; + mul.lo.s32 %r298, %r284, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r299, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r300, %r299, %r297; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r301, %r298, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r302, %r301, %r298; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r303, %r283, %r22; + mul.lo.s32 %r304, %r284, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r305, %r303, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r306, %r305, %r303; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r307, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r308, %r307, %r304; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p97, %r288, %r294; + setp.ge.s32 %p98, %r290, %r296; + setp.ne.b32 %p99, %r290, %r296; + setp.ne.b32 %p100, %r288, %r294; + setp.le.s32 %p101, %r302, %r308; + setp.le.s32 %p102, %r300, %r306; + or.pred %p103, %p100, %p102; + or.pred %p104, %p99, %p101; + and.pred %p105, %p98, %p104; + and.pred %p106, %p97, %p103; + xor.pred %p107, %p106, %p9; + xor.pred %p108, %p105, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r309, %r296, %r290; + xor.b32 %r310, %r306, %r300; + xor.b32 %r311, %r294, %r288; + xor.b32 %r312, %r308, %r302; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r313, 0, %r309, %p108; + selp.b32 %r314, 0, %r310, %p107; + selp.b32 %r315, 0, %r311, %p107; + selp.b32 %r316, 0, %r312, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r317, %r314, %r283; + xor.b32 %r318, %r313, %r281; + xor.b32 %r319, %r316, %r284; + xor.b32 %r320, %r315, %r280; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p109, %r320, %r318; + setp.ne.b32 %p110, %r318, %r320; + setp.le.s32 %p111, %r317, %r319; + or.pred %p112, %p110, %p111; + and.pred %p113, %p109, %p112; + xor.pred %p114, %p113, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r321, %r318, %r320; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r322, 0, %r321, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r323, %r322, %r320; + xor.b32 %r324, %r322, %r318; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r325, %r319, %r317; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r326, 0, %r325, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r327, %r326, %r317; + xor.b32 %r328, %r326, %r319; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r329, %r323, %r28; + mul.lo.s32 %r330, %r324, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r331, %r329, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r332, %r329, %r331; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r330, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r334, %r330, %r333; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r335, %r323, %r24; + mul.lo.s32 %r336, %r324, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r337, %r335, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r338, %r335, %r337; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r336, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r340, %r336, %r339; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r341, %r327, %r28; + mul.lo.s32 %r342, %r328, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r343, %r341, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r344, %r341, %r343; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r345, %r342, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r346, %r342, %r345; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r347, %r327, %r24; + mul.lo.s32 %r348, %r328, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r349, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r350, %r347, %r349; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r348, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r352, %r348, %r351; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p115, %r334, %r340; + setp.ge.s32 %p116, %r332, %r338; + setp.ne.b32 %p117, %r332, %r338; + setp.ne.b32 %p118, %r334, %r340; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r353, %r332, %r338; + xor.b32 %r354, %r334, %r340; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.le.s32 %p119, %r344, %r350; + setp.le.s32 %p120, %r346, %r352; + or.pred %p121, %p118, %p120; + or.pred %p122, %p117, %p119; + and.pred %p123, %p116, %p122; + and.pred %p124, %p115, %p121; + xor.pred %p125, %p124, %p28; + xor.pred %p126, %p123, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r355, 0, %r353, %p126; + selp.b32 %r356, 0, %r354, %p125; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r357, %r355, %r323; + xor.b32 %r358, %r356, %r324; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r359, %r346, %r352; + xor.b32 %r360, %r344, %r350; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r361, 0, %r360, %p126; + selp.b32 %r362, 0, %r359, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r363, %r362, %r328; + xor.b32 %r364, %r361, %r327; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r365, %r357, %r27; + mul.lo.s32 %r366, %r358, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r367, %r365, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r368, %r365, %r367; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r366, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r370, %r366, %r369; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r371, %r357, %r22; + mul.lo.s32 %r372, %r358, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r373, %r371, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r374, %r371, %r373; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r372, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r376, %r372, %r375; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r377, %r364, %r27; + mul.lo.s32 %r378, %r363, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r379, %r377, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r380, %r377, %r379; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r381, %r378, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r382, %r378, %r381; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r383, %r364, %r22; + mul.lo.s32 %r384, %r363, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r385, %r383, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r386, %r383, %r385; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r387, %r384, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r388, %r384, %r387; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p127, %r368, %r374; + setp.ge.s32 %p128, %r370, %r376; + setp.ne.b32 %p129, %r370, %r376; + setp.ne.b32 %p130, %r368, %r374; + setp.le.s32 %p131, %r382, %r388; + setp.le.s32 %p132, %r380, %r386; + or.pred %p133, %p130, %p132; + or.pred %p134, %p129, %p131; + and.pred %p135, %p128, %p134; + and.pred %p136, %p127, %p133; + xor.pred %p137, %p136, %p28; + xor.pred %p138, %p135, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r389, %r368, %r374; + xor.b32 %r390, %r370, %r376; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r391, 0, %r390, %p138; + selp.b32 %r392, 0, %r389, %p137; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r393, %r392, %r357; + xor.b32 %r394, %r391, %r358; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r395, %r380, %r386; + xor.b32 %r396, %r382, %r388; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r397, 0, %r396, %p138; + selp.b32 %r398, 0, %r395, %p137; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r399, %r398, %r364; + xor.b32 %r400, %r397, %r363; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p139, %r393, %r394; + setp.ne.b32 %p140, %r393, %r394; + setp.le.s32 %p141, %r399, %r400; + or.pred %p142, %p140, %p141; + and.pred %p143, %p139, %p142; + xor.pred %p144, %p143, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r401, %r393, %r394; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r402, 0, %r401, %p144; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r403, %r402, %r393; + xor.b32 %r404, %r402, %r394; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r405, %r399, %r400; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r406, 0, %r405, %p144; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r407, %r406, %r399; + xor.b32 %r408, %r406, %r400; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r409, %r403, %r29; + mul.lo.s32 %r410, %r404, %r29; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r411, %r409, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r412, %r409, %r411; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r413, %r410, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r414, %r410, %r413; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r415, %r403, %r26; + mul.lo.s32 %r416, %r404, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r417, %r415, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r418, %r415, %r417; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r419, %r416, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r420, %r416, %r419; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r421, %r407, %r29; + mul.lo.s32 %r422, %r408, %r29; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r423, %r421, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r424, %r421, %r423; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r425, %r422, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r426, %r422, %r425; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r427, %r407, %r26; + mul.lo.s32 %r428, %r408, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r429, %r427, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r430, %r427, %r429; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r431, %r428, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r432, %r428, %r431; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p145, %r412, %r418; + setp.lt.s32 %p146, %r414, %r420; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p147, %r414, %r420; + setp.eq.b32 %p148, %r412, %r418; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p149, %r426, %r432; + setp.gt.s32 %p150, %r424, %r430; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p151, %p148, %p150; + and.pred %p152, %p147, %p149; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p153, %p146, %p152; + or.pred %p154, %p145, %p151; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r433, %r414, %r420; + xor.b32 %r434, %r412, %r418; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r435, %r434, 0, %p154; + selp.b32 %r436, %r433, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r437, %r436, %r404; + xor.b32 %r438, %r435, %r403; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r439, %r424, %r430; + xor.b32 %r440, %r426, %r432; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r441, %r439, 0, %p154; + selp.b32 %r442, %r440, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r443, %r441, %r407; + xor.b32 %r444, %r442, %r408; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r445, %r438, %r28; + mul.lo.s32 %r446, %r437, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r447, %r445, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r448, %r445, %r447; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r449, %r446, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r450, %r446, %r449; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r451, %r438, %r24; + mul.lo.s32 %r452, %r437, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r453, %r451, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r454, %r451, %r453; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r455, %r452, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r456, %r452, %r455; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r457, %r443, %r28; + mul.lo.s32 %r458, %r444, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r459, %r457, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r460, %r457, %r459; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r461, %r458, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r462, %r458, %r461; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r463, %r443, %r24; + mul.lo.s32 %r464, %r444, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r465, %r463, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r466, %r463, %r465; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r467, %r464, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r468, %r464, %r467; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p155, %r448, %r454; + setp.lt.s32 %p156, %r450, %r456; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p157, %r450, %r456; + setp.eq.b32 %p158, %r448, %r454; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p159, %r462, %r468; + setp.gt.s32 %p160, %r460, %r466; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p161, %p158, %p160; + and.pred %p162, %p157, %p159; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p163, %p156, %p162; + or.pred %p164, %p155, %p161; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r469, %r450, %r456; + xor.b32 %r470, %r448, %r454; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r471, %r470, 0, %p164; + selp.b32 %r472, %r469, 0, %p163; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r473, %r472, %r437; + xor.b32 %r474, %r471, %r438; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r475, %r460, %r466; + xor.b32 %r476, %r462, %r468; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r477, %r475, 0, %p164; + selp.b32 %r478, %r476, 0, %p163; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r479, %r477, %r443; + xor.b32 %r480, %r478, %r444; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r481, %r474, %r27; + mul.lo.s32 %r482, %r473, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r483, %r481, 1, 31, -1; + shfl.sync.bfly.b32 %r485, %r482, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r487, %r474, %r22; + mul.lo.s32 %r488, %r473, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r489, %r487, 1, 31, -1; + shfl.sync.bfly.b32 %r491, %r488, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r493, %r479, %r27; + mul.lo.s32 %r494, %r480, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r495, %r493, 1, 31, -1; + shfl.sync.bfly.b32 %r497, %r494, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r499, %r479, %r22; + mul.lo.s32 %r500, %r480, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r501, %r499, 1, 31, -1; + shfl.sync.bfly.b32 %r503, %r500, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:54:35 + and.pred %p178, %p1, %p4; + selp.b64 %rd17, 1, 0, %p178; + and.pred %p179, %p1, %p5; + selp.b64 %rd18, 1, 0, %p179; +$L__tmp5: + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd19, %rd17, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + cvt.u32.u64 %r521, %rd19; + shfl.sync.bfly.b32 %r522, %r521, 4, 31, -1; + mov.b32 %r523, 0; + shfl.sync.bfly.b32 %r524, %r523, 4, 31, -1; + cvt.u64.u32 %rd20, %r522; + cvt.u64.u32 %rd21, %r524; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd24, %rd19, %rd23; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r525}, %rd24; + cvt.u32.u64 %r526, %rd24; + shfl.sync.bfly.b32 %r527, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r528, %r525, 2, 31, -1; + cvt.u64.u32 %rd25, %r527; + cvt.u64.u32 %rd26, %r528; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r529}, %rd29; + cvt.u32.u64 %r530, %rd29; + shfl.sync.bfly.b32 %r531, %r530, 1, 31, -1; + shfl.sync.bfly.b32 %r532, %r529, 1, 31, -1; + cvt.u64.u32 %rd30, %r531; + cvt.u64.u32 %rd31, %r532; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; +$L__tmp6: + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + mov.b32 %r533, global_smem; + add.s32 %r534, %r533, %r3; + st.shared.b64 [%r534], %rd34; + bar.sync 0; + shl.b32 %r535, %r4, 3; + add.s32 %r536, %r533, %r535; + ld.shared.b64 %rd2, [%r536]; + .loc 1 58 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:58:35 + and.pred %p180, %p1, %p93; + selp.b64 %rd35, 1, 0, %p180; + and.pred %p181, %p1, %p94; + selp.b64 %rd36, 1, 0, %p181; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd37, %rd35, %rd36; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + cvt.u32.u64 %r537, %rd37; + shfl.sync.bfly.b32 %r538, %r537, 4, 31, -1; + shfl.sync.bfly.b32 %r539, %r523, 4, 31, -1; + cvt.u64.u32 %rd38, %r538; + cvt.u64.u32 %rd39, %r539; + shl.b64 %rd40, %rd39, 32; + or.b64 %rd41, %rd38, %rd40; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd42, %rd37, %rd41; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r540}, %rd42; + cvt.u32.u64 %r541, %rd42; + shfl.sync.bfly.b32 %r542, %r541, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r540, 2, 31, -1; + cvt.u64.u32 %rd43, %r542; + cvt.u64.u32 %rd44, %r543; + shl.b64 %rd45, %rd44, 32; + or.b64 %rd46, %rd43, %rd45; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd47, %rd42, %rd46; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r544}, %rd47; + cvt.u32.u64 %r545, %rd47; + shfl.sync.bfly.b32 %r546, %r545, 1, 31, -1; + shfl.sync.bfly.b32 %r547, %r544, 1, 31, -1; + cvt.u64.u32 %rd48, %r546; + cvt.u64.u32 %rd49, %r547; + shl.b64 %rd50, %rd49, 32; + or.b64 %rd51, %rd48, %rd50; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd3, %rd47, %rd51; +$L__tmp8: + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + bar.sync 0; + st.shared.b64 [%r534], %rd3; + bar.sync 0; + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r548, %rd34; + .loc 1 64 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:64:19 + setp.lt.s32 %p182, %r6, %r548; + setp.lt.s32 %p183, %r7, %r548; + .loc 1 66 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:66:35 + selp.b32 %r549, %r8, 16, %p182; + selp.b32 %r550, %r9, 16, %p183; + .loc 1 68 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:68:20 + add.s32 %r551, %r549, 17; + add.s32 %r552, %r550, 17; + .loc 1 69 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:69:20 + setp.lt.s32 %p184, %r549, 0; + setp.lt.s32 %p185, %r550, 0; + .loc 1 70 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:70:35 + selp.b32 %r12, %r551, %r549, %p184; + selp.b32 %r13, %r552, %r550, %p185; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + max.u32 %r553, %r12, %r13; + setp.lt.u32 %p186, %r553, 17; + or.pred %p187, %p2, %p186; + @%p187 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + add.s32 %r484, %r481, %r483; + add.s32 %r486, %r482, %r485; + add.s32 %r490, %r487, %r489; + add.s32 %r492, %r488, %r491; + add.s32 %r496, %r493, %r495; + add.s32 %r498, %r494, %r497; + add.s32 %r502, %r499, %r501; + add.s32 %r504, %r500, %r503; + setp.lt.s32 %p165, %r486, %r492; + setp.lt.s32 %p166, %r484, %r490; + setp.eq.b32 %p167, %r484, %r490; + setp.eq.b32 %p168, %r486, %r492; + setp.gt.s32 %p169, %r496, %r502; + setp.gt.s32 %p170, %r498, %r504; + and.pred %p171, %p168, %p170; + and.pred %p172, %p167, %p169; + or.pred %p173, %p166, %p172; + or.pred %p174, %p165, %p171; + xor.b32 %r505, %r484, %r490; + xor.b32 %r506, %r486, %r492; + selp.b32 %r507, %r506, 0, %p174; + selp.b32 %r508, %r505, 0, %p173; + xor.b32 %r509, %r508, %r474; + xor.b32 %r510, %r507, %r473; + xor.b32 %r511, %r496, %r502; + xor.b32 %r512, %r498, %r504; + selp.b32 %r513, %r511, 0, %p173; + selp.b32 %r514, %r512, 0, %p174; + xor.b32 %r515, %r513, %r479; + xor.b32 %r516, %r514, %r480; + setp.lt.s32 %p175, %r509, %r510; + setp.eq.b32 %p176, %r509, %r510; + setp.gt.s32 %p177, %r515, %r516; + xor.b32 %r517, %r515, %r516; + selp.b32 %r518, %r517, 0, %p177; + selp.b32 %r519, %r518, 0, %p176; + selp.b32 %r520, %r517, %r519, %p175; + xor.b32 %r10, %r520, %r515; + xor.b32 %r11, %r520, %r516; + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + ld.shared.b64 %rd4, [%r536]; + cvt.u32.u64 %r554, %rd3; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + bar.sync 0; + .loc 1 75 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:75:19 + setp.lt.s32 %p189, %r6, %r554; + setp.lt.s32 %p190, %r7, %r554; + .loc 1 76 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:76:35 + selp.b32 %r555, %r10, 16, %p189; + selp.b32 %r556, %r11, 16, %p190; + .loc 1 77 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:77:20 + add.s32 %r557, %r555, 17; + add.s32 %r558, %r556, 17; + .loc 1 78 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:78:20 + setp.lt.s32 %p191, %r555, 0; + setp.lt.s32 %p192, %r556, 0; + .loc 1 79 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:79:35 + selp.b32 %r14, %r557, %r555, %p191; + selp.b32 %r15, %r558, %r556, %p192; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + max.u32 %r559, %r14, %r15; + setp.lt.u32 %p193, %r559, 17; + or.pred %p194, %p2, %p193; + @%p194 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r19; + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + cvt.u32.u64 %r561, %rd4; + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r560, %rd2; + .loc 1 25 23 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:23 + or.b32 %r570, %r1, %r4; + .loc 1 26 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:26:21 + setp.lt.s32 %p198, %r570, 32; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + bar.sync 0; + .loc 1 81 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:25 + mul.wide.s32 %rd60, %r570, 4; + add.s64 %rd52, %rd5, %rd60; + .loc 1 81 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:37 + setp.eq.b32 %p203, %r3, 0; + and.pred %p195, %p203, %p198; + // begin inline asm + @%p195 st.global.b32 [ %rd52 + 0 ], { %r560 }; + // end inline asm + .loc 1 82 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:25 + add.s64 %rd53, %rd6, %rd60; + .loc 1 82 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:37 + // begin inline asm + @%p195 st.global.b32 [ %rd53 + 0 ], { %r561 }; + // end inline asm + .loc 1 83 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd54, %rd7, %rd61; + .loc 1 83 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd54 + 0 ], { %r8, %r9 }; + // end inline asm + .loc 1 84 52 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:52 + mul.lo.s32 %r571, %r5, 17; + .loc 1 84 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:49 + add.s32 %r572, %r12, %r571; + add.s32 %r573, %r13, %r571; + .loc 1 84 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:25 + mad.wide.s32 %rd62, %r572, 4, %rd8; + mad.wide.s32 %rd63, %r573, 4, %rd8; + .loc 1 84 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:85 + bar.sync 0; + shl.b32 %r574, %r2, 5; + and.b32 %r575, %r574, 96; + and.b32 %r576, %r2, 48; + shl.b32 %r577, %r576, 3; + shl.b32 %r578, %r2, 1; + and.b32 %r579, %r578, 120; + or.b32 %r580, %r575, %r577; + xor.b32 %r581, %r580, %r579; + add.s32 %r583, %r533, %r581; + st.shared.b64 [%r583], %rd62; + xor.b32 %r584, %r581, 8; + add.s32 %r585, %r533, %r584; + st.shared.b64 [%r585+512], %rd63; + bar.sync 0; + shl.b32 %r586, %r2, 6; + and.b32 %r587, %r586, 384; + shl.b32 %r588, %r4, 4; + shl.b32 %r589, %r576, 1; + bfe.s32 %r590, %r2, 3, 1; + and.b32 %r591, %r590, 520; + or.b32 %r592, %r587, %r588; + xor.b32 %r593, %r592, %r589; + or.b32 %r594, %r593, %r591; + add.s32 %r595, %r533, %r594; + ld.shared.b64 %rd55, [%r595]; + xor.b32 %r596, %r594, 8; + add.s32 %r597, %r533, %r596; + ld.shared.b64 %rd56, [%r597]; + mov.b32 %r564, 1; + // begin inline asm + @%p198 st.global.b32 [ %rd55 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd56 + 0 ], { %r564 }; + // end inline asm + .loc 1 85 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:25 + add.s64 %rd57, %rd9, %rd61; + .loc 1 85 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd57 + 0 ], { %r10, %r11 }; + // end inline asm + .loc 1 86 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:49 + add.s32 %r598, %r14, %r571; + add.s32 %r599, %r15, %r571; + .loc 1 86 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:25 + mad.wide.s32 %rd64, %r598, 4, %rd10; + mad.wide.s32 %rd65, %r599, 4, %rd10; + .loc 1 86 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:85 + bar.sync 0; + st.shared.b64 [%r583], %rd64; + st.shared.b64 [%r585+512], %rd65; + bar.sync 0; + ld.shared.b64 %rd58, [%r595]; + ld.shared.b64 %rd59, [%r597]; + // begin inline asm + @%p198 st.global.b32 [ %rd58 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd59 + 0 ], { %r564 }; + // end inline asm + .loc 1 86 4 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd72, assertFunc_0; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param3], %rd73; + mov.b64 %rd74, assertFile_0; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param1], %rd75; + mov.b64 %rd76, assertMessage_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param0], %rd77; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd66, assertFunc_1; + cvta.global.u64 %rd67, %rd66; + st.param.b64 [param3], %rd67; + mov.b64 %rd68, assertFile_1; + cvta.global.u64 %rd69, %rd68; + st.param.b64 [param1], %rd69; + mov.b64 %rd70, assertMessage_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param0], %rd71; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 122 +.b8 50 +.b8 106 +.b8 98 +.b8 106 +.b8 117 +.b8 98 +.b8 51 +.b8 97 +.b8 117 +.b8 112 +.b8 103 +.b8 110 +.b8 101 +.b8 99 +.b8 104 +.b8 111 +.b8 108 +.b8 54 +.b8 53 +.b8 118 +.b8 107 +.b8 118 +.b8 105 +.b8 53 +.b8 114 +.b8 117 +.b8 119 +.b8 112 +.b8 121 +.b8 108 +.b8 122 +.b8 111 +.b8 115 +.b8 100 +.b8 98 +.b8 113 +.b8 118 +.b8 115 +.b8 99 +.b8 100 +.b8 121 +.b8 120 +.b8 109 +.b8 114 +.b8 101 +.b8 98 +.b8 51 +.b8 106 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 55 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..d09ec72b33501b29d1222a67fbec8cdc178da3a7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 8 : i32 loc(#loc205) + %xoffset_3 = arith.constant 8 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc208) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<8x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<8x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<8x16xf32> to tensor<8x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<8x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<8x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<8x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<8x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<8x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<8x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<8x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<8x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<8x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<8x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<8x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<8x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<8x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<8x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<8x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<8x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<8x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<8x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<8x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<8x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<8x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<8x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<8x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<8x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<8x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<8x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<8x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<8x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<8x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<8x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<8x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<8x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<8x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<8x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<8x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc91)), %idxs: tensor<8x16xi16> loc("idxs"(#loc91))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc94) + %5 = ub.poison : tensor<8x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi16> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc102) + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi16> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc179))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc181) + tt.return %4 : tensor<8x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc183) + tt.return %5 : tensor<8x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc188))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc189) + tt.return %0 : tensor<8x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc191) + tt.return %1 : tensor<8x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc185) + tt.return %cst : tensor<8x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc187) + tt.return %0 : tensor<8x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc188))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc189) + tt.return %0 : tensor<8x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc191) + tt.return %1 : tensor<8x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc185) + tt.return %cst : tensor<8x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc187) + tt.return %0 : tensor<8x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc166))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc167) + tt.return %0 : tensor<32x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc169) + tt.return %1 : tensor<32x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc166))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc167) + tt.return %0 : tensor<16x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc169) + tt.return %1 : tensor<16x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + %5 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc166))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc167) + tt.return %0 : tensor<8x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc169) + tt.return %1 : tensor<8x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc166))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc167) + tt.return %0 : tensor<8xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc169) + tt.return %1 : tensor<8xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1b3d0ad8f8e26ecdf2af24b78d56dcea589ea257 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<8x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<8x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<8x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<8x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<8x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<8x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<8x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<8x1xi1, #blocked5> -> tensor<8x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<8x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<8x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<8x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16, #blocked> -> tensor<64x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<64x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<64x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<8x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<8x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<8x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<8x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<8x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<8x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<8x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<8x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<8x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<8x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<8x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<8x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..123d4c8612ebd1b42837dfa94d8632d3d68f0095 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<8x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc129) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<8x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<8x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<8x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<8x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<8x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<8x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<8x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<64x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<64x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<8x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<8x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<8x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<8x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<8x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<8x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<8x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<8x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<8x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<8x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<8x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<8x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<8x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<8x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<8x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<8x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<8x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<8x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<8x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<8x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<8x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<8x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<8x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<8x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<8x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<8x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<8x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<8x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<8x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<8x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<8x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<8x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<8x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<8x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<8x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<8x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<8x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<8x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<8x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<8x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<8x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<8x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<8x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<8x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<8x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<8x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<8x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<8x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<8x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<8x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<8x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<8x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<8x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<8x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<8x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<8x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<8x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<8x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<8x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<8x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<8x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<8x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<8x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<8x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<8x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<8x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<8x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<8x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<8x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<8x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<8x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<8x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<8x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<8x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<8x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<8x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<8x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<8x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<8x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<8x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<8x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<8x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<8x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<8x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<8x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<8x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<8x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<8x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<8x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<8x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<8x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<8x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<8x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<8x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<8x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<8x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<8x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<8x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<8x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<8x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<8x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<8x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<8x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<8x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<8x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/__grp__triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/__grp__triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d0a8a8e8e5621f6a8a2882861e0f5dc3dc898750 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/__grp__triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.source", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttir", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttgir", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.llir", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ptx", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.cubin", "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6e8f9b8976855a73514e2c92be7d82ec26748129 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json new file mode 100644 index 0000000000000000000000000000000000000000..893bb237c2497cdac13502db1078f047b1470bc0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.json @@ -0,0 +1 @@ +{"hash": "ec0409dee971ea6f361feab9e50436845d14fe0c95c533fcd5da5d2041e210f9", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.llir new file mode 100644 index 0000000000000000000000000000000000000000..3eecda1f8a7efe1816e0aadda180d25d4c242d8b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.llir @@ -0,0 +1,91 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !7 + %8 = and i32 %7, 1, !dbg !7 + %9 = zext nneg i32 %8 to i64, !dbg !8 + %10 = getelementptr i64, ptr addrspace(1) %0, i64 %9, !dbg !8 + %11 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l"(ptr addrspace(1) %10) #4, !dbg !9 + %12 = getelementptr i64, ptr addrspace(1) %1, i64 %9, !dbg !10 + %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l"(ptr addrspace(1) %12) #4, !dbg !11 + %extelt.offset = lshr i64 %11, 32, !dbg !12 + %14 = trunc nuw i64 %extelt.offset to i32, !dbg !12 + %15 = trunc i64 %11 to i32, !dbg !12 + %16 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %15, i32 1, i32 31), !dbg !12 + %17 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %14, i32 1, i32 31), !dbg !12 + %18 = insertelement <2 x i32> poison, i32 %16, i64 0, !dbg !12 + %19 = insertelement <2 x i32> %18, i32 %17, i64 1, !dbg !12 + %20 = bitcast <2 x i32> %19 to i64, !dbg !12 + %21 = add i64 %11, %20, !dbg !16 + %extelt.offset1 = lshr i64 %13, 32, !dbg !17 + %22 = trunc nuw i64 %extelt.offset1 to i32, !dbg !17 + %23 = trunc i64 %13 to i32, !dbg !17 + %24 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %23, i32 1, i32 31), !dbg !17 + %25 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %22, i32 1, i32 31), !dbg !17 + %26 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !17 + %27 = insertelement <2 x i32> %26, i32 %25, i64 1, !dbg !17 + %28 = bitcast <2 x i32> %27 to i64, !dbg !17 + %29 = add i64 %13, %28, !dbg !19 + %30 = sitofp i64 %21 to float, !dbg !20 + %31 = sitofp i64 %29 to float, !dbg !21 + %32 = icmp sgt i64 %29, 0, !dbg !22 + %33 = select i1 %32, float %31, float 0x3EB0C6F7A0000000, !dbg !26 + %34 = tail call float @llvm.nvvm.div.full(float %30, float %33), !dbg !27 + %35 = and i32 %7, 63, !dbg !28 + %36 = icmp eq i32 %35, 0, !dbg !28 + %37 = bitcast float %34 to i32, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %37, ptr addrspace(1) %2, i1 %36) #4, !dbg !28 + ret void, !dbg !29 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4", linkageName: "triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 27, column: 38, scope: !4) +!8 = !DILocation(line: 33, column: 30, scope: !4) +!9 = !DILocation(line: 33, column: 37, scope: !4) +!10 = !DILocation(line: 34, column: 30, scope: !4) +!11 = !DILocation(line: 34, column: 37, scope: !4) +!12 = !DILocation(line: 291, column: 36, scope: !13, inlinedAt: !15) +!13 = distinct !DILexicalBlockFile(scope: !4, file: !14, discriminator: 0) +!14 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!15 = !DILocation(line: 36, column: 24, scope: !4) +!16 = !DILocation(line: 261, column: 15, scope: !13, inlinedAt: !15) +!17 = !DILocation(line: 291, column: 36, scope: !13, inlinedAt: !18) +!18 = !DILocation(line: 38, column: 24, scope: !4) +!19 = !DILocation(line: 261, column: 15, scope: !13, inlinedAt: !18) +!20 = !DILocation(line: 39, column: 19, scope: !4) +!21 = !DILocation(line: 40, column: 19, scope: !4) +!22 = !DILocation(line: 110, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 42, column: 41, scope: !4) +!26 = !DILocation(line: 113, column: 29, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 43, column: 20, scope: !4) +!28 = !DILocation(line: 44, column: 68, scope: !4) +!29 = !DILocation(line: 44, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ptx new file mode 100644 index 0000000000000000000000000000000000000000..49396b0870afb072f3a3277b018f8d88619059ab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ptx @@ -0,0 +1,373 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4 // -- Begin function triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4 + // @triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4 +.visible .entry triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4( + .param .u64 .ptr .global .align 1 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_2, + .param .u32 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_5 +) +.reqntid 64 +{ + .reg .pred %p<3>; + .reg .b32 %r<16>; + .reg .b64 %rd<19>; + .loc 1 18 0 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:18:0 +$L__func_begin0: + .loc 1 18 0 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_0]; + ld.param.b64 %rd7, [triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_1]; +$L__tmp0: + .loc 1 27 38 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:27:38 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 1; + ld.param.b64 %rd5, [triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4_param_2]; + .loc 1 33 30 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:33:30 + mul.wide.u32 %rd8, %r3, 8; + add.s64 %rd2, %rd6, %rd8; + .loc 1 33 37 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:33:37 + // begin inline asm + mov.u64 %rd1, 0x0; + ld.global.b64 { %rd1 }, [ %rd2 + 0 ]; + // end inline asm + .loc 1 34 30 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:34:30 + add.s64 %rd4, %rd7, %rd8; + .loc 1 34 37 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:34:37 + // begin inline asm + mov.u64 %rd3, 0x0; + ld.global.b64 { %rd3 }, [ %rd4 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:36:24 ] + mov.b64 {_, %r4}, %rd1; + cvt.u32.u64 %r5, %rd1; + shfl.sync.bfly.b32 %r6, %r5, 1, 31, -1; + shfl.sync.bfly.b32 %r7, %r4, 1, 31, -1; + cvt.u64.u32 %rd9, %r6; + cvt.u64.u32 %rd10, %r7; + shl.b64 %rd11, %rd10, 32; + or.b64 %rd12, %rd9, %rd11; + .loc 2 261 15 // standard.py:261:15 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:36:24 ] + add.s64 %rd13, %rd1, %rd12; +$L__tmp2: + .loc 2 291 36 // standard.py:291:36 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:38:24 ] + mov.b64 {_, %r8}, %rd3; + cvt.u32.u64 %r9, %rd3; + shfl.sync.bfly.b32 %r10, %r9, 1, 31, -1; + shfl.sync.bfly.b32 %r11, %r8, 1, 31, -1; + cvt.u64.u32 %rd14, %r10; + cvt.u64.u32 %rd15, %r11; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 2 261 15 // standard.py:261:15 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:38:24 ] + add.s64 %rd18, %rd3, %rd17; +$L__tmp3: + .loc 1 39 19 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:39:19 + cvt.rn.f32.s64 %r12, %rd13; + .loc 1 40 19 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:40:19 + cvt.rn.f32.s64 %r13, %rd18; +$L__tmp4: + .loc 3 110 15 // triton_helpers.py:110:15 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:42:41 ] + setp.gt.s64 %p2, %rd18, 0; + .loc 3 113 29 // triton_helpers.py:113:29 @[ czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:42:41 ] + selp.f32 %r14, %r13, 0f358637BD, %p2; +$L__tmp5: + .loc 1 43 20 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:43:20 + div.full.f32 %r1, %r12, %r14; + .loc 1 44 68 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:44:68 + and.b32 %r15, %r2, 63; + setp.eq.b32 %p1, %r15, 0; + // begin inline asm + @%p1 st.global.b32 [ %rd5 + 0 ], { %r1 }; + // end inline asm + .loc 1 44 4 // czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py:44:4 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 284 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x115 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 122 +.b8 102 +.b8 106 +.b8 112 +.b8 99 +.b8 50 +.b8 98 +.b8 109 +.b8 97 +.b8 106 +.b8 114 +.b8 54 +.b8 51 +.b8 119 +.b8 53 +.b8 112 +.b8 53 +.b8 53 +.b8 120 +.b8 108 +.b8 100 +.b8 51 +.b8 122 +.b8 108 +.b8 110 +.b8 105 +.b8 98 +.b8 50 +.b8 99 +.b8 120 +.b8 100 +.b8 122 +.b8 121 +.b8 120 +.b8 51 +.b8 110 +.b8 50 +.b8 117 +.b8 97 +.b8 110 +.b8 54 +.b8 114 +.b8 51 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 107 +.b8 118 +.b8 53 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 122 +.b8 102 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x36 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 108 +.b8 97 +.b8 109 +.b8 112 +.b8 95 +.b8 109 +.b8 105 +.b8 110 +.b8 95 +.b8 100 +.b8 105 +.b8 118 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 52 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc1:0x5e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 36 // DW_AT_call_line +.b8 24 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xee:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 38 // DW_AT_call_line +.b8 24 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x106:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.source b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.source new file mode 100644 index 0000000000000000000000000000000000000000..5f4d14d9c242d5c497aa34004700353edcbd9062 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.source @@ -0,0 +1,206 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":18:0) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc32 = loc(unknown) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("out_ptr2"(#loc)) +#loc64 = loc("r0_numel"(#loc)) +#loc90 = loc("input"(#loc30)) +#loc91 = loc("a"(#loc35)) +#loc92 = loc("b"(#loc35)) +#loc93 = loc("a"(#loc39)) +#loc94 = loc("b"(#loc39)) +#loc98 = loc("x"(#loc48)) +#loc99 = loc("x"(#loc52)) +module { + tt.func public @triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel = arith.constant 1 : i32 loc(#loc65) + %r0_numel_0 = arith.constant 2 : i32 loc(#loc66) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xoffset_1 = arith.constant 1 : i32 loc(#loc68) + %xoffset_2 = arith.constant 1 : i32 loc(#loc68) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc68) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc69) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc70) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc71) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc71) + %xmask = arith.constant true loc(#loc72) + %xmask_7 = arith.constant dense : tensor<1x2xi1> loc(#loc72) + %r0_index = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc73) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc74) + %r0_offset = arith.constant 0 : i32 loc(#loc75) + %r0_mask = arith.constant true loc(#loc76) + %r0_mask_9 = arith.constant dense : tensor<1x2xi1> loc(#loc76) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2x!tt.ptr> loc(#loc77) + %tmp0_10 = tt.addptr %tmp0, %r0_index_8 : tensor<1x2x!tt.ptr>, tensor<1x2xi32> loc(#loc77) + %tmp0_11 = tt.load %tmp0_10 : tensor<1x2x!tt.ptr> loc(#loc78) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2x!tt.ptr> loc(#loc79) + %tmp4_12 = tt.addptr %tmp4, %r0_index_8 : tensor<1x2x!tt.ptr>, tensor<1x2xi32> loc(#loc79) + %tmp4_13 = tt.load %tmp4_12 : tensor<1x2x!tt.ptr> loc(#loc80) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp0_11) : (tensor<1x2xi64>) -> tensor<1xi64> loc(#loc81) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc82) + %tmp7 = tt.call @"triton.language.standard.sum__i64S1_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp4_13) : (tensor<1x2xi64>) -> tensor<1xi64> loc(#loc83) + %tmp7_15 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc84) + %tmp8 = arith.sitofp %tmp3_14 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc85) + %tmp9 = arith.sitofp %tmp7_15 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc86) + %tmp10 = arith.constant 9.99999997E-7 : f32 loc(#loc87) + %tmp11 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_1S_fp32__(%tmp9, %tmp10) : (tensor<1x1xf32>, f32) -> tensor<1x1xf32> loc(#loc88) + %tmp12 = arith.divf %tmp8, %tmp11 : tensor<1x1xf32> loc(#loc89) + %c0_i32 = arith.constant 0 : i32 loc(#loc26) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc26) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %1 = tt.addptr %0, %cst : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %1, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2xi64> loc("input"(#loc30))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc31) + tt.reduce.return %2 : i64 loc(#loc31) + }) : (tensor<1x2xi64>) -> tensor<1xi64> loc(#loc31) + tt.return %0 : tensor<1xi64> loc(#loc33) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc34) + tt.return %1 : tensor<1xi64> loc(#loc34) + } loc(#loc30) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc35)), %b: i64 loc("b"(#loc35))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc36) + tt.return %0 : i64 loc(#loc37) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc38) + tt.return %1 : i64 loc(#loc38) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_1S_fp32__(%a: tensor<1x1xf32> loc("a"(#loc39)), %b: f32 loc("b"(#loc39))) -> tensor<1x1xf32> attributes {noinline = false} { + %mask = tt.splat %b : f32 -> tensor<1x1xf32> loc(#loc95) + %mask_0 = arith.cmpf ogt, %a, %mask : tensor<1x1xf32> loc(#loc100) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_1S__(%a) : (tensor<1x1xf32>) -> i1 loc(#loc41) + %1 = scf.if %0 -> (tensor<1x1xi1>) { + %mask_1 = arith.cmpf une, %a, %a : tensor<1x1xf32> loc(#loc96) + %mask_2 = arith.ori %mask_0, %mask_1 : tensor<1x1xi1> loc(#loc101) + scf.yield %mask_2 : tensor<1x1xi1> loc(#loc101) + } else { + scf.yield %mask_0 : tensor<1x1xi1> loc(#loc32) + } loc(#loc42) + %2 = tt.splat %b : f32 -> tensor<1x1xf32> loc(#loc45) + %3 = arith.select %1, %a, %2 : tensor<1x1xi1>, tensor<1x1xf32> loc(#loc45) + tt.return %3 : tensor<1x1xf32> loc(#loc46) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x1xf32> loc(#loc47) + tt.return %4 : tensor<1x1xf32> loc(#loc47) + } loc(#loc39) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_1S__(%x: tensor<1x1xf32> loc("x"(#loc48))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_1S__(%x) : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc49) + %true = arith.constant true loc(#loc50) + tt.return %true : i1 loc(#loc50) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc51) + tt.return %1 : i1 loc(#loc51) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_1S__(%x: tensor<1x1xf32> loc("x"(#loc52))) -> tensor<1x1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc53) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc54) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc54) + %3 = arith.addf %x, %2 : tensor<1x1xf32> loc(#loc54) + tt.return %3 : tensor<1x1xf32> loc(#loc55) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x1xf32> loc(#loc56) + tt.return %4 : tensor<1x1xf32> loc(#loc56) + } loc(#loc52) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc58) + %cst = arith.constant dense : tensor<1xi1> loc(#loc58) + tt.return %cst : tensor<1xi1> loc(#loc59) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc60) + tt.return %0 : tensor<1xi1> loc(#loc60) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":26:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:30) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:37) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:24) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:27) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":39:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":41:12) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":42:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":43:20) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:49) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:68) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:4) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc65 = loc("xnumel"(#loc1)) +#loc66 = loc("r0_numel"(#loc2)) +#loc67 = loc("xoffset"(#loc3)) +#loc68 = loc("xoffset"(#loc4)) +#loc69 = loc("xindex"(#loc5)) +#loc70 = loc("xindex"(#loc6)) +#loc71 = loc("xindex"(#loc7)) +#loc72 = loc("xmask"(#loc8)) +#loc73 = loc("r0_index"(#loc9)) +#loc74 = loc("r0_index"(#loc10)) +#loc75 = loc("r0_offset"(#loc11)) +#loc76 = loc("r0_mask"(#loc12)) +#loc77 = loc("tmp0"(#loc13)) +#loc78 = loc("tmp0"(#loc14)) +#loc79 = loc("tmp4"(#loc15)) +#loc80 = loc("tmp4"(#loc16)) +#loc81 = loc("tmp3"(#loc17)) +#loc82 = loc("tmp3"(#loc18)) +#loc83 = loc("tmp7"(#loc19)) +#loc84 = loc("tmp7"(#loc20)) +#loc85 = loc("tmp8"(#loc21)) +#loc86 = loc("tmp9"(#loc22)) +#loc87 = loc("tmp10"(#loc23)) +#loc88 = loc("tmp11"(#loc24)) +#loc89 = loc("tmp12"(#loc25)) +#loc95 = loc("mask"(#loc40)) +#loc96 = loc("mask"(#loc43)) +#loc97 = loc("mask"(#loc44)) +#loc100 = loc("mask"(#loc95)) +#loc101 = loc("mask"(#loc97)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1d25dcaeff0da10dec2ef20776ecebbba24b964a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttgir @@ -0,0 +1,91 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":18:0) +#loc1 = loc(unknown) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:24) +#loc23 = loc("in_ptr0"(#loc)) +#loc24 = loc("in_ptr1"(#loc)) +#loc25 = loc("out_ptr2"(#loc)) +#loc26 = loc("r0_numel"(#loc)) +#loc32 = loc("tmp3"(#loc8)) +#loc34 = loc("tmp7"(#loc11)) +#loc44 = loc(callsite(#loc1 at #loc32)) +#loc46 = loc(callsite(#loc1 at #loc34)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %r0_index = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc27) + %r0_index_0 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2xi32, #blocked> loc(#loc27) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2x!tt.ptr, #blocked> loc(#loc28) + %tmp0_1 = tt.addptr %tmp0, %r0_index_0 : tensor<1x2x!tt.ptr, #blocked>, tensor<1x2xi32, #blocked> loc(#loc28) + %tmp0_2 = tt.load %tmp0_1 : tensor<1x2x!tt.ptr, #blocked> loc(#loc29) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2x!tt.ptr, #blocked> loc(#loc30) + %tmp4_3 = tt.addptr %tmp4, %r0_index_0 : tensor<1x2x!tt.ptr, #blocked>, tensor<1x2xi32, #blocked> loc(#loc30) + %tmp4_4 = tt.load %tmp4_3 : tensor<1x2x!tt.ptr, #blocked> loc(#loc31) + %tmp3 = "tt.reduce"(%tmp0_2) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_9: i64 loc(callsite(#loc1 at #loc32)), %tmp3_10: i64 loc(callsite(#loc1 at #loc32))): + %tmp3_11 = arith.addi %tmp3_9, %tmp3_10 : i64 loc(#loc51) + tt.reduce.return %tmp3_11 : i64 loc(#loc43) + }) : (tensor<1x2xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc33) + %tmp7 = "tt.reduce"(%tmp4_4) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_9: i64 loc(callsite(#loc1 at #loc34)), %tmp7_10: i64 loc(callsite(#loc1 at #loc34))): + %tmp7_11 = arith.addi %tmp7_9, %tmp7_10 : i64 loc(#loc52) + tt.reduce.return %tmp7_11 : i64 loc(#loc45) + }) : (tensor<1x2xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc45) + %tmp7_6 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc35) + %tmp8 = arith.sitofp %tmp3_5 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked> loc(#loc36) + %tmp9 = arith.sitofp %tmp7_6 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked> loc(#loc37) + %mask = arith.cmpf ogt, %tmp9, %cst : tensor<1x1xf32, #blocked> loc(#loc53) + %mask_7 = arith.cmpf une, %tmp9, %tmp9 : tensor<1x1xf32, #blocked> loc(#loc48) + %mask_8 = arith.ori %mask, %mask_7 : tensor<1x1xi1, #blocked> loc(#loc54) + %tmp11 = arith.select %mask_8, %tmp9, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked> loc(#loc50) + %tmp12 = arith.divf %tmp8, %tmp11 : tensor<1x1xf32, #blocked> loc(#loc42) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc21) + tt.store %0, %tmp12 : tensor<1x1x!tt.ptr, #blocked> loc(#loc21) + tt.return loc(#loc22) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":27:38) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:30) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:30) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:37) +#loc7 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc9 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":39:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":40:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":42:41) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":43:20) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:68) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:4) +#loc27 = loc("r0_index"(#loc2)) +#loc28 = loc("tmp0"(#loc3)) +#loc29 = loc("tmp0"(#loc4)) +#loc30 = loc("tmp4"(#loc5)) +#loc31 = loc("tmp4"(#loc6)) +#loc33 = loc("tmp3"(#loc10)) +#loc35 = loc("tmp7"(#loc12)) +#loc36 = loc("tmp8"(#loc13)) +#loc37 = loc("tmp9"(#loc14)) +#loc38 = loc("mask"(#loc15)) +#loc39 = loc("tmp11"(#loc16)) +#loc40 = loc("mask"(#loc17)) +#loc41 = loc("mask"(#loc18)) +#loc42 = loc("tmp12"(#loc20)) +#loc43 = loc(callsite(#loc7 at #loc32)) +#loc45 = loc(callsite(#loc7 at #loc34)) +#loc47 = loc("mask"(#loc38)) +#loc48 = loc(callsite(#loc40 at #loc39)) +#loc49 = loc("mask"(#loc41)) +#loc50 = loc(callsite(#loc19 at #loc39)) +#loc51 = loc(callsite(#loc9 at #loc43)) +#loc52 = loc(callsite(#loc9 at #loc45)) +#loc53 = loc(callsite(#loc47 at #loc39)) +#loc54 = loc(callsite(#loc49 at #loc39)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttir new file mode 100644 index 0000000000000000000000000000000000000000..606d49fd2029c87ab30ead77d9d78222f4aa4e0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/5QCATXXJOHVG6NQ75K46KBBWQRORJ7QMSXCTH7GV3JOSAQPCCD4Q/triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4.ttir @@ -0,0 +1,93 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":18:0) +#loc1 = loc(unknown) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:24) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:24) +#loc25 = loc("in_ptr0"(#loc)) +#loc26 = loc("in_ptr1"(#loc)) +#loc27 = loc("out_ptr2"(#loc)) +#loc28 = loc("r0_numel"(#loc)) +#loc35 = loc("tmp3"(#loc9)) +#loc37 = loc("tmp7"(#loc12)) +#loc47 = loc(callsite(#loc1 at #loc35)) +#loc49 = loc(callsite(#loc1 at #loc37)) +module { + tt.func public @triton_per_fused_clamp_min_div_eq_mul_squeeze_sum_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %r0_index = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc29) + %r0_index_0 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2x!tt.ptr> loc(#loc31) + %tmp0_1 = tt.addptr %tmp0, %r0_index_0 : tensor<1x2x!tt.ptr>, tensor<1x2xi32> loc(#loc31) + %tmp0_2 = tt.load %tmp0_1 : tensor<1x2x!tt.ptr> loc(#loc32) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2x!tt.ptr> loc(#loc33) + %tmp4_3 = tt.addptr %tmp4, %r0_index_0 : tensor<1x2x!tt.ptr>, tensor<1x2xi32> loc(#loc33) + %tmp4_4 = tt.load %tmp4_3 : tensor<1x2x!tt.ptr> loc(#loc34) + %tmp3 = "tt.reduce"(%tmp0_2) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_9: i64 loc(callsite(#loc1 at #loc35)), %tmp3_10: i64 loc(callsite(#loc1 at #loc35))): + %tmp3_11 = arith.addi %tmp3_9, %tmp3_10 : i64 loc(#loc54) + tt.reduce.return %tmp3_11 : i64 loc(#loc46) + }) : (tensor<1x2xi64>) -> tensor<1xi64> loc(#loc46) + %tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc36) + %tmp7 = "tt.reduce"(%tmp4_4) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_9: i64 loc(callsite(#loc1 at #loc37)), %tmp7_10: i64 loc(callsite(#loc1 at #loc37))): + %tmp7_11 = arith.addi %tmp7_9, %tmp7_10 : i64 loc(#loc55) + tt.reduce.return %tmp7_11 : i64 loc(#loc48) + }) : (tensor<1x2xi64>) -> tensor<1xi64> loc(#loc48) + %tmp7_6 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc38) + %tmp8 = arith.sitofp %tmp3_5 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc39) + %tmp9 = arith.sitofp %tmp7_6 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc40) + %mask = arith.cmpf ogt, %tmp9, %cst : tensor<1x1xf32> loc(#loc56) + %mask_7 = arith.cmpf une, %tmp9, %tmp9 : tensor<1x1xf32> loc(#loc51) + %mask_8 = arith.ori %mask, %mask_7 : tensor<1x1xi1> loc(#loc57) + %tmp11 = arith.select %mask_8, %tmp9, %cst : tensor<1x1xi1>, tensor<1x1xf32> loc(#loc53) + %tmp12 = arith.divf %tmp8, %tmp11 : tensor<1x1xf32> loc(#loc45) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc22) + tt.store %0, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc23) + tt.return loc(#loc24) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":27:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":27:38) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:30) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":33:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:30) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":34:37) +#loc8 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc10 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":36:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":38:27) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":39:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":40:19) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":42:41) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":43:20) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:25) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zf/czfjpc2bmajr63w5p55xld3zlnib2cxdzyx3n2uan6r324ujkv52.py":44:4) +#loc29 = loc("r0_index"(#loc2)) +#loc30 = loc("r0_index"(#loc3)) +#loc31 = loc("tmp0"(#loc4)) +#loc32 = loc("tmp0"(#loc5)) +#loc33 = loc("tmp4"(#loc6)) +#loc34 = loc("tmp4"(#loc7)) +#loc36 = loc("tmp3"(#loc11)) +#loc38 = loc("tmp7"(#loc13)) +#loc39 = loc("tmp8"(#loc14)) +#loc40 = loc("tmp9"(#loc15)) +#loc41 = loc("mask"(#loc16)) +#loc42 = loc("tmp11"(#loc17)) +#loc43 = loc("mask"(#loc18)) +#loc44 = loc("mask"(#loc19)) +#loc45 = loc("tmp12"(#loc21)) +#loc46 = loc(callsite(#loc8 at #loc35)) +#loc48 = loc(callsite(#loc8 at #loc37)) +#loc50 = loc("mask"(#loc41)) +#loc51 = loc(callsite(#loc43 at #loc42)) +#loc52 = loc("mask"(#loc44)) +#loc53 = loc(callsite(#loc20 at #loc42)) +#loc54 = loc(callsite(#loc10 at #loc46)) +#loc55 = loc(callsite(#loc10 at #loc48)) +#loc56 = loc(callsite(#loc50 at #loc42)) +#loc57 = loc(callsite(#loc52 at #loc42)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/__grp__triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/__grp__triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3339701bc04efce9ce10bf4f81607e76ec2f227c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/__grp__triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.source", "triton_poi_fused_new_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttir", "triton_poi_fused_new_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttgir", "triton_poi_fused_new_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.llir", "triton_poi_fused_new_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ptx", "triton_poi_fused_new_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.cubin", "triton_poi_fused_new_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c75101afbe0ec86c819eff603bdc3a9df8c03720 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddad9a25b421540766b2b9fc64fa100f4042552c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"hash": "f143f479009090646c7252c2be4342b94b67e2d7b7c0c3b6941ade655041dbc6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..1eecf4071844a58adad7ba2950f4071d92ec2304 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.llir @@ -0,0 +1,47 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_1(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 8, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = shl nuw nsw i32 %7, 1, !dbg !9 + %9 = and i32 %8, 254, !dbg !9 + %10 = or disjoint i32 %9, %6, !dbg !10 + %11 = icmp slt i32 %10, 544, !dbg !11 + %12 = sext i32 %10 to i64, !dbg !12 + %13 = getelementptr i32, ptr addrspace(1) %0, i64 %12, !dbg !12 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %13, i1 %11) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_1", linkageName: "triton_poi_fused_new_zeros_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 22, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 25, scope: !4) +!13 = !DILocation(line: 25, column: 36, scope: !4) +!14 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..5f7fef72e584a95269b66532f246bc5a2ba8dd4d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ptx @@ -0,0 +1,207 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_1 // -- Begin function triton_poi_fused_new_zeros_1 + // @triton_poi_fused_new_zeros_1 +.visible .entry triton_poi_fused_new_zeros_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_0, + .param .u32 triton_poi_fused_new_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<9>; + .reg .b64 %rd<3>; + .loc 1 18 0 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:18:0 +$L__func_begin0: + .loc 1 18 0 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_1_param_0]; +$L__tmp0: + .loc 1 20 28 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:20:28 + mov.u32 %r3, %ctaid.x; + .loc 1 20 33 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:20:33 + shl.b32 %r4, %r3, 8; + .loc 1 21 36 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:21:36 + mov.u32 %r5, %tid.x; + shl.b32 %r6, %r5, 1; + and.b32 %r7, %r6, 254; + .loc 1 21 23 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:21:23 + or.b32 %r8, %r7, %r4; + .loc 1 22 21 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:22:21 + setp.lt.s32 %p1, %r8, 544; + .loc 1 25 25 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:25 + mad.wide.s32 %rd1, %r8, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 25 36 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:36 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r1, %r1 }; + // end inline asm + .loc 1 25 4 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 122 +.b8 103 +.b8 118 +.b8 98 +.b8 118 +.b8 55 +.b8 109 +.b8 114 +.b8 111 +.b8 115 +.b8 111 +.b8 52 +.b8 118 +.b8 104 +.b8 101 +.b8 97 +.b8 51 +.b8 105 +.b8 98 +.b8 120 +.b8 120 +.b8 111 +.b8 115 +.b8 97 +.b8 53 +.b8 115 +.b8 121 +.b8 117 +.b8 115 +.b8 117 +.b8 99 +.b8 103 +.b8 115 +.b8 55 +.b8 117 +.b8 52 +.b8 103 +.b8 105 +.b8 50 +.b8 99 +.b8 108 +.b8 51 +.b8 98 +.b8 106 +.b8 50 +.b8 111 +.b8 105 +.b8 52 +.b8 117 +.b8 53 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 122 +.b8 103 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..053b959507b4984cdf08eb533468b7952c026600 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.source @@ -0,0 +1,41 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc11 = loc("out_ptr0"(#loc)) +#loc12 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 544 : i32 loc(#loc13) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_1 = arith.constant 256 : i32 loc(#loc15) + %xoffset_2 = arith.constant 256 : i32 loc(#loc15) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc15) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc16) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<256xi32> loc(#loc17) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<256xi32> loc(#loc17) + %xmask = arith.constant dense<544> : tensor<256xi32> loc(#loc18) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<256xi32> loc(#loc18) + %tmp0 = arith.constant 0 : i32 loc(#loc19) + %tmp0_7 = arith.constant dense<0> : tensor<1xi32> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_5 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc9) + tt.store %1, %cst, %xmask_6 : tensor<256x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc13 = loc("xnumel"(#loc1)) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("xmask"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..506e4a900cb2bd6580bbff12719725ffd1e54c34 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<544> : tensor<256xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_1 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14) + %xindex_2 = tt.splat %xoffset_1 : i32 -> tensor<256xi32, #blocked> loc(#loc15) + %xindex_3 = arith.addi %xindex_2, %xindex : tensor<256xi32, #blocked> loc(#loc15) + %xmask = arith.cmpi slt, %xindex_3, %cst : tensor<256xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_3 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> loc(#loc7) + tt.store %1, %cst_0, %xmask : tensor<256x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cb8128a80b4a5dc6d56fe8f277c4d546b890e249 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6FB7I6IASCIGI3DSKLBL4Q2CXFFWPYWXW7AMHNUUDLPGKUCB3PDA/triton_poi_fused_new_zeros_1.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1) + %xmask = arith.constant dense<544> : tensor<256xi32> loc(#loc12) + %c256_i32 = arith.constant 256 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc13) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc14) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc15) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc16) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc12) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc3 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc12 = loc("xmask"(#loc2)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xoffset"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xindex"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..71464e781ab80ba1f7fe1ba83e2a32c076095c3f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..08b295974ba32477f2ed670d320e8424608c98e5 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c132dc46a918f39f32dfddb72964b4802fba164e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "f3c4aded81dbff22e27b92b661d7597c49082d3021070a0d7156b4ffcc7213a5", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..8737cde1f9709e6cb1cfdfce776f83d55c708c2b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir @@ -0,0 +1,481 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py\00" +@assertMessage_0 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp6 < 151936\00" +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #1 !dbg !9 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %10 = icmp slt i32 %9, %4, !dbg !11 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %12 = shl nuw nsw i32 %11, 2, !dbg !12 + %13 = and i32 %12, 2044, !dbg !12 + %14 = mul i32 %9, 151936, !dbg !13 + %15 = zext nneg i32 %13 to i64, !dbg !14 + br label %16, !dbg !14 + +16: ; preds = %8, %16 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %16 ] + %17 = phi i32 [ 2147483647, %8 ], [ %104, %16 ] + %18 = phi i32 [ 2147483647, %8 ], [ %105, %16 ] + %19 = phi float [ 0xFFF0000000000000, %8 ], [ %101, %16 ] + %20 = phi float [ 0xFFF0000000000000, %8 ], [ %102, %16 ] + %21 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %100, %16 ] + %22 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %103, %16 ] + %23 = or disjoint i64 %indvars.iv, %15, !dbg !15 + %24 = or disjoint i64 %23, 2, !dbg !15 + %25 = or disjoint i64 %23, 3, !dbg !15 + %26 = icmp samesign ult i64 %23, 151936, !dbg !16 + %27 = trunc nuw nsw i64 %23 to i32, !dbg !17 + %28 = add i32 %14, %27, !dbg !17 + %29 = sext i32 %28 to i64, !dbg !18 + %30 = getelementptr bfloat, ptr addrspace(1) %1, i64 %29, !dbg !18 + %31 = and i1 %10, %26, !dbg !19 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %30, i64 %32, i1 %31) #5, !dbg !20 + %34 = extractvalue { i32, i32 } %33, 0, !dbg !20 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !20 + %36 = extractvalue { i32, i32 } %33, 1, !dbg !20 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !20 + %38 = extractelement <2 x bfloat> %37, i64 0, !dbg !20 + %39 = extractelement <2 x bfloat> %37, i64 1, !dbg !20 + %40 = fpext bfloat %38 to float, !dbg !21 + %41 = fpext bfloat %39 to float, !dbg !21 + %42 = fcmp ogt float %19, %40, !dbg !22 + %43 = fcmp ogt float %20, %41, !dbg !22 + %44 = fcmp oeq float %19, %40, !dbg !26 + %45 = fcmp oeq float %20, %41, !dbg !26 + %46 = fcmp uno <2 x float> %21, zeroinitializer, !dbg !27 + %47 = fcmp uno float %19, 0.000000e+00, !dbg !27 + %48 = fcmp uno float %20, 0.000000e+00, !dbg !27 + %49 = fcmp uno bfloat %38, 0xR0000, !dbg !28 + %50 = fcmp uno bfloat %39, 0xR0000, !dbg !28 + %51 = xor i1 %49, true, !dbg !29 + %52 = xor i1 %50, true, !dbg !29 + %53 = and i1 %47, %51, !dbg !30 + %54 = and i1 %48, %52, !dbg !30 + %55 = or i1 %42, %53, !dbg !31 + %56 = or i1 %43, %54, !dbg !31 + %57 = and i1 %47, %49, !dbg !32 + %58 = and i1 %48, %50, !dbg !32 + %59 = or i1 %44, %57, !dbg !33 + %60 = or i1 %45, %58, !dbg !33 + %61 = sext <2 x i32> %22 to <2 x i64>, !dbg !34 + %62 = sext i32 %17 to i64, !dbg !34 + %63 = icmp sgt i64 %24, %62, !dbg !34 + %64 = sext i32 %18 to i64, !dbg !34 + %65 = icmp sgt i64 %25, %64, !dbg !34 + %66 = and i1 %63, %59, !dbg !35 + %67 = and i1 %65, %60, !dbg !35 + %68 = or i1 %55, %66, !dbg !36 + %69 = or i1 %56, %67, !dbg !36 + %70 = select i1 %68, float %19, float %40, !dbg !37 + %71 = select i1 %69, float %20, float %41, !dbg !37 + %72 = trunc nuw nsw i64 %23 to i32, !dbg !38 + %73 = or disjoint i32 %72, 1, !dbg !38 + %74 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !21 + %75 = fcmp ogt <2 x float> %21, %74, !dbg !22 + %76 = fcmp oeq <2 x float> %21, %74, !dbg !26 + %77 = fcmp uno <2 x bfloat> %35, zeroinitializer, !dbg !28 + %78 = xor <2 x i1> %77, splat (i1 true), !dbg !29 + %79 = and <2 x i1> %46, %78, !dbg !30 + %80 = or <2 x i1> %75, %79, !dbg !31 + %81 = and <2 x i1> %46, %77, !dbg !32 + %82 = or <2 x i1> %76, %81, !dbg !33 + %83 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !34 + %84 = shufflevector <2 x i64> %83, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !34 + %85 = icmp sgt <2 x i64> %84, %61, !dbg !34 + %86 = icmp sge <2 x i64> %84, %61, !dbg !34 + %87 = shufflevector <2 x i1> %85, <2 x i1> %86, <2 x i32> , !dbg !34 + %88 = and <2 x i1> %87, %82, !dbg !35 + %89 = or <2 x i1> %80, %88, !dbg !36 + %90 = select <2 x i1> %89, <2 x float> %21, <2 x float> %74, !dbg !37 + %91 = insertelement <2 x i32> poison, i32 %27, i64 0, !dbg !38 + %92 = insertelement <2 x i32> %91, i32 %73, i64 1, !dbg !38 + %93 = select <2 x i1> %89, <2 x i32> %22, <2 x i32> %92, !dbg !38 + %94 = trunc nuw nsw i64 %24 to i32, !dbg !38 + %95 = select i1 %68, i32 %17, i32 %94, !dbg !38 + %96 = trunc nuw nsw i64 %25 to i32, !dbg !38 + %97 = select i1 %69, i32 %18, i32 %96, !dbg !38 + %98 = insertelement <2 x i1> poison, i1 %31, i64 0, !dbg !39 + %99 = shufflevector <2 x i1> %98, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !39 + %100 = select <2 x i1> %99, <2 x float> %90, <2 x float> %21, !dbg !39 + %101 = select i1 %31, float %70, float %19, !dbg !39 + %102 = select i1 %31, float %71, float %20, !dbg !39 + %103 = select <2 x i1> %99, <2 x i32> %93, <2 x i32> %22, !dbg !40 + %104 = select i1 %31, i32 %95, i32 %17, !dbg !40 + %105 = select i1 %31, i32 %97, i32 %18, !dbg !40 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !14 + %106 = icmp samesign ult i64 %indvars.iv, 149888, !dbg !14 + br i1 %106, label %16, label %107, !dbg !14 + +107: ; preds = %16 + %108 = and i32 %11, 31, !dbg !12 + %109 = lshr i32 %11, 5, !dbg !12 + %110 = shufflevector <2 x float> %100, <2 x float> poison, <2 x i32> , !dbg !41 + %111 = fcmp ogt <2 x float> %100, %110, !dbg !41 + %112 = fcmp oeq <2 x float> %100, %110, !dbg !41 + %113 = shufflevector <2 x i1> %111, <2 x i1> %112, <2 x i32> , !dbg !41 + %114 = extractelement <2 x float> %100, i64 0, !dbg !43 + %115 = fcmp uno float %114, 0.000000e+00, !dbg !43 + %116 = extractelement <2 x float> %100, i64 1, !dbg !44 + %117 = fcmp uno float %116, 0.000000e+00, !dbg !44 + %118 = xor i1 %117, true, !dbg !45 + %119 = insertelement <2 x i1> poison, i1 %115, i64 0, !dbg !46 + %120 = shufflevector <2 x i1> %119, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !46 + %121 = insertelement <2 x i1> poison, i1 %118, i64 0, !dbg !46 + %122 = insertelement <2 x i1> %121, i1 %117, i64 1, !dbg !46 + %123 = and <2 x i1> %120, %122, !dbg !46 + %124 = or <2 x i1> %113, %123, !dbg !47 + %125 = extractelement <2 x i32> %103, i64 0, !dbg !48 + %126 = extractelement <2 x i32> %103, i64 1, !dbg !48 + %127 = icmp slt i32 %125, %126, !dbg !48 + %128 = extractelement <2 x i1> %124, i64 1, !dbg !49 + %129 = and i1 %127, %128, !dbg !49 + %130 = extractelement <2 x i1> %124, i64 0, !dbg !50 + %131 = or i1 %130, %129, !dbg !50 + %132 = select i1 %131, float %114, float %116, !dbg !51 + %133 = select i1 %131, i32 %125, i32 %126, !dbg !52 + %134 = fcmp ogt float %132, %101, !dbg !41 + %135 = fcmp oeq float %132, %101, !dbg !53 + %136 = fcmp uno float %132, 0.000000e+00, !dbg !43 + %137 = fcmp uno float %101, 0.000000e+00, !dbg !44 + %138 = xor i1 %137, true, !dbg !45 + %139 = and i1 %136, %138, !dbg !46 + %140 = or i1 %134, %139, !dbg !47 + %141 = and i1 %137, %136, !dbg !54 + %142 = or i1 %135, %141, !dbg !55 + %143 = icmp slt i32 %133, %104, !dbg !48 + %144 = and i1 %143, %142, !dbg !49 + %145 = or i1 %140, %144, !dbg !50 + %146 = select i1 %145, float %132, float %101, !dbg !51 + %147 = select i1 %145, i32 %133, i32 %104, !dbg !52 + %148 = fcmp ogt float %146, %102, !dbg !41 + %149 = fcmp oeq float %146, %102, !dbg !53 + %150 = fcmp uno float %146, 0.000000e+00, !dbg !43 + %151 = fcmp uno float %102, 0.000000e+00, !dbg !44 + %152 = xor i1 %151, true, !dbg !45 + %153 = and i1 %150, %152, !dbg !46 + %154 = or i1 %148, %153, !dbg !47 + %155 = and i1 %151, %150, !dbg !54 + %156 = or i1 %149, %155, !dbg !55 + %157 = icmp slt i32 %147, %105, !dbg !48 + %158 = and i1 %157, %156, !dbg !49 + %159 = or i1 %154, %158, !dbg !50 + %160 = select i1 %159, float %146, float %102, !dbg !51 + %161 = select i1 %159, i32 %147, i32 %105, !dbg !52 + %162 = bitcast float %160 to i32, !dbg !56 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 16, i32 31), !dbg !56 + %164 = bitcast i32 %163 to float, !dbg !56 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 16, i32 31), !dbg !56 + %166 = fcmp ogt float %160, %164, !dbg !41 + %167 = fcmp oeq float %160, %164, !dbg !53 + %168 = fcmp uno float %160, 0.000000e+00, !dbg !43 + %169 = fcmp uno float %164, 0.000000e+00, !dbg !44 + %170 = xor i1 %169, true, !dbg !45 + %171 = and i1 %168, %170, !dbg !46 + %172 = or i1 %166, %171, !dbg !47 + %173 = and i1 %168, %169, !dbg !54 + %174 = or i1 %167, %173, !dbg !55 + %175 = icmp slt i32 %161, %165, !dbg !48 + %176 = and i1 %175, %174, !dbg !49 + %177 = or i1 %172, %176, !dbg !50 + %178 = select i1 %177, float %160, float %164, !dbg !51 + %179 = select i1 %177, i32 %161, i32 %165, !dbg !52 + %180 = bitcast float %178 to i32, !dbg !56 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !56 + %182 = bitcast i32 %181 to float, !dbg !56 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !56 + %184 = fcmp ogt float %178, %182, !dbg !41 + %185 = fcmp oeq float %178, %182, !dbg !53 + %186 = fcmp uno float %178, 0.000000e+00, !dbg !43 + %187 = fcmp uno float %182, 0.000000e+00, !dbg !44 + %188 = xor i1 %187, true, !dbg !45 + %189 = and i1 %186, %188, !dbg !46 + %190 = or i1 %184, %189, !dbg !47 + %191 = and i1 %187, %186, !dbg !54 + %192 = or i1 %185, %191, !dbg !55 + %193 = icmp slt i32 %179, %183, !dbg !48 + %194 = and i1 %193, %192, !dbg !49 + %195 = or i1 %190, %194, !dbg !50 + %196 = select i1 %195, float %178, float %182, !dbg !51 + %197 = select i1 %195, i32 %179, i32 %183, !dbg !52 + %198 = bitcast float %196 to i32, !dbg !56 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 4, i32 31), !dbg !56 + %200 = bitcast i32 %199 to float, !dbg !56 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !56 + %202 = fcmp ogt float %196, %200, !dbg !41 + %203 = fcmp oeq float %196, %200, !dbg !53 + %204 = fcmp uno float %196, 0.000000e+00, !dbg !43 + %205 = fcmp uno float %200, 0.000000e+00, !dbg !44 + %206 = xor i1 %205, true, !dbg !45 + %207 = and i1 %204, %206, !dbg !46 + %208 = or i1 %202, %207, !dbg !47 + %209 = and i1 %205, %204, !dbg !54 + %210 = or i1 %203, %209, !dbg !55 + %211 = icmp slt i32 %197, %201, !dbg !48 + %212 = and i1 %211, %210, !dbg !49 + %213 = or i1 %208, %212, !dbg !50 + %214 = select i1 %213, float %196, float %200, !dbg !51 + %215 = select i1 %213, i32 %197, i32 %201, !dbg !52 + %216 = bitcast float %214 to i32, !dbg !56 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 2, i32 31), !dbg !56 + %218 = bitcast i32 %217 to float, !dbg !56 + %219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !56 + %220 = fcmp ogt float %214, %218, !dbg !41 + %221 = fcmp oeq float %214, %218, !dbg !53 + %222 = fcmp uno float %214, 0.000000e+00, !dbg !43 + %223 = fcmp uno float %218, 0.000000e+00, !dbg !44 + %224 = xor i1 %223, true, !dbg !45 + %225 = and i1 %222, %224, !dbg !46 + %226 = or i1 %220, %225, !dbg !47 + %227 = and i1 %223, %222, !dbg !54 + %228 = or i1 %221, %227, !dbg !55 + %229 = icmp slt i32 %215, %219, !dbg !48 + %230 = and i1 %229, %228, !dbg !49 + %231 = or i1 %226, %230, !dbg !50 + %232 = select i1 %231, float %214, float %218, !dbg !51 + %233 = select i1 %231, i32 %215, i32 %219, !dbg !52 + %234 = bitcast float %232 to i32, !dbg !56 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 1, i32 31), !dbg !56 + %236 = bitcast i32 %235 to float, !dbg !56 + %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !56 + %238 = fcmp ogt float %232, %236, !dbg !41 + %239 = fcmp oeq float %232, %236, !dbg !53 + %240 = fcmp uno float %232, 0.000000e+00, !dbg !43 + %241 = fcmp uno float %236, 0.000000e+00, !dbg !44 + %242 = xor i1 %241, true, !dbg !45 + %243 = and i1 %240, %242, !dbg !46 + %244 = or i1 %238, %243, !dbg !47 + %245 = and i1 %241, %240, !dbg !54 + %246 = or i1 %239, %245, !dbg !55 + %247 = icmp slt i32 %233, %237, !dbg !48 + %248 = and i1 %247, %246, !dbg !49 + %249 = or i1 %244, %248, !dbg !50 + %250 = select i1 %249, i32 %233, i32 %237, !dbg !52 + %251 = and i32 %109, 15, !dbg !56 + %252 = icmp eq i32 %108, 0, !dbg !56 + %253 = getelementptr float, ptr addrspace(3) @global_smem, i32 %251, !dbg !56 + %254 = select i1 %249, i32 %234, i32 %235, !dbg !51 + %255 = insertelement <1 x i32> poison, i32 %254, i64 0, !dbg !56 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %253, <1 x i32> %255, i1 %252) #5, !dbg !56 + %256 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %251, !dbg !56 + %257 = insertelement <1 x i32> poison, i32 %250, i64 0, !dbg !56 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %256, <1 x i32> %257, i1 %252) #5, !dbg !56 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !56 + %258 = icmp samesign ult i32 %11, 16, !dbg !56 + %259 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !56 + %260 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %259, i1 %258) #5, !dbg !56 + %261 = bitcast i32 %260 to float, !dbg !56 + %262 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !56 + %263 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %262, i1 %258) #5, !dbg !56 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 8, i32 31), !dbg !56 + %265 = bitcast i32 %264 to float, !dbg !56 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 8, i32 31), !dbg !56 + %267 = fcmp ogt float %261, %265, !dbg !41 + %268 = fcmp oeq float %261, %265, !dbg !53 + %269 = fcmp uno float %261, 0.000000e+00, !dbg !43 + %270 = fcmp uno float %265, 0.000000e+00, !dbg !44 + %271 = xor i1 %270, true, !dbg !45 + %272 = and i1 %269, %271, !dbg !46 + %273 = or i1 %267, %272, !dbg !47 + %274 = and i1 %269, %270, !dbg !54 + %275 = or i1 %268, %274, !dbg !55 + %276 = icmp slt i32 %263, %266, !dbg !48 + %277 = and i1 %276, %275, !dbg !49 + %278 = or i1 %273, %277, !dbg !50 + %279 = select i1 %278, float %261, float %265, !dbg !51 + %280 = select i1 %278, i32 %263, i32 %266, !dbg !52 + %281 = bitcast float %279 to i32, !dbg !56 + %282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 4, i32 31), !dbg !56 + %283 = bitcast i32 %282 to float, !dbg !56 + %284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 4, i32 31), !dbg !56 + %285 = fcmp ogt float %279, %283, !dbg !41 + %286 = fcmp oeq float %279, %283, !dbg !53 + %287 = fcmp uno float %279, 0.000000e+00, !dbg !43 + %288 = fcmp uno float %283, 0.000000e+00, !dbg !44 + %289 = xor i1 %288, true, !dbg !45 + %290 = and i1 %287, %289, !dbg !46 + %291 = or i1 %285, %290, !dbg !47 + %292 = and i1 %288, %287, !dbg !54 + %293 = or i1 %286, %292, !dbg !55 + %294 = icmp slt i32 %280, %284, !dbg !48 + %295 = and i1 %294, %293, !dbg !49 + %296 = or i1 %291, %295, !dbg !50 + %297 = select i1 %296, float %279, float %283, !dbg !51 + %298 = select i1 %296, i32 %280, i32 %284, !dbg !52 + %299 = bitcast float %297 to i32, !dbg !56 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 2, i32 31), !dbg !56 + %301 = bitcast i32 %300 to float, !dbg !56 + %302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !56 + %303 = fcmp ogt float %297, %301, !dbg !41 + %304 = fcmp oeq float %297, %301, !dbg !53 + %305 = fcmp uno float %297, 0.000000e+00, !dbg !43 + %306 = fcmp uno float %301, 0.000000e+00, !dbg !44 + %307 = xor i1 %306, true, !dbg !45 + %308 = and i1 %305, %307, !dbg !46 + %309 = or i1 %303, %308, !dbg !47 + %310 = and i1 %306, %305, !dbg !54 + %311 = or i1 %304, %310, !dbg !55 + %312 = icmp slt i32 %298, %302, !dbg !48 + %313 = and i1 %312, %311, !dbg !49 + %314 = or i1 %309, %313, !dbg !50 + %315 = select i1 %314, float %297, float %301, !dbg !51 + %316 = select i1 %314, i32 %298, i32 %302, !dbg !52 + %317 = bitcast float %315 to i32, !dbg !56 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !56 + %319 = bitcast i32 %318 to float, !dbg !56 + %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 1, i32 31), !dbg !56 + %321 = fcmp ogt float %315, %319, !dbg !41 + %322 = fcmp oeq float %315, %319, !dbg !53 + %323 = fcmp uno float %315, 0.000000e+00, !dbg !43 + %324 = fcmp uno float %319, 0.000000e+00, !dbg !44 + %325 = xor i1 %324, true, !dbg !45 + %326 = and i1 %323, %325, !dbg !46 + %327 = or i1 %321, %326, !dbg !47 + %328 = and i1 %324, %323, !dbg !54 + %329 = or i1 %322, %328, !dbg !55 + %330 = icmp slt i32 %316, %320, !dbg !48 + %331 = and i1 %330, %329, !dbg !49 + %332 = or i1 %327, %331, !dbg !50 + %333 = select i1 %332, i32 %316, i32 %320, !dbg !52 + %334 = icmp eq i32 %11, 0, !dbg !56 + %335 = select i1 %332, i32 %317, i32 %318, !dbg !51 + %336 = insertelement <1 x i32> poison, i32 %335, i64 0, !dbg !56 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %259, <1 x i32> %336, i1 %334) #5, !dbg !56 + %337 = insertelement <1 x i32> poison, i32 %333, i64 0, !dbg !56 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, <1 x i32> %337, i1 %334) #5, !dbg !56 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !56 + %338 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !56 + %339 = zext nneg i32 %9 to i64, !dbg !57 + %340 = getelementptr i64, ptr addrspace(1) %3, i64 %339, !dbg !57 + %341 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !58 + %342 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %340, i64 %341, i1 %10) #5, !dbg !58 + %343 = add i32 %338, 151936, !dbg !59 + %344 = icmp slt i32 %338, 0, !dbg !60 + %345 = select i1 %344, i32 %343, i32 %338, !dbg !61 + %346 = icmp ugt i32 %345, 151935, !dbg !62 + %.not2 = and i1 %10, %346, !dbg !63 + br i1 %.not2, label %347, label %348, !dbg !63 + +347: ; preds = %107 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 50, ptr nonnull @assertFunc_0, i64 1), !dbg !63 + unreachable, !dbg !63 + +348: ; preds = %107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !63 + %349 = sext i32 %345 to i64, !dbg !64 + %350 = getelementptr i1, ptr addrspace(1) %2, i64 %349, !dbg !64 + %351 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !65 + %352 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b8 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %350, i64 %351, i1 %10) #5, !dbg !65 + %.not = icmp eq i8 %352, 0, !dbg !65 + %353 = select i1 %.not, i64 0, i64 %342, !dbg !66 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !67 + %354 = getelementptr i64, ptr addrspace(1) %0, i64 %339, !dbg !68 + %355 = and i32 %11, 511, !dbg !69 + %356 = icmp eq i32 %355, 0, !dbg !69 + %357 = and i1 %356, %10, !dbg !69 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %353, ptr addrspace(1) %354, i1 %357) #5, !dbg !69 + ret void, !dbg !70 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="512" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", linkageName: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 36, column: 48, scope: !9) +!14 = !DILocation(line: 30, column: 40, scope: !9) +!15 = !DILocation(line: 31, column: 31, scope: !9) +!16 = !DILocation(line: 32, column: 29, scope: !9) +!17 = !DILocation(line: 36, column: 41, scope: !9) +!18 = !DILocation(line: 36, column: 34, scope: !9) +!19 = !DILocation(line: 36, column: 63, scope: !9) +!20 = !DILocation(line: 36, column: 53, scope: !9) +!21 = !DILocation(line: 36, column: 115, scope: !9) +!22 = !DILocation(line: 144, column: 21, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 39, column: 38, scope: !9) +!26 = !DILocation(line: 145, column: 23, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 147, column: 29, scope: !23, inlinedAt: !25) +!28 = !DILocation(line: 148, column: 29, scope: !23, inlinedAt: !25) +!29 = !DILocation(line: 149, column: 31, scope: !23, inlinedAt: !25) +!30 = !DILocation(line: 149, column: 27, scope: !23, inlinedAt: !25) +!31 = !DILocation(line: 149, column: 16, scope: !23, inlinedAt: !25) +!32 = !DILocation(line: 151, column: 27, scope: !23, inlinedAt: !25) +!33 = !DILocation(line: 151, column: 17, scope: !23, inlinedAt: !25) +!34 = !DILocation(line: 154, column: 31, scope: !23, inlinedAt: !25) +!35 = !DILocation(line: 154, column: 21, scope: !23, inlinedAt: !25) +!36 = !DILocation(line: 154, column: 12, scope: !23, inlinedAt: !25) +!37 = !DILocation(line: 155, column: 35, scope: !23, inlinedAt: !25) +!38 = !DILocation(line: 155, column: 69, scope: !23, inlinedAt: !25) +!39 = !DILocation(line: 41, column: 54, scope: !9) +!40 = !DILocation(line: 42, column: 66, scope: !9) +!41 = !DILocation(line: 144, column: 21, scope: !23, inlinedAt: !42) +!42 = !DILocation(line: 43, column: 75, scope: !9) +!43 = !DILocation(line: 147, column: 29, scope: !23, inlinedAt: !42) +!44 = !DILocation(line: 148, column: 29, scope: !23, inlinedAt: !42) +!45 = !DILocation(line: 149, column: 31, scope: !23, inlinedAt: !42) +!46 = !DILocation(line: 149, column: 27, scope: !23, inlinedAt: !42) +!47 = !DILocation(line: 149, column: 16, scope: !23, inlinedAt: !42) +!48 = !DILocation(line: 154, column: 31, scope: !23, inlinedAt: !42) +!49 = !DILocation(line: 154, column: 21, scope: !23, inlinedAt: !42) +!50 = !DILocation(line: 154, column: 12, scope: !23, inlinedAt: !42) +!51 = !DILocation(line: 155, column: 35, scope: !23, inlinedAt: !42) +!52 = !DILocation(line: 155, column: 69, scope: !23, inlinedAt: !42) +!53 = !DILocation(line: 145, column: 23, scope: !23, inlinedAt: !42) +!54 = !DILocation(line: 151, column: 27, scope: !23, inlinedAt: !42) +!55 = !DILocation(line: 151, column: 17, scope: !23, inlinedAt: !42) +!56 = !DILocation(line: 165, column: 42, scope: !23, inlinedAt: !42) +!57 = !DILocation(line: 45, column: 31, scope: !9) +!58 = !DILocation(line: 45, column: 36, scope: !9) +!59 = !DILocation(line: 47, column: 18, scope: !9) +!60 = !DILocation(line: 48, column: 18, scope: !9) +!61 = !DILocation(line: 49, column: 32, scope: !9) +!62 = !DILocation(line: 50, column: 37, scope: !9) +!63 = !DILocation(line: 50, column: 65, scope: !9) +!64 = !DILocation(line: 51, column: 30, scope: !9) +!65 = !DILocation(line: 51, column: 37, scope: !9) +!66 = !DILocation(line: 54, column: 20, scope: !9) +!67 = !DILocation(line: 55, column: 4, scope: !9) +!68 = !DILocation(line: 56, column: 28, scope: !9) +!69 = !DILocation(line: 56, column: 40, scope: !9) +!70 = !DILocation(line: 56, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9663e6641977da4b3ce9f2c76f58eb56491dd032 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx @@ -0,0 +1,954 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 // -- Begin function triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 98, 47, 99, 55, 98, 101, 117, 110, 101, 116, 102, 105, 111, 117, 97, 51, 105, 103, 100, 121, 107, 112, 114, 55, 112, 100, 117, 111, 121, 110, 114, 104, 50, 52, 105, 105, 107, 109, 119, 118, 106, 115, 55, 54, 103, 112, 97, 54, 111, 109, 121, 120, 118, 115, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 54, 32, 60, 32, 49, 53, 49, 57, 51, 54}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.visible .entry triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3, + .param .u32 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_4, + .param .u32 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<212>; + .reg .b16 %rs<7>; + .reg .b32 %r<118>; + .reg .b64 %rd<46>; + .loc 1 18 0 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:18:0 + +// %bb.0: + ld.param.b32 %r16, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_4]; + ld.param.b64 %rd12, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2]; + ld.param.b64 %rd10, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1]; + ld.param.b64 %rd9, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0]; +$L__tmp0: + .loc 1 22 28 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:22:28 + mov.u32 %r1, %ctaid.x; + .loc 1 25 37 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:25:37 + mov.u32 %r2, %tid.x; + shl.b32 %r21, %r2, 2; + and.b32 %r22, %r21, 2044; + .loc 1 30 40 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:30:40 + cvt.u64.u32 %rd1, %r22; + mad.lo.s32 %r23, %r1, 151936, %r22; + cvt.u64.u32 %rd2, %r23; + mov.b32 %r114, 0fFF800000; + mov.b64 %rd45, {%r114, %r114}; + mov.b32 %r112, 2147483647; + mov.b64 %rd44, -2048; + setp.lt.s32 %p2, %r1, %r16; + mov.b32 %r113, %r112; + mov.b32 %r115, %r114; + mov.b32 %r116, %r112; + mov.b32 %r117, %r112; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:31:31 + add.s64 %rd18, %rd1, %rd44; + add.s64 %rd19, %rd18, 2048; + add.s64 %rd20, %rd18, 2050; + .loc 1 32 29 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:32:29 + add.s64 %rd21, %rd18, 2051; + setp.lt.u64 %p3, %rd19, 151936; + .loc 1 36 34 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:36:34 + add.s64 %rd22, %rd2, %rd44; + cvt.u32.u64 %r28, %rd22; + add.s32 %r29, %r28, 2048; + mad.wide.s32 %rd16, %r29, 2, %rd10; + .loc 1 36 63 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:36:63 + and.pred %p1, %p2, %p3; + .loc 1 36 53 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:36:53 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + mov.b32 %r26, 0; + // begin inline asm + mov.u32 %r24, %r26; + mov.u32 %r25, %r26; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r24, %r25 }, [ %rd16 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs1, %rs2}, %r25; + .loc 1 36 115 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:36:115 + cvt.f32.bf16 %r30, %rs1; + cvt.f32.bf16 %r31, %rs2; +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.gt.f32 %p4, %r114, %r30; + setp.gt.f32 %p5, %r115, %r31; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.eq.f32 %p6, %r114, %r30; + setp.eq.f32 %p7, %r115, %r31; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + mov.b64 {%r32, %r33}, %rd45; + setp.nan.f32 %p8, %r32, %r32; + setp.nan.f32 %p9, %r33, %r33; + setp.nan.f32 %p10, %r114, %r114; + setp.nan.f32 %p11, %r115, %r115; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.nan.bf16 %p12, %rs1, %rs1; + setp.num.bf16 %p13, %rs1, %rs1; + setp.nan.bf16 %p14, %rs2, %rs2; + setp.num.bf16 %p15, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p16, %p10, %p13; + and.pred %p17, %p11, %p15; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p18, %p4, %p16; + or.pred %p19, %p5, %p17; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p20, %p10, %p12; + and.pred %p21, %p11, %p14; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p22, %p6, %p20; + or.pred %p23, %p7, %p21; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + cvt.s64.s32 %rd23, %r117; + cvt.s64.s32 %rd24, %r116; + cvt.s64.s32 %rd25, %r112; + setp.gt.s64 %p24, %rd20, %rd25; + cvt.s64.s32 %rd26, %r113; + setp.gt.s64 %p25, %rd21, %rd26; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p26, %p24, %p22; + and.pred %p27, %p25, %p23; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p28, %p18, %p26; + or.pred %p29, %p19, %p27; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + selp.f32 %r34, %r114, %r30, %p28; + selp.f32 %r35, %r115, %r31, %p29; +$L__tmp2: + .loc 1 36 115 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:36:115 + mov.b32 {%rs3, %rs4}, %r24; + cvt.f32.bf16 %r36, %rs3; + cvt.f32.bf16 %r37, %rs4; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.gt.f32 %p30, %r33, %r37; + setp.gt.f32 %p31, %r32, %r36; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.eq.f32 %p32, %r32, %r36; + setp.eq.f32 %p33, %r33, %r37; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.nan.bf16x2 %p34|%p35, %r24, %r26; + setp.num.bf16x2 %p36|%p37, %r24, %r26; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p38, %p9, %p37; + and.pred %p39, %p8, %p36; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p40, %p31, %p39; + or.pred %p41, %p30, %p38; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p42, %p8, %p34; + and.pred %p43, %p9, %p35; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p44, %p33, %p43; + or.pred %p45, %p32, %p42; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + setp.gt.s64 %p46, %rd19, %rd24; + setp.ge.s64 %p47, %rd19, %rd23; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + and.pred %p48, %p46, %p45; + and.pred %p49, %p47, %p44; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + or.pred %p50, %p41, %p49; + or.pred %p51, %p40, %p48; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + selp.f32 %r38, %r32, %r36, %p51; + selp.f32 %r39, %r33, %r37, %p50; + cvt.u32.u64 %r40, %rd19; + cvt.u32.u64 %r41, %rd18; + add.s32 %r42, %r41, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:39:38 ] + selp.b32 %r43, %r116, %r40, %p51; + selp.b32 %r44, %r117, %r42, %p50; + cvt.u32.u64 %r45, %rd20; + selp.b32 %r46, %r112, %r45, %p28; + cvt.u32.u64 %r47, %rd21; + selp.b32 %r48, %r113, %r47, %p29; +$L__tmp4: + .loc 1 41 54 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:41:54 + selp.f32 %r49, %r39, %r33, %p1; + selp.f32 %r50, %r38, %r32, %p1; + mov.b64 %rd45, {%r50, %r49}; + selp.f32 %r114, %r34, %r114, %p1; + selp.f32 %r115, %r35, %r115, %p1; + .loc 1 42 66 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:42:66 + selp.b32 %r117, %r44, %r117, %p1; + selp.b32 %r116, %r43, %r116, %p1; + selp.b32 %r112, %r46, %r112, %p1; + selp.b32 %r113, %r48, %r113, %p1; + .loc 1 30 40 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:30:40 + add.s64 %rd44, %rd44, 2048; + setp.lt.u64 %p52, %rd44, 149888; + @%p52 bra $L__BB0_1; +// %bb.2: + .loc 1 24 21 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:24:21 + setp.ge.s32 %p60, %r1, %r16; + .loc 1 25 37 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:25:37 + and.b32 %r63, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + mov.b64 {%r64, %r65}, %rd45; + setp.gt.f32 %p61, %r64, %r65; + setp.eq.f32 %p62, %r65, %r64; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p63, %r64, %r64; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.num.f32 %p64, %r65, %r65; + setp.nan.f32 %p65, %r65, %r65; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p66, %p63, %p65; + and.pred %p67, %p63, %p64; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p68, %p61, %p67; + or.pred %p69, %p62, %p66; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p70, %r116, %r117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p71, %p70, %p69; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p72, %p68, %p71; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r66, %r64, %r65, %p72; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r67, %r116, %r117, %p72; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p73, %r66, %r114; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p74, %r66, %r114; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p75, %r66, %r66; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p76, %r114, %r114; + setp.num.f32 %p77, %r114, %r114; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p78, %p75, %p77; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p79, %p73, %p78; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p80, %p76, %p75; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p81, %p74, %p80; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p82, %r67, %r112; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p83, %p82, %p81; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p84, %p79, %p83; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r68, %r66, %r114, %p84; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r69, %r67, %r112, %p84; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p85, %r68, %r115; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p86, %r68, %r115; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p87, %r68, %r68; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p88, %r115, %r115; + setp.num.f32 %p89, %r115, %r115; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p90, %p87, %p89; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p91, %p85, %p90; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p92, %p88, %p87; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p93, %p86, %p92; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p94, %r69, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p95, %p94, %p93; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p96, %p91, %p95; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r70, %r68, %r115, %p96; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r71, %r69, %r113, %p96; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r72, %r70, 16, 31, -1; + shfl.sync.bfly.b32 %r73, %r71, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p97, %r70, %r72; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p98, %r70, %r72; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p99, %r70, %r70; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p100, %r72, %r72; + setp.num.f32 %p101, %r72, %r72; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p102, %p99, %p101; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p103, %p97, %p102; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p104, %p99, %p100; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p105, %p98, %p104; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p106, %r71, %r73; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p107, %p106, %p105; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p108, %p103, %p107; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r74, %r70, %r72, %p108; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r75, %r71, %r73, %p108; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r76, %r74, 8, 31, -1; + shfl.sync.bfly.b32 %r77, %r75, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p109, %r74, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p110, %r74, %r76; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p111, %r74, %r74; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p112, %r76, %r76; + setp.num.f32 %p113, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p114, %p111, %p113; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p115, %p109, %p114; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p116, %p112, %p111; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p117, %p110, %p116; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p118, %r75, %r77; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p119, %p118, %p117; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p120, %p115, %p119; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r78, %r74, %r76, %p120; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r79, %r75, %r77, %p120; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r80, %r78, 4, 31, -1; + shfl.sync.bfly.b32 %r81, %r79, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p121, %r78, %r80; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p122, %r78, %r80; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p123, %r78, %r78; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p124, %r80, %r80; + setp.num.f32 %p125, %r80, %r80; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p126, %p123, %p125; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p127, %p121, %p126; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p128, %p124, %p123; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p129, %p122, %p128; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p130, %r79, %r81; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p131, %p130, %p129; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p132, %p127, %p131; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r82, %r78, %r80, %p132; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r83, %r79, %r81, %p132; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r84, %r82, 2, 31, -1; + shfl.sync.bfly.b32 %r85, %r83, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p133, %r82, %r84; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p134, %r82, %r84; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p135, %r82, %r82; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p136, %r84, %r84; + setp.num.f32 %p137, %r84, %r84; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p138, %p135, %p137; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p139, %p133, %p138; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p140, %p136, %p135; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p141, %p134, %p140; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p142, %r83, %r85; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p143, %p142, %p141; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p144, %p139, %p143; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r86, %r82, %r84, %p144; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r87, %r83, %r85, %p144; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r88, %r86, 1, 31, -1; + shfl.sync.bfly.b32 %r89, %r87, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p145, %r86, %r88; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p146, %r86, %r88; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p147, %r86, %r86; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p148, %r88, %r88; + setp.num.f32 %p149, %r88, %r88; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p150, %p147, %p149; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p151, %p145, %p150; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p152, %p148, %p147; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p153, %p146, %p152; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p154, %r87, %r89; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p155, %p154, %p153; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p156, %p151, %p155; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r54, %r87, %r89, %p156; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.b32 %p53, %r63, 0; + shr.u32 %r90, %r2, 3; + and.b32 %r91, %r90, 60; + mov.b32 %r92, global_smem; + add.s32 %r51, %r92, %r91; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r52, %r86, %r88, %p156; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + // begin inline asm + @%p53 st.shared.b32 [ %r51 + 0 ], %r52; + // end inline asm + add.s32 %r93, %r92, 64; + add.s32 %r53, %r93, %r91; + // begin inline asm + @%p53 st.shared.b32 [ %r53 + 0 ], %r54; + // end inline asm + bar.sync 0; + setp.lt.u32 %p55, %r2, 16; + add.s32 %r56, %r92, %r21; + // begin inline asm + @%p55 ld.shared.b32 %r55, [ %r56 + 0 ]; + // end inline asm + add.s32 %r58, %r93, %r21; + // begin inline asm + @%p55 ld.shared.b32 %r57, [ %r58 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r95, %r55, 8, 31, -1; + shfl.sync.bfly.b32 %r96, %r57, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p157, %r55, %r95; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p158, %r55, %r95; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p159, %r55, %r55; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p160, %r95, %r95; + setp.num.f32 %p161, %r95, %r95; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p162, %p159, %p161; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p163, %p157, %p162; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p164, %p159, %p160; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p165, %p158, %p164; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p166, %r57, %r96; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p167, %p166, %p165; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p168, %p163, %p167; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r97, %r55, %r95, %p168; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r98, %r57, %r96, %p168; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r99, %r97, 4, 31, -1; + shfl.sync.bfly.b32 %r100, %r98, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p169, %r97, %r99; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p170, %r97, %r99; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p171, %r97, %r97; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p172, %r99, %r99; + setp.num.f32 %p173, %r99, %r99; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p174, %p171, %p173; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p175, %p169, %p174; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p176, %p172, %p171; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p177, %p170, %p176; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p178, %r98, %r100; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p179, %p178, %p177; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p180, %p175, %p179; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r101, %r97, %r99, %p180; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r102, %r98, %r100, %p180; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r103, %r101, 2, 31, -1; + shfl.sync.bfly.b32 %r104, %r102, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p181, %r101, %r103; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p182, %r101, %r103; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p183, %r101, %r101; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p184, %r103, %r103; + setp.num.f32 %p185, %r103, %r103; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p186, %p183, %p185; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p187, %p181, %p186; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p188, %p184, %p183; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p189, %p182, %p188; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p190, %r102, %r104; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p191, %p190, %p189; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p192, %p187, %p191; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.f32 %r105, %r101, %r103, %p192; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r106, %r102, %r104, %p192; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + shfl.sync.bfly.b32 %r107, %r105, 1, 31, -1; + shfl.sync.bfly.b32 %r108, %r106, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.gt.f32 %p193, %r105, %r107; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.f32 %p194, %r105, %r107; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p195, %r105, %r105; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.nan.f32 %p196, %r107, %r107; + setp.num.f32 %p197, %r107, %r107; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p198, %p195, %p197; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p199, %p193, %p198; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p200, %p196, %p195; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p201, %p194, %p200; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.lt.s32 %p202, %r106, %r108; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + and.pred %p203, %p202, %p201; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + or.pred %p204, %p199, %p203; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r62, %r106, %r108, %p204; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + setp.eq.b32 %p57, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + selp.b32 %r60, %r105, %r107, %p204; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:43:75 ] + // begin inline asm + @%p57 st.shared.b32 [ %r56 + 0 ], %r60; + // end inline asm + // begin inline asm + @%p57 st.shared.b32 [ %r58 + 0 ], %r62; + // end inline asm + bar.sync 0; + ld.shared.b32 %r109, [global_smem+64]; +$L__tmp6: + .loc 1 45 31 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:45:31 + mad.wide.u32 %rd29, %r1, 8, %rd12; + .loc 1 45 36 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:45:36 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd28, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd28 }, [ %rd29 + 0 ], %rd27; + // end inline asm + .loc 1 47 18 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:47:18 + add.s32 %r110, %r109, 151936; + .loc 1 48 18 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:48:18 + setp.lt.s32 %p205, %r109, 0; + .loc 1 49 32 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:49:32 + selp.b32 %r15, %r110, %r109, %p205; + .loc 1 50 37 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:50:37 + setp.lt.u32 %p206, %r15, 151936; + .loc 1 50 65 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:50:65 + or.pred %p207, %p60, %p206; + @%p207 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 0 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:0 + cvt.u64.u32 %rd7, %r1; + .loc 1 50 65 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:50:65 + bar.sync 0; + .loc 1 51 30 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:51:30 + cvt.s64.s32 %rd36, %r15; + add.s64 %rd32, %rd11, %rd36; + .loc 1 51 37 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:51:37 + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b8 { %rs5 }, [ %rd32 + 0 ], %rd33; + // end inline asm + and.b16 %rs6, %rs5, 255; + setp.eq.b16 %p210, %rs6, 0; + .loc 1 54 20 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:54:20 + selp.b64 %rd34, 0, %rd28, %p210; + .loc 1 55 4 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:55:4 + bar.sync 0; + .loc 1 56 28 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:56:28 + shl.b64 %rd37, %rd7, 3; + add.s64 %rd35, %rd9, %rd37; + .loc 1 56 40 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:56:40 + and.b32 %r111, %r2, 511; + setp.eq.b32 %p211, %r111, 0; + and.pred %p209, %p211, %p2; + // begin inline asm + @%p209 st.global.b64 [ %rd35 + 0 ], { %rd34 }; + // end inline asm + .loc 1 56 4 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:56:4 + ret; +$L__BB0_3: + .loc 1 50 65 // c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py:50:65 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd38, assertFunc_0; + cvta.global.u64 %rd39, %rd38; + st.param.b64 [param3], %rd39; + mov.b64 %rd40, assertFile_0; + cvta.global.u64 %rd41, %rd40; + st.param.b64 [param1], %rd41; + mov.b64 %rd42, assertMessage_0; + cvta.global.u64 %rd43, %rd42; + st.param.b64 [param0], %rd43; + st.param.b64 [param4], 1; + st.param.b32 [param2], 50; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 263 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x100 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 98 +.b8 101 +.b8 117 +.b8 110 +.b8 101 +.b8 116 +.b8 102 +.b8 105 +.b8 111 +.b8 117 +.b8 97 +.b8 51 +.b8 105 +.b8 103 +.b8 100 +.b8 121 +.b8 107 +.b8 112 +.b8 114 +.b8 55 +.b8 112 +.b8 100 +.b8 117 +.b8 111 +.b8 121 +.b8 110 +.b8 114 +.b8 104 +.b8 50 +.b8 52 +.b8 105 +.b8 105 +.b8 107 +.b8 109 +.b8 119 +.b8 118 +.b8 106 +.b8 115 +.b8 55 +.b8 54 +.b8 103 +.b8 112 +.b8 97 +.b8 54 +.b8 111 +.b8 109 +.b8 121 +.b8 120 +.b8 118 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 55 +.b8 98 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x39 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc4:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd9:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 39 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf1:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 43 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..c8017123f85c804ca09e6b52ded15cd5b78c0293 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source @@ -0,0 +1,363 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":18:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc62 = loc(unknown) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc87 = loc("in_out_ptr0"(#loc)) +#loc88 = loc("in_ptr0"(#loc)) +#loc89 = loc("in_ptr1"(#loc)) +#loc90 = loc("in_ptr2"(#loc)) +#loc91 = loc("xnumel"(#loc)) +#loc92 = loc("r0_numel"(#loc)) +#loc129 = loc("a_value"(#loc50)) +#loc130 = loc("a_index"(#loc50)) +#loc131 = loc("b_value"(#loc50)) +#loc132 = loc("b_index"(#loc50)) +#loc145 = loc("x"(#loc70)) +#loc146 = loc("x"(#loc74)) +#loc147 = loc("value"(#loc83)) +#loc148 = loc("index"(#loc83)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 151936 : i32 loc(#loc93) + %xoffset = tt.get_program_id x : i32 loc(#loc94) + %xoffset_1 = arith.constant 1 : i32 loc(#loc95) + %xoffset_2 = arith.constant 1 : i32 loc(#loc95) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc95) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc96) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc97) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc98) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc98) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc99) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc99) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc100) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc101) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc102) + %_tmp2_9 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc102) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc103) + %_tmp2_index_10 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc103) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp2_index_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_26 = %_tmp2_9, %_tmp2_index_27 = %_tmp2_index_10) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc105) + %r0_index_28 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc105) + %r0_mask = arith.constant dense<151936> : tensor<1x2048xi32> loc(#loc106) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x2048xi32> loc(#loc106) + %tmp0 = arith.constant 151936 : i32 loc(#loc107) + %tmp0_30 = arith.constant 151936 : i32 loc(#loc107) + %tmp0_31 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc107) + %tmp0_32 = arith.muli %tmp0_31, %xindex_6 : tensor<1x1xi32> loc(#loc107) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc108) + %tmp0_34 = arith.addi %r0_index_28, %tmp0_33 : tensor<1x2048xi32> loc(#loc108) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc109) + %tmp0_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc110) + %tmp0_38 = arith.andi %r0_mask_29, %tmp0_37 : tensor<1x2048xi1> loc(#loc110) + %tmp0_39 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_40 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc111) + %tmp0_41 = arith.truncf %tmp0_40 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc111) + %tmp0_42 = tt.load %tmp0_36, %tmp0_38, %tmp0_41 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc111) + %tmp0_43 = arith.extf %tmp0_42 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc112) + %12:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_26, %_tmp2_index_27, %tmp0_43, %r0_index_28) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc21) + %_tmp2_44 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc113) + %_tmp2_45 = arith.andi %r0_mask_29, %_tmp2_44 : tensor<1x2048xi1> loc(#loc113) + %_tmp2_46 = arith.select %_tmp2_45, %12#0, %_tmp2_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc114) + %_tmp2_index_47 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc115) + %_tmp2_index_48 = arith.andi %r0_mask_29, %_tmp2_index_47 : tensor<1x2048xi1> loc(#loc115) + %_tmp2_index_49 = arith.select %_tmp2_index_48, %12#1, %_tmp2_index_27 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc116) + scf.yield %_tmp2_46, %_tmp2_index_49 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc26) + } loc(#loc149) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_11#0, %_tmp2_index_11#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc27) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %tmp11 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc118) + %tmp11_12 = tt.addptr %tmp11, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc118) + %tmp11_13 = tt.load %tmp11_12, %xmask_7 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc119) + %tmp3 = arith.constant 151936 : i32 loc(#loc120) + %tmp3_14 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc120) + %tmp4 = arith.addi %tmp2, %tmp3_14 : tensor<1x1xi32> loc(#loc121) + %tmp5 = arith.constant 0 : i32 loc(#loc122) + %tmp5_15 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc122) + %tmp5_16 = arith.cmpi slt, %tmp2, %tmp5_15 : tensor<1x1xi32> loc(#loc122) + %tmp6 = arith.select %tmp5_16, %tmp4, %tmp2 : tensor<1x1xi1>, tensor<1x1xi32> loc(#loc123) + %c0_i32_17 = arith.constant 0 : i32 loc(#loc35) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc35) + %5 = arith.cmpi sle, %cst, %tmp6 : tensor<1x1xi32> loc(#loc35) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc36) + %cst_18 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc36) + %6 = arith.cmpi slt, %tmp6, %cst_18 : tensor<1x1xi32> loc(#loc36) + %7 = arith.andi %5, %6 : tensor<1x1xi1> loc(#loc37) + %true = arith.constant true loc(#loc38) + %cst_19 = arith.constant dense : tensor<1x1xi1> loc(#loc38) + %8 = arith.xori %xmask_7, %cst_19 : tensor<1x1xi1> loc(#loc38) + %9 = arith.ori %7, %8 : tensor<1x1xi1> loc(#loc39) + tt.assert %9, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc40) + %tmp8 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc124) + %tmp8_20 = tt.addptr %tmp8, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc124) + %tmp8_21 = tt.bitcast %tmp8_20 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc125) + %tmp8_22 = tt.load %tmp8_21, %xmask_7 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc125) + %tmp8_23 = arith.constant 0 : i8 loc(#loc125) + %tmp8_24 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc125) + %tmp8_25 = arith.cmpi ne, %tmp8_22, %tmp8_24 : tensor<1x1xi8> loc(#loc125) + %tmp9 = arith.extui %tmp8_25 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc126) + %tmp10 = arith.extsi %tmp9 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc127) + %tmp12 = arith.muli %tmp10, %tmp11_13 : tensor<1x1xi64> loc(#loc128) + gpu.barrier loc(#loc46) + %10 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc47) + %11 = tt.addptr %10, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc47) + tt.store %11, %tmp12, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc50)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc50)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc50)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc50))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc150) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc151) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc53) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc135) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc136) + %mask_3 = arith.constant true loc(#loc137) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc137) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc137) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc138) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc152) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc140) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc153) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc153) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc62) + } loc(#loc54) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc142) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc143) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc144) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc66) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc67) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc68) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc69) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc69) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc69) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc70))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc71) + %true = arith.constant true loc(#loc72) + tt.return %true : i1 loc(#loc72) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc73) + tt.return %1 : i1 loc(#loc73) + } loc(#loc70) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc74))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc75) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc76) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc76) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc76) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc76) + tt.return %4 : tensor<1x2048xf32> loc(#loc77) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc78) + tt.return %5 : tensor<1x2048xf32> loc(#loc78) + } loc(#loc74) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc80) + %cst = arith.constant dense : tensor<1xi1> loc(#loc80) + tt.return %cst : tensor<1xi1> loc(#loc81) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc82) + tt.return %0 : tensor<1xi1> loc(#loc82) + } loc(#loc79) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc83)), %index: tensor<1x2048xi32> loc("index"(#loc83))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc84) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc84) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc84) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc85) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc86) + %2 = ub.poison : tensor<1xi32> loc(#loc86) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc86) + } loc(#loc83) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc50)), %a_index: i32 loc("a_index"(#loc50)), %b_value: f32 loc("b_value"(#loc50)), %b_index: i32 loc("b_index"(#loc50))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc150) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc151) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc53) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc135) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc136) + %mask_3 = arith.constant true loc(#loc137) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc137) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc138) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc152) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc140) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc153) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc153) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc62) + } loc(#loc54) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc142) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc143) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc144) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc66) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc67) + tt.return %2, %3 : f32, i32 loc(#loc68) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc69) + %5 = ub.poison : i32 loc(#loc69) + tt.return %4, %5 : f32, i32 loc(#loc69) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc70))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc71) + %true = arith.constant true loc(#loc72) + tt.return %true : i1 loc(#loc72) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc73) + tt.return %1 : i1 loc(#loc73) + } loc(#loc70) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc74))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc75) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc76) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc76) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc76) + tt.return %3 : tensor<1xf32> loc(#loc77) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc78) + tt.return %4 : tensor<1xf32> loc(#loc78) + } loc(#loc74) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":28:55) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":29:58) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:63) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:53) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:115) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":39:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":41:35) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":41:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:66) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:8) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":43:75) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":44:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":46:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":47:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":48:18) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":49:32) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:44) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:37) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:57) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:55) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:65) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:30) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:37) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":52:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":53:20) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":54:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":55:4) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:28) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc93 = loc("r0_numel"(#loc1)) +#loc94 = loc("xoffset"(#loc2)) +#loc95 = loc("xoffset"(#loc3)) +#loc96 = loc("xindex"(#loc4)) +#loc97 = loc("xindex"(#loc5)) +#loc98 = loc("xindex"(#loc6)) +#loc99 = loc("xmask"(#loc7)) +#loc100 = loc("r0_base"(#loc8)) +#loc101 = loc("r0_base"(#loc9)) +#loc102 = loc("_tmp2"(#loc10)) +#loc103 = loc("_tmp2_index"(#loc11)) +#loc104 = loc("_tmp2"(#loc12)) +#loc105 = loc("r0_index"(#loc13)) +#loc106 = loc("r0_mask"(#loc14)) +#loc107 = loc("tmp0"(#loc15)) +#loc108 = loc("tmp0"(#loc16)) +#loc109 = loc("tmp0"(#loc17)) +#loc110 = loc("tmp0"(#loc18)) +#loc111 = loc("tmp0"(#loc19)) +#loc112 = loc("tmp0"(#loc20)) +#loc113 = loc("_tmp2"(#loc22)) +#loc114 = loc("_tmp2"(#loc23)) +#loc115 = loc("_tmp2_index"(#loc24)) +#loc116 = loc("_tmp2_index"(#loc25)) +#loc117 = loc("tmp2"(#loc28)) +#loc118 = loc("tmp11"(#loc29)) +#loc119 = loc("tmp11"(#loc30)) +#loc120 = loc("tmp3"(#loc31)) +#loc121 = loc("tmp4"(#loc32)) +#loc122 = loc("tmp5"(#loc33)) +#loc123 = loc("tmp6"(#loc34)) +#loc124 = loc("tmp8"(#loc41)) +#loc125 = loc("tmp8"(#loc42)) +#loc126 = loc("tmp9"(#loc43)) +#loc127 = loc("tmp10"(#loc44)) +#loc128 = loc("tmp12"(#loc45)) +#loc133 = loc("mask"(#loc51)) +#loc134 = loc("equal"(#loc52)) +#loc135 = loc("a_isnan"(#loc55)) +#loc136 = loc("b_isnan"(#loc56)) +#loc137 = loc("mask"(#loc57)) +#loc138 = loc("mask"(#loc58)) +#loc139 = loc("mask"(#loc59)) +#loc140 = loc("equal"(#loc60)) +#loc141 = loc("equal"(#loc61)) +#loc142 = loc("mask"(#loc63)) +#loc143 = loc("mask"(#loc64)) +#loc144 = loc("mask"(#loc65)) +#loc149 = loc("_tmp2_index"(#loc104)) +#loc150 = loc("mask"(#loc133)) +#loc151 = loc("equal"(#loc134)) +#loc152 = loc("mask"(#loc139)) +#loc153 = loc("equal"(#loc141)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5b899d548ab48fcad698b4da35c8c5d1bf9a68ef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir @@ -0,0 +1,242 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":43:75) +#loc55 = loc("in_out_ptr0"(#loc)) +#loc56 = loc("in_ptr0"(#loc)) +#loc57 = loc("in_ptr1"(#loc)) +#loc58 = loc("in_ptr2"(#loc)) +#loc59 = loc("xnumel"(#loc)) +#loc60 = loc("r0_numel"(#loc)) +#loc90 = loc(callsite(#loc1 at #loc33)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<151936> : tensor<1x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<151936> : tensor<1x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1xi8, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<151936> : tensor<1x2048xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_7 = arith.constant dense : tensor<1x2048xi1, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<0> : tensor<1x1xi32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc61) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc62) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc63) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc63) + %tmp0 = arith.muli %xoffset, %c151936_i32 : i32 loc(#loc64) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked1> loc(#loc102) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked1> loc(#loc66) + %tmp0_12 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked1> loc(#loc103) + %_tmp2_index:2 = scf.for %_tmp2_index_26 = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_27 = %cst_2) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp2_index_26 : i32 -> tensor<1x2048xi32, #blocked1> loc(#loc69) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32, #blocked1> loc(#loc69) + %r0_mask = arith.cmpi slt, %r0_index_28, %cst_5 : tensor<1x2048xi32, #blocked1> loc(#loc70) + %tmp0_29 = arith.addi %r0_index_28, %tmp0_10 : tensor<1x2048xi32, #blocked1> loc(#loc65) + %tmp0_30 = tt.addptr %tmp0_11, %tmp0_29 : tensor<1x2048x!tt.ptr, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc66) + %tmp0_31 = arith.andi %r0_mask, %tmp0_12 : tensor<1x2048xi1, #blocked1> loc(#loc67) + %tmp0_32 = tt.load %tmp0_30, %tmp0_31, %cst_6 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked1> loc(#loc71) + %tmp0_33 = arith.extf %tmp0_32 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1> loc(#loc72) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_33 : tensor<1x2048xf32, #blocked1> loc(#loc129) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_33 : tensor<1x2048xf32, #blocked1> loc(#loc130) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked1> loc(#loc107) + %b_isnan = arith.cmpf une, %tmp0_33, %tmp0_33 : tensor<1x2048xf32, #blocked1> loc(#loc108) + %mask_34 = arith.xori %b_isnan, %cst_7 : tensor<1x2048xi1, #blocked1> loc(#loc109) + %mask_35 = arith.andi %a_isnan, %mask_34 : tensor<1x2048xi1, #blocked1> loc(#loc110) + %mask_36 = arith.ori %mask, %mask_35 : tensor<1x2048xi1, #blocked1> loc(#loc131) + %equal_37 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked1> loc(#loc112) + %equal_38 = arith.ori %equal, %equal_37 : tensor<1x2048xi1, #blocked1> loc(#loc132) + %mask_39 = arith.cmpi slt, %_tmp2_index_27, %r0_index_28 : tensor<1x2048xi32, #blocked1> loc(#loc114) + %mask_40 = arith.andi %equal_38, %mask_39 : tensor<1x2048xi1, #blocked1> loc(#loc115) + %mask_41 = arith.ori %mask_36, %mask_40 : tensor<1x2048xi1, #blocked1> loc(#loc116) + %8 = arith.select %mask_41, %_tmp2, %tmp0_33 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc85) + %9 = arith.select %mask_41, %_tmp2_index_27, %r0_index_28 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc86) + %_tmp2_42 = arith.select %tmp0_31, %8, %_tmp2 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc87) + %_tmp2_index_43 = arith.select %tmp0_31, %9, %_tmp2_index_27 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc88) + scf.yield %_tmp2_42, %_tmp2_index_43 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc31) + } loc(#loc104) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc33)), %arg7: i32 loc(callsite(#loc1 at #loc33)), %arg8: f32 loc(callsite(#loc1 at #loc33)), %arg9: i32 loc(callsite(#loc1 at #loc33))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc133) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc134) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc117) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc118) + %mask_26 = arith.xori %b_isnan, %true : i1 loc(#loc119) + %mask_27 = arith.andi %a_isnan, %mask_26 : i1 loc(#loc120) + %mask_28 = arith.ori %mask, %mask_27 : i1 loc(#loc135) + %equal_29 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc121) + %equal_30 = arith.ori %equal, %equal_29 : i1 loc(#loc136) + %mask_31 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc122) + %mask_32 = arith.andi %equal_30, %mask_31 : i1 loc(#loc123) + %mask_33 = arith.ori %mask_28, %mask_32 : i1 loc(#loc124) + %8 = arith.select %mask_33, %arg6, %arg8 : f32 loc(#loc125) + %9 = arith.select %mask_33, %arg7, %arg9 : i32 loc(#loc126) + tt.reduce.return %8, %9 : f32, i32 loc(#loc89) + }) : (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>) loc(#loc89) + %tmp8 = ttg.convert_layout %0#1 : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc91) + %tmp2 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc92) + %tmp2_13 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi32, #blocked1> loc(#loc92) + %tmp11 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc93) + %tmp11_14 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc94) + %tmp11_15 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc94) + %tmp11_16 = tt.load %tmp11_14, %tmp11_15 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc94) + %tmp4 = arith.addi %tmp2, %cst_0 : tensor<1x1xi32, #blocked> loc(#loc95) + %tmp4_17 = arith.addi %tmp2_13, %cst_1 : tensor<1x1xi32, #blocked1> loc(#loc95) + %tmp5 = arith.cmpi slt, %tmp2, %cst : tensor<1x1xi32, #blocked> loc(#loc96) + %tmp5_18 = arith.cmpi slt, %tmp2_13, %cst_8 : tensor<1x1xi32, #blocked1> loc(#loc96) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1, #blocked>, tensor<1x1xi32, #blocked> loc(#loc97) + %tmp6_19 = arith.select %tmp5_18, %tmp4_17, %tmp2_13 : tensor<1x1xi1, #blocked1>, tensor<1x1xi32, #blocked1> loc(#loc97) + %1 = arith.cmpi sge, %tmp6_19, %cst_8 : tensor<1x1xi32, #blocked1> loc(#loc41) + %2 = arith.cmpi slt, %tmp6_19, %cst_1 : tensor<1x1xi32, #blocked1> loc(#loc42) + %3 = arith.andi %1, %2 : tensor<1x1xi1, #blocked1> loc(#loc43) + %xmask_20 = arith.cmpi sge, %xoffset, %xnumel : i32 loc(#loc127) + %4 = tt.splat %xmask_20 : i1 -> tensor<1x1xi1, #blocked1> loc(#loc44) + %5 = arith.ori %3, %4 : tensor<1x1xi1, #blocked1> loc(#loc45) + tt.assert %5, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1, #blocked1> loc(#loc46) + %tmp8_21 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %tmp8_22 = tt.addptr %tmp8_21, %tmp6 : tensor<1x1x!tt.ptr, #blocked>, tensor<1x1xi32, #blocked> loc(#loc98) + %tmp8_23 = tt.bitcast %tmp8_22 : tensor<1x1x!tt.ptr, #blocked> -> tensor<1x1x!tt.ptr, #blocked> loc(#loc91) + %tmp8_24 = tt.load %tmp8_23, %tmp11_15 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc91) + %tmp8_25 = arith.cmpi ne, %tmp8_24, %cst_4 : tensor<1x1xi8, #blocked> loc(#loc91) + %tmp10 = arith.extui %tmp8_25 : tensor<1x1xi1, #blocked> to tensor<1x1xi64, #blocked> loc(#loc128) + %tmp12 = arith.muli %tmp10, %tmp11_16 : tensor<1x1xi64, #blocked> loc(#loc101) + gpu.barrier loc(#loc51) + %6 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc52) + %7 = tt.splat %6 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc53) + tt.store %7, %tmp12, %tmp11_15 : tensor<1x1x!tt.ptr, #blocked> loc(#loc53) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:48) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:63) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":30:40) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:53) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:115) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":39:38) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":41:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:66) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:37) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":44:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:31) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":47:18) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":48:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":49:32) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:28) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:44) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:37) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:57) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:55) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:65) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:30) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":53:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":52:19) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":54:20) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":55:4) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:28) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:40) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:4) +#loc61 = loc("xoffset"(#loc2)) +#loc62 = loc("xmask"(#loc3)) +#loc63 = loc("r0_base"(#loc4)) +#loc64 = loc("tmp0"(#loc5)) +#loc65 = loc("tmp0"(#loc6)) +#loc66 = loc("tmp0"(#loc7)) +#loc67 = loc("tmp0"(#loc8)) +#loc68 = loc("_tmp2"(#loc9)) +#loc69 = loc("r0_index"(#loc10)) +#loc70 = loc("r0_mask"(#loc11)) +#loc71 = loc("tmp0"(#loc12)) +#loc72 = loc("tmp0"(#loc13)) +#loc73 = loc("mask"(#loc14)) +#loc74 = loc("equal"(#loc16)) +#loc75 = loc("a_isnan"(#loc17)) +#loc76 = loc("b_isnan"(#loc18)) +#loc77 = loc("mask"(#loc19)) +#loc78 = loc("mask"(#loc20)) +#loc79 = loc("mask"(#loc21)) +#loc80 = loc("equal"(#loc22)) +#loc81 = loc("equal"(#loc23)) +#loc82 = loc("mask"(#loc24)) +#loc83 = loc("mask"(#loc25)) +#loc84 = loc("mask"(#loc26)) +#loc85 = loc(callsite(#loc27 at #loc15)) +#loc86 = loc(callsite(#loc28 at #loc15)) +#loc87 = loc("_tmp2"(#loc29)) +#loc88 = loc("_tmp2_index"(#loc30)) +#loc89 = loc(callsite(#loc32 at #loc33)) +#loc91 = loc("tmp8"(#loc34)) +#loc92 = loc("tmp2"(#loc35)) +#loc93 = loc("tmp11"(#loc36)) +#loc94 = loc("tmp11"(#loc37)) +#loc95 = loc("tmp4"(#loc38)) +#loc96 = loc("tmp5"(#loc39)) +#loc97 = loc("tmp6"(#loc40)) +#loc98 = loc("tmp8"(#loc47)) +#loc99 = loc("tmp10"(#loc48)) +#loc100 = loc("tmp9"(#loc49)) +#loc101 = loc("tmp12"(#loc50)) +#loc102 = loc(fused[#loc65, #loc64]) +#loc103 = loc(fused[#loc67, #loc62]) +#loc104 = loc("_tmp2_index"(#loc68)) +#loc105 = loc("mask"(#loc73)) +#loc106 = loc("equal"(#loc74)) +#loc107 = loc(callsite(#loc75 at #loc15)) +#loc108 = loc(callsite(#loc76 at #loc15)) +#loc109 = loc(callsite(#loc77 at #loc15)) +#loc110 = loc(callsite(#loc78 at #loc15)) +#loc111 = loc("mask"(#loc79)) +#loc112 = loc(callsite(#loc80 at #loc15)) +#loc113 = loc("equal"(#loc81)) +#loc114 = loc(callsite(#loc82 at #loc15)) +#loc115 = loc(callsite(#loc83 at #loc15)) +#loc116 = loc(callsite(#loc84 at #loc15)) +#loc117 = loc(callsite(#loc75 at #loc89)) +#loc118 = loc(callsite(#loc76 at #loc89)) +#loc119 = loc(callsite(#loc77 at #loc89)) +#loc120 = loc(callsite(#loc78 at #loc89)) +#loc121 = loc(callsite(#loc80 at #loc89)) +#loc122 = loc(callsite(#loc82 at #loc89)) +#loc123 = loc(callsite(#loc83 at #loc89)) +#loc124 = loc(callsite(#loc84 at #loc89)) +#loc125 = loc(callsite(#loc27 at #loc89)) +#loc126 = loc(callsite(#loc28 at #loc89)) +#loc127 = loc(fused[#loc44, #loc62]) +#loc128 = loc(fused[#loc99, #loc100]) +#loc129 = loc(callsite(#loc105 at #loc15)) +#loc130 = loc(callsite(#loc106 at #loc15)) +#loc131 = loc(callsite(#loc111 at #loc15)) +#loc132 = loc(callsite(#loc113 at #loc15)) +#loc133 = loc(callsite(#loc105 at #loc89)) +#loc134 = loc(callsite(#loc106 at #loc89)) +#loc135 = loc(callsite(#loc111 at #loc89)) +#loc136 = loc(callsite(#loc113 at #loc89)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2ccdbf2b8264d489df2f3922d9a2e8455e2cfd76 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/6PCK33MB3P7SFYT3SK3GDV2ZPREQQLJQEEDQUDLRK22P7TDSCOSQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir @@ -0,0 +1,238 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":43:75) +#loc58 = loc("in_out_ptr0"(#loc)) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("xnumel"(#loc)) +#loc63 = loc("r0_numel"(#loc)) +#loc97 = loc(callsite(#loc1 at #loc37)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc1) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %tmp8 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc64) + %cst_1 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<151936> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc65) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc66) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc68) + %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc68) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc69) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc70) + %_tmp2_index_6:2 = scf.for %r0_offset = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2_14 = %_tmp2, %_tmp2_index_15 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc72) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32> loc(#loc72) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_3 : tensor<1x2048xi32> loc(#loc73) + %tmp0 = arith.muli %xoffset, %c151936_i32 : i32 loc(#loc74) + %tmp0_17 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc109) + %tmp0_18 = arith.addi %r0_index_16, %tmp0_17 : tensor<1x2048xi32> loc(#loc75) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc76) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc76) + %tmp0_21 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc110) + %tmp0_22 = arith.andi %r0_mask, %tmp0_21 : tensor<1x2048xi1> loc(#loc77) + %tmp0_23 = tt.load %tmp0_20, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc78) + %tmp0_24 = arith.extf %tmp0_23 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc79) + %mask = arith.cmpf ogt, %_tmp2_14, %tmp0_24 : tensor<1x2048xf32> loc(#loc134) + %equal = arith.cmpf oeq, %_tmp2_14, %tmp0_24 : tensor<1x2048xf32> loc(#loc135) + %a_isnan = arith.cmpf une, %_tmp2_14, %_tmp2_14 : tensor<1x2048xf32> loc(#loc113) + %b_isnan = arith.cmpf une, %tmp0_24, %tmp0_24 : tensor<1x2048xf32> loc(#loc114) + %mask_25 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc115) + %mask_26 = arith.andi %a_isnan, %mask_25 : tensor<1x2048xi1> loc(#loc116) + %mask_27 = arith.ori %mask, %mask_26 : tensor<1x2048xi1> loc(#loc136) + %equal_28 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc118) + %equal_29 = arith.ori %equal, %equal_28 : tensor<1x2048xi1> loc(#loc137) + %mask_30 = arith.cmpi slt, %_tmp2_index_15, %r0_index_16 : tensor<1x2048xi32> loc(#loc120) + %mask_31 = arith.andi %equal_29, %mask_30 : tensor<1x2048xi1> loc(#loc121) + %mask_32 = arith.ori %mask_27, %mask_31 : tensor<1x2048xi1> loc(#loc122) + %9 = arith.select %mask_32, %_tmp2_14, %tmp0_24 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc92) + %10 = arith.select %mask_32, %_tmp2_index_15, %r0_index_16 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc93) + %_tmp2_33 = arith.select %tmp0_22, %9, %_tmp2_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc94) + %_tmp2_index_34 = arith.select %tmp0_22, %10, %_tmp2_index_15 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc95) + scf.yield %_tmp2_33, %_tmp2_index_34 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc35) + } loc(#loc108) + %0:2 = "tt.reduce"(%_tmp2_index_6#0, %_tmp2_index_6#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37)), %arg8: f32 loc(callsite(#loc1 at #loc37)), %arg9: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc138) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc139) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc123) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc124) + %mask_14 = arith.xori %b_isnan, %true : i1 loc(#loc125) + %mask_15 = arith.andi %a_isnan, %mask_14 : i1 loc(#loc126) + %mask_16 = arith.ori %mask, %mask_15 : i1 loc(#loc140) + %equal_17 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc127) + %equal_18 = arith.ori %equal, %equal_17 : i1 loc(#loc141) + %mask_19 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc128) + %mask_20 = arith.andi %equal_18, %mask_19 : i1 loc(#loc129) + %mask_21 = arith.ori %mask_16, %mask_20 : i1 loc(#loc130) + %9 = arith.select %mask_21, %arg6, %arg8 : f32 loc(#loc131) + %10 = arith.select %mask_21, %arg7, %arg9 : i32 loc(#loc132) + tt.reduce.return %9, %10 : f32, i32 loc(#loc96) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc96) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc98) + %tmp11 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc99) + %tmp11_7 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc99) + %tmp11_8 = tt.load %tmp11_7, %xmask_4 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc100) + %tmp4 = arith.addi %tmp2, %cst_2 : tensor<1x1xi32> loc(#loc101) + %tmp5 = arith.cmpi slt, %tmp2, %cst_1 : tensor<1x1xi32> loc(#loc102) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1>, tensor<1x1xi32> loc(#loc103) + %1 = arith.cmpi sge, %tmp6, %cst_1 : tensor<1x1xi32> loc(#loc44) + %2 = arith.cmpi slt, %tmp6, %cst_2 : tensor<1x1xi32> loc(#loc45) + %3 = arith.andi %1, %2 : tensor<1x1xi1> loc(#loc46) + %4 = arith.xori %xmask, %true : i1 loc(#loc47) + %5 = tt.splat %4 : i1 -> tensor<1x1xi1> loc(#loc47) + %6 = arith.ori %3, %5 : tensor<1x1xi1> loc(#loc48) + tt.assert %6, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc49) + %tmp8_9 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc104) + %tmp8_10 = tt.addptr %tmp8_9, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc104) + %tmp8_11 = tt.bitcast %tmp8_10 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc64) + %tmp8_12 = tt.load %tmp8_11, %xmask_4 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc64) + %tmp8_13 = arith.cmpi ne, %tmp8_12, %tmp8 : tensor<1x1xi8> loc(#loc64) + %tmp10 = arith.extui %tmp8_13 : tensor<1x1xi1> to tensor<1x1xi64> loc(#loc133) + %tmp12 = arith.muli %tmp10, %tmp11_8 : tensor<1x1xi64> loc(#loc107) + gpu.barrier loc(#loc54) + %7 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc55) + %8 = tt.splat %7 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc55) + tt.store %8, %tmp12, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":29:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":28:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:63) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:53) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":36:115) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":39:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":41:54) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":42:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":44:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:31) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":45:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":47:18) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":48:18) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":49:32) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:28) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:44) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:37) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:57) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":50:65) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":51:30) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":53:20) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":52:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":54:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":55:4) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:40) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7b/c7beunetfioua3igdykpr7pduoynrh24iikmwvjs76gpa6omyxvs.py":56:4) +#loc64 = loc("tmp8"(#loc3)) +#loc65 = loc("_tmp2_index"(#loc4)) +#loc66 = loc("_tmp2"(#loc5)) +#loc67 = loc("xoffset"(#loc6)) +#loc68 = loc("xmask"(#loc7)) +#loc69 = loc("r0_base"(#loc8)) +#loc70 = loc("r0_base"(#loc9)) +#loc71 = loc("_tmp2"(#loc2)) +#loc72 = loc("r0_index"(#loc10)) +#loc73 = loc("r0_mask"(#loc11)) +#loc74 = loc("tmp0"(#loc12)) +#loc75 = loc("tmp0"(#loc13)) +#loc76 = loc("tmp0"(#loc14)) +#loc77 = loc("tmp0"(#loc15)) +#loc78 = loc("tmp0"(#loc16)) +#loc79 = loc("tmp0"(#loc17)) +#loc80 = loc("mask"(#loc18)) +#loc81 = loc("equal"(#loc20)) +#loc82 = loc("a_isnan"(#loc21)) +#loc83 = loc("b_isnan"(#loc22)) +#loc84 = loc("mask"(#loc23)) +#loc85 = loc("mask"(#loc24)) +#loc86 = loc("mask"(#loc25)) +#loc87 = loc("equal"(#loc26)) +#loc88 = loc("equal"(#loc27)) +#loc89 = loc("mask"(#loc28)) +#loc90 = loc("mask"(#loc29)) +#loc91 = loc("mask"(#loc30)) +#loc92 = loc(callsite(#loc31 at #loc19)) +#loc93 = loc(callsite(#loc32 at #loc19)) +#loc94 = loc("_tmp2"(#loc33)) +#loc95 = loc("_tmp2_index"(#loc34)) +#loc96 = loc(callsite(#loc36 at #loc37)) +#loc98 = loc("tmp2"(#loc38)) +#loc99 = loc("tmp11"(#loc39)) +#loc100 = loc("tmp11"(#loc40)) +#loc101 = loc("tmp4"(#loc41)) +#loc102 = loc("tmp5"(#loc42)) +#loc103 = loc("tmp6"(#loc43)) +#loc104 = loc("tmp8"(#loc50)) +#loc105 = loc("tmp10"(#loc51)) +#loc106 = loc("tmp9"(#loc52)) +#loc107 = loc("tmp12"(#loc53)) +#loc108 = loc("_tmp2_index"(#loc71)) +#loc109 = loc(fused[#loc75, #loc74]) +#loc110 = loc(fused[#loc77, #loc68]) +#loc111 = loc("mask"(#loc80)) +#loc112 = loc("equal"(#loc81)) +#loc113 = loc(callsite(#loc82 at #loc19)) +#loc114 = loc(callsite(#loc83 at #loc19)) +#loc115 = loc(callsite(#loc84 at #loc19)) +#loc116 = loc(callsite(#loc85 at #loc19)) +#loc117 = loc("mask"(#loc86)) +#loc118 = loc(callsite(#loc87 at #loc19)) +#loc119 = loc("equal"(#loc88)) +#loc120 = loc(callsite(#loc89 at #loc19)) +#loc121 = loc(callsite(#loc90 at #loc19)) +#loc122 = loc(callsite(#loc91 at #loc19)) +#loc123 = loc(callsite(#loc82 at #loc96)) +#loc124 = loc(callsite(#loc83 at #loc96)) +#loc125 = loc(callsite(#loc84 at #loc96)) +#loc126 = loc(callsite(#loc85 at #loc96)) +#loc127 = loc(callsite(#loc87 at #loc96)) +#loc128 = loc(callsite(#loc89 at #loc96)) +#loc129 = loc(callsite(#loc90 at #loc96)) +#loc130 = loc(callsite(#loc91 at #loc96)) +#loc131 = loc(callsite(#loc31 at #loc96)) +#loc132 = loc(callsite(#loc32 at #loc96)) +#loc133 = loc(fused[#loc105, #loc106]) +#loc134 = loc(callsite(#loc111 at #loc19)) +#loc135 = loc(callsite(#loc112 at #loc19)) +#loc136 = loc(callsite(#loc117 at #loc19)) +#loc137 = loc(callsite(#loc119 at #loc19)) +#loc138 = loc(callsite(#loc111 at #loc96)) +#loc139 = loc(callsite(#loc112 at #loc96)) +#loc140 = loc(callsite(#loc117 at #loc96)) +#loc141 = loc(callsite(#loc119 at #loc96)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/__grp__triton_per_fused__to_copy_mul_sum_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/__grp__triton_per_fused__to_copy_mul_sum_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ea23766f5f0f3b4e88d35c675a7ba25aadeccc4b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/__grp__triton_per_fused__to_copy_mul_sum_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_mul_sum_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.source", "triton_per_fused__to_copy_mul_sum_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttir", "triton_per_fused__to_copy_mul_sum_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ttgir", "triton_per_fused__to_copy_mul_sum_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.llir", "triton_per_fused__to_copy_mul_sum_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.ptx", "triton_per_fused__to_copy_mul_sum_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.cubin", "triton_per_fused__to_copy_mul_sum_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bbf5dd545b3626ea2b80ce80d93cad8ff12ac4f7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7AX37JF3GC3476WMWTYVB3NGNRIS5J7SJEWETBMPGJB4H67EAVCA/triton_per_fused__to_copy_mul_sum_1.json @@ -0,0 +1 @@ +{"hash": "f82fbfa4bb30b7cffaccb4f150eda66c512ea7f2492c49858f3243c3fbe40544", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_mul_sum_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64ba6ec6790050fdeaab8e53e4e751ccbd0d42e8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..adec210a443efc8f89befcd57022a0ab0ec350db Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ff50e3644e06f4c83eb7ede9d5ea411ec4f5eda4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "fad905126101239177747a38cb46af067cac0e1487e40b93854ea9f035362ddc", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f53bd571815da524ef82865ca89f8c6c1d2ed489 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.llir @@ -0,0 +1,135 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 7, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 127, !dbg !9 + %12 = or disjoint i32 %9, %11, !dbg !10 + %13 = sdiv i32 %12, 2048, !dbg !11 + %14 = mul i32 %13, 2048, !dbg !12 + %.decomposed = sub i32 %12, %14, !dbg !12 + %15 = srem i32 %13, 32, !dbg !13 + %16 = sdiv i32 %12, 65536, !dbg !14 + %17 = shl nsw i32 %15, 7, !dbg !15 + %18 = shl nsw i32 %.decomposed, 12, !dbg !16 + %19 = shl i32 %16, 23, !dbg !17 + %20 = shl i32 %12, 7, !dbg !18 + %21 = add i32 %19, %18 + %22 = add i32 %21, %17 + %23 = sext i32 %20 to i64, !dbg !19 + %24 = sext i32 %22 to i64, !dbg !19 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !19 + %invariant.gep7 = getelementptr bfloat, ptr addrspace(1) %1, i64 %23, !dbg !19 + br label %25, !dbg !19 + +25: ; preds = %7, %25 + %indvars.iv = phi i64 [ 0, %7 ], [ %indvars.iv.next, %25 ] + %26 = phi float [ 0.000000e+00, %7 ], [ %62, %25 ] + %27 = phi float [ 0.000000e+00, %7 ], [ %63, %25 ] + %28 = phi float [ 0.000000e+00, %7 ], [ %64, %25 ] + %29 = phi float [ 0.000000e+00, %7 ], [ %65, %25 ] + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %indvars.iv, !dbg !20 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #2, !dbg !21 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep, i64 %30, i1 true) #2, !dbg !21 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !21 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !21 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !21 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !21 + %36 = extractelement <2 x bfloat> %33, i64 0, !dbg !21 + %37 = extractelement <2 x bfloat> %33, i64 1, !dbg !21 + %38 = extractelement <2 x bfloat> %35, i64 0, !dbg !21 + %39 = extractelement <2 x bfloat> %35, i64 1, !dbg !21 + %40 = fpext bfloat %36 to float, !dbg !22 + %41 = fpext bfloat %37 to float, !dbg !22 + %42 = fpext bfloat %38 to float, !dbg !22 + %43 = fpext bfloat %39 to float, !dbg !22 + %gep8 = getelementptr bfloat, ptr addrspace(1) %invariant.gep7, i64 %indvars.iv, !dbg !23 + %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #2, !dbg !24 + %45 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep8, i64 %44, i1 true) #2, !dbg !24 + %46 = extractvalue { i32, i32 } %45, 0, !dbg !24 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !24 + %48 = extractvalue { i32, i32 } %45, 1, !dbg !24 + %49 = bitcast i32 %48 to <2 x bfloat>, !dbg !24 + %50 = extractelement <2 x bfloat> %47, i64 0, !dbg !24 + %51 = extractelement <2 x bfloat> %47, i64 1, !dbg !24 + %52 = extractelement <2 x bfloat> %49, i64 0, !dbg !24 + %53 = extractelement <2 x bfloat> %49, i64 1, !dbg !24 + %54 = fpext bfloat %50 to float, !dbg !25 + %55 = fpext bfloat %51 to float, !dbg !25 + %56 = fpext bfloat %52 to float, !dbg !25 + %57 = fpext bfloat %53 to float, !dbg !25 + %58 = fmul float %40, %54, !dbg !26 + %59 = fmul float %41, %55, !dbg !26 + %60 = fmul float %42, %56, !dbg !26 + %61 = fmul float %43, %57, !dbg !26 + %62 = fadd float %26, %58, !dbg !27 + %63 = fadd float %27, %59, !dbg !27 + %64 = fadd float %28, %60, !dbg !27 + %65 = fadd float %29, %61, !dbg !27 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !19 + %66 = icmp samesign ult i64 %indvars.iv, 124, !dbg !19 + br i1 %66, label %25, label %67, !dbg !19 + +67: ; preds = %25 + %68 = fadd float %62, %63, !dbg !28 + %69 = fadd float %64, %68, !dbg !28 + %70 = fadd float %65, %69, !dbg !28 + %71 = sext i32 %12 to i64, !dbg !32 + %72 = getelementptr float, ptr addrspace(1) %2, i64 %71, !dbg !32 + %73 = bitcast float %70 to i32, !dbg !33 + tail call void asm sideeffect "st.global.b32 [ $1 + 0 ], { $0 };", "r,l"(i32 %73, ptr addrspace(1) %72) #2, !dbg !33 + ret void, !dbg !34 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 29, column: 21, scope: !4) +!12 = !DILocation(line: 28, column: 19, scope: !4) +!13 = !DILocation(line: 29, column: 29, scope: !4) +!14 = !DILocation(line: 30, column: 19, scope: !4) +!15 = !DILocation(line: 39, column: 45, scope: !4) +!16 = !DILocation(line: 39, column: 55, scope: !4) +!17 = !DILocation(line: 39, column: 68, scope: !4) +!18 = !DILocation(line: 40, column: 45, scope: !4) +!19 = !DILocation(line: 33, column: 40, scope: !4) +!20 = !DILocation(line: 39, column: 34, scope: !4) +!21 = !DILocation(line: 39, column: 73, scope: !4) +!22 = !DILocation(line: 39, column: 127, scope: !4) +!23 = !DILocation(line: 40, column: 34, scope: !4) +!24 = !DILocation(line: 40, column: 50, scope: !4) +!25 = !DILocation(line: 40, column: 104, scope: !4) +!26 = !DILocation(line: 41, column: 22, scope: !4) +!27 = !DILocation(line: 43, column: 23, scope: !4) +!28 = !DILocation(line: 261, column: 15, scope: !29, inlinedAt: !31) +!29 = distinct !DILexicalBlockFile(scope: !4, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!31 = !DILocation(line: 45, column: 25, scope: !4) +!32 = !DILocation(line: 49, column: 25, scope: !4) +!33 = !DILocation(line: 49, column: 36, scope: !4) +!34 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0a1bb4a2866f5c5e5a8730b6c68d3594e7c95397 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,377 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<59>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd11, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd12, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r11, %ctaid.x; + .loc 1 23 33 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:33 + shl.b32 %r12, %r11, 7; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 127; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r1, %r12, %r14; + .loc 1 29 21 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:21 + bfe.s32 %r15, %r11, 24, 1; + shr.u32 %r16, %r15, 21; + add.s32 %r17, %r1, %r16; + shr.s32 %r18, %r17, 11; + .loc 1 28 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:28:19 + and.b32 %r19, %r17, 1046528; + .loc 1 29 29 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:29 + shr.u32 %r20, %r18, 27; + add.s32 %r21, %r18, %r20; + and.b32 %r22, %r21, 33554400; + sub.s32 %r23, %r18, %r22; + .loc 1 30 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:30:19 + shr.u32 %r24, %r15, 16; + add.s32 %r25, %r1, %r24; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shl.b32 %r26, %r23, 7; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r27, %r25, 7; + and.b32 %r28, %r27, -8388608; + .loc 1 33 40 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:33:40 + shl.b32 %r29, %r11, 14; + shl.b32 %r30, %r14, 7; + or.b32 %r31, %r29, %r30; + mad.wide.s32 %rd21, %r31, 2, %rd12; + sub.s32 %r32, %r1, %r19; + shl.b32 %r33, %r32, 12; + add.s32 %r34, %r28, %r33; + add.s32 %r35, %r34, %r26; + mad.wide.s32 %rd20, %r35, 2, %rd11; + mov.b32 %r55, 0f00000000; + mov.b64 %rd22, -4; + mov.b32 %r56, %r55; + mov.b32 %r57, %r55; + mov.b32 %r58, %r55; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd13, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd13, 1.0; + // end inline asm + mov.b32 %r38, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r36, %r38; + mov.u32 %r37, %r38; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r36, %r37 }, [ %rd20 + 0 ], %rd13; + // end inline asm + mov.b32 {%rs1, %rs2}, %r36; + mov.b32 {%rs3, %rs4}, %r37; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + cvt.f32.bf16 %r44, %rs1; + cvt.f32.bf16 %r45, %rs2; + cvt.f32.bf16 %r46, %rs3; + cvt.f32.bf16 %r47, %rs4; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd16, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r40, %r38; + mov.u32 %r41, %r38; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r40, %r41 }, [ %rd21 + 0 ], %rd16; + // end inline asm + mov.b32 {%rs5, %rs6}, %r40; + mov.b32 {%rs7, %rs8}, %r41; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + cvt.f32.bf16 %r48, %rs5; + cvt.f32.bf16 %r49, %rs6; + cvt.f32.bf16 %r50, %rs7; + cvt.f32.bf16 %r51, %rs8; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r55, %r44, %r48, %r55; + fma.rn.f32 %r56, %r45, %r49, %r56; + fma.rn.f32 %r57, %r46, %r50, %r57; + fma.rn.f32 %r58, %r47, %r51, %r58; + .loc 1 33 40 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:33:40 + add.s64 %rd22, %rd22, 4; + add.s64 %rd21, %rd21, 8; + add.s64 %rd20, %rd20, 8; + setp.lt.u64 %p3, %rd22, 124; + @%p3 bra $L__BB0_1; +// %bb.2: +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r53, %r55, %r56; + add.f32 %r54, %r57, %r53; + add.f32 %r52, %r58, %r54; +$L__tmp2: + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.s32 %rd19, %r1, 4, %rd9; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + // begin inline asm + st.global.b32 [ %rd19 + 0 ], { %r52 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..974e62af88c681419a34a81889e2c8f493fe3fda --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 128 : i32 loc(#loc61) + %xoffset_3 = arith.constant 128 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<128x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<128x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<128x4xi1> loc(#loc65) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<128x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<128x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<128x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<128x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<128x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<128x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<128x4xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c4_i32 = arith.constant 4 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<128x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x4xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<128x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<128x4xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<128x4xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<128x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<128x4xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<128x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<128x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<128x4xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x4x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<128x4x!tt.ptr>, tensor<128x4xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<128x4xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<128x4xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<128x4xf32> to tensor<128x4xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<128x4x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<128x4xbf16> to tensor<128x4xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<128x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<128x4xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<128x4xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x4x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<128x4x!tt.ptr>, tensor<128x4xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<128x4xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<128x4xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<128x4xf32> to tensor<128x4xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<128x4x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<128x4xbf16> to tensor<128x4xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<128x4xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<128x4xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<128x4xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<128x4xi1>, tensor<128x4xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<128x4xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S128_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<128x4xf32>) -> tensor<128xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<128x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<128x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<128x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S128_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x4xf32> loc("input"(#loc44))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<128x4xf32>) -> tensor<128xf32> loc(#loc45) + tt.return %0 : tensor<128xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc48) + tt.return %1 : tensor<128xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..38484b73543295b76e79501a6b7e32828d60382f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,154 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x4xbf16, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<128x4xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<128x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<128x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<128x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<128x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<128x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<128x1xi32, #blocked> -> tensor<128x4xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<128x1xi32, #blocked> -> tensor<128x4xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_2 : tensor<128x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<128x1xi32, #blocked> -> tensor<128x4xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x4x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<128x1xi32, #blocked> -> tensor<128x4xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x4x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%arg6 = %cst_7) -> (tensor<128x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x4xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x4xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x4xi32, #blocked> -> tensor<128x4xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<128x4xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<128x4xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<128x4xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<128x4x!tt.ptr, #blocked>, tensor<128x4xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<128x4xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<128x4x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<128x4xbf16, #blocked> to tensor<128x4xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<128x4xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<128x4x!tt.ptr, #blocked>, tensor<128x4xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<128x4x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<128x4xbf16, #blocked> to tensor<128x4xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<128x4xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %arg6, %tmp2 : tensor<128x4xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<128x4xi1, #blocked>, tensor<128x4xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<128x4xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<128x4xf32, #blocked>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<128x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a358eb653a187e115fcf2cc752b766b1205fbd75 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7LMQKETBAERZC53UPI4MWRVPAZ6KYDQUQ7SAXE4FJ2U7ANJWFXOA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,151 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("in_ptr1"(#loc)) +#loc43 = loc("out_ptr1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc77 = loc("tmp4"(#loc35)) +#loc80 = loc(callsite(#loc1 at #loc77)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<128x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<128x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x4xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<128x1xi32> loc(#loc46) + %x1 = arith.constant dense<32> : tensor<128x1xi32> loc(#loc47) + %cst_5 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_6 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc49) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc51) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<128x1xi32> loc(#loc52) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<128x1xi32> loc(#loc52) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc53) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc54) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<128x1xi32> loc(#loc55) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<128x1xi32> loc(#loc56) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<128x1xi32> loc(#loc47) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<128x1xi32> loc(#loc46) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_15 = %cst_4) -> (tensor<128x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc58) + %r0_index_16 = arith.addi %r0_index, %r0_base_10 : tensor<1x4xi32> loc(#loc58) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_3 : tensor<1x4xi32> loc(#loc59) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<128x1xi32> loc(#loc60) + %tmp0_17 = tt.broadcast %r0_index_16 : tensor<1x4xi32> -> tensor<128x4xi32> loc(#loc61) + %tmp0_18 = tt.broadcast %tmp0 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc61) + %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<128x4xi32> loc(#loc61) + %tmp0_20 = arith.muli %x0, %cst_1 : tensor<128x1xi32> loc(#loc62) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc63) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<128x4xi32> loc(#loc63) + %tmp0_23 = arith.muli %x2_13, %cst_0 : tensor<128x1xi32> loc(#loc64) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc65) + %tmp0_25 = arith.addi %tmp0_22, %tmp0_24 : tensor<128x4xi32> loc(#loc65) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x4x!tt.ptr> loc(#loc66) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<128x4x!tt.ptr>, tensor<128x4xi32> loc(#loc66) + %tmp0_28 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<128x4xi1> loc(#loc67) + %tmp0_29 = tt.load %tmp0_27, %tmp0_28, %cst evictionPolicy = evict_first : tensor<128x4x!tt.ptr> loc(#loc67) + %tmp0_30 = arith.extf %tmp0_29 : tensor<128x4xbf16> to tensor<128x4xf32> loc(#loc68) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<128x1xi32> loc(#loc69) + %tmp1_31 = tt.broadcast %tmp1 : tensor<128x1xi32> -> tensor<128x4xi32> loc(#loc70) + %tmp1_32 = arith.addi %tmp0_17, %tmp1_31 : tensor<128x4xi32> loc(#loc70) + %tmp1_33 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x4x!tt.ptr> loc(#loc71) + %tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<128x4x!tt.ptr>, tensor<128x4xi32> loc(#loc71) + %tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<128x4x!tt.ptr> loc(#loc72) + %tmp1_36 = arith.extf %tmp1_35 : tensor<128x4xbf16> to tensor<128x4xf32> loc(#loc73) + %tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<128x4xf32> loc(#loc74) + %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<128x4xf32> loc(#loc75) + %_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_15 : tensor<128x4xi1>, tensor<128x4xf32> loc(#loc76) + scf.yield %_tmp4_37 : tensor<128x4xf32> loc(#loc33) + } loc(#loc57) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_15: f32 loc(callsite(#loc1 at #loc77)), %tmp4_16: f32 loc(callsite(#loc1 at #loc77))): + %tmp4_17 = arith.addf %tmp4_15, %tmp4_16 : f32 loc(#loc81) + tt.reduce.return %tmp4_17 : f32 loc(#loc79) + }) : (tensor<128x4xf32>) -> tensor<128xf32> loc(#loc79) + %tmp4_14 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc78) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc38) + %1 = tt.addptr %0, %xindex_9 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc38) + tt.store %1, %tmp4_14 : tensor<128x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc46 = loc("x2"(#loc3)) +#loc47 = loc("x1"(#loc4)) +#loc48 = loc("xoffset"(#loc5)) +#loc49 = loc("xoffset"(#loc6)) +#loc50 = loc("xindex"(#loc7)) +#loc51 = loc("xindex"(#loc8)) +#loc52 = loc("xindex"(#loc9)) +#loc53 = loc("r0_base"(#loc10)) +#loc54 = loc("r0_base"(#loc11)) +#loc55 = loc("x0"(#loc12)) +#loc56 = loc("x1"(#loc13)) +#loc57 = loc("_tmp4"(#loc2)) +#loc58 = loc("r0_index"(#loc14)) +#loc59 = loc("r0_mask"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("tmp0"(#loc18)) +#loc63 = loc("tmp0"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp0"(#loc21)) +#loc66 = loc("tmp0"(#loc22)) +#loc67 = loc("tmp0"(#loc23)) +#loc68 = loc("tmp0"(#loc24)) +#loc69 = loc("tmp1"(#loc25)) +#loc70 = loc("tmp1"(#loc26)) +#loc71 = loc("tmp1"(#loc27)) +#loc72 = loc("tmp1"(#loc28)) +#loc73 = loc("tmp1"(#loc29)) +#loc74 = loc("tmp2"(#loc30)) +#loc75 = loc("tmp5"(#loc31)) +#loc76 = loc("_tmp4"(#loc32)) +#loc78 = loc("tmp4"(#loc37)) +#loc79 = loc(callsite(#loc34 at #loc77)) +#loc81 = loc(callsite(#loc36 at #loc79)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..74a72d1caea8aa8de813029c5811975c081aafee --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c82740cd609a14f7d1497cf00f1b6dae89edbd1a Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4787571dad5bed9532fbcbab0e4c45a9b958637 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "fe376ba41d2f05ff2bb7c13c37b09b85f38cb341d26f28c38925cd0f032bd098", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..5be28d74f3c9a5daee49d2b80e87047ce6c6cdb2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 32, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not = icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg !55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + +119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = !DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) +!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cb92bc9379d43e84ff0a25560e5a0612bbaf320d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 112, 122, 47, 99, 112, 122, 52, 113, 116, 98, 53, 52, 122, 111, 104, 117, 107, 114, 109, 99, 97, 103, 100, 97, 105, 53, 122, 117, 104, 51, 117, 116, 103, 117, 51, 101, 97, 120, 52, 103, 104, 119, 118, 104, 116, 100, 115, 101, 107, 122, 50, 99, 99, 108, 109, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:24:21 + setp.lt.u32 %p3, %r49, 32; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, %r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:34 + mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:24:21 + setp.lt.u32 %p10, %r49, 32; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 %rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm + @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:130 + bar.sync 0; + .loc 1 61 29 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:61:94 + and.pred %p27, %p30, %p18; + // begin inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 122 +.b8 52 +.b8 113 +.b8 116 +.b8 98 +.b8 53 +.b8 52 +.b8 122 +.b8 111 +.b8 104 +.b8 117 +.b8 107 +.b8 114 +.b8 109 +.b8 99 +.b8 97 +.b8 103 +.b8 100 +.b8 97 +.b8 105 +.b8 53 +.b8 122 +.b8 117 +.b8 104 +.b8 51 +.b8 117 +.b8 116 +.b8 103 +.b8 117 +.b8 51 +.b8 101 +.b8 97 +.b8 120 +.b8 52 +.b8 103 +.b8 104 +.b8 119 +.b8 118 +.b8 104 +.b8 116 +.b8 100 +.b8 115 +.b8 101 +.b8 107 +.b8 122 +.b8 50 +.b8 99 +.b8 99 +.b8 108 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..5c404b7b08ff6f81e8549849de0c1604dfee4ad9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) + %_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":36:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:47) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:95) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..6723f4712db7f3eaa3f630aeae4ff6a40d606db4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,270 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + %tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":55:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:58) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:94) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..97f172a7d9a03c03baa3615ce08e56ddd537c37e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/7Y3WXJA5F4C76K5XYE6DPME3QXZYZM2B2JXSRQ4JEXGQ6AZL2CMA/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc1 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc79) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_6 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_7 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_9 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_10 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x32xi64> loc(#loc86) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_15 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc124) + %tmp0_16 = arith.andi %r0_mask_8, %tmp0_15 : tensor<1x32xi1> loc(#loc88) + %tmp0_17 = tt.load %tmp0_14, %tmp0_16, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_17 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_6, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_18 = arith.select %tmp0_16, %tmp4, %_tmp3_6 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_18 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_6: i64 loc(callsite(#loc1 at #loc93)), %tmp3_7: i64 loc(callsite(#loc1 at #loc93))): + %tmp3_8 = arith.addi %tmp3_6, %tmp3_7 : i64 loc(#loc135) + tt.reduce.return %tmp3_8 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_5 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_5 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_6 = arith.addi %r0_index, %r0_base_4 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_8 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_9 = arith.extui %tmp6_8 : i1 to i64 loc(#loc100) + %tmp6_10 = arith.muli %ks0, %tmp6_9 : i64 loc(#loc100) + %tmp6_11 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_12 = arith.addi %tmp6_11, %tmp6_10 : i64 loc(#loc101) + %tmp6_13 = arith.extsi %xoffset : i32 to i64 loc(#loc103) + %tmp6_14 = arith.muli %tmp6_13, %tmp6_12 : i64 loc(#loc103) + %tmp6_15 = arith.extsi %r0_index_6 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_16 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_17 = arith.addi %tmp6_15, %tmp6_16 : tensor<1x32xi64> loc(#loc104) + %tmp6_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_19 = tt.addptr %tmp6_18, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_20 = tt.splat %xmask : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_21 = arith.andi %r0_mask_7, %tmp6_20 : tensor<1x32xi1> loc(#loc106) + %tmp6_22 = tt.load %tmp6_19, %tmp6_21, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_22 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_23 = arith.cmpi slt, %r0_index_6, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_24 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_25 = arith.select %tmp9_23, %tmp11, %tmp11_24 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_26 = arith.addi %tmp11_25, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_25, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_26, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_27 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_28 = arith.select %fixed, %fixed_27, %quot : i64 loc(#loc134) + %4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_28, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_21, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_17 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_13 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_13 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_21 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":25:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":25:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:100) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:53) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:58) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pz/cpz4qtb54zohukrmcagdai5zuh3utgu3eax4ghwvhtdsekz2cclm.py":43:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) +#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc79]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, #loc103]) +#loc129 = loc(fused[#loc106, #loc79]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..70d3e4db156da039345235479b27f875100ad147 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..fafe8d42b8fe7b1c8d6482565d19a6834ef0691f Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..beec3fa5a32eff013a0bab0800bb387fd59bbb0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "07c7815d2ce5fa33e16044674f04a1dcbb415776e0b5f0da0149af801b6db42c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..83c7e83bbe8335a37cf7c51de6874d96ec0afb7a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,1814 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 5, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = lshr i32 %10, 5, !dbg !9 + %12 = and i32 %10, 31, !dbg !9 + %13 = lshr i32 %10, 2, !dbg !9 + %14 = and i32 %13, 31, !dbg !9 + %15 = or disjoint i32 %9, %12, !dbg !10 + %16 = or disjoint i32 %14, %9, !dbg !10 + %17 = icmp slt i32 %15, 32, !dbg !11 + %18 = icmp slt i32 %16, 32, !dbg !11 + %19 = and i32 %10, 96, !dbg !12 + %20 = lshr exact i32 %19, 5, !dbg !12 + %21 = or disjoint i32 %20, 4, !dbg !12 + %22 = or disjoint i32 %20, 8, !dbg !12 + %23 = or disjoint i32 %20, 12, !dbg !12 + %24 = shl nuw nsw i32 %10, 2, !dbg !12 + %25 = and i32 %24, 12, !dbg !12 + %26 = sdiv i32 %15, 16, !dbg !13 + %27 = mul nuw nsw i32 %20, 17, !dbg !14 + %28 = mul nuw nsw i32 %21, 17, !dbg !14 + %29 = mul nuw nsw i32 %22, 17, !dbg !14 + %30 = mul nuw nsw i32 %23, 17, !dbg !14 + %31 = shl i32 %26, 8, !dbg !15 + %32 = add i32 %31, %15, !dbg !15 + %33 = add i32 %32, %27, !dbg !16 + %34 = add i32 %32, %28, !dbg !16 + %35 = add i32 %32, %29, !dbg !16 + %36 = add i32 %32, %30, !dbg !16 + %37 = sext i32 %33 to i64, !dbg !17 + %38 = getelementptr i32, ptr addrspace(1) %0, i64 %37, !dbg !17 + %39 = sext i32 %34 to i64, !dbg !17 + %40 = getelementptr i32, ptr addrspace(1) %0, i64 %39, !dbg !17 + %41 = sext i32 %35 to i64, !dbg !17 + %42 = getelementptr i32, ptr addrspace(1) %0, i64 %41, !dbg !17 + %43 = sext i32 %36 to i64, !dbg !17 + %44 = getelementptr i32, ptr addrspace(1) %0, i64 %43, !dbg !17 + %45 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %38, i1 %17) #5, !dbg !18 + %46 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %40, i1 %17) #5, !dbg !18 + %47 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %42, i1 %17) #5, !dbg !18 + %48 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %44, i1 %17) #5, !dbg !18 + %49 = lshr i32 %10, 6, !dbg !19 + %.lobit = and i32 %49, 1, !dbg !19 + %.lobit1 = and i32 %11, 1, !dbg !19 + %50 = xor i32 %.lobit1, 1, !dbg !23 + %51 = xor i32 %.lobit, 1, !dbg !23 + %52 = mul nuw nsw i32 %45, %50, !dbg !24 + %53 = mul nuw nsw i32 %46, %50, !dbg !24 + %54 = mul nuw nsw i32 %47, %50, !dbg !24 + %55 = mul nuw nsw i32 %48, %50, !dbg !24 + %.idx66 = shl nuw nsw i32 %.lobit, 3, !dbg !25 + %56 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx66, !dbg !25 + %.idx67 = shl nuw nsw i32 %12, 6, !dbg !25 + %57 = getelementptr i8, ptr addrspace(3) %56, i32 %.idx67, !dbg !25 + %58 = getelementptr i32, ptr addrspace(3) %57, i32 %.lobit1, !dbg !25 + %59 = insertelement <1 x i32> poison, i32 %52, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %59, i1 true) #5, !dbg !25 + %60 = getelementptr i8, ptr addrspace(3) %57, i32 16, !dbg !25 + %61 = getelementptr i32, ptr addrspace(3) %60, i32 %.lobit1, !dbg !25 + %62 = insertelement <1 x i32> poison, i32 %53, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %62, i1 true) #5, !dbg !25 + %63 = getelementptr i8, ptr addrspace(3) %57, i32 32, !dbg !25 + %64 = getelementptr i32, ptr addrspace(3) %63, i32 %.lobit1, !dbg !25 + %65 = insertelement <1 x i32> poison, i32 %54, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %65, i1 true) #5, !dbg !25 + %66 = getelementptr i8, ptr addrspace(3) %57, i32 48, !dbg !25 + %67 = getelementptr i32, ptr addrspace(3) %66, i32 %.lobit1, !dbg !25 + %68 = insertelement <1 x i32> poison, i32 %55, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %68, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %69 = icmp samesign ult i32 %10, 512, !dbg !25 + %70 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !25 + %71 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !25 + %73 = add i32 %72, %71, !dbg !28 + %74 = and i32 %10, 513, !dbg !25 + %75 = icmp eq i32 %74, 0, !dbg !25 + %76 = insertelement <1 x i32> poison, i32 %73, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %76, i1 %75) #5, !dbg !25 + %77 = getelementptr i8, ptr addrspace(3) %70, i32 512, !dbg !25 + %78 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !25 + %80 = add i32 %79, %78, !dbg !28 + %81 = insertelement <1 x i32> poison, i32 %80, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %81, i1 %75) #5, !dbg !25 + %82 = getelementptr i8, ptr addrspace(3) %70, i32 1024, !dbg !25 + %83 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 1, i32 31), !dbg !25 + %85 = add i32 %84, %83, !dbg !28 + %86 = insertelement <1 x i32> poison, i32 %85, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %86, i1 %75) #5, !dbg !25 + %87 = getelementptr i8, ptr addrspace(3) %70, i32 1536, !dbg !25 + %88 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !25 + %90 = add i32 %89, %88, !dbg !28 + %91 = insertelement <1 x i32> poison, i32 %90, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %91, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %92 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %93 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %94 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %95 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %96 = mul nuw nsw i32 %45, %.lobit1, !dbg !29 + %97 = mul nuw nsw i32 %46, %.lobit1, !dbg !29 + %98 = mul nuw nsw i32 %47, %.lobit1, !dbg !29 + %99 = mul nuw nsw i32 %48, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %100 = insertelement <1 x i32> poison, i32 %96, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %100, i1 true) #5, !dbg !25 + %101 = insertelement <1 x i32> poison, i32 %97, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %101, i1 true) #5, !dbg !25 + %102 = insertelement <1 x i32> poison, i32 %98, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %102, i1 true) #5, !dbg !25 + %103 = insertelement <1 x i32> poison, i32 %99, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %103, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %104 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !25 + %106 = add i32 %105, %104, !dbg !28 + %107 = insertelement <1 x i32> poison, i32 %106, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %107, i1 %75) #5, !dbg !25 + %108 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 1, i32 31), !dbg !25 + %110 = add i32 %109, %108, !dbg !28 + %111 = insertelement <1 x i32> poison, i32 %110, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %111, i1 %75) #5, !dbg !25 + %112 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !25 + %114 = add i32 %113, %112, !dbg !28 + %115 = insertelement <1 x i32> poison, i32 %114, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %115, i1 %75) #5, !dbg !25 + %116 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 1, i32 31), !dbg !25 + %118 = add i32 %117, %116, !dbg !28 + %119 = insertelement <1 x i32> poison, i32 %118, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %119, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %120 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %121 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %122 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %123 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %124 = mul nuw nsw i32 %50, %20, !dbg !30 + %125 = mul nuw nsw i32 %21, %50, !dbg !30 + %126 = mul nuw nsw i32 %22, %50, !dbg !30 + %127 = mul nuw nsw i32 %23, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %128 = insertelement <1 x i32> poison, i32 %124, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %128, i1 true) #5, !dbg !25 + %129 = insertelement <1 x i32> poison, i32 %125, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %129, i1 true) #5, !dbg !25 + %130 = insertelement <1 x i32> poison, i32 %126, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %130, i1 true) #5, !dbg !25 + %131 = insertelement <1 x i32> poison, i32 %127, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %131, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %132 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 1, i32 31), !dbg !25 + %134 = add i32 %133, %132, !dbg !28 + %135 = insertelement <1 x i32> poison, i32 %134, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %135, i1 %75) #5, !dbg !25 + %136 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !25 + %138 = add i32 %137, %136, !dbg !28 + %139 = insertelement <1 x i32> poison, i32 %138, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %139, i1 %75) #5, !dbg !25 + %140 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 1, i32 31), !dbg !25 + %142 = add i32 %141, %140, !dbg !28 + %143 = insertelement <1 x i32> poison, i32 %142, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %143, i1 %75) #5, !dbg !25 + %144 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !25 + %146 = add i32 %145, %144, !dbg !28 + %147 = insertelement <1 x i32> poison, i32 %146, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %147, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %148 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %149 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %150 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %151 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %152 = mul nuw nsw i32 %20, %.lobit1, !dbg !31 + %153 = mul nuw nsw i32 %21, %.lobit1, !dbg !31 + %154 = mul nuw nsw i32 %22, %.lobit1, !dbg !31 + %155 = mul nuw nsw i32 %23, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %156 = insertelement <1 x i32> poison, i32 %152, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %156, i1 true) #5, !dbg !25 + %157 = insertelement <1 x i32> poison, i32 %153, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %157, i1 true) #5, !dbg !25 + %158 = insertelement <1 x i32> poison, i32 %154, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %158, i1 true) #5, !dbg !25 + %159 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %159, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %160 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !25 + %162 = add i32 %161, %160, !dbg !28 + %163 = insertelement <1 x i32> poison, i32 %162, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %163, i1 %75) #5, !dbg !25 + %164 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !25 + %166 = add i32 %165, %164, !dbg !28 + %167 = insertelement <1 x i32> poison, i32 %166, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %167, i1 %75) #5, !dbg !25 + %168 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 1, i32 31), !dbg !25 + %170 = add i32 %169, %168, !dbg !28 + %171 = insertelement <1 x i32> poison, i32 %170, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %171, i1 %75) #5, !dbg !25 + %172 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !25 + %174 = add i32 %173, %172, !dbg !28 + %175 = insertelement <1 x i32> poison, i32 %174, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %175, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %176 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %177 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %178 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %179 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %180 = trunc i32 %49 to i1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %181 = shl nuw nsw i32 %12, 2, !dbg !25 + %182 = or disjoint i32 %181, 1, !dbg !25 + %183 = or disjoint i32 %181, 2, !dbg !25 + %184 = or disjoint i32 %181, 3, !dbg !25 + %185 = shl nuw nsw i32 %.lobit1, 7, !dbg !25 + %186 = or disjoint i32 %185, %181, !dbg !25 + %.idx = shl nuw nsw i32 %186, 3, !dbg !25 + %187 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !25 + %188 = getelementptr i32, ptr addrspace(3) %187, i32 %.lobit, !dbg !25 + %189 = or disjoint i32 %185, %182, !dbg !25 + %.idx63 = shl nuw nsw i32 %189, 3, !dbg !25 + %190 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx63, !dbg !25 + %191 = getelementptr i32, ptr addrspace(3) %190, i32 %.lobit, !dbg !25 + %192 = or disjoint i32 %185, %183, !dbg !25 + %.idx64 = shl nuw nsw i32 %192, 3, !dbg !25 + %193 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx64, !dbg !25 + %194 = getelementptr i32, ptr addrspace(3) %193, i32 %.lobit, !dbg !25 + %195 = or disjoint i32 %185, %184, !dbg !25 + %.idx65 = shl nuw nsw i32 %195, 3, !dbg !25 + %196 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx65, !dbg !25 + %197 = getelementptr i32, ptr addrspace(3) %196, i32 %.lobit, !dbg !25 + %198 = insertelement <2 x i32> poison, i32 %92, i64 0, !dbg !32 + %199 = insertelement <2 x i32> %198, i32 %93, i64 1, !dbg !32 + %200 = insertelement <2 x i32> poison, i32 %120, i64 0, !dbg !32 + %201 = insertelement <2 x i32> %200, i32 %121, i64 1, !dbg !32 + %202 = icmp sge <2 x i32> %199, %201, !dbg !32 + %203 = icmp ne <2 x i32> %199, %201, !dbg !32 + %204 = insertelement <2 x i32> poison, i32 %148, i64 0, !dbg !32 + %205 = insertelement <2 x i32> %204, i32 %149, i64 1, !dbg !32 + %206 = insertelement <2 x i32> poison, i32 %176, i64 0, !dbg !32 + %207 = insertelement <2 x i32> %206, i32 %177, i64 1, !dbg !32 + %208 = icmp sle <2 x i32> %205, %207, !dbg !32 + %209 = or <2 x i1> %203, %208, !dbg !32 + %210 = and <2 x i1> %202, %209, !dbg !32 + %211 = insertelement <2 x i1> poison, i1 %180, i64 0, !dbg !32 + %212 = shufflevector <2 x i1> %211, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !32 + %213 = xor <2 x i1> %210, %212, !dbg !32 + %214 = insertelement <2 x i32> %206, i32 %121, i64 1, !dbg !33 + %215 = insertelement <2 x i32> %204, i32 %93, i64 1, !dbg !33 + %216 = xor <2 x i32> %214, %215, !dbg !33 + %217 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !33 + %218 = insertelement <2 x i32> %217, i32 %120, i64 1, !dbg !33 + %219 = insertelement <2 x i32> poison, i32 %149, i64 0, !dbg !33 + %220 = insertelement <2 x i32> %219, i32 %92, i64 1, !dbg !33 + %221 = xor <2 x i32> %218, %220, !dbg !33 + %222 = select <2 x i1> %213, <2 x i32> zeroinitializer, <2 x i32> %216, !dbg !34 + %223 = shufflevector <2 x i1> %213, <2 x i1> poison, <2 x i32> , !dbg !34 + %224 = select <2 x i1> %223, <2 x i32> zeroinitializer, <2 x i32> %221, !dbg !34 + %225 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !35 + %226 = insertelement <2 x i32> %225, i32 %46, i64 1, !dbg !35 + %227 = xor <2 x i32> %222, %226, !dbg !35 + %228 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !35 + %229 = insertelement <2 x i32> %228, i32 %45, i64 1, !dbg !35 + %230 = xor <2 x i32> %224, %229, !dbg !35 + %231 = extractelement <2 x i32> %230, i64 1, !dbg !29 + %232 = mul nuw nsw i32 %231, %51, !dbg !24 + %233 = extractelement <2 x i32> %227, i64 1, !dbg !29 + %234 = mul nuw nsw i32 %233, %51, !dbg !24 + %235 = insertelement <1 x i32> poison, i32 %232, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %235, i1 true) #5, !dbg !25 + %236 = insertelement <1 x i32> poison, i32 %234, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %236, i1 true) #5, !dbg !25 + %237 = mul nuw nsw i32 %231, %.lobit, !dbg !29 + %238 = mul nuw nsw i32 %233, %.lobit, !dbg !29 + %239 = insertelement <1 x i32> poison, i32 %237, i64 0, !dbg !25 + %240 = insertelement <1 x i32> poison, i32 %238, i64 0, !dbg !25 + %241 = extractelement <2 x i32> %227, i64 0, !dbg !31 + %242 = mul nuw nsw i32 %241, %51, !dbg !30 + %243 = extractelement <2 x i32> %230, i64 0, !dbg !31 + %244 = mul nuw nsw i32 %243, %51, !dbg !30 + %245 = insertelement <1 x i32> poison, i32 %242, i64 0, !dbg !25 + %246 = insertelement <1 x i32> poison, i32 %244, i64 0, !dbg !25 + %247 = mul nuw nsw i32 %241, %.lobit, !dbg !31 + %248 = mul nuw nsw i32 %243, %.lobit, !dbg !31 + %249 = insertelement <1 x i32> poison, i32 %247, i64 0, !dbg !25 + %250 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !25 + %251 = insertelement <2 x i32> poison, i32 %94, i64 0, !dbg !32 + %252 = insertelement <2 x i32> %251, i32 %95, i64 1, !dbg !32 + %253 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !32 + %254 = insertelement <2 x i32> %253, i32 %123, i64 1, !dbg !32 + %255 = icmp sge <2 x i32> %252, %254, !dbg !32 + %256 = icmp ne <2 x i32> %252, %254, !dbg !32 + %257 = insertelement <2 x i32> poison, i32 %150, i64 0, !dbg !32 + %258 = insertelement <2 x i32> %257, i32 %151, i64 1, !dbg !32 + %259 = insertelement <2 x i32> poison, i32 %178, i64 0, !dbg !32 + %260 = insertelement <2 x i32> %259, i32 %179, i64 1, !dbg !32 + %261 = icmp sle <2 x i32> %258, %260, !dbg !32 + %262 = or <2 x i1> %256, %261, !dbg !32 + %263 = and <2 x i1> %255, %262, !dbg !32 + %264 = xor <2 x i1> %263, %212, !dbg !32 + %265 = insertelement <2 x i32> %259, i32 %123, i64 1, !dbg !33 + %266 = insertelement <2 x i32> %257, i32 %95, i64 1, !dbg !33 + %267 = xor <2 x i32> %265, %266, !dbg !33 + %268 = insertelement <2 x i32> poison, i32 %179, i64 0, !dbg !33 + %269 = insertelement <2 x i32> %268, i32 %122, i64 1, !dbg !33 + %270 = insertelement <2 x i32> poison, i32 %151, i64 0, !dbg !33 + %271 = insertelement <2 x i32> %270, i32 %94, i64 1, !dbg !33 + %272 = xor <2 x i32> %269, %271, !dbg !33 + %273 = select <2 x i1> %264, <2 x i32> zeroinitializer, <2 x i32> %267, !dbg !34 + %274 = shufflevector <2 x i1> %264, <2 x i1> poison, <2 x i32> , !dbg !34 + %275 = select <2 x i1> %274, <2 x i32> zeroinitializer, <2 x i32> %272, !dbg !34 + %276 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !35 + %277 = insertelement <2 x i32> %276, i32 %48, i64 1, !dbg !35 + %278 = xor <2 x i32> %273, %277, !dbg !35 + %279 = insertelement <2 x i32> poison, i32 %23, i64 0, !dbg !35 + %280 = insertelement <2 x i32> %279, i32 %47, i64 1, !dbg !35 + %281 = xor <2 x i32> %275, %280, !dbg !35 + %282 = extractelement <2 x i32> %281, i64 1, !dbg !29 + %283 = mul nuw nsw i32 %282, %51, !dbg !24 + %284 = extractelement <2 x i32> %278, i64 1, !dbg !29 + %285 = mul nuw nsw i32 %284, %51, !dbg !24 + %286 = insertelement <1 x i32> poison, i32 %283, i64 0, !dbg !25 + %287 = insertelement <1 x i32> poison, i32 %285, i64 0, !dbg !25 + %288 = mul nuw nsw i32 %282, %.lobit, !dbg !29 + %289 = mul nuw nsw i32 %284, %.lobit, !dbg !29 + %290 = insertelement <1 x i32> poison, i32 %288, i64 0, !dbg !25 + %291 = insertelement <1 x i32> poison, i32 %289, i64 0, !dbg !25 + %292 = extractelement <2 x i32> %278, i64 0, !dbg !31 + %293 = mul nuw nsw i32 %292, %51, !dbg !30 + %294 = extractelement <2 x i32> %281, i64 0, !dbg !31 + %295 = mul nuw nsw i32 %294, %51, !dbg !30 + %296 = insertelement <1 x i32> poison, i32 %293, i64 0, !dbg !25 + %297 = insertelement <1 x i32> poison, i32 %295, i64 0, !dbg !25 + %298 = mul nuw nsw i32 %292, %.lobit, !dbg !31 + %299 = mul nuw nsw i32 %294, %.lobit, !dbg !31 + %300 = insertelement <1 x i32> poison, i32 %298, i64 0, !dbg !25 + %301 = insertelement <1 x i32> poison, i32 %299, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %286, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %287, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %302 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 1, i32 31), !dbg !25 + %304 = add i32 %303, %302, !dbg !28 + %305 = insertelement <1 x i32> poison, i32 %304, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %305, i1 %75) #5, !dbg !25 + %306 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 1, i32 31), !dbg !25 + %308 = add i32 %307, %306, !dbg !28 + %309 = insertelement <1 x i32> poison, i32 %308, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %309, i1 %75) #5, !dbg !25 + %310 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %311 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %310, i32 1, i32 31), !dbg !25 + %312 = add i32 %311, %310, !dbg !28 + %313 = insertelement <1 x i32> poison, i32 %312, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %313, i1 %75) #5, !dbg !25 + %314 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !25 + %316 = add i32 %315, %314, !dbg !28 + %317 = insertelement <1 x i32> poison, i32 %316, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %317, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %318 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %319 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %320 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %321 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %239, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %240, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %290, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %291, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %322 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !25 + %324 = add i32 %323, %322, !dbg !28 + %325 = insertelement <1 x i32> poison, i32 %324, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %325, i1 %75) #5, !dbg !25 + %326 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %326, i32 1, i32 31), !dbg !25 + %328 = add i32 %327, %326, !dbg !28 + %329 = insertelement <1 x i32> poison, i32 %328, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %329, i1 %75) #5, !dbg !25 + %330 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 1, i32 31), !dbg !25 + %332 = add i32 %331, %330, !dbg !28 + %333 = insertelement <1 x i32> poison, i32 %332, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %333, i1 %75) #5, !dbg !25 + %334 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 1, i32 31), !dbg !25 + %336 = add i32 %335, %334, !dbg !28 + %337 = insertelement <1 x i32> poison, i32 %336, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %337, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %338 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %339 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %340 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %341 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %245, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %246, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %296, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %297, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %342 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %343 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %342, i32 1, i32 31), !dbg !25 + %344 = add i32 %343, %342, !dbg !28 + %345 = insertelement <1 x i32> poison, i32 %344, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %345, i1 %75) #5, !dbg !25 + %346 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 1, i32 31), !dbg !25 + %348 = add i32 %347, %346, !dbg !28 + %349 = insertelement <1 x i32> poison, i32 %348, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %349, i1 %75) #5, !dbg !25 + %350 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %351 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 1, i32 31), !dbg !25 + %352 = add i32 %351, %350, !dbg !28 + %353 = insertelement <1 x i32> poison, i32 %352, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %353, i1 %75) #5, !dbg !25 + %354 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 1, i32 31), !dbg !25 + %356 = add i32 %355, %354, !dbg !28 + %357 = insertelement <1 x i32> poison, i32 %356, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %357, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %358 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %359 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %360 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %361 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %249, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %250, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %300, i1 true) #5, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %301, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %362 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %362, i32 1, i32 31), !dbg !25 + %364 = add i32 %363, %362, !dbg !28 + %365 = insertelement <1 x i32> poison, i32 %364, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %365, i1 %75) #5, !dbg !25 + %366 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 1, i32 31), !dbg !25 + %368 = add i32 %367, %366, !dbg !28 + %369 = insertelement <1 x i32> poison, i32 %368, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %369, i1 %75) #5, !dbg !25 + %370 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 1, i32 31), !dbg !25 + %372 = add i32 %371, %370, !dbg !28 + %373 = insertelement <1 x i32> poison, i32 %372, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %373, i1 %75) #5, !dbg !25 + %374 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %374, i32 1, i32 31), !dbg !25 + %376 = add i32 %375, %374, !dbg !28 + %377 = insertelement <1 x i32> poison, i32 %376, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %377, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %378 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %379 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %380 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %381 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %382 = icmp slt i32 %318, %338, !dbg !36 + %383 = icmp sge i32 %319, %339, !dbg !36 + %384 = icmp slt i32 %320, %340, !dbg !36 + %385 = icmp sge i32 %321, %341, !dbg !36 + %386 = icmp eq i32 %318, %338, !dbg !37 + %387 = icmp ne i32 %319, %339, !dbg !37 + %388 = icmp eq i32 %320, %340, !dbg !37 + %389 = icmp ne i32 %321, %341, !dbg !37 + %390 = icmp sgt i32 %358, %378, !dbg !38 + %391 = icmp sle i32 %359, %379, !dbg !38 + %392 = icmp sgt i32 %360, %380, !dbg !38 + %393 = icmp sle i32 %361, %381, !dbg !38 + %394 = insertelement <2 x i1> poison, i1 %386, i64 0, !dbg !39 + %395 = insertelement <2 x i1> %394, i1 %387, i64 1, !dbg !39 + %396 = insertelement <2 x i1> poison, i1 %390, i64 0, !dbg !39 + %397 = insertelement <2 x i1> %396, i1 %391, i64 1, !dbg !39 + %398 = or <2 x i1> %395, %397, !dbg !39 + %399 = and <2 x i1> %395, %397, !dbg !39 + %400 = shufflevector <2 x i1> %399, <2 x i1> %398, <2 x i32> , !dbg !39 + %401 = insertelement <2 x i1> poison, i1 %388, i64 0, !dbg !40 + %402 = insertelement <2 x i1> %401, i1 %389, i64 1, !dbg !40 + %403 = insertelement <2 x i1> poison, i1 %392, i64 0, !dbg !40 + %404 = insertelement <2 x i1> %403, i1 %393, i64 1, !dbg !40 + %405 = or <2 x i1> %402, %404, !dbg !40 + %406 = and <2 x i1> %402, %404, !dbg !40 + %407 = shufflevector <2 x i1> %406, <2 x i1> %405, <2 x i32> , !dbg !40 + %408 = insertelement <2 x i1> poison, i1 %382, i64 0, !dbg !40 + %409 = insertelement <2 x i1> %408, i1 %383, i64 1, !dbg !40 + %410 = and <2 x i1> %409, %400, !dbg !40 + %411 = or <2 x i1> %409, %400, !dbg !40 + %412 = shufflevector <2 x i1> %411, <2 x i1> %410, <2 x i32> , !dbg !40 + %413 = insertelement <2 x i1> poison, i1 %384, i64 0, !dbg !41 + %414 = insertelement <2 x i1> %413, i1 %385, i64 1, !dbg !41 + %415 = and <2 x i1> %414, %407, !dbg !41 + %416 = or <2 x i1> %414, %407, !dbg !41 + %417 = shufflevector <2 x i1> %416, <2 x i1> %415, <2 x i32> , !dbg !41 + %418 = insertelement <2 x i32> poison, i32 %378, i64 0, !dbg !33 + %419 = insertelement <2 x i32> %418, i32 %339, i64 1, !dbg !33 + %420 = insertelement <2 x i32> poison, i32 %358, i64 0, !dbg !33 + %421 = insertelement <2 x i32> %420, i32 %319, i64 1, !dbg !33 + %422 = xor <2 x i32> %419, %421, !dbg !33 + %423 = insertelement <2 x i32> poison, i32 %379, i64 0, !dbg !33 + %424 = insertelement <2 x i32> %423, i32 %338, i64 1, !dbg !33 + %425 = insertelement <2 x i32> poison, i32 %359, i64 0, !dbg !33 + %426 = insertelement <2 x i32> %425, i32 %318, i64 1, !dbg !33 + %427 = xor <2 x i32> %424, %426, !dbg !33 + %428 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !33 + %429 = insertelement <2 x i32> %428, i32 %341, i64 1, !dbg !33 + %430 = insertelement <2 x i32> poison, i32 %360, i64 0, !dbg !33 + %431 = insertelement <2 x i32> %430, i32 %321, i64 1, !dbg !33 + %432 = xor <2 x i32> %429, %431, !dbg !33 + %433 = insertelement <2 x i32> poison, i32 %381, i64 0, !dbg !33 + %434 = insertelement <2 x i32> %433, i32 %340, i64 1, !dbg !33 + %435 = insertelement <2 x i32> poison, i32 %361, i64 0, !dbg !33 + %436 = insertelement <2 x i32> %435, i32 %320, i64 1, !dbg !33 + %437 = xor <2 x i32> %434, %436, !dbg !33 + %438 = select <2 x i1> %412, <2 x i32> %422, <2 x i32> zeroinitializer, !dbg !34 + %439 = shufflevector <2 x i1> %410, <2 x i1> %411, <2 x i32> , !dbg !34 + %440 = select <2 x i1> %439, <2 x i32> %427, <2 x i32> zeroinitializer, !dbg !34 + %441 = select <2 x i1> %417, <2 x i32> %432, <2 x i32> zeroinitializer, !dbg !34 + %442 = shufflevector <2 x i1> %415, <2 x i1> %416, <2 x i32> , !dbg !34 + %443 = select <2 x i1> %442, <2 x i32> %437, <2 x i32> zeroinitializer, !dbg !34 + %444 = xor <2 x i32> %438, %227, !dbg !35 + %445 = xor <2 x i32> %440, %230, !dbg !35 + %446 = xor <2 x i32> %441, %278, !dbg !35 + %447 = xor <2 x i32> %443, %281, !dbg !35 + %448 = extractelement <2 x i32> %445, i64 1, !dbg !29 + %449 = mul nuw nsw i32 %448, %50, !dbg !24 + %450 = extractelement <2 x i32> %444, i64 1, !dbg !29 + %451 = mul nuw nsw i32 %450, %50, !dbg !24 + %452 = extractelement <2 x i32> %447, i64 1, !dbg !29 + %453 = mul nuw nsw i32 %452, %50, !dbg !24 + %454 = extractelement <2 x i32> %446, i64 1, !dbg !29 + %455 = mul nuw nsw i32 %454, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %456 = insertelement <1 x i32> poison, i32 %449, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %456, i1 true) #5, !dbg !25 + %457 = insertelement <1 x i32> poison, i32 %451, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %457, i1 true) #5, !dbg !25 + %458 = insertelement <1 x i32> poison, i32 %453, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %458, i1 true) #5, !dbg !25 + %459 = insertelement <1 x i32> poison, i32 %455, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %459, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %460 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %461 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %460, i32 1, i32 31), !dbg !25 + %462 = add i32 %461, %460, !dbg !28 + %463 = insertelement <1 x i32> poison, i32 %462, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %463, i1 %75) #5, !dbg !25 + %464 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 1, i32 31), !dbg !25 + %466 = add i32 %465, %464, !dbg !28 + %467 = insertelement <1 x i32> poison, i32 %466, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %467, i1 %75) #5, !dbg !25 + %468 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %469 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %468, i32 1, i32 31), !dbg !25 + %470 = add i32 %469, %468, !dbg !28 + %471 = insertelement <1 x i32> poison, i32 %470, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %471, i1 %75) #5, !dbg !25 + %472 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 1, i32 31), !dbg !25 + %474 = add i32 %473, %472, !dbg !28 + %475 = insertelement <1 x i32> poison, i32 %474, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %475, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %476 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %477 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %478 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %479 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %480 = mul nuw nsw i32 %448, %.lobit1, !dbg !29 + %481 = mul nuw nsw i32 %450, %.lobit1, !dbg !29 + %482 = mul nuw nsw i32 %452, %.lobit1, !dbg !29 + %483 = mul nuw nsw i32 %454, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %484 = insertelement <1 x i32> poison, i32 %480, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %484, i1 true) #5, !dbg !25 + %485 = insertelement <1 x i32> poison, i32 %481, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %485, i1 true) #5, !dbg !25 + %486 = insertelement <1 x i32> poison, i32 %482, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %486, i1 true) #5, !dbg !25 + %487 = insertelement <1 x i32> poison, i32 %483, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %487, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %488 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %488, i32 1, i32 31), !dbg !25 + %490 = add i32 %489, %488, !dbg !28 + %491 = insertelement <1 x i32> poison, i32 %490, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %491, i1 %75) #5, !dbg !25 + %492 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 1, i32 31), !dbg !25 + %494 = add i32 %493, %492, !dbg !28 + %495 = insertelement <1 x i32> poison, i32 %494, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %495, i1 %75) #5, !dbg !25 + %496 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 1, i32 31), !dbg !25 + %498 = add i32 %497, %496, !dbg !28 + %499 = insertelement <1 x i32> poison, i32 %498, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %499, i1 %75) #5, !dbg !25 + %500 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 1, i32 31), !dbg !25 + %502 = add i32 %501, %500, !dbg !28 + %503 = insertelement <1 x i32> poison, i32 %502, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %503, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %504 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %505 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %506 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %507 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %508 = extractelement <2 x i32> %444, i64 0, !dbg !31 + %509 = mul nuw nsw i32 %508, %50, !dbg !30 + %510 = extractelement <2 x i32> %445, i64 0, !dbg !31 + %511 = mul nuw nsw i32 %510, %50, !dbg !30 + %512 = extractelement <2 x i32> %446, i64 0, !dbg !31 + %513 = mul nuw nsw i32 %512, %50, !dbg !30 + %514 = extractelement <2 x i32> %447, i64 0, !dbg !31 + %515 = mul nuw nsw i32 %514, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %516 = insertelement <1 x i32> poison, i32 %509, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %516, i1 true) #5, !dbg !25 + %517 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %517, i1 true) #5, !dbg !25 + %518 = insertelement <1 x i32> poison, i32 %513, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %518, i1 true) #5, !dbg !25 + %519 = insertelement <1 x i32> poison, i32 %515, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %519, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %520 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %521 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %520, i32 1, i32 31), !dbg !25 + %522 = add i32 %521, %520, !dbg !28 + %523 = insertelement <1 x i32> poison, i32 %522, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %523, i1 %75) #5, !dbg !25 + %524 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %525 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %524, i32 1, i32 31), !dbg !25 + %526 = add i32 %525, %524, !dbg !28 + %527 = insertelement <1 x i32> poison, i32 %526, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %527, i1 %75) #5, !dbg !25 + %528 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %529 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %528, i32 1, i32 31), !dbg !25 + %530 = add i32 %529, %528, !dbg !28 + %531 = insertelement <1 x i32> poison, i32 %530, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %531, i1 %75) #5, !dbg !25 + %532 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %533 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %532, i32 1, i32 31), !dbg !25 + %534 = add i32 %533, %532, !dbg !28 + %535 = insertelement <1 x i32> poison, i32 %534, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %535, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %536 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %537 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %538 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %539 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %540 = mul nuw nsw i32 %508, %.lobit1, !dbg !31 + %541 = mul nuw nsw i32 %510, %.lobit1, !dbg !31 + %542 = mul nuw nsw i32 %512, %.lobit1, !dbg !31 + %543 = mul nuw nsw i32 %514, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %544 = insertelement <1 x i32> poison, i32 %540, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %544, i1 true) #5, !dbg !25 + %545 = insertelement <1 x i32> poison, i32 %541, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %545, i1 true) #5, !dbg !25 + %546 = insertelement <1 x i32> poison, i32 %542, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %546, i1 true) #5, !dbg !25 + %547 = insertelement <1 x i32> poison, i32 %543, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %547, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %548 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %548, i32 1, i32 31), !dbg !25 + %550 = add i32 %549, %548, !dbg !28 + %551 = insertelement <1 x i32> poison, i32 %550, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %551, i1 %75) #5, !dbg !25 + %552 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %552, i32 1, i32 31), !dbg !25 + %554 = add i32 %553, %552, !dbg !28 + %555 = insertelement <1 x i32> poison, i32 %554, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %555, i1 %75) #5, !dbg !25 + %556 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %556, i32 1, i32 31), !dbg !25 + %558 = add i32 %557, %556, !dbg !28 + %559 = insertelement <1 x i32> poison, i32 %558, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %559, i1 %75) #5, !dbg !25 + %560 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %560, i32 1, i32 31), !dbg !25 + %562 = add i32 %561, %560, !dbg !28 + %563 = insertelement <1 x i32> poison, i32 %562, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %563, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %564 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %565 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %566 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %567 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %568 = icmp slt i32 %476, %504, !dbg !36 + %569 = icmp sge i32 %477, %505, !dbg !36 + %570 = icmp slt i32 %478, %506, !dbg !36 + %571 = icmp sge i32 %479, %507, !dbg !36 + %572 = icmp eq i32 %476, %504, !dbg !37 + %573 = icmp ne i32 %477, %505, !dbg !37 + %574 = icmp eq i32 %478, %506, !dbg !37 + %575 = icmp ne i32 %479, %507, !dbg !37 + %576 = icmp sgt i32 %536, %564, !dbg !38 + %577 = icmp sle i32 %537, %565, !dbg !38 + %578 = icmp sgt i32 %538, %566, !dbg !38 + %579 = icmp sle i32 %539, %567, !dbg !38 + %580 = insertelement <2 x i1> poison, i1 %572, i64 0, !dbg !39 + %581 = insertelement <2 x i1> %580, i1 %573, i64 1, !dbg !39 + %582 = insertelement <2 x i1> poison, i1 %576, i64 0, !dbg !39 + %583 = insertelement <2 x i1> %582, i1 %577, i64 1, !dbg !39 + %584 = or <2 x i1> %581, %583, !dbg !39 + %585 = and <2 x i1> %581, %583, !dbg !39 + %586 = shufflevector <2 x i1> %585, <2 x i1> %584, <2 x i32> , !dbg !39 + %587 = insertelement <2 x i1> poison, i1 %574, i64 0, !dbg !40 + %588 = insertelement <2 x i1> %587, i1 %575, i64 1, !dbg !40 + %589 = insertelement <2 x i1> poison, i1 %578, i64 0, !dbg !40 + %590 = insertelement <2 x i1> %589, i1 %579, i64 1, !dbg !40 + %591 = or <2 x i1> %588, %590, !dbg !40 + %592 = and <2 x i1> %588, %590, !dbg !40 + %593 = shufflevector <2 x i1> %592, <2 x i1> %591, <2 x i32> , !dbg !40 + %594 = insertelement <2 x i1> poison, i1 %568, i64 0, !dbg !40 + %595 = insertelement <2 x i1> %594, i1 %569, i64 1, !dbg !40 + %596 = and <2 x i1> %595, %586, !dbg !40 + %597 = or <2 x i1> %595, %586, !dbg !40 + %598 = shufflevector <2 x i1> %597, <2 x i1> %596, <2 x i32> , !dbg !40 + %599 = insertelement <2 x i1> poison, i1 %570, i64 0, !dbg !41 + %600 = insertelement <2 x i1> %599, i1 %571, i64 1, !dbg !41 + %601 = and <2 x i1> %600, %593, !dbg !41 + %602 = or <2 x i1> %600, %593, !dbg !41 + %603 = shufflevector <2 x i1> %602, <2 x i1> %601, <2 x i32> , !dbg !41 + %604 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !33 + %605 = insertelement <2 x i32> %604, i32 %505, i64 1, !dbg !33 + %606 = insertelement <2 x i32> poison, i32 %536, i64 0, !dbg !33 + %607 = insertelement <2 x i32> %606, i32 %477, i64 1, !dbg !33 + %608 = xor <2 x i32> %605, %607, !dbg !33 + %609 = insertelement <2 x i32> poison, i32 %565, i64 0, !dbg !33 + %610 = insertelement <2 x i32> %609, i32 %504, i64 1, !dbg !33 + %611 = insertelement <2 x i32> poison, i32 %537, i64 0, !dbg !33 + %612 = insertelement <2 x i32> %611, i32 %476, i64 1, !dbg !33 + %613 = xor <2 x i32> %610, %612, !dbg !33 + %614 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !33 + %615 = insertelement <2 x i32> %614, i32 %507, i64 1, !dbg !33 + %616 = insertelement <2 x i32> poison, i32 %538, i64 0, !dbg !33 + %617 = insertelement <2 x i32> %616, i32 %479, i64 1, !dbg !33 + %618 = xor <2 x i32> %615, %617, !dbg !33 + %619 = insertelement <2 x i32> poison, i32 %567, i64 0, !dbg !33 + %620 = insertelement <2 x i32> %619, i32 %506, i64 1, !dbg !33 + %621 = insertelement <2 x i32> poison, i32 %539, i64 0, !dbg !33 + %622 = insertelement <2 x i32> %621, i32 %478, i64 1, !dbg !33 + %623 = xor <2 x i32> %620, %622, !dbg !33 + %624 = select <2 x i1> %598, <2 x i32> %608, <2 x i32> zeroinitializer, !dbg !34 + %625 = shufflevector <2 x i1> %596, <2 x i1> %597, <2 x i32> , !dbg !34 + %626 = select <2 x i1> %625, <2 x i32> %613, <2 x i32> zeroinitializer, !dbg !34 + %627 = select <2 x i1> %603, <2 x i32> %618, <2 x i32> zeroinitializer, !dbg !34 + %628 = shufflevector <2 x i1> %601, <2 x i1> %602, <2 x i32> , !dbg !34 + %629 = select <2 x i1> %628, <2 x i32> %623, <2 x i32> zeroinitializer, !dbg !34 + %630 = xor <2 x i32> %624, %444, !dbg !35 + %631 = xor <2 x i32> %626, %445, !dbg !35 + %632 = xor <2 x i32> %627, %446, !dbg !35 + %633 = xor <2 x i32> %629, %447, !dbg !35 + %634 = extractelement <2 x i32> %630, i64 1, !dbg !36 + %635 = extractelement <2 x i32> %631, i64 1, !dbg !36 + %636 = icmp slt i32 %635, %634, !dbg !36 + %637 = extractelement <2 x i32> %632, i64 1, !dbg !36 + %638 = extractelement <2 x i32> %633, i64 1, !dbg !36 + %639 = icmp sge i32 %638, %637, !dbg !36 + %640 = icmp eq <2 x i32> %630, %631, !dbg !37 + %641 = icmp sgt <2 x i32> %630, %631, !dbg !37 + %642 = icmp ne <2 x i32> %632, %633, !dbg !37 + %643 = icmp sle <2 x i32> %632, %633, !dbg !37 + %shift = shufflevector <2 x i1> %640, <2 x i1> poison, <2 x i32> , !dbg !39 + %foldExtExtBinop = and <2 x i1> %shift, %641, !dbg !39 + %644 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !39 + %shift69 = shufflevector <2 x i1> %642, <2 x i1> poison, <2 x i32> , !dbg !40 + %foldExtExtBinop70 = or <2 x i1> %shift69, %643, !dbg !40 + %.not34 = extractelement <2 x i1> %foldExtExtBinop70, i64 0, !dbg !40 + %645 = or i1 %636, %644, !dbg !40 + %.not31 = and i1 %639, %.not34, !dbg !41 + %646 = xor i32 %634, %635, !dbg !33 + %647 = xor i32 %637, %638, !dbg !33 + %648 = select i1 %645, i32 %646, i32 0, !dbg !34 + %649 = select i1 %.not31, i32 %647, i32 0, !dbg !34 + %foldExtExtBinop72 = xor <2 x i32> %631, %630, !dbg !42 + %650 = extractelement <2 x i32> %foldExtExtBinop72, i64 0, !dbg !42 + %foldExtExtBinop74 = xor <2 x i32> %633, %632, !dbg !42 + %651 = extractelement <2 x i32> %foldExtExtBinop74, i64 0, !dbg !42 + %652 = select i1 %645, i32 %650, i32 0, !dbg !43 + %653 = select i1 %.not31, i32 %651, i32 0, !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %654 = insertelement <2 x i32> poison, i32 %648, i64 0, !dbg !35 + %655 = shufflevector <2 x i32> %654, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %656 = shufflevector <2 x i32> %630, <2 x i32> %631, <2 x i32> , !dbg !35 + %657 = xor <2 x i32> %655, %656, !dbg !35 + %658 = insertelement <2 x i32> poison, i32 %649, i64 0, !dbg !35 + %659 = shufflevector <2 x i32> %658, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %660 = shufflevector <2 x i32> %632, <2 x i32> %633, <2 x i32> , !dbg !35 + %661 = xor <2 x i32> %659, %660, !dbg !35 + %662 = insertelement <2 x i32> poison, i32 %652, i64 0, !dbg !44 + %663 = shufflevector <2 x i32> %662, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %664 = shufflevector <2 x i32> %631, <2 x i32> %630, <2 x i32> , !dbg !44 + %665 = xor <2 x i32> %663, %664, !dbg !44 + %666 = insertelement <2 x i32> poison, i32 %653, i64 0, !dbg !44 + %667 = shufflevector <2 x i32> %666, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %668 = shufflevector <2 x i32> %633, <2 x i32> %632, <2 x i32> , !dbg !44 + %669 = xor <2 x i32> %667, %668, !dbg !44 + %670 = extractelement <2 x i32> %657, i64 1, !dbg !29 + %671 = mul nuw nsw i32 %670, %51, !dbg !24 + %672 = extractelement <2 x i32> %657, i64 0, !dbg !29 + %673 = mul nuw nsw i32 %672, %51, !dbg !24 + %674 = extractelement <2 x i32> %661, i64 1, !dbg !29 + %675 = mul nuw nsw i32 %674, %51, !dbg !24 + %676 = extractelement <2 x i32> %661, i64 0, !dbg !29 + %677 = mul nuw nsw i32 %676, %51, !dbg !24 + %678 = insertelement <1 x i32> poison, i32 %671, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %678, i1 true) #5, !dbg !25 + %679 = insertelement <1 x i32> poison, i32 %673, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %679, i1 true) #5, !dbg !25 + %680 = insertelement <1 x i32> poison, i32 %675, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %680, i1 true) #5, !dbg !25 + %681 = insertelement <1 x i32> poison, i32 %677, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %681, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %682 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 1, i32 31), !dbg !25 + %684 = add i32 %683, %682, !dbg !28 + %685 = insertelement <1 x i32> poison, i32 %684, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %685, i1 %75) #5, !dbg !25 + %686 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 1, i32 31), !dbg !25 + %688 = add i32 %687, %686, !dbg !28 + %689 = insertelement <1 x i32> poison, i32 %688, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %689, i1 %75) #5, !dbg !25 + %690 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %691 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %690, i32 1, i32 31), !dbg !25 + %692 = add i32 %691, %690, !dbg !28 + %693 = insertelement <1 x i32> poison, i32 %692, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %693, i1 %75) #5, !dbg !25 + %694 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %694, i32 1, i32 31), !dbg !25 + %696 = add i32 %695, %694, !dbg !28 + %697 = insertelement <1 x i32> poison, i32 %696, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %697, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %698 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %699 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %700 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %701 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %702 = mul nuw nsw i32 %670, %.lobit, !dbg !29 + %703 = mul nuw nsw i32 %672, %.lobit, !dbg !29 + %704 = mul nuw nsw i32 %674, %.lobit, !dbg !29 + %705 = mul nuw nsw i32 %676, %.lobit, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %706 = insertelement <1 x i32> poison, i32 %702, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %706, i1 true) #5, !dbg !25 + %707 = insertelement <1 x i32> poison, i32 %703, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %707, i1 true) #5, !dbg !25 + %708 = insertelement <1 x i32> poison, i32 %704, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %708, i1 true) #5, !dbg !25 + %709 = insertelement <1 x i32> poison, i32 %705, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %709, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %710 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %711 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %710, i32 1, i32 31), !dbg !25 + %712 = add i32 %711, %710, !dbg !28 + %713 = insertelement <1 x i32> poison, i32 %712, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %713, i1 %75) #5, !dbg !25 + %714 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %715 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 1, i32 31), !dbg !25 + %716 = add i32 %715, %714, !dbg !28 + %717 = insertelement <1 x i32> poison, i32 %716, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %717, i1 %75) #5, !dbg !25 + %718 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !25 + %720 = add i32 %719, %718, !dbg !28 + %721 = insertelement <1 x i32> poison, i32 %720, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %721, i1 %75) #5, !dbg !25 + %722 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !25 + %724 = add i32 %723, %722, !dbg !28 + %725 = insertelement <1 x i32> poison, i32 %724, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %725, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %726 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %727 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %728 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %729 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %730 = extractelement <2 x i32> %665, i64 1, !dbg !31 + %731 = mul nuw nsw i32 %730, %51, !dbg !30 + %732 = extractelement <2 x i32> %665, i64 0, !dbg !31 + %733 = mul nuw nsw i32 %732, %51, !dbg !30 + %734 = extractelement <2 x i32> %669, i64 1, !dbg !31 + %735 = mul nuw nsw i32 %734, %51, !dbg !30 + %736 = extractelement <2 x i32> %669, i64 0, !dbg !31 + %737 = mul nuw nsw i32 %736, %51, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %738 = insertelement <1 x i32> poison, i32 %731, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %738, i1 true) #5, !dbg !25 + %739 = insertelement <1 x i32> poison, i32 %733, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %739, i1 true) #5, !dbg !25 + %740 = insertelement <1 x i32> poison, i32 %735, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %740, i1 true) #5, !dbg !25 + %741 = insertelement <1 x i32> poison, i32 %737, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %741, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %742 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %743 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %742, i32 1, i32 31), !dbg !25 + %744 = add i32 %743, %742, !dbg !28 + %745 = insertelement <1 x i32> poison, i32 %744, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %745, i1 %75) #5, !dbg !25 + %746 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 1, i32 31), !dbg !25 + %748 = add i32 %747, %746, !dbg !28 + %749 = insertelement <1 x i32> poison, i32 %748, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %749, i1 %75) #5, !dbg !25 + %750 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %751 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %750, i32 1, i32 31), !dbg !25 + %752 = add i32 %751, %750, !dbg !28 + %753 = insertelement <1 x i32> poison, i32 %752, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %753, i1 %75) #5, !dbg !25 + %754 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %755 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %754, i32 1, i32 31), !dbg !25 + %756 = add i32 %755, %754, !dbg !28 + %757 = insertelement <1 x i32> poison, i32 %756, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %757, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %758 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %759 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %760 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %761 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %762 = mul nuw nsw i32 %730, %.lobit, !dbg !31 + %763 = mul nuw nsw i32 %732, %.lobit, !dbg !31 + %764 = mul nuw nsw i32 %734, %.lobit, !dbg !31 + %765 = mul nuw nsw i32 %736, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %766 = insertelement <1 x i32> poison, i32 %762, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %766, i1 true) #5, !dbg !25 + %767 = insertelement <1 x i32> poison, i32 %763, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %767, i1 true) #5, !dbg !25 + %768 = insertelement <1 x i32> poison, i32 %764, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %768, i1 true) #5, !dbg !25 + %769 = insertelement <1 x i32> poison, i32 %765, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %769, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %770 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 1, i32 31), !dbg !25 + %772 = add i32 %771, %770, !dbg !28 + %773 = insertelement <1 x i32> poison, i32 %772, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %773, i1 %75) #5, !dbg !25 + %774 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %775 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %774, i32 1, i32 31), !dbg !25 + %776 = add i32 %775, %774, !dbg !28 + %777 = insertelement <1 x i32> poison, i32 %776, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %777, i1 %75) #5, !dbg !25 + %778 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 1, i32 31), !dbg !25 + %780 = add i32 %779, %778, !dbg !28 + %781 = insertelement <1 x i32> poison, i32 %780, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %781, i1 %75) #5, !dbg !25 + %782 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %783 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %782, i32 1, i32 31), !dbg !25 + %784 = add i32 %783, %782, !dbg !28 + %785 = insertelement <1 x i32> poison, i32 %784, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %785, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %786 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %787 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %788 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %789 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %790 = insertelement <2 x i32> poison, i32 %699, i64 0, !dbg !36 + %791 = insertelement <2 x i32> %790, i32 %698, i64 1, !dbg !36 + %792 = insertelement <2 x i32> poison, i32 %727, i64 0, !dbg !36 + %793 = insertelement <2 x i32> %792, i32 %726, i64 1, !dbg !36 + %794 = icmp slt <2 x i32> %791, %793, !dbg !36 + %795 = insertelement <2 x i32> poison, i32 %701, i64 0, !dbg !36 + %796 = insertelement <2 x i32> %795, i32 %700, i64 1, !dbg !36 + %797 = insertelement <2 x i32> poison, i32 %729, i64 0, !dbg !36 + %798 = insertelement <2 x i32> %797, i32 %728, i64 1, !dbg !36 + %799 = icmp sge <2 x i32> %796, %798, !dbg !36 + %800 = icmp eq <2 x i32> %791, %793, !dbg !37 + %801 = icmp ne <2 x i32> %796, %798, !dbg !37 + %802 = insertelement <2 x i32> poison, i32 %759, i64 0, !dbg !38 + %803 = insertelement <2 x i32> %802, i32 %758, i64 1, !dbg !38 + %804 = insertelement <2 x i32> poison, i32 %787, i64 0, !dbg !38 + %805 = insertelement <2 x i32> %804, i32 %786, i64 1, !dbg !38 + %806 = icmp sgt <2 x i32> %803, %805, !dbg !38 + %807 = insertelement <2 x i32> poison, i32 %761, i64 0, !dbg !38 + %808 = insertelement <2 x i32> %807, i32 %760, i64 1, !dbg !38 + %809 = insertelement <2 x i32> poison, i32 %789, i64 0, !dbg !38 + %810 = insertelement <2 x i32> %809, i32 %788, i64 1, !dbg !38 + %811 = icmp sle <2 x i32> %808, %810, !dbg !38 + %812 = and <2 x i1> %800, %806, !dbg !39 + %813 = or <2 x i1> %801, %811, !dbg !40 + %814 = or <2 x i1> %794, %812, !dbg !40 + %815 = and <2 x i1> %799, %813, !dbg !41 + %816 = xor <2 x i32> %793, %791, !dbg !33 + %817 = xor <2 x i32> %798, %796, !dbg !33 + %818 = select <2 x i1> %814, <2 x i32> %816, <2 x i32> zeroinitializer, !dbg !34 + %819 = select <2 x i1> %815, <2 x i32> %817, <2 x i32> zeroinitializer, !dbg !34 + %820 = xor <2 x i32> %818, %657, !dbg !35 + %821 = xor <2 x i32> %819, %661, !dbg !35 + %822 = xor <2 x i32> %805, %803, !dbg !42 + %823 = xor <2 x i32> %810, %808, !dbg !42 + %824 = select <2 x i1> %814, <2 x i32> %822, <2 x i32> zeroinitializer, !dbg !43 + %825 = select <2 x i1> %815, <2 x i32> %823, <2 x i32> zeroinitializer, !dbg !43 + %826 = xor <2 x i32> %824, %665, !dbg !44 + %827 = xor <2 x i32> %825, %669, !dbg !44 + %828 = extractelement <2 x i32> %820, i64 1, !dbg !29 + %829 = mul nuw nsw i32 %828, %50, !dbg !24 + %830 = extractelement <2 x i32> %820, i64 0, !dbg !29 + %831 = mul nuw nsw i32 %830, %50, !dbg !24 + %832 = extractelement <2 x i32> %821, i64 1, !dbg !29 + %833 = mul nuw nsw i32 %832, %50, !dbg !24 + %834 = extractelement <2 x i32> %821, i64 0, !dbg !29 + %835 = mul nuw nsw i32 %834, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %836 = insertelement <1 x i32> poison, i32 %829, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %836, i1 true) #5, !dbg !25 + %837 = insertelement <1 x i32> poison, i32 %831, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %837, i1 true) #5, !dbg !25 + %838 = insertelement <1 x i32> poison, i32 %833, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %838, i1 true) #5, !dbg !25 + %839 = insertelement <1 x i32> poison, i32 %835, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %839, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %840 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 1, i32 31), !dbg !25 + %842 = add i32 %841, %840, !dbg !28 + %843 = insertelement <1 x i32> poison, i32 %842, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %843, i1 %75) #5, !dbg !25 + %844 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %845 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %844, i32 1, i32 31), !dbg !25 + %846 = add i32 %845, %844, !dbg !28 + %847 = insertelement <1 x i32> poison, i32 %846, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %847, i1 %75) #5, !dbg !25 + %848 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %849 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %848, i32 1, i32 31), !dbg !25 + %850 = add i32 %849, %848, !dbg !28 + %851 = insertelement <1 x i32> poison, i32 %850, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %851, i1 %75) #5, !dbg !25 + %852 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %853 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %852, i32 1, i32 31), !dbg !25 + %854 = add i32 %853, %852, !dbg !28 + %855 = insertelement <1 x i32> poison, i32 %854, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %855, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %856 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %857 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %858 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %859 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %860 = mul nuw nsw i32 %828, %.lobit1, !dbg !29 + %861 = mul nuw nsw i32 %830, %.lobit1, !dbg !29 + %862 = mul nuw nsw i32 %832, %.lobit1, !dbg !29 + %863 = mul nuw nsw i32 %834, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %864 = insertelement <1 x i32> poison, i32 %860, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %864, i1 true) #5, !dbg !25 + %865 = insertelement <1 x i32> poison, i32 %861, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %865, i1 true) #5, !dbg !25 + %866 = insertelement <1 x i32> poison, i32 %862, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %866, i1 true) #5, !dbg !25 + %867 = insertelement <1 x i32> poison, i32 %863, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %867, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %868 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %869 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %868, i32 1, i32 31), !dbg !25 + %870 = add i32 %869, %868, !dbg !28 + %871 = insertelement <1 x i32> poison, i32 %870, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %871, i1 %75) #5, !dbg !25 + %872 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %872, i32 1, i32 31), !dbg !25 + %874 = add i32 %873, %872, !dbg !28 + %875 = insertelement <1 x i32> poison, i32 %874, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %875, i1 %75) #5, !dbg !25 + %876 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 1, i32 31), !dbg !25 + %878 = add i32 %877, %876, !dbg !28 + %879 = insertelement <1 x i32> poison, i32 %878, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %879, i1 %75) #5, !dbg !25 + %880 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %881 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %880, i32 1, i32 31), !dbg !25 + %882 = add i32 %881, %880, !dbg !28 + %883 = insertelement <1 x i32> poison, i32 %882, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %883, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %884 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %885 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %886 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %887 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %888 = extractelement <2 x i32> %826, i64 1, !dbg !31 + %889 = mul nuw nsw i32 %888, %50, !dbg !30 + %890 = extractelement <2 x i32> %826, i64 0, !dbg !31 + %891 = mul nuw nsw i32 %890, %50, !dbg !30 + %892 = extractelement <2 x i32> %827, i64 1, !dbg !31 + %893 = mul nuw nsw i32 %892, %50, !dbg !30 + %894 = extractelement <2 x i32> %827, i64 0, !dbg !31 + %895 = mul nuw nsw i32 %894, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %896 = insertelement <1 x i32> poison, i32 %889, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %896, i1 true) #5, !dbg !25 + %897 = insertelement <1 x i32> poison, i32 %891, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %897, i1 true) #5, !dbg !25 + %898 = insertelement <1 x i32> poison, i32 %893, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %898, i1 true) #5, !dbg !25 + %899 = insertelement <1 x i32> poison, i32 %895, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %899, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %900 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %901 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %900, i32 1, i32 31), !dbg !25 + %902 = add i32 %901, %900, !dbg !28 + %903 = insertelement <1 x i32> poison, i32 %902, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %903, i1 %75) #5, !dbg !25 + %904 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %905 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %904, i32 1, i32 31), !dbg !25 + %906 = add i32 %905, %904, !dbg !28 + %907 = insertelement <1 x i32> poison, i32 %906, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %907, i1 %75) #5, !dbg !25 + %908 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %909 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %908, i32 1, i32 31), !dbg !25 + %910 = add i32 %909, %908, !dbg !28 + %911 = insertelement <1 x i32> poison, i32 %910, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %911, i1 %75) #5, !dbg !25 + %912 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %913 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %912, i32 1, i32 31), !dbg !25 + %914 = add i32 %913, %912, !dbg !28 + %915 = insertelement <1 x i32> poison, i32 %914, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %915, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %916 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %917 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %918 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %919 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %920 = mul nuw nsw i32 %888, %.lobit1, !dbg !31 + %921 = mul nuw nsw i32 %890, %.lobit1, !dbg !31 + %922 = mul nuw nsw i32 %892, %.lobit1, !dbg !31 + %923 = mul nuw nsw i32 %894, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %924 = insertelement <1 x i32> poison, i32 %920, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %924, i1 true) #5, !dbg !25 + %925 = insertelement <1 x i32> poison, i32 %921, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %925, i1 true) #5, !dbg !25 + %926 = insertelement <1 x i32> poison, i32 %922, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %926, i1 true) #5, !dbg !25 + %927 = insertelement <1 x i32> poison, i32 %923, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %927, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %928 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %929 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %928, i32 1, i32 31), !dbg !25 + %930 = add i32 %929, %928, !dbg !28 + %931 = insertelement <1 x i32> poison, i32 %930, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %931, i1 %75) #5, !dbg !25 + %932 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %933 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %932, i32 1, i32 31), !dbg !25 + %934 = add i32 %933, %932, !dbg !28 + %935 = insertelement <1 x i32> poison, i32 %934, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %935, i1 %75) #5, !dbg !25 + %936 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %937 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %936, i32 1, i32 31), !dbg !25 + %938 = add i32 %937, %936, !dbg !28 + %939 = insertelement <1 x i32> poison, i32 %938, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %939, i1 %75) #5, !dbg !25 + %940 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %941 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %940, i32 1, i32 31), !dbg !25 + %942 = add i32 %941, %940, !dbg !28 + %943 = insertelement <1 x i32> poison, i32 %942, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %943, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %944 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %945 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %946 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %947 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %948 = insertelement <2 x i32> poison, i32 %857, i64 0, !dbg !36 + %949 = insertelement <2 x i32> %948, i32 %856, i64 1, !dbg !36 + %950 = insertelement <2 x i32> poison, i32 %885, i64 0, !dbg !36 + %951 = insertelement <2 x i32> %950, i32 %884, i64 1, !dbg !36 + %952 = icmp slt <2 x i32> %949, %951, !dbg !36 + %953 = insertelement <2 x i32> poison, i32 %859, i64 0, !dbg !36 + %954 = insertelement <2 x i32> %953, i32 %858, i64 1, !dbg !36 + %955 = insertelement <2 x i32> poison, i32 %887, i64 0, !dbg !36 + %956 = insertelement <2 x i32> %955, i32 %886, i64 1, !dbg !36 + %957 = icmp sge <2 x i32> %954, %956, !dbg !36 + %958 = icmp eq <2 x i32> %949, %951, !dbg !37 + %959 = icmp ne <2 x i32> %954, %956, !dbg !37 + %960 = insertelement <2 x i32> poison, i32 %917, i64 0, !dbg !38 + %961 = insertelement <2 x i32> %960, i32 %916, i64 1, !dbg !38 + %962 = insertelement <2 x i32> poison, i32 %945, i64 0, !dbg !38 + %963 = insertelement <2 x i32> %962, i32 %944, i64 1, !dbg !38 + %964 = icmp sgt <2 x i32> %961, %963, !dbg !38 + %965 = insertelement <2 x i32> poison, i32 %919, i64 0, !dbg !38 + %966 = insertelement <2 x i32> %965, i32 %918, i64 1, !dbg !38 + %967 = insertelement <2 x i32> poison, i32 %947, i64 0, !dbg !38 + %968 = insertelement <2 x i32> %967, i32 %946, i64 1, !dbg !38 + %969 = icmp sle <2 x i32> %966, %968, !dbg !38 + %970 = and <2 x i1> %958, %964, !dbg !39 + %971 = or <2 x i1> %959, %969, !dbg !40 + %972 = or <2 x i1> %952, %970, !dbg !40 + %973 = and <2 x i1> %957, %971, !dbg !41 + %974 = xor <2 x i32> %951, %949, !dbg !33 + %975 = xor <2 x i32> %956, %954, !dbg !33 + %976 = select <2 x i1> %972, <2 x i32> %974, <2 x i32> zeroinitializer, !dbg !34 + %977 = select <2 x i1> %973, <2 x i32> %975, <2 x i32> zeroinitializer, !dbg !34 + %978 = xor <2 x i32> %976, %820, !dbg !35 + %979 = xor <2 x i32> %977, %821, !dbg !35 + %980 = xor <2 x i32> %963, %961, !dbg !42 + %981 = xor <2 x i32> %968, %966, !dbg !42 + %982 = select <2 x i1> %972, <2 x i32> %980, <2 x i32> zeroinitializer, !dbg !43 + %983 = select <2 x i1> %973, <2 x i32> %981, <2 x i32> zeroinitializer, !dbg !43 + %984 = xor <2 x i32> %982, %826, !dbg !44 + %985 = xor <2 x i32> %983, %827, !dbg !44 + %986 = icmp slt <2 x i32> %978, %979, !dbg !36 + %987 = icmp eq <2 x i32> %978, %979, !dbg !37 + %988 = icmp sgt <2 x i32> %984, %985, !dbg !38 + %989 = and <2 x i1> %987, %988, !dbg !39 + %990 = or <2 x i1> %986, %989, !dbg !40 + %991 = xor <2 x i32> %979, %978, !dbg !33 + %992 = select <2 x i1> %990, <2 x i32> %991, <2 x i32> zeroinitializer, !dbg !34 + %foldExtExtBinop76 = xor <2 x i32> %992, %978, !dbg !35 + %993 = extractelement <2 x i32> %foldExtExtBinop76, i64 1, !dbg !35 + %foldExtExtBinop78 = xor <2 x i32> %992, %978, !dbg !35 + %994 = extractelement <2 x i32> %foldExtExtBinop78, i64 0, !dbg !35 + %995 = xor <2 x i32> %992, %979, !dbg !35 + %996 = extractelement <2 x i32> %995, i64 0, !dbg !36 + %997 = extractelement <2 x i32> %995, i64 1, !dbg !36 + %998 = xor i32 %994, %993, !dbg !33 + %999 = xor i32 %996, %997, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1000 = xor <2 x i32> %985, %984, !dbg !42 + %1001 = select <2 x i1> %990, <2 x i32> %1000, <2 x i32> zeroinitializer, !dbg !43 + %1002 = shufflevector <2 x i32> %1001, <2 x i32> poison, <4 x i32> , !dbg !44 + %1003 = shufflevector <2 x i32> %984, <2 x i32> %985, <4 x i32> , !dbg !44 + %1004 = xor <4 x i32> %1002, %1003, !dbg !44 + %1005 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> , !dbg !44 + %1006 = shufflevector <2 x i32> %984, <2 x i32> %985, <2 x i32> , !dbg !44 + %1007 = xor <2 x i32> %1005, %1006, !dbg !44 + %1008 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1009 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !44 + %1010 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1011 = xor <2 x i32> %1009, %1010, !dbg !44 + %1012 = shufflevector <2 x i32> %1001, <2 x i32> poison, <2 x i32> , !dbg !44 + %1013 = shufflevector <2 x i32> %985, <2 x i32> %984, <2 x i32> , !dbg !44 + %1014 = xor <2 x i32> %1012, %1013, !dbg !44 + %1015 = shufflevector <2 x i32> %995, <2 x i32> %foldExtExtBinop76, <2 x i32> , !dbg !36 + %1016 = shufflevector <2 x i32> %995, <2 x i32> %foldExtExtBinop78, <2 x i32> , !dbg !36 + %1017 = icmp slt <2 x i32> %1015, %1016, !dbg !36 + %1018 = icmp eq <2 x i32> %1015, %1016, !dbg !37 + %1019 = icmp sgt <2 x i32> %1014, %1011, !dbg !38 + %1020 = and <2 x i1> %1018, %1019, !dbg !39 + %1021 = or <2 x i1> %1017, %1020, !dbg !40 + %1022 = extractelement <2 x i1> %1021, i64 1, !dbg !34 + %1023 = select i1 %1022, i32 %998, i32 0, !dbg !34 + %1024 = extractelement <2 x i1> %1021, i64 0, !dbg !34 + %1025 = select i1 %1024, i32 %999, i32 0, !dbg !34 + %1026 = xor i32 %1023, %993, !dbg !35 + %1027 = xor i32 %1023, %994, !dbg !35 + %1028 = xor i32 %1025, %997, !dbg !35 + %1029 = xor i32 %1025, %996, !dbg !35 + %1030 = xor <2 x i32> %1008, %1007, !dbg !42 + %1031 = xor <2 x i32> %1030, %1001, !dbg !42 + %1032 = select <2 x i1> %1021, <2 x i32> %1031, <2 x i32> zeroinitializer, !dbg !43 + %1033 = shufflevector <2 x i32> %1032, <2 x i32> poison, <4 x i32> , !dbg !43 + %1034 = xor <4 x i32> %1033, %1004, !dbg !44 + %1035 = mul nuw nsw i32 %1026, %51, !dbg !24 + %1036 = mul nuw nsw i32 %1027, %51, !dbg !24 + %1037 = mul nuw nsw i32 %1028, %51, !dbg !24 + %1038 = mul nuw nsw i32 %1029, %51, !dbg !24 + %1039 = insertelement <1 x i32> poison, i32 %1035, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1039, i1 true) #5, !dbg !25 + %1040 = insertelement <1 x i32> poison, i32 %1036, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1040, i1 true) #5, !dbg !25 + %1041 = insertelement <1 x i32> poison, i32 %1037, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1041, i1 true) #5, !dbg !25 + %1042 = insertelement <1 x i32> poison, i32 %1038, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1042, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1043 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1044 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1043, i32 1, i32 31), !dbg !25 + %1045 = add i32 %1044, %1043, !dbg !28 + %1046 = insertelement <1 x i32> poison, i32 %1045, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1046, i1 %75) #5, !dbg !25 + %1047 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1048 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1047, i32 1, i32 31), !dbg !25 + %1049 = add i32 %1048, %1047, !dbg !28 + %1050 = insertelement <1 x i32> poison, i32 %1049, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1050, i1 %75) #5, !dbg !25 + %1051 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1052 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1051, i32 1, i32 31), !dbg !25 + %1053 = add i32 %1052, %1051, !dbg !28 + %1054 = insertelement <1 x i32> poison, i32 %1053, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1054, i1 %75) #5, !dbg !25 + %1055 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1056 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1055, i32 1, i32 31), !dbg !25 + %1057 = add i32 %1056, %1055, !dbg !28 + %1058 = insertelement <1 x i32> poison, i32 %1057, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1058, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1059 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1060 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1061 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1062 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1063 = mul nuw nsw i32 %1026, %.lobit, !dbg !29 + %1064 = mul nuw nsw i32 %1027, %.lobit, !dbg !29 + %1065 = mul nuw nsw i32 %1028, %.lobit, !dbg !29 + %1066 = mul nuw nsw i32 %1029, %.lobit, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1067 = insertelement <1 x i32> poison, i32 %1063, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1067, i1 true) #5, !dbg !25 + %1068 = insertelement <1 x i32> poison, i32 %1064, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1068, i1 true) #5, !dbg !25 + %1069 = insertelement <1 x i32> poison, i32 %1065, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1069, i1 true) #5, !dbg !25 + %1070 = insertelement <1 x i32> poison, i32 %1066, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1070, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1071 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1072 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1071, i32 1, i32 31), !dbg !25 + %1073 = add i32 %1072, %1071, !dbg !28 + %1074 = insertelement <1 x i32> poison, i32 %1073, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1074, i1 %75) #5, !dbg !25 + %1075 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1076 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1075, i32 1, i32 31), !dbg !25 + %1077 = add i32 %1076, %1075, !dbg !28 + %1078 = insertelement <1 x i32> poison, i32 %1077, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1078, i1 %75) #5, !dbg !25 + %1079 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1080 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1079, i32 1, i32 31), !dbg !25 + %1081 = add i32 %1080, %1079, !dbg !28 + %1082 = insertelement <1 x i32> poison, i32 %1081, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1082, i1 %75) #5, !dbg !25 + %1083 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1084 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1083, i32 1, i32 31), !dbg !25 + %1085 = add i32 %1084, %1083, !dbg !28 + %1086 = insertelement <1 x i32> poison, i32 %1085, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1086, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1087 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1088 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1089 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1090 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1091 = extractelement <4 x i32> %1034, i64 0, !dbg !31 + %1092 = mul nuw nsw i32 %1091, %51, !dbg !30 + %1093 = extractelement <4 x i32> %1034, i64 1, !dbg !31 + %1094 = mul nuw nsw i32 %1093, %51, !dbg !30 + %1095 = extractelement <4 x i32> %1034, i64 2, !dbg !31 + %1096 = mul nuw nsw i32 %1095, %51, !dbg !30 + %1097 = extractelement <4 x i32> %1034, i64 3, !dbg !31 + %1098 = mul nuw nsw i32 %1097, %51, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1099 = insertelement <1 x i32> poison, i32 %1092, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1099, i1 true) #5, !dbg !25 + %1100 = insertelement <1 x i32> poison, i32 %1094, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1100, i1 true) #5, !dbg !25 + %1101 = insertelement <1 x i32> poison, i32 %1096, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1101, i1 true) #5, !dbg !25 + %1102 = insertelement <1 x i32> poison, i32 %1098, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1102, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1103 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1103, i32 1, i32 31), !dbg !25 + %1105 = add i32 %1104, %1103, !dbg !28 + %1106 = insertelement <1 x i32> poison, i32 %1105, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1106, i1 %75) #5, !dbg !25 + %1107 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1107, i32 1, i32 31), !dbg !25 + %1109 = add i32 %1108, %1107, !dbg !28 + %1110 = insertelement <1 x i32> poison, i32 %1109, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1110, i1 %75) #5, !dbg !25 + %1111 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1111, i32 1, i32 31), !dbg !25 + %1113 = add i32 %1112, %1111, !dbg !28 + %1114 = insertelement <1 x i32> poison, i32 %1113, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1114, i1 %75) #5, !dbg !25 + %1115 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1115, i32 1, i32 31), !dbg !25 + %1117 = add i32 %1116, %1115, !dbg !28 + %1118 = insertelement <1 x i32> poison, i32 %1117, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1118, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1119 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1120 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1121 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1122 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1123 = mul nuw nsw i32 %1091, %.lobit, !dbg !31 + %1124 = mul nuw nsw i32 %1093, %.lobit, !dbg !31 + %1125 = mul nuw nsw i32 %1095, %.lobit, !dbg !31 + %1126 = mul nuw nsw i32 %1097, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1127 = insertelement <1 x i32> poison, i32 %1123, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %188, <1 x i32> %1127, i1 true) #5, !dbg !25 + %1128 = insertelement <1 x i32> poison, i32 %1124, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %191, <1 x i32> %1128, i1 true) #5, !dbg !25 + %1129 = insertelement <1 x i32> poison, i32 %1125, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %194, <1 x i32> %1129, i1 true) #5, !dbg !25 + %1130 = insertelement <1 x i32> poison, i32 %1126, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %197, <1 x i32> %1130, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1131 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1131, i32 1, i32 31), !dbg !25 + %1133 = add i32 %1132, %1131, !dbg !28 + %1134 = insertelement <1 x i32> poison, i32 %1133, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1134, i1 %75) #5, !dbg !25 + %1135 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1135, i32 1, i32 31), !dbg !25 + %1137 = add i32 %1136, %1135, !dbg !28 + %1138 = insertelement <1 x i32> poison, i32 %1137, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1138, i1 %75) #5, !dbg !25 + %1139 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1139, i32 1, i32 31), !dbg !25 + %1141 = add i32 %1140, %1139, !dbg !28 + %1142 = insertelement <1 x i32> poison, i32 %1141, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1142, i1 %75) #5, !dbg !25 + %1143 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1143, i32 1, i32 31), !dbg !25 + %1145 = add i32 %1144, %1143, !dbg !28 + %1146 = insertelement <1 x i32> poison, i32 %1145, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1146, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1147 = load i32, ptr addrspace(3) %187, align 16, !dbg !25 + %1148 = load i32, ptr addrspace(3) %190, align 8, !dbg !25 + %1149 = load i32, ptr addrspace(3) %193, align 16, !dbg !25 + %1150 = load i32, ptr addrspace(3) %196, align 8, !dbg !25 + %1151 = insertelement <4 x i32> poison, i32 %1059, i64 0, !dbg !36 + %1152 = insertelement <4 x i32> %1151, i32 %1060, i64 1, !dbg !36 + %1153 = insertelement <4 x i32> %1152, i32 %1061, i64 2, !dbg !36 + %1154 = insertelement <4 x i32> %1153, i32 %1062, i64 3, !dbg !36 + %1155 = insertelement <4 x i32> poison, i32 %1087, i64 0, !dbg !36 + %1156 = insertelement <4 x i32> %1155, i32 %1088, i64 1, !dbg !36 + %1157 = insertelement <4 x i32> %1156, i32 %1089, i64 2, !dbg !36 + %1158 = insertelement <4 x i32> %1157, i32 %1090, i64 3, !dbg !36 + %1159 = icmp slt <4 x i32> %1154, %1158, !dbg !36 + %1160 = icmp eq <4 x i32> %1154, %1158, !dbg !37 + %1161 = insertelement <4 x i32> poison, i32 %1119, i64 0, !dbg !38 + %1162 = insertelement <4 x i32> %1161, i32 %1120, i64 1, !dbg !38 + %1163 = insertelement <4 x i32> %1162, i32 %1121, i64 2, !dbg !38 + %1164 = insertelement <4 x i32> %1163, i32 %1122, i64 3, !dbg !38 + %1165 = insertelement <4 x i32> poison, i32 %1147, i64 0, !dbg !38 + %1166 = insertelement <4 x i32> %1165, i32 %1148, i64 1, !dbg !38 + %1167 = insertelement <4 x i32> %1166, i32 %1149, i64 2, !dbg !38 + %1168 = insertelement <4 x i32> %1167, i32 %1150, i64 3, !dbg !38 + %1169 = icmp sgt <4 x i32> %1164, %1168, !dbg !38 + %1170 = and <4 x i1> %1160, %1169, !dbg !39 + %1171 = or <4 x i1> %1159, %1170, !dbg !40 + %1172 = xor i32 %1087, %1059, !dbg !33 + %1173 = xor i32 %1088, %1060, !dbg !33 + %1174 = xor i32 %1089, %1061, !dbg !33 + %1175 = xor i32 %1090, %1062, !dbg !33 + %1176 = extractelement <4 x i1> %1171, i64 0, !dbg !34 + %1177 = select i1 %1176, i32 %1172, i32 0, !dbg !34 + %1178 = extractelement <4 x i1> %1171, i64 1, !dbg !34 + %1179 = select i1 %1178, i32 %1173, i32 0, !dbg !34 + %1180 = extractelement <4 x i1> %1171, i64 2, !dbg !34 + %1181 = select i1 %1180, i32 %1174, i32 0, !dbg !34 + %1182 = extractelement <4 x i1> %1171, i64 3, !dbg !34 + %1183 = select i1 %1182, i32 %1175, i32 0, !dbg !34 + %1184 = xor i32 %1177, %1026, !dbg !35 + %1185 = xor i32 %1179, %1027, !dbg !35 + %1186 = xor i32 %1181, %1028, !dbg !35 + %1187 = xor i32 %1183, %1029, !dbg !35 + %1188 = xor <4 x i32> %1168, %1164, !dbg !42 + %1189 = select <4 x i1> %1171, <4 x i32> %1188, <4 x i32> zeroinitializer, !dbg !43 + %1190 = xor <4 x i32> %1189, %1034, !dbg !44 + %1191 = mul nuw nsw i32 %1184, %50, !dbg !24 + %1192 = mul nuw nsw i32 %1185, %50, !dbg !24 + %1193 = mul nuw nsw i32 %1186, %50, !dbg !24 + %1194 = mul nuw nsw i32 %1187, %50, !dbg !24 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1195 = insertelement <1 x i32> poison, i32 %1191, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1195, i1 true) #5, !dbg !25 + %1196 = insertelement <1 x i32> poison, i32 %1192, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1196, i1 true) #5, !dbg !25 + %1197 = insertelement <1 x i32> poison, i32 %1193, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1197, i1 true) #5, !dbg !25 + %1198 = insertelement <1 x i32> poison, i32 %1194, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1198, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1199 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1199, i32 1, i32 31), !dbg !25 + %1201 = add i32 %1200, %1199, !dbg !28 + %1202 = insertelement <1 x i32> poison, i32 %1201, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1202, i1 %75) #5, !dbg !25 + %1203 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1203, i32 1, i32 31), !dbg !25 + %1205 = add i32 %1204, %1203, !dbg !28 + %1206 = insertelement <1 x i32> poison, i32 %1205, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1206, i1 %75) #5, !dbg !25 + %1207 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1207, i32 1, i32 31), !dbg !25 + %1209 = add i32 %1208, %1207, !dbg !28 + %1210 = insertelement <1 x i32> poison, i32 %1209, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1210, i1 %75) #5, !dbg !25 + %1211 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1211, i32 1, i32 31), !dbg !25 + %1213 = add i32 %1212, %1211, !dbg !28 + %1214 = insertelement <1 x i32> poison, i32 %1213, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1214, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1215 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1216 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1217 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1218 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1219 = mul nuw nsw i32 %1184, %.lobit1, !dbg !29 + %1220 = mul nuw nsw i32 %1185, %.lobit1, !dbg !29 + %1221 = mul nuw nsw i32 %1186, %.lobit1, !dbg !29 + %1222 = mul nuw nsw i32 %1187, %.lobit1, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1223 = insertelement <1 x i32> poison, i32 %1219, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1223, i1 true) #5, !dbg !25 + %1224 = insertelement <1 x i32> poison, i32 %1220, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1224, i1 true) #5, !dbg !25 + %1225 = insertelement <1 x i32> poison, i32 %1221, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1225, i1 true) #5, !dbg !25 + %1226 = insertelement <1 x i32> poison, i32 %1222, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1226, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1227 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1227, i32 1, i32 31), !dbg !25 + %1229 = add i32 %1228, %1227, !dbg !28 + %1230 = insertelement <1 x i32> poison, i32 %1229, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1230, i1 %75) #5, !dbg !25 + %1231 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1231, i32 1, i32 31), !dbg !25 + %1233 = add i32 %1232, %1231, !dbg !28 + %1234 = insertelement <1 x i32> poison, i32 %1233, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1234, i1 %75) #5, !dbg !25 + %1235 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1235, i32 1, i32 31), !dbg !25 + %1237 = add i32 %1236, %1235, !dbg !28 + %1238 = insertelement <1 x i32> poison, i32 %1237, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1238, i1 %75) #5, !dbg !25 + %1239 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1239, i32 1, i32 31), !dbg !25 + %1241 = add i32 %1240, %1239, !dbg !28 + %1242 = insertelement <1 x i32> poison, i32 %1241, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1242, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1243 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1244 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1245 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1246 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1247 = extractelement <4 x i32> %1190, i64 0, !dbg !31 + %1248 = mul nuw nsw i32 %1247, %50, !dbg !30 + %1249 = extractelement <4 x i32> %1190, i64 1, !dbg !31 + %1250 = mul nuw nsw i32 %1249, %50, !dbg !30 + %1251 = extractelement <4 x i32> %1190, i64 2, !dbg !31 + %1252 = mul nuw nsw i32 %1251, %50, !dbg !30 + %1253 = extractelement <4 x i32> %1190, i64 3, !dbg !31 + %1254 = mul nuw nsw i32 %1253, %50, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1255 = insertelement <1 x i32> poison, i32 %1248, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1255, i1 true) #5, !dbg !25 + %1256 = insertelement <1 x i32> poison, i32 %1250, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1256, i1 true) #5, !dbg !25 + %1257 = insertelement <1 x i32> poison, i32 %1252, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1257, i1 true) #5, !dbg !25 + %1258 = insertelement <1 x i32> poison, i32 %1254, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1258, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1259, i32 1, i32 31), !dbg !25 + %1261 = add i32 %1260, %1259, !dbg !28 + %1262 = insertelement <1 x i32> poison, i32 %1261, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1262, i1 %75) #5, !dbg !25 + %1263 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1263, i32 1, i32 31), !dbg !25 + %1265 = add i32 %1264, %1263, !dbg !28 + %1266 = insertelement <1 x i32> poison, i32 %1265, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1266, i1 %75) #5, !dbg !25 + %1267 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1268 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1267, i32 1, i32 31), !dbg !25 + %1269 = add i32 %1268, %1267, !dbg !28 + %1270 = insertelement <1 x i32> poison, i32 %1269, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1270, i1 %75) #5, !dbg !25 + %1271 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1272 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1271, i32 1, i32 31), !dbg !25 + %1273 = add i32 %1272, %1271, !dbg !28 + %1274 = insertelement <1 x i32> poison, i32 %1273, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1274, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1275 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1276 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1277 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1278 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1279 = mul nuw nsw i32 %1247, %.lobit1, !dbg !31 + %1280 = mul nuw nsw i32 %1249, %.lobit1, !dbg !31 + %1281 = mul nuw nsw i32 %1251, %.lobit1, !dbg !31 + %1282 = mul nuw nsw i32 %1253, %.lobit1, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1283 = insertelement <1 x i32> poison, i32 %1279, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %58, <1 x i32> %1283, i1 true) #5, !dbg !25 + %1284 = insertelement <1 x i32> poison, i32 %1280, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %1284, i1 true) #5, !dbg !25 + %1285 = insertelement <1 x i32> poison, i32 %1281, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %1285, i1 true) #5, !dbg !25 + %1286 = insertelement <1 x i32> poison, i32 %1282, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %67, <1 x i32> %1286, i1 true) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1287 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %70, i1 %69) #5, !dbg !25 + %1288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1287, i32 1, i32 31), !dbg !25 + %1289 = add i32 %1288, %1287, !dbg !28 + %1290 = insertelement <1 x i32> poison, i32 %1289, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, <1 x i32> %1290, i1 %75) #5, !dbg !25 + %1291 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %77, i1 %69) #5, !dbg !25 + %1292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1291, i32 1, i32 31), !dbg !25 + %1293 = add i32 %1292, %1291, !dbg !28 + %1294 = insertelement <1 x i32> poison, i32 %1293, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %77, <1 x i32> %1294, i1 %75) #5, !dbg !25 + %1295 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %82, i1 %69) #5, !dbg !25 + %1296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1295, i32 1, i32 31), !dbg !25 + %1297 = add i32 %1296, %1295, !dbg !28 + %1298 = insertelement <1 x i32> poison, i32 %1297, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1298, i1 %75) #5, !dbg !25 + %1299 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %87, i1 %69) #5, !dbg !25 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1299, i32 1, i32 31), !dbg !25 + %1301 = add i32 %1300, %1299, !dbg !28 + %1302 = insertelement <1 x i32> poison, i32 %1301, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1302, i1 %75) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %1303 = load i32, ptr addrspace(3) %57, align 8, !dbg !25 + %1304 = load i32, ptr addrspace(3) %60, align 8, !dbg !25 + %1305 = load i32, ptr addrspace(3) %63, align 8, !dbg !25 + %1306 = load i32, ptr addrspace(3) %66, align 8, !dbg !25 + %1307 = insertelement <4 x i32> poison, i32 %1215, i64 0, !dbg !36 + %1308 = insertelement <4 x i32> %1307, i32 %1216, i64 1, !dbg !36 + %1309 = insertelement <4 x i32> %1308, i32 %1217, i64 2, !dbg !36 + %1310 = insertelement <4 x i32> %1309, i32 %1218, i64 3, !dbg !36 + %1311 = insertelement <4 x i32> poison, i32 %1243, i64 0, !dbg !36 + %1312 = insertelement <4 x i32> %1311, i32 %1244, i64 1, !dbg !36 + %1313 = insertelement <4 x i32> %1312, i32 %1245, i64 2, !dbg !36 + %1314 = insertelement <4 x i32> %1313, i32 %1246, i64 3, !dbg !36 + %1315 = icmp slt <4 x i32> %1310, %1314, !dbg !36 + %1316 = icmp eq <4 x i32> %1310, %1314, !dbg !37 + %1317 = insertelement <4 x i32> poison, i32 %1275, i64 0, !dbg !38 + %1318 = insertelement <4 x i32> %1317, i32 %1276, i64 1, !dbg !38 + %1319 = insertelement <4 x i32> %1318, i32 %1277, i64 2, !dbg !38 + %1320 = insertelement <4 x i32> %1319, i32 %1278, i64 3, !dbg !38 + %1321 = insertelement <4 x i32> poison, i32 %1303, i64 0, !dbg !38 + %1322 = insertelement <4 x i32> %1321, i32 %1304, i64 1, !dbg !38 + %1323 = insertelement <4 x i32> %1322, i32 %1305, i64 2, !dbg !38 + %1324 = insertelement <4 x i32> %1323, i32 %1306, i64 3, !dbg !38 + %1325 = icmp sgt <4 x i32> %1320, %1324, !dbg !38 + %1326 = and <4 x i1> %1316, %1325, !dbg !39 + %1327 = or <4 x i1> %1315, %1326, !dbg !40 + %1328 = xor <4 x i32> %1324, %1320, !dbg !42 + %1329 = select <4 x i1> %1327, <4 x i32> %1328, <4 x i32> zeroinitializer, !dbg !43 + %1330 = xor <4 x i32> %1329, %1190, !dbg !44 + %1331 = insertelement <4 x i1> poison, i1 %17, i64 0, !dbg !45 + %1332 = shufflevector <4 x i1> %1331, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !45 + %1333 = insertelement <4 x i32> poison, i32 %46, i64 0, !dbg !45 + %1334 = insertelement <4 x i32> %1333, i32 %45, i64 1, !dbg !45 + %1335 = insertelement <4 x i32> %1334, i32 %47, i64 2, !dbg !45 + %1336 = insertelement <4 x i32> %1335, i32 %48, i64 3, !dbg !45 + %1337 = sext <4 x i32> %1336 to <4 x i64>, !dbg !45 + %1338 = select <4 x i1> %1332, <4 x i64> %1337, <4 x i64> zeroinitializer, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1339 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1338), !dbg !48 + %1340 = and i32 %11, 3, !dbg !46 + %1341 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %181, !dbg !46 + %1342 = getelementptr i64, ptr addrspace(3) %1341, i32 %1340, !dbg !46 + %1343 = insertelement <1 x i64> poison, i64 %1339, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1342, <1 x i64> %1343, i1 true) #5, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1344 = icmp samesign ult i32 %10, 128, !dbg !46 + %1345 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !46 + %1346 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %1345, i1 %1344) #5, !dbg !46 + %extelt.offset = lshr i64 %1346, 32, !dbg !46 + %1347 = trunc nuw i64 %extelt.offset to i32, !dbg !46 + %1348 = trunc i64 %1346 to i32, !dbg !46 + %1349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1348, i32 2, i32 31), !dbg !46 + %1350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1347, i32 2, i32 31), !dbg !46 + %1351 = insertelement <2 x i32> poison, i32 %1349, i64 0, !dbg !46 + %1352 = insertelement <2 x i32> %1351, i32 %1350, i64 1, !dbg !46 + %1353 = bitcast <2 x i32> %1352 to i64, !dbg !46 + %1354 = add i64 %1346, %1353, !dbg !48 + %extelt.offset62 = lshr i64 %1354, 32, !dbg !46 + %1355 = trunc nuw i64 %extelt.offset62 to i32, !dbg !46 + %1356 = trunc i64 %1354 to i32, !dbg !46 + %1357 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1356, i32 1, i32 31), !dbg !46 + %1358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1355, i32 1, i32 31), !dbg !46 + %1359 = insertelement <2 x i32> poison, i32 %1357, i64 0, !dbg !46 + %1360 = insertelement <2 x i32> %1359, i32 %1358, i64 1, !dbg !46 + %1361 = bitcast <2 x i32> %1360 to i64, !dbg !46 + %1362 = add i64 %1354, %1361, !dbg !48 + %1363 = and i32 %10, 899, !dbg !46 + %1364 = icmp eq i32 %1363, 0, !dbg !46 + %1365 = insertelement <1 x i64> poison, i64 %1362, i64 0, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1345, <1 x i64> %1365, i1 %1364) #5, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %1366 = load i64, ptr addrspace(3) %1341, align 16, !dbg !46 + %1367 = trunc i64 %1366 to i32, !dbg !49 + %1368 = shl i32 %16, 4, !dbg !50 + %1369 = or disjoint i32 %1368, %25, !dbg !51 + %1370 = sext i32 %1369 to i64, !dbg !52 + %1371 = getelementptr i32, ptr addrspace(1) %1, i64 %1370, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1372 = shl nuw nsw i32 %10, 4, !dbg !53 + %1373 = and i32 %1372, 2032, !dbg !53 + %1374 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1373, !dbg !53 + store <4 x i32> %1330, ptr addrspace(3) %1374, align 16, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1375 = shl nuw nsw i32 %10, 6, !dbg !53 + %1376 = and i32 %1375, 1536, !dbg !53 + %1377 = and i32 %1372, 112, !dbg !53 + %1378 = shl nuw nsw i32 %19, 2, !dbg !53 + %1379 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1376, !dbg !53 + %1380 = getelementptr inbounds nuw i8, ptr addrspace(3) %1379, i32 %1377, !dbg !53 + %1381 = getelementptr inbounds nuw i8, ptr addrspace(3) %1380, i32 %1378, !dbg !53 + %1382 = ptrtoint ptr addrspace(3) %1381 to i32, !dbg !53 + %1383 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1382) #5, !dbg !53 + %1384 = extractvalue { i32, i32, i32, i32 } %1383, 0, !dbg !53 + %1385 = extractvalue { i32, i32, i32, i32 } %1383, 1, !dbg !53 + %1386 = extractvalue { i32, i32, i32, i32 } %1383, 2, !dbg !53 + %1387 = extractvalue { i32, i32, i32, i32 } %1383, 3, !dbg !53 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1384, i32 %1385, i32 %1386, i32 %1387, ptr addrspace(1) %1371, i1 %18) #5, !dbg !53 + %1388 = sext i32 %15 to i64, !dbg !54 + %1389 = getelementptr i32, ptr addrspace(1) %2, i64 %1388, !dbg !54 + %1390 = icmp eq i32 %19, 0, !dbg !55 + %1391 = and i1 %1390, %17, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1367, ptr addrspace(1) %1389, i1 %1391) #5, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 33, scope: !4) +!9 = !DILocation(line: 25, column: 44, scope: !4) +!10 = !DILocation(line: 25, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 38, scope: !4) +!13 = !DILocation(line: 34, column: 19, scope: !4) +!14 = !DILocation(line: 36, column: 38, scope: !4) +!15 = !DILocation(line: 36, column: 35, scope: !4) +!16 = !DILocation(line: 36, column: 45, scope: !4) +!17 = !DILocation(line: 36, column: 30, scope: !4) +!18 = !DILocation(line: 36, column: 54, scope: !4) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 67, scope: !4) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 599, column: 19, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!45 = !DILocation(line: 44, column: 34, scope: !4) +!46 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !47) +!47 = !DILocation(line: 45, column: 26, scope: !4) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !47) +!49 = !DILocation(line: 48, column: 21, scope: !4) +!50 = !DILocation(line: 49, column: 35, scope: !4) +!51 = !DILocation(line: 49, column: 32, scope: !4) +!52 = !DILocation(line: 49, column: 25, scope: !4) +!53 = !DILocation(line: 49, column: 47, scope: !4) +!54 = !DILocation(line: 50, column: 25, scope: !4) +!55 = !DILocation(line: 50, column: 37, scope: !4) +!56 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..88e8a18f79f961e94fb9f7672e6973c777bc0a8e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,2813 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 128 +{ + .reg .pred %p<514>; + .reg .b32 %r<1198>; + .reg .b64 %rd<32>; + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:24:28 + mov.u32 %r690, %ctaid.x; + .loc 1 24 33 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:24:33 + shl.b32 %r691, %r690, 5; + ld.param.b64 %rd12, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 25 44 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:25:44 + mov.u32 %r692, %tid.x; + and.b32 %r693, %r692, 31; + shr.u32 %r694, %r692, 2; + bfe.u32 %r695, %r692, 2, 5; + .loc 1 25 23 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:25:23 + or.b32 %r696, %r691, %r693; + or.b32 %r697, %r695, %r691; + .loc 1 26 21 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:26:21 + setp.lt.s32 %p1, %r696, 32; + setp.lt.s32 %p344, %r697, 32; + .loc 1 27 38 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:27:38 + and.b32 %r698, %r692, 96; + bfe.u32 %r699, %r692, 5, 2; + or.b32 %r700, %r699, 4; + or.b32 %r701, %r699, 8; + or.b32 %r702, %r699, 12; + shl.b32 %r703, %r692, 2; + and.b32 %r704, %r703, 12; + .loc 1 34 19 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:34:19 + bfe.s32 %r705, %r690, 26, 1; + shr.u32 %r706, %r705, 28; + add.s32 %r707, %r696, %r706; + .loc 1 36 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:35 + shl.b32 %r708, %r707, 4; + and.b32 %r709, %r708, -256; + add.s32 %r710, %r709, %r696; + .loc 1 36 45 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:45 + mad.lo.s32 %r711, %r699, 17, %r710; + add.s32 %r712, %r711, 68; + add.s32 %r713, %r711, 136; + add.s32 %r714, %r711, 204; + .loc 1 36 30 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:30 + mad.wide.s32 %rd1, %r711, 4, %rd10; + mad.wide.s32 %rd2, %r712, 4, %rd10; + mad.wide.s32 %rd3, %r713, 4, %rd10; + mad.wide.s32 %rd4, %r714, 4, %rd10; + .loc 1 36 54 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r3, 0x0; + @%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r4, 0x0; + @%p1 ld.global.b32 { %r4 }, [ %rd4 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shr.u32 %r715, %r692, 6; + bfe.u32 %r716, %r692, 6, 1; + bfe.u32 %r717, %r692, 5, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r718, %r717, 1; + xor.b32 %r719, %r716, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r6, %r1, %r718; + mul.lo.s32 %r8, %r2, %r718; + mul.lo.s32 %r10, %r3, %r718; + mul.lo.s32 %r12, %r4, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shl.b32 %r720, %r716, 3; + mov.b32 %r721, global_smem; + add.s32 %r722, %r721, %r720; + shl.b32 %r723, %r693, 6; + add.s32 %r724, %r722, %r723; + shl.b32 %r725, %r717, 2; + add.s32 %r5, %r724, %r725; + mov.pred %p5, -1; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + add.s32 %r7, %r5, 16; + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r8; + // end inline asm + add.s32 %r9, %r5, 32; + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r10; + // end inline asm + add.s32 %r11, %r5, 48; + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r12; + // end inline asm + bar.sync 0; + setp.lt.u32 %p9, %r692, 512; + add.s32 %r14, %r721, %r703; + // begin inline asm + @%p9 ld.shared.b32 %r13, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r726, %r13, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r16, %r726, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r727, %r692, 513; + setp.eq.b32 %p10, %r727, 0; + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r16; + // end inline asm + add.s32 %r18, %r14, 512; + // begin inline asm + @%p9 ld.shared.b32 %r17, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r728, %r17, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r20, %r728, %r17; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r20; + // end inline asm + add.s32 %r22, %r14, 1024; + // begin inline asm + @%p9 ld.shared.b32 %r21, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r729, %r21, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r24, %r729, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r24; + // end inline asm + add.s32 %r26, %r14, 1536; + // begin inline asm + @%p9 ld.shared.b32 %r25, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r730, %r25, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r28, %r730, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r28; + // end inline asm + bar.sync 0; + ld.shared.b32 %r731, [%r724]; + ld.shared.b32 %r732, [%r724+16]; + ld.shared.b32 %r733, [%r724+32]; + ld.shared.b32 %r734, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r30, %r1, %r717; + mul.lo.s32 %r32, %r2, %r717; + mul.lo.s32 %r34, %r3, %r717; + mul.lo.s32 %r36, %r4, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r30; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r32; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r36; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r37, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r735, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r40, %r735, %r37; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r41, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r736, %r41, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r44, %r736, %r41; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r44; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r45, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r737, %r45, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r48, %r737, %r45; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r48; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r49, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r738, %r49, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r52, %r738, %r49; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r52; + // end inline asm + bar.sync 0; + ld.shared.b32 %r739, [%r724]; + ld.shared.b32 %r740, [%r724+16]; + ld.shared.b32 %r741, [%r724+32]; + ld.shared.b32 %r742, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r54, %r718, %r699; + shl.b32 %r743, %r718, 2; + or.b32 %r56, %r54, %r743; + add.s32 %r58, %r56, %r743; + add.s32 %r60, %r58, %r743; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r54; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r56; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r60; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r61, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r744, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r64, %r744, %r61; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r65, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r745, %r65, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r68, %r745, %r65; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r68; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r69, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r746, %r69, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r72, %r746, %r69; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r72; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r73, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r747, %r73, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r76, %r747, %r73; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r76; + // end inline asm + bar.sync 0; + ld.shared.b32 %r748, [%r724]; + ld.shared.b32 %r749, [%r724+16]; + ld.shared.b32 %r750, [%r724+32]; + ld.shared.b32 %r751, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r78, %r699, %r717; + or.b32 %r80, %r78, %r725; + add.s32 %r82, %r80, %r725; + add.s32 %r84, %r82, %r725; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r78; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r80; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r82; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r84; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r85, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r752, %r85, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r88, %r752, %r85; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r89, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r753, %r89, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r92, %r753, %r89; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r92; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r93, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r754, %r93, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r96, %r754, %r93; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r96; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r97, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r755, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r100, %r755, %r97; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r100; + // end inline asm + bar.sync 0; + ld.shared.b32 %r756, [%r724]; + ld.shared.b32 %r757, [%r724+16]; + ld.shared.b32 %r758, [%r724+32]; + ld.shared.b32 %r759, [%r724+48]; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r760, %r715, 1; + setp.ne.b32 %p346, %r760, 0; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + shl.b32 %r761, %r717, 10; + shl.b32 %r762, %r693, 5; + or.b32 %r763, %r761, %r762; + add.s32 %r764, %r721, %r763; + shl.b32 %r765, %r716, 2; + add.s32 %r101, %r764, %r765; + add.s32 %r766, %r764, 8; + add.s32 %r103, %r766, %r765; + add.s32 %r767, %r764, 16; + add.s32 %r105, %r767, %r765; + add.s32 %r768, %r764, 24; + add.s32 %r107, %r768, %r765; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.ge.s32 %p347, %r731, %r739; + setp.ge.s32 %p348, %r732, %r740; + setp.ne.b32 %p349, %r732, %r740; + setp.ne.b32 %p350, %r731, %r739; + setp.le.s32 %p351, %r749, %r757; + setp.le.s32 %p352, %r748, %r756; + or.pred %p353, %p350, %p352; + or.pred %p354, %p349, %p351; + and.pred %p355, %p348, %p354; + and.pred %p356, %p347, %p353; + xor.pred %p357, %p356, %p346; + xor.pred %p358, %p355, %p346; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r769, %r756, %r748; + xor.b32 %r770, %r740, %r732; + xor.b32 %r771, %r757, %r749; + xor.b32 %r772, %r739, %r731; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r773, 0, %r770, %p358; + selp.b32 %r774, 0, %r769, %p357; + selp.b32 %r775, 0, %r772, %p357; + selp.b32 %r776, 0, %r771, %p358; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r777, %r774, %r699; + xor.b32 %r778, %r773, %r2; + xor.b32 %r779, %r776, %r700; + xor.b32 %r780, %r775, %r1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r102, %r780, %r719; + mul.lo.s32 %r104, %r778, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r102; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r104; + // end inline asm + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r126, %r780, %r716; + mul.lo.s32 %r128, %r778, %r716; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r150, %r777, %r719; + mul.lo.s32 %r152, %r779, %r719; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r174, %r777, %r716; + mul.lo.s32 %r176, %r779, %r716; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.ge.s32 %p359, %r733, %r741; + setp.ge.s32 %p360, %r734, %r742; + setp.ne.b32 %p361, %r734, %r742; + setp.ne.b32 %p362, %r733, %r741; + setp.le.s32 %p363, %r751, %r759; + setp.le.s32 %p364, %r750, %r758; + or.pred %p365, %p362, %p364; + or.pred %p366, %p361, %p363; + and.pred %p367, %p360, %p366; + and.pred %p368, %p359, %p365; + xor.pred %p369, %p368, %p346; + xor.pred %p370, %p367, %p346; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r781, %r758, %r750; + xor.b32 %r782, %r742, %r734; + xor.b32 %r783, %r759, %r751; + xor.b32 %r784, %r741, %r733; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r785, 0, %r782, %p370; + selp.b32 %r786, 0, %r781, %p369; + selp.b32 %r787, 0, %r784, %p369; + selp.b32 %r788, 0, %r783, %p370; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r789, %r786, %r701; + xor.b32 %r790, %r785, %r4; + xor.b32 %r791, %r788, %r702; + xor.b32 %r792, %r787, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r106, %r792, %r719; + mul.lo.s32 %r108, %r790, %r719; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r130, %r792, %r716; + mul.lo.s32 %r132, %r790, %r716; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r154, %r789, %r719; + mul.lo.s32 %r156, %r791, %r719; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r178, %r789, %r716; + mul.lo.s32 %r180, %r791, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r106; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r108; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r109, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r793, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r112, %r793, %r109; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r112; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r113, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r794, %r113, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r116, %r794, %r113; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r116; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r117, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r795, %r117, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r120, %r795, %r117; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r120; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r121, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r796, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r124, %r796, %r121; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r124; + // end inline asm + bar.sync 0; + ld.shared.b32 %r797, [%r764]; + ld.shared.b32 %r798, [%r764+8]; + ld.shared.b32 %r799, [%r764+16]; + ld.shared.b32 %r800, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r126; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r128; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r130; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r132; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r133, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r801, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r136, %r801, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r136; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r137, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r802, %r137, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r140, %r802, %r137; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r140; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r141, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r803, %r141, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r144, %r803, %r141; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r144; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r145, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r804, %r145, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r148, %r804, %r145; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r148; + // end inline asm + bar.sync 0; + ld.shared.b32 %r805, [%r764]; + ld.shared.b32 %r806, [%r764+8]; + ld.shared.b32 %r807, [%r764+16]; + ld.shared.b32 %r808, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r150; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r152; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r154; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r156; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r157, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r809, %r157, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r160, %r809, %r157; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r160; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r161, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r810, %r161, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r164, %r810, %r161; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r164; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r165, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r811, %r165, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r168, %r811, %r165; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r168; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r169, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r812, %r169, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r172, %r812, %r169; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r172; + // end inline asm + bar.sync 0; + ld.shared.b32 %r813, [%r764]; + ld.shared.b32 %r814, [%r764+8]; + ld.shared.b32 %r815, [%r764+16]; + ld.shared.b32 %r816, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r174; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r176; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r178; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r180; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r181, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r817, %r181, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r184, %r817, %r181; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r184; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r185, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r818, %r185, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r188, %r818, %r185; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r188; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r189, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r819, %r189, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r192, %r819, %r189; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r192; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r193, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r820, %r193, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r196, %r820, %r193; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r196; + // end inline asm + bar.sync 0; + ld.shared.b32 %r821, [%r764]; + ld.shared.b32 %r822, [%r764+8]; + ld.shared.b32 %r823, [%r764+16]; + ld.shared.b32 %r824, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p371, %r797, %r805; + setp.ge.s32 %p372, %r798, %r806; + setp.lt.s32 %p373, %r799, %r807; + setp.ge.s32 %p374, %r800, %r808; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p375, %r797, %r805; + setp.ne.b32 %p376, %r798, %r806; + setp.eq.b32 %p377, %r799, %r807; + setp.ne.b32 %p378, %r800, %r808; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p379, %r813, %r821; + setp.le.s32 %p380, %r814, %r822; + setp.gt.s32 %p381, %r815, %r823; + setp.le.s32 %p382, %r816, %r824; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p383, %p376, %p380; + and.pred %p384, %p375, %p379; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p385, %p378, %p382; + and.pred %p386, %p377, %p381; + and.pred %p387, %p372, %p383; + or.pred %p388, %p371, %p384; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p389, %p374, %p385; + or.pred %p390, %p373, %p386; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r825, %r821, %r813; + xor.b32 %r826, %r806, %r798; + xor.b32 %r827, %r822, %r814; + xor.b32 %r828, %r805, %r797; + xor.b32 %r829, %r823, %r815; + xor.b32 %r830, %r808, %r800; + xor.b32 %r831, %r824, %r816; + xor.b32 %r832, %r807, %r799; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r833, %r826, 0, %p387; + selp.b32 %r834, %r825, 0, %p388; + selp.b32 %r835, %r828, 0, %p388; + selp.b32 %r836, %r827, 0, %p387; + selp.b32 %r837, %r830, 0, %p389; + selp.b32 %r838, %r829, 0, %p390; + selp.b32 %r839, %r832, 0, %p390; + selp.b32 %r840, %r831, 0, %p389; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r841, %r834, %r777; + xor.b32 %r842, %r833, %r778; + xor.b32 %r843, %r836, %r779; + xor.b32 %r844, %r835, %r780; + xor.b32 %r845, %r838, %r789; + xor.b32 %r846, %r837, %r790; + xor.b32 %r847, %r840, %r791; + xor.b32 %r848, %r839, %r792; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r198, %r844, %r718; + mul.lo.s32 %r200, %r842, %r718; + mul.lo.s32 %r202, %r848, %r718; + mul.lo.s32 %r204, %r846, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r198; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r200; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r202; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r204; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r205, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r849, %r205, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r208, %r849, %r205; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r208; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r209, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r850, %r209, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r212, %r850, %r209; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r212; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r213, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r851, %r213, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r216, %r851, %r213; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r216; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r217, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r852, %r217, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r220, %r852, %r217; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r220; + // end inline asm + bar.sync 0; + ld.shared.b32 %r853, [%r724]; + ld.shared.b32 %r854, [%r724+16]; + ld.shared.b32 %r855, [%r724+32]; + ld.shared.b32 %r856, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r222, %r844, %r717; + mul.lo.s32 %r224, %r842, %r717; + mul.lo.s32 %r226, %r848, %r717; + mul.lo.s32 %r228, %r846, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r222; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r224; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r226; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r228; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r229, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r857, %r229, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r232, %r857, %r229; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r232; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r233, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r858, %r233, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r236, %r858, %r233; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r236; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r237, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r859, %r237, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r240, %r859, %r237; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r240; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r241, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r860, %r241, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r244, %r860, %r241; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r244; + // end inline asm + bar.sync 0; + ld.shared.b32 %r861, [%r724]; + ld.shared.b32 %r862, [%r724+16]; + ld.shared.b32 %r863, [%r724+32]; + ld.shared.b32 %r864, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r246, %r841, %r718; + mul.lo.s32 %r248, %r843, %r718; + mul.lo.s32 %r250, %r845, %r718; + mul.lo.s32 %r252, %r847, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r246; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r248; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r250; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r252; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r253, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r865, %r253, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r256, %r865, %r253; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r256; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r257, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r866, %r257, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r260, %r866, %r257; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r260; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r261, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r867, %r261, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r264, %r867, %r261; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r264; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r265, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r868, %r265, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r268, %r868, %r265; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r268; + // end inline asm + bar.sync 0; + ld.shared.b32 %r869, [%r724]; + ld.shared.b32 %r870, [%r724+16]; + ld.shared.b32 %r871, [%r724+32]; + ld.shared.b32 %r872, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r270, %r841, %r717; + mul.lo.s32 %r272, %r843, %r717; + mul.lo.s32 %r274, %r845, %r717; + mul.lo.s32 %r276, %r847, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r270; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r272; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r274; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r276; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r277, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r873, %r277, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r280, %r873, %r277; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r280; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r281, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r874, %r281, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r284, %r874, %r281; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r284; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r285, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r875, %r285, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r288, %r875, %r285; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r288; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r289, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r876, %r289, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r292, %r876, %r289; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r292; + // end inline asm + bar.sync 0; + ld.shared.b32 %r877, [%r724]; + ld.shared.b32 %r878, [%r724+16]; + ld.shared.b32 %r879, [%r724+32]; + ld.shared.b32 %r880, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p391, %r853, %r861; + setp.ge.s32 %p392, %r854, %r862; + setp.lt.s32 %p393, %r855, %r863; + setp.ge.s32 %p394, %r856, %r864; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p395, %r853, %r861; + setp.ne.b32 %p396, %r854, %r862; + setp.eq.b32 %p397, %r855, %r863; + setp.ne.b32 %p398, %r856, %r864; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p399, %r869, %r877; + setp.le.s32 %p400, %r870, %r878; + setp.gt.s32 %p401, %r871, %r879; + setp.le.s32 %p402, %r872, %r880; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p403, %p396, %p400; + and.pred %p404, %p395, %p399; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p405, %p398, %p402; + and.pred %p406, %p397, %p401; + and.pred %p407, %p392, %p403; + or.pred %p408, %p391, %p404; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p409, %p394, %p405; + or.pred %p410, %p393, %p406; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r881, %r877, %r869; + xor.b32 %r882, %r862, %r854; + xor.b32 %r883, %r878, %r870; + xor.b32 %r884, %r861, %r853; + xor.b32 %r885, %r879, %r871; + xor.b32 %r886, %r864, %r856; + xor.b32 %r887, %r880, %r872; + xor.b32 %r888, %r863, %r855; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r889, %r882, 0, %p407; + selp.b32 %r890, %r881, 0, %p408; + selp.b32 %r891, %r884, 0, %p408; + selp.b32 %r892, %r883, 0, %p407; + selp.b32 %r893, %r886, 0, %p409; + selp.b32 %r894, %r885, 0, %p410; + selp.b32 %r895, %r888, 0, %p410; + selp.b32 %r896, %r887, 0, %p409; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r897, %r890, %r841; + xor.b32 %r898, %r889, %r842; + xor.b32 %r899, %r892, %r843; + xor.b32 %r900, %r891, %r844; + xor.b32 %r901, %r894, %r845; + xor.b32 %r902, %r893, %r846; + xor.b32 %r903, %r896, %r847; + xor.b32 %r904, %r895, %r848; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p411, %r900, %r898; + setp.ge.s32 %p412, %r904, %r902; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p413, %r898, %r900; + setp.gt.s32 %p414, %r897, %r899; + setp.ne.b32 %p415, %r902, %r904; + setp.le.s32 %p416, %r901, %r903; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p417, %p413, %p414; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p418, %p415, %p416; + or.pred %p419, %p411, %p417; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p420, %p412, %p418; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r905, %r898, %r900; + xor.b32 %r906, %r902, %r904; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r907, %r905, 0, %p419; + selp.b32 %r908, %r906, 0, %p420; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r909, %r899, %r897; + xor.b32 %r910, %r903, %r901; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r911, %r909, 0, %p419; + selp.b32 %r912, %r910, 0, %p420; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r913, %r907, %r898; + xor.b32 %r914, %r907, %r900; + xor.b32 %r915, %r908, %r902; + xor.b32 %r916, %r908, %r904; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r917, %r911, %r899; + xor.b32 %r918, %r911, %r897; + xor.b32 %r919, %r912, %r903; + xor.b32 %r920, %r912, %r901; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r294, %r914, %r719; + mul.lo.s32 %r296, %r913, %r719; + mul.lo.s32 %r298, %r916, %r719; + mul.lo.s32 %r300, %r915, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r294; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r296; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r298; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r300; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r301, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r921, %r301, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r304, %r921, %r301; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r304; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r305, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r922, %r305, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r308, %r922, %r305; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r308; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r309, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r923, %r309, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r312, %r923, %r309; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r312; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r313, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r924, %r313, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r316, %r924, %r313; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r316; + // end inline asm + bar.sync 0; + ld.shared.b32 %r925, [%r764]; + ld.shared.b32 %r926, [%r764+8]; + ld.shared.b32 %r927, [%r764+16]; + ld.shared.b32 %r928, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r318, %r914, %r716; + mul.lo.s32 %r320, %r913, %r716; + mul.lo.s32 %r322, %r916, %r716; + mul.lo.s32 %r324, %r915, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r318; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r320; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r322; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r324; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r325, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r929, %r325, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r328, %r929, %r325; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r328; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r329, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r930, %r329, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r332, %r930, %r329; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r332; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r333, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r931, %r333, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r336, %r931, %r333; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r336; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r337, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r932, %r337, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r340, %r932, %r337; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r340; + // end inline asm + bar.sync 0; + ld.shared.b32 %r933, [%r764]; + ld.shared.b32 %r934, [%r764+8]; + ld.shared.b32 %r935, [%r764+16]; + ld.shared.b32 %r936, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r342, %r918, %r719; + mul.lo.s32 %r344, %r917, %r719; + mul.lo.s32 %r346, %r920, %r719; + mul.lo.s32 %r348, %r919, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r342; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r344; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r346; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r348; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r349, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r937, %r349, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r352, %r937, %r349; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r352; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r353, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r938, %r353, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r356, %r938, %r353; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r356; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r357, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r939, %r357, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r360, %r939, %r357; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r360; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r361, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r940, %r361, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r364, %r940, %r361; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r364; + // end inline asm + bar.sync 0; + ld.shared.b32 %r941, [%r764]; + ld.shared.b32 %r942, [%r764+8]; + ld.shared.b32 %r943, [%r764+16]; + ld.shared.b32 %r944, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r366, %r918, %r716; + mul.lo.s32 %r368, %r917, %r716; + mul.lo.s32 %r370, %r920, %r716; + mul.lo.s32 %r372, %r919, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r366; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r368; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r370; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r372; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r373, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r945, %r373, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r376, %r945, %r373; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r376; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r377, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r946, %r377, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r380, %r946, %r377; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r380; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r381, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r947, %r381, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r384, %r947, %r381; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r384; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r385, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r948, %r385, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r388, %r948, %r385; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r388; + // end inline asm + bar.sync 0; + ld.shared.b32 %r949, [%r764]; + ld.shared.b32 %r950, [%r764+8]; + ld.shared.b32 %r951, [%r764+16]; + ld.shared.b32 %r952, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p421, %r925, %r933; + setp.lt.s32 %p422, %r926, %r934; + setp.ge.s32 %p423, %r927, %r935; + setp.ge.s32 %p424, %r928, %r936; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p425, %r926, %r934; + setp.eq.b32 %p426, %r925, %r933; + setp.ne.b32 %p427, %r928, %r936; + setp.ne.b32 %p428, %r927, %r935; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p429, %r942, %r950; + setp.gt.s32 %p430, %r941, %r949; + setp.le.s32 %p431, %r944, %r952; + setp.le.s32 %p432, %r943, %r951; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p433, %p426, %p430; + and.pred %p434, %p425, %p429; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p435, %p428, %p432; + or.pred %p436, %p427, %p431; + or.pred %p437, %p422, %p434; + or.pred %p438, %p421, %p433; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p439, %p424, %p436; + and.pred %p440, %p423, %p435; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r953, %r934, %r926; + xor.b32 %r954, %r933, %r925; + xor.b32 %r955, %r936, %r928; + xor.b32 %r956, %r935, %r927; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r957, %r954, 0, %p438; + selp.b32 %r958, %r953, 0, %p437; + selp.b32 %r959, %r956, 0, %p440; + selp.b32 %r960, %r955, 0, %p439; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r961, %r958, %r913; + xor.b32 %r962, %r957, %r914; + xor.b32 %r963, %r960, %r915; + xor.b32 %r964, %r959, %r916; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r965, %r950, %r942; + xor.b32 %r966, %r949, %r941; + xor.b32 %r967, %r952, %r944; + xor.b32 %r968, %r951, %r943; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r969, %r966, 0, %p438; + selp.b32 %r970, %r965, 0, %p437; + selp.b32 %r971, %r968, 0, %p440; + selp.b32 %r972, %r967, 0, %p439; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r973, %r970, %r917; + xor.b32 %r974, %r969, %r918; + xor.b32 %r975, %r972, %r919; + xor.b32 %r976, %r971, %r920; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r390, %r962, %r718; + mul.lo.s32 %r392, %r961, %r718; + mul.lo.s32 %r394, %r964, %r718; + mul.lo.s32 %r396, %r963, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r390; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r392; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r394; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r396; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r397, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r977, %r397, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r400, %r977, %r397; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r400; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r401, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r978, %r401, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r404, %r978, %r401; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r404; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r405, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r979, %r405, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r408, %r979, %r405; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r408; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r409, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r980, %r409, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r412, %r980, %r409; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r412; + // end inline asm + bar.sync 0; + ld.shared.b32 %r981, [%r724]; + ld.shared.b32 %r982, [%r724+16]; + ld.shared.b32 %r983, [%r724+32]; + ld.shared.b32 %r984, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r414, %r962, %r717; + mul.lo.s32 %r416, %r961, %r717; + mul.lo.s32 %r418, %r964, %r717; + mul.lo.s32 %r420, %r963, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r414; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r416; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r418; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r420; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r421, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r985, %r421, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r424, %r985, %r421; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r424; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r425, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r986, %r425, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r428, %r986, %r425; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r428; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r429, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r987, %r429, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r432, %r987, %r429; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r432; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r433, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r988, %r433, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r436, %r988, %r433; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r436; + // end inline asm + bar.sync 0; + ld.shared.b32 %r989, [%r724]; + ld.shared.b32 %r990, [%r724+16]; + ld.shared.b32 %r991, [%r724+32]; + ld.shared.b32 %r992, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r438, %r974, %r718; + mul.lo.s32 %r440, %r973, %r718; + mul.lo.s32 %r442, %r976, %r718; + mul.lo.s32 %r444, %r975, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r438; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r440; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r442; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r444; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r445, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r993, %r445, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r448, %r993, %r445; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r448; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r449, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r994, %r449, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r452, %r994, %r449; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r452; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r453, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r995, %r453, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r456, %r995, %r453; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r456; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r457, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r996, %r457, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r460, %r996, %r457; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r460; + // end inline asm + bar.sync 0; + ld.shared.b32 %r997, [%r724]; + ld.shared.b32 %r998, [%r724+16]; + ld.shared.b32 %r999, [%r724+32]; + ld.shared.b32 %r1000, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r462, %r974, %r717; + mul.lo.s32 %r464, %r973, %r717; + mul.lo.s32 %r466, %r976, %r717; + mul.lo.s32 %r468, %r975, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r462; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r464; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r466; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r468; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r469, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1001, %r469, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r472, %r1001, %r469; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r472; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r473, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1002, %r473, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r476, %r1002, %r473; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r476; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r477, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1003, %r477, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r480, %r1003, %r477; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r480; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r481, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1004, %r481, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r484, %r1004, %r481; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r484; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1005, [%r724]; + ld.shared.b32 %r1006, [%r724+16]; + ld.shared.b32 %r1007, [%r724+32]; + ld.shared.b32 %r1008, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p441, %r982, %r990; + setp.lt.s32 %p442, %r981, %r989; + setp.ge.s32 %p443, %r984, %r992; + setp.ge.s32 %p444, %r983, %r991; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p445, %r981, %r989; + setp.eq.b32 %p446, %r982, %r990; + setp.ne.b32 %p447, %r983, %r991; + setp.ne.b32 %p448, %r984, %r992; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p449, %r997, %r1005; + setp.gt.s32 %p450, %r998, %r1006; + setp.le.s32 %p451, %r999, %r1007; + setp.le.s32 %p452, %r1000, %r1008; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p453, %p446, %p450; + and.pred %p454, %p445, %p449; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p455, %p448, %p452; + or.pred %p456, %p447, %p451; + or.pred %p457, %p442, %p454; + or.pred %p458, %p441, %p453; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p459, %p444, %p456; + and.pred %p460, %p443, %p455; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1009, %r989, %r981; + xor.b32 %r1010, %r990, %r982; + xor.b32 %r1011, %r991, %r983; + xor.b32 %r1012, %r992, %r984; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1013, %r1010, 0, %p458; + selp.b32 %r1014, %r1009, 0, %p457; + selp.b32 %r1015, %r1012, 0, %p460; + selp.b32 %r1016, %r1011, 0, %p459; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1017, %r1014, %r962; + xor.b32 %r1018, %r1013, %r961; + xor.b32 %r1019, %r1016, %r964; + xor.b32 %r1020, %r1015, %r963; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1021, %r1006, %r998; + xor.b32 %r1022, %r1005, %r997; + xor.b32 %r1023, %r1008, %r1000; + xor.b32 %r1024, %r1007, %r999; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1025, %r1022, 0, %p457; + selp.b32 %r1026, %r1021, 0, %p458; + selp.b32 %r1027, %r1024, 0, %p459; + selp.b32 %r1028, %r1023, 0, %p460; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1029, %r1026, %r973; + xor.b32 %r1030, %r1025, %r974; + xor.b32 %r1031, %r1028, %r975; + xor.b32 %r1032, %r1027, %r976; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p461, %r1018, %r1020; + setp.lt.s32 %p462, %r1017, %r1019; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p463, %r1017, %r1019; + setp.eq.b32 %p464, %r1018, %r1020; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p465, %r1030, %r1032; + setp.gt.s32 %p466, %r1029, %r1031; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p467, %p464, %p466; + and.pred %p468, %p463, %p465; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p469, %p462, %p468; + or.pred %p470, %p461, %p467; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1033, %r1019, %r1017; + xor.b32 %r1034, %r1020, %r1018; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1035, %r1034, 0, %p470; + selp.b32 %r1036, %r1033, 0, %p469; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1037, %r1036, %r1017; + xor.b32 %r1038, %r1035, %r1018; + xor.b32 %r1039, %r1036, %r1019; + xor.b32 %r1040, %r1035, %r1020; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1041, %r1038, %r1037; + xor.b32 %r1042, %r1040, %r1039; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1043, %r1031, %r1029; + xor.b32 %r1044, %r1032, %r1030; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1045, %r1044, 0, %p469; + selp.b32 %r1046, %r1043, 0, %p470; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1047, %r1046, %r1029; + xor.b32 %r1048, %r1045, %r1032; + xor.b32 %r1049, %r1046, %r1031; + xor.b32 %r1050, %r1045, %r1030; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p471, %r1037, %r1038; + setp.lt.s32 %p472, %r1039, %r1040; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p473, %r1039, %r1040; + setp.eq.b32 %p474, %r1037, %r1038; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p475, %r1048, %r1049; + setp.gt.s32 %p476, %r1050, %r1047; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p477, %p474, %p476; + and.pred %p478, %p473, %p475; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p479, %p472, %p478; + or.pred %p480, %p471, %p477; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1051, %r1041, 0, %p480; + selp.b32 %r1052, %r1042, 0, %p479; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1053, %r1051, %r1037; + xor.b32 %r1054, %r1051, %r1038; + xor.b32 %r1055, %r1052, %r1039; + xor.b32 %r1056, %r1052, %r1040; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1057, %r1030, %r1047; + xor.b32 %r1058, %r1031, %r1048; + xor.b32 %r1059, %r1058, %r1046; + xor.b32 %r1060, %r1057, %r1045; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1061, %r1060, 0, %p480; + selp.b32 %r1062, %r1059, 0, %p479; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1063, %r1062, %r1049; + xor.b32 %r1064, %r1062, %r1048; + xor.b32 %r1065, %r1061, %r1047; + xor.b32 %r1066, %r1061, %r1050; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r486, %r1053, %r719; + mul.lo.s32 %r488, %r1054, %r719; + mul.lo.s32 %r490, %r1055, %r719; + mul.lo.s32 %r492, %r1056, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r486; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r488; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r490; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r492; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r493, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1067, %r493, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r496, %r1067, %r493; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r496; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r497, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1068, %r497, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r500, %r1068, %r497; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r500; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r501, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1069, %r501, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r504, %r1069, %r501; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r504; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r505, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1070, %r505, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r508, %r1070, %r505; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r508; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1071, [%r764]; + ld.shared.b32 %r1072, [%r764+8]; + ld.shared.b32 %r1073, [%r764+16]; + ld.shared.b32 %r1074, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r510, %r1053, %r716; + mul.lo.s32 %r512, %r1054, %r716; + mul.lo.s32 %r514, %r1055, %r716; + mul.lo.s32 %r516, %r1056, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r510; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r512; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r514; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r516; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r517, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1075, %r517, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r520, %r1075, %r517; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r520; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r521, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1076, %r521, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r524, %r1076, %r521; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r524; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r525, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1077, %r525, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r528, %r1077, %r525; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r528; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r529, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1078, %r529, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r532, %r1078, %r529; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r532; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1079, [%r764]; + ld.shared.b32 %r1080, [%r764+8]; + ld.shared.b32 %r1081, [%r764+16]; + ld.shared.b32 %r1082, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r534, %r1066, %r719; + mul.lo.s32 %r536, %r1065, %r719; + mul.lo.s32 %r538, %r1064, %r719; + mul.lo.s32 %r540, %r1063, %r719; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r534; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r536; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r538; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r540; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r541, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1083, %r541, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r544, %r1083, %r541; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r544; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r545, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1084, %r545, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r548, %r1084, %r545; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r548; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r549, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1085, %r549, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r552, %r1085, %r549; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r552; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r553, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1086, %r553, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r556, %r1086, %r553; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r556; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1087, [%r764]; + ld.shared.b32 %r1088, [%r764+8]; + ld.shared.b32 %r1089, [%r764+16]; + ld.shared.b32 %r1090, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r558, %r1066, %r716; + mul.lo.s32 %r560, %r1065, %r716; + mul.lo.s32 %r562, %r1064, %r716; + mul.lo.s32 %r564, %r1063, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r101 + 0 ], %r558; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r103 + 0 ], %r560; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r105 + 0 ], %r562; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r107 + 0 ], %r564; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r565, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1091, %r565, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r568, %r1091, %r565; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r568; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r569, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1092, %r569, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r572, %r1092, %r569; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r572; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r573, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1093, %r573, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r576, %r1093, %r573; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r576; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r577, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1094, %r577, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r580, %r1094, %r577; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r580; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1095, [%r764]; + ld.shared.b32 %r1096, [%r764+8]; + ld.shared.b32 %r1097, [%r764+16]; + ld.shared.b32 %r1098, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p481, %r1071, %r1079; + setp.lt.s32 %p482, %r1072, %r1080; + setp.lt.s32 %p483, %r1073, %r1081; + setp.lt.s32 %p484, %r1074, %r1082; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p485, %r1074, %r1082; + setp.eq.b32 %p486, %r1073, %r1081; + setp.eq.b32 %p487, %r1072, %r1080; + setp.eq.b32 %p488, %r1071, %r1079; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p489, %r1090, %r1098; + setp.gt.s32 %p490, %r1089, %r1097; + setp.gt.s32 %p491, %r1088, %r1096; + setp.gt.s32 %p492, %r1087, %r1095; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p493, %p488, %p492; + and.pred %p494, %p487, %p491; + and.pred %p495, %p486, %p490; + and.pred %p496, %p485, %p489; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p497, %p484, %p496; + or.pred %p498, %p483, %p495; + or.pred %p499, %p482, %p494; + or.pred %p500, %p481, %p493; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1099, %r1079, %r1071; + xor.b32 %r1100, %r1080, %r1072; + xor.b32 %r1101, %r1081, %r1073; + xor.b32 %r1102, %r1082, %r1074; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1103, %r1099, 0, %p500; + selp.b32 %r1104, %r1100, 0, %p499; + selp.b32 %r1105, %r1101, 0, %p498; + selp.b32 %r1106, %r1102, 0, %p497; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1107, %r1103, %r1053; + xor.b32 %r1108, %r1104, %r1054; + xor.b32 %r1109, %r1105, %r1055; + xor.b32 %r1110, %r1106, %r1056; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1111, %r1098, %r1090; + xor.b32 %r1112, %r1097, %r1089; + xor.b32 %r1113, %r1096, %r1088; + xor.b32 %r1114, %r1095, %r1087; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1115, %r1114, 0, %p500; + selp.b32 %r1116, %r1113, 0, %p499; + selp.b32 %r1117, %r1112, 0, %p498; + selp.b32 %r1118, %r1111, 0, %p497; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1119, %r1118, %r1063; + xor.b32 %r1120, %r1117, %r1064; + xor.b32 %r1121, %r1116, %r1065; + xor.b32 %r1122, %r1115, %r1066; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r582, %r1107, %r718; + mul.lo.s32 %r584, %r1108, %r718; + mul.lo.s32 %r586, %r1109, %r718; + mul.lo.s32 %r588, %r1110, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r582; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r584; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r586; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r588; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r589, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1123, %r589, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r592, %r1123, %r589; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r592; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r593, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1124, %r593, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r596, %r1124, %r593; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r596; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r597, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1125, %r597, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r600, %r1125, %r597; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r600; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r601, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1126, %r601, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r604, %r1126, %r601; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r604; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1127, [%r724]; + ld.shared.b32 %r1128, [%r724+16]; + ld.shared.b32 %r1129, [%r724+32]; + ld.shared.b32 %r1130, [%r724+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r606, %r1107, %r717; + mul.lo.s32 %r608, %r1108, %r717; + mul.lo.s32 %r610, %r1109, %r717; + mul.lo.s32 %r612, %r1110, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r606; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r608; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r610; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r612; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r613, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1131, %r613, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r616, %r1131, %r613; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r616; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r617, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1132, %r617, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r620, %r1132, %r617; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r620; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r621, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1133, %r621, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r624, %r1133, %r621; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r624; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r625, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1134, %r625, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r628, %r1134, %r625; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r628; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1135, [%r724]; + ld.shared.b32 %r1136, [%r724+16]; + ld.shared.b32 %r1137, [%r724+32]; + ld.shared.b32 %r1138, [%r724+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r630, %r1122, %r718; + mul.lo.s32 %r632, %r1121, %r718; + mul.lo.s32 %r634, %r1120, %r718; + mul.lo.s32 %r636, %r1119, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r630; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r632; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r634; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r636; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r637, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1139, %r637, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r640, %r1139, %r637; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r640; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r641, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1140, %r641, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r644, %r1140, %r641; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r644; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r645, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1141, %r645, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r648, %r1141, %r645; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r648; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r649, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1142, %r649, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r652, %r1142, %r649; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r652; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1143, [%r724]; + ld.shared.b32 %r1144, [%r724+16]; + ld.shared.b32 %r1145, [%r724+32]; + ld.shared.b32 %r1146, [%r724+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r654, %r1122, %r717; + mul.lo.s32 %r656, %r1121, %r717; + mul.lo.s32 %r658, %r1120, %r717; + mul.lo.s32 %r660, %r1119, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r5 + 0 ], %r654; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r7 + 0 ], %r656; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r9 + 0 ], %r658; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r11 + 0 ], %r660; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r661, [ %r14 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1147, %r661, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r664, %r1147, %r661; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r14 + 0 ], %r664; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r665, [ %r18 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1148, %r665, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r668, %r1148, %r665; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r18 + 0 ], %r668; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r669, [ %r22 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1149, %r669, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r672, %r1149, %r669; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r22 + 0 ], %r672; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r673, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1150, %r673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r676, %r1150, %r673; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + // begin inline asm + @%p10 st.shared.b32 [ %r26 + 0 ], %r676; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1151, [%r724]; + ld.shared.b32 %r1152, [%r724+16]; + ld.shared.b32 %r1153, [%r724+32]; + ld.shared.b32 %r1154, [%r724+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p501, %r1130, %r1138; + setp.lt.s32 %p502, %r1129, %r1137; + setp.lt.s32 %p503, %r1128, %r1136; + setp.lt.s32 %p504, %r1127, %r1135; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p505, %r1130, %r1138; + setp.eq.b32 %p506, %r1129, %r1137; + setp.eq.b32 %p507, %r1128, %r1136; + setp.eq.b32 %p508, %r1127, %r1135; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p509, %r1146, %r1154; + setp.gt.s32 %p510, %r1145, %r1153; + setp.gt.s32 %p511, %r1144, %r1152; + setp.gt.s32 %p512, %r1143, %r1151; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1155, %r1154, %r1146; + xor.b32 %r1156, %r1153, %r1145; + xor.b32 %r1157, %r1152, %r1144; + xor.b32 %r1158, %r1151, %r1143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r1159, %r1158, 0, %p512; + selp.b32 %r1160, %r1159, 0, %p508; + selp.b32 %r1161, %r1158, %r1160, %p504; + selp.b32 %r1162, %r1157, 0, %p511; + selp.b32 %r1163, %r1162, 0, %p507; + selp.b32 %r1164, %r1157, %r1163, %p503; + selp.b32 %r1165, %r1156, 0, %p510; + selp.b32 %r1166, %r1165, 0, %p506; + selp.b32 %r1167, %r1156, %r1166, %p502; + selp.b32 %r1168, %r1155, 0, %p509; + selp.b32 %r1169, %r1168, 0, %p505; + selp.b32 %r1170, %r1155, %r1169, %p501; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r1171, %r1170, %r1119; + xor.b32 %r1172, %r1167, %r1120; + xor.b32 %r1173, %r1164, %r1121; + xor.b32 %r1174, %r1161, %r1122; +$L__tmp2: + .loc 1 44 34 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:44:34 + cvt.s64.s32 %rd13, %r3; + cvt.s64.s32 %rd14, %r2; + cvt.s64.s32 %rd15, %r4; + cvt.s64.s32 %rd16, %r1; + selp.b64 %rd17, %rd16, 0, %p1; + selp.b64 %rd18, %rd15, 0, %p1; + selp.b64 %rd19, %rd14, 0, %p1; + selp.b64 %rd20, %rd13, 0, %p1; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd21, %rd19, %rd20; + add.s64 %rd22, %rd17, %rd18; + add.s64 %rd5, %rd21, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s32 %r1175, %r721, %r762; + and.b32 %r1176, %r694, 24; + add.s32 %r677, %r1175, %r1176; + // begin inline asm + @%p5 st.shared.b64 [ %r677 + 0 ], %rd5; + // end inline asm + bar.sync 0; + setp.lt.u32 %p342, %r692, 128; + shl.b32 %r1177, %r692, 3; + add.s32 %r678, %r721, %r1177; + // begin inline asm + @%p342 ld.shared.b64 %rd6, [ %r678 + 0 ]; + // end inline asm + mov.b64 {_, %r1178}, %rd6; + cvt.u32.u64 %r1179, %rd6; + shfl.sync.bfly.b32 %r1180, %r1179, 2, 31, -1; + shfl.sync.bfly.b32 %r1181, %r1178, 2, 31, -1; + cvt.u64.u32 %rd23, %r1180; + cvt.u64.u32 %rd24, %r1181; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd27, %rd6, %rd26; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r1182}, %rd27; + cvt.u32.u64 %r1183, %rd27; + shfl.sync.bfly.b32 %r1184, %r1183, 1, 31, -1; + shfl.sync.bfly.b32 %r1185, %r1182, 1, 31, -1; + cvt.u64.u32 %rd28, %r1184; + cvt.u64.u32 %rd29, %r1185; + shl.b64 %rd30, %rd29, 32; + or.b64 %rd31, %rd28, %rd30; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd7, %rd27, %rd31; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + and.b32 %r1186, %r692, 899; + setp.eq.b32 %p343, %r1186, 0; + // begin inline asm + @%p343 st.shared.b64 [ %r678 + 0 ], %rd7; + // end inline asm + bar.sync 0; + ld.shared.b32 %r689, [%r1175]; +$L__tmp4: + .loc 1 49 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:35 + shl.b32 %r1187, %r697, 4; + .loc 1 49 32 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:32 + or.b32 %r1188, %r1187, %r704; + .loc 1 49 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:25 + mad.wide.s32 %rd8, %r1188, 4, %rd11; + .loc 1 49 47 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:47 + bar.sync 0; + shl.b32 %r1189, %r692, 4; + and.b32 %r1190, %r1189, 2032; + add.s32 %r1191, %r721, %r1190; + st.shared.v4.b32 [%r1191], {%r1174, %r1173, %r1172, %r1171}; + bar.sync 0; + shl.b32 %r1192, %r692, 6; + and.b32 %r1193, %r1192, 1536; + and.b32 %r1194, %r1189, 112; + shl.b32 %r1195, %r698, 2; + add.s32 %r1196, %r721, %r1193; + add.s32 %r1197, %r1196, %r1194; + add.s32 %r684, %r1197, %r1195; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r685, %r686, %r687, %r688}, [%r684]; + // end inline asm + // begin inline asm + @%p344 st.global.v4.b32 [ %rd8 + 0 ], { %r685, %r686, %r687, %r688 }; + // end inline asm + .loc 1 50 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:25 + mad.wide.s32 %rd9, %r696, 4, %rd12; + .loc 1 50 37 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:37 + setp.eq.b32 %p513, %r698, 0; + and.pred %p345, %p513, %p1; + // begin inline asm + @%p345 st.global.b32 [ %rd9 + 0 ], { %r689 }; + // end inline asm + .loc 1 50 4 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 55 +.b8 117 +.b8 54 +.b8 114 +.b8 103 +.b8 107 +.b8 111 +.b8 105 +.b8 104 +.b8 119 +.b8 105 +.b8 122 +.b8 117 +.b8 103 +.b8 101 +.b8 54 +.b8 52 +.b8 52 +.b8 109 +.b8 110 +.b8 103 +.b8 50 +.b8 100 +.b8 100 +.b8 103 +.b8 51 +.b8 98 +.b8 113 +.b8 105 +.b8 105 +.b8 120 +.b8 102 +.b8 120 +.b8 101 +.b8 103 +.b8 115 +.b8 109 +.b8 120 +.b8 117 +.b8 103 +.b8 116 +.b8 98 +.b8 108 +.b8 97 +.b8 102 +.b8 99 +.b8 54 +.b8 107 +.b8 122 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..fe6a7dc12059bda314d33853da1d72f70371dd8a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 32 : i32 loc(#loc148) + %xoffset_3 = arith.constant 32 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc151) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + %r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<32x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<32x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<32x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<32x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<32x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : tensor<32x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<32x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<32x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<32x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc33) + %6 = tt.addptr %5, %4 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<32x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc38)), %idxs: tensor<32x16xi16> loc("idxs"(#loc38))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc41) + %5 = ub.poison : tensor<32x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi16> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc49) + %2 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi16> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc223) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc113))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc114) + tt.return %0 : tensor<256x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc116) + tt.return %1 : tensor<256x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc126))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc128) + tt.return %4 : tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %5 : tensor<32x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc135))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc136) + tt.return %0 : tensor<32x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc138) + tt.return %1 : tensor<32x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc132) + tt.return %cst : tensor<32x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc134) + tt.return %0 : tensor<32x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc135))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc136) + tt.return %0 : tensor<32x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc138) + tt.return %1 : tensor<32x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc132) + tt.return %cst : tensor<32x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc134) + tt.return %0 : tensor<32x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc49) + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc113))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc114) + tt.return %0 : tensor<128x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc116) + tt.return %1 : tensor<128x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc49) + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: tensor<32x16xi32> loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc113))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc114) + tt.return %0 : tensor<64x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc116) + tt.return %1 : tensor<64x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc42)), %idxs: tensor<32x16xi32> loc("idxs"(#loc42))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc49) + %5 = ub.poison : tensor<32x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc113))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc114) + tt.return %0 : tensor<32x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc116) + tt.return %1 : tensor<32x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc50)), %idxs: tensor<32x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc112) + %5 = ub.poison : tensor<32x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc113))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc114) + tt.return %0 : tensor<32xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc116) + tt.return %1 : tensor<32xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":46:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d22ef6b51af1a9216b537360bb22bf9eb137184a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,841 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}> +#linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("out_ptr2"(#loc)) +#loc79 = loc("out_ptr3"(#loc)) +#loc80 = loc("xnumel"(#loc)) +#loc81 = loc("r0_numel"(#loc)) +#loc99 = loc(callsite(#loc19 at #loc20)) +#loc105 = loc("ileft"(#loc28)) +#loc109 = loc("iright"(#loc33)) +#loc118 = loc("left_idx"(#loc42)) +#loc123 = loc("right_idx"(#loc47)) +#loc143 = loc("tmp11"(#loc67)) +#loc149 = loc(callsite(#loc24 at #loc99)) +#loc153 = loc(callsite(#loc1 at #loc143)) +#loc157 = loc(callsite(#loc105 at #loc149)) +#loc161 = loc(callsite(#loc109 at #loc149)) +#loc169 = loc(callsite(#loc118 at #loc149)) +#loc174 = loc(callsite(#loc123 at #loc149)) +#loc194 = loc(callsite(#loc1 at #loc157)) +#loc196 = loc(callsite(#loc1 at #loc161)) +#loc199 = loc(callsite(#loc1 at #loc169)) +#loc202 = loc(callsite(#loc1 at #loc174)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_1 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<32> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16> : tensor<32x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<272> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc82) + %xoffset_12 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc83) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84) + %xindex_13 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc84) + %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc84) + %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc85) + %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<32x1xi32, #blocked> loc(#loc85) + %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<32x1xi32, #blocked1> loc(#loc85) + %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<32x1xi32, #blocked> loc(#loc86) + %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<32x1xi32, #blocked1> loc(#loc86) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87) + %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87) + %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87) + %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87) + %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87) + %x0 = arith.remsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc88) + %x1 = arith.divsi %xindex_18, %cst_3 : tensor<32x1xi32, #blocked> loc(#loc89) + %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90) + %tmp0_26 = tt.broadcast %x0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<32x16xi32, #blocked> loc(#loc91) + %tmp0_29 = arith.muli %x1, %cst_6 : tensor<32x1xi32, #blocked> loc(#loc92) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<32x16xi32, #blocked> loc(#loc93) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc94) + %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc94) + %tmp0_34 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc95) + %tmp0_35 = tt.broadcast %xmask_20 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc95) + %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<32x16x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc97) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146) + %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146) + %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146) + %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146) + %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146) + %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146) + %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146) + %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146) + %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146) + %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146) + %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146) + %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc147) + %flip_49 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc148) + %flip_50 = tt.reshape %flip_48 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc148) + %y = tt.reshape %tmp0_36 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155) + %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155) + %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155) + %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_54 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_57 = tt.broadcast %ileft_56 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_58 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_61 = tt.broadcast %iright_60 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_62 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc164) + %ileft_63 = tt.reshape %ileft_57 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_64 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc165) + %iright_65 = tt.reshape %iright_61 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc166) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc168) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<256x2x1xi16, #linear1> loc(#loc168) + %input = arith.extsi %left_idx_67 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc197) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc173) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<256x2x1xi16, #linear1> loc(#loc173) + %input_73 = arith.extsi %right_idx_72 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc200) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc177) + %left_idx_78 = tt.reshape %left_idx_70 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_79 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc178) + %right_idx_80 = tt.reshape %right_idx_76 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc179) + %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc179) + %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc180) + %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<32x16xi32, #blocked> loc(#loc181) + %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_85 = arith.andi %eq, %cond_83 : tensor<32x16xi1, #blocked> loc(#loc182) + %cond_86 = arith.andi %eq_82, %cond_84 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_87 = arith.ori %cond, %cond_85 : tensor<32x16xi1, #blocked> loc(#loc183) + %cond_88 = arith.ori %cond_81, %cond_86 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_89 = arith.extui %cond_87 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc184) + %cond_90 = arith.extui %cond_88 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_91 = arith.xori %cond_89, %flip_49 : tensor<32x16xi32, #blocked> loc(#loc184) + %cond_92 = arith.xori %cond_90, %flip_50 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<32x16xi32, #blocked> loc(#loc185) + %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret = arith.xori %ileft_62, %iright_64 : tensor<32x16xi32, #blocked> loc(#loc186) + %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc187) + %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<32x16xi32, #blocked> loc(#loc188) + %ret_97 = ttg.convert_layout %ret_96 : tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191) + %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc191) + %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc147) + %flip_103 = tt.reshape %flip_102 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_104 = tt.reshape %ret_96 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_109 = tt.broadcast %ileft_108 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_110 = arith.muli %y_104, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_113 = tt.broadcast %iright_112 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_114 = tt.reshape %ileft_109 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_115 = tt.reshape %iright_113 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_116 = tt.reshape %new_idxs_101 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_120 = tt.broadcast %left_idx_119 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_124 = tt.broadcast %right_idx_123 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_125 = tt.reshape %left_idx_120 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_126 = tt.reshape %right_idx_124 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_130 = arith.andi %eq_128, %cond_129 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_131 = arith.ori %cond_127, %cond_130 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_132 = arith.extui %cond_131 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_133 = arith.xori %cond_132, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_137 = arith.xori %ret_97, %ret_136 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<32x16xi32, #linear> loc(#loc191) + %y_141 = tt.reshape %ret_137 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_142 = arith.muli %y_141, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_145 = tt.broadcast %ileft_144 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_146 = arith.muli %y_141, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_149 = tt.broadcast %iright_148 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_150 = tt.reshape %ileft_145 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_151 = tt.reshape %iright_149 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_152 = tt.reshape %new_idxs_140 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_156 = tt.broadcast %left_idx_155 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_160 = tt.broadcast %right_idx_159 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_161 = tt.reshape %left_idx_156 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_162 = tt.reshape %right_idx_160 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_166 = arith.andi %eq_164, %cond_165 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_167 = arith.ori %cond_163, %cond_166 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_168 = arith.extui %cond_167 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_169 = arith.xori %cond_168, %flip_103 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_173 = arith.xori %ret_137, %ret_172 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<32x16xi32, #linear> loc(#loc191) + %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc147) + %flip_178 = tt.reshape %flip_177 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc148) + %y_179 = tt.reshape %ret_173 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_184 = tt.broadcast %ileft_183 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_185 = arith.muli %y_179, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_188 = tt.broadcast %iright_187 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_189 = tt.reshape %ileft_184 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_190 = tt.reshape %iright_188 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_191 = tt.reshape %new_idxs_176 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_195 = tt.broadcast %left_idx_194 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_199 = tt.broadcast %right_idx_198 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_200 = tt.reshape %left_idx_195 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_201 = tt.reshape %right_idx_199 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_205 = arith.andi %eq_203, %cond_204 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_206 = arith.ori %cond_202, %cond_205 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_207 = arith.extui %cond_206 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_208 = arith.xori %cond_207, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_212 = arith.xori %ret_173, %ret_211 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<32x16xi32, #linear> loc(#loc191) + %y_216 = tt.reshape %ret_212 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_220 = tt.broadcast %ileft_219 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_221 = arith.muli %y_216, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_224 = tt.broadcast %iright_223 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_225 = tt.reshape %ileft_220 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_226 = tt.reshape %iright_224 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_227 = tt.reshape %new_idxs_215 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_231 = tt.broadcast %left_idx_230 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_235 = tt.broadcast %right_idx_234 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_236 = tt.reshape %left_idx_231 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_237 = tt.reshape %right_idx_235 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_241 = arith.andi %eq_239, %cond_240 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_242 = arith.ori %cond_238, %cond_241 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_243 = arith.extui %cond_242 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_244 = arith.xori %cond_243, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_248 = arith.xori %ret_212, %ret_247 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<32x16xi32, #linear> loc(#loc191) + %y_252 = tt.reshape %ret_248 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_253 = arith.muli %y_252, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_256 = tt.broadcast %ileft_255 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_257 = arith.muli %y_252, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_260 = tt.broadcast %iright_259 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_261 = tt.reshape %ileft_256 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_262 = tt.reshape %iright_260 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_263 = tt.reshape %new_idxs_251 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_267 = tt.broadcast %left_idx_266 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_271 = tt.broadcast %right_idx_270 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_272 = tt.reshape %left_idx_267 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_273 = tt.reshape %right_idx_271 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_277 = arith.andi %eq_275, %cond_276 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_278 = arith.ori %cond_274, %cond_277 : tensor<32x16xi1, #linear> loc(#loc183) + %cond_279 = arith.extui %cond_278 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc184) + %cond_280 = arith.xori %cond_279, %flip_178 : tensor<32x16xi32, #linear> loc(#loc184) + %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<32x16xi32, #linear> loc(#loc185) + %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_284 = arith.xori %ret_248, %ret_283 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<32x16xi32, #linear> loc(#loc191) + %y_288 = tt.reshape %ret_284 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc154) + %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc156) + %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193) + %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc158) + %ileft_293 = tt.broadcast %ileft_292 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc159) + %iright_294 = arith.muli %y_288, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc160) + %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195) + %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc162) + %iright_297 = tt.broadcast %iright_296 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc163) + %ileft_298 = tt.reshape %ileft_293 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_299 = tt.reshape %iright_297 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_300 = tt.reshape %new_idxs_287 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc166) + %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<32x2x8xi32, #linear4> loc(#loc168) + %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198) + %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc170) + %left_idx_304 = tt.broadcast %left_idx_303 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc171) + %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<32x2x8xi32, #linear4> loc(#loc173) + %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201) + %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc175) + %right_idx_308 = tt.broadcast %right_idx_307 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc176) + %left_idx_309 = tt.reshape %left_idx_304 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_310 = tt.reshape %right_idx_308 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_314 = arith.andi %eq_312, %cond_313 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_315 = arith.ori %cond_311, %cond_314 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_318 = arith.xori %ret_284, %ret_317 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<32x16xi32, #linear> loc(#loc191) + %y_322 = tt.reshape %ret_318 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc154) + %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc156) + %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc158) + %ileft_326 = tt.broadcast %ileft_325 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc159) + %iright_327 = arith.muli %y_322, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc160) + %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc162) + %iright_330 = tt.broadcast %iright_329 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc163) + %ileft_331 = tt.reshape %ileft_326 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_332 = tt.reshape %iright_330 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_333 = tt.reshape %new_idxs_321 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<64x2x4xi32, #linear3> loc(#loc168) + %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc170) + %left_idx_337 = tt.broadcast %left_idx_336 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc171) + %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<64x2x4xi32, #linear3> loc(#loc173) + %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc175) + %right_idx_341 = tt.broadcast %right_idx_340 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc176) + %left_idx_342 = tt.reshape %left_idx_337 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_343 = tt.reshape %right_idx_341 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_347 = arith.andi %eq_345, %cond_346 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_348 = arith.ori %cond_344, %cond_347 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_351 = arith.xori %ret_318, %ret_350 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<32x16xi32, #linear> loc(#loc191) + %y_355 = tt.reshape %ret_351 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc154) + %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc156) + %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc158) + %ileft_359 = tt.broadcast %ileft_358 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc159) + %iright_360 = arith.muli %y_355, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc160) + %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc162) + %iright_363 = tt.broadcast %iright_362 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc163) + %ileft_364 = tt.reshape %ileft_359 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_365 = tt.reshape %iright_363 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_366 = tt.reshape %new_idxs_354 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<128x2x2xi32, #linear2> loc(#loc168) + %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc170) + %left_idx_370 = tt.broadcast %left_idx_369 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc171) + %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<128x2x2xi32, #linear2> loc(#loc173) + %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc175) + %right_idx_374 = tt.broadcast %right_idx_373 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc176) + %left_idx_375 = tt.reshape %left_idx_370 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_376 = tt.reshape %right_idx_374 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_380 = arith.andi %eq_378, %cond_379 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_381 = arith.ori %cond_377, %cond_380 : tensor<32x16xi1, #linear> loc(#loc183) + %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<32x16xi32, #linear> loc(#loc186) + %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc187) + %ret_384 = arith.xori %ret_351, %ret_383 : tensor<32x16xi32, #linear> loc(#loc188) + %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<32x16xi32, #linear> loc(#loc191) + %y_388 = tt.reshape %ret_384 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc154) + %ileft_389 = arith.muli %y_388, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc156) + %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc158) + %ileft_392 = tt.broadcast %ileft_391 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc159) + %iright_393 = arith.muli %y_388, %iright : tensor<256x2x1xi32, #linear1> loc(#loc160) + %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc162) + %iright_396 = tt.broadcast %iright_395 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc163) + %ileft_397 = tt.reshape %ileft_392 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc164) + %iright_398 = tt.reshape %iright_396 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc165) + %y_idx_399 = tt.reshape %new_idxs_387 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc166) + %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc168) + %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc170) + %left_idx_403 = tt.broadcast %left_idx_402 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc171) + %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<256x2x1xi32, #linear1> loc(#loc173) + %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc175) + %right_idx_407 = tt.broadcast %right_idx_406 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc176) + %left_idx_408 = tt.reshape %left_idx_403 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc177) + %right_idx_409 = tt.reshape %right_idx_407 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc178) + %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc179) + %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<32x16xi32, #linear> loc(#loc180) + %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc181) + %cond_413 = arith.andi %eq_411, %cond_412 : tensor<32x16xi1, #linear> loc(#loc182) + %cond_414 = arith.ori %cond_410, %cond_413 : tensor<32x16xi1, #linear> loc(#loc183) + %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<32x16xi32, #linear> loc(#loc189) + %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc190) + %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<32x16xi32, #linear> loc(#loc191) + %tmp7 = arith.extsi %tmp0_36 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc141) + %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc142) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))): + %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192) + tt.reduce.return %tmp11_421 : i64 loc(#loc152) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152) + %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc144) + %tmp14 = arith.trunci %tmp11_418 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc145) + %0 = arith.muli %xindex_19, %cst_4 : tensor<32x1xi32, #blocked1> loc(#loc70) + %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %2 = tt.broadcast %0 : tensor<32x1xi32, #blocked1> -> tensor<32x16xi32, #blocked1> loc(#loc71) + %3 = arith.addi %1, %2 : tensor<32x16xi32, #blocked1> loc(#loc71) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked1> loc(#loc72) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr, #blocked1>, tensor<32x16xi32, #blocked1> loc(#loc72) + %6 = ttg.convert_layout %new_idxs_417 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc73) + tt.store %5, %6, %tmp0_35 : tensor<32x16x!tt.ptr, #blocked1> loc(#loc73) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc74) + %8 = tt.addptr %7, %xindex_18 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> loc(#loc74) + tt.store %8, %tmp14, %xmask : tensor<32x1x!tt.ptr, #blocked> loc(#loc75) + tt.return loc(#loc76) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc82 = loc("xoffset"(#loc2)) +#loc83 = loc("xoffset"(#loc3)) +#loc84 = loc("xindex"(#loc4)) +#loc85 = loc("xindex"(#loc5)) +#loc86 = loc("xmask"(#loc6)) +#loc87 = loc("r0_index"(#loc7)) +#loc88 = loc("x0"(#loc8)) +#loc89 = loc("x1"(#loc9)) +#loc90 = loc("tmp0"(#loc10)) +#loc91 = loc("tmp0"(#loc11)) +#loc92 = loc("tmp0"(#loc12)) +#loc93 = loc("tmp0"(#loc13)) +#loc94 = loc("tmp0"(#loc14)) +#loc95 = loc("tmp0"(#loc15)) +#loc96 = loc("tmp2"(#loc16)) +#loc97 = loc("tmp4"(#loc17)) +#loc98 = loc("flip"(#loc18)) +#loc100 = loc("flip"(#loc21)) +#loc101 = loc("flip"(#loc22)) +#loc102 = loc("y"(#loc23)) +#loc103 = loc("left_mask"(#loc25)) +#loc104 = loc("ileft"(#loc26)) +#loc106 = loc("ileft"(#loc30)) +#loc107 = loc("ileft"(#loc31)) +#loc108 = loc("iright"(#loc32)) +#loc110 = loc("iright"(#loc34)) +#loc111 = loc("iright"(#loc35)) +#loc112 = loc("ileft"(#loc36)) +#loc113 = loc("iright"(#loc37)) +#loc114 = loc("y_idx"(#loc38)) +#loc115 = loc("left_idx"(#loc39)) +#loc116 = loc("left_idx"(#loc40)) +#loc117 = loc("input"(#loc41)) +#loc119 = loc("left_idx"(#loc43)) +#loc120 = loc("left_idx"(#loc44)) +#loc121 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc46)) +#loc124 = loc("right_idx"(#loc48)) +#loc125 = loc("right_idx"(#loc49)) +#loc126 = loc("left_idx"(#loc50)) +#loc127 = loc("right_idx"(#loc51)) +#loc128 = loc("cond"(#loc52)) +#loc129 = loc("eq"(#loc53)) +#loc130 = loc("cond"(#loc54)) +#loc131 = loc("cond"(#loc55)) +#loc132 = loc("cond"(#loc56)) +#loc133 = loc("cond"(#loc57)) +#loc134 = loc("cond"(#loc58)) +#loc135 = loc("ret"(#loc59)) +#loc136 = loc("ret"(#loc60)) +#loc137 = loc("ret"(#loc61)) +#loc138 = loc("new_idxs"(#loc62)) +#loc139 = loc("new_idxs"(#loc63)) +#loc140 = loc("new_idxs"(#loc64)) +#loc141 = loc("tmp7"(#loc65)) +#loc142 = loc("tmp10"(#loc66)) +#loc144 = loc("tmp11"(#loc68)) +#loc145 = loc("tmp14"(#loc69)) +#loc146 = loc(callsite(#loc98 at #loc99)) +#loc147 = loc(callsite(#loc100 at #loc99)) +#loc148 = loc(callsite(#loc101 at #loc99)) +#loc150 = loc("cond"(#loc128)) +#loc151 = loc("eq"(#loc129)) +#loc152 = loc(callsite(#loc27 at #loc143)) +#loc154 = loc(callsite(#loc102 at #loc149)) +#loc155 = loc(callsite(#loc103 at #loc149)) +#loc156 = loc(callsite(#loc104 at #loc149)) +#loc158 = loc(callsite(#loc106 at #loc149)) +#loc159 = loc(callsite(#loc107 at #loc149)) +#loc160 = loc(callsite(#loc108 at #loc149)) +#loc162 = loc(callsite(#loc110 at #loc149)) +#loc163 = loc(callsite(#loc111 at #loc149)) +#loc164 = loc(callsite(#loc112 at #loc149)) +#loc165 = loc(callsite(#loc113 at #loc149)) +#loc166 = loc(callsite(#loc114 at #loc149)) +#loc167 = loc(callsite(#loc115 at #loc149)) +#loc168 = loc(callsite(#loc116 at #loc149)) +#loc170 = loc(callsite(#loc119 at #loc149)) +#loc171 = loc(callsite(#loc120 at #loc149)) +#loc172 = loc(callsite(#loc121 at #loc149)) +#loc173 = loc(callsite(#loc122 at #loc149)) +#loc175 = loc(callsite(#loc124 at #loc149)) +#loc176 = loc(callsite(#loc125 at #loc149)) +#loc177 = loc(callsite(#loc126 at #loc149)) +#loc178 = loc(callsite(#loc127 at #loc149)) +#loc179 = loc(callsite(#loc150 at #loc149)) +#loc180 = loc(callsite(#loc151 at #loc149)) +#loc181 = loc(callsite(#loc130 at #loc149)) +#loc182 = loc(callsite(#loc131 at #loc149)) +#loc183 = loc(callsite(#loc132 at #loc149)) +#loc184 = loc(callsite(#loc133 at #loc149)) +#loc185 = loc(callsite(#loc134 at #loc149)) +#loc186 = loc(callsite(#loc135 at #loc149)) +#loc187 = loc(callsite(#loc136 at #loc149)) +#loc188 = loc(callsite(#loc137 at #loc149)) +#loc189 = loc(callsite(#loc138 at #loc149)) +#loc190 = loc(callsite(#loc139 at #loc149)) +#loc191 = loc(callsite(#loc140 at #loc149)) +#loc192 = loc(callsite(#loc29 at #loc152)) +#loc193 = loc(callsite(#loc27 at #loc157)) +#loc195 = loc(callsite(#loc27 at #loc161)) +#loc197 = loc(callsite(#loc117 at #loc169)) +#loc198 = loc(callsite(#loc27 at #loc169)) +#loc200 = loc(callsite(#loc117 at #loc174)) +#loc201 = loc(callsite(#loc27 at #loc174)) +#loc203 = loc(callsite(#loc29 at #loc193)) +#loc204 = loc(callsite(#loc29 at #loc195)) +#loc205 = loc(callsite(#loc29 at #loc198)) +#loc206 = loc(callsite(#loc29 at #loc201)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ef5d03dc91511bd4088fd23bd60877f241650b2c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/A7DYCXJM4X5DHYLAIRTU6BFB3S5UCV3W4C27BWQBJGXYAG3NWQWA/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,799 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("out_ptr3"(#loc)) +#loc83 = loc("xnumel"(#loc)) +#loc84 = loc("r0_numel"(#loc)) +#loc106 = loc(callsite(#loc23 at #loc2)) +#loc113 = loc("ileft"(#loc32)) +#loc117 = loc("iright"(#loc37)) +#loc126 = loc("left_idx"(#loc46)) +#loc131 = loc("right_idx"(#loc51)) +#loc150 = loc("tmp11"(#loc70)) +#loc157 = loc(callsite(#loc28 at #loc106)) +#loc161 = loc(callsite(#loc1 at #loc150)) +#loc165 = loc(callsite(#loc113 at #loc157)) +#loc169 = loc(callsite(#loc117 at #loc157)) +#loc177 = loc(callsite(#loc126 at #loc157)) +#loc182 = loc(callsite(#loc131 at #loc157)) +#loc202 = loc(callsite(#loc1 at #loc165)) +#loc204 = loc(callsite(#loc1 at #loc169)) +#loc207 = loc(callsite(#loc1 at #loc177)) +#loc210 = loc(callsite(#loc1 at #loc182)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85) + %cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc86) + %tmp0 = arith.constant dense<272> : tensor<32x1xi32> loc(#loc87) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88) + %cst_2 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc89) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc91) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc92) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc93) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc94) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc94) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc89) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96) + %x0 = arith.remsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc97) + %x1 = arith.divsi %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc98) + %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88) + %tmp0_10 = tt.broadcast %x0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc99) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi32> loc(#loc99) + %tmp0_13 = arith.muli %x1, %tmp0 : tensor<32x1xi32> loc(#loc87) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc100) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi32> loc(#loc100) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc101) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc101) + %tmp0_18 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc102) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<32x16x!tt.ptr> loc(#loc102) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc104) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc155) + %flip_23 = tt.reshape %flip_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc156) + %y = tt.reshape %tmp0_19 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc164) + %ileft_24 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_27 = tt.broadcast %ileft_26 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc168) + %iright_28 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_31 = tt.broadcast %iright_30 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_32 = tt.reshape %ileft_27 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_33 = tt.reshape %iright_31 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc174) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc176) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<256x2x1xi16> loc(#loc176) + %input = arith.extsi %left_idx_35 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc205) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc181) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<256x2x1xi16> loc(#loc181) + %input_41 = arith.extsi %right_idx_40 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc208) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc187) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc188) + %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc189) + %cond_48 = arith.andi %eq, %cond_47 : tensor<32x16xi1> loc(#loc190) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc191) + %cond_50 = arith.extui %cond_49 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<32x16xi32> loc(#loc192) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret = arith.xori %ileft_32, %iright_33 : tensor<32x16xi32> loc(#loc194) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<32x16xi32> loc(#loc196) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<32x16xi32> loc(#loc197) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199) + %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc199) + %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<32x16xi32> loc(#loc199) + %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc155) + %flip_60 = tt.reshape %flip_59 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc156) + %y_61 = tt.reshape %ret_54 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc164) + %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_66 = tt.broadcast %ileft_65 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_67 = arith.muli %y_61, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_70 = tt.broadcast %iright_69 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_71 = tt.reshape %ileft_66 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_72 = tt.reshape %iright_70 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_73 = tt.reshape %new_idxs_58 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_77 = tt.broadcast %left_idx_76 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_81 = tt.broadcast %right_idx_80 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_82 = tt.reshape %left_idx_77 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_83 = tt.reshape %right_idx_81 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc187) + %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc188) + %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc189) + %cond_87 = arith.andi %eq_85, %cond_86 : tensor<32x16xi1> loc(#loc190) + %cond_88 = arith.ori %cond_84, %cond_87 : tensor<32x16xi1> loc(#loc191) + %cond_89 = arith.extui %cond_88 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_90 = arith.xori %cond_89, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<32x16xi32> loc(#loc194) + %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_94 = arith.xori %ret_54, %ret_93 : tensor<32x16xi32> loc(#loc196) + %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<32x16xi32> loc(#loc197) + %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<32x16xi32> loc(#loc199) + %y_98 = tt.reshape %ret_94 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_99 = arith.muli %y_98, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_102 = tt.broadcast %ileft_101 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_103 = arith.muli %y_98, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_106 = tt.broadcast %iright_105 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_107 = tt.reshape %ileft_102 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_108 = tt.reshape %iright_106 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_109 = tt.reshape %new_idxs_97 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_113 = tt.broadcast %left_idx_112 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_117 = tt.broadcast %right_idx_116 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_118 = tt.reshape %left_idx_113 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_119 = tt.reshape %right_idx_117 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc187) + %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc188) + %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc189) + %cond_123 = arith.andi %eq_121, %cond_122 : tensor<32x16xi1> loc(#loc190) + %cond_124 = arith.ori %cond_120, %cond_123 : tensor<32x16xi1> loc(#loc191) + %cond_125 = arith.extui %cond_124 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_126 = arith.xori %cond_125, %flip_60 : tensor<32x16xi32> loc(#loc192) + %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<32x16xi32> loc(#loc194) + %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_130 = arith.xori %ret_94, %ret_129 : tensor<32x16xi32> loc(#loc196) + %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<32x16xi32> loc(#loc197) + %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<32x16xi32> loc(#loc199) + %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc155) + %flip_135 = tt.reshape %flip_134 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc156) + %y_136 = tt.reshape %ret_130 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc164) + %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_141 = tt.broadcast %ileft_140 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_142 = arith.muli %y_136, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_145 = tt.broadcast %iright_144 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_146 = tt.reshape %ileft_141 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_147 = tt.reshape %iright_145 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_148 = tt.reshape %new_idxs_133 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_152 = tt.broadcast %left_idx_151 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_156 = tt.broadcast %right_idx_155 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_157 = tt.reshape %left_idx_152 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_158 = tt.reshape %right_idx_156 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc187) + %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc188) + %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc189) + %cond_162 = arith.andi %eq_160, %cond_161 : tensor<32x16xi1> loc(#loc190) + %cond_163 = arith.ori %cond_159, %cond_162 : tensor<32x16xi1> loc(#loc191) + %cond_164 = arith.extui %cond_163 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_165 = arith.xori %cond_164, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<32x16xi32> loc(#loc194) + %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_169 = arith.xori %ret_130, %ret_168 : tensor<32x16xi32> loc(#loc196) + %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<32x16xi32> loc(#loc197) + %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<32x16xi32> loc(#loc199) + %y_173 = tt.reshape %ret_169 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_177 = tt.broadcast %ileft_176 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_178 = arith.muli %y_173, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_181 = tt.broadcast %iright_180 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_182 = tt.reshape %ileft_177 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_183 = tt.reshape %iright_181 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_184 = tt.reshape %new_idxs_172 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_188 = tt.broadcast %left_idx_187 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_192 = tt.broadcast %right_idx_191 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_193 = tt.reshape %left_idx_188 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_194 = tt.reshape %right_idx_192 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc187) + %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc188) + %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc189) + %cond_198 = arith.andi %eq_196, %cond_197 : tensor<32x16xi1> loc(#loc190) + %cond_199 = arith.ori %cond_195, %cond_198 : tensor<32x16xi1> loc(#loc191) + %cond_200 = arith.extui %cond_199 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_201 = arith.xori %cond_200, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<32x16xi32> loc(#loc194) + %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_205 = arith.xori %ret_169, %ret_204 : tensor<32x16xi32> loc(#loc196) + %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<32x16xi32> loc(#loc197) + %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<32x16xi32> loc(#loc199) + %y_209 = tt.reshape %ret_205 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_210 = arith.muli %y_209, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_213 = tt.broadcast %ileft_212 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_214 = arith.muli %y_209, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_217 = tt.broadcast %iright_216 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_218 = tt.reshape %ileft_213 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_219 = tt.reshape %iright_217 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_220 = tt.reshape %new_idxs_208 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_224 = tt.broadcast %left_idx_223 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_228 = tt.broadcast %right_idx_227 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_229 = tt.reshape %left_idx_224 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_230 = tt.reshape %right_idx_228 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc187) + %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc188) + %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc189) + %cond_234 = arith.andi %eq_232, %cond_233 : tensor<32x16xi1> loc(#loc190) + %cond_235 = arith.ori %cond_231, %cond_234 : tensor<32x16xi1> loc(#loc191) + %cond_236 = arith.extui %cond_235 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc192) + %cond_237 = arith.xori %cond_236, %flip_135 : tensor<32x16xi32> loc(#loc192) + %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<32x16xi32> loc(#loc193) + %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<32x16xi32> loc(#loc194) + %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_241 = arith.xori %ret_205, %ret_240 : tensor<32x16xi32> loc(#loc196) + %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<32x16xi32> loc(#loc197) + %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<32x16xi32> loc(#loc199) + %y_245 = tt.reshape %ret_241 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc162) + %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc164) + %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<32x2x8xi32> loc(#loc164) + %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc201) + %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc166) + %ileft_250 = tt.broadcast %ileft_249 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc167) + %iright_251 = arith.muli %y_245, %flip_134 : tensor<32x2x8xi32> loc(#loc168) + %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc203) + %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc170) + %iright_254 = tt.broadcast %iright_253 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc171) + %ileft_255 = tt.reshape %ileft_250 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_256 = tt.reshape %iright_254 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_257 = tt.reshape %new_idxs_244 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc174) + %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<32x2x8xi32> loc(#loc176) + %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc206) + %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc178) + %left_idx_261 = tt.broadcast %left_idx_260 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc179) + %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<32x2x8xi32> loc(#loc181) + %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc209) + %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc183) + %right_idx_265 = tt.broadcast %right_idx_264 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc184) + %left_idx_266 = tt.reshape %left_idx_261 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_267 = tt.reshape %right_idx_265 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc187) + %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc188) + %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc189) + %cond_271 = arith.andi %eq_269, %cond_270 : tensor<32x16xi1> loc(#loc190) + %cond_272 = arith.ori %cond_268, %cond_271 : tensor<32x16xi1> loc(#loc191) + %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<32x16xi32> loc(#loc194) + %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_275 = arith.xori %ret_241, %ret_274 : tensor<32x16xi32> loc(#loc196) + %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<32x16xi32> loc(#loc197) + %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<32x16xi32> loc(#loc199) + %y_279 = tt.reshape %ret_275 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc162) + %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<64x2x4xi32> loc(#loc164) + %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc201) + %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc166) + %ileft_283 = tt.broadcast %ileft_282 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc167) + %iright_284 = arith.muli %y_279, %flip_59 : tensor<64x2x4xi32> loc(#loc168) + %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc203) + %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc170) + %iright_287 = tt.broadcast %iright_286 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc171) + %ileft_288 = tt.reshape %ileft_283 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_289 = tt.reshape %iright_287 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_290 = tt.reshape %new_idxs_278 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc174) + %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<64x2x4xi32> loc(#loc176) + %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc206) + %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc178) + %left_idx_294 = tt.broadcast %left_idx_293 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc179) + %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<64x2x4xi32> loc(#loc181) + %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc209) + %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc183) + %right_idx_298 = tt.broadcast %right_idx_297 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc184) + %left_idx_299 = tt.reshape %left_idx_294 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_300 = tt.reshape %right_idx_298 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc187) + %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc188) + %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc189) + %cond_304 = arith.andi %eq_302, %cond_303 : tensor<32x16xi1> loc(#loc190) + %cond_305 = arith.ori %cond_301, %cond_304 : tensor<32x16xi1> loc(#loc191) + %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<32x16xi32> loc(#loc194) + %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_308 = arith.xori %ret_275, %ret_307 : tensor<32x16xi32> loc(#loc196) + %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<32x16xi32> loc(#loc197) + %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<32x16xi32> loc(#loc199) + %y_312 = tt.reshape %ret_308 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc162) + %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<128x2x2xi32> loc(#loc164) + %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc201) + %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc166) + %ileft_316 = tt.broadcast %ileft_315 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc167) + %iright_317 = arith.muli %y_312, %flip_22 : tensor<128x2x2xi32> loc(#loc168) + %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc203) + %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc170) + %iright_320 = tt.broadcast %iright_319 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc171) + %ileft_321 = tt.reshape %ileft_316 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_322 = tt.reshape %iright_320 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_323 = tt.reshape %new_idxs_311 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc174) + %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<128x2x2xi32> loc(#loc176) + %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc206) + %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc178) + %left_idx_327 = tt.broadcast %left_idx_326 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc179) + %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<128x2x2xi32> loc(#loc181) + %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc209) + %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc183) + %right_idx_331 = tt.broadcast %right_idx_330 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc184) + %left_idx_332 = tt.reshape %left_idx_327 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_333 = tt.reshape %right_idx_331 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc187) + %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc188) + %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc189) + %cond_337 = arith.andi %eq_335, %cond_336 : tensor<32x16xi1> loc(#loc190) + %cond_338 = arith.ori %cond_334, %cond_337 : tensor<32x16xi1> loc(#loc191) + %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<32x16xi32> loc(#loc194) + %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc195) + %ret_341 = arith.xori %ret_308, %ret_340 : tensor<32x16xi32> loc(#loc196) + %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<32x16xi32> loc(#loc197) + %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<32x16xi32> loc(#loc199) + %y_345 = tt.reshape %ret_341 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc162) + %ileft_346 = arith.muli %y_345, %ileft : tensor<256x2x1xi32> loc(#loc164) + %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc201) + %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc166) + %ileft_349 = tt.broadcast %ileft_348 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc167) + %iright_350 = arith.muli %y_345, %iright : tensor<256x2x1xi32> loc(#loc168) + %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc203) + %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc170) + %iright_353 = tt.broadcast %iright_352 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc171) + %ileft_354 = tt.reshape %ileft_349 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc172) + %iright_355 = tt.reshape %iright_353 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc173) + %y_idx_356 = tt.reshape %new_idxs_344 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc174) + %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<256x2x1xi32> loc(#loc176) + %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc206) + %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc178) + %left_idx_360 = tt.broadcast %left_idx_359 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc179) + %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<256x2x1xi32> loc(#loc181) + %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc209) + %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc183) + %right_idx_364 = tt.broadcast %right_idx_363 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc184) + %left_idx_365 = tt.reshape %left_idx_360 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc185) + %right_idx_366 = tt.reshape %right_idx_364 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc186) + %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc187) + %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<32x16xi32> loc(#loc188) + %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc189) + %cond_370 = arith.andi %eq_368, %cond_369 : tensor<32x16xi1> loc(#loc190) + %cond_371 = arith.ori %cond_367, %cond_370 : tensor<32x16xi1> loc(#loc191) + %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<32x16xi32> loc(#loc197) + %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc198) + %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<32x16xi32> loc(#loc199) + %tmp7 = arith.extsi %tmp0_19 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc149) + %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc86) + %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))): + %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200) + tt.reduce.return %tmp11_379 : i64 loc(#loc160) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc160) + %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc151) + %tmp14 = arith.trunci %tmp11_376 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc152) + %0 = arith.muli %xindex_6, %cst_2 : tensor<32x1xi32> loc(#loc73) + %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc74) + %2 = tt.broadcast %0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc74) + %3 = arith.addi %1, %2 : tensor<32x16xi32> loc(#loc74) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc75) + %5 = tt.addptr %4, %3 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc75) + tt.store %5, %new_idxs_374, %tmp0_18 : tensor<32x16x!tt.ptr> loc(#loc76) + %6 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc77) + %7 = tt.addptr %6, %xindex_6 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc77) + tt.store %7, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":40:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc85 = loc(callsite(#loc1 at #loc2)) +#loc86 = loc("tmp10"(#loc3)) +#loc87 = loc("tmp0"(#loc4)) +#loc88 = loc("tmp0"(#loc5)) +#loc89 = loc("xmask"(#loc6)) +#loc90 = loc("xoffset"(#loc7)) +#loc91 = loc("xoffset"(#loc8)) +#loc92 = loc("xindex"(#loc9)) +#loc93 = loc("xindex"(#loc10)) +#loc94 = loc("xindex"(#loc11)) +#loc95 = loc("r0_index"(#loc12)) +#loc96 = loc("r0_index"(#loc13)) +#loc97 = loc("x0"(#loc14)) +#loc98 = loc("x1"(#loc15)) +#loc99 = loc("tmp0"(#loc16)) +#loc100 = loc("tmp0"(#loc17)) +#loc101 = loc("tmp0"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp2"(#loc20)) +#loc104 = loc("tmp4"(#loc21)) +#loc105 = loc("flip"(#loc22)) +#loc107 = loc("flip"(#loc24)) +#loc108 = loc("flip"(#loc25)) +#loc109 = loc("flip"(#loc26)) +#loc110 = loc("y"(#loc27)) +#loc111 = loc("left_mask"(#loc29)) +#loc112 = loc("ileft"(#loc30)) +#loc114 = loc("ileft"(#loc34)) +#loc115 = loc("ileft"(#loc35)) +#loc116 = loc("iright"(#loc36)) +#loc118 = loc("iright"(#loc38)) +#loc119 = loc("iright"(#loc39)) +#loc120 = loc("ileft"(#loc40)) +#loc121 = loc("iright"(#loc41)) +#loc122 = loc("y_idx"(#loc42)) +#loc123 = loc("left_idx"(#loc43)) +#loc124 = loc("left_idx"(#loc44)) +#loc125 = loc("input"(#loc45)) +#loc127 = loc("left_idx"(#loc47)) +#loc128 = loc("left_idx"(#loc48)) +#loc129 = loc("right_idx"(#loc49)) +#loc130 = loc("right_idx"(#loc50)) +#loc132 = loc("right_idx"(#loc52)) +#loc133 = loc("right_idx"(#loc53)) +#loc134 = loc("left_idx"(#loc54)) +#loc135 = loc("right_idx"(#loc55)) +#loc136 = loc("cond"(#loc56)) +#loc137 = loc("eq"(#loc57)) +#loc138 = loc("cond"(#loc58)) +#loc139 = loc("cond"(#loc59)) +#loc140 = loc("cond"(#loc60)) +#loc141 = loc("cond"(#loc61)) +#loc142 = loc("cond"(#loc62)) +#loc143 = loc("ret"(#loc63)) +#loc144 = loc("ret"(#loc64)) +#loc145 = loc("ret"(#loc65)) +#loc146 = loc("new_idxs"(#loc66)) +#loc147 = loc("new_idxs"(#loc67)) +#loc148 = loc("new_idxs"(#loc68)) +#loc149 = loc("tmp7"(#loc69)) +#loc151 = loc("tmp11"(#loc71)) +#loc152 = loc("tmp14"(#loc72)) +#loc153 = loc(callsite(#loc105 at #loc106)) +#loc154 = loc(callsite(#loc107 at #loc106)) +#loc155 = loc(callsite(#loc108 at #loc106)) +#loc156 = loc(callsite(#loc109 at #loc106)) +#loc158 = loc("cond"(#loc136)) +#loc159 = loc("eq"(#loc137)) +#loc160 = loc(callsite(#loc31 at #loc150)) +#loc162 = loc(callsite(#loc110 at #loc157)) +#loc163 = loc(callsite(#loc111 at #loc157)) +#loc164 = loc(callsite(#loc112 at #loc157)) +#loc166 = loc(callsite(#loc114 at #loc157)) +#loc167 = loc(callsite(#loc115 at #loc157)) +#loc168 = loc(callsite(#loc116 at #loc157)) +#loc170 = loc(callsite(#loc118 at #loc157)) +#loc171 = loc(callsite(#loc119 at #loc157)) +#loc172 = loc(callsite(#loc120 at #loc157)) +#loc173 = loc(callsite(#loc121 at #loc157)) +#loc174 = loc(callsite(#loc122 at #loc157)) +#loc175 = loc(callsite(#loc123 at #loc157)) +#loc176 = loc(callsite(#loc124 at #loc157)) +#loc178 = loc(callsite(#loc127 at #loc157)) +#loc179 = loc(callsite(#loc128 at #loc157)) +#loc180 = loc(callsite(#loc129 at #loc157)) +#loc181 = loc(callsite(#loc130 at #loc157)) +#loc183 = loc(callsite(#loc132 at #loc157)) +#loc184 = loc(callsite(#loc133 at #loc157)) +#loc185 = loc(callsite(#loc134 at #loc157)) +#loc186 = loc(callsite(#loc135 at #loc157)) +#loc187 = loc(callsite(#loc158 at #loc157)) +#loc188 = loc(callsite(#loc159 at #loc157)) +#loc189 = loc(callsite(#loc138 at #loc157)) +#loc190 = loc(callsite(#loc139 at #loc157)) +#loc191 = loc(callsite(#loc140 at #loc157)) +#loc192 = loc(callsite(#loc141 at #loc157)) +#loc193 = loc(callsite(#loc142 at #loc157)) +#loc194 = loc(callsite(#loc143 at #loc157)) +#loc195 = loc(callsite(#loc144 at #loc157)) +#loc196 = loc(callsite(#loc145 at #loc157)) +#loc197 = loc(callsite(#loc146 at #loc157)) +#loc198 = loc(callsite(#loc147 at #loc157)) +#loc199 = loc(callsite(#loc148 at #loc157)) +#loc200 = loc(callsite(#loc33 at #loc160)) +#loc201 = loc(callsite(#loc31 at #loc165)) +#loc203 = loc(callsite(#loc31 at #loc169)) +#loc205 = loc(callsite(#loc125 at #loc177)) +#loc206 = loc(callsite(#loc31 at #loc177)) +#loc208 = loc(callsite(#loc125 at #loc182)) +#loc209 = loc(callsite(#loc31 at #loc182)) +#loc211 = loc(callsite(#loc33 at #loc201)) +#loc212 = loc(callsite(#loc33 at #loc203)) +#loc213 = loc(callsite(#loc33 at #loc206)) +#loc214 = loc(callsite(#loc33 at #loc209)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1a12fd39b51487531cb3e0f08ef5b3674f692458 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..95bb0f052a436b0dc3cf9e68c5873099b7600210 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe709b21c3e6aa8f71f56fbbe3d526d7cf8e6c0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"hash": "00a907c0907a502b960f02358753ab661e4b3cc5f3e6a5b9ffadd78fd1872cd6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..c6caa8b24d8c8e433f09388473db1ed3ac97cccf --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,175 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 124, !dbg !9 + %13 = and i32 %11, 3, !dbg !10 + %14 = icmp sgt i32 %5, 0, !dbg !11 + br i1 %14, label %.lr.ph, label %._crit_edge, !dbg !11 + +.lr.ph: ; preds = %8 + %15 = mul i64 %3, %2, !dbg !12 + %16 = lshr exact i32 %12, 2, !dbg !9 + %17 = or disjoint i32 %16, %10, !dbg !13 + %18 = sext i32 %17 to i64, !dbg !14 + %.frozen = freeze i64 %2, !dbg !15 + %19 = sdiv i64 %18, %.frozen, !dbg !15 + %20 = mul i64 %15, %19, !dbg !16 + %21 = mul i64 %19, %.frozen, !dbg !14 + %.decomposed = sub i64 %18, %21, !dbg !14 + %22 = icmp slt i32 %17, %4, !dbg !17 + %23 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %23, i64 %20, !dbg !11 + %.fr = freeze i1 %22 + br i1 %.fr, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %24 = phi i32 [ %30, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %25 = or disjoint i32 %24, %13, !dbg !18 + %26 = sext i32 %25 to i64, !dbg !19 + %27 = mul i64 %2, %26, !dbg !19 + %gep.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %27, !dbg !20 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %29 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep.us, i64 %28, i1 false) #4, !dbg !21 + %30 = add i32 %24, 4, !dbg !11 + %31 = icmp slt i32 %30, %5, !dbg !11 + br i1 %31, label %.lr.ph.split.us, label %._crit_edge, !dbg !11 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %32 = phi i64 [ %40, %.lr.ph.split ], [ 0, %.lr.ph ] + %33 = phi i32 [ %41, %.lr.ph.split ], [ 0, %.lr.ph ] + %34 = or disjoint i32 %33, %13, !dbg !18 + %35 = icmp slt i32 %34, %5, !dbg !22 + %36 = sext i32 %34 to i64, !dbg !19 + %37 = mul i64 %2, %36, !dbg !19 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !20 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %39 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %38, i1 %35) #4, !dbg !21 + %narrow = select i1 %35, i32 %39, i32 0, !dbg !23 + %spec.select = sext i32 %narrow to i64, !dbg !23 + %40 = add i64 %32, %spec.select, !dbg !23 + %41 = add i32 %33, 4, !dbg !11 + %42 = icmp slt i32 %41, %5, !dbg !11 + br i1 %42, label %.lr.ph.split, label %._crit_edge, !dbg !11 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %40, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %43 = and i32 %11, 31, !dbg !9 + %44 = or disjoint i32 %10, %43, !dbg !13 + %45 = sext i32 %44 to i64, !dbg !14 + %.frozen12 = freeze i64 %2, !dbg !15 + %46 = sdiv i64 %45, %.frozen12, !dbg !15 + %47 = mul i64 %46, %.frozen12, !dbg !14 + %.decomposed13 = sub i64 %45, %47, !dbg !14 + %48 = icmp slt i32 %44, %4, !dbg !17 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !25 + %49 = trunc nuw i64 %extelt.offset to i32, !dbg !25 + %50 = trunc i64 %.lcssa to i32, !dbg !25 + %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 2, i32 31), !dbg !25 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 2, i32 31), !dbg !25 + %53 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !25 + %54 = insertelement <2 x i32> %53, i32 %52, i64 1, !dbg !25 + %55 = bitcast <2 x i32> %54 to i64, !dbg !25 + %56 = add i64 %.lcssa, %55, !dbg !29 + %extelt.offset2 = lshr i64 %56, 32, !dbg !25 + %57 = trunc nuw i64 %extelt.offset2 to i32, !dbg !25 + %58 = trunc i64 %56 to i32, !dbg !25 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 1, i32 31), !dbg !25 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 1, i32 31), !dbg !25 + %61 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !25 + %62 = insertelement <2 x i32> %61, i32 %60, i64 1, !dbg !25 + %63 = bitcast <2 x i32> %62 to i64, !dbg !25 + %64 = add i64 %56, %63, !dbg !29 + %65 = shl nuw nsw i32 %12, 1, !dbg !30 + %66 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %65, !dbg !30 + %67 = insertelement <1 x i64> poison, i64 %64, i64 0, !dbg !30 + store <1 x i64> %67, ptr addrspace(3) %66, align 8, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !30 + %68 = shl nuw nsw i32 %43, 3, !dbg !30 + %69 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %68, !dbg !30 + %70 = load i64, ptr addrspace(3) %69, align 8, !dbg !30 + %71 = trunc i64 %70 to i32, !dbg !31 + %72 = icmp slt i64 %2, 2, !dbg !32 + %73 = icmp sgt i64 %2, 1, !dbg !33 + %74 = select i1 %73, i64 %2, i64 0, !dbg !34 + %75 = zext i1 %72 to i64, !dbg !35 + %76 = add i64 %74, %75, !dbg !36 + %77 = mul i64 %46, %76, !dbg !37 + %78 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed13, !dbg !38 + %79 = getelementptr i32, ptr addrspace(1) %78, i64 %77, !dbg !38 + %80 = and i32 %11, 224, !dbg !39 + %81 = icmp eq i32 %80, 0, !dbg !39 + %82 = and i1 %81, %48, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %71, ptr addrspace(1) %79, i1 %82) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 37, scope: !4) +!11 = !DILocation(line: 30, column: 40, scope: !4) +!12 = !DILocation(line: 36, column: 54, scope: !4) +!13 = !DILocation(line: 22, column: 23, scope: !4) +!14 = !DILocation(line: 26, column: 19, scope: !4) +!15 = !DILocation(line: 27, column: 19, scope: !4) +!16 = !DILocation(line: 36, column: 58, scope: !4) +!17 = !DILocation(line: 23, column: 21, scope: !4) +!18 = !DILocation(line: 31, column: 31, scope: !4) +!19 = !DILocation(line: 36, column: 43, scope: !4) +!20 = !DILocation(line: 36, column: 34, scope: !4) +!21 = !DILocation(line: 36, column: 63, scope: !4) +!22 = !DILocation(line: 32, column: 29, scope: !4) +!23 = !DILocation(line: 40, column: 48, scope: !4) +!24 = !DILocation(line: 28, column: 43, scope: !4) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 41, column: 25, scope: !4) +!29 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !28) +!30 = !DILocation(line: 41, column: 28, scope: !4) +!31 = !DILocation(line: 42, column: 19, scope: !4) +!32 = !DILocation(line: 43, column: 49, scope: !4) +!33 = !DILocation(line: 43, column: 75, scope: !4) +!34 = !DILocation(line: 43, column: 66, scope: !4) +!35 = !DILocation(line: 43, scope: !4) +!36 = !DILocation(line: 43, column: 57, scope: !4) +!37 = !DILocation(line: 43, column: 34, scope: !4) +!38 = !DILocation(line: 43, column: 25, scope: !4) +!39 = !DILocation(line: 43, column: 88, scope: !4) +!40 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx new file mode 100644 index 0000000000000000000000000000000000000000..22453b593a42c20f78c7d21a34fe2e5ce3a0e39f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx @@ -0,0 +1,508 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7 +) +.reqntid 256 +{ + .reg .pred %p<14>; + .reg .b32 %r<50>; + .reg .b64 %rd<71>; + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 + +// %bb.0: + ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5]; + ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4]; + ld.param.b64 %rd19, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2]; +$L__tmp0: + .loc 1 21 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:28 + mov.u32 %r12, %ctaid.x; + .loc 1 21 33 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:33 + shl.b32 %r1, %r12, 5; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 124; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + setp.gt.s32 %p1, %r11, 0; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: // %.lr.ph + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd20, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0]; + .loc 1 36 54 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:54 + mul.lo.s64 %rd2, %rd20, %rd19; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + shr.u32 %r13, %r3, 2; + .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23 + or.b32 %r14, %r13, %r1; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.s64.s32 %rd3, %r14; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + or.b64 %rd22, %rd3, %rd19; + and.b64 %rd23, %rd22, -4294967296; + setp.ne.b64 %p2, %rd23, 0; + cvt.u32.u64 %r47, %rd3; + @%p2 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + div.s64 %rd67, %rd3, %rd19; + bra.uni $L__BB0_5; +$L__BB0_1: // %.._crit_edge_crit_edge + .loc 1 0 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:19 + mov.b64 %rd69, 0; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + bra.uni $L__BB0_10; +$L__BB0_3: + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + cvt.u32.u64 %r15, %rd19; + div.u32 %r17, %r47, %r15; + cvt.u64.u32 %rd67, %r17; +$L__BB0_5: + .loc 1 0 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0 + and.b32 %r4, %r2, 3; + .loc 1 36 58 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:58 + mul.lo.s64 %rd24, %rd2, %rd67; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd25, %rd67, %rd19; + sub.s64 %rd26, %rd3, %rd25; + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p3, %r47, %r10; + shl.b64 %rd27, %rd26, 2; + add.s64 %rd28, %rd17, %rd27; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + shl.b64 %rd29, %rd24, 2; + add.s64 %rd8, %rd28, %rd29; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_6; +$L__BB0_8: // %.lr.ph.split.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r49, 0; + mov.b64 %rd69, 0; +$L__BB0_9: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 32 29 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:32:29 + add.s32 %r24, %r4, %r49; + setp.lt.s32 %p6, %r24, %r11; + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + cvt.s64.s32 %rd41, %r24; + mul.lo.s64 %rd42, %rd19, %rd41; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd43, %rd42, 2; + add.s64 %rd39, %rd8, %rd43; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd38, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd38, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd39 + 0 ], %rd38; + // end inline asm + .loc 1 40 48 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:40:48 + selp.b32 %r25, %r23, 0, %p6; + cvt.s64.s32 %rd44, %r25; + add.s64 %rd69, %rd69, %rd44; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r49, %r49, 4; + setp.lt.s32 %p7, %r49, %r11; + @%p7 bra $L__BB0_9; + bra.uni $L__BB0_10; +$L__BB0_6: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r48, 0; +$L__BB0_7: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + add.s32 %r21, %r4, %r48; + cvt.s64.s32 %rd34, %r21; + mul.lo.s64 %rd35, %rd19, %rd34; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd36, %rd35, 2; + add.s64 %rd31, %rd8, %rd36; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd30, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r20, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd31 + 0 ], %rd30; + // end inline asm + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r48, %r48, 4; + setp.lt.s32 %p5, %r48, %r11; + mov.b64 %rd69, 0; + @%p5 bra $L__BB0_7; +$L__BB0_10: // %._crit_edge + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd18, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1]; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + and.b32 %r9, %r2, 31; + .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23 + or.b32 %r26, %r1, %r9; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.s64.s32 %rd13, %r26; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + or.b64 %rd45, %rd13, %rd19; + and.b64 %rd46, %rd45, -4294967296; + setp.ne.b64 %p8, %rd46, 0; + cvt.u32.u64 %r46, %rd13; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_11; +$L__BB0_12: + div.s64 %rd70, %rd13, %rd19; + bra.uni $L__BB0_13; +$L__BB0_11: + cvt.u32.u64 %r27, %rd19; + div.u32 %r29, %r46, %r27; + cvt.u64.u32 %rd70, %r29; +$L__BB0_13: + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd48, %rd70, %rd19; + sub.s64 %rd49, %rd13, %rd48; + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p10, %r46, %r10; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r32}, %rd69; + cvt.u32.u64 %r33, %rd69; + shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 2, 31, -1; + cvt.u64.u32 %rd50, %r34; + cvt.u64.u32 %rd51, %r35; + shl.b64 %rd52, %rd51, 32; + or.b64 %rd53, %rd50, %rd52; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd54, %rd69, %rd53; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r36}, %rd54; + cvt.u32.u64 %r37, %rd54; + shfl.sync.bfly.b32 %r38, %r37, 1, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 1, 31, -1; + cvt.u64.u32 %rd55, %r38; + cvt.u64.u32 %rd56, %r39; + shl.b64 %rd57, %rd56, 32; + or.b64 %rd58, %rd55, %rd57; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd59, %rd54, %rd58; +$L__tmp2: + .loc 1 41 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:28 + shl.b32 %r40, %r3, 1; + mov.b32 %r41, global_smem; + add.s32 %r42, %r41, %r40; + st.shared.b64 [%r42], %rd59; + bar.sync 0; + shl.b32 %r43, %r9, 3; + add.s32 %r44, %r41, %r43; + ld.shared.b32 %r30, [%r44]; + .loc 1 43 49 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:49 + setp.lt.s64 %p11, %rd19, 2; + .loc 1 43 75 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:75 + setp.gt.s64 %p12, %rd19, 1; + .loc 1 43 66 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:66 + selp.b64 %rd60, %rd19, 0, %p12; + .loc 1 43 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43 + selp.b64 %rd61, 1, 0, %p11; + .loc 1 43 57 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:57 + add.s64 %rd62, %rd60, %rd61; + .loc 1 43 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:34 + mul.lo.s64 %rd63, %rd70, %rd62; + .loc 1 43 25 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:25 + shl.b64 %rd64, %rd49, 2; + add.s64 %rd65, %rd18, %rd64; + shl.b64 %rd66, %rd63, 2; + add.s64 %rd47, %rd65, %rd66; + .loc 1 43 88 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:88 + and.b32 %r45, %r2, 224; + setp.eq.b32 %p13, %r45, 0; + and.pred %p9, %p13, %p10; + // begin inline asm + @%p9 st.global.b32 [ %rd47 + 0 ], { %r30 }; + // end inline asm + .loc 1 43 4 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 238 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 117 +.b8 99 +.b8 102 +.b8 50 +.b8 105 +.b8 116 +.b8 54 +.b8 102 +.b8 119 +.b8 104 +.b8 109 +.b8 114 +.b8 117 +.b8 51 +.b8 50 +.b8 118 +.b8 112 +.b8 121 +.b8 50 +.b8 120 +.b8 109 +.b8 119 +.b8 107 +.b8 107 +.b8 52 +.b8 118 +.b8 103 +.b8 55 +.b8 112 +.b8 112 +.b8 110 +.b8 111 +.b8 120 +.b8 109 +.b8 104 +.b8 110 +.b8 107 +.b8 102 +.b8 54 +.b8 105 +.b8 102 +.b8 112 +.b8 98 +.b8 50 +.b8 104 +.b8 50 +.b8 116 +.b8 106 +.b8 115 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 117 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 53 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source new file mode 100644 index 0000000000000000000000000000000000000000..a1cb81c5d6e0d66b84cc6e55d95db736a49a9560 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source @@ -0,0 +1,193 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc43 = loc(unknown) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr1"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc85 = loc("input"(#loc41)) +#loc86 = loc("a"(#loc46)) +#loc87 = loc("b"(#loc46)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_0 = arith.constant 32 : i32 loc(#loc57) + %xoffset_1 = arith.constant 32 : i32 loc(#loc57) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc58) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc59) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<32x1xi32> loc(#loc60) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<32x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc61) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<32x1xi32> loc(#loc61) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc62) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc63) + %x0 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc64) + %x0_8 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc64) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc64) + %x1 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc65) + %x1_10 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc65) + %x1_11 = arith.divsi %x1, %x1_10 : tensor<32x1xi64> loc(#loc65) + %_tmp3 = arith.constant 0 : i64 loc(#loc66) + %_tmp3_12 = arith.constant dense<0> : tensor<32x4xi64> loc(#loc66) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c4_i32 = arith.constant 4 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<32x4xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc68) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x4xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x4xi32> loc(#loc69) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x4xi32> loc(#loc69) + %tmp0 = arith.extsi %r0_index_19 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc70) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x4xi64> loc(#loc70) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x4xi64> loc(#loc70) + %tmp0_23 = tt.broadcast %x0_9 : tensor<32x1xi64> -> tensor<32x4xi64> loc(#loc71) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x4xi64> -> tensor<32x4xi64> loc(#loc71) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<32x4xi64> loc(#loc71) + %tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<32x1xi64> loc(#loc73) + %tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<32x1xi64> loc(#loc73) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<32x1xi64> -> tensor<32x4xi64> loc(#loc74) + %tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<32x4xi64> loc(#loc74) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x4x!tt.ptr> loc(#loc75) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<32x4x!tt.ptr>, tensor<32x4xi64> loc(#loc75) + %tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x4xi1> -> tensor<32x4xi1> loc(#loc76) + %tmp0_34 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x4xi1> loc(#loc76) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<32x4xi1> loc(#loc76) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<32x4xf32> loc(#loc77) + %tmp0_38 = arith.fptosi %tmp0_37 : tensor<32x4xf32> to tensor<32x4xi32> loc(#loc77) + %tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<32x4x!tt.ptr> loc(#loc77) + %tmp1 = arith.extsi %tmp0_39 : tensor<32x4xi32> to tensor<32x4xi64> loc(#loc78) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<32x4xi64> loc(#loc79) + %_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x4xi1> -> tensor<32x4xi1> loc(#loc80) + %_tmp3_41 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x4xi1> loc(#loc80) + %_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<32x4xi1> loc(#loc80) + %_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<32x4xi1>, tensor<32x4xi64> loc(#loc81) + scf.yield %_tmp3_43 : tensor<32x4xi64> loc(#loc27) + } loc(#loc67) + %tmp3 = tt.call @"triton.language.standard.sum__i64S32_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<32x4xi64>) -> tensor<32xi64> loc(#loc82) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc83) + %tmp5 = arith.trunci %tmp3_14 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc84) + %c1_i32 = arith.constant 1 : i32 loc(#loc31) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31) + %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc32) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc32) + %6 = arith.extui %5 : i1 to i32 loc(#loc32) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc33) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33) + %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33) + %10 = arith.extui %9 : i1 to i64 loc(#loc34) + %11 = arith.muli %ks0, %10 : i64 loc(#loc34) + %12 = arith.extsi %7 : i32 to i64 loc(#loc35) + %13 = arith.addi %12, %11 : i64 loc(#loc35) + %14 = tt.splat %13 : i64 -> tensor<32x1xi64> loc(#loc36) + %15 = arith.muli %x1_11, %14 : tensor<32x1xi64> loc(#loc36) + %16 = arith.addi %x0_9, %15 : tensor<32x1xi64> loc(#loc37) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc38) + %18 = tt.addptr %17, %16 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> loc(#loc38) + tt.store %18, %tmp5, %xmask_6 : tensor<32x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S32_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x4xi64> loc("input"(#loc41))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42) + tt.reduce.return %2 : i64 loc(#loc42) + }) : (tensor<32x4xi64>) -> tensor<32xi64> loc(#loc42) + tt.return %0 : tensor<32xi64> loc(#loc44) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc45) + tt.return %1 : tensor<32xi64> loc(#loc45) + } loc(#loc41) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc47) + tt.return %0 : i64 loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc49) + tt.return %1 : i64 loc(#loc49) + } loc(#loc46) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc56 = loc("xoffset"(#loc1)) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("xindex"(#loc3)) +#loc59 = loc("xindex"(#loc4)) +#loc60 = loc("xindex"(#loc5)) +#loc61 = loc("xmask"(#loc6)) +#loc62 = loc("r0_base"(#loc7)) +#loc63 = loc("r0_base"(#loc8)) +#loc64 = loc("x0"(#loc9)) +#loc65 = loc("x1"(#loc10)) +#loc66 = loc("_tmp3"(#loc11)) +#loc67 = loc("_tmp3"(#loc12)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp0"(#loc20)) +#loc76 = loc("tmp0"(#loc21)) +#loc77 = loc("tmp0"(#loc22)) +#loc78 = loc("tmp1"(#loc23)) +#loc79 = loc("tmp4"(#loc24)) +#loc80 = loc("_tmp3"(#loc25)) +#loc81 = loc("_tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc28)) +#loc83 = loc("tmp3"(#loc29)) +#loc84 = loc("tmp5"(#loc30)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..6cb85d93b279e0eb1841d4ebca06d1f82034210d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir @@ -0,0 +1,159 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("ks0"(#loc)) +#loc43 = loc("ks1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp3"(#loc26)) +#loc73 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x4xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x4xi64, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_1 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc48) + %xindex_4 = tt.expand_dims %xindex_2 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc48) + %xindex_5 = tt.splat %xoffset_1 : i32 -> tensor<32x1xi32, #blocked> loc(#loc49) + %xindex_6 = tt.splat %xoffset_1 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc49) + %xindex_7 = arith.addi %xindex_5, %xindex_3 : tensor<32x1xi32, #blocked> loc(#loc49) + %xindex_8 = arith.addi %xindex_6, %xindex_4 : tensor<32x1xi32, #blocked1> loc(#loc49) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked> loc(#loc50) + %xmask_9 = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked1> loc(#loc50) + %xmask_10 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32, #blocked> loc(#loc50) + %xmask_11 = arith.cmpi slt, %xindex_8, %xmask_9 : tensor<32x1xi32, #blocked1> loc(#loc50) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51) + %r0_base_12 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc51) + %x0 = arith.extsi %xindex_7 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc52) + %x0_13 = arith.extsi %xindex_8 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc52) + %x0_14 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked> loc(#loc52) + %x0_15 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked1> loc(#loc52) + %x0_16 = arith.remsi %x0, %x0_14 : tensor<32x1xi64, #blocked> loc(#loc52) + %x0_17 = arith.remsi %x0_13, %x0_15 : tensor<32x1xi64, #blocked1> loc(#loc52) + %x1 = arith.divsi %x0, %x0_14 : tensor<32x1xi64, #blocked> loc(#loc53) + %x1_18 = arith.divsi %x0_13, %x0_15 : tensor<32x1xi64, #blocked1> loc(#loc53) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x4xi32, #blocked> loc(#loc54) + %tmp0 = tt.splat %ks0 : i64 -> tensor<1x4xi64, #blocked> loc(#loc55) + %tmp0_19 = tt.broadcast %x0_16 : tensor<32x1xi64, #blocked> -> tensor<32x4xi64, #blocked> loc(#loc56) + %tmp0_20 = arith.muli %ks0, %ks1 : i64 loc(#loc57) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<32x1xi64, #blocked> loc(#loc58) + %tmp0_22 = arith.muli %tmp0_21, %x1 : tensor<32x1xi64, #blocked> loc(#loc58) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<32x1xi64, #blocked> -> tensor<32x4xi64, #blocked> loc(#loc59) + %tmp0_24 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x4x!tt.ptr, #blocked> loc(#loc60) + %tmp0_25 = tt.broadcast %xmask_10 : tensor<32x1xi1, #blocked> -> tensor<32x4xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c4_i32 iter_args(%_tmp3_28 = %cst_0) -> (tensor<32x4xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc63) + %r0_index_29 = arith.addi %r0_index, %r0_base_12 : tensor<1x4xi32, #blocked> loc(#loc63) + %r0_mask_30 = arith.cmpi slt, %r0_index_29, %r0_mask : tensor<1x4xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.extsi %r0_index_29 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc55) + %tmp0_32 = arith.muli %tmp0, %tmp0_31 : tensor<1x4xi64, #blocked> loc(#loc55) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x4xi64, #blocked> -> tensor<32x4xi64, #blocked> loc(#loc56) + %tmp0_34 = arith.addi %tmp0_19, %tmp0_33 : tensor<32x4xi64, #blocked> loc(#loc56) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_23 : tensor<32x4xi64, #blocked> loc(#loc59) + %tmp0_36 = tt.addptr %tmp0_24, %tmp0_35 : tensor<32x4x!tt.ptr, #blocked>, tensor<32x4xi64, #blocked> loc(#loc60) + %tmp0_37 = tt.broadcast %r0_mask_30 : tensor<1x4xi1, #blocked> -> tensor<32x4xi1, #blocked> loc(#loc61) + %tmp0_38 = arith.andi %tmp0_37, %tmp0_25 : tensor<32x4xi1, #blocked> loc(#loc61) + %tmp0_39 = tt.load %tmp0_36, %tmp0_38, %cst evictionPolicy = evict_last : tensor<32x4x!tt.ptr, #blocked> loc(#loc64) + %tmp1 = arith.extsi %tmp0_39 : tensor<32x4xi32, #blocked> to tensor<32x4xi64, #blocked> loc(#loc65) + %tmp4 = arith.addi %_tmp3_28, %tmp1 : tensor<32x4xi64, #blocked> loc(#loc66) + %_tmp3_40 = arith.select %tmp0_38, %tmp4, %_tmp3_28 : tensor<32x4xi1, #blocked>, tensor<32x4xi64, #blocked> loc(#loc67) + scf.yield %_tmp3_40 : tensor<32x4xi64, #blocked> loc(#loc24) + } loc(#loc62) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_28: i64 loc(callsite(#loc1 at #loc68)), %tmp3_29: i64 loc(callsite(#loc1 at #loc68))): + %tmp3_30 = arith.addi %tmp3_28, %tmp3_29 : i64 loc(#loc74) + tt.reduce.return %tmp3_30 : i64 loc(#loc72) + }) : (tensor<32x4xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72) + %tmp3_26 = ttg.convert_layout %tmp3 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp3_27 = tt.expand_dims %tmp3_26 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi64, #blocked1> loc(#loc69) + %tmp5 = arith.trunci %tmp3_27 : tensor<32x1xi64, #blocked1> to tensor<32x1xi32, #blocked1> loc(#loc70) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31) + %2 = arith.extui %1 : i1 to i64 loc(#loc32) + %3 = arith.muli %ks0, %2 : i64 loc(#loc32) + %4 = arith.extui %0 : i1 to i64 loc(#loc71) + %5 = arith.addi %4, %3 : i64 loc(#loc33) + %6 = tt.splat %5 : i64 -> tensor<32x1xi64, #blocked1> loc(#loc35) + %7 = arith.muli %x1_18, %6 : tensor<32x1xi64, #blocked1> loc(#loc35) + %8 = arith.addi %x0_17, %7 : tensor<32x1xi64, #blocked1> loc(#loc36) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked1> loc(#loc37) + %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr, #blocked1>, tensor<32x1xi64, #blocked1> loc(#loc37) + tt.store %10, %tmp5, %xmask_11 : tensor<32x1x!tt.ptr, #blocked1> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("xmask"(#loc6)) +#loc51 = loc("r0_base"(#loc7)) +#loc52 = loc("x0"(#loc8)) +#loc53 = loc("x1"(#loc9)) +#loc54 = loc("r0_mask"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("_tmp3"(#loc18)) +#loc63 = loc("r0_index"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp1"(#loc21)) +#loc66 = loc("tmp4"(#loc22)) +#loc67 = loc("_tmp3"(#loc23)) +#loc69 = loc("tmp3"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc(fused[#loc33, #loc34]) +#loc72 = loc(callsite(#loc25 at #loc68)) +#loc74 = loc(callsite(#loc27 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3f8b59411de46d555d6dedfa1854b51b901280dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ACUQPQEQPJICXFQPAI2YOU5LMYPEWPGF6PTKLOP7VXLY7UMHFTLA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir @@ -0,0 +1,152 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc43 = loc("in_ptr0"(#loc)) +#loc44 = loc("out_ptr1"(#loc)) +#loc45 = loc("ks0"(#loc)) +#loc46 = loc("ks1"(#loc)) +#loc47 = loc("xnumel"(#loc)) +#loc48 = loc("r0_numel"(#loc)) +#loc74 = loc("tmp3"(#loc29)) +#loc79 = loc(callsite(#loc1 at #loc74)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<32x4xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<32x4xi64> loc(#loc49) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_0 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc52) + %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc53) + %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<32x1xi32> loc(#loc54) + %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<32x1xi32> loc(#loc54) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc55) + %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<32x1xi32> loc(#loc55) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc56) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc57) + %x0 = arith.extsi %xindex_3 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc58) + %x0_6 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc58) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc58) + %x1 = arith.divsi %x0, %x0_6 : tensor<32x1xi64> loc(#loc59) + %_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c4_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<32x4xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc61) + %r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x4xi32> loc(#loc61) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x4xi32> loc(#loc62) + %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x4xi32> loc(#loc62) + %tmp0 = arith.extsi %r0_index_11 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc63) + %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x4xi64> loc(#loc63) + %tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x4xi64> loc(#loc63) + %tmp0_15 = tt.broadcast %x0_7 : tensor<32x1xi64> -> tensor<32x4xi64> loc(#loc64) + %tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x4xi64> -> tensor<32x4xi64> loc(#loc64) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x4xi64> loc(#loc64) + %tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65) + %tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<32x1xi64> loc(#loc66) + %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<32x1xi64> loc(#loc66) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<32x1xi64> -> tensor<32x4xi64> loc(#loc67) + %tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<32x4xi64> loc(#loc67) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x4x!tt.ptr> loc(#loc68) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<32x4x!tt.ptr>, tensor<32x4xi64> loc(#loc68) + %tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x4xi1> -> tensor<32x4xi1> loc(#loc69) + %tmp0_26 = tt.broadcast %xmask_4 : tensor<32x1xi1> -> tensor<32x4xi1> loc(#loc69) + %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<32x4xi1> loc(#loc69) + %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<32x4x!tt.ptr> loc(#loc70) + %tmp1 = arith.extsi %tmp0_28 : tensor<32x4xi32> to tensor<32x4xi64> loc(#loc71) + %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<32x4xi64> loc(#loc72) + %_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<32x4xi1>, tensor<32x4xi64> loc(#loc73) + scf.yield %_tmp3_29 : tensor<32x4xi64> loc(#loc27) + } loc(#loc60) + %tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))): + %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80) + tt.reduce.return %tmp3_12 : i64 loc(#loc78) + }) : (tensor<32x4xi64>) -> tensor<32xi64> loc(#loc78) + %tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc75) + %tmp5 = arith.trunci %tmp3_9 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc76) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34) + %2 = arith.extui %1 : i1 to i64 loc(#loc35) + %3 = arith.muli %ks0, %2 : i64 loc(#loc35) + %4 = arith.extui %0 : i1 to i64 loc(#loc77) + %5 = arith.addi %4, %3 : i64 loc(#loc36) + %6 = tt.splat %5 : i64 -> tensor<32x1xi64> loc(#loc38) + %7 = arith.muli %x1, %6 : tensor<32x1xi64> loc(#loc38) + %8 = arith.addi %x0_7, %7 : tensor<32x1xi64> loc(#loc39) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc40) + %10 = tt.addptr %9, %8 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> loc(#loc40) + tt.store %10, %tmp5, %xmask_4 : tensor<32x1x!tt.ptr> loc(#loc41) + tt.return loc(#loc42) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc49 = loc("_tmp3"(#loc3)) +#loc50 = loc("xoffset"(#loc4)) +#loc51 = loc("xoffset"(#loc5)) +#loc52 = loc("xindex"(#loc6)) +#loc53 = loc("xindex"(#loc7)) +#loc54 = loc("xindex"(#loc8)) +#loc55 = loc("xmask"(#loc9)) +#loc56 = loc("r0_base"(#loc10)) +#loc57 = loc("r0_base"(#loc11)) +#loc58 = loc("x0"(#loc12)) +#loc59 = loc("x1"(#loc13)) +#loc60 = loc("_tmp3"(#loc2)) +#loc61 = loc("r0_index"(#loc14)) +#loc62 = loc("r0_mask"(#loc15)) +#loc63 = loc("tmp0"(#loc16)) +#loc64 = loc("tmp0"(#loc17)) +#loc65 = loc("tmp0"(#loc18)) +#loc66 = loc("tmp0"(#loc19)) +#loc67 = loc("tmp0"(#loc20)) +#loc68 = loc("tmp0"(#loc21)) +#loc69 = loc("tmp0"(#loc22)) +#loc70 = loc("tmp0"(#loc23)) +#loc71 = loc("tmp1"(#loc24)) +#loc72 = loc("tmp4"(#loc25)) +#loc73 = loc("_tmp3"(#loc26)) +#loc75 = loc("tmp3"(#loc31)) +#loc76 = loc("tmp5"(#loc32)) +#loc77 = loc(fused[#loc36, #loc37]) +#loc78 = loc(callsite(#loc28 at #loc74)) +#loc80 = loc(callsite(#loc30 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6bf115ae8b6bd8a7dcdbd50718801e571b6581ea --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..23c449804f3312e54485e525b4fa8b795349bb2c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2e24a8d2fb96551654c519b583f69bd398f40b0a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "00c59fc984d266ad088fac53f4f2eb19a06a315dab54d30fa39d98e13a72e275", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..5b0f3539bbb7d3dc04b0dca85d95f2bab5d2bc28 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.llir @@ -0,0 +1,206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 6, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 126, !dbg !9 + %14 = lshr exact i32 %13, 1, !dbg !9 + %15 = or disjoint i32 %14, %11, !dbg !10 + %16 = icmp slt i32 %15, %5, !dbg !11 + %17 = shl nuw nsw i32 %12, 2, !dbg !12 + %18 = and i32 %17, 4, !dbg !12 + %19 = sext i32 %15 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %20 = sdiv i64 %19, %.frozen, !dbg !14 + %21 = mul i64 %20, %.frozen, !dbg !13 + %.decomposed = sub i64 %19, %21, !dbg !13 + %22 = srem i64 %20, 32, !dbg !15 + %23 = sdiv i64 %19, %4, !dbg !16 + %.not = icmp ne i64 %.decomposed, 0, !dbg !17 + %24 = icmp slt i32 %11, 0, !dbg !21 + %25 = icmp slt i64 %3, 0, !dbg !22 + %26 = xor i1 %24, %25, !dbg !23 + %narrow = select i1 %26, i1 %.not, i1 false, !dbg !24 + %27 = sext i1 %narrow to i64, !dbg !24 + %28 = add nsw i64 %20, %27, !dbg !24 + %29 = shl i64 %3, 12, !dbg !25 + %30 = mul i64 %29, %23, !dbg !26 + %31 = icmp slt i64 %3, 2, !dbg !27 + %32 = icmp sgt i64 %3, 1, !dbg !28 + %33 = select i1 %32, i64 %3, i64 0, !dbg !29 + %34 = zext i1 %31 to i64, !dbg !30 + %35 = add i64 %33, %34, !dbg !31 + %36 = shl i64 %35, 7, !dbg !32 + %37 = mul i64 %36, %28, !dbg !33 + %.idx = shl nsw i64 %22, 8 + %38 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %.idx1 = shl nsw i64 %.decomposed, 13 + %invariant.gep = getelementptr i8, ptr addrspace(1) %38, i64 %.idx1, !dbg !34 + %invariant.gep3 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !34 + %.idx2 = shl nsw i64 %.decomposed, 8 + %39 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2 + %invariant.gep5 = getelementptr bfloat, ptr addrspace(1) %39, i64 %37, !dbg !34 + %40 = zext nneg i32 %18 to i64, !dbg !34 + br label %41, !dbg !34 + +41: ; preds = %9, %41 + %indvars.iv = phi i64 [ 0, %9 ], [ %indvars.iv.next, %41 ] + %42 = phi float [ 0.000000e+00, %9 ], [ %83, %41 ] + %43 = phi float [ 0.000000e+00, %9 ], [ %84, %41 ] + %44 = phi float [ 0.000000e+00, %9 ], [ %85, %41 ] + %45 = phi float [ 0.000000e+00, %9 ], [ %86, %41 ] + %46 = or disjoint i64 %indvars.iv, %40, !dbg !35 + %gep4 = getelementptr bfloat, ptr addrspace(1) %invariant.gep3, i64 %46, !dbg !36 + %47 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !37 + %48 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep4, i64 %47, i1 %16) #4, !dbg !37 + %49 = extractvalue { i32, i32 } %48, 0, !dbg !37 + %50 = bitcast i32 %49 to <2 x bfloat>, !dbg !37 + %51 = extractvalue { i32, i32 } %48, 1, !dbg !37 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !37 + %53 = extractelement <2 x bfloat> %50, i64 0, !dbg !37 + %54 = extractelement <2 x bfloat> %50, i64 1, !dbg !37 + %55 = extractelement <2 x bfloat> %52, i64 0, !dbg !37 + %56 = extractelement <2 x bfloat> %52, i64 1, !dbg !37 + %57 = fpext bfloat %53 to float, !dbg !38 + %58 = fpext bfloat %54 to float, !dbg !38 + %59 = fpext bfloat %55 to float, !dbg !38 + %60 = fpext bfloat %56 to float, !dbg !38 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep5, i64 %46, !dbg !39 + %61 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %62 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %gep, i64 %61, i1 %16) #4, !dbg !40 + %63 = extractvalue { i32, i32 } %62, 0, !dbg !40 + %64 = bitcast i32 %63 to <2 x bfloat>, !dbg !40 + %65 = extractvalue { i32, i32 } %62, 1, !dbg !40 + %66 = bitcast i32 %65 to <2 x bfloat>, !dbg !40 + %67 = extractelement <2 x bfloat> %64, i64 0, !dbg !40 + %68 = extractelement <2 x bfloat> %64, i64 1, !dbg !40 + %69 = extractelement <2 x bfloat> %66, i64 0, !dbg !40 + %70 = extractelement <2 x bfloat> %66, i64 1, !dbg !40 + %71 = fpext bfloat %67 to float, !dbg !41 + %72 = fpext bfloat %68 to float, !dbg !41 + %73 = fpext bfloat %69 to float, !dbg !41 + %74 = fpext bfloat %70 to float, !dbg !41 + %75 = fmul float %57, %71, !dbg !42 + %76 = fmul float %58, %72, !dbg !42 + %77 = fmul float %59, %73, !dbg !42 + %78 = fmul float %60, %74, !dbg !42 + %79 = fadd float %42, %75, !dbg !43 + %80 = fadd float %43, %76, !dbg !43 + %81 = fadd float %44, %77, !dbg !43 + %82 = fadd float %45, %78, !dbg !43 + %83 = select i1 %16, float %79, float %42, !dbg !44 + %84 = select i1 %16, float %80, float %43, !dbg !44 + %85 = select i1 %16, float %81, float %44, !dbg !44 + %86 = select i1 %16, float %82, float %45, !dbg !44 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8, !dbg !34 + %87 = icmp samesign ult i64 %indvars.iv, 120, !dbg !34 + br i1 %87, label %41, label %88, !dbg !34 + +88: ; preds = %41 + %89 = and i32 %12, 63, !dbg !9 + %90 = or disjoint i32 %11, %89, !dbg !10 + %91 = icmp slt i32 %90, %5, !dbg !11 + %92 = fadd float %83, %84, !dbg !45 + %93 = fadd float %85, %92, !dbg !45 + %94 = fadd float %86, %93, !dbg !45 + %95 = bitcast float %94 to i32, !dbg !49 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 1, i32 31), !dbg !49 + %97 = bitcast i32 %96 to float, !dbg !49 + %98 = fadd float %94, %97, !dbg !45 + %99 = shl nuw nsw i32 %13, 1, !dbg !50 + %100 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %99, !dbg !50 + store float %98, ptr addrspace(3) %100, align 4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %101 = shl nuw nsw i32 %89, 2, !dbg !50 + %102 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %101, !dbg !50 + %103 = load i32, ptr addrspace(3) %102, align 4, !dbg !50 + %104 = sext i32 %90 to i64, !dbg !51 + %105 = getelementptr float, ptr addrspace(1) %2, i64 %104, !dbg !51 + %106 = and i32 %12, 64, !dbg !52 + %107 = icmp eq i32 %106, 0, !dbg !52 + %108 = and i1 %107, %91, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %103, ptr addrspace(1) %105, i1 %108) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 21, scope: !4) +!15 = !DILocation(line: 28, column: 28, scope: !4) +!16 = !DILocation(line: 29, column: 19, scope: !4) +!17 = !DILocation(line: 74, column: 34, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 30, column: 51, scope: !4) +!21 = !DILocation(line: 75, column: 25, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 75, column: 36, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 75, column: 32, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 75, column: 47, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 39, column: 65, scope: !4) +!26 = !DILocation(line: 39, column: 69, scope: !4) +!27 = !DILocation(line: 40, column: 73, scope: !4) +!28 = !DILocation(line: 40, column: 99, scope: !4) +!29 = !DILocation(line: 40, column: 90, scope: !4) +!30 = !DILocation(line: 40, scope: !4) +!31 = !DILocation(line: 40, column: 81, scope: !4) +!32 = !DILocation(line: 40, column: 54, scope: !4) +!33 = !DILocation(line: 40, column: 58, scope: !4) +!34 = !DILocation(line: 33, column: 40, scope: !4) +!35 = !DILocation(line: 34, column: 31, scope: !4) +!36 = !DILocation(line: 39, column: 34, scope: !4) +!37 = !DILocation(line: 39, column: 74, scope: !4) +!38 = !DILocation(line: 39, column: 136, scope: !4) +!39 = !DILocation(line: 40, column: 34, scope: !4) +!40 = !DILocation(line: 40, column: 106, scope: !4) +!41 = !DILocation(line: 40, column: 168, scope: !4) +!42 = !DILocation(line: 41, column: 22, scope: !4) +!43 = !DILocation(line: 43, column: 23, scope: !4) +!44 = !DILocation(line: 44, column: 48, scope: !4) +!45 = !DILocation(line: 261, column: 15, scope: !46, inlinedAt: !48) +!46 = distinct !DILexicalBlockFile(scope: !4, file: !47, discriminator: 0) +!47 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!48 = !DILocation(line: 45, column: 25, scope: !4) +!49 = !DILocation(line: 291, column: 36, scope: !46, inlinedAt: !48) +!50 = !DILocation(line: 45, column: 28, scope: !4) +!51 = !DILocation(line: 49, column: 25, scope: !4) +!52 = !DILocation(line: 49, column: 36, scope: !4) +!53 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..456dc67c5c336561d1f35004e9d767984bda726e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ptx @@ -0,0 +1,474 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 128 +{ + .reg .pred %p<16>; + .reg .b16 %rs<9>; + .reg .b32 %r<66>; + .reg .b64 %rd<67>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd24, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd23, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r14, %ctaid.x; + .loc 1 22 33 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:33 + shl.b32 %r1, %r14, 6; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r4, %r2, 1, 6; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r15, %r4, %r1; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.s64.s32 %rd1, %r15; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + or.b64 %rd26, %rd1, %rd23; + and.b64 %rd27, %rd26, -4294967296; + setp.ne.b64 %p1, %rd27, 0; + cvt.u32.u64 %r61, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd62, %rd1, %rd23; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r16, %rd23; + div.u32 %r18, %r61, %r16; + cvt.u64.u32 %rd62, %r18; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r13, [triton_red_fused_zeros_0_param_5]; + ld.param.b64 %rd21, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd20, [triton_red_fused_zeros_0_param_0]; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd6, %rd62, %rd23; + sub.s64 %rd7, %rd1, %rd6; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + shr.s64 %rd28, %rd62, 63; + shr.u64 %rd29, %rd28, 59; + add.s64 %rd30, %rd62, %rd29; + and.b64 %rd31, %rd30, -32; + sub.s64 %rd8, %rd62, %rd31; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + or.b64 %rd32, %rd1, %rd24; + and.b64 %rd33, %rd32, -4294967296; + setp.ne.b64 %p2, %rd33, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd63, %rd1, %rd24; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r19, %rd24; + div.u32 %r21, %r61, %r19; + cvt.u64.u32 %rd63, %r21; +$L__BB0_6: + .loc 1 0 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:19 + ld.param.b64 %rd22, [triton_red_fused_zeros_0_param_2]; + and.b32 %r3, %r2, 126; +$L__tmp1: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p3, %rd7, 0; + .loc 2 75 25 // triton_helpers.py:75:25 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s32 %p4, %r1, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p5, %rd23, 0; + .loc 2 75 32 // triton_helpers.py:75:32 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + xor.pred %p6, %p4, %p5; +$L__tmp2: + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p7, %rd23, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p8, %rd23, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd35, %rd23, 0, %p8; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd36, 1, 0, %p7; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd37, %rd35, %rd36; + shl.b64 %rd38, %rd8, 8; + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + and.pred %p9, %p3, %p6; + selp.b64 %rd39, -1, 0, %p9; + add.s64 %rd40, %rd62, %rd39; + mul.lo.s64 %rd41, %rd40, %rd37; + shl.b64 %rd42, %rd41, 8; + add.s32 %r23, %r1, %r4; + mad.wide.s32 %rd43, %r23, 256, %rd42; + and.b32 %r24, %r2, 1; + mul.wide.u32 %rd44, %r24, 8; + or.b64 %rd45, %rd43, %rd44; + shl.b64 %rd46, %rd6, 8; + sub.s64 %rd47, %rd45, %rd46; + add.s64 %rd65, %rd21, %rd47; + mul.lo.s64 %rd48, %rd63, %rd23; + shl.b64 %rd49, %rd48, 13; + mad.wide.s32 %rd50, %r23, 8192, %rd49; + add.s64 %rd51, %rd50, %rd38; + or.b64 %rd52, %rd51, %rd44; + shl.b64 %rd53, %rd6, 13; + sub.s64 %rd54, %rd52, %rd53; + add.s64 %rd64, %rd20, %rd54; + mov.b32 %r62, 0f00000000; + mov.b64 %rd66, -8; + setp.lt.s32 %p10, %r61, %r13; + mov.b32 %r63, %r62; + mov.b32 %r64, %r62; + mov.b32 %r65, %r62; +$L__BB0_7: // =>This Inner Loop Header: Depth=1 + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd55, 1.0; + // end inline asm + mov.b32 %r27, 0; + // begin inline asm + mov.u32 %r25, %r27; + mov.u32 %r26, %r27; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r25, %r26 }, [ %rd64 + 0 ], %rd55; + // end inline asm + mov.b32 {%rs1, %rs2}, %r25; + mov.b32 {%rs3, %rs4}, %r26; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + cvt.f32.bf16 %r34, %rs1; + cvt.f32.bf16 %r35, %rs2; + cvt.f32.bf16 %r36, %rs3; + cvt.f32.bf16 %r37, %rs4; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd58, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd58, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, %r27; + mov.u32 %r30, %r27; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r29, %r30 }, [ %rd65 + 0 ], %rd58; + // end inline asm + mov.b32 {%rs5, %rs6}, %r29; + mov.b32 {%rs7, %rs8}, %r30; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + cvt.f32.bf16 %r38, %rs5; + cvt.f32.bf16 %r39, %rs6; + cvt.f32.bf16 %r40, %rs7; + cvt.f32.bf16 %r41, %rs8; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r42, %r34, %r38, %r62; + fma.rn.f32 %r43, %r35, %r39, %r63; + fma.rn.f32 %r44, %r36, %r40, %r64; + fma.rn.f32 %r45, %r37, %r41, %r65; + .loc 1 44 48 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:44:48 + selp.f32 %r62, %r42, %r62, %p10; + selp.f32 %r63, %r43, %r63, %p10; + selp.f32 %r64, %r44, %r64, %p10; + selp.f32 %r65, %r45, %r65, %p10; + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + add.s64 %rd66, %rd66, 8; + add.s64 %rd65, %rd65, 16; + add.s64 %rd64, %rd64, 16; + setp.lt.u64 %p12, %rd66, 120; + @%p12 bra $L__BB0_7; +// %bb.8: + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + and.b32 %r47, %r2, 63; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r48, %r1, %r47; + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p14, %r48, %r13; +$L__tmp3: + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r49, %r62, %r63; + add.f32 %r50, %r64, %r49; + add.f32 %r51, %r65, %r50; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r53, %r51, %r52; +$L__tmp4: + .loc 1 45 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:28 + shl.b32 %r54, %r3, 1; + mov.b32 %r55, global_smem; + add.s32 %r56, %r55, %r54; + st.shared.b32 [%r56], %r53; + bar.sync 0; + shl.b32 %r57, %r47, 2; + add.s32 %r58, %r55, %r57; + ld.shared.b32 %r46, [%r58]; + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + mad.wide.s32 %rd61, %r48, 4, %rd22; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r59, %r2, 64; + setp.eq.b32 %p15, %r59, 0; + and.pred %p13, %p15, %p14; + // begin inline asm + @%p13 st.global.b32 [ %rd61 + 0 ], { %r46 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..131b4fe7e9b045c32f434ecba9de4979a7e94f9d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.source @@ -0,0 +1,325 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 64 : i32 loc(#loc86) + %xoffset_2 = arith.constant 64 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<64x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<64x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%xindex_6, %ks0) : (tensor<64x1xi32>, i64) -> tensor<64x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c8_i32 = arith.constant 8 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c8_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x8xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x8xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x8xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<64x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc103) + %tmp0_31 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc103) + %tmp0_32 = arith.addi %tmp0_30, %tmp0_31 : tensor<64x8xi64> loc(#loc103) + %tmp0_33 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_34 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_35 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc104) + %tmp0_36 = arith.muli %tmp0_35, %x0_10 : tensor<64x1xi64> loc(#loc104) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc105) + %tmp0_38 = arith.addi %tmp0_32, %tmp0_37 : tensor<64x8xi64> loc(#loc105) + %tmp0_39 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_40 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_41 = arith.muli %tmp0_40, %ks0 : i64 loc(#loc106) + %tmp0_42 = tt.splat %tmp0_41 : i64 -> tensor<64x1xi64> loc(#loc107) + %tmp0_43 = arith.muli %tmp0_42, %x2_18 : tensor<64x1xi64> loc(#loc107) + %tmp0_44 = tt.broadcast %tmp0_43 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc108) + %tmp0_45 = arith.addi %tmp0_38, %tmp0_44 : tensor<64x8xi64> loc(#loc108) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc109) + %tmp0_47 = tt.addptr %tmp0_46, %tmp0_45 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc109) + %tmp0_48 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc110) + %tmp0_49 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc110) + %tmp0_50 = arith.andi %tmp0_48, %tmp0_49 : tensor<64x8xi1> loc(#loc110) + %tmp0_51 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc111) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc111) + %tmp0_54 = tt.load %tmp0_47, %tmp0_50, %tmp0_53 evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc111) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_56 = arith.constant 128 : i64 loc(#loc113) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc113) + %tmp1_58 = arith.muli %tmp1_57, %x0_10 : tensor<64x1xi64> loc(#loc113) + %tmp1_59 = arith.extsi %r0_index_24 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc114) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc114) + %tmp1_61 = tt.broadcast %tmp1_58 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc114) + %tmp1_62 = arith.addi %tmp1_60, %tmp1_61 : tensor<64x8xi64> loc(#loc114) + %tmp1_63 = arith.constant 128 : i32 loc(#loc115) + %tmp1_64 = arith.constant 128 : i64 loc(#loc115) + %tmp1_65 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc115) + %tmp1_66 = arith.muli %tmp1_65, %x5 : tensor<64x1xi64> loc(#loc115) + %tmp1_67 = arith.constant 1 : i32 loc(#loc116) + %tmp1_68 = arith.extsi %tmp1_67 : i32 to i64 loc(#loc116) + %tmp1_69 = arith.cmpi sge, %tmp1_68, %ks0 : i64 loc(#loc116) + %tmp1_70 = arith.constant 1 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc117) + %tmp1_72 = arith.extui %tmp1_69 : i1 to i32 loc(#loc117) + %tmp1_73 = arith.muli %tmp1_71, %tmp1_72 : i32 loc(#loc117) + %tmp1_74 = arith.constant 1 : i32 loc(#loc118) + %tmp1_75 = arith.extsi %tmp1_74 : i32 to i64 loc(#loc118) + %tmp1_76 = arith.cmpi sgt, %ks0, %tmp1_75 : i64 loc(#loc118) + %tmp1_77 = arith.extui %tmp1_76 : i1 to i64 loc(#loc119) + %tmp1_78 = arith.muli %ks0, %tmp1_77 : i64 loc(#loc119) + %tmp1_79 = arith.extsi %tmp1_73 : i32 to i64 loc(#loc120) + %tmp1_80 = arith.addi %tmp1_79, %tmp1_78 : i64 loc(#loc120) + %tmp1_81 = tt.splat %tmp1_80 : i64 -> tensor<64x1xi64> loc(#loc121) + %tmp1_82 = arith.muli %tmp1_66, %tmp1_81 : tensor<64x1xi64> loc(#loc121) + %tmp1_83 = tt.broadcast %tmp1_82 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc122) + %tmp1_84 = arith.addi %tmp1_62, %tmp1_83 : tensor<64x8xi64> loc(#loc122) + %tmp1_85 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc123) + %tmp1_86 = tt.addptr %tmp1_85, %tmp1_84 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc123) + %tmp1_87 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc124) + %tmp1_88 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc124) + %tmp1_89 = arith.andi %tmp1_87, %tmp1_88 : tensor<64x8xi1> loc(#loc124) + %tmp1_90 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_91 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc125) + %tmp1_92 = arith.truncf %tmp1_91 : tensor<64x8xf32> to tensor<64x8xbf16> loc(#loc125) + %tmp1_93 = tt.load %tmp1_86, %tmp1_89, %tmp1_92 evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc125) + %tmp1_94 = arith.extf %tmp1_93 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_55, %tmp1_94 : tensor<64x8xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x8xf32> loc(#loc128) + %_tmp4_95 = tt.broadcast %r0_mask_25 : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc129) + %_tmp4_96 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc129) + %_tmp4_97 = arith.andi %_tmp4_95, %_tmp4_96 : tensor<64x8xi1> loc(#loc129) + %_tmp4_98 = arith.select %_tmp4_97, %tmp5, %_tmp4_23 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc130) + scf.yield %_tmp4_98 : tensor<64x8xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<64x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%a: tensor<64x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<64x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<64x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<64x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<64x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<64x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<64x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<64x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<64x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<64x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<64x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<64x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc65) + tt.return %5 : tensor<64x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x1xi64> loc(#loc67) + tt.return %6 : tensor<64x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S64_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x8xf32> loc("input"(#loc68))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc69) + tt.return %0 : tensor<64xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc72) + tt.return %1 : tensor<64xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..898e2735f94f8ba10f214a9e09a47a8bad1ca684 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,233 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("in_ptr1"(#loc)) +#loc60 = loc("out_ptr1"(#loc)) +#loc61 = loc("ks0"(#loc)) +#loc62 = loc("ks1"(#loc)) +#loc63 = loc("xnumel"(#loc)) +#loc64 = loc("r0_numel"(#loc)) +#loc109 = loc("tmp4"(#loc52)) +#loc120 = loc(callsite(#loc1 at #loc109)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x8xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<64x1xi64, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc65) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc66) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc67) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc67) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc67) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc67) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc68) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc68) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc69) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<64x1xi32, #blocked1> loc(#loc69) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi32, #blocked> loc(#loc70) + %x0 = arith.extsi %xindex_14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc71) + %x0_20 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc71) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc71) + %x1 = arith.divsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc72) + %x1_22 = arith.remsi %x1, %cst_5 : tensor<64x1xi64, #blocked> loc(#loc73) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc74) + %x2_23 = arith.divsi %x0, %x2 : tensor<64x1xi64, #blocked> loc(#loc74) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<64x1xi64, #blocked> loc(#loc111) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<64x1xi64, #blocked> loc(#loc112) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc113) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc114) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc115) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<64x1xi1, #blocked> loc(#loc116) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<64x1xi1, #blocked> loc(#loc116) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc117) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc79) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc80) + %tmp0_31 = arith.muli %x0_21, %cst_1 : tensor<64x1xi64, #blocked> loc(#loc81) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc82) + %tmp0_33 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc83) + %tmp0_34 = tt.splat %tmp0_33 : i64 -> tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_35 = arith.muli %tmp0_34, %x2_23 : tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc85) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc86) + %tmp0_38 = tt.broadcast %xmask_17 : tensor<64x1xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc88) + %tmp1_39 = tt.broadcast %tmp1 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc89) + %tmp1_40 = arith.muli %x5_29, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc90) + %tmp1_41 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_42 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_43 = arith.extui %tmp1_42 : i1 to i64 loc(#loc93) + %tmp1_44 = arith.muli %ks0, %tmp1_43 : i64 loc(#loc93) + %tmp1_45 = arith.extui %tmp1_41 : i1 to i64 loc(#loc118) + %tmp1_46 = arith.addi %tmp1_45, %tmp1_44 : i64 loc(#loc94) + %tmp1_47 = tt.splat %tmp1_46 : i64 -> tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_48 = arith.muli %tmp1_40, %tmp1_47 : tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_49 = tt.broadcast %tmp1_48 : tensor<64x1xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc97) + %tmp1_50 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr, #blocked> loc(#loc98) + %_tmp4 = scf.for %_tmp4_53 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg8 = %cst_7) -> (tensor<64x8xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_53 : i32 -> tensor<1x8xi32, #blocked> loc(#loc100) + %r0_index_54 = arith.addi %r0_index, %r0_base_19 : tensor<1x8xi32, #blocked> loc(#loc100) + %r0_mask = arith.cmpi slt, %r0_index_54, %cst : tensor<1x8xi32, #blocked> loc(#loc101) + %tmp0_55 = arith.extsi %r0_index_54 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked> loc(#loc80) + %tmp0_56 = tt.broadcast %tmp0_55 : tensor<1x8xi64, #blocked> -> tensor<64x8xi64, #blocked> loc(#loc80) + %tmp0_57 = arith.addi %tmp0_56, %tmp0_30 : tensor<64x8xi64, #blocked> loc(#loc80) + %tmp0_58 = arith.addi %tmp0_57, %tmp0_32 : tensor<64x8xi64, #blocked> loc(#loc82) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_36 : tensor<64x8xi64, #blocked> loc(#loc85) + %tmp0_60 = tt.addptr %tmp0_37, %tmp0_59 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> loc(#loc86) + %tmp0_61 = tt.broadcast %r0_mask : tensor<1x8xi1, #blocked> -> tensor<64x8xi1, #blocked> loc(#loc87) + %tmp0_62 = arith.andi %tmp0_61, %tmp0_38 : tensor<64x8xi1, #blocked> loc(#loc87) + %tmp0_63 = tt.load %tmp0_60, %tmp0_62, %cst_6 evictionPolicy = evict_first : tensor<64x8x!tt.ptr, #blocked> loc(#loc102) + %tmp0_64 = arith.extf %tmp0_63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc103) + %tmp1_65 = arith.addi %tmp0_56, %tmp1_39 : tensor<64x8xi64, #blocked> loc(#loc89) + %tmp1_66 = arith.addi %tmp1_65, %tmp1_49 : tensor<64x8xi64, #blocked> loc(#loc97) + %tmp1_67 = tt.addptr %tmp1_50, %tmp1_66 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi64, #blocked> loc(#loc98) + %tmp1_68 = tt.load %tmp1_67, %tmp0_62, %cst_6 evictionPolicy = evict_first : tensor<64x8x!tt.ptr, #blocked> loc(#loc104) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> loc(#loc105) + %tmp2 = arith.mulf %tmp0_64, %tmp1_69 : tensor<64x8xf32, #blocked> loc(#loc106) + %tmp5 = arith.addf %arg8, %tmp2 : tensor<64x8xf32, #blocked> loc(#loc107) + %_tmp4_70 = arith.select %tmp0_62, %tmp5, %arg8 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> loc(#loc108) + scf.yield %_tmp4_70 : tensor<64x8xf32, #blocked> loc(#loc50) + } loc(#loc99) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_53: f32 loc(callsite(#loc1 at #loc109)), %tmp4_54: f32 loc(callsite(#loc1 at #loc109))): + %tmp4_55 = arith.addf %tmp4_53, %tmp4_54 : f32 loc(#loc121) + tt.reduce.return %tmp4_55 : f32 loc(#loc119) + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc119) + %tmp4_51 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc110) + %tmp4_52 = tt.expand_dims %tmp4_51 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc110) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc55) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc55) + tt.store %1, %tmp4_52, %xmask_18 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc65 = loc("xoffset"(#loc2)) +#loc66 = loc("xoffset"(#loc3)) +#loc67 = loc("xindex"(#loc4)) +#loc68 = loc("xindex"(#loc5)) +#loc69 = loc("xmask"(#loc6)) +#loc70 = loc("r0_base"(#loc7)) +#loc71 = loc("x0"(#loc8)) +#loc72 = loc("x1"(#loc9)) +#loc73 = loc("x1"(#loc10)) +#loc74 = loc("x2"(#loc11)) +#loc75 = loc("fixed"(#loc12)) +#loc76 = loc("x5"(#loc13)) +#loc77 = loc("fixed"(#loc14)) +#loc78 = loc("fixed"(#loc15)) +#loc79 = loc("tmp0"(#loc20)) +#loc80 = loc("tmp0"(#loc21)) +#loc81 = loc("tmp0"(#loc22)) +#loc82 = loc("tmp0"(#loc23)) +#loc83 = loc("tmp0"(#loc24)) +#loc84 = loc("tmp0"(#loc25)) +#loc85 = loc("tmp0"(#loc26)) +#loc86 = loc("tmp0"(#loc27)) +#loc87 = loc("tmp0"(#loc28)) +#loc88 = loc("tmp1"(#loc29)) +#loc89 = loc("tmp1"(#loc30)) +#loc90 = loc("tmp1"(#loc31)) +#loc91 = loc("tmp1"(#loc32)) +#loc92 = loc("tmp1"(#loc33)) +#loc93 = loc("tmp1"(#loc34)) +#loc94 = loc("tmp1"(#loc35)) +#loc95 = loc("tmp1"(#loc36)) +#loc96 = loc("tmp1"(#loc37)) +#loc97 = loc("tmp1"(#loc38)) +#loc98 = loc("tmp1"(#loc39)) +#loc99 = loc("_tmp4"(#loc40)) +#loc100 = loc("r0_index"(#loc41)) +#loc101 = loc("r0_mask"(#loc42)) +#loc102 = loc("tmp0"(#loc43)) +#loc103 = loc("tmp0"(#loc44)) +#loc104 = loc("tmp1"(#loc45)) +#loc105 = loc("tmp1"(#loc46)) +#loc106 = loc("tmp2"(#loc47)) +#loc107 = loc("tmp5"(#loc48)) +#loc108 = loc("_tmp4"(#loc49)) +#loc110 = loc("tmp4"(#loc54)) +#loc111 = loc(callsite(#loc75 at #loc76)) +#loc112 = loc(callsite(#loc77 at #loc76)) +#loc113 = loc(callsite(#loc78 at #loc76)) +#loc114 = loc(callsite(#loc16 at #loc76)) +#loc115 = loc(callsite(#loc17 at #loc76)) +#loc116 = loc(callsite(#loc18 at #loc76)) +#loc117 = loc(callsite(#loc19 at #loc76)) +#loc118 = loc(fused[#loc94, #loc95]) +#loc119 = loc(callsite(#loc51 at #loc109)) +#loc121 = loc(callsite(#loc53 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..cf72c2cbf7a78b9ed303e989d8bd2ed677622bba --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ADCZ7SME2JTK2CEPVRJ7J4XLDGQGUMK5VNKNGD5DTWMOCOTS4J2Q/triton_red_fused_zeros_0.ttir @@ -0,0 +1,228 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc6 = loc(unknown) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc60 = loc("in_ptr0"(#loc)) +#loc61 = loc("in_ptr1"(#loc)) +#loc62 = loc("out_ptr1"(#loc)) +#loc63 = loc("ks0"(#loc)) +#loc64 = loc("ks1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +#loc66 = loc("r0_numel"(#loc)) +#loc113 = loc("tmp4"(#loc54)) +#loc124 = loc(callsite(#loc6 at #loc113)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %fixed = arith.constant dense<1> : tensor<64x1xi64> loc(#loc115) + %x5 = arith.constant dense<0> : tensor<64x1xi32> loc(#loc116) + %fixed_0 = arith.constant dense<0> : tensor<64x1xi64> loc(#loc117) + %x5_1 = arith.constant 0 : i64 loc(#loc118) + %c1_i64 = arith.constant 1 : i64 loc(#loc6) + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> loc(#loc6) + %c8_i32 = arith.constant 8 : i32 loc(#loc7) + %c128_i32 = arith.constant 128 : i32 loc(#loc7) + %c0_i32 = arith.constant 0 : i32 loc(#loc7) + %cst_2 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc6) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc6) + %cst_3 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc6) + %cst_4 = arith.constant dense<128> : tensor<1x8xi32> loc(#loc6) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> loc(#loc6) + %x1 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc70) + %c64_i32 = arith.constant 64 : i32 loc(#loc6) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc72) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc73) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc74) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc75) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc75) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc76) + %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<64x1xi32> loc(#loc76) + %r0_base = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc77) + %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<8xi32> -> tensor<1x8xi32> loc(#loc78) + %x0 = arith.extsi %xindex_9 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc79) + %x0_12 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc79) + %x0_13 = arith.remsi %x0, %x0_12 : tensor<64x1xi64> loc(#loc79) + %x1_14 = arith.divsi %x0, %x0_12 : tensor<64x1xi64> loc(#loc80) + %x1_15 = arith.remsi %x1_14, %x1 : tensor<64x1xi64> loc(#loc70) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc81) + %x2_16 = arith.divsi %x0, %x2 : tensor<64x1xi64> loc(#loc81) + %fixed_17 = arith.cmpi ne, %x0_13, %fixed_0 : tensor<64x1xi64> loc(#loc117) + %fixed_18 = arith.subi %x1_14, %fixed : tensor<64x1xi64> loc(#loc115) + %fixed_19 = arith.select %fixed_17, %fixed_18, %x1_14 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc119) + %x5_20 = arith.cmpi slt, %xindex_9, %x5 : tensor<64x1xi32> loc(#loc116) + %x5_21 = arith.cmpi slt, %ks0, %x5_1 : i64 loc(#loc118) + %x5_22 = tt.splat %x5_21 : i1 -> tensor<64x1xi1> loc(#loc120) + %x5_23 = arith.cmpi ne, %x5_20, %x5_22 : tensor<64x1xi1> loc(#loc120) + %x5_24 = arith.select %x5_23, %fixed_19, %x1_14 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc121) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%_tmp4_26 = %cst_5) -> (tensor<64x8xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x8xi32> loc(#loc84) + %r0_index_27 = arith.addi %r0_index, %r0_base_11 : tensor<1x8xi32> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_27, %cst_4 : tensor<1x8xi32> loc(#loc85) + %tmp0 = arith.muli %x1_15, %cst_3 : tensor<64x1xi64> loc(#loc86) + %tmp0_28 = arith.extsi %r0_index_27 : tensor<1x8xi32> to tensor<1x8xi64> loc(#loc87) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<1x8xi64> -> tensor<64x8xi64> loc(#loc87) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc87) + %tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<64x8xi64> loc(#loc87) + %tmp0_32 = arith.muli %x0_13, %cst_2 : tensor<64x1xi64> loc(#loc88) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc89) + %tmp0_34 = arith.addi %tmp0_31, %tmp0_33 : tensor<64x8xi64> loc(#loc89) + %tmp0_35 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc90) + %tmp0_36 = tt.splat %tmp0_35 : i64 -> tensor<64x1xi64> loc(#loc91) + %tmp0_37 = arith.muli %tmp0_36, %x2_16 : tensor<64x1xi64> loc(#loc91) + %tmp0_38 = tt.broadcast %tmp0_37 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc92) + %tmp0_39 = arith.addi %tmp0_34, %tmp0_38 : tensor<64x8xi64> loc(#loc92) + %tmp0_40 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc93) + %tmp0_41 = tt.addptr %tmp0_40, %tmp0_39 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc93) + %tmp0_42 = tt.broadcast %r0_mask : tensor<1x8xi1> -> tensor<64x8xi1> loc(#loc94) + %tmp0_43 = tt.broadcast %xmask_10 : tensor<64x1xi1> -> tensor<64x8xi1> loc(#loc94) + %tmp0_44 = arith.andi %tmp0_42, %tmp0_43 : tensor<64x8xi1> loc(#loc94) + %tmp0_45 = tt.load %tmp0_41, %tmp0_44, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc95) + %tmp0_46 = arith.extf %tmp0_45 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc96) + %tmp1 = arith.muli %x0_13, %cst_3 : tensor<64x1xi64> loc(#loc97) + %tmp1_47 = tt.broadcast %tmp1 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc98) + %tmp1_48 = arith.addi %tmp0_29, %tmp1_47 : tensor<64x8xi64> loc(#loc98) + %tmp1_49 = arith.muli %x5_24, %cst_3 : tensor<64x1xi64> loc(#loc99) + %tmp1_50 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc100) + %tmp1_51 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc101) + %tmp1_52 = arith.extui %tmp1_51 : i1 to i64 loc(#loc102) + %tmp1_53 = arith.muli %ks0, %tmp1_52 : i64 loc(#loc102) + %tmp1_54 = arith.extui %tmp1_50 : i1 to i64 loc(#loc122) + %tmp1_55 = arith.addi %tmp1_54, %tmp1_53 : i64 loc(#loc103) + %tmp1_56 = tt.splat %tmp1_55 : i64 -> tensor<64x1xi64> loc(#loc105) + %tmp1_57 = arith.muli %tmp1_49, %tmp1_56 : tensor<64x1xi64> loc(#loc105) + %tmp1_58 = tt.broadcast %tmp1_57 : tensor<64x1xi64> -> tensor<64x8xi64> loc(#loc106) + %tmp1_59 = arith.addi %tmp1_48, %tmp1_58 : tensor<64x8xi64> loc(#loc106) + %tmp1_60 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x8x!tt.ptr> loc(#loc107) + %tmp1_61 = tt.addptr %tmp1_60, %tmp1_59 : tensor<64x8x!tt.ptr>, tensor<64x8xi64> loc(#loc107) + %tmp1_62 = tt.load %tmp1_61, %tmp0_44, %cst evictionPolicy = evict_first : tensor<64x8x!tt.ptr> loc(#loc108) + %tmp1_63 = arith.extf %tmp1_62 : tensor<64x8xbf16> to tensor<64x8xf32> loc(#loc109) + %tmp2 = arith.mulf %tmp0_46, %tmp1_63 : tensor<64x8xf32> loc(#loc110) + %tmp5 = arith.addf %_tmp4_26, %tmp2 : tensor<64x8xf32> loc(#loc111) + %_tmp4_64 = arith.select %tmp0_44, %tmp5, %_tmp4_26 : tensor<64x8xi1>, tensor<64x8xf32> loc(#loc112) + scf.yield %_tmp4_64 : tensor<64x8xf32> loc(#loc52) + } loc(#loc83) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_26: f32 loc(callsite(#loc6 at #loc113)), %tmp4_27: f32 loc(callsite(#loc6 at #loc113))): + %tmp4_28 = arith.addf %tmp4_26, %tmp4_27 : f32 loc(#loc125) + tt.reduce.return %tmp4_28 : f32 loc(#loc123) + }) : (tensor<64x8xf32>) -> tensor<64xf32> loc(#loc123) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc114) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc57) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc57) + tt.store %1, %tmp4_25, %xmask_10 : tensor<64x1x!tt.ptr> loc(#loc58) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc4 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc67 = loc("fixed"(#loc1)) +#loc68 = loc("x5"(#loc2)) +#loc69 = loc("fixed"(#loc4)) +#loc70 = loc("x1"(#loc8)) +#loc71 = loc("xoffset"(#loc9)) +#loc72 = loc("xoffset"(#loc10)) +#loc73 = loc("xindex"(#loc11)) +#loc74 = loc("xindex"(#loc12)) +#loc75 = loc("xindex"(#loc13)) +#loc76 = loc("xmask"(#loc14)) +#loc77 = loc("r0_base"(#loc15)) +#loc78 = loc("r0_base"(#loc16)) +#loc79 = loc("x0"(#loc17)) +#loc80 = loc("x1"(#loc18)) +#loc81 = loc("x2"(#loc19)) +#loc82 = loc("fixed"(#loc20)) +#loc83 = loc("_tmp4"(#loc7)) +#loc84 = loc("r0_index"(#loc23)) +#loc85 = loc("r0_mask"(#loc24)) +#loc86 = loc("tmp0"(#loc25)) +#loc87 = loc("tmp0"(#loc26)) +#loc88 = loc("tmp0"(#loc27)) +#loc89 = loc("tmp0"(#loc28)) +#loc90 = loc("tmp0"(#loc29)) +#loc91 = loc("tmp0"(#loc30)) +#loc92 = loc("tmp0"(#loc31)) +#loc93 = loc("tmp0"(#loc32)) +#loc94 = loc("tmp0"(#loc33)) +#loc95 = loc("tmp0"(#loc34)) +#loc96 = loc("tmp0"(#loc35)) +#loc97 = loc("tmp1"(#loc36)) +#loc98 = loc("tmp1"(#loc37)) +#loc99 = loc("tmp1"(#loc38)) +#loc100 = loc("tmp1"(#loc39)) +#loc101 = loc("tmp1"(#loc40)) +#loc102 = loc("tmp1"(#loc41)) +#loc103 = loc("tmp1"(#loc42)) +#loc104 = loc("tmp1"(#loc43)) +#loc105 = loc("tmp1"(#loc44)) +#loc106 = loc("tmp1"(#loc45)) +#loc107 = loc("tmp1"(#loc46)) +#loc108 = loc("tmp1"(#loc47)) +#loc109 = loc("tmp1"(#loc48)) +#loc110 = loc("tmp2"(#loc49)) +#loc111 = loc("tmp5"(#loc50)) +#loc112 = loc("_tmp4"(#loc51)) +#loc114 = loc("tmp4"(#loc56)) +#loc115 = loc(callsite(#loc67 at #loc68)) +#loc116 = loc(callsite(#loc3 at #loc68)) +#loc117 = loc(callsite(#loc69 at #loc68)) +#loc118 = loc(callsite(#loc5 at #loc68)) +#loc119 = loc(callsite(#loc82 at #loc68)) +#loc120 = loc(callsite(#loc21 at #loc68)) +#loc121 = loc(callsite(#loc22 at #loc68)) +#loc122 = loc(fused[#loc103, #loc104]) +#loc123 = loc(callsite(#loc53 at #loc113)) +#loc125 = loc(callsite(#loc55 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/__grp__triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/__grp__triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d34986b56f7e53165b9eb17229d9acb5fdb202b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/__grp__triton_tem_fused_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..049009a20950a894e08b1a6024b934210895dd03 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.json @@ -0,0 +1 @@ +{"hash": "054ebb534471be1ce9a839a3e3970cc4956fbfc8199c32310b787c15a9ff453e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..984ff255b77226ec24d70dab496226d14e5818ea --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.llir @@ -0,0 +1,5227 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !5 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %17 = and i32 %15, 7, !dbg !11 + %18 = shl i32 %15, 23, !dbg !12 + %19 = shl nuw nsw i32 %16, 7, !dbg !13 + %20 = or disjoint i32 %18, %19, !dbg !14 + %21 = shl nuw nsw i32 %17, 21, !dbg !15 + %22 = shl nuw i32 %16, 16, !dbg !16 + %23 = and i32 %22, -262144, !dbg !16 + %24 = add i32 %21, %23, !dbg !17 + %25 = sext i32 %20 to i64, !dbg !18 + %26 = getelementptr bfloat, ptr addrspace(1) %0, i64 %25, !dbg !18 + %27 = sext i32 %24 to i64, !dbg !19 + %28 = getelementptr bfloat, ptr addrspace(1) %1, i64 %27, !dbg !19 + %29 = getelementptr bfloat, ptr addrspace(1) %2, i64 %27, !dbg !20 + %30 = shl nuw nsw i32 %17, 4, !dbg !21 + %31 = add nuw i32 %30, %14, !dbg !22 + %32 = shl nuw nsw i32 %17, 8, !dbg !23 + %33 = shl i32 %14, 4, !dbg !24 + %34 = add i32 %32, %33, !dbg !25 + %35 = shl i32 %14, 7, !dbg !26 + %36 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !27 + %37 = lshr i32 %36, 5, !dbg !27 + %38 = and i32 %36, 240, !dbg !27 + %39 = lshr exact i32 %38, 4, !dbg !27 + %40 = or disjoint i32 %39, 16, !dbg !27 + %41 = or disjoint i32 %39, 32, !dbg !27 + %42 = or disjoint i32 %39, 48, !dbg !27 + %43 = lshr i32 %36, 1, !dbg !27 + %44 = and i32 %43, 112, !dbg !27 + %45 = lshr i32 %36, 2, !dbg !27 + %46 = and i32 %45, 7, !dbg !27 + %47 = or disjoint i32 %44, %46, !dbg !27 + %48 = or disjoint i32 %39, %35, !dbg !28 + %49 = or disjoint i32 %40, %35, !dbg !28 + %50 = or disjoint i32 %41, %35, !dbg !28 + %51 = or disjoint i32 %42, %35, !dbg !28 + %52 = or disjoint i32 %48, 64, !dbg !28 + %53 = or disjoint i32 %48, 80, !dbg !28 + %54 = or disjoint i32 %48, 96, !dbg !28 + %55 = or disjoint i32 %48, 112, !dbg !28 + %56 = or disjoint i32 %47, %35, !dbg !28 + %57 = or disjoint i32 %56, 8, !dbg !28 + %58 = shl i32 %48, 12, !dbg !29 + %59 = shl i32 %49, 12, !dbg !29 + %60 = shl i32 %50, 12, !dbg !29 + %61 = shl i32 %51, 12, !dbg !29 + %62 = shl i32 %52, 12, !dbg !29 + %63 = shl i32 %53, 12, !dbg !29 + %64 = shl i32 %54, 12, !dbg !29 + %65 = shl i32 %55, 12, !dbg !29 + %66 = sext i32 %58 to i64, !dbg !32 + %67 = getelementptr bfloat, ptr addrspace(1) %26, i64 %66, !dbg !32 + %68 = sext i32 %59 to i64, !dbg !32 + %69 = getelementptr bfloat, ptr addrspace(1) %26, i64 %68, !dbg !32 + %70 = sext i32 %60 to i64, !dbg !32 + %71 = getelementptr bfloat, ptr addrspace(1) %26, i64 %70, !dbg !32 + %72 = sext i32 %61 to i64, !dbg !32 + %73 = getelementptr bfloat, ptr addrspace(1) %26, i64 %72, !dbg !32 + %74 = sext i32 %62 to i64, !dbg !32 + %75 = getelementptr bfloat, ptr addrspace(1) %26, i64 %74, !dbg !32 + %76 = sext i32 %63 to i64, !dbg !32 + %77 = getelementptr bfloat, ptr addrspace(1) %26, i64 %76, !dbg !32 + %78 = sext i32 %64 to i64, !dbg !32 + %79 = getelementptr bfloat, ptr addrspace(1) %26, i64 %78, !dbg !32 + %80 = sext i32 %65 to i64, !dbg !32 + %81 = getelementptr bfloat, ptr addrspace(1) %26, i64 %80, !dbg !32 + %82 = shl nuw nsw i32 %36, 3, !dbg !33 + %83 = and i32 %82, 120, !dbg !33 + %84 = zext nneg i32 %83 to i64, !dbg !34 + %85 = getelementptr bfloat, ptr addrspace(1) %67, i64 %84, !dbg !34 + %86 = getelementptr bfloat, ptr addrspace(1) %69, i64 %84, !dbg !34 + %87 = getelementptr bfloat, ptr addrspace(1) %71, i64 %84, !dbg !34 + %88 = getelementptr bfloat, ptr addrspace(1) %73, i64 %84, !dbg !34 + %89 = getelementptr bfloat, ptr addrspace(1) %75, i64 %84, !dbg !34 + %90 = getelementptr bfloat, ptr addrspace(1) %77, i64 %84, !dbg !34 + %91 = getelementptr bfloat, ptr addrspace(1) %79, i64 %84, !dbg !34 + %92 = getelementptr bfloat, ptr addrspace(1) %81, i64 %84, !dbg !34 + %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %85) #2, !dbg !35 + %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !35 + %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !35 + %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !35 + %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !35 + %98 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %86) #2, !dbg !35 + %99 = extractvalue { i32, i32, i32, i32 } %98, 0, !dbg !35 + %100 = extractvalue { i32, i32, i32, i32 } %98, 1, !dbg !35 + %101 = extractvalue { i32, i32, i32, i32 } %98, 2, !dbg !35 + %102 = extractvalue { i32, i32, i32, i32 } %98, 3, !dbg !35 + %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %87) #2, !dbg !35 + %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !35 + %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !35 + %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !35 + %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !35 + %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %88) #2, !dbg !35 + %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !35 + %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !35 + %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !35 + %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !35 + %113 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %89) #2, !dbg !35 + %114 = extractvalue { i32, i32, i32, i32 } %113, 0, !dbg !35 + %115 = extractvalue { i32, i32, i32, i32 } %113, 1, !dbg !35 + %116 = extractvalue { i32, i32, i32, i32 } %113, 2, !dbg !35 + %117 = extractvalue { i32, i32, i32, i32 } %113, 3, !dbg !35 + %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %90) #2, !dbg !35 + %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !35 + %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !35 + %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !35 + %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !35 + %123 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %91) #2, !dbg !35 + %124 = extractvalue { i32, i32, i32, i32 } %123, 0, !dbg !35 + %125 = extractvalue { i32, i32, i32, i32 } %123, 1, !dbg !35 + %126 = extractvalue { i32, i32, i32, i32 } %123, 2, !dbg !35 + %127 = extractvalue { i32, i32, i32, i32 } %123, 3, !dbg !35 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %92) #2, !dbg !35 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !35 + %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !35 + %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !35 + %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !35 + %133 = and i32 %36, 7, !dbg !35 + %134 = shl nuw nsw i32 %133, 4, !dbg !35 + %135 = shl nuw nsw i32 %38, 3, !dbg !35 + %136 = and i32 %36, 112, !dbg !35 + %137 = and i32 %36, 8, !dbg !35 + %138 = shl nuw nsw i32 %137, 11, !dbg !35 + %139 = or disjoint i32 %134, %135, !dbg !35 + %140 = xor i32 %139, %136, !dbg !35 + %141 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %138, !dbg !35 + %142 = getelementptr inbounds nuw i8, ptr addrspace(3) %141, i32 %140, !dbg !35 + %143 = insertelement <4 x i32> poison, i32 %94, i64 0, !dbg !35 + %144 = insertelement <4 x i32> %143, i32 %95, i64 1, !dbg !35 + %145 = insertelement <4 x i32> %144, i32 %96, i64 2, !dbg !35 + %146 = insertelement <4 x i32> %145, i32 %97, i64 3, !dbg !35 + store <4 x i32> %146, ptr addrspace(3) %142, align 16, !dbg !35 + %147 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 2048, !dbg !35 + %148 = insertelement <4 x i32> poison, i32 %99, i64 0, !dbg !35 + %149 = insertelement <4 x i32> %148, i32 %100, i64 1, !dbg !35 + %150 = insertelement <4 x i32> %149, i32 %101, i64 2, !dbg !35 + %151 = insertelement <4 x i32> %150, i32 %102, i64 3, !dbg !35 + store <4 x i32> %151, ptr addrspace(3) %147, align 16, !dbg !35 + %152 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 4096, !dbg !35 + %153 = insertelement <4 x i32> poison, i32 %104, i64 0, !dbg !35 + %154 = insertelement <4 x i32> %153, i32 %105, i64 1, !dbg !35 + %155 = insertelement <4 x i32> %154, i32 %106, i64 2, !dbg !35 + %156 = insertelement <4 x i32> %155, i32 %107, i64 3, !dbg !35 + store <4 x i32> %156, ptr addrspace(3) %152, align 16, !dbg !35 + %157 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 6144, !dbg !35 + %158 = insertelement <4 x i32> poison, i32 %109, i64 0, !dbg !35 + %159 = insertelement <4 x i32> %158, i32 %110, i64 1, !dbg !35 + %160 = insertelement <4 x i32> %159, i32 %111, i64 2, !dbg !35 + %161 = insertelement <4 x i32> %160, i32 %112, i64 3, !dbg !35 + store <4 x i32> %161, ptr addrspace(3) %157, align 16, !dbg !35 + %162 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 8192, !dbg !35 + %163 = insertelement <4 x i32> poison, i32 %114, i64 0, !dbg !35 + %164 = insertelement <4 x i32> %163, i32 %115, i64 1, !dbg !35 + %165 = insertelement <4 x i32> %164, i32 %116, i64 2, !dbg !35 + %166 = insertelement <4 x i32> %165, i32 %117, i64 3, !dbg !35 + store <4 x i32> %166, ptr addrspace(3) %162, align 16, !dbg !35 + %167 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 10240, !dbg !35 + %168 = insertelement <4 x i32> poison, i32 %119, i64 0, !dbg !35 + %169 = insertelement <4 x i32> %168, i32 %120, i64 1, !dbg !35 + %170 = insertelement <4 x i32> %169, i32 %121, i64 2, !dbg !35 + %171 = insertelement <4 x i32> %170, i32 %122, i64 3, !dbg !35 + store <4 x i32> %171, ptr addrspace(3) %167, align 16, !dbg !35 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 12288, !dbg !35 + %173 = insertelement <4 x i32> poison, i32 %124, i64 0, !dbg !35 + %174 = insertelement <4 x i32> %173, i32 %125, i64 1, !dbg !35 + %175 = insertelement <4 x i32> %174, i32 %126, i64 2, !dbg !35 + %176 = insertelement <4 x i32> %175, i32 %127, i64 3, !dbg !35 + store <4 x i32> %176, ptr addrspace(3) %172, align 16, !dbg !35 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) %142, i32 14336, !dbg !35 + %178 = insertelement <4 x i32> poison, i32 %129, i64 0, !dbg !35 + %179 = insertelement <4 x i32> %178, i32 %130, i64 1, !dbg !35 + %180 = insertelement <4 x i32> %179, i32 %131, i64 2, !dbg !35 + %181 = insertelement <4 x i32> %180, i32 %132, i64 3, !dbg !35 + store <4 x i32> %181, ptr addrspace(3) %177, align 16, !dbg !35 + %182 = sext i32 %34 to i64, !dbg !36 + %183 = getelementptr i32, ptr addrspace(1) %6, i64 %182, !dbg !36 + %184 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %183) #2, !dbg !37 + %185 = shl i32 %184, 7, !dbg !38 + %186 = sext i32 %31 to i64, !dbg !39 + %187 = getelementptr i32, ptr addrspace(1) %5, i64 %186, !dbg !39 + %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %187) #2, !dbg !40 + %189 = shl i32 %188, 1, !dbg !41 + %190 = and i32 %36, 3, !dbg !42 + %191 = zext nneg i32 %15 to i64, !dbg !43 + %192 = getelementptr i64, ptr addrspace(1) %9, i64 %191, !dbg !43 + %193 = icmp sgt i32 %189, 0, !dbg !45 + %194 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %192, i1 %193) #2, !dbg !46 + %195 = sext i32 %56 to i64, !dbg !47 + %196 = sext i32 %57 to i64, !dbg !47 + %197 = icmp sgt i64 %194, %195, !dbg !48 + %198 = icmp sgt i64 %194, %196, !dbg !48 + %199 = or disjoint i32 %185, %39, !dbg !49 + %200 = or disjoint i32 %185, %40, !dbg !49 + %201 = or disjoint i32 %185, %41, !dbg !49 + %202 = or disjoint i32 %185, %42, !dbg !49 + %203 = shl i32 %199, 7, !dbg !50 + %204 = shl i32 %200, 7, !dbg !50 + %205 = shl i32 %201, 7, !dbg !50 + %206 = shl i32 %202, 7, !dbg !50 + %207 = sext i32 %203 to i64, !dbg !51 + %208 = getelementptr bfloat, ptr addrspace(1) %28, i64 %207, !dbg !51 + %209 = sext i32 %204 to i64, !dbg !51 + %210 = getelementptr bfloat, ptr addrspace(1) %28, i64 %209, !dbg !51 + %211 = sext i32 %205 to i64, !dbg !51 + %212 = getelementptr bfloat, ptr addrspace(1) %28, i64 %211, !dbg !51 + %213 = sext i32 %206 to i64, !dbg !51 + %214 = getelementptr bfloat, ptr addrspace(1) %28, i64 %213, !dbg !51 + %215 = getelementptr bfloat, ptr addrspace(1) %208, i64 %84, !dbg !52 + %216 = getelementptr bfloat, ptr addrspace(1) %210, i64 %84, !dbg !52 + %217 = getelementptr bfloat, ptr addrspace(1) %212, i64 %84, !dbg !52 + %218 = getelementptr bfloat, ptr addrspace(1) %214, i64 %84, !dbg !52 + %219 = shl nuw nsw i32 %137, 10, !dbg !53 + %220 = or disjoint i32 %140, %219, !dbg !53 + %221 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %220, !dbg !53 + %222 = select i1 %193, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %221, ptr addrspace(1) %215, i32 %222) #2, !dbg !53 + %223 = or disjoint i32 %220, 2048, !dbg !53 + %224 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %224, ptr addrspace(1) %216, i32 %222) #2, !dbg !53 + %225 = or disjoint i32 %220, 4096, !dbg !53 + %226 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %226, ptr addrspace(1) %217, i32 %222) #2, !dbg !53 + %227 = or disjoint i32 %220, 6144, !dbg !53 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %228, ptr addrspace(1) %218, i32 %222) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %229 = getelementptr bfloat, ptr addrspace(1) %29, i64 %207, !dbg !51 + %230 = getelementptr bfloat, ptr addrspace(1) %29, i64 %209, !dbg !51 + %231 = getelementptr bfloat, ptr addrspace(1) %29, i64 %211, !dbg !51 + %232 = getelementptr bfloat, ptr addrspace(1) %29, i64 %213, !dbg !51 + %233 = getelementptr bfloat, ptr addrspace(1) %229, i64 %84, !dbg !52 + %234 = getelementptr bfloat, ptr addrspace(1) %230, i64 %84, !dbg !52 + %235 = getelementptr bfloat, ptr addrspace(1) %231, i64 %84, !dbg !52 + %236 = getelementptr bfloat, ptr addrspace(1) %232, i64 %84, !dbg !52 + %237 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %237, ptr addrspace(1) %233, i32 %222) #2, !dbg !53 + %238 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %238, ptr addrspace(1) %234, i32 %222) #2, !dbg !53 + %239 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %239, ptr addrspace(1) %235, i32 %222) #2, !dbg !53 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %240, ptr addrspace(1) %236, i32 %222) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %241 = icmp sgt i32 %189, 1, !dbg !45 + %242 = or disjoint i32 %185, 64, !dbg !54 + %243 = or disjoint i32 %242, %39, !dbg !49 + %244 = or disjoint i32 %242, %40, !dbg !49 + %245 = or disjoint i32 %242, %41, !dbg !49 + %246 = or disjoint i32 %242, %42, !dbg !49 + %247 = shl i32 %243, 7, !dbg !50 + %248 = shl i32 %244, 7, !dbg !50 + %249 = shl i32 %245, 7, !dbg !50 + %250 = shl i32 %246, 7, !dbg !50 + %251 = sext i32 %247 to i64, !dbg !51 + %252 = getelementptr bfloat, ptr addrspace(1) %28, i64 %251, !dbg !51 + %253 = sext i32 %248 to i64, !dbg !51 + %254 = getelementptr bfloat, ptr addrspace(1) %28, i64 %253, !dbg !51 + %255 = sext i32 %249 to i64, !dbg !51 + %256 = getelementptr bfloat, ptr addrspace(1) %28, i64 %255, !dbg !51 + %257 = sext i32 %250 to i64, !dbg !51 + %258 = getelementptr bfloat, ptr addrspace(1) %28, i64 %257, !dbg !51 + %259 = getelementptr bfloat, ptr addrspace(1) %252, i64 %84, !dbg !52 + %260 = getelementptr bfloat, ptr addrspace(1) %254, i64 %84, !dbg !52 + %261 = getelementptr bfloat, ptr addrspace(1) %256, i64 %84, !dbg !52 + %262 = getelementptr bfloat, ptr addrspace(1) %258, i64 %84, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %263 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %220, !dbg !53 + %264 = select i1 %241, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %263, ptr addrspace(1) %259, i32 %264) #2, !dbg !53 + %265 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %260, i32 %264) #2, !dbg !53 + %266 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %266, ptr addrspace(1) %261, i32 %264) #2, !dbg !53 + %267 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %262, i32 %264) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %268 = getelementptr bfloat, ptr addrspace(1) %29, i64 %251, !dbg !51 + %269 = getelementptr bfloat, ptr addrspace(1) %29, i64 %253, !dbg !51 + %270 = getelementptr bfloat, ptr addrspace(1) %29, i64 %255, !dbg !51 + %271 = getelementptr bfloat, ptr addrspace(1) %29, i64 %257, !dbg !51 + %272 = getelementptr bfloat, ptr addrspace(1) %268, i64 %84, !dbg !52 + %273 = getelementptr bfloat, ptr addrspace(1) %269, i64 %84, !dbg !52 + %274 = getelementptr bfloat, ptr addrspace(1) %270, i64 %84, !dbg !52 + %275 = getelementptr bfloat, ptr addrspace(1) %271, i64 %84, !dbg !52 + %276 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %276, ptr addrspace(1) %272, i32 %264) #2, !dbg !53 + %277 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %273, i32 %264) #2, !dbg !53 + %278 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %274, i32 %264) #2, !dbg !53 + %279 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %279, ptr addrspace(1) %275, i32 %264) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !55 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %28, i64 %84, !dbg !45 + %invariant.gep335 = getelementptr bfloat, ptr addrspace(1) %29, i64 %84, !dbg !45 + br i1 %193, label %.lr.ph, label %._crit_edge, !dbg !45 + +.lr.ph: ; preds = %13 + %280 = tail call i32 @llvm.umin.i32(i32 %189, i32 32), !dbg !56 + %281 = shl nuw nsw i32 %190, 1, !dbg !42 + %282 = or disjoint i32 %185, %281, !dbg !42 + %283 = insertelement <8 x i32> poison, i32 %282, i64 0, !dbg !57 + %284 = add nsw i32 %280, -2 + %285 = add nsw i32 %280, -1 + %286 = shufflevector <8 x i32> %283, <8 x i32> poison, <16 x i32> + %287 = insertelement <16 x i32> %286, i32 %282, i64 8 + %288 = insertelement <16 x i32> %287, i32 %282, i64 9 + %289 = insertelement <16 x i32> %288, i32 %282, i64 10 + %290 = insertelement <16 x i32> %289, i32 %282, i64 11 + %291 = insertelement <16 x i32> %290, i32 %282, i64 12 + %292 = insertelement <16 x i32> %291, i32 %282, i64 13 + %293 = insertelement <16 x i32> %292, i32 %282, i64 14 + %294 = or disjoint <16 x i32> %293, + %295 = insertelement <16 x i32> %294, i32 %282, i64 15 + %296 = insertelement <32 x i32> poison, i32 %57, i64 0, !dbg !58 + %297 = insertelement <32 x i32> %296, i32 %56, i64 1, !dbg !58 + %298 = shufflevector <32 x i32> %297, <32 x i32> poison, <32 x i32> , !dbg !58 + %299 = insertelement <8 x i64> poison, i64 %194, i64 0, !dbg !59 + %300 = shufflevector <8 x i64> %299, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !59 + br label %301, !dbg !45 + +301: ; preds = %.lr.ph, %__nv_exp2f.exit291 + %302 = phi i32 [ 64, %.lr.ph ], [ %1899, %__nv_exp2f.exit291 ] + %303 = phi i32 [ -1, %.lr.ph ], [ %380, %__nv_exp2f.exit291 ] + %304 = phi i32 [ 1, %.lr.ph ], [ %1903, %__nv_exp2f.exit291 ] + %305 = phi i32 [ 64, %.lr.ph ], [ %1900, %__nv_exp2f.exit291 ] + %306 = phi float [ 0.000000e+00, %.lr.ph ], [ %1466, %__nv_exp2f.exit291 ] + %307 = phi float [ 0.000000e+00, %.lr.ph ], [ %1467, %__nv_exp2f.exit291 ] + %308 = phi float [ 0.000000e+00, %.lr.ph ], [ %1813, %__nv_exp2f.exit291 ] + %309 = phi float [ 0.000000e+00, %.lr.ph ], [ %1814, %__nv_exp2f.exit291 ] + %310 = phi float [ 0.000000e+00, %.lr.ph ], [ %1815, %__nv_exp2f.exit291 ] + %311 = phi float [ 0.000000e+00, %.lr.ph ], [ %1816, %__nv_exp2f.exit291 ] + %312 = phi float [ 0.000000e+00, %.lr.ph ], [ %1817, %__nv_exp2f.exit291 ] + %313 = phi float [ 0.000000e+00, %.lr.ph ], [ %1818, %__nv_exp2f.exit291 ] + %314 = phi float [ 0.000000e+00, %.lr.ph ], [ %1819, %__nv_exp2f.exit291 ] + %315 = phi float [ 0.000000e+00, %.lr.ph ], [ %1820, %__nv_exp2f.exit291 ] + %316 = phi float [ 0.000000e+00, %.lr.ph ], [ %1821, %__nv_exp2f.exit291 ] + %317 = phi float [ 0.000000e+00, %.lr.ph ], [ %1822, %__nv_exp2f.exit291 ] + %318 = phi float [ 0.000000e+00, %.lr.ph ], [ %1823, %__nv_exp2f.exit291 ] + %319 = phi float [ 0.000000e+00, %.lr.ph ], [ %1824, %__nv_exp2f.exit291 ] + %320 = phi float [ 0.000000e+00, %.lr.ph ], [ %1825, %__nv_exp2f.exit291 ] + %321 = phi float [ 0.000000e+00, %.lr.ph ], [ %1826, %__nv_exp2f.exit291 ] + %322 = phi float [ 0.000000e+00, %.lr.ph ], [ %1827, %__nv_exp2f.exit291 ] + %323 = phi float [ 0.000000e+00, %.lr.ph ], [ %1828, %__nv_exp2f.exit291 ] + %324 = phi float [ 0.000000e+00, %.lr.ph ], [ %1829, %__nv_exp2f.exit291 ] + %325 = phi float [ 0.000000e+00, %.lr.ph ], [ %1830, %__nv_exp2f.exit291 ] + %326 = phi float [ 0.000000e+00, %.lr.ph ], [ %1831, %__nv_exp2f.exit291 ] + %327 = phi float [ 0.000000e+00, %.lr.ph ], [ %1832, %__nv_exp2f.exit291 ] + %328 = phi float [ 0.000000e+00, %.lr.ph ], [ %1833, %__nv_exp2f.exit291 ] + %329 = phi float [ 0.000000e+00, %.lr.ph ], [ %1834, %__nv_exp2f.exit291 ] + %330 = phi float [ 0.000000e+00, %.lr.ph ], [ %1835, %__nv_exp2f.exit291 ] + %331 = phi float [ 0.000000e+00, %.lr.ph ], [ %1836, %__nv_exp2f.exit291 ] + %332 = phi float [ 0.000000e+00, %.lr.ph ], [ %1837, %__nv_exp2f.exit291 ] + %333 = phi float [ 0.000000e+00, %.lr.ph ], [ %1838, %__nv_exp2f.exit291 ] + %334 = phi float [ 0.000000e+00, %.lr.ph ], [ %1839, %__nv_exp2f.exit291 ] + %335 = phi float [ 0.000000e+00, %.lr.ph ], [ %1840, %__nv_exp2f.exit291 ] + %336 = phi float [ 0.000000e+00, %.lr.ph ], [ %1841, %__nv_exp2f.exit291 ] + %337 = phi float [ 0.000000e+00, %.lr.ph ], [ %1842, %__nv_exp2f.exit291 ] + %338 = phi float [ 0.000000e+00, %.lr.ph ], [ %1843, %__nv_exp2f.exit291 ] + %339 = phi float [ 0.000000e+00, %.lr.ph ], [ %1844, %__nv_exp2f.exit291 ] + %340 = phi float [ 0.000000e+00, %.lr.ph ], [ %1845, %__nv_exp2f.exit291 ] + %341 = phi float [ 0.000000e+00, %.lr.ph ], [ %1846, %__nv_exp2f.exit291 ] + %342 = phi float [ 0.000000e+00, %.lr.ph ], [ %1847, %__nv_exp2f.exit291 ] + %343 = phi float [ 0.000000e+00, %.lr.ph ], [ %1848, %__nv_exp2f.exit291 ] + %344 = phi float [ 0.000000e+00, %.lr.ph ], [ %1849, %__nv_exp2f.exit291 ] + %345 = phi float [ 0.000000e+00, %.lr.ph ], [ %1850, %__nv_exp2f.exit291 ] + %346 = phi float [ 0.000000e+00, %.lr.ph ], [ %1851, %__nv_exp2f.exit291 ] + %347 = phi float [ 0.000000e+00, %.lr.ph ], [ %1852, %__nv_exp2f.exit291 ] + %348 = phi float [ 0.000000e+00, %.lr.ph ], [ %1853, %__nv_exp2f.exit291 ] + %349 = phi float [ 0.000000e+00, %.lr.ph ], [ %1854, %__nv_exp2f.exit291 ] + %350 = phi float [ 0.000000e+00, %.lr.ph ], [ %1855, %__nv_exp2f.exit291 ] + %351 = phi float [ 0.000000e+00, %.lr.ph ], [ %1856, %__nv_exp2f.exit291 ] + %352 = phi float [ 0.000000e+00, %.lr.ph ], [ %1857, %__nv_exp2f.exit291 ] + %353 = phi float [ 0.000000e+00, %.lr.ph ], [ %1858, %__nv_exp2f.exit291 ] + %354 = phi float [ 0.000000e+00, %.lr.ph ], [ %1859, %__nv_exp2f.exit291 ] + %355 = phi float [ 0.000000e+00, %.lr.ph ], [ %1860, %__nv_exp2f.exit291 ] + %356 = phi float [ 0.000000e+00, %.lr.ph ], [ %1861, %__nv_exp2f.exit291 ] + %357 = phi float [ 0.000000e+00, %.lr.ph ], [ %1862, %__nv_exp2f.exit291 ] + %358 = phi float [ 0.000000e+00, %.lr.ph ], [ %1863, %__nv_exp2f.exit291 ] + %359 = phi float [ 0.000000e+00, %.lr.ph ], [ %1864, %__nv_exp2f.exit291 ] + %360 = phi float [ 0.000000e+00, %.lr.ph ], [ %1865, %__nv_exp2f.exit291 ] + %361 = phi float [ 0.000000e+00, %.lr.ph ], [ %1866, %__nv_exp2f.exit291 ] + %362 = phi float [ 0.000000e+00, %.lr.ph ], [ %1867, %__nv_exp2f.exit291 ] + %363 = phi float [ 0.000000e+00, %.lr.ph ], [ %1868, %__nv_exp2f.exit291 ] + %364 = phi float [ 0.000000e+00, %.lr.ph ], [ %1869, %__nv_exp2f.exit291 ] + %365 = phi float [ 0.000000e+00, %.lr.ph ], [ %1870, %__nv_exp2f.exit291 ] + %366 = phi float [ 0.000000e+00, %.lr.ph ], [ %1871, %__nv_exp2f.exit291 ] + %367 = phi float [ 0.000000e+00, %.lr.ph ], [ %1872, %__nv_exp2f.exit291 ] + %368 = phi float [ 0.000000e+00, %.lr.ph ], [ %1873, %__nv_exp2f.exit291 ] + %369 = phi float [ 0.000000e+00, %.lr.ph ], [ %1874, %__nv_exp2f.exit291 ] + %370 = phi float [ 0.000000e+00, %.lr.ph ], [ %1875, %__nv_exp2f.exit291 ] + %371 = phi float [ 0.000000e+00, %.lr.ph ], [ %1876, %__nv_exp2f.exit291 ] + %372 = phi i32 [ 0, %.lr.ph ], [ %1880, %__nv_exp2f.exit291 ] + %373 = phi <2 x float> [ splat (float 0xFFF0000000000000), %.lr.ph ], [ %1205, %__nv_exp2f.exit291 ] + %374 = phi <16 x i32> [ %295, %.lr.ph ], [ %1879, %__nv_exp2f.exit291 ] + %375 = shufflevector <16 x i32> %374, <16 x i32> poison, <32 x i32> + %376 = icmp slt i32 %372, %284, !dbg !45 + %377 = icmp slt i32 %372, %285, !dbg !45 + %378 = add i32 %303, 1, !dbg !45 + %379 = icmp sgt i32 %378, 2, !dbg !45 + %380 = select i1 %379, i32 0, i32 %378, !dbg !45 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %381 = shl i32 %380, 13, !dbg !53 + %382 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %381, !dbg !53 + %383 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %37, i32 0, i32 31), !dbg !55 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !55 + %384 = shl i32 %383, 11, !dbg !55 + %385 = and i32 %384, 8192, !dbg !55 + %386 = add i32 %385, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !55 + %387 = lshr exact i32 %386, 4, !dbg !55 + %388 = and i32 %387, 16383, !dbg !55 + %389 = zext nneg i32 %388 to i64, !dbg !55 + %390 = or disjoint i64 %389, 4611686293372403712, !dbg !55 + %391 = ptrtoint ptr addrspace(3) %382 to i32, !dbg !55 + %392 = lshr exact i32 %391, 4, !dbg !55 + %393 = and i32 %392, 16383, !dbg !55 + %394 = zext nneg i32 %393 to i64, !dbg !55 + %395 = or disjoint i64 %394, 4611686293338849280, !dbg !55 + %396 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %390, i64 %395) #2, !dbg !55 + %397 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !55 + %398 = lshr exact i32 %397, 4, !dbg !55 + %399 = and i32 %398, 16383, !dbg !55 + %400 = zext nneg i32 %399 to i64, !dbg !55 + %401 = or disjoint i64 %400, 4611686293372403712, !dbg !55 + %402 = add i32 %391, 32, !dbg !55 + %403 = lshr exact i32 %402, 4, !dbg !55 + %404 = and i32 %403, 16383, !dbg !55 + %405 = zext nneg i32 %404 to i64, !dbg !55 + %406 = or disjoint i64 %405, 4611686293338849280, !dbg !55 + %407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 0, !dbg !55 + %408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 1, !dbg !55 + %409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 2, !dbg !55 + %410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 3, !dbg !55 + %411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 4, !dbg !55 + %412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 5, !dbg !55 + %413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 6, !dbg !55 + %414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 7, !dbg !55 + %415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 8, !dbg !55 + %416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 9, !dbg !55 + %417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 10, !dbg !55 + %418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 11, !dbg !55 + %419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 12, !dbg !55 + %420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 13, !dbg !55 + %421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 14, !dbg !55 + %422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 15, !dbg !55 + %423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 16, !dbg !55 + %424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 17, !dbg !55 + %425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 18, !dbg !55 + %426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 19, !dbg !55 + %427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 20, !dbg !55 + %428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 21, !dbg !55 + %429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 22, !dbg !55 + %430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 23, !dbg !55 + %431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 24, !dbg !55 + %432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 25, !dbg !55 + %433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 26, !dbg !55 + %434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 27, !dbg !55 + %435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 28, !dbg !55 + %436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 29, !dbg !55 + %437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 30, !dbg !55 + %438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %396, 31, !dbg !55 + %439 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %407, float %408, float %409, float %410, float %411, float %412, float %413, float %414, float %415, float %416, float %417, float %418, float %419, float %420, float %421, float %422, float %423, float %424, float %425, float %426, float %427, float %428, float %429, float %430, float %431, float %432, float %433, float %434, float %435, float %436, float %437, float %438, i64 %401, i64 %406, i1 true) #2, !dbg !55 + %440 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !55 + %441 = lshr exact i32 %440, 4, !dbg !55 + %442 = and i32 %441, 16383, !dbg !55 + %443 = zext nneg i32 %442 to i64, !dbg !55 + %444 = or disjoint i64 %443, 4611686293372403712, !dbg !55 + %445 = add i32 %391, 64, !dbg !55 + %446 = lshr exact i32 %445, 4, !dbg !55 + %447 = and i32 %446, 16383, !dbg !55 + %448 = zext nneg i32 %447 to i64, !dbg !55 + %449 = or disjoint i64 %448, 4611686293338849280, !dbg !55 + %450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 0, !dbg !55 + %451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 1, !dbg !55 + %452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 2, !dbg !55 + %453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 3, !dbg !55 + %454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 4, !dbg !55 + %455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 5, !dbg !55 + %456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 6, !dbg !55 + %457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 7, !dbg !55 + %458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 8, !dbg !55 + %459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 9, !dbg !55 + %460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 10, !dbg !55 + %461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 11, !dbg !55 + %462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 12, !dbg !55 + %463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 13, !dbg !55 + %464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 14, !dbg !55 + %465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 15, !dbg !55 + %466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 16, !dbg !55 + %467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 17, !dbg !55 + %468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 18, !dbg !55 + %469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 19, !dbg !55 + %470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 20, !dbg !55 + %471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 21, !dbg !55 + %472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 22, !dbg !55 + %473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 23, !dbg !55 + %474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 24, !dbg !55 + %475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 25, !dbg !55 + %476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 26, !dbg !55 + %477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 27, !dbg !55 + %478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 28, !dbg !55 + %479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 29, !dbg !55 + %480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 30, !dbg !55 + %481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %439, 31, !dbg !55 + %482 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %450, float %451, float %452, float %453, float %454, float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, i64 %444, i64 %449, i1 true) #2, !dbg !55 + %483 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !55 + %484 = lshr exact i32 %483, 4, !dbg !55 + %485 = and i32 %484, 16383, !dbg !55 + %486 = zext nneg i32 %485 to i64, !dbg !55 + %487 = or disjoint i64 %486, 4611686293372403712, !dbg !55 + %488 = add i32 %391, 96, !dbg !55 + %489 = lshr exact i32 %488, 4, !dbg !55 + %490 = and i32 %489, 16383, !dbg !55 + %491 = zext nneg i32 %490 to i64, !dbg !55 + %492 = or disjoint i64 %491, 4611686293338849280, !dbg !55 + %493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 0, !dbg !55 + %494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 1, !dbg !55 + %495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 2, !dbg !55 + %496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 3, !dbg !55 + %497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 4, !dbg !55 + %498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 5, !dbg !55 + %499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 6, !dbg !55 + %500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 7, !dbg !55 + %501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 8, !dbg !55 + %502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 9, !dbg !55 + %503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 10, !dbg !55 + %504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 11, !dbg !55 + %505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 12, !dbg !55 + %506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 13, !dbg !55 + %507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 14, !dbg !55 + %508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 15, !dbg !55 + %509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 16, !dbg !55 + %510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 17, !dbg !55 + %511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 18, !dbg !55 + %512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 19, !dbg !55 + %513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 20, !dbg !55 + %514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 21, !dbg !55 + %515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 22, !dbg !55 + %516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 23, !dbg !55 + %517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 24, !dbg !55 + %518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 25, !dbg !55 + %519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 26, !dbg !55 + %520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 27, !dbg !55 + %521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 28, !dbg !55 + %522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 29, !dbg !55 + %523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 30, !dbg !55 + %524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %482, 31, !dbg !55 + %525 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %493, float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, i64 %487, i64 %492, i1 true) #2, !dbg !55 + %526 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !55 + %527 = lshr exact i32 %526, 4, !dbg !55 + %528 = and i32 %527, 16383, !dbg !55 + %529 = zext nneg i32 %528 to i64, !dbg !55 + %530 = or disjoint i64 %529, 4611686293372403712, !dbg !55 + %531 = add i32 %391, 8192, !dbg !55 + %532 = lshr exact i32 %531, 4, !dbg !55 + %533 = and i32 %532, 16383, !dbg !55 + %534 = zext nneg i32 %533 to i64, !dbg !55 + %535 = or disjoint i64 %534, 4611686293338849280, !dbg !55 + %536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 0, !dbg !55 + %537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 1, !dbg !55 + %538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 2, !dbg !55 + %539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 3, !dbg !55 + %540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 4, !dbg !55 + %541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 5, !dbg !55 + %542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 6, !dbg !55 + %543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 7, !dbg !55 + %544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 8, !dbg !55 + %545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 9, !dbg !55 + %546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 10, !dbg !55 + %547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 11, !dbg !55 + %548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 12, !dbg !55 + %549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 13, !dbg !55 + %550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 14, !dbg !55 + %551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 15, !dbg !55 + %552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 16, !dbg !55 + %553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 17, !dbg !55 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 18, !dbg !55 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 19, !dbg !55 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 20, !dbg !55 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 21, !dbg !55 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 22, !dbg !55 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 23, !dbg !55 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 24, !dbg !55 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 25, !dbg !55 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 26, !dbg !55 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 27, !dbg !55 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 28, !dbg !55 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 29, !dbg !55 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 30, !dbg !55 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %525, 31, !dbg !55 + %568 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %536, float %537, float %538, float %539, float %540, float %541, float %542, float %543, float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, i64 %530, i64 %535, i1 true) #2, !dbg !55 + %569 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !55 + %570 = lshr exact i32 %569, 4, !dbg !55 + %571 = and i32 %570, 16383, !dbg !55 + %572 = zext nneg i32 %571 to i64, !dbg !55 + %573 = or disjoint i64 %572, 4611686293372403712, !dbg !55 + %574 = add i32 %391, 8224, !dbg !55 + %575 = lshr exact i32 %574, 4, !dbg !55 + %576 = and i32 %575, 16383, !dbg !55 + %577 = zext nneg i32 %576 to i64, !dbg !55 + %578 = or disjoint i64 %577, 4611686293338849280, !dbg !55 + %579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 0, !dbg !55 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 1, !dbg !55 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 2, !dbg !55 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 3, !dbg !55 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 4, !dbg !55 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 5, !dbg !55 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 6, !dbg !55 + %586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 7, !dbg !55 + %587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 8, !dbg !55 + %588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 9, !dbg !55 + %589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 10, !dbg !55 + %590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 11, !dbg !55 + %591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 12, !dbg !55 + %592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 13, !dbg !55 + %593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 14, !dbg !55 + %594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 15, !dbg !55 + %595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 16, !dbg !55 + %596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 17, !dbg !55 + %597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 18, !dbg !55 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 19, !dbg !55 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 20, !dbg !55 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 21, !dbg !55 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 22, !dbg !55 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 23, !dbg !55 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 24, !dbg !55 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 25, !dbg !55 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 26, !dbg !55 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 27, !dbg !55 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 28, !dbg !55 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 29, !dbg !55 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 30, !dbg !55 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %568, 31, !dbg !55 + %611 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %579, float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, i64 %573, i64 %578, i1 true) #2, !dbg !55 + %612 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !55 + %613 = lshr exact i32 %612, 4, !dbg !55 + %614 = and i32 %613, 16383, !dbg !55 + %615 = zext nneg i32 %614 to i64, !dbg !55 + %616 = or disjoint i64 %615, 4611686293372403712, !dbg !55 + %617 = add i32 %391, 8256, !dbg !55 + %618 = lshr exact i32 %617, 4, !dbg !55 + %619 = and i32 %618, 16383, !dbg !55 + %620 = zext nneg i32 %619 to i64, !dbg !55 + %621 = or disjoint i64 %620, 4611686293338849280, !dbg !55 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 0, !dbg !55 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 1, !dbg !55 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 2, !dbg !55 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 3, !dbg !55 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 4, !dbg !55 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 5, !dbg !55 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 6, !dbg !55 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 7, !dbg !55 + %630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 8, !dbg !55 + %631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 9, !dbg !55 + %632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 10, !dbg !55 + %633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 11, !dbg !55 + %634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 12, !dbg !55 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 13, !dbg !55 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 14, !dbg !55 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 15, !dbg !55 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 16, !dbg !55 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 17, !dbg !55 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 18, !dbg !55 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 19, !dbg !55 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 20, !dbg !55 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 21, !dbg !55 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 22, !dbg !55 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 23, !dbg !55 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 24, !dbg !55 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 25, !dbg !55 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 26, !dbg !55 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 27, !dbg !55 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 28, !dbg !55 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 29, !dbg !55 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 30, !dbg !55 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %611, 31, !dbg !55 + %654 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %622, float %623, float %624, float %625, float %626, float %627, float %628, float %629, float %630, float %631, float %632, float %633, float %634, float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, i64 %616, i64 %621, i1 true) #2, !dbg !55 + %655 = add i32 %385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !55 + %656 = lshr exact i32 %655, 4, !dbg !55 + %657 = and i32 %656, 16383, !dbg !55 + %658 = zext nneg i32 %657 to i64, !dbg !55 + %659 = or disjoint i64 %658, 4611686293372403712, !dbg !55 + %660 = add i32 %391, 8288, !dbg !55 + %661 = lshr exact i32 %660, 4, !dbg !55 + %662 = and i32 %661, 16383, !dbg !55 + %663 = zext nneg i32 %662 to i64, !dbg !55 + %664 = or disjoint i64 %663, 4611686293338849280, !dbg !55 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 0, !dbg !55 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 1, !dbg !55 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 2, !dbg !55 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 3, !dbg !55 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 4, !dbg !55 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 5, !dbg !55 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 6, !dbg !55 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 7, !dbg !55 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 8, !dbg !55 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 9, !dbg !55 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 10, !dbg !55 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 11, !dbg !55 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 12, !dbg !55 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 13, !dbg !55 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 14, !dbg !55 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 15, !dbg !55 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 16, !dbg !55 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 17, !dbg !55 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 18, !dbg !55 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 19, !dbg !55 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 20, !dbg !55 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 21, !dbg !55 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 22, !dbg !55 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 23, !dbg !55 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 24, !dbg !55 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 25, !dbg !55 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 26, !dbg !55 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 27, !dbg !55 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 28, !dbg !55 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 29, !dbg !55 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 30, !dbg !55 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 31, !dbg !55 + %697 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %665, float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, i64 %659, i64 %664, i1 true) #2, !dbg !55 + %698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 0, !dbg !55 + %699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 1, !dbg !55 + %700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 2, !dbg !55 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 3, !dbg !55 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 4, !dbg !55 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 5, !dbg !55 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 6, !dbg !55 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 7, !dbg !55 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 8, !dbg !55 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 9, !dbg !55 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 10, !dbg !55 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 11, !dbg !55 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 12, !dbg !55 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 13, !dbg !55 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 14, !dbg !55 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 15, !dbg !55 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 16, !dbg !55 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 17, !dbg !55 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 18, !dbg !55 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 19, !dbg !55 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 20, !dbg !55 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 21, !dbg !55 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 22, !dbg !55 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 23, !dbg !55 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 24, !dbg !55 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 25, !dbg !55 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 26, !dbg !55 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 27, !dbg !55 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 28, !dbg !55 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 29, !dbg !55 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 30, !dbg !55 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %697, 31, !dbg !55 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !55 + %730 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %698, float %699, float %700, float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %382, i32 0, i32 0, float %308, float %309, float %310, float %311, float %312, float %313, float %314, float %315, float %316, float %317, float %318, float %319, float %320, float %321, float %322, float %323, float %324, float %325, float %326, float %327, float %328, float %329, float %330, float %331, float %332, float %333, float %334, float %335, float %336, float %337, float %338, float %339, float %340, float %341, float %342, float %343, float %344, float %345, float %346, float %347, float %348, float %349, float %350, float %351, float %352, float %353, float %354, float %355, float %356, float %357, float %358, float %359, float %360, float %361, float %362, float %363, float %364, float %365, float %366, float %367, float %368, float %369, float %370, float %371) #2, !dbg !55 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 0, !dbg !55 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 1, !dbg !55 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 2, !dbg !55 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 3, !dbg !55 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 4, !dbg !55 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 5, !dbg !55 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 6, !dbg !55 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 7, !dbg !55 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 8, !dbg !55 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 9, !dbg !55 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 10, !dbg !55 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 11, !dbg !55 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 12, !dbg !55 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 13, !dbg !55 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 14, !dbg !55 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 15, !dbg !55 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 16, !dbg !55 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 17, !dbg !55 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 18, !dbg !55 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 19, !dbg !55 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 20, !dbg !55 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 21, !dbg !55 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 22, !dbg !55 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 23, !dbg !55 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 24, !dbg !55 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 25, !dbg !55 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 26, !dbg !55 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 27, !dbg !55 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 28, !dbg !55 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 29, !dbg !55 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 30, !dbg !55 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 31, !dbg !55 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 38, !dbg !55 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 39, !dbg !55 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 40, !dbg !55 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 41, !dbg !55 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 42, !dbg !55 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 43, !dbg !55 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 44, !dbg !55 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 45, !dbg !55 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 46, !dbg !55 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 47, !dbg !55 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 48, !dbg !55 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 49, !dbg !55 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 50, !dbg !55 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 51, !dbg !55 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 52, !dbg !55 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 53, !dbg !55 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 54, !dbg !55 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 55, !dbg !55 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 56, !dbg !55 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 57, !dbg !55 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 58, !dbg !55 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 59, !dbg !55 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 60, !dbg !55 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 61, !dbg !55 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 62, !dbg !55 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 63, !dbg !55 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 64, !dbg !55 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 65, !dbg !55 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 66, !dbg !55 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 67, !dbg !55 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 68, !dbg !55 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 69, !dbg !55 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 70, !dbg !55 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 71, !dbg !55 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 72, !dbg !55 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 73, !dbg !55 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 74, !dbg !55 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 75, !dbg !55 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 76, !dbg !55 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 77, !dbg !55 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 78, !dbg !55 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 79, !dbg !55 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 80, !dbg !55 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 81, !dbg !55 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 82, !dbg !55 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 83, !dbg !55 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 84, !dbg !55 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 85, !dbg !55 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 86, !dbg !55 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 87, !dbg !55 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 88, !dbg !55 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 89, !dbg !55 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 90, !dbg !55 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 91, !dbg !55 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 92, !dbg !55 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 93, !dbg !55 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 94, !dbg !55 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 95, !dbg !55 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 96, !dbg !55 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 97, !dbg !55 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 98, !dbg !55 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 99, !dbg !55 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 100, !dbg !55 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %730, 101, !dbg !55 + %827 = fmul float %731, 0x3FB6A09E60000000, !dbg !60 + %828 = fmul float %732, 0x3FB6A09E60000000, !dbg !60 + %829 = fmul float %733, 0x3FB6A09E60000000, !dbg !60 + %830 = fmul float %734, 0x3FB6A09E60000000, !dbg !60 + %831 = fmul float %735, 0x3FB6A09E60000000, !dbg !60 + %832 = fmul float %736, 0x3FB6A09E60000000, !dbg !60 + %833 = fmul float %737, 0x3FB6A09E60000000, !dbg !60 + %834 = fmul float %738, 0x3FB6A09E60000000, !dbg !60 + %835 = fmul float %739, 0x3FB6A09E60000000, !dbg !60 + %836 = fmul float %740, 0x3FB6A09E60000000, !dbg !60 + %837 = fmul float %741, 0x3FB6A09E60000000, !dbg !60 + %838 = fmul float %742, 0x3FB6A09E60000000, !dbg !60 + %839 = fmul float %743, 0x3FB6A09E60000000, !dbg !60 + %840 = fmul float %744, 0x3FB6A09E60000000, !dbg !60 + %841 = fmul float %745, 0x3FB6A09E60000000, !dbg !60 + %842 = fmul float %746, 0x3FB6A09E60000000, !dbg !60 + %843 = fmul float %747, 0x3FB6A09E60000000, !dbg !60 + %844 = fmul float %748, 0x3FB6A09E60000000, !dbg !60 + %845 = fmul float %749, 0x3FB6A09E60000000, !dbg !60 + %846 = fmul float %750, 0x3FB6A09E60000000, !dbg !60 + %847 = fmul float %751, 0x3FB6A09E60000000, !dbg !60 + %848 = fmul float %752, 0x3FB6A09E60000000, !dbg !60 + %849 = fmul float %753, 0x3FB6A09E60000000, !dbg !60 + %850 = fmul float %754, 0x3FB6A09E60000000, !dbg !60 + %851 = fmul float %755, 0x3FB6A09E60000000, !dbg !60 + %852 = fmul float %756, 0x3FB6A09E60000000, !dbg !60 + %853 = fmul float %757, 0x3FB6A09E60000000, !dbg !60 + %854 = fmul float %758, 0x3FB6A09E60000000, !dbg !60 + %855 = fmul float %759, 0x3FB6A09E60000000, !dbg !60 + %856 = fmul float %760, 0x3FB6A09E60000000, !dbg !60 + %857 = fmul float %761, 0x3FB6A09E60000000, !dbg !60 + %858 = fmul float %762, 0x3FB6A09E60000000, !dbg !60 + %859 = extractelement <16 x i32> %374, i64 15, !dbg !61 + %860 = icmp sge i32 %56, %859, !dbg !61 + %861 = extractelement <16 x i32> %374, i64 14, !dbg !62 + %862 = icmp sge i32 %56, %861, !dbg !61 + %863 = icmp sge i32 %57, %859, !dbg !61 + %864 = icmp sge i32 %57, %861, !dbg !61 + %865 = extractelement <16 x i32> %374, i64 13, !dbg !61 + %866 = icmp sge i32 %56, %865, !dbg !61 + %867 = extractelement <16 x i32> %374, i64 12, !dbg !62 + %868 = icmp sge i32 %56, %867, !dbg !61 + %869 = icmp sge i32 %57, %865, !dbg !61 + %870 = icmp sge i32 %57, %867, !dbg !61 + %871 = extractelement <16 x i32> %374, i64 11, !dbg !61 + %872 = icmp sge i32 %56, %871, !dbg !61 + %873 = extractelement <16 x i32> %374, i64 10, !dbg !62 + %874 = icmp sge i32 %56, %873, !dbg !61 + %875 = icmp sge i32 %57, %871, !dbg !61 + %876 = icmp sge i32 %57, %873, !dbg !61 + %877 = extractelement <16 x i32> %374, i64 9, !dbg !61 + %878 = icmp sge i32 %56, %877, !dbg !61 + %879 = extractelement <16 x i32> %374, i64 8, !dbg !62 + %880 = icmp sge i32 %56, %879, !dbg !61 + %881 = icmp sge i32 %57, %877, !dbg !61 + %882 = icmp sge i32 %57, %879, !dbg !61 + %883 = extractelement <16 x i32> %374, i64 7, !dbg !61 + %884 = icmp sge i32 %56, %883, !dbg !61 + %885 = extractelement <16 x i32> %374, i64 6, !dbg !62 + %886 = icmp sge i32 %56, %885, !dbg !61 + %887 = icmp sge i32 %57, %883, !dbg !61 + %888 = icmp sge i32 %57, %885, !dbg !61 + %889 = extractelement <16 x i32> %374, i64 5, !dbg !61 + %890 = icmp sge i32 %56, %889, !dbg !61 + %891 = extractelement <16 x i32> %374, i64 4, !dbg !62 + %892 = icmp sge i32 %56, %891, !dbg !61 + %893 = icmp sge i32 %57, %889, !dbg !61 + %894 = icmp sge i32 %57, %891, !dbg !61 + %895 = extractelement <16 x i32> %374, i64 3, !dbg !61 + %896 = icmp sge i32 %56, %895, !dbg !61 + %897 = extractelement <16 x i32> %374, i64 2, !dbg !62 + %898 = icmp sge i32 %56, %897, !dbg !61 + %899 = icmp sge i32 %57, %895, !dbg !61 + %900 = icmp sge i32 %57, %897, !dbg !61 + %901 = extractelement <16 x i32> %374, i64 1, !dbg !61 + %902 = icmp sge i32 %56, %901, !dbg !61 + %903 = extractelement <16 x i32> %374, i64 0, !dbg !62 + %904 = icmp sge i32 %56, %903, !dbg !61 + %905 = icmp sge i32 %57, %901, !dbg !61 + %906 = icmp sge i32 %57, %903, !dbg !61 + %907 = and i1 %197, %860, !dbg !63 + %908 = and i1 %197, %862, !dbg !63 + %909 = and i1 %198, %863, !dbg !63 + %910 = and i1 %198, %864, !dbg !63 + %911 = and i1 %197, %866, !dbg !63 + %912 = and i1 %197, %868, !dbg !63 + %913 = and i1 %198, %869, !dbg !63 + %914 = and i1 %198, %870, !dbg !63 + %915 = and i1 %197, %872, !dbg !63 + %916 = and i1 %197, %874, !dbg !63 + %917 = and i1 %198, %875, !dbg !63 + %918 = and i1 %198, %876, !dbg !63 + %919 = and i1 %197, %878, !dbg !63 + %920 = and i1 %197, %880, !dbg !63 + %921 = and i1 %198, %881, !dbg !63 + %922 = and i1 %198, %882, !dbg !63 + %923 = and i1 %197, %884, !dbg !63 + %924 = and i1 %197, %886, !dbg !63 + %925 = and i1 %198, %887, !dbg !63 + %926 = and i1 %198, %888, !dbg !63 + %927 = and i1 %197, %890, !dbg !63 + %928 = and i1 %197, %892, !dbg !63 + %929 = and i1 %198, %893, !dbg !63 + %930 = and i1 %198, %894, !dbg !63 + %931 = and i1 %197, %896, !dbg !63 + %932 = and i1 %197, %898, !dbg !63 + %933 = and i1 %198, %899, !dbg !63 + %934 = and i1 %198, %900, !dbg !63 + %935 = and i1 %197, %902, !dbg !63 + %936 = and i1 %197, %904, !dbg !63 + %937 = and i1 %198, %905, !dbg !63 + %938 = and i1 %198, %906, !dbg !63 + %939 = icmp sgt i32 %859, 2047, !dbg !64 + %940 = icmp sgt i32 %865, 2047, !dbg !64 + %941 = icmp sgt i32 %871, 2047, !dbg !64 + %942 = icmp sgt i32 %877, 2047, !dbg !64 + %943 = icmp sgt i32 %883, 2047, !dbg !64 + %944 = icmp sgt i32 %889, 2047, !dbg !64 + %945 = icmp sgt i32 %895, 2047, !dbg !64 + %946 = icmp sgt i32 %901, 2047, !dbg !64 + %947 = shufflevector <16 x i32> %374, <16 x i32> poison, <8 x i32> , !dbg !62 + %948 = srem <8 x i32> %947, splat (i32 2048), !dbg !62 + %949 = icmp ne <8 x i32> %948, zeroinitializer, !dbg !65 + %950 = shufflevector <16 x i32> %374, <16 x i32> poison, <8 x i32> , !dbg !66 + %951 = and <8 x i32> %950, splat (i32 -2147481601), !dbg !66 + %952 = icmp ugt <8 x i32> %951, splat (i32 -2147483648), !dbg !66 + %953 = trunc <8 x i32> %950 to <8 x i16>, !dbg !67 + %954 = and <8 x i16> %953, splat (i16 2047), !dbg !67 + %955 = zext nneg <8 x i16> %954 to <8 x i64>, !dbg !59 + %956 = icmp sgt <8 x i64> %300, %955, !dbg !59 + %957 = and <8 x i1> %952, %949, !dbg !68 + %958 = add nsw <8 x i32> %948, splat (i32 2048), !dbg !69 + %959 = select <8 x i1> %957, <8 x i32> %958, <8 x i32> %948, !dbg !70 + %960 = sext <8 x i32> %959 to <8 x i64>, !dbg !59 + %961 = icmp sgt <8 x i64> %300, %960, !dbg !59 + %962 = extractelement <8 x i1> %956, i64 7, !dbg !71 + %963 = and i1 %939, %962, !dbg !71 + %964 = extractelement <8 x i1> %961, i64 7, !dbg !71 + %965 = and i1 %939, %964, !dbg !71 + %966 = extractelement <8 x i1> %956, i64 6, !dbg !71 + %967 = and i1 %940, %966, !dbg !71 + %968 = extractelement <8 x i1> %961, i64 6, !dbg !71 + %969 = and i1 %940, %968, !dbg !71 + %970 = extractelement <8 x i1> %956, i64 5, !dbg !71 + %971 = and i1 %941, %970, !dbg !71 + %972 = extractelement <8 x i1> %961, i64 5, !dbg !71 + %973 = and i1 %941, %972, !dbg !71 + %974 = extractelement <8 x i1> %956, i64 4, !dbg !71 + %975 = and i1 %942, %974, !dbg !71 + %976 = extractelement <8 x i1> %961, i64 4, !dbg !71 + %977 = and i1 %942, %976, !dbg !71 + %978 = extractelement <8 x i1> %956, i64 3, !dbg !71 + %979 = and i1 %943, %978, !dbg !71 + %980 = extractelement <8 x i1> %961, i64 3, !dbg !71 + %981 = and i1 %943, %980, !dbg !71 + %982 = extractelement <8 x i1> %956, i64 2, !dbg !71 + %983 = and i1 %944, %982, !dbg !71 + %984 = extractelement <8 x i1> %961, i64 2, !dbg !71 + %985 = and i1 %944, %984, !dbg !71 + %986 = extractelement <8 x i1> %956, i64 1, !dbg !71 + %987 = and i1 %945, %986, !dbg !71 + %988 = extractelement <8 x i1> %961, i64 1, !dbg !71 + %989 = and i1 %945, %988, !dbg !71 + %990 = extractelement <8 x i1> %956, i64 0, !dbg !71 + %991 = and i1 %946, %990, !dbg !71 + %992 = extractelement <8 x i1> %961, i64 0, !dbg !71 + %993 = and i1 %946, %992, !dbg !71 + %994 = sub <32 x i32> %375, %298, !dbg !58 + %995 = and <32 x i32> %994, splat (i32 2047), !dbg !72 + %996 = icmp eq <32 x i32> %995, zeroinitializer, !dbg !73 + %997 = extractelement <32 x i1> %996, i64 31, !dbg !74 + %998 = and i1 %997, %963, !dbg !74 + %999 = extractelement <32 x i1> %996, i64 30, !dbg !74 + %1000 = and i1 %999, %965, !dbg !74 + %1001 = extractelement <32 x i1> %996, i64 29, !dbg !74 + %1002 = and i1 %1001, %963, !dbg !74 + %1003 = extractelement <32 x i1> %996, i64 28, !dbg !74 + %1004 = and i1 %1003, %965, !dbg !74 + %1005 = extractelement <32 x i1> %996, i64 27, !dbg !74 + %1006 = and i1 %1005, %967, !dbg !74 + %1007 = extractelement <32 x i1> %996, i64 26, !dbg !74 + %1008 = and i1 %1007, %969, !dbg !74 + %1009 = extractelement <32 x i1> %996, i64 25, !dbg !74 + %1010 = and i1 %1009, %967, !dbg !74 + %1011 = extractelement <32 x i1> %996, i64 24, !dbg !74 + %1012 = and i1 %1011, %969, !dbg !74 + %1013 = extractelement <32 x i1> %996, i64 23, !dbg !74 + %1014 = and i1 %1013, %971, !dbg !74 + %1015 = extractelement <32 x i1> %996, i64 22, !dbg !74 + %1016 = and i1 %1015, %973, !dbg !74 + %1017 = extractelement <32 x i1> %996, i64 21, !dbg !74 + %1018 = and i1 %1017, %971, !dbg !74 + %1019 = extractelement <32 x i1> %996, i64 20, !dbg !74 + %1020 = and i1 %1019, %973, !dbg !74 + %1021 = extractelement <32 x i1> %996, i64 19, !dbg !74 + %1022 = and i1 %1021, %975, !dbg !74 + %1023 = extractelement <32 x i1> %996, i64 18, !dbg !74 + %1024 = and i1 %1023, %977, !dbg !74 + %1025 = extractelement <32 x i1> %996, i64 17, !dbg !74 + %1026 = and i1 %1025, %975, !dbg !74 + %1027 = extractelement <32 x i1> %996, i64 16, !dbg !74 + %1028 = and i1 %1027, %977, !dbg !74 + %1029 = extractelement <32 x i1> %996, i64 15, !dbg !74 + %1030 = and i1 %1029, %979, !dbg !74 + %1031 = extractelement <32 x i1> %996, i64 14, !dbg !74 + %1032 = and i1 %1031, %981, !dbg !74 + %1033 = extractelement <32 x i1> %996, i64 13, !dbg !74 + %1034 = and i1 %1033, %979, !dbg !74 + %1035 = extractelement <32 x i1> %996, i64 12, !dbg !74 + %1036 = and i1 %1035, %981, !dbg !74 + %1037 = extractelement <32 x i1> %996, i64 11, !dbg !74 + %1038 = and i1 %1037, %983, !dbg !74 + %1039 = extractelement <32 x i1> %996, i64 10, !dbg !74 + %1040 = and i1 %1039, %985, !dbg !74 + %1041 = extractelement <32 x i1> %996, i64 9, !dbg !74 + %1042 = and i1 %1041, %983, !dbg !74 + %1043 = extractelement <32 x i1> %996, i64 8, !dbg !74 + %1044 = and i1 %1043, %985, !dbg !74 + %1045 = extractelement <32 x i1> %996, i64 7, !dbg !74 + %1046 = and i1 %1045, %987, !dbg !74 + %1047 = extractelement <32 x i1> %996, i64 6, !dbg !74 + %1048 = and i1 %1047, %989, !dbg !74 + %1049 = extractelement <32 x i1> %996, i64 5, !dbg !74 + %1050 = and i1 %1049, %987, !dbg !74 + %1051 = extractelement <32 x i1> %996, i64 4, !dbg !74 + %1052 = and i1 %1051, %989, !dbg !74 + %1053 = extractelement <32 x i1> %996, i64 3, !dbg !74 + %1054 = and i1 %1053, %991, !dbg !74 + %1055 = extractelement <32 x i1> %996, i64 2, !dbg !74 + %1056 = and i1 %1055, %993, !dbg !74 + %1057 = extractelement <32 x i1> %996, i64 1, !dbg !74 + %1058 = and i1 %1057, %991, !dbg !74 + %1059 = extractelement <32 x i1> %996, i64 0, !dbg !74 + %1060 = and i1 %1059, %993, !dbg !74 + %1061 = or i1 %907, %998, !dbg !75 + %1062 = or i1 %908, %1000, !dbg !75 + %1063 = or i1 %909, %1002, !dbg !75 + %1064 = or i1 %910, %1004, !dbg !75 + %1065 = or i1 %911, %1006, !dbg !75 + %1066 = or i1 %912, %1008, !dbg !75 + %1067 = or i1 %913, %1010, !dbg !75 + %1068 = or i1 %914, %1012, !dbg !75 + %1069 = or i1 %915, %1014, !dbg !75 + %1070 = or i1 %916, %1016, !dbg !75 + %1071 = or i1 %917, %1018, !dbg !75 + %1072 = or i1 %918, %1020, !dbg !75 + %1073 = or i1 %919, %1022, !dbg !75 + %1074 = or i1 %920, %1024, !dbg !75 + %1075 = or i1 %921, %1026, !dbg !75 + %1076 = or i1 %922, %1028, !dbg !75 + %1077 = or i1 %923, %1030, !dbg !75 + %1078 = or i1 %924, %1032, !dbg !75 + %1079 = or i1 %925, %1034, !dbg !75 + %1080 = or i1 %926, %1036, !dbg !75 + %1081 = or i1 %927, %1038, !dbg !75 + %1082 = or i1 %928, %1040, !dbg !75 + %1083 = or i1 %929, %1042, !dbg !75 + %1084 = or i1 %930, %1044, !dbg !75 + %1085 = or i1 %931, %1046, !dbg !75 + %1086 = or i1 %932, %1048, !dbg !75 + %1087 = or i1 %933, %1050, !dbg !75 + %1088 = or i1 %934, %1052, !dbg !75 + %1089 = or i1 %935, %1054, !dbg !75 + %1090 = or i1 %936, %1056, !dbg !75 + %1091 = or i1 %937, %1058, !dbg !75 + %1092 = or i1 %938, %1060, !dbg !75 + %1093 = fmul float %827, 0x3FF7154760000000, !dbg !76 + %1094 = select i1 %1061, float %1093, float 0xFFF0000000000000, !dbg !77 + %1095 = fmul float %828, 0x3FF7154760000000, !dbg !76 + %1096 = select i1 %1062, float %1095, float 0xFFF0000000000000, !dbg !77 + %1097 = fmul float %829, 0x3FF7154760000000, !dbg !76 + %1098 = select i1 %1063, float %1097, float 0xFFF0000000000000, !dbg !77 + %1099 = fmul float %830, 0x3FF7154760000000, !dbg !76 + %1100 = select i1 %1064, float %1099, float 0xFFF0000000000000, !dbg !77 + %1101 = fmul float %831, 0x3FF7154760000000, !dbg !76 + %1102 = select i1 %1065, float %1101, float 0xFFF0000000000000, !dbg !77 + %1103 = fmul float %832, 0x3FF7154760000000, !dbg !76 + %1104 = select i1 %1066, float %1103, float 0xFFF0000000000000, !dbg !77 + %1105 = fmul float %833, 0x3FF7154760000000, !dbg !76 + %1106 = select i1 %1067, float %1105, float 0xFFF0000000000000, !dbg !77 + %1107 = fmul float %834, 0x3FF7154760000000, !dbg !76 + %1108 = select i1 %1068, float %1107, float 0xFFF0000000000000, !dbg !77 + %1109 = fmul float %835, 0x3FF7154760000000, !dbg !76 + %1110 = select i1 %1069, float %1109, float 0xFFF0000000000000, !dbg !77 + %1111 = fmul float %836, 0x3FF7154760000000, !dbg !76 + %1112 = select i1 %1070, float %1111, float 0xFFF0000000000000, !dbg !77 + %1113 = fmul float %837, 0x3FF7154760000000, !dbg !76 + %1114 = select i1 %1071, float %1113, float 0xFFF0000000000000, !dbg !77 + %1115 = fmul float %838, 0x3FF7154760000000, !dbg !76 + %1116 = select i1 %1072, float %1115, float 0xFFF0000000000000, !dbg !77 + %1117 = fmul float %839, 0x3FF7154760000000, !dbg !76 + %1118 = select i1 %1073, float %1117, float 0xFFF0000000000000, !dbg !77 + %1119 = fmul float %840, 0x3FF7154760000000, !dbg !76 + %1120 = select i1 %1074, float %1119, float 0xFFF0000000000000, !dbg !77 + %1121 = fmul float %841, 0x3FF7154760000000, !dbg !76 + %1122 = select i1 %1075, float %1121, float 0xFFF0000000000000, !dbg !77 + %1123 = fmul float %842, 0x3FF7154760000000, !dbg !76 + %1124 = select i1 %1076, float %1123, float 0xFFF0000000000000, !dbg !77 + %1125 = fmul float %843, 0x3FF7154760000000, !dbg !76 + %1126 = select i1 %1077, float %1125, float 0xFFF0000000000000, !dbg !77 + %1127 = fmul float %844, 0x3FF7154760000000, !dbg !76 + %1128 = select i1 %1078, float %1127, float 0xFFF0000000000000, !dbg !77 + %1129 = fmul float %845, 0x3FF7154760000000, !dbg !76 + %1130 = select i1 %1079, float %1129, float 0xFFF0000000000000, !dbg !77 + %1131 = fmul float %846, 0x3FF7154760000000, !dbg !76 + %1132 = select i1 %1080, float %1131, float 0xFFF0000000000000, !dbg !77 + %1133 = fmul float %847, 0x3FF7154760000000, !dbg !76 + %1134 = select i1 %1081, float %1133, float 0xFFF0000000000000, !dbg !77 + %1135 = fmul float %848, 0x3FF7154760000000, !dbg !76 + %1136 = select i1 %1082, float %1135, float 0xFFF0000000000000, !dbg !77 + %1137 = fmul float %849, 0x3FF7154760000000, !dbg !76 + %1138 = select i1 %1083, float %1137, float 0xFFF0000000000000, !dbg !77 + %1139 = fmul float %850, 0x3FF7154760000000, !dbg !76 + %1140 = select i1 %1084, float %1139, float 0xFFF0000000000000, !dbg !77 + %1141 = fmul float %851, 0x3FF7154760000000, !dbg !76 + %1142 = select i1 %1085, float %1141, float 0xFFF0000000000000, !dbg !77 + %1143 = fmul float %852, 0x3FF7154760000000, !dbg !76 + %1144 = select i1 %1086, float %1143, float 0xFFF0000000000000, !dbg !77 + %1145 = fmul float %853, 0x3FF7154760000000, !dbg !76 + %1146 = select i1 %1087, float %1145, float 0xFFF0000000000000, !dbg !77 + %1147 = fmul float %854, 0x3FF7154760000000, !dbg !76 + %1148 = select i1 %1088, float %1147, float 0xFFF0000000000000, !dbg !77 + %1149 = fmul float %855, 0x3FF7154760000000, !dbg !76 + %1150 = select i1 %1089, float %1149, float 0xFFF0000000000000, !dbg !77 + %1151 = fmul float %856, 0x3FF7154760000000, !dbg !76 + %1152 = select i1 %1090, float %1151, float 0xFFF0000000000000, !dbg !77 + %1153 = fmul float %857, 0x3FF7154760000000, !dbg !76 + %1154 = select i1 %1091, float %1153, float 0xFFF0000000000000, !dbg !77 + %1155 = fmul float %858, 0x3FF7154760000000, !dbg !76 + %1156 = select i1 %1092, float %1155, float 0xFFF0000000000000, !dbg !77 + %1157 = tail call float @llvm.maxnum.f32(float %1094, float %1096), !dbg !78 + %1158 = tail call float @llvm.maxnum.f32(float %1098, float %1100), !dbg !78 + %1159 = tail call float @llvm.maxnum.f32(float %1157, float %1102), !dbg !78 + %1160 = tail call float @llvm.maxnum.f32(float %1159, float %1104), !dbg !78 + %1161 = tail call float @llvm.maxnum.f32(float %1158, float %1106), !dbg !78 + %1162 = tail call float @llvm.maxnum.f32(float %1161, float %1108), !dbg !78 + %1163 = tail call float @llvm.maxnum.f32(float %1160, float %1110), !dbg !78 + %1164 = tail call float @llvm.maxnum.f32(float %1163, float %1112), !dbg !78 + %1165 = tail call float @llvm.maxnum.f32(float %1162, float %1114), !dbg !78 + %1166 = tail call float @llvm.maxnum.f32(float %1165, float %1116), !dbg !78 + %1167 = tail call float @llvm.maxnum.f32(float %1164, float %1118), !dbg !78 + %1168 = tail call float @llvm.maxnum.f32(float %1167, float %1120), !dbg !78 + %1169 = tail call float @llvm.maxnum.f32(float %1166, float %1122), !dbg !78 + %1170 = tail call float @llvm.maxnum.f32(float %1169, float %1124), !dbg !78 + %1171 = tail call float @llvm.maxnum.f32(float %1168, float %1126), !dbg !78 + %1172 = tail call float @llvm.maxnum.f32(float %1171, float %1128), !dbg !78 + %1173 = tail call float @llvm.maxnum.f32(float %1170, float %1130), !dbg !78 + %1174 = tail call float @llvm.maxnum.f32(float %1173, float %1132), !dbg !78 + %1175 = tail call float @llvm.maxnum.f32(float %1172, float %1134), !dbg !78 + %1176 = tail call float @llvm.maxnum.f32(float %1175, float %1136), !dbg !78 + %1177 = tail call float @llvm.maxnum.f32(float %1174, float %1138), !dbg !78 + %1178 = tail call float @llvm.maxnum.f32(float %1177, float %1140), !dbg !78 + %1179 = tail call float @llvm.maxnum.f32(float %1176, float %1142), !dbg !78 + %1180 = tail call float @llvm.maxnum.f32(float %1179, float %1144), !dbg !78 + %1181 = tail call float @llvm.maxnum.f32(float %1178, float %1146), !dbg !78 + %1182 = tail call float @llvm.maxnum.f32(float %1181, float %1148), !dbg !78 + %1183 = tail call float @llvm.maxnum.f32(float %1180, float %1150), !dbg !78 + %1184 = tail call float @llvm.maxnum.f32(float %1183, float %1152), !dbg !78 + %1185 = tail call float @llvm.maxnum.f32(float %1182, float %1154), !dbg !78 + %1186 = tail call float @llvm.maxnum.f32(float %1185, float %1156), !dbg !78 + %1187 = bitcast float %1184 to i32, !dbg !81 + %1188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1187, i32 2, i32 31), !dbg !81 + %1189 = bitcast i32 %1188 to float, !dbg !81 + %1190 = bitcast float %1186 to i32, !dbg !81 + %1191 = tail call float @llvm.maxnum.f32(float %1184, float %1189), !dbg !78 + %1192 = bitcast float %1191 to i32, !dbg !81 + %1193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1192, i32 1, i32 31), !dbg !81 + %1194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1190, i32 2, i32 31), !dbg !81 + %1195 = bitcast i32 %1194 to float, !dbg !81 + %1196 = tail call float @llvm.maxnum.f32(float %1186, float %1195), !dbg !78 + %1197 = bitcast float %1196 to i32, !dbg !81 + %1198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1197, i32 1, i32 31), !dbg !81 + %1199 = insertelement <2 x i32> poison, i32 %1193, i64 0, !dbg !81 + %1200 = insertelement <2 x i32> %1199, i32 %1198, i64 1, !dbg !81 + %1201 = bitcast <2 x i32> %1200 to <2 x float>, !dbg !81 + %1202 = insertelement <2 x float> poison, float %1191, i64 0, !dbg !78 + %1203 = insertelement <2 x float> %1202, float %1196, i64 1, !dbg !78 + %1204 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %1203, <2 x float> %1201), !dbg !78 + %1205 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %373, <2 x float> %1204), !dbg !82 + %1206 = extractelement <2 x float> %1205, i64 0, !dbg !83 + %1207 = fcmp oeq float %1206, 0xFFF0000000000000, !dbg !84 + %1208 = extractelement <2 x float> %1205, i64 1, !dbg !83 + %1209 = fcmp oeq float %1208, 0xFFF0000000000000, !dbg !84 + %1210 = select i1 %1207, float 0.000000e+00, float %1206, !dbg !83 + %1211 = select i1 %1209, float 0.000000e+00, float %1208, !dbg !83 + %1212 = extractelement <2 x float> %373, i64 0, !dbg !85 + %1213 = fsub float %1212, %1210, !dbg !85 + %1214 = extractelement <2 x float> %373, i64 1, !dbg !85 + %1215 = fsub float %1214, %1211, !dbg !85 + %1216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !86 + %.not.i190 = icmp eq i32 %1216, 0, !dbg !86 + br i1 %.not.i190, label %1219, label %1217, !dbg !86 + +1217: ; preds = %301 + %1218 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1213) #2, !dbg !86 + br label %__nv_exp2f.exit192, !dbg !86 + +1219: ; preds = %301 + %1220 = tail call float @llvm.nvvm.ex2.approx.f(float %1213) #2, !dbg !86 + br label %__nv_exp2f.exit192, !dbg !86 + +__nv_exp2f.exit192: ; preds = %1217, %1219 + %.0.i191 = phi float [ %1218, %1217 ], [ %1220, %1219 ], !dbg !86 + %1221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !86 + %.not.i193 = icmp eq i32 %1221, 0, !dbg !86 + br i1 %.not.i193, label %1224, label %1222, !dbg !86 + +1222: ; preds = %__nv_exp2f.exit192 + %1223 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1215) #2, !dbg !86 + br label %__nv_exp2f.exit195, !dbg !86 + +1224: ; preds = %__nv_exp2f.exit192 + %1225 = tail call float @llvm.nvvm.ex2.approx.f(float %1215) #2, !dbg !86 + br label %__nv_exp2f.exit195, !dbg !86 + +__nv_exp2f.exit195: ; preds = %1222, %1224 + %.0.i194 = phi float [ %1223, %1222 ], [ %1225, %1224 ], !dbg !86 + %1226 = fsub float %1094, %1210, !dbg !87 + %1227 = fsub float %1096, %1210, !dbg !87 + %1228 = fsub float %1098, %1211, !dbg !87 + %1229 = fsub float %1100, %1211, !dbg !87 + %1230 = fsub float %1102, %1210, !dbg !87 + %1231 = fsub float %1104, %1210, !dbg !87 + %1232 = fsub float %1106, %1211, !dbg !87 + %1233 = fsub float %1108, %1211, !dbg !87 + %1234 = fsub float %1110, %1210, !dbg !87 + %1235 = fsub float %1112, %1210, !dbg !87 + %1236 = fsub float %1114, %1211, !dbg !87 + %1237 = fsub float %1116, %1211, !dbg !87 + %1238 = fsub float %1118, %1210, !dbg !87 + %1239 = fsub float %1120, %1210, !dbg !87 + %1240 = fsub float %1122, %1211, !dbg !87 + %1241 = fsub float %1124, %1211, !dbg !87 + %1242 = fsub float %1126, %1210, !dbg !87 + %1243 = fsub float %1128, %1210, !dbg !87 + %1244 = fsub float %1130, %1211, !dbg !87 + %1245 = fsub float %1132, %1211, !dbg !87 + %1246 = fsub float %1134, %1210, !dbg !87 + %1247 = fsub float %1136, %1210, !dbg !87 + %1248 = fsub float %1138, %1211, !dbg !87 + %1249 = fsub float %1140, %1211, !dbg !87 + %1250 = fsub float %1142, %1210, !dbg !87 + %1251 = fsub float %1144, %1210, !dbg !87 + %1252 = fsub float %1146, %1211, !dbg !87 + %1253 = fsub float %1148, %1211, !dbg !87 + %1254 = fsub float %1150, %1210, !dbg !87 + %1255 = fsub float %1152, %1210, !dbg !87 + %1256 = fsub float %1154, %1211, !dbg !87 + %1257 = fsub float %1156, %1211, !dbg !87 + %1258 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i196 = icmp eq i32 %1258, 0, !dbg !88 + br i1 %.not.i196, label %1261, label %1259, !dbg !88 + +1259: ; preds = %__nv_exp2f.exit195 + %1260 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1226) #2, !dbg !88 + br label %__nv_exp2f.exit198, !dbg !88 + +1261: ; preds = %__nv_exp2f.exit195 + %1262 = tail call float @llvm.nvvm.ex2.approx.f(float %1226) #2, !dbg !88 + br label %__nv_exp2f.exit198, !dbg !88 + +__nv_exp2f.exit198: ; preds = %1259, %1261 + %.0.i197 = phi float [ %1260, %1259 ], [ %1262, %1261 ], !dbg !88 + %1263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i199 = icmp eq i32 %1263, 0, !dbg !88 + br i1 %.not.i199, label %1266, label %1264, !dbg !88 + +1264: ; preds = %__nv_exp2f.exit198 + %1265 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1227) #2, !dbg !88 + br label %__nv_exp2f.exit201, !dbg !88 + +1266: ; preds = %__nv_exp2f.exit198 + %1267 = tail call float @llvm.nvvm.ex2.approx.f(float %1227) #2, !dbg !88 + br label %__nv_exp2f.exit201, !dbg !88 + +__nv_exp2f.exit201: ; preds = %1264, %1266 + %.0.i200 = phi float [ %1265, %1264 ], [ %1267, %1266 ], !dbg !88 + %1268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i202 = icmp eq i32 %1268, 0, !dbg !88 + br i1 %.not.i202, label %1271, label %1269, !dbg !88 + +1269: ; preds = %__nv_exp2f.exit201 + %1270 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1228) #2, !dbg !88 + br label %__nv_exp2f.exit204, !dbg !88 + +1271: ; preds = %__nv_exp2f.exit201 + %1272 = tail call float @llvm.nvvm.ex2.approx.f(float %1228) #2, !dbg !88 + br label %__nv_exp2f.exit204, !dbg !88 + +__nv_exp2f.exit204: ; preds = %1269, %1271 + %.0.i203 = phi float [ %1270, %1269 ], [ %1272, %1271 ], !dbg !88 + %1273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i205 = icmp eq i32 %1273, 0, !dbg !88 + br i1 %.not.i205, label %1276, label %1274, !dbg !88 + +1274: ; preds = %__nv_exp2f.exit204 + %1275 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1229) #2, !dbg !88 + br label %__nv_exp2f.exit207, !dbg !88 + +1276: ; preds = %__nv_exp2f.exit204 + %1277 = tail call float @llvm.nvvm.ex2.approx.f(float %1229) #2, !dbg !88 + br label %__nv_exp2f.exit207, !dbg !88 + +__nv_exp2f.exit207: ; preds = %1274, %1276 + %.0.i206 = phi float [ %1275, %1274 ], [ %1277, %1276 ], !dbg !88 + %1278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i208 = icmp eq i32 %1278, 0, !dbg !88 + br i1 %.not.i208, label %1281, label %1279, !dbg !88 + +1279: ; preds = %__nv_exp2f.exit207 + %1280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1230) #2, !dbg !88 + br label %__nv_exp2f.exit210, !dbg !88 + +1281: ; preds = %__nv_exp2f.exit207 + %1282 = tail call float @llvm.nvvm.ex2.approx.f(float %1230) #2, !dbg !88 + br label %__nv_exp2f.exit210, !dbg !88 + +__nv_exp2f.exit210: ; preds = %1279, %1281 + %.0.i209 = phi float [ %1280, %1279 ], [ %1282, %1281 ], !dbg !88 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i211 = icmp eq i32 %1283, 0, !dbg !88 + br i1 %.not.i211, label %1286, label %1284, !dbg !88 + +1284: ; preds = %__nv_exp2f.exit210 + %1285 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1231) #2, !dbg !88 + br label %__nv_exp2f.exit213, !dbg !88 + +1286: ; preds = %__nv_exp2f.exit210 + %1287 = tail call float @llvm.nvvm.ex2.approx.f(float %1231) #2, !dbg !88 + br label %__nv_exp2f.exit213, !dbg !88 + +__nv_exp2f.exit213: ; preds = %1284, %1286 + %.0.i212 = phi float [ %1285, %1284 ], [ %1287, %1286 ], !dbg !88 + %1288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i214 = icmp eq i32 %1288, 0, !dbg !88 + br i1 %.not.i214, label %1291, label %1289, !dbg !88 + +1289: ; preds = %__nv_exp2f.exit213 + %1290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1232) #2, !dbg !88 + br label %__nv_exp2f.exit216, !dbg !88 + +1291: ; preds = %__nv_exp2f.exit213 + %1292 = tail call float @llvm.nvvm.ex2.approx.f(float %1232) #2, !dbg !88 + br label %__nv_exp2f.exit216, !dbg !88 + +__nv_exp2f.exit216: ; preds = %1289, %1291 + %.0.i215 = phi float [ %1290, %1289 ], [ %1292, %1291 ], !dbg !88 + %1293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i217 = icmp eq i32 %1293, 0, !dbg !88 + br i1 %.not.i217, label %1296, label %1294, !dbg !88 + +1294: ; preds = %__nv_exp2f.exit216 + %1295 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1233) #2, !dbg !88 + br label %__nv_exp2f.exit219, !dbg !88 + +1296: ; preds = %__nv_exp2f.exit216 + %1297 = tail call float @llvm.nvvm.ex2.approx.f(float %1233) #2, !dbg !88 + br label %__nv_exp2f.exit219, !dbg !88 + +__nv_exp2f.exit219: ; preds = %1294, %1296 + %.0.i218 = phi float [ %1295, %1294 ], [ %1297, %1296 ], !dbg !88 + %1298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i220 = icmp eq i32 %1298, 0, !dbg !88 + br i1 %.not.i220, label %1301, label %1299, !dbg !88 + +1299: ; preds = %__nv_exp2f.exit219 + %1300 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1234) #2, !dbg !88 + br label %__nv_exp2f.exit222, !dbg !88 + +1301: ; preds = %__nv_exp2f.exit219 + %1302 = tail call float @llvm.nvvm.ex2.approx.f(float %1234) #2, !dbg !88 + br label %__nv_exp2f.exit222, !dbg !88 + +__nv_exp2f.exit222: ; preds = %1299, %1301 + %.0.i221 = phi float [ %1300, %1299 ], [ %1302, %1301 ], !dbg !88 + %1303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i223 = icmp eq i32 %1303, 0, !dbg !88 + br i1 %.not.i223, label %1306, label %1304, !dbg !88 + +1304: ; preds = %__nv_exp2f.exit222 + %1305 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1235) #2, !dbg !88 + br label %__nv_exp2f.exit225, !dbg !88 + +1306: ; preds = %__nv_exp2f.exit222 + %1307 = tail call float @llvm.nvvm.ex2.approx.f(float %1235) #2, !dbg !88 + br label %__nv_exp2f.exit225, !dbg !88 + +__nv_exp2f.exit225: ; preds = %1304, %1306 + %.0.i224 = phi float [ %1305, %1304 ], [ %1307, %1306 ], !dbg !88 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i226 = icmp eq i32 %1308, 0, !dbg !88 + br i1 %.not.i226, label %1311, label %1309, !dbg !88 + +1309: ; preds = %__nv_exp2f.exit225 + %1310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1236) #2, !dbg !88 + br label %__nv_exp2f.exit228, !dbg !88 + +1311: ; preds = %__nv_exp2f.exit225 + %1312 = tail call float @llvm.nvvm.ex2.approx.f(float %1236) #2, !dbg !88 + br label %__nv_exp2f.exit228, !dbg !88 + +__nv_exp2f.exit228: ; preds = %1309, %1311 + %.0.i227 = phi float [ %1310, %1309 ], [ %1312, %1311 ], !dbg !88 + %1313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i229 = icmp eq i32 %1313, 0, !dbg !88 + br i1 %.not.i229, label %1316, label %1314, !dbg !88 + +1314: ; preds = %__nv_exp2f.exit228 + %1315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1237) #2, !dbg !88 + br label %__nv_exp2f.exit231, !dbg !88 + +1316: ; preds = %__nv_exp2f.exit228 + %1317 = tail call float @llvm.nvvm.ex2.approx.f(float %1237) #2, !dbg !88 + br label %__nv_exp2f.exit231, !dbg !88 + +__nv_exp2f.exit231: ; preds = %1314, %1316 + %.0.i230 = phi float [ %1315, %1314 ], [ %1317, %1316 ], !dbg !88 + %1318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i232 = icmp eq i32 %1318, 0, !dbg !88 + br i1 %.not.i232, label %1321, label %1319, !dbg !88 + +1319: ; preds = %__nv_exp2f.exit231 + %1320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1238) #2, !dbg !88 + br label %__nv_exp2f.exit234, !dbg !88 + +1321: ; preds = %__nv_exp2f.exit231 + %1322 = tail call float @llvm.nvvm.ex2.approx.f(float %1238) #2, !dbg !88 + br label %__nv_exp2f.exit234, !dbg !88 + +__nv_exp2f.exit234: ; preds = %1319, %1321 + %.0.i233 = phi float [ %1320, %1319 ], [ %1322, %1321 ], !dbg !88 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i235 = icmp eq i32 %1323, 0, !dbg !88 + br i1 %.not.i235, label %1326, label %1324, !dbg !88 + +1324: ; preds = %__nv_exp2f.exit234 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1239) #2, !dbg !88 + br label %__nv_exp2f.exit237, !dbg !88 + +1326: ; preds = %__nv_exp2f.exit234 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1239) #2, !dbg !88 + br label %__nv_exp2f.exit237, !dbg !88 + +__nv_exp2f.exit237: ; preds = %1324, %1326 + %.0.i236 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !88 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i238 = icmp eq i32 %1328, 0, !dbg !88 + br i1 %.not.i238, label %1331, label %1329, !dbg !88 + +1329: ; preds = %__nv_exp2f.exit237 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1240) #2, !dbg !88 + br label %__nv_exp2f.exit240, !dbg !88 + +1331: ; preds = %__nv_exp2f.exit237 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1240) #2, !dbg !88 + br label %__nv_exp2f.exit240, !dbg !88 + +__nv_exp2f.exit240: ; preds = %1329, %1331 + %.0.i239 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !88 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i241 = icmp eq i32 %1333, 0, !dbg !88 + br i1 %.not.i241, label %1336, label %1334, !dbg !88 + +1334: ; preds = %__nv_exp2f.exit240 + %1335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1241) #2, !dbg !88 + br label %__nv_exp2f.exit243, !dbg !88 + +1336: ; preds = %__nv_exp2f.exit240 + %1337 = tail call float @llvm.nvvm.ex2.approx.f(float %1241) #2, !dbg !88 + br label %__nv_exp2f.exit243, !dbg !88 + +__nv_exp2f.exit243: ; preds = %1334, %1336 + %.0.i242 = phi float [ %1335, %1334 ], [ %1337, %1336 ], !dbg !88 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i244 = icmp eq i32 %1338, 0, !dbg !88 + br i1 %.not.i244, label %1341, label %1339, !dbg !88 + +1339: ; preds = %__nv_exp2f.exit243 + %1340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1242) #2, !dbg !88 + br label %__nv_exp2f.exit246, !dbg !88 + +1341: ; preds = %__nv_exp2f.exit243 + %1342 = tail call float @llvm.nvvm.ex2.approx.f(float %1242) #2, !dbg !88 + br label %__nv_exp2f.exit246, !dbg !88 + +__nv_exp2f.exit246: ; preds = %1339, %1341 + %.0.i245 = phi float [ %1340, %1339 ], [ %1342, %1341 ], !dbg !88 + %1343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i247 = icmp eq i32 %1343, 0, !dbg !88 + br i1 %.not.i247, label %1346, label %1344, !dbg !88 + +1344: ; preds = %__nv_exp2f.exit246 + %1345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1243) #2, !dbg !88 + br label %__nv_exp2f.exit249, !dbg !88 + +1346: ; preds = %__nv_exp2f.exit246 + %1347 = tail call float @llvm.nvvm.ex2.approx.f(float %1243) #2, !dbg !88 + br label %__nv_exp2f.exit249, !dbg !88 + +__nv_exp2f.exit249: ; preds = %1344, %1346 + %.0.i248 = phi float [ %1345, %1344 ], [ %1347, %1346 ], !dbg !88 + %1348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i250 = icmp eq i32 %1348, 0, !dbg !88 + br i1 %.not.i250, label %1351, label %1349, !dbg !88 + +1349: ; preds = %__nv_exp2f.exit249 + %1350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1244) #2, !dbg !88 + br label %__nv_exp2f.exit252, !dbg !88 + +1351: ; preds = %__nv_exp2f.exit249 + %1352 = tail call float @llvm.nvvm.ex2.approx.f(float %1244) #2, !dbg !88 + br label %__nv_exp2f.exit252, !dbg !88 + +__nv_exp2f.exit252: ; preds = %1349, %1351 + %.0.i251 = phi float [ %1350, %1349 ], [ %1352, %1351 ], !dbg !88 + %1353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i253 = icmp eq i32 %1353, 0, !dbg !88 + br i1 %.not.i253, label %1356, label %1354, !dbg !88 + +1354: ; preds = %__nv_exp2f.exit252 + %1355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1245) #2, !dbg !88 + br label %__nv_exp2f.exit255, !dbg !88 + +1356: ; preds = %__nv_exp2f.exit252 + %1357 = tail call float @llvm.nvvm.ex2.approx.f(float %1245) #2, !dbg !88 + br label %__nv_exp2f.exit255, !dbg !88 + +__nv_exp2f.exit255: ; preds = %1354, %1356 + %.0.i254 = phi float [ %1355, %1354 ], [ %1357, %1356 ], !dbg !88 + %1358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i256 = icmp eq i32 %1358, 0, !dbg !88 + br i1 %.not.i256, label %1361, label %1359, !dbg !88 + +1359: ; preds = %__nv_exp2f.exit255 + %1360 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1246) #2, !dbg !88 + br label %__nv_exp2f.exit258, !dbg !88 + +1361: ; preds = %__nv_exp2f.exit255 + %1362 = tail call float @llvm.nvvm.ex2.approx.f(float %1246) #2, !dbg !88 + br label %__nv_exp2f.exit258, !dbg !88 + +__nv_exp2f.exit258: ; preds = %1359, %1361 + %.0.i257 = phi float [ %1360, %1359 ], [ %1362, %1361 ], !dbg !88 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i259 = icmp eq i32 %1363, 0, !dbg !88 + br i1 %.not.i259, label %1366, label %1364, !dbg !88 + +1364: ; preds = %__nv_exp2f.exit258 + %1365 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1247) #2, !dbg !88 + br label %__nv_exp2f.exit261, !dbg !88 + +1366: ; preds = %__nv_exp2f.exit258 + %1367 = tail call float @llvm.nvvm.ex2.approx.f(float %1247) #2, !dbg !88 + br label %__nv_exp2f.exit261, !dbg !88 + +__nv_exp2f.exit261: ; preds = %1364, %1366 + %.0.i260 = phi float [ %1365, %1364 ], [ %1367, %1366 ], !dbg !88 + %1368 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i262 = icmp eq i32 %1368, 0, !dbg !88 + br i1 %.not.i262, label %1371, label %1369, !dbg !88 + +1369: ; preds = %__nv_exp2f.exit261 + %1370 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1248) #2, !dbg !88 + br label %__nv_exp2f.exit264, !dbg !88 + +1371: ; preds = %__nv_exp2f.exit261 + %1372 = tail call float @llvm.nvvm.ex2.approx.f(float %1248) #2, !dbg !88 + br label %__nv_exp2f.exit264, !dbg !88 + +__nv_exp2f.exit264: ; preds = %1369, %1371 + %.0.i263 = phi float [ %1370, %1369 ], [ %1372, %1371 ], !dbg !88 + %1373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i265 = icmp eq i32 %1373, 0, !dbg !88 + br i1 %.not.i265, label %1376, label %1374, !dbg !88 + +1374: ; preds = %__nv_exp2f.exit264 + %1375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1249) #2, !dbg !88 + br label %__nv_exp2f.exit267, !dbg !88 + +1376: ; preds = %__nv_exp2f.exit264 + %1377 = tail call float @llvm.nvvm.ex2.approx.f(float %1249) #2, !dbg !88 + br label %__nv_exp2f.exit267, !dbg !88 + +__nv_exp2f.exit267: ; preds = %1374, %1376 + %.0.i266 = phi float [ %1375, %1374 ], [ %1377, %1376 ], !dbg !88 + %1378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i268 = icmp eq i32 %1378, 0, !dbg !88 + br i1 %.not.i268, label %1381, label %1379, !dbg !88 + +1379: ; preds = %__nv_exp2f.exit267 + %1380 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1250) #2, !dbg !88 + br label %__nv_exp2f.exit270, !dbg !88 + +1381: ; preds = %__nv_exp2f.exit267 + %1382 = tail call float @llvm.nvvm.ex2.approx.f(float %1250) #2, !dbg !88 + br label %__nv_exp2f.exit270, !dbg !88 + +__nv_exp2f.exit270: ; preds = %1379, %1381 + %.0.i269 = phi float [ %1380, %1379 ], [ %1382, %1381 ], !dbg !88 + %1383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i271 = icmp eq i32 %1383, 0, !dbg !88 + br i1 %.not.i271, label %1386, label %1384, !dbg !88 + +1384: ; preds = %__nv_exp2f.exit270 + %1385 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1251) #2, !dbg !88 + br label %__nv_exp2f.exit273, !dbg !88 + +1386: ; preds = %__nv_exp2f.exit270 + %1387 = tail call float @llvm.nvvm.ex2.approx.f(float %1251) #2, !dbg !88 + br label %__nv_exp2f.exit273, !dbg !88 + +__nv_exp2f.exit273: ; preds = %1384, %1386 + %.0.i272 = phi float [ %1385, %1384 ], [ %1387, %1386 ], !dbg !88 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i274 = icmp eq i32 %1388, 0, !dbg !88 + br i1 %.not.i274, label %1391, label %1389, !dbg !88 + +1389: ; preds = %__nv_exp2f.exit273 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1252) #2, !dbg !88 + br label %__nv_exp2f.exit276, !dbg !88 + +1391: ; preds = %__nv_exp2f.exit273 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1252) #2, !dbg !88 + br label %__nv_exp2f.exit276, !dbg !88 + +__nv_exp2f.exit276: ; preds = %1389, %1391 + %.0.i275 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !88 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i277 = icmp eq i32 %1393, 0, !dbg !88 + br i1 %.not.i277, label %1396, label %1394, !dbg !88 + +1394: ; preds = %__nv_exp2f.exit276 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1253) #2, !dbg !88 + br label %__nv_exp2f.exit279, !dbg !88 + +1396: ; preds = %__nv_exp2f.exit276 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1253) #2, !dbg !88 + br label %__nv_exp2f.exit279, !dbg !88 + +__nv_exp2f.exit279: ; preds = %1394, %1396 + %.0.i278 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !88 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i280 = icmp eq i32 %1398, 0, !dbg !88 + br i1 %.not.i280, label %1401, label %1399, !dbg !88 + +1399: ; preds = %__nv_exp2f.exit279 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1254) #2, !dbg !88 + br label %__nv_exp2f.exit282, !dbg !88 + +1401: ; preds = %__nv_exp2f.exit279 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1254) #2, !dbg !88 + br label %__nv_exp2f.exit282, !dbg !88 + +__nv_exp2f.exit282: ; preds = %1399, %1401 + %.0.i281 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !88 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i283 = icmp eq i32 %1403, 0, !dbg !88 + br i1 %.not.i283, label %1406, label %1404, !dbg !88 + +1404: ; preds = %__nv_exp2f.exit282 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1255) #2, !dbg !88 + br label %__nv_exp2f.exit285, !dbg !88 + +1406: ; preds = %__nv_exp2f.exit282 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1255) #2, !dbg !88 + br label %__nv_exp2f.exit285, !dbg !88 + +__nv_exp2f.exit285: ; preds = %1404, %1406 + %.0.i284 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !88 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i286 = icmp eq i32 %1408, 0, !dbg !88 + br i1 %.not.i286, label %1411, label %1409, !dbg !88 + +1409: ; preds = %__nv_exp2f.exit285 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1256) #2, !dbg !88 + br label %__nv_exp2f.exit288, !dbg !88 + +1411: ; preds = %__nv_exp2f.exit285 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1256) #2, !dbg !88 + br label %__nv_exp2f.exit288, !dbg !88 + +__nv_exp2f.exit288: ; preds = %1409, %1411 + %.0.i287 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !88 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i289 = icmp eq i32 %1413, 0, !dbg !88 + br i1 %.not.i289, label %1416, label %1414, !dbg !88 + +1414: ; preds = %__nv_exp2f.exit288 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1257) #2, !dbg !88 + br label %__nv_exp2f.exit291, !dbg !88 + +1416: ; preds = %__nv_exp2f.exit288 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1257) #2, !dbg !88 + br label %__nv_exp2f.exit291, !dbg !88 + +__nv_exp2f.exit291: ; preds = %1414, %1416 + %.0.i290 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !88 + %1418 = fmul float %306, %.0.i191, !dbg !89 + %1419 = fmul float %307, %.0.i194, !dbg !89 + %1420 = fadd float %.0.i197, %.0.i200, !dbg !90 + %1421 = fadd float %.0.i203, %.0.i206, !dbg !90 + %1422 = fadd float %1420, %.0.i209, !dbg !90 + %1423 = fadd float %1422, %.0.i212, !dbg !90 + %1424 = fadd float %1421, %.0.i215, !dbg !90 + %1425 = fadd float %1424, %.0.i218, !dbg !90 + %1426 = fadd float %1423, %.0.i221, !dbg !90 + %1427 = fadd float %1426, %.0.i224, !dbg !90 + %1428 = fadd float %1425, %.0.i227, !dbg !90 + %1429 = fadd float %1428, %.0.i230, !dbg !90 + %1430 = fadd float %1427, %.0.i233, !dbg !90 + %1431 = fadd float %1430, %.0.i236, !dbg !90 + %1432 = fadd float %1429, %.0.i239, !dbg !90 + %1433 = fadd float %1432, %.0.i242, !dbg !90 + %1434 = fadd float %1431, %.0.i245, !dbg !90 + %1435 = fadd float %1434, %.0.i248, !dbg !90 + %1436 = fadd float %1433, %.0.i251, !dbg !90 + %1437 = fadd float %1436, %.0.i254, !dbg !90 + %1438 = fadd float %1435, %.0.i257, !dbg !90 + %1439 = fadd float %1438, %.0.i260, !dbg !90 + %1440 = fadd float %1437, %.0.i263, !dbg !90 + %1441 = fadd float %1440, %.0.i266, !dbg !90 + %1442 = fadd float %1439, %.0.i269, !dbg !90 + %1443 = fadd float %1442, %.0.i272, !dbg !90 + %1444 = fadd float %1441, %.0.i275, !dbg !90 + %1445 = fadd float %1444, %.0.i278, !dbg !90 + %1446 = fadd float %1443, %.0.i281, !dbg !90 + %1447 = fadd float %1446, %.0.i284, !dbg !90 + %1448 = fadd float %1445, %.0.i287, !dbg !90 + %1449 = fadd float %1448, %.0.i290, !dbg !90 + %1450 = bitcast float %1447 to i32, !dbg !91 + %1451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1450, i32 2, i32 31), !dbg !91 + %1452 = bitcast i32 %1451 to float, !dbg !91 + %1453 = fadd float %1447, %1452, !dbg !90 + %1454 = bitcast float %1453 to i32, !dbg !91 + %1455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1454, i32 1, i32 31), !dbg !91 + %1456 = bitcast i32 %1455 to float, !dbg !91 + %1457 = fadd float %1453, %1456, !dbg !90 + %1458 = bitcast float %1449 to i32, !dbg !91 + %1459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1458, i32 2, i32 31), !dbg !91 + %1460 = bitcast i32 %1459 to float, !dbg !91 + %1461 = fadd float %1449, %1460, !dbg !90 + %1462 = bitcast float %1461 to i32, !dbg !91 + %1463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1462, i32 1, i32 31), !dbg !91 + %1464 = bitcast i32 %1463 to float, !dbg !91 + %1465 = fadd float %1461, %1464, !dbg !90 + %1466 = fadd float %1418, %1457, !dbg !92 + %1467 = fadd float %1419, %1465, !dbg !92 + %1468 = fmul float %763, %.0.i191, !dbg !93 + %1469 = fmul float %764, %.0.i191, !dbg !93 + %1470 = fmul float %765, %.0.i194, !dbg !93 + %1471 = fmul float %766, %.0.i194, !dbg !93 + %1472 = fmul float %767, %.0.i191, !dbg !93 + %1473 = fmul float %768, %.0.i191, !dbg !93 + %1474 = fmul float %769, %.0.i194, !dbg !93 + %1475 = fmul float %770, %.0.i194, !dbg !93 + %1476 = fmul float %771, %.0.i191, !dbg !93 + %1477 = fmul float %772, %.0.i191, !dbg !93 + %1478 = fmul float %773, %.0.i194, !dbg !93 + %1479 = fmul float %774, %.0.i194, !dbg !93 + %1480 = fmul float %775, %.0.i191, !dbg !93 + %1481 = fmul float %776, %.0.i191, !dbg !93 + %1482 = fmul float %777, %.0.i194, !dbg !93 + %1483 = fmul float %778, %.0.i194, !dbg !93 + %1484 = fmul float %779, %.0.i191, !dbg !93 + %1485 = fmul float %780, %.0.i191, !dbg !93 + %1486 = fmul float %781, %.0.i194, !dbg !93 + %1487 = fmul float %782, %.0.i194, !dbg !93 + %1488 = fmul float %783, %.0.i191, !dbg !93 + %1489 = fmul float %784, %.0.i191, !dbg !93 + %1490 = fmul float %785, %.0.i194, !dbg !93 + %1491 = fmul float %786, %.0.i194, !dbg !93 + %1492 = fmul float %787, %.0.i191, !dbg !93 + %1493 = fmul float %788, %.0.i191, !dbg !93 + %1494 = fmul float %789, %.0.i194, !dbg !93 + %1495 = fmul float %790, %.0.i194, !dbg !93 + %1496 = fmul float %791, %.0.i191, !dbg !93 + %1497 = fmul float %792, %.0.i191, !dbg !93 + %1498 = fmul float %793, %.0.i194, !dbg !93 + %1499 = fmul float %794, %.0.i194, !dbg !93 + %1500 = fmul float %795, %.0.i191, !dbg !93 + %1501 = fmul float %796, %.0.i191, !dbg !93 + %1502 = fmul float %797, %.0.i194, !dbg !93 + %1503 = fmul float %798, %.0.i194, !dbg !93 + %1504 = fmul float %799, %.0.i191, !dbg !93 + %1505 = fmul float %800, %.0.i191, !dbg !93 + %1506 = fmul float %801, %.0.i194, !dbg !93 + %1507 = fmul float %802, %.0.i194, !dbg !93 + %1508 = fmul float %803, %.0.i191, !dbg !93 + %1509 = fmul float %804, %.0.i191, !dbg !93 + %1510 = fmul float %805, %.0.i194, !dbg !93 + %1511 = fmul float %806, %.0.i194, !dbg !93 + %1512 = fmul float %807, %.0.i191, !dbg !93 + %1513 = fmul float %808, %.0.i191, !dbg !93 + %1514 = fmul float %809, %.0.i194, !dbg !93 + %1515 = fmul float %810, %.0.i194, !dbg !93 + %1516 = fmul float %811, %.0.i191, !dbg !93 + %1517 = fmul float %812, %.0.i191, !dbg !93 + %1518 = fmul float %813, %.0.i194, !dbg !93 + %1519 = fmul float %814, %.0.i194, !dbg !93 + %1520 = fmul float %815, %.0.i191, !dbg !93 + %1521 = fmul float %816, %.0.i191, !dbg !93 + %1522 = fmul float %817, %.0.i194, !dbg !93 + %1523 = fmul float %818, %.0.i194, !dbg !93 + %1524 = fmul float %819, %.0.i191, !dbg !93 + %1525 = fmul float %820, %.0.i191, !dbg !93 + %1526 = fmul float %821, %.0.i194, !dbg !93 + %1527 = fmul float %822, %.0.i194, !dbg !93 + %1528 = fmul float %823, %.0.i191, !dbg !93 + %1529 = fmul float %824, %.0.i191, !dbg !93 + %1530 = fmul float %825, %.0.i194, !dbg !93 + %1531 = fmul float %826, %.0.i194, !dbg !93 + %1532 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %381, !dbg !53 + %1533 = insertelement <2 x float> poison, float %.0.i197, i64 0, !dbg !94 + %1534 = insertelement <2 x float> %1533, float %.0.i200, i64 1, !dbg !94 + %1535 = fptrunc <2 x float> %1534 to <2 x bfloat>, !dbg !94 + %1536 = insertelement <2 x float> poison, float %.0.i203, i64 0, !dbg !94 + %1537 = insertelement <2 x float> %1536, float %.0.i206, i64 1, !dbg !94 + %1538 = fptrunc <2 x float> %1537 to <2 x bfloat>, !dbg !94 + %1539 = insertelement <2 x float> poison, float %.0.i209, i64 0, !dbg !94 + %1540 = insertelement <2 x float> %1539, float %.0.i212, i64 1, !dbg !94 + %1541 = fptrunc <2 x float> %1540 to <2 x bfloat>, !dbg !94 + %1542 = insertelement <2 x float> poison, float %.0.i215, i64 0, !dbg !94 + %1543 = insertelement <2 x float> %1542, float %.0.i218, i64 1, !dbg !94 + %1544 = fptrunc <2 x float> %1543 to <2 x bfloat>, !dbg !94 + %1545 = insertelement <2 x float> poison, float %.0.i221, i64 0, !dbg !94 + %1546 = insertelement <2 x float> %1545, float %.0.i224, i64 1, !dbg !94 + %1547 = fptrunc <2 x float> %1546 to <2 x bfloat>, !dbg !94 + %1548 = insertelement <2 x float> poison, float %.0.i227, i64 0, !dbg !94 + %1549 = insertelement <2 x float> %1548, float %.0.i230, i64 1, !dbg !94 + %1550 = fptrunc <2 x float> %1549 to <2 x bfloat>, !dbg !94 + %1551 = insertelement <2 x float> poison, float %.0.i233, i64 0, !dbg !94 + %1552 = insertelement <2 x float> %1551, float %.0.i236, i64 1, !dbg !94 + %1553 = fptrunc <2 x float> %1552 to <2 x bfloat>, !dbg !94 + %1554 = insertelement <2 x float> poison, float %.0.i239, i64 0, !dbg !94 + %1555 = insertelement <2 x float> %1554, float %.0.i242, i64 1, !dbg !94 + %1556 = fptrunc <2 x float> %1555 to <2 x bfloat>, !dbg !94 + %1557 = insertelement <2 x float> poison, float %.0.i245, i64 0, !dbg !94 + %1558 = insertelement <2 x float> %1557, float %.0.i248, i64 1, !dbg !94 + %1559 = fptrunc <2 x float> %1558 to <2 x bfloat>, !dbg !94 + %1560 = insertelement <2 x float> poison, float %.0.i251, i64 0, !dbg !94 + %1561 = insertelement <2 x float> %1560, float %.0.i254, i64 1, !dbg !94 + %1562 = fptrunc <2 x float> %1561 to <2 x bfloat>, !dbg !94 + %1563 = insertelement <2 x float> poison, float %.0.i257, i64 0, !dbg !94 + %1564 = insertelement <2 x float> %1563, float %.0.i260, i64 1, !dbg !94 + %1565 = fptrunc <2 x float> %1564 to <2 x bfloat>, !dbg !94 + %1566 = insertelement <2 x float> poison, float %.0.i263, i64 0, !dbg !94 + %1567 = insertelement <2 x float> %1566, float %.0.i266, i64 1, !dbg !94 + %1568 = fptrunc <2 x float> %1567 to <2 x bfloat>, !dbg !94 + %1569 = insertelement <2 x float> poison, float %.0.i269, i64 0, !dbg !94 + %1570 = insertelement <2 x float> %1569, float %.0.i272, i64 1, !dbg !94 + %1571 = fptrunc <2 x float> %1570 to <2 x bfloat>, !dbg !94 + %1572 = insertelement <2 x float> poison, float %.0.i275, i64 0, !dbg !94 + %1573 = insertelement <2 x float> %1572, float %.0.i278, i64 1, !dbg !94 + %1574 = fptrunc <2 x float> %1573 to <2 x bfloat>, !dbg !94 + %1575 = insertelement <2 x float> poison, float %.0.i281, i64 0, !dbg !94 + %1576 = insertelement <2 x float> %1575, float %.0.i284, i64 1, !dbg !94 + %1577 = fptrunc <2 x float> %1576 to <2 x bfloat>, !dbg !94 + %1578 = insertelement <2 x float> poison, float %.0.i287, i64 0, !dbg !94 + %1579 = insertelement <2 x float> %1578, float %.0.i290, i64 1, !dbg !94 + %1580 = fptrunc <2 x float> %1579 to <2 x bfloat>, !dbg !94 + %1581 = bitcast <2 x bfloat> %1535 to i32, !dbg !95 + %1582 = bitcast <2 x bfloat> %1538 to i32, !dbg !95 + %1583 = bitcast <2 x bfloat> %1541 to i32, !dbg !95 + %1584 = bitcast <2 x bfloat> %1544 to i32, !dbg !95 + %1585 = bitcast <2 x bfloat> %1547 to i32, !dbg !95 + %1586 = bitcast <2 x bfloat> %1550 to i32, !dbg !95 + %1587 = bitcast <2 x bfloat> %1553 to i32, !dbg !95 + %1588 = bitcast <2 x bfloat> %1556 to i32, !dbg !95 + %1589 = bitcast <2 x bfloat> %1559 to i32, !dbg !95 + %1590 = bitcast <2 x bfloat> %1562 to i32, !dbg !95 + %1591 = bitcast <2 x bfloat> %1565 to i32, !dbg !95 + %1592 = bitcast <2 x bfloat> %1568 to i32, !dbg !95 + %1593 = bitcast <2 x bfloat> %1571 to i32, !dbg !95 + %1594 = bitcast <2 x bfloat> %1574 to i32, !dbg !95 + %1595 = bitcast <2 x bfloat> %1577 to i32, !dbg !95 + %1596 = bitcast <2 x bfloat> %1580 to i32, !dbg !95 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !95 + %1597 = ptrtoint ptr addrspace(3) %1532 to i32, !dbg !95 + %1598 = lshr exact i32 %1597, 4, !dbg !95 + %1599 = and i32 %1598, 16383, !dbg !95 + %1600 = zext nneg i32 %1599 to i64, !dbg !95 + %1601 = or disjoint i64 %1600, 4611686293338849280, !dbg !95 + %1602 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1468, float %1469, float %1470, float %1471, float %1472, float %1473, float %1474, float %1475, float %1476, float %1477, float %1478, float %1479, float %1480, float %1481, float %1482, float %1483, float %1484, float %1485, float %1486, float %1487, float %1488, float %1489, float %1490, float %1491, float %1492, float %1493, float %1494, float %1495, float %1496, float %1497, float %1498, float %1499, float %1500, float %1501, float %1502, float %1503, float %1504, float %1505, float %1506, float %1507, float %1508, float %1509, float %1510, float %1511, float %1512, float %1513, float %1514, float %1515, float %1516, float %1517, float %1518, float %1519, float %1520, float %1521, float %1522, float %1523, float %1524, float %1525, float %1526, float %1527, float %1528, float %1529, float %1530, float %1531, i32 %1581, i32 %1582, i32 %1583, i32 %1584, i64 %1601, i1 true) #2, !dbg !95 + %1603 = add i32 %1597, 2048, !dbg !95 + %1604 = lshr exact i32 %1603, 4, !dbg !95 + %1605 = and i32 %1604, 16383, !dbg !95 + %1606 = zext nneg i32 %1605 to i64, !dbg !95 + %1607 = or disjoint i64 %1606, 4611686293338849280, !dbg !95 + %1608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 0, !dbg !95 + %1609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 1, !dbg !95 + %1610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 2, !dbg !95 + %1611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 3, !dbg !95 + %1612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 4, !dbg !95 + %1613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 5, !dbg !95 + %1614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 6, !dbg !95 + %1615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 7, !dbg !95 + %1616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 8, !dbg !95 + %1617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 9, !dbg !95 + %1618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 10, !dbg !95 + %1619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 11, !dbg !95 + %1620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 12, !dbg !95 + %1621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 13, !dbg !95 + %1622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 14, !dbg !95 + %1623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 15, !dbg !95 + %1624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 16, !dbg !95 + %1625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 17, !dbg !95 + %1626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 18, !dbg !95 + %1627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 19, !dbg !95 + %1628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 20, !dbg !95 + %1629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 21, !dbg !95 + %1630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 22, !dbg !95 + %1631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 23, !dbg !95 + %1632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 24, !dbg !95 + %1633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 25, !dbg !95 + %1634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 26, !dbg !95 + %1635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 27, !dbg !95 + %1636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 28, !dbg !95 + %1637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 29, !dbg !95 + %1638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 30, !dbg !95 + %1639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 31, !dbg !95 + %1640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 32, !dbg !95 + %1641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 33, !dbg !95 + %1642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 34, !dbg !95 + %1643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 35, !dbg !95 + %1644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 36, !dbg !95 + %1645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 37, !dbg !95 + %1646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 38, !dbg !95 + %1647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 39, !dbg !95 + %1648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 40, !dbg !95 + %1649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 41, !dbg !95 + %1650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 42, !dbg !95 + %1651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 43, !dbg !95 + %1652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 44, !dbg !95 + %1653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 45, !dbg !95 + %1654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 46, !dbg !95 + %1655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 47, !dbg !95 + %1656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 48, !dbg !95 + %1657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 49, !dbg !95 + %1658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 50, !dbg !95 + %1659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 51, !dbg !95 + %1660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 52, !dbg !95 + %1661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 53, !dbg !95 + %1662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 54, !dbg !95 + %1663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 55, !dbg !95 + %1664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 56, !dbg !95 + %1665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 57, !dbg !95 + %1666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 58, !dbg !95 + %1667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 59, !dbg !95 + %1668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 60, !dbg !95 + %1669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 61, !dbg !95 + %1670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 62, !dbg !95 + %1671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1602, 63, !dbg !95 + %1672 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, float %1628, float %1629, float %1630, float %1631, float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, float %1662, float %1663, float %1664, float %1665, float %1666, float %1667, float %1668, float %1669, float %1670, float %1671, i32 %1585, i32 %1586, i32 %1587, i32 %1588, i64 %1607, i1 true) #2, !dbg !95 + %1673 = add i32 %1597, 4096, !dbg !95 + %1674 = lshr exact i32 %1673, 4, !dbg !95 + %1675 = and i32 %1674, 16383, !dbg !95 + %1676 = zext nneg i32 %1675 to i64, !dbg !95 + %1677 = or disjoint i64 %1676, 4611686293338849280, !dbg !95 + %1678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 0, !dbg !95 + %1679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 1, !dbg !95 + %1680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 2, !dbg !95 + %1681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 3, !dbg !95 + %1682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 4, !dbg !95 + %1683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 5, !dbg !95 + %1684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 6, !dbg !95 + %1685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 7, !dbg !95 + %1686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 8, !dbg !95 + %1687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 9, !dbg !95 + %1688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 10, !dbg !95 + %1689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 11, !dbg !95 + %1690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 12, !dbg !95 + %1691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 13, !dbg !95 + %1692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 14, !dbg !95 + %1693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 15, !dbg !95 + %1694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 16, !dbg !95 + %1695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 17, !dbg !95 + %1696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 18, !dbg !95 + %1697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 19, !dbg !95 + %1698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 20, !dbg !95 + %1699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 21, !dbg !95 + %1700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 22, !dbg !95 + %1701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 23, !dbg !95 + %1702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 24, !dbg !95 + %1703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 25, !dbg !95 + %1704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 26, !dbg !95 + %1705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 27, !dbg !95 + %1706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 28, !dbg !95 + %1707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 29, !dbg !95 + %1708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 30, !dbg !95 + %1709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 31, !dbg !95 + %1710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 32, !dbg !95 + %1711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 33, !dbg !95 + %1712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 34, !dbg !95 + %1713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 35, !dbg !95 + %1714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 36, !dbg !95 + %1715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 37, !dbg !95 + %1716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 38, !dbg !95 + %1717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 39, !dbg !95 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 40, !dbg !95 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 41, !dbg !95 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 42, !dbg !95 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 43, !dbg !95 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 44, !dbg !95 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 45, !dbg !95 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 46, !dbg !95 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 47, !dbg !95 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 48, !dbg !95 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 49, !dbg !95 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 50, !dbg !95 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 51, !dbg !95 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 52, !dbg !95 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 53, !dbg !95 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 54, !dbg !95 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 55, !dbg !95 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 56, !dbg !95 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 57, !dbg !95 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 58, !dbg !95 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 59, !dbg !95 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 60, !dbg !95 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 61, !dbg !95 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 62, !dbg !95 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1672, 63, !dbg !95 + %1742 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1678, float %1679, float %1680, float %1681, float %1682, float %1683, float %1684, float %1685, float %1686, float %1687, float %1688, float %1689, float %1690, float %1691, float %1692, float %1693, float %1694, float %1695, float %1696, float %1697, float %1698, float %1699, float %1700, float %1701, float %1702, float %1703, float %1704, float %1705, float %1706, float %1707, float %1708, float %1709, float %1710, float %1711, float %1712, float %1713, float %1714, float %1715, float %1716, float %1717, float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, i32 %1589, i32 %1590, i32 %1591, i32 %1592, i64 %1677, i1 true) #2, !dbg !95 + %1743 = add i32 %1597, 6144, !dbg !95 + %1744 = lshr exact i32 %1743, 4, !dbg !95 + %1745 = and i32 %1744, 16383, !dbg !95 + %1746 = zext nneg i32 %1745 to i64, !dbg !95 + %1747 = or disjoint i64 %1746, 4611686293338849280, !dbg !95 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 0, !dbg !95 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 1, !dbg !95 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 2, !dbg !95 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 3, !dbg !95 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 4, !dbg !95 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 5, !dbg !95 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 6, !dbg !95 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 7, !dbg !95 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 8, !dbg !95 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 9, !dbg !95 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 10, !dbg !95 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 11, !dbg !95 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 12, !dbg !95 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 13, !dbg !95 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 14, !dbg !95 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 15, !dbg !95 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 16, !dbg !95 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 17, !dbg !95 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 18, !dbg !95 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 19, !dbg !95 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 20, !dbg !95 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 21, !dbg !95 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 22, !dbg !95 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 23, !dbg !95 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 24, !dbg !95 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 25, !dbg !95 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 26, !dbg !95 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 27, !dbg !95 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 28, !dbg !95 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 29, !dbg !95 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 30, !dbg !95 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 31, !dbg !95 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 32, !dbg !95 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 33, !dbg !95 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 34, !dbg !95 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 35, !dbg !95 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 36, !dbg !95 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 37, !dbg !95 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 38, !dbg !95 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 39, !dbg !95 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 40, !dbg !95 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 41, !dbg !95 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 42, !dbg !95 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 43, !dbg !95 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 44, !dbg !95 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 45, !dbg !95 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 46, !dbg !95 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 47, !dbg !95 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 48, !dbg !95 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 49, !dbg !95 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 50, !dbg !95 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 51, !dbg !95 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 52, !dbg !95 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 53, !dbg !95 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 54, !dbg !95 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 55, !dbg !95 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 56, !dbg !95 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 57, !dbg !95 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 58, !dbg !95 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 59, !dbg !95 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 60, !dbg !95 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 61, !dbg !95 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 62, !dbg !95 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1742, 63, !dbg !95 + %1812 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, float %1800, float %1801, float %1802, float %1803, float %1804, float %1805, float %1806, float %1807, float %1808, float %1809, float %1810, float %1811, i32 %1593, i32 %1594, i32 %1595, i32 %1596, i64 %1747, i1 true) #2, !dbg !95 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 0, !dbg !95 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 1, !dbg !95 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 2, !dbg !95 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 3, !dbg !95 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 4, !dbg !95 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 5, !dbg !95 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 6, !dbg !95 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 7, !dbg !95 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 8, !dbg !95 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 9, !dbg !95 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 10, !dbg !95 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 11, !dbg !95 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 12, !dbg !95 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 13, !dbg !95 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 14, !dbg !95 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 15, !dbg !95 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 16, !dbg !95 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 17, !dbg !95 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 18, !dbg !95 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 19, !dbg !95 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 20, !dbg !95 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 21, !dbg !95 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 22, !dbg !95 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 23, !dbg !95 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 24, !dbg !95 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 25, !dbg !95 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 26, !dbg !95 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 27, !dbg !95 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 28, !dbg !95 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 29, !dbg !95 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 30, !dbg !95 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 31, !dbg !95 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 32, !dbg !95 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 33, !dbg !95 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 34, !dbg !95 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 35, !dbg !95 + %1849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 36, !dbg !95 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 37, !dbg !95 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 38, !dbg !95 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 39, !dbg !95 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 40, !dbg !95 + %1854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 41, !dbg !95 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 42, !dbg !95 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 43, !dbg !95 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 44, !dbg !95 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 45, !dbg !95 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 46, !dbg !95 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 47, !dbg !95 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 48, !dbg !95 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 49, !dbg !95 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 50, !dbg !95 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 51, !dbg !95 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 52, !dbg !95 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 53, !dbg !95 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 54, !dbg !95 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 55, !dbg !95 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 56, !dbg !95 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 57, !dbg !95 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 58, !dbg !95 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 59, !dbg !95 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 60, !dbg !95 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 61, !dbg !95 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 62, !dbg !95 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1812, 63, !dbg !95 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !95 + %1877 = insertelement <16 x i32> poison, i32 %302, i64 0, !dbg !96 + %1878 = shufflevector <16 x i32> %1877, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !96 + %1879 = add <16 x i32> %1878, %374, !dbg !96 + %1880 = add nuw nsw i32 %372, 1, !dbg !45 + %1881 = lshr i32 %1880, 1, !dbg !97 + %1882 = zext nneg i32 %1881 to i64, !dbg !98 + %1883 = getelementptr i32, ptr addrspace(1) %183, i64 %1882, !dbg !98 + %1884 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !99 + %1885 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1883, i64 %1884, i1 %377) #2, !dbg !99 + %1886 = add nuw nsw i32 %1881, 1, !dbg !100 + %1887 = icmp slt i32 %1886, %188, !dbg !101 + %1888 = getelementptr i8, ptr addrspace(1) %1883, i64 4, !dbg !102 + %1889 = and i1 %377, %1887, !dbg !45 + %1890 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !103 + %1891 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1888, i64 %1890, i1 %1889) #2, !dbg !103 + %1892 = and i32 %372, 1, !dbg !104 + %1893 = sub i32 %1891, %1885, !dbg !105 + %1894 = shl i32 %1893, 7, !dbg !106 + %1895 = add i32 %1894, -64, !dbg !107 + %1896 = xor i32 %1892, 1, !dbg !108 + %1897 = mul nuw nsw i32 %1895, %1896, !dbg !108 + %1898 = shl nuw nsw i32 %1892, 6, !dbg !109 + %1899 = add i32 %1897, %1898, !dbg !110 + %1900 = add i32 %1899, %305, !dbg !111 + %1901 = add i32 %304, 1, !dbg !45 + %1902 = icmp sgt i32 %1901, 2, !dbg !45 + %1903 = select i1 %1902, i32 0, i32 %1901, !dbg !45 + %1904 = add i32 %1900, %185, !dbg !54 + %1905 = add i32 %1904, %39, !dbg !49 + %1906 = add i32 %1904, %40, !dbg !49 + %1907 = add i32 %1904, %41, !dbg !49 + %1908 = add i32 %1904, %42, !dbg !49 + %1909 = shl i32 %1905, 7, !dbg !50 + %1910 = shl i32 %1906, 7, !dbg !50 + %1911 = shl i32 %1907, 7, !dbg !50 + %1912 = shl i32 %1908, 7, !dbg !50 + %1913 = sext i32 %1909 to i64, !dbg !51 + %1914 = sext i32 %1910 to i64, !dbg !51 + %1915 = sext i32 %1911 to i64, !dbg !51 + %1916 = sext i32 %1912 to i64, !dbg !51 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1913, !dbg !52 + %gep330 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1914, !dbg !52 + %gep332 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1915, !dbg !52 + %gep334 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %1916, !dbg !52 + %1917 = shl i32 %1903, 13, !dbg !53 + %1918 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %1917, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %1919 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %220, !dbg !53 + %1920 = select i1 %376, i32 16, i32 0, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %1919, ptr addrspace(1) %gep, i32 %1920) #2, !dbg !53 + %1921 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1921, ptr addrspace(1) %gep330, i32 %1920) #2, !dbg !53 + %1922 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1922, ptr addrspace(1) %gep332, i32 %1920) #2, !dbg !53 + %1923 = getelementptr inbounds nuw i8, ptr addrspace(3) %1918, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1923, ptr addrspace(1) %gep334, i32 %1920) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %gep336 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1913, !dbg !52 + %gep338 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1914, !dbg !52 + %gep340 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1915, !dbg !52 + %gep342 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %1916, !dbg !52 + %1924 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %1917, !dbg !53 + %1925 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %220, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %1925, ptr addrspace(1) %gep336, i32 %1920) #2, !dbg !53 + %1926 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %223, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1926, ptr addrspace(1) %gep338, i32 %1920) #2, !dbg !53 + %1927 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %225, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1927, ptr addrspace(1) %gep340, i32 %1920) #2, !dbg !53 + %1928 = getelementptr inbounds nuw i8, ptr addrspace(3) %1924, i32 %227, !dbg !53 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %1928, ptr addrspace(1) %gep342, i32 %1920) #2, !dbg !53 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !53 + %exitcond.not = icmp eq i32 %1880, %280, !dbg !45 + br i1 %exitcond.not, label %._crit_edge, label %301, !dbg !45 + +._crit_edge: ; preds = %__nv_exp2f.exit291, %13 + %1929 = phi float [ 0.000000e+00, %13 ], [ %1813, %__nv_exp2f.exit291 ], !dbg !112 + %1930 = phi float [ 0.000000e+00, %13 ], [ %1814, %__nv_exp2f.exit291 ], !dbg !112 + %1931 = phi float [ 0.000000e+00, %13 ], [ %1815, %__nv_exp2f.exit291 ], !dbg !112 + %1932 = phi float [ 0.000000e+00, %13 ], [ %1816, %__nv_exp2f.exit291 ], !dbg !112 + %1933 = phi float [ 0.000000e+00, %13 ], [ %1817, %__nv_exp2f.exit291 ], !dbg !112 + %1934 = phi float [ 0.000000e+00, %13 ], [ %1818, %__nv_exp2f.exit291 ], !dbg !112 + %1935 = phi float [ 0.000000e+00, %13 ], [ %1819, %__nv_exp2f.exit291 ], !dbg !112 + %1936 = phi float [ 0.000000e+00, %13 ], [ %1820, %__nv_exp2f.exit291 ], !dbg !112 + %1937 = phi float [ 0.000000e+00, %13 ], [ %1821, %__nv_exp2f.exit291 ], !dbg !112 + %1938 = phi float [ 0.000000e+00, %13 ], [ %1822, %__nv_exp2f.exit291 ], !dbg !112 + %1939 = phi float [ 0.000000e+00, %13 ], [ %1823, %__nv_exp2f.exit291 ], !dbg !112 + %1940 = phi float [ 0.000000e+00, %13 ], [ %1824, %__nv_exp2f.exit291 ], !dbg !112 + %1941 = phi float [ 0.000000e+00, %13 ], [ %1825, %__nv_exp2f.exit291 ], !dbg !112 + %1942 = phi float [ 0.000000e+00, %13 ], [ %1826, %__nv_exp2f.exit291 ], !dbg !112 + %1943 = phi float [ 0.000000e+00, %13 ], [ %1827, %__nv_exp2f.exit291 ], !dbg !112 + %1944 = phi float [ 0.000000e+00, %13 ], [ %1828, %__nv_exp2f.exit291 ], !dbg !112 + %1945 = phi float [ 0.000000e+00, %13 ], [ %1829, %__nv_exp2f.exit291 ], !dbg !112 + %1946 = phi float [ 0.000000e+00, %13 ], [ %1830, %__nv_exp2f.exit291 ], !dbg !112 + %1947 = phi float [ 0.000000e+00, %13 ], [ %1831, %__nv_exp2f.exit291 ], !dbg !112 + %1948 = phi float [ 0.000000e+00, %13 ], [ %1832, %__nv_exp2f.exit291 ], !dbg !112 + %1949 = phi float [ 0.000000e+00, %13 ], [ %1833, %__nv_exp2f.exit291 ], !dbg !112 + %1950 = phi float [ 0.000000e+00, %13 ], [ %1834, %__nv_exp2f.exit291 ], !dbg !112 + %1951 = phi float [ 0.000000e+00, %13 ], [ %1835, %__nv_exp2f.exit291 ], !dbg !112 + %1952 = phi float [ 0.000000e+00, %13 ], [ %1836, %__nv_exp2f.exit291 ], !dbg !112 + %1953 = phi float [ 0.000000e+00, %13 ], [ %1837, %__nv_exp2f.exit291 ], !dbg !112 + %1954 = phi float [ 0.000000e+00, %13 ], [ %1838, %__nv_exp2f.exit291 ], !dbg !112 + %1955 = phi float [ 0.000000e+00, %13 ], [ %1839, %__nv_exp2f.exit291 ], !dbg !112 + %1956 = phi float [ 0.000000e+00, %13 ], [ %1840, %__nv_exp2f.exit291 ], !dbg !112 + %1957 = phi float [ 0.000000e+00, %13 ], [ %1841, %__nv_exp2f.exit291 ], !dbg !112 + %1958 = phi float [ 0.000000e+00, %13 ], [ %1842, %__nv_exp2f.exit291 ], !dbg !112 + %1959 = phi float [ 0.000000e+00, %13 ], [ %1843, %__nv_exp2f.exit291 ], !dbg !112 + %1960 = phi float [ 0.000000e+00, %13 ], [ %1844, %__nv_exp2f.exit291 ], !dbg !112 + %1961 = phi float [ 0.000000e+00, %13 ], [ %1845, %__nv_exp2f.exit291 ], !dbg !112 + %1962 = phi float [ 0.000000e+00, %13 ], [ %1846, %__nv_exp2f.exit291 ], !dbg !112 + %1963 = phi float [ 0.000000e+00, %13 ], [ %1847, %__nv_exp2f.exit291 ], !dbg !112 + %1964 = phi float [ 0.000000e+00, %13 ], [ %1848, %__nv_exp2f.exit291 ], !dbg !112 + %1965 = phi float [ 0.000000e+00, %13 ], [ %1849, %__nv_exp2f.exit291 ], !dbg !112 + %1966 = phi float [ 0.000000e+00, %13 ], [ %1850, %__nv_exp2f.exit291 ], !dbg !112 + %1967 = phi float [ 0.000000e+00, %13 ], [ %1851, %__nv_exp2f.exit291 ], !dbg !112 + %1968 = phi float [ 0.000000e+00, %13 ], [ %1852, %__nv_exp2f.exit291 ], !dbg !112 + %1969 = phi float [ 0.000000e+00, %13 ], [ %1853, %__nv_exp2f.exit291 ], !dbg !112 + %1970 = phi float [ 0.000000e+00, %13 ], [ %1854, %__nv_exp2f.exit291 ], !dbg !112 + %1971 = phi float [ 0.000000e+00, %13 ], [ %1855, %__nv_exp2f.exit291 ], !dbg !112 + %1972 = phi float [ 0.000000e+00, %13 ], [ %1856, %__nv_exp2f.exit291 ], !dbg !112 + %1973 = phi float [ 0.000000e+00, %13 ], [ %1857, %__nv_exp2f.exit291 ], !dbg !112 + %1974 = phi float [ 0.000000e+00, %13 ], [ %1858, %__nv_exp2f.exit291 ], !dbg !112 + %1975 = phi float [ 0.000000e+00, %13 ], [ %1859, %__nv_exp2f.exit291 ], !dbg !112 + %1976 = phi float [ 0.000000e+00, %13 ], [ %1860, %__nv_exp2f.exit291 ], !dbg !112 + %1977 = phi float [ 0.000000e+00, %13 ], [ %1861, %__nv_exp2f.exit291 ], !dbg !112 + %1978 = phi float [ 0.000000e+00, %13 ], [ %1862, %__nv_exp2f.exit291 ], !dbg !112 + %1979 = phi float [ 0.000000e+00, %13 ], [ %1863, %__nv_exp2f.exit291 ], !dbg !112 + %1980 = phi float [ 0.000000e+00, %13 ], [ %1864, %__nv_exp2f.exit291 ], !dbg !112 + %1981 = phi float [ 0.000000e+00, %13 ], [ %1865, %__nv_exp2f.exit291 ], !dbg !112 + %1982 = phi float [ 0.000000e+00, %13 ], [ %1866, %__nv_exp2f.exit291 ], !dbg !112 + %1983 = phi float [ 0.000000e+00, %13 ], [ %1867, %__nv_exp2f.exit291 ], !dbg !112 + %1984 = phi float [ 0.000000e+00, %13 ], [ %1868, %__nv_exp2f.exit291 ], !dbg !112 + %1985 = phi float [ 0.000000e+00, %13 ], [ %1869, %__nv_exp2f.exit291 ], !dbg !112 + %1986 = phi float [ 0.000000e+00, %13 ], [ %1870, %__nv_exp2f.exit291 ], !dbg !112 + %1987 = phi float [ 0.000000e+00, %13 ], [ %1871, %__nv_exp2f.exit291 ], !dbg !112 + %1988 = phi float [ 0.000000e+00, %13 ], [ %1872, %__nv_exp2f.exit291 ], !dbg !112 + %1989 = phi float [ 0.000000e+00, %13 ], [ %1873, %__nv_exp2f.exit291 ], !dbg !112 + %1990 = phi float [ 0.000000e+00, %13 ], [ %1874, %__nv_exp2f.exit291 ], !dbg !112 + %1991 = phi float [ 0.000000e+00, %13 ], [ %1875, %__nv_exp2f.exit291 ], !dbg !112 + %1992 = phi float [ 0.000000e+00, %13 ], [ %1876, %__nv_exp2f.exit291 ], !dbg !112 + %1993 = phi float [ 0.000000e+00, %13 ], [ %1466, %__nv_exp2f.exit291 ] + %1994 = phi float [ 0.000000e+00, %13 ], [ %1467, %__nv_exp2f.exit291 ] + %1995 = phi <2 x float> [ splat (float 0xFFF0000000000000), %13 ], [ %1205, %__nv_exp2f.exit291 ] + %1996 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %1929, float %1930, float %1931, float %1932, float %1933, float %1934, float %1935, float %1936, float %1937, float %1938, float %1939, float %1940, float %1941, float %1942, float %1943, float %1944, float %1945, float %1946, float %1947, float %1948, float %1949, float %1950, float %1951, float %1952, float %1953, float %1954, float %1955, float %1956, float %1957, float %1958, float %1959, float %1960, float %1961, float %1962, float %1963, float %1964, float %1965, float %1966, float %1967, float %1968, float %1969, float %1970, float %1971, float %1972, float %1973, float %1974, float %1975, float %1976, float %1977, float %1978, float %1979, float %1980, float %1981, float %1982, float %1983, float %1984, float %1985, float %1986, float %1987, float %1988, float %1989, float %1990, float %1991, float %1992) #2, !dbg !45 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !45 + %1997 = getelementptr i32, ptr addrspace(1) %8, i64 %182, !dbg !113 + %1998 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %1997) #2, !dbg !114 + %1999 = shl i32 %1998, 7, !dbg !115 + %2000 = getelementptr i32, ptr addrspace(1) %7, i64 %186, !dbg !116 + %2001 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2000) #2, !dbg !117 + %2002 = shl i32 %2001, 1, !dbg !118 + %2003 = icmp sgt i32 %2002, 0, !dbg !119 + %2004 = or disjoint i32 %1999, %39, !dbg !121 + %2005 = or disjoint i32 %1999, %40, !dbg !121 + %2006 = or disjoint i32 %1999, %41, !dbg !121 + %2007 = or disjoint i32 %1999, %42, !dbg !121 + %2008 = shl i32 %2004, 7, !dbg !122 + %2009 = shl i32 %2005, 7, !dbg !122 + %2010 = shl i32 %2006, 7, !dbg !122 + %2011 = shl i32 %2007, 7, !dbg !122 + %2012 = sext i32 %2008 to i64, !dbg !123 + %2013 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2012, !dbg !123 + %2014 = sext i32 %2009 to i64, !dbg !123 + %2015 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2014, !dbg !123 + %2016 = sext i32 %2010 to i64, !dbg !123 + %2017 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2016, !dbg !123 + %2018 = sext i32 %2011 to i64, !dbg !123 + %2019 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2018, !dbg !123 + %2020 = getelementptr bfloat, ptr addrspace(1) %2013, i64 %84, !dbg !124 + %2021 = getelementptr bfloat, ptr addrspace(1) %2015, i64 %84, !dbg !124 + %2022 = getelementptr bfloat, ptr addrspace(1) %2017, i64 %84, !dbg !124 + %2023 = getelementptr bfloat, ptr addrspace(1) %2019, i64 %84, !dbg !124 + %2024 = select i1 %2003, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %221, ptr addrspace(1) %2020, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %224, ptr addrspace(1) %2021, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %226, ptr addrspace(1) %2022, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %228, ptr addrspace(1) %2023, i32 %2024) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2025 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2012, !dbg !123 + %2026 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2014, !dbg !123 + %2027 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2016, !dbg !123 + %2028 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2018, !dbg !123 + %2029 = getelementptr bfloat, ptr addrspace(1) %2025, i64 %84, !dbg !124 + %2030 = getelementptr bfloat, ptr addrspace(1) %2026, i64 %84, !dbg !124 + %2031 = getelementptr bfloat, ptr addrspace(1) %2027, i64 %84, !dbg !124 + %2032 = getelementptr bfloat, ptr addrspace(1) %2028, i64 %84, !dbg !124 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %237, ptr addrspace(1) %2029, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %238, ptr addrspace(1) %2030, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %239, ptr addrspace(1) %2031, i32 %2024) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %240, ptr addrspace(1) %2032, i32 %2024) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2033 = icmp sgt i32 %2002, 1, !dbg !119 + %2034 = or disjoint i32 %1999, 64, !dbg !126 + %2035 = or disjoint i32 %2034, %39, !dbg !121 + %2036 = or disjoint i32 %2034, %40, !dbg !121 + %2037 = or disjoint i32 %2034, %41, !dbg !121 + %2038 = or disjoint i32 %2034, %42, !dbg !121 + %2039 = shl i32 %2035, 7, !dbg !122 + %2040 = shl i32 %2036, 7, !dbg !122 + %2041 = shl i32 %2037, 7, !dbg !122 + %2042 = shl i32 %2038, 7, !dbg !122 + %2043 = sext i32 %2039 to i64, !dbg !123 + %2044 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2043, !dbg !123 + %2045 = sext i32 %2040 to i64, !dbg !123 + %2046 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2045, !dbg !123 + %2047 = sext i32 %2041 to i64, !dbg !123 + %2048 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2047, !dbg !123 + %2049 = sext i32 %2042 to i64, !dbg !123 + %2050 = getelementptr bfloat, ptr addrspace(1) %28, i64 %2049, !dbg !123 + %2051 = getelementptr bfloat, ptr addrspace(1) %2044, i64 %84, !dbg !124 + %2052 = getelementptr bfloat, ptr addrspace(1) %2046, i64 %84, !dbg !124 + %2053 = getelementptr bfloat, ptr addrspace(1) %2048, i64 %84, !dbg !124 + %2054 = getelementptr bfloat, ptr addrspace(1) %2050, i64 %84, !dbg !124 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %2055 = select i1 %2033, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %263, ptr addrspace(1) %2051, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %2052, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %266, ptr addrspace(1) %2053, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %2054, i32 %2055) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %2056 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2043, !dbg !123 + %2057 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2045, !dbg !123 + %2058 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2047, !dbg !123 + %2059 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2049, !dbg !123 + %2060 = getelementptr bfloat, ptr addrspace(1) %2056, i64 %84, !dbg !124 + %2061 = getelementptr bfloat, ptr addrspace(1) %2057, i64 %84, !dbg !124 + %2062 = getelementptr bfloat, ptr addrspace(1) %2058, i64 %84, !dbg !124 + %2063 = getelementptr bfloat, ptr addrspace(1) %2059, i64 %84, !dbg !124 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %276, ptr addrspace(1) %2060, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %2061, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %2062, i32 %2055) #2, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %279, ptr addrspace(1) %2063, i32 %2055) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !127 + %2064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 0, !dbg !119 + %2065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 1, !dbg !119 + %2066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 2, !dbg !119 + %2067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 3, !dbg !119 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 4, !dbg !119 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 5, !dbg !119 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 6, !dbg !119 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 7, !dbg !119 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 8, !dbg !119 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 9, !dbg !119 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 10, !dbg !119 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 11, !dbg !119 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 12, !dbg !119 + %2077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 13, !dbg !119 + %2078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 14, !dbg !119 + %2079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 15, !dbg !119 + %2080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 16, !dbg !119 + %2081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 17, !dbg !119 + %2082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 18, !dbg !119 + %2083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 19, !dbg !119 + %2084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 20, !dbg !119 + %2085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 21, !dbg !119 + %2086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 22, !dbg !119 + %2087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 23, !dbg !119 + %2088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 24, !dbg !119 + %2089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 25, !dbg !119 + %2090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 26, !dbg !119 + %2091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 27, !dbg !119 + %2092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 28, !dbg !119 + %2093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 29, !dbg !119 + %2094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 30, !dbg !119 + %2095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 31, !dbg !119 + %2096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 32, !dbg !119 + %2097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 33, !dbg !119 + %2098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 34, !dbg !119 + %2099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 35, !dbg !119 + %2100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 36, !dbg !119 + %2101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 37, !dbg !119 + %2102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 38, !dbg !119 + %2103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 39, !dbg !119 + %2104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 40, !dbg !119 + %2105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 41, !dbg !119 + %2106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 42, !dbg !119 + %2107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 43, !dbg !119 + %2108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 44, !dbg !119 + %2109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 45, !dbg !119 + %2110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 46, !dbg !119 + %2111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 47, !dbg !119 + %2112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 48, !dbg !119 + %2113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 49, !dbg !119 + %2114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 50, !dbg !119 + %2115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 51, !dbg !119 + %2116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 52, !dbg !119 + %2117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 53, !dbg !119 + %2118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 54, !dbg !119 + %2119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 55, !dbg !119 + %2120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 56, !dbg !119 + %2121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 57, !dbg !119 + %2122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 58, !dbg !119 + %2123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 59, !dbg !119 + %2124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 60, !dbg !119 + %2125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 61, !dbg !119 + %2126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 62, !dbg !119 + %2127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 63, !dbg !119 + br i1 %2003, label %.lr.ph378, label %._crit_edge379, !dbg !119 + +.lr.ph378: ; preds = %._crit_edge + %2128 = tail call i32 @llvm.umin.i32(i32 %2002, i32 32), !dbg !128 + %2129 = add nsw i32 %2128, -2 + %2130 = add nsw i32 %2128, -1 + %2131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 0, !dbg !119 + %2132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 1, !dbg !119 + %2133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 2, !dbg !119 + %2134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 3, !dbg !119 + %2135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 4, !dbg !119 + %2136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 5, !dbg !119 + %2137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 6, !dbg !119 + %2138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 7, !dbg !119 + %2139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 8, !dbg !119 + %2140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 9, !dbg !119 + %2141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 10, !dbg !119 + %2142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 11, !dbg !119 + %2143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 12, !dbg !119 + %2144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 13, !dbg !119 + %2145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 14, !dbg !119 + %2146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 15, !dbg !119 + %2147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 16, !dbg !119 + %2148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 17, !dbg !119 + %2149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 18, !dbg !119 + %2150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 19, !dbg !119 + %2151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 20, !dbg !119 + %2152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 21, !dbg !119 + %2153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 22, !dbg !119 + %2154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 23, !dbg !119 + %2155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 24, !dbg !119 + %2156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 25, !dbg !119 + %2157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 26, !dbg !119 + %2158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 27, !dbg !119 + %2159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 28, !dbg !119 + %2160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 29, !dbg !119 + %2161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 30, !dbg !119 + %2162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 31, !dbg !119 + %2163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 32, !dbg !119 + %2164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 33, !dbg !119 + %2165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 34, !dbg !119 + %2166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 35, !dbg !119 + %2167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 36, !dbg !119 + %2168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 37, !dbg !119 + %2169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 38, !dbg !119 + %2170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 39, !dbg !119 + %2171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 40, !dbg !119 + %2172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 41, !dbg !119 + %2173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 42, !dbg !119 + %2174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 43, !dbg !119 + %2175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 44, !dbg !119 + %2176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 45, !dbg !119 + %2177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 46, !dbg !119 + %2178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 47, !dbg !119 + %2179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 48, !dbg !119 + %2180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 49, !dbg !119 + %2181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 50, !dbg !119 + %2182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 51, !dbg !119 + %2183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 52, !dbg !119 + %2184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 53, !dbg !119 + %2185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 54, !dbg !119 + %2186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 55, !dbg !119 + %2187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 56, !dbg !119 + %2188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 57, !dbg !119 + %2189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 58, !dbg !119 + %2190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 59, !dbg !119 + %2191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 60, !dbg !119 + %2192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 61, !dbg !119 + %2193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 62, !dbg !119 + %2194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1996, 63, !dbg !119 + br label %2195, !dbg !119 + +2195: ; preds = %.lr.ph378, %__nv_exp2f.exit189 + %2196 = phi i32 [ -1, %.lr.ph378 ], [ %2205, %__nv_exp2f.exit189 ] + %2197 = phi i32 [ 1, %.lr.ph378 ], [ %3459, %__nv_exp2f.exit189 ] + %2198 = phi i32 [ 64, %.lr.ph378 ], [ %3456, %__nv_exp2f.exit189 ] + %.pn = phi float [ %2131, %.lr.ph378 ], [ %3372, %__nv_exp2f.exit189 ] + %.pn397 = phi float [ %2132, %.lr.ph378 ], [ %3373, %__nv_exp2f.exit189 ] + %.pn398 = phi float [ %2133, %.lr.ph378 ], [ %3374, %__nv_exp2f.exit189 ] + %.pn399 = phi float [ %2134, %.lr.ph378 ], [ %3375, %__nv_exp2f.exit189 ] + %.pn400 = phi float [ %2135, %.lr.ph378 ], [ %3376, %__nv_exp2f.exit189 ] + %.pn401 = phi float [ %2136, %.lr.ph378 ], [ %3377, %__nv_exp2f.exit189 ] + %.pn402 = phi float [ %2137, %.lr.ph378 ], [ %3378, %__nv_exp2f.exit189 ] + %.pn403 = phi float [ %2138, %.lr.ph378 ], [ %3379, %__nv_exp2f.exit189 ] + %.pn404 = phi float [ %2139, %.lr.ph378 ], [ %3380, %__nv_exp2f.exit189 ] + %.pn405 = phi float [ %2140, %.lr.ph378 ], [ %3381, %__nv_exp2f.exit189 ] + %.pn406 = phi float [ %2141, %.lr.ph378 ], [ %3382, %__nv_exp2f.exit189 ] + %.pn407 = phi float [ %2142, %.lr.ph378 ], [ %3383, %__nv_exp2f.exit189 ] + %.pn408 = phi float [ %2143, %.lr.ph378 ], [ %3384, %__nv_exp2f.exit189 ] + %.pn409 = phi float [ %2144, %.lr.ph378 ], [ %3385, %__nv_exp2f.exit189 ] + %.pn410 = phi float [ %2145, %.lr.ph378 ], [ %3386, %__nv_exp2f.exit189 ] + %.pn411 = phi float [ %2146, %.lr.ph378 ], [ %3387, %__nv_exp2f.exit189 ] + %.pn412 = phi float [ %2147, %.lr.ph378 ], [ %3388, %__nv_exp2f.exit189 ] + %.pn413 = phi float [ %2148, %.lr.ph378 ], [ %3389, %__nv_exp2f.exit189 ] + %.pn414 = phi float [ %2149, %.lr.ph378 ], [ %3390, %__nv_exp2f.exit189 ] + %.pn415 = phi float [ %2150, %.lr.ph378 ], [ %3391, %__nv_exp2f.exit189 ] + %.pn416 = phi float [ %2151, %.lr.ph378 ], [ %3392, %__nv_exp2f.exit189 ] + %.pn417 = phi float [ %2152, %.lr.ph378 ], [ %3393, %__nv_exp2f.exit189 ] + %.pn418 = phi float [ %2153, %.lr.ph378 ], [ %3394, %__nv_exp2f.exit189 ] + %.pn419 = phi float [ %2154, %.lr.ph378 ], [ %3395, %__nv_exp2f.exit189 ] + %.pn420 = phi float [ %2155, %.lr.ph378 ], [ %3396, %__nv_exp2f.exit189 ] + %.pn421 = phi float [ %2156, %.lr.ph378 ], [ %3397, %__nv_exp2f.exit189 ] + %.pn422 = phi float [ %2157, %.lr.ph378 ], [ %3398, %__nv_exp2f.exit189 ] + %.pn423 = phi float [ %2158, %.lr.ph378 ], [ %3399, %__nv_exp2f.exit189 ] + %.pn424 = phi float [ %2159, %.lr.ph378 ], [ %3400, %__nv_exp2f.exit189 ] + %.pn425 = phi float [ %2160, %.lr.ph378 ], [ %3401, %__nv_exp2f.exit189 ] + %.pn426 = phi float [ %2161, %.lr.ph378 ], [ %3402, %__nv_exp2f.exit189 ] + %.pn427 = phi float [ %2162, %.lr.ph378 ], [ %3403, %__nv_exp2f.exit189 ] + %.pn428 = phi float [ %2163, %.lr.ph378 ], [ %3404, %__nv_exp2f.exit189 ] + %.pn429 = phi float [ %2164, %.lr.ph378 ], [ %3405, %__nv_exp2f.exit189 ] + %.pn430 = phi float [ %2165, %.lr.ph378 ], [ %3406, %__nv_exp2f.exit189 ] + %.pn431 = phi float [ %2166, %.lr.ph378 ], [ %3407, %__nv_exp2f.exit189 ] + %.pn432 = phi float [ %2167, %.lr.ph378 ], [ %3408, %__nv_exp2f.exit189 ] + %.pn433 = phi float [ %2168, %.lr.ph378 ], [ %3409, %__nv_exp2f.exit189 ] + %.pn434 = phi float [ %2169, %.lr.ph378 ], [ %3410, %__nv_exp2f.exit189 ] + %.pn435 = phi float [ %2170, %.lr.ph378 ], [ %3411, %__nv_exp2f.exit189 ] + %.pn436 = phi float [ %2171, %.lr.ph378 ], [ %3412, %__nv_exp2f.exit189 ] + %.pn437 = phi float [ %2172, %.lr.ph378 ], [ %3413, %__nv_exp2f.exit189 ] + %.pn438 = phi float [ %2173, %.lr.ph378 ], [ %3414, %__nv_exp2f.exit189 ] + %.pn439 = phi float [ %2174, %.lr.ph378 ], [ %3415, %__nv_exp2f.exit189 ] + %.pn440 = phi float [ %2175, %.lr.ph378 ], [ %3416, %__nv_exp2f.exit189 ] + %.pn441 = phi float [ %2176, %.lr.ph378 ], [ %3417, %__nv_exp2f.exit189 ] + %.pn442 = phi float [ %2177, %.lr.ph378 ], [ %3418, %__nv_exp2f.exit189 ] + %.pn443 = phi float [ %2178, %.lr.ph378 ], [ %3419, %__nv_exp2f.exit189 ] + %.pn444 = phi float [ %2179, %.lr.ph378 ], [ %3420, %__nv_exp2f.exit189 ] + %.pn445 = phi float [ %2180, %.lr.ph378 ], [ %3421, %__nv_exp2f.exit189 ] + %.pn446 = phi float [ %2181, %.lr.ph378 ], [ %3422, %__nv_exp2f.exit189 ] + %.pn447 = phi float [ %2182, %.lr.ph378 ], [ %3423, %__nv_exp2f.exit189 ] + %.pn448 = phi float [ %2183, %.lr.ph378 ], [ %3424, %__nv_exp2f.exit189 ] + %.pn449 = phi float [ %2184, %.lr.ph378 ], [ %3425, %__nv_exp2f.exit189 ] + %.pn450 = phi float [ %2185, %.lr.ph378 ], [ %3426, %__nv_exp2f.exit189 ] + %.pn451 = phi float [ %2186, %.lr.ph378 ], [ %3427, %__nv_exp2f.exit189 ] + %.pn452 = phi float [ %2187, %.lr.ph378 ], [ %3428, %__nv_exp2f.exit189 ] + %.pn453 = phi float [ %2188, %.lr.ph378 ], [ %3429, %__nv_exp2f.exit189 ] + %.pn454 = phi float [ %2189, %.lr.ph378 ], [ %3430, %__nv_exp2f.exit189 ] + %.pn455 = phi float [ %2190, %.lr.ph378 ], [ %3431, %__nv_exp2f.exit189 ] + %.pn456 = phi float [ %2191, %.lr.ph378 ], [ %3432, %__nv_exp2f.exit189 ] + %.pn457 = phi float [ %2192, %.lr.ph378 ], [ %3433, %__nv_exp2f.exit189 ] + %.pn458 = phi float [ %2193, %.lr.ph378 ], [ %3434, %__nv_exp2f.exit189 ] + %.pn459 = phi float [ %2194, %.lr.ph378 ], [ %3435, %__nv_exp2f.exit189 ] + %2199 = phi i32 [ 0, %.lr.ph378 ], [ %3436, %__nv_exp2f.exit189 ] + %.pn464 = phi float [ %1993, %.lr.ph378 ], [ %3025, %__nv_exp2f.exit189 ] + %.pn462 = phi float [ %1994, %.lr.ph378 ], [ %3026, %__nv_exp2f.exit189 ] + %2200 = phi <2 x float> [ %1995, %.lr.ph378 ], [ %2764, %__nv_exp2f.exit189 ] + %2201 = icmp slt i32 %2199, %2129, !dbg !119 + %2202 = icmp slt i32 %2199, %2130, !dbg !119 + %2203 = add i32 %2196, 1, !dbg !119 + %2204 = icmp sgt i32 %2203, 2, !dbg !119 + %2205 = select i1 %2204, i32 0, i32 %2203, !dbg !119 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !125 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %2206 = shl i32 %2205, 13, !dbg !125 + %2207 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2206, !dbg !125 + %2208 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %37, i32 0, i32 31), !dbg !127 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !127 + %2209 = shl i32 %2208, 11, !dbg !127 + %2210 = and i32 %2209, 8192, !dbg !127 + %2211 = add i32 %2210, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !127 + %2212 = lshr exact i32 %2211, 4, !dbg !127 + %2213 = and i32 %2212, 16383, !dbg !127 + %2214 = zext nneg i32 %2213 to i64, !dbg !127 + %2215 = or disjoint i64 %2214, 4611686293372403712, !dbg !127 + %2216 = ptrtoint ptr addrspace(3) %2207 to i32, !dbg !127 + %2217 = lshr exact i32 %2216, 4, !dbg !127 + %2218 = and i32 %2217, 16383, !dbg !127 + %2219 = zext nneg i32 %2218 to i64, !dbg !127 + %2220 = or disjoint i64 %2219, 4611686293338849280, !dbg !127 + %2221 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2215, i64 %2220) #2, !dbg !127 + %2222 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !127 + %2223 = lshr exact i32 %2222, 4, !dbg !127 + %2224 = and i32 %2223, 16383, !dbg !127 + %2225 = zext nneg i32 %2224 to i64, !dbg !127 + %2226 = or disjoint i64 %2225, 4611686293372403712, !dbg !127 + %2227 = add i32 %2216, 32, !dbg !127 + %2228 = lshr exact i32 %2227, 4, !dbg !127 + %2229 = and i32 %2228, 16383, !dbg !127 + %2230 = zext nneg i32 %2229 to i64, !dbg !127 + %2231 = or disjoint i64 %2230, 4611686293338849280, !dbg !127 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 0, !dbg !127 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 1, !dbg !127 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 2, !dbg !127 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 3, !dbg !127 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 4, !dbg !127 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 5, !dbg !127 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 6, !dbg !127 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 7, !dbg !127 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 8, !dbg !127 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 9, !dbg !127 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 10, !dbg !127 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 11, !dbg !127 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 12, !dbg !127 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 13, !dbg !127 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 14, !dbg !127 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 15, !dbg !127 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 16, !dbg !127 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 17, !dbg !127 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 18, !dbg !127 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 19, !dbg !127 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 20, !dbg !127 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 21, !dbg !127 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 22, !dbg !127 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 23, !dbg !127 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 24, !dbg !127 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 25, !dbg !127 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 26, !dbg !127 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 27, !dbg !127 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 28, !dbg !127 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 29, !dbg !127 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 30, !dbg !127 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 31, !dbg !127 + %2264 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2232, float %2233, float %2234, float %2235, float %2236, float %2237, float %2238, float %2239, float %2240, float %2241, float %2242, float %2243, float %2244, float %2245, float %2246, float %2247, float %2248, float %2249, float %2250, float %2251, float %2252, float %2253, float %2254, float %2255, float %2256, float %2257, float %2258, float %2259, float %2260, float %2261, float %2262, float %2263, i64 %2226, i64 %2231, i1 true) #2, !dbg !127 + %2265 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !127 + %2266 = lshr exact i32 %2265, 4, !dbg !127 + %2267 = and i32 %2266, 16383, !dbg !127 + %2268 = zext nneg i32 %2267 to i64, !dbg !127 + %2269 = or disjoint i64 %2268, 4611686293372403712, !dbg !127 + %2270 = add i32 %2216, 64, !dbg !127 + %2271 = lshr exact i32 %2270, 4, !dbg !127 + %2272 = and i32 %2271, 16383, !dbg !127 + %2273 = zext nneg i32 %2272 to i64, !dbg !127 + %2274 = or disjoint i64 %2273, 4611686293338849280, !dbg !127 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 0, !dbg !127 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 1, !dbg !127 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 2, !dbg !127 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 3, !dbg !127 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 4, !dbg !127 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 5, !dbg !127 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 6, !dbg !127 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 7, !dbg !127 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 8, !dbg !127 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 9, !dbg !127 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 10, !dbg !127 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 11, !dbg !127 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 12, !dbg !127 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 13, !dbg !127 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 14, !dbg !127 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 15, !dbg !127 + %2291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 16, !dbg !127 + %2292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 17, !dbg !127 + %2293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 18, !dbg !127 + %2294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 19, !dbg !127 + %2295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 20, !dbg !127 + %2296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 21, !dbg !127 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 22, !dbg !127 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 23, !dbg !127 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 24, !dbg !127 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 25, !dbg !127 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 26, !dbg !127 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 27, !dbg !127 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 28, !dbg !127 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 29, !dbg !127 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 30, !dbg !127 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2264, 31, !dbg !127 + %2307 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2275, float %2276, float %2277, float %2278, float %2279, float %2280, float %2281, float %2282, float %2283, float %2284, float %2285, float %2286, float %2287, float %2288, float %2289, float %2290, float %2291, float %2292, float %2293, float %2294, float %2295, float %2296, float %2297, float %2298, float %2299, float %2300, float %2301, float %2302, float %2303, float %2304, float %2305, float %2306, i64 %2269, i64 %2274, i1 true) #2, !dbg !127 + %2308 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !127 + %2309 = lshr exact i32 %2308, 4, !dbg !127 + %2310 = and i32 %2309, 16383, !dbg !127 + %2311 = zext nneg i32 %2310 to i64, !dbg !127 + %2312 = or disjoint i64 %2311, 4611686293372403712, !dbg !127 + %2313 = add i32 %2216, 96, !dbg !127 + %2314 = lshr exact i32 %2313, 4, !dbg !127 + %2315 = and i32 %2314, 16383, !dbg !127 + %2316 = zext nneg i32 %2315 to i64, !dbg !127 + %2317 = or disjoint i64 %2316, 4611686293338849280, !dbg !127 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 0, !dbg !127 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 1, !dbg !127 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 2, !dbg !127 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 3, !dbg !127 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 4, !dbg !127 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 5, !dbg !127 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 6, !dbg !127 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 7, !dbg !127 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 8, !dbg !127 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 9, !dbg !127 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 10, !dbg !127 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 11, !dbg !127 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 12, !dbg !127 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 13, !dbg !127 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 14, !dbg !127 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 15, !dbg !127 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 16, !dbg !127 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 17, !dbg !127 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 18, !dbg !127 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 19, !dbg !127 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 20, !dbg !127 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 21, !dbg !127 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 22, !dbg !127 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 23, !dbg !127 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 24, !dbg !127 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 25, !dbg !127 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 26, !dbg !127 + %2345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 27, !dbg !127 + %2346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 28, !dbg !127 + %2347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 29, !dbg !127 + %2348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 30, !dbg !127 + %2349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2307, 31, !dbg !127 + %2350 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, i64 %2312, i64 %2317, i1 true) #2, !dbg !127 + %2351 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !127 + %2352 = lshr exact i32 %2351, 4, !dbg !127 + %2353 = and i32 %2352, 16383, !dbg !127 + %2354 = zext nneg i32 %2353 to i64, !dbg !127 + %2355 = or disjoint i64 %2354, 4611686293372403712, !dbg !127 + %2356 = add i32 %2216, 8192, !dbg !127 + %2357 = lshr exact i32 %2356, 4, !dbg !127 + %2358 = and i32 %2357, 16383, !dbg !127 + %2359 = zext nneg i32 %2358 to i64, !dbg !127 + %2360 = or disjoint i64 %2359, 4611686293338849280, !dbg !127 + %2361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 0, !dbg !127 + %2362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 1, !dbg !127 + %2363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 2, !dbg !127 + %2364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 3, !dbg !127 + %2365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 4, !dbg !127 + %2366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 5, !dbg !127 + %2367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 6, !dbg !127 + %2368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 7, !dbg !127 + %2369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 8, !dbg !127 + %2370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 9, !dbg !127 + %2371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 10, !dbg !127 + %2372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 11, !dbg !127 + %2373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 12, !dbg !127 + %2374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 13, !dbg !127 + %2375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 14, !dbg !127 + %2376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 15, !dbg !127 + %2377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 16, !dbg !127 + %2378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 17, !dbg !127 + %2379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 18, !dbg !127 + %2380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 19, !dbg !127 + %2381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 20, !dbg !127 + %2382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 21, !dbg !127 + %2383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 22, !dbg !127 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 23, !dbg !127 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 24, !dbg !127 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 25, !dbg !127 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 26, !dbg !127 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 27, !dbg !127 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 28, !dbg !127 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 29, !dbg !127 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 30, !dbg !127 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2350, 31, !dbg !127 + %2393 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2361, float %2362, float %2363, float %2364, float %2365, float %2366, float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378, float %2379, float %2380, float %2381, float %2382, float %2383, float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, i64 %2355, i64 %2360, i1 true) #2, !dbg !127 + %2394 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !127 + %2395 = lshr exact i32 %2394, 4, !dbg !127 + %2396 = and i32 %2395, 16383, !dbg !127 + %2397 = zext nneg i32 %2396 to i64, !dbg !127 + %2398 = or disjoint i64 %2397, 4611686293372403712, !dbg !127 + %2399 = add i32 %2216, 8224, !dbg !127 + %2400 = lshr exact i32 %2399, 4, !dbg !127 + %2401 = and i32 %2400, 16383, !dbg !127 + %2402 = zext nneg i32 %2401 to i64, !dbg !127 + %2403 = or disjoint i64 %2402, 4611686293338849280, !dbg !127 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 0, !dbg !127 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 1, !dbg !127 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 2, !dbg !127 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 3, !dbg !127 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 4, !dbg !127 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 5, !dbg !127 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 6, !dbg !127 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 7, !dbg !127 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 8, !dbg !127 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 9, !dbg !127 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 10, !dbg !127 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 11, !dbg !127 + %2416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 12, !dbg !127 + %2417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 13, !dbg !127 + %2418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 14, !dbg !127 + %2419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 15, !dbg !127 + %2420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 16, !dbg !127 + %2421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 17, !dbg !127 + %2422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 18, !dbg !127 + %2423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 19, !dbg !127 + %2424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 20, !dbg !127 + %2425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 21, !dbg !127 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 22, !dbg !127 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 23, !dbg !127 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 24, !dbg !127 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 25, !dbg !127 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 26, !dbg !127 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 27, !dbg !127 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 28, !dbg !127 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 29, !dbg !127 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 30, !dbg !127 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2393, 31, !dbg !127 + %2436 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418, float %2419, float %2420, float %2421, float %2422, float %2423, float %2424, float %2425, float %2426, float %2427, float %2428, float %2429, float %2430, float %2431, float %2432, float %2433, float %2434, float %2435, i64 %2398, i64 %2403, i1 true) #2, !dbg !127 + %2437 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !127 + %2438 = lshr exact i32 %2437, 4, !dbg !127 + %2439 = and i32 %2438, 16383, !dbg !127 + %2440 = zext nneg i32 %2439 to i64, !dbg !127 + %2441 = or disjoint i64 %2440, 4611686293372403712, !dbg !127 + %2442 = add i32 %2216, 8256, !dbg !127 + %2443 = lshr exact i32 %2442, 4, !dbg !127 + %2444 = and i32 %2443, 16383, !dbg !127 + %2445 = zext nneg i32 %2444 to i64, !dbg !127 + %2446 = or disjoint i64 %2445, 4611686293338849280, !dbg !127 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 0, !dbg !127 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 1, !dbg !127 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 2, !dbg !127 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 3, !dbg !127 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 4, !dbg !127 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 5, !dbg !127 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 6, !dbg !127 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 7, !dbg !127 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 8, !dbg !127 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 9, !dbg !127 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 10, !dbg !127 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 11, !dbg !127 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 12, !dbg !127 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 13, !dbg !127 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 14, !dbg !127 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 15, !dbg !127 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 16, !dbg !127 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 17, !dbg !127 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 18, !dbg !127 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 19, !dbg !127 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 20, !dbg !127 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 21, !dbg !127 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 22, !dbg !127 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 23, !dbg !127 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 24, !dbg !127 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 25, !dbg !127 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 26, !dbg !127 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 27, !dbg !127 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 28, !dbg !127 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 29, !dbg !127 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 30, !dbg !127 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2436, 31, !dbg !127 + %2479 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2447, float %2448, float %2449, float %2450, float %2451, float %2452, float %2453, float %2454, float %2455, float %2456, float %2457, float %2458, float %2459, float %2460, float %2461, float %2462, float %2463, float %2464, float %2465, float %2466, float %2467, float %2468, float %2469, float %2470, float %2471, float %2472, float %2473, float %2474, float %2475, float %2476, float %2477, float %2478, i64 %2441, i64 %2446, i1 true) #2, !dbg !127 + %2480 = add i32 %2210, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !127 + %2481 = lshr exact i32 %2480, 4, !dbg !127 + %2482 = and i32 %2481, 16383, !dbg !127 + %2483 = zext nneg i32 %2482 to i64, !dbg !127 + %2484 = or disjoint i64 %2483, 4611686293372403712, !dbg !127 + %2485 = add i32 %2216, 8288, !dbg !127 + %2486 = lshr exact i32 %2485, 4, !dbg !127 + %2487 = and i32 %2486, 16383, !dbg !127 + %2488 = zext nneg i32 %2487 to i64, !dbg !127 + %2489 = or disjoint i64 %2488, 4611686293338849280, !dbg !127 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 0, !dbg !127 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 1, !dbg !127 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 2, !dbg !127 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 3, !dbg !127 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 4, !dbg !127 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 5, !dbg !127 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 6, !dbg !127 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 7, !dbg !127 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 8, !dbg !127 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 9, !dbg !127 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 10, !dbg !127 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 11, !dbg !127 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 12, !dbg !127 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 13, !dbg !127 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 14, !dbg !127 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 15, !dbg !127 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 16, !dbg !127 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 17, !dbg !127 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 18, !dbg !127 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 19, !dbg !127 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 20, !dbg !127 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 21, !dbg !127 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 22, !dbg !127 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 23, !dbg !127 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 24, !dbg !127 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 25, !dbg !127 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 26, !dbg !127 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 27, !dbg !127 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 28, !dbg !127 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 29, !dbg !127 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 30, !dbg !127 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2479, 31, !dbg !127 + %2522 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2490, float %2491, float %2492, float %2493, float %2494, float %2495, float %2496, float %2497, float %2498, float %2499, float %2500, float %2501, float %2502, float %2503, float %2504, float %2505, float %2506, float %2507, float %2508, float %2509, float %2510, float %2511, float %2512, float %2513, float %2514, float %2515, float %2516, float %2517, float %2518, float %2519, float %2520, float %2521, i64 %2484, i64 %2489, i1 true) #2, !dbg !127 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 0, !dbg !127 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 1, !dbg !127 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 2, !dbg !127 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 3, !dbg !127 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 4, !dbg !127 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 5, !dbg !127 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 6, !dbg !127 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 7, !dbg !127 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 8, !dbg !127 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 9, !dbg !127 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 10, !dbg !127 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 11, !dbg !127 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 12, !dbg !127 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 13, !dbg !127 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 14, !dbg !127 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 15, !dbg !127 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 16, !dbg !127 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 17, !dbg !127 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 18, !dbg !127 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 19, !dbg !127 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 20, !dbg !127 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 21, !dbg !127 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 22, !dbg !127 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 23, !dbg !127 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 24, !dbg !127 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 25, !dbg !127 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 26, !dbg !127 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 27, !dbg !127 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 28, !dbg !127 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 29, !dbg !127 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 30, !dbg !127 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2522, 31, !dbg !127 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !127 + %2555 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %2523, float %2524, float %2525, float %2526, float %2527, float %2528, float %2529, float %2530, float %2531, float %2532, float %2533, float %2534, float %2535, float %2536, float %2537, float %2538, float %2539, float %2540, float %2541, float %2542, float %2543, float %2544, float %2545, float %2546, float %2547, float %2548, float %2549, float %2550, float %2551, float %2552, float %2553, float %2554, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2207, i32 0, i32 0, float %.pn, float %.pn397, float %.pn398, float %.pn399, float %.pn400, float %.pn401, float %.pn402, float %.pn403, float %.pn404, float %.pn405, float %.pn406, float %.pn407, float %.pn408, float %.pn409, float %.pn410, float %.pn411, float %.pn412, float %.pn413, float %.pn414, float %.pn415, float %.pn416, float %.pn417, float %.pn418, float %.pn419, float %.pn420, float %.pn421, float %.pn422, float %.pn423, float %.pn424, float %.pn425, float %.pn426, float %.pn427, float %.pn428, float %.pn429, float %.pn430, float %.pn431, float %.pn432, float %.pn433, float %.pn434, float %.pn435, float %.pn436, float %.pn437, float %.pn438, float %.pn439, float %.pn440, float %.pn441, float %.pn442, float %.pn443, float %.pn444, float %.pn445, float %.pn446, float %.pn447, float %.pn448, float %.pn449, float %.pn450, float %.pn451, float %.pn452, float %.pn453, float %.pn454, float %.pn455, float %.pn456, float %.pn457, float %.pn458, float %.pn459) #2, !dbg !127 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 0, !dbg !127 + %2557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 1, !dbg !127 + %2558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 2, !dbg !127 + %2559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 3, !dbg !127 + %2560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 4, !dbg !127 + %2561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 5, !dbg !127 + %2562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 6, !dbg !127 + %2563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 7, !dbg !127 + %2564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 8, !dbg !127 + %2565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 9, !dbg !127 + %2566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 10, !dbg !127 + %2567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 11, !dbg !127 + %2568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 12, !dbg !127 + %2569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 13, !dbg !127 + %2570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 14, !dbg !127 + %2571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 15, !dbg !127 + %2572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 16, !dbg !127 + %2573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 17, !dbg !127 + %2574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 18, !dbg !127 + %2575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 19, !dbg !127 + %2576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 20, !dbg !127 + %2577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 21, !dbg !127 + %2578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 22, !dbg !127 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 23, !dbg !127 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 24, !dbg !127 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 25, !dbg !127 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 26, !dbg !127 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 27, !dbg !127 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 28, !dbg !127 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 29, !dbg !127 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 30, !dbg !127 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 31, !dbg !127 + %2588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 38, !dbg !127 + %2589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 39, !dbg !127 + %2590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 40, !dbg !127 + %2591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 41, !dbg !127 + %2592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 42, !dbg !127 + %2593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 43, !dbg !127 + %2594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 44, !dbg !127 + %2595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 45, !dbg !127 + %2596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 46, !dbg !127 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 47, !dbg !127 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 48, !dbg !127 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 49, !dbg !127 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 50, !dbg !127 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 51, !dbg !127 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 52, !dbg !127 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 53, !dbg !127 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 54, !dbg !127 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 55, !dbg !127 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 56, !dbg !127 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 57, !dbg !127 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 58, !dbg !127 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 59, !dbg !127 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 60, !dbg !127 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 61, !dbg !127 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 62, !dbg !127 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 63, !dbg !127 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 64, !dbg !127 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 65, !dbg !127 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 66, !dbg !127 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 67, !dbg !127 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 68, !dbg !127 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 69, !dbg !127 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 70, !dbg !127 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 71, !dbg !127 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 72, !dbg !127 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 73, !dbg !127 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 74, !dbg !127 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 75, !dbg !127 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 76, !dbg !127 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 77, !dbg !127 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 78, !dbg !127 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 79, !dbg !127 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 80, !dbg !127 + %2631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 81, !dbg !127 + %2632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 82, !dbg !127 + %2633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 83, !dbg !127 + %2634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 84, !dbg !127 + %2635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 85, !dbg !127 + %2636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 86, !dbg !127 + %2637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 87, !dbg !127 + %2638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 88, !dbg !127 + %2639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 89, !dbg !127 + %2640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 90, !dbg !127 + %2641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 91, !dbg !127 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 92, !dbg !127 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 93, !dbg !127 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 94, !dbg !127 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 95, !dbg !127 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 96, !dbg !127 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 97, !dbg !127 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 98, !dbg !127 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 99, !dbg !127 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 100, !dbg !127 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2555, 101, !dbg !127 + %2652 = fmul float %2556, 0x3FB6A09E60000000, !dbg !129 + %2653 = fmul float %2557, 0x3FB6A09E60000000, !dbg !129 + %2654 = fmul float %2558, 0x3FB6A09E60000000, !dbg !129 + %2655 = fmul float %2559, 0x3FB6A09E60000000, !dbg !129 + %2656 = fmul float %2560, 0x3FB6A09E60000000, !dbg !129 + %2657 = fmul float %2561, 0x3FB6A09E60000000, !dbg !129 + %2658 = fmul float %2562, 0x3FB6A09E60000000, !dbg !129 + %2659 = fmul float %2563, 0x3FB6A09E60000000, !dbg !129 + %2660 = fmul float %2564, 0x3FB6A09E60000000, !dbg !129 + %2661 = fmul float %2565, 0x3FB6A09E60000000, !dbg !129 + %2662 = fmul float %2566, 0x3FB6A09E60000000, !dbg !129 + %2663 = fmul float %2567, 0x3FB6A09E60000000, !dbg !129 + %2664 = fmul float %2568, 0x3FB6A09E60000000, !dbg !129 + %2665 = fmul float %2569, 0x3FB6A09E60000000, !dbg !129 + %2666 = fmul float %2570, 0x3FB6A09E60000000, !dbg !129 + %2667 = fmul float %2571, 0x3FB6A09E60000000, !dbg !129 + %2668 = fmul float %2572, 0x3FB6A09E60000000, !dbg !129 + %2669 = fmul float %2573, 0x3FB6A09E60000000, !dbg !129 + %2670 = fmul float %2574, 0x3FB6A09E60000000, !dbg !129 + %2671 = fmul float %2575, 0x3FB6A09E60000000, !dbg !129 + %2672 = fmul float %2576, 0x3FB6A09E60000000, !dbg !129 + %2673 = fmul float %2577, 0x3FB6A09E60000000, !dbg !129 + %2674 = fmul float %2578, 0x3FB6A09E60000000, !dbg !129 + %2675 = fmul float %2579, 0x3FB6A09E60000000, !dbg !129 + %2676 = fmul float %2580, 0x3FB6A09E60000000, !dbg !129 + %2677 = fmul float %2581, 0x3FB6A09E60000000, !dbg !129 + %2678 = fmul float %2582, 0x3FB6A09E60000000, !dbg !129 + %2679 = fmul float %2583, 0x3FB6A09E60000000, !dbg !129 + %2680 = fmul float %2584, 0x3FB6A09E60000000, !dbg !129 + %2681 = fmul float %2585, 0x3FB6A09E60000000, !dbg !129 + %2682 = fmul float %2586, 0x3FB6A09E60000000, !dbg !129 + %2683 = fmul float %2587, 0x3FB6A09E60000000, !dbg !129 + %2684 = fmul float %2652, 0x3FF7154760000000, !dbg !130 + %2685 = fmul float %2653, 0x3FF7154760000000, !dbg !130 + %2686 = fmul float %2654, 0x3FF7154760000000, !dbg !130 + %2687 = fmul float %2655, 0x3FF7154760000000, !dbg !130 + %2688 = fmul float %2656, 0x3FF7154760000000, !dbg !130 + %2689 = fmul float %2657, 0x3FF7154760000000, !dbg !130 + %2690 = fmul float %2658, 0x3FF7154760000000, !dbg !130 + %2691 = fmul float %2659, 0x3FF7154760000000, !dbg !130 + %2692 = fmul float %2660, 0x3FF7154760000000, !dbg !130 + %2693 = fmul float %2661, 0x3FF7154760000000, !dbg !130 + %2694 = fmul float %2662, 0x3FF7154760000000, !dbg !130 + %2695 = fmul float %2663, 0x3FF7154760000000, !dbg !130 + %2696 = fmul float %2664, 0x3FF7154760000000, !dbg !130 + %2697 = fmul float %2665, 0x3FF7154760000000, !dbg !130 + %2698 = fmul float %2666, 0x3FF7154760000000, !dbg !130 + %2699 = fmul float %2667, 0x3FF7154760000000, !dbg !130 + %2700 = fmul float %2668, 0x3FF7154760000000, !dbg !130 + %2701 = fmul float %2669, 0x3FF7154760000000, !dbg !130 + %2702 = fmul float %2670, 0x3FF7154760000000, !dbg !130 + %2703 = fmul float %2671, 0x3FF7154760000000, !dbg !130 + %2704 = fmul float %2672, 0x3FF7154760000000, !dbg !130 + %2705 = fmul float %2673, 0x3FF7154760000000, !dbg !130 + %2706 = fmul float %2674, 0x3FF7154760000000, !dbg !130 + %2707 = fmul float %2675, 0x3FF7154760000000, !dbg !130 + %2708 = fmul float %2676, 0x3FF7154760000000, !dbg !130 + %2709 = fmul float %2677, 0x3FF7154760000000, !dbg !130 + %2710 = fmul float %2678, 0x3FF7154760000000, !dbg !130 + %2711 = fmul float %2679, 0x3FF7154760000000, !dbg !130 + %2712 = fmul float %2680, 0x3FF7154760000000, !dbg !130 + %2713 = fmul float %2681, 0x3FF7154760000000, !dbg !130 + %2714 = fmul float %2682, 0x3FF7154760000000, !dbg !130 + %2715 = fmul float %2683, 0x3FF7154760000000, !dbg !130 + %2716 = tail call float @llvm.maxnum.f32(float %2684, float %2685), !dbg !131 + %2717 = tail call float @llvm.maxnum.f32(float %2686, float %2687), !dbg !131 + %2718 = tail call float @llvm.maxnum.f32(float %2716, float %2688), !dbg !131 + %2719 = tail call float @llvm.maxnum.f32(float %2718, float %2689), !dbg !131 + %2720 = tail call float @llvm.maxnum.f32(float %2717, float %2690), !dbg !131 + %2721 = tail call float @llvm.maxnum.f32(float %2720, float %2691), !dbg !131 + %2722 = tail call float @llvm.maxnum.f32(float %2719, float %2692), !dbg !131 + %2723 = tail call float @llvm.maxnum.f32(float %2722, float %2693), !dbg !131 + %2724 = tail call float @llvm.maxnum.f32(float %2721, float %2694), !dbg !131 + %2725 = tail call float @llvm.maxnum.f32(float %2724, float %2695), !dbg !131 + %2726 = tail call float @llvm.maxnum.f32(float %2723, float %2696), !dbg !131 + %2727 = tail call float @llvm.maxnum.f32(float %2726, float %2697), !dbg !131 + %2728 = tail call float @llvm.maxnum.f32(float %2725, float %2698), !dbg !131 + %2729 = tail call float @llvm.maxnum.f32(float %2728, float %2699), !dbg !131 + %2730 = tail call float @llvm.maxnum.f32(float %2727, float %2700), !dbg !131 + %2731 = tail call float @llvm.maxnum.f32(float %2730, float %2701), !dbg !131 + %2732 = tail call float @llvm.maxnum.f32(float %2729, float %2702), !dbg !131 + %2733 = tail call float @llvm.maxnum.f32(float %2732, float %2703), !dbg !131 + %2734 = tail call float @llvm.maxnum.f32(float %2731, float %2704), !dbg !131 + %2735 = tail call float @llvm.maxnum.f32(float %2734, float %2705), !dbg !131 + %2736 = tail call float @llvm.maxnum.f32(float %2733, float %2706), !dbg !131 + %2737 = tail call float @llvm.maxnum.f32(float %2736, float %2707), !dbg !131 + %2738 = tail call float @llvm.maxnum.f32(float %2735, float %2708), !dbg !131 + %2739 = tail call float @llvm.maxnum.f32(float %2738, float %2709), !dbg !131 + %2740 = tail call float @llvm.maxnum.f32(float %2737, float %2710), !dbg !131 + %2741 = tail call float @llvm.maxnum.f32(float %2740, float %2711), !dbg !131 + %2742 = tail call float @llvm.maxnum.f32(float %2739, float %2712), !dbg !131 + %2743 = tail call float @llvm.maxnum.f32(float %2742, float %2713), !dbg !131 + %2744 = tail call float @llvm.maxnum.f32(float %2741, float %2714), !dbg !131 + %2745 = tail call float @llvm.maxnum.f32(float %2744, float %2715), !dbg !131 + %2746 = bitcast float %2743 to i32, !dbg !132 + %2747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2746, i32 2, i32 31), !dbg !132 + %2748 = bitcast i32 %2747 to float, !dbg !132 + %2749 = bitcast float %2745 to i32, !dbg !132 + %2750 = tail call float @llvm.maxnum.f32(float %2743, float %2748), !dbg !131 + %2751 = bitcast float %2750 to i32, !dbg !132 + %2752 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2751, i32 1, i32 31), !dbg !132 + %2753 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2749, i32 2, i32 31), !dbg !132 + %2754 = bitcast i32 %2753 to float, !dbg !132 + %2755 = tail call float @llvm.maxnum.f32(float %2745, float %2754), !dbg !131 + %2756 = bitcast float %2755 to i32, !dbg !132 + %2757 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2756, i32 1, i32 31), !dbg !132 + %2758 = insertelement <2 x i32> poison, i32 %2752, i64 0, !dbg !132 + %2759 = insertelement <2 x i32> %2758, i32 %2757, i64 1, !dbg !132 + %2760 = bitcast <2 x i32> %2759 to <2 x float>, !dbg !132 + %2761 = insertelement <2 x float> poison, float %2750, i64 0, !dbg !131 + %2762 = insertelement <2 x float> %2761, float %2755, i64 1, !dbg !131 + %2763 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2762, <2 x float> %2760), !dbg !131 + %2764 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2200, <2 x float> %2763), !dbg !133 + %2765 = extractelement <2 x float> %2764, i64 0, !dbg !134 + %2766 = fcmp oeq float %2765, 0xFFF0000000000000, !dbg !135 + %2767 = extractelement <2 x float> %2764, i64 1, !dbg !134 + %2768 = fcmp oeq float %2767, 0xFFF0000000000000, !dbg !135 + %2769 = select i1 %2766, float 0.000000e+00, float %2765, !dbg !134 + %2770 = select i1 %2768, float 0.000000e+00, float %2767, !dbg !134 + %2771 = extractelement <2 x float> %2200, i64 0, !dbg !136 + %2772 = fsub float %2771, %2769, !dbg !136 + %2773 = extractelement <2 x float> %2200, i64 1, !dbg !136 + %2774 = fsub float %2773, %2770, !dbg !136 + %2775 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !137 + %.not.i = icmp eq i32 %2775, 0, !dbg !137 + br i1 %.not.i, label %2778, label %2776, !dbg !137 + +2776: ; preds = %2195 + %2777 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2772) #2, !dbg !137 + br label %__nv_exp2f.exit, !dbg !137 + +2778: ; preds = %2195 + %2779 = tail call float @llvm.nvvm.ex2.approx.f(float %2772) #2, !dbg !137 + br label %__nv_exp2f.exit, !dbg !137 + +__nv_exp2f.exit: ; preds = %2776, %2778 + %.0.i = phi float [ %2777, %2776 ], [ %2779, %2778 ], !dbg !137 + %2780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !137 + %.not.i91 = icmp eq i32 %2780, 0, !dbg !137 + br i1 %.not.i91, label %2783, label %2781, !dbg !137 + +2781: ; preds = %__nv_exp2f.exit + %2782 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2774) #2, !dbg !137 + br label %__nv_exp2f.exit93, !dbg !137 + +2783: ; preds = %__nv_exp2f.exit + %2784 = tail call float @llvm.nvvm.ex2.approx.f(float %2774) #2, !dbg !137 + br label %__nv_exp2f.exit93, !dbg !137 + +__nv_exp2f.exit93: ; preds = %2781, %2783 + %.0.i92 = phi float [ %2782, %2781 ], [ %2784, %2783 ], !dbg !137 + %2785 = fsub float %2684, %2769, !dbg !138 + %2786 = fsub float %2685, %2769, !dbg !138 + %2787 = fsub float %2686, %2770, !dbg !138 + %2788 = fsub float %2687, %2770, !dbg !138 + %2789 = fsub float %2688, %2769, !dbg !138 + %2790 = fsub float %2689, %2769, !dbg !138 + %2791 = fsub float %2690, %2770, !dbg !138 + %2792 = fsub float %2691, %2770, !dbg !138 + %2793 = fsub float %2692, %2769, !dbg !138 + %2794 = fsub float %2693, %2769, !dbg !138 + %2795 = fsub float %2694, %2770, !dbg !138 + %2796 = fsub float %2695, %2770, !dbg !138 + %2797 = fsub float %2696, %2769, !dbg !138 + %2798 = fsub float %2697, %2769, !dbg !138 + %2799 = fsub float %2698, %2770, !dbg !138 + %2800 = fsub float %2699, %2770, !dbg !138 + %2801 = fsub float %2700, %2769, !dbg !138 + %2802 = fsub float %2701, %2769, !dbg !138 + %2803 = fsub float %2702, %2770, !dbg !138 + %2804 = fsub float %2703, %2770, !dbg !138 + %2805 = fsub float %2704, %2769, !dbg !138 + %2806 = fsub float %2705, %2769, !dbg !138 + %2807 = fsub float %2706, %2770, !dbg !138 + %2808 = fsub float %2707, %2770, !dbg !138 + %2809 = fsub float %2708, %2769, !dbg !138 + %2810 = fsub float %2709, %2769, !dbg !138 + %2811 = fsub float %2710, %2770, !dbg !138 + %2812 = fsub float %2711, %2770, !dbg !138 + %2813 = fsub float %2712, %2769, !dbg !138 + %2814 = fsub float %2713, %2769, !dbg !138 + %2815 = fsub float %2714, %2770, !dbg !138 + %2816 = fsub float %2715, %2770, !dbg !138 + %2817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i94 = icmp eq i32 %2817, 0, !dbg !139 + br i1 %.not.i94, label %2820, label %2818, !dbg !139 + +2818: ; preds = %__nv_exp2f.exit93 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2785) #2, !dbg !139 + br label %__nv_exp2f.exit96, !dbg !139 + +2820: ; preds = %__nv_exp2f.exit93 + %2821 = tail call float @llvm.nvvm.ex2.approx.f(float %2785) #2, !dbg !139 + br label %__nv_exp2f.exit96, !dbg !139 + +__nv_exp2f.exit96: ; preds = %2818, %2820 + %.0.i95 = phi float [ %2819, %2818 ], [ %2821, %2820 ], !dbg !139 + %2822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i97 = icmp eq i32 %2822, 0, !dbg !139 + br i1 %.not.i97, label %2825, label %2823, !dbg !139 + +2823: ; preds = %__nv_exp2f.exit96 + %2824 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2786) #2, !dbg !139 + br label %__nv_exp2f.exit99, !dbg !139 + +2825: ; preds = %__nv_exp2f.exit96 + %2826 = tail call float @llvm.nvvm.ex2.approx.f(float %2786) #2, !dbg !139 + br label %__nv_exp2f.exit99, !dbg !139 + +__nv_exp2f.exit99: ; preds = %2823, %2825 + %.0.i98 = phi float [ %2824, %2823 ], [ %2826, %2825 ], !dbg !139 + %2827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i100 = icmp eq i32 %2827, 0, !dbg !139 + br i1 %.not.i100, label %2830, label %2828, !dbg !139 + +2828: ; preds = %__nv_exp2f.exit99 + %2829 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2787) #2, !dbg !139 + br label %__nv_exp2f.exit102, !dbg !139 + +2830: ; preds = %__nv_exp2f.exit99 + %2831 = tail call float @llvm.nvvm.ex2.approx.f(float %2787) #2, !dbg !139 + br label %__nv_exp2f.exit102, !dbg !139 + +__nv_exp2f.exit102: ; preds = %2828, %2830 + %.0.i101 = phi float [ %2829, %2828 ], [ %2831, %2830 ], !dbg !139 + %2832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i103 = icmp eq i32 %2832, 0, !dbg !139 + br i1 %.not.i103, label %2835, label %2833, !dbg !139 + +2833: ; preds = %__nv_exp2f.exit102 + %2834 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2788) #2, !dbg !139 + br label %__nv_exp2f.exit105, !dbg !139 + +2835: ; preds = %__nv_exp2f.exit102 + %2836 = tail call float @llvm.nvvm.ex2.approx.f(float %2788) #2, !dbg !139 + br label %__nv_exp2f.exit105, !dbg !139 + +__nv_exp2f.exit105: ; preds = %2833, %2835 + %.0.i104 = phi float [ %2834, %2833 ], [ %2836, %2835 ], !dbg !139 + %2837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i106 = icmp eq i32 %2837, 0, !dbg !139 + br i1 %.not.i106, label %2840, label %2838, !dbg !139 + +2838: ; preds = %__nv_exp2f.exit105 + %2839 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2789) #2, !dbg !139 + br label %__nv_exp2f.exit108, !dbg !139 + +2840: ; preds = %__nv_exp2f.exit105 + %2841 = tail call float @llvm.nvvm.ex2.approx.f(float %2789) #2, !dbg !139 + br label %__nv_exp2f.exit108, !dbg !139 + +__nv_exp2f.exit108: ; preds = %2838, %2840 + %.0.i107 = phi float [ %2839, %2838 ], [ %2841, %2840 ], !dbg !139 + %2842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i109 = icmp eq i32 %2842, 0, !dbg !139 + br i1 %.not.i109, label %2845, label %2843, !dbg !139 + +2843: ; preds = %__nv_exp2f.exit108 + %2844 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2790) #2, !dbg !139 + br label %__nv_exp2f.exit111, !dbg !139 + +2845: ; preds = %__nv_exp2f.exit108 + %2846 = tail call float @llvm.nvvm.ex2.approx.f(float %2790) #2, !dbg !139 + br label %__nv_exp2f.exit111, !dbg !139 + +__nv_exp2f.exit111: ; preds = %2843, %2845 + %.0.i110 = phi float [ %2844, %2843 ], [ %2846, %2845 ], !dbg !139 + %2847 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i112 = icmp eq i32 %2847, 0, !dbg !139 + br i1 %.not.i112, label %2850, label %2848, !dbg !139 + +2848: ; preds = %__nv_exp2f.exit111 + %2849 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2791) #2, !dbg !139 + br label %__nv_exp2f.exit114, !dbg !139 + +2850: ; preds = %__nv_exp2f.exit111 + %2851 = tail call float @llvm.nvvm.ex2.approx.f(float %2791) #2, !dbg !139 + br label %__nv_exp2f.exit114, !dbg !139 + +__nv_exp2f.exit114: ; preds = %2848, %2850 + %.0.i113 = phi float [ %2849, %2848 ], [ %2851, %2850 ], !dbg !139 + %2852 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i115 = icmp eq i32 %2852, 0, !dbg !139 + br i1 %.not.i115, label %2855, label %2853, !dbg !139 + +2853: ; preds = %__nv_exp2f.exit114 + %2854 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2792) #2, !dbg !139 + br label %__nv_exp2f.exit117, !dbg !139 + +2855: ; preds = %__nv_exp2f.exit114 + %2856 = tail call float @llvm.nvvm.ex2.approx.f(float %2792) #2, !dbg !139 + br label %__nv_exp2f.exit117, !dbg !139 + +__nv_exp2f.exit117: ; preds = %2853, %2855 + %.0.i116 = phi float [ %2854, %2853 ], [ %2856, %2855 ], !dbg !139 + %2857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i118 = icmp eq i32 %2857, 0, !dbg !139 + br i1 %.not.i118, label %2860, label %2858, !dbg !139 + +2858: ; preds = %__nv_exp2f.exit117 + %2859 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2793) #2, !dbg !139 + br label %__nv_exp2f.exit120, !dbg !139 + +2860: ; preds = %__nv_exp2f.exit117 + %2861 = tail call float @llvm.nvvm.ex2.approx.f(float %2793) #2, !dbg !139 + br label %__nv_exp2f.exit120, !dbg !139 + +__nv_exp2f.exit120: ; preds = %2858, %2860 + %.0.i119 = phi float [ %2859, %2858 ], [ %2861, %2860 ], !dbg !139 + %2862 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i121 = icmp eq i32 %2862, 0, !dbg !139 + br i1 %.not.i121, label %2865, label %2863, !dbg !139 + +2863: ; preds = %__nv_exp2f.exit120 + %2864 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2794) #2, !dbg !139 + br label %__nv_exp2f.exit123, !dbg !139 + +2865: ; preds = %__nv_exp2f.exit120 + %2866 = tail call float @llvm.nvvm.ex2.approx.f(float %2794) #2, !dbg !139 + br label %__nv_exp2f.exit123, !dbg !139 + +__nv_exp2f.exit123: ; preds = %2863, %2865 + %.0.i122 = phi float [ %2864, %2863 ], [ %2866, %2865 ], !dbg !139 + %2867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i124 = icmp eq i32 %2867, 0, !dbg !139 + br i1 %.not.i124, label %2870, label %2868, !dbg !139 + +2868: ; preds = %__nv_exp2f.exit123 + %2869 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2795) #2, !dbg !139 + br label %__nv_exp2f.exit126, !dbg !139 + +2870: ; preds = %__nv_exp2f.exit123 + %2871 = tail call float @llvm.nvvm.ex2.approx.f(float %2795) #2, !dbg !139 + br label %__nv_exp2f.exit126, !dbg !139 + +__nv_exp2f.exit126: ; preds = %2868, %2870 + %.0.i125 = phi float [ %2869, %2868 ], [ %2871, %2870 ], !dbg !139 + %2872 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i127 = icmp eq i32 %2872, 0, !dbg !139 + br i1 %.not.i127, label %2875, label %2873, !dbg !139 + +2873: ; preds = %__nv_exp2f.exit126 + %2874 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2796) #2, !dbg !139 + br label %__nv_exp2f.exit129, !dbg !139 + +2875: ; preds = %__nv_exp2f.exit126 + %2876 = tail call float @llvm.nvvm.ex2.approx.f(float %2796) #2, !dbg !139 + br label %__nv_exp2f.exit129, !dbg !139 + +__nv_exp2f.exit129: ; preds = %2873, %2875 + %.0.i128 = phi float [ %2874, %2873 ], [ %2876, %2875 ], !dbg !139 + %2877 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i130 = icmp eq i32 %2877, 0, !dbg !139 + br i1 %.not.i130, label %2880, label %2878, !dbg !139 + +2878: ; preds = %__nv_exp2f.exit129 + %2879 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2797) #2, !dbg !139 + br label %__nv_exp2f.exit132, !dbg !139 + +2880: ; preds = %__nv_exp2f.exit129 + %2881 = tail call float @llvm.nvvm.ex2.approx.f(float %2797) #2, !dbg !139 + br label %__nv_exp2f.exit132, !dbg !139 + +__nv_exp2f.exit132: ; preds = %2878, %2880 + %.0.i131 = phi float [ %2879, %2878 ], [ %2881, %2880 ], !dbg !139 + %2882 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i133 = icmp eq i32 %2882, 0, !dbg !139 + br i1 %.not.i133, label %2885, label %2883, !dbg !139 + +2883: ; preds = %__nv_exp2f.exit132 + %2884 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2798) #2, !dbg !139 + br label %__nv_exp2f.exit135, !dbg !139 + +2885: ; preds = %__nv_exp2f.exit132 + %2886 = tail call float @llvm.nvvm.ex2.approx.f(float %2798) #2, !dbg !139 + br label %__nv_exp2f.exit135, !dbg !139 + +__nv_exp2f.exit135: ; preds = %2883, %2885 + %.0.i134 = phi float [ %2884, %2883 ], [ %2886, %2885 ], !dbg !139 + %2887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i136 = icmp eq i32 %2887, 0, !dbg !139 + br i1 %.not.i136, label %2890, label %2888, !dbg !139 + +2888: ; preds = %__nv_exp2f.exit135 + %2889 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2799) #2, !dbg !139 + br label %__nv_exp2f.exit138, !dbg !139 + +2890: ; preds = %__nv_exp2f.exit135 + %2891 = tail call float @llvm.nvvm.ex2.approx.f(float %2799) #2, !dbg !139 + br label %__nv_exp2f.exit138, !dbg !139 + +__nv_exp2f.exit138: ; preds = %2888, %2890 + %.0.i137 = phi float [ %2889, %2888 ], [ %2891, %2890 ], !dbg !139 + %2892 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i139 = icmp eq i32 %2892, 0, !dbg !139 + br i1 %.not.i139, label %2895, label %2893, !dbg !139 + +2893: ; preds = %__nv_exp2f.exit138 + %2894 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2800) #2, !dbg !139 + br label %__nv_exp2f.exit141, !dbg !139 + +2895: ; preds = %__nv_exp2f.exit138 + %2896 = tail call float @llvm.nvvm.ex2.approx.f(float %2800) #2, !dbg !139 + br label %__nv_exp2f.exit141, !dbg !139 + +__nv_exp2f.exit141: ; preds = %2893, %2895 + %.0.i140 = phi float [ %2894, %2893 ], [ %2896, %2895 ], !dbg !139 + %2897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i142 = icmp eq i32 %2897, 0, !dbg !139 + br i1 %.not.i142, label %2900, label %2898, !dbg !139 + +2898: ; preds = %__nv_exp2f.exit141 + %2899 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2801) #2, !dbg !139 + br label %__nv_exp2f.exit144, !dbg !139 + +2900: ; preds = %__nv_exp2f.exit141 + %2901 = tail call float @llvm.nvvm.ex2.approx.f(float %2801) #2, !dbg !139 + br label %__nv_exp2f.exit144, !dbg !139 + +__nv_exp2f.exit144: ; preds = %2898, %2900 + %.0.i143 = phi float [ %2899, %2898 ], [ %2901, %2900 ], !dbg !139 + %2902 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i145 = icmp eq i32 %2902, 0, !dbg !139 + br i1 %.not.i145, label %2905, label %2903, !dbg !139 + +2903: ; preds = %__nv_exp2f.exit144 + %2904 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2802) #2, !dbg !139 + br label %__nv_exp2f.exit147, !dbg !139 + +2905: ; preds = %__nv_exp2f.exit144 + %2906 = tail call float @llvm.nvvm.ex2.approx.f(float %2802) #2, !dbg !139 + br label %__nv_exp2f.exit147, !dbg !139 + +__nv_exp2f.exit147: ; preds = %2903, %2905 + %.0.i146 = phi float [ %2904, %2903 ], [ %2906, %2905 ], !dbg !139 + %2907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i148 = icmp eq i32 %2907, 0, !dbg !139 + br i1 %.not.i148, label %2910, label %2908, !dbg !139 + +2908: ; preds = %__nv_exp2f.exit147 + %2909 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2803) #2, !dbg !139 + br label %__nv_exp2f.exit150, !dbg !139 + +2910: ; preds = %__nv_exp2f.exit147 + %2911 = tail call float @llvm.nvvm.ex2.approx.f(float %2803) #2, !dbg !139 + br label %__nv_exp2f.exit150, !dbg !139 + +__nv_exp2f.exit150: ; preds = %2908, %2910 + %.0.i149 = phi float [ %2909, %2908 ], [ %2911, %2910 ], !dbg !139 + %2912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i151 = icmp eq i32 %2912, 0, !dbg !139 + br i1 %.not.i151, label %2915, label %2913, !dbg !139 + +2913: ; preds = %__nv_exp2f.exit150 + %2914 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2804) #2, !dbg !139 + br label %__nv_exp2f.exit153, !dbg !139 + +2915: ; preds = %__nv_exp2f.exit150 + %2916 = tail call float @llvm.nvvm.ex2.approx.f(float %2804) #2, !dbg !139 + br label %__nv_exp2f.exit153, !dbg !139 + +__nv_exp2f.exit153: ; preds = %2913, %2915 + %.0.i152 = phi float [ %2914, %2913 ], [ %2916, %2915 ], !dbg !139 + %2917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i154 = icmp eq i32 %2917, 0, !dbg !139 + br i1 %.not.i154, label %2920, label %2918, !dbg !139 + +2918: ; preds = %__nv_exp2f.exit153 + %2919 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2805) #2, !dbg !139 + br label %__nv_exp2f.exit156, !dbg !139 + +2920: ; preds = %__nv_exp2f.exit153 + %2921 = tail call float @llvm.nvvm.ex2.approx.f(float %2805) #2, !dbg !139 + br label %__nv_exp2f.exit156, !dbg !139 + +__nv_exp2f.exit156: ; preds = %2918, %2920 + %.0.i155 = phi float [ %2919, %2918 ], [ %2921, %2920 ], !dbg !139 + %2922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i157 = icmp eq i32 %2922, 0, !dbg !139 + br i1 %.not.i157, label %2925, label %2923, !dbg !139 + +2923: ; preds = %__nv_exp2f.exit156 + %2924 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2806) #2, !dbg !139 + br label %__nv_exp2f.exit159, !dbg !139 + +2925: ; preds = %__nv_exp2f.exit156 + %2926 = tail call float @llvm.nvvm.ex2.approx.f(float %2806) #2, !dbg !139 + br label %__nv_exp2f.exit159, !dbg !139 + +__nv_exp2f.exit159: ; preds = %2923, %2925 + %.0.i158 = phi float [ %2924, %2923 ], [ %2926, %2925 ], !dbg !139 + %2927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i160 = icmp eq i32 %2927, 0, !dbg !139 + br i1 %.not.i160, label %2930, label %2928, !dbg !139 + +2928: ; preds = %__nv_exp2f.exit159 + %2929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2807) #2, !dbg !139 + br label %__nv_exp2f.exit162, !dbg !139 + +2930: ; preds = %__nv_exp2f.exit159 + %2931 = tail call float @llvm.nvvm.ex2.approx.f(float %2807) #2, !dbg !139 + br label %__nv_exp2f.exit162, !dbg !139 + +__nv_exp2f.exit162: ; preds = %2928, %2930 + %.0.i161 = phi float [ %2929, %2928 ], [ %2931, %2930 ], !dbg !139 + %2932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i163 = icmp eq i32 %2932, 0, !dbg !139 + br i1 %.not.i163, label %2935, label %2933, !dbg !139 + +2933: ; preds = %__nv_exp2f.exit162 + %2934 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2808) #2, !dbg !139 + br label %__nv_exp2f.exit165, !dbg !139 + +2935: ; preds = %__nv_exp2f.exit162 + %2936 = tail call float @llvm.nvvm.ex2.approx.f(float %2808) #2, !dbg !139 + br label %__nv_exp2f.exit165, !dbg !139 + +__nv_exp2f.exit165: ; preds = %2933, %2935 + %.0.i164 = phi float [ %2934, %2933 ], [ %2936, %2935 ], !dbg !139 + %2937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i166 = icmp eq i32 %2937, 0, !dbg !139 + br i1 %.not.i166, label %2940, label %2938, !dbg !139 + +2938: ; preds = %__nv_exp2f.exit165 + %2939 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2809) #2, !dbg !139 + br label %__nv_exp2f.exit168, !dbg !139 + +2940: ; preds = %__nv_exp2f.exit165 + %2941 = tail call float @llvm.nvvm.ex2.approx.f(float %2809) #2, !dbg !139 + br label %__nv_exp2f.exit168, !dbg !139 + +__nv_exp2f.exit168: ; preds = %2938, %2940 + %.0.i167 = phi float [ %2939, %2938 ], [ %2941, %2940 ], !dbg !139 + %2942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i169 = icmp eq i32 %2942, 0, !dbg !139 + br i1 %.not.i169, label %2945, label %2943, !dbg !139 + +2943: ; preds = %__nv_exp2f.exit168 + %2944 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2810) #2, !dbg !139 + br label %__nv_exp2f.exit171, !dbg !139 + +2945: ; preds = %__nv_exp2f.exit168 + %2946 = tail call float @llvm.nvvm.ex2.approx.f(float %2810) #2, !dbg !139 + br label %__nv_exp2f.exit171, !dbg !139 + +__nv_exp2f.exit171: ; preds = %2943, %2945 + %.0.i170 = phi float [ %2944, %2943 ], [ %2946, %2945 ], !dbg !139 + %2947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i172 = icmp eq i32 %2947, 0, !dbg !139 + br i1 %.not.i172, label %2950, label %2948, !dbg !139 + +2948: ; preds = %__nv_exp2f.exit171 + %2949 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2811) #2, !dbg !139 + br label %__nv_exp2f.exit174, !dbg !139 + +2950: ; preds = %__nv_exp2f.exit171 + %2951 = tail call float @llvm.nvvm.ex2.approx.f(float %2811) #2, !dbg !139 + br label %__nv_exp2f.exit174, !dbg !139 + +__nv_exp2f.exit174: ; preds = %2948, %2950 + %.0.i173 = phi float [ %2949, %2948 ], [ %2951, %2950 ], !dbg !139 + %2952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i175 = icmp eq i32 %2952, 0, !dbg !139 + br i1 %.not.i175, label %2955, label %2953, !dbg !139 + +2953: ; preds = %__nv_exp2f.exit174 + %2954 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2812) #2, !dbg !139 + br label %__nv_exp2f.exit177, !dbg !139 + +2955: ; preds = %__nv_exp2f.exit174 + %2956 = tail call float @llvm.nvvm.ex2.approx.f(float %2812) #2, !dbg !139 + br label %__nv_exp2f.exit177, !dbg !139 + +__nv_exp2f.exit177: ; preds = %2953, %2955 + %.0.i176 = phi float [ %2954, %2953 ], [ %2956, %2955 ], !dbg !139 + %2957 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i178 = icmp eq i32 %2957, 0, !dbg !139 + br i1 %.not.i178, label %2960, label %2958, !dbg !139 + +2958: ; preds = %__nv_exp2f.exit177 + %2959 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2813) #2, !dbg !139 + br label %__nv_exp2f.exit180, !dbg !139 + +2960: ; preds = %__nv_exp2f.exit177 + %2961 = tail call float @llvm.nvvm.ex2.approx.f(float %2813) #2, !dbg !139 + br label %__nv_exp2f.exit180, !dbg !139 + +__nv_exp2f.exit180: ; preds = %2958, %2960 + %.0.i179 = phi float [ %2959, %2958 ], [ %2961, %2960 ], !dbg !139 + %2962 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i181 = icmp eq i32 %2962, 0, !dbg !139 + br i1 %.not.i181, label %2965, label %2963, !dbg !139 + +2963: ; preds = %__nv_exp2f.exit180 + %2964 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2814) #2, !dbg !139 + br label %__nv_exp2f.exit183, !dbg !139 + +2965: ; preds = %__nv_exp2f.exit180 + %2966 = tail call float @llvm.nvvm.ex2.approx.f(float %2814) #2, !dbg !139 + br label %__nv_exp2f.exit183, !dbg !139 + +__nv_exp2f.exit183: ; preds = %2963, %2965 + %.0.i182 = phi float [ %2964, %2963 ], [ %2966, %2965 ], !dbg !139 + %2967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i184 = icmp eq i32 %2967, 0, !dbg !139 + br i1 %.not.i184, label %2970, label %2968, !dbg !139 + +2968: ; preds = %__nv_exp2f.exit183 + %2969 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2815) #2, !dbg !139 + br label %__nv_exp2f.exit186, !dbg !139 + +2970: ; preds = %__nv_exp2f.exit183 + %2971 = tail call float @llvm.nvvm.ex2.approx.f(float %2815) #2, !dbg !139 + br label %__nv_exp2f.exit186, !dbg !139 + +__nv_exp2f.exit186: ; preds = %2968, %2970 + %.0.i185 = phi float [ %2969, %2968 ], [ %2971, %2970 ], !dbg !139 + %2972 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !139 + %.not.i187 = icmp eq i32 %2972, 0, !dbg !139 + br i1 %.not.i187, label %2975, label %2973, !dbg !139 + +2973: ; preds = %__nv_exp2f.exit186 + %2974 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2816) #2, !dbg !139 + br label %__nv_exp2f.exit189, !dbg !139 + +2975: ; preds = %__nv_exp2f.exit186 + %2976 = tail call float @llvm.nvvm.ex2.approx.f(float %2816) #2, !dbg !139 + br label %__nv_exp2f.exit189, !dbg !139 + +__nv_exp2f.exit189: ; preds = %2973, %2975 + %.0.i188 = phi float [ %2974, %2973 ], [ %2976, %2975 ], !dbg !139 + %2977 = fmul float %.pn464, %.0.i, !dbg !140 + %2978 = fmul float %.pn462, %.0.i92, !dbg !140 + %2979 = fadd float %.0.i95, %.0.i98, !dbg !141 + %2980 = fadd float %.0.i101, %.0.i104, !dbg !141 + %2981 = fadd float %2979, %.0.i107, !dbg !141 + %2982 = fadd float %2981, %.0.i110, !dbg !141 + %2983 = fadd float %2980, %.0.i113, !dbg !141 + %2984 = fadd float %2983, %.0.i116, !dbg !141 + %2985 = fadd float %2982, %.0.i119, !dbg !141 + %2986 = fadd float %2985, %.0.i122, !dbg !141 + %2987 = fadd float %2984, %.0.i125, !dbg !141 + %2988 = fadd float %2987, %.0.i128, !dbg !141 + %2989 = fadd float %2986, %.0.i131, !dbg !141 + %2990 = fadd float %2989, %.0.i134, !dbg !141 + %2991 = fadd float %2988, %.0.i137, !dbg !141 + %2992 = fadd float %2991, %.0.i140, !dbg !141 + %2993 = fadd float %2990, %.0.i143, !dbg !141 + %2994 = fadd float %2993, %.0.i146, !dbg !141 + %2995 = fadd float %2992, %.0.i149, !dbg !141 + %2996 = fadd float %2995, %.0.i152, !dbg !141 + %2997 = fadd float %2994, %.0.i155, !dbg !141 + %2998 = fadd float %2997, %.0.i158, !dbg !141 + %2999 = fadd float %2996, %.0.i161, !dbg !141 + %3000 = fadd float %2999, %.0.i164, !dbg !141 + %3001 = fadd float %2998, %.0.i167, !dbg !141 + %3002 = fadd float %3001, %.0.i170, !dbg !141 + %3003 = fadd float %3000, %.0.i173, !dbg !141 + %3004 = fadd float %3003, %.0.i176, !dbg !141 + %3005 = fadd float %3002, %.0.i179, !dbg !141 + %3006 = fadd float %3005, %.0.i182, !dbg !141 + %3007 = fadd float %3004, %.0.i185, !dbg !141 + %3008 = fadd float %3007, %.0.i188, !dbg !141 + %3009 = bitcast float %3006 to i32, !dbg !142 + %3010 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3009, i32 2, i32 31), !dbg !142 + %3011 = bitcast i32 %3010 to float, !dbg !142 + %3012 = fadd float %3006, %3011, !dbg !141 + %3013 = bitcast float %3012 to i32, !dbg !142 + %3014 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3013, i32 1, i32 31), !dbg !142 + %3015 = bitcast i32 %3014 to float, !dbg !142 + %3016 = fadd float %3012, %3015, !dbg !141 + %3017 = bitcast float %3008 to i32, !dbg !142 + %3018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3017, i32 2, i32 31), !dbg !142 + %3019 = bitcast i32 %3018 to float, !dbg !142 + %3020 = fadd float %3008, %3019, !dbg !141 + %3021 = bitcast float %3020 to i32, !dbg !142 + %3022 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3021, i32 1, i32 31), !dbg !142 + %3023 = bitcast i32 %3022 to float, !dbg !142 + %3024 = fadd float %3020, %3023, !dbg !141 + %3025 = fadd float %2977, %3016, !dbg !143 + %3026 = fadd float %2978, %3024, !dbg !143 + %3027 = fmul float %2588, %.0.i, !dbg !144 + %3028 = fmul float %2589, %.0.i, !dbg !144 + %3029 = fmul float %2590, %.0.i92, !dbg !144 + %3030 = fmul float %2591, %.0.i92, !dbg !144 + %3031 = fmul float %2592, %.0.i, !dbg !144 + %3032 = fmul float %2593, %.0.i, !dbg !144 + %3033 = fmul float %2594, %.0.i92, !dbg !144 + %3034 = fmul float %2595, %.0.i92, !dbg !144 + %3035 = fmul float %2596, %.0.i, !dbg !144 + %3036 = fmul float %2597, %.0.i, !dbg !144 + %3037 = fmul float %2598, %.0.i92, !dbg !144 + %3038 = fmul float %2599, %.0.i92, !dbg !144 + %3039 = fmul float %2600, %.0.i, !dbg !144 + %3040 = fmul float %2601, %.0.i, !dbg !144 + %3041 = fmul float %2602, %.0.i92, !dbg !144 + %3042 = fmul float %2603, %.0.i92, !dbg !144 + %3043 = fmul float %2604, %.0.i, !dbg !144 + %3044 = fmul float %2605, %.0.i, !dbg !144 + %3045 = fmul float %2606, %.0.i92, !dbg !144 + %3046 = fmul float %2607, %.0.i92, !dbg !144 + %3047 = fmul float %2608, %.0.i, !dbg !144 + %3048 = fmul float %2609, %.0.i, !dbg !144 + %3049 = fmul float %2610, %.0.i92, !dbg !144 + %3050 = fmul float %2611, %.0.i92, !dbg !144 + %3051 = fmul float %2612, %.0.i, !dbg !144 + %3052 = fmul float %2613, %.0.i, !dbg !144 + %3053 = fmul float %2614, %.0.i92, !dbg !144 + %3054 = fmul float %2615, %.0.i92, !dbg !144 + %3055 = fmul float %2616, %.0.i, !dbg !144 + %3056 = fmul float %2617, %.0.i, !dbg !144 + %3057 = fmul float %2618, %.0.i92, !dbg !144 + %3058 = fmul float %2619, %.0.i92, !dbg !144 + %3059 = fmul float %2620, %.0.i, !dbg !144 + %3060 = fmul float %2621, %.0.i, !dbg !144 + %3061 = fmul float %2622, %.0.i92, !dbg !144 + %3062 = fmul float %2623, %.0.i92, !dbg !144 + %3063 = fmul float %2624, %.0.i, !dbg !144 + %3064 = fmul float %2625, %.0.i, !dbg !144 + %3065 = fmul float %2626, %.0.i92, !dbg !144 + %3066 = fmul float %2627, %.0.i92, !dbg !144 + %3067 = fmul float %2628, %.0.i, !dbg !144 + %3068 = fmul float %2629, %.0.i, !dbg !144 + %3069 = fmul float %2630, %.0.i92, !dbg !144 + %3070 = fmul float %2631, %.0.i92, !dbg !144 + %3071 = fmul float %2632, %.0.i, !dbg !144 + %3072 = fmul float %2633, %.0.i, !dbg !144 + %3073 = fmul float %2634, %.0.i92, !dbg !144 + %3074 = fmul float %2635, %.0.i92, !dbg !144 + %3075 = fmul float %2636, %.0.i, !dbg !144 + %3076 = fmul float %2637, %.0.i, !dbg !144 + %3077 = fmul float %2638, %.0.i92, !dbg !144 + %3078 = fmul float %2639, %.0.i92, !dbg !144 + %3079 = fmul float %2640, %.0.i, !dbg !144 + %3080 = fmul float %2641, %.0.i, !dbg !144 + %3081 = fmul float %2642, %.0.i92, !dbg !144 + %3082 = fmul float %2643, %.0.i92, !dbg !144 + %3083 = fmul float %2644, %.0.i, !dbg !144 + %3084 = fmul float %2645, %.0.i, !dbg !144 + %3085 = fmul float %2646, %.0.i92, !dbg !144 + %3086 = fmul float %2647, %.0.i92, !dbg !144 + %3087 = fmul float %2648, %.0.i, !dbg !144 + %3088 = fmul float %2649, %.0.i, !dbg !144 + %3089 = fmul float %2650, %.0.i92, !dbg !144 + %3090 = fmul float %2651, %.0.i92, !dbg !144 + %3091 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2206, !dbg !125 + %3092 = insertelement <2 x float> poison, float %.0.i95, i64 0, !dbg !145 + %3093 = insertelement <2 x float> %3092, float %.0.i98, i64 1, !dbg !145 + %3094 = fptrunc <2 x float> %3093 to <2 x bfloat>, !dbg !145 + %3095 = insertelement <2 x float> poison, float %.0.i101, i64 0, !dbg !145 + %3096 = insertelement <2 x float> %3095, float %.0.i104, i64 1, !dbg !145 + %3097 = fptrunc <2 x float> %3096 to <2 x bfloat>, !dbg !145 + %3098 = insertelement <2 x float> poison, float %.0.i107, i64 0, !dbg !145 + %3099 = insertelement <2 x float> %3098, float %.0.i110, i64 1, !dbg !145 + %3100 = fptrunc <2 x float> %3099 to <2 x bfloat>, !dbg !145 + %3101 = insertelement <2 x float> poison, float %.0.i113, i64 0, !dbg !145 + %3102 = insertelement <2 x float> %3101, float %.0.i116, i64 1, !dbg !145 + %3103 = fptrunc <2 x float> %3102 to <2 x bfloat>, !dbg !145 + %3104 = insertelement <2 x float> poison, float %.0.i119, i64 0, !dbg !145 + %3105 = insertelement <2 x float> %3104, float %.0.i122, i64 1, !dbg !145 + %3106 = fptrunc <2 x float> %3105 to <2 x bfloat>, !dbg !145 + %3107 = insertelement <2 x float> poison, float %.0.i125, i64 0, !dbg !145 + %3108 = insertelement <2 x float> %3107, float %.0.i128, i64 1, !dbg !145 + %3109 = fptrunc <2 x float> %3108 to <2 x bfloat>, !dbg !145 + %3110 = insertelement <2 x float> poison, float %.0.i131, i64 0, !dbg !145 + %3111 = insertelement <2 x float> %3110, float %.0.i134, i64 1, !dbg !145 + %3112 = fptrunc <2 x float> %3111 to <2 x bfloat>, !dbg !145 + %3113 = insertelement <2 x float> poison, float %.0.i137, i64 0, !dbg !145 + %3114 = insertelement <2 x float> %3113, float %.0.i140, i64 1, !dbg !145 + %3115 = fptrunc <2 x float> %3114 to <2 x bfloat>, !dbg !145 + %3116 = insertelement <2 x float> poison, float %.0.i143, i64 0, !dbg !145 + %3117 = insertelement <2 x float> %3116, float %.0.i146, i64 1, !dbg !145 + %3118 = fptrunc <2 x float> %3117 to <2 x bfloat>, !dbg !145 + %3119 = insertelement <2 x float> poison, float %.0.i149, i64 0, !dbg !145 + %3120 = insertelement <2 x float> %3119, float %.0.i152, i64 1, !dbg !145 + %3121 = fptrunc <2 x float> %3120 to <2 x bfloat>, !dbg !145 + %3122 = insertelement <2 x float> poison, float %.0.i155, i64 0, !dbg !145 + %3123 = insertelement <2 x float> %3122, float %.0.i158, i64 1, !dbg !145 + %3124 = fptrunc <2 x float> %3123 to <2 x bfloat>, !dbg !145 + %3125 = insertelement <2 x float> poison, float %.0.i161, i64 0, !dbg !145 + %3126 = insertelement <2 x float> %3125, float %.0.i164, i64 1, !dbg !145 + %3127 = fptrunc <2 x float> %3126 to <2 x bfloat>, !dbg !145 + %3128 = insertelement <2 x float> poison, float %.0.i167, i64 0, !dbg !145 + %3129 = insertelement <2 x float> %3128, float %.0.i170, i64 1, !dbg !145 + %3130 = fptrunc <2 x float> %3129 to <2 x bfloat>, !dbg !145 + %3131 = insertelement <2 x float> poison, float %.0.i173, i64 0, !dbg !145 + %3132 = insertelement <2 x float> %3131, float %.0.i176, i64 1, !dbg !145 + %3133 = fptrunc <2 x float> %3132 to <2 x bfloat>, !dbg !145 + %3134 = insertelement <2 x float> poison, float %.0.i179, i64 0, !dbg !145 + %3135 = insertelement <2 x float> %3134, float %.0.i182, i64 1, !dbg !145 + %3136 = fptrunc <2 x float> %3135 to <2 x bfloat>, !dbg !145 + %3137 = insertelement <2 x float> poison, float %.0.i185, i64 0, !dbg !145 + %3138 = insertelement <2 x float> %3137, float %.0.i188, i64 1, !dbg !145 + %3139 = fptrunc <2 x float> %3138 to <2 x bfloat>, !dbg !145 + %3140 = bitcast <2 x bfloat> %3094 to i32, !dbg !146 + %3141 = bitcast <2 x bfloat> %3097 to i32, !dbg !146 + %3142 = bitcast <2 x bfloat> %3100 to i32, !dbg !146 + %3143 = bitcast <2 x bfloat> %3103 to i32, !dbg !146 + %3144 = bitcast <2 x bfloat> %3106 to i32, !dbg !146 + %3145 = bitcast <2 x bfloat> %3109 to i32, !dbg !146 + %3146 = bitcast <2 x bfloat> %3112 to i32, !dbg !146 + %3147 = bitcast <2 x bfloat> %3115 to i32, !dbg !146 + %3148 = bitcast <2 x bfloat> %3118 to i32, !dbg !146 + %3149 = bitcast <2 x bfloat> %3121 to i32, !dbg !146 + %3150 = bitcast <2 x bfloat> %3124 to i32, !dbg !146 + %3151 = bitcast <2 x bfloat> %3127 to i32, !dbg !146 + %3152 = bitcast <2 x bfloat> %3130 to i32, !dbg !146 + %3153 = bitcast <2 x bfloat> %3133 to i32, !dbg !146 + %3154 = bitcast <2 x bfloat> %3136 to i32, !dbg !146 + %3155 = bitcast <2 x bfloat> %3139 to i32, !dbg !146 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !146 + %3156 = ptrtoint ptr addrspace(3) %3091 to i32, !dbg !146 + %3157 = lshr exact i32 %3156, 4, !dbg !146 + %3158 = and i32 %3157, 16383, !dbg !146 + %3159 = zext nneg i32 %3158 to i64, !dbg !146 + %3160 = or disjoint i64 %3159, 4611686293338849280, !dbg !146 + %3161 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3027, float %3028, float %3029, float %3030, float %3031, float %3032, float %3033, float %3034, float %3035, float %3036, float %3037, float %3038, float %3039, float %3040, float %3041, float %3042, float %3043, float %3044, float %3045, float %3046, float %3047, float %3048, float %3049, float %3050, float %3051, float %3052, float %3053, float %3054, float %3055, float %3056, float %3057, float %3058, float %3059, float %3060, float %3061, float %3062, float %3063, float %3064, float %3065, float %3066, float %3067, float %3068, float %3069, float %3070, float %3071, float %3072, float %3073, float %3074, float %3075, float %3076, float %3077, float %3078, float %3079, float %3080, float %3081, float %3082, float %3083, float %3084, float %3085, float %3086, float %3087, float %3088, float %3089, float %3090, i32 %3140, i32 %3141, i32 %3142, i32 %3143, i64 %3160, i1 true) #2, !dbg !146 + %3162 = add i32 %3156, 2048, !dbg !146 + %3163 = lshr exact i32 %3162, 4, !dbg !146 + %3164 = and i32 %3163, 16383, !dbg !146 + %3165 = zext nneg i32 %3164 to i64, !dbg !146 + %3166 = or disjoint i64 %3165, 4611686293338849280, !dbg !146 + %3167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 0, !dbg !146 + %3168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 1, !dbg !146 + %3169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 2, !dbg !146 + %3170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 3, !dbg !146 + %3171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 4, !dbg !146 + %3172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 5, !dbg !146 + %3173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 6, !dbg !146 + %3174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 7, !dbg !146 + %3175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 8, !dbg !146 + %3176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 9, !dbg !146 + %3177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 10, !dbg !146 + %3178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 11, !dbg !146 + %3179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 12, !dbg !146 + %3180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 13, !dbg !146 + %3181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 14, !dbg !146 + %3182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 15, !dbg !146 + %3183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 16, !dbg !146 + %3184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 17, !dbg !146 + %3185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 18, !dbg !146 + %3186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 19, !dbg !146 + %3187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 20, !dbg !146 + %3188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 21, !dbg !146 + %3189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 22, !dbg !146 + %3190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 23, !dbg !146 + %3191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 24, !dbg !146 + %3192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 25, !dbg !146 + %3193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 26, !dbg !146 + %3194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 27, !dbg !146 + %3195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 28, !dbg !146 + %3196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 29, !dbg !146 + %3197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 30, !dbg !146 + %3198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 31, !dbg !146 + %3199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 32, !dbg !146 + %3200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 33, !dbg !146 + %3201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 34, !dbg !146 + %3202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 35, !dbg !146 + %3203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 36, !dbg !146 + %3204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 37, !dbg !146 + %3205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 38, !dbg !146 + %3206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 39, !dbg !146 + %3207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 40, !dbg !146 + %3208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 41, !dbg !146 + %3209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 42, !dbg !146 + %3210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 43, !dbg !146 + %3211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 44, !dbg !146 + %3212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 45, !dbg !146 + %3213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 46, !dbg !146 + %3214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 47, !dbg !146 + %3215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 48, !dbg !146 + %3216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 49, !dbg !146 + %3217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 50, !dbg !146 + %3218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 51, !dbg !146 + %3219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 52, !dbg !146 + %3220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 53, !dbg !146 + %3221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 54, !dbg !146 + %3222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 55, !dbg !146 + %3223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 56, !dbg !146 + %3224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 57, !dbg !146 + %3225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 58, !dbg !146 + %3226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 59, !dbg !146 + %3227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 60, !dbg !146 + %3228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 61, !dbg !146 + %3229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 62, !dbg !146 + %3230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3161, 63, !dbg !146 + %3231 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3167, float %3168, float %3169, float %3170, float %3171, float %3172, float %3173, float %3174, float %3175, float %3176, float %3177, float %3178, float %3179, float %3180, float %3181, float %3182, float %3183, float %3184, float %3185, float %3186, float %3187, float %3188, float %3189, float %3190, float %3191, float %3192, float %3193, float %3194, float %3195, float %3196, float %3197, float %3198, float %3199, float %3200, float %3201, float %3202, float %3203, float %3204, float %3205, float %3206, float %3207, float %3208, float %3209, float %3210, float %3211, float %3212, float %3213, float %3214, float %3215, float %3216, float %3217, float %3218, float %3219, float %3220, float %3221, float %3222, float %3223, float %3224, float %3225, float %3226, float %3227, float %3228, float %3229, float %3230, i32 %3144, i32 %3145, i32 %3146, i32 %3147, i64 %3166, i1 true) #2, !dbg !146 + %3232 = add i32 %3156, 4096, !dbg !146 + %3233 = lshr exact i32 %3232, 4, !dbg !146 + %3234 = and i32 %3233, 16383, !dbg !146 + %3235 = zext nneg i32 %3234 to i64, !dbg !146 + %3236 = or disjoint i64 %3235, 4611686293338849280, !dbg !146 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 0, !dbg !146 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 1, !dbg !146 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 2, !dbg !146 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 3, !dbg !146 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 4, !dbg !146 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 5, !dbg !146 + %3243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 6, !dbg !146 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 7, !dbg !146 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 8, !dbg !146 + %3246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 9, !dbg !146 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 10, !dbg !146 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 11, !dbg !146 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 12, !dbg !146 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 13, !dbg !146 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 14, !dbg !146 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 15, !dbg !146 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 16, !dbg !146 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 17, !dbg !146 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 18, !dbg !146 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 19, !dbg !146 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 20, !dbg !146 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 21, !dbg !146 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 22, !dbg !146 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 23, !dbg !146 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 24, !dbg !146 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 25, !dbg !146 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 26, !dbg !146 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 27, !dbg !146 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 28, !dbg !146 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 29, !dbg !146 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 30, !dbg !146 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 31, !dbg !146 + %3269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 32, !dbg !146 + %3270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 33, !dbg !146 + %3271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 34, !dbg !146 + %3272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 35, !dbg !146 + %3273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 36, !dbg !146 + %3274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 37, !dbg !146 + %3275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 38, !dbg !146 + %3276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 39, !dbg !146 + %3277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 40, !dbg !146 + %3278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 41, !dbg !146 + %3279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 42, !dbg !146 + %3280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 43, !dbg !146 + %3281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 44, !dbg !146 + %3282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 45, !dbg !146 + %3283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 46, !dbg !146 + %3284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 47, !dbg !146 + %3285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 48, !dbg !146 + %3286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 49, !dbg !146 + %3287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 50, !dbg !146 + %3288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 51, !dbg !146 + %3289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 52, !dbg !146 + %3290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 53, !dbg !146 + %3291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 54, !dbg !146 + %3292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 55, !dbg !146 + %3293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 56, !dbg !146 + %3294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 57, !dbg !146 + %3295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 58, !dbg !146 + %3296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 59, !dbg !146 + %3297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 60, !dbg !146 + %3298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 61, !dbg !146 + %3299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 62, !dbg !146 + %3300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3231, 63, !dbg !146 + %3301 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, float %3269, float %3270, float %3271, float %3272, float %3273, float %3274, float %3275, float %3276, float %3277, float %3278, float %3279, float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, i32 %3148, i32 %3149, i32 %3150, i32 %3151, i64 %3236, i1 true) #2, !dbg !146 + %3302 = add i32 %3156, 6144, !dbg !146 + %3303 = lshr exact i32 %3302, 4, !dbg !146 + %3304 = and i32 %3303, 16383, !dbg !146 + %3305 = zext nneg i32 %3304 to i64, !dbg !146 + %3306 = or disjoint i64 %3305, 4611686293338849280, !dbg !146 + %3307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 0, !dbg !146 + %3308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 1, !dbg !146 + %3309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 2, !dbg !146 + %3310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 3, !dbg !146 + %3311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 4, !dbg !146 + %3312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 5, !dbg !146 + %3313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 6, !dbg !146 + %3314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 7, !dbg !146 + %3315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 8, !dbg !146 + %3316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 9, !dbg !146 + %3317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 10, !dbg !146 + %3318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 11, !dbg !146 + %3319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 12, !dbg !146 + %3320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 13, !dbg !146 + %3321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 14, !dbg !146 + %3322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 15, !dbg !146 + %3323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 16, !dbg !146 + %3324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 17, !dbg !146 + %3325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 18, !dbg !146 + %3326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 19, !dbg !146 + %3327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 20, !dbg !146 + %3328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 21, !dbg !146 + %3329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 22, !dbg !146 + %3330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 23, !dbg !146 + %3331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 24, !dbg !146 + %3332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 25, !dbg !146 + %3333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 26, !dbg !146 + %3334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 27, !dbg !146 + %3335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 28, !dbg !146 + %3336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 29, !dbg !146 + %3337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 30, !dbg !146 + %3338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 31, !dbg !146 + %3339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 32, !dbg !146 + %3340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 33, !dbg !146 + %3341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 34, !dbg !146 + %3342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 35, !dbg !146 + %3343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 36, !dbg !146 + %3344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 37, !dbg !146 + %3345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 38, !dbg !146 + %3346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 39, !dbg !146 + %3347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 40, !dbg !146 + %3348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 41, !dbg !146 + %3349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 42, !dbg !146 + %3350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 43, !dbg !146 + %3351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 44, !dbg !146 + %3352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 45, !dbg !146 + %3353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 46, !dbg !146 + %3354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 47, !dbg !146 + %3355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 48, !dbg !146 + %3356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 49, !dbg !146 + %3357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 50, !dbg !146 + %3358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 51, !dbg !146 + %3359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 52, !dbg !146 + %3360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 53, !dbg !146 + %3361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 54, !dbg !146 + %3362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 55, !dbg !146 + %3363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 56, !dbg !146 + %3364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 57, !dbg !146 + %3365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 58, !dbg !146 + %3366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 59, !dbg !146 + %3367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 60, !dbg !146 + %3368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 61, !dbg !146 + %3369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 62, !dbg !146 + %3370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3301, 63, !dbg !146 + %3371 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3307, float %3308, float %3309, float %3310, float %3311, float %3312, float %3313, float %3314, float %3315, float %3316, float %3317, float %3318, float %3319, float %3320, float %3321, float %3322, float %3323, float %3324, float %3325, float %3326, float %3327, float %3328, float %3329, float %3330, float %3331, float %3332, float %3333, float %3334, float %3335, float %3336, float %3337, float %3338, float %3339, float %3340, float %3341, float %3342, float %3343, float %3344, float %3345, float %3346, float %3347, float %3348, float %3349, float %3350, float %3351, float %3352, float %3353, float %3354, float %3355, float %3356, float %3357, float %3358, float %3359, float %3360, float %3361, float %3362, float %3363, float %3364, float %3365, float %3366, float %3367, float %3368, float %3369, float %3370, i32 %3152, i32 %3153, i32 %3154, i32 %3155, i64 %3306, i1 true) #2, !dbg !146 + %3372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 0, !dbg !146 + %3373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 1, !dbg !146 + %3374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 2, !dbg !146 + %3375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 3, !dbg !146 + %3376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 4, !dbg !146 + %3377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 5, !dbg !146 + %3378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 6, !dbg !146 + %3379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 7, !dbg !146 + %3380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 8, !dbg !146 + %3381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 9, !dbg !146 + %3382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 10, !dbg !146 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 11, !dbg !146 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 12, !dbg !146 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 13, !dbg !146 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 14, !dbg !146 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 15, !dbg !146 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 16, !dbg !146 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 17, !dbg !146 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 18, !dbg !146 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 19, !dbg !146 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 20, !dbg !146 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 21, !dbg !146 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 22, !dbg !146 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 23, !dbg !146 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 24, !dbg !146 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 25, !dbg !146 + %3398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 26, !dbg !146 + %3399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 27, !dbg !146 + %3400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 28, !dbg !146 + %3401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 29, !dbg !146 + %3402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 30, !dbg !146 + %3403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 31, !dbg !146 + %3404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 32, !dbg !146 + %3405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 33, !dbg !146 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 34, !dbg !146 + %3407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 35, !dbg !146 + %3408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 36, !dbg !146 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 37, !dbg !146 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 38, !dbg !146 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 39, !dbg !146 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 40, !dbg !146 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 41, !dbg !146 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 42, !dbg !146 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 43, !dbg !146 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 44, !dbg !146 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 45, !dbg !146 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 46, !dbg !146 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 47, !dbg !146 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 48, !dbg !146 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 49, !dbg !146 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 50, !dbg !146 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 51, !dbg !146 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 52, !dbg !146 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 53, !dbg !146 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 54, !dbg !146 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 55, !dbg !146 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 56, !dbg !146 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 57, !dbg !146 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 58, !dbg !146 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 59, !dbg !146 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 60, !dbg !146 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 61, !dbg !146 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 62, !dbg !146 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3371, 63, !dbg !146 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !146 + %3436 = add nuw nsw i32 %2199, 1, !dbg !119 + %3437 = lshr i32 %3436, 1, !dbg !147 + %3438 = zext nneg i32 %3437 to i64, !dbg !148 + %3439 = getelementptr i32, ptr addrspace(1) %1997, i64 %3438, !dbg !148 + %3440 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !149 + %3441 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3439, i64 %3440, i1 %2202) #2, !dbg !149 + %3442 = add nuw nsw i32 %3437, 1, !dbg !150 + %3443 = icmp slt i32 %3442, %2001, !dbg !151 + %3444 = getelementptr i8, ptr addrspace(1) %3439, i64 4, !dbg !152 + %3445 = and i1 %2202, %3443, !dbg !119 + %3446 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !153 + %3447 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3444, i64 %3446, i1 %3445) #2, !dbg !153 + %3448 = and i32 %2199, 1, !dbg !154 + %3449 = sub i32 %3447, %3441, !dbg !155 + %3450 = shl i32 %3449, 7, !dbg !156 + %3451 = add i32 %3450, -64, !dbg !157 + %3452 = xor i32 %3448, 1, !dbg !158 + %3453 = mul nuw nsw i32 %3451, %3452, !dbg !158 + %3454 = shl nuw nsw i32 %3448, 6, !dbg !159 + %3455 = add i32 %2198, %3454, !dbg !160 + %3456 = add i32 %3455, %3453, !dbg !161 + %3457 = add i32 %2197, 1, !dbg !119 + %3458 = icmp sgt i32 %3457, 2, !dbg !119 + %3459 = select i1 %3458, i32 0, i32 %3457, !dbg !119 + %3460 = add i32 %3456, %1999, !dbg !126 + %3461 = add i32 %3460, %39, !dbg !121 + %3462 = add i32 %3460, %40, !dbg !121 + %3463 = add i32 %3460, %41, !dbg !121 + %3464 = add i32 %3460, %42, !dbg !121 + %3465 = shl i32 %3461, 7, !dbg !122 + %3466 = shl i32 %3462, 7, !dbg !122 + %3467 = shl i32 %3463, 7, !dbg !122 + %3468 = shl i32 %3464, 7, !dbg !122 + %3469 = sext i32 %3465 to i64, !dbg !123 + %3470 = sext i32 %3466 to i64, !dbg !123 + %3471 = sext i32 %3467 to i64, !dbg !123 + %3472 = sext i32 %3468 to i64, !dbg !123 + %gep362 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3469, !dbg !124 + %gep364 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3470, !dbg !124 + %gep366 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3471, !dbg !124 + %gep368 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3472, !dbg !124 + %3473 = shl i32 %3459, 13, !dbg !125 + %3474 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %3473, !dbg !125 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !125 + %3475 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %220, !dbg !125 + %3476 = select i1 %2201, i32 16, i32 0, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3475, ptr addrspace(1) %gep362, i32 %3476) #2, !dbg !125 + %3477 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %223, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3477, ptr addrspace(1) %gep364, i32 %3476) #2, !dbg !125 + %3478 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %225, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3478, ptr addrspace(1) %gep366, i32 %3476) #2, !dbg !125 + %3479 = getelementptr inbounds nuw i8, ptr addrspace(3) %3474, i32 %227, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3479, ptr addrspace(1) %gep368, i32 %3476) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %gep370 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3469, !dbg !124 + %gep372 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3470, !dbg !124 + %gep374 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3471, !dbg !124 + %gep376 = getelementptr bfloat, ptr addrspace(1) %invariant.gep335, i64 %3472, !dbg !124 + %3480 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %3473, !dbg !125 + %3481 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %220, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3481, ptr addrspace(1) %gep370, i32 %3476) #2, !dbg !125 + %3482 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %223, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3482, ptr addrspace(1) %gep372, i32 %3476) #2, !dbg !125 + %3483 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %225, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3483, ptr addrspace(1) %gep374, i32 %3476) #2, !dbg !125 + %3484 = getelementptr inbounds nuw i8, ptr addrspace(3) %3480, i32 %227, !dbg !125 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3484, ptr addrspace(1) %gep376, i32 %3476) #2, !dbg !125 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !125 + %exitcond390.not = icmp eq i32 %3436, %2128, !dbg !119 + br i1 %exitcond390.not, label %._crit_edge379, label %2195, !dbg !119 + +._crit_edge379: ; preds = %__nv_exp2f.exit189, %._crit_edge + %3485 = phi float [ %2064, %._crit_edge ], [ %3372, %__nv_exp2f.exit189 ], !dbg !44 + %3486 = phi float [ %2065, %._crit_edge ], [ %3373, %__nv_exp2f.exit189 ], !dbg !44 + %3487 = phi float [ %2066, %._crit_edge ], [ %3374, %__nv_exp2f.exit189 ], !dbg !44 + %3488 = phi float [ %2067, %._crit_edge ], [ %3375, %__nv_exp2f.exit189 ], !dbg !44 + %3489 = phi float [ %2068, %._crit_edge ], [ %3376, %__nv_exp2f.exit189 ], !dbg !44 + %3490 = phi float [ %2069, %._crit_edge ], [ %3377, %__nv_exp2f.exit189 ], !dbg !44 + %3491 = phi float [ %2070, %._crit_edge ], [ %3378, %__nv_exp2f.exit189 ], !dbg !44 + %3492 = phi float [ %2071, %._crit_edge ], [ %3379, %__nv_exp2f.exit189 ], !dbg !44 + %3493 = phi float [ %2072, %._crit_edge ], [ %3380, %__nv_exp2f.exit189 ], !dbg !44 + %3494 = phi float [ %2073, %._crit_edge ], [ %3381, %__nv_exp2f.exit189 ], !dbg !44 + %3495 = phi float [ %2074, %._crit_edge ], [ %3382, %__nv_exp2f.exit189 ], !dbg !44 + %3496 = phi float [ %2075, %._crit_edge ], [ %3383, %__nv_exp2f.exit189 ], !dbg !44 + %3497 = phi float [ %2076, %._crit_edge ], [ %3384, %__nv_exp2f.exit189 ], !dbg !44 + %3498 = phi float [ %2077, %._crit_edge ], [ %3385, %__nv_exp2f.exit189 ], !dbg !44 + %3499 = phi float [ %2078, %._crit_edge ], [ %3386, %__nv_exp2f.exit189 ], !dbg !44 + %3500 = phi float [ %2079, %._crit_edge ], [ %3387, %__nv_exp2f.exit189 ], !dbg !44 + %3501 = phi float [ %2080, %._crit_edge ], [ %3388, %__nv_exp2f.exit189 ], !dbg !44 + %3502 = phi float [ %2081, %._crit_edge ], [ %3389, %__nv_exp2f.exit189 ], !dbg !44 + %3503 = phi float [ %2082, %._crit_edge ], [ %3390, %__nv_exp2f.exit189 ], !dbg !44 + %3504 = phi float [ %2083, %._crit_edge ], [ %3391, %__nv_exp2f.exit189 ], !dbg !44 + %3505 = phi float [ %2084, %._crit_edge ], [ %3392, %__nv_exp2f.exit189 ], !dbg !44 + %3506 = phi float [ %2085, %._crit_edge ], [ %3393, %__nv_exp2f.exit189 ], !dbg !44 + %3507 = phi float [ %2086, %._crit_edge ], [ %3394, %__nv_exp2f.exit189 ], !dbg !44 + %3508 = phi float [ %2087, %._crit_edge ], [ %3395, %__nv_exp2f.exit189 ], !dbg !44 + %3509 = phi float [ %2088, %._crit_edge ], [ %3396, %__nv_exp2f.exit189 ], !dbg !44 + %3510 = phi float [ %2089, %._crit_edge ], [ %3397, %__nv_exp2f.exit189 ], !dbg !44 + %3511 = phi float [ %2090, %._crit_edge ], [ %3398, %__nv_exp2f.exit189 ], !dbg !44 + %3512 = phi float [ %2091, %._crit_edge ], [ %3399, %__nv_exp2f.exit189 ], !dbg !44 + %3513 = phi float [ %2092, %._crit_edge ], [ %3400, %__nv_exp2f.exit189 ], !dbg !44 + %3514 = phi float [ %2093, %._crit_edge ], [ %3401, %__nv_exp2f.exit189 ], !dbg !44 + %3515 = phi float [ %2094, %._crit_edge ], [ %3402, %__nv_exp2f.exit189 ], !dbg !44 + %3516 = phi float [ %2095, %._crit_edge ], [ %3403, %__nv_exp2f.exit189 ], !dbg !44 + %3517 = phi float [ %2096, %._crit_edge ], [ %3404, %__nv_exp2f.exit189 ], !dbg !44 + %3518 = phi float [ %2097, %._crit_edge ], [ %3405, %__nv_exp2f.exit189 ], !dbg !44 + %3519 = phi float [ %2098, %._crit_edge ], [ %3406, %__nv_exp2f.exit189 ], !dbg !44 + %3520 = phi float [ %2099, %._crit_edge ], [ %3407, %__nv_exp2f.exit189 ], !dbg !44 + %3521 = phi float [ %2100, %._crit_edge ], [ %3408, %__nv_exp2f.exit189 ], !dbg !44 + %3522 = phi float [ %2101, %._crit_edge ], [ %3409, %__nv_exp2f.exit189 ], !dbg !44 + %3523 = phi float [ %2102, %._crit_edge ], [ %3410, %__nv_exp2f.exit189 ], !dbg !44 + %3524 = phi float [ %2103, %._crit_edge ], [ %3411, %__nv_exp2f.exit189 ], !dbg !44 + %3525 = phi float [ %2104, %._crit_edge ], [ %3412, %__nv_exp2f.exit189 ], !dbg !44 + %3526 = phi float [ %2105, %._crit_edge ], [ %3413, %__nv_exp2f.exit189 ], !dbg !44 + %3527 = phi float [ %2106, %._crit_edge ], [ %3414, %__nv_exp2f.exit189 ], !dbg !44 + %3528 = phi float [ %2107, %._crit_edge ], [ %3415, %__nv_exp2f.exit189 ], !dbg !44 + %3529 = phi float [ %2108, %._crit_edge ], [ %3416, %__nv_exp2f.exit189 ], !dbg !44 + %3530 = phi float [ %2109, %._crit_edge ], [ %3417, %__nv_exp2f.exit189 ], !dbg !44 + %3531 = phi float [ %2110, %._crit_edge ], [ %3418, %__nv_exp2f.exit189 ], !dbg !44 + %3532 = phi float [ %2111, %._crit_edge ], [ %3419, %__nv_exp2f.exit189 ], !dbg !44 + %3533 = phi float [ %2112, %._crit_edge ], [ %3420, %__nv_exp2f.exit189 ], !dbg !44 + %3534 = phi float [ %2113, %._crit_edge ], [ %3421, %__nv_exp2f.exit189 ], !dbg !44 + %3535 = phi float [ %2114, %._crit_edge ], [ %3422, %__nv_exp2f.exit189 ], !dbg !44 + %3536 = phi float [ %2115, %._crit_edge ], [ %3423, %__nv_exp2f.exit189 ], !dbg !44 + %3537 = phi float [ %2116, %._crit_edge ], [ %3424, %__nv_exp2f.exit189 ], !dbg !44 + %3538 = phi float [ %2117, %._crit_edge ], [ %3425, %__nv_exp2f.exit189 ], !dbg !44 + %3539 = phi float [ %2118, %._crit_edge ], [ %3426, %__nv_exp2f.exit189 ], !dbg !44 + %3540 = phi float [ %2119, %._crit_edge ], [ %3427, %__nv_exp2f.exit189 ], !dbg !44 + %3541 = phi float [ %2120, %._crit_edge ], [ %3428, %__nv_exp2f.exit189 ], !dbg !44 + %3542 = phi float [ %2121, %._crit_edge ], [ %3429, %__nv_exp2f.exit189 ], !dbg !44 + %3543 = phi float [ %2122, %._crit_edge ], [ %3430, %__nv_exp2f.exit189 ], !dbg !44 + %3544 = phi float [ %2123, %._crit_edge ], [ %3431, %__nv_exp2f.exit189 ], !dbg !44 + %3545 = phi float [ %2124, %._crit_edge ], [ %3432, %__nv_exp2f.exit189 ], !dbg !44 + %3546 = phi float [ %2125, %._crit_edge ], [ %3433, %__nv_exp2f.exit189 ], !dbg !44 + %3547 = phi float [ %2126, %._crit_edge ], [ %3434, %__nv_exp2f.exit189 ], !dbg !44 + %3548 = phi float [ %2127, %._crit_edge ], [ %3435, %__nv_exp2f.exit189 ], !dbg !44 + %3549 = phi float [ %1993, %._crit_edge ], [ %3025, %__nv_exp2f.exit189 ], !dbg !44 + %3550 = phi float [ %1994, %._crit_edge ], [ %3026, %__nv_exp2f.exit189 ], !dbg !44 + %3551 = phi <2 x float> [ %1995, %._crit_edge ], [ %2764, %__nv_exp2f.exit189 ], !dbg !44 + %3552 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, float %3493, float %3494, float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548) #2, !dbg !119 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !119 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !119 + %3553 = fcmp oeq float %3549, 0.000000e+00, !dbg !162 + %3554 = fcmp oeq float %3550, 0.000000e+00, !dbg !162 + %3555 = select i1 %3553, float 1.000000e+00, float %3549, !dbg !163 + %3556 = select i1 %3554, float 1.000000e+00, float %3550, !dbg !163 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 0, !dbg !164 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 1, !dbg !164 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 2, !dbg !164 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 3, !dbg !164 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 4, !dbg !164 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 5, !dbg !164 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 6, !dbg !164 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 7, !dbg !164 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 8, !dbg !164 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 9, !dbg !164 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 10, !dbg !164 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 11, !dbg !164 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 12, !dbg !164 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 13, !dbg !164 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 14, !dbg !164 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 15, !dbg !164 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 16, !dbg !164 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 17, !dbg !164 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 18, !dbg !164 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 19, !dbg !164 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 20, !dbg !164 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 21, !dbg !164 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 22, !dbg !164 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 23, !dbg !164 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 24, !dbg !164 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 25, !dbg !164 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 26, !dbg !164 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 27, !dbg !164 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 28, !dbg !164 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 29, !dbg !164 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 30, !dbg !164 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 31, !dbg !164 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 32, !dbg !164 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 33, !dbg !164 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 34, !dbg !164 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 35, !dbg !164 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 36, !dbg !164 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 37, !dbg !164 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 38, !dbg !164 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 39, !dbg !164 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 40, !dbg !164 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 41, !dbg !164 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 42, !dbg !164 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 43, !dbg !164 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 44, !dbg !164 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 45, !dbg !164 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 46, !dbg !164 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 47, !dbg !164 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 48, !dbg !164 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 49, !dbg !164 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 50, !dbg !164 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 51, !dbg !164 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 52, !dbg !164 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 53, !dbg !164 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 54, !dbg !164 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 55, !dbg !164 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 56, !dbg !164 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 57, !dbg !164 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 58, !dbg !164 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 59, !dbg !164 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 60, !dbg !164 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 61, !dbg !164 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 62, !dbg !164 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3552, 63, !dbg !164 + %3621 = tail call float @llvm.nvvm.div.full(float %3557, float %3555), !dbg !164 + %3622 = tail call float @llvm.nvvm.div.full(float %3558, float %3555), !dbg !164 + %3623 = tail call float @llvm.nvvm.div.full(float %3559, float %3556), !dbg !164 + %3624 = tail call float @llvm.nvvm.div.full(float %3560, float %3556), !dbg !164 + %3625 = tail call float @llvm.nvvm.div.full(float %3561, float %3555), !dbg !164 + %3626 = tail call float @llvm.nvvm.div.full(float %3562, float %3555), !dbg !164 + %3627 = tail call float @llvm.nvvm.div.full(float %3563, float %3556), !dbg !164 + %3628 = tail call float @llvm.nvvm.div.full(float %3564, float %3556), !dbg !164 + %3629 = tail call float @llvm.nvvm.div.full(float %3565, float %3555), !dbg !164 + %3630 = tail call float @llvm.nvvm.div.full(float %3566, float %3555), !dbg !164 + %3631 = tail call float @llvm.nvvm.div.full(float %3567, float %3556), !dbg !164 + %3632 = tail call float @llvm.nvvm.div.full(float %3568, float %3556), !dbg !164 + %3633 = tail call float @llvm.nvvm.div.full(float %3569, float %3555), !dbg !164 + %3634 = tail call float @llvm.nvvm.div.full(float %3570, float %3555), !dbg !164 + %3635 = tail call float @llvm.nvvm.div.full(float %3571, float %3556), !dbg !164 + %3636 = tail call float @llvm.nvvm.div.full(float %3572, float %3556), !dbg !164 + %3637 = tail call float @llvm.nvvm.div.full(float %3573, float %3555), !dbg !164 + %3638 = tail call float @llvm.nvvm.div.full(float %3574, float %3555), !dbg !164 + %3639 = tail call float @llvm.nvvm.div.full(float %3575, float %3556), !dbg !164 + %3640 = tail call float @llvm.nvvm.div.full(float %3576, float %3556), !dbg !164 + %3641 = tail call float @llvm.nvvm.div.full(float %3577, float %3555), !dbg !164 + %3642 = tail call float @llvm.nvvm.div.full(float %3578, float %3555), !dbg !164 + %3643 = tail call float @llvm.nvvm.div.full(float %3579, float %3556), !dbg !164 + %3644 = tail call float @llvm.nvvm.div.full(float %3580, float %3556), !dbg !164 + %3645 = tail call float @llvm.nvvm.div.full(float %3581, float %3555), !dbg !164 + %3646 = tail call float @llvm.nvvm.div.full(float %3582, float %3555), !dbg !164 + %3647 = tail call float @llvm.nvvm.div.full(float %3583, float %3556), !dbg !164 + %3648 = tail call float @llvm.nvvm.div.full(float %3584, float %3556), !dbg !164 + %3649 = tail call float @llvm.nvvm.div.full(float %3585, float %3555), !dbg !164 + %3650 = tail call float @llvm.nvvm.div.full(float %3586, float %3555), !dbg !164 + %3651 = tail call float @llvm.nvvm.div.full(float %3587, float %3556), !dbg !164 + %3652 = tail call float @llvm.nvvm.div.full(float %3588, float %3556), !dbg !164 + %3653 = tail call float @llvm.nvvm.div.full(float %3589, float %3555), !dbg !164 + %3654 = tail call float @llvm.nvvm.div.full(float %3590, float %3555), !dbg !164 + %3655 = tail call float @llvm.nvvm.div.full(float %3591, float %3556), !dbg !164 + %3656 = tail call float @llvm.nvvm.div.full(float %3592, float %3556), !dbg !164 + %3657 = tail call float @llvm.nvvm.div.full(float %3593, float %3555), !dbg !164 + %3658 = tail call float @llvm.nvvm.div.full(float %3594, float %3555), !dbg !164 + %3659 = tail call float @llvm.nvvm.div.full(float %3595, float %3556), !dbg !164 + %3660 = tail call float @llvm.nvvm.div.full(float %3596, float %3556), !dbg !164 + %3661 = tail call float @llvm.nvvm.div.full(float %3597, float %3555), !dbg !164 + %3662 = tail call float @llvm.nvvm.div.full(float %3598, float %3555), !dbg !164 + %3663 = tail call float @llvm.nvvm.div.full(float %3599, float %3556), !dbg !164 + %3664 = tail call float @llvm.nvvm.div.full(float %3600, float %3556), !dbg !164 + %3665 = tail call float @llvm.nvvm.div.full(float %3601, float %3555), !dbg !164 + %3666 = tail call float @llvm.nvvm.div.full(float %3602, float %3555), !dbg !164 + %3667 = tail call float @llvm.nvvm.div.full(float %3603, float %3556), !dbg !164 + %3668 = tail call float @llvm.nvvm.div.full(float %3604, float %3556), !dbg !164 + %3669 = tail call float @llvm.nvvm.div.full(float %3605, float %3555), !dbg !164 + %3670 = tail call float @llvm.nvvm.div.full(float %3606, float %3555), !dbg !164 + %3671 = tail call float @llvm.nvvm.div.full(float %3607, float %3556), !dbg !164 + %3672 = tail call float @llvm.nvvm.div.full(float %3608, float %3556), !dbg !164 + %3673 = tail call float @llvm.nvvm.div.full(float %3609, float %3555), !dbg !164 + %3674 = tail call float @llvm.nvvm.div.full(float %3610, float %3555), !dbg !164 + %3675 = tail call float @llvm.nvvm.div.full(float %3611, float %3556), !dbg !164 + %3676 = tail call float @llvm.nvvm.div.full(float %3612, float %3556), !dbg !164 + %3677 = tail call float @llvm.nvvm.div.full(float %3613, float %3555), !dbg !164 + %3678 = tail call float @llvm.nvvm.div.full(float %3614, float %3555), !dbg !164 + %3679 = tail call float @llvm.nvvm.div.full(float %3615, float %3556), !dbg !164 + %3680 = tail call float @llvm.nvvm.div.full(float %3616, float %3556), !dbg !164 + %3681 = tail call float @llvm.nvvm.div.full(float %3617, float %3555), !dbg !164 + %3682 = tail call float @llvm.nvvm.div.full(float %3618, float %3555), !dbg !164 + %3683 = tail call float @llvm.nvvm.div.full(float %3619, float %3556), !dbg !164 + %3684 = tail call float @llvm.nvvm.div.full(float %3620, float %3556), !dbg !164 + %3685 = icmp slt i32 %48, 2048, !dbg !165 + %3686 = icmp slt i32 %49, 2048, !dbg !165 + %3687 = icmp slt i32 %50, 2048, !dbg !165 + %3688 = icmp slt i32 %51, 2048, !dbg !165 + %3689 = icmp slt i32 %52, 2048, !dbg !165 + %3690 = icmp slt i32 %53, 2048, !dbg !165 + %3691 = icmp slt i32 %54, 2048, !dbg !165 + %3692 = icmp slt i32 %55, 2048, !dbg !165 + %3693 = or disjoint i32 %83, %19, !dbg !166 + %3694 = or disjoint i32 %3693, %18, !dbg !167 + %3695 = add i32 %3694, %58, !dbg !168 + %3696 = add i32 %3694, %59, !dbg !168 + %3697 = add i32 %3694, %60, !dbg !168 + %3698 = add i32 %3694, %61, !dbg !168 + %3699 = add i32 %3694, %62, !dbg !168 + %3700 = add i32 %3694, %63, !dbg !168 + %3701 = add i32 %3694, %64, !dbg !168 + %3702 = add i32 %3694, %65, !dbg !168 + %3703 = sext i32 %3695 to i64, !dbg !169 + %3704 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3703, !dbg !169 + %3705 = sext i32 %3696 to i64, !dbg !169 + %3706 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3705, !dbg !169 + %3707 = sext i32 %3697 to i64, !dbg !169 + %3708 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3707, !dbg !169 + %3709 = sext i32 %3698 to i64, !dbg !169 + %3710 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3709, !dbg !169 + %3711 = sext i32 %3699 to i64, !dbg !169 + %3712 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3711, !dbg !169 + %3713 = sext i32 %3700 to i64, !dbg !169 + %3714 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3713, !dbg !169 + %3715 = sext i32 %3701 to i64, !dbg !169 + %3716 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3715, !dbg !169 + %3717 = sext i32 %3702 to i64, !dbg !169 + %3718 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3717, !dbg !169 + %3719 = insertelement <2 x float> poison, float %3621, i64 0, !dbg !170 + %3720 = insertelement <2 x float> %3719, float %3622, i64 1, !dbg !170 + %3721 = fptrunc <2 x float> %3720 to <2 x bfloat>, !dbg !170 + %3722 = insertelement <2 x float> poison, float %3623, i64 0, !dbg !170 + %3723 = insertelement <2 x float> %3722, float %3624, i64 1, !dbg !170 + %3724 = fptrunc <2 x float> %3723 to <2 x bfloat>, !dbg !170 + %3725 = insertelement <2 x float> poison, float %3625, i64 0, !dbg !170 + %3726 = insertelement <2 x float> %3725, float %3626, i64 1, !dbg !170 + %3727 = fptrunc <2 x float> %3726 to <2 x bfloat>, !dbg !170 + %3728 = insertelement <2 x float> poison, float %3627, i64 0, !dbg !170 + %3729 = insertelement <2 x float> %3728, float %3628, i64 1, !dbg !170 + %3730 = fptrunc <2 x float> %3729 to <2 x bfloat>, !dbg !170 + %3731 = insertelement <2 x float> poison, float %3629, i64 0, !dbg !170 + %3732 = insertelement <2 x float> %3731, float %3630, i64 1, !dbg !170 + %3733 = fptrunc <2 x float> %3732 to <2 x bfloat>, !dbg !170 + %3734 = insertelement <2 x float> poison, float %3631, i64 0, !dbg !170 + %3735 = insertelement <2 x float> %3734, float %3632, i64 1, !dbg !170 + %3736 = fptrunc <2 x float> %3735 to <2 x bfloat>, !dbg !170 + %3737 = insertelement <2 x float> poison, float %3633, i64 0, !dbg !170 + %3738 = insertelement <2 x float> %3737, float %3634, i64 1, !dbg !170 + %3739 = fptrunc <2 x float> %3738 to <2 x bfloat>, !dbg !170 + %3740 = insertelement <2 x float> poison, float %3635, i64 0, !dbg !170 + %3741 = insertelement <2 x float> %3740, float %3636, i64 1, !dbg !170 + %3742 = fptrunc <2 x float> %3741 to <2 x bfloat>, !dbg !170 + %3743 = insertelement <2 x float> poison, float %3637, i64 0, !dbg !170 + %3744 = insertelement <2 x float> %3743, float %3638, i64 1, !dbg !170 + %3745 = fptrunc <2 x float> %3744 to <2 x bfloat>, !dbg !170 + %3746 = insertelement <2 x float> poison, float %3639, i64 0, !dbg !170 + %3747 = insertelement <2 x float> %3746, float %3640, i64 1, !dbg !170 + %3748 = fptrunc <2 x float> %3747 to <2 x bfloat>, !dbg !170 + %3749 = insertelement <2 x float> poison, float %3641, i64 0, !dbg !170 + %3750 = insertelement <2 x float> %3749, float %3642, i64 1, !dbg !170 + %3751 = fptrunc <2 x float> %3750 to <2 x bfloat>, !dbg !170 + %3752 = insertelement <2 x float> poison, float %3643, i64 0, !dbg !170 + %3753 = insertelement <2 x float> %3752, float %3644, i64 1, !dbg !170 + %3754 = fptrunc <2 x float> %3753 to <2 x bfloat>, !dbg !170 + %3755 = insertelement <2 x float> poison, float %3645, i64 0, !dbg !170 + %3756 = insertelement <2 x float> %3755, float %3646, i64 1, !dbg !170 + %3757 = fptrunc <2 x float> %3756 to <2 x bfloat>, !dbg !170 + %3758 = insertelement <2 x float> poison, float %3647, i64 0, !dbg !170 + %3759 = insertelement <2 x float> %3758, float %3648, i64 1, !dbg !170 + %3760 = fptrunc <2 x float> %3759 to <2 x bfloat>, !dbg !170 + %3761 = insertelement <2 x float> poison, float %3649, i64 0, !dbg !170 + %3762 = insertelement <2 x float> %3761, float %3650, i64 1, !dbg !170 + %3763 = fptrunc <2 x float> %3762 to <2 x bfloat>, !dbg !170 + %3764 = insertelement <2 x float> poison, float %3651, i64 0, !dbg !170 + %3765 = insertelement <2 x float> %3764, float %3652, i64 1, !dbg !170 + %3766 = fptrunc <2 x float> %3765 to <2 x bfloat>, !dbg !170 + %3767 = insertelement <2 x float> poison, float %3653, i64 0, !dbg !170 + %3768 = insertelement <2 x float> %3767, float %3654, i64 1, !dbg !170 + %3769 = fptrunc <2 x float> %3768 to <2 x bfloat>, !dbg !170 + %3770 = insertelement <2 x float> poison, float %3655, i64 0, !dbg !170 + %3771 = insertelement <2 x float> %3770, float %3656, i64 1, !dbg !170 + %3772 = fptrunc <2 x float> %3771 to <2 x bfloat>, !dbg !170 + %3773 = insertelement <2 x float> poison, float %3657, i64 0, !dbg !170 + %3774 = insertelement <2 x float> %3773, float %3658, i64 1, !dbg !170 + %3775 = fptrunc <2 x float> %3774 to <2 x bfloat>, !dbg !170 + %3776 = insertelement <2 x float> poison, float %3659, i64 0, !dbg !170 + %3777 = insertelement <2 x float> %3776, float %3660, i64 1, !dbg !170 + %3778 = fptrunc <2 x float> %3777 to <2 x bfloat>, !dbg !170 + %3779 = insertelement <2 x float> poison, float %3661, i64 0, !dbg !170 + %3780 = insertelement <2 x float> %3779, float %3662, i64 1, !dbg !170 + %3781 = fptrunc <2 x float> %3780 to <2 x bfloat>, !dbg !170 + %3782 = insertelement <2 x float> poison, float %3663, i64 0, !dbg !170 + %3783 = insertelement <2 x float> %3782, float %3664, i64 1, !dbg !170 + %3784 = fptrunc <2 x float> %3783 to <2 x bfloat>, !dbg !170 + %3785 = insertelement <2 x float> poison, float %3665, i64 0, !dbg !170 + %3786 = insertelement <2 x float> %3785, float %3666, i64 1, !dbg !170 + %3787 = fptrunc <2 x float> %3786 to <2 x bfloat>, !dbg !170 + %3788 = insertelement <2 x float> poison, float %3667, i64 0, !dbg !170 + %3789 = insertelement <2 x float> %3788, float %3668, i64 1, !dbg !170 + %3790 = fptrunc <2 x float> %3789 to <2 x bfloat>, !dbg !170 + %3791 = insertelement <2 x float> poison, float %3669, i64 0, !dbg !170 + %3792 = insertelement <2 x float> %3791, float %3670, i64 1, !dbg !170 + %3793 = fptrunc <2 x float> %3792 to <2 x bfloat>, !dbg !170 + %3794 = insertelement <2 x float> poison, float %3671, i64 0, !dbg !170 + %3795 = insertelement <2 x float> %3794, float %3672, i64 1, !dbg !170 + %3796 = fptrunc <2 x float> %3795 to <2 x bfloat>, !dbg !170 + %3797 = insertelement <2 x float> poison, float %3673, i64 0, !dbg !170 + %3798 = insertelement <2 x float> %3797, float %3674, i64 1, !dbg !170 + %3799 = fptrunc <2 x float> %3798 to <2 x bfloat>, !dbg !170 + %3800 = insertelement <2 x float> poison, float %3675, i64 0, !dbg !170 + %3801 = insertelement <2 x float> %3800, float %3676, i64 1, !dbg !170 + %3802 = fptrunc <2 x float> %3801 to <2 x bfloat>, !dbg !170 + %3803 = insertelement <2 x float> poison, float %3677, i64 0, !dbg !170 + %3804 = insertelement <2 x float> %3803, float %3678, i64 1, !dbg !170 + %3805 = fptrunc <2 x float> %3804 to <2 x bfloat>, !dbg !170 + %3806 = insertelement <2 x float> poison, float %3679, i64 0, !dbg !170 + %3807 = insertelement <2 x float> %3806, float %3680, i64 1, !dbg !170 + %3808 = fptrunc <2 x float> %3807 to <2 x bfloat>, !dbg !170 + %3809 = insertelement <2 x float> poison, float %3681, i64 0, !dbg !170 + %3810 = insertelement <2 x float> %3809, float %3682, i64 1, !dbg !170 + %3811 = fptrunc <2 x float> %3810 to <2 x bfloat>, !dbg !170 + %3812 = insertelement <2 x float> poison, float %3683, i64 0, !dbg !170 + %3813 = insertelement <2 x float> %3812, float %3684, i64 1, !dbg !170 + %3814 = fptrunc <2 x float> %3813 to <2 x bfloat>, !dbg !170 + %3815 = shl nuw nsw i32 %190, 13, !dbg !170 + %3816 = shl nuw nsw i32 %36, 5, !dbg !170 + %3817 = and i32 %3816, 7264, !dbg !170 + %3818 = and i32 %36, 24, !dbg !170 + %3819 = shl nuw nsw i32 %3818, 4, !dbg !170 + %3820 = shl nuw nsw i32 %36, 2, !dbg !170 + %3821 = and i32 %3820, 16, !dbg !170 + %3822 = or disjoint i32 %3815, %3821, !dbg !170 + %3823 = or disjoint i32 %3817, %3819, !dbg !170 + %3824 = or disjoint i32 %3822, %3823, !dbg !170 + %3825 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3824, !dbg !170 + %3826 = bitcast <2 x bfloat> %3721 to i32, !dbg !170 + %3827 = bitcast <2 x bfloat> %3727 to i32, !dbg !170 + %3828 = bitcast <2 x bfloat> %3733 to i32, !dbg !170 + %3829 = bitcast <2 x bfloat> %3739 to i32, !dbg !170 + %3830 = insertelement <4 x i32> poison, i32 %3826, i64 0, !dbg !170 + %3831 = insertelement <4 x i32> %3830, i32 %3827, i64 1, !dbg !170 + %3832 = insertelement <4 x i32> %3831, i32 %3828, i64 2, !dbg !170 + %3833 = insertelement <4 x i32> %3832, i32 %3829, i64 3, !dbg !170 + store <4 x i32> %3833, ptr addrspace(3) %3825, align 16, !dbg !170 + %3834 = getelementptr inbounds nuw i8, ptr addrspace(3) %3825, i32 512, !dbg !170 + %3835 = bitcast <2 x bfloat> %3724 to i32, !dbg !170 + %3836 = bitcast <2 x bfloat> %3730 to i32, !dbg !170 + %3837 = bitcast <2 x bfloat> %3736 to i32, !dbg !170 + %3838 = bitcast <2 x bfloat> %3742 to i32, !dbg !170 + %3839 = insertelement <4 x i32> poison, i32 %3835, i64 0, !dbg !170 + %3840 = insertelement <4 x i32> %3839, i32 %3836, i64 1, !dbg !170 + %3841 = insertelement <4 x i32> %3840, i32 %3837, i64 2, !dbg !170 + %3842 = insertelement <4 x i32> %3841, i32 %3838, i64 3, !dbg !170 + store <4 x i32> %3842, ptr addrspace(3) %3834, align 16, !dbg !170 + %3843 = xor i32 %3824, 32, !dbg !170 + %3844 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3843, !dbg !170 + %3845 = bitcast <2 x bfloat> %3745 to i32, !dbg !170 + %3846 = bitcast <2 x bfloat> %3751 to i32, !dbg !170 + %3847 = bitcast <2 x bfloat> %3757 to i32, !dbg !170 + %3848 = bitcast <2 x bfloat> %3763 to i32, !dbg !170 + %3849 = insertelement <4 x i32> poison, i32 %3845, i64 0, !dbg !170 + %3850 = insertelement <4 x i32> %3849, i32 %3846, i64 1, !dbg !170 + %3851 = insertelement <4 x i32> %3850, i32 %3847, i64 2, !dbg !170 + %3852 = insertelement <4 x i32> %3851, i32 %3848, i64 3, !dbg !170 + store <4 x i32> %3852, ptr addrspace(3) %3844, align 16, !dbg !170 + %3853 = getelementptr inbounds nuw i8, ptr addrspace(3) %3844, i32 512, !dbg !170 + %3854 = bitcast <2 x bfloat> %3748 to i32, !dbg !170 + %3855 = bitcast <2 x bfloat> %3754 to i32, !dbg !170 + %3856 = bitcast <2 x bfloat> %3760 to i32, !dbg !170 + %3857 = bitcast <2 x bfloat> %3766 to i32, !dbg !170 + %3858 = insertelement <4 x i32> poison, i32 %3854, i64 0, !dbg !170 + %3859 = insertelement <4 x i32> %3858, i32 %3855, i64 1, !dbg !170 + %3860 = insertelement <4 x i32> %3859, i32 %3856, i64 2, !dbg !170 + %3861 = insertelement <4 x i32> %3860, i32 %3857, i64 3, !dbg !170 + store <4 x i32> %3861, ptr addrspace(3) %3853, align 16, !dbg !170 + %3862 = xor i32 %3824, 64, !dbg !170 + %3863 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3862, !dbg !170 + %3864 = bitcast <2 x bfloat> %3769 to i32, !dbg !170 + %3865 = bitcast <2 x bfloat> %3775 to i32, !dbg !170 + %3866 = bitcast <2 x bfloat> %3781 to i32, !dbg !170 + %3867 = bitcast <2 x bfloat> %3787 to i32, !dbg !170 + %3868 = insertelement <4 x i32> poison, i32 %3864, i64 0, !dbg !170 + %3869 = insertelement <4 x i32> %3868, i32 %3865, i64 1, !dbg !170 + %3870 = insertelement <4 x i32> %3869, i32 %3866, i64 2, !dbg !170 + %3871 = insertelement <4 x i32> %3870, i32 %3867, i64 3, !dbg !170 + store <4 x i32> %3871, ptr addrspace(3) %3863, align 16, !dbg !170 + %3872 = getelementptr inbounds nuw i8, ptr addrspace(3) %3863, i32 512, !dbg !170 + %3873 = bitcast <2 x bfloat> %3772 to i32, !dbg !170 + %3874 = bitcast <2 x bfloat> %3778 to i32, !dbg !170 + %3875 = bitcast <2 x bfloat> %3784 to i32, !dbg !170 + %3876 = bitcast <2 x bfloat> %3790 to i32, !dbg !170 + %3877 = insertelement <4 x i32> poison, i32 %3873, i64 0, !dbg !170 + %3878 = insertelement <4 x i32> %3877, i32 %3874, i64 1, !dbg !170 + %3879 = insertelement <4 x i32> %3878, i32 %3875, i64 2, !dbg !170 + %3880 = insertelement <4 x i32> %3879, i32 %3876, i64 3, !dbg !170 + store <4 x i32> %3880, ptr addrspace(3) %3872, align 16, !dbg !170 + %3881 = xor i32 %3824, 96, !dbg !170 + %3882 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3881, !dbg !170 + %3883 = bitcast <2 x bfloat> %3793 to i32, !dbg !170 + %3884 = bitcast <2 x bfloat> %3799 to i32, !dbg !170 + %3885 = bitcast <2 x bfloat> %3805 to i32, !dbg !170 + %3886 = bitcast <2 x bfloat> %3811 to i32, !dbg !170 + %3887 = insertelement <4 x i32> poison, i32 %3883, i64 0, !dbg !170 + %3888 = insertelement <4 x i32> %3887, i32 %3884, i64 1, !dbg !170 + %3889 = insertelement <4 x i32> %3888, i32 %3885, i64 2, !dbg !170 + %3890 = insertelement <4 x i32> %3889, i32 %3886, i64 3, !dbg !170 + store <4 x i32> %3890, ptr addrspace(3) %3882, align 16, !dbg !170 + %3891 = getelementptr inbounds nuw i8, ptr addrspace(3) %3882, i32 512, !dbg !170 + %3892 = bitcast <2 x bfloat> %3796 to i32, !dbg !170 + %3893 = bitcast <2 x bfloat> %3802 to i32, !dbg !170 + %3894 = bitcast <2 x bfloat> %3808 to i32, !dbg !170 + %3895 = bitcast <2 x bfloat> %3814 to i32, !dbg !170 + %3896 = insertelement <4 x i32> poison, i32 %3892, i64 0, !dbg !170 + %3897 = insertelement <4 x i32> %3896, i32 %3893, i64 1, !dbg !170 + %3898 = insertelement <4 x i32> %3897, i32 %3894, i64 2, !dbg !170 + %3899 = insertelement <4 x i32> %3898, i32 %3895, i64 3, !dbg !170 + store <4 x i32> %3899, ptr addrspace(3) %3891, align 16, !dbg !170 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !170 + %3900 = shl nuw nsw i32 %3818, 10, !dbg !170 + %3901 = shl nuw nsw i32 %190, 5, !dbg !170 + %3902 = and i32 %36, 252, !dbg !170 + %3903 = shl nuw nsw i32 %3902, 2, !dbg !170 + %3904 = or disjoint i32 %3900, %3901, !dbg !170 + %3905 = xor i32 %3904, %3903, !dbg !170 + %3906 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %3905, !dbg !170 + %3907 = ptrtoint ptr addrspace(3) %3906 to i32, !dbg !170 + %3908 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3907) #2, !dbg !170 + %3909 = extractvalue { i32, i32, i32, i32 } %3908, 0, !dbg !170 + %3910 = extractvalue { i32, i32, i32, i32 } %3908, 1, !dbg !170 + %3911 = extractvalue { i32, i32, i32, i32 } %3908, 2, !dbg !170 + %3912 = extractvalue { i32, i32, i32, i32 } %3908, 3, !dbg !170 + %3913 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 1024, !dbg !170 + %3914 = ptrtoint ptr addrspace(3) %3913 to i32, !dbg !170 + %3915 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3914) #2, !dbg !170 + %3916 = extractvalue { i32, i32, i32, i32 } %3915, 0, !dbg !170 + %3917 = extractvalue { i32, i32, i32, i32 } %3915, 1, !dbg !170 + %3918 = extractvalue { i32, i32, i32, i32 } %3915, 2, !dbg !170 + %3919 = extractvalue { i32, i32, i32, i32 } %3915, 3, !dbg !170 + %3920 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 2048, !dbg !170 + %3921 = ptrtoint ptr addrspace(3) %3920 to i32, !dbg !170 + %3922 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3921) #2, !dbg !170 + %3923 = extractvalue { i32, i32, i32, i32 } %3922, 0, !dbg !170 + %3924 = extractvalue { i32, i32, i32, i32 } %3922, 1, !dbg !170 + %3925 = extractvalue { i32, i32, i32, i32 } %3922, 2, !dbg !170 + %3926 = extractvalue { i32, i32, i32, i32 } %3922, 3, !dbg !170 + %3927 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 3072, !dbg !170 + %3928 = ptrtoint ptr addrspace(3) %3927 to i32, !dbg !170 + %3929 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3928) #2, !dbg !170 + %3930 = extractvalue { i32, i32, i32, i32 } %3929, 0, !dbg !170 + %3931 = extractvalue { i32, i32, i32, i32 } %3929, 1, !dbg !170 + %3932 = extractvalue { i32, i32, i32, i32 } %3929, 2, !dbg !170 + %3933 = extractvalue { i32, i32, i32, i32 } %3929, 3, !dbg !170 + %3934 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 4096, !dbg !170 + %3935 = ptrtoint ptr addrspace(3) %3934 to i32, !dbg !170 + %3936 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3935) #2, !dbg !170 + %3937 = extractvalue { i32, i32, i32, i32 } %3936, 0, !dbg !170 + %3938 = extractvalue { i32, i32, i32, i32 } %3936, 1, !dbg !170 + %3939 = extractvalue { i32, i32, i32, i32 } %3936, 2, !dbg !170 + %3940 = extractvalue { i32, i32, i32, i32 } %3936, 3, !dbg !170 + %3941 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 5120, !dbg !170 + %3942 = ptrtoint ptr addrspace(3) %3941 to i32, !dbg !170 + %3943 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3942) #2, !dbg !170 + %3944 = extractvalue { i32, i32, i32, i32 } %3943, 0, !dbg !170 + %3945 = extractvalue { i32, i32, i32, i32 } %3943, 1, !dbg !170 + %3946 = extractvalue { i32, i32, i32, i32 } %3943, 2, !dbg !170 + %3947 = extractvalue { i32, i32, i32, i32 } %3943, 3, !dbg !170 + %3948 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 6144, !dbg !170 + %3949 = ptrtoint ptr addrspace(3) %3948 to i32, !dbg !170 + %3950 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3949) #2, !dbg !170 + %3951 = extractvalue { i32, i32, i32, i32 } %3950, 0, !dbg !170 + %3952 = extractvalue { i32, i32, i32, i32 } %3950, 1, !dbg !170 + %3953 = extractvalue { i32, i32, i32, i32 } %3950, 2, !dbg !170 + %3954 = extractvalue { i32, i32, i32, i32 } %3950, 3, !dbg !170 + %3955 = getelementptr inbounds nuw i8, ptr addrspace(3) %3906, i32 7168, !dbg !170 + %3956 = ptrtoint ptr addrspace(3) %3955 to i32, !dbg !170 + %3957 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %3956) #2, !dbg !170 + %3958 = extractvalue { i32, i32, i32, i32 } %3957, 0, !dbg !170 + %3959 = extractvalue { i32, i32, i32, i32 } %3957, 1, !dbg !170 + %3960 = extractvalue { i32, i32, i32, i32 } %3957, 2, !dbg !170 + %3961 = extractvalue { i32, i32, i32, i32 } %3957, 3, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3909, i32 %3910, i32 %3911, i32 %3912, ptr addrspace(1) %3704, i1 %3685) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3916, i32 %3917, i32 %3918, i32 %3919, ptr addrspace(1) %3706, i1 %3686) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3923, i32 %3924, i32 %3925, i32 %3926, ptr addrspace(1) %3708, i1 %3687) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3930, i32 %3931, i32 %3932, i32 %3933, ptr addrspace(1) %3710, i1 %3688) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3937, i32 %3938, i32 %3939, i32 %3940, ptr addrspace(1) %3712, i1 %3689) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3944, i32 %3945, i32 %3946, i32 %3947, ptr addrspace(1) %3714, i1 %3690) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3951, i32 %3952, i32 %3953, i32 %3954, ptr addrspace(1) %3716, i1 %3691) #2, !dbg !170 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %3958, i32 %3959, i32 %3960, i32 %3961, ptr addrspace(1) %3718, i1 %3692) #2, !dbg !170 + %3962 = fcmp olt float %3555, 0x3810000000000000, !dbg !171 + %3963 = fmul float %3555, 0x4160000000000000, !dbg !171 + %.02.i = select i1 %3962, float %3963, float %3555, !dbg !171 + %i.i.0.i = select i1 %3962, float -2.300000e+01, float 0.000000e+00, !dbg !171 + %3964 = bitcast float %.02.i to i32, !dbg !171 + %3965 = add i32 %3964, -1060439283, !dbg !171 + %3966 = and i32 %3965, -8388608, !dbg !171 + %3967 = sub i32 %3964, %3966, !dbg !171 + %3968 = bitcast i32 %3967 to float, !dbg !171 + %3969 = sitofp i32 %3966 to float, !dbg !171 + %3970 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not.i292 = icmp eq i32 %3970, 0, !dbg !171 + %3971 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3969, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !171 + %3972 = tail call float @llvm.nvvm.fma.rn.f(float %3969, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !171 + %.08.i = select i1 %.not.i292, float %3972, float %3971, !dbg !171 + %3973 = fadd float %3968, -1.000000e+00, !dbg !171 + %3974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not1.i = icmp eq i32 %3974, 0, !dbg !171 + %3975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %3973, float 0xBFC58FE600000000) #2, !dbg !171 + %3976 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %3973, float 0xBFC58FE600000000) #2, !dbg !171 + %.010.i = select i1 %.not1.i, float %3976, float %3975, !dbg !171 + %3977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not2.i = icmp eq i32 %3977, 0, !dbg !171 + %3978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %3973, float 0x3FC5F9E540000000) #2, !dbg !171 + %3979 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %3973, float 0x3FC5F9E540000000) #2, !dbg !171 + %.011.i = select i1 %.not2.i, float %3979, float %3978, !dbg !171 + %3980 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not3.i = icmp eq i32 %3980, 0, !dbg !171 + %3981 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %3973, float 0xBFC6E9C860000000) #2, !dbg !171 + %3982 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %3973, float 0xBFC6E9C860000000) #2, !dbg !171 + %.012.i = select i1 %.not3.i, float %3982, float %3981, !dbg !171 + %3983 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not4.i = icmp eq i32 %3983, 0, !dbg !171 + %3984 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %3973, float 0x3FCA417E80000000) #2, !dbg !171 + %3985 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %3973, float 0x3FCA417E80000000) #2, !dbg !171 + %.09.i = select i1 %.not4.i, float %3985, float %3984, !dbg !171 + %3986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not5.i = icmp eq i32 %3986, 0, !dbg !171 + %3987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %3973, float 0xBFCEC79160000000) #2, !dbg !171 + %3988 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %3973, float 0xBFCEC79160000000) #2, !dbg !171 + %.05.i = select i1 %.not5.i, float %3988, float %3987, !dbg !171 + %3989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not6.i = icmp eq i32 %3989, 0, !dbg !171 + %3990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %3973, float 0x3FD277F320000000) #2, !dbg !171 + %3991 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %3973, float 0x3FD277F320000000) #2, !dbg !171 + %.01.i = select i1 %.not6.i, float %3991, float %3990, !dbg !171 + %3992 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not7.i = icmp eq i32 %3992, 0, !dbg !171 + %3993 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %3973, float 0xBFD7154920000000) #2, !dbg !171 + %3994 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %3973, float 0xBFD7154920000000) #2, !dbg !171 + %.0.i293 = select i1 %.not7.i, float %3994, float %3993, !dbg !171 + %3995 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not8.i = icmp eq i32 %3995, 0, !dbg !171 + %3996 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i293, float %3973, float 0x3FDEC70940000000) #2, !dbg !171 + %3997 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i293, float %3973, float 0x3FDEC70940000000) #2, !dbg !171 + %.07.i = select i1 %.not8.i, float %3997, float %3996, !dbg !171 + %3998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not9.i = icmp eq i32 %3998, 0, !dbg !171 + %3999 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %3973, float 0xBFE7154760000000) #2, !dbg !171 + %4000 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %3973, float 0xBFE7154760000000) #2, !dbg !171 + %.06.i = select i1 %.not9.i, float %4000, float %3999, !dbg !171 + %4001 = fmul float %3973, %.06.i, !dbg !171 + %4002 = fmul float %3973, %4001, !dbg !171 + %4003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not10.i = icmp eq i32 %4003, 0, !dbg !171 + %4004 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3973, float 0x3FF7154760000000, float %4002) #2, !dbg !171 + %4005 = tail call float @llvm.nvvm.fma.rn.f(float %3973, float 0x3FF7154760000000, float %4002) #2, !dbg !171 + %.04.i = select i1 %.not10.i, float %4005, float %4004, !dbg !171 + %4006 = fadd float %.08.i, %.04.i, !dbg !171 + %4007 = icmp ugt i32 %3964, 2139095039, !dbg !171 + br i1 %4007, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !171 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge379 + %4008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not11.i = icmp eq i32 %4008, 0, !dbg !171 + %4009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %4010 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %.03.i = select i1 %.not11.i, float %4010, float %4009, !dbg !171 + br label %__nv_log2f.exit, !dbg !171 + +__nv_log2f.exit: ; preds = %._crit_edge379, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %4006, %._crit_edge379 ], !dbg !171 + %4011 = fcmp olt float %3556, 0x3810000000000000, !dbg !171 + %4012 = fmul float %3556, 0x4160000000000000, !dbg !171 + %.02.i294 = select i1 %4011, float %4012, float %3556, !dbg !171 + %i.i.0.i295 = select i1 %4011, float -2.300000e+01, float 0.000000e+00, !dbg !171 + %4013 = bitcast float %.02.i294 to i32, !dbg !171 + %4014 = add i32 %4013, -1060439283, !dbg !171 + %4015 = and i32 %4014, -8388608, !dbg !171 + %4016 = sub i32 %4013, %4015, !dbg !171 + %4017 = bitcast i32 %4016 to float, !dbg !171 + %4018 = sitofp i32 %4015 to float, !dbg !171 + %4019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not.i296 = icmp eq i32 %4019, 0, !dbg !171 + %4020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4018, float 0x3E80000000000000, float %i.i.0.i295) #2, !dbg !171 + %4021 = tail call float @llvm.nvvm.fma.rn.f(float %4018, float 0x3E80000000000000, float %i.i.0.i295) #2, !dbg !171 + %.08.i297 = select i1 %.not.i296, float %4021, float %4020, !dbg !171 + %4022 = fadd float %4017, -1.000000e+00, !dbg !171 + %4023 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not1.i298 = icmp eq i32 %4023, 0, !dbg !171 + %4024 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4022, float 0xBFC58FE600000000) #2, !dbg !171 + %4025 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4022, float 0xBFC58FE600000000) #2, !dbg !171 + %.010.i299 = select i1 %.not1.i298, float %4025, float %4024, !dbg !171 + %4026 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not2.i300 = icmp eq i32 %4026, 0, !dbg !171 + %4027 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i299, float %4022, float 0x3FC5F9E540000000) #2, !dbg !171 + %4028 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i299, float %4022, float 0x3FC5F9E540000000) #2, !dbg !171 + %.011.i301 = select i1 %.not2.i300, float %4028, float %4027, !dbg !171 + %4029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not3.i302 = icmp eq i32 %4029, 0, !dbg !171 + %4030 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i301, float %4022, float 0xBFC6E9C860000000) #2, !dbg !171 + %4031 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i301, float %4022, float 0xBFC6E9C860000000) #2, !dbg !171 + %.012.i303 = select i1 %.not3.i302, float %4031, float %4030, !dbg !171 + %4032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not4.i304 = icmp eq i32 %4032, 0, !dbg !171 + %4033 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i303, float %4022, float 0x3FCA417E80000000) #2, !dbg !171 + %4034 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i303, float %4022, float 0x3FCA417E80000000) #2, !dbg !171 + %.09.i305 = select i1 %.not4.i304, float %4034, float %4033, !dbg !171 + %4035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not5.i306 = icmp eq i32 %4035, 0, !dbg !171 + %4036 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i305, float %4022, float 0xBFCEC79160000000) #2, !dbg !171 + %4037 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i305, float %4022, float 0xBFCEC79160000000) #2, !dbg !171 + %.05.i307 = select i1 %.not5.i306, float %4037, float %4036, !dbg !171 + %4038 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not6.i308 = icmp eq i32 %4038, 0, !dbg !171 + %4039 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i307, float %4022, float 0x3FD277F320000000) #2, !dbg !171 + %4040 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i307, float %4022, float 0x3FD277F320000000) #2, !dbg !171 + %.01.i309 = select i1 %.not6.i308, float %4040, float %4039, !dbg !171 + %4041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not7.i310 = icmp eq i32 %4041, 0, !dbg !171 + %4042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i309, float %4022, float 0xBFD7154920000000) #2, !dbg !171 + %4043 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i309, float %4022, float 0xBFD7154920000000) #2, !dbg !171 + %.0.i311 = select i1 %.not7.i310, float %4043, float %4042, !dbg !171 + %4044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not8.i312 = icmp eq i32 %4044, 0, !dbg !171 + %4045 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i311, float %4022, float 0x3FDEC70940000000) #2, !dbg !171 + %4046 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i311, float %4022, float 0x3FDEC70940000000) #2, !dbg !171 + %.07.i313 = select i1 %.not8.i312, float %4046, float %4045, !dbg !171 + %4047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not9.i314 = icmp eq i32 %4047, 0, !dbg !171 + %4048 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i313, float %4022, float 0xBFE7154760000000) #2, !dbg !171 + %4049 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i313, float %4022, float 0xBFE7154760000000) #2, !dbg !171 + %.06.i315 = select i1 %.not9.i314, float %4049, float %4048, !dbg !171 + %4050 = fmul float %4022, %.06.i315, !dbg !171 + %4051 = fmul float %4022, %4050, !dbg !171 + %4052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not10.i316 = icmp eq i32 %4052, 0, !dbg !171 + %4053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4022, float 0x3FF7154760000000, float %4051) #2, !dbg !171 + %4054 = tail call float @llvm.nvvm.fma.rn.f(float %4022, float 0x3FF7154760000000, float %4051) #2, !dbg !171 + %.04.i317 = select i1 %.not10.i316, float %4054, float %4053, !dbg !171 + %4055 = fadd float %.08.i297, %.04.i317, !dbg !171 + %4056 = icmp ugt i32 %4013, 2139095039, !dbg !171 + br i1 %4056, label %__nv_fmaf_rn.exit.i.i320, label %__nv_log2f.exit323, !dbg !171 + +__nv_fmaf_rn.exit.i.i320: ; preds = %__nv_log2f.exit + %4057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !171 + %.not11.i321 = icmp eq i32 %4057, 0, !dbg !171 + %4058 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i294, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %4059 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i294, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !171 + %.03.i322 = select i1 %.not11.i321, float %4059, float %4058, !dbg !171 + br label %__nv_log2f.exit323, !dbg !171 + +__nv_log2f.exit323: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i320 + %r.i.0.i318 = phi float [ %.03.i322, %__nv_fmaf_rn.exit.i.i320 ], [ %4055, %__nv_log2f.exit ], !dbg !171 + %4060 = insertelement <2 x float> poison, float %.02.i, i64 0, !dbg !171 + %4061 = insertelement <2 x float> %4060, float %.02.i294, i64 1, !dbg !171 + %4062 = fcmp oeq <2 x float> %4061, zeroinitializer, !dbg !171 + %4063 = shl nuw i32 %15, 16, !dbg !172 + %4064 = shl nuw nsw i32 %16, 11, !dbg !172 + %4065 = add i32 %4063, %4064, !dbg !172 + %4066 = sext i32 %4065 to i64, !dbg !173 + %4067 = getelementptr float, ptr addrspace(1) %3, i64 %4066, !dbg !173 + %4068 = and i32 %36, 127, !dbg !27 + %4069 = or disjoint i32 %35, %4068, !dbg !28 + %4070 = sext i32 %4069 to i64, !dbg !174 + %4071 = getelementptr float, ptr addrspace(1) %4067, i64 %4070, !dbg !174 + %4072 = insertelement <2 x float> poison, float %r.i.0.i, i64 0, !dbg !171 + %4073 = insertelement <2 x float> %4072, float %r.i.0.i318, i64 1, !dbg !171 + %4074 = select <2 x i1> %4062, <2 x float> splat (float 0xFFF0000000000000), <2 x float> %4073, !dbg !171 + %4075 = fadd <2 x float> %3551, %4074, !dbg !175 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4076 = shl nuw nsw i32 %3902, 1, !dbg !176 + %4077 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4076, !dbg !176 + store <2 x float> %4075, ptr addrspace(3) %4077, align 8, !dbg !176 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4078 = shl nuw nsw i32 %133, 3, !dbg !176 + %4079 = shl nuw nsw i32 %136, 2, !dbg !176 + %4080 = lshr exact i32 %137, 1, !dbg !176 + %4081 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4078, !dbg !176 + %4082 = getelementptr inbounds nuw i8, ptr addrspace(3) %4081, i32 %4079, !dbg !176 + %4083 = getelementptr inbounds nuw i8, ptr addrspace(3) %4082, i32 %4080, !dbg !176 + %4084 = load i32, ptr addrspace(3) %4083, align 4, !dbg !176 + %4085 = and i32 %36, 128, !dbg !176 + %4086 = icmp eq i32 %4085, 0, !dbg !176 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %4084, ptr addrspace(1) %4071, i1 %4086) #2, !dbg !176 + ret void, !dbg !177 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #6 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.umin.i32(i32, i32) #8 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #6 = { convergent nounwind } +attributes #7 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_0", linkageName: "triton_tem_fused_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 97, column: 28, scope: !5) +!9 = !DILocation(line: 98, column: 27, scope: !5) +!10 = !DILocation(line: 99, column: 27, scope: !5) +!11 = !DILocation(line: 103, column: 23, scope: !5) +!12 = !DILocation(line: 107, column: 24, scope: !5) +!13 = !DILocation(line: 107, column: 45, scope: !5) +!14 = !DILocation(line: 107, column: 36, scope: !5) +!15 = !DILocation(line: 108, column: 25, scope: !5) +!16 = !DILocation(line: 108, column: 47, scope: !5) +!17 = !DILocation(line: 108, column: 37, scope: !5) +!18 = !DILocation(line: 111, column: 12, scope: !5) +!19 = !DILocation(line: 112, column: 12, scope: !5) +!20 = !DILocation(line: 113, column: 12, scope: !5) +!21 = !DILocation(line: 142, column: 51, scope: !5) +!22 = !DILocation(line: 142, column: 74, scope: !5) +!23 = !DILocation(line: 143, column: 46, scope: !5) +!24 = !DILocation(line: 143, column: 97, scope: !5) +!25 = !DILocation(line: 143, column: 64, scope: !5) +!26 = !DILocation(line: 144, column: 23, scope: !5) +!27 = !DILocation(line: 144, column: 46, scope: !5) +!28 = !DILocation(line: 144, column: 33, scope: !5) +!29 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !31) +!30 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!31 = !DILocation(line: 146, column: 101, scope: !5) +!32 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !31) +!33 = !DILocation(line: 284, column: 56, scope: !30, inlinedAt: !31) +!34 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !31) +!35 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !31) +!36 = !DILocation(line: 151, column: 26, scope: !5) +!37 = !DILocation(line: 152, column: 23, scope: !5) +!38 = !DILocation(line: 152, column: 37, scope: !5) +!39 = !DILocation(line: 153, column: 42, scope: !5) +!40 = !DILocation(line: 153, column: 28, scope: !5) +!41 = !DILocation(line: 154, column: 45, scope: !5) +!42 = !DILocation(line: 159, column: 37, scope: !5) +!43 = !DILocation(line: 376, column: 33, scope: !30, inlinedAt: !44) +!44 = !DILocation(line: 172, column: 41, scope: !5) +!45 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !44) +!46 = !DILocation(line: 376, column: 23, scope: !30, inlinedAt: !44) +!47 = !DILocation(line: 378, column: 23, scope: !30, inlinedAt: !44) +!48 = !DILocation(line: 379, column: 23, scope: !30, inlinedAt: !44) +!49 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !44) +!50 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !44) +!51 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !44) +!52 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !44) +!53 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !44) +!54 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !44) +!55 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !44) +!56 = !DILocation(line: 154, column: 65, scope: !5) +!57 = !DILocation(line: 159, column: 24, scope: !5) +!58 = !DILocation(line: 397, column: 23, scope: !30, inlinedAt: !44) +!59 = !DILocation(line: 395, column: 24, scope: !30, inlinedAt: !44) +!60 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !44) +!61 = !DILocation(line: 373, column: 23, scope: !30, inlinedAt: !44) +!62 = !DILocation(line: 385, column: 24, scope: !30, inlinedAt: !44) +!63 = !DILocation(line: 381, column: 23, scope: !30, inlinedAt: !44) +!64 = !DILocation(line: 384, column: 24, scope: !30, inlinedAt: !44) +!65 = !DILocation(line: 387, column: 25, scope: !30, inlinedAt: !44) +!66 = !DILocation(line: 388, column: 92, scope: !30, inlinedAt: !44) +!67 = !DILocation(line: 394, column: 25, scope: !30, inlinedAt: !44) +!68 = !DILocation(line: 391, column: 24, scope: !30, inlinedAt: !44) +!69 = !DILocation(line: 392, column: 24, scope: !30, inlinedAt: !44) +!70 = !DILocation(line: 393, column: 39, scope: !30, inlinedAt: !44) +!71 = !DILocation(line: 396, column: 24, scope: !30, inlinedAt: !44) +!72 = !DILocation(line: 404, column: 39, scope: !30, inlinedAt: !44) +!73 = !DILocation(line: 405, column: 25, scope: !30, inlinedAt: !44) +!74 = !DILocation(line: 406, column: 24, scope: !30, inlinedAt: !44) +!75 = !DILocation(line: 407, column: 24, scope: !30, inlinedAt: !44) +!76 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !44) +!77 = !DILocation(line: 414, column: 69, scope: !30, inlinedAt: !44) +!78 = !DILocation(line: 168, column: 27, scope: !79, inlinedAt: !44) +!79 = distinct !DILexicalBlockFile(scope: !5, file: !80, discriminator: 0) +!80 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!81 = !DILocation(line: 189, column: 40, scope: !79, inlinedAt: !44) +!82 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !44) +!83 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !44) +!84 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !44) +!85 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !44) +!86 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !44) +!87 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !44) +!88 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !44) +!89 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !44) +!90 = !DILocation(line: 261, column: 15, scope: !79, inlinedAt: !44) +!91 = !DILocation(line: 291, column: 36, scope: !79, inlinedAt: !44) +!92 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !44) +!93 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !44) +!94 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !44) +!95 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !44) +!96 = !DILocation(line: 548, column: 26, scope: !30, inlinedAt: !44) +!97 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !44) +!98 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !44) +!99 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !44) +!100 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !44) +!101 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !44) +!102 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !44) +!103 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !44) +!104 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !44) +!105 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !44) +!106 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !44) +!107 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !44) +!108 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !44) +!109 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !44) +!110 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !44) +!111 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !44) +!112 = !DILocation(line: 136, column: 19, scope: !5) +!113 = !DILocation(line: 181, column: 35, scope: !5) +!114 = !DILocation(line: 182, column: 27, scope: !5) +!115 = !DILocation(line: 182, column: 41, scope: !5) +!116 = !DILocation(line: 183, column: 51, scope: !5) +!117 = !DILocation(line: 183, column: 32, scope: !5) +!118 = !DILocation(line: 184, column: 49, scope: !5) +!119 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !120) +!120 = !DILocation(line: 198, column: 45, scope: !5) +!121 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !120) +!122 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !120) +!123 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !120) +!124 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !120) +!125 = !DILocation(line: 294, column: 23, scope: !30, inlinedAt: !120) +!126 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !120) +!127 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !120) +!128 = !DILocation(line: 184, column: 69, scope: !5) +!129 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !120) +!130 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !120) +!131 = !DILocation(line: 168, column: 27, scope: !79, inlinedAt: !120) +!132 = !DILocation(line: 189, column: 40, scope: !79, inlinedAt: !120) +!133 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !120) +!134 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !120) +!135 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !120) +!136 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !120) +!137 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !120) +!138 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !120) +!139 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !120) +!140 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !120) +!141 = !DILocation(line: 261, column: 15, scope: !79, inlinedAt: !120) +!142 = !DILocation(line: 291, column: 36, scope: !79, inlinedAt: !120) +!143 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !120) +!144 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !120) +!145 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !120) +!146 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !120) +!147 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !120) +!148 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !120) +!149 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !120) +!150 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !120) +!151 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !120) +!152 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !120) +!153 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !120) +!154 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !120) +!155 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !120) +!156 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !120) +!157 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !120) +!158 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !120) +!159 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !120) +!160 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !120) +!161 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !120) +!162 = !DILocation(line: 206, column: 26, scope: !5) +!163 = !DILocation(line: 206, column: 34, scope: !5) +!164 = !DILocation(line: 208, column: 16, scope: !5) +!165 = !DILocation(line: 214, column: 20, scope: !5) +!166 = !DILocation(line: 218, column: 49, scope: !5) +!167 = !DILocation(line: 218, column: 62, scope: !5) +!168 = !DILocation(line: 218, column: 75, scope: !5) +!169 = !DILocation(line: 218, column: 25, scope: !5) +!170 = !DILocation(line: 218, column: 109, scope: !5) +!171 = !DILocation(line: 223, column: 33, scope: !5) +!172 = !DILocation(line: 222, column: 32, scope: !5) +!173 = !DILocation(line: 222, column: 23, scope: !5) +!174 = !DILocation(line: 222, column: 40, scope: !5) +!175 = !DILocation(line: 223, column: 20, scope: !5) +!176 = !DILocation(line: 225, column: 29, scope: !5) +!177 = !DILocation(line: 229, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4ca903cef91b3ea2351a968ea512499f0fec96bb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ptx @@ -0,0 +1,3160 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_0 // -- Begin function triton_tem_fused_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_0 +.visible .entry triton_tem_fused_0( + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_12 +) +.reqntid 256 +{ + .reg .pred %p<201>; + .reg .b16 %rs<62>; + .reg .b32 %r<4978>; + .reg .b64 %rd<285>; + .loc 1 18 0 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:18:0 + +// %bb.0: + ld.param.b64 %rd29, [triton_tem_fused_0_param_8]; + ld.param.b64 %rd28, [triton_tem_fused_0_param_7]; + ld.param.b64 %rd60, [triton_tem_fused_0_param_0]; +$L__tmp0: + .loc 1 97 28 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:97:28 + mov.u32 %r696, %ctaid.x; + ld.param.b64 %rd61, [triton_tem_fused_0_param_1]; + .loc 1 98 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:98:27 + mov.u32 %r1, %ctaid.y; + ld.param.b64 %rd62, [triton_tem_fused_0_param_2]; + .loc 1 99 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:99:27 + mov.u32 %r2, %ctaid.z; + .loc 1 103 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:103:23 + and.b32 %r697, %r1, 7; + .loc 1 107 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:107:24 + shl.b32 %r3, %r1, 23; + ld.param.b64 %rd63, [triton_tem_fused_0_param_5]; + .loc 1 107 45 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:107:45 + shl.b32 %r4, %r2, 7; + ld.param.b64 %rd64, [triton_tem_fused_0_param_6]; + .loc 1 107 36 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:107:36 + or.b32 %r698, %r3, %r4; + .loc 1 108 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:108:25 + shl.b32 %r699, %r697, 21; + .loc 1 108 47 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:108:47 + shl.b32 %r700, %r2, 16; + ld.param.b64 %rd65, [triton_tem_fused_0_param_9]; + and.b32 %r701, %r700, -262144; + .loc 1 108 37 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:108:37 + add.s32 %r702, %r699, %r701; + .loc 1 111 12 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:111:12 + mad.wide.s32 %rd66, %r698, 2, %rd60; + .loc 1 112 12 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:112:12 + mul.wide.s32 %rd67, %r702, 2; + add.s64 %rd1, %rd61, %rd67; + .loc 1 113 12 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:113:12 + add.s64 %rd2, %rd62, %rd67; + .loc 1 142 51 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:142:51 + shl.b32 %r703, %r697, 4; + .loc 1 142 74 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:142:74 + add.s32 %r704, %r703, %r696; + .loc 1 143 46 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:143:46 + shl.b32 %r705, %r697, 8; + .loc 1 143 97 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:143:97 + shl.b32 %r706, %r696, 4; + .loc 1 143 64 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:143:64 + add.s32 %r707, %r705, %r706; + .loc 1 144 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:144:23 + shl.b32 %r5, %r696, 7; + .loc 1 144 46 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:144:46 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r708, %r6, 240; + bfe.u32 %r8, %r6, 4, 4; + or.b32 %r9, %r8, 16; + or.b32 %r10, %r8, 32; + or.b32 %r11, %r8, 48; + .loc 1 144 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:144:33 + or.b32 %r12, %r8, %r5; + or.b32 %r13, %r9, %r5; + or.b32 %r14, %r10, %r5; + or.b32 %r15, %r11, %r5; + or.b32 %r16, %r12, 64; + or.b32 %r17, %r12, 80; + or.b32 %r18, %r12, 96; + or.b32 %r19, %r12, 112; +$L__tmp1: + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:146:101 ] + shl.b32 %r20, %r12, 12; + shl.b32 %r21, %r13, 12; + shl.b32 %r22, %r14, 12; + shl.b32 %r23, %r15, 12; + shl.b32 %r24, %r16, 12; + shl.b32 %r25, %r17, 12; + shl.b32 %r26, %r18, 12; + shl.b32 %r27, %r19, 12; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:146:101 ] + mad.wide.s32 %rd68, %r20, 2, %rd66; + mad.wide.s32 %rd69, %r21, 2, %rd66; + mad.wide.s32 %rd70, %r22, 2, %rd66; + mad.wide.s32 %rd71, %r23, 2, %rd66; + mad.wide.s32 %rd72, %r24, 2, %rd66; + mad.wide.s32 %rd73, %r25, 2, %rd66; + mad.wide.s32 %rd74, %r26, 2, %rd66; + mad.wide.s32 %rd75, %r27, 2, %rd66; + .loc 1 284 56 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:56 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:146:101 ] + shl.b32 %r715, %r6, 3; + and.b32 %r716, %r715, 120; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:146:101 ] + cvt.u64.u32 %rd3, %r716; + mul.wide.u32 %rd76, %r716, 2; + add.s64 %rd31, %rd68, %rd76; + add.s64 %rd32, %rd69, %rd76; + add.s64 %rd33, %rd70, %rd76; + add.s64 %rd34, %rd71, %rd76; + add.s64 %rd35, %rd72, %rd76; + add.s64 %rd36, %rd73, %rd76; + add.s64 %rd37, %rd74, %rd76; + add.s64 %rd38, %rd75, %rd76; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:146:101 ] + // begin inline asm + mov.u32 %r629, 0x0; + mov.u32 %r630, 0x0; + mov.u32 %r631, 0x0; + mov.u32 %r632, 0x0; + ld.global.v4.b32 { %r629, %r630, %r631, %r632 }, [ %rd31 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r633, 0x0; + mov.u32 %r634, 0x0; + mov.u32 %r635, 0x0; + mov.u32 %r636, 0x0; + ld.global.v4.b32 { %r633, %r634, %r635, %r636 }, [ %rd32 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r637, 0x0; + mov.u32 %r638, 0x0; + mov.u32 %r639, 0x0; + mov.u32 %r640, 0x0; + ld.global.v4.b32 { %r637, %r638, %r639, %r640 }, [ %rd33 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r641, 0x0; + mov.u32 %r642, 0x0; + mov.u32 %r643, 0x0; + mov.u32 %r644, 0x0; + ld.global.v4.b32 { %r641, %r642, %r643, %r644 }, [ %rd34 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r645, 0x0; + mov.u32 %r646, 0x0; + mov.u32 %r647, 0x0; + mov.u32 %r648, 0x0; + ld.global.v4.b32 { %r645, %r646, %r647, %r648 }, [ %rd35 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r649, 0x0; + mov.u32 %r650, 0x0; + mov.u32 %r651, 0x0; + mov.u32 %r652, 0x0; + ld.global.v4.b32 { %r649, %r650, %r651, %r652 }, [ %rd36 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r653, 0x0; + mov.u32 %r654, 0x0; + mov.u32 %r655, 0x0; + mov.u32 %r656, 0x0; + ld.global.v4.b32 { %r653, %r654, %r655, %r656 }, [ %rd37 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r657, 0x0; + mov.u32 %r658, 0x0; + mov.u32 %r659, 0x0; + mov.u32 %r660, 0x0; + ld.global.v4.b32 { %r657, %r658, %r659, %r660 }, [ %rd38 + 0 ]; + // end inline asm + and.b32 %r28, %r6, 7; + shl.b32 %r717, %r28, 4; + shl.b32 %r718, %r708, 3; + and.b32 %r29, %r6, 112; + and.b32 %r30, %r6, 8; + shl.b32 %r719, %r30, 11; + or.b32 %r720, %r717, %r718; + xor.b32 %r721, %r720, %r29; + mov.b32 %r722, global_smem; + add.s32 %r723, %r722, %r719; + add.s32 %r724, %r723, %r721; + st.shared.v4.b32 [%r724+98304], {%r629, %r630, %r631, %r632}; + st.shared.v4.b32 [%r724+100352], {%r633, %r634, %r635, %r636}; + st.shared.v4.b32 [%r724+102400], {%r637, %r638, %r639, %r640}; + st.shared.v4.b32 [%r724+104448], {%r641, %r642, %r643, %r644}; + st.shared.v4.b32 [%r724+106496], {%r645, %r646, %r647, %r648}; + st.shared.v4.b32 [%r724+108544], {%r649, %r650, %r651, %r652}; + st.shared.v4.b32 [%r724+110592], {%r653, %r654, %r655, %r656}; + st.shared.v4.b32 [%r724+112640], {%r657, %r658, %r659, %r660}; +$L__tmp2: + .loc 1 151 26 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:151:26 + cvt.s64.s32 %rd4, %r707; + mad.wide.s32 %rd39, %r707, 4, %rd64; + .loc 1 152 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:152:23 + // begin inline asm + mov.u32 %r661, 0x0; + ld.global.b32 { %r661 }, [ %rd39 + 0 ]; + // end inline asm + .loc 1 152 37 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:152:37 + shl.b32 %r31, %r661, 7; + .loc 1 153 42 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:153:42 + cvt.s64.s32 %rd6, %r704; + mad.wide.s32 %rd40, %r704, 4, %rd63; + .loc 1 153 28 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:153:28 + // begin inline asm + mov.u32 %r662, 0x0; + ld.global.b32 { %r662 }, [ %rd40 + 0 ]; + // end inline asm + .loc 1 154 45 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:154:45 + shl.b32 %r33, %r662, 1; + .loc 1 159 37 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:159:37 + and.b32 %r34, %r6, 3; +$L__tmp3: + .loc 1 376 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:376:33 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mad.wide.u32 %rd42, %r1, 8, %rd65; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.lt.s32 %p2, %r33, 1; + setp.gt.s32 %p1, %r33, 0; + .loc 1 376 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:376:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + // begin inline asm + mov.u64 %rd7, 0x0; + @%p1 ld.global.b64 { %rd7 }, [ %rd42 + 0 ]; + // end inline asm + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + or.b32 %r725, %r31, %r8; + or.b32 %r726, %r31, %r9; + or.b32 %r727, %r31, %r10; + or.b32 %r728, %r31, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r729, %r725, 7; + shl.b32 %r730, %r726, 7; + shl.b32 %r731, %r727, 7; + shl.b32 %r732, %r728, 7; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.wide.s32 %rd77, %r729, 2; + mul.wide.s32 %rd78, %r730, 2; + mul.wide.s32 %rd79, %r731, 2; + mul.wide.s32 %rd80, %r732, 2; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd10, %rd1, %rd76; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd43, %rd10, %rd77; + add.s64 %rd44, %rd10, %rd78; + add.s64 %rd45, %rd10, %rd79; + add.s64 %rd46, %rd10, %rd80; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r733, %r30, 10; + or.b32 %r35, %r721, %r733; + add.s32 %r2699, %r722, %r35; + selp.b32 %r664, 16, 0, %p1; + // begin inline asm + cp.async.cg.shared.global [ %r2699 + 0 ], [ %rd43 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2701, %r2699, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r2701 + 0 ], [ %rd44 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2703, %r2699, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r2703 + 0 ], [ %rd45 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r2705, %r2699, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r2705 + 0 ], [ %rd46 + 0 ], 0x10, %r664; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd11, %rd2, %rd76; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd47, %rd11, %rd77; + add.s64 %rd48, %rd11, %rd78; + add.s64 %rd49, %rd11, %rd79; + add.s64 %rd50, %rd11, %rd80; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r671, %r2699, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r671 + 0 ], [ %rd47 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r673, %r2699, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r673 + 0 ], [ %rd48 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r675, %r2699, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r675 + 0 ], [ %rd49 + 0 ], 0x10, %r664; + // end inline asm + add.s32 %r677, %r2699, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r677 + 0 ], [ %rd50 + 0 ], 0x10, %r664; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.gt.s32 %p3, %r33, 1; + .loc 1 342 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:342:32 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + or.b32 %r734, %r31, 64; + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + or.b32 %r735, %r734, %r8; + or.b32 %r736, %r734, %r9; + or.b32 %r737, %r734, %r10; + or.b32 %r738, %r734, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r739, %r735, 7; + shl.b32 %r740, %r736, 7; + shl.b32 %r741, %r737, 7; + shl.b32 %r742, %r738, 7; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.wide.s32 %rd81, %r739, 2; + mul.wide.s32 %rd82, %r740, 2; + mul.wide.s32 %rd83, %r741, 2; + mul.wide.s32 %rd84, %r742, 2; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd51, %rd10, %rd81; + add.s64 %rd52, %rd10, %rd82; + add.s64 %rd53, %rd10, %rd83; + add.s64 %rd54, %rd10, %rd84; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + bar.sync 0; + add.s32 %r679, %r2699, 16384; + selp.b32 %r680, 16, 0, %p3; + // begin inline asm + cp.async.cg.shared.global [ %r679 + 0 ], [ %rd51 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r681, %r2699, 18432; + // begin inline asm + cp.async.cg.shared.global [ %r681 + 0 ], [ %rd52 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r683, %r2699, 20480; + // begin inline asm + cp.async.cg.shared.global [ %r683 + 0 ], [ %rd53 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r685, %r2699, 22528; + // begin inline asm + cp.async.cg.shared.global [ %r685 + 0 ], [ %rd54 + 0 ], 0x10, %r680; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd55, %rd11, %rd81; + add.s64 %rd56, %rd11, %rd82; + add.s64 %rd57, %rd11, %rd83; + add.s64 %rd58, %rd11, %rd84; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r687, %r2699, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r687 + 0 ], [ %rd55 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r689, %r2699, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r689 + 0 ], [ %rd56 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r691, %r2699, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r691 + 0 ], [ %rd57 + 0 ], 0x10, %r680; + // end inline asm + add.s32 %r693, %r2699, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r693 + 0 ], [ %rd58 + 0 ], 0x10, %r680; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:351:19 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r743, 0fFF800000; + mov.b64 %rd284, {%r743, %r743}; + mov.b32 %r1567, 0f00000000; + mov.b32 %r1568, %r1567; + mov.b32 %r1569, %r1567; + mov.b32 %r1570, %r1567; + mov.b32 %r1571, %r1567; + mov.b32 %r1572, %r1567; + mov.b32 %r1573, %r1567; + mov.b32 %r1574, %r1567; + mov.b32 %r1575, %r1567; + mov.b32 %r1576, %r1567; + mov.b32 %r1577, %r1567; + mov.b32 %r1578, %r1567; + mov.b32 %r1579, %r1567; + mov.b32 %r1580, %r1567; + mov.b32 %r1581, %r1567; + mov.b32 %r1582, %r1567; + mov.b32 %r1583, %r1567; + mov.b32 %r1584, %r1567; + mov.b32 %r1585, %r1567; + mov.b32 %r1586, %r1567; + mov.b32 %r1587, %r1567; + mov.b32 %r1588, %r1567; + mov.b32 %r1589, %r1567; + mov.b32 %r1590, %r1567; + mov.b32 %r1591, %r1567; + mov.b32 %r1592, %r1567; + mov.b32 %r1593, %r1567; + mov.b32 %r1594, %r1567; + mov.b32 %r1595, %r1567; + mov.b32 %r1596, %r1567; + mov.b32 %r1597, %r1567; + mov.b32 %r1598, %r1567; + mov.b32 %r1599, %r1567; + mov.b32 %r1600, %r1567; + mov.b32 %r1601, %r1567; + mov.b32 %r1602, %r1567; + mov.b32 %r1603, %r1567; + mov.b32 %r1604, %r1567; + mov.b32 %r1605, %r1567; + mov.b32 %r1606, %r1567; + mov.b32 %r1607, %r1567; + mov.b32 %r1608, %r1567; + mov.b32 %r1609, %r1567; + mov.b32 %r1610, %r1567; + mov.b32 %r1611, %r1567; + mov.b32 %r1612, %r1567; + mov.b32 %r1613, %r1567; + mov.b32 %r1614, %r1567; + mov.b32 %r1615, %r1567; + mov.b32 %r1616, %r1567; + mov.b32 %r1617, %r1567; + mov.b32 %r1618, %r1567; + mov.b32 %r1619, %r1567; + mov.b32 %r1620, %r1567; + mov.b32 %r1621, %r1567; + mov.b32 %r1622, %r1567; + mov.b32 %r1623, %r1567; + mov.b32 %r1624, %r1567; + mov.b32 %r1625, %r1567; + mov.b32 %r1626, %r1567; + mov.b32 %r1627, %r1567; + mov.b32 %r1628, %r1567; + mov.b32 %r1629, %r1567; + mov.b32 %r1630, %r1567; + mov.b32 %r4974, %r1567; + mov.b32 %r4975, %r1567; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + @%p2 bra $L__BB0_3; +$L__tmp4: +// %bb.1: // %.lr.ph + .loc 1 0 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:0:40 + shr.u32 %r709, %r6, 1; + and.b32 %r710, %r709, 112; + bfe.u32 %r711, %r6, 2, 3; + or.b32 %r712, %r710, %r711; + or.b32 %r713, %r712, %r5; + or.b32 %r714, %r713, 8; + cvt.s64.s32 %rd8, %r713; + cvt.s64.s32 %rd9, %r714; + cvt.u32.u64 %r71, %rd9; + cvt.u32.u64 %r73, %rd8; + .loc 1 154 65 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:154:65 + min.u32 %r52, %r33, 32; + .loc 1 159 37 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:159:37 + shl.b32 %r749, %r34, 1; + or.b32 %r4773, %r31, %r749; + add.s32 %r53, %r52, -2; + add.s32 %r54, %r52, -1; + or.b32 %r4772, %r4773, 1; + or.b32 %r4771, %r4773, 8; + or.b32 %r4770, %r4773, 9; + or.b32 %r4769, %r4773, 16; + or.b32 %r4768, %r4773, 17; + or.b32 %r4767, %r4773, 24; + or.b32 %r4766, %r4773, 25; + or.b32 %r4765, %r4773, 32; + or.b32 %r4764, %r4773, 33; + or.b32 %r4763, %r4773, 40; + or.b32 %r4762, %r4773, 41; + or.b32 %r4761, %r4773, 48; + or.b32 %r4760, %r4773, 49; + or.b32 %r4759, %r4773, 56; + or.b32 %r4758, %r4773, 57; + mov.b64 %rd284, {%r743, %r743}; + mov.b32 %r1366, 0; + mov.b32 %r4974, 0f00000000; + mov.b32 %r4689, 1; + mov.b32 %r4688, -1; + mov.b32 %r4687, 64; + setp.gt.s64 %p17, %rd7, %rd9; + setp.gt.s64 %p18, %rd7, %rd8; + mov.b32 %r4690, %r4687; + mov.b32 %r4975, %r4974; + mov.b32 %r1567, %r4974; + mov.b32 %r1568, %r4974; + mov.b32 %r1569, %r4974; + mov.b32 %r1570, %r4974; + mov.b32 %r1571, %r4974; + mov.b32 %r1572, %r4974; + mov.b32 %r1573, %r4974; + mov.b32 %r1574, %r4974; + mov.b32 %r1575, %r4974; + mov.b32 %r1576, %r4974; + mov.b32 %r1577, %r4974; + mov.b32 %r1578, %r4974; + mov.b32 %r1579, %r4974; + mov.b32 %r1580, %r4974; + mov.b32 %r1581, %r4974; + mov.b32 %r1582, %r4974; + mov.b32 %r1583, %r4974; + mov.b32 %r1584, %r4974; + mov.b32 %r1585, %r4974; + mov.b32 %r1586, %r4974; + mov.b32 %r1587, %r4974; + mov.b32 %r1588, %r4974; + mov.b32 %r1589, %r4974; + mov.b32 %r1590, %r4974; + mov.b32 %r1591, %r4974; + mov.b32 %r1592, %r4974; + mov.b32 %r1593, %r4974; + mov.b32 %r1594, %r4974; + mov.b32 %r1595, %r4974; + mov.b32 %r1596, %r4974; + mov.b32 %r1597, %r4974; + mov.b32 %r1598, %r4974; + mov.b32 %r1599, %r4974; + mov.b32 %r1600, %r4974; + mov.b32 %r1601, %r4974; + mov.b32 %r1602, %r4974; + mov.b32 %r1603, %r4974; + mov.b32 %r1604, %r4974; + mov.b32 %r1605, %r4974; + mov.b32 %r1606, %r4974; + mov.b32 %r1607, %r4974; + mov.b32 %r1608, %r4974; + mov.b32 %r1609, %r4974; + mov.b32 %r1610, %r4974; + mov.b32 %r1611, %r4974; + mov.b32 %r1612, %r4974; + mov.b32 %r1613, %r4974; + mov.b32 %r1614, %r4974; + mov.b32 %r1615, %r4974; + mov.b32 %r1616, %r4974; + mov.b32 %r1617, %r4974; + mov.b32 %r1618, %r4974; + mov.b32 %r1619, %r4974; + mov.b32 %r1620, %r4974; + mov.b32 %r1621, %r4974; + mov.b32 %r1622, %r4974; + mov.b32 %r1623, %r4974; + mov.b32 %r1624, %r4974; + mov.b32 %r1625, %r4974; + mov.b32 %r1626, %r4974; + mov.b32 %r1627, %r4974; + mov.b32 %r1628, %r4974; + mov.b32 %r1629, %r4974; + mov.b32 %r1630, %r4974; + mov.b32 %r4757, %r1366; +$L__BB0_2: // %__nv_exp2f.exit192 + // =>This Inner Loop Header: Depth=1 +$L__tmp5: + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.lt.s32 %p19, %r4757, %r53; + setp.lt.s32 %p15, %r4757, %r54; + add.s32 %r1983, %r4688, 1; + setp.gt.s32 %p20, %r1983, 2; + selp.b32 %r4688, 0, %r1983, %p20; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r1984, %r4688, 14; + add.s32 %r1266, %r722, %r1984; + .loc 1 351 19 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:351:19 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.idx.b32 %r1986, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r1987, %r1986, 11; + and.b32 %r1988, %r1987, 8192; + add.s32 %r1263, %r722, 98304; + add.s32 %r1989, %r1988, %r1263; + bfe.u32 %r1990, %r1989, 4, 14; + cvt.u64.u32 %rd120, %r1990; + or.b64 %rd86, %rd120, 4611686293372403712; + bfe.u32 %r1991, %r1266, 4, 14; + cvt.u64.u32 %rd121, %r1991; + or.b64 %rd87, %rd121, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd86, %rd87, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r1992, %r1989, 32; + bfe.u32 %r1993, %r1992, 4, 14; + cvt.u64.u32 %rd122, %r1993; + or.b64 %rd88, %rd122, 4611686293372403712; + add.s32 %r1994, %r1266, 32; + bfe.u32 %r1995, %r1994, 4, 14; + cvt.u64.u32 %rd123, %r1995; + or.b64 %rd89, %rd123, 4611686293338849280; + mov.pred %p4, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd88, %rd89, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r1996, %r1989, 64; + bfe.u32 %r1997, %r1996, 4, 14; + cvt.u64.u32 %rd124, %r1997; + or.b64 %rd90, %rd124, 4611686293372403712; + add.s32 %r1998, %r1266, 64; + bfe.u32 %r1999, %r1998, 4, 14; + cvt.u64.u32 %rd125, %r1999; + or.b64 %rd91, %rd125, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd90, %rd91, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2000, %r1989, 96; + bfe.u32 %r2001, %r2000, 4, 14; + cvt.u64.u32 %rd126, %r2001; + or.b64 %rd92, %rd126, 4611686293372403712; + add.s32 %r2002, %r1266, 96; + bfe.u32 %r2003, %r2002, 4, 14; + cvt.u64.u32 %rd127, %r2003; + or.b64 %rd93, %rd127, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd92, %rd93, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2004, %r1989, 16384; + bfe.u32 %r2005, %r2004, 4, 14; + cvt.u64.u32 %rd128, %r2005; + or.b64 %rd94, %rd128, 4611686293372403712; + add.s32 %r2006, %r1266, 8192; + bfe.u32 %r2007, %r2006, 4, 14; + cvt.u64.u32 %rd129, %r2007; + or.b64 %rd95, %rd129, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd94, %rd95, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2008, %r1989, 16416; + bfe.u32 %r2009, %r2008, 4, 14; + cvt.u64.u32 %rd130, %r2009; + or.b64 %rd96, %rd130, 4611686293372403712; + add.s32 %r2010, %r1266, 8224; + bfe.u32 %r2011, %r2010, 4, 14; + cvt.u64.u32 %rd131, %r2011; + or.b64 %rd97, %rd131, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd96, %rd97, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2012, %r1989, 16448; + bfe.u32 %r2013, %r2012, 4, 14; + cvt.u64.u32 %rd132, %r2013; + or.b64 %rd98, %rd132, 4611686293372403712; + add.s32 %r2014, %r1266, 8256; + bfe.u32 %r2015, %r2014, 4, 14; + cvt.u64.u32 %rd133, %r2015; + or.b64 %rd99, %rd133, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd98, %rd99, %p4, 1, 1, 0, 0; + // end inline asm + add.s32 %r2016, %r1989, 16480; + bfe.u32 %r2017, %r2016, 4, 14; + cvt.u64.u32 %rd134, %r2017; + or.b64 %rd100, %rd134, 4611686293372403712; + add.s32 %r2018, %r1266, 8288; + bfe.u32 %r2019, %r2018, 4, 14; + cvt.u64.u32 %rd135, %r2019; + or.b64 %rd101, %rd135, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878}, %rd100, %rd101, %p4, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r1264, %r1366; + mov.b32 %r1265, %r1366; + mov.b32 %r1267, %r1366; + mov.b32 %r1268, %r1366; + // begin inline asm + // wait for regs: %r847,%r848,%r849,%r850,%r851,%r852,%r853,%r854,%r855,%r856,%r857,%r858,%r859,%r860,%r861,%r862,%r863,%r864,%r865,%r866,%r867,%r868,%r869,%r870,%r871,%r872,%r873,%r874,%r875,%r876,%r877,%r878,%r1263,%r1264,%r1265,%r1266,%r1267,%r1268,%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:353:14 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2020, %r847, 0f3DB504F3; + mul.f32 %r2021, %r848, 0f3DB504F3; + mul.f32 %r2022, %r849, 0f3DB504F3; + mul.f32 %r2023, %r850, 0f3DB504F3; + mul.f32 %r2024, %r851, 0f3DB504F3; + mul.f32 %r2025, %r852, 0f3DB504F3; + mul.f32 %r2026, %r853, 0f3DB504F3; + mul.f32 %r2027, %r854, 0f3DB504F3; + mul.f32 %r2028, %r855, 0f3DB504F3; + mul.f32 %r2029, %r856, 0f3DB504F3; + mul.f32 %r2030, %r857, 0f3DB504F3; + mul.f32 %r2031, %r858, 0f3DB504F3; + mul.f32 %r2032, %r859, 0f3DB504F3; + mul.f32 %r2033, %r860, 0f3DB504F3; + mul.f32 %r2034, %r861, 0f3DB504F3; + mul.f32 %r2035, %r862, 0f3DB504F3; + mul.f32 %r2036, %r863, 0f3DB504F3; + mul.f32 %r2037, %r864, 0f3DB504F3; + mul.f32 %r2038, %r865, 0f3DB504F3; + mul.f32 %r2039, %r866, 0f3DB504F3; + mul.f32 %r2040, %r867, 0f3DB504F3; + mul.f32 %r2041, %r868, 0f3DB504F3; + mul.f32 %r2042, %r869, 0f3DB504F3; + mul.f32 %r2043, %r870, 0f3DB504F3; + mul.f32 %r2044, %r871, 0f3DB504F3; + mul.f32 %r2045, %r872, 0f3DB504F3; + mul.f32 %r2046, %r873, 0f3DB504F3; + mul.f32 %r2047, %r874, 0f3DB504F3; + mul.f32 %r2048, %r875, 0f3DB504F3; + mul.f32 %r2049, %r876, 0f3DB504F3; + mul.f32 %r2050, %r877, 0f3DB504F3; + mul.f32 %r2051, %r878, 0f3DB504F3; + .loc 1 373 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:373:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.ge.s32 %p21, %r73, %r4773; + setp.ge.s32 %p22, %r73, %r4772; + setp.ge.s32 %p23, %r71, %r4773; + setp.ge.s32 %p24, %r71, %r4772; + setp.ge.s32 %p25, %r73, %r4771; + setp.ge.s32 %p26, %r73, %r4770; + setp.ge.s32 %p27, %r71, %r4771; + setp.ge.s32 %p28, %r71, %r4770; + setp.ge.s32 %p29, %r73, %r4769; + setp.ge.s32 %p30, %r73, %r4768; + setp.ge.s32 %p31, %r71, %r4769; + setp.ge.s32 %p32, %r71, %r4768; + setp.ge.s32 %p33, %r73, %r4767; + setp.ge.s32 %p34, %r73, %r4766; + setp.ge.s32 %p35, %r71, %r4767; + setp.ge.s32 %p36, %r71, %r4766; + setp.ge.s32 %p37, %r73, %r4765; + setp.ge.s32 %p38, %r73, %r4764; + setp.ge.s32 %p39, %r71, %r4765; + setp.ge.s32 %p40, %r71, %r4764; + setp.ge.s32 %p41, %r73, %r4763; + setp.ge.s32 %p42, %r73, %r4762; + setp.ge.s32 %p43, %r71, %r4763; + setp.ge.s32 %p44, %r71, %r4762; + setp.ge.s32 %p45, %r73, %r4761; + setp.ge.s32 %p46, %r73, %r4760; + setp.ge.s32 %p47, %r71, %r4761; + setp.ge.s32 %p48, %r71, %r4760; + setp.ge.s32 %p49, %r73, %r4759; + setp.ge.s32 %p50, %r73, %r4758; + setp.ge.s32 %p51, %r71, %r4759; + setp.ge.s32 %p52, %r71, %r4758; + .loc 1 384 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:384:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.gt.s32 %p53, %r4773, 2047; + setp.gt.s32 %p54, %r4771, 2047; + setp.gt.s32 %p55, %r4769, 2047; + setp.gt.s32 %p56, %r4767, 2047; + setp.gt.s32 %p57, %r4765, 2047; + setp.gt.s32 %p58, %r4763, 2047; + setp.gt.s32 %p59, %r4761, 2047; + setp.gt.s32 %p60, %r4759, 2047; + .loc 1 385 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:385:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shr.s32 %r2052, %r4758, 31; + shr.u32 %r2053, %r2052, 21; + add.s32 %r2054, %r4758, %r2053; + and.b32 %r2055, %r2054, -2048; + sub.s32 %r2056, %r4758, %r2055; + shr.s32 %r2057, %r4760, 31; + shr.u32 %r2058, %r2057, 21; + add.s32 %r2059, %r4760, %r2058; + and.b32 %r2060, %r2059, -2048; + sub.s32 %r2061, %r4760, %r2060; + shr.s32 %r2062, %r4762, 31; + shr.u32 %r2063, %r2062, 21; + add.s32 %r2064, %r4762, %r2063; + and.b32 %r2065, %r2064, -2048; + sub.s32 %r2066, %r4762, %r2065; + shr.s32 %r2067, %r4764, 31; + shr.u32 %r2068, %r2067, 21; + add.s32 %r2069, %r4764, %r2068; + and.b32 %r2070, %r2069, -2048; + sub.s32 %r2071, %r4764, %r2070; + shr.s32 %r2072, %r4766, 31; + shr.u32 %r2073, %r2072, 21; + add.s32 %r2074, %r4766, %r2073; + and.b32 %r2075, %r2074, -2048; + sub.s32 %r2076, %r4766, %r2075; + shr.s32 %r2077, %r4768, 31; + shr.u32 %r2078, %r2077, 21; + add.s32 %r2079, %r4768, %r2078; + and.b32 %r2080, %r2079, -2048; + sub.s32 %r2081, %r4768, %r2080; + shr.s32 %r2082, %r4770, 31; + shr.u32 %r2083, %r2082, 21; + add.s32 %r2084, %r4770, %r2083; + and.b32 %r2085, %r2084, -2048; + sub.s32 %r2086, %r4770, %r2085; + shr.s32 %r2087, %r4772, 31; + shr.u32 %r2088, %r2087, 21; + add.s32 %r2089, %r4772, %r2088; + and.b32 %r2090, %r2089, -2048; + sub.s32 %r2091, %r4772, %r2090; + .loc 1 387 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:387:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.ne.b32 %p61, %r2091, 0; + setp.ne.b32 %p62, %r2086, 0; + setp.ne.b32 %p63, %r2081, 0; + setp.ne.b32 %p64, %r2076, 0; + setp.ne.b32 %p65, %r2071, 0; + setp.ne.b32 %p66, %r2066, 0; + setp.ne.b32 %p67, %r2061, 0; + setp.ne.b32 %p68, %r2056, 0; + .loc 1 388 92 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:388:92 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + and.b32 %r2092, %r4759, -2147481601; + and.b32 %r2093, %r4761, -2147481601; + and.b32 %r2094, %r4763, -2147481601; + and.b32 %r2095, %r4765, -2147481601; + and.b32 %r2096, %r4767, -2147481601; + and.b32 %r2097, %r4769, -2147481601; + and.b32 %r2098, %r4771, -2147481601; + and.b32 %r2099, %r4773, -2147481601; + setp.gt.u32 %p69, %r2099, -2147483648; + setp.gt.u32 %p70, %r2098, -2147483648; + setp.gt.u32 %p71, %r2097, -2147483648; + setp.gt.u32 %p72, %r2096, -2147483648; + setp.gt.u32 %p73, %r2095, -2147483648; + setp.gt.u32 %p74, %r2094, -2147483648; + setp.gt.u32 %p75, %r2093, -2147483648; + setp.gt.u32 %p76, %r2092, -2147483648; + .loc 1 394 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:394:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + prmt.b32 %r2100, %r4771, %r4773, 0x5410U; + prmt.b32 %r2101, %r4767, %r4769, 0x5410U; + prmt.b32 %r2102, %r4763, %r4765, 0x5410U; + prmt.b32 %r2103, %r4759, %r4761, 0x5410U; + and.b32 %r2104, %r2103, 134154239; + and.b32 %r2105, %r2102, 134154239; + and.b32 %r2106, %r2101, 134154239; + and.b32 %r2107, %r2100, 134154239; + .loc 1 395 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:395:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mov.b32 {%rs1, %rs2}, %r2107; + cvt.u64.u16 %rd136, %rs2; + cvt.u64.u16 %rd137, %rs1; + mov.b32 {%rs3, %rs4}, %r2106; + cvt.u64.u16 %rd138, %rs4; + cvt.u64.u16 %rd139, %rs3; + mov.b32 {%rs5, %rs6}, %r2105; + cvt.u64.u16 %rd140, %rs6; + cvt.u64.u16 %rd141, %rs5; + mov.b32 {%rs7, %rs8}, %r2104; + cvt.u64.u16 %rd142, %rs8; + cvt.u64.u16 %rd143, %rs7; + setp.gt.s64 %p77, %rd7, %rd143; + setp.gt.s64 %p78, %rd7, %rd142; + setp.gt.s64 %p79, %rd7, %rd141; + setp.gt.s64 %p80, %rd7, %rd140; + setp.gt.s64 %p81, %rd7, %rd139; + setp.gt.s64 %p82, %rd7, %rd138; + setp.gt.s64 %p83, %rd7, %rd137; + setp.gt.s64 %p84, %rd7, %rd136; + .loc 1 392 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:392:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2108, %r2091, 2048; + add.s32 %r2109, %r2086, 2048; + add.s32 %r2110, %r2081, 2048; + add.s32 %r2111, %r2076, 2048; + add.s32 %r2112, %r2071, 2048; + add.s32 %r2113, %r2066, 2048; + add.s32 %r2114, %r2061, 2048; + add.s32 %r2115, %r2056, 2048; + .loc 1 393 39 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:393:39 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.b32 %r2116, %r2115, %r2056, %p68; + selp.b32 %r2117, %r2116, %r2056, %p76; + selp.b32 %r2118, %r2114, %r2061, %p67; + selp.b32 %r2119, %r2118, %r2061, %p75; + selp.b32 %r2120, %r2113, %r2066, %p66; + selp.b32 %r2121, %r2120, %r2066, %p74; + selp.b32 %r2122, %r2112, %r2071, %p65; + selp.b32 %r2123, %r2122, %r2071, %p73; + selp.b32 %r2124, %r2111, %r2076, %p64; + selp.b32 %r2125, %r2124, %r2076, %p72; + selp.b32 %r2126, %r2110, %r2081, %p63; + selp.b32 %r2127, %r2126, %r2081, %p71; + selp.b32 %r2128, %r2109, %r2086, %p62; + selp.b32 %r2129, %r2128, %r2086, %p70; + selp.b32 %r2130, %r2108, %r2091, %p61; + selp.b32 %r2131, %r2130, %r2091, %p69; + .loc 1 395 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:395:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + cvt.s64.s32 %rd144, %r2131; + cvt.s64.s32 %rd145, %r2129; + cvt.s64.s32 %rd146, %r2127; + cvt.s64.s32 %rd147, %r2125; + cvt.s64.s32 %rd148, %r2123; + cvt.s64.s32 %rd149, %r2121; + cvt.s64.s32 %rd150, %r2119; + cvt.s64.s32 %rd151, %r2117; + setp.gt.s64 %p85, %rd7, %rd151; + setp.gt.s64 %p86, %rd7, %rd150; + setp.gt.s64 %p87, %rd7, %rd149; + setp.gt.s64 %p88, %rd7, %rd148; + setp.gt.s64 %p89, %rd7, %rd147; + setp.gt.s64 %p90, %rd7, %rd146; + setp.gt.s64 %p91, %rd7, %rd145; + setp.gt.s64 %p92, %rd7, %rd144; + .loc 1 396 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:396:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + and.pred %p93, %p53, %p84; + and.pred %p94, %p53, %p92; + and.pred %p95, %p54, %p83; + and.pred %p96, %p54, %p91; + and.pred %p97, %p55, %p82; + and.pred %p98, %p55, %p90; + and.pred %p99, %p56, %p81; + and.pred %p100, %p56, %p89; + and.pred %p101, %p57, %p80; + and.pred %p102, %p57, %p88; + and.pred %p103, %p58, %p79; + and.pred %p104, %p58, %p87; + and.pred %p105, %p59, %p78; + and.pred %p106, %p59, %p86; + and.pred %p107, %p60, %p77; + and.pred %p108, %p60, %p85; + .loc 1 397 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:397:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + sub.s32 %r2132, %r4762, %r73; + sub.s32 %r2133, %r4763, %r73; + sub.s32 %r2134, %r4762, %r71; + sub.s32 %r2135, %r4763, %r71; + sub.s32 %r2136, %r4764, %r73; + sub.s32 %r2137, %r4765, %r73; + sub.s32 %r2138, %r4764, %r71; + sub.s32 %r2139, %r4765, %r71; + sub.s32 %r2140, %r4758, %r73; + sub.s32 %r2141, %r4759, %r73; + sub.s32 %r2142, %r4758, %r71; + sub.s32 %r2143, %r4759, %r71; + sub.s32 %r2144, %r4760, %r73; + sub.s32 %r2145, %r4761, %r73; + sub.s32 %r2146, %r4760, %r71; + sub.s32 %r2147, %r4761, %r71; + sub.s32 %r2148, %r4766, %r71; + sub.s32 %r2149, %r4767, %r71; + sub.s32 %r2150, %r4766, %r73; + sub.s32 %r2151, %r4767, %r73; + sub.s32 %r2152, %r4768, %r71; + sub.s32 %r2153, %r4769, %r71; + sub.s32 %r2154, %r4768, %r73; + sub.s32 %r2155, %r4769, %r73; + sub.s32 %r2156, %r4770, %r71; + sub.s32 %r2157, %r4771, %r71; + sub.s32 %r2158, %r4770, %r73; + sub.s32 %r2159, %r4771, %r73; + sub.s32 %r2160, %r4772, %r71; + sub.s32 %r2161, %r4773, %r71; + sub.s32 %r2162, %r4772, %r73; + sub.s32 %r2163, %r4773, %r73; + .loc 1 404 39 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:404:39 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + and.b32 %r2164, %r2163, 2047; + and.b32 %r2165, %r2162, 2047; + and.b32 %r2166, %r2161, 2047; + and.b32 %r2167, %r2160, 2047; + and.b32 %r2168, %r2159, 2047; + and.b32 %r2169, %r2158, 2047; + and.b32 %r2170, %r2157, 2047; + and.b32 %r2171, %r2156, 2047; + and.b32 %r2172, %r2155, 2047; + and.b32 %r2173, %r2154, 2047; + and.b32 %r2174, %r2153, 2047; + and.b32 %r2175, %r2152, 2047; + and.b32 %r2176, %r2151, 2047; + and.b32 %r2177, %r2150, 2047; + and.b32 %r2178, %r2149, 2047; + and.b32 %r2179, %r2148, 2047; + and.b32 %r2180, %r2147, 2047; + and.b32 %r2181, %r2146, 2047; + and.b32 %r2182, %r2145, 2047; + and.b32 %r2183, %r2144, 2047; + and.b32 %r2184, %r2143, 2047; + and.b32 %r2185, %r2142, 2047; + and.b32 %r2186, %r2141, 2047; + and.b32 %r2187, %r2140, 2047; + and.b32 %r2188, %r2139, 2047; + and.b32 %r2189, %r2138, 2047; + and.b32 %r2190, %r2137, 2047; + and.b32 %r2191, %r2136, 2047; + and.b32 %r2192, %r2135, 2047; + and.b32 %r2193, %r2134, 2047; + and.b32 %r2194, %r2133, 2047; + and.b32 %r2195, %r2132, 2047; + .loc 1 405 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:405:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.eq.b32 %p109, %r2195, 0; + selp.b16 %rs9, 1, 0, %p109; + shl.b16 %rs10, %rs9, 2; + setp.eq.b32 %p110, %r2194, 0; + selp.b16 %rs11, -1, 0, %p110; + shl.b16 %rs12, %rs11, 3; + or.b16 %rs13, %rs12, %rs10; + setp.eq.b32 %p111, %r2193, 0; + selp.b16 %rs14, 1, 0, %p111; + setp.eq.b32 %p112, %r2192, 0; + selp.b16 %rs15, -1, 0, %p112; + shl.b16 %rs16, %rs15, 1; + or.b16 %rs17, %rs14, %rs16; + and.b16 %rs18, %rs17, 3; + or.b16 %rs19, %rs18, %rs13; + and.b16 %rs20, %rs19, 15; + shl.b16 %rs21, %rs20, 8; + setp.eq.b32 %p113, %r2191, 0; + selp.b16 %rs22, 1, 0, %p113; + shl.b16 %rs23, %rs22, 2; + setp.eq.b32 %p114, %r2190, 0; + selp.b16 %rs24, -1, 0, %p114; + shl.b16 %rs25, %rs24, 3; + or.b16 %rs26, %rs25, %rs23; + setp.eq.b32 %p115, %r2189, 0; + selp.b16 %rs27, 1, 0, %p115; + setp.eq.b32 %p116, %r2188, 0; + selp.b16 %rs28, -1, 0, %p116; + shl.b16 %rs29, %rs28, 1; + or.b16 %rs30, %rs27, %rs29; + and.b16 %rs31, %rs30, 3; + or.b16 %rs32, %rs31, %rs26; + shl.b16 %rs33, %rs32, 12; + or.b16 %rs34, %rs33, %rs21; + setp.eq.b32 %p117, %r2187, 0; + selp.b16 %rs35, 1, 0, %p117; + shl.b16 %rs36, %rs35, 2; + setp.eq.b32 %p118, %r2186, 0; + selp.b16 %rs37, -1, 0, %p118; + shl.b16 %rs38, %rs37, 3; + or.b16 %rs39, %rs38, %rs36; + setp.eq.b32 %p119, %r2185, 0; + selp.b16 %rs40, 1, 0, %p119; + setp.eq.b32 %p120, %r2184, 0; + selp.b16 %rs41, -1, 0, %p120; + shl.b16 %rs42, %rs41, 1; + or.b16 %rs43, %rs40, %rs42; + and.b16 %rs44, %rs43, 3; + or.b16 %rs45, %rs44, %rs39; + and.b16 %rs46, %rs45, 15; + setp.eq.b32 %p121, %r2183, 0; + selp.b16 %rs47, 1, 0, %p121; + shl.b16 %rs48, %rs47, 2; + setp.eq.b32 %p122, %r2182, 0; + selp.b16 %rs49, -1, 0, %p122; + shl.b16 %rs50, %rs49, 3; + or.b16 %rs51, %rs50, %rs48; + setp.eq.b32 %p123, %r2181, 0; + selp.b16 %rs52, 1, 0, %p123; + setp.eq.b32 %p124, %r2180, 0; + selp.b16 %rs53, -1, 0, %p124; + shl.b16 %rs54, %rs53, 1; + or.b16 %rs55, %rs52, %rs54; + and.b16 %rs56, %rs55, 3; + or.b16 %rs57, %rs56, %rs51; + shl.b16 %rs58, %rs57, 4; + or.b16 %rs59, %rs46, %rs58; + and.b16 %rs60, %rs59, 255; + or.b16 %rs61, %rs60, %rs34; + cvt.u32.u16 %r2196, %rs61; + setp.eq.b32 %p125, %r2179, 0; + setp.eq.b32 %p126, %r2178, 0; + setp.eq.b32 %p127, %r2177, 0; + setp.eq.b32 %p128, %r2176, 0; + setp.eq.b32 %p129, %r2175, 0; + setp.eq.b32 %p130, %r2174, 0; + setp.eq.b32 %p131, %r2173, 0; + setp.eq.b32 %p132, %r2172, 0; + setp.eq.b32 %p133, %r2171, 0; + setp.eq.b32 %p134, %r2170, 0; + setp.eq.b32 %p135, %r2169, 0; + setp.eq.b32 %p136, %r2168, 0; + setp.eq.b32 %p137, %r2167, 0; + setp.eq.b32 %p138, %r2166, 0; + setp.eq.b32 %p139, %r2165, 0; + setp.eq.b32 %p140, %r2164, 0; + .loc 1 406 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:406:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shr.u32 %r2197, %r2196, 15; + and.b32 %r2198, %r2197, 1; + setp.ne.b32 %p141, %r2198, 0; + shr.u32 %r2199, %r2196, 14; + and.b32 %r2200, %r2199, 1; + setp.ne.b32 %p142, %r2200, 0; + shr.u32 %r2201, %r2196, 13; + and.b32 %r2202, %r2201, 1; + setp.ne.b32 %p143, %r2202, 0; + shr.u32 %r2203, %r2196, 12; + and.b32 %r2204, %r2203, 1; + setp.ne.b32 %p144, %r2204, 0; + shr.u32 %r2205, %r2196, 11; + and.b32 %r2206, %r2205, 1; + setp.ne.b32 %p145, %r2206, 0; + shr.u32 %r2207, %r2196, 10; + and.b32 %r2208, %r2207, 1; + setp.ne.b32 %p146, %r2208, 0; + shr.u32 %r2209, %r2196, 9; + and.b32 %r2210, %r2209, 1; + setp.ne.b32 %p147, %r2210, 0; + shr.u32 %r2211, %r2196, 8; + and.b32 %r2212, %r2211, 1; + setp.ne.b32 %p148, %r2212, 0; + shr.u32 %r2213, %r2196, 7; + and.b32 %r2214, %r2213, 1; + setp.ne.b32 %p149, %r2214, 0; + shr.u32 %r2215, %r2196, 6; + and.b32 %r2216, %r2215, 1; + setp.ne.b32 %p150, %r2216, 0; + shr.u32 %r2217, %r2196, 5; + and.b32 %r2218, %r2217, 1; + setp.ne.b32 %p151, %r2218, 0; + shr.u32 %r2219, %r2196, 4; + and.b32 %r2220, %r2219, 1; + setp.ne.b32 %p152, %r2220, 0; + shr.u32 %r2221, %r2196, 3; + and.b32 %r2222, %r2221, 1; + setp.ne.b32 %p153, %r2222, 0; + shr.u32 %r2223, %r2196, 2; + and.b32 %r2224, %r2223, 1; + setp.ne.b32 %p154, %r2224, 0; + shr.u32 %r2225, %r2196, 1; + and.b32 %r2226, %r2225, 1; + setp.ne.b32 %p155, %r2226, 0; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2227, %r2020, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2228, %r2227, 0fFF800000, %p93; + selp.f32 %r2229, %r2228, 0fFF800000, %p140; + selp.f32 %r2230, %r2227, %r2229, %p21; + selp.f32 %r2231, %r2230, %r2229, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2232, %r2021, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2233, %r2232, 0fFF800000, %p94; + selp.f32 %r2234, %r2233, 0fFF800000, %p139; + selp.f32 %r2235, %r2232, %r2234, %p22; + selp.f32 %r2236, %r2235, %r2234, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2237, %r2022, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2238, %r2237, 0fFF800000, %p93; + selp.f32 %r2239, %r2238, 0fFF800000, %p138; + selp.f32 %r2240, %r2237, %r2239, %p23; + selp.f32 %r2241, %r2240, %r2239, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2242, %r2023, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2243, %r2242, 0fFF800000, %p94; + selp.f32 %r2244, %r2243, 0fFF800000, %p137; + selp.f32 %r2245, %r2242, %r2244, %p24; + selp.f32 %r2246, %r2245, %r2244, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2247, %r2024, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2248, %r2247, 0fFF800000, %p95; + selp.f32 %r2249, %r2248, 0fFF800000, %p136; + selp.f32 %r2250, %r2247, %r2249, %p25; + selp.f32 %r2251, %r2250, %r2249, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2252, %r2025, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2253, %r2252, 0fFF800000, %p96; + selp.f32 %r2254, %r2253, 0fFF800000, %p135; + selp.f32 %r2255, %r2252, %r2254, %p26; + selp.f32 %r2256, %r2255, %r2254, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2257, %r2026, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2258, %r2257, 0fFF800000, %p95; + selp.f32 %r2259, %r2258, 0fFF800000, %p134; + selp.f32 %r2260, %r2257, %r2259, %p27; + selp.f32 %r2261, %r2260, %r2259, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2262, %r2027, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2263, %r2262, 0fFF800000, %p96; + selp.f32 %r2264, %r2263, 0fFF800000, %p133; + selp.f32 %r2265, %r2262, %r2264, %p28; + selp.f32 %r2266, %r2265, %r2264, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2267, %r2028, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2268, %r2267, 0fFF800000, %p97; + selp.f32 %r2269, %r2268, 0fFF800000, %p132; + selp.f32 %r2270, %r2267, %r2269, %p29; + selp.f32 %r2271, %r2270, %r2269, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2272, %r2029, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2273, %r2272, 0fFF800000, %p98; + selp.f32 %r2274, %r2273, 0fFF800000, %p131; + selp.f32 %r2275, %r2272, %r2274, %p30; + selp.f32 %r2276, %r2275, %r2274, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2277, %r2030, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2278, %r2277, 0fFF800000, %p97; + selp.f32 %r2279, %r2278, 0fFF800000, %p130; + selp.f32 %r2280, %r2277, %r2279, %p31; + selp.f32 %r2281, %r2280, %r2279, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2282, %r2031, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2283, %r2282, 0fFF800000, %p98; + selp.f32 %r2284, %r2283, 0fFF800000, %p129; + selp.f32 %r2285, %r2282, %r2284, %p32; + selp.f32 %r2286, %r2285, %r2284, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2287, %r2032, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2288, %r2287, 0fFF800000, %p99; + selp.f32 %r2289, %r2288, 0fFF800000, %p128; + selp.f32 %r2290, %r2287, %r2289, %p33; + selp.f32 %r2291, %r2290, %r2289, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2292, %r2033, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2293, %r2292, 0fFF800000, %p100; + selp.f32 %r2294, %r2293, 0fFF800000, %p127; + selp.f32 %r2295, %r2292, %r2294, %p34; + selp.f32 %r2296, %r2295, %r2294, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2297, %r2034, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2298, %r2297, 0fFF800000, %p99; + selp.f32 %r2299, %r2298, 0fFF800000, %p126; + selp.f32 %r2300, %r2297, %r2299, %p35; + selp.f32 %r2301, %r2300, %r2299, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2302, %r2035, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2303, %r2302, 0fFF800000, %p100; + selp.f32 %r2304, %r2303, 0fFF800000, %p125; + selp.f32 %r2305, %r2302, %r2304, %p36; + selp.f32 %r2306, %r2305, %r2304, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2307, %r2036, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2308, %r2307, 0fFF800000, %p101; + selp.f32 %r2309, %r2308, 0fFF800000, %p141; + selp.f32 %r2310, %r2307, %r2309, %p37; + selp.f32 %r2311, %r2310, %r2309, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2312, %r2037, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2313, %r2312, 0fFF800000, %p102; + selp.f32 %r2314, %r2313, 0fFF800000, %p142; + selp.f32 %r2315, %r2312, %r2314, %p38; + selp.f32 %r2316, %r2315, %r2314, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2317, %r2038, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2318, %r2317, 0fFF800000, %p101; + selp.f32 %r2319, %r2318, 0fFF800000, %p143; + selp.f32 %r2320, %r2317, %r2319, %p39; + selp.f32 %r2321, %r2320, %r2319, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2322, %r2039, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2323, %r2322, 0fFF800000, %p102; + selp.f32 %r2324, %r2323, 0fFF800000, %p144; + selp.f32 %r2325, %r2322, %r2324, %p40; + selp.f32 %r2326, %r2325, %r2324, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2327, %r2040, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2328, %r2327, 0fFF800000, %p103; + selp.f32 %r2329, %r2328, 0fFF800000, %p145; + selp.f32 %r2330, %r2327, %r2329, %p41; + selp.f32 %r2331, %r2330, %r2329, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2332, %r2041, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2333, %r2332, 0fFF800000, %p104; + selp.f32 %r2334, %r2333, 0fFF800000, %p146; + selp.f32 %r2335, %r2332, %r2334, %p42; + selp.f32 %r2336, %r2335, %r2334, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2337, %r2042, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2338, %r2337, 0fFF800000, %p103; + selp.f32 %r2339, %r2338, 0fFF800000, %p147; + selp.f32 %r2340, %r2337, %r2339, %p43; + selp.f32 %r2341, %r2340, %r2339, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2342, %r2043, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2343, %r2342, 0fFF800000, %p104; + selp.f32 %r2344, %r2343, 0fFF800000, %p148; + selp.f32 %r2345, %r2342, %r2344, %p44; + selp.f32 %r2346, %r2345, %r2344, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2347, %r2044, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2348, %r2347, 0fFF800000, %p105; + selp.f32 %r2349, %r2348, 0fFF800000, %p149; + selp.f32 %r2350, %r2347, %r2349, %p45; + selp.f32 %r2351, %r2350, %r2349, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2352, %r2045, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2353, %r2352, 0fFF800000, %p106; + selp.f32 %r2354, %r2353, 0fFF800000, %p150; + selp.f32 %r2355, %r2352, %r2354, %p46; + selp.f32 %r2356, %r2355, %r2354, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2357, %r2046, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2358, %r2357, 0fFF800000, %p105; + selp.f32 %r2359, %r2358, 0fFF800000, %p151; + selp.f32 %r2360, %r2357, %r2359, %p47; + selp.f32 %r2361, %r2360, %r2359, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2362, %r2047, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2363, %r2362, 0fFF800000, %p106; + selp.f32 %r2364, %r2363, 0fFF800000, %p152; + selp.f32 %r2365, %r2362, %r2364, %p48; + selp.f32 %r2366, %r2365, %r2364, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2367, %r2048, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2368, %r2367, 0fFF800000, %p107; + selp.f32 %r2369, %r2368, 0fFF800000, %p153; + selp.f32 %r2370, %r2367, %r2369, %p49; + selp.f32 %r2371, %r2370, %r2369, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2372, %r2049, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2373, %r2372, 0fFF800000, %p108; + selp.f32 %r2374, %r2373, 0fFF800000, %p154; + selp.f32 %r2375, %r2372, %r2374, %p50; + selp.f32 %r2376, %r2375, %r2374, %p18; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2377, %r2050, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2378, %r2377, 0fFF800000, %p107; + selp.f32 %r2379, %r2378, 0fFF800000, %p155; + selp.f32 %r2380, %r2377, %r2379, %p51; + selp.f32 %r2381, %r2380, %r2379, %p17; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r2382, %r2051, 0f3FB8AA3B; + .loc 1 414 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:414:69 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2383, %r2382, 0fFF800000, %p108; + selp.f32 %r2384, %r2383, 0fFF800000, %p119; + selp.f32 %r2385, %r2382, %r2384, %p52; + selp.f32 %r2386, %r2385, %r2384, %p17; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + max.f32 %r2387, %r2231, %r2236; + max.f32 %r2388, %r2241, %r2246; + max.f32 %r2389, %r2387, %r2251; + max.f32 %r2390, %r2389, %r2256; + max.f32 %r2391, %r2388, %r2261; + max.f32 %r2392, %r2391, %r2266; + max.f32 %r2393, %r2390, %r2271; + max.f32 %r2394, %r2393, %r2276; + max.f32 %r2395, %r2392, %r2281; + max.f32 %r2396, %r2395, %r2286; + max.f32 %r2397, %r2394, %r2291; + max.f32 %r2398, %r2397, %r2296; + max.f32 %r2399, %r2396, %r2301; + max.f32 %r2400, %r2399, %r2306; + max.f32 %r2401, %r2398, %r2311; + max.f32 %r2402, %r2401, %r2316; + max.f32 %r2403, %r2400, %r2321; + max.f32 %r2404, %r2403, %r2326; + max.f32 %r2405, %r2402, %r2331; + max.f32 %r2406, %r2405, %r2336; + max.f32 %r2407, %r2404, %r2341; + max.f32 %r2408, %r2407, %r2346; + max.f32 %r2409, %r2406, %r2351; + max.f32 %r2410, %r2409, %r2356; + max.f32 %r2411, %r2408, %r2361; + max.f32 %r2412, %r2411, %r2366; + max.f32 %r2413, %r2410, %r2371; + max.f32 %r2414, %r2413, %r2376; + max.f32 %r2415, %r2412, %r2381; + max.f32 %r2416, %r2415, %r2386; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2417, %r2414, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + max.f32 %r2418, %r2414, %r2417; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2419, %r2418, 1, 31, -1; + shfl.sync.bfly.b32 %r2420, %r2416, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + max.f32 %r2421, %r2416, %r2420; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2422, %r2421, 1, 31, -1; + cvt.u64.u32 %rd152, %r2419; + cvt.u64.u32 %rd153, %r2422; + shl.b64 %rd154, %rd153, 32; + or.b64 %rd155, %rd152, %rd154; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mov.b64 {%r2423, %r2424}, %rd155; + max.f32 %r2425, %r2418, %r2423; + max.f32 %r2426, %r2421, %r2424; + .loc 1 421 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:421:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mov.b64 {%r2427, %r2428}, %rd284; + max.f32 %r2429, %r2428, %r2426; + max.f32 %r2430, %r2427, %r2425; + mov.b64 %rd284, {%r2430, %r2429}; + .loc 1 423 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:423:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.eq.f32 %p156, %r2430, 0fFF800000; + setp.eq.f32 %p157, %r2429, 0fFF800000; + .loc 1 424 51 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:424:51 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + selp.f32 %r2431, 0f00000000, %r2430, %p156; + selp.f32 %r2432, 0f00000000, %r2429, %p157; + .loc 1 428 31 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:428:31 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + sub.f32 %r2433, %r2427, %r2431; + sub.f32 %r2434, %r2428, %r2432; + .loc 1 428 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:428:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + ex2.approx.ftz.f32 %r2435, %r2433; + ex2.approx.ftz.f32 %r2436, %r2434; + .loc 1 429 39 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:429:39 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + sub.f32 %r2437, %r2231, %r2431; + sub.f32 %r2438, %r2236, %r2431; + sub.f32 %r2439, %r2241, %r2432; + sub.f32 %r2440, %r2246, %r2432; + sub.f32 %r2441, %r2251, %r2431; + sub.f32 %r2442, %r2256, %r2431; + sub.f32 %r2443, %r2261, %r2432; + sub.f32 %r2444, %r2266, %r2432; + sub.f32 %r2445, %r2271, %r2431; + sub.f32 %r2446, %r2276, %r2431; + sub.f32 %r2447, %r2281, %r2432; + sub.f32 %r2448, %r2286, %r2432; + sub.f32 %r2449, %r2291, %r2431; + sub.f32 %r2450, %r2296, %r2431; + sub.f32 %r2451, %r2301, %r2432; + sub.f32 %r2452, %r2306, %r2432; + sub.f32 %r2453, %r2311, %r2431; + sub.f32 %r2454, %r2316, %r2431; + sub.f32 %r2455, %r2321, %r2432; + sub.f32 %r2456, %r2326, %r2432; + sub.f32 %r2457, %r2331, %r2431; + sub.f32 %r2458, %r2336, %r2431; + sub.f32 %r2459, %r2341, %r2432; + sub.f32 %r2460, %r2346, %r2432; + sub.f32 %r2461, %r2351, %r2431; + sub.f32 %r2462, %r2356, %r2431; + sub.f32 %r2463, %r2361, %r2432; + sub.f32 %r2464, %r2366, %r2432; + sub.f32 %r2465, %r2371, %r2431; + sub.f32 %r2466, %r2376, %r2431; + sub.f32 %r2467, %r2381, %r2432; + sub.f32 %r2468, %r2386, %r2432; + .loc 1 429 21 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:429:21 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + ex2.approx.ftz.f32 %r2469, %r2437; + ex2.approx.ftz.f32 %r2470, %r2438; + ex2.approx.ftz.f32 %r2471, %r2439; + ex2.approx.ftz.f32 %r2472, %r2440; + ex2.approx.ftz.f32 %r2473, %r2441; + ex2.approx.ftz.f32 %r2474, %r2442; + ex2.approx.ftz.f32 %r2475, %r2443; + ex2.approx.ftz.f32 %r2476, %r2444; + ex2.approx.ftz.f32 %r2477, %r2445; + ex2.approx.ftz.f32 %r2478, %r2446; + ex2.approx.ftz.f32 %r2479, %r2447; + ex2.approx.ftz.f32 %r2480, %r2448; + ex2.approx.ftz.f32 %r2481, %r2449; + ex2.approx.ftz.f32 %r2482, %r2450; + ex2.approx.ftz.f32 %r2483, %r2451; + ex2.approx.ftz.f32 %r2484, %r2452; + ex2.approx.ftz.f32 %r2485, %r2453; + ex2.approx.ftz.f32 %r2486, %r2454; + ex2.approx.ftz.f32 %r2487, %r2455; + ex2.approx.ftz.f32 %r2488, %r2456; + ex2.approx.ftz.f32 %r2489, %r2457; + ex2.approx.ftz.f32 %r2490, %r2458; + ex2.approx.ftz.f32 %r2491, %r2459; + ex2.approx.ftz.f32 %r2492, %r2460; + ex2.approx.ftz.f32 %r2493, %r2461; + ex2.approx.ftz.f32 %r2494, %r2462; + ex2.approx.ftz.f32 %r2495, %r2463; + ex2.approx.ftz.f32 %r2496, %r2464; + ex2.approx.ftz.f32 %r2497, %r2465; + ex2.approx.ftz.f32 %r2498, %r2466; + ex2.approx.ftz.f32 %r2499, %r2467; + ex2.approx.ftz.f32 %r2500, %r2468; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.f32 %r2501, %r2469, %r2470; + add.f32 %r2502, %r2471, %r2472; + add.f32 %r2503, %r2501, %r2473; + add.f32 %r2504, %r2503, %r2474; + add.f32 %r2505, %r2502, %r2475; + add.f32 %r2506, %r2505, %r2476; + add.f32 %r2507, %r2504, %r2477; + add.f32 %r2508, %r2507, %r2478; + add.f32 %r2509, %r2506, %r2479; + add.f32 %r2510, %r2509, %r2480; + add.f32 %r2511, %r2508, %r2481; + add.f32 %r2512, %r2511, %r2482; + add.f32 %r2513, %r2510, %r2483; + add.f32 %r2514, %r2513, %r2484; + add.f32 %r2515, %r2512, %r2485; + add.f32 %r2516, %r2515, %r2486; + add.f32 %r2517, %r2514, %r2487; + add.f32 %r2518, %r2517, %r2488; + add.f32 %r2519, %r2516, %r2489; + add.f32 %r2520, %r2519, %r2490; + add.f32 %r2521, %r2518, %r2491; + add.f32 %r2522, %r2521, %r2492; + add.f32 %r2523, %r2520, %r2493; + add.f32 %r2524, %r2523, %r2494; + add.f32 %r2525, %r2522, %r2495; + add.f32 %r2526, %r2525, %r2496; + add.f32 %r2527, %r2524, %r2497; + add.f32 %r2528, %r2527, %r2498; + add.f32 %r2529, %r2526, %r2499; + add.f32 %r2530, %r2529, %r2500; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2531, %r2528, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.f32 %r2532, %r2528, %r2531; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2533, %r2532, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.f32 %r2534, %r2532, %r2533; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2535, %r2530, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.f32 %r2536, %r2530, %r2535; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shfl.sync.bfly.b32 %r2537, %r2536, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.f32 %r2538, %r2536, %r2537; + .loc 1 434 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:434:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + fma.rn.f32 %r4974, %r4974, %r2435, %r2534; + fma.rn.f32 %r4975, %r4975, %r2436, %r2538; + .loc 1 436 16 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:436:16 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.f32 %r1567, %r1567, %r2435; + mul.f32 %r1568, %r1568, %r2435; + mul.f32 %r1569, %r1569, %r2436; + mul.f32 %r1570, %r1570, %r2436; + mul.f32 %r1571, %r1571, %r2435; + mul.f32 %r1572, %r1572, %r2435; + mul.f32 %r1573, %r1573, %r2436; + mul.f32 %r1574, %r1574, %r2436; + mul.f32 %r1575, %r1575, %r2435; + mul.f32 %r1576, %r1576, %r2435; + mul.f32 %r1577, %r1577, %r2436; + mul.f32 %r1578, %r1578, %r2436; + mul.f32 %r1579, %r1579, %r2435; + mul.f32 %r1580, %r1580, %r2435; + mul.f32 %r1581, %r1581, %r2436; + mul.f32 %r1582, %r1582, %r2436; + mul.f32 %r1583, %r1583, %r2435; + mul.f32 %r1584, %r1584, %r2435; + mul.f32 %r1585, %r1585, %r2436; + mul.f32 %r1586, %r1586, %r2436; + mul.f32 %r1587, %r1587, %r2435; + mul.f32 %r1588, %r1588, %r2435; + mul.f32 %r1589, %r1589, %r2436; + mul.f32 %r1590, %r1590, %r2436; + mul.f32 %r1591, %r1591, %r2435; + mul.f32 %r1592, %r1592, %r2435; + mul.f32 %r1593, %r1593, %r2436; + mul.f32 %r1594, %r1594, %r2436; + mul.f32 %r1595, %r1595, %r2435; + mul.f32 %r1596, %r1596, %r2435; + mul.f32 %r1597, %r1597, %r2436; + mul.f32 %r1598, %r1598, %r2436; + mul.f32 %r1599, %r1599, %r2435; + mul.f32 %r1600, %r1600, %r2435; + mul.f32 %r1601, %r1601, %r2436; + mul.f32 %r1602, %r1602, %r2436; + mul.f32 %r1603, %r1603, %r2435; + mul.f32 %r1604, %r1604, %r2435; + mul.f32 %r1605, %r1605, %r2436; + mul.f32 %r1606, %r1606, %r2436; + mul.f32 %r1607, %r1607, %r2435; + mul.f32 %r1608, %r1608, %r2435; + mul.f32 %r1609, %r1609, %r2436; + mul.f32 %r1610, %r1610, %r2436; + mul.f32 %r1611, %r1611, %r2435; + mul.f32 %r1612, %r1612, %r2435; + mul.f32 %r1613, %r1613, %r2436; + mul.f32 %r1614, %r1614, %r2436; + mul.f32 %r1615, %r1615, %r2435; + mul.f32 %r1616, %r1616, %r2435; + mul.f32 %r1617, %r1617, %r2436; + mul.f32 %r1618, %r1618, %r2436; + mul.f32 %r1619, %r1619, %r2435; + mul.f32 %r1620, %r1620, %r2435; + mul.f32 %r1621, %r1621, %r2436; + mul.f32 %r1622, %r1622, %r2436; + mul.f32 %r1623, %r1623, %r2435; + mul.f32 %r1624, %r1624, %r2435; + mul.f32 %r1625, %r1625, %r2436; + mul.f32 %r1626, %r1626, %r2436; + mul.f32 %r1627, %r1627, %r2435; + mul.f32 %r1628, %r1628, %r2435; + mul.f32 %r1629, %r1629, %r2436; + mul.f32 %r1630, %r1630, %r2436; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2539, %r722, 49152; + add.s32 %r2540, %r2539, %r1984; + .loc 1 440 22 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:440:22 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + cvt.rn.bf16x2.f32 %r1563, %r2470, %r2469; + cvt.rn.bf16x2.f32 %r1564, %r2472, %r2471; + cvt.rn.bf16x2.f32 %r1565, %r2474, %r2473; + cvt.rn.bf16x2.f32 %r1566, %r2476, %r2475; + cvt.rn.bf16x2.f32 %r1695, %r2478, %r2477; + cvt.rn.bf16x2.f32 %r1696, %r2480, %r2479; + cvt.rn.bf16x2.f32 %r1697, %r2482, %r2481; + cvt.rn.bf16x2.f32 %r1698, %r2484, %r2483; + cvt.rn.bf16x2.f32 %r1827, %r2486, %r2485; + cvt.rn.bf16x2.f32 %r1828, %r2488, %r2487; + cvt.rn.bf16x2.f32 %r1829, %r2490, %r2489; + cvt.rn.bf16x2.f32 %r1830, %r2492, %r2491; + cvt.rn.bf16x2.f32 %r1959, %r2494, %r2493; + cvt.rn.bf16x2.f32 %r1960, %r2496, %r2495; + cvt.rn.bf16x2.f32 %r1961, %r2498, %r2497; + cvt.rn.bf16x2.f32 %r1962, %r2500, %r2499; + .loc 1 440 44 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:440:44 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + wgmma.fence.sync.aligned; + bfe.u32 %r2541, %r2540, 4, 14; + cvt.u64.u32 %rd156, %r2541; + or.b64 %rd102, %rd156, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1563,%r1564,%r1565,%r1566}, %rd102, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2542, %r2540, 2048; + bfe.u32 %r2543, %r2542, 4, 14; + cvt.u64.u32 %rd157, %r2543; + or.b64 %rd103, %rd157, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1695,%r1696,%r1697,%r1698}, %rd103, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2544, %r2540, 4096; + bfe.u32 %r2545, %r2544, 4, 14; + cvt.u64.u32 %rd158, %r2545; + or.b64 %rd104, %rd158, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1827,%r1828,%r1829,%r1830}, %rd104, %p4, 1, 1, 1; + // end inline asm + add.s32 %r2546, %r2540, 6144; + bfe.u32 %r2547, %r2546, 4, 14; + cvt.u64.u32 %rd159, %r2547; + or.b64 %rd105, %rd159, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r1959,%r1960,%r1961,%r1962}, %rd105, %p4, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:548:26 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r4758, %r4687, %r4758; + add.s32 %r4759, %r4687, %r4759; + add.s32 %r4760, %r4687, %r4760; + add.s32 %r4761, %r4687, %r4761; + add.s32 %r4762, %r4687, %r4762; + add.s32 %r4763, %r4687, %r4763; + add.s32 %r4764, %r4687, %r4764; + add.s32 %r4765, %r4687, %r4765; + add.s32 %r4766, %r4687, %r4766; + add.s32 %r4767, %r4687, %r4767; + add.s32 %r4768, %r4687, %r4768; + add.s32 %r4769, %r4687, %r4769; + add.s32 %r4770, %r4687, %r4770; + add.s32 %r4771, %r4687, %r4771; + add.s32 %r4772, %r4687, %r4772; + add.s32 %r4773, %r4687, %r4773; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r273, %r4757, 1; + .loc 1 247 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:247:33 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shr.u32 %r2548, %r273, 1; + .loc 1 248 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:248:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mad.wide.u32 %rd107, %r2548, 4, %rd39; + .loc 1 248 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:248:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + // begin inline asm + mov.u64 %rd106, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd106, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1963, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1963 }, [ %rd107 + 0 ], %rd106; + // end inline asm + .loc 1 249 109 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:109 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2549, %r2548, 1; + .loc 1 249 113 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:113 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.lt.s32 %p158, %r2549, %r662; + .loc 1 249 55 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:55 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd110, %rd107, 4; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + and.pred %p16, %p15, %p158; + .loc 1 249 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + // begin inline asm + mov.u64 %rd109, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd109, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1964, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b32 { %r1964 }, [ %rd110 + 0 ], %rd109; + // end inline asm + .loc 1 250 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:250:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + and.b32 %r2550, %r4757, 1; + .loc 1 251 34 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:34 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + sub.s32 %r2551, %r1964, %r1963; + .loc 1 251 48 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:48 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r2552, %r2551, 7; + .loc 1 251 63 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:63 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2553, %r2552, -64; + .loc 1 252 29 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:29 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + xor.b32 %r2554, %r2550, 1; + .loc 1 252 61 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:61 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r2555, %r2550, 6; + .loc 1 252 42 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:42 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mad.lo.s32 %r4687, %r2553, %r2554, %r2555; + .loc 1 549 21 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:549:21 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r4690, %r4687, %r4690; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2556, %r4689, 1; + setp.gt.s32 %p159, %r2556, 2; + selp.b32 %r4689, 0, %r2556, %p159; + .loc 1 342 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:342:32 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2557, %r4690, %r31; + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2558, %r2557, %r8; + add.s32 %r2559, %r2557, %r9; + add.s32 %r2560, %r2557, %r10; + add.s32 %r2561, %r2557, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r2562, %r2558, 7; + shl.b32 %r2563, %r2559, 7; + shl.b32 %r2564, %r2560, 7; + shl.b32 %r2565, %r2561, 7; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + mul.wide.s32 %rd160, %r2562, 2; + add.s64 %rd112, %rd10, %rd160; + mul.wide.s32 %rd161, %r2563, 2; + add.s64 %rd113, %rd10, %rd161; + mul.wide.s32 %rd162, %r2564, 2; + add.s64 %rd114, %rd10, %rd162; + mul.wide.s32 %rd163, %r2565, 2; + add.s64 %rd115, %rd10, %rd163; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + shl.b32 %r2566, %r4689, 14; + add.s32 %r2567, %r722, %r2566; + bar.sync 0; + add.s32 %r1965, %r2567, %r35; + selp.b32 %r1966, 16, 0, %p19; + // begin inline asm + cp.async.cg.shared.global [ %r1965 + 0 ], [ %rd112 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1967, %r1965, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r1967 + 0 ], [ %rd113 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1969, %r1965, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r1969 + 0 ], [ %rd114 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1971, %r1965, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r1971 + 0 ], [ %rd115 + 0 ], 0x10, %r1966; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s64 %rd116, %rd11, %rd160; + add.s64 %rd117, %rd11, %rd161; + add.s64 %rd118, %rd11, %rd162; + add.s64 %rd119, %rd11, %rd163; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + add.s32 %r2568, %r2539, %r2566; + add.s32 %r1973, %r2568, %r35; + // begin inline asm + cp.async.cg.shared.global [ %r1973 + 0 ], [ %rd116 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1975, %r1973, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r1975 + 0 ], [ %rd117 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1977, %r1973, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r1977 + 0 ], [ %rd118 + 0 ], 0x10, %r1966; + // end inline asm + add.s32 %r1979, %r1973, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r1979 + 0 ], [ %rd119 + 0 ], 0x10, %r1966; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + setp.ne.b32 %p160, %r52, %r273; + mov.b32 %r4757, %r273; + @%p160 bra $L__BB0_2; +$L__BB0_3: // %._crit_edge + .loc 1 0 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:0:40 + ld.param.b64 %rd30, [triton_tem_fused_0_param_10]; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:172:41 ] + // begin inline asm + // wait for regs: %r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp6: + .loc 1 181 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:181:35 + shl.b64 %rd182, %rd4, 2; + add.s64 %rd164, %rd29, %rd182; + .loc 1 182 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:182:27 + // begin inline asm + mov.u32 %r2697, 0x0; + ld.global.b32 { %r2697 }, [ %rd164 + 0 ]; + // end inline asm + .loc 1 182 41 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:182:41 + shl.b32 %r343, %r2697, 7; + .loc 1 183 51 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:183:51 + shl.b64 %rd183, %rd6, 2; + add.s64 %rd165, %rd28, %rd183; + .loc 1 183 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:183:32 + // begin inline asm + mov.u32 %r2698, 0x0; + ld.global.b32 { %r2698 }, [ %rd165 + 0 ]; + // end inline asm + .loc 1 184 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:184:49 + shl.b32 %r345, %r2698, 1; +$L__tmp7: + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.lt.s32 %p161, %r345, 1; + setp.gt.s32 %p162, %r345, 0; + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + or.b32 %r2731, %r343, %r8; + or.b32 %r2732, %r343, %r9; + or.b32 %r2733, %r343, %r10; + or.b32 %r2734, %r343, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r2735, %r2731, 7; + shl.b32 %r2736, %r2732, 7; + shl.b32 %r2737, %r2733, 7; + shl.b32 %r2738, %r2734, 7; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.wide.s32 %rd184, %r2735, 2; + add.s64 %rd185, %rd1, %rd184; + mul.wide.s32 %rd186, %r2736, 2; + add.s64 %rd187, %rd1, %rd186; + mul.wide.s32 %rd188, %r2737, 2; + add.s64 %rd189, %rd1, %rd188; + mul.wide.s32 %rd190, %r2738, 2; + add.s64 %rd191, %rd1, %rd190; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b64 %rd192, %rd3, 1; + add.s64 %rd166, %rd185, %rd192; + add.s64 %rd167, %rd187, %rd192; + add.s64 %rd168, %rd189, %rd192; + add.s64 %rd169, %rd191, %rd192; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + selp.b32 %r2700, 16, 0, %p162; + // begin inline asm + cp.async.cg.shared.global [ %r2699 + 0 ], [ %rd166 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2701 + 0 ], [ %rd167 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2703 + 0 ], [ %rd168 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2705 + 0 ], [ %rd169 + 0 ], 0x10, %r2700; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd193, %rd2, %rd184; + add.s64 %rd194, %rd2, %rd186; + add.s64 %rd195, %rd2, %rd188; + add.s64 %rd196, %rd2, %rd190; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd170, %rd193, %rd192; + add.s64 %rd171, %rd194, %rd192; + add.s64 %rd172, %rd195, %rd192; + add.s64 %rd173, %rd196, %rd192; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r671 + 0 ], [ %rd170 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r673 + 0 ], [ %rd171 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r675 + 0 ], [ %rd172 + 0 ], 0x10, %r2700; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r677 + 0 ], [ %rd173 + 0 ], 0x10, %r2700; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.gt.s32 %p163, %r345, 1; + .loc 1 342 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:342:32 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + or.b32 %r2739, %r343, 64; + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + or.b32 %r2740, %r2739, %r8; + or.b32 %r2741, %r2739, %r9; + or.b32 %r2742, %r2739, %r10; + or.b32 %r2743, %r2739, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r2744, %r2740, 7; + shl.b32 %r2745, %r2741, 7; + shl.b32 %r2746, %r2742, 7; + shl.b32 %r2747, %r2743, 7; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.wide.s32 %rd197, %r2744, 2; + add.s64 %rd198, %rd1, %rd197; + mul.wide.s32 %rd199, %r2745, 2; + add.s64 %rd200, %rd1, %rd199; + mul.wide.s32 %rd201, %r2746, 2; + add.s64 %rd202, %rd1, %rd201; + mul.wide.s32 %rd203, %r2747, 2; + add.s64 %rd204, %rd1, %rd203; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd174, %rd198, %rd192; + add.s64 %rd175, %rd200, %rd192; + add.s64 %rd176, %rd202, %rd192; + add.s64 %rd177, %rd204, %rd192; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + bar.sync 0; + selp.b32 %r2716, 16, 0, %p163; + // begin inline asm + cp.async.cg.shared.global [ %r679 + 0 ], [ %rd174 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r681 + 0 ], [ %rd175 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r683 + 0 ], [ %rd176 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r685 + 0 ], [ %rd177 + 0 ], 0x10, %r2716; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:20 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd205, %rd2, %rd197; + add.s64 %rd206, %rd2, %rd199; + add.s64 %rd207, %rd2, %rd201; + add.s64 %rd208, %rd2, %rd203; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd178, %rd205, %rd192; + add.s64 %rd179, %rd206, %rd192; + add.s64 %rd180, %rd207, %rd192; + add.s64 %rd181, %rd208, %rd192; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r687 + 0 ], [ %rd178 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r689 + 0 ], [ %rd179 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r691 + 0 ], [ %rd180 + 0 ], 0x10, %r2716; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r693 + 0 ], [ %rd181 + 0 ], 0x10, %r2716; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:351:19 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + @%p161 bra $L__BB0_6; +$L__tmp8: +// %bb.4: // %.lr.ph378 + .loc 1 184 69 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:184:69 + min.u32 %r410, %r345, 32; + add.s32 %r411, %r410, -2; + add.s32 %r412, %r410, -1; + mov.b32 %r3367, 0; + mov.b32 %r4842, 64; + mov.b32 %r4841, 1; + mov.b32 %r4840, -1; + mov.b32 %r4907, %r3367; +$L__BB0_5: // %__nv_exp2f.exit + // =>This Inner Loop Header: Depth=1 +$L__tmp9: + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.lt.s32 %p177, %r4907, %r411; + setp.lt.s32 %p175, %r4907, %r412; + add.s32 %r3982, %r4840, 1; + setp.gt.s32 %p178, %r3982, 2; + selp.b32 %r4840, 0, %r3982, %p178; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r3983, %r4840, 14; + add.s32 %r3267, %r722, %r3983; + .loc 1 351 19 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:351:19 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.idx.b32 %r3985, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r3986, %r3985, 11; + and.b32 %r3987, %r3986, 8192; + add.s32 %r3264, %r722, 98304; + add.s32 %r3988, %r3987, %r3264; + bfe.u32 %r3989, %r3988, 4, 14; + cvt.u64.u32 %rd243, %r3989; + or.b64 %rd209, %rd243, 4611686293372403712; + bfe.u32 %r3990, %r3267, 4, 14; + cvt.u64.u32 %rd244, %r3990; + or.b64 %rd210, %rd244, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd209, %rd210, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r3991, %r3988, 32; + bfe.u32 %r3992, %r3991, 4, 14; + cvt.u64.u32 %rd245, %r3992; + or.b64 %rd211, %rd245, 4611686293372403712; + add.s32 %r3993, %r3267, 32; + bfe.u32 %r3994, %r3993, 4, 14; + cvt.u64.u32 %rd246, %r3994; + or.b64 %rd212, %rd246, 4611686293338849280; + mov.pred %p164, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd211, %rd212, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r3995, %r3988, 64; + bfe.u32 %r3996, %r3995, 4, 14; + cvt.u64.u32 %rd247, %r3996; + or.b64 %rd213, %rd247, 4611686293372403712; + add.s32 %r3997, %r3267, 64; + bfe.u32 %r3998, %r3997, 4, 14; + cvt.u64.u32 %rd248, %r3998; + or.b64 %rd214, %rd248, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd213, %rd214, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r3999, %r3988, 96; + bfe.u32 %r4000, %r3999, 4, 14; + cvt.u64.u32 %rd249, %r4000; + or.b64 %rd215, %rd249, 4611686293372403712; + add.s32 %r4001, %r3267, 96; + bfe.u32 %r4002, %r4001, 4, 14; + cvt.u64.u32 %rd250, %r4002; + or.b64 %rd216, %rd250, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd215, %rd216, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4003, %r3988, 16384; + bfe.u32 %r4004, %r4003, 4, 14; + cvt.u64.u32 %rd251, %r4004; + or.b64 %rd217, %rd251, 4611686293372403712; + add.s32 %r4005, %r3267, 8192; + bfe.u32 %r4006, %r4005, 4, 14; + cvt.u64.u32 %rd252, %r4006; + or.b64 %rd218, %rd252, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd217, %rd218, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4007, %r3988, 16416; + bfe.u32 %r4008, %r4007, 4, 14; + cvt.u64.u32 %rd253, %r4008; + or.b64 %rd219, %rd253, 4611686293372403712; + add.s32 %r4009, %r3267, 8224; + bfe.u32 %r4010, %r4009, 4, 14; + cvt.u64.u32 %rd254, %r4010; + or.b64 %rd220, %rd254, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd219, %rd220, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4011, %r3988, 16448; + bfe.u32 %r4012, %r4011, 4, 14; + cvt.u64.u32 %rd255, %r4012; + or.b64 %rd221, %rd255, 4611686293372403712; + add.s32 %r4013, %r3267, 8256; + bfe.u32 %r4014, %r4013, 4, 14; + cvt.u64.u32 %rd256, %r4014; + or.b64 %rd222, %rd256, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd221, %rd222, %p164, 1, 1, 0, 0; + // end inline asm + add.s32 %r4015, %r3988, 16480; + bfe.u32 %r4016, %r4015, 4, 14; + cvt.u64.u32 %rd257, %r4016; + or.b64 %rd223, %rd257, 4611686293372403712; + add.s32 %r4017, %r3267, 8288; + bfe.u32 %r4018, %r4017, 4, 14; + cvt.u64.u32 %rd258, %r4018; + or.b64 %rd224, %rd258, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879}, %rd223, %rd224, %p164, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3265, %r3367; + mov.b32 %r3266, %r3367; + mov.b32 %r3268, %r3367; + mov.b32 %r3269, %r3367; + // begin inline asm + // wait for regs: %r2848,%r2849,%r2850,%r2851,%r2852,%r2853,%r2854,%r2855,%r2856,%r2857,%r2858,%r2859,%r2860,%r2861,%r2862,%r2863,%r2864,%r2865,%r2866,%r2867,%r2868,%r2869,%r2870,%r2871,%r2872,%r2873,%r2874,%r2875,%r2876,%r2877,%r2878,%r2879,%r3264,%r3265,%r3266,%r3267,%r3268,%r3269,%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:353:14 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.f32 %r4019, %r2848, 0f3DB504F3; + mul.f32 %r4020, %r2849, 0f3DB504F3; + mul.f32 %r4021, %r2850, 0f3DB504F3; + mul.f32 %r4022, %r2851, 0f3DB504F3; + mul.f32 %r4023, %r2852, 0f3DB504F3; + mul.f32 %r4024, %r2853, 0f3DB504F3; + mul.f32 %r4025, %r2854, 0f3DB504F3; + mul.f32 %r4026, %r2855, 0f3DB504F3; + mul.f32 %r4027, %r2856, 0f3DB504F3; + mul.f32 %r4028, %r2857, 0f3DB504F3; + mul.f32 %r4029, %r2858, 0f3DB504F3; + mul.f32 %r4030, %r2859, 0f3DB504F3; + mul.f32 %r4031, %r2860, 0f3DB504F3; + mul.f32 %r4032, %r2861, 0f3DB504F3; + mul.f32 %r4033, %r2862, 0f3DB504F3; + mul.f32 %r4034, %r2863, 0f3DB504F3; + mul.f32 %r4035, %r2864, 0f3DB504F3; + mul.f32 %r4036, %r2865, 0f3DB504F3; + mul.f32 %r4037, %r2866, 0f3DB504F3; + mul.f32 %r4038, %r2867, 0f3DB504F3; + mul.f32 %r4039, %r2868, 0f3DB504F3; + mul.f32 %r4040, %r2869, 0f3DB504F3; + mul.f32 %r4041, %r2870, 0f3DB504F3; + mul.f32 %r4042, %r2871, 0f3DB504F3; + mul.f32 %r4043, %r2872, 0f3DB504F3; + mul.f32 %r4044, %r2873, 0f3DB504F3; + mul.f32 %r4045, %r2874, 0f3DB504F3; + mul.f32 %r4046, %r2875, 0f3DB504F3; + mul.f32 %r4047, %r2876, 0f3DB504F3; + mul.f32 %r4048, %r2877, 0f3DB504F3; + mul.f32 %r4049, %r2878, 0f3DB504F3; + mul.f32 %r4050, %r2879, 0f3DB504F3; + .loc 1 417 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:417:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.f32 %r4051, %r4019, 0f3FB8AA3B; + mul.f32 %r4052, %r4020, 0f3FB8AA3B; + mul.f32 %r4053, %r4021, 0f3FB8AA3B; + mul.f32 %r4054, %r4022, 0f3FB8AA3B; + mul.f32 %r4055, %r4023, 0f3FB8AA3B; + mul.f32 %r4056, %r4024, 0f3FB8AA3B; + mul.f32 %r4057, %r4025, 0f3FB8AA3B; + mul.f32 %r4058, %r4026, 0f3FB8AA3B; + mul.f32 %r4059, %r4027, 0f3FB8AA3B; + mul.f32 %r4060, %r4028, 0f3FB8AA3B; + mul.f32 %r4061, %r4029, 0f3FB8AA3B; + mul.f32 %r4062, %r4030, 0f3FB8AA3B; + mul.f32 %r4063, %r4031, 0f3FB8AA3B; + mul.f32 %r4064, %r4032, 0f3FB8AA3B; + mul.f32 %r4065, %r4033, 0f3FB8AA3B; + mul.f32 %r4066, %r4034, 0f3FB8AA3B; + mul.f32 %r4067, %r4035, 0f3FB8AA3B; + mul.f32 %r4068, %r4036, 0f3FB8AA3B; + mul.f32 %r4069, %r4037, 0f3FB8AA3B; + mul.f32 %r4070, %r4038, 0f3FB8AA3B; + mul.f32 %r4071, %r4039, 0f3FB8AA3B; + mul.f32 %r4072, %r4040, 0f3FB8AA3B; + mul.f32 %r4073, %r4041, 0f3FB8AA3B; + mul.f32 %r4074, %r4042, 0f3FB8AA3B; + mul.f32 %r4075, %r4043, 0f3FB8AA3B; + mul.f32 %r4076, %r4044, 0f3FB8AA3B; + mul.f32 %r4077, %r4045, 0f3FB8AA3B; + mul.f32 %r4078, %r4046, 0f3FB8AA3B; + mul.f32 %r4079, %r4047, 0f3FB8AA3B; + mul.f32 %r4080, %r4048, 0f3FB8AA3B; + mul.f32 %r4081, %r4049, 0f3FB8AA3B; + mul.f32 %r4082, %r4050, 0f3FB8AA3B; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + max.f32 %r4083, %r4051, %r4052; + max.f32 %r4084, %r4053, %r4054; + max.f32 %r4085, %r4083, %r4055; + max.f32 %r4086, %r4085, %r4056; + max.f32 %r4087, %r4084, %r4057; + max.f32 %r4088, %r4087, %r4058; + max.f32 %r4089, %r4086, %r4059; + max.f32 %r4090, %r4089, %r4060; + max.f32 %r4091, %r4088, %r4061; + max.f32 %r4092, %r4091, %r4062; + max.f32 %r4093, %r4090, %r4063; + max.f32 %r4094, %r4093, %r4064; + max.f32 %r4095, %r4092, %r4065; + max.f32 %r4096, %r4095, %r4066; + max.f32 %r4097, %r4094, %r4067; + max.f32 %r4098, %r4097, %r4068; + max.f32 %r4099, %r4096, %r4069; + max.f32 %r4100, %r4099, %r4070; + max.f32 %r4101, %r4098, %r4071; + max.f32 %r4102, %r4101, %r4072; + max.f32 %r4103, %r4100, %r4073; + max.f32 %r4104, %r4103, %r4074; + max.f32 %r4105, %r4102, %r4075; + max.f32 %r4106, %r4105, %r4076; + max.f32 %r4107, %r4104, %r4077; + max.f32 %r4108, %r4107, %r4078; + max.f32 %r4109, %r4106, %r4079; + max.f32 %r4110, %r4109, %r4080; + max.f32 %r4111, %r4108, %r4081; + max.f32 %r4112, %r4111, %r4082; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4113, %r4110, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + max.f32 %r4114, %r4110, %r4113; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4115, %r4114, 1, 31, -1; + shfl.sync.bfly.b32 %r4116, %r4112, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + max.f32 %r4117, %r4112, %r4116; + .loc 2 189 40 // standard.py:189:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4118, %r4117, 1, 31, -1; + cvt.u64.u32 %rd259, %r4115; + cvt.u64.u32 %rd260, %r4118; + shl.b64 %rd261, %rd260, 32; + or.b64 %rd262, %rd259, %rd261; + .loc 2 168 27 // standard.py:168:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mov.b64 {%r4119, %r4120}, %rd262; + max.f32 %r4121, %r4114, %r4119; + max.f32 %r4122, %r4117, %r4120; + .loc 1 421 27 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:421:27 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mov.b64 {%r4123, %r4124}, %rd284; + max.f32 %r4125, %r4124, %r4122; + max.f32 %r4126, %r4123, %r4121; + mov.b64 %rd284, {%r4126, %r4125}; + .loc 1 423 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:423:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.eq.f32 %p179, %r4126, 0fFF800000; + setp.eq.f32 %p180, %r4125, 0fFF800000; + .loc 1 424 51 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:424:51 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + selp.f32 %r4127, 0f00000000, %r4126, %p179; + selp.f32 %r4128, 0f00000000, %r4125, %p180; + .loc 1 428 31 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:428:31 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + sub.f32 %r4129, %r4123, %r4127; + sub.f32 %r4130, %r4124, %r4128; + .loc 1 428 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:428:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + ex2.approx.ftz.f32 %r4131, %r4129; + ex2.approx.ftz.f32 %r4132, %r4130; + .loc 1 429 39 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:429:39 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + neg.f32 %r4133, %r4127; + fma.rn.f32 %r4134, %r4019, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4135, %r4020, 0f3FB8AA3B, %r4133; + neg.f32 %r4136, %r4128; + fma.rn.f32 %r4137, %r4021, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4138, %r4022, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4139, %r4023, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4140, %r4024, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4141, %r4025, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4142, %r4026, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4143, %r4027, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4144, %r4028, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4145, %r4029, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4146, %r4030, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4147, %r4031, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4148, %r4032, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4149, %r4033, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4150, %r4034, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4151, %r4035, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4152, %r4036, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4153, %r4037, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4154, %r4038, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4155, %r4039, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4156, %r4040, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4157, %r4041, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4158, %r4042, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4159, %r4043, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4160, %r4044, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4161, %r4045, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4162, %r4046, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4163, %r4047, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4164, %r4048, 0f3FB8AA3B, %r4133; + fma.rn.f32 %r4165, %r4049, 0f3FB8AA3B, %r4136; + fma.rn.f32 %r4166, %r4050, 0f3FB8AA3B, %r4136; + .loc 1 429 21 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:429:21 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + ex2.approx.ftz.f32 %r4167, %r4134; + ex2.approx.ftz.f32 %r4168, %r4135; + ex2.approx.ftz.f32 %r4169, %r4137; + ex2.approx.ftz.f32 %r4170, %r4138; + ex2.approx.ftz.f32 %r4171, %r4139; + ex2.approx.ftz.f32 %r4172, %r4140; + ex2.approx.ftz.f32 %r4173, %r4141; + ex2.approx.ftz.f32 %r4174, %r4142; + ex2.approx.ftz.f32 %r4175, %r4143; + ex2.approx.ftz.f32 %r4176, %r4144; + ex2.approx.ftz.f32 %r4177, %r4145; + ex2.approx.ftz.f32 %r4178, %r4146; + ex2.approx.ftz.f32 %r4179, %r4147; + ex2.approx.ftz.f32 %r4180, %r4148; + ex2.approx.ftz.f32 %r4181, %r4149; + ex2.approx.ftz.f32 %r4182, %r4150; + ex2.approx.ftz.f32 %r4183, %r4151; + ex2.approx.ftz.f32 %r4184, %r4152; + ex2.approx.ftz.f32 %r4185, %r4153; + ex2.approx.ftz.f32 %r4186, %r4154; + ex2.approx.ftz.f32 %r4187, %r4155; + ex2.approx.ftz.f32 %r4188, %r4156; + ex2.approx.ftz.f32 %r4189, %r4157; + ex2.approx.ftz.f32 %r4190, %r4158; + ex2.approx.ftz.f32 %r4191, %r4159; + ex2.approx.ftz.f32 %r4192, %r4160; + ex2.approx.ftz.f32 %r4193, %r4161; + ex2.approx.ftz.f32 %r4194, %r4162; + ex2.approx.ftz.f32 %r4195, %r4163; + ex2.approx.ftz.f32 %r4196, %r4164; + ex2.approx.ftz.f32 %r4197, %r4165; + ex2.approx.ftz.f32 %r4198, %r4166; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.f32 %r4199, %r4167, %r4168; + add.f32 %r4200, %r4169, %r4170; + add.f32 %r4201, %r4199, %r4171; + add.f32 %r4202, %r4201, %r4172; + add.f32 %r4203, %r4200, %r4173; + add.f32 %r4204, %r4203, %r4174; + add.f32 %r4205, %r4202, %r4175; + add.f32 %r4206, %r4205, %r4176; + add.f32 %r4207, %r4204, %r4177; + add.f32 %r4208, %r4207, %r4178; + add.f32 %r4209, %r4206, %r4179; + add.f32 %r4210, %r4209, %r4180; + add.f32 %r4211, %r4208, %r4181; + add.f32 %r4212, %r4211, %r4182; + add.f32 %r4213, %r4210, %r4183; + add.f32 %r4214, %r4213, %r4184; + add.f32 %r4215, %r4212, %r4185; + add.f32 %r4216, %r4215, %r4186; + add.f32 %r4217, %r4214, %r4187; + add.f32 %r4218, %r4217, %r4188; + add.f32 %r4219, %r4216, %r4189; + add.f32 %r4220, %r4219, %r4190; + add.f32 %r4221, %r4218, %r4191; + add.f32 %r4222, %r4221, %r4192; + add.f32 %r4223, %r4220, %r4193; + add.f32 %r4224, %r4223, %r4194; + add.f32 %r4225, %r4222, %r4195; + add.f32 %r4226, %r4225, %r4196; + add.f32 %r4227, %r4224, %r4197; + add.f32 %r4228, %r4227, %r4198; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4229, %r4226, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.f32 %r4230, %r4226, %r4229; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4231, %r4230, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.f32 %r4232, %r4230, %r4231; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4233, %r4228, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.f32 %r4234, %r4228, %r4233; + .loc 2 291 36 // standard.py:291:36 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shfl.sync.bfly.b32 %r4235, %r4234, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.f32 %r4236, %r4234, %r4235; + .loc 1 434 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:434:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + fma.rn.f32 %r4974, %r4974, %r4131, %r4232; + fma.rn.f32 %r4975, %r4975, %r4132, %r4236; + .loc 1 436 16 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:436:16 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.f32 %r1567, %r1567, %r4131; + mul.f32 %r1568, %r1568, %r4131; + mul.f32 %r1569, %r1569, %r4132; + mul.f32 %r1570, %r1570, %r4132; + mul.f32 %r1571, %r1571, %r4131; + mul.f32 %r1572, %r1572, %r4131; + mul.f32 %r1573, %r1573, %r4132; + mul.f32 %r1574, %r1574, %r4132; + mul.f32 %r1575, %r1575, %r4131; + mul.f32 %r1576, %r1576, %r4131; + mul.f32 %r1577, %r1577, %r4132; + mul.f32 %r1578, %r1578, %r4132; + mul.f32 %r1579, %r1579, %r4131; + mul.f32 %r1580, %r1580, %r4131; + mul.f32 %r1581, %r1581, %r4132; + mul.f32 %r1582, %r1582, %r4132; + mul.f32 %r1583, %r1583, %r4131; + mul.f32 %r1584, %r1584, %r4131; + mul.f32 %r1585, %r1585, %r4132; + mul.f32 %r1586, %r1586, %r4132; + mul.f32 %r1587, %r1587, %r4131; + mul.f32 %r1588, %r1588, %r4131; + mul.f32 %r1589, %r1589, %r4132; + mul.f32 %r1590, %r1590, %r4132; + mul.f32 %r1591, %r1591, %r4131; + mul.f32 %r1592, %r1592, %r4131; + mul.f32 %r1593, %r1593, %r4132; + mul.f32 %r1594, %r1594, %r4132; + mul.f32 %r1595, %r1595, %r4131; + mul.f32 %r1596, %r1596, %r4131; + mul.f32 %r1597, %r1597, %r4132; + mul.f32 %r1598, %r1598, %r4132; + mul.f32 %r1599, %r1599, %r4131; + mul.f32 %r1600, %r1600, %r4131; + mul.f32 %r1601, %r1601, %r4132; + mul.f32 %r1602, %r1602, %r4132; + mul.f32 %r1603, %r1603, %r4131; + mul.f32 %r1604, %r1604, %r4131; + mul.f32 %r1605, %r1605, %r4132; + mul.f32 %r1606, %r1606, %r4132; + mul.f32 %r1607, %r1607, %r4131; + mul.f32 %r1608, %r1608, %r4131; + mul.f32 %r1609, %r1609, %r4132; + mul.f32 %r1610, %r1610, %r4132; + mul.f32 %r1611, %r1611, %r4131; + mul.f32 %r1612, %r1612, %r4131; + mul.f32 %r1613, %r1613, %r4132; + mul.f32 %r1614, %r1614, %r4132; + mul.f32 %r1615, %r1615, %r4131; + mul.f32 %r1616, %r1616, %r4131; + mul.f32 %r1617, %r1617, %r4132; + mul.f32 %r1618, %r1618, %r4132; + mul.f32 %r1619, %r1619, %r4131; + mul.f32 %r1620, %r1620, %r4131; + mul.f32 %r1621, %r1621, %r4132; + mul.f32 %r1622, %r1622, %r4132; + mul.f32 %r1623, %r1623, %r4131; + mul.f32 %r1624, %r1624, %r4131; + mul.f32 %r1625, %r1625, %r4132; + mul.f32 %r1626, %r1626, %r4132; + mul.f32 %r1627, %r1627, %r4131; + mul.f32 %r1628, %r1628, %r4131; + mul.f32 %r1629, %r1629, %r4132; + mul.f32 %r1630, %r1630, %r4132; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4237, %r722, 49152; + add.s32 %r4238, %r4237, %r3983; + .loc 1 440 22 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:440:22 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + cvt.rn.bf16x2.f32 %r3564, %r4168, %r4167; + cvt.rn.bf16x2.f32 %r3565, %r4170, %r4169; + cvt.rn.bf16x2.f32 %r3566, %r4172, %r4171; + cvt.rn.bf16x2.f32 %r3567, %r4174, %r4173; + cvt.rn.bf16x2.f32 %r3696, %r4176, %r4175; + cvt.rn.bf16x2.f32 %r3697, %r4178, %r4177; + cvt.rn.bf16x2.f32 %r3698, %r4180, %r4179; + cvt.rn.bf16x2.f32 %r3699, %r4182, %r4181; + cvt.rn.bf16x2.f32 %r3828, %r4184, %r4183; + cvt.rn.bf16x2.f32 %r3829, %r4186, %r4185; + cvt.rn.bf16x2.f32 %r3830, %r4188, %r4187; + cvt.rn.bf16x2.f32 %r3831, %r4190, %r4189; + cvt.rn.bf16x2.f32 %r3960, %r4192, %r4191; + cvt.rn.bf16x2.f32 %r3961, %r4194, %r4193; + cvt.rn.bf16x2.f32 %r3962, %r4196, %r4195; + cvt.rn.bf16x2.f32 %r3963, %r4198, %r4197; + .loc 1 440 44 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:440:44 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4239, %r4238, 4, 14; + cvt.u64.u32 %rd263, %r4239; + or.b64 %rd225, %rd263, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3564,%r3565,%r3566,%r3567}, %rd225, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4240, %r4238, 2048; + bfe.u32 %r4241, %r4240, 4, 14; + cvt.u64.u32 %rd264, %r4241; + or.b64 %rd226, %rd264, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3696,%r3697,%r3698,%r3699}, %rd226, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4242, %r4238, 4096; + bfe.u32 %r4243, %r4242, 4, 14; + cvt.u64.u32 %rd265, %r4243; + or.b64 %rd227, %rd265, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3828,%r3829,%r3830,%r3831}, %rd227, %p164, 1, 1, 1; + // end inline asm + add.s32 %r4244, %r4238, 6144; + bfe.u32 %r4245, %r4244, 4, 14; + cvt.u64.u32 %rd266, %r4245; + or.b64 %rd228, %rd266, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630}, {%r3960,%r3961,%r3962,%r3963}, %rd228, %p164, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r550, %r4907, 1; + .loc 1 247 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:247:33 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shr.u32 %r4246, %r550, 1; + .loc 1 248 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:248:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mad.wide.u32 %rd230, %r4246, 4, %rd164; + .loc 1 248 24 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:248:24 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + mov.u64 %rd229, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd229, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3964, 0x0; + @%p175 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3964 }, [ %rd230 + 0 ], %rd229; + // end inline asm + .loc 1 249 109 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:109 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4247, %r4246, 1; + .loc 1 249 113 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:113 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.lt.s32 %p181, %r4247, %r2698; + .loc 1 249 55 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:55 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd233, %rd230, 4; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + and.pred %p176, %p175, %p181; + .loc 1 249 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:249:25 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + mov.u64 %rd232, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd232, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3965, 0x0; + @%p176 ld.global.L1::evict_last.L2::cache_hint.b32 { %r3965 }, [ %rd233 + 0 ], %rd232; + // end inline asm + .loc 1 250 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:250:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + and.b32 %r4248, %r4907, 1; + .loc 1 251 34 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:34 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + sub.s32 %r4249, %r3965, %r3964; + .loc 1 251 48 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:48 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r4250, %r4249, 7; + .loc 1 251 63 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:251:63 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4251, %r4250, -64; + .loc 1 252 29 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:29 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + xor.b32 %r4252, %r4248, 1; + .loc 1 252 61 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:61 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r4253, %r4248, 6; + .loc 1 252 42 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:252:42 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4254, %r4842, %r4253; + .loc 1 549 21 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:549:21 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mad.lo.s32 %r4842, %r4251, %r4252, %r4254; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4255, %r4841, 1; + setp.gt.s32 %p182, %r4255, 2; + selp.b32 %r4841, 0, %r4255, %p182; + .loc 1 342 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:342:32 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4256, %r4842, %r343; + .loc 1 346 35 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:346:35 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4257, %r4256, %r8; + add.s32 %r4258, %r4256, %r9; + add.s32 %r4259, %r4256, %r10; + add.s32 %r4260, %r4256, %r11; + .loc 1 284 38 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:38 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r4261, %r4257, 7; + shl.b32 %r4262, %r4258, 7; + shl.b32 %r4263, %r4259, 7; + shl.b32 %r4264, %r4260, 7; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + mul.wide.s32 %rd267, %r4261, 2; + add.s64 %rd235, %rd10, %rd267; + mul.wide.s32 %rd268, %r4262, 2; + add.s64 %rd236, %rd10, %rd268; + mul.wide.s32 %rd269, %r4263, 2; + add.s64 %rd237, %rd10, %rd269; + mul.wide.s32 %rd270, %r4264, 2; + add.s64 %rd238, %rd10, %rd270; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + shl.b32 %r4265, %r4841, 14; + add.s32 %r4266, %r722, %r4265; + bar.sync 0; + add.s32 %r3966, %r4266, %r35; + selp.b32 %r3967, 16, 0, %p177; + // begin inline asm + cp.async.cg.shared.global [ %r3966 + 0 ], [ %rd235 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3968, %r3966, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r3968 + 0 ], [ %rd236 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3970, %r3966, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r3970 + 0 ], [ %rd237 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3972, %r3966, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r3972 + 0 ], [ %rd238 + 0 ], 0x10, %r3967; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:284:49 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s64 %rd239, %rd11, %rd267; + add.s64 %rd240, %rd11, %rd268; + add.s64 %rd241, %rd11, %rd269; + add.s64 %rd242, %rd11, %rd270; + .loc 1 294 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:294:23 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + add.s32 %r4267, %r4237, %r4265; + add.s32 %r3974, %r4267, %r35; + // begin inline asm + cp.async.cg.shared.global [ %r3974 + 0 ], [ %rd239 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3976, %r3974, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r3976 + 0 ], [ %rd240 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3978, %r3974, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r3978 + 0 ], [ %rd241 + 0 ], 0x10, %r3967; + // end inline asm + add.s32 %r3980, %r3974, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r3980 + 0 ], [ %rd242 + 0 ], 0x10, %r3967; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + setp.ne.b32 %p183, %r410, %r550; + mov.b32 %r4907, %r550; + @%p183 bra $L__BB0_5; +$L__BB0_6: // %._crit_edge379 + .loc 1 0 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:0:40 + cvt.u32.u64 %r4468, %rd3; + .loc 1 502 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:502:40 @[ c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:198:45 ] + // begin inline asm + // wait for regs: %r1567,%r1568,%r1569,%r1570,%r1571,%r1572,%r1573,%r1574,%r1575,%r1576,%r1577,%r1578,%r1579,%r1580,%r1581,%r1582,%r1583,%r1584,%r1585,%r1586,%r1587,%r1588,%r1589,%r1590,%r1591,%r1592,%r1593,%r1594,%r1595,%r1596,%r1597,%r1598,%r1599,%r1600,%r1601,%r1602,%r1603,%r1604,%r1605,%r1606,%r1607,%r1608,%r1609,%r1610,%r1611,%r1612,%r1613,%r1614,%r1615,%r1616,%r1617,%r1618,%r1619,%r1620,%r1621,%r1622,%r1623,%r1624,%r1625,%r1626,%r1627,%r1628,%r1629,%r1630 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp10: + .loc 1 206 26 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:206:26 + setp.eq.f32 %p192, %r4974, 0f00000000; + setp.eq.f32 %p193, %r4975, 0f00000000; + .loc 1 206 34 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:206:34 + selp.f32 %r4469, 0f3F800000, %r4974, %p192; + selp.f32 %r619, 0f3F800000, %r4975, %p193; + .loc 1 208 16 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:208:16 + div.full.f32 %r4470, %r1567, %r4469; + div.full.f32 %r4471, %r1568, %r4469; + div.full.f32 %r4472, %r1569, %r619; + div.full.f32 %r4473, %r1570, %r619; + div.full.f32 %r4474, %r1571, %r4469; + div.full.f32 %r4475, %r1572, %r4469; + div.full.f32 %r4476, %r1573, %r619; + div.full.f32 %r4477, %r1574, %r619; + div.full.f32 %r4478, %r1575, %r4469; + div.full.f32 %r4479, %r1576, %r4469; + div.full.f32 %r4480, %r1577, %r619; + div.full.f32 %r4481, %r1578, %r619; + div.full.f32 %r4482, %r1579, %r4469; + div.full.f32 %r4483, %r1580, %r4469; + div.full.f32 %r4484, %r1581, %r619; + div.full.f32 %r4485, %r1582, %r619; + div.full.f32 %r4486, %r1583, %r4469; + div.full.f32 %r4487, %r1584, %r4469; + div.full.f32 %r4488, %r1585, %r619; + div.full.f32 %r4489, %r1586, %r619; + div.full.f32 %r4490, %r1587, %r4469; + div.full.f32 %r4491, %r1588, %r4469; + div.full.f32 %r4492, %r1589, %r619; + div.full.f32 %r4493, %r1590, %r619; + div.full.f32 %r4494, %r1591, %r4469; + div.full.f32 %r4495, %r1592, %r4469; + div.full.f32 %r4496, %r1593, %r619; + div.full.f32 %r4497, %r1594, %r619; + div.full.f32 %r4498, %r1595, %r4469; + div.full.f32 %r4499, %r1596, %r4469; + div.full.f32 %r4500, %r1597, %r619; + div.full.f32 %r4501, %r1598, %r619; + div.full.f32 %r4502, %r1599, %r4469; + div.full.f32 %r4503, %r1600, %r4469; + div.full.f32 %r4504, %r1601, %r619; + div.full.f32 %r4505, %r1602, %r619; + div.full.f32 %r4506, %r1603, %r4469; + div.full.f32 %r4507, %r1604, %r4469; + div.full.f32 %r4508, %r1605, %r619; + div.full.f32 %r4509, %r1606, %r619; + div.full.f32 %r4510, %r1607, %r4469; + div.full.f32 %r4511, %r1608, %r4469; + div.full.f32 %r4512, %r1609, %r619; + div.full.f32 %r4513, %r1610, %r619; + div.full.f32 %r4514, %r1611, %r4469; + div.full.f32 %r4515, %r1612, %r4469; + div.full.f32 %r4516, %r1613, %r619; + div.full.f32 %r4517, %r1614, %r619; + div.full.f32 %r4518, %r1615, %r4469; + div.full.f32 %r4519, %r1616, %r4469; + div.full.f32 %r4520, %r1617, %r619; + div.full.f32 %r4521, %r1618, %r619; + div.full.f32 %r4522, %r1619, %r4469; + div.full.f32 %r4523, %r1620, %r4469; + div.full.f32 %r4524, %r1621, %r619; + div.full.f32 %r4525, %r1622, %r619; + div.full.f32 %r4526, %r1623, %r4469; + div.full.f32 %r4527, %r1624, %r4469; + div.full.f32 %r4528, %r1625, %r619; + div.full.f32 %r4529, %r1626, %r619; + div.full.f32 %r4530, %r1627, %r4469; + div.full.f32 %r4531, %r1628, %r4469; + div.full.f32 %r4532, %r1629, %r619; + div.full.f32 %r4533, %r1630, %r619; + .loc 1 214 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:214:20 + setp.lt.s32 %p184, %r12, 2048; + setp.lt.s32 %p185, %r13, 2048; + setp.lt.s32 %p186, %r14, 2048; + setp.lt.s32 %p187, %r15, 2048; + setp.lt.s32 %p188, %r16, 2048; + setp.lt.s32 %p189, %r17, 2048; + setp.lt.s32 %p190, %r18, 2048; + setp.lt.s32 %p191, %r19, 2048; + .loc 1 218 49 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:218:49 + or.b32 %r4534, %r4468, %r4; + .loc 1 218 62 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:218:62 + or.b32 %r4535, %r4534, %r3; + .loc 1 218 75 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:218:75 + add.s32 %r4536, %r4535, %r20; + add.s32 %r4537, %r4535, %r21; + add.s32 %r4538, %r4535, %r22; + add.s32 %r4539, %r4535, %r23; + add.s32 %r4540, %r4535, %r24; + add.s32 %r4541, %r4535, %r25; + add.s32 %r4542, %r4535, %r26; + add.s32 %r4543, %r4535, %r27; + .loc 1 218 25 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:218:25 + mad.wide.s32 %rd271, %r4536, 2, %rd30; + mad.wide.s32 %rd272, %r4537, 2, %rd30; + mad.wide.s32 %rd273, %r4538, 2, %rd30; + mad.wide.s32 %rd274, %r4539, 2, %rd30; + mad.wide.s32 %rd275, %r4540, 2, %rd30; + mad.wide.s32 %rd276, %r4541, 2, %rd30; + mad.wide.s32 %rd277, %r4542, 2, %rd30; + mad.wide.s32 %rd278, %r4543, 2, %rd30; + .loc 1 218 109 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:218:109 + cvt.rn.bf16x2.f32 %r4544, %r4471, %r4470; + cvt.rn.bf16x2.f32 %r4545, %r4473, %r4472; + cvt.rn.bf16x2.f32 %r4546, %r4475, %r4474; + cvt.rn.bf16x2.f32 %r4547, %r4477, %r4476; + cvt.rn.bf16x2.f32 %r4548, %r4479, %r4478; + cvt.rn.bf16x2.f32 %r4549, %r4481, %r4480; + cvt.rn.bf16x2.f32 %r4550, %r4483, %r4482; + cvt.rn.bf16x2.f32 %r4551, %r4485, %r4484; + cvt.rn.bf16x2.f32 %r4552, %r4487, %r4486; + cvt.rn.bf16x2.f32 %r4553, %r4489, %r4488; + cvt.rn.bf16x2.f32 %r4554, %r4491, %r4490; + cvt.rn.bf16x2.f32 %r4555, %r4493, %r4492; + cvt.rn.bf16x2.f32 %r4556, %r4495, %r4494; + cvt.rn.bf16x2.f32 %r4557, %r4497, %r4496; + cvt.rn.bf16x2.f32 %r4558, %r4499, %r4498; + cvt.rn.bf16x2.f32 %r4559, %r4501, %r4500; + cvt.rn.bf16x2.f32 %r4560, %r4503, %r4502; + cvt.rn.bf16x2.f32 %r4561, %r4505, %r4504; + cvt.rn.bf16x2.f32 %r4562, %r4507, %r4506; + cvt.rn.bf16x2.f32 %r4563, %r4509, %r4508; + cvt.rn.bf16x2.f32 %r4564, %r4511, %r4510; + cvt.rn.bf16x2.f32 %r4565, %r4513, %r4512; + cvt.rn.bf16x2.f32 %r4566, %r4515, %r4514; + cvt.rn.bf16x2.f32 %r4567, %r4517, %r4516; + cvt.rn.bf16x2.f32 %r4568, %r4519, %r4518; + cvt.rn.bf16x2.f32 %r4569, %r4521, %r4520; + cvt.rn.bf16x2.f32 %r4570, %r4523, %r4522; + cvt.rn.bf16x2.f32 %r4571, %r4525, %r4524; + cvt.rn.bf16x2.f32 %r4572, %r4527, %r4526; + cvt.rn.bf16x2.f32 %r4573, %r4529, %r4528; + cvt.rn.bf16x2.f32 %r4574, %r4531, %r4530; + cvt.rn.bf16x2.f32 %r4575, %r4533, %r4532; + shl.b32 %r4576, %r34, 13; + shl.b32 %r4577, %r6, 5; + and.b32 %r4578, %r4577, 7264; + and.b32 %r4579, %r6, 24; + shl.b32 %r4580, %r4579, 4; + shl.b32 %r4581, %r6, 2; + and.b32 %r4582, %r4581, 16; + or.b32 %r4583, %r4576, %r4582; + or.b32 %r4584, %r4578, %r4580; + or.b32 %r4585, %r4583, %r4584; + add.s32 %r4587, %r722, %r4585; + st.shared.v4.b32 [%r4587], {%r4544, %r4546, %r4548, %r4550}; + st.shared.v4.b32 [%r4587+512], {%r4545, %r4547, %r4549, %r4551}; + xor.b32 %r4588, %r4585, 32; + add.s32 %r4589, %r722, %r4588; + st.shared.v4.b32 [%r4589], {%r4552, %r4554, %r4556, %r4558}; + st.shared.v4.b32 [%r4589+512], {%r4553, %r4555, %r4557, %r4559}; + xor.b32 %r4590, %r4585, 64; + add.s32 %r4591, %r722, %r4590; + st.shared.v4.b32 [%r4591], {%r4560, %r4562, %r4564, %r4566}; + st.shared.v4.b32 [%r4591+512], {%r4561, %r4563, %r4565, %r4567}; + xor.b32 %r4592, %r4585, 96; + add.s32 %r4593, %r722, %r4592; + st.shared.v4.b32 [%r4593], {%r4568, %r4570, %r4572, %r4574}; + st.shared.v4.b32 [%r4593+512], {%r4569, %r4571, %r4573, %r4575}; + bar.sync 0; + shl.b32 %r4594, %r4579, 10; + shl.b32 %r4595, %r34, 5; + and.b32 %r620, %r6, 252; + shl.b32 %r4596, %r620, 2; + or.b32 %r4597, %r4594, %r4595; + xor.b32 %r4598, %r4597, %r4596; + add.s32 %r4400, %r722, %r4598; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4396, %r4397, %r4398, %r4399}, [%r4400]; + // end inline asm + add.s32 %r4405, %r4400, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4401, %r4402, %r4403, %r4404}, [%r4405]; + // end inline asm + add.s32 %r4410, %r4400, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4406, %r4407, %r4408, %r4409}, [%r4410]; + // end inline asm + add.s32 %r4415, %r4400, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4411, %r4412, %r4413, %r4414}, [%r4415]; + // end inline asm + add.s32 %r4420, %r4400, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4416, %r4417, %r4418, %r4419}, [%r4420]; + // end inline asm + add.s32 %r4425, %r4400, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4421, %r4422, %r4423, %r4424}, [%r4425]; + // end inline asm + add.s32 %r4430, %r4400, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4426, %r4427, %r4428, %r4429}, [%r4430]; + // end inline asm + add.s32 %r4435, %r4400, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4431, %r4432, %r4433, %r4434}, [%r4435]; + // end inline asm + // begin inline asm + @%p184 st.global.v4.b32 [ %rd271 + 0 ], { %r4396, %r4397, %r4398, %r4399 }; + // end inline asm + // begin inline asm + @%p185 st.global.v4.b32 [ %rd272 + 0 ], { %r4401, %r4402, %r4403, %r4404 }; + // end inline asm + // begin inline asm + @%p186 st.global.v4.b32 [ %rd273 + 0 ], { %r4406, %r4407, %r4408, %r4409 }; + // end inline asm + // begin inline asm + @%p187 st.global.v4.b32 [ %rd274 + 0 ], { %r4411, %r4412, %r4413, %r4414 }; + // end inline asm + // begin inline asm + @%p188 st.global.v4.b32 [ %rd275 + 0 ], { %r4416, %r4417, %r4418, %r4419 }; + // end inline asm + // begin inline asm + @%p189 st.global.v4.b32 [ %rd276 + 0 ], { %r4421, %r4422, %r4423, %r4424 }; + // end inline asm + // begin inline asm + @%p190 st.global.v4.b32 [ %rd277 + 0 ], { %r4426, %r4427, %r4428, %r4429 }; + // end inline asm + // begin inline asm + @%p191 st.global.v4.b32 [ %rd278 + 0 ], { %r4431, %r4432, %r4433, %r4434 }; + // end inline asm + .loc 1 223 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:223:33 + setp.lt.f32 %p194, %r4469, 0f00800000; + mul.f32 %r4599, %r4469, 0f4B000000; + selp.f32 %r621, %r4599, %r4469, %p194; + selp.f32 %r4600, 0fC1B80000, 0f00000000, %p194; + add.s32 %r4601, %r621, -1060439283; + and.b32 %r4602, %r4601, -8388608; + sub.s32 %r4603, %r621, %r4602; + cvt.rn.f32.s32 %r4604, %r4602; + mov.b32 %r4605, 0f34000000; + fma.rn.ftz.f32 %r4606, %r4604, %r4605, %r4600; + add.f32 %r4607, %r4603, 0fBF800000; + mov.b32 %r4608, 0fBE2C7F30; + mov.b32 %r4609, 0f3DC6B27F; + fma.rn.ftz.f32 %r4610, %r4609, %r4607, %r4608; + mov.b32 %r4611, 0f3E2FCF2A; + fma.rn.ftz.f32 %r4612, %r4610, %r4607, %r4611; + mov.b32 %r4613, 0fBE374E43; + fma.rn.ftz.f32 %r4614, %r4612, %r4607, %r4613; + mov.b32 %r4615, 0f3E520BF4; + fma.rn.ftz.f32 %r4616, %r4614, %r4607, %r4615; + mov.b32 %r4617, 0fBE763C8B; + fma.rn.ftz.f32 %r4618, %r4616, %r4607, %r4617; + mov.b32 %r4619, 0f3E93BF99; + fma.rn.ftz.f32 %r4620, %r4618, %r4607, %r4619; + mov.b32 %r4621, 0fBEB8AA49; + fma.rn.ftz.f32 %r4622, %r4620, %r4607, %r4621; + mov.b32 %r4623, 0f3EF6384A; + fma.rn.ftz.f32 %r4624, %r4622, %r4607, %r4623; + mov.b32 %r4625, 0fBF38AA3B; + fma.rn.ftz.f32 %r4626, %r4624, %r4607, %r4625; + mul.f32 %r4627, %r4607, %r4626; + mul.f32 %r4628, %r4607, %r4627; + mov.b32 %r4629, 0f3FB8AA3B; + fma.rn.ftz.f32 %r4630, %r4607, %r4629, %r4628; + add.f32 %r4976, %r4606, %r4630; + setp.lt.u32 %p195, %r621, 2139095040; + mov.b32 %r4631, 0f7F800000; + @%p195 bra $L__BB0_8; +// %bb.7: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:0:33 + fma.rn.ftz.f32 %r4976, %r621, %r4631, %r4631; +$L__BB0_8: // %__nv_log2f.exit + ld.param.b64 %rd27, [triton_tem_fused_0_param_3]; + .loc 1 223 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:223:33 + setp.lt.f32 %p196, %r619, 0f00800000; + mul.f32 %r4632, %r619, 0f4B000000; + selp.f32 %r625, %r4632, %r619, %p196; + selp.f32 %r4633, 0fC1B80000, 0f00000000, %p196; + add.s32 %r4634, %r625, -1060439283; + and.b32 %r4635, %r4634, -8388608; + sub.s32 %r4636, %r625, %r4635; + cvt.rn.f32.s32 %r4637, %r4635; + fma.rn.ftz.f32 %r4639, %r4637, %r4605, %r4633; + add.f32 %r4640, %r4636, 0fBF800000; + fma.rn.ftz.f32 %r4643, %r4609, %r4640, %r4608; + fma.rn.ftz.f32 %r4645, %r4643, %r4640, %r4611; + fma.rn.ftz.f32 %r4647, %r4645, %r4640, %r4613; + fma.rn.ftz.f32 %r4649, %r4647, %r4640, %r4615; + fma.rn.ftz.f32 %r4651, %r4649, %r4640, %r4617; + fma.rn.ftz.f32 %r4653, %r4651, %r4640, %r4619; + fma.rn.ftz.f32 %r4655, %r4653, %r4640, %r4621; + fma.rn.ftz.f32 %r4657, %r4655, %r4640, %r4623; + fma.rn.ftz.f32 %r4659, %r4657, %r4640, %r4625; + mul.f32 %r4660, %r4640, %r4659; + mul.f32 %r4661, %r4640, %r4660; + fma.rn.ftz.f32 %r4663, %r4640, %r4629, %r4661; + add.f32 %r4977, %r4639, %r4663; + setp.lt.u32 %p197, %r625, 2139095040; + @%p197 bra $L__BB0_10; +// %bb.9: // %__nv_fmaf_rn.exit.i.i320 + .loc 1 0 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:0:33 + fma.rn.ftz.f32 %r4977, %r625, %r4631, %r4631; +$L__BB0_10: // %__nv_log2f.exit323 + .loc 1 223 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:223:33 + setp.eq.f32 %p199, %r625, 0f00000000; + setp.eq.f32 %p200, %r621, 0f00000000; + .loc 1 222 32 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:222:32 + shl.b32 %r4666, %r1, 16; + shl.b32 %r4667, %r2, 11; + add.s32 %r4668, %r4666, %r4667; + .loc 1 222 23 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:222:23 + mad.wide.s32 %rd280, %r4668, 4, %rd27; + .loc 1 144 46 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:144:46 + and.b32 %r4669, %r6, 127; + .loc 1 144 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:144:33 + or.b32 %r4670, %r5, %r4669; + .loc 1 222 40 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:222:40 + mad.wide.s32 %rd279, %r4670, 4, %rd280; + .loc 1 223 33 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:223:33 + selp.f32 %r4671, 0fFF800000, %r4976, %p200; + selp.f32 %r4672, 0fFF800000, %r4977, %p199; + .loc 1 223 20 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:223:20 + mov.b64 {%r4673, %r4674}, %rd284; + add.f32 %r4675, %r4674, %r4672; + add.f32 %r4676, %r4673, %r4671; + .loc 1 225 29 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:225:29 + bar.sync 0; + shl.b32 %r4677, %r620, 1; + add.s32 %r4679, %r722, %r4677; + st.shared.v2.b32 [%r4679], {%r4676, %r4675}; + bar.sync 0; + shl.b32 %r4680, %r28, 3; + shl.b32 %r4681, %r29, 2; + shr.u32 %r4682, %r30, 1; + add.s32 %r4683, %r722, %r4680; + add.s32 %r4684, %r4683, %r4681; + add.s32 %r4685, %r4684, %r4682; + ld.shared.b32 %r4665, [%r4685]; + and.b32 %r4686, %r6, 128; + setp.eq.b32 %p198, %r4686, 0; + // begin inline asm + @%p198 st.global.b32 [ %rd279 + 0 ], { %r4665 }; + // end inline asm + .loc 1 229 4 // c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py:229:4 + ret; +$L__tmp11: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 251 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xf4 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 112 +.b8 98 +.b8 107 +.b8 103 +.b8 53 +.b8 101 +.b8 113 +.b8 54 +.b8 52 +.b8 101 +.b8 109 +.b8 117 +.b8 118 +.b8 50 +.b8 53 +.b8 117 +.b8 107 +.b8 107 +.b8 105 +.b8 55 +.b8 97 +.b8 53 +.b8 100 +.b8 120 +.b8 118 +.b8 110 +.b8 50 +.b8 112 +.b8 50 +.b8 115 +.b8 104 +.b8 54 +.b8 106 +.b8 101 +.b8 105 +.b8 119 +.b8 98 +.b8 54 +.b8 98 +.b8 53 +.b8 52 +.b8 116 +.b8 98 +.b8 105 +.b8 100 +.b8 112 +.b8 115 +.b8 53 +.b8 119 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 112 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x15 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa0:0x5e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 146 // DW_AT_call_line +.b8 101 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xcd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 172 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xe5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 198 // DW_AT_call_line +.b8 45 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7286e97027255d8209586f15052762f4a1d2e9f6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.source @@ -0,0 +1,1132 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":18:0) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":271:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":448:0) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":299:0) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":256:0) +#loc220 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc222 = loc(unknown) +#loc225 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc229 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":240:0) +#loc258 = loc("arg_Q"(#loc)) +#loc259 = loc("arg_K"(#loc)) +#loc260 = loc("arg_V"(#loc)) +#loc261 = loc("arg_LSE"(#loc)) +#loc262 = loc("arg_MAX"(#loc)) +#loc263 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc264 = loc("arg_KV_IDX"(#loc)) +#loc265 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc266 = loc("arg_FULL_KV_IDX"(#loc)) +#loc267 = loc("in_ptr9"(#loc)) +#loc268 = loc("out_ptr0"(#loc)) +#loc367 = loc("ptr"(#loc122)) +#loc368 = loc("offs_m"(#loc122)) +#loc369 = loc("offs_n"(#loc122)) +#loc370 = loc("stride_m"(#loc122)) +#loc371 = loc("stride_n"(#loc122)) +#loc372 = loc("M_LEN"(#loc122)) +#loc379 = loc("x"(#loc132)) +#loc380 = loc("arg_Q"(#loc138)) +#loc381 = loc("arg_K"(#loc138)) +#loc382 = loc("arg_V"(#loc138)) +#loc383 = loc("arg_LSE"(#loc138)) +#loc384 = loc("arg_MAX"(#loc138)) +#loc385 = loc("arg_KV_NUM_BLKS"(#loc138)) +#loc386 = loc("arg_KV_IDX"(#loc138)) +#loc387 = loc("arg_FULL_KV_NUM_BLKS"(#loc138)) +#loc388 = loc("arg_FULL_KV_IDX"(#loc138)) +#loc389 = loc("in_ptr9"(#loc138)) +#loc390 = loc("out_ptr0"(#loc138)) +#loc391 = loc("q"(#loc138)) +#loc392 = loc("K"(#loc138)) +#loc393 = loc("V"(#loc138)) +#loc394 = loc("Q_LEN"(#loc138)) +#loc395 = loc("KV_LEN"(#loc138)) +#loc396 = loc("acc"(#loc138)) +#loc397 = loc("l_i"(#loc138)) +#loc398 = loc("m_i"(#loc138)) +#loc399 = loc("off_z"(#loc138)) +#loc400 = loc("off_h"(#loc138)) +#loc401 = loc("offs_m"(#loc138)) +#loc402 = loc("offs_n"(#loc138)) +#loc403 = loc("kv_start"(#loc138)) +#loc404 = loc("kv_indices"(#loc138)) +#loc405 = loc("kv_num_blocks"(#loc138)) +#loc406 = loc("block_n_end"(#loc138)) +#loc407 = loc("stride_kk"(#loc138)) +#loc408 = loc("stride_kn"(#loc138)) +#loc409 = loc("stride_vn"(#loc138)) +#loc410 = loc("stride_vk"(#loc138)) +#loc416 = loc("arg_Q"(#loc148)) +#loc417 = loc("arg_K"(#loc148)) +#loc418 = loc("arg_V"(#loc148)) +#loc419 = loc("arg_LSE"(#loc148)) +#loc420 = loc("arg_MAX"(#loc148)) +#loc421 = loc("arg_KV_NUM_BLKS"(#loc148)) +#loc422 = loc("arg_KV_IDX"(#loc148)) +#loc423 = loc("arg_FULL_KV_NUM_BLKS"(#loc148)) +#loc424 = loc("arg_FULL_KV_IDX"(#loc148)) +#loc425 = loc("in_ptr9"(#loc148)) +#loc426 = loc("out_ptr0"(#loc148)) +#loc427 = loc("q"(#loc148)) +#loc428 = loc("K"(#loc148)) +#loc429 = loc("V"(#loc148)) +#loc430 = loc("Q_LEN"(#loc148)) +#loc431 = loc("KV_LEN"(#loc148)) +#loc432 = loc("acc"(#loc148)) +#loc433 = loc("l_i"(#loc148)) +#loc434 = loc("m_i"(#loc148)) +#loc435 = loc("off_z"(#loc148)) +#loc436 = loc("off_h"(#loc148)) +#loc437 = loc("offs_m"(#loc148)) +#loc438 = loc("offs_n"(#loc148)) +#loc439 = loc("kv_start"(#loc148)) +#loc440 = loc("kv_offset"(#loc148)) +#loc441 = loc("stride_kk"(#loc148)) +#loc442 = loc("stride_kn"(#loc148)) +#loc443 = loc("stride_vn"(#loc148)) +#loc444 = loc("stride_vk"(#loc148)) +#loc511 = loc("indices"(#loc217)) +#loc512 = loc("input"(#loc220)) +#loc513 = loc("a"(#loc225)) +#loc514 = loc("b"(#loc225)) +#loc515 = loc("input"(#loc229)) +#loc516 = loc("a"(#loc233)) +#loc517 = loc("b"(#loc233)) +#loc518 = loc("loop_iter"(#loc237)) +#loc519 = loc("col_indices"(#loc237)) +#loc520 = loc("total_blocks"(#loc237)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc2) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc2) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc2) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc2) + %c2097152_i32_2 = arith.constant 2097152 : i32 loc(#loc3) + %c262144_i32_3 = arith.constant 262144 : i32 loc(#loc3) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc3) + %c1_i32_5 = arith.constant 1 : i32 loc(#loc3) + %ZQ = arith.constant 8 : i32 loc(#loc269) + %HQ = arith.constant 32 : i32 loc(#loc270) + %Q_LEN = arith.constant 2048 : i32 loc(#loc271) + %ZKV = arith.constant 8 : i32 loc(#loc272) + %KV_LEN = arith.constant 2048 : i32 loc(#loc273) + %q_start = tt.get_program_id x : i32 loc(#loc274) + %off_zq = tt.get_program_id y : i32 loc(#loc275) + %off_hq = tt.get_program_id z : i32 loc(#loc276) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc277) + %off_hkv = arith.constant 4 : i32 loc(#loc278) + %off_hkv_6 = arith.constant 4 : i32 loc(#loc278) + %off_hkv_7 = arith.divsi %off_hq, %off_hkv_6 : i32 loc(#loc278) + %off_g = arith.constant 4 : i32 loc(#loc279) + %off_g_8 = arith.constant 4 : i32 loc(#loc279) + %off_g_9 = arith.remsi %off_hq, %off_g_8 : i32 loc(#loc279) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc280) + %q_offset_10 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc281) + %q_offset_11 = arith.addi %q_offset, %q_offset_10 : i32 loc(#loc282) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc283) + %k_offset_12 = arith.muli %off_hkv_7, %c262144_i32 : i32 loc(#loc284) + %k_offset_13 = arith.addi %k_offset, %k_offset_12 : i32 loc(#loc285) + %v_offset = arith.muli %off_zkv, %c2097152_i32_2 : i32 loc(#loc286) + %v_offset_14 = arith.muli %off_hkv_7, %c262144_i32_3 : i32 loc(#loc287) + %v_offset_15 = arith.addi %v_offset, %v_offset_14 : i32 loc(#loc288) + %Q = tt.addptr %arg_Q, %q_offset_11 : !tt.ptr, i32 loc(#loc289) + %K = tt.addptr %arg_K, %k_offset_13 : !tt.ptr, i32 loc(#loc290) + %V = tt.addptr %arg_V, %v_offset_15 : !tt.ptr, i32 loc(#loc291) + %SPARSE_Z = arith.constant 8 : i32 loc(#loc292) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc293) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc294) + %sparse_idx_hq = arith.remsi %off_hq, %SPARSE_HQ : i32 loc(#loc295) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc296) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc297) + %stride_kv_idx_m = arith.constant 16 : i32 loc(#loc298) + %m_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc299) + %m_i_16 = arith.constant 0x7F800000 : f32 loc(#loc300) + %m_i_17 = arith.constant 0x7F800000 : f32 loc(#loc300) + %m_i_18 = arith.constant dense<0x7F800000> : tensor<128xf32> loc(#loc300) + %m_i_19 = arith.subf %m_i, %m_i_18 : tensor<128xf32> loc(#loc300) + %l_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc301) + %acc = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc302) + %offs_m = arith.constant 128 : i32 loc(#loc303) + %offs_m_20 = arith.constant 128 : i32 loc(#loc303) + %offs_m_21 = arith.muli %q_start, %offs_m_20 : i32 loc(#loc303) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc304) + %offs_m_23 = tt.splat %offs_m_21 : i32 -> tensor<128xi32> loc(#loc305) + %offs_m_24 = arith.addi %offs_m_23, %offs_m_22 : tensor<128xi32> loc(#loc305) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc306) + %sparse_hz_offset_25 = arith.addi %sparse_hz_offset, %sparse_idx_hq : i32 loc(#loc307) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_25, %stride_kv_num_blks_h : i32 loc(#loc308) + %sparse_kv_num_blks_offset_26 = arith.constant 1 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_27 = arith.constant 1 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_28 = arith.divsi %q_start, %sparse_kv_num_blks_offset_27 : i32 loc(#loc309) + %sparse_kv_num_blks_offset_29 = arith.addi %sparse_kv_num_blks_offset, %sparse_kv_num_blks_offset_28 : i32 loc(#loc310) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_25, %stride_kv_idx_h : i32 loc(#loc311) + %sparse_kv_idx_offset_30 = arith.constant 1 : i32 loc(#loc312) + %sparse_kv_idx_offset_31 = arith.constant 1 : i32 loc(#loc312) + %sparse_kv_idx_offset_32 = arith.divsi %q_start, %sparse_kv_idx_offset_31 : i32 loc(#loc312) + %sparse_kv_idx_offset_33 = arith.muli %sparse_kv_idx_offset_32, %stride_kv_idx_m : i32 loc(#loc313) + %sparse_kv_idx_offset_34 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_33 : i32 loc(#loc314) + %offs_m_35 = arith.constant 128 : i32 loc(#loc315) + %offs_m_36 = arith.constant 128 : i32 loc(#loc315) + %offs_m_37 = arith.muli %q_start, %offs_m_36 : i32 loc(#loc315) + %offs_m_38 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc316) + %offs_m_39 = tt.splat %offs_m_37 : i32 -> tensor<128xi32> loc(#loc317) + %offs_m_40 = arith.addi %offs_m_39, %offs_m_38 : tensor<128xi32> loc(#loc317) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc318) + %q = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q, %offs_m_40, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc319) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc320) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc321) + %kv_start_41 = arith.constant 128 : i32 loc(#loc322) + %kv_start_42 = arith.constant 128 : i32 loc(#loc322) + %kv_start_43 = arith.muli %kv_start, %kv_start_42 : i32 loc(#loc322) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_29 : !tt.ptr, i32 loc(#loc323) + %kv_num_blocks_44 = tt.load %kv_num_blocks : !tt.ptr loc(#loc324) + %block_n_end = arith.constant 2 : i32 loc(#loc325) + %block_n_end_45 = arith.constant 2 : i32 loc(#loc325) + %block_n_end_46 = arith.muli %kv_num_blocks_44, %block_n_end_45 : i32 loc(#loc325) + %block_n_end_47 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc326) + %block_n_end_48 = arith.constant 1 : i32 loc(#loc327) + %block_n_end_49 = arith.maxsi %block_n_end_47, %block_n_end_48 : i32 loc(#loc327) + %block_n_end_50 = arith.minsi %block_n_end_46, %block_n_end_49 : i32 loc(#loc328) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc329) + %offs_n_51 = tt.splat %kv_start_43 : i32 -> tensor<64xi32> loc(#loc330) + %offs_n_52 = arith.addi %offs_n_51, %offs_n : tensor<64xi32> loc(#loc330) + %0 = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc66) + %1 = tt.expand_dims %offs_n_52 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %2:3 = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc, %l_i, %m_i_19, %off_zq, %off_hq, %0, %1, %kv_start_43, %kv_indices, %kv_num_blocks_44, %block_n_end_50, %c1_i32_1, %c128_i32_0, %c128_i32_4, %c1_i32_5) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc68) + %kv_indices_53 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc331) + %kv_start_54 = tt.load %kv_indices_53 : !tt.ptr loc(#loc332) + %kv_start_55 = arith.constant 128 : i32 loc(#loc333) + %kv_start_56 = arith.constant 128 : i32 loc(#loc333) + %kv_start_57 = arith.muli %kv_start_54, %kv_start_56 : i32 loc(#loc333) + %kv_num_blocks_58 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_29 : !tt.ptr, i32 loc(#loc334) + %kv_num_blocks_59 = tt.load %kv_num_blocks_58 : !tt.ptr loc(#loc335) + %block_n_end_60 = arith.constant 2 : i32 loc(#loc336) + %block_n_end_61 = arith.constant 2 : i32 loc(#loc336) + %block_n_end_62 = arith.muli %kv_num_blocks_59, %block_n_end_61 : i32 loc(#loc336) + %block_n_end_63 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc337) + %block_n_end_64 = arith.constant 1 : i32 loc(#loc338) + %block_n_end_65 = arith.maxsi %block_n_end_63, %block_n_end_64 : i32 loc(#loc338) + %block_n_end_66 = arith.minsi %block_n_end_62, %block_n_end_65 : i32 loc(#loc339) + %offs_n_67 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc340) + %offs_n_68 = tt.splat %kv_start_57 : i32 -> tensor<64xi32> loc(#loc341) + %offs_n_69 = arith.addi %offs_n_68, %offs_n_67 : tensor<64xi32> loc(#loc341) + %3 = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc80) + %4 = tt.expand_dims %offs_n_69 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc81) + %5:3 = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %2#0, %2#1, %2#2, %off_zq, %off_hq, %3, %4, %kv_start_57, %kv_indices_53, %kv_num_blocks_59, %block_n_end_66, %c1_i32_1, %c128_i32_0, %c128_i32_4, %c1_i32_5) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc82) + %l_i_70 = arith.constant 0.000000e+00 : f32 loc(#loc342) + %l_i_71 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc342) + %l_i_72 = arith.cmpf oeq, %5#1, %l_i_71 : tensor<128xf32> loc(#loc342) + %l_i_73 = arith.constant 1 : i32 loc(#loc343) + %l_i_74 = arith.constant 1.000000e+00 : f32 loc(#loc343) + %l_i_75 = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc343) + %l_i_76 = arith.select %l_i_72, %l_i_75, %5#1 : tensor<128xi1>, tensor<128xf32> loc(#loc343) + %acc_77 = tt.expand_dims %l_i_76 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc344) + %acc_78 = tt.broadcast %acc_77 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc345) + %acc_79 = arith.divf %5#0, %acc_78 : tensor<128x128xf32> loc(#loc345) + %idx_zq = tt.get_program_id y : i32 loc(#loc346) + %idx_hq = tt.get_program_id z : i32 loc(#loc347) + %idx_m = tt.expand_dims %offs_m_40 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc348) + %idx_d = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc349) + %idx_d_80 = tt.expand_dims %idx_d {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc350) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc351) + %mask_81 = arith.cmpi slt, %idx_m, %mask : tensor<128x1xi32> loc(#loc351) + %mask_82 = arith.constant 128 : i32 loc(#loc352) + %mask_83 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc352) + %mask_84 = arith.cmpi slt, %idx_d_80, %mask_83 : tensor<1x128xi32> loc(#loc352) + %mask_85 = tt.broadcast %mask_81 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc353) + %mask_86 = tt.broadcast %mask_84 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc353) + %mask_87 = arith.andi %mask_85, %mask_86 : tensor<128x128xi1> loc(#loc353) + %xindex = arith.constant 128 : i32 loc(#loc354) + %xindex_88 = arith.constant 128 : i32 loc(#loc354) + %xindex_89 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc354) + %xindex_90 = arith.muli %xindex_89, %idx_m : tensor<128x1xi32> loc(#loc354) + %xindex_91 = tt.broadcast %idx_d_80 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc355) + %xindex_92 = tt.broadcast %xindex_90 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc355) + %xindex_93 = arith.addi %xindex_91, %xindex_92 : tensor<128x128xi32> loc(#loc355) + %xindex_94 = arith.constant 262144 : i32 loc(#loc356) + %xindex_95 = arith.constant 262144 : i32 loc(#loc356) + %xindex_96 = arith.muli %xindex_95, %idx_hq : i32 loc(#loc356) + %xindex_97 = tt.splat %xindex_96 : i32 -> tensor<128x128xi32> loc(#loc357) + %xindex_98 = arith.addi %xindex_93, %xindex_97 : tensor<128x128xi32> loc(#loc357) + %xindex_99 = arith.constant 8388608 : i32 loc(#loc358) + %xindex_100 = arith.constant 8388608 : i32 loc(#loc358) + %xindex_101 = arith.muli %xindex_100, %idx_zq : i32 loc(#loc358) + %xindex_102 = tt.splat %xindex_101 : i32 -> tensor<128x128xi32> loc(#loc359) + %xindex_103 = arith.addi %xindex_98, %xindex_102 : tensor<128x128xi32> loc(#loc359) + %c128_i32_104 = arith.constant 128 : i32 loc(#loc101) + %c128_i32_105 = arith.constant 128 : i32 loc(#loc101) + %6 = arith.muli %c128_i32_105, %idx_hq : i32 loc(#loc101) + %7 = tt.splat %6 : i32 -> tensor<1x128xi32> loc(#loc102) + %8 = arith.addi %idx_d_80, %7 : tensor<1x128xi32> loc(#loc102) + %c4096_i32_106 = arith.constant 4096 : i32 loc(#loc103) + %c4096_i32_107 = arith.constant 4096 : i32 loc(#loc103) + %cst = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc103) + %9 = arith.muli %cst, %idx_m : tensor<128x1xi32> loc(#loc103) + %10 = tt.broadcast %8 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc104) + %11 = tt.broadcast %9 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc104) + %12 = arith.addi %10, %11 : tensor<128x128xi32> loc(#loc104) + %c8388608_i32_108 = arith.constant 8388608 : i32 loc(#loc105) + %c8388608_i32_109 = arith.constant 8388608 : i32 loc(#loc105) + %13 = arith.muli %c8388608_i32_109, %idx_zq : i32 loc(#loc105) + %14 = tt.splat %13 : i32 -> tensor<128x128xi32> loc(#loc106) + %15 = arith.addi %12, %14 : tensor<128x128xi32> loc(#loc106) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc107) + %17 = tt.addptr %16, %15 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc107) + %18 = arith.truncf %acc_79 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc108) + tt.store %17, %18, %mask_87 : tensor<128x128x!tt.ptr> loc(#loc108) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc360) + %off_hz_110 = arith.addi %off_hz, %off_hq : i32 loc(#loc361) + %l_ptrs = arith.muli %off_hz_110, %Q_LEN : i32 loc(#loc362) + %l_ptrs_111 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc363) + %l_ptrs_112 = tt.splat %l_ptrs_111 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc364) + %l_ptrs_113 = tt.addptr %l_ptrs_112, %offs_m_40 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc364) + %lse = math.log2 %l_i_76 : tensor<128xf32> loc(#loc365) + %lse_114 = arith.addf %5#2, %lse : tensor<128xf32> loc(#loc366) + tt.store %l_ptrs_113, %lse_114 : tensor<128x!tt.ptr> loc(#loc116) + tt.return loc(#loc117) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc119) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc119) + tt.return %cst_0 : tensor<128xf32> loc(#loc120) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128xf32> loc(#loc121) + tt.return %0 : tensor<128xf32> loc(#loc121) + } loc(#loc118) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc119) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc119) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc120) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc121) + tt.return %0 : tensor<128x128xf32> loc(#loc121) + } loc(#loc118) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc122)), %offs_m: tensor<128xi32> loc("offs_m"(#loc122)), %offs_n: tensor<128xi32> loc("offs_n"(#loc122)), %stride_m: i32 loc("stride_m"(#loc122)), %stride_n: i32 loc("stride_n"(#loc122)), %M_LEN: i32 loc("M_LEN"(#loc122))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc373) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc374) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc374) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc375) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc375) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc376) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc377) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc377) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc378) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc378) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc378) + %0 = tt.load %ptr_10 : tensor<128x128x!tt.ptr> loc(#loc129) + tt.return %0 : tensor<128x128xbf16> loc(#loc130) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x128xbf16> loc(#loc131) + tt.return %1 : tensor<128x128xbf16> loc(#loc131) + } loc(#loc122) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc132))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc133) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc133) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc133) + %c1_i32 = arith.constant 1 : i32 loc(#loc134) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc134) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc134) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc135) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc135) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc135) + tt.return %2 : i32 loc(#loc136) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc137) + tt.return %3 : i32 loc(#loc137) + } loc(#loc132) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc138)), %arg_K: !tt.ptr loc("arg_K"(#loc138)), %arg_V: !tt.ptr loc("arg_V"(#loc138)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc138)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc138)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc138)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc138)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc138)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc138)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc138)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc138)), %q: tensor<128x128xbf16> loc("q"(#loc138)), %K: !tt.ptr loc("K"(#loc138)), %V: !tt.ptr loc("V"(#loc138)), %Q_LEN: i32 loc("Q_LEN"(#loc138)), %KV_LEN: i32 loc("KV_LEN"(#loc138)), %acc: tensor<128x128xf32> loc("acc"(#loc138)), %l_i: tensor<128xf32> loc("l_i"(#loc138)), %m_i: tensor<128xf32> loc("m_i"(#loc138)), %off_z: i32 loc("off_z"(#loc138)), %off_h: i32 loc("off_h"(#loc138)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc138)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc138)), %kv_start: i32 loc("kv_start"(#loc138)), %kv_indices: !tt.ptr loc("kv_indices"(#loc138)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc138)), %block_n_end: i32 loc("block_n_end"(#loc138)), %stride_kk: i32 loc("stride_kk"(#loc138)), %stride_kn: i32 loc("stride_kn"(#loc138)), %stride_vn: i32 loc("stride_vn"(#loc138)), %stride_vk: i32 loc("stride_vk"(#loc138))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc411) + %c0_i32 = arith.constant 0 : i32 loc(#loc140) + %c1_i32 = arith.constant 1 : i32 loc(#loc140) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc140) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc140) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc140) + %3 = ub.poison : i32 loc(#loc140) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_False__(34,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc141) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc413) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc414) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc414) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc415) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc145) + } loc(#loc542) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc146) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc147) + %5 = ub.poison : tensor<128xf32> loc(#loc147) + %6 = ub.poison : tensor<128xf32> loc(#loc147) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc147) + } loc(#loc138) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_False__(34,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc148)), %arg_K: !tt.ptr loc("arg_K"(#loc148)), %arg_V: !tt.ptr loc("arg_V"(#loc148)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc148)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc148)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc148)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc148)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc148)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc148)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc148)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc148)), %q: tensor<128x128xbf16> loc("q"(#loc148)), %K: !tt.ptr loc("K"(#loc148)), %V: !tt.ptr loc("V"(#loc148)), %Q_LEN: i32 loc("Q_LEN"(#loc148)), %KV_LEN: i32 loc("KV_LEN"(#loc148)), %acc: tensor<128x128xf32> loc("acc"(#loc148)), %l_i: tensor<128xf32> loc("l_i"(#loc148)), %m_i: tensor<128xf32> loc("m_i"(#loc148)), %off_z: i32 loc("off_z"(#loc148)), %off_h: i32 loc("off_h"(#loc148)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc148)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc148)), %kv_start: i32 loc("kv_start"(#loc148)), %kv_offset: i32 loc("kv_offset"(#loc148)), %stride_kk: i32 loc("stride_kk"(#loc148)), %stride_kn: i32 loc("stride_kn"(#loc148)), %stride_vn: i32 loc("stride_vn"(#loc148)), %stride_vk: i32 loc("stride_vk"(#loc148))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc445) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc446) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc447) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc448) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc448) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc449) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc450) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc451) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc451) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc451) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc452) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc452) + %m = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%offs_m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc453) + %n = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%offs_n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc454) + %tmp1 = arith.constant false loc(#loc455) + %tmp1_9 = arith.constant dense : tensor<1xi1> loc(#loc455) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc456) + %tmp4_10 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc456) + %tmp4_11 = arith.cmpi sge, %tmp4, %tmp4_10 : tensor<128x64xi32> loc(#loc456) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc457) + %tmp7 = tt.addptr %in_ptr9, %off_z : !tt.ptr, i32 loc(#loc458) + %tmp7_12 = tt.load %tmp7 : !tt.ptr loc(#loc459) + %tmp8 = tt.splat %tmp7_12 : i64 -> tensor<1x64xi64> loc(#loc460) + %tmp8_13 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc460) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc461) + %tmp10 = tt.splat %tmp7_12 : i64 -> tensor<128x1xi64> loc(#loc462) + %tmp10_14 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc462) + %tmp11 = tt.broadcast %tmp8_13 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc463) + %tmp11_15 = tt.broadcast %tmp10_14 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc463) + %tmp11_16 = arith.andi %tmp11, %tmp11_15 : tensor<128x64xi1> loc(#loc463) + %tmp12 = arith.andi %tmp4_11, %tmp11_16 : tensor<128x64xi1> loc(#loc464) + %tmp13 = tt.expand_dims %tmp1_9 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc465) + %tmp13_17 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc465) + %tmp13_18 = arith.ori %tmp13_17, %tmp12 : tensor<128x64xi1> loc(#loc465) + %tmp14 = arith.constant 2048 : i32 loc(#loc466) + %tmp14_19 = arith.constant dense<2048> : tensor<1xi32> loc(#loc466) + %tmp15 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc467) + %tmp15_20 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc467) + %tmp15_21 = arith.cmpi sge, %n, %tmp15_20 : tensor<1x64xi32> loc(#loc467) + %tmp16 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc468) + %tmp16_22 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc468) + %tmp16_23 = arith.remsi %n, %tmp16_22 : tensor<1x64xi32> loc(#loc468) + %tmp17 = arith.constant 0 : i32 loc(#loc469) + %tmp17_24 = arith.constant dense<0> : tensor<1xi32> loc(#loc469) + %tmp18 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc470) + %tmp18_25 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc470) + %tmp18_26 = arith.cmpi ne, %tmp16_23, %tmp18_25 : tensor<1x64xi32> loc(#loc470) + %tmp19 = arith.constant 0 : i32 loc(#loc471) + %tmp19_27 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc471) + %tmp19_28 = arith.cmpi slt, %tmp16_23, %tmp19_27 : tensor<1x64xi32> loc(#loc471) + %tmp20 = arith.constant 0 : i32 loc(#loc472) + %tmp20_29 = arith.constant dense<0> : tensor<1xi32> loc(#loc472) + %tmp20_30 = arith.cmpi slt, %tmp14_19, %tmp20_29 : tensor<1xi32> loc(#loc472) + %tmp21 = tt.expand_dims %tmp20_30 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc473) + %tmp21_31 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc473) + %tmp21_32 = arith.cmpi ne, %tmp19_28, %tmp21_31 : tensor<1x64xi1> loc(#loc473) + %tmp22 = arith.andi %tmp18_26, %tmp21_32 : tensor<1x64xi1> loc(#loc474) + %tmp23 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc475) + %tmp23_33 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc475) + %tmp23_34 = arith.addi %tmp16_23, %tmp23_33 : tensor<1x64xi32> loc(#loc475) + %tmp24 = arith.select %tmp22, %tmp23_34, %tmp16_23 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc476) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc477) + %tmp26 = tt.splat %tmp7_12 : i64 -> tensor<1x64xi64> loc(#loc478) + %tmp26_35 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc478) + %tmp27 = arith.andi %tmp15_21, %tmp26_35 : tensor<1x64xi1> loc(#loc479) + %tmp28 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc480) + %tmp28_36 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc480) + %tmp28_37 = arith.subi %tmp28, %tmp28_36 : tensor<128x64xi32> loc(#loc480) + %tmp29 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc481) + %tmp29_38 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc481) + %tmp29_39 = arith.remsi %tmp28_37, %tmp29_38 : tensor<128x64xi32> loc(#loc481) + %tmp30 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc482) + %tmp30_40 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc482) + %tmp30_41 = arith.cmpi ne, %tmp29_39, %tmp30_40 : tensor<128x64xi32> loc(#loc482) + %tmp31 = arith.constant 0 : i32 loc(#loc483) + %tmp31_42 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc483) + %tmp31_43 = arith.cmpi slt, %tmp29_39, %tmp31_42 : tensor<128x64xi32> loc(#loc483) + %tmp32 = tt.expand_dims %tmp20_30 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc484) + %tmp32_44 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc484) + %tmp32_45 = arith.cmpi ne, %tmp31_43, %tmp32_44 : tensor<128x64xi1> loc(#loc484) + %tmp33 = arith.andi %tmp30_41, %tmp32_45 : tensor<128x64xi1> loc(#loc485) + %tmp34 = tt.expand_dims %tmp14_19 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc486) + %tmp34_46 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc486) + %tmp34_47 = arith.addi %tmp29_39, %tmp34_46 : tensor<128x64xi32> loc(#loc486) + %tmp35 = arith.select %tmp33, %tmp34_47, %tmp29_39 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc487) + %tmp36 = tt.expand_dims %tmp17_24 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc488) + %tmp36_48 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc488) + %tmp36_49 = arith.cmpi eq, %tmp35, %tmp36_48 : tensor<128x64xi32> loc(#loc488) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc489) + %tmp37_50 = arith.andi %tmp37, %tmp36_49 : tensor<128x64xi1> loc(#loc489) + %tmp38 = arith.ori %tmp13_18, %tmp37_50 : tensor<128x64xi1> loc(#loc490) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc491) + %post_mod_scores_51 = arith.constant 0xFF800000 : f32 loc(#loc491) + %post_mod_scores_52 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc491) + %post_mod_scores_53 = arith.select %tmp38, %qk_8, %post_mod_scores_52 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc491) + %post_mod_scores_54 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_55 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_56 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc492) + %post_mod_scores_57 = arith.mulf %post_mod_scores_53, %post_mod_scores_56 : tensor<128x64xf32> loc(#loc492) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_57) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc493) + %m_ij_58 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc494) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc495) + %masked_out_rows_59 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc495) + %masked_out_rows_60 = arith.cmpf oeq, %m_ij_58, %masked_out_rows_59 : tensor<128xf32> loc(#loc495) + %m_ij_masked = arith.constant 0 : i32 loc(#loc496) + %m_ij_masked_61 = arith.constant 0.000000e+00 : f32 loc(#loc496) + %m_ij_masked_62 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc496) + %m_ij_masked_63 = arith.select %masked_out_rows_60, %m_ij_masked_62, %m_ij_58 : tensor<128xi1>, tensor<128xf32> loc(#loc496) + %alpha = arith.subf %m_i, %m_ij_masked_63 : tensor<128xf32> loc(#loc497) + %alpha_64 = math.exp2 %alpha : tensor<128xf32> loc(#loc498) + %p = tt.expand_dims %m_ij_masked_63 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc499) + %p_65 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc500) + %p_66 = arith.subf %post_mod_scores_57, %p_65 : tensor<128x64xf32> loc(#loc500) + %p_67 = math.exp2 %p_66 : tensor<128x64xf32> loc(#loc501) + %l_i_68 = arith.mulf %l_i, %alpha_64 : tensor<128xf32> loc(#loc502) + %l_i_69 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_67) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc503) + %l_i_70 = arith.addf %l_i_68, %l_i_69 : tensor<128xf32> loc(#loc504) + %acc_71 = tt.expand_dims %alpha_64 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc505) + %acc_72 = tt.broadcast %acc_71 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc506) + %acc_73 = arith.mulf %acc, %acc_72 : tensor<128x128xf32> loc(#loc506) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc507) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc508) + %acc_74 = arith.truncf %p_67 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc509) + %acc_75 = arith.constant 0.000000e+00 : f32 loc(#loc510) + %acc_76 = tt.dot %acc_74, %v, %acc_73, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc510) + tt.return %acc_76, %l_i_70, %m_ij_58 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc215) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc216) + %1 = ub.poison : tensor<128xf32> loc(#loc216) + %2 = ub.poison : tensor<128xf32> loc(#loc216) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc216) + } loc(#loc148) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc122)), %offs_m: tensor<64xi32> loc("offs_m"(#loc122)), %offs_n: tensor<128xi32> loc("offs_n"(#loc122)), %stride_m: i32 loc("stride_m"(#loc122)), %stride_n: i32 loc("stride_n"(#loc122)), %M_LEN: i32 loc("M_LEN"(#loc122))) -> tensor<64x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc373) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<64x1xi32> loc(#loc374) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<64x1xi32> loc(#loc374) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc375) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc375) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc376) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc377) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc377) + %ptr_8 = tt.broadcast %ptr_4 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc378) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc378) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc378) + %0 = tt.load %ptr_10 : tensor<64x128x!tt.ptr> loc(#loc129) + tt.return %0 : tensor<64x128xbf16> loc(#loc130) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x128xbf16> loc(#loc131) + tt.return %1 : tensor<64x128xbf16> loc(#loc131) + } loc(#loc122) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%indices: tensor<128x1xi32> loc("indices"(#loc217))) -> tensor<128x1xi32> attributes {noinline = false} { + tt.return %indices : tensor<128x1xi32> loc(#loc218) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x1xi32> loc(#loc219) + tt.return %0 : tensor<128x1xi32> loc(#loc219) + } loc(#loc217) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%indices: tensor<1x64xi32> loc("indices"(#loc217))) -> tensor<1x64xi32> attributes {noinline = false} { + tt.return %indices : tensor<1x64xi32> loc(#loc218) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x64xi32> loc(#loc219) + tt.return %0 : tensor<1x64xi32> loc(#loc219) + } loc(#loc217) + tt.func private @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<128x64xf32> loc("input"(#loc220))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc221) + tt.reduce.return %2 : f32 loc(#loc221) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc221) + tt.return %0 : tensor<128xf32> loc(#loc223) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc224) + tt.return %1 : tensor<128xf32> loc(#loc224) + } loc(#loc220) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc225)), %b: f32 loc("b"(#loc225))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc226) + tt.return %0 : f32 loc(#loc227) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc228) + tt.return %1 : f32 loc(#loc228) + } loc(#loc225) + tt.func private @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x64xf32> loc("input"(#loc229))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc230) + tt.reduce.return %2 : f32 loc(#loc230) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc230) + tt.return %0 : tensor<128xf32> loc(#loc231) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc232) + tt.return %1 : tensor<128xf32> loc(#loc232) + } loc(#loc229) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc233)), %b: f32 loc("b"(#loc233))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc234) + tt.return %0 : f32 loc(#loc235) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc236) + tt.return %1 : f32 loc(#loc236) + } loc(#loc233) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc237)), %col_indices: !tt.ptr loc("col_indices"(#loc237)), %total_blocks: i32 loc("total_blocks"(#loc237))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc521) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc521) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc521) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc522) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc523) + %next_block = arith.constant 1 : i32 loc(#loc524) + %next_block_3 = arith.constant 1 : i32 loc(#loc524) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc524) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc525) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc526) + %next_block_7 = arith.constant 1 : i32 loc(#loc527) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc527) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc528) + %needs_jump = arith.constant 1 : i32 loc(#loc529) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc529) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc529) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc530) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc530) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc530) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc531) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc531) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc532) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc533) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc533) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc533) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc534) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc534) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc534) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc535) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc535) + %offset_24 = arith.constant 1 : i32 loc(#loc536) + %offset_25 = arith.constant 1 : i32 loc(#loc536) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc536) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc536) + %offset_28 = arith.constant 64 : i32 loc(#loc537) + %offset_29 = arith.constant 64 : i32 loc(#loc537) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc537) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc538) + tt.return %offset_31 : i32 loc(#loc256) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc257) + tt.return %0 : i32 loc(#loc257) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(14,)cNone_(15,)cNone_(28,)cconstexpr_0__(30,)cconstexpr_bf16__(35,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc138)), %arg_K: !tt.ptr loc("arg_K"(#loc138)), %arg_V: !tt.ptr loc("arg_V"(#loc138)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc138)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc138)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc138)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc138)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc138)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc138)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc138)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc138)), %q: tensor<128x128xbf16> loc("q"(#loc138)), %K: !tt.ptr loc("K"(#loc138)), %V: !tt.ptr loc("V"(#loc138)), %Q_LEN: i32 loc("Q_LEN"(#loc138)), %KV_LEN: i32 loc("KV_LEN"(#loc138)), %acc: tensor<128x128xf32> loc("acc"(#loc138)), %l_i: tensor<128xf32> loc("l_i"(#loc138)), %m_i: tensor<128xf32> loc("m_i"(#loc138)), %off_z: i32 loc("off_z"(#loc138)), %off_h: i32 loc("off_h"(#loc138)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc138)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc138)), %kv_start: i32 loc("kv_start"(#loc138)), %kv_indices: !tt.ptr loc("kv_indices"(#loc138)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc138)), %block_n_end: i32 loc("block_n_end"(#loc138)), %stride_kk: i32 loc("stride_kk"(#loc138)), %stride_kn: i32 loc("stride_kn"(#loc138)), %stride_vn: i32 loc("stride_vn"(#loc138)), %stride_vk: i32 loc("stride_vk"(#loc138))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc411) + %c0_i32 = arith.constant 0 : i32 loc(#loc140) + %c1_i32 = arith.constant 1 : i32 loc(#loc140) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc140) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc140) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc140) + %3 = ub.poison : i32 loc(#loc140) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_True__(34,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc141) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc413) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc414) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc414) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc415) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc145) + } loc(#loc542) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc146) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc147) + %5 = ub.poison : tensor<128xf32> loc(#loc147) + %6 = ub.poison : tensor<128xf32> loc(#loc147) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc147) + } loc(#loc138) + tt.func private @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(14,)cconstexpr_None__(15,)cconstexpr_None__(27,)cconstexpr_bf16__(28,)cconstexpr_1_d_44269504__(33,)cconstexpr_True__(34,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc148)), %arg_K: !tt.ptr loc("arg_K"(#loc148)), %arg_V: !tt.ptr loc("arg_V"(#loc148)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc148)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc148)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc148)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc148)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc148)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc148)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc148)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc148)), %q: tensor<128x128xbf16> loc("q"(#loc148)), %K: !tt.ptr loc("K"(#loc148)), %V: !tt.ptr loc("V"(#loc148)), %Q_LEN: i32 loc("Q_LEN"(#loc148)), %KV_LEN: i32 loc("KV_LEN"(#loc148)), %acc: tensor<128x128xf32> loc("acc"(#loc148)), %l_i: tensor<128xf32> loc("l_i"(#loc148)), %m_i: tensor<128xf32> loc("m_i"(#loc148)), %off_z: i32 loc("off_z"(#loc148)), %off_h: i32 loc("off_h"(#loc148)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc148)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc148)), %kv_start: i32 loc("kv_start"(#loc148)), %kv_offset: i32 loc("kv_offset"(#loc148)), %stride_kk: i32 loc("stride_kk"(#loc148)), %stride_kn: i32 loc("stride_kn"(#loc148)), %stride_vn: i32 loc("stride_vn"(#loc148)), %stride_vk: i32 loc("stride_vk"(#loc148))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc445) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc446) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc447) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc448) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc448) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc449) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc450) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc451) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc451) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc451) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc452) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc452) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc452) + %m = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%offs_m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc453) + %n = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%offs_n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc454) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_9 = arith.constant 1.44269502 : f32 loc(#loc492) + %post_mod_scores_10 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc492) + %post_mod_scores_11 = arith.mulf %qk_8, %post_mod_scores_10 : tensor<128x64xf32> loc(#loc492) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_11) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc493) + %m_ij_12 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc494) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc495) + %masked_out_rows_13 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc495) + %masked_out_rows_14 = arith.cmpf oeq, %m_ij_12, %masked_out_rows_13 : tensor<128xf32> loc(#loc495) + %m_ij_masked = arith.constant 0 : i32 loc(#loc496) + %m_ij_masked_15 = arith.constant 0.000000e+00 : f32 loc(#loc496) + %m_ij_masked_16 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc496) + %m_ij_masked_17 = arith.select %masked_out_rows_14, %m_ij_masked_16, %m_ij_12 : tensor<128xi1>, tensor<128xf32> loc(#loc496) + %alpha = arith.subf %m_i, %m_ij_masked_17 : tensor<128xf32> loc(#loc497) + %alpha_18 = math.exp2 %alpha : tensor<128xf32> loc(#loc498) + %p = tt.expand_dims %m_ij_masked_17 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc499) + %p_19 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc500) + %p_20 = arith.subf %post_mod_scores_11, %p_19 : tensor<128x64xf32> loc(#loc500) + %p_21 = math.exp2 %p_20 : tensor<128x64xf32> loc(#loc501) + %l_i_22 = arith.mulf %l_i, %alpha_18 : tensor<128xf32> loc(#loc502) + %l_i_23 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_21) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc503) + %l_i_24 = arith.addf %l_i_22, %l_i_23 : tensor<128xf32> loc(#loc504) + %acc_25 = tt.expand_dims %alpha_18 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc505) + %acc_26 = tt.broadcast %acc_25 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc506) + %acc_27 = arith.mulf %acc, %acc_26 : tensor<128x128xf32> loc(#loc506) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc507) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc508) + %acc_28 = arith.truncf %p_21 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc509) + %acc_29 = arith.constant 0.000000e+00 : f32 loc(#loc510) + %acc_30 = tt.dot %acc_28, %v, %acc_27, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc510) + tt.return %acc_30, %l_i_24, %m_ij_12 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc215) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc216) + %1 = ub.poison : tensor<128xf32> loc(#loc216) + %2 = ub.poison : tensor<128xf32> loc(#loc216) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc216) + } loc(#loc148) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":85:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":86:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":87:49) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":89:9) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":90:9) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":91:12) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":92:10) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":93:13) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":97:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":98:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":99:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":103:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":104:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":105:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:24) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:36) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:25) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":109:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":109:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":109:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":111:12) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":112:12) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":113:12) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":120:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":121:16) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":123:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":124:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":129:27) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":130:22) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":131:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":134:19) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":134:50) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":135:19) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":136:19) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":138:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":138:46) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":138:33) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":141:38) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":141:50) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:51) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:85) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:74) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:46) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:76) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:97) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:64) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:33) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":145:26) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":146:101) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":151:26) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:37) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:42) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:28) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:45) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:92) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:102) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:65) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":167:31) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":167:48) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":172:41) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":181:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:27) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:41) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:51) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:32) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:96) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:106) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:69) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":186:41) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":186:28) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":193:35) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":193:52) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":198:45) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:26) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:34) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:16) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":209:27) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":210:27) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":211:19) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":212:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":212:45) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:20) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:38) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:30) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:25) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:21) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:40) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:33) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:57) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":217:49) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:53) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:49) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:67) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:62) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:83) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:75) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:109) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:26) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:31) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:32) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:23) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:40) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:33) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:20) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":225:29) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":229:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:27) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:38) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:20) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:56) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:67) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:49) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":294:23) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":294:15) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":287:4) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":499:16) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":502:40) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":518:16) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":545:63) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":548:26) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":549:21) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":549:8) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":552:11) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":552:4) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":342:32) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":345:26) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":346:48) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":346:35) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":347:107) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":349:17) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":351:19) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":353:14) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":358:36) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":359:36) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":370:35) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":373:23) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":374:23) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:33) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:23) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":377:22) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":378:23) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":379:23) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":380:23) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":381:23) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":382:23) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":383:35) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":384:24) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":385:24) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":386:32) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":387:25) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":388:92) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":389:92) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":390:25) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":391:24) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":392:24) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":393:39) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":394:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":395:24) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":396:24) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":397:23) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":398:25) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":399:25) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":400:92) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":401:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":402:24) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":403:24) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":404:39) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":405:25) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":406:24) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":407:24) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":414:69) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":417:27) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:51) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:27) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":423:35) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":424:51) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:31) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:25) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:51) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:39) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:21) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:16) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:34) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:24) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:22) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:16) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":438:26) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":439:107) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:22) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:44) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":445:11) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":445:4) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":257:11) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":257:4) +#loc221 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc223 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc224 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc226 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc227 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc228 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc232 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":247:33) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:38) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:24) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:109) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:113) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:39) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:55) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:25) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:30) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:35) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:60) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:34) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:48) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:63) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:29) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:47) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:61) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:42) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":253:11) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":253:4) +#loc269 = loc("ZQ"(#loc4)) +#loc270 = loc("HQ"(#loc5)) +#loc271 = loc("Q_LEN"(#loc6)) +#loc272 = loc("ZKV"(#loc7)) +#loc273 = loc("KV_LEN"(#loc8)) +#loc274 = loc("q_start"(#loc9)) +#loc275 = loc("off_zq"(#loc10)) +#loc276 = loc("off_hq"(#loc11)) +#loc277 = loc("off_zkv"(#loc12)) +#loc278 = loc("off_hkv"(#loc13)) +#loc279 = loc("off_g"(#loc14)) +#loc280 = loc("q_offset"(#loc15)) +#loc281 = loc("q_offset"(#loc16)) +#loc282 = loc("q_offset"(#loc17)) +#loc283 = loc("k_offset"(#loc18)) +#loc284 = loc("k_offset"(#loc19)) +#loc285 = loc("k_offset"(#loc20)) +#loc286 = loc("v_offset"(#loc21)) +#loc287 = loc("v_offset"(#loc22)) +#loc288 = loc("v_offset"(#loc23)) +#loc289 = loc("Q"(#loc24)) +#loc290 = loc("K"(#loc25)) +#loc291 = loc("V"(#loc26)) +#loc292 = loc("SPARSE_Z"(#loc27)) +#loc293 = loc("SPARSE_HQ"(#loc28)) +#loc294 = loc("sparse_idx_z"(#loc29)) +#loc295 = loc("sparse_idx_hq"(#loc30)) +#loc296 = loc("stride_kv_num_blks_h"(#loc31)) +#loc297 = loc("stride_kv_idx_h"(#loc32)) +#loc298 = loc("stride_kv_idx_m"(#loc33)) +#loc299 = loc("m_i"(#loc34)) +#loc300 = loc("m_i"(#loc35)) +#loc301 = loc("l_i"(#loc36)) +#loc302 = loc("acc"(#loc37)) +#loc303 = loc("offs_m"(#loc38)) +#loc304 = loc("offs_m"(#loc39)) +#loc305 = loc("offs_m"(#loc40)) +#loc306 = loc("sparse_hz_offset"(#loc41)) +#loc307 = loc("sparse_hz_offset"(#loc42)) +#loc308 = loc("sparse_kv_num_blks_offset"(#loc43)) +#loc309 = loc("sparse_kv_num_blks_offset"(#loc44)) +#loc310 = loc("sparse_kv_num_blks_offset"(#loc45)) +#loc311 = loc("sparse_kv_idx_offset"(#loc46)) +#loc312 = loc("sparse_kv_idx_offset"(#loc47)) +#loc313 = loc("sparse_kv_idx_offset"(#loc48)) +#loc314 = loc("sparse_kv_idx_offset"(#loc49)) +#loc315 = loc("offs_m"(#loc50)) +#loc316 = loc("offs_m"(#loc51)) +#loc317 = loc("offs_m"(#loc52)) +#loc318 = loc("offs_k"(#loc53)) +#loc319 = loc("q"(#loc54)) +#loc320 = loc("kv_indices"(#loc55)) +#loc321 = loc("kv_start"(#loc56)) +#loc322 = loc("kv_start"(#loc57)) +#loc323 = loc("kv_num_blocks"(#loc58)) +#loc324 = loc("kv_num_blocks"(#loc59)) +#loc325 = loc("block_n_end"(#loc60)) +#loc326 = loc("block_n_end"(#loc61)) +#loc327 = loc("block_n_end"(#loc62)) +#loc328 = loc("block_n_end"(#loc63)) +#loc329 = loc("offs_n"(#loc64)) +#loc330 = loc("offs_n"(#loc65)) +#loc331 = loc("kv_indices"(#loc69)) +#loc332 = loc("kv_start"(#loc70)) +#loc333 = loc("kv_start"(#loc71)) +#loc334 = loc("kv_num_blocks"(#loc72)) +#loc335 = loc("kv_num_blocks"(#loc73)) +#loc336 = loc("block_n_end"(#loc74)) +#loc337 = loc("block_n_end"(#loc75)) +#loc338 = loc("block_n_end"(#loc76)) +#loc339 = loc("block_n_end"(#loc77)) +#loc340 = loc("offs_n"(#loc78)) +#loc341 = loc("offs_n"(#loc79)) +#loc342 = loc("l_i"(#loc83)) +#loc343 = loc("l_i"(#loc84)) +#loc344 = loc("acc"(#loc85)) +#loc345 = loc("acc"(#loc86)) +#loc346 = loc("idx_zq"(#loc87)) +#loc347 = loc("idx_hq"(#loc88)) +#loc348 = loc("idx_m"(#loc89)) +#loc349 = loc("idx_d"(#loc90)) +#loc350 = loc("idx_d"(#loc91)) +#loc351 = loc("mask"(#loc92)) +#loc352 = loc("mask"(#loc93)) +#loc353 = loc("mask"(#loc94)) +#loc354 = loc("xindex"(#loc95)) +#loc355 = loc("xindex"(#loc96)) +#loc356 = loc("xindex"(#loc97)) +#loc357 = loc("xindex"(#loc98)) +#loc358 = loc("xindex"(#loc99)) +#loc359 = loc("xindex"(#loc100)) +#loc360 = loc("off_hz"(#loc109)) +#loc361 = loc("off_hz"(#loc110)) +#loc362 = loc("l_ptrs"(#loc111)) +#loc363 = loc("l_ptrs"(#loc112)) +#loc364 = loc("l_ptrs"(#loc113)) +#loc365 = loc("lse"(#loc114)) +#loc366 = loc("lse"(#loc115)) +#loc373 = loc("ptr"(#loc123)) +#loc374 = loc("ptr"(#loc124)) +#loc375 = loc("ptr"(#loc125)) +#loc376 = loc("ptr"(#loc126)) +#loc377 = loc("ptr"(#loc127)) +#loc378 = loc("ptr"(#loc128)) +#loc411 = loc("kv_offset"(#loc139)) +#loc412 = loc("acc"(#loc140)) +#loc413 = loc("offset"(#loc142)) +#loc414 = loc("offs_n"(#loc143)) +#loc415 = loc("kv_offset"(#loc144)) +#loc445 = loc("kv_base_offset"(#loc149)) +#loc446 = loc("offs_k"(#loc150)) +#loc447 = loc("offs_n_load"(#loc151)) +#loc448 = loc("offs_n_load"(#loc152)) +#loc449 = loc("k"(#loc153)) +#loc450 = loc("k"(#loc154)) +#loc451 = loc("qk"(#loc155)) +#loc452 = loc("qk"(#loc156)) +#loc453 = loc("m"(#loc157)) +#loc454 = loc("n"(#loc158)) +#loc455 = loc("tmp1"(#loc159)) +#loc456 = loc("tmp4"(#loc160)) +#loc457 = loc("tmp5"(#loc161)) +#loc458 = loc("tmp7"(#loc162)) +#loc459 = loc("tmp7"(#loc163)) +#loc460 = loc("tmp8"(#loc164)) +#loc461 = loc("tmp9"(#loc165)) +#loc462 = loc("tmp10"(#loc166)) +#loc463 = loc("tmp11"(#loc167)) +#loc464 = loc("tmp12"(#loc168)) +#loc465 = loc("tmp13"(#loc169)) +#loc466 = loc("tmp14"(#loc170)) +#loc467 = loc("tmp15"(#loc171)) +#loc468 = loc("tmp16"(#loc172)) +#loc469 = loc("tmp17"(#loc173)) +#loc470 = loc("tmp18"(#loc174)) +#loc471 = loc("tmp19"(#loc175)) +#loc472 = loc("tmp20"(#loc176)) +#loc473 = loc("tmp21"(#loc177)) +#loc474 = loc("tmp22"(#loc178)) +#loc475 = loc("tmp23"(#loc179)) +#loc476 = loc("tmp24"(#loc180)) +#loc477 = loc("tmp25"(#loc181)) +#loc478 = loc("tmp26"(#loc182)) +#loc479 = loc("tmp27"(#loc183)) +#loc480 = loc("tmp28"(#loc184)) +#loc481 = loc("tmp29"(#loc185)) +#loc482 = loc("tmp30"(#loc186)) +#loc483 = loc("tmp31"(#loc187)) +#loc484 = loc("tmp32"(#loc188)) +#loc485 = loc("tmp33"(#loc189)) +#loc486 = loc("tmp34"(#loc190)) +#loc487 = loc("tmp35"(#loc191)) +#loc488 = loc("tmp36"(#loc192)) +#loc489 = loc("tmp37"(#loc193)) +#loc490 = loc("tmp38"(#loc194)) +#loc491 = loc("post_mod_scores"(#loc195)) +#loc492 = loc("post_mod_scores"(#loc196)) +#loc493 = loc("m_ij"(#loc197)) +#loc494 = loc("m_ij"(#loc198)) +#loc495 = loc("masked_out_rows"(#loc199)) +#loc496 = loc("m_ij_masked"(#loc200)) +#loc497 = loc("alpha"(#loc201)) +#loc498 = loc("alpha"(#loc202)) +#loc499 = loc("p"(#loc203)) +#loc500 = loc("p"(#loc204)) +#loc501 = loc("p"(#loc205)) +#loc502 = loc("l_i"(#loc206)) +#loc503 = loc("l_i"(#loc207)) +#loc504 = loc("l_i"(#loc208)) +#loc505 = loc("acc"(#loc209)) +#loc506 = loc("acc"(#loc210)) +#loc507 = loc("offs_v"(#loc211)) +#loc508 = loc("v"(#loc212)) +#loc509 = loc("acc"(#loc213)) +#loc510 = loc("acc"(#loc214)) +#loc521 = loc("cur_block_idx"(#loc238)) +#loc522 = loc("cur_block"(#loc239)) +#loc523 = loc("cur_block"(#loc240)) +#loc524 = loc("next_block"(#loc241)) +#loc525 = loc("next_block"(#loc242)) +#loc526 = loc("next_block"(#loc243)) +#loc527 = loc("next_block"(#loc244)) +#loc528 = loc("next_block"(#loc245)) +#loc529 = loc("needs_jump"(#loc246)) +#loc530 = loc("needs_jump"(#loc247)) +#loc531 = loc("needs_jump"(#loc248)) +#loc532 = loc("jump_to_block"(#loc249)) +#loc533 = loc("jump_to_block"(#loc250)) +#loc534 = loc("jump_to_block"(#loc251)) +#loc535 = loc("offset"(#loc252)) +#loc536 = loc("offset"(#loc253)) +#loc537 = loc("offset"(#loc254)) +#loc538 = loc("offset"(#loc255)) +#loc539 = loc("l_i"(#loc412)) +#loc540 = loc("m_i"(#loc539)) +#loc541 = loc("offs_n"(#loc540)) +#loc542 = loc("kv_offset"(#loc541)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..42cad3382f61f9eee3659b4bd9a6f1bec907c850 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttgir @@ -0,0 +1,897 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":18:0) +#loc1 = loc(unknown) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":518:16) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":172:41) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:51) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:34) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":198:45) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#smem = #ttg.shared_memory +#loc151 = loc("arg_Q"(#loc)) +#loc152 = loc("arg_K"(#loc)) +#loc153 = loc("arg_V"(#loc)) +#loc154 = loc("arg_LSE"(#loc)) +#loc155 = loc("arg_MAX"(#loc)) +#loc156 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc157 = loc("arg_KV_IDX"(#loc)) +#loc158 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc159 = loc("arg_FULL_KV_IDX"(#loc)) +#loc160 = loc("in_ptr9"(#loc)) +#loc161 = loc("out_ptr0"(#loc)) +#loc200 = loc(callsite(#loc42 at #loc43)) +#loc239 = loc("m_ij"(#loc83)) +#loc249 = loc("l_i"(#loc95)) +#loc282 = loc(callsite(#loc42 at #loc129)) +#loc342 = loc(callsite(#loc239 at #loc200)) +#loc352 = loc(callsite(#loc249 at #loc200)) +#loc369 = loc(callsite(#loc239 at #loc282)) +#loc379 = loc(callsite(#loc249 at #loc282)) +#loc397 = loc(callsite(#loc1 at #loc342)) +#loc399 = loc(callsite(#loc1 at #loc352)) +#loc426 = loc(callsite(#loc1 at #loc369)) +#loc428 = loc(callsite(#loc1 at #loc379)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_4 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_9 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_13 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_14 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %q_start = tt.get_program_id x : i32 loc(#loc162) + %off_zq = tt.get_program_id y : i32 loc(#loc163) + %off_hq = tt.get_program_id z : i32 loc(#loc164) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc165) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc166) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc167) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc168) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc169) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc170) + %k_offset_17 = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc171) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc172) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc173) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc174) + %V = tt.addptr %arg_V, %k_offset_18 : !tt.ptr, i32 loc(#loc175) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc176) + %sparse_kv_num_blks_offset_19 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc177) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc178) + %sparse_kv_idx_offset_20 = arith.muli %q_start, %c16_i32 : i32 loc(#loc179) + %sparse_kv_idx_offset_21 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_20 : i32 loc(#loc180) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc181) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc182) + %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc182) + %offs_m_24 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc182) + %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc183) + %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc183) + %offs_m_27 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc183) + %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc183) + %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc183) + %offs_m_30 = arith.addi %offs_m_27, %offs_m_24 : tensor<128xi32, #blocked1> loc(#loc183) + %ptr = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc297) + %ptr_31 = tt.expand_dims %offs_m_29 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc297) + %ptr_32 = arith.muli %ptr, %cst_5 : tensor<128x1xi32, #blocked> loc(#loc298) + %ptr_33 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc299) + %ptr_34 = tt.addptr %ptr_33, %ptr_32 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc299) + %ptr_35 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc300) + %ptr_36 = tt.expand_dims %ptr_35 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc300) + %ptr_37 = tt.broadcast %ptr_34 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc301) + %ptr_38 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc301) + %ptr_39 = tt.addptr %ptr_37, %ptr_38 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc301) + %q = tt.load %ptr_39 : tensor<128x128x!tt.ptr, #blocked> loc(#loc302) + %q_40 = ttg.local_alloc %q : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc302) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc190) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc191) + %kv_start_41 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc192) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc193) + %kv_num_blocks_42 = tt.load %kv_num_blocks : !tt.ptr loc(#loc194) + %block_n_end = arith.muli %kv_num_blocks_42, %c2_i32 : i32 loc(#loc195) + %block_n_end_43 = arith.minsi %block_n_end, %c32_i32 : i32 loc(#loc196) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc197) + %offs_n_44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc197) + %offs_n_45 = tt.splat %kv_start_41 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198) + %offs_n_46 = arith.addi %offs_n_45, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc198) + %0 = tt.expand_dims %offs_n_46 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc40) + %ptr_47 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc387) + %ptr_48 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc388) + %tmp4 = tt.broadcast %ptr_31 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc304) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc305) + %kv_offset = arith.cmpi sgt, %block_n_end_43, %c0_i32 : i32 loc(#loc452) + %tmp7_49 = tt.load %tmp7, %kv_offset : !tt.ptr loc(#loc307) + %tmp8 = tt.splat %tmp7_49 : i64 -> tensor<1x64xi64, #mma> loc(#loc308) + %tmp9 = arith.extsi %ptr_31 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc309) + %tmp10 = tt.splat %tmp7_49 : i64 -> tensor<128x1xi64, #mma> loc(#loc310) + %tmp10_50 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc310) + %tmp11 = tt.broadcast %tmp10_50 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311) + %ptr_51 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc390) + %k = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc391) + %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc392) + %offs_n_load = tt.splat %kv_start_41 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_52 = arith.addi %offs_n_load, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_53 = tt.expand_dims %offs_n_load_52 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_54 = arith.muli %ptr_53, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_55 = tt.addptr %ptr_47, %ptr_54 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_56 = tt.broadcast %ptr_55 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_57 = tt.addptr %ptr_56, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_58 = ttg.memdesc_index %k[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_59 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_60 = ttg.async_copy_global_to_local %ptr_57, %k_58 mask %kv_offset_59 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_61 = ttg.async_commit_group tokens %k_60 loc(#loc391) + %ptr_62 = tt.addptr %ptr_51, %ptr_54 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_63 = tt.broadcast %ptr_62 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_64 = tt.addptr %ptr_63, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_65 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_66 = ttg.async_copy_global_to_local %ptr_64, %v_65 mask %kv_offset_59 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_67 = ttg.async_commit_group tokens %v_66 loc(#loc392) + %kv_offset_68 = arith.cmpi sgt, %block_n_end_43, %c1_i32 : i32 loc(#loc452) + %kv_base_offset = arith.addi %kv_start_41, %c64_i32 : i32 loc(#loc314) + %offs_n_load_69 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_70 = arith.addi %offs_n_load_69, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_72 = arith.muli %ptr_71, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_73 = tt.addptr %ptr_47, %ptr_72 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_74 = tt.broadcast %ptr_73 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_75 = tt.addptr %ptr_74, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_76 = ttg.memdesc_index %k[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_77 = tt.splat %kv_offset_68 : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_78 = ttg.async_copy_global_to_local %ptr_75, %k_76 mask %kv_offset_77 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_79 = ttg.async_commit_group tokens %k_78 loc(#loc391) + %ptr_80 = tt.addptr %ptr_51, %ptr_72 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_81 = tt.broadcast %ptr_80 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_82 = tt.addptr %ptr_81, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_83 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_84 = ttg.async_copy_global_to_local %ptr_82, %v_83 mask %kv_offset_77 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_85 = ttg.async_commit_group tokens %v_84 loc(#loc392) + ttng.fence_async_shared {bCluster = false} loc(#loc315) + %kv_offset_86:12 = scf.for %kv_offset_151 = %c0_i32 to %block_n_end_43 step %c1_i32 iter_args(%acc_152 = %cst_10, %arg13 = %cst_8, %arg14 = %cst_14, %arg15 = %c64_i32, %arg16 = %0, %arg17 = %c1_i32, %arg18 = %c-1_i32, %k_153 = %k_61, %k_154 = %k_79, %v_155 = %v_67, %v_156 = %v_85, %arg23 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_157 = arith.subi %block_n_end_43, %c2_i32 : i32 loc(#loc452) + %kv_offset_158 = arith.cmpi slt, %kv_offset_151, %kv_offset_157 : i32 loc(#loc452) + %kv_offset_159 = arith.subi %block_n_end_43, %c1_i32 : i32 loc(#loc452) + %kv_offset_160 = arith.cmpi slt, %kv_offset_151, %kv_offset_159 : i32 loc(#loc452) + %kv_offset_161 = arith.addi %arg18, %c1_i32 : i32 loc(#loc452) + %kv_offset_162 = arith.cmpi sge, %kv_offset_161, %c3_i32 : i32 loc(#loc452) + %kv_offset_163 = arith.select %kv_offset_162, %c0_i32, %kv_offset_161 : i32 loc(#loc452) + %k_164 = ttg.async_wait %k_153, %v_155 {num = 2 : i32} loc(#loc391) + %k_165 = ttg.memdesc_index %k[%kv_offset_163] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_166 = ttg.memdesc_trans %k_165 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc316) + %qk = ttng.warp_group_dot %q_40, %k_166, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc315) + %qk_167:4 = ttng.warp_group_dot_wait %qk, %q_40, %k_166, %acc_152 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc315) + %qk_168 = arith.mulf %qk_167#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc317) + %tmp4_169 = tt.broadcast %arg16 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc304) + %tmp4_170 = arith.cmpi sge, %tmp4, %tmp4_169 : tensor<128x64xi32, #mma> loc(#loc304) + %tmp5 = arith.extsi %arg16 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc318) + %tmp8_171 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc308) + %tmp11_172 = tt.broadcast %tmp8_171 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc311) + %tmp11_173 = arith.andi %tmp11_172, %tmp11 : tensor<128x64xi1, #mma> loc(#loc311) + %tmp12 = arith.andi %tmp4_170, %tmp11_173 : tensor<128x64xi1, #mma> loc(#loc319) + %tmp15 = arith.cmpi sge, %arg16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc320) + %tmp16 = arith.remsi %arg16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc321) + %tmp18 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc322) + %tmp19 = arith.cmpi slt, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc323) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc324) + %tmp23 = arith.addi %tmp16, %cst_2 : tensor<1x64xi32, #mma> loc(#loc325) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc326) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc327) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc328) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc329) + %tmp28 = arith.subi %tmp4_169, %tmp4 : tensor<128x64xi32, #mma> loc(#loc330) + %tmp29 = arith.remsi %tmp28, %cst_1 : tensor<128x64xi32, #mma> loc(#loc331) + %tmp30 = arith.cmpi ne, %tmp29, %cst_0 : tensor<128x64xi32, #mma> loc(#loc332) + %tmp31 = arith.cmpi slt, %tmp29, %cst_0 : tensor<128x64xi32, #mma> loc(#loc333) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc334) + %tmp34 = arith.addi %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc335) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc336) + %tmp36 = arith.cmpi eq, %tmp35, %cst_0 : tensor<128x64xi32, #mma> loc(#loc337) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc338) + %tmp37_174 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc338) + %tmp38 = arith.ori %tmp12, %tmp37_174 : tensor<128x64xi1, #mma> loc(#loc339) + %post_mod_scores = arith.select %tmp38, %qk_168, %cst_12 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc340) + %post_mod_scores_175 = arith.mulf %post_mod_scores, %cst_13 : tensor<128x64xf32, #mma> loc(#loc341) + %m_ij = "tt.reduce"(%post_mod_scores_175) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_230: f32 loc(callsite(#loc1 at #loc342)), %m_ij_231: f32 loc(callsite(#loc1 at #loc342))): + %m_ij_232 = arith.maxnumf %m_ij_230, %m_ij_231 : f32 loc(#loc447) + tt.reduce.return %m_ij_232 : f32 loc(#loc396) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc396) + %m_ij_176 = arith.maxnumf %arg14, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc343) + %masked_out_rows = arith.cmpf oeq, %m_ij_176, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc344) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_176 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc345) + %alpha = arith.subf %arg14, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc346) + %alpha_177 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc347) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc348) + %p_178 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc349) + %p_179 = arith.subf %post_mod_scores_175, %p_178 : tensor<128x64xf32, #mma> loc(#loc349) + %p_180 = math.exp2 %p_179 : tensor<128x64xf32, #mma> loc(#loc350) + %l_i_181 = arith.mulf %arg13, %alpha_177 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc351) + %l_i_182 = "tt.reduce"(%p_180) <{axis = 1 : i32}> ({ + ^bb0(%l_i_230: f32 loc(callsite(#loc1 at #loc352)), %l_i_231: f32 loc(callsite(#loc1 at #loc352))): + %l_i_232 = arith.addf %l_i_230, %l_i_231 : f32 loc(#loc448) + tt.reduce.return %l_i_232 : f32 loc(#loc398) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc398) + %l_i_183 = arith.addf %l_i_181, %l_i_182 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc353) + %acc_184 = tt.expand_dims %alpha_177 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc354) + %acc_185 = ttg.convert_layout %acc_184 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc355) + %acc_186 = tt.broadcast %acc_185 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc355) + %acc_187 = arith.mulf %qk_167#3, %acc_186 : tensor<128x128xf32, #mma1> loc(#loc355) + %v_188 = ttg.memdesc_index %v[%kv_offset_163] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %acc_189 = arith.truncf %p_180 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc356) + %acc_190 = ttg.convert_layout %acc_189 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc356) + %acc_191 = ttng.warp_group_dot %acc_190, %v_188, %acc_187 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc357) + %offs_n_192 = tt.splat %arg23 : i32 -> tensor<1x64xi32, #mma> loc(#loc358) + %offs_n_193 = arith.addi %arg16, %offs_n_192 : tensor<1x64xi32, #mma> loc(#loc358) + %kv_offset_194 = arith.addi %kv_offset_151, %c1_i32 : i32 loc(#loc452) + %cur_block_idx = arith.divsi %kv_offset_194, %c2_i32 : i32 loc(#loc400) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc401) + %cur_block_195 = tt.load %cur_block, %kv_offset_160 evictionPolicy = evict_last : !tt.ptr loc(#loc402) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc403) + %next_block_196 = arith.cmpi slt, %next_block, %kv_num_blocks_42 : i32 loc(#loc404) + %next_block_197 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc405) + %kv_offset_198 = arith.andi %kv_offset_160, %next_block_196 : i1 loc(#loc452) + %next_block_199 = tt.load %next_block_197, %kv_offset_198 evictionPolicy = evict_last : !tt.ptr loc(#loc406) + %needs_jump = arith.addi %kv_offset_151, %c2_i32 : i32 loc(#loc407) + %needs_jump_200 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc408) + %needs_jump_201 = arith.cmpi eq, %needs_jump_200, %c0_i32 : i32 loc(#loc409) + %jump_to_block = arith.subi %next_block_199, %cur_block_195 : i32 loc(#loc410) + %jump_to_block_202 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc411) + %jump_to_block_203 = arith.subi %jump_to_block_202, %c64_i32 : i32 loc(#loc412) + %offset = arith.extui %needs_jump_201 : i1 to i32 loc(#loc413) + %offset_204 = arith.muli %jump_to_block_203, %offset : i32 loc(#loc413) + %offset_205 = arith.subi %c1_i32, %offset : i32 loc(#loc414) + %offset_206 = arith.muli %offset_205, %c64_i32 : i32 loc(#loc415) + %offset_207 = arith.addi %offset_204, %offset_206 : i32 loc(#loc416) + %kv_offset_208 = arith.addi %arg15, %offset_207 : i32 loc(#loc360) + %kv_offset_209 = arith.addi %arg17, %c1_i32 : i32 loc(#loc452) + %kv_offset_210 = arith.cmpi sge, %kv_offset_209, %c3_i32 : i32 loc(#loc452) + %kv_offset_211 = arith.select %kv_offset_210, %c0_i32, %kv_offset_209 : i32 loc(#loc452) + %kv_base_offset_212 = arith.addi %kv_start_41, %kv_offset_208 : i32 loc(#loc314) + %offs_n_load_213 = tt.splat %kv_base_offset_212 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %offs_n_load_214 = arith.addi %offs_n_load_213, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc313) + %ptr_215 = tt.expand_dims %offs_n_load_214 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc393) + %ptr_216 = arith.muli %ptr_215, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc394) + %ptr_217 = tt.addptr %ptr_47, %ptr_216 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc387) + %ptr_218 = tt.broadcast %ptr_217 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc388) + %ptr_219 = tt.addptr %ptr_218, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc388) + %k_220 = ttg.memdesc_index %k[%kv_offset_211] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %kv_offset_221 = tt.splat %kv_offset_158 : i1 -> tensor<64x128xi1, #blocked> loc(#loc452) + %k_222 = ttg.async_copy_global_to_local %ptr_219, %k_220 mask %kv_offset_221 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc391) + %k_223 = ttg.async_commit_group tokens %k_222 loc(#loc391) + %ptr_224 = tt.addptr %ptr_51, %ptr_216 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc390) + %ptr_225 = tt.broadcast %ptr_224 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc395) + %ptr_226 = tt.addptr %ptr_225, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc395) + %v_227 = ttg.memdesc_index %v[%kv_offset_211] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_228 = ttg.async_copy_global_to_local %ptr_226, %v_227 mask %kv_offset_221 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc392) + %v_229 = ttg.async_commit_group tokens %v_228 loc(#loc392) + scf.yield %acc_191, %l_i_183, %m_ij_176, %kv_offset_208, %offs_n_193, %kv_offset_211, %kv_offset_163, %k_154, %k_223, %v_156, %v_229, %offset_207 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc452) + } loc(#loc452) + %kv_offset_87 = ttng.warp_group_dot_wait %kv_offset_86#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc452) + %kv_offset_88 = ttg.async_wait {num = 0 : i32} loc(#loc452) + ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc452) + ttg.local_dealloc %k : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc452) + %kv_indices_89 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc275) + %kv_start_90 = tt.load %kv_indices_89 : !tt.ptr loc(#loc276) + %kv_start_91 = arith.muli %kv_start_90, %c128_i32 : i32 loc(#loc277) + %kv_num_blocks_92 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc278) + %kv_num_blocks_93 = tt.load %kv_num_blocks_92 : !tt.ptr loc(#loc279) + %block_n_end_94 = arith.muli %kv_num_blocks_93, %c2_i32 : i32 loc(#loc280) + %block_n_end_95 = arith.minsi %block_n_end_94, %c32_i32 : i32 loc(#loc281) + %k_96 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc417) + %v_97 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc418) + %kv_offset_98 = arith.cmpi sgt, %block_n_end_95, %c0_i32 : i32 loc(#loc453) + %offs_n_load_99 = tt.splat %kv_start_91 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_100 = arith.addi %offs_n_load_99, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_101 = tt.expand_dims %offs_n_load_100 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_102 = arith.muli %ptr_101, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_103 = tt.addptr %ptr_47, %ptr_102 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_104 = tt.broadcast %ptr_103 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_105 = tt.addptr %ptr_104, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_106 = ttg.memdesc_index %k_96[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_107 = tt.splat %kv_offset_98 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_108 = ttg.async_copy_global_to_local %ptr_105, %k_106 mask %kv_offset_107 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_109 = ttg.async_commit_group tokens %k_108 loc(#loc417) + %ptr_110 = tt.addptr %ptr_51, %ptr_102 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_111 = tt.broadcast %ptr_110 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_112 = tt.addptr %ptr_111, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_113 = ttg.memdesc_index %v_97[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_114 = ttg.async_copy_global_to_local %ptr_112, %v_113 mask %kv_offset_107 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_115 = ttg.async_commit_group tokens %v_114 loc(#loc418) + %kv_offset_116 = arith.cmpi sgt, %block_n_end_95, %c1_i32 : i32 loc(#loc453) + %kv_base_offset_117 = arith.addi %kv_start_91, %c64_i32 : i32 loc(#loc364) + %offs_n_load_118 = tt.splat %kv_base_offset_117 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_119 = arith.addi %offs_n_load_118, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_120 = tt.expand_dims %offs_n_load_119 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_121 = arith.muli %ptr_120, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_122 = tt.addptr %ptr_47, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_123 = tt.broadcast %ptr_122 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_124 = tt.addptr %ptr_123, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_125 = ttg.memdesc_index %k_96[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_126 = tt.splat %kv_offset_116 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_127 = ttg.async_copy_global_to_local %ptr_124, %k_125 mask %kv_offset_126 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_128 = ttg.async_commit_group tokens %k_127 loc(#loc417) + %ptr_129 = tt.addptr %ptr_51, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_130 = tt.broadcast %ptr_129 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_131 = tt.addptr %ptr_130, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_132 = ttg.memdesc_index %v_97[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_133 = ttg.async_copy_global_to_local %ptr_131, %v_132 mask %kv_offset_126 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_134 = ttg.async_commit_group tokens %v_133 loc(#loc418) + ttng.fence_async_shared {bCluster = false} loc(#loc365) + %kv_offset_135:10 = scf.for %kv_offset_151 = %c0_i32 to %block_n_end_95 step %c1_i32 iter_args(%kv_offset_152 = %kv_offset_87, %kv_offset_153 = %kv_offset_86#1, %kv_offset_154 = %kv_offset_86#2, %arg15 = %c64_i32, %arg16 = %c1_i32, %arg17 = %c-1_i32, %k_155 = %k_109, %k_156 = %k_128, %v_157 = %v_115, %v_158 = %v_134) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %kv_offset_159 = arith.subi %block_n_end_95, %c2_i32 : i32 loc(#loc453) + %kv_offset_160 = arith.cmpi slt, %kv_offset_151, %kv_offset_159 : i32 loc(#loc453) + %kv_offset_161 = arith.subi %block_n_end_95, %c1_i32 : i32 loc(#loc453) + %kv_offset_162 = arith.cmpi slt, %kv_offset_151, %kv_offset_161 : i32 loc(#loc453) + %kv_offset_163 = arith.addi %arg17, %c1_i32 : i32 loc(#loc453) + %kv_offset_164 = arith.cmpi sge, %kv_offset_163, %c3_i32 : i32 loc(#loc453) + %kv_offset_165 = arith.select %kv_offset_164, %c0_i32, %kv_offset_163 : i32 loc(#loc453) + %k_166 = ttg.async_wait %k_155, %v_157 {num = 2 : i32} loc(#loc417) + %k_167 = ttg.memdesc_index %k_96[%kv_offset_165] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_168 = ttg.memdesc_trans %k_167 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc366) + %qk = ttng.warp_group_dot %q_40, %k_168, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc365) + %qk_169:4 = ttng.warp_group_dot_wait %qk, %q_40, %k_168, %kv_offset_152 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc365) + %qk_170 = arith.mulf %qk_169#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc367) + %post_mod_scores = arith.mulf %qk_170, %cst_13 : tensor<128x64xf32, #mma> loc(#loc368) + %m_ij = "tt.reduce"(%post_mod_scores) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_223: f32 loc(callsite(#loc1 at #loc369)), %m_ij_224: f32 loc(callsite(#loc1 at #loc369))): + %m_ij_225 = arith.maxnumf %m_ij_223, %m_ij_224 : f32 loc(#loc449) + tt.reduce.return %m_ij_225 : f32 loc(#loc425) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc425) + %m_ij_171 = arith.maxnumf %kv_offset_154, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc370) + %masked_out_rows = arith.cmpf oeq, %m_ij_171, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc371) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_171 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc372) + %alpha = arith.subf %kv_offset_154, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc373) + %alpha_172 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc374) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc375) + %p_173 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc376) + %p_174 = arith.subf %post_mod_scores, %p_173 : tensor<128x64xf32, #mma> loc(#loc376) + %p_175 = math.exp2 %p_174 : tensor<128x64xf32, #mma> loc(#loc377) + %l_i_176 = arith.mulf %kv_offset_153, %alpha_172 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378) + %l_i_177 = "tt.reduce"(%p_175) <{axis = 1 : i32}> ({ + ^bb0(%l_i_223: f32 loc(callsite(#loc1 at #loc379)), %l_i_224: f32 loc(callsite(#loc1 at #loc379))): + %l_i_225 = arith.addf %l_i_223, %l_i_224 : f32 loc(#loc450) + tt.reduce.return %l_i_225 : f32 loc(#loc427) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc427) + %l_i_178 = arith.addf %l_i_176, %l_i_177 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc380) + %acc_179 = tt.expand_dims %alpha_172 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc381) + %acc_180 = ttg.convert_layout %acc_179 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc382) + %acc_181 = tt.broadcast %acc_180 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc382) + %acc_182 = arith.mulf %qk_169#3, %acc_181 : tensor<128x128xf32, #mma1> loc(#loc382) + %v_183 = ttg.memdesc_index %v_97[%kv_offset_165] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %acc_184 = arith.truncf %p_175 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc383) + %acc_185 = ttg.convert_layout %acc_184 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc383) + %acc_186 = ttng.warp_group_dot %acc_185, %v_183, %acc_182 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc384) + %kv_offset_187 = arith.addi %kv_offset_151, %c1_i32 : i32 loc(#loc453) + %cur_block_idx = arith.divsi %kv_offset_187, %c2_i32 : i32 loc(#loc429) + %cur_block = tt.addptr %kv_indices_89, %cur_block_idx : !tt.ptr, i32 loc(#loc430) + %cur_block_188 = tt.load %cur_block, %kv_offset_162 evictionPolicy = evict_last : !tt.ptr loc(#loc431) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc432) + %next_block_189 = arith.cmpi slt, %next_block, %kv_num_blocks_93 : i32 loc(#loc433) + %next_block_190 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc434) + %kv_offset_191 = arith.andi %kv_offset_162, %next_block_189 : i1 loc(#loc453) + %next_block_192 = tt.load %next_block_190, %kv_offset_191 evictionPolicy = evict_last : !tt.ptr loc(#loc435) + %needs_jump = arith.addi %kv_offset_151, %c2_i32 : i32 loc(#loc436) + %needs_jump_193 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc437) + %needs_jump_194 = arith.cmpi eq, %needs_jump_193, %c0_i32 : i32 loc(#loc438) + %jump_to_block = arith.subi %next_block_192, %cur_block_188 : i32 loc(#loc439) + %jump_to_block_195 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc440) + %jump_to_block_196 = arith.subi %jump_to_block_195, %c64_i32 : i32 loc(#loc441) + %offset = arith.extui %needs_jump_194 : i1 to i32 loc(#loc442) + %offset_197 = arith.muli %jump_to_block_196, %offset : i32 loc(#loc442) + %offset_198 = arith.subi %c1_i32, %offset : i32 loc(#loc443) + %offset_199 = arith.muli %offset_198, %c64_i32 : i32 loc(#loc444) + %offset_200 = arith.addi %offset_197, %offset_199 : i32 loc(#loc445) + %kv_offset_201 = arith.addi %arg15, %offset_200 : i32 loc(#loc386) + %kv_offset_202 = arith.addi %arg16, %c1_i32 : i32 loc(#loc453) + %kv_offset_203 = arith.cmpi sge, %kv_offset_202, %c3_i32 : i32 loc(#loc453) + %kv_offset_204 = arith.select %kv_offset_203, %c0_i32, %kv_offset_202 : i32 loc(#loc453) + %kv_base_offset_205 = arith.addi %kv_start_91, %kv_offset_201 : i32 loc(#loc364) + %offs_n_load_206 = tt.splat %kv_base_offset_205 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %offs_n_load_207 = arith.addi %offs_n_load_206, %offs_n_44 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc363) + %ptr_208 = tt.expand_dims %offs_n_load_207 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc419) + %ptr_209 = arith.muli %ptr_208, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc420) + %ptr_210 = tt.addptr %ptr_47, %ptr_209 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc421) + %ptr_211 = tt.broadcast %ptr_210 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc422) + %ptr_212 = tt.addptr %ptr_211, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc422) + %k_213 = ttg.memdesc_index %k_96[%kv_offset_204] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %kv_offset_214 = tt.splat %kv_offset_160 : i1 -> tensor<64x128xi1, #blocked> loc(#loc453) + %k_215 = ttg.async_copy_global_to_local %ptr_212, %k_213 mask %kv_offset_214 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc417) + %k_216 = ttg.async_commit_group tokens %k_215 loc(#loc417) + %ptr_217 = tt.addptr %ptr_51, %ptr_209 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc423) + %ptr_218 = tt.broadcast %ptr_217 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc424) + %ptr_219 = tt.addptr %ptr_218, %ptr_48 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc424) + %v_220 = ttg.memdesc_index %v_97[%kv_offset_204] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_221 = ttg.async_copy_global_to_local %ptr_219, %v_220 mask %kv_offset_214 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc418) + %v_222 = ttg.async_commit_group tokens %v_221 loc(#loc418) + scf.yield %acc_186, %l_i_178, %m_ij_171, %kv_offset_201, %kv_offset_204, %kv_offset_165, %k_156, %k_216, %v_158, %v_222 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc453) + } loc(#loc453) + %kv_offset_136 = ttng.warp_group_dot_wait %kv_offset_135#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc453) + %kv_offset_137 = ttg.async_wait {num = 0 : i32} loc(#loc453) + ttg.local_dealloc %v_97 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc453) + ttg.local_dealloc %k_96 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc453) + %l_i = arith.cmpf oeq, %kv_offset_135#1, %cst_8 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc283) + %l_i_138 = arith.select %l_i, %cst_9, %kv_offset_135#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc284) + %acc = tt.expand_dims %l_i_138 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc285) + %acc_139 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc286) + %acc_140 = tt.broadcast %acc_139 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc286) + %acc_141 = arith.divf %kv_offset_136, %acc_140 : tensor<128x128xf32, #mma1> loc(#loc286) + %mask = arith.cmpi slt, %ptr, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc287) + %mask_142 = arith.cmpi slt, %ptr_36, %cst_6 : tensor<1x128xi32, #blocked> loc(#loc288) + %mask_143 = tt.broadcast %mask : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289) + %mask_144 = tt.broadcast %mask_142 : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc289) + %mask_145 = arith.andi %mask_143, %mask_144 : tensor<128x128xi1, #blocked> loc(#loc289) + %1 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32, #blocked> loc(#loc137) + %2 = arith.addi %ptr_36, %1 : tensor<1x128xi32, #blocked> loc(#loc137) + %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc138) + %4 = tt.broadcast %ptr_32 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc138) + %5 = arith.addi %3, %4 : tensor<128x128xi32, #blocked> loc(#loc138) + %6 = tt.splat %q_offset : i32 -> tensor<128x128xi32, #blocked> loc(#loc139) + %7 = arith.addi %5, %6 : tensor<128x128xi32, #blocked> loc(#loc139) + %8 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc140) + %9 = tt.addptr %8, %7 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc140) + %10 = arith.truncf %acc_141 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc141) + %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc141) + tt.store %9, %11, %mask_145 : tensor<128x128x!tt.ptr, #blocked> loc(#loc141) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc290) + %off_hz_146 = arith.addi %off_hz, %off_hq : i32 loc(#loc291) + %l_ptrs = arith.muli %off_hz_146, %c2048_i32 : i32 loc(#loc292) + %l_ptrs_147 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc293) + %l_ptrs_148 = tt.splat %l_ptrs_147 : !tt.ptr -> tensor<128x!tt.ptr, #blocked1> loc(#loc294) + %l_ptrs_149 = tt.addptr %l_ptrs_148, %offs_m_30 : tensor<128x!tt.ptr, #blocked1>, tensor<128xi32, #blocked1> loc(#loc294) + %lse = math.log2 %l_i_138 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc295) + %lse_150 = arith.addf %kv_offset_135#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc296) + %12 = ttg.convert_layout %lse_150 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc149) + tt.store %l_ptrs_149, %12 : tensor<128x!tt.ptr, #blocked1> loc(#loc149) + tt.return loc(#loc150) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":97:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":98:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":99:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":103:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":104:24) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:24) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:25) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:47) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":111:12) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":112:12) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":113:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:51) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:74) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:46) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:97) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:64) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:33) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:27) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":146:101) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:38) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:56) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:49) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":294:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":151:26) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:23) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:37) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:45) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:65) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:37) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":167:48) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":347:107) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":373:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:33) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":502:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:23) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":377:22) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":378:23) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":379:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":380:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":439:107) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":346:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":342:32) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":351:19) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":349:17) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":353:14) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":374:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":381:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":384:24) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":385:24) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":387:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":388:92) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":391:24) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":392:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":393:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":394:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":395:24) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":396:24) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":397:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":398:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":399:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":400:92) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":402:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":403:24) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":404:39) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":405:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":406:24) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":407:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":414:69) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":417:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:27) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":423:35) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":424:51) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:31) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:25) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:51) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:39) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:21) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:16) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:24) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:22) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:16) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:22) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:44) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":548:26) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":247:33) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":545:63) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:38) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:24) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:109) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:113) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:55) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:25) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:30) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:35) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:60) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:34) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:48) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:63) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:29) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:47) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:61) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:42) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":549:21) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":181:35) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:27) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:41) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:51) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:32) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:49) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:69) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:26) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:34) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:20) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:16) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:20) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:38) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:30) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:49) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:62) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:75) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:25) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:109) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:26) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:31) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:32) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:23) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:40) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:33) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:20) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":225:29) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":229:4) +#loc162 = loc("q_start"(#loc2)) +#loc163 = loc("off_zq"(#loc3)) +#loc164 = loc("off_hq"(#loc4)) +#loc165 = loc("off_zkv"(#loc5)) +#loc166 = loc("off_hkv"(#loc6)) +#loc167 = loc("q_offset"(#loc7)) +#loc168 = loc("q_offset"(#loc8)) +#loc169 = loc("q_offset"(#loc9)) +#loc170 = loc("k_offset"(#loc10)) +#loc171 = loc("k_offset"(#loc11)) +#loc172 = loc("k_offset"(#loc12)) +#loc173 = loc("Q"(#loc13)) +#loc174 = loc("K"(#loc14)) +#loc175 = loc("V"(#loc15)) +#loc176 = loc("sparse_kv_num_blks_offset"(#loc16)) +#loc177 = loc("sparse_kv_num_blks_offset"(#loc17)) +#loc178 = loc("sparse_kv_idx_offset"(#loc18)) +#loc179 = loc("sparse_kv_idx_offset"(#loc19)) +#loc180 = loc("sparse_kv_idx_offset"(#loc20)) +#loc181 = loc("offs_m"(#loc21)) +#loc182 = loc("offs_m"(#loc22)) +#loc183 = loc("offs_m"(#loc23)) +#loc184 = loc("ptr"(#loc24)) +#loc185 = loc("q"(#loc25)) +#loc186 = loc("ptr"(#loc26)) +#loc187 = loc("ptr"(#loc27)) +#loc188 = loc("ptr"(#loc28)) +#loc189 = loc("ptr"(#loc29)) +#loc190 = loc("kv_indices"(#loc31)) +#loc191 = loc("kv_start"(#loc32)) +#loc192 = loc("kv_start"(#loc33)) +#loc193 = loc("kv_num_blocks"(#loc34)) +#loc194 = loc("kv_num_blocks"(#loc35)) +#loc195 = loc("block_n_end"(#loc36)) +#loc196 = loc("block_n_end"(#loc37)) +#loc197 = loc("offs_n"(#loc38)) +#loc198 = loc("offs_n"(#loc39)) +#loc199 = loc("k"(#loc41)) +#loc201 = loc("tmp4"(#loc44)) +#loc202 = loc("tmp7"(#loc45)) +#loc203 = loc("acc"(#loc46)) +#loc204 = loc("tmp7"(#loc47)) +#loc205 = loc("tmp8"(#loc48)) +#loc206 = loc("tmp9"(#loc49)) +#loc207 = loc("tmp10"(#loc50)) +#loc208 = loc("tmp11"(#loc51)) +#loc209 = loc("v"(#loc52)) +#loc210 = loc("offs_n_load"(#loc53)) +#loc211 = loc("kv_base_offset"(#loc54)) +#loc212 = loc("qk"(#loc55)) +#loc213 = loc("k"(#loc56)) +#loc214 = loc("qk"(#loc57)) +#loc215 = loc("tmp5"(#loc58)) +#loc216 = loc("tmp12"(#loc59)) +#loc217 = loc("tmp15"(#loc60)) +#loc218 = loc("tmp16"(#loc61)) +#loc219 = loc("tmp18"(#loc62)) +#loc220 = loc("tmp19"(#loc63)) +#loc221 = loc("tmp22"(#loc64)) +#loc222 = loc("tmp23"(#loc65)) +#loc223 = loc("tmp24"(#loc66)) +#loc224 = loc("tmp25"(#loc67)) +#loc225 = loc("tmp26"(#loc68)) +#loc226 = loc("tmp27"(#loc69)) +#loc227 = loc("tmp28"(#loc70)) +#loc228 = loc("tmp29"(#loc71)) +#loc229 = loc("tmp30"(#loc72)) +#loc230 = loc("tmp31"(#loc73)) +#loc231 = loc("tmp33"(#loc74)) +#loc232 = loc("tmp34"(#loc75)) +#loc233 = loc("tmp35"(#loc76)) +#loc234 = loc("tmp36"(#loc77)) +#loc235 = loc("tmp37"(#loc78)) +#loc236 = loc("tmp38"(#loc79)) +#loc237 = loc("post_mod_scores"(#loc80)) +#loc238 = loc("post_mod_scores"(#loc81)) +#loc240 = loc("m_ij"(#loc85)) +#loc241 = loc("masked_out_rows"(#loc86)) +#loc242 = loc("m_ij_masked"(#loc87)) +#loc243 = loc("alpha"(#loc88)) +#loc244 = loc("alpha"(#loc89)) +#loc245 = loc("p"(#loc90)) +#loc246 = loc("p"(#loc91)) +#loc247 = loc("p"(#loc92)) +#loc248 = loc("l_i"(#loc93)) +#loc250 = loc("l_i"(#loc97)) +#loc251 = loc("acc"(#loc98)) +#loc252 = loc("acc"(#loc99)) +#loc253 = loc("acc"(#loc100)) +#loc254 = loc("acc"(#loc101)) +#loc255 = loc("offs_n"(#loc102)) +#loc256 = loc("cur_block_idx"(#loc103)) +#loc257 = loc("offset"(#loc104)) +#loc258 = loc("cur_block"(#loc105)) +#loc259 = loc("cur_block"(#loc106)) +#loc260 = loc("next_block"(#loc107)) +#loc261 = loc("next_block"(#loc108)) +#loc262 = loc("next_block"(#loc109)) +#loc263 = loc("next_block"(#loc110)) +#loc264 = loc("needs_jump"(#loc111)) +#loc265 = loc("needs_jump"(#loc112)) +#loc266 = loc("needs_jump"(#loc113)) +#loc267 = loc("jump_to_block"(#loc114)) +#loc268 = loc("jump_to_block"(#loc115)) +#loc269 = loc("jump_to_block"(#loc116)) +#loc270 = loc("offset"(#loc117)) +#loc271 = loc("offset"(#loc118)) +#loc272 = loc("offset"(#loc119)) +#loc273 = loc("offset"(#loc120)) +#loc274 = loc("kv_offset"(#loc121)) +#loc275 = loc("kv_indices"(#loc122)) +#loc276 = loc("kv_start"(#loc123)) +#loc277 = loc("kv_start"(#loc124)) +#loc278 = loc("kv_num_blocks"(#loc125)) +#loc279 = loc("kv_num_blocks"(#loc126)) +#loc280 = loc("block_n_end"(#loc127)) +#loc281 = loc("block_n_end"(#loc128)) +#loc283 = loc("l_i"(#loc130)) +#loc284 = loc("l_i"(#loc131)) +#loc285 = loc("acc"(#loc132)) +#loc286 = loc("acc"(#loc133)) +#loc287 = loc("mask"(#loc134)) +#loc288 = loc("mask"(#loc135)) +#loc289 = loc("mask"(#loc136)) +#loc290 = loc("off_hz"(#loc142)) +#loc291 = loc("off_hz"(#loc143)) +#loc292 = loc("l_ptrs"(#loc144)) +#loc293 = loc("l_ptrs"(#loc145)) +#loc294 = loc("l_ptrs"(#loc146)) +#loc295 = loc("lse"(#loc147)) +#loc296 = loc("lse"(#loc148)) +#loc297 = loc(callsite(#loc184 at #loc185)) +#loc298 = loc(callsite(#loc186 at #loc185)) +#loc299 = loc(callsite(#loc187 at #loc185)) +#loc300 = loc(callsite(#loc188 at #loc185)) +#loc301 = loc(callsite(#loc189 at #loc185)) +#loc302 = loc(callsite(#loc30 at #loc185)) +#loc303 = loc(callsite(#loc199 at #loc200)) +#loc304 = loc(callsite(#loc201 at #loc200)) +#loc305 = loc(callsite(#loc202 at #loc200)) +#loc306 = loc("l_i"(#loc203)) +#loc307 = loc(callsite(#loc204 at #loc200)) +#loc308 = loc(callsite(#loc205 at #loc200)) +#loc309 = loc(callsite(#loc206 at #loc200)) +#loc310 = loc(callsite(#loc207 at #loc200)) +#loc311 = loc(callsite(#loc208 at #loc200)) +#loc312 = loc(callsite(#loc209 at #loc200)) +#loc313 = loc(callsite(#loc210 at #loc200)) +#loc314 = loc(callsite(#loc211 at #loc200)) +#loc315 = loc(callsite(#loc212 at #loc200)) +#loc316 = loc(callsite(#loc213 at #loc200)) +#loc317 = loc(callsite(#loc214 at #loc200)) +#loc318 = loc(callsite(#loc215 at #loc200)) +#loc319 = loc(callsite(#loc216 at #loc200)) +#loc320 = loc(callsite(#loc217 at #loc200)) +#loc321 = loc(callsite(#loc218 at #loc200)) +#loc322 = loc(callsite(#loc219 at #loc200)) +#loc323 = loc(callsite(#loc220 at #loc200)) +#loc324 = loc(callsite(#loc221 at #loc200)) +#loc325 = loc(callsite(#loc222 at #loc200)) +#loc326 = loc(callsite(#loc223 at #loc200)) +#loc327 = loc(callsite(#loc224 at #loc200)) +#loc328 = loc(callsite(#loc225 at #loc200)) +#loc329 = loc(callsite(#loc226 at #loc200)) +#loc330 = loc(callsite(#loc227 at #loc200)) +#loc331 = loc(callsite(#loc228 at #loc200)) +#loc332 = loc(callsite(#loc229 at #loc200)) +#loc333 = loc(callsite(#loc230 at #loc200)) +#loc334 = loc(callsite(#loc231 at #loc200)) +#loc335 = loc(callsite(#loc232 at #loc200)) +#loc336 = loc(callsite(#loc233 at #loc200)) +#loc337 = loc(callsite(#loc234 at #loc200)) +#loc338 = loc(callsite(#loc235 at #loc200)) +#loc339 = loc(callsite(#loc236 at #loc200)) +#loc340 = loc(callsite(#loc237 at #loc200)) +#loc341 = loc(callsite(#loc238 at #loc200)) +#loc343 = loc(callsite(#loc240 at #loc200)) +#loc344 = loc(callsite(#loc241 at #loc200)) +#loc345 = loc(callsite(#loc242 at #loc200)) +#loc346 = loc(callsite(#loc243 at #loc200)) +#loc347 = loc(callsite(#loc244 at #loc200)) +#loc348 = loc(callsite(#loc245 at #loc200)) +#loc349 = loc(callsite(#loc246 at #loc200)) +#loc350 = loc(callsite(#loc247 at #loc200)) +#loc351 = loc(callsite(#loc248 at #loc200)) +#loc353 = loc(callsite(#loc250 at #loc200)) +#loc354 = loc(callsite(#loc251 at #loc200)) +#loc355 = loc(callsite(#loc252 at #loc200)) +#loc356 = loc(callsite(#loc253 at #loc200)) +#loc357 = loc(callsite(#loc254 at #loc200)) +#loc358 = loc(callsite(#loc255 at #loc43)) +#loc359 = loc(callsite(#loc257 at #loc43)) +#loc360 = loc(callsite(#loc274 at #loc43)) +#loc361 = loc(callsite(#loc199 at #loc282)) +#loc362 = loc(callsite(#loc209 at #loc282)) +#loc363 = loc(callsite(#loc210 at #loc282)) +#loc364 = loc(callsite(#loc211 at #loc282)) +#loc365 = loc(callsite(#loc212 at #loc282)) +#loc366 = loc(callsite(#loc213 at #loc282)) +#loc367 = loc(callsite(#loc214 at #loc282)) +#loc368 = loc(callsite(#loc238 at #loc282)) +#loc370 = loc(callsite(#loc240 at #loc282)) +#loc371 = loc(callsite(#loc241 at #loc282)) +#loc372 = loc(callsite(#loc242 at #loc282)) +#loc373 = loc(callsite(#loc243 at #loc282)) +#loc374 = loc(callsite(#loc244 at #loc282)) +#loc375 = loc(callsite(#loc245 at #loc282)) +#loc376 = loc(callsite(#loc246 at #loc282)) +#loc377 = loc(callsite(#loc247 at #loc282)) +#loc378 = loc(callsite(#loc248 at #loc282)) +#loc380 = loc(callsite(#loc250 at #loc282)) +#loc381 = loc(callsite(#loc251 at #loc282)) +#loc382 = loc(callsite(#loc252 at #loc282)) +#loc383 = loc(callsite(#loc253 at #loc282)) +#loc384 = loc(callsite(#loc254 at #loc282)) +#loc385 = loc(callsite(#loc257 at #loc129)) +#loc386 = loc(callsite(#loc274 at #loc129)) +#loc387 = loc(callsite(#loc187 at #loc303)) +#loc388 = loc(callsite(#loc189 at #loc303)) +#loc389 = loc("m_i"(#loc306)) +#loc390 = loc(callsite(#loc187 at #loc312)) +#loc391 = loc(callsite(#loc30 at #loc303)) +#loc392 = loc(callsite(#loc30 at #loc312)) +#loc393 = loc(callsite(#loc184 at #loc303)) +#loc394 = loc(callsite(#loc186 at #loc303)) +#loc395 = loc(callsite(#loc189 at #loc312)) +#loc396 = loc(callsite(#loc82 at #loc342)) +#loc398 = loc(callsite(#loc94 at #loc352)) +#loc400 = loc(callsite(#loc256 at #loc359)) +#loc401 = loc(callsite(#loc258 at #loc359)) +#loc402 = loc(callsite(#loc259 at #loc359)) +#loc403 = loc(callsite(#loc260 at #loc359)) +#loc404 = loc(callsite(#loc261 at #loc359)) +#loc405 = loc(callsite(#loc262 at #loc359)) +#loc406 = loc(callsite(#loc263 at #loc359)) +#loc407 = loc(callsite(#loc264 at #loc359)) +#loc408 = loc(callsite(#loc265 at #loc359)) +#loc409 = loc(callsite(#loc266 at #loc359)) +#loc410 = loc(callsite(#loc267 at #loc359)) +#loc411 = loc(callsite(#loc268 at #loc359)) +#loc412 = loc(callsite(#loc269 at #loc359)) +#loc413 = loc(callsite(#loc270 at #loc359)) +#loc414 = loc(callsite(#loc271 at #loc359)) +#loc415 = loc(callsite(#loc272 at #loc359)) +#loc416 = loc(callsite(#loc273 at #loc359)) +#loc417 = loc(callsite(#loc30 at #loc361)) +#loc418 = loc(callsite(#loc30 at #loc362)) +#loc419 = loc(callsite(#loc184 at #loc361)) +#loc420 = loc(callsite(#loc186 at #loc361)) +#loc421 = loc(callsite(#loc187 at #loc361)) +#loc422 = loc(callsite(#loc189 at #loc361)) +#loc423 = loc(callsite(#loc187 at #loc362)) +#loc424 = loc(callsite(#loc189 at #loc362)) +#loc425 = loc(callsite(#loc82 at #loc369)) +#loc427 = loc(callsite(#loc94 at #loc379)) +#loc429 = loc(callsite(#loc256 at #loc385)) +#loc430 = loc(callsite(#loc258 at #loc385)) +#loc431 = loc(callsite(#loc259 at #loc385)) +#loc432 = loc(callsite(#loc260 at #loc385)) +#loc433 = loc(callsite(#loc261 at #loc385)) +#loc434 = loc(callsite(#loc262 at #loc385)) +#loc435 = loc(callsite(#loc263 at #loc385)) +#loc436 = loc(callsite(#loc264 at #loc385)) +#loc437 = loc(callsite(#loc265 at #loc385)) +#loc438 = loc(callsite(#loc266 at #loc385)) +#loc439 = loc(callsite(#loc267 at #loc385)) +#loc440 = loc(callsite(#loc268 at #loc385)) +#loc441 = loc(callsite(#loc269 at #loc385)) +#loc442 = loc(callsite(#loc270 at #loc385)) +#loc443 = loc(callsite(#loc271 at #loc385)) +#loc444 = loc(callsite(#loc272 at #loc385)) +#loc445 = loc(callsite(#loc273 at #loc385)) +#loc446 = loc("offs_n"(#loc389)) +#loc447 = loc(callsite(#loc84 at #loc396)) +#loc448 = loc(callsite(#loc96 at #loc398)) +#loc449 = loc(callsite(#loc84 at #loc425)) +#loc450 = loc(callsite(#loc96 at #loc427)) +#loc451 = loc("kv_offset"(#loc446)) +#loc452 = loc(callsite(#loc451 at #loc43)) +#loc453 = loc(callsite(#loc451 at #loc129)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..817226e0fda47738fafda8b8565afd45ffbf509b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/AVHLWU2EOG7BZ2NIHGR6HFYMYSKW7P6IDGODEMILPB6BLKP7IU7A/triton_tem_fused_0.ttir @@ -0,0 +1,763 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":172:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":518:16) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:51) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:34) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":198:45) +#loc157 = loc("arg_Q"(#loc)) +#loc158 = loc("arg_K"(#loc)) +#loc159 = loc("arg_V"(#loc)) +#loc160 = loc("arg_LSE"(#loc)) +#loc161 = loc("arg_MAX"(#loc)) +#loc162 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc163 = loc("arg_KV_IDX"(#loc)) +#loc164 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc165 = loc("arg_FULL_KV_IDX"(#loc)) +#loc166 = loc("in_ptr9"(#loc)) +#loc167 = loc("out_ptr0"(#loc)) +#loc213 = loc(callsite(#loc50 at #loc2)) +#loc250 = loc("m_ij"(#loc88)) +#loc260 = loc("l_i"(#loc100)) +#loc296 = loc(callsite(#loc50 at #loc138)) +#loc354 = loc(callsite(#loc250 at #loc213)) +#loc364 = loc(callsite(#loc260 at #loc213)) +#loc381 = loc(callsite(#loc250 at #loc296)) +#loc391 = loc(callsite(#loc260 at #loc296)) +#loc408 = loc(callsite(#loc1 at #loc354)) +#loc410 = loc(callsite(#loc1 at #loc364)) +#loc437 = loc(callsite(#loc1 at #loc381)) +#loc439 = loc(callsite(#loc1 at #loc391)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc168) + %cst_1 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc168) + %cst_2 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_3 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc168) + %cst_5 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc168) + %cst_6 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc168) + %cst_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc309) + %cst_9 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc170) + %mask_10 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc171) + %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc172) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %q_start = tt.get_program_id x : i32 loc(#loc174) + %off_zq = tt.get_program_id y : i32 loc(#loc175) + %off_hq = tt.get_program_id z : i32 loc(#loc176) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc177) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc178) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc179) + %q_offset_12 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc180) + %q_offset_13 = arith.addi %q_offset, %q_offset_12 : i32 loc(#loc181) + %k_offset = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc182) + %k_offset_14 = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc183) + %k_offset_15 = arith.addi %k_offset, %k_offset_14 : i32 loc(#loc184) + %Q = tt.addptr %arg_Q, %q_offset_13 : !tt.ptr, i32 loc(#loc185) + %K = tt.addptr %arg_K, %k_offset_15 : !tt.ptr, i32 loc(#loc186) + %V = tt.addptr %arg_V, %k_offset_15 : !tt.ptr, i32 loc(#loc187) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc188) + %sparse_kv_num_blks_offset_16 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc189) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc190) + %sparse_kv_idx_offset_17 = arith.muli %q_start, %c16_i32 : i32 loc(#loc191) + %sparse_kv_idx_offset_18 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_17 : i32 loc(#loc192) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc193) + %offs_m_19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc194) + %offs_m_20 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc195) + %offs_m_21 = arith.addi %offs_m_20, %offs_m_19 : tensor<128xi32> loc(#loc195) + %ptr = tt.expand_dims %offs_m_21 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc310) + %ptr_22 = arith.muli %ptr, %cst_9 : tensor<128x1xi32> loc(#loc311) + %ptr_23 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc312) + %ptr_24 = tt.addptr %ptr_23, %ptr_22 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc312) + %ptr_25 = tt.expand_dims %offs_m_19 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc313) + %ptr_26 = tt.broadcast %ptr_24 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc314) + %ptr_27 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc314) + %ptr_28 = tt.addptr %ptr_26, %ptr_27 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc314) + %q = tt.load %ptr_28 : tensor<128x128x!tt.ptr> loc(#loc315) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_18 : !tt.ptr, i32 loc(#loc202) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc203) + %kv_start_29 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc204) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_16 : !tt.ptr, i32 loc(#loc205) + %kv_num_blocks_30 = tt.load %kv_num_blocks : !tt.ptr loc(#loc206) + %block_n_end = arith.muli %kv_num_blocks_30, %c2_i32 : i32 loc(#loc207) + %block_n_end_31 = arith.minsi %block_n_end, %c32_i32 : i32 loc(#loc208) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc209) + %offs_n_32 = tt.splat %kv_start_29 : i32 -> tensor<64xi32> loc(#loc210) + %offs_n_33 = arith.addi %offs_n_32, %offs_n : tensor<64xi32> loc(#loc210) + %0 = tt.expand_dims %offs_n_33 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc47) + %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_31 step %c1_i32 iter_args(%acc_59 = %acc, %l_i_60 = %cst_11, %m_i = %cst_2, %offs_n_61 = %0, %kv_offset_62 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_29, %kv_offset_62 : i32 loc(#loc317) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc318) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc318) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc402) + %ptr_65 = arith.muli %ptr_64, %cst : tensor<64x1xi32> loc(#loc403) + %ptr_66 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc404) + %ptr_67 = tt.addptr %ptr_66, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc404) + %ptr_68 = tt.broadcast %ptr_67 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc405) + %ptr_69 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc405) + %ptr_70 = tt.addptr %ptr_68, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc405) + %k = tt.load %ptr_70 : tensor<64x128x!tt.ptr> loc(#loc406) + %k_71 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc320) + %qk = tt.dot %q, %k_71, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc321) + %qk_72 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc322) + %tmp4 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc323) + %tmp4_73 = tt.broadcast %offs_n_61 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc323) + %tmp4_74 = arith.cmpi sge, %tmp4, %tmp4_73 : tensor<128x64xi32> loc(#loc323) + %tmp5 = arith.extsi %offs_n_61 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc324) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc325) + %tmp7_75 = tt.load %tmp7 : !tt.ptr loc(#loc326) + %tmp8 = tt.splat %tmp7_75 : i64 -> tensor<1x64xi64> loc(#loc327) + %tmp8_76 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc327) + %tmp9 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc328) + %tmp10 = tt.splat %tmp7_75 : i64 -> tensor<128x1xi64> loc(#loc329) + %tmp10_77 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc329) + %tmp11 = tt.broadcast %tmp8_76 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc330) + %tmp11_78 = tt.broadcast %tmp10_77 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc330) + %tmp11_79 = arith.andi %tmp11, %tmp11_78 : tensor<128x64xi1> loc(#loc330) + %tmp12 = arith.andi %tmp4_74, %tmp11_79 : tensor<128x64xi1> loc(#loc331) + %tmp15 = arith.cmpi sge, %offs_n_61, %cst_1 : tensor<1x64xi32> loc(#loc332) + %tmp16 = arith.remsi %offs_n_61, %cst_1 : tensor<1x64xi32> loc(#loc333) + %tmp18 = arith.cmpi ne, %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc334) + %tmp19 = arith.cmpi slt, %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc335) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc336) + %tmp23 = arith.addi %tmp16, %cst_1 : tensor<1x64xi32> loc(#loc337) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc338) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc339) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc340) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc341) + %tmp28 = arith.subi %tmp4_73, %tmp4 : tensor<128x64xi32> loc(#loc342) + %tmp29 = arith.remsi %tmp28, %cst_0 : tensor<128x64xi32> loc(#loc343) + %tmp30 = arith.cmpi ne, %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc344) + %tmp31 = arith.cmpi slt, %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc345) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc346) + %tmp34 = arith.addi %tmp29, %cst_0 : tensor<128x64xi32> loc(#loc347) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc348) + %tmp36 = arith.cmpi eq, %tmp35, %cst_5 : tensor<128x64xi32> loc(#loc349) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc350) + %tmp37_80 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc350) + %tmp38 = arith.ori %tmp12, %tmp37_80 : tensor<128x64xi1> loc(#loc351) + %post_mod_scores = arith.select %tmp38, %qk_72, %cst_4 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc352) + %post_mod_scores_81 = arith.mulf %post_mod_scores, %cst_3 : tensor<128x64xf32> loc(#loc353) + %m_ij = "tt.reduce"(%post_mod_scores_81) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_114: f32 loc(callsite(#loc1 at #loc354)), %m_ij_115: f32 loc(callsite(#loc1 at #loc354))): + %m_ij_116 = arith.maxnumf %m_ij_114, %m_ij_115 : f32 loc(#loc461) + tt.reduce.return %m_ij_116 : f32 loc(#loc407) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc407) + %m_ij_82 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc355) + %masked_out_rows = arith.cmpf oeq, %m_ij_82, %cst_2 : tensor<128xf32> loc(#loc356) + %m_ij_masked = arith.select %masked_out_rows, %cst_11, %m_ij_82 : tensor<128xi1>, tensor<128xf32> loc(#loc357) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc358) + %alpha_83 = math.exp2 %alpha : tensor<128xf32> loc(#loc359) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc360) + %p_84 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc361) + %p_85 = arith.subf %post_mod_scores_81, %p_84 : tensor<128x64xf32> loc(#loc361) + %p_86 = math.exp2 %p_85 : tensor<128x64xf32> loc(#loc362) + %l_i_87 = arith.mulf %l_i_60, %alpha_83 : tensor<128xf32> loc(#loc363) + %l_i_88 = "tt.reduce"(%p_86) <{axis = 1 : i32}> ({ + ^bb0(%l_i_114: f32 loc(callsite(#loc1 at #loc364)), %l_i_115: f32 loc(callsite(#loc1 at #loc364))): + %l_i_116 = arith.addf %l_i_114, %l_i_115 : f32 loc(#loc462) + tt.reduce.return %l_i_116 : f32 loc(#loc409) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc409) + %l_i_89 = arith.addf %l_i_87, %l_i_88 : tensor<128xf32> loc(#loc365) + %acc_90 = tt.expand_dims %alpha_83 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc366) + %acc_91 = tt.broadcast %acc_90 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc367) + %acc_92 = arith.mulf %acc_59, %acc_91 : tensor<128x128xf32> loc(#loc367) + %ptr_93 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc411) + %ptr_94 = tt.addptr %ptr_93, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc411) + %ptr_95 = tt.broadcast %ptr_94 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc412) + %ptr_96 = tt.addptr %ptr_95, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc412) + %v = tt.load %ptr_96 : tensor<64x128x!tt.ptr> loc(#loc413) + %acc_97 = arith.truncf %p_86 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc369) + %acc_98 = tt.dot %acc_97, %v, %acc_92, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc370) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc414) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc415) + %cur_block_99 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc416) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc417) + %next_block_100 = arith.cmpi slt, %next_block, %kv_num_blocks_30 : i32 loc(#loc418) + %next_block_101 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc419) + %next_block_102 = tt.load %next_block_101, %next_block_100 evictionPolicy = evict_last : !tt.ptr loc(#loc420) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc421) + %needs_jump_103 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc422) + %needs_jump_104 = arith.cmpi eq, %needs_jump_103, %c0_i32 : i32 loc(#loc423) + %jump_to_block = arith.subi %next_block_102, %cur_block_99 : i32 loc(#loc424) + %jump_to_block_105 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc425) + %jump_to_block_106 = arith.subi %jump_to_block_105, %c64_i32 : i32 loc(#loc426) + %offset = arith.extui %needs_jump_104 : i1 to i32 loc(#loc427) + %offset_107 = arith.muli %jump_to_block_106, %offset : i32 loc(#loc427) + %offset_108 = arith.subi %c1_i32, %offset : i32 loc(#loc428) + %offset_109 = arith.muli %offset_108, %c64_i32 : i32 loc(#loc429) + %offset_110 = arith.addi %offset_107, %offset_109 : i32 loc(#loc430) + %offs_n_111 = tt.splat %offset_110 : i32 -> tensor<1x64xi32> loc(#loc372) + %offs_n_112 = arith.addi %offs_n_61, %offs_n_111 : tensor<1x64xi32> loc(#loc372) + %kv_offset_113 = arith.addi %kv_offset_62, %offset_110 : i32 loc(#loc373) + scf.yield %acc_98, %l_i_89, %m_ij_82, %offs_n_112, %kv_offset_113 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc287) + } loc(#loc466) + %kv_indices_34 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_18 : !tt.ptr, i32 loc(#loc288) + %kv_start_35 = tt.load %kv_indices_34 : !tt.ptr loc(#loc289) + %kv_start_36 = arith.muli %kv_start_35, %c128_i32 : i32 loc(#loc290) + %kv_num_blocks_37 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_16 : !tt.ptr, i32 loc(#loc291) + %kv_num_blocks_38 = tt.load %kv_num_blocks_37 : !tt.ptr loc(#loc292) + %block_n_end_39 = arith.muli %kv_num_blocks_38, %c2_i32 : i32 loc(#loc293) + %block_n_end_40 = arith.minsi %block_n_end_39, %c32_i32 : i32 loc(#loc294) + %offs_n_41 = tt.splat %kv_start_36 : i32 -> tensor<64xi32> loc(#loc295) + %offs_n_42 = arith.addi %offs_n_41, %offs_n : tensor<64xi32> loc(#loc295) + %1 = tt.expand_dims %offs_n_42 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc137) + %kv_offset_43:5 = scf.for %start_n = %c0_i32 to %block_n_end_40 step %c1_i32 iter_args(%acc_59 = %kv_offset#0, %l_i_60 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_61 = %1, %kv_offset_62 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_36, %kv_offset_62 : i32 loc(#loc374) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc375) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc375) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc431) + %ptr_65 = arith.muli %ptr_64, %cst : tensor<64x1xi32> loc(#loc432) + %ptr_66 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc433) + %ptr_67 = tt.addptr %ptr_66, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc433) + %ptr_68 = tt.broadcast %ptr_67 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc434) + %ptr_69 = tt.broadcast %ptr_25 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc434) + %ptr_70 = tt.addptr %ptr_68, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc434) + %k = tt.load %ptr_70 : tensor<64x128x!tt.ptr> loc(#loc435) + %k_71 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc377) + %qk = tt.dot %q, %k_71, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc378) + %qk_72 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc379) + %post_mod_scores = arith.mulf %qk_72, %cst_3 : tensor<128x64xf32> loc(#loc380) + %m_ij = "tt.reduce"(%post_mod_scores) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_105: f32 loc(callsite(#loc1 at #loc381)), %m_ij_106: f32 loc(callsite(#loc1 at #loc381))): + %m_ij_107 = arith.maxnumf %m_ij_105, %m_ij_106 : f32 loc(#loc463) + tt.reduce.return %m_ij_107 : f32 loc(#loc436) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc436) + %m_ij_73 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc382) + %masked_out_rows = arith.cmpf oeq, %m_ij_73, %cst_2 : tensor<128xf32> loc(#loc383) + %m_ij_masked = arith.select %masked_out_rows, %cst_11, %m_ij_73 : tensor<128xi1>, tensor<128xf32> loc(#loc384) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc385) + %alpha_74 = math.exp2 %alpha : tensor<128xf32> loc(#loc386) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc387) + %p_75 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc388) + %p_76 = arith.subf %post_mod_scores, %p_75 : tensor<128x64xf32> loc(#loc388) + %p_77 = math.exp2 %p_76 : tensor<128x64xf32> loc(#loc389) + %l_i_78 = arith.mulf %l_i_60, %alpha_74 : tensor<128xf32> loc(#loc390) + %l_i_79 = "tt.reduce"(%p_77) <{axis = 1 : i32}> ({ + ^bb0(%l_i_105: f32 loc(callsite(#loc1 at #loc391)), %l_i_106: f32 loc(callsite(#loc1 at #loc391))): + %l_i_107 = arith.addf %l_i_105, %l_i_106 : f32 loc(#loc464) + tt.reduce.return %l_i_107 : f32 loc(#loc438) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc438) + %l_i_80 = arith.addf %l_i_78, %l_i_79 : tensor<128xf32> loc(#loc392) + %acc_81 = tt.expand_dims %alpha_74 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc393) + %acc_82 = tt.broadcast %acc_81 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc394) + %acc_83 = arith.mulf %acc_59, %acc_82 : tensor<128x128xf32> loc(#loc394) + %ptr_84 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc440) + %ptr_85 = tt.addptr %ptr_84, %ptr_65 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc440) + %ptr_86 = tt.broadcast %ptr_85 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc441) + %ptr_87 = tt.addptr %ptr_86, %ptr_69 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc441) + %v = tt.load %ptr_87 : tensor<64x128x!tt.ptr> loc(#loc442) + %acc_88 = arith.truncf %p_77 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc396) + %acc_89 = tt.dot %acc_88, %v, %acc_83, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc397) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc443) + %cur_block = tt.addptr %kv_indices_34, %cur_block_idx : !tt.ptr, i32 loc(#loc444) + %cur_block_90 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc445) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc446) + %next_block_91 = arith.cmpi slt, %next_block, %kv_num_blocks_38 : i32 loc(#loc447) + %next_block_92 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc448) + %next_block_93 = tt.load %next_block_92, %next_block_91 evictionPolicy = evict_last : !tt.ptr loc(#loc449) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc450) + %needs_jump_94 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc451) + %needs_jump_95 = arith.cmpi eq, %needs_jump_94, %c0_i32 : i32 loc(#loc452) + %jump_to_block = arith.subi %next_block_93, %cur_block_90 : i32 loc(#loc453) + %jump_to_block_96 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc454) + %jump_to_block_97 = arith.subi %jump_to_block_96, %c64_i32 : i32 loc(#loc455) + %offset = arith.extui %needs_jump_95 : i1 to i32 loc(#loc456) + %offset_98 = arith.muli %jump_to_block_97, %offset : i32 loc(#loc456) + %offset_99 = arith.subi %c1_i32, %offset : i32 loc(#loc457) + %offset_100 = arith.muli %offset_99, %c64_i32 : i32 loc(#loc458) + %offset_101 = arith.addi %offset_98, %offset_100 : i32 loc(#loc459) + %offs_n_102 = tt.splat %offset_101 : i32 -> tensor<1x64xi32> loc(#loc399) + %offs_n_103 = arith.addi %offs_n_61, %offs_n_102 : tensor<1x64xi32> loc(#loc399) + %kv_offset_104 = arith.addi %kv_offset_62, %offset_101 : i32 loc(#loc400) + scf.yield %acc_89, %l_i_80, %m_ij_73, %offs_n_103, %kv_offset_104 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc297) + } loc(#loc467) + %l_i_44 = arith.cmpf oeq, %kv_offset_43#1, %cst_11 : tensor<128xf32> loc(#loc298) + %l_i_45 = arith.select %l_i_44, %l_i, %kv_offset_43#1 : tensor<128xi1>, tensor<128xf32> loc(#loc172) + %acc_46 = tt.expand_dims %l_i_45 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc299) + %acc_47 = tt.broadcast %acc_46 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc300) + %acc_48 = arith.divf %kv_offset_43#0, %acc_47 : tensor<128x128xf32> loc(#loc300) + %mask_49 = arith.cmpi slt, %ptr, %mask_10 : tensor<128x1xi32> loc(#loc171) + %mask_50 = arith.cmpi slt, %ptr_25, %mask : tensor<1x128xi32> loc(#loc170) + %mask_51 = tt.broadcast %mask_49 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc301) + %mask_52 = tt.broadcast %mask_50 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc301) + %mask_53 = arith.andi %mask_51, %mask_52 : tensor<128x128xi1> loc(#loc301) + %2 = tt.splat %q_offset_12 : i32 -> tensor<1x128xi32> loc(#loc143) + %3 = arith.addi %ptr_25, %2 : tensor<1x128xi32> loc(#loc143) + %4 = tt.broadcast %3 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc144) + %5 = tt.broadcast %ptr_22 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc144) + %6 = arith.addi %4, %5 : tensor<128x128xi32> loc(#loc144) + %7 = tt.splat %q_offset : i32 -> tensor<128x128xi32> loc(#loc145) + %8 = arith.addi %6, %7 : tensor<128x128xi32> loc(#loc145) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc146) + %10 = tt.addptr %9, %8 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc146) + %11 = arith.truncf %acc_48 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc147) + tt.store %10, %11, %mask_53 : tensor<128x128x!tt.ptr> loc(#loc147) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc302) + %off_hz_54 = arith.addi %off_hz, %off_hq : i32 loc(#loc303) + %l_ptrs = arith.muli %off_hz_54, %c2048_i32 : i32 loc(#loc304) + %l_ptrs_55 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc305) + %l_ptrs_56 = tt.splat %l_ptrs_55 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc306) + %l_ptrs_57 = tt.addptr %l_ptrs_56, %offs_m_21 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc306) + %lse = math.log2 %l_i_45 : tensor<128xf32> loc(#loc307) + %lse_58 = arith.addf %kv_offset_43#2, %lse : tensor<128xf32> loc(#loc308) + tt.store %l_ptrs_57, %lse_58 : tensor<128x!tt.ptr> loc(#loc155) + tt.return loc(#loc156) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":136:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:20) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":130:22) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":97:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":98:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":99:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":103:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":104:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:24) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":107:36) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:25) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":108:37) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":111:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":112:12) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":113:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:51) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":142:74) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:97) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":143:64) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:46) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":144:33) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:27) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":146:101) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:38) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:20) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:56) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":284:49) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":294:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":151:26) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:23) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":152:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":153:28) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:45) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":154:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":159:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":167:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":502:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":342:32) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":346:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":347:107) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":349:17) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":351:19) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":353:14) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":373:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":374:23) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:33) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":376:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":377:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":378:23) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":379:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":380:23) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":381:23) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":384:24) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":385:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":387:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":388:92) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":391:24) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":392:24) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":393:39) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":394:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":395:24) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":396:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":397:23) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":398:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":399:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":400:92) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":402:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":403:24) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":404:39) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":405:25) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":406:24) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":407:24) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":414:69) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":417:27) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":421:27) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":423:35) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":424:51) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:31) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":428:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:51) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:39) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":429:21) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:16) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":434:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:22) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":436:16) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":439:107) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:22) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":440:44) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":247:33) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":545:63) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:38) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":248:24) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:109) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:113) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:55) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":249:25) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:30) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:35) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":250:60) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:34) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:48) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":251:63) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:29) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:47) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:61) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":252:42) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":548:26) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":549:21) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":549:8) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":181:35) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:27) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":182:41) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:51) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":183:32) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:49) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":184:69) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":186:28) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":193:52) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":206:26) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:20) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":208:16) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":214:30) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:49) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:62) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:75) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:25) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":218:109) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:26) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":221:31) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:32) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:23) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":222:40) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:33) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":223:20) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":225:29) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5p/c5pbkg5eq64emuv25ukki7a5dxvn2p2sh6jeiwb6b54tbidps5w7.py":229:4) +#loc168 = loc(callsite(#loc1 at #loc2)) +#loc169 = loc("acc"(#loc4)) +#loc170 = loc("mask"(#loc5)) +#loc171 = loc("mask"(#loc6)) +#loc172 = loc("l_i"(#loc7)) +#loc173 = loc("stride_kv_idx_h"(#loc8)) +#loc174 = loc("q_start"(#loc9)) +#loc175 = loc("off_zq"(#loc10)) +#loc176 = loc("off_hq"(#loc11)) +#loc177 = loc("off_zkv"(#loc12)) +#loc178 = loc("off_hkv"(#loc13)) +#loc179 = loc("q_offset"(#loc14)) +#loc180 = loc("q_offset"(#loc15)) +#loc181 = loc("q_offset"(#loc16)) +#loc182 = loc("k_offset"(#loc17)) +#loc183 = loc("k_offset"(#loc18)) +#loc184 = loc("k_offset"(#loc19)) +#loc185 = loc("Q"(#loc20)) +#loc186 = loc("K"(#loc21)) +#loc187 = loc("V"(#loc22)) +#loc188 = loc("sparse_kv_num_blks_offset"(#loc23)) +#loc189 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc190 = loc("sparse_kv_idx_offset"(#loc25)) +#loc191 = loc("sparse_kv_idx_offset"(#loc26)) +#loc192 = loc("sparse_kv_idx_offset"(#loc27)) +#loc193 = loc("offs_m"(#loc28)) +#loc194 = loc("offs_m"(#loc29)) +#loc195 = loc("offs_m"(#loc30)) +#loc196 = loc("ptr"(#loc31)) +#loc197 = loc("q"(#loc32)) +#loc198 = loc("ptr"(#loc33)) +#loc199 = loc("ptr"(#loc34)) +#loc200 = loc("ptr"(#loc35)) +#loc201 = loc("ptr"(#loc36)) +#loc202 = loc("kv_indices"(#loc38)) +#loc203 = loc("kv_start"(#loc39)) +#loc204 = loc("kv_start"(#loc40)) +#loc205 = loc("kv_num_blocks"(#loc41)) +#loc206 = loc("kv_num_blocks"(#loc42)) +#loc207 = loc("block_n_end"(#loc43)) +#loc208 = loc("block_n_end"(#loc44)) +#loc209 = loc("offs_n"(#loc45)) +#loc210 = loc("offs_n"(#loc46)) +#loc211 = loc("acc"(#loc48)) +#loc212 = loc("kv_base_offset"(#loc49)) +#loc214 = loc("offs_n_load"(#loc51)) +#loc215 = loc("k"(#loc52)) +#loc216 = loc("k"(#loc53)) +#loc217 = loc("qk"(#loc54)) +#loc218 = loc("qk"(#loc55)) +#loc219 = loc("tmp4"(#loc56)) +#loc220 = loc("tmp5"(#loc57)) +#loc221 = loc("tmp7"(#loc58)) +#loc222 = loc("tmp7"(#loc59)) +#loc223 = loc("tmp8"(#loc60)) +#loc224 = loc("tmp9"(#loc61)) +#loc225 = loc("tmp10"(#loc62)) +#loc226 = loc("tmp11"(#loc63)) +#loc227 = loc("tmp12"(#loc64)) +#loc228 = loc("tmp15"(#loc65)) +#loc229 = loc("tmp16"(#loc66)) +#loc230 = loc("tmp18"(#loc67)) +#loc231 = loc("tmp19"(#loc68)) +#loc232 = loc("tmp22"(#loc69)) +#loc233 = loc("tmp23"(#loc70)) +#loc234 = loc("tmp24"(#loc71)) +#loc235 = loc("tmp25"(#loc72)) +#loc236 = loc("tmp26"(#loc73)) +#loc237 = loc("tmp27"(#loc74)) +#loc238 = loc("tmp28"(#loc75)) +#loc239 = loc("tmp29"(#loc76)) +#loc240 = loc("tmp30"(#loc77)) +#loc241 = loc("tmp31"(#loc78)) +#loc242 = loc("tmp33"(#loc79)) +#loc243 = loc("tmp34"(#loc80)) +#loc244 = loc("tmp35"(#loc81)) +#loc245 = loc("tmp36"(#loc82)) +#loc246 = loc("tmp37"(#loc83)) +#loc247 = loc("tmp38"(#loc84)) +#loc248 = loc("post_mod_scores"(#loc85)) +#loc249 = loc("post_mod_scores"(#loc86)) +#loc251 = loc("m_ij"(#loc90)) +#loc252 = loc("masked_out_rows"(#loc91)) +#loc253 = loc("m_ij_masked"(#loc92)) +#loc254 = loc("alpha"(#loc93)) +#loc255 = loc("alpha"(#loc94)) +#loc256 = loc("p"(#loc95)) +#loc257 = loc("p"(#loc96)) +#loc258 = loc("p"(#loc97)) +#loc259 = loc("l_i"(#loc98)) +#loc261 = loc("l_i"(#loc102)) +#loc262 = loc("acc"(#loc103)) +#loc263 = loc("acc"(#loc104)) +#loc264 = loc("v"(#loc105)) +#loc265 = loc("acc"(#loc106)) +#loc266 = loc("acc"(#loc107)) +#loc267 = loc("cur_block_idx"(#loc108)) +#loc268 = loc("offset"(#loc109)) +#loc269 = loc("cur_block"(#loc110)) +#loc270 = loc("cur_block"(#loc111)) +#loc271 = loc("next_block"(#loc112)) +#loc272 = loc("next_block"(#loc113)) +#loc273 = loc("next_block"(#loc114)) +#loc274 = loc("next_block"(#loc115)) +#loc275 = loc("needs_jump"(#loc116)) +#loc276 = loc("needs_jump"(#loc117)) +#loc277 = loc("needs_jump"(#loc118)) +#loc278 = loc("jump_to_block"(#loc119)) +#loc279 = loc("jump_to_block"(#loc120)) +#loc280 = loc("jump_to_block"(#loc121)) +#loc281 = loc("offset"(#loc122)) +#loc282 = loc("offset"(#loc123)) +#loc283 = loc("offset"(#loc124)) +#loc284 = loc("offset"(#loc125)) +#loc285 = loc("offs_n"(#loc126)) +#loc286 = loc("kv_offset"(#loc127)) +#loc287 = loc(callsite(#loc128 at #loc2)) +#loc288 = loc("kv_indices"(#loc129)) +#loc289 = loc("kv_start"(#loc130)) +#loc290 = loc("kv_start"(#loc131)) +#loc291 = loc("kv_num_blocks"(#loc132)) +#loc292 = loc("kv_num_blocks"(#loc133)) +#loc293 = loc("block_n_end"(#loc134)) +#loc294 = loc("block_n_end"(#loc135)) +#loc295 = loc("offs_n"(#loc136)) +#loc297 = loc(callsite(#loc128 at #loc138)) +#loc298 = loc("l_i"(#loc139)) +#loc299 = loc("acc"(#loc140)) +#loc300 = loc("acc"(#loc141)) +#loc301 = loc("mask"(#loc142)) +#loc302 = loc("off_hz"(#loc148)) +#loc303 = loc("off_hz"(#loc149)) +#loc304 = loc("l_ptrs"(#loc150)) +#loc305 = loc("l_ptrs"(#loc151)) +#loc306 = loc("l_ptrs"(#loc152)) +#loc307 = loc("lse"(#loc153)) +#loc308 = loc("lse"(#loc154)) +#loc309 = loc(callsite(#loc3 at #loc169)) +#loc310 = loc(callsite(#loc196 at #loc197)) +#loc311 = loc(callsite(#loc198 at #loc197)) +#loc312 = loc(callsite(#loc199 at #loc197)) +#loc313 = loc(callsite(#loc200 at #loc197)) +#loc314 = loc(callsite(#loc201 at #loc197)) +#loc315 = loc(callsite(#loc37 at #loc197)) +#loc316 = loc("l_i"(#loc211)) +#loc317 = loc(callsite(#loc212 at #loc213)) +#loc318 = loc(callsite(#loc214 at #loc213)) +#loc319 = loc(callsite(#loc215 at #loc213)) +#loc320 = loc(callsite(#loc216 at #loc213)) +#loc321 = loc(callsite(#loc217 at #loc213)) +#loc322 = loc(callsite(#loc218 at #loc213)) +#loc323 = loc(callsite(#loc219 at #loc213)) +#loc324 = loc(callsite(#loc220 at #loc213)) +#loc325 = loc(callsite(#loc221 at #loc213)) +#loc326 = loc(callsite(#loc222 at #loc213)) +#loc327 = loc(callsite(#loc223 at #loc213)) +#loc328 = loc(callsite(#loc224 at #loc213)) +#loc329 = loc(callsite(#loc225 at #loc213)) +#loc330 = loc(callsite(#loc226 at #loc213)) +#loc331 = loc(callsite(#loc227 at #loc213)) +#loc332 = loc(callsite(#loc228 at #loc213)) +#loc333 = loc(callsite(#loc229 at #loc213)) +#loc334 = loc(callsite(#loc230 at #loc213)) +#loc335 = loc(callsite(#loc231 at #loc213)) +#loc336 = loc(callsite(#loc232 at #loc213)) +#loc337 = loc(callsite(#loc233 at #loc213)) +#loc338 = loc(callsite(#loc234 at #loc213)) +#loc339 = loc(callsite(#loc235 at #loc213)) +#loc340 = loc(callsite(#loc236 at #loc213)) +#loc341 = loc(callsite(#loc237 at #loc213)) +#loc342 = loc(callsite(#loc238 at #loc213)) +#loc343 = loc(callsite(#loc239 at #loc213)) +#loc344 = loc(callsite(#loc240 at #loc213)) +#loc345 = loc(callsite(#loc241 at #loc213)) +#loc346 = loc(callsite(#loc242 at #loc213)) +#loc347 = loc(callsite(#loc243 at #loc213)) +#loc348 = loc(callsite(#loc244 at #loc213)) +#loc349 = loc(callsite(#loc245 at #loc213)) +#loc350 = loc(callsite(#loc246 at #loc213)) +#loc351 = loc(callsite(#loc247 at #loc213)) +#loc352 = loc(callsite(#loc248 at #loc213)) +#loc353 = loc(callsite(#loc249 at #loc213)) +#loc355 = loc(callsite(#loc251 at #loc213)) +#loc356 = loc(callsite(#loc252 at #loc213)) +#loc357 = loc(callsite(#loc253 at #loc213)) +#loc358 = loc(callsite(#loc254 at #loc213)) +#loc359 = loc(callsite(#loc255 at #loc213)) +#loc360 = loc(callsite(#loc256 at #loc213)) +#loc361 = loc(callsite(#loc257 at #loc213)) +#loc362 = loc(callsite(#loc258 at #loc213)) +#loc363 = loc(callsite(#loc259 at #loc213)) +#loc365 = loc(callsite(#loc261 at #loc213)) +#loc366 = loc(callsite(#loc262 at #loc213)) +#loc367 = loc(callsite(#loc263 at #loc213)) +#loc368 = loc(callsite(#loc264 at #loc213)) +#loc369 = loc(callsite(#loc265 at #loc213)) +#loc370 = loc(callsite(#loc266 at #loc213)) +#loc371 = loc(callsite(#loc268 at #loc2)) +#loc372 = loc(callsite(#loc285 at #loc2)) +#loc373 = loc(callsite(#loc286 at #loc2)) +#loc374 = loc(callsite(#loc212 at #loc296)) +#loc375 = loc(callsite(#loc214 at #loc296)) +#loc376 = loc(callsite(#loc215 at #loc296)) +#loc377 = loc(callsite(#loc216 at #loc296)) +#loc378 = loc(callsite(#loc217 at #loc296)) +#loc379 = loc(callsite(#loc218 at #loc296)) +#loc380 = loc(callsite(#loc249 at #loc296)) +#loc382 = loc(callsite(#loc251 at #loc296)) +#loc383 = loc(callsite(#loc252 at #loc296)) +#loc384 = loc(callsite(#loc253 at #loc296)) +#loc385 = loc(callsite(#loc254 at #loc296)) +#loc386 = loc(callsite(#loc255 at #loc296)) +#loc387 = loc(callsite(#loc256 at #loc296)) +#loc388 = loc(callsite(#loc257 at #loc296)) +#loc389 = loc(callsite(#loc258 at #loc296)) +#loc390 = loc(callsite(#loc259 at #loc296)) +#loc392 = loc(callsite(#loc261 at #loc296)) +#loc393 = loc(callsite(#loc262 at #loc296)) +#loc394 = loc(callsite(#loc263 at #loc296)) +#loc395 = loc(callsite(#loc264 at #loc296)) +#loc396 = loc(callsite(#loc265 at #loc296)) +#loc397 = loc(callsite(#loc266 at #loc296)) +#loc398 = loc(callsite(#loc268 at #loc138)) +#loc399 = loc(callsite(#loc285 at #loc138)) +#loc400 = loc(callsite(#loc286 at #loc138)) +#loc401 = loc("m_i"(#loc316)) +#loc402 = loc(callsite(#loc196 at #loc319)) +#loc403 = loc(callsite(#loc198 at #loc319)) +#loc404 = loc(callsite(#loc199 at #loc319)) +#loc405 = loc(callsite(#loc201 at #loc319)) +#loc406 = loc(callsite(#loc37 at #loc319)) +#loc407 = loc(callsite(#loc87 at #loc354)) +#loc409 = loc(callsite(#loc99 at #loc364)) +#loc411 = loc(callsite(#loc199 at #loc368)) +#loc412 = loc(callsite(#loc201 at #loc368)) +#loc413 = loc(callsite(#loc37 at #loc368)) +#loc414 = loc(callsite(#loc267 at #loc371)) +#loc415 = loc(callsite(#loc269 at #loc371)) +#loc416 = loc(callsite(#loc270 at #loc371)) +#loc417 = loc(callsite(#loc271 at #loc371)) +#loc418 = loc(callsite(#loc272 at #loc371)) +#loc419 = loc(callsite(#loc273 at #loc371)) +#loc420 = loc(callsite(#loc274 at #loc371)) +#loc421 = loc(callsite(#loc275 at #loc371)) +#loc422 = loc(callsite(#loc276 at #loc371)) +#loc423 = loc(callsite(#loc277 at #loc371)) +#loc424 = loc(callsite(#loc278 at #loc371)) +#loc425 = loc(callsite(#loc279 at #loc371)) +#loc426 = loc(callsite(#loc280 at #loc371)) +#loc427 = loc(callsite(#loc281 at #loc371)) +#loc428 = loc(callsite(#loc282 at #loc371)) +#loc429 = loc(callsite(#loc283 at #loc371)) +#loc430 = loc(callsite(#loc284 at #loc371)) +#loc431 = loc(callsite(#loc196 at #loc376)) +#loc432 = loc(callsite(#loc198 at #loc376)) +#loc433 = loc(callsite(#loc199 at #loc376)) +#loc434 = loc(callsite(#loc201 at #loc376)) +#loc435 = loc(callsite(#loc37 at #loc376)) +#loc436 = loc(callsite(#loc87 at #loc381)) +#loc438 = loc(callsite(#loc99 at #loc391)) +#loc440 = loc(callsite(#loc199 at #loc395)) +#loc441 = loc(callsite(#loc201 at #loc395)) +#loc442 = loc(callsite(#loc37 at #loc395)) +#loc443 = loc(callsite(#loc267 at #loc398)) +#loc444 = loc(callsite(#loc269 at #loc398)) +#loc445 = loc(callsite(#loc270 at #loc398)) +#loc446 = loc(callsite(#loc271 at #loc398)) +#loc447 = loc(callsite(#loc272 at #loc398)) +#loc448 = loc(callsite(#loc273 at #loc398)) +#loc449 = loc(callsite(#loc274 at #loc398)) +#loc450 = loc(callsite(#loc275 at #loc398)) +#loc451 = loc(callsite(#loc276 at #loc398)) +#loc452 = loc(callsite(#loc277 at #loc398)) +#loc453 = loc(callsite(#loc278 at #loc398)) +#loc454 = loc(callsite(#loc279 at #loc398)) +#loc455 = loc(callsite(#loc280 at #loc398)) +#loc456 = loc(callsite(#loc281 at #loc398)) +#loc457 = loc(callsite(#loc282 at #loc398)) +#loc458 = loc(callsite(#loc283 at #loc398)) +#loc459 = loc(callsite(#loc284 at #loc398)) +#loc460 = loc("offs_n"(#loc401)) +#loc461 = loc(callsite(#loc89 at #loc407)) +#loc462 = loc(callsite(#loc101 at #loc409)) +#loc463 = loc(callsite(#loc89 at #loc436)) +#loc464 = loc(callsite(#loc101 at #loc438)) +#loc465 = loc("kv_offset"(#loc460)) +#loc466 = loc(callsite(#loc465 at #loc2)) +#loc467 = loc(callsite(#loc465 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/__grp__triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/__grp__triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47967e1847e5c842f916a80169dc05d5bb4d1045 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/__grp__triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..60125e0097f078f150fbf29272dcbc80073f66fe Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea4e6090eb6d91ec80aa2bc2a722c6bb6e30844 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"hash": "0f3d1b0faf23144847e40c7c836c269ab8ff8ac1ab5ce1b730bf5a0adef541f4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..a17c12621e41c4b4275cd36b5baf59ad326e7fab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.llir @@ -0,0 +1,46 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 7, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = and i32 %7, 127, !dbg !9 + %9 = or disjoint i32 %6, %8, !dbg !10 + %10 = icmp slt i32 %9, %1, !dbg !11 + %11 = sext i32 %9 to i64, !dbg !12 + %12 = getelementptr i32, ptr addrspace(1) %0, i64 %11, !dbg !12 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %12, i1 %10) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 25, scope: !4) +!13 = !DILocation(line: 24, column: 36, scope: !4) +!14 = !DILocation(line: 24, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..72c2243997514f685ff6e27d7b0268ac258eb960 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ptx @@ -0,0 +1,207 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0 + // @triton_poi_fused_new_zeros_0 +.visible .entry triton_poi_fused_new_zeros_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0, + .param .u32 triton_poi_fused_new_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<8>; + .reg .b64 %rd<3>; + .loc 1 18 0 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_0_param_0]; + ld.param.b32 %r2, [triton_poi_fused_new_zeros_0_param_1]; +$L__tmp0: + .loc 1 19 28 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:19:28 + mov.u32 %r3, %ctaid.x; + .loc 1 19 33 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:19:33 + shl.b32 %r4, %r3, 7; + .loc 1 20 36 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:20:36 + mov.u32 %r5, %tid.x; + and.b32 %r6, %r5, 127; + .loc 1 20 23 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:20:23 + or.b32 %r7, %r4, %r6; + .loc 1 21 21 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:21:21 + setp.lt.s32 %p1, %r7, %r2; + .loc 1 24 25 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:25 + mad.wide.s32 %rd1, %r7, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 24 36 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:36 + // begin inline asm + @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 }; + // end inline asm + .loc 1 24 4 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 117 +.b8 115 +.b8 115 +.b8 53 +.b8 112 +.b8 101 +.b8 114 +.b8 101 +.b8 107 +.b8 114 +.b8 105 +.b8 118 +.b8 53 +.b8 122 +.b8 108 +.b8 117 +.b8 98 +.b8 110 +.b8 107 +.b8 53 +.b8 50 +.b8 102 +.b8 51 +.b8 112 +.b8 101 +.b8 106 +.b8 53 +.b8 113 +.b8 99 +.b8 108 +.b8 122 +.b8 114 +.b8 109 +.b8 116 +.b8 97 +.b8 55 +.b8 99 +.b8 118 +.b8 105 +.b8 100 +.b8 104 +.b8 107 +.b8 112 +.b8 122 +.b8 104 +.b8 117 +.b8 112 +.b8 109 +.b8 118 +.b8 116 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 117 +.b8 115 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..9effbc58593c0bd65430e879cdf1bb44dde67409 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.source @@ -0,0 +1,38 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.constant 128 : i32 loc(#loc13) + %xoffset_1 = arith.constant 128 : i32 loc(#loc13) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc14) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc15) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc16) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc16) + %tmp0 = arith.constant 0 : i32 loc(#loc17) + %tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc7) + %1 = tt.addptr %0, %xindex_4 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc7) + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_5 : tensor<128x!tt.ptr> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":23:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc1)) +#loc13 = loc("xoffset"(#loc2)) +#loc14 = loc("xindex"(#loc3)) +#loc15 = loc("xindex"(#loc4)) +#loc16 = loc("xmask"(#loc5)) +#loc17 = loc("tmp0"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..88e9dc10a757a41bfd6147071e0ee8f24ff21adf --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> loc(#loc7) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d94dc8dd303d0f4d4cd5ad6ca037b148f8c905c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/B46RWD5PEMKEQR7EBR6IG3BGTK4P7CWBVNOODNZQX5NAVXXVIH2A/triton_poi_fused_new_zeros_0.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc2 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc3)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xindex"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xmask"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4d35c9ec7d8d5c1c56188693892f287f583fde1c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..eb0c85ea4da74fa73945aecb295158340450ce51 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..554762f9f2fd13b86f0e11f31ad25f1eb3e76d6b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "0c26ceae614c16280db457c0b301ae5b8479049d2cfeca61a0e3f0197645a333", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..85583c81de34298c26b658d2f5744ec2ba885404 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,140 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 252, !dbg !9 + %12 = lshr exact i32 %11, 2, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = and i32 %10, 3, !dbg !11 + %15 = sdiv i32 %13, 2048, !dbg !12 + %16 = mul i32 %15, 2048, !dbg !13 + %.decomposed = sub i32 %13, %16, !dbg !13 + %17 = srem i32 %15, 32, !dbg !14 + %18 = sdiv i32 %13, 65536, !dbg !15 + %19 = shl nsw i32 %17, 7, !dbg !16 + %20 = shl nsw i32 %.decomposed, 12, !dbg !17 + %21 = shl i32 %18, 23, !dbg !18 + %22 = shl i32 %13, 7, !dbg !19 + %23 = add i32 %21, %20 + %24 = add i32 %23, %19 + %25 = zext nneg i32 %14 to i64, !dbg !20 + %26 = sext i32 %22 to i64, !dbg !20 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !20 + br label %27, !dbg !20 + +27: ; preds = %7, %27 + %indvars.iv = phi i64 [ 0, %7 ], [ %indvars.iv.next, %27 ] + %28 = phi float [ 0.000000e+00, %7 ], [ %43, %27 ] + %29 = or disjoint i64 %indvars.iv, %25, !dbg !21 + %30 = trunc nuw nsw i64 %29 to i32, !dbg !22 + %31 = add i32 %24, %30, !dbg !22 + %32 = sext i32 %31 to i64, !dbg !23 + %33 = getelementptr bfloat, ptr addrspace(1) %0, i64 %32, !dbg !23 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !24 + %35 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %33, i64 %34, i1 true) #4, !dbg !24 + %36 = bitcast i16 %35 to bfloat, !dbg !24 + %37 = fpext bfloat %36 to float, !dbg !25 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %29, !dbg !26 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %39 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %38, i1 true) #4, !dbg !27 + %40 = bitcast i16 %39 to bfloat, !dbg !27 + %41 = fpext bfloat %40 to float, !dbg !28 + %42 = fmul float %37, %41, !dbg !29 + %43 = fadd float %28, %42, !dbg !30 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !20 + %44 = icmp samesign ult i64 %indvars.iv, 124, !dbg !20 + br i1 %44, label %27, label %45, !dbg !20 + +45: ; preds = %27 + %46 = and i32 %10, 63, !dbg !9 + %47 = or disjoint i32 %9, %46, !dbg !10 + %48 = bitcast float %43 to i32, !dbg !31 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !31 + %50 = bitcast i32 %49 to float, !dbg !31 + %51 = fadd float %43, %50, !dbg !35 + %52 = bitcast float %51 to i32, !dbg !31 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !31 + %54 = bitcast i32 %53 to float, !dbg !31 + %55 = fadd float %51, %54, !dbg !35 + %56 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11, !dbg !36 + store float %55, ptr addrspace(3) %56, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %57 = shl nuw nsw i32 %46, 2, !dbg !36 + %58 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %57, !dbg !36 + %59 = load i32, ptr addrspace(3) %58, align 4, !dbg !36 + %60 = sext i32 %47 to i64, !dbg !37 + %61 = getelementptr float, ptr addrspace(1) %2, i64 %60, !dbg !37 + %62 = and i32 %10, 192, !dbg !38 + %63 = icmp eq i32 %62, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %59, ptr addrspace(1) %61, i1 %63) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 34, column: 31, scope: !4) +!22 = !DILocation(line: 39, column: 60, scope: !4) +!23 = !DILocation(line: 39, column: 34, scope: !4) +!24 = !DILocation(line: 39, column: 73, scope: !4) +!25 = !DILocation(line: 39, column: 127, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = !DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ba2fa552b5fed45c0217119aa74210b1ca4a5b77 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,391 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 256 +{ + .reg .pred %p<5>; + .reg .b16 %rs<5>; + .reg .b32 %r<52>; + .reg .b64 %rd<25>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd7, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r7, %ctaid.x; + .loc 1 23 33 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:33 + shl.b32 %r1, %r7, 6; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r8, %r2, 2, 6; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r9, %r8, %r1; + .loc 1 26 37 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:26:37 + and.b32 %r10, %r2, 3; + .loc 1 29 21 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:21 + bfe.s32 %r11, %r7, 25, 1; + shr.u32 %r12, %r11, 21; + add.s32 %r13, %r9, %r12; + shr.s32 %r14, %r13, 11; + .loc 1 29 29 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:29 + shr.u32 %r15, %r14, 27; + add.s32 %r16, %r14, %r15; + and.b32 %r17, %r16, 33554400; + sub.s32 %r18, %r14, %r17; + .loc 1 30 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:30:19 + shr.u32 %r19, %r11, 16; + add.s32 %r20, %r9, %r19; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shl.b32 %r21, %r18, 7; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r22, %r20, 7; + and.b32 %r23, %r22, -8388608; + .loc 1 33 40 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:33:40 + cvt.u64.u32 %rd11, %r10; + shl.b32 %r24, %r7, 13; + shl.b32 %r25, %r8, 7; + or.b32 %r26, %r24, %r25; + cvt.s64.s32 %rd12, %r26; + or.b64 %rd13, %rd12, %rd11; + shl.b64 %rd14, %rd13, 1; + add.s64 %rd23, %rd10, %rd14; + shl.b32 %r27, %r7, 18; + add.s32 %r28, %r23, %r27; + shl.b32 %r29, %r8, 12; + or.b32 %r30, %r28, %r29; + add.s32 %r31, %r30, %r21; + or.b32 %r32, %r31, %r10; + shl.b32 %r33, %r14, 23; + sub.s32 %r34, %r32, %r33; + cvt.u64.u32 %rd2, %r34; + mov.b32 %r51, 0f00000000; + mov.b64 %rd24, -4; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + add.s64 %rd21, %rd2, %rd24; + cvt.u32.u64 %r35, %rd21; + add.s32 %r36, %r35, 4; + mad.wide.s32 %rd16, %r36, 2, %rd7; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd16 + 0 ], %rd15; + // end inline asm + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + cvt.f32.bf16 %r37, %rs1; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs3 }, [ %rd23 + 0 ], %rd18; + // end inline asm + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + cvt.f32.bf16 %r38, %rs3; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r51, %r37, %r38, %r51; + .loc 1 33 40 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:33:40 + add.s64 %rd24, %rd24, 4; + add.s64 %rd23, %rd23, 8; + setp.lt.u64 %p3, %rd24, 124; + @%p3 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + and.b32 %r40, %r2, 63; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r41, %r1, %r40; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r51, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r43, %r51, %r42; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r45, %r43, %r44; +$L__tmp2: + .loc 1 45 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:28 + mov.b32 %r46, global_smem; + add.s32 %r47, %r46, %r3; + st.shared.b32 [%r47], %r45; + bar.sync 0; + shl.b32 %r48, %r40, 2; + add.s32 %r49, %r46, %r48; + ld.shared.b32 %r39, [%r49]; + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.s32 %rd22, %r41, 4, %rd8; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + and.b32 %r50, %r2, 192; + setp.eq.b32 %p4, %r50, 0; + // begin inline asm + @%p4 st.global.b32 [ %rd22 + 0 ], { %r39 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..5a5918a303ce36669dcfd365d133a068035acf04 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc65) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c4_i32 = arith.constant 4 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x4xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x4xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x4xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x4xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x4xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x4xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x4xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x4xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..b49d228b1287a7d3c5ac5f72fa65bf5acf19095f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,155 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_28 = %cst_7) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst_6 : tensor<1x4xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x4xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %_tmp4_28, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %_tmp4_28 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x4xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..95e05ba250a050a11c6ae9061964776fc2b664fc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,152 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("in_ptr1"(#loc)) +#loc43 = loc("out_ptr1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc77 = loc("tmp4"(#loc35)) +#loc80 = loc(callsite(#loc1 at #loc77)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc46) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc47) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc52) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc53) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc54) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc55) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc56) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<64x1xi32> loc(#loc47) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc46) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_15 = %cst_4) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc58) + %r0_index_16 = arith.addi %r0_index, %r0_base_10 : tensor<1x4xi32> loc(#loc58) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_3 : tensor<1x4xi32> loc(#loc59) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<64x1xi32> loc(#loc60) + %tmp0_17 = tt.broadcast %r0_index_16 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc61) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc61) + %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<64x4xi32> loc(#loc61) + %tmp0_20 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc62) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<64x4xi32> loc(#loc63) + %tmp0_23 = arith.muli %x2_13, %cst_0 : tensor<64x1xi32> loc(#loc64) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65) + %tmp0_25 = arith.addi %tmp0_22, %tmp0_24 : tensor<64x4xi32> loc(#loc65) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc66) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc66) + %tmp0_28 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67) + %tmp0_29 = tt.load %tmp0_27, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc67) + %tmp0_30 = arith.extf %tmp0_29 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc69) + %tmp1_31 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc70) + %tmp1_32 = arith.addi %tmp0_17, %tmp1_31 : tensor<64x4xi32> loc(#loc70) + %tmp1_33 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc71) + %tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc71) + %tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc72) + %tmp1_36 = arith.extf %tmp1_35 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc73) + %tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<64x4xf32> loc(#loc74) + %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<64x4xf32> loc(#loc75) + %_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_15 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc76) + scf.yield %_tmp4_37 : tensor<64x4xf32> loc(#loc33) + } loc(#loc57) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_15: f32 loc(callsite(#loc1 at #loc77)), %tmp4_16: f32 loc(callsite(#loc1 at #loc77))): + %tmp4_17 = arith.addf %tmp4_15, %tmp4_16 : f32 loc(#loc81) + tt.reduce.return %tmp4_17 : f32 loc(#loc79) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc79) + %tmp4_14 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc78) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc38) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc38) + tt.store %1, %tmp4_14 : tensor<64x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc46 = loc("x2"(#loc3)) +#loc47 = loc("x1"(#loc4)) +#loc48 = loc("xoffset"(#loc5)) +#loc49 = loc("xoffset"(#loc6)) +#loc50 = loc("xindex"(#loc7)) +#loc51 = loc("xindex"(#loc8)) +#loc52 = loc("xindex"(#loc9)) +#loc53 = loc("r0_base"(#loc10)) +#loc54 = loc("r0_base"(#loc11)) +#loc55 = loc("x0"(#loc12)) +#loc56 = loc("x1"(#loc13)) +#loc57 = loc("_tmp4"(#loc2)) +#loc58 = loc("r0_index"(#loc14)) +#loc59 = loc("r0_mask"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("tmp0"(#loc18)) +#loc63 = loc("tmp0"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp0"(#loc21)) +#loc66 = loc("tmp0"(#loc22)) +#loc67 = loc("tmp0"(#loc23)) +#loc68 = loc("tmp0"(#loc24)) +#loc69 = loc("tmp1"(#loc25)) +#loc70 = loc("tmp1"(#loc26)) +#loc71 = loc("tmp1"(#loc27)) +#loc72 = loc("tmp1"(#loc28)) +#loc73 = loc("tmp1"(#loc29)) +#loc74 = loc("tmp2"(#loc30)) +#loc75 = loc("tmp5"(#loc31)) +#loc76 = loc("_tmp4"(#loc32)) +#loc78 = loc("tmp4"(#loc37)) +#loc79 = loc(callsite(#loc34 at #loc77)) +#loc81 = loc(callsite(#loc36 at #loc79)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/__grp__triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/__grp__triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..b93cfea52cb4006c12d9c0fa8818117ba512e557 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/__grp__triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.source", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttir", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttgir", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.llir", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ptx", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.cubin", "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..08b2c9765d4f01308e31ab3b48445280e95e7f23 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ee55584690ef8c4a9ca458150810280f1282ddd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"hash": "0d1c65e9bf3a1a52cd807ae5ac1fab344b84da35b1dcd5a9a7faaca34fb91122", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..7fb8ad2c3b1195e2d0ec6c9e9bf0522b505cdfc8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.llir @@ -0,0 +1,394 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !7 + %10 = shl nuw nsw i32 %9, 1, !dbg !7 + %11 = and i32 %10, 1022, !dbg !7 + %12 = or i32 %10, 1024, !dbg !7 + %13 = zext nneg i32 %12 to i64, !dbg !8 + %14 = zext nneg i32 %11 to i64, !dbg !8 + %15 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !9 + %16 = getelementptr i64, ptr addrspace(1) %0, i64 %13, !dbg !9 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !10 + %18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %15, i64 %17, i1 true) #6, !dbg !10 + %19 = extractvalue { i64, i64 } %18, 0, !dbg !10 + %20 = extractvalue { i64, i64 } %18, 1, !dbg !10 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !10 + %22 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %16, i64 %21, i1 true) #6, !dbg !10 + %23 = extractvalue { i64, i64 } %22, 0, !dbg !10 + %24 = extractvalue { i64, i64 } %22, 1, !dbg !10 + %25 = getelementptr i64, ptr addrspace(1) %1, i64 %14, !dbg !11 + %26 = getelementptr i64, ptr addrspace(1) %1, i64 %13, !dbg !11 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !12 + %28 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %25, i64 %27, i1 true) #6, !dbg !12 + %29 = extractvalue { i64, i64 } %28, 0, !dbg !12 + %30 = extractvalue { i64, i64 } %28, 1, !dbg !12 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !12 + %32 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %26, i64 %31, i1 true) #6, !dbg !12 + %33 = extractvalue { i64, i64 } %32, 0, !dbg !12 + %34 = extractvalue { i64, i64 } %32, 1, !dbg !12 + %35 = getelementptr i64, ptr addrspace(1) %2, i64 %14, !dbg !13 + %36 = getelementptr i64, ptr addrspace(1) %2, i64 %13, !dbg !13 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %38 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %35, i64 %37, i1 true) #6, !dbg !14 + %39 = extractvalue { i64, i64 } %38, 0, !dbg !14 + %40 = extractvalue { i64, i64 } %38, 1, !dbg !14 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %42 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %36, i64 %41, i1 true) #6, !dbg !14 + %43 = extractvalue { i64, i64 } %42, 0, !dbg !14 + %44 = extractvalue { i64, i64 } %42, 1, !dbg !14 + %45 = or disjoint i64 %14, 2048, !dbg !15 + %46 = or disjoint i64 %13, 2048, !dbg !15 + %47 = getelementptr i64, ptr addrspace(1) %0, i64 %45, !dbg !9 + %48 = getelementptr i64, ptr addrspace(1) %0, i64 %46, !dbg !9 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !10 + %50 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %47, i64 %49, i1 true) #6, !dbg !10 + %51 = extractvalue { i64, i64 } %50, 0, !dbg !10 + %52 = extractvalue { i64, i64 } %50, 1, !dbg !10 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !10 + %54 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %48, i64 %53, i1 true) #6, !dbg !10 + %55 = extractvalue { i64, i64 } %54, 0, !dbg !10 + %56 = extractvalue { i64, i64 } %54, 1, !dbg !10 + %57 = getelementptr i64, ptr addrspace(1) %1, i64 %45, !dbg !11 + %58 = getelementptr i64, ptr addrspace(1) %1, i64 %46, !dbg !11 + %59 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !12 + %60 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %57, i64 %59, i1 true) #6, !dbg !12 + %61 = extractvalue { i64, i64 } %60, 0, !dbg !12 + %62 = extractvalue { i64, i64 } %60, 1, !dbg !12 + %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !12 + %64 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %58, i64 %63, i1 true) #6, !dbg !12 + %65 = extractvalue { i64, i64 } %64, 0, !dbg !12 + %66 = extractvalue { i64, i64 } %64, 1, !dbg !12 + %67 = getelementptr i64, ptr addrspace(1) %2, i64 %45, !dbg !13 + %68 = getelementptr i64, ptr addrspace(1) %2, i64 %46, !dbg !13 + %69 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %70 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %67, i64 %69, i1 true) #6, !dbg !14 + %71 = extractvalue { i64, i64 } %70, 0, !dbg !14 + %72 = extractvalue { i64, i64 } %70, 1, !dbg !14 + %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %74 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %68, i64 %73, i1 true) #6, !dbg !14 + %75 = extractvalue { i64, i64 } %74, 0, !dbg !14 + %76 = extractvalue { i64, i64 } %74, 1, !dbg !14 + %77 = insertelement <8 x i64> poison, i64 %51, i64 0, !dbg !16 + %78 = insertelement <8 x i64> %77, i64 %19, i64 1, !dbg !16 + %79 = insertelement <8 x i64> %78, i64 %52, i64 2, !dbg !16 + %80 = insertelement <8 x i64> %79, i64 %20, i64 3, !dbg !16 + %81 = insertelement <8 x i64> %80, i64 %55, i64 4, !dbg !16 + %82 = insertelement <8 x i64> %81, i64 %23, i64 5, !dbg !16 + %83 = insertelement <8 x i64> %82, i64 %56, i64 6, !dbg !16 + %84 = insertelement <8 x i64> %83, i64 %24, i64 7, !dbg !16 + %85 = insertelement <8 x i64> poison, i64 %61, i64 0, !dbg !16 + %86 = insertelement <8 x i64> %85, i64 %29, i64 1, !dbg !16 + %87 = insertelement <8 x i64> %86, i64 %62, i64 2, !dbg !16 + %88 = insertelement <8 x i64> %87, i64 %30, i64 3, !dbg !16 + %89 = insertelement <8 x i64> %88, i64 %65, i64 4, !dbg !16 + %90 = insertelement <8 x i64> %89, i64 %33, i64 5, !dbg !16 + %91 = insertelement <8 x i64> %90, i64 %66, i64 6, !dbg !16 + %92 = insertelement <8 x i64> %91, i64 %34, i64 7, !dbg !16 + %93 = icmp eq <8 x i64> %84, %92, !dbg !16 + %94 = insertelement <8 x i64> poison, i64 %71, i64 0, !dbg !17 + %95 = insertelement <8 x i64> %94, i64 %39, i64 1, !dbg !17 + %96 = insertelement <8 x i64> %95, i64 %72, i64 2, !dbg !17 + %97 = insertelement <8 x i64> %96, i64 %40, i64 3, !dbg !17 + %98 = insertelement <8 x i64> %97, i64 %75, i64 4, !dbg !17 + %99 = insertelement <8 x i64> %98, i64 %43, i64 5, !dbg !17 + %100 = insertelement <8 x i64> %99, i64 %76, i64 6, !dbg !17 + %101 = insertelement <8 x i64> %100, i64 %44, i64 7, !dbg !17 + %102 = select <8 x i1> %93, <8 x i64> %101, <8 x i64> zeroinitializer, !dbg !17 + %103 = and i32 %9, 31, !dbg !7 + %104 = lshr i32 %9, 5, !dbg !7 + %105 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %102), !dbg !18 + %extelt.offset = lshr i64 %105, 32, !dbg !22 + %106 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %107 = trunc i64 %105 to i32, !dbg !22 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !22 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !22 + %110 = insertelement <2 x i32> poison, i32 %108, i64 0, !dbg !22 + %111 = insertelement <2 x i32> %110, i32 %109, i64 1, !dbg !22 + %112 = bitcast <2 x i32> %111 to i64, !dbg !22 + %113 = add i64 %105, %112, !dbg !18 + %extelt.offset2 = lshr i64 %113, 32, !dbg !22 + %114 = trunc nuw i64 %extelt.offset2 to i32, !dbg !22 + %115 = trunc i64 %113 to i32, !dbg !22 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 8, i32 31), !dbg !22 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 8, i32 31), !dbg !22 + %118 = insertelement <2 x i32> poison, i32 %116, i64 0, !dbg !22 + %119 = insertelement <2 x i32> %118, i32 %117, i64 1, !dbg !22 + %120 = bitcast <2 x i32> %119 to i64, !dbg !22 + %121 = add i64 %113, %120, !dbg !18 + %extelt.offset3 = lshr i64 %121, 32, !dbg !22 + %122 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %123 = trunc i64 %121 to i32, !dbg !22 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 4, i32 31), !dbg !22 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 4, i32 31), !dbg !22 + %126 = insertelement <2 x i32> poison, i32 %124, i64 0, !dbg !22 + %127 = insertelement <2 x i32> %126, i32 %125, i64 1, !dbg !22 + %128 = bitcast <2 x i32> %127 to i64, !dbg !22 + %129 = add i64 %121, %128, !dbg !18 + %extelt.offset4 = lshr i64 %129, 32, !dbg !22 + %130 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %131 = trunc i64 %129 to i32, !dbg !22 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !22 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !22 + %134 = insertelement <2 x i32> poison, i32 %132, i64 0, !dbg !22 + %135 = insertelement <2 x i32> %134, i32 %133, i64 1, !dbg !22 + %136 = bitcast <2 x i32> %135 to i64, !dbg !22 + %137 = add i64 %129, %136, !dbg !18 + %extelt.offset5 = lshr i64 %137, 32, !dbg !22 + %138 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %139 = trunc i64 %137 to i32, !dbg !22 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !22 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 1, i32 31), !dbg !22 + %142 = insertelement <2 x i32> poison, i32 %140, i64 0, !dbg !22 + %143 = insertelement <2 x i32> %142, i32 %141, i64 1, !dbg !22 + %144 = bitcast <2 x i32> %143 to i64, !dbg !22 + %145 = add i64 %137, %144, !dbg !18 + %146 = and i32 %104, 15, !dbg !22 + %147 = icmp eq i32 %103, 0, !dbg !22 + %148 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %146, !dbg !22 + %149 = insertelement <1 x i64> poison, i64 %145, i64 0, !dbg !22 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %148, <1 x i64> %149, i1 %147) #6, !dbg !22 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !22 + %150 = icmp samesign ult i32 %9, 16, !dbg !22 + %151 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %9, !dbg !22 + %152 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %151, i1 %150) #6, !dbg !22 + %extelt.offset6 = lshr i64 %152, 32, !dbg !22 + %153 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %154 = trunc i64 %152 to i32, !dbg !22 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !22 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !22 + %157 = insertelement <2 x i32> poison, i32 %155, i64 0, !dbg !22 + %158 = insertelement <2 x i32> %157, i32 %156, i64 1, !dbg !22 + %159 = bitcast <2 x i32> %158 to i64, !dbg !22 + %160 = add i64 %152, %159, !dbg !18 + %extelt.offset7 = lshr i64 %160, 32, !dbg !22 + %161 = trunc nuw i64 %extelt.offset7 to i32, !dbg !22 + %162 = trunc i64 %160 to i32, !dbg !22 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 4, i32 31), !dbg !22 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !22 + %165 = insertelement <2 x i32> poison, i32 %163, i64 0, !dbg !22 + %166 = insertelement <2 x i32> %165, i32 %164, i64 1, !dbg !22 + %167 = bitcast <2 x i32> %166 to i64, !dbg !22 + %168 = add i64 %160, %167, !dbg !18 + %extelt.offset8 = lshr i64 %168, 32, !dbg !22 + %169 = trunc nuw i64 %extelt.offset8 to i32, !dbg !22 + %170 = trunc i64 %168 to i32, !dbg !22 + %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 2, i32 31), !dbg !22 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !22 + %173 = insertelement <2 x i32> poison, i32 %171, i64 0, !dbg !22 + %174 = insertelement <2 x i32> %173, i32 %172, i64 1, !dbg !22 + %175 = bitcast <2 x i32> %174 to i64, !dbg !22 + %176 = add i64 %168, %175, !dbg !18 + %extelt.offset9 = lshr i64 %176, 32, !dbg !22 + %177 = trunc nuw i64 %extelt.offset9 to i32, !dbg !22 + %178 = trunc i64 %176 to i32, !dbg !22 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !22 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !22 + %181 = insertelement <2 x i32> poison, i32 %179, i64 0, !dbg !22 + %182 = insertelement <2 x i32> %181, i32 %180, i64 1, !dbg !22 + %183 = bitcast <2 x i32> %182 to i64, !dbg !22 + %184 = add i64 %176, %183, !dbg !18 + %185 = icmp eq i32 %9, 0, !dbg !22 + %186 = insertelement <1 x i64> poison, i64 %184, i64 0, !dbg !22 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %151, <1 x i64> %186, i1 %185) #6, !dbg !22 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !22 + %187 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !22 + %188 = getelementptr i64, ptr addrspace(1) %3, i64 %14, !dbg !23 + %189 = getelementptr i64, ptr addrspace(1) %3, i64 %13, !dbg !23 + %190 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !24 + %191 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %188, i64 %190, i1 true) #6, !dbg !24 + %192 = extractvalue { i64, i64 } %191, 0, !dbg !24 + %193 = extractvalue { i64, i64 } %191, 1, !dbg !24 + %194 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !24 + %195 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %189, i64 %194, i1 true) #6, !dbg !24 + %196 = extractvalue { i64, i64 } %195, 0, !dbg !24 + %197 = extractvalue { i64, i64 } %195, 1, !dbg !24 + %198 = getelementptr i64, ptr addrspace(1) %3, i64 %45, !dbg !23 + %199 = getelementptr i64, ptr addrspace(1) %3, i64 %46, !dbg !23 + %200 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !24 + %201 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %198, i64 %200, i1 true) #6, !dbg !24 + %202 = extractvalue { i64, i64 } %201, 0, !dbg !24 + %203 = extractvalue { i64, i64 } %201, 1, !dbg !24 + %204 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !24 + %205 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %199, i64 %204, i1 true) #6, !dbg !24 + %206 = extractvalue { i64, i64 } %205, 0, !dbg !24 + %207 = extractvalue { i64, i64 } %205, 1, !dbg !24 + %208 = add i64 %202, %192, !dbg !25 + %209 = add i64 %203, %193, !dbg !25 + %210 = add i64 %206, %196, !dbg !25 + %211 = add i64 %207, %197, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %212 = add i64 %208, %209, !dbg !28 + %213 = add i64 %212, %210, !dbg !28 + %214 = add i64 %213, %211, !dbg !28 + %extelt.offset10 = lshr i64 %214, 32, !dbg !26 + %215 = trunc nuw i64 %extelt.offset10 to i32, !dbg !26 + %216 = trunc i64 %214 to i32, !dbg !26 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 16, i32 31), !dbg !26 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 16, i32 31), !dbg !26 + %219 = insertelement <2 x i32> poison, i32 %217, i64 0, !dbg !26 + %220 = insertelement <2 x i32> %219, i32 %218, i64 1, !dbg !26 + %221 = bitcast <2 x i32> %220 to i64, !dbg !26 + %222 = add i64 %214, %221, !dbg !28 + %extelt.offset11 = lshr i64 %222, 32, !dbg !26 + %223 = trunc nuw i64 %extelt.offset11 to i32, !dbg !26 + %224 = trunc i64 %222 to i32, !dbg !26 + %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 8, i32 31), !dbg !26 + %226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 8, i32 31), !dbg !26 + %227 = insertelement <2 x i32> poison, i32 %225, i64 0, !dbg !26 + %228 = insertelement <2 x i32> %227, i32 %226, i64 1, !dbg !26 + %229 = bitcast <2 x i32> %228 to i64, !dbg !26 + %230 = add i64 %222, %229, !dbg !28 + %extelt.offset12 = lshr i64 %230, 32, !dbg !26 + %231 = trunc nuw i64 %extelt.offset12 to i32, !dbg !26 + %232 = trunc i64 %230 to i32, !dbg !26 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 4, i32 31), !dbg !26 + %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 4, i32 31), !dbg !26 + %235 = insertelement <2 x i32> poison, i32 %233, i64 0, !dbg !26 + %236 = insertelement <2 x i32> %235, i32 %234, i64 1, !dbg !26 + %237 = bitcast <2 x i32> %236 to i64, !dbg !26 + %238 = add i64 %230, %237, !dbg !28 + %extelt.offset13 = lshr i64 %238, 32, !dbg !26 + %239 = trunc nuw i64 %extelt.offset13 to i32, !dbg !26 + %240 = trunc i64 %238 to i32, !dbg !26 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 2, i32 31), !dbg !26 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 2, i32 31), !dbg !26 + %243 = insertelement <2 x i32> poison, i32 %241, i64 0, !dbg !26 + %244 = insertelement <2 x i32> %243, i32 %242, i64 1, !dbg !26 + %245 = bitcast <2 x i32> %244 to i64, !dbg !26 + %246 = add i64 %238, %245, !dbg !28 + %extelt.offset14 = lshr i64 %246, 32, !dbg !26 + %247 = trunc nuw i64 %extelt.offset14 to i32, !dbg !26 + %248 = trunc i64 %246 to i32, !dbg !26 + %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !26 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !26 + %251 = insertelement <2 x i32> poison, i32 %249, i64 0, !dbg !26 + %252 = insertelement <2 x i32> %251, i32 %250, i64 1, !dbg !26 + %253 = bitcast <2 x i32> %252 to i64, !dbg !26 + %254 = add i64 %246, %253, !dbg !28 + %255 = insertelement <1 x i64> poison, i64 %254, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %148, <1 x i64> %255, i1 %147) #6, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %256 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %151, i1 %150) #6, !dbg !26 + %extelt.offset15 = lshr i64 %256, 32, !dbg !26 + %257 = trunc nuw i64 %extelt.offset15 to i32, !dbg !26 + %258 = trunc i64 %256 to i32, !dbg !26 + %259 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 8, i32 31), !dbg !26 + %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 8, i32 31), !dbg !26 + %261 = insertelement <2 x i32> poison, i32 %259, i64 0, !dbg !26 + %262 = insertelement <2 x i32> %261, i32 %260, i64 1, !dbg !26 + %263 = bitcast <2 x i32> %262 to i64, !dbg !26 + %264 = add i64 %256, %263, !dbg !28 + %extelt.offset16 = lshr i64 %264, 32, !dbg !26 + %265 = trunc nuw i64 %extelt.offset16 to i32, !dbg !26 + %266 = trunc i64 %264 to i32, !dbg !26 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 4, i32 31), !dbg !26 + %268 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 4, i32 31), !dbg !26 + %269 = insertelement <2 x i32> poison, i32 %267, i64 0, !dbg !26 + %270 = insertelement <2 x i32> %269, i32 %268, i64 1, !dbg !26 + %271 = bitcast <2 x i32> %270 to i64, !dbg !26 + %272 = add i64 %264, %271, !dbg !28 + %extelt.offset17 = lshr i64 %272, 32, !dbg !26 + %273 = trunc nuw i64 %extelt.offset17 to i32, !dbg !26 + %274 = trunc i64 %272 to i32, !dbg !26 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 2, i32 31), !dbg !26 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 2, i32 31), !dbg !26 + %277 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !26 + %278 = insertelement <2 x i32> %277, i32 %276, i64 1, !dbg !26 + %279 = bitcast <2 x i32> %278 to i64, !dbg !26 + %280 = add i64 %272, %279, !dbg !28 + %extelt.offset18 = lshr i64 %280, 32, !dbg !26 + %281 = trunc nuw i64 %extelt.offset18 to i32, !dbg !26 + %282 = trunc i64 %280 to i32, !dbg !26 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 1, i32 31), !dbg !26 + %284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 1, i32 31), !dbg !26 + %285 = insertelement <2 x i32> poison, i32 %283, i64 0, !dbg !26 + %286 = insertelement <2 x i32> %285, i32 %284, i64 1, !dbg !26 + %287 = bitcast <2 x i32> %286 to i64, !dbg !26 + %288 = add i64 %280, %287, !dbg !28 + %289 = insertelement <1 x i64> poison, i64 %288, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %151, <1 x i64> %289, i1 %185) #6, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %290 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !26 + %291 = sitofp i64 %187 to float, !dbg !29 + %292 = sitofp i64 %290 to float, !dbg !30 + %293 = icmp sgt i64 %290, 0, !dbg !31 + %294 = select i1 %293, float %292, float 0x3EB0C6F7A0000000, !dbg !35 + %295 = tail call float @llvm.nvvm.div.full(float %291, float %294), !dbg !36 + %296 = and i32 %9, 511, !dbg !37 + %297 = icmp eq i32 %296, 0, !dbg !37 + %298 = bitcast float %295 to i32, !dbg !37 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %298, ptr addrspace(1) %4, i1 %297) #6, !dbg !37 + ret void, !dbg !38 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #5 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2", linkageName: "triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 26, column: 37, scope: !4) +!8 = !DILocation(line: 29, column: 40, scope: !4) +!9 = !DILocation(line: 35, column: 34, scope: !4) +!10 = !DILocation(line: 35, column: 41, scope: !4) +!11 = !DILocation(line: 36, column: 34, scope: !4) +!12 = !DILocation(line: 36, column: 41, scope: !4) +!13 = !DILocation(line: 37, column: 34, scope: !4) +!14 = !DILocation(line: 37, column: 41, scope: !4) +!15 = !DILocation(line: 30, column: 31, scope: !4) +!16 = !DILocation(line: 38, column: 23, scope: !4) +!17 = !DILocation(line: 40, column: 22, scope: !4) +!18 = !DILocation(line: 261, column: 15, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!21 = !DILocation(line: 44, column: 25, scope: !4) +!22 = !DILocation(line: 291, column: 36, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 52, column: 34, scope: !4) +!24 = !DILocation(line: 52, column: 41, scope: !4) +!25 = !DILocation(line: 54, column: 25, scope: !4) +!26 = !DILocation(line: 291, column: 36, scope: !19, inlinedAt: !27) +!27 = !DILocation(line: 56, column: 27, scope: !4) +!28 = !DILocation(line: 261, column: 15, scope: !19, inlinedAt: !27) +!29 = !DILocation(line: 57, column: 20, scope: !4) +!30 = !DILocation(line: 58, column: 21, scope: !4) +!31 = !DILocation(line: 110, column: 15, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!34 = !DILocation(line: 60, column: 42, scope: !4) +!35 = !DILocation(line: 113, column: 29, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 61, column: 21, scope: !4) +!37 = !DILocation(line: 62, column: 68, scope: !4) +!38 = !DILocation(line: 62, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..eb2258b0f384dcdb1bd48f067fb271fc597b7a92 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ptx @@ -0,0 +1,795 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2 // -- Begin function triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2 +.visible .entry triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_4, + .param .u32 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_7 +) +.reqntid 512 +{ + .reg .pred %p<33>; + .reg .b32 %r<93>; + .reg .b64 %rd<204>; + .loc 1 18 0 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:18:0 + +// %bb.0: + ld.param.b64 %rd88, [triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_0]; + ld.param.b64 %rd89, [triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_1]; +$L__tmp0: + .loc 1 26 37 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:26:37 + mov.u32 %r8, %tid.x; + shl.b32 %r9, %r8, 1; + ld.param.b64 %rd90, [triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_2]; + and.b32 %r10, %r9, 1022; + ld.param.b64 %rd91, [triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_3]; + or.b32 %r11, %r9, 1024; + ld.param.b64 %rd87, [triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2_param_4]; + .loc 1 35 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:35:34 + mul.wide.u32 %rd92, %r10, 8; + add.s64 %rd4, %rd88, %rd92; + mul.wide.u32 %rd93, %r11, 8; + add.s64 %rd9, %rd88, %rd93; + .loc 1 35 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:35:41 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0; + // end inline asm + mov.pred %p1, -1; + // begin inline asm + mov.u64 %rd2, 0x0; + mov.u64 %rd3, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd2, %rd3 }, [ %rd4 + 0 ], %rd5; + // end inline asm + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd7, 0x0; + mov.u64 %rd8, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd7, %rd8 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 36 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:36:34 + add.s64 %rd14, %rd89, %rd92; + add.s64 %rd19, %rd89, %rd93; + .loc 1 36 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:36:41 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + mov.u64 %rd13, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd12, %rd13 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + mov.u64 %rd18, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd17, %rd18 }, [ %rd19 + 0 ], %rd20; + // end inline asm + .loc 1 37 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:37:34 + add.s64 %rd24, %rd90, %rd92; + add.s64 %rd29, %rd90, %rd93; + .loc 1 37 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:37:41 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + mov.u64 %rd23, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd22, %rd23 }, [ %rd24 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + mov.u64 %rd28, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd27, %rd28 }, [ %rd29 + 0 ], %rd30; + // end inline asm + .loc 1 35 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:35:34 + add.s64 %rd34, %rd4, 16384; + add.s64 %rd39, %rd9, 16384; + .loc 1 35 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:35:41 + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd32, 0x0; + mov.u64 %rd33, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd32, %rd33 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + mov.u64 %rd38, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd37, %rd38 }, [ %rd39 + 0 ], %rd40; + // end inline asm + .loc 1 36 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:36:34 + add.s64 %rd44, %rd14, 16384; + add.s64 %rd49, %rd19, 16384; + .loc 1 36 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:36:41 + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + mov.u64 %rd43, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd42, %rd43 }, [ %rd44 + 0 ], %rd45; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd47, 0x0; + mov.u64 %rd48, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd47, %rd48 }, [ %rd49 + 0 ], %rd50; + // end inline asm + .loc 1 37 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:37:34 + add.s64 %rd54, %rd24, 16384; + add.s64 %rd59, %rd29, 16384; + .loc 1 37 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:37:41 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + mov.u64 %rd53, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd52, %rd53 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + mov.u64 %rd58, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd57, %rd58 }, [ %rd59 + 0 ], %rd60; + // end inline asm + .loc 1 38 23 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:38:23 + setp.eq.b64 %p24, %rd8, %rd18; + setp.eq.b64 %p25, %rd3, %rd13; + setp.eq.b64 %p26, %rd7, %rd17; + setp.eq.b64 %p27, %rd2, %rd12; + setp.eq.b64 %p28, %rd38, %rd48; + setp.eq.b64 %p29, %rd33, %rd43; + setp.eq.b64 %p30, %rd37, %rd47; + setp.eq.b64 %p31, %rd32, %rd42; + .loc 1 40 22 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:40:22 + selp.b64 %rd94, %rd52, 0, %p31; + selp.b64 %rd95, %rd57, 0, %p30; + selp.b64 %rd96, %rd53, 0, %p29; + selp.b64 %rd97, %rd58, 0, %p28; + selp.b64 %rd98, %rd22, 0, %p27; + selp.b64 %rd99, %rd27, 0, %p26; + selp.b64 %rd100, %rd23, 0, %p25; + selp.b64 %rd101, %rd28, 0, %p24; + .loc 1 26 37 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:26:37 + and.b32 %r12, %r8, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd102, %rd100, %rd101; + add.s64 %rd103, %rd98, %rd99; + add.s64 %rd104, %rd103, %rd102; + add.s64 %rd105, %rd96, %rd97; + add.s64 %rd106, %rd94, %rd95; + add.s64 %rd107, %rd106, %rd105; + add.s64 %rd108, %rd107, %rd104; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r13}, %rd108; + cvt.u32.u64 %r14, %rd108; + shfl.sync.bfly.b32 %r15, %r14, 16, 31, -1; + shfl.sync.bfly.b32 %r16, %r13, 16, 31, -1; + cvt.u64.u32 %rd109, %r15; + cvt.u64.u32 %rd110, %r16; + shl.b64 %rd111, %rd110, 32; + or.b64 %rd112, %rd109, %rd111; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd113, %rd108, %rd112; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r17}, %rd113; + cvt.u32.u64 %r18, %rd113; + shfl.sync.bfly.b32 %r19, %r18, 8, 31, -1; + shfl.sync.bfly.b32 %r20, %r17, 8, 31, -1; + cvt.u64.u32 %rd114, %r19; + cvt.u64.u32 %rd115, %r20; + shl.b64 %rd116, %rd115, 32; + or.b64 %rd117, %rd114, %rd116; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd118, %rd113, %rd117; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r21}, %rd118; + cvt.u32.u64 %r22, %rd118; + shfl.sync.bfly.b32 %r23, %r22, 4, 31, -1; + shfl.sync.bfly.b32 %r24, %r21, 4, 31, -1; + cvt.u64.u32 %rd119, %r23; + cvt.u64.u32 %rd120, %r24; + shl.b64 %rd121, %rd120, 32; + or.b64 %rd122, %rd119, %rd121; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd123, %rd118, %rd122; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r25}, %rd123; + cvt.u32.u64 %r26, %rd123; + shfl.sync.bfly.b32 %r27, %r26, 2, 31, -1; + shfl.sync.bfly.b32 %r28, %r25, 2, 31, -1; + cvt.u64.u32 %rd124, %r27; + cvt.u64.u32 %rd125, %r28; + shl.b64 %rd126, %rd125, 32; + or.b64 %rd127, %rd124, %rd126; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd128, %rd123, %rd127; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r29}, %rd128; + cvt.u32.u64 %r30, %rd128; + shfl.sync.bfly.b32 %r31, %r30, 1, 31, -1; + shfl.sync.bfly.b32 %r32, %r29, 1, 31, -1; + cvt.u64.u32 %rd129, %r31; + cvt.u64.u32 %rd130, %r32; + shl.b64 %rd131, %rd130, 32; + or.b64 %rd132, %rd129, %rd131; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd61, %rd128, %rd132; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + setp.eq.b32 %p13, %r12, 0; + shr.u32 %r33, %r8, 2; + and.b32 %r34, %r33, 120; + mov.b32 %r35, global_smem; + add.s32 %r1, %r35, %r34; + // begin inline asm + @%p13 st.shared.b64 [ %r1 + 0 ], %rd61; + // end inline asm + bar.sync 0; + setp.lt.u32 %p14, %r8, 16; + shl.b32 %r36, %r8, 3; + add.s32 %r2, %r35, %r36; + // begin inline asm + @%p14 ld.shared.b64 %rd62, [ %r2 + 0 ]; + // end inline asm + mov.b64 {_, %r37}, %rd62; + cvt.u32.u64 %r38, %rd62; + shfl.sync.bfly.b32 %r39, %r38, 8, 31, -1; + shfl.sync.bfly.b32 %r40, %r37, 8, 31, -1; + cvt.u64.u32 %rd133, %r39; + cvt.u64.u32 %rd134, %r40; + shl.b64 %rd135, %rd134, 32; + or.b64 %rd136, %rd133, %rd135; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd137, %rd62, %rd136; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r41}, %rd137; + cvt.u32.u64 %r42, %rd137; + shfl.sync.bfly.b32 %r43, %r42, 4, 31, -1; + shfl.sync.bfly.b32 %r44, %r41, 4, 31, -1; + cvt.u64.u32 %rd138, %r43; + cvt.u64.u32 %rd139, %r44; + shl.b64 %rd140, %rd139, 32; + or.b64 %rd141, %rd138, %rd140; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd142, %rd137, %rd141; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r45}, %rd142; + cvt.u32.u64 %r46, %rd142; + shfl.sync.bfly.b32 %r47, %r46, 2, 31, -1; + shfl.sync.bfly.b32 %r48, %r45, 2, 31, -1; + cvt.u64.u32 %rd143, %r47; + cvt.u64.u32 %rd144, %r48; + shl.b64 %rd145, %rd144, 32; + or.b64 %rd146, %rd143, %rd145; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd147, %rd142, %rd146; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + mov.b64 {_, %r49}, %rd147; + cvt.u32.u64 %r50, %rd147; + shfl.sync.bfly.b32 %r51, %r50, 1, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 1, 31, -1; + cvt.u64.u32 %rd148, %r51; + cvt.u64.u32 %rd149, %r52; + shl.b64 %rd150, %rd149, 32; + or.b64 %rd151, %rd148, %rd150; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + add.s64 %rd63, %rd147, %rd151; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:44:25 ] + setp.eq.b32 %p15, %r8, 0; + // begin inline asm + @%p15 st.shared.b64 [ %r2 + 0 ], %rd63; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd152, [global_smem]; +$L__tmp2: + .loc 1 52 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:52:34 + add.s64 %rd67, %rd91, %rd92; + add.s64 %rd72, %rd91, %rd93; + .loc 1 52 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:52:41 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd68, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + mov.u64 %rd66, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd65, %rd66 }, [ %rd67 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + mov.u64 %rd71, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd70, %rd71 }, [ %rd72 + 0 ], %rd73; + // end inline asm + .loc 1 52 34 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:52:34 + add.s64 %rd77, %rd67, 16384; + add.s64 %rd82, %rd72, 16384; + .loc 1 52 41 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:52:41 + // begin inline asm + mov.u64 %rd78, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd78, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + mov.u64 %rd76, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd75, %rd76 }, [ %rd77 + 0 ], %rd78; + // end inline asm + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + mov.u64 %rd81, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd80, %rd81 }, [ %rd82 + 0 ], %rd83; + // end inline asm + .loc 1 54 25 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:54:25 + add.s64 %rd153, %rd75, %rd65; + add.s64 %rd154, %rd76, %rd66; + add.s64 %rd155, %rd80, %rd70; + add.s64 %rd156, %rd81, %rd71; +$L__tmp3: + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd157, %rd153, %rd154; + add.s64 %rd158, %rd157, %rd155; + add.s64 %rd159, %rd158, %rd156; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r53}, %rd159; + cvt.u32.u64 %r54, %rd159; + shfl.sync.bfly.b32 %r55, %r54, 16, 31, -1; + shfl.sync.bfly.b32 %r56, %r53, 16, 31, -1; + cvt.u64.u32 %rd160, %r55; + cvt.u64.u32 %rd161, %r56; + shl.b64 %rd162, %rd161, 32; + or.b64 %rd163, %rd160, %rd162; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd164, %rd159, %rd163; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r57}, %rd164; + cvt.u32.u64 %r58, %rd164; + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; + shfl.sync.bfly.b32 %r60, %r57, 8, 31, -1; + cvt.u64.u32 %rd165, %r59; + cvt.u64.u32 %rd166, %r60; + shl.b64 %rd167, %rd166, 32; + or.b64 %rd168, %rd165, %rd167; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd169, %rd164, %rd168; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r61}, %rd169; + cvt.u32.u64 %r62, %rd169; + shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1; + shfl.sync.bfly.b32 %r64, %r61, 4, 31, -1; + cvt.u64.u32 %rd170, %r63; + cvt.u64.u32 %rd171, %r64; + shl.b64 %rd172, %rd171, 32; + or.b64 %rd173, %rd170, %rd172; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd174, %rd169, %rd173; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r65}, %rd174; + cvt.u32.u64 %r66, %rd174; + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; + shfl.sync.bfly.b32 %r68, %r65, 2, 31, -1; + cvt.u64.u32 %rd175, %r67; + cvt.u64.u32 %rd176, %r68; + shl.b64 %rd177, %rd176, 32; + or.b64 %rd178, %rd175, %rd177; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd179, %rd174, %rd178; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r69}, %rd179; + cvt.u32.u64 %r70, %rd179; + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + shfl.sync.bfly.b32 %r72, %r69, 1, 31, -1; + cvt.u64.u32 %rd180, %r71; + cvt.u64.u32 %rd181, %r72; + shl.b64 %rd182, %rd181, 32; + or.b64 %rd183, %rd180, %rd182; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd84, %rd179, %rd183; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + // begin inline asm + @%p13 st.shared.b64 [ %r1 + 0 ], %rd84; + // end inline asm + bar.sync 0; + // begin inline asm + @%p14 ld.shared.b64 %rd85, [ %r2 + 0 ]; + // end inline asm + mov.b64 {_, %r73}, %rd85; + cvt.u32.u64 %r74, %rd85; + shfl.sync.bfly.b32 %r75, %r74, 8, 31, -1; + shfl.sync.bfly.b32 %r76, %r73, 8, 31, -1; + cvt.u64.u32 %rd184, %r75; + cvt.u64.u32 %rd185, %r76; + shl.b64 %rd186, %rd185, 32; + or.b64 %rd187, %rd184, %rd186; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd188, %rd85, %rd187; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r77}, %rd188; + cvt.u32.u64 %r78, %rd188; + shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1; + shfl.sync.bfly.b32 %r80, %r77, 4, 31, -1; + cvt.u64.u32 %rd189, %r79; + cvt.u64.u32 %rd190, %r80; + shl.b64 %rd191, %rd190, 32; + or.b64 %rd192, %rd189, %rd191; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd193, %rd188, %rd192; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r81}, %rd193; + cvt.u32.u64 %r82, %rd193; + shfl.sync.bfly.b32 %r83, %r82, 2, 31, -1; + shfl.sync.bfly.b32 %r84, %r81, 2, 31, -1; + cvt.u64.u32 %rd194, %r83; + cvt.u64.u32 %rd195, %r84; + shl.b64 %rd196, %rd195, 32; + or.b64 %rd197, %rd194, %rd196; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd198, %rd193, %rd197; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + mov.b64 {_, %r85}, %rd198; + cvt.u32.u64 %r86, %rd198; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + shfl.sync.bfly.b32 %r88, %r85, 1, 31, -1; + cvt.u64.u32 %rd199, %r87; + cvt.u64.u32 %rd200, %r88; + shl.b64 %rd201, %rd200, 32; + or.b64 %rd202, %rd199, %rd201; + .loc 2 261 15 // standard.py:261:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + add.s64 %rd86, %rd198, %rd202; + .loc 2 291 36 // standard.py:291:36 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:56:27 ] + // begin inline asm + @%p15 st.shared.b64 [ %r2 + 0 ], %rd86; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd203, [global_smem]; +$L__tmp4: + .loc 1 57 20 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:57:20 + cvt.rn.f32.s64 %r89, %rd152; + .loc 1 58 21 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:58:21 + cvt.rn.f32.s64 %r90, %rd203; +$L__tmp5: + .loc 3 110 15 // triton_helpers.py:110:15 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:60:42 ] + setp.gt.s64 %p32, %rd203, 0; + .loc 3 113 29 // triton_helpers.py:113:29 @[ cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:60:42 ] + selp.f32 %r91, %r90, 0f358637BD, %p32; +$L__tmp6: + .loc 1 61 21 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:61:21 + div.full.f32 %r7, %r89, %r91; + .loc 1 62 68 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:62:68 + and.b32 %r92, %r8, 511; + setp.eq.b32 %p23, %r92, 0; + // begin inline asm + @%p23 st.global.b32 [ %rd87 + 0 ], { %r7 }; + // end inline asm + .loc 1 62 4 // cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py:62:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 284 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x115 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 105 +.b8 105 +.b8 120 +.b8 106 +.b8 114 +.b8 120 +.b8 109 +.b8 113 +.b8 120 +.b8 107 +.b8 114 +.b8 117 +.b8 97 +.b8 54 +.b8 113 +.b8 117 +.b8 50 +.b8 113 +.b8 112 +.b8 52 +.b8 51 +.b8 121 +.b8 106 +.b8 53 +.b8 100 +.b8 112 +.b8 106 +.b8 106 +.b8 106 +.b8 103 +.b8 54 +.b8 110 +.b8 109 +.b8 121 +.b8 103 +.b8 100 +.b8 102 +.b8 103 +.b8 106 +.b8 106 +.b8 119 +.b8 50 +.b8 117 +.b8 117 +.b8 102 +.b8 111 +.b8 54 +.b8 110 +.b8 106 +.b8 104 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x36 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 99 +.b8 108 +.b8 97 +.b8 109 +.b8 112 +.b8 95 +.b8 109 +.b8 105 +.b8 110 +.b8 95 +.b8 100 +.b8 105 +.b8 118 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc1:0x5e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xee:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 56 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x106:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 60 // DW_AT_call_line +.b8 42 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..1ebd523074cb858a0e1f4fe2532f8e7535057d9b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.source @@ -0,0 +1,296 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":18:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc51 = loc(unknown) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("in_ptr1"(#loc)) +#loc82 = loc("in_ptr2"(#loc)) +#loc83 = loc("in_ptr3"(#loc)) +#loc84 = loc("out_ptr2"(#loc)) +#loc85 = loc("r0_numel"(#loc)) +#loc128 = loc("input"(#loc49)) +#loc129 = loc("a"(#loc54)) +#loc130 = loc("b"(#loc54)) +#loc131 = loc("a"(#loc58)) +#loc132 = loc("b"(#loc58)) +#loc136 = loc("x"(#loc67)) +#loc137 = loc("x"(#loc71)) +module { + tt.func public @triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel = arith.constant 1 : i32 loc(#loc86) + %r0_numel_0 = arith.constant 4096 : i32 loc(#loc87) + %xoffset = tt.get_program_id x : i32 loc(#loc88) + %xoffset_1 = arith.constant 1 : i32 loc(#loc89) + %xoffset_2 = arith.constant 1 : i32 loc(#loc89) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc89) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc90) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc91) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc92) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc92) + %xmask = arith.constant true loc(#loc93) + %xmask_7 = arith.constant dense : tensor<1x2048xi1> loc(#loc93) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc94) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc95) + %_tmp7 = arith.constant 0 : i64 loc(#loc96) + %_tmp7_9 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc96) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp7_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp7_18 = %_tmp7_9) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc98) + %r0_index_19 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc98) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc99) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x2048xi32> loc(#loc99) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc100) + %tmp0_21 = tt.addptr %tmp0, %r0_index_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc100) + %tmp0_22 = arith.constant 0.000000e+00 : f32 loc(#loc101) + %tmp0_23 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc101) + %tmp0_24 = arith.fptosi %tmp0_23 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc101) + %tmp0_25 = tt.load %tmp0_21, %r0_mask_20, %tmp0_24 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc101) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc102) + %tmp1_26 = tt.addptr %tmp1, %r0_index_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc102) + %tmp1_27 = arith.constant 0.000000e+00 : f32 loc(#loc103) + %tmp1_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc103) + %tmp1_29 = arith.fptosi %tmp1_28 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc103) + %tmp1_30 = tt.load %tmp1_26, %r0_mask_20, %tmp1_29 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc104) + %tmp4_31 = tt.addptr %tmp4, %r0_index_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc104) + %tmp4_32 = arith.constant 0.000000e+00 : f32 loc(#loc105) + %tmp4_33 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc105) + %tmp4_34 = arith.fptosi %tmp4_33 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc105) + %tmp4_35 = tt.load %tmp4_31, %r0_mask_20, %tmp4_34 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc105) + %tmp2 = arith.cmpi eq, %tmp0_25, %tmp1_30 : tensor<1x2048xi64> loc(#loc106) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc107) + %tmp5 = arith.muli %tmp3, %tmp4_35 : tensor<1x2048xi64> loc(#loc108) + %tmp8 = arith.addi %_tmp7_18, %tmp5 : tensor<1x2048xi64> loc(#loc109) + %_tmp7_36 = arith.select %r0_mask_20, %tmp8, %_tmp7_18 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc110) + scf.yield %_tmp7_36 : tensor<1x2048xi64> loc(#loc26) + } loc(#loc97) + %tmp7 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp7_10) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc111) + %tmp7_11 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc112) + %_tmp11 = arith.constant 0 : i64 loc(#loc113) + %_tmp11_12 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc113) + %c0_i32_13 = arith.constant 0 : i32 loc(#loc30) + %c2048_i32_14 = arith.constant 2048 : i32 loc(#loc30) + %4 = arith.bitcast %c0_i32_13 : i32 to i32 loc(#loc30) + %5 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc30) + %6 = arith.bitcast %c2048_i32_14 : i32 to i32 loc(#loc30) + %7 = ub.poison : i32 loc(#loc30) + %_tmp11_15 = scf.for %r0_offset = %4 to %5 step %6 iter_args(%_tmp11_18 = %_tmp11_12) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc115) + %r0_index_19 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc115) + %r0_mask = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc116) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x2048xi32> loc(#loc116) + %tmp9 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc117) + %tmp9_21 = tt.addptr %tmp9, %r0_index_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc117) + %tmp9_22 = arith.constant 0.000000e+00 : f32 loc(#loc118) + %tmp9_23 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc118) + %tmp9_24 = arith.fptosi %tmp9_23 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc118) + %tmp9_25 = tt.load %tmp9_21, %r0_mask_20, %tmp9_24 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc118) + %tmp12 = arith.addi %_tmp11_18, %tmp9_25 : tensor<1x2048xi64> loc(#loc119) + %_tmp11_26 = arith.select %r0_mask_20, %tmp12, %_tmp11_18 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc120) + scf.yield %_tmp11_26 : tensor<1x2048xi64> loc(#loc37) + } loc(#loc114) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp11_15) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc121) + %tmp11_16 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc122) + %tmp13 = arith.sitofp %tmp7_11 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc123) + %tmp14 = arith.sitofp %tmp11_16 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc124) + %tmp15 = arith.constant 9.99999997E-7 : f32 loc(#loc125) + %tmp16 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_1S_fp32__(%tmp14, %tmp15) : (tensor<1x1xf32>, f32) -> tensor<1x1xf32> loc(#loc126) + %tmp17 = arith.divf %tmp13, %tmp16 : tensor<1x1xf32> loc(#loc127) + %c0_i32_17 = arith.constant 0 : i32 loc(#loc45) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc45) + %8 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc46) + %9 = tt.addptr %8, %cst : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc46) + tt.store %9, %tmp17 : tensor<1x1x!tt.ptr> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc49))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc50) + tt.reduce.return %2 : i64 loc(#loc50) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc50) + tt.return %0 : tensor<1xi64> loc(#loc52) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc53) + tt.return %1 : tensor<1xi64> loc(#loc53) + } loc(#loc49) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc54)), %b: i64 loc("b"(#loc54))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc55) + tt.return %0 : i64 loc(#loc56) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc57) + tt.return %1 : i64 loc(#loc57) + } loc(#loc54) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_1S_fp32__(%a: tensor<1x1xf32> loc("a"(#loc58)), %b: f32 loc("b"(#loc58))) -> tensor<1x1xf32> attributes {noinline = false} { + %mask = tt.splat %b : f32 -> tensor<1x1xf32> loc(#loc133) + %mask_0 = arith.cmpf ogt, %a, %mask : tensor<1x1xf32> loc(#loc138) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_1S__(%a) : (tensor<1x1xf32>) -> i1 loc(#loc60) + %1 = scf.if %0 -> (tensor<1x1xi1>) { + %mask_1 = arith.cmpf une, %a, %a : tensor<1x1xf32> loc(#loc134) + %mask_2 = arith.ori %mask_0, %mask_1 : tensor<1x1xi1> loc(#loc139) + scf.yield %mask_2 : tensor<1x1xi1> loc(#loc139) + } else { + scf.yield %mask_0 : tensor<1x1xi1> loc(#loc51) + } loc(#loc61) + %2 = tt.splat %b : f32 -> tensor<1x1xf32> loc(#loc64) + %3 = arith.select %1, %a, %2 : tensor<1x1xi1>, tensor<1x1xf32> loc(#loc64) + tt.return %3 : tensor<1x1xf32> loc(#loc65) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x1xf32> loc(#loc66) + tt.return %4 : tensor<1x1xf32> loc(#loc66) + } loc(#loc58) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_1S__(%x: tensor<1x1xf32> loc("x"(#loc67))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_1S__(%x) : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc68) + %true = arith.constant true loc(#loc69) + tt.return %true : i1 loc(#loc69) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc70) + tt.return %1 : i1 loc(#loc70) + } loc(#loc67) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_1S__(%x: tensor<1x1xf32> loc("x"(#loc71))) -> tensor<1x1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc72) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc73) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc73) + %3 = arith.addf %x, %2 : tensor<1x1xf32> loc(#loc73) + tt.return %3 : tensor<1x1xf32> loc(#loc74) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x1xf32> loc(#loc75) + tt.return %4 : tensor<1x1xf32> loc(#loc75) + } loc(#loc71) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc77) + %cst = arith.constant dense : tensor<1xi1> loc(#loc77) + tt.return %cst : tensor<1xi1> loc(#loc78) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc79) + tt.return %0 : tensor<1xi1> loc(#loc79) + } loc(#loc76) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":29:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":30:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":31:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":38:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":39:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":40:22) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":42:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:8) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":45:44) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":46:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":47:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":48:29) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":54:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:8) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:27) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":57:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":58:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":59:12) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":60:42) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":61:21) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:49) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:68) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc86 = loc("xnumel"(#loc1)) +#loc87 = loc("r0_numel"(#loc2)) +#loc88 = loc("xoffset"(#loc3)) +#loc89 = loc("xoffset"(#loc4)) +#loc90 = loc("xindex"(#loc5)) +#loc91 = loc("xindex"(#loc6)) +#loc92 = loc("xindex"(#loc7)) +#loc93 = loc("xmask"(#loc8)) +#loc94 = loc("r0_base"(#loc9)) +#loc95 = loc("r0_base"(#loc10)) +#loc96 = loc("_tmp7"(#loc11)) +#loc97 = loc("_tmp7"(#loc12)) +#loc98 = loc("r0_index"(#loc13)) +#loc99 = loc("r0_mask"(#loc14)) +#loc100 = loc("tmp0"(#loc15)) +#loc101 = loc("tmp0"(#loc16)) +#loc102 = loc("tmp1"(#loc17)) +#loc103 = loc("tmp1"(#loc18)) +#loc104 = loc("tmp4"(#loc19)) +#loc105 = loc("tmp4"(#loc20)) +#loc106 = loc("tmp2"(#loc21)) +#loc107 = loc("tmp3"(#loc22)) +#loc108 = loc("tmp5"(#loc23)) +#loc109 = loc("tmp8"(#loc24)) +#loc110 = loc("_tmp7"(#loc25)) +#loc111 = loc("tmp7"(#loc27)) +#loc112 = loc("tmp7"(#loc28)) +#loc113 = loc("_tmp11"(#loc29)) +#loc114 = loc("_tmp11"(#loc30)) +#loc115 = loc("r0_index"(#loc31)) +#loc116 = loc("r0_mask"(#loc32)) +#loc117 = loc("tmp9"(#loc33)) +#loc118 = loc("tmp9"(#loc34)) +#loc119 = loc("tmp12"(#loc35)) +#loc120 = loc("_tmp11"(#loc36)) +#loc121 = loc("tmp11"(#loc38)) +#loc122 = loc("tmp11"(#loc39)) +#loc123 = loc("tmp13"(#loc40)) +#loc124 = loc("tmp14"(#loc41)) +#loc125 = loc("tmp15"(#loc42)) +#loc126 = loc("tmp16"(#loc43)) +#loc127 = loc("tmp17"(#loc44)) +#loc133 = loc("mask"(#loc59)) +#loc134 = loc("mask"(#loc62)) +#loc135 = loc("mask"(#loc63)) +#loc138 = loc("mask"(#loc133)) +#loc139 = loc("mask"(#loc135)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..3ed7df0e009a61e3fce215e7df9540a543ebccc2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttgir @@ -0,0 +1,161 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:25) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:27) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("in_ptr1"(#loc)) +#loc44 = loc("in_ptr2"(#loc)) +#loc45 = loc("in_ptr3"(#loc)) +#loc46 = loc("out_ptr2"(#loc)) +#loc47 = loc("r0_numel"(#loc)) +#loc63 = loc("tmp7"(#loc19)) +#loc72 = loc("tmp11"(#loc30)) +#loc82 = loc(callsite(#loc1 at #loc63)) +#loc84 = loc(callsite(#loc1 at #loc72)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<4096> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_0 = arith.constant dense<9.99999997E-7> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc48) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc49) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc50) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc51) + %_tmp7 = scf.for %_tmp7_7 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_1) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp7_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_index_8 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst : tensor<1x2048xi32, #blocked> loc(#loc54) + %tmp0_9 = tt.addptr %tmp0, %r0_index_8 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc49) + %tmp0_10 = tt.load %tmp0_9, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + %tmp1_11 = tt.addptr %tmp1, %r0_index_8 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc50) + %tmp1_12 = tt.load %tmp1_11, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc56) + %tmp4_13 = tt.addptr %tmp4, %r0_index_8 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc51) + %tmp4_14 = tt.load %tmp4_13, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc57) + %tmp2 = arith.cmpi eq, %tmp0_10, %tmp1_12 : tensor<1x2048xi64, #blocked> loc(#loc58) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc59) + %tmp5 = arith.muli %tmp3, %tmp4_14 : tensor<1x2048xi64, #blocked> loc(#loc60) + %tmp8 = arith.addi %arg7, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc61) + %_tmp7_15 = arith.select %r0_mask, %tmp8, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc62) + scf.yield %_tmp7_15 : tensor<1x2048xi64, #blocked> loc(#loc17) + } loc(#loc52) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_7: i64 loc(callsite(#loc1 at #loc63)), %tmp7_8: i64 loc(callsite(#loc1 at #loc63))): + %tmp7_9 = arith.addi %tmp7_7, %tmp7_8 : i64 loc(#loc89) + tt.reduce.return %tmp7_9 : i64 loc(#loc81) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc81) + %tmp7_3 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc64) + %tmp9 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc65) + %_tmp11 = scf.for %_tmp11_7 = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%arg7 = %cst_1) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp11_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_index_8 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc69) + %tmp12 = arith.addi %arg7, %tmp9_10 : tensor<1x2048xi64, #blocked> loc(#loc70) + %_tmp11_11 = arith.select %r0_mask, %tmp12, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc71) + scf.yield %_tmp11_11 : tensor<1x2048xi64, #blocked> loc(#loc29) + } loc(#loc66) + %tmp11 = "tt.reduce"(%_tmp11) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_7: i64 loc(callsite(#loc1 at #loc72)), %tmp11_8: i64 loc(callsite(#loc1 at #loc72))): + %tmp11_9 = arith.addi %tmp11_7, %tmp11_8 : i64 loc(#loc90) + tt.reduce.return %tmp11_9 : i64 loc(#loc83) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc83) + %tmp11_4 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc73) + %tmp13 = arith.sitofp %tmp7_3 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked> loc(#loc74) + %tmp14 = arith.sitofp %tmp11_4 : tensor<1x1xi64, #blocked> to tensor<1x1xf32, #blocked> loc(#loc75) + %mask = arith.cmpf ogt, %tmp14, %cst_0 : tensor<1x1xf32, #blocked> loc(#loc91) + %mask_5 = arith.cmpf une, %tmp14, %tmp14 : tensor<1x1xf32, #blocked> loc(#loc86) + %mask_6 = arith.ori %mask, %mask_5 : tensor<1x1xi1, #blocked> loc(#loc92) + %tmp16 = arith.select %mask_6, %tmp14, %cst_0 : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked> loc(#loc88) + %tmp17 = arith.divf %tmp13, %tmp16 : tensor<1x1xf32, #blocked> loc(#loc80) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc40) + %1 = ttg.convert_layout %tmp17 : tensor<1x1xf32, #blocked> -> tensor<1x1xf32, #blocked1> loc(#loc40) + tt.store %0, %1 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":26:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:34) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:41) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":38:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":39:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":40:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":42:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":46:40) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":47:31) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":48:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":54:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:42) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:8) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":57:20) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":58:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":60:42) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":61:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:68) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:4) +#loc48 = loc("r0_base"(#loc2)) +#loc49 = loc("tmp0"(#loc3)) +#loc50 = loc("tmp1"(#loc4)) +#loc51 = loc("tmp4"(#loc5)) +#loc52 = loc("_tmp7"(#loc6)) +#loc53 = loc("r0_index"(#loc7)) +#loc54 = loc("r0_mask"(#loc8)) +#loc55 = loc("tmp0"(#loc9)) +#loc56 = loc("tmp1"(#loc10)) +#loc57 = loc("tmp4"(#loc11)) +#loc58 = loc("tmp2"(#loc12)) +#loc59 = loc("tmp3"(#loc13)) +#loc60 = loc("tmp5"(#loc14)) +#loc61 = loc("tmp8"(#loc15)) +#loc62 = loc("_tmp7"(#loc16)) +#loc64 = loc("tmp7"(#loc21)) +#loc65 = loc("tmp9"(#loc22)) +#loc66 = loc("_tmp11"(#loc23)) +#loc67 = loc("r0_index"(#loc24)) +#loc68 = loc("r0_mask"(#loc25)) +#loc69 = loc("tmp9"(#loc26)) +#loc70 = loc("tmp12"(#loc27)) +#loc71 = loc("_tmp11"(#loc28)) +#loc73 = loc("tmp11"(#loc31)) +#loc74 = loc("tmp13"(#loc32)) +#loc75 = loc("tmp14"(#loc33)) +#loc76 = loc("mask"(#loc34)) +#loc77 = loc("tmp16"(#loc35)) +#loc78 = loc("mask"(#loc36)) +#loc79 = loc("mask"(#loc37)) +#loc80 = loc("tmp17"(#loc39)) +#loc81 = loc(callsite(#loc18 at #loc63)) +#loc83 = loc(callsite(#loc18 at #loc72)) +#loc85 = loc("mask"(#loc76)) +#loc86 = loc(callsite(#loc78 at #loc77)) +#loc87 = loc("mask"(#loc79)) +#loc88 = loc(callsite(#loc38 at #loc77)) +#loc89 = loc(callsite(#loc20 at #loc81)) +#loc90 = loc(callsite(#loc20 at #loc83)) +#loc91 = loc(callsite(#loc85 at #loc77)) +#loc92 = loc(callsite(#loc87 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..053ff685701b1c769cde1c7652d7756c2d60bc2c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BUOGL2N7HINFFTMAPLS2YH5LGRFYJWRVWHONLKNH7KWKGT5ZCERA/triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2.ttir @@ -0,0 +1,161 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:27) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("in_ptr1"(#loc)) +#loc46 = loc("in_ptr2"(#loc)) +#loc47 = loc("in_ptr3"(#loc)) +#loc48 = loc("out_ptr2"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc66 = loc("tmp7"(#loc20)) +#loc75 = loc("tmp11"(#loc31)) +#loc85 = loc(callsite(#loc1 at #loc66)) +#loc87 = loc(callsite(#loc1 at #loc75)) +module { + tt.func public @triton_red_fused_clamp_min_div_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<9.99999997E-7> : tensor<1x1xf32> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x2048xi32> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc50) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc51) + %_tmp7 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%_tmp7_7 = %cst_1) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc53) + %r0_index_8 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc53) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_0 : tensor<1x2048xi32> loc(#loc54) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc55) + %tmp0_9 = tt.addptr %tmp0, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc55) + %tmp0_10 = tt.load %tmp0_9, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc56) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc57) + %tmp1_11 = tt.addptr %tmp1, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc57) + %tmp1_12 = tt.load %tmp1_11, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc58) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc59) + %tmp4_13 = tt.addptr %tmp4, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc59) + %tmp4_14 = tt.load %tmp4_13, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc60) + %tmp2 = arith.cmpi eq, %tmp0_10, %tmp1_12 : tensor<1x2048xi64> loc(#loc61) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc62) + %tmp5 = arith.muli %tmp3, %tmp4_14 : tensor<1x2048xi64> loc(#loc63) + %tmp8 = arith.addi %_tmp7_7, %tmp5 : tensor<1x2048xi64> loc(#loc64) + %_tmp7_15 = arith.select %r0_mask, %tmp8, %_tmp7_7 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc65) + scf.yield %_tmp7_15 : tensor<1x2048xi64> loc(#loc18) + } loc(#loc52) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_7: i64 loc(callsite(#loc1 at #loc66)), %tmp7_8: i64 loc(callsite(#loc1 at #loc66))): + %tmp7_9 = arith.addi %tmp7_7, %tmp7_8 : i64 loc(#loc92) + tt.reduce.return %tmp7_9 : i64 loc(#loc84) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc84) + %tmp7_3 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc67) + %_tmp11 = scf.for %r0_offset = %c0_i32 to %c4096_i32 step %c2048_i32 iter_args(%_tmp11_7 = %cst_1) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc69) + %r0_index_8 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc69) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_0 : tensor<1x2048xi32> loc(#loc70) + %tmp9 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc71) + %tmp9_9 = tt.addptr %tmp9, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc71) + %tmp9_10 = tt.load %tmp9_9, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc72) + %tmp12 = arith.addi %_tmp11_7, %tmp9_10 : tensor<1x2048xi64> loc(#loc73) + %_tmp11_11 = arith.select %r0_mask, %tmp12, %_tmp11_7 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc74) + scf.yield %_tmp11_11 : tensor<1x2048xi64> loc(#loc30) + } loc(#loc68) + %tmp11 = "tt.reduce"(%_tmp11) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_7: i64 loc(callsite(#loc1 at #loc75)), %tmp11_8: i64 loc(callsite(#loc1 at #loc75))): + %tmp11_9 = arith.addi %tmp11_7, %tmp11_8 : i64 loc(#loc93) + tt.reduce.return %tmp11_9 : i64 loc(#loc86) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc86) + %tmp11_4 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc76) + %tmp13 = arith.sitofp %tmp7_3 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc77) + %tmp14 = arith.sitofp %tmp11_4 : tensor<1x1xi64> to tensor<1x1xf32> loc(#loc78) + %mask = arith.cmpf ogt, %tmp14, %cst : tensor<1x1xf32> loc(#loc94) + %mask_5 = arith.cmpf une, %tmp14, %tmp14 : tensor<1x1xf32> loc(#loc89) + %mask_6 = arith.ori %mask, %mask_5 : tensor<1x1xi1> loc(#loc95) + %tmp16 = arith.select %mask_6, %tmp14, %cst : tensor<1x1xi1>, tensor<1x1xf32> loc(#loc91) + %tmp17 = arith.divf %tmp13, %tmp16 : tensor<1x1xf32> loc(#loc83) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + tt.store %0, %tmp17 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":26:27) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":29:40) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":30:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":31:29) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":35:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":36:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":37:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":38:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":39:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":40:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":42:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:40) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":43:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":44:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":46:40) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":47:31) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":48:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":52:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":54:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:42) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":55:8) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":56:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":57:20) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":58:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":60:42) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:68) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ci/cciixjrxmqxkrua6qu2qp43yj5dpjjjg6nmygdfgjjw2uufo6njh.py":62:4) +#loc50 = loc("r0_base"(#loc2)) +#loc51 = loc("r0_base"(#loc3)) +#loc52 = loc("_tmp7"(#loc4)) +#loc53 = loc("r0_index"(#loc5)) +#loc54 = loc("r0_mask"(#loc6)) +#loc55 = loc("tmp0"(#loc7)) +#loc56 = loc("tmp0"(#loc8)) +#loc57 = loc("tmp1"(#loc9)) +#loc58 = loc("tmp1"(#loc10)) +#loc59 = loc("tmp4"(#loc11)) +#loc60 = loc("tmp4"(#loc12)) +#loc61 = loc("tmp2"(#loc13)) +#loc62 = loc("tmp3"(#loc14)) +#loc63 = loc("tmp5"(#loc15)) +#loc64 = loc("tmp8"(#loc16)) +#loc65 = loc("_tmp7"(#loc17)) +#loc67 = loc("tmp7"(#loc22)) +#loc68 = loc("_tmp11"(#loc23)) +#loc69 = loc("r0_index"(#loc24)) +#loc70 = loc("r0_mask"(#loc25)) +#loc71 = loc("tmp9"(#loc26)) +#loc72 = loc("tmp9"(#loc27)) +#loc73 = loc("tmp12"(#loc28)) +#loc74 = loc("_tmp11"(#loc29)) +#loc76 = loc("tmp11"(#loc32)) +#loc77 = loc("tmp13"(#loc33)) +#loc78 = loc("tmp14"(#loc34)) +#loc79 = loc("mask"(#loc35)) +#loc80 = loc("tmp16"(#loc36)) +#loc81 = loc("mask"(#loc37)) +#loc82 = loc("mask"(#loc38)) +#loc83 = loc("tmp17"(#loc40)) +#loc84 = loc(callsite(#loc19 at #loc66)) +#loc86 = loc(callsite(#loc19 at #loc75)) +#loc88 = loc("mask"(#loc79)) +#loc89 = loc(callsite(#loc81 at #loc80)) +#loc90 = loc("mask"(#loc82)) +#loc91 = loc(callsite(#loc39 at #loc80)) +#loc92 = loc(callsite(#loc21 at #loc84)) +#loc93 = loc(callsite(#loc21 at #loc86)) +#loc94 = loc(callsite(#loc88 at #loc80)) +#loc95 = loc(callsite(#loc90 at #loc80)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b94564104598d6f0b19c8a3b01cf742f809e7192 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..26e0b69413b823710383a50ed5f15a9f645035cf Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a55bd471005e7d94d3926d581750ce8e28160939 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "0ded994aca0cd06fe06e27a0359763c50eba37bf33c83c5b7fc8e3076af68985", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..af413ef6a5e4a456b5ec844abb470924609318f7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,244 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 6, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 504, !dbg !9 + %14 = lshr exact i32 %13, 3, !dbg !9 + %15 = or disjoint i32 %14, %11, !dbg !10 + %16 = icmp slt i32 %15, %5, !dbg !11 + %17 = shl nuw nsw i32 %12, 3, !dbg !12 + %18 = and i32 %17, 56, !dbg !12 + %19 = sext i32 %15 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %20 = sdiv i64 %19, %.frozen, !dbg !14 + %21 = mul i64 %20, %.frozen, !dbg !13 + %.decomposed = sub i64 %19, %21, !dbg !13 + %22 = srem i64 %20, 32, !dbg !15 + %23 = sdiv i64 %19, %4, !dbg !16 + %.not = icmp ne i64 %.decomposed, 0, !dbg !17 + %24 = icmp slt i32 %11, 0, !dbg !21 + %25 = icmp slt i64 %3, 0, !dbg !22 + %26 = xor i1 %24, %25, !dbg !23 + %narrow = select i1 %26, i1 %.not, i1 false, !dbg !24 + %27 = sext i1 %narrow to i64, !dbg !24 + %28 = add nsw i64 %20, %27, !dbg !24 + %29 = shl i64 %3, 12, !dbg !25 + %30 = mul i64 %29, %23, !dbg !26 + %31 = icmp slt i64 %3, 2, !dbg !27 + %32 = icmp sgt i64 %3, 1, !dbg !28 + %33 = select i1 %32, i64 %3, i64 0, !dbg !29 + %34 = zext i1 %31 to i64, !dbg !30 + %35 = add i64 %33, %34, !dbg !31 + %36 = shl i64 %35, 7, !dbg !32 + %37 = mul i64 %36, %28, !dbg !33 + %.idx = shl nsw i64 %22, 8 + %38 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %.idx1 = shl nsw i64 %.decomposed, 13 + %invariant.gep = getelementptr i8, ptr addrspace(1) %38, i64 %.idx1, !dbg !34 + %invariant.gep3 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !34 + %.idx2 = shl nsw i64 %.decomposed, 8 + %39 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2 + %invariant.gep5 = getelementptr bfloat, ptr addrspace(1) %39, i64 %37, !dbg !34 + %40 = zext nneg i32 %18 to i64, !dbg !34 + %gep4 = getelementptr bfloat, ptr addrspace(1) %invariant.gep3, i64 %40, !dbg !35 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %gep4, i64 %41, i1 %16) #4, !dbg !36 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !36 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !36 + %45 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !36 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !36 + %47 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !36 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !36 + %49 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !36 + %50 = bitcast i32 %49 to <2 x bfloat>, !dbg !36 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep5, i64 %40, !dbg !37 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %gep, i64 %51, i1 %16) #4, !dbg !38 + %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !38 + %54 = bitcast i32 %53 to <2 x bfloat>, !dbg !38 + %55 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !38 + %56 = bitcast i32 %55 to <2 x bfloat>, !dbg !38 + %57 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !38 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !38 + %59 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !38 + %60 = bitcast i32 %59 to <2 x bfloat>, !dbg !38 + %61 = or disjoint i64 %40, 64, !dbg !39 + %gep4.1 = getelementptr bfloat, ptr addrspace(1) %invariant.gep3, i64 %61, !dbg !35 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !36 + %63 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %gep4.1, i64 %62, i1 %16) #4, !dbg !36 + %64 = extractvalue { i32, i32, i32, i32 } %63, 0, !dbg !36 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !36 + %66 = extractvalue { i32, i32, i32, i32 } %63, 1, !dbg !36 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !36 + %68 = extractvalue { i32, i32, i32, i32 } %63, 2, !dbg !36 + %69 = bitcast i32 %68 to <2 x bfloat>, !dbg !36 + %70 = extractvalue { i32, i32, i32, i32 } %63, 3, !dbg !36 + %71 = bitcast i32 %70 to <2 x bfloat>, !dbg !36 + %gep.1 = getelementptr bfloat, ptr addrspace(1) %invariant.gep5, i64 %61, !dbg !37 + %72 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %73 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %gep.1, i64 %72, i1 %16) #4, !dbg !38 + %74 = extractvalue { i32, i32, i32, i32 } %73, 0, !dbg !38 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !38 + %76 = extractvalue { i32, i32, i32, i32 } %73, 1, !dbg !38 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !38 + %78 = extractvalue { i32, i32, i32, i32 } %73, 2, !dbg !38 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !38 + %80 = extractvalue { i32, i32, i32, i32 } %73, 3, !dbg !38 + %81 = bitcast i32 %80 to <2 x bfloat>, !dbg !38 + %82 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !40 + %83 = fpext <2 x bfloat> %54 to <2 x float>, !dbg !41 + %84 = fmul <2 x float> %82, %83, !dbg !42 + %85 = fadd <2 x float> %84, zeroinitializer, !dbg !43 + %86 = fpext <2 x bfloat> %65 to <2 x float>, !dbg !40 + %87 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !41 + %88 = fmul <2 x float> %86, %87, !dbg !42 + %89 = fadd <2 x float> %85, %88, !dbg !43 + %90 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !40 + %91 = fpext <2 x bfloat> %56 to <2 x float>, !dbg !41 + %92 = fmul <2 x float> %90, %91, !dbg !42 + %93 = fadd <2 x float> %92, zeroinitializer, !dbg !43 + %94 = fpext <2 x bfloat> %67 to <2 x float>, !dbg !40 + %95 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !41 + %96 = fmul <2 x float> %94, %95, !dbg !42 + %97 = fadd <2 x float> %93, %96, !dbg !43 + %98 = fpext <2 x bfloat> %48 to <2 x float>, !dbg !40 + %99 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !41 + %100 = fmul <2 x float> %98, %99, !dbg !42 + %101 = fadd <2 x float> %100, zeroinitializer, !dbg !43 + %102 = fpext <2 x bfloat> %69 to <2 x float>, !dbg !40 + %103 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !41 + %104 = fmul <2 x float> %102, %103, !dbg !42 + %105 = fadd <2 x float> %101, %104, !dbg !43 + %106 = fpext <2 x bfloat> %50 to <2 x float>, !dbg !40 + %107 = fpext <2 x bfloat> %60 to <2 x float>, !dbg !41 + %108 = fmul <2 x float> %106, %107, !dbg !42 + %109 = fadd <2 x float> %108, zeroinitializer, !dbg !43 + %110 = fpext <2 x bfloat> %71 to <2 x float>, !dbg !40 + %111 = fpext <2 x bfloat> %81 to <2 x float>, !dbg !41 + %112 = fmul <2 x float> %110, %111, !dbg !42 + %113 = fadd <2 x float> %109, %112, !dbg !43 + %114 = and i32 %12, 63, !dbg !9 + %115 = or disjoint i32 %11, %114, !dbg !10 + %116 = icmp slt i32 %115, %5, !dbg !11 + %shift = shufflevector <2 x float> %89, <2 x float> poison, <2 x i32> , !dbg !44 + %foldExtExtBinop = fadd <2 x float> %89, %shift, !dbg !44 + %foldExtExtBinop14 = fadd <2 x float> %97, %foldExtExtBinop, !dbg !44 + %shift16 = shufflevector <2 x float> %97, <2 x float> poison, <2 x i32> , !dbg !44 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !44 + %foldExtExtBinop19 = fadd <2 x float> %105, %foldExtExtBinop17, !dbg !44 + %shift21 = shufflevector <2 x float> %105, <2 x float> poison, <2 x i32> , !dbg !44 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !44 + %foldExtExtBinop24 = fadd <2 x float> %113, %foldExtExtBinop22, !dbg !44 + %shift26 = shufflevector <2 x float> %113, <2 x float> poison, <2 x i32> , !dbg !44 + %foldExtExtBinop27 = fadd <2 x float> %shift26, %foldExtExtBinop24, !dbg !44 + %117 = extractelement <2 x float> %foldExtExtBinop27, i64 0, !dbg !44 + %118 = select i1 %16, float %117, float 0.000000e+00, !dbg !44 + %119 = bitcast float %118 to i32, !dbg !48 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 4, i32 31), !dbg !48 + %121 = bitcast i32 %120 to float, !dbg !48 + %122 = fadd float %118, %121, !dbg !44 + %123 = bitcast float %122 to i32, !dbg !48 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 2, i32 31), !dbg !48 + %125 = bitcast i32 %124 to float, !dbg !48 + %126 = fadd float %122, %125, !dbg !44 + %127 = bitcast float %126 to i32, !dbg !48 + %128 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %127, i32 1, i32 31), !dbg !48 + %129 = bitcast i32 %128 to float, !dbg !48 + %130 = fadd float %126, %129, !dbg !44 + %131 = lshr exact i32 %13, 1, !dbg !49 + %132 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %131, !dbg !49 + store float %130, ptr addrspace(3) %132, align 4, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %133 = shl nuw nsw i32 %114, 2, !dbg !49 + %134 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %133, !dbg !49 + %135 = load i32, ptr addrspace(3) %134, align 4, !dbg !49 + %136 = sext i32 %115 to i64, !dbg !50 + %137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !50 + %138 = and i32 %12, 448, !dbg !51 + %139 = icmp eq i32 %138, 0, !dbg !51 + %140 = and i1 %139, %116, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %135, ptr addrspace(1) %137, i1 %140) #4, !dbg !51 + ret void, !dbg !52 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 21, scope: !4) +!15 = !DILocation(line: 28, column: 28, scope: !4) +!16 = !DILocation(line: 29, column: 19, scope: !4) +!17 = !DILocation(line: 74, column: 34, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 30, column: 51, scope: !4) +!21 = !DILocation(line: 75, column: 25, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 75, column: 36, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 75, column: 32, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 75, column: 47, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 39, column: 65, scope: !4) +!26 = !DILocation(line: 39, column: 69, scope: !4) +!27 = !DILocation(line: 40, column: 73, scope: !4) +!28 = !DILocation(line: 40, column: 99, scope: !4) +!29 = !DILocation(line: 40, column: 90, scope: !4) +!30 = !DILocation(line: 40, scope: !4) +!31 = !DILocation(line: 40, column: 81, scope: !4) +!32 = !DILocation(line: 40, column: 54, scope: !4) +!33 = !DILocation(line: 40, column: 58, scope: !4) +!34 = !DILocation(line: 33, column: 40, scope: !4) +!35 = !DILocation(line: 39, column: 34, scope: !4) +!36 = !DILocation(line: 39, column: 74, scope: !4) +!37 = !DILocation(line: 40, column: 34, scope: !4) +!38 = !DILocation(line: 40, column: 106, scope: !4) +!39 = !DILocation(line: 34, column: 31, scope: !4) +!40 = !DILocation(line: 39, column: 136, scope: !4) +!41 = !DILocation(line: 40, column: 168, scope: !4) +!42 = !DILocation(line: 41, column: 22, scope: !4) +!43 = !DILocation(line: 43, column: 23, scope: !4) +!44 = !DILocation(line: 261, column: 15, scope: !45, inlinedAt: !47) +!45 = distinct !DILexicalBlockFile(scope: !4, file: !46, discriminator: 0) +!46 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!47 = !DILocation(line: 45, column: 25, scope: !4) +!48 = !DILocation(line: 291, column: 36, scope: !45, inlinedAt: !47) +!49 = !DILocation(line: 45, column: 28, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 36, scope: !4) +!52 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c0262232e171eef7babf48e2410d81c320d5c651 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,574 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 512 +{ + .reg .pred %p<17>; + .reg .b16 %rs<33>; + .reg .b32 %r<122>; + .reg .b64 %rd<59>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r6, %ctaid.x; + .loc 1 22 33 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:33 + shl.b32 %r1, %r6, 6; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r7, %r2, 3, 6; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r8, %r7, %r1; + .loc 1 25 37 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:25:37 + shl.b32 %r9, %r2, 3; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.s64.s32 %rd1, %r8; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + or.b64 %rd17, %rd1, %rd14; + and.b64 %rd18, %rd17, -4294967296; + setp.ne.b64 %p1, %rd18, 0; + cvt.u32.u64 %r121, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd57, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r10, %rd14; + div.u32 %r12, %r121, %r10; + cvt.u64.u32 %rd57, %r12; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r5, [triton_red_fused_zeros_0_param_5]; + ld.param.b64 %rd13, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd11, [triton_red_fused_zeros_0_param_0]; + and.b32 %r3, %r2, 504; + and.b32 %r4, %r9, 56; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd19, %rd57, %rd14; + sub.s64 %rd6, %rd1, %rd19; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + shr.s64 %rd20, %rd57, 63; + shr.u64 %rd21, %rd20, 59; + add.s64 %rd22, %rd57, %rd21; + and.b64 %rd23, %rd22, -32; + sub.s64 %rd7, %rd57, %rd23; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + or.b64 %rd24, %rd1, %rd15; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p2, %rd25, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd58, %rd1, %rd15; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r13, %rd15; + div.u32 %r15, %r121, %r13; + cvt.u64.u32 %rd58, %r15; +$L__BB0_6: + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p3, %r121, %r5; +$L__tmp1: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p8, %rd6, 0; + .loc 2 75 25 // triton_helpers.py:75:25 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s32 %p9, %r1, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p10, %rd14, 0; + .loc 2 75 32 // triton_helpers.py:75:32 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + xor.pred %p11, %p9, %p10; + .loc 2 75 47 // triton_helpers.py:75:47 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + and.pred %p12, %p11, %p8; + selp.b64 %rd39, -1, 0, %p12; + add.s64 %rd40, %rd57, %rd39; +$L__tmp2: + .loc 1 39 69 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:69 + mul.lo.s64 %rd41, %rd14, %rd58; + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p13, %rd14, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p14, %rd14, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd42, %rd14, 0, %p14; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd43, 1, 0, %p13; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd44, %rd42, %rd43; + .loc 1 40 58 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:58 + mul.lo.s64 %rd45, %rd44, %rd40; + shl.b64 %rd46, %rd7, 8; + add.s64 %rd47, %rd11, %rd46; + shl.b64 %rd48, %rd6, 13; + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + add.s64 %rd49, %rd47, %rd48; + shl.b64 %rd50, %rd41, 13; + add.s64 %rd51, %rd49, %rd50; + shl.b64 %rd52, %rd6, 8; + add.s64 %rd53, %rd12, %rd52; + shl.b64 %rd54, %rd45, 8; + add.s64 %rd55, %rd53, %rd54; + .loc 1 39 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:34 + mul.wide.u32 %rd56, %r4, 2; + add.s64 %rd27, %rd51, %rd56; + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd28, 1.0; + // end inline asm + mov.b32 %r20, 0; + // begin inline asm + mov.u32 %r16, %r20; + mov.u32 %r17, %r20; + mov.u32 %r18, %r20; + mov.u32 %r19, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r16, %r17, %r18, %r19 }, [ %rd27 + 0 ], %rd28; + // end inline asm + .loc 1 40 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:34 + add.s64 %rd30, %rd55, %rd56; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r24, %r20; + mov.u32 %r25, %r20; + mov.u32 %r26, %r20; + mov.u32 %r27, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r24, %r25, %r26, %r27 }, [ %rd30 + 0 ], %rd31; + // end inline asm + .loc 1 39 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:34 + add.s64 %rd33, %rd27, 128; + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r32, %r20; + mov.u32 %r33, %r20; + mov.u32 %r34, %r20; + mov.u32 %r35, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd33 + 0 ], %rd34; + // end inline asm + .loc 1 40 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:34 + add.s64 %rd36, %rd30, 128; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r40, %r20; + mov.u32 %r41, %r20; + mov.u32 %r42, %r20; + mov.u32 %r43, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd36 + 0 ], %rd37; + // end inline asm + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs1, %rs2}, %r16; + cvt.f32.bf16 %r50, %rs2; + cvt.f32.bf16 %r51, %rs1; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs3, %rs4}, %r24; + cvt.f32.bf16 %r52, %rs4; + cvt.f32.bf16 %r53, %rs3; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r54, %r51, %r53, 0f00000000; + fma.rn.f32 %r55, %r50, %r52, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs5, %rs6}, %r32; + cvt.f32.bf16 %r56, %rs5; + cvt.f32.bf16 %r57, %rs6; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs7, %rs8}, %r40; + cvt.f32.bf16 %r58, %rs7; + cvt.f32.bf16 %r59, %rs8; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r60, %r57, %r59, %r55; + fma.rn.f32 %r61, %r56, %r58, %r54; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs9, %rs10}, %r17; + cvt.f32.bf16 %r62, %rs10; + cvt.f32.bf16 %r63, %rs9; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs11, %rs12}, %r25; + cvt.f32.bf16 %r64, %rs12; + cvt.f32.bf16 %r65, %rs11; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r66, %r63, %r65, 0f00000000; + fma.rn.f32 %r67, %r62, %r64, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs13, %rs14}, %r33; + cvt.f32.bf16 %r68, %rs13; + cvt.f32.bf16 %r69, %rs14; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs15, %rs16}, %r41; + cvt.f32.bf16 %r70, %rs15; + cvt.f32.bf16 %r71, %rs16; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r72, %r69, %r71, %r67; + fma.rn.f32 %r73, %r68, %r70, %r66; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs17, %rs18}, %r18; + cvt.f32.bf16 %r74, %rs18; + cvt.f32.bf16 %r75, %rs17; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs19, %rs20}, %r26; + cvt.f32.bf16 %r76, %rs20; + cvt.f32.bf16 %r77, %rs19; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r78, %r75, %r77, 0f00000000; + fma.rn.f32 %r79, %r74, %r76, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs21, %rs22}, %r34; + cvt.f32.bf16 %r80, %rs21; + cvt.f32.bf16 %r81, %rs22; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs23, %rs24}, %r42; + cvt.f32.bf16 %r82, %rs23; + cvt.f32.bf16 %r83, %rs24; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs25, %rs26}, %r19; + cvt.f32.bf16 %r86, %rs26; + cvt.f32.bf16 %r87, %rs25; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs27, %rs28}, %r27; + cvt.f32.bf16 %r88, %rs28; + cvt.f32.bf16 %r89, %rs27; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r90, %r87, %r89, 0f00000000; + fma.rn.f32 %r91, %r86, %r88, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs29, %rs30}, %r35; + cvt.f32.bf16 %r92, %rs29; + cvt.f32.bf16 %r93, %rs30; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs31, %rs32}, %r43; + cvt.f32.bf16 %r94, %rs31; + cvt.f32.bf16 %r95, %rs32; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r96, %r93, %r95, %r91; + fma.rn.f32 %r97, %r92, %r94, %r90; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + and.b32 %r98, %r2, 63; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r99, %r1, %r98; + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p15, %r99, %r5; +$L__tmp3: + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r100, %r61, %r60; + add.f32 %r101, %r73, %r100; + add.f32 %r102, %r72, %r101; + add.f32 %r103, %r85, %r102; + add.f32 %r104, %r84, %r103; + add.f32 %r105, %r97, %r104; + add.f32 %r106, %r96, %r105; + selp.f32 %r107, %r106, 0f00000000, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r108, %r107, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r109, %r107, %r108; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r110, %r109, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r111, %r109, %r110; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r112, %r111, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r113, %r111, %r112; +$L__tmp4: + .loc 1 45 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:28 + shr.u32 %r114, %r3, 1; + mov.b32 %r115, global_smem; + add.s32 %r116, %r115, %r114; + st.shared.b32 [%r116], %r113; + bar.sync 0; + shl.b32 %r117, %r98, 2; + add.s32 %r118, %r115, %r117; + ld.shared.b32 %r48, [%r118]; + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + mad.wide.s32 %rd38, %r99, 4, %rd13; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r119, %r2, 448; + setp.eq.b32 %p16, %r119, 0; + and.pred %p7, %p16, %p15; + // begin inline asm + @%p7 st.global.b32 [ %rd38 + 0 ], { %r48 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..10bc3081b8cf7e3404fb236ffa68673a5928e27e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.source @@ -0,0 +1,325 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 64 : i32 loc(#loc86) + %xoffset_2 = arith.constant 64 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<64x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<64x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%xindex_6, %ks0) : (tensor<64x1xi32>, i64) -> tensor<64x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c64_i32 = arith.constant 64 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x64xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x64xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<64x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc103) + %tmp0_31 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc103) + %tmp0_32 = arith.addi %tmp0_30, %tmp0_31 : tensor<64x64xi64> loc(#loc103) + %tmp0_33 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_34 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_35 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc104) + %tmp0_36 = arith.muli %tmp0_35, %x0_10 : tensor<64x1xi64> loc(#loc104) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc105) + %tmp0_38 = arith.addi %tmp0_32, %tmp0_37 : tensor<64x64xi64> loc(#loc105) + %tmp0_39 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_40 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_41 = arith.muli %tmp0_40, %ks0 : i64 loc(#loc106) + %tmp0_42 = tt.splat %tmp0_41 : i64 -> tensor<64x1xi64> loc(#loc107) + %tmp0_43 = arith.muli %tmp0_42, %x2_18 : tensor<64x1xi64> loc(#loc107) + %tmp0_44 = tt.broadcast %tmp0_43 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc108) + %tmp0_45 = arith.addi %tmp0_38, %tmp0_44 : tensor<64x64xi64> loc(#loc108) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc109) + %tmp0_47 = tt.addptr %tmp0_46, %tmp0_45 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> loc(#loc109) + %tmp0_48 = tt.broadcast %r0_mask_25 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc110) + %tmp0_49 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc110) + %tmp0_50 = arith.andi %tmp0_48, %tmp0_49 : tensor<64x64xi1> loc(#loc110) + %tmp0_51 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc111) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc111) + %tmp0_54 = tt.load %tmp0_47, %tmp0_50, %tmp0_53 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc111) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_56 = arith.constant 128 : i64 loc(#loc113) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc113) + %tmp1_58 = arith.muli %tmp1_57, %x0_10 : tensor<64x1xi64> loc(#loc113) + %tmp1_59 = arith.extsi %r0_index_24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc114) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc114) + %tmp1_61 = tt.broadcast %tmp1_58 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc114) + %tmp1_62 = arith.addi %tmp1_60, %tmp1_61 : tensor<64x64xi64> loc(#loc114) + %tmp1_63 = arith.constant 128 : i32 loc(#loc115) + %tmp1_64 = arith.constant 128 : i64 loc(#loc115) + %tmp1_65 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc115) + %tmp1_66 = arith.muli %tmp1_65, %x5 : tensor<64x1xi64> loc(#loc115) + %tmp1_67 = arith.constant 1 : i32 loc(#loc116) + %tmp1_68 = arith.extsi %tmp1_67 : i32 to i64 loc(#loc116) + %tmp1_69 = arith.cmpi sge, %tmp1_68, %ks0 : i64 loc(#loc116) + %tmp1_70 = arith.constant 1 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc117) + %tmp1_72 = arith.extui %tmp1_69 : i1 to i32 loc(#loc117) + %tmp1_73 = arith.muli %tmp1_71, %tmp1_72 : i32 loc(#loc117) + %tmp1_74 = arith.constant 1 : i32 loc(#loc118) + %tmp1_75 = arith.extsi %tmp1_74 : i32 to i64 loc(#loc118) + %tmp1_76 = arith.cmpi sgt, %ks0, %tmp1_75 : i64 loc(#loc118) + %tmp1_77 = arith.extui %tmp1_76 : i1 to i64 loc(#loc119) + %tmp1_78 = arith.muli %ks0, %tmp1_77 : i64 loc(#loc119) + %tmp1_79 = arith.extsi %tmp1_73 : i32 to i64 loc(#loc120) + %tmp1_80 = arith.addi %tmp1_79, %tmp1_78 : i64 loc(#loc120) + %tmp1_81 = tt.splat %tmp1_80 : i64 -> tensor<64x1xi64> loc(#loc121) + %tmp1_82 = arith.muli %tmp1_66, %tmp1_81 : tensor<64x1xi64> loc(#loc121) + %tmp1_83 = tt.broadcast %tmp1_82 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc122) + %tmp1_84 = arith.addi %tmp1_62, %tmp1_83 : tensor<64x64xi64> loc(#loc122) + %tmp1_85 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc123) + %tmp1_86 = tt.addptr %tmp1_85, %tmp1_84 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> loc(#loc123) + %tmp1_87 = tt.broadcast %r0_mask_25 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc124) + %tmp1_88 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc124) + %tmp1_89 = arith.andi %tmp1_87, %tmp1_88 : tensor<64x64xi1> loc(#loc124) + %tmp1_90 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_91 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc125) + %tmp1_92 = arith.truncf %tmp1_91 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc125) + %tmp1_93 = tt.load %tmp1_86, %tmp1_89, %tmp1_92 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc125) + %tmp1_94 = arith.extf %tmp1_93 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_55, %tmp1_94 : tensor<64x64xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x64xf32> loc(#loc128) + %_tmp4_95 = tt.broadcast %r0_mask_25 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc129) + %_tmp4_96 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc129) + %_tmp4_97 = arith.andi %_tmp4_95, %_tmp4_96 : tensor<64x64xi1> loc(#loc129) + %_tmp4_98 = arith.select %_tmp4_97, %tmp5, %_tmp4_23 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc130) + scf.yield %_tmp4_98 : tensor<64x64xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<64x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%a: tensor<64x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<64x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<64x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<64x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<64x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<64x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<64x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<64x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<64x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<64x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<64x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<64x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc65) + tt.return %5 : tensor<64x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x1xi64> loc(#loc67) + tt.return %6 : tensor<64x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc68))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc69) + tt.return %0 : tensor<64xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc72) + tt.return %1 : tensor<64xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5e5a6e0bc30b9b6ca10859cd7a10d41006303b0f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,232 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("in_ptr1"(#loc)) +#loc60 = loc("out_ptr1"(#loc)) +#loc61 = loc("ks0"(#loc)) +#loc62 = loc("ks1"(#loc)) +#loc63 = loc("xnumel"(#loc)) +#loc64 = loc("r0_numel"(#loc)) +#loc109 = loc("tmp4"(#loc52)) +#loc120 = loc(callsite(#loc1 at #loc109)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<64x1xi64, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc65) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc66) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc67) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc67) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc67) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc67) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc68) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc68) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc69) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<64x1xi32, #blocked1> loc(#loc69) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc70) + %x0 = arith.extsi %xindex_14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc71) + %x0_20 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc71) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc71) + %x1 = arith.divsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc72) + %x1_22 = arith.remsi %x1, %cst_5 : tensor<64x1xi64, #blocked> loc(#loc73) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc74) + %x2_23 = arith.divsi %x0, %x2 : tensor<64x1xi64, #blocked> loc(#loc74) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<64x1xi64, #blocked> loc(#loc111) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<64x1xi64, #blocked> loc(#loc112) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc113) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc114) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc115) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<64x1xi1, #blocked> loc(#loc116) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<64x1xi1, #blocked> loc(#loc116) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc117) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc79) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc80) + %tmp0_31 = arith.muli %x0_21, %cst_1 : tensor<64x1xi64, #blocked> loc(#loc81) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc82) + %tmp0_33 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc83) + %tmp0_34 = tt.splat %tmp0_33 : i64 -> tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_35 = arith.muli %tmp0_34, %x2_23 : tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc85) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc86) + %tmp0_38 = tt.broadcast %xmask_17 : tensor<64x1xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc88) + %tmp1_39 = tt.broadcast %tmp1 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc89) + %tmp1_40 = arith.muli %x5_29, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc90) + %tmp1_41 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_42 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_43 = arith.extui %tmp1_42 : i1 to i64 loc(#loc93) + %tmp1_44 = arith.muli %ks0, %tmp1_43 : i64 loc(#loc93) + %tmp1_45 = arith.extui %tmp1_41 : i1 to i64 loc(#loc118) + %tmp1_46 = arith.addi %tmp1_45, %tmp1_44 : i64 loc(#loc94) + %tmp1_47 = tt.splat %tmp1_46 : i64 -> tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_48 = arith.muli %tmp1_40, %tmp1_47 : tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_49 = tt.broadcast %tmp1_48 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc97) + %tmp1_50 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc98) + %_tmp4 = scf.for %_tmp4_53 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg8 = %cst_7) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_53 : i32 -> tensor<1x64xi32, #blocked> loc(#loc100) + %r0_index_54 = arith.addi %r0_index, %r0_base_19 : tensor<1x64xi32, #blocked> loc(#loc100) + %r0_mask = arith.cmpi slt, %r0_index_54, %cst : tensor<1x64xi32, #blocked> loc(#loc101) + %tmp0_55 = arith.extsi %r0_index_54 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc80) + %tmp0_56 = tt.broadcast %tmp0_55 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> loc(#loc80) + %tmp0_57 = arith.addi %tmp0_56, %tmp0_30 : tensor<64x64xi64, #blocked> loc(#loc80) + %tmp0_58 = arith.addi %tmp0_57, %tmp0_32 : tensor<64x64xi64, #blocked> loc(#loc82) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_36 : tensor<64x64xi64, #blocked> loc(#loc85) + %tmp0_60 = tt.addptr %tmp0_37, %tmp0_59 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> loc(#loc86) + %tmp0_61 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc87) + %tmp0_62 = arith.andi %tmp0_61, %tmp0_38 : tensor<64x64xi1, #blocked> loc(#loc87) + %tmp0_63 = tt.load %tmp0_60, %tmp0_62, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc102) + %tmp0_64 = arith.extf %tmp0_63 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc103) + %tmp1_65 = arith.addi %tmp0_56, %tmp1_39 : tensor<64x64xi64, #blocked> loc(#loc89) + %tmp1_66 = arith.addi %tmp1_65, %tmp1_49 : tensor<64x64xi64, #blocked> loc(#loc97) + %tmp1_67 = tt.addptr %tmp1_50, %tmp1_66 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> loc(#loc98) + %tmp1_68 = tt.load %tmp1_67, %tmp0_62, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc104) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc105) + %tmp2 = arith.mulf %tmp0_64, %tmp1_69 : tensor<64x64xf32, #blocked> loc(#loc106) + %tmp5 = arith.addf %arg8, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc107) + %_tmp4_70 = arith.select %tmp0_62, %tmp5, %arg8 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc108) + scf.yield %_tmp4_70 : tensor<64x64xf32, #blocked> loc(#loc50) + } loc(#loc99) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_53: f32 loc(callsite(#loc1 at #loc109)), %tmp4_54: f32 loc(callsite(#loc1 at #loc109))): + %tmp4_55 = arith.addf %tmp4_53, %tmp4_54 : f32 loc(#loc121) + tt.reduce.return %tmp4_55 : f32 loc(#loc119) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc119) + %tmp4_51 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc110) + %tmp4_52 = tt.expand_dims %tmp4_51 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc110) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc55) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc55) + tt.store %1, %tmp4_52, %xmask_18 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc65 = loc("xoffset"(#loc2)) +#loc66 = loc("xoffset"(#loc3)) +#loc67 = loc("xindex"(#loc4)) +#loc68 = loc("xindex"(#loc5)) +#loc69 = loc("xmask"(#loc6)) +#loc70 = loc("r0_base"(#loc7)) +#loc71 = loc("x0"(#loc8)) +#loc72 = loc("x1"(#loc9)) +#loc73 = loc("x1"(#loc10)) +#loc74 = loc("x2"(#loc11)) +#loc75 = loc("fixed"(#loc12)) +#loc76 = loc("x5"(#loc13)) +#loc77 = loc("fixed"(#loc14)) +#loc78 = loc("fixed"(#loc15)) +#loc79 = loc("tmp0"(#loc20)) +#loc80 = loc("tmp0"(#loc21)) +#loc81 = loc("tmp0"(#loc22)) +#loc82 = loc("tmp0"(#loc23)) +#loc83 = loc("tmp0"(#loc24)) +#loc84 = loc("tmp0"(#loc25)) +#loc85 = loc("tmp0"(#loc26)) +#loc86 = loc("tmp0"(#loc27)) +#loc87 = loc("tmp0"(#loc28)) +#loc88 = loc("tmp1"(#loc29)) +#loc89 = loc("tmp1"(#loc30)) +#loc90 = loc("tmp1"(#loc31)) +#loc91 = loc("tmp1"(#loc32)) +#loc92 = loc("tmp1"(#loc33)) +#loc93 = loc("tmp1"(#loc34)) +#loc94 = loc("tmp1"(#loc35)) +#loc95 = loc("tmp1"(#loc36)) +#loc96 = loc("tmp1"(#loc37)) +#loc97 = loc("tmp1"(#loc38)) +#loc98 = loc("tmp1"(#loc39)) +#loc99 = loc("_tmp4"(#loc40)) +#loc100 = loc("r0_index"(#loc41)) +#loc101 = loc("r0_mask"(#loc42)) +#loc102 = loc("tmp0"(#loc43)) +#loc103 = loc("tmp0"(#loc44)) +#loc104 = loc("tmp1"(#loc45)) +#loc105 = loc("tmp1"(#loc46)) +#loc106 = loc("tmp2"(#loc47)) +#loc107 = loc("tmp5"(#loc48)) +#loc108 = loc("_tmp4"(#loc49)) +#loc110 = loc("tmp4"(#loc54)) +#loc111 = loc(callsite(#loc75 at #loc76)) +#loc112 = loc(callsite(#loc77 at #loc76)) +#loc113 = loc(callsite(#loc78 at #loc76)) +#loc114 = loc(callsite(#loc16 at #loc76)) +#loc115 = loc(callsite(#loc17 at #loc76)) +#loc116 = loc(callsite(#loc18 at #loc76)) +#loc117 = loc(callsite(#loc19 at #loc76)) +#loc118 = loc(fused[#loc94, #loc95]) +#loc119 = loc(callsite(#loc51 at #loc109)) +#loc121 = loc(callsite(#loc53 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2ad1a167828ef072dacf338f3024476ac04ef627 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BXWZSSWKBTIG7YDOE6QDLF3DYUHLUN57GPEDYW37ZDRQO2XWRGCQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,224 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc6 = loc(unknown) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("out_ptr1"(#loc)) +#loc62 = loc("ks0"(#loc)) +#loc63 = loc("ks1"(#loc)) +#loc64 = loc("xnumel"(#loc)) +#loc65 = loc("r0_numel"(#loc)) +#loc111 = loc("tmp4"(#loc53)) +#loc122 = loc(callsite(#loc6 at #loc111)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %fixed = arith.constant dense<1> : tensor<64x1xi64> loc(#loc113) + %x5 = arith.constant dense<0> : tensor<64x1xi32> loc(#loc114) + %fixed_0 = arith.constant dense<0> : tensor<64x1xi64> loc(#loc115) + %x5_1 = arith.constant 0 : i64 loc(#loc116) + %c1_i64 = arith.constant 1 : i64 loc(#loc6) + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc6) + %c128_i32 = arith.constant 128 : i32 loc(#loc7) + %c0_i32 = arith.constant 0 : i32 loc(#loc7) + %cst_2 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc6) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc6) + %cst_3 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc6) + %cst_4 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc6) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc6) + %x1 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc69) + %c64_i32 = arith.constant 64 : i32 loc(#loc6) + %xoffset = tt.get_program_id x : i32 loc(#loc70) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc71) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc72) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc73) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc74) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc74) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc75) + %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<64x1xi32> loc(#loc75) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc76) + %x0 = arith.extsi %xindex_9 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc77) + %x0_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc77) + %x0_12 = arith.remsi %x0, %x0_11 : tensor<64x1xi64> loc(#loc77) + %x1_13 = arith.divsi %x0, %x0_11 : tensor<64x1xi64> loc(#loc78) + %x1_14 = arith.remsi %x1_13, %x1 : tensor<64x1xi64> loc(#loc69) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc79) + %x2_15 = arith.divsi %x0, %x2 : tensor<64x1xi64> loc(#loc79) + %fixed_16 = arith.cmpi ne, %x0_12, %fixed_0 : tensor<64x1xi64> loc(#loc115) + %fixed_17 = arith.subi %x1_13, %fixed : tensor<64x1xi64> loc(#loc113) + %fixed_18 = arith.select %fixed_16, %fixed_17, %x1_13 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc117) + %x5_19 = arith.cmpi slt, %xindex_9, %x5 : tensor<64x1xi32> loc(#loc114) + %x5_20 = arith.cmpi slt, %ks0, %x5_1 : i64 loc(#loc116) + %x5_21 = tt.splat %x5_20 : i1 -> tensor<64x1xi1> loc(#loc118) + %x5_22 = arith.cmpi ne, %x5_19, %x5_21 : tensor<64x1xi1> loc(#loc118) + %x5_23 = arith.select %x5_22, %fixed_18, %x1_13 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc119) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_25 = %cst_5) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc82) + %r0_index_26 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc82) + %r0_mask = arith.cmpi slt, %r0_index_26, %cst_4 : tensor<1x64xi32> loc(#loc83) + %tmp0 = arith.muli %x1_14, %cst_3 : tensor<64x1xi64> loc(#loc84) + %tmp0_27 = arith.extsi %r0_index_26 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc85) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x64xi64> -> tensor<64x64xi64> loc(#loc85) + %tmp0_29 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc85) + %tmp0_30 = arith.addi %tmp0_28, %tmp0_29 : tensor<64x64xi64> loc(#loc85) + %tmp0_31 = arith.muli %x0_12, %cst_2 : tensor<64x1xi64> loc(#loc86) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc87) + %tmp0_33 = arith.addi %tmp0_30, %tmp0_32 : tensor<64x64xi64> loc(#loc87) + %tmp0_34 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc88) + %tmp0_35 = tt.splat %tmp0_34 : i64 -> tensor<64x1xi64> loc(#loc89) + %tmp0_36 = arith.muli %tmp0_35, %x2_15 : tensor<64x1xi64> loc(#loc89) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc90) + %tmp0_38 = arith.addi %tmp0_33, %tmp0_37 : tensor<64x64xi64> loc(#loc90) + %tmp0_39 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc91) + %tmp0_40 = tt.addptr %tmp0_39, %tmp0_38 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> loc(#loc91) + %tmp0_41 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc92) + %tmp0_42 = tt.broadcast %xmask_10 : tensor<64x1xi1> -> tensor<64x64xi1> loc(#loc92) + %tmp0_43 = arith.andi %tmp0_41, %tmp0_42 : tensor<64x64xi1> loc(#loc92) + %tmp0_44 = tt.load %tmp0_40, %tmp0_43, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc93) + %tmp0_45 = arith.extf %tmp0_44 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc94) + %tmp1 = arith.muli %x0_12, %cst_3 : tensor<64x1xi64> loc(#loc95) + %tmp1_46 = tt.broadcast %tmp1 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc96) + %tmp1_47 = arith.addi %tmp0_28, %tmp1_46 : tensor<64x64xi64> loc(#loc96) + %tmp1_48 = arith.muli %x5_23, %cst_3 : tensor<64x1xi64> loc(#loc97) + %tmp1_49 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp1_50 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp1_51 = arith.extui %tmp1_50 : i1 to i64 loc(#loc100) + %tmp1_52 = arith.muli %ks0, %tmp1_51 : i64 loc(#loc100) + %tmp1_53 = arith.extui %tmp1_49 : i1 to i64 loc(#loc120) + %tmp1_54 = arith.addi %tmp1_53, %tmp1_52 : i64 loc(#loc101) + %tmp1_55 = tt.splat %tmp1_54 : i64 -> tensor<64x1xi64> loc(#loc103) + %tmp1_56 = arith.muli %tmp1_48, %tmp1_55 : tensor<64x1xi64> loc(#loc103) + %tmp1_57 = tt.broadcast %tmp1_56 : tensor<64x1xi64> -> tensor<64x64xi64> loc(#loc104) + %tmp1_58 = arith.addi %tmp1_47, %tmp1_57 : tensor<64x64xi64> loc(#loc104) + %tmp1_59 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc105) + %tmp1_60 = tt.addptr %tmp1_59, %tmp1_58 : tensor<64x64x!tt.ptr>, tensor<64x64xi64> loc(#loc105) + %tmp1_61 = tt.load %tmp1_60, %tmp0_43, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc106) + %tmp1_62 = arith.extf %tmp1_61 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc107) + %tmp2 = arith.mulf %tmp0_45, %tmp1_62 : tensor<64x64xf32> loc(#loc108) + %tmp5 = arith.addf %_tmp4_25, %tmp2 : tensor<64x64xf32> loc(#loc109) + %_tmp4_63 = arith.select %tmp0_43, %tmp5, %_tmp4_25 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc110) + scf.yield %_tmp4_63 : tensor<64x64xf32> loc(#loc51) + } loc(#loc81) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_25: f32 loc(callsite(#loc6 at #loc111)), %tmp4_26: f32 loc(callsite(#loc6 at #loc111))): + %tmp4_27 = arith.addf %tmp4_25, %tmp4_26 : f32 loc(#loc123) + tt.reduce.return %tmp4_27 : f32 loc(#loc121) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc121) + %tmp4_24 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc112) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc56) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc56) + tt.store %1, %tmp4_24, %xmask_10 : tensor<64x1x!tt.ptr> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc4 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc66 = loc("fixed"(#loc1)) +#loc67 = loc("x5"(#loc2)) +#loc68 = loc("fixed"(#loc4)) +#loc69 = loc("x1"(#loc8)) +#loc70 = loc("xoffset"(#loc9)) +#loc71 = loc("xoffset"(#loc10)) +#loc72 = loc("xindex"(#loc11)) +#loc73 = loc("xindex"(#loc12)) +#loc74 = loc("xindex"(#loc13)) +#loc75 = loc("xmask"(#loc14)) +#loc76 = loc("r0_base"(#loc15)) +#loc77 = loc("x0"(#loc16)) +#loc78 = loc("x1"(#loc17)) +#loc79 = loc("x2"(#loc18)) +#loc80 = loc("fixed"(#loc19)) +#loc81 = loc("_tmp4"(#loc7)) +#loc82 = loc("r0_index"(#loc22)) +#loc83 = loc("r0_mask"(#loc23)) +#loc84 = loc("tmp0"(#loc24)) +#loc85 = loc("tmp0"(#loc25)) +#loc86 = loc("tmp0"(#loc26)) +#loc87 = loc("tmp0"(#loc27)) +#loc88 = loc("tmp0"(#loc28)) +#loc89 = loc("tmp0"(#loc29)) +#loc90 = loc("tmp0"(#loc30)) +#loc91 = loc("tmp0"(#loc31)) +#loc92 = loc("tmp0"(#loc32)) +#loc93 = loc("tmp0"(#loc33)) +#loc94 = loc("tmp0"(#loc34)) +#loc95 = loc("tmp1"(#loc35)) +#loc96 = loc("tmp1"(#loc36)) +#loc97 = loc("tmp1"(#loc37)) +#loc98 = loc("tmp1"(#loc38)) +#loc99 = loc("tmp1"(#loc39)) +#loc100 = loc("tmp1"(#loc40)) +#loc101 = loc("tmp1"(#loc41)) +#loc102 = loc("tmp1"(#loc42)) +#loc103 = loc("tmp1"(#loc43)) +#loc104 = loc("tmp1"(#loc44)) +#loc105 = loc("tmp1"(#loc45)) +#loc106 = loc("tmp1"(#loc46)) +#loc107 = loc("tmp1"(#loc47)) +#loc108 = loc("tmp2"(#loc48)) +#loc109 = loc("tmp5"(#loc49)) +#loc110 = loc("_tmp4"(#loc50)) +#loc112 = loc("tmp4"(#loc55)) +#loc113 = loc(callsite(#loc66 at #loc67)) +#loc114 = loc(callsite(#loc3 at #loc67)) +#loc115 = loc(callsite(#loc68 at #loc67)) +#loc116 = loc(callsite(#loc5 at #loc67)) +#loc117 = loc(callsite(#loc80 at #loc67)) +#loc118 = loc(callsite(#loc20 at #loc67)) +#loc119 = loc(callsite(#loc21 at #loc67)) +#loc120 = loc(fused[#loc101, #loc102]) +#loc121 = loc(callsite(#loc52 at #loc111)) +#loc123 = loc(callsite(#loc54 at #loc121)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff4b7018106a28af08cdea98908dd0b4b738234 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d38fef08cd1c77ff37fcd6b2690afc10f6095856 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..359ab9e1ae6818cd5abc13bd428a33c6b8db4a18 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "0e745787b0413e43c7cff91f579e3c3aa5bc5f7083d168227a8ec8e4b815b08c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..21a32a29f2f55069a83268b92084845938c83e1d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 3, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 224, !dbg !9 + %12 = lshr exact i32 %11, 5, !dbg !9 + %13 = and i32 %10, 7, !dbg !9 + %14 = or disjoint i32 %12, %9, !dbg !10 + %15 = or disjoint i32 %9, %13, !dbg !10 + %16 = shl nuw nsw i32 %10, 2, !dbg !11 + %17 = and i32 %16, 124, !dbg !11 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %18, 2048, !dbg !13 + %.decomposed = sub i32 %14, %19, !dbg !13 + %20 = srem i32 %18, 32, !dbg !14 + %21 = sdiv i32 %14, 65536, !dbg !15 + %22 = shl nsw i32 %20, 7, !dbg !16 + %23 = shl nsw i32 %.decomposed, 12, !dbg !17 + %24 = shl i32 %21, 23, !dbg !18 + %25 = or disjoint i32 %23, %17, !dbg !19 + %26 = add i32 %25, %24, !dbg !20 + %27 = add i32 %26, %22, !dbg !21 + %28 = sext i32 %27 to i64, !dbg !22 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !22 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !23 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !23 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !23 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !23 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !23 + %36 = shl i32 %14, 7, !dbg !24 + %37 = or disjoint i32 %36, %17, !dbg !25 + %38 = sext i32 %37 to i64, !dbg !26 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !26 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 true) #4, !dbg !27 + %42 = extractvalue { i32, i32 } %41, 0, !dbg !27 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !27 + %44 = extractvalue { i32, i32 } %41, 1, !dbg !27 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !27 + %46 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !28 + %47 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !29 + %48 = fmul <2 x float> %46, %47, !dbg !30 + %49 = fadd <2 x float> %48, zeroinitializer, !dbg !31 + %50 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !28 + %51 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !29 + %52 = fmul <2 x float> %50, %51, !dbg !30 + %53 = fadd <2 x float> %52, zeroinitializer, !dbg !31 + %shift = shufflevector <2 x float> %49, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop = fadd <2 x float> %49, %shift, !dbg !32 + %foldExtExtBinop2 = fadd <2 x float> %53, %foldExtExtBinop, !dbg !32 + %shift4 = shufflevector <2 x float> %53, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop5 = fadd <2 x float> %shift4, %foldExtExtBinop2, !dbg !32 + %54 = extractelement <2 x float> %foldExtExtBinop5, i64 0, !dbg !32 + %55 = bitcast float %54 to i32, !dbg !36 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 16, i32 31), !dbg !36 + %57 = bitcast i32 %56 to float, !dbg !36 + %58 = fadd float %54, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !36 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !36 + %61 = bitcast i32 %60 to float, !dbg !36 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %62 to i32, !dbg !36 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 4, i32 31), !dbg !36 + %65 = bitcast i32 %64 to float, !dbg !36 + %66 = fadd float %62, %65, !dbg !32 + %67 = bitcast float %66 to i32, !dbg !36 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 2, i32 31), !dbg !36 + %69 = bitcast i32 %68 to float, !dbg !36 + %70 = fadd float %66, %69, !dbg !32 + %71 = bitcast float %70 to i32, !dbg !36 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %73 = bitcast i32 %72 to float, !dbg !36 + %74 = fadd float %70, %73, !dbg !32 + %75 = lshr exact i32 %11, 3, !dbg !37 + %76 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %75, !dbg !37 + store float %74, ptr addrspace(3) %76, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %77 = shl nuw nsw i32 %13, 2, !dbg !37 + %78 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %77, !dbg !37 + %79 = load i32, ptr addrspace(3) %78, align 4, !dbg !37 + %80 = sext i32 %15 to i64, !dbg !38 + %81 = getelementptr float, ptr addrspace(1) %2, i64 %80, !dbg !38 + %82 = and i32 %10, 248, !dbg !39 + %83 = icmp eq i32 %82, 0, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %81, i1 %83) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 39, column: 41, scope: !4) +!20 = !DILocation(line: 39, column: 50, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 45, scope: !4) +!25 = !DILocation(line: 40, column: 41, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 39, column: 127, scope: !4) +!29 = !DILocation(line: 40, column: 104, scope: !4) +!30 = !DILocation(line: 41, column: 22, scope: !4) +!31 = !DILocation(line: 43, column: 23, scope: !4) +!32 = !DILocation(line: 261, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !34, discriminator: 0) +!34 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!35 = !DILocation(line: 45, column: 25, scope: !4) +!36 = !DILocation(line: 291, column: 36, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 45, column: 28, scope: !4) +!38 = !DILocation(line: 49, column: 25, scope: !4) +!39 = !DILocation(line: 49, column: 36, scope: !4) +!40 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e2fcc223b539d9f90ee01ae82f3ac2ebe5f61b04 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 256 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<72>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:33 + shl.b32 %r11, %r10, 3; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + mov.u32 %r12, %tid.x; + and.b32 %r13, %r12, 224; + bfe.u32 %r14, %r12, 5, 3; + and.b32 %r15, %r12, 7; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r16, %r14, %r11; + or.b32 %r17, %r11, %r15; + .loc 1 26 37 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:26:37 + shl.b32 %r18, %r12, 2; + and.b32 %r19, %r18, 124; + .loc 1 29 21 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:21 + bfe.s32 %r20, %r10, 28, 1; + shr.u32 %r21, %r20, 21; + add.s32 %r22, %r16, %r21; + shr.s32 %r23, %r22, 11; + .loc 1 28 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:28:19 + and.b32 %r24, %r22, 1046528; + sub.s32 %r25, %r16, %r24; + .loc 1 29 29 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:29 + shr.u32 %r26, %r23, 27; + add.s32 %r27, %r23, %r26; + and.b32 %r28, %r27, 33554400; + sub.s32 %r29, %r23, %r28; + .loc 1 30 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:30:19 + shr.u32 %r30, %r20, 16; + add.s32 %r31, %r16, %r30; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shl.b32 %r32, %r29, 7; + .loc 1 39 55 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:55 + shl.b32 %r33, %r25, 12; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r34, %r31, 7; + and.b32 %r35, %r34, -8388608; + .loc 1 39 41 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:41 + or.b32 %r36, %r33, %r19; + .loc 1 39 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:50 + add.s32 %r37, %r36, %r35; + .loc 1 39 60 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:60 + add.s32 %r38, %r37, %r32; + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + mad.wide.s32 %rd2, %r38, 2, %rd8; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:45 + shl.b32 %r39, %r16, 7; + .loc 1 40 41 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:41 + or.b32 %r40, %r39, %r19; + .loc 1 40 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:34 + mad.wide.s32 %rd5, %r40, 2, %rd9; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, %r3; + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r5, %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r41, %rs1; + cvt.f32.bf16 %r42, %rs2; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r43, %rs3; + cvt.f32.bf16 %r44, %rs4; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r45, %r42, %r44, 0f00000000; + fma.rn.f32 %r46, %r41, %r43, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r47, %rs5; + cvt.f32.bf16 %r48, %rs6; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r49, %rs7; + cvt.f32.bf16 %r50, %rs8; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r51, %r48, %r50, 0f00000000; + fma.rn.f32 %r52, %r47, %r49, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r53, %r46, %r45; + add.f32 %r54, %r52, %r53; + add.f32 %r55, %r51, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r57, %r55, %r56; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r59, %r57, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r61, %r59, %r60; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r63, %r61, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r65, %r63, %r64; +$L__tmp2: + .loc 1 45 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:28 + shr.u32 %r66, %r13, 3; + mov.b32 %r67, global_smem; + add.s32 %r68, %r67, %r66; + st.shared.b32 [%r68], %r65; + bar.sync 0; + shl.b32 %r69, %r15, 2; + add.s32 %r70, %r67, %r69; + ld.shared.b32 %r9, [%r70]; + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.s32 %rd7, %r17, 4, %rd10; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + and.b32 %r71, %r12, 248; + setp.eq.b32 %p3, %r71, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r9 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7d68d6841390481245920a275ea472a15f9098f1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 8 : i32 loc(#loc61) + %xoffset_3 = arith.constant 8 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<8x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<8x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<8x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<8x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<8x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<8x128xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<8x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<8x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<8x128xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<8x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<8x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<8x128xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<8x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<8x128xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<8x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<8x128xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<8x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<8x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<8x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<8x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc44))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc45) + tt.return %0 : tensor<8xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc48) + tt.return %1 : tensor<8xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..69d9c3e0b2306108ff440f6a4ff4308d2ca282ca --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,142 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("xnumel"(#loc)) +#loc40 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp4"(#loc30)) +#loc71 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_8 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %xindex_9 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc43) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc43) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc43) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked> loc(#loc44) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc44) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked> loc(#loc44) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<8x1xi32, #blocked1> loc(#loc44) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc47) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc48) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc49) + %r0_mask = arith.cmpi slt, %r0_base_16, %cst : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc51) + %tmp0_18 = tt.broadcast %r0_base_16 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_19 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_21 = arith.muli %x0, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc53) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<8x128xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.muli %x2, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc55) + %tmp0_25 = tt.broadcast %tmp0_24 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc56) + %tmp0_26 = arith.addi %tmp0_23, %tmp0_25 : tensor<8x128xi32, #blocked> loc(#loc56) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc57) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc57) + %tmp0_29 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc58) + %tmp0_30 = tt.load %tmp0_28, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc58) + %tmp0_31 = arith.extf %tmp0_30 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc60) + %tmp1_32 = tt.broadcast %tmp1 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc61) + %tmp1_33 = arith.addi %tmp0_18, %tmp1_32 : tensor<8x128xi32, #blocked> loc(#loc61) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc62) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc62) + %tmp1_36 = tt.load %tmp1_35, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc63) + %tmp1_37 = arith.extf %tmp1_36 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc64) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<8x128xf32, #blocked> loc(#loc65) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<8x128xf32, #blocked> loc(#loc66) + %_tmp4 = arith.select %tmp0_29, %tmp5, %cst_7 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc67) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_40: f32 loc(callsite(#loc1 at #loc68)), %tmp4_41: f32 loc(callsite(#loc1 at #loc68))): + %tmp4_42 = arith.addf %tmp4_40, %tmp4_41 : f32 loc(#loc72) + tt.reduce.return %tmp4_42 : f32 loc(#loc70) + }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc70) + %tmp4_38 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp4_39 = tt.expand_dims %tmp4_38 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc69) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc33) + %1 = tt.addptr %0, %xindex_15 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc33) + tt.store %1, %tmp4_39 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("r0_mask"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp1"(#loc21)) +#loc61 = loc("tmp1"(#loc22)) +#loc62 = loc("tmp1"(#loc23)) +#loc63 = loc("tmp1"(#loc24)) +#loc64 = loc("tmp1"(#loc25)) +#loc65 = loc("tmp2"(#loc26)) +#loc66 = loc("tmp5"(#loc27)) +#loc67 = loc("_tmp4"(#loc28)) +#loc69 = loc("tmp4"(#loc32)) +#loc70 = loc(callsite(#loc29 at #loc68)) +#loc72 = loc(callsite(#loc31 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7babc7e7fb8a266f9168e5bf1144973d7cb6cc37 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/BZ2FPB5QIE7EHR6P7EPVPHR4HKS3YX3QQPIWQIT2R3EOJOAVWCGA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,139 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc32)) +#loc75 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<8388608> : tensor<8x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<8x1xi32> loc(#loc43) + %x1 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc44) + %cst_5 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_6 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc47) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc48) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<8x1xi32> loc(#loc49) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<8x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc51) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<8x1xi32> loc(#loc52) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<8x1xi32> loc(#loc53) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<8x1xi32> loc(#loc44) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<8x1xi32> loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_10, %cst_3 : tensor<1x128xi32> loc(#loc54) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<8x1xi32> loc(#loc55) + %tmp0_14 = tt.broadcast %r0_base_10 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc56) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc56) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32> loc(#loc56) + %tmp0_17 = arith.muli %x0, %cst_1 : tensor<8x1xi32> loc(#loc57) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc58) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32> loc(#loc58) + %tmp0_20 = arith.muli %x2_13, %cst_0 : tensor<8x1xi32> loc(#loc59) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc60) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<8x128xi32> loc(#loc60) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc61) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc61) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc62) + %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc62) + %tmp0_27 = arith.extf %tmp0_26 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc63) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<8x1xi32> loc(#loc64) + %tmp1_28 = tt.broadcast %tmp1 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65) + %tmp1_29 = arith.addi %tmp0_14, %tmp1_28 : tensor<8x128xi32> loc(#loc65) + %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc66) + %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc66) + %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc67) + %tmp1_33 = arith.extf %tmp1_32 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<8x128xf32> loc(#loc69) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32> loc(#loc70) + %_tmp4 = arith.select %tmp0_25, %tmp5, %cst_4 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_35: f32 loc(callsite(#loc1 at #loc72)), %tmp4_36: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_37 = arith.addf %tmp4_35, %tmp4_36 : f32 loc(#loc76) + tt.reduce.return %tmp4_37 : f32 loc(#loc74) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc74) + %tmp4_34 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_9 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc35) + tt.store %1, %tmp4_34 : tensor<8x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc43 = loc("x2"(#loc2)) +#loc44 = loc("x1"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xoffset"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xindex"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("x0"(#loc11)) +#loc53 = loc("x1"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp0"(#loc21)) +#loc63 = loc("tmp0"(#loc22)) +#loc64 = loc("tmp1"(#loc23)) +#loc65 = loc("tmp1"(#loc24)) +#loc66 = loc("tmp1"(#loc25)) +#loc67 = loc("tmp1"(#loc26)) +#loc68 = loc("tmp1"(#loc27)) +#loc69 = loc("tmp2"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc("_tmp4"(#loc30)) +#loc73 = loc("tmp4"(#loc34)) +#loc74 = loc(callsite(#loc31 at #loc72)) +#loc76 = loc(callsite(#loc33 at #loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5559ee464ff9cdfd888024532e0c4dee90745172 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "16ca2c8864609722b876bac87eb220846cb74e2fe4810a46a027c2f3e653aead", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..4050c64d8941e7d18402f45d89716358b89a5aef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,934 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 2, !dbg !9 + %10 = and i32 %9, 2044, !dbg !9 + %11 = mul i32 %7, 32000, !dbg !10 + %12 = zext nneg i32 %10 to i64, !dbg !11 + br label %13, !dbg !11 + +13: ; preds = %6, %13 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %13 ] + %14 = phi <2 x float> [ zeroinitializer, %6 ], [ %271, %13 ] + %15 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %269, %13 ] + %16 = phi <2 x float> [ zeroinitializer, %6 ], [ %270, %13 ] + %17 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %268, %13 ] + %18 = or disjoint i64 %indvars.iv, %12, !dbg !12 + %19 = icmp samesign ult i64 %18, 32000, !dbg !13 + %20 = trunc nuw nsw i64 %18 to i32, !dbg !14 + %21 = add i32 %11, %20, !dbg !14 + %22 = sext i32 %21 to i64, !dbg !15 + %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !15 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 %19) #6, !dbg !16 + %26 = extractvalue { i32, i32 } %25, 0, !dbg !16 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !16 + %28 = extractvalue { i32, i32 } %25, 1, !dbg !16 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !16 + %30 = fcmp uno <2 x float> %17, zeroinitializer, !dbg !17 + %31 = fcmp uno <2 x float> %15, zeroinitializer, !dbg !17 + %32 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i72 = icmp eq i32 %32, 0, !dbg !21 + %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i74 = icmp eq i32 %33, 0, !dbg !21 + %34 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i78 = icmp eq i32 %35, 0, !dbg !21 + %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i80 = icmp eq i32 %36, 0, !dbg !21 + %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i82 = icmp eq i32 %37, 0, !dbg !21 + %38 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i84 = icmp eq i32 %38, 0, !dbg !21 + %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i88 = icmp eq i32 %40, 0, !dbg !21 + %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i90 = icmp eq i32 %41, 0, !dbg !21 + %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i92 = icmp eq i32 %42, 0, !dbg !21 + %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i94 = icmp eq i32 %43, 0, !dbg !21 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i98 = icmp eq i32 %45, 0, !dbg !21 + %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i100 = icmp eq i32 %46, 0, !dbg !21 + %47 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i102 = icmp eq i32 %47, 0, !dbg !21 + %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i104 = icmp eq i32 %48, 0, !dbg !21 + %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %50 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i108 = icmp eq i32 %50, 0, !dbg !21 + %51 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i110 = icmp eq i32 %51, 0, !dbg !21 + %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i112 = icmp eq i32 %52, 0, !dbg !21 + %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i114 = icmp eq i32 %53, 0, !dbg !21 + %54 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i118 = icmp eq i32 %55, 0, !dbg !21 + %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i120 = icmp eq i32 %56, 0, !dbg !21 + %57 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i122 = icmp eq i32 %57, 0, !dbg !21 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i124 = icmp eq i32 %58, 0, !dbg !21 + %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i128 = icmp eq i32 %60, 0, !dbg !21 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i130 = icmp eq i32 %61, 0, !dbg !21 + %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i132 = icmp eq i32 %62, 0, !dbg !21 + %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i134 = icmp eq i32 %63, 0, !dbg !21 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i138 = icmp eq i32 %65, 0, !dbg !21 + %66 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i140 = icmp eq i32 %66, 0, !dbg !21 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i142 = icmp eq i32 %67, 0, !dbg !21 + %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i144 = icmp eq i32 %68, 0, !dbg !21 + %69 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i148 = icmp eq i32 %70, 0, !dbg !21 + %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i150 = icmp eq i32 %71, 0, !dbg !21 + %72 = fpext <2 x bfloat> %27 to <2 x float>, !dbg !22 + %73 = fcmp ogt <2 x float> %17, %72, !dbg !23 + %74 = or <2 x i1> %30, %73, !dbg !24 + %75 = select <2 x i1> %74, <2 x float> %17, <2 x float> %72, !dbg !25 + %76 = fcmp oeq <2 x float> %75, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop = fsub <2 x float> %17, %75, !dbg !27 + %77 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !27 + %foldExtExtBinop177 = fsub <2 x float> %17, %75, !dbg !27 + %78 = extractelement <2 x float> %foldExtExtBinop177, i64 1, !dbg !27 + %79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %80 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i73 = select i1 %.not.i72, float %80, float %79, !dbg !21 + %81 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i73) #6, !dbg !21 + %82 = tail call float @llvm.nvvm.saturate.f(float %.02.i73) #6, !dbg !21 + %.03.i75 = select i1 %.not1.i74, float %82, float %81, !dbg !21 + %83 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %84 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %86 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i83 = select i1 %.not.i82, float %86, float %85, !dbg !21 + %87 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i83) #6, !dbg !21 + %88 = tail call float @llvm.nvvm.saturate.f(float %.02.i83) #6, !dbg !21 + %.03.i85 = select i1 %.not1.i84, float %88, float %87, !dbg !21 + %89 = insertelement <2 x i32> poison, i32 %34, i64 0, !dbg !21 + %90 = insertelement <2 x i32> %89, i32 %39, i64 1, !dbg !21 + %91 = icmp eq <2 x i32> %90, zeroinitializer, !dbg !21 + %92 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %93 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %94 = insertelement <2 x float> poison, float %84, i64 0, !dbg !21 + %95 = insertelement <2 x float> %94, float %93, i64 1, !dbg !21 + %96 = insertelement <2 x float> poison, float %83, i64 0, !dbg !21 + %97 = insertelement <2 x float> %96, float %92, i64 1, !dbg !21 + %98 = select <2 x i1> %91, <2 x float> %95, <2 x float> %97, !dbg !21 + %99 = extractelement <2 x float> %98, i64 0, !dbg !21 + %100 = fadd float %99, 0xC168000FE0000000, !dbg !21 + %101 = fneg float %100, !dbg !21 + %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %103 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %.0.i79 = select i1 %.not3.i78, float %103, float %102, !dbg !21 + %104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %105 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %.01.i81 = select i1 %.not4.i80, float %105, float %104, !dbg !21 + %106 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i81) #6, !dbg !21 + %107 = extractelement <2 x float> %98, i64 1, !dbg !21 + %108 = fadd float %107, 0xC168000FE0000000, !dbg !21 + %109 = fneg float %108, !dbg !21 + %110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %111 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %.0.i89 = select i1 %.not3.i88, float %111, float %110, !dbg !21 + %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %113 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %.01.i91 = select i1 %.not4.i90, float %113, float %112, !dbg !21 + %114 = bitcast <2 x float> %98 to <2 x i32>, !dbg !21 + %115 = shl <2 x i32> %114, splat (i32 23), !dbg !21 + %116 = bitcast <2 x i32> %115 to <2 x float>, !dbg !21 + %117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i91) #6, !dbg !21 + %118 = insertelement <2 x float> poison, float %106, i64 0, !dbg !21 + %119 = insertelement <2 x float> %118, float %117, i64 1, !dbg !21 + %120 = fmul <2 x float> %119, %116, !dbg !21 + %121 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %120, !dbg !28 + %foldExtExtBinop179 = fsub <2 x float> %72, %75, !dbg !29 + %122 = extractelement <2 x float> %foldExtExtBinop179, i64 0, !dbg !29 + %foldExtExtBinop181 = fsub <2 x float> %72, %75, !dbg !29 + %123 = extractelement <2 x float> %foldExtExtBinop181, i64 1, !dbg !29 + %124 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %125 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i113 = select i1 %.not.i112, float %125, float %124, !dbg !21 + %126 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i113) #6, !dbg !21 + %127 = tail call float @llvm.nvvm.saturate.f(float %.02.i113) #6, !dbg !21 + %.03.i115 = select i1 %.not1.i114, float %127, float %126, !dbg !21 + %128 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %129 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %131 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i123 = select i1 %.not.i122, float %131, float %130, !dbg !21 + %132 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i123) #6, !dbg !21 + %133 = tail call float @llvm.nvvm.saturate.f(float %.02.i123) #6, !dbg !21 + %.03.i125 = select i1 %.not1.i124, float %133, float %132, !dbg !21 + %134 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !21 + %135 = insertelement <2 x i32> %134, i32 %59, i64 1, !dbg !21 + %136 = icmp eq <2 x i32> %135, zeroinitializer, !dbg !21 + %137 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %138 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %139 = insertelement <2 x float> poison, float %129, i64 0, !dbg !21 + %140 = insertelement <2 x float> %139, float %138, i64 1, !dbg !21 + %141 = insertelement <2 x float> poison, float %128, i64 0, !dbg !21 + %142 = insertelement <2 x float> %141, float %137, i64 1, !dbg !21 + %143 = select <2 x i1> %136, <2 x float> %140, <2 x float> %142, !dbg !21 + %144 = extractelement <2 x float> %143, i64 0, !dbg !21 + %145 = fadd float %144, 0xC168000FE0000000, !dbg !21 + %146 = fneg float %145, !dbg !21 + %147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %148 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %.0.i119 = select i1 %.not3.i118, float %148, float %147, !dbg !21 + %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %150 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %.01.i121 = select i1 %.not4.i120, float %150, float %149, !dbg !21 + %151 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i121) #6, !dbg !21 + %152 = extractelement <2 x float> %143, i64 1, !dbg !21 + %153 = fadd float %152, 0xC168000FE0000000, !dbg !21 + %154 = fneg float %153, !dbg !21 + %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %156 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %.0.i129 = select i1 %.not3.i128, float %156, float %155, !dbg !21 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %.01.i131 = select i1 %.not4.i130, float %158, float %157, !dbg !21 + %159 = bitcast <2 x float> %143 to <2 x i32>, !dbg !21 + %160 = shl <2 x i32> %159, splat (i32 23), !dbg !21 + %161 = bitcast <2 x i32> %160 to <2 x float>, !dbg !21 + %162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i131) #6, !dbg !21 + %163 = insertelement <2 x float> poison, float %151, i64 0, !dbg !21 + %164 = insertelement <2 x float> %163, float %162, i64 1, !dbg !21 + %165 = fmul <2 x float> %164, %161, !dbg !21 + %166 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %165, !dbg !30 + %167 = fmul <2 x float> %16, %121, !dbg !31 + %168 = fadd <2 x float> %167, %166, !dbg !32 + %169 = fpext <2 x bfloat> %29 to <2 x float>, !dbg !22 + %170 = fcmp ogt <2 x float> %15, %169, !dbg !23 + %171 = or <2 x i1> %31, %170, !dbg !24 + %172 = select <2 x i1> %171, <2 x float> %15, <2 x float> %169, !dbg !25 + %173 = fcmp oeq <2 x float> %172, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop183 = fsub <2 x float> %15, %172, !dbg !27 + %174 = extractelement <2 x float> %foldExtExtBinop183, i64 0, !dbg !27 + %foldExtExtBinop185 = fsub <2 x float> %15, %172, !dbg !27 + %175 = extractelement <2 x float> %foldExtExtBinop185, i64 1, !dbg !27 + %176 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %177 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i93 = select i1 %.not.i92, float %177, float %176, !dbg !21 + %178 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i93) #6, !dbg !21 + %179 = tail call float @llvm.nvvm.saturate.f(float %.02.i93) #6, !dbg !21 + %.03.i95 = select i1 %.not1.i94, float %179, float %178, !dbg !21 + %180 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %181 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %182 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %183 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i103 = select i1 %.not.i102, float %183, float %182, !dbg !21 + %184 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i103) #6, !dbg !21 + %185 = tail call float @llvm.nvvm.saturate.f(float %.02.i103) #6, !dbg !21 + %.03.i105 = select i1 %.not1.i104, float %185, float %184, !dbg !21 + %186 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !21 + %187 = insertelement <2 x i32> %186, i32 %49, i64 1, !dbg !21 + %188 = icmp eq <2 x i32> %187, zeroinitializer, !dbg !21 + %189 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %190 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %191 = insertelement <2 x float> poison, float %181, i64 0, !dbg !21 + %192 = insertelement <2 x float> %191, float %190, i64 1, !dbg !21 + %193 = insertelement <2 x float> poison, float %180, i64 0, !dbg !21 + %194 = insertelement <2 x float> %193, float %189, i64 1, !dbg !21 + %195 = select <2 x i1> %188, <2 x float> %192, <2 x float> %194, !dbg !21 + %196 = extractelement <2 x float> %195, i64 0, !dbg !21 + %197 = fadd float %196, 0xC168000FE0000000, !dbg !21 + %198 = fneg float %197, !dbg !21 + %199 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %200 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %.0.i99 = select i1 %.not3.i98, float %200, float %199, !dbg !21 + %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %202 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %.01.i101 = select i1 %.not4.i100, float %202, float %201, !dbg !21 + %203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i101) #6, !dbg !21 + %204 = extractelement <2 x float> %195, i64 1, !dbg !21 + %205 = fadd float %204, 0xC168000FE0000000, !dbg !21 + %206 = fneg float %205, !dbg !21 + %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %208 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %.0.i109 = select i1 %.not3.i108, float %208, float %207, !dbg !21 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %.01.i111 = select i1 %.not4.i110, float %210, float %209, !dbg !21 + %211 = bitcast <2 x float> %195 to <2 x i32>, !dbg !21 + %212 = shl <2 x i32> %211, splat (i32 23), !dbg !21 + %213 = bitcast <2 x i32> %212 to <2 x float>, !dbg !21 + %214 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i111) #6, !dbg !21 + %215 = insertelement <2 x float> poison, float %203, i64 0, !dbg !21 + %216 = insertelement <2 x float> %215, float %214, i64 1, !dbg !21 + %217 = fmul <2 x float> %216, %213, !dbg !21 + %218 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %217, !dbg !28 + %foldExtExtBinop187 = fsub <2 x float> %169, %172, !dbg !29 + %219 = extractelement <2 x float> %foldExtExtBinop187, i64 0, !dbg !29 + %foldExtExtBinop189 = fsub <2 x float> %169, %172, !dbg !29 + %220 = extractelement <2 x float> %foldExtExtBinop189, i64 1, !dbg !29 + %221 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %222 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i133 = select i1 %.not.i132, float %222, float %221, !dbg !21 + %223 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i133) #6, !dbg !21 + %224 = tail call float @llvm.nvvm.saturate.f(float %.02.i133) #6, !dbg !21 + %.03.i135 = select i1 %.not1.i134, float %224, float %223, !dbg !21 + %225 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %226 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %228 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i143 = select i1 %.not.i142, float %228, float %227, !dbg !21 + %229 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i143) #6, !dbg !21 + %230 = tail call float @llvm.nvvm.saturate.f(float %.02.i143) #6, !dbg !21 + %.03.i145 = select i1 %.not1.i144, float %230, float %229, !dbg !21 + %231 = insertelement <2 x i32> poison, i32 %64, i64 0, !dbg !21 + %232 = insertelement <2 x i32> %231, i32 %69, i64 1, !dbg !21 + %233 = icmp eq <2 x i32> %232, zeroinitializer, !dbg !21 + %234 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %235 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %236 = insertelement <2 x float> poison, float %226, i64 0, !dbg !21 + %237 = insertelement <2 x float> %236, float %235, i64 1, !dbg !21 + %238 = insertelement <2 x float> poison, float %225, i64 0, !dbg !21 + %239 = insertelement <2 x float> %238, float %234, i64 1, !dbg !21 + %240 = select <2 x i1> %233, <2 x float> %237, <2 x float> %239, !dbg !21 + %241 = extractelement <2 x float> %240, i64 0, !dbg !21 + %242 = fadd float %241, 0xC168000FE0000000, !dbg !21 + %243 = fneg float %242, !dbg !21 + %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %245 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %.0.i139 = select i1 %.not3.i138, float %245, float %244, !dbg !21 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %.01.i141 = select i1 %.not4.i140, float %247, float %246, !dbg !21 + %248 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i141) #6, !dbg !21 + %249 = extractelement <2 x float> %240, i64 1, !dbg !21 + %250 = fadd float %249, 0xC168000FE0000000, !dbg !21 + %251 = fneg float %250, !dbg !21 + %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %253 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %.0.i149 = select i1 %.not3.i148, float %253, float %252, !dbg !21 + %254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %255 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %.01.i151 = select i1 %.not4.i150, float %255, float %254, !dbg !21 + %256 = bitcast <2 x float> %240 to <2 x i32>, !dbg !21 + %257 = shl <2 x i32> %256, splat (i32 23), !dbg !21 + %258 = bitcast <2 x i32> %257 to <2 x float>, !dbg !21 + %259 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i151) #6, !dbg !21 + %260 = insertelement <2 x float> poison, float %248, i64 0, !dbg !21 + %261 = insertelement <2 x float> %260, float %259, i64 1, !dbg !21 + %262 = fmul <2 x float> %261, %258, !dbg !21 + %263 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %262, !dbg !30 + %264 = fmul <2 x float> %14, %218, !dbg !31 + %265 = fadd <2 x float> %264, %263, !dbg !32 + %266 = insertelement <2 x i1> poison, i1 %19, i64 0, !dbg !33 + %267 = shufflevector <2 x i1> %266, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %268 = select <2 x i1> %267, <2 x float> %75, <2 x float> %17, !dbg !33 + %269 = select <2 x i1> %267, <2 x float> %172, <2 x float> %15, !dbg !33 + %270 = select <2 x i1> %267, <2 x float> %168, <2 x float> %16, !dbg !34 + %271 = select <2 x i1> %267, <2 x float> %265, <2 x float> %14, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !11 + %272 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !11 + br i1 %272, label %13, label %273, !dbg !11 + +273: ; preds = %13 + %274 = and i32 %8, 31, !dbg !9 + %275 = lshr i32 %8, 5, !dbg !9 + %276 = extractelement <2 x float> %268, i64 0, !dbg !35 + %277 = extractelement <2 x float> %268, i64 1, !dbg !35 + %278 = fcmp ogt float %276, %277, !dbg !35 + %279 = fcmp uno float %276, 0.000000e+00, !dbg !37 + %280 = or i1 %278, %279, !dbg !38 + %281 = select i1 %280, float %276, float %277, !dbg !39 + %282 = extractelement <2 x float> %269, i64 0, !dbg !35 + %283 = fcmp ogt float %281, %282, !dbg !35 + %284 = fcmp uno float %281, 0.000000e+00, !dbg !37 + %285 = or i1 %283, %284, !dbg !38 + %286 = select i1 %285, float %281, float %282, !dbg !39 + %287 = extractelement <2 x float> %269, i64 1, !dbg !35 + %288 = fcmp ogt float %286, %287, !dbg !35 + %289 = fcmp uno float %286, 0.000000e+00, !dbg !37 + %290 = or i1 %288, %289, !dbg !38 + %291 = select i1 %290, float %286, float %287, !dbg !39 + %292 = bitcast float %291 to i32, !dbg !40 + %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !40 + %294 = bitcast i32 %293 to float, !dbg !40 + %295 = fcmp ogt float %291, %294, !dbg !35 + %296 = fcmp uno float %291, 0.000000e+00, !dbg !37 + %297 = or i1 %296, %295, !dbg !38 + %298 = select i1 %297, float %291, float %294, !dbg !39 + %299 = bitcast float %298 to i32, !dbg !40 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !40 + %301 = bitcast i32 %300 to float, !dbg !40 + %302 = fcmp ogt float %298, %301, !dbg !35 + %303 = fcmp uno float %298, 0.000000e+00, !dbg !37 + %304 = or i1 %302, %303, !dbg !38 + %305 = select i1 %304, float %298, float %301, !dbg !39 + %306 = bitcast float %305 to i32, !dbg !40 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 4, i32 31), !dbg !40 + %308 = bitcast i32 %307 to float, !dbg !40 + %309 = fcmp ogt float %305, %308, !dbg !35 + %310 = fcmp uno float %305, 0.000000e+00, !dbg !37 + %311 = or i1 %309, %310, !dbg !38 + %312 = select i1 %311, float %305, float %308, !dbg !39 + %313 = bitcast float %312 to i32, !dbg !40 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 2, i32 31), !dbg !40 + %315 = bitcast i32 %314 to float, !dbg !40 + %316 = fcmp ogt float %312, %315, !dbg !35 + %317 = fcmp uno float %312, 0.000000e+00, !dbg !37 + %318 = or i1 %316, %317, !dbg !38 + %319 = select i1 %318, float %312, float %315, !dbg !39 + %320 = bitcast float %319 to i32, !dbg !40 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !40 + %322 = bitcast i32 %321 to float, !dbg !40 + %323 = fcmp ogt float %319, %322, !dbg !35 + %324 = fcmp uno float %319, 0.000000e+00, !dbg !37 + %325 = or i1 %323, %324, !dbg !38 + %326 = and i32 %275, 15, !dbg !40 + %327 = icmp eq i32 %274, 0, !dbg !40 + %328 = getelementptr float, ptr addrspace(3) @global_smem, i32 %326, !dbg !40 + %329 = select i1 %325, i32 %320, i32 %321, !dbg !39 + %330 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %330, i1 %327) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %331 = icmp samesign ult i32 %8, 16, !dbg !40 + %332 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !40 + %333 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !40 + %334 = bitcast i32 %333 to float, !dbg !40 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 8, i32 31), !dbg !40 + %336 = bitcast i32 %335 to float, !dbg !40 + %337 = fcmp ogt float %334, %336, !dbg !35 + %338 = fcmp uno float %334, 0.000000e+00, !dbg !37 + %339 = or i1 %338, %337, !dbg !38 + %340 = select i1 %339, float %334, float %336, !dbg !39 + %341 = bitcast float %340 to i32, !dbg !40 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %341, i32 4, i32 31), !dbg !40 + %343 = bitcast i32 %342 to float, !dbg !40 + %344 = fcmp ogt float %340, %343, !dbg !35 + %345 = fcmp uno float %340, 0.000000e+00, !dbg !37 + %346 = or i1 %344, %345, !dbg !38 + %347 = select i1 %346, float %340, float %343, !dbg !39 + %348 = bitcast float %347 to i32, !dbg !40 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !40 + %350 = bitcast i32 %349 to float, !dbg !40 + %351 = fcmp ogt float %347, %350, !dbg !35 + %352 = fcmp uno float %347, 0.000000e+00, !dbg !37 + %353 = or i1 %351, %352, !dbg !38 + %354 = select i1 %353, float %347, float %350, !dbg !39 + %355 = bitcast float %354 to i32, !dbg !40 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %355, i32 1, i32 31), !dbg !40 + %357 = bitcast i32 %356 to float, !dbg !40 + %358 = fcmp ogt float %354, %357, !dbg !35 + %359 = fcmp uno float %354, 0.000000e+00, !dbg !37 + %360 = or i1 %358, %359, !dbg !38 + %361 = icmp eq i32 %8, 0, !dbg !40 + %362 = select i1 %360, i32 %355, i32 %356, !dbg !39 + %363 = insertelement <1 x i32> poison, i32 %362, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %363, i1 %361) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %364 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !40 + %365 = fcmp oeq float %364, 0xFFF0000000000000, !dbg !41 + %366 = fsub float %276, %364, !dbg !42 + %367 = fsub float %277, %364, !dbg !42 + %368 = fsub float %282, %364, !dbg !42 + %369 = fsub float %287, %364, !dbg !42 + %370 = select i1 %365, float 0.000000e+00, float %366, !dbg !43 + %371 = select i1 %365, float 0.000000e+00, float %367, !dbg !43 + %372 = select i1 %365, float 0.000000e+00, float %368, !dbg !43 + %373 = select i1 %365, float 0.000000e+00, float %369, !dbg !43 + %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %374, 0, !dbg !44 + %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %376 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i = select i1 %.not.i, float %376, float %375, !dbg !44 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %377, 0, !dbg !44 + %378 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !44 + %379 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !44 + %.03.i = select i1 %.not1.i, float %379, float %378, !dbg !44 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %381 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %382 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %383, 0, !dbg !44 + %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %384, 0, !dbg !44 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i2 = icmp eq i32 %385, 0, !dbg !44 + %386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %387 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i3 = select i1 %.not.i2, float %387, float %386, !dbg !44 + %388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i4 = icmp eq i32 %388, 0, !dbg !44 + %389 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i3) #6, !dbg !44 + %390 = tail call float @llvm.nvvm.saturate.f(float %.02.i3) #6, !dbg !44 + %.03.i5 = select i1 %.not1.i4, float %390, float %389, !dbg !44 + %391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %392 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %393 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i8 = icmp eq i32 %394, 0, !dbg !44 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i10 = icmp eq i32 %395, 0, !dbg !44 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i12 = icmp eq i32 %396, 0, !dbg !44 + %397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %398 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i13 = select i1 %.not.i12, float %398, float %397, !dbg !44 + %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i14 = icmp eq i32 %399, 0, !dbg !44 + %400 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i13) #6, !dbg !44 + %401 = tail call float @llvm.nvvm.saturate.f(float %.02.i13) #6, !dbg !44 + %.03.i15 = select i1 %.not1.i14, float %401, float %400, !dbg !44 + %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %403 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %404 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i18 = icmp eq i32 %405, 0, !dbg !44 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i20 = icmp eq i32 %406, 0, !dbg !44 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i22 = icmp eq i32 %407, 0, !dbg !44 + %408 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %409 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i23 = select i1 %.not.i22, float %409, float %408, !dbg !44 + %410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i24 = icmp eq i32 %410, 0, !dbg !44 + %411 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i23) #6, !dbg !44 + %412 = tail call float @llvm.nvvm.saturate.f(float %.02.i23) #6, !dbg !44 + %.03.i25 = select i1 %.not1.i24, float %412, float %411, !dbg !44 + %413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %414 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %415 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i28 = icmp eq i32 %416, 0, !dbg !44 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i30 = icmp eq i32 %417, 0, !dbg !44 + %418 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !44 + %419 = insertelement <2 x i32> %418, i32 %391, i64 1, !dbg !44 + %420 = icmp eq <2 x i32> %419, zeroinitializer, !dbg !44 + %421 = insertelement <2 x float> poison, float %382, i64 0, !dbg !44 + %422 = insertelement <2 x float> %421, float %393, i64 1, !dbg !44 + %423 = insertelement <2 x float> poison, float %381, i64 0, !dbg !44 + %424 = insertelement <2 x float> %423, float %392, i64 1, !dbg !44 + %425 = select <2 x i1> %420, <2 x float> %422, <2 x float> %424, !dbg !44 + %426 = extractelement <2 x float> %425, i64 0, !dbg !44 + %427 = fadd float %426, 0xC168000FE0000000, !dbg !44 + %428 = fneg float %427, !dbg !44 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %.0.i = select i1 %.not3.i, float %430, float %429, !dbg !44 + %431 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %432 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %.01.i = select i1 %.not4.i, float %432, float %431, !dbg !44 + %433 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !44 + %434 = extractelement <2 x float> %425, i64 1, !dbg !44 + %435 = fadd float %434, 0xC168000FE0000000, !dbg !44 + %436 = fneg float %435, !dbg !44 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %.0.i9 = select i1 %.not3.i8, float %438, float %437, !dbg !44 + %439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %440 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %.01.i11 = select i1 %.not4.i10, float %440, float %439, !dbg !44 + %441 = bitcast <2 x float> %425 to <2 x i32>, !dbg !44 + %442 = shl <2 x i32> %441, splat (i32 23), !dbg !44 + %443 = bitcast <2 x i32> %442 to <2 x float>, !dbg !44 + %444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i11) #6, !dbg !44 + %445 = insertelement <2 x float> poison, float %433, i64 0, !dbg !44 + %446 = insertelement <2 x float> %445, float %444, i64 1, !dbg !44 + %447 = fmul <2 x float> %446, %443, !dbg !44 + %448 = fmul <2 x float> %270, %447, !dbg !45 + %449 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !44 + %450 = insertelement <2 x i32> %449, i32 %413, i64 1, !dbg !44 + %451 = icmp eq <2 x i32> %450, zeroinitializer, !dbg !44 + %452 = insertelement <2 x float> poison, float %404, i64 0, !dbg !44 + %453 = insertelement <2 x float> %452, float %415, i64 1, !dbg !44 + %454 = insertelement <2 x float> poison, float %403, i64 0, !dbg !44 + %455 = insertelement <2 x float> %454, float %414, i64 1, !dbg !44 + %456 = select <2 x i1> %451, <2 x float> %453, <2 x float> %455, !dbg !44 + %457 = extractelement <2 x float> %456, i64 0, !dbg !44 + %458 = fadd float %457, 0xC168000FE0000000, !dbg !44 + %459 = fneg float %458, !dbg !44 + %460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %461 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %.0.i19 = select i1 %.not3.i18, float %461, float %460, !dbg !44 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %.01.i21 = select i1 %.not4.i20, float %463, float %462, !dbg !44 + %464 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i21) #6, !dbg !44 + %465 = extractelement <2 x float> %456, i64 1, !dbg !44 + %466 = fadd float %465, 0xC168000FE0000000, !dbg !44 + %467 = fneg float %466, !dbg !44 + %468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %469 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %.0.i29 = select i1 %.not3.i28, float %469, float %468, !dbg !44 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %.01.i31 = select i1 %.not4.i30, float %471, float %470, !dbg !44 + %472 = bitcast <2 x float> %456 to <2 x i32>, !dbg !44 + %473 = shl <2 x i32> %472, splat (i32 23), !dbg !44 + %474 = bitcast <2 x i32> %473 to <2 x float>, !dbg !44 + %475 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i31) #6, !dbg !44 + %476 = insertelement <2 x float> poison, float %464, i64 0, !dbg !44 + %477 = insertelement <2 x float> %476, float %475, i64 1, !dbg !44 + %478 = fmul <2 x float> %477, %474, !dbg !44 + %479 = fmul <2 x float> %271, %478, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %shift = shufflevector <2 x float> %448, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop191 = fadd <2 x float> %448, %shift, !dbg !49 + %foldExtExtBinop193 = fadd <2 x float> %foldExtExtBinop191, %479, !dbg !49 + %shift195 = shufflevector <2 x float> %479, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop196 = fadd <2 x float> %foldExtExtBinop193, %shift195, !dbg !49 + %480 = extractelement <2 x float> %foldExtExtBinop196, i64 0, !dbg !49 + %481 = bitcast float %480 to i32, !dbg !46 + %482 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %481, i32 16, i32 31), !dbg !46 + %483 = bitcast i32 %482 to float, !dbg !46 + %484 = fadd float %480, %483, !dbg !49 + %485 = bitcast float %484 to i32, !dbg !46 + %486 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %485, i32 8, i32 31), !dbg !46 + %487 = bitcast i32 %486 to float, !dbg !46 + %488 = fadd float %484, %487, !dbg !49 + %489 = bitcast float %488 to i32, !dbg !46 + %490 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 4, i32 31), !dbg !46 + %491 = bitcast i32 %490 to float, !dbg !46 + %492 = fadd float %488, %491, !dbg !49 + %493 = bitcast float %492 to i32, !dbg !46 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 2, i32 31), !dbg !46 + %495 = bitcast i32 %494 to float, !dbg !46 + %496 = fadd float %492, %495, !dbg !49 + %497 = bitcast float %496 to i32, !dbg !46 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 1, i32 31), !dbg !46 + %499 = bitcast i32 %498 to float, !dbg !46 + %500 = fadd float %496, %499, !dbg !49 + %501 = bitcast float %500 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %501, i1 %327) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %502 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !46 + %503 = bitcast i32 %502 to float, !dbg !46 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %502, i32 8, i32 31), !dbg !46 + %505 = bitcast i32 %504 to float, !dbg !46 + %506 = fadd float %503, %505, !dbg !49 + %507 = bitcast float %506 to i32, !dbg !46 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 4, i32 31), !dbg !46 + %509 = bitcast i32 %508 to float, !dbg !46 + %510 = fadd float %506, %509, !dbg !49 + %511 = bitcast float %510 to i32, !dbg !46 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 2, i32 31), !dbg !46 + %513 = bitcast i32 %512 to float, !dbg !46 + %514 = fadd float %510, %513, !dbg !49 + %515 = bitcast float %514 to i32, !dbg !46 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !46 + %517 = bitcast i32 %516 to float, !dbg !46 + %518 = fadd float %514, %517, !dbg !49 + %519 = bitcast float %518 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %519, i1 %361) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %520 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !46 + br label %521, !dbg !50 + +521: ; preds = %273, %521 + %indvars.iv160 = phi i64 [ 0, %273 ], [ %indvars.iv.next161, %521 ] + %522 = or disjoint i64 %indvars.iv160, %12, !dbg !51 + %523 = icmp samesign ult i64 %522, 32000, !dbg !52 + %524 = trunc nuw nsw i64 %522 to i32, !dbg !53 + %525 = add i32 %11, %524, !dbg !53 + %526 = sext i32 %525 to i64, !dbg !54 + %527 = getelementptr bfloat, ptr addrspace(1) %0, i64 %526, !dbg !54 + %528 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %529 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %527, i64 %528, i1 %523) #6, !dbg !55 + %530 = extractvalue { i32, i32 } %529, 0, !dbg !55 + %531 = bitcast i32 %530 to <2 x bfloat>, !dbg !55 + %532 = extractvalue { i32, i32 } %529, 1, !dbg !55 + %533 = bitcast i32 %532 to <2 x bfloat>, !dbg !55 + %534 = extractelement <2 x bfloat> %531, i64 0, !dbg !55 + %535 = extractelement <2 x bfloat> %531, i64 1, !dbg !55 + %536 = extractelement <2 x bfloat> %533, i64 0, !dbg !55 + %537 = extractelement <2 x bfloat> %533, i64 1, !dbg !55 + %538 = fpext bfloat %534 to float, !dbg !56 + %539 = fpext bfloat %535 to float, !dbg !56 + %540 = fpext bfloat %536 to float, !dbg !56 + %541 = fpext bfloat %537 to float, !dbg !56 + %542 = fsub float %538, %364, !dbg !57 + %543 = fsub float %539, %364, !dbg !57 + %544 = fsub float %540, %364, !dbg !57 + %545 = fsub float %541, %364, !dbg !57 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i32 = icmp eq i32 %546, 0, !dbg !58 + %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %548 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i33 = select i1 %.not.i32, float %548, float %547, !dbg !58 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i34 = icmp eq i32 %549, 0, !dbg !58 + %550 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i33) #6, !dbg !58 + %551 = tail call float @llvm.nvvm.saturate.f(float %.02.i33) #6, !dbg !58 + %.03.i35 = select i1 %.not1.i34, float %551, float %550, !dbg !58 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i36 = icmp eq i32 %552, 0, !dbg !58 + %553 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %554 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i37 = select i1 %.not2.i36, float %554, float %553, !dbg !58 + %555 = fadd float %.04.i37, 0xC168000FE0000000, !dbg !58 + %556 = fneg float %555, !dbg !58 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i38 = icmp eq i32 %557, 0, !dbg !58 + %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %559 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %.0.i39 = select i1 %.not3.i38, float %559, float %558, !dbg !58 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i40 = icmp eq i32 %560, 0, !dbg !58 + %561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %562 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %.01.i41 = select i1 %.not4.i40, float %562, float %561, !dbg !58 + %563 = bitcast float %.04.i37 to i32, !dbg !58 + %564 = shl i32 %563, 23, !dbg !58 + %565 = bitcast i32 %564 to float, !dbg !58 + %566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i41) #6, !dbg !58 + %567 = fmul float %566, %565, !dbg !58 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i42 = icmp eq i32 %568, 0, !dbg !58 + %569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %570 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i43 = select i1 %.not.i42, float %570, float %569, !dbg !58 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i44 = icmp eq i32 %571, 0, !dbg !58 + %572 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i43) #6, !dbg !58 + %573 = tail call float @llvm.nvvm.saturate.f(float %.02.i43) #6, !dbg !58 + %.03.i45 = select i1 %.not1.i44, float %573, float %572, !dbg !58 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i46 = icmp eq i32 %574, 0, !dbg !58 + %575 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %576 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i47 = select i1 %.not2.i46, float %576, float %575, !dbg !58 + %577 = fadd float %.04.i47, 0xC168000FE0000000, !dbg !58 + %578 = fneg float %577, !dbg !58 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i48 = icmp eq i32 %579, 0, !dbg !58 + %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %581 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %.0.i49 = select i1 %.not3.i48, float %581, float %580, !dbg !58 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i50 = icmp eq i32 %582, 0, !dbg !58 + %583 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %584 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %.01.i51 = select i1 %.not4.i50, float %584, float %583, !dbg !58 + %585 = bitcast float %.04.i47 to i32, !dbg !58 + %586 = shl i32 %585, 23, !dbg !58 + %587 = bitcast i32 %586 to float, !dbg !58 + %588 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i51) #6, !dbg !58 + %589 = fmul float %588, %587, !dbg !58 + %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i52 = icmp eq i32 %590, 0, !dbg !58 + %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %592 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i53 = select i1 %.not.i52, float %592, float %591, !dbg !58 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i54 = icmp eq i32 %593, 0, !dbg !58 + %594 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i53) #6, !dbg !58 + %595 = tail call float @llvm.nvvm.saturate.f(float %.02.i53) #6, !dbg !58 + %.03.i55 = select i1 %.not1.i54, float %595, float %594, !dbg !58 + %596 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i56 = icmp eq i32 %596, 0, !dbg !58 + %597 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %598 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i57 = select i1 %.not2.i56, float %598, float %597, !dbg !58 + %599 = fadd float %.04.i57, 0xC168000FE0000000, !dbg !58 + %600 = fneg float %599, !dbg !58 + %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i58 = icmp eq i32 %601, 0, !dbg !58 + %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %603 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %.0.i59 = select i1 %.not3.i58, float %603, float %602, !dbg !58 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i60 = icmp eq i32 %604, 0, !dbg !58 + %605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %606 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %.01.i61 = select i1 %.not4.i60, float %606, float %605, !dbg !58 + %607 = bitcast float %.04.i57 to i32, !dbg !58 + %608 = shl i32 %607, 23, !dbg !58 + %609 = bitcast i32 %608 to float, !dbg !58 + %610 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i61) #6, !dbg !58 + %611 = fmul float %610, %609, !dbg !58 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i62 = icmp eq i32 %612, 0, !dbg !58 + %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %614 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i63 = select i1 %.not.i62, float %614, float %613, !dbg !58 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i64 = icmp eq i32 %615, 0, !dbg !58 + %616 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i63) #6, !dbg !58 + %617 = tail call float @llvm.nvvm.saturate.f(float %.02.i63) #6, !dbg !58 + %.03.i65 = select i1 %.not1.i64, float %617, float %616, !dbg !58 + %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i66 = icmp eq i32 %618, 0, !dbg !58 + %619 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %620 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i67 = select i1 %.not2.i66, float %620, float %619, !dbg !58 + %621 = fadd float %.04.i67, 0xC168000FE0000000, !dbg !58 + %622 = fneg float %621, !dbg !58 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i68 = icmp eq i32 %623, 0, !dbg !58 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %.0.i69 = select i1 %.not3.i68, float %625, float %624, !dbg !58 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i70 = icmp eq i32 %626, 0, !dbg !58 + %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %628 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %.01.i71 = select i1 %.not4.i70, float %628, float %627, !dbg !58 + %629 = bitcast float %.04.i67 to i32, !dbg !58 + %630 = shl i32 %629, 23, !dbg !58 + %631 = bitcast i32 %630 to float, !dbg !58 + %632 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i71) #6, !dbg !58 + %633 = fmul float %632, %631, !dbg !58 + %634 = tail call float @llvm.nvvm.div.full(float %567, float %520), !dbg !59 + %635 = tail call float @llvm.nvvm.div.full(float %589, float %520), !dbg !59 + %636 = tail call float @llvm.nvvm.div.full(float %611, float %520), !dbg !59 + %637 = tail call float @llvm.nvvm.div.full(float %633, float %520), !dbg !59 + %638 = getelementptr float, ptr addrspace(1) %1, i64 %526, !dbg !60 + %639 = bitcast float %634 to i32, !dbg !61 + %640 = bitcast float %635 to i32, !dbg !61 + %641 = bitcast float %636 to i32, !dbg !61 + %642 = bitcast float %637 to i32, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %639, i32 %640, i32 %641, i32 %642, ptr addrspace(1) %638, i1 %523) #6, !dbg !61 + %indvars.iv.next161 = add nuw nsw i64 %indvars.iv160, 2048, !dbg !50 + %643 = icmp samesign ult i64 %indvars.iv160, 29952, !dbg !50 + br i1 %643, label %521, label %644, !dbg !50 + +644: ; preds = %521 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 37, column: 47, scope: !5) +!11 = !DILocation(line: 31, column: 40, scope: !5) +!12 = !DILocation(line: 32, column: 31, scope: !5) +!13 = !DILocation(line: 33, column: 29, scope: !5) +!14 = !DILocation(line: 37, column: 41, scope: !5) +!15 = !DILocation(line: 37, column: 34, scope: !5) +!16 = !DILocation(line: 37, column: 52, scope: !5) +!17 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 40, scope: !5) +!21 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 37, column: 105, scope: !5) +!23 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 196, column: 19, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 196, column: 53, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 196, column: 39, scope: !18, inlinedAt: !20) +!29 = !DILocation(line: 199, column: 53, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 199, column: 39, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 205, column: 24, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 205, column: 36, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 45, column: 54, scope: !5) +!34 = !DILocation(line: 46, column: 54, scope: !5) +!35 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !36) +!36 = !DILocation(line: 49, column: 33, scope: !5) +!37 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !36) +!38 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !36) +!39 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !36) +!40 = !DILocation(line: 123, column: 29, scope: !18, inlinedAt: !36) +!41 = !DILocation(line: 180, column: 40, scope: !18, inlinedAt: !36) +!42 = !DILocation(line: 180, column: 68, scope: !18, inlinedAt: !36) +!43 = !DILocation(line: 180, column: 58, scope: !18, inlinedAt: !36) +!44 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !36) +!45 = !DILocation(line: 181, column: 31, scope: !18, inlinedAt: !36) +!46 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !36) +!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !36) +!50 = !DILocation(line: 52, column: 40, scope: !5) +!51 = !DILocation(line: 53, column: 31, scope: !5) +!52 = !DILocation(line: 54, column: 29, scope: !5) +!53 = !DILocation(line: 58, column: 41, scope: !5) +!54 = !DILocation(line: 58, column: 34, scope: !5) +!55 = !DILocation(line: 58, column: 52, scope: !5) +!56 = !DILocation(line: 58, column: 106, scope: !5) +!57 = !DILocation(line: 60, column: 22, scope: !5) +!58 = !DILocation(line: 61, column: 29, scope: !5) +!59 = !DILocation(line: 62, column: 23, scope: !5) +!60 = !DILocation(line: 63, column: 29, scope: !5) +!61 = !DILocation(line: 63, column: 53, scope: !5) +!62 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..18394d2b4fa8cdcfcdfcb3bff09c229c526d965b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,921 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<49>; + .reg .b16 %rs<9>; + .reg .b32 %r<354>; + .reg .b64 %rd<68>; + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd15, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + mov.u32 %r1, %tid.x; + shl.b32 %r5, %r1, 2; + and.b32 %r6, %r5, 2044; + .loc 1 31 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:31:40 + cvt.u64.u32 %rd1, %r6; + mad.lo.s32 %r7, %r4, 32000, %r6; + cvt.u64.u32 %rd2, %r7; + mov.b32 %r8, 0fFF800000; + mov.b64 %rd64, {%r8, %r8}; + mov.b32 %r9, 0f00000000; + mov.b64 %rd63, {%r9, %r9}; + mov.b64 %rd62, 0; + mov.b64 %rd65, %rd63; + mov.b64 %rd66, %rd64; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 33 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:33:29 + add.s64 %rd23, %rd1, %rd62; + setp.lt.u64 %p1, %rd23, 32000; + .loc 1 37 34 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:34 + add.s64 %rd24, %rd2, %rd62; + cvt.u32.u64 %r14, %rd24; + mad.wide.s32 %rd21, %r14, 2, %rd15; + .loc 1 37 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:52 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r12, 0; + // begin inline asm + mov.u32 %r10, %r12; + mov.u32 %r11, %r12; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd21 + 0 ], %rd20; + // end inline asm +$L__tmp1: + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r15, %r16}, %rd66; + setp.nan.f32 %p2, %r15, %r15; + setp.nan.f32 %p3, %r16, %r16; + mov.b64 {%r17, %r18}, %rd64; + setp.nan.f32 %p4, %r17, %r17; + setp.nan.f32 %p5, %r18, %r18; +$L__tmp2: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs1, %rs2}, %r10; + cvt.f32.bf16 %r19, %rs2; + cvt.f32.bf16 %r20, %rs1; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p6, %r15, %r20; + setp.gt.f32 %p7, %r16, %r19; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r21, %r16, %r19, %p7; + selp.f32 %r22, %r16, %r21, %p3; + selp.f32 %r23, %r15, %r20, %p6; + selp.f32 %r24, %r15, %r23, %p2; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p8, %r24, 0fFF800000; + setp.eq.f32 %p9, %r22, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r25, %r16, %r22; + sub.f32 %r26, %r15, %r24; + mov.b32 %r27, 0f3F000000; + mov.b32 %r28, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r29, %r26, %r28, %r27; + cvt.ftz.sat.f32.f32 %r30, %r29; + mov.b32 %r31, 0f4B400001; + mov.b32 %r32, 0f437C0000; + fma.rm.ftz.f32 %r33, %r30, %r32, %r31; + fma.rn.ftz.f32 %r34, %r25, %r28, %r27; + cvt.ftz.sat.f32.f32 %r35, %r34; + fma.rm.ftz.f32 %r36, %r35, %r32, %r31; + mov.b64 %rd25, {%r33, %r36}; + cvt.u32.u64 %r37, %rd25; + add.f32 %r38, %r33, 0fCB40007F; + neg.f32 %r39, %r38; + mov.b32 %r40, 0f3FB8AA3B; + fma.rn.ftz.f32 %r41, %r26, %r40, %r39; + mov.b32 %r42, 0f32A57060; + fma.rn.ftz.f32 %r43, %r26, %r42, %r41; + ex2.approx.ftz.f32 %r44, %r43; + add.f32 %r45, %r36, 0fCB40007F; + neg.f32 %r46, %r45; + fma.rn.ftz.f32 %r47, %r25, %r40, %r46; + fma.rn.ftz.f32 %r48, %r25, %r42, %r47; + shl.b64 %rd26, %rd25, 23; + and.b64 %rd27, %rd26, -36028797018963968; + shl.b32 %r49, %r37, 23; + cvt.u64.u32 %rd28, %r49; + or.b64 %rd29, %rd28, %rd27; + ex2.approx.ftz.f32 %r50, %r48; + mov.b64 {%r51, %r52}, %rd29; + mul.f32 %r53, %r44, %r51; + mul.f32 %r54, %r50, %r52; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r55, 0f3F800000, %r54, %p9; + selp.f32 %r56, 0f3F800000, %r53, %p8; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r57, %r19, %r22; + sub.f32 %r58, %r20, %r24; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r59, %r58, %r28, %r27; + cvt.ftz.sat.f32.f32 %r60, %r59; + fma.rm.ftz.f32 %r61, %r60, %r32, %r31; + fma.rn.ftz.f32 %r62, %r57, %r28, %r27; + cvt.ftz.sat.f32.f32 %r63, %r62; + fma.rm.ftz.f32 %r64, %r63, %r32, %r31; + mov.b64 %rd30, {%r61, %r64}; + cvt.u32.u64 %r65, %rd30; + add.f32 %r66, %r61, 0fCB40007F; + neg.f32 %r67, %r66; + fma.rn.ftz.f32 %r68, %r58, %r40, %r67; + fma.rn.ftz.f32 %r69, %r58, %r42, %r68; + ex2.approx.ftz.f32 %r70, %r69; + add.f32 %r71, %r64, 0fCB40007F; + neg.f32 %r72, %r71; + fma.rn.ftz.f32 %r73, %r57, %r40, %r72; + fma.rn.ftz.f32 %r74, %r57, %r42, %r73; + shl.b64 %rd31, %rd30, 23; + and.b64 %rd32, %rd31, -36028797018963968; + shl.b32 %r75, %r65, 23; + cvt.u64.u32 %rd33, %r75; + or.b64 %rd34, %rd33, %rd32; + ex2.approx.ftz.f32 %r76, %r74; + mov.b64 {%r77, %r78}, %rd34; + mul.f32 %r79, %r70, %r77; + mul.f32 %r80, %r76, %r78; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r81, 0f3F800000, %r80, %p9; + selp.f32 %r82, 0f3F800000, %r79, %p8; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r83, %r84}, %rd65; + fma.rn.f32 %r85, %r83, %r56, %r82; + fma.rn.f32 %r86, %r84, %r55, %r81; +$L__tmp4: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs3, %rs4}, %r11; + cvt.f32.bf16 %r87, %rs4; + cvt.f32.bf16 %r88, %rs3; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p10, %r17, %r88; + setp.gt.f32 %p11, %r18, %r87; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r89, %r18, %r87, %p11; + selp.f32 %r90, %r18, %r89, %p5; + selp.f32 %r91, %r17, %r88, %p10; + selp.f32 %r92, %r17, %r91, %p4; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p12, %r92, 0fFF800000; + setp.eq.f32 %p13, %r90, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r93, %r18, %r90; + sub.f32 %r94, %r17, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r95, %r94, %r28, %r27; + cvt.ftz.sat.f32.f32 %r96, %r95; + fma.rm.ftz.f32 %r97, %r96, %r32, %r31; + fma.rn.ftz.f32 %r98, %r93, %r28, %r27; + cvt.ftz.sat.f32.f32 %r99, %r98; + fma.rm.ftz.f32 %r100, %r99, %r32, %r31; + mov.b64 %rd35, {%r97, %r100}; + cvt.u32.u64 %r101, %rd35; + add.f32 %r102, %r97, 0fCB40007F; + neg.f32 %r103, %r102; + fma.rn.ftz.f32 %r104, %r94, %r40, %r103; + fma.rn.ftz.f32 %r105, %r94, %r42, %r104; + ex2.approx.ftz.f32 %r106, %r105; + add.f32 %r107, %r100, 0fCB40007F; + neg.f32 %r108, %r107; + fma.rn.ftz.f32 %r109, %r93, %r40, %r108; + fma.rn.ftz.f32 %r110, %r93, %r42, %r109; + shl.b64 %rd36, %rd35, 23; + and.b64 %rd37, %rd36, -36028797018963968; + shl.b32 %r111, %r101, 23; + cvt.u64.u32 %rd38, %r111; + or.b64 %rd39, %rd38, %rd37; + ex2.approx.ftz.f32 %r112, %r110; + mov.b64 {%r113, %r114}, %rd39; + mul.f32 %r115, %r106, %r113; + mul.f32 %r116, %r112, %r114; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r117, 0f3F800000, %r116, %p13; + selp.f32 %r118, 0f3F800000, %r115, %p12; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r119, %r87, %r90; + sub.f32 %r120, %r88, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r121, %r120, %r28, %r27; + cvt.ftz.sat.f32.f32 %r122, %r121; + fma.rm.ftz.f32 %r123, %r122, %r32, %r31; + fma.rn.ftz.f32 %r124, %r119, %r28, %r27; + cvt.ftz.sat.f32.f32 %r125, %r124; + fma.rm.ftz.f32 %r126, %r125, %r32, %r31; + mov.b64 %rd40, {%r123, %r126}; + cvt.u32.u64 %r127, %rd40; + add.f32 %r128, %r123, 0fCB40007F; + neg.f32 %r129, %r128; + fma.rn.ftz.f32 %r130, %r120, %r40, %r129; + fma.rn.ftz.f32 %r131, %r120, %r42, %r130; + ex2.approx.ftz.f32 %r132, %r131; + add.f32 %r133, %r126, 0fCB40007F; + neg.f32 %r134, %r133; + fma.rn.ftz.f32 %r135, %r119, %r40, %r134; + fma.rn.ftz.f32 %r136, %r119, %r42, %r135; + shl.b64 %rd41, %rd40, 23; + and.b64 %rd42, %rd41, -36028797018963968; + shl.b32 %r137, %r127, 23; + cvt.u64.u32 %rd43, %r137; + or.b64 %rd44, %rd43, %rd42; + ex2.approx.ftz.f32 %r138, %r136; + mov.b64 {%r139, %r140}, %rd44; + mul.f32 %r141, %r132, %r139; + mul.f32 %r142, %r138, %r140; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r143, 0f3F800000, %r142, %p13; + selp.f32 %r144, 0f3F800000, %r141, %p12; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r145, %r146}, %rd63; + fma.rn.f32 %r147, %r145, %r118, %r144; + fma.rn.f32 %r148, %r146, %r117, %r143; +$L__tmp6: + .loc 1 45 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:45:54 + selp.f32 %r149, %r22, %r16, %p1; + selp.f32 %r150, %r24, %r15, %p1; + mov.b64 %rd66, {%r150, %r149}; + selp.f32 %r151, %r90, %r18, %p1; + selp.f32 %r152, %r92, %r17, %p1; + mov.b64 %rd64, {%r152, %r151}; + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r153, %r86, %r84, %p1; + selp.f32 %r154, %r85, %r83, %p1; + mov.b64 %rd65, {%r154, %r153}; + selp.f32 %r155, %r148, %r146, %p1; + selp.f32 %r156, %r147, %r145, %p1; + mov.b64 %rd63, {%r156, %r155}; + .loc 1 31 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:31:40 + add.s64 %rd12, %rd62, 2048; + setp.lt.u64 %p14, %rd62, 29952; + mov.b64 %rd62, %rd12; + @%p14 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + and.b32 %r169, %r1, 31; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r170, %r171}, %rd66; + setp.gt.f32 %p21, %r170, %r171; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p22, %r170, %r170; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r172, %r170, %r171, %p22; + selp.f32 %r173, %r170, %r172, %p21; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r174, %r175}, %rd64; + setp.gt.f32 %p23, %r173, %r174; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p24, %r173, %r173; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r176, %r173, %r174, %p24; + selp.f32 %r177, %r173, %r176, %p23; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p25, %r177, %r175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p26, %r177, %r177; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r178, %r177, %r175, %p26; + selp.f32 %r179, %r177, %r178, %p25; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r180, %r179, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p27, %r179, %r180; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p28, %r179, %r179; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r181, %r179, %r180, %p27; + selp.f32 %r182, %r179, %r181, %p28; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r183, %r182, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p29, %r182, %r183; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p30, %r182, %r182; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r184, %r182, %r183, %p30; + selp.f32 %r185, %r182, %r184, %p29; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r186, %r185, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p31, %r185, %r186; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p32, %r185, %r185; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r187, %r185, %r186, %p32; + selp.f32 %r188, %r185, %r187, %p31; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p33, %r188, %r189; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p34, %r188, %r188; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r190, %r188, %r189, %p34; + selp.f32 %r191, %r188, %r190, %p33; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p35, %r191, %r192; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p36, %r191, %r191; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p15, %r169, 0; + shr.u32 %r193, %r1, 3; + and.b32 %r194, %r193, 60; + mov.b32 %r195, global_smem; + add.s32 %r157, %r195, %r194; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r196, %r191, %r192, %p36; + selp.b32 %r158, %r191, %r196, %p35; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r158; + // end inline asm + bar.sync 0; + setp.lt.u32 %p16, %r1, 16; + add.s32 %r160, %r195, %r5; + // begin inline asm + @%p16 ld.shared.b32 %r159, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r198, %r159, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p37, %r159, %r198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p38, %r159, %r159; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r199, %r159, %r198, %p37; + selp.f32 %r200, %r159, %r199, %p38; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r201, %r200, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p39, %r200, %r201; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p40, %r200, %r200; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r202, %r200, %r201, %p40; + selp.f32 %r203, %r200, %r202, %p39; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r204, %r203, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p41, %r203, %r204; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p42, %r203, %r203; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r205, %r203, %r204, %p42; + selp.f32 %r206, %r203, %r205, %p41; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r207, %r206, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p43, %r206, %r207; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p44, %r206, %r206; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p17, %r1, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r208, %r206, %r207, %p44; + selp.b32 %r162, %r206, %r208, %p43; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r162; + // end inline asm + bar.sync 0; + ld.shared.b32 %r2, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.f32 %p45, %r2, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + sub.f32 %r209, %r170, %r2; + sub.f32 %r210, %r171, %r2; + sub.f32 %r211, %r174, %r2; + sub.f32 %r212, %r175, %r2; + .loc 2 180 58 // triton_helpers.py:180:58 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r213, 0f00000000, %r209, %p45; + selp.f32 %r214, 0f00000000, %r210, %p45; + selp.f32 %r215, 0f00000000, %r211, %p45; + selp.f32 %r216, 0f00000000, %r212, %p45; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.ftz.f32 %r219, %r213, %r28, %r27; + cvt.ftz.sat.f32.f32 %r220, %r219; + fma.rm.ftz.f32 %r223, %r220, %r32, %r31; + fma.rn.ftz.f32 %r224, %r214, %r28, %r27; + cvt.ftz.sat.f32.f32 %r225, %r224; + fma.rm.ftz.f32 %r226, %r225, %r32, %r31; + fma.rn.ftz.f32 %r227, %r215, %r28, %r27; + cvt.ftz.sat.f32.f32 %r228, %r227; + fma.rm.ftz.f32 %r229, %r228, %r32, %r31; + fma.rn.ftz.f32 %r230, %r216, %r28, %r27; + cvt.ftz.sat.f32.f32 %r231, %r230; + fma.rm.ftz.f32 %r232, %r231, %r32, %r31; + mov.b64 %rd46, {%r223, %r226}; + cvt.u32.u64 %r233, %rd46; + add.f32 %r234, %r223, 0fCB40007F; + neg.f32 %r235, %r234; + fma.rn.ftz.f32 %r237, %r213, %r40, %r235; + fma.rn.ftz.f32 %r239, %r213, %r42, %r237; + ex2.approx.ftz.f32 %r240, %r239; + add.f32 %r241, %r226, 0fCB40007F; + neg.f32 %r242, %r241; + fma.rn.ftz.f32 %r243, %r214, %r40, %r242; + fma.rn.ftz.f32 %r244, %r214, %r42, %r243; + shl.b64 %rd47, %rd46, 23; + and.b64 %rd48, %rd47, -36028797018963968; + shl.b32 %r245, %r233, 23; + cvt.u64.u32 %rd49, %r245; + or.b64 %rd50, %rd49, %rd48; + ex2.approx.ftz.f32 %r246, %r244; + mov.b64 {%r247, %r248}, %rd50; + mul.f32 %r249, %r240, %r247; + mul.f32 %r250, %r246, %r248; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r251, %r252}, %rd65; + mul.f32 %r253, %r252, %r250; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd51, {%r229, %r232}; + cvt.u32.u64 %r254, %rd51; + add.f32 %r255, %r229, 0fCB40007F; + neg.f32 %r256, %r255; + fma.rn.ftz.f32 %r257, %r215, %r40, %r256; + fma.rn.ftz.f32 %r258, %r215, %r42, %r257; + ex2.approx.ftz.f32 %r259, %r258; + add.f32 %r260, %r232, 0fCB40007F; + neg.f32 %r261, %r260; + fma.rn.ftz.f32 %r262, %r216, %r40, %r261; + fma.rn.ftz.f32 %r263, %r216, %r42, %r262; + shl.b64 %rd52, %rd51, 23; + and.b64 %rd53, %rd52, -36028797018963968; + shl.b32 %r264, %r254, 23; + cvt.u64.u32 %rd54, %r264; + or.b64 %rd55, %rd54, %rd53; + ex2.approx.ftz.f32 %r265, %r263; + mov.b64 {%r266, %r267}, %rd55; + mul.f32 %r268, %r265, %r267; + mul.f32 %r269, %r259, %r266; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r270, %r271}, %rd63; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.f32 %r272, %r251, %r249, %r253; + fma.rn.f32 %r273, %r270, %r269, %r272; + fma.rn.f32 %r274, %r271, %r268, %r273; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r275, %r274, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r276, %r274, %r275; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r277, %r276, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r278, %r276, %r277; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r279, %r278, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r280, %r278, %r279; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r282, %r280, %r281; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r164, %r282, %r283; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r164; + // end inline asm + bar.sync 0; + // begin inline asm + @%p16 ld.shared.b32 %r165, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r284, %r165, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r285, %r165, %r284; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r286, %r285, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r287, %r285, %r286; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r288, %r287, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r289, %r287, %r288; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r290, %r289, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r168, %r289, %r290; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r168; + // end inline asm + bar.sync 0; + mov.b64 %rd67, 0; + ld.shared.b32 %r3, [global_smem]; +$L__tmp8: +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 54 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:54:29 + add.s64 %rd60, %rd1, %rd67; + setp.lt.u64 %p46, %rd60, 32000; + .loc 1 58 34 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:34 + add.s64 %rd61, %rd2, %rd67; + cvt.u32.u64 %r299, %rd61; + mad.wide.s32 %rd57, %r299, 2, %rd15; + .loc 1 58 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:52 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd56, 1.0; + // end inline asm + mov.b32 %r293, 0; + // begin inline asm + mov.u32 %r291, %r293; + mov.u32 %r292, %r293; + @%p46 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r291, %r292 }, [ %rd57 + 0 ], %rd56; + // end inline asm + mov.b32 {%rs5, %rs6}, %r291; + mov.b32 {%rs7, %rs8}, %r292; + .loc 1 58 106 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:106 + cvt.f32.bf16 %r300, %rs5; + cvt.f32.bf16 %r301, %rs6; + cvt.f32.bf16 %r302, %rs7; + cvt.f32.bf16 %r303, %rs8; + .loc 1 60 22 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:60:22 + sub.f32 %r304, %r300, %r2; + sub.f32 %r305, %r301, %r2; + sub.f32 %r306, %r302, %r2; + sub.f32 %r307, %r303, %r2; + mov.b32 %r308, 0f3F000000; + mov.b32 %r309, 0f3BBB989D; + .loc 1 61 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:61:29 + fma.rn.ftz.f32 %r310, %r304, %r309, %r308; + cvt.ftz.sat.f32.f32 %r311, %r310; + mov.b32 %r312, 0f4B400001; + mov.b32 %r313, 0f437C0000; + fma.rm.ftz.f32 %r314, %r311, %r313, %r312; + add.f32 %r315, %r314, 0fCB40007F; + neg.f32 %r316, %r315; + mov.b32 %r317, 0f3FB8AA3B; + fma.rn.ftz.f32 %r318, %r304, %r317, %r316; + mov.b32 %r319, 0f32A57060; + fma.rn.ftz.f32 %r320, %r304, %r319, %r318; + shl.b32 %r321, %r314, 23; + ex2.approx.ftz.f32 %r322, %r320; + mul.f32 %r323, %r322, %r321; + fma.rn.ftz.f32 %r324, %r305, %r309, %r308; + cvt.ftz.sat.f32.f32 %r325, %r324; + fma.rm.ftz.f32 %r326, %r325, %r313, %r312; + add.f32 %r327, %r326, 0fCB40007F; + neg.f32 %r328, %r327; + fma.rn.ftz.f32 %r329, %r305, %r317, %r328; + fma.rn.ftz.f32 %r330, %r305, %r319, %r329; + shl.b32 %r331, %r326, 23; + ex2.approx.ftz.f32 %r332, %r330; + mul.f32 %r333, %r332, %r331; + fma.rn.ftz.f32 %r334, %r306, %r309, %r308; + cvt.ftz.sat.f32.f32 %r335, %r334; + fma.rm.ftz.f32 %r336, %r335, %r313, %r312; + add.f32 %r337, %r336, 0fCB40007F; + neg.f32 %r338, %r337; + fma.rn.ftz.f32 %r339, %r306, %r317, %r338; + fma.rn.ftz.f32 %r340, %r306, %r319, %r339; + shl.b32 %r341, %r336, 23; + ex2.approx.ftz.f32 %r342, %r340; + mul.f32 %r343, %r342, %r341; + fma.rn.ftz.f32 %r344, %r307, %r309, %r308; + cvt.ftz.sat.f32.f32 %r345, %r344; + fma.rm.ftz.f32 %r346, %r345, %r313, %r312; + add.f32 %r347, %r346, 0fCB40007F; + neg.f32 %r348, %r347; + fma.rn.ftz.f32 %r349, %r307, %r317, %r348; + fma.rn.ftz.f32 %r350, %r307, %r319, %r349; + shl.b32 %r351, %r346, 23; + ex2.approx.ftz.f32 %r352, %r350; + mul.f32 %r353, %r352, %r351; + .loc 1 62 23 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:62:23 + div.full.f32 %r295, %r323, %r3; + div.full.f32 %r296, %r333, %r3; + div.full.f32 %r297, %r343, %r3; + div.full.f32 %r298, %r353, %r3; + .loc 1 63 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:29 + mad.wide.s32 %rd59, %r299, 4, %rd16; + .loc 1 63 53 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:53 + // begin inline asm + @%p46 st.global.v4.b32 [ %rd59 + 0 ], { %r295, %r296, %r297, %r298 }; + // end inline asm + .loc 1 52 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:40 + add.s64 %rd14, %rd67, 2048; + setp.lt.u64 %p48, %rd67, 29952; + mov.b64 %rd67, %rd14; + @%p48 bra $L__BB0_3; +// %bb.4: + .loc 1 52 4 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 122 +.b8 109 +.b8 109 +.b8 114 +.b8 108 +.b8 117 +.b8 51 +.b8 117 +.b8 54 +.b8 55 +.b8 106 +.b8 110 +.b8 51 +.b8 103 +.b8 105 +.b8 116 +.b8 102 +.b8 119 +.b8 121 +.b8 117 +.b8 51 +.b8 106 +.b8 105 +.b8 102 +.b8 51 +.b8 98 +.b8 51 +.b8 112 +.b8 119 +.b8 111 +.b8 120 +.b8 106 +.b8 51 +.b8 55 +.b8 120 +.b8 112 +.b8 52 +.b8 100 +.b8 100 +.b8 97 +.b8 99 +.b8 107 +.b8 99 +.b8 117 +.b8 102 +.b8 119 +.b8 105 +.b8 113 +.b8 114 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 99 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..269e4399f8859bcf58fb4d4b5dfbc769b683b412 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc119) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c2048_i32_13 = arith.constant 2048 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c2048_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x2048xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x2048xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x2048xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x2048x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc47) + tt.return %0 : tensor<1x2048xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x2048xf32> loc("rhs_max"(#loc48))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%lhs_max, %rhs_max) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x2048xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x2048xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x2048xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x2048xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc61) + %1 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%a: tensor<1x2048xf32> loc("a"(#loc62)), %b: tensor<1x2048xf32> loc("b"(#loc62))) -> tensor<1x2048xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x2048xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a) : (tensor<1x2048xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x2048xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x2048xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x2048xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x2048xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x2048xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc69) + tt.return %2 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %3 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc76))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc78) + tt.return %4 : tensor<1x2048xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc80) + tt.return %5 : tensor<1x2048xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%x: tensor<1x2048xf32> loc("x"(#loc81))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc82) + tt.return %0 : tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1 : tensor<1x2048xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x2048xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x2048xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%a: tensor<1x2048xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..41e049d3c2c83b741ff1e9be23f0ed70f4a980f3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/C3FCZCDEMCLSFODWXLEH5MRAQRWLOTRP4SAQURVAE7BPHZSTV2WQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,226 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("out_ptr2"(#loc)) +#loc59 = loc("xnumel"(#loc)) +#loc60 = loc("r0_numel"(#loc)) +#loc86 = loc("out_max"(#loc32)) +#loc93 = loc("out_sum"(#loc41)) +#loc118 = loc(callsite(#loc86 at #loc33)) +#loc125 = loc(callsite(#loc93 at #loc33)) +#loc133 = loc(callsite(#loc1 at #loc118)) +#loc136 = loc(callsite(#loc1 at #loc125)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc61) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc62) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc62) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc63) + %tmp0_6 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc104) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc65) + %_tmp3_sum:2 = scf.for %_tmp3_sum_14 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%arg5 = %cst_4, %arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_sum_14 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_index_15 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp0_16 = arith.addi %r0_index_15, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc64) + %tmp0_17 = tt.addptr %tmp0_7, %tmp0_16 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc69) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc70) + %mask = arith.cmpf ogt, %arg5, %tmp0_19 : tensor<1x2048xf32, #blocked> loc(#loc126) + %mask_20 = arith.cmpf une, %arg5, %arg5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc128) + %out_max_22 = arith.select %mask_21, %arg5, %tmp0_19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc129) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_4 : tensor<1x2048xf32, #blocked> loc(#loc109) + %lhs_scale_23 = arith.subf %arg5, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc110) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc130) + %lhs_scale_25 = arith.select %lhs_scale, %cst_2, %lhs_scale_24 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc112) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc113) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc131) + %rhs_scale_27 = arith.select %lhs_scale, %cst_2, %rhs_scale_26 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc115) + %out_sum_28 = arith.mulf %arg6, %lhs_scale_25 : tensor<1x2048xf32, #blocked> loc(#loc116) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32, #blocked> loc(#loc117) + %_tmp3_max = arith.select %r0_mask, %out_max_22, %arg5 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc84) + %_tmp3_sum_30 = arith.select %r0_mask, %out_sum_29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc85) + scf.yield %_tmp3_max, %_tmp3_sum_30 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc30) + } loc(#loc105) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_14: f32 loc(callsite(#loc1 at #loc118)), %out_max_15: f32 loc(callsite(#loc1 at #loc118))): + %mask = arith.cmpf ogt, %out_max_14, %out_max_15 : f32 loc(#loc137) + %mask_16 = arith.cmpf une, %out_max_14, %out_max_14 : f32 loc(#loc138) + %mask_17 = arith.ori %mask, %mask_16 : i1 loc(#loc139) + %out_max_18 = arith.select %mask_17, %out_max_14, %out_max_15 : f32 loc(#loc140) + tt.reduce.return %out_max_18 : f32 loc(#loc132) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc132) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc119) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_1 : tensor<1x1xf32, #blocked> loc(#loc120) + %delta_8 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_9 = arith.subf %_tmp3_sum#0, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_10 = tt.broadcast %delta : tensor<1x1xi1, #blocked> -> tensor<1x2048xi1, #blocked> loc(#loc122) + %delta_11 = arith.select %delta_10, %cst_3, %delta_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc122) + %out_sum = tt.extern_elementwise %delta_11 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc134) + %out_sum_12 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32, #blocked> loc(#loc124) + %out_sum_13 = "tt.reduce"(%out_sum_12) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_14: f32 loc(callsite(#loc1 at #loc125)), %out_sum_15: f32 loc(callsite(#loc1 at #loc125))): + %out_sum_16 = arith.addf %out_sum_14, %out_sum_15 : f32 loc(#loc141) + tt.reduce.return %out_sum_16 : f32 loc(#loc135) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc135) + %tmp4 = tt.expand_dims %out_sum_13 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc94) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc95) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc45) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_index_14 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst : tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp5 = arith.addi %r0_index_14, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc98) + %tmp5_15 = tt.addptr %tmp0_7, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc99) + %tmp5_16 = tt.load %tmp5_15, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc100) + %tmp5_17 = arith.extf %tmp5_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc101) + %tmp7 = arith.subf %tmp5_17, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc102) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc103) + %tmp9_18 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32, #blocked> loc(#loc95) + %1 = tt.addptr %0, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc45) + tt.store %1, %tmp9_18, %r0_mask : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + } loc(#loc46) + tt.return loc(#loc56) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc61 = loc("xoffset"(#loc2)) +#loc62 = loc("r0_base"(#loc3)) +#loc63 = loc("tmp0"(#loc4)) +#loc64 = loc("tmp0"(#loc5)) +#loc65 = loc("tmp0"(#loc6)) +#loc66 = loc("_tmp3_max"(#loc7)) +#loc67 = loc("r0_index"(#loc8)) +#loc68 = loc("r0_mask"(#loc9)) +#loc69 = loc("tmp0"(#loc10)) +#loc70 = loc("tmp0"(#loc11)) +#loc71 = loc("mask"(#loc12)) +#loc72 = loc("out_max"(#loc13)) +#loc73 = loc("mask"(#loc15)) +#loc74 = loc("mask"(#loc16)) +#loc75 = loc("lhs_scale"(#loc18)) +#loc76 = loc("lhs_scale"(#loc19)) +#loc77 = loc("lhs_scale"(#loc21)) +#loc78 = loc("lhs_scale"(#loc22)) +#loc79 = loc("rhs_scale"(#loc23)) +#loc80 = loc("rhs_scale"(#loc24)) +#loc81 = loc("rhs_scale"(#loc25)) +#loc82 = loc("out_sum"(#loc26)) +#loc83 = loc("out_sum"(#loc27)) +#loc84 = loc("_tmp3_max"(#loc28)) +#loc85 = loc("_tmp3_sum"(#loc29)) +#loc87 = loc("out_max_keepdim"(#loc34)) +#loc88 = loc("delta"(#loc35)) +#loc89 = loc("delta"(#loc36)) +#loc90 = loc("delta"(#loc37)) +#loc91 = loc("out_sum"(#loc38)) +#loc92 = loc("out_sum"(#loc39)) +#loc94 = loc("tmp4"(#loc43)) +#loc95 = loc("tmp9"(#loc44)) +#loc96 = loc("r0_index"(#loc47)) +#loc97 = loc("r0_mask"(#loc48)) +#loc98 = loc("tmp5"(#loc49)) +#loc99 = loc("tmp5"(#loc50)) +#loc100 = loc("tmp5"(#loc51)) +#loc101 = loc("tmp5"(#loc52)) +#loc102 = loc("tmp7"(#loc53)) +#loc103 = loc("tmp8"(#loc54)) +#loc104 = loc(fused[#loc64, #loc63]) +#loc105 = loc("_tmp3_sum"(#loc66)) +#loc106 = loc("mask"(#loc71)) +#loc107 = loc(callsite(#loc72 at #loc14)) +#loc108 = loc("mask"(#loc74)) +#loc109 = loc(callsite(#loc75 at #loc14)) +#loc110 = loc(callsite(#loc76 at #loc14)) +#loc111 = loc(callsite(#loc77 at #loc14)) +#loc112 = loc(callsite(#loc78 at #loc14)) +#loc113 = loc(callsite(#loc79 at #loc14)) +#loc114 = loc(callsite(#loc80 at #loc14)) +#loc115 = loc(callsite(#loc81 at #loc14)) +#loc116 = loc(callsite(#loc82 at #loc14)) +#loc117 = loc(callsite(#loc83 at #loc14)) +#loc119 = loc(callsite(#loc87 at #loc33)) +#loc120 = loc(callsite(#loc88 at #loc33)) +#loc121 = loc(callsite(#loc89 at #loc33)) +#loc122 = loc(callsite(#loc90 at #loc33)) +#loc123 = loc(callsite(#loc91 at #loc33)) +#loc124 = loc(callsite(#loc92 at #loc33)) +#loc126 = loc(callsite(#loc106 at #loc107)) +#loc127 = loc(callsite(#loc73 at #loc107)) +#loc128 = loc(callsite(#loc108 at #loc107)) +#loc129 = loc(callsite(#loc17 at #loc107)) +#loc130 = loc(callsite(#loc20 at #loc111)) +#loc131 = loc(callsite(#loc20 at #loc114)) +#loc132 = loc(callsite(#loc31 at #loc118)) +#loc134 = loc(callsite(#loc20 at #loc123)) +#loc135 = loc(callsite(#loc40 at #loc125)) +#loc137 = loc(callsite(#loc106 at #loc132)) +#loc138 = loc(callsite(#loc73 at #loc132)) +#loc139 = loc(callsite(#loc108 at #loc132)) +#loc140 = loc(callsite(#loc17 at #loc132)) +#loc141 = loc(callsite(#loc42 at #loc135)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0b27aca88e2b281adfb4b4192e01534a0cd21159 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ca6a25aca8c11d55d1a5f860385f5d8bec230156 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "101205f0dc80840d38f351cb0f5cd251028b5ffd2cf2374f80cb0d1b22e1461e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..1eb6f91ee5846837955ec6e710b9fbac3d186bc6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,5386 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = and i32 %8, 511, !dbg !9 + %10 = shl nuw nsw i32 %9, 3, !dbg !9 + %11 = or disjoint i32 %10, 4096, !dbg !9 + %12 = or disjoint i32 %10, 8192, !dbg !9 + %13 = or disjoint i32 %10, 12288, !dbg !9 + %14 = mul i32 %7, 32000, !dbg !10 + %15 = zext nneg i32 %13 to i64, !dbg !11 + br label %16, !dbg !11 + +16: ; preds = %6, %16 + %17 = phi i1 [ true, %6 ], [ false, %16 ] + %indvars.iv = phi i64 [ 0, %6 ], [ 16384, %16 ] + %18 = phi float [ 0.000000e+00, %6 ], [ %1512, %16 ] + %19 = phi float [ 0.000000e+00, %6 ], [ %1513, %16 ] + %20 = phi float [ 0.000000e+00, %6 ], [ %1514, %16 ] + %21 = phi float [ 0.000000e+00, %6 ], [ %1515, %16 ] + %22 = phi float [ 0.000000e+00, %6 ], [ %1516, %16 ] + %23 = phi float [ 0.000000e+00, %6 ], [ %1517, %16 ] + %24 = phi float [ 0.000000e+00, %6 ], [ %1518, %16 ] + %25 = phi float [ 0.000000e+00, %6 ], [ %1519, %16 ] + %26 = phi float [ 0.000000e+00, %6 ], [ %1820, %16 ] + %27 = phi float [ 0.000000e+00, %6 ], [ %1821, %16 ] + %28 = phi float [ 0xFFF0000000000000, %6 ], [ %1813, %16 ] + %29 = phi float [ 0xFFF0000000000000, %6 ], [ %1814, %16 ] + %30 = phi <2 x float> [ zeroinitializer, %6 ], [ %1824, %16 ] + %31 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1819, %16 ] + %32 = phi <2 x float> [ zeroinitializer, %6 ], [ %1823, %16 ] + %33 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1818, %16 ] + %34 = phi <2 x float> [ zeroinitializer, %6 ], [ %1822, %16 ] + %35 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1817, %16 ] + %36 = phi <16 x float> [ zeroinitializer, %6 ], [ %1511, %16 ] + %37 = phi <16 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %876, %16 ] + %38 = phi <8 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %127, %16 ] + %39 = or disjoint i64 %indvars.iv, %15, !dbg !12 + %40 = icmp samesign ult i64 %39, 32000, !dbg !13 + %41 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %42 = or disjoint i32 %10, %41, !dbg !14 + %43 = add i32 %42, %14, !dbg !14 + %44 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %45 = or disjoint i32 %11, %44, !dbg !14 + %46 = add i32 %45, %14, !dbg !14 + %47 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %48 = or disjoint i32 %12, %47, !dbg !14 + %49 = add i32 %48, %14, !dbg !14 + %50 = trunc nuw nsw i64 %39 to i32, !dbg !14 + %51 = add i32 %14, %50, !dbg !14 + %52 = sext i32 %43 to i64, !dbg !15 + %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !15 + %54 = sext i32 %46 to i64, !dbg !15 + %55 = getelementptr bfloat, ptr addrspace(1) %0, i64 %54, !dbg !15 + %56 = sext i32 %49 to i64, !dbg !15 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !15 + %58 = sext i32 %51 to i64, !dbg !15 + %59 = getelementptr bfloat, ptr addrspace(1) %0, i64 %58, !dbg !15 + %60 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %61 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %60, i1 true) #6, !dbg !16 + %62 = extractvalue { i32, i32, i32, i32 } %61, 0, !dbg !16 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !16 + %64 = extractvalue { i32, i32, i32, i32 } %61, 1, !dbg !16 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !16 + %66 = extractvalue { i32, i32, i32, i32 } %61, 2, !dbg !16 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !16 + %68 = extractvalue { i32, i32, i32, i32 } %61, 3, !dbg !16 + %69 = bitcast i32 %68 to <2 x bfloat>, !dbg !16 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %55, i64 %70, i1 true) #6, !dbg !16 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !16 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !16 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !16 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !16 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !16 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !16 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !16 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !16 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %81 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %80, i1 true) #6, !dbg !16 + %82 = extractvalue { i32, i32, i32, i32 } %81, 0, !dbg !16 + %83 = bitcast i32 %82 to <2 x bfloat>, !dbg !16 + %84 = extractvalue { i32, i32, i32, i32 } %81, 1, !dbg !16 + %85 = bitcast i32 %84 to <2 x bfloat>, !dbg !16 + %86 = extractvalue { i32, i32, i32, i32 } %81, 2, !dbg !16 + %87 = bitcast i32 %86 to <2 x bfloat>, !dbg !16 + %88 = extractvalue { i32, i32, i32, i32 } %81, 3, !dbg !16 + %89 = bitcast i32 %88 to <2 x bfloat>, !dbg !16 + %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %91 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %59, i64 %90, i1 %40) #6, !dbg !16 + %92 = extractvalue { i32, i32, i32, i32 } %91, 0, !dbg !16 + %93 = bitcast i32 %92 to <2 x bfloat>, !dbg !16 + %94 = extractvalue { i32, i32, i32, i32 } %91, 1, !dbg !16 + %95 = bitcast i32 %94 to <2 x bfloat>, !dbg !16 + %96 = extractvalue { i32, i32, i32, i32 } %91, 2, !dbg !16 + %97 = bitcast i32 %96 to <2 x bfloat>, !dbg !16 + %98 = extractvalue { i32, i32, i32, i32 } %91, 3, !dbg !16 + %99 = bitcast i32 %98 to <2 x bfloat>, !dbg !16 + %100 = extractelement <2 x bfloat> %93, i64 0, !dbg !16 + %101 = extractelement <2 x bfloat> %93, i64 1, !dbg !16 + %102 = fpext bfloat %100 to float, !dbg !17 + %103 = fpext bfloat %101 to float, !dbg !17 + %104 = fcmp ogt float %28, %102, !dbg !18 + %105 = fcmp ogt float %29, %103, !dbg !18 + %106 = fcmp uno <16 x float> %37, zeroinitializer, !dbg !22 + %107 = fcmp uno <8 x float> %38, zeroinitializer, !dbg !22 + %108 = fcmp uno float %28, 0.000000e+00, !dbg !22 + %109 = fcmp uno float %29, 0.000000e+00, !dbg !22 + %110 = fcmp uno <2 x float> %35, zeroinitializer, !dbg !22 + %111 = fcmp uno <2 x float> %33, zeroinitializer, !dbg !22 + %112 = fcmp uno <2 x float> %31, zeroinitializer, !dbg !22 + %113 = or i1 %108, %104, !dbg !23 + %114 = or i1 %109, %105, !dbg !23 + %115 = shufflevector <2 x bfloat> %83, <2 x bfloat> %85, <8 x i32> , !dbg !17 + %116 = shufflevector <2 x bfloat> %87, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %117 = shufflevector <8 x bfloat> %115, <8 x bfloat> %116, <8 x i32> , !dbg !17 + %118 = shufflevector <2 x bfloat> %87, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %119 = shufflevector <8 x bfloat> %117, <8 x bfloat> %118, <8 x i32> , !dbg !17 + %120 = shufflevector <2 x bfloat> %89, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %121 = shufflevector <8 x bfloat> %119, <8 x bfloat> %120, <8 x i32> , !dbg !17 + %122 = shufflevector <2 x bfloat> %89, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %123 = shufflevector <8 x bfloat> %121, <8 x bfloat> %122, <8 x i32> , !dbg !17 + %124 = fpext <8 x bfloat> %123 to <8 x float>, !dbg !17 + %125 = fcmp ogt <8 x float> %38, %124, !dbg !18 + %126 = or <8 x i1> %107, %125, !dbg !23 + %127 = select <8 x i1> %126, <8 x float> %38, <8 x float> %124, !dbg !24 + %128 = select i1 %113, float %28, float %102, !dbg !24 + %129 = select i1 %114, float %29, float %103, !dbg !24 + %130 = extractelement <8 x float> %127, i64 0, !dbg !25 + %131 = fcmp oeq float %130, 0xFFF0000000000000, !dbg !26 + %132 = extractelement <8 x float> %127, i64 1, !dbg !25 + %133 = fcmp oeq float %132, 0xFFF0000000000000, !dbg !26 + %134 = extractelement <8 x float> %127, i64 2, !dbg !25 + %135 = fcmp oeq float %134, 0xFFF0000000000000, !dbg !26 + %136 = extractelement <8 x float> %127, i64 3, !dbg !25 + %137 = fcmp oeq float %136, 0xFFF0000000000000, !dbg !26 + %138 = extractelement <8 x float> %127, i64 4, !dbg !25 + %139 = fcmp oeq float %138, 0xFFF0000000000000, !dbg !26 + %140 = extractelement <8 x float> %127, i64 5, !dbg !25 + %141 = fcmp oeq float %140, 0xFFF0000000000000, !dbg !26 + %142 = extractelement <8 x float> %127, i64 6, !dbg !25 + %143 = fcmp oeq float %142, 0xFFF0000000000000, !dbg !26 + %144 = extractelement <8 x float> %127, i64 7, !dbg !25 + %145 = fcmp oeq float %144, 0xFFF0000000000000, !dbg !26 + %146 = fcmp oeq float %128, 0xFFF0000000000000, !dbg !26 + %147 = fcmp oeq float %129, 0xFFF0000000000000, !dbg !26 + %foldExtExtBinop = fsub <8 x float> %38, %127, !dbg !27 + %148 = extractelement <8 x float> %foldExtExtBinop, i64 0, !dbg !27 + %foldExtExtBinop1527 = fsub <8 x float> %38, %127, !dbg !27 + %149 = extractelement <8 x float> %foldExtExtBinop1527, i64 1, !dbg !27 + %foldExtExtBinop1529 = fsub <8 x float> %38, %127, !dbg !27 + %150 = extractelement <8 x float> %foldExtExtBinop1529, i64 2, !dbg !27 + %foldExtExtBinop1531 = fsub <8 x float> %38, %127, !dbg !27 + %151 = extractelement <8 x float> %foldExtExtBinop1531, i64 3, !dbg !27 + %foldExtExtBinop1533 = fsub <8 x float> %38, %127, !dbg !27 + %152 = extractelement <8 x float> %foldExtExtBinop1533, i64 4, !dbg !27 + %foldExtExtBinop1535 = fsub <8 x float> %38, %127, !dbg !27 + %153 = extractelement <8 x float> %foldExtExtBinop1535, i64 5, !dbg !27 + %foldExtExtBinop1537 = fsub <8 x float> %38, %127, !dbg !27 + %154 = extractelement <8 x float> %foldExtExtBinop1537, i64 6, !dbg !27 + %foldExtExtBinop1539 = fsub <8 x float> %38, %127, !dbg !27 + %155 = extractelement <8 x float> %foldExtExtBinop1539, i64 7, !dbg !27 + %156 = fsub float %28, %128, !dbg !27 + %157 = fsub float %29, %129, !dbg !27 + %158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i694 = icmp eq i32 %158, 0, !dbg !28 + %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i696 = icmp eq i32 %159, 0, !dbg !28 + %160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i700 = icmp eq i32 %161, 0, !dbg !28 + %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i702 = icmp eq i32 %162, 0, !dbg !28 + %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i704 = icmp eq i32 %163, 0, !dbg !28 + %164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i706 = icmp eq i32 %164, 0, !dbg !28 + %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i710 = icmp eq i32 %166, 0, !dbg !28 + %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i712 = icmp eq i32 %167, 0, !dbg !28 + %168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i714 = icmp eq i32 %168, 0, !dbg !28 + %169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i716 = icmp eq i32 %169, 0, !dbg !28 + %170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i720 = icmp eq i32 %171, 0, !dbg !28 + %172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i722 = icmp eq i32 %172, 0, !dbg !28 + %173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i724 = icmp eq i32 %173, 0, !dbg !28 + %174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i726 = icmp eq i32 %174, 0, !dbg !28 + %175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i730 = icmp eq i32 %176, 0, !dbg !28 + %177 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i732 = icmp eq i32 %177, 0, !dbg !28 + %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i734 = icmp eq i32 %178, 0, !dbg !28 + %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i736 = icmp eq i32 %179, 0, !dbg !28 + %180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %181 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i740 = icmp eq i32 %181, 0, !dbg !28 + %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i742 = icmp eq i32 %182, 0, !dbg !28 + %183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i744 = icmp eq i32 %183, 0, !dbg !28 + %184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i746 = icmp eq i32 %184, 0, !dbg !28 + %185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i750 = icmp eq i32 %186, 0, !dbg !28 + %187 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i752 = icmp eq i32 %187, 0, !dbg !28 + %188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i754 = icmp eq i32 %188, 0, !dbg !28 + %189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i756 = icmp eq i32 %189, 0, !dbg !28 + %190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i760 = icmp eq i32 %191, 0, !dbg !28 + %192 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i762 = icmp eq i32 %192, 0, !dbg !28 + %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i764 = icmp eq i32 %193, 0, !dbg !28 + %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i766 = icmp eq i32 %194, 0, !dbg !28 + %195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i770 = icmp eq i32 %196, 0, !dbg !28 + %197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i772 = icmp eq i32 %197, 0, !dbg !28 + %198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i774 = icmp eq i32 %198, 0, !dbg !28 + %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i776 = icmp eq i32 %199, 0, !dbg !28 + %200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i780 = icmp eq i32 %201, 0, !dbg !28 + %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i782 = icmp eq i32 %202, 0, !dbg !28 + %203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i784 = icmp eq i32 %203, 0, !dbg !28 + %204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i786 = icmp eq i32 %204, 0, !dbg !28 + %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i790 = icmp eq i32 %206, 0, !dbg !28 + %207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i792 = icmp eq i32 %207, 0, !dbg !28 + %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i794 = icmp eq i32 %208, 0, !dbg !28 + %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i796 = icmp eq i32 %209, 0, !dbg !28 + %210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i800 = icmp eq i32 %211, 0, !dbg !28 + %212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i802 = icmp eq i32 %212, 0, !dbg !28 + %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i804 = icmp eq i32 %213, 0, !dbg !28 + %214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i806 = icmp eq i32 %214, 0, !dbg !28 + %215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i810 = icmp eq i32 %216, 0, !dbg !28 + %217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i812 = icmp eq i32 %217, 0, !dbg !28 + %218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i814 = icmp eq i32 %218, 0, !dbg !28 + %219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i816 = icmp eq i32 %219, 0, !dbg !28 + %220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i820 = icmp eq i32 %221, 0, !dbg !28 + %222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i822 = icmp eq i32 %222, 0, !dbg !28 + %223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i824 = icmp eq i32 %223, 0, !dbg !28 + %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i826 = icmp eq i32 %224, 0, !dbg !28 + %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i830 = icmp eq i32 %226, 0, !dbg !28 + %227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i832 = icmp eq i32 %227, 0, !dbg !28 + %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i834 = icmp eq i32 %228, 0, !dbg !28 + %229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i836 = icmp eq i32 %229, 0, !dbg !28 + %230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i840 = icmp eq i32 %231, 0, !dbg !28 + %232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i842 = icmp eq i32 %232, 0, !dbg !28 + %233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i844 = icmp eq i32 %233, 0, !dbg !28 + %234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i846 = icmp eq i32 %234, 0, !dbg !28 + %235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i850 = icmp eq i32 %236, 0, !dbg !28 + %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i852 = icmp eq i32 %237, 0, !dbg !28 + %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i854 = icmp eq i32 %238, 0, !dbg !28 + %239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %240 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i855 = select i1 %.not.i854, float %240, float %239, !dbg !28 + %241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i856 = icmp eq i32 %241, 0, !dbg !28 + %242 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i855) #6, !dbg !28 + %243 = tail call float @llvm.nvvm.saturate.f(float %.02.i855) #6, !dbg !28 + %.03.i857 = select i1 %.not1.i856, float %243, float %242, !dbg !28 + %244 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i858 = icmp eq i32 %244, 0, !dbg !28 + %245 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i857, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %246 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i857, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i859 = select i1 %.not2.i858, float %246, float %245, !dbg !28 + %247 = fadd float %.04.i859, 0xC168000FE0000000, !dbg !28 + %248 = fneg float %247, !dbg !28 + %249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i860 = icmp eq i32 %249, 0, !dbg !28 + %250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3FF7154760000000, float %248) #6, !dbg !28 + %251 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3FF7154760000000, float %248) #6, !dbg !28 + %.0.i861 = select i1 %.not3.i860, float %251, float %250, !dbg !28 + %252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i862 = icmp eq i32 %252, 0, !dbg !28 + %253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3E54AE0C00000000, float %.0.i861) #6, !dbg !28 + %254 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3E54AE0C00000000, float %.0.i861) #6, !dbg !28 + %.01.i863 = select i1 %.not4.i862, float %254, float %253, !dbg !28 + %255 = bitcast float %.04.i859 to i32, !dbg !28 + %256 = shl i32 %255, 23, !dbg !28 + %257 = bitcast i32 %256 to float, !dbg !28 + %258 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i863) #6, !dbg !28 + %259 = fmul float %258, %257, !dbg !28 + %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i864 = icmp eq i32 %260, 0, !dbg !28 + %261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %262 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i865 = select i1 %.not.i864, float %262, float %261, !dbg !28 + %263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i866 = icmp eq i32 %263, 0, !dbg !28 + %264 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i865) #6, !dbg !28 + %265 = tail call float @llvm.nvvm.saturate.f(float %.02.i865) #6, !dbg !28 + %.03.i867 = select i1 %.not1.i866, float %265, float %264, !dbg !28 + %266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i868 = icmp eq i32 %266, 0, !dbg !28 + %267 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i867, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %268 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i867, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i869 = select i1 %.not2.i868, float %268, float %267, !dbg !28 + %269 = fadd float %.04.i869, 0xC168000FE0000000, !dbg !28 + %270 = fneg float %269, !dbg !28 + %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i870 = icmp eq i32 %271, 0, !dbg !28 + %272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3FF7154760000000, float %270) #6, !dbg !28 + %273 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3FF7154760000000, float %270) #6, !dbg !28 + %.0.i871 = select i1 %.not3.i870, float %273, float %272, !dbg !28 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i872 = icmp eq i32 %274, 0, !dbg !28 + %275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3E54AE0C00000000, float %.0.i871) #6, !dbg !28 + %276 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3E54AE0C00000000, float %.0.i871) #6, !dbg !28 + %.01.i873 = select i1 %.not4.i872, float %276, float %275, !dbg !28 + %277 = bitcast float %.04.i869 to i32, !dbg !28 + %278 = shl i32 %277, 23, !dbg !28 + %279 = bitcast i32 %278 to float, !dbg !28 + %280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i873) #6, !dbg !28 + %281 = fmul float %280, %279, !dbg !28 + %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i874 = icmp eq i32 %282, 0, !dbg !28 + %283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %284 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i875 = select i1 %.not.i874, float %284, float %283, !dbg !28 + %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i876 = icmp eq i32 %285, 0, !dbg !28 + %286 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i875) #6, !dbg !28 + %287 = tail call float @llvm.nvvm.saturate.f(float %.02.i875) #6, !dbg !28 + %.03.i877 = select i1 %.not1.i876, float %287, float %286, !dbg !28 + %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i878 = icmp eq i32 %288, 0, !dbg !28 + %289 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i877, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %290 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i877, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i879 = select i1 %.not2.i878, float %290, float %289, !dbg !28 + %291 = fadd float %.04.i879, 0xC168000FE0000000, !dbg !28 + %292 = fneg float %291, !dbg !28 + %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i880 = icmp eq i32 %293, 0, !dbg !28 + %294 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3FF7154760000000, float %292) #6, !dbg !28 + %295 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3FF7154760000000, float %292) #6, !dbg !28 + %.0.i881 = select i1 %.not3.i880, float %295, float %294, !dbg !28 + %296 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i882 = icmp eq i32 %296, 0, !dbg !28 + %297 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3E54AE0C00000000, float %.0.i881) #6, !dbg !28 + %298 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3E54AE0C00000000, float %.0.i881) #6, !dbg !28 + %.01.i883 = select i1 %.not4.i882, float %298, float %297, !dbg !28 + %299 = bitcast float %.04.i879 to i32, !dbg !28 + %300 = shl i32 %299, 23, !dbg !28 + %301 = bitcast i32 %300 to float, !dbg !28 + %302 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i883) #6, !dbg !28 + %303 = fmul float %302, %301, !dbg !28 + %304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i884 = icmp eq i32 %304, 0, !dbg !28 + %305 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %306 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i885 = select i1 %.not.i884, float %306, float %305, !dbg !28 + %307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i886 = icmp eq i32 %307, 0, !dbg !28 + %308 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i885) #6, !dbg !28 + %309 = tail call float @llvm.nvvm.saturate.f(float %.02.i885) #6, !dbg !28 + %.03.i887 = select i1 %.not1.i886, float %309, float %308, !dbg !28 + %310 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i888 = icmp eq i32 %310, 0, !dbg !28 + %311 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i887, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %312 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i887, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i889 = select i1 %.not2.i888, float %312, float %311, !dbg !28 + %313 = fadd float %.04.i889, 0xC168000FE0000000, !dbg !28 + %314 = fneg float %313, !dbg !28 + %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i890 = icmp eq i32 %315, 0, !dbg !28 + %316 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3FF7154760000000, float %314) #6, !dbg !28 + %317 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3FF7154760000000, float %314) #6, !dbg !28 + %.0.i891 = select i1 %.not3.i890, float %317, float %316, !dbg !28 + %318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i892 = icmp eq i32 %318, 0, !dbg !28 + %319 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3E54AE0C00000000, float %.0.i891) #6, !dbg !28 + %320 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3E54AE0C00000000, float %.0.i891) #6, !dbg !28 + %.01.i893 = select i1 %.not4.i892, float %320, float %319, !dbg !28 + %321 = bitcast float %.04.i889 to i32, !dbg !28 + %322 = shl i32 %321, 23, !dbg !28 + %323 = bitcast i32 %322 to float, !dbg !28 + %324 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i893) #6, !dbg !28 + %325 = fmul float %324, %323, !dbg !28 + %326 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i894 = icmp eq i32 %326, 0, !dbg !28 + %327 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %328 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i895 = select i1 %.not.i894, float %328, float %327, !dbg !28 + %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i896 = icmp eq i32 %329, 0, !dbg !28 + %330 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i895) #6, !dbg !28 + %331 = tail call float @llvm.nvvm.saturate.f(float %.02.i895) #6, !dbg !28 + %.03.i897 = select i1 %.not1.i896, float %331, float %330, !dbg !28 + %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i898 = icmp eq i32 %332, 0, !dbg !28 + %333 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i897, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %334 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i897, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i899 = select i1 %.not2.i898, float %334, float %333, !dbg !28 + %335 = fadd float %.04.i899, 0xC168000FE0000000, !dbg !28 + %336 = fneg float %335, !dbg !28 + %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i900 = icmp eq i32 %337, 0, !dbg !28 + %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3FF7154760000000, float %336) #6, !dbg !28 + %339 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3FF7154760000000, float %336) #6, !dbg !28 + %.0.i901 = select i1 %.not3.i900, float %339, float %338, !dbg !28 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i902 = icmp eq i32 %340, 0, !dbg !28 + %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3E54AE0C00000000, float %.0.i901) #6, !dbg !28 + %342 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3E54AE0C00000000, float %.0.i901) #6, !dbg !28 + %.01.i903 = select i1 %.not4.i902, float %342, float %341, !dbg !28 + %343 = bitcast float %.04.i899 to i32, !dbg !28 + %344 = shl i32 %343, 23, !dbg !28 + %345 = bitcast i32 %344 to float, !dbg !28 + %346 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i903) #6, !dbg !28 + %347 = fmul float %346, %345, !dbg !28 + %348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i904 = icmp eq i32 %348, 0, !dbg !28 + %349 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %350 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i905 = select i1 %.not.i904, float %350, float %349, !dbg !28 + %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i906 = icmp eq i32 %351, 0, !dbg !28 + %352 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i905) #6, !dbg !28 + %353 = tail call float @llvm.nvvm.saturate.f(float %.02.i905) #6, !dbg !28 + %.03.i907 = select i1 %.not1.i906, float %353, float %352, !dbg !28 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i908 = icmp eq i32 %354, 0, !dbg !28 + %355 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i907, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %356 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i907, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i909 = select i1 %.not2.i908, float %356, float %355, !dbg !28 + %357 = fadd float %.04.i909, 0xC168000FE0000000, !dbg !28 + %358 = fneg float %357, !dbg !28 + %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i910 = icmp eq i32 %359, 0, !dbg !28 + %360 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3FF7154760000000, float %358) #6, !dbg !28 + %361 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3FF7154760000000, float %358) #6, !dbg !28 + %.0.i911 = select i1 %.not3.i910, float %361, float %360, !dbg !28 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i912 = icmp eq i32 %362, 0, !dbg !28 + %363 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3E54AE0C00000000, float %.0.i911) #6, !dbg !28 + %364 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3E54AE0C00000000, float %.0.i911) #6, !dbg !28 + %.01.i913 = select i1 %.not4.i912, float %364, float %363, !dbg !28 + %365 = bitcast float %.04.i909 to i32, !dbg !28 + %366 = shl i32 %365, 23, !dbg !28 + %367 = bitcast i32 %366 to float, !dbg !28 + %368 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i913) #6, !dbg !28 + %369 = fmul float %368, %367, !dbg !28 + %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i914 = icmp eq i32 %370, 0, !dbg !28 + %371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %372 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i915 = select i1 %.not.i914, float %372, float %371, !dbg !28 + %373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i916 = icmp eq i32 %373, 0, !dbg !28 + %374 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i915) #6, !dbg !28 + %375 = tail call float @llvm.nvvm.saturate.f(float %.02.i915) #6, !dbg !28 + %.03.i917 = select i1 %.not1.i916, float %375, float %374, !dbg !28 + %376 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i918 = icmp eq i32 %376, 0, !dbg !28 + %377 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i917, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %378 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i917, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i919 = select i1 %.not2.i918, float %378, float %377, !dbg !28 + %379 = fadd float %.04.i919, 0xC168000FE0000000, !dbg !28 + %380 = fneg float %379, !dbg !28 + %381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i920 = icmp eq i32 %381, 0, !dbg !28 + %382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3FF7154760000000, float %380) #6, !dbg !28 + %383 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3FF7154760000000, float %380) #6, !dbg !28 + %.0.i921 = select i1 %.not3.i920, float %383, float %382, !dbg !28 + %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i922 = icmp eq i32 %384, 0, !dbg !28 + %385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3E54AE0C00000000, float %.0.i921) #6, !dbg !28 + %386 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3E54AE0C00000000, float %.0.i921) #6, !dbg !28 + %.01.i923 = select i1 %.not4.i922, float %386, float %385, !dbg !28 + %387 = bitcast float %.04.i919 to i32, !dbg !28 + %388 = shl i32 %387, 23, !dbg !28 + %389 = bitcast i32 %388 to float, !dbg !28 + %390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i923) #6, !dbg !28 + %391 = fmul float %390, %389, !dbg !28 + %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i924 = icmp eq i32 %392, 0, !dbg !28 + %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %394 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i925 = select i1 %.not.i924, float %394, float %393, !dbg !28 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i926 = icmp eq i32 %395, 0, !dbg !28 + %396 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i925) #6, !dbg !28 + %397 = tail call float @llvm.nvvm.saturate.f(float %.02.i925) #6, !dbg !28 + %.03.i927 = select i1 %.not1.i926, float %397, float %396, !dbg !28 + %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i928 = icmp eq i32 %398, 0, !dbg !28 + %399 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i927, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %400 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i927, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i929 = select i1 %.not2.i928, float %400, float %399, !dbg !28 + %401 = fadd float %.04.i929, 0xC168000FE0000000, !dbg !28 + %402 = fneg float %401, !dbg !28 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i930 = icmp eq i32 %403, 0, !dbg !28 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3FF7154760000000, float %402) #6, !dbg !28 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3FF7154760000000, float %402) #6, !dbg !28 + %.0.i931 = select i1 %.not3.i930, float %405, float %404, !dbg !28 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i932 = icmp eq i32 %406, 0, !dbg !28 + %407 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3E54AE0C00000000, float %.0.i931) #6, !dbg !28 + %408 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3E54AE0C00000000, float %.0.i931) #6, !dbg !28 + %.01.i933 = select i1 %.not4.i932, float %408, float %407, !dbg !28 + %409 = bitcast float %.04.i929 to i32, !dbg !28 + %410 = shl i32 %409, 23, !dbg !28 + %411 = bitcast i32 %410 to float, !dbg !28 + %412 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i933) #6, !dbg !28 + %413 = fmul float %412, %411, !dbg !28 + %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i934 = icmp eq i32 %414, 0, !dbg !28 + %415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %416 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i935 = select i1 %.not.i934, float %416, float %415, !dbg !28 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i936 = icmp eq i32 %417, 0, !dbg !28 + %418 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i935) #6, !dbg !28 + %419 = tail call float @llvm.nvvm.saturate.f(float %.02.i935) #6, !dbg !28 + %.03.i937 = select i1 %.not1.i936, float %419, float %418, !dbg !28 + %420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i938 = icmp eq i32 %420, 0, !dbg !28 + %421 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i937, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %422 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i937, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i939 = select i1 %.not2.i938, float %422, float %421, !dbg !28 + %423 = fadd float %.04.i939, 0xC168000FE0000000, !dbg !28 + %424 = fneg float %423, !dbg !28 + %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i940 = icmp eq i32 %425, 0, !dbg !28 + %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3FF7154760000000, float %424) #6, !dbg !28 + %427 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3FF7154760000000, float %424) #6, !dbg !28 + %.0.i941 = select i1 %.not3.i940, float %427, float %426, !dbg !28 + %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i942 = icmp eq i32 %428, 0, !dbg !28 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3E54AE0C00000000, float %.0.i941) #6, !dbg !28 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3E54AE0C00000000, float %.0.i941) #6, !dbg !28 + %.01.i943 = select i1 %.not4.i942, float %430, float %429, !dbg !28 + %431 = bitcast float %.04.i939 to i32, !dbg !28 + %432 = shl i32 %431, 23, !dbg !28 + %433 = bitcast i32 %432 to float, !dbg !28 + %434 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i943) #6, !dbg !28 + %435 = fmul float %434, %433, !dbg !28 + %436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i944 = icmp eq i32 %436, 0, !dbg !28 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i945 = select i1 %.not.i944, float %438, float %437, !dbg !28 + %439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i946 = icmp eq i32 %439, 0, !dbg !28 + %440 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i945) #6, !dbg !28 + %441 = tail call float @llvm.nvvm.saturate.f(float %.02.i945) #6, !dbg !28 + %.03.i947 = select i1 %.not1.i946, float %441, float %440, !dbg !28 + %442 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i948 = icmp eq i32 %442, 0, !dbg !28 + %443 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i947, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %444 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i947, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i949 = select i1 %.not2.i948, float %444, float %443, !dbg !28 + %445 = fadd float %.04.i949, 0xC168000FE0000000, !dbg !28 + %446 = fneg float %445, !dbg !28 + %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i950 = icmp eq i32 %447, 0, !dbg !28 + %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3FF7154760000000, float %446) #6, !dbg !28 + %449 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3FF7154760000000, float %446) #6, !dbg !28 + %.0.i951 = select i1 %.not3.i950, float %449, float %448, !dbg !28 + %450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i952 = icmp eq i32 %450, 0, !dbg !28 + %451 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3E54AE0C00000000, float %.0.i951) #6, !dbg !28 + %452 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3E54AE0C00000000, float %.0.i951) #6, !dbg !28 + %.01.i953 = select i1 %.not4.i952, float %452, float %451, !dbg !28 + %453 = bitcast float %.04.i949 to i32, !dbg !28 + %454 = shl i32 %453, 23, !dbg !28 + %455 = bitcast i32 %454 to float, !dbg !28 + %456 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i953) #6, !dbg !28 + %457 = fmul float %456, %455, !dbg !28 + %458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i954 = icmp eq i32 %458, 0, !dbg !28 + %459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i956 = icmp eq i32 %459, 0, !dbg !28 + %460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i960 = icmp eq i32 %461, 0, !dbg !28 + %462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i962 = icmp eq i32 %462, 0, !dbg !28 + %463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i964 = icmp eq i32 %463, 0, !dbg !28 + %464 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i966 = icmp eq i32 %464, 0, !dbg !28 + %465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %466 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i970 = icmp eq i32 %466, 0, !dbg !28 + %467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i972 = icmp eq i32 %467, 0, !dbg !28 + %468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i974 = icmp eq i32 %468, 0, !dbg !28 + %469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i976 = icmp eq i32 %469, 0, !dbg !28 + %470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i980 = icmp eq i32 %471, 0, !dbg !28 + %472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i982 = icmp eq i32 %472, 0, !dbg !28 + %473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i984 = icmp eq i32 %473, 0, !dbg !28 + %474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i986 = icmp eq i32 %474, 0, !dbg !28 + %475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i990 = icmp eq i32 %476, 0, !dbg !28 + %477 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i992 = icmp eq i32 %477, 0, !dbg !28 + %478 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i994 = icmp eq i32 %478, 0, !dbg !28 + %479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i996 = icmp eq i32 %479, 0, !dbg !28 + %480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %481 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1000 = icmp eq i32 %481, 0, !dbg !28 + %482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1002 = icmp eq i32 %482, 0, !dbg !28 + %483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1004 = icmp eq i32 %483, 0, !dbg !28 + %484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1006 = icmp eq i32 %484, 0, !dbg !28 + %485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %486 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1010 = icmp eq i32 %486, 0, !dbg !28 + %487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1012 = icmp eq i32 %487, 0, !dbg !28 + %488 = select i1 %131, float 1.000000e+00, float %259, !dbg !29 + %489 = select i1 %133, float 1.000000e+00, float %281, !dbg !29 + %490 = select i1 %135, float 1.000000e+00, float %303, !dbg !29 + %491 = select i1 %137, float 1.000000e+00, float %325, !dbg !29 + %492 = select i1 %139, float 1.000000e+00, float %347, !dbg !29 + %493 = select i1 %141, float 1.000000e+00, float %369, !dbg !29 + %494 = select i1 %143, float 1.000000e+00, float %391, !dbg !29 + %495 = select i1 %145, float 1.000000e+00, float %413, !dbg !29 + %496 = select i1 %146, float 1.000000e+00, float %435, !dbg !29 + %497 = select i1 %147, float 1.000000e+00, float %457, !dbg !29 + %foldExtExtBinop1541 = fsub <8 x float> %124, %127, !dbg !25 + %498 = extractelement <8 x float> %foldExtExtBinop1541, i64 0, !dbg !25 + %foldExtExtBinop1543 = fsub <8 x float> %124, %127, !dbg !25 + %499 = extractelement <8 x float> %foldExtExtBinop1543, i64 1, !dbg !25 + %foldExtExtBinop1545 = fsub <8 x float> %124, %127, !dbg !25 + %500 = extractelement <8 x float> %foldExtExtBinop1545, i64 2, !dbg !25 + %foldExtExtBinop1547 = fsub <8 x float> %124, %127, !dbg !25 + %501 = extractelement <8 x float> %foldExtExtBinop1547, i64 3, !dbg !25 + %foldExtExtBinop1549 = fsub <8 x float> %124, %127, !dbg !25 + %502 = extractelement <8 x float> %foldExtExtBinop1549, i64 4, !dbg !25 + %foldExtExtBinop1551 = fsub <8 x float> %124, %127, !dbg !25 + %503 = extractelement <8 x float> %foldExtExtBinop1551, i64 5, !dbg !25 + %foldExtExtBinop1553 = fsub <8 x float> %124, %127, !dbg !25 + %504 = extractelement <8 x float> %foldExtExtBinop1553, i64 6, !dbg !25 + %foldExtExtBinop1555 = fsub <8 x float> %124, %127, !dbg !25 + %505 = extractelement <8 x float> %foldExtExtBinop1555, i64 7, !dbg !25 + %506 = fsub float %102, %128, !dbg !25 + %507 = fsub float %103, %129, !dbg !25 + %508 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1014 = icmp eq i32 %508, 0, !dbg !28 + %509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1016 = icmp eq i32 %509, 0, !dbg !28 + %510 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %511 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1020 = icmp eq i32 %511, 0, !dbg !28 + %512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1022 = icmp eq i32 %512, 0, !dbg !28 + %513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1024 = icmp eq i32 %513, 0, !dbg !28 + %514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1026 = icmp eq i32 %514, 0, !dbg !28 + %515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1030 = icmp eq i32 %516, 0, !dbg !28 + %517 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1032 = icmp eq i32 %517, 0, !dbg !28 + %518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1034 = icmp eq i32 %518, 0, !dbg !28 + %519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1036 = icmp eq i32 %519, 0, !dbg !28 + %520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1040 = icmp eq i32 %521, 0, !dbg !28 + %522 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1042 = icmp eq i32 %522, 0, !dbg !28 + %523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1044 = icmp eq i32 %523, 0, !dbg !28 + %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1046 = icmp eq i32 %524, 0, !dbg !28 + %525 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1050 = icmp eq i32 %526, 0, !dbg !28 + %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1052 = icmp eq i32 %527, 0, !dbg !28 + %528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1054 = icmp eq i32 %528, 0, !dbg !28 + %529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1056 = icmp eq i32 %529, 0, !dbg !28 + %530 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1060 = icmp eq i32 %531, 0, !dbg !28 + %532 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1062 = icmp eq i32 %532, 0, !dbg !28 + %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1064 = icmp eq i32 %533, 0, !dbg !28 + %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1066 = icmp eq i32 %534, 0, !dbg !28 + %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1070 = icmp eq i32 %536, 0, !dbg !28 + %537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1072 = icmp eq i32 %537, 0, !dbg !28 + %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1074 = icmp eq i32 %538, 0, !dbg !28 + %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1076 = icmp eq i32 %539, 0, !dbg !28 + %540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1080 = icmp eq i32 %541, 0, !dbg !28 + %542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1082 = icmp eq i32 %542, 0, !dbg !28 + %543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1084 = icmp eq i32 %543, 0, !dbg !28 + %544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1086 = icmp eq i32 %544, 0, !dbg !28 + %545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1090 = icmp eq i32 %546, 0, !dbg !28 + %547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1092 = icmp eq i32 %547, 0, !dbg !28 + %548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1094 = icmp eq i32 %548, 0, !dbg !28 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1096 = icmp eq i32 %549, 0, !dbg !28 + %550 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1100 = icmp eq i32 %551, 0, !dbg !28 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1102 = icmp eq i32 %552, 0, !dbg !28 + %553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1104 = icmp eq i32 %553, 0, !dbg !28 + %554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1106 = icmp eq i32 %554, 0, !dbg !28 + %555 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1110 = icmp eq i32 %556, 0, !dbg !28 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1112 = icmp eq i32 %557, 0, !dbg !28 + %558 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1114 = icmp eq i32 %558, 0, !dbg !28 + %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1116 = icmp eq i32 %559, 0, !dbg !28 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %561 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1120 = icmp eq i32 %561, 0, !dbg !28 + %562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1122 = icmp eq i32 %562, 0, !dbg !28 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1124 = icmp eq i32 %563, 0, !dbg !28 + %564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1126 = icmp eq i32 %564, 0, !dbg !28 + %565 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1130 = icmp eq i32 %566, 0, !dbg !28 + %567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1132 = icmp eq i32 %567, 0, !dbg !28 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1134 = icmp eq i32 %568, 0, !dbg !28 + %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1136 = icmp eq i32 %569, 0, !dbg !28 + %570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1140 = icmp eq i32 %571, 0, !dbg !28 + %572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1142 = icmp eq i32 %572, 0, !dbg !28 + %573 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1144 = icmp eq i32 %573, 0, !dbg !28 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1146 = icmp eq i32 %574, 0, !dbg !28 + %575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %576 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1150 = icmp eq i32 %576, 0, !dbg !28 + %577 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1152 = icmp eq i32 %577, 0, !dbg !28 + %578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1154 = icmp eq i32 %578, 0, !dbg !28 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1156 = icmp eq i32 %579, 0, !dbg !28 + %580 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1160 = icmp eq i32 %581, 0, !dbg !28 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1162 = icmp eq i32 %582, 0, !dbg !28 + %583 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1164 = icmp eq i32 %583, 0, !dbg !28 + %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1166 = icmp eq i32 %584, 0, !dbg !28 + %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1170 = icmp eq i32 %586, 0, !dbg !28 + %587 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1172 = icmp eq i32 %587, 0, !dbg !28 + %588 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1174 = icmp eq i32 %588, 0, !dbg !28 + %589 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %590 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1175 = select i1 %.not.i1174, float %590, float %589, !dbg !28 + %591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1176 = icmp eq i32 %591, 0, !dbg !28 + %592 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1175) #6, !dbg !28 + %593 = tail call float @llvm.nvvm.saturate.f(float %.02.i1175) #6, !dbg !28 + %.03.i1177 = select i1 %.not1.i1176, float %593, float %592, !dbg !28 + %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1178 = icmp eq i32 %594, 0, !dbg !28 + %595 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %596 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1179 = select i1 %.not2.i1178, float %596, float %595, !dbg !28 + %597 = fadd float %.04.i1179, 0xC168000FE0000000, !dbg !28 + %598 = fneg float %597, !dbg !28 + %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1180 = icmp eq i32 %599, 0, !dbg !28 + %600 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3FF7154760000000, float %598) #6, !dbg !28 + %601 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3FF7154760000000, float %598) #6, !dbg !28 + %.0.i1181 = select i1 %.not3.i1180, float %601, float %600, !dbg !28 + %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1182 = icmp eq i32 %602, 0, !dbg !28 + %603 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3E54AE0C00000000, float %.0.i1181) #6, !dbg !28 + %604 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3E54AE0C00000000, float %.0.i1181) #6, !dbg !28 + %.01.i1183 = select i1 %.not4.i1182, float %604, float %603, !dbg !28 + %605 = bitcast float %.04.i1179 to i32, !dbg !28 + %606 = shl i32 %605, 23, !dbg !28 + %607 = bitcast i32 %606 to float, !dbg !28 + %608 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1183) #6, !dbg !28 + %609 = fmul float %608, %607, !dbg !28 + %610 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1184 = icmp eq i32 %610, 0, !dbg !28 + %611 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %612 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1185 = select i1 %.not.i1184, float %612, float %611, !dbg !28 + %613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1186 = icmp eq i32 %613, 0, !dbg !28 + %614 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1185) #6, !dbg !28 + %615 = tail call float @llvm.nvvm.saturate.f(float %.02.i1185) #6, !dbg !28 + %.03.i1187 = select i1 %.not1.i1186, float %615, float %614, !dbg !28 + %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1188 = icmp eq i32 %616, 0, !dbg !28 + %617 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %618 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1189 = select i1 %.not2.i1188, float %618, float %617, !dbg !28 + %619 = fadd float %.04.i1189, 0xC168000FE0000000, !dbg !28 + %620 = fneg float %619, !dbg !28 + %621 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1190 = icmp eq i32 %621, 0, !dbg !28 + %622 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3FF7154760000000, float %620) #6, !dbg !28 + %623 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3FF7154760000000, float %620) #6, !dbg !28 + %.0.i1191 = select i1 %.not3.i1190, float %623, float %622, !dbg !28 + %624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1192 = icmp eq i32 %624, 0, !dbg !28 + %625 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3E54AE0C00000000, float %.0.i1191) #6, !dbg !28 + %626 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3E54AE0C00000000, float %.0.i1191) #6, !dbg !28 + %.01.i1193 = select i1 %.not4.i1192, float %626, float %625, !dbg !28 + %627 = bitcast float %.04.i1189 to i32, !dbg !28 + %628 = shl i32 %627, 23, !dbg !28 + %629 = bitcast i32 %628 to float, !dbg !28 + %630 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1193) #6, !dbg !28 + %631 = fmul float %630, %629, !dbg !28 + %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1194 = icmp eq i32 %632, 0, !dbg !28 + %633 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %634 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1195 = select i1 %.not.i1194, float %634, float %633, !dbg !28 + %635 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1196 = icmp eq i32 %635, 0, !dbg !28 + %636 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1195) #6, !dbg !28 + %637 = tail call float @llvm.nvvm.saturate.f(float %.02.i1195) #6, !dbg !28 + %.03.i1197 = select i1 %.not1.i1196, float %637, float %636, !dbg !28 + %638 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1198 = icmp eq i32 %638, 0, !dbg !28 + %639 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %640 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1199 = select i1 %.not2.i1198, float %640, float %639, !dbg !28 + %641 = fadd float %.04.i1199, 0xC168000FE0000000, !dbg !28 + %642 = fneg float %641, !dbg !28 + %643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1200 = icmp eq i32 %643, 0, !dbg !28 + %644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3FF7154760000000, float %642) #6, !dbg !28 + %645 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3FF7154760000000, float %642) #6, !dbg !28 + %.0.i1201 = select i1 %.not3.i1200, float %645, float %644, !dbg !28 + %646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1202 = icmp eq i32 %646, 0, !dbg !28 + %647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3E54AE0C00000000, float %.0.i1201) #6, !dbg !28 + %648 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3E54AE0C00000000, float %.0.i1201) #6, !dbg !28 + %.01.i1203 = select i1 %.not4.i1202, float %648, float %647, !dbg !28 + %649 = bitcast float %.04.i1199 to i32, !dbg !28 + %650 = shl i32 %649, 23, !dbg !28 + %651 = bitcast i32 %650 to float, !dbg !28 + %652 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1203) #6, !dbg !28 + %653 = fmul float %652, %651, !dbg !28 + %654 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1204 = icmp eq i32 %654, 0, !dbg !28 + %655 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %656 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1205 = select i1 %.not.i1204, float %656, float %655, !dbg !28 + %657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1206 = icmp eq i32 %657, 0, !dbg !28 + %658 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1205) #6, !dbg !28 + %659 = tail call float @llvm.nvvm.saturate.f(float %.02.i1205) #6, !dbg !28 + %.03.i1207 = select i1 %.not1.i1206, float %659, float %658, !dbg !28 + %660 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1208 = icmp eq i32 %660, 0, !dbg !28 + %661 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %662 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1209 = select i1 %.not2.i1208, float %662, float %661, !dbg !28 + %663 = fadd float %.04.i1209, 0xC168000FE0000000, !dbg !28 + %664 = fneg float %663, !dbg !28 + %665 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1210 = icmp eq i32 %665, 0, !dbg !28 + %666 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3FF7154760000000, float %664) #6, !dbg !28 + %667 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3FF7154760000000, float %664) #6, !dbg !28 + %.0.i1211 = select i1 %.not3.i1210, float %667, float %666, !dbg !28 + %668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1212 = icmp eq i32 %668, 0, !dbg !28 + %669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3E54AE0C00000000, float %.0.i1211) #6, !dbg !28 + %670 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3E54AE0C00000000, float %.0.i1211) #6, !dbg !28 + %.01.i1213 = select i1 %.not4.i1212, float %670, float %669, !dbg !28 + %671 = bitcast float %.04.i1209 to i32, !dbg !28 + %672 = shl i32 %671, 23, !dbg !28 + %673 = bitcast i32 %672 to float, !dbg !28 + %674 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1213) #6, !dbg !28 + %675 = fmul float %674, %673, !dbg !28 + %676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1214 = icmp eq i32 %676, 0, !dbg !28 + %677 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %678 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1215 = select i1 %.not.i1214, float %678, float %677, !dbg !28 + %679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1216 = icmp eq i32 %679, 0, !dbg !28 + %680 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1215) #6, !dbg !28 + %681 = tail call float @llvm.nvvm.saturate.f(float %.02.i1215) #6, !dbg !28 + %.03.i1217 = select i1 %.not1.i1216, float %681, float %680, !dbg !28 + %682 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1218 = icmp eq i32 %682, 0, !dbg !28 + %683 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %684 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1219 = select i1 %.not2.i1218, float %684, float %683, !dbg !28 + %685 = fadd float %.04.i1219, 0xC168000FE0000000, !dbg !28 + %686 = fneg float %685, !dbg !28 + %687 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1220 = icmp eq i32 %687, 0, !dbg !28 + %688 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3FF7154760000000, float %686) #6, !dbg !28 + %689 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3FF7154760000000, float %686) #6, !dbg !28 + %.0.i1221 = select i1 %.not3.i1220, float %689, float %688, !dbg !28 + %690 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1222 = icmp eq i32 %690, 0, !dbg !28 + %691 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3E54AE0C00000000, float %.0.i1221) #6, !dbg !28 + %692 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3E54AE0C00000000, float %.0.i1221) #6, !dbg !28 + %.01.i1223 = select i1 %.not4.i1222, float %692, float %691, !dbg !28 + %693 = bitcast float %.04.i1219 to i32, !dbg !28 + %694 = shl i32 %693, 23, !dbg !28 + %695 = bitcast i32 %694 to float, !dbg !28 + %696 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1223) #6, !dbg !28 + %697 = fmul float %696, %695, !dbg !28 + %698 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1224 = icmp eq i32 %698, 0, !dbg !28 + %699 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %700 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1225 = select i1 %.not.i1224, float %700, float %699, !dbg !28 + %701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1226 = icmp eq i32 %701, 0, !dbg !28 + %702 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1225) #6, !dbg !28 + %703 = tail call float @llvm.nvvm.saturate.f(float %.02.i1225) #6, !dbg !28 + %.03.i1227 = select i1 %.not1.i1226, float %703, float %702, !dbg !28 + %704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1228 = icmp eq i32 %704, 0, !dbg !28 + %705 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %706 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1229 = select i1 %.not2.i1228, float %706, float %705, !dbg !28 + %707 = fadd float %.04.i1229, 0xC168000FE0000000, !dbg !28 + %708 = fneg float %707, !dbg !28 + %709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1230 = icmp eq i32 %709, 0, !dbg !28 + %710 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3FF7154760000000, float %708) #6, !dbg !28 + %711 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3FF7154760000000, float %708) #6, !dbg !28 + %.0.i1231 = select i1 %.not3.i1230, float %711, float %710, !dbg !28 + %712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1232 = icmp eq i32 %712, 0, !dbg !28 + %713 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3E54AE0C00000000, float %.0.i1231) #6, !dbg !28 + %714 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3E54AE0C00000000, float %.0.i1231) #6, !dbg !28 + %.01.i1233 = select i1 %.not4.i1232, float %714, float %713, !dbg !28 + %715 = bitcast float %.04.i1229 to i32, !dbg !28 + %716 = shl i32 %715, 23, !dbg !28 + %717 = bitcast i32 %716 to float, !dbg !28 + %718 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1233) #6, !dbg !28 + %719 = fmul float %718, %717, !dbg !28 + %720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1234 = icmp eq i32 %720, 0, !dbg !28 + %721 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %722 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1235 = select i1 %.not.i1234, float %722, float %721, !dbg !28 + %723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1236 = icmp eq i32 %723, 0, !dbg !28 + %724 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1235) #6, !dbg !28 + %725 = tail call float @llvm.nvvm.saturate.f(float %.02.i1235) #6, !dbg !28 + %.03.i1237 = select i1 %.not1.i1236, float %725, float %724, !dbg !28 + %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1238 = icmp eq i32 %726, 0, !dbg !28 + %727 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %728 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1239 = select i1 %.not2.i1238, float %728, float %727, !dbg !28 + %729 = fadd float %.04.i1239, 0xC168000FE0000000, !dbg !28 + %730 = fneg float %729, !dbg !28 + %731 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1240 = icmp eq i32 %731, 0, !dbg !28 + %732 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3FF7154760000000, float %730) #6, !dbg !28 + %733 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3FF7154760000000, float %730) #6, !dbg !28 + %.0.i1241 = select i1 %.not3.i1240, float %733, float %732, !dbg !28 + %734 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1242 = icmp eq i32 %734, 0, !dbg !28 + %735 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3E54AE0C00000000, float %.0.i1241) #6, !dbg !28 + %736 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3E54AE0C00000000, float %.0.i1241) #6, !dbg !28 + %.01.i1243 = select i1 %.not4.i1242, float %736, float %735, !dbg !28 + %737 = bitcast float %.04.i1239 to i32, !dbg !28 + %738 = shl i32 %737, 23, !dbg !28 + %739 = bitcast i32 %738 to float, !dbg !28 + %740 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1243) #6, !dbg !28 + %741 = fmul float %740, %739, !dbg !28 + %742 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1244 = icmp eq i32 %742, 0, !dbg !28 + %743 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %744 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1245 = select i1 %.not.i1244, float %744, float %743, !dbg !28 + %745 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1246 = icmp eq i32 %745, 0, !dbg !28 + %746 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1245) #6, !dbg !28 + %747 = tail call float @llvm.nvvm.saturate.f(float %.02.i1245) #6, !dbg !28 + %.03.i1247 = select i1 %.not1.i1246, float %747, float %746, !dbg !28 + %748 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1248 = icmp eq i32 %748, 0, !dbg !28 + %749 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %750 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1249 = select i1 %.not2.i1248, float %750, float %749, !dbg !28 + %751 = fadd float %.04.i1249, 0xC168000FE0000000, !dbg !28 + %752 = fneg float %751, !dbg !28 + %753 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1250 = icmp eq i32 %753, 0, !dbg !28 + %754 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3FF7154760000000, float %752) #6, !dbg !28 + %755 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3FF7154760000000, float %752) #6, !dbg !28 + %.0.i1251 = select i1 %.not3.i1250, float %755, float %754, !dbg !28 + %756 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1252 = icmp eq i32 %756, 0, !dbg !28 + %757 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3E54AE0C00000000, float %.0.i1251) #6, !dbg !28 + %758 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3E54AE0C00000000, float %.0.i1251) #6, !dbg !28 + %.01.i1253 = select i1 %.not4.i1252, float %758, float %757, !dbg !28 + %759 = bitcast float %.04.i1249 to i32, !dbg !28 + %760 = shl i32 %759, 23, !dbg !28 + %761 = bitcast i32 %760 to float, !dbg !28 + %762 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1253) #6, !dbg !28 + %763 = fmul float %762, %761, !dbg !28 + %764 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1254 = icmp eq i32 %764, 0, !dbg !28 + %765 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %766 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1255 = select i1 %.not.i1254, float %766, float %765, !dbg !28 + %767 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1256 = icmp eq i32 %767, 0, !dbg !28 + %768 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1255) #6, !dbg !28 + %769 = tail call float @llvm.nvvm.saturate.f(float %.02.i1255) #6, !dbg !28 + %.03.i1257 = select i1 %.not1.i1256, float %769, float %768, !dbg !28 + %770 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1258 = icmp eq i32 %770, 0, !dbg !28 + %771 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %772 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1259 = select i1 %.not2.i1258, float %772, float %771, !dbg !28 + %773 = fadd float %.04.i1259, 0xC168000FE0000000, !dbg !28 + %774 = fneg float %773, !dbg !28 + %775 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1260 = icmp eq i32 %775, 0, !dbg !28 + %776 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3FF7154760000000, float %774) #6, !dbg !28 + %777 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3FF7154760000000, float %774) #6, !dbg !28 + %.0.i1261 = select i1 %.not3.i1260, float %777, float %776, !dbg !28 + %778 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1262 = icmp eq i32 %778, 0, !dbg !28 + %779 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3E54AE0C00000000, float %.0.i1261) #6, !dbg !28 + %780 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3E54AE0C00000000, float %.0.i1261) #6, !dbg !28 + %.01.i1263 = select i1 %.not4.i1262, float %780, float %779, !dbg !28 + %781 = bitcast float %.04.i1259 to i32, !dbg !28 + %782 = shl i32 %781, 23, !dbg !28 + %783 = bitcast i32 %782 to float, !dbg !28 + %784 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1263) #6, !dbg !28 + %785 = fmul float %784, %783, !dbg !28 + %786 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1264 = icmp eq i32 %786, 0, !dbg !28 + %787 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %788 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1265 = select i1 %.not.i1264, float %788, float %787, !dbg !28 + %789 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1266 = icmp eq i32 %789, 0, !dbg !28 + %790 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1265) #6, !dbg !28 + %791 = tail call float @llvm.nvvm.saturate.f(float %.02.i1265) #6, !dbg !28 + %.03.i1267 = select i1 %.not1.i1266, float %791, float %790, !dbg !28 + %792 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1268 = icmp eq i32 %792, 0, !dbg !28 + %793 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %794 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1269 = select i1 %.not2.i1268, float %794, float %793, !dbg !28 + %795 = fadd float %.04.i1269, 0xC168000FE0000000, !dbg !28 + %796 = fneg float %795, !dbg !28 + %797 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1270 = icmp eq i32 %797, 0, !dbg !28 + %798 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3FF7154760000000, float %796) #6, !dbg !28 + %799 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3FF7154760000000, float %796) #6, !dbg !28 + %.0.i1271 = select i1 %.not3.i1270, float %799, float %798, !dbg !28 + %800 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1272 = icmp eq i32 %800, 0, !dbg !28 + %801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3E54AE0C00000000, float %.0.i1271) #6, !dbg !28 + %802 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3E54AE0C00000000, float %.0.i1271) #6, !dbg !28 + %.01.i1273 = select i1 %.not4.i1272, float %802, float %801, !dbg !28 + %803 = bitcast float %.04.i1269 to i32, !dbg !28 + %804 = shl i32 %803, 23, !dbg !28 + %805 = bitcast i32 %804 to float, !dbg !28 + %806 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1273) #6, !dbg !28 + %807 = fmul float %806, %805, !dbg !28 + %808 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1274 = icmp eq i32 %808, 0, !dbg !28 + %809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1276 = icmp eq i32 %809, 0, !dbg !28 + %810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %811 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1280 = icmp eq i32 %811, 0, !dbg !28 + %812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1282 = icmp eq i32 %812, 0, !dbg !28 + %813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1284 = icmp eq i32 %813, 0, !dbg !28 + %814 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1286 = icmp eq i32 %814, 0, !dbg !28 + %815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %816 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1290 = icmp eq i32 %816, 0, !dbg !28 + %817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1292 = icmp eq i32 %817, 0, !dbg !28 + %818 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1294 = icmp eq i32 %818, 0, !dbg !28 + %819 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1296 = icmp eq i32 %819, 0, !dbg !28 + %820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1300 = icmp eq i32 %821, 0, !dbg !28 + %822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1302 = icmp eq i32 %822, 0, !dbg !28 + %823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1304 = icmp eq i32 %823, 0, !dbg !28 + %824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1306 = icmp eq i32 %824, 0, !dbg !28 + %825 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1310 = icmp eq i32 %826, 0, !dbg !28 + %827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1312 = icmp eq i32 %827, 0, !dbg !28 + %828 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1314 = icmp eq i32 %828, 0, !dbg !28 + %829 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1316 = icmp eq i32 %829, 0, !dbg !28 + %830 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1320 = icmp eq i32 %831, 0, !dbg !28 + %832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1322 = icmp eq i32 %832, 0, !dbg !28 + %833 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1324 = icmp eq i32 %833, 0, !dbg !28 + %834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1326 = icmp eq i32 %834, 0, !dbg !28 + %835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %836 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1330 = icmp eq i32 %836, 0, !dbg !28 + %837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1332 = icmp eq i32 %837, 0, !dbg !28 + %838 = select i1 %131, float 1.000000e+00, float %609, !dbg !30 + %839 = select i1 %133, float 1.000000e+00, float %631, !dbg !30 + %840 = select i1 %135, float 1.000000e+00, float %653, !dbg !30 + %841 = select i1 %137, float 1.000000e+00, float %675, !dbg !30 + %842 = select i1 %139, float 1.000000e+00, float %697, !dbg !30 + %843 = select i1 %141, float 1.000000e+00, float %719, !dbg !30 + %844 = select i1 %143, float 1.000000e+00, float %741, !dbg !30 + %845 = select i1 %145, float 1.000000e+00, float %763, !dbg !30 + %846 = select i1 %146, float 1.000000e+00, float %785, !dbg !30 + %847 = select i1 %147, float 1.000000e+00, float %807, !dbg !30 + %848 = shufflevector <2 x bfloat> %63, <2 x bfloat> %65, <16 x i32> , !dbg !17 + %849 = shufflevector <2 x bfloat> %67, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %850 = shufflevector <16 x bfloat> %848, <16 x bfloat> %849, <16 x i32> , !dbg !17 + %851 = shufflevector <2 x bfloat> %67, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %852 = shufflevector <16 x bfloat> %850, <16 x bfloat> %851, <16 x i32> , !dbg !17 + %853 = shufflevector <2 x bfloat> %69, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %854 = shufflevector <16 x bfloat> %852, <16 x bfloat> %853, <16 x i32> , !dbg !17 + %855 = shufflevector <2 x bfloat> %69, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %856 = shufflevector <16 x bfloat> %854, <16 x bfloat> %855, <16 x i32> , !dbg !17 + %857 = shufflevector <2 x bfloat> %73, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %858 = shufflevector <16 x bfloat> %856, <16 x bfloat> %857, <16 x i32> , !dbg !17 + %859 = shufflevector <2 x bfloat> %73, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %860 = shufflevector <16 x bfloat> %858, <16 x bfloat> %859, <16 x i32> , !dbg !17 + %861 = shufflevector <2 x bfloat> %75, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %862 = shufflevector <16 x bfloat> %860, <16 x bfloat> %861, <16 x i32> , !dbg !17 + %863 = shufflevector <2 x bfloat> %75, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %864 = shufflevector <16 x bfloat> %862, <16 x bfloat> %863, <16 x i32> , !dbg !17 + %865 = shufflevector <2 x bfloat> %77, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %866 = shufflevector <16 x bfloat> %864, <16 x bfloat> %865, <16 x i32> , !dbg !17 + %867 = shufflevector <2 x bfloat> %77, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %868 = shufflevector <16 x bfloat> %866, <16 x bfloat> %867, <16 x i32> , !dbg !17 + %869 = shufflevector <2 x bfloat> %79, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %870 = shufflevector <16 x bfloat> %868, <16 x bfloat> %869, <16 x i32> , !dbg !17 + %871 = shufflevector <2 x bfloat> %79, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %872 = shufflevector <16 x bfloat> %870, <16 x bfloat> %871, <16 x i32> , !dbg !17 + %873 = fpext <16 x bfloat> %872 to <16 x float>, !dbg !17 + %874 = fcmp ogt <16 x float> %37, %873, !dbg !18 + %875 = or <16 x i1> %106, %874, !dbg !23 + %876 = select <16 x i1> %875, <16 x float> %37, <16 x float> %873, !dbg !24 + %877 = fcmp oeq <16 x float> %876, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1557 = fsub <16 x float> %37, %876, !dbg !27 + %878 = extractelement <16 x float> %foldExtExtBinop1557, i64 0, !dbg !27 + %foldExtExtBinop1559 = fsub <16 x float> %37, %876, !dbg !27 + %879 = extractelement <16 x float> %foldExtExtBinop1559, i64 1, !dbg !27 + %foldExtExtBinop1561 = fsub <16 x float> %37, %876, !dbg !27 + %880 = extractelement <16 x float> %foldExtExtBinop1561, i64 2, !dbg !27 + %foldExtExtBinop1563 = fsub <16 x float> %37, %876, !dbg !27 + %881 = extractelement <16 x float> %foldExtExtBinop1563, i64 3, !dbg !27 + %foldExtExtBinop1565 = fsub <16 x float> %37, %876, !dbg !27 + %882 = extractelement <16 x float> %foldExtExtBinop1565, i64 4, !dbg !27 + %foldExtExtBinop1567 = fsub <16 x float> %37, %876, !dbg !27 + %883 = extractelement <16 x float> %foldExtExtBinop1567, i64 5, !dbg !27 + %foldExtExtBinop1569 = fsub <16 x float> %37, %876, !dbg !27 + %884 = extractelement <16 x float> %foldExtExtBinop1569, i64 6, !dbg !27 + %foldExtExtBinop1571 = fsub <16 x float> %37, %876, !dbg !27 + %885 = extractelement <16 x float> %foldExtExtBinop1571, i64 7, !dbg !27 + %foldExtExtBinop1573 = fsub <16 x float> %37, %876, !dbg !27 + %886 = extractelement <16 x float> %foldExtExtBinop1573, i64 8, !dbg !27 + %foldExtExtBinop1575 = fsub <16 x float> %37, %876, !dbg !27 + %887 = extractelement <16 x float> %foldExtExtBinop1575, i64 9, !dbg !27 + %foldExtExtBinop1577 = fsub <16 x float> %37, %876, !dbg !27 + %888 = extractelement <16 x float> %foldExtExtBinop1577, i64 10, !dbg !27 + %foldExtExtBinop1579 = fsub <16 x float> %37, %876, !dbg !27 + %889 = extractelement <16 x float> %foldExtExtBinop1579, i64 11, !dbg !27 + %foldExtExtBinop1581 = fsub <16 x float> %37, %876, !dbg !27 + %890 = extractelement <16 x float> %foldExtExtBinop1581, i64 12, !dbg !27 + %foldExtExtBinop1583 = fsub <16 x float> %37, %876, !dbg !27 + %891 = extractelement <16 x float> %foldExtExtBinop1583, i64 13, !dbg !27 + %foldExtExtBinop1585 = fsub <16 x float> %37, %876, !dbg !27 + %892 = extractelement <16 x float> %foldExtExtBinop1585, i64 14, !dbg !27 + %foldExtExtBinop1587 = fsub <16 x float> %37, %876, !dbg !27 + %893 = extractelement <16 x float> %foldExtExtBinop1587, i64 15, !dbg !27 + %894 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %895 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i695 = select i1 %.not.i694, float %895, float %894, !dbg !28 + %896 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i695) #6, !dbg !28 + %897 = tail call float @llvm.nvvm.saturate.f(float %.02.i695) #6, !dbg !28 + %.03.i697 = select i1 %.not1.i696, float %897, float %896, !dbg !28 + %898 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i697, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %899 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i697, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %900 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %901 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i705 = select i1 %.not.i704, float %901, float %900, !dbg !28 + %902 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i705) #6, !dbg !28 + %903 = tail call float @llvm.nvvm.saturate.f(float %.02.i705) #6, !dbg !28 + %.03.i707 = select i1 %.not1.i706, float %903, float %902, !dbg !28 + %904 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i707, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %905 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i707, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %906 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %907 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i715 = select i1 %.not.i714, float %907, float %906, !dbg !28 + %908 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i715) #6, !dbg !28 + %909 = tail call float @llvm.nvvm.saturate.f(float %.02.i715) #6, !dbg !28 + %.03.i717 = select i1 %.not1.i716, float %909, float %908, !dbg !28 + %910 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i717, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %911 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i717, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %913 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i725 = select i1 %.not.i724, float %913, float %912, !dbg !28 + %914 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i725) #6, !dbg !28 + %915 = tail call float @llvm.nvvm.saturate.f(float %.02.i725) #6, !dbg !28 + %.03.i727 = select i1 %.not1.i726, float %915, float %914, !dbg !28 + %916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i727, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i727, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %918 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %919 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i735 = select i1 %.not.i734, float %919, float %918, !dbg !28 + %920 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i735) #6, !dbg !28 + %921 = tail call float @llvm.nvvm.saturate.f(float %.02.i735) #6, !dbg !28 + %.03.i737 = select i1 %.not1.i736, float %921, float %920, !dbg !28 + %922 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i737, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %923 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i737, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %925 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i745 = select i1 %.not.i744, float %925, float %924, !dbg !28 + %926 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i745) #6, !dbg !28 + %927 = tail call float @llvm.nvvm.saturate.f(float %.02.i745) #6, !dbg !28 + %.03.i747 = select i1 %.not1.i746, float %927, float %926, !dbg !28 + %928 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i747, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %929 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i747, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %930 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %931 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i755 = select i1 %.not.i754, float %931, float %930, !dbg !28 + %932 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i755) #6, !dbg !28 + %933 = tail call float @llvm.nvvm.saturate.f(float %.02.i755) #6, !dbg !28 + %.03.i757 = select i1 %.not1.i756, float %933, float %932, !dbg !28 + %934 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i757, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %935 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i757, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %936 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %937 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i765 = select i1 %.not.i764, float %937, float %936, !dbg !28 + %938 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i765) #6, !dbg !28 + %939 = tail call float @llvm.nvvm.saturate.f(float %.02.i765) #6, !dbg !28 + %.03.i767 = select i1 %.not1.i766, float %939, float %938, !dbg !28 + %940 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i767, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %941 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i767, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %942 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %943 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i775 = select i1 %.not.i774, float %943, float %942, !dbg !28 + %944 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i775) #6, !dbg !28 + %945 = tail call float @llvm.nvvm.saturate.f(float %.02.i775) #6, !dbg !28 + %.03.i777 = select i1 %.not1.i776, float %945, float %944, !dbg !28 + %946 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i777, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %947 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i777, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %948 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %949 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i785 = select i1 %.not.i784, float %949, float %948, !dbg !28 + %950 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i785) #6, !dbg !28 + %951 = tail call float @llvm.nvvm.saturate.f(float %.02.i785) #6, !dbg !28 + %.03.i787 = select i1 %.not1.i786, float %951, float %950, !dbg !28 + %952 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i787, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %953 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i787, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %955 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i795 = select i1 %.not.i794, float %955, float %954, !dbg !28 + %956 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i795) #6, !dbg !28 + %957 = tail call float @llvm.nvvm.saturate.f(float %.02.i795) #6, !dbg !28 + %.03.i797 = select i1 %.not1.i796, float %957, float %956, !dbg !28 + %958 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i797, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %959 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i797, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %960 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %961 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i805 = select i1 %.not.i804, float %961, float %960, !dbg !28 + %962 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i805) #6, !dbg !28 + %963 = tail call float @llvm.nvvm.saturate.f(float %.02.i805) #6, !dbg !28 + %.03.i807 = select i1 %.not1.i806, float %963, float %962, !dbg !28 + %964 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i807, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %965 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i807, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %966 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %967 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i815 = select i1 %.not.i814, float %967, float %966, !dbg !28 + %968 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i815) #6, !dbg !28 + %969 = tail call float @llvm.nvvm.saturate.f(float %.02.i815) #6, !dbg !28 + %.03.i817 = select i1 %.not1.i816, float %969, float %968, !dbg !28 + %970 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i817, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %971 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i817, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %972 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %973 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i825 = select i1 %.not.i824, float %973, float %972, !dbg !28 + %974 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i825) #6, !dbg !28 + %975 = tail call float @llvm.nvvm.saturate.f(float %.02.i825) #6, !dbg !28 + %.03.i827 = select i1 %.not1.i826, float %975, float %974, !dbg !28 + %976 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i827, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %977 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i827, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %979 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i835 = select i1 %.not.i834, float %979, float %978, !dbg !28 + %980 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i835) #6, !dbg !28 + %981 = tail call float @llvm.nvvm.saturate.f(float %.02.i835) #6, !dbg !28 + %.03.i837 = select i1 %.not1.i836, float %981, float %980, !dbg !28 + %982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i837, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i837, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %984 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %985 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i845 = select i1 %.not.i844, float %985, float %984, !dbg !28 + %986 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i845) #6, !dbg !28 + %987 = tail call float @llvm.nvvm.saturate.f(float %.02.i845) #6, !dbg !28 + %.03.i847 = select i1 %.not1.i846, float %987, float %986, !dbg !28 + %988 = insertelement <16 x i32> poison, i32 %160, i64 0, !dbg !28 + %989 = insertelement <16 x i32> %988, i32 %165, i64 1, !dbg !28 + %990 = insertelement <16 x i32> %989, i32 %170, i64 2, !dbg !28 + %991 = insertelement <16 x i32> %990, i32 %175, i64 3, !dbg !28 + %992 = insertelement <16 x i32> %991, i32 %180, i64 4, !dbg !28 + %993 = insertelement <16 x i32> %992, i32 %185, i64 5, !dbg !28 + %994 = insertelement <16 x i32> %993, i32 %190, i64 6, !dbg !28 + %995 = insertelement <16 x i32> %994, i32 %195, i64 7, !dbg !28 + %996 = insertelement <16 x i32> %995, i32 %200, i64 8, !dbg !28 + %997 = insertelement <16 x i32> %996, i32 %205, i64 9, !dbg !28 + %998 = insertelement <16 x i32> %997, i32 %210, i64 10, !dbg !28 + %999 = insertelement <16 x i32> %998, i32 %215, i64 11, !dbg !28 + %1000 = insertelement <16 x i32> %999, i32 %220, i64 12, !dbg !28 + %1001 = insertelement <16 x i32> %1000, i32 %225, i64 13, !dbg !28 + %1002 = insertelement <16 x i32> %1001, i32 %230, i64 14, !dbg !28 + %1003 = insertelement <16 x i32> %1002, i32 %235, i64 15, !dbg !28 + %1004 = icmp eq <16 x i32> %1003, zeroinitializer, !dbg !28 + %1005 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i847, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1006 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i847, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1007 = insertelement <16 x float> poison, float %899, i64 0, !dbg !28 + %1008 = insertelement <16 x float> %1007, float %905, i64 1, !dbg !28 + %1009 = insertelement <16 x float> %1008, float %911, i64 2, !dbg !28 + %1010 = insertelement <16 x float> %1009, float %917, i64 3, !dbg !28 + %1011 = insertelement <16 x float> %1010, float %923, i64 4, !dbg !28 + %1012 = insertelement <16 x float> %1011, float %929, i64 5, !dbg !28 + %1013 = insertelement <16 x float> %1012, float %935, i64 6, !dbg !28 + %1014 = insertelement <16 x float> %1013, float %941, i64 7, !dbg !28 + %1015 = insertelement <16 x float> %1014, float %947, i64 8, !dbg !28 + %1016 = insertelement <16 x float> %1015, float %953, i64 9, !dbg !28 + %1017 = insertelement <16 x float> %1016, float %959, i64 10, !dbg !28 + %1018 = insertelement <16 x float> %1017, float %965, i64 11, !dbg !28 + %1019 = insertelement <16 x float> %1018, float %971, i64 12, !dbg !28 + %1020 = insertelement <16 x float> %1019, float %977, i64 13, !dbg !28 + %1021 = insertelement <16 x float> %1020, float %983, i64 14, !dbg !28 + %1022 = insertelement <16 x float> %1021, float %1006, i64 15, !dbg !28 + %1023 = insertelement <16 x float> poison, float %898, i64 0, !dbg !28 + %1024 = insertelement <16 x float> %1023, float %904, i64 1, !dbg !28 + %1025 = insertelement <16 x float> %1024, float %910, i64 2, !dbg !28 + %1026 = insertelement <16 x float> %1025, float %916, i64 3, !dbg !28 + %1027 = insertelement <16 x float> %1026, float %922, i64 4, !dbg !28 + %1028 = insertelement <16 x float> %1027, float %928, i64 5, !dbg !28 + %1029 = insertelement <16 x float> %1028, float %934, i64 6, !dbg !28 + %1030 = insertelement <16 x float> %1029, float %940, i64 7, !dbg !28 + %1031 = insertelement <16 x float> %1030, float %946, i64 8, !dbg !28 + %1032 = insertelement <16 x float> %1031, float %952, i64 9, !dbg !28 + %1033 = insertelement <16 x float> %1032, float %958, i64 10, !dbg !28 + %1034 = insertelement <16 x float> %1033, float %964, i64 11, !dbg !28 + %1035 = insertelement <16 x float> %1034, float %970, i64 12, !dbg !28 + %1036 = insertelement <16 x float> %1035, float %976, i64 13, !dbg !28 + %1037 = insertelement <16 x float> %1036, float %982, i64 14, !dbg !28 + %1038 = insertelement <16 x float> %1037, float %1005, i64 15, !dbg !28 + %1039 = select <16 x i1> %1004, <16 x float> %1022, <16 x float> %1038, !dbg !28 + %1040 = extractelement <16 x float> %1039, i64 0, !dbg !28 + %1041 = fadd float %1040, 0xC168000FE0000000, !dbg !28 + %1042 = fneg float %1041, !dbg !28 + %1043 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3FF7154760000000, float %1042) #6, !dbg !28 + %1044 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3FF7154760000000, float %1042) #6, !dbg !28 + %.0.i701 = select i1 %.not3.i700, float %1044, float %1043, !dbg !28 + %1045 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3E54AE0C00000000, float %.0.i701) #6, !dbg !28 + %1046 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3E54AE0C00000000, float %.0.i701) #6, !dbg !28 + %.01.i703 = select i1 %.not4.i702, float %1046, float %1045, !dbg !28 + %1047 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i703) #6, !dbg !28 + %1048 = extractelement <16 x float> %1039, i64 1, !dbg !28 + %1049 = fadd float %1048, 0xC168000FE0000000, !dbg !28 + %1050 = fneg float %1049, !dbg !28 + %1051 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3FF7154760000000, float %1050) #6, !dbg !28 + %1052 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3FF7154760000000, float %1050) #6, !dbg !28 + %.0.i711 = select i1 %.not3.i710, float %1052, float %1051, !dbg !28 + %1053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3E54AE0C00000000, float %.0.i711) #6, !dbg !28 + %1054 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3E54AE0C00000000, float %.0.i711) #6, !dbg !28 + %.01.i713 = select i1 %.not4.i712, float %1054, float %1053, !dbg !28 + %1055 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i713) #6, !dbg !28 + %1056 = extractelement <16 x float> %1039, i64 2, !dbg !28 + %1057 = fadd float %1056, 0xC168000FE0000000, !dbg !28 + %1058 = fneg float %1057, !dbg !28 + %1059 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3FF7154760000000, float %1058) #6, !dbg !28 + %1060 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3FF7154760000000, float %1058) #6, !dbg !28 + %.0.i721 = select i1 %.not3.i720, float %1060, float %1059, !dbg !28 + %1061 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3E54AE0C00000000, float %.0.i721) #6, !dbg !28 + %1062 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3E54AE0C00000000, float %.0.i721) #6, !dbg !28 + %.01.i723 = select i1 %.not4.i722, float %1062, float %1061, !dbg !28 + %1063 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i723) #6, !dbg !28 + %1064 = extractelement <16 x float> %1039, i64 3, !dbg !28 + %1065 = fadd float %1064, 0xC168000FE0000000, !dbg !28 + %1066 = fneg float %1065, !dbg !28 + %1067 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3FF7154760000000, float %1066) #6, !dbg !28 + %1068 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3FF7154760000000, float %1066) #6, !dbg !28 + %.0.i731 = select i1 %.not3.i730, float %1068, float %1067, !dbg !28 + %1069 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3E54AE0C00000000, float %.0.i731) #6, !dbg !28 + %1070 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3E54AE0C00000000, float %.0.i731) #6, !dbg !28 + %.01.i733 = select i1 %.not4.i732, float %1070, float %1069, !dbg !28 + %1071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i733) #6, !dbg !28 + %1072 = extractelement <16 x float> %1039, i64 4, !dbg !28 + %1073 = fadd float %1072, 0xC168000FE0000000, !dbg !28 + %1074 = fneg float %1073, !dbg !28 + %1075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3FF7154760000000, float %1074) #6, !dbg !28 + %1076 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3FF7154760000000, float %1074) #6, !dbg !28 + %.0.i741 = select i1 %.not3.i740, float %1076, float %1075, !dbg !28 + %1077 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3E54AE0C00000000, float %.0.i741) #6, !dbg !28 + %1078 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3E54AE0C00000000, float %.0.i741) #6, !dbg !28 + %.01.i743 = select i1 %.not4.i742, float %1078, float %1077, !dbg !28 + %1079 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i743) #6, !dbg !28 + %1080 = extractelement <16 x float> %1039, i64 5, !dbg !28 + %1081 = fadd float %1080, 0xC168000FE0000000, !dbg !28 + %1082 = fneg float %1081, !dbg !28 + %1083 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3FF7154760000000, float %1082) #6, !dbg !28 + %1084 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3FF7154760000000, float %1082) #6, !dbg !28 + %.0.i751 = select i1 %.not3.i750, float %1084, float %1083, !dbg !28 + %1085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3E54AE0C00000000, float %.0.i751) #6, !dbg !28 + %1086 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3E54AE0C00000000, float %.0.i751) #6, !dbg !28 + %.01.i753 = select i1 %.not4.i752, float %1086, float %1085, !dbg !28 + %1087 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i753) #6, !dbg !28 + %1088 = extractelement <16 x float> %1039, i64 6, !dbg !28 + %1089 = fadd float %1088, 0xC168000FE0000000, !dbg !28 + %1090 = fneg float %1089, !dbg !28 + %1091 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3FF7154760000000, float %1090) #6, !dbg !28 + %1092 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3FF7154760000000, float %1090) #6, !dbg !28 + %.0.i761 = select i1 %.not3.i760, float %1092, float %1091, !dbg !28 + %1093 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3E54AE0C00000000, float %.0.i761) #6, !dbg !28 + %1094 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3E54AE0C00000000, float %.0.i761) #6, !dbg !28 + %.01.i763 = select i1 %.not4.i762, float %1094, float %1093, !dbg !28 + %1095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i763) #6, !dbg !28 + %1096 = extractelement <16 x float> %1039, i64 7, !dbg !28 + %1097 = fadd float %1096, 0xC168000FE0000000, !dbg !28 + %1098 = fneg float %1097, !dbg !28 + %1099 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3FF7154760000000, float %1098) #6, !dbg !28 + %1100 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3FF7154760000000, float %1098) #6, !dbg !28 + %.0.i771 = select i1 %.not3.i770, float %1100, float %1099, !dbg !28 + %1101 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3E54AE0C00000000, float %.0.i771) #6, !dbg !28 + %1102 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3E54AE0C00000000, float %.0.i771) #6, !dbg !28 + %.01.i773 = select i1 %.not4.i772, float %1102, float %1101, !dbg !28 + %1103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i773) #6, !dbg !28 + %1104 = extractelement <16 x float> %1039, i64 8, !dbg !28 + %1105 = fadd float %1104, 0xC168000FE0000000, !dbg !28 + %1106 = fneg float %1105, !dbg !28 + %1107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3FF7154760000000, float %1106) #6, !dbg !28 + %1108 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3FF7154760000000, float %1106) #6, !dbg !28 + %.0.i781 = select i1 %.not3.i780, float %1108, float %1107, !dbg !28 + %1109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3E54AE0C00000000, float %.0.i781) #6, !dbg !28 + %1110 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3E54AE0C00000000, float %.0.i781) #6, !dbg !28 + %.01.i783 = select i1 %.not4.i782, float %1110, float %1109, !dbg !28 + %1111 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i783) #6, !dbg !28 + %1112 = extractelement <16 x float> %1039, i64 9, !dbg !28 + %1113 = fadd float %1112, 0xC168000FE0000000, !dbg !28 + %1114 = fneg float %1113, !dbg !28 + %1115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3FF7154760000000, float %1114) #6, !dbg !28 + %1116 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3FF7154760000000, float %1114) #6, !dbg !28 + %.0.i791 = select i1 %.not3.i790, float %1116, float %1115, !dbg !28 + %1117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3E54AE0C00000000, float %.0.i791) #6, !dbg !28 + %1118 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3E54AE0C00000000, float %.0.i791) #6, !dbg !28 + %.01.i793 = select i1 %.not4.i792, float %1118, float %1117, !dbg !28 + %1119 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i793) #6, !dbg !28 + %1120 = extractelement <16 x float> %1039, i64 10, !dbg !28 + %1121 = fadd float %1120, 0xC168000FE0000000, !dbg !28 + %1122 = fneg float %1121, !dbg !28 + %1123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3FF7154760000000, float %1122) #6, !dbg !28 + %1124 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3FF7154760000000, float %1122) #6, !dbg !28 + %.0.i801 = select i1 %.not3.i800, float %1124, float %1123, !dbg !28 + %1125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3E54AE0C00000000, float %.0.i801) #6, !dbg !28 + %1126 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3E54AE0C00000000, float %.0.i801) #6, !dbg !28 + %.01.i803 = select i1 %.not4.i802, float %1126, float %1125, !dbg !28 + %1127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i803) #6, !dbg !28 + %1128 = extractelement <16 x float> %1039, i64 11, !dbg !28 + %1129 = fadd float %1128, 0xC168000FE0000000, !dbg !28 + %1130 = fneg float %1129, !dbg !28 + %1131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3FF7154760000000, float %1130) #6, !dbg !28 + %1132 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3FF7154760000000, float %1130) #6, !dbg !28 + %.0.i811 = select i1 %.not3.i810, float %1132, float %1131, !dbg !28 + %1133 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3E54AE0C00000000, float %.0.i811) #6, !dbg !28 + %1134 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3E54AE0C00000000, float %.0.i811) #6, !dbg !28 + %.01.i813 = select i1 %.not4.i812, float %1134, float %1133, !dbg !28 + %1135 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i813) #6, !dbg !28 + %1136 = extractelement <16 x float> %1039, i64 12, !dbg !28 + %1137 = fadd float %1136, 0xC168000FE0000000, !dbg !28 + %1138 = fneg float %1137, !dbg !28 + %1139 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3FF7154760000000, float %1138) #6, !dbg !28 + %1140 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3FF7154760000000, float %1138) #6, !dbg !28 + %.0.i821 = select i1 %.not3.i820, float %1140, float %1139, !dbg !28 + %1141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3E54AE0C00000000, float %.0.i821) #6, !dbg !28 + %1142 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3E54AE0C00000000, float %.0.i821) #6, !dbg !28 + %.01.i823 = select i1 %.not4.i822, float %1142, float %1141, !dbg !28 + %1143 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i823) #6, !dbg !28 + %1144 = extractelement <16 x float> %1039, i64 13, !dbg !28 + %1145 = fadd float %1144, 0xC168000FE0000000, !dbg !28 + %1146 = fneg float %1145, !dbg !28 + %1147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3FF7154760000000, float %1146) #6, !dbg !28 + %1148 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3FF7154760000000, float %1146) #6, !dbg !28 + %.0.i831 = select i1 %.not3.i830, float %1148, float %1147, !dbg !28 + %1149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3E54AE0C00000000, float %.0.i831) #6, !dbg !28 + %1150 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3E54AE0C00000000, float %.0.i831) #6, !dbg !28 + %.01.i833 = select i1 %.not4.i832, float %1150, float %1149, !dbg !28 + %1151 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i833) #6, !dbg !28 + %1152 = extractelement <16 x float> %1039, i64 14, !dbg !28 + %1153 = fadd float %1152, 0xC168000FE0000000, !dbg !28 + %1154 = fneg float %1153, !dbg !28 + %1155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3FF7154760000000, float %1154) #6, !dbg !28 + %1156 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3FF7154760000000, float %1154) #6, !dbg !28 + %.0.i841 = select i1 %.not3.i840, float %1156, float %1155, !dbg !28 + %1157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3E54AE0C00000000, float %.0.i841) #6, !dbg !28 + %1158 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3E54AE0C00000000, float %.0.i841) #6, !dbg !28 + %.01.i843 = select i1 %.not4.i842, float %1158, float %1157, !dbg !28 + %1159 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i843) #6, !dbg !28 + %1160 = extractelement <16 x float> %1039, i64 15, !dbg !28 + %1161 = fadd float %1160, 0xC168000FE0000000, !dbg !28 + %1162 = fneg float %1161, !dbg !28 + %1163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3FF7154760000000, float %1162) #6, !dbg !28 + %1164 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3FF7154760000000, float %1162) #6, !dbg !28 + %.0.i851 = select i1 %.not3.i850, float %1164, float %1163, !dbg !28 + %1165 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3E54AE0C00000000, float %.0.i851) #6, !dbg !28 + %1166 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3E54AE0C00000000, float %.0.i851) #6, !dbg !28 + %.01.i853 = select i1 %.not4.i852, float %1166, float %1165, !dbg !28 + %1167 = bitcast <16 x float> %1039 to <16 x i32>, !dbg !28 + %1168 = shl <16 x i32> %1167, splat (i32 23), !dbg !28 + %1169 = bitcast <16 x i32> %1168 to <16 x float>, !dbg !28 + %1170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i853) #6, !dbg !28 + %1171 = insertelement <16 x float> poison, float %1047, i64 0, !dbg !28 + %1172 = insertelement <16 x float> %1171, float %1055, i64 1, !dbg !28 + %1173 = insertelement <16 x float> %1172, float %1063, i64 2, !dbg !28 + %1174 = insertelement <16 x float> %1173, float %1071, i64 3, !dbg !28 + %1175 = insertelement <16 x float> %1174, float %1079, i64 4, !dbg !28 + %1176 = insertelement <16 x float> %1175, float %1087, i64 5, !dbg !28 + %1177 = insertelement <16 x float> %1176, float %1095, i64 6, !dbg !28 + %1178 = insertelement <16 x float> %1177, float %1103, i64 7, !dbg !28 + %1179 = insertelement <16 x float> %1178, float %1111, i64 8, !dbg !28 + %1180 = insertelement <16 x float> %1179, float %1119, i64 9, !dbg !28 + %1181 = insertelement <16 x float> %1180, float %1127, i64 10, !dbg !28 + %1182 = insertelement <16 x float> %1181, float %1135, i64 11, !dbg !28 + %1183 = insertelement <16 x float> %1182, float %1143, i64 12, !dbg !28 + %1184 = insertelement <16 x float> %1183, float %1151, i64 13, !dbg !28 + %1185 = insertelement <16 x float> %1184, float %1159, i64 14, !dbg !28 + %1186 = insertelement <16 x float> %1185, float %1170, i64 15, !dbg !28 + %1187 = fmul <16 x float> %1186, %1169, !dbg !28 + %1188 = select <16 x i1> %877, <16 x float> splat (float 1.000000e+00), <16 x float> %1187, !dbg !29 + %foldExtExtBinop1589 = fsub <16 x float> %873, %876, !dbg !25 + %1189 = extractelement <16 x float> %foldExtExtBinop1589, i64 0, !dbg !25 + %foldExtExtBinop1591 = fsub <16 x float> %873, %876, !dbg !25 + %1190 = extractelement <16 x float> %foldExtExtBinop1591, i64 1, !dbg !25 + %foldExtExtBinop1593 = fsub <16 x float> %873, %876, !dbg !25 + %1191 = extractelement <16 x float> %foldExtExtBinop1593, i64 2, !dbg !25 + %foldExtExtBinop1595 = fsub <16 x float> %873, %876, !dbg !25 + %1192 = extractelement <16 x float> %foldExtExtBinop1595, i64 3, !dbg !25 + %foldExtExtBinop1597 = fsub <16 x float> %873, %876, !dbg !25 + %1193 = extractelement <16 x float> %foldExtExtBinop1597, i64 4, !dbg !25 + %foldExtExtBinop1599 = fsub <16 x float> %873, %876, !dbg !25 + %1194 = extractelement <16 x float> %foldExtExtBinop1599, i64 5, !dbg !25 + %foldExtExtBinop1601 = fsub <16 x float> %873, %876, !dbg !25 + %1195 = extractelement <16 x float> %foldExtExtBinop1601, i64 6, !dbg !25 + %foldExtExtBinop1603 = fsub <16 x float> %873, %876, !dbg !25 + %1196 = extractelement <16 x float> %foldExtExtBinop1603, i64 7, !dbg !25 + %foldExtExtBinop1605 = fsub <16 x float> %873, %876, !dbg !25 + %1197 = extractelement <16 x float> %foldExtExtBinop1605, i64 8, !dbg !25 + %foldExtExtBinop1607 = fsub <16 x float> %873, %876, !dbg !25 + %1198 = extractelement <16 x float> %foldExtExtBinop1607, i64 9, !dbg !25 + %foldExtExtBinop1609 = fsub <16 x float> %873, %876, !dbg !25 + %1199 = extractelement <16 x float> %foldExtExtBinop1609, i64 10, !dbg !25 + %foldExtExtBinop1611 = fsub <16 x float> %873, %876, !dbg !25 + %1200 = extractelement <16 x float> %foldExtExtBinop1611, i64 11, !dbg !25 + %foldExtExtBinop1613 = fsub <16 x float> %873, %876, !dbg !25 + %1201 = extractelement <16 x float> %foldExtExtBinop1613, i64 12, !dbg !25 + %foldExtExtBinop1615 = fsub <16 x float> %873, %876, !dbg !25 + %1202 = extractelement <16 x float> %foldExtExtBinop1615, i64 13, !dbg !25 + %foldExtExtBinop1617 = fsub <16 x float> %873, %876, !dbg !25 + %1203 = extractelement <16 x float> %foldExtExtBinop1617, i64 14, !dbg !25 + %foldExtExtBinop1619 = fsub <16 x float> %873, %876, !dbg !25 + %1204 = extractelement <16 x float> %foldExtExtBinop1619, i64 15, !dbg !25 + %1205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1206 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1015 = select i1 %.not.i1014, float %1206, float %1205, !dbg !28 + %1207 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1015) #6, !dbg !28 + %1208 = tail call float @llvm.nvvm.saturate.f(float %.02.i1015) #6, !dbg !28 + %.03.i1017 = select i1 %.not1.i1016, float %1208, float %1207, !dbg !28 + %1209 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1017, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1210 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1017, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1211 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1212 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1025 = select i1 %.not.i1024, float %1212, float %1211, !dbg !28 + %1213 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1025) #6, !dbg !28 + %1214 = tail call float @llvm.nvvm.saturate.f(float %.02.i1025) #6, !dbg !28 + %.03.i1027 = select i1 %.not1.i1026, float %1214, float %1213, !dbg !28 + %1215 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1027, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1216 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1027, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1218 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1035 = select i1 %.not.i1034, float %1218, float %1217, !dbg !28 + %1219 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1035) #6, !dbg !28 + %1220 = tail call float @llvm.nvvm.saturate.f(float %.02.i1035) #6, !dbg !28 + %.03.i1037 = select i1 %.not1.i1036, float %1220, float %1219, !dbg !28 + %1221 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1037, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1222 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1037, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1224 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1045 = select i1 %.not.i1044, float %1224, float %1223, !dbg !28 + %1225 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1045) #6, !dbg !28 + %1226 = tail call float @llvm.nvvm.saturate.f(float %.02.i1045) #6, !dbg !28 + %.03.i1047 = select i1 %.not1.i1046, float %1226, float %1225, !dbg !28 + %1227 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1047, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1228 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1047, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1230 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1055 = select i1 %.not.i1054, float %1230, float %1229, !dbg !28 + %1231 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1055) #6, !dbg !28 + %1232 = tail call float @llvm.nvvm.saturate.f(float %.02.i1055) #6, !dbg !28 + %.03.i1057 = select i1 %.not1.i1056, float %1232, float %1231, !dbg !28 + %1233 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1057, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1234 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1057, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1236 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1065 = select i1 %.not.i1064, float %1236, float %1235, !dbg !28 + %1237 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1065) #6, !dbg !28 + %1238 = tail call float @llvm.nvvm.saturate.f(float %.02.i1065) #6, !dbg !28 + %.03.i1067 = select i1 %.not1.i1066, float %1238, float %1237, !dbg !28 + %1239 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1067, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1240 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1067, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1242 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1075 = select i1 %.not.i1074, float %1242, float %1241, !dbg !28 + %1243 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1075) #6, !dbg !28 + %1244 = tail call float @llvm.nvvm.saturate.f(float %.02.i1075) #6, !dbg !28 + %.03.i1077 = select i1 %.not1.i1076, float %1244, float %1243, !dbg !28 + %1245 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1077, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1246 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1077, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1248 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1085 = select i1 %.not.i1084, float %1248, float %1247, !dbg !28 + %1249 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1085) #6, !dbg !28 + %1250 = tail call float @llvm.nvvm.saturate.f(float %.02.i1085) #6, !dbg !28 + %.03.i1087 = select i1 %.not1.i1086, float %1250, float %1249, !dbg !28 + %1251 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1087, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1252 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1087, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1254 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1095 = select i1 %.not.i1094, float %1254, float %1253, !dbg !28 + %1255 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1095) #6, !dbg !28 + %1256 = tail call float @llvm.nvvm.saturate.f(float %.02.i1095) #6, !dbg !28 + %.03.i1097 = select i1 %.not1.i1096, float %1256, float %1255, !dbg !28 + %1257 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1097, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1258 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1097, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1259 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1260 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1105 = select i1 %.not.i1104, float %1260, float %1259, !dbg !28 + %1261 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1105) #6, !dbg !28 + %1262 = tail call float @llvm.nvvm.saturate.f(float %.02.i1105) #6, !dbg !28 + %.03.i1107 = select i1 %.not1.i1106, float %1262, float %1261, !dbg !28 + %1263 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1264 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1265 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1266 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1115 = select i1 %.not.i1114, float %1266, float %1265, !dbg !28 + %1267 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1115) #6, !dbg !28 + %1268 = tail call float @llvm.nvvm.saturate.f(float %.02.i1115) #6, !dbg !28 + %.03.i1117 = select i1 %.not1.i1116, float %1268, float %1267, !dbg !28 + %1269 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1270 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1272 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1125 = select i1 %.not.i1124, float %1272, float %1271, !dbg !28 + %1273 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1125) #6, !dbg !28 + %1274 = tail call float @llvm.nvvm.saturate.f(float %.02.i1125) #6, !dbg !28 + %.03.i1127 = select i1 %.not1.i1126, float %1274, float %1273, !dbg !28 + %1275 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1276 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1277 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1278 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1135 = select i1 %.not.i1134, float %1278, float %1277, !dbg !28 + %1279 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1135) #6, !dbg !28 + %1280 = tail call float @llvm.nvvm.saturate.f(float %.02.i1135) #6, !dbg !28 + %.03.i1137 = select i1 %.not1.i1136, float %1280, float %1279, !dbg !28 + %1281 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1282 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1284 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1145 = select i1 %.not.i1144, float %1284, float %1283, !dbg !28 + %1285 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1145) #6, !dbg !28 + %1286 = tail call float @llvm.nvvm.saturate.f(float %.02.i1145) #6, !dbg !28 + %.03.i1147 = select i1 %.not1.i1146, float %1286, float %1285, !dbg !28 + %1287 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1288 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1290 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1155 = select i1 %.not.i1154, float %1290, float %1289, !dbg !28 + %1291 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1155) #6, !dbg !28 + %1292 = tail call float @llvm.nvvm.saturate.f(float %.02.i1155) #6, !dbg !28 + %.03.i1157 = select i1 %.not1.i1156, float %1292, float %1291, !dbg !28 + %1293 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1294 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1296 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1165 = select i1 %.not.i1164, float %1296, float %1295, !dbg !28 + %1297 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1165) #6, !dbg !28 + %1298 = tail call float @llvm.nvvm.saturate.f(float %.02.i1165) #6, !dbg !28 + %.03.i1167 = select i1 %.not1.i1166, float %1298, float %1297, !dbg !28 + %1299 = insertelement <16 x i32> poison, i32 %510, i64 0, !dbg !28 + %1300 = insertelement <16 x i32> %1299, i32 %515, i64 1, !dbg !28 + %1301 = insertelement <16 x i32> %1300, i32 %520, i64 2, !dbg !28 + %1302 = insertelement <16 x i32> %1301, i32 %525, i64 3, !dbg !28 + %1303 = insertelement <16 x i32> %1302, i32 %530, i64 4, !dbg !28 + %1304 = insertelement <16 x i32> %1303, i32 %535, i64 5, !dbg !28 + %1305 = insertelement <16 x i32> %1304, i32 %540, i64 6, !dbg !28 + %1306 = insertelement <16 x i32> %1305, i32 %545, i64 7, !dbg !28 + %1307 = insertelement <16 x i32> %1306, i32 %550, i64 8, !dbg !28 + %1308 = insertelement <16 x i32> %1307, i32 %555, i64 9, !dbg !28 + %1309 = insertelement <16 x i32> %1308, i32 %560, i64 10, !dbg !28 + %1310 = insertelement <16 x i32> %1309, i32 %565, i64 11, !dbg !28 + %1311 = insertelement <16 x i32> %1310, i32 %570, i64 12, !dbg !28 + %1312 = insertelement <16 x i32> %1311, i32 %575, i64 13, !dbg !28 + %1313 = insertelement <16 x i32> %1312, i32 %580, i64 14, !dbg !28 + %1314 = insertelement <16 x i32> %1313, i32 %585, i64 15, !dbg !28 + %1315 = icmp eq <16 x i32> %1314, zeroinitializer, !dbg !28 + %1316 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1317 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1318 = insertelement <16 x float> poison, float %1210, i64 0, !dbg !28 + %1319 = insertelement <16 x float> %1318, float %1216, i64 1, !dbg !28 + %1320 = insertelement <16 x float> %1319, float %1222, i64 2, !dbg !28 + %1321 = insertelement <16 x float> %1320, float %1228, i64 3, !dbg !28 + %1322 = insertelement <16 x float> %1321, float %1234, i64 4, !dbg !28 + %1323 = insertelement <16 x float> %1322, float %1240, i64 5, !dbg !28 + %1324 = insertelement <16 x float> %1323, float %1246, i64 6, !dbg !28 + %1325 = insertelement <16 x float> %1324, float %1252, i64 7, !dbg !28 + %1326 = insertelement <16 x float> %1325, float %1258, i64 8, !dbg !28 + %1327 = insertelement <16 x float> %1326, float %1264, i64 9, !dbg !28 + %1328 = insertelement <16 x float> %1327, float %1270, i64 10, !dbg !28 + %1329 = insertelement <16 x float> %1328, float %1276, i64 11, !dbg !28 + %1330 = insertelement <16 x float> %1329, float %1282, i64 12, !dbg !28 + %1331 = insertelement <16 x float> %1330, float %1288, i64 13, !dbg !28 + %1332 = insertelement <16 x float> %1331, float %1294, i64 14, !dbg !28 + %1333 = insertelement <16 x float> %1332, float %1317, i64 15, !dbg !28 + %1334 = insertelement <16 x float> poison, float %1209, i64 0, !dbg !28 + %1335 = insertelement <16 x float> %1334, float %1215, i64 1, !dbg !28 + %1336 = insertelement <16 x float> %1335, float %1221, i64 2, !dbg !28 + %1337 = insertelement <16 x float> %1336, float %1227, i64 3, !dbg !28 + %1338 = insertelement <16 x float> %1337, float %1233, i64 4, !dbg !28 + %1339 = insertelement <16 x float> %1338, float %1239, i64 5, !dbg !28 + %1340 = insertelement <16 x float> %1339, float %1245, i64 6, !dbg !28 + %1341 = insertelement <16 x float> %1340, float %1251, i64 7, !dbg !28 + %1342 = insertelement <16 x float> %1341, float %1257, i64 8, !dbg !28 + %1343 = insertelement <16 x float> %1342, float %1263, i64 9, !dbg !28 + %1344 = insertelement <16 x float> %1343, float %1269, i64 10, !dbg !28 + %1345 = insertelement <16 x float> %1344, float %1275, i64 11, !dbg !28 + %1346 = insertelement <16 x float> %1345, float %1281, i64 12, !dbg !28 + %1347 = insertelement <16 x float> %1346, float %1287, i64 13, !dbg !28 + %1348 = insertelement <16 x float> %1347, float %1293, i64 14, !dbg !28 + %1349 = insertelement <16 x float> %1348, float %1316, i64 15, !dbg !28 + %1350 = select <16 x i1> %1315, <16 x float> %1333, <16 x float> %1349, !dbg !28 + %1351 = extractelement <16 x float> %1350, i64 0, !dbg !28 + %1352 = fadd float %1351, 0xC168000FE0000000, !dbg !28 + %1353 = fneg float %1352, !dbg !28 + %1354 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3FF7154760000000, float %1353) #6, !dbg !28 + %1355 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3FF7154760000000, float %1353) #6, !dbg !28 + %.0.i1021 = select i1 %.not3.i1020, float %1355, float %1354, !dbg !28 + %1356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3E54AE0C00000000, float %.0.i1021) #6, !dbg !28 + %1357 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3E54AE0C00000000, float %.0.i1021) #6, !dbg !28 + %.01.i1023 = select i1 %.not4.i1022, float %1357, float %1356, !dbg !28 + %1358 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1023) #6, !dbg !28 + %1359 = extractelement <16 x float> %1350, i64 1, !dbg !28 + %1360 = fadd float %1359, 0xC168000FE0000000, !dbg !28 + %1361 = fneg float %1360, !dbg !28 + %1362 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3FF7154760000000, float %1361) #6, !dbg !28 + %1363 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3FF7154760000000, float %1361) #6, !dbg !28 + %.0.i1031 = select i1 %.not3.i1030, float %1363, float %1362, !dbg !28 + %1364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3E54AE0C00000000, float %.0.i1031) #6, !dbg !28 + %1365 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3E54AE0C00000000, float %.0.i1031) #6, !dbg !28 + %.01.i1033 = select i1 %.not4.i1032, float %1365, float %1364, !dbg !28 + %1366 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1033) #6, !dbg !28 + %1367 = extractelement <16 x float> %1350, i64 2, !dbg !28 + %1368 = fadd float %1367, 0xC168000FE0000000, !dbg !28 + %1369 = fneg float %1368, !dbg !28 + %1370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3FF7154760000000, float %1369) #6, !dbg !28 + %1371 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3FF7154760000000, float %1369) #6, !dbg !28 + %.0.i1041 = select i1 %.not3.i1040, float %1371, float %1370, !dbg !28 + %1372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3E54AE0C00000000, float %.0.i1041) #6, !dbg !28 + %1373 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3E54AE0C00000000, float %.0.i1041) #6, !dbg !28 + %.01.i1043 = select i1 %.not4.i1042, float %1373, float %1372, !dbg !28 + %1374 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1043) #6, !dbg !28 + %1375 = extractelement <16 x float> %1350, i64 3, !dbg !28 + %1376 = fadd float %1375, 0xC168000FE0000000, !dbg !28 + %1377 = fneg float %1376, !dbg !28 + %1378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3FF7154760000000, float %1377) #6, !dbg !28 + %1379 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3FF7154760000000, float %1377) #6, !dbg !28 + %.0.i1051 = select i1 %.not3.i1050, float %1379, float %1378, !dbg !28 + %1380 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3E54AE0C00000000, float %.0.i1051) #6, !dbg !28 + %1381 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3E54AE0C00000000, float %.0.i1051) #6, !dbg !28 + %.01.i1053 = select i1 %.not4.i1052, float %1381, float %1380, !dbg !28 + %1382 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1053) #6, !dbg !28 + %1383 = extractelement <16 x float> %1350, i64 4, !dbg !28 + %1384 = fadd float %1383, 0xC168000FE0000000, !dbg !28 + %1385 = fneg float %1384, !dbg !28 + %1386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3FF7154760000000, float %1385) #6, !dbg !28 + %1387 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3FF7154760000000, float %1385) #6, !dbg !28 + %.0.i1061 = select i1 %.not3.i1060, float %1387, float %1386, !dbg !28 + %1388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3E54AE0C00000000, float %.0.i1061) #6, !dbg !28 + %1389 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3E54AE0C00000000, float %.0.i1061) #6, !dbg !28 + %.01.i1063 = select i1 %.not4.i1062, float %1389, float %1388, !dbg !28 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1063) #6, !dbg !28 + %1391 = extractelement <16 x float> %1350, i64 5, !dbg !28 + %1392 = fadd float %1391, 0xC168000FE0000000, !dbg !28 + %1393 = fneg float %1392, !dbg !28 + %1394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3FF7154760000000, float %1393) #6, !dbg !28 + %1395 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3FF7154760000000, float %1393) #6, !dbg !28 + %.0.i1071 = select i1 %.not3.i1070, float %1395, float %1394, !dbg !28 + %1396 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3E54AE0C00000000, float %.0.i1071) #6, !dbg !28 + %1397 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3E54AE0C00000000, float %.0.i1071) #6, !dbg !28 + %.01.i1073 = select i1 %.not4.i1072, float %1397, float %1396, !dbg !28 + %1398 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1073) #6, !dbg !28 + %1399 = extractelement <16 x float> %1350, i64 6, !dbg !28 + %1400 = fadd float %1399, 0xC168000FE0000000, !dbg !28 + %1401 = fneg float %1400, !dbg !28 + %1402 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3FF7154760000000, float %1401) #6, !dbg !28 + %1403 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3FF7154760000000, float %1401) #6, !dbg !28 + %.0.i1081 = select i1 %.not3.i1080, float %1403, float %1402, !dbg !28 + %1404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3E54AE0C00000000, float %.0.i1081) #6, !dbg !28 + %1405 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3E54AE0C00000000, float %.0.i1081) #6, !dbg !28 + %.01.i1083 = select i1 %.not4.i1082, float %1405, float %1404, !dbg !28 + %1406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1083) #6, !dbg !28 + %1407 = extractelement <16 x float> %1350, i64 7, !dbg !28 + %1408 = fadd float %1407, 0xC168000FE0000000, !dbg !28 + %1409 = fneg float %1408, !dbg !28 + %1410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3FF7154760000000, float %1409) #6, !dbg !28 + %1411 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3FF7154760000000, float %1409) #6, !dbg !28 + %.0.i1091 = select i1 %.not3.i1090, float %1411, float %1410, !dbg !28 + %1412 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3E54AE0C00000000, float %.0.i1091) #6, !dbg !28 + %1413 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3E54AE0C00000000, float %.0.i1091) #6, !dbg !28 + %.01.i1093 = select i1 %.not4.i1092, float %1413, float %1412, !dbg !28 + %1414 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1093) #6, !dbg !28 + %1415 = extractelement <16 x float> %1350, i64 8, !dbg !28 + %1416 = fadd float %1415, 0xC168000FE0000000, !dbg !28 + %1417 = fneg float %1416, !dbg !28 + %1418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3FF7154760000000, float %1417) #6, !dbg !28 + %1419 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3FF7154760000000, float %1417) #6, !dbg !28 + %.0.i1101 = select i1 %.not3.i1100, float %1419, float %1418, !dbg !28 + %1420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3E54AE0C00000000, float %.0.i1101) #6, !dbg !28 + %1421 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3E54AE0C00000000, float %.0.i1101) #6, !dbg !28 + %.01.i1103 = select i1 %.not4.i1102, float %1421, float %1420, !dbg !28 + %1422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1103) #6, !dbg !28 + %1423 = extractelement <16 x float> %1350, i64 9, !dbg !28 + %1424 = fadd float %1423, 0xC168000FE0000000, !dbg !28 + %1425 = fneg float %1424, !dbg !28 + %1426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3FF7154760000000, float %1425) #6, !dbg !28 + %1427 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3FF7154760000000, float %1425) #6, !dbg !28 + %.0.i1111 = select i1 %.not3.i1110, float %1427, float %1426, !dbg !28 + %1428 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3E54AE0C00000000, float %.0.i1111) #6, !dbg !28 + %1429 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3E54AE0C00000000, float %.0.i1111) #6, !dbg !28 + %.01.i1113 = select i1 %.not4.i1112, float %1429, float %1428, !dbg !28 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1113) #6, !dbg !28 + %1431 = extractelement <16 x float> %1350, i64 10, !dbg !28 + %1432 = fadd float %1431, 0xC168000FE0000000, !dbg !28 + %1433 = fneg float %1432, !dbg !28 + %1434 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3FF7154760000000, float %1433) #6, !dbg !28 + %1435 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3FF7154760000000, float %1433) #6, !dbg !28 + %.0.i1121 = select i1 %.not3.i1120, float %1435, float %1434, !dbg !28 + %1436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3E54AE0C00000000, float %.0.i1121) #6, !dbg !28 + %1437 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3E54AE0C00000000, float %.0.i1121) #6, !dbg !28 + %.01.i1123 = select i1 %.not4.i1122, float %1437, float %1436, !dbg !28 + %1438 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1123) #6, !dbg !28 + %1439 = extractelement <16 x float> %1350, i64 11, !dbg !28 + %1440 = fadd float %1439, 0xC168000FE0000000, !dbg !28 + %1441 = fneg float %1440, !dbg !28 + %1442 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3FF7154760000000, float %1441) #6, !dbg !28 + %1443 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3FF7154760000000, float %1441) #6, !dbg !28 + %.0.i1131 = select i1 %.not3.i1130, float %1443, float %1442, !dbg !28 + %1444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3E54AE0C00000000, float %.0.i1131) #6, !dbg !28 + %1445 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3E54AE0C00000000, float %.0.i1131) #6, !dbg !28 + %.01.i1133 = select i1 %.not4.i1132, float %1445, float %1444, !dbg !28 + %1446 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1133) #6, !dbg !28 + %1447 = extractelement <16 x float> %1350, i64 12, !dbg !28 + %1448 = fadd float %1447, 0xC168000FE0000000, !dbg !28 + %1449 = fneg float %1448, !dbg !28 + %1450 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3FF7154760000000, float %1449) #6, !dbg !28 + %1451 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3FF7154760000000, float %1449) #6, !dbg !28 + %.0.i1141 = select i1 %.not3.i1140, float %1451, float %1450, !dbg !28 + %1452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3E54AE0C00000000, float %.0.i1141) #6, !dbg !28 + %1453 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3E54AE0C00000000, float %.0.i1141) #6, !dbg !28 + %.01.i1143 = select i1 %.not4.i1142, float %1453, float %1452, !dbg !28 + %1454 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1143) #6, !dbg !28 + %1455 = extractelement <16 x float> %1350, i64 13, !dbg !28 + %1456 = fadd float %1455, 0xC168000FE0000000, !dbg !28 + %1457 = fneg float %1456, !dbg !28 + %1458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3FF7154760000000, float %1457) #6, !dbg !28 + %1459 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3FF7154760000000, float %1457) #6, !dbg !28 + %.0.i1151 = select i1 %.not3.i1150, float %1459, float %1458, !dbg !28 + %1460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3E54AE0C00000000, float %.0.i1151) #6, !dbg !28 + %1461 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3E54AE0C00000000, float %.0.i1151) #6, !dbg !28 + %.01.i1153 = select i1 %.not4.i1152, float %1461, float %1460, !dbg !28 + %1462 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1153) #6, !dbg !28 + %1463 = extractelement <16 x float> %1350, i64 14, !dbg !28 + %1464 = fadd float %1463, 0xC168000FE0000000, !dbg !28 + %1465 = fneg float %1464, !dbg !28 + %1466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3FF7154760000000, float %1465) #6, !dbg !28 + %1467 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3FF7154760000000, float %1465) #6, !dbg !28 + %.0.i1161 = select i1 %.not3.i1160, float %1467, float %1466, !dbg !28 + %1468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3E54AE0C00000000, float %.0.i1161) #6, !dbg !28 + %1469 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3E54AE0C00000000, float %.0.i1161) #6, !dbg !28 + %.01.i1163 = select i1 %.not4.i1162, float %1469, float %1468, !dbg !28 + %1470 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1163) #6, !dbg !28 + %1471 = extractelement <16 x float> %1350, i64 15, !dbg !28 + %1472 = fadd float %1471, 0xC168000FE0000000, !dbg !28 + %1473 = fneg float %1472, !dbg !28 + %1474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3FF7154760000000, float %1473) #6, !dbg !28 + %1475 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3FF7154760000000, float %1473) #6, !dbg !28 + %.0.i1171 = select i1 %.not3.i1170, float %1475, float %1474, !dbg !28 + %1476 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3E54AE0C00000000, float %.0.i1171) #6, !dbg !28 + %1477 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3E54AE0C00000000, float %.0.i1171) #6, !dbg !28 + %.01.i1173 = select i1 %.not4.i1172, float %1477, float %1476, !dbg !28 + %1478 = bitcast <16 x float> %1350 to <16 x i32>, !dbg !28 + %1479 = shl <16 x i32> %1478, splat (i32 23), !dbg !28 + %1480 = bitcast <16 x i32> %1479 to <16 x float>, !dbg !28 + %1481 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1173) #6, !dbg !28 + %1482 = insertelement <16 x float> poison, float %1358, i64 0, !dbg !28 + %1483 = insertelement <16 x float> %1482, float %1366, i64 1, !dbg !28 + %1484 = insertelement <16 x float> %1483, float %1374, i64 2, !dbg !28 + %1485 = insertelement <16 x float> %1484, float %1382, i64 3, !dbg !28 + %1486 = insertelement <16 x float> %1485, float %1390, i64 4, !dbg !28 + %1487 = insertelement <16 x float> %1486, float %1398, i64 5, !dbg !28 + %1488 = insertelement <16 x float> %1487, float %1406, i64 6, !dbg !28 + %1489 = insertelement <16 x float> %1488, float %1414, i64 7, !dbg !28 + %1490 = insertelement <16 x float> %1489, float %1422, i64 8, !dbg !28 + %1491 = insertelement <16 x float> %1490, float %1430, i64 9, !dbg !28 + %1492 = insertelement <16 x float> %1491, float %1438, i64 10, !dbg !28 + %1493 = insertelement <16 x float> %1492, float %1446, i64 11, !dbg !28 + %1494 = insertelement <16 x float> %1493, float %1454, i64 12, !dbg !28 + %1495 = insertelement <16 x float> %1494, float %1462, i64 13, !dbg !28 + %1496 = insertelement <16 x float> %1495, float %1470, i64 14, !dbg !28 + %1497 = insertelement <16 x float> %1496, float %1481, i64 15, !dbg !28 + %1498 = fmul <16 x float> %1497, %1480, !dbg !28 + %1499 = select <16 x i1> %877, <16 x float> splat (float 1.000000e+00), <16 x float> %1498, !dbg !30 + %1500 = fmul <16 x float> %36, %1188, !dbg !31 + %1501 = fmul float %18, %488, !dbg !31 + %1502 = fmul float %19, %489, !dbg !31 + %1503 = fmul float %20, %490, !dbg !31 + %1504 = fmul float %21, %491, !dbg !31 + %1505 = fmul float %22, %492, !dbg !31 + %1506 = fmul float %23, %493, !dbg !31 + %1507 = fmul float %24, %494, !dbg !31 + %1508 = fmul float %25, %495, !dbg !31 + %1509 = fmul float %26, %496, !dbg !31 + %1510 = fmul float %27, %497, !dbg !31 + %1511 = fadd <16 x float> %1500, %1499, !dbg !32 + %1512 = fadd float %1501, %838, !dbg !32 + %1513 = fadd float %1502, %839, !dbg !32 + %1514 = fadd float %1503, %840, !dbg !32 + %1515 = fadd float %1504, %841, !dbg !32 + %1516 = fadd float %1505, %842, !dbg !32 + %1517 = fadd float %1506, %843, !dbg !32 + %1518 = fadd float %1507, %844, !dbg !32 + %1519 = fadd float %1508, %845, !dbg !32 + %1520 = fadd float %1509, %846, !dbg !32 + %1521 = fadd float %1510, %847, !dbg !32 + %1522 = fpext <2 x bfloat> %95 to <2 x float>, !dbg !17 + %1523 = fcmp ogt <2 x float> %35, %1522, !dbg !18 + %1524 = or <2 x i1> %110, %1523, !dbg !23 + %1525 = select <2 x i1> %1524, <2 x float> %35, <2 x float> %1522, !dbg !24 + %1526 = fcmp oeq <2 x float> %1525, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1621 = fsub <2 x float> %35, %1525, !dbg !27 + %1527 = extractelement <2 x float> %foldExtExtBinop1621, i64 0, !dbg !27 + %foldExtExtBinop1623 = fsub <2 x float> %35, %1525, !dbg !27 + %1528 = extractelement <2 x float> %foldExtExtBinop1623, i64 1, !dbg !27 + %1529 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1530 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i955 = select i1 %.not.i954, float %1530, float %1529, !dbg !28 + %1531 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i955) #6, !dbg !28 + %1532 = tail call float @llvm.nvvm.saturate.f(float %.02.i955) #6, !dbg !28 + %.03.i957 = select i1 %.not1.i956, float %1532, float %1531, !dbg !28 + %1533 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i957, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1534 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i957, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1536 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i965 = select i1 %.not.i964, float %1536, float %1535, !dbg !28 + %1537 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i965) #6, !dbg !28 + %1538 = tail call float @llvm.nvvm.saturate.f(float %.02.i965) #6, !dbg !28 + %.03.i967 = select i1 %.not1.i966, float %1538, float %1537, !dbg !28 + %1539 = insertelement <2 x i32> poison, i32 %460, i64 0, !dbg !28 + %1540 = insertelement <2 x i32> %1539, i32 %465, i64 1, !dbg !28 + %1541 = icmp eq <2 x i32> %1540, zeroinitializer, !dbg !28 + %1542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i967, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i967, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1544 = insertelement <2 x float> poison, float %1534, i64 0, !dbg !28 + %1545 = insertelement <2 x float> %1544, float %1543, i64 1, !dbg !28 + %1546 = insertelement <2 x float> poison, float %1533, i64 0, !dbg !28 + %1547 = insertelement <2 x float> %1546, float %1542, i64 1, !dbg !28 + %1548 = select <2 x i1> %1541, <2 x float> %1545, <2 x float> %1547, !dbg !28 + %1549 = extractelement <2 x float> %1548, i64 0, !dbg !28 + %1550 = fadd float %1549, 0xC168000FE0000000, !dbg !28 + %1551 = fneg float %1550, !dbg !28 + %1552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3FF7154760000000, float %1551) #6, !dbg !28 + %1553 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3FF7154760000000, float %1551) #6, !dbg !28 + %.0.i961 = select i1 %.not3.i960, float %1553, float %1552, !dbg !28 + %1554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3E54AE0C00000000, float %.0.i961) #6, !dbg !28 + %1555 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3E54AE0C00000000, float %.0.i961) #6, !dbg !28 + %.01.i963 = select i1 %.not4.i962, float %1555, float %1554, !dbg !28 + %1556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i963) #6, !dbg !28 + %1557 = extractelement <2 x float> %1548, i64 1, !dbg !28 + %1558 = fadd float %1557, 0xC168000FE0000000, !dbg !28 + %1559 = fneg float %1558, !dbg !28 + %1560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3FF7154760000000, float %1559) #6, !dbg !28 + %1561 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3FF7154760000000, float %1559) #6, !dbg !28 + %.0.i971 = select i1 %.not3.i970, float %1561, float %1560, !dbg !28 + %1562 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3E54AE0C00000000, float %.0.i971) #6, !dbg !28 + %1563 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3E54AE0C00000000, float %.0.i971) #6, !dbg !28 + %.01.i973 = select i1 %.not4.i972, float %1563, float %1562, !dbg !28 + %1564 = bitcast <2 x float> %1548 to <2 x i32>, !dbg !28 + %1565 = shl <2 x i32> %1564, splat (i32 23), !dbg !28 + %1566 = bitcast <2 x i32> %1565 to <2 x float>, !dbg !28 + %1567 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i973) #6, !dbg !28 + %1568 = insertelement <2 x float> poison, float %1556, i64 0, !dbg !28 + %1569 = insertelement <2 x float> %1568, float %1567, i64 1, !dbg !28 + %1570 = fmul <2 x float> %1569, %1566, !dbg !28 + %1571 = select <2 x i1> %1526, <2 x float> splat (float 1.000000e+00), <2 x float> %1570, !dbg !29 + %foldExtExtBinop1625 = fsub <2 x float> %1522, %1525, !dbg !25 + %1572 = extractelement <2 x float> %foldExtExtBinop1625, i64 0, !dbg !25 + %foldExtExtBinop1627 = fsub <2 x float> %1522, %1525, !dbg !25 + %1573 = extractelement <2 x float> %foldExtExtBinop1627, i64 1, !dbg !25 + %1574 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1575 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1275 = select i1 %.not.i1274, float %1575, float %1574, !dbg !28 + %1576 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1275) #6, !dbg !28 + %1577 = tail call float @llvm.nvvm.saturate.f(float %.02.i1275) #6, !dbg !28 + %.03.i1277 = select i1 %.not1.i1276, float %1577, float %1576, !dbg !28 + %1578 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1579 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1581 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1285 = select i1 %.not.i1284, float %1581, float %1580, !dbg !28 + %1582 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1285) #6, !dbg !28 + %1583 = tail call float @llvm.nvvm.saturate.f(float %.02.i1285) #6, !dbg !28 + %.03.i1287 = select i1 %.not1.i1286, float %1583, float %1582, !dbg !28 + %1584 = insertelement <2 x i32> poison, i32 %810, i64 0, !dbg !28 + %1585 = insertelement <2 x i32> %1584, i32 %815, i64 1, !dbg !28 + %1586 = icmp eq <2 x i32> %1585, zeroinitializer, !dbg !28 + %1587 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1588 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1589 = insertelement <2 x float> poison, float %1579, i64 0, !dbg !28 + %1590 = insertelement <2 x float> %1589, float %1588, i64 1, !dbg !28 + %1591 = insertelement <2 x float> poison, float %1578, i64 0, !dbg !28 + %1592 = insertelement <2 x float> %1591, float %1587, i64 1, !dbg !28 + %1593 = select <2 x i1> %1586, <2 x float> %1590, <2 x float> %1592, !dbg !28 + %1594 = extractelement <2 x float> %1593, i64 0, !dbg !28 + %1595 = fadd float %1594, 0xC168000FE0000000, !dbg !28 + %1596 = fneg float %1595, !dbg !28 + %1597 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3FF7154760000000, float %1596) #6, !dbg !28 + %1598 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3FF7154760000000, float %1596) #6, !dbg !28 + %.0.i1281 = select i1 %.not3.i1280, float %1598, float %1597, !dbg !28 + %1599 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3E54AE0C00000000, float %.0.i1281) #6, !dbg !28 + %1600 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3E54AE0C00000000, float %.0.i1281) #6, !dbg !28 + %.01.i1283 = select i1 %.not4.i1282, float %1600, float %1599, !dbg !28 + %1601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1283) #6, !dbg !28 + %1602 = extractelement <2 x float> %1593, i64 1, !dbg !28 + %1603 = fadd float %1602, 0xC168000FE0000000, !dbg !28 + %1604 = fneg float %1603, !dbg !28 + %1605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3FF7154760000000, float %1604) #6, !dbg !28 + %1606 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3FF7154760000000, float %1604) #6, !dbg !28 + %.0.i1291 = select i1 %.not3.i1290, float %1606, float %1605, !dbg !28 + %1607 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3E54AE0C00000000, float %.0.i1291) #6, !dbg !28 + %1608 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3E54AE0C00000000, float %.0.i1291) #6, !dbg !28 + %.01.i1293 = select i1 %.not4.i1292, float %1608, float %1607, !dbg !28 + %1609 = bitcast <2 x float> %1593 to <2 x i32>, !dbg !28 + %1610 = shl <2 x i32> %1609, splat (i32 23), !dbg !28 + %1611 = bitcast <2 x i32> %1610 to <2 x float>, !dbg !28 + %1612 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1293) #6, !dbg !28 + %1613 = insertelement <2 x float> poison, float %1601, i64 0, !dbg !28 + %1614 = insertelement <2 x float> %1613, float %1612, i64 1, !dbg !28 + %1615 = fmul <2 x float> %1614, %1611, !dbg !28 + %1616 = select <2 x i1> %1526, <2 x float> splat (float 1.000000e+00), <2 x float> %1615, !dbg !30 + %1617 = fmul <2 x float> %34, %1571, !dbg !31 + %1618 = fadd <2 x float> %1617, %1616, !dbg !32 + %1619 = fpext <2 x bfloat> %97 to <2 x float>, !dbg !17 + %1620 = fcmp ogt <2 x float> %33, %1619, !dbg !18 + %1621 = or <2 x i1> %111, %1620, !dbg !23 + %1622 = select <2 x i1> %1621, <2 x float> %33, <2 x float> %1619, !dbg !24 + %1623 = fcmp oeq <2 x float> %1622, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1629 = fsub <2 x float> %33, %1622, !dbg !27 + %1624 = extractelement <2 x float> %foldExtExtBinop1629, i64 0, !dbg !27 + %foldExtExtBinop1631 = fsub <2 x float> %33, %1622, !dbg !27 + %1625 = extractelement <2 x float> %foldExtExtBinop1631, i64 1, !dbg !27 + %1626 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1627 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i975 = select i1 %.not.i974, float %1627, float %1626, !dbg !28 + %1628 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i975) #6, !dbg !28 + %1629 = tail call float @llvm.nvvm.saturate.f(float %.02.i975) #6, !dbg !28 + %.03.i977 = select i1 %.not1.i976, float %1629, float %1628, !dbg !28 + %1630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i977, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i977, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1632 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1633 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i985 = select i1 %.not.i984, float %1633, float %1632, !dbg !28 + %1634 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i985) #6, !dbg !28 + %1635 = tail call float @llvm.nvvm.saturate.f(float %.02.i985) #6, !dbg !28 + %.03.i987 = select i1 %.not1.i986, float %1635, float %1634, !dbg !28 + %1636 = insertelement <2 x i32> poison, i32 %470, i64 0, !dbg !28 + %1637 = insertelement <2 x i32> %1636, i32 %475, i64 1, !dbg !28 + %1638 = icmp eq <2 x i32> %1637, zeroinitializer, !dbg !28 + %1639 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i987, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1640 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i987, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1641 = insertelement <2 x float> poison, float %1631, i64 0, !dbg !28 + %1642 = insertelement <2 x float> %1641, float %1640, i64 1, !dbg !28 + %1643 = insertelement <2 x float> poison, float %1630, i64 0, !dbg !28 + %1644 = insertelement <2 x float> %1643, float %1639, i64 1, !dbg !28 + %1645 = select <2 x i1> %1638, <2 x float> %1642, <2 x float> %1644, !dbg !28 + %1646 = extractelement <2 x float> %1645, i64 0, !dbg !28 + %1647 = fadd float %1646, 0xC168000FE0000000, !dbg !28 + %1648 = fneg float %1647, !dbg !28 + %1649 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3FF7154760000000, float %1648) #6, !dbg !28 + %1650 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3FF7154760000000, float %1648) #6, !dbg !28 + %.0.i981 = select i1 %.not3.i980, float %1650, float %1649, !dbg !28 + %1651 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3E54AE0C00000000, float %.0.i981) #6, !dbg !28 + %1652 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3E54AE0C00000000, float %.0.i981) #6, !dbg !28 + %.01.i983 = select i1 %.not4.i982, float %1652, float %1651, !dbg !28 + %1653 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i983) #6, !dbg !28 + %1654 = extractelement <2 x float> %1645, i64 1, !dbg !28 + %1655 = fadd float %1654, 0xC168000FE0000000, !dbg !28 + %1656 = fneg float %1655, !dbg !28 + %1657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3FF7154760000000, float %1656) #6, !dbg !28 + %1658 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3FF7154760000000, float %1656) #6, !dbg !28 + %.0.i991 = select i1 %.not3.i990, float %1658, float %1657, !dbg !28 + %1659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3E54AE0C00000000, float %.0.i991) #6, !dbg !28 + %1660 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3E54AE0C00000000, float %.0.i991) #6, !dbg !28 + %.01.i993 = select i1 %.not4.i992, float %1660, float %1659, !dbg !28 + %1661 = bitcast <2 x float> %1645 to <2 x i32>, !dbg !28 + %1662 = shl <2 x i32> %1661, splat (i32 23), !dbg !28 + %1663 = bitcast <2 x i32> %1662 to <2 x float>, !dbg !28 + %1664 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i993) #6, !dbg !28 + %1665 = insertelement <2 x float> poison, float %1653, i64 0, !dbg !28 + %1666 = insertelement <2 x float> %1665, float %1664, i64 1, !dbg !28 + %1667 = fmul <2 x float> %1666, %1663, !dbg !28 + %1668 = select <2 x i1> %1623, <2 x float> splat (float 1.000000e+00), <2 x float> %1667, !dbg !29 + %foldExtExtBinop1633 = fsub <2 x float> %1619, %1622, !dbg !25 + %1669 = extractelement <2 x float> %foldExtExtBinop1633, i64 0, !dbg !25 + %foldExtExtBinop1635 = fsub <2 x float> %1619, %1622, !dbg !25 + %1670 = extractelement <2 x float> %foldExtExtBinop1635, i64 1, !dbg !25 + %1671 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1672 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1295 = select i1 %.not.i1294, float %1672, float %1671, !dbg !28 + %1673 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1295) #6, !dbg !28 + %1674 = tail call float @llvm.nvvm.saturate.f(float %.02.i1295) #6, !dbg !28 + %.03.i1297 = select i1 %.not1.i1296, float %1674, float %1673, !dbg !28 + %1675 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1676 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1677 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1678 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1305 = select i1 %.not.i1304, float %1678, float %1677, !dbg !28 + %1679 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1305) #6, !dbg !28 + %1680 = tail call float @llvm.nvvm.saturate.f(float %.02.i1305) #6, !dbg !28 + %.03.i1307 = select i1 %.not1.i1306, float %1680, float %1679, !dbg !28 + %1681 = insertelement <2 x i32> poison, i32 %820, i64 0, !dbg !28 + %1682 = insertelement <2 x i32> %1681, i32 %825, i64 1, !dbg !28 + %1683 = icmp eq <2 x i32> %1682, zeroinitializer, !dbg !28 + %1684 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1685 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1686 = insertelement <2 x float> poison, float %1676, i64 0, !dbg !28 + %1687 = insertelement <2 x float> %1686, float %1685, i64 1, !dbg !28 + %1688 = insertelement <2 x float> poison, float %1675, i64 0, !dbg !28 + %1689 = insertelement <2 x float> %1688, float %1684, i64 1, !dbg !28 + %1690 = select <2 x i1> %1683, <2 x float> %1687, <2 x float> %1689, !dbg !28 + %1691 = extractelement <2 x float> %1690, i64 0, !dbg !28 + %1692 = fadd float %1691, 0xC168000FE0000000, !dbg !28 + %1693 = fneg float %1692, !dbg !28 + %1694 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3FF7154760000000, float %1693) #6, !dbg !28 + %1695 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3FF7154760000000, float %1693) #6, !dbg !28 + %.0.i1301 = select i1 %.not3.i1300, float %1695, float %1694, !dbg !28 + %1696 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3E54AE0C00000000, float %.0.i1301) #6, !dbg !28 + %1697 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3E54AE0C00000000, float %.0.i1301) #6, !dbg !28 + %.01.i1303 = select i1 %.not4.i1302, float %1697, float %1696, !dbg !28 + %1698 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1303) #6, !dbg !28 + %1699 = extractelement <2 x float> %1690, i64 1, !dbg !28 + %1700 = fadd float %1699, 0xC168000FE0000000, !dbg !28 + %1701 = fneg float %1700, !dbg !28 + %1702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3FF7154760000000, float %1701) #6, !dbg !28 + %1703 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3FF7154760000000, float %1701) #6, !dbg !28 + %.0.i1311 = select i1 %.not3.i1310, float %1703, float %1702, !dbg !28 + %1704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3E54AE0C00000000, float %.0.i1311) #6, !dbg !28 + %1705 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3E54AE0C00000000, float %.0.i1311) #6, !dbg !28 + %.01.i1313 = select i1 %.not4.i1312, float %1705, float %1704, !dbg !28 + %1706 = bitcast <2 x float> %1690 to <2 x i32>, !dbg !28 + %1707 = shl <2 x i32> %1706, splat (i32 23), !dbg !28 + %1708 = bitcast <2 x i32> %1707 to <2 x float>, !dbg !28 + %1709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1313) #6, !dbg !28 + %1710 = insertelement <2 x float> poison, float %1698, i64 0, !dbg !28 + %1711 = insertelement <2 x float> %1710, float %1709, i64 1, !dbg !28 + %1712 = fmul <2 x float> %1711, %1708, !dbg !28 + %1713 = select <2 x i1> %1623, <2 x float> splat (float 1.000000e+00), <2 x float> %1712, !dbg !30 + %1714 = fmul <2 x float> %32, %1668, !dbg !31 + %1715 = fadd <2 x float> %1714, %1713, !dbg !32 + %1716 = fpext <2 x bfloat> %99 to <2 x float>, !dbg !17 + %1717 = fcmp ogt <2 x float> %31, %1716, !dbg !18 + %1718 = or <2 x i1> %112, %1717, !dbg !23 + %1719 = select <2 x i1> %1718, <2 x float> %31, <2 x float> %1716, !dbg !24 + %1720 = fcmp oeq <2 x float> %1719, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1637 = fsub <2 x float> %31, %1719, !dbg !27 + %1721 = extractelement <2 x float> %foldExtExtBinop1637, i64 0, !dbg !27 + %foldExtExtBinop1639 = fsub <2 x float> %31, %1719, !dbg !27 + %1722 = extractelement <2 x float> %foldExtExtBinop1639, i64 1, !dbg !27 + %1723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1724 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i995 = select i1 %.not.i994, float %1724, float %1723, !dbg !28 + %1725 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i995) #6, !dbg !28 + %1726 = tail call float @llvm.nvvm.saturate.f(float %.02.i995) #6, !dbg !28 + %.03.i997 = select i1 %.not1.i996, float %1726, float %1725, !dbg !28 + %1727 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i997, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1728 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i997, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1729 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1730 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1005 = select i1 %.not.i1004, float %1730, float %1729, !dbg !28 + %1731 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1005) #6, !dbg !28 + %1732 = tail call float @llvm.nvvm.saturate.f(float %.02.i1005) #6, !dbg !28 + %.03.i1007 = select i1 %.not1.i1006, float %1732, float %1731, !dbg !28 + %1733 = insertelement <2 x i32> poison, i32 %480, i64 0, !dbg !28 + %1734 = insertelement <2 x i32> %1733, i32 %485, i64 1, !dbg !28 + %1735 = icmp eq <2 x i32> %1734, zeroinitializer, !dbg !28 + %1736 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1007, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1737 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1007, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1738 = insertelement <2 x float> poison, float %1728, i64 0, !dbg !28 + %1739 = insertelement <2 x float> %1738, float %1737, i64 1, !dbg !28 + %1740 = insertelement <2 x float> poison, float %1727, i64 0, !dbg !28 + %1741 = insertelement <2 x float> %1740, float %1736, i64 1, !dbg !28 + %1742 = select <2 x i1> %1735, <2 x float> %1739, <2 x float> %1741, !dbg !28 + %1743 = extractelement <2 x float> %1742, i64 0, !dbg !28 + %1744 = fadd float %1743, 0xC168000FE0000000, !dbg !28 + %1745 = fneg float %1744, !dbg !28 + %1746 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3FF7154760000000, float %1745) #6, !dbg !28 + %1747 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3FF7154760000000, float %1745) #6, !dbg !28 + %.0.i1001 = select i1 %.not3.i1000, float %1747, float %1746, !dbg !28 + %1748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3E54AE0C00000000, float %.0.i1001) #6, !dbg !28 + %1749 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3E54AE0C00000000, float %.0.i1001) #6, !dbg !28 + %.01.i1003 = select i1 %.not4.i1002, float %1749, float %1748, !dbg !28 + %1750 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1003) #6, !dbg !28 + %1751 = extractelement <2 x float> %1742, i64 1, !dbg !28 + %1752 = fadd float %1751, 0xC168000FE0000000, !dbg !28 + %1753 = fneg float %1752, !dbg !28 + %1754 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3FF7154760000000, float %1753) #6, !dbg !28 + %1755 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3FF7154760000000, float %1753) #6, !dbg !28 + %.0.i1011 = select i1 %.not3.i1010, float %1755, float %1754, !dbg !28 + %1756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3E54AE0C00000000, float %.0.i1011) #6, !dbg !28 + %1757 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3E54AE0C00000000, float %.0.i1011) #6, !dbg !28 + %.01.i1013 = select i1 %.not4.i1012, float %1757, float %1756, !dbg !28 + %1758 = bitcast <2 x float> %1742 to <2 x i32>, !dbg !28 + %1759 = shl <2 x i32> %1758, splat (i32 23), !dbg !28 + %1760 = bitcast <2 x i32> %1759 to <2 x float>, !dbg !28 + %1761 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1013) #6, !dbg !28 + %1762 = insertelement <2 x float> poison, float %1750, i64 0, !dbg !28 + %1763 = insertelement <2 x float> %1762, float %1761, i64 1, !dbg !28 + %1764 = fmul <2 x float> %1763, %1760, !dbg !28 + %1765 = select <2 x i1> %1720, <2 x float> splat (float 1.000000e+00), <2 x float> %1764, !dbg !29 + %foldExtExtBinop1641 = fsub <2 x float> %1716, %1719, !dbg !25 + %1766 = extractelement <2 x float> %foldExtExtBinop1641, i64 0, !dbg !25 + %foldExtExtBinop1643 = fsub <2 x float> %1716, %1719, !dbg !25 + %1767 = extractelement <2 x float> %foldExtExtBinop1643, i64 1, !dbg !25 + %1768 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1769 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1315 = select i1 %.not.i1314, float %1769, float %1768, !dbg !28 + %1770 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1315) #6, !dbg !28 + %1771 = tail call float @llvm.nvvm.saturate.f(float %.02.i1315) #6, !dbg !28 + %.03.i1317 = select i1 %.not1.i1316, float %1771, float %1770, !dbg !28 + %1772 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1773 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1774 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1775 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1325 = select i1 %.not.i1324, float %1775, float %1774, !dbg !28 + %1776 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1325) #6, !dbg !28 + %1777 = tail call float @llvm.nvvm.saturate.f(float %.02.i1325) #6, !dbg !28 + %.03.i1327 = select i1 %.not1.i1326, float %1777, float %1776, !dbg !28 + %1778 = insertelement <2 x i32> poison, i32 %830, i64 0, !dbg !28 + %1779 = insertelement <2 x i32> %1778, i32 %835, i64 1, !dbg !28 + %1780 = icmp eq <2 x i32> %1779, zeroinitializer, !dbg !28 + %1781 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1782 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1783 = insertelement <2 x float> poison, float %1773, i64 0, !dbg !28 + %1784 = insertelement <2 x float> %1783, float %1782, i64 1, !dbg !28 + %1785 = insertelement <2 x float> poison, float %1772, i64 0, !dbg !28 + %1786 = insertelement <2 x float> %1785, float %1781, i64 1, !dbg !28 + %1787 = select <2 x i1> %1780, <2 x float> %1784, <2 x float> %1786, !dbg !28 + %1788 = extractelement <2 x float> %1787, i64 0, !dbg !28 + %1789 = fadd float %1788, 0xC168000FE0000000, !dbg !28 + %1790 = fneg float %1789, !dbg !28 + %1791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3FF7154760000000, float %1790) #6, !dbg !28 + %1792 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3FF7154760000000, float %1790) #6, !dbg !28 + %.0.i1321 = select i1 %.not3.i1320, float %1792, float %1791, !dbg !28 + %1793 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3E54AE0C00000000, float %.0.i1321) #6, !dbg !28 + %1794 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3E54AE0C00000000, float %.0.i1321) #6, !dbg !28 + %.01.i1323 = select i1 %.not4.i1322, float %1794, float %1793, !dbg !28 + %1795 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1323) #6, !dbg !28 + %1796 = extractelement <2 x float> %1787, i64 1, !dbg !28 + %1797 = fadd float %1796, 0xC168000FE0000000, !dbg !28 + %1798 = fneg float %1797, !dbg !28 + %1799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3FF7154760000000, float %1798) #6, !dbg !28 + %1800 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3FF7154760000000, float %1798) #6, !dbg !28 + %.0.i1331 = select i1 %.not3.i1330, float %1800, float %1799, !dbg !28 + %1801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3E54AE0C00000000, float %.0.i1331) #6, !dbg !28 + %1802 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3E54AE0C00000000, float %.0.i1331) #6, !dbg !28 + %.01.i1333 = select i1 %.not4.i1332, float %1802, float %1801, !dbg !28 + %1803 = bitcast <2 x float> %1787 to <2 x i32>, !dbg !28 + %1804 = shl <2 x i32> %1803, splat (i32 23), !dbg !28 + %1805 = bitcast <2 x i32> %1804 to <2 x float>, !dbg !28 + %1806 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1333) #6, !dbg !28 + %1807 = insertelement <2 x float> poison, float %1795, i64 0, !dbg !28 + %1808 = insertelement <2 x float> %1807, float %1806, i64 1, !dbg !28 + %1809 = fmul <2 x float> %1808, %1805, !dbg !28 + %1810 = select <2 x i1> %1720, <2 x float> splat (float 1.000000e+00), <2 x float> %1809, !dbg !30 + %1811 = fmul <2 x float> %30, %1765, !dbg !31 + %1812 = fadd <2 x float> %1811, %1810, !dbg !32 + %1813 = select i1 %40, float %128, float %28, !dbg !33 + %1814 = select i1 %40, float %129, float %29, !dbg !33 + %1815 = insertelement <2 x i1> poison, i1 %40, i64 0, !dbg !33 + %1816 = shufflevector <2 x i1> %1815, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %1817 = select <2 x i1> %1816, <2 x float> %1525, <2 x float> %35, !dbg !33 + %1818 = select <2 x i1> %1816, <2 x float> %1622, <2 x float> %33, !dbg !33 + %1819 = select <2 x i1> %1816, <2 x float> %1719, <2 x float> %31, !dbg !33 + %1820 = select i1 %40, float %1520, float %26, !dbg !34 + %1821 = select i1 %40, float %1521, float %27, !dbg !34 + %1822 = select <2 x i1> %1816, <2 x float> %1618, <2 x float> %34, !dbg !34 + %1823 = select <2 x i1> %1816, <2 x float> %1715, <2 x float> %32, !dbg !34 + %1824 = select <2 x i1> %1816, <2 x float> %1812, <2 x float> %30, !dbg !34 + br i1 %17, label %16, label %1825, !dbg !11 + +1825: ; preds = %16 + %1826 = extractelement <16 x float> %876, i64 15, !dbg !25 + %1827 = extractelement <16 x float> %876, i64 14, !dbg !25 + %1828 = extractelement <16 x float> %876, i64 13, !dbg !25 + %1829 = extractelement <16 x float> %876, i64 12, !dbg !25 + %1830 = extractelement <16 x float> %876, i64 11, !dbg !25 + %1831 = extractelement <16 x float> %876, i64 10, !dbg !25 + %1832 = extractelement <16 x float> %876, i64 9, !dbg !25 + %1833 = extractelement <16 x float> %876, i64 8, !dbg !25 + %1834 = extractelement <16 x float> %876, i64 7, !dbg !25 + %1835 = extractelement <16 x float> %876, i64 6, !dbg !25 + %1836 = extractelement <16 x float> %876, i64 5, !dbg !25 + %1837 = extractelement <16 x float> %876, i64 4, !dbg !25 + %1838 = extractelement <16 x float> %876, i64 3, !dbg !25 + %1839 = extractelement <16 x float> %876, i64 2, !dbg !25 + %1840 = extractelement <16 x float> %876, i64 1, !dbg !25 + %1841 = extractelement <16 x float> %876, i64 0, !dbg !25 + %1842 = and i32 %8, 31, !dbg !9 + %1843 = lshr i32 %8, 5, !dbg !9 + %1844 = shl nuw nsw i32 %9, 2, !dbg !9 + %1845 = fcmp ogt float %1841, %1840, !dbg !35 + %1846 = fcmp uno float %1841, 0.000000e+00, !dbg !37 + %1847 = or i1 %1845, %1846, !dbg !38 + %1848 = select i1 %1847, float %1841, float %1840, !dbg !39 + %1849 = fcmp ogt float %1848, %1839, !dbg !35 + %1850 = fcmp uno float %1848, 0.000000e+00, !dbg !37 + %1851 = or i1 %1849, %1850, !dbg !38 + %1852 = select i1 %1851, float %1848, float %1839, !dbg !39 + %1853 = fcmp ogt float %1852, %1838, !dbg !35 + %1854 = fcmp uno float %1852, 0.000000e+00, !dbg !37 + %1855 = or i1 %1853, %1854, !dbg !38 + %1856 = select i1 %1855, float %1852, float %1838, !dbg !39 + %1857 = fcmp ogt float %1856, %1837, !dbg !35 + %1858 = fcmp uno float %1856, 0.000000e+00, !dbg !37 + %1859 = or i1 %1857, %1858, !dbg !38 + %1860 = select i1 %1859, float %1856, float %1837, !dbg !39 + %1861 = fcmp ogt float %1860, %1836, !dbg !35 + %1862 = fcmp uno float %1860, 0.000000e+00, !dbg !37 + %1863 = or i1 %1861, %1862, !dbg !38 + %1864 = select i1 %1863, float %1860, float %1836, !dbg !39 + %1865 = fcmp ogt float %1864, %1835, !dbg !35 + %1866 = fcmp uno float %1864, 0.000000e+00, !dbg !37 + %1867 = or i1 %1865, %1866, !dbg !38 + %1868 = select i1 %1867, float %1864, float %1835, !dbg !39 + %1869 = fcmp ogt float %1868, %1834, !dbg !35 + %1870 = fcmp uno float %1868, 0.000000e+00, !dbg !37 + %1871 = or i1 %1869, %1870, !dbg !38 + %1872 = select i1 %1871, float %1868, float %1834, !dbg !39 + %1873 = fcmp ogt float %1872, %1833, !dbg !35 + %1874 = fcmp uno float %1872, 0.000000e+00, !dbg !37 + %1875 = or i1 %1873, %1874, !dbg !38 + %1876 = select i1 %1875, float %1872, float %1833, !dbg !39 + %1877 = fcmp ogt float %1876, %1832, !dbg !35 + %1878 = fcmp uno float %1876, 0.000000e+00, !dbg !37 + %1879 = or i1 %1877, %1878, !dbg !38 + %1880 = select i1 %1879, float %1876, float %1832, !dbg !39 + %1881 = fcmp ogt float %1880, %1831, !dbg !35 + %1882 = fcmp uno float %1880, 0.000000e+00, !dbg !37 + %1883 = or i1 %1881, %1882, !dbg !38 + %1884 = select i1 %1883, float %1880, float %1831, !dbg !39 + %1885 = fcmp ogt float %1884, %1830, !dbg !35 + %1886 = fcmp uno float %1884, 0.000000e+00, !dbg !37 + %1887 = or i1 %1885, %1886, !dbg !38 + %1888 = select i1 %1887, float %1884, float %1830, !dbg !39 + %1889 = fcmp ogt float %1888, %1829, !dbg !35 + %1890 = fcmp uno float %1888, 0.000000e+00, !dbg !37 + %1891 = or i1 %1889, %1890, !dbg !38 + %1892 = select i1 %1891, float %1888, float %1829, !dbg !39 + %1893 = fcmp ogt float %1892, %1828, !dbg !35 + %1894 = fcmp uno float %1892, 0.000000e+00, !dbg !37 + %1895 = or i1 %1893, %1894, !dbg !38 + %1896 = select i1 %1895, float %1892, float %1828, !dbg !39 + %1897 = fcmp ogt float %1896, %1827, !dbg !35 + %1898 = fcmp uno float %1896, 0.000000e+00, !dbg !37 + %1899 = or i1 %1897, %1898, !dbg !38 + %1900 = select i1 %1899, float %1896, float %1827, !dbg !39 + %1901 = fcmp ogt float %1900, %1826, !dbg !35 + %1902 = fcmp uno float %1900, 0.000000e+00, !dbg !37 + %1903 = or i1 %1901, %1902, !dbg !38 + %1904 = select i1 %1903, float %1900, float %1826, !dbg !39 + %1905 = fcmp ogt float %1904, %130, !dbg !35 + %1906 = fcmp uno float %1904, 0.000000e+00, !dbg !37 + %1907 = or i1 %1905, %1906, !dbg !38 + %1908 = select i1 %1907, float %1904, float %130, !dbg !39 + %1909 = fcmp ogt float %1908, %132, !dbg !35 + %1910 = fcmp uno float %1908, 0.000000e+00, !dbg !37 + %1911 = or i1 %1909, %1910, !dbg !38 + %1912 = select i1 %1911, float %1908, float %132, !dbg !39 + %1913 = fcmp ogt float %1912, %134, !dbg !35 + %1914 = fcmp uno float %1912, 0.000000e+00, !dbg !37 + %1915 = or i1 %1913, %1914, !dbg !38 + %1916 = select i1 %1915, float %1912, float %134, !dbg !39 + %1917 = fcmp ogt float %1916, %136, !dbg !35 + %1918 = fcmp uno float %1916, 0.000000e+00, !dbg !37 + %1919 = or i1 %1917, %1918, !dbg !38 + %1920 = select i1 %1919, float %1916, float %136, !dbg !39 + %1921 = fcmp ogt float %1920, %138, !dbg !35 + %1922 = fcmp uno float %1920, 0.000000e+00, !dbg !37 + %1923 = or i1 %1921, %1922, !dbg !38 + %1924 = select i1 %1923, float %1920, float %138, !dbg !39 + %1925 = fcmp ogt float %1924, %140, !dbg !35 + %1926 = fcmp uno float %1924, 0.000000e+00, !dbg !37 + %1927 = or i1 %1925, %1926, !dbg !38 + %1928 = select i1 %1927, float %1924, float %140, !dbg !39 + %1929 = fcmp ogt float %1928, %142, !dbg !35 + %1930 = fcmp uno float %1928, 0.000000e+00, !dbg !37 + %1931 = or i1 %1929, %1930, !dbg !38 + %1932 = select i1 %1931, float %1928, float %142, !dbg !39 + %1933 = fcmp ogt float %1932, %144, !dbg !35 + %1934 = fcmp uno float %1932, 0.000000e+00, !dbg !37 + %1935 = or i1 %1933, %1934, !dbg !38 + %1936 = select i1 %1935, float %1932, float %144, !dbg !39 + %1937 = fcmp ogt float %1936, %1813, !dbg !35 + %1938 = fcmp uno float %1936, 0.000000e+00, !dbg !37 + %1939 = or i1 %1937, %1938, !dbg !38 + %1940 = select i1 %1939, float %1936, float %1813, !dbg !39 + %1941 = fcmp ogt float %1940, %1814, !dbg !35 + %1942 = fcmp uno float %1940, 0.000000e+00, !dbg !37 + %1943 = or i1 %1941, %1942, !dbg !38 + %1944 = select i1 %1943, float %1940, float %1814, !dbg !39 + %1945 = extractelement <2 x float> %1817, i64 0, !dbg !35 + %1946 = fcmp ogt float %1944, %1945, !dbg !35 + %1947 = fcmp uno float %1944, 0.000000e+00, !dbg !37 + %1948 = or i1 %1946, %1947, !dbg !38 + %1949 = select i1 %1948, float %1944, float %1945, !dbg !39 + %1950 = extractelement <2 x float> %1817, i64 1, !dbg !35 + %1951 = fcmp ogt float %1949, %1950, !dbg !35 + %1952 = fcmp uno float %1949, 0.000000e+00, !dbg !37 + %1953 = or i1 %1951, %1952, !dbg !38 + %1954 = select i1 %1953, float %1949, float %1950, !dbg !39 + %1955 = extractelement <2 x float> %1818, i64 0, !dbg !35 + %1956 = fcmp ogt float %1954, %1955, !dbg !35 + %1957 = fcmp uno float %1954, 0.000000e+00, !dbg !37 + %1958 = or i1 %1956, %1957, !dbg !38 + %1959 = select i1 %1958, float %1954, float %1955, !dbg !39 + %1960 = extractelement <2 x float> %1818, i64 1, !dbg !35 + %1961 = fcmp ogt float %1959, %1960, !dbg !35 + %1962 = fcmp uno float %1959, 0.000000e+00, !dbg !37 + %1963 = or i1 %1961, %1962, !dbg !38 + %1964 = select i1 %1963, float %1959, float %1960, !dbg !39 + %1965 = extractelement <2 x float> %1819, i64 0, !dbg !35 + %1966 = fcmp ogt float %1964, %1965, !dbg !35 + %1967 = fcmp uno float %1964, 0.000000e+00, !dbg !37 + %1968 = or i1 %1966, %1967, !dbg !38 + %1969 = select i1 %1968, float %1964, float %1965, !dbg !39 + %1970 = extractelement <2 x float> %1819, i64 1, !dbg !35 + %1971 = fcmp ogt float %1969, %1970, !dbg !35 + %1972 = fcmp uno float %1969, 0.000000e+00, !dbg !37 + %1973 = or i1 %1971, %1972, !dbg !38 + %1974 = select i1 %1973, float %1969, float %1970, !dbg !39 + %1975 = bitcast float %1974 to i32, !dbg !40 + %1976 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1975, i32 16, i32 31), !dbg !40 + %1977 = bitcast i32 %1976 to float, !dbg !40 + %1978 = fcmp ogt float %1974, %1977, !dbg !35 + %1979 = fcmp uno float %1974, 0.000000e+00, !dbg !37 + %1980 = or i1 %1979, %1978, !dbg !38 + %1981 = select i1 %1980, float %1974, float %1977, !dbg !39 + %1982 = bitcast float %1981 to i32, !dbg !40 + %1983 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1982, i32 8, i32 31), !dbg !40 + %1984 = bitcast i32 %1983 to float, !dbg !40 + %1985 = fcmp ogt float %1981, %1984, !dbg !35 + %1986 = fcmp uno float %1981, 0.000000e+00, !dbg !37 + %1987 = or i1 %1985, %1986, !dbg !38 + %1988 = select i1 %1987, float %1981, float %1984, !dbg !39 + %1989 = bitcast float %1988 to i32, !dbg !40 + %1990 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1989, i32 4, i32 31), !dbg !40 + %1991 = bitcast i32 %1990 to float, !dbg !40 + %1992 = fcmp ogt float %1988, %1991, !dbg !35 + %1993 = fcmp uno float %1988, 0.000000e+00, !dbg !37 + %1994 = or i1 %1992, %1993, !dbg !38 + %1995 = select i1 %1994, float %1988, float %1991, !dbg !39 + %1996 = bitcast float %1995 to i32, !dbg !40 + %1997 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1996, i32 2, i32 31), !dbg !40 + %1998 = bitcast i32 %1997 to float, !dbg !40 + %1999 = fcmp ogt float %1995, %1998, !dbg !35 + %2000 = fcmp uno float %1995, 0.000000e+00, !dbg !37 + %2001 = or i1 %1999, %2000, !dbg !38 + %2002 = select i1 %2001, float %1995, float %1998, !dbg !39 + %2003 = bitcast float %2002 to i32, !dbg !40 + %2004 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2003, i32 1, i32 31), !dbg !40 + %2005 = bitcast i32 %2004 to float, !dbg !40 + %2006 = fcmp ogt float %2002, %2005, !dbg !35 + %2007 = fcmp uno float %2002, 0.000000e+00, !dbg !37 + %2008 = or i1 %2006, %2007, !dbg !38 + %2009 = and i32 %1843, 15, !dbg !40 + %2010 = icmp eq i32 %1842, 0, !dbg !40 + %2011 = getelementptr float, ptr addrspace(3) @global_smem, i32 %2009, !dbg !40 + %2012 = select i1 %2008, i32 %2003, i32 %2004, !dbg !39 + %2013 = insertelement <1 x i32> poison, i32 %2012, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2011, <1 x i32> %2013, i1 %2010) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %2014 = icmp samesign ult i32 %8, 16, !dbg !40 + %2015 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !40 + %2016 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %2015, i1 %2014) #6, !dbg !40 + %2017 = bitcast i32 %2016 to float, !dbg !40 + %2018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2016, i32 8, i32 31), !dbg !40 + %2019 = bitcast i32 %2018 to float, !dbg !40 + %2020 = fcmp ogt float %2017, %2019, !dbg !35 + %2021 = fcmp uno float %2017, 0.000000e+00, !dbg !37 + %2022 = or i1 %2021, %2020, !dbg !38 + %2023 = select i1 %2022, float %2017, float %2019, !dbg !39 + %2024 = bitcast float %2023 to i32, !dbg !40 + %2025 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2024, i32 4, i32 31), !dbg !40 + %2026 = bitcast i32 %2025 to float, !dbg !40 + %2027 = fcmp ogt float %2023, %2026, !dbg !35 + %2028 = fcmp uno float %2023, 0.000000e+00, !dbg !37 + %2029 = or i1 %2027, %2028, !dbg !38 + %2030 = select i1 %2029, float %2023, float %2026, !dbg !39 + %2031 = bitcast float %2030 to i32, !dbg !40 + %2032 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2031, i32 2, i32 31), !dbg !40 + %2033 = bitcast i32 %2032 to float, !dbg !40 + %2034 = fcmp ogt float %2030, %2033, !dbg !35 + %2035 = fcmp uno float %2030, 0.000000e+00, !dbg !37 + %2036 = or i1 %2034, %2035, !dbg !38 + %2037 = select i1 %2036, float %2030, float %2033, !dbg !39 + %2038 = bitcast float %2037 to i32, !dbg !40 + %2039 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2038, i32 1, i32 31), !dbg !40 + %2040 = bitcast i32 %2039 to float, !dbg !40 + %2041 = fcmp ogt float %2037, %2040, !dbg !35 + %2042 = fcmp uno float %2037, 0.000000e+00, !dbg !37 + %2043 = or i1 %2041, %2042, !dbg !38 + %2044 = icmp eq i32 %8, 0, !dbg !40 + %2045 = select i1 %2043, i32 %2038, i32 %2039, !dbg !39 + %2046 = insertelement <1 x i32> poison, i32 %2045, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2015, <1 x i32> %2046, i1 %2044) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %2047 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !40 + %2048 = fcmp oeq float %2047, 0xFFF0000000000000, !dbg !41 + %2049 = fsub float %1841, %2047, !dbg !42 + %2050 = fsub float %1840, %2047, !dbg !42 + %2051 = fsub float %1839, %2047, !dbg !42 + %2052 = fsub float %1838, %2047, !dbg !42 + %2053 = fsub float %1837, %2047, !dbg !42 + %2054 = fsub float %1836, %2047, !dbg !42 + %2055 = fsub float %1835, %2047, !dbg !42 + %2056 = fsub float %1834, %2047, !dbg !42 + %2057 = fsub float %1833, %2047, !dbg !42 + %2058 = fsub float %1832, %2047, !dbg !42 + %2059 = fsub float %1831, %2047, !dbg !42 + %2060 = fsub float %1830, %2047, !dbg !42 + %2061 = fsub float %1829, %2047, !dbg !42 + %2062 = fsub float %1828, %2047, !dbg !42 + %2063 = fsub float %1827, %2047, !dbg !42 + %2064 = fsub float %1826, %2047, !dbg !42 + %2065 = fsub float %130, %2047, !dbg !42 + %2066 = fsub float %132, %2047, !dbg !42 + %2067 = fsub float %134, %2047, !dbg !42 + %2068 = fsub float %136, %2047, !dbg !42 + %2069 = fsub float %138, %2047, !dbg !42 + %2070 = fsub float %140, %2047, !dbg !42 + %2071 = fsub float %142, %2047, !dbg !42 + %2072 = fsub float %144, %2047, !dbg !42 + %2073 = fsub float %1813, %2047, !dbg !42 + %2074 = fsub float %1814, %2047, !dbg !42 + %2075 = fsub float %1945, %2047, !dbg !42 + %2076 = fsub float %1950, %2047, !dbg !42 + %2077 = fsub float %1955, %2047, !dbg !42 + %2078 = fsub float %1960, %2047, !dbg !42 + %2079 = fsub float %1965, %2047, !dbg !42 + %2080 = fsub float %1970, %2047, !dbg !42 + %2081 = select i1 %2048, float 0.000000e+00, float %2049, !dbg !43 + %2082 = select i1 %2048, float 0.000000e+00, float %2050, !dbg !43 + %2083 = select i1 %2048, float 0.000000e+00, float %2051, !dbg !43 + %2084 = select i1 %2048, float 0.000000e+00, float %2052, !dbg !43 + %2085 = select i1 %2048, float 0.000000e+00, float %2053, !dbg !43 + %2086 = select i1 %2048, float 0.000000e+00, float %2054, !dbg !43 + %2087 = select i1 %2048, float 0.000000e+00, float %2055, !dbg !43 + %2088 = select i1 %2048, float 0.000000e+00, float %2056, !dbg !43 + %2089 = select i1 %2048, float 0.000000e+00, float %2057, !dbg !43 + %2090 = select i1 %2048, float 0.000000e+00, float %2058, !dbg !43 + %2091 = select i1 %2048, float 0.000000e+00, float %2059, !dbg !43 + %2092 = select i1 %2048, float 0.000000e+00, float %2060, !dbg !43 + %2093 = select i1 %2048, float 0.000000e+00, float %2061, !dbg !43 + %2094 = select i1 %2048, float 0.000000e+00, float %2062, !dbg !43 + %2095 = select i1 %2048, float 0.000000e+00, float %2063, !dbg !43 + %2096 = select i1 %2048, float 0.000000e+00, float %2064, !dbg !43 + %2097 = select i1 %2048, float 0.000000e+00, float %2065, !dbg !43 + %2098 = select i1 %2048, float 0.000000e+00, float %2066, !dbg !43 + %2099 = select i1 %2048, float 0.000000e+00, float %2067, !dbg !43 + %2100 = select i1 %2048, float 0.000000e+00, float %2068, !dbg !43 + %2101 = select i1 %2048, float 0.000000e+00, float %2069, !dbg !43 + %2102 = select i1 %2048, float 0.000000e+00, float %2070, !dbg !43 + %2103 = select i1 %2048, float 0.000000e+00, float %2071, !dbg !43 + %2104 = select i1 %2048, float 0.000000e+00, float %2072, !dbg !43 + %2105 = select i1 %2048, float 0.000000e+00, float %2073, !dbg !43 + %2106 = select i1 %2048, float 0.000000e+00, float %2074, !dbg !43 + %2107 = select i1 %2048, float 0.000000e+00, float %2075, !dbg !43 + %2108 = select i1 %2048, float 0.000000e+00, float %2076, !dbg !43 + %2109 = select i1 %2048, float 0.000000e+00, float %2077, !dbg !43 + %2110 = select i1 %2048, float 0.000000e+00, float %2078, !dbg !43 + %2111 = select i1 %2048, float 0.000000e+00, float %2079, !dbg !43 + %2112 = select i1 %2048, float 0.000000e+00, float %2080, !dbg !43 + %2113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %2113, 0, !dbg !44 + %2114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2115 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i = select i1 %.not.i, float %2115, float %2114, !dbg !44 + %2116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %2116, 0, !dbg !44 + %2117 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !44 + %2118 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !44 + %.03.i = select i1 %.not1.i, float %2118, float %2117, !dbg !44 + %2119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i = icmp eq i32 %2119, 0, !dbg !44 + %2120 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2121 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i = select i1 %.not2.i, float %2121, float %2120, !dbg !44 + %2122 = fadd float %.04.i, 0xC168000FE0000000, !dbg !44 + %2123 = fneg float %2122, !dbg !44 + %2124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %2124, 0, !dbg !44 + %2125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3FF7154760000000, float %2123) #6, !dbg !44 + %2126 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3FF7154760000000, float %2123) #6, !dbg !44 + %.0.i = select i1 %.not3.i, float %2126, float %2125, !dbg !44 + %2127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %2127, 0, !dbg !44 + %2128 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %2129 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %.01.i = select i1 %.not4.i, float %2129, float %2128, !dbg !44 + %2130 = bitcast float %.04.i to i32, !dbg !44 + %2131 = shl i32 %2130, 23, !dbg !44 + %2132 = bitcast i32 %2131 to float, !dbg !44 + %2133 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !44 + %2134 = fmul float %2133, %2132, !dbg !44 + %2135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i64 = icmp eq i32 %2135, 0, !dbg !44 + %2136 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2137 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i65 = select i1 %.not.i64, float %2137, float %2136, !dbg !44 + %2138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i66 = icmp eq i32 %2138, 0, !dbg !44 + %2139 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i65) #6, !dbg !44 + %2140 = tail call float @llvm.nvvm.saturate.f(float %.02.i65) #6, !dbg !44 + %.03.i67 = select i1 %.not1.i66, float %2140, float %2139, !dbg !44 + %2141 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i68 = icmp eq i32 %2141, 0, !dbg !44 + %2142 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i67, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2143 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i67, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i69 = select i1 %.not2.i68, float %2143, float %2142, !dbg !44 + %2144 = fadd float %.04.i69, 0xC168000FE0000000, !dbg !44 + %2145 = fneg float %2144, !dbg !44 + %2146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i70 = icmp eq i32 %2146, 0, !dbg !44 + %2147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3FF7154760000000, float %2145) #6, !dbg !44 + %2148 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3FF7154760000000, float %2145) #6, !dbg !44 + %.0.i71 = select i1 %.not3.i70, float %2148, float %2147, !dbg !44 + %2149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i72 = icmp eq i32 %2149, 0, !dbg !44 + %2150 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3E54AE0C00000000, float %.0.i71) #6, !dbg !44 + %2151 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3E54AE0C00000000, float %.0.i71) #6, !dbg !44 + %.01.i73 = select i1 %.not4.i72, float %2151, float %2150, !dbg !44 + %2152 = bitcast float %.04.i69 to i32, !dbg !44 + %2153 = shl i32 %2152, 23, !dbg !44 + %2154 = bitcast i32 %2153 to float, !dbg !44 + %2155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i73) #6, !dbg !44 + %2156 = fmul float %2155, %2154, !dbg !44 + %2157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i74 = icmp eq i32 %2157, 0, !dbg !44 + %2158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2159 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i75 = select i1 %.not.i74, float %2159, float %2158, !dbg !44 + %2160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i76 = icmp eq i32 %2160, 0, !dbg !44 + %2161 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i75) #6, !dbg !44 + %2162 = tail call float @llvm.nvvm.saturate.f(float %.02.i75) #6, !dbg !44 + %.03.i77 = select i1 %.not1.i76, float %2162, float %2161, !dbg !44 + %2163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i78 = icmp eq i32 %2163, 0, !dbg !44 + %2164 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i77, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2165 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i77, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i79 = select i1 %.not2.i78, float %2165, float %2164, !dbg !44 + %2166 = fadd float %.04.i79, 0xC168000FE0000000, !dbg !44 + %2167 = fneg float %2166, !dbg !44 + %2168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i80 = icmp eq i32 %2168, 0, !dbg !44 + %2169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3FF7154760000000, float %2167) #6, !dbg !44 + %2170 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3FF7154760000000, float %2167) #6, !dbg !44 + %.0.i81 = select i1 %.not3.i80, float %2170, float %2169, !dbg !44 + %2171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i82 = icmp eq i32 %2171, 0, !dbg !44 + %2172 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3E54AE0C00000000, float %.0.i81) #6, !dbg !44 + %2173 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3E54AE0C00000000, float %.0.i81) #6, !dbg !44 + %.01.i83 = select i1 %.not4.i82, float %2173, float %2172, !dbg !44 + %2174 = bitcast float %.04.i79 to i32, !dbg !44 + %2175 = shl i32 %2174, 23, !dbg !44 + %2176 = bitcast i32 %2175 to float, !dbg !44 + %2177 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i83) #6, !dbg !44 + %2178 = fmul float %2177, %2176, !dbg !44 + %2179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i84 = icmp eq i32 %2179, 0, !dbg !44 + %2180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2181 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i85 = select i1 %.not.i84, float %2181, float %2180, !dbg !44 + %2182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i86 = icmp eq i32 %2182, 0, !dbg !44 + %2183 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i85) #6, !dbg !44 + %2184 = tail call float @llvm.nvvm.saturate.f(float %.02.i85) #6, !dbg !44 + %.03.i87 = select i1 %.not1.i86, float %2184, float %2183, !dbg !44 + %2185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i88 = icmp eq i32 %2185, 0, !dbg !44 + %2186 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i87, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2187 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i87, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i89 = select i1 %.not2.i88, float %2187, float %2186, !dbg !44 + %2188 = fadd float %.04.i89, 0xC168000FE0000000, !dbg !44 + %2189 = fneg float %2188, !dbg !44 + %2190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i90 = icmp eq i32 %2190, 0, !dbg !44 + %2191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3FF7154760000000, float %2189) #6, !dbg !44 + %2192 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3FF7154760000000, float %2189) #6, !dbg !44 + %.0.i91 = select i1 %.not3.i90, float %2192, float %2191, !dbg !44 + %2193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i92 = icmp eq i32 %2193, 0, !dbg !44 + %2194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3E54AE0C00000000, float %.0.i91) #6, !dbg !44 + %2195 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3E54AE0C00000000, float %.0.i91) #6, !dbg !44 + %.01.i93 = select i1 %.not4.i92, float %2195, float %2194, !dbg !44 + %2196 = bitcast float %.04.i89 to i32, !dbg !44 + %2197 = shl i32 %2196, 23, !dbg !44 + %2198 = bitcast i32 %2197 to float, !dbg !44 + %2199 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i93) #6, !dbg !44 + %2200 = fmul float %2199, %2198, !dbg !44 + %2201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i94 = icmp eq i32 %2201, 0, !dbg !44 + %2202 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2203 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i95 = select i1 %.not.i94, float %2203, float %2202, !dbg !44 + %2204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i96 = icmp eq i32 %2204, 0, !dbg !44 + %2205 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i95) #6, !dbg !44 + %2206 = tail call float @llvm.nvvm.saturate.f(float %.02.i95) #6, !dbg !44 + %.03.i97 = select i1 %.not1.i96, float %2206, float %2205, !dbg !44 + %2207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i98 = icmp eq i32 %2207, 0, !dbg !44 + %2208 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i97, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2209 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i97, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i99 = select i1 %.not2.i98, float %2209, float %2208, !dbg !44 + %2210 = fadd float %.04.i99, 0xC168000FE0000000, !dbg !44 + %2211 = fneg float %2210, !dbg !44 + %2212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i100 = icmp eq i32 %2212, 0, !dbg !44 + %2213 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3FF7154760000000, float %2211) #6, !dbg !44 + %2214 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3FF7154760000000, float %2211) #6, !dbg !44 + %.0.i101 = select i1 %.not3.i100, float %2214, float %2213, !dbg !44 + %2215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i102 = icmp eq i32 %2215, 0, !dbg !44 + %2216 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3E54AE0C00000000, float %.0.i101) #6, !dbg !44 + %2217 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3E54AE0C00000000, float %.0.i101) #6, !dbg !44 + %.01.i103 = select i1 %.not4.i102, float %2217, float %2216, !dbg !44 + %2218 = bitcast float %.04.i99 to i32, !dbg !44 + %2219 = shl i32 %2218, 23, !dbg !44 + %2220 = bitcast i32 %2219 to float, !dbg !44 + %2221 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i103) #6, !dbg !44 + %2222 = fmul float %2221, %2220, !dbg !44 + %2223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i104 = icmp eq i32 %2223, 0, !dbg !44 + %2224 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2225 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i105 = select i1 %.not.i104, float %2225, float %2224, !dbg !44 + %2226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i106 = icmp eq i32 %2226, 0, !dbg !44 + %2227 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i105) #6, !dbg !44 + %2228 = tail call float @llvm.nvvm.saturate.f(float %.02.i105) #6, !dbg !44 + %.03.i107 = select i1 %.not1.i106, float %2228, float %2227, !dbg !44 + %2229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i108 = icmp eq i32 %2229, 0, !dbg !44 + %2230 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2231 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i109 = select i1 %.not2.i108, float %2231, float %2230, !dbg !44 + %2232 = fadd float %.04.i109, 0xC168000FE0000000, !dbg !44 + %2233 = fneg float %2232, !dbg !44 + %2234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i110 = icmp eq i32 %2234, 0, !dbg !44 + %2235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3FF7154760000000, float %2233) #6, !dbg !44 + %2236 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3FF7154760000000, float %2233) #6, !dbg !44 + %.0.i111 = select i1 %.not3.i110, float %2236, float %2235, !dbg !44 + %2237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i112 = icmp eq i32 %2237, 0, !dbg !44 + %2238 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3E54AE0C00000000, float %.0.i111) #6, !dbg !44 + %2239 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3E54AE0C00000000, float %.0.i111) #6, !dbg !44 + %.01.i113 = select i1 %.not4.i112, float %2239, float %2238, !dbg !44 + %2240 = bitcast float %.04.i109 to i32, !dbg !44 + %2241 = shl i32 %2240, 23, !dbg !44 + %2242 = bitcast i32 %2241 to float, !dbg !44 + %2243 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i113) #6, !dbg !44 + %2244 = fmul float %2243, %2242, !dbg !44 + %2245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i114 = icmp eq i32 %2245, 0, !dbg !44 + %2246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2247 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i115 = select i1 %.not.i114, float %2247, float %2246, !dbg !44 + %2248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i116 = icmp eq i32 %2248, 0, !dbg !44 + %2249 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i115) #6, !dbg !44 + %2250 = tail call float @llvm.nvvm.saturate.f(float %.02.i115) #6, !dbg !44 + %.03.i117 = select i1 %.not1.i116, float %2250, float %2249, !dbg !44 + %2251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i118 = icmp eq i32 %2251, 0, !dbg !44 + %2252 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2253 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i119 = select i1 %.not2.i118, float %2253, float %2252, !dbg !44 + %2254 = fadd float %.04.i119, 0xC168000FE0000000, !dbg !44 + %2255 = fneg float %2254, !dbg !44 + %2256 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i120 = icmp eq i32 %2256, 0, !dbg !44 + %2257 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3FF7154760000000, float %2255) #6, !dbg !44 + %2258 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3FF7154760000000, float %2255) #6, !dbg !44 + %.0.i121 = select i1 %.not3.i120, float %2258, float %2257, !dbg !44 + %2259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i122 = icmp eq i32 %2259, 0, !dbg !44 + %2260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3E54AE0C00000000, float %.0.i121) #6, !dbg !44 + %2261 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3E54AE0C00000000, float %.0.i121) #6, !dbg !44 + %.01.i123 = select i1 %.not4.i122, float %2261, float %2260, !dbg !44 + %2262 = bitcast float %.04.i119 to i32, !dbg !44 + %2263 = shl i32 %2262, 23, !dbg !44 + %2264 = bitcast i32 %2263 to float, !dbg !44 + %2265 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i123) #6, !dbg !44 + %2266 = fmul float %2265, %2264, !dbg !44 + %2267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i124 = icmp eq i32 %2267, 0, !dbg !44 + %2268 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2269 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i125 = select i1 %.not.i124, float %2269, float %2268, !dbg !44 + %2270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i126 = icmp eq i32 %2270, 0, !dbg !44 + %2271 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i125) #6, !dbg !44 + %2272 = tail call float @llvm.nvvm.saturate.f(float %.02.i125) #6, !dbg !44 + %.03.i127 = select i1 %.not1.i126, float %2272, float %2271, !dbg !44 + %2273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i128 = icmp eq i32 %2273, 0, !dbg !44 + %2274 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2275 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i129 = select i1 %.not2.i128, float %2275, float %2274, !dbg !44 + %2276 = fadd float %.04.i129, 0xC168000FE0000000, !dbg !44 + %2277 = fneg float %2276, !dbg !44 + %2278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i130 = icmp eq i32 %2278, 0, !dbg !44 + %2279 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3FF7154760000000, float %2277) #6, !dbg !44 + %2280 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3FF7154760000000, float %2277) #6, !dbg !44 + %.0.i131 = select i1 %.not3.i130, float %2280, float %2279, !dbg !44 + %2281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i132 = icmp eq i32 %2281, 0, !dbg !44 + %2282 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3E54AE0C00000000, float %.0.i131) #6, !dbg !44 + %2283 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3E54AE0C00000000, float %.0.i131) #6, !dbg !44 + %.01.i133 = select i1 %.not4.i132, float %2283, float %2282, !dbg !44 + %2284 = bitcast float %.04.i129 to i32, !dbg !44 + %2285 = shl i32 %2284, 23, !dbg !44 + %2286 = bitcast i32 %2285 to float, !dbg !44 + %2287 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i133) #6, !dbg !44 + %2288 = fmul float %2287, %2286, !dbg !44 + %2289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i134 = icmp eq i32 %2289, 0, !dbg !44 + %2290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2291 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i135 = select i1 %.not.i134, float %2291, float %2290, !dbg !44 + %2292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i136 = icmp eq i32 %2292, 0, !dbg !44 + %2293 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i135) #6, !dbg !44 + %2294 = tail call float @llvm.nvvm.saturate.f(float %.02.i135) #6, !dbg !44 + %.03.i137 = select i1 %.not1.i136, float %2294, float %2293, !dbg !44 + %2295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i138 = icmp eq i32 %2295, 0, !dbg !44 + %2296 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2297 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i139 = select i1 %.not2.i138, float %2297, float %2296, !dbg !44 + %2298 = fadd float %.04.i139, 0xC168000FE0000000, !dbg !44 + %2299 = fneg float %2298, !dbg !44 + %2300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i140 = icmp eq i32 %2300, 0, !dbg !44 + %2301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3FF7154760000000, float %2299) #6, !dbg !44 + %2302 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3FF7154760000000, float %2299) #6, !dbg !44 + %.0.i141 = select i1 %.not3.i140, float %2302, float %2301, !dbg !44 + %2303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i142 = icmp eq i32 %2303, 0, !dbg !44 + %2304 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3E54AE0C00000000, float %.0.i141) #6, !dbg !44 + %2305 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3E54AE0C00000000, float %.0.i141) #6, !dbg !44 + %.01.i143 = select i1 %.not4.i142, float %2305, float %2304, !dbg !44 + %2306 = bitcast float %.04.i139 to i32, !dbg !44 + %2307 = shl i32 %2306, 23, !dbg !44 + %2308 = bitcast i32 %2307 to float, !dbg !44 + %2309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i143) #6, !dbg !44 + %2310 = fmul float %2309, %2308, !dbg !44 + %2311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i144 = icmp eq i32 %2311, 0, !dbg !44 + %2312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2313 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i145 = select i1 %.not.i144, float %2313, float %2312, !dbg !44 + %2314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i146 = icmp eq i32 %2314, 0, !dbg !44 + %2315 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i145) #6, !dbg !44 + %2316 = tail call float @llvm.nvvm.saturate.f(float %.02.i145) #6, !dbg !44 + %.03.i147 = select i1 %.not1.i146, float %2316, float %2315, !dbg !44 + %2317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i148 = icmp eq i32 %2317, 0, !dbg !44 + %2318 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2319 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i149 = select i1 %.not2.i148, float %2319, float %2318, !dbg !44 + %2320 = fadd float %.04.i149, 0xC168000FE0000000, !dbg !44 + %2321 = fneg float %2320, !dbg !44 + %2322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i150 = icmp eq i32 %2322, 0, !dbg !44 + %2323 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3FF7154760000000, float %2321) #6, !dbg !44 + %2324 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3FF7154760000000, float %2321) #6, !dbg !44 + %.0.i151 = select i1 %.not3.i150, float %2324, float %2323, !dbg !44 + %2325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i152 = icmp eq i32 %2325, 0, !dbg !44 + %2326 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3E54AE0C00000000, float %.0.i151) #6, !dbg !44 + %2327 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3E54AE0C00000000, float %.0.i151) #6, !dbg !44 + %.01.i153 = select i1 %.not4.i152, float %2327, float %2326, !dbg !44 + %2328 = bitcast float %.04.i149 to i32, !dbg !44 + %2329 = shl i32 %2328, 23, !dbg !44 + %2330 = bitcast i32 %2329 to float, !dbg !44 + %2331 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i153) #6, !dbg !44 + %2332 = fmul float %2331, %2330, !dbg !44 + %2333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i154 = icmp eq i32 %2333, 0, !dbg !44 + %2334 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2335 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i155 = select i1 %.not.i154, float %2335, float %2334, !dbg !44 + %2336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i156 = icmp eq i32 %2336, 0, !dbg !44 + %2337 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i155) #6, !dbg !44 + %2338 = tail call float @llvm.nvvm.saturate.f(float %.02.i155) #6, !dbg !44 + %.03.i157 = select i1 %.not1.i156, float %2338, float %2337, !dbg !44 + %2339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i158 = icmp eq i32 %2339, 0, !dbg !44 + %2340 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2341 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i159 = select i1 %.not2.i158, float %2341, float %2340, !dbg !44 + %2342 = fadd float %.04.i159, 0xC168000FE0000000, !dbg !44 + %2343 = fneg float %2342, !dbg !44 + %2344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i160 = icmp eq i32 %2344, 0, !dbg !44 + %2345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3FF7154760000000, float %2343) #6, !dbg !44 + %2346 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3FF7154760000000, float %2343) #6, !dbg !44 + %.0.i161 = select i1 %.not3.i160, float %2346, float %2345, !dbg !44 + %2347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i162 = icmp eq i32 %2347, 0, !dbg !44 + %2348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3E54AE0C00000000, float %.0.i161) #6, !dbg !44 + %2349 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3E54AE0C00000000, float %.0.i161) #6, !dbg !44 + %.01.i163 = select i1 %.not4.i162, float %2349, float %2348, !dbg !44 + %2350 = bitcast float %.04.i159 to i32, !dbg !44 + %2351 = shl i32 %2350, 23, !dbg !44 + %2352 = bitcast i32 %2351 to float, !dbg !44 + %2353 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i163) #6, !dbg !44 + %2354 = fmul float %2353, %2352, !dbg !44 + %2355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i164 = icmp eq i32 %2355, 0, !dbg !44 + %2356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2357 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i165 = select i1 %.not.i164, float %2357, float %2356, !dbg !44 + %2358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i166 = icmp eq i32 %2358, 0, !dbg !44 + %2359 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i165) #6, !dbg !44 + %2360 = tail call float @llvm.nvvm.saturate.f(float %.02.i165) #6, !dbg !44 + %.03.i167 = select i1 %.not1.i166, float %2360, float %2359, !dbg !44 + %2361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i168 = icmp eq i32 %2361, 0, !dbg !44 + %2362 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2363 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i169 = select i1 %.not2.i168, float %2363, float %2362, !dbg !44 + %2364 = fadd float %.04.i169, 0xC168000FE0000000, !dbg !44 + %2365 = fneg float %2364, !dbg !44 + %2366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i170 = icmp eq i32 %2366, 0, !dbg !44 + %2367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3FF7154760000000, float %2365) #6, !dbg !44 + %2368 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3FF7154760000000, float %2365) #6, !dbg !44 + %.0.i171 = select i1 %.not3.i170, float %2368, float %2367, !dbg !44 + %2369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i172 = icmp eq i32 %2369, 0, !dbg !44 + %2370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3E54AE0C00000000, float %.0.i171) #6, !dbg !44 + %2371 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3E54AE0C00000000, float %.0.i171) #6, !dbg !44 + %.01.i173 = select i1 %.not4.i172, float %2371, float %2370, !dbg !44 + %2372 = bitcast float %.04.i169 to i32, !dbg !44 + %2373 = shl i32 %2372, 23, !dbg !44 + %2374 = bitcast i32 %2373 to float, !dbg !44 + %2375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i173) #6, !dbg !44 + %2376 = fmul float %2375, %2374, !dbg !44 + %2377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i174 = icmp eq i32 %2377, 0, !dbg !44 + %2378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2379 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i175 = select i1 %.not.i174, float %2379, float %2378, !dbg !44 + %2380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i176 = icmp eq i32 %2380, 0, !dbg !44 + %2381 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i175) #6, !dbg !44 + %2382 = tail call float @llvm.nvvm.saturate.f(float %.02.i175) #6, !dbg !44 + %.03.i177 = select i1 %.not1.i176, float %2382, float %2381, !dbg !44 + %2383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i178 = icmp eq i32 %2383, 0, !dbg !44 + %2384 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2385 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i179 = select i1 %.not2.i178, float %2385, float %2384, !dbg !44 + %2386 = fadd float %.04.i179, 0xC168000FE0000000, !dbg !44 + %2387 = fneg float %2386, !dbg !44 + %2388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i180 = icmp eq i32 %2388, 0, !dbg !44 + %2389 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3FF7154760000000, float %2387) #6, !dbg !44 + %2390 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3FF7154760000000, float %2387) #6, !dbg !44 + %.0.i181 = select i1 %.not3.i180, float %2390, float %2389, !dbg !44 + %2391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i182 = icmp eq i32 %2391, 0, !dbg !44 + %2392 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3E54AE0C00000000, float %.0.i181) #6, !dbg !44 + %2393 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3E54AE0C00000000, float %.0.i181) #6, !dbg !44 + %.01.i183 = select i1 %.not4.i182, float %2393, float %2392, !dbg !44 + %2394 = bitcast float %.04.i179 to i32, !dbg !44 + %2395 = shl i32 %2394, 23, !dbg !44 + %2396 = bitcast i32 %2395 to float, !dbg !44 + %2397 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i183) #6, !dbg !44 + %2398 = fmul float %2397, %2396, !dbg !44 + %2399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i184 = icmp eq i32 %2399, 0, !dbg !44 + %2400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2401 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i185 = select i1 %.not.i184, float %2401, float %2400, !dbg !44 + %2402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i186 = icmp eq i32 %2402, 0, !dbg !44 + %2403 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i185) #6, !dbg !44 + %2404 = tail call float @llvm.nvvm.saturate.f(float %.02.i185) #6, !dbg !44 + %.03.i187 = select i1 %.not1.i186, float %2404, float %2403, !dbg !44 + %2405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i188 = icmp eq i32 %2405, 0, !dbg !44 + %2406 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2407 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i189 = select i1 %.not2.i188, float %2407, float %2406, !dbg !44 + %2408 = fadd float %.04.i189, 0xC168000FE0000000, !dbg !44 + %2409 = fneg float %2408, !dbg !44 + %2410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i190 = icmp eq i32 %2410, 0, !dbg !44 + %2411 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3FF7154760000000, float %2409) #6, !dbg !44 + %2412 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3FF7154760000000, float %2409) #6, !dbg !44 + %.0.i191 = select i1 %.not3.i190, float %2412, float %2411, !dbg !44 + %2413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i192 = icmp eq i32 %2413, 0, !dbg !44 + %2414 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3E54AE0C00000000, float %.0.i191) #6, !dbg !44 + %2415 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3E54AE0C00000000, float %.0.i191) #6, !dbg !44 + %.01.i193 = select i1 %.not4.i192, float %2415, float %2414, !dbg !44 + %2416 = bitcast float %.04.i189 to i32, !dbg !44 + %2417 = shl i32 %2416, 23, !dbg !44 + %2418 = bitcast i32 %2417 to float, !dbg !44 + %2419 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i193) #6, !dbg !44 + %2420 = fmul float %2419, %2418, !dbg !44 + %2421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i194 = icmp eq i32 %2421, 0, !dbg !44 + %2422 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2423 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i195 = select i1 %.not.i194, float %2423, float %2422, !dbg !44 + %2424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i196 = icmp eq i32 %2424, 0, !dbg !44 + %2425 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i195) #6, !dbg !44 + %2426 = tail call float @llvm.nvvm.saturate.f(float %.02.i195) #6, !dbg !44 + %.03.i197 = select i1 %.not1.i196, float %2426, float %2425, !dbg !44 + %2427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i198 = icmp eq i32 %2427, 0, !dbg !44 + %2428 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2429 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i199 = select i1 %.not2.i198, float %2429, float %2428, !dbg !44 + %2430 = fadd float %.04.i199, 0xC168000FE0000000, !dbg !44 + %2431 = fneg float %2430, !dbg !44 + %2432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i200 = icmp eq i32 %2432, 0, !dbg !44 + %2433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3FF7154760000000, float %2431) #6, !dbg !44 + %2434 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3FF7154760000000, float %2431) #6, !dbg !44 + %.0.i201 = select i1 %.not3.i200, float %2434, float %2433, !dbg !44 + %2435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i202 = icmp eq i32 %2435, 0, !dbg !44 + %2436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3E54AE0C00000000, float %.0.i201) #6, !dbg !44 + %2437 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3E54AE0C00000000, float %.0.i201) #6, !dbg !44 + %.01.i203 = select i1 %.not4.i202, float %2437, float %2436, !dbg !44 + %2438 = bitcast float %.04.i199 to i32, !dbg !44 + %2439 = shl i32 %2438, 23, !dbg !44 + %2440 = bitcast i32 %2439 to float, !dbg !44 + %2441 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i203) #6, !dbg !44 + %2442 = fmul float %2441, %2440, !dbg !44 + %2443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i204 = icmp eq i32 %2443, 0, !dbg !44 + %2444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2445 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i205 = select i1 %.not.i204, float %2445, float %2444, !dbg !44 + %2446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i206 = icmp eq i32 %2446, 0, !dbg !44 + %2447 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i205) #6, !dbg !44 + %2448 = tail call float @llvm.nvvm.saturate.f(float %.02.i205) #6, !dbg !44 + %.03.i207 = select i1 %.not1.i206, float %2448, float %2447, !dbg !44 + %2449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i208 = icmp eq i32 %2449, 0, !dbg !44 + %2450 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2451 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i209 = select i1 %.not2.i208, float %2451, float %2450, !dbg !44 + %2452 = fadd float %.04.i209, 0xC168000FE0000000, !dbg !44 + %2453 = fneg float %2452, !dbg !44 + %2454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i210 = icmp eq i32 %2454, 0, !dbg !44 + %2455 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3FF7154760000000, float %2453) #6, !dbg !44 + %2456 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3FF7154760000000, float %2453) #6, !dbg !44 + %.0.i211 = select i1 %.not3.i210, float %2456, float %2455, !dbg !44 + %2457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i212 = icmp eq i32 %2457, 0, !dbg !44 + %2458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3E54AE0C00000000, float %.0.i211) #6, !dbg !44 + %2459 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3E54AE0C00000000, float %.0.i211) #6, !dbg !44 + %.01.i213 = select i1 %.not4.i212, float %2459, float %2458, !dbg !44 + %2460 = bitcast float %.04.i209 to i32, !dbg !44 + %2461 = shl i32 %2460, 23, !dbg !44 + %2462 = bitcast i32 %2461 to float, !dbg !44 + %2463 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i213) #6, !dbg !44 + %2464 = fmul float %2463, %2462, !dbg !44 + %2465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i214 = icmp eq i32 %2465, 0, !dbg !44 + %2466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2467 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i215 = select i1 %.not.i214, float %2467, float %2466, !dbg !44 + %2468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i216 = icmp eq i32 %2468, 0, !dbg !44 + %2469 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i215) #6, !dbg !44 + %2470 = tail call float @llvm.nvvm.saturate.f(float %.02.i215) #6, !dbg !44 + %.03.i217 = select i1 %.not1.i216, float %2470, float %2469, !dbg !44 + %2471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i218 = icmp eq i32 %2471, 0, !dbg !44 + %2472 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2473 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i219 = select i1 %.not2.i218, float %2473, float %2472, !dbg !44 + %2474 = fadd float %.04.i219, 0xC168000FE0000000, !dbg !44 + %2475 = fneg float %2474, !dbg !44 + %2476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i220 = icmp eq i32 %2476, 0, !dbg !44 + %2477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3FF7154760000000, float %2475) #6, !dbg !44 + %2478 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3FF7154760000000, float %2475) #6, !dbg !44 + %.0.i221 = select i1 %.not3.i220, float %2478, float %2477, !dbg !44 + %2479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i222 = icmp eq i32 %2479, 0, !dbg !44 + %2480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3E54AE0C00000000, float %.0.i221) #6, !dbg !44 + %2481 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3E54AE0C00000000, float %.0.i221) #6, !dbg !44 + %.01.i223 = select i1 %.not4.i222, float %2481, float %2480, !dbg !44 + %2482 = bitcast float %.04.i219 to i32, !dbg !44 + %2483 = shl i32 %2482, 23, !dbg !44 + %2484 = bitcast i32 %2483 to float, !dbg !44 + %2485 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i223) #6, !dbg !44 + %2486 = fmul float %2485, %2484, !dbg !44 + %2487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i224 = icmp eq i32 %2487, 0, !dbg !44 + %2488 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2489 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i225 = select i1 %.not.i224, float %2489, float %2488, !dbg !44 + %2490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i226 = icmp eq i32 %2490, 0, !dbg !44 + %2491 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i225) #6, !dbg !44 + %2492 = tail call float @llvm.nvvm.saturate.f(float %.02.i225) #6, !dbg !44 + %.03.i227 = select i1 %.not1.i226, float %2492, float %2491, !dbg !44 + %2493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i228 = icmp eq i32 %2493, 0, !dbg !44 + %2494 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2495 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i229 = select i1 %.not2.i228, float %2495, float %2494, !dbg !44 + %2496 = fadd float %.04.i229, 0xC168000FE0000000, !dbg !44 + %2497 = fneg float %2496, !dbg !44 + %2498 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i230 = icmp eq i32 %2498, 0, !dbg !44 + %2499 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3FF7154760000000, float %2497) #6, !dbg !44 + %2500 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3FF7154760000000, float %2497) #6, !dbg !44 + %.0.i231 = select i1 %.not3.i230, float %2500, float %2499, !dbg !44 + %2501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i232 = icmp eq i32 %2501, 0, !dbg !44 + %2502 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3E54AE0C00000000, float %.0.i231) #6, !dbg !44 + %2503 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3E54AE0C00000000, float %.0.i231) #6, !dbg !44 + %.01.i233 = select i1 %.not4.i232, float %2503, float %2502, !dbg !44 + %2504 = bitcast float %.04.i229 to i32, !dbg !44 + %2505 = shl i32 %2504, 23, !dbg !44 + %2506 = bitcast i32 %2505 to float, !dbg !44 + %2507 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i233) #6, !dbg !44 + %2508 = fmul float %2507, %2506, !dbg !44 + %2509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i234 = icmp eq i32 %2509, 0, !dbg !44 + %2510 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2511 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i235 = select i1 %.not.i234, float %2511, float %2510, !dbg !44 + %2512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i236 = icmp eq i32 %2512, 0, !dbg !44 + %2513 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i235) #6, !dbg !44 + %2514 = tail call float @llvm.nvvm.saturate.f(float %.02.i235) #6, !dbg !44 + %.03.i237 = select i1 %.not1.i236, float %2514, float %2513, !dbg !44 + %2515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i238 = icmp eq i32 %2515, 0, !dbg !44 + %2516 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2517 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i239 = select i1 %.not2.i238, float %2517, float %2516, !dbg !44 + %2518 = fadd float %.04.i239, 0xC168000FE0000000, !dbg !44 + %2519 = fneg float %2518, !dbg !44 + %2520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i240 = icmp eq i32 %2520, 0, !dbg !44 + %2521 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3FF7154760000000, float %2519) #6, !dbg !44 + %2522 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3FF7154760000000, float %2519) #6, !dbg !44 + %.0.i241 = select i1 %.not3.i240, float %2522, float %2521, !dbg !44 + %2523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i242 = icmp eq i32 %2523, 0, !dbg !44 + %2524 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3E54AE0C00000000, float %.0.i241) #6, !dbg !44 + %2525 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3E54AE0C00000000, float %.0.i241) #6, !dbg !44 + %.01.i243 = select i1 %.not4.i242, float %2525, float %2524, !dbg !44 + %2526 = bitcast float %.04.i239 to i32, !dbg !44 + %2527 = shl i32 %2526, 23, !dbg !44 + %2528 = bitcast i32 %2527 to float, !dbg !44 + %2529 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i243) #6, !dbg !44 + %2530 = fmul float %2529, %2528, !dbg !44 + %2531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i244 = icmp eq i32 %2531, 0, !dbg !44 + %2532 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2533 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i245 = select i1 %.not.i244, float %2533, float %2532, !dbg !44 + %2534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i246 = icmp eq i32 %2534, 0, !dbg !44 + %2535 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i245) #6, !dbg !44 + %2536 = tail call float @llvm.nvvm.saturate.f(float %.02.i245) #6, !dbg !44 + %.03.i247 = select i1 %.not1.i246, float %2536, float %2535, !dbg !44 + %2537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i248 = icmp eq i32 %2537, 0, !dbg !44 + %2538 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2539 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i249 = select i1 %.not2.i248, float %2539, float %2538, !dbg !44 + %2540 = fadd float %.04.i249, 0xC168000FE0000000, !dbg !44 + %2541 = fneg float %2540, !dbg !44 + %2542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i250 = icmp eq i32 %2542, 0, !dbg !44 + %2543 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3FF7154760000000, float %2541) #6, !dbg !44 + %2544 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3FF7154760000000, float %2541) #6, !dbg !44 + %.0.i251 = select i1 %.not3.i250, float %2544, float %2543, !dbg !44 + %2545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i252 = icmp eq i32 %2545, 0, !dbg !44 + %2546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3E54AE0C00000000, float %.0.i251) #6, !dbg !44 + %2547 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3E54AE0C00000000, float %.0.i251) #6, !dbg !44 + %.01.i253 = select i1 %.not4.i252, float %2547, float %2546, !dbg !44 + %2548 = bitcast float %.04.i249 to i32, !dbg !44 + %2549 = shl i32 %2548, 23, !dbg !44 + %2550 = bitcast i32 %2549 to float, !dbg !44 + %2551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i253) #6, !dbg !44 + %2552 = fmul float %2551, %2550, !dbg !44 + %2553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i254 = icmp eq i32 %2553, 0, !dbg !44 + %2554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2555 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i255 = select i1 %.not.i254, float %2555, float %2554, !dbg !44 + %2556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i256 = icmp eq i32 %2556, 0, !dbg !44 + %2557 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i255) #6, !dbg !44 + %2558 = tail call float @llvm.nvvm.saturate.f(float %.02.i255) #6, !dbg !44 + %.03.i257 = select i1 %.not1.i256, float %2558, float %2557, !dbg !44 + %2559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i258 = icmp eq i32 %2559, 0, !dbg !44 + %2560 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2561 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i259 = select i1 %.not2.i258, float %2561, float %2560, !dbg !44 + %2562 = fadd float %.04.i259, 0xC168000FE0000000, !dbg !44 + %2563 = fneg float %2562, !dbg !44 + %2564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i260 = icmp eq i32 %2564, 0, !dbg !44 + %2565 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3FF7154760000000, float %2563) #6, !dbg !44 + %2566 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3FF7154760000000, float %2563) #6, !dbg !44 + %.0.i261 = select i1 %.not3.i260, float %2566, float %2565, !dbg !44 + %2567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i262 = icmp eq i32 %2567, 0, !dbg !44 + %2568 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3E54AE0C00000000, float %.0.i261) #6, !dbg !44 + %2569 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3E54AE0C00000000, float %.0.i261) #6, !dbg !44 + %.01.i263 = select i1 %.not4.i262, float %2569, float %2568, !dbg !44 + %2570 = bitcast float %.04.i259 to i32, !dbg !44 + %2571 = shl i32 %2570, 23, !dbg !44 + %2572 = bitcast i32 %2571 to float, !dbg !44 + %2573 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i263) #6, !dbg !44 + %2574 = fmul float %2573, %2572, !dbg !44 + %2575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i264 = icmp eq i32 %2575, 0, !dbg !44 + %2576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2577 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i265 = select i1 %.not.i264, float %2577, float %2576, !dbg !44 + %2578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i266 = icmp eq i32 %2578, 0, !dbg !44 + %2579 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i265) #6, !dbg !44 + %2580 = tail call float @llvm.nvvm.saturate.f(float %.02.i265) #6, !dbg !44 + %.03.i267 = select i1 %.not1.i266, float %2580, float %2579, !dbg !44 + %2581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i268 = icmp eq i32 %2581, 0, !dbg !44 + %2582 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2583 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i269 = select i1 %.not2.i268, float %2583, float %2582, !dbg !44 + %2584 = fadd float %.04.i269, 0xC168000FE0000000, !dbg !44 + %2585 = fneg float %2584, !dbg !44 + %2586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i270 = icmp eq i32 %2586, 0, !dbg !44 + %2587 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3FF7154760000000, float %2585) #6, !dbg !44 + %2588 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3FF7154760000000, float %2585) #6, !dbg !44 + %.0.i271 = select i1 %.not3.i270, float %2588, float %2587, !dbg !44 + %2589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i272 = icmp eq i32 %2589, 0, !dbg !44 + %2590 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3E54AE0C00000000, float %.0.i271) #6, !dbg !44 + %2591 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3E54AE0C00000000, float %.0.i271) #6, !dbg !44 + %.01.i273 = select i1 %.not4.i272, float %2591, float %2590, !dbg !44 + %2592 = bitcast float %.04.i269 to i32, !dbg !44 + %2593 = shl i32 %2592, 23, !dbg !44 + %2594 = bitcast i32 %2593 to float, !dbg !44 + %2595 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i273) #6, !dbg !44 + %2596 = fmul float %2595, %2594, !dbg !44 + %2597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i274 = icmp eq i32 %2597, 0, !dbg !44 + %2598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2599 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i275 = select i1 %.not.i274, float %2599, float %2598, !dbg !44 + %2600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i276 = icmp eq i32 %2600, 0, !dbg !44 + %2601 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i275) #6, !dbg !44 + %2602 = tail call float @llvm.nvvm.saturate.f(float %.02.i275) #6, !dbg !44 + %.03.i277 = select i1 %.not1.i276, float %2602, float %2601, !dbg !44 + %2603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i278 = icmp eq i32 %2603, 0, !dbg !44 + %2604 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2605 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i279 = select i1 %.not2.i278, float %2605, float %2604, !dbg !44 + %2606 = fadd float %.04.i279, 0xC168000FE0000000, !dbg !44 + %2607 = fneg float %2606, !dbg !44 + %2608 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i280 = icmp eq i32 %2608, 0, !dbg !44 + %2609 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3FF7154760000000, float %2607) #6, !dbg !44 + %2610 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3FF7154760000000, float %2607) #6, !dbg !44 + %.0.i281 = select i1 %.not3.i280, float %2610, float %2609, !dbg !44 + %2611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i282 = icmp eq i32 %2611, 0, !dbg !44 + %2612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3E54AE0C00000000, float %.0.i281) #6, !dbg !44 + %2613 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3E54AE0C00000000, float %.0.i281) #6, !dbg !44 + %.01.i283 = select i1 %.not4.i282, float %2613, float %2612, !dbg !44 + %2614 = bitcast float %.04.i279 to i32, !dbg !44 + %2615 = shl i32 %2614, 23, !dbg !44 + %2616 = bitcast i32 %2615 to float, !dbg !44 + %2617 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i283) #6, !dbg !44 + %2618 = fmul float %2617, %2616, !dbg !44 + %2619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i284 = icmp eq i32 %2619, 0, !dbg !44 + %2620 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2621 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i285 = select i1 %.not.i284, float %2621, float %2620, !dbg !44 + %2622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i286 = icmp eq i32 %2622, 0, !dbg !44 + %2623 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i285) #6, !dbg !44 + %2624 = tail call float @llvm.nvvm.saturate.f(float %.02.i285) #6, !dbg !44 + %.03.i287 = select i1 %.not1.i286, float %2624, float %2623, !dbg !44 + %2625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i288 = icmp eq i32 %2625, 0, !dbg !44 + %2626 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2627 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i289 = select i1 %.not2.i288, float %2627, float %2626, !dbg !44 + %2628 = fadd float %.04.i289, 0xC168000FE0000000, !dbg !44 + %2629 = fneg float %2628, !dbg !44 + %2630 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i290 = icmp eq i32 %2630, 0, !dbg !44 + %2631 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3FF7154760000000, float %2629) #6, !dbg !44 + %2632 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3FF7154760000000, float %2629) #6, !dbg !44 + %.0.i291 = select i1 %.not3.i290, float %2632, float %2631, !dbg !44 + %2633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i292 = icmp eq i32 %2633, 0, !dbg !44 + %2634 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3E54AE0C00000000, float %.0.i291) #6, !dbg !44 + %2635 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3E54AE0C00000000, float %.0.i291) #6, !dbg !44 + %.01.i293 = select i1 %.not4.i292, float %2635, float %2634, !dbg !44 + %2636 = bitcast float %.04.i289 to i32, !dbg !44 + %2637 = shl i32 %2636, 23, !dbg !44 + %2638 = bitcast i32 %2637 to float, !dbg !44 + %2639 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i293) #6, !dbg !44 + %2640 = fmul float %2639, %2638, !dbg !44 + %2641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i294 = icmp eq i32 %2641, 0, !dbg !44 + %2642 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2643 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i295 = select i1 %.not.i294, float %2643, float %2642, !dbg !44 + %2644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i296 = icmp eq i32 %2644, 0, !dbg !44 + %2645 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i295) #6, !dbg !44 + %2646 = tail call float @llvm.nvvm.saturate.f(float %.02.i295) #6, !dbg !44 + %.03.i297 = select i1 %.not1.i296, float %2646, float %2645, !dbg !44 + %2647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i298 = icmp eq i32 %2647, 0, !dbg !44 + %2648 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2649 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i299 = select i1 %.not2.i298, float %2649, float %2648, !dbg !44 + %2650 = fadd float %.04.i299, 0xC168000FE0000000, !dbg !44 + %2651 = fneg float %2650, !dbg !44 + %2652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i300 = icmp eq i32 %2652, 0, !dbg !44 + %2653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3FF7154760000000, float %2651) #6, !dbg !44 + %2654 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3FF7154760000000, float %2651) #6, !dbg !44 + %.0.i301 = select i1 %.not3.i300, float %2654, float %2653, !dbg !44 + %2655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i302 = icmp eq i32 %2655, 0, !dbg !44 + %2656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3E54AE0C00000000, float %.0.i301) #6, !dbg !44 + %2657 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3E54AE0C00000000, float %.0.i301) #6, !dbg !44 + %.01.i303 = select i1 %.not4.i302, float %2657, float %2656, !dbg !44 + %2658 = bitcast float %.04.i299 to i32, !dbg !44 + %2659 = shl i32 %2658, 23, !dbg !44 + %2660 = bitcast i32 %2659 to float, !dbg !44 + %2661 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i303) #6, !dbg !44 + %2662 = fmul float %2661, %2660, !dbg !44 + %2663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i304 = icmp eq i32 %2663, 0, !dbg !44 + %2664 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2665 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i305 = select i1 %.not.i304, float %2665, float %2664, !dbg !44 + %2666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i306 = icmp eq i32 %2666, 0, !dbg !44 + %2667 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i305) #6, !dbg !44 + %2668 = tail call float @llvm.nvvm.saturate.f(float %.02.i305) #6, !dbg !44 + %.03.i307 = select i1 %.not1.i306, float %2668, float %2667, !dbg !44 + %2669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i308 = icmp eq i32 %2669, 0, !dbg !44 + %2670 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2671 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i309 = select i1 %.not2.i308, float %2671, float %2670, !dbg !44 + %2672 = fadd float %.04.i309, 0xC168000FE0000000, !dbg !44 + %2673 = fneg float %2672, !dbg !44 + %2674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i310 = icmp eq i32 %2674, 0, !dbg !44 + %2675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3FF7154760000000, float %2673) #6, !dbg !44 + %2676 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3FF7154760000000, float %2673) #6, !dbg !44 + %.0.i311 = select i1 %.not3.i310, float %2676, float %2675, !dbg !44 + %2677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i312 = icmp eq i32 %2677, 0, !dbg !44 + %2678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3E54AE0C00000000, float %.0.i311) #6, !dbg !44 + %2679 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3E54AE0C00000000, float %.0.i311) #6, !dbg !44 + %.01.i313 = select i1 %.not4.i312, float %2679, float %2678, !dbg !44 + %2680 = bitcast float %.04.i309 to i32, !dbg !44 + %2681 = shl i32 %2680, 23, !dbg !44 + %2682 = bitcast i32 %2681 to float, !dbg !44 + %2683 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i313) #6, !dbg !44 + %2684 = fmul float %2683, %2682, !dbg !44 + %2685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i314 = icmp eq i32 %2685, 0, !dbg !44 + %2686 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2687 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i315 = select i1 %.not.i314, float %2687, float %2686, !dbg !44 + %2688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i316 = icmp eq i32 %2688, 0, !dbg !44 + %2689 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i315) #6, !dbg !44 + %2690 = tail call float @llvm.nvvm.saturate.f(float %.02.i315) #6, !dbg !44 + %.03.i317 = select i1 %.not1.i316, float %2690, float %2689, !dbg !44 + %2691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2692 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2693 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i320 = icmp eq i32 %2694, 0, !dbg !44 + %2695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i322 = icmp eq i32 %2695, 0, !dbg !44 + %2696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i324 = icmp eq i32 %2696, 0, !dbg !44 + %2697 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2698 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i325 = select i1 %.not.i324, float %2698, float %2697, !dbg !44 + %2699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i326 = icmp eq i32 %2699, 0, !dbg !44 + %2700 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i325) #6, !dbg !44 + %2701 = tail call float @llvm.nvvm.saturate.f(float %.02.i325) #6, !dbg !44 + %.03.i327 = select i1 %.not1.i326, float %2701, float %2700, !dbg !44 + %2702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2703 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2704 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2705 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i330 = icmp eq i32 %2705, 0, !dbg !44 + %2706 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i332 = icmp eq i32 %2706, 0, !dbg !44 + %2707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i334 = icmp eq i32 %2707, 0, !dbg !44 + %2708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2709 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i335 = select i1 %.not.i334, float %2709, float %2708, !dbg !44 + %2710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i336 = icmp eq i32 %2710, 0, !dbg !44 + %2711 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i335) #6, !dbg !44 + %2712 = tail call float @llvm.nvvm.saturate.f(float %.02.i335) #6, !dbg !44 + %.03.i337 = select i1 %.not1.i336, float %2712, float %2711, !dbg !44 + %2713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2714 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i337, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2715 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i337, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i340 = icmp eq i32 %2716, 0, !dbg !44 + %2717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i342 = icmp eq i32 %2717, 0, !dbg !44 + %2718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i344 = icmp eq i32 %2718, 0, !dbg !44 + %2719 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2720 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i345 = select i1 %.not.i344, float %2720, float %2719, !dbg !44 + %2721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i346 = icmp eq i32 %2721, 0, !dbg !44 + %2722 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i345) #6, !dbg !44 + %2723 = tail call float @llvm.nvvm.saturate.f(float %.02.i345) #6, !dbg !44 + %.03.i347 = select i1 %.not1.i346, float %2723, float %2722, !dbg !44 + %2724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2725 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i347, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2726 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i347, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2727 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i350 = icmp eq i32 %2727, 0, !dbg !44 + %2728 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i352 = icmp eq i32 %2728, 0, !dbg !44 + %2729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i354 = icmp eq i32 %2729, 0, !dbg !44 + %2730 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2731 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i355 = select i1 %.not.i354, float %2731, float %2730, !dbg !44 + %2732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i356 = icmp eq i32 %2732, 0, !dbg !44 + %2733 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i355) #6, !dbg !44 + %2734 = tail call float @llvm.nvvm.saturate.f(float %.02.i355) #6, !dbg !44 + %.03.i357 = select i1 %.not1.i356, float %2734, float %2733, !dbg !44 + %2735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2736 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i357, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2737 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i357, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i360 = icmp eq i32 %2738, 0, !dbg !44 + %2739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i362 = icmp eq i32 %2739, 0, !dbg !44 + %2740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i364 = icmp eq i32 %2740, 0, !dbg !44 + %2741 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2742 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i365 = select i1 %.not.i364, float %2742, float %2741, !dbg !44 + %2743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i366 = icmp eq i32 %2743, 0, !dbg !44 + %2744 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i365) #6, !dbg !44 + %2745 = tail call float @llvm.nvvm.saturate.f(float %.02.i365) #6, !dbg !44 + %.03.i367 = select i1 %.not1.i366, float %2745, float %2744, !dbg !44 + %2746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2747 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i367, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2748 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i367, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2749 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i370 = icmp eq i32 %2749, 0, !dbg !44 + %2750 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i372 = icmp eq i32 %2750, 0, !dbg !44 + %2751 = extractelement <16 x float> %1511, i64 0, !dbg !45 + %2752 = fmul float %2751, %2134, !dbg !45 + %2753 = extractelement <16 x float> %1511, i64 1, !dbg !45 + %2754 = fmul float %2753, %2156, !dbg !45 + %2755 = extractelement <16 x float> %1511, i64 2, !dbg !45 + %2756 = fmul float %2755, %2178, !dbg !45 + %2757 = extractelement <16 x float> %1511, i64 3, !dbg !45 + %2758 = fmul float %2757, %2200, !dbg !45 + %2759 = extractelement <16 x float> %1511, i64 4, !dbg !45 + %2760 = fmul float %2759, %2222, !dbg !45 + %2761 = extractelement <16 x float> %1511, i64 5, !dbg !45 + %2762 = fmul float %2761, %2244, !dbg !45 + %2763 = extractelement <16 x float> %1511, i64 6, !dbg !45 + %2764 = fmul float %2763, %2266, !dbg !45 + %2765 = extractelement <16 x float> %1511, i64 7, !dbg !45 + %2766 = fmul float %2765, %2288, !dbg !45 + %2767 = extractelement <16 x float> %1511, i64 8, !dbg !45 + %2768 = fmul float %2767, %2310, !dbg !45 + %2769 = extractelement <16 x float> %1511, i64 9, !dbg !45 + %2770 = fmul float %2769, %2332, !dbg !45 + %2771 = extractelement <16 x float> %1511, i64 10, !dbg !45 + %2772 = fmul float %2771, %2354, !dbg !45 + %2773 = extractelement <16 x float> %1511, i64 11, !dbg !45 + %2774 = fmul float %2773, %2376, !dbg !45 + %2775 = extractelement <16 x float> %1511, i64 12, !dbg !45 + %2776 = fmul float %2775, %2398, !dbg !45 + %2777 = extractelement <16 x float> %1511, i64 13, !dbg !45 + %2778 = fmul float %2777, %2420, !dbg !45 + %2779 = extractelement <16 x float> %1511, i64 14, !dbg !45 + %2780 = fmul float %2779, %2442, !dbg !45 + %2781 = extractelement <16 x float> %1511, i64 15, !dbg !45 + %2782 = fmul float %2781, %2464, !dbg !45 + %2783 = fmul float %1512, %2486, !dbg !45 + %2784 = fmul float %1513, %2508, !dbg !45 + %2785 = fmul float %1514, %2530, !dbg !45 + %2786 = fmul float %1515, %2552, !dbg !45 + %2787 = fmul float %1516, %2574, !dbg !45 + %2788 = fmul float %1517, %2596, !dbg !45 + %2789 = fmul float %1518, %2618, !dbg !45 + %2790 = fmul float %1519, %2640, !dbg !45 + %2791 = fmul float %1820, %2662, !dbg !45 + %2792 = fmul float %1821, %2684, !dbg !45 + %2793 = insertelement <2 x i32> poison, i32 %2691, i64 0, !dbg !44 + %2794 = insertelement <2 x i32> %2793, i32 %2702, i64 1, !dbg !44 + %2795 = icmp eq <2 x i32> %2794, zeroinitializer, !dbg !44 + %2796 = insertelement <2 x float> poison, float %2693, i64 0, !dbg !44 + %2797 = insertelement <2 x float> %2796, float %2704, i64 1, !dbg !44 + %2798 = insertelement <2 x float> poison, float %2692, i64 0, !dbg !44 + %2799 = insertelement <2 x float> %2798, float %2703, i64 1, !dbg !44 + %2800 = select <2 x i1> %2795, <2 x float> %2797, <2 x float> %2799, !dbg !44 + %2801 = extractelement <2 x float> %2800, i64 0, !dbg !44 + %2802 = fadd float %2801, 0xC168000FE0000000, !dbg !44 + %2803 = fneg float %2802, !dbg !44 + %2804 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3FF7154760000000, float %2803) #6, !dbg !44 + %2805 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3FF7154760000000, float %2803) #6, !dbg !44 + %.0.i321 = select i1 %.not3.i320, float %2805, float %2804, !dbg !44 + %2806 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3E54AE0C00000000, float %.0.i321) #6, !dbg !44 + %2807 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3E54AE0C00000000, float %.0.i321) #6, !dbg !44 + %.01.i323 = select i1 %.not4.i322, float %2807, float %2806, !dbg !44 + %2808 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i323) #6, !dbg !44 + %2809 = extractelement <2 x float> %2800, i64 1, !dbg !44 + %2810 = fadd float %2809, 0xC168000FE0000000, !dbg !44 + %2811 = fneg float %2810, !dbg !44 + %2812 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3FF7154760000000, float %2811) #6, !dbg !44 + %2813 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3FF7154760000000, float %2811) #6, !dbg !44 + %.0.i331 = select i1 %.not3.i330, float %2813, float %2812, !dbg !44 + %2814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3E54AE0C00000000, float %.0.i331) #6, !dbg !44 + %2815 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3E54AE0C00000000, float %.0.i331) #6, !dbg !44 + %.01.i333 = select i1 %.not4.i332, float %2815, float %2814, !dbg !44 + %2816 = bitcast <2 x float> %2800 to <2 x i32>, !dbg !44 + %2817 = shl <2 x i32> %2816, splat (i32 23), !dbg !44 + %2818 = bitcast <2 x i32> %2817 to <2 x float>, !dbg !44 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i333) #6, !dbg !44 + %2820 = insertelement <2 x float> poison, float %2808, i64 0, !dbg !44 + %2821 = insertelement <2 x float> %2820, float %2819, i64 1, !dbg !44 + %2822 = fmul <2 x float> %2821, %2818, !dbg !44 + %2823 = fmul <2 x float> %1822, %2822, !dbg !45 + %2824 = insertelement <2 x i32> poison, i32 %2713, i64 0, !dbg !44 + %2825 = insertelement <2 x i32> %2824, i32 %2724, i64 1, !dbg !44 + %2826 = icmp eq <2 x i32> %2825, zeroinitializer, !dbg !44 + %2827 = insertelement <2 x float> poison, float %2715, i64 0, !dbg !44 + %2828 = insertelement <2 x float> %2827, float %2726, i64 1, !dbg !44 + %2829 = insertelement <2 x float> poison, float %2714, i64 0, !dbg !44 + %2830 = insertelement <2 x float> %2829, float %2725, i64 1, !dbg !44 + %2831 = select <2 x i1> %2826, <2 x float> %2828, <2 x float> %2830, !dbg !44 + %2832 = extractelement <2 x float> %2831, i64 0, !dbg !44 + %2833 = fadd float %2832, 0xC168000FE0000000, !dbg !44 + %2834 = fneg float %2833, !dbg !44 + %2835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3FF7154760000000, float %2834) #6, !dbg !44 + %2836 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3FF7154760000000, float %2834) #6, !dbg !44 + %.0.i341 = select i1 %.not3.i340, float %2836, float %2835, !dbg !44 + %2837 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3E54AE0C00000000, float %.0.i341) #6, !dbg !44 + %2838 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3E54AE0C00000000, float %.0.i341) #6, !dbg !44 + %.01.i343 = select i1 %.not4.i342, float %2838, float %2837, !dbg !44 + %2839 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i343) #6, !dbg !44 + %2840 = extractelement <2 x float> %2831, i64 1, !dbg !44 + %2841 = fadd float %2840, 0xC168000FE0000000, !dbg !44 + %2842 = fneg float %2841, !dbg !44 + %2843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3FF7154760000000, float %2842) #6, !dbg !44 + %2844 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3FF7154760000000, float %2842) #6, !dbg !44 + %.0.i351 = select i1 %.not3.i350, float %2844, float %2843, !dbg !44 + %2845 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3E54AE0C00000000, float %.0.i351) #6, !dbg !44 + %2846 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3E54AE0C00000000, float %.0.i351) #6, !dbg !44 + %.01.i353 = select i1 %.not4.i352, float %2846, float %2845, !dbg !44 + %2847 = bitcast <2 x float> %2831 to <2 x i32>, !dbg !44 + %2848 = shl <2 x i32> %2847, splat (i32 23), !dbg !44 + %2849 = bitcast <2 x i32> %2848 to <2 x float>, !dbg !44 + %2850 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i353) #6, !dbg !44 + %2851 = insertelement <2 x float> poison, float %2839, i64 0, !dbg !44 + %2852 = insertelement <2 x float> %2851, float %2850, i64 1, !dbg !44 + %2853 = fmul <2 x float> %2852, %2849, !dbg !44 + %2854 = fmul <2 x float> %1823, %2853, !dbg !45 + %2855 = insertelement <2 x i32> poison, i32 %2735, i64 0, !dbg !44 + %2856 = insertelement <2 x i32> %2855, i32 %2746, i64 1, !dbg !44 + %2857 = icmp eq <2 x i32> %2856, zeroinitializer, !dbg !44 + %2858 = insertelement <2 x float> poison, float %2737, i64 0, !dbg !44 + %2859 = insertelement <2 x float> %2858, float %2748, i64 1, !dbg !44 + %2860 = insertelement <2 x float> poison, float %2736, i64 0, !dbg !44 + %2861 = insertelement <2 x float> %2860, float %2747, i64 1, !dbg !44 + %2862 = select <2 x i1> %2857, <2 x float> %2859, <2 x float> %2861, !dbg !44 + %2863 = extractelement <2 x float> %2862, i64 0, !dbg !44 + %2864 = fadd float %2863, 0xC168000FE0000000, !dbg !44 + %2865 = fneg float %2864, !dbg !44 + %2866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3FF7154760000000, float %2865) #6, !dbg !44 + %2867 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3FF7154760000000, float %2865) #6, !dbg !44 + %.0.i361 = select i1 %.not3.i360, float %2867, float %2866, !dbg !44 + %2868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3E54AE0C00000000, float %.0.i361) #6, !dbg !44 + %2869 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3E54AE0C00000000, float %.0.i361) #6, !dbg !44 + %.01.i363 = select i1 %.not4.i362, float %2869, float %2868, !dbg !44 + %2870 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i363) #6, !dbg !44 + %2871 = extractelement <2 x float> %2862, i64 1, !dbg !44 + %2872 = fadd float %2871, 0xC168000FE0000000, !dbg !44 + %2873 = fneg float %2872, !dbg !44 + %2874 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3FF7154760000000, float %2873) #6, !dbg !44 + %2875 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3FF7154760000000, float %2873) #6, !dbg !44 + %.0.i371 = select i1 %.not3.i370, float %2875, float %2874, !dbg !44 + %2876 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3E54AE0C00000000, float %.0.i371) #6, !dbg !44 + %2877 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3E54AE0C00000000, float %.0.i371) #6, !dbg !44 + %.01.i373 = select i1 %.not4.i372, float %2877, float %2876, !dbg !44 + %2878 = bitcast <2 x float> %2862 to <2 x i32>, !dbg !44 + %2879 = shl <2 x i32> %2878, splat (i32 23), !dbg !44 + %2880 = bitcast <2 x i32> %2879 to <2 x float>, !dbg !44 + %2881 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i373) #6, !dbg !44 + %2882 = insertelement <2 x float> poison, float %2870, i64 0, !dbg !44 + %2883 = insertelement <2 x float> %2882, float %2881, i64 1, !dbg !44 + %2884 = fmul <2 x float> %2883, %2880, !dbg !44 + %2885 = fmul <2 x float> %1824, %2884, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2886 = fadd float %2752, %2754, !dbg !49 + %2887 = fadd float %2886, %2756, !dbg !49 + %2888 = fadd float %2887, %2758, !dbg !49 + %2889 = fadd float %2888, %2760, !dbg !49 + %2890 = fadd float %2889, %2762, !dbg !49 + %2891 = fadd float %2890, %2764, !dbg !49 + %2892 = fadd float %2891, %2766, !dbg !49 + %2893 = fadd float %2892, %2768, !dbg !49 + %2894 = fadd float %2893, %2770, !dbg !49 + %2895 = fadd float %2894, %2772, !dbg !49 + %2896 = fadd float %2895, %2774, !dbg !49 + %2897 = fadd float %2896, %2776, !dbg !49 + %2898 = fadd float %2897, %2778, !dbg !49 + %2899 = fadd float %2898, %2780, !dbg !49 + %2900 = fadd float %2899, %2782, !dbg !49 + %2901 = fadd float %2900, %2783, !dbg !49 + %2902 = fadd float %2901, %2784, !dbg !49 + %2903 = fadd float %2902, %2785, !dbg !49 + %2904 = fadd float %2903, %2786, !dbg !49 + %2905 = fadd float %2904, %2787, !dbg !49 + %2906 = fadd float %2905, %2788, !dbg !49 + %2907 = fadd float %2906, %2789, !dbg !49 + %2908 = fadd float %2907, %2790, !dbg !49 + %2909 = fadd float %2908, %2791, !dbg !49 + %2910 = fadd float %2909, %2792, !dbg !49 + %2911 = extractelement <2 x float> %2823, i64 0, !dbg !49 + %2912 = fadd float %2910, %2911, !dbg !49 + %2913 = extractelement <2 x float> %2823, i64 1, !dbg !49 + %2914 = fadd float %2912, %2913, !dbg !49 + %2915 = extractelement <2 x float> %2854, i64 0, !dbg !49 + %2916 = fadd float %2914, %2915, !dbg !49 + %2917 = extractelement <2 x float> %2854, i64 1, !dbg !49 + %2918 = fadd float %2916, %2917, !dbg !49 + %2919 = extractelement <2 x float> %2885, i64 0, !dbg !49 + %2920 = fadd float %2918, %2919, !dbg !49 + %2921 = extractelement <2 x float> %2885, i64 1, !dbg !49 + %2922 = fadd float %2920, %2921, !dbg !49 + %2923 = bitcast float %2922 to i32, !dbg !46 + %2924 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2923, i32 16, i32 31), !dbg !46 + %2925 = bitcast i32 %2924 to float, !dbg !46 + %2926 = fadd float %2922, %2925, !dbg !49 + %2927 = bitcast float %2926 to i32, !dbg !46 + %2928 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2927, i32 8, i32 31), !dbg !46 + %2929 = bitcast i32 %2928 to float, !dbg !46 + %2930 = fadd float %2926, %2929, !dbg !49 + %2931 = bitcast float %2930 to i32, !dbg !46 + %2932 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2931, i32 4, i32 31), !dbg !46 + %2933 = bitcast i32 %2932 to float, !dbg !46 + %2934 = fadd float %2930, %2933, !dbg !49 + %2935 = bitcast float %2934 to i32, !dbg !46 + %2936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2935, i32 2, i32 31), !dbg !46 + %2937 = bitcast i32 %2936 to float, !dbg !46 + %2938 = fadd float %2934, %2937, !dbg !49 + %2939 = bitcast float %2938 to i32, !dbg !46 + %2940 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2939, i32 1, i32 31), !dbg !46 + %2941 = bitcast i32 %2940 to float, !dbg !46 + %2942 = fadd float %2938, %2941, !dbg !49 + %2943 = bitcast float %2942 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2011, <1 x i32> %2943, i1 %2010) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2944 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %2015, i1 %2014) #6, !dbg !46 + %2945 = bitcast i32 %2944 to float, !dbg !46 + %2946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2944, i32 8, i32 31), !dbg !46 + %2947 = bitcast i32 %2946 to float, !dbg !46 + %2948 = fadd float %2945, %2947, !dbg !49 + %2949 = bitcast float %2948 to i32, !dbg !46 + %2950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2949, i32 4, i32 31), !dbg !46 + %2951 = bitcast i32 %2950 to float, !dbg !46 + %2952 = fadd float %2948, %2951, !dbg !49 + %2953 = bitcast float %2952 to i32, !dbg !46 + %2954 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2953, i32 2, i32 31), !dbg !46 + %2955 = bitcast i32 %2954 to float, !dbg !46 + %2956 = fadd float %2952, %2955, !dbg !49 + %2957 = bitcast float %2956 to i32, !dbg !46 + %2958 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2957, i32 1, i32 31), !dbg !46 + %2959 = bitcast i32 %2958 to float, !dbg !46 + %2960 = fadd float %2956, %2959, !dbg !49 + %2961 = bitcast float %2960 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2015, <1 x i32> %2961, i1 %2044) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2962 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !46 + %2963 = shl nuw nsw i32 %9, 4 + %2964 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2963 + %2965 = xor i32 %2963, 8256 + %2966 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2965 + %2967 = shl nuw nsw i32 %8, 3 + %2968 = and i32 %2967, 4080 + %2969 = and i32 %8, 1 + %2970 = icmp eq i32 %2969, 0 + %2971 = select i1 %2970, i32 0, i32 8256 + %2972 = xor i32 %2971, %2968 + %2973 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2972 + %2974 = getelementptr inbounds nuw i8, ptr addrspace(3) %2973, i32 4096 + %2975 = zext nneg i32 %1844 to i64, !dbg !50 + br label %2976, !dbg !50 + +2976: ; preds = %1825, %2976 + %2977 = phi i1 [ true, %1825 ], [ false, %2976 ] + %indvars.iv1398 = phi i64 [ 0, %1825 ], [ 16384, %2976 ] + %2978 = or disjoint i64 %indvars.iv1398, %15, !dbg !51 + %2979 = or disjoint i64 %indvars.iv1398, %2975, !dbg !51 + %2980 = or disjoint i64 %2979, 14336, !dbg !51 + %2981 = icmp samesign ult i64 %2978, 32000, !dbg !52 + %2982 = icmp samesign ult i64 %2980, 32000, !dbg !52 + %2983 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2984 = or disjoint i32 %10, %2983, !dbg !53 + %2985 = add i32 %2984, %14, !dbg !53 + %2986 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2987 = or disjoint i32 %11, %2986, !dbg !53 + %2988 = add i32 %2987, %14, !dbg !53 + %2989 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2990 = or disjoint i32 %12, %2989, !dbg !53 + %2991 = add i32 %2990, %14, !dbg !53 + %2992 = trunc nuw nsw i64 %2978 to i32, !dbg !53 + %2993 = add i32 %14, %2992, !dbg !53 + %2994 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %2995 = add i32 %14, %2994, !dbg !53 + %2996 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %2997 = or disjoint i32 %2996, 2048, !dbg !53 + %2998 = add i32 %2997, %14, !dbg !53 + %2999 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3000 = or disjoint i32 %2999, 4096, !dbg !53 + %3001 = add i32 %3000, %14, !dbg !53 + %3002 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3003 = or disjoint i32 %3002, 6144, !dbg !53 + %3004 = add i32 %3003, %14, !dbg !53 + %3005 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3006 = or disjoint i32 %3005, 8192, !dbg !53 + %3007 = add i32 %3006, %14, !dbg !53 + %3008 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3009 = or disjoint i32 %3008, 10240, !dbg !53 + %3010 = add i32 %3009, %14, !dbg !53 + %3011 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3012 = or disjoint i32 %3011, 12288, !dbg !53 + %3013 = add i32 %3012, %14, !dbg !53 + %3014 = trunc nuw nsw i64 %2980 to i32, !dbg !53 + %3015 = add i32 %14, %3014, !dbg !53 + %3016 = sext i32 %2985 to i64, !dbg !54 + %3017 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3016, !dbg !54 + %3018 = sext i32 %2988 to i64, !dbg !54 + %3019 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3018, !dbg !54 + %3020 = sext i32 %2991 to i64, !dbg !54 + %3021 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3020, !dbg !54 + %3022 = sext i32 %2993 to i64, !dbg !54 + %3023 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3022, !dbg !54 + %3024 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3025 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3017, i64 %3024, i1 true) #6, !dbg !55 + %3026 = extractvalue { i32, i32, i32, i32 } %3025, 0, !dbg !55 + %3027 = bitcast i32 %3026 to <2 x bfloat>, !dbg !55 + %3028 = extractvalue { i32, i32, i32, i32 } %3025, 1, !dbg !55 + %3029 = bitcast i32 %3028 to <2 x bfloat>, !dbg !55 + %3030 = extractvalue { i32, i32, i32, i32 } %3025, 2, !dbg !55 + %3031 = bitcast i32 %3030 to <2 x bfloat>, !dbg !55 + %3032 = extractvalue { i32, i32, i32, i32 } %3025, 3, !dbg !55 + %3033 = bitcast i32 %3032 to <2 x bfloat>, !dbg !55 + %3034 = extractelement <2 x bfloat> %3027, i64 0, !dbg !55 + %3035 = extractelement <2 x bfloat> %3027, i64 1, !dbg !55 + %3036 = extractelement <2 x bfloat> %3029, i64 0, !dbg !55 + %3037 = extractelement <2 x bfloat> %3029, i64 1, !dbg !55 + %3038 = extractelement <2 x bfloat> %3031, i64 0, !dbg !55 + %3039 = extractelement <2 x bfloat> %3031, i64 1, !dbg !55 + %3040 = extractelement <2 x bfloat> %3033, i64 0, !dbg !55 + %3041 = extractelement <2 x bfloat> %3033, i64 1, !dbg !55 + %3042 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3043 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3019, i64 %3042, i1 true) #6, !dbg !55 + %3044 = extractvalue { i32, i32, i32, i32 } %3043, 0, !dbg !55 + %3045 = bitcast i32 %3044 to <2 x bfloat>, !dbg !55 + %3046 = extractvalue { i32, i32, i32, i32 } %3043, 1, !dbg !55 + %3047 = bitcast i32 %3046 to <2 x bfloat>, !dbg !55 + %3048 = extractvalue { i32, i32, i32, i32 } %3043, 2, !dbg !55 + %3049 = bitcast i32 %3048 to <2 x bfloat>, !dbg !55 + %3050 = extractvalue { i32, i32, i32, i32 } %3043, 3, !dbg !55 + %3051 = bitcast i32 %3050 to <2 x bfloat>, !dbg !55 + %3052 = extractelement <2 x bfloat> %3045, i64 0, !dbg !55 + %3053 = extractelement <2 x bfloat> %3045, i64 1, !dbg !55 + %3054 = extractelement <2 x bfloat> %3047, i64 0, !dbg !55 + %3055 = extractelement <2 x bfloat> %3047, i64 1, !dbg !55 + %3056 = extractelement <2 x bfloat> %3049, i64 0, !dbg !55 + %3057 = extractelement <2 x bfloat> %3049, i64 1, !dbg !55 + %3058 = extractelement <2 x bfloat> %3051, i64 0, !dbg !55 + %3059 = extractelement <2 x bfloat> %3051, i64 1, !dbg !55 + %3060 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3061 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3021, i64 %3060, i1 true) #6, !dbg !55 + %3062 = extractvalue { i32, i32, i32, i32 } %3061, 0, !dbg !55 + %3063 = bitcast i32 %3062 to <2 x bfloat>, !dbg !55 + %3064 = extractvalue { i32, i32, i32, i32 } %3061, 1, !dbg !55 + %3065 = bitcast i32 %3064 to <2 x bfloat>, !dbg !55 + %3066 = extractvalue { i32, i32, i32, i32 } %3061, 2, !dbg !55 + %3067 = bitcast i32 %3066 to <2 x bfloat>, !dbg !55 + %3068 = extractvalue { i32, i32, i32, i32 } %3061, 3, !dbg !55 + %3069 = bitcast i32 %3068 to <2 x bfloat>, !dbg !55 + %3070 = extractelement <2 x bfloat> %3063, i64 0, !dbg !55 + %3071 = extractelement <2 x bfloat> %3063, i64 1, !dbg !55 + %3072 = extractelement <2 x bfloat> %3065, i64 0, !dbg !55 + %3073 = extractelement <2 x bfloat> %3065, i64 1, !dbg !55 + %3074 = extractelement <2 x bfloat> %3067, i64 0, !dbg !55 + %3075 = extractelement <2 x bfloat> %3067, i64 1, !dbg !55 + %3076 = extractelement <2 x bfloat> %3069, i64 0, !dbg !55 + %3077 = extractelement <2 x bfloat> %3069, i64 1, !dbg !55 + %3078 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3079 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3023, i64 %3078, i1 %2981) #6, !dbg !55 + %3080 = extractvalue { i32, i32, i32, i32 } %3079, 0, !dbg !55 + %3081 = bitcast i32 %3080 to <2 x bfloat>, !dbg !55 + %3082 = extractvalue { i32, i32, i32, i32 } %3079, 1, !dbg !55 + %3083 = bitcast i32 %3082 to <2 x bfloat>, !dbg !55 + %3084 = extractvalue { i32, i32, i32, i32 } %3079, 2, !dbg !55 + %3085 = bitcast i32 %3084 to <2 x bfloat>, !dbg !55 + %3086 = extractvalue { i32, i32, i32, i32 } %3079, 3, !dbg !55 + %3087 = bitcast i32 %3086 to <2 x bfloat>, !dbg !55 + %3088 = extractelement <2 x bfloat> %3081, i64 0, !dbg !55 + %3089 = extractelement <2 x bfloat> %3081, i64 1, !dbg !55 + %3090 = extractelement <2 x bfloat> %3083, i64 0, !dbg !55 + %3091 = extractelement <2 x bfloat> %3083, i64 1, !dbg !55 + %3092 = extractelement <2 x bfloat> %3085, i64 0, !dbg !55 + %3093 = extractelement <2 x bfloat> %3085, i64 1, !dbg !55 + %3094 = extractelement <2 x bfloat> %3087, i64 0, !dbg !55 + %3095 = extractelement <2 x bfloat> %3087, i64 1, !dbg !55 + %3096 = fpext bfloat %3034 to float, !dbg !56 + %3097 = fpext bfloat %3035 to float, !dbg !56 + %3098 = fpext bfloat %3036 to float, !dbg !56 + %3099 = fpext bfloat %3037 to float, !dbg !56 + %3100 = fpext bfloat %3038 to float, !dbg !56 + %3101 = fpext bfloat %3039 to float, !dbg !56 + %3102 = fpext bfloat %3040 to float, !dbg !56 + %3103 = fpext bfloat %3041 to float, !dbg !56 + %3104 = fpext bfloat %3052 to float, !dbg !56 + %3105 = fpext bfloat %3053 to float, !dbg !56 + %3106 = fpext bfloat %3054 to float, !dbg !56 + %3107 = fpext bfloat %3055 to float, !dbg !56 + %3108 = fpext bfloat %3056 to float, !dbg !56 + %3109 = fpext bfloat %3057 to float, !dbg !56 + %3110 = fpext bfloat %3058 to float, !dbg !56 + %3111 = fpext bfloat %3059 to float, !dbg !56 + %3112 = fpext bfloat %3070 to float, !dbg !56 + %3113 = fpext bfloat %3071 to float, !dbg !56 + %3114 = fpext bfloat %3072 to float, !dbg !56 + %3115 = fpext bfloat %3073 to float, !dbg !56 + %3116 = fpext bfloat %3074 to float, !dbg !56 + %3117 = fpext bfloat %3075 to float, !dbg !56 + %3118 = fpext bfloat %3076 to float, !dbg !56 + %3119 = fpext bfloat %3077 to float, !dbg !56 + %3120 = fpext bfloat %3088 to float, !dbg !56 + %3121 = fpext bfloat %3089 to float, !dbg !56 + %3122 = fpext bfloat %3090 to float, !dbg !56 + %3123 = fpext bfloat %3091 to float, !dbg !56 + %3124 = fpext bfloat %3092 to float, !dbg !56 + %3125 = fpext bfloat %3093 to float, !dbg !56 + %3126 = fpext bfloat %3094 to float, !dbg !56 + %3127 = fpext bfloat %3095 to float, !dbg !56 + %3128 = fsub float %3096, %2047, !dbg !57 + %3129 = fsub float %3097, %2047, !dbg !57 + %3130 = fsub float %3098, %2047, !dbg !57 + %3131 = fsub float %3099, %2047, !dbg !57 + %3132 = fsub float %3100, %2047, !dbg !57 + %3133 = fsub float %3101, %2047, !dbg !57 + %3134 = fsub float %3102, %2047, !dbg !57 + %3135 = fsub float %3103, %2047, !dbg !57 + %3136 = fsub float %3104, %2047, !dbg !57 + %3137 = fsub float %3105, %2047, !dbg !57 + %3138 = fsub float %3106, %2047, !dbg !57 + %3139 = fsub float %3107, %2047, !dbg !57 + %3140 = fsub float %3108, %2047, !dbg !57 + %3141 = fsub float %3109, %2047, !dbg !57 + %3142 = fsub float %3110, %2047, !dbg !57 + %3143 = fsub float %3111, %2047, !dbg !57 + %3144 = fsub float %3112, %2047, !dbg !57 + %3145 = fsub float %3113, %2047, !dbg !57 + %3146 = fsub float %3114, %2047, !dbg !57 + %3147 = fsub float %3115, %2047, !dbg !57 + %3148 = fsub float %3116, %2047, !dbg !57 + %3149 = fsub float %3117, %2047, !dbg !57 + %3150 = fsub float %3118, %2047, !dbg !57 + %3151 = fsub float %3119, %2047, !dbg !57 + %3152 = fsub float %3120, %2047, !dbg !57 + %3153 = fsub float %3121, %2047, !dbg !57 + %3154 = fsub float %3122, %2047, !dbg !57 + %3155 = fsub float %3123, %2047, !dbg !57 + %3156 = fsub float %3124, %2047, !dbg !57 + %3157 = fsub float %3125, %2047, !dbg !57 + %3158 = fsub float %3126, %2047, !dbg !57 + %3159 = fsub float %3127, %2047, !dbg !57 + %3160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i374 = icmp eq i32 %3160, 0, !dbg !58 + %3161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3162 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i375 = select i1 %.not.i374, float %3162, float %3161, !dbg !58 + %3163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i376 = icmp eq i32 %3163, 0, !dbg !58 + %3164 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i375) #6, !dbg !58 + %3165 = tail call float @llvm.nvvm.saturate.f(float %.02.i375) #6, !dbg !58 + %.03.i377 = select i1 %.not1.i376, float %3165, float %3164, !dbg !58 + %3166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i378 = icmp eq i32 %3166, 0, !dbg !58 + %3167 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i377, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3168 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i377, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i379 = select i1 %.not2.i378, float %3168, float %3167, !dbg !58 + %3169 = fadd float %.04.i379, 0xC168000FE0000000, !dbg !58 + %3170 = fneg float %3169, !dbg !58 + %3171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i380 = icmp eq i32 %3171, 0, !dbg !58 + %3172 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3FF7154760000000, float %3170) #6, !dbg !58 + %3173 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3FF7154760000000, float %3170) #6, !dbg !58 + %.0.i381 = select i1 %.not3.i380, float %3173, float %3172, !dbg !58 + %3174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i382 = icmp eq i32 %3174, 0, !dbg !58 + %3175 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3E54AE0C00000000, float %.0.i381) #6, !dbg !58 + %3176 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3E54AE0C00000000, float %.0.i381) #6, !dbg !58 + %.01.i383 = select i1 %.not4.i382, float %3176, float %3175, !dbg !58 + %3177 = bitcast float %.04.i379 to i32, !dbg !58 + %3178 = shl i32 %3177, 23, !dbg !58 + %3179 = bitcast i32 %3178 to float, !dbg !58 + %3180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i383) #6, !dbg !58 + %3181 = fmul float %3180, %3179, !dbg !58 + %3182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i384 = icmp eq i32 %3182, 0, !dbg !58 + %3183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3184 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i385 = select i1 %.not.i384, float %3184, float %3183, !dbg !58 + %3185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i386 = icmp eq i32 %3185, 0, !dbg !58 + %3186 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i385) #6, !dbg !58 + %3187 = tail call float @llvm.nvvm.saturate.f(float %.02.i385) #6, !dbg !58 + %.03.i387 = select i1 %.not1.i386, float %3187, float %3186, !dbg !58 + %3188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i388 = icmp eq i32 %3188, 0, !dbg !58 + %3189 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i387, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3190 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i387, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i389 = select i1 %.not2.i388, float %3190, float %3189, !dbg !58 + %3191 = fadd float %.04.i389, 0xC168000FE0000000, !dbg !58 + %3192 = fneg float %3191, !dbg !58 + %3193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i390 = icmp eq i32 %3193, 0, !dbg !58 + %3194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3FF7154760000000, float %3192) #6, !dbg !58 + %3195 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3FF7154760000000, float %3192) #6, !dbg !58 + %.0.i391 = select i1 %.not3.i390, float %3195, float %3194, !dbg !58 + %3196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i392 = icmp eq i32 %3196, 0, !dbg !58 + %3197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3E54AE0C00000000, float %.0.i391) #6, !dbg !58 + %3198 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3E54AE0C00000000, float %.0.i391) #6, !dbg !58 + %.01.i393 = select i1 %.not4.i392, float %3198, float %3197, !dbg !58 + %3199 = bitcast float %.04.i389 to i32, !dbg !58 + %3200 = shl i32 %3199, 23, !dbg !58 + %3201 = bitcast i32 %3200 to float, !dbg !58 + %3202 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i393) #6, !dbg !58 + %3203 = fmul float %3202, %3201, !dbg !58 + %3204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i394 = icmp eq i32 %3204, 0, !dbg !58 + %3205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3206 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i395 = select i1 %.not.i394, float %3206, float %3205, !dbg !58 + %3207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i396 = icmp eq i32 %3207, 0, !dbg !58 + %3208 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i395) #6, !dbg !58 + %3209 = tail call float @llvm.nvvm.saturate.f(float %.02.i395) #6, !dbg !58 + %.03.i397 = select i1 %.not1.i396, float %3209, float %3208, !dbg !58 + %3210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i398 = icmp eq i32 %3210, 0, !dbg !58 + %3211 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i397, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3212 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i397, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i399 = select i1 %.not2.i398, float %3212, float %3211, !dbg !58 + %3213 = fadd float %.04.i399, 0xC168000FE0000000, !dbg !58 + %3214 = fneg float %3213, !dbg !58 + %3215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i400 = icmp eq i32 %3215, 0, !dbg !58 + %3216 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3FF7154760000000, float %3214) #6, !dbg !58 + %3217 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3FF7154760000000, float %3214) #6, !dbg !58 + %.0.i401 = select i1 %.not3.i400, float %3217, float %3216, !dbg !58 + %3218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i402 = icmp eq i32 %3218, 0, !dbg !58 + %3219 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3E54AE0C00000000, float %.0.i401) #6, !dbg !58 + %3220 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3E54AE0C00000000, float %.0.i401) #6, !dbg !58 + %.01.i403 = select i1 %.not4.i402, float %3220, float %3219, !dbg !58 + %3221 = bitcast float %.04.i399 to i32, !dbg !58 + %3222 = shl i32 %3221, 23, !dbg !58 + %3223 = bitcast i32 %3222 to float, !dbg !58 + %3224 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i403) #6, !dbg !58 + %3225 = fmul float %3224, %3223, !dbg !58 + %3226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i404 = icmp eq i32 %3226, 0, !dbg !58 + %3227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3228 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i405 = select i1 %.not.i404, float %3228, float %3227, !dbg !58 + %3229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i406 = icmp eq i32 %3229, 0, !dbg !58 + %3230 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i405) #6, !dbg !58 + %3231 = tail call float @llvm.nvvm.saturate.f(float %.02.i405) #6, !dbg !58 + %.03.i407 = select i1 %.not1.i406, float %3231, float %3230, !dbg !58 + %3232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i408 = icmp eq i32 %3232, 0, !dbg !58 + %3233 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i407, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3234 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i407, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i409 = select i1 %.not2.i408, float %3234, float %3233, !dbg !58 + %3235 = fadd float %.04.i409, 0xC168000FE0000000, !dbg !58 + %3236 = fneg float %3235, !dbg !58 + %3237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i410 = icmp eq i32 %3237, 0, !dbg !58 + %3238 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3FF7154760000000, float %3236) #6, !dbg !58 + %3239 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3FF7154760000000, float %3236) #6, !dbg !58 + %.0.i411 = select i1 %.not3.i410, float %3239, float %3238, !dbg !58 + %3240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i412 = icmp eq i32 %3240, 0, !dbg !58 + %3241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3E54AE0C00000000, float %.0.i411) #6, !dbg !58 + %3242 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3E54AE0C00000000, float %.0.i411) #6, !dbg !58 + %.01.i413 = select i1 %.not4.i412, float %3242, float %3241, !dbg !58 + %3243 = bitcast float %.04.i409 to i32, !dbg !58 + %3244 = shl i32 %3243, 23, !dbg !58 + %3245 = bitcast i32 %3244 to float, !dbg !58 + %3246 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i413) #6, !dbg !58 + %3247 = fmul float %3246, %3245, !dbg !58 + %3248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i414 = icmp eq i32 %3248, 0, !dbg !58 + %3249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3250 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i415 = select i1 %.not.i414, float %3250, float %3249, !dbg !58 + %3251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i416 = icmp eq i32 %3251, 0, !dbg !58 + %3252 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i415) #6, !dbg !58 + %3253 = tail call float @llvm.nvvm.saturate.f(float %.02.i415) #6, !dbg !58 + %.03.i417 = select i1 %.not1.i416, float %3253, float %3252, !dbg !58 + %3254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i418 = icmp eq i32 %3254, 0, !dbg !58 + %3255 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i417, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3256 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i417, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i419 = select i1 %.not2.i418, float %3256, float %3255, !dbg !58 + %3257 = fadd float %.04.i419, 0xC168000FE0000000, !dbg !58 + %3258 = fneg float %3257, !dbg !58 + %3259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i420 = icmp eq i32 %3259, 0, !dbg !58 + %3260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3FF7154760000000, float %3258) #6, !dbg !58 + %3261 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3FF7154760000000, float %3258) #6, !dbg !58 + %.0.i421 = select i1 %.not3.i420, float %3261, float %3260, !dbg !58 + %3262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i422 = icmp eq i32 %3262, 0, !dbg !58 + %3263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3E54AE0C00000000, float %.0.i421) #6, !dbg !58 + %3264 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3E54AE0C00000000, float %.0.i421) #6, !dbg !58 + %.01.i423 = select i1 %.not4.i422, float %3264, float %3263, !dbg !58 + %3265 = bitcast float %.04.i419 to i32, !dbg !58 + %3266 = shl i32 %3265, 23, !dbg !58 + %3267 = bitcast i32 %3266 to float, !dbg !58 + %3268 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i423) #6, !dbg !58 + %3269 = fmul float %3268, %3267, !dbg !58 + %3270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i424 = icmp eq i32 %3270, 0, !dbg !58 + %3271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3272 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i425 = select i1 %.not.i424, float %3272, float %3271, !dbg !58 + %3273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i426 = icmp eq i32 %3273, 0, !dbg !58 + %3274 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i425) #6, !dbg !58 + %3275 = tail call float @llvm.nvvm.saturate.f(float %.02.i425) #6, !dbg !58 + %.03.i427 = select i1 %.not1.i426, float %3275, float %3274, !dbg !58 + %3276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i428 = icmp eq i32 %3276, 0, !dbg !58 + %3277 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i427, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3278 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i427, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i429 = select i1 %.not2.i428, float %3278, float %3277, !dbg !58 + %3279 = fadd float %.04.i429, 0xC168000FE0000000, !dbg !58 + %3280 = fneg float %3279, !dbg !58 + %3281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i430 = icmp eq i32 %3281, 0, !dbg !58 + %3282 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3FF7154760000000, float %3280) #6, !dbg !58 + %3283 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3FF7154760000000, float %3280) #6, !dbg !58 + %.0.i431 = select i1 %.not3.i430, float %3283, float %3282, !dbg !58 + %3284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i432 = icmp eq i32 %3284, 0, !dbg !58 + %3285 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3E54AE0C00000000, float %.0.i431) #6, !dbg !58 + %3286 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3E54AE0C00000000, float %.0.i431) #6, !dbg !58 + %.01.i433 = select i1 %.not4.i432, float %3286, float %3285, !dbg !58 + %3287 = bitcast float %.04.i429 to i32, !dbg !58 + %3288 = shl i32 %3287, 23, !dbg !58 + %3289 = bitcast i32 %3288 to float, !dbg !58 + %3290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i433) #6, !dbg !58 + %3291 = fmul float %3290, %3289, !dbg !58 + %3292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i434 = icmp eq i32 %3292, 0, !dbg !58 + %3293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3294 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i435 = select i1 %.not.i434, float %3294, float %3293, !dbg !58 + %3295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i436 = icmp eq i32 %3295, 0, !dbg !58 + %3296 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i435) #6, !dbg !58 + %3297 = tail call float @llvm.nvvm.saturate.f(float %.02.i435) #6, !dbg !58 + %.03.i437 = select i1 %.not1.i436, float %3297, float %3296, !dbg !58 + %3298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i438 = icmp eq i32 %3298, 0, !dbg !58 + %3299 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i437, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3300 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i437, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i439 = select i1 %.not2.i438, float %3300, float %3299, !dbg !58 + %3301 = fadd float %.04.i439, 0xC168000FE0000000, !dbg !58 + %3302 = fneg float %3301, !dbg !58 + %3303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i440 = icmp eq i32 %3303, 0, !dbg !58 + %3304 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3FF7154760000000, float %3302) #6, !dbg !58 + %3305 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3FF7154760000000, float %3302) #6, !dbg !58 + %.0.i441 = select i1 %.not3.i440, float %3305, float %3304, !dbg !58 + %3306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i442 = icmp eq i32 %3306, 0, !dbg !58 + %3307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3E54AE0C00000000, float %.0.i441) #6, !dbg !58 + %3308 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3E54AE0C00000000, float %.0.i441) #6, !dbg !58 + %.01.i443 = select i1 %.not4.i442, float %3308, float %3307, !dbg !58 + %3309 = bitcast float %.04.i439 to i32, !dbg !58 + %3310 = shl i32 %3309, 23, !dbg !58 + %3311 = bitcast i32 %3310 to float, !dbg !58 + %3312 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i443) #6, !dbg !58 + %3313 = fmul float %3312, %3311, !dbg !58 + %3314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i444 = icmp eq i32 %3314, 0, !dbg !58 + %3315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3316 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i445 = select i1 %.not.i444, float %3316, float %3315, !dbg !58 + %3317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i446 = icmp eq i32 %3317, 0, !dbg !58 + %3318 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i445) #6, !dbg !58 + %3319 = tail call float @llvm.nvvm.saturate.f(float %.02.i445) #6, !dbg !58 + %.03.i447 = select i1 %.not1.i446, float %3319, float %3318, !dbg !58 + %3320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i448 = icmp eq i32 %3320, 0, !dbg !58 + %3321 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i447, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3322 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i447, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i449 = select i1 %.not2.i448, float %3322, float %3321, !dbg !58 + %3323 = fadd float %.04.i449, 0xC168000FE0000000, !dbg !58 + %3324 = fneg float %3323, !dbg !58 + %3325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i450 = icmp eq i32 %3325, 0, !dbg !58 + %3326 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3FF7154760000000, float %3324) #6, !dbg !58 + %3327 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3FF7154760000000, float %3324) #6, !dbg !58 + %.0.i451 = select i1 %.not3.i450, float %3327, float %3326, !dbg !58 + %3328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i452 = icmp eq i32 %3328, 0, !dbg !58 + %3329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3E54AE0C00000000, float %.0.i451) #6, !dbg !58 + %3330 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3E54AE0C00000000, float %.0.i451) #6, !dbg !58 + %.01.i453 = select i1 %.not4.i452, float %3330, float %3329, !dbg !58 + %3331 = bitcast float %.04.i449 to i32, !dbg !58 + %3332 = shl i32 %3331, 23, !dbg !58 + %3333 = bitcast i32 %3332 to float, !dbg !58 + %3334 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i453) #6, !dbg !58 + %3335 = fmul float %3334, %3333, !dbg !58 + %3336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i454 = icmp eq i32 %3336, 0, !dbg !58 + %3337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3338 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i455 = select i1 %.not.i454, float %3338, float %3337, !dbg !58 + %3339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i456 = icmp eq i32 %3339, 0, !dbg !58 + %3340 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i455) #6, !dbg !58 + %3341 = tail call float @llvm.nvvm.saturate.f(float %.02.i455) #6, !dbg !58 + %.03.i457 = select i1 %.not1.i456, float %3341, float %3340, !dbg !58 + %3342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i458 = icmp eq i32 %3342, 0, !dbg !58 + %3343 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i457, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3344 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i457, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i459 = select i1 %.not2.i458, float %3344, float %3343, !dbg !58 + %3345 = fadd float %.04.i459, 0xC168000FE0000000, !dbg !58 + %3346 = fneg float %3345, !dbg !58 + %3347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i460 = icmp eq i32 %3347, 0, !dbg !58 + %3348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3FF7154760000000, float %3346) #6, !dbg !58 + %3349 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3FF7154760000000, float %3346) #6, !dbg !58 + %.0.i461 = select i1 %.not3.i460, float %3349, float %3348, !dbg !58 + %3350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i462 = icmp eq i32 %3350, 0, !dbg !58 + %3351 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3E54AE0C00000000, float %.0.i461) #6, !dbg !58 + %3352 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3E54AE0C00000000, float %.0.i461) #6, !dbg !58 + %.01.i463 = select i1 %.not4.i462, float %3352, float %3351, !dbg !58 + %3353 = bitcast float %.04.i459 to i32, !dbg !58 + %3354 = shl i32 %3353, 23, !dbg !58 + %3355 = bitcast i32 %3354 to float, !dbg !58 + %3356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i463) #6, !dbg !58 + %3357 = fmul float %3356, %3355, !dbg !58 + %3358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i464 = icmp eq i32 %3358, 0, !dbg !58 + %3359 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3360 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i465 = select i1 %.not.i464, float %3360, float %3359, !dbg !58 + %3361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i466 = icmp eq i32 %3361, 0, !dbg !58 + %3362 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i465) #6, !dbg !58 + %3363 = tail call float @llvm.nvvm.saturate.f(float %.02.i465) #6, !dbg !58 + %.03.i467 = select i1 %.not1.i466, float %3363, float %3362, !dbg !58 + %3364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i468 = icmp eq i32 %3364, 0, !dbg !58 + %3365 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i467, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3366 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i467, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i469 = select i1 %.not2.i468, float %3366, float %3365, !dbg !58 + %3367 = fadd float %.04.i469, 0xC168000FE0000000, !dbg !58 + %3368 = fneg float %3367, !dbg !58 + %3369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i470 = icmp eq i32 %3369, 0, !dbg !58 + %3370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3FF7154760000000, float %3368) #6, !dbg !58 + %3371 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3FF7154760000000, float %3368) #6, !dbg !58 + %.0.i471 = select i1 %.not3.i470, float %3371, float %3370, !dbg !58 + %3372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i472 = icmp eq i32 %3372, 0, !dbg !58 + %3373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3E54AE0C00000000, float %.0.i471) #6, !dbg !58 + %3374 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3E54AE0C00000000, float %.0.i471) #6, !dbg !58 + %.01.i473 = select i1 %.not4.i472, float %3374, float %3373, !dbg !58 + %3375 = bitcast float %.04.i469 to i32, !dbg !58 + %3376 = shl i32 %3375, 23, !dbg !58 + %3377 = bitcast i32 %3376 to float, !dbg !58 + %3378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i473) #6, !dbg !58 + %3379 = fmul float %3378, %3377, !dbg !58 + %3380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i474 = icmp eq i32 %3380, 0, !dbg !58 + %3381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3382 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i475 = select i1 %.not.i474, float %3382, float %3381, !dbg !58 + %3383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i476 = icmp eq i32 %3383, 0, !dbg !58 + %3384 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i475) #6, !dbg !58 + %3385 = tail call float @llvm.nvvm.saturate.f(float %.02.i475) #6, !dbg !58 + %.03.i477 = select i1 %.not1.i476, float %3385, float %3384, !dbg !58 + %3386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i478 = icmp eq i32 %3386, 0, !dbg !58 + %3387 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i477, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3388 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i477, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i479 = select i1 %.not2.i478, float %3388, float %3387, !dbg !58 + %3389 = fadd float %.04.i479, 0xC168000FE0000000, !dbg !58 + %3390 = fneg float %3389, !dbg !58 + %3391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i480 = icmp eq i32 %3391, 0, !dbg !58 + %3392 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3FF7154760000000, float %3390) #6, !dbg !58 + %3393 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3FF7154760000000, float %3390) #6, !dbg !58 + %.0.i481 = select i1 %.not3.i480, float %3393, float %3392, !dbg !58 + %3394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i482 = icmp eq i32 %3394, 0, !dbg !58 + %3395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3E54AE0C00000000, float %.0.i481) #6, !dbg !58 + %3396 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3E54AE0C00000000, float %.0.i481) #6, !dbg !58 + %.01.i483 = select i1 %.not4.i482, float %3396, float %3395, !dbg !58 + %3397 = bitcast float %.04.i479 to i32, !dbg !58 + %3398 = shl i32 %3397, 23, !dbg !58 + %3399 = bitcast i32 %3398 to float, !dbg !58 + %3400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i483) #6, !dbg !58 + %3401 = fmul float %3400, %3399, !dbg !58 + %3402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i484 = icmp eq i32 %3402, 0, !dbg !58 + %3403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3404 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i485 = select i1 %.not.i484, float %3404, float %3403, !dbg !58 + %3405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i486 = icmp eq i32 %3405, 0, !dbg !58 + %3406 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i485) #6, !dbg !58 + %3407 = tail call float @llvm.nvvm.saturate.f(float %.02.i485) #6, !dbg !58 + %.03.i487 = select i1 %.not1.i486, float %3407, float %3406, !dbg !58 + %3408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i488 = icmp eq i32 %3408, 0, !dbg !58 + %3409 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i487, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3410 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i487, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i489 = select i1 %.not2.i488, float %3410, float %3409, !dbg !58 + %3411 = fadd float %.04.i489, 0xC168000FE0000000, !dbg !58 + %3412 = fneg float %3411, !dbg !58 + %3413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i490 = icmp eq i32 %3413, 0, !dbg !58 + %3414 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3FF7154760000000, float %3412) #6, !dbg !58 + %3415 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3FF7154760000000, float %3412) #6, !dbg !58 + %.0.i491 = select i1 %.not3.i490, float %3415, float %3414, !dbg !58 + %3416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i492 = icmp eq i32 %3416, 0, !dbg !58 + %3417 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3E54AE0C00000000, float %.0.i491) #6, !dbg !58 + %3418 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3E54AE0C00000000, float %.0.i491) #6, !dbg !58 + %.01.i493 = select i1 %.not4.i492, float %3418, float %3417, !dbg !58 + %3419 = bitcast float %.04.i489 to i32, !dbg !58 + %3420 = shl i32 %3419, 23, !dbg !58 + %3421 = bitcast i32 %3420 to float, !dbg !58 + %3422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i493) #6, !dbg !58 + %3423 = fmul float %3422, %3421, !dbg !58 + %3424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i494 = icmp eq i32 %3424, 0, !dbg !58 + %3425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3426 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i495 = select i1 %.not.i494, float %3426, float %3425, !dbg !58 + %3427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i496 = icmp eq i32 %3427, 0, !dbg !58 + %3428 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i495) #6, !dbg !58 + %3429 = tail call float @llvm.nvvm.saturate.f(float %.02.i495) #6, !dbg !58 + %.03.i497 = select i1 %.not1.i496, float %3429, float %3428, !dbg !58 + %3430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i498 = icmp eq i32 %3430, 0, !dbg !58 + %3431 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i497, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3432 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i497, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i499 = select i1 %.not2.i498, float %3432, float %3431, !dbg !58 + %3433 = fadd float %.04.i499, 0xC168000FE0000000, !dbg !58 + %3434 = fneg float %3433, !dbg !58 + %3435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i500 = icmp eq i32 %3435, 0, !dbg !58 + %3436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3FF7154760000000, float %3434) #6, !dbg !58 + %3437 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3FF7154760000000, float %3434) #6, !dbg !58 + %.0.i501 = select i1 %.not3.i500, float %3437, float %3436, !dbg !58 + %3438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i502 = icmp eq i32 %3438, 0, !dbg !58 + %3439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3E54AE0C00000000, float %.0.i501) #6, !dbg !58 + %3440 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3E54AE0C00000000, float %.0.i501) #6, !dbg !58 + %.01.i503 = select i1 %.not4.i502, float %3440, float %3439, !dbg !58 + %3441 = bitcast float %.04.i499 to i32, !dbg !58 + %3442 = shl i32 %3441, 23, !dbg !58 + %3443 = bitcast i32 %3442 to float, !dbg !58 + %3444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i503) #6, !dbg !58 + %3445 = fmul float %3444, %3443, !dbg !58 + %3446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i504 = icmp eq i32 %3446, 0, !dbg !58 + %3447 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3448 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i505 = select i1 %.not.i504, float %3448, float %3447, !dbg !58 + %3449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i506 = icmp eq i32 %3449, 0, !dbg !58 + %3450 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i505) #6, !dbg !58 + %3451 = tail call float @llvm.nvvm.saturate.f(float %.02.i505) #6, !dbg !58 + %.03.i507 = select i1 %.not1.i506, float %3451, float %3450, !dbg !58 + %3452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i508 = icmp eq i32 %3452, 0, !dbg !58 + %3453 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i507, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3454 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i507, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i509 = select i1 %.not2.i508, float %3454, float %3453, !dbg !58 + %3455 = fadd float %.04.i509, 0xC168000FE0000000, !dbg !58 + %3456 = fneg float %3455, !dbg !58 + %3457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i510 = icmp eq i32 %3457, 0, !dbg !58 + %3458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3FF7154760000000, float %3456) #6, !dbg !58 + %3459 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3FF7154760000000, float %3456) #6, !dbg !58 + %.0.i511 = select i1 %.not3.i510, float %3459, float %3458, !dbg !58 + %3460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i512 = icmp eq i32 %3460, 0, !dbg !58 + %3461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3E54AE0C00000000, float %.0.i511) #6, !dbg !58 + %3462 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3E54AE0C00000000, float %.0.i511) #6, !dbg !58 + %.01.i513 = select i1 %.not4.i512, float %3462, float %3461, !dbg !58 + %3463 = bitcast float %.04.i509 to i32, !dbg !58 + %3464 = shl i32 %3463, 23, !dbg !58 + %3465 = bitcast i32 %3464 to float, !dbg !58 + %3466 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i513) #6, !dbg !58 + %3467 = fmul float %3466, %3465, !dbg !58 + %3468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i514 = icmp eq i32 %3468, 0, !dbg !58 + %3469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3470 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i515 = select i1 %.not.i514, float %3470, float %3469, !dbg !58 + %3471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i516 = icmp eq i32 %3471, 0, !dbg !58 + %3472 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i515) #6, !dbg !58 + %3473 = tail call float @llvm.nvvm.saturate.f(float %.02.i515) #6, !dbg !58 + %.03.i517 = select i1 %.not1.i516, float %3473, float %3472, !dbg !58 + %3474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i518 = icmp eq i32 %3474, 0, !dbg !58 + %3475 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i517, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3476 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i517, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i519 = select i1 %.not2.i518, float %3476, float %3475, !dbg !58 + %3477 = fadd float %.04.i519, 0xC168000FE0000000, !dbg !58 + %3478 = fneg float %3477, !dbg !58 + %3479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i520 = icmp eq i32 %3479, 0, !dbg !58 + %3480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3FF7154760000000, float %3478) #6, !dbg !58 + %3481 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3FF7154760000000, float %3478) #6, !dbg !58 + %.0.i521 = select i1 %.not3.i520, float %3481, float %3480, !dbg !58 + %3482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i522 = icmp eq i32 %3482, 0, !dbg !58 + %3483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3E54AE0C00000000, float %.0.i521) #6, !dbg !58 + %3484 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3E54AE0C00000000, float %.0.i521) #6, !dbg !58 + %.01.i523 = select i1 %.not4.i522, float %3484, float %3483, !dbg !58 + %3485 = bitcast float %.04.i519 to i32, !dbg !58 + %3486 = shl i32 %3485, 23, !dbg !58 + %3487 = bitcast i32 %3486 to float, !dbg !58 + %3488 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i523) #6, !dbg !58 + %3489 = fmul float %3488, %3487, !dbg !58 + %3490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i524 = icmp eq i32 %3490, 0, !dbg !58 + %3491 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3492 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i525 = select i1 %.not.i524, float %3492, float %3491, !dbg !58 + %3493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i526 = icmp eq i32 %3493, 0, !dbg !58 + %3494 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i525) #6, !dbg !58 + %3495 = tail call float @llvm.nvvm.saturate.f(float %.02.i525) #6, !dbg !58 + %.03.i527 = select i1 %.not1.i526, float %3495, float %3494, !dbg !58 + %3496 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i528 = icmp eq i32 %3496, 0, !dbg !58 + %3497 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i527, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3498 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i527, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i529 = select i1 %.not2.i528, float %3498, float %3497, !dbg !58 + %3499 = fadd float %.04.i529, 0xC168000FE0000000, !dbg !58 + %3500 = fneg float %3499, !dbg !58 + %3501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i530 = icmp eq i32 %3501, 0, !dbg !58 + %3502 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3FF7154760000000, float %3500) #6, !dbg !58 + %3503 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3FF7154760000000, float %3500) #6, !dbg !58 + %.0.i531 = select i1 %.not3.i530, float %3503, float %3502, !dbg !58 + %3504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i532 = icmp eq i32 %3504, 0, !dbg !58 + %3505 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3E54AE0C00000000, float %.0.i531) #6, !dbg !58 + %3506 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3E54AE0C00000000, float %.0.i531) #6, !dbg !58 + %.01.i533 = select i1 %.not4.i532, float %3506, float %3505, !dbg !58 + %3507 = bitcast float %.04.i529 to i32, !dbg !58 + %3508 = shl i32 %3507, 23, !dbg !58 + %3509 = bitcast i32 %3508 to float, !dbg !58 + %3510 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i533) #6, !dbg !58 + %3511 = fmul float %3510, %3509, !dbg !58 + %3512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i534 = icmp eq i32 %3512, 0, !dbg !58 + %3513 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3514 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i535 = select i1 %.not.i534, float %3514, float %3513, !dbg !58 + %3515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i536 = icmp eq i32 %3515, 0, !dbg !58 + %3516 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i535) #6, !dbg !58 + %3517 = tail call float @llvm.nvvm.saturate.f(float %.02.i535) #6, !dbg !58 + %.03.i537 = select i1 %.not1.i536, float %3517, float %3516, !dbg !58 + %3518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i538 = icmp eq i32 %3518, 0, !dbg !58 + %3519 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i537, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3520 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i537, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i539 = select i1 %.not2.i538, float %3520, float %3519, !dbg !58 + %3521 = fadd float %.04.i539, 0xC168000FE0000000, !dbg !58 + %3522 = fneg float %3521, !dbg !58 + %3523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i540 = icmp eq i32 %3523, 0, !dbg !58 + %3524 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3FF7154760000000, float %3522) #6, !dbg !58 + %3525 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3FF7154760000000, float %3522) #6, !dbg !58 + %.0.i541 = select i1 %.not3.i540, float %3525, float %3524, !dbg !58 + %3526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i542 = icmp eq i32 %3526, 0, !dbg !58 + %3527 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3E54AE0C00000000, float %.0.i541) #6, !dbg !58 + %3528 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3E54AE0C00000000, float %.0.i541) #6, !dbg !58 + %.01.i543 = select i1 %.not4.i542, float %3528, float %3527, !dbg !58 + %3529 = bitcast float %.04.i539 to i32, !dbg !58 + %3530 = shl i32 %3529, 23, !dbg !58 + %3531 = bitcast i32 %3530 to float, !dbg !58 + %3532 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i543) #6, !dbg !58 + %3533 = fmul float %3532, %3531, !dbg !58 + %3534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i544 = icmp eq i32 %3534, 0, !dbg !58 + %3535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3536 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i545 = select i1 %.not.i544, float %3536, float %3535, !dbg !58 + %3537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i546 = icmp eq i32 %3537, 0, !dbg !58 + %3538 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i545) #6, !dbg !58 + %3539 = tail call float @llvm.nvvm.saturate.f(float %.02.i545) #6, !dbg !58 + %.03.i547 = select i1 %.not1.i546, float %3539, float %3538, !dbg !58 + %3540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i548 = icmp eq i32 %3540, 0, !dbg !58 + %3541 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i547, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3542 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i547, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i549 = select i1 %.not2.i548, float %3542, float %3541, !dbg !58 + %3543 = fadd float %.04.i549, 0xC168000FE0000000, !dbg !58 + %3544 = fneg float %3543, !dbg !58 + %3545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i550 = icmp eq i32 %3545, 0, !dbg !58 + %3546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3FF7154760000000, float %3544) #6, !dbg !58 + %3547 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3FF7154760000000, float %3544) #6, !dbg !58 + %.0.i551 = select i1 %.not3.i550, float %3547, float %3546, !dbg !58 + %3548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i552 = icmp eq i32 %3548, 0, !dbg !58 + %3549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3E54AE0C00000000, float %.0.i551) #6, !dbg !58 + %3550 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3E54AE0C00000000, float %.0.i551) #6, !dbg !58 + %.01.i553 = select i1 %.not4.i552, float %3550, float %3549, !dbg !58 + %3551 = bitcast float %.04.i549 to i32, !dbg !58 + %3552 = shl i32 %3551, 23, !dbg !58 + %3553 = bitcast i32 %3552 to float, !dbg !58 + %3554 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i553) #6, !dbg !58 + %3555 = fmul float %3554, %3553, !dbg !58 + %3556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i554 = icmp eq i32 %3556, 0, !dbg !58 + %3557 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3558 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i555 = select i1 %.not.i554, float %3558, float %3557, !dbg !58 + %3559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i556 = icmp eq i32 %3559, 0, !dbg !58 + %3560 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i555) #6, !dbg !58 + %3561 = tail call float @llvm.nvvm.saturate.f(float %.02.i555) #6, !dbg !58 + %.03.i557 = select i1 %.not1.i556, float %3561, float %3560, !dbg !58 + %3562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i558 = icmp eq i32 %3562, 0, !dbg !58 + %3563 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i557, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3564 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i557, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i559 = select i1 %.not2.i558, float %3564, float %3563, !dbg !58 + %3565 = fadd float %.04.i559, 0xC168000FE0000000, !dbg !58 + %3566 = fneg float %3565, !dbg !58 + %3567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i560 = icmp eq i32 %3567, 0, !dbg !58 + %3568 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3FF7154760000000, float %3566) #6, !dbg !58 + %3569 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3FF7154760000000, float %3566) #6, !dbg !58 + %.0.i561 = select i1 %.not3.i560, float %3569, float %3568, !dbg !58 + %3570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i562 = icmp eq i32 %3570, 0, !dbg !58 + %3571 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3E54AE0C00000000, float %.0.i561) #6, !dbg !58 + %3572 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3E54AE0C00000000, float %.0.i561) #6, !dbg !58 + %.01.i563 = select i1 %.not4.i562, float %3572, float %3571, !dbg !58 + %3573 = bitcast float %.04.i559 to i32, !dbg !58 + %3574 = shl i32 %3573, 23, !dbg !58 + %3575 = bitcast i32 %3574 to float, !dbg !58 + %3576 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i563) #6, !dbg !58 + %3577 = fmul float %3576, %3575, !dbg !58 + %3578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i564 = icmp eq i32 %3578, 0, !dbg !58 + %3579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3580 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i565 = select i1 %.not.i564, float %3580, float %3579, !dbg !58 + %3581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i566 = icmp eq i32 %3581, 0, !dbg !58 + %3582 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i565) #6, !dbg !58 + %3583 = tail call float @llvm.nvvm.saturate.f(float %.02.i565) #6, !dbg !58 + %.03.i567 = select i1 %.not1.i566, float %3583, float %3582, !dbg !58 + %3584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i568 = icmp eq i32 %3584, 0, !dbg !58 + %3585 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i567, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3586 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i567, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i569 = select i1 %.not2.i568, float %3586, float %3585, !dbg !58 + %3587 = fadd float %.04.i569, 0xC168000FE0000000, !dbg !58 + %3588 = fneg float %3587, !dbg !58 + %3589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i570 = icmp eq i32 %3589, 0, !dbg !58 + %3590 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3FF7154760000000, float %3588) #6, !dbg !58 + %3591 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3FF7154760000000, float %3588) #6, !dbg !58 + %.0.i571 = select i1 %.not3.i570, float %3591, float %3590, !dbg !58 + %3592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i572 = icmp eq i32 %3592, 0, !dbg !58 + %3593 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3E54AE0C00000000, float %.0.i571) #6, !dbg !58 + %3594 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3E54AE0C00000000, float %.0.i571) #6, !dbg !58 + %.01.i573 = select i1 %.not4.i572, float %3594, float %3593, !dbg !58 + %3595 = bitcast float %.04.i569 to i32, !dbg !58 + %3596 = shl i32 %3595, 23, !dbg !58 + %3597 = bitcast i32 %3596 to float, !dbg !58 + %3598 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i573) #6, !dbg !58 + %3599 = fmul float %3598, %3597, !dbg !58 + %3600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i574 = icmp eq i32 %3600, 0, !dbg !58 + %3601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3602 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i575 = select i1 %.not.i574, float %3602, float %3601, !dbg !58 + %3603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i576 = icmp eq i32 %3603, 0, !dbg !58 + %3604 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i575) #6, !dbg !58 + %3605 = tail call float @llvm.nvvm.saturate.f(float %.02.i575) #6, !dbg !58 + %.03.i577 = select i1 %.not1.i576, float %3605, float %3604, !dbg !58 + %3606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i578 = icmp eq i32 %3606, 0, !dbg !58 + %3607 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i577, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3608 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i577, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i579 = select i1 %.not2.i578, float %3608, float %3607, !dbg !58 + %3609 = fadd float %.04.i579, 0xC168000FE0000000, !dbg !58 + %3610 = fneg float %3609, !dbg !58 + %3611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i580 = icmp eq i32 %3611, 0, !dbg !58 + %3612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3FF7154760000000, float %3610) #6, !dbg !58 + %3613 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3FF7154760000000, float %3610) #6, !dbg !58 + %.0.i581 = select i1 %.not3.i580, float %3613, float %3612, !dbg !58 + %3614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i582 = icmp eq i32 %3614, 0, !dbg !58 + %3615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3E54AE0C00000000, float %.0.i581) #6, !dbg !58 + %3616 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3E54AE0C00000000, float %.0.i581) #6, !dbg !58 + %.01.i583 = select i1 %.not4.i582, float %3616, float %3615, !dbg !58 + %3617 = bitcast float %.04.i579 to i32, !dbg !58 + %3618 = shl i32 %3617, 23, !dbg !58 + %3619 = bitcast i32 %3618 to float, !dbg !58 + %3620 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i583) #6, !dbg !58 + %3621 = fmul float %3620, %3619, !dbg !58 + %3622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i584 = icmp eq i32 %3622, 0, !dbg !58 + %3623 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3624 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i585 = select i1 %.not.i584, float %3624, float %3623, !dbg !58 + %3625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i586 = icmp eq i32 %3625, 0, !dbg !58 + %3626 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i585) #6, !dbg !58 + %3627 = tail call float @llvm.nvvm.saturate.f(float %.02.i585) #6, !dbg !58 + %.03.i587 = select i1 %.not1.i586, float %3627, float %3626, !dbg !58 + %3628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i588 = icmp eq i32 %3628, 0, !dbg !58 + %3629 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i587, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3630 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i587, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i589 = select i1 %.not2.i588, float %3630, float %3629, !dbg !58 + %3631 = fadd float %.04.i589, 0xC168000FE0000000, !dbg !58 + %3632 = fneg float %3631, !dbg !58 + %3633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i590 = icmp eq i32 %3633, 0, !dbg !58 + %3634 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3FF7154760000000, float %3632) #6, !dbg !58 + %3635 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3FF7154760000000, float %3632) #6, !dbg !58 + %.0.i591 = select i1 %.not3.i590, float %3635, float %3634, !dbg !58 + %3636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i592 = icmp eq i32 %3636, 0, !dbg !58 + %3637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3E54AE0C00000000, float %.0.i591) #6, !dbg !58 + %3638 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3E54AE0C00000000, float %.0.i591) #6, !dbg !58 + %.01.i593 = select i1 %.not4.i592, float %3638, float %3637, !dbg !58 + %3639 = bitcast float %.04.i589 to i32, !dbg !58 + %3640 = shl i32 %3639, 23, !dbg !58 + %3641 = bitcast i32 %3640 to float, !dbg !58 + %3642 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i593) #6, !dbg !58 + %3643 = fmul float %3642, %3641, !dbg !58 + %3644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i594 = icmp eq i32 %3644, 0, !dbg !58 + %3645 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3646 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i595 = select i1 %.not.i594, float %3646, float %3645, !dbg !58 + %3647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i596 = icmp eq i32 %3647, 0, !dbg !58 + %3648 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i595) #6, !dbg !58 + %3649 = tail call float @llvm.nvvm.saturate.f(float %.02.i595) #6, !dbg !58 + %.03.i597 = select i1 %.not1.i596, float %3649, float %3648, !dbg !58 + %3650 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i598 = icmp eq i32 %3650, 0, !dbg !58 + %3651 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i597, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3652 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i597, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i599 = select i1 %.not2.i598, float %3652, float %3651, !dbg !58 + %3653 = fadd float %.04.i599, 0xC168000FE0000000, !dbg !58 + %3654 = fneg float %3653, !dbg !58 + %3655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i600 = icmp eq i32 %3655, 0, !dbg !58 + %3656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3FF7154760000000, float %3654) #6, !dbg !58 + %3657 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3FF7154760000000, float %3654) #6, !dbg !58 + %.0.i601 = select i1 %.not3.i600, float %3657, float %3656, !dbg !58 + %3658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i602 = icmp eq i32 %3658, 0, !dbg !58 + %3659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3E54AE0C00000000, float %.0.i601) #6, !dbg !58 + %3660 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3E54AE0C00000000, float %.0.i601) #6, !dbg !58 + %.01.i603 = select i1 %.not4.i602, float %3660, float %3659, !dbg !58 + %3661 = bitcast float %.04.i599 to i32, !dbg !58 + %3662 = shl i32 %3661, 23, !dbg !58 + %3663 = bitcast i32 %3662 to float, !dbg !58 + %3664 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i603) #6, !dbg !58 + %3665 = fmul float %3664, %3663, !dbg !58 + %3666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i604 = icmp eq i32 %3666, 0, !dbg !58 + %3667 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3668 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i605 = select i1 %.not.i604, float %3668, float %3667, !dbg !58 + %3669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i606 = icmp eq i32 %3669, 0, !dbg !58 + %3670 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i605) #6, !dbg !58 + %3671 = tail call float @llvm.nvvm.saturate.f(float %.02.i605) #6, !dbg !58 + %.03.i607 = select i1 %.not1.i606, float %3671, float %3670, !dbg !58 + %3672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i608 = icmp eq i32 %3672, 0, !dbg !58 + %3673 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i607, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3674 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i607, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i609 = select i1 %.not2.i608, float %3674, float %3673, !dbg !58 + %3675 = fadd float %.04.i609, 0xC168000FE0000000, !dbg !58 + %3676 = fneg float %3675, !dbg !58 + %3677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i610 = icmp eq i32 %3677, 0, !dbg !58 + %3678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3FF7154760000000, float %3676) #6, !dbg !58 + %3679 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3FF7154760000000, float %3676) #6, !dbg !58 + %.0.i611 = select i1 %.not3.i610, float %3679, float %3678, !dbg !58 + %3680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i612 = icmp eq i32 %3680, 0, !dbg !58 + %3681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3E54AE0C00000000, float %.0.i611) #6, !dbg !58 + %3682 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3E54AE0C00000000, float %.0.i611) #6, !dbg !58 + %.01.i613 = select i1 %.not4.i612, float %3682, float %3681, !dbg !58 + %3683 = bitcast float %.04.i609 to i32, !dbg !58 + %3684 = shl i32 %3683, 23, !dbg !58 + %3685 = bitcast i32 %3684 to float, !dbg !58 + %3686 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i613) #6, !dbg !58 + %3687 = fmul float %3686, %3685, !dbg !58 + %3688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i614 = icmp eq i32 %3688, 0, !dbg !58 + %3689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3690 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i615 = select i1 %.not.i614, float %3690, float %3689, !dbg !58 + %3691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i616 = icmp eq i32 %3691, 0, !dbg !58 + %3692 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i615) #6, !dbg !58 + %3693 = tail call float @llvm.nvvm.saturate.f(float %.02.i615) #6, !dbg !58 + %.03.i617 = select i1 %.not1.i616, float %3693, float %3692, !dbg !58 + %3694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i618 = icmp eq i32 %3694, 0, !dbg !58 + %3695 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i617, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3696 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i617, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i619 = select i1 %.not2.i618, float %3696, float %3695, !dbg !58 + %3697 = fadd float %.04.i619, 0xC168000FE0000000, !dbg !58 + %3698 = fneg float %3697, !dbg !58 + %3699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i620 = icmp eq i32 %3699, 0, !dbg !58 + %3700 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3FF7154760000000, float %3698) #6, !dbg !58 + %3701 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3FF7154760000000, float %3698) #6, !dbg !58 + %.0.i621 = select i1 %.not3.i620, float %3701, float %3700, !dbg !58 + %3702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i622 = icmp eq i32 %3702, 0, !dbg !58 + %3703 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3E54AE0C00000000, float %.0.i621) #6, !dbg !58 + %3704 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3E54AE0C00000000, float %.0.i621) #6, !dbg !58 + %.01.i623 = select i1 %.not4.i622, float %3704, float %3703, !dbg !58 + %3705 = bitcast float %.04.i619 to i32, !dbg !58 + %3706 = shl i32 %3705, 23, !dbg !58 + %3707 = bitcast i32 %3706 to float, !dbg !58 + %3708 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i623) #6, !dbg !58 + %3709 = fmul float %3708, %3707, !dbg !58 + %3710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i624 = icmp eq i32 %3710, 0, !dbg !58 + %3711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3712 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i625 = select i1 %.not.i624, float %3712, float %3711, !dbg !58 + %3713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i626 = icmp eq i32 %3713, 0, !dbg !58 + %3714 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i625) #6, !dbg !58 + %3715 = tail call float @llvm.nvvm.saturate.f(float %.02.i625) #6, !dbg !58 + %.03.i627 = select i1 %.not1.i626, float %3715, float %3714, !dbg !58 + %3716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i628 = icmp eq i32 %3716, 0, !dbg !58 + %3717 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i627, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3718 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i627, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i629 = select i1 %.not2.i628, float %3718, float %3717, !dbg !58 + %3719 = fadd float %.04.i629, 0xC168000FE0000000, !dbg !58 + %3720 = fneg float %3719, !dbg !58 + %3721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i630 = icmp eq i32 %3721, 0, !dbg !58 + %3722 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3FF7154760000000, float %3720) #6, !dbg !58 + %3723 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3FF7154760000000, float %3720) #6, !dbg !58 + %.0.i631 = select i1 %.not3.i630, float %3723, float %3722, !dbg !58 + %3724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i632 = icmp eq i32 %3724, 0, !dbg !58 + %3725 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3E54AE0C00000000, float %.0.i631) #6, !dbg !58 + %3726 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3E54AE0C00000000, float %.0.i631) #6, !dbg !58 + %.01.i633 = select i1 %.not4.i632, float %3726, float %3725, !dbg !58 + %3727 = bitcast float %.04.i629 to i32, !dbg !58 + %3728 = shl i32 %3727, 23, !dbg !58 + %3729 = bitcast i32 %3728 to float, !dbg !58 + %3730 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i633) #6, !dbg !58 + %3731 = fmul float %3730, %3729, !dbg !58 + %3732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i634 = icmp eq i32 %3732, 0, !dbg !58 + %3733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3734 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i635 = select i1 %.not.i634, float %3734, float %3733, !dbg !58 + %3735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i636 = icmp eq i32 %3735, 0, !dbg !58 + %3736 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i635) #6, !dbg !58 + %3737 = tail call float @llvm.nvvm.saturate.f(float %.02.i635) #6, !dbg !58 + %.03.i637 = select i1 %.not1.i636, float %3737, float %3736, !dbg !58 + %3738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i638 = icmp eq i32 %3738, 0, !dbg !58 + %3739 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i637, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3740 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i637, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i639 = select i1 %.not2.i638, float %3740, float %3739, !dbg !58 + %3741 = fadd float %.04.i639, 0xC168000FE0000000, !dbg !58 + %3742 = fneg float %3741, !dbg !58 + %3743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i640 = icmp eq i32 %3743, 0, !dbg !58 + %3744 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3FF7154760000000, float %3742) #6, !dbg !58 + %3745 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3FF7154760000000, float %3742) #6, !dbg !58 + %.0.i641 = select i1 %.not3.i640, float %3745, float %3744, !dbg !58 + %3746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i642 = icmp eq i32 %3746, 0, !dbg !58 + %3747 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3E54AE0C00000000, float %.0.i641) #6, !dbg !58 + %3748 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3E54AE0C00000000, float %.0.i641) #6, !dbg !58 + %.01.i643 = select i1 %.not4.i642, float %3748, float %3747, !dbg !58 + %3749 = bitcast float %.04.i639 to i32, !dbg !58 + %3750 = shl i32 %3749, 23, !dbg !58 + %3751 = bitcast i32 %3750 to float, !dbg !58 + %3752 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i643) #6, !dbg !58 + %3753 = fmul float %3752, %3751, !dbg !58 + %3754 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i644 = icmp eq i32 %3754, 0, !dbg !58 + %3755 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3756 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i645 = select i1 %.not.i644, float %3756, float %3755, !dbg !58 + %3757 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i646 = icmp eq i32 %3757, 0, !dbg !58 + %3758 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i645) #6, !dbg !58 + %3759 = tail call float @llvm.nvvm.saturate.f(float %.02.i645) #6, !dbg !58 + %.03.i647 = select i1 %.not1.i646, float %3759, float %3758, !dbg !58 + %3760 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i648 = icmp eq i32 %3760, 0, !dbg !58 + %3761 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i647, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3762 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i647, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i649 = select i1 %.not2.i648, float %3762, float %3761, !dbg !58 + %3763 = fadd float %.04.i649, 0xC168000FE0000000, !dbg !58 + %3764 = fneg float %3763, !dbg !58 + %3765 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i650 = icmp eq i32 %3765, 0, !dbg !58 + %3766 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3FF7154760000000, float %3764) #6, !dbg !58 + %3767 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3FF7154760000000, float %3764) #6, !dbg !58 + %.0.i651 = select i1 %.not3.i650, float %3767, float %3766, !dbg !58 + %3768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i652 = icmp eq i32 %3768, 0, !dbg !58 + %3769 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3E54AE0C00000000, float %.0.i651) #6, !dbg !58 + %3770 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3E54AE0C00000000, float %.0.i651) #6, !dbg !58 + %.01.i653 = select i1 %.not4.i652, float %3770, float %3769, !dbg !58 + %3771 = bitcast float %.04.i649 to i32, !dbg !58 + %3772 = shl i32 %3771, 23, !dbg !58 + %3773 = bitcast i32 %3772 to float, !dbg !58 + %3774 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i653) #6, !dbg !58 + %3775 = fmul float %3774, %3773, !dbg !58 + %3776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i654 = icmp eq i32 %3776, 0, !dbg !58 + %3777 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3778 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i655 = select i1 %.not.i654, float %3778, float %3777, !dbg !58 + %3779 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i656 = icmp eq i32 %3779, 0, !dbg !58 + %3780 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i655) #6, !dbg !58 + %3781 = tail call float @llvm.nvvm.saturate.f(float %.02.i655) #6, !dbg !58 + %.03.i657 = select i1 %.not1.i656, float %3781, float %3780, !dbg !58 + %3782 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i658 = icmp eq i32 %3782, 0, !dbg !58 + %3783 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i657, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3784 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i657, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i659 = select i1 %.not2.i658, float %3784, float %3783, !dbg !58 + %3785 = fadd float %.04.i659, 0xC168000FE0000000, !dbg !58 + %3786 = fneg float %3785, !dbg !58 + %3787 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i660 = icmp eq i32 %3787, 0, !dbg !58 + %3788 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3FF7154760000000, float %3786) #6, !dbg !58 + %3789 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3FF7154760000000, float %3786) #6, !dbg !58 + %.0.i661 = select i1 %.not3.i660, float %3789, float %3788, !dbg !58 + %3790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i662 = icmp eq i32 %3790, 0, !dbg !58 + %3791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3E54AE0C00000000, float %.0.i661) #6, !dbg !58 + %3792 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3E54AE0C00000000, float %.0.i661) #6, !dbg !58 + %.01.i663 = select i1 %.not4.i662, float %3792, float %3791, !dbg !58 + %3793 = bitcast float %.04.i659 to i32, !dbg !58 + %3794 = shl i32 %3793, 23, !dbg !58 + %3795 = bitcast i32 %3794 to float, !dbg !58 + %3796 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i663) #6, !dbg !58 + %3797 = fmul float %3796, %3795, !dbg !58 + %3798 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i664 = icmp eq i32 %3798, 0, !dbg !58 + %3799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3800 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i665 = select i1 %.not.i664, float %3800, float %3799, !dbg !58 + %3801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i666 = icmp eq i32 %3801, 0, !dbg !58 + %3802 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i665) #6, !dbg !58 + %3803 = tail call float @llvm.nvvm.saturate.f(float %.02.i665) #6, !dbg !58 + %.03.i667 = select i1 %.not1.i666, float %3803, float %3802, !dbg !58 + %3804 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i668 = icmp eq i32 %3804, 0, !dbg !58 + %3805 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i667, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3806 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i667, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i669 = select i1 %.not2.i668, float %3806, float %3805, !dbg !58 + %3807 = fadd float %.04.i669, 0xC168000FE0000000, !dbg !58 + %3808 = fneg float %3807, !dbg !58 + %3809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i670 = icmp eq i32 %3809, 0, !dbg !58 + %3810 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3FF7154760000000, float %3808) #6, !dbg !58 + %3811 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3FF7154760000000, float %3808) #6, !dbg !58 + %.0.i671 = select i1 %.not3.i670, float %3811, float %3810, !dbg !58 + %3812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i672 = icmp eq i32 %3812, 0, !dbg !58 + %3813 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3E54AE0C00000000, float %.0.i671) #6, !dbg !58 + %3814 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3E54AE0C00000000, float %.0.i671) #6, !dbg !58 + %.01.i673 = select i1 %.not4.i672, float %3814, float %3813, !dbg !58 + %3815 = bitcast float %.04.i669 to i32, !dbg !58 + %3816 = shl i32 %3815, 23, !dbg !58 + %3817 = bitcast i32 %3816 to float, !dbg !58 + %3818 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i673) #6, !dbg !58 + %3819 = fmul float %3818, %3817, !dbg !58 + %3820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i674 = icmp eq i32 %3820, 0, !dbg !58 + %3821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3822 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i675 = select i1 %.not.i674, float %3822, float %3821, !dbg !58 + %3823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i676 = icmp eq i32 %3823, 0, !dbg !58 + %3824 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i675) #6, !dbg !58 + %3825 = tail call float @llvm.nvvm.saturate.f(float %.02.i675) #6, !dbg !58 + %.03.i677 = select i1 %.not1.i676, float %3825, float %3824, !dbg !58 + %3826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i678 = icmp eq i32 %3826, 0, !dbg !58 + %3827 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i677, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3828 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i677, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i679 = select i1 %.not2.i678, float %3828, float %3827, !dbg !58 + %3829 = fadd float %.04.i679, 0xC168000FE0000000, !dbg !58 + %3830 = fneg float %3829, !dbg !58 + %3831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i680 = icmp eq i32 %3831, 0, !dbg !58 + %3832 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3FF7154760000000, float %3830) #6, !dbg !58 + %3833 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3FF7154760000000, float %3830) #6, !dbg !58 + %.0.i681 = select i1 %.not3.i680, float %3833, float %3832, !dbg !58 + %3834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i682 = icmp eq i32 %3834, 0, !dbg !58 + %3835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3E54AE0C00000000, float %.0.i681) #6, !dbg !58 + %3836 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3E54AE0C00000000, float %.0.i681) #6, !dbg !58 + %.01.i683 = select i1 %.not4.i682, float %3836, float %3835, !dbg !58 + %3837 = bitcast float %.04.i679 to i32, !dbg !58 + %3838 = shl i32 %3837, 23, !dbg !58 + %3839 = bitcast i32 %3838 to float, !dbg !58 + %3840 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i683) #6, !dbg !58 + %3841 = fmul float %3840, %3839, !dbg !58 + %3842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i684 = icmp eq i32 %3842, 0, !dbg !58 + %3843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3844 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i685 = select i1 %.not.i684, float %3844, float %3843, !dbg !58 + %3845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i686 = icmp eq i32 %3845, 0, !dbg !58 + %3846 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i685) #6, !dbg !58 + %3847 = tail call float @llvm.nvvm.saturate.f(float %.02.i685) #6, !dbg !58 + %.03.i687 = select i1 %.not1.i686, float %3847, float %3846, !dbg !58 + %3848 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i688 = icmp eq i32 %3848, 0, !dbg !58 + %3849 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i687, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3850 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i687, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i689 = select i1 %.not2.i688, float %3850, float %3849, !dbg !58 + %3851 = fadd float %.04.i689, 0xC168000FE0000000, !dbg !58 + %3852 = fneg float %3851, !dbg !58 + %3853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i690 = icmp eq i32 %3853, 0, !dbg !58 + %3854 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3FF7154760000000, float %3852) #6, !dbg !58 + %3855 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3FF7154760000000, float %3852) #6, !dbg !58 + %.0.i691 = select i1 %.not3.i690, float %3855, float %3854, !dbg !58 + %3856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i692 = icmp eq i32 %3856, 0, !dbg !58 + %3857 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3E54AE0C00000000, float %.0.i691) #6, !dbg !58 + %3858 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3E54AE0C00000000, float %.0.i691) #6, !dbg !58 + %.01.i693 = select i1 %.not4.i692, float %3858, float %3857, !dbg !58 + %3859 = bitcast float %.04.i689 to i32, !dbg !58 + %3860 = shl i32 %3859, 23, !dbg !58 + %3861 = bitcast i32 %3860 to float, !dbg !58 + %3862 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i693) #6, !dbg !58 + %3863 = fmul float %3862, %3861, !dbg !58 + %3864 = tail call float @llvm.nvvm.div.full(float %3181, float %2962), !dbg !59 + %3865 = tail call float @llvm.nvvm.div.full(float %3203, float %2962), !dbg !59 + %3866 = tail call float @llvm.nvvm.div.full(float %3225, float %2962), !dbg !59 + %3867 = tail call float @llvm.nvvm.div.full(float %3247, float %2962), !dbg !59 + %3868 = tail call float @llvm.nvvm.div.full(float %3269, float %2962), !dbg !59 + %3869 = tail call float @llvm.nvvm.div.full(float %3291, float %2962), !dbg !59 + %3870 = tail call float @llvm.nvvm.div.full(float %3313, float %2962), !dbg !59 + %3871 = tail call float @llvm.nvvm.div.full(float %3335, float %2962), !dbg !59 + %3872 = tail call float @llvm.nvvm.div.full(float %3357, float %2962), !dbg !59 + %3873 = tail call float @llvm.nvvm.div.full(float %3379, float %2962), !dbg !59 + %3874 = tail call float @llvm.nvvm.div.full(float %3401, float %2962), !dbg !59 + %3875 = tail call float @llvm.nvvm.div.full(float %3423, float %2962), !dbg !59 + %3876 = tail call float @llvm.nvvm.div.full(float %3445, float %2962), !dbg !59 + %3877 = tail call float @llvm.nvvm.div.full(float %3467, float %2962), !dbg !59 + %3878 = tail call float @llvm.nvvm.div.full(float %3489, float %2962), !dbg !59 + %3879 = tail call float @llvm.nvvm.div.full(float %3511, float %2962), !dbg !59 + %3880 = tail call float @llvm.nvvm.div.full(float %3533, float %2962), !dbg !59 + %3881 = tail call float @llvm.nvvm.div.full(float %3555, float %2962), !dbg !59 + %3882 = tail call float @llvm.nvvm.div.full(float %3577, float %2962), !dbg !59 + %3883 = tail call float @llvm.nvvm.div.full(float %3599, float %2962), !dbg !59 + %3884 = tail call float @llvm.nvvm.div.full(float %3621, float %2962), !dbg !59 + %3885 = tail call float @llvm.nvvm.div.full(float %3643, float %2962), !dbg !59 + %3886 = tail call float @llvm.nvvm.div.full(float %3665, float %2962), !dbg !59 + %3887 = tail call float @llvm.nvvm.div.full(float %3687, float %2962), !dbg !59 + %3888 = tail call float @llvm.nvvm.div.full(float %3709, float %2962), !dbg !59 + %3889 = tail call float @llvm.nvvm.div.full(float %3731, float %2962), !dbg !59 + %3890 = tail call float @llvm.nvvm.div.full(float %3753, float %2962), !dbg !59 + %3891 = tail call float @llvm.nvvm.div.full(float %3775, float %2962), !dbg !59 + %3892 = tail call float @llvm.nvvm.div.full(float %3797, float %2962), !dbg !59 + %3893 = tail call float @llvm.nvvm.div.full(float %3819, float %2962), !dbg !59 + %3894 = tail call float @llvm.nvvm.div.full(float %3841, float %2962), !dbg !59 + %3895 = tail call float @llvm.nvvm.div.full(float %3863, float %2962), !dbg !59 + %3896 = sext i32 %2995 to i64, !dbg !60 + %3897 = getelementptr float, ptr addrspace(1) %1, i64 %3896, !dbg !60 + %3898 = sext i32 %2998 to i64, !dbg !60 + %3899 = getelementptr float, ptr addrspace(1) %1, i64 %3898, !dbg !60 + %3900 = sext i32 %3001 to i64, !dbg !60 + %3901 = getelementptr float, ptr addrspace(1) %1, i64 %3900, !dbg !60 + %3902 = sext i32 %3004 to i64, !dbg !60 + %3903 = getelementptr float, ptr addrspace(1) %1, i64 %3902, !dbg !60 + %3904 = sext i32 %3007 to i64, !dbg !60 + %3905 = getelementptr float, ptr addrspace(1) %1, i64 %3904, !dbg !60 + %3906 = sext i32 %3010 to i64, !dbg !60 + %3907 = getelementptr float, ptr addrspace(1) %1, i64 %3906, !dbg !60 + %3908 = sext i32 %3013 to i64, !dbg !60 + %3909 = getelementptr float, ptr addrspace(1) %1, i64 %3908, !dbg !60 + %3910 = sext i32 %3015 to i64, !dbg !60 + %3911 = getelementptr float, ptr addrspace(1) %1, i64 %3910, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3912 = insertelement <4 x float> poison, float %3864, i64 0, !dbg !61 + %3913 = insertelement <4 x float> %3912, float %3865, i64 1, !dbg !61 + %3914 = insertelement <4 x float> %3913, float %3866, i64 2, !dbg !61 + %3915 = insertelement <4 x float> %3914, float %3867, i64 3, !dbg !61 + store <4 x float> %3915, ptr addrspace(3) %2964, align 16, !dbg !61 + %3916 = insertelement <4 x float> poison, float %3868, i64 0, !dbg !61 + %3917 = insertelement <4 x float> %3916, float %3869, i64 1, !dbg !61 + %3918 = insertelement <4 x float> %3917, float %3870, i64 2, !dbg !61 + %3919 = insertelement <4 x float> %3918, float %3871, i64 3, !dbg !61 + store <4 x float> %3919, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3920 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3921 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3922 = insertelement <4 x float> poison, float %3872, i64 0, !dbg !61 + %3923 = insertelement <4 x float> %3922, float %3873, i64 1, !dbg !61 + %3924 = insertelement <4 x float> %3923, float %3874, i64 2, !dbg !61 + %3925 = insertelement <4 x float> %3924, float %3875, i64 3, !dbg !61 + store <4 x float> %3925, ptr addrspace(3) %2964, align 16, !dbg !61 + %3926 = insertelement <4 x float> poison, float %3876, i64 0, !dbg !61 + %3927 = insertelement <4 x float> %3926, float %3877, i64 1, !dbg !61 + %3928 = insertelement <4 x float> %3927, float %3878, i64 2, !dbg !61 + %3929 = insertelement <4 x float> %3928, float %3879, i64 3, !dbg !61 + store <4 x float> %3929, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3930 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3931 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3932 = insertelement <4 x float> poison, float %3880, i64 0, !dbg !61 + %3933 = insertelement <4 x float> %3932, float %3881, i64 1, !dbg !61 + %3934 = insertelement <4 x float> %3933, float %3882, i64 2, !dbg !61 + %3935 = insertelement <4 x float> %3934, float %3883, i64 3, !dbg !61 + store <4 x float> %3935, ptr addrspace(3) %2964, align 16, !dbg !61 + %3936 = insertelement <4 x float> poison, float %3884, i64 0, !dbg !61 + %3937 = insertelement <4 x float> %3936, float %3885, i64 1, !dbg !61 + %3938 = insertelement <4 x float> %3937, float %3886, i64 2, !dbg !61 + %3939 = insertelement <4 x float> %3938, float %3887, i64 3, !dbg !61 + store <4 x float> %3939, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3940 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3941 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3942 = insertelement <4 x float> poison, float %3888, i64 0, !dbg !61 + %3943 = insertelement <4 x float> %3942, float %3889, i64 1, !dbg !61 + %3944 = insertelement <4 x float> %3943, float %3890, i64 2, !dbg !61 + %3945 = insertelement <4 x float> %3944, float %3891, i64 3, !dbg !61 + store <4 x float> %3945, ptr addrspace(3) %2964, align 16, !dbg !61 + %3946 = insertelement <4 x float> poison, float %3892, i64 0, !dbg !61 + %3947 = insertelement <4 x float> %3946, float %3893, i64 1, !dbg !61 + %3948 = insertelement <4 x float> %3947, float %3894, i64 2, !dbg !61 + %3949 = insertelement <4 x float> %3948, float %3895, i64 3, !dbg !61 + store <4 x float> %3949, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3950 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3951 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + %.extract = extractelement <4 x i32> %3920, i64 0, !dbg !61 + %.extract33 = extractelement <4 x i32> %3920, i64 1, !dbg !61 + %.extract34 = extractelement <4 x i32> %3920, i64 2, !dbg !61 + %.extract35 = extractelement <4 x i32> %3920, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract, i32 %.extract33, i32 %.extract34, i32 %.extract35, ptr addrspace(1) %3897, i1 true) #6, !dbg !61 + %.extract36 = extractelement <4 x i32> %3921, i64 0, !dbg !61 + %.extract37 = extractelement <4 x i32> %3921, i64 1, !dbg !61 + %.extract38 = extractelement <4 x i32> %3921, i64 2, !dbg !61 + %.extract39 = extractelement <4 x i32> %3921, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract36, i32 %.extract37, i32 %.extract38, i32 %.extract39, ptr addrspace(1) %3899, i1 true) #6, !dbg !61 + %.extract40 = extractelement <4 x i32> %3930, i64 0, !dbg !61 + %.extract41 = extractelement <4 x i32> %3930, i64 1, !dbg !61 + %.extract42 = extractelement <4 x i32> %3930, i64 2, !dbg !61 + %.extract43 = extractelement <4 x i32> %3930, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract40, i32 %.extract41, i32 %.extract42, i32 %.extract43, ptr addrspace(1) %3901, i1 true) #6, !dbg !61 + %.extract44 = extractelement <4 x i32> %3931, i64 0, !dbg !61 + %.extract45 = extractelement <4 x i32> %3931, i64 1, !dbg !61 + %.extract46 = extractelement <4 x i32> %3931, i64 2, !dbg !61 + %.extract47 = extractelement <4 x i32> %3931, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract44, i32 %.extract45, i32 %.extract46, i32 %.extract47, ptr addrspace(1) %3903, i1 true) #6, !dbg !61 + %.extract48 = extractelement <4 x i32> %3940, i64 0, !dbg !61 + %.extract49 = extractelement <4 x i32> %3940, i64 1, !dbg !61 + %.extract50 = extractelement <4 x i32> %3940, i64 2, !dbg !61 + %.extract51 = extractelement <4 x i32> %3940, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract48, i32 %.extract49, i32 %.extract50, i32 %.extract51, ptr addrspace(1) %3905, i1 true) #6, !dbg !61 + %.extract52 = extractelement <4 x i32> %3941, i64 0, !dbg !61 + %.extract53 = extractelement <4 x i32> %3941, i64 1, !dbg !61 + %.extract54 = extractelement <4 x i32> %3941, i64 2, !dbg !61 + %.extract55 = extractelement <4 x i32> %3941, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract52, i32 %.extract53, i32 %.extract54, i32 %.extract55, ptr addrspace(1) %3907, i1 true) #6, !dbg !61 + %.extract56 = extractelement <4 x i32> %3950, i64 0, !dbg !61 + %.extract57 = extractelement <4 x i32> %3950, i64 1, !dbg !61 + %.extract58 = extractelement <4 x i32> %3950, i64 2, !dbg !61 + %.extract59 = extractelement <4 x i32> %3950, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract56, i32 %.extract57, i32 %.extract58, i32 %.extract59, ptr addrspace(1) %3909, i1 true) #6, !dbg !61 + %.extract60 = extractelement <4 x i32> %3951, i64 0, !dbg !61 + %.extract61 = extractelement <4 x i32> %3951, i64 1, !dbg !61 + %.extract62 = extractelement <4 x i32> %3951, i64 2, !dbg !61 + %.extract63 = extractelement <4 x i32> %3951, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract60, i32 %.extract61, i32 %.extract62, i32 %.extract63, ptr addrspace(1) %3911, i1 %2982) #6, !dbg !61 + br i1 %2977, label %2976, label %3952, !dbg !50 + +3952: ; preds = %2976 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 37, column: 47, scope: !5) +!11 = !DILocation(line: 31, column: 40, scope: !5) +!12 = !DILocation(line: 32, column: 31, scope: !5) +!13 = !DILocation(line: 33, column: 29, scope: !5) +!14 = !DILocation(line: 37, column: 41, scope: !5) +!15 = !DILocation(line: 37, column: 34, scope: !5) +!16 = !DILocation(line: 37, column: 52, scope: !5) +!17 = !DILocation(line: 37, column: 105, scope: !5) +!18 = !DILocation(line: 110, column: 15, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 40, scope: !5) +!22 = !DILocation(line: 112, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 112, column: 16, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 113, column: 29, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 199, column: 53, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 196, column: 19, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 196, column: 53, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 173, column: 29, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 196, column: 39, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 199, column: 39, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 205, column: 24, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 205, column: 36, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 45, column: 54, scope: !5) +!34 = !DILocation(line: 46, column: 54, scope: !5) +!35 = !DILocation(line: 110, column: 15, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 49, column: 33, scope: !5) +!37 = !DILocation(line: 112, column: 21, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 112, column: 16, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 113, column: 29, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 123, column: 29, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 180, column: 40, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 180, column: 68, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 180, column: 58, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 173, column: 29, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 181, column: 31, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !36) +!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !36) +!50 = !DILocation(line: 52, column: 40, scope: !5) +!51 = !DILocation(line: 53, column: 31, scope: !5) +!52 = !DILocation(line: 54, column: 29, scope: !5) +!53 = !DILocation(line: 58, column: 41, scope: !5) +!54 = !DILocation(line: 58, column: 34, scope: !5) +!55 = !DILocation(line: 58, column: 52, scope: !5) +!56 = !DILocation(line: 58, column: 106, scope: !5) +!57 = !DILocation(line: 60, column: 22, scope: !5) +!58 = !DILocation(line: 61, column: 29, scope: !5) +!59 = !DILocation(line: 62, column: 23, scope: !5) +!60 = !DILocation(line: 63, column: 29, scope: !5) +!61 = !DILocation(line: 63, column: 53, scope: !5) +!62 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ba2eb372cb3e9ef79515991d03caabf69388831e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,3186 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<208>; + .reg .b16 %rs<65>; + .reg .b32 %r<2200>; + .reg .b64 %rd<274>; + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 + +// %bb.0: + ld.param.b64 %rd58, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd57, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:23:28 + mov.u32 %r46, %ctaid.x; + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 511; + shl.b32 %r3, %r2, 3; + or.b32 %r4, %r3, 4096; + or.b32 %r5, %r3, 8192; + or.b32 %r47, %r3, 12288; + .loc 1 37 47 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:47 + mul.lo.s32 %r6, %r46, 32000; + mov.b32 %r2198, 0fFF800000; + mov.b64 %rd248, {%r2198, %r2198}; + mov.b32 %r2188, 0f00000000; + mov.b64 %rd247, {%r2188, %r2188}; + .loc 1 31 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:31:40 + cvt.u64.u32 %rd1, %r47; + mov.b64 %rd246, 0; + mov.pred %p4, -1; + mov.pred %p206, %p4; + mov.b32 %r2189, %r2188; + mov.b32 %r2190, %r2188; + mov.b32 %r2191, %r2188; + mov.b32 %r2192, %r2188; + mov.b32 %r2193, %r2188; + mov.b32 %r2194, %r2188; + mov.b32 %r2195, %r2188; + mov.b32 %r2196, %r2188; + mov.b32 %r2197, %r2188; + mov.b32 %r2199, %r2198; + mov.b64 %rd249, %rd247; + mov.b64 %rd250, %rd248; + mov.b64 %rd251, %rd247; + mov.b64 %rd252, %rd248; + mov.b64 %rd253, %rd247; + mov.b64 %rd254, %rd247; + mov.b64 %rd255, %rd247; + mov.b64 %rd256, %rd247; + mov.b64 %rd257, %rd247; + mov.b64 %rd258, %rd247; + mov.b64 %rd259, %rd247; + mov.b64 %rd260, %rd247; + mov.b64 %rd261, %rd248; + mov.b64 %rd262, %rd248; + mov.b64 %rd263, %rd248; + mov.b64 %rd264, %rd248; + mov.b64 %rd265, %rd248; + mov.b64 %rd266, %rd248; + mov.b64 %rd267, %rd248; + mov.b64 %rd268, %rd248; + mov.b64 %rd269, %rd248; + mov.b64 %rd270, %rd248; + mov.b64 %rd271, %rd248; + mov.b64 %rd272, %rd248; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:0:40 + mov.pred %p1, %p206; + .loc 1 32 31 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:32:31 + or.b64 %rd95, %rd246, %rd1; + .loc 1 33 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:33:29 + setp.lt.u64 %p7, %rd95, 32000; + .loc 1 37 41 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:41 + cvt.u32.u64 %r80, %rd246; + or.b32 %r81, %r3, %r80; + add.s32 %r82, %r81, %r6; + or.b32 %r83, %r4, %r80; + add.s32 %r84, %r83, %r6; + or.b32 %r85, %r5, %r80; + add.s32 %r86, %r85, %r6; + cvt.u32.u64 %r87, %rd95; + add.s32 %r88, %r6, %r87; + .loc 1 37 34 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:34 + mad.wide.s32 %rd83, %r82, 2, %rd57; + mad.wide.s32 %rd86, %r84, 2, %rd57; + mad.wide.s32 %rd89, %r86, 2, %rd57; + mad.wide.s32 %rd92, %r88, 2, %rd57; + .loc 1 37 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:52 + // begin inline asm + mov.u64 %rd82, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd82, 1.0; + // end inline asm + mov.b32 %r52, 0; + // begin inline asm + mov.u32 %r48, %r52; + mov.u32 %r49, %r52; + mov.u32 %r50, %r52; + mov.u32 %r51, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd83 + 0 ], %rd82; + // end inline asm + // begin inline asm + mov.u64 %rd85, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd85, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r56, %r52; + mov.u32 %r57, %r52; + mov.u32 %r58, %r52; + mov.u32 %r59, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd86 + 0 ], %rd85; + // end inline asm + // begin inline asm + mov.u64 %rd88, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r64, %r52; + mov.u32 %r65, %r52; + mov.u32 %r66, %r52; + mov.u32 %r67, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r64, %r65, %r66, %r67 }, [ %rd89 + 0 ], %rd88; + // end inline asm + // begin inline asm + mov.u64 %rd91, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd91, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r72, %r52; + mov.u32 %r73, %r52; + mov.u32 %r74, %r52; + mov.u32 %r75, %r52; + @%p7 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r72, %r73, %r74, %r75 }, [ %rd92 + 0 ], %rd91; + // end inline asm + mov.b32 {%rs1, %rs2}, %r72; + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + cvt.f32.bf16 %r89, %rs1; + cvt.f32.bf16 %r90, %rs2; +$L__tmp1: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p9, %r2198, %r89; + setp.gt.f32 %p10, %r2199, %r90; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r91, %r92}, %rd265; + setp.nan.f32 %p11, %r91, %r91; + setp.nan.f32 %p12, %r92, %r92; + mov.b64 {%r93, %r94}, %rd266; + setp.nan.f32 %p13, %r93, %r93; + setp.nan.f32 %p14, %r94, %r94; + mov.b64 {%r95, %r96}, %rd267; + setp.nan.f32 %p15, %r95, %r95; + setp.nan.f32 %p16, %r96, %r96; + mov.b64 {%r97, %r98}, %rd268; + setp.nan.f32 %p17, %r97, %r97; + setp.nan.f32 %p18, %r98, %r98; + mov.b64 {%r99, %r100}, %rd261; + setp.nan.f32 %p19, %r99, %r99; + setp.nan.f32 %p20, %r100, %r100; + mov.b64 {%r101, %r102}, %rd262; + setp.nan.f32 %p21, %r101, %r101; + setp.nan.f32 %p22, %r102, %r102; + mov.b64 {%r103, %r104}, %rd263; + setp.nan.f32 %p23, %r103, %r103; + setp.nan.f32 %p24, %r104, %r104; + mov.b64 {%r105, %r106}, %rd264; + setp.nan.f32 %p25, %r105, %r105; + setp.nan.f32 %p26, %r106, %r106; + mov.b64 {%r107, %r108}, %rd269; + setp.nan.f32 %p27, %r107, %r107; + setp.nan.f32 %p28, %r108, %r108; + mov.b64 {%r109, %r110}, %rd270; + setp.nan.f32 %p29, %r109, %r109; + setp.nan.f32 %p30, %r110, %r110; + mov.b64 {%r111, %r112}, %rd271; + setp.nan.f32 %p31, %r111, %r111; + setp.nan.f32 %p32, %r112, %r112; + mov.b64 {%r113, %r114}, %rd272; + setp.nan.f32 %p33, %r113, %r113; + setp.nan.f32 %p34, %r114, %r114; + setp.nan.f32 %p35, %r2198, %r2198; + setp.nan.f32 %p36, %r2199, %r2199; + mov.b64 {%r115, %r116}, %rd252; + setp.nan.f32 %p37, %r115, %r115; + setp.nan.f32 %p38, %r116, %r116; + mov.b64 {%r117, %r118}, %rd250; + setp.nan.f32 %p39, %r117, %r117; + setp.nan.f32 %p40, %r118, %r118; + mov.b64 {%r119, %r120}, %rd248; + setp.nan.f32 %p41, %r119, %r119; + setp.nan.f32 %p42, %r120, %r120; +$L__tmp2: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs3, %rs4}, %r67; + cvt.f32.bf16 %r121, %rs4; + cvt.f32.bf16 %r122, %rs3; + mov.b32 {%rs5, %rs6}, %r66; + cvt.f32.bf16 %r123, %rs6; + cvt.f32.bf16 %r124, %rs5; + mov.b32 {%rs7, %rs8}, %r65; + cvt.f32.bf16 %r125, %rs8; + cvt.f32.bf16 %r126, %rs7; + mov.b32 {%rs9, %rs10}, %r64; + cvt.f32.bf16 %r127, %rs10; + cvt.f32.bf16 %r128, %rs9; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p43, %r107, %r128; + setp.gt.f32 %p44, %r108, %r127; + setp.gt.f32 %p45, %r109, %r126; + setp.gt.f32 %p46, %r110, %r125; + setp.gt.f32 %p47, %r111, %r124; + setp.gt.f32 %p48, %r112, %r123; + setp.gt.f32 %p49, %r113, %r122; + setp.gt.f32 %p50, %r114, %r121; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r129, %r114, %r121, %p50; + selp.f32 %r26, %r114, %r129, %p34; + selp.f32 %r130, %r113, %r122, %p49; + selp.f32 %r25, %r113, %r130, %p33; + mov.b64 %rd272, {%r25, %r26}; + selp.f32 %r131, %r112, %r123, %p48; + selp.f32 %r24, %r112, %r131, %p32; + selp.f32 %r132, %r111, %r124, %p47; + selp.f32 %r23, %r111, %r132, %p31; + mov.b64 %rd271, {%r23, %r24}; + selp.f32 %r133, %r110, %r125, %p46; + selp.f32 %r22, %r110, %r133, %p30; + selp.f32 %r134, %r109, %r126, %p45; + selp.f32 %r21, %r109, %r134, %p29; + mov.b64 %rd270, {%r21, %r22}; + selp.f32 %r135, %r108, %r127, %p44; + selp.f32 %r20, %r108, %r135, %p28; + selp.f32 %r136, %r107, %r128, %p43; + selp.f32 %r19, %r107, %r136, %p27; + mov.b64 %rd269, {%r19, %r20}; + selp.f32 %r137, %r2198, %r89, %p9; + selp.f32 %r138, %r2198, %r137, %p35; + selp.f32 %r139, %r2199, %r90, %p10; + selp.f32 %r140, %r2199, %r139, %p36; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p51, %r19, 0fFF800000; + setp.eq.f32 %p52, %r20, 0fFF800000; + setp.eq.f32 %p53, %r21, 0fFF800000; + setp.eq.f32 %p54, %r22, 0fFF800000; + setp.eq.f32 %p55, %r23, 0fFF800000; + setp.eq.f32 %p56, %r24, 0fFF800000; + setp.eq.f32 %p57, %r25, 0fFF800000; + setp.eq.f32 %p58, %r26, 0fFF800000; + setp.eq.f32 %p59, %r138, 0fFF800000; + setp.eq.f32 %p60, %r140, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r141, %r114, %r26; + sub.f32 %r142, %r113, %r25; + sub.f32 %r143, %r112, %r24; + sub.f32 %r144, %r111, %r23; + sub.f32 %r145, %r110, %r22; + sub.f32 %r146, %r109, %r21; + sub.f32 %r147, %r108, %r20; + sub.f32 %r148, %r107, %r19; + sub.f32 %r149, %r2198, %r138; + sub.f32 %r150, %r2199, %r140; + mov.b32 %r151, 0f3F000000; + mov.b32 %r152, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r153, %r148, %r152, %r151; + cvt.ftz.sat.f32.f32 %r154, %r153; + mov.b32 %r155, 0f4B400001; + mov.b32 %r156, 0f437C0000; + fma.rm.ftz.f32 %r157, %r154, %r156, %r155; + add.f32 %r158, %r157, 0fCB40007F; + neg.f32 %r159, %r158; + mov.b32 %r160, 0f3FB8AA3B; + fma.rn.ftz.f32 %r161, %r148, %r160, %r159; + mov.b32 %r162, 0f32A57060; + fma.rn.ftz.f32 %r163, %r148, %r162, %r161; + shl.b32 %r164, %r157, 23; + ex2.approx.ftz.f32 %r165, %r163; + mul.f32 %r166, %r165, %r164; + fma.rn.ftz.f32 %r167, %r147, %r152, %r151; + cvt.ftz.sat.f32.f32 %r168, %r167; + fma.rm.ftz.f32 %r169, %r168, %r156, %r155; + add.f32 %r170, %r169, 0fCB40007F; + neg.f32 %r171, %r170; + fma.rn.ftz.f32 %r172, %r147, %r160, %r171; + fma.rn.ftz.f32 %r173, %r147, %r162, %r172; + shl.b32 %r174, %r169, 23; + ex2.approx.ftz.f32 %r175, %r173; + mul.f32 %r176, %r175, %r174; + fma.rn.ftz.f32 %r177, %r146, %r152, %r151; + cvt.ftz.sat.f32.f32 %r178, %r177; + fma.rm.ftz.f32 %r179, %r178, %r156, %r155; + add.f32 %r180, %r179, 0fCB40007F; + neg.f32 %r181, %r180; + fma.rn.ftz.f32 %r182, %r146, %r160, %r181; + fma.rn.ftz.f32 %r183, %r146, %r162, %r182; + shl.b32 %r184, %r179, 23; + ex2.approx.ftz.f32 %r185, %r183; + mul.f32 %r186, %r185, %r184; + fma.rn.ftz.f32 %r187, %r145, %r152, %r151; + cvt.ftz.sat.f32.f32 %r188, %r187; + fma.rm.ftz.f32 %r189, %r188, %r156, %r155; + add.f32 %r190, %r189, 0fCB40007F; + neg.f32 %r191, %r190; + fma.rn.ftz.f32 %r192, %r145, %r160, %r191; + fma.rn.ftz.f32 %r193, %r145, %r162, %r192; + shl.b32 %r194, %r189, 23; + ex2.approx.ftz.f32 %r195, %r193; + mul.f32 %r196, %r195, %r194; + fma.rn.ftz.f32 %r197, %r144, %r152, %r151; + cvt.ftz.sat.f32.f32 %r198, %r197; + fma.rm.ftz.f32 %r199, %r198, %r156, %r155; + add.f32 %r200, %r199, 0fCB40007F; + neg.f32 %r201, %r200; + fma.rn.ftz.f32 %r202, %r144, %r160, %r201; + fma.rn.ftz.f32 %r203, %r144, %r162, %r202; + shl.b32 %r204, %r199, 23; + ex2.approx.ftz.f32 %r205, %r203; + mul.f32 %r206, %r205, %r204; + fma.rn.ftz.f32 %r207, %r143, %r152, %r151; + cvt.ftz.sat.f32.f32 %r208, %r207; + fma.rm.ftz.f32 %r209, %r208, %r156, %r155; + add.f32 %r210, %r209, 0fCB40007F; + neg.f32 %r211, %r210; + fma.rn.ftz.f32 %r212, %r143, %r160, %r211; + fma.rn.ftz.f32 %r213, %r143, %r162, %r212; + shl.b32 %r214, %r209, 23; + ex2.approx.ftz.f32 %r215, %r213; + mul.f32 %r216, %r215, %r214; + fma.rn.ftz.f32 %r217, %r142, %r152, %r151; + cvt.ftz.sat.f32.f32 %r218, %r217; + fma.rm.ftz.f32 %r219, %r218, %r156, %r155; + add.f32 %r220, %r219, 0fCB40007F; + neg.f32 %r221, %r220; + fma.rn.ftz.f32 %r222, %r142, %r160, %r221; + fma.rn.ftz.f32 %r223, %r142, %r162, %r222; + shl.b32 %r224, %r219, 23; + ex2.approx.ftz.f32 %r225, %r223; + mul.f32 %r226, %r225, %r224; + fma.rn.ftz.f32 %r227, %r141, %r152, %r151; + cvt.ftz.sat.f32.f32 %r228, %r227; + fma.rm.ftz.f32 %r229, %r228, %r156, %r155; + add.f32 %r230, %r229, 0fCB40007F; + neg.f32 %r231, %r230; + fma.rn.ftz.f32 %r232, %r141, %r160, %r231; + fma.rn.ftz.f32 %r233, %r141, %r162, %r232; + shl.b32 %r234, %r229, 23; + ex2.approx.ftz.f32 %r235, %r233; + mul.f32 %r236, %r235, %r234; + fma.rn.ftz.f32 %r237, %r149, %r152, %r151; + cvt.ftz.sat.f32.f32 %r238, %r237; + fma.rm.ftz.f32 %r239, %r238, %r156, %r155; + add.f32 %r240, %r239, 0fCB40007F; + neg.f32 %r241, %r240; + fma.rn.ftz.f32 %r242, %r149, %r160, %r241; + fma.rn.ftz.f32 %r243, %r149, %r162, %r242; + shl.b32 %r244, %r239, 23; + ex2.approx.ftz.f32 %r245, %r243; + mul.f32 %r246, %r245, %r244; + fma.rn.ftz.f32 %r247, %r150, %r152, %r151; + cvt.ftz.sat.f32.f32 %r248, %r247; + fma.rm.ftz.f32 %r249, %r248, %r156, %r155; + add.f32 %r250, %r249, 0fCB40007F; + neg.f32 %r251, %r250; + fma.rn.ftz.f32 %r252, %r150, %r160, %r251; + fma.rn.ftz.f32 %r253, %r150, %r162, %r252; + shl.b32 %r254, %r249, 23; + ex2.approx.ftz.f32 %r255, %r253; + mul.f32 %r256, %r255, %r254; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r257, 0f3F800000, %r166, %p51; + selp.f32 %r258, 0f3F800000, %r176, %p52; + selp.f32 %r259, 0f3F800000, %r186, %p53; + selp.f32 %r260, 0f3F800000, %r196, %p54; + selp.f32 %r261, 0f3F800000, %r206, %p55; + selp.f32 %r262, 0f3F800000, %r216, %p56; + selp.f32 %r263, 0f3F800000, %r226, %p57; + selp.f32 %r264, 0f3F800000, %r236, %p58; + selp.f32 %r265, 0f3F800000, %r246, %p59; + selp.f32 %r266, 0f3F800000, %r256, %p60; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r267, %r121, %r26; + sub.f32 %r268, %r122, %r25; + sub.f32 %r269, %r123, %r24; + sub.f32 %r270, %r124, %r23; + sub.f32 %r271, %r125, %r22; + sub.f32 %r272, %r126, %r21; + sub.f32 %r273, %r127, %r20; + sub.f32 %r274, %r128, %r19; + sub.f32 %r275, %r89, %r138; + sub.f32 %r276, %r90, %r140; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r277, %r274, %r152, %r151; + cvt.ftz.sat.f32.f32 %r278, %r277; + fma.rm.ftz.f32 %r279, %r278, %r156, %r155; + add.f32 %r280, %r279, 0fCB40007F; + neg.f32 %r281, %r280; + fma.rn.ftz.f32 %r282, %r274, %r160, %r281; + fma.rn.ftz.f32 %r283, %r274, %r162, %r282; + shl.b32 %r284, %r279, 23; + ex2.approx.ftz.f32 %r285, %r283; + mul.f32 %r286, %r285, %r284; + fma.rn.ftz.f32 %r287, %r273, %r152, %r151; + cvt.ftz.sat.f32.f32 %r288, %r287; + fma.rm.ftz.f32 %r289, %r288, %r156, %r155; + add.f32 %r290, %r289, 0fCB40007F; + neg.f32 %r291, %r290; + fma.rn.ftz.f32 %r292, %r273, %r160, %r291; + fma.rn.ftz.f32 %r293, %r273, %r162, %r292; + shl.b32 %r294, %r289, 23; + ex2.approx.ftz.f32 %r295, %r293; + mul.f32 %r296, %r295, %r294; + fma.rn.ftz.f32 %r297, %r272, %r152, %r151; + cvt.ftz.sat.f32.f32 %r298, %r297; + fma.rm.ftz.f32 %r299, %r298, %r156, %r155; + add.f32 %r300, %r299, 0fCB40007F; + neg.f32 %r301, %r300; + fma.rn.ftz.f32 %r302, %r272, %r160, %r301; + fma.rn.ftz.f32 %r303, %r272, %r162, %r302; + shl.b32 %r304, %r299, 23; + ex2.approx.ftz.f32 %r305, %r303; + mul.f32 %r306, %r305, %r304; + fma.rn.ftz.f32 %r307, %r271, %r152, %r151; + cvt.ftz.sat.f32.f32 %r308, %r307; + fma.rm.ftz.f32 %r309, %r308, %r156, %r155; + add.f32 %r310, %r309, 0fCB40007F; + neg.f32 %r311, %r310; + fma.rn.ftz.f32 %r312, %r271, %r160, %r311; + fma.rn.ftz.f32 %r313, %r271, %r162, %r312; + shl.b32 %r314, %r309, 23; + ex2.approx.ftz.f32 %r315, %r313; + mul.f32 %r316, %r315, %r314; + fma.rn.ftz.f32 %r317, %r270, %r152, %r151; + cvt.ftz.sat.f32.f32 %r318, %r317; + fma.rm.ftz.f32 %r319, %r318, %r156, %r155; + add.f32 %r320, %r319, 0fCB40007F; + neg.f32 %r321, %r320; + fma.rn.ftz.f32 %r322, %r270, %r160, %r321; + fma.rn.ftz.f32 %r323, %r270, %r162, %r322; + shl.b32 %r324, %r319, 23; + ex2.approx.ftz.f32 %r325, %r323; + mul.f32 %r326, %r325, %r324; + fma.rn.ftz.f32 %r327, %r269, %r152, %r151; + cvt.ftz.sat.f32.f32 %r328, %r327; + fma.rm.ftz.f32 %r329, %r328, %r156, %r155; + add.f32 %r330, %r329, 0fCB40007F; + neg.f32 %r331, %r330; + fma.rn.ftz.f32 %r332, %r269, %r160, %r331; + fma.rn.ftz.f32 %r333, %r269, %r162, %r332; + shl.b32 %r334, %r329, 23; + ex2.approx.ftz.f32 %r335, %r333; + mul.f32 %r336, %r335, %r334; + fma.rn.ftz.f32 %r337, %r268, %r152, %r151; + cvt.ftz.sat.f32.f32 %r338, %r337; + fma.rm.ftz.f32 %r339, %r338, %r156, %r155; + add.f32 %r340, %r339, 0fCB40007F; + neg.f32 %r341, %r340; + fma.rn.ftz.f32 %r342, %r268, %r160, %r341; + fma.rn.ftz.f32 %r343, %r268, %r162, %r342; + shl.b32 %r344, %r339, 23; + ex2.approx.ftz.f32 %r345, %r343; + mul.f32 %r346, %r345, %r344; + fma.rn.ftz.f32 %r347, %r267, %r152, %r151; + cvt.ftz.sat.f32.f32 %r348, %r347; + fma.rm.ftz.f32 %r349, %r348, %r156, %r155; + add.f32 %r350, %r349, 0fCB40007F; + neg.f32 %r351, %r350; + fma.rn.ftz.f32 %r352, %r267, %r160, %r351; + fma.rn.ftz.f32 %r353, %r267, %r162, %r352; + shl.b32 %r354, %r349, 23; + ex2.approx.ftz.f32 %r355, %r353; + mul.f32 %r356, %r355, %r354; + fma.rn.ftz.f32 %r357, %r275, %r152, %r151; + cvt.ftz.sat.f32.f32 %r358, %r357; + fma.rm.ftz.f32 %r359, %r358, %r156, %r155; + add.f32 %r360, %r359, 0fCB40007F; + neg.f32 %r361, %r360; + fma.rn.ftz.f32 %r362, %r275, %r160, %r361; + fma.rn.ftz.f32 %r363, %r275, %r162, %r362; + shl.b32 %r364, %r359, 23; + ex2.approx.ftz.f32 %r365, %r363; + mul.f32 %r366, %r365, %r364; + fma.rn.ftz.f32 %r367, %r276, %r152, %r151; + cvt.ftz.sat.f32.f32 %r368, %r367; + fma.rm.ftz.f32 %r369, %r368, %r156, %r155; + add.f32 %r370, %r369, 0fCB40007F; + neg.f32 %r371, %r370; + fma.rn.ftz.f32 %r372, %r276, %r160, %r371; + fma.rn.ftz.f32 %r373, %r276, %r162, %r372; + shl.b32 %r374, %r369, 23; + ex2.approx.ftz.f32 %r375, %r373; + mul.f32 %r376, %r375, %r374; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r377, 0f3F800000, %r286, %p51; + selp.f32 %r378, 0f3F800000, %r296, %p52; + selp.f32 %r379, 0f3F800000, %r306, %p53; + selp.f32 %r380, 0f3F800000, %r316, %p54; + selp.f32 %r381, 0f3F800000, %r326, %p55; + selp.f32 %r382, 0f3F800000, %r336, %p56; + selp.f32 %r383, 0f3F800000, %r346, %p57; + selp.f32 %r384, 0f3F800000, %r356, %p58; + selp.f32 %r385, 0f3F800000, %r366, %p59; + selp.f32 %r386, 0f3F800000, %r376, %p60; +$L__tmp4: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs11, %rs12}, %r51; + cvt.f32.bf16 %r387, %rs12; + cvt.f32.bf16 %r388, %rs11; + mov.b32 {%rs13, %rs14}, %r50; + cvt.f32.bf16 %r389, %rs14; + cvt.f32.bf16 %r390, %rs13; + mov.b32 {%rs15, %rs16}, %r49; + cvt.f32.bf16 %r391, %rs16; + cvt.f32.bf16 %r392, %rs15; + mov.b32 {%rs17, %rs18}, %r48; + cvt.f32.bf16 %r393, %rs18; + cvt.f32.bf16 %r394, %rs17; + mov.b32 {%rs19, %rs20}, %r59; + cvt.f32.bf16 %r395, %rs20; + cvt.f32.bf16 %r396, %rs19; + mov.b32 {%rs21, %rs22}, %r58; + cvt.f32.bf16 %r397, %rs22; + cvt.f32.bf16 %r398, %rs21; + mov.b32 {%rs23, %rs24}, %r57; + cvt.f32.bf16 %r399, %rs24; + cvt.f32.bf16 %r400, %rs23; + mov.b32 {%rs25, %rs26}, %r56; + cvt.f32.bf16 %r401, %rs26; + cvt.f32.bf16 %r402, %rs25; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p61, %r91, %r402; + setp.gt.f32 %p62, %r92, %r401; + setp.gt.f32 %p63, %r93, %r400; + setp.gt.f32 %p64, %r94, %r399; + setp.gt.f32 %p65, %r95, %r398; + setp.gt.f32 %p66, %r96, %r397; + setp.gt.f32 %p67, %r97, %r396; + setp.gt.f32 %p68, %r98, %r395; + setp.gt.f32 %p69, %r99, %r394; + setp.gt.f32 %p70, %r100, %r393; + setp.gt.f32 %p71, %r101, %r392; + setp.gt.f32 %p72, %r102, %r391; + setp.gt.f32 %p73, %r103, %r390; + setp.gt.f32 %p74, %r104, %r389; + setp.gt.f32 %p75, %r105, %r388; + setp.gt.f32 %p76, %r106, %r387; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r403, %r106, %r387, %p76; + selp.f32 %r404, %r106, %r403, %p26; + selp.f32 %r405, %r105, %r388, %p75; + selp.f32 %r406, %r105, %r405, %p25; + mov.b64 %rd264, {%r406, %r404}; + selp.f32 %r407, %r104, %r389, %p74; + selp.f32 %r408, %r104, %r407, %p24; + selp.f32 %r409, %r103, %r390, %p73; + selp.f32 %r410, %r103, %r409, %p23; + mov.b64 %rd263, {%r410, %r408}; + selp.f32 %r411, %r102, %r391, %p72; + selp.f32 %r412, %r102, %r411, %p22; + selp.f32 %r413, %r101, %r392, %p71; + selp.f32 %r414, %r101, %r413, %p21; + mov.b64 %rd262, {%r414, %r412}; + selp.f32 %r415, %r100, %r393, %p70; + selp.f32 %r416, %r100, %r415, %p20; + selp.f32 %r417, %r99, %r394, %p69; + selp.f32 %r418, %r99, %r417, %p19; + mov.b64 %rd261, {%r418, %r416}; + selp.f32 %r419, %r98, %r395, %p68; + selp.f32 %r420, %r98, %r419, %p18; + selp.f32 %r421, %r97, %r396, %p67; + selp.f32 %r422, %r97, %r421, %p17; + mov.b64 %rd268, {%r422, %r420}; + selp.f32 %r423, %r96, %r397, %p66; + selp.f32 %r424, %r96, %r423, %p16; + selp.f32 %r425, %r95, %r398, %p65; + selp.f32 %r426, %r95, %r425, %p15; + mov.b64 %rd267, {%r426, %r424}; + selp.f32 %r427, %r94, %r399, %p64; + selp.f32 %r428, %r94, %r427, %p14; + selp.f32 %r429, %r93, %r400, %p63; + selp.f32 %r430, %r93, %r429, %p13; + mov.b64 %rd266, {%r430, %r428}; + selp.f32 %r431, %r92, %r401, %p62; + selp.f32 %r432, %r92, %r431, %p12; + selp.f32 %r433, %r91, %r402, %p61; + selp.f32 %r434, %r91, %r433, %p11; + mov.b64 %rd265, {%r434, %r432}; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p77, %r404, 0fFF800000; + setp.eq.f32 %p78, %r406, 0fFF800000; + setp.eq.f32 %p79, %r408, 0fFF800000; + setp.eq.f32 %p80, %r410, 0fFF800000; + setp.eq.f32 %p81, %r412, 0fFF800000; + setp.eq.f32 %p82, %r414, 0fFF800000; + setp.eq.f32 %p83, %r416, 0fFF800000; + setp.eq.f32 %p84, %r418, 0fFF800000; + setp.eq.f32 %p85, %r420, 0fFF800000; + setp.eq.f32 %p86, %r422, 0fFF800000; + setp.eq.f32 %p87, %r424, 0fFF800000; + setp.eq.f32 %p88, %r426, 0fFF800000; + setp.eq.f32 %p89, %r428, 0fFF800000; + setp.eq.f32 %p90, %r430, 0fFF800000; + setp.eq.f32 %p91, %r432, 0fFF800000; + setp.eq.f32 %p92, %r434, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r435, %r98, %r420; + sub.f32 %r436, %r97, %r422; + sub.f32 %r437, %r96, %r424; + sub.f32 %r438, %r95, %r426; + sub.f32 %r439, %r94, %r428; + sub.f32 %r440, %r93, %r430; + sub.f32 %r441, %r92, %r432; + sub.f32 %r442, %r91, %r434; + sub.f32 %r443, %r106, %r404; + sub.f32 %r444, %r105, %r406; + sub.f32 %r445, %r104, %r408; + sub.f32 %r446, %r103, %r410; + sub.f32 %r447, %r102, %r412; + sub.f32 %r448, %r101, %r414; + sub.f32 %r449, %r100, %r416; + sub.f32 %r450, %r99, %r418; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r451, %r450, %r152, %r151; + cvt.ftz.sat.f32.f32 %r452, %r451; + fma.rm.ftz.f32 %r453, %r452, %r156, %r155; + fma.rn.ftz.f32 %r454, %r449, %r152, %r151; + cvt.ftz.sat.f32.f32 %r455, %r454; + fma.rm.ftz.f32 %r456, %r455, %r156, %r155; + fma.rn.ftz.f32 %r457, %r448, %r152, %r151; + cvt.ftz.sat.f32.f32 %r458, %r457; + fma.rm.ftz.f32 %r459, %r458, %r156, %r155; + fma.rn.ftz.f32 %r460, %r447, %r152, %r151; + cvt.ftz.sat.f32.f32 %r461, %r460; + fma.rm.ftz.f32 %r462, %r461, %r156, %r155; + fma.rn.ftz.f32 %r463, %r446, %r152, %r151; + cvt.ftz.sat.f32.f32 %r464, %r463; + fma.rm.ftz.f32 %r465, %r464, %r156, %r155; + fma.rn.ftz.f32 %r466, %r445, %r152, %r151; + cvt.ftz.sat.f32.f32 %r467, %r466; + fma.rm.ftz.f32 %r468, %r467, %r156, %r155; + fma.rn.ftz.f32 %r469, %r444, %r152, %r151; + cvt.ftz.sat.f32.f32 %r470, %r469; + fma.rm.ftz.f32 %r471, %r470, %r156, %r155; + fma.rn.ftz.f32 %r472, %r443, %r152, %r151; + cvt.ftz.sat.f32.f32 %r473, %r472; + fma.rm.ftz.f32 %r474, %r473, %r156, %r155; + fma.rn.ftz.f32 %r475, %r442, %r152, %r151; + cvt.ftz.sat.f32.f32 %r476, %r475; + fma.rm.ftz.f32 %r477, %r476, %r156, %r155; + fma.rn.ftz.f32 %r478, %r441, %r152, %r151; + cvt.ftz.sat.f32.f32 %r479, %r478; + fma.rm.ftz.f32 %r480, %r479, %r156, %r155; + fma.rn.ftz.f32 %r481, %r440, %r152, %r151; + cvt.ftz.sat.f32.f32 %r482, %r481; + fma.rm.ftz.f32 %r483, %r482, %r156, %r155; + fma.rn.ftz.f32 %r484, %r439, %r152, %r151; + cvt.ftz.sat.f32.f32 %r485, %r484; + fma.rm.ftz.f32 %r486, %r485, %r156, %r155; + fma.rn.ftz.f32 %r487, %r438, %r152, %r151; + cvt.ftz.sat.f32.f32 %r488, %r487; + fma.rm.ftz.f32 %r489, %r488, %r156, %r155; + fma.rn.ftz.f32 %r490, %r437, %r152, %r151; + cvt.ftz.sat.f32.f32 %r491, %r490; + fma.rm.ftz.f32 %r492, %r491, %r156, %r155; + fma.rn.ftz.f32 %r493, %r436, %r152, %r151; + cvt.ftz.sat.f32.f32 %r494, %r493; + fma.rm.ftz.f32 %r495, %r494, %r156, %r155; + fma.rn.ftz.f32 %r496, %r435, %r152, %r151; + cvt.ftz.sat.f32.f32 %r497, %r496; + fma.rm.ftz.f32 %r498, %r497, %r156, %r155; + mov.b64 %rd96, {%r471, %r474}; + cvt.u32.u64 %r499, %rd96; + mov.b64 %rd97, {%r465, %r468}; + cvt.u32.u64 %r500, %rd97; + mov.b64 %rd98, {%r459, %r462}; + cvt.u32.u64 %r501, %rd98; + mov.b64 %rd99, {%r453, %r456}; + cvt.u32.u64 %r502, %rd99; + mov.b64 %rd100, {%r495, %r498}; + cvt.u32.u64 %r503, %rd100; + mov.b64 %rd101, {%r489, %r492}; + cvt.u32.u64 %r504, %rd101; + mov.b64 %rd102, {%r483, %r486}; + cvt.u32.u64 %r505, %rd102; + mov.b64 %rd103, {%r477, %r480}; + cvt.u32.u64 %r506, %rd103; + add.f32 %r507, %r453, 0fCB40007F; + neg.f32 %r508, %r507; + fma.rn.ftz.f32 %r509, %r450, %r160, %r508; + fma.rn.ftz.f32 %r510, %r450, %r162, %r509; + ex2.approx.ftz.f32 %r511, %r510; + add.f32 %r512, %r456, 0fCB40007F; + neg.f32 %r513, %r512; + fma.rn.ftz.f32 %r514, %r449, %r160, %r513; + fma.rn.ftz.f32 %r515, %r449, %r162, %r514; + ex2.approx.ftz.f32 %r516, %r515; + add.f32 %r517, %r459, 0fCB40007F; + neg.f32 %r518, %r517; + fma.rn.ftz.f32 %r519, %r448, %r160, %r518; + fma.rn.ftz.f32 %r520, %r448, %r162, %r519; + ex2.approx.ftz.f32 %r521, %r520; + add.f32 %r522, %r462, 0fCB40007F; + neg.f32 %r523, %r522; + fma.rn.ftz.f32 %r524, %r447, %r160, %r523; + fma.rn.ftz.f32 %r525, %r447, %r162, %r524; + ex2.approx.ftz.f32 %r526, %r525; + add.f32 %r527, %r465, 0fCB40007F; + neg.f32 %r528, %r527; + fma.rn.ftz.f32 %r529, %r446, %r160, %r528; + fma.rn.ftz.f32 %r530, %r446, %r162, %r529; + ex2.approx.ftz.f32 %r531, %r530; + add.f32 %r532, %r468, 0fCB40007F; + neg.f32 %r533, %r532; + fma.rn.ftz.f32 %r534, %r445, %r160, %r533; + fma.rn.ftz.f32 %r535, %r445, %r162, %r534; + ex2.approx.ftz.f32 %r536, %r535; + add.f32 %r537, %r471, 0fCB40007F; + neg.f32 %r538, %r537; + fma.rn.ftz.f32 %r539, %r444, %r160, %r538; + fma.rn.ftz.f32 %r540, %r444, %r162, %r539; + ex2.approx.ftz.f32 %r541, %r540; + add.f32 %r542, %r474, 0fCB40007F; + neg.f32 %r543, %r542; + fma.rn.ftz.f32 %r544, %r443, %r160, %r543; + fma.rn.ftz.f32 %r545, %r443, %r162, %r544; + ex2.approx.ftz.f32 %r546, %r545; + add.f32 %r547, %r477, 0fCB40007F; + neg.f32 %r548, %r547; + fma.rn.ftz.f32 %r549, %r442, %r160, %r548; + fma.rn.ftz.f32 %r550, %r442, %r162, %r549; + ex2.approx.ftz.f32 %r551, %r550; + add.f32 %r552, %r480, 0fCB40007F; + neg.f32 %r553, %r552; + fma.rn.ftz.f32 %r554, %r441, %r160, %r553; + fma.rn.ftz.f32 %r555, %r441, %r162, %r554; + ex2.approx.ftz.f32 %r556, %r555; + add.f32 %r557, %r483, 0fCB40007F; + neg.f32 %r558, %r557; + fma.rn.ftz.f32 %r559, %r440, %r160, %r558; + fma.rn.ftz.f32 %r560, %r440, %r162, %r559; + ex2.approx.ftz.f32 %r561, %r560; + add.f32 %r562, %r486, 0fCB40007F; + neg.f32 %r563, %r562; + fma.rn.ftz.f32 %r564, %r439, %r160, %r563; + fma.rn.ftz.f32 %r565, %r439, %r162, %r564; + ex2.approx.ftz.f32 %r566, %r565; + add.f32 %r567, %r489, 0fCB40007F; + neg.f32 %r568, %r567; + fma.rn.ftz.f32 %r569, %r438, %r160, %r568; + fma.rn.ftz.f32 %r570, %r438, %r162, %r569; + ex2.approx.ftz.f32 %r571, %r570; + add.f32 %r572, %r492, 0fCB40007F; + neg.f32 %r573, %r572; + fma.rn.ftz.f32 %r574, %r437, %r160, %r573; + fma.rn.ftz.f32 %r575, %r437, %r162, %r574; + ex2.approx.ftz.f32 %r576, %r575; + add.f32 %r577, %r495, 0fCB40007F; + neg.f32 %r578, %r577; + fma.rn.ftz.f32 %r579, %r436, %r160, %r578; + fma.rn.ftz.f32 %r580, %r436, %r162, %r579; + ex2.approx.ftz.f32 %r581, %r580; + add.f32 %r582, %r498, 0fCB40007F; + neg.f32 %r583, %r582; + fma.rn.ftz.f32 %r584, %r435, %r160, %r583; + fma.rn.ftz.f32 %r585, %r435, %r162, %r584; + shl.b64 %rd104, %rd103, 23; + and.b64 %rd105, %rd104, -36028797018963968; + shl.b32 %r586, %r506, 23; + cvt.u64.u32 %rd106, %r586; + or.b64 %rd107, %rd106, %rd105; + shl.b64 %rd108, %rd102, 23; + and.b64 %rd109, %rd108, -36028797018963968; + shl.b32 %r587, %r505, 23; + cvt.u64.u32 %rd110, %r587; + or.b64 %rd111, %rd110, %rd109; + shl.b64 %rd112, %rd101, 23; + and.b64 %rd113, %rd112, -36028797018963968; + shl.b32 %r588, %r504, 23; + cvt.u64.u32 %rd114, %r588; + or.b64 %rd115, %rd114, %rd113; + shl.b64 %rd116, %rd100, 23; + and.b64 %rd117, %rd116, -36028797018963968; + shl.b32 %r589, %r503, 23; + cvt.u64.u32 %rd118, %r589; + or.b64 %rd119, %rd118, %rd117; + shl.b64 %rd120, %rd99, 23; + and.b64 %rd121, %rd120, -36028797018963968; + shl.b32 %r590, %r502, 23; + cvt.u64.u32 %rd122, %r590; + or.b64 %rd123, %rd122, %rd121; + shl.b64 %rd124, %rd98, 23; + and.b64 %rd125, %rd124, -36028797018963968; + shl.b32 %r591, %r501, 23; + cvt.u64.u32 %rd126, %r591; + or.b64 %rd127, %rd126, %rd125; + shl.b64 %rd128, %rd97, 23; + and.b64 %rd129, %rd128, -36028797018963968; + shl.b32 %r592, %r500, 23; + cvt.u64.u32 %rd130, %r592; + or.b64 %rd131, %rd130, %rd129; + shl.b64 %rd132, %rd96, 23; + and.b64 %rd133, %rd132, -36028797018963968; + shl.b32 %r593, %r499, 23; + cvt.u64.u32 %rd134, %r593; + or.b64 %rd135, %rd134, %rd133; + ex2.approx.ftz.f32 %r594, %r585; + mov.b64 {%r595, %r596}, %rd135; + mul.f32 %r597, %r546, %r596; + mul.f32 %r598, %r541, %r595; + mov.b64 {%r599, %r600}, %rd131; + mul.f32 %r601, %r536, %r600; + mul.f32 %r602, %r531, %r599; + mov.b64 {%r603, %r604}, %rd127; + mul.f32 %r605, %r526, %r604; + mul.f32 %r606, %r521, %r603; + mov.b64 {%r607, %r608}, %rd123; + mul.f32 %r609, %r516, %r608; + mul.f32 %r610, %r511, %r607; + mov.b64 {%r611, %r612}, %rd119; + mul.f32 %r613, %r594, %r612; + mul.f32 %r614, %r581, %r611; + mov.b64 {%r615, %r616}, %rd115; + mul.f32 %r617, %r576, %r616; + mul.f32 %r618, %r571, %r615; + mov.b64 {%r619, %r620}, %rd111; + mul.f32 %r621, %r566, %r620; + mul.f32 %r622, %r561, %r619; + mov.b64 {%r623, %r624}, %rd107; + mul.f32 %r625, %r556, %r624; + mul.f32 %r626, %r551, %r623; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r627, 0f3F800000, %r626, %p92; + selp.f32 %r628, 0f3F800000, %r625, %p91; + selp.f32 %r629, 0f3F800000, %r622, %p90; + selp.f32 %r630, 0f3F800000, %r621, %p89; + selp.f32 %r631, 0f3F800000, %r618, %p88; + selp.f32 %r632, 0f3F800000, %r617, %p87; + selp.f32 %r633, 0f3F800000, %r614, %p86; + selp.f32 %r634, 0f3F800000, %r613, %p85; + selp.f32 %r635, 0f3F800000, %r610, %p84; + selp.f32 %r636, 0f3F800000, %r609, %p83; + selp.f32 %r637, 0f3F800000, %r606, %p82; + selp.f32 %r638, 0f3F800000, %r605, %p81; + selp.f32 %r639, 0f3F800000, %r602, %p80; + selp.f32 %r640, 0f3F800000, %r601, %p79; + selp.f32 %r641, 0f3F800000, %r598, %p78; + selp.f32 %r642, 0f3F800000, %r597, %p77; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r643, %r395, %r420; + sub.f32 %r644, %r396, %r422; + sub.f32 %r645, %r397, %r424; + sub.f32 %r646, %r398, %r426; + sub.f32 %r647, %r399, %r428; + sub.f32 %r648, %r400, %r430; + sub.f32 %r649, %r401, %r432; + sub.f32 %r650, %r402, %r434; + sub.f32 %r651, %r387, %r404; + sub.f32 %r652, %r388, %r406; + sub.f32 %r653, %r389, %r408; + sub.f32 %r654, %r390, %r410; + sub.f32 %r655, %r391, %r412; + sub.f32 %r656, %r392, %r414; + sub.f32 %r657, %r393, %r416; + sub.f32 %r658, %r394, %r418; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r659, %r658, %r152, %r151; + cvt.ftz.sat.f32.f32 %r660, %r659; + fma.rm.ftz.f32 %r661, %r660, %r156, %r155; + fma.rn.ftz.f32 %r662, %r657, %r152, %r151; + cvt.ftz.sat.f32.f32 %r663, %r662; + fma.rm.ftz.f32 %r664, %r663, %r156, %r155; + fma.rn.ftz.f32 %r665, %r656, %r152, %r151; + cvt.ftz.sat.f32.f32 %r666, %r665; + fma.rm.ftz.f32 %r667, %r666, %r156, %r155; + fma.rn.ftz.f32 %r668, %r655, %r152, %r151; + cvt.ftz.sat.f32.f32 %r669, %r668; + fma.rm.ftz.f32 %r670, %r669, %r156, %r155; + fma.rn.ftz.f32 %r671, %r654, %r152, %r151; + cvt.ftz.sat.f32.f32 %r672, %r671; + fma.rm.ftz.f32 %r673, %r672, %r156, %r155; + fma.rn.ftz.f32 %r674, %r653, %r152, %r151; + cvt.ftz.sat.f32.f32 %r675, %r674; + fma.rm.ftz.f32 %r676, %r675, %r156, %r155; + fma.rn.ftz.f32 %r677, %r652, %r152, %r151; + cvt.ftz.sat.f32.f32 %r678, %r677; + fma.rm.ftz.f32 %r679, %r678, %r156, %r155; + fma.rn.ftz.f32 %r680, %r651, %r152, %r151; + cvt.ftz.sat.f32.f32 %r681, %r680; + fma.rm.ftz.f32 %r682, %r681, %r156, %r155; + fma.rn.ftz.f32 %r683, %r650, %r152, %r151; + cvt.ftz.sat.f32.f32 %r684, %r683; + fma.rm.ftz.f32 %r685, %r684, %r156, %r155; + fma.rn.ftz.f32 %r686, %r649, %r152, %r151; + cvt.ftz.sat.f32.f32 %r687, %r686; + fma.rm.ftz.f32 %r688, %r687, %r156, %r155; + fma.rn.ftz.f32 %r689, %r648, %r152, %r151; + cvt.ftz.sat.f32.f32 %r690, %r689; + fma.rm.ftz.f32 %r691, %r690, %r156, %r155; + fma.rn.ftz.f32 %r692, %r647, %r152, %r151; + cvt.ftz.sat.f32.f32 %r693, %r692; + fma.rm.ftz.f32 %r694, %r693, %r156, %r155; + fma.rn.ftz.f32 %r695, %r646, %r152, %r151; + cvt.ftz.sat.f32.f32 %r696, %r695; + fma.rm.ftz.f32 %r697, %r696, %r156, %r155; + fma.rn.ftz.f32 %r698, %r645, %r152, %r151; + cvt.ftz.sat.f32.f32 %r699, %r698; + fma.rm.ftz.f32 %r700, %r699, %r156, %r155; + fma.rn.ftz.f32 %r701, %r644, %r152, %r151; + cvt.ftz.sat.f32.f32 %r702, %r701; + fma.rm.ftz.f32 %r703, %r702, %r156, %r155; + fma.rn.ftz.f32 %r704, %r643, %r152, %r151; + cvt.ftz.sat.f32.f32 %r705, %r704; + fma.rm.ftz.f32 %r706, %r705, %r156, %r155; + mov.b64 %rd136, {%r679, %r682}; + cvt.u32.u64 %r707, %rd136; + mov.b64 %rd137, {%r673, %r676}; + cvt.u32.u64 %r708, %rd137; + mov.b64 %rd138, {%r667, %r670}; + cvt.u32.u64 %r709, %rd138; + mov.b64 %rd139, {%r661, %r664}; + cvt.u32.u64 %r710, %rd139; + mov.b64 %rd140, {%r703, %r706}; + cvt.u32.u64 %r711, %rd140; + mov.b64 %rd141, {%r697, %r700}; + cvt.u32.u64 %r712, %rd141; + mov.b64 %rd142, {%r691, %r694}; + cvt.u32.u64 %r713, %rd142; + mov.b64 %rd143, {%r685, %r688}; + cvt.u32.u64 %r714, %rd143; + add.f32 %r715, %r661, 0fCB40007F; + neg.f32 %r716, %r715; + fma.rn.ftz.f32 %r717, %r658, %r160, %r716; + fma.rn.ftz.f32 %r718, %r658, %r162, %r717; + ex2.approx.ftz.f32 %r719, %r718; + add.f32 %r720, %r664, 0fCB40007F; + neg.f32 %r721, %r720; + fma.rn.ftz.f32 %r722, %r657, %r160, %r721; + fma.rn.ftz.f32 %r723, %r657, %r162, %r722; + ex2.approx.ftz.f32 %r724, %r723; + add.f32 %r725, %r667, 0fCB40007F; + neg.f32 %r726, %r725; + fma.rn.ftz.f32 %r727, %r656, %r160, %r726; + fma.rn.ftz.f32 %r728, %r656, %r162, %r727; + ex2.approx.ftz.f32 %r729, %r728; + add.f32 %r730, %r670, 0fCB40007F; + neg.f32 %r731, %r730; + fma.rn.ftz.f32 %r732, %r655, %r160, %r731; + fma.rn.ftz.f32 %r733, %r655, %r162, %r732; + ex2.approx.ftz.f32 %r734, %r733; + add.f32 %r735, %r673, 0fCB40007F; + neg.f32 %r736, %r735; + fma.rn.ftz.f32 %r737, %r654, %r160, %r736; + fma.rn.ftz.f32 %r738, %r654, %r162, %r737; + ex2.approx.ftz.f32 %r739, %r738; + add.f32 %r740, %r676, 0fCB40007F; + neg.f32 %r741, %r740; + fma.rn.ftz.f32 %r742, %r653, %r160, %r741; + fma.rn.ftz.f32 %r743, %r653, %r162, %r742; + ex2.approx.ftz.f32 %r744, %r743; + add.f32 %r745, %r679, 0fCB40007F; + neg.f32 %r746, %r745; + fma.rn.ftz.f32 %r747, %r652, %r160, %r746; + fma.rn.ftz.f32 %r748, %r652, %r162, %r747; + ex2.approx.ftz.f32 %r749, %r748; + add.f32 %r750, %r682, 0fCB40007F; + neg.f32 %r751, %r750; + fma.rn.ftz.f32 %r752, %r651, %r160, %r751; + fma.rn.ftz.f32 %r753, %r651, %r162, %r752; + ex2.approx.ftz.f32 %r754, %r753; + add.f32 %r755, %r685, 0fCB40007F; + neg.f32 %r756, %r755; + fma.rn.ftz.f32 %r757, %r650, %r160, %r756; + fma.rn.ftz.f32 %r758, %r650, %r162, %r757; + ex2.approx.ftz.f32 %r759, %r758; + add.f32 %r760, %r688, 0fCB40007F; + neg.f32 %r761, %r760; + fma.rn.ftz.f32 %r762, %r649, %r160, %r761; + fma.rn.ftz.f32 %r763, %r649, %r162, %r762; + ex2.approx.ftz.f32 %r764, %r763; + add.f32 %r765, %r691, 0fCB40007F; + neg.f32 %r766, %r765; + fma.rn.ftz.f32 %r767, %r648, %r160, %r766; + fma.rn.ftz.f32 %r768, %r648, %r162, %r767; + ex2.approx.ftz.f32 %r769, %r768; + add.f32 %r770, %r694, 0fCB40007F; + neg.f32 %r771, %r770; + fma.rn.ftz.f32 %r772, %r647, %r160, %r771; + fma.rn.ftz.f32 %r773, %r647, %r162, %r772; + ex2.approx.ftz.f32 %r774, %r773; + add.f32 %r775, %r697, 0fCB40007F; + neg.f32 %r776, %r775; + fma.rn.ftz.f32 %r777, %r646, %r160, %r776; + fma.rn.ftz.f32 %r778, %r646, %r162, %r777; + ex2.approx.ftz.f32 %r779, %r778; + add.f32 %r780, %r700, 0fCB40007F; + neg.f32 %r781, %r780; + fma.rn.ftz.f32 %r782, %r645, %r160, %r781; + fma.rn.ftz.f32 %r783, %r645, %r162, %r782; + ex2.approx.ftz.f32 %r784, %r783; + add.f32 %r785, %r703, 0fCB40007F; + neg.f32 %r786, %r785; + fma.rn.ftz.f32 %r787, %r644, %r160, %r786; + fma.rn.ftz.f32 %r788, %r644, %r162, %r787; + ex2.approx.ftz.f32 %r789, %r788; + add.f32 %r790, %r706, 0fCB40007F; + neg.f32 %r791, %r790; + fma.rn.ftz.f32 %r792, %r643, %r160, %r791; + fma.rn.ftz.f32 %r793, %r643, %r162, %r792; + shl.b64 %rd144, %rd143, 23; + and.b64 %rd145, %rd144, -36028797018963968; + shl.b32 %r794, %r714, 23; + cvt.u64.u32 %rd146, %r794; + or.b64 %rd147, %rd146, %rd145; + shl.b64 %rd148, %rd142, 23; + and.b64 %rd149, %rd148, -36028797018963968; + shl.b32 %r795, %r713, 23; + cvt.u64.u32 %rd150, %r795; + or.b64 %rd151, %rd150, %rd149; + shl.b64 %rd152, %rd141, 23; + and.b64 %rd153, %rd152, -36028797018963968; + shl.b32 %r796, %r712, 23; + cvt.u64.u32 %rd154, %r796; + or.b64 %rd155, %rd154, %rd153; + shl.b64 %rd156, %rd140, 23; + and.b64 %rd157, %rd156, -36028797018963968; + shl.b32 %r797, %r711, 23; + cvt.u64.u32 %rd158, %r797; + or.b64 %rd159, %rd158, %rd157; + shl.b64 %rd160, %rd139, 23; + and.b64 %rd161, %rd160, -36028797018963968; + shl.b32 %r798, %r710, 23; + cvt.u64.u32 %rd162, %r798; + or.b64 %rd163, %rd162, %rd161; + shl.b64 %rd164, %rd138, 23; + and.b64 %rd165, %rd164, -36028797018963968; + shl.b32 %r799, %r709, 23; + cvt.u64.u32 %rd166, %r799; + or.b64 %rd167, %rd166, %rd165; + shl.b64 %rd168, %rd137, 23; + and.b64 %rd169, %rd168, -36028797018963968; + shl.b32 %r800, %r708, 23; + cvt.u64.u32 %rd170, %r800; + or.b64 %rd171, %rd170, %rd169; + shl.b64 %rd172, %rd136, 23; + and.b64 %rd173, %rd172, -36028797018963968; + shl.b32 %r801, %r707, 23; + cvt.u64.u32 %rd174, %r801; + or.b64 %rd175, %rd174, %rd173; + ex2.approx.ftz.f32 %r802, %r793; + mov.b64 {%r803, %r804}, %rd175; + mul.f32 %r805, %r754, %r804; + mul.f32 %r806, %r749, %r803; + mov.b64 {%r807, %r808}, %rd171; + mul.f32 %r809, %r744, %r808; + mul.f32 %r810, %r739, %r807; + mov.b64 {%r811, %r812}, %rd167; + mul.f32 %r813, %r734, %r812; + mul.f32 %r814, %r729, %r811; + mov.b64 {%r815, %r816}, %rd163; + mul.f32 %r817, %r724, %r816; + mul.f32 %r818, %r719, %r815; + mov.b64 {%r819, %r820}, %rd159; + mul.f32 %r821, %r802, %r820; + mul.f32 %r822, %r789, %r819; + mov.b64 {%r823, %r824}, %rd155; + mul.f32 %r825, %r784, %r824; + mul.f32 %r826, %r779, %r823; + mov.b64 {%r827, %r828}, %rd151; + mul.f32 %r829, %r774, %r828; + mul.f32 %r830, %r769, %r827; + mov.b64 {%r831, %r832}, %rd147; + mul.f32 %r833, %r764, %r832; + mul.f32 %r834, %r759, %r831; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r835, 0f3F800000, %r834, %p92; + selp.f32 %r836, 0f3F800000, %r833, %p91; + selp.f32 %r837, 0f3F800000, %r830, %p90; + selp.f32 %r838, 0f3F800000, %r829, %p89; + selp.f32 %r839, 0f3F800000, %r826, %p88; + selp.f32 %r840, 0f3F800000, %r825, %p87; + selp.f32 %r841, 0f3F800000, %r822, %p86; + selp.f32 %r842, 0f3F800000, %r821, %p85; + selp.f32 %r843, 0f3F800000, %r818, %p84; + selp.f32 %r844, 0f3F800000, %r817, %p83; + selp.f32 %r845, 0f3F800000, %r814, %p82; + selp.f32 %r846, 0f3F800000, %r813, %p81; + selp.f32 %r847, 0f3F800000, %r810, %p80; + selp.f32 %r848, 0f3F800000, %r809, %p79; + selp.f32 %r849, 0f3F800000, %r806, %p78; + selp.f32 %r850, 0f3F800000, %r805, %p77; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r851, %r852}, %rd256; + fma.rn.f32 %r853, %r852, %r642, %r850; + fma.rn.f32 %r854, %r851, %r641, %r849; + mov.b64 %rd256, {%r854, %r853}; + mov.b64 {%r855, %r856}, %rd255; + fma.rn.f32 %r857, %r856, %r640, %r848; + fma.rn.f32 %r858, %r855, %r639, %r847; + mov.b64 %rd255, {%r858, %r857}; + mov.b64 {%r859, %r860}, %rd254; + fma.rn.f32 %r861, %r860, %r638, %r846; + fma.rn.f32 %r862, %r859, %r637, %r845; + mov.b64 %rd254, {%r862, %r861}; + mov.b64 {%r863, %r864}, %rd253; + fma.rn.f32 %r865, %r864, %r636, %r844; + fma.rn.f32 %r866, %r863, %r635, %r843; + mov.b64 %rd253, {%r866, %r865}; + mov.b64 {%r867, %r868}, %rd260; + fma.rn.f32 %r869, %r868, %r634, %r842; + fma.rn.f32 %r870, %r867, %r633, %r841; + mov.b64 %rd260, {%r870, %r869}; + mov.b64 {%r871, %r872}, %rd259; + fma.rn.f32 %r873, %r872, %r632, %r840; + fma.rn.f32 %r874, %r871, %r631, %r839; + mov.b64 %rd259, {%r874, %r873}; + mov.b64 {%r875, %r876}, %rd258; + fma.rn.f32 %r877, %r876, %r630, %r838; + fma.rn.f32 %r878, %r875, %r629, %r837; + mov.b64 %rd258, {%r878, %r877}; + mov.b64 {%r879, %r880}, %rd257; + fma.rn.f32 %r881, %r880, %r628, %r836; + fma.rn.f32 %r882, %r879, %r627, %r835; + mov.b64 %rd257, {%r882, %r881}; + fma.rn.f32 %r2188, %r2188, %r257, %r377; + fma.rn.f32 %r2189, %r2189, %r258, %r378; + fma.rn.f32 %r2190, %r2190, %r259, %r379; + fma.rn.f32 %r2191, %r2191, %r260, %r380; + fma.rn.f32 %r2192, %r2192, %r261, %r381; + fma.rn.f32 %r2193, %r2193, %r262, %r382; + fma.rn.f32 %r2194, %r2194, %r263, %r383; + fma.rn.f32 %r2195, %r2195, %r264, %r384; + fma.rn.f32 %r883, %r2196, %r265, %r385; + fma.rn.f32 %r884, %r2197, %r266, %r386; +$L__tmp6: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs27, %rs28}, %r73; + cvt.f32.bf16 %r885, %rs28; + cvt.f32.bf16 %r886, %rs27; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p93, %r115, %r886; + setp.gt.f32 %p94, %r116, %r885; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r887, %r116, %r885, %p94; + selp.f32 %r888, %r116, %r887, %p38; + selp.f32 %r889, %r115, %r886, %p93; + selp.f32 %r890, %r115, %r889, %p37; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p95, %r890, 0fFF800000; + setp.eq.f32 %p96, %r888, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r891, %r116, %r888; + sub.f32 %r892, %r115, %r890; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r893, %r892, %r152, %r151; + cvt.ftz.sat.f32.f32 %r894, %r893; + fma.rm.ftz.f32 %r895, %r894, %r156, %r155; + fma.rn.ftz.f32 %r896, %r891, %r152, %r151; + cvt.ftz.sat.f32.f32 %r897, %r896; + fma.rm.ftz.f32 %r898, %r897, %r156, %r155; + mov.b64 %rd176, {%r895, %r898}; + cvt.u32.u64 %r899, %rd176; + add.f32 %r900, %r895, 0fCB40007F; + neg.f32 %r901, %r900; + fma.rn.ftz.f32 %r902, %r892, %r160, %r901; + fma.rn.ftz.f32 %r903, %r892, %r162, %r902; + ex2.approx.ftz.f32 %r904, %r903; + add.f32 %r905, %r898, 0fCB40007F; + neg.f32 %r906, %r905; + fma.rn.ftz.f32 %r907, %r891, %r160, %r906; + fma.rn.ftz.f32 %r908, %r891, %r162, %r907; + shl.b64 %rd177, %rd176, 23; + and.b64 %rd178, %rd177, -36028797018963968; + shl.b32 %r909, %r899, 23; + cvt.u64.u32 %rd179, %r909; + or.b64 %rd180, %rd179, %rd178; + ex2.approx.ftz.f32 %r910, %r908; + mov.b64 {%r911, %r912}, %rd180; + mul.f32 %r913, %r904, %r911; + mul.f32 %r914, %r910, %r912; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r915, 0f3F800000, %r914, %p96; + selp.f32 %r916, 0f3F800000, %r913, %p95; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r917, %r885, %r888; + sub.f32 %r918, %r886, %r890; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r919, %r918, %r152, %r151; + cvt.ftz.sat.f32.f32 %r920, %r919; + fma.rm.ftz.f32 %r921, %r920, %r156, %r155; + fma.rn.ftz.f32 %r922, %r917, %r152, %r151; + cvt.ftz.sat.f32.f32 %r923, %r922; + fma.rm.ftz.f32 %r924, %r923, %r156, %r155; + mov.b64 %rd181, {%r921, %r924}; + cvt.u32.u64 %r925, %rd181; + add.f32 %r926, %r921, 0fCB40007F; + neg.f32 %r927, %r926; + fma.rn.ftz.f32 %r928, %r918, %r160, %r927; + fma.rn.ftz.f32 %r929, %r918, %r162, %r928; + ex2.approx.ftz.f32 %r930, %r929; + add.f32 %r931, %r924, 0fCB40007F; + neg.f32 %r932, %r931; + fma.rn.ftz.f32 %r933, %r917, %r160, %r932; + fma.rn.ftz.f32 %r934, %r917, %r162, %r933; + shl.b64 %rd182, %rd181, 23; + and.b64 %rd183, %rd182, -36028797018963968; + shl.b32 %r935, %r925, 23; + cvt.u64.u32 %rd184, %r935; + or.b64 %rd185, %rd184, %rd183; + ex2.approx.ftz.f32 %r936, %r934; + mov.b64 {%r937, %r938}, %rd185; + mul.f32 %r939, %r930, %r937; + mul.f32 %r940, %r936, %r938; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r941, 0f3F800000, %r940, %p96; + selp.f32 %r942, 0f3F800000, %r939, %p95; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r943, %r944}, %rd251; + fma.rn.f32 %r945, %r943, %r916, %r942; + fma.rn.f32 %r946, %r944, %r915, %r941; +$L__tmp8: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs29, %rs30}, %r74; + cvt.f32.bf16 %r947, %rs30; + cvt.f32.bf16 %r948, %rs29; +$L__tmp9: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p97, %r117, %r948; + setp.gt.f32 %p98, %r118, %r947; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r949, %r118, %r947, %p98; + selp.f32 %r950, %r118, %r949, %p40; + selp.f32 %r951, %r117, %r948, %p97; + selp.f32 %r952, %r117, %r951, %p39; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p99, %r952, 0fFF800000; + setp.eq.f32 %p100, %r950, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r953, %r118, %r950; + sub.f32 %r954, %r117, %r952; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r955, %r954, %r152, %r151; + cvt.ftz.sat.f32.f32 %r956, %r955; + fma.rm.ftz.f32 %r957, %r956, %r156, %r155; + fma.rn.ftz.f32 %r958, %r953, %r152, %r151; + cvt.ftz.sat.f32.f32 %r959, %r958; + fma.rm.ftz.f32 %r960, %r959, %r156, %r155; + mov.b64 %rd186, {%r957, %r960}; + cvt.u32.u64 %r961, %rd186; + add.f32 %r962, %r957, 0fCB40007F; + neg.f32 %r963, %r962; + fma.rn.ftz.f32 %r964, %r954, %r160, %r963; + fma.rn.ftz.f32 %r965, %r954, %r162, %r964; + ex2.approx.ftz.f32 %r966, %r965; + add.f32 %r967, %r960, 0fCB40007F; + neg.f32 %r968, %r967; + fma.rn.ftz.f32 %r969, %r953, %r160, %r968; + fma.rn.ftz.f32 %r970, %r953, %r162, %r969; + shl.b64 %rd187, %rd186, 23; + and.b64 %rd188, %rd187, -36028797018963968; + shl.b32 %r971, %r961, 23; + cvt.u64.u32 %rd189, %r971; + or.b64 %rd190, %rd189, %rd188; + ex2.approx.ftz.f32 %r972, %r970; + mov.b64 {%r973, %r974}, %rd190; + mul.f32 %r975, %r966, %r973; + mul.f32 %r976, %r972, %r974; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r977, 0f3F800000, %r976, %p100; + selp.f32 %r978, 0f3F800000, %r975, %p99; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r979, %r947, %r950; + sub.f32 %r980, %r948, %r952; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r981, %r980, %r152, %r151; + cvt.ftz.sat.f32.f32 %r982, %r981; + fma.rm.ftz.f32 %r983, %r982, %r156, %r155; + fma.rn.ftz.f32 %r984, %r979, %r152, %r151; + cvt.ftz.sat.f32.f32 %r985, %r984; + fma.rm.ftz.f32 %r986, %r985, %r156, %r155; + mov.b64 %rd191, {%r983, %r986}; + cvt.u32.u64 %r987, %rd191; + add.f32 %r988, %r983, 0fCB40007F; + neg.f32 %r989, %r988; + fma.rn.ftz.f32 %r990, %r980, %r160, %r989; + fma.rn.ftz.f32 %r991, %r980, %r162, %r990; + ex2.approx.ftz.f32 %r992, %r991; + add.f32 %r993, %r986, 0fCB40007F; + neg.f32 %r994, %r993; + fma.rn.ftz.f32 %r995, %r979, %r160, %r994; + fma.rn.ftz.f32 %r996, %r979, %r162, %r995; + shl.b64 %rd192, %rd191, 23; + and.b64 %rd193, %rd192, -36028797018963968; + shl.b32 %r997, %r987, 23; + cvt.u64.u32 %rd194, %r997; + or.b64 %rd195, %rd194, %rd193; + ex2.approx.ftz.f32 %r998, %r996; + mov.b64 {%r999, %r1000}, %rd195; + mul.f32 %r1001, %r992, %r999; + mul.f32 %r1002, %r998, %r1000; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r1003, 0f3F800000, %r1002, %p100; + selp.f32 %r1004, 0f3F800000, %r1001, %p99; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r1005, %r1006}, %rd249; + fma.rn.f32 %r1007, %r1005, %r978, %r1004; + fma.rn.f32 %r1008, %r1006, %r977, %r1003; +$L__tmp10: + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + mov.b32 {%rs31, %rs32}, %r75; + cvt.f32.bf16 %r1009, %rs32; + cvt.f32.bf16 %r1010, %rs31; +$L__tmp11: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.gt.f32 %p101, %r119, %r1010; + setp.gt.f32 %p102, %r120, %r1009; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r1011, %r120, %r1009, %p102; + selp.f32 %r1012, %r120, %r1011, %p42; + selp.f32 %r1013, %r119, %r1010, %p101; + selp.f32 %r1014, %r119, %r1013, %p41; + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.f32 %p103, %r1014, 0fFF800000; + setp.eq.f32 %p104, %r1012, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r1015, %r120, %r1012; + sub.f32 %r1016, %r119, %r1014; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r1017, %r1016, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1018, %r1017; + fma.rm.ftz.f32 %r1019, %r1018, %r156, %r155; + fma.rn.ftz.f32 %r1020, %r1015, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1021, %r1020; + fma.rm.ftz.f32 %r1022, %r1021, %r156, %r155; + mov.b64 %rd196, {%r1019, %r1022}; + cvt.u32.u64 %r1023, %rd196; + add.f32 %r1024, %r1019, 0fCB40007F; + neg.f32 %r1025, %r1024; + fma.rn.ftz.f32 %r1026, %r1016, %r160, %r1025; + fma.rn.ftz.f32 %r1027, %r1016, %r162, %r1026; + ex2.approx.ftz.f32 %r1028, %r1027; + add.f32 %r1029, %r1022, 0fCB40007F; + neg.f32 %r1030, %r1029; + fma.rn.ftz.f32 %r1031, %r1015, %r160, %r1030; + fma.rn.ftz.f32 %r1032, %r1015, %r162, %r1031; + shl.b64 %rd197, %rd196, 23; + and.b64 %rd198, %rd197, -36028797018963968; + shl.b32 %r1033, %r1023, 23; + cvt.u64.u32 %rd199, %r1033; + or.b64 %rd200, %rd199, %rd198; + ex2.approx.ftz.f32 %r1034, %r1032; + mov.b64 {%r1035, %r1036}, %rd200; + mul.f32 %r1037, %r1028, %r1035; + mul.f32 %r1038, %r1034, %r1036; + .loc 2 196 39 // triton_helpers.py:196:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r1039, 0f3F800000, %r1038, %p104; + selp.f32 %r1040, 0f3F800000, %r1037, %p103; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r1041, %r1009, %r1012; + sub.f32 %r1042, %r1010, %r1014; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r1043, %r1042, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1044, %r1043; + fma.rm.ftz.f32 %r1045, %r1044, %r156, %r155; + fma.rn.ftz.f32 %r1046, %r1041, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1047, %r1046; + fma.rm.ftz.f32 %r1048, %r1047, %r156, %r155; + mov.b64 %rd201, {%r1045, %r1048}; + cvt.u32.u64 %r1049, %rd201; + add.f32 %r1050, %r1045, 0fCB40007F; + neg.f32 %r1051, %r1050; + fma.rn.ftz.f32 %r1052, %r1042, %r160, %r1051; + fma.rn.ftz.f32 %r1053, %r1042, %r162, %r1052; + ex2.approx.ftz.f32 %r1054, %r1053; + add.f32 %r1055, %r1048, 0fCB40007F; + neg.f32 %r1056, %r1055; + fma.rn.ftz.f32 %r1057, %r1041, %r160, %r1056; + fma.rn.ftz.f32 %r1058, %r1041, %r162, %r1057; + shl.b64 %rd202, %rd201, 23; + and.b64 %rd203, %rd202, -36028797018963968; + shl.b32 %r1059, %r1049, 23; + cvt.u64.u32 %rd204, %r1059; + or.b64 %rd205, %rd204, %rd203; + ex2.approx.ftz.f32 %r1060, %r1058; + mov.b64 {%r1061, %r1062}, %rd205; + mul.f32 %r1063, %r1054, %r1061; + mul.f32 %r1064, %r1060, %r1062; + .loc 2 199 39 // triton_helpers.py:199:39 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + selp.f32 %r1065, 0f3F800000, %r1064, %p104; + selp.f32 %r1066, 0f3F800000, %r1063, %p103; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r1067, %r1068}, %rd247; + fma.rn.f32 %r1069, %r1067, %r1040, %r1066; + fma.rn.f32 %r1070, %r1068, %r1039, %r1065; +$L__tmp12: + .loc 1 45 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:45:54 + selp.f32 %r2198, %r138, %r2198, %p7; + selp.f32 %r2199, %r140, %r2199, %p7; + selp.f32 %r1071, %r888, %r116, %p7; + selp.f32 %r1072, %r890, %r115, %p7; + mov.b64 %rd252, {%r1072, %r1071}; + selp.f32 %r1073, %r950, %r118, %p7; + selp.f32 %r1074, %r952, %r117, %p7; + mov.b64 %rd250, {%r1074, %r1073}; + selp.f32 %r1075, %r1012, %r120, %p7; + selp.f32 %r1076, %r1014, %r119, %p7; + mov.b64 %rd248, {%r1076, %r1075}; + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r2196, %r883, %r2196, %p7; + selp.f32 %r2197, %r884, %r2197, %p7; + selp.f32 %r1077, %r946, %r944, %p7; + selp.f32 %r1078, %r945, %r943, %p7; + mov.b64 %rd251, {%r1078, %r1077}; + selp.f32 %r1079, %r1008, %r1006, %p7; + selp.f32 %r1080, %r1007, %r1005, %p7; + mov.b64 %rd249, {%r1080, %r1079}; + selp.f32 %r1081, %r1070, %r1068, %p7; + selp.f32 %r1082, %r1069, %r1067, %p7; + mov.b64 %rd247, {%r1082, %r1081}; + mov.b64 %rd246, 16384; + mov.pred %p206, 0; + .loc 1 31 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:31:40 + @%p1 bra $L__BB0_1; +// %bb.2: +$L__tmp13: + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 {%r1095, %r1096}, %rd268; + mov.b64 {%r1097, %r1098}, %rd267; + mov.b64 {%r1099, %r1100}, %rd266; + mov.b64 {%r1101, %r1102}, %rd265; + mov.b64 {%r1103, %r1104}, %rd264; + mov.b64 {%r1105, %r1106}, %rd263; + mov.b64 {%r1107, %r1108}, %rd262; + mov.b64 {%r1109, %r1110}, %rd261; +$L__tmp14: + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + and.b32 %r1111, %r1, 31; +$L__tmp15: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p112, %r1109, %r1110; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p113, %r1109, %r1109; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1112, %r1109, %r1110, %p113; + selp.f32 %r1113, %r1109, %r1112, %p112; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p114, %r1113, %r1107; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p115, %r1113, %r1113; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1114, %r1113, %r1107, %p115; + selp.f32 %r1115, %r1113, %r1114, %p114; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p116, %r1115, %r1108; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p117, %r1115, %r1115; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1116, %r1115, %r1108, %p117; + selp.f32 %r1117, %r1115, %r1116, %p116; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p118, %r1117, %r1105; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p119, %r1117, %r1117; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1118, %r1117, %r1105, %p119; + selp.f32 %r1119, %r1117, %r1118, %p118; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p120, %r1119, %r1106; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p121, %r1119, %r1119; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1120, %r1119, %r1106, %p121; + selp.f32 %r1121, %r1119, %r1120, %p120; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p122, %r1121, %r1103; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p123, %r1121, %r1121; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1122, %r1121, %r1103, %p123; + selp.f32 %r1123, %r1121, %r1122, %p122; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p124, %r1123, %r1104; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p125, %r1123, %r1123; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1124, %r1123, %r1104, %p125; + selp.f32 %r1125, %r1123, %r1124, %p124; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p126, %r1125, %r1101; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p127, %r1125, %r1125; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1126, %r1125, %r1101, %p127; + selp.f32 %r1127, %r1125, %r1126, %p126; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p128, %r1127, %r1102; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p129, %r1127, %r1127; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1128, %r1127, %r1102, %p129; + selp.f32 %r1129, %r1127, %r1128, %p128; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p130, %r1129, %r1099; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p131, %r1129, %r1129; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1130, %r1129, %r1099, %p131; + selp.f32 %r1131, %r1129, %r1130, %p130; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p132, %r1131, %r1100; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p133, %r1131, %r1131; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1132, %r1131, %r1100, %p133; + selp.f32 %r1133, %r1131, %r1132, %p132; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p134, %r1133, %r1097; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p135, %r1133, %r1133; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1134, %r1133, %r1097, %p135; + selp.f32 %r1135, %r1133, %r1134, %p134; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p136, %r1135, %r1098; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p137, %r1135, %r1135; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1136, %r1135, %r1098, %p137; + selp.f32 %r1137, %r1135, %r1136, %p136; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p138, %r1137, %r1095; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p139, %r1137, %r1137; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1138, %r1137, %r1095, %p139; + selp.f32 %r1139, %r1137, %r1138, %p138; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p140, %r1139, %r1096; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p141, %r1139, %r1139; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1140, %r1139, %r1096, %p141; + selp.f32 %r1141, %r1139, %r1140, %p140; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p142, %r1141, %r19; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p143, %r1141, %r1141; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1142, %r1141, %r19, %p143; + selp.f32 %r1143, %r1141, %r1142, %p142; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p144, %r1143, %r20; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p145, %r1143, %r1143; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1144, %r1143, %r20, %p145; + selp.f32 %r1145, %r1143, %r1144, %p144; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p146, %r1145, %r21; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p147, %r1145, %r1145; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1146, %r1145, %r21, %p147; + selp.f32 %r1147, %r1145, %r1146, %p146; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p148, %r1147, %r22; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p149, %r1147, %r1147; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1148, %r1147, %r22, %p149; + selp.f32 %r1149, %r1147, %r1148, %p148; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p150, %r1149, %r23; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p151, %r1149, %r1149; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1150, %r1149, %r23, %p151; + selp.f32 %r1151, %r1149, %r1150, %p150; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p152, %r1151, %r24; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p153, %r1151, %r1151; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1152, %r1151, %r24, %p153; + selp.f32 %r1153, %r1151, %r1152, %p152; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p154, %r1153, %r25; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p155, %r1153, %r1153; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1154, %r1153, %r25, %p155; + selp.f32 %r1155, %r1153, %r1154, %p154; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p156, %r1155, %r26; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p157, %r1155, %r1155; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1156, %r1155, %r26, %p157; + selp.f32 %r1157, %r1155, %r1156, %p156; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p158, %r1157, %r2198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p159, %r1157, %r1157; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1158, %r1157, %r2198, %p159; + selp.f32 %r1159, %r1157, %r1158, %p158; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p160, %r1159, %r2199; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p161, %r1159, %r1159; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1160, %r1159, %r2199, %p161; + selp.f32 %r1161, %r1159, %r1160, %p160; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1162, %r1163}, %rd252; + setp.gt.f32 %p162, %r1161, %r1162; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p163, %r1161, %r1161; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1164, %r1161, %r1162, %p163; + selp.f32 %r1165, %r1161, %r1164, %p162; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p164, %r1165, %r1163; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p165, %r1165, %r1165; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1166, %r1165, %r1163, %p165; + selp.f32 %r1167, %r1165, %r1166, %p164; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1168, %r1169}, %rd250; + setp.gt.f32 %p166, %r1167, %r1168; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p167, %r1167, %r1167; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1170, %r1167, %r1168, %p167; + selp.f32 %r1171, %r1167, %r1170, %p166; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p168, %r1171, %r1169; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p169, %r1171, %r1171; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1172, %r1171, %r1169, %p169; + selp.f32 %r1173, %r1171, %r1172, %p168; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1174, %r1175}, %rd248; + setp.gt.f32 %p170, %r1173, %r1174; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p171, %r1173, %r1173; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1176, %r1173, %r1174, %p171; + selp.f32 %r1177, %r1173, %r1176, %p170; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p172, %r1177, %r1175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p173, %r1177, %r1177; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1178, %r1177, %r1175, %p173; + selp.f32 %r1179, %r1177, %r1178, %p172; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1180, %r1179, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p174, %r1179, %r1180; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p175, %r1179, %r1179; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1181, %r1179, %r1180, %p174; + selp.f32 %r1182, %r1179, %r1181, %p175; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1183, %r1182, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p176, %r1182, %r1183; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p177, %r1182, %r1182; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1184, %r1182, %r1183, %p177; + selp.f32 %r1185, %r1182, %r1184, %p176; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1186, %r1185, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p178, %r1185, %r1186; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p179, %r1185, %r1185; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1187, %r1185, %r1186, %p179; + selp.f32 %r1188, %r1185, %r1187, %p178; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1189, %r1188, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p180, %r1188, %r1189; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p181, %r1188, %r1188; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1190, %r1188, %r1189, %p181; + selp.f32 %r1191, %r1188, %r1190, %p180; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1192, %r1191, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p182, %r1191, %r1192; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p183, %r1191, %r1191; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p105, %r1111, 0; + shr.u32 %r1193, %r1, 3; + and.b32 %r1194, %r1193, 60; + mov.b32 %r1195, global_smem; + add.s32 %r1083, %r1195, %r1194; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r1196, %r1191, %r1192, %p183; + selp.b32 %r1084, %r1191, %r1196, %p182; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p105 st.shared.b32 [ %r1083 + 0 ], %r1084; + // end inline asm + bar.sync 0; + setp.lt.u32 %p106, %r1, 16; + shl.b32 %r1197, %r1, 2; + add.s32 %r1086, %r1195, %r1197; + // begin inline asm + @%p106 ld.shared.b32 %r1085, [ %r1086 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1198, %r1085, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p184, %r1085, %r1198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p185, %r1085, %r1085; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1199, %r1085, %r1198, %p184; + selp.f32 %r1200, %r1085, %r1199, %p185; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1201, %r1200, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p186, %r1200, %r1201; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p187, %r1200, %r1200; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1202, %r1200, %r1201, %p187; + selp.f32 %r1203, %r1200, %r1202, %p186; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1204, %r1203, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p188, %r1203, %r1204; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p189, %r1203, %r1203; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1205, %r1203, %r1204, %p189; + selp.f32 %r1206, %r1203, %r1205, %p188; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1207, %r1206, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p190, %r1206, %r1207; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p191, %r1206, %r1206; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p107, %r1, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r1208, %r1206, %r1207, %p191; + selp.b32 %r1088, %r1206, %r1208, %p190; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p107 st.shared.b32 [ %r1086 + 0 ], %r1088; + // end inline asm + bar.sync 0; + ld.shared.b32 %r39, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.f32 %p192, %r39, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + sub.f32 %r1209, %r1109, %r39; + sub.f32 %r1210, %r1110, %r39; + sub.f32 %r1211, %r1107, %r39; + sub.f32 %r1212, %r1108, %r39; + sub.f32 %r1213, %r1105, %r39; + sub.f32 %r1214, %r1106, %r39; + sub.f32 %r1215, %r1103, %r39; + sub.f32 %r1216, %r1104, %r39; + sub.f32 %r1217, %r1101, %r39; + sub.f32 %r1218, %r1102, %r39; + sub.f32 %r1219, %r1099, %r39; + sub.f32 %r1220, %r1100, %r39; + sub.f32 %r1221, %r1097, %r39; + sub.f32 %r1222, %r1098, %r39; + sub.f32 %r1223, %r1095, %r39; + sub.f32 %r1224, %r1096, %r39; + sub.f32 %r1225, %r19, %r39; + sub.f32 %r1226, %r20, %r39; + sub.f32 %r1227, %r21, %r39; + sub.f32 %r1228, %r22, %r39; + sub.f32 %r1229, %r23, %r39; + sub.f32 %r1230, %r24, %r39; + sub.f32 %r1231, %r25, %r39; + sub.f32 %r1232, %r26, %r39; + sub.f32 %r1233, %r2198, %r39; + sub.f32 %r1234, %r2199, %r39; + sub.f32 %r1235, %r1162, %r39; + sub.f32 %r1236, %r1163, %r39; + sub.f32 %r1237, %r1168, %r39; + sub.f32 %r1238, %r1169, %r39; + sub.f32 %r1239, %r1174, %r39; + sub.f32 %r1240, %r1175, %r39; + .loc 2 180 58 // triton_helpers.py:180:58 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1241, 0f00000000, %r1209, %p192; + selp.f32 %r1242, 0f00000000, %r1210, %p192; + selp.f32 %r1243, 0f00000000, %r1211, %p192; + selp.f32 %r1244, 0f00000000, %r1212, %p192; + selp.f32 %r1245, 0f00000000, %r1213, %p192; + selp.f32 %r1246, 0f00000000, %r1214, %p192; + selp.f32 %r1247, 0f00000000, %r1215, %p192; + selp.f32 %r1248, 0f00000000, %r1216, %p192; + selp.f32 %r1249, 0f00000000, %r1217, %p192; + selp.f32 %r1250, 0f00000000, %r1218, %p192; + selp.f32 %r1251, 0f00000000, %r1219, %p192; + selp.f32 %r1252, 0f00000000, %r1220, %p192; + selp.f32 %r1253, 0f00000000, %r1221, %p192; + selp.f32 %r1254, 0f00000000, %r1222, %p192; + selp.f32 %r1255, 0f00000000, %r1223, %p192; + selp.f32 %r1256, 0f00000000, %r1224, %p192; + selp.f32 %r1257, 0f00000000, %r1225, %p192; + selp.f32 %r1258, 0f00000000, %r1226, %p192; + selp.f32 %r1259, 0f00000000, %r1227, %p192; + selp.f32 %r1260, 0f00000000, %r1228, %p192; + selp.f32 %r1261, 0f00000000, %r1229, %p192; + selp.f32 %r1262, 0f00000000, %r1230, %p192; + selp.f32 %r1263, 0f00000000, %r1231, %p192; + selp.f32 %r1264, 0f00000000, %r1232, %p192; + selp.f32 %r1265, 0f00000000, %r1233, %p192; + selp.f32 %r1266, 0f00000000, %r1234, %p192; + selp.f32 %r1267, 0f00000000, %r1235, %p192; + selp.f32 %r1268, 0f00000000, %r1236, %p192; + selp.f32 %r1269, 0f00000000, %r1237, %p192; + selp.f32 %r1270, 0f00000000, %r1238, %p192; + selp.f32 %r1271, 0f00000000, %r1239, %p192; + selp.f32 %r1272, 0f00000000, %r1240, %p192; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.ftz.f32 %r1275, %r1241, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1276, %r1275; + fma.rm.ftz.f32 %r1279, %r1276, %r156, %r155; + add.f32 %r1280, %r1279, 0fCB40007F; + neg.f32 %r1281, %r1280; + fma.rn.ftz.f32 %r1283, %r1241, %r160, %r1281; + fma.rn.ftz.f32 %r1285, %r1241, %r162, %r1283; + shl.b32 %r1286, %r1279, 23; + ex2.approx.ftz.f32 %r1287, %r1285; + mul.f32 %r1288, %r1287, %r1286; + fma.rn.ftz.f32 %r1289, %r1242, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1290, %r1289; + fma.rm.ftz.f32 %r1291, %r1290, %r156, %r155; + add.f32 %r1292, %r1291, 0fCB40007F; + neg.f32 %r1293, %r1292; + fma.rn.ftz.f32 %r1294, %r1242, %r160, %r1293; + fma.rn.ftz.f32 %r1295, %r1242, %r162, %r1294; + shl.b32 %r1296, %r1291, 23; + ex2.approx.ftz.f32 %r1297, %r1295; + mul.f32 %r1298, %r1297, %r1296; + fma.rn.ftz.f32 %r1299, %r1243, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1300, %r1299; + fma.rm.ftz.f32 %r1301, %r1300, %r156, %r155; + add.f32 %r1302, %r1301, 0fCB40007F; + neg.f32 %r1303, %r1302; + fma.rn.ftz.f32 %r1304, %r1243, %r160, %r1303; + fma.rn.ftz.f32 %r1305, %r1243, %r162, %r1304; + shl.b32 %r1306, %r1301, 23; + ex2.approx.ftz.f32 %r1307, %r1305; + mul.f32 %r1308, %r1307, %r1306; + fma.rn.ftz.f32 %r1309, %r1244, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1310, %r1309; + fma.rm.ftz.f32 %r1311, %r1310, %r156, %r155; + add.f32 %r1312, %r1311, 0fCB40007F; + neg.f32 %r1313, %r1312; + fma.rn.ftz.f32 %r1314, %r1244, %r160, %r1313; + fma.rn.ftz.f32 %r1315, %r1244, %r162, %r1314; + shl.b32 %r1316, %r1311, 23; + ex2.approx.ftz.f32 %r1317, %r1315; + mul.f32 %r1318, %r1317, %r1316; + fma.rn.ftz.f32 %r1319, %r1245, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1320, %r1319; + fma.rm.ftz.f32 %r1321, %r1320, %r156, %r155; + add.f32 %r1322, %r1321, 0fCB40007F; + neg.f32 %r1323, %r1322; + fma.rn.ftz.f32 %r1324, %r1245, %r160, %r1323; + fma.rn.ftz.f32 %r1325, %r1245, %r162, %r1324; + shl.b32 %r1326, %r1321, 23; + ex2.approx.ftz.f32 %r1327, %r1325; + mul.f32 %r1328, %r1327, %r1326; + fma.rn.ftz.f32 %r1329, %r1246, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1330, %r1329; + fma.rm.ftz.f32 %r1331, %r1330, %r156, %r155; + add.f32 %r1332, %r1331, 0fCB40007F; + neg.f32 %r1333, %r1332; + fma.rn.ftz.f32 %r1334, %r1246, %r160, %r1333; + fma.rn.ftz.f32 %r1335, %r1246, %r162, %r1334; + shl.b32 %r1336, %r1331, 23; + ex2.approx.ftz.f32 %r1337, %r1335; + mul.f32 %r1338, %r1337, %r1336; + fma.rn.ftz.f32 %r1339, %r1247, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1340, %r1339; + fma.rm.ftz.f32 %r1341, %r1340, %r156, %r155; + add.f32 %r1342, %r1341, 0fCB40007F; + neg.f32 %r1343, %r1342; + fma.rn.ftz.f32 %r1344, %r1247, %r160, %r1343; + fma.rn.ftz.f32 %r1345, %r1247, %r162, %r1344; + shl.b32 %r1346, %r1341, 23; + ex2.approx.ftz.f32 %r1347, %r1345; + mul.f32 %r1348, %r1347, %r1346; + fma.rn.ftz.f32 %r1349, %r1248, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1350, %r1349; + fma.rm.ftz.f32 %r1351, %r1350, %r156, %r155; + add.f32 %r1352, %r1351, 0fCB40007F; + neg.f32 %r1353, %r1352; + fma.rn.ftz.f32 %r1354, %r1248, %r160, %r1353; + fma.rn.ftz.f32 %r1355, %r1248, %r162, %r1354; + shl.b32 %r1356, %r1351, 23; + ex2.approx.ftz.f32 %r1357, %r1355; + mul.f32 %r1358, %r1357, %r1356; + fma.rn.ftz.f32 %r1359, %r1249, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1360, %r1359; + fma.rm.ftz.f32 %r1361, %r1360, %r156, %r155; + add.f32 %r1362, %r1361, 0fCB40007F; + neg.f32 %r1363, %r1362; + fma.rn.ftz.f32 %r1364, %r1249, %r160, %r1363; + fma.rn.ftz.f32 %r1365, %r1249, %r162, %r1364; + shl.b32 %r1366, %r1361, 23; + ex2.approx.ftz.f32 %r1367, %r1365; + mul.f32 %r1368, %r1367, %r1366; + fma.rn.ftz.f32 %r1369, %r1250, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1370, %r1369; + fma.rm.ftz.f32 %r1371, %r1370, %r156, %r155; + add.f32 %r1372, %r1371, 0fCB40007F; + neg.f32 %r1373, %r1372; + fma.rn.ftz.f32 %r1374, %r1250, %r160, %r1373; + fma.rn.ftz.f32 %r1375, %r1250, %r162, %r1374; + shl.b32 %r1376, %r1371, 23; + ex2.approx.ftz.f32 %r1377, %r1375; + mul.f32 %r1378, %r1377, %r1376; + fma.rn.ftz.f32 %r1379, %r1251, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1380, %r1379; + fma.rm.ftz.f32 %r1381, %r1380, %r156, %r155; + add.f32 %r1382, %r1381, 0fCB40007F; + neg.f32 %r1383, %r1382; + fma.rn.ftz.f32 %r1384, %r1251, %r160, %r1383; + fma.rn.ftz.f32 %r1385, %r1251, %r162, %r1384; + shl.b32 %r1386, %r1381, 23; + ex2.approx.ftz.f32 %r1387, %r1385; + mul.f32 %r1388, %r1387, %r1386; + fma.rn.ftz.f32 %r1389, %r1252, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1390, %r1389; + fma.rm.ftz.f32 %r1391, %r1390, %r156, %r155; + add.f32 %r1392, %r1391, 0fCB40007F; + neg.f32 %r1393, %r1392; + fma.rn.ftz.f32 %r1394, %r1252, %r160, %r1393; + fma.rn.ftz.f32 %r1395, %r1252, %r162, %r1394; + shl.b32 %r1396, %r1391, 23; + ex2.approx.ftz.f32 %r1397, %r1395; + mul.f32 %r1398, %r1397, %r1396; + fma.rn.ftz.f32 %r1399, %r1253, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1400, %r1399; + fma.rm.ftz.f32 %r1401, %r1400, %r156, %r155; + add.f32 %r1402, %r1401, 0fCB40007F; + neg.f32 %r1403, %r1402; + fma.rn.ftz.f32 %r1404, %r1253, %r160, %r1403; + fma.rn.ftz.f32 %r1405, %r1253, %r162, %r1404; + shl.b32 %r1406, %r1401, 23; + ex2.approx.ftz.f32 %r1407, %r1405; + mul.f32 %r1408, %r1407, %r1406; + fma.rn.ftz.f32 %r1409, %r1254, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1410, %r1409; + fma.rm.ftz.f32 %r1411, %r1410, %r156, %r155; + add.f32 %r1412, %r1411, 0fCB40007F; + neg.f32 %r1413, %r1412; + fma.rn.ftz.f32 %r1414, %r1254, %r160, %r1413; + fma.rn.ftz.f32 %r1415, %r1254, %r162, %r1414; + shl.b32 %r1416, %r1411, 23; + ex2.approx.ftz.f32 %r1417, %r1415; + mul.f32 %r1418, %r1417, %r1416; + fma.rn.ftz.f32 %r1419, %r1255, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1420, %r1419; + fma.rm.ftz.f32 %r1421, %r1420, %r156, %r155; + add.f32 %r1422, %r1421, 0fCB40007F; + neg.f32 %r1423, %r1422; + fma.rn.ftz.f32 %r1424, %r1255, %r160, %r1423; + fma.rn.ftz.f32 %r1425, %r1255, %r162, %r1424; + shl.b32 %r1426, %r1421, 23; + ex2.approx.ftz.f32 %r1427, %r1425; + mul.f32 %r1428, %r1427, %r1426; + fma.rn.ftz.f32 %r1429, %r1256, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1430, %r1429; + fma.rm.ftz.f32 %r1431, %r1430, %r156, %r155; + add.f32 %r1432, %r1431, 0fCB40007F; + neg.f32 %r1433, %r1432; + fma.rn.ftz.f32 %r1434, %r1256, %r160, %r1433; + fma.rn.ftz.f32 %r1435, %r1256, %r162, %r1434; + shl.b32 %r1436, %r1431, 23; + ex2.approx.ftz.f32 %r1437, %r1435; + mul.f32 %r1438, %r1437, %r1436; + fma.rn.ftz.f32 %r1439, %r1257, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1440, %r1439; + fma.rm.ftz.f32 %r1441, %r1440, %r156, %r155; + add.f32 %r1442, %r1441, 0fCB40007F; + neg.f32 %r1443, %r1442; + fma.rn.ftz.f32 %r1444, %r1257, %r160, %r1443; + fma.rn.ftz.f32 %r1445, %r1257, %r162, %r1444; + shl.b32 %r1446, %r1441, 23; + ex2.approx.ftz.f32 %r1447, %r1445; + mul.f32 %r1448, %r1447, %r1446; + fma.rn.ftz.f32 %r1449, %r1258, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1450, %r1449; + fma.rm.ftz.f32 %r1451, %r1450, %r156, %r155; + add.f32 %r1452, %r1451, 0fCB40007F; + neg.f32 %r1453, %r1452; + fma.rn.ftz.f32 %r1454, %r1258, %r160, %r1453; + fma.rn.ftz.f32 %r1455, %r1258, %r162, %r1454; + shl.b32 %r1456, %r1451, 23; + ex2.approx.ftz.f32 %r1457, %r1455; + mul.f32 %r1458, %r1457, %r1456; + fma.rn.ftz.f32 %r1459, %r1259, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1460, %r1459; + fma.rm.ftz.f32 %r1461, %r1460, %r156, %r155; + add.f32 %r1462, %r1461, 0fCB40007F; + neg.f32 %r1463, %r1462; + fma.rn.ftz.f32 %r1464, %r1259, %r160, %r1463; + fma.rn.ftz.f32 %r1465, %r1259, %r162, %r1464; + shl.b32 %r1466, %r1461, 23; + ex2.approx.ftz.f32 %r1467, %r1465; + mul.f32 %r1468, %r1467, %r1466; + fma.rn.ftz.f32 %r1469, %r1260, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1470, %r1469; + fma.rm.ftz.f32 %r1471, %r1470, %r156, %r155; + add.f32 %r1472, %r1471, 0fCB40007F; + neg.f32 %r1473, %r1472; + fma.rn.ftz.f32 %r1474, %r1260, %r160, %r1473; + fma.rn.ftz.f32 %r1475, %r1260, %r162, %r1474; + shl.b32 %r1476, %r1471, 23; + ex2.approx.ftz.f32 %r1477, %r1475; + mul.f32 %r1478, %r1477, %r1476; + fma.rn.ftz.f32 %r1479, %r1261, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1480, %r1479; + fma.rm.ftz.f32 %r1481, %r1480, %r156, %r155; + add.f32 %r1482, %r1481, 0fCB40007F; + neg.f32 %r1483, %r1482; + fma.rn.ftz.f32 %r1484, %r1261, %r160, %r1483; + fma.rn.ftz.f32 %r1485, %r1261, %r162, %r1484; + shl.b32 %r1486, %r1481, 23; + ex2.approx.ftz.f32 %r1487, %r1485; + mul.f32 %r1488, %r1487, %r1486; + fma.rn.ftz.f32 %r1489, %r1262, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1490, %r1489; + fma.rm.ftz.f32 %r1491, %r1490, %r156, %r155; + add.f32 %r1492, %r1491, 0fCB40007F; + neg.f32 %r1493, %r1492; + fma.rn.ftz.f32 %r1494, %r1262, %r160, %r1493; + fma.rn.ftz.f32 %r1495, %r1262, %r162, %r1494; + shl.b32 %r1496, %r1491, 23; + ex2.approx.ftz.f32 %r1497, %r1495; + mul.f32 %r1498, %r1497, %r1496; + fma.rn.ftz.f32 %r1499, %r1263, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1500, %r1499; + fma.rm.ftz.f32 %r1501, %r1500, %r156, %r155; + add.f32 %r1502, %r1501, 0fCB40007F; + neg.f32 %r1503, %r1502; + fma.rn.ftz.f32 %r1504, %r1263, %r160, %r1503; + fma.rn.ftz.f32 %r1505, %r1263, %r162, %r1504; + shl.b32 %r1506, %r1501, 23; + ex2.approx.ftz.f32 %r1507, %r1505; + mul.f32 %r1508, %r1507, %r1506; + fma.rn.ftz.f32 %r1509, %r1264, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1510, %r1509; + fma.rm.ftz.f32 %r1511, %r1510, %r156, %r155; + add.f32 %r1512, %r1511, 0fCB40007F; + neg.f32 %r1513, %r1512; + fma.rn.ftz.f32 %r1514, %r1264, %r160, %r1513; + fma.rn.ftz.f32 %r1515, %r1264, %r162, %r1514; + shl.b32 %r1516, %r1511, 23; + ex2.approx.ftz.f32 %r1517, %r1515; + mul.f32 %r1518, %r1517, %r1516; + fma.rn.ftz.f32 %r1519, %r1265, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1520, %r1519; + fma.rm.ftz.f32 %r1521, %r1520, %r156, %r155; + add.f32 %r1522, %r1521, 0fCB40007F; + neg.f32 %r1523, %r1522; + fma.rn.ftz.f32 %r1524, %r1265, %r160, %r1523; + fma.rn.ftz.f32 %r1525, %r1265, %r162, %r1524; + shl.b32 %r1526, %r1521, 23; + ex2.approx.ftz.f32 %r1527, %r1525; + mul.f32 %r1528, %r1527, %r1526; + fma.rn.ftz.f32 %r1529, %r1266, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1530, %r1529; + fma.rm.ftz.f32 %r1531, %r1530, %r156, %r155; + add.f32 %r1532, %r1531, 0fCB40007F; + neg.f32 %r1533, %r1532; + fma.rn.ftz.f32 %r1534, %r1266, %r160, %r1533; + fma.rn.ftz.f32 %r1535, %r1266, %r162, %r1534; + shl.b32 %r1536, %r1531, 23; + ex2.approx.ftz.f32 %r1537, %r1535; + mul.f32 %r1538, %r1537, %r1536; + fma.rn.ftz.f32 %r1539, %r1267, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1540, %r1539; + fma.rm.ftz.f32 %r1541, %r1540, %r156, %r155; + fma.rn.ftz.f32 %r1542, %r1268, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1543, %r1542; + fma.rm.ftz.f32 %r1544, %r1543, %r156, %r155; + fma.rn.ftz.f32 %r1545, %r1269, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1546, %r1545; + fma.rm.ftz.f32 %r1547, %r1546, %r156, %r155; + fma.rn.ftz.f32 %r1548, %r1270, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1549, %r1548; + fma.rm.ftz.f32 %r1550, %r1549, %r156, %r155; + fma.rn.ftz.f32 %r1551, %r1271, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1552, %r1551; + fma.rm.ftz.f32 %r1553, %r1552, %r156, %r155; + fma.rn.ftz.f32 %r1554, %r1272, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1555, %r1554; + fma.rm.ftz.f32 %r1556, %r1555, %r156, %r155; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1557, %r1558}, %rd253; + mul.f32 %r1559, %r1558, %r1298; + mov.b64 {%r1560, %r1561}, %rd254; + mov.b64 {%r1562, %r1563}, %rd255; + mov.b64 {%r1564, %r1565}, %rd256; + mov.b64 {%r1566, %r1567}, %rd257; + mov.b64 {%r1568, %r1569}, %rd258; + mov.b64 {%r1570, %r1571}, %rd259; + mov.b64 {%r1572, %r1573}, %rd260; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd207, {%r1541, %r1544}; + cvt.u32.u64 %r1574, %rd207; + add.f32 %r1575, %r1541, 0fCB40007F; + neg.f32 %r1576, %r1575; + fma.rn.ftz.f32 %r1577, %r1267, %r160, %r1576; + fma.rn.ftz.f32 %r1578, %r1267, %r162, %r1577; + ex2.approx.ftz.f32 %r1579, %r1578; + add.f32 %r1580, %r1544, 0fCB40007F; + neg.f32 %r1581, %r1580; + fma.rn.ftz.f32 %r1582, %r1268, %r160, %r1581; + fma.rn.ftz.f32 %r1583, %r1268, %r162, %r1582; + shl.b64 %rd208, %rd207, 23; + and.b64 %rd209, %rd208, -36028797018963968; + shl.b32 %r1584, %r1574, 23; + cvt.u64.u32 %rd210, %r1584; + or.b64 %rd211, %rd210, %rd209; + ex2.approx.ftz.f32 %r1585, %r1583; + mov.b64 {%r1586, %r1587}, %rd211; + mul.f32 %r1588, %r1585, %r1587; + mul.f32 %r1589, %r1579, %r1586; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1590, %r1591}, %rd251; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd212, {%r1547, %r1550}; + cvt.u32.u64 %r1592, %rd212; + add.f32 %r1593, %r1547, 0fCB40007F; + neg.f32 %r1594, %r1593; + fma.rn.ftz.f32 %r1595, %r1269, %r160, %r1594; + fma.rn.ftz.f32 %r1596, %r1269, %r162, %r1595; + ex2.approx.ftz.f32 %r1597, %r1596; + add.f32 %r1598, %r1550, 0fCB40007F; + neg.f32 %r1599, %r1598; + fma.rn.ftz.f32 %r1600, %r1270, %r160, %r1599; + fma.rn.ftz.f32 %r1601, %r1270, %r162, %r1600; + shl.b64 %rd213, %rd212, 23; + and.b64 %rd214, %rd213, -36028797018963968; + shl.b32 %r1602, %r1592, 23; + cvt.u64.u32 %rd215, %r1602; + or.b64 %rd216, %rd215, %rd214; + ex2.approx.ftz.f32 %r1603, %r1601; + mov.b64 {%r1604, %r1605}, %rd216; + mul.f32 %r1606, %r1603, %r1605; + mul.f32 %r1607, %r1597, %r1604; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1608, %r1609}, %rd249; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd217, {%r1553, %r1556}; + cvt.u32.u64 %r1610, %rd217; + add.f32 %r1611, %r1553, 0fCB40007F; + neg.f32 %r1612, %r1611; + fma.rn.ftz.f32 %r1613, %r1271, %r160, %r1612; + fma.rn.ftz.f32 %r1614, %r1271, %r162, %r1613; + ex2.approx.ftz.f32 %r1615, %r1614; + add.f32 %r1616, %r1556, 0fCB40007F; + neg.f32 %r1617, %r1616; + fma.rn.ftz.f32 %r1618, %r1272, %r160, %r1617; + fma.rn.ftz.f32 %r1619, %r1272, %r162, %r1618; + shl.b64 %rd218, %rd217, 23; + and.b64 %rd219, %rd218, -36028797018963968; + shl.b32 %r1620, %r1610, 23; + cvt.u64.u32 %rd220, %r1620; + or.b64 %rd221, %rd220, %rd219; + ex2.approx.ftz.f32 %r1621, %r1619; + mov.b64 {%r1622, %r1623}, %rd221; + mul.f32 %r1624, %r1621, %r1623; + mul.f32 %r1625, %r1615, %r1622; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 {%r1626, %r1627}, %rd247; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.f32 %r1628, %r1557, %r1288, %r1559; + fma.rn.f32 %r1629, %r1560, %r1308, %r1628; + fma.rn.f32 %r1630, %r1561, %r1318, %r1629; + fma.rn.f32 %r1631, %r1562, %r1328, %r1630; + fma.rn.f32 %r1632, %r1563, %r1338, %r1631; + fma.rn.f32 %r1633, %r1564, %r1348, %r1632; + fma.rn.f32 %r1634, %r1565, %r1358, %r1633; + fma.rn.f32 %r1635, %r1566, %r1368, %r1634; + fma.rn.f32 %r1636, %r1567, %r1378, %r1635; + fma.rn.f32 %r1637, %r1568, %r1388, %r1636; + fma.rn.f32 %r1638, %r1569, %r1398, %r1637; + fma.rn.f32 %r1639, %r1570, %r1408, %r1638; + fma.rn.f32 %r1640, %r1571, %r1418, %r1639; + fma.rn.f32 %r1641, %r1572, %r1428, %r1640; + fma.rn.f32 %r1642, %r1573, %r1438, %r1641; + fma.rn.f32 %r1643, %r2188, %r1448, %r1642; + fma.rn.f32 %r1644, %r2189, %r1458, %r1643; + fma.rn.f32 %r1645, %r2190, %r1468, %r1644; + fma.rn.f32 %r1646, %r2191, %r1478, %r1645; + fma.rn.f32 %r1647, %r2192, %r1488, %r1646; + fma.rn.f32 %r1648, %r2193, %r1498, %r1647; + fma.rn.f32 %r1649, %r2194, %r1508, %r1648; + fma.rn.f32 %r1650, %r2195, %r1518, %r1649; + fma.rn.f32 %r1651, %r2196, %r1528, %r1650; + fma.rn.f32 %r1652, %r2197, %r1538, %r1651; + fma.rn.f32 %r1653, %r1590, %r1589, %r1652; + fma.rn.f32 %r1654, %r1591, %r1588, %r1653; + fma.rn.f32 %r1655, %r1608, %r1607, %r1654; + fma.rn.f32 %r1656, %r1609, %r1606, %r1655; + fma.rn.f32 %r1657, %r1626, %r1625, %r1656; + fma.rn.f32 %r1658, %r1627, %r1624, %r1657; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1659, %r1658, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1660, %r1658, %r1659; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1661, %r1660, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1662, %r1660, %r1661; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1663, %r1662, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1664, %r1662, %r1663; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1665, %r1664, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1666, %r1664, %r1665; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1667, %r1666, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1090, %r1666, %r1667; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p105 st.shared.b32 [ %r1083 + 0 ], %r1090; + // end inline asm + bar.sync 0; + // begin inline asm + @%p106 ld.shared.b32 %r1091, [ %r1086 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1668, %r1091, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1669, %r1091, %r1668; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1670, %r1669, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1671, %r1669, %r1670; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1672, %r1671, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1673, %r1671, %r1672; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1674, %r1673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r1094, %r1673, %r1674; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p107 st.shared.b32 [ %r1086 + 0 ], %r1094; + // end inline asm + bar.sync 0; + ld.shared.b32 %r40, [global_smem]; + shl.b32 %r1675, %r2, 4; + add.s32 %r41, %r1195, %r1675; + xor.b32 %r1676, %r1675, 64; + add.s32 %r42, %r1195, %r1676; + shl.b32 %r1677, %r1, 3; + and.b32 %r1678, %r1677, 4080; + and.b32 %r1679, %r1, 1; + neg.s32 %r1680, %r1679; + and.b32 %r1681, %r1680, 8256; + xor.b32 %r1682, %r1681, %r1678; + add.s32 %r43, %r1195, %r1682; +$L__tmp16: + .loc 1 52 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:40 + mul.wide.u32 %rd55, %r2, 4; + mov.b64 %rd273, 0; + mov.pred %p193, -1; + mov.pred %p207, %p193; +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:0:40 + mov.pred %p2, %p207; + .loc 1 53 31 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:53:31 + or.b64 %rd243, %rd273, %rd1; + or.b64 %rd244, %rd273, %rd55; + or.b64 %rd245, %rd244, 14336; + .loc 1 54 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:54:29 + setp.lt.u64 %p196, %rd243, 32000; + setp.lt.u64 %p204, %rd245, 32000; + .loc 1 58 41 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:41 + cvt.u32.u64 %r1747, %rd273; + or.b32 %r1748, %r3, %r1747; + add.s32 %r1749, %r1748, %r6; + or.b32 %r1750, %r4, %r1747; + add.s32 %r1751, %r1750, %r6; + or.b32 %r1752, %r5, %r1747; + add.s32 %r1753, %r1752, %r6; + cvt.u32.u64 %r1754, %rd243; + add.s32 %r1755, %r6, %r1754; + cvt.u32.u64 %r1756, %rd244; + add.s32 %r1757, %r6, %r1756; + add.s32 %r1758, %r1757, 2048; + add.s32 %r1759, %r1757, 4096; + add.s32 %r1760, %r1757, 6144; + add.s32 %r1761, %r1757, 8192; + add.s32 %r1762, %r1757, 10240; + add.s32 %r1763, %r1757, 12288; + cvt.u32.u64 %r1764, %rd245; + add.s32 %r1765, %r6, %r1764; + .loc 1 58 34 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:34 + mad.wide.s32 %rd223, %r1749, 2, %rd57; + mad.wide.s32 %rd226, %r1751, 2, %rd57; + mad.wide.s32 %rd229, %r1753, 2, %rd57; + mad.wide.s32 %rd232, %r1755, 2, %rd57; + .loc 1 58 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:52 + // begin inline asm + mov.u64 %rd222, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd222, 1.0; + // end inline asm + mov.b32 %r1687, 0; + // begin inline asm + mov.u32 %r1683, %r1687; + mov.u32 %r1684, %r1687; + mov.u32 %r1685, %r1687; + mov.u32 %r1686, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1683, %r1684, %r1685, %r1686 }, [ %rd223 + 0 ], %rd222; + // end inline asm + mov.b32 {%rs33, %rs34}, %r1683; + mov.b32 {%rs35, %rs36}, %r1684; + mov.b32 {%rs37, %rs38}, %r1685; + mov.b32 {%rs39, %rs40}, %r1686; + // begin inline asm + mov.u64 %rd225, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd225, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1691, %r1687; + mov.u32 %r1692, %r1687; + mov.u32 %r1693, %r1687; + mov.u32 %r1694, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1691, %r1692, %r1693, %r1694 }, [ %rd226 + 0 ], %rd225; + // end inline asm + mov.b32 {%rs41, %rs42}, %r1691; + mov.b32 {%rs43, %rs44}, %r1692; + mov.b32 {%rs45, %rs46}, %r1693; + mov.b32 {%rs47, %rs48}, %r1694; + // begin inline asm + mov.u64 %rd228, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd228, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1699, %r1687; + mov.u32 %r1700, %r1687; + mov.u32 %r1701, %r1687; + mov.u32 %r1702, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1699, %r1700, %r1701, %r1702 }, [ %rd229 + 0 ], %rd228; + // end inline asm + mov.b32 {%rs49, %rs50}, %r1699; + mov.b32 {%rs51, %rs52}, %r1700; + mov.b32 {%rs53, %rs54}, %r1701; + mov.b32 {%rs55, %rs56}, %r1702; + // begin inline asm + mov.u64 %rd231, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd231, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1707, %r1687; + mov.u32 %r1708, %r1687; + mov.u32 %r1709, %r1687; + mov.u32 %r1710, %r1687; + @%p196 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1707, %r1708, %r1709, %r1710 }, [ %rd232 + 0 ], %rd231; + // end inline asm + mov.b32 {%rs57, %rs58}, %r1707; + mov.b32 {%rs59, %rs60}, %r1708; + mov.b32 {%rs61, %rs62}, %r1709; + mov.b32 {%rs63, %rs64}, %r1710; + .loc 1 58 106 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:106 + cvt.f32.bf16 %r1766, %rs33; + cvt.f32.bf16 %r1767, %rs34; + cvt.f32.bf16 %r1768, %rs35; + cvt.f32.bf16 %r1769, %rs36; + cvt.f32.bf16 %r1770, %rs37; + cvt.f32.bf16 %r1771, %rs38; + cvt.f32.bf16 %r1772, %rs39; + cvt.f32.bf16 %r1773, %rs40; + cvt.f32.bf16 %r1774, %rs41; + cvt.f32.bf16 %r1775, %rs42; + cvt.f32.bf16 %r1776, %rs43; + cvt.f32.bf16 %r1777, %rs44; + cvt.f32.bf16 %r1778, %rs45; + cvt.f32.bf16 %r1779, %rs46; + cvt.f32.bf16 %r1780, %rs47; + cvt.f32.bf16 %r1781, %rs48; + cvt.f32.bf16 %r1782, %rs49; + cvt.f32.bf16 %r1783, %rs50; + cvt.f32.bf16 %r1784, %rs51; + cvt.f32.bf16 %r1785, %rs52; + cvt.f32.bf16 %r1786, %rs53; + cvt.f32.bf16 %r1787, %rs54; + cvt.f32.bf16 %r1788, %rs55; + cvt.f32.bf16 %r1789, %rs56; + cvt.f32.bf16 %r1790, %rs57; + cvt.f32.bf16 %r1791, %rs58; + cvt.f32.bf16 %r1792, %rs59; + cvt.f32.bf16 %r1793, %rs60; + cvt.f32.bf16 %r1794, %rs61; + cvt.f32.bf16 %r1795, %rs62; + cvt.f32.bf16 %r1796, %rs63; + cvt.f32.bf16 %r1797, %rs64; + .loc 1 60 22 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:60:22 + sub.f32 %r1798, %r1766, %r39; + sub.f32 %r1799, %r1767, %r39; + sub.f32 %r1800, %r1768, %r39; + sub.f32 %r1801, %r1769, %r39; + sub.f32 %r1802, %r1770, %r39; + sub.f32 %r1803, %r1771, %r39; + sub.f32 %r1804, %r1772, %r39; + sub.f32 %r1805, %r1773, %r39; + sub.f32 %r1806, %r1774, %r39; + sub.f32 %r1807, %r1775, %r39; + sub.f32 %r1808, %r1776, %r39; + sub.f32 %r1809, %r1777, %r39; + sub.f32 %r1810, %r1778, %r39; + sub.f32 %r1811, %r1779, %r39; + sub.f32 %r1812, %r1780, %r39; + sub.f32 %r1813, %r1781, %r39; + sub.f32 %r1814, %r1782, %r39; + sub.f32 %r1815, %r1783, %r39; + sub.f32 %r1816, %r1784, %r39; + sub.f32 %r1817, %r1785, %r39; + sub.f32 %r1818, %r1786, %r39; + sub.f32 %r1819, %r1787, %r39; + sub.f32 %r1820, %r1788, %r39; + sub.f32 %r1821, %r1789, %r39; + sub.f32 %r1822, %r1790, %r39; + sub.f32 %r1823, %r1791, %r39; + sub.f32 %r1824, %r1792, %r39; + sub.f32 %r1825, %r1793, %r39; + sub.f32 %r1826, %r1794, %r39; + sub.f32 %r1827, %r1795, %r39; + sub.f32 %r1828, %r1796, %r39; + sub.f32 %r1829, %r1797, %r39; + mov.b32 %r1830, 0f3F000000; + mov.b32 %r1831, 0f3BBB989D; + .loc 1 61 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:61:29 + fma.rn.ftz.f32 %r1832, %r1798, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1833, %r1832; + mov.b32 %r1834, 0f4B400001; + mov.b32 %r1835, 0f437C0000; + fma.rm.ftz.f32 %r1836, %r1833, %r1835, %r1834; + add.f32 %r1837, %r1836, 0fCB40007F; + neg.f32 %r1838, %r1837; + mov.b32 %r1839, 0f3FB8AA3B; + fma.rn.ftz.f32 %r1840, %r1798, %r1839, %r1838; + mov.b32 %r1841, 0f32A57060; + fma.rn.ftz.f32 %r1842, %r1798, %r1841, %r1840; + shl.b32 %r1843, %r1836, 23; + ex2.approx.ftz.f32 %r1844, %r1842; + mul.f32 %r1845, %r1844, %r1843; + fma.rn.ftz.f32 %r1846, %r1799, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1847, %r1846; + fma.rm.ftz.f32 %r1848, %r1847, %r1835, %r1834; + add.f32 %r1849, %r1848, 0fCB40007F; + neg.f32 %r1850, %r1849; + fma.rn.ftz.f32 %r1851, %r1799, %r1839, %r1850; + fma.rn.ftz.f32 %r1852, %r1799, %r1841, %r1851; + shl.b32 %r1853, %r1848, 23; + ex2.approx.ftz.f32 %r1854, %r1852; + mul.f32 %r1855, %r1854, %r1853; + fma.rn.ftz.f32 %r1856, %r1800, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1857, %r1856; + fma.rm.ftz.f32 %r1858, %r1857, %r1835, %r1834; + add.f32 %r1859, %r1858, 0fCB40007F; + neg.f32 %r1860, %r1859; + fma.rn.ftz.f32 %r1861, %r1800, %r1839, %r1860; + fma.rn.ftz.f32 %r1862, %r1800, %r1841, %r1861; + shl.b32 %r1863, %r1858, 23; + ex2.approx.ftz.f32 %r1864, %r1862; + mul.f32 %r1865, %r1864, %r1863; + fma.rn.ftz.f32 %r1866, %r1801, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1867, %r1866; + fma.rm.ftz.f32 %r1868, %r1867, %r1835, %r1834; + add.f32 %r1869, %r1868, 0fCB40007F; + neg.f32 %r1870, %r1869; + fma.rn.ftz.f32 %r1871, %r1801, %r1839, %r1870; + fma.rn.ftz.f32 %r1872, %r1801, %r1841, %r1871; + shl.b32 %r1873, %r1868, 23; + ex2.approx.ftz.f32 %r1874, %r1872; + mul.f32 %r1875, %r1874, %r1873; + fma.rn.ftz.f32 %r1876, %r1802, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1877, %r1876; + fma.rm.ftz.f32 %r1878, %r1877, %r1835, %r1834; + add.f32 %r1879, %r1878, 0fCB40007F; + neg.f32 %r1880, %r1879; + fma.rn.ftz.f32 %r1881, %r1802, %r1839, %r1880; + fma.rn.ftz.f32 %r1882, %r1802, %r1841, %r1881; + shl.b32 %r1883, %r1878, 23; + ex2.approx.ftz.f32 %r1884, %r1882; + mul.f32 %r1885, %r1884, %r1883; + fma.rn.ftz.f32 %r1886, %r1803, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1887, %r1886; + fma.rm.ftz.f32 %r1888, %r1887, %r1835, %r1834; + add.f32 %r1889, %r1888, 0fCB40007F; + neg.f32 %r1890, %r1889; + fma.rn.ftz.f32 %r1891, %r1803, %r1839, %r1890; + fma.rn.ftz.f32 %r1892, %r1803, %r1841, %r1891; + shl.b32 %r1893, %r1888, 23; + ex2.approx.ftz.f32 %r1894, %r1892; + mul.f32 %r1895, %r1894, %r1893; + fma.rn.ftz.f32 %r1896, %r1804, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1897, %r1896; + fma.rm.ftz.f32 %r1898, %r1897, %r1835, %r1834; + add.f32 %r1899, %r1898, 0fCB40007F; + neg.f32 %r1900, %r1899; + fma.rn.ftz.f32 %r1901, %r1804, %r1839, %r1900; + fma.rn.ftz.f32 %r1902, %r1804, %r1841, %r1901; + shl.b32 %r1903, %r1898, 23; + ex2.approx.ftz.f32 %r1904, %r1902; + mul.f32 %r1905, %r1904, %r1903; + fma.rn.ftz.f32 %r1906, %r1805, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1907, %r1906; + fma.rm.ftz.f32 %r1908, %r1907, %r1835, %r1834; + add.f32 %r1909, %r1908, 0fCB40007F; + neg.f32 %r1910, %r1909; + fma.rn.ftz.f32 %r1911, %r1805, %r1839, %r1910; + fma.rn.ftz.f32 %r1912, %r1805, %r1841, %r1911; + shl.b32 %r1913, %r1908, 23; + ex2.approx.ftz.f32 %r1914, %r1912; + mul.f32 %r1915, %r1914, %r1913; + fma.rn.ftz.f32 %r1916, %r1806, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1917, %r1916; + fma.rm.ftz.f32 %r1918, %r1917, %r1835, %r1834; + add.f32 %r1919, %r1918, 0fCB40007F; + neg.f32 %r1920, %r1919; + fma.rn.ftz.f32 %r1921, %r1806, %r1839, %r1920; + fma.rn.ftz.f32 %r1922, %r1806, %r1841, %r1921; + shl.b32 %r1923, %r1918, 23; + ex2.approx.ftz.f32 %r1924, %r1922; + mul.f32 %r1925, %r1924, %r1923; + fma.rn.ftz.f32 %r1926, %r1807, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1927, %r1926; + fma.rm.ftz.f32 %r1928, %r1927, %r1835, %r1834; + add.f32 %r1929, %r1928, 0fCB40007F; + neg.f32 %r1930, %r1929; + fma.rn.ftz.f32 %r1931, %r1807, %r1839, %r1930; + fma.rn.ftz.f32 %r1932, %r1807, %r1841, %r1931; + shl.b32 %r1933, %r1928, 23; + ex2.approx.ftz.f32 %r1934, %r1932; + mul.f32 %r1935, %r1934, %r1933; + fma.rn.ftz.f32 %r1936, %r1808, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1937, %r1936; + fma.rm.ftz.f32 %r1938, %r1937, %r1835, %r1834; + add.f32 %r1939, %r1938, 0fCB40007F; + neg.f32 %r1940, %r1939; + fma.rn.ftz.f32 %r1941, %r1808, %r1839, %r1940; + fma.rn.ftz.f32 %r1942, %r1808, %r1841, %r1941; + shl.b32 %r1943, %r1938, 23; + ex2.approx.ftz.f32 %r1944, %r1942; + mul.f32 %r1945, %r1944, %r1943; + fma.rn.ftz.f32 %r1946, %r1809, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1947, %r1946; + fma.rm.ftz.f32 %r1948, %r1947, %r1835, %r1834; + add.f32 %r1949, %r1948, 0fCB40007F; + neg.f32 %r1950, %r1949; + fma.rn.ftz.f32 %r1951, %r1809, %r1839, %r1950; + fma.rn.ftz.f32 %r1952, %r1809, %r1841, %r1951; + shl.b32 %r1953, %r1948, 23; + ex2.approx.ftz.f32 %r1954, %r1952; + mul.f32 %r1955, %r1954, %r1953; + fma.rn.ftz.f32 %r1956, %r1810, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1957, %r1956; + fma.rm.ftz.f32 %r1958, %r1957, %r1835, %r1834; + add.f32 %r1959, %r1958, 0fCB40007F; + neg.f32 %r1960, %r1959; + fma.rn.ftz.f32 %r1961, %r1810, %r1839, %r1960; + fma.rn.ftz.f32 %r1962, %r1810, %r1841, %r1961; + shl.b32 %r1963, %r1958, 23; + ex2.approx.ftz.f32 %r1964, %r1962; + mul.f32 %r1965, %r1964, %r1963; + fma.rn.ftz.f32 %r1966, %r1811, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1967, %r1966; + fma.rm.ftz.f32 %r1968, %r1967, %r1835, %r1834; + add.f32 %r1969, %r1968, 0fCB40007F; + neg.f32 %r1970, %r1969; + fma.rn.ftz.f32 %r1971, %r1811, %r1839, %r1970; + fma.rn.ftz.f32 %r1972, %r1811, %r1841, %r1971; + shl.b32 %r1973, %r1968, 23; + ex2.approx.ftz.f32 %r1974, %r1972; + mul.f32 %r1975, %r1974, %r1973; + fma.rn.ftz.f32 %r1976, %r1812, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1977, %r1976; + fma.rm.ftz.f32 %r1978, %r1977, %r1835, %r1834; + add.f32 %r1979, %r1978, 0fCB40007F; + neg.f32 %r1980, %r1979; + fma.rn.ftz.f32 %r1981, %r1812, %r1839, %r1980; + fma.rn.ftz.f32 %r1982, %r1812, %r1841, %r1981; + shl.b32 %r1983, %r1978, 23; + ex2.approx.ftz.f32 %r1984, %r1982; + mul.f32 %r1985, %r1984, %r1983; + fma.rn.ftz.f32 %r1986, %r1813, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1987, %r1986; + fma.rm.ftz.f32 %r1988, %r1987, %r1835, %r1834; + add.f32 %r1989, %r1988, 0fCB40007F; + neg.f32 %r1990, %r1989; + fma.rn.ftz.f32 %r1991, %r1813, %r1839, %r1990; + fma.rn.ftz.f32 %r1992, %r1813, %r1841, %r1991; + shl.b32 %r1993, %r1988, 23; + ex2.approx.ftz.f32 %r1994, %r1992; + mul.f32 %r1995, %r1994, %r1993; + fma.rn.ftz.f32 %r1996, %r1814, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1997, %r1996; + fma.rm.ftz.f32 %r1998, %r1997, %r1835, %r1834; + add.f32 %r1999, %r1998, 0fCB40007F; + neg.f32 %r2000, %r1999; + fma.rn.ftz.f32 %r2001, %r1814, %r1839, %r2000; + fma.rn.ftz.f32 %r2002, %r1814, %r1841, %r2001; + shl.b32 %r2003, %r1998, 23; + ex2.approx.ftz.f32 %r2004, %r2002; + mul.f32 %r2005, %r2004, %r2003; + fma.rn.ftz.f32 %r2006, %r1815, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2007, %r2006; + fma.rm.ftz.f32 %r2008, %r2007, %r1835, %r1834; + add.f32 %r2009, %r2008, 0fCB40007F; + neg.f32 %r2010, %r2009; + fma.rn.ftz.f32 %r2011, %r1815, %r1839, %r2010; + fma.rn.ftz.f32 %r2012, %r1815, %r1841, %r2011; + shl.b32 %r2013, %r2008, 23; + ex2.approx.ftz.f32 %r2014, %r2012; + mul.f32 %r2015, %r2014, %r2013; + fma.rn.ftz.f32 %r2016, %r1816, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2017, %r2016; + fma.rm.ftz.f32 %r2018, %r2017, %r1835, %r1834; + add.f32 %r2019, %r2018, 0fCB40007F; + neg.f32 %r2020, %r2019; + fma.rn.ftz.f32 %r2021, %r1816, %r1839, %r2020; + fma.rn.ftz.f32 %r2022, %r1816, %r1841, %r2021; + shl.b32 %r2023, %r2018, 23; + ex2.approx.ftz.f32 %r2024, %r2022; + mul.f32 %r2025, %r2024, %r2023; + fma.rn.ftz.f32 %r2026, %r1817, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2027, %r2026; + fma.rm.ftz.f32 %r2028, %r2027, %r1835, %r1834; + add.f32 %r2029, %r2028, 0fCB40007F; + neg.f32 %r2030, %r2029; + fma.rn.ftz.f32 %r2031, %r1817, %r1839, %r2030; + fma.rn.ftz.f32 %r2032, %r1817, %r1841, %r2031; + shl.b32 %r2033, %r2028, 23; + ex2.approx.ftz.f32 %r2034, %r2032; + mul.f32 %r2035, %r2034, %r2033; + fma.rn.ftz.f32 %r2036, %r1818, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2037, %r2036; + fma.rm.ftz.f32 %r2038, %r2037, %r1835, %r1834; + add.f32 %r2039, %r2038, 0fCB40007F; + neg.f32 %r2040, %r2039; + fma.rn.ftz.f32 %r2041, %r1818, %r1839, %r2040; + fma.rn.ftz.f32 %r2042, %r1818, %r1841, %r2041; + shl.b32 %r2043, %r2038, 23; + ex2.approx.ftz.f32 %r2044, %r2042; + mul.f32 %r2045, %r2044, %r2043; + fma.rn.ftz.f32 %r2046, %r1819, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2047, %r2046; + fma.rm.ftz.f32 %r2048, %r2047, %r1835, %r1834; + add.f32 %r2049, %r2048, 0fCB40007F; + neg.f32 %r2050, %r2049; + fma.rn.ftz.f32 %r2051, %r1819, %r1839, %r2050; + fma.rn.ftz.f32 %r2052, %r1819, %r1841, %r2051; + shl.b32 %r2053, %r2048, 23; + ex2.approx.ftz.f32 %r2054, %r2052; + mul.f32 %r2055, %r2054, %r2053; + fma.rn.ftz.f32 %r2056, %r1820, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2057, %r2056; + fma.rm.ftz.f32 %r2058, %r2057, %r1835, %r1834; + add.f32 %r2059, %r2058, 0fCB40007F; + neg.f32 %r2060, %r2059; + fma.rn.ftz.f32 %r2061, %r1820, %r1839, %r2060; + fma.rn.ftz.f32 %r2062, %r1820, %r1841, %r2061; + shl.b32 %r2063, %r2058, 23; + ex2.approx.ftz.f32 %r2064, %r2062; + mul.f32 %r2065, %r2064, %r2063; + fma.rn.ftz.f32 %r2066, %r1821, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2067, %r2066; + fma.rm.ftz.f32 %r2068, %r2067, %r1835, %r1834; + add.f32 %r2069, %r2068, 0fCB40007F; + neg.f32 %r2070, %r2069; + fma.rn.ftz.f32 %r2071, %r1821, %r1839, %r2070; + fma.rn.ftz.f32 %r2072, %r1821, %r1841, %r2071; + shl.b32 %r2073, %r2068, 23; + ex2.approx.ftz.f32 %r2074, %r2072; + mul.f32 %r2075, %r2074, %r2073; + fma.rn.ftz.f32 %r2076, %r1822, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2077, %r2076; + fma.rm.ftz.f32 %r2078, %r2077, %r1835, %r1834; + add.f32 %r2079, %r2078, 0fCB40007F; + neg.f32 %r2080, %r2079; + fma.rn.ftz.f32 %r2081, %r1822, %r1839, %r2080; + fma.rn.ftz.f32 %r2082, %r1822, %r1841, %r2081; + shl.b32 %r2083, %r2078, 23; + ex2.approx.ftz.f32 %r2084, %r2082; + mul.f32 %r2085, %r2084, %r2083; + fma.rn.ftz.f32 %r2086, %r1823, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2087, %r2086; + fma.rm.ftz.f32 %r2088, %r2087, %r1835, %r1834; + add.f32 %r2089, %r2088, 0fCB40007F; + neg.f32 %r2090, %r2089; + fma.rn.ftz.f32 %r2091, %r1823, %r1839, %r2090; + fma.rn.ftz.f32 %r2092, %r1823, %r1841, %r2091; + shl.b32 %r2093, %r2088, 23; + ex2.approx.ftz.f32 %r2094, %r2092; + mul.f32 %r2095, %r2094, %r2093; + fma.rn.ftz.f32 %r2096, %r1824, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2097, %r2096; + fma.rm.ftz.f32 %r2098, %r2097, %r1835, %r1834; + add.f32 %r2099, %r2098, 0fCB40007F; + neg.f32 %r2100, %r2099; + fma.rn.ftz.f32 %r2101, %r1824, %r1839, %r2100; + fma.rn.ftz.f32 %r2102, %r1824, %r1841, %r2101; + shl.b32 %r2103, %r2098, 23; + ex2.approx.ftz.f32 %r2104, %r2102; + mul.f32 %r2105, %r2104, %r2103; + fma.rn.ftz.f32 %r2106, %r1825, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2107, %r2106; + fma.rm.ftz.f32 %r2108, %r2107, %r1835, %r1834; + add.f32 %r2109, %r2108, 0fCB40007F; + neg.f32 %r2110, %r2109; + fma.rn.ftz.f32 %r2111, %r1825, %r1839, %r2110; + fma.rn.ftz.f32 %r2112, %r1825, %r1841, %r2111; + shl.b32 %r2113, %r2108, 23; + ex2.approx.ftz.f32 %r2114, %r2112; + mul.f32 %r2115, %r2114, %r2113; + fma.rn.ftz.f32 %r2116, %r1826, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2117, %r2116; + fma.rm.ftz.f32 %r2118, %r2117, %r1835, %r1834; + add.f32 %r2119, %r2118, 0fCB40007F; + neg.f32 %r2120, %r2119; + fma.rn.ftz.f32 %r2121, %r1826, %r1839, %r2120; + fma.rn.ftz.f32 %r2122, %r1826, %r1841, %r2121; + shl.b32 %r2123, %r2118, 23; + ex2.approx.ftz.f32 %r2124, %r2122; + mul.f32 %r2125, %r2124, %r2123; + fma.rn.ftz.f32 %r2126, %r1827, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2127, %r2126; + fma.rm.ftz.f32 %r2128, %r2127, %r1835, %r1834; + add.f32 %r2129, %r2128, 0fCB40007F; + neg.f32 %r2130, %r2129; + fma.rn.ftz.f32 %r2131, %r1827, %r1839, %r2130; + fma.rn.ftz.f32 %r2132, %r1827, %r1841, %r2131; + shl.b32 %r2133, %r2128, 23; + ex2.approx.ftz.f32 %r2134, %r2132; + mul.f32 %r2135, %r2134, %r2133; + fma.rn.ftz.f32 %r2136, %r1828, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2137, %r2136; + fma.rm.ftz.f32 %r2138, %r2137, %r1835, %r1834; + add.f32 %r2139, %r2138, 0fCB40007F; + neg.f32 %r2140, %r2139; + fma.rn.ftz.f32 %r2141, %r1828, %r1839, %r2140; + fma.rn.ftz.f32 %r2142, %r1828, %r1841, %r2141; + shl.b32 %r2143, %r2138, 23; + ex2.approx.ftz.f32 %r2144, %r2142; + mul.f32 %r2145, %r2144, %r2143; + fma.rn.ftz.f32 %r2146, %r1829, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2147, %r2146; + fma.rm.ftz.f32 %r2148, %r2147, %r1835, %r1834; + add.f32 %r2149, %r2148, 0fCB40007F; + neg.f32 %r2150, %r2149; + fma.rn.ftz.f32 %r2151, %r1829, %r1839, %r2150; + fma.rn.ftz.f32 %r2152, %r1829, %r1841, %r2151; + shl.b32 %r2153, %r2148, 23; + ex2.approx.ftz.f32 %r2154, %r2152; + mul.f32 %r2155, %r2154, %r2153; + .loc 1 62 23 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:62:23 + div.full.f32 %r2156, %r1845, %r40; + div.full.f32 %r2157, %r1855, %r40; + div.full.f32 %r2158, %r1865, %r40; + div.full.f32 %r2159, %r1875, %r40; + div.full.f32 %r2160, %r1885, %r40; + div.full.f32 %r2161, %r1895, %r40; + div.full.f32 %r2162, %r1905, %r40; + div.full.f32 %r2163, %r1915, %r40; + div.full.f32 %r2164, %r1925, %r40; + div.full.f32 %r2165, %r1935, %r40; + div.full.f32 %r2166, %r1945, %r40; + div.full.f32 %r2167, %r1955, %r40; + div.full.f32 %r2168, %r1965, %r40; + div.full.f32 %r2169, %r1975, %r40; + div.full.f32 %r2170, %r1985, %r40; + div.full.f32 %r2171, %r1995, %r40; + div.full.f32 %r2172, %r2005, %r40; + div.full.f32 %r2173, %r2015, %r40; + div.full.f32 %r2174, %r2025, %r40; + div.full.f32 %r2175, %r2035, %r40; + div.full.f32 %r2176, %r2045, %r40; + div.full.f32 %r2177, %r2055, %r40; + div.full.f32 %r2178, %r2065, %r40; + div.full.f32 %r2179, %r2075, %r40; + div.full.f32 %r2180, %r2085, %r40; + div.full.f32 %r2181, %r2095, %r40; + div.full.f32 %r2182, %r2105, %r40; + div.full.f32 %r2183, %r2115, %r40; + div.full.f32 %r2184, %r2125, %r40; + div.full.f32 %r2185, %r2135, %r40; + div.full.f32 %r2186, %r2145, %r40; + div.full.f32 %r2187, %r2155, %r40; + .loc 1 63 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:29 + mad.wide.s32 %rd234, %r1757, 4, %rd58; + mad.wide.s32 %rd235, %r1758, 4, %rd58; + mad.wide.s32 %rd236, %r1759, 4, %rd58; + mad.wide.s32 %rd237, %r1760, 4, %rd58; + mad.wide.s32 %rd238, %r1761, 4, %rd58; + mad.wide.s32 %rd239, %r1762, 4, %rd58; + mad.wide.s32 %rd240, %r1763, 4, %rd58; + mad.wide.s32 %rd241, %r1765, 4, %rd58; + .loc 1 63 53 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:53 + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2156, %r2157, %r2158, %r2159}; + st.shared.v4.b32 [%r42+8192], {%r2160, %r2161, %r2162, %r2163}; + bar.sync 0; + ld.shared.v4.b32 {%r1715, %r1716, %r1717, %r1718}, [%r43]; + ld.shared.v4.b32 {%r1719, %r1720, %r1721, %r1722}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2164, %r2165, %r2166, %r2167}; + st.shared.v4.b32 [%r42+8192], {%r2168, %r2169, %r2170, %r2171}; + bar.sync 0; + ld.shared.v4.b32 {%r1723, %r1724, %r1725, %r1726}, [%r43]; + ld.shared.v4.b32 {%r1727, %r1728, %r1729, %r1730}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2172, %r2173, %r2174, %r2175}; + st.shared.v4.b32 [%r42+8192], {%r2176, %r2177, %r2178, %r2179}; + bar.sync 0; + ld.shared.v4.b32 {%r1731, %r1732, %r1733, %r1734}, [%r43]; + ld.shared.v4.b32 {%r1735, %r1736, %r1737, %r1738}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2180, %r2181, %r2182, %r2183}; + st.shared.v4.b32 [%r42+8192], {%r2184, %r2185, %r2186, %r2187}; + bar.sync 0; + ld.shared.v4.b32 {%r1739, %r1740, %r1741, %r1742}, [%r43]; + ld.shared.v4.b32 {%r1743, %r1744, %r1745, %r1746}, [%r43+4096]; + // begin inline asm + @%p193 st.global.v4.b32 [ %rd234 + 0 ], { %r1715, %r1716, %r1717, %r1718 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd235 + 0 ], { %r1719, %r1720, %r1721, %r1722 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd236 + 0 ], { %r1723, %r1724, %r1725, %r1726 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd237 + 0 ], { %r1727, %r1728, %r1729, %r1730 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd238 + 0 ], { %r1731, %r1732, %r1733, %r1734 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd239 + 0 ], { %r1735, %r1736, %r1737, %r1738 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd240 + 0 ], { %r1739, %r1740, %r1741, %r1742 }; + // end inline asm + // begin inline asm + @%p204 st.global.v4.b32 [ %rd241 + 0 ], { %r1743, %r1744, %r1745, %r1746 }; + // end inline asm + mov.b64 %rd273, 16384; + mov.pred %p207, 0; + .loc 1 52 40 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:40 + @%p2 bra $L__BB0_3; +// %bb.4: + .loc 1 52 4 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:4 + ret; +$L__tmp17: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 122 +.b8 109 +.b8 109 +.b8 114 +.b8 108 +.b8 117 +.b8 51 +.b8 117 +.b8 54 +.b8 55 +.b8 106 +.b8 110 +.b8 51 +.b8 103 +.b8 105 +.b8 116 +.b8 102 +.b8 119 +.b8 121 +.b8 117 +.b8 51 +.b8 106 +.b8 105 +.b8 102 +.b8 51 +.b8 98 +.b8 51 +.b8 112 +.b8 119 +.b8 111 +.b8 120 +.b8 106 +.b8 51 +.b8 55 +.b8 120 +.b8 112 +.b8 52 +.b8 100 +.b8 100 +.b8 97 +.b8 99 +.b8 107 +.b8 99 +.b8 117 +.b8 102 +.b8 119 +.b8 105 +.b8 113 +.b8 114 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 99 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp14 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp15 // DW_AT_low_pc +.b64 $L__tmp16 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7ff042ca65e13230a7fa9bca537538eae0c28d90 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x16384xi1> loc(#loc119) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32> -> tensor<1x16384xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16384__(1,)cconstexpr_fp32_"() : () -> tensor<1x16384xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c16384_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x16384xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x16384xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x16384xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x16384xf32> to tensor<1x16384xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_16384S_fp32S1_16384S_fp32S1_16384S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x16384xf32>, tensor<1x16384xf32>, tensor<1x16384xf32>) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_16384S_fp32S1_16384S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x16384xf32>, tensor<1x16384xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c16384_i32_13 = arith.constant 16384 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c16384_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x16384xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x16384xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x16384xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x16384xf32> to tensor<1x16384xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x16384xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x16384xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x16384x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16384__(1,)cconstexpr_fp32_"() -> tensor<1x16384xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x16384xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16384xf32> loc(#loc47) + tt.return %0 : tensor<1x16384xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_16384S_fp32S1_16384S_fp32S1_16384S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x16384xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x16384xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x16384xf32> loc("rhs_max"(#loc48))) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_16384S_fp32S1_16384S__(%lhs_max, %rhs_max) : (tensor<1x16384xf32>, tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x16384xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x16384xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x16384xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x16384xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x16384xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x16384xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16384xf32> loc(#loc61) + %1 = ub.poison : tensor<1x16384xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_16384S_fp32S1_16384S__(%a: tensor<1x16384xf32> loc("a"(#loc62)), %b: tensor<1x16384xf32> loc("b"(#loc62))) -> tensor<1x16384xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x16384xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_16384S__(%a) : (tensor<1x16384xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x16384xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x16384xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x16384xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x16384xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x16384xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc69) + tt.return %2 : tensor<1x16384xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16384xf32> loc(#loc71) + tt.return %3 : tensor<1x16384xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_16384S__(%x: tensor<1x16384xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_16384S__(%x) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_16384S__(%x: tensor<1x16384xf32> loc("x"(#loc76))) -> tensor<1x16384xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x16384xf32> loc(#loc78) + tt.return %4 : tensor<1x16384xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16384xf32> loc(#loc80) + tt.return %5 : tensor<1x16384xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%x: tensor<1x16384xf32> loc("x"(#loc81))) -> tensor<1x16384xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc82) + tt.return %0 : tensor<1x16384xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16384xf32> loc(#loc84) + tt.return %1 : tensor<1x16384xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_16384S_fp32S1_16384S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x16384xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x16384xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_16384S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x16384xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x16384xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x16384xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_16384S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_16384S__(1,)cconstexpr_1_"(%a: tensor<1x16384xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_16384S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16384xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1b8ede453029fd4691323534f1812956217dbfca --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,239 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("out_ptr2"(#loc)) +#loc60 = loc("xnumel"(#loc)) +#loc61 = loc("r0_numel"(#loc)) +#loc87 = loc("out_max"(#loc32)) +#loc94 = loc("out_sum"(#loc41)) +#loc120 = loc(callsite(#loc87 at #loc33)) +#loc127 = loc(callsite(#loc94 at #loc33)) +#loc136 = loc(callsite(#loc1 at #loc120)) +#loc139 = loc(callsite(#loc1 at #loc127)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x16384xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<1x16384xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x16384xbf16, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc63) + %r0_base_6 = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc63) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16384xi32, #blocked1> loc(#loc63) + %r0_base_8 = tt.expand_dims %r0_base_6 {axis = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16384xi32, #blocked> loc(#loc63) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc64) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc106) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr, #blocked1> loc(#loc66) + %_tmp3_sum:2 = scf.for %_tmp3_sum_17 = %c0_i32 to %c32000_i32 step %c16384_i32 iter_args(%arg5 = %cst_5, %arg6 = %cst_4) -> (tensor<1x16384xf32, #blocked1>, tensor<1x16384xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp3_sum_17 : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc68) + %r0_index_18 = arith.addi %r0_index, %r0_base_7 : tensor<1x16384xi32, #blocked1> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_18, %cst_0 : tensor<1x16384xi32, #blocked1> loc(#loc69) + %tmp0_19 = arith.addi %r0_index_18, %tmp0_9 : tensor<1x16384xi32, #blocked1> loc(#loc65) + %tmp0_20 = tt.addptr %tmp0_10, %tmp0_19 : tensor<1x16384x!tt.ptr, #blocked1>, tensor<1x16384xi32, #blocked1> loc(#loc66) + %tmp0_21 = tt.load %tmp0_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr, #blocked1> loc(#loc70) + %tmp0_22 = arith.extf %tmp0_21 : tensor<1x16384xbf16, #blocked1> to tensor<1x16384xf32, #blocked1> loc(#loc71) + %mask = arith.cmpf ogt, %arg5, %tmp0_22 : tensor<1x16384xf32, #blocked1> loc(#loc129) + %mask_23 = arith.cmpf une, %arg5, %arg5 : tensor<1x16384xf32, #blocked1> loc(#loc130) + %mask_24 = arith.ori %mask, %mask_23 : tensor<1x16384xi1, #blocked1> loc(#loc131) + %out_max_25 = arith.select %mask_24, %arg5, %tmp0_22 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc132) + %lhs_scale = arith.cmpf oeq, %out_max_25, %cst_5 : tensor<1x16384xf32, #blocked1> loc(#loc111) + %lhs_scale_26 = arith.subf %arg5, %out_max_25 : tensor<1x16384xf32, #blocked1> loc(#loc112) + %lhs_scale_27 = tt.extern_elementwise %lhs_scale_26 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc133) + %lhs_scale_28 = arith.select %lhs_scale, %cst_3, %lhs_scale_27 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc114) + %rhs_scale = arith.subf %tmp0_22, %out_max_25 : tensor<1x16384xf32, #blocked1> loc(#loc115) + %rhs_scale_29 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc134) + %rhs_scale_30 = arith.select %lhs_scale, %cst_3, %rhs_scale_29 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc117) + %out_sum_31 = arith.mulf %arg6, %lhs_scale_28 : tensor<1x16384xf32, #blocked1> loc(#loc118) + %out_sum_32 = arith.addf %out_sum_31, %rhs_scale_30 : tensor<1x16384xf32, #blocked1> loc(#loc119) + %_tmp3_max = arith.select %r0_mask, %out_max_25, %arg5 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc85) + %_tmp3_sum_33 = arith.select %r0_mask, %out_sum_32, %arg6 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc86) + scf.yield %_tmp3_max, %_tmp3_sum_33 : tensor<1x16384xf32, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc30) + } loc(#loc107) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_17: f32 loc(callsite(#loc1 at #loc120)), %out_max_18: f32 loc(callsite(#loc1 at #loc120))): + %mask = arith.cmpf ogt, %out_max_17, %out_max_18 : f32 loc(#loc140) + %mask_19 = arith.cmpf une, %out_max_17, %out_max_17 : f32 loc(#loc141) + %mask_20 = arith.ori %mask, %mask_19 : i1 loc(#loc142) + %out_max_21 = arith.select %mask_20, %out_max_17, %out_max_18 : f32 loc(#loc143) + tt.reduce.return %out_max_21 : f32 loc(#loc135) + }) : (tensor<1x16384xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc135) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc121) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_2 : tensor<1x1xf32, #blocked1> loc(#loc122) + %delta_11 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked1> -> tensor<1x16384xf32, #blocked1> loc(#loc123) + %delta_12 = arith.subf %_tmp3_sum#0, %delta_11 : tensor<1x16384xf32, #blocked1> loc(#loc123) + %delta_13 = tt.broadcast %delta : tensor<1x1xi1, #blocked1> -> tensor<1x16384xi1, #blocked1> loc(#loc124) + %delta_14 = arith.select %delta_13, %cst_4, %delta_12 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc124) + %out_sum = tt.extern_elementwise %delta_14 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc137) + %out_sum_15 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x16384xf32, #blocked1> loc(#loc126) + %out_sum_16 = "tt.reduce"(%out_sum_15) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_17: f32 loc(callsite(#loc1 at #loc127)), %out_sum_18: f32 loc(callsite(#loc1 at #loc127))): + %out_sum_19 = arith.addf %out_sum_17, %out_sum_18 : f32 loc(#loc144) + tt.reduce.return %out_sum_19 : f32 loc(#loc138) + }) : (tensor<1x16384xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc138) + %tmp4 = tt.expand_dims %out_sum_16 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc95) + %tmp5 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32, #blocked> loc(#loc128) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked1> -> tensor<1x16384xf32, #blocked1> loc(#loc98) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr, #blocked> loc(#loc47) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc99) + %r0_index_17 = tt.splat %r0_offset : i32 -> tensor<1x16384xi32, #blocked> loc(#loc99) + %r0_index_18 = arith.addi %r0_index, %r0_base_7 : tensor<1x16384xi32, #blocked1> loc(#loc99) + %r0_index_19 = arith.addi %r0_index_17, %r0_base_8 : tensor<1x16384xi32, #blocked> loc(#loc99) + %r0_mask = arith.cmpi slt, %r0_index_18, %cst_0 : tensor<1x16384xi32, #blocked1> loc(#loc100) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %cst : tensor<1x16384xi32, #blocked> loc(#loc100) + %tmp5_21 = arith.addi %r0_index_18, %tmp0_9 : tensor<1x16384xi32, #blocked1> loc(#loc96) + %tmp5_22 = arith.addi %r0_index_19, %tmp5 : tensor<1x16384xi32, #blocked> loc(#loc96) + %tmp5_23 = tt.addptr %tmp0_10, %tmp5_21 : tensor<1x16384x!tt.ptr, #blocked1>, tensor<1x16384xi32, #blocked1> loc(#loc101) + %tmp5_24 = tt.load %tmp5_23, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr, #blocked1> loc(#loc102) + %tmp5_25 = arith.extf %tmp5_24 : tensor<1x16384xbf16, #blocked1> to tensor<1x16384xf32, #blocked1> loc(#loc103) + %tmp7 = arith.subf %tmp5_25, %delta_11 : tensor<1x16384xf32, #blocked1> loc(#loc104) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc105) + %tmp9_26 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32, #blocked1> loc(#loc98) + %1 = tt.addptr %0, %tmp5_22 : tensor<1x16384x!tt.ptr, #blocked>, tensor<1x16384xi32, #blocked> loc(#loc47) + %2 = ttg.convert_layout %tmp9_26 : tensor<1x16384xf32, #blocked1> -> tensor<1x16384xf32, #blocked> loc(#loc56) + tt.store %1, %2, %r0_mask_20 : tensor<1x16384x!tt.ptr, #blocked> loc(#loc56) + } loc(#loc48) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc62 = loc("xoffset"(#loc2)) +#loc63 = loc("r0_base"(#loc3)) +#loc64 = loc("tmp0"(#loc4)) +#loc65 = loc("tmp0"(#loc5)) +#loc66 = loc("tmp0"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("mask"(#loc12)) +#loc73 = loc("out_max"(#loc13)) +#loc74 = loc("mask"(#loc15)) +#loc75 = loc("mask"(#loc16)) +#loc76 = loc("lhs_scale"(#loc18)) +#loc77 = loc("lhs_scale"(#loc19)) +#loc78 = loc("lhs_scale"(#loc21)) +#loc79 = loc("lhs_scale"(#loc22)) +#loc80 = loc("rhs_scale"(#loc23)) +#loc81 = loc("rhs_scale"(#loc24)) +#loc82 = loc("rhs_scale"(#loc25)) +#loc83 = loc("out_sum"(#loc26)) +#loc84 = loc("out_sum"(#loc27)) +#loc85 = loc("_tmp3_max"(#loc28)) +#loc86 = loc("_tmp3_sum"(#loc29)) +#loc88 = loc("out_max_keepdim"(#loc34)) +#loc89 = loc("delta"(#loc35)) +#loc90 = loc("delta"(#loc36)) +#loc91 = loc("delta"(#loc37)) +#loc92 = loc("out_sum"(#loc38)) +#loc93 = loc("out_sum"(#loc39)) +#loc95 = loc("tmp4"(#loc43)) +#loc96 = loc("tmp5"(#loc44)) +#loc97 = loc("tmp5"(#loc45)) +#loc98 = loc("tmp9"(#loc46)) +#loc99 = loc("r0_index"(#loc49)) +#loc100 = loc("r0_mask"(#loc50)) +#loc101 = loc("tmp5"(#loc51)) +#loc102 = loc("tmp5"(#loc52)) +#loc103 = loc("tmp5"(#loc53)) +#loc104 = loc("tmp7"(#loc54)) +#loc105 = loc("tmp8"(#loc55)) +#loc106 = loc(fused[#loc65, #loc64]) +#loc107 = loc("_tmp3_sum"(#loc67)) +#loc108 = loc("mask"(#loc72)) +#loc109 = loc(callsite(#loc73 at #loc14)) +#loc110 = loc("mask"(#loc75)) +#loc111 = loc(callsite(#loc76 at #loc14)) +#loc112 = loc(callsite(#loc77 at #loc14)) +#loc113 = loc(callsite(#loc78 at #loc14)) +#loc114 = loc(callsite(#loc79 at #loc14)) +#loc115 = loc(callsite(#loc80 at #loc14)) +#loc116 = loc(callsite(#loc81 at #loc14)) +#loc117 = loc(callsite(#loc82 at #loc14)) +#loc118 = loc(callsite(#loc83 at #loc14)) +#loc119 = loc(callsite(#loc84 at #loc14)) +#loc121 = loc(callsite(#loc88 at #loc33)) +#loc122 = loc(callsite(#loc89 at #loc33)) +#loc123 = loc(callsite(#loc90 at #loc33)) +#loc124 = loc(callsite(#loc91 at #loc33)) +#loc125 = loc(callsite(#loc92 at #loc33)) +#loc126 = loc(callsite(#loc93 at #loc33)) +#loc128 = loc(fused[#loc96, #loc97]) +#loc129 = loc(callsite(#loc108 at #loc109)) +#loc130 = loc(callsite(#loc74 at #loc109)) +#loc131 = loc(callsite(#loc110 at #loc109)) +#loc132 = loc(callsite(#loc17 at #loc109)) +#loc133 = loc(callsite(#loc20 at #loc113)) +#loc134 = loc(callsite(#loc20 at #loc116)) +#loc135 = loc(callsite(#loc31 at #loc120)) +#loc137 = loc(callsite(#loc20 at #loc125)) +#loc138 = loc(callsite(#loc40 at #loc127)) +#loc140 = loc(callsite(#loc108 at #loc135)) +#loc141 = loc(callsite(#loc74 at #loc135)) +#loc142 = loc(callsite(#loc110 at #loc135)) +#loc143 = loc(callsite(#loc17 at #loc135)) +#loc144 = loc(callsite(#loc42 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a2a92e4ef67f2870989c3e55c818cf129984e357 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CAJAL4G4QCCA2OHTKHFQ6XGSKEBIWX75FTZDOT4AZMGRWIXBIYPA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,233 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc3 = loc(unknown) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("out_ptr2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +#loc62 = loc("r0_numel"(#loc)) +#loc90 = loc("out_max"(#loc35)) +#loc96 = loc("out_sum"(#loc42)) +#loc123 = loc(callsite(#loc90 at #loc2)) +#loc129 = loc(callsite(#loc96 at #loc2)) +#loc138 = loc(callsite(#loc3 at #loc123)) +#loc141 = loc(callsite(#loc3 at #loc129)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc108) + %cst = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc3) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x16384xbf16> loc(#loc3) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_2 = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc3) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32> loc(#loc65) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32> -> tensor<1x16384xi32> loc(#loc66) + %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc68) + %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x16384xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x16384xi32> loc(#loc69) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc70) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32> loc(#loc110) + %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x16384xi32> loc(#loc71) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc72) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc72) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr> loc(#loc73) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_19 : tensor<1x16384xf32> loc(#loc131) + %mask_20 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x16384xf32> loc(#loc132) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x16384xi1> loc(#loc133) + %out_max_22 = arith.select %mask_21, %_tmp3_max, %tmp0_19 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc134) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_3 : tensor<1x16384xf32> loc(#loc114) + %lhs_scale_23 = arith.subf %_tmp3_max, %out_max_22 : tensor<1x16384xf32> loc(#loc115) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc135) + %lhs_scale_25 = arith.select %lhs_scale, %cst, %lhs_scale_24 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc117) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x16384xf32> loc(#loc118) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc136) + %rhs_scale_27 = arith.select %lhs_scale, %cst, %rhs_scale_26 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc120) + %out_sum_28 = arith.mulf %_tmp3_sum_12, %lhs_scale_25 : tensor<1x16384xf32> loc(#loc121) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x16384xf32> loc(#loc122) + %_tmp3_max_30 = arith.select %r0_mask, %out_max_22, %_tmp3_max : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc88) + %_tmp3_sum_31 = arith.select %r0_mask, %out_sum_29, %_tmp3_sum_12 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc89) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc33) + } loc(#loc109) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc123)), %out_max_13: f32 loc(callsite(#loc3 at #loc123))): + %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc142) + %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc143) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc144) + %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc145) + tt.reduce.return %out_max_16 : f32 loc(#loc137) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc137) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc124) + %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc108) + %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc125) + %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x16384xf32> loc(#loc125) + %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x16384xi1> loc(#loc126) + %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc126) + %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc139) + %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x16384xf32> loc(#loc128) + %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc129)), %out_sum_13: f32 loc(callsite(#loc3 at #loc129))): + %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc146) + tt.reduce.return %out_sum_14 : f32 loc(#loc140) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc140) + %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc98) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x16384xi32> loc(#loc98) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x16384xi32> loc(#loc99) + %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc100) + %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x16384xi32> loc(#loc130) + %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x16384xi32> loc(#loc101) + %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc102) + %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc102) + %tmp5_17 = tt.load %tmp5_16, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr> loc(#loc103) + %tmp5_18 = arith.extf %tmp5_17 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc104) + %tmp7 = arith.subf %tmp5_18, %delta_6 : tensor<1x16384xf32> loc(#loc105) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc106) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc107) + %tmp9_19 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32> loc(#loc107) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc56) + %1 = tt.addptr %0, %tmp5_14 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc56) + tt.store %1, %tmp9_19, %r0_mask : tensor<1x16384x!tt.ptr> loc(#loc57) + } loc(#loc45) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc63 = loc("delta"(#loc1)) +#loc64 = loc("xoffset"(#loc4)) +#loc65 = loc("r0_base"(#loc5)) +#loc66 = loc("r0_base"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("tmp0"(#loc12)) +#loc73 = loc("tmp0"(#loc13)) +#loc74 = loc("tmp0"(#loc14)) +#loc75 = loc("mask"(#loc15)) +#loc76 = loc("out_max"(#loc16)) +#loc77 = loc("mask"(#loc18)) +#loc78 = loc("mask"(#loc19)) +#loc79 = loc("lhs_scale"(#loc21)) +#loc80 = loc("lhs_scale"(#loc22)) +#loc81 = loc("lhs_scale"(#loc24)) +#loc82 = loc("lhs_scale"(#loc25)) +#loc83 = loc("rhs_scale"(#loc26)) +#loc84 = loc("rhs_scale"(#loc27)) +#loc85 = loc("rhs_scale"(#loc28)) +#loc86 = loc("out_sum"(#loc29)) +#loc87 = loc("out_sum"(#loc30)) +#loc88 = loc("_tmp3_max"(#loc31)) +#loc89 = loc("_tmp3_sum"(#loc32)) +#loc91 = loc("out_max_keepdim"(#loc36)) +#loc92 = loc("delta"(#loc37)) +#loc93 = loc("delta"(#loc38)) +#loc94 = loc("out_sum"(#loc39)) +#loc95 = loc("out_sum"(#loc40)) +#loc97 = loc("tmp4"(#loc44)) +#loc98 = loc("r0_index"(#loc46)) +#loc99 = loc("r0_mask"(#loc47)) +#loc100 = loc("tmp5"(#loc48)) +#loc101 = loc("tmp5"(#loc49)) +#loc102 = loc("tmp5"(#loc50)) +#loc103 = loc("tmp5"(#loc51)) +#loc104 = loc("tmp5"(#loc52)) +#loc105 = loc("tmp7"(#loc53)) +#loc106 = loc("tmp8"(#loc54)) +#loc107 = loc("tmp9"(#loc55)) +#loc108 = loc(callsite(#loc63 at #loc2)) +#loc109 = loc("_tmp3_sum"(#loc67)) +#loc110 = loc(fused[#loc71, #loc70]) +#loc111 = loc("mask"(#loc75)) +#loc112 = loc(callsite(#loc76 at #loc17)) +#loc113 = loc("mask"(#loc78)) +#loc114 = loc(callsite(#loc79 at #loc17)) +#loc115 = loc(callsite(#loc80 at #loc17)) +#loc116 = loc(callsite(#loc81 at #loc17)) +#loc117 = loc(callsite(#loc82 at #loc17)) +#loc118 = loc(callsite(#loc83 at #loc17)) +#loc119 = loc(callsite(#loc84 at #loc17)) +#loc120 = loc(callsite(#loc85 at #loc17)) +#loc121 = loc(callsite(#loc86 at #loc17)) +#loc122 = loc(callsite(#loc87 at #loc17)) +#loc124 = loc(callsite(#loc91 at #loc2)) +#loc125 = loc(callsite(#loc92 at #loc2)) +#loc126 = loc(callsite(#loc93 at #loc2)) +#loc127 = loc(callsite(#loc94 at #loc2)) +#loc128 = loc(callsite(#loc95 at #loc2)) +#loc130 = loc(fused[#loc101, #loc100]) +#loc131 = loc(callsite(#loc111 at #loc112)) +#loc132 = loc(callsite(#loc77 at #loc112)) +#loc133 = loc(callsite(#loc113 at #loc112)) +#loc134 = loc(callsite(#loc20 at #loc112)) +#loc135 = loc(callsite(#loc23 at #loc116)) +#loc136 = loc(callsite(#loc23 at #loc119)) +#loc137 = loc(callsite(#loc34 at #loc123)) +#loc139 = loc(callsite(#loc23 at #loc127)) +#loc140 = loc(callsite(#loc41 at #loc129)) +#loc142 = loc(callsite(#loc111 at #loc137)) +#loc143 = loc(callsite(#loc77 at #loc137)) +#loc144 = loc(callsite(#loc113 at #loc137)) +#loc145 = loc(callsite(#loc20 at #loc137)) +#loc146 = loc(callsite(#loc43 at #loc140)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/__grp__triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/__grp__triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b6aa7b821e26006bbd04c55706b37eb8b1b8286d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/__grp__triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4e1cf257358f929d6e74b972a5d5129c655bc8c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"hash": "11439e487665d4a718fc7fe07145020ffe9025dda8f202bdb1ad8d5fd8d73bef", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..c0184308031a59520ac06820a0c6913261bf2972 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.llir @@ -0,0 +1,14439 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_zeros_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, i32 %18, i32 %19, i32 %20, i32 %21, i32 %22, i32 %23, i32 %24, i32 %25, i32 %26, ptr addrspace(1) readnone captures(none) %27, ptr addrspace(1) readnone captures(none) %28) local_unnamed_addr #0 !dbg !5 { + %30 = shl i32 %18, 12, !dbg !8 + %31 = shl i32 %19, 10, !dbg !9 + %32 = shl i32 %19, 7, !dbg !10 + %33 = icmp slt i32 %18, 2, !dbg !11 + %34 = zext i1 %33 to i32, !dbg !12 + %35 = icmp sgt i32 %18, 1, !dbg !13 + %36 = select i1 %35, i32 %18, i32 0, !dbg !14 + %37 = add i32 %36, %34, !dbg !15 + %38 = shl i32 %37, 12, !dbg !16 + %39 = shl i32 %37, 7, !dbg !17 + %40 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !18 + %41 = add i32 %19, 127, !dbg !19 + %42 = sdiv i32 %41, 128, !dbg !23 + %43 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !24 + %44 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !25 + %45 = and i32 %43, 1, !dbg !26 + %46 = mul i32 %32, %44, !dbg !27 + %47 = mul nuw nsw i32 %31, %45, !dbg !28 + %48 = add i32 %46, %47, !dbg !29 + %49 = sext i32 %48 to i64, !dbg !30 + %50 = mul i32 %31, %43, !dbg !31 + %51 = add i32 %46, %50, !dbg !32 + %52 = sext i32 %51 to i64, !dbg !33 + %53 = getelementptr bfloat, ptr addrspace(1) %1, i64 %49, !dbg !34 + %54 = getelementptr bfloat, ptr addrspace(1) %2, i64 %49, !dbg !35 + %55 = getelementptr bfloat, ptr addrspace(1) %7, i64 %52, !dbg !36 + %56 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !37 + %57 = lshr i32 %56, 5, !dbg !37 + %58 = and i32 %56, 240, !dbg !37 + %59 = lshr exact i32 %58, 4, !dbg !37 + %60 = or disjoint i32 %59, 16, !dbg !37 + %61 = or disjoint i32 %59, 32, !dbg !37 + %62 = or disjoint i32 %59, 48, !dbg !37 + %63 = or disjoint i32 %59, 64, !dbg !37 + %64 = or disjoint i32 %59, 80, !dbg !37 + %65 = or disjoint i32 %59, 96, !dbg !37 + %66 = or disjoint i32 %59, 112, !dbg !37 + %67 = lshr i32 %56, 1, !dbg !37 + %68 = and i32 %67, 112, !dbg !37 + %69 = lshr i32 %56, 2, !dbg !37 + %70 = and i32 %69, 7, !dbg !37 + %71 = or disjoint i32 %68, %70, !dbg !37 + %72 = or disjoint i32 %71, 8, !dbg !37 + %.not = icmp slt i32 %40, %42, !dbg !38 + br i1 %.not, label %4875, label %73, !dbg !39 + +73: ; preds = %29 + %74 = add i32 %18, 127, !dbg !40 + %75 = sdiv i32 %74, 128, !dbg !42 + %76 = sub i32 %40, %42, !dbg !43 + %.frozen = freeze i32 %76, !dbg !44 + %.frozen4059 = freeze i32 %75, !dbg !44 + %77 = sdiv i32 %.frozen, %.frozen4059, !dbg !44 + %78 = shl nuw nsw i32 %44, 2, !dbg !45 + %79 = add i32 %77, %78, !dbg !46 + %80 = mul i32 %77, %.frozen4059, !dbg !47 + %.decomposed = sub i32 %.frozen, %80, !dbg !47 + %81 = mul nuw nsw i32 %20, %45, !dbg !48 + %82 = add i32 %.decomposed, %81, !dbg !49 + %83 = mul nuw nsw i32 %21, %45, !dbg !50 + %reass.add = add i32 %.decomposed, %83, !dbg !51 + %reass.mul = mul i32 %reass.add, %22, !dbg !51 + %84 = shl i32 %79, 7, !dbg !52 + %85 = mul i32 %30, %43, !dbg !53 + %86 = add i32 %84, %85, !dbg !54 + %87 = sext i32 %86 to i64, !dbg !55 + %88 = mul i32 %79, %39, !dbg !56 + %89 = mul i32 %38, %43, !dbg !57 + %90 = add i32 %88, %89, !dbg !58 + %91 = sext i32 %90 to i64, !dbg !59 + %92 = shl nuw nsw i32 %43, 5, !dbg !60 + %93 = add i32 %79, %92, !dbg !61 + %94 = mul i32 %93, %18, !dbg !62 + %95 = sext i32 %94 to i64, !dbg !63 + %96 = getelementptr bfloat, ptr addrspace(1) %0, i64 %87, !dbg !64 + %97 = getelementptr bfloat, ptr addrspace(1) %5, i64 %91, !dbg !65 + %98 = getelementptr bfloat, ptr addrspace(1) %6, i64 %87, !dbg !66 + %99 = getelementptr float, ptr addrspace(1) %3, i64 %95, !dbg !67 + %100 = getelementptr float, ptr addrspace(1) %4, i64 %95, !dbg !68 + %101 = shl nsw i32 %.decomposed, 7, !dbg !69 + %102 = or disjoint i32 %101, %59, !dbg !70 + %103 = or disjoint i32 %101, %60, !dbg !70 + %104 = or disjoint i32 %101, %61, !dbg !70 + %105 = or disjoint i32 %101, %62, !dbg !70 + %106 = or disjoint i32 %101, %63, !dbg !70 + %107 = or disjoint i32 %101, %64, !dbg !70 + %108 = or disjoint i32 %101, %65, !dbg !70 + %109 = or disjoint i32 %101, %66, !dbg !70 + %110 = or disjoint i32 %101, %71, !dbg !70 + %111 = or disjoint i32 %101, %72, !dbg !70 + %112 = shl i32 %102, 12, !dbg !71 + %113 = shl i32 %103, 12, !dbg !71 + %114 = shl i32 %104, 12, !dbg !71 + %115 = shl i32 %105, 12, !dbg !71 + %116 = shl i32 %106, 12, !dbg !71 + %117 = shl i32 %107, 12, !dbg !71 + %118 = shl i32 %108, 12, !dbg !71 + %119 = shl i32 %109, 12, !dbg !71 + %120 = sext i32 %112 to i64, !dbg !74 + %121 = getelementptr bfloat, ptr addrspace(1) %96, i64 %120, !dbg !74 + %122 = sext i32 %113 to i64, !dbg !74 + %123 = getelementptr bfloat, ptr addrspace(1) %96, i64 %122, !dbg !74 + %124 = sext i32 %114 to i64, !dbg !74 + %125 = getelementptr bfloat, ptr addrspace(1) %96, i64 %124, !dbg !74 + %126 = sext i32 %115 to i64, !dbg !74 + %127 = getelementptr bfloat, ptr addrspace(1) %96, i64 %126, !dbg !74 + %128 = sext i32 %116 to i64, !dbg !74 + %129 = getelementptr bfloat, ptr addrspace(1) %96, i64 %128, !dbg !74 + %130 = sext i32 %117 to i64, !dbg !74 + %131 = getelementptr bfloat, ptr addrspace(1) %96, i64 %130, !dbg !74 + %132 = sext i32 %118 to i64, !dbg !74 + %133 = getelementptr bfloat, ptr addrspace(1) %96, i64 %132, !dbg !74 + %134 = sext i32 %119 to i64, !dbg !74 + %135 = getelementptr bfloat, ptr addrspace(1) %96, i64 %134, !dbg !74 + %136 = shl nuw nsw i32 %56, 3, !dbg !75 + %137 = and i32 %136, 120, !dbg !75 + %138 = zext nneg i32 %137 to i64, !dbg !76 + %139 = getelementptr bfloat, ptr addrspace(1) %121, i64 %138, !dbg !76 + %140 = getelementptr bfloat, ptr addrspace(1) %123, i64 %138, !dbg !76 + %141 = getelementptr bfloat, ptr addrspace(1) %125, i64 %138, !dbg !76 + %142 = getelementptr bfloat, ptr addrspace(1) %127, i64 %138, !dbg !76 + %143 = getelementptr bfloat, ptr addrspace(1) %129, i64 %138, !dbg !76 + %144 = getelementptr bfloat, ptr addrspace(1) %131, i64 %138, !dbg !76 + %145 = getelementptr bfloat, ptr addrspace(1) %133, i64 %138, !dbg !76 + %146 = getelementptr bfloat, ptr addrspace(1) %135, i64 %138, !dbg !76 + %147 = icmp slt i32 %102, %18, !dbg !77 + %148 = icmp slt i32 %103, %18, !dbg !77 + %149 = icmp slt i32 %104, %18, !dbg !77 + %150 = icmp slt i32 %105, %18, !dbg !77 + %151 = icmp slt i32 %106, %18, !dbg !77 + %152 = icmp slt i32 %107, %18, !dbg !77 + %153 = icmp slt i32 %108, %18, !dbg !77 + %154 = icmp slt i32 %109, %18, !dbg !77 + %155 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %139, i1 %147) #3, !dbg !78 + %156 = extractvalue { i32, i32, i32, i32 } %155, 0, !dbg !78 + %157 = extractvalue { i32, i32, i32, i32 } %155, 1, !dbg !78 + %158 = extractvalue { i32, i32, i32, i32 } %155, 2, !dbg !78 + %159 = extractvalue { i32, i32, i32, i32 } %155, 3, !dbg !78 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %140, i1 %148) #3, !dbg !78 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !78 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !78 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !78 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !78 + %165 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %141, i1 %149) #3, !dbg !78 + %166 = extractvalue { i32, i32, i32, i32 } %165, 0, !dbg !78 + %167 = extractvalue { i32, i32, i32, i32 } %165, 1, !dbg !78 + %168 = extractvalue { i32, i32, i32, i32 } %165, 2, !dbg !78 + %169 = extractvalue { i32, i32, i32, i32 } %165, 3, !dbg !78 + %170 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %142, i1 %150) #3, !dbg !78 + %171 = extractvalue { i32, i32, i32, i32 } %170, 0, !dbg !78 + %172 = extractvalue { i32, i32, i32, i32 } %170, 1, !dbg !78 + %173 = extractvalue { i32, i32, i32, i32 } %170, 2, !dbg !78 + %174 = extractvalue { i32, i32, i32, i32 } %170, 3, !dbg !78 + %175 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %143, i1 %151) #3, !dbg !78 + %176 = extractvalue { i32, i32, i32, i32 } %175, 0, !dbg !78 + %177 = extractvalue { i32, i32, i32, i32 } %175, 1, !dbg !78 + %178 = extractvalue { i32, i32, i32, i32 } %175, 2, !dbg !78 + %179 = extractvalue { i32, i32, i32, i32 } %175, 3, !dbg !78 + %180 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %144, i1 %152) #3, !dbg !78 + %181 = extractvalue { i32, i32, i32, i32 } %180, 0, !dbg !78 + %182 = extractvalue { i32, i32, i32, i32 } %180, 1, !dbg !78 + %183 = extractvalue { i32, i32, i32, i32 } %180, 2, !dbg !78 + %184 = extractvalue { i32, i32, i32, i32 } %180, 3, !dbg !78 + %185 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %145, i1 %153) #3, !dbg !78 + %186 = extractvalue { i32, i32, i32, i32 } %185, 0, !dbg !78 + %187 = extractvalue { i32, i32, i32, i32 } %185, 1, !dbg !78 + %188 = extractvalue { i32, i32, i32, i32 } %185, 2, !dbg !78 + %189 = extractvalue { i32, i32, i32, i32 } %185, 3, !dbg !78 + %190 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %146, i1 %154) #3, !dbg !78 + %191 = extractvalue { i32, i32, i32, i32 } %190, 0, !dbg !78 + %192 = extractvalue { i32, i32, i32, i32 } %190, 1, !dbg !78 + %193 = extractvalue { i32, i32, i32, i32 } %190, 2, !dbg !78 + %194 = extractvalue { i32, i32, i32, i32 } %190, 3, !dbg !78 + %195 = shl nuw nsw i32 %56, 4, !dbg !78 + %196 = and i32 %195, 112, !dbg !78 + %197 = shl nuw nsw i32 %58, 3, !dbg !78 + %198 = and i32 %56, 112, !dbg !78 + %199 = and i32 %56, 8, !dbg !78 + %200 = shl nuw nsw i32 %199, 11, !dbg !78 + %201 = or disjoint i32 %196, %197, !dbg !78 + %202 = xor i32 %201, %198, !dbg !78 + %203 = or disjoint i32 %202, %200, !dbg !78 + %204 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %203, !dbg !78 + %205 = insertelement <4 x i32> poison, i32 %156, i64 0, !dbg !78 + %206 = insertelement <4 x i32> %205, i32 %157, i64 1, !dbg !78 + %207 = insertelement <4 x i32> %206, i32 %158, i64 2, !dbg !78 + %208 = insertelement <4 x i32> %207, i32 %159, i64 3, !dbg !78 + store <4 x i32> %208, ptr addrspace(3) %204, align 16, !dbg !78 + %209 = or disjoint i32 %203, 2048, !dbg !78 + %210 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %209, !dbg !78 + %211 = insertelement <4 x i32> poison, i32 %161, i64 0, !dbg !78 + %212 = insertelement <4 x i32> %211, i32 %162, i64 1, !dbg !78 + %213 = insertelement <4 x i32> %212, i32 %163, i64 2, !dbg !78 + %214 = insertelement <4 x i32> %213, i32 %164, i64 3, !dbg !78 + store <4 x i32> %214, ptr addrspace(3) %210, align 16, !dbg !78 + %215 = or disjoint i32 %203, 4096, !dbg !78 + %216 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %215, !dbg !78 + %217 = insertelement <4 x i32> poison, i32 %166, i64 0, !dbg !78 + %218 = insertelement <4 x i32> %217, i32 %167, i64 1, !dbg !78 + %219 = insertelement <4 x i32> %218, i32 %168, i64 2, !dbg !78 + %220 = insertelement <4 x i32> %219, i32 %169, i64 3, !dbg !78 + store <4 x i32> %220, ptr addrspace(3) %216, align 16, !dbg !78 + %221 = or disjoint i32 %203, 6144, !dbg !78 + %222 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %221, !dbg !78 + %223 = insertelement <4 x i32> poison, i32 %171, i64 0, !dbg !78 + %224 = insertelement <4 x i32> %223, i32 %172, i64 1, !dbg !78 + %225 = insertelement <4 x i32> %224, i32 %173, i64 2, !dbg !78 + %226 = insertelement <4 x i32> %225, i32 %174, i64 3, !dbg !78 + store <4 x i32> %226, ptr addrspace(3) %222, align 16, !dbg !78 + %227 = or disjoint i32 %203, 8192, !dbg !78 + %228 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %227, !dbg !78 + %229 = insertelement <4 x i32> poison, i32 %176, i64 0, !dbg !78 + %230 = insertelement <4 x i32> %229, i32 %177, i64 1, !dbg !78 + %231 = insertelement <4 x i32> %230, i32 %178, i64 2, !dbg !78 + %232 = insertelement <4 x i32> %231, i32 %179, i64 3, !dbg !78 + store <4 x i32> %232, ptr addrspace(3) %228, align 16, !dbg !78 + %233 = or disjoint i32 %203, 10240, !dbg !78 + %234 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %233, !dbg !78 + %235 = insertelement <4 x i32> poison, i32 %181, i64 0, !dbg !78 + %236 = insertelement <4 x i32> %235, i32 %182, i64 1, !dbg !78 + %237 = insertelement <4 x i32> %236, i32 %183, i64 2, !dbg !78 + %238 = insertelement <4 x i32> %237, i32 %184, i64 3, !dbg !78 + store <4 x i32> %238, ptr addrspace(3) %234, align 16, !dbg !78 + %239 = or disjoint i32 %203, 12288, !dbg !78 + %240 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %239, !dbg !78 + %241 = insertelement <4 x i32> poison, i32 %186, i64 0, !dbg !78 + %242 = insertelement <4 x i32> %241, i32 %187, i64 1, !dbg !78 + %243 = insertelement <4 x i32> %242, i32 %188, i64 2, !dbg !78 + %244 = insertelement <4 x i32> %243, i32 %189, i64 3, !dbg !78 + store <4 x i32> %244, ptr addrspace(3) %240, align 16, !dbg !78 + %245 = or disjoint i32 %203, 14336, !dbg !78 + %246 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %245, !dbg !78 + %247 = insertelement <4 x i32> poison, i32 %191, i64 0, !dbg !78 + %248 = insertelement <4 x i32> %247, i32 %192, i64 1, !dbg !78 + %249 = insertelement <4 x i32> %248, i32 %193, i64 2, !dbg !78 + %250 = insertelement <4 x i32> %249, i32 %194, i64 3, !dbg !78 + store <4 x i32> %250, ptr addrspace(3) %246, align 16, !dbg !78 + %251 = shl i32 %102, 7, !dbg !79 + %252 = shl i32 %103, 7, !dbg !79 + %253 = shl i32 %104, 7, !dbg !79 + %254 = shl i32 %105, 7, !dbg !79 + %255 = shl i32 %106, 7, !dbg !79 + %256 = shl i32 %107, 7, !dbg !79 + %257 = shl i32 %108, 7, !dbg !79 + %258 = shl i32 %109, 7, !dbg !79 + %259 = sext i32 %251 to i64, !dbg !81 + %260 = getelementptr bfloat, ptr addrspace(1) %97, i64 %259, !dbg !81 + %261 = sext i32 %252 to i64, !dbg !81 + %262 = getelementptr bfloat, ptr addrspace(1) %97, i64 %261, !dbg !81 + %263 = sext i32 %253 to i64, !dbg !81 + %264 = getelementptr bfloat, ptr addrspace(1) %97, i64 %263, !dbg !81 + %265 = sext i32 %254 to i64, !dbg !81 + %266 = getelementptr bfloat, ptr addrspace(1) %97, i64 %265, !dbg !81 + %267 = sext i32 %255 to i64, !dbg !81 + %268 = getelementptr bfloat, ptr addrspace(1) %97, i64 %267, !dbg !81 + %269 = sext i32 %256 to i64, !dbg !81 + %270 = getelementptr bfloat, ptr addrspace(1) %97, i64 %269, !dbg !81 + %271 = sext i32 %257 to i64, !dbg !81 + %272 = getelementptr bfloat, ptr addrspace(1) %97, i64 %271, !dbg !81 + %273 = sext i32 %258 to i64, !dbg !81 + %274 = getelementptr bfloat, ptr addrspace(1) %97, i64 %273, !dbg !81 + %275 = getelementptr bfloat, ptr addrspace(1) %260, i64 %138, !dbg !82 + %276 = getelementptr bfloat, ptr addrspace(1) %262, i64 %138, !dbg !82 + %277 = getelementptr bfloat, ptr addrspace(1) %264, i64 %138, !dbg !82 + %278 = getelementptr bfloat, ptr addrspace(1) %266, i64 %138, !dbg !82 + %279 = getelementptr bfloat, ptr addrspace(1) %268, i64 %138, !dbg !82 + %280 = getelementptr bfloat, ptr addrspace(1) %270, i64 %138, !dbg !82 + %281 = getelementptr bfloat, ptr addrspace(1) %272, i64 %138, !dbg !82 + %282 = getelementptr bfloat, ptr addrspace(1) %274, i64 %138, !dbg !82 + %283 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %275, i1 %147) #3, !dbg !83 + %284 = extractvalue { i32, i32, i32, i32 } %283, 0, !dbg !83 + %285 = extractvalue { i32, i32, i32, i32 } %283, 1, !dbg !83 + %286 = extractvalue { i32, i32, i32, i32 } %283, 2, !dbg !83 + %287 = extractvalue { i32, i32, i32, i32 } %283, 3, !dbg !83 + %288 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %276, i1 %148) #3, !dbg !83 + %289 = extractvalue { i32, i32, i32, i32 } %288, 0, !dbg !83 + %290 = extractvalue { i32, i32, i32, i32 } %288, 1, !dbg !83 + %291 = extractvalue { i32, i32, i32, i32 } %288, 2, !dbg !83 + %292 = extractvalue { i32, i32, i32, i32 } %288, 3, !dbg !83 + %293 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %277, i1 %149) #3, !dbg !83 + %294 = extractvalue { i32, i32, i32, i32 } %293, 0, !dbg !83 + %295 = extractvalue { i32, i32, i32, i32 } %293, 1, !dbg !83 + %296 = extractvalue { i32, i32, i32, i32 } %293, 2, !dbg !83 + %297 = extractvalue { i32, i32, i32, i32 } %293, 3, !dbg !83 + %298 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %278, i1 %150) #3, !dbg !83 + %299 = extractvalue { i32, i32, i32, i32 } %298, 0, !dbg !83 + %300 = extractvalue { i32, i32, i32, i32 } %298, 1, !dbg !83 + %301 = extractvalue { i32, i32, i32, i32 } %298, 2, !dbg !83 + %302 = extractvalue { i32, i32, i32, i32 } %298, 3, !dbg !83 + %303 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %279, i1 %151) #3, !dbg !83 + %304 = extractvalue { i32, i32, i32, i32 } %303, 0, !dbg !83 + %305 = extractvalue { i32, i32, i32, i32 } %303, 1, !dbg !83 + %306 = extractvalue { i32, i32, i32, i32 } %303, 2, !dbg !83 + %307 = extractvalue { i32, i32, i32, i32 } %303, 3, !dbg !83 + %308 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %280, i1 %152) #3, !dbg !83 + %309 = extractvalue { i32, i32, i32, i32 } %308, 0, !dbg !83 + %310 = extractvalue { i32, i32, i32, i32 } %308, 1, !dbg !83 + %311 = extractvalue { i32, i32, i32, i32 } %308, 2, !dbg !83 + %312 = extractvalue { i32, i32, i32, i32 } %308, 3, !dbg !83 + %313 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %281, i1 %153) #3, !dbg !83 + %314 = extractvalue { i32, i32, i32, i32 } %313, 0, !dbg !83 + %315 = extractvalue { i32, i32, i32, i32 } %313, 1, !dbg !83 + %316 = extractvalue { i32, i32, i32, i32 } %313, 2, !dbg !83 + %317 = extractvalue { i32, i32, i32, i32 } %313, 3, !dbg !83 + %318 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %282, i1 %154) #3, !dbg !83 + %319 = extractvalue { i32, i32, i32, i32 } %318, 0, !dbg !83 + %320 = extractvalue { i32, i32, i32, i32 } %318, 1, !dbg !83 + %321 = extractvalue { i32, i32, i32, i32 } %318, 2, !dbg !83 + %322 = extractvalue { i32, i32, i32, i32 } %318, 3, !dbg !83 + %323 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %203, !dbg !83 + %324 = insertelement <4 x i32> poison, i32 %284, i64 0, !dbg !83 + %325 = insertelement <4 x i32> %324, i32 %285, i64 1, !dbg !83 + %326 = insertelement <4 x i32> %325, i32 %286, i64 2, !dbg !83 + %327 = insertelement <4 x i32> %326, i32 %287, i64 3, !dbg !83 + store <4 x i32> %327, ptr addrspace(3) %323, align 16, !dbg !83 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %209, !dbg !83 + %329 = insertelement <4 x i32> poison, i32 %289, i64 0, !dbg !83 + %330 = insertelement <4 x i32> %329, i32 %290, i64 1, !dbg !83 + %331 = insertelement <4 x i32> %330, i32 %291, i64 2, !dbg !83 + %332 = insertelement <4 x i32> %331, i32 %292, i64 3, !dbg !83 + store <4 x i32> %332, ptr addrspace(3) %328, align 16, !dbg !83 + %333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %215, !dbg !83 + %334 = insertelement <4 x i32> poison, i32 %294, i64 0, !dbg !83 + %335 = insertelement <4 x i32> %334, i32 %295, i64 1, !dbg !83 + %336 = insertelement <4 x i32> %335, i32 %296, i64 2, !dbg !83 + %337 = insertelement <4 x i32> %336, i32 %297, i64 3, !dbg !83 + store <4 x i32> %337, ptr addrspace(3) %333, align 16, !dbg !83 + %338 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %221, !dbg !83 + %339 = insertelement <4 x i32> poison, i32 %299, i64 0, !dbg !83 + %340 = insertelement <4 x i32> %339, i32 %300, i64 1, !dbg !83 + %341 = insertelement <4 x i32> %340, i32 %301, i64 2, !dbg !83 + %342 = insertelement <4 x i32> %341, i32 %302, i64 3, !dbg !83 + store <4 x i32> %342, ptr addrspace(3) %338, align 16, !dbg !83 + %343 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %227, !dbg !83 + %344 = insertelement <4 x i32> poison, i32 %304, i64 0, !dbg !83 + %345 = insertelement <4 x i32> %344, i32 %305, i64 1, !dbg !83 + %346 = insertelement <4 x i32> %345, i32 %306, i64 2, !dbg !83 + %347 = insertelement <4 x i32> %346, i32 %307, i64 3, !dbg !83 + store <4 x i32> %347, ptr addrspace(3) %343, align 16, !dbg !83 + %348 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %233, !dbg !83 + %349 = insertelement <4 x i32> poison, i32 %309, i64 0, !dbg !83 + %350 = insertelement <4 x i32> %349, i32 %310, i64 1, !dbg !83 + %351 = insertelement <4 x i32> %350, i32 %311, i64 2, !dbg !83 + %352 = insertelement <4 x i32> %351, i32 %312, i64 3, !dbg !83 + store <4 x i32> %352, ptr addrspace(3) %348, align 16, !dbg !83 + %353 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %239, !dbg !83 + %354 = insertelement <4 x i32> poison, i32 %314, i64 0, !dbg !83 + %355 = insertelement <4 x i32> %354, i32 %315, i64 1, !dbg !83 + %356 = insertelement <4 x i32> %355, i32 %316, i64 2, !dbg !83 + %357 = insertelement <4 x i32> %356, i32 %317, i64 3, !dbg !83 + store <4 x i32> %357, ptr addrspace(3) %353, align 16, !dbg !83 + %358 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %245, !dbg !83 + %359 = insertelement <4 x i32> poison, i32 %319, i64 0, !dbg !83 + %360 = insertelement <4 x i32> %359, i32 %320, i64 1, !dbg !83 + %361 = insertelement <4 x i32> %360, i32 %321, i64 2, !dbg !83 + %362 = insertelement <4 x i32> %361, i32 %322, i64 3, !dbg !83 + store <4 x i32> %362, ptr addrspace(3) %358, align 16, !dbg !83 + %363 = icmp slt i32 %110, %18, !dbg !84 + %364 = icmp slt i32 %111, %18, !dbg !84 + %365 = sext i32 %110 to i64, !dbg !85 + %366 = getelementptr float, ptr addrspace(1) %100, i64 %365, !dbg !85 + %367 = sext i32 %111 to i64, !dbg !85 + %368 = getelementptr float, ptr addrspace(1) %100, i64 %367, !dbg !85 + %369 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %366, i1 %363) #3, !dbg !86 + %370 = bitcast i32 %369 to float, !dbg !86 + %371 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %368, i1 %364) #3, !dbg !86 + %372 = bitcast i32 %371 to float, !dbg !86 + %373 = getelementptr float, ptr addrspace(1) %99, i64 %365, !dbg !87 + %374 = getelementptr float, ptr addrspace(1) %99, i64 %367, !dbg !87 + %375 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %373, i1 %363) #3, !dbg !88 + %376 = bitcast i32 %375 to float, !dbg !88 + %377 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %374, i1 %364) #3, !dbg !88 + %378 = bitcast i32 %377 to float, !dbg !88 + %379 = fcmp oeq float %376, 0xFFF0000000000000, !dbg !89 + %380 = fcmp oeq float %378, 0xFFF0000000000000, !dbg !89 + %381 = select i1 %379, float 0.000000e+00, float %376, !dbg !90 + %382 = select i1 %380, float 0.000000e+00, float %378, !dbg !90 + %383 = sext i32 %reass.mul to i64, !dbg !91 + %384 = getelementptr i32, ptr addrspace(1) %9, i64 %383, !dbg !91 + %385 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %384) #3, !dbg !92 + %386 = shl i32 %385, 7, !dbg !93 + %387 = sext i32 %82 to i64, !dbg !94 + %388 = getelementptr i32, ptr addrspace(1) %8, i64 %387, !dbg !94 + %389 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %388) #3, !dbg !95 + %390 = and i32 %56, 3, !dbg !96 + %391 = shl nuw nsw i32 %390, 1, !dbg !96 + %392 = or disjoint i32 %391, 1, !dbg !96 + %393 = insertelement <2 x i32> poison, i32 %391, i64 0, !dbg !96 + %394 = shufflevector <2 x i32> %393, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !96 + %395 = or disjoint <2 x i32> %394, , !dbg !96 + %396 = insertelement <4 x i32> poison, i32 %391, i64 0, !dbg !96 + %397 = shufflevector <4 x i32> %396, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !96 + %398 = or disjoint <4 x i32> %397, , !dbg !96 + %399 = insertelement <8 x i32> poison, i32 %391, i64 0, !dbg !96 + %400 = shufflevector <8 x i32> %399, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !96 + %401 = or disjoint <8 x i32> %400, , !dbg !96 + %402 = or disjoint i32 %386, %59, !dbg !97 + %403 = or disjoint i32 %386, %60, !dbg !97 + %404 = or disjoint i32 %386, %61, !dbg !97 + %405 = or disjoint i32 %386, %62, !dbg !97 + %406 = shl i32 %402, 7, !dbg !98 + %407 = shl i32 %403, 7, !dbg !98 + %408 = shl i32 %404, 7, !dbg !98 + %409 = shl i32 %405, 7, !dbg !98 + %410 = sext i32 %406 to i64, !dbg !100 + %411 = getelementptr bfloat, ptr addrspace(1) %53, i64 %410, !dbg !100 + %412 = sext i32 %407 to i64, !dbg !100 + %413 = getelementptr bfloat, ptr addrspace(1) %53, i64 %412, !dbg !100 + %414 = sext i32 %408 to i64, !dbg !100 + %415 = getelementptr bfloat, ptr addrspace(1) %53, i64 %414, !dbg !100 + %416 = sext i32 %409 to i64, !dbg !100 + %417 = getelementptr bfloat, ptr addrspace(1) %53, i64 %416, !dbg !100 + %418 = getelementptr bfloat, ptr addrspace(1) %411, i64 %138, !dbg !101 + %419 = getelementptr bfloat, ptr addrspace(1) %413, i64 %138, !dbg !101 + %420 = getelementptr bfloat, ptr addrspace(1) %415, i64 %138, !dbg !101 + %421 = getelementptr bfloat, ptr addrspace(1) %417, i64 %138, !dbg !101 + %422 = getelementptr bfloat, ptr addrspace(1) %54, i64 %410, !dbg !102 + %423 = getelementptr bfloat, ptr addrspace(1) %54, i64 %412, !dbg !102 + %424 = getelementptr bfloat, ptr addrspace(1) %54, i64 %414, !dbg !102 + %425 = getelementptr bfloat, ptr addrspace(1) %54, i64 %416, !dbg !102 + %426 = getelementptr bfloat, ptr addrspace(1) %422, i64 %138, !dbg !103 + %427 = getelementptr bfloat, ptr addrspace(1) %423, i64 %138, !dbg !103 + %428 = getelementptr bfloat, ptr addrspace(1) %424, i64 %138, !dbg !103 + %429 = getelementptr bfloat, ptr addrspace(1) %425, i64 %138, !dbg !103 + %430 = shl i32 %389, 1, !dbg !104 + %431 = add i32 %19, 63, !dbg !105 + %432 = sdiv i32 %431, 64, !dbg !106 + %433 = tail call i32 @llvm.smax.i32(i32 %432, i32 1), !dbg !107 + %434 = tail call i32 @llvm.smin.i32(i32 %430, i32 %433), !dbg !108 + %435 = zext nneg i32 %43 to i64, !dbg !109 + %436 = getelementptr i64, ptr addrspace(1) %16, i64 %435, !dbg !109 + %437 = icmp sgt i32 %430, 0, !dbg !110 + %438 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %436, i1 %437) #3, !dbg !111 + %439 = icmp slt i32 %402, %19, !dbg !112 + %440 = icmp slt i32 %403, %19, !dbg !112 + %441 = icmp slt i32 %404, %19, !dbg !112 + %442 = icmp slt i32 %405, %19, !dbg !112 + %443 = and i1 %437, %439, !dbg !110 + %444 = and i1 %437, %440, !dbg !110 + %445 = and i1 %437, %441, !dbg !110 + %446 = and i1 %437, %442, !dbg !110 + %447 = shl nuw nsw i32 %199, 10, !dbg !113 + %448 = or disjoint i32 %202, %447, !dbg !113 + %449 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %448, !dbg !113 + %450 = select i1 %443, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %449, ptr addrspace(1) %418, i32 %450) #3, !dbg !113 + %451 = or disjoint i32 %448, 2048, !dbg !113 + %452 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %451, !dbg !113 + %453 = select i1 %444, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %452, ptr addrspace(1) %419, i32 %453) #3, !dbg !113 + %454 = or disjoint i32 %448, 4096, !dbg !113 + %455 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %454, !dbg !113 + %456 = select i1 %445, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %455, ptr addrspace(1) %420, i32 %456) #3, !dbg !113 + %457 = or disjoint i32 %448, 6144, !dbg !113 + %458 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %457, !dbg !113 + %459 = select i1 %446, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %458, ptr addrspace(1) %421, i32 %459) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %460 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %460, ptr addrspace(1) %426, i32 %450) #3, !dbg !113 + %461 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %427, i32 %453) #3, !dbg !113 + %462 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %462, ptr addrspace(1) %428, i32 %456) #3, !dbg !113 + %463 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %429, i32 %459) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %464 = icmp sgt i32 %434, 1, !dbg !110 + %465 = getelementptr i8, ptr addrspace(1) %418, i64 16384, !dbg !114 + %466 = getelementptr i8, ptr addrspace(1) %419, i64 16384, !dbg !114 + %467 = getelementptr i8, ptr addrspace(1) %420, i64 16384, !dbg !114 + %468 = getelementptr i8, ptr addrspace(1) %421, i64 16384, !dbg !114 + %469 = getelementptr i8, ptr addrspace(1) %426, i64 16384, !dbg !115 + %470 = getelementptr i8, ptr addrspace(1) %427, i64 16384, !dbg !115 + %471 = getelementptr i8, ptr addrspace(1) %428, i64 16384, !dbg !115 + %472 = getelementptr i8, ptr addrspace(1) %429, i64 16384, !dbg !115 + %473 = or disjoint i32 %402, 64, !dbg !116 + %474 = or disjoint i32 %403, 64, !dbg !116 + %475 = or disjoint i32 %404, 64, !dbg !116 + %476 = or disjoint i32 %405, 64, !dbg !116 + %477 = icmp slt i32 %473, %19, !dbg !112 + %478 = icmp slt i32 %474, %19, !dbg !112 + %479 = icmp slt i32 %475, %19, !dbg !112 + %480 = icmp slt i32 %476, %19, !dbg !112 + %481 = and i1 %464, %477, !dbg !110 + %482 = and i1 %464, %478, !dbg !110 + %483 = and i1 %464, %479, !dbg !110 + %484 = and i1 %464, %480, !dbg !110 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %485 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %448, !dbg !113 + %486 = select i1 %481, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %485, ptr addrspace(1) %465, i32 %486) #3, !dbg !113 + %487 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %451, !dbg !113 + %488 = select i1 %482, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %487, ptr addrspace(1) %466, i32 %488) #3, !dbg !113 + %489 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %454, !dbg !113 + %490 = select i1 %483, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %489, ptr addrspace(1) %467, i32 %490) #3, !dbg !113 + %491 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %457, !dbg !113 + %492 = select i1 %484, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %491, ptr addrspace(1) %468, i32 %492) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %493 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %493, ptr addrspace(1) %469, i32 %486) #3, !dbg !113 + %494 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %494, ptr addrspace(1) %470, i32 %488) #3, !dbg !113 + %495 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %495, ptr addrspace(1) %471, i32 %490) #3, !dbg !113 + %496 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %496, ptr addrspace(1) %472, i32 %492) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !117 + br i1 %437, label %.lr.ph, label %._crit_edge, !dbg !110 + +.lr.ph: ; preds = %73 + %497 = srem i32 %111, %18, !dbg !118 + %498 = sext i32 %497 to i64, !dbg !119 + %499 = icmp sgt i64 %438, %498, !dbg !120 + %500 = srem i32 %110, %18, !dbg !118 + %501 = sext i32 %500 to i64, !dbg !119 + %502 = icmp sgt i64 %438, %501, !dbg !120 + %503 = insertelement <2 x i32> poison, i32 %386, i64 0, !dbg !97 + %504 = shufflevector <2 x i32> %503, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !97 + %505 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %506 = or disjoint <2 x i32> %504, %505, !dbg !97 + %507 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %508 = or disjoint <2 x i32> %504, %507, !dbg !97 + %509 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %510 = or disjoint <2 x i32> %504, %509, !dbg !97 + %511 = shufflevector <8 x i32> %401, <8 x i32> poison, <2 x i32> , !dbg !97 + %512 = or disjoint <2 x i32> %504, %511, !dbg !97 + %513 = shufflevector <4 x i32> %398, <4 x i32> poison, <2 x i32> , !dbg !97 + %514 = or disjoint <2 x i32> %504, %513, !dbg !97 + %515 = shufflevector <4 x i32> %398, <4 x i32> poison, <2 x i32> , !dbg !97 + %516 = or disjoint <2 x i32> %504, %515, !dbg !97 + %517 = or disjoint <2 x i32> %504, %395, !dbg !97 + %518 = insertelement <2 x i32> %393, i32 %392, i64 1, !dbg !97 + %519 = or disjoint <2 x i32> %504, %518, !dbg !97 + %520 = add nsw i32 %434, -2 + %521 = add nsw i32 %434, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %434, i32 1), !dbg !110 + %522 = insertelement <2 x i1> poison, i1 %499, i64 0, !dbg !121 + %523 = shufflevector <2 x i1> %522, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !121 + %524 = insertelement <2 x i32> poison, i32 %497, i64 0, !dbg !122 + %525 = shufflevector <2 x i32> %524, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !122 + %526 = insertelement <2 x i32> poison, i32 %26, i64 0, !dbg !123 + %527 = shufflevector <2 x i32> %526, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !123 + %528 = insertelement <2 x i64> poison, i64 %438, i64 0, !dbg !124 + %529 = shufflevector <2 x i64> %528, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !124 + %530 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !112 + %531 = shufflevector <2 x i32> %530, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !112 + %532 = insertelement <2 x float> poison, float %372, i64 0, !dbg !125 + %533 = shufflevector <2 x float> %532, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !125 + %534 = insertelement <2 x i1> poison, i1 %502, i64 0, !dbg !121 + %535 = shufflevector <2 x i1> %534, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !121 + %536 = insertelement <2 x i32> poison, i32 %500, i64 0, !dbg !122 + %537 = shufflevector <2 x i32> %536, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !122 + %538 = insertelement <2 x float> poison, float %370, i64 0, !dbg !125 + %539 = shufflevector <2 x float> %538, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !125 + br label %540, !dbg !110 + +540: ; preds = %.lr.ph, %__nv_exp2f.exit1527 + %541 = phi i32 [ 64, %.lr.ph ], [ %2525, %__nv_exp2f.exit1527 ] + %542 = phi i32 [ -1, %.lr.ph ], [ %621, %__nv_exp2f.exit1527 ] + %543 = phi i32 [ 1, %.lr.ph ], [ %2542, %__nv_exp2f.exit1527 ] + %.pn8981546 = phi ptr addrspace(1) [ %472, %.lr.ph ], [ %2535, %__nv_exp2f.exit1527 ] + %.pn9141545 = phi ptr addrspace(1) [ %471, %.lr.ph ], [ %2534, %__nv_exp2f.exit1527 ] + %.pn9301544 = phi ptr addrspace(1) [ %470, %.lr.ph ], [ %2533, %__nv_exp2f.exit1527 ] + %.pn9461543 = phi ptr addrspace(1) [ %469, %.lr.ph ], [ %2532, %__nv_exp2f.exit1527 ] + %.pn8761542 = phi i32 [ %476, %.lr.ph ], [ %2539, %__nv_exp2f.exit1527 ] + %.pn8781541 = phi i32 [ %475, %.lr.ph ], [ %2538, %__nv_exp2f.exit1527 ] + %.pn8801540 = phi i32 [ %474, %.lr.ph ], [ %2537, %__nv_exp2f.exit1527 ] + %.pn8821539 = phi i32 [ %473, %.lr.ph ], [ %2536, %__nv_exp2f.exit1527 ] + %.pn8261538 = phi ptr addrspace(1) [ %468, %.lr.ph ], [ %2531, %__nv_exp2f.exit1527 ] + %.pn8421537 = phi ptr addrspace(1) [ %467, %.lr.ph ], [ %2530, %__nv_exp2f.exit1527 ] + %.pn8581536 = phi ptr addrspace(1) [ %466, %.lr.ph ], [ %2529, %__nv_exp2f.exit1527 ] + %.pn8741535 = phi ptr addrspace(1) [ %465, %.lr.ph ], [ %2528, %__nv_exp2f.exit1527 ] + %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %2432, %__nv_exp2f.exit1527 ] + %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %2433, %__nv_exp2f.exit1527 ] + %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %2434, %__nv_exp2f.exit1527 ] + %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %2435, %__nv_exp2f.exit1527 ] + %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %2436, %__nv_exp2f.exit1527 ] + %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %2437, %__nv_exp2f.exit1527 ] + %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %2438, %__nv_exp2f.exit1527 ] + %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %2439, %__nv_exp2f.exit1527 ] + %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %2440, %__nv_exp2f.exit1527 ] + %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %2441, %__nv_exp2f.exit1527 ] + %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %2442, %__nv_exp2f.exit1527 ] + %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %2443, %__nv_exp2f.exit1527 ] + %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %2444, %__nv_exp2f.exit1527 ] + %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %2445, %__nv_exp2f.exit1527 ] + %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %2446, %__nv_exp2f.exit1527 ] + %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %2447, %__nv_exp2f.exit1527 ] + %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %2448, %__nv_exp2f.exit1527 ] + %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %2449, %__nv_exp2f.exit1527 ] + %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %2450, %__nv_exp2f.exit1527 ] + %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %2451, %__nv_exp2f.exit1527 ] + %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %2452, %__nv_exp2f.exit1527 ] + %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %2453, %__nv_exp2f.exit1527 ] + %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %2454, %__nv_exp2f.exit1527 ] + %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %2455, %__nv_exp2f.exit1527 ] + %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %2456, %__nv_exp2f.exit1527 ] + %569 = phi float [ 0.000000e+00, %.lr.ph ], [ %2457, %__nv_exp2f.exit1527 ] + %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %2458, %__nv_exp2f.exit1527 ] + %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %2459, %__nv_exp2f.exit1527 ] + %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %2460, %__nv_exp2f.exit1527 ] + %573 = phi float [ 0.000000e+00, %.lr.ph ], [ %2461, %__nv_exp2f.exit1527 ] + %574 = phi float [ 0.000000e+00, %.lr.ph ], [ %2462, %__nv_exp2f.exit1527 ] + %575 = phi float [ 0.000000e+00, %.lr.ph ], [ %2463, %__nv_exp2f.exit1527 ] + %576 = phi float [ 0.000000e+00, %.lr.ph ], [ %2464, %__nv_exp2f.exit1527 ] + %577 = phi float [ 0.000000e+00, %.lr.ph ], [ %2465, %__nv_exp2f.exit1527 ] + %578 = phi float [ 0.000000e+00, %.lr.ph ], [ %2466, %__nv_exp2f.exit1527 ] + %579 = phi float [ 0.000000e+00, %.lr.ph ], [ %2467, %__nv_exp2f.exit1527 ] + %580 = phi float [ 0.000000e+00, %.lr.ph ], [ %2468, %__nv_exp2f.exit1527 ] + %581 = phi float [ 0.000000e+00, %.lr.ph ], [ %2469, %__nv_exp2f.exit1527 ] + %582 = phi float [ 0.000000e+00, %.lr.ph ], [ %2470, %__nv_exp2f.exit1527 ] + %583 = phi float [ 0.000000e+00, %.lr.ph ], [ %2471, %__nv_exp2f.exit1527 ] + %584 = phi float [ 0.000000e+00, %.lr.ph ], [ %2472, %__nv_exp2f.exit1527 ] + %585 = phi float [ 0.000000e+00, %.lr.ph ], [ %2473, %__nv_exp2f.exit1527 ] + %586 = phi float [ 0.000000e+00, %.lr.ph ], [ %2474, %__nv_exp2f.exit1527 ] + %587 = phi float [ 0.000000e+00, %.lr.ph ], [ %2475, %__nv_exp2f.exit1527 ] + %588 = phi float [ 0.000000e+00, %.lr.ph ], [ %2476, %__nv_exp2f.exit1527 ] + %589 = phi float [ 0.000000e+00, %.lr.ph ], [ %2477, %__nv_exp2f.exit1527 ] + %590 = phi float [ 0.000000e+00, %.lr.ph ], [ %2478, %__nv_exp2f.exit1527 ] + %591 = phi float [ 0.000000e+00, %.lr.ph ], [ %2479, %__nv_exp2f.exit1527 ] + %592 = phi float [ 0.000000e+00, %.lr.ph ], [ %2480, %__nv_exp2f.exit1527 ] + %593 = phi float [ 0.000000e+00, %.lr.ph ], [ %2481, %__nv_exp2f.exit1527 ] + %594 = phi float [ 0.000000e+00, %.lr.ph ], [ %2482, %__nv_exp2f.exit1527 ] + %595 = phi float [ 0.000000e+00, %.lr.ph ], [ %2483, %__nv_exp2f.exit1527 ] + %596 = phi float [ 0.000000e+00, %.lr.ph ], [ %2484, %__nv_exp2f.exit1527 ] + %597 = phi float [ 0.000000e+00, %.lr.ph ], [ %2485, %__nv_exp2f.exit1527 ] + %598 = phi float [ 0.000000e+00, %.lr.ph ], [ %2486, %__nv_exp2f.exit1527 ] + %599 = phi float [ 0.000000e+00, %.lr.ph ], [ %2487, %__nv_exp2f.exit1527 ] + %600 = phi float [ 0.000000e+00, %.lr.ph ], [ %2488, %__nv_exp2f.exit1527 ] + %601 = phi float [ 0.000000e+00, %.lr.ph ], [ %2489, %__nv_exp2f.exit1527 ] + %602 = phi float [ 0.000000e+00, %.lr.ph ], [ %2490, %__nv_exp2f.exit1527 ] + %603 = phi float [ 0.000000e+00, %.lr.ph ], [ %2491, %__nv_exp2f.exit1527 ] + %604 = phi float [ 0.000000e+00, %.lr.ph ], [ %2492, %__nv_exp2f.exit1527 ] + %605 = phi float [ 0.000000e+00, %.lr.ph ], [ %2493, %__nv_exp2f.exit1527 ] + %606 = phi float [ 0.000000e+00, %.lr.ph ], [ %2494, %__nv_exp2f.exit1527 ] + %607 = phi float [ 0.000000e+00, %.lr.ph ], [ %2495, %__nv_exp2f.exit1527 ] + %608 = phi i32 [ 0, %.lr.ph ], [ %2506, %__nv_exp2f.exit1527 ] + %609 = phi <2 x i32> [ %506, %.lr.ph ], [ %2505, %__nv_exp2f.exit1527 ] + %610 = phi <2 x i32> [ %508, %.lr.ph ], [ %2504, %__nv_exp2f.exit1527 ] + %611 = phi <2 x i32> [ %510, %.lr.ph ], [ %2503, %__nv_exp2f.exit1527 ] + %612 = phi <2 x i32> [ %512, %.lr.ph ], [ %2502, %__nv_exp2f.exit1527 ] + %613 = phi <2 x i32> [ %514, %.lr.ph ], [ %2501, %__nv_exp2f.exit1527 ] + %614 = phi <2 x i32> [ %516, %.lr.ph ], [ %2500, %__nv_exp2f.exit1527 ] + %615 = phi <2 x i32> [ %517, %.lr.ph ], [ %2499, %__nv_exp2f.exit1527 ] + %616 = phi <2 x i32> [ %519, %.lr.ph ], [ %2498, %__nv_exp2f.exit1527 ] + %617 = icmp slt i32 %608, %520, !dbg !110 + %618 = icmp slt i32 %608, %521, !dbg !110 + %619 = add i32 %542, 1, !dbg !110 + %620 = icmp sgt i32 %619, 2, !dbg !110 + %621 = select i1 %620, i32 0, i32 %619, !dbg !110 + %622 = icmp slt <2 x i32> %616, %531, !dbg !112 + %623 = icmp slt <2 x i32> %615, %531, !dbg !112 + %624 = icmp slt <2 x i32> %614, %531, !dbg !112 + %625 = icmp slt <2 x i32> %613, %531, !dbg !112 + %626 = icmp slt <2 x i32> %612, %531, !dbg !112 + %627 = icmp slt <2 x i32> %611, %531, !dbg !112 + %628 = icmp slt <2 x i32> %610, %531, !dbg !112 + %629 = icmp slt <2 x i32> %609, %531, !dbg !112 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !113 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %630 = shl i32 %621, 13, !dbg !113 + %631 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %630, !dbg !113 + %632 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !117 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !117 + %633 = shl i32 %632, 11, !dbg !117 + %634 = and i32 %633, 8192, !dbg !117 + %635 = add i32 %634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %636 = lshr exact i32 %635, 4, !dbg !117 + %637 = and i32 %636, 16383, !dbg !117 + %638 = zext nneg i32 %637 to i64, !dbg !117 + %639 = or disjoint i64 %638, 4611686293372403712, !dbg !117 + %640 = ptrtoint ptr addrspace(3) %631 to i32, !dbg !117 + %641 = lshr exact i32 %640, 4, !dbg !117 + %642 = and i32 %641, 16383, !dbg !117 + %643 = zext nneg i32 %642 to i64, !dbg !117 + %644 = or disjoint i64 %643, 4611686293338849280, !dbg !117 + %645 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %639, i64 %644) #3, !dbg !117 + %646 = or disjoint i32 %634, 32, !dbg !117 + %647 = add i32 %646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %648 = lshr exact i32 %647, 4, !dbg !117 + %649 = and i32 %648, 16383, !dbg !117 + %650 = zext nneg i32 %649 to i64, !dbg !117 + %651 = or disjoint i64 %650, 4611686293372403712, !dbg !117 + %652 = add i32 %640, 32, !dbg !117 + %653 = lshr exact i32 %652, 4, !dbg !117 + %654 = and i32 %653, 16383, !dbg !117 + %655 = zext nneg i32 %654 to i64, !dbg !117 + %656 = or disjoint i64 %655, 4611686293338849280, !dbg !117 + %657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 0, !dbg !117 + %658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 1, !dbg !117 + %659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 2, !dbg !117 + %660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 3, !dbg !117 + %661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 4, !dbg !117 + %662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 5, !dbg !117 + %663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 6, !dbg !117 + %664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 7, !dbg !117 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 8, !dbg !117 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 9, !dbg !117 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 10, !dbg !117 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 11, !dbg !117 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 12, !dbg !117 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 13, !dbg !117 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 14, !dbg !117 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 15, !dbg !117 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 16, !dbg !117 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 17, !dbg !117 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 18, !dbg !117 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 19, !dbg !117 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 20, !dbg !117 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 21, !dbg !117 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 22, !dbg !117 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 23, !dbg !117 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 24, !dbg !117 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 25, !dbg !117 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 26, !dbg !117 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 27, !dbg !117 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 28, !dbg !117 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 29, !dbg !117 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 30, !dbg !117 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %645, 31, !dbg !117 + %689 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %657, float %658, float %659, float %660, float %661, float %662, float %663, float %664, float %665, float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, i64 %651, i64 %656, i1 true) #3, !dbg !117 + %690 = or disjoint i32 %634, 64, !dbg !117 + %691 = add i32 %690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %692 = lshr exact i32 %691, 4, !dbg !117 + %693 = and i32 %692, 16383, !dbg !117 + %694 = zext nneg i32 %693 to i64, !dbg !117 + %695 = or disjoint i64 %694, 4611686293372403712, !dbg !117 + %696 = add i32 %640, 64, !dbg !117 + %697 = lshr exact i32 %696, 4, !dbg !117 + %698 = and i32 %697, 16383, !dbg !117 + %699 = zext nneg i32 %698 to i64, !dbg !117 + %700 = or disjoint i64 %699, 4611686293338849280, !dbg !117 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 0, !dbg !117 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 1, !dbg !117 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 2, !dbg !117 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 3, !dbg !117 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 4, !dbg !117 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 5, !dbg !117 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 6, !dbg !117 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 7, !dbg !117 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 8, !dbg !117 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 9, !dbg !117 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 10, !dbg !117 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 11, !dbg !117 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 12, !dbg !117 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 13, !dbg !117 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 14, !dbg !117 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 15, !dbg !117 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 16, !dbg !117 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 17, !dbg !117 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 18, !dbg !117 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 19, !dbg !117 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 20, !dbg !117 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 21, !dbg !117 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 22, !dbg !117 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 23, !dbg !117 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 24, !dbg !117 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 25, !dbg !117 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 26, !dbg !117 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 27, !dbg !117 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 28, !dbg !117 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 29, !dbg !117 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 30, !dbg !117 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %689, 31, !dbg !117 + %733 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, i64 %695, i64 %700, i1 true) #3, !dbg !117 + %734 = or disjoint i32 %634, 96, !dbg !117 + %735 = add i32 %734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %736 = lshr exact i32 %735, 4, !dbg !117 + %737 = and i32 %736, 16383, !dbg !117 + %738 = zext nneg i32 %737 to i64, !dbg !117 + %739 = or disjoint i64 %738, 4611686293372403712, !dbg !117 + %740 = add i32 %640, 96, !dbg !117 + %741 = lshr exact i32 %740, 4, !dbg !117 + %742 = and i32 %741, 16383, !dbg !117 + %743 = zext nneg i32 %742 to i64, !dbg !117 + %744 = or disjoint i64 %743, 4611686293338849280, !dbg !117 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 0, !dbg !117 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 1, !dbg !117 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 2, !dbg !117 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 3, !dbg !117 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 4, !dbg !117 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 5, !dbg !117 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 6, !dbg !117 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 7, !dbg !117 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 8, !dbg !117 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 9, !dbg !117 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 10, !dbg !117 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 11, !dbg !117 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 12, !dbg !117 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 13, !dbg !117 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 14, !dbg !117 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 15, !dbg !117 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 16, !dbg !117 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 17, !dbg !117 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 18, !dbg !117 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 19, !dbg !117 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 20, !dbg !117 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 21, !dbg !117 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 22, !dbg !117 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 23, !dbg !117 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 24, !dbg !117 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 25, !dbg !117 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 26, !dbg !117 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 27, !dbg !117 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 28, !dbg !117 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 29, !dbg !117 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 30, !dbg !117 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %733, 31, !dbg !117 + %777 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, float %774, float %775, float %776, i64 %739, i64 %744, i1 true) #3, !dbg !117 + %778 = or disjoint i32 %634, 16384, !dbg !117 + %779 = add i32 %778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %780 = lshr exact i32 %779, 4, !dbg !117 + %781 = and i32 %780, 16383, !dbg !117 + %782 = zext nneg i32 %781 to i64, !dbg !117 + %783 = or disjoint i64 %782, 4611686293372403712, !dbg !117 + %784 = add i32 %640, 8192, !dbg !117 + %785 = lshr exact i32 %784, 4, !dbg !117 + %786 = and i32 %785, 16383, !dbg !117 + %787 = zext nneg i32 %786 to i64, !dbg !117 + %788 = or disjoint i64 %787, 4611686293338849280, !dbg !117 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 0, !dbg !117 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 1, !dbg !117 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 2, !dbg !117 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 3, !dbg !117 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 4, !dbg !117 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 5, !dbg !117 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 6, !dbg !117 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 7, !dbg !117 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 8, !dbg !117 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 9, !dbg !117 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 10, !dbg !117 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 11, !dbg !117 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 12, !dbg !117 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 13, !dbg !117 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 14, !dbg !117 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 15, !dbg !117 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 16, !dbg !117 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 17, !dbg !117 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 18, !dbg !117 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 19, !dbg !117 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 20, !dbg !117 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 21, !dbg !117 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 22, !dbg !117 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 23, !dbg !117 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 24, !dbg !117 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 25, !dbg !117 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 26, !dbg !117 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 27, !dbg !117 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 28, !dbg !117 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 29, !dbg !117 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 30, !dbg !117 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %777, 31, !dbg !117 + %821 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %789, float %790, float %791, float %792, float %793, float %794, float %795, float %796, float %797, float %798, float %799, float %800, float %801, float %802, float %803, float %804, float %805, float %806, float %807, float %808, float %809, float %810, float %811, float %812, float %813, float %814, float %815, float %816, float %817, float %818, float %819, float %820, i64 %783, i64 %788, i1 true) #3, !dbg !117 + %822 = or disjoint i32 %634, 16416, !dbg !117 + %823 = add i32 %822, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %824 = lshr exact i32 %823, 4, !dbg !117 + %825 = and i32 %824, 16383, !dbg !117 + %826 = zext nneg i32 %825 to i64, !dbg !117 + %827 = or disjoint i64 %826, 4611686293372403712, !dbg !117 + %828 = add i32 %640, 8224, !dbg !117 + %829 = lshr exact i32 %828, 4, !dbg !117 + %830 = and i32 %829, 16383, !dbg !117 + %831 = zext nneg i32 %830 to i64, !dbg !117 + %832 = or disjoint i64 %831, 4611686293338849280, !dbg !117 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 0, !dbg !117 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 1, !dbg !117 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 2, !dbg !117 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 3, !dbg !117 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 4, !dbg !117 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 5, !dbg !117 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 6, !dbg !117 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 7, !dbg !117 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 8, !dbg !117 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 9, !dbg !117 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 10, !dbg !117 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 11, !dbg !117 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 12, !dbg !117 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 13, !dbg !117 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 14, !dbg !117 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 15, !dbg !117 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 16, !dbg !117 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 17, !dbg !117 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 18, !dbg !117 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 19, !dbg !117 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 20, !dbg !117 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 21, !dbg !117 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 22, !dbg !117 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 23, !dbg !117 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 24, !dbg !117 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 25, !dbg !117 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 26, !dbg !117 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 27, !dbg !117 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 28, !dbg !117 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 29, !dbg !117 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 30, !dbg !117 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %821, 31, !dbg !117 + %865 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %833, float %834, float %835, float %836, float %837, float %838, float %839, float %840, float %841, float %842, float %843, float %844, float %845, float %846, float %847, float %848, float %849, float %850, float %851, float %852, float %853, float %854, float %855, float %856, float %857, float %858, float %859, float %860, float %861, float %862, float %863, float %864, i64 %827, i64 %832, i1 true) #3, !dbg !117 + %866 = or disjoint i32 %634, 16448, !dbg !117 + %867 = add i32 %866, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %868 = lshr exact i32 %867, 4, !dbg !117 + %869 = and i32 %868, 16383, !dbg !117 + %870 = zext nneg i32 %869 to i64, !dbg !117 + %871 = or disjoint i64 %870, 4611686293372403712, !dbg !117 + %872 = add i32 %640, 8256, !dbg !117 + %873 = lshr exact i32 %872, 4, !dbg !117 + %874 = and i32 %873, 16383, !dbg !117 + %875 = zext nneg i32 %874 to i64, !dbg !117 + %876 = or disjoint i64 %875, 4611686293338849280, !dbg !117 + %877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 0, !dbg !117 + %878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 1, !dbg !117 + %879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 2, !dbg !117 + %880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 3, !dbg !117 + %881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 4, !dbg !117 + %882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 5, !dbg !117 + %883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 6, !dbg !117 + %884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 7, !dbg !117 + %885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 8, !dbg !117 + %886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 9, !dbg !117 + %887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 10, !dbg !117 + %888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 11, !dbg !117 + %889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 12, !dbg !117 + %890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 13, !dbg !117 + %891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 14, !dbg !117 + %892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 15, !dbg !117 + %893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 16, !dbg !117 + %894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 17, !dbg !117 + %895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 18, !dbg !117 + %896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 19, !dbg !117 + %897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 20, !dbg !117 + %898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 21, !dbg !117 + %899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 22, !dbg !117 + %900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 23, !dbg !117 + %901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 24, !dbg !117 + %902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 25, !dbg !117 + %903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 26, !dbg !117 + %904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 27, !dbg !117 + %905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 28, !dbg !117 + %906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 29, !dbg !117 + %907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 30, !dbg !117 + %908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %865, 31, !dbg !117 + %909 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %877, float %878, float %879, float %880, float %881, float %882, float %883, float %884, float %885, float %886, float %887, float %888, float %889, float %890, float %891, float %892, float %893, float %894, float %895, float %896, float %897, float %898, float %899, float %900, float %901, float %902, float %903, float %904, float %905, float %906, float %907, float %908, i64 %871, i64 %876, i1 true) #3, !dbg !117 + %910 = or disjoint i32 %634, 16480, !dbg !117 + %911 = add i32 %910, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !117 + %912 = lshr exact i32 %911, 4, !dbg !117 + %913 = and i32 %912, 16383, !dbg !117 + %914 = zext nneg i32 %913 to i64, !dbg !117 + %915 = or disjoint i64 %914, 4611686293372403712, !dbg !117 + %916 = add i32 %640, 8288, !dbg !117 + %917 = lshr exact i32 %916, 4, !dbg !117 + %918 = and i32 %917, 16383, !dbg !117 + %919 = zext nneg i32 %918 to i64, !dbg !117 + %920 = or disjoint i64 %919, 4611686293338849280, !dbg !117 + %921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 0, !dbg !117 + %922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 1, !dbg !117 + %923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 2, !dbg !117 + %924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 3, !dbg !117 + %925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 4, !dbg !117 + %926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 5, !dbg !117 + %927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 6, !dbg !117 + %928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 7, !dbg !117 + %929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 8, !dbg !117 + %930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 9, !dbg !117 + %931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 10, !dbg !117 + %932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 11, !dbg !117 + %933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 12, !dbg !117 + %934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 13, !dbg !117 + %935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 14, !dbg !117 + %936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 15, !dbg !117 + %937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 16, !dbg !117 + %938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 17, !dbg !117 + %939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 18, !dbg !117 + %940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 19, !dbg !117 + %941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 20, !dbg !117 + %942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 21, !dbg !117 + %943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 22, !dbg !117 + %944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 23, !dbg !117 + %945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 24, !dbg !117 + %946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 25, !dbg !117 + %947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 26, !dbg !117 + %948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 27, !dbg !117 + %949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 28, !dbg !117 + %950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 29, !dbg !117 + %951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 30, !dbg !117 + %952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %909, 31, !dbg !117 + %953 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %921, float %922, float %923, float %924, float %925, float %926, float %927, float %928, float %929, float %930, float %931, float %932, float %933, float %934, float %935, float %936, float %937, float %938, float %939, float %940, float %941, float %942, float %943, float %944, float %945, float %946, float %947, float %948, float %949, float %950, float %951, float %952, i64 %915, i64 %920, i1 true) #3, !dbg !117 + %954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 0, !dbg !117 + %955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 1, !dbg !117 + %956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 2, !dbg !117 + %957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 3, !dbg !117 + %958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 4, !dbg !117 + %959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 5, !dbg !117 + %960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 6, !dbg !117 + %961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 7, !dbg !117 + %962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 8, !dbg !117 + %963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 9, !dbg !117 + %964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 10, !dbg !117 + %965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 11, !dbg !117 + %966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 12, !dbg !117 + %967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 13, !dbg !117 + %968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 14, !dbg !117 + %969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 15, !dbg !117 + %970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 16, !dbg !117 + %971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 17, !dbg !117 + %972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 18, !dbg !117 + %973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 19, !dbg !117 + %974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 20, !dbg !117 + %975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 21, !dbg !117 + %976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 22, !dbg !117 + %977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 23, !dbg !117 + %978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 24, !dbg !117 + %979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 25, !dbg !117 + %980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 26, !dbg !117 + %981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 27, !dbg !117 + %982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 28, !dbg !117 + %983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 29, !dbg !117 + %984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 30, !dbg !117 + %985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %953, 31, !dbg !117 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !117 + %986 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %954, float %955, float %956, float %957, float %958, float %959, float %960, float %961, float %962, float %963, float %964, float %965, float %966, float %967, float %968, float %969, float %970, float %971, float %972, float %973, float %974, float %975, float %976, float %977, float %978, float %979, float %980, float %981, float %982, float %983, float %984, float %985, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %631, i32 0, i32 0) #3, !dbg !117 + %987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 0, !dbg !117 + %988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 1, !dbg !117 + %989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 2, !dbg !117 + %990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 3, !dbg !117 + %991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 4, !dbg !117 + %992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 5, !dbg !117 + %993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 6, !dbg !117 + %994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 7, !dbg !117 + %995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 8, !dbg !117 + %996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 9, !dbg !117 + %997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 10, !dbg !117 + %998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 11, !dbg !117 + %999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 12, !dbg !117 + %1000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 13, !dbg !117 + %1001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 14, !dbg !117 + %1002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 15, !dbg !117 + %1003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 16, !dbg !117 + %1004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 17, !dbg !117 + %1005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 18, !dbg !117 + %1006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 19, !dbg !117 + %1007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 20, !dbg !117 + %1008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 21, !dbg !117 + %1009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 22, !dbg !117 + %1010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 23, !dbg !117 + %1011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 24, !dbg !117 + %1012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 25, !dbg !117 + %1013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 26, !dbg !117 + %1014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 27, !dbg !117 + %1015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 28, !dbg !117 + %1016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 29, !dbg !117 + %1017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 30, !dbg !117 + %1018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %986, 31, !dbg !117 + %1019 = fmul float %987, 0x3FB6A09E60000000, !dbg !126 + %1020 = fmul float %988, 0x3FB6A09E60000000, !dbg !126 + %1021 = fmul float %989, 0x3FB6A09E60000000, !dbg !126 + %1022 = fmul float %990, 0x3FB6A09E60000000, !dbg !126 + %1023 = fmul float %991, 0x3FB6A09E60000000, !dbg !126 + %1024 = fmul float %992, 0x3FB6A09E60000000, !dbg !126 + %1025 = fmul float %993, 0x3FB6A09E60000000, !dbg !126 + %1026 = fmul float %994, 0x3FB6A09E60000000, !dbg !126 + %1027 = fmul float %995, 0x3FB6A09E60000000, !dbg !126 + %1028 = fmul float %996, 0x3FB6A09E60000000, !dbg !126 + %1029 = fmul float %997, 0x3FB6A09E60000000, !dbg !126 + %1030 = fmul float %998, 0x3FB6A09E60000000, !dbg !126 + %1031 = fmul float %999, 0x3FB6A09E60000000, !dbg !126 + %1032 = fmul float %1000, 0x3FB6A09E60000000, !dbg !126 + %1033 = fmul float %1001, 0x3FB6A09E60000000, !dbg !126 + %1034 = fmul float %1002, 0x3FB6A09E60000000, !dbg !126 + %1035 = fmul float %1003, 0x3FB6A09E60000000, !dbg !126 + %1036 = fmul float %1004, 0x3FB6A09E60000000, !dbg !126 + %1037 = fmul float %1005, 0x3FB6A09E60000000, !dbg !126 + %1038 = fmul float %1006, 0x3FB6A09E60000000, !dbg !126 + %1039 = fmul float %1007, 0x3FB6A09E60000000, !dbg !126 + %1040 = fmul float %1008, 0x3FB6A09E60000000, !dbg !126 + %1041 = fmul float %1009, 0x3FB6A09E60000000, !dbg !126 + %1042 = fmul float %1010, 0x3FB6A09E60000000, !dbg !126 + %1043 = fmul float %1011, 0x3FB6A09E60000000, !dbg !126 + %1044 = fmul float %1012, 0x3FB6A09E60000000, !dbg !126 + %1045 = fmul float %1013, 0x3FB6A09E60000000, !dbg !126 + %1046 = fmul float %1014, 0x3FB6A09E60000000, !dbg !126 + %1047 = fmul float %1015, 0x3FB6A09E60000000, !dbg !126 + %1048 = fmul float %1016, 0x3FB6A09E60000000, !dbg !126 + %1049 = fmul float %1017, 0x3FB6A09E60000000, !dbg !126 + %1050 = fmul float %1018, 0x3FB6A09E60000000, !dbg !126 + %1051 = srem <2 x i32> %616, %531, !dbg !118 + %1052 = icmp sge <2 x i32> %525, %1051, !dbg !122 + %1053 = and <2 x i1> %523, %1052, !dbg !121 + %1054 = icmp sge <2 x i32> %1051, %527, !dbg !123 + %1055 = srem <2 x i32> %1051, %527, !dbg !127 + %1056 = icmp ne <2 x i32> %1055, zeroinitializer, !dbg !128 + %1057 = extractelement <2 x i32> %1055, i64 0, !dbg !129 + %1058 = xor i32 %1057, %26, !dbg !129 + %1059 = extractelement <2 x i32> %1055, i64 1, !dbg !129 + %1060 = xor i32 %1059, %26, !dbg !129 + %1061 = insertelement <2 x i32> poison, i32 %1058, i64 0, !dbg !129 + %1062 = insertelement <2 x i32> %1061, i32 %1060, i64 1, !dbg !129 + %1063 = icmp slt <2 x i32> %1062, zeroinitializer, !dbg !129 + %1064 = and <2 x i1> %1056, %1063, !dbg !130 + %1065 = select <2 x i1> %1064, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1066 = add <2 x i32> %1065, %1055, !dbg !131 + %1067 = sext <2 x i32> %1066 to <2 x i64>, !dbg !124 + %1068 = icmp sgt <2 x i64> %529, %1067, !dbg !124 + %1069 = and <2 x i1> %1054, %1068, !dbg !132 + %1070 = sub <2 x i32> %1051, %525, !dbg !133 + %1071 = srem <2 x i32> %1070, %527, !dbg !134 + %1072 = icmp ne <2 x i32> %1071, zeroinitializer, !dbg !135 + %1073 = xor <2 x i32> %1071, %527, !dbg !136 + %1074 = icmp slt <2 x i32> %1073, zeroinitializer, !dbg !136 + %1075 = and <2 x i1> %1072, %1074, !dbg !137 + %1076 = select <2 x i1> %1075, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1077 = sub <2 x i32> zeroinitializer, %1076, !dbg !139 + %1078 = icmp eq <2 x i32> %1071, %1077, !dbg !139 + %1079 = and <2 x i1> %1069, %1078, !dbg !140 + %1080 = or <2 x i1> %1053, %1079, !dbg !141 + %1081 = icmp sge <2 x i32> %537, %1051, !dbg !122 + %1082 = and <2 x i1> %535, %1081, !dbg !121 + %1083 = sub <2 x i32> %1051, %537, !dbg !133 + %1084 = srem <2 x i32> %1083, %527, !dbg !134 + %1085 = icmp ne <2 x i32> %1084, zeroinitializer, !dbg !135 + %1086 = xor <2 x i32> %1084, %527, !dbg !136 + %1087 = icmp slt <2 x i32> %1086, zeroinitializer, !dbg !136 + %1088 = and <2 x i1> %1085, %1087, !dbg !137 + %1089 = select <2 x i1> %1088, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1090 = sub <2 x i32> zeroinitializer, %1089, !dbg !139 + %1091 = icmp eq <2 x i32> %1084, %1090, !dbg !139 + %1092 = and <2 x i1> %1069, %1091, !dbg !140 + %1093 = or <2 x i1> %1082, %1092, !dbg !141 + %1094 = select <2 x i1> %1093, <2 x i1> %622, <2 x i1> zeroinitializer, !dbg !142 + %1095 = select <2 x i1> %1080, <2 x i1> %622, <2 x i1> zeroinitializer, !dbg !142 + %1096 = srem <2 x i32> %615, %531, !dbg !118 + %1097 = icmp sge <2 x i32> %525, %1096, !dbg !122 + %1098 = and <2 x i1> %523, %1097, !dbg !121 + %1099 = icmp sge <2 x i32> %1096, %527, !dbg !123 + %1100 = srem <2 x i32> %1096, %527, !dbg !127 + %1101 = icmp ne <2 x i32> %1100, zeroinitializer, !dbg !128 + %1102 = extractelement <2 x i32> %1100, i64 0, !dbg !129 + %1103 = xor i32 %1102, %26, !dbg !129 + %1104 = extractelement <2 x i32> %1100, i64 1, !dbg !129 + %1105 = xor i32 %1104, %26, !dbg !129 + %1106 = insertelement <2 x i32> poison, i32 %1103, i64 0, !dbg !129 + %1107 = insertelement <2 x i32> %1106, i32 %1105, i64 1, !dbg !129 + %1108 = icmp slt <2 x i32> %1107, zeroinitializer, !dbg !129 + %1109 = and <2 x i1> %1101, %1108, !dbg !130 + %1110 = select <2 x i1> %1109, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1111 = add <2 x i32> %1110, %1100, !dbg !131 + %1112 = sext <2 x i32> %1111 to <2 x i64>, !dbg !124 + %1113 = icmp sgt <2 x i64> %529, %1112, !dbg !124 + %1114 = and <2 x i1> %1099, %1113, !dbg !132 + %1115 = sub <2 x i32> %1096, %525, !dbg !133 + %1116 = srem <2 x i32> %1115, %527, !dbg !134 + %1117 = icmp ne <2 x i32> %1116, zeroinitializer, !dbg !135 + %1118 = xor <2 x i32> %1116, %527, !dbg !136 + %1119 = icmp slt <2 x i32> %1118, zeroinitializer, !dbg !136 + %1120 = and <2 x i1> %1117, %1119, !dbg !137 + %1121 = select <2 x i1> %1120, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1122 = sub <2 x i32> zeroinitializer, %1121, !dbg !139 + %1123 = icmp eq <2 x i32> %1116, %1122, !dbg !139 + %1124 = and <2 x i1> %1114, %1123, !dbg !140 + %1125 = or <2 x i1> %1098, %1124, !dbg !141 + %1126 = icmp sge <2 x i32> %537, %1096, !dbg !122 + %1127 = and <2 x i1> %535, %1126, !dbg !121 + %1128 = sub <2 x i32> %1096, %537, !dbg !133 + %1129 = srem <2 x i32> %1128, %527, !dbg !134 + %1130 = icmp ne <2 x i32> %1129, zeroinitializer, !dbg !135 + %1131 = xor <2 x i32> %1129, %527, !dbg !136 + %1132 = icmp slt <2 x i32> %1131, zeroinitializer, !dbg !136 + %1133 = and <2 x i1> %1130, %1132, !dbg !137 + %1134 = select <2 x i1> %1133, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1135 = sub <2 x i32> zeroinitializer, %1134, !dbg !139 + %1136 = icmp eq <2 x i32> %1129, %1135, !dbg !139 + %1137 = and <2 x i1> %1114, %1136, !dbg !140 + %1138 = or <2 x i1> %1127, %1137, !dbg !141 + %1139 = select <2 x i1> %1138, <2 x i1> %623, <2 x i1> zeroinitializer, !dbg !142 + %1140 = select <2 x i1> %1125, <2 x i1> %623, <2 x i1> zeroinitializer, !dbg !142 + %1141 = srem <2 x i32> %614, %531, !dbg !118 + %1142 = icmp sge <2 x i32> %525, %1141, !dbg !122 + %1143 = and <2 x i1> %523, %1142, !dbg !121 + %1144 = icmp sge <2 x i32> %1141, %527, !dbg !123 + %1145 = srem <2 x i32> %1141, %527, !dbg !127 + %1146 = icmp ne <2 x i32> %1145, zeroinitializer, !dbg !128 + %1147 = extractelement <2 x i32> %1145, i64 0, !dbg !129 + %1148 = xor i32 %1147, %26, !dbg !129 + %1149 = extractelement <2 x i32> %1145, i64 1, !dbg !129 + %1150 = xor i32 %1149, %26, !dbg !129 + %1151 = insertelement <2 x i32> poison, i32 %1148, i64 0, !dbg !129 + %1152 = insertelement <2 x i32> %1151, i32 %1150, i64 1, !dbg !129 + %1153 = icmp slt <2 x i32> %1152, zeroinitializer, !dbg !129 + %1154 = and <2 x i1> %1146, %1153, !dbg !130 + %1155 = select <2 x i1> %1154, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1156 = add <2 x i32> %1155, %1145, !dbg !131 + %1157 = sext <2 x i32> %1156 to <2 x i64>, !dbg !124 + %1158 = icmp sgt <2 x i64> %529, %1157, !dbg !124 + %1159 = and <2 x i1> %1144, %1158, !dbg !132 + %1160 = sub <2 x i32> %1141, %525, !dbg !133 + %1161 = srem <2 x i32> %1160, %527, !dbg !134 + %1162 = icmp ne <2 x i32> %1161, zeroinitializer, !dbg !135 + %1163 = xor <2 x i32> %1161, %527, !dbg !136 + %1164 = icmp slt <2 x i32> %1163, zeroinitializer, !dbg !136 + %1165 = and <2 x i1> %1162, %1164, !dbg !137 + %1166 = select <2 x i1> %1165, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1167 = sub <2 x i32> zeroinitializer, %1166, !dbg !139 + %1168 = icmp eq <2 x i32> %1161, %1167, !dbg !139 + %1169 = and <2 x i1> %1159, %1168, !dbg !140 + %1170 = or <2 x i1> %1143, %1169, !dbg !141 + %1171 = icmp sge <2 x i32> %537, %1141, !dbg !122 + %1172 = and <2 x i1> %535, %1171, !dbg !121 + %1173 = sub <2 x i32> %1141, %537, !dbg !133 + %1174 = srem <2 x i32> %1173, %527, !dbg !134 + %1175 = icmp ne <2 x i32> %1174, zeroinitializer, !dbg !135 + %1176 = xor <2 x i32> %1174, %527, !dbg !136 + %1177 = icmp slt <2 x i32> %1176, zeroinitializer, !dbg !136 + %1178 = and <2 x i1> %1175, %1177, !dbg !137 + %1179 = select <2 x i1> %1178, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1180 = sub <2 x i32> zeroinitializer, %1179, !dbg !139 + %1181 = icmp eq <2 x i32> %1174, %1180, !dbg !139 + %1182 = and <2 x i1> %1159, %1181, !dbg !140 + %1183 = or <2 x i1> %1172, %1182, !dbg !141 + %1184 = select <2 x i1> %1183, <2 x i1> %624, <2 x i1> zeroinitializer, !dbg !142 + %1185 = select <2 x i1> %1170, <2 x i1> %624, <2 x i1> zeroinitializer, !dbg !142 + %1186 = srem <2 x i32> %613, %531, !dbg !118 + %1187 = icmp sge <2 x i32> %525, %1186, !dbg !122 + %1188 = and <2 x i1> %523, %1187, !dbg !121 + %1189 = icmp sge <2 x i32> %1186, %527, !dbg !123 + %1190 = srem <2 x i32> %1186, %527, !dbg !127 + %1191 = icmp ne <2 x i32> %1190, zeroinitializer, !dbg !128 + %1192 = extractelement <2 x i32> %1190, i64 0, !dbg !129 + %1193 = xor i32 %1192, %26, !dbg !129 + %1194 = extractelement <2 x i32> %1190, i64 1, !dbg !129 + %1195 = xor i32 %1194, %26, !dbg !129 + %1196 = insertelement <2 x i32> poison, i32 %1193, i64 0, !dbg !129 + %1197 = insertelement <2 x i32> %1196, i32 %1195, i64 1, !dbg !129 + %1198 = icmp slt <2 x i32> %1197, zeroinitializer, !dbg !129 + %1199 = and <2 x i1> %1191, %1198, !dbg !130 + %1200 = select <2 x i1> %1199, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1201 = add <2 x i32> %1200, %1190, !dbg !131 + %1202 = sext <2 x i32> %1201 to <2 x i64>, !dbg !124 + %1203 = icmp sgt <2 x i64> %529, %1202, !dbg !124 + %1204 = and <2 x i1> %1189, %1203, !dbg !132 + %1205 = sub <2 x i32> %1186, %525, !dbg !133 + %1206 = srem <2 x i32> %1205, %527, !dbg !134 + %1207 = icmp ne <2 x i32> %1206, zeroinitializer, !dbg !135 + %1208 = xor <2 x i32> %1206, %527, !dbg !136 + %1209 = icmp slt <2 x i32> %1208, zeroinitializer, !dbg !136 + %1210 = and <2 x i1> %1207, %1209, !dbg !137 + %1211 = select <2 x i1> %1210, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1212 = sub <2 x i32> zeroinitializer, %1211, !dbg !139 + %1213 = icmp eq <2 x i32> %1206, %1212, !dbg !139 + %1214 = and <2 x i1> %1204, %1213, !dbg !140 + %1215 = or <2 x i1> %1188, %1214, !dbg !141 + %1216 = icmp sge <2 x i32> %537, %1186, !dbg !122 + %1217 = and <2 x i1> %535, %1216, !dbg !121 + %1218 = sub <2 x i32> %1186, %537, !dbg !133 + %1219 = srem <2 x i32> %1218, %527, !dbg !134 + %1220 = icmp ne <2 x i32> %1219, zeroinitializer, !dbg !135 + %1221 = xor <2 x i32> %1219, %527, !dbg !136 + %1222 = icmp slt <2 x i32> %1221, zeroinitializer, !dbg !136 + %1223 = and <2 x i1> %1220, %1222, !dbg !137 + %1224 = select <2 x i1> %1223, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1225 = sub <2 x i32> zeroinitializer, %1224, !dbg !139 + %1226 = icmp eq <2 x i32> %1219, %1225, !dbg !139 + %1227 = and <2 x i1> %1204, %1226, !dbg !140 + %1228 = or <2 x i1> %1217, %1227, !dbg !141 + %1229 = select <2 x i1> %1228, <2 x i1> %625, <2 x i1> zeroinitializer, !dbg !142 + %1230 = select <2 x i1> %1215, <2 x i1> %625, <2 x i1> zeroinitializer, !dbg !142 + %1231 = srem <2 x i32> %612, %531, !dbg !118 + %1232 = icmp sge <2 x i32> %525, %1231, !dbg !122 + %1233 = and <2 x i1> %523, %1232, !dbg !121 + %1234 = icmp sge <2 x i32> %1231, %527, !dbg !123 + %1235 = srem <2 x i32> %1231, %527, !dbg !127 + %1236 = icmp ne <2 x i32> %1235, zeroinitializer, !dbg !128 + %1237 = extractelement <2 x i32> %1235, i64 0, !dbg !129 + %1238 = xor i32 %1237, %26, !dbg !129 + %1239 = extractelement <2 x i32> %1235, i64 1, !dbg !129 + %1240 = xor i32 %1239, %26, !dbg !129 + %1241 = insertelement <2 x i32> poison, i32 %1238, i64 0, !dbg !129 + %1242 = insertelement <2 x i32> %1241, i32 %1240, i64 1, !dbg !129 + %1243 = icmp slt <2 x i32> %1242, zeroinitializer, !dbg !129 + %1244 = and <2 x i1> %1236, %1243, !dbg !130 + %1245 = select <2 x i1> %1244, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1246 = add <2 x i32> %1245, %1235, !dbg !131 + %1247 = sext <2 x i32> %1246 to <2 x i64>, !dbg !124 + %1248 = icmp sgt <2 x i64> %529, %1247, !dbg !124 + %1249 = and <2 x i1> %1234, %1248, !dbg !132 + %1250 = sub <2 x i32> %1231, %525, !dbg !133 + %1251 = srem <2 x i32> %1250, %527, !dbg !134 + %1252 = icmp ne <2 x i32> %1251, zeroinitializer, !dbg !135 + %1253 = xor <2 x i32> %1251, %527, !dbg !136 + %1254 = icmp slt <2 x i32> %1253, zeroinitializer, !dbg !136 + %1255 = and <2 x i1> %1252, %1254, !dbg !137 + %1256 = select <2 x i1> %1255, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1257 = sub <2 x i32> zeroinitializer, %1256, !dbg !139 + %1258 = icmp eq <2 x i32> %1251, %1257, !dbg !139 + %1259 = and <2 x i1> %1249, %1258, !dbg !140 + %1260 = or <2 x i1> %1233, %1259, !dbg !141 + %1261 = icmp sge <2 x i32> %537, %1231, !dbg !122 + %1262 = and <2 x i1> %535, %1261, !dbg !121 + %1263 = sub <2 x i32> %1231, %537, !dbg !133 + %1264 = srem <2 x i32> %1263, %527, !dbg !134 + %1265 = icmp ne <2 x i32> %1264, zeroinitializer, !dbg !135 + %1266 = xor <2 x i32> %1264, %527, !dbg !136 + %1267 = icmp slt <2 x i32> %1266, zeroinitializer, !dbg !136 + %1268 = and <2 x i1> %1265, %1267, !dbg !137 + %1269 = select <2 x i1> %1268, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1270 = sub <2 x i32> zeroinitializer, %1269, !dbg !139 + %1271 = icmp eq <2 x i32> %1264, %1270, !dbg !139 + %1272 = and <2 x i1> %1249, %1271, !dbg !140 + %1273 = or <2 x i1> %1262, %1272, !dbg !141 + %1274 = select <2 x i1> %1273, <2 x i1> %626, <2 x i1> zeroinitializer, !dbg !142 + %1275 = select <2 x i1> %1260, <2 x i1> %626, <2 x i1> zeroinitializer, !dbg !142 + %1276 = srem <2 x i32> %611, %531, !dbg !118 + %1277 = icmp sge <2 x i32> %525, %1276, !dbg !122 + %1278 = and <2 x i1> %523, %1277, !dbg !121 + %1279 = icmp sge <2 x i32> %1276, %527, !dbg !123 + %1280 = srem <2 x i32> %1276, %527, !dbg !127 + %1281 = icmp ne <2 x i32> %1280, zeroinitializer, !dbg !128 + %1282 = extractelement <2 x i32> %1280, i64 0, !dbg !129 + %1283 = xor i32 %1282, %26, !dbg !129 + %1284 = extractelement <2 x i32> %1280, i64 1, !dbg !129 + %1285 = xor i32 %1284, %26, !dbg !129 + %1286 = insertelement <2 x i32> poison, i32 %1283, i64 0, !dbg !129 + %1287 = insertelement <2 x i32> %1286, i32 %1285, i64 1, !dbg !129 + %1288 = icmp slt <2 x i32> %1287, zeroinitializer, !dbg !129 + %1289 = and <2 x i1> %1281, %1288, !dbg !130 + %1290 = select <2 x i1> %1289, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1291 = add <2 x i32> %1290, %1280, !dbg !131 + %1292 = sext <2 x i32> %1291 to <2 x i64>, !dbg !124 + %1293 = icmp sgt <2 x i64> %529, %1292, !dbg !124 + %1294 = and <2 x i1> %1279, %1293, !dbg !132 + %1295 = sub <2 x i32> %1276, %525, !dbg !133 + %1296 = srem <2 x i32> %1295, %527, !dbg !134 + %1297 = icmp ne <2 x i32> %1296, zeroinitializer, !dbg !135 + %1298 = xor <2 x i32> %1296, %527, !dbg !136 + %1299 = icmp slt <2 x i32> %1298, zeroinitializer, !dbg !136 + %1300 = and <2 x i1> %1297, %1299, !dbg !137 + %1301 = select <2 x i1> %1300, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1302 = sub <2 x i32> zeroinitializer, %1301, !dbg !139 + %1303 = icmp eq <2 x i32> %1296, %1302, !dbg !139 + %1304 = and <2 x i1> %1294, %1303, !dbg !140 + %1305 = or <2 x i1> %1278, %1304, !dbg !141 + %1306 = icmp sge <2 x i32> %537, %1276, !dbg !122 + %1307 = and <2 x i1> %535, %1306, !dbg !121 + %1308 = sub <2 x i32> %1276, %537, !dbg !133 + %1309 = srem <2 x i32> %1308, %527, !dbg !134 + %1310 = icmp ne <2 x i32> %1309, zeroinitializer, !dbg !135 + %1311 = xor <2 x i32> %1309, %527, !dbg !136 + %1312 = icmp slt <2 x i32> %1311, zeroinitializer, !dbg !136 + %1313 = and <2 x i1> %1310, %1312, !dbg !137 + %1314 = select <2 x i1> %1313, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1315 = sub <2 x i32> zeroinitializer, %1314, !dbg !139 + %1316 = icmp eq <2 x i32> %1309, %1315, !dbg !139 + %1317 = and <2 x i1> %1294, %1316, !dbg !140 + %1318 = or <2 x i1> %1307, %1317, !dbg !141 + %1319 = select <2 x i1> %1318, <2 x i1> %627, <2 x i1> zeroinitializer, !dbg !142 + %1320 = select <2 x i1> %1305, <2 x i1> %627, <2 x i1> zeroinitializer, !dbg !142 + %1321 = srem <2 x i32> %610, %531, !dbg !118 + %1322 = icmp sge <2 x i32> %525, %1321, !dbg !122 + %1323 = and <2 x i1> %523, %1322, !dbg !121 + %1324 = icmp sge <2 x i32> %1321, %527, !dbg !123 + %1325 = srem <2 x i32> %1321, %527, !dbg !127 + %1326 = icmp ne <2 x i32> %1325, zeroinitializer, !dbg !128 + %1327 = extractelement <2 x i32> %1325, i64 0, !dbg !129 + %1328 = xor i32 %1327, %26, !dbg !129 + %1329 = extractelement <2 x i32> %1325, i64 1, !dbg !129 + %1330 = xor i32 %1329, %26, !dbg !129 + %1331 = insertelement <2 x i32> poison, i32 %1328, i64 0, !dbg !129 + %1332 = insertelement <2 x i32> %1331, i32 %1330, i64 1, !dbg !129 + %1333 = icmp slt <2 x i32> %1332, zeroinitializer, !dbg !129 + %1334 = and <2 x i1> %1326, %1333, !dbg !130 + %1335 = select <2 x i1> %1334, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1336 = add <2 x i32> %1335, %1325, !dbg !131 + %1337 = sext <2 x i32> %1336 to <2 x i64>, !dbg !124 + %1338 = icmp sgt <2 x i64> %529, %1337, !dbg !124 + %1339 = and <2 x i1> %1324, %1338, !dbg !132 + %1340 = sub <2 x i32> %1321, %525, !dbg !133 + %1341 = srem <2 x i32> %1340, %527, !dbg !134 + %1342 = icmp ne <2 x i32> %1341, zeroinitializer, !dbg !135 + %1343 = xor <2 x i32> %1341, %527, !dbg !136 + %1344 = icmp slt <2 x i32> %1343, zeroinitializer, !dbg !136 + %1345 = and <2 x i1> %1342, %1344, !dbg !137 + %1346 = select <2 x i1> %1345, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1347 = sub <2 x i32> zeroinitializer, %1346, !dbg !139 + %1348 = icmp eq <2 x i32> %1341, %1347, !dbg !139 + %1349 = and <2 x i1> %1339, %1348, !dbg !140 + %1350 = or <2 x i1> %1323, %1349, !dbg !141 + %1351 = icmp sge <2 x i32> %537, %1321, !dbg !122 + %1352 = and <2 x i1> %535, %1351, !dbg !121 + %1353 = sub <2 x i32> %1321, %537, !dbg !133 + %1354 = srem <2 x i32> %1353, %527, !dbg !134 + %1355 = icmp ne <2 x i32> %1354, zeroinitializer, !dbg !135 + %1356 = xor <2 x i32> %1354, %527, !dbg !136 + %1357 = icmp slt <2 x i32> %1356, zeroinitializer, !dbg !136 + %1358 = and <2 x i1> %1355, %1357, !dbg !137 + %1359 = select <2 x i1> %1358, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1360 = sub <2 x i32> zeroinitializer, %1359, !dbg !139 + %1361 = icmp eq <2 x i32> %1354, %1360, !dbg !139 + %1362 = and <2 x i1> %1339, %1361, !dbg !140 + %1363 = or <2 x i1> %1352, %1362, !dbg !141 + %1364 = select <2 x i1> %1363, <2 x i1> %628, <2 x i1> zeroinitializer, !dbg !142 + %1365 = select <2 x i1> %1350, <2 x i1> %628, <2 x i1> zeroinitializer, !dbg !142 + %1366 = srem <2 x i32> %609, %531, !dbg !118 + %1367 = icmp sge <2 x i32> %525, %1366, !dbg !122 + %1368 = and <2 x i1> %523, %1367, !dbg !121 + %1369 = icmp sge <2 x i32> %1366, %527, !dbg !123 + %1370 = srem <2 x i32> %1366, %527, !dbg !127 + %1371 = icmp ne <2 x i32> %1370, zeroinitializer, !dbg !128 + %1372 = extractelement <2 x i32> %1370, i64 0, !dbg !129 + %1373 = xor i32 %1372, %26, !dbg !129 + %1374 = extractelement <2 x i32> %1370, i64 1, !dbg !129 + %1375 = xor i32 %1374, %26, !dbg !129 + %1376 = insertelement <2 x i32> poison, i32 %1373, i64 0, !dbg !129 + %1377 = insertelement <2 x i32> %1376, i32 %1375, i64 1, !dbg !129 + %1378 = icmp slt <2 x i32> %1377, zeroinitializer, !dbg !129 + %1379 = and <2 x i1> %1371, %1378, !dbg !130 + %1380 = select <2 x i1> %1379, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !131 + %1381 = add <2 x i32> %1380, %1370, !dbg !131 + %1382 = sext <2 x i32> %1381 to <2 x i64>, !dbg !124 + %1383 = icmp sgt <2 x i64> %529, %1382, !dbg !124 + %1384 = and <2 x i1> %1369, %1383, !dbg !132 + %1385 = sub <2 x i32> %1366, %525, !dbg !133 + %1386 = srem <2 x i32> %1385, %527, !dbg !134 + %1387 = icmp ne <2 x i32> %1386, zeroinitializer, !dbg !135 + %1388 = xor <2 x i32> %1386, %527, !dbg !136 + %1389 = icmp slt <2 x i32> %1388, zeroinitializer, !dbg !136 + %1390 = and <2 x i1> %1387, %1389, !dbg !137 + %1391 = select <2 x i1> %1390, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1392 = sub <2 x i32> zeroinitializer, %1391, !dbg !139 + %1393 = icmp eq <2 x i32> %1386, %1392, !dbg !139 + %1394 = and <2 x i1> %1384, %1393, !dbg !140 + %1395 = or <2 x i1> %1368, %1394, !dbg !141 + %1396 = icmp sge <2 x i32> %537, %1366, !dbg !122 + %1397 = and <2 x i1> %535, %1396, !dbg !121 + %1398 = sub <2 x i32> %1366, %537, !dbg !133 + %1399 = srem <2 x i32> %1398, %527, !dbg !134 + %1400 = icmp ne <2 x i32> %1399, zeroinitializer, !dbg !135 + %1401 = xor <2 x i32> %1399, %527, !dbg !136 + %1402 = icmp slt <2 x i32> %1401, zeroinitializer, !dbg !136 + %1403 = and <2 x i1> %1400, %1402, !dbg !137 + %1404 = select <2 x i1> %1403, <2 x i32> %527, <2 x i32> zeroinitializer, !dbg !138 + %1405 = sub <2 x i32> zeroinitializer, %1404, !dbg !139 + %1406 = icmp eq <2 x i32> %1399, %1405, !dbg !139 + %1407 = and <2 x i1> %1384, %1406, !dbg !140 + %1408 = or <2 x i1> %1397, %1407, !dbg !141 + %1409 = select <2 x i1> %1408, <2 x i1> %629, <2 x i1> zeroinitializer, !dbg !142 + %1410 = select <2 x i1> %1395, <2 x i1> %629, <2 x i1> zeroinitializer, !dbg !142 + %1411 = fmul float %1019, 0x3FF7154760000000, !dbg !143 + %1412 = extractelement <2 x i1> %1094, i64 0, !dbg !142 + %1413 = select i1 %1412, float %1411, float 0xFFF0000000000000, !dbg !142 + %1414 = fmul float %1020, 0x3FF7154760000000, !dbg !143 + %1415 = extractelement <2 x i1> %1094, i64 1, !dbg !142 + %1416 = select i1 %1415, float %1414, float 0xFFF0000000000000, !dbg !142 + %1417 = fmul float %1021, 0x3FF7154760000000, !dbg !143 + %1418 = extractelement <2 x i1> %1095, i64 0, !dbg !142 + %1419 = select i1 %1418, float %1417, float 0xFFF0000000000000, !dbg !142 + %1420 = fmul float %1022, 0x3FF7154760000000, !dbg !143 + %1421 = extractelement <2 x i1> %1095, i64 1, !dbg !142 + %1422 = select i1 %1421, float %1420, float 0xFFF0000000000000, !dbg !142 + %1423 = fmul float %1023, 0x3FF7154760000000, !dbg !143 + %1424 = extractelement <2 x i1> %1139, i64 0, !dbg !142 + %1425 = select i1 %1424, float %1423, float 0xFFF0000000000000, !dbg !142 + %1426 = fmul float %1024, 0x3FF7154760000000, !dbg !143 + %1427 = extractelement <2 x i1> %1139, i64 1, !dbg !142 + %1428 = select i1 %1427, float %1426, float 0xFFF0000000000000, !dbg !142 + %1429 = fmul float %1025, 0x3FF7154760000000, !dbg !143 + %1430 = extractelement <2 x i1> %1140, i64 0, !dbg !142 + %1431 = select i1 %1430, float %1429, float 0xFFF0000000000000, !dbg !142 + %1432 = fmul float %1026, 0x3FF7154760000000, !dbg !143 + %1433 = extractelement <2 x i1> %1140, i64 1, !dbg !142 + %1434 = select i1 %1433, float %1432, float 0xFFF0000000000000, !dbg !142 + %1435 = fmul float %1027, 0x3FF7154760000000, !dbg !143 + %1436 = extractelement <2 x i1> %1184, i64 0, !dbg !142 + %1437 = select i1 %1436, float %1435, float 0xFFF0000000000000, !dbg !142 + %1438 = fmul float %1028, 0x3FF7154760000000, !dbg !143 + %1439 = extractelement <2 x i1> %1184, i64 1, !dbg !142 + %1440 = select i1 %1439, float %1438, float 0xFFF0000000000000, !dbg !142 + %1441 = fmul float %1029, 0x3FF7154760000000, !dbg !143 + %1442 = extractelement <2 x i1> %1185, i64 0, !dbg !142 + %1443 = select i1 %1442, float %1441, float 0xFFF0000000000000, !dbg !142 + %1444 = fmul float %1030, 0x3FF7154760000000, !dbg !143 + %1445 = extractelement <2 x i1> %1185, i64 1, !dbg !142 + %1446 = select i1 %1445, float %1444, float 0xFFF0000000000000, !dbg !142 + %1447 = fmul float %1031, 0x3FF7154760000000, !dbg !143 + %1448 = extractelement <2 x i1> %1229, i64 0, !dbg !142 + %1449 = select i1 %1448, float %1447, float 0xFFF0000000000000, !dbg !142 + %1450 = fmul float %1032, 0x3FF7154760000000, !dbg !143 + %1451 = extractelement <2 x i1> %1229, i64 1, !dbg !142 + %1452 = select i1 %1451, float %1450, float 0xFFF0000000000000, !dbg !142 + %1453 = fmul float %1033, 0x3FF7154760000000, !dbg !143 + %1454 = extractelement <2 x i1> %1230, i64 0, !dbg !142 + %1455 = select i1 %1454, float %1453, float 0xFFF0000000000000, !dbg !142 + %1456 = fmul float %1034, 0x3FF7154760000000, !dbg !143 + %1457 = extractelement <2 x i1> %1230, i64 1, !dbg !142 + %1458 = select i1 %1457, float %1456, float 0xFFF0000000000000, !dbg !142 + %1459 = fmul float %1035, 0x3FF7154760000000, !dbg !143 + %1460 = extractelement <2 x i1> %1274, i64 0, !dbg !142 + %1461 = select i1 %1460, float %1459, float 0xFFF0000000000000, !dbg !142 + %1462 = fmul float %1036, 0x3FF7154760000000, !dbg !143 + %1463 = extractelement <2 x i1> %1274, i64 1, !dbg !142 + %1464 = select i1 %1463, float %1462, float 0xFFF0000000000000, !dbg !142 + %1465 = fmul float %1037, 0x3FF7154760000000, !dbg !143 + %1466 = extractelement <2 x i1> %1275, i64 0, !dbg !142 + %1467 = select i1 %1466, float %1465, float 0xFFF0000000000000, !dbg !142 + %1468 = fmul float %1038, 0x3FF7154760000000, !dbg !143 + %1469 = extractelement <2 x i1> %1275, i64 1, !dbg !142 + %1470 = select i1 %1469, float %1468, float 0xFFF0000000000000, !dbg !142 + %1471 = fmul float %1039, 0x3FF7154760000000, !dbg !143 + %1472 = extractelement <2 x i1> %1319, i64 0, !dbg !142 + %1473 = select i1 %1472, float %1471, float 0xFFF0000000000000, !dbg !142 + %1474 = fmul float %1040, 0x3FF7154760000000, !dbg !143 + %1475 = extractelement <2 x i1> %1319, i64 1, !dbg !142 + %1476 = select i1 %1475, float %1474, float 0xFFF0000000000000, !dbg !142 + %1477 = fmul float %1041, 0x3FF7154760000000, !dbg !143 + %1478 = extractelement <2 x i1> %1320, i64 0, !dbg !142 + %1479 = select i1 %1478, float %1477, float 0xFFF0000000000000, !dbg !142 + %1480 = fmul float %1042, 0x3FF7154760000000, !dbg !143 + %1481 = extractelement <2 x i1> %1320, i64 1, !dbg !142 + %1482 = select i1 %1481, float %1480, float 0xFFF0000000000000, !dbg !142 + %1483 = fmul float %1043, 0x3FF7154760000000, !dbg !143 + %1484 = extractelement <2 x i1> %1364, i64 0, !dbg !142 + %1485 = select i1 %1484, float %1483, float 0xFFF0000000000000, !dbg !142 + %1486 = fmul float %1044, 0x3FF7154760000000, !dbg !143 + %1487 = extractelement <2 x i1> %1364, i64 1, !dbg !142 + %1488 = select i1 %1487, float %1486, float 0xFFF0000000000000, !dbg !142 + %1489 = fmul float %1045, 0x3FF7154760000000, !dbg !143 + %1490 = extractelement <2 x i1> %1365, i64 0, !dbg !142 + %1491 = select i1 %1490, float %1489, float 0xFFF0000000000000, !dbg !142 + %1492 = fmul float %1046, 0x3FF7154760000000, !dbg !143 + %1493 = extractelement <2 x i1> %1365, i64 1, !dbg !142 + %1494 = select i1 %1493, float %1492, float 0xFFF0000000000000, !dbg !142 + %1495 = fmul float %1047, 0x3FF7154760000000, !dbg !143 + %1496 = extractelement <2 x i1> %1409, i64 0, !dbg !142 + %1497 = select i1 %1496, float %1495, float 0xFFF0000000000000, !dbg !142 + %1498 = fmul float %1048, 0x3FF7154760000000, !dbg !143 + %1499 = extractelement <2 x i1> %1409, i64 1, !dbg !142 + %1500 = select i1 %1499, float %1498, float 0xFFF0000000000000, !dbg !142 + %1501 = fmul float %1049, 0x3FF7154760000000, !dbg !143 + %1502 = extractelement <2 x i1> %1410, i64 0, !dbg !142 + %1503 = select i1 %1502, float %1501, float 0xFFF0000000000000, !dbg !142 + %1504 = fmul float %1050, 0x3FF7154760000000, !dbg !143 + %1505 = extractelement <2 x i1> %1410, i64 1, !dbg !142 + %1506 = select i1 %1505, float %1504, float 0xFFF0000000000000, !dbg !142 + %1507 = fsub float %1413, %381, !dbg !144 + %1508 = fsub float %1416, %381, !dbg !144 + %1509 = fsub float %1419, %382, !dbg !144 + %1510 = fsub float %1422, %382, !dbg !144 + %1511 = fsub float %1425, %381, !dbg !144 + %1512 = fsub float %1428, %381, !dbg !144 + %1513 = fsub float %1431, %382, !dbg !144 + %1514 = fsub float %1434, %382, !dbg !144 + %1515 = fsub float %1437, %381, !dbg !144 + %1516 = fsub float %1440, %381, !dbg !144 + %1517 = fsub float %1443, %382, !dbg !144 + %1518 = fsub float %1446, %382, !dbg !144 + %1519 = fsub float %1449, %381, !dbg !144 + %1520 = fsub float %1452, %381, !dbg !144 + %1521 = fsub float %1455, %382, !dbg !144 + %1522 = fsub float %1458, %382, !dbg !144 + %1523 = fsub float %1461, %381, !dbg !144 + %1524 = fsub float %1464, %381, !dbg !144 + %1525 = fsub float %1467, %382, !dbg !144 + %1526 = fsub float %1470, %382, !dbg !144 + %1527 = fsub float %1473, %381, !dbg !144 + %1528 = fsub float %1476, %381, !dbg !144 + %1529 = fsub float %1479, %382, !dbg !144 + %1530 = fsub float %1482, %382, !dbg !144 + %1531 = fsub float %1485, %381, !dbg !144 + %1532 = fsub float %1488, %381, !dbg !144 + %1533 = fsub float %1491, %382, !dbg !144 + %1534 = fsub float %1494, %382, !dbg !144 + %1535 = fsub float %1497, %381, !dbg !144 + %1536 = fsub float %1500, %381, !dbg !144 + %1537 = fsub float %1503, %382, !dbg !144 + %1538 = fsub float %1506, %382, !dbg !144 + %1539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1432 = icmp eq i32 %1539, 0, !dbg !145 + br i1 %.not.i1432, label %1542, label %1540, !dbg !145 + +1540: ; preds = %540 + %1541 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1507) #3, !dbg !145 + br label %__nv_exp2f.exit1434, !dbg !145 + +1542: ; preds = %540 + %1543 = tail call float @llvm.nvvm.ex2.approx.f(float %1507) #3, !dbg !145 + br label %__nv_exp2f.exit1434, !dbg !145 + +__nv_exp2f.exit1434: ; preds = %1540, %1542 + %.0.i1433 = phi float [ %1541, %1540 ], [ %1543, %1542 ], !dbg !145 + %1544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1435 = icmp eq i32 %1544, 0, !dbg !145 + br i1 %.not.i1435, label %1547, label %1545, !dbg !145 + +1545: ; preds = %__nv_exp2f.exit1434 + %1546 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1508) #3, !dbg !145 + br label %__nv_exp2f.exit1437, !dbg !145 + +1547: ; preds = %__nv_exp2f.exit1434 + %1548 = tail call float @llvm.nvvm.ex2.approx.f(float %1508) #3, !dbg !145 + br label %__nv_exp2f.exit1437, !dbg !145 + +__nv_exp2f.exit1437: ; preds = %1545, %1547 + %.0.i1436 = phi float [ %1546, %1545 ], [ %1548, %1547 ], !dbg !145 + %1549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1438 = icmp eq i32 %1549, 0, !dbg !145 + br i1 %.not.i1438, label %1552, label %1550, !dbg !145 + +1550: ; preds = %__nv_exp2f.exit1437 + %1551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1509) #3, !dbg !145 + br label %__nv_exp2f.exit1440, !dbg !145 + +1552: ; preds = %__nv_exp2f.exit1437 + %1553 = tail call float @llvm.nvvm.ex2.approx.f(float %1509) #3, !dbg !145 + br label %__nv_exp2f.exit1440, !dbg !145 + +__nv_exp2f.exit1440: ; preds = %1550, %1552 + %.0.i1439 = phi float [ %1551, %1550 ], [ %1553, %1552 ], !dbg !145 + %1554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1441 = icmp eq i32 %1554, 0, !dbg !145 + br i1 %.not.i1441, label %1557, label %1555, !dbg !145 + +1555: ; preds = %__nv_exp2f.exit1440 + %1556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1510) #3, !dbg !145 + br label %__nv_exp2f.exit1443, !dbg !145 + +1557: ; preds = %__nv_exp2f.exit1440 + %1558 = tail call float @llvm.nvvm.ex2.approx.f(float %1510) #3, !dbg !145 + br label %__nv_exp2f.exit1443, !dbg !145 + +__nv_exp2f.exit1443: ; preds = %1555, %1557 + %.0.i1442 = phi float [ %1556, %1555 ], [ %1558, %1557 ], !dbg !145 + %1559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1444 = icmp eq i32 %1559, 0, !dbg !145 + br i1 %.not.i1444, label %1562, label %1560, !dbg !145 + +1560: ; preds = %__nv_exp2f.exit1443 + %1561 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1511) #3, !dbg !145 + br label %__nv_exp2f.exit1446, !dbg !145 + +1562: ; preds = %__nv_exp2f.exit1443 + %1563 = tail call float @llvm.nvvm.ex2.approx.f(float %1511) #3, !dbg !145 + br label %__nv_exp2f.exit1446, !dbg !145 + +__nv_exp2f.exit1446: ; preds = %1560, %1562 + %.0.i1445 = phi float [ %1561, %1560 ], [ %1563, %1562 ], !dbg !145 + %1564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1447 = icmp eq i32 %1564, 0, !dbg !145 + br i1 %.not.i1447, label %1567, label %1565, !dbg !145 + +1565: ; preds = %__nv_exp2f.exit1446 + %1566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1512) #3, !dbg !145 + br label %__nv_exp2f.exit1449, !dbg !145 + +1567: ; preds = %__nv_exp2f.exit1446 + %1568 = tail call float @llvm.nvvm.ex2.approx.f(float %1512) #3, !dbg !145 + br label %__nv_exp2f.exit1449, !dbg !145 + +__nv_exp2f.exit1449: ; preds = %1565, %1567 + %.0.i1448 = phi float [ %1566, %1565 ], [ %1568, %1567 ], !dbg !145 + %1569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1450 = icmp eq i32 %1569, 0, !dbg !145 + br i1 %.not.i1450, label %1572, label %1570, !dbg !145 + +1570: ; preds = %__nv_exp2f.exit1449 + %1571 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1513) #3, !dbg !145 + br label %__nv_exp2f.exit1452, !dbg !145 + +1572: ; preds = %__nv_exp2f.exit1449 + %1573 = tail call float @llvm.nvvm.ex2.approx.f(float %1513) #3, !dbg !145 + br label %__nv_exp2f.exit1452, !dbg !145 + +__nv_exp2f.exit1452: ; preds = %1570, %1572 + %.0.i1451 = phi float [ %1571, %1570 ], [ %1573, %1572 ], !dbg !145 + %1574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1453 = icmp eq i32 %1574, 0, !dbg !145 + br i1 %.not.i1453, label %1577, label %1575, !dbg !145 + +1575: ; preds = %__nv_exp2f.exit1452 + %1576 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1514) #3, !dbg !145 + br label %__nv_exp2f.exit1455, !dbg !145 + +1577: ; preds = %__nv_exp2f.exit1452 + %1578 = tail call float @llvm.nvvm.ex2.approx.f(float %1514) #3, !dbg !145 + br label %__nv_exp2f.exit1455, !dbg !145 + +__nv_exp2f.exit1455: ; preds = %1575, %1577 + %.0.i1454 = phi float [ %1576, %1575 ], [ %1578, %1577 ], !dbg !145 + %1579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1456 = icmp eq i32 %1579, 0, !dbg !145 + br i1 %.not.i1456, label %1582, label %1580, !dbg !145 + +1580: ; preds = %__nv_exp2f.exit1455 + %1581 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1515) #3, !dbg !145 + br label %__nv_exp2f.exit1458, !dbg !145 + +1582: ; preds = %__nv_exp2f.exit1455 + %1583 = tail call float @llvm.nvvm.ex2.approx.f(float %1515) #3, !dbg !145 + br label %__nv_exp2f.exit1458, !dbg !145 + +__nv_exp2f.exit1458: ; preds = %1580, %1582 + %.0.i1457 = phi float [ %1581, %1580 ], [ %1583, %1582 ], !dbg !145 + %1584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1459 = icmp eq i32 %1584, 0, !dbg !145 + br i1 %.not.i1459, label %1587, label %1585, !dbg !145 + +1585: ; preds = %__nv_exp2f.exit1458 + %1586 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1516) #3, !dbg !145 + br label %__nv_exp2f.exit1461, !dbg !145 + +1587: ; preds = %__nv_exp2f.exit1458 + %1588 = tail call float @llvm.nvvm.ex2.approx.f(float %1516) #3, !dbg !145 + br label %__nv_exp2f.exit1461, !dbg !145 + +__nv_exp2f.exit1461: ; preds = %1585, %1587 + %.0.i1460 = phi float [ %1586, %1585 ], [ %1588, %1587 ], !dbg !145 + %1589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1462 = icmp eq i32 %1589, 0, !dbg !145 + br i1 %.not.i1462, label %1592, label %1590, !dbg !145 + +1590: ; preds = %__nv_exp2f.exit1461 + %1591 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1517) #3, !dbg !145 + br label %__nv_exp2f.exit1464, !dbg !145 + +1592: ; preds = %__nv_exp2f.exit1461 + %1593 = tail call float @llvm.nvvm.ex2.approx.f(float %1517) #3, !dbg !145 + br label %__nv_exp2f.exit1464, !dbg !145 + +__nv_exp2f.exit1464: ; preds = %1590, %1592 + %.0.i1463 = phi float [ %1591, %1590 ], [ %1593, %1592 ], !dbg !145 + %1594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1465 = icmp eq i32 %1594, 0, !dbg !145 + br i1 %.not.i1465, label %1597, label %1595, !dbg !145 + +1595: ; preds = %__nv_exp2f.exit1464 + %1596 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1518) #3, !dbg !145 + br label %__nv_exp2f.exit1467, !dbg !145 + +1597: ; preds = %__nv_exp2f.exit1464 + %1598 = tail call float @llvm.nvvm.ex2.approx.f(float %1518) #3, !dbg !145 + br label %__nv_exp2f.exit1467, !dbg !145 + +__nv_exp2f.exit1467: ; preds = %1595, %1597 + %.0.i1466 = phi float [ %1596, %1595 ], [ %1598, %1597 ], !dbg !145 + %1599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1468 = icmp eq i32 %1599, 0, !dbg !145 + br i1 %.not.i1468, label %1602, label %1600, !dbg !145 + +1600: ; preds = %__nv_exp2f.exit1467 + %1601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1519) #3, !dbg !145 + br label %__nv_exp2f.exit1470, !dbg !145 + +1602: ; preds = %__nv_exp2f.exit1467 + %1603 = tail call float @llvm.nvvm.ex2.approx.f(float %1519) #3, !dbg !145 + br label %__nv_exp2f.exit1470, !dbg !145 + +__nv_exp2f.exit1470: ; preds = %1600, %1602 + %.0.i1469 = phi float [ %1601, %1600 ], [ %1603, %1602 ], !dbg !145 + %1604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1471 = icmp eq i32 %1604, 0, !dbg !145 + br i1 %.not.i1471, label %1607, label %1605, !dbg !145 + +1605: ; preds = %__nv_exp2f.exit1470 + %1606 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1520) #3, !dbg !145 + br label %__nv_exp2f.exit1473, !dbg !145 + +1607: ; preds = %__nv_exp2f.exit1470 + %1608 = tail call float @llvm.nvvm.ex2.approx.f(float %1520) #3, !dbg !145 + br label %__nv_exp2f.exit1473, !dbg !145 + +__nv_exp2f.exit1473: ; preds = %1605, %1607 + %.0.i1472 = phi float [ %1606, %1605 ], [ %1608, %1607 ], !dbg !145 + %1609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1474 = icmp eq i32 %1609, 0, !dbg !145 + br i1 %.not.i1474, label %1612, label %1610, !dbg !145 + +1610: ; preds = %__nv_exp2f.exit1473 + %1611 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1521) #3, !dbg !145 + br label %__nv_exp2f.exit1476, !dbg !145 + +1612: ; preds = %__nv_exp2f.exit1473 + %1613 = tail call float @llvm.nvvm.ex2.approx.f(float %1521) #3, !dbg !145 + br label %__nv_exp2f.exit1476, !dbg !145 + +__nv_exp2f.exit1476: ; preds = %1610, %1612 + %.0.i1475 = phi float [ %1611, %1610 ], [ %1613, %1612 ], !dbg !145 + %1614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1477 = icmp eq i32 %1614, 0, !dbg !145 + br i1 %.not.i1477, label %1617, label %1615, !dbg !145 + +1615: ; preds = %__nv_exp2f.exit1476 + %1616 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1522) #3, !dbg !145 + br label %__nv_exp2f.exit1479, !dbg !145 + +1617: ; preds = %__nv_exp2f.exit1476 + %1618 = tail call float @llvm.nvvm.ex2.approx.f(float %1522) #3, !dbg !145 + br label %__nv_exp2f.exit1479, !dbg !145 + +__nv_exp2f.exit1479: ; preds = %1615, %1617 + %.0.i1478 = phi float [ %1616, %1615 ], [ %1618, %1617 ], !dbg !145 + %1619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1480 = icmp eq i32 %1619, 0, !dbg !145 + br i1 %.not.i1480, label %1622, label %1620, !dbg !145 + +1620: ; preds = %__nv_exp2f.exit1479 + %1621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1523) #3, !dbg !145 + br label %__nv_exp2f.exit1482, !dbg !145 + +1622: ; preds = %__nv_exp2f.exit1479 + %1623 = tail call float @llvm.nvvm.ex2.approx.f(float %1523) #3, !dbg !145 + br label %__nv_exp2f.exit1482, !dbg !145 + +__nv_exp2f.exit1482: ; preds = %1620, %1622 + %.0.i1481 = phi float [ %1621, %1620 ], [ %1623, %1622 ], !dbg !145 + %1624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1483 = icmp eq i32 %1624, 0, !dbg !145 + br i1 %.not.i1483, label %1627, label %1625, !dbg !145 + +1625: ; preds = %__nv_exp2f.exit1482 + %1626 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1524) #3, !dbg !145 + br label %__nv_exp2f.exit1485, !dbg !145 + +1627: ; preds = %__nv_exp2f.exit1482 + %1628 = tail call float @llvm.nvvm.ex2.approx.f(float %1524) #3, !dbg !145 + br label %__nv_exp2f.exit1485, !dbg !145 + +__nv_exp2f.exit1485: ; preds = %1625, %1627 + %.0.i1484 = phi float [ %1626, %1625 ], [ %1628, %1627 ], !dbg !145 + %1629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1486 = icmp eq i32 %1629, 0, !dbg !145 + br i1 %.not.i1486, label %1632, label %1630, !dbg !145 + +1630: ; preds = %__nv_exp2f.exit1485 + %1631 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1525) #3, !dbg !145 + br label %__nv_exp2f.exit1488, !dbg !145 + +1632: ; preds = %__nv_exp2f.exit1485 + %1633 = tail call float @llvm.nvvm.ex2.approx.f(float %1525) #3, !dbg !145 + br label %__nv_exp2f.exit1488, !dbg !145 + +__nv_exp2f.exit1488: ; preds = %1630, %1632 + %.0.i1487 = phi float [ %1631, %1630 ], [ %1633, %1632 ], !dbg !145 + %1634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1489 = icmp eq i32 %1634, 0, !dbg !145 + br i1 %.not.i1489, label %1637, label %1635, !dbg !145 + +1635: ; preds = %__nv_exp2f.exit1488 + %1636 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1526) #3, !dbg !145 + br label %__nv_exp2f.exit1491, !dbg !145 + +1637: ; preds = %__nv_exp2f.exit1488 + %1638 = tail call float @llvm.nvvm.ex2.approx.f(float %1526) #3, !dbg !145 + br label %__nv_exp2f.exit1491, !dbg !145 + +__nv_exp2f.exit1491: ; preds = %1635, %1637 + %.0.i1490 = phi float [ %1636, %1635 ], [ %1638, %1637 ], !dbg !145 + %1639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1492 = icmp eq i32 %1639, 0, !dbg !145 + br i1 %.not.i1492, label %1642, label %1640, !dbg !145 + +1640: ; preds = %__nv_exp2f.exit1491 + %1641 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1527) #3, !dbg !145 + br label %__nv_exp2f.exit1494, !dbg !145 + +1642: ; preds = %__nv_exp2f.exit1491 + %1643 = tail call float @llvm.nvvm.ex2.approx.f(float %1527) #3, !dbg !145 + br label %__nv_exp2f.exit1494, !dbg !145 + +__nv_exp2f.exit1494: ; preds = %1640, %1642 + %.0.i1493 = phi float [ %1641, %1640 ], [ %1643, %1642 ], !dbg !145 + %1644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1495 = icmp eq i32 %1644, 0, !dbg !145 + br i1 %.not.i1495, label %1647, label %1645, !dbg !145 + +1645: ; preds = %__nv_exp2f.exit1494 + %1646 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1528) #3, !dbg !145 + br label %__nv_exp2f.exit1497, !dbg !145 + +1647: ; preds = %__nv_exp2f.exit1494 + %1648 = tail call float @llvm.nvvm.ex2.approx.f(float %1528) #3, !dbg !145 + br label %__nv_exp2f.exit1497, !dbg !145 + +__nv_exp2f.exit1497: ; preds = %1645, %1647 + %.0.i1496 = phi float [ %1646, %1645 ], [ %1648, %1647 ], !dbg !145 + %1649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1498 = icmp eq i32 %1649, 0, !dbg !145 + br i1 %.not.i1498, label %1652, label %1650, !dbg !145 + +1650: ; preds = %__nv_exp2f.exit1497 + %1651 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1529) #3, !dbg !145 + br label %__nv_exp2f.exit1500, !dbg !145 + +1652: ; preds = %__nv_exp2f.exit1497 + %1653 = tail call float @llvm.nvvm.ex2.approx.f(float %1529) #3, !dbg !145 + br label %__nv_exp2f.exit1500, !dbg !145 + +__nv_exp2f.exit1500: ; preds = %1650, %1652 + %.0.i1499 = phi float [ %1651, %1650 ], [ %1653, %1652 ], !dbg !145 + %1654 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1501 = icmp eq i32 %1654, 0, !dbg !145 + br i1 %.not.i1501, label %1657, label %1655, !dbg !145 + +1655: ; preds = %__nv_exp2f.exit1500 + %1656 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1530) #3, !dbg !145 + br label %__nv_exp2f.exit1503, !dbg !145 + +1657: ; preds = %__nv_exp2f.exit1500 + %1658 = tail call float @llvm.nvvm.ex2.approx.f(float %1530) #3, !dbg !145 + br label %__nv_exp2f.exit1503, !dbg !145 + +__nv_exp2f.exit1503: ; preds = %1655, %1657 + %.0.i1502 = phi float [ %1656, %1655 ], [ %1658, %1657 ], !dbg !145 + %1659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1504 = icmp eq i32 %1659, 0, !dbg !145 + br i1 %.not.i1504, label %1662, label %1660, !dbg !145 + +1660: ; preds = %__nv_exp2f.exit1503 + %1661 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1531) #3, !dbg !145 + br label %__nv_exp2f.exit1506, !dbg !145 + +1662: ; preds = %__nv_exp2f.exit1503 + %1663 = tail call float @llvm.nvvm.ex2.approx.f(float %1531) #3, !dbg !145 + br label %__nv_exp2f.exit1506, !dbg !145 + +__nv_exp2f.exit1506: ; preds = %1660, %1662 + %.0.i1505 = phi float [ %1661, %1660 ], [ %1663, %1662 ], !dbg !145 + %1664 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1507 = icmp eq i32 %1664, 0, !dbg !145 + br i1 %.not.i1507, label %1667, label %1665, !dbg !145 + +1665: ; preds = %__nv_exp2f.exit1506 + %1666 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1532) #3, !dbg !145 + br label %__nv_exp2f.exit1509, !dbg !145 + +1667: ; preds = %__nv_exp2f.exit1506 + %1668 = tail call float @llvm.nvvm.ex2.approx.f(float %1532) #3, !dbg !145 + br label %__nv_exp2f.exit1509, !dbg !145 + +__nv_exp2f.exit1509: ; preds = %1665, %1667 + %.0.i1508 = phi float [ %1666, %1665 ], [ %1668, %1667 ], !dbg !145 + %1669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1510 = icmp eq i32 %1669, 0, !dbg !145 + br i1 %.not.i1510, label %1672, label %1670, !dbg !145 + +1670: ; preds = %__nv_exp2f.exit1509 + %1671 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1533) #3, !dbg !145 + br label %__nv_exp2f.exit1512, !dbg !145 + +1672: ; preds = %__nv_exp2f.exit1509 + %1673 = tail call float @llvm.nvvm.ex2.approx.f(float %1533) #3, !dbg !145 + br label %__nv_exp2f.exit1512, !dbg !145 + +__nv_exp2f.exit1512: ; preds = %1670, %1672 + %.0.i1511 = phi float [ %1671, %1670 ], [ %1673, %1672 ], !dbg !145 + %1674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1513 = icmp eq i32 %1674, 0, !dbg !145 + br i1 %.not.i1513, label %1677, label %1675, !dbg !145 + +1675: ; preds = %__nv_exp2f.exit1512 + %1676 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1534) #3, !dbg !145 + br label %__nv_exp2f.exit1515, !dbg !145 + +1677: ; preds = %__nv_exp2f.exit1512 + %1678 = tail call float @llvm.nvvm.ex2.approx.f(float %1534) #3, !dbg !145 + br label %__nv_exp2f.exit1515, !dbg !145 + +__nv_exp2f.exit1515: ; preds = %1675, %1677 + %.0.i1514 = phi float [ %1676, %1675 ], [ %1678, %1677 ], !dbg !145 + %1679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1516 = icmp eq i32 %1679, 0, !dbg !145 + br i1 %.not.i1516, label %1682, label %1680, !dbg !145 + +1680: ; preds = %__nv_exp2f.exit1515 + %1681 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1535) #3, !dbg !145 + br label %__nv_exp2f.exit1518, !dbg !145 + +1682: ; preds = %__nv_exp2f.exit1515 + %1683 = tail call float @llvm.nvvm.ex2.approx.f(float %1535) #3, !dbg !145 + br label %__nv_exp2f.exit1518, !dbg !145 + +__nv_exp2f.exit1518: ; preds = %1680, %1682 + %.0.i1517 = phi float [ %1681, %1680 ], [ %1683, %1682 ], !dbg !145 + %1684 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1519 = icmp eq i32 %1684, 0, !dbg !145 + br i1 %.not.i1519, label %1687, label %1685, !dbg !145 + +1685: ; preds = %__nv_exp2f.exit1518 + %1686 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1536) #3, !dbg !145 + br label %__nv_exp2f.exit1521, !dbg !145 + +1687: ; preds = %__nv_exp2f.exit1518 + %1688 = tail call float @llvm.nvvm.ex2.approx.f(float %1536) #3, !dbg !145 + br label %__nv_exp2f.exit1521, !dbg !145 + +__nv_exp2f.exit1521: ; preds = %1685, %1687 + %.0.i1520 = phi float [ %1686, %1685 ], [ %1688, %1687 ], !dbg !145 + %1689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1522 = icmp eq i32 %1689, 0, !dbg !145 + br i1 %.not.i1522, label %1692, label %1690, !dbg !145 + +1690: ; preds = %__nv_exp2f.exit1521 + %1691 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1537) #3, !dbg !145 + br label %__nv_exp2f.exit1524, !dbg !145 + +1692: ; preds = %__nv_exp2f.exit1521 + %1693 = tail call float @llvm.nvvm.ex2.approx.f(float %1537) #3, !dbg !145 + br label %__nv_exp2f.exit1524, !dbg !145 + +__nv_exp2f.exit1524: ; preds = %1690, %1692 + %.0.i1523 = phi float [ %1691, %1690 ], [ %1693, %1692 ], !dbg !145 + %1694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !145 + %.not.i1525 = icmp eq i32 %1694, 0, !dbg !145 + br i1 %.not.i1525, label %1697, label %1695, !dbg !145 + +1695: ; preds = %__nv_exp2f.exit1524 + %1696 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1538) #3, !dbg !145 + br label %__nv_exp2f.exit1527, !dbg !145 + +1697: ; preds = %__nv_exp2f.exit1524 + %1698 = tail call float @llvm.nvvm.ex2.approx.f(float %1538) #3, !dbg !145 + br label %__nv_exp2f.exit1527, !dbg !145 + +__nv_exp2f.exit1527: ; preds = %1695, %1697 + %.0.i1526 = phi float [ %1696, %1695 ], [ %1698, %1697 ], !dbg !145 + %1699 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %630, !dbg !113 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !146 + %1700 = add i32 %634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1701 = lshr exact i32 %1700, 4, !dbg !146 + %1702 = and i32 %1701, 16383, !dbg !146 + %1703 = zext nneg i32 %1702 to i64, !dbg !146 + %1704 = or disjoint i64 %1703, 4611686293372403712, !dbg !146 + %1705 = ptrtoint ptr addrspace(3) %1699 to i32, !dbg !146 + %1706 = lshr exact i32 %1705, 4, !dbg !146 + %1707 = and i32 %1706, 16383, !dbg !146 + %1708 = zext nneg i32 %1707 to i64, !dbg !146 + %1709 = or disjoint i64 %1708, 4611686293338849280, !dbg !146 + %1710 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %1704, i64 %1709) #3, !dbg !146 + %1711 = add i32 %646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1712 = lshr exact i32 %1711, 4, !dbg !146 + %1713 = and i32 %1712, 16383, !dbg !146 + %1714 = zext nneg i32 %1713 to i64, !dbg !146 + %1715 = or disjoint i64 %1714, 4611686293372403712, !dbg !146 + %1716 = add i32 %1705, 32, !dbg !146 + %1717 = lshr exact i32 %1716, 4, !dbg !146 + %1718 = and i32 %1717, 16383, !dbg !146 + %1719 = zext nneg i32 %1718 to i64, !dbg !146 + %1720 = or disjoint i64 %1719, 4611686293338849280, !dbg !146 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 0, !dbg !146 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 1, !dbg !146 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 2, !dbg !146 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 3, !dbg !146 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 4, !dbg !146 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 5, !dbg !146 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 6, !dbg !146 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 7, !dbg !146 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 8, !dbg !146 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 9, !dbg !146 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 10, !dbg !146 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 11, !dbg !146 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 12, !dbg !146 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 13, !dbg !146 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 14, !dbg !146 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 15, !dbg !146 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 16, !dbg !146 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 17, !dbg !146 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 18, !dbg !146 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 19, !dbg !146 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 20, !dbg !146 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 21, !dbg !146 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 22, !dbg !146 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 23, !dbg !146 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 24, !dbg !146 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 25, !dbg !146 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 26, !dbg !146 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 27, !dbg !146 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 28, !dbg !146 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 29, !dbg !146 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 30, !dbg !146 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1710, 31, !dbg !146 + %1753 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, i64 %1715, i64 %1720, i1 true) #3, !dbg !146 + %1754 = add i32 %690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1755 = lshr exact i32 %1754, 4, !dbg !146 + %1756 = and i32 %1755, 16383, !dbg !146 + %1757 = zext nneg i32 %1756 to i64, !dbg !146 + %1758 = or disjoint i64 %1757, 4611686293372403712, !dbg !146 + %1759 = add i32 %1705, 64, !dbg !146 + %1760 = lshr exact i32 %1759, 4, !dbg !146 + %1761 = and i32 %1760, 16383, !dbg !146 + %1762 = zext nneg i32 %1761 to i64, !dbg !146 + %1763 = or disjoint i64 %1762, 4611686293338849280, !dbg !146 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 0, !dbg !146 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 1, !dbg !146 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 2, !dbg !146 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 3, !dbg !146 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 4, !dbg !146 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 5, !dbg !146 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 6, !dbg !146 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 7, !dbg !146 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 8, !dbg !146 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 9, !dbg !146 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 10, !dbg !146 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 11, !dbg !146 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 12, !dbg !146 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 13, !dbg !146 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 14, !dbg !146 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 15, !dbg !146 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 16, !dbg !146 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 17, !dbg !146 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 18, !dbg !146 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 19, !dbg !146 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 20, !dbg !146 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 21, !dbg !146 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 22, !dbg !146 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 23, !dbg !146 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 24, !dbg !146 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 25, !dbg !146 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 26, !dbg !146 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 27, !dbg !146 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 28, !dbg !146 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 29, !dbg !146 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 30, !dbg !146 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1753, 31, !dbg !146 + %1796 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, i64 %1758, i64 %1763, i1 true) #3, !dbg !146 + %1797 = add i32 %734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1798 = lshr exact i32 %1797, 4, !dbg !146 + %1799 = and i32 %1798, 16383, !dbg !146 + %1800 = zext nneg i32 %1799 to i64, !dbg !146 + %1801 = or disjoint i64 %1800, 4611686293372403712, !dbg !146 + %1802 = add i32 %1705, 96, !dbg !146 + %1803 = lshr exact i32 %1802, 4, !dbg !146 + %1804 = and i32 %1803, 16383, !dbg !146 + %1805 = zext nneg i32 %1804 to i64, !dbg !146 + %1806 = or disjoint i64 %1805, 4611686293338849280, !dbg !146 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 0, !dbg !146 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 1, !dbg !146 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 2, !dbg !146 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 3, !dbg !146 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 4, !dbg !146 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 5, !dbg !146 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 6, !dbg !146 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 7, !dbg !146 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 8, !dbg !146 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 9, !dbg !146 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 10, !dbg !146 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 11, !dbg !146 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 12, !dbg !146 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 13, !dbg !146 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 14, !dbg !146 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 15, !dbg !146 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 16, !dbg !146 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 17, !dbg !146 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 18, !dbg !146 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 19, !dbg !146 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 20, !dbg !146 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 21, !dbg !146 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 22, !dbg !146 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 23, !dbg !146 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 24, !dbg !146 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 25, !dbg !146 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 26, !dbg !146 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 27, !dbg !146 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 28, !dbg !146 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 29, !dbg !146 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 30, !dbg !146 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1796, 31, !dbg !146 + %1839 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1807, float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float %1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, float %1833, float %1834, float %1835, float %1836, float %1837, float %1838, i64 %1801, i64 %1806, i1 true) #3, !dbg !146 + %1840 = add i32 %778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1841 = lshr exact i32 %1840, 4, !dbg !146 + %1842 = and i32 %1841, 16383, !dbg !146 + %1843 = zext nneg i32 %1842 to i64, !dbg !146 + %1844 = or disjoint i64 %1843, 4611686293372403712, !dbg !146 + %1845 = add i32 %1705, 8192, !dbg !146 + %1846 = lshr exact i32 %1845, 4, !dbg !146 + %1847 = and i32 %1846, 16383, !dbg !146 + %1848 = zext nneg i32 %1847 to i64, !dbg !146 + %1849 = or disjoint i64 %1848, 4611686293338849280, !dbg !146 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 0, !dbg !146 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 1, !dbg !146 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 2, !dbg !146 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 3, !dbg !146 + %1854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 4, !dbg !146 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 5, !dbg !146 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 6, !dbg !146 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 7, !dbg !146 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 8, !dbg !146 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 9, !dbg !146 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 10, !dbg !146 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 11, !dbg !146 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 12, !dbg !146 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 13, !dbg !146 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 14, !dbg !146 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 15, !dbg !146 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 16, !dbg !146 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 17, !dbg !146 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 18, !dbg !146 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 19, !dbg !146 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 20, !dbg !146 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 21, !dbg !146 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 22, !dbg !146 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 23, !dbg !146 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 24, !dbg !146 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 25, !dbg !146 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 26, !dbg !146 + %1877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 27, !dbg !146 + %1878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 28, !dbg !146 + %1879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 29, !dbg !146 + %1880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 30, !dbg !146 + %1881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1839, 31, !dbg !146 + %1882 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1850, float %1851, float %1852, float %1853, float %1854, float %1855, float %1856, float %1857, float %1858, float %1859, float %1860, float %1861, float %1862, float %1863, float %1864, float %1865, float %1866, float %1867, float %1868, float %1869, float %1870, float %1871, float %1872, float %1873, float %1874, float %1875, float %1876, float %1877, float %1878, float %1879, float %1880, float %1881, i64 %1844, i64 %1849, i1 true) #3, !dbg !146 + %1883 = add i32 %822, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1884 = lshr exact i32 %1883, 4, !dbg !146 + %1885 = and i32 %1884, 16383, !dbg !146 + %1886 = zext nneg i32 %1885 to i64, !dbg !146 + %1887 = or disjoint i64 %1886, 4611686293372403712, !dbg !146 + %1888 = add i32 %1705, 8224, !dbg !146 + %1889 = lshr exact i32 %1888, 4, !dbg !146 + %1890 = and i32 %1889, 16383, !dbg !146 + %1891 = zext nneg i32 %1890 to i64, !dbg !146 + %1892 = or disjoint i64 %1891, 4611686293338849280, !dbg !146 + %1893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 0, !dbg !146 + %1894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 1, !dbg !146 + %1895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 2, !dbg !146 + %1896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 3, !dbg !146 + %1897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 4, !dbg !146 + %1898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 5, !dbg !146 + %1899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 6, !dbg !146 + %1900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 7, !dbg !146 + %1901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 8, !dbg !146 + %1902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 9, !dbg !146 + %1903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 10, !dbg !146 + %1904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 11, !dbg !146 + %1905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 12, !dbg !146 + %1906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 13, !dbg !146 + %1907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 14, !dbg !146 + %1908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 15, !dbg !146 + %1909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 16, !dbg !146 + %1910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 17, !dbg !146 + %1911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 18, !dbg !146 + %1912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 19, !dbg !146 + %1913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 20, !dbg !146 + %1914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 21, !dbg !146 + %1915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 22, !dbg !146 + %1916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 23, !dbg !146 + %1917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 24, !dbg !146 + %1918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 25, !dbg !146 + %1919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 26, !dbg !146 + %1920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 27, !dbg !146 + %1921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 28, !dbg !146 + %1922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 29, !dbg !146 + %1923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 30, !dbg !146 + %1924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1882, 31, !dbg !146 + %1925 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1893, float %1894, float %1895, float %1896, float %1897, float %1898, float %1899, float %1900, float %1901, float %1902, float %1903, float %1904, float %1905, float %1906, float %1907, float %1908, float %1909, float %1910, float %1911, float %1912, float %1913, float %1914, float %1915, float %1916, float %1917, float %1918, float %1919, float %1920, float %1921, float %1922, float %1923, float %1924, i64 %1887, i64 %1892, i1 true) #3, !dbg !146 + %1926 = add i32 %866, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1927 = lshr exact i32 %1926, 4, !dbg !146 + %1928 = and i32 %1927, 16383, !dbg !146 + %1929 = zext nneg i32 %1928 to i64, !dbg !146 + %1930 = or disjoint i64 %1929, 4611686293372403712, !dbg !146 + %1931 = add i32 %1705, 8256, !dbg !146 + %1932 = lshr exact i32 %1931, 4, !dbg !146 + %1933 = and i32 %1932, 16383, !dbg !146 + %1934 = zext nneg i32 %1933 to i64, !dbg !146 + %1935 = or disjoint i64 %1934, 4611686293338849280, !dbg !146 + %1936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 0, !dbg !146 + %1937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 1, !dbg !146 + %1938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 2, !dbg !146 + %1939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 3, !dbg !146 + %1940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 4, !dbg !146 + %1941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 5, !dbg !146 + %1942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 6, !dbg !146 + %1943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 7, !dbg !146 + %1944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 8, !dbg !146 + %1945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 9, !dbg !146 + %1946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 10, !dbg !146 + %1947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 11, !dbg !146 + %1948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 12, !dbg !146 + %1949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 13, !dbg !146 + %1950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 14, !dbg !146 + %1951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 15, !dbg !146 + %1952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 16, !dbg !146 + %1953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 17, !dbg !146 + %1954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 18, !dbg !146 + %1955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 19, !dbg !146 + %1956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 20, !dbg !146 + %1957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 21, !dbg !146 + %1958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 22, !dbg !146 + %1959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 23, !dbg !146 + %1960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 24, !dbg !146 + %1961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 25, !dbg !146 + %1962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 26, !dbg !146 + %1963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 27, !dbg !146 + %1964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 28, !dbg !146 + %1965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 29, !dbg !146 + %1966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 30, !dbg !146 + %1967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1925, 31, !dbg !146 + %1968 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1936, float %1937, float %1938, float %1939, float %1940, float %1941, float %1942, float %1943, float %1944, float %1945, float %1946, float %1947, float %1948, float %1949, float %1950, float %1951, float %1952, float %1953, float %1954, float %1955, float %1956, float %1957, float %1958, float %1959, float %1960, float %1961, float %1962, float %1963, float %1964, float %1965, float %1966, float %1967, i64 %1930, i64 %1935, i1 true) #3, !dbg !146 + %1969 = add i32 %910, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !146 + %1970 = lshr exact i32 %1969, 4, !dbg !146 + %1971 = and i32 %1970, 16383, !dbg !146 + %1972 = zext nneg i32 %1971 to i64, !dbg !146 + %1973 = or disjoint i64 %1972, 4611686293372403712, !dbg !146 + %1974 = add i32 %1705, 8288, !dbg !146 + %1975 = lshr exact i32 %1974, 4, !dbg !146 + %1976 = and i32 %1975, 16383, !dbg !146 + %1977 = zext nneg i32 %1976 to i64, !dbg !146 + %1978 = or disjoint i64 %1977, 4611686293338849280, !dbg !146 + %1979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 0, !dbg !146 + %1980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 1, !dbg !146 + %1981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 2, !dbg !146 + %1982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 3, !dbg !146 + %1983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 4, !dbg !146 + %1984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 5, !dbg !146 + %1985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 6, !dbg !146 + %1986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 7, !dbg !146 + %1987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 8, !dbg !146 + %1988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 9, !dbg !146 + %1989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 10, !dbg !146 + %1990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 11, !dbg !146 + %1991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 12, !dbg !146 + %1992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 13, !dbg !146 + %1993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 14, !dbg !146 + %1994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 15, !dbg !146 + %1995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 16, !dbg !146 + %1996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 17, !dbg !146 + %1997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 18, !dbg !146 + %1998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 19, !dbg !146 + %1999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 20, !dbg !146 + %2000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 21, !dbg !146 + %2001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 22, !dbg !146 + %2002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 23, !dbg !146 + %2003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 24, !dbg !146 + %2004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 25, !dbg !146 + %2005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 26, !dbg !146 + %2006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 27, !dbg !146 + %2007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 28, !dbg !146 + %2008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 29, !dbg !146 + %2009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 30, !dbg !146 + %2010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1968, 31, !dbg !146 + %2011 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1979, float %1980, float %1981, float %1982, float %1983, float %1984, float %1985, float %1986, float %1987, float %1988, float %1989, float %1990, float %1991, float %1992, float %1993, float %1994, float %1995, float %1996, float %1997, float %1998, float %1999, float %2000, float %2001, float %2002, float %2003, float %2004, float %2005, float %2006, float %2007, float %2008, float %2009, float %2010, i64 %1973, i64 %1978, i1 true) #3, !dbg !146 + %2012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 0, !dbg !146 + %2013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 1, !dbg !146 + %2014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 2, !dbg !146 + %2015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 3, !dbg !146 + %2016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 4, !dbg !146 + %2017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 5, !dbg !146 + %2018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 6, !dbg !146 + %2019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 7, !dbg !146 + %2020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 8, !dbg !146 + %2021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 9, !dbg !146 + %2022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 10, !dbg !146 + %2023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 11, !dbg !146 + %2024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 12, !dbg !146 + %2025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 13, !dbg !146 + %2026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 14, !dbg !146 + %2027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 15, !dbg !146 + %2028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 16, !dbg !146 + %2029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 17, !dbg !146 + %2030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 18, !dbg !146 + %2031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 19, !dbg !146 + %2032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 20, !dbg !146 + %2033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 21, !dbg !146 + %2034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 22, !dbg !146 + %2035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 23, !dbg !146 + %2036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 24, !dbg !146 + %2037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 25, !dbg !146 + %2038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 26, !dbg !146 + %2039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 27, !dbg !146 + %2040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 28, !dbg !146 + %2041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 29, !dbg !146 + %2042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 30, !dbg !146 + %2043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2011, 31, !dbg !146 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !146 + %2044 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %2012, float %2013, float %2014, float %2015, float %2016, float %2017, float %2018, float %2019, float %2020, float %2021, float %2022, float %2023, float %2024, float %2025, float %2026, float %2027, float %2028, float %2029, float %2030, float %2031, float %2032, float %2033, float %2034, float %2035, float %2036, float %2037, float %2038, float %2039, float %2040, float %2041, float %2042, float %2043, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %1699, i32 0, i32 0) #3, !dbg !146 + %2045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 0, !dbg !146 + %2046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 1, !dbg !146 + %2047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 2, !dbg !146 + %2048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 3, !dbg !146 + %2049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 4, !dbg !146 + %2050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 5, !dbg !146 + %2051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 6, !dbg !146 + %2052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 7, !dbg !146 + %2053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 8, !dbg !146 + %2054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 9, !dbg !146 + %2055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 10, !dbg !146 + %2056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 11, !dbg !146 + %2057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 12, !dbg !146 + %2058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 13, !dbg !146 + %2059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 14, !dbg !146 + %2060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 15, !dbg !146 + %2061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 16, !dbg !146 + %2062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 17, !dbg !146 + %2063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 18, !dbg !146 + %2064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 19, !dbg !146 + %2065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 20, !dbg !146 + %2066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 21, !dbg !146 + %2067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 22, !dbg !146 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 23, !dbg !146 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 24, !dbg !146 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 25, !dbg !146 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 26, !dbg !146 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 27, !dbg !146 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 28, !dbg !146 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 29, !dbg !146 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 30, !dbg !146 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2044, 31, !dbg !146 + %2077 = insertelement <2 x float> poison, float %2045, i64 0, !dbg !125 + %2078 = insertelement <2 x float> %2077, float %2046, i64 1, !dbg !125 + %2079 = fsub <2 x float> %2078, %539, !dbg !125 + %2080 = insertelement <2 x float> poison, float %.0.i1433, i64 0, !dbg !147 + %2081 = insertelement <2 x float> %2080, float %.0.i1436, i64 1, !dbg !147 + %2082 = fmul <2 x float> %2081, %2079, !dbg !147 + %2083 = fptrunc <2 x float> %2082 to <2 x bfloat>, !dbg !148 + %2084 = select <2 x i1> %1094, <2 x bfloat> %2083, <2 x bfloat> zeroinitializer, !dbg !149 + %2085 = insertelement <2 x float> poison, float %2047, i64 0, !dbg !125 + %2086 = insertelement <2 x float> %2085, float %2048, i64 1, !dbg !125 + %2087 = fsub <2 x float> %2086, %533, !dbg !125 + %2088 = insertelement <2 x float> poison, float %.0.i1439, i64 0, !dbg !147 + %2089 = insertelement <2 x float> %2088, float %.0.i1442, i64 1, !dbg !147 + %2090 = fmul <2 x float> %2089, %2087, !dbg !147 + %2091 = fptrunc <2 x float> %2090 to <2 x bfloat>, !dbg !148 + %2092 = select <2 x i1> %1095, <2 x bfloat> %2091, <2 x bfloat> zeroinitializer, !dbg !149 + %2093 = insertelement <2 x float> poison, float %2049, i64 0, !dbg !125 + %2094 = insertelement <2 x float> %2093, float %2050, i64 1, !dbg !125 + %2095 = fsub <2 x float> %2094, %539, !dbg !125 + %2096 = insertelement <2 x float> poison, float %.0.i1445, i64 0, !dbg !147 + %2097 = insertelement <2 x float> %2096, float %.0.i1448, i64 1, !dbg !147 + %2098 = fmul <2 x float> %2097, %2095, !dbg !147 + %2099 = fptrunc <2 x float> %2098 to <2 x bfloat>, !dbg !148 + %2100 = select <2 x i1> %1139, <2 x bfloat> %2099, <2 x bfloat> zeroinitializer, !dbg !149 + %2101 = insertelement <2 x float> poison, float %2051, i64 0, !dbg !125 + %2102 = insertelement <2 x float> %2101, float %2052, i64 1, !dbg !125 + %2103 = fsub <2 x float> %2102, %533, !dbg !125 + %2104 = insertelement <2 x float> poison, float %.0.i1451, i64 0, !dbg !147 + %2105 = insertelement <2 x float> %2104, float %.0.i1454, i64 1, !dbg !147 + %2106 = fmul <2 x float> %2105, %2103, !dbg !147 + %2107 = fptrunc <2 x float> %2106 to <2 x bfloat>, !dbg !148 + %2108 = select <2 x i1> %1140, <2 x bfloat> %2107, <2 x bfloat> zeroinitializer, !dbg !149 + %2109 = insertelement <2 x float> poison, float %2053, i64 0, !dbg !125 + %2110 = insertelement <2 x float> %2109, float %2054, i64 1, !dbg !125 + %2111 = fsub <2 x float> %2110, %539, !dbg !125 + %2112 = insertelement <2 x float> poison, float %.0.i1457, i64 0, !dbg !147 + %2113 = insertelement <2 x float> %2112, float %.0.i1460, i64 1, !dbg !147 + %2114 = fmul <2 x float> %2113, %2111, !dbg !147 + %2115 = fptrunc <2 x float> %2114 to <2 x bfloat>, !dbg !148 + %2116 = select <2 x i1> %1184, <2 x bfloat> %2115, <2 x bfloat> zeroinitializer, !dbg !149 + %2117 = insertelement <2 x float> poison, float %2055, i64 0, !dbg !125 + %2118 = insertelement <2 x float> %2117, float %2056, i64 1, !dbg !125 + %2119 = fsub <2 x float> %2118, %533, !dbg !125 + %2120 = insertelement <2 x float> poison, float %.0.i1463, i64 0, !dbg !147 + %2121 = insertelement <2 x float> %2120, float %.0.i1466, i64 1, !dbg !147 + %2122 = fmul <2 x float> %2121, %2119, !dbg !147 + %2123 = fptrunc <2 x float> %2122 to <2 x bfloat>, !dbg !148 + %2124 = select <2 x i1> %1185, <2 x bfloat> %2123, <2 x bfloat> zeroinitializer, !dbg !149 + %2125 = insertelement <2 x float> poison, float %2057, i64 0, !dbg !125 + %2126 = insertelement <2 x float> %2125, float %2058, i64 1, !dbg !125 + %2127 = fsub <2 x float> %2126, %539, !dbg !125 + %2128 = insertelement <2 x float> poison, float %.0.i1469, i64 0, !dbg !147 + %2129 = insertelement <2 x float> %2128, float %.0.i1472, i64 1, !dbg !147 + %2130 = fmul <2 x float> %2129, %2127, !dbg !147 + %2131 = fptrunc <2 x float> %2130 to <2 x bfloat>, !dbg !148 + %2132 = select <2 x i1> %1229, <2 x bfloat> %2131, <2 x bfloat> zeroinitializer, !dbg !149 + %2133 = insertelement <2 x float> poison, float %2059, i64 0, !dbg !125 + %2134 = insertelement <2 x float> %2133, float %2060, i64 1, !dbg !125 + %2135 = fsub <2 x float> %2134, %533, !dbg !125 + %2136 = insertelement <2 x float> poison, float %.0.i1475, i64 0, !dbg !147 + %2137 = insertelement <2 x float> %2136, float %.0.i1478, i64 1, !dbg !147 + %2138 = fmul <2 x float> %2137, %2135, !dbg !147 + %2139 = fptrunc <2 x float> %2138 to <2 x bfloat>, !dbg !148 + %2140 = select <2 x i1> %1230, <2 x bfloat> %2139, <2 x bfloat> zeroinitializer, !dbg !149 + %2141 = insertelement <2 x float> poison, float %2061, i64 0, !dbg !125 + %2142 = insertelement <2 x float> %2141, float %2062, i64 1, !dbg !125 + %2143 = fsub <2 x float> %2142, %539, !dbg !125 + %2144 = insertelement <2 x float> poison, float %.0.i1481, i64 0, !dbg !147 + %2145 = insertelement <2 x float> %2144, float %.0.i1484, i64 1, !dbg !147 + %2146 = fmul <2 x float> %2145, %2143, !dbg !147 + %2147 = fptrunc <2 x float> %2146 to <2 x bfloat>, !dbg !148 + %2148 = select <2 x i1> %1274, <2 x bfloat> %2147, <2 x bfloat> zeroinitializer, !dbg !149 + %2149 = insertelement <2 x float> poison, float %2063, i64 0, !dbg !125 + %2150 = insertelement <2 x float> %2149, float %2064, i64 1, !dbg !125 + %2151 = fsub <2 x float> %2150, %533, !dbg !125 + %2152 = insertelement <2 x float> poison, float %.0.i1487, i64 0, !dbg !147 + %2153 = insertelement <2 x float> %2152, float %.0.i1490, i64 1, !dbg !147 + %2154 = fmul <2 x float> %2153, %2151, !dbg !147 + %2155 = fptrunc <2 x float> %2154 to <2 x bfloat>, !dbg !148 + %2156 = select <2 x i1> %1275, <2 x bfloat> %2155, <2 x bfloat> zeroinitializer, !dbg !149 + %2157 = insertelement <2 x float> poison, float %2065, i64 0, !dbg !125 + %2158 = insertelement <2 x float> %2157, float %2066, i64 1, !dbg !125 + %2159 = fsub <2 x float> %2158, %539, !dbg !125 + %2160 = insertelement <2 x float> poison, float %.0.i1493, i64 0, !dbg !147 + %2161 = insertelement <2 x float> %2160, float %.0.i1496, i64 1, !dbg !147 + %2162 = fmul <2 x float> %2161, %2159, !dbg !147 + %2163 = fptrunc <2 x float> %2162 to <2 x bfloat>, !dbg !148 + %2164 = select <2 x i1> %1319, <2 x bfloat> %2163, <2 x bfloat> zeroinitializer, !dbg !149 + %2165 = insertelement <2 x float> poison, float %2067, i64 0, !dbg !125 + %2166 = insertelement <2 x float> %2165, float %2068, i64 1, !dbg !125 + %2167 = fsub <2 x float> %2166, %533, !dbg !125 + %2168 = insertelement <2 x float> poison, float %.0.i1499, i64 0, !dbg !147 + %2169 = insertelement <2 x float> %2168, float %.0.i1502, i64 1, !dbg !147 + %2170 = fmul <2 x float> %2169, %2167, !dbg !147 + %2171 = fptrunc <2 x float> %2170 to <2 x bfloat>, !dbg !148 + %2172 = select <2 x i1> %1320, <2 x bfloat> %2171, <2 x bfloat> zeroinitializer, !dbg !149 + %2173 = insertelement <2 x float> poison, float %2069, i64 0, !dbg !125 + %2174 = insertelement <2 x float> %2173, float %2070, i64 1, !dbg !125 + %2175 = fsub <2 x float> %2174, %539, !dbg !125 + %2176 = insertelement <2 x float> poison, float %.0.i1505, i64 0, !dbg !147 + %2177 = insertelement <2 x float> %2176, float %.0.i1508, i64 1, !dbg !147 + %2178 = fmul <2 x float> %2177, %2175, !dbg !147 + %2179 = fptrunc <2 x float> %2178 to <2 x bfloat>, !dbg !148 + %2180 = select <2 x i1> %1364, <2 x bfloat> %2179, <2 x bfloat> zeroinitializer, !dbg !149 + %2181 = insertelement <2 x float> poison, float %2071, i64 0, !dbg !125 + %2182 = insertelement <2 x float> %2181, float %2072, i64 1, !dbg !125 + %2183 = fsub <2 x float> %2182, %533, !dbg !125 + %2184 = insertelement <2 x float> poison, float %.0.i1511, i64 0, !dbg !147 + %2185 = insertelement <2 x float> %2184, float %.0.i1514, i64 1, !dbg !147 + %2186 = fmul <2 x float> %2185, %2183, !dbg !147 + %2187 = fptrunc <2 x float> %2186 to <2 x bfloat>, !dbg !148 + %2188 = select <2 x i1> %1365, <2 x bfloat> %2187, <2 x bfloat> zeroinitializer, !dbg !149 + %2189 = insertelement <2 x float> poison, float %2073, i64 0, !dbg !125 + %2190 = insertelement <2 x float> %2189, float %2074, i64 1, !dbg !125 + %2191 = fsub <2 x float> %2190, %539, !dbg !125 + %2192 = insertelement <2 x float> poison, float %.0.i1517, i64 0, !dbg !147 + %2193 = insertelement <2 x float> %2192, float %.0.i1520, i64 1, !dbg !147 + %2194 = fmul <2 x float> %2193, %2191, !dbg !147 + %2195 = fptrunc <2 x float> %2194 to <2 x bfloat>, !dbg !148 + %2196 = select <2 x i1> %1409, <2 x bfloat> %2195, <2 x bfloat> zeroinitializer, !dbg !149 + %2197 = insertelement <2 x float> poison, float %2075, i64 0, !dbg !125 + %2198 = insertelement <2 x float> %2197, float %2076, i64 1, !dbg !125 + %2199 = fsub <2 x float> %2198, %533, !dbg !125 + %2200 = insertelement <2 x float> poison, float %.0.i1523, i64 0, !dbg !147 + %2201 = insertelement <2 x float> %2200, float %.0.i1526, i64 1, !dbg !147 + %2202 = fmul <2 x float> %2201, %2199, !dbg !147 + %2203 = fptrunc <2 x float> %2202 to <2 x bfloat>, !dbg !148 + %2204 = select <2 x i1> %1410, <2 x bfloat> %2203, <2 x bfloat> zeroinitializer, !dbg !149 + %2205 = bitcast <2 x bfloat> %2084 to i32, !dbg !150 + %2206 = bitcast <2 x bfloat> %2092 to i32, !dbg !150 + %2207 = bitcast <2 x bfloat> %2100 to i32, !dbg !150 + %2208 = bitcast <2 x bfloat> %2108 to i32, !dbg !150 + %2209 = bitcast <2 x bfloat> %2116 to i32, !dbg !150 + %2210 = bitcast <2 x bfloat> %2124 to i32, !dbg !150 + %2211 = bitcast <2 x bfloat> %2132 to i32, !dbg !150 + %2212 = bitcast <2 x bfloat> %2140 to i32, !dbg !150 + %2213 = bitcast <2 x bfloat> %2148 to i32, !dbg !150 + %2214 = bitcast <2 x bfloat> %2156 to i32, !dbg !150 + %2215 = bitcast <2 x bfloat> %2164 to i32, !dbg !150 + %2216 = bitcast <2 x bfloat> %2172 to i32, !dbg !150 + %2217 = bitcast <2 x bfloat> %2180 to i32, !dbg !150 + %2218 = bitcast <2 x bfloat> %2188 to i32, !dbg !150 + %2219 = bitcast <2 x bfloat> %2196 to i32, !dbg !150 + %2220 = bitcast <2 x bfloat> %2204 to i32, !dbg !150 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !150 + %2221 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, float %573, float %574, float %575, float %576, float %577, float %578, float %579, float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, i32 %2205, i32 %2206, i32 %2207, i32 %2208, i64 %644, i1 true) #3, !dbg !150 + %2222 = add i32 %640, 2048, !dbg !150 + %2223 = lshr exact i32 %2222, 4, !dbg !150 + %2224 = and i32 %2223, 16383, !dbg !150 + %2225 = zext nneg i32 %2224 to i64, !dbg !150 + %2226 = or disjoint i64 %2225, 4611686293338849280, !dbg !150 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 0, !dbg !150 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 1, !dbg !150 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 2, !dbg !150 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 3, !dbg !150 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 4, !dbg !150 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 5, !dbg !150 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 6, !dbg !150 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 7, !dbg !150 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 8, !dbg !150 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 9, !dbg !150 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 10, !dbg !150 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 11, !dbg !150 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 12, !dbg !150 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 13, !dbg !150 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 14, !dbg !150 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 15, !dbg !150 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 16, !dbg !150 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 17, !dbg !150 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 18, !dbg !150 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 19, !dbg !150 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 20, !dbg !150 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 21, !dbg !150 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 22, !dbg !150 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 23, !dbg !150 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 24, !dbg !150 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 25, !dbg !150 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 26, !dbg !150 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 27, !dbg !150 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 28, !dbg !150 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 29, !dbg !150 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 30, !dbg !150 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 31, !dbg !150 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 32, !dbg !150 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 33, !dbg !150 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 34, !dbg !150 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 35, !dbg !150 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 36, !dbg !150 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 37, !dbg !150 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 38, !dbg !150 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 39, !dbg !150 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 40, !dbg !150 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 41, !dbg !150 + %2269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 42, !dbg !150 + %2270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 43, !dbg !150 + %2271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 44, !dbg !150 + %2272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 45, !dbg !150 + %2273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 46, !dbg !150 + %2274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 47, !dbg !150 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 48, !dbg !150 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 49, !dbg !150 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 50, !dbg !150 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 51, !dbg !150 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 52, !dbg !150 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 53, !dbg !150 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 54, !dbg !150 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 55, !dbg !150 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 56, !dbg !150 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 57, !dbg !150 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 58, !dbg !150 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 59, !dbg !150 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 60, !dbg !150 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 61, !dbg !150 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 62, !dbg !150 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2221, 63, !dbg !150 + %2291 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2227, float %2228, float %2229, float %2230, float %2231, float %2232, float %2233, float %2234, float %2235, float %2236, float %2237, float %2238, float %2239, float %2240, float %2241, float %2242, float %2243, float %2244, float %2245, float %2246, float %2247, float %2248, float %2249, float %2250, float %2251, float %2252, float %2253, float %2254, float %2255, float %2256, float %2257, float %2258, float %2259, float %2260, float %2261, float %2262, float %2263, float %2264, float %2265, float %2266, float %2267, float %2268, float %2269, float %2270, float %2271, float %2272, float %2273, float %2274, float %2275, float %2276, float %2277, float %2278, float %2279, float %2280, float %2281, float %2282, float %2283, float %2284, float %2285, float %2286, float %2287, float %2288, float %2289, float %2290, i32 %2209, i32 %2210, i32 %2211, i32 %2212, i64 %2226, i1 true) #3, !dbg !150 + %2292 = add i32 %640, 4096, !dbg !150 + %2293 = lshr exact i32 %2292, 4, !dbg !150 + %2294 = and i32 %2293, 16383, !dbg !150 + %2295 = zext nneg i32 %2294 to i64, !dbg !150 + %2296 = or disjoint i64 %2295, 4611686293338849280, !dbg !150 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 0, !dbg !150 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 1, !dbg !150 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 2, !dbg !150 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 3, !dbg !150 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 4, !dbg !150 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 5, !dbg !150 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 6, !dbg !150 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 7, !dbg !150 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 8, !dbg !150 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 9, !dbg !150 + %2307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 10, !dbg !150 + %2308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 11, !dbg !150 + %2309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 12, !dbg !150 + %2310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 13, !dbg !150 + %2311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 14, !dbg !150 + %2312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 15, !dbg !150 + %2313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 16, !dbg !150 + %2314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 17, !dbg !150 + %2315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 18, !dbg !150 + %2316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 19, !dbg !150 + %2317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 20, !dbg !150 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 21, !dbg !150 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 22, !dbg !150 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 23, !dbg !150 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 24, !dbg !150 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 25, !dbg !150 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 26, !dbg !150 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 27, !dbg !150 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 28, !dbg !150 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 29, !dbg !150 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 30, !dbg !150 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 31, !dbg !150 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 32, !dbg !150 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 33, !dbg !150 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 34, !dbg !150 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 35, !dbg !150 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 36, !dbg !150 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 37, !dbg !150 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 38, !dbg !150 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 39, !dbg !150 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 40, !dbg !150 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 41, !dbg !150 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 42, !dbg !150 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 43, !dbg !150 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 44, !dbg !150 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 45, !dbg !150 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 46, !dbg !150 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 47, !dbg !150 + %2345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 48, !dbg !150 + %2346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 49, !dbg !150 + %2347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 50, !dbg !150 + %2348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 51, !dbg !150 + %2349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 52, !dbg !150 + %2350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 53, !dbg !150 + %2351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 54, !dbg !150 + %2352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 55, !dbg !150 + %2353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 56, !dbg !150 + %2354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 57, !dbg !150 + %2355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 58, !dbg !150 + %2356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 59, !dbg !150 + %2357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 60, !dbg !150 + %2358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 61, !dbg !150 + %2359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 62, !dbg !150 + %2360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2291, 63, !dbg !150 + %2361 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2297, float %2298, float %2299, float %2300, float %2301, float %2302, float %2303, float %2304, float %2305, float %2306, float %2307, float %2308, float %2309, float %2310, float %2311, float %2312, float %2313, float %2314, float %2315, float %2316, float %2317, float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, float %2350, float %2351, float %2352, float %2353, float %2354, float %2355, float %2356, float %2357, float %2358, float %2359, float %2360, i32 %2213, i32 %2214, i32 %2215, i32 %2216, i64 %2296, i1 true) #3, !dbg !150 + %2362 = add i32 %640, 6144, !dbg !150 + %2363 = lshr exact i32 %2362, 4, !dbg !150 + %2364 = and i32 %2363, 16383, !dbg !150 + %2365 = zext nneg i32 %2364 to i64, !dbg !150 + %2366 = or disjoint i64 %2365, 4611686293338849280, !dbg !150 + %2367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 0, !dbg !150 + %2368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 1, !dbg !150 + %2369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 2, !dbg !150 + %2370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 3, !dbg !150 + %2371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 4, !dbg !150 + %2372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 5, !dbg !150 + %2373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 6, !dbg !150 + %2374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 7, !dbg !150 + %2375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 8, !dbg !150 + %2376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 9, !dbg !150 + %2377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 10, !dbg !150 + %2378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 11, !dbg !150 + %2379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 12, !dbg !150 + %2380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 13, !dbg !150 + %2381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 14, !dbg !150 + %2382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 15, !dbg !150 + %2383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 16, !dbg !150 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 17, !dbg !150 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 18, !dbg !150 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 19, !dbg !150 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 20, !dbg !150 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 21, !dbg !150 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 22, !dbg !150 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 23, !dbg !150 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 24, !dbg !150 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 25, !dbg !150 + %2393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 26, !dbg !150 + %2394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 27, !dbg !150 + %2395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 28, !dbg !150 + %2396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 29, !dbg !150 + %2397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 30, !dbg !150 + %2398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 31, !dbg !150 + %2399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 32, !dbg !150 + %2400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 33, !dbg !150 + %2401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 34, !dbg !150 + %2402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 35, !dbg !150 + %2403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 36, !dbg !150 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 37, !dbg !150 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 38, !dbg !150 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 39, !dbg !150 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 40, !dbg !150 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 41, !dbg !150 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 42, !dbg !150 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 43, !dbg !150 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 44, !dbg !150 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 45, !dbg !150 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 46, !dbg !150 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 47, !dbg !150 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 48, !dbg !150 + %2416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 49, !dbg !150 + %2417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 50, !dbg !150 + %2418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 51, !dbg !150 + %2419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 52, !dbg !150 + %2420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 53, !dbg !150 + %2421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 54, !dbg !150 + %2422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 55, !dbg !150 + %2423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 56, !dbg !150 + %2424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 57, !dbg !150 + %2425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 58, !dbg !150 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 59, !dbg !150 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 60, !dbg !150 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 61, !dbg !150 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 62, !dbg !150 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2361, 63, !dbg !150 + %2431 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378, float %2379, float %2380, float %2381, float %2382, float %2383, float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, float %2393, float %2394, float %2395, float %2396, float %2397, float %2398, float %2399, float %2400, float %2401, float %2402, float %2403, float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418, float %2419, float %2420, float %2421, float %2422, float %2423, float %2424, float %2425, float %2426, float %2427, float %2428, float %2429, float %2430, i32 %2217, i32 %2218, i32 %2219, i32 %2220, i64 %2366, i1 true) #3, !dbg !150 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 0, !dbg !150 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 1, !dbg !150 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 2, !dbg !150 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 3, !dbg !150 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 4, !dbg !150 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 5, !dbg !150 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 6, !dbg !150 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 7, !dbg !150 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 8, !dbg !150 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 9, !dbg !150 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 10, !dbg !150 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 11, !dbg !150 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 12, !dbg !150 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 13, !dbg !150 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 14, !dbg !150 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 15, !dbg !150 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 16, !dbg !150 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 17, !dbg !150 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 18, !dbg !150 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 19, !dbg !150 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 20, !dbg !150 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 21, !dbg !150 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 22, !dbg !150 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 23, !dbg !150 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 24, !dbg !150 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 25, !dbg !150 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 26, !dbg !150 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 27, !dbg !150 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 28, !dbg !150 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 29, !dbg !150 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 30, !dbg !150 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 31, !dbg !150 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 32, !dbg !150 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 33, !dbg !150 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 34, !dbg !150 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 35, !dbg !150 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 36, !dbg !150 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 37, !dbg !150 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 38, !dbg !150 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 39, !dbg !150 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 40, !dbg !150 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 41, !dbg !150 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 42, !dbg !150 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 43, !dbg !150 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 44, !dbg !150 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 45, !dbg !150 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 46, !dbg !150 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 47, !dbg !150 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 48, !dbg !150 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 49, !dbg !150 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 50, !dbg !150 + %2483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 51, !dbg !150 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 52, !dbg !150 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 53, !dbg !150 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 54, !dbg !150 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 55, !dbg !150 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 56, !dbg !150 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 57, !dbg !150 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 58, !dbg !150 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 59, !dbg !150 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 60, !dbg !150 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 61, !dbg !150 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 62, !dbg !150 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2431, 63, !dbg !150 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !150 + %2496 = insertelement <2 x i32> poison, i32 %541, i64 0, !dbg !116 + %2497 = shufflevector <2 x i32> %2496, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !116 + %2498 = add <2 x i32> %2497, %616, !dbg !116 + %2499 = add <2 x i32> %2497, %615, !dbg !116 + %2500 = add <2 x i32> %2497, %614, !dbg !116 + %2501 = add <2 x i32> %2497, %613, !dbg !116 + %2502 = add <2 x i32> %2497, %612, !dbg !116 + %2503 = add <2 x i32> %2497, %611, !dbg !116 + %2504 = add <2 x i32> %2497, %610, !dbg !116 + %2505 = add <2 x i32> %2497, %609, !dbg !116 + %2506 = add nuw nsw i32 %608, 1, !dbg !110 + %2507 = lshr i32 %2506, 1, !dbg !151 + %2508 = zext nneg i32 %2507 to i64, !dbg !152 + %2509 = getelementptr i32, ptr addrspace(1) %384, i64 %2508, !dbg !152 + %2510 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !153 + %2511 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2509, i64 %2510, i1 %618) #3, !dbg !153 + %2512 = add nuw nsw i32 %2507, 1, !dbg !154 + %2513 = icmp slt i32 %2512, %389, !dbg !155 + %2514 = getelementptr i8, ptr addrspace(1) %2509, i64 4, !dbg !156 + %2515 = and i1 %618, %2513, !dbg !110 + %2516 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !157 + %2517 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2514, i64 %2516, i1 %2515) #3, !dbg !157 + %2518 = and i32 %608, 1, !dbg !158 + %2519 = sub i32 %2517, %2511, !dbg !159 + %2520 = shl i32 %2519, 7, !dbg !160 + %2521 = add i32 %2520, -64, !dbg !161 + %2522 = xor i32 %2518, 1, !dbg !162 + %2523 = mul nuw nsw i32 %2521, %2522, !dbg !162 + %2524 = shl nuw nsw i32 %2518, 6, !dbg !163 + %2525 = add i32 %2523, %2524, !dbg !164 + %2526 = shl i32 %2525, 7, !dbg !165 + %2527 = sext i32 %2526 to i64, !dbg !114 + %2528 = getelementptr bfloat, ptr addrspace(1) %.pn8741535, i64 %2527, !dbg !114 + %2529 = getelementptr bfloat, ptr addrspace(1) %.pn8581536, i64 %2527, !dbg !114 + %2530 = getelementptr bfloat, ptr addrspace(1) %.pn8421537, i64 %2527, !dbg !114 + %2531 = getelementptr bfloat, ptr addrspace(1) %.pn8261538, i64 %2527, !dbg !114 + %2532 = getelementptr bfloat, ptr addrspace(1) %.pn9461543, i64 %2527, !dbg !115 + %2533 = getelementptr bfloat, ptr addrspace(1) %.pn9301544, i64 %2527, !dbg !115 + %2534 = getelementptr bfloat, ptr addrspace(1) %.pn9141545, i64 %2527, !dbg !115 + %2535 = getelementptr bfloat, ptr addrspace(1) %.pn8981546, i64 %2527, !dbg !115 + %2536 = add i32 %2525, %.pn8821539, !dbg !116 + %2537 = add i32 %2525, %.pn8801540, !dbg !116 + %2538 = add i32 %2525, %.pn8781541, !dbg !116 + %2539 = add i32 %2525, %.pn8761542, !dbg !116 + %2540 = add i32 %543, 1, !dbg !110 + %2541 = icmp sgt i32 %2540, 2, !dbg !110 + %2542 = select i1 %2541, i32 0, i32 %2540, !dbg !110 + %2543 = icmp slt i32 %2536, %19, !dbg !112 + %2544 = icmp slt i32 %2537, %19, !dbg !112 + %2545 = icmp slt i32 %2538, %19, !dbg !112 + %2546 = icmp slt i32 %2539, %19, !dbg !112 + %2547 = shl i32 %2542, 13, !dbg !113 + %2548 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2547, !dbg !113 + %2549 = and i1 %617, %2543, !dbg !110 + %2550 = and i1 %617, %2544, !dbg !110 + %2551 = and i1 %617, %2545, !dbg !110 + %2552 = and i1 %617, %2546, !dbg !110 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !113 + %2553 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %448, !dbg !113 + %2554 = select i1 %2549, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2553, ptr addrspace(1) %2528, i32 %2554) #3, !dbg !113 + %2555 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %451, !dbg !113 + %2556 = select i1 %2550, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2555, ptr addrspace(1) %2529, i32 %2556) #3, !dbg !113 + %2557 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %454, !dbg !113 + %2558 = select i1 %2551, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2557, ptr addrspace(1) %2530, i32 %2558) #3, !dbg !113 + %2559 = getelementptr inbounds nuw i8, ptr addrspace(3) %2548, i32 %457, !dbg !113 + %2560 = select i1 %2552, i32 16, i32 0, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2559, ptr addrspace(1) %2531, i32 %2560) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %2561 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2547, !dbg !113 + %2562 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %448, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2562, ptr addrspace(1) %2532, i32 %2554) #3, !dbg !113 + %2563 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %451, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2563, ptr addrspace(1) %2533, i32 %2556) #3, !dbg !113 + %2564 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %454, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2564, ptr addrspace(1) %2534, i32 %2558) #3, !dbg !113 + %2565 = getelementptr inbounds nuw i8, ptr addrspace(3) %2561, i32 %457, !dbg !113 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2565, ptr addrspace(1) %2535, i32 %2560) #3, !dbg !113 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !113 + %exitcond.not = icmp eq i32 %2506, %smax, !dbg !110 + br i1 %exitcond.not, label %._crit_edge, label %540, !dbg !110 + +._crit_edge: ; preds = %__nv_exp2f.exit1527, %73 + %2566 = phi float [ 0.000000e+00, %73 ], [ %2432, %__nv_exp2f.exit1527 ] + %2567 = phi float [ 0.000000e+00, %73 ], [ %2433, %__nv_exp2f.exit1527 ] + %2568 = phi float [ 0.000000e+00, %73 ], [ %2434, %__nv_exp2f.exit1527 ] + %2569 = phi float [ 0.000000e+00, %73 ], [ %2435, %__nv_exp2f.exit1527 ] + %2570 = phi float [ 0.000000e+00, %73 ], [ %2436, %__nv_exp2f.exit1527 ] + %2571 = phi float [ 0.000000e+00, %73 ], [ %2437, %__nv_exp2f.exit1527 ] + %2572 = phi float [ 0.000000e+00, %73 ], [ %2438, %__nv_exp2f.exit1527 ] + %2573 = phi float [ 0.000000e+00, %73 ], [ %2439, %__nv_exp2f.exit1527 ] + %2574 = phi float [ 0.000000e+00, %73 ], [ %2440, %__nv_exp2f.exit1527 ] + %2575 = phi float [ 0.000000e+00, %73 ], [ %2441, %__nv_exp2f.exit1527 ] + %2576 = phi float [ 0.000000e+00, %73 ], [ %2442, %__nv_exp2f.exit1527 ] + %2577 = phi float [ 0.000000e+00, %73 ], [ %2443, %__nv_exp2f.exit1527 ] + %2578 = phi float [ 0.000000e+00, %73 ], [ %2444, %__nv_exp2f.exit1527 ] + %2579 = phi float [ 0.000000e+00, %73 ], [ %2445, %__nv_exp2f.exit1527 ] + %2580 = phi float [ 0.000000e+00, %73 ], [ %2446, %__nv_exp2f.exit1527 ] + %2581 = phi float [ 0.000000e+00, %73 ], [ %2447, %__nv_exp2f.exit1527 ] + %2582 = phi float [ 0.000000e+00, %73 ], [ %2448, %__nv_exp2f.exit1527 ] + %2583 = phi float [ 0.000000e+00, %73 ], [ %2449, %__nv_exp2f.exit1527 ] + %2584 = phi float [ 0.000000e+00, %73 ], [ %2450, %__nv_exp2f.exit1527 ] + %2585 = phi float [ 0.000000e+00, %73 ], [ %2451, %__nv_exp2f.exit1527 ] + %2586 = phi float [ 0.000000e+00, %73 ], [ %2452, %__nv_exp2f.exit1527 ] + %2587 = phi float [ 0.000000e+00, %73 ], [ %2453, %__nv_exp2f.exit1527 ] + %2588 = phi float [ 0.000000e+00, %73 ], [ %2454, %__nv_exp2f.exit1527 ] + %2589 = phi float [ 0.000000e+00, %73 ], [ %2455, %__nv_exp2f.exit1527 ] + %2590 = phi float [ 0.000000e+00, %73 ], [ %2456, %__nv_exp2f.exit1527 ] + %2591 = phi float [ 0.000000e+00, %73 ], [ %2457, %__nv_exp2f.exit1527 ] + %2592 = phi float [ 0.000000e+00, %73 ], [ %2458, %__nv_exp2f.exit1527 ] + %2593 = phi float [ 0.000000e+00, %73 ], [ %2459, %__nv_exp2f.exit1527 ] + %2594 = phi float [ 0.000000e+00, %73 ], [ %2460, %__nv_exp2f.exit1527 ] + %2595 = phi float [ 0.000000e+00, %73 ], [ %2461, %__nv_exp2f.exit1527 ] + %2596 = phi float [ 0.000000e+00, %73 ], [ %2462, %__nv_exp2f.exit1527 ] + %2597 = phi float [ 0.000000e+00, %73 ], [ %2463, %__nv_exp2f.exit1527 ] + %2598 = phi float [ 0.000000e+00, %73 ], [ %2464, %__nv_exp2f.exit1527 ] + %2599 = phi float [ 0.000000e+00, %73 ], [ %2465, %__nv_exp2f.exit1527 ] + %2600 = phi float [ 0.000000e+00, %73 ], [ %2466, %__nv_exp2f.exit1527 ] + %2601 = phi float [ 0.000000e+00, %73 ], [ %2467, %__nv_exp2f.exit1527 ] + %2602 = phi float [ 0.000000e+00, %73 ], [ %2468, %__nv_exp2f.exit1527 ] + %2603 = phi float [ 0.000000e+00, %73 ], [ %2469, %__nv_exp2f.exit1527 ] + %2604 = phi float [ 0.000000e+00, %73 ], [ %2470, %__nv_exp2f.exit1527 ] + %2605 = phi float [ 0.000000e+00, %73 ], [ %2471, %__nv_exp2f.exit1527 ] + %2606 = phi float [ 0.000000e+00, %73 ], [ %2472, %__nv_exp2f.exit1527 ] + %2607 = phi float [ 0.000000e+00, %73 ], [ %2473, %__nv_exp2f.exit1527 ] + %2608 = phi float [ 0.000000e+00, %73 ], [ %2474, %__nv_exp2f.exit1527 ] + %2609 = phi float [ 0.000000e+00, %73 ], [ %2475, %__nv_exp2f.exit1527 ] + %2610 = phi float [ 0.000000e+00, %73 ], [ %2476, %__nv_exp2f.exit1527 ] + %2611 = phi float [ 0.000000e+00, %73 ], [ %2477, %__nv_exp2f.exit1527 ] + %2612 = phi float [ 0.000000e+00, %73 ], [ %2478, %__nv_exp2f.exit1527 ] + %2613 = phi float [ 0.000000e+00, %73 ], [ %2479, %__nv_exp2f.exit1527 ] + %2614 = phi float [ 0.000000e+00, %73 ], [ %2480, %__nv_exp2f.exit1527 ] + %2615 = phi float [ 0.000000e+00, %73 ], [ %2481, %__nv_exp2f.exit1527 ] + %2616 = phi float [ 0.000000e+00, %73 ], [ %2482, %__nv_exp2f.exit1527 ] + %2617 = phi float [ 0.000000e+00, %73 ], [ %2483, %__nv_exp2f.exit1527 ] + %2618 = phi float [ 0.000000e+00, %73 ], [ %2484, %__nv_exp2f.exit1527 ] + %2619 = phi float [ 0.000000e+00, %73 ], [ %2485, %__nv_exp2f.exit1527 ] + %2620 = phi float [ 0.000000e+00, %73 ], [ %2486, %__nv_exp2f.exit1527 ] + %2621 = phi float [ 0.000000e+00, %73 ], [ %2487, %__nv_exp2f.exit1527 ] + %2622 = phi float [ 0.000000e+00, %73 ], [ %2488, %__nv_exp2f.exit1527 ] + %2623 = phi float [ 0.000000e+00, %73 ], [ %2489, %__nv_exp2f.exit1527 ] + %2624 = phi float [ 0.000000e+00, %73 ], [ %2490, %__nv_exp2f.exit1527 ] + %2625 = phi float [ 0.000000e+00, %73 ], [ %2491, %__nv_exp2f.exit1527 ] + %2626 = phi float [ 0.000000e+00, %73 ], [ %2492, %__nv_exp2f.exit1527 ] + %2627 = phi float [ 0.000000e+00, %73 ], [ %2493, %__nv_exp2f.exit1527 ] + %2628 = phi float [ 0.000000e+00, %73 ], [ %2494, %__nv_exp2f.exit1527 ] + %2629 = phi float [ 0.000000e+00, %73 ], [ %2495, %__nv_exp2f.exit1527 ] + %2630 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2566, float %2567, float %2568, float %2569, float %2570, float %2571, float %2572, float %2573, float %2574, float %2575, float %2576, float %2577, float %2578, float %2579, float %2580, float %2581, float %2582, float %2583, float %2584, float %2585, float %2586, float %2587, float %2588, float %2589, float %2590, float %2591, float %2592, float %2593, float %2594, float %2595, float %2596, float %2597, float %2598, float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, float %2629) #3, !dbg !110 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !110 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !110 + %2631 = getelementptr i32, ptr addrspace(1) %13, i64 %383, !dbg !166 + %2632 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2631) #3, !dbg !167 + %2633 = shl i32 %2632, 7, !dbg !168 + %2634 = getelementptr i32, ptr addrspace(1) %12, i64 %387, !dbg !169 + %2635 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2634) #3, !dbg !170 + %2636 = or disjoint i32 %2633, %59, !dbg !171 + %2637 = or disjoint i32 %2633, %60, !dbg !171 + %2638 = or disjoint i32 %2633, %61, !dbg !171 + %2639 = or disjoint i32 %2633, %62, !dbg !171 + %2640 = shl i32 %2636, 7, !dbg !172 + %2641 = shl i32 %2637, 7, !dbg !172 + %2642 = shl i32 %2638, 7, !dbg !172 + %2643 = shl i32 %2639, 7, !dbg !172 + %2644 = sext i32 %2640 to i64, !dbg !174 + %2645 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2644, !dbg !174 + %2646 = sext i32 %2641 to i64, !dbg !174 + %2647 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2646, !dbg !174 + %2648 = sext i32 %2642 to i64, !dbg !174 + %2649 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2648, !dbg !174 + %2650 = sext i32 %2643 to i64, !dbg !174 + %2651 = getelementptr bfloat, ptr addrspace(1) %53, i64 %2650, !dbg !174 + %2652 = getelementptr bfloat, ptr addrspace(1) %2645, i64 %138, !dbg !175 + %2653 = getelementptr bfloat, ptr addrspace(1) %2647, i64 %138, !dbg !175 + %2654 = getelementptr bfloat, ptr addrspace(1) %2649, i64 %138, !dbg !175 + %2655 = getelementptr bfloat, ptr addrspace(1) %2651, i64 %138, !dbg !175 + %2656 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2644, !dbg !176 + %2657 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2646, !dbg !176 + %2658 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2648, !dbg !176 + %2659 = getelementptr bfloat, ptr addrspace(1) %54, i64 %2650, !dbg !176 + %2660 = getelementptr bfloat, ptr addrspace(1) %2656, i64 %138, !dbg !177 + %2661 = getelementptr bfloat, ptr addrspace(1) %2657, i64 %138, !dbg !177 + %2662 = getelementptr bfloat, ptr addrspace(1) %2658, i64 %138, !dbg !177 + %2663 = getelementptr bfloat, ptr addrspace(1) %2659, i64 %138, !dbg !177 + %2664 = shl i32 %2635, 1, !dbg !178 + %2665 = tail call i32 @llvm.smin.i32(i32 %2664, i32 %433), !dbg !179 + %2666 = icmp sgt i32 %2664, 0, !dbg !180 + %2667 = icmp slt i32 %2636, %19, !dbg !181 + %2668 = icmp slt i32 %2637, %19, !dbg !181 + %2669 = icmp slt i32 %2638, %19, !dbg !181 + %2670 = icmp slt i32 %2639, %19, !dbg !181 + %2671 = and i1 %2666, %2667, !dbg !180 + %2672 = and i1 %2666, %2668, !dbg !180 + %2673 = and i1 %2666, %2669, !dbg !180 + %2674 = and i1 %2666, %2670, !dbg !180 + %2675 = select i1 %2671, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %449, ptr addrspace(1) %2652, i32 %2675) #3, !dbg !182 + %2676 = select i1 %2672, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %452, ptr addrspace(1) %2653, i32 %2676) #3, !dbg !182 + %2677 = select i1 %2673, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %455, ptr addrspace(1) %2654, i32 %2677) #3, !dbg !182 + %2678 = select i1 %2674, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %458, ptr addrspace(1) %2655, i32 %2678) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %460, ptr addrspace(1) %2660, i32 %2675) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %2661, i32 %2676) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %462, ptr addrspace(1) %2662, i32 %2677) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %2663, i32 %2678) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %2679 = icmp sgt i32 %2665, 1, !dbg !180 + %2680 = getelementptr i8, ptr addrspace(1) %2652, i64 16384, !dbg !183 + %2681 = getelementptr i8, ptr addrspace(1) %2653, i64 16384, !dbg !183 + %2682 = getelementptr i8, ptr addrspace(1) %2654, i64 16384, !dbg !183 + %2683 = getelementptr i8, ptr addrspace(1) %2655, i64 16384, !dbg !183 + %2684 = getelementptr i8, ptr addrspace(1) %2660, i64 16384, !dbg !184 + %2685 = getelementptr i8, ptr addrspace(1) %2661, i64 16384, !dbg !184 + %2686 = getelementptr i8, ptr addrspace(1) %2662, i64 16384, !dbg !184 + %2687 = getelementptr i8, ptr addrspace(1) %2663, i64 16384, !dbg !184 + %2688 = or disjoint i32 %2636, 64, !dbg !185 + %2689 = or disjoint i32 %2637, 64, !dbg !185 + %2690 = or disjoint i32 %2638, 64, !dbg !185 + %2691 = or disjoint i32 %2639, 64, !dbg !185 + %2692 = icmp slt i32 %2688, %19, !dbg !181 + %2693 = icmp slt i32 %2689, %19, !dbg !181 + %2694 = icmp slt i32 %2690, %19, !dbg !181 + %2695 = icmp slt i32 %2691, %19, !dbg !181 + %2696 = and i1 %2679, %2692, !dbg !180 + %2697 = and i1 %2679, %2693, !dbg !180 + %2698 = and i1 %2679, %2694, !dbg !180 + %2699 = and i1 %2679, %2695, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %2700 = select i1 %2696, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %485, ptr addrspace(1) %2680, i32 %2700) #3, !dbg !182 + %2701 = select i1 %2697, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %487, ptr addrspace(1) %2681, i32 %2701) #3, !dbg !182 + %2702 = select i1 %2698, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %489, ptr addrspace(1) %2682, i32 %2702) #3, !dbg !182 + %2703 = select i1 %2699, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %491, ptr addrspace(1) %2683, i32 %2703) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %493, ptr addrspace(1) %2684, i32 %2700) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %494, ptr addrspace(1) %2685, i32 %2701) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %495, ptr addrspace(1) %2686, i32 %2702) #3, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %496, ptr addrspace(1) %2687, i32 %2703) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !186 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 0, !dbg !180 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 1, !dbg !180 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 2, !dbg !180 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 3, !dbg !180 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 4, !dbg !180 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 5, !dbg !180 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 6, !dbg !180 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 7, !dbg !180 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 8, !dbg !180 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 9, !dbg !180 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 10, !dbg !180 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 11, !dbg !180 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 12, !dbg !180 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 13, !dbg !180 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 14, !dbg !180 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 15, !dbg !180 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 16, !dbg !180 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 17, !dbg !180 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 18, !dbg !180 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 19, !dbg !180 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 20, !dbg !180 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 21, !dbg !180 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 22, !dbg !180 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 23, !dbg !180 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 24, !dbg !180 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 25, !dbg !180 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 26, !dbg !180 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 27, !dbg !180 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 28, !dbg !180 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 29, !dbg !180 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 30, !dbg !180 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 31, !dbg !180 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 32, !dbg !180 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 33, !dbg !180 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 34, !dbg !180 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 35, !dbg !180 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 36, !dbg !180 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 37, !dbg !180 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 38, !dbg !180 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 39, !dbg !180 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 40, !dbg !180 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 41, !dbg !180 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 42, !dbg !180 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 43, !dbg !180 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 44, !dbg !180 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 45, !dbg !180 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 46, !dbg !180 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 47, !dbg !180 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 48, !dbg !180 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 49, !dbg !180 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 50, !dbg !180 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 51, !dbg !180 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 52, !dbg !180 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 53, !dbg !180 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 54, !dbg !180 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 55, !dbg !180 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 56, !dbg !180 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 57, !dbg !180 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 58, !dbg !180 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 59, !dbg !180 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 60, !dbg !180 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 61, !dbg !180 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 62, !dbg !180 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 63, !dbg !180 + br i1 %2666, label %.lr.ph1592, label %._crit_edge1593, !dbg !180 + +.lr.ph1592: ; preds = %._crit_edge + %2768 = insertelement <16 x i32> poison, i32 %2633, i64 0, !dbg !171 + %2769 = shufflevector <16 x i32> %2768, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !171 + %2770 = shufflevector <2 x i32> %395, <2 x i32> poison, <16 x i32> , !dbg !171 + %2771 = insertelement <16 x i32> %2770, i32 %392, i64 14, !dbg !171 + %2772 = insertelement <16 x i32> %2771, i32 %391, i64 15, !dbg !171 + %2773 = shufflevector <8 x i32> %401, <8 x i32> poison, <16 x i32> , !dbg !171 + %2774 = shufflevector <16 x i32> %2773, <16 x i32> %2772, <16 x i32> , !dbg !171 + %2775 = shufflevector <4 x i32> %398, <4 x i32> poison, <16 x i32> , !dbg !171 + %2776 = shufflevector <16 x i32> %2774, <16 x i32> %2775, <16 x i32> , !dbg !171 + %2777 = or disjoint <16 x i32> %2769, %2776, !dbg !171 + %2778 = add nsw i32 %2665, -2 + %2779 = add nsw i32 %2665, -1 + %smax2183 = tail call i32 @llvm.smax.i32(i32 %2665, i32 1), !dbg !180 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 0, !dbg !180 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 1, !dbg !180 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 2, !dbg !180 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 3, !dbg !180 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 4, !dbg !180 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 5, !dbg !180 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 6, !dbg !180 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 7, !dbg !180 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 8, !dbg !180 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 9, !dbg !180 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 10, !dbg !180 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 11, !dbg !180 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 12, !dbg !180 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 13, !dbg !180 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 14, !dbg !180 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 15, !dbg !180 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 16, !dbg !180 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 17, !dbg !180 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 18, !dbg !180 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 19, !dbg !180 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 20, !dbg !180 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 21, !dbg !180 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 22, !dbg !180 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 23, !dbg !180 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 24, !dbg !180 + %2805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 25, !dbg !180 + %2806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 26, !dbg !180 + %2807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 27, !dbg !180 + %2808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 28, !dbg !180 + %2809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 29, !dbg !180 + %2810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 30, !dbg !180 + %2811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 31, !dbg !180 + %2812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 32, !dbg !180 + %2813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 33, !dbg !180 + %2814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 34, !dbg !180 + %2815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 35, !dbg !180 + %2816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 36, !dbg !180 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 37, !dbg !180 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 38, !dbg !180 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 39, !dbg !180 + %2820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 40, !dbg !180 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 41, !dbg !180 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 42, !dbg !180 + %2823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 43, !dbg !180 + %2824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 44, !dbg !180 + %2825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 45, !dbg !180 + %2826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 46, !dbg !180 + %2827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 47, !dbg !180 + %2828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 48, !dbg !180 + %2829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 49, !dbg !180 + %2830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 50, !dbg !180 + %2831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 51, !dbg !180 + %2832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 52, !dbg !180 + %2833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 53, !dbg !180 + %2834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 54, !dbg !180 + %2835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 55, !dbg !180 + %2836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 56, !dbg !180 + %2837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 57, !dbg !180 + %2838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 58, !dbg !180 + %2839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 59, !dbg !180 + %2840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 60, !dbg !180 + %2841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 61, !dbg !180 + %2842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 62, !dbg !180 + %2843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2630, 63, !dbg !180 + br label %2844, !dbg !180 + +2844: ; preds = %.lr.ph1592, %__nv_exp2f.exit1431 + %2845 = phi i32 [ 64, %.lr.ph1592 ], [ %4415, %__nv_exp2f.exit1431 ] + %2846 = phi i32 [ -1, %.lr.ph1592 ], [ %2854, %__nv_exp2f.exit1431 ] + %2847 = phi i32 [ 1, %.lr.ph1592 ], [ %4432, %__nv_exp2f.exit1431 ] + %.pn10661574 = phi ptr addrspace(1) [ %2687, %.lr.ph1592 ], [ %4425, %__nv_exp2f.exit1431 ] + %.pn10821573 = phi ptr addrspace(1) [ %2686, %.lr.ph1592 ], [ %4424, %__nv_exp2f.exit1431 ] + %.pn10981572 = phi ptr addrspace(1) [ %2685, %.lr.ph1592 ], [ %4423, %__nv_exp2f.exit1431 ] + %.pn11141571 = phi ptr addrspace(1) [ %2684, %.lr.ph1592 ], [ %4422, %__nv_exp2f.exit1431 ] + %.pn10441570 = phi i32 [ %2691, %.lr.ph1592 ], [ %4429, %__nv_exp2f.exit1431 ] + %.pn10461569 = phi i32 [ %2690, %.lr.ph1592 ], [ %4428, %__nv_exp2f.exit1431 ] + %.pn10481568 = phi i32 [ %2689, %.lr.ph1592 ], [ %4427, %__nv_exp2f.exit1431 ] + %.pn10501567 = phi i32 [ %2688, %.lr.ph1592 ], [ %4426, %__nv_exp2f.exit1431 ] + %.pn9941566 = phi ptr addrspace(1) [ %2683, %.lr.ph1592 ], [ %4421, %__nv_exp2f.exit1431 ] + %.pn10101565 = phi ptr addrspace(1) [ %2682, %.lr.ph1592 ], [ %4420, %__nv_exp2f.exit1431 ] + %.pn10261564 = phi ptr addrspace(1) [ %2681, %.lr.ph1592 ], [ %4419, %__nv_exp2f.exit1431 ] + %.pn10421563 = phi ptr addrspace(1) [ %2680, %.lr.ph1592 ], [ %4418, %__nv_exp2f.exit1431 ] + %.pn = phi float [ %2780, %.lr.ph1592 ], [ %4329, %__nv_exp2f.exit1431 ] + %.pn2450 = phi float [ %2781, %.lr.ph1592 ], [ %4330, %__nv_exp2f.exit1431 ] + %.pn2451 = phi float [ %2782, %.lr.ph1592 ], [ %4331, %__nv_exp2f.exit1431 ] + %.pn2452 = phi float [ %2783, %.lr.ph1592 ], [ %4332, %__nv_exp2f.exit1431 ] + %.pn2453 = phi float [ %2784, %.lr.ph1592 ], [ %4333, %__nv_exp2f.exit1431 ] + %.pn2454 = phi float [ %2785, %.lr.ph1592 ], [ %4334, %__nv_exp2f.exit1431 ] + %.pn2455 = phi float [ %2786, %.lr.ph1592 ], [ %4335, %__nv_exp2f.exit1431 ] + %.pn2456 = phi float [ %2787, %.lr.ph1592 ], [ %4336, %__nv_exp2f.exit1431 ] + %.pn2457 = phi float [ %2788, %.lr.ph1592 ], [ %4337, %__nv_exp2f.exit1431 ] + %.pn2458 = phi float [ %2789, %.lr.ph1592 ], [ %4338, %__nv_exp2f.exit1431 ] + %.pn2459 = phi float [ %2790, %.lr.ph1592 ], [ %4339, %__nv_exp2f.exit1431 ] + %.pn2460 = phi float [ %2791, %.lr.ph1592 ], [ %4340, %__nv_exp2f.exit1431 ] + %.pn2461 = phi float [ %2792, %.lr.ph1592 ], [ %4341, %__nv_exp2f.exit1431 ] + %.pn2462 = phi float [ %2793, %.lr.ph1592 ], [ %4342, %__nv_exp2f.exit1431 ] + %.pn2463 = phi float [ %2794, %.lr.ph1592 ], [ %4343, %__nv_exp2f.exit1431 ] + %.pn2464 = phi float [ %2795, %.lr.ph1592 ], [ %4344, %__nv_exp2f.exit1431 ] + %.pn2465 = phi float [ %2796, %.lr.ph1592 ], [ %4345, %__nv_exp2f.exit1431 ] + %.pn2466 = phi float [ %2797, %.lr.ph1592 ], [ %4346, %__nv_exp2f.exit1431 ] + %.pn2467 = phi float [ %2798, %.lr.ph1592 ], [ %4347, %__nv_exp2f.exit1431 ] + %.pn2468 = phi float [ %2799, %.lr.ph1592 ], [ %4348, %__nv_exp2f.exit1431 ] + %.pn2469 = phi float [ %2800, %.lr.ph1592 ], [ %4349, %__nv_exp2f.exit1431 ] + %.pn2470 = phi float [ %2801, %.lr.ph1592 ], [ %4350, %__nv_exp2f.exit1431 ] + %.pn2471 = phi float [ %2802, %.lr.ph1592 ], [ %4351, %__nv_exp2f.exit1431 ] + %.pn2472 = phi float [ %2803, %.lr.ph1592 ], [ %4352, %__nv_exp2f.exit1431 ] + %.pn2473 = phi float [ %2804, %.lr.ph1592 ], [ %4353, %__nv_exp2f.exit1431 ] + %.pn2474 = phi float [ %2805, %.lr.ph1592 ], [ %4354, %__nv_exp2f.exit1431 ] + %.pn2475 = phi float [ %2806, %.lr.ph1592 ], [ %4355, %__nv_exp2f.exit1431 ] + %.pn2476 = phi float [ %2807, %.lr.ph1592 ], [ %4356, %__nv_exp2f.exit1431 ] + %.pn2477 = phi float [ %2808, %.lr.ph1592 ], [ %4357, %__nv_exp2f.exit1431 ] + %.pn2478 = phi float [ %2809, %.lr.ph1592 ], [ %4358, %__nv_exp2f.exit1431 ] + %.pn2479 = phi float [ %2810, %.lr.ph1592 ], [ %4359, %__nv_exp2f.exit1431 ] + %.pn2480 = phi float [ %2811, %.lr.ph1592 ], [ %4360, %__nv_exp2f.exit1431 ] + %.pn2481 = phi float [ %2812, %.lr.ph1592 ], [ %4361, %__nv_exp2f.exit1431 ] + %.pn2482 = phi float [ %2813, %.lr.ph1592 ], [ %4362, %__nv_exp2f.exit1431 ] + %.pn2483 = phi float [ %2814, %.lr.ph1592 ], [ %4363, %__nv_exp2f.exit1431 ] + %.pn2484 = phi float [ %2815, %.lr.ph1592 ], [ %4364, %__nv_exp2f.exit1431 ] + %.pn2485 = phi float [ %2816, %.lr.ph1592 ], [ %4365, %__nv_exp2f.exit1431 ] + %.pn2486 = phi float [ %2817, %.lr.ph1592 ], [ %4366, %__nv_exp2f.exit1431 ] + %.pn2487 = phi float [ %2818, %.lr.ph1592 ], [ %4367, %__nv_exp2f.exit1431 ] + %.pn2488 = phi float [ %2819, %.lr.ph1592 ], [ %4368, %__nv_exp2f.exit1431 ] + %.pn2489 = phi float [ %2820, %.lr.ph1592 ], [ %4369, %__nv_exp2f.exit1431 ] + %.pn2490 = phi float [ %2821, %.lr.ph1592 ], [ %4370, %__nv_exp2f.exit1431 ] + %.pn2491 = phi float [ %2822, %.lr.ph1592 ], [ %4371, %__nv_exp2f.exit1431 ] + %.pn2492 = phi float [ %2823, %.lr.ph1592 ], [ %4372, %__nv_exp2f.exit1431 ] + %.pn2493 = phi float [ %2824, %.lr.ph1592 ], [ %4373, %__nv_exp2f.exit1431 ] + %.pn2494 = phi float [ %2825, %.lr.ph1592 ], [ %4374, %__nv_exp2f.exit1431 ] + %.pn2495 = phi float [ %2826, %.lr.ph1592 ], [ %4375, %__nv_exp2f.exit1431 ] + %.pn2496 = phi float [ %2827, %.lr.ph1592 ], [ %4376, %__nv_exp2f.exit1431 ] + %.pn2497 = phi float [ %2828, %.lr.ph1592 ], [ %4377, %__nv_exp2f.exit1431 ] + %.pn2498 = phi float [ %2829, %.lr.ph1592 ], [ %4378, %__nv_exp2f.exit1431 ] + %.pn2499 = phi float [ %2830, %.lr.ph1592 ], [ %4379, %__nv_exp2f.exit1431 ] + %.pn2500 = phi float [ %2831, %.lr.ph1592 ], [ %4380, %__nv_exp2f.exit1431 ] + %.pn2501 = phi float [ %2832, %.lr.ph1592 ], [ %4381, %__nv_exp2f.exit1431 ] + %.pn2502 = phi float [ %2833, %.lr.ph1592 ], [ %4382, %__nv_exp2f.exit1431 ] + %.pn2503 = phi float [ %2834, %.lr.ph1592 ], [ %4383, %__nv_exp2f.exit1431 ] + %.pn2504 = phi float [ %2835, %.lr.ph1592 ], [ %4384, %__nv_exp2f.exit1431 ] + %.pn2505 = phi float [ %2836, %.lr.ph1592 ], [ %4385, %__nv_exp2f.exit1431 ] + %.pn2506 = phi float [ %2837, %.lr.ph1592 ], [ %4386, %__nv_exp2f.exit1431 ] + %.pn2507 = phi float [ %2838, %.lr.ph1592 ], [ %4387, %__nv_exp2f.exit1431 ] + %.pn2508 = phi float [ %2839, %.lr.ph1592 ], [ %4388, %__nv_exp2f.exit1431 ] + %.pn2509 = phi float [ %2840, %.lr.ph1592 ], [ %4389, %__nv_exp2f.exit1431 ] + %.pn2510 = phi float [ %2841, %.lr.ph1592 ], [ %4390, %__nv_exp2f.exit1431 ] + %.pn2511 = phi float [ %2842, %.lr.ph1592 ], [ %4391, %__nv_exp2f.exit1431 ] + %.pn2512 = phi float [ %2843, %.lr.ph1592 ], [ %4392, %__nv_exp2f.exit1431 ] + %2848 = phi i32 [ 0, %.lr.ph1592 ], [ %4396, %__nv_exp2f.exit1431 ] + %2849 = phi <16 x i32> [ %2777, %.lr.ph1592 ], [ %4395, %__nv_exp2f.exit1431 ] + %2850 = icmp slt i32 %2848, %2778, !dbg !180 + %2851 = icmp slt i32 %2848, %2779, !dbg !180 + %2852 = add i32 %2846, 1, !dbg !180 + %2853 = icmp sgt i32 %2852, 2, !dbg !180 + %2854 = select i1 %2853, i32 0, i32 %2852, !dbg !180 + %2855 = extractelement <16 x i32> %2849, i64 15, !dbg !181 + %2856 = icmp slt i32 %2855, %19, !dbg !181 + %2857 = extractelement <16 x i32> %2849, i64 14, !dbg !181 + %2858 = icmp slt i32 %2857, %19, !dbg !181 + %2859 = extractelement <16 x i32> %2849, i64 13, !dbg !181 + %2860 = icmp slt i32 %2859, %19, !dbg !181 + %2861 = extractelement <16 x i32> %2849, i64 12, !dbg !181 + %2862 = icmp slt i32 %2861, %19, !dbg !181 + %2863 = extractelement <16 x i32> %2849, i64 11, !dbg !181 + %2864 = icmp slt i32 %2863, %19, !dbg !181 + %2865 = extractelement <16 x i32> %2849, i64 10, !dbg !181 + %2866 = icmp slt i32 %2865, %19, !dbg !181 + %2867 = extractelement <16 x i32> %2849, i64 9, !dbg !181 + %2868 = icmp slt i32 %2867, %19, !dbg !181 + %2869 = extractelement <16 x i32> %2849, i64 8, !dbg !181 + %2870 = icmp slt i32 %2869, %19, !dbg !181 + %2871 = extractelement <16 x i32> %2849, i64 7, !dbg !181 + %2872 = icmp slt i32 %2871, %19, !dbg !181 + %2873 = extractelement <16 x i32> %2849, i64 6, !dbg !181 + %2874 = icmp slt i32 %2873, %19, !dbg !181 + %2875 = extractelement <16 x i32> %2849, i64 5, !dbg !181 + %2876 = icmp slt i32 %2875, %19, !dbg !181 + %2877 = extractelement <16 x i32> %2849, i64 4, !dbg !181 + %2878 = icmp slt i32 %2877, %19, !dbg !181 + %2879 = extractelement <16 x i32> %2849, i64 3, !dbg !181 + %2880 = icmp slt i32 %2879, %19, !dbg !181 + %2881 = extractelement <16 x i32> %2849, i64 2, !dbg !181 + %2882 = icmp slt i32 %2881, %19, !dbg !181 + %2883 = extractelement <16 x i32> %2849, i64 1, !dbg !181 + %2884 = icmp slt i32 %2883, %19, !dbg !181 + %2885 = extractelement <16 x i32> %2849, i64 0, !dbg !181 + %2886 = icmp slt i32 %2885, %19, !dbg !181 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !182 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %2887 = shl i32 %2854, 13, !dbg !182 + %2888 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2887, !dbg !182 + %2889 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !186 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !186 + %2890 = shl i32 %2889, 11, !dbg !186 + %2891 = and i32 %2890, 8192, !dbg !186 + %2892 = add i32 %2891, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2893 = lshr exact i32 %2892, 4, !dbg !186 + %2894 = and i32 %2893, 16383, !dbg !186 + %2895 = zext nneg i32 %2894 to i64, !dbg !186 + %2896 = or disjoint i64 %2895, 4611686293372403712, !dbg !186 + %2897 = ptrtoint ptr addrspace(3) %2888 to i32, !dbg !186 + %2898 = lshr exact i32 %2897, 4, !dbg !186 + %2899 = and i32 %2898, 16383, !dbg !186 + %2900 = zext nneg i32 %2899 to i64, !dbg !186 + %2901 = or disjoint i64 %2900, 4611686293338849280, !dbg !186 + %2902 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2896, i64 %2901) #3, !dbg !186 + %2903 = or disjoint i32 %2891, 32, !dbg !186 + %2904 = add i32 %2903, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2905 = lshr exact i32 %2904, 4, !dbg !186 + %2906 = and i32 %2905, 16383, !dbg !186 + %2907 = zext nneg i32 %2906 to i64, !dbg !186 + %2908 = or disjoint i64 %2907, 4611686293372403712, !dbg !186 + %2909 = add i32 %2897, 32, !dbg !186 + %2910 = lshr exact i32 %2909, 4, !dbg !186 + %2911 = and i32 %2910, 16383, !dbg !186 + %2912 = zext nneg i32 %2911 to i64, !dbg !186 + %2913 = or disjoint i64 %2912, 4611686293338849280, !dbg !186 + %2914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 0, !dbg !186 + %2915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 1, !dbg !186 + %2916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 2, !dbg !186 + %2917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 3, !dbg !186 + %2918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 4, !dbg !186 + %2919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 5, !dbg !186 + %2920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 6, !dbg !186 + %2921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 7, !dbg !186 + %2922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 8, !dbg !186 + %2923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 9, !dbg !186 + %2924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 10, !dbg !186 + %2925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 11, !dbg !186 + %2926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 12, !dbg !186 + %2927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 13, !dbg !186 + %2928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 14, !dbg !186 + %2929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 15, !dbg !186 + %2930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 16, !dbg !186 + %2931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 17, !dbg !186 + %2932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 18, !dbg !186 + %2933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 19, !dbg !186 + %2934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 20, !dbg !186 + %2935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 21, !dbg !186 + %2936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 22, !dbg !186 + %2937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 23, !dbg !186 + %2938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 24, !dbg !186 + %2939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 25, !dbg !186 + %2940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 26, !dbg !186 + %2941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 27, !dbg !186 + %2942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 28, !dbg !186 + %2943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 29, !dbg !186 + %2944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 30, !dbg !186 + %2945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2902, 31, !dbg !186 + %2946 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2914, float %2915, float %2916, float %2917, float %2918, float %2919, float %2920, float %2921, float %2922, float %2923, float %2924, float %2925, float %2926, float %2927, float %2928, float %2929, float %2930, float %2931, float %2932, float %2933, float %2934, float %2935, float %2936, float %2937, float %2938, float %2939, float %2940, float %2941, float %2942, float %2943, float %2944, float %2945, i64 %2908, i64 %2913, i1 true) #3, !dbg !186 + %2947 = or disjoint i32 %2891, 64, !dbg !186 + %2948 = add i32 %2947, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2949 = lshr exact i32 %2948, 4, !dbg !186 + %2950 = and i32 %2949, 16383, !dbg !186 + %2951 = zext nneg i32 %2950 to i64, !dbg !186 + %2952 = or disjoint i64 %2951, 4611686293372403712, !dbg !186 + %2953 = add i32 %2897, 64, !dbg !186 + %2954 = lshr exact i32 %2953, 4, !dbg !186 + %2955 = and i32 %2954, 16383, !dbg !186 + %2956 = zext nneg i32 %2955 to i64, !dbg !186 + %2957 = or disjoint i64 %2956, 4611686293338849280, !dbg !186 + %2958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 0, !dbg !186 + %2959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 1, !dbg !186 + %2960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 2, !dbg !186 + %2961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 3, !dbg !186 + %2962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 4, !dbg !186 + %2963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 5, !dbg !186 + %2964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 6, !dbg !186 + %2965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 7, !dbg !186 + %2966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 8, !dbg !186 + %2967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 9, !dbg !186 + %2968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 10, !dbg !186 + %2969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 11, !dbg !186 + %2970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 12, !dbg !186 + %2971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 13, !dbg !186 + %2972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 14, !dbg !186 + %2973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 15, !dbg !186 + %2974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 16, !dbg !186 + %2975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 17, !dbg !186 + %2976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 18, !dbg !186 + %2977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 19, !dbg !186 + %2978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 20, !dbg !186 + %2979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 21, !dbg !186 + %2980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 22, !dbg !186 + %2981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 23, !dbg !186 + %2982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 24, !dbg !186 + %2983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 25, !dbg !186 + %2984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 26, !dbg !186 + %2985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 27, !dbg !186 + %2986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 28, !dbg !186 + %2987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 29, !dbg !186 + %2988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 30, !dbg !186 + %2989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2946, 31, !dbg !186 + %2990 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2958, float %2959, float %2960, float %2961, float %2962, float %2963, float %2964, float %2965, float %2966, float %2967, float %2968, float %2969, float %2970, float %2971, float %2972, float %2973, float %2974, float %2975, float %2976, float %2977, float %2978, float %2979, float %2980, float %2981, float %2982, float %2983, float %2984, float %2985, float %2986, float %2987, float %2988, float %2989, i64 %2952, i64 %2957, i1 true) #3, !dbg !186 + %2991 = or disjoint i32 %2891, 96, !dbg !186 + %2992 = add i32 %2991, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %2993 = lshr exact i32 %2992, 4, !dbg !186 + %2994 = and i32 %2993, 16383, !dbg !186 + %2995 = zext nneg i32 %2994 to i64, !dbg !186 + %2996 = or disjoint i64 %2995, 4611686293372403712, !dbg !186 + %2997 = add i32 %2897, 96, !dbg !186 + %2998 = lshr exact i32 %2997, 4, !dbg !186 + %2999 = and i32 %2998, 16383, !dbg !186 + %3000 = zext nneg i32 %2999 to i64, !dbg !186 + %3001 = or disjoint i64 %3000, 4611686293338849280, !dbg !186 + %3002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 0, !dbg !186 + %3003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 1, !dbg !186 + %3004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 2, !dbg !186 + %3005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 3, !dbg !186 + %3006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 4, !dbg !186 + %3007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 5, !dbg !186 + %3008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 6, !dbg !186 + %3009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 7, !dbg !186 + %3010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 8, !dbg !186 + %3011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 9, !dbg !186 + %3012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 10, !dbg !186 + %3013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 11, !dbg !186 + %3014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 12, !dbg !186 + %3015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 13, !dbg !186 + %3016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 14, !dbg !186 + %3017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 15, !dbg !186 + %3018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 16, !dbg !186 + %3019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 17, !dbg !186 + %3020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 18, !dbg !186 + %3021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 19, !dbg !186 + %3022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 20, !dbg !186 + %3023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 21, !dbg !186 + %3024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 22, !dbg !186 + %3025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 23, !dbg !186 + %3026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 24, !dbg !186 + %3027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 25, !dbg !186 + %3028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 26, !dbg !186 + %3029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 27, !dbg !186 + %3030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 28, !dbg !186 + %3031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 29, !dbg !186 + %3032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 30, !dbg !186 + %3033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2990, 31, !dbg !186 + %3034 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3002, float %3003, float %3004, float %3005, float %3006, float %3007, float %3008, float %3009, float %3010, float %3011, float %3012, float %3013, float %3014, float %3015, float %3016, float %3017, float %3018, float %3019, float %3020, float %3021, float %3022, float %3023, float %3024, float %3025, float %3026, float %3027, float %3028, float %3029, float %3030, float %3031, float %3032, float %3033, i64 %2996, i64 %3001, i1 true) #3, !dbg !186 + %3035 = or disjoint i32 %2891, 16384, !dbg !186 + %3036 = add i32 %3035, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3037 = lshr exact i32 %3036, 4, !dbg !186 + %3038 = and i32 %3037, 16383, !dbg !186 + %3039 = zext nneg i32 %3038 to i64, !dbg !186 + %3040 = or disjoint i64 %3039, 4611686293372403712, !dbg !186 + %3041 = add i32 %2897, 8192, !dbg !186 + %3042 = lshr exact i32 %3041, 4, !dbg !186 + %3043 = and i32 %3042, 16383, !dbg !186 + %3044 = zext nneg i32 %3043 to i64, !dbg !186 + %3045 = or disjoint i64 %3044, 4611686293338849280, !dbg !186 + %3046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 0, !dbg !186 + %3047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 1, !dbg !186 + %3048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 2, !dbg !186 + %3049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 3, !dbg !186 + %3050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 4, !dbg !186 + %3051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 5, !dbg !186 + %3052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 6, !dbg !186 + %3053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 7, !dbg !186 + %3054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 8, !dbg !186 + %3055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 9, !dbg !186 + %3056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 10, !dbg !186 + %3057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 11, !dbg !186 + %3058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 12, !dbg !186 + %3059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 13, !dbg !186 + %3060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 14, !dbg !186 + %3061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 15, !dbg !186 + %3062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 16, !dbg !186 + %3063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 17, !dbg !186 + %3064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 18, !dbg !186 + %3065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 19, !dbg !186 + %3066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 20, !dbg !186 + %3067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 21, !dbg !186 + %3068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 22, !dbg !186 + %3069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 23, !dbg !186 + %3070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 24, !dbg !186 + %3071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 25, !dbg !186 + %3072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 26, !dbg !186 + %3073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 27, !dbg !186 + %3074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 28, !dbg !186 + %3075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 29, !dbg !186 + %3076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 30, !dbg !186 + %3077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3034, 31, !dbg !186 + %3078 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3046, float %3047, float %3048, float %3049, float %3050, float %3051, float %3052, float %3053, float %3054, float %3055, float %3056, float %3057, float %3058, float %3059, float %3060, float %3061, float %3062, float %3063, float %3064, float %3065, float %3066, float %3067, float %3068, float %3069, float %3070, float %3071, float %3072, float %3073, float %3074, float %3075, float %3076, float %3077, i64 %3040, i64 %3045, i1 true) #3, !dbg !186 + %3079 = or disjoint i32 %2891, 16416, !dbg !186 + %3080 = add i32 %3079, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3081 = lshr exact i32 %3080, 4, !dbg !186 + %3082 = and i32 %3081, 16383, !dbg !186 + %3083 = zext nneg i32 %3082 to i64, !dbg !186 + %3084 = or disjoint i64 %3083, 4611686293372403712, !dbg !186 + %3085 = add i32 %2897, 8224, !dbg !186 + %3086 = lshr exact i32 %3085, 4, !dbg !186 + %3087 = and i32 %3086, 16383, !dbg !186 + %3088 = zext nneg i32 %3087 to i64, !dbg !186 + %3089 = or disjoint i64 %3088, 4611686293338849280, !dbg !186 + %3090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 0, !dbg !186 + %3091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 1, !dbg !186 + %3092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 2, !dbg !186 + %3093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 3, !dbg !186 + %3094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 4, !dbg !186 + %3095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 5, !dbg !186 + %3096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 6, !dbg !186 + %3097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 7, !dbg !186 + %3098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 8, !dbg !186 + %3099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 9, !dbg !186 + %3100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 10, !dbg !186 + %3101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 11, !dbg !186 + %3102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 12, !dbg !186 + %3103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 13, !dbg !186 + %3104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 14, !dbg !186 + %3105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 15, !dbg !186 + %3106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 16, !dbg !186 + %3107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 17, !dbg !186 + %3108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 18, !dbg !186 + %3109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 19, !dbg !186 + %3110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 20, !dbg !186 + %3111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 21, !dbg !186 + %3112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 22, !dbg !186 + %3113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 23, !dbg !186 + %3114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 24, !dbg !186 + %3115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 25, !dbg !186 + %3116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 26, !dbg !186 + %3117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 27, !dbg !186 + %3118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 28, !dbg !186 + %3119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 29, !dbg !186 + %3120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 30, !dbg !186 + %3121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3078, 31, !dbg !186 + %3122 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3090, float %3091, float %3092, float %3093, float %3094, float %3095, float %3096, float %3097, float %3098, float %3099, float %3100, float %3101, float %3102, float %3103, float %3104, float %3105, float %3106, float %3107, float %3108, float %3109, float %3110, float %3111, float %3112, float %3113, float %3114, float %3115, float %3116, float %3117, float %3118, float %3119, float %3120, float %3121, i64 %3084, i64 %3089, i1 true) #3, !dbg !186 + %3123 = or disjoint i32 %2891, 16448, !dbg !186 + %3124 = add i32 %3123, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3125 = lshr exact i32 %3124, 4, !dbg !186 + %3126 = and i32 %3125, 16383, !dbg !186 + %3127 = zext nneg i32 %3126 to i64, !dbg !186 + %3128 = or disjoint i64 %3127, 4611686293372403712, !dbg !186 + %3129 = add i32 %2897, 8256, !dbg !186 + %3130 = lshr exact i32 %3129, 4, !dbg !186 + %3131 = and i32 %3130, 16383, !dbg !186 + %3132 = zext nneg i32 %3131 to i64, !dbg !186 + %3133 = or disjoint i64 %3132, 4611686293338849280, !dbg !186 + %3134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 0, !dbg !186 + %3135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 1, !dbg !186 + %3136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 2, !dbg !186 + %3137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 3, !dbg !186 + %3138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 4, !dbg !186 + %3139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 5, !dbg !186 + %3140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 6, !dbg !186 + %3141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 7, !dbg !186 + %3142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 8, !dbg !186 + %3143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 9, !dbg !186 + %3144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 10, !dbg !186 + %3145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 11, !dbg !186 + %3146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 12, !dbg !186 + %3147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 13, !dbg !186 + %3148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 14, !dbg !186 + %3149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 15, !dbg !186 + %3150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 16, !dbg !186 + %3151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 17, !dbg !186 + %3152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 18, !dbg !186 + %3153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 19, !dbg !186 + %3154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 20, !dbg !186 + %3155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 21, !dbg !186 + %3156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 22, !dbg !186 + %3157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 23, !dbg !186 + %3158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 24, !dbg !186 + %3159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 25, !dbg !186 + %3160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 26, !dbg !186 + %3161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 27, !dbg !186 + %3162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 28, !dbg !186 + %3163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 29, !dbg !186 + %3164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 30, !dbg !186 + %3165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3122, 31, !dbg !186 + %3166 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3134, float %3135, float %3136, float %3137, float %3138, float %3139, float %3140, float %3141, float %3142, float %3143, float %3144, float %3145, float %3146, float %3147, float %3148, float %3149, float %3150, float %3151, float %3152, float %3153, float %3154, float %3155, float %3156, float %3157, float %3158, float %3159, float %3160, float %3161, float %3162, float %3163, float %3164, float %3165, i64 %3128, i64 %3133, i1 true) #3, !dbg !186 + %3167 = or disjoint i32 %2891, 16480, !dbg !186 + %3168 = add i32 %3167, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !186 + %3169 = lshr exact i32 %3168, 4, !dbg !186 + %3170 = and i32 %3169, 16383, !dbg !186 + %3171 = zext nneg i32 %3170 to i64, !dbg !186 + %3172 = or disjoint i64 %3171, 4611686293372403712, !dbg !186 + %3173 = add i32 %2897, 8288, !dbg !186 + %3174 = lshr exact i32 %3173, 4, !dbg !186 + %3175 = and i32 %3174, 16383, !dbg !186 + %3176 = zext nneg i32 %3175 to i64, !dbg !186 + %3177 = or disjoint i64 %3176, 4611686293338849280, !dbg !186 + %3178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 0, !dbg !186 + %3179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 1, !dbg !186 + %3180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 2, !dbg !186 + %3181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 3, !dbg !186 + %3182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 4, !dbg !186 + %3183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 5, !dbg !186 + %3184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 6, !dbg !186 + %3185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 7, !dbg !186 + %3186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 8, !dbg !186 + %3187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 9, !dbg !186 + %3188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 10, !dbg !186 + %3189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 11, !dbg !186 + %3190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 12, !dbg !186 + %3191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 13, !dbg !186 + %3192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 14, !dbg !186 + %3193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 15, !dbg !186 + %3194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 16, !dbg !186 + %3195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 17, !dbg !186 + %3196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 18, !dbg !186 + %3197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 19, !dbg !186 + %3198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 20, !dbg !186 + %3199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 21, !dbg !186 + %3200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 22, !dbg !186 + %3201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 23, !dbg !186 + %3202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 24, !dbg !186 + %3203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 25, !dbg !186 + %3204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 26, !dbg !186 + %3205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 27, !dbg !186 + %3206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 28, !dbg !186 + %3207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 29, !dbg !186 + %3208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 30, !dbg !186 + %3209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3166, 31, !dbg !186 + %3210 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3178, float %3179, float %3180, float %3181, float %3182, float %3183, float %3184, float %3185, float %3186, float %3187, float %3188, float %3189, float %3190, float %3191, float %3192, float %3193, float %3194, float %3195, float %3196, float %3197, float %3198, float %3199, float %3200, float %3201, float %3202, float %3203, float %3204, float %3205, float %3206, float %3207, float %3208, float %3209, i64 %3172, i64 %3177, i1 true) #3, !dbg !186 + %3211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 0, !dbg !186 + %3212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 1, !dbg !186 + %3213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 2, !dbg !186 + %3214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 3, !dbg !186 + %3215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 4, !dbg !186 + %3216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 5, !dbg !186 + %3217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 6, !dbg !186 + %3218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 7, !dbg !186 + %3219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 8, !dbg !186 + %3220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 9, !dbg !186 + %3221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 10, !dbg !186 + %3222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 11, !dbg !186 + %3223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 12, !dbg !186 + %3224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 13, !dbg !186 + %3225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 14, !dbg !186 + %3226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 15, !dbg !186 + %3227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 16, !dbg !186 + %3228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 17, !dbg !186 + %3229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 18, !dbg !186 + %3230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 19, !dbg !186 + %3231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 20, !dbg !186 + %3232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 21, !dbg !186 + %3233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 22, !dbg !186 + %3234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 23, !dbg !186 + %3235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 24, !dbg !186 + %3236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 25, !dbg !186 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 26, !dbg !186 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 27, !dbg !186 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 28, !dbg !186 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 29, !dbg !186 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 30, !dbg !186 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3210, 31, !dbg !186 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !186 + %3243 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3211, float %3212, float %3213, float %3214, float %3215, float %3216, float %3217, float %3218, float %3219, float %3220, float %3221, float %3222, float %3223, float %3224, float %3225, float %3226, float %3227, float %3228, float %3229, float %3230, float %3231, float %3232, float %3233, float %3234, float %3235, float %3236, float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2888, i32 0, i32 0) #3, !dbg !186 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 0, !dbg !186 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 1, !dbg !186 + %3246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 2, !dbg !186 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 3, !dbg !186 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 4, !dbg !186 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 5, !dbg !186 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 6, !dbg !186 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 7, !dbg !186 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 8, !dbg !186 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 9, !dbg !186 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 10, !dbg !186 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 11, !dbg !186 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 12, !dbg !186 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 13, !dbg !186 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 14, !dbg !186 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 15, !dbg !186 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 16, !dbg !186 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 17, !dbg !186 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 18, !dbg !186 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 19, !dbg !186 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 20, !dbg !186 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 21, !dbg !186 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 22, !dbg !186 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 23, !dbg !186 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 24, !dbg !186 + %3269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 25, !dbg !186 + %3270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 26, !dbg !186 + %3271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 27, !dbg !186 + %3272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 28, !dbg !186 + %3273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 29, !dbg !186 + %3274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 30, !dbg !186 + %3275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3243, 31, !dbg !186 + %3276 = fmul float %3244, 0x3FB6A09E60000000, !dbg !187 + %3277 = fmul float %3245, 0x3FB6A09E60000000, !dbg !187 + %3278 = fmul float %3246, 0x3FB6A09E60000000, !dbg !187 + %3279 = fmul float %3247, 0x3FB6A09E60000000, !dbg !187 + %3280 = fmul float %3248, 0x3FB6A09E60000000, !dbg !187 + %3281 = fmul float %3249, 0x3FB6A09E60000000, !dbg !187 + %3282 = fmul float %3250, 0x3FB6A09E60000000, !dbg !187 + %3283 = fmul float %3251, 0x3FB6A09E60000000, !dbg !187 + %3284 = fmul float %3252, 0x3FB6A09E60000000, !dbg !187 + %3285 = fmul float %3253, 0x3FB6A09E60000000, !dbg !187 + %3286 = fmul float %3254, 0x3FB6A09E60000000, !dbg !187 + %3287 = fmul float %3255, 0x3FB6A09E60000000, !dbg !187 + %3288 = fmul float %3256, 0x3FB6A09E60000000, !dbg !187 + %3289 = fmul float %3257, 0x3FB6A09E60000000, !dbg !187 + %3290 = fmul float %3258, 0x3FB6A09E60000000, !dbg !187 + %3291 = fmul float %3259, 0x3FB6A09E60000000, !dbg !187 + %3292 = fmul float %3260, 0x3FB6A09E60000000, !dbg !187 + %3293 = fmul float %3261, 0x3FB6A09E60000000, !dbg !187 + %3294 = fmul float %3262, 0x3FB6A09E60000000, !dbg !187 + %3295 = fmul float %3263, 0x3FB6A09E60000000, !dbg !187 + %3296 = fmul float %3264, 0x3FB6A09E60000000, !dbg !187 + %3297 = fmul float %3265, 0x3FB6A09E60000000, !dbg !187 + %3298 = fmul float %3266, 0x3FB6A09E60000000, !dbg !187 + %3299 = fmul float %3267, 0x3FB6A09E60000000, !dbg !187 + %3300 = fmul float %3268, 0x3FB6A09E60000000, !dbg !187 + %3301 = fmul float %3269, 0x3FB6A09E60000000, !dbg !187 + %3302 = fmul float %3270, 0x3FB6A09E60000000, !dbg !187 + %3303 = fmul float %3271, 0x3FB6A09E60000000, !dbg !187 + %3304 = fmul float %3272, 0x3FB6A09E60000000, !dbg !187 + %3305 = fmul float %3273, 0x3FB6A09E60000000, !dbg !187 + %3306 = fmul float %3274, 0x3FB6A09E60000000, !dbg !187 + %3307 = fmul float %3275, 0x3FB6A09E60000000, !dbg !187 + %3308 = fmul float %3276, 0x3FF7154760000000, !dbg !188 + %3309 = select i1 %2856, float %3308, float 0xFFF0000000000000, !dbg !189 + %3310 = fmul float %3277, 0x3FF7154760000000, !dbg !188 + %3311 = select i1 %2858, float %3310, float 0xFFF0000000000000, !dbg !189 + %3312 = fmul float %3278, 0x3FF7154760000000, !dbg !188 + %3313 = select i1 %2856, float %3312, float 0xFFF0000000000000, !dbg !189 + %3314 = fmul float %3279, 0x3FF7154760000000, !dbg !188 + %3315 = select i1 %2858, float %3314, float 0xFFF0000000000000, !dbg !189 + %3316 = fmul float %3280, 0x3FF7154760000000, !dbg !188 + %3317 = select i1 %2860, float %3316, float 0xFFF0000000000000, !dbg !189 + %3318 = fmul float %3281, 0x3FF7154760000000, !dbg !188 + %3319 = select i1 %2862, float %3318, float 0xFFF0000000000000, !dbg !189 + %3320 = fmul float %3282, 0x3FF7154760000000, !dbg !188 + %3321 = select i1 %2860, float %3320, float 0xFFF0000000000000, !dbg !189 + %3322 = fmul float %3283, 0x3FF7154760000000, !dbg !188 + %3323 = select i1 %2862, float %3322, float 0xFFF0000000000000, !dbg !189 + %3324 = fmul float %3284, 0x3FF7154760000000, !dbg !188 + %3325 = select i1 %2864, float %3324, float 0xFFF0000000000000, !dbg !189 + %3326 = fmul float %3285, 0x3FF7154760000000, !dbg !188 + %3327 = select i1 %2866, float %3326, float 0xFFF0000000000000, !dbg !189 + %3328 = fmul float %3286, 0x3FF7154760000000, !dbg !188 + %3329 = select i1 %2864, float %3328, float 0xFFF0000000000000, !dbg !189 + %3330 = fmul float %3287, 0x3FF7154760000000, !dbg !188 + %3331 = select i1 %2866, float %3330, float 0xFFF0000000000000, !dbg !189 + %3332 = fmul float %3288, 0x3FF7154760000000, !dbg !188 + %3333 = select i1 %2868, float %3332, float 0xFFF0000000000000, !dbg !189 + %3334 = fmul float %3289, 0x3FF7154760000000, !dbg !188 + %3335 = select i1 %2870, float %3334, float 0xFFF0000000000000, !dbg !189 + %3336 = fmul float %3290, 0x3FF7154760000000, !dbg !188 + %3337 = select i1 %2868, float %3336, float 0xFFF0000000000000, !dbg !189 + %3338 = fmul float %3291, 0x3FF7154760000000, !dbg !188 + %3339 = select i1 %2870, float %3338, float 0xFFF0000000000000, !dbg !189 + %3340 = fmul float %3292, 0x3FF7154760000000, !dbg !188 + %3341 = select i1 %2872, float %3340, float 0xFFF0000000000000, !dbg !189 + %3342 = fmul float %3293, 0x3FF7154760000000, !dbg !188 + %3343 = select i1 %2874, float %3342, float 0xFFF0000000000000, !dbg !189 + %3344 = fmul float %3294, 0x3FF7154760000000, !dbg !188 + %3345 = select i1 %2872, float %3344, float 0xFFF0000000000000, !dbg !189 + %3346 = fmul float %3295, 0x3FF7154760000000, !dbg !188 + %3347 = select i1 %2874, float %3346, float 0xFFF0000000000000, !dbg !189 + %3348 = fmul float %3296, 0x3FF7154760000000, !dbg !188 + %3349 = select i1 %2876, float %3348, float 0xFFF0000000000000, !dbg !189 + %3350 = fmul float %3297, 0x3FF7154760000000, !dbg !188 + %3351 = select i1 %2878, float %3350, float 0xFFF0000000000000, !dbg !189 + %3352 = fmul float %3298, 0x3FF7154760000000, !dbg !188 + %3353 = select i1 %2876, float %3352, float 0xFFF0000000000000, !dbg !189 + %3354 = fmul float %3299, 0x3FF7154760000000, !dbg !188 + %3355 = select i1 %2878, float %3354, float 0xFFF0000000000000, !dbg !189 + %3356 = fmul float %3300, 0x3FF7154760000000, !dbg !188 + %3357 = select i1 %2880, float %3356, float 0xFFF0000000000000, !dbg !189 + %3358 = fmul float %3301, 0x3FF7154760000000, !dbg !188 + %3359 = select i1 %2882, float %3358, float 0xFFF0000000000000, !dbg !189 + %3360 = fmul float %3302, 0x3FF7154760000000, !dbg !188 + %3361 = select i1 %2880, float %3360, float 0xFFF0000000000000, !dbg !189 + %3362 = fmul float %3303, 0x3FF7154760000000, !dbg !188 + %3363 = select i1 %2882, float %3362, float 0xFFF0000000000000, !dbg !189 + %3364 = fmul float %3304, 0x3FF7154760000000, !dbg !188 + %3365 = select i1 %2884, float %3364, float 0xFFF0000000000000, !dbg !189 + %3366 = fmul float %3305, 0x3FF7154760000000, !dbg !188 + %3367 = select i1 %2886, float %3366, float 0xFFF0000000000000, !dbg !189 + %3368 = fmul float %3306, 0x3FF7154760000000, !dbg !188 + %3369 = select i1 %2884, float %3368, float 0xFFF0000000000000, !dbg !189 + %3370 = fmul float %3307, 0x3FF7154760000000, !dbg !188 + %3371 = select i1 %2886, float %3370, float 0xFFF0000000000000, !dbg !189 + %3372 = fsub float %3309, %381, !dbg !190 + %3373 = fsub float %3311, %381, !dbg !190 + %3374 = fsub float %3313, %382, !dbg !190 + %3375 = fsub float %3315, %382, !dbg !190 + %3376 = fsub float %3317, %381, !dbg !190 + %3377 = fsub float %3319, %381, !dbg !190 + %3378 = fsub float %3321, %382, !dbg !190 + %3379 = fsub float %3323, %382, !dbg !190 + %3380 = fsub float %3325, %381, !dbg !190 + %3381 = fsub float %3327, %381, !dbg !190 + %3382 = fsub float %3329, %382, !dbg !190 + %3383 = fsub float %3331, %382, !dbg !190 + %3384 = fsub float %3333, %381, !dbg !190 + %3385 = fsub float %3335, %381, !dbg !190 + %3386 = fsub float %3337, %382, !dbg !190 + %3387 = fsub float %3339, %382, !dbg !190 + %3388 = fsub float %3341, %381, !dbg !190 + %3389 = fsub float %3343, %381, !dbg !190 + %3390 = fsub float %3345, %382, !dbg !190 + %3391 = fsub float %3347, %382, !dbg !190 + %3392 = fsub float %3349, %381, !dbg !190 + %3393 = fsub float %3351, %381, !dbg !190 + %3394 = fsub float %3353, %382, !dbg !190 + %3395 = fsub float %3355, %382, !dbg !190 + %3396 = fsub float %3357, %381, !dbg !190 + %3397 = fsub float %3359, %381, !dbg !190 + %3398 = fsub float %3361, %382, !dbg !190 + %3399 = fsub float %3363, %382, !dbg !190 + %3400 = fsub float %3365, %381, !dbg !190 + %3401 = fsub float %3367, %381, !dbg !190 + %3402 = fsub float %3369, %382, !dbg !190 + %3403 = fsub float %3371, %382, !dbg !190 + %3404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1336 = icmp eq i32 %3404, 0, !dbg !191 + br i1 %.not.i1336, label %3407, label %3405, !dbg !191 + +3405: ; preds = %2844 + %3406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3372) #3, !dbg !191 + br label %__nv_exp2f.exit1338, !dbg !191 + +3407: ; preds = %2844 + %3408 = tail call float @llvm.nvvm.ex2.approx.f(float %3372) #3, !dbg !191 + br label %__nv_exp2f.exit1338, !dbg !191 + +__nv_exp2f.exit1338: ; preds = %3405, %3407 + %.0.i1337 = phi float [ %3406, %3405 ], [ %3408, %3407 ], !dbg !191 + %3409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1339 = icmp eq i32 %3409, 0, !dbg !191 + br i1 %.not.i1339, label %3412, label %3410, !dbg !191 + +3410: ; preds = %__nv_exp2f.exit1338 + %3411 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3373) #3, !dbg !191 + br label %__nv_exp2f.exit1341, !dbg !191 + +3412: ; preds = %__nv_exp2f.exit1338 + %3413 = tail call float @llvm.nvvm.ex2.approx.f(float %3373) #3, !dbg !191 + br label %__nv_exp2f.exit1341, !dbg !191 + +__nv_exp2f.exit1341: ; preds = %3410, %3412 + %.0.i1340 = phi float [ %3411, %3410 ], [ %3413, %3412 ], !dbg !191 + %3414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1342 = icmp eq i32 %3414, 0, !dbg !191 + br i1 %.not.i1342, label %3417, label %3415, !dbg !191 + +3415: ; preds = %__nv_exp2f.exit1341 + %3416 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3374) #3, !dbg !191 + br label %__nv_exp2f.exit1344, !dbg !191 + +3417: ; preds = %__nv_exp2f.exit1341 + %3418 = tail call float @llvm.nvvm.ex2.approx.f(float %3374) #3, !dbg !191 + br label %__nv_exp2f.exit1344, !dbg !191 + +__nv_exp2f.exit1344: ; preds = %3415, %3417 + %.0.i1343 = phi float [ %3416, %3415 ], [ %3418, %3417 ], !dbg !191 + %3419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1345 = icmp eq i32 %3419, 0, !dbg !191 + br i1 %.not.i1345, label %3422, label %3420, !dbg !191 + +3420: ; preds = %__nv_exp2f.exit1344 + %3421 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3375) #3, !dbg !191 + br label %__nv_exp2f.exit1347, !dbg !191 + +3422: ; preds = %__nv_exp2f.exit1344 + %3423 = tail call float @llvm.nvvm.ex2.approx.f(float %3375) #3, !dbg !191 + br label %__nv_exp2f.exit1347, !dbg !191 + +__nv_exp2f.exit1347: ; preds = %3420, %3422 + %.0.i1346 = phi float [ %3421, %3420 ], [ %3423, %3422 ], !dbg !191 + %3424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1348 = icmp eq i32 %3424, 0, !dbg !191 + br i1 %.not.i1348, label %3427, label %3425, !dbg !191 + +3425: ; preds = %__nv_exp2f.exit1347 + %3426 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3376) #3, !dbg !191 + br label %__nv_exp2f.exit1350, !dbg !191 + +3427: ; preds = %__nv_exp2f.exit1347 + %3428 = tail call float @llvm.nvvm.ex2.approx.f(float %3376) #3, !dbg !191 + br label %__nv_exp2f.exit1350, !dbg !191 + +__nv_exp2f.exit1350: ; preds = %3425, %3427 + %.0.i1349 = phi float [ %3426, %3425 ], [ %3428, %3427 ], !dbg !191 + %3429 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1351 = icmp eq i32 %3429, 0, !dbg !191 + br i1 %.not.i1351, label %3432, label %3430, !dbg !191 + +3430: ; preds = %__nv_exp2f.exit1350 + %3431 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3377) #3, !dbg !191 + br label %__nv_exp2f.exit1353, !dbg !191 + +3432: ; preds = %__nv_exp2f.exit1350 + %3433 = tail call float @llvm.nvvm.ex2.approx.f(float %3377) #3, !dbg !191 + br label %__nv_exp2f.exit1353, !dbg !191 + +__nv_exp2f.exit1353: ; preds = %3430, %3432 + %.0.i1352 = phi float [ %3431, %3430 ], [ %3433, %3432 ], !dbg !191 + %3434 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1354 = icmp eq i32 %3434, 0, !dbg !191 + br i1 %.not.i1354, label %3437, label %3435, !dbg !191 + +3435: ; preds = %__nv_exp2f.exit1353 + %3436 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3378) #3, !dbg !191 + br label %__nv_exp2f.exit1356, !dbg !191 + +3437: ; preds = %__nv_exp2f.exit1353 + %3438 = tail call float @llvm.nvvm.ex2.approx.f(float %3378) #3, !dbg !191 + br label %__nv_exp2f.exit1356, !dbg !191 + +__nv_exp2f.exit1356: ; preds = %3435, %3437 + %.0.i1355 = phi float [ %3436, %3435 ], [ %3438, %3437 ], !dbg !191 + %3439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1357 = icmp eq i32 %3439, 0, !dbg !191 + br i1 %.not.i1357, label %3442, label %3440, !dbg !191 + +3440: ; preds = %__nv_exp2f.exit1356 + %3441 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3379) #3, !dbg !191 + br label %__nv_exp2f.exit1359, !dbg !191 + +3442: ; preds = %__nv_exp2f.exit1356 + %3443 = tail call float @llvm.nvvm.ex2.approx.f(float %3379) #3, !dbg !191 + br label %__nv_exp2f.exit1359, !dbg !191 + +__nv_exp2f.exit1359: ; preds = %3440, %3442 + %.0.i1358 = phi float [ %3441, %3440 ], [ %3443, %3442 ], !dbg !191 + %3444 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1360 = icmp eq i32 %3444, 0, !dbg !191 + br i1 %.not.i1360, label %3447, label %3445, !dbg !191 + +3445: ; preds = %__nv_exp2f.exit1359 + %3446 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3380) #3, !dbg !191 + br label %__nv_exp2f.exit1362, !dbg !191 + +3447: ; preds = %__nv_exp2f.exit1359 + %3448 = tail call float @llvm.nvvm.ex2.approx.f(float %3380) #3, !dbg !191 + br label %__nv_exp2f.exit1362, !dbg !191 + +__nv_exp2f.exit1362: ; preds = %3445, %3447 + %.0.i1361 = phi float [ %3446, %3445 ], [ %3448, %3447 ], !dbg !191 + %3449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1363 = icmp eq i32 %3449, 0, !dbg !191 + br i1 %.not.i1363, label %3452, label %3450, !dbg !191 + +3450: ; preds = %__nv_exp2f.exit1362 + %3451 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3381) #3, !dbg !191 + br label %__nv_exp2f.exit1365, !dbg !191 + +3452: ; preds = %__nv_exp2f.exit1362 + %3453 = tail call float @llvm.nvvm.ex2.approx.f(float %3381) #3, !dbg !191 + br label %__nv_exp2f.exit1365, !dbg !191 + +__nv_exp2f.exit1365: ; preds = %3450, %3452 + %.0.i1364 = phi float [ %3451, %3450 ], [ %3453, %3452 ], !dbg !191 + %3454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1366 = icmp eq i32 %3454, 0, !dbg !191 + br i1 %.not.i1366, label %3457, label %3455, !dbg !191 + +3455: ; preds = %__nv_exp2f.exit1365 + %3456 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3382) #3, !dbg !191 + br label %__nv_exp2f.exit1368, !dbg !191 + +3457: ; preds = %__nv_exp2f.exit1365 + %3458 = tail call float @llvm.nvvm.ex2.approx.f(float %3382) #3, !dbg !191 + br label %__nv_exp2f.exit1368, !dbg !191 + +__nv_exp2f.exit1368: ; preds = %3455, %3457 + %.0.i1367 = phi float [ %3456, %3455 ], [ %3458, %3457 ], !dbg !191 + %3459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1369 = icmp eq i32 %3459, 0, !dbg !191 + br i1 %.not.i1369, label %3462, label %3460, !dbg !191 + +3460: ; preds = %__nv_exp2f.exit1368 + %3461 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3383) #3, !dbg !191 + br label %__nv_exp2f.exit1371, !dbg !191 + +3462: ; preds = %__nv_exp2f.exit1368 + %3463 = tail call float @llvm.nvvm.ex2.approx.f(float %3383) #3, !dbg !191 + br label %__nv_exp2f.exit1371, !dbg !191 + +__nv_exp2f.exit1371: ; preds = %3460, %3462 + %.0.i1370 = phi float [ %3461, %3460 ], [ %3463, %3462 ], !dbg !191 + %3464 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1372 = icmp eq i32 %3464, 0, !dbg !191 + br i1 %.not.i1372, label %3467, label %3465, !dbg !191 + +3465: ; preds = %__nv_exp2f.exit1371 + %3466 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3384) #3, !dbg !191 + br label %__nv_exp2f.exit1374, !dbg !191 + +3467: ; preds = %__nv_exp2f.exit1371 + %3468 = tail call float @llvm.nvvm.ex2.approx.f(float %3384) #3, !dbg !191 + br label %__nv_exp2f.exit1374, !dbg !191 + +__nv_exp2f.exit1374: ; preds = %3465, %3467 + %.0.i1373 = phi float [ %3466, %3465 ], [ %3468, %3467 ], !dbg !191 + %3469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1375 = icmp eq i32 %3469, 0, !dbg !191 + br i1 %.not.i1375, label %3472, label %3470, !dbg !191 + +3470: ; preds = %__nv_exp2f.exit1374 + %3471 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3385) #3, !dbg !191 + br label %__nv_exp2f.exit1377, !dbg !191 + +3472: ; preds = %__nv_exp2f.exit1374 + %3473 = tail call float @llvm.nvvm.ex2.approx.f(float %3385) #3, !dbg !191 + br label %__nv_exp2f.exit1377, !dbg !191 + +__nv_exp2f.exit1377: ; preds = %3470, %3472 + %.0.i1376 = phi float [ %3471, %3470 ], [ %3473, %3472 ], !dbg !191 + %3474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1378 = icmp eq i32 %3474, 0, !dbg !191 + br i1 %.not.i1378, label %3477, label %3475, !dbg !191 + +3475: ; preds = %__nv_exp2f.exit1377 + %3476 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3386) #3, !dbg !191 + br label %__nv_exp2f.exit1380, !dbg !191 + +3477: ; preds = %__nv_exp2f.exit1377 + %3478 = tail call float @llvm.nvvm.ex2.approx.f(float %3386) #3, !dbg !191 + br label %__nv_exp2f.exit1380, !dbg !191 + +__nv_exp2f.exit1380: ; preds = %3475, %3477 + %.0.i1379 = phi float [ %3476, %3475 ], [ %3478, %3477 ], !dbg !191 + %3479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1381 = icmp eq i32 %3479, 0, !dbg !191 + br i1 %.not.i1381, label %3482, label %3480, !dbg !191 + +3480: ; preds = %__nv_exp2f.exit1380 + %3481 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3387) #3, !dbg !191 + br label %__nv_exp2f.exit1383, !dbg !191 + +3482: ; preds = %__nv_exp2f.exit1380 + %3483 = tail call float @llvm.nvvm.ex2.approx.f(float %3387) #3, !dbg !191 + br label %__nv_exp2f.exit1383, !dbg !191 + +__nv_exp2f.exit1383: ; preds = %3480, %3482 + %.0.i1382 = phi float [ %3481, %3480 ], [ %3483, %3482 ], !dbg !191 + %3484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1384 = icmp eq i32 %3484, 0, !dbg !191 + br i1 %.not.i1384, label %3487, label %3485, !dbg !191 + +3485: ; preds = %__nv_exp2f.exit1383 + %3486 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3388) #3, !dbg !191 + br label %__nv_exp2f.exit1386, !dbg !191 + +3487: ; preds = %__nv_exp2f.exit1383 + %3488 = tail call float @llvm.nvvm.ex2.approx.f(float %3388) #3, !dbg !191 + br label %__nv_exp2f.exit1386, !dbg !191 + +__nv_exp2f.exit1386: ; preds = %3485, %3487 + %.0.i1385 = phi float [ %3486, %3485 ], [ %3488, %3487 ], !dbg !191 + %3489 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1387 = icmp eq i32 %3489, 0, !dbg !191 + br i1 %.not.i1387, label %3492, label %3490, !dbg !191 + +3490: ; preds = %__nv_exp2f.exit1386 + %3491 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3389) #3, !dbg !191 + br label %__nv_exp2f.exit1389, !dbg !191 + +3492: ; preds = %__nv_exp2f.exit1386 + %3493 = tail call float @llvm.nvvm.ex2.approx.f(float %3389) #3, !dbg !191 + br label %__nv_exp2f.exit1389, !dbg !191 + +__nv_exp2f.exit1389: ; preds = %3490, %3492 + %.0.i1388 = phi float [ %3491, %3490 ], [ %3493, %3492 ], !dbg !191 + %3494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1390 = icmp eq i32 %3494, 0, !dbg !191 + br i1 %.not.i1390, label %3497, label %3495, !dbg !191 + +3495: ; preds = %__nv_exp2f.exit1389 + %3496 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3390) #3, !dbg !191 + br label %__nv_exp2f.exit1392, !dbg !191 + +3497: ; preds = %__nv_exp2f.exit1389 + %3498 = tail call float @llvm.nvvm.ex2.approx.f(float %3390) #3, !dbg !191 + br label %__nv_exp2f.exit1392, !dbg !191 + +__nv_exp2f.exit1392: ; preds = %3495, %3497 + %.0.i1391 = phi float [ %3496, %3495 ], [ %3498, %3497 ], !dbg !191 + %3499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1393 = icmp eq i32 %3499, 0, !dbg !191 + br i1 %.not.i1393, label %3502, label %3500, !dbg !191 + +3500: ; preds = %__nv_exp2f.exit1392 + %3501 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3391) #3, !dbg !191 + br label %__nv_exp2f.exit1395, !dbg !191 + +3502: ; preds = %__nv_exp2f.exit1392 + %3503 = tail call float @llvm.nvvm.ex2.approx.f(float %3391) #3, !dbg !191 + br label %__nv_exp2f.exit1395, !dbg !191 + +__nv_exp2f.exit1395: ; preds = %3500, %3502 + %.0.i1394 = phi float [ %3501, %3500 ], [ %3503, %3502 ], !dbg !191 + %3504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1396 = icmp eq i32 %3504, 0, !dbg !191 + br i1 %.not.i1396, label %3507, label %3505, !dbg !191 + +3505: ; preds = %__nv_exp2f.exit1395 + %3506 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3392) #3, !dbg !191 + br label %__nv_exp2f.exit1398, !dbg !191 + +3507: ; preds = %__nv_exp2f.exit1395 + %3508 = tail call float @llvm.nvvm.ex2.approx.f(float %3392) #3, !dbg !191 + br label %__nv_exp2f.exit1398, !dbg !191 + +__nv_exp2f.exit1398: ; preds = %3505, %3507 + %.0.i1397 = phi float [ %3506, %3505 ], [ %3508, %3507 ], !dbg !191 + %3509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1399 = icmp eq i32 %3509, 0, !dbg !191 + br i1 %.not.i1399, label %3512, label %3510, !dbg !191 + +3510: ; preds = %__nv_exp2f.exit1398 + %3511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3393) #3, !dbg !191 + br label %__nv_exp2f.exit1401, !dbg !191 + +3512: ; preds = %__nv_exp2f.exit1398 + %3513 = tail call float @llvm.nvvm.ex2.approx.f(float %3393) #3, !dbg !191 + br label %__nv_exp2f.exit1401, !dbg !191 + +__nv_exp2f.exit1401: ; preds = %3510, %3512 + %.0.i1400 = phi float [ %3511, %3510 ], [ %3513, %3512 ], !dbg !191 + %3514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1402 = icmp eq i32 %3514, 0, !dbg !191 + br i1 %.not.i1402, label %3517, label %3515, !dbg !191 + +3515: ; preds = %__nv_exp2f.exit1401 + %3516 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3394) #3, !dbg !191 + br label %__nv_exp2f.exit1404, !dbg !191 + +3517: ; preds = %__nv_exp2f.exit1401 + %3518 = tail call float @llvm.nvvm.ex2.approx.f(float %3394) #3, !dbg !191 + br label %__nv_exp2f.exit1404, !dbg !191 + +__nv_exp2f.exit1404: ; preds = %3515, %3517 + %.0.i1403 = phi float [ %3516, %3515 ], [ %3518, %3517 ], !dbg !191 + %3519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1405 = icmp eq i32 %3519, 0, !dbg !191 + br i1 %.not.i1405, label %3522, label %3520, !dbg !191 + +3520: ; preds = %__nv_exp2f.exit1404 + %3521 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3395) #3, !dbg !191 + br label %__nv_exp2f.exit1407, !dbg !191 + +3522: ; preds = %__nv_exp2f.exit1404 + %3523 = tail call float @llvm.nvvm.ex2.approx.f(float %3395) #3, !dbg !191 + br label %__nv_exp2f.exit1407, !dbg !191 + +__nv_exp2f.exit1407: ; preds = %3520, %3522 + %.0.i1406 = phi float [ %3521, %3520 ], [ %3523, %3522 ], !dbg !191 + %3524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1408 = icmp eq i32 %3524, 0, !dbg !191 + br i1 %.not.i1408, label %3527, label %3525, !dbg !191 + +3525: ; preds = %__nv_exp2f.exit1407 + %3526 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3396) #3, !dbg !191 + br label %__nv_exp2f.exit1410, !dbg !191 + +3527: ; preds = %__nv_exp2f.exit1407 + %3528 = tail call float @llvm.nvvm.ex2.approx.f(float %3396) #3, !dbg !191 + br label %__nv_exp2f.exit1410, !dbg !191 + +__nv_exp2f.exit1410: ; preds = %3525, %3527 + %.0.i1409 = phi float [ %3526, %3525 ], [ %3528, %3527 ], !dbg !191 + %3529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1411 = icmp eq i32 %3529, 0, !dbg !191 + br i1 %.not.i1411, label %3532, label %3530, !dbg !191 + +3530: ; preds = %__nv_exp2f.exit1410 + %3531 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3397) #3, !dbg !191 + br label %__nv_exp2f.exit1413, !dbg !191 + +3532: ; preds = %__nv_exp2f.exit1410 + %3533 = tail call float @llvm.nvvm.ex2.approx.f(float %3397) #3, !dbg !191 + br label %__nv_exp2f.exit1413, !dbg !191 + +__nv_exp2f.exit1413: ; preds = %3530, %3532 + %.0.i1412 = phi float [ %3531, %3530 ], [ %3533, %3532 ], !dbg !191 + %3534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1414 = icmp eq i32 %3534, 0, !dbg !191 + br i1 %.not.i1414, label %3537, label %3535, !dbg !191 + +3535: ; preds = %__nv_exp2f.exit1413 + %3536 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3398) #3, !dbg !191 + br label %__nv_exp2f.exit1416, !dbg !191 + +3537: ; preds = %__nv_exp2f.exit1413 + %3538 = tail call float @llvm.nvvm.ex2.approx.f(float %3398) #3, !dbg !191 + br label %__nv_exp2f.exit1416, !dbg !191 + +__nv_exp2f.exit1416: ; preds = %3535, %3537 + %.0.i1415 = phi float [ %3536, %3535 ], [ %3538, %3537 ], !dbg !191 + %3539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1417 = icmp eq i32 %3539, 0, !dbg !191 + br i1 %.not.i1417, label %3542, label %3540, !dbg !191 + +3540: ; preds = %__nv_exp2f.exit1416 + %3541 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3399) #3, !dbg !191 + br label %__nv_exp2f.exit1419, !dbg !191 + +3542: ; preds = %__nv_exp2f.exit1416 + %3543 = tail call float @llvm.nvvm.ex2.approx.f(float %3399) #3, !dbg !191 + br label %__nv_exp2f.exit1419, !dbg !191 + +__nv_exp2f.exit1419: ; preds = %3540, %3542 + %.0.i1418 = phi float [ %3541, %3540 ], [ %3543, %3542 ], !dbg !191 + %3544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1420 = icmp eq i32 %3544, 0, !dbg !191 + br i1 %.not.i1420, label %3547, label %3545, !dbg !191 + +3545: ; preds = %__nv_exp2f.exit1419 + %3546 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3400) #3, !dbg !191 + br label %__nv_exp2f.exit1422, !dbg !191 + +3547: ; preds = %__nv_exp2f.exit1419 + %3548 = tail call float @llvm.nvvm.ex2.approx.f(float %3400) #3, !dbg !191 + br label %__nv_exp2f.exit1422, !dbg !191 + +__nv_exp2f.exit1422: ; preds = %3545, %3547 + %.0.i1421 = phi float [ %3546, %3545 ], [ %3548, %3547 ], !dbg !191 + %3549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1423 = icmp eq i32 %3549, 0, !dbg !191 + br i1 %.not.i1423, label %3552, label %3550, !dbg !191 + +3550: ; preds = %__nv_exp2f.exit1422 + %3551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3401) #3, !dbg !191 + br label %__nv_exp2f.exit1425, !dbg !191 + +3552: ; preds = %__nv_exp2f.exit1422 + %3553 = tail call float @llvm.nvvm.ex2.approx.f(float %3401) #3, !dbg !191 + br label %__nv_exp2f.exit1425, !dbg !191 + +__nv_exp2f.exit1425: ; preds = %3550, %3552 + %.0.i1424 = phi float [ %3551, %3550 ], [ %3553, %3552 ], !dbg !191 + %3554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1426 = icmp eq i32 %3554, 0, !dbg !191 + br i1 %.not.i1426, label %3557, label %3555, !dbg !191 + +3555: ; preds = %__nv_exp2f.exit1425 + %3556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3402) #3, !dbg !191 + br label %__nv_exp2f.exit1428, !dbg !191 + +3557: ; preds = %__nv_exp2f.exit1425 + %3558 = tail call float @llvm.nvvm.ex2.approx.f(float %3402) #3, !dbg !191 + br label %__nv_exp2f.exit1428, !dbg !191 + +__nv_exp2f.exit1428: ; preds = %3555, %3557 + %.0.i1427 = phi float [ %3556, %3555 ], [ %3558, %3557 ], !dbg !191 + %3559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !191 + %.not.i1429 = icmp eq i32 %3559, 0, !dbg !191 + br i1 %.not.i1429, label %3562, label %3560, !dbg !191 + +3560: ; preds = %__nv_exp2f.exit1428 + %3561 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3403) #3, !dbg !191 + br label %__nv_exp2f.exit1431, !dbg !191 + +3562: ; preds = %__nv_exp2f.exit1428 + %3563 = tail call float @llvm.nvvm.ex2.approx.f(float %3403) #3, !dbg !191 + br label %__nv_exp2f.exit1431, !dbg !191 + +__nv_exp2f.exit1431: ; preds = %3560, %3562 + %.0.i1430 = phi float [ %3561, %3560 ], [ %3563, %3562 ], !dbg !191 + %3564 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2887, !dbg !182 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !192 + %3565 = add i32 %2891, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3566 = lshr exact i32 %3565, 4, !dbg !192 + %3567 = and i32 %3566, 16383, !dbg !192 + %3568 = zext nneg i32 %3567 to i64, !dbg !192 + %3569 = or disjoint i64 %3568, 4611686293372403712, !dbg !192 + %3570 = ptrtoint ptr addrspace(3) %3564 to i32, !dbg !192 + %3571 = lshr exact i32 %3570, 4, !dbg !192 + %3572 = and i32 %3571, 16383, !dbg !192 + %3573 = zext nneg i32 %3572 to i64, !dbg !192 + %3574 = or disjoint i64 %3573, 4611686293338849280, !dbg !192 + %3575 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %3569, i64 %3574) #3, !dbg !192 + %3576 = add i32 %2903, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3577 = lshr exact i32 %3576, 4, !dbg !192 + %3578 = and i32 %3577, 16383, !dbg !192 + %3579 = zext nneg i32 %3578 to i64, !dbg !192 + %3580 = or disjoint i64 %3579, 4611686293372403712, !dbg !192 + %3581 = add i32 %3570, 32, !dbg !192 + %3582 = lshr exact i32 %3581, 4, !dbg !192 + %3583 = and i32 %3582, 16383, !dbg !192 + %3584 = zext nneg i32 %3583 to i64, !dbg !192 + %3585 = or disjoint i64 %3584, 4611686293338849280, !dbg !192 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 0, !dbg !192 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 1, !dbg !192 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 2, !dbg !192 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 3, !dbg !192 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 4, !dbg !192 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 5, !dbg !192 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 6, !dbg !192 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 7, !dbg !192 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 8, !dbg !192 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 9, !dbg !192 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 10, !dbg !192 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 11, !dbg !192 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 12, !dbg !192 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 13, !dbg !192 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 14, !dbg !192 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 15, !dbg !192 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 16, !dbg !192 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 17, !dbg !192 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 18, !dbg !192 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 19, !dbg !192 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 20, !dbg !192 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 21, !dbg !192 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 22, !dbg !192 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 23, !dbg !192 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 24, !dbg !192 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 25, !dbg !192 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 26, !dbg !192 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 27, !dbg !192 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 28, !dbg !192 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 29, !dbg !192 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 30, !dbg !192 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3575, 31, !dbg !192 + %3618 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3586, float %3587, float %3588, float %3589, float %3590, float %3591, float %3592, float %3593, float %3594, float %3595, float %3596, float %3597, float %3598, float %3599, float %3600, float %3601, float %3602, float %3603, float %3604, float %3605, float %3606, float %3607, float %3608, float %3609, float %3610, float %3611, float %3612, float %3613, float %3614, float %3615, float %3616, float %3617, i64 %3580, i64 %3585, i1 true) #3, !dbg !192 + %3619 = add i32 %2947, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3620 = lshr exact i32 %3619, 4, !dbg !192 + %3621 = and i32 %3620, 16383, !dbg !192 + %3622 = zext nneg i32 %3621 to i64, !dbg !192 + %3623 = or disjoint i64 %3622, 4611686293372403712, !dbg !192 + %3624 = add i32 %3570, 64, !dbg !192 + %3625 = lshr exact i32 %3624, 4, !dbg !192 + %3626 = and i32 %3625, 16383, !dbg !192 + %3627 = zext nneg i32 %3626 to i64, !dbg !192 + %3628 = or disjoint i64 %3627, 4611686293338849280, !dbg !192 + %3629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 0, !dbg !192 + %3630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 1, !dbg !192 + %3631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 2, !dbg !192 + %3632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 3, !dbg !192 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 4, !dbg !192 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 5, !dbg !192 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 6, !dbg !192 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 7, !dbg !192 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 8, !dbg !192 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 9, !dbg !192 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 10, !dbg !192 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 11, !dbg !192 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 12, !dbg !192 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 13, !dbg !192 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 14, !dbg !192 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 15, !dbg !192 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 16, !dbg !192 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 17, !dbg !192 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 18, !dbg !192 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 19, !dbg !192 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 20, !dbg !192 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 21, !dbg !192 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 22, !dbg !192 + %3652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 23, !dbg !192 + %3653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 24, !dbg !192 + %3654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 25, !dbg !192 + %3655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 26, !dbg !192 + %3656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 27, !dbg !192 + %3657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 28, !dbg !192 + %3658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 29, !dbg !192 + %3659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 30, !dbg !192 + %3660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3618, 31, !dbg !192 + %3661 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3629, float %3630, float %3631, float %3632, float %3633, float %3634, float %3635, float %3636, float %3637, float %3638, float %3639, float %3640, float %3641, float %3642, float %3643, float %3644, float %3645, float %3646, float %3647, float %3648, float %3649, float %3650, float %3651, float %3652, float %3653, float %3654, float %3655, float %3656, float %3657, float %3658, float %3659, float %3660, i64 %3623, i64 %3628, i1 true) #3, !dbg !192 + %3662 = add i32 %2991, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3663 = lshr exact i32 %3662, 4, !dbg !192 + %3664 = and i32 %3663, 16383, !dbg !192 + %3665 = zext nneg i32 %3664 to i64, !dbg !192 + %3666 = or disjoint i64 %3665, 4611686293372403712, !dbg !192 + %3667 = add i32 %3570, 96, !dbg !192 + %3668 = lshr exact i32 %3667, 4, !dbg !192 + %3669 = and i32 %3668, 16383, !dbg !192 + %3670 = zext nneg i32 %3669 to i64, !dbg !192 + %3671 = or disjoint i64 %3670, 4611686293338849280, !dbg !192 + %3672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 0, !dbg !192 + %3673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 1, !dbg !192 + %3674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 2, !dbg !192 + %3675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 3, !dbg !192 + %3676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 4, !dbg !192 + %3677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 5, !dbg !192 + %3678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 6, !dbg !192 + %3679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 7, !dbg !192 + %3680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 8, !dbg !192 + %3681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 9, !dbg !192 + %3682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 10, !dbg !192 + %3683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 11, !dbg !192 + %3684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 12, !dbg !192 + %3685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 13, !dbg !192 + %3686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 14, !dbg !192 + %3687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 15, !dbg !192 + %3688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 16, !dbg !192 + %3689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 17, !dbg !192 + %3690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 18, !dbg !192 + %3691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 19, !dbg !192 + %3692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 20, !dbg !192 + %3693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 21, !dbg !192 + %3694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 22, !dbg !192 + %3695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 23, !dbg !192 + %3696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 24, !dbg !192 + %3697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 25, !dbg !192 + %3698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 26, !dbg !192 + %3699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 27, !dbg !192 + %3700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 28, !dbg !192 + %3701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 29, !dbg !192 + %3702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 30, !dbg !192 + %3703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3661, 31, !dbg !192 + %3704 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3672, float %3673, float %3674, float %3675, float %3676, float %3677, float %3678, float %3679, float %3680, float %3681, float %3682, float %3683, float %3684, float %3685, float %3686, float %3687, float %3688, float %3689, float %3690, float %3691, float %3692, float %3693, float %3694, float %3695, float %3696, float %3697, float %3698, float %3699, float %3700, float %3701, float %3702, float %3703, i64 %3666, i64 %3671, i1 true) #3, !dbg !192 + %3705 = add i32 %3035, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3706 = lshr exact i32 %3705, 4, !dbg !192 + %3707 = and i32 %3706, 16383, !dbg !192 + %3708 = zext nneg i32 %3707 to i64, !dbg !192 + %3709 = or disjoint i64 %3708, 4611686293372403712, !dbg !192 + %3710 = add i32 %3570, 8192, !dbg !192 + %3711 = lshr exact i32 %3710, 4, !dbg !192 + %3712 = and i32 %3711, 16383, !dbg !192 + %3713 = zext nneg i32 %3712 to i64, !dbg !192 + %3714 = or disjoint i64 %3713, 4611686293338849280, !dbg !192 + %3715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 0, !dbg !192 + %3716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 1, !dbg !192 + %3717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 2, !dbg !192 + %3718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 3, !dbg !192 + %3719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 4, !dbg !192 + %3720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 5, !dbg !192 + %3721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 6, !dbg !192 + %3722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 7, !dbg !192 + %3723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 8, !dbg !192 + %3724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 9, !dbg !192 + %3725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 10, !dbg !192 + %3726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 11, !dbg !192 + %3727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 12, !dbg !192 + %3728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 13, !dbg !192 + %3729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 14, !dbg !192 + %3730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 15, !dbg !192 + %3731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 16, !dbg !192 + %3732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 17, !dbg !192 + %3733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 18, !dbg !192 + %3734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 19, !dbg !192 + %3735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 20, !dbg !192 + %3736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 21, !dbg !192 + %3737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 22, !dbg !192 + %3738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 23, !dbg !192 + %3739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 24, !dbg !192 + %3740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 25, !dbg !192 + %3741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 26, !dbg !192 + %3742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 27, !dbg !192 + %3743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 28, !dbg !192 + %3744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 29, !dbg !192 + %3745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 30, !dbg !192 + %3746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3704, 31, !dbg !192 + %3747 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3715, float %3716, float %3717, float %3718, float %3719, float %3720, float %3721, float %3722, float %3723, float %3724, float %3725, float %3726, float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, i64 %3709, i64 %3714, i1 true) #3, !dbg !192 + %3748 = add i32 %3079, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3749 = lshr exact i32 %3748, 4, !dbg !192 + %3750 = and i32 %3749, 16383, !dbg !192 + %3751 = zext nneg i32 %3750 to i64, !dbg !192 + %3752 = or disjoint i64 %3751, 4611686293372403712, !dbg !192 + %3753 = add i32 %3570, 8224, !dbg !192 + %3754 = lshr exact i32 %3753, 4, !dbg !192 + %3755 = and i32 %3754, 16383, !dbg !192 + %3756 = zext nneg i32 %3755 to i64, !dbg !192 + %3757 = or disjoint i64 %3756, 4611686293338849280, !dbg !192 + %3758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 0, !dbg !192 + %3759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 1, !dbg !192 + %3760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 2, !dbg !192 + %3761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 3, !dbg !192 + %3762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 4, !dbg !192 + %3763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 5, !dbg !192 + %3764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 6, !dbg !192 + %3765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 7, !dbg !192 + %3766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 8, !dbg !192 + %3767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 9, !dbg !192 + %3768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 10, !dbg !192 + %3769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 11, !dbg !192 + %3770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 12, !dbg !192 + %3771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 13, !dbg !192 + %3772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 14, !dbg !192 + %3773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 15, !dbg !192 + %3774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 16, !dbg !192 + %3775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 17, !dbg !192 + %3776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 18, !dbg !192 + %3777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 19, !dbg !192 + %3778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 20, !dbg !192 + %3779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 21, !dbg !192 + %3780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 22, !dbg !192 + %3781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 23, !dbg !192 + %3782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 24, !dbg !192 + %3783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 25, !dbg !192 + %3784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 26, !dbg !192 + %3785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 27, !dbg !192 + %3786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 28, !dbg !192 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 29, !dbg !192 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 30, !dbg !192 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3747, 31, !dbg !192 + %3790 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778, float %3779, float %3780, float %3781, float %3782, float %3783, float %3784, float %3785, float %3786, float %3787, float %3788, float %3789, i64 %3752, i64 %3757, i1 true) #3, !dbg !192 + %3791 = add i32 %3123, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3792 = lshr exact i32 %3791, 4, !dbg !192 + %3793 = and i32 %3792, 16383, !dbg !192 + %3794 = zext nneg i32 %3793 to i64, !dbg !192 + %3795 = or disjoint i64 %3794, 4611686293372403712, !dbg !192 + %3796 = add i32 %3570, 8256, !dbg !192 + %3797 = lshr exact i32 %3796, 4, !dbg !192 + %3798 = and i32 %3797, 16383, !dbg !192 + %3799 = zext nneg i32 %3798 to i64, !dbg !192 + %3800 = or disjoint i64 %3799, 4611686293338849280, !dbg !192 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 0, !dbg !192 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 1, !dbg !192 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 2, !dbg !192 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 3, !dbg !192 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 4, !dbg !192 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 5, !dbg !192 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 6, !dbg !192 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 7, !dbg !192 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 8, !dbg !192 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 9, !dbg !192 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 10, !dbg !192 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 11, !dbg !192 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 12, !dbg !192 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 13, !dbg !192 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 14, !dbg !192 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 15, !dbg !192 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 16, !dbg !192 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 17, !dbg !192 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 18, !dbg !192 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 19, !dbg !192 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 20, !dbg !192 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 21, !dbg !192 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 22, !dbg !192 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 23, !dbg !192 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 24, !dbg !192 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 25, !dbg !192 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 26, !dbg !192 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 27, !dbg !192 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 28, !dbg !192 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 29, !dbg !192 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 30, !dbg !192 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3790, 31, !dbg !192 + %3833 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3801, float %3802, float %3803, float %3804, float %3805, float %3806, float %3807, float %3808, float %3809, float %3810, float %3811, float %3812, float %3813, float %3814, float %3815, float %3816, float %3817, float %3818, float %3819, float %3820, float %3821, float %3822, float %3823, float %3824, float %3825, float %3826, float %3827, float %3828, float %3829, float %3830, float %3831, float %3832, i64 %3795, i64 %3800, i1 true) #3, !dbg !192 + %3834 = add i32 %3167, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !192 + %3835 = lshr exact i32 %3834, 4, !dbg !192 + %3836 = and i32 %3835, 16383, !dbg !192 + %3837 = zext nneg i32 %3836 to i64, !dbg !192 + %3838 = or disjoint i64 %3837, 4611686293372403712, !dbg !192 + %3839 = add i32 %3570, 8288, !dbg !192 + %3840 = lshr exact i32 %3839, 4, !dbg !192 + %3841 = and i32 %3840, 16383, !dbg !192 + %3842 = zext nneg i32 %3841 to i64, !dbg !192 + %3843 = or disjoint i64 %3842, 4611686293338849280, !dbg !192 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 0, !dbg !192 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 1, !dbg !192 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 2, !dbg !192 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 3, !dbg !192 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 4, !dbg !192 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 5, !dbg !192 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 6, !dbg !192 + %3851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 7, !dbg !192 + %3852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 8, !dbg !192 + %3853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 9, !dbg !192 + %3854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 10, !dbg !192 + %3855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 11, !dbg !192 + %3856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 12, !dbg !192 + %3857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 13, !dbg !192 + %3858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 14, !dbg !192 + %3859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 15, !dbg !192 + %3860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 16, !dbg !192 + %3861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 17, !dbg !192 + %3862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 18, !dbg !192 + %3863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 19, !dbg !192 + %3864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 20, !dbg !192 + %3865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 21, !dbg !192 + %3866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 22, !dbg !192 + %3867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 23, !dbg !192 + %3868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 24, !dbg !192 + %3869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 25, !dbg !192 + %3870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 26, !dbg !192 + %3871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 27, !dbg !192 + %3872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 28, !dbg !192 + %3873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 29, !dbg !192 + %3874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 30, !dbg !192 + %3875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3833, 31, !dbg !192 + %3876 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3844, float %3845, float %3846, float %3847, float %3848, float %3849, float %3850, float %3851, float %3852, float %3853, float %3854, float %3855, float %3856, float %3857, float %3858, float %3859, float %3860, float %3861, float %3862, float %3863, float %3864, float %3865, float %3866, float %3867, float %3868, float %3869, float %3870, float %3871, float %3872, float %3873, float %3874, float %3875, i64 %3838, i64 %3843, i1 true) #3, !dbg !192 + %3877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 0, !dbg !192 + %3878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 1, !dbg !192 + %3879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 2, !dbg !192 + %3880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 3, !dbg !192 + %3881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 4, !dbg !192 + %3882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 5, !dbg !192 + %3883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 6, !dbg !192 + %3884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 7, !dbg !192 + %3885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 8, !dbg !192 + %3886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 9, !dbg !192 + %3887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 10, !dbg !192 + %3888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 11, !dbg !192 + %3889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 12, !dbg !192 + %3890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 13, !dbg !192 + %3891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 14, !dbg !192 + %3892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 15, !dbg !192 + %3893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 16, !dbg !192 + %3894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 17, !dbg !192 + %3895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 18, !dbg !192 + %3896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 19, !dbg !192 + %3897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 20, !dbg !192 + %3898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 21, !dbg !192 + %3899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 22, !dbg !192 + %3900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 23, !dbg !192 + %3901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 24, !dbg !192 + %3902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 25, !dbg !192 + %3903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 26, !dbg !192 + %3904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 27, !dbg !192 + %3905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 28, !dbg !192 + %3906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 29, !dbg !192 + %3907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 30, !dbg !192 + %3908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3876, 31, !dbg !192 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !192 + %3909 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3877, float %3878, float %3879, float %3880, float %3881, float %3882, float %3883, float %3884, float %3885, float %3886, float %3887, float %3888, float %3889, float %3890, float %3891, float %3892, float %3893, float %3894, float %3895, float %3896, float %3897, float %3898, float %3899, float %3900, float %3901, float %3902, float %3903, float %3904, float %3905, float %3906, float %3907, float %3908, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %3564, i32 0, i32 0) #3, !dbg !192 + %3910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 0, !dbg !192 + %3911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 1, !dbg !192 + %3912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 2, !dbg !192 + %3913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 3, !dbg !192 + %3914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 4, !dbg !192 + %3915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 5, !dbg !192 + %3916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 6, !dbg !192 + %3917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 7, !dbg !192 + %3918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 8, !dbg !192 + %3919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 9, !dbg !192 + %3920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 10, !dbg !192 + %3921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 11, !dbg !192 + %3922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 12, !dbg !192 + %3923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 13, !dbg !192 + %3924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 14, !dbg !192 + %3925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 15, !dbg !192 + %3926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 16, !dbg !192 + %3927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 17, !dbg !192 + %3928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 18, !dbg !192 + %3929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 19, !dbg !192 + %3930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 20, !dbg !192 + %3931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 21, !dbg !192 + %3932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 22, !dbg !192 + %3933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 23, !dbg !192 + %3934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 24, !dbg !192 + %3935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 25, !dbg !192 + %3936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 26, !dbg !192 + %3937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 27, !dbg !192 + %3938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 28, !dbg !192 + %3939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 29, !dbg !192 + %3940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 30, !dbg !192 + %3941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3909, 31, !dbg !192 + %3942 = fsub float %3910, %370, !dbg !193 + %3943 = fsub float %3911, %370, !dbg !193 + %3944 = fsub float %3912, %372, !dbg !193 + %3945 = fsub float %3913, %372, !dbg !193 + %3946 = fsub float %3914, %370, !dbg !193 + %3947 = fsub float %3915, %370, !dbg !193 + %3948 = fsub float %3916, %372, !dbg !193 + %3949 = fsub float %3917, %372, !dbg !193 + %3950 = fsub float %3918, %370, !dbg !193 + %3951 = fsub float %3919, %370, !dbg !193 + %3952 = fsub float %3920, %372, !dbg !193 + %3953 = fsub float %3921, %372, !dbg !193 + %3954 = fsub float %3922, %370, !dbg !193 + %3955 = fsub float %3923, %370, !dbg !193 + %3956 = fsub float %3924, %372, !dbg !193 + %3957 = fsub float %3925, %372, !dbg !193 + %3958 = fsub float %3926, %370, !dbg !193 + %3959 = fsub float %3927, %370, !dbg !193 + %3960 = fsub float %3928, %372, !dbg !193 + %3961 = fsub float %3929, %372, !dbg !193 + %3962 = fsub float %3930, %370, !dbg !193 + %3963 = fsub float %3931, %370, !dbg !193 + %3964 = fsub float %3932, %372, !dbg !193 + %3965 = fsub float %3933, %372, !dbg !193 + %3966 = fsub float %3934, %370, !dbg !193 + %3967 = fsub float %3935, %370, !dbg !193 + %3968 = fsub float %3936, %372, !dbg !193 + %3969 = fsub float %3937, %372, !dbg !193 + %3970 = fsub float %3938, %370, !dbg !193 + %3971 = fsub float %3939, %370, !dbg !193 + %3972 = fsub float %3940, %372, !dbg !193 + %3973 = fsub float %3941, %372, !dbg !193 + %3974 = fmul float %.0.i1337, %3942, !dbg !194 + %3975 = fmul float %.0.i1340, %3943, !dbg !194 + %3976 = fmul float %.0.i1343, %3944, !dbg !194 + %3977 = fmul float %.0.i1346, %3945, !dbg !194 + %3978 = fmul float %.0.i1349, %3946, !dbg !194 + %3979 = fmul float %.0.i1352, %3947, !dbg !194 + %3980 = fmul float %.0.i1355, %3948, !dbg !194 + %3981 = fmul float %.0.i1358, %3949, !dbg !194 + %3982 = fmul float %.0.i1361, %3950, !dbg !194 + %3983 = fmul float %.0.i1364, %3951, !dbg !194 + %3984 = fmul float %.0.i1367, %3952, !dbg !194 + %3985 = fmul float %.0.i1370, %3953, !dbg !194 + %3986 = fmul float %.0.i1373, %3954, !dbg !194 + %3987 = fmul float %.0.i1376, %3955, !dbg !194 + %3988 = fmul float %.0.i1379, %3956, !dbg !194 + %3989 = fmul float %.0.i1382, %3957, !dbg !194 + %3990 = fmul float %.0.i1385, %3958, !dbg !194 + %3991 = fmul float %.0.i1388, %3959, !dbg !194 + %3992 = fmul float %.0.i1391, %3960, !dbg !194 + %3993 = fmul float %.0.i1394, %3961, !dbg !194 + %3994 = fmul float %.0.i1397, %3962, !dbg !194 + %3995 = fmul float %.0.i1400, %3963, !dbg !194 + %3996 = fmul float %.0.i1403, %3964, !dbg !194 + %3997 = fmul float %.0.i1406, %3965, !dbg !194 + %3998 = fmul float %.0.i1409, %3966, !dbg !194 + %3999 = fmul float %.0.i1412, %3967, !dbg !194 + %4000 = fmul float %.0.i1415, %3968, !dbg !194 + %4001 = fmul float %.0.i1418, %3969, !dbg !194 + %4002 = fmul float %.0.i1421, %3970, !dbg !194 + %4003 = fmul float %.0.i1424, %3971, !dbg !194 + %4004 = fmul float %.0.i1427, %3972, !dbg !194 + %4005 = fmul float %.0.i1430, %3973, !dbg !194 + %4006 = fptrunc float %3974 to bfloat, !dbg !195 + %4007 = select i1 %2856, bfloat %4006, bfloat 0xR0000, !dbg !196 + %4008 = fptrunc float %3975 to bfloat, !dbg !195 + %4009 = select i1 %2858, bfloat %4008, bfloat 0xR0000, !dbg !196 + %4010 = fptrunc float %3976 to bfloat, !dbg !195 + %4011 = select i1 %2856, bfloat %4010, bfloat 0xR0000, !dbg !196 + %4012 = fptrunc float %3977 to bfloat, !dbg !195 + %4013 = select i1 %2858, bfloat %4012, bfloat 0xR0000, !dbg !196 + %4014 = fptrunc float %3978 to bfloat, !dbg !195 + %4015 = select i1 %2860, bfloat %4014, bfloat 0xR0000, !dbg !196 + %4016 = fptrunc float %3979 to bfloat, !dbg !195 + %4017 = select i1 %2862, bfloat %4016, bfloat 0xR0000, !dbg !196 + %4018 = fptrunc float %3980 to bfloat, !dbg !195 + %4019 = select i1 %2860, bfloat %4018, bfloat 0xR0000, !dbg !196 + %4020 = fptrunc float %3981 to bfloat, !dbg !195 + %4021 = select i1 %2862, bfloat %4020, bfloat 0xR0000, !dbg !196 + %4022 = fptrunc float %3982 to bfloat, !dbg !195 + %4023 = select i1 %2864, bfloat %4022, bfloat 0xR0000, !dbg !196 + %4024 = fptrunc float %3983 to bfloat, !dbg !195 + %4025 = select i1 %2866, bfloat %4024, bfloat 0xR0000, !dbg !196 + %4026 = fptrunc float %3984 to bfloat, !dbg !195 + %4027 = select i1 %2864, bfloat %4026, bfloat 0xR0000, !dbg !196 + %4028 = fptrunc float %3985 to bfloat, !dbg !195 + %4029 = select i1 %2866, bfloat %4028, bfloat 0xR0000, !dbg !196 + %4030 = fptrunc float %3986 to bfloat, !dbg !195 + %4031 = select i1 %2868, bfloat %4030, bfloat 0xR0000, !dbg !196 + %4032 = fptrunc float %3987 to bfloat, !dbg !195 + %4033 = select i1 %2870, bfloat %4032, bfloat 0xR0000, !dbg !196 + %4034 = fptrunc float %3988 to bfloat, !dbg !195 + %4035 = select i1 %2868, bfloat %4034, bfloat 0xR0000, !dbg !196 + %4036 = fptrunc float %3989 to bfloat, !dbg !195 + %4037 = select i1 %2870, bfloat %4036, bfloat 0xR0000, !dbg !196 + %4038 = fptrunc float %3990 to bfloat, !dbg !195 + %4039 = select i1 %2872, bfloat %4038, bfloat 0xR0000, !dbg !196 + %4040 = fptrunc float %3991 to bfloat, !dbg !195 + %4041 = select i1 %2874, bfloat %4040, bfloat 0xR0000, !dbg !196 + %4042 = fptrunc float %3992 to bfloat, !dbg !195 + %4043 = select i1 %2872, bfloat %4042, bfloat 0xR0000, !dbg !196 + %4044 = fptrunc float %3993 to bfloat, !dbg !195 + %4045 = select i1 %2874, bfloat %4044, bfloat 0xR0000, !dbg !196 + %4046 = fptrunc float %3994 to bfloat, !dbg !195 + %4047 = select i1 %2876, bfloat %4046, bfloat 0xR0000, !dbg !196 + %4048 = fptrunc float %3995 to bfloat, !dbg !195 + %4049 = select i1 %2878, bfloat %4048, bfloat 0xR0000, !dbg !196 + %4050 = fptrunc float %3996 to bfloat, !dbg !195 + %4051 = select i1 %2876, bfloat %4050, bfloat 0xR0000, !dbg !196 + %4052 = fptrunc float %3997 to bfloat, !dbg !195 + %4053 = select i1 %2878, bfloat %4052, bfloat 0xR0000, !dbg !196 + %4054 = fptrunc float %3998 to bfloat, !dbg !195 + %4055 = select i1 %2880, bfloat %4054, bfloat 0xR0000, !dbg !196 + %4056 = fptrunc float %3999 to bfloat, !dbg !195 + %4057 = select i1 %2882, bfloat %4056, bfloat 0xR0000, !dbg !196 + %4058 = fptrunc float %4000 to bfloat, !dbg !195 + %4059 = select i1 %2880, bfloat %4058, bfloat 0xR0000, !dbg !196 + %4060 = fptrunc float %4001 to bfloat, !dbg !195 + %4061 = select i1 %2882, bfloat %4060, bfloat 0xR0000, !dbg !196 + %4062 = fptrunc float %4002 to bfloat, !dbg !195 + %4063 = select i1 %2884, bfloat %4062, bfloat 0xR0000, !dbg !196 + %4064 = fptrunc float %4003 to bfloat, !dbg !195 + %4065 = select i1 %2886, bfloat %4064, bfloat 0xR0000, !dbg !196 + %4066 = fptrunc float %4004 to bfloat, !dbg !195 + %4067 = select i1 %2884, bfloat %4066, bfloat 0xR0000, !dbg !196 + %4068 = fptrunc float %4005 to bfloat, !dbg !195 + %4069 = select i1 %2886, bfloat %4068, bfloat 0xR0000, !dbg !196 + %4070 = insertelement <2 x bfloat> poison, bfloat %4007, i64 0, !dbg !197 + %4071 = insertelement <2 x bfloat> %4070, bfloat %4009, i64 1, !dbg !197 + %4072 = bitcast <2 x bfloat> %4071 to i32, !dbg !197 + %4073 = insertelement <2 x bfloat> poison, bfloat %4011, i64 0, !dbg !197 + %4074 = insertelement <2 x bfloat> %4073, bfloat %4013, i64 1, !dbg !197 + %4075 = bitcast <2 x bfloat> %4074 to i32, !dbg !197 + %4076 = insertelement <2 x bfloat> poison, bfloat %4015, i64 0, !dbg !197 + %4077 = insertelement <2 x bfloat> %4076, bfloat %4017, i64 1, !dbg !197 + %4078 = bitcast <2 x bfloat> %4077 to i32, !dbg !197 + %4079 = insertelement <2 x bfloat> poison, bfloat %4019, i64 0, !dbg !197 + %4080 = insertelement <2 x bfloat> %4079, bfloat %4021, i64 1, !dbg !197 + %4081 = bitcast <2 x bfloat> %4080 to i32, !dbg !197 + %4082 = insertelement <2 x bfloat> poison, bfloat %4023, i64 0, !dbg !197 + %4083 = insertelement <2 x bfloat> %4082, bfloat %4025, i64 1, !dbg !197 + %4084 = bitcast <2 x bfloat> %4083 to i32, !dbg !197 + %4085 = insertelement <2 x bfloat> poison, bfloat %4027, i64 0, !dbg !197 + %4086 = insertelement <2 x bfloat> %4085, bfloat %4029, i64 1, !dbg !197 + %4087 = bitcast <2 x bfloat> %4086 to i32, !dbg !197 + %4088 = insertelement <2 x bfloat> poison, bfloat %4031, i64 0, !dbg !197 + %4089 = insertelement <2 x bfloat> %4088, bfloat %4033, i64 1, !dbg !197 + %4090 = bitcast <2 x bfloat> %4089 to i32, !dbg !197 + %4091 = insertelement <2 x bfloat> poison, bfloat %4035, i64 0, !dbg !197 + %4092 = insertelement <2 x bfloat> %4091, bfloat %4037, i64 1, !dbg !197 + %4093 = bitcast <2 x bfloat> %4092 to i32, !dbg !197 + %4094 = insertelement <2 x bfloat> poison, bfloat %4039, i64 0, !dbg !197 + %4095 = insertelement <2 x bfloat> %4094, bfloat %4041, i64 1, !dbg !197 + %4096 = bitcast <2 x bfloat> %4095 to i32, !dbg !197 + %4097 = insertelement <2 x bfloat> poison, bfloat %4043, i64 0, !dbg !197 + %4098 = insertelement <2 x bfloat> %4097, bfloat %4045, i64 1, !dbg !197 + %4099 = bitcast <2 x bfloat> %4098 to i32, !dbg !197 + %4100 = insertelement <2 x bfloat> poison, bfloat %4047, i64 0, !dbg !197 + %4101 = insertelement <2 x bfloat> %4100, bfloat %4049, i64 1, !dbg !197 + %4102 = bitcast <2 x bfloat> %4101 to i32, !dbg !197 + %4103 = insertelement <2 x bfloat> poison, bfloat %4051, i64 0, !dbg !197 + %4104 = insertelement <2 x bfloat> %4103, bfloat %4053, i64 1, !dbg !197 + %4105 = bitcast <2 x bfloat> %4104 to i32, !dbg !197 + %4106 = insertelement <2 x bfloat> poison, bfloat %4055, i64 0, !dbg !197 + %4107 = insertelement <2 x bfloat> %4106, bfloat %4057, i64 1, !dbg !197 + %4108 = bitcast <2 x bfloat> %4107 to i32, !dbg !197 + %4109 = insertelement <2 x bfloat> poison, bfloat %4059, i64 0, !dbg !197 + %4110 = insertelement <2 x bfloat> %4109, bfloat %4061, i64 1, !dbg !197 + %4111 = bitcast <2 x bfloat> %4110 to i32, !dbg !197 + %4112 = insertelement <2 x bfloat> poison, bfloat %4063, i64 0, !dbg !197 + %4113 = insertelement <2 x bfloat> %4112, bfloat %4065, i64 1, !dbg !197 + %4114 = bitcast <2 x bfloat> %4113 to i32, !dbg !197 + %4115 = insertelement <2 x bfloat> poison, bfloat %4067, i64 0, !dbg !197 + %4116 = insertelement <2 x bfloat> %4115, bfloat %4069, i64 1, !dbg !197 + %4117 = bitcast <2 x bfloat> %4116 to i32, !dbg !197 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !197 + %4118 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn, float %.pn2450, float %.pn2451, float %.pn2452, float %.pn2453, float %.pn2454, float %.pn2455, float %.pn2456, float %.pn2457, float %.pn2458, float %.pn2459, float %.pn2460, float %.pn2461, float %.pn2462, float %.pn2463, float %.pn2464, float %.pn2465, float %.pn2466, float %.pn2467, float %.pn2468, float %.pn2469, float %.pn2470, float %.pn2471, float %.pn2472, float %.pn2473, float %.pn2474, float %.pn2475, float %.pn2476, float %.pn2477, float %.pn2478, float %.pn2479, float %.pn2480, float %.pn2481, float %.pn2482, float %.pn2483, float %.pn2484, float %.pn2485, float %.pn2486, float %.pn2487, float %.pn2488, float %.pn2489, float %.pn2490, float %.pn2491, float %.pn2492, float %.pn2493, float %.pn2494, float %.pn2495, float %.pn2496, float %.pn2497, float %.pn2498, float %.pn2499, float %.pn2500, float %.pn2501, float %.pn2502, float %.pn2503, float %.pn2504, float %.pn2505, float %.pn2506, float %.pn2507, float %.pn2508, float %.pn2509, float %.pn2510, float %.pn2511, float %.pn2512, i32 %4072, i32 %4075, i32 %4078, i32 %4081, i64 %2901, i1 true) #3, !dbg !197 + %4119 = add i32 %2897, 2048, !dbg !197 + %4120 = lshr exact i32 %4119, 4, !dbg !197 + %4121 = and i32 %4120, 16383, !dbg !197 + %4122 = zext nneg i32 %4121 to i64, !dbg !197 + %4123 = or disjoint i64 %4122, 4611686293338849280, !dbg !197 + %4124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 0, !dbg !197 + %4125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 1, !dbg !197 + %4126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 2, !dbg !197 + %4127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 3, !dbg !197 + %4128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 4, !dbg !197 + %4129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 5, !dbg !197 + %4130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 6, !dbg !197 + %4131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 7, !dbg !197 + %4132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 8, !dbg !197 + %4133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 9, !dbg !197 + %4134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 10, !dbg !197 + %4135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 11, !dbg !197 + %4136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 12, !dbg !197 + %4137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 13, !dbg !197 + %4138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 14, !dbg !197 + %4139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 15, !dbg !197 + %4140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 16, !dbg !197 + %4141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 17, !dbg !197 + %4142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 18, !dbg !197 + %4143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 19, !dbg !197 + %4144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 20, !dbg !197 + %4145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 21, !dbg !197 + %4146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 22, !dbg !197 + %4147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 23, !dbg !197 + %4148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 24, !dbg !197 + %4149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 25, !dbg !197 + %4150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 26, !dbg !197 + %4151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 27, !dbg !197 + %4152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 28, !dbg !197 + %4153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 29, !dbg !197 + %4154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 30, !dbg !197 + %4155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 31, !dbg !197 + %4156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 32, !dbg !197 + %4157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 33, !dbg !197 + %4158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 34, !dbg !197 + %4159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 35, !dbg !197 + %4160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 36, !dbg !197 + %4161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 37, !dbg !197 + %4162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 38, !dbg !197 + %4163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 39, !dbg !197 + %4164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 40, !dbg !197 + %4165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 41, !dbg !197 + %4166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 42, !dbg !197 + %4167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 43, !dbg !197 + %4168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 44, !dbg !197 + %4169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 45, !dbg !197 + %4170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 46, !dbg !197 + %4171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 47, !dbg !197 + %4172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 48, !dbg !197 + %4173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 49, !dbg !197 + %4174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 50, !dbg !197 + %4175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 51, !dbg !197 + %4176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 52, !dbg !197 + %4177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 53, !dbg !197 + %4178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 54, !dbg !197 + %4179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 55, !dbg !197 + %4180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 56, !dbg !197 + %4181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 57, !dbg !197 + %4182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 58, !dbg !197 + %4183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 59, !dbg !197 + %4184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 60, !dbg !197 + %4185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 61, !dbg !197 + %4186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 62, !dbg !197 + %4187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4118, 63, !dbg !197 + %4188 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4124, float %4125, float %4126, float %4127, float %4128, float %4129, float %4130, float %4131, float %4132, float %4133, float %4134, float %4135, float %4136, float %4137, float %4138, float %4139, float %4140, float %4141, float %4142, float %4143, float %4144, float %4145, float %4146, float %4147, float %4148, float %4149, float %4150, float %4151, float %4152, float %4153, float %4154, float %4155, float %4156, float %4157, float %4158, float %4159, float %4160, float %4161, float %4162, float %4163, float %4164, float %4165, float %4166, float %4167, float %4168, float %4169, float %4170, float %4171, float %4172, float %4173, float %4174, float %4175, float %4176, float %4177, float %4178, float %4179, float %4180, float %4181, float %4182, float %4183, float %4184, float %4185, float %4186, float %4187, i32 %4084, i32 %4087, i32 %4090, i32 %4093, i64 %4123, i1 true) #3, !dbg !197 + %4189 = add i32 %2897, 4096, !dbg !197 + %4190 = lshr exact i32 %4189, 4, !dbg !197 + %4191 = and i32 %4190, 16383, !dbg !197 + %4192 = zext nneg i32 %4191 to i64, !dbg !197 + %4193 = or disjoint i64 %4192, 4611686293338849280, !dbg !197 + %4194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 0, !dbg !197 + %4195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 1, !dbg !197 + %4196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 2, !dbg !197 + %4197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 3, !dbg !197 + %4198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 4, !dbg !197 + %4199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 5, !dbg !197 + %4200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 6, !dbg !197 + %4201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 7, !dbg !197 + %4202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 8, !dbg !197 + %4203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 9, !dbg !197 + %4204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 10, !dbg !197 + %4205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 11, !dbg !197 + %4206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 12, !dbg !197 + %4207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 13, !dbg !197 + %4208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 14, !dbg !197 + %4209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 15, !dbg !197 + %4210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 16, !dbg !197 + %4211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 17, !dbg !197 + %4212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 18, !dbg !197 + %4213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 19, !dbg !197 + %4214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 20, !dbg !197 + %4215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 21, !dbg !197 + %4216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 22, !dbg !197 + %4217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 23, !dbg !197 + %4218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 24, !dbg !197 + %4219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 25, !dbg !197 + %4220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 26, !dbg !197 + %4221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 27, !dbg !197 + %4222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 28, !dbg !197 + %4223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 29, !dbg !197 + %4224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 30, !dbg !197 + %4225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 31, !dbg !197 + %4226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 32, !dbg !197 + %4227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 33, !dbg !197 + %4228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 34, !dbg !197 + %4229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 35, !dbg !197 + %4230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 36, !dbg !197 + %4231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 37, !dbg !197 + %4232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 38, !dbg !197 + %4233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 39, !dbg !197 + %4234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 40, !dbg !197 + %4235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 41, !dbg !197 + %4236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 42, !dbg !197 + %4237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 43, !dbg !197 + %4238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 44, !dbg !197 + %4239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 45, !dbg !197 + %4240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 46, !dbg !197 + %4241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 47, !dbg !197 + %4242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 48, !dbg !197 + %4243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 49, !dbg !197 + %4244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 50, !dbg !197 + %4245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 51, !dbg !197 + %4246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 52, !dbg !197 + %4247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 53, !dbg !197 + %4248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 54, !dbg !197 + %4249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 55, !dbg !197 + %4250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 56, !dbg !197 + %4251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 57, !dbg !197 + %4252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 58, !dbg !197 + %4253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 59, !dbg !197 + %4254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 60, !dbg !197 + %4255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 61, !dbg !197 + %4256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 62, !dbg !197 + %4257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4188, 63, !dbg !197 + %4258 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4194, float %4195, float %4196, float %4197, float %4198, float %4199, float %4200, float %4201, float %4202, float %4203, float %4204, float %4205, float %4206, float %4207, float %4208, float %4209, float %4210, float %4211, float %4212, float %4213, float %4214, float %4215, float %4216, float %4217, float %4218, float %4219, float %4220, float %4221, float %4222, float %4223, float %4224, float %4225, float %4226, float %4227, float %4228, float %4229, float %4230, float %4231, float %4232, float %4233, float %4234, float %4235, float %4236, float %4237, float %4238, float %4239, float %4240, float %4241, float %4242, float %4243, float %4244, float %4245, float %4246, float %4247, float %4248, float %4249, float %4250, float %4251, float %4252, float %4253, float %4254, float %4255, float %4256, float %4257, i32 %4096, i32 %4099, i32 %4102, i32 %4105, i64 %4193, i1 true) #3, !dbg !197 + %4259 = add i32 %2897, 6144, !dbg !197 + %4260 = lshr exact i32 %4259, 4, !dbg !197 + %4261 = and i32 %4260, 16383, !dbg !197 + %4262 = zext nneg i32 %4261 to i64, !dbg !197 + %4263 = or disjoint i64 %4262, 4611686293338849280, !dbg !197 + %4264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 0, !dbg !197 + %4265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 1, !dbg !197 + %4266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 2, !dbg !197 + %4267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 3, !dbg !197 + %4268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 4, !dbg !197 + %4269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 5, !dbg !197 + %4270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 6, !dbg !197 + %4271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 7, !dbg !197 + %4272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 8, !dbg !197 + %4273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 9, !dbg !197 + %4274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 10, !dbg !197 + %4275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 11, !dbg !197 + %4276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 12, !dbg !197 + %4277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 13, !dbg !197 + %4278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 14, !dbg !197 + %4279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 15, !dbg !197 + %4280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 16, !dbg !197 + %4281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 17, !dbg !197 + %4282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 18, !dbg !197 + %4283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 19, !dbg !197 + %4284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 20, !dbg !197 + %4285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 21, !dbg !197 + %4286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 22, !dbg !197 + %4287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 23, !dbg !197 + %4288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 24, !dbg !197 + %4289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 25, !dbg !197 + %4290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 26, !dbg !197 + %4291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 27, !dbg !197 + %4292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 28, !dbg !197 + %4293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 29, !dbg !197 + %4294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 30, !dbg !197 + %4295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 31, !dbg !197 + %4296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 32, !dbg !197 + %4297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 33, !dbg !197 + %4298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 34, !dbg !197 + %4299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 35, !dbg !197 + %4300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 36, !dbg !197 + %4301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 37, !dbg !197 + %4302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 38, !dbg !197 + %4303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 39, !dbg !197 + %4304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 40, !dbg !197 + %4305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 41, !dbg !197 + %4306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 42, !dbg !197 + %4307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 43, !dbg !197 + %4308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 44, !dbg !197 + %4309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 45, !dbg !197 + %4310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 46, !dbg !197 + %4311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 47, !dbg !197 + %4312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 48, !dbg !197 + %4313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 49, !dbg !197 + %4314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 50, !dbg !197 + %4315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 51, !dbg !197 + %4316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 52, !dbg !197 + %4317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 53, !dbg !197 + %4318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 54, !dbg !197 + %4319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 55, !dbg !197 + %4320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 56, !dbg !197 + %4321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 57, !dbg !197 + %4322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 58, !dbg !197 + %4323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 59, !dbg !197 + %4324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 60, !dbg !197 + %4325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 61, !dbg !197 + %4326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 62, !dbg !197 + %4327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4258, 63, !dbg !197 + %4328 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4264, float %4265, float %4266, float %4267, float %4268, float %4269, float %4270, float %4271, float %4272, float %4273, float %4274, float %4275, float %4276, float %4277, float %4278, float %4279, float %4280, float %4281, float %4282, float %4283, float %4284, float %4285, float %4286, float %4287, float %4288, float %4289, float %4290, float %4291, float %4292, float %4293, float %4294, float %4295, float %4296, float %4297, float %4298, float %4299, float %4300, float %4301, float %4302, float %4303, float %4304, float %4305, float %4306, float %4307, float %4308, float %4309, float %4310, float %4311, float %4312, float %4313, float %4314, float %4315, float %4316, float %4317, float %4318, float %4319, float %4320, float %4321, float %4322, float %4323, float %4324, float %4325, float %4326, float %4327, i32 %4108, i32 %4111, i32 %4114, i32 %4117, i64 %4263, i1 true) #3, !dbg !197 + %4329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 0, !dbg !197 + %4330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 1, !dbg !197 + %4331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 2, !dbg !197 + %4332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 3, !dbg !197 + %4333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 4, !dbg !197 + %4334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 5, !dbg !197 + %4335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 6, !dbg !197 + %4336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 7, !dbg !197 + %4337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 8, !dbg !197 + %4338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 9, !dbg !197 + %4339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 10, !dbg !197 + %4340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 11, !dbg !197 + %4341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 12, !dbg !197 + %4342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 13, !dbg !197 + %4343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 14, !dbg !197 + %4344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 15, !dbg !197 + %4345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 16, !dbg !197 + %4346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 17, !dbg !197 + %4347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 18, !dbg !197 + %4348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 19, !dbg !197 + %4349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 20, !dbg !197 + %4350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 21, !dbg !197 + %4351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 22, !dbg !197 + %4352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 23, !dbg !197 + %4353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 24, !dbg !197 + %4354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 25, !dbg !197 + %4355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 26, !dbg !197 + %4356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 27, !dbg !197 + %4357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 28, !dbg !197 + %4358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 29, !dbg !197 + %4359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 30, !dbg !197 + %4360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 31, !dbg !197 + %4361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 32, !dbg !197 + %4362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 33, !dbg !197 + %4363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 34, !dbg !197 + %4364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 35, !dbg !197 + %4365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 36, !dbg !197 + %4366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 37, !dbg !197 + %4367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 38, !dbg !197 + %4368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 39, !dbg !197 + %4369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 40, !dbg !197 + %4370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 41, !dbg !197 + %4371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 42, !dbg !197 + %4372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 43, !dbg !197 + %4373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 44, !dbg !197 + %4374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 45, !dbg !197 + %4375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 46, !dbg !197 + %4376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 47, !dbg !197 + %4377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 48, !dbg !197 + %4378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 49, !dbg !197 + %4379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 50, !dbg !197 + %4380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 51, !dbg !197 + %4381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 52, !dbg !197 + %4382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 53, !dbg !197 + %4383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 54, !dbg !197 + %4384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 55, !dbg !197 + %4385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 56, !dbg !197 + %4386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 57, !dbg !197 + %4387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 58, !dbg !197 + %4388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 59, !dbg !197 + %4389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 60, !dbg !197 + %4390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 61, !dbg !197 + %4391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 62, !dbg !197 + %4392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4328, 63, !dbg !197 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !197 + %4393 = insertelement <16 x i32> poison, i32 %2845, i64 0, !dbg !185 + %4394 = shufflevector <16 x i32> %4393, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !185 + %4395 = add <16 x i32> %4394, %2849, !dbg !185 + %4396 = add nuw nsw i32 %2848, 1, !dbg !180 + %4397 = lshr i32 %4396, 1, !dbg !198 + %4398 = zext nneg i32 %4397 to i64, !dbg !199 + %4399 = getelementptr i32, ptr addrspace(1) %2631, i64 %4398, !dbg !199 + %4400 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !200 + %4401 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4399, i64 %4400, i1 %2851) #3, !dbg !200 + %4402 = add nuw nsw i32 %4397, 1, !dbg !201 + %4403 = icmp slt i32 %4402, %2635, !dbg !202 + %4404 = getelementptr i8, ptr addrspace(1) %4399, i64 4, !dbg !203 + %4405 = and i1 %2851, %4403, !dbg !180 + %4406 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !204 + %4407 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4404, i64 %4406, i1 %4405) #3, !dbg !204 + %4408 = and i32 %2848, 1, !dbg !205 + %4409 = sub i32 %4407, %4401, !dbg !206 + %4410 = shl i32 %4409, 7, !dbg !207 + %4411 = add i32 %4410, -64, !dbg !208 + %4412 = xor i32 %4408, 1, !dbg !209 + %4413 = mul nuw nsw i32 %4411, %4412, !dbg !209 + %4414 = shl nuw nsw i32 %4408, 6, !dbg !210 + %4415 = add i32 %4413, %4414, !dbg !211 + %4416 = shl i32 %4415, 7, !dbg !212 + %4417 = sext i32 %4416 to i64, !dbg !183 + %4418 = getelementptr bfloat, ptr addrspace(1) %.pn10421563, i64 %4417, !dbg !183 + %4419 = getelementptr bfloat, ptr addrspace(1) %.pn10261564, i64 %4417, !dbg !183 + %4420 = getelementptr bfloat, ptr addrspace(1) %.pn10101565, i64 %4417, !dbg !183 + %4421 = getelementptr bfloat, ptr addrspace(1) %.pn9941566, i64 %4417, !dbg !183 + %4422 = getelementptr bfloat, ptr addrspace(1) %.pn11141571, i64 %4417, !dbg !184 + %4423 = getelementptr bfloat, ptr addrspace(1) %.pn10981572, i64 %4417, !dbg !184 + %4424 = getelementptr bfloat, ptr addrspace(1) %.pn10821573, i64 %4417, !dbg !184 + %4425 = getelementptr bfloat, ptr addrspace(1) %.pn10661574, i64 %4417, !dbg !184 + %4426 = add i32 %4415, %.pn10501567, !dbg !185 + %4427 = add i32 %4415, %.pn10481568, !dbg !185 + %4428 = add i32 %4415, %.pn10461569, !dbg !185 + %4429 = add i32 %4415, %.pn10441570, !dbg !185 + %4430 = add i32 %2847, 1, !dbg !180 + %4431 = icmp sgt i32 %4430, 2, !dbg !180 + %4432 = select i1 %4431, i32 0, i32 %4430, !dbg !180 + %4433 = icmp slt i32 %4426, %19, !dbg !181 + %4434 = icmp slt i32 %4427, %19, !dbg !181 + %4435 = icmp slt i32 %4428, %19, !dbg !181 + %4436 = icmp slt i32 %4429, %19, !dbg !181 + %4437 = shl i32 %4432, 13, !dbg !182 + %4438 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %4437, !dbg !182 + %4439 = and i1 %2850, %4433, !dbg !180 + %4440 = and i1 %2850, %4434, !dbg !180 + %4441 = and i1 %2850, %4435, !dbg !180 + %4442 = and i1 %2850, %4436, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !182 + %4443 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %448, !dbg !182 + %4444 = select i1 %4439, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4443, ptr addrspace(1) %4418, i32 %4444) #3, !dbg !182 + %4445 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %451, !dbg !182 + %4446 = select i1 %4440, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4445, ptr addrspace(1) %4419, i32 %4446) #3, !dbg !182 + %4447 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %454, !dbg !182 + %4448 = select i1 %4441, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4447, ptr addrspace(1) %4420, i32 %4448) #3, !dbg !182 + %4449 = getelementptr inbounds nuw i8, ptr addrspace(3) %4438, i32 %457, !dbg !182 + %4450 = select i1 %4442, i32 16, i32 0, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4449, ptr addrspace(1) %4421, i32 %4450) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %4451 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4437, !dbg !182 + %4452 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %448, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4452, ptr addrspace(1) %4422, i32 %4444) #3, !dbg !182 + %4453 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %451, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4453, ptr addrspace(1) %4423, i32 %4446) #3, !dbg !182 + %4454 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %454, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4454, ptr addrspace(1) %4424, i32 %4448) #3, !dbg !182 + %4455 = getelementptr inbounds nuw i8, ptr addrspace(3) %4451, i32 %457, !dbg !182 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4455, ptr addrspace(1) %4425, i32 %4450) #3, !dbg !182 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !182 + %exitcond2184.not = icmp eq i32 %4396, %smax2183, !dbg !180 + br i1 %exitcond2184.not, label %._crit_edge1593, label %2844, !dbg !180 + +._crit_edge1593: ; preds = %__nv_exp2f.exit1431, %._crit_edge + %4456 = phi float [ %2704, %._crit_edge ], [ %4329, %__nv_exp2f.exit1431 ], !dbg !99 + %4457 = phi float [ %2705, %._crit_edge ], [ %4330, %__nv_exp2f.exit1431 ], !dbg !99 + %4458 = phi float [ %2706, %._crit_edge ], [ %4331, %__nv_exp2f.exit1431 ], !dbg !99 + %4459 = phi float [ %2707, %._crit_edge ], [ %4332, %__nv_exp2f.exit1431 ], !dbg !99 + %4460 = phi float [ %2708, %._crit_edge ], [ %4333, %__nv_exp2f.exit1431 ], !dbg !99 + %4461 = phi float [ %2709, %._crit_edge ], [ %4334, %__nv_exp2f.exit1431 ], !dbg !99 + %4462 = phi float [ %2710, %._crit_edge ], [ %4335, %__nv_exp2f.exit1431 ], !dbg !99 + %4463 = phi float [ %2711, %._crit_edge ], [ %4336, %__nv_exp2f.exit1431 ], !dbg !99 + %4464 = phi float [ %2712, %._crit_edge ], [ %4337, %__nv_exp2f.exit1431 ], !dbg !99 + %4465 = phi float [ %2713, %._crit_edge ], [ %4338, %__nv_exp2f.exit1431 ], !dbg !99 + %4466 = phi float [ %2714, %._crit_edge ], [ %4339, %__nv_exp2f.exit1431 ], !dbg !99 + %4467 = phi float [ %2715, %._crit_edge ], [ %4340, %__nv_exp2f.exit1431 ], !dbg !99 + %4468 = phi float [ %2716, %._crit_edge ], [ %4341, %__nv_exp2f.exit1431 ], !dbg !99 + %4469 = phi float [ %2717, %._crit_edge ], [ %4342, %__nv_exp2f.exit1431 ], !dbg !99 + %4470 = phi float [ %2718, %._crit_edge ], [ %4343, %__nv_exp2f.exit1431 ], !dbg !99 + %4471 = phi float [ %2719, %._crit_edge ], [ %4344, %__nv_exp2f.exit1431 ], !dbg !99 + %4472 = phi float [ %2720, %._crit_edge ], [ %4345, %__nv_exp2f.exit1431 ], !dbg !99 + %4473 = phi float [ %2721, %._crit_edge ], [ %4346, %__nv_exp2f.exit1431 ], !dbg !99 + %4474 = phi float [ %2722, %._crit_edge ], [ %4347, %__nv_exp2f.exit1431 ], !dbg !99 + %4475 = phi float [ %2723, %._crit_edge ], [ %4348, %__nv_exp2f.exit1431 ], !dbg !99 + %4476 = phi float [ %2724, %._crit_edge ], [ %4349, %__nv_exp2f.exit1431 ], !dbg !99 + %4477 = phi float [ %2725, %._crit_edge ], [ %4350, %__nv_exp2f.exit1431 ], !dbg !99 + %4478 = phi float [ %2726, %._crit_edge ], [ %4351, %__nv_exp2f.exit1431 ], !dbg !99 + %4479 = phi float [ %2727, %._crit_edge ], [ %4352, %__nv_exp2f.exit1431 ], !dbg !99 + %4480 = phi float [ %2728, %._crit_edge ], [ %4353, %__nv_exp2f.exit1431 ], !dbg !99 + %4481 = phi float [ %2729, %._crit_edge ], [ %4354, %__nv_exp2f.exit1431 ], !dbg !99 + %4482 = phi float [ %2730, %._crit_edge ], [ %4355, %__nv_exp2f.exit1431 ], !dbg !99 + %4483 = phi float [ %2731, %._crit_edge ], [ %4356, %__nv_exp2f.exit1431 ], !dbg !99 + %4484 = phi float [ %2732, %._crit_edge ], [ %4357, %__nv_exp2f.exit1431 ], !dbg !99 + %4485 = phi float [ %2733, %._crit_edge ], [ %4358, %__nv_exp2f.exit1431 ], !dbg !99 + %4486 = phi float [ %2734, %._crit_edge ], [ %4359, %__nv_exp2f.exit1431 ], !dbg !99 + %4487 = phi float [ %2735, %._crit_edge ], [ %4360, %__nv_exp2f.exit1431 ], !dbg !99 + %4488 = phi float [ %2736, %._crit_edge ], [ %4361, %__nv_exp2f.exit1431 ], !dbg !99 + %4489 = phi float [ %2737, %._crit_edge ], [ %4362, %__nv_exp2f.exit1431 ], !dbg !99 + %4490 = phi float [ %2738, %._crit_edge ], [ %4363, %__nv_exp2f.exit1431 ], !dbg !99 + %4491 = phi float [ %2739, %._crit_edge ], [ %4364, %__nv_exp2f.exit1431 ], !dbg !99 + %4492 = phi float [ %2740, %._crit_edge ], [ %4365, %__nv_exp2f.exit1431 ], !dbg !99 + %4493 = phi float [ %2741, %._crit_edge ], [ %4366, %__nv_exp2f.exit1431 ], !dbg !99 + %4494 = phi float [ %2742, %._crit_edge ], [ %4367, %__nv_exp2f.exit1431 ], !dbg !99 + %4495 = phi float [ %2743, %._crit_edge ], [ %4368, %__nv_exp2f.exit1431 ], !dbg !99 + %4496 = phi float [ %2744, %._crit_edge ], [ %4369, %__nv_exp2f.exit1431 ], !dbg !99 + %4497 = phi float [ %2745, %._crit_edge ], [ %4370, %__nv_exp2f.exit1431 ], !dbg !99 + %4498 = phi float [ %2746, %._crit_edge ], [ %4371, %__nv_exp2f.exit1431 ], !dbg !99 + %4499 = phi float [ %2747, %._crit_edge ], [ %4372, %__nv_exp2f.exit1431 ], !dbg !99 + %4500 = phi float [ %2748, %._crit_edge ], [ %4373, %__nv_exp2f.exit1431 ], !dbg !99 + %4501 = phi float [ %2749, %._crit_edge ], [ %4374, %__nv_exp2f.exit1431 ], !dbg !99 + %4502 = phi float [ %2750, %._crit_edge ], [ %4375, %__nv_exp2f.exit1431 ], !dbg !99 + %4503 = phi float [ %2751, %._crit_edge ], [ %4376, %__nv_exp2f.exit1431 ], !dbg !99 + %4504 = phi float [ %2752, %._crit_edge ], [ %4377, %__nv_exp2f.exit1431 ], !dbg !99 + %4505 = phi float [ %2753, %._crit_edge ], [ %4378, %__nv_exp2f.exit1431 ], !dbg !99 + %4506 = phi float [ %2754, %._crit_edge ], [ %4379, %__nv_exp2f.exit1431 ], !dbg !99 + %4507 = phi float [ %2755, %._crit_edge ], [ %4380, %__nv_exp2f.exit1431 ], !dbg !99 + %4508 = phi float [ %2756, %._crit_edge ], [ %4381, %__nv_exp2f.exit1431 ], !dbg !99 + %4509 = phi float [ %2757, %._crit_edge ], [ %4382, %__nv_exp2f.exit1431 ], !dbg !99 + %4510 = phi float [ %2758, %._crit_edge ], [ %4383, %__nv_exp2f.exit1431 ], !dbg !99 + %4511 = phi float [ %2759, %._crit_edge ], [ %4384, %__nv_exp2f.exit1431 ], !dbg !99 + %4512 = phi float [ %2760, %._crit_edge ], [ %4385, %__nv_exp2f.exit1431 ], !dbg !99 + %4513 = phi float [ %2761, %._crit_edge ], [ %4386, %__nv_exp2f.exit1431 ], !dbg !99 + %4514 = phi float [ %2762, %._crit_edge ], [ %4387, %__nv_exp2f.exit1431 ], !dbg !99 + %4515 = phi float [ %2763, %._crit_edge ], [ %4388, %__nv_exp2f.exit1431 ], !dbg !99 + %4516 = phi float [ %2764, %._crit_edge ], [ %4389, %__nv_exp2f.exit1431 ], !dbg !99 + %4517 = phi float [ %2765, %._crit_edge ], [ %4390, %__nv_exp2f.exit1431 ], !dbg !99 + %4518 = phi float [ %2766, %._crit_edge ], [ %4391, %__nv_exp2f.exit1431 ], !dbg !99 + %4519 = phi float [ %2767, %._crit_edge ], [ %4392, %__nv_exp2f.exit1431 ], !dbg !99 + %4520 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %4456, float %4457, float %4458, float %4459, float %4460, float %4461, float %4462, float %4463, float %4464, float %4465, float %4466, float %4467, float %4468, float %4469, float %4470, float %4471, float %4472, float %4473, float %4474, float %4475, float %4476, float %4477, float %4478, float %4479, float %4480, float %4481, float %4482, float %4483, float %4484, float %4485, float %4486, float %4487, float %4488, float %4489, float %4490, float %4491, float %4492, float %4493, float %4494, float %4495, float %4496, float %4497, float %4498, float %4499, float %4500, float %4501, float %4502, float %4503, float %4504, float %4505, float %4506, float %4507, float %4508, float %4509, float %4510, float %4511, float %4512, float %4513, float %4514, float %4515, float %4516, float %4517, float %4518, float %4519) #3, !dbg !180 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !180 + %4521 = getelementptr bfloat, ptr addrspace(1) %98, i64 %120, !dbg !213 + %4522 = getelementptr bfloat, ptr addrspace(1) %98, i64 %122, !dbg !213 + %4523 = getelementptr bfloat, ptr addrspace(1) %98, i64 %124, !dbg !213 + %4524 = getelementptr bfloat, ptr addrspace(1) %98, i64 %126, !dbg !213 + %4525 = getelementptr bfloat, ptr addrspace(1) %98, i64 %128, !dbg !213 + %4526 = getelementptr bfloat, ptr addrspace(1) %98, i64 %130, !dbg !213 + %4527 = getelementptr bfloat, ptr addrspace(1) %98, i64 %132, !dbg !213 + %4528 = getelementptr bfloat, ptr addrspace(1) %98, i64 %134, !dbg !213 + %4529 = getelementptr bfloat, ptr addrspace(1) %4521, i64 %138, !dbg !214 + %4530 = getelementptr bfloat, ptr addrspace(1) %4522, i64 %138, !dbg !214 + %4531 = getelementptr bfloat, ptr addrspace(1) %4523, i64 %138, !dbg !214 + %4532 = getelementptr bfloat, ptr addrspace(1) %4524, i64 %138, !dbg !214 + %4533 = getelementptr bfloat, ptr addrspace(1) %4525, i64 %138, !dbg !214 + %4534 = getelementptr bfloat, ptr addrspace(1) %4526, i64 %138, !dbg !214 + %4535 = getelementptr bfloat, ptr addrspace(1) %4527, i64 %138, !dbg !214 + %4536 = getelementptr bfloat, ptr addrspace(1) %4528, i64 %138, !dbg !214 + %4537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 0, !dbg !215 + %4538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 1, !dbg !215 + %4539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 2, !dbg !215 + %4540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 3, !dbg !215 + %4541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 4, !dbg !215 + %4542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 5, !dbg !215 + %4543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 6, !dbg !215 + %4544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 7, !dbg !215 + %4545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 8, !dbg !215 + %4546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 9, !dbg !215 + %4547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 10, !dbg !215 + %4548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 11, !dbg !215 + %4549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 12, !dbg !215 + %4550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 13, !dbg !215 + %4551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 14, !dbg !215 + %4552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 15, !dbg !215 + %4553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 16, !dbg !215 + %4554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 17, !dbg !215 + %4555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 18, !dbg !215 + %4556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 19, !dbg !215 + %4557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 20, !dbg !215 + %4558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 21, !dbg !215 + %4559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 22, !dbg !215 + %4560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 23, !dbg !215 + %4561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 24, !dbg !215 + %4562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 25, !dbg !215 + %4563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 26, !dbg !215 + %4564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 27, !dbg !215 + %4565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 28, !dbg !215 + %4566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 29, !dbg !215 + %4567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 30, !dbg !215 + %4568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 31, !dbg !215 + %4569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 32, !dbg !215 + %4570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 33, !dbg !215 + %4571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 34, !dbg !215 + %4572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 35, !dbg !215 + %4573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 36, !dbg !215 + %4574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 37, !dbg !215 + %4575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 38, !dbg !215 + %4576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 39, !dbg !215 + %4577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 40, !dbg !215 + %4578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 41, !dbg !215 + %4579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 42, !dbg !215 + %4580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 43, !dbg !215 + %4581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 44, !dbg !215 + %4582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 45, !dbg !215 + %4583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 46, !dbg !215 + %4584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 47, !dbg !215 + %4585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 48, !dbg !215 + %4586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 49, !dbg !215 + %4587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 50, !dbg !215 + %4588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 51, !dbg !215 + %4589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 52, !dbg !215 + %4590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 53, !dbg !215 + %4591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 54, !dbg !215 + %4592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 55, !dbg !215 + %4593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 56, !dbg !215 + %4594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 57, !dbg !215 + %4595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 58, !dbg !215 + %4596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 59, !dbg !215 + %4597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 60, !dbg !215 + %4598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 61, !dbg !215 + %4599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 62, !dbg !215 + %4600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4520, 63, !dbg !215 + %4601 = insertelement <2 x float> poison, float %4537, i64 0, !dbg !215 + %4602 = insertelement <2 x float> %4601, float %4538, i64 1, !dbg !215 + %4603 = fmul <2 x float> %4602, splat (float 0x3FB6A09E60000000), !dbg !215 + %4604 = fptrunc <2 x float> %4603 to <2 x bfloat>, !dbg !216 + %4605 = insertelement <2 x float> poison, float %4539, i64 0, !dbg !215 + %4606 = insertelement <2 x float> %4605, float %4540, i64 1, !dbg !215 + %4607 = fmul <2 x float> %4606, splat (float 0x3FB6A09E60000000), !dbg !215 + %4608 = fptrunc <2 x float> %4607 to <2 x bfloat>, !dbg !216 + %4609 = insertelement <2 x float> poison, float %4541, i64 0, !dbg !215 + %4610 = insertelement <2 x float> %4609, float %4542, i64 1, !dbg !215 + %4611 = fmul <2 x float> %4610, splat (float 0x3FB6A09E60000000), !dbg !215 + %4612 = fptrunc <2 x float> %4611 to <2 x bfloat>, !dbg !216 + %4613 = insertelement <2 x float> poison, float %4543, i64 0, !dbg !215 + %4614 = insertelement <2 x float> %4613, float %4544, i64 1, !dbg !215 + %4615 = fmul <2 x float> %4614, splat (float 0x3FB6A09E60000000), !dbg !215 + %4616 = fptrunc <2 x float> %4615 to <2 x bfloat>, !dbg !216 + %4617 = insertelement <2 x float> poison, float %4545, i64 0, !dbg !215 + %4618 = insertelement <2 x float> %4617, float %4546, i64 1, !dbg !215 + %4619 = fmul <2 x float> %4618, splat (float 0x3FB6A09E60000000), !dbg !215 + %4620 = fptrunc <2 x float> %4619 to <2 x bfloat>, !dbg !216 + %4621 = insertelement <2 x float> poison, float %4547, i64 0, !dbg !215 + %4622 = insertelement <2 x float> %4621, float %4548, i64 1, !dbg !215 + %4623 = fmul <2 x float> %4622, splat (float 0x3FB6A09E60000000), !dbg !215 + %4624 = fptrunc <2 x float> %4623 to <2 x bfloat>, !dbg !216 + %4625 = insertelement <2 x float> poison, float %4549, i64 0, !dbg !215 + %4626 = insertelement <2 x float> %4625, float %4550, i64 1, !dbg !215 + %4627 = fmul <2 x float> %4626, splat (float 0x3FB6A09E60000000), !dbg !215 + %4628 = fptrunc <2 x float> %4627 to <2 x bfloat>, !dbg !216 + %4629 = insertelement <2 x float> poison, float %4551, i64 0, !dbg !215 + %4630 = insertelement <2 x float> %4629, float %4552, i64 1, !dbg !215 + %4631 = fmul <2 x float> %4630, splat (float 0x3FB6A09E60000000), !dbg !215 + %4632 = fptrunc <2 x float> %4631 to <2 x bfloat>, !dbg !216 + %4633 = insertelement <2 x float> poison, float %4553, i64 0, !dbg !215 + %4634 = insertelement <2 x float> %4633, float %4554, i64 1, !dbg !215 + %4635 = fmul <2 x float> %4634, splat (float 0x3FB6A09E60000000), !dbg !215 + %4636 = fptrunc <2 x float> %4635 to <2 x bfloat>, !dbg !216 + %4637 = insertelement <2 x float> poison, float %4555, i64 0, !dbg !215 + %4638 = insertelement <2 x float> %4637, float %4556, i64 1, !dbg !215 + %4639 = fmul <2 x float> %4638, splat (float 0x3FB6A09E60000000), !dbg !215 + %4640 = fptrunc <2 x float> %4639 to <2 x bfloat>, !dbg !216 + %4641 = insertelement <2 x float> poison, float %4557, i64 0, !dbg !215 + %4642 = insertelement <2 x float> %4641, float %4558, i64 1, !dbg !215 + %4643 = fmul <2 x float> %4642, splat (float 0x3FB6A09E60000000), !dbg !215 + %4644 = fptrunc <2 x float> %4643 to <2 x bfloat>, !dbg !216 + %4645 = insertelement <2 x float> poison, float %4559, i64 0, !dbg !215 + %4646 = insertelement <2 x float> %4645, float %4560, i64 1, !dbg !215 + %4647 = fmul <2 x float> %4646, splat (float 0x3FB6A09E60000000), !dbg !215 + %4648 = fptrunc <2 x float> %4647 to <2 x bfloat>, !dbg !216 + %4649 = insertelement <2 x float> poison, float %4561, i64 0, !dbg !215 + %4650 = insertelement <2 x float> %4649, float %4562, i64 1, !dbg !215 + %4651 = fmul <2 x float> %4650, splat (float 0x3FB6A09E60000000), !dbg !215 + %4652 = fptrunc <2 x float> %4651 to <2 x bfloat>, !dbg !216 + %4653 = insertelement <2 x float> poison, float %4563, i64 0, !dbg !215 + %4654 = insertelement <2 x float> %4653, float %4564, i64 1, !dbg !215 + %4655 = fmul <2 x float> %4654, splat (float 0x3FB6A09E60000000), !dbg !215 + %4656 = fptrunc <2 x float> %4655 to <2 x bfloat>, !dbg !216 + %4657 = insertelement <2 x float> poison, float %4565, i64 0, !dbg !215 + %4658 = insertelement <2 x float> %4657, float %4566, i64 1, !dbg !215 + %4659 = fmul <2 x float> %4658, splat (float 0x3FB6A09E60000000), !dbg !215 + %4660 = fptrunc <2 x float> %4659 to <2 x bfloat>, !dbg !216 + %4661 = insertelement <2 x float> poison, float %4567, i64 0, !dbg !215 + %4662 = insertelement <2 x float> %4661, float %4568, i64 1, !dbg !215 + %4663 = fmul <2 x float> %4662, splat (float 0x3FB6A09E60000000), !dbg !215 + %4664 = fptrunc <2 x float> %4663 to <2 x bfloat>, !dbg !216 + %4665 = insertelement <2 x float> poison, float %4569, i64 0, !dbg !215 + %4666 = insertelement <2 x float> %4665, float %4570, i64 1, !dbg !215 + %4667 = fmul <2 x float> %4666, splat (float 0x3FB6A09E60000000), !dbg !215 + %4668 = fptrunc <2 x float> %4667 to <2 x bfloat>, !dbg !216 + %4669 = insertelement <2 x float> poison, float %4571, i64 0, !dbg !215 + %4670 = insertelement <2 x float> %4669, float %4572, i64 1, !dbg !215 + %4671 = fmul <2 x float> %4670, splat (float 0x3FB6A09E60000000), !dbg !215 + %4672 = fptrunc <2 x float> %4671 to <2 x bfloat>, !dbg !216 + %4673 = insertelement <2 x float> poison, float %4573, i64 0, !dbg !215 + %4674 = insertelement <2 x float> %4673, float %4574, i64 1, !dbg !215 + %4675 = fmul <2 x float> %4674, splat (float 0x3FB6A09E60000000), !dbg !215 + %4676 = fptrunc <2 x float> %4675 to <2 x bfloat>, !dbg !216 + %4677 = insertelement <2 x float> poison, float %4575, i64 0, !dbg !215 + %4678 = insertelement <2 x float> %4677, float %4576, i64 1, !dbg !215 + %4679 = fmul <2 x float> %4678, splat (float 0x3FB6A09E60000000), !dbg !215 + %4680 = fptrunc <2 x float> %4679 to <2 x bfloat>, !dbg !216 + %4681 = insertelement <2 x float> poison, float %4577, i64 0, !dbg !215 + %4682 = insertelement <2 x float> %4681, float %4578, i64 1, !dbg !215 + %4683 = fmul <2 x float> %4682, splat (float 0x3FB6A09E60000000), !dbg !215 + %4684 = fptrunc <2 x float> %4683 to <2 x bfloat>, !dbg !216 + %4685 = insertelement <2 x float> poison, float %4579, i64 0, !dbg !215 + %4686 = insertelement <2 x float> %4685, float %4580, i64 1, !dbg !215 + %4687 = fmul <2 x float> %4686, splat (float 0x3FB6A09E60000000), !dbg !215 + %4688 = fptrunc <2 x float> %4687 to <2 x bfloat>, !dbg !216 + %4689 = insertelement <2 x float> poison, float %4581, i64 0, !dbg !215 + %4690 = insertelement <2 x float> %4689, float %4582, i64 1, !dbg !215 + %4691 = fmul <2 x float> %4690, splat (float 0x3FB6A09E60000000), !dbg !215 + %4692 = fptrunc <2 x float> %4691 to <2 x bfloat>, !dbg !216 + %4693 = insertelement <2 x float> poison, float %4583, i64 0, !dbg !215 + %4694 = insertelement <2 x float> %4693, float %4584, i64 1, !dbg !215 + %4695 = fmul <2 x float> %4694, splat (float 0x3FB6A09E60000000), !dbg !215 + %4696 = fptrunc <2 x float> %4695 to <2 x bfloat>, !dbg !216 + %4697 = insertelement <2 x float> poison, float %4585, i64 0, !dbg !215 + %4698 = insertelement <2 x float> %4697, float %4586, i64 1, !dbg !215 + %4699 = fmul <2 x float> %4698, splat (float 0x3FB6A09E60000000), !dbg !215 + %4700 = fptrunc <2 x float> %4699 to <2 x bfloat>, !dbg !216 + %4701 = insertelement <2 x float> poison, float %4587, i64 0, !dbg !215 + %4702 = insertelement <2 x float> %4701, float %4588, i64 1, !dbg !215 + %4703 = fmul <2 x float> %4702, splat (float 0x3FB6A09E60000000), !dbg !215 + %4704 = fptrunc <2 x float> %4703 to <2 x bfloat>, !dbg !216 + %4705 = insertelement <2 x float> poison, float %4589, i64 0, !dbg !215 + %4706 = insertelement <2 x float> %4705, float %4590, i64 1, !dbg !215 + %4707 = fmul <2 x float> %4706, splat (float 0x3FB6A09E60000000), !dbg !215 + %4708 = fptrunc <2 x float> %4707 to <2 x bfloat>, !dbg !216 + %4709 = insertelement <2 x float> poison, float %4591, i64 0, !dbg !215 + %4710 = insertelement <2 x float> %4709, float %4592, i64 1, !dbg !215 + %4711 = fmul <2 x float> %4710, splat (float 0x3FB6A09E60000000), !dbg !215 + %4712 = fptrunc <2 x float> %4711 to <2 x bfloat>, !dbg !216 + %4713 = insertelement <2 x float> poison, float %4593, i64 0, !dbg !215 + %4714 = insertelement <2 x float> %4713, float %4594, i64 1, !dbg !215 + %4715 = fmul <2 x float> %4714, splat (float 0x3FB6A09E60000000), !dbg !215 + %4716 = fptrunc <2 x float> %4715 to <2 x bfloat>, !dbg !216 + %4717 = insertelement <2 x float> poison, float %4595, i64 0, !dbg !215 + %4718 = insertelement <2 x float> %4717, float %4596, i64 1, !dbg !215 + %4719 = fmul <2 x float> %4718, splat (float 0x3FB6A09E60000000), !dbg !215 + %4720 = fptrunc <2 x float> %4719 to <2 x bfloat>, !dbg !216 + %4721 = insertelement <2 x float> poison, float %4597, i64 0, !dbg !215 + %4722 = insertelement <2 x float> %4721, float %4598, i64 1, !dbg !215 + %4723 = fmul <2 x float> %4722, splat (float 0x3FB6A09E60000000), !dbg !215 + %4724 = fptrunc <2 x float> %4723 to <2 x bfloat>, !dbg !216 + %4725 = insertelement <2 x float> poison, float %4599, i64 0, !dbg !215 + %4726 = insertelement <2 x float> %4725, float %4600, i64 1, !dbg !215 + %4727 = fmul <2 x float> %4726, splat (float 0x3FB6A09E60000000), !dbg !215 + %4728 = fptrunc <2 x float> %4727 to <2 x bfloat>, !dbg !216 + %4729 = shl nuw nsw i32 %390, 13, !dbg !216 + %4730 = shl nuw nsw i32 %56, 5, !dbg !216 + %4731 = and i32 %4730, 7264, !dbg !216 + %4732 = and i32 %56, 24, !dbg !216 + %4733 = shl nuw nsw i32 %4732, 4, !dbg !216 + %4734 = shl nuw nsw i32 %56, 2, !dbg !216 + %4735 = and i32 %4734, 16, !dbg !216 + %4736 = or disjoint i32 %4729, %4735, !dbg !216 + %4737 = or disjoint i32 %4731, %4733, !dbg !216 + %4738 = or disjoint i32 %4736, %4737, !dbg !216 + %4739 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4738, !dbg !216 + %4740 = bitcast <2 x bfloat> %4604 to i32, !dbg !216 + %4741 = bitcast <2 x bfloat> %4612 to i32, !dbg !216 + %4742 = bitcast <2 x bfloat> %4620 to i32, !dbg !216 + %4743 = bitcast <2 x bfloat> %4628 to i32, !dbg !216 + %4744 = insertelement <4 x i32> poison, i32 %4740, i64 0, !dbg !216 + %4745 = insertelement <4 x i32> %4744, i32 %4741, i64 1, !dbg !216 + %4746 = insertelement <4 x i32> %4745, i32 %4742, i64 2, !dbg !216 + %4747 = insertelement <4 x i32> %4746, i32 %4743, i64 3, !dbg !216 + store <4 x i32> %4747, ptr addrspace(3) %4739, align 16, !dbg !216 + %4748 = getelementptr inbounds nuw i8, ptr addrspace(3) %4739, i32 512, !dbg !216 + %4749 = bitcast <2 x bfloat> %4608 to i32, !dbg !216 + %4750 = bitcast <2 x bfloat> %4616 to i32, !dbg !216 + %4751 = bitcast <2 x bfloat> %4624 to i32, !dbg !216 + %4752 = bitcast <2 x bfloat> %4632 to i32, !dbg !216 + %4753 = insertelement <4 x i32> poison, i32 %4749, i64 0, !dbg !216 + %4754 = insertelement <4 x i32> %4753, i32 %4750, i64 1, !dbg !216 + %4755 = insertelement <4 x i32> %4754, i32 %4751, i64 2, !dbg !216 + %4756 = insertelement <4 x i32> %4755, i32 %4752, i64 3, !dbg !216 + store <4 x i32> %4756, ptr addrspace(3) %4748, align 16, !dbg !216 + %4757 = xor i32 %4738, 32, !dbg !216 + %4758 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4757, !dbg !216 + %4759 = bitcast <2 x bfloat> %4636 to i32, !dbg !216 + %4760 = bitcast <2 x bfloat> %4644 to i32, !dbg !216 + %4761 = bitcast <2 x bfloat> %4652 to i32, !dbg !216 + %4762 = bitcast <2 x bfloat> %4660 to i32, !dbg !216 + %4763 = insertelement <4 x i32> poison, i32 %4759, i64 0, !dbg !216 + %4764 = insertelement <4 x i32> %4763, i32 %4760, i64 1, !dbg !216 + %4765 = insertelement <4 x i32> %4764, i32 %4761, i64 2, !dbg !216 + %4766 = insertelement <4 x i32> %4765, i32 %4762, i64 3, !dbg !216 + store <4 x i32> %4766, ptr addrspace(3) %4758, align 16, !dbg !216 + %4767 = getelementptr inbounds nuw i8, ptr addrspace(3) %4758, i32 512, !dbg !216 + %4768 = bitcast <2 x bfloat> %4640 to i32, !dbg !216 + %4769 = bitcast <2 x bfloat> %4648 to i32, !dbg !216 + %4770 = bitcast <2 x bfloat> %4656 to i32, !dbg !216 + %4771 = bitcast <2 x bfloat> %4664 to i32, !dbg !216 + %4772 = insertelement <4 x i32> poison, i32 %4768, i64 0, !dbg !216 + %4773 = insertelement <4 x i32> %4772, i32 %4769, i64 1, !dbg !216 + %4774 = insertelement <4 x i32> %4773, i32 %4770, i64 2, !dbg !216 + %4775 = insertelement <4 x i32> %4774, i32 %4771, i64 3, !dbg !216 + store <4 x i32> %4775, ptr addrspace(3) %4767, align 16, !dbg !216 + %4776 = xor i32 %4738, 64, !dbg !216 + %4777 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4776, !dbg !216 + %4778 = bitcast <2 x bfloat> %4668 to i32, !dbg !216 + %4779 = bitcast <2 x bfloat> %4676 to i32, !dbg !216 + %4780 = bitcast <2 x bfloat> %4684 to i32, !dbg !216 + %4781 = bitcast <2 x bfloat> %4692 to i32, !dbg !216 + %4782 = insertelement <4 x i32> poison, i32 %4778, i64 0, !dbg !216 + %4783 = insertelement <4 x i32> %4782, i32 %4779, i64 1, !dbg !216 + %4784 = insertelement <4 x i32> %4783, i32 %4780, i64 2, !dbg !216 + %4785 = insertelement <4 x i32> %4784, i32 %4781, i64 3, !dbg !216 + store <4 x i32> %4785, ptr addrspace(3) %4777, align 16, !dbg !216 + %4786 = getelementptr inbounds nuw i8, ptr addrspace(3) %4777, i32 512, !dbg !216 + %4787 = bitcast <2 x bfloat> %4672 to i32, !dbg !216 + %4788 = bitcast <2 x bfloat> %4680 to i32, !dbg !216 + %4789 = bitcast <2 x bfloat> %4688 to i32, !dbg !216 + %4790 = bitcast <2 x bfloat> %4696 to i32, !dbg !216 + %4791 = insertelement <4 x i32> poison, i32 %4787, i64 0, !dbg !216 + %4792 = insertelement <4 x i32> %4791, i32 %4788, i64 1, !dbg !216 + %4793 = insertelement <4 x i32> %4792, i32 %4789, i64 2, !dbg !216 + %4794 = insertelement <4 x i32> %4793, i32 %4790, i64 3, !dbg !216 + store <4 x i32> %4794, ptr addrspace(3) %4786, align 16, !dbg !216 + %4795 = xor i32 %4738, 96, !dbg !216 + %4796 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4795, !dbg !216 + %4797 = bitcast <2 x bfloat> %4700 to i32, !dbg !216 + %4798 = bitcast <2 x bfloat> %4708 to i32, !dbg !216 + %4799 = bitcast <2 x bfloat> %4716 to i32, !dbg !216 + %4800 = bitcast <2 x bfloat> %4724 to i32, !dbg !216 + %4801 = insertelement <4 x i32> poison, i32 %4797, i64 0, !dbg !216 + %4802 = insertelement <4 x i32> %4801, i32 %4798, i64 1, !dbg !216 + %4803 = insertelement <4 x i32> %4802, i32 %4799, i64 2, !dbg !216 + %4804 = insertelement <4 x i32> %4803, i32 %4800, i64 3, !dbg !216 + store <4 x i32> %4804, ptr addrspace(3) %4796, align 16, !dbg !216 + %4805 = getelementptr inbounds nuw i8, ptr addrspace(3) %4796, i32 512, !dbg !216 + %4806 = bitcast <2 x bfloat> %4704 to i32, !dbg !216 + %4807 = bitcast <2 x bfloat> %4712 to i32, !dbg !216 + %4808 = bitcast <2 x bfloat> %4720 to i32, !dbg !216 + %4809 = bitcast <2 x bfloat> %4728 to i32, !dbg !216 + %4810 = insertelement <4 x i32> poison, i32 %4806, i64 0, !dbg !216 + %4811 = insertelement <4 x i32> %4810, i32 %4807, i64 1, !dbg !216 + %4812 = insertelement <4 x i32> %4811, i32 %4808, i64 2, !dbg !216 + %4813 = insertelement <4 x i32> %4812, i32 %4809, i64 3, !dbg !216 + store <4 x i32> %4813, ptr addrspace(3) %4805, align 16, !dbg !216 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !216 + %4814 = shl nuw nsw i32 %4732, 10, !dbg !216 + %4815 = shl nuw nsw i32 %390, 5, !dbg !216 + %4816 = and i32 %4734, 1008, !dbg !216 + %4817 = or disjoint i32 %4814, %4815, !dbg !216 + %4818 = xor i32 %4817, %4816, !dbg !216 + %4819 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4818, !dbg !216 + %4820 = ptrtoint ptr addrspace(3) %4819 to i32, !dbg !216 + %4821 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4820) #3, !dbg !216 + %4822 = extractvalue { i32, i32, i32, i32 } %4821, 0, !dbg !216 + %4823 = extractvalue { i32, i32, i32, i32 } %4821, 1, !dbg !216 + %4824 = extractvalue { i32, i32, i32, i32 } %4821, 2, !dbg !216 + %4825 = extractvalue { i32, i32, i32, i32 } %4821, 3, !dbg !216 + %4826 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 1024, !dbg !216 + %4827 = ptrtoint ptr addrspace(3) %4826 to i32, !dbg !216 + %4828 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4827) #3, !dbg !216 + %4829 = extractvalue { i32, i32, i32, i32 } %4828, 0, !dbg !216 + %4830 = extractvalue { i32, i32, i32, i32 } %4828, 1, !dbg !216 + %4831 = extractvalue { i32, i32, i32, i32 } %4828, 2, !dbg !216 + %4832 = extractvalue { i32, i32, i32, i32 } %4828, 3, !dbg !216 + %4833 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 2048, !dbg !216 + %4834 = ptrtoint ptr addrspace(3) %4833 to i32, !dbg !216 + %4835 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4834) #3, !dbg !216 + %4836 = extractvalue { i32, i32, i32, i32 } %4835, 0, !dbg !216 + %4837 = extractvalue { i32, i32, i32, i32 } %4835, 1, !dbg !216 + %4838 = extractvalue { i32, i32, i32, i32 } %4835, 2, !dbg !216 + %4839 = extractvalue { i32, i32, i32, i32 } %4835, 3, !dbg !216 + %4840 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 3072, !dbg !216 + %4841 = ptrtoint ptr addrspace(3) %4840 to i32, !dbg !216 + %4842 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4841) #3, !dbg !216 + %4843 = extractvalue { i32, i32, i32, i32 } %4842, 0, !dbg !216 + %4844 = extractvalue { i32, i32, i32, i32 } %4842, 1, !dbg !216 + %4845 = extractvalue { i32, i32, i32, i32 } %4842, 2, !dbg !216 + %4846 = extractvalue { i32, i32, i32, i32 } %4842, 3, !dbg !216 + %4847 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 4096, !dbg !216 + %4848 = ptrtoint ptr addrspace(3) %4847 to i32, !dbg !216 + %4849 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4848) #3, !dbg !216 + %4850 = extractvalue { i32, i32, i32, i32 } %4849, 0, !dbg !216 + %4851 = extractvalue { i32, i32, i32, i32 } %4849, 1, !dbg !216 + %4852 = extractvalue { i32, i32, i32, i32 } %4849, 2, !dbg !216 + %4853 = extractvalue { i32, i32, i32, i32 } %4849, 3, !dbg !216 + %4854 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 5120, !dbg !216 + %4855 = ptrtoint ptr addrspace(3) %4854 to i32, !dbg !216 + %4856 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4855) #3, !dbg !216 + %4857 = extractvalue { i32, i32, i32, i32 } %4856, 0, !dbg !216 + %4858 = extractvalue { i32, i32, i32, i32 } %4856, 1, !dbg !216 + %4859 = extractvalue { i32, i32, i32, i32 } %4856, 2, !dbg !216 + %4860 = extractvalue { i32, i32, i32, i32 } %4856, 3, !dbg !216 + %4861 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 6144, !dbg !216 + %4862 = ptrtoint ptr addrspace(3) %4861 to i32, !dbg !216 + %4863 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4862) #3, !dbg !216 + %4864 = extractvalue { i32, i32, i32, i32 } %4863, 0, !dbg !216 + %4865 = extractvalue { i32, i32, i32, i32 } %4863, 1, !dbg !216 + %4866 = extractvalue { i32, i32, i32, i32 } %4863, 2, !dbg !216 + %4867 = extractvalue { i32, i32, i32, i32 } %4863, 3, !dbg !216 + %4868 = getelementptr inbounds nuw i8, ptr addrspace(3) %4819, i32 7168, !dbg !216 + %4869 = ptrtoint ptr addrspace(3) %4868 to i32, !dbg !216 + %4870 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4869) #3, !dbg !216 + %4871 = extractvalue { i32, i32, i32, i32 } %4870, 0, !dbg !216 + %4872 = extractvalue { i32, i32, i32, i32 } %4870, 1, !dbg !216 + %4873 = extractvalue { i32, i32, i32, i32 } %4870, 2, !dbg !216 + %4874 = extractvalue { i32, i32, i32, i32 } %4870, 3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4822, i32 %4823, i32 %4824, i32 %4825, ptr addrspace(1) %4529, i1 %147) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4829, i32 %4830, i32 %4831, i32 %4832, ptr addrspace(1) %4530, i1 %148) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4836, i32 %4837, i32 %4838, i32 %4839, ptr addrspace(1) %4531, i1 %149) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4843, i32 %4844, i32 %4845, i32 %4846, ptr addrspace(1) %4532, i1 %150) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4850, i32 %4851, i32 %4852, i32 %4853, ptr addrspace(1) %4533, i1 %151) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4857, i32 %4858, i32 %4859, i32 %4860, ptr addrspace(1) %4534, i1 %152) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4864, i32 %4865, i32 %4866, i32 %4867, ptr addrspace(1) %4535, i1 %153) #3, !dbg !216 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4871, i32 %4872, i32 %4873, i32 %4874, ptr addrspace(1) %4536, i1 %154) #3, !dbg !216 + br label %11899, !dbg !39 + +4875: ; preds = %29 + %4876 = shl nuw nsw i32 %40, 7, !dbg !217 + %4877 = or disjoint i32 %59, %4876, !dbg !218 + %4878 = or disjoint i32 %60, %4876, !dbg !218 + %4879 = or disjoint i32 %61, %4876, !dbg !218 + %4880 = or disjoint i32 %62, %4876, !dbg !218 + %4881 = or disjoint i32 %63, %4876, !dbg !218 + %4882 = or disjoint i32 %64, %4876, !dbg !218 + %4883 = or disjoint i32 %65, %4876, !dbg !218 + %4884 = or disjoint i32 %66, %4876, !dbg !218 + %4885 = shl i32 %4877, 7, !dbg !219 + %4886 = shl i32 %4878, 7, !dbg !219 + %4887 = shl i32 %4879, 7, !dbg !219 + %4888 = shl i32 %4880, 7, !dbg !219 + %4889 = shl i32 %4881, 7, !dbg !219 + %4890 = shl i32 %4882, 7, !dbg !219 + %4891 = shl i32 %4883, 7, !dbg !219 + %4892 = shl i32 %4884, 7, !dbg !219 + %4893 = sext i32 %4885 to i64, !dbg !221 + %4894 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4893, !dbg !221 + %4895 = sext i32 %4886 to i64, !dbg !221 + %4896 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4895, !dbg !221 + %4897 = sext i32 %4887 to i64, !dbg !221 + %4898 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4897, !dbg !221 + %4899 = sext i32 %4888 to i64, !dbg !221 + %4900 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4899, !dbg !221 + %4901 = sext i32 %4889 to i64, !dbg !221 + %4902 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4901, !dbg !221 + %4903 = sext i32 %4890 to i64, !dbg !221 + %4904 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4903, !dbg !221 + %4905 = sext i32 %4891 to i64, !dbg !221 + %4906 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4905, !dbg !221 + %4907 = sext i32 %4892 to i64, !dbg !221 + %4908 = getelementptr bfloat, ptr addrspace(1) %53, i64 %4907, !dbg !221 + %4909 = shl nuw nsw i32 %56, 3, !dbg !222 + %4910 = and i32 %4909, 120, !dbg !222 + %4911 = zext nneg i32 %4910 to i64, !dbg !223 + %4912 = getelementptr bfloat, ptr addrspace(1) %4894, i64 %4911, !dbg !223 + %4913 = getelementptr bfloat, ptr addrspace(1) %4896, i64 %4911, !dbg !223 + %4914 = getelementptr bfloat, ptr addrspace(1) %4898, i64 %4911, !dbg !223 + %4915 = getelementptr bfloat, ptr addrspace(1) %4900, i64 %4911, !dbg !223 + %4916 = getelementptr bfloat, ptr addrspace(1) %4902, i64 %4911, !dbg !223 + %4917 = getelementptr bfloat, ptr addrspace(1) %4904, i64 %4911, !dbg !223 + %4918 = getelementptr bfloat, ptr addrspace(1) %4906, i64 %4911, !dbg !223 + %4919 = getelementptr bfloat, ptr addrspace(1) %4908, i64 %4911, !dbg !223 + %4920 = icmp slt i32 %4877, %19, !dbg !224 + %4921 = icmp slt i32 %4878, %19, !dbg !224 + %4922 = icmp slt i32 %4879, %19, !dbg !224 + %4923 = icmp slt i32 %4880, %19, !dbg !224 + %4924 = icmp slt i32 %4881, %19, !dbg !224 + %4925 = icmp slt i32 %4882, %19, !dbg !224 + %4926 = icmp slt i32 %4883, %19, !dbg !224 + %4927 = icmp slt i32 %4884, %19, !dbg !224 + %4928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4912, i1 %4920) #3, !dbg !225 + %4929 = extractvalue { i32, i32, i32, i32 } %4928, 0, !dbg !225 + %4930 = extractvalue { i32, i32, i32, i32 } %4928, 1, !dbg !225 + %4931 = extractvalue { i32, i32, i32, i32 } %4928, 2, !dbg !225 + %4932 = extractvalue { i32, i32, i32, i32 } %4928, 3, !dbg !225 + %4933 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4913, i1 %4921) #3, !dbg !225 + %4934 = extractvalue { i32, i32, i32, i32 } %4933, 0, !dbg !225 + %4935 = extractvalue { i32, i32, i32, i32 } %4933, 1, !dbg !225 + %4936 = extractvalue { i32, i32, i32, i32 } %4933, 2, !dbg !225 + %4937 = extractvalue { i32, i32, i32, i32 } %4933, 3, !dbg !225 + %4938 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4914, i1 %4922) #3, !dbg !225 + %4939 = extractvalue { i32, i32, i32, i32 } %4938, 0, !dbg !225 + %4940 = extractvalue { i32, i32, i32, i32 } %4938, 1, !dbg !225 + %4941 = extractvalue { i32, i32, i32, i32 } %4938, 2, !dbg !225 + %4942 = extractvalue { i32, i32, i32, i32 } %4938, 3, !dbg !225 + %4943 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4915, i1 %4923) #3, !dbg !225 + %4944 = extractvalue { i32, i32, i32, i32 } %4943, 0, !dbg !225 + %4945 = extractvalue { i32, i32, i32, i32 } %4943, 1, !dbg !225 + %4946 = extractvalue { i32, i32, i32, i32 } %4943, 2, !dbg !225 + %4947 = extractvalue { i32, i32, i32, i32 } %4943, 3, !dbg !225 + %4948 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4916, i1 %4924) #3, !dbg !225 + %4949 = extractvalue { i32, i32, i32, i32 } %4948, 0, !dbg !225 + %4950 = extractvalue { i32, i32, i32, i32 } %4948, 1, !dbg !225 + %4951 = extractvalue { i32, i32, i32, i32 } %4948, 2, !dbg !225 + %4952 = extractvalue { i32, i32, i32, i32 } %4948, 3, !dbg !225 + %4953 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4917, i1 %4925) #3, !dbg !225 + %4954 = extractvalue { i32, i32, i32, i32 } %4953, 0, !dbg !225 + %4955 = extractvalue { i32, i32, i32, i32 } %4953, 1, !dbg !225 + %4956 = extractvalue { i32, i32, i32, i32 } %4953, 2, !dbg !225 + %4957 = extractvalue { i32, i32, i32, i32 } %4953, 3, !dbg !225 + %4958 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4918, i1 %4926) #3, !dbg !225 + %4959 = extractvalue { i32, i32, i32, i32 } %4958, 0, !dbg !225 + %4960 = extractvalue { i32, i32, i32, i32 } %4958, 1, !dbg !225 + %4961 = extractvalue { i32, i32, i32, i32 } %4958, 2, !dbg !225 + %4962 = extractvalue { i32, i32, i32, i32 } %4958, 3, !dbg !225 + %4963 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4919, i1 %4927) #3, !dbg !225 + %4964 = extractvalue { i32, i32, i32, i32 } %4963, 0, !dbg !225 + %4965 = extractvalue { i32, i32, i32, i32 } %4963, 1, !dbg !225 + %4966 = extractvalue { i32, i32, i32, i32 } %4963, 2, !dbg !225 + %4967 = extractvalue { i32, i32, i32, i32 } %4963, 3, !dbg !225 + %4968 = shl nuw nsw i32 %56, 4, !dbg !225 + %4969 = and i32 %4968, 112, !dbg !225 + %4970 = shl nuw nsw i32 %58, 3, !dbg !225 + %4971 = and i32 %56, 112, !dbg !225 + %4972 = and i32 %56, 8, !dbg !225 + %4973 = shl nuw nsw i32 %4972, 11, !dbg !225 + %4974 = or disjoint i32 %4969, %4970, !dbg !225 + %4975 = xor i32 %4974, %4971, !dbg !225 + %4976 = or disjoint i32 %4975, %4973, !dbg !225 + %4977 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4976, !dbg !225 + %4978 = insertelement <4 x i32> poison, i32 %4929, i64 0, !dbg !225 + %4979 = insertelement <4 x i32> %4978, i32 %4930, i64 1, !dbg !225 + %4980 = insertelement <4 x i32> %4979, i32 %4931, i64 2, !dbg !225 + %4981 = insertelement <4 x i32> %4980, i32 %4932, i64 3, !dbg !225 + store <4 x i32> %4981, ptr addrspace(3) %4977, align 16, !dbg !225 + %4982 = or disjoint i32 %4976, 2048, !dbg !225 + %4983 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4982, !dbg !225 + %4984 = insertelement <4 x i32> poison, i32 %4934, i64 0, !dbg !225 + %4985 = insertelement <4 x i32> %4984, i32 %4935, i64 1, !dbg !225 + %4986 = insertelement <4 x i32> %4985, i32 %4936, i64 2, !dbg !225 + %4987 = insertelement <4 x i32> %4986, i32 %4937, i64 3, !dbg !225 + store <4 x i32> %4987, ptr addrspace(3) %4983, align 16, !dbg !225 + %4988 = or disjoint i32 %4976, 4096, !dbg !225 + %4989 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4988, !dbg !225 + %4990 = insertelement <4 x i32> poison, i32 %4939, i64 0, !dbg !225 + %4991 = insertelement <4 x i32> %4990, i32 %4940, i64 1, !dbg !225 + %4992 = insertelement <4 x i32> %4991, i32 %4941, i64 2, !dbg !225 + %4993 = insertelement <4 x i32> %4992, i32 %4942, i64 3, !dbg !225 + store <4 x i32> %4993, ptr addrspace(3) %4989, align 16, !dbg !225 + %4994 = or disjoint i32 %4976, 6144, !dbg !225 + %4995 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4994, !dbg !225 + %4996 = insertelement <4 x i32> poison, i32 %4944, i64 0, !dbg !225 + %4997 = insertelement <4 x i32> %4996, i32 %4945, i64 1, !dbg !225 + %4998 = insertelement <4 x i32> %4997, i32 %4946, i64 2, !dbg !225 + %4999 = insertelement <4 x i32> %4998, i32 %4947, i64 3, !dbg !225 + store <4 x i32> %4999, ptr addrspace(3) %4995, align 16, !dbg !225 + %5000 = or disjoint i32 %4976, 8192, !dbg !225 + %5001 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5000, !dbg !225 + %5002 = insertelement <4 x i32> poison, i32 %4949, i64 0, !dbg !225 + %5003 = insertelement <4 x i32> %5002, i32 %4950, i64 1, !dbg !225 + %5004 = insertelement <4 x i32> %5003, i32 %4951, i64 2, !dbg !225 + %5005 = insertelement <4 x i32> %5004, i32 %4952, i64 3, !dbg !225 + store <4 x i32> %5005, ptr addrspace(3) %5001, align 16, !dbg !225 + %5006 = or disjoint i32 %4976, 10240, !dbg !225 + %5007 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5006, !dbg !225 + %5008 = insertelement <4 x i32> poison, i32 %4954, i64 0, !dbg !225 + %5009 = insertelement <4 x i32> %5008, i32 %4955, i64 1, !dbg !225 + %5010 = insertelement <4 x i32> %5009, i32 %4956, i64 2, !dbg !225 + %5011 = insertelement <4 x i32> %5010, i32 %4957, i64 3, !dbg !225 + store <4 x i32> %5011, ptr addrspace(3) %5007, align 16, !dbg !225 + %5012 = or disjoint i32 %4976, 12288, !dbg !225 + %5013 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5012, !dbg !225 + %5014 = insertelement <4 x i32> poison, i32 %4959, i64 0, !dbg !225 + %5015 = insertelement <4 x i32> %5014, i32 %4960, i64 1, !dbg !225 + %5016 = insertelement <4 x i32> %5015, i32 %4961, i64 2, !dbg !225 + %5017 = insertelement <4 x i32> %5016, i32 %4962, i64 3, !dbg !225 + store <4 x i32> %5017, ptr addrspace(3) %5013, align 16, !dbg !225 + %5018 = or disjoint i32 %4976, 14336, !dbg !225 + %5019 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %5018, !dbg !225 + %5020 = insertelement <4 x i32> poison, i32 %4964, i64 0, !dbg !225 + %5021 = insertelement <4 x i32> %5020, i32 %4965, i64 1, !dbg !225 + %5022 = insertelement <4 x i32> %5021, i32 %4966, i64 2, !dbg !225 + %5023 = insertelement <4 x i32> %5022, i32 %4967, i64 3, !dbg !225 + store <4 x i32> %5023, ptr addrspace(3) %5019, align 16, !dbg !225 + %5024 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4893, !dbg !226 + %5025 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4895, !dbg !226 + %5026 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4897, !dbg !226 + %5027 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4899, !dbg !226 + %5028 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4901, !dbg !226 + %5029 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4903, !dbg !226 + %5030 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4905, !dbg !226 + %5031 = getelementptr bfloat, ptr addrspace(1) %54, i64 %4907, !dbg !226 + %5032 = getelementptr bfloat, ptr addrspace(1) %5024, i64 %4911, !dbg !228 + %5033 = getelementptr bfloat, ptr addrspace(1) %5025, i64 %4911, !dbg !228 + %5034 = getelementptr bfloat, ptr addrspace(1) %5026, i64 %4911, !dbg !228 + %5035 = getelementptr bfloat, ptr addrspace(1) %5027, i64 %4911, !dbg !228 + %5036 = getelementptr bfloat, ptr addrspace(1) %5028, i64 %4911, !dbg !228 + %5037 = getelementptr bfloat, ptr addrspace(1) %5029, i64 %4911, !dbg !228 + %5038 = getelementptr bfloat, ptr addrspace(1) %5030, i64 %4911, !dbg !228 + %5039 = getelementptr bfloat, ptr addrspace(1) %5031, i64 %4911, !dbg !228 + %5040 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5032, i1 %4920) #3, !dbg !229 + %5041 = extractvalue { i32, i32, i32, i32 } %5040, 0, !dbg !229 + %5042 = extractvalue { i32, i32, i32, i32 } %5040, 1, !dbg !229 + %5043 = extractvalue { i32, i32, i32, i32 } %5040, 2, !dbg !229 + %5044 = extractvalue { i32, i32, i32, i32 } %5040, 3, !dbg !229 + %5045 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5033, i1 %4921) #3, !dbg !229 + %5046 = extractvalue { i32, i32, i32, i32 } %5045, 0, !dbg !229 + %5047 = extractvalue { i32, i32, i32, i32 } %5045, 1, !dbg !229 + %5048 = extractvalue { i32, i32, i32, i32 } %5045, 2, !dbg !229 + %5049 = extractvalue { i32, i32, i32, i32 } %5045, 3, !dbg !229 + %5050 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5034, i1 %4922) #3, !dbg !229 + %5051 = extractvalue { i32, i32, i32, i32 } %5050, 0, !dbg !229 + %5052 = extractvalue { i32, i32, i32, i32 } %5050, 1, !dbg !229 + %5053 = extractvalue { i32, i32, i32, i32 } %5050, 2, !dbg !229 + %5054 = extractvalue { i32, i32, i32, i32 } %5050, 3, !dbg !229 + %5055 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5035, i1 %4923) #3, !dbg !229 + %5056 = extractvalue { i32, i32, i32, i32 } %5055, 0, !dbg !229 + %5057 = extractvalue { i32, i32, i32, i32 } %5055, 1, !dbg !229 + %5058 = extractvalue { i32, i32, i32, i32 } %5055, 2, !dbg !229 + %5059 = extractvalue { i32, i32, i32, i32 } %5055, 3, !dbg !229 + %5060 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5036, i1 %4924) #3, !dbg !229 + %5061 = extractvalue { i32, i32, i32, i32 } %5060, 0, !dbg !229 + %5062 = extractvalue { i32, i32, i32, i32 } %5060, 1, !dbg !229 + %5063 = extractvalue { i32, i32, i32, i32 } %5060, 2, !dbg !229 + %5064 = extractvalue { i32, i32, i32, i32 } %5060, 3, !dbg !229 + %5065 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5037, i1 %4925) #3, !dbg !229 + %5066 = extractvalue { i32, i32, i32, i32 } %5065, 0, !dbg !229 + %5067 = extractvalue { i32, i32, i32, i32 } %5065, 1, !dbg !229 + %5068 = extractvalue { i32, i32, i32, i32 } %5065, 2, !dbg !229 + %5069 = extractvalue { i32, i32, i32, i32 } %5065, 3, !dbg !229 + %5070 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5038, i1 %4926) #3, !dbg !229 + %5071 = extractvalue { i32, i32, i32, i32 } %5070, 0, !dbg !229 + %5072 = extractvalue { i32, i32, i32, i32 } %5070, 1, !dbg !229 + %5073 = extractvalue { i32, i32, i32, i32 } %5070, 2, !dbg !229 + %5074 = extractvalue { i32, i32, i32, i32 } %5070, 3, !dbg !229 + %5075 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %5039, i1 %4927) #3, !dbg !229 + %5076 = extractvalue { i32, i32, i32, i32 } %5075, 0, !dbg !229 + %5077 = extractvalue { i32, i32, i32, i32 } %5075, 1, !dbg !229 + %5078 = extractvalue { i32, i32, i32, i32 } %5075, 2, !dbg !229 + %5079 = extractvalue { i32, i32, i32, i32 } %5075, 3, !dbg !229 + %5080 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4976, !dbg !229 + %5081 = insertelement <4 x i32> poison, i32 %5041, i64 0, !dbg !229 + %5082 = insertelement <4 x i32> %5081, i32 %5042, i64 1, !dbg !229 + %5083 = insertelement <4 x i32> %5082, i32 %5043, i64 2, !dbg !229 + %5084 = insertelement <4 x i32> %5083, i32 %5044, i64 3, !dbg !229 + store <4 x i32> %5084, ptr addrspace(3) %5080, align 16, !dbg !229 + %5085 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4982, !dbg !229 + %5086 = insertelement <4 x i32> poison, i32 %5046, i64 0, !dbg !229 + %5087 = insertelement <4 x i32> %5086, i32 %5047, i64 1, !dbg !229 + %5088 = insertelement <4 x i32> %5087, i32 %5048, i64 2, !dbg !229 + %5089 = insertelement <4 x i32> %5088, i32 %5049, i64 3, !dbg !229 + store <4 x i32> %5089, ptr addrspace(3) %5085, align 16, !dbg !229 + %5090 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4988, !dbg !229 + %5091 = insertelement <4 x i32> poison, i32 %5051, i64 0, !dbg !229 + %5092 = insertelement <4 x i32> %5091, i32 %5052, i64 1, !dbg !229 + %5093 = insertelement <4 x i32> %5092, i32 %5053, i64 2, !dbg !229 + %5094 = insertelement <4 x i32> %5093, i32 %5054, i64 3, !dbg !229 + store <4 x i32> %5094, ptr addrspace(3) %5090, align 16, !dbg !229 + %5095 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4994, !dbg !229 + %5096 = insertelement <4 x i32> poison, i32 %5056, i64 0, !dbg !229 + %5097 = insertelement <4 x i32> %5096, i32 %5057, i64 1, !dbg !229 + %5098 = insertelement <4 x i32> %5097, i32 %5058, i64 2, !dbg !229 + %5099 = insertelement <4 x i32> %5098, i32 %5059, i64 3, !dbg !229 + store <4 x i32> %5099, ptr addrspace(3) %5095, align 16, !dbg !229 + %5100 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5000, !dbg !229 + %5101 = insertelement <4 x i32> poison, i32 %5061, i64 0, !dbg !229 + %5102 = insertelement <4 x i32> %5101, i32 %5062, i64 1, !dbg !229 + %5103 = insertelement <4 x i32> %5102, i32 %5063, i64 2, !dbg !229 + %5104 = insertelement <4 x i32> %5103, i32 %5064, i64 3, !dbg !229 + store <4 x i32> %5104, ptr addrspace(3) %5100, align 16, !dbg !229 + %5105 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5006, !dbg !229 + %5106 = insertelement <4 x i32> poison, i32 %5066, i64 0, !dbg !229 + %5107 = insertelement <4 x i32> %5106, i32 %5067, i64 1, !dbg !229 + %5108 = insertelement <4 x i32> %5107, i32 %5068, i64 2, !dbg !229 + %5109 = insertelement <4 x i32> %5108, i32 %5069, i64 3, !dbg !229 + store <4 x i32> %5109, ptr addrspace(3) %5105, align 16, !dbg !229 + %5110 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5012, !dbg !229 + %5111 = insertelement <4 x i32> poison, i32 %5071, i64 0, !dbg !229 + %5112 = insertelement <4 x i32> %5111, i32 %5072, i64 1, !dbg !229 + %5113 = insertelement <4 x i32> %5112, i32 %5073, i64 2, !dbg !229 + %5114 = insertelement <4 x i32> %5113, i32 %5074, i64 3, !dbg !229 + store <4 x i32> %5114, ptr addrspace(3) %5110, align 16, !dbg !229 + %5115 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %5018, !dbg !229 + %5116 = insertelement <4 x i32> poison, i32 %5076, i64 0, !dbg !229 + %5117 = insertelement <4 x i32> %5116, i32 %5077, i64 1, !dbg !229 + %5118 = insertelement <4 x i32> %5117, i32 %5078, i64 2, !dbg !229 + %5119 = insertelement <4 x i32> %5118, i32 %5079, i64 3, !dbg !229 + store <4 x i32> %5119, ptr addrspace(3) %5115, align 16, !dbg !229 + %5120 = shl nuw nsw i32 %44, 2, !dbg !230 + %5121 = mul i32 %30, %43, !dbg !231 + %5122 = mul i32 %38, %43, !dbg !232 + %5123 = shl nuw nsw i32 %43, 5, !dbg !233 + %5124 = mul nuw nsw i32 %23, %45, !dbg !234 + %5125 = add i32 %5124, %40, !dbg !235 + %5126 = mul nuw nsw i32 %25, %45, !dbg !236 + %reass.add1528 = add i32 %5126, %40, !dbg !237 + %reass.mul1529 = mul i32 %reass.add1528, %24, !dbg !237 + %5127 = sext i32 %reass.mul1529 to i64, !dbg !238 + %5128 = getelementptr i32, ptr addrspace(1) %11, i64 %5127, !dbg !238 + %5129 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5128, i1 true) #3, !dbg !239 + %5130 = shl i32 %5129, 7, !dbg !240 + %5131 = sext i32 %5125 to i64, !dbg !241 + %5132 = getelementptr i32, ptr addrspace(1) %10, i64 %5131, !dbg !241 + %5133 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5132, i1 true) #3, !dbg !242 + %5134 = and i32 %56, 3, !dbg !243 + %5135 = shl nuw nsw i32 %5134, 1, !dbg !243 + %5136 = or disjoint i32 %5135, 1, !dbg !243 + %5137 = or disjoint i32 %5135, 8, !dbg !243 + %5138 = or disjoint i32 %5135, 9, !dbg !243 + %5139 = or disjoint i32 %5130, %5137, !dbg !244 + %5140 = or disjoint i32 %5130, %5138, !dbg !244 + %5141 = insertelement <4 x i32> poison, i32 %5135, i64 0, !dbg !243 + %5142 = shufflevector <4 x i32> %5141, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !243 + %5143 = or disjoint <4 x i32> %5142, , !dbg !243 + %5144 = insertelement <4 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5145 = shufflevector <4 x i32> %5144, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !244 + %5146 = or disjoint <4 x i32> %5145, %5143, !dbg !244 + %5147 = or disjoint i32 %5130, %59, !dbg !244 + %5148 = or disjoint i32 %5130, %60, !dbg !244 + %5149 = or disjoint i32 %5130, %61, !dbg !244 + %5150 = or disjoint i32 %5130, %62, !dbg !244 + %5151 = shl i32 %5147, 12, !dbg !245 + %5152 = shl i32 %5148, 12, !dbg !245 + %5153 = shl i32 %5149, 12, !dbg !245 + %5154 = shl i32 %5150, 12, !dbg !245 + %5155 = shl i32 %5147, 7, !dbg !247 + %5156 = shl i32 %5148, 7, !dbg !247 + %5157 = shl i32 %5149, 7, !dbg !247 + %5158 = shl i32 %5150, 7, !dbg !247 + %5159 = shl i32 %5133, 1, !dbg !248 + %5160 = add i32 %18, 63, !dbg !249 + %5161 = sdiv i32 %5160, 64, !dbg !250 + %5162 = tail call i32 @llvm.smax.i32(i32 %5161, i32 1), !dbg !251 + %5163 = tail call i32 @llvm.smin.i32(i32 %5159, i32 %5162), !dbg !252 + %5164 = insertelement <2 x i32> poison, i32 %72, i64 0, !dbg !218 + %5165 = insertelement <2 x i32> %5164, i32 %71, i64 1, !dbg !218 + %5166 = insertelement <2 x i32> poison, i32 %4876, i64 0, !dbg !218 + %5167 = shufflevector <2 x i32> %5166, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !218 + %5168 = or disjoint <2 x i32> %5165, %5167, !dbg !218 + %5169 = insertelement <8 x i32> poison, i32 %5135, i64 0, !dbg !243 + %5170 = shufflevector <8 x i32> %5169, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !243 + %5171 = or disjoint <8 x i32> %5170, , !dbg !243 + %5172 = insertelement <8 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5173 = shufflevector <8 x i32> %5172, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !244 + %5174 = or disjoint <8 x i32> %5173, %5171, !dbg !244 + %5175 = insertelement <2 x i32> poison, i32 %19, i64 0, !dbg !253 + %5176 = shufflevector <2 x i32> %5175, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !253 + %5177 = srem <2 x i32> %5168, %5176, !dbg !253 + %5178 = shufflevector <2 x i32> %5177, <2 x i32> poison, <16 x i32> , !dbg !253 + %5179 = zext nneg i32 %43 to i64, !dbg !254 + %5180 = getelementptr i64, ptr addrspace(1) %16, i64 %5179, !dbg !254 + %5181 = icmp sgt i32 %5159, 0, !dbg !255 + %5182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %5180, i1 %5181) #3, !dbg !256 + %5183 = extractelement <2 x i32> %5177, i64 1, !dbg !257 + %5184 = icmp sge i32 %5183, %26, !dbg !258 + %5185 = extractelement <2 x i32> %5177, i64 0, !dbg !257 + %5186 = icmp sge i32 %5185, %26, !dbg !258 + %5187 = srem i32 %5183, %26, !dbg !257 + %5188 = srem i32 %5185, %26, !dbg !257 + %.not59 = icmp eq i32 %5187, 0, !dbg !259 + %.not60 = icmp eq i32 %5188, 0, !dbg !259 + %5189 = tail call i32 @llvm.smin.i32(i32 %26, i32 0), !dbg !260 + %5190 = select i1 %.not59, i32 0, i32 %5189, !dbg !260 + %5191 = add nsw i32 %5190, %5187, !dbg !260 + %5192 = select i1 %.not60, i32 0, i32 %5189, !dbg !260 + %5193 = add nsw i32 %5192, %5188, !dbg !260 + %5194 = sext i32 %5191 to i64, !dbg !261 + %5195 = sext i32 %5193 to i64, !dbg !261 + %5196 = icmp sgt i64 %5182, %5194, !dbg !262 + %5197 = icmp sgt i64 %5182, %5195, !dbg !262 + %5198 = and i1 %5184, %5196, !dbg !263 + %5199 = and i1 %5186, %5197, !dbg !263 + %5200 = getelementptr i32, ptr addrspace(1) %15, i64 %5127, !dbg !264 + %5201 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5200, i1 true) #3, !dbg !265 + %5202 = shl i32 %5201, 7, !dbg !266 + %5203 = getelementptr i32, ptr addrspace(1) %14, i64 %5131, !dbg !267 + %5204 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %5203, i1 true) #3, !dbg !268 + %5205 = or disjoint i32 %5202, %5135, !dbg !269 + %5206 = or disjoint i32 %5202, %5136, !dbg !269 + %5207 = or disjoint i32 %5202, %5137, !dbg !269 + %5208 = or disjoint i32 %5202, %5138, !dbg !269 + %5209 = extractelement <4 x i32> %5143, i64 3, !dbg !269 + %5210 = or disjoint i32 %5202, %5209, !dbg !269 + %5211 = extractelement <4 x i32> %5143, i64 2, !dbg !269 + %5212 = or disjoint i32 %5202, %5211, !dbg !269 + %5213 = extractelement <4 x i32> %5143, i64 1, !dbg !269 + %5214 = or disjoint i32 %5202, %5213, !dbg !269 + %5215 = extractelement <4 x i32> %5143, i64 0, !dbg !269 + %5216 = or disjoint i32 %5202, %5215, !dbg !269 + %5217 = extractelement <8 x i32> %5171, i64 7, !dbg !269 + %5218 = or disjoint i32 %5202, %5217, !dbg !269 + %5219 = extractelement <8 x i32> %5171, i64 6, !dbg !269 + %5220 = or disjoint i32 %5202, %5219, !dbg !269 + %5221 = extractelement <8 x i32> %5171, i64 5, !dbg !269 + %5222 = or disjoint i32 %5202, %5221, !dbg !269 + %5223 = extractelement <8 x i32> %5171, i64 4, !dbg !269 + %5224 = or disjoint i32 %5202, %5223, !dbg !269 + %5225 = extractelement <8 x i32> %5171, i64 3, !dbg !269 + %5226 = or disjoint i32 %5202, %5225, !dbg !269 + %5227 = extractelement <8 x i32> %5171, i64 2, !dbg !269 + %5228 = or disjoint i32 %5202, %5227, !dbg !269 + %5229 = extractelement <8 x i32> %5171, i64 1, !dbg !269 + %5230 = or disjoint i32 %5202, %5229, !dbg !269 + %5231 = extractelement <8 x i32> %5171, i64 0, !dbg !269 + %5232 = or disjoint i32 %5202, %5231, !dbg !269 + %5233 = or disjoint i32 %5202, %59, !dbg !269 + %5234 = or disjoint i32 %5202, %60, !dbg !269 + %5235 = or disjoint i32 %5202, %61, !dbg !269 + %5236 = or disjoint i32 %5202, %62, !dbg !269 + %5237 = shl i32 %5233, 12, !dbg !270 + %5238 = shl i32 %5234, 12, !dbg !270 + %5239 = shl i32 %5235, 12, !dbg !270 + %5240 = shl i32 %5236, 12, !dbg !270 + %5241 = shl i32 %5233, 7, !dbg !272 + %5242 = shl i32 %5234, 7, !dbg !272 + %5243 = shl i32 %5235, 7, !dbg !272 + %5244 = shl i32 %5236, 7, !dbg !272 + %5245 = shl i32 %5204, 1, !dbg !273 + %5246 = tail call i32 @llvm.smin.i32(i32 %5245, i32 %5162), !dbg !274 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !275 + %5247 = sext i32 %5151 to i64 + %5248 = sext i32 %5152 to i64 + %5249 = sext i32 %5153 to i64 + %5250 = sext i32 %5154 to i64 + %5251 = sext i32 %5155 to i64 + %5252 = sext i32 %5156 to i64 + %5253 = sext i32 %5157 to i64 + %5254 = sext i32 %5158 to i64 + %5255 = icmp slt i32 %5147, %18 + %5256 = icmp slt i32 %5148, %18 + %5257 = icmp slt i32 %5149, %18 + %5258 = icmp slt i32 %5150, %18 + %5259 = and i1 %5181, %5255 + %5260 = and i1 %5181, %5256 + %5261 = and i1 %5181, %5257 + %5262 = and i1 %5181, %5258 + %5263 = shl nuw nsw i32 %4972, 10 + %5264 = or disjoint i32 %4975, %5263 + %5265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5264 + %5266 = select i1 %5259, i32 16, i32 0 + %5267 = or disjoint i32 %5264, 2048 + %5268 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5267 + %5269 = select i1 %5260, i32 16, i32 0 + %5270 = or disjoint i32 %5264, 4096 + %5271 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5270 + %5272 = select i1 %5261, i32 16, i32 0 + %5273 = or disjoint i32 %5264, 6144 + %5274 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5273 + %5275 = select i1 %5262, i32 16, i32 0 + %5276 = icmp slt i32 %5139, %18 + %5277 = icmp slt i32 %5140, %18 + %5278 = extractelement <4 x i32> %5146, i64 3 + %5279 = icmp slt i32 %5278, %18 + %5280 = extractelement <4 x i32> %5146, i64 2 + %5281 = icmp slt i32 %5280, %18 + %5282 = extractelement <4 x i32> %5146, i64 1 + %5283 = icmp slt i32 %5282, %18 + %5284 = extractelement <4 x i32> %5146, i64 0 + %5285 = icmp slt i32 %5284, %18 + %5286 = extractelement <8 x i32> %5174, i64 7 + %5287 = icmp slt i32 %5286, %18 + %5288 = extractelement <8 x i32> %5174, i64 6 + %5289 = icmp slt i32 %5288, %18 + %5290 = extractelement <8 x i32> %5174, i64 5 + %5291 = icmp slt i32 %5290, %18 + %5292 = extractelement <8 x i32> %5174, i64 4 + %5293 = icmp slt i32 %5292, %18 + %5294 = extractelement <8 x i32> %5174, i64 3 + %5295 = icmp slt i32 %5294, %18 + %5296 = extractelement <8 x i32> %5174, i64 2 + %5297 = icmp slt i32 %5296, %18 + %5298 = extractelement <8 x i32> %5174, i64 1 + %5299 = icmp slt i32 %5298, %18 + %5300 = extractelement <8 x i32> %5174, i64 0 + %5301 = icmp slt i32 %5300, %18 + %5302 = sext i32 %5139 to i64 + %5303 = sext i32 %5140 to i64 + %5304 = sext i32 %5278 to i64 + %5305 = sext i32 %5280 to i64 + %5306 = sext i32 %5282 to i64 + %5307 = sext i32 %5284 to i64 + %5308 = sext i32 %5286 to i64 + %5309 = sext i32 %5288 to i64 + %5310 = sext i32 %5290 to i64 + %5311 = sext i32 %5292 to i64 + %5312 = sext i32 %5294 to i64 + %5313 = sext i32 %5296 to i64 + %5314 = sext i32 %5298 to i64 + %5315 = sext i32 %5300 to i64 + %5316 = and i1 %5181, %5276 + %5317 = and i1 %5181, %5277 + %5318 = and i1 %5181, %5279 + %5319 = and i1 %5181, %5281 + %5320 = and i1 %5181, %5283 + %5321 = and i1 %5181, %5285 + %5322 = and i1 %5181, %5287 + %5323 = and i1 %5181, %5289 + %5324 = and i1 %5181, %5291 + %5325 = and i1 %5181, %5293 + %5326 = and i1 %5181, %5295 + %5327 = and i1 %5181, %5297 + %5328 = and i1 %5181, %5299 + %5329 = and i1 %5181, %5301 + %5330 = and i32 %56, 252 + %5331 = icmp eq i32 %5330, 0 + %5332 = shl nuw nsw i32 %5134, 3 + %5333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5332 + %5334 = or disjoint i32 %5332, 4 + %5335 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5334 + %5336 = or disjoint i32 %5332, 32 + %5337 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5336 + %5338 = select i1 %5316, i32 4, i32 0 + %5339 = or disjoint i32 %5332, 36 + %5340 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5339 + %5341 = select i1 %5317, i32 4, i32 0 + %5342 = or disjoint i32 %5332, 64 + %5343 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5342 + %5344 = select i1 %5318, i32 4, i32 0 + %5345 = or disjoint i32 %5332, 68 + %5346 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5345 + %5347 = select i1 %5319, i32 4, i32 0 + %5348 = or disjoint i32 %5332, 96 + %5349 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5348 + %5350 = select i1 %5320, i32 4, i32 0 + %5351 = or disjoint i32 %5332, 100 + %5352 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5351 + %5353 = select i1 %5321, i32 4, i32 0 + %5354 = or disjoint i32 %5332, 128 + %5355 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5354 + %5356 = select i1 %5322, i32 4, i32 0 + %5357 = or disjoint i32 %5332, 132 + %5358 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5357 + %5359 = select i1 %5323, i32 4, i32 0 + %5360 = or disjoint i32 %5332, 160 + %5361 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5360 + %5362 = select i1 %5324, i32 4, i32 0 + %5363 = or disjoint i32 %5332, 164 + %5364 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5363 + %5365 = select i1 %5325, i32 4, i32 0 + %5366 = or disjoint i32 %5332, 192 + %5367 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5366 + %5368 = select i1 %5326, i32 4, i32 0 + %5369 = or disjoint i32 %5332, 196 + %5370 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5369 + %5371 = select i1 %5327, i32 4, i32 0 + %5372 = or disjoint i32 %5332, 224 + %5373 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5372 + %5374 = select i1 %5328, i32 4, i32 0 + %5375 = or disjoint i32 %5332, 228 + %5376 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5375 + %5377 = select i1 %5329, i32 4, i32 0 + %5378 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5264 + %5379 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5267 + %5380 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5270 + %5381 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5273 + %5382 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5332 + %5383 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5334 + %5384 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5336 + %5385 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5339 + %5386 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5342 + %5387 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5345 + %5388 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5348 + %5389 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5351 + %5390 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5354 + %5391 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5357 + %5392 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5360 + %5393 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5363 + %5394 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5366 + %5395 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5369 + %5396 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5372 + %5397 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5375 + %5398 = icmp sgt i32 %5163, 1 + %5399 = insertelement <2 x i32> poison, i32 %5130, i64 0, !dbg !244 + %5400 = shufflevector <2 x i32> %5399, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !244 + %5401 = insertelement <2 x i32> poison, i32 %5135, i64 0, !dbg !244 + %5402 = insertelement <2 x i32> %5401, i32 %5136, i64 1, !dbg !244 + %5403 = or disjoint <2 x i32> %5400, %5402, !dbg !244 + %5404 = extractelement <2 x i32> %5403, i64 0 + %5405 = icmp slt i32 %5404, %18 + %5406 = extractelement <2 x i32> %5403, i64 1 + %5407 = icmp slt i32 %5406, %18 + %5408 = sext i32 %5404 to i64 + %5409 = sext i32 %5406 to i64 + %5410 = and i1 %5181, %5405 + %5411 = and i1 %5181, %5407 + %5412 = select i1 %5410, i32 4, i32 0 + %5413 = select i1 %5411, i32 4, i32 0 + %5414 = or disjoint <2 x i32> %5403, splat (i32 64) + %5415 = or disjoint i32 %5139, 64 + %5416 = or disjoint i32 %5140, 64 + %5417 = or disjoint i32 %5278, 64 + %5418 = or disjoint i32 %5280, 64 + %5419 = or disjoint i32 %5282, 64 + %5420 = or disjoint i32 %5284, 64 + %5421 = or disjoint i32 %5286, 64 + %5422 = or disjoint i32 %5288, 64 + %5423 = or disjoint i32 %5290, 64 + %5424 = or disjoint i32 %5292, 64 + %5425 = or disjoint i32 %5294, 64 + %5426 = or disjoint i32 %5296, 64 + %5427 = or disjoint i32 %5298, 64 + %5428 = or disjoint i32 %5300, 64 + %5429 = or disjoint i32 %5147, 64 + %5430 = or disjoint i32 %5148, 64 + %5431 = or disjoint i32 %5149, 64 + %5432 = or disjoint i32 %5150, 64 + %5433 = icmp slt i32 %5429, %18 + %5434 = icmp slt i32 %5430, %18 + %5435 = icmp slt i32 %5431, %18 + %5436 = icmp slt i32 %5432, %18 + %5437 = and i1 %5398, %5433 + %5438 = and i1 %5398, %5434 + %5439 = and i1 %5398, %5435 + %5440 = and i1 %5398, %5436 + %5441 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5264 + %5442 = select i1 %5437, i32 16, i32 0 + %5443 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5267 + %5444 = select i1 %5438, i32 16, i32 0 + %5445 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5270 + %5446 = select i1 %5439, i32 16, i32 0 + %5447 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5273 + %5448 = select i1 %5440, i32 16, i32 0 + %5449 = extractelement <2 x i32> %5414, i64 0 + %5450 = icmp slt i32 %5449, %18 + %5451 = extractelement <2 x i32> %5414, i64 1 + %5452 = icmp slt i32 %5451, %18 + %5453 = icmp slt i32 %5415, %18 + %5454 = icmp slt i32 %5416, %18 + %5455 = icmp slt i32 %5417, %18 + %5456 = icmp slt i32 %5418, %18 + %5457 = icmp slt i32 %5419, %18 + %5458 = icmp slt i32 %5420, %18 + %5459 = icmp slt i32 %5421, %18 + %5460 = icmp slt i32 %5422, %18 + %5461 = icmp slt i32 %5423, %18 + %5462 = icmp slt i32 %5424, %18 + %5463 = icmp slt i32 %5425, %18 + %5464 = icmp slt i32 %5426, %18 + %5465 = icmp slt i32 %5427, %18 + %5466 = icmp slt i32 %5428, %18 + %5467 = sext i32 %5449 to i64 + %5468 = sext i32 %5451 to i64 + %5469 = sext i32 %5415 to i64 + %5470 = sext i32 %5416 to i64 + %5471 = sext i32 %5417 to i64 + %5472 = sext i32 %5418 to i64 + %5473 = sext i32 %5419 to i64 + %5474 = sext i32 %5420 to i64 + %5475 = sext i32 %5421 to i64 + %5476 = sext i32 %5422 to i64 + %5477 = sext i32 %5423 to i64 + %5478 = sext i32 %5424 to i64 + %5479 = sext i32 %5425 to i64 + %5480 = sext i32 %5426 to i64 + %5481 = sext i32 %5427 to i64 + %5482 = sext i32 %5428 to i64 + %5483 = and i1 %5398, %5450 + %5484 = and i1 %5398, %5452 + %5485 = and i1 %5398, %5453 + %5486 = and i1 %5398, %5454 + %5487 = and i1 %5398, %5455 + %5488 = and i1 %5398, %5456 + %5489 = and i1 %5398, %5457 + %5490 = and i1 %5398, %5458 + %5491 = and i1 %5398, %5459 + %5492 = and i1 %5398, %5460 + %5493 = and i1 %5398, %5461 + %5494 = and i1 %5398, %5462 + %5495 = and i1 %5398, %5463 + %5496 = and i1 %5398, %5464 + %5497 = and i1 %5398, %5465 + %5498 = and i1 %5398, %5466 + %5499 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5332 + %5500 = select i1 %5483, i32 4, i32 0 + %5501 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5334 + %5502 = select i1 %5484, i32 4, i32 0 + %5503 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5336 + %5504 = select i1 %5485, i32 4, i32 0 + %5505 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5339 + %5506 = select i1 %5486, i32 4, i32 0 + %5507 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5342 + %5508 = select i1 %5487, i32 4, i32 0 + %5509 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5345 + %5510 = select i1 %5488, i32 4, i32 0 + %5511 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5348 + %5512 = select i1 %5489, i32 4, i32 0 + %5513 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5351 + %5514 = select i1 %5490, i32 4, i32 0 + %5515 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5354 + %5516 = select i1 %5491, i32 4, i32 0 + %5517 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5357 + %5518 = select i1 %5492, i32 4, i32 0 + %5519 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5360 + %5520 = select i1 %5493, i32 4, i32 0 + %5521 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5363 + %5522 = select i1 %5494, i32 4, i32 0 + %5523 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5366 + %5524 = select i1 %5495, i32 4, i32 0 + %5525 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5369 + %5526 = select i1 %5496, i32 4, i32 0 + %5527 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5372 + %5528 = select i1 %5497, i32 4, i32 0 + %5529 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %5375 + %5530 = select i1 %5498, i32 4, i32 0 + %5531 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5264 + %5532 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5267 + %5533 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5270 + %5534 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5273 + %5535 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5332 + %5536 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5334 + %5537 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5336 + %5538 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5339 + %5539 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5342 + %5540 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5345 + %5541 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5348 + %5542 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5351 + %5543 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5354 + %5544 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5357 + %5545 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5360 + %5546 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5363 + %5547 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5366 + %5548 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5369 + %5549 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5372 + %5550 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %5375 + %5551 = add i32 %5163, -2 + %5552 = add nsw i32 %5163, -1 + %5553 = sext i32 %5237 to i64 + %5554 = sext i32 %5238 to i64 + %5555 = sext i32 %5239 to i64 + %5556 = sext i32 %5240 to i64 + %5557 = sext i32 %5241 to i64 + %5558 = sext i32 %5242 to i64 + %5559 = sext i32 %5243 to i64 + %5560 = sext i32 %5244 to i64 + %5561 = icmp sgt i32 %5245, 0 + %5562 = icmp slt i32 %5233, %18 + %5563 = icmp slt i32 %5234, %18 + %5564 = icmp slt i32 %5235, %18 + %5565 = icmp slt i32 %5236, %18 + %5566 = and i1 %5561, %5562 + %5567 = and i1 %5561, %5563 + %5568 = and i1 %5561, %5564 + %5569 = and i1 %5561, %5565 + %5570 = select i1 %5566, i32 16, i32 0 + %5571 = select i1 %5567, i32 16, i32 0 + %5572 = select i1 %5568, i32 16, i32 0 + %5573 = select i1 %5569, i32 16, i32 0 + %5574 = icmp slt i32 %5205, %18 + %5575 = icmp slt i32 %5206, %18 + %5576 = icmp slt i32 %5207, %18 + %5577 = icmp slt i32 %5208, %18 + %5578 = icmp slt i32 %5210, %18 + %5579 = icmp slt i32 %5212, %18 + %5580 = icmp slt i32 %5214, %18 + %5581 = icmp slt i32 %5216, %18 + %5582 = icmp slt i32 %5218, %18 + %5583 = icmp slt i32 %5220, %18 + %5584 = icmp slt i32 %5222, %18 + %5585 = icmp slt i32 %5224, %18 + %5586 = icmp slt i32 %5226, %18 + %5587 = icmp slt i32 %5228, %18 + %5588 = icmp slt i32 %5230, %18 + %5589 = icmp slt i32 %5232, %18 + %5590 = sext i32 %5205 to i64 + %5591 = sext i32 %5206 to i64 + %5592 = sext i32 %5207 to i64 + %5593 = sext i32 %5208 to i64 + %5594 = sext i32 %5210 to i64 + %5595 = sext i32 %5212 to i64 + %5596 = sext i32 %5214 to i64 + %5597 = sext i32 %5216 to i64 + %5598 = sext i32 %5218 to i64 + %5599 = sext i32 %5220 to i64 + %5600 = sext i32 %5222 to i64 + %5601 = sext i32 %5224 to i64 + %5602 = sext i32 %5226 to i64 + %5603 = sext i32 %5228 to i64 + %5604 = sext i32 %5230 to i64 + %5605 = sext i32 %5232 to i64 + %5606 = and i1 %5561, %5574 + %5607 = and i1 %5561, %5575 + %5608 = and i1 %5561, %5576 + %5609 = and i1 %5561, %5577 + %5610 = and i1 %5561, %5578 + %5611 = and i1 %5561, %5579 + %5612 = and i1 %5561, %5580 + %5613 = and i1 %5561, %5581 + %5614 = and i1 %5561, %5582 + %5615 = and i1 %5561, %5583 + %5616 = and i1 %5561, %5584 + %5617 = and i1 %5561, %5585 + %5618 = and i1 %5561, %5586 + %5619 = and i1 %5561, %5587 + %5620 = and i1 %5561, %5588 + %5621 = and i1 %5561, %5589 + %5622 = select i1 %5606, i32 4, i32 0 + %5623 = select i1 %5607, i32 4, i32 0 + %5624 = select i1 %5608, i32 4, i32 0 + %5625 = select i1 %5609, i32 4, i32 0 + %5626 = select i1 %5610, i32 4, i32 0 + %5627 = select i1 %5611, i32 4, i32 0 + %5628 = select i1 %5612, i32 4, i32 0 + %5629 = select i1 %5613, i32 4, i32 0 + %5630 = select i1 %5614, i32 4, i32 0 + %5631 = select i1 %5615, i32 4, i32 0 + %5632 = select i1 %5616, i32 4, i32 0 + %5633 = select i1 %5617, i32 4, i32 0 + %5634 = select i1 %5618, i32 4, i32 0 + %5635 = select i1 %5619, i32 4, i32 0 + %5636 = select i1 %5620, i32 4, i32 0 + %5637 = select i1 %5621, i32 4, i32 0 + %5638 = icmp sgt i32 %5246, 1 + %5639 = or disjoint i32 %5205, 64 + %5640 = or disjoint i32 %5206, 64 + %5641 = or disjoint i32 %5207, 64 + %5642 = or disjoint i32 %5208, 64 + %5643 = or disjoint i32 %5210, 64 + %5644 = or disjoint i32 %5212, 64 + %5645 = or disjoint i32 %5214, 64 + %5646 = or disjoint i32 %5216, 64 + %5647 = or disjoint i32 %5218, 64 + %5648 = or disjoint i32 %5220, 64 + %5649 = or disjoint i32 %5222, 64 + %5650 = or disjoint i32 %5224, 64 + %5651 = or disjoint i32 %5226, 64 + %5652 = or disjoint i32 %5228, 64 + %5653 = or disjoint i32 %5230, 64 + %5654 = or disjoint i32 %5232, 64 + %5655 = or disjoint i32 %5233, 64 + %5656 = or disjoint i32 %5234, 64 + %5657 = or disjoint i32 %5235, 64 + %5658 = or disjoint i32 %5236, 64 + %5659 = icmp slt i32 %5655, %18 + %5660 = icmp slt i32 %5656, %18 + %5661 = icmp slt i32 %5657, %18 + %5662 = icmp slt i32 %5658, %18 + %5663 = and i1 %5638, %5659 + %5664 = and i1 %5638, %5660 + %5665 = and i1 %5638, %5661 + %5666 = and i1 %5638, %5662 + %5667 = select i1 %5663, i32 16, i32 0 + %5668 = select i1 %5664, i32 16, i32 0 + %5669 = select i1 %5665, i32 16, i32 0 + %5670 = select i1 %5666, i32 16, i32 0 + %5671 = icmp slt i32 %5639, %18 + %5672 = icmp slt i32 %5640, %18 + %5673 = icmp slt i32 %5641, %18 + %5674 = icmp slt i32 %5642, %18 + %5675 = icmp slt i32 %5643, %18 + %5676 = icmp slt i32 %5644, %18 + %5677 = icmp slt i32 %5645, %18 + %5678 = icmp slt i32 %5646, %18 + %5679 = icmp slt i32 %5647, %18 + %5680 = icmp slt i32 %5648, %18 + %5681 = icmp slt i32 %5649, %18 + %5682 = icmp slt i32 %5650, %18 + %5683 = icmp slt i32 %5651, %18 + %5684 = icmp slt i32 %5652, %18 + %5685 = icmp slt i32 %5653, %18 + %5686 = icmp slt i32 %5654, %18 + %5687 = sext i32 %5639 to i64 + %5688 = sext i32 %5640 to i64 + %5689 = sext i32 %5641 to i64 + %5690 = sext i32 %5642 to i64 + %5691 = sext i32 %5643 to i64 + %5692 = sext i32 %5644 to i64 + %5693 = sext i32 %5645 to i64 + %5694 = sext i32 %5646 to i64 + %5695 = sext i32 %5647 to i64 + %5696 = sext i32 %5648 to i64 + %5697 = sext i32 %5649 to i64 + %5698 = sext i32 %5650 to i64 + %5699 = sext i32 %5651 to i64 + %5700 = sext i32 %5652 to i64 + %5701 = sext i32 %5653 to i64 + %5702 = sext i32 %5654 to i64 + %5703 = and i1 %5638, %5671 + %5704 = and i1 %5638, %5672 + %5705 = and i1 %5638, %5673 + %5706 = and i1 %5638, %5674 + %5707 = and i1 %5638, %5675 + %5708 = and i1 %5638, %5676 + %5709 = and i1 %5638, %5677 + %5710 = and i1 %5638, %5678 + %5711 = and i1 %5638, %5679 + %5712 = and i1 %5638, %5680 + %5713 = and i1 %5638, %5681 + %5714 = and i1 %5638, %5682 + %5715 = and i1 %5638, %5683 + %5716 = and i1 %5638, %5684 + %5717 = and i1 %5638, %5685 + %5718 = and i1 %5638, %5686 + %5719 = select i1 %5703, i32 4, i32 0 + %5720 = select i1 %5704, i32 4, i32 0 + %5721 = select i1 %5705, i32 4, i32 0 + %5722 = select i1 %5706, i32 4, i32 0 + %5723 = select i1 %5707, i32 4, i32 0 + %5724 = select i1 %5708, i32 4, i32 0 + %5725 = select i1 %5709, i32 4, i32 0 + %5726 = select i1 %5710, i32 4, i32 0 + %5727 = select i1 %5711, i32 4, i32 0 + %5728 = select i1 %5712, i32 4, i32 0 + %5729 = select i1 %5713, i32 4, i32 0 + %5730 = select i1 %5714, i32 4, i32 0 + %5731 = select i1 %5715, i32 4, i32 0 + %5732 = select i1 %5716, i32 4, i32 0 + %5733 = select i1 %5717, i32 4, i32 0 + %5734 = select i1 %5718, i32 4, i32 0 + %5735 = add i32 %5246, -2 + %5736 = add nsw i32 %5246, -1 + %smax2185 = tail call i32 @llvm.smax.i32(i32 %5163, i32 1), !dbg !276 + %smax2187 = tail call i32 @llvm.smax.i32(i32 %5246, i32 1), !dbg !276 + %5737 = zext nneg i32 %5120 to i64, !dbg !276 + %5738 = insertelement <2 x i32> poison, i32 %18, i64 0 + %5739 = shufflevector <2 x i32> %5738, <2 x i32> poison, <2 x i32> zeroinitializer + %5740 = insertelement <4 x i32> poison, i32 %18, i64 0 + %5741 = shufflevector <4 x i32> %5740, <4 x i32> poison, <8 x i32> zeroinitializer + %5742 = shufflevector <2 x i32> %5177, <2 x i32> poison, <8 x i32> + %5743 = insertelement <8 x i32> poison, i32 %26, i64 0 + %5744 = shufflevector <8 x i32> %5743, <8 x i32> poison, <8 x i32> zeroinitializer + %5745 = insertelement <8 x i32> poison, i32 %18, i64 0 + %5746 = shufflevector <8 x i32> %5745, <8 x i32> poison, <8 x i32> zeroinitializer + %5747 = insertelement <8 x i64> poison, i64 %5182, i64 0 + %5748 = shufflevector <8 x i64> %5747, <8 x i64> poison, <8 x i32> zeroinitializer + %5749 = insertelement <16 x i32> poison, i32 %26, i64 0 + %5750 = shufflevector <16 x i32> %5749, <16 x i32> poison, <16 x i32> zeroinitializer + %5751 = shufflevector <2 x i32> %5177, <2 x i32> poison, <2 x i32> zeroinitializer + %5752 = insertelement <2 x i64> poison, i64 %5182, i64 0 + %5753 = shufflevector <2 x i64> %5752, <2 x i64> poison, <2 x i32> zeroinitializer + %5754 = insertelement <2 x i32> poison, i32 %26, i64 0 + %5755 = shufflevector <2 x i32> %5754, <2 x i32> poison, <2 x i32> zeroinitializer + %5756 = insertelement <2 x i1> poison, i1 %5199, i64 0 + %5757 = shufflevector <2 x i1> %5756, <2 x i1> poison, <2 x i32> zeroinitializer + %5758 = shufflevector <2 x i32> %5177, <2 x i32> poison, <2 x i32> + %5759 = insertelement <2 x i1> poison, i1 %5198, i64 0 + %5760 = shufflevector <2 x i1> %5759, <2 x i1> poison, <2 x i32> zeroinitializer + br label %5761, !dbg !276 + +5761: ; preds = %4875, %._crit_edge1794 + %indvars.iv = phi i64 [ 0, %4875 ], [ %indvars.iv.next, %._crit_edge1794 ] + %5762 = phi float [ 0.000000e+00, %4875 ], [ %11300, %._crit_edge1794 ] + %5763 = phi float [ 0.000000e+00, %4875 ], [ %11301, %._crit_edge1794 ] + %5764 = phi float [ 0.000000e+00, %4875 ], [ %11302, %._crit_edge1794 ] + %5765 = phi float [ 0.000000e+00, %4875 ], [ %11303, %._crit_edge1794 ] + %5766 = phi float [ 0.000000e+00, %4875 ], [ %11304, %._crit_edge1794 ] + %5767 = phi float [ 0.000000e+00, %4875 ], [ %11305, %._crit_edge1794 ] + %5768 = phi float [ 0.000000e+00, %4875 ], [ %11306, %._crit_edge1794 ] + %5769 = phi float [ 0.000000e+00, %4875 ], [ %11307, %._crit_edge1794 ] + %5770 = phi float [ 0.000000e+00, %4875 ], [ %11308, %._crit_edge1794 ] + %5771 = phi float [ 0.000000e+00, %4875 ], [ %11309, %._crit_edge1794 ] + %5772 = phi float [ 0.000000e+00, %4875 ], [ %11310, %._crit_edge1794 ] + %5773 = phi float [ 0.000000e+00, %4875 ], [ %11311, %._crit_edge1794 ] + %5774 = phi float [ 0.000000e+00, %4875 ], [ %11312, %._crit_edge1794 ] + %5775 = phi float [ 0.000000e+00, %4875 ], [ %11313, %._crit_edge1794 ] + %5776 = phi float [ 0.000000e+00, %4875 ], [ %11314, %._crit_edge1794 ] + %5777 = phi float [ 0.000000e+00, %4875 ], [ %11315, %._crit_edge1794 ] + %5778 = phi float [ 0.000000e+00, %4875 ], [ %11316, %._crit_edge1794 ] + %5779 = phi float [ 0.000000e+00, %4875 ], [ %11317, %._crit_edge1794 ] + %5780 = phi float [ 0.000000e+00, %4875 ], [ %11318, %._crit_edge1794 ] + %5781 = phi float [ 0.000000e+00, %4875 ], [ %11319, %._crit_edge1794 ] + %5782 = phi float [ 0.000000e+00, %4875 ], [ %11320, %._crit_edge1794 ] + %5783 = phi float [ 0.000000e+00, %4875 ], [ %11321, %._crit_edge1794 ] + %5784 = phi float [ 0.000000e+00, %4875 ], [ %11322, %._crit_edge1794 ] + %5785 = phi float [ 0.000000e+00, %4875 ], [ %11323, %._crit_edge1794 ] + %5786 = phi float [ 0.000000e+00, %4875 ], [ %11324, %._crit_edge1794 ] + %5787 = phi float [ 0.000000e+00, %4875 ], [ %11325, %._crit_edge1794 ] + %5788 = phi float [ 0.000000e+00, %4875 ], [ %11326, %._crit_edge1794 ] + %5789 = phi float [ 0.000000e+00, %4875 ], [ %11327, %._crit_edge1794 ] + %5790 = phi float [ 0.000000e+00, %4875 ], [ %11328, %._crit_edge1794 ] + %5791 = phi float [ 0.000000e+00, %4875 ], [ %11329, %._crit_edge1794 ] + %5792 = phi float [ 0.000000e+00, %4875 ], [ %11330, %._crit_edge1794 ] + %5793 = phi float [ 0.000000e+00, %4875 ], [ %11331, %._crit_edge1794 ] + %5794 = phi float [ 0.000000e+00, %4875 ], [ %11332, %._crit_edge1794 ] + %5795 = phi float [ 0.000000e+00, %4875 ], [ %11333, %._crit_edge1794 ] + %5796 = phi float [ 0.000000e+00, %4875 ], [ %11334, %._crit_edge1794 ] + %5797 = phi float [ 0.000000e+00, %4875 ], [ %11335, %._crit_edge1794 ] + %5798 = phi float [ 0.000000e+00, %4875 ], [ %11336, %._crit_edge1794 ] + %5799 = phi float [ 0.000000e+00, %4875 ], [ %11337, %._crit_edge1794 ] + %5800 = phi float [ 0.000000e+00, %4875 ], [ %11338, %._crit_edge1794 ] + %5801 = phi float [ 0.000000e+00, %4875 ], [ %11339, %._crit_edge1794 ] + %5802 = phi float [ 0.000000e+00, %4875 ], [ %11340, %._crit_edge1794 ] + %5803 = phi float [ 0.000000e+00, %4875 ], [ %11341, %._crit_edge1794 ] + %5804 = phi float [ 0.000000e+00, %4875 ], [ %11342, %._crit_edge1794 ] + %5805 = phi float [ 0.000000e+00, %4875 ], [ %11343, %._crit_edge1794 ] + %5806 = phi float [ 0.000000e+00, %4875 ], [ %11344, %._crit_edge1794 ] + %5807 = phi float [ 0.000000e+00, %4875 ], [ %11345, %._crit_edge1794 ] + %5808 = phi float [ 0.000000e+00, %4875 ], [ %11346, %._crit_edge1794 ] + %5809 = phi float [ 0.000000e+00, %4875 ], [ %11347, %._crit_edge1794 ] + %5810 = phi float [ 0.000000e+00, %4875 ], [ %11348, %._crit_edge1794 ] + %5811 = phi float [ 0.000000e+00, %4875 ], [ %11349, %._crit_edge1794 ] + %5812 = phi float [ 0.000000e+00, %4875 ], [ %11350, %._crit_edge1794 ] + %5813 = phi float [ 0.000000e+00, %4875 ], [ %11351, %._crit_edge1794 ] + %5814 = phi float [ 0.000000e+00, %4875 ], [ %11352, %._crit_edge1794 ] + %5815 = phi float [ 0.000000e+00, %4875 ], [ %11353, %._crit_edge1794 ] + %5816 = phi float [ 0.000000e+00, %4875 ], [ %11354, %._crit_edge1794 ] + %5817 = phi float [ 0.000000e+00, %4875 ], [ %11355, %._crit_edge1794 ] + %5818 = phi float [ 0.000000e+00, %4875 ], [ %11356, %._crit_edge1794 ] + %5819 = phi float [ 0.000000e+00, %4875 ], [ %11357, %._crit_edge1794 ] + %5820 = phi float [ 0.000000e+00, %4875 ], [ %11358, %._crit_edge1794 ] + %5821 = phi float [ 0.000000e+00, %4875 ], [ %11359, %._crit_edge1794 ] + %5822 = phi float [ 0.000000e+00, %4875 ], [ %11360, %._crit_edge1794 ] + %5823 = phi float [ 0.000000e+00, %4875 ], [ %11361, %._crit_edge1794 ] + %5824 = phi float [ 0.000000e+00, %4875 ], [ %11362, %._crit_edge1794 ] + %5825 = phi float [ 0.000000e+00, %4875 ], [ %11363, %._crit_edge1794 ] + %5826 = phi float [ 0.000000e+00, %4875 ], [ %11236, %._crit_edge1794 ] + %5827 = phi float [ 0.000000e+00, %4875 ], [ %11237, %._crit_edge1794 ] + %5828 = phi float [ 0.000000e+00, %4875 ], [ %11238, %._crit_edge1794 ] + %5829 = phi float [ 0.000000e+00, %4875 ], [ %11239, %._crit_edge1794 ] + %5830 = phi float [ 0.000000e+00, %4875 ], [ %11240, %._crit_edge1794 ] + %5831 = phi float [ 0.000000e+00, %4875 ], [ %11241, %._crit_edge1794 ] + %5832 = phi float [ 0.000000e+00, %4875 ], [ %11242, %._crit_edge1794 ] + %5833 = phi float [ 0.000000e+00, %4875 ], [ %11243, %._crit_edge1794 ] + %5834 = phi float [ 0.000000e+00, %4875 ], [ %11244, %._crit_edge1794 ] + %5835 = phi float [ 0.000000e+00, %4875 ], [ %11245, %._crit_edge1794 ] + %5836 = phi float [ 0.000000e+00, %4875 ], [ %11246, %._crit_edge1794 ] + %5837 = phi float [ 0.000000e+00, %4875 ], [ %11247, %._crit_edge1794 ] + %5838 = phi float [ 0.000000e+00, %4875 ], [ %11248, %._crit_edge1794 ] + %5839 = phi float [ 0.000000e+00, %4875 ], [ %11249, %._crit_edge1794 ] + %5840 = phi float [ 0.000000e+00, %4875 ], [ %11250, %._crit_edge1794 ] + %5841 = phi float [ 0.000000e+00, %4875 ], [ %11251, %._crit_edge1794 ] + %5842 = phi float [ 0.000000e+00, %4875 ], [ %11252, %._crit_edge1794 ] + %5843 = phi float [ 0.000000e+00, %4875 ], [ %11253, %._crit_edge1794 ] + %5844 = phi float [ 0.000000e+00, %4875 ], [ %11254, %._crit_edge1794 ] + %5845 = phi float [ 0.000000e+00, %4875 ], [ %11255, %._crit_edge1794 ] + %5846 = phi float [ 0.000000e+00, %4875 ], [ %11256, %._crit_edge1794 ] + %5847 = phi float [ 0.000000e+00, %4875 ], [ %11257, %._crit_edge1794 ] + %5848 = phi float [ 0.000000e+00, %4875 ], [ %11258, %._crit_edge1794 ] + %5849 = phi float [ 0.000000e+00, %4875 ], [ %11259, %._crit_edge1794 ] + %5850 = phi float [ 0.000000e+00, %4875 ], [ %11260, %._crit_edge1794 ] + %5851 = phi float [ 0.000000e+00, %4875 ], [ %11261, %._crit_edge1794 ] + %5852 = phi float [ 0.000000e+00, %4875 ], [ %11262, %._crit_edge1794 ] + %5853 = phi float [ 0.000000e+00, %4875 ], [ %11263, %._crit_edge1794 ] + %5854 = phi float [ 0.000000e+00, %4875 ], [ %11264, %._crit_edge1794 ] + %5855 = phi float [ 0.000000e+00, %4875 ], [ %11265, %._crit_edge1794 ] + %5856 = phi float [ 0.000000e+00, %4875 ], [ %11266, %._crit_edge1794 ] + %5857 = phi float [ 0.000000e+00, %4875 ], [ %11267, %._crit_edge1794 ] + %5858 = phi float [ 0.000000e+00, %4875 ], [ %11268, %._crit_edge1794 ] + %5859 = phi float [ 0.000000e+00, %4875 ], [ %11269, %._crit_edge1794 ] + %5860 = phi float [ 0.000000e+00, %4875 ], [ %11270, %._crit_edge1794 ] + %5861 = phi float [ 0.000000e+00, %4875 ], [ %11271, %._crit_edge1794 ] + %5862 = phi float [ 0.000000e+00, %4875 ], [ %11272, %._crit_edge1794 ] + %5863 = phi float [ 0.000000e+00, %4875 ], [ %11273, %._crit_edge1794 ] + %5864 = phi float [ 0.000000e+00, %4875 ], [ %11274, %._crit_edge1794 ] + %5865 = phi float [ 0.000000e+00, %4875 ], [ %11275, %._crit_edge1794 ] + %5866 = phi float [ 0.000000e+00, %4875 ], [ %11276, %._crit_edge1794 ] + %5867 = phi float [ 0.000000e+00, %4875 ], [ %11277, %._crit_edge1794 ] + %5868 = phi float [ 0.000000e+00, %4875 ], [ %11278, %._crit_edge1794 ] + %5869 = phi float [ 0.000000e+00, %4875 ], [ %11279, %._crit_edge1794 ] + %5870 = phi float [ 0.000000e+00, %4875 ], [ %11280, %._crit_edge1794 ] + %5871 = phi float [ 0.000000e+00, %4875 ], [ %11281, %._crit_edge1794 ] + %5872 = phi float [ 0.000000e+00, %4875 ], [ %11282, %._crit_edge1794 ] + %5873 = phi float [ 0.000000e+00, %4875 ], [ %11283, %._crit_edge1794 ] + %5874 = phi float [ 0.000000e+00, %4875 ], [ %11284, %._crit_edge1794 ] + %5875 = phi float [ 0.000000e+00, %4875 ], [ %11285, %._crit_edge1794 ] + %5876 = phi float [ 0.000000e+00, %4875 ], [ %11286, %._crit_edge1794 ] + %5877 = phi float [ 0.000000e+00, %4875 ], [ %11287, %._crit_edge1794 ] + %5878 = phi float [ 0.000000e+00, %4875 ], [ %11288, %._crit_edge1794 ] + %5879 = phi float [ 0.000000e+00, %4875 ], [ %11289, %._crit_edge1794 ] + %5880 = phi float [ 0.000000e+00, %4875 ], [ %11290, %._crit_edge1794 ] + %5881 = phi float [ 0.000000e+00, %4875 ], [ %11291, %._crit_edge1794 ] + %5882 = phi float [ 0.000000e+00, %4875 ], [ %11292, %._crit_edge1794 ] + %5883 = phi float [ 0.000000e+00, %4875 ], [ %11293, %._crit_edge1794 ] + %5884 = phi float [ 0.000000e+00, %4875 ], [ %11294, %._crit_edge1794 ] + %5885 = phi float [ 0.000000e+00, %4875 ], [ %11295, %._crit_edge1794 ] + %5886 = phi float [ 0.000000e+00, %4875 ], [ %11296, %._crit_edge1794 ] + %5887 = phi float [ 0.000000e+00, %4875 ], [ %11297, %._crit_edge1794 ] + %5888 = phi float [ 0.000000e+00, %4875 ], [ %11298, %._crit_edge1794 ] + %5889 = phi float [ 0.000000e+00, %4875 ], [ %11299, %._crit_edge1794 ] + %5890 = add nuw nsw i64 %indvars.iv, %5737, !dbg !277 + %.tr = trunc i64 %5890 to i32, !dbg !278 + %5891 = shl i32 %.tr, 7, !dbg !278 + %5892 = add i32 %5891, %5121, !dbg !278 + %5893 = sext i32 %5892 to i64, !dbg !279 + %5894 = trunc nuw nsw i64 %5890 to i32, !dbg !280 + %5895 = mul i32 %39, %5894, !dbg !280 + %5896 = add i32 %5895, %5122, !dbg !281 + %5897 = sext i32 %5896 to i64, !dbg !282 + %5898 = trunc i64 %5890 to i32, !dbg !283 + %5899 = add i32 %5123, %5898, !dbg !283 + %5900 = mul i32 %5899, %18, !dbg !283 + %5901 = sext i32 %5900 to i64, !dbg !284 + %5902 = getelementptr bfloat, ptr addrspace(1) %0, i64 %5893, !dbg !285 + %5903 = getelementptr bfloat, ptr addrspace(1) %5, i64 %5897, !dbg !286 + %5904 = getelementptr float, ptr addrspace(1) %3, i64 %5901, !dbg !287 + %5905 = getelementptr float, ptr addrspace(1) %4, i64 %5901, !dbg !288 + %5906 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5247, !dbg !289 + %5907 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5248, !dbg !289 + %5908 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5249, !dbg !289 + %5909 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5250, !dbg !289 + %5910 = getelementptr bfloat, ptr addrspace(1) %5906, i64 %4911, !dbg !290 + %5911 = getelementptr bfloat, ptr addrspace(1) %5907, i64 %4911, !dbg !290 + %5912 = getelementptr bfloat, ptr addrspace(1) %5908, i64 %4911, !dbg !290 + %5913 = getelementptr bfloat, ptr addrspace(1) %5909, i64 %4911, !dbg !290 + %5914 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5251, !dbg !291 + %5915 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5252, !dbg !291 + %5916 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5253, !dbg !291 + %5917 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5254, !dbg !291 + %5918 = getelementptr bfloat, ptr addrspace(1) %5914, i64 %4911, !dbg !292 + %5919 = getelementptr bfloat, ptr addrspace(1) %5915, i64 %4911, !dbg !292 + %5920 = getelementptr bfloat, ptr addrspace(1) %5916, i64 %4911, !dbg !292 + %5921 = getelementptr bfloat, ptr addrspace(1) %5917, i64 %4911, !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5265, ptr addrspace(1) %5910, i32 %5266) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5268, ptr addrspace(1) %5911, i32 %5269) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5271, ptr addrspace(1) %5912, i32 %5272) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5274, ptr addrspace(1) %5913, i32 %5275) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %5922 = getelementptr float, ptr addrspace(1) %5904, i64 %5408, !dbg !294 + %5923 = getelementptr float, ptr addrspace(1) %5904, i64 %5409, !dbg !294 + %5924 = getelementptr float, ptr addrspace(1) %5904, i64 %5302, !dbg !294 + %5925 = getelementptr float, ptr addrspace(1) %5904, i64 %5303, !dbg !294 + %5926 = getelementptr float, ptr addrspace(1) %5904, i64 %5304, !dbg !294 + %5927 = getelementptr float, ptr addrspace(1) %5904, i64 %5305, !dbg !294 + %5928 = getelementptr float, ptr addrspace(1) %5904, i64 %5306, !dbg !294 + %5929 = getelementptr float, ptr addrspace(1) %5904, i64 %5307, !dbg !294 + %5930 = getelementptr float, ptr addrspace(1) %5904, i64 %5308, !dbg !294 + %5931 = getelementptr float, ptr addrspace(1) %5904, i64 %5309, !dbg !294 + %5932 = getelementptr float, ptr addrspace(1) %5904, i64 %5310, !dbg !294 + %5933 = getelementptr float, ptr addrspace(1) %5904, i64 %5311, !dbg !294 + %5934 = getelementptr float, ptr addrspace(1) %5904, i64 %5312, !dbg !294 + %5935 = getelementptr float, ptr addrspace(1) %5904, i64 %5313, !dbg !294 + %5936 = getelementptr float, ptr addrspace(1) %5904, i64 %5314, !dbg !294 + %5937 = getelementptr float, ptr addrspace(1) %5904, i64 %5315, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5333, ptr addrspace(1) %5922, i32 %5412, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5335, ptr addrspace(1) %5923, i32 %5413, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5337, ptr addrspace(1) %5924, i32 %5338, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5340, ptr addrspace(1) %5925, i32 %5341, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5343, ptr addrspace(1) %5926, i32 %5344, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5346, ptr addrspace(1) %5927, i32 %5347, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5349, ptr addrspace(1) %5928, i32 %5350, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5352, ptr addrspace(1) %5929, i32 %5353, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5355, ptr addrspace(1) %5930, i32 %5356, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5358, ptr addrspace(1) %5931, i32 %5359, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5361, ptr addrspace(1) %5932, i32 %5362, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5364, ptr addrspace(1) %5933, i32 %5365, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5367, ptr addrspace(1) %5934, i32 %5368, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5370, ptr addrspace(1) %5935, i32 %5371, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5373, ptr addrspace(1) %5936, i32 %5374, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5376, ptr addrspace(1) %5937, i32 %5377, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5378, ptr addrspace(1) %5918, i32 %5266) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5379, ptr addrspace(1) %5919, i32 %5269) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5380, ptr addrspace(1) %5920, i32 %5272) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5381, ptr addrspace(1) %5921, i32 %5275) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %5938 = getelementptr float, ptr addrspace(1) %5905, i64 %5408, !dbg !297 + %5939 = getelementptr float, ptr addrspace(1) %5905, i64 %5409, !dbg !297 + %5940 = getelementptr float, ptr addrspace(1) %5905, i64 %5302, !dbg !297 + %5941 = getelementptr float, ptr addrspace(1) %5905, i64 %5303, !dbg !297 + %5942 = getelementptr float, ptr addrspace(1) %5905, i64 %5304, !dbg !297 + %5943 = getelementptr float, ptr addrspace(1) %5905, i64 %5305, !dbg !297 + %5944 = getelementptr float, ptr addrspace(1) %5905, i64 %5306, !dbg !297 + %5945 = getelementptr float, ptr addrspace(1) %5905, i64 %5307, !dbg !297 + %5946 = getelementptr float, ptr addrspace(1) %5905, i64 %5308, !dbg !297 + %5947 = getelementptr float, ptr addrspace(1) %5905, i64 %5309, !dbg !297 + %5948 = getelementptr float, ptr addrspace(1) %5905, i64 %5310, !dbg !297 + %5949 = getelementptr float, ptr addrspace(1) %5905, i64 %5311, !dbg !297 + %5950 = getelementptr float, ptr addrspace(1) %5905, i64 %5312, !dbg !297 + %5951 = getelementptr float, ptr addrspace(1) %5905, i64 %5313, !dbg !297 + %5952 = getelementptr float, ptr addrspace(1) %5905, i64 %5314, !dbg !297 + %5953 = getelementptr float, ptr addrspace(1) %5905, i64 %5315, !dbg !297 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5382, ptr addrspace(1) %5938, i32 %5412, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5383, ptr addrspace(1) %5939, i32 %5413, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5384, ptr addrspace(1) %5940, i32 %5338, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5385, ptr addrspace(1) %5941, i32 %5341, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5386, ptr addrspace(1) %5942, i32 %5344, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5387, ptr addrspace(1) %5943, i32 %5347, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5388, ptr addrspace(1) %5944, i32 %5350, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5389, ptr addrspace(1) %5945, i32 %5353, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5390, ptr addrspace(1) %5946, i32 %5356, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5391, ptr addrspace(1) %5947, i32 %5359, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5392, ptr addrspace(1) %5948, i32 %5362, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5393, ptr addrspace(1) %5949, i32 %5365, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5394, ptr addrspace(1) %5950, i32 %5368, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5395, ptr addrspace(1) %5951, i32 %5371, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5396, ptr addrspace(1) %5952, i32 %5374, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5397, ptr addrspace(1) %5953, i32 %5377, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + %5954 = getelementptr i8, ptr addrspace(1) %5910, i64 524288, !dbg !299 + %5955 = getelementptr i8, ptr addrspace(1) %5911, i64 524288, !dbg !299 + %5956 = getelementptr i8, ptr addrspace(1) %5912, i64 524288, !dbg !299 + %5957 = getelementptr i8, ptr addrspace(1) %5913, i64 524288, !dbg !299 + %5958 = getelementptr i8, ptr addrspace(1) %5918, i64 16384, !dbg !300 + %5959 = getelementptr i8, ptr addrspace(1) %5919, i64 16384, !dbg !300 + %5960 = getelementptr i8, ptr addrspace(1) %5920, i64 16384, !dbg !300 + %5961 = getelementptr i8, ptr addrspace(1) %5921, i64 16384, !dbg !300 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5441, ptr addrspace(1) %5954, i32 %5442) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5443, ptr addrspace(1) %5955, i32 %5444) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5445, ptr addrspace(1) %5956, i32 %5446) #3, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5447, ptr addrspace(1) %5957, i32 %5448) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %5962 = getelementptr float, ptr addrspace(1) %5904, i64 %5467, !dbg !294 + %5963 = getelementptr float, ptr addrspace(1) %5904, i64 %5468, !dbg !294 + %5964 = getelementptr float, ptr addrspace(1) %5904, i64 %5469, !dbg !294 + %5965 = getelementptr float, ptr addrspace(1) %5904, i64 %5470, !dbg !294 + %5966 = getelementptr float, ptr addrspace(1) %5904, i64 %5471, !dbg !294 + %5967 = getelementptr float, ptr addrspace(1) %5904, i64 %5472, !dbg !294 + %5968 = getelementptr float, ptr addrspace(1) %5904, i64 %5473, !dbg !294 + %5969 = getelementptr float, ptr addrspace(1) %5904, i64 %5474, !dbg !294 + %5970 = getelementptr float, ptr addrspace(1) %5904, i64 %5475, !dbg !294 + %5971 = getelementptr float, ptr addrspace(1) %5904, i64 %5476, !dbg !294 + %5972 = getelementptr float, ptr addrspace(1) %5904, i64 %5477, !dbg !294 + %5973 = getelementptr float, ptr addrspace(1) %5904, i64 %5478, !dbg !294 + %5974 = getelementptr float, ptr addrspace(1) %5904, i64 %5479, !dbg !294 + %5975 = getelementptr float, ptr addrspace(1) %5904, i64 %5480, !dbg !294 + %5976 = getelementptr float, ptr addrspace(1) %5904, i64 %5481, !dbg !294 + %5977 = getelementptr float, ptr addrspace(1) %5904, i64 %5482, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5499, ptr addrspace(1) %5962, i32 %5500, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5501, ptr addrspace(1) %5963, i32 %5502, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5503, ptr addrspace(1) %5964, i32 %5504, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5505, ptr addrspace(1) %5965, i32 %5506, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5507, ptr addrspace(1) %5966, i32 %5508, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5509, ptr addrspace(1) %5967, i32 %5510, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5511, ptr addrspace(1) %5968, i32 %5512, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5513, ptr addrspace(1) %5969, i32 %5514, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5515, ptr addrspace(1) %5970, i32 %5516, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5517, ptr addrspace(1) %5971, i32 %5518, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5519, ptr addrspace(1) %5972, i32 %5520, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5521, ptr addrspace(1) %5973, i32 %5522, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5523, ptr addrspace(1) %5974, i32 %5524, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5525, ptr addrspace(1) %5975, i32 %5526, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5527, ptr addrspace(1) %5976, i32 %5528, i1 %5331) #3, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5529, ptr addrspace(1) %5977, i32 %5530, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5531, ptr addrspace(1) %5958, i32 %5442) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5532, ptr addrspace(1) %5959, i32 %5444) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5533, ptr addrspace(1) %5960, i32 %5446) #3, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5534, ptr addrspace(1) %5961, i32 %5448) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %5978 = getelementptr float, ptr addrspace(1) %5905, i64 %5467, !dbg !297 + %5979 = getelementptr float, ptr addrspace(1) %5905, i64 %5468, !dbg !297 + %5980 = getelementptr float, ptr addrspace(1) %5905, i64 %5469, !dbg !297 + %5981 = getelementptr float, ptr addrspace(1) %5905, i64 %5470, !dbg !297 + %5982 = getelementptr float, ptr addrspace(1) %5905, i64 %5471, !dbg !297 + %5983 = getelementptr float, ptr addrspace(1) %5905, i64 %5472, !dbg !297 + %5984 = getelementptr float, ptr addrspace(1) %5905, i64 %5473, !dbg !297 + %5985 = getelementptr float, ptr addrspace(1) %5905, i64 %5474, !dbg !297 + %5986 = getelementptr float, ptr addrspace(1) %5905, i64 %5475, !dbg !297 + %5987 = getelementptr float, ptr addrspace(1) %5905, i64 %5476, !dbg !297 + %5988 = getelementptr float, ptr addrspace(1) %5905, i64 %5477, !dbg !297 + %5989 = getelementptr float, ptr addrspace(1) %5905, i64 %5478, !dbg !297 + %5990 = getelementptr float, ptr addrspace(1) %5905, i64 %5479, !dbg !297 + %5991 = getelementptr float, ptr addrspace(1) %5905, i64 %5480, !dbg !297 + %5992 = getelementptr float, ptr addrspace(1) %5905, i64 %5481, !dbg !297 + %5993 = getelementptr float, ptr addrspace(1) %5905, i64 %5482, !dbg !297 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5535, ptr addrspace(1) %5978, i32 %5500, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5536, ptr addrspace(1) %5979, i32 %5502, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5537, ptr addrspace(1) %5980, i32 %5504, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5538, ptr addrspace(1) %5981, i32 %5506, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5539, ptr addrspace(1) %5982, i32 %5508, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5540, ptr addrspace(1) %5983, i32 %5510, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5541, ptr addrspace(1) %5984, i32 %5512, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5542, ptr addrspace(1) %5985, i32 %5514, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5543, ptr addrspace(1) %5986, i32 %5516, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5544, ptr addrspace(1) %5987, i32 %5518, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5545, ptr addrspace(1) %5988, i32 %5520, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5546, ptr addrspace(1) %5989, i32 %5522, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5547, ptr addrspace(1) %5990, i32 %5524, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5548, ptr addrspace(1) %5991, i32 %5526, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5549, ptr addrspace(1) %5992, i32 %5528, i1 %5331) #3, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5550, ptr addrspace(1) %5993, i32 %5530, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + br i1 %5181, label %.lr.ph1620, label %._crit_edge1621, !dbg !255 + +.lr.ph1620: ; preds = %5761, %__nv_exp2f.exit1335 + %5994 = phi i32 [ %7716, %__nv_exp2f.exit1335 ], [ %5139, %5761 ] + %5995 = phi i32 [ %7717, %__nv_exp2f.exit1335 ], [ %5140, %5761 ] + %5996 = phi i32 [ %8469, %__nv_exp2f.exit1335 ], [ 64, %5761 ] + %5997 = phi i32 [ %.pn2151605, %__nv_exp2f.exit1335 ], [ %5139, %5761 ] + %5998 = phi i32 [ %.pn2131606, %__nv_exp2f.exit1335 ], [ %5140, %5761 ] + %5999 = phi i32 [ %.pn2111607, %__nv_exp2f.exit1335 ], [ %5278, %5761 ] + %6000 = phi i32 [ %.pn2091608, %__nv_exp2f.exit1335 ], [ %5280, %5761 ] + %6001 = phi i32 [ %.pn2071609, %__nv_exp2f.exit1335 ], [ %5282, %5761 ] + %6002 = phi i32 [ %.pn2051610, %__nv_exp2f.exit1335 ], [ %5284, %5761 ] + %6003 = phi i32 [ %.pn2031611, %__nv_exp2f.exit1335 ], [ %5286, %5761 ] + %6004 = phi i32 [ %.pn2011612, %__nv_exp2f.exit1335 ], [ %5288, %5761 ] + %6005 = phi i32 [ %.pn1991613, %__nv_exp2f.exit1335 ], [ %5290, %5761 ] + %6006 = phi i32 [ %.pn1971614, %__nv_exp2f.exit1335 ], [ %5292, %5761 ] + %6007 = phi i32 [ %.pn1951615, %__nv_exp2f.exit1335 ], [ %5294, %5761 ] + %6008 = phi i32 [ %.pn1931616, %__nv_exp2f.exit1335 ], [ %5296, %5761 ] + %6009 = phi i32 [ %.pn1911617, %__nv_exp2f.exit1335 ], [ %5298, %5761 ] + %6010 = phi i32 [ %.pn1891618, %__nv_exp2f.exit1335 ], [ %5300, %5761 ] + %6011 = phi i32 [ %6161, %__nv_exp2f.exit1335 ], [ -1, %5761 ] + %6012 = phi i32 [ %8509, %__nv_exp2f.exit1335 ], [ 1, %5761 ] + %6013 = phi i32 [ %6164, %__nv_exp2f.exit1335 ], [ -1, %5761 ] + %6014 = phi i32 [ %8512, %__nv_exp2f.exit1335 ], [ 1, %5761 ] + %.pn1891618 = phi i32 [ %8498, %__nv_exp2f.exit1335 ], [ %5428, %5761 ] + %.pn1911617 = phi i32 [ %8497, %__nv_exp2f.exit1335 ], [ %5427, %5761 ] + %.pn1931616 = phi i32 [ %8496, %__nv_exp2f.exit1335 ], [ %5426, %5761 ] + %.pn1951615 = phi i32 [ %8495, %__nv_exp2f.exit1335 ], [ %5425, %5761 ] + %.pn1971614 = phi i32 [ %8494, %__nv_exp2f.exit1335 ], [ %5424, %5761 ] + %.pn1991613 = phi i32 [ %8493, %__nv_exp2f.exit1335 ], [ %5423, %5761 ] + %.pn2011612 = phi i32 [ %8492, %__nv_exp2f.exit1335 ], [ %5422, %5761 ] + %.pn2031611 = phi i32 [ %8491, %__nv_exp2f.exit1335 ], [ %5421, %5761 ] + %.pn2051610 = phi i32 [ %8490, %__nv_exp2f.exit1335 ], [ %5420, %5761 ] + %.pn2071609 = phi i32 [ %8489, %__nv_exp2f.exit1335 ], [ %5419, %5761 ] + %.pn2091608 = phi i32 [ %8488, %__nv_exp2f.exit1335 ], [ %5418, %5761 ] + %.pn2111607 = phi i32 [ %8487, %__nv_exp2f.exit1335 ], [ %5417, %5761 ] + %.pn2131606 = phi i32 [ %8486, %__nv_exp2f.exit1335 ], [ %5416, %5761 ] + %.pn2151605 = phi i32 [ %8485, %__nv_exp2f.exit1335 ], [ %5415, %5761 ] + %6015 = phi i32 [ %8503, %__nv_exp2f.exit1335 ], [ %5429, %5761 ] + %6016 = phi i32 [ %8504, %__nv_exp2f.exit1335 ], [ %5430, %5761 ] + %6017 = phi i32 [ %8505, %__nv_exp2f.exit1335 ], [ %5431, %5761 ] + %6018 = phi i32 [ %8506, %__nv_exp2f.exit1335 ], [ %5432, %5761 ] + %.pn1391602 = phi ptr addrspace(1) [ %8481, %__nv_exp2f.exit1335 ], [ %5961, %5761 ] + %.pn1551601 = phi ptr addrspace(1) [ %8480, %__nv_exp2f.exit1335 ], [ %5960, %5761 ] + %.pn1711600 = phi ptr addrspace(1) [ %8479, %__nv_exp2f.exit1335 ], [ %5959, %5761 ] + %.pn1871599 = phi ptr addrspace(1) [ %8478, %__nv_exp2f.exit1335 ], [ %5958, %5761 ] + %6019 = phi i32 [ %8499, %__nv_exp2f.exit1335 ], [ %5429, %5761 ] + %6020 = phi i32 [ %8500, %__nv_exp2f.exit1335 ], [ %5430, %5761 ] + %6021 = phi i32 [ %8501, %__nv_exp2f.exit1335 ], [ %5431, %5761 ] + %6022 = phi i32 [ %8502, %__nv_exp2f.exit1335 ], [ %5432, %5761 ] + %.pn751598 = phi ptr addrspace(1) [ %8475, %__nv_exp2f.exit1335 ], [ %5957, %5761 ] + %.pn911597 = phi ptr addrspace(1) [ %8474, %__nv_exp2f.exit1335 ], [ %5956, %5761 ] + %.pn1071596 = phi ptr addrspace(1) [ %8473, %__nv_exp2f.exit1335 ], [ %5955, %5761 ] + %.pn1231595 = phi ptr addrspace(1) [ %8472, %__nv_exp2f.exit1335 ], [ %5954, %5761 ] + %6023 = phi float [ %7529, %__nv_exp2f.exit1335 ], [ %5826, %5761 ] + %6024 = phi float [ %7530, %__nv_exp2f.exit1335 ], [ %5827, %5761 ] + %6025 = phi float [ %7531, %__nv_exp2f.exit1335 ], [ %5828, %5761 ] + %6026 = phi float [ %7532, %__nv_exp2f.exit1335 ], [ %5829, %5761 ] + %6027 = phi float [ %7533, %__nv_exp2f.exit1335 ], [ %5830, %5761 ] + %6028 = phi float [ %7534, %__nv_exp2f.exit1335 ], [ %5831, %5761 ] + %6029 = phi float [ %7535, %__nv_exp2f.exit1335 ], [ %5832, %5761 ] + %6030 = phi float [ %7536, %__nv_exp2f.exit1335 ], [ %5833, %5761 ] + %6031 = phi float [ %7537, %__nv_exp2f.exit1335 ], [ %5834, %5761 ] + %6032 = phi float [ %7538, %__nv_exp2f.exit1335 ], [ %5835, %5761 ] + %6033 = phi float [ %7539, %__nv_exp2f.exit1335 ], [ %5836, %5761 ] + %6034 = phi float [ %7540, %__nv_exp2f.exit1335 ], [ %5837, %5761 ] + %6035 = phi float [ %7541, %__nv_exp2f.exit1335 ], [ %5838, %5761 ] + %6036 = phi float [ %7542, %__nv_exp2f.exit1335 ], [ %5839, %5761 ] + %6037 = phi float [ %7543, %__nv_exp2f.exit1335 ], [ %5840, %5761 ] + %6038 = phi float [ %7544, %__nv_exp2f.exit1335 ], [ %5841, %5761 ] + %6039 = phi float [ %7545, %__nv_exp2f.exit1335 ], [ %5842, %5761 ] + %6040 = phi float [ %7546, %__nv_exp2f.exit1335 ], [ %5843, %5761 ] + %6041 = phi float [ %7547, %__nv_exp2f.exit1335 ], [ %5844, %5761 ] + %6042 = phi float [ %7548, %__nv_exp2f.exit1335 ], [ %5845, %5761 ] + %6043 = phi float [ %7549, %__nv_exp2f.exit1335 ], [ %5846, %5761 ] + %6044 = phi float [ %7550, %__nv_exp2f.exit1335 ], [ %5847, %5761 ] + %6045 = phi float [ %7551, %__nv_exp2f.exit1335 ], [ %5848, %5761 ] + %6046 = phi float [ %7552, %__nv_exp2f.exit1335 ], [ %5849, %5761 ] + %6047 = phi float [ %7553, %__nv_exp2f.exit1335 ], [ %5850, %5761 ] + %6048 = phi float [ %7554, %__nv_exp2f.exit1335 ], [ %5851, %5761 ] + %6049 = phi float [ %7555, %__nv_exp2f.exit1335 ], [ %5852, %5761 ] + %6050 = phi float [ %7556, %__nv_exp2f.exit1335 ], [ %5853, %5761 ] + %6051 = phi float [ %7557, %__nv_exp2f.exit1335 ], [ %5854, %5761 ] + %6052 = phi float [ %7558, %__nv_exp2f.exit1335 ], [ %5855, %5761 ] + %6053 = phi float [ %7559, %__nv_exp2f.exit1335 ], [ %5856, %5761 ] + %6054 = phi float [ %7560, %__nv_exp2f.exit1335 ], [ %5857, %5761 ] + %6055 = phi float [ %7561, %__nv_exp2f.exit1335 ], [ %5858, %5761 ] + %6056 = phi float [ %7562, %__nv_exp2f.exit1335 ], [ %5859, %5761 ] + %6057 = phi float [ %7563, %__nv_exp2f.exit1335 ], [ %5860, %5761 ] + %6058 = phi float [ %7564, %__nv_exp2f.exit1335 ], [ %5861, %5761 ] + %6059 = phi float [ %7565, %__nv_exp2f.exit1335 ], [ %5862, %5761 ] + %6060 = phi float [ %7566, %__nv_exp2f.exit1335 ], [ %5863, %5761 ] + %6061 = phi float [ %7567, %__nv_exp2f.exit1335 ], [ %5864, %5761 ] + %6062 = phi float [ %7568, %__nv_exp2f.exit1335 ], [ %5865, %5761 ] + %6063 = phi float [ %7569, %__nv_exp2f.exit1335 ], [ %5866, %5761 ] + %6064 = phi float [ %7570, %__nv_exp2f.exit1335 ], [ %5867, %5761 ] + %6065 = phi float [ %7571, %__nv_exp2f.exit1335 ], [ %5868, %5761 ] + %6066 = phi float [ %7572, %__nv_exp2f.exit1335 ], [ %5869, %5761 ] + %6067 = phi float [ %7573, %__nv_exp2f.exit1335 ], [ %5870, %5761 ] + %6068 = phi float [ %7574, %__nv_exp2f.exit1335 ], [ %5871, %5761 ] + %6069 = phi float [ %7575, %__nv_exp2f.exit1335 ], [ %5872, %5761 ] + %6070 = phi float [ %7576, %__nv_exp2f.exit1335 ], [ %5873, %5761 ] + %6071 = phi float [ %7577, %__nv_exp2f.exit1335 ], [ %5874, %5761 ] + %6072 = phi float [ %7578, %__nv_exp2f.exit1335 ], [ %5875, %5761 ] + %6073 = phi float [ %7579, %__nv_exp2f.exit1335 ], [ %5876, %5761 ] + %6074 = phi float [ %7580, %__nv_exp2f.exit1335 ], [ %5877, %5761 ] + %6075 = phi float [ %7581, %__nv_exp2f.exit1335 ], [ %5878, %5761 ] + %6076 = phi float [ %7582, %__nv_exp2f.exit1335 ], [ %5879, %5761 ] + %6077 = phi float [ %7583, %__nv_exp2f.exit1335 ], [ %5880, %5761 ] + %6078 = phi float [ %7584, %__nv_exp2f.exit1335 ], [ %5881, %5761 ] + %6079 = phi float [ %7585, %__nv_exp2f.exit1335 ], [ %5882, %5761 ] + %6080 = phi float [ %7586, %__nv_exp2f.exit1335 ], [ %5883, %5761 ] + %6081 = phi float [ %7587, %__nv_exp2f.exit1335 ], [ %5884, %5761 ] + %6082 = phi float [ %7588, %__nv_exp2f.exit1335 ], [ %5885, %5761 ] + %6083 = phi float [ %7589, %__nv_exp2f.exit1335 ], [ %5886, %5761 ] + %6084 = phi float [ %7590, %__nv_exp2f.exit1335 ], [ %5887, %5761 ] + %6085 = phi float [ %7591, %__nv_exp2f.exit1335 ], [ %5888, %5761 ] + %6086 = phi float [ %7592, %__nv_exp2f.exit1335 ], [ %5889, %5761 ] + %6087 = phi float [ %8397, %__nv_exp2f.exit1335 ], [ %5762, %5761 ] + %6088 = phi float [ %8398, %__nv_exp2f.exit1335 ], [ %5763, %5761 ] + %6089 = phi float [ %8399, %__nv_exp2f.exit1335 ], [ %5764, %5761 ] + %6090 = phi float [ %8400, %__nv_exp2f.exit1335 ], [ %5765, %5761 ] + %6091 = phi float [ %8401, %__nv_exp2f.exit1335 ], [ %5766, %5761 ] + %6092 = phi float [ %8402, %__nv_exp2f.exit1335 ], [ %5767, %5761 ] + %6093 = phi float [ %8403, %__nv_exp2f.exit1335 ], [ %5768, %5761 ] + %6094 = phi float [ %8404, %__nv_exp2f.exit1335 ], [ %5769, %5761 ] + %6095 = phi float [ %8405, %__nv_exp2f.exit1335 ], [ %5770, %5761 ] + %6096 = phi float [ %8406, %__nv_exp2f.exit1335 ], [ %5771, %5761 ] + %6097 = phi float [ %8407, %__nv_exp2f.exit1335 ], [ %5772, %5761 ] + %6098 = phi float [ %8408, %__nv_exp2f.exit1335 ], [ %5773, %5761 ] + %6099 = phi float [ %8409, %__nv_exp2f.exit1335 ], [ %5774, %5761 ] + %6100 = phi float [ %8410, %__nv_exp2f.exit1335 ], [ %5775, %5761 ] + %6101 = phi float [ %8411, %__nv_exp2f.exit1335 ], [ %5776, %5761 ] + %6102 = phi float [ %8412, %__nv_exp2f.exit1335 ], [ %5777, %5761 ] + %6103 = phi float [ %8413, %__nv_exp2f.exit1335 ], [ %5778, %5761 ] + %6104 = phi float [ %8414, %__nv_exp2f.exit1335 ], [ %5779, %5761 ] + %6105 = phi float [ %8415, %__nv_exp2f.exit1335 ], [ %5780, %5761 ] + %6106 = phi float [ %8416, %__nv_exp2f.exit1335 ], [ %5781, %5761 ] + %6107 = phi float [ %8417, %__nv_exp2f.exit1335 ], [ %5782, %5761 ] + %6108 = phi float [ %8418, %__nv_exp2f.exit1335 ], [ %5783, %5761 ] + %6109 = phi float [ %8419, %__nv_exp2f.exit1335 ], [ %5784, %5761 ] + %6110 = phi float [ %8420, %__nv_exp2f.exit1335 ], [ %5785, %5761 ] + %6111 = phi float [ %8421, %__nv_exp2f.exit1335 ], [ %5786, %5761 ] + %6112 = phi float [ %8422, %__nv_exp2f.exit1335 ], [ %5787, %5761 ] + %6113 = phi float [ %8423, %__nv_exp2f.exit1335 ], [ %5788, %5761 ] + %6114 = phi float [ %8424, %__nv_exp2f.exit1335 ], [ %5789, %5761 ] + %6115 = phi float [ %8425, %__nv_exp2f.exit1335 ], [ %5790, %5761 ] + %6116 = phi float [ %8426, %__nv_exp2f.exit1335 ], [ %5791, %5761 ] + %6117 = phi float [ %8427, %__nv_exp2f.exit1335 ], [ %5792, %5761 ] + %6118 = phi float [ %8428, %__nv_exp2f.exit1335 ], [ %5793, %5761 ] + %6119 = phi float [ %8429, %__nv_exp2f.exit1335 ], [ %5794, %5761 ] + %6120 = phi float [ %8430, %__nv_exp2f.exit1335 ], [ %5795, %5761 ] + %6121 = phi float [ %8431, %__nv_exp2f.exit1335 ], [ %5796, %5761 ] + %6122 = phi float [ %8432, %__nv_exp2f.exit1335 ], [ %5797, %5761 ] + %6123 = phi float [ %8433, %__nv_exp2f.exit1335 ], [ %5798, %5761 ] + %6124 = phi float [ %8434, %__nv_exp2f.exit1335 ], [ %5799, %5761 ] + %6125 = phi float [ %8435, %__nv_exp2f.exit1335 ], [ %5800, %5761 ] + %6126 = phi float [ %8436, %__nv_exp2f.exit1335 ], [ %5801, %5761 ] + %6127 = phi float [ %8437, %__nv_exp2f.exit1335 ], [ %5802, %5761 ] + %6128 = phi float [ %8438, %__nv_exp2f.exit1335 ], [ %5803, %5761 ] + %6129 = phi float [ %8439, %__nv_exp2f.exit1335 ], [ %5804, %5761 ] + %6130 = phi float [ %8440, %__nv_exp2f.exit1335 ], [ %5805, %5761 ] + %6131 = phi float [ %8441, %__nv_exp2f.exit1335 ], [ %5806, %5761 ] + %6132 = phi float [ %8442, %__nv_exp2f.exit1335 ], [ %5807, %5761 ] + %6133 = phi float [ %8443, %__nv_exp2f.exit1335 ], [ %5808, %5761 ] + %6134 = phi float [ %8444, %__nv_exp2f.exit1335 ], [ %5809, %5761 ] + %6135 = phi float [ %8445, %__nv_exp2f.exit1335 ], [ %5810, %5761 ] + %6136 = phi float [ %8446, %__nv_exp2f.exit1335 ], [ %5811, %5761 ] + %6137 = phi float [ %8447, %__nv_exp2f.exit1335 ], [ %5812, %5761 ] + %6138 = phi float [ %8448, %__nv_exp2f.exit1335 ], [ %5813, %5761 ] + %6139 = phi float [ %8449, %__nv_exp2f.exit1335 ], [ %5814, %5761 ] + %6140 = phi float [ %8450, %__nv_exp2f.exit1335 ], [ %5815, %5761 ] + %6141 = phi float [ %8451, %__nv_exp2f.exit1335 ], [ %5816, %5761 ] + %6142 = phi float [ %8452, %__nv_exp2f.exit1335 ], [ %5817, %5761 ] + %6143 = phi float [ %8453, %__nv_exp2f.exit1335 ], [ %5818, %5761 ] + %6144 = phi float [ %8454, %__nv_exp2f.exit1335 ], [ %5819, %5761 ] + %6145 = phi float [ %8455, %__nv_exp2f.exit1335 ], [ %5820, %5761 ] + %6146 = phi float [ %8456, %__nv_exp2f.exit1335 ], [ %5821, %5761 ] + %6147 = phi float [ %8457, %__nv_exp2f.exit1335 ], [ %5822, %5761 ] + %6148 = phi float [ %8458, %__nv_exp2f.exit1335 ], [ %5823, %5761 ] + %6149 = phi float [ %8459, %__nv_exp2f.exit1335 ], [ %5824, %5761 ] + %6150 = phi float [ %8460, %__nv_exp2f.exit1335 ], [ %5825, %5761 ] + %6151 = phi i32 [ %7724, %__nv_exp2f.exit1335 ], [ 0, %5761 ] + %6152 = phi <2 x i32> [ %7715, %__nv_exp2f.exit1335 ], [ %5403, %5761 ] + %6153 = phi <2 x i32> [ %6154, %__nv_exp2f.exit1335 ], [ %5403, %5761 ] + %6154 = phi <2 x i32> [ %8484, %__nv_exp2f.exit1335 ], [ %5414, %5761 ] + %6155 = phi <8 x i32> [ %7723, %__nv_exp2f.exit1335 ], [ %5174, %5761 ] + %6156 = phi <4 x i32> [ %7720, %__nv_exp2f.exit1335 ], [ %5146, %5761 ] + %6157 = icmp slt i32 %6151, %5551, !dbg !255 + %6158 = icmp slt i32 %6151, %5552, !dbg !255 + %6159 = add i32 %6011, 1, !dbg !255 + %6160 = icmp sgt i32 %6159, 1, !dbg !255 + %6161 = select i1 %6160, i32 0, i32 %6159, !dbg !255 + %6162 = add i32 %6013, 1, !dbg !255 + %6163 = icmp sgt i32 %6162, 2, !dbg !255 + %6164 = select i1 %6163, i32 0, i32 %6162, !dbg !255 + %6165 = icmp slt <2 x i32> %6153, %5739, !dbg !301 + %6166 = icmp slt i32 %5997, %18, !dbg !301 + %6167 = icmp slt i32 %5998, %18, !dbg !301 + %6168 = icmp slt i32 %5999, %18, !dbg !301 + %6169 = icmp slt i32 %6000, %18, !dbg !301 + %6170 = icmp slt i32 %6001, %18, !dbg !301 + %6171 = icmp slt i32 %6002, %18, !dbg !301 + %6172 = icmp slt i32 %6003, %18, !dbg !301 + %6173 = icmp slt i32 %6004, %18, !dbg !301 + %6174 = icmp slt i32 %6005, %18, !dbg !301 + %6175 = icmp slt i32 %6006, %18, !dbg !301 + %6176 = icmp slt i32 %6007, %18, !dbg !301 + %6177 = icmp slt i32 %6008, %18, !dbg !301 + %6178 = icmp slt i32 %6009, %18, !dbg !301 + %6179 = icmp slt i32 %6010, %18, !dbg !301 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !293 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + %6180 = shl i32 %6164, 13, !dbg !293 + %6181 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %6180, !dbg !293 + %6182 = shl i32 %6161, 6, !dbg !295 + %6183 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %6182, !dbg !295 + %6184 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5332, !dbg !295 + %6185 = load float, ptr addrspace(3) %6184, align 8, !dbg !295 + %6186 = getelementptr inbounds nuw i8, ptr addrspace(3) %6184, i32 4, !dbg !295 + %6187 = load float, ptr addrspace(3) %6186, align 4, !dbg !295 + %6188 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5336, !dbg !295 + %6189 = load float, ptr addrspace(3) %6188, align 8, !dbg !295 + %6190 = getelementptr inbounds nuw i8, ptr addrspace(3) %6188, i32 4, !dbg !295 + %6191 = load float, ptr addrspace(3) %6190, align 4, !dbg !295 + %6192 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5342, !dbg !295 + %6193 = load float, ptr addrspace(3) %6192, align 8, !dbg !295 + %6194 = getelementptr inbounds nuw i8, ptr addrspace(3) %6192, i32 4, !dbg !295 + %6195 = load float, ptr addrspace(3) %6194, align 4, !dbg !295 + %6196 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5348, !dbg !295 + %6197 = load float, ptr addrspace(3) %6196, align 8, !dbg !295 + %6198 = getelementptr inbounds nuw i8, ptr addrspace(3) %6196, i32 4, !dbg !295 + %6199 = load float, ptr addrspace(3) %6198, align 4, !dbg !295 + %6200 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5354, !dbg !295 + %6201 = load float, ptr addrspace(3) %6200, align 8, !dbg !295 + %6202 = getelementptr inbounds nuw i8, ptr addrspace(3) %6200, i32 4, !dbg !295 + %6203 = load float, ptr addrspace(3) %6202, align 4, !dbg !295 + %6204 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5360, !dbg !295 + %6205 = load float, ptr addrspace(3) %6204, align 8, !dbg !295 + %6206 = getelementptr inbounds nuw i8, ptr addrspace(3) %6204, i32 4, !dbg !295 + %6207 = load float, ptr addrspace(3) %6206, align 4, !dbg !295 + %6208 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5366, !dbg !295 + %6209 = load float, ptr addrspace(3) %6208, align 8, !dbg !295 + %6210 = getelementptr inbounds nuw i8, ptr addrspace(3) %6208, i32 4, !dbg !295 + %6211 = load float, ptr addrspace(3) %6210, align 4, !dbg !295 + %6212 = getelementptr inbounds nuw i8, ptr addrspace(3) %6183, i32 %5372, !dbg !295 + %6213 = load float, ptr addrspace(3) %6212, align 8, !dbg !295 + %6214 = getelementptr inbounds nuw i8, ptr addrspace(3) %6212, i32 4, !dbg !295 + %6215 = load float, ptr addrspace(3) %6214, align 4, !dbg !295 + %6216 = fcmp oeq float %6185, 0xFFF0000000000000, !dbg !302 + %6217 = fcmp oeq float %6187, 0xFFF0000000000000, !dbg !302 + %6218 = fcmp oeq float %6189, 0xFFF0000000000000, !dbg !302 + %6219 = fcmp oeq float %6191, 0xFFF0000000000000, !dbg !302 + %6220 = fcmp oeq float %6193, 0xFFF0000000000000, !dbg !302 + %6221 = fcmp oeq float %6195, 0xFFF0000000000000, !dbg !302 + %6222 = fcmp oeq float %6197, 0xFFF0000000000000, !dbg !302 + %6223 = fcmp oeq float %6199, 0xFFF0000000000000, !dbg !302 + %6224 = fcmp oeq float %6201, 0xFFF0000000000000, !dbg !302 + %6225 = fcmp oeq float %6203, 0xFFF0000000000000, !dbg !302 + %6226 = fcmp oeq float %6205, 0xFFF0000000000000, !dbg !302 + %6227 = fcmp oeq float %6207, 0xFFF0000000000000, !dbg !302 + %6228 = fcmp oeq float %6209, 0xFFF0000000000000, !dbg !302 + %6229 = fcmp oeq float %6211, 0xFFF0000000000000, !dbg !302 + %6230 = fcmp oeq float %6213, 0xFFF0000000000000, !dbg !302 + %6231 = fcmp oeq float %6215, 0xFFF0000000000000, !dbg !302 + %6232 = select i1 %6216, float 0.000000e+00, float %6185, !dbg !303 + %6233 = select i1 %6217, float 0.000000e+00, float %6187, !dbg !303 + %6234 = select i1 %6218, float 0.000000e+00, float %6189, !dbg !303 + %6235 = select i1 %6219, float 0.000000e+00, float %6191, !dbg !303 + %6236 = select i1 %6220, float 0.000000e+00, float %6193, !dbg !303 + %6237 = select i1 %6221, float 0.000000e+00, float %6195, !dbg !303 + %6238 = select i1 %6222, float 0.000000e+00, float %6197, !dbg !303 + %6239 = select i1 %6223, float 0.000000e+00, float %6199, !dbg !303 + %6240 = select i1 %6224, float 0.000000e+00, float %6201, !dbg !303 + %6241 = select i1 %6225, float 0.000000e+00, float %6203, !dbg !303 + %6242 = select i1 %6226, float 0.000000e+00, float %6205, !dbg !303 + %6243 = select i1 %6227, float 0.000000e+00, float %6207, !dbg !303 + %6244 = select i1 %6228, float 0.000000e+00, float %6209, !dbg !303 + %6245 = select i1 %6229, float 0.000000e+00, float %6211, !dbg !303 + %6246 = select i1 %6230, float 0.000000e+00, float %6213, !dbg !303 + %6247 = select i1 %6231, float 0.000000e+00, float %6215, !dbg !303 + %6248 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !275 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !275 + %6249 = shl i32 %6248, 11, !dbg !275 + %6250 = and i32 %6249, 8192, !dbg !275 + %6251 = add i32 %6250, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6252 = lshr exact i32 %6251, 4, !dbg !275 + %6253 = and i32 %6252, 16383, !dbg !275 + %6254 = zext nneg i32 %6253 to i64, !dbg !275 + %6255 = or disjoint i64 %6254, 4611686293372403712, !dbg !275 + %6256 = ptrtoint ptr addrspace(3) %6181 to i32, !dbg !275 + %6257 = lshr exact i32 %6256, 4, !dbg !275 + %6258 = and i32 %6257, 16383, !dbg !275 + %6259 = zext nneg i32 %6258 to i64, !dbg !275 + %6260 = or disjoint i64 %6259, 4611686293338849280, !dbg !275 + %6261 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %6255, i64 %6260) #3, !dbg !275 + %6262 = or disjoint i32 %6250, 32, !dbg !275 + %6263 = add i32 %6262, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6264 = lshr exact i32 %6263, 4, !dbg !275 + %6265 = and i32 %6264, 16383, !dbg !275 + %6266 = zext nneg i32 %6265 to i64, !dbg !275 + %6267 = or disjoint i64 %6266, 4611686293372403712, !dbg !275 + %6268 = add i32 %6256, 32, !dbg !275 + %6269 = lshr exact i32 %6268, 4, !dbg !275 + %6270 = and i32 %6269, 16383, !dbg !275 + %6271 = zext nneg i32 %6270 to i64, !dbg !275 + %6272 = or disjoint i64 %6271, 4611686293338849280, !dbg !275 + %6273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 0, !dbg !275 + %6274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 1, !dbg !275 + %6275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 2, !dbg !275 + %6276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 3, !dbg !275 + %6277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 4, !dbg !275 + %6278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 5, !dbg !275 + %6279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 6, !dbg !275 + %6280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 7, !dbg !275 + %6281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 8, !dbg !275 + %6282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 9, !dbg !275 + %6283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 10, !dbg !275 + %6284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 11, !dbg !275 + %6285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 12, !dbg !275 + %6286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 13, !dbg !275 + %6287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 14, !dbg !275 + %6288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 15, !dbg !275 + %6289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 16, !dbg !275 + %6290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 17, !dbg !275 + %6291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 18, !dbg !275 + %6292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 19, !dbg !275 + %6293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 20, !dbg !275 + %6294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 21, !dbg !275 + %6295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 22, !dbg !275 + %6296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 23, !dbg !275 + %6297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 24, !dbg !275 + %6298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 25, !dbg !275 + %6299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 26, !dbg !275 + %6300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 27, !dbg !275 + %6301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 28, !dbg !275 + %6302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 29, !dbg !275 + %6303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 30, !dbg !275 + %6304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6261, 31, !dbg !275 + %6305 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6273, float %6274, float %6275, float %6276, float %6277, float %6278, float %6279, float %6280, float %6281, float %6282, float %6283, float %6284, float %6285, float %6286, float %6287, float %6288, float %6289, float %6290, float %6291, float %6292, float %6293, float %6294, float %6295, float %6296, float %6297, float %6298, float %6299, float %6300, float %6301, float %6302, float %6303, float %6304, i64 %6267, i64 %6272, i1 true) #3, !dbg !275 + %6306 = or disjoint i32 %6250, 64, !dbg !275 + %6307 = add i32 %6306, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6308 = lshr exact i32 %6307, 4, !dbg !275 + %6309 = and i32 %6308, 16383, !dbg !275 + %6310 = zext nneg i32 %6309 to i64, !dbg !275 + %6311 = or disjoint i64 %6310, 4611686293372403712, !dbg !275 + %6312 = add i32 %6256, 64, !dbg !275 + %6313 = lshr exact i32 %6312, 4, !dbg !275 + %6314 = and i32 %6313, 16383, !dbg !275 + %6315 = zext nneg i32 %6314 to i64, !dbg !275 + %6316 = or disjoint i64 %6315, 4611686293338849280, !dbg !275 + %6317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 0, !dbg !275 + %6318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 1, !dbg !275 + %6319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 2, !dbg !275 + %6320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 3, !dbg !275 + %6321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 4, !dbg !275 + %6322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 5, !dbg !275 + %6323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 6, !dbg !275 + %6324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 7, !dbg !275 + %6325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 8, !dbg !275 + %6326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 9, !dbg !275 + %6327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 10, !dbg !275 + %6328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 11, !dbg !275 + %6329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 12, !dbg !275 + %6330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 13, !dbg !275 + %6331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 14, !dbg !275 + %6332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 15, !dbg !275 + %6333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 16, !dbg !275 + %6334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 17, !dbg !275 + %6335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 18, !dbg !275 + %6336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 19, !dbg !275 + %6337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 20, !dbg !275 + %6338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 21, !dbg !275 + %6339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 22, !dbg !275 + %6340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 23, !dbg !275 + %6341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 24, !dbg !275 + %6342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 25, !dbg !275 + %6343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 26, !dbg !275 + %6344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 27, !dbg !275 + %6345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 28, !dbg !275 + %6346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 29, !dbg !275 + %6347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 30, !dbg !275 + %6348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6305, 31, !dbg !275 + %6349 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6317, float %6318, float %6319, float %6320, float %6321, float %6322, float %6323, float %6324, float %6325, float %6326, float %6327, float %6328, float %6329, float %6330, float %6331, float %6332, float %6333, float %6334, float %6335, float %6336, float %6337, float %6338, float %6339, float %6340, float %6341, float %6342, float %6343, float %6344, float %6345, float %6346, float %6347, float %6348, i64 %6311, i64 %6316, i1 true) #3, !dbg !275 + %6350 = or disjoint i32 %6250, 96, !dbg !275 + %6351 = add i32 %6350, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6352 = lshr exact i32 %6351, 4, !dbg !275 + %6353 = and i32 %6352, 16383, !dbg !275 + %6354 = zext nneg i32 %6353 to i64, !dbg !275 + %6355 = or disjoint i64 %6354, 4611686293372403712, !dbg !275 + %6356 = add i32 %6256, 96, !dbg !275 + %6357 = lshr exact i32 %6356, 4, !dbg !275 + %6358 = and i32 %6357, 16383, !dbg !275 + %6359 = zext nneg i32 %6358 to i64, !dbg !275 + %6360 = or disjoint i64 %6359, 4611686293338849280, !dbg !275 + %6361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 0, !dbg !275 + %6362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 1, !dbg !275 + %6363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 2, !dbg !275 + %6364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 3, !dbg !275 + %6365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 4, !dbg !275 + %6366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 5, !dbg !275 + %6367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 6, !dbg !275 + %6368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 7, !dbg !275 + %6369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 8, !dbg !275 + %6370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 9, !dbg !275 + %6371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 10, !dbg !275 + %6372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 11, !dbg !275 + %6373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 12, !dbg !275 + %6374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 13, !dbg !275 + %6375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 14, !dbg !275 + %6376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 15, !dbg !275 + %6377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 16, !dbg !275 + %6378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 17, !dbg !275 + %6379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 18, !dbg !275 + %6380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 19, !dbg !275 + %6381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 20, !dbg !275 + %6382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 21, !dbg !275 + %6383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 22, !dbg !275 + %6384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 23, !dbg !275 + %6385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 24, !dbg !275 + %6386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 25, !dbg !275 + %6387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 26, !dbg !275 + %6388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 27, !dbg !275 + %6389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 28, !dbg !275 + %6390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 29, !dbg !275 + %6391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 30, !dbg !275 + %6392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6349, 31, !dbg !275 + %6393 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6361, float %6362, float %6363, float %6364, float %6365, float %6366, float %6367, float %6368, float %6369, float %6370, float %6371, float %6372, float %6373, float %6374, float %6375, float %6376, float %6377, float %6378, float %6379, float %6380, float %6381, float %6382, float %6383, float %6384, float %6385, float %6386, float %6387, float %6388, float %6389, float %6390, float %6391, float %6392, i64 %6355, i64 %6360, i1 true) #3, !dbg !275 + %6394 = or disjoint i32 %6250, 16384, !dbg !275 + %6395 = add i32 %6394, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6396 = lshr exact i32 %6395, 4, !dbg !275 + %6397 = and i32 %6396, 16383, !dbg !275 + %6398 = zext nneg i32 %6397 to i64, !dbg !275 + %6399 = or disjoint i64 %6398, 4611686293372403712, !dbg !275 + %6400 = add i32 %6256, 8192, !dbg !275 + %6401 = lshr exact i32 %6400, 4, !dbg !275 + %6402 = and i32 %6401, 16383, !dbg !275 + %6403 = zext nneg i32 %6402 to i64, !dbg !275 + %6404 = or disjoint i64 %6403, 4611686293338849280, !dbg !275 + %6405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 0, !dbg !275 + %6406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 1, !dbg !275 + %6407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 2, !dbg !275 + %6408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 3, !dbg !275 + %6409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 4, !dbg !275 + %6410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 5, !dbg !275 + %6411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 6, !dbg !275 + %6412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 7, !dbg !275 + %6413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 8, !dbg !275 + %6414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 9, !dbg !275 + %6415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 10, !dbg !275 + %6416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 11, !dbg !275 + %6417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 12, !dbg !275 + %6418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 13, !dbg !275 + %6419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 14, !dbg !275 + %6420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 15, !dbg !275 + %6421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 16, !dbg !275 + %6422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 17, !dbg !275 + %6423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 18, !dbg !275 + %6424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 19, !dbg !275 + %6425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 20, !dbg !275 + %6426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 21, !dbg !275 + %6427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 22, !dbg !275 + %6428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 23, !dbg !275 + %6429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 24, !dbg !275 + %6430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 25, !dbg !275 + %6431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 26, !dbg !275 + %6432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 27, !dbg !275 + %6433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 28, !dbg !275 + %6434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 29, !dbg !275 + %6435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 30, !dbg !275 + %6436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6393, 31, !dbg !275 + %6437 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6405, float %6406, float %6407, float %6408, float %6409, float %6410, float %6411, float %6412, float %6413, float %6414, float %6415, float %6416, float %6417, float %6418, float %6419, float %6420, float %6421, float %6422, float %6423, float %6424, float %6425, float %6426, float %6427, float %6428, float %6429, float %6430, float %6431, float %6432, float %6433, float %6434, float %6435, float %6436, i64 %6399, i64 %6404, i1 true) #3, !dbg !275 + %6438 = or disjoint i32 %6250, 16416, !dbg !275 + %6439 = add i32 %6438, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6440 = lshr exact i32 %6439, 4, !dbg !275 + %6441 = and i32 %6440, 16383, !dbg !275 + %6442 = zext nneg i32 %6441 to i64, !dbg !275 + %6443 = or disjoint i64 %6442, 4611686293372403712, !dbg !275 + %6444 = add i32 %6256, 8224, !dbg !275 + %6445 = lshr exact i32 %6444, 4, !dbg !275 + %6446 = and i32 %6445, 16383, !dbg !275 + %6447 = zext nneg i32 %6446 to i64, !dbg !275 + %6448 = or disjoint i64 %6447, 4611686293338849280, !dbg !275 + %6449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 0, !dbg !275 + %6450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 1, !dbg !275 + %6451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 2, !dbg !275 + %6452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 3, !dbg !275 + %6453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 4, !dbg !275 + %6454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 5, !dbg !275 + %6455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 6, !dbg !275 + %6456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 7, !dbg !275 + %6457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 8, !dbg !275 + %6458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 9, !dbg !275 + %6459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 10, !dbg !275 + %6460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 11, !dbg !275 + %6461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 12, !dbg !275 + %6462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 13, !dbg !275 + %6463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 14, !dbg !275 + %6464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 15, !dbg !275 + %6465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 16, !dbg !275 + %6466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 17, !dbg !275 + %6467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 18, !dbg !275 + %6468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 19, !dbg !275 + %6469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 20, !dbg !275 + %6470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 21, !dbg !275 + %6471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 22, !dbg !275 + %6472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 23, !dbg !275 + %6473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 24, !dbg !275 + %6474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 25, !dbg !275 + %6475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 26, !dbg !275 + %6476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 27, !dbg !275 + %6477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 28, !dbg !275 + %6478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 29, !dbg !275 + %6479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 30, !dbg !275 + %6480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6437, 31, !dbg !275 + %6481 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6449, float %6450, float %6451, float %6452, float %6453, float %6454, float %6455, float %6456, float %6457, float %6458, float %6459, float %6460, float %6461, float %6462, float %6463, float %6464, float %6465, float %6466, float %6467, float %6468, float %6469, float %6470, float %6471, float %6472, float %6473, float %6474, float %6475, float %6476, float %6477, float %6478, float %6479, float %6480, i64 %6443, i64 %6448, i1 true) #3, !dbg !275 + %6482 = or disjoint i32 %6250, 16448, !dbg !275 + %6483 = add i32 %6482, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6484 = lshr exact i32 %6483, 4, !dbg !275 + %6485 = and i32 %6484, 16383, !dbg !275 + %6486 = zext nneg i32 %6485 to i64, !dbg !275 + %6487 = or disjoint i64 %6486, 4611686293372403712, !dbg !275 + %6488 = add i32 %6256, 8256, !dbg !275 + %6489 = lshr exact i32 %6488, 4, !dbg !275 + %6490 = and i32 %6489, 16383, !dbg !275 + %6491 = zext nneg i32 %6490 to i64, !dbg !275 + %6492 = or disjoint i64 %6491, 4611686293338849280, !dbg !275 + %6493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 0, !dbg !275 + %6494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 1, !dbg !275 + %6495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 2, !dbg !275 + %6496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 3, !dbg !275 + %6497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 4, !dbg !275 + %6498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 5, !dbg !275 + %6499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 6, !dbg !275 + %6500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 7, !dbg !275 + %6501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 8, !dbg !275 + %6502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 9, !dbg !275 + %6503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 10, !dbg !275 + %6504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 11, !dbg !275 + %6505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 12, !dbg !275 + %6506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 13, !dbg !275 + %6507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 14, !dbg !275 + %6508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 15, !dbg !275 + %6509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 16, !dbg !275 + %6510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 17, !dbg !275 + %6511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 18, !dbg !275 + %6512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 19, !dbg !275 + %6513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 20, !dbg !275 + %6514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 21, !dbg !275 + %6515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 22, !dbg !275 + %6516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 23, !dbg !275 + %6517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 24, !dbg !275 + %6518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 25, !dbg !275 + %6519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 26, !dbg !275 + %6520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 27, !dbg !275 + %6521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 28, !dbg !275 + %6522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 29, !dbg !275 + %6523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 30, !dbg !275 + %6524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6481, 31, !dbg !275 + %6525 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6493, float %6494, float %6495, float %6496, float %6497, float %6498, float %6499, float %6500, float %6501, float %6502, float %6503, float %6504, float %6505, float %6506, float %6507, float %6508, float %6509, float %6510, float %6511, float %6512, float %6513, float %6514, float %6515, float %6516, float %6517, float %6518, float %6519, float %6520, float %6521, float %6522, float %6523, float %6524, i64 %6487, i64 %6492, i1 true) #3, !dbg !275 + %6526 = or disjoint i32 %6250, 16480, !dbg !275 + %6527 = add i32 %6526, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !275 + %6528 = lshr exact i32 %6527, 4, !dbg !275 + %6529 = and i32 %6528, 16383, !dbg !275 + %6530 = zext nneg i32 %6529 to i64, !dbg !275 + %6531 = or disjoint i64 %6530, 4611686293372403712, !dbg !275 + %6532 = add i32 %6256, 8288, !dbg !275 + %6533 = lshr exact i32 %6532, 4, !dbg !275 + %6534 = and i32 %6533, 16383, !dbg !275 + %6535 = zext nneg i32 %6534 to i64, !dbg !275 + %6536 = or disjoint i64 %6535, 4611686293338849280, !dbg !275 + %6537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 0, !dbg !275 + %6538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 1, !dbg !275 + %6539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 2, !dbg !275 + %6540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 3, !dbg !275 + %6541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 4, !dbg !275 + %6542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 5, !dbg !275 + %6543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 6, !dbg !275 + %6544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 7, !dbg !275 + %6545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 8, !dbg !275 + %6546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 9, !dbg !275 + %6547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 10, !dbg !275 + %6548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 11, !dbg !275 + %6549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 12, !dbg !275 + %6550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 13, !dbg !275 + %6551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 14, !dbg !275 + %6552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 15, !dbg !275 + %6553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 16, !dbg !275 + %6554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 17, !dbg !275 + %6555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 18, !dbg !275 + %6556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 19, !dbg !275 + %6557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 20, !dbg !275 + %6558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 21, !dbg !275 + %6559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 22, !dbg !275 + %6560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 23, !dbg !275 + %6561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 24, !dbg !275 + %6562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 25, !dbg !275 + %6563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 26, !dbg !275 + %6564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 27, !dbg !275 + %6565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 28, !dbg !275 + %6566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 29, !dbg !275 + %6567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 30, !dbg !275 + %6568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6525, 31, !dbg !275 + %6569 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6537, float %6538, float %6539, float %6540, float %6541, float %6542, float %6543, float %6544, float %6545, float %6546, float %6547, float %6548, float %6549, float %6550, float %6551, float %6552, float %6553, float %6554, float %6555, float %6556, float %6557, float %6558, float %6559, float %6560, float %6561, float %6562, float %6563, float %6564, float %6565, float %6566, float %6567, float %6568, i64 %6531, i64 %6536, i1 true) #3, !dbg !275 + %6570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 0, !dbg !275 + %6571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 1, !dbg !275 + %6572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 2, !dbg !275 + %6573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 3, !dbg !275 + %6574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 4, !dbg !275 + %6575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 5, !dbg !275 + %6576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 6, !dbg !275 + %6577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 7, !dbg !275 + %6578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 8, !dbg !275 + %6579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 9, !dbg !275 + %6580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 10, !dbg !275 + %6581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 11, !dbg !275 + %6582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 12, !dbg !275 + %6583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 13, !dbg !275 + %6584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 14, !dbg !275 + %6585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 15, !dbg !275 + %6586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 16, !dbg !275 + %6587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 17, !dbg !275 + %6588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 18, !dbg !275 + %6589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 19, !dbg !275 + %6590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 20, !dbg !275 + %6591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 21, !dbg !275 + %6592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 22, !dbg !275 + %6593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 23, !dbg !275 + %6594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 24, !dbg !275 + %6595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 25, !dbg !275 + %6596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 26, !dbg !275 + %6597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 27, !dbg !275 + %6598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 28, !dbg !275 + %6599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 29, !dbg !275 + %6600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 30, !dbg !275 + %6601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6569, 31, !dbg !275 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !275 + %6602 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %6570, float %6571, float %6572, float %6573, float %6574, float %6575, float %6576, float %6577, float %6578, float %6579, float %6580, float %6581, float %6582, float %6583, float %6584, float %6585, float %6586, float %6587, float %6588, float %6589, float %6590, float %6591, float %6592, float %6593, float %6594, float %6595, float %6596, float %6597, float %6598, float %6599, float %6600, float %6601, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %6181, i32 0, i32 0) #3, !dbg !275 + %6603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 0, !dbg !275 + %6604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 1, !dbg !275 + %6605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 2, !dbg !275 + %6606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 3, !dbg !275 + %6607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 4, !dbg !275 + %6608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 5, !dbg !275 + %6609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 6, !dbg !275 + %6610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 7, !dbg !275 + %6611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 8, !dbg !275 + %6612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 9, !dbg !275 + %6613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 10, !dbg !275 + %6614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 11, !dbg !275 + %6615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 12, !dbg !275 + %6616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 13, !dbg !275 + %6617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 14, !dbg !275 + %6618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 15, !dbg !275 + %6619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 16, !dbg !275 + %6620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 17, !dbg !275 + %6621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 18, !dbg !275 + %6622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 19, !dbg !275 + %6623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 20, !dbg !275 + %6624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 21, !dbg !275 + %6625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 22, !dbg !275 + %6626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 23, !dbg !275 + %6627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 24, !dbg !275 + %6628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 25, !dbg !275 + %6629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 26, !dbg !275 + %6630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 27, !dbg !275 + %6631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 28, !dbg !275 + %6632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 29, !dbg !275 + %6633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 30, !dbg !275 + %6634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6602, 31, !dbg !275 + %6635 = fmul float %6603, 0x3FB6A09E60000000, !dbg !304 + %6636 = fmul float %6604, 0x3FB6A09E60000000, !dbg !304 + %6637 = fmul float %6605, 0x3FB6A09E60000000, !dbg !304 + %6638 = fmul float %6606, 0x3FB6A09E60000000, !dbg !304 + %6639 = fmul float %6607, 0x3FB6A09E60000000, !dbg !304 + %6640 = fmul float %6608, 0x3FB6A09E60000000, !dbg !304 + %6641 = fmul float %6609, 0x3FB6A09E60000000, !dbg !304 + %6642 = fmul float %6610, 0x3FB6A09E60000000, !dbg !304 + %6643 = fmul float %6611, 0x3FB6A09E60000000, !dbg !304 + %6644 = fmul float %6612, 0x3FB6A09E60000000, !dbg !304 + %6645 = fmul float %6613, 0x3FB6A09E60000000, !dbg !304 + %6646 = fmul float %6614, 0x3FB6A09E60000000, !dbg !304 + %6647 = fmul float %6615, 0x3FB6A09E60000000, !dbg !304 + %6648 = fmul float %6616, 0x3FB6A09E60000000, !dbg !304 + %6649 = fmul float %6617, 0x3FB6A09E60000000, !dbg !304 + %6650 = fmul float %6618, 0x3FB6A09E60000000, !dbg !304 + %6651 = fmul float %6619, 0x3FB6A09E60000000, !dbg !304 + %6652 = fmul float %6620, 0x3FB6A09E60000000, !dbg !304 + %6653 = fmul float %6621, 0x3FB6A09E60000000, !dbg !304 + %6654 = fmul float %6622, 0x3FB6A09E60000000, !dbg !304 + %6655 = fmul float %6623, 0x3FB6A09E60000000, !dbg !304 + %6656 = fmul float %6624, 0x3FB6A09E60000000, !dbg !304 + %6657 = fmul float %6625, 0x3FB6A09E60000000, !dbg !304 + %6658 = fmul float %6626, 0x3FB6A09E60000000, !dbg !304 + %6659 = fmul float %6627, 0x3FB6A09E60000000, !dbg !304 + %6660 = fmul float %6628, 0x3FB6A09E60000000, !dbg !304 + %6661 = fmul float %6629, 0x3FB6A09E60000000, !dbg !304 + %6662 = fmul float %6630, 0x3FB6A09E60000000, !dbg !304 + %6663 = fmul float %6631, 0x3FB6A09E60000000, !dbg !304 + %6664 = fmul float %6632, 0x3FB6A09E60000000, !dbg !304 + %6665 = fmul float %6633, 0x3FB6A09E60000000, !dbg !304 + %6666 = fmul float %6634, 0x3FB6A09E60000000, !dbg !304 + %6667 = srem i32 %5994, %18, !dbg !253 + %6668 = srem i32 %5995, %18, !dbg !253 + %6669 = icmp sge i32 %6667, %5183, !dbg !305 + %6670 = icmp sge i32 %6668, %5183, !dbg !305 + %6671 = icmp sge i32 %6667, %5185, !dbg !305 + %6672 = icmp sge i32 %6668, %5185, !dbg !305 + %6673 = sext i32 %6667 to i64, !dbg !306 + %6674 = sext i32 %6668 to i64, !dbg !306 + %6675 = icmp sgt i64 %5182, %6673, !dbg !307 + %6676 = icmp sgt i64 %5182, %6674, !dbg !307 + %6677 = and i1 %6669, %6675, !dbg !308 + %6678 = and i1 %6670, %6676, !dbg !308 + %6679 = and i1 %6671, %6675, !dbg !308 + %6680 = and i1 %6672, %6676, !dbg !308 + %6681 = sub i32 %5183, %6667, !dbg !309 + %6682 = sub i32 %5183, %6668, !dbg !309 + %6683 = sub i32 %5185, %6667, !dbg !309 + %6684 = sub i32 %5185, %6668, !dbg !309 + %6685 = srem i32 %6681, %26, !dbg !310 + %6686 = srem i32 %6682, %26, !dbg !310 + %6687 = srem i32 %6683, %26, !dbg !310 + %6688 = srem i32 %6684, %26, !dbg !310 + %6689 = icmp ne i32 %6685, 0, !dbg !311 + %6690 = icmp ne i32 %6686, 0, !dbg !311 + %6691 = icmp ne i32 %6687, 0, !dbg !311 + %6692 = icmp ne i32 %6688, 0, !dbg !311 + %6693 = shufflevector <4 x i32> %6156, <4 x i32> poison, <8 x i32> , !dbg !253 + %6694 = srem <8 x i32> %6693, %5741, !dbg !253 + %6695 = extractelement <8 x i32> %6694, i64 5, !dbg !306 + %6696 = icmp sge i32 %6695, %5183, !dbg !305 + %6697 = extractelement <8 x i32> %6694, i64 4, !dbg !306 + %6698 = icmp sge i32 %6697, %5183, !dbg !305 + %6699 = icmp sge i32 %6695, %5185, !dbg !305 + %6700 = icmp sge i32 %6697, %5185, !dbg !305 + %6701 = extractelement <8 x i32> %6694, i64 1, !dbg !306 + %6702 = icmp sge i32 %6701, %5183, !dbg !305 + %6703 = extractelement <8 x i32> %6694, i64 0, !dbg !306 + %6704 = icmp sge i32 %6703, %5183, !dbg !305 + %6705 = icmp sge i32 %6701, %5185, !dbg !305 + %6706 = icmp sge i32 %6703, %5185, !dbg !305 + %6707 = sext i32 %6695 to i64, !dbg !306 + %6708 = sext i32 %6697 to i64, !dbg !306 + %6709 = sext i32 %6701 to i64, !dbg !306 + %6710 = sext i32 %6703 to i64, !dbg !306 + %6711 = icmp sgt i64 %5182, %6707, !dbg !307 + %6712 = icmp sgt i64 %5182, %6708, !dbg !307 + %6713 = icmp sgt i64 %5182, %6709, !dbg !307 + %6714 = icmp sgt i64 %5182, %6710, !dbg !307 + %6715 = and i1 %6696, %6711, !dbg !308 + %6716 = and i1 %6698, %6712, !dbg !308 + %6717 = and i1 %6699, %6711, !dbg !308 + %6718 = and i1 %6700, %6712, !dbg !308 + %6719 = and i1 %6702, %6713, !dbg !308 + %6720 = and i1 %6704, %6714, !dbg !308 + %6721 = and i1 %6705, %6713, !dbg !308 + %6722 = and i1 %6706, %6714, !dbg !308 + %6723 = sub <8 x i32> %5742, %6694, !dbg !309 + %6724 = srem <8 x i32> %6723, %5744, !dbg !310 + %6725 = icmp ne <8 x i32> %6724, zeroinitializer, !dbg !311 + %6726 = xor i32 %6685, %26, !dbg !312 + %6727 = icmp slt i32 %6726, 0, !dbg !312 + %6728 = xor i32 %6686, %26, !dbg !312 + %6729 = icmp slt i32 %6728, 0, !dbg !312 + %6730 = xor i32 %6687, %26, !dbg !312 + %6731 = icmp slt i32 %6730, 0, !dbg !312 + %6732 = extractelement <8 x i32> %6724, i64 7, !dbg !313 + %6733 = extractelement <8 x i32> %6724, i64 6, !dbg !313 + %6734 = extractelement <8 x i32> %6724, i64 5, !dbg !313 + %6735 = extractelement <8 x i32> %6724, i64 4, !dbg !313 + %6736 = extractelement <8 x i32> %6724, i64 3, !dbg !313 + %6737 = extractelement <8 x i32> %6724, i64 2, !dbg !313 + %6738 = extractelement <8 x i32> %6724, i64 1, !dbg !313 + %6739 = insertelement <8 x i32> %6724, i32 %6688, i64 0, !dbg !312 + %6740 = shufflevector <8 x i32> %6739, <8 x i32> %6724, <8 x i32> , !dbg !312 + %6741 = xor <8 x i32> %6740, %5744, !dbg !312 + %6742 = icmp slt <8 x i32> %6741, zeroinitializer, !dbg !312 + %6743 = extractelement <8 x i32> %6724, i64 0, !dbg !313 + %6744 = xor i32 %6743, %26, !dbg !312 + %6745 = icmp slt i32 %6744, 0, !dbg !312 + %6746 = and i1 %6689, %6727, !dbg !314 + %6747 = and i1 %6690, %6729, !dbg !314 + %6748 = and i1 %6691, %6731, !dbg !314 + %6749 = extractelement <8 x i1> %6742, i64 0, !dbg !314 + %6750 = and i1 %6692, %6749, !dbg !314 + %foldExtExtBinop = and <8 x i1> %6725, %6742, !dbg !314 + %6751 = extractelement <8 x i1> %foldExtExtBinop, i64 7, !dbg !314 + %foldExtExtBinop3025 = and <8 x i1> %6725, %6742, !dbg !314 + %6752 = extractelement <8 x i1> %foldExtExtBinop3025, i64 6, !dbg !314 + %foldExtExtBinop3027 = and <8 x i1> %6725, %6742, !dbg !314 + %6753 = extractelement <8 x i1> %foldExtExtBinop3027, i64 5, !dbg !314 + %foldExtExtBinop3029 = and <8 x i1> %6725, %6742, !dbg !314 + %6754 = extractelement <8 x i1> %foldExtExtBinop3029, i64 4, !dbg !314 + %foldExtExtBinop3031 = and <8 x i1> %6725, %6742, !dbg !314 + %6755 = extractelement <8 x i1> %foldExtExtBinop3031, i64 3, !dbg !314 + %foldExtExtBinop3033 = and <8 x i1> %6725, %6742, !dbg !314 + %6756 = extractelement <8 x i1> %foldExtExtBinop3033, i64 2, !dbg !314 + %foldExtExtBinop3035 = and <8 x i1> %6725, %6742, !dbg !314 + %6757 = extractelement <8 x i1> %foldExtExtBinop3035, i64 1, !dbg !314 + %6758 = extractelement <8 x i1> %6725, i64 0, !dbg !314 + %6759 = and i1 %6758, %6745, !dbg !314 + %6760 = select i1 %6746, i32 %26, i32 0, !dbg !315 + %6761 = select i1 %6747, i32 %26, i32 0, !dbg !315 + %6762 = select i1 %6748, i32 %26, i32 0, !dbg !315 + %6763 = select i1 %6750, i32 %26, i32 0, !dbg !315 + %6764 = select i1 %6751, i32 %26, i32 0, !dbg !315 + %6765 = select i1 %6752, i32 %26, i32 0, !dbg !315 + %6766 = select i1 %6753, i32 %26, i32 0, !dbg !315 + %6767 = select i1 %6754, i32 %26, i32 0, !dbg !315 + %6768 = select i1 %6755, i32 %26, i32 0, !dbg !315 + %6769 = select i1 %6756, i32 %26, i32 0, !dbg !315 + %6770 = select i1 %6757, i32 %26, i32 0, !dbg !315 + %6771 = select i1 %6759, i32 %26, i32 0, !dbg !315 + %6772 = sub i32 0, %6760, !dbg !313 + %6773 = icmp eq i32 %6685, %6772, !dbg !313 + %6774 = sub i32 0, %6761, !dbg !313 + %6775 = icmp eq i32 %6686, %6774, !dbg !313 + %6776 = sub i32 0, %6762, !dbg !313 + %6777 = icmp eq i32 %6687, %6776, !dbg !313 + %6778 = sub i32 0, %6763, !dbg !313 + %6779 = icmp eq i32 %6688, %6778, !dbg !313 + %6780 = sub i32 0, %6764, !dbg !313 + %6781 = icmp eq i32 %6732, %6780, !dbg !313 + %6782 = sub i32 0, %6765, !dbg !313 + %6783 = icmp eq i32 %6733, %6782, !dbg !313 + %6784 = sub i32 0, %6766, !dbg !313 + %6785 = icmp eq i32 %6734, %6784, !dbg !313 + %6786 = sub i32 0, %6767, !dbg !313 + %6787 = icmp eq i32 %6735, %6786, !dbg !313 + %6788 = sub i32 0, %6768, !dbg !313 + %6789 = icmp eq i32 %6736, %6788, !dbg !313 + %6790 = sub i32 0, %6769, !dbg !313 + %6791 = icmp eq i32 %6737, %6790, !dbg !313 + %6792 = sub i32 0, %6770, !dbg !313 + %6793 = icmp eq i32 %6738, %6792, !dbg !313 + %6794 = sub i32 0, %6771, !dbg !313 + %6795 = icmp eq i32 %6743, %6794, !dbg !313 + %6796 = srem <8 x i32> %6155, %5746, !dbg !253 + %6797 = shufflevector <8 x i32> %6796, <8 x i32> poison, <16 x i32> , !dbg !253 + %6798 = extractelement <8 x i32> %6796, i64 7, !dbg !306 + %6799 = icmp sge i32 %6798, %5183, !dbg !305 + %6800 = extractelement <8 x i32> %6796, i64 6, !dbg !306 + %6801 = icmp sge i32 %6800, %5183, !dbg !305 + %6802 = icmp sge i32 %6798, %5185, !dbg !305 + %6803 = icmp sge i32 %6800, %5185, !dbg !305 + %6804 = extractelement <8 x i32> %6796, i64 5, !dbg !306 + %6805 = icmp sge i32 %6804, %5183, !dbg !305 + %6806 = extractelement <8 x i32> %6796, i64 4, !dbg !306 + %6807 = icmp sge i32 %6806, %5183, !dbg !305 + %6808 = icmp sge i32 %6804, %5185, !dbg !305 + %6809 = icmp sge i32 %6806, %5185, !dbg !305 + %6810 = extractelement <8 x i32> %6796, i64 3, !dbg !306 + %6811 = icmp sge i32 %6810, %5183, !dbg !305 + %6812 = extractelement <8 x i32> %6796, i64 2, !dbg !306 + %6813 = icmp sge i32 %6812, %5183, !dbg !305 + %6814 = icmp sge i32 %6810, %5185, !dbg !305 + %6815 = icmp sge i32 %6812, %5185, !dbg !305 + %6816 = extractelement <8 x i32> %6796, i64 1, !dbg !306 + %6817 = icmp sge i32 %6816, %5183, !dbg !305 + %6818 = extractelement <8 x i32> %6796, i64 0, !dbg !306 + %6819 = icmp sge i32 %6818, %5183, !dbg !305 + %6820 = icmp sge i32 %6816, %5185, !dbg !305 + %6821 = icmp sge i32 %6818, %5185, !dbg !305 + %6822 = sext <8 x i32> %6796 to <8 x i64>, !dbg !307 + %6823 = icmp sgt <8 x i64> %5748, %6822, !dbg !307 + %6824 = extractelement <8 x i1> %6823, i64 7, !dbg !308 + %6825 = and i1 %6799, %6824, !dbg !308 + %6826 = extractelement <8 x i1> %6823, i64 6, !dbg !308 + %6827 = and i1 %6801, %6826, !dbg !308 + %6828 = and i1 %6802, %6824, !dbg !308 + %6829 = and i1 %6803, %6826, !dbg !308 + %6830 = extractelement <8 x i1> %6823, i64 5, !dbg !308 + %6831 = and i1 %6805, %6830, !dbg !308 + %6832 = extractelement <8 x i1> %6823, i64 4, !dbg !308 + %6833 = and i1 %6807, %6832, !dbg !308 + %6834 = and i1 %6808, %6830, !dbg !308 + %6835 = and i1 %6809, %6832, !dbg !308 + %6836 = extractelement <8 x i1> %6823, i64 3, !dbg !308 + %6837 = and i1 %6811, %6836, !dbg !308 + %6838 = extractelement <8 x i1> %6823, i64 2, !dbg !308 + %6839 = and i1 %6813, %6838, !dbg !308 + %6840 = and i1 %6814, %6836, !dbg !308 + %6841 = and i1 %6815, %6838, !dbg !308 + %6842 = extractelement <8 x i1> %6823, i64 1, !dbg !308 + %6843 = and i1 %6817, %6842, !dbg !308 + %6844 = extractelement <8 x i1> %6823, i64 0, !dbg !308 + %6845 = and i1 %6819, %6844, !dbg !308 + %6846 = and i1 %6820, %6842, !dbg !308 + %6847 = and i1 %6821, %6844, !dbg !308 + %6848 = sub <16 x i32> %5178, %6797, !dbg !309 + %6849 = srem <16 x i32> %6848, %5750, !dbg !310 + %6850 = icmp ne <16 x i32> %6849, zeroinitializer, !dbg !311 + %6851 = xor <16 x i32> %6849, %5750, !dbg !312 + %6852 = icmp slt <16 x i32> %6851, zeroinitializer, !dbg !312 + %6853 = and <16 x i1> %6850, %6852, !dbg !314 + %6854 = select <16 x i1> %6853, <16 x i32> %5750, <16 x i32> zeroinitializer, !dbg !315 + %6855 = sub <16 x i32> zeroinitializer, %6854, !dbg !313 + %6856 = icmp eq <16 x i32> %6849, %6855, !dbg !313 + %6857 = and i1 %5198, %6773, !dbg !316 + %6858 = and i1 %5198, %6775, !dbg !316 + %6859 = and i1 %5199, %6777, !dbg !316 + %6860 = and i1 %5199, %6779, !dbg !316 + %6861 = and i1 %5198, %6781, !dbg !316 + %6862 = and i1 %5198, %6783, !dbg !316 + %6863 = and i1 %5199, %6785, !dbg !316 + %6864 = and i1 %5199, %6787, !dbg !316 + %6865 = and i1 %5198, %6789, !dbg !316 + %6866 = and i1 %5198, %6791, !dbg !316 + %6867 = and i1 %5199, %6793, !dbg !316 + %6868 = and i1 %5199, %6795, !dbg !316 + %6869 = extractelement <16 x i1> %6856, i64 15, !dbg !316 + %6870 = and i1 %5198, %6869, !dbg !316 + %6871 = extractelement <16 x i1> %6856, i64 14, !dbg !316 + %6872 = and i1 %5198, %6871, !dbg !316 + %6873 = extractelement <16 x i1> %6856, i64 13, !dbg !316 + %6874 = and i1 %5199, %6873, !dbg !316 + %6875 = extractelement <16 x i1> %6856, i64 12, !dbg !316 + %6876 = and i1 %5199, %6875, !dbg !316 + %6877 = extractelement <16 x i1> %6856, i64 11, !dbg !316 + %6878 = and i1 %5198, %6877, !dbg !316 + %6879 = extractelement <16 x i1> %6856, i64 10, !dbg !316 + %6880 = and i1 %5198, %6879, !dbg !316 + %6881 = extractelement <16 x i1> %6856, i64 9, !dbg !316 + %6882 = and i1 %5199, %6881, !dbg !316 + %6883 = extractelement <16 x i1> %6856, i64 8, !dbg !316 + %6884 = and i1 %5199, %6883, !dbg !316 + %6885 = extractelement <16 x i1> %6856, i64 7, !dbg !316 + %6886 = and i1 %5198, %6885, !dbg !316 + %6887 = extractelement <16 x i1> %6856, i64 6, !dbg !316 + %6888 = and i1 %5198, %6887, !dbg !316 + %6889 = extractelement <16 x i1> %6856, i64 5, !dbg !316 + %6890 = and i1 %5199, %6889, !dbg !316 + %6891 = extractelement <16 x i1> %6856, i64 4, !dbg !316 + %6892 = and i1 %5199, %6891, !dbg !316 + %6893 = extractelement <16 x i1> %6856, i64 3, !dbg !316 + %6894 = and i1 %5198, %6893, !dbg !316 + %6895 = extractelement <16 x i1> %6856, i64 2, !dbg !316 + %6896 = and i1 %5198, %6895, !dbg !316 + %6897 = extractelement <16 x i1> %6856, i64 1, !dbg !316 + %6898 = and i1 %5199, %6897, !dbg !316 + %6899 = extractelement <16 x i1> %6856, i64 0, !dbg !316 + %6900 = and i1 %5199, %6899, !dbg !316 + %6901 = or i1 %6677, %6857, !dbg !317 + %6902 = or i1 %6678, %6858, !dbg !317 + %6903 = or i1 %6679, %6859, !dbg !317 + %6904 = or i1 %6680, %6860, !dbg !317 + %6905 = or i1 %6715, %6861, !dbg !317 + %6906 = or i1 %6716, %6862, !dbg !317 + %6907 = or i1 %6717, %6863, !dbg !317 + %6908 = or i1 %6718, %6864, !dbg !317 + %6909 = or i1 %6719, %6865, !dbg !317 + %6910 = or i1 %6720, %6866, !dbg !317 + %6911 = or i1 %6721, %6867, !dbg !317 + %6912 = or i1 %6722, %6868, !dbg !317 + %6913 = or i1 %6825, %6870, !dbg !317 + %6914 = or i1 %6827, %6872, !dbg !317 + %6915 = or i1 %6828, %6874, !dbg !317 + %6916 = or i1 %6829, %6876, !dbg !317 + %6917 = or i1 %6831, %6878, !dbg !317 + %6918 = or i1 %6833, %6880, !dbg !317 + %6919 = or i1 %6834, %6882, !dbg !317 + %6920 = or i1 %6835, %6884, !dbg !317 + %6921 = or i1 %6837, %6886, !dbg !317 + %6922 = or i1 %6839, %6888, !dbg !317 + %6923 = or i1 %6840, %6890, !dbg !317 + %6924 = or i1 %6841, %6892, !dbg !317 + %6925 = or i1 %6843, %6894, !dbg !317 + %6926 = or i1 %6845, %6896, !dbg !317 + %6927 = or i1 %6846, %6898, !dbg !317 + %6928 = or i1 %6847, %6900, !dbg !317 + %6929 = srem <2 x i32> %6152, %5739, !dbg !253 + %6930 = icmp sge <2 x i32> %6929, %5751, !dbg !305 + %6931 = sext <2 x i32> %6929 to <2 x i64>, !dbg !307 + %6932 = icmp sgt <2 x i64> %5753, %6931, !dbg !307 + %6933 = and <2 x i1> %6930, %6932, !dbg !308 + %6934 = sub <2 x i32> %5751, %6929, !dbg !309 + %6935 = srem <2 x i32> %6934, %5755, !dbg !310 + %6936 = icmp ne <2 x i32> %6935, zeroinitializer, !dbg !311 + %6937 = xor <2 x i32> %6935, %5755, !dbg !312 + %6938 = icmp slt <2 x i32> %6937, zeroinitializer, !dbg !312 + %6939 = and <2 x i1> %6936, %6938, !dbg !314 + %6940 = select <2 x i1> %6939, <2 x i32> %5755, <2 x i32> zeroinitializer, !dbg !315 + %6941 = sub <2 x i32> zeroinitializer, %6940, !dbg !313 + %6942 = icmp eq <2 x i32> %6935, %6941, !dbg !313 + %6943 = and <2 x i1> %5757, %6942, !dbg !316 + %6944 = or <2 x i1> %6933, %6943, !dbg !317 + %6945 = icmp sge <2 x i32> %6929, %5758, !dbg !305 + %6946 = and <2 x i1> %6945, %6932, !dbg !308 + %6947 = sub <2 x i32> %5758, %6929, !dbg !309 + %6948 = srem <2 x i32> %6947, %5755, !dbg !310 + %6949 = icmp ne <2 x i32> %6948, zeroinitializer, !dbg !311 + %6950 = xor <2 x i32> %6948, %5755, !dbg !312 + %6951 = icmp slt <2 x i32> %6950, zeroinitializer, !dbg !312 + %6952 = and <2 x i1> %6949, %6951, !dbg !314 + %6953 = select <2 x i1> %6952, <2 x i32> %5755, <2 x i32> zeroinitializer, !dbg !315 + %6954 = sub <2 x i32> zeroinitializer, %6953, !dbg !313 + %6955 = icmp eq <2 x i32> %6948, %6954, !dbg !313 + %6956 = and <2 x i1> %5760, %6955, !dbg !316 + %6957 = or <2 x i1> %6946, %6956, !dbg !317 + %6958 = select <2 x i1> %6957, <2 x i1> %6165, <2 x i1> zeroinitializer, !dbg !318 + %6959 = select <2 x i1> %6944, <2 x i1> %6165, <2 x i1> zeroinitializer, !dbg !318 + %6960 = select i1 %6901, i1 %6166, i1 false, !dbg !318 + %6961 = select i1 %6902, i1 %6167, i1 false, !dbg !318 + %6962 = select i1 %6903, i1 %6166, i1 false, !dbg !318 + %6963 = select i1 %6904, i1 %6167, i1 false, !dbg !318 + %6964 = select i1 %6905, i1 %6168, i1 false, !dbg !318 + %6965 = select i1 %6906, i1 %6169, i1 false, !dbg !318 + %6966 = select i1 %6907, i1 %6168, i1 false, !dbg !318 + %6967 = select i1 %6908, i1 %6169, i1 false, !dbg !318 + %6968 = select i1 %6909, i1 %6170, i1 false, !dbg !318 + %6969 = select i1 %6910, i1 %6171, i1 false, !dbg !318 + %6970 = select i1 %6911, i1 %6170, i1 false, !dbg !318 + %6971 = select i1 %6912, i1 %6171, i1 false, !dbg !318 + %6972 = select i1 %6913, i1 %6172, i1 false, !dbg !318 + %6973 = select i1 %6914, i1 %6173, i1 false, !dbg !318 + %6974 = select i1 %6915, i1 %6172, i1 false, !dbg !318 + %6975 = select i1 %6916, i1 %6173, i1 false, !dbg !318 + %6976 = select i1 %6917, i1 %6174, i1 false, !dbg !318 + %6977 = select i1 %6918, i1 %6175, i1 false, !dbg !318 + %6978 = select i1 %6919, i1 %6174, i1 false, !dbg !318 + %6979 = select i1 %6920, i1 %6175, i1 false, !dbg !318 + %6980 = select i1 %6921, i1 %6176, i1 false, !dbg !318 + %6981 = select i1 %6922, i1 %6177, i1 false, !dbg !318 + %6982 = select i1 %6923, i1 %6176, i1 false, !dbg !318 + %6983 = select i1 %6924, i1 %6177, i1 false, !dbg !318 + %6984 = select i1 %6925, i1 %6178, i1 false, !dbg !318 + %6985 = select i1 %6926, i1 %6179, i1 false, !dbg !318 + %6986 = select i1 %6927, i1 %6178, i1 false, !dbg !318 + %6987 = select i1 %6928, i1 %6179, i1 false, !dbg !318 + %6988 = fmul float %6635, 0x3FF7154760000000, !dbg !319 + %6989 = extractelement <2 x i1> %6958, i64 0, !dbg !318 + %6990 = select i1 %6989, float %6988, float 0xFFF0000000000000, !dbg !318 + %6991 = fmul float %6636, 0x3FF7154760000000, !dbg !319 + %6992 = extractelement <2 x i1> %6958, i64 1, !dbg !318 + %6993 = select i1 %6992, float %6991, float 0xFFF0000000000000, !dbg !318 + %6994 = fmul float %6637, 0x3FF7154760000000, !dbg !319 + %6995 = extractelement <2 x i1> %6959, i64 0, !dbg !318 + %6996 = select i1 %6995, float %6994, float 0xFFF0000000000000, !dbg !318 + %6997 = fmul float %6638, 0x3FF7154760000000, !dbg !319 + %6998 = extractelement <2 x i1> %6959, i64 1, !dbg !318 + %6999 = select i1 %6998, float %6997, float 0xFFF0000000000000, !dbg !318 + %7000 = fmul float %6639, 0x3FF7154760000000, !dbg !319 + %7001 = select i1 %6960, float %7000, float 0xFFF0000000000000, !dbg !318 + %7002 = fmul float %6640, 0x3FF7154760000000, !dbg !319 + %7003 = select i1 %6961, float %7002, float 0xFFF0000000000000, !dbg !318 + %7004 = fmul float %6641, 0x3FF7154760000000, !dbg !319 + %7005 = select i1 %6962, float %7004, float 0xFFF0000000000000, !dbg !318 + %7006 = fmul float %6642, 0x3FF7154760000000, !dbg !319 + %7007 = select i1 %6963, float %7006, float 0xFFF0000000000000, !dbg !318 + %7008 = fmul float %6643, 0x3FF7154760000000, !dbg !319 + %7009 = select i1 %6964, float %7008, float 0xFFF0000000000000, !dbg !318 + %7010 = fmul float %6644, 0x3FF7154760000000, !dbg !319 + %7011 = select i1 %6965, float %7010, float 0xFFF0000000000000, !dbg !318 + %7012 = fmul float %6645, 0x3FF7154760000000, !dbg !319 + %7013 = select i1 %6966, float %7012, float 0xFFF0000000000000, !dbg !318 + %7014 = fmul float %6646, 0x3FF7154760000000, !dbg !319 + %7015 = select i1 %6967, float %7014, float 0xFFF0000000000000, !dbg !318 + %7016 = fmul float %6647, 0x3FF7154760000000, !dbg !319 + %7017 = select i1 %6968, float %7016, float 0xFFF0000000000000, !dbg !318 + %7018 = fmul float %6648, 0x3FF7154760000000, !dbg !319 + %7019 = select i1 %6969, float %7018, float 0xFFF0000000000000, !dbg !318 + %7020 = fmul float %6649, 0x3FF7154760000000, !dbg !319 + %7021 = select i1 %6970, float %7020, float 0xFFF0000000000000, !dbg !318 + %7022 = fmul float %6650, 0x3FF7154760000000, !dbg !319 + %7023 = select i1 %6971, float %7022, float 0xFFF0000000000000, !dbg !318 + %7024 = fmul float %6651, 0x3FF7154760000000, !dbg !319 + %7025 = select i1 %6972, float %7024, float 0xFFF0000000000000, !dbg !318 + %7026 = fmul float %6652, 0x3FF7154760000000, !dbg !319 + %7027 = select i1 %6973, float %7026, float 0xFFF0000000000000, !dbg !318 + %7028 = fmul float %6653, 0x3FF7154760000000, !dbg !319 + %7029 = select i1 %6974, float %7028, float 0xFFF0000000000000, !dbg !318 + %7030 = fmul float %6654, 0x3FF7154760000000, !dbg !319 + %7031 = select i1 %6975, float %7030, float 0xFFF0000000000000, !dbg !318 + %7032 = fmul float %6655, 0x3FF7154760000000, !dbg !319 + %7033 = select i1 %6976, float %7032, float 0xFFF0000000000000, !dbg !318 + %7034 = fmul float %6656, 0x3FF7154760000000, !dbg !319 + %7035 = select i1 %6977, float %7034, float 0xFFF0000000000000, !dbg !318 + %7036 = fmul float %6657, 0x3FF7154760000000, !dbg !319 + %7037 = select i1 %6978, float %7036, float 0xFFF0000000000000, !dbg !318 + %7038 = fmul float %6658, 0x3FF7154760000000, !dbg !319 + %7039 = select i1 %6979, float %7038, float 0xFFF0000000000000, !dbg !318 + %7040 = fmul float %6659, 0x3FF7154760000000, !dbg !319 + %7041 = select i1 %6980, float %7040, float 0xFFF0000000000000, !dbg !318 + %7042 = fmul float %6660, 0x3FF7154760000000, !dbg !319 + %7043 = select i1 %6981, float %7042, float 0xFFF0000000000000, !dbg !318 + %7044 = fmul float %6661, 0x3FF7154760000000, !dbg !319 + %7045 = select i1 %6982, float %7044, float 0xFFF0000000000000, !dbg !318 + %7046 = fmul float %6662, 0x3FF7154760000000, !dbg !319 + %7047 = select i1 %6983, float %7046, float 0xFFF0000000000000, !dbg !318 + %7048 = fmul float %6663, 0x3FF7154760000000, !dbg !319 + %7049 = select i1 %6984, float %7048, float 0xFFF0000000000000, !dbg !318 + %7050 = fmul float %6664, 0x3FF7154760000000, !dbg !319 + %7051 = select i1 %6985, float %7050, float 0xFFF0000000000000, !dbg !318 + %7052 = fmul float %6665, 0x3FF7154760000000, !dbg !319 + %7053 = select i1 %6986, float %7052, float 0xFFF0000000000000, !dbg !318 + %7054 = fmul float %6666, 0x3FF7154760000000, !dbg !319 + %7055 = select i1 %6987, float %7054, float 0xFFF0000000000000, !dbg !318 + %7056 = fsub float %6990, %6232, !dbg !320 + %7057 = fsub float %6993, %6233, !dbg !320 + %7058 = fsub float %6996, %6232, !dbg !320 + %7059 = fsub float %6999, %6233, !dbg !320 + %7060 = fsub float %7001, %6234, !dbg !320 + %7061 = fsub float %7003, %6235, !dbg !320 + %7062 = fsub float %7005, %6234, !dbg !320 + %7063 = fsub float %7007, %6235, !dbg !320 + %7064 = fsub float %7009, %6236, !dbg !320 + %7065 = fsub float %7011, %6237, !dbg !320 + %7066 = fsub float %7013, %6236, !dbg !320 + %7067 = fsub float %7015, %6237, !dbg !320 + %7068 = fsub float %7017, %6238, !dbg !320 + %7069 = fsub float %7019, %6239, !dbg !320 + %7070 = fsub float %7021, %6238, !dbg !320 + %7071 = fsub float %7023, %6239, !dbg !320 + %7072 = fsub float %7025, %6240, !dbg !320 + %7073 = fsub float %7027, %6241, !dbg !320 + %7074 = fsub float %7029, %6240, !dbg !320 + %7075 = fsub float %7031, %6241, !dbg !320 + %7076 = fsub float %7033, %6242, !dbg !320 + %7077 = fsub float %7035, %6243, !dbg !320 + %7078 = fsub float %7037, %6242, !dbg !320 + %7079 = fsub float %7039, %6243, !dbg !320 + %7080 = fsub float %7041, %6244, !dbg !320 + %7081 = fsub float %7043, %6245, !dbg !320 + %7082 = fsub float %7045, %6244, !dbg !320 + %7083 = fsub float %7047, %6245, !dbg !320 + %7084 = fsub float %7049, %6246, !dbg !320 + %7085 = fsub float %7051, %6247, !dbg !320 + %7086 = fsub float %7053, %6246, !dbg !320 + %7087 = fsub float %7055, %6247, !dbg !320 + %7088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1240 = icmp eq i32 %7088, 0, !dbg !321 + br i1 %.not.i1240, label %7091, label %7089, !dbg !321 + +7089: ; preds = %.lr.ph1620 + %7090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7056) #3, !dbg !321 + br label %__nv_exp2f.exit1242, !dbg !321 + +7091: ; preds = %.lr.ph1620 + %7092 = tail call float @llvm.nvvm.ex2.approx.f(float %7056) #3, !dbg !321 + br label %__nv_exp2f.exit1242, !dbg !321 + +__nv_exp2f.exit1242: ; preds = %7089, %7091 + %.0.i1241 = phi float [ %7090, %7089 ], [ %7092, %7091 ], !dbg !321 + %7093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1243 = icmp eq i32 %7093, 0, !dbg !321 + br i1 %.not.i1243, label %7096, label %7094, !dbg !321 + +7094: ; preds = %__nv_exp2f.exit1242 + %7095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7057) #3, !dbg !321 + br label %__nv_exp2f.exit1245, !dbg !321 + +7096: ; preds = %__nv_exp2f.exit1242 + %7097 = tail call float @llvm.nvvm.ex2.approx.f(float %7057) #3, !dbg !321 + br label %__nv_exp2f.exit1245, !dbg !321 + +__nv_exp2f.exit1245: ; preds = %7094, %7096 + %.0.i1244 = phi float [ %7095, %7094 ], [ %7097, %7096 ], !dbg !321 + %7098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1246 = icmp eq i32 %7098, 0, !dbg !321 + br i1 %.not.i1246, label %7101, label %7099, !dbg !321 + +7099: ; preds = %__nv_exp2f.exit1245 + %7100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7058) #3, !dbg !321 + br label %__nv_exp2f.exit1248, !dbg !321 + +7101: ; preds = %__nv_exp2f.exit1245 + %7102 = tail call float @llvm.nvvm.ex2.approx.f(float %7058) #3, !dbg !321 + br label %__nv_exp2f.exit1248, !dbg !321 + +__nv_exp2f.exit1248: ; preds = %7099, %7101 + %.0.i1247 = phi float [ %7100, %7099 ], [ %7102, %7101 ], !dbg !321 + %7103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1249 = icmp eq i32 %7103, 0, !dbg !321 + br i1 %.not.i1249, label %7106, label %7104, !dbg !321 + +7104: ; preds = %__nv_exp2f.exit1248 + %7105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7059) #3, !dbg !321 + br label %__nv_exp2f.exit1251, !dbg !321 + +7106: ; preds = %__nv_exp2f.exit1248 + %7107 = tail call float @llvm.nvvm.ex2.approx.f(float %7059) #3, !dbg !321 + br label %__nv_exp2f.exit1251, !dbg !321 + +__nv_exp2f.exit1251: ; preds = %7104, %7106 + %.0.i1250 = phi float [ %7105, %7104 ], [ %7107, %7106 ], !dbg !321 + %7108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1252 = icmp eq i32 %7108, 0, !dbg !321 + br i1 %.not.i1252, label %7111, label %7109, !dbg !321 + +7109: ; preds = %__nv_exp2f.exit1251 + %7110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7060) #3, !dbg !321 + br label %__nv_exp2f.exit1254, !dbg !321 + +7111: ; preds = %__nv_exp2f.exit1251 + %7112 = tail call float @llvm.nvvm.ex2.approx.f(float %7060) #3, !dbg !321 + br label %__nv_exp2f.exit1254, !dbg !321 + +__nv_exp2f.exit1254: ; preds = %7109, %7111 + %.0.i1253 = phi float [ %7110, %7109 ], [ %7112, %7111 ], !dbg !321 + %7113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1255 = icmp eq i32 %7113, 0, !dbg !321 + br i1 %.not.i1255, label %7116, label %7114, !dbg !321 + +7114: ; preds = %__nv_exp2f.exit1254 + %7115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7061) #3, !dbg !321 + br label %__nv_exp2f.exit1257, !dbg !321 + +7116: ; preds = %__nv_exp2f.exit1254 + %7117 = tail call float @llvm.nvvm.ex2.approx.f(float %7061) #3, !dbg !321 + br label %__nv_exp2f.exit1257, !dbg !321 + +__nv_exp2f.exit1257: ; preds = %7114, %7116 + %.0.i1256 = phi float [ %7115, %7114 ], [ %7117, %7116 ], !dbg !321 + %7118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1258 = icmp eq i32 %7118, 0, !dbg !321 + br i1 %.not.i1258, label %7121, label %7119, !dbg !321 + +7119: ; preds = %__nv_exp2f.exit1257 + %7120 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7062) #3, !dbg !321 + br label %__nv_exp2f.exit1260, !dbg !321 + +7121: ; preds = %__nv_exp2f.exit1257 + %7122 = tail call float @llvm.nvvm.ex2.approx.f(float %7062) #3, !dbg !321 + br label %__nv_exp2f.exit1260, !dbg !321 + +__nv_exp2f.exit1260: ; preds = %7119, %7121 + %.0.i1259 = phi float [ %7120, %7119 ], [ %7122, %7121 ], !dbg !321 + %7123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1261 = icmp eq i32 %7123, 0, !dbg !321 + br i1 %.not.i1261, label %7126, label %7124, !dbg !321 + +7124: ; preds = %__nv_exp2f.exit1260 + %7125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7063) #3, !dbg !321 + br label %__nv_exp2f.exit1263, !dbg !321 + +7126: ; preds = %__nv_exp2f.exit1260 + %7127 = tail call float @llvm.nvvm.ex2.approx.f(float %7063) #3, !dbg !321 + br label %__nv_exp2f.exit1263, !dbg !321 + +__nv_exp2f.exit1263: ; preds = %7124, %7126 + %.0.i1262 = phi float [ %7125, %7124 ], [ %7127, %7126 ], !dbg !321 + %7128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1264 = icmp eq i32 %7128, 0, !dbg !321 + br i1 %.not.i1264, label %7131, label %7129, !dbg !321 + +7129: ; preds = %__nv_exp2f.exit1263 + %7130 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7064) #3, !dbg !321 + br label %__nv_exp2f.exit1266, !dbg !321 + +7131: ; preds = %__nv_exp2f.exit1263 + %7132 = tail call float @llvm.nvvm.ex2.approx.f(float %7064) #3, !dbg !321 + br label %__nv_exp2f.exit1266, !dbg !321 + +__nv_exp2f.exit1266: ; preds = %7129, %7131 + %.0.i1265 = phi float [ %7130, %7129 ], [ %7132, %7131 ], !dbg !321 + %7133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1267 = icmp eq i32 %7133, 0, !dbg !321 + br i1 %.not.i1267, label %7136, label %7134, !dbg !321 + +7134: ; preds = %__nv_exp2f.exit1266 + %7135 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7065) #3, !dbg !321 + br label %__nv_exp2f.exit1269, !dbg !321 + +7136: ; preds = %__nv_exp2f.exit1266 + %7137 = tail call float @llvm.nvvm.ex2.approx.f(float %7065) #3, !dbg !321 + br label %__nv_exp2f.exit1269, !dbg !321 + +__nv_exp2f.exit1269: ; preds = %7134, %7136 + %.0.i1268 = phi float [ %7135, %7134 ], [ %7137, %7136 ], !dbg !321 + %7138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1270 = icmp eq i32 %7138, 0, !dbg !321 + br i1 %.not.i1270, label %7141, label %7139, !dbg !321 + +7139: ; preds = %__nv_exp2f.exit1269 + %7140 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7066) #3, !dbg !321 + br label %__nv_exp2f.exit1272, !dbg !321 + +7141: ; preds = %__nv_exp2f.exit1269 + %7142 = tail call float @llvm.nvvm.ex2.approx.f(float %7066) #3, !dbg !321 + br label %__nv_exp2f.exit1272, !dbg !321 + +__nv_exp2f.exit1272: ; preds = %7139, %7141 + %.0.i1271 = phi float [ %7140, %7139 ], [ %7142, %7141 ], !dbg !321 + %7143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1273 = icmp eq i32 %7143, 0, !dbg !321 + br i1 %.not.i1273, label %7146, label %7144, !dbg !321 + +7144: ; preds = %__nv_exp2f.exit1272 + %7145 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7067) #3, !dbg !321 + br label %__nv_exp2f.exit1275, !dbg !321 + +7146: ; preds = %__nv_exp2f.exit1272 + %7147 = tail call float @llvm.nvvm.ex2.approx.f(float %7067) #3, !dbg !321 + br label %__nv_exp2f.exit1275, !dbg !321 + +__nv_exp2f.exit1275: ; preds = %7144, %7146 + %.0.i1274 = phi float [ %7145, %7144 ], [ %7147, %7146 ], !dbg !321 + %7148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1276 = icmp eq i32 %7148, 0, !dbg !321 + br i1 %.not.i1276, label %7151, label %7149, !dbg !321 + +7149: ; preds = %__nv_exp2f.exit1275 + %7150 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7068) #3, !dbg !321 + br label %__nv_exp2f.exit1278, !dbg !321 + +7151: ; preds = %__nv_exp2f.exit1275 + %7152 = tail call float @llvm.nvvm.ex2.approx.f(float %7068) #3, !dbg !321 + br label %__nv_exp2f.exit1278, !dbg !321 + +__nv_exp2f.exit1278: ; preds = %7149, %7151 + %.0.i1277 = phi float [ %7150, %7149 ], [ %7152, %7151 ], !dbg !321 + %7153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1279 = icmp eq i32 %7153, 0, !dbg !321 + br i1 %.not.i1279, label %7156, label %7154, !dbg !321 + +7154: ; preds = %__nv_exp2f.exit1278 + %7155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7069) #3, !dbg !321 + br label %__nv_exp2f.exit1281, !dbg !321 + +7156: ; preds = %__nv_exp2f.exit1278 + %7157 = tail call float @llvm.nvvm.ex2.approx.f(float %7069) #3, !dbg !321 + br label %__nv_exp2f.exit1281, !dbg !321 + +__nv_exp2f.exit1281: ; preds = %7154, %7156 + %.0.i1280 = phi float [ %7155, %7154 ], [ %7157, %7156 ], !dbg !321 + %7158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1282 = icmp eq i32 %7158, 0, !dbg !321 + br i1 %.not.i1282, label %7161, label %7159, !dbg !321 + +7159: ; preds = %__nv_exp2f.exit1281 + %7160 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7070) #3, !dbg !321 + br label %__nv_exp2f.exit1284, !dbg !321 + +7161: ; preds = %__nv_exp2f.exit1281 + %7162 = tail call float @llvm.nvvm.ex2.approx.f(float %7070) #3, !dbg !321 + br label %__nv_exp2f.exit1284, !dbg !321 + +__nv_exp2f.exit1284: ; preds = %7159, %7161 + %.0.i1283 = phi float [ %7160, %7159 ], [ %7162, %7161 ], !dbg !321 + %7163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1285 = icmp eq i32 %7163, 0, !dbg !321 + br i1 %.not.i1285, label %7166, label %7164, !dbg !321 + +7164: ; preds = %__nv_exp2f.exit1284 + %7165 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7071) #3, !dbg !321 + br label %__nv_exp2f.exit1287, !dbg !321 + +7166: ; preds = %__nv_exp2f.exit1284 + %7167 = tail call float @llvm.nvvm.ex2.approx.f(float %7071) #3, !dbg !321 + br label %__nv_exp2f.exit1287, !dbg !321 + +__nv_exp2f.exit1287: ; preds = %7164, %7166 + %.0.i1286 = phi float [ %7165, %7164 ], [ %7167, %7166 ], !dbg !321 + %7168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1288 = icmp eq i32 %7168, 0, !dbg !321 + br i1 %.not.i1288, label %7171, label %7169, !dbg !321 + +7169: ; preds = %__nv_exp2f.exit1287 + %7170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7072) #3, !dbg !321 + br label %__nv_exp2f.exit1290, !dbg !321 + +7171: ; preds = %__nv_exp2f.exit1287 + %7172 = tail call float @llvm.nvvm.ex2.approx.f(float %7072) #3, !dbg !321 + br label %__nv_exp2f.exit1290, !dbg !321 + +__nv_exp2f.exit1290: ; preds = %7169, %7171 + %.0.i1289 = phi float [ %7170, %7169 ], [ %7172, %7171 ], !dbg !321 + %7173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1291 = icmp eq i32 %7173, 0, !dbg !321 + br i1 %.not.i1291, label %7176, label %7174, !dbg !321 + +7174: ; preds = %__nv_exp2f.exit1290 + %7175 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7073) #3, !dbg !321 + br label %__nv_exp2f.exit1293, !dbg !321 + +7176: ; preds = %__nv_exp2f.exit1290 + %7177 = tail call float @llvm.nvvm.ex2.approx.f(float %7073) #3, !dbg !321 + br label %__nv_exp2f.exit1293, !dbg !321 + +__nv_exp2f.exit1293: ; preds = %7174, %7176 + %.0.i1292 = phi float [ %7175, %7174 ], [ %7177, %7176 ], !dbg !321 + %7178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1294 = icmp eq i32 %7178, 0, !dbg !321 + br i1 %.not.i1294, label %7181, label %7179, !dbg !321 + +7179: ; preds = %__nv_exp2f.exit1293 + %7180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7074) #3, !dbg !321 + br label %__nv_exp2f.exit1296, !dbg !321 + +7181: ; preds = %__nv_exp2f.exit1293 + %7182 = tail call float @llvm.nvvm.ex2.approx.f(float %7074) #3, !dbg !321 + br label %__nv_exp2f.exit1296, !dbg !321 + +__nv_exp2f.exit1296: ; preds = %7179, %7181 + %.0.i1295 = phi float [ %7180, %7179 ], [ %7182, %7181 ], !dbg !321 + %7183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1297 = icmp eq i32 %7183, 0, !dbg !321 + br i1 %.not.i1297, label %7186, label %7184, !dbg !321 + +7184: ; preds = %__nv_exp2f.exit1296 + %7185 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7075) #3, !dbg !321 + br label %__nv_exp2f.exit1299, !dbg !321 + +7186: ; preds = %__nv_exp2f.exit1296 + %7187 = tail call float @llvm.nvvm.ex2.approx.f(float %7075) #3, !dbg !321 + br label %__nv_exp2f.exit1299, !dbg !321 + +__nv_exp2f.exit1299: ; preds = %7184, %7186 + %.0.i1298 = phi float [ %7185, %7184 ], [ %7187, %7186 ], !dbg !321 + %7188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1300 = icmp eq i32 %7188, 0, !dbg !321 + br i1 %.not.i1300, label %7191, label %7189, !dbg !321 + +7189: ; preds = %__nv_exp2f.exit1299 + %7190 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7076) #3, !dbg !321 + br label %__nv_exp2f.exit1302, !dbg !321 + +7191: ; preds = %__nv_exp2f.exit1299 + %7192 = tail call float @llvm.nvvm.ex2.approx.f(float %7076) #3, !dbg !321 + br label %__nv_exp2f.exit1302, !dbg !321 + +__nv_exp2f.exit1302: ; preds = %7189, %7191 + %.0.i1301 = phi float [ %7190, %7189 ], [ %7192, %7191 ], !dbg !321 + %7193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1303 = icmp eq i32 %7193, 0, !dbg !321 + br i1 %.not.i1303, label %7196, label %7194, !dbg !321 + +7194: ; preds = %__nv_exp2f.exit1302 + %7195 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7077) #3, !dbg !321 + br label %__nv_exp2f.exit1305, !dbg !321 + +7196: ; preds = %__nv_exp2f.exit1302 + %7197 = tail call float @llvm.nvvm.ex2.approx.f(float %7077) #3, !dbg !321 + br label %__nv_exp2f.exit1305, !dbg !321 + +__nv_exp2f.exit1305: ; preds = %7194, %7196 + %.0.i1304 = phi float [ %7195, %7194 ], [ %7197, %7196 ], !dbg !321 + %7198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1306 = icmp eq i32 %7198, 0, !dbg !321 + br i1 %.not.i1306, label %7201, label %7199, !dbg !321 + +7199: ; preds = %__nv_exp2f.exit1305 + %7200 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7078) #3, !dbg !321 + br label %__nv_exp2f.exit1308, !dbg !321 + +7201: ; preds = %__nv_exp2f.exit1305 + %7202 = tail call float @llvm.nvvm.ex2.approx.f(float %7078) #3, !dbg !321 + br label %__nv_exp2f.exit1308, !dbg !321 + +__nv_exp2f.exit1308: ; preds = %7199, %7201 + %.0.i1307 = phi float [ %7200, %7199 ], [ %7202, %7201 ], !dbg !321 + %7203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1309 = icmp eq i32 %7203, 0, !dbg !321 + br i1 %.not.i1309, label %7206, label %7204, !dbg !321 + +7204: ; preds = %__nv_exp2f.exit1308 + %7205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7079) #3, !dbg !321 + br label %__nv_exp2f.exit1311, !dbg !321 + +7206: ; preds = %__nv_exp2f.exit1308 + %7207 = tail call float @llvm.nvvm.ex2.approx.f(float %7079) #3, !dbg !321 + br label %__nv_exp2f.exit1311, !dbg !321 + +__nv_exp2f.exit1311: ; preds = %7204, %7206 + %.0.i1310 = phi float [ %7205, %7204 ], [ %7207, %7206 ], !dbg !321 + %7208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1312 = icmp eq i32 %7208, 0, !dbg !321 + br i1 %.not.i1312, label %7211, label %7209, !dbg !321 + +7209: ; preds = %__nv_exp2f.exit1311 + %7210 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7080) #3, !dbg !321 + br label %__nv_exp2f.exit1314, !dbg !321 + +7211: ; preds = %__nv_exp2f.exit1311 + %7212 = tail call float @llvm.nvvm.ex2.approx.f(float %7080) #3, !dbg !321 + br label %__nv_exp2f.exit1314, !dbg !321 + +__nv_exp2f.exit1314: ; preds = %7209, %7211 + %.0.i1313 = phi float [ %7210, %7209 ], [ %7212, %7211 ], !dbg !321 + %7213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1315 = icmp eq i32 %7213, 0, !dbg !321 + br i1 %.not.i1315, label %7216, label %7214, !dbg !321 + +7214: ; preds = %__nv_exp2f.exit1314 + %7215 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7081) #3, !dbg !321 + br label %__nv_exp2f.exit1317, !dbg !321 + +7216: ; preds = %__nv_exp2f.exit1314 + %7217 = tail call float @llvm.nvvm.ex2.approx.f(float %7081) #3, !dbg !321 + br label %__nv_exp2f.exit1317, !dbg !321 + +__nv_exp2f.exit1317: ; preds = %7214, %7216 + %.0.i1316 = phi float [ %7215, %7214 ], [ %7217, %7216 ], !dbg !321 + %7218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1318 = icmp eq i32 %7218, 0, !dbg !321 + br i1 %.not.i1318, label %7221, label %7219, !dbg !321 + +7219: ; preds = %__nv_exp2f.exit1317 + %7220 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7082) #3, !dbg !321 + br label %__nv_exp2f.exit1320, !dbg !321 + +7221: ; preds = %__nv_exp2f.exit1317 + %7222 = tail call float @llvm.nvvm.ex2.approx.f(float %7082) #3, !dbg !321 + br label %__nv_exp2f.exit1320, !dbg !321 + +__nv_exp2f.exit1320: ; preds = %7219, %7221 + %.0.i1319 = phi float [ %7220, %7219 ], [ %7222, %7221 ], !dbg !321 + %7223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1321 = icmp eq i32 %7223, 0, !dbg !321 + br i1 %.not.i1321, label %7226, label %7224, !dbg !321 + +7224: ; preds = %__nv_exp2f.exit1320 + %7225 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7083) #3, !dbg !321 + br label %__nv_exp2f.exit1323, !dbg !321 + +7226: ; preds = %__nv_exp2f.exit1320 + %7227 = tail call float @llvm.nvvm.ex2.approx.f(float %7083) #3, !dbg !321 + br label %__nv_exp2f.exit1323, !dbg !321 + +__nv_exp2f.exit1323: ; preds = %7224, %7226 + %.0.i1322 = phi float [ %7225, %7224 ], [ %7227, %7226 ], !dbg !321 + %7228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1324 = icmp eq i32 %7228, 0, !dbg !321 + br i1 %.not.i1324, label %7231, label %7229, !dbg !321 + +7229: ; preds = %__nv_exp2f.exit1323 + %7230 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7084) #3, !dbg !321 + br label %__nv_exp2f.exit1326, !dbg !321 + +7231: ; preds = %__nv_exp2f.exit1323 + %7232 = tail call float @llvm.nvvm.ex2.approx.f(float %7084) #3, !dbg !321 + br label %__nv_exp2f.exit1326, !dbg !321 + +__nv_exp2f.exit1326: ; preds = %7229, %7231 + %.0.i1325 = phi float [ %7230, %7229 ], [ %7232, %7231 ], !dbg !321 + %7233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1327 = icmp eq i32 %7233, 0, !dbg !321 + br i1 %.not.i1327, label %7236, label %7234, !dbg !321 + +7234: ; preds = %__nv_exp2f.exit1326 + %7235 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7085) #3, !dbg !321 + br label %__nv_exp2f.exit1329, !dbg !321 + +7236: ; preds = %__nv_exp2f.exit1326 + %7237 = tail call float @llvm.nvvm.ex2.approx.f(float %7085) #3, !dbg !321 + br label %__nv_exp2f.exit1329, !dbg !321 + +__nv_exp2f.exit1329: ; preds = %7234, %7236 + %.0.i1328 = phi float [ %7235, %7234 ], [ %7237, %7236 ], !dbg !321 + %7238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1330 = icmp eq i32 %7238, 0, !dbg !321 + br i1 %.not.i1330, label %7241, label %7239, !dbg !321 + +7239: ; preds = %__nv_exp2f.exit1329 + %7240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7086) #3, !dbg !321 + br label %__nv_exp2f.exit1332, !dbg !321 + +7241: ; preds = %__nv_exp2f.exit1329 + %7242 = tail call float @llvm.nvvm.ex2.approx.f(float %7086) #3, !dbg !321 + br label %__nv_exp2f.exit1332, !dbg !321 + +__nv_exp2f.exit1332: ; preds = %7239, %7241 + %.0.i1331 = phi float [ %7240, %7239 ], [ %7242, %7241 ], !dbg !321 + %7243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !321 + %.not.i1333 = icmp eq i32 %7243, 0, !dbg !321 + br i1 %.not.i1333, label %7246, label %7244, !dbg !321 + +7244: ; preds = %__nv_exp2f.exit1332 + %7245 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %7087) #3, !dbg !321 + br label %__nv_exp2f.exit1335, !dbg !321 + +7246: ; preds = %__nv_exp2f.exit1332 + %7247 = tail call float @llvm.nvvm.ex2.approx.f(float %7087) #3, !dbg !321 + br label %__nv_exp2f.exit1335, !dbg !321 + +__nv_exp2f.exit1335: ; preds = %7244, %7246 + %.0.i1334 = phi float [ %7245, %7244 ], [ %7247, %7246 ], !dbg !321 + %7248 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %6180, !dbg !296 + %7249 = insertelement <2 x float> poison, float %.0.i1241, i64 0, !dbg !322 + %7250 = insertelement <2 x float> %7249, float %.0.i1244, i64 1, !dbg !322 + %7251 = fptrunc <2 x float> %7250 to <2 x bfloat>, !dbg !322 + %7252 = insertelement <2 x float> poison, float %.0.i1247, i64 0, !dbg !322 + %7253 = insertelement <2 x float> %7252, float %.0.i1250, i64 1, !dbg !322 + %7254 = fptrunc <2 x float> %7253 to <2 x bfloat>, !dbg !322 + %7255 = insertelement <2 x float> poison, float %.0.i1253, i64 0, !dbg !322 + %7256 = insertelement <2 x float> %7255, float %.0.i1256, i64 1, !dbg !322 + %7257 = fptrunc <2 x float> %7256 to <2 x bfloat>, !dbg !322 + %7258 = insertelement <2 x float> poison, float %.0.i1259, i64 0, !dbg !322 + %7259 = insertelement <2 x float> %7258, float %.0.i1262, i64 1, !dbg !322 + %7260 = fptrunc <2 x float> %7259 to <2 x bfloat>, !dbg !322 + %7261 = insertelement <2 x float> poison, float %.0.i1265, i64 0, !dbg !322 + %7262 = insertelement <2 x float> %7261, float %.0.i1268, i64 1, !dbg !322 + %7263 = fptrunc <2 x float> %7262 to <2 x bfloat>, !dbg !322 + %7264 = insertelement <2 x float> poison, float %.0.i1271, i64 0, !dbg !322 + %7265 = insertelement <2 x float> %7264, float %.0.i1274, i64 1, !dbg !322 + %7266 = fptrunc <2 x float> %7265 to <2 x bfloat>, !dbg !322 + %7267 = insertelement <2 x float> poison, float %.0.i1277, i64 0, !dbg !322 + %7268 = insertelement <2 x float> %7267, float %.0.i1280, i64 1, !dbg !322 + %7269 = fptrunc <2 x float> %7268 to <2 x bfloat>, !dbg !322 + %7270 = insertelement <2 x float> poison, float %.0.i1283, i64 0, !dbg !322 + %7271 = insertelement <2 x float> %7270, float %.0.i1286, i64 1, !dbg !322 + %7272 = fptrunc <2 x float> %7271 to <2 x bfloat>, !dbg !322 + %7273 = insertelement <2 x float> poison, float %.0.i1289, i64 0, !dbg !322 + %7274 = insertelement <2 x float> %7273, float %.0.i1292, i64 1, !dbg !322 + %7275 = fptrunc <2 x float> %7274 to <2 x bfloat>, !dbg !322 + %7276 = insertelement <2 x float> poison, float %.0.i1295, i64 0, !dbg !322 + %7277 = insertelement <2 x float> %7276, float %.0.i1298, i64 1, !dbg !322 + %7278 = fptrunc <2 x float> %7277 to <2 x bfloat>, !dbg !322 + %7279 = insertelement <2 x float> poison, float %.0.i1301, i64 0, !dbg !322 + %7280 = insertelement <2 x float> %7279, float %.0.i1304, i64 1, !dbg !322 + %7281 = fptrunc <2 x float> %7280 to <2 x bfloat>, !dbg !322 + %7282 = insertelement <2 x float> poison, float %.0.i1307, i64 0, !dbg !322 + %7283 = insertelement <2 x float> %7282, float %.0.i1310, i64 1, !dbg !322 + %7284 = fptrunc <2 x float> %7283 to <2 x bfloat>, !dbg !322 + %7285 = insertelement <2 x float> poison, float %.0.i1313, i64 0, !dbg !322 + %7286 = insertelement <2 x float> %7285, float %.0.i1316, i64 1, !dbg !322 + %7287 = fptrunc <2 x float> %7286 to <2 x bfloat>, !dbg !322 + %7288 = insertelement <2 x float> poison, float %.0.i1319, i64 0, !dbg !322 + %7289 = insertelement <2 x float> %7288, float %.0.i1322, i64 1, !dbg !322 + %7290 = fptrunc <2 x float> %7289 to <2 x bfloat>, !dbg !322 + %7291 = insertelement <2 x float> poison, float %.0.i1325, i64 0, !dbg !322 + %7292 = insertelement <2 x float> %7291, float %.0.i1328, i64 1, !dbg !322 + %7293 = fptrunc <2 x float> %7292 to <2 x bfloat>, !dbg !322 + %7294 = insertelement <2 x float> poison, float %.0.i1331, i64 0, !dbg !322 + %7295 = insertelement <2 x float> %7294, float %.0.i1334, i64 1, !dbg !322 + %7296 = fptrunc <2 x float> %7295 to <2 x bfloat>, !dbg !322 + %7297 = bitcast <2 x bfloat> %7251 to i32, !dbg !323 + %7298 = bitcast <2 x bfloat> %7254 to i32, !dbg !323 + %7299 = bitcast <2 x bfloat> %7257 to i32, !dbg !323 + %7300 = bitcast <2 x bfloat> %7260 to i32, !dbg !323 + %7301 = bitcast <2 x bfloat> %7263 to i32, !dbg !323 + %7302 = bitcast <2 x bfloat> %7266 to i32, !dbg !323 + %7303 = bitcast <2 x bfloat> %7269 to i32, !dbg !323 + %7304 = bitcast <2 x bfloat> %7272 to i32, !dbg !323 + %7305 = bitcast <2 x bfloat> %7275 to i32, !dbg !323 + %7306 = bitcast <2 x bfloat> %7278 to i32, !dbg !323 + %7307 = bitcast <2 x bfloat> %7281 to i32, !dbg !323 + %7308 = bitcast <2 x bfloat> %7284 to i32, !dbg !323 + %7309 = bitcast <2 x bfloat> %7287 to i32, !dbg !323 + %7310 = bitcast <2 x bfloat> %7290 to i32, !dbg !323 + %7311 = bitcast <2 x bfloat> %7293 to i32, !dbg !323 + %7312 = bitcast <2 x bfloat> %7296 to i32, !dbg !323 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !323 + %7313 = ptrtoint ptr addrspace(3) %7248 to i32, !dbg !323 + %7314 = lshr exact i32 %7313, 4, !dbg !323 + %7315 = and i32 %7314, 16383, !dbg !323 + %7316 = zext nneg i32 %7315 to i64, !dbg !323 + %7317 = or disjoint i64 %7316, 4611686293338849280, !dbg !323 + %7318 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6023, float %6024, float %6025, float %6026, float %6027, float %6028, float %6029, float %6030, float %6031, float %6032, float %6033, float %6034, float %6035, float %6036, float %6037, float %6038, float %6039, float %6040, float %6041, float %6042, float %6043, float %6044, float %6045, float %6046, float %6047, float %6048, float %6049, float %6050, float %6051, float %6052, float %6053, float %6054, float %6055, float %6056, float %6057, float %6058, float %6059, float %6060, float %6061, float %6062, float %6063, float %6064, float %6065, float %6066, float %6067, float %6068, float %6069, float %6070, float %6071, float %6072, float %6073, float %6074, float %6075, float %6076, float %6077, float %6078, float %6079, float %6080, float %6081, float %6082, float %6083, float %6084, float %6085, float %6086, i32 %7297, i32 %7298, i32 %7299, i32 %7300, i64 %7317, i1 true) #3, !dbg !323 + %7319 = add i32 %7313, 2048, !dbg !323 + %7320 = lshr exact i32 %7319, 4, !dbg !323 + %7321 = and i32 %7320, 16383, !dbg !323 + %7322 = zext nneg i32 %7321 to i64, !dbg !323 + %7323 = or disjoint i64 %7322, 4611686293338849280, !dbg !323 + %7324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 0, !dbg !323 + %7325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 1, !dbg !323 + %7326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 2, !dbg !323 + %7327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 3, !dbg !323 + %7328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 4, !dbg !323 + %7329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 5, !dbg !323 + %7330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 6, !dbg !323 + %7331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 7, !dbg !323 + %7332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 8, !dbg !323 + %7333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 9, !dbg !323 + %7334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 10, !dbg !323 + %7335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 11, !dbg !323 + %7336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 12, !dbg !323 + %7337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 13, !dbg !323 + %7338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 14, !dbg !323 + %7339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 15, !dbg !323 + %7340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 16, !dbg !323 + %7341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 17, !dbg !323 + %7342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 18, !dbg !323 + %7343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 19, !dbg !323 + %7344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 20, !dbg !323 + %7345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 21, !dbg !323 + %7346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 22, !dbg !323 + %7347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 23, !dbg !323 + %7348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 24, !dbg !323 + %7349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 25, !dbg !323 + %7350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 26, !dbg !323 + %7351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 27, !dbg !323 + %7352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 28, !dbg !323 + %7353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 29, !dbg !323 + %7354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 30, !dbg !323 + %7355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 31, !dbg !323 + %7356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 32, !dbg !323 + %7357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 33, !dbg !323 + %7358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 34, !dbg !323 + %7359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 35, !dbg !323 + %7360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 36, !dbg !323 + %7361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 37, !dbg !323 + %7362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 38, !dbg !323 + %7363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 39, !dbg !323 + %7364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 40, !dbg !323 + %7365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 41, !dbg !323 + %7366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 42, !dbg !323 + %7367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 43, !dbg !323 + %7368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 44, !dbg !323 + %7369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 45, !dbg !323 + %7370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 46, !dbg !323 + %7371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 47, !dbg !323 + %7372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 48, !dbg !323 + %7373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 49, !dbg !323 + %7374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 50, !dbg !323 + %7375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 51, !dbg !323 + %7376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 52, !dbg !323 + %7377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 53, !dbg !323 + %7378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 54, !dbg !323 + %7379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 55, !dbg !323 + %7380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 56, !dbg !323 + %7381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 57, !dbg !323 + %7382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 58, !dbg !323 + %7383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 59, !dbg !323 + %7384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 60, !dbg !323 + %7385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 61, !dbg !323 + %7386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 62, !dbg !323 + %7387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7318, 63, !dbg !323 + %7388 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7324, float %7325, float %7326, float %7327, float %7328, float %7329, float %7330, float %7331, float %7332, float %7333, float %7334, float %7335, float %7336, float %7337, float %7338, float %7339, float %7340, float %7341, float %7342, float %7343, float %7344, float %7345, float %7346, float %7347, float %7348, float %7349, float %7350, float %7351, float %7352, float %7353, float %7354, float %7355, float %7356, float %7357, float %7358, float %7359, float %7360, float %7361, float %7362, float %7363, float %7364, float %7365, float %7366, float %7367, float %7368, float %7369, float %7370, float %7371, float %7372, float %7373, float %7374, float %7375, float %7376, float %7377, float %7378, float %7379, float %7380, float %7381, float %7382, float %7383, float %7384, float %7385, float %7386, float %7387, i32 %7301, i32 %7302, i32 %7303, i32 %7304, i64 %7323, i1 true) #3, !dbg !323 + %7389 = add i32 %7313, 4096, !dbg !323 + %7390 = lshr exact i32 %7389, 4, !dbg !323 + %7391 = and i32 %7390, 16383, !dbg !323 + %7392 = zext nneg i32 %7391 to i64, !dbg !323 + %7393 = or disjoint i64 %7392, 4611686293338849280, !dbg !323 + %7394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 0, !dbg !323 + %7395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 1, !dbg !323 + %7396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 2, !dbg !323 + %7397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 3, !dbg !323 + %7398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 4, !dbg !323 + %7399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 5, !dbg !323 + %7400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 6, !dbg !323 + %7401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 7, !dbg !323 + %7402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 8, !dbg !323 + %7403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 9, !dbg !323 + %7404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 10, !dbg !323 + %7405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 11, !dbg !323 + %7406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 12, !dbg !323 + %7407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 13, !dbg !323 + %7408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 14, !dbg !323 + %7409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 15, !dbg !323 + %7410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 16, !dbg !323 + %7411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 17, !dbg !323 + %7412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 18, !dbg !323 + %7413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 19, !dbg !323 + %7414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 20, !dbg !323 + %7415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 21, !dbg !323 + %7416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 22, !dbg !323 + %7417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 23, !dbg !323 + %7418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 24, !dbg !323 + %7419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 25, !dbg !323 + %7420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 26, !dbg !323 + %7421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 27, !dbg !323 + %7422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 28, !dbg !323 + %7423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 29, !dbg !323 + %7424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 30, !dbg !323 + %7425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 31, !dbg !323 + %7426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 32, !dbg !323 + %7427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 33, !dbg !323 + %7428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 34, !dbg !323 + %7429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 35, !dbg !323 + %7430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 36, !dbg !323 + %7431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 37, !dbg !323 + %7432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 38, !dbg !323 + %7433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 39, !dbg !323 + %7434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 40, !dbg !323 + %7435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 41, !dbg !323 + %7436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 42, !dbg !323 + %7437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 43, !dbg !323 + %7438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 44, !dbg !323 + %7439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 45, !dbg !323 + %7440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 46, !dbg !323 + %7441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 47, !dbg !323 + %7442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 48, !dbg !323 + %7443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 49, !dbg !323 + %7444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 50, !dbg !323 + %7445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 51, !dbg !323 + %7446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 52, !dbg !323 + %7447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 53, !dbg !323 + %7448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 54, !dbg !323 + %7449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 55, !dbg !323 + %7450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 56, !dbg !323 + %7451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 57, !dbg !323 + %7452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 58, !dbg !323 + %7453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 59, !dbg !323 + %7454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 60, !dbg !323 + %7455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 61, !dbg !323 + %7456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 62, !dbg !323 + %7457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7388, 63, !dbg !323 + %7458 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7394, float %7395, float %7396, float %7397, float %7398, float %7399, float %7400, float %7401, float %7402, float %7403, float %7404, float %7405, float %7406, float %7407, float %7408, float %7409, float %7410, float %7411, float %7412, float %7413, float %7414, float %7415, float %7416, float %7417, float %7418, float %7419, float %7420, float %7421, float %7422, float %7423, float %7424, float %7425, float %7426, float %7427, float %7428, float %7429, float %7430, float %7431, float %7432, float %7433, float %7434, float %7435, float %7436, float %7437, float %7438, float %7439, float %7440, float %7441, float %7442, float %7443, float %7444, float %7445, float %7446, float %7447, float %7448, float %7449, float %7450, float %7451, float %7452, float %7453, float %7454, float %7455, float %7456, float %7457, i32 %7305, i32 %7306, i32 %7307, i32 %7308, i64 %7393, i1 true) #3, !dbg !323 + %7459 = add i32 %7313, 6144, !dbg !323 + %7460 = lshr exact i32 %7459, 4, !dbg !323 + %7461 = and i32 %7460, 16383, !dbg !323 + %7462 = zext nneg i32 %7461 to i64, !dbg !323 + %7463 = or disjoint i64 %7462, 4611686293338849280, !dbg !323 + %7464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 0, !dbg !323 + %7465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 1, !dbg !323 + %7466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 2, !dbg !323 + %7467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 3, !dbg !323 + %7468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 4, !dbg !323 + %7469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 5, !dbg !323 + %7470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 6, !dbg !323 + %7471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 7, !dbg !323 + %7472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 8, !dbg !323 + %7473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 9, !dbg !323 + %7474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 10, !dbg !323 + %7475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 11, !dbg !323 + %7476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 12, !dbg !323 + %7477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 13, !dbg !323 + %7478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 14, !dbg !323 + %7479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 15, !dbg !323 + %7480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 16, !dbg !323 + %7481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 17, !dbg !323 + %7482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 18, !dbg !323 + %7483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 19, !dbg !323 + %7484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 20, !dbg !323 + %7485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 21, !dbg !323 + %7486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 22, !dbg !323 + %7487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 23, !dbg !323 + %7488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 24, !dbg !323 + %7489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 25, !dbg !323 + %7490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 26, !dbg !323 + %7491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 27, !dbg !323 + %7492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 28, !dbg !323 + %7493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 29, !dbg !323 + %7494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 30, !dbg !323 + %7495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 31, !dbg !323 + %7496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 32, !dbg !323 + %7497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 33, !dbg !323 + %7498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 34, !dbg !323 + %7499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 35, !dbg !323 + %7500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 36, !dbg !323 + %7501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 37, !dbg !323 + %7502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 38, !dbg !323 + %7503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 39, !dbg !323 + %7504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 40, !dbg !323 + %7505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 41, !dbg !323 + %7506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 42, !dbg !323 + %7507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 43, !dbg !323 + %7508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 44, !dbg !323 + %7509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 45, !dbg !323 + %7510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 46, !dbg !323 + %7511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 47, !dbg !323 + %7512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 48, !dbg !323 + %7513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 49, !dbg !323 + %7514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 50, !dbg !323 + %7515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 51, !dbg !323 + %7516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 52, !dbg !323 + %7517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 53, !dbg !323 + %7518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 54, !dbg !323 + %7519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 55, !dbg !323 + %7520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 56, !dbg !323 + %7521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 57, !dbg !323 + %7522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 58, !dbg !323 + %7523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 59, !dbg !323 + %7524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 60, !dbg !323 + %7525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 61, !dbg !323 + %7526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 62, !dbg !323 + %7527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7458, 63, !dbg !323 + %7528 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7464, float %7465, float %7466, float %7467, float %7468, float %7469, float %7470, float %7471, float %7472, float %7473, float %7474, float %7475, float %7476, float %7477, float %7478, float %7479, float %7480, float %7481, float %7482, float %7483, float %7484, float %7485, float %7486, float %7487, float %7488, float %7489, float %7490, float %7491, float %7492, float %7493, float %7494, float %7495, float %7496, float %7497, float %7498, float %7499, float %7500, float %7501, float %7502, float %7503, float %7504, float %7505, float %7506, float %7507, float %7508, float %7509, float %7510, float %7511, float %7512, float %7513, float %7514, float %7515, float %7516, float %7517, float %7518, float %7519, float %7520, float %7521, float %7522, float %7523, float %7524, float %7525, float %7526, float %7527, i32 %7309, i32 %7310, i32 %7311, i32 %7312, i64 %7463, i1 true) #3, !dbg !323 + %7529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 0, !dbg !323 + %7530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 1, !dbg !323 + %7531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 2, !dbg !323 + %7532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 3, !dbg !323 + %7533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 4, !dbg !323 + %7534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 5, !dbg !323 + %7535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 6, !dbg !323 + %7536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 7, !dbg !323 + %7537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 8, !dbg !323 + %7538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 9, !dbg !323 + %7539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 10, !dbg !323 + %7540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 11, !dbg !323 + %7541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 12, !dbg !323 + %7542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 13, !dbg !323 + %7543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 14, !dbg !323 + %7544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 15, !dbg !323 + %7545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 16, !dbg !323 + %7546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 17, !dbg !323 + %7547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 18, !dbg !323 + %7548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 19, !dbg !323 + %7549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 20, !dbg !323 + %7550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 21, !dbg !323 + %7551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 22, !dbg !323 + %7552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 23, !dbg !323 + %7553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 24, !dbg !323 + %7554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 25, !dbg !323 + %7555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 26, !dbg !323 + %7556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 27, !dbg !323 + %7557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 28, !dbg !323 + %7558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 29, !dbg !323 + %7559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 30, !dbg !323 + %7560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 31, !dbg !323 + %7561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 32, !dbg !323 + %7562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 33, !dbg !323 + %7563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 34, !dbg !323 + %7564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 35, !dbg !323 + %7565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 36, !dbg !323 + %7566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 37, !dbg !323 + %7567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 38, !dbg !323 + %7568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 39, !dbg !323 + %7569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 40, !dbg !323 + %7570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 41, !dbg !323 + %7571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 42, !dbg !323 + %7572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 43, !dbg !323 + %7573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 44, !dbg !323 + %7574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 45, !dbg !323 + %7575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 46, !dbg !323 + %7576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 47, !dbg !323 + %7577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 48, !dbg !323 + %7578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 49, !dbg !323 + %7579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 50, !dbg !323 + %7580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 51, !dbg !323 + %7581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 52, !dbg !323 + %7582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 53, !dbg !323 + %7583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 54, !dbg !323 + %7584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 55, !dbg !323 + %7585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 56, !dbg !323 + %7586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 57, !dbg !323 + %7587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 58, !dbg !323 + %7588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 59, !dbg !323 + %7589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 60, !dbg !323 + %7590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 61, !dbg !323 + %7591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 62, !dbg !323 + %7592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7528, 63, !dbg !323 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !323 + %7593 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %6182, !dbg !298 + %7594 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5332, !dbg !298 + %7595 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5336, !dbg !298 + %7596 = load float, ptr addrspace(3) %7595, align 8, !dbg !298 + %7597 = getelementptr inbounds nuw i8, ptr addrspace(3) %7595, i32 4, !dbg !298 + %7598 = load float, ptr addrspace(3) %7597, align 4, !dbg !298 + %7599 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5342, !dbg !298 + %7600 = load float, ptr addrspace(3) %7599, align 8, !dbg !298 + %7601 = getelementptr inbounds nuw i8, ptr addrspace(3) %7599, i32 4, !dbg !298 + %7602 = load float, ptr addrspace(3) %7601, align 4, !dbg !298 + %7603 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5348, !dbg !298 + %7604 = load float, ptr addrspace(3) %7603, align 8, !dbg !298 + %7605 = getelementptr inbounds nuw i8, ptr addrspace(3) %7603, i32 4, !dbg !298 + %7606 = load float, ptr addrspace(3) %7605, align 4, !dbg !298 + %7607 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5354, !dbg !298 + %7608 = load float, ptr addrspace(3) %7607, align 8, !dbg !298 + %7609 = getelementptr inbounds nuw i8, ptr addrspace(3) %7607, i32 4, !dbg !298 + %7610 = load float, ptr addrspace(3) %7609, align 4, !dbg !298 + %7611 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5360, !dbg !298 + %7612 = load float, ptr addrspace(3) %7611, align 8, !dbg !298 + %7613 = getelementptr inbounds nuw i8, ptr addrspace(3) %7611, i32 4, !dbg !298 + %7614 = load float, ptr addrspace(3) %7613, align 4, !dbg !298 + %7615 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5366, !dbg !298 + %7616 = load float, ptr addrspace(3) %7615, align 8, !dbg !298 + %7617 = getelementptr inbounds nuw i8, ptr addrspace(3) %7615, i32 4, !dbg !298 + %7618 = load float, ptr addrspace(3) %7617, align 4, !dbg !298 + %7619 = getelementptr inbounds nuw i8, ptr addrspace(3) %7593, i32 %5372, !dbg !298 + %7620 = load float, ptr addrspace(3) %7619, align 8, !dbg !298 + %7621 = getelementptr inbounds nuw i8, ptr addrspace(3) %7619, i32 4, !dbg !298 + %7622 = load float, ptr addrspace(3) %7621, align 4, !dbg !298 + %7623 = add i32 %6250, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7624 = lshr exact i32 %7623, 4, !dbg !324 + %7625 = and i32 %7624, 16383, !dbg !324 + %7626 = zext nneg i32 %7625 to i64, !dbg !324 + %7627 = or disjoint i64 %7626, 4611686293372403712, !dbg !324 + %7628 = add i32 %6262, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7629 = lshr exact i32 %7628, 4, !dbg !324 + %7630 = and i32 %7629, 16383, !dbg !324 + %7631 = zext nneg i32 %7630 to i64, !dbg !324 + %7632 = or disjoint i64 %7631, 4611686293372403712, !dbg !324 + %7633 = add i32 %7313, 32, !dbg !324 + %7634 = lshr exact i32 %7633, 4, !dbg !324 + %7635 = and i32 %7634, 16383, !dbg !324 + %7636 = zext nneg i32 %7635 to i64, !dbg !324 + %7637 = or disjoint i64 %7636, 4611686293338849280, !dbg !324 + %7638 = add i32 %6306, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7639 = lshr exact i32 %7638, 4, !dbg !324 + %7640 = and i32 %7639, 16383, !dbg !324 + %7641 = zext nneg i32 %7640 to i64, !dbg !324 + %7642 = or disjoint i64 %7641, 4611686293372403712, !dbg !324 + %7643 = add i32 %7313, 64, !dbg !324 + %7644 = lshr exact i32 %7643, 4, !dbg !324 + %7645 = and i32 %7644, 16383, !dbg !324 + %7646 = zext nneg i32 %7645 to i64, !dbg !324 + %7647 = or disjoint i64 %7646, 4611686293338849280, !dbg !324 + %7648 = add i32 %6350, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7649 = lshr exact i32 %7648, 4, !dbg !324 + %7650 = and i32 %7649, 16383, !dbg !324 + %7651 = zext nneg i32 %7650 to i64, !dbg !324 + %7652 = or disjoint i64 %7651, 4611686293372403712, !dbg !324 + %7653 = add i32 %7313, 96, !dbg !324 + %7654 = lshr exact i32 %7653, 4, !dbg !324 + %7655 = and i32 %7654, 16383, !dbg !324 + %7656 = zext nneg i32 %7655 to i64, !dbg !324 + %7657 = or disjoint i64 %7656, 4611686293338849280, !dbg !324 + %7658 = add i32 %6394, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7659 = lshr exact i32 %7658, 4, !dbg !324 + %7660 = and i32 %7659, 16383, !dbg !324 + %7661 = zext nneg i32 %7660 to i64, !dbg !324 + %7662 = or disjoint i64 %7661, 4611686293372403712, !dbg !324 + %7663 = add i32 %7313, 8192, !dbg !324 + %7664 = lshr exact i32 %7663, 4, !dbg !324 + %7665 = and i32 %7664, 16383, !dbg !324 + %7666 = zext nneg i32 %7665 to i64, !dbg !324 + %7667 = or disjoint i64 %7666, 4611686293338849280, !dbg !324 + %7668 = add i32 %6438, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7669 = lshr exact i32 %7668, 4, !dbg !324 + %7670 = and i32 %7669, 16383, !dbg !324 + %7671 = zext nneg i32 %7670 to i64, !dbg !324 + %7672 = or disjoint i64 %7671, 4611686293372403712, !dbg !324 + %7673 = add i32 %7313, 8224, !dbg !324 + %7674 = lshr exact i32 %7673, 4, !dbg !324 + %7675 = and i32 %7674, 16383, !dbg !324 + %7676 = zext nneg i32 %7675 to i64, !dbg !324 + %7677 = or disjoint i64 %7676, 4611686293338849280, !dbg !324 + %7678 = add i32 %6482, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7679 = lshr exact i32 %7678, 4, !dbg !324 + %7680 = and i32 %7679, 16383, !dbg !324 + %7681 = zext nneg i32 %7680 to i64, !dbg !324 + %7682 = or disjoint i64 %7681, 4611686293372403712, !dbg !324 + %7683 = add i32 %7313, 8256, !dbg !324 + %7684 = lshr exact i32 %7683, 4, !dbg !324 + %7685 = and i32 %7684, 16383, !dbg !324 + %7686 = zext nneg i32 %7685 to i64, !dbg !324 + %7687 = or disjoint i64 %7686, 4611686293338849280, !dbg !324 + %7688 = add i32 %6526, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !324 + %7689 = lshr exact i32 %7688, 4, !dbg !324 + %7690 = and i32 %7689, 16383, !dbg !324 + %7691 = zext nneg i32 %7690 to i64, !dbg !324 + %7692 = or disjoint i64 %7691, 4611686293372403712, !dbg !324 + %7693 = add i32 %7313, 8288, !dbg !324 + %7694 = lshr exact i32 %7693, 4, !dbg !324 + %7695 = and i32 %7694, 16383, !dbg !324 + %7696 = zext nneg i32 %7695 to i64, !dbg !324 + %7697 = or disjoint i64 %7696, 4611686293338849280, !dbg !324 + %7698 = add i32 %6256, 2048, !dbg !325 + %7699 = lshr exact i32 %7698, 4, !dbg !325 + %7700 = and i32 %7699, 16383, !dbg !325 + %7701 = zext nneg i32 %7700 to i64, !dbg !325 + %7702 = or disjoint i64 %7701, 4611686293338849280, !dbg !325 + %7703 = add i32 %6256, 4096, !dbg !325 + %7704 = lshr exact i32 %7703, 4, !dbg !325 + %7705 = and i32 %7704, 16383, !dbg !325 + %7706 = zext nneg i32 %7705 to i64, !dbg !325 + %7707 = or disjoint i64 %7706, 4611686293338849280, !dbg !325 + %7708 = add i32 %6256, 6144, !dbg !325 + %7709 = lshr exact i32 %7708, 4, !dbg !325 + %7710 = and i32 %7709, 16383, !dbg !325 + %7711 = zext nneg i32 %7710 to i64, !dbg !325 + %7712 = or disjoint i64 %7711, 4611686293338849280, !dbg !325 + %7713 = insertelement <2 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7714 = shufflevector <2 x i32> %7713, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !326 + %7715 = add <2 x i32> %6152, %7714, !dbg !326 + %7716 = add i32 %5994, %5996, !dbg !326 + %7717 = add i32 %5995, %5996, !dbg !326 + %7718 = insertelement <4 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7719 = shufflevector <4 x i32> %7718, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !326 + %7720 = add <4 x i32> %6156, %7719, !dbg !326 + %7721 = insertelement <8 x i32> poison, i32 %5996, i64 0, !dbg !326 + %7722 = shufflevector <8 x i32> %7721, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !326 + %7723 = add <8 x i32> %6155, %7722, !dbg !326 + %7724 = add nuw nsw i32 %6151, 1, !dbg !255 + %7725 = lshr i32 %7724, 1, !dbg !327 + %7726 = zext nneg i32 %7725 to i64, !dbg !328 + %7727 = getelementptr i32, ptr addrspace(1) %5128, i64 %7726, !dbg !328 + %7728 = add nuw nsw i32 %7725, 1, !dbg !329 + %7729 = icmp slt i32 %7728, %5133, !dbg !330 + %7730 = getelementptr i8, ptr addrspace(1) %7727, i64 4, !dbg !331 + %7731 = and i1 %6158, %7729, !dbg !255 + %7732 = and i32 %6151, 1, !dbg !332 + %7733 = xor i32 %7732, 1, !dbg !333 + %7734 = shl nuw nsw i32 %7732, 6, !dbg !334 + %7735 = load <2 x float>, ptr addrspace(3) %7594, align 8, !dbg !298 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !324 + %7736 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %7627, i64 %7317) #3, !dbg !324 + %7737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 0, !dbg !324 + %7738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 1, !dbg !324 + %7739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 2, !dbg !324 + %7740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 3, !dbg !324 + %7741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 4, !dbg !324 + %7742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 5, !dbg !324 + %7743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 6, !dbg !324 + %7744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 7, !dbg !324 + %7745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 8, !dbg !324 + %7746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 9, !dbg !324 + %7747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 10, !dbg !324 + %7748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 11, !dbg !324 + %7749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 12, !dbg !324 + %7750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 13, !dbg !324 + %7751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 14, !dbg !324 + %7752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 15, !dbg !324 + %7753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 16, !dbg !324 + %7754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 17, !dbg !324 + %7755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 18, !dbg !324 + %7756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 19, !dbg !324 + %7757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 20, !dbg !324 + %7758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 21, !dbg !324 + %7759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 22, !dbg !324 + %7760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 23, !dbg !324 + %7761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 24, !dbg !324 + %7762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 25, !dbg !324 + %7763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 26, !dbg !324 + %7764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 27, !dbg !324 + %7765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 28, !dbg !324 + %7766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 29, !dbg !324 + %7767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 30, !dbg !324 + %7768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7736, 31, !dbg !324 + %7769 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7737, float %7738, float %7739, float %7740, float %7741, float %7742, float %7743, float %7744, float %7745, float %7746, float %7747, float %7748, float %7749, float %7750, float %7751, float %7752, float %7753, float %7754, float %7755, float %7756, float %7757, float %7758, float %7759, float %7760, float %7761, float %7762, float %7763, float %7764, float %7765, float %7766, float %7767, float %7768, i64 %7632, i64 %7637, i1 true) #3, !dbg !324 + %7770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 0, !dbg !324 + %7771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 1, !dbg !324 + %7772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 2, !dbg !324 + %7773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 3, !dbg !324 + %7774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 4, !dbg !324 + %7775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 5, !dbg !324 + %7776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 6, !dbg !324 + %7777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 7, !dbg !324 + %7778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 8, !dbg !324 + %7779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 9, !dbg !324 + %7780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 10, !dbg !324 + %7781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 11, !dbg !324 + %7782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 12, !dbg !324 + %7783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 13, !dbg !324 + %7784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 14, !dbg !324 + %7785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 15, !dbg !324 + %7786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 16, !dbg !324 + %7787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 17, !dbg !324 + %7788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 18, !dbg !324 + %7789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 19, !dbg !324 + %7790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 20, !dbg !324 + %7791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 21, !dbg !324 + %7792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 22, !dbg !324 + %7793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 23, !dbg !324 + %7794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 24, !dbg !324 + %7795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 25, !dbg !324 + %7796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 26, !dbg !324 + %7797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 27, !dbg !324 + %7798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 28, !dbg !324 + %7799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 29, !dbg !324 + %7800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 30, !dbg !324 + %7801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7769, 31, !dbg !324 + %7802 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7770, float %7771, float %7772, float %7773, float %7774, float %7775, float %7776, float %7777, float %7778, float %7779, float %7780, float %7781, float %7782, float %7783, float %7784, float %7785, float %7786, float %7787, float %7788, float %7789, float %7790, float %7791, float %7792, float %7793, float %7794, float %7795, float %7796, float %7797, float %7798, float %7799, float %7800, float %7801, i64 %7642, i64 %7647, i1 true) #3, !dbg !324 + %7803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 0, !dbg !324 + %7804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 1, !dbg !324 + %7805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 2, !dbg !324 + %7806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 3, !dbg !324 + %7807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 4, !dbg !324 + %7808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 5, !dbg !324 + %7809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 6, !dbg !324 + %7810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 7, !dbg !324 + %7811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 8, !dbg !324 + %7812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 9, !dbg !324 + %7813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 10, !dbg !324 + %7814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 11, !dbg !324 + %7815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 12, !dbg !324 + %7816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 13, !dbg !324 + %7817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 14, !dbg !324 + %7818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 15, !dbg !324 + %7819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 16, !dbg !324 + %7820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 17, !dbg !324 + %7821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 18, !dbg !324 + %7822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 19, !dbg !324 + %7823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 20, !dbg !324 + %7824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 21, !dbg !324 + %7825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 22, !dbg !324 + %7826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 23, !dbg !324 + %7827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 24, !dbg !324 + %7828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 25, !dbg !324 + %7829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 26, !dbg !324 + %7830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 27, !dbg !324 + %7831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 28, !dbg !324 + %7832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 29, !dbg !324 + %7833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 30, !dbg !324 + %7834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7802, 31, !dbg !324 + %7835 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7803, float %7804, float %7805, float %7806, float %7807, float %7808, float %7809, float %7810, float %7811, float %7812, float %7813, float %7814, float %7815, float %7816, float %7817, float %7818, float %7819, float %7820, float %7821, float %7822, float %7823, float %7824, float %7825, float %7826, float %7827, float %7828, float %7829, float %7830, float %7831, float %7832, float %7833, float %7834, i64 %7652, i64 %7657, i1 true) #3, !dbg !324 + %7836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 0, !dbg !324 + %7837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 1, !dbg !324 + %7838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 2, !dbg !324 + %7839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 3, !dbg !324 + %7840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 4, !dbg !324 + %7841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 5, !dbg !324 + %7842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 6, !dbg !324 + %7843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 7, !dbg !324 + %7844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 8, !dbg !324 + %7845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 9, !dbg !324 + %7846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 10, !dbg !324 + %7847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 11, !dbg !324 + %7848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 12, !dbg !324 + %7849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 13, !dbg !324 + %7850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 14, !dbg !324 + %7851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 15, !dbg !324 + %7852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 16, !dbg !324 + %7853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 17, !dbg !324 + %7854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 18, !dbg !324 + %7855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 19, !dbg !324 + %7856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 20, !dbg !324 + %7857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 21, !dbg !324 + %7858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 22, !dbg !324 + %7859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 23, !dbg !324 + %7860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 24, !dbg !324 + %7861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 25, !dbg !324 + %7862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 26, !dbg !324 + %7863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 27, !dbg !324 + %7864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 28, !dbg !324 + %7865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 29, !dbg !324 + %7866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 30, !dbg !324 + %7867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7835, 31, !dbg !324 + %7868 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7836, float %7837, float %7838, float %7839, float %7840, float %7841, float %7842, float %7843, float %7844, float %7845, float %7846, float %7847, float %7848, float %7849, float %7850, float %7851, float %7852, float %7853, float %7854, float %7855, float %7856, float %7857, float %7858, float %7859, float %7860, float %7861, float %7862, float %7863, float %7864, float %7865, float %7866, float %7867, i64 %7662, i64 %7667, i1 true) #3, !dbg !324 + %7869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 0, !dbg !324 + %7870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 1, !dbg !324 + %7871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 2, !dbg !324 + %7872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 3, !dbg !324 + %7873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 4, !dbg !324 + %7874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 5, !dbg !324 + %7875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 6, !dbg !324 + %7876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 7, !dbg !324 + %7877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 8, !dbg !324 + %7878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 9, !dbg !324 + %7879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 10, !dbg !324 + %7880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 11, !dbg !324 + %7881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 12, !dbg !324 + %7882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 13, !dbg !324 + %7883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 14, !dbg !324 + %7884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 15, !dbg !324 + %7885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 16, !dbg !324 + %7886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 17, !dbg !324 + %7887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 18, !dbg !324 + %7888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 19, !dbg !324 + %7889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 20, !dbg !324 + %7890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 21, !dbg !324 + %7891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 22, !dbg !324 + %7892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 23, !dbg !324 + %7893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 24, !dbg !324 + %7894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 25, !dbg !324 + %7895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 26, !dbg !324 + %7896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 27, !dbg !324 + %7897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 28, !dbg !324 + %7898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 29, !dbg !324 + %7899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 30, !dbg !324 + %7900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7868, 31, !dbg !324 + %7901 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7869, float %7870, float %7871, float %7872, float %7873, float %7874, float %7875, float %7876, float %7877, float %7878, float %7879, float %7880, float %7881, float %7882, float %7883, float %7884, float %7885, float %7886, float %7887, float %7888, float %7889, float %7890, float %7891, float %7892, float %7893, float %7894, float %7895, float %7896, float %7897, float %7898, float %7899, float %7900, i64 %7672, i64 %7677, i1 true) #3, !dbg !324 + %7902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 0, !dbg !324 + %7903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 1, !dbg !324 + %7904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 2, !dbg !324 + %7905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 3, !dbg !324 + %7906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 4, !dbg !324 + %7907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 5, !dbg !324 + %7908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 6, !dbg !324 + %7909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 7, !dbg !324 + %7910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 8, !dbg !324 + %7911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 9, !dbg !324 + %7912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 10, !dbg !324 + %7913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 11, !dbg !324 + %7914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 12, !dbg !324 + %7915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 13, !dbg !324 + %7916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 14, !dbg !324 + %7917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 15, !dbg !324 + %7918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 16, !dbg !324 + %7919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 17, !dbg !324 + %7920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 18, !dbg !324 + %7921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 19, !dbg !324 + %7922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 20, !dbg !324 + %7923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 21, !dbg !324 + %7924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 22, !dbg !324 + %7925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 23, !dbg !324 + %7926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 24, !dbg !324 + %7927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 25, !dbg !324 + %7928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 26, !dbg !324 + %7929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 27, !dbg !324 + %7930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 28, !dbg !324 + %7931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 29, !dbg !324 + %7932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 30, !dbg !324 + %7933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7901, 31, !dbg !324 + %7934 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7902, float %7903, float %7904, float %7905, float %7906, float %7907, float %7908, float %7909, float %7910, float %7911, float %7912, float %7913, float %7914, float %7915, float %7916, float %7917, float %7918, float %7919, float %7920, float %7921, float %7922, float %7923, float %7924, float %7925, float %7926, float %7927, float %7928, float %7929, float %7930, float %7931, float %7932, float %7933, i64 %7682, i64 %7687, i1 true) #3, !dbg !324 + %7935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 0, !dbg !324 + %7936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 1, !dbg !324 + %7937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 2, !dbg !324 + %7938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 3, !dbg !324 + %7939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 4, !dbg !324 + %7940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 5, !dbg !324 + %7941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 6, !dbg !324 + %7942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 7, !dbg !324 + %7943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 8, !dbg !324 + %7944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 9, !dbg !324 + %7945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 10, !dbg !324 + %7946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 11, !dbg !324 + %7947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 12, !dbg !324 + %7948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 13, !dbg !324 + %7949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 14, !dbg !324 + %7950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 15, !dbg !324 + %7951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 16, !dbg !324 + %7952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 17, !dbg !324 + %7953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 18, !dbg !324 + %7954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 19, !dbg !324 + %7955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 20, !dbg !324 + %7956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 21, !dbg !324 + %7957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 22, !dbg !324 + %7958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 23, !dbg !324 + %7959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 24, !dbg !324 + %7960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 25, !dbg !324 + %7961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 26, !dbg !324 + %7962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 27, !dbg !324 + %7963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 28, !dbg !324 + %7964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 29, !dbg !324 + %7965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 30, !dbg !324 + %7966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7934, 31, !dbg !324 + %7967 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7935, float %7936, float %7937, float %7938, float %7939, float %7940, float %7941, float %7942, float %7943, float %7944, float %7945, float %7946, float %7947, float %7948, float %7949, float %7950, float %7951, float %7952, float %7953, float %7954, float %7955, float %7956, float %7957, float %7958, float %7959, float %7960, float %7961, float %7962, float %7963, float %7964, float %7965, float %7966, i64 %7692, i64 %7697, i1 true) #3, !dbg !324 + %7968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 0, !dbg !324 + %7969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 1, !dbg !324 + %7970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 2, !dbg !324 + %7971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 3, !dbg !324 + %7972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 4, !dbg !324 + %7973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 5, !dbg !324 + %7974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 6, !dbg !324 + %7975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 7, !dbg !324 + %7976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 8, !dbg !324 + %7977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 9, !dbg !324 + %7978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 10, !dbg !324 + %7979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 11, !dbg !324 + %7980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 12, !dbg !324 + %7981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 13, !dbg !324 + %7982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 14, !dbg !324 + %7983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 15, !dbg !324 + %7984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 16, !dbg !324 + %7985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 17, !dbg !324 + %7986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 18, !dbg !324 + %7987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 19, !dbg !324 + %7988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 20, !dbg !324 + %7989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 21, !dbg !324 + %7990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 22, !dbg !324 + %7991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 23, !dbg !324 + %7992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 24, !dbg !324 + %7993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 25, !dbg !324 + %7994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 26, !dbg !324 + %7995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 27, !dbg !324 + %7996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 28, !dbg !324 + %7997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 29, !dbg !324 + %7998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 30, !dbg !324 + %7999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7967, 31, !dbg !324 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !324 + %8000 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %7968, float %7969, float %7970, float %7971, float %7972, float %7973, float %7974, float %7975, float %7976, float %7977, float %7978, float %7979, float %7980, float %7981, float %7982, float %7983, float %7984, float %7985, float %7986, float %7987, float %7988, float %7989, float %7990, float %7991, float %7992, float %7993, float %7994, float %7995, float %7996, float %7997, float %7998, float %7999, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %7248, i32 0, i32 0) #3, !dbg !324 + %8001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 0, !dbg !324 + %8002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 1, !dbg !324 + %8003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 2, !dbg !324 + %8004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 3, !dbg !324 + %8005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 4, !dbg !324 + %8006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 5, !dbg !324 + %8007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 6, !dbg !324 + %8008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 7, !dbg !324 + %8009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 8, !dbg !324 + %8010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 9, !dbg !324 + %8011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 10, !dbg !324 + %8012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 11, !dbg !324 + %8013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 12, !dbg !324 + %8014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 13, !dbg !324 + %8015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 14, !dbg !324 + %8016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 15, !dbg !324 + %8017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 16, !dbg !324 + %8018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 17, !dbg !324 + %8019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 18, !dbg !324 + %8020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 19, !dbg !324 + %8021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 20, !dbg !324 + %8022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 21, !dbg !324 + %8023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 22, !dbg !324 + %8024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 23, !dbg !324 + %8025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 24, !dbg !324 + %8026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 25, !dbg !324 + %8027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 26, !dbg !324 + %8028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 27, !dbg !324 + %8029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 28, !dbg !324 + %8030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 29, !dbg !324 + %8031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 30, !dbg !324 + %8032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8000, 31, !dbg !324 + %8033 = insertelement <2 x float> poison, float %8003, i64 0, !dbg !335 + %8034 = insertelement <2 x float> %8033, float %8004, i64 1, !dbg !335 + %8035 = fsub <2 x float> %8034, %7735, !dbg !335 + %8036 = fsub float %8005, %7596, !dbg !335 + %8037 = fsub float %8006, %7598, !dbg !335 + %8038 = fsub float %8007, %7596, !dbg !335 + %8039 = fsub float %8008, %7598, !dbg !335 + %8040 = fsub float %8009, %7600, !dbg !335 + %8041 = fsub float %8010, %7602, !dbg !335 + %8042 = fsub float %8011, %7600, !dbg !335 + %8043 = fsub float %8012, %7602, !dbg !335 + %8044 = fsub float %8013, %7604, !dbg !335 + %8045 = fsub float %8014, %7606, !dbg !335 + %8046 = fsub float %8015, %7604, !dbg !335 + %8047 = fsub float %8016, %7606, !dbg !335 + %8048 = fsub float %8017, %7608, !dbg !335 + %8049 = fsub float %8018, %7610, !dbg !335 + %8050 = fsub float %8019, %7608, !dbg !335 + %8051 = fsub float %8020, %7610, !dbg !335 + %8052 = fsub float %8021, %7612, !dbg !335 + %8053 = fsub float %8022, %7614, !dbg !335 + %8054 = fsub float %8023, %7612, !dbg !335 + %8055 = fsub float %8024, %7614, !dbg !335 + %8056 = fsub float %8025, %7616, !dbg !335 + %8057 = fsub float %8026, %7618, !dbg !335 + %8058 = fsub float %8027, %7616, !dbg !335 + %8059 = fsub float %8028, %7618, !dbg !335 + %8060 = fsub float %8029, %7620, !dbg !335 + %8061 = fsub float %8030, %7622, !dbg !335 + %8062 = fsub float %8031, %7620, !dbg !335 + %8063 = fsub float %8032, %7622, !dbg !335 + %8064 = fmul <2 x float> %7253, %8035, !dbg !336 + %8065 = fmul float %.0.i1253, %8036, !dbg !336 + %8066 = fmul float %.0.i1256, %8037, !dbg !336 + %8067 = fmul float %.0.i1259, %8038, !dbg !336 + %8068 = fmul float %.0.i1262, %8039, !dbg !336 + %8069 = fmul float %.0.i1265, %8040, !dbg !336 + %8070 = fmul float %.0.i1268, %8041, !dbg !336 + %8071 = fmul float %.0.i1271, %8042, !dbg !336 + %8072 = fmul float %.0.i1274, %8043, !dbg !336 + %8073 = fmul float %.0.i1277, %8044, !dbg !336 + %8074 = fmul float %.0.i1280, %8045, !dbg !336 + %8075 = fmul float %.0.i1283, %8046, !dbg !336 + %8076 = fmul float %.0.i1286, %8047, !dbg !336 + %8077 = fmul float %.0.i1289, %8048, !dbg !336 + %8078 = fmul float %.0.i1292, %8049, !dbg !336 + %8079 = fmul float %.0.i1295, %8050, !dbg !336 + %8080 = fmul float %.0.i1298, %8051, !dbg !336 + %8081 = fmul float %.0.i1301, %8052, !dbg !336 + %8082 = fmul float %.0.i1304, %8053, !dbg !336 + %8083 = fmul float %.0.i1307, %8054, !dbg !336 + %8084 = fmul float %.0.i1310, %8055, !dbg !336 + %8085 = fmul float %.0.i1313, %8056, !dbg !336 + %8086 = fmul float %.0.i1316, %8057, !dbg !336 + %8087 = fmul float %.0.i1319, %8058, !dbg !336 + %8088 = fmul float %.0.i1322, %8059, !dbg !336 + %8089 = fmul float %.0.i1325, %8060, !dbg !336 + %8090 = fmul float %.0.i1328, %8061, !dbg !336 + %8091 = fmul float %.0.i1331, %8062, !dbg !336 + %8092 = fmul float %.0.i1334, %8063, !dbg !336 + %8093 = insertelement <2 x float> poison, float %8001, i64 0, !dbg !335 + %8094 = insertelement <2 x float> %8093, float %8002, i64 1, !dbg !335 + %8095 = fsub <2 x float> %8094, %7735, !dbg !335 + %8096 = fmul <2 x float> %7250, %8095, !dbg !336 + %8097 = fptrunc <2 x float> %8096 to <2 x bfloat>, !dbg !337 + %8098 = select <2 x i1> %6958, <2 x bfloat> %8097, <2 x bfloat> zeroinitializer, !dbg !338 + %8099 = fptrunc <2 x float> %8064 to <2 x bfloat>, !dbg !337 + %8100 = select <2 x i1> %6959, <2 x bfloat> %8099, <2 x bfloat> zeroinitializer, !dbg !338 + %8101 = fptrunc float %8065 to bfloat, !dbg !337 + %8102 = select i1 %6960, bfloat %8101, bfloat 0xR0000, !dbg !338 + %8103 = fptrunc float %8066 to bfloat, !dbg !337 + %8104 = select i1 %6961, bfloat %8103, bfloat 0xR0000, !dbg !338 + %8105 = fptrunc float %8067 to bfloat, !dbg !337 + %8106 = select i1 %6962, bfloat %8105, bfloat 0xR0000, !dbg !338 + %8107 = fptrunc float %8068 to bfloat, !dbg !337 + %8108 = select i1 %6963, bfloat %8107, bfloat 0xR0000, !dbg !338 + %8109 = fptrunc float %8069 to bfloat, !dbg !337 + %8110 = select i1 %6964, bfloat %8109, bfloat 0xR0000, !dbg !338 + %8111 = fptrunc float %8070 to bfloat, !dbg !337 + %8112 = select i1 %6965, bfloat %8111, bfloat 0xR0000, !dbg !338 + %8113 = fptrunc float %8071 to bfloat, !dbg !337 + %8114 = select i1 %6966, bfloat %8113, bfloat 0xR0000, !dbg !338 + %8115 = fptrunc float %8072 to bfloat, !dbg !337 + %8116 = select i1 %6967, bfloat %8115, bfloat 0xR0000, !dbg !338 + %8117 = fptrunc float %8073 to bfloat, !dbg !337 + %8118 = select i1 %6968, bfloat %8117, bfloat 0xR0000, !dbg !338 + %8119 = fptrunc float %8074 to bfloat, !dbg !337 + %8120 = select i1 %6969, bfloat %8119, bfloat 0xR0000, !dbg !338 + %8121 = fptrunc float %8075 to bfloat, !dbg !337 + %8122 = select i1 %6970, bfloat %8121, bfloat 0xR0000, !dbg !338 + %8123 = fptrunc float %8076 to bfloat, !dbg !337 + %8124 = select i1 %6971, bfloat %8123, bfloat 0xR0000, !dbg !338 + %8125 = fptrunc float %8077 to bfloat, !dbg !337 + %8126 = select i1 %6972, bfloat %8125, bfloat 0xR0000, !dbg !338 + %8127 = fptrunc float %8078 to bfloat, !dbg !337 + %8128 = select i1 %6973, bfloat %8127, bfloat 0xR0000, !dbg !338 + %8129 = fptrunc float %8079 to bfloat, !dbg !337 + %8130 = select i1 %6974, bfloat %8129, bfloat 0xR0000, !dbg !338 + %8131 = fptrunc float %8080 to bfloat, !dbg !337 + %8132 = select i1 %6975, bfloat %8131, bfloat 0xR0000, !dbg !338 + %8133 = fptrunc float %8081 to bfloat, !dbg !337 + %8134 = select i1 %6976, bfloat %8133, bfloat 0xR0000, !dbg !338 + %8135 = fptrunc float %8082 to bfloat, !dbg !337 + %8136 = select i1 %6977, bfloat %8135, bfloat 0xR0000, !dbg !338 + %8137 = fptrunc float %8083 to bfloat, !dbg !337 + %8138 = select i1 %6978, bfloat %8137, bfloat 0xR0000, !dbg !338 + %8139 = fptrunc float %8084 to bfloat, !dbg !337 + %8140 = select i1 %6979, bfloat %8139, bfloat 0xR0000, !dbg !338 + %8141 = fptrunc float %8085 to bfloat, !dbg !337 + %8142 = select i1 %6980, bfloat %8141, bfloat 0xR0000, !dbg !338 + %8143 = fptrunc float %8086 to bfloat, !dbg !337 + %8144 = select i1 %6981, bfloat %8143, bfloat 0xR0000, !dbg !338 + %8145 = fptrunc float %8087 to bfloat, !dbg !337 + %8146 = select i1 %6982, bfloat %8145, bfloat 0xR0000, !dbg !338 + %8147 = fptrunc float %8088 to bfloat, !dbg !337 + %8148 = select i1 %6983, bfloat %8147, bfloat 0xR0000, !dbg !338 + %8149 = fptrunc float %8089 to bfloat, !dbg !337 + %8150 = select i1 %6984, bfloat %8149, bfloat 0xR0000, !dbg !338 + %8151 = fptrunc float %8090 to bfloat, !dbg !337 + %8152 = select i1 %6985, bfloat %8151, bfloat 0xR0000, !dbg !338 + %8153 = fptrunc float %8091 to bfloat, !dbg !337 + %8154 = select i1 %6986, bfloat %8153, bfloat 0xR0000, !dbg !338 + %8155 = fptrunc float %8092 to bfloat, !dbg !337 + %8156 = select i1 %6987, bfloat %8155, bfloat 0xR0000, !dbg !338 + %8157 = bitcast <2 x bfloat> %8098 to i32, !dbg !325 + %8158 = bitcast <2 x bfloat> %8100 to i32, !dbg !325 + %8159 = insertelement <2 x bfloat> poison, bfloat %8102, i64 0, !dbg !325 + %8160 = insertelement <2 x bfloat> %8159, bfloat %8104, i64 1, !dbg !325 + %8161 = bitcast <2 x bfloat> %8160 to i32, !dbg !325 + %8162 = insertelement <2 x bfloat> poison, bfloat %8106, i64 0, !dbg !325 + %8163 = insertelement <2 x bfloat> %8162, bfloat %8108, i64 1, !dbg !325 + %8164 = bitcast <2 x bfloat> %8163 to i32, !dbg !325 + %8165 = insertelement <2 x bfloat> poison, bfloat %8110, i64 0, !dbg !325 + %8166 = insertelement <2 x bfloat> %8165, bfloat %8112, i64 1, !dbg !325 + %8167 = bitcast <2 x bfloat> %8166 to i32, !dbg !325 + %8168 = insertelement <2 x bfloat> poison, bfloat %8114, i64 0, !dbg !325 + %8169 = insertelement <2 x bfloat> %8168, bfloat %8116, i64 1, !dbg !325 + %8170 = bitcast <2 x bfloat> %8169 to i32, !dbg !325 + %8171 = insertelement <2 x bfloat> poison, bfloat %8118, i64 0, !dbg !325 + %8172 = insertelement <2 x bfloat> %8171, bfloat %8120, i64 1, !dbg !325 + %8173 = bitcast <2 x bfloat> %8172 to i32, !dbg !325 + %8174 = insertelement <2 x bfloat> poison, bfloat %8122, i64 0, !dbg !325 + %8175 = insertelement <2 x bfloat> %8174, bfloat %8124, i64 1, !dbg !325 + %8176 = bitcast <2 x bfloat> %8175 to i32, !dbg !325 + %8177 = insertelement <2 x bfloat> poison, bfloat %8126, i64 0, !dbg !325 + %8178 = insertelement <2 x bfloat> %8177, bfloat %8128, i64 1, !dbg !325 + %8179 = bitcast <2 x bfloat> %8178 to i32, !dbg !325 + %8180 = insertelement <2 x bfloat> poison, bfloat %8130, i64 0, !dbg !325 + %8181 = insertelement <2 x bfloat> %8180, bfloat %8132, i64 1, !dbg !325 + %8182 = bitcast <2 x bfloat> %8181 to i32, !dbg !325 + %8183 = insertelement <2 x bfloat> poison, bfloat %8134, i64 0, !dbg !325 + %8184 = insertelement <2 x bfloat> %8183, bfloat %8136, i64 1, !dbg !325 + %8185 = bitcast <2 x bfloat> %8184 to i32, !dbg !325 + %8186 = insertelement <2 x bfloat> poison, bfloat %8138, i64 0, !dbg !325 + %8187 = insertelement <2 x bfloat> %8186, bfloat %8140, i64 1, !dbg !325 + %8188 = bitcast <2 x bfloat> %8187 to i32, !dbg !325 + %8189 = insertelement <2 x bfloat> poison, bfloat %8142, i64 0, !dbg !325 + %8190 = insertelement <2 x bfloat> %8189, bfloat %8144, i64 1, !dbg !325 + %8191 = bitcast <2 x bfloat> %8190 to i32, !dbg !325 + %8192 = insertelement <2 x bfloat> poison, bfloat %8146, i64 0, !dbg !325 + %8193 = insertelement <2 x bfloat> %8192, bfloat %8148, i64 1, !dbg !325 + %8194 = bitcast <2 x bfloat> %8193 to i32, !dbg !325 + %8195 = insertelement <2 x bfloat> poison, bfloat %8150, i64 0, !dbg !325 + %8196 = insertelement <2 x bfloat> %8195, bfloat %8152, i64 1, !dbg !325 + %8197 = bitcast <2 x bfloat> %8196 to i32, !dbg !325 + %8198 = insertelement <2 x bfloat> poison, bfloat %8154, i64 0, !dbg !325 + %8199 = insertelement <2 x bfloat> %8198, bfloat %8156, i64 1, !dbg !325 + %8200 = bitcast <2 x bfloat> %8199 to i32, !dbg !325 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !325 + %8201 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6087, float %6088, float %6089, float %6090, float %6091, float %6092, float %6093, float %6094, float %6095, float %6096, float %6097, float %6098, float %6099, float %6100, float %6101, float %6102, float %6103, float %6104, float %6105, float %6106, float %6107, float %6108, float %6109, float %6110, float %6111, float %6112, float %6113, float %6114, float %6115, float %6116, float %6117, float %6118, float %6119, float %6120, float %6121, float %6122, float %6123, float %6124, float %6125, float %6126, float %6127, float %6128, float %6129, float %6130, float %6131, float %6132, float %6133, float %6134, float %6135, float %6136, float %6137, float %6138, float %6139, float %6140, float %6141, float %6142, float %6143, float %6144, float %6145, float %6146, float %6147, float %6148, float %6149, float %6150, i32 %8157, i32 %8158, i32 %8161, i32 %8164, i64 %6260, i1 true) #3, !dbg !325 + %8202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 0, !dbg !325 + %8203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 1, !dbg !325 + %8204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 2, !dbg !325 + %8205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 3, !dbg !325 + %8206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 4, !dbg !325 + %8207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 5, !dbg !325 + %8208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 6, !dbg !325 + %8209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 7, !dbg !325 + %8210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 8, !dbg !325 + %8211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 9, !dbg !325 + %8212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 10, !dbg !325 + %8213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 11, !dbg !325 + %8214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 12, !dbg !325 + %8215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 13, !dbg !325 + %8216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 14, !dbg !325 + %8217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 15, !dbg !325 + %8218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 16, !dbg !325 + %8219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 17, !dbg !325 + %8220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 18, !dbg !325 + %8221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 19, !dbg !325 + %8222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 20, !dbg !325 + %8223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 21, !dbg !325 + %8224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 22, !dbg !325 + %8225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 23, !dbg !325 + %8226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 24, !dbg !325 + %8227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 25, !dbg !325 + %8228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 26, !dbg !325 + %8229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 27, !dbg !325 + %8230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 28, !dbg !325 + %8231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 29, !dbg !325 + %8232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 30, !dbg !325 + %8233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 31, !dbg !325 + %8234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 32, !dbg !325 + %8235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 33, !dbg !325 + %8236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 34, !dbg !325 + %8237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 35, !dbg !325 + %8238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 36, !dbg !325 + %8239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 37, !dbg !325 + %8240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 38, !dbg !325 + %8241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 39, !dbg !325 + %8242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 40, !dbg !325 + %8243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 41, !dbg !325 + %8244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 42, !dbg !325 + %8245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 43, !dbg !325 + %8246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 44, !dbg !325 + %8247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 45, !dbg !325 + %8248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 46, !dbg !325 + %8249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 47, !dbg !325 + %8250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 48, !dbg !325 + %8251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 49, !dbg !325 + %8252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 50, !dbg !325 + %8253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 51, !dbg !325 + %8254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 52, !dbg !325 + %8255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 53, !dbg !325 + %8256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 54, !dbg !325 + %8257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 55, !dbg !325 + %8258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 56, !dbg !325 + %8259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 57, !dbg !325 + %8260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 58, !dbg !325 + %8261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 59, !dbg !325 + %8262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 60, !dbg !325 + %8263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 61, !dbg !325 + %8264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 62, !dbg !325 + %8265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8201, 63, !dbg !325 + %8266 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8202, float %8203, float %8204, float %8205, float %8206, float %8207, float %8208, float %8209, float %8210, float %8211, float %8212, float %8213, float %8214, float %8215, float %8216, float %8217, float %8218, float %8219, float %8220, float %8221, float %8222, float %8223, float %8224, float %8225, float %8226, float %8227, float %8228, float %8229, float %8230, float %8231, float %8232, float %8233, float %8234, float %8235, float %8236, float %8237, float %8238, float %8239, float %8240, float %8241, float %8242, float %8243, float %8244, float %8245, float %8246, float %8247, float %8248, float %8249, float %8250, float %8251, float %8252, float %8253, float %8254, float %8255, float %8256, float %8257, float %8258, float %8259, float %8260, float %8261, float %8262, float %8263, float %8264, float %8265, i32 %8167, i32 %8170, i32 %8173, i32 %8176, i64 %7702, i1 true) #3, !dbg !325 + %8267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 0, !dbg !325 + %8268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 1, !dbg !325 + %8269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 2, !dbg !325 + %8270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 3, !dbg !325 + %8271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 4, !dbg !325 + %8272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 5, !dbg !325 + %8273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 6, !dbg !325 + %8274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 7, !dbg !325 + %8275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 8, !dbg !325 + %8276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 9, !dbg !325 + %8277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 10, !dbg !325 + %8278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 11, !dbg !325 + %8279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 12, !dbg !325 + %8280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 13, !dbg !325 + %8281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 14, !dbg !325 + %8282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 15, !dbg !325 + %8283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 16, !dbg !325 + %8284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 17, !dbg !325 + %8285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 18, !dbg !325 + %8286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 19, !dbg !325 + %8287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 20, !dbg !325 + %8288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 21, !dbg !325 + %8289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 22, !dbg !325 + %8290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 23, !dbg !325 + %8291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 24, !dbg !325 + %8292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 25, !dbg !325 + %8293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 26, !dbg !325 + %8294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 27, !dbg !325 + %8295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 28, !dbg !325 + %8296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 29, !dbg !325 + %8297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 30, !dbg !325 + %8298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 31, !dbg !325 + %8299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 32, !dbg !325 + %8300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 33, !dbg !325 + %8301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 34, !dbg !325 + %8302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 35, !dbg !325 + %8303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 36, !dbg !325 + %8304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 37, !dbg !325 + %8305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 38, !dbg !325 + %8306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 39, !dbg !325 + %8307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 40, !dbg !325 + %8308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 41, !dbg !325 + %8309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 42, !dbg !325 + %8310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 43, !dbg !325 + %8311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 44, !dbg !325 + %8312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 45, !dbg !325 + %8313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 46, !dbg !325 + %8314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 47, !dbg !325 + %8315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 48, !dbg !325 + %8316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 49, !dbg !325 + %8317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 50, !dbg !325 + %8318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 51, !dbg !325 + %8319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 52, !dbg !325 + %8320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 53, !dbg !325 + %8321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 54, !dbg !325 + %8322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 55, !dbg !325 + %8323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 56, !dbg !325 + %8324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 57, !dbg !325 + %8325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 58, !dbg !325 + %8326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 59, !dbg !325 + %8327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 60, !dbg !325 + %8328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 61, !dbg !325 + %8329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 62, !dbg !325 + %8330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8266, 63, !dbg !325 + %8331 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8267, float %8268, float %8269, float %8270, float %8271, float %8272, float %8273, float %8274, float %8275, float %8276, float %8277, float %8278, float %8279, float %8280, float %8281, float %8282, float %8283, float %8284, float %8285, float %8286, float %8287, float %8288, float %8289, float %8290, float %8291, float %8292, float %8293, float %8294, float %8295, float %8296, float %8297, float %8298, float %8299, float %8300, float %8301, float %8302, float %8303, float %8304, float %8305, float %8306, float %8307, float %8308, float %8309, float %8310, float %8311, float %8312, float %8313, float %8314, float %8315, float %8316, float %8317, float %8318, float %8319, float %8320, float %8321, float %8322, float %8323, float %8324, float %8325, float %8326, float %8327, float %8328, float %8329, float %8330, i32 %8179, i32 %8182, i32 %8185, i32 %8188, i64 %7707, i1 true) #3, !dbg !325 + %8332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 0, !dbg !325 + %8333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 1, !dbg !325 + %8334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 2, !dbg !325 + %8335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 3, !dbg !325 + %8336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 4, !dbg !325 + %8337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 5, !dbg !325 + %8338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 6, !dbg !325 + %8339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 7, !dbg !325 + %8340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 8, !dbg !325 + %8341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 9, !dbg !325 + %8342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 10, !dbg !325 + %8343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 11, !dbg !325 + %8344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 12, !dbg !325 + %8345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 13, !dbg !325 + %8346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 14, !dbg !325 + %8347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 15, !dbg !325 + %8348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 16, !dbg !325 + %8349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 17, !dbg !325 + %8350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 18, !dbg !325 + %8351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 19, !dbg !325 + %8352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 20, !dbg !325 + %8353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 21, !dbg !325 + %8354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 22, !dbg !325 + %8355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 23, !dbg !325 + %8356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 24, !dbg !325 + %8357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 25, !dbg !325 + %8358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 26, !dbg !325 + %8359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 27, !dbg !325 + %8360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 28, !dbg !325 + %8361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 29, !dbg !325 + %8362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 30, !dbg !325 + %8363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 31, !dbg !325 + %8364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 32, !dbg !325 + %8365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 33, !dbg !325 + %8366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 34, !dbg !325 + %8367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 35, !dbg !325 + %8368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 36, !dbg !325 + %8369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 37, !dbg !325 + %8370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 38, !dbg !325 + %8371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 39, !dbg !325 + %8372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 40, !dbg !325 + %8373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 41, !dbg !325 + %8374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 42, !dbg !325 + %8375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 43, !dbg !325 + %8376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 44, !dbg !325 + %8377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 45, !dbg !325 + %8378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 46, !dbg !325 + %8379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 47, !dbg !325 + %8380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 48, !dbg !325 + %8381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 49, !dbg !325 + %8382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 50, !dbg !325 + %8383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 51, !dbg !325 + %8384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 52, !dbg !325 + %8385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 53, !dbg !325 + %8386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 54, !dbg !325 + %8387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 55, !dbg !325 + %8388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 56, !dbg !325 + %8389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 57, !dbg !325 + %8390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 58, !dbg !325 + %8391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 59, !dbg !325 + %8392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 60, !dbg !325 + %8393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 61, !dbg !325 + %8394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 62, !dbg !325 + %8395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8331, 63, !dbg !325 + %8396 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8332, float %8333, float %8334, float %8335, float %8336, float %8337, float %8338, float %8339, float %8340, float %8341, float %8342, float %8343, float %8344, float %8345, float %8346, float %8347, float %8348, float %8349, float %8350, float %8351, float %8352, float %8353, float %8354, float %8355, float %8356, float %8357, float %8358, float %8359, float %8360, float %8361, float %8362, float %8363, float %8364, float %8365, float %8366, float %8367, float %8368, float %8369, float %8370, float %8371, float %8372, float %8373, float %8374, float %8375, float %8376, float %8377, float %8378, float %8379, float %8380, float %8381, float %8382, float %8383, float %8384, float %8385, float %8386, float %8387, float %8388, float %8389, float %8390, float %8391, float %8392, float %8393, float %8394, float %8395, i32 %8191, i32 %8194, i32 %8197, i32 %8200, i64 %7712, i1 true) #3, !dbg !325 + %8397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 0, !dbg !325 + %8398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 1, !dbg !325 + %8399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 2, !dbg !325 + %8400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 3, !dbg !325 + %8401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 4, !dbg !325 + %8402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 5, !dbg !325 + %8403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 6, !dbg !325 + %8404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 7, !dbg !325 + %8405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 8, !dbg !325 + %8406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 9, !dbg !325 + %8407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 10, !dbg !325 + %8408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 11, !dbg !325 + %8409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 12, !dbg !325 + %8410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 13, !dbg !325 + %8411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 14, !dbg !325 + %8412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 15, !dbg !325 + %8413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 16, !dbg !325 + %8414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 17, !dbg !325 + %8415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 18, !dbg !325 + %8416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 19, !dbg !325 + %8417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 20, !dbg !325 + %8418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 21, !dbg !325 + %8419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 22, !dbg !325 + %8420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 23, !dbg !325 + %8421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 24, !dbg !325 + %8422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 25, !dbg !325 + %8423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 26, !dbg !325 + %8424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 27, !dbg !325 + %8425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 28, !dbg !325 + %8426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 29, !dbg !325 + %8427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 30, !dbg !325 + %8428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 31, !dbg !325 + %8429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 32, !dbg !325 + %8430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 33, !dbg !325 + %8431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 34, !dbg !325 + %8432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 35, !dbg !325 + %8433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 36, !dbg !325 + %8434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 37, !dbg !325 + %8435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 38, !dbg !325 + %8436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 39, !dbg !325 + %8437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 40, !dbg !325 + %8438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 41, !dbg !325 + %8439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 42, !dbg !325 + %8440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 43, !dbg !325 + %8441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 44, !dbg !325 + %8442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 45, !dbg !325 + %8443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 46, !dbg !325 + %8444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 47, !dbg !325 + %8445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 48, !dbg !325 + %8446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 49, !dbg !325 + %8447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 50, !dbg !325 + %8448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 51, !dbg !325 + %8449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 52, !dbg !325 + %8450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 53, !dbg !325 + %8451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 54, !dbg !325 + %8452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 55, !dbg !325 + %8453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 56, !dbg !325 + %8454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 57, !dbg !325 + %8455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 58, !dbg !325 + %8456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 59, !dbg !325 + %8457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 60, !dbg !325 + %8458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 61, !dbg !325 + %8459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 62, !dbg !325 + %8460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8396, 63, !dbg !325 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !325 + %8461 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !339 + %8462 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7727, i64 %8461, i1 %6158) #3, !dbg !339 + %8463 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !340 + %8464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7730, i64 %8463, i1 %7731) #3, !dbg !340 + %8465 = sub i32 %8464, %8462, !dbg !341 + %8466 = shl i32 %8465, 7, !dbg !342 + %8467 = add i32 %8466, -64, !dbg !343 + %8468 = mul nuw nsw i32 %8467, %7733, !dbg !333 + %8469 = add i32 %8468, %7734, !dbg !344 + %8470 = shl i32 %8469, 12, !dbg !345 + %8471 = sext i32 %8470 to i64, !dbg !299 + %8472 = getelementptr bfloat, ptr addrspace(1) %.pn1231595, i64 %8471, !dbg !299 + %8473 = getelementptr bfloat, ptr addrspace(1) %.pn1071596, i64 %8471, !dbg !299 + %8474 = getelementptr bfloat, ptr addrspace(1) %.pn911597, i64 %8471, !dbg !299 + %8475 = getelementptr bfloat, ptr addrspace(1) %.pn751598, i64 %8471, !dbg !299 + %8476 = shl i32 %8469, 7, !dbg !346 + %8477 = sext i32 %8476 to i64, !dbg !300 + %8478 = getelementptr bfloat, ptr addrspace(1) %.pn1871599, i64 %8477, !dbg !300 + %8479 = getelementptr bfloat, ptr addrspace(1) %.pn1711600, i64 %8477, !dbg !300 + %8480 = getelementptr bfloat, ptr addrspace(1) %.pn1551601, i64 %8477, !dbg !300 + %8481 = getelementptr bfloat, ptr addrspace(1) %.pn1391602, i64 %8477, !dbg !300 + %8482 = insertelement <2 x i32> poison, i32 %8469, i64 0, !dbg !326 + %8483 = shufflevector <2 x i32> %8482, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !326 + %8484 = add <2 x i32> %8483, %6154, !dbg !326 + %8485 = add i32 %8469, %.pn2151605, !dbg !326 + %8486 = add i32 %8469, %.pn2131606, !dbg !326 + %8487 = add i32 %8469, %.pn2111607, !dbg !326 + %8488 = add i32 %8469, %.pn2091608, !dbg !326 + %8489 = add i32 %8469, %.pn2071609, !dbg !326 + %8490 = add i32 %8469, %.pn2051610, !dbg !326 + %8491 = add i32 %8469, %.pn2031611, !dbg !326 + %8492 = add i32 %8469, %.pn2011612, !dbg !326 + %8493 = add i32 %8469, %.pn1991613, !dbg !326 + %8494 = add i32 %8469, %.pn1971614, !dbg !326 + %8495 = add i32 %8469, %.pn1951615, !dbg !326 + %8496 = add i32 %8469, %.pn1931616, !dbg !326 + %8497 = add i32 %8469, %.pn1911617, !dbg !326 + %8498 = add i32 %8469, %.pn1891618, !dbg !326 + %8499 = add i32 %8469, %6019, !dbg !326 + %8500 = add i32 %8469, %6020, !dbg !326 + %8501 = add i32 %8469, %6021, !dbg !326 + %8502 = add i32 %8469, %6022, !dbg !326 + %8503 = add i32 %8469, %6015, !dbg !326 + %8504 = add i32 %8469, %6016, !dbg !326 + %8505 = add i32 %8469, %6017, !dbg !326 + %8506 = add i32 %8469, %6018, !dbg !326 + %8507 = add i32 %6012, 1, !dbg !255 + %8508 = icmp sgt i32 %8507, 1, !dbg !255 + %8509 = select i1 %8508, i32 0, i32 %8507, !dbg !255 + %8510 = add i32 %6014, 1, !dbg !255 + %8511 = icmp sgt i32 %8510, 2, !dbg !255 + %8512 = select i1 %8511, i32 0, i32 %8510, !dbg !255 + %8513 = icmp slt i32 %8499, %18, !dbg !301 + %8514 = icmp slt i32 %8500, %18, !dbg !301 + %8515 = icmp slt i32 %8501, %18, !dbg !301 + %8516 = icmp slt i32 %8502, %18, !dbg !301 + %8517 = shl i32 %8512, 13, !dbg !293 + %8518 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %8517, !dbg !293 + %8519 = and i1 %6157, %8513, !dbg !255 + %8520 = and i1 %6157, %8514, !dbg !255 + %8521 = and i1 %6157, %8515, !dbg !255 + %8522 = and i1 %6157, %8516, !dbg !255 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !293 + %8523 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5264, !dbg !293 + %8524 = select i1 %8519, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %8523, ptr addrspace(1) %8472, i32 %8524) #3, !dbg !293 + %8525 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5267, !dbg !293 + %8526 = select i1 %8520, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8525, ptr addrspace(1) %8473, i32 %8526) #3, !dbg !293 + %8527 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5270, !dbg !293 + %8528 = select i1 %8521, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8527, ptr addrspace(1) %8474, i32 %8528) #3, !dbg !293 + %8529 = getelementptr inbounds nuw i8, ptr addrspace(3) %8518, i32 %5273, !dbg !293 + %8530 = select i1 %8522, i32 16, i32 0, !dbg !293 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8529, ptr addrspace(1) %8475, i32 %8530) #3, !dbg !293 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !293 + %8531 = extractelement <2 x i32> %8484, i64 0, !dbg !294 + %8532 = icmp slt i32 %8531, %18, !dbg !347 + %8533 = extractelement <2 x i32> %8484, i64 1, !dbg !294 + %8534 = icmp slt i32 %8533, %18, !dbg !347 + %8535 = icmp slt i32 %8485, %18, !dbg !347 + %8536 = icmp slt i32 %8486, %18, !dbg !347 + %8537 = icmp slt i32 %8487, %18, !dbg !347 + %8538 = icmp slt i32 %8488, %18, !dbg !347 + %8539 = icmp slt i32 %8489, %18, !dbg !347 + %8540 = icmp slt i32 %8490, %18, !dbg !347 + %8541 = icmp slt i32 %8491, %18, !dbg !347 + %8542 = icmp slt i32 %8492, %18, !dbg !347 + %8543 = icmp slt i32 %8493, %18, !dbg !347 + %8544 = icmp slt i32 %8494, %18, !dbg !347 + %8545 = icmp slt i32 %8495, %18, !dbg !347 + %8546 = icmp slt i32 %8496, %18, !dbg !347 + %8547 = icmp slt i32 %8497, %18, !dbg !347 + %8548 = icmp slt i32 %8498, %18, !dbg !347 + %8549 = sext i32 %8531 to i64, !dbg !294 + %8550 = getelementptr float, ptr addrspace(1) %5904, i64 %8549, !dbg !294 + %8551 = sext i32 %8533 to i64, !dbg !294 + %8552 = getelementptr float, ptr addrspace(1) %5904, i64 %8551, !dbg !294 + %8553 = sext i32 %8485 to i64, !dbg !294 + %8554 = getelementptr float, ptr addrspace(1) %5904, i64 %8553, !dbg !294 + %8555 = sext i32 %8486 to i64, !dbg !294 + %8556 = getelementptr float, ptr addrspace(1) %5904, i64 %8555, !dbg !294 + %8557 = sext i32 %8487 to i64, !dbg !294 + %8558 = getelementptr float, ptr addrspace(1) %5904, i64 %8557, !dbg !294 + %8559 = sext i32 %8488 to i64, !dbg !294 + %8560 = getelementptr float, ptr addrspace(1) %5904, i64 %8559, !dbg !294 + %8561 = sext i32 %8489 to i64, !dbg !294 + %8562 = getelementptr float, ptr addrspace(1) %5904, i64 %8561, !dbg !294 + %8563 = sext i32 %8490 to i64, !dbg !294 + %8564 = getelementptr float, ptr addrspace(1) %5904, i64 %8563, !dbg !294 + %8565 = sext i32 %8491 to i64, !dbg !294 + %8566 = getelementptr float, ptr addrspace(1) %5904, i64 %8565, !dbg !294 + %8567 = sext i32 %8492 to i64, !dbg !294 + %8568 = getelementptr float, ptr addrspace(1) %5904, i64 %8567, !dbg !294 + %8569 = sext i32 %8493 to i64, !dbg !294 + %8570 = getelementptr float, ptr addrspace(1) %5904, i64 %8569, !dbg !294 + %8571 = sext i32 %8494 to i64, !dbg !294 + %8572 = getelementptr float, ptr addrspace(1) %5904, i64 %8571, !dbg !294 + %8573 = sext i32 %8495 to i64, !dbg !294 + %8574 = getelementptr float, ptr addrspace(1) %5904, i64 %8573, !dbg !294 + %8575 = sext i32 %8496 to i64, !dbg !294 + %8576 = getelementptr float, ptr addrspace(1) %5904, i64 %8575, !dbg !294 + %8577 = sext i32 %8497 to i64, !dbg !294 + %8578 = getelementptr float, ptr addrspace(1) %5904, i64 %8577, !dbg !294 + %8579 = sext i32 %8498 to i64, !dbg !294 + %8580 = getelementptr float, ptr addrspace(1) %5904, i64 %8579, !dbg !294 + %8581 = shl i32 %8509, 6, !dbg !295 + %8582 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %8581, !dbg !295 + %8583 = and i1 %6157, %8532, !dbg !255 + %8584 = and i1 %6157, %8534, !dbg !255 + %8585 = and i1 %6157, %8535, !dbg !255 + %8586 = and i1 %6157, %8536, !dbg !255 + %8587 = and i1 %6157, %8537, !dbg !255 + %8588 = and i1 %6157, %8538, !dbg !255 + %8589 = and i1 %6157, %8539, !dbg !255 + %8590 = and i1 %6157, %8540, !dbg !255 + %8591 = and i1 %6157, %8541, !dbg !255 + %8592 = and i1 %6157, %8542, !dbg !255 + %8593 = and i1 %6157, %8543, !dbg !255 + %8594 = and i1 %6157, %8544, !dbg !255 + %8595 = and i1 %6157, %8545, !dbg !255 + %8596 = and i1 %6157, %8546, !dbg !255 + %8597 = and i1 %6157, %8547, !dbg !255 + %8598 = and i1 %6157, %8548, !dbg !255 + %8599 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5332, !dbg !295 + %8600 = select i1 %8583, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %8599, ptr addrspace(1) %8550, i32 %8600, i1 %5331) #3, !dbg !295 + %8601 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5334, !dbg !295 + %8602 = select i1 %8584, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8601, ptr addrspace(1) %8552, i32 %8602, i1 %5331) #3, !dbg !295 + %8603 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5336, !dbg !295 + %8604 = select i1 %8585, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8603, ptr addrspace(1) %8554, i32 %8604, i1 %5331) #3, !dbg !295 + %8605 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5339, !dbg !295 + %8606 = select i1 %8586, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8605, ptr addrspace(1) %8556, i32 %8606, i1 %5331) #3, !dbg !295 + %8607 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5342, !dbg !295 + %8608 = select i1 %8587, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8607, ptr addrspace(1) %8558, i32 %8608, i1 %5331) #3, !dbg !295 + %8609 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5345, !dbg !295 + %8610 = select i1 %8588, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8609, ptr addrspace(1) %8560, i32 %8610, i1 %5331) #3, !dbg !295 + %8611 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5348, !dbg !295 + %8612 = select i1 %8589, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8611, ptr addrspace(1) %8562, i32 %8612, i1 %5331) #3, !dbg !295 + %8613 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5351, !dbg !295 + %8614 = select i1 %8590, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8613, ptr addrspace(1) %8564, i32 %8614, i1 %5331) #3, !dbg !295 + %8615 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5354, !dbg !295 + %8616 = select i1 %8591, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8615, ptr addrspace(1) %8566, i32 %8616, i1 %5331) #3, !dbg !295 + %8617 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5357, !dbg !295 + %8618 = select i1 %8592, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8617, ptr addrspace(1) %8568, i32 %8618, i1 %5331) #3, !dbg !295 + %8619 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5360, !dbg !295 + %8620 = select i1 %8593, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8619, ptr addrspace(1) %8570, i32 %8620, i1 %5331) #3, !dbg !295 + %8621 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5363, !dbg !295 + %8622 = select i1 %8594, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8621, ptr addrspace(1) %8572, i32 %8622, i1 %5331) #3, !dbg !295 + %8623 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5366, !dbg !295 + %8624 = select i1 %8595, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8623, ptr addrspace(1) %8574, i32 %8624, i1 %5331) #3, !dbg !295 + %8625 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5369, !dbg !295 + %8626 = select i1 %8596, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8625, ptr addrspace(1) %8576, i32 %8626, i1 %5331) #3, !dbg !295 + %8627 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5372, !dbg !295 + %8628 = select i1 %8597, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8627, ptr addrspace(1) %8578, i32 %8628, i1 %5331) #3, !dbg !295 + %8629 = getelementptr inbounds nuw i8, ptr addrspace(3) %8582, i32 %5375, !dbg !295 + %8630 = select i1 %8598, i32 4, i32 0, !dbg !295 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8629, ptr addrspace(1) %8580, i32 %8630, i1 %5331) #3, !dbg !295 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !295 + %8631 = icmp slt i32 %8503, %18, !dbg !348 + %8632 = icmp slt i32 %8504, %18, !dbg !348 + %8633 = icmp slt i32 %8505, %18, !dbg !348 + %8634 = icmp slt i32 %8506, %18, !dbg !348 + %8635 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %8517, !dbg !296 + %8636 = and i1 %6157, %8631, !dbg !255 + %8637 = and i1 %6157, %8632, !dbg !255 + %8638 = and i1 %6157, %8633, !dbg !255 + %8639 = and i1 %6157, %8634, !dbg !255 + %8640 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5264, !dbg !296 + %8641 = select i1 %8636, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %8640, ptr addrspace(1) %8478, i32 %8641) #3, !dbg !296 + %8642 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5267, !dbg !296 + %8643 = select i1 %8637, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8642, ptr addrspace(1) %8479, i32 %8643) #3, !dbg !296 + %8644 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5270, !dbg !296 + %8645 = select i1 %8638, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8644, ptr addrspace(1) %8480, i32 %8645) #3, !dbg !296 + %8646 = getelementptr inbounds nuw i8, ptr addrspace(3) %8635, i32 %5273, !dbg !296 + %8647 = select i1 %8639, i32 16, i32 0, !dbg !296 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8646, ptr addrspace(1) %8481, i32 %8647) #3, !dbg !296 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !296 + %8648 = getelementptr float, ptr addrspace(1) %5905, i64 %8549, !dbg !297 + %8649 = getelementptr float, ptr addrspace(1) %5905, i64 %8551, !dbg !297 + %8650 = getelementptr float, ptr addrspace(1) %5905, i64 %8553, !dbg !297 + %8651 = getelementptr float, ptr addrspace(1) %5905, i64 %8555, !dbg !297 + %8652 = getelementptr float, ptr addrspace(1) %5905, i64 %8557, !dbg !297 + %8653 = getelementptr float, ptr addrspace(1) %5905, i64 %8559, !dbg !297 + %8654 = getelementptr float, ptr addrspace(1) %5905, i64 %8561, !dbg !297 + %8655 = getelementptr float, ptr addrspace(1) %5905, i64 %8563, !dbg !297 + %8656 = getelementptr float, ptr addrspace(1) %5905, i64 %8565, !dbg !297 + %8657 = getelementptr float, ptr addrspace(1) %5905, i64 %8567, !dbg !297 + %8658 = getelementptr float, ptr addrspace(1) %5905, i64 %8569, !dbg !297 + %8659 = getelementptr float, ptr addrspace(1) %5905, i64 %8571, !dbg !297 + %8660 = getelementptr float, ptr addrspace(1) %5905, i64 %8573, !dbg !297 + %8661 = getelementptr float, ptr addrspace(1) %5905, i64 %8575, !dbg !297 + %8662 = getelementptr float, ptr addrspace(1) %5905, i64 %8577, !dbg !297 + %8663 = getelementptr float, ptr addrspace(1) %5905, i64 %8579, !dbg !297 + %8664 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %8581, !dbg !298 + %8665 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5332, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %8665, ptr addrspace(1) %8648, i32 %8600, i1 %5331) #3, !dbg !298 + %8666 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5334, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8666, ptr addrspace(1) %8649, i32 %8602, i1 %5331) #3, !dbg !298 + %8667 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5336, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8667, ptr addrspace(1) %8650, i32 %8604, i1 %5331) #3, !dbg !298 + %8668 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5339, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8668, ptr addrspace(1) %8651, i32 %8606, i1 %5331) #3, !dbg !298 + %8669 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5342, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8669, ptr addrspace(1) %8652, i32 %8608, i1 %5331) #3, !dbg !298 + %8670 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5345, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8670, ptr addrspace(1) %8653, i32 %8610, i1 %5331) #3, !dbg !298 + %8671 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5348, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8671, ptr addrspace(1) %8654, i32 %8612, i1 %5331) #3, !dbg !298 + %8672 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5351, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8672, ptr addrspace(1) %8655, i32 %8614, i1 %5331) #3, !dbg !298 + %8673 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5354, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8673, ptr addrspace(1) %8656, i32 %8616, i1 %5331) #3, !dbg !298 + %8674 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5357, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8674, ptr addrspace(1) %8657, i32 %8618, i1 %5331) #3, !dbg !298 + %8675 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5360, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8675, ptr addrspace(1) %8658, i32 %8620, i1 %5331) #3, !dbg !298 + %8676 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5363, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8676, ptr addrspace(1) %8659, i32 %8622, i1 %5331) #3, !dbg !298 + %8677 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5366, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8677, ptr addrspace(1) %8660, i32 %8624, i1 %5331) #3, !dbg !298 + %8678 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5369, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8678, ptr addrspace(1) %8661, i32 %8626, i1 %5331) #3, !dbg !298 + %8679 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5372, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8679, ptr addrspace(1) %8662, i32 %8628, i1 %5331) #3, !dbg !298 + %8680 = getelementptr inbounds nuw i8, ptr addrspace(3) %8664, i32 %5375, !dbg !298 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8680, ptr addrspace(1) %8663, i32 %8630, i1 %5331) #3, !dbg !298 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !298 + %exitcond2186.not = icmp eq i32 %7724, %smax2185, !dbg !255 + br i1 %exitcond2186.not, label %._crit_edge1621, label %.lr.ph1620, !dbg !255 + +._crit_edge1621: ; preds = %__nv_exp2f.exit1335, %5761 + %8681 = phi float [ %5762, %5761 ], [ %8397, %__nv_exp2f.exit1335 ] + %8682 = phi float [ %5763, %5761 ], [ %8398, %__nv_exp2f.exit1335 ] + %8683 = phi float [ %5764, %5761 ], [ %8399, %__nv_exp2f.exit1335 ] + %8684 = phi float [ %5765, %5761 ], [ %8400, %__nv_exp2f.exit1335 ] + %8685 = phi float [ %5766, %5761 ], [ %8401, %__nv_exp2f.exit1335 ] + %8686 = phi float [ %5767, %5761 ], [ %8402, %__nv_exp2f.exit1335 ] + %8687 = phi float [ %5768, %5761 ], [ %8403, %__nv_exp2f.exit1335 ] + %8688 = phi float [ %5769, %5761 ], [ %8404, %__nv_exp2f.exit1335 ] + %8689 = phi float [ %5770, %5761 ], [ %8405, %__nv_exp2f.exit1335 ] + %8690 = phi float [ %5771, %5761 ], [ %8406, %__nv_exp2f.exit1335 ] + %8691 = phi float [ %5772, %5761 ], [ %8407, %__nv_exp2f.exit1335 ] + %8692 = phi float [ %5773, %5761 ], [ %8408, %__nv_exp2f.exit1335 ] + %8693 = phi float [ %5774, %5761 ], [ %8409, %__nv_exp2f.exit1335 ] + %8694 = phi float [ %5775, %5761 ], [ %8410, %__nv_exp2f.exit1335 ] + %8695 = phi float [ %5776, %5761 ], [ %8411, %__nv_exp2f.exit1335 ] + %8696 = phi float [ %5777, %5761 ], [ %8412, %__nv_exp2f.exit1335 ] + %8697 = phi float [ %5778, %5761 ], [ %8413, %__nv_exp2f.exit1335 ] + %8698 = phi float [ %5779, %5761 ], [ %8414, %__nv_exp2f.exit1335 ] + %8699 = phi float [ %5780, %5761 ], [ %8415, %__nv_exp2f.exit1335 ] + %8700 = phi float [ %5781, %5761 ], [ %8416, %__nv_exp2f.exit1335 ] + %8701 = phi float [ %5782, %5761 ], [ %8417, %__nv_exp2f.exit1335 ] + %8702 = phi float [ %5783, %5761 ], [ %8418, %__nv_exp2f.exit1335 ] + %8703 = phi float [ %5784, %5761 ], [ %8419, %__nv_exp2f.exit1335 ] + %8704 = phi float [ %5785, %5761 ], [ %8420, %__nv_exp2f.exit1335 ] + %8705 = phi float [ %5786, %5761 ], [ %8421, %__nv_exp2f.exit1335 ] + %8706 = phi float [ %5787, %5761 ], [ %8422, %__nv_exp2f.exit1335 ] + %8707 = phi float [ %5788, %5761 ], [ %8423, %__nv_exp2f.exit1335 ] + %8708 = phi float [ %5789, %5761 ], [ %8424, %__nv_exp2f.exit1335 ] + %8709 = phi float [ %5790, %5761 ], [ %8425, %__nv_exp2f.exit1335 ] + %8710 = phi float [ %5791, %5761 ], [ %8426, %__nv_exp2f.exit1335 ] + %8711 = phi float [ %5792, %5761 ], [ %8427, %__nv_exp2f.exit1335 ] + %8712 = phi float [ %5793, %5761 ], [ %8428, %__nv_exp2f.exit1335 ] + %8713 = phi float [ %5794, %5761 ], [ %8429, %__nv_exp2f.exit1335 ] + %8714 = phi float [ %5795, %5761 ], [ %8430, %__nv_exp2f.exit1335 ] + %8715 = phi float [ %5796, %5761 ], [ %8431, %__nv_exp2f.exit1335 ] + %8716 = phi float [ %5797, %5761 ], [ %8432, %__nv_exp2f.exit1335 ] + %8717 = phi float [ %5798, %5761 ], [ %8433, %__nv_exp2f.exit1335 ] + %8718 = phi float [ %5799, %5761 ], [ %8434, %__nv_exp2f.exit1335 ] + %8719 = phi float [ %5800, %5761 ], [ %8435, %__nv_exp2f.exit1335 ] + %8720 = phi float [ %5801, %5761 ], [ %8436, %__nv_exp2f.exit1335 ] + %8721 = phi float [ %5802, %5761 ], [ %8437, %__nv_exp2f.exit1335 ] + %8722 = phi float [ %5803, %5761 ], [ %8438, %__nv_exp2f.exit1335 ] + %8723 = phi float [ %5804, %5761 ], [ %8439, %__nv_exp2f.exit1335 ] + %8724 = phi float [ %5805, %5761 ], [ %8440, %__nv_exp2f.exit1335 ] + %8725 = phi float [ %5806, %5761 ], [ %8441, %__nv_exp2f.exit1335 ] + %8726 = phi float [ %5807, %5761 ], [ %8442, %__nv_exp2f.exit1335 ] + %8727 = phi float [ %5808, %5761 ], [ %8443, %__nv_exp2f.exit1335 ] + %8728 = phi float [ %5809, %5761 ], [ %8444, %__nv_exp2f.exit1335 ] + %8729 = phi float [ %5810, %5761 ], [ %8445, %__nv_exp2f.exit1335 ] + %8730 = phi float [ %5811, %5761 ], [ %8446, %__nv_exp2f.exit1335 ] + %8731 = phi float [ %5812, %5761 ], [ %8447, %__nv_exp2f.exit1335 ] + %8732 = phi float [ %5813, %5761 ], [ %8448, %__nv_exp2f.exit1335 ] + %8733 = phi float [ %5814, %5761 ], [ %8449, %__nv_exp2f.exit1335 ] + %8734 = phi float [ %5815, %5761 ], [ %8450, %__nv_exp2f.exit1335 ] + %8735 = phi float [ %5816, %5761 ], [ %8451, %__nv_exp2f.exit1335 ] + %8736 = phi float [ %5817, %5761 ], [ %8452, %__nv_exp2f.exit1335 ] + %8737 = phi float [ %5818, %5761 ], [ %8453, %__nv_exp2f.exit1335 ] + %8738 = phi float [ %5819, %5761 ], [ %8454, %__nv_exp2f.exit1335 ] + %8739 = phi float [ %5820, %5761 ], [ %8455, %__nv_exp2f.exit1335 ] + %8740 = phi float [ %5821, %5761 ], [ %8456, %__nv_exp2f.exit1335 ] + %8741 = phi float [ %5822, %5761 ], [ %8457, %__nv_exp2f.exit1335 ] + %8742 = phi float [ %5823, %5761 ], [ %8458, %__nv_exp2f.exit1335 ] + %8743 = phi float [ %5824, %5761 ], [ %8459, %__nv_exp2f.exit1335 ] + %8744 = phi float [ %5825, %5761 ], [ %8460, %__nv_exp2f.exit1335 ] + %8745 = phi float [ %5826, %5761 ], [ %7529, %__nv_exp2f.exit1335 ] + %8746 = phi float [ %5827, %5761 ], [ %7530, %__nv_exp2f.exit1335 ] + %8747 = phi float [ %5828, %5761 ], [ %7531, %__nv_exp2f.exit1335 ] + %8748 = phi float [ %5829, %5761 ], [ %7532, %__nv_exp2f.exit1335 ] + %8749 = phi float [ %5830, %5761 ], [ %7533, %__nv_exp2f.exit1335 ] + %8750 = phi float [ %5831, %5761 ], [ %7534, %__nv_exp2f.exit1335 ] + %8751 = phi float [ %5832, %5761 ], [ %7535, %__nv_exp2f.exit1335 ] + %8752 = phi float [ %5833, %5761 ], [ %7536, %__nv_exp2f.exit1335 ] + %8753 = phi float [ %5834, %5761 ], [ %7537, %__nv_exp2f.exit1335 ] + %8754 = phi float [ %5835, %5761 ], [ %7538, %__nv_exp2f.exit1335 ] + %8755 = phi float [ %5836, %5761 ], [ %7539, %__nv_exp2f.exit1335 ] + %8756 = phi float [ %5837, %5761 ], [ %7540, %__nv_exp2f.exit1335 ] + %8757 = phi float [ %5838, %5761 ], [ %7541, %__nv_exp2f.exit1335 ] + %8758 = phi float [ %5839, %5761 ], [ %7542, %__nv_exp2f.exit1335 ] + %8759 = phi float [ %5840, %5761 ], [ %7543, %__nv_exp2f.exit1335 ] + %8760 = phi float [ %5841, %5761 ], [ %7544, %__nv_exp2f.exit1335 ] + %8761 = phi float [ %5842, %5761 ], [ %7545, %__nv_exp2f.exit1335 ] + %8762 = phi float [ %5843, %5761 ], [ %7546, %__nv_exp2f.exit1335 ] + %8763 = phi float [ %5844, %5761 ], [ %7547, %__nv_exp2f.exit1335 ] + %8764 = phi float [ %5845, %5761 ], [ %7548, %__nv_exp2f.exit1335 ] + %8765 = phi float [ %5846, %5761 ], [ %7549, %__nv_exp2f.exit1335 ] + %8766 = phi float [ %5847, %5761 ], [ %7550, %__nv_exp2f.exit1335 ] + %8767 = phi float [ %5848, %5761 ], [ %7551, %__nv_exp2f.exit1335 ] + %8768 = phi float [ %5849, %5761 ], [ %7552, %__nv_exp2f.exit1335 ] + %8769 = phi float [ %5850, %5761 ], [ %7553, %__nv_exp2f.exit1335 ] + %8770 = phi float [ %5851, %5761 ], [ %7554, %__nv_exp2f.exit1335 ] + %8771 = phi float [ %5852, %5761 ], [ %7555, %__nv_exp2f.exit1335 ] + %8772 = phi float [ %5853, %5761 ], [ %7556, %__nv_exp2f.exit1335 ] + %8773 = phi float [ %5854, %5761 ], [ %7557, %__nv_exp2f.exit1335 ] + %8774 = phi float [ %5855, %5761 ], [ %7558, %__nv_exp2f.exit1335 ] + %8775 = phi float [ %5856, %5761 ], [ %7559, %__nv_exp2f.exit1335 ] + %8776 = phi float [ %5857, %5761 ], [ %7560, %__nv_exp2f.exit1335 ] + %8777 = phi float [ %5858, %5761 ], [ %7561, %__nv_exp2f.exit1335 ] + %8778 = phi float [ %5859, %5761 ], [ %7562, %__nv_exp2f.exit1335 ] + %8779 = phi float [ %5860, %5761 ], [ %7563, %__nv_exp2f.exit1335 ] + %8780 = phi float [ %5861, %5761 ], [ %7564, %__nv_exp2f.exit1335 ] + %8781 = phi float [ %5862, %5761 ], [ %7565, %__nv_exp2f.exit1335 ] + %8782 = phi float [ %5863, %5761 ], [ %7566, %__nv_exp2f.exit1335 ] + %8783 = phi float [ %5864, %5761 ], [ %7567, %__nv_exp2f.exit1335 ] + %8784 = phi float [ %5865, %5761 ], [ %7568, %__nv_exp2f.exit1335 ] + %8785 = phi float [ %5866, %5761 ], [ %7569, %__nv_exp2f.exit1335 ] + %8786 = phi float [ %5867, %5761 ], [ %7570, %__nv_exp2f.exit1335 ] + %8787 = phi float [ %5868, %5761 ], [ %7571, %__nv_exp2f.exit1335 ] + %8788 = phi float [ %5869, %5761 ], [ %7572, %__nv_exp2f.exit1335 ] + %8789 = phi float [ %5870, %5761 ], [ %7573, %__nv_exp2f.exit1335 ] + %8790 = phi float [ %5871, %5761 ], [ %7574, %__nv_exp2f.exit1335 ] + %8791 = phi float [ %5872, %5761 ], [ %7575, %__nv_exp2f.exit1335 ] + %8792 = phi float [ %5873, %5761 ], [ %7576, %__nv_exp2f.exit1335 ] + %8793 = phi float [ %5874, %5761 ], [ %7577, %__nv_exp2f.exit1335 ] + %8794 = phi float [ %5875, %5761 ], [ %7578, %__nv_exp2f.exit1335 ] + %8795 = phi float [ %5876, %5761 ], [ %7579, %__nv_exp2f.exit1335 ] + %8796 = phi float [ %5877, %5761 ], [ %7580, %__nv_exp2f.exit1335 ] + %8797 = phi float [ %5878, %5761 ], [ %7581, %__nv_exp2f.exit1335 ] + %8798 = phi float [ %5879, %5761 ], [ %7582, %__nv_exp2f.exit1335 ] + %8799 = phi float [ %5880, %5761 ], [ %7583, %__nv_exp2f.exit1335 ] + %8800 = phi float [ %5881, %5761 ], [ %7584, %__nv_exp2f.exit1335 ] + %8801 = phi float [ %5882, %5761 ], [ %7585, %__nv_exp2f.exit1335 ] + %8802 = phi float [ %5883, %5761 ], [ %7586, %__nv_exp2f.exit1335 ] + %8803 = phi float [ %5884, %5761 ], [ %7587, %__nv_exp2f.exit1335 ] + %8804 = phi float [ %5885, %5761 ], [ %7588, %__nv_exp2f.exit1335 ] + %8805 = phi float [ %5886, %5761 ], [ %7589, %__nv_exp2f.exit1335 ] + %8806 = phi float [ %5887, %5761 ], [ %7590, %__nv_exp2f.exit1335 ] + %8807 = phi float [ %5888, %5761 ], [ %7591, %__nv_exp2f.exit1335 ] + %8808 = phi float [ %5889, %5761 ], [ %7592, %__nv_exp2f.exit1335 ] + %8809 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %8745, float %8746, float %8747, float %8748, float %8749, float %8750, float %8751, float %8752, float %8753, float %8754, float %8755, float %8756, float %8757, float %8758, float %8759, float %8760, float %8761, float %8762, float %8763, float %8764, float %8765, float %8766, float %8767, float %8768, float %8769, float %8770, float %8771, float %8772, float %8773, float %8774, float %8775, float %8776, float %8777, float %8778, float %8779, float %8780, float %8781, float %8782, float %8783, float %8784, float %8785, float %8786, float %8787, float %8788, float %8789, float %8790, float %8791, float %8792, float %8793, float %8794, float %8795, float %8796, float %8797, float %8798, float %8799, float %8800, float %8801, float %8802, float %8803, float %8804, float %8805, float %8806, float %8807, float %8808, float %8681, float %8682, float %8683, float %8684, float %8685, float %8686, float %8687, float %8688, float %8689, float %8690, float %8691, float %8692, float %8693, float %8694, float %8695, float %8696, float %8697, float %8698, float %8699, float %8700, float %8701, float %8702, float %8703, float %8704, float %8705, float %8706, float %8707, float %8708, float %8709, float %8710, float %8711, float %8712, float %8713, float %8714, float %8715, float %8716, float %8717, float %8718, float %8719, float %8720, float %8721, float %8722, float %8723, float %8724, float %8725, float %8726, float %8727, float %8728, float %8729, float %8730, float %8731, float %8732, float %8733, float %8734, float %8735, float %8736, float %8737, float %8738, float %8739, float %8740, float %8741, float %8742, float %8743, float %8744) #3, !dbg !255 + %8810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 0, !dbg !255 + %8811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 1, !dbg !255 + %8812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 2, !dbg !255 + %8813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 3, !dbg !255 + %8814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 4, !dbg !255 + %8815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 5, !dbg !255 + %8816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 6, !dbg !255 + %8817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 7, !dbg !255 + %8818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 8, !dbg !255 + %8819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 9, !dbg !255 + %8820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 10, !dbg !255 + %8821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 11, !dbg !255 + %8822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 12, !dbg !255 + %8823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 13, !dbg !255 + %8824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 14, !dbg !255 + %8825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 15, !dbg !255 + %8826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 16, !dbg !255 + %8827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 17, !dbg !255 + %8828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 18, !dbg !255 + %8829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 19, !dbg !255 + %8830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 20, !dbg !255 + %8831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 21, !dbg !255 + %8832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 22, !dbg !255 + %8833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 23, !dbg !255 + %8834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 24, !dbg !255 + %8835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 25, !dbg !255 + %8836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 26, !dbg !255 + %8837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 27, !dbg !255 + %8838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 28, !dbg !255 + %8839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 29, !dbg !255 + %8840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 30, !dbg !255 + %8841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 31, !dbg !255 + %8842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 32, !dbg !255 + %8843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 33, !dbg !255 + %8844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 34, !dbg !255 + %8845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 35, !dbg !255 + %8846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 36, !dbg !255 + %8847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 37, !dbg !255 + %8848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 38, !dbg !255 + %8849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 39, !dbg !255 + %8850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 40, !dbg !255 + %8851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 41, !dbg !255 + %8852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 42, !dbg !255 + %8853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 43, !dbg !255 + %8854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 44, !dbg !255 + %8855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 45, !dbg !255 + %8856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 46, !dbg !255 + %8857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 47, !dbg !255 + %8858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 48, !dbg !255 + %8859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 49, !dbg !255 + %8860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 50, !dbg !255 + %8861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 51, !dbg !255 + %8862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 52, !dbg !255 + %8863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 53, !dbg !255 + %8864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 54, !dbg !255 + %8865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 55, !dbg !255 + %8866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 56, !dbg !255 + %8867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 57, !dbg !255 + %8868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 58, !dbg !255 + %8869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 59, !dbg !255 + %8870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 60, !dbg !255 + %8871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 61, !dbg !255 + %8872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 62, !dbg !255 + %8873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 63, !dbg !255 + %8874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 64, !dbg !255 + %8875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 65, !dbg !255 + %8876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 66, !dbg !255 + %8877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 67, !dbg !255 + %8878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 68, !dbg !255 + %8879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 69, !dbg !255 + %8880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 70, !dbg !255 + %8881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 71, !dbg !255 + %8882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 72, !dbg !255 + %8883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 73, !dbg !255 + %8884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 74, !dbg !255 + %8885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 75, !dbg !255 + %8886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 76, !dbg !255 + %8887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 77, !dbg !255 + %8888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 78, !dbg !255 + %8889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 79, !dbg !255 + %8890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 80, !dbg !255 + %8891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 81, !dbg !255 + %8892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 82, !dbg !255 + %8893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 83, !dbg !255 + %8894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 84, !dbg !255 + %8895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 85, !dbg !255 + %8896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 86, !dbg !255 + %8897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 87, !dbg !255 + %8898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 88, !dbg !255 + %8899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 89, !dbg !255 + %8900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 90, !dbg !255 + %8901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 91, !dbg !255 + %8902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 92, !dbg !255 + %8903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 93, !dbg !255 + %8904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 94, !dbg !255 + %8905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 95, !dbg !255 + %8906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 96, !dbg !255 + %8907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 97, !dbg !255 + %8908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 98, !dbg !255 + %8909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 99, !dbg !255 + %8910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 100, !dbg !255 + %8911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 101, !dbg !255 + %8912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 102, !dbg !255 + %8913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 103, !dbg !255 + %8914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 104, !dbg !255 + %8915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 105, !dbg !255 + %8916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 106, !dbg !255 + %8917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 107, !dbg !255 + %8918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 108, !dbg !255 + %8919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 109, !dbg !255 + %8920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 110, !dbg !255 + %8921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 111, !dbg !255 + %8922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 112, !dbg !255 + %8923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 113, !dbg !255 + %8924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 114, !dbg !255 + %8925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 115, !dbg !255 + %8926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 116, !dbg !255 + %8927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 117, !dbg !255 + %8928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 118, !dbg !255 + %8929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 119, !dbg !255 + %8930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 120, !dbg !255 + %8931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 121, !dbg !255 + %8932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 122, !dbg !255 + %8933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 123, !dbg !255 + %8934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 124, !dbg !255 + %8935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 125, !dbg !255 + %8936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 126, !dbg !255 + %8937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8809, 127, !dbg !255 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !255 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !255 + %8938 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5553, !dbg !349 + %8939 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5554, !dbg !349 + %8940 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5555, !dbg !349 + %8941 = getelementptr bfloat, ptr addrspace(1) %5902, i64 %5556, !dbg !349 + %8942 = getelementptr bfloat, ptr addrspace(1) %8938, i64 %4911, !dbg !350 + %8943 = getelementptr bfloat, ptr addrspace(1) %8939, i64 %4911, !dbg !350 + %8944 = getelementptr bfloat, ptr addrspace(1) %8940, i64 %4911, !dbg !350 + %8945 = getelementptr bfloat, ptr addrspace(1) %8941, i64 %4911, !dbg !350 + %8946 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5557, !dbg !351 + %8947 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5558, !dbg !351 + %8948 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5559, !dbg !351 + %8949 = getelementptr bfloat, ptr addrspace(1) %5903, i64 %5560, !dbg !351 + %8950 = getelementptr bfloat, ptr addrspace(1) %8946, i64 %4911, !dbg !352 + %8951 = getelementptr bfloat, ptr addrspace(1) %8947, i64 %4911, !dbg !352 + %8952 = getelementptr bfloat, ptr addrspace(1) %8948, i64 %4911, !dbg !352 + %8953 = getelementptr bfloat, ptr addrspace(1) %8949, i64 %4911, !dbg !352 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5265, ptr addrspace(1) %8942, i32 %5570) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5268, ptr addrspace(1) %8943, i32 %5571) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5271, ptr addrspace(1) %8944, i32 %5572) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5274, ptr addrspace(1) %8945, i32 %5573) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %8954 = getelementptr float, ptr addrspace(1) %5904, i64 %5590, !dbg !354 + %8955 = getelementptr float, ptr addrspace(1) %5904, i64 %5591, !dbg !354 + %8956 = getelementptr float, ptr addrspace(1) %5904, i64 %5592, !dbg !354 + %8957 = getelementptr float, ptr addrspace(1) %5904, i64 %5593, !dbg !354 + %8958 = getelementptr float, ptr addrspace(1) %5904, i64 %5594, !dbg !354 + %8959 = getelementptr float, ptr addrspace(1) %5904, i64 %5595, !dbg !354 + %8960 = getelementptr float, ptr addrspace(1) %5904, i64 %5596, !dbg !354 + %8961 = getelementptr float, ptr addrspace(1) %5904, i64 %5597, !dbg !354 + %8962 = getelementptr float, ptr addrspace(1) %5904, i64 %5598, !dbg !354 + %8963 = getelementptr float, ptr addrspace(1) %5904, i64 %5599, !dbg !354 + %8964 = getelementptr float, ptr addrspace(1) %5904, i64 %5600, !dbg !354 + %8965 = getelementptr float, ptr addrspace(1) %5904, i64 %5601, !dbg !354 + %8966 = getelementptr float, ptr addrspace(1) %5904, i64 %5602, !dbg !354 + %8967 = getelementptr float, ptr addrspace(1) %5904, i64 %5603, !dbg !354 + %8968 = getelementptr float, ptr addrspace(1) %5904, i64 %5604, !dbg !354 + %8969 = getelementptr float, ptr addrspace(1) %5904, i64 %5605, !dbg !354 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5333, ptr addrspace(1) %8954, i32 %5622, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5335, ptr addrspace(1) %8955, i32 %5623, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5337, ptr addrspace(1) %8956, i32 %5624, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5340, ptr addrspace(1) %8957, i32 %5625, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5343, ptr addrspace(1) %8958, i32 %5626, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5346, ptr addrspace(1) %8959, i32 %5627, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5349, ptr addrspace(1) %8960, i32 %5628, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5352, ptr addrspace(1) %8961, i32 %5629, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5355, ptr addrspace(1) %8962, i32 %5630, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5358, ptr addrspace(1) %8963, i32 %5631, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5361, ptr addrspace(1) %8964, i32 %5632, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5364, ptr addrspace(1) %8965, i32 %5633, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5367, ptr addrspace(1) %8966, i32 %5634, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5370, ptr addrspace(1) %8967, i32 %5635, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5373, ptr addrspace(1) %8968, i32 %5636, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5376, ptr addrspace(1) %8969, i32 %5637, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5378, ptr addrspace(1) %8950, i32 %5570) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5379, ptr addrspace(1) %8951, i32 %5571) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5380, ptr addrspace(1) %8952, i32 %5572) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5381, ptr addrspace(1) %8953, i32 %5573) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %8970 = getelementptr float, ptr addrspace(1) %5905, i64 %5590, !dbg !357 + %8971 = getelementptr float, ptr addrspace(1) %5905, i64 %5591, !dbg !357 + %8972 = getelementptr float, ptr addrspace(1) %5905, i64 %5592, !dbg !357 + %8973 = getelementptr float, ptr addrspace(1) %5905, i64 %5593, !dbg !357 + %8974 = getelementptr float, ptr addrspace(1) %5905, i64 %5594, !dbg !357 + %8975 = getelementptr float, ptr addrspace(1) %5905, i64 %5595, !dbg !357 + %8976 = getelementptr float, ptr addrspace(1) %5905, i64 %5596, !dbg !357 + %8977 = getelementptr float, ptr addrspace(1) %5905, i64 %5597, !dbg !357 + %8978 = getelementptr float, ptr addrspace(1) %5905, i64 %5598, !dbg !357 + %8979 = getelementptr float, ptr addrspace(1) %5905, i64 %5599, !dbg !357 + %8980 = getelementptr float, ptr addrspace(1) %5905, i64 %5600, !dbg !357 + %8981 = getelementptr float, ptr addrspace(1) %5905, i64 %5601, !dbg !357 + %8982 = getelementptr float, ptr addrspace(1) %5905, i64 %5602, !dbg !357 + %8983 = getelementptr float, ptr addrspace(1) %5905, i64 %5603, !dbg !357 + %8984 = getelementptr float, ptr addrspace(1) %5905, i64 %5604, !dbg !357 + %8985 = getelementptr float, ptr addrspace(1) %5905, i64 %5605, !dbg !357 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5382, ptr addrspace(1) %8970, i32 %5622, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5383, ptr addrspace(1) %8971, i32 %5623, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5384, ptr addrspace(1) %8972, i32 %5624, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5385, ptr addrspace(1) %8973, i32 %5625, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5386, ptr addrspace(1) %8974, i32 %5626, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5387, ptr addrspace(1) %8975, i32 %5627, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5388, ptr addrspace(1) %8976, i32 %5628, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5389, ptr addrspace(1) %8977, i32 %5629, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5390, ptr addrspace(1) %8978, i32 %5630, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5391, ptr addrspace(1) %8979, i32 %5631, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5392, ptr addrspace(1) %8980, i32 %5632, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5393, ptr addrspace(1) %8981, i32 %5633, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5394, ptr addrspace(1) %8982, i32 %5634, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5395, ptr addrspace(1) %8983, i32 %5635, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5396, ptr addrspace(1) %8984, i32 %5636, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5397, ptr addrspace(1) %8985, i32 %5637, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + %8986 = getelementptr i8, ptr addrspace(1) %8942, i64 524288, !dbg !359 + %8987 = getelementptr i8, ptr addrspace(1) %8943, i64 524288, !dbg !359 + %8988 = getelementptr i8, ptr addrspace(1) %8944, i64 524288, !dbg !359 + %8989 = getelementptr i8, ptr addrspace(1) %8945, i64 524288, !dbg !359 + %8990 = getelementptr i8, ptr addrspace(1) %8950, i64 16384, !dbg !360 + %8991 = getelementptr i8, ptr addrspace(1) %8951, i64 16384, !dbg !360 + %8992 = getelementptr i8, ptr addrspace(1) %8952, i64 16384, !dbg !360 + %8993 = getelementptr i8, ptr addrspace(1) %8953, i64 16384, !dbg !360 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5441, ptr addrspace(1) %8986, i32 %5667) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5443, ptr addrspace(1) %8987, i32 %5668) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5445, ptr addrspace(1) %8988, i32 %5669) #3, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5447, ptr addrspace(1) %8989, i32 %5670) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %8994 = getelementptr float, ptr addrspace(1) %5904, i64 %5687, !dbg !354 + %8995 = getelementptr float, ptr addrspace(1) %5904, i64 %5688, !dbg !354 + %8996 = getelementptr float, ptr addrspace(1) %5904, i64 %5689, !dbg !354 + %8997 = getelementptr float, ptr addrspace(1) %5904, i64 %5690, !dbg !354 + %8998 = getelementptr float, ptr addrspace(1) %5904, i64 %5691, !dbg !354 + %8999 = getelementptr float, ptr addrspace(1) %5904, i64 %5692, !dbg !354 + %9000 = getelementptr float, ptr addrspace(1) %5904, i64 %5693, !dbg !354 + %9001 = getelementptr float, ptr addrspace(1) %5904, i64 %5694, !dbg !354 + %9002 = getelementptr float, ptr addrspace(1) %5904, i64 %5695, !dbg !354 + %9003 = getelementptr float, ptr addrspace(1) %5904, i64 %5696, !dbg !354 + %9004 = getelementptr float, ptr addrspace(1) %5904, i64 %5697, !dbg !354 + %9005 = getelementptr float, ptr addrspace(1) %5904, i64 %5698, !dbg !354 + %9006 = getelementptr float, ptr addrspace(1) %5904, i64 %5699, !dbg !354 + %9007 = getelementptr float, ptr addrspace(1) %5904, i64 %5700, !dbg !354 + %9008 = getelementptr float, ptr addrspace(1) %5904, i64 %5701, !dbg !354 + %9009 = getelementptr float, ptr addrspace(1) %5904, i64 %5702, !dbg !354 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5499, ptr addrspace(1) %8994, i32 %5719, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5501, ptr addrspace(1) %8995, i32 %5720, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5503, ptr addrspace(1) %8996, i32 %5721, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5505, ptr addrspace(1) %8997, i32 %5722, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5507, ptr addrspace(1) %8998, i32 %5723, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5509, ptr addrspace(1) %8999, i32 %5724, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5511, ptr addrspace(1) %9000, i32 %5725, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5513, ptr addrspace(1) %9001, i32 %5726, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5515, ptr addrspace(1) %9002, i32 %5727, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5517, ptr addrspace(1) %9003, i32 %5728, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5519, ptr addrspace(1) %9004, i32 %5729, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5521, ptr addrspace(1) %9005, i32 %5730, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5523, ptr addrspace(1) %9006, i32 %5731, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5525, ptr addrspace(1) %9007, i32 %5732, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5527, ptr addrspace(1) %9008, i32 %5733, i1 %5331) #3, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5529, ptr addrspace(1) %9009, i32 %5734, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5531, ptr addrspace(1) %8990, i32 %5667) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5532, ptr addrspace(1) %8991, i32 %5668) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5533, ptr addrspace(1) %8992, i32 %5669) #3, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5534, ptr addrspace(1) %8993, i32 %5670) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %9010 = getelementptr float, ptr addrspace(1) %5905, i64 %5687, !dbg !357 + %9011 = getelementptr float, ptr addrspace(1) %5905, i64 %5688, !dbg !357 + %9012 = getelementptr float, ptr addrspace(1) %5905, i64 %5689, !dbg !357 + %9013 = getelementptr float, ptr addrspace(1) %5905, i64 %5690, !dbg !357 + %9014 = getelementptr float, ptr addrspace(1) %5905, i64 %5691, !dbg !357 + %9015 = getelementptr float, ptr addrspace(1) %5905, i64 %5692, !dbg !357 + %9016 = getelementptr float, ptr addrspace(1) %5905, i64 %5693, !dbg !357 + %9017 = getelementptr float, ptr addrspace(1) %5905, i64 %5694, !dbg !357 + %9018 = getelementptr float, ptr addrspace(1) %5905, i64 %5695, !dbg !357 + %9019 = getelementptr float, ptr addrspace(1) %5905, i64 %5696, !dbg !357 + %9020 = getelementptr float, ptr addrspace(1) %5905, i64 %5697, !dbg !357 + %9021 = getelementptr float, ptr addrspace(1) %5905, i64 %5698, !dbg !357 + %9022 = getelementptr float, ptr addrspace(1) %5905, i64 %5699, !dbg !357 + %9023 = getelementptr float, ptr addrspace(1) %5905, i64 %5700, !dbg !357 + %9024 = getelementptr float, ptr addrspace(1) %5905, i64 %5701, !dbg !357 + %9025 = getelementptr float, ptr addrspace(1) %5905, i64 %5702, !dbg !357 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %5535, ptr addrspace(1) %9010, i32 %5719, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5536, ptr addrspace(1) %9011, i32 %5720, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5537, ptr addrspace(1) %9012, i32 %5721, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5538, ptr addrspace(1) %9013, i32 %5722, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5539, ptr addrspace(1) %9014, i32 %5723, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5540, ptr addrspace(1) %9015, i32 %5724, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5541, ptr addrspace(1) %9016, i32 %5725, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5542, ptr addrspace(1) %9017, i32 %5726, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5543, ptr addrspace(1) %9018, i32 %5727, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5544, ptr addrspace(1) %9019, i32 %5728, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5545, ptr addrspace(1) %9020, i32 %5729, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5546, ptr addrspace(1) %9021, i32 %5730, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5547, ptr addrspace(1) %9022, i32 %5731, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5548, ptr addrspace(1) %9023, i32 %5732, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5549, ptr addrspace(1) %9024, i32 %5733, i1 %5331) #3, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5550, ptr addrspace(1) %9025, i32 %5734, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + br i1 %5561, label %.lr.ph1793, label %._crit_edge1794, !dbg !361 + +.lr.ph1793: ; preds = %._crit_edge1621, %__nv_exp2f.exit1239 + %.pn605.pn1791 = phi i32 [ %.pn6051775, %__nv_exp2f.exit1239 ], [ %5232, %._crit_edge1621 ] + %.pn607.pn1790 = phi i32 [ %.pn6071774, %__nv_exp2f.exit1239 ], [ %5230, %._crit_edge1621 ] + %.pn609.pn1789 = phi i32 [ %.pn6091773, %__nv_exp2f.exit1239 ], [ %5228, %._crit_edge1621 ] + %.pn611.pn1788 = phi i32 [ %.pn6111772, %__nv_exp2f.exit1239 ], [ %5226, %._crit_edge1621 ] + %.pn613.pn1787 = phi i32 [ %.pn6131771, %__nv_exp2f.exit1239 ], [ %5224, %._crit_edge1621 ] + %.pn615.pn1786 = phi i32 [ %.pn6151770, %__nv_exp2f.exit1239 ], [ %5222, %._crit_edge1621 ] + %.pn617.pn1785 = phi i32 [ %.pn6171769, %__nv_exp2f.exit1239 ], [ %5220, %._crit_edge1621 ] + %.pn619.pn1784 = phi i32 [ %.pn6191768, %__nv_exp2f.exit1239 ], [ %5218, %._crit_edge1621 ] + %.pn621.pn1783 = phi i32 [ %.pn6211767, %__nv_exp2f.exit1239 ], [ %5216, %._crit_edge1621 ] + %.pn623.pn1782 = phi i32 [ %.pn6231766, %__nv_exp2f.exit1239 ], [ %5214, %._crit_edge1621 ] + %.pn625.pn1781 = phi i32 [ %.pn6251765, %__nv_exp2f.exit1239 ], [ %5212, %._crit_edge1621 ] + %.pn627.pn1780 = phi i32 [ %.pn6271764, %__nv_exp2f.exit1239 ], [ %5210, %._crit_edge1621 ] + %.pn629.pn1779 = phi i32 [ %.pn6291763, %__nv_exp2f.exit1239 ], [ %5208, %._crit_edge1621 ] + %.pn631.pn1778 = phi i32 [ %.pn6311762, %__nv_exp2f.exit1239 ], [ %5207, %._crit_edge1621 ] + %.pn633.pn1777 = phi i32 [ %.pn6331761, %__nv_exp2f.exit1239 ], [ %5206, %._crit_edge1621 ] + %.pn635.pn1776 = phi i32 [ %.pn6351760, %__nv_exp2f.exit1239 ], [ %5205, %._crit_edge1621 ] + %9026 = phi i32 [ %9043, %__nv_exp2f.exit1239 ], [ -1, %._crit_edge1621 ] + %9027 = phi i32 [ %11065, %__nv_exp2f.exit1239 ], [ 1, %._crit_edge1621 ] + %9028 = phi i32 [ %9046, %__nv_exp2f.exit1239 ], [ -1, %._crit_edge1621 ] + %9029 = phi i32 [ %11068, %__nv_exp2f.exit1239 ], [ 1, %._crit_edge1621 ] + %.pn6051775 = phi i32 [ %11054, %__nv_exp2f.exit1239 ], [ %5654, %._crit_edge1621 ] + %.pn6071774 = phi i32 [ %11053, %__nv_exp2f.exit1239 ], [ %5653, %._crit_edge1621 ] + %.pn6091773 = phi i32 [ %11052, %__nv_exp2f.exit1239 ], [ %5652, %._crit_edge1621 ] + %.pn6111772 = phi i32 [ %11051, %__nv_exp2f.exit1239 ], [ %5651, %._crit_edge1621 ] + %.pn6131771 = phi i32 [ %11050, %__nv_exp2f.exit1239 ], [ %5650, %._crit_edge1621 ] + %.pn6151770 = phi i32 [ %11049, %__nv_exp2f.exit1239 ], [ %5649, %._crit_edge1621 ] + %.pn6171769 = phi i32 [ %11048, %__nv_exp2f.exit1239 ], [ %5648, %._crit_edge1621 ] + %.pn6191768 = phi i32 [ %11047, %__nv_exp2f.exit1239 ], [ %5647, %._crit_edge1621 ] + %.pn6211767 = phi i32 [ %11046, %__nv_exp2f.exit1239 ], [ %5646, %._crit_edge1621 ] + %.pn6231766 = phi i32 [ %11045, %__nv_exp2f.exit1239 ], [ %5645, %._crit_edge1621 ] + %.pn6251765 = phi i32 [ %11044, %__nv_exp2f.exit1239 ], [ %5644, %._crit_edge1621 ] + %.pn6271764 = phi i32 [ %11043, %__nv_exp2f.exit1239 ], [ %5643, %._crit_edge1621 ] + %.pn6291763 = phi i32 [ %11042, %__nv_exp2f.exit1239 ], [ %5642, %._crit_edge1621 ] + %.pn6311762 = phi i32 [ %11041, %__nv_exp2f.exit1239 ], [ %5641, %._crit_edge1621 ] + %.pn6331761 = phi i32 [ %11040, %__nv_exp2f.exit1239 ], [ %5640, %._crit_edge1621 ] + %.pn6351760 = phi i32 [ %11039, %__nv_exp2f.exit1239 ], [ %5639, %._crit_edge1621 ] + %9030 = phi i32 [ %11059, %__nv_exp2f.exit1239 ], [ %5655, %._crit_edge1621 ] + %9031 = phi i32 [ %11060, %__nv_exp2f.exit1239 ], [ %5656, %._crit_edge1621 ] + %9032 = phi i32 [ %11061, %__nv_exp2f.exit1239 ], [ %5657, %._crit_edge1621 ] + %9033 = phi i32 [ %11062, %__nv_exp2f.exit1239 ], [ %5658, %._crit_edge1621 ] + %.pn5551759 = phi ptr addrspace(1) [ %11038, %__nv_exp2f.exit1239 ], [ %8993, %._crit_edge1621 ] + %.pn5711758 = phi ptr addrspace(1) [ %11037, %__nv_exp2f.exit1239 ], [ %8992, %._crit_edge1621 ] + %.pn5871757 = phi ptr addrspace(1) [ %11036, %__nv_exp2f.exit1239 ], [ %8991, %._crit_edge1621 ] + %.pn6031756 = phi ptr addrspace(1) [ %11035, %__nv_exp2f.exit1239 ], [ %8990, %._crit_edge1621 ] + %9034 = phi i32 [ %11055, %__nv_exp2f.exit1239 ], [ %5655, %._crit_edge1621 ] + %9035 = phi i32 [ %11056, %__nv_exp2f.exit1239 ], [ %5656, %._crit_edge1621 ] + %9036 = phi i32 [ %11057, %__nv_exp2f.exit1239 ], [ %5657, %._crit_edge1621 ] + %9037 = phi i32 [ %11058, %__nv_exp2f.exit1239 ], [ %5658, %._crit_edge1621 ] + %.pn4911755 = phi ptr addrspace(1) [ %11032, %__nv_exp2f.exit1239 ], [ %8989, %._crit_edge1621 ] + %.pn5071754 = phi ptr addrspace(1) [ %11031, %__nv_exp2f.exit1239 ], [ %8988, %._crit_edge1621 ] + %.pn5231753 = phi ptr addrspace(1) [ %11030, %__nv_exp2f.exit1239 ], [ %8987, %._crit_edge1621 ] + %.pn5391752 = phi ptr addrspace(1) [ %11029, %__nv_exp2f.exit1239 ], [ %8986, %._crit_edge1621 ] + %.pn3491751 = phi float [ %10150, %__nv_exp2f.exit1239 ], [ %8873, %._crit_edge1621 ] + %.pn3511750 = phi float [ %10149, %__nv_exp2f.exit1239 ], [ %8872, %._crit_edge1621 ] + %.pn3531749 = phi float [ %10148, %__nv_exp2f.exit1239 ], [ %8871, %._crit_edge1621 ] + %.pn3551748 = phi float [ %10147, %__nv_exp2f.exit1239 ], [ %8870, %._crit_edge1621 ] + %.pn3571747 = phi float [ %10146, %__nv_exp2f.exit1239 ], [ %8869, %._crit_edge1621 ] + %.pn3591746 = phi float [ %10145, %__nv_exp2f.exit1239 ], [ %8868, %._crit_edge1621 ] + %.pn3611745 = phi float [ %10144, %__nv_exp2f.exit1239 ], [ %8867, %._crit_edge1621 ] + %.pn3631744 = phi float [ %10143, %__nv_exp2f.exit1239 ], [ %8866, %._crit_edge1621 ] + %.pn3651743 = phi float [ %10142, %__nv_exp2f.exit1239 ], [ %8865, %._crit_edge1621 ] + %.pn3671742 = phi float [ %10141, %__nv_exp2f.exit1239 ], [ %8864, %._crit_edge1621 ] + %.pn3691741 = phi float [ %10140, %__nv_exp2f.exit1239 ], [ %8863, %._crit_edge1621 ] + %.pn3711740 = phi float [ %10139, %__nv_exp2f.exit1239 ], [ %8862, %._crit_edge1621 ] + %.pn3731739 = phi float [ %10138, %__nv_exp2f.exit1239 ], [ %8861, %._crit_edge1621 ] + %.pn3751738 = phi float [ %10137, %__nv_exp2f.exit1239 ], [ %8860, %._crit_edge1621 ] + %.pn3771737 = phi float [ %10136, %__nv_exp2f.exit1239 ], [ %8859, %._crit_edge1621 ] + %.pn3791736 = phi float [ %10135, %__nv_exp2f.exit1239 ], [ %8858, %._crit_edge1621 ] + %.pn3811735 = phi float [ %10134, %__nv_exp2f.exit1239 ], [ %8857, %._crit_edge1621 ] + %.pn3831734 = phi float [ %10133, %__nv_exp2f.exit1239 ], [ %8856, %._crit_edge1621 ] + %.pn3851733 = phi float [ %10132, %__nv_exp2f.exit1239 ], [ %8855, %._crit_edge1621 ] + %.pn3871732 = phi float [ %10131, %__nv_exp2f.exit1239 ], [ %8854, %._crit_edge1621 ] + %.pn3891731 = phi float [ %10130, %__nv_exp2f.exit1239 ], [ %8853, %._crit_edge1621 ] + %.pn3911730 = phi float [ %10129, %__nv_exp2f.exit1239 ], [ %8852, %._crit_edge1621 ] + %.pn3931729 = phi float [ %10128, %__nv_exp2f.exit1239 ], [ %8851, %._crit_edge1621 ] + %.pn3951728 = phi float [ %10127, %__nv_exp2f.exit1239 ], [ %8850, %._crit_edge1621 ] + %.pn3971727 = phi float [ %10126, %__nv_exp2f.exit1239 ], [ %8849, %._crit_edge1621 ] + %.pn3991726 = phi float [ %10125, %__nv_exp2f.exit1239 ], [ %8848, %._crit_edge1621 ] + %.pn4011725 = phi float [ %10124, %__nv_exp2f.exit1239 ], [ %8847, %._crit_edge1621 ] + %.pn4031724 = phi float [ %10123, %__nv_exp2f.exit1239 ], [ %8846, %._crit_edge1621 ] + %.pn4051723 = phi float [ %10122, %__nv_exp2f.exit1239 ], [ %8845, %._crit_edge1621 ] + %.pn4071722 = phi float [ %10121, %__nv_exp2f.exit1239 ], [ %8844, %._crit_edge1621 ] + %.pn4091721 = phi float [ %10120, %__nv_exp2f.exit1239 ], [ %8843, %._crit_edge1621 ] + %.pn4111720 = phi float [ %10119, %__nv_exp2f.exit1239 ], [ %8842, %._crit_edge1621 ] + %.pn4131719 = phi float [ %10118, %__nv_exp2f.exit1239 ], [ %8841, %._crit_edge1621 ] + %.pn4151718 = phi float [ %10117, %__nv_exp2f.exit1239 ], [ %8840, %._crit_edge1621 ] + %.pn4171717 = phi float [ %10116, %__nv_exp2f.exit1239 ], [ %8839, %._crit_edge1621 ] + %.pn4191716 = phi float [ %10115, %__nv_exp2f.exit1239 ], [ %8838, %._crit_edge1621 ] + %.pn4211715 = phi float [ %10114, %__nv_exp2f.exit1239 ], [ %8837, %._crit_edge1621 ] + %.pn4231714 = phi float [ %10113, %__nv_exp2f.exit1239 ], [ %8836, %._crit_edge1621 ] + %.pn4251713 = phi float [ %10112, %__nv_exp2f.exit1239 ], [ %8835, %._crit_edge1621 ] + %.pn4271712 = phi float [ %10111, %__nv_exp2f.exit1239 ], [ %8834, %._crit_edge1621 ] + %.pn4291711 = phi float [ %10110, %__nv_exp2f.exit1239 ], [ %8833, %._crit_edge1621 ] + %.pn4311710 = phi float [ %10109, %__nv_exp2f.exit1239 ], [ %8832, %._crit_edge1621 ] + %.pn4331709 = phi float [ %10108, %__nv_exp2f.exit1239 ], [ %8831, %._crit_edge1621 ] + %.pn4351708 = phi float [ %10107, %__nv_exp2f.exit1239 ], [ %8830, %._crit_edge1621 ] + %.pn4371707 = phi float [ %10106, %__nv_exp2f.exit1239 ], [ %8829, %._crit_edge1621 ] + %.pn4391706 = phi float [ %10105, %__nv_exp2f.exit1239 ], [ %8828, %._crit_edge1621 ] + %.pn4411705 = phi float [ %10104, %__nv_exp2f.exit1239 ], [ %8827, %._crit_edge1621 ] + %.pn4431704 = phi float [ %10103, %__nv_exp2f.exit1239 ], [ %8826, %._crit_edge1621 ] + %.pn4451703 = phi float [ %10102, %__nv_exp2f.exit1239 ], [ %8825, %._crit_edge1621 ] + %.pn4471702 = phi float [ %10101, %__nv_exp2f.exit1239 ], [ %8824, %._crit_edge1621 ] + %.pn4491701 = phi float [ %10100, %__nv_exp2f.exit1239 ], [ %8823, %._crit_edge1621 ] + %.pn4511700 = phi float [ %10099, %__nv_exp2f.exit1239 ], [ %8822, %._crit_edge1621 ] + %.pn4531699 = phi float [ %10098, %__nv_exp2f.exit1239 ], [ %8821, %._crit_edge1621 ] + %.pn4551698 = phi float [ %10097, %__nv_exp2f.exit1239 ], [ %8820, %._crit_edge1621 ] + %.pn4571697 = phi float [ %10096, %__nv_exp2f.exit1239 ], [ %8819, %._crit_edge1621 ] + %.pn4591696 = phi float [ %10095, %__nv_exp2f.exit1239 ], [ %8818, %._crit_edge1621 ] + %.pn4611695 = phi float [ %10094, %__nv_exp2f.exit1239 ], [ %8817, %._crit_edge1621 ] + %.pn4631694 = phi float [ %10093, %__nv_exp2f.exit1239 ], [ %8816, %._crit_edge1621 ] + %.pn4651693 = phi float [ %10092, %__nv_exp2f.exit1239 ], [ %8815, %._crit_edge1621 ] + %.pn4671692 = phi float [ %10091, %__nv_exp2f.exit1239 ], [ %8814, %._crit_edge1621 ] + %.pn4691691 = phi float [ %10090, %__nv_exp2f.exit1239 ], [ %8813, %._crit_edge1621 ] + %.pn4711690 = phi float [ %10089, %__nv_exp2f.exit1239 ], [ %8812, %._crit_edge1621 ] + %.pn4731689 = phi float [ %10088, %__nv_exp2f.exit1239 ], [ %8811, %._crit_edge1621 ] + %.pn4751688 = phi float [ %10087, %__nv_exp2f.exit1239 ], [ %8810, %._crit_edge1621 ] + %.pn2211687 = phi float [ %11006, %__nv_exp2f.exit1239 ], [ %8937, %._crit_edge1621 ] + %.pn2231686 = phi float [ %11005, %__nv_exp2f.exit1239 ], [ %8936, %._crit_edge1621 ] + %.pn2251685 = phi float [ %11004, %__nv_exp2f.exit1239 ], [ %8935, %._crit_edge1621 ] + %.pn2271684 = phi float [ %11003, %__nv_exp2f.exit1239 ], [ %8934, %._crit_edge1621 ] + %.pn2291683 = phi float [ %11002, %__nv_exp2f.exit1239 ], [ %8933, %._crit_edge1621 ] + %.pn2311682 = phi float [ %11001, %__nv_exp2f.exit1239 ], [ %8932, %._crit_edge1621 ] + %.pn2331681 = phi float [ %11000, %__nv_exp2f.exit1239 ], [ %8931, %._crit_edge1621 ] + %.pn2351680 = phi float [ %10999, %__nv_exp2f.exit1239 ], [ %8930, %._crit_edge1621 ] + %.pn2371679 = phi float [ %10998, %__nv_exp2f.exit1239 ], [ %8929, %._crit_edge1621 ] + %.pn2391678 = phi float [ %10997, %__nv_exp2f.exit1239 ], [ %8928, %._crit_edge1621 ] + %.pn2411677 = phi float [ %10996, %__nv_exp2f.exit1239 ], [ %8927, %._crit_edge1621 ] + %.pn2431676 = phi float [ %10995, %__nv_exp2f.exit1239 ], [ %8926, %._crit_edge1621 ] + %.pn2451675 = phi float [ %10994, %__nv_exp2f.exit1239 ], [ %8925, %._crit_edge1621 ] + %.pn2471674 = phi float [ %10993, %__nv_exp2f.exit1239 ], [ %8924, %._crit_edge1621 ] + %.pn2491673 = phi float [ %10992, %__nv_exp2f.exit1239 ], [ %8923, %._crit_edge1621 ] + %.pn2511672 = phi float [ %10991, %__nv_exp2f.exit1239 ], [ %8922, %._crit_edge1621 ] + %.pn2531671 = phi float [ %10990, %__nv_exp2f.exit1239 ], [ %8921, %._crit_edge1621 ] + %.pn2551670 = phi float [ %10989, %__nv_exp2f.exit1239 ], [ %8920, %._crit_edge1621 ] + %.pn2571669 = phi float [ %10988, %__nv_exp2f.exit1239 ], [ %8919, %._crit_edge1621 ] + %.pn2591668 = phi float [ %10987, %__nv_exp2f.exit1239 ], [ %8918, %._crit_edge1621 ] + %.pn2611667 = phi float [ %10986, %__nv_exp2f.exit1239 ], [ %8917, %._crit_edge1621 ] + %.pn2631666 = phi float [ %10985, %__nv_exp2f.exit1239 ], [ %8916, %._crit_edge1621 ] + %.pn2651665 = phi float [ %10984, %__nv_exp2f.exit1239 ], [ %8915, %._crit_edge1621 ] + %.pn2671664 = phi float [ %10983, %__nv_exp2f.exit1239 ], [ %8914, %._crit_edge1621 ] + %.pn2691663 = phi float [ %10982, %__nv_exp2f.exit1239 ], [ %8913, %._crit_edge1621 ] + %.pn2711662 = phi float [ %10981, %__nv_exp2f.exit1239 ], [ %8912, %._crit_edge1621 ] + %.pn2731661 = phi float [ %10980, %__nv_exp2f.exit1239 ], [ %8911, %._crit_edge1621 ] + %.pn2751660 = phi float [ %10979, %__nv_exp2f.exit1239 ], [ %8910, %._crit_edge1621 ] + %.pn2771659 = phi float [ %10978, %__nv_exp2f.exit1239 ], [ %8909, %._crit_edge1621 ] + %.pn2791658 = phi float [ %10977, %__nv_exp2f.exit1239 ], [ %8908, %._crit_edge1621 ] + %.pn2811657 = phi float [ %10976, %__nv_exp2f.exit1239 ], [ %8907, %._crit_edge1621 ] + %.pn2831656 = phi float [ %10975, %__nv_exp2f.exit1239 ], [ %8906, %._crit_edge1621 ] + %.pn2851655 = phi float [ %10974, %__nv_exp2f.exit1239 ], [ %8905, %._crit_edge1621 ] + %.pn2871654 = phi float [ %10973, %__nv_exp2f.exit1239 ], [ %8904, %._crit_edge1621 ] + %.pn2891653 = phi float [ %10972, %__nv_exp2f.exit1239 ], [ %8903, %._crit_edge1621 ] + %.pn2911652 = phi float [ %10971, %__nv_exp2f.exit1239 ], [ %8902, %._crit_edge1621 ] + %.pn2931651 = phi float [ %10970, %__nv_exp2f.exit1239 ], [ %8901, %._crit_edge1621 ] + %.pn2951650 = phi float [ %10969, %__nv_exp2f.exit1239 ], [ %8900, %._crit_edge1621 ] + %.pn2971649 = phi float [ %10968, %__nv_exp2f.exit1239 ], [ %8899, %._crit_edge1621 ] + %.pn2991648 = phi float [ %10967, %__nv_exp2f.exit1239 ], [ %8898, %._crit_edge1621 ] + %.pn3011647 = phi float [ %10966, %__nv_exp2f.exit1239 ], [ %8897, %._crit_edge1621 ] + %.pn3031646 = phi float [ %10965, %__nv_exp2f.exit1239 ], [ %8896, %._crit_edge1621 ] + %.pn3051645 = phi float [ %10964, %__nv_exp2f.exit1239 ], [ %8895, %._crit_edge1621 ] + %.pn3071644 = phi float [ %10963, %__nv_exp2f.exit1239 ], [ %8894, %._crit_edge1621 ] + %.pn3091643 = phi float [ %10962, %__nv_exp2f.exit1239 ], [ %8893, %._crit_edge1621 ] + %.pn3111642 = phi float [ %10961, %__nv_exp2f.exit1239 ], [ %8892, %._crit_edge1621 ] + %.pn3131641 = phi float [ %10960, %__nv_exp2f.exit1239 ], [ %8891, %._crit_edge1621 ] + %.pn3151640 = phi float [ %10959, %__nv_exp2f.exit1239 ], [ %8890, %._crit_edge1621 ] + %.pn3171639 = phi float [ %10958, %__nv_exp2f.exit1239 ], [ %8889, %._crit_edge1621 ] + %.pn3191638 = phi float [ %10957, %__nv_exp2f.exit1239 ], [ %8888, %._crit_edge1621 ] + %.pn3211637 = phi float [ %10956, %__nv_exp2f.exit1239 ], [ %8887, %._crit_edge1621 ] + %.pn3231636 = phi float [ %10955, %__nv_exp2f.exit1239 ], [ %8886, %._crit_edge1621 ] + %.pn3251635 = phi float [ %10954, %__nv_exp2f.exit1239 ], [ %8885, %._crit_edge1621 ] + %.pn3271634 = phi float [ %10953, %__nv_exp2f.exit1239 ], [ %8884, %._crit_edge1621 ] + %.pn3291633 = phi float [ %10952, %__nv_exp2f.exit1239 ], [ %8883, %._crit_edge1621 ] + %.pn3311632 = phi float [ %10951, %__nv_exp2f.exit1239 ], [ %8882, %._crit_edge1621 ] + %.pn3331631 = phi float [ %10950, %__nv_exp2f.exit1239 ], [ %8881, %._crit_edge1621 ] + %.pn3351630 = phi float [ %10949, %__nv_exp2f.exit1239 ], [ %8880, %._crit_edge1621 ] + %.pn3371629 = phi float [ %10948, %__nv_exp2f.exit1239 ], [ %8879, %._crit_edge1621 ] + %.pn3391628 = phi float [ %10947, %__nv_exp2f.exit1239 ], [ %8878, %._crit_edge1621 ] + %.pn3411627 = phi float [ %10946, %__nv_exp2f.exit1239 ], [ %8877, %._crit_edge1621 ] + %.pn3431626 = phi float [ %10945, %__nv_exp2f.exit1239 ], [ %8876, %._crit_edge1621 ] + %.pn3451625 = phi float [ %10944, %__nv_exp2f.exit1239 ], [ %8875, %._crit_edge1621 ] + %.pn3471624 = phi float [ %10943, %__nv_exp2f.exit1239 ], [ %8874, %._crit_edge1621 ] + %9038 = phi i32 [ %11007, %__nv_exp2f.exit1239 ], [ 0, %._crit_edge1621 ] + %9039 = icmp slt i32 %9038, %5735, !dbg !361 + %9040 = icmp slt i32 %9038, %5736, !dbg !361 + %9041 = add i32 %9026, 1, !dbg !361 + %9042 = icmp sgt i32 %9041, 1, !dbg !361 + %9043 = select i1 %9042, i32 0, i32 %9041, !dbg !361 + %9044 = add i32 %9028, 1, !dbg !361 + %9045 = icmp sgt i32 %9044, 2, !dbg !361 + %9046 = select i1 %9045, i32 0, i32 %9044, !dbg !361 + %9047 = icmp slt i32 %.pn635.pn1776, %18, !dbg !362 + %9048 = icmp slt i32 %.pn633.pn1777, %18, !dbg !362 + %9049 = icmp slt i32 %.pn631.pn1778, %18, !dbg !362 + %9050 = icmp slt i32 %.pn629.pn1779, %18, !dbg !362 + %9051 = icmp slt i32 %.pn627.pn1780, %18, !dbg !362 + %9052 = icmp slt i32 %.pn625.pn1781, %18, !dbg !362 + %9053 = icmp slt i32 %.pn623.pn1782, %18, !dbg !362 + %9054 = icmp slt i32 %.pn621.pn1783, %18, !dbg !362 + %9055 = icmp slt i32 %.pn619.pn1784, %18, !dbg !362 + %9056 = icmp slt i32 %.pn617.pn1785, %18, !dbg !362 + %9057 = icmp slt i32 %.pn615.pn1786, %18, !dbg !362 + %9058 = icmp slt i32 %.pn613.pn1787, %18, !dbg !362 + %9059 = icmp slt i32 %.pn611.pn1788, %18, !dbg !362 + %9060 = icmp slt i32 %.pn609.pn1789, %18, !dbg !362 + %9061 = icmp slt i32 %.pn607.pn1790, %18, !dbg !362 + %9062 = icmp slt i32 %.pn605.pn1791, %18, !dbg !362 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !353 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + %9063 = shl i32 %9046, 13, !dbg !353 + %9064 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %9063, !dbg !353 + %9065 = shl i32 %9043, 6, !dbg !355 + %9066 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %9065, !dbg !355 + %9067 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5332, !dbg !355 + %9068 = load float, ptr addrspace(3) %9067, align 8, !dbg !355 + %9069 = getelementptr inbounds nuw i8, ptr addrspace(3) %9067, i32 4, !dbg !355 + %9070 = load float, ptr addrspace(3) %9069, align 4, !dbg !355 + %9071 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5336, !dbg !355 + %9072 = load float, ptr addrspace(3) %9071, align 8, !dbg !355 + %9073 = getelementptr inbounds nuw i8, ptr addrspace(3) %9071, i32 4, !dbg !355 + %9074 = load float, ptr addrspace(3) %9073, align 4, !dbg !355 + %9075 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5342, !dbg !355 + %9076 = load float, ptr addrspace(3) %9075, align 8, !dbg !355 + %9077 = getelementptr inbounds nuw i8, ptr addrspace(3) %9075, i32 4, !dbg !355 + %9078 = load float, ptr addrspace(3) %9077, align 4, !dbg !355 + %9079 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5348, !dbg !355 + %9080 = load float, ptr addrspace(3) %9079, align 8, !dbg !355 + %9081 = getelementptr inbounds nuw i8, ptr addrspace(3) %9079, i32 4, !dbg !355 + %9082 = load float, ptr addrspace(3) %9081, align 4, !dbg !355 + %9083 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5354, !dbg !355 + %9084 = load float, ptr addrspace(3) %9083, align 8, !dbg !355 + %9085 = getelementptr inbounds nuw i8, ptr addrspace(3) %9083, i32 4, !dbg !355 + %9086 = load float, ptr addrspace(3) %9085, align 4, !dbg !355 + %9087 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5360, !dbg !355 + %9088 = load float, ptr addrspace(3) %9087, align 8, !dbg !355 + %9089 = getelementptr inbounds nuw i8, ptr addrspace(3) %9087, i32 4, !dbg !355 + %9090 = load float, ptr addrspace(3) %9089, align 4, !dbg !355 + %9091 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5366, !dbg !355 + %9092 = load float, ptr addrspace(3) %9091, align 8, !dbg !355 + %9093 = getelementptr inbounds nuw i8, ptr addrspace(3) %9091, i32 4, !dbg !355 + %9094 = load float, ptr addrspace(3) %9093, align 4, !dbg !355 + %9095 = getelementptr inbounds nuw i8, ptr addrspace(3) %9066, i32 %5372, !dbg !355 + %9096 = load float, ptr addrspace(3) %9095, align 8, !dbg !355 + %9097 = getelementptr inbounds nuw i8, ptr addrspace(3) %9095, i32 4, !dbg !355 + %9098 = load float, ptr addrspace(3) %9097, align 4, !dbg !355 + %9099 = fcmp oeq float %9068, 0xFFF0000000000000, !dbg !363 + %9100 = fcmp oeq float %9070, 0xFFF0000000000000, !dbg !363 + %9101 = fcmp oeq float %9072, 0xFFF0000000000000, !dbg !363 + %9102 = fcmp oeq float %9074, 0xFFF0000000000000, !dbg !363 + %9103 = fcmp oeq float %9076, 0xFFF0000000000000, !dbg !363 + %9104 = fcmp oeq float %9078, 0xFFF0000000000000, !dbg !363 + %9105 = fcmp oeq float %9080, 0xFFF0000000000000, !dbg !363 + %9106 = fcmp oeq float %9082, 0xFFF0000000000000, !dbg !363 + %9107 = fcmp oeq float %9084, 0xFFF0000000000000, !dbg !363 + %9108 = fcmp oeq float %9086, 0xFFF0000000000000, !dbg !363 + %9109 = fcmp oeq float %9088, 0xFFF0000000000000, !dbg !363 + %9110 = fcmp oeq float %9090, 0xFFF0000000000000, !dbg !363 + %9111 = fcmp oeq float %9092, 0xFFF0000000000000, !dbg !363 + %9112 = fcmp oeq float %9094, 0xFFF0000000000000, !dbg !363 + %9113 = fcmp oeq float %9096, 0xFFF0000000000000, !dbg !363 + %9114 = fcmp oeq float %9098, 0xFFF0000000000000, !dbg !363 + %9115 = select i1 %9099, float 0.000000e+00, float %9068, !dbg !364 + %9116 = select i1 %9100, float 0.000000e+00, float %9070, !dbg !364 + %9117 = select i1 %9101, float 0.000000e+00, float %9072, !dbg !364 + %9118 = select i1 %9102, float 0.000000e+00, float %9074, !dbg !364 + %9119 = select i1 %9103, float 0.000000e+00, float %9076, !dbg !364 + %9120 = select i1 %9104, float 0.000000e+00, float %9078, !dbg !364 + %9121 = select i1 %9105, float 0.000000e+00, float %9080, !dbg !364 + %9122 = select i1 %9106, float 0.000000e+00, float %9082, !dbg !364 + %9123 = select i1 %9107, float 0.000000e+00, float %9084, !dbg !364 + %9124 = select i1 %9108, float 0.000000e+00, float %9086, !dbg !364 + %9125 = select i1 %9109, float 0.000000e+00, float %9088, !dbg !364 + %9126 = select i1 %9110, float 0.000000e+00, float %9090, !dbg !364 + %9127 = select i1 %9111, float 0.000000e+00, float %9092, !dbg !364 + %9128 = select i1 %9112, float 0.000000e+00, float %9094, !dbg !364 + %9129 = select i1 %9113, float 0.000000e+00, float %9096, !dbg !364 + %9130 = select i1 %9114, float 0.000000e+00, float %9098, !dbg !364 + %9131 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %57, i32 0, i32 31), !dbg !365 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !365 + %9132 = shl i32 %9131, 11, !dbg !365 + %9133 = and i32 %9132, 8192, !dbg !365 + %9134 = add i32 %9133, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9135 = lshr exact i32 %9134, 4, !dbg !365 + %9136 = and i32 %9135, 16383, !dbg !365 + %9137 = zext nneg i32 %9136 to i64, !dbg !365 + %9138 = or disjoint i64 %9137, 4611686293372403712, !dbg !365 + %9139 = ptrtoint ptr addrspace(3) %9064 to i32, !dbg !365 + %9140 = lshr exact i32 %9139, 4, !dbg !365 + %9141 = and i32 %9140, 16383, !dbg !365 + %9142 = zext nneg i32 %9141 to i64, !dbg !365 + %9143 = or disjoint i64 %9142, 4611686293338849280, !dbg !365 + %9144 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %9138, i64 %9143) #3, !dbg !365 + %9145 = or disjoint i32 %9133, 32, !dbg !365 + %9146 = add i32 %9145, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9147 = lshr exact i32 %9146, 4, !dbg !365 + %9148 = and i32 %9147, 16383, !dbg !365 + %9149 = zext nneg i32 %9148 to i64, !dbg !365 + %9150 = or disjoint i64 %9149, 4611686293372403712, !dbg !365 + %9151 = add i32 %9139, 32, !dbg !365 + %9152 = lshr exact i32 %9151, 4, !dbg !365 + %9153 = and i32 %9152, 16383, !dbg !365 + %9154 = zext nneg i32 %9153 to i64, !dbg !365 + %9155 = or disjoint i64 %9154, 4611686293338849280, !dbg !365 + %9156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 0, !dbg !365 + %9157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 1, !dbg !365 + %9158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 2, !dbg !365 + %9159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 3, !dbg !365 + %9160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 4, !dbg !365 + %9161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 5, !dbg !365 + %9162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 6, !dbg !365 + %9163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 7, !dbg !365 + %9164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 8, !dbg !365 + %9165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 9, !dbg !365 + %9166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 10, !dbg !365 + %9167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 11, !dbg !365 + %9168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 12, !dbg !365 + %9169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 13, !dbg !365 + %9170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 14, !dbg !365 + %9171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 15, !dbg !365 + %9172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 16, !dbg !365 + %9173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 17, !dbg !365 + %9174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 18, !dbg !365 + %9175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 19, !dbg !365 + %9176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 20, !dbg !365 + %9177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 21, !dbg !365 + %9178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 22, !dbg !365 + %9179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 23, !dbg !365 + %9180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 24, !dbg !365 + %9181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 25, !dbg !365 + %9182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 26, !dbg !365 + %9183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 27, !dbg !365 + %9184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 28, !dbg !365 + %9185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 29, !dbg !365 + %9186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 30, !dbg !365 + %9187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9144, 31, !dbg !365 + %9188 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9156, float %9157, float %9158, float %9159, float %9160, float %9161, float %9162, float %9163, float %9164, float %9165, float %9166, float %9167, float %9168, float %9169, float %9170, float %9171, float %9172, float %9173, float %9174, float %9175, float %9176, float %9177, float %9178, float %9179, float %9180, float %9181, float %9182, float %9183, float %9184, float %9185, float %9186, float %9187, i64 %9150, i64 %9155, i1 true) #3, !dbg !365 + %9189 = or disjoint i32 %9133, 64, !dbg !365 + %9190 = add i32 %9189, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9191 = lshr exact i32 %9190, 4, !dbg !365 + %9192 = and i32 %9191, 16383, !dbg !365 + %9193 = zext nneg i32 %9192 to i64, !dbg !365 + %9194 = or disjoint i64 %9193, 4611686293372403712, !dbg !365 + %9195 = add i32 %9139, 64, !dbg !365 + %9196 = lshr exact i32 %9195, 4, !dbg !365 + %9197 = and i32 %9196, 16383, !dbg !365 + %9198 = zext nneg i32 %9197 to i64, !dbg !365 + %9199 = or disjoint i64 %9198, 4611686293338849280, !dbg !365 + %9200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 0, !dbg !365 + %9201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 1, !dbg !365 + %9202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 2, !dbg !365 + %9203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 3, !dbg !365 + %9204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 4, !dbg !365 + %9205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 5, !dbg !365 + %9206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 6, !dbg !365 + %9207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 7, !dbg !365 + %9208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 8, !dbg !365 + %9209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 9, !dbg !365 + %9210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 10, !dbg !365 + %9211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 11, !dbg !365 + %9212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 12, !dbg !365 + %9213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 13, !dbg !365 + %9214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 14, !dbg !365 + %9215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 15, !dbg !365 + %9216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 16, !dbg !365 + %9217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 17, !dbg !365 + %9218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 18, !dbg !365 + %9219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 19, !dbg !365 + %9220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 20, !dbg !365 + %9221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 21, !dbg !365 + %9222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 22, !dbg !365 + %9223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 23, !dbg !365 + %9224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 24, !dbg !365 + %9225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 25, !dbg !365 + %9226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 26, !dbg !365 + %9227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 27, !dbg !365 + %9228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 28, !dbg !365 + %9229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 29, !dbg !365 + %9230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 30, !dbg !365 + %9231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9188, 31, !dbg !365 + %9232 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9200, float %9201, float %9202, float %9203, float %9204, float %9205, float %9206, float %9207, float %9208, float %9209, float %9210, float %9211, float %9212, float %9213, float %9214, float %9215, float %9216, float %9217, float %9218, float %9219, float %9220, float %9221, float %9222, float %9223, float %9224, float %9225, float %9226, float %9227, float %9228, float %9229, float %9230, float %9231, i64 %9194, i64 %9199, i1 true) #3, !dbg !365 + %9233 = or disjoint i32 %9133, 96, !dbg !365 + %9234 = add i32 %9233, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9235 = lshr exact i32 %9234, 4, !dbg !365 + %9236 = and i32 %9235, 16383, !dbg !365 + %9237 = zext nneg i32 %9236 to i64, !dbg !365 + %9238 = or disjoint i64 %9237, 4611686293372403712, !dbg !365 + %9239 = add i32 %9139, 96, !dbg !365 + %9240 = lshr exact i32 %9239, 4, !dbg !365 + %9241 = and i32 %9240, 16383, !dbg !365 + %9242 = zext nneg i32 %9241 to i64, !dbg !365 + %9243 = or disjoint i64 %9242, 4611686293338849280, !dbg !365 + %9244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 0, !dbg !365 + %9245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 1, !dbg !365 + %9246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 2, !dbg !365 + %9247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 3, !dbg !365 + %9248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 4, !dbg !365 + %9249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 5, !dbg !365 + %9250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 6, !dbg !365 + %9251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 7, !dbg !365 + %9252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 8, !dbg !365 + %9253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 9, !dbg !365 + %9254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 10, !dbg !365 + %9255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 11, !dbg !365 + %9256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 12, !dbg !365 + %9257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 13, !dbg !365 + %9258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 14, !dbg !365 + %9259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 15, !dbg !365 + %9260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 16, !dbg !365 + %9261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 17, !dbg !365 + %9262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 18, !dbg !365 + %9263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 19, !dbg !365 + %9264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 20, !dbg !365 + %9265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 21, !dbg !365 + %9266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 22, !dbg !365 + %9267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 23, !dbg !365 + %9268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 24, !dbg !365 + %9269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 25, !dbg !365 + %9270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 26, !dbg !365 + %9271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 27, !dbg !365 + %9272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 28, !dbg !365 + %9273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 29, !dbg !365 + %9274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 30, !dbg !365 + %9275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9232, 31, !dbg !365 + %9276 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9244, float %9245, float %9246, float %9247, float %9248, float %9249, float %9250, float %9251, float %9252, float %9253, float %9254, float %9255, float %9256, float %9257, float %9258, float %9259, float %9260, float %9261, float %9262, float %9263, float %9264, float %9265, float %9266, float %9267, float %9268, float %9269, float %9270, float %9271, float %9272, float %9273, float %9274, float %9275, i64 %9238, i64 %9243, i1 true) #3, !dbg !365 + %9277 = or disjoint i32 %9133, 16384, !dbg !365 + %9278 = add i32 %9277, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9279 = lshr exact i32 %9278, 4, !dbg !365 + %9280 = and i32 %9279, 16383, !dbg !365 + %9281 = zext nneg i32 %9280 to i64, !dbg !365 + %9282 = or disjoint i64 %9281, 4611686293372403712, !dbg !365 + %9283 = add i32 %9139, 8192, !dbg !365 + %9284 = lshr exact i32 %9283, 4, !dbg !365 + %9285 = and i32 %9284, 16383, !dbg !365 + %9286 = zext nneg i32 %9285 to i64, !dbg !365 + %9287 = or disjoint i64 %9286, 4611686293338849280, !dbg !365 + %9288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 0, !dbg !365 + %9289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 1, !dbg !365 + %9290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 2, !dbg !365 + %9291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 3, !dbg !365 + %9292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 4, !dbg !365 + %9293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 5, !dbg !365 + %9294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 6, !dbg !365 + %9295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 7, !dbg !365 + %9296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 8, !dbg !365 + %9297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 9, !dbg !365 + %9298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 10, !dbg !365 + %9299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 11, !dbg !365 + %9300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 12, !dbg !365 + %9301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 13, !dbg !365 + %9302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 14, !dbg !365 + %9303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 15, !dbg !365 + %9304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 16, !dbg !365 + %9305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 17, !dbg !365 + %9306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 18, !dbg !365 + %9307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 19, !dbg !365 + %9308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 20, !dbg !365 + %9309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 21, !dbg !365 + %9310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 22, !dbg !365 + %9311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 23, !dbg !365 + %9312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 24, !dbg !365 + %9313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 25, !dbg !365 + %9314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 26, !dbg !365 + %9315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 27, !dbg !365 + %9316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 28, !dbg !365 + %9317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 29, !dbg !365 + %9318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 30, !dbg !365 + %9319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9276, 31, !dbg !365 + %9320 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9288, float %9289, float %9290, float %9291, float %9292, float %9293, float %9294, float %9295, float %9296, float %9297, float %9298, float %9299, float %9300, float %9301, float %9302, float %9303, float %9304, float %9305, float %9306, float %9307, float %9308, float %9309, float %9310, float %9311, float %9312, float %9313, float %9314, float %9315, float %9316, float %9317, float %9318, float %9319, i64 %9282, i64 %9287, i1 true) #3, !dbg !365 + %9321 = or disjoint i32 %9133, 16416, !dbg !365 + %9322 = add i32 %9321, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9323 = lshr exact i32 %9322, 4, !dbg !365 + %9324 = and i32 %9323, 16383, !dbg !365 + %9325 = zext nneg i32 %9324 to i64, !dbg !365 + %9326 = or disjoint i64 %9325, 4611686293372403712, !dbg !365 + %9327 = add i32 %9139, 8224, !dbg !365 + %9328 = lshr exact i32 %9327, 4, !dbg !365 + %9329 = and i32 %9328, 16383, !dbg !365 + %9330 = zext nneg i32 %9329 to i64, !dbg !365 + %9331 = or disjoint i64 %9330, 4611686293338849280, !dbg !365 + %9332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 0, !dbg !365 + %9333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 1, !dbg !365 + %9334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 2, !dbg !365 + %9335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 3, !dbg !365 + %9336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 4, !dbg !365 + %9337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 5, !dbg !365 + %9338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 6, !dbg !365 + %9339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 7, !dbg !365 + %9340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 8, !dbg !365 + %9341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 9, !dbg !365 + %9342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 10, !dbg !365 + %9343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 11, !dbg !365 + %9344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 12, !dbg !365 + %9345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 13, !dbg !365 + %9346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 14, !dbg !365 + %9347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 15, !dbg !365 + %9348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 16, !dbg !365 + %9349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 17, !dbg !365 + %9350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 18, !dbg !365 + %9351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 19, !dbg !365 + %9352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 20, !dbg !365 + %9353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 21, !dbg !365 + %9354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 22, !dbg !365 + %9355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 23, !dbg !365 + %9356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 24, !dbg !365 + %9357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 25, !dbg !365 + %9358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 26, !dbg !365 + %9359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 27, !dbg !365 + %9360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 28, !dbg !365 + %9361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 29, !dbg !365 + %9362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 30, !dbg !365 + %9363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9320, 31, !dbg !365 + %9364 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9332, float %9333, float %9334, float %9335, float %9336, float %9337, float %9338, float %9339, float %9340, float %9341, float %9342, float %9343, float %9344, float %9345, float %9346, float %9347, float %9348, float %9349, float %9350, float %9351, float %9352, float %9353, float %9354, float %9355, float %9356, float %9357, float %9358, float %9359, float %9360, float %9361, float %9362, float %9363, i64 %9326, i64 %9331, i1 true) #3, !dbg !365 + %9365 = or disjoint i32 %9133, 16448, !dbg !365 + %9366 = add i32 %9365, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9367 = lshr exact i32 %9366, 4, !dbg !365 + %9368 = and i32 %9367, 16383, !dbg !365 + %9369 = zext nneg i32 %9368 to i64, !dbg !365 + %9370 = or disjoint i64 %9369, 4611686293372403712, !dbg !365 + %9371 = add i32 %9139, 8256, !dbg !365 + %9372 = lshr exact i32 %9371, 4, !dbg !365 + %9373 = and i32 %9372, 16383, !dbg !365 + %9374 = zext nneg i32 %9373 to i64, !dbg !365 + %9375 = or disjoint i64 %9374, 4611686293338849280, !dbg !365 + %9376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 0, !dbg !365 + %9377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 1, !dbg !365 + %9378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 2, !dbg !365 + %9379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 3, !dbg !365 + %9380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 4, !dbg !365 + %9381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 5, !dbg !365 + %9382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 6, !dbg !365 + %9383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 7, !dbg !365 + %9384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 8, !dbg !365 + %9385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 9, !dbg !365 + %9386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 10, !dbg !365 + %9387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 11, !dbg !365 + %9388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 12, !dbg !365 + %9389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 13, !dbg !365 + %9390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 14, !dbg !365 + %9391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 15, !dbg !365 + %9392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 16, !dbg !365 + %9393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 17, !dbg !365 + %9394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 18, !dbg !365 + %9395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 19, !dbg !365 + %9396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 20, !dbg !365 + %9397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 21, !dbg !365 + %9398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 22, !dbg !365 + %9399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 23, !dbg !365 + %9400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 24, !dbg !365 + %9401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 25, !dbg !365 + %9402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 26, !dbg !365 + %9403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 27, !dbg !365 + %9404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 28, !dbg !365 + %9405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 29, !dbg !365 + %9406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 30, !dbg !365 + %9407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9364, 31, !dbg !365 + %9408 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9376, float %9377, float %9378, float %9379, float %9380, float %9381, float %9382, float %9383, float %9384, float %9385, float %9386, float %9387, float %9388, float %9389, float %9390, float %9391, float %9392, float %9393, float %9394, float %9395, float %9396, float %9397, float %9398, float %9399, float %9400, float %9401, float %9402, float %9403, float %9404, float %9405, float %9406, float %9407, i64 %9370, i64 %9375, i1 true) #3, !dbg !365 + %9409 = or disjoint i32 %9133, 16480, !dbg !365 + %9410 = add i32 %9409, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !365 + %9411 = lshr exact i32 %9410, 4, !dbg !365 + %9412 = and i32 %9411, 16383, !dbg !365 + %9413 = zext nneg i32 %9412 to i64, !dbg !365 + %9414 = or disjoint i64 %9413, 4611686293372403712, !dbg !365 + %9415 = add i32 %9139, 8288, !dbg !365 + %9416 = lshr exact i32 %9415, 4, !dbg !365 + %9417 = and i32 %9416, 16383, !dbg !365 + %9418 = zext nneg i32 %9417 to i64, !dbg !365 + %9419 = or disjoint i64 %9418, 4611686293338849280, !dbg !365 + %9420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 0, !dbg !365 + %9421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 1, !dbg !365 + %9422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 2, !dbg !365 + %9423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 3, !dbg !365 + %9424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 4, !dbg !365 + %9425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 5, !dbg !365 + %9426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 6, !dbg !365 + %9427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 7, !dbg !365 + %9428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 8, !dbg !365 + %9429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 9, !dbg !365 + %9430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 10, !dbg !365 + %9431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 11, !dbg !365 + %9432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 12, !dbg !365 + %9433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 13, !dbg !365 + %9434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 14, !dbg !365 + %9435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 15, !dbg !365 + %9436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 16, !dbg !365 + %9437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 17, !dbg !365 + %9438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 18, !dbg !365 + %9439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 19, !dbg !365 + %9440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 20, !dbg !365 + %9441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 21, !dbg !365 + %9442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 22, !dbg !365 + %9443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 23, !dbg !365 + %9444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 24, !dbg !365 + %9445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 25, !dbg !365 + %9446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 26, !dbg !365 + %9447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 27, !dbg !365 + %9448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 28, !dbg !365 + %9449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 29, !dbg !365 + %9450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 30, !dbg !365 + %9451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9408, 31, !dbg !365 + %9452 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9420, float %9421, float %9422, float %9423, float %9424, float %9425, float %9426, float %9427, float %9428, float %9429, float %9430, float %9431, float %9432, float %9433, float %9434, float %9435, float %9436, float %9437, float %9438, float %9439, float %9440, float %9441, float %9442, float %9443, float %9444, float %9445, float %9446, float %9447, float %9448, float %9449, float %9450, float %9451, i64 %9414, i64 %9419, i1 true) #3, !dbg !365 + %9453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 0, !dbg !365 + %9454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 1, !dbg !365 + %9455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 2, !dbg !365 + %9456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 3, !dbg !365 + %9457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 4, !dbg !365 + %9458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 5, !dbg !365 + %9459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 6, !dbg !365 + %9460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 7, !dbg !365 + %9461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 8, !dbg !365 + %9462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 9, !dbg !365 + %9463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 10, !dbg !365 + %9464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 11, !dbg !365 + %9465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 12, !dbg !365 + %9466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 13, !dbg !365 + %9467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 14, !dbg !365 + %9468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 15, !dbg !365 + %9469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 16, !dbg !365 + %9470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 17, !dbg !365 + %9471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 18, !dbg !365 + %9472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 19, !dbg !365 + %9473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 20, !dbg !365 + %9474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 21, !dbg !365 + %9475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 22, !dbg !365 + %9476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 23, !dbg !365 + %9477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 24, !dbg !365 + %9478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 25, !dbg !365 + %9479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 26, !dbg !365 + %9480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 27, !dbg !365 + %9481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 28, !dbg !365 + %9482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 29, !dbg !365 + %9483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 30, !dbg !365 + %9484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9452, 31, !dbg !365 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !365 + %9485 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %9453, float %9454, float %9455, float %9456, float %9457, float %9458, float %9459, float %9460, float %9461, float %9462, float %9463, float %9464, float %9465, float %9466, float %9467, float %9468, float %9469, float %9470, float %9471, float %9472, float %9473, float %9474, float %9475, float %9476, float %9477, float %9478, float %9479, float %9480, float %9481, float %9482, float %9483, float %9484, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %9064, i32 0, i32 0) #3, !dbg !365 + %9486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 0, !dbg !365 + %9487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 1, !dbg !365 + %9488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 2, !dbg !365 + %9489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 3, !dbg !365 + %9490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 4, !dbg !365 + %9491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 5, !dbg !365 + %9492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 6, !dbg !365 + %9493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 7, !dbg !365 + %9494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 8, !dbg !365 + %9495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 9, !dbg !365 + %9496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 10, !dbg !365 + %9497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 11, !dbg !365 + %9498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 12, !dbg !365 + %9499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 13, !dbg !365 + %9500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 14, !dbg !365 + %9501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 15, !dbg !365 + %9502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 16, !dbg !365 + %9503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 17, !dbg !365 + %9504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 18, !dbg !365 + %9505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 19, !dbg !365 + %9506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 20, !dbg !365 + %9507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 21, !dbg !365 + %9508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 22, !dbg !365 + %9509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 23, !dbg !365 + %9510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 24, !dbg !365 + %9511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 25, !dbg !365 + %9512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 26, !dbg !365 + %9513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 27, !dbg !365 + %9514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 28, !dbg !365 + %9515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 29, !dbg !365 + %9516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 30, !dbg !365 + %9517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9485, 31, !dbg !365 + %9518 = fmul float %9486, 0x3FB6A09E60000000, !dbg !366 + %9519 = fmul float %9487, 0x3FB6A09E60000000, !dbg !366 + %9520 = fmul float %9488, 0x3FB6A09E60000000, !dbg !366 + %9521 = fmul float %9489, 0x3FB6A09E60000000, !dbg !366 + %9522 = fmul float %9490, 0x3FB6A09E60000000, !dbg !366 + %9523 = fmul float %9491, 0x3FB6A09E60000000, !dbg !366 + %9524 = fmul float %9492, 0x3FB6A09E60000000, !dbg !366 + %9525 = fmul float %9493, 0x3FB6A09E60000000, !dbg !366 + %9526 = fmul float %9494, 0x3FB6A09E60000000, !dbg !366 + %9527 = fmul float %9495, 0x3FB6A09E60000000, !dbg !366 + %9528 = fmul float %9496, 0x3FB6A09E60000000, !dbg !366 + %9529 = fmul float %9497, 0x3FB6A09E60000000, !dbg !366 + %9530 = fmul float %9498, 0x3FB6A09E60000000, !dbg !366 + %9531 = fmul float %9499, 0x3FB6A09E60000000, !dbg !366 + %9532 = fmul float %9500, 0x3FB6A09E60000000, !dbg !366 + %9533 = fmul float %9501, 0x3FB6A09E60000000, !dbg !366 + %9534 = fmul float %9502, 0x3FB6A09E60000000, !dbg !366 + %9535 = fmul float %9503, 0x3FB6A09E60000000, !dbg !366 + %9536 = fmul float %9504, 0x3FB6A09E60000000, !dbg !366 + %9537 = fmul float %9505, 0x3FB6A09E60000000, !dbg !366 + %9538 = fmul float %9506, 0x3FB6A09E60000000, !dbg !366 + %9539 = fmul float %9507, 0x3FB6A09E60000000, !dbg !366 + %9540 = fmul float %9508, 0x3FB6A09E60000000, !dbg !366 + %9541 = fmul float %9509, 0x3FB6A09E60000000, !dbg !366 + %9542 = fmul float %9510, 0x3FB6A09E60000000, !dbg !366 + %9543 = fmul float %9511, 0x3FB6A09E60000000, !dbg !366 + %9544 = fmul float %9512, 0x3FB6A09E60000000, !dbg !366 + %9545 = fmul float %9513, 0x3FB6A09E60000000, !dbg !366 + %9546 = fmul float %9514, 0x3FB6A09E60000000, !dbg !366 + %9547 = fmul float %9515, 0x3FB6A09E60000000, !dbg !366 + %9548 = fmul float %9516, 0x3FB6A09E60000000, !dbg !366 + %9549 = fmul float %9517, 0x3FB6A09E60000000, !dbg !366 + %9550 = fmul float %9518, 0x3FF7154760000000, !dbg !367 + %9551 = select i1 %9047, float %9550, float 0xFFF0000000000000, !dbg !368 + %9552 = fmul float %9519, 0x3FF7154760000000, !dbg !367 + %9553 = select i1 %9048, float %9552, float 0xFFF0000000000000, !dbg !368 + %9554 = fmul float %9520, 0x3FF7154760000000, !dbg !367 + %9555 = select i1 %9047, float %9554, float 0xFFF0000000000000, !dbg !368 + %9556 = fmul float %9521, 0x3FF7154760000000, !dbg !367 + %9557 = select i1 %9048, float %9556, float 0xFFF0000000000000, !dbg !368 + %9558 = fmul float %9522, 0x3FF7154760000000, !dbg !367 + %9559 = select i1 %9049, float %9558, float 0xFFF0000000000000, !dbg !368 + %9560 = fmul float %9523, 0x3FF7154760000000, !dbg !367 + %9561 = select i1 %9050, float %9560, float 0xFFF0000000000000, !dbg !368 + %9562 = fmul float %9524, 0x3FF7154760000000, !dbg !367 + %9563 = select i1 %9049, float %9562, float 0xFFF0000000000000, !dbg !368 + %9564 = fmul float %9525, 0x3FF7154760000000, !dbg !367 + %9565 = select i1 %9050, float %9564, float 0xFFF0000000000000, !dbg !368 + %9566 = fmul float %9526, 0x3FF7154760000000, !dbg !367 + %9567 = select i1 %9051, float %9566, float 0xFFF0000000000000, !dbg !368 + %9568 = fmul float %9527, 0x3FF7154760000000, !dbg !367 + %9569 = select i1 %9052, float %9568, float 0xFFF0000000000000, !dbg !368 + %9570 = fmul float %9528, 0x3FF7154760000000, !dbg !367 + %9571 = select i1 %9051, float %9570, float 0xFFF0000000000000, !dbg !368 + %9572 = fmul float %9529, 0x3FF7154760000000, !dbg !367 + %9573 = select i1 %9052, float %9572, float 0xFFF0000000000000, !dbg !368 + %9574 = fmul float %9530, 0x3FF7154760000000, !dbg !367 + %9575 = select i1 %9053, float %9574, float 0xFFF0000000000000, !dbg !368 + %9576 = fmul float %9531, 0x3FF7154760000000, !dbg !367 + %9577 = select i1 %9054, float %9576, float 0xFFF0000000000000, !dbg !368 + %9578 = fmul float %9532, 0x3FF7154760000000, !dbg !367 + %9579 = select i1 %9053, float %9578, float 0xFFF0000000000000, !dbg !368 + %9580 = fmul float %9533, 0x3FF7154760000000, !dbg !367 + %9581 = select i1 %9054, float %9580, float 0xFFF0000000000000, !dbg !368 + %9582 = fmul float %9534, 0x3FF7154760000000, !dbg !367 + %9583 = select i1 %9055, float %9582, float 0xFFF0000000000000, !dbg !368 + %9584 = fmul float %9535, 0x3FF7154760000000, !dbg !367 + %9585 = select i1 %9056, float %9584, float 0xFFF0000000000000, !dbg !368 + %9586 = fmul float %9536, 0x3FF7154760000000, !dbg !367 + %9587 = select i1 %9055, float %9586, float 0xFFF0000000000000, !dbg !368 + %9588 = fmul float %9537, 0x3FF7154760000000, !dbg !367 + %9589 = select i1 %9056, float %9588, float 0xFFF0000000000000, !dbg !368 + %9590 = fmul float %9538, 0x3FF7154760000000, !dbg !367 + %9591 = select i1 %9057, float %9590, float 0xFFF0000000000000, !dbg !368 + %9592 = fmul float %9539, 0x3FF7154760000000, !dbg !367 + %9593 = select i1 %9058, float %9592, float 0xFFF0000000000000, !dbg !368 + %9594 = fmul float %9540, 0x3FF7154760000000, !dbg !367 + %9595 = select i1 %9057, float %9594, float 0xFFF0000000000000, !dbg !368 + %9596 = fmul float %9541, 0x3FF7154760000000, !dbg !367 + %9597 = select i1 %9058, float %9596, float 0xFFF0000000000000, !dbg !368 + %9598 = fmul float %9542, 0x3FF7154760000000, !dbg !367 + %9599 = select i1 %9059, float %9598, float 0xFFF0000000000000, !dbg !368 + %9600 = fmul float %9543, 0x3FF7154760000000, !dbg !367 + %9601 = select i1 %9060, float %9600, float 0xFFF0000000000000, !dbg !368 + %9602 = fmul float %9544, 0x3FF7154760000000, !dbg !367 + %9603 = select i1 %9059, float %9602, float 0xFFF0000000000000, !dbg !368 + %9604 = fmul float %9545, 0x3FF7154760000000, !dbg !367 + %9605 = select i1 %9060, float %9604, float 0xFFF0000000000000, !dbg !368 + %9606 = fmul float %9546, 0x3FF7154760000000, !dbg !367 + %9607 = select i1 %9061, float %9606, float 0xFFF0000000000000, !dbg !368 + %9608 = fmul float %9547, 0x3FF7154760000000, !dbg !367 + %9609 = select i1 %9062, float %9608, float 0xFFF0000000000000, !dbg !368 + %9610 = fmul float %9548, 0x3FF7154760000000, !dbg !367 + %9611 = select i1 %9061, float %9610, float 0xFFF0000000000000, !dbg !368 + %9612 = fmul float %9549, 0x3FF7154760000000, !dbg !367 + %9613 = select i1 %9062, float %9612, float 0xFFF0000000000000, !dbg !368 + %9614 = fsub float %9551, %9115, !dbg !369 + %9615 = fsub float %9553, %9116, !dbg !369 + %9616 = fsub float %9555, %9115, !dbg !369 + %9617 = fsub float %9557, %9116, !dbg !369 + %9618 = fsub float %9559, %9117, !dbg !369 + %9619 = fsub float %9561, %9118, !dbg !369 + %9620 = fsub float %9563, %9117, !dbg !369 + %9621 = fsub float %9565, %9118, !dbg !369 + %9622 = fsub float %9567, %9119, !dbg !369 + %9623 = fsub float %9569, %9120, !dbg !369 + %9624 = fsub float %9571, %9119, !dbg !369 + %9625 = fsub float %9573, %9120, !dbg !369 + %9626 = fsub float %9575, %9121, !dbg !369 + %9627 = fsub float %9577, %9122, !dbg !369 + %9628 = fsub float %9579, %9121, !dbg !369 + %9629 = fsub float %9581, %9122, !dbg !369 + %9630 = fsub float %9583, %9123, !dbg !369 + %9631 = fsub float %9585, %9124, !dbg !369 + %9632 = fsub float %9587, %9123, !dbg !369 + %9633 = fsub float %9589, %9124, !dbg !369 + %9634 = fsub float %9591, %9125, !dbg !369 + %9635 = fsub float %9593, %9126, !dbg !369 + %9636 = fsub float %9595, %9125, !dbg !369 + %9637 = fsub float %9597, %9126, !dbg !369 + %9638 = fsub float %9599, %9127, !dbg !369 + %9639 = fsub float %9601, %9128, !dbg !369 + %9640 = fsub float %9603, %9127, !dbg !369 + %9641 = fsub float %9605, %9128, !dbg !369 + %9642 = fsub float %9607, %9129, !dbg !369 + %9643 = fsub float %9609, %9130, !dbg !369 + %9644 = fsub float %9611, %9129, !dbg !369 + %9645 = fsub float %9613, %9130, !dbg !369 + %9646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i = icmp eq i32 %9646, 0, !dbg !370 + br i1 %.not.i, label %9649, label %9647, !dbg !370 + +9647: ; preds = %.lr.ph1793 + %9648 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9614) #3, !dbg !370 + br label %__nv_exp2f.exit, !dbg !370 + +9649: ; preds = %.lr.ph1793 + %9650 = tail call float @llvm.nvvm.ex2.approx.f(float %9614) #3, !dbg !370 + br label %__nv_exp2f.exit, !dbg !370 + +__nv_exp2f.exit: ; preds = %9647, %9649 + %.0.i = phi float [ %9648, %9647 ], [ %9650, %9649 ], !dbg !370 + %9651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1147 = icmp eq i32 %9651, 0, !dbg !370 + br i1 %.not.i1147, label %9654, label %9652, !dbg !370 + +9652: ; preds = %__nv_exp2f.exit + %9653 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9615) #3, !dbg !370 + br label %__nv_exp2f.exit1149, !dbg !370 + +9654: ; preds = %__nv_exp2f.exit + %9655 = tail call float @llvm.nvvm.ex2.approx.f(float %9615) #3, !dbg !370 + br label %__nv_exp2f.exit1149, !dbg !370 + +__nv_exp2f.exit1149: ; preds = %9652, %9654 + %.0.i1148 = phi float [ %9653, %9652 ], [ %9655, %9654 ], !dbg !370 + %9656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1150 = icmp eq i32 %9656, 0, !dbg !370 + br i1 %.not.i1150, label %9659, label %9657, !dbg !370 + +9657: ; preds = %__nv_exp2f.exit1149 + %9658 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9616) #3, !dbg !370 + br label %__nv_exp2f.exit1152, !dbg !370 + +9659: ; preds = %__nv_exp2f.exit1149 + %9660 = tail call float @llvm.nvvm.ex2.approx.f(float %9616) #3, !dbg !370 + br label %__nv_exp2f.exit1152, !dbg !370 + +__nv_exp2f.exit1152: ; preds = %9657, %9659 + %.0.i1151 = phi float [ %9658, %9657 ], [ %9660, %9659 ], !dbg !370 + %9661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1153 = icmp eq i32 %9661, 0, !dbg !370 + br i1 %.not.i1153, label %9664, label %9662, !dbg !370 + +9662: ; preds = %__nv_exp2f.exit1152 + %9663 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9617) #3, !dbg !370 + br label %__nv_exp2f.exit1155, !dbg !370 + +9664: ; preds = %__nv_exp2f.exit1152 + %9665 = tail call float @llvm.nvvm.ex2.approx.f(float %9617) #3, !dbg !370 + br label %__nv_exp2f.exit1155, !dbg !370 + +__nv_exp2f.exit1155: ; preds = %9662, %9664 + %.0.i1154 = phi float [ %9663, %9662 ], [ %9665, %9664 ], !dbg !370 + %9666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1156 = icmp eq i32 %9666, 0, !dbg !370 + br i1 %.not.i1156, label %9669, label %9667, !dbg !370 + +9667: ; preds = %__nv_exp2f.exit1155 + %9668 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9618) #3, !dbg !370 + br label %__nv_exp2f.exit1158, !dbg !370 + +9669: ; preds = %__nv_exp2f.exit1155 + %9670 = tail call float @llvm.nvvm.ex2.approx.f(float %9618) #3, !dbg !370 + br label %__nv_exp2f.exit1158, !dbg !370 + +__nv_exp2f.exit1158: ; preds = %9667, %9669 + %.0.i1157 = phi float [ %9668, %9667 ], [ %9670, %9669 ], !dbg !370 + %9671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1159 = icmp eq i32 %9671, 0, !dbg !370 + br i1 %.not.i1159, label %9674, label %9672, !dbg !370 + +9672: ; preds = %__nv_exp2f.exit1158 + %9673 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9619) #3, !dbg !370 + br label %__nv_exp2f.exit1161, !dbg !370 + +9674: ; preds = %__nv_exp2f.exit1158 + %9675 = tail call float @llvm.nvvm.ex2.approx.f(float %9619) #3, !dbg !370 + br label %__nv_exp2f.exit1161, !dbg !370 + +__nv_exp2f.exit1161: ; preds = %9672, %9674 + %.0.i1160 = phi float [ %9673, %9672 ], [ %9675, %9674 ], !dbg !370 + %9676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1162 = icmp eq i32 %9676, 0, !dbg !370 + br i1 %.not.i1162, label %9679, label %9677, !dbg !370 + +9677: ; preds = %__nv_exp2f.exit1161 + %9678 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9620) #3, !dbg !370 + br label %__nv_exp2f.exit1164, !dbg !370 + +9679: ; preds = %__nv_exp2f.exit1161 + %9680 = tail call float @llvm.nvvm.ex2.approx.f(float %9620) #3, !dbg !370 + br label %__nv_exp2f.exit1164, !dbg !370 + +__nv_exp2f.exit1164: ; preds = %9677, %9679 + %.0.i1163 = phi float [ %9678, %9677 ], [ %9680, %9679 ], !dbg !370 + %9681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1165 = icmp eq i32 %9681, 0, !dbg !370 + br i1 %.not.i1165, label %9684, label %9682, !dbg !370 + +9682: ; preds = %__nv_exp2f.exit1164 + %9683 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9621) #3, !dbg !370 + br label %__nv_exp2f.exit1167, !dbg !370 + +9684: ; preds = %__nv_exp2f.exit1164 + %9685 = tail call float @llvm.nvvm.ex2.approx.f(float %9621) #3, !dbg !370 + br label %__nv_exp2f.exit1167, !dbg !370 + +__nv_exp2f.exit1167: ; preds = %9682, %9684 + %.0.i1166 = phi float [ %9683, %9682 ], [ %9685, %9684 ], !dbg !370 + %9686 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1168 = icmp eq i32 %9686, 0, !dbg !370 + br i1 %.not.i1168, label %9689, label %9687, !dbg !370 + +9687: ; preds = %__nv_exp2f.exit1167 + %9688 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9622) #3, !dbg !370 + br label %__nv_exp2f.exit1170, !dbg !370 + +9689: ; preds = %__nv_exp2f.exit1167 + %9690 = tail call float @llvm.nvvm.ex2.approx.f(float %9622) #3, !dbg !370 + br label %__nv_exp2f.exit1170, !dbg !370 + +__nv_exp2f.exit1170: ; preds = %9687, %9689 + %.0.i1169 = phi float [ %9688, %9687 ], [ %9690, %9689 ], !dbg !370 + %9691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1171 = icmp eq i32 %9691, 0, !dbg !370 + br i1 %.not.i1171, label %9694, label %9692, !dbg !370 + +9692: ; preds = %__nv_exp2f.exit1170 + %9693 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9623) #3, !dbg !370 + br label %__nv_exp2f.exit1173, !dbg !370 + +9694: ; preds = %__nv_exp2f.exit1170 + %9695 = tail call float @llvm.nvvm.ex2.approx.f(float %9623) #3, !dbg !370 + br label %__nv_exp2f.exit1173, !dbg !370 + +__nv_exp2f.exit1173: ; preds = %9692, %9694 + %.0.i1172 = phi float [ %9693, %9692 ], [ %9695, %9694 ], !dbg !370 + %9696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1174 = icmp eq i32 %9696, 0, !dbg !370 + br i1 %.not.i1174, label %9699, label %9697, !dbg !370 + +9697: ; preds = %__nv_exp2f.exit1173 + %9698 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9624) #3, !dbg !370 + br label %__nv_exp2f.exit1176, !dbg !370 + +9699: ; preds = %__nv_exp2f.exit1173 + %9700 = tail call float @llvm.nvvm.ex2.approx.f(float %9624) #3, !dbg !370 + br label %__nv_exp2f.exit1176, !dbg !370 + +__nv_exp2f.exit1176: ; preds = %9697, %9699 + %.0.i1175 = phi float [ %9698, %9697 ], [ %9700, %9699 ], !dbg !370 + %9701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1177 = icmp eq i32 %9701, 0, !dbg !370 + br i1 %.not.i1177, label %9704, label %9702, !dbg !370 + +9702: ; preds = %__nv_exp2f.exit1176 + %9703 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9625) #3, !dbg !370 + br label %__nv_exp2f.exit1179, !dbg !370 + +9704: ; preds = %__nv_exp2f.exit1176 + %9705 = tail call float @llvm.nvvm.ex2.approx.f(float %9625) #3, !dbg !370 + br label %__nv_exp2f.exit1179, !dbg !370 + +__nv_exp2f.exit1179: ; preds = %9702, %9704 + %.0.i1178 = phi float [ %9703, %9702 ], [ %9705, %9704 ], !dbg !370 + %9706 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1180 = icmp eq i32 %9706, 0, !dbg !370 + br i1 %.not.i1180, label %9709, label %9707, !dbg !370 + +9707: ; preds = %__nv_exp2f.exit1179 + %9708 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9626) #3, !dbg !370 + br label %__nv_exp2f.exit1182, !dbg !370 + +9709: ; preds = %__nv_exp2f.exit1179 + %9710 = tail call float @llvm.nvvm.ex2.approx.f(float %9626) #3, !dbg !370 + br label %__nv_exp2f.exit1182, !dbg !370 + +__nv_exp2f.exit1182: ; preds = %9707, %9709 + %.0.i1181 = phi float [ %9708, %9707 ], [ %9710, %9709 ], !dbg !370 + %9711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1183 = icmp eq i32 %9711, 0, !dbg !370 + br i1 %.not.i1183, label %9714, label %9712, !dbg !370 + +9712: ; preds = %__nv_exp2f.exit1182 + %9713 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9627) #3, !dbg !370 + br label %__nv_exp2f.exit1185, !dbg !370 + +9714: ; preds = %__nv_exp2f.exit1182 + %9715 = tail call float @llvm.nvvm.ex2.approx.f(float %9627) #3, !dbg !370 + br label %__nv_exp2f.exit1185, !dbg !370 + +__nv_exp2f.exit1185: ; preds = %9712, %9714 + %.0.i1184 = phi float [ %9713, %9712 ], [ %9715, %9714 ], !dbg !370 + %9716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1186 = icmp eq i32 %9716, 0, !dbg !370 + br i1 %.not.i1186, label %9719, label %9717, !dbg !370 + +9717: ; preds = %__nv_exp2f.exit1185 + %9718 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9628) #3, !dbg !370 + br label %__nv_exp2f.exit1188, !dbg !370 + +9719: ; preds = %__nv_exp2f.exit1185 + %9720 = tail call float @llvm.nvvm.ex2.approx.f(float %9628) #3, !dbg !370 + br label %__nv_exp2f.exit1188, !dbg !370 + +__nv_exp2f.exit1188: ; preds = %9717, %9719 + %.0.i1187 = phi float [ %9718, %9717 ], [ %9720, %9719 ], !dbg !370 + %9721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1189 = icmp eq i32 %9721, 0, !dbg !370 + br i1 %.not.i1189, label %9724, label %9722, !dbg !370 + +9722: ; preds = %__nv_exp2f.exit1188 + %9723 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9629) #3, !dbg !370 + br label %__nv_exp2f.exit1191, !dbg !370 + +9724: ; preds = %__nv_exp2f.exit1188 + %9725 = tail call float @llvm.nvvm.ex2.approx.f(float %9629) #3, !dbg !370 + br label %__nv_exp2f.exit1191, !dbg !370 + +__nv_exp2f.exit1191: ; preds = %9722, %9724 + %.0.i1190 = phi float [ %9723, %9722 ], [ %9725, %9724 ], !dbg !370 + %9726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1192 = icmp eq i32 %9726, 0, !dbg !370 + br i1 %.not.i1192, label %9729, label %9727, !dbg !370 + +9727: ; preds = %__nv_exp2f.exit1191 + %9728 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9630) #3, !dbg !370 + br label %__nv_exp2f.exit1194, !dbg !370 + +9729: ; preds = %__nv_exp2f.exit1191 + %9730 = tail call float @llvm.nvvm.ex2.approx.f(float %9630) #3, !dbg !370 + br label %__nv_exp2f.exit1194, !dbg !370 + +__nv_exp2f.exit1194: ; preds = %9727, %9729 + %.0.i1193 = phi float [ %9728, %9727 ], [ %9730, %9729 ], !dbg !370 + %9731 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1195 = icmp eq i32 %9731, 0, !dbg !370 + br i1 %.not.i1195, label %9734, label %9732, !dbg !370 + +9732: ; preds = %__nv_exp2f.exit1194 + %9733 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9631) #3, !dbg !370 + br label %__nv_exp2f.exit1197, !dbg !370 + +9734: ; preds = %__nv_exp2f.exit1194 + %9735 = tail call float @llvm.nvvm.ex2.approx.f(float %9631) #3, !dbg !370 + br label %__nv_exp2f.exit1197, !dbg !370 + +__nv_exp2f.exit1197: ; preds = %9732, %9734 + %.0.i1196 = phi float [ %9733, %9732 ], [ %9735, %9734 ], !dbg !370 + %9736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1198 = icmp eq i32 %9736, 0, !dbg !370 + br i1 %.not.i1198, label %9739, label %9737, !dbg !370 + +9737: ; preds = %__nv_exp2f.exit1197 + %9738 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9632) #3, !dbg !370 + br label %__nv_exp2f.exit1200, !dbg !370 + +9739: ; preds = %__nv_exp2f.exit1197 + %9740 = tail call float @llvm.nvvm.ex2.approx.f(float %9632) #3, !dbg !370 + br label %__nv_exp2f.exit1200, !dbg !370 + +__nv_exp2f.exit1200: ; preds = %9737, %9739 + %.0.i1199 = phi float [ %9738, %9737 ], [ %9740, %9739 ], !dbg !370 + %9741 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1201 = icmp eq i32 %9741, 0, !dbg !370 + br i1 %.not.i1201, label %9744, label %9742, !dbg !370 + +9742: ; preds = %__nv_exp2f.exit1200 + %9743 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9633) #3, !dbg !370 + br label %__nv_exp2f.exit1203, !dbg !370 + +9744: ; preds = %__nv_exp2f.exit1200 + %9745 = tail call float @llvm.nvvm.ex2.approx.f(float %9633) #3, !dbg !370 + br label %__nv_exp2f.exit1203, !dbg !370 + +__nv_exp2f.exit1203: ; preds = %9742, %9744 + %.0.i1202 = phi float [ %9743, %9742 ], [ %9745, %9744 ], !dbg !370 + %9746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1204 = icmp eq i32 %9746, 0, !dbg !370 + br i1 %.not.i1204, label %9749, label %9747, !dbg !370 + +9747: ; preds = %__nv_exp2f.exit1203 + %9748 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9634) #3, !dbg !370 + br label %__nv_exp2f.exit1206, !dbg !370 + +9749: ; preds = %__nv_exp2f.exit1203 + %9750 = tail call float @llvm.nvvm.ex2.approx.f(float %9634) #3, !dbg !370 + br label %__nv_exp2f.exit1206, !dbg !370 + +__nv_exp2f.exit1206: ; preds = %9747, %9749 + %.0.i1205 = phi float [ %9748, %9747 ], [ %9750, %9749 ], !dbg !370 + %9751 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1207 = icmp eq i32 %9751, 0, !dbg !370 + br i1 %.not.i1207, label %9754, label %9752, !dbg !370 + +9752: ; preds = %__nv_exp2f.exit1206 + %9753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9635) #3, !dbg !370 + br label %__nv_exp2f.exit1209, !dbg !370 + +9754: ; preds = %__nv_exp2f.exit1206 + %9755 = tail call float @llvm.nvvm.ex2.approx.f(float %9635) #3, !dbg !370 + br label %__nv_exp2f.exit1209, !dbg !370 + +__nv_exp2f.exit1209: ; preds = %9752, %9754 + %.0.i1208 = phi float [ %9753, %9752 ], [ %9755, %9754 ], !dbg !370 + %9756 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1210 = icmp eq i32 %9756, 0, !dbg !370 + br i1 %.not.i1210, label %9759, label %9757, !dbg !370 + +9757: ; preds = %__nv_exp2f.exit1209 + %9758 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9636) #3, !dbg !370 + br label %__nv_exp2f.exit1212, !dbg !370 + +9759: ; preds = %__nv_exp2f.exit1209 + %9760 = tail call float @llvm.nvvm.ex2.approx.f(float %9636) #3, !dbg !370 + br label %__nv_exp2f.exit1212, !dbg !370 + +__nv_exp2f.exit1212: ; preds = %9757, %9759 + %.0.i1211 = phi float [ %9758, %9757 ], [ %9760, %9759 ], !dbg !370 + %9761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1213 = icmp eq i32 %9761, 0, !dbg !370 + br i1 %.not.i1213, label %9764, label %9762, !dbg !370 + +9762: ; preds = %__nv_exp2f.exit1212 + %9763 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9637) #3, !dbg !370 + br label %__nv_exp2f.exit1215, !dbg !370 + +9764: ; preds = %__nv_exp2f.exit1212 + %9765 = tail call float @llvm.nvvm.ex2.approx.f(float %9637) #3, !dbg !370 + br label %__nv_exp2f.exit1215, !dbg !370 + +__nv_exp2f.exit1215: ; preds = %9762, %9764 + %.0.i1214 = phi float [ %9763, %9762 ], [ %9765, %9764 ], !dbg !370 + %9766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1216 = icmp eq i32 %9766, 0, !dbg !370 + br i1 %.not.i1216, label %9769, label %9767, !dbg !370 + +9767: ; preds = %__nv_exp2f.exit1215 + %9768 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9638) #3, !dbg !370 + br label %__nv_exp2f.exit1218, !dbg !370 + +9769: ; preds = %__nv_exp2f.exit1215 + %9770 = tail call float @llvm.nvvm.ex2.approx.f(float %9638) #3, !dbg !370 + br label %__nv_exp2f.exit1218, !dbg !370 + +__nv_exp2f.exit1218: ; preds = %9767, %9769 + %.0.i1217 = phi float [ %9768, %9767 ], [ %9770, %9769 ], !dbg !370 + %9771 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1219 = icmp eq i32 %9771, 0, !dbg !370 + br i1 %.not.i1219, label %9774, label %9772, !dbg !370 + +9772: ; preds = %__nv_exp2f.exit1218 + %9773 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9639) #3, !dbg !370 + br label %__nv_exp2f.exit1221, !dbg !370 + +9774: ; preds = %__nv_exp2f.exit1218 + %9775 = tail call float @llvm.nvvm.ex2.approx.f(float %9639) #3, !dbg !370 + br label %__nv_exp2f.exit1221, !dbg !370 + +__nv_exp2f.exit1221: ; preds = %9772, %9774 + %.0.i1220 = phi float [ %9773, %9772 ], [ %9775, %9774 ], !dbg !370 + %9776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1222 = icmp eq i32 %9776, 0, !dbg !370 + br i1 %.not.i1222, label %9779, label %9777, !dbg !370 + +9777: ; preds = %__nv_exp2f.exit1221 + %9778 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9640) #3, !dbg !370 + br label %__nv_exp2f.exit1224, !dbg !370 + +9779: ; preds = %__nv_exp2f.exit1221 + %9780 = tail call float @llvm.nvvm.ex2.approx.f(float %9640) #3, !dbg !370 + br label %__nv_exp2f.exit1224, !dbg !370 + +__nv_exp2f.exit1224: ; preds = %9777, %9779 + %.0.i1223 = phi float [ %9778, %9777 ], [ %9780, %9779 ], !dbg !370 + %9781 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1225 = icmp eq i32 %9781, 0, !dbg !370 + br i1 %.not.i1225, label %9784, label %9782, !dbg !370 + +9782: ; preds = %__nv_exp2f.exit1224 + %9783 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9641) #3, !dbg !370 + br label %__nv_exp2f.exit1227, !dbg !370 + +9784: ; preds = %__nv_exp2f.exit1224 + %9785 = tail call float @llvm.nvvm.ex2.approx.f(float %9641) #3, !dbg !370 + br label %__nv_exp2f.exit1227, !dbg !370 + +__nv_exp2f.exit1227: ; preds = %9782, %9784 + %.0.i1226 = phi float [ %9783, %9782 ], [ %9785, %9784 ], !dbg !370 + %9786 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1228 = icmp eq i32 %9786, 0, !dbg !370 + br i1 %.not.i1228, label %9789, label %9787, !dbg !370 + +9787: ; preds = %__nv_exp2f.exit1227 + %9788 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9642) #3, !dbg !370 + br label %__nv_exp2f.exit1230, !dbg !370 + +9789: ; preds = %__nv_exp2f.exit1227 + %9790 = tail call float @llvm.nvvm.ex2.approx.f(float %9642) #3, !dbg !370 + br label %__nv_exp2f.exit1230, !dbg !370 + +__nv_exp2f.exit1230: ; preds = %9787, %9789 + %.0.i1229 = phi float [ %9788, %9787 ], [ %9790, %9789 ], !dbg !370 + %9791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1231 = icmp eq i32 %9791, 0, !dbg !370 + br i1 %.not.i1231, label %9794, label %9792, !dbg !370 + +9792: ; preds = %__nv_exp2f.exit1230 + %9793 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9643) #3, !dbg !370 + br label %__nv_exp2f.exit1233, !dbg !370 + +9794: ; preds = %__nv_exp2f.exit1230 + %9795 = tail call float @llvm.nvvm.ex2.approx.f(float %9643) #3, !dbg !370 + br label %__nv_exp2f.exit1233, !dbg !370 + +__nv_exp2f.exit1233: ; preds = %9792, %9794 + %.0.i1232 = phi float [ %9793, %9792 ], [ %9795, %9794 ], !dbg !370 + %9796 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1234 = icmp eq i32 %9796, 0, !dbg !370 + br i1 %.not.i1234, label %9799, label %9797, !dbg !370 + +9797: ; preds = %__nv_exp2f.exit1233 + %9798 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9644) #3, !dbg !370 + br label %__nv_exp2f.exit1236, !dbg !370 + +9799: ; preds = %__nv_exp2f.exit1233 + %9800 = tail call float @llvm.nvvm.ex2.approx.f(float %9644) #3, !dbg !370 + br label %__nv_exp2f.exit1236, !dbg !370 + +__nv_exp2f.exit1236: ; preds = %9797, %9799 + %.0.i1235 = phi float [ %9798, %9797 ], [ %9800, %9799 ], !dbg !370 + %9801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !370 + %.not.i1237 = icmp eq i32 %9801, 0, !dbg !370 + br i1 %.not.i1237, label %9804, label %9802, !dbg !370 + +9802: ; preds = %__nv_exp2f.exit1236 + %9803 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9645) #3, !dbg !370 + br label %__nv_exp2f.exit1239, !dbg !370 + +9804: ; preds = %__nv_exp2f.exit1236 + %9805 = tail call float @llvm.nvvm.ex2.approx.f(float %9645) #3, !dbg !370 + br label %__nv_exp2f.exit1239, !dbg !370 + +__nv_exp2f.exit1239: ; preds = %9802, %9804 + %.0.i1238 = phi float [ %9803, %9802 ], [ %9805, %9804 ], !dbg !370 + %9806 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %9063, !dbg !356 + %9807 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !371 + %9808 = insertelement <2 x float> %9807, float %.0.i1148, i64 1, !dbg !371 + %9809 = fptrunc <2 x float> %9808 to <2 x bfloat>, !dbg !371 + %9810 = insertelement <2 x float> poison, float %.0.i1151, i64 0, !dbg !371 + %9811 = insertelement <2 x float> %9810, float %.0.i1154, i64 1, !dbg !371 + %9812 = fptrunc <2 x float> %9811 to <2 x bfloat>, !dbg !371 + %9813 = insertelement <2 x float> poison, float %.0.i1157, i64 0, !dbg !371 + %9814 = insertelement <2 x float> %9813, float %.0.i1160, i64 1, !dbg !371 + %9815 = fptrunc <2 x float> %9814 to <2 x bfloat>, !dbg !371 + %9816 = insertelement <2 x float> poison, float %.0.i1163, i64 0, !dbg !371 + %9817 = insertelement <2 x float> %9816, float %.0.i1166, i64 1, !dbg !371 + %9818 = fptrunc <2 x float> %9817 to <2 x bfloat>, !dbg !371 + %9819 = insertelement <2 x float> poison, float %.0.i1169, i64 0, !dbg !371 + %9820 = insertelement <2 x float> %9819, float %.0.i1172, i64 1, !dbg !371 + %9821 = fptrunc <2 x float> %9820 to <2 x bfloat>, !dbg !371 + %9822 = insertelement <2 x float> poison, float %.0.i1175, i64 0, !dbg !371 + %9823 = insertelement <2 x float> %9822, float %.0.i1178, i64 1, !dbg !371 + %9824 = fptrunc <2 x float> %9823 to <2 x bfloat>, !dbg !371 + %9825 = insertelement <2 x float> poison, float %.0.i1181, i64 0, !dbg !371 + %9826 = insertelement <2 x float> %9825, float %.0.i1184, i64 1, !dbg !371 + %9827 = fptrunc <2 x float> %9826 to <2 x bfloat>, !dbg !371 + %9828 = insertelement <2 x float> poison, float %.0.i1187, i64 0, !dbg !371 + %9829 = insertelement <2 x float> %9828, float %.0.i1190, i64 1, !dbg !371 + %9830 = fptrunc <2 x float> %9829 to <2 x bfloat>, !dbg !371 + %9831 = insertelement <2 x float> poison, float %.0.i1193, i64 0, !dbg !371 + %9832 = insertelement <2 x float> %9831, float %.0.i1196, i64 1, !dbg !371 + %9833 = fptrunc <2 x float> %9832 to <2 x bfloat>, !dbg !371 + %9834 = insertelement <2 x float> poison, float %.0.i1199, i64 0, !dbg !371 + %9835 = insertelement <2 x float> %9834, float %.0.i1202, i64 1, !dbg !371 + %9836 = fptrunc <2 x float> %9835 to <2 x bfloat>, !dbg !371 + %9837 = insertelement <2 x float> poison, float %.0.i1205, i64 0, !dbg !371 + %9838 = insertelement <2 x float> %9837, float %.0.i1208, i64 1, !dbg !371 + %9839 = fptrunc <2 x float> %9838 to <2 x bfloat>, !dbg !371 + %9840 = insertelement <2 x float> poison, float %.0.i1211, i64 0, !dbg !371 + %9841 = insertelement <2 x float> %9840, float %.0.i1214, i64 1, !dbg !371 + %9842 = fptrunc <2 x float> %9841 to <2 x bfloat>, !dbg !371 + %9843 = insertelement <2 x float> poison, float %.0.i1217, i64 0, !dbg !371 + %9844 = insertelement <2 x float> %9843, float %.0.i1220, i64 1, !dbg !371 + %9845 = fptrunc <2 x float> %9844 to <2 x bfloat>, !dbg !371 + %9846 = insertelement <2 x float> poison, float %.0.i1223, i64 0, !dbg !371 + %9847 = insertelement <2 x float> %9846, float %.0.i1226, i64 1, !dbg !371 + %9848 = fptrunc <2 x float> %9847 to <2 x bfloat>, !dbg !371 + %9849 = insertelement <2 x float> poison, float %.0.i1229, i64 0, !dbg !371 + %9850 = insertelement <2 x float> %9849, float %.0.i1232, i64 1, !dbg !371 + %9851 = fptrunc <2 x float> %9850 to <2 x bfloat>, !dbg !371 + %9852 = insertelement <2 x float> poison, float %.0.i1235, i64 0, !dbg !371 + %9853 = insertelement <2 x float> %9852, float %.0.i1238, i64 1, !dbg !371 + %9854 = fptrunc <2 x float> %9853 to <2 x bfloat>, !dbg !371 + %9855 = bitcast <2 x bfloat> %9809 to i32, !dbg !372 + %9856 = bitcast <2 x bfloat> %9812 to i32, !dbg !372 + %9857 = bitcast <2 x bfloat> %9815 to i32, !dbg !372 + %9858 = bitcast <2 x bfloat> %9818 to i32, !dbg !372 + %9859 = bitcast <2 x bfloat> %9821 to i32, !dbg !372 + %9860 = bitcast <2 x bfloat> %9824 to i32, !dbg !372 + %9861 = bitcast <2 x bfloat> %9827 to i32, !dbg !372 + %9862 = bitcast <2 x bfloat> %9830 to i32, !dbg !372 + %9863 = bitcast <2 x bfloat> %9833 to i32, !dbg !372 + %9864 = bitcast <2 x bfloat> %9836 to i32, !dbg !372 + %9865 = bitcast <2 x bfloat> %9839 to i32, !dbg !372 + %9866 = bitcast <2 x bfloat> %9842 to i32, !dbg !372 + %9867 = bitcast <2 x bfloat> %9845 to i32, !dbg !372 + %9868 = bitcast <2 x bfloat> %9848 to i32, !dbg !372 + %9869 = bitcast <2 x bfloat> %9851 to i32, !dbg !372 + %9870 = bitcast <2 x bfloat> %9854 to i32, !dbg !372 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !372 + %9871 = ptrtoint ptr addrspace(3) %9806 to i32, !dbg !372 + %9872 = lshr exact i32 %9871, 4, !dbg !372 + %9873 = and i32 %9872, 16383, !dbg !372 + %9874 = zext nneg i32 %9873 to i64, !dbg !372 + %9875 = or disjoint i64 %9874, 4611686293338849280, !dbg !372 + %9876 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn4751688, float %.pn4731689, float %.pn4711690, float %.pn4691691, float %.pn4671692, float %.pn4651693, float %.pn4631694, float %.pn4611695, float %.pn4591696, float %.pn4571697, float %.pn4551698, float %.pn4531699, float %.pn4511700, float %.pn4491701, float %.pn4471702, float %.pn4451703, float %.pn4431704, float %.pn4411705, float %.pn4391706, float %.pn4371707, float %.pn4351708, float %.pn4331709, float %.pn4311710, float %.pn4291711, float %.pn4271712, float %.pn4251713, float %.pn4231714, float %.pn4211715, float %.pn4191716, float %.pn4171717, float %.pn4151718, float %.pn4131719, float %.pn4111720, float %.pn4091721, float %.pn4071722, float %.pn4051723, float %.pn4031724, float %.pn4011725, float %.pn3991726, float %.pn3971727, float %.pn3951728, float %.pn3931729, float %.pn3911730, float %.pn3891731, float %.pn3871732, float %.pn3851733, float %.pn3831734, float %.pn3811735, float %.pn3791736, float %.pn3771737, float %.pn3751738, float %.pn3731739, float %.pn3711740, float %.pn3691741, float %.pn3671742, float %.pn3651743, float %.pn3631744, float %.pn3611745, float %.pn3591746, float %.pn3571747, float %.pn3551748, float %.pn3531749, float %.pn3511750, float %.pn3491751, i32 %9855, i32 %9856, i32 %9857, i32 %9858, i64 %9875, i1 true) #3, !dbg !372 + %9877 = add i32 %9871, 2048, !dbg !372 + %9878 = lshr exact i32 %9877, 4, !dbg !372 + %9879 = and i32 %9878, 16383, !dbg !372 + %9880 = zext nneg i32 %9879 to i64, !dbg !372 + %9881 = or disjoint i64 %9880, 4611686293338849280, !dbg !372 + %9882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 0, !dbg !372 + %9883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 1, !dbg !372 + %9884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 2, !dbg !372 + %9885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 3, !dbg !372 + %9886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 4, !dbg !372 + %9887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 5, !dbg !372 + %9888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 6, !dbg !372 + %9889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 7, !dbg !372 + %9890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 8, !dbg !372 + %9891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 9, !dbg !372 + %9892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 10, !dbg !372 + %9893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 11, !dbg !372 + %9894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 12, !dbg !372 + %9895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 13, !dbg !372 + %9896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 14, !dbg !372 + %9897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 15, !dbg !372 + %9898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 16, !dbg !372 + %9899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 17, !dbg !372 + %9900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 18, !dbg !372 + %9901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 19, !dbg !372 + %9902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 20, !dbg !372 + %9903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 21, !dbg !372 + %9904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 22, !dbg !372 + %9905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 23, !dbg !372 + %9906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 24, !dbg !372 + %9907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 25, !dbg !372 + %9908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 26, !dbg !372 + %9909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 27, !dbg !372 + %9910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 28, !dbg !372 + %9911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 29, !dbg !372 + %9912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 30, !dbg !372 + %9913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 31, !dbg !372 + %9914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 32, !dbg !372 + %9915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 33, !dbg !372 + %9916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 34, !dbg !372 + %9917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 35, !dbg !372 + %9918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 36, !dbg !372 + %9919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 37, !dbg !372 + %9920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 38, !dbg !372 + %9921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 39, !dbg !372 + %9922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 40, !dbg !372 + %9923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 41, !dbg !372 + %9924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 42, !dbg !372 + %9925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 43, !dbg !372 + %9926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 44, !dbg !372 + %9927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 45, !dbg !372 + %9928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 46, !dbg !372 + %9929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 47, !dbg !372 + %9930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 48, !dbg !372 + %9931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 49, !dbg !372 + %9932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 50, !dbg !372 + %9933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 51, !dbg !372 + %9934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 52, !dbg !372 + %9935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 53, !dbg !372 + %9936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 54, !dbg !372 + %9937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 55, !dbg !372 + %9938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 56, !dbg !372 + %9939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 57, !dbg !372 + %9940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 58, !dbg !372 + %9941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 59, !dbg !372 + %9942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 60, !dbg !372 + %9943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 61, !dbg !372 + %9944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 62, !dbg !372 + %9945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9876, 63, !dbg !372 + %9946 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9882, float %9883, float %9884, float %9885, float %9886, float %9887, float %9888, float %9889, float %9890, float %9891, float %9892, float %9893, float %9894, float %9895, float %9896, float %9897, float %9898, float %9899, float %9900, float %9901, float %9902, float %9903, float %9904, float %9905, float %9906, float %9907, float %9908, float %9909, float %9910, float %9911, float %9912, float %9913, float %9914, float %9915, float %9916, float %9917, float %9918, float %9919, float %9920, float %9921, float %9922, float %9923, float %9924, float %9925, float %9926, float %9927, float %9928, float %9929, float %9930, float %9931, float %9932, float %9933, float %9934, float %9935, float %9936, float %9937, float %9938, float %9939, float %9940, float %9941, float %9942, float %9943, float %9944, float %9945, i32 %9859, i32 %9860, i32 %9861, i32 %9862, i64 %9881, i1 true) #3, !dbg !372 + %9947 = add i32 %9871, 4096, !dbg !372 + %9948 = lshr exact i32 %9947, 4, !dbg !372 + %9949 = and i32 %9948, 16383, !dbg !372 + %9950 = zext nneg i32 %9949 to i64, !dbg !372 + %9951 = or disjoint i64 %9950, 4611686293338849280, !dbg !372 + %9952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 0, !dbg !372 + %9953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 1, !dbg !372 + %9954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 2, !dbg !372 + %9955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 3, !dbg !372 + %9956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 4, !dbg !372 + %9957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 5, !dbg !372 + %9958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 6, !dbg !372 + %9959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 7, !dbg !372 + %9960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 8, !dbg !372 + %9961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 9, !dbg !372 + %9962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 10, !dbg !372 + %9963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 11, !dbg !372 + %9964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 12, !dbg !372 + %9965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 13, !dbg !372 + %9966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 14, !dbg !372 + %9967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 15, !dbg !372 + %9968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 16, !dbg !372 + %9969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 17, !dbg !372 + %9970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 18, !dbg !372 + %9971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 19, !dbg !372 + %9972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 20, !dbg !372 + %9973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 21, !dbg !372 + %9974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 22, !dbg !372 + %9975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 23, !dbg !372 + %9976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 24, !dbg !372 + %9977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 25, !dbg !372 + %9978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 26, !dbg !372 + %9979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 27, !dbg !372 + %9980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 28, !dbg !372 + %9981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 29, !dbg !372 + %9982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 30, !dbg !372 + %9983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 31, !dbg !372 + %9984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 32, !dbg !372 + %9985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 33, !dbg !372 + %9986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 34, !dbg !372 + %9987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 35, !dbg !372 + %9988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 36, !dbg !372 + %9989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 37, !dbg !372 + %9990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 38, !dbg !372 + %9991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 39, !dbg !372 + %9992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 40, !dbg !372 + %9993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 41, !dbg !372 + %9994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 42, !dbg !372 + %9995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 43, !dbg !372 + %9996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 44, !dbg !372 + %9997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 45, !dbg !372 + %9998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 46, !dbg !372 + %9999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 47, !dbg !372 + %10000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 48, !dbg !372 + %10001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 49, !dbg !372 + %10002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 50, !dbg !372 + %10003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 51, !dbg !372 + %10004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 52, !dbg !372 + %10005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 53, !dbg !372 + %10006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 54, !dbg !372 + %10007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 55, !dbg !372 + %10008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 56, !dbg !372 + %10009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 57, !dbg !372 + %10010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 58, !dbg !372 + %10011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 59, !dbg !372 + %10012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 60, !dbg !372 + %10013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 61, !dbg !372 + %10014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 62, !dbg !372 + %10015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9946, 63, !dbg !372 + %10016 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9952, float %9953, float %9954, float %9955, float %9956, float %9957, float %9958, float %9959, float %9960, float %9961, float %9962, float %9963, float %9964, float %9965, float %9966, float %9967, float %9968, float %9969, float %9970, float %9971, float %9972, float %9973, float %9974, float %9975, float %9976, float %9977, float %9978, float %9979, float %9980, float %9981, float %9982, float %9983, float %9984, float %9985, float %9986, float %9987, float %9988, float %9989, float %9990, float %9991, float %9992, float %9993, float %9994, float %9995, float %9996, float %9997, float %9998, float %9999, float %10000, float %10001, float %10002, float %10003, float %10004, float %10005, float %10006, float %10007, float %10008, float %10009, float %10010, float %10011, float %10012, float %10013, float %10014, float %10015, i32 %9863, i32 %9864, i32 %9865, i32 %9866, i64 %9951, i1 true) #3, !dbg !372 + %10017 = add i32 %9871, 6144, !dbg !372 + %10018 = lshr exact i32 %10017, 4, !dbg !372 + %10019 = and i32 %10018, 16383, !dbg !372 + %10020 = zext nneg i32 %10019 to i64, !dbg !372 + %10021 = or disjoint i64 %10020, 4611686293338849280, !dbg !372 + %10022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 0, !dbg !372 + %10023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 1, !dbg !372 + %10024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 2, !dbg !372 + %10025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 3, !dbg !372 + %10026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 4, !dbg !372 + %10027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 5, !dbg !372 + %10028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 6, !dbg !372 + %10029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 7, !dbg !372 + %10030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 8, !dbg !372 + %10031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 9, !dbg !372 + %10032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 10, !dbg !372 + %10033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 11, !dbg !372 + %10034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 12, !dbg !372 + %10035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 13, !dbg !372 + %10036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 14, !dbg !372 + %10037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 15, !dbg !372 + %10038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 16, !dbg !372 + %10039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 17, !dbg !372 + %10040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 18, !dbg !372 + %10041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 19, !dbg !372 + %10042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 20, !dbg !372 + %10043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 21, !dbg !372 + %10044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 22, !dbg !372 + %10045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 23, !dbg !372 + %10046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 24, !dbg !372 + %10047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 25, !dbg !372 + %10048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 26, !dbg !372 + %10049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 27, !dbg !372 + %10050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 28, !dbg !372 + %10051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 29, !dbg !372 + %10052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 30, !dbg !372 + %10053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 31, !dbg !372 + %10054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 32, !dbg !372 + %10055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 33, !dbg !372 + %10056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 34, !dbg !372 + %10057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 35, !dbg !372 + %10058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 36, !dbg !372 + %10059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 37, !dbg !372 + %10060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 38, !dbg !372 + %10061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 39, !dbg !372 + %10062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 40, !dbg !372 + %10063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 41, !dbg !372 + %10064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 42, !dbg !372 + %10065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 43, !dbg !372 + %10066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 44, !dbg !372 + %10067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 45, !dbg !372 + %10068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 46, !dbg !372 + %10069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 47, !dbg !372 + %10070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 48, !dbg !372 + %10071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 49, !dbg !372 + %10072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 50, !dbg !372 + %10073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 51, !dbg !372 + %10074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 52, !dbg !372 + %10075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 53, !dbg !372 + %10076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 54, !dbg !372 + %10077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 55, !dbg !372 + %10078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 56, !dbg !372 + %10079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 57, !dbg !372 + %10080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 58, !dbg !372 + %10081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 59, !dbg !372 + %10082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 60, !dbg !372 + %10083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 61, !dbg !372 + %10084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 62, !dbg !372 + %10085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10016, 63, !dbg !372 + %10086 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10022, float %10023, float %10024, float %10025, float %10026, float %10027, float %10028, float %10029, float %10030, float %10031, float %10032, float %10033, float %10034, float %10035, float %10036, float %10037, float %10038, float %10039, float %10040, float %10041, float %10042, float %10043, float %10044, float %10045, float %10046, float %10047, float %10048, float %10049, float %10050, float %10051, float %10052, float %10053, float %10054, float %10055, float %10056, float %10057, float %10058, float %10059, float %10060, float %10061, float %10062, float %10063, float %10064, float %10065, float %10066, float %10067, float %10068, float %10069, float %10070, float %10071, float %10072, float %10073, float %10074, float %10075, float %10076, float %10077, float %10078, float %10079, float %10080, float %10081, float %10082, float %10083, float %10084, float %10085, i32 %9867, i32 %9868, i32 %9869, i32 %9870, i64 %10021, i1 true) #3, !dbg !372 + %10087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 0, !dbg !372 + %10088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 1, !dbg !372 + %10089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 2, !dbg !372 + %10090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 3, !dbg !372 + %10091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 4, !dbg !372 + %10092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 5, !dbg !372 + %10093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 6, !dbg !372 + %10094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 7, !dbg !372 + %10095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 8, !dbg !372 + %10096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 9, !dbg !372 + %10097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 10, !dbg !372 + %10098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 11, !dbg !372 + %10099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 12, !dbg !372 + %10100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 13, !dbg !372 + %10101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 14, !dbg !372 + %10102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 15, !dbg !372 + %10103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 16, !dbg !372 + %10104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 17, !dbg !372 + %10105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 18, !dbg !372 + %10106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 19, !dbg !372 + %10107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 20, !dbg !372 + %10108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 21, !dbg !372 + %10109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 22, !dbg !372 + %10110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 23, !dbg !372 + %10111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 24, !dbg !372 + %10112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 25, !dbg !372 + %10113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 26, !dbg !372 + %10114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 27, !dbg !372 + %10115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 28, !dbg !372 + %10116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 29, !dbg !372 + %10117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 30, !dbg !372 + %10118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 31, !dbg !372 + %10119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 32, !dbg !372 + %10120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 33, !dbg !372 + %10121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 34, !dbg !372 + %10122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 35, !dbg !372 + %10123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 36, !dbg !372 + %10124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 37, !dbg !372 + %10125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 38, !dbg !372 + %10126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 39, !dbg !372 + %10127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 40, !dbg !372 + %10128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 41, !dbg !372 + %10129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 42, !dbg !372 + %10130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 43, !dbg !372 + %10131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 44, !dbg !372 + %10132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 45, !dbg !372 + %10133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 46, !dbg !372 + %10134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 47, !dbg !372 + %10135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 48, !dbg !372 + %10136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 49, !dbg !372 + %10137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 50, !dbg !372 + %10138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 51, !dbg !372 + %10139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 52, !dbg !372 + %10140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 53, !dbg !372 + %10141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 54, !dbg !372 + %10142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 55, !dbg !372 + %10143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 56, !dbg !372 + %10144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 57, !dbg !372 + %10145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 58, !dbg !372 + %10146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 59, !dbg !372 + %10147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 60, !dbg !372 + %10148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 61, !dbg !372 + %10149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 62, !dbg !372 + %10150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10086, 63, !dbg !372 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !372 + %10151 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %9065, !dbg !358 + %10152 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5332, !dbg !358 + %10153 = load float, ptr addrspace(3) %10152, align 8, !dbg !358 + %10154 = getelementptr inbounds nuw i8, ptr addrspace(3) %10152, i32 4, !dbg !358 + %10155 = load float, ptr addrspace(3) %10154, align 4, !dbg !358 + %10156 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5336, !dbg !358 + %10157 = load float, ptr addrspace(3) %10156, align 8, !dbg !358 + %10158 = getelementptr inbounds nuw i8, ptr addrspace(3) %10156, i32 4, !dbg !358 + %10159 = load float, ptr addrspace(3) %10158, align 4, !dbg !358 + %10160 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5342, !dbg !358 + %10161 = load float, ptr addrspace(3) %10160, align 8, !dbg !358 + %10162 = getelementptr inbounds nuw i8, ptr addrspace(3) %10160, i32 4, !dbg !358 + %10163 = load float, ptr addrspace(3) %10162, align 4, !dbg !358 + %10164 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5348, !dbg !358 + %10165 = load float, ptr addrspace(3) %10164, align 8, !dbg !358 + %10166 = getelementptr inbounds nuw i8, ptr addrspace(3) %10164, i32 4, !dbg !358 + %10167 = load float, ptr addrspace(3) %10166, align 4, !dbg !358 + %10168 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5354, !dbg !358 + %10169 = load float, ptr addrspace(3) %10168, align 8, !dbg !358 + %10170 = getelementptr inbounds nuw i8, ptr addrspace(3) %10168, i32 4, !dbg !358 + %10171 = load float, ptr addrspace(3) %10170, align 4, !dbg !358 + %10172 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5360, !dbg !358 + %10173 = load float, ptr addrspace(3) %10172, align 8, !dbg !358 + %10174 = getelementptr inbounds nuw i8, ptr addrspace(3) %10172, i32 4, !dbg !358 + %10175 = load float, ptr addrspace(3) %10174, align 4, !dbg !358 + %10176 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5366, !dbg !358 + %10177 = load float, ptr addrspace(3) %10176, align 8, !dbg !358 + %10178 = getelementptr inbounds nuw i8, ptr addrspace(3) %10176, i32 4, !dbg !358 + %10179 = load float, ptr addrspace(3) %10178, align 4, !dbg !358 + %10180 = getelementptr inbounds nuw i8, ptr addrspace(3) %10151, i32 %5372, !dbg !358 + %10181 = load float, ptr addrspace(3) %10180, align 8, !dbg !358 + %10182 = getelementptr inbounds nuw i8, ptr addrspace(3) %10180, i32 4, !dbg !358 + %10183 = load float, ptr addrspace(3) %10182, align 4, !dbg !358 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !373 + %10184 = add i32 %9133, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10185 = lshr exact i32 %10184, 4, !dbg !373 + %10186 = and i32 %10185, 16383, !dbg !373 + %10187 = zext nneg i32 %10186 to i64, !dbg !373 + %10188 = or disjoint i64 %10187, 4611686293372403712, !dbg !373 + %10189 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %10188, i64 %9875) #3, !dbg !373 + %10190 = add i32 %9145, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10191 = lshr exact i32 %10190, 4, !dbg !373 + %10192 = and i32 %10191, 16383, !dbg !373 + %10193 = zext nneg i32 %10192 to i64, !dbg !373 + %10194 = or disjoint i64 %10193, 4611686293372403712, !dbg !373 + %10195 = add i32 %9871, 32, !dbg !373 + %10196 = lshr exact i32 %10195, 4, !dbg !373 + %10197 = and i32 %10196, 16383, !dbg !373 + %10198 = zext nneg i32 %10197 to i64, !dbg !373 + %10199 = or disjoint i64 %10198, 4611686293338849280, !dbg !373 + %10200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 0, !dbg !373 + %10201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 1, !dbg !373 + %10202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 2, !dbg !373 + %10203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 3, !dbg !373 + %10204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 4, !dbg !373 + %10205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 5, !dbg !373 + %10206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 6, !dbg !373 + %10207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 7, !dbg !373 + %10208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 8, !dbg !373 + %10209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 9, !dbg !373 + %10210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 10, !dbg !373 + %10211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 11, !dbg !373 + %10212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 12, !dbg !373 + %10213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 13, !dbg !373 + %10214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 14, !dbg !373 + %10215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 15, !dbg !373 + %10216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 16, !dbg !373 + %10217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 17, !dbg !373 + %10218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 18, !dbg !373 + %10219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 19, !dbg !373 + %10220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 20, !dbg !373 + %10221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 21, !dbg !373 + %10222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 22, !dbg !373 + %10223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 23, !dbg !373 + %10224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 24, !dbg !373 + %10225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 25, !dbg !373 + %10226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 26, !dbg !373 + %10227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 27, !dbg !373 + %10228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 28, !dbg !373 + %10229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 29, !dbg !373 + %10230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 30, !dbg !373 + %10231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10189, 31, !dbg !373 + %10232 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10200, float %10201, float %10202, float %10203, float %10204, float %10205, float %10206, float %10207, float %10208, float %10209, float %10210, float %10211, float %10212, float %10213, float %10214, float %10215, float %10216, float %10217, float %10218, float %10219, float %10220, float %10221, float %10222, float %10223, float %10224, float %10225, float %10226, float %10227, float %10228, float %10229, float %10230, float %10231, i64 %10194, i64 %10199, i1 true) #3, !dbg !373 + %10233 = add i32 %9189, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10234 = lshr exact i32 %10233, 4, !dbg !373 + %10235 = and i32 %10234, 16383, !dbg !373 + %10236 = zext nneg i32 %10235 to i64, !dbg !373 + %10237 = or disjoint i64 %10236, 4611686293372403712, !dbg !373 + %10238 = add i32 %9871, 64, !dbg !373 + %10239 = lshr exact i32 %10238, 4, !dbg !373 + %10240 = and i32 %10239, 16383, !dbg !373 + %10241 = zext nneg i32 %10240 to i64, !dbg !373 + %10242 = or disjoint i64 %10241, 4611686293338849280, !dbg !373 + %10243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 0, !dbg !373 + %10244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 1, !dbg !373 + %10245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 2, !dbg !373 + %10246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 3, !dbg !373 + %10247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 4, !dbg !373 + %10248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 5, !dbg !373 + %10249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 6, !dbg !373 + %10250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 7, !dbg !373 + %10251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 8, !dbg !373 + %10252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 9, !dbg !373 + %10253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 10, !dbg !373 + %10254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 11, !dbg !373 + %10255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 12, !dbg !373 + %10256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 13, !dbg !373 + %10257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 14, !dbg !373 + %10258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 15, !dbg !373 + %10259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 16, !dbg !373 + %10260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 17, !dbg !373 + %10261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 18, !dbg !373 + %10262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 19, !dbg !373 + %10263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 20, !dbg !373 + %10264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 21, !dbg !373 + %10265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 22, !dbg !373 + %10266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 23, !dbg !373 + %10267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 24, !dbg !373 + %10268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 25, !dbg !373 + %10269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 26, !dbg !373 + %10270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 27, !dbg !373 + %10271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 28, !dbg !373 + %10272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 29, !dbg !373 + %10273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 30, !dbg !373 + %10274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10232, 31, !dbg !373 + %10275 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10243, float %10244, float %10245, float %10246, float %10247, float %10248, float %10249, float %10250, float %10251, float %10252, float %10253, float %10254, float %10255, float %10256, float %10257, float %10258, float %10259, float %10260, float %10261, float %10262, float %10263, float %10264, float %10265, float %10266, float %10267, float %10268, float %10269, float %10270, float %10271, float %10272, float %10273, float %10274, i64 %10237, i64 %10242, i1 true) #3, !dbg !373 + %10276 = add i32 %9233, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10277 = lshr exact i32 %10276, 4, !dbg !373 + %10278 = and i32 %10277, 16383, !dbg !373 + %10279 = zext nneg i32 %10278 to i64, !dbg !373 + %10280 = or disjoint i64 %10279, 4611686293372403712, !dbg !373 + %10281 = add i32 %9871, 96, !dbg !373 + %10282 = lshr exact i32 %10281, 4, !dbg !373 + %10283 = and i32 %10282, 16383, !dbg !373 + %10284 = zext nneg i32 %10283 to i64, !dbg !373 + %10285 = or disjoint i64 %10284, 4611686293338849280, !dbg !373 + %10286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 0, !dbg !373 + %10287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 1, !dbg !373 + %10288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 2, !dbg !373 + %10289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 3, !dbg !373 + %10290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 4, !dbg !373 + %10291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 5, !dbg !373 + %10292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 6, !dbg !373 + %10293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 7, !dbg !373 + %10294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 8, !dbg !373 + %10295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 9, !dbg !373 + %10296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 10, !dbg !373 + %10297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 11, !dbg !373 + %10298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 12, !dbg !373 + %10299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 13, !dbg !373 + %10300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 14, !dbg !373 + %10301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 15, !dbg !373 + %10302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 16, !dbg !373 + %10303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 17, !dbg !373 + %10304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 18, !dbg !373 + %10305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 19, !dbg !373 + %10306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 20, !dbg !373 + %10307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 21, !dbg !373 + %10308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 22, !dbg !373 + %10309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 23, !dbg !373 + %10310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 24, !dbg !373 + %10311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 25, !dbg !373 + %10312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 26, !dbg !373 + %10313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 27, !dbg !373 + %10314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 28, !dbg !373 + %10315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 29, !dbg !373 + %10316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 30, !dbg !373 + %10317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10275, 31, !dbg !373 + %10318 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10286, float %10287, float %10288, float %10289, float %10290, float %10291, float %10292, float %10293, float %10294, float %10295, float %10296, float %10297, float %10298, float %10299, float %10300, float %10301, float %10302, float %10303, float %10304, float %10305, float %10306, float %10307, float %10308, float %10309, float %10310, float %10311, float %10312, float %10313, float %10314, float %10315, float %10316, float %10317, i64 %10280, i64 %10285, i1 true) #3, !dbg !373 + %10319 = add i32 %9277, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10320 = lshr exact i32 %10319, 4, !dbg !373 + %10321 = and i32 %10320, 16383, !dbg !373 + %10322 = zext nneg i32 %10321 to i64, !dbg !373 + %10323 = or disjoint i64 %10322, 4611686293372403712, !dbg !373 + %10324 = add i32 %9871, 8192, !dbg !373 + %10325 = lshr exact i32 %10324, 4, !dbg !373 + %10326 = and i32 %10325, 16383, !dbg !373 + %10327 = zext nneg i32 %10326 to i64, !dbg !373 + %10328 = or disjoint i64 %10327, 4611686293338849280, !dbg !373 + %10329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 0, !dbg !373 + %10330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 1, !dbg !373 + %10331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 2, !dbg !373 + %10332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 3, !dbg !373 + %10333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 4, !dbg !373 + %10334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 5, !dbg !373 + %10335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 6, !dbg !373 + %10336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 7, !dbg !373 + %10337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 8, !dbg !373 + %10338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 9, !dbg !373 + %10339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 10, !dbg !373 + %10340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 11, !dbg !373 + %10341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 12, !dbg !373 + %10342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 13, !dbg !373 + %10343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 14, !dbg !373 + %10344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 15, !dbg !373 + %10345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 16, !dbg !373 + %10346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 17, !dbg !373 + %10347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 18, !dbg !373 + %10348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 19, !dbg !373 + %10349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 20, !dbg !373 + %10350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 21, !dbg !373 + %10351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 22, !dbg !373 + %10352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 23, !dbg !373 + %10353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 24, !dbg !373 + %10354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 25, !dbg !373 + %10355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 26, !dbg !373 + %10356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 27, !dbg !373 + %10357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 28, !dbg !373 + %10358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 29, !dbg !373 + %10359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 30, !dbg !373 + %10360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10318, 31, !dbg !373 + %10361 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10329, float %10330, float %10331, float %10332, float %10333, float %10334, float %10335, float %10336, float %10337, float %10338, float %10339, float %10340, float %10341, float %10342, float %10343, float %10344, float %10345, float %10346, float %10347, float %10348, float %10349, float %10350, float %10351, float %10352, float %10353, float %10354, float %10355, float %10356, float %10357, float %10358, float %10359, float %10360, i64 %10323, i64 %10328, i1 true) #3, !dbg !373 + %10362 = add i32 %9321, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10363 = lshr exact i32 %10362, 4, !dbg !373 + %10364 = and i32 %10363, 16383, !dbg !373 + %10365 = zext nneg i32 %10364 to i64, !dbg !373 + %10366 = or disjoint i64 %10365, 4611686293372403712, !dbg !373 + %10367 = add i32 %9871, 8224, !dbg !373 + %10368 = lshr exact i32 %10367, 4, !dbg !373 + %10369 = and i32 %10368, 16383, !dbg !373 + %10370 = zext nneg i32 %10369 to i64, !dbg !373 + %10371 = or disjoint i64 %10370, 4611686293338849280, !dbg !373 + %10372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 0, !dbg !373 + %10373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 1, !dbg !373 + %10374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 2, !dbg !373 + %10375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 3, !dbg !373 + %10376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 4, !dbg !373 + %10377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 5, !dbg !373 + %10378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 6, !dbg !373 + %10379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 7, !dbg !373 + %10380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 8, !dbg !373 + %10381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 9, !dbg !373 + %10382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 10, !dbg !373 + %10383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 11, !dbg !373 + %10384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 12, !dbg !373 + %10385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 13, !dbg !373 + %10386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 14, !dbg !373 + %10387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 15, !dbg !373 + %10388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 16, !dbg !373 + %10389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 17, !dbg !373 + %10390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 18, !dbg !373 + %10391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 19, !dbg !373 + %10392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 20, !dbg !373 + %10393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 21, !dbg !373 + %10394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 22, !dbg !373 + %10395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 23, !dbg !373 + %10396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 24, !dbg !373 + %10397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 25, !dbg !373 + %10398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 26, !dbg !373 + %10399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 27, !dbg !373 + %10400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 28, !dbg !373 + %10401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 29, !dbg !373 + %10402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 30, !dbg !373 + %10403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10361, 31, !dbg !373 + %10404 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10372, float %10373, float %10374, float %10375, float %10376, float %10377, float %10378, float %10379, float %10380, float %10381, float %10382, float %10383, float %10384, float %10385, float %10386, float %10387, float %10388, float %10389, float %10390, float %10391, float %10392, float %10393, float %10394, float %10395, float %10396, float %10397, float %10398, float %10399, float %10400, float %10401, float %10402, float %10403, i64 %10366, i64 %10371, i1 true) #3, !dbg !373 + %10405 = add i32 %9365, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10406 = lshr exact i32 %10405, 4, !dbg !373 + %10407 = and i32 %10406, 16383, !dbg !373 + %10408 = zext nneg i32 %10407 to i64, !dbg !373 + %10409 = or disjoint i64 %10408, 4611686293372403712, !dbg !373 + %10410 = add i32 %9871, 8256, !dbg !373 + %10411 = lshr exact i32 %10410, 4, !dbg !373 + %10412 = and i32 %10411, 16383, !dbg !373 + %10413 = zext nneg i32 %10412 to i64, !dbg !373 + %10414 = or disjoint i64 %10413, 4611686293338849280, !dbg !373 + %10415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 0, !dbg !373 + %10416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 1, !dbg !373 + %10417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 2, !dbg !373 + %10418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 3, !dbg !373 + %10419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 4, !dbg !373 + %10420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 5, !dbg !373 + %10421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 6, !dbg !373 + %10422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 7, !dbg !373 + %10423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 8, !dbg !373 + %10424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 9, !dbg !373 + %10425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 10, !dbg !373 + %10426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 11, !dbg !373 + %10427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 12, !dbg !373 + %10428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 13, !dbg !373 + %10429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 14, !dbg !373 + %10430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 15, !dbg !373 + %10431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 16, !dbg !373 + %10432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 17, !dbg !373 + %10433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 18, !dbg !373 + %10434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 19, !dbg !373 + %10435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 20, !dbg !373 + %10436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 21, !dbg !373 + %10437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 22, !dbg !373 + %10438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 23, !dbg !373 + %10439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 24, !dbg !373 + %10440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 25, !dbg !373 + %10441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 26, !dbg !373 + %10442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 27, !dbg !373 + %10443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 28, !dbg !373 + %10444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 29, !dbg !373 + %10445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 30, !dbg !373 + %10446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10404, 31, !dbg !373 + %10447 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10415, float %10416, float %10417, float %10418, float %10419, float %10420, float %10421, float %10422, float %10423, float %10424, float %10425, float %10426, float %10427, float %10428, float %10429, float %10430, float %10431, float %10432, float %10433, float %10434, float %10435, float %10436, float %10437, float %10438, float %10439, float %10440, float %10441, float %10442, float %10443, float %10444, float %10445, float %10446, i64 %10409, i64 %10414, i1 true) #3, !dbg !373 + %10448 = add i32 %9409, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !373 + %10449 = lshr exact i32 %10448, 4, !dbg !373 + %10450 = and i32 %10449, 16383, !dbg !373 + %10451 = zext nneg i32 %10450 to i64, !dbg !373 + %10452 = or disjoint i64 %10451, 4611686293372403712, !dbg !373 + %10453 = add i32 %9871, 8288, !dbg !373 + %10454 = lshr exact i32 %10453, 4, !dbg !373 + %10455 = and i32 %10454, 16383, !dbg !373 + %10456 = zext nneg i32 %10455 to i64, !dbg !373 + %10457 = or disjoint i64 %10456, 4611686293338849280, !dbg !373 + %10458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 0, !dbg !373 + %10459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 1, !dbg !373 + %10460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 2, !dbg !373 + %10461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 3, !dbg !373 + %10462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 4, !dbg !373 + %10463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 5, !dbg !373 + %10464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 6, !dbg !373 + %10465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 7, !dbg !373 + %10466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 8, !dbg !373 + %10467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 9, !dbg !373 + %10468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 10, !dbg !373 + %10469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 11, !dbg !373 + %10470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 12, !dbg !373 + %10471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 13, !dbg !373 + %10472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 14, !dbg !373 + %10473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 15, !dbg !373 + %10474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 16, !dbg !373 + %10475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 17, !dbg !373 + %10476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 18, !dbg !373 + %10477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 19, !dbg !373 + %10478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 20, !dbg !373 + %10479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 21, !dbg !373 + %10480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 22, !dbg !373 + %10481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 23, !dbg !373 + %10482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 24, !dbg !373 + %10483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 25, !dbg !373 + %10484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 26, !dbg !373 + %10485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 27, !dbg !373 + %10486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 28, !dbg !373 + %10487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 29, !dbg !373 + %10488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 30, !dbg !373 + %10489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10447, 31, !dbg !373 + %10490 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %10458, float %10459, float %10460, float %10461, float %10462, float %10463, float %10464, float %10465, float %10466, float %10467, float %10468, float %10469, float %10470, float %10471, float %10472, float %10473, float %10474, float %10475, float %10476, float %10477, float %10478, float %10479, float %10480, float %10481, float %10482, float %10483, float %10484, float %10485, float %10486, float %10487, float %10488, float %10489, i64 %10452, i64 %10457, i1 true) #3, !dbg !373 + %10491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 0, !dbg !373 + %10492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 1, !dbg !373 + %10493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 2, !dbg !373 + %10494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 3, !dbg !373 + %10495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 4, !dbg !373 + %10496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 5, !dbg !373 + %10497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 6, !dbg !373 + %10498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 7, !dbg !373 + %10499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 8, !dbg !373 + %10500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 9, !dbg !373 + %10501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 10, !dbg !373 + %10502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 11, !dbg !373 + %10503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 12, !dbg !373 + %10504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 13, !dbg !373 + %10505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 14, !dbg !373 + %10506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 15, !dbg !373 + %10507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 16, !dbg !373 + %10508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 17, !dbg !373 + %10509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 18, !dbg !373 + %10510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 19, !dbg !373 + %10511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 20, !dbg !373 + %10512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 21, !dbg !373 + %10513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 22, !dbg !373 + %10514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 23, !dbg !373 + %10515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 24, !dbg !373 + %10516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 25, !dbg !373 + %10517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 26, !dbg !373 + %10518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 27, !dbg !373 + %10519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 28, !dbg !373 + %10520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 29, !dbg !373 + %10521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 30, !dbg !373 + %10522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10490, 31, !dbg !373 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !373 + %10523 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %10491, float %10492, float %10493, float %10494, float %10495, float %10496, float %10497, float %10498, float %10499, float %10500, float %10501, float %10502, float %10503, float %10504, float %10505, float %10506, float %10507, float %10508, float %10509, float %10510, float %10511, float %10512, float %10513, float %10514, float %10515, float %10516, float %10517, float %10518, float %10519, float %10520, float %10521, float %10522, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %9806, i32 0, i32 0) #3, !dbg !373 + %10524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 0, !dbg !373 + %10525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 1, !dbg !373 + %10526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 2, !dbg !373 + %10527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 3, !dbg !373 + %10528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 4, !dbg !373 + %10529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 5, !dbg !373 + %10530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 6, !dbg !373 + %10531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 7, !dbg !373 + %10532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 8, !dbg !373 + %10533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 9, !dbg !373 + %10534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 10, !dbg !373 + %10535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 11, !dbg !373 + %10536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 12, !dbg !373 + %10537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 13, !dbg !373 + %10538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 14, !dbg !373 + %10539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 15, !dbg !373 + %10540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 16, !dbg !373 + %10541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 17, !dbg !373 + %10542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 18, !dbg !373 + %10543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 19, !dbg !373 + %10544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 20, !dbg !373 + %10545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 21, !dbg !373 + %10546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 22, !dbg !373 + %10547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 23, !dbg !373 + %10548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 24, !dbg !373 + %10549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 25, !dbg !373 + %10550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 26, !dbg !373 + %10551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 27, !dbg !373 + %10552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 28, !dbg !373 + %10553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 29, !dbg !373 + %10554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 30, !dbg !373 + %10555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %10523, 31, !dbg !373 + %10556 = fsub float %10524, %10153, !dbg !374 + %10557 = fsub float %10525, %10155, !dbg !374 + %10558 = fsub float %10526, %10153, !dbg !374 + %10559 = fsub float %10527, %10155, !dbg !374 + %10560 = fsub float %10528, %10157, !dbg !374 + %10561 = fsub float %10529, %10159, !dbg !374 + %10562 = fsub float %10530, %10157, !dbg !374 + %10563 = fsub float %10531, %10159, !dbg !374 + %10564 = fsub float %10532, %10161, !dbg !374 + %10565 = fsub float %10533, %10163, !dbg !374 + %10566 = fsub float %10534, %10161, !dbg !374 + %10567 = fsub float %10535, %10163, !dbg !374 + %10568 = fsub float %10536, %10165, !dbg !374 + %10569 = fsub float %10537, %10167, !dbg !374 + %10570 = fsub float %10538, %10165, !dbg !374 + %10571 = fsub float %10539, %10167, !dbg !374 + %10572 = fsub float %10540, %10169, !dbg !374 + %10573 = fsub float %10541, %10171, !dbg !374 + %10574 = fsub float %10542, %10169, !dbg !374 + %10575 = fsub float %10543, %10171, !dbg !374 + %10576 = fsub float %10544, %10173, !dbg !374 + %10577 = fsub float %10545, %10175, !dbg !374 + %10578 = fsub float %10546, %10173, !dbg !374 + %10579 = fsub float %10547, %10175, !dbg !374 + %10580 = fsub float %10548, %10177, !dbg !374 + %10581 = fsub float %10549, %10179, !dbg !374 + %10582 = fsub float %10550, %10177, !dbg !374 + %10583 = fsub float %10551, %10179, !dbg !374 + %10584 = fsub float %10552, %10181, !dbg !374 + %10585 = fsub float %10553, %10183, !dbg !374 + %10586 = fsub float %10554, %10181, !dbg !374 + %10587 = fsub float %10555, %10183, !dbg !374 + %10588 = fmul float %.0.i, %10556, !dbg !375 + %10589 = fmul float %.0.i1148, %10557, !dbg !375 + %10590 = fmul float %.0.i1151, %10558, !dbg !375 + %10591 = fmul float %.0.i1154, %10559, !dbg !375 + %10592 = fmul float %.0.i1157, %10560, !dbg !375 + %10593 = fmul float %.0.i1160, %10561, !dbg !375 + %10594 = fmul float %.0.i1163, %10562, !dbg !375 + %10595 = fmul float %.0.i1166, %10563, !dbg !375 + %10596 = fmul float %.0.i1169, %10564, !dbg !375 + %10597 = fmul float %.0.i1172, %10565, !dbg !375 + %10598 = fmul float %.0.i1175, %10566, !dbg !375 + %10599 = fmul float %.0.i1178, %10567, !dbg !375 + %10600 = fmul float %.0.i1181, %10568, !dbg !375 + %10601 = fmul float %.0.i1184, %10569, !dbg !375 + %10602 = fmul float %.0.i1187, %10570, !dbg !375 + %10603 = fmul float %.0.i1190, %10571, !dbg !375 + %10604 = fmul float %.0.i1193, %10572, !dbg !375 + %10605 = fmul float %.0.i1196, %10573, !dbg !375 + %10606 = fmul float %.0.i1199, %10574, !dbg !375 + %10607 = fmul float %.0.i1202, %10575, !dbg !375 + %10608 = fmul float %.0.i1205, %10576, !dbg !375 + %10609 = fmul float %.0.i1208, %10577, !dbg !375 + %10610 = fmul float %.0.i1211, %10578, !dbg !375 + %10611 = fmul float %.0.i1214, %10579, !dbg !375 + %10612 = fmul float %.0.i1217, %10580, !dbg !375 + %10613 = fmul float %.0.i1220, %10581, !dbg !375 + %10614 = fmul float %.0.i1223, %10582, !dbg !375 + %10615 = fmul float %.0.i1226, %10583, !dbg !375 + %10616 = fmul float %.0.i1229, %10584, !dbg !375 + %10617 = fmul float %.0.i1232, %10585, !dbg !375 + %10618 = fmul float %.0.i1235, %10586, !dbg !375 + %10619 = fmul float %.0.i1238, %10587, !dbg !375 + %10620 = fptrunc float %10588 to bfloat, !dbg !376 + %10621 = select i1 %9047, bfloat %10620, bfloat 0xR0000, !dbg !377 + %10622 = fptrunc float %10589 to bfloat, !dbg !376 + %10623 = select i1 %9048, bfloat %10622, bfloat 0xR0000, !dbg !377 + %10624 = fptrunc float %10590 to bfloat, !dbg !376 + %10625 = select i1 %9047, bfloat %10624, bfloat 0xR0000, !dbg !377 + %10626 = fptrunc float %10591 to bfloat, !dbg !376 + %10627 = select i1 %9048, bfloat %10626, bfloat 0xR0000, !dbg !377 + %10628 = fptrunc float %10592 to bfloat, !dbg !376 + %10629 = select i1 %9049, bfloat %10628, bfloat 0xR0000, !dbg !377 + %10630 = fptrunc float %10593 to bfloat, !dbg !376 + %10631 = select i1 %9050, bfloat %10630, bfloat 0xR0000, !dbg !377 + %10632 = fptrunc float %10594 to bfloat, !dbg !376 + %10633 = select i1 %9049, bfloat %10632, bfloat 0xR0000, !dbg !377 + %10634 = fptrunc float %10595 to bfloat, !dbg !376 + %10635 = select i1 %9050, bfloat %10634, bfloat 0xR0000, !dbg !377 + %10636 = fptrunc float %10596 to bfloat, !dbg !376 + %10637 = select i1 %9051, bfloat %10636, bfloat 0xR0000, !dbg !377 + %10638 = fptrunc float %10597 to bfloat, !dbg !376 + %10639 = select i1 %9052, bfloat %10638, bfloat 0xR0000, !dbg !377 + %10640 = fptrunc float %10598 to bfloat, !dbg !376 + %10641 = select i1 %9051, bfloat %10640, bfloat 0xR0000, !dbg !377 + %10642 = fptrunc float %10599 to bfloat, !dbg !376 + %10643 = select i1 %9052, bfloat %10642, bfloat 0xR0000, !dbg !377 + %10644 = fptrunc float %10600 to bfloat, !dbg !376 + %10645 = select i1 %9053, bfloat %10644, bfloat 0xR0000, !dbg !377 + %10646 = fptrunc float %10601 to bfloat, !dbg !376 + %10647 = select i1 %9054, bfloat %10646, bfloat 0xR0000, !dbg !377 + %10648 = fptrunc float %10602 to bfloat, !dbg !376 + %10649 = select i1 %9053, bfloat %10648, bfloat 0xR0000, !dbg !377 + %10650 = fptrunc float %10603 to bfloat, !dbg !376 + %10651 = select i1 %9054, bfloat %10650, bfloat 0xR0000, !dbg !377 + %10652 = fptrunc float %10604 to bfloat, !dbg !376 + %10653 = select i1 %9055, bfloat %10652, bfloat 0xR0000, !dbg !377 + %10654 = fptrunc float %10605 to bfloat, !dbg !376 + %10655 = select i1 %9056, bfloat %10654, bfloat 0xR0000, !dbg !377 + %10656 = fptrunc float %10606 to bfloat, !dbg !376 + %10657 = select i1 %9055, bfloat %10656, bfloat 0xR0000, !dbg !377 + %10658 = fptrunc float %10607 to bfloat, !dbg !376 + %10659 = select i1 %9056, bfloat %10658, bfloat 0xR0000, !dbg !377 + %10660 = fptrunc float %10608 to bfloat, !dbg !376 + %10661 = select i1 %9057, bfloat %10660, bfloat 0xR0000, !dbg !377 + %10662 = fptrunc float %10609 to bfloat, !dbg !376 + %10663 = select i1 %9058, bfloat %10662, bfloat 0xR0000, !dbg !377 + %10664 = fptrunc float %10610 to bfloat, !dbg !376 + %10665 = select i1 %9057, bfloat %10664, bfloat 0xR0000, !dbg !377 + %10666 = fptrunc float %10611 to bfloat, !dbg !376 + %10667 = select i1 %9058, bfloat %10666, bfloat 0xR0000, !dbg !377 + %10668 = fptrunc float %10612 to bfloat, !dbg !376 + %10669 = select i1 %9059, bfloat %10668, bfloat 0xR0000, !dbg !377 + %10670 = fptrunc float %10613 to bfloat, !dbg !376 + %10671 = select i1 %9060, bfloat %10670, bfloat 0xR0000, !dbg !377 + %10672 = fptrunc float %10614 to bfloat, !dbg !376 + %10673 = select i1 %9059, bfloat %10672, bfloat 0xR0000, !dbg !377 + %10674 = fptrunc float %10615 to bfloat, !dbg !376 + %10675 = select i1 %9060, bfloat %10674, bfloat 0xR0000, !dbg !377 + %10676 = fptrunc float %10616 to bfloat, !dbg !376 + %10677 = select i1 %9061, bfloat %10676, bfloat 0xR0000, !dbg !377 + %10678 = fptrunc float %10617 to bfloat, !dbg !376 + %10679 = select i1 %9062, bfloat %10678, bfloat 0xR0000, !dbg !377 + %10680 = fptrunc float %10618 to bfloat, !dbg !376 + %10681 = select i1 %9061, bfloat %10680, bfloat 0xR0000, !dbg !377 + %10682 = fptrunc float %10619 to bfloat, !dbg !376 + %10683 = select i1 %9062, bfloat %10682, bfloat 0xR0000, !dbg !377 + %10684 = insertelement <2 x bfloat> poison, bfloat %10621, i64 0, !dbg !378 + %10685 = insertelement <2 x bfloat> %10684, bfloat %10623, i64 1, !dbg !378 + %10686 = bitcast <2 x bfloat> %10685 to i32, !dbg !378 + %10687 = insertelement <2 x bfloat> poison, bfloat %10625, i64 0, !dbg !378 + %10688 = insertelement <2 x bfloat> %10687, bfloat %10627, i64 1, !dbg !378 + %10689 = bitcast <2 x bfloat> %10688 to i32, !dbg !378 + %10690 = insertelement <2 x bfloat> poison, bfloat %10629, i64 0, !dbg !378 + %10691 = insertelement <2 x bfloat> %10690, bfloat %10631, i64 1, !dbg !378 + %10692 = bitcast <2 x bfloat> %10691 to i32, !dbg !378 + %10693 = insertelement <2 x bfloat> poison, bfloat %10633, i64 0, !dbg !378 + %10694 = insertelement <2 x bfloat> %10693, bfloat %10635, i64 1, !dbg !378 + %10695 = bitcast <2 x bfloat> %10694 to i32, !dbg !378 + %10696 = insertelement <2 x bfloat> poison, bfloat %10637, i64 0, !dbg !378 + %10697 = insertelement <2 x bfloat> %10696, bfloat %10639, i64 1, !dbg !378 + %10698 = bitcast <2 x bfloat> %10697 to i32, !dbg !378 + %10699 = insertelement <2 x bfloat> poison, bfloat %10641, i64 0, !dbg !378 + %10700 = insertelement <2 x bfloat> %10699, bfloat %10643, i64 1, !dbg !378 + %10701 = bitcast <2 x bfloat> %10700 to i32, !dbg !378 + %10702 = insertelement <2 x bfloat> poison, bfloat %10645, i64 0, !dbg !378 + %10703 = insertelement <2 x bfloat> %10702, bfloat %10647, i64 1, !dbg !378 + %10704 = bitcast <2 x bfloat> %10703 to i32, !dbg !378 + %10705 = insertelement <2 x bfloat> poison, bfloat %10649, i64 0, !dbg !378 + %10706 = insertelement <2 x bfloat> %10705, bfloat %10651, i64 1, !dbg !378 + %10707 = bitcast <2 x bfloat> %10706 to i32, !dbg !378 + %10708 = insertelement <2 x bfloat> poison, bfloat %10653, i64 0, !dbg !378 + %10709 = insertelement <2 x bfloat> %10708, bfloat %10655, i64 1, !dbg !378 + %10710 = bitcast <2 x bfloat> %10709 to i32, !dbg !378 + %10711 = insertelement <2 x bfloat> poison, bfloat %10657, i64 0, !dbg !378 + %10712 = insertelement <2 x bfloat> %10711, bfloat %10659, i64 1, !dbg !378 + %10713 = bitcast <2 x bfloat> %10712 to i32, !dbg !378 + %10714 = insertelement <2 x bfloat> poison, bfloat %10661, i64 0, !dbg !378 + %10715 = insertelement <2 x bfloat> %10714, bfloat %10663, i64 1, !dbg !378 + %10716 = bitcast <2 x bfloat> %10715 to i32, !dbg !378 + %10717 = insertelement <2 x bfloat> poison, bfloat %10665, i64 0, !dbg !378 + %10718 = insertelement <2 x bfloat> %10717, bfloat %10667, i64 1, !dbg !378 + %10719 = bitcast <2 x bfloat> %10718 to i32, !dbg !378 + %10720 = insertelement <2 x bfloat> poison, bfloat %10669, i64 0, !dbg !378 + %10721 = insertelement <2 x bfloat> %10720, bfloat %10671, i64 1, !dbg !378 + %10722 = bitcast <2 x bfloat> %10721 to i32, !dbg !378 + %10723 = insertelement <2 x bfloat> poison, bfloat %10673, i64 0, !dbg !378 + %10724 = insertelement <2 x bfloat> %10723, bfloat %10675, i64 1, !dbg !378 + %10725 = bitcast <2 x bfloat> %10724 to i32, !dbg !378 + %10726 = insertelement <2 x bfloat> poison, bfloat %10677, i64 0, !dbg !378 + %10727 = insertelement <2 x bfloat> %10726, bfloat %10679, i64 1, !dbg !378 + %10728 = bitcast <2 x bfloat> %10727 to i32, !dbg !378 + %10729 = insertelement <2 x bfloat> poison, bfloat %10681, i64 0, !dbg !378 + %10730 = insertelement <2 x bfloat> %10729, bfloat %10683, i64 1, !dbg !378 + %10731 = bitcast <2 x bfloat> %10730 to i32, !dbg !378 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !378 + %10732 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn3471624, float %.pn3451625, float %.pn3431626, float %.pn3411627, float %.pn3391628, float %.pn3371629, float %.pn3351630, float %.pn3331631, float %.pn3311632, float %.pn3291633, float %.pn3271634, float %.pn3251635, float %.pn3231636, float %.pn3211637, float %.pn3191638, float %.pn3171639, float %.pn3151640, float %.pn3131641, float %.pn3111642, float %.pn3091643, float %.pn3071644, float %.pn3051645, float %.pn3031646, float %.pn3011647, float %.pn2991648, float %.pn2971649, float %.pn2951650, float %.pn2931651, float %.pn2911652, float %.pn2891653, float %.pn2871654, float %.pn2851655, float %.pn2831656, float %.pn2811657, float %.pn2791658, float %.pn2771659, float %.pn2751660, float %.pn2731661, float %.pn2711662, float %.pn2691663, float %.pn2671664, float %.pn2651665, float %.pn2631666, float %.pn2611667, float %.pn2591668, float %.pn2571669, float %.pn2551670, float %.pn2531671, float %.pn2511672, float %.pn2491673, float %.pn2471674, float %.pn2451675, float %.pn2431676, float %.pn2411677, float %.pn2391678, float %.pn2371679, float %.pn2351680, float %.pn2331681, float %.pn2311682, float %.pn2291683, float %.pn2271684, float %.pn2251685, float %.pn2231686, float %.pn2211687, i32 %10686, i32 %10689, i32 %10692, i32 %10695, i64 %9143, i1 true) #3, !dbg !378 + %10733 = add i32 %9139, 2048, !dbg !378 + %10734 = lshr exact i32 %10733, 4, !dbg !378 + %10735 = and i32 %10734, 16383, !dbg !378 + %10736 = zext nneg i32 %10735 to i64, !dbg !378 + %10737 = or disjoint i64 %10736, 4611686293338849280, !dbg !378 + %10738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 0, !dbg !378 + %10739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 1, !dbg !378 + %10740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 2, !dbg !378 + %10741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 3, !dbg !378 + %10742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 4, !dbg !378 + %10743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 5, !dbg !378 + %10744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 6, !dbg !378 + %10745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 7, !dbg !378 + %10746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 8, !dbg !378 + %10747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 9, !dbg !378 + %10748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 10, !dbg !378 + %10749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 11, !dbg !378 + %10750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 12, !dbg !378 + %10751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 13, !dbg !378 + %10752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 14, !dbg !378 + %10753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 15, !dbg !378 + %10754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 16, !dbg !378 + %10755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 17, !dbg !378 + %10756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 18, !dbg !378 + %10757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 19, !dbg !378 + %10758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 20, !dbg !378 + %10759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 21, !dbg !378 + %10760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 22, !dbg !378 + %10761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 23, !dbg !378 + %10762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 24, !dbg !378 + %10763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 25, !dbg !378 + %10764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 26, !dbg !378 + %10765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 27, !dbg !378 + %10766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 28, !dbg !378 + %10767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 29, !dbg !378 + %10768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 30, !dbg !378 + %10769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 31, !dbg !378 + %10770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 32, !dbg !378 + %10771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 33, !dbg !378 + %10772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 34, !dbg !378 + %10773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 35, !dbg !378 + %10774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 36, !dbg !378 + %10775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 37, !dbg !378 + %10776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 38, !dbg !378 + %10777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 39, !dbg !378 + %10778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 40, !dbg !378 + %10779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 41, !dbg !378 + %10780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 42, !dbg !378 + %10781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 43, !dbg !378 + %10782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 44, !dbg !378 + %10783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 45, !dbg !378 + %10784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 46, !dbg !378 + %10785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 47, !dbg !378 + %10786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 48, !dbg !378 + %10787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 49, !dbg !378 + %10788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 50, !dbg !378 + %10789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 51, !dbg !378 + %10790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 52, !dbg !378 + %10791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 53, !dbg !378 + %10792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 54, !dbg !378 + %10793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 55, !dbg !378 + %10794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 56, !dbg !378 + %10795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 57, !dbg !378 + %10796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 58, !dbg !378 + %10797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 59, !dbg !378 + %10798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 60, !dbg !378 + %10799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 61, !dbg !378 + %10800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 62, !dbg !378 + %10801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10732, 63, !dbg !378 + %10802 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10738, float %10739, float %10740, float %10741, float %10742, float %10743, float %10744, float %10745, float %10746, float %10747, float %10748, float %10749, float %10750, float %10751, float %10752, float %10753, float %10754, float %10755, float %10756, float %10757, float %10758, float %10759, float %10760, float %10761, float %10762, float %10763, float %10764, float %10765, float %10766, float %10767, float %10768, float %10769, float %10770, float %10771, float %10772, float %10773, float %10774, float %10775, float %10776, float %10777, float %10778, float %10779, float %10780, float %10781, float %10782, float %10783, float %10784, float %10785, float %10786, float %10787, float %10788, float %10789, float %10790, float %10791, float %10792, float %10793, float %10794, float %10795, float %10796, float %10797, float %10798, float %10799, float %10800, float %10801, i32 %10698, i32 %10701, i32 %10704, i32 %10707, i64 %10737, i1 true) #3, !dbg !378 + %10803 = add i32 %9139, 4096, !dbg !378 + %10804 = lshr exact i32 %10803, 4, !dbg !378 + %10805 = and i32 %10804, 16383, !dbg !378 + %10806 = zext nneg i32 %10805 to i64, !dbg !378 + %10807 = or disjoint i64 %10806, 4611686293338849280, !dbg !378 + %10808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 0, !dbg !378 + %10809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 1, !dbg !378 + %10810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 2, !dbg !378 + %10811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 3, !dbg !378 + %10812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 4, !dbg !378 + %10813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 5, !dbg !378 + %10814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 6, !dbg !378 + %10815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 7, !dbg !378 + %10816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 8, !dbg !378 + %10817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 9, !dbg !378 + %10818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 10, !dbg !378 + %10819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 11, !dbg !378 + %10820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 12, !dbg !378 + %10821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 13, !dbg !378 + %10822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 14, !dbg !378 + %10823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 15, !dbg !378 + %10824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 16, !dbg !378 + %10825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 17, !dbg !378 + %10826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 18, !dbg !378 + %10827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 19, !dbg !378 + %10828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 20, !dbg !378 + %10829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 21, !dbg !378 + %10830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 22, !dbg !378 + %10831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 23, !dbg !378 + %10832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 24, !dbg !378 + %10833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 25, !dbg !378 + %10834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 26, !dbg !378 + %10835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 27, !dbg !378 + %10836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 28, !dbg !378 + %10837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 29, !dbg !378 + %10838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 30, !dbg !378 + %10839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 31, !dbg !378 + %10840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 32, !dbg !378 + %10841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 33, !dbg !378 + %10842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 34, !dbg !378 + %10843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 35, !dbg !378 + %10844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 36, !dbg !378 + %10845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 37, !dbg !378 + %10846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 38, !dbg !378 + %10847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 39, !dbg !378 + %10848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 40, !dbg !378 + %10849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 41, !dbg !378 + %10850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 42, !dbg !378 + %10851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 43, !dbg !378 + %10852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 44, !dbg !378 + %10853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 45, !dbg !378 + %10854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 46, !dbg !378 + %10855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 47, !dbg !378 + %10856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 48, !dbg !378 + %10857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 49, !dbg !378 + %10858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 50, !dbg !378 + %10859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 51, !dbg !378 + %10860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 52, !dbg !378 + %10861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 53, !dbg !378 + %10862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 54, !dbg !378 + %10863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 55, !dbg !378 + %10864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 56, !dbg !378 + %10865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 57, !dbg !378 + %10866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 58, !dbg !378 + %10867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 59, !dbg !378 + %10868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 60, !dbg !378 + %10869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 61, !dbg !378 + %10870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 62, !dbg !378 + %10871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10802, 63, !dbg !378 + %10872 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10808, float %10809, float %10810, float %10811, float %10812, float %10813, float %10814, float %10815, float %10816, float %10817, float %10818, float %10819, float %10820, float %10821, float %10822, float %10823, float %10824, float %10825, float %10826, float %10827, float %10828, float %10829, float %10830, float %10831, float %10832, float %10833, float %10834, float %10835, float %10836, float %10837, float %10838, float %10839, float %10840, float %10841, float %10842, float %10843, float %10844, float %10845, float %10846, float %10847, float %10848, float %10849, float %10850, float %10851, float %10852, float %10853, float %10854, float %10855, float %10856, float %10857, float %10858, float %10859, float %10860, float %10861, float %10862, float %10863, float %10864, float %10865, float %10866, float %10867, float %10868, float %10869, float %10870, float %10871, i32 %10710, i32 %10713, i32 %10716, i32 %10719, i64 %10807, i1 true) #3, !dbg !378 + %10873 = add i32 %9139, 6144, !dbg !378 + %10874 = lshr exact i32 %10873, 4, !dbg !378 + %10875 = and i32 %10874, 16383, !dbg !378 + %10876 = zext nneg i32 %10875 to i64, !dbg !378 + %10877 = or disjoint i64 %10876, 4611686293338849280, !dbg !378 + %10878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 0, !dbg !378 + %10879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 1, !dbg !378 + %10880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 2, !dbg !378 + %10881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 3, !dbg !378 + %10882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 4, !dbg !378 + %10883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 5, !dbg !378 + %10884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 6, !dbg !378 + %10885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 7, !dbg !378 + %10886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 8, !dbg !378 + %10887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 9, !dbg !378 + %10888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 10, !dbg !378 + %10889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 11, !dbg !378 + %10890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 12, !dbg !378 + %10891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 13, !dbg !378 + %10892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 14, !dbg !378 + %10893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 15, !dbg !378 + %10894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 16, !dbg !378 + %10895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 17, !dbg !378 + %10896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 18, !dbg !378 + %10897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 19, !dbg !378 + %10898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 20, !dbg !378 + %10899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 21, !dbg !378 + %10900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 22, !dbg !378 + %10901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 23, !dbg !378 + %10902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 24, !dbg !378 + %10903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 25, !dbg !378 + %10904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 26, !dbg !378 + %10905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 27, !dbg !378 + %10906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 28, !dbg !378 + %10907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 29, !dbg !378 + %10908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 30, !dbg !378 + %10909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 31, !dbg !378 + %10910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 32, !dbg !378 + %10911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 33, !dbg !378 + %10912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 34, !dbg !378 + %10913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 35, !dbg !378 + %10914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 36, !dbg !378 + %10915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 37, !dbg !378 + %10916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 38, !dbg !378 + %10917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 39, !dbg !378 + %10918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 40, !dbg !378 + %10919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 41, !dbg !378 + %10920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 42, !dbg !378 + %10921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 43, !dbg !378 + %10922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 44, !dbg !378 + %10923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 45, !dbg !378 + %10924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 46, !dbg !378 + %10925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 47, !dbg !378 + %10926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 48, !dbg !378 + %10927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 49, !dbg !378 + %10928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 50, !dbg !378 + %10929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 51, !dbg !378 + %10930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 52, !dbg !378 + %10931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 53, !dbg !378 + %10932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 54, !dbg !378 + %10933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 55, !dbg !378 + %10934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 56, !dbg !378 + %10935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 57, !dbg !378 + %10936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 58, !dbg !378 + %10937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 59, !dbg !378 + %10938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 60, !dbg !378 + %10939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 61, !dbg !378 + %10940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 62, !dbg !378 + %10941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10872, 63, !dbg !378 + %10942 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10878, float %10879, float %10880, float %10881, float %10882, float %10883, float %10884, float %10885, float %10886, float %10887, float %10888, float %10889, float %10890, float %10891, float %10892, float %10893, float %10894, float %10895, float %10896, float %10897, float %10898, float %10899, float %10900, float %10901, float %10902, float %10903, float %10904, float %10905, float %10906, float %10907, float %10908, float %10909, float %10910, float %10911, float %10912, float %10913, float %10914, float %10915, float %10916, float %10917, float %10918, float %10919, float %10920, float %10921, float %10922, float %10923, float %10924, float %10925, float %10926, float %10927, float %10928, float %10929, float %10930, float %10931, float %10932, float %10933, float %10934, float %10935, float %10936, float %10937, float %10938, float %10939, float %10940, float %10941, i32 %10722, i32 %10725, i32 %10728, i32 %10731, i64 %10877, i1 true) #3, !dbg !378 + %10943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 0, !dbg !378 + %10944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 1, !dbg !378 + %10945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 2, !dbg !378 + %10946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 3, !dbg !378 + %10947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 4, !dbg !378 + %10948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 5, !dbg !378 + %10949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 6, !dbg !378 + %10950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 7, !dbg !378 + %10951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 8, !dbg !378 + %10952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 9, !dbg !378 + %10953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 10, !dbg !378 + %10954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 11, !dbg !378 + %10955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 12, !dbg !378 + %10956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 13, !dbg !378 + %10957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 14, !dbg !378 + %10958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 15, !dbg !378 + %10959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 16, !dbg !378 + %10960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 17, !dbg !378 + %10961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 18, !dbg !378 + %10962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 19, !dbg !378 + %10963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 20, !dbg !378 + %10964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 21, !dbg !378 + %10965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 22, !dbg !378 + %10966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 23, !dbg !378 + %10967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 24, !dbg !378 + %10968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 25, !dbg !378 + %10969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 26, !dbg !378 + %10970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 27, !dbg !378 + %10971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 28, !dbg !378 + %10972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 29, !dbg !378 + %10973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 30, !dbg !378 + %10974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 31, !dbg !378 + %10975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 32, !dbg !378 + %10976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 33, !dbg !378 + %10977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 34, !dbg !378 + %10978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 35, !dbg !378 + %10979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 36, !dbg !378 + %10980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 37, !dbg !378 + %10981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 38, !dbg !378 + %10982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 39, !dbg !378 + %10983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 40, !dbg !378 + %10984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 41, !dbg !378 + %10985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 42, !dbg !378 + %10986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 43, !dbg !378 + %10987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 44, !dbg !378 + %10988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 45, !dbg !378 + %10989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 46, !dbg !378 + %10990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 47, !dbg !378 + %10991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 48, !dbg !378 + %10992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 49, !dbg !378 + %10993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 50, !dbg !378 + %10994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 51, !dbg !378 + %10995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 52, !dbg !378 + %10996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 53, !dbg !378 + %10997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 54, !dbg !378 + %10998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 55, !dbg !378 + %10999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 56, !dbg !378 + %11000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 57, !dbg !378 + %11001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 58, !dbg !378 + %11002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 59, !dbg !378 + %11003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 60, !dbg !378 + %11004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 61, !dbg !378 + %11005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 62, !dbg !378 + %11006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10942, 63, !dbg !378 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !378 + %11007 = add nuw nsw i32 %9038, 1, !dbg !361 + %11008 = lshr i32 %11007, 1, !dbg !379 + %11009 = zext nneg i32 %11008 to i64, !dbg !380 + %11010 = getelementptr i32, ptr addrspace(1) %5200, i64 %11009, !dbg !380 + %11011 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !381 + %11012 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %11010, i64 %11011, i1 %9040) #3, !dbg !381 + %11013 = add nuw nsw i32 %11008, 1, !dbg !382 + %11014 = icmp slt i32 %11013, %5204, !dbg !383 + %11015 = getelementptr i8, ptr addrspace(1) %11010, i64 4, !dbg !384 + %11016 = and i1 %9040, %11014, !dbg !361 + %11017 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !385 + %11018 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %11015, i64 %11017, i1 %11016) #3, !dbg !385 + %11019 = and i32 %9038, 1, !dbg !386 + %11020 = sub i32 %11018, %11012, !dbg !387 + %11021 = shl i32 %11020, 7, !dbg !388 + %11022 = add i32 %11021, -64, !dbg !389 + %11023 = xor i32 %11019, 1, !dbg !390 + %11024 = mul nuw nsw i32 %11022, %11023, !dbg !390 + %11025 = shl nuw nsw i32 %11019, 6, !dbg !391 + %11026 = add i32 %11024, %11025, !dbg !392 + %11027 = shl i32 %11026, 12, !dbg !393 + %11028 = sext i32 %11027 to i64, !dbg !359 + %11029 = getelementptr bfloat, ptr addrspace(1) %.pn5391752, i64 %11028, !dbg !359 + %11030 = getelementptr bfloat, ptr addrspace(1) %.pn5231753, i64 %11028, !dbg !359 + %11031 = getelementptr bfloat, ptr addrspace(1) %.pn5071754, i64 %11028, !dbg !359 + %11032 = getelementptr bfloat, ptr addrspace(1) %.pn4911755, i64 %11028, !dbg !359 + %11033 = shl i32 %11026, 7, !dbg !394 + %11034 = sext i32 %11033 to i64, !dbg !360 + %11035 = getelementptr bfloat, ptr addrspace(1) %.pn6031756, i64 %11034, !dbg !360 + %11036 = getelementptr bfloat, ptr addrspace(1) %.pn5871757, i64 %11034, !dbg !360 + %11037 = getelementptr bfloat, ptr addrspace(1) %.pn5711758, i64 %11034, !dbg !360 + %11038 = getelementptr bfloat, ptr addrspace(1) %.pn5551759, i64 %11034, !dbg !360 + %11039 = add i32 %11026, %.pn6351760, !dbg !395 + %11040 = add i32 %11026, %.pn6331761, !dbg !395 + %11041 = add i32 %11026, %.pn6311762, !dbg !395 + %11042 = add i32 %11026, %.pn6291763, !dbg !395 + %11043 = add i32 %11026, %.pn6271764, !dbg !395 + %11044 = add i32 %11026, %.pn6251765, !dbg !395 + %11045 = add i32 %11026, %.pn6231766, !dbg !395 + %11046 = add i32 %11026, %.pn6211767, !dbg !395 + %11047 = add i32 %11026, %.pn6191768, !dbg !395 + %11048 = add i32 %11026, %.pn6171769, !dbg !395 + %11049 = add i32 %11026, %.pn6151770, !dbg !395 + %11050 = add i32 %11026, %.pn6131771, !dbg !395 + %11051 = add i32 %11026, %.pn6111772, !dbg !395 + %11052 = add i32 %11026, %.pn6091773, !dbg !395 + %11053 = add i32 %11026, %.pn6071774, !dbg !395 + %11054 = add i32 %11026, %.pn6051775, !dbg !395 + %11055 = add i32 %11026, %9034, !dbg !395 + %11056 = add i32 %11026, %9035, !dbg !395 + %11057 = add i32 %11026, %9036, !dbg !395 + %11058 = add i32 %11026, %9037, !dbg !395 + %11059 = add i32 %11026, %9030, !dbg !395 + %11060 = add i32 %11026, %9031, !dbg !395 + %11061 = add i32 %11026, %9032, !dbg !395 + %11062 = add i32 %11026, %9033, !dbg !395 + %11063 = add i32 %9027, 1, !dbg !361 + %11064 = icmp sgt i32 %11063, 1, !dbg !361 + %11065 = select i1 %11064, i32 0, i32 %11063, !dbg !361 + %11066 = add i32 %9029, 1, !dbg !361 + %11067 = icmp sgt i32 %11066, 2, !dbg !361 + %11068 = select i1 %11067, i32 0, i32 %11066, !dbg !361 + %11069 = icmp slt i32 %11055, %18, !dbg !362 + %11070 = icmp slt i32 %11056, %18, !dbg !362 + %11071 = icmp slt i32 %11057, %18, !dbg !362 + %11072 = icmp slt i32 %11058, %18, !dbg !362 + %11073 = shl i32 %11068, 13, !dbg !353 + %11074 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %11073, !dbg !353 + %11075 = and i1 %9039, %11069, !dbg !361 + %11076 = and i1 %9039, %11070, !dbg !361 + %11077 = and i1 %9039, %11071, !dbg !361 + %11078 = and i1 %9039, %11072, !dbg !361 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !353 + %11079 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5264, !dbg !353 + %11080 = select i1 %11075, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %11079, ptr addrspace(1) %11029, i32 %11080) #3, !dbg !353 + %11081 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5267, !dbg !353 + %11082 = select i1 %11076, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11081, ptr addrspace(1) %11030, i32 %11082) #3, !dbg !353 + %11083 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5270, !dbg !353 + %11084 = select i1 %11077, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11083, ptr addrspace(1) %11031, i32 %11084) #3, !dbg !353 + %11085 = getelementptr inbounds nuw i8, ptr addrspace(3) %11074, i32 %5273, !dbg !353 + %11086 = select i1 %11078, i32 16, i32 0, !dbg !353 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11085, ptr addrspace(1) %11032, i32 %11086) #3, !dbg !353 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !353 + %11087 = icmp slt i32 %11039, %18, !dbg !396 + %11088 = icmp slt i32 %11040, %18, !dbg !396 + %11089 = icmp slt i32 %11041, %18, !dbg !396 + %11090 = icmp slt i32 %11042, %18, !dbg !396 + %11091 = icmp slt i32 %11043, %18, !dbg !396 + %11092 = icmp slt i32 %11044, %18, !dbg !396 + %11093 = icmp slt i32 %11045, %18, !dbg !396 + %11094 = icmp slt i32 %11046, %18, !dbg !396 + %11095 = icmp slt i32 %11047, %18, !dbg !396 + %11096 = icmp slt i32 %11048, %18, !dbg !396 + %11097 = icmp slt i32 %11049, %18, !dbg !396 + %11098 = icmp slt i32 %11050, %18, !dbg !396 + %11099 = icmp slt i32 %11051, %18, !dbg !396 + %11100 = icmp slt i32 %11052, %18, !dbg !396 + %11101 = icmp slt i32 %11053, %18, !dbg !396 + %11102 = icmp slt i32 %11054, %18, !dbg !396 + %11103 = sext i32 %11039 to i64, !dbg !354 + %11104 = getelementptr float, ptr addrspace(1) %5904, i64 %11103, !dbg !354 + %11105 = sext i32 %11040 to i64, !dbg !354 + %11106 = getelementptr float, ptr addrspace(1) %5904, i64 %11105, !dbg !354 + %11107 = sext i32 %11041 to i64, !dbg !354 + %11108 = getelementptr float, ptr addrspace(1) %5904, i64 %11107, !dbg !354 + %11109 = sext i32 %11042 to i64, !dbg !354 + %11110 = getelementptr float, ptr addrspace(1) %5904, i64 %11109, !dbg !354 + %11111 = sext i32 %11043 to i64, !dbg !354 + %11112 = getelementptr float, ptr addrspace(1) %5904, i64 %11111, !dbg !354 + %11113 = sext i32 %11044 to i64, !dbg !354 + %11114 = getelementptr float, ptr addrspace(1) %5904, i64 %11113, !dbg !354 + %11115 = sext i32 %11045 to i64, !dbg !354 + %11116 = getelementptr float, ptr addrspace(1) %5904, i64 %11115, !dbg !354 + %11117 = sext i32 %11046 to i64, !dbg !354 + %11118 = getelementptr float, ptr addrspace(1) %5904, i64 %11117, !dbg !354 + %11119 = sext i32 %11047 to i64, !dbg !354 + %11120 = getelementptr float, ptr addrspace(1) %5904, i64 %11119, !dbg !354 + %11121 = sext i32 %11048 to i64, !dbg !354 + %11122 = getelementptr float, ptr addrspace(1) %5904, i64 %11121, !dbg !354 + %11123 = sext i32 %11049 to i64, !dbg !354 + %11124 = getelementptr float, ptr addrspace(1) %5904, i64 %11123, !dbg !354 + %11125 = sext i32 %11050 to i64, !dbg !354 + %11126 = getelementptr float, ptr addrspace(1) %5904, i64 %11125, !dbg !354 + %11127 = sext i32 %11051 to i64, !dbg !354 + %11128 = getelementptr float, ptr addrspace(1) %5904, i64 %11127, !dbg !354 + %11129 = sext i32 %11052 to i64, !dbg !354 + %11130 = getelementptr float, ptr addrspace(1) %5904, i64 %11129, !dbg !354 + %11131 = sext i32 %11053 to i64, !dbg !354 + %11132 = getelementptr float, ptr addrspace(1) %5904, i64 %11131, !dbg !354 + %11133 = sext i32 %11054 to i64, !dbg !354 + %11134 = getelementptr float, ptr addrspace(1) %5904, i64 %11133, !dbg !354 + %11135 = shl i32 %11065, 6, !dbg !355 + %11136 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %11135, !dbg !355 + %11137 = and i1 %9039, %11087, !dbg !361 + %11138 = and i1 %9039, %11088, !dbg !361 + %11139 = and i1 %9039, %11089, !dbg !361 + %11140 = and i1 %9039, %11090, !dbg !361 + %11141 = and i1 %9039, %11091, !dbg !361 + %11142 = and i1 %9039, %11092, !dbg !361 + %11143 = and i1 %9039, %11093, !dbg !361 + %11144 = and i1 %9039, %11094, !dbg !361 + %11145 = and i1 %9039, %11095, !dbg !361 + %11146 = and i1 %9039, %11096, !dbg !361 + %11147 = and i1 %9039, %11097, !dbg !361 + %11148 = and i1 %9039, %11098, !dbg !361 + %11149 = and i1 %9039, %11099, !dbg !361 + %11150 = and i1 %9039, %11100, !dbg !361 + %11151 = and i1 %9039, %11101, !dbg !361 + %11152 = and i1 %9039, %11102, !dbg !361 + %11153 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5332, !dbg !355 + %11154 = select i1 %11137, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %11153, ptr addrspace(1) %11104, i32 %11154, i1 %5331) #3, !dbg !355 + %11155 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5334, !dbg !355 + %11156 = select i1 %11138, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11155, ptr addrspace(1) %11106, i32 %11156, i1 %5331) #3, !dbg !355 + %11157 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5336, !dbg !355 + %11158 = select i1 %11139, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11157, ptr addrspace(1) %11108, i32 %11158, i1 %5331) #3, !dbg !355 + %11159 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5339, !dbg !355 + %11160 = select i1 %11140, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11159, ptr addrspace(1) %11110, i32 %11160, i1 %5331) #3, !dbg !355 + %11161 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5342, !dbg !355 + %11162 = select i1 %11141, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11161, ptr addrspace(1) %11112, i32 %11162, i1 %5331) #3, !dbg !355 + %11163 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5345, !dbg !355 + %11164 = select i1 %11142, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11163, ptr addrspace(1) %11114, i32 %11164, i1 %5331) #3, !dbg !355 + %11165 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5348, !dbg !355 + %11166 = select i1 %11143, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11165, ptr addrspace(1) %11116, i32 %11166, i1 %5331) #3, !dbg !355 + %11167 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5351, !dbg !355 + %11168 = select i1 %11144, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11167, ptr addrspace(1) %11118, i32 %11168, i1 %5331) #3, !dbg !355 + %11169 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5354, !dbg !355 + %11170 = select i1 %11145, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11169, ptr addrspace(1) %11120, i32 %11170, i1 %5331) #3, !dbg !355 + %11171 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5357, !dbg !355 + %11172 = select i1 %11146, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11171, ptr addrspace(1) %11122, i32 %11172, i1 %5331) #3, !dbg !355 + %11173 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5360, !dbg !355 + %11174 = select i1 %11147, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11173, ptr addrspace(1) %11124, i32 %11174, i1 %5331) #3, !dbg !355 + %11175 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5363, !dbg !355 + %11176 = select i1 %11148, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11175, ptr addrspace(1) %11126, i32 %11176, i1 %5331) #3, !dbg !355 + %11177 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5366, !dbg !355 + %11178 = select i1 %11149, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11177, ptr addrspace(1) %11128, i32 %11178, i1 %5331) #3, !dbg !355 + %11179 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5369, !dbg !355 + %11180 = select i1 %11150, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11179, ptr addrspace(1) %11130, i32 %11180, i1 %5331) #3, !dbg !355 + %11181 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5372, !dbg !355 + %11182 = select i1 %11151, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11181, ptr addrspace(1) %11132, i32 %11182, i1 %5331) #3, !dbg !355 + %11183 = getelementptr inbounds nuw i8, ptr addrspace(3) %11136, i32 %5375, !dbg !355 + %11184 = select i1 %11152, i32 4, i32 0, !dbg !355 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11183, ptr addrspace(1) %11134, i32 %11184, i1 %5331) #3, !dbg !355 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !355 + %11185 = icmp slt i32 %11059, %18, !dbg !397 + %11186 = icmp slt i32 %11060, %18, !dbg !397 + %11187 = icmp slt i32 %11061, %18, !dbg !397 + %11188 = icmp slt i32 %11062, %18, !dbg !397 + %11189 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %11073, !dbg !356 + %11190 = and i1 %9039, %11185, !dbg !361 + %11191 = and i1 %9039, %11186, !dbg !361 + %11192 = and i1 %9039, %11187, !dbg !361 + %11193 = and i1 %9039, %11188, !dbg !361 + %11194 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5264, !dbg !356 + %11195 = select i1 %11190, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %11194, ptr addrspace(1) %11035, i32 %11195) #3, !dbg !356 + %11196 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5267, !dbg !356 + %11197 = select i1 %11191, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11196, ptr addrspace(1) %11036, i32 %11197) #3, !dbg !356 + %11198 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5270, !dbg !356 + %11199 = select i1 %11192, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11198, ptr addrspace(1) %11037, i32 %11199) #3, !dbg !356 + %11200 = getelementptr inbounds nuw i8, ptr addrspace(3) %11189, i32 %5273, !dbg !356 + %11201 = select i1 %11193, i32 16, i32 0, !dbg !356 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %11200, ptr addrspace(1) %11038, i32 %11201) #3, !dbg !356 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !356 + %11202 = getelementptr float, ptr addrspace(1) %5905, i64 %11103, !dbg !357 + %11203 = getelementptr float, ptr addrspace(1) %5905, i64 %11105, !dbg !357 + %11204 = getelementptr float, ptr addrspace(1) %5905, i64 %11107, !dbg !357 + %11205 = getelementptr float, ptr addrspace(1) %5905, i64 %11109, !dbg !357 + %11206 = getelementptr float, ptr addrspace(1) %5905, i64 %11111, !dbg !357 + %11207 = getelementptr float, ptr addrspace(1) %5905, i64 %11113, !dbg !357 + %11208 = getelementptr float, ptr addrspace(1) %5905, i64 %11115, !dbg !357 + %11209 = getelementptr float, ptr addrspace(1) %5905, i64 %11117, !dbg !357 + %11210 = getelementptr float, ptr addrspace(1) %5905, i64 %11119, !dbg !357 + %11211 = getelementptr float, ptr addrspace(1) %5905, i64 %11121, !dbg !357 + %11212 = getelementptr float, ptr addrspace(1) %5905, i64 %11123, !dbg !357 + %11213 = getelementptr float, ptr addrspace(1) %5905, i64 %11125, !dbg !357 + %11214 = getelementptr float, ptr addrspace(1) %5905, i64 %11127, !dbg !357 + %11215 = getelementptr float, ptr addrspace(1) %5905, i64 %11129, !dbg !357 + %11216 = getelementptr float, ptr addrspace(1) %5905, i64 %11131, !dbg !357 + %11217 = getelementptr float, ptr addrspace(1) %5905, i64 %11133, !dbg !357 + %11218 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %11135, !dbg !358 + %11219 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5332, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) %11219, ptr addrspace(1) %11202, i32 %11154, i1 %5331) #3, !dbg !358 + %11220 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5334, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11220, ptr addrspace(1) %11203, i32 %11156, i1 %5331) #3, !dbg !358 + %11221 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5336, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11221, ptr addrspace(1) %11204, i32 %11158, i1 %5331) #3, !dbg !358 + %11222 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5339, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11222, ptr addrspace(1) %11205, i32 %11160, i1 %5331) #3, !dbg !358 + %11223 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5342, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11223, ptr addrspace(1) %11206, i32 %11162, i1 %5331) #3, !dbg !358 + %11224 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5345, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11224, ptr addrspace(1) %11207, i32 %11164, i1 %5331) #3, !dbg !358 + %11225 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5348, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11225, ptr addrspace(1) %11208, i32 %11166, i1 %5331) #3, !dbg !358 + %11226 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5351, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11226, ptr addrspace(1) %11209, i32 %11168, i1 %5331) #3, !dbg !358 + %11227 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5354, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11227, ptr addrspace(1) %11210, i32 %11170, i1 %5331) #3, !dbg !358 + %11228 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5357, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11228, ptr addrspace(1) %11211, i32 %11172, i1 %5331) #3, !dbg !358 + %11229 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5360, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11229, ptr addrspace(1) %11212, i32 %11174, i1 %5331) #3, !dbg !358 + %11230 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5363, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11230, ptr addrspace(1) %11213, i32 %11176, i1 %5331) #3, !dbg !358 + %11231 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5366, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11231, ptr addrspace(1) %11214, i32 %11178, i1 %5331) #3, !dbg !358 + %11232 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5369, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11232, ptr addrspace(1) %11215, i32 %11180, i1 %5331) #3, !dbg !358 + %11233 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5372, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11233, ptr addrspace(1) %11216, i32 %11182, i1 %5331) #3, !dbg !358 + %11234 = getelementptr inbounds nuw i8, ptr addrspace(3) %11218, i32 %5375, !dbg !358 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x4, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %11234, ptr addrspace(1) %11217, i32 %11184, i1 %5331) #3, !dbg !358 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !358 + %exitcond2188.not = icmp eq i32 %11007, %smax2187, !dbg !361 + br i1 %exitcond2188.not, label %._crit_edge1794, label %.lr.ph1793, !dbg !361 + +._crit_edge1794: ; preds = %__nv_exp2f.exit1239, %._crit_edge1621 + %.pn347.lcssa = phi float [ %8874, %._crit_edge1621 ], [ %10943, %__nv_exp2f.exit1239 ] + %.pn345.lcssa = phi float [ %8875, %._crit_edge1621 ], [ %10944, %__nv_exp2f.exit1239 ] + %.pn343.lcssa = phi float [ %8876, %._crit_edge1621 ], [ %10945, %__nv_exp2f.exit1239 ] + %.pn341.lcssa = phi float [ %8877, %._crit_edge1621 ], [ %10946, %__nv_exp2f.exit1239 ] + %.pn339.lcssa = phi float [ %8878, %._crit_edge1621 ], [ %10947, %__nv_exp2f.exit1239 ] + %.pn337.lcssa = phi float [ %8879, %._crit_edge1621 ], [ %10948, %__nv_exp2f.exit1239 ] + %.pn335.lcssa = phi float [ %8880, %._crit_edge1621 ], [ %10949, %__nv_exp2f.exit1239 ] + %.pn333.lcssa = phi float [ %8881, %._crit_edge1621 ], [ %10950, %__nv_exp2f.exit1239 ] + %.pn331.lcssa = phi float [ %8882, %._crit_edge1621 ], [ %10951, %__nv_exp2f.exit1239 ] + %.pn329.lcssa = phi float [ %8883, %._crit_edge1621 ], [ %10952, %__nv_exp2f.exit1239 ] + %.pn327.lcssa = phi float [ %8884, %._crit_edge1621 ], [ %10953, %__nv_exp2f.exit1239 ] + %.pn325.lcssa = phi float [ %8885, %._crit_edge1621 ], [ %10954, %__nv_exp2f.exit1239 ] + %.pn323.lcssa = phi float [ %8886, %._crit_edge1621 ], [ %10955, %__nv_exp2f.exit1239 ] + %.pn321.lcssa = phi float [ %8887, %._crit_edge1621 ], [ %10956, %__nv_exp2f.exit1239 ] + %.pn319.lcssa = phi float [ %8888, %._crit_edge1621 ], [ %10957, %__nv_exp2f.exit1239 ] + %.pn317.lcssa = phi float [ %8889, %._crit_edge1621 ], [ %10958, %__nv_exp2f.exit1239 ] + %.pn315.lcssa = phi float [ %8890, %._crit_edge1621 ], [ %10959, %__nv_exp2f.exit1239 ] + %.pn313.lcssa = phi float [ %8891, %._crit_edge1621 ], [ %10960, %__nv_exp2f.exit1239 ] + %.pn311.lcssa = phi float [ %8892, %._crit_edge1621 ], [ %10961, %__nv_exp2f.exit1239 ] + %.pn309.lcssa = phi float [ %8893, %._crit_edge1621 ], [ %10962, %__nv_exp2f.exit1239 ] + %.pn307.lcssa = phi float [ %8894, %._crit_edge1621 ], [ %10963, %__nv_exp2f.exit1239 ] + %.pn305.lcssa = phi float [ %8895, %._crit_edge1621 ], [ %10964, %__nv_exp2f.exit1239 ] + %.pn303.lcssa = phi float [ %8896, %._crit_edge1621 ], [ %10965, %__nv_exp2f.exit1239 ] + %.pn301.lcssa = phi float [ %8897, %._crit_edge1621 ], [ %10966, %__nv_exp2f.exit1239 ] + %.pn299.lcssa = phi float [ %8898, %._crit_edge1621 ], [ %10967, %__nv_exp2f.exit1239 ] + %.pn297.lcssa = phi float [ %8899, %._crit_edge1621 ], [ %10968, %__nv_exp2f.exit1239 ] + %.pn295.lcssa = phi float [ %8900, %._crit_edge1621 ], [ %10969, %__nv_exp2f.exit1239 ] + %.pn293.lcssa = phi float [ %8901, %._crit_edge1621 ], [ %10970, %__nv_exp2f.exit1239 ] + %.pn291.lcssa = phi float [ %8902, %._crit_edge1621 ], [ %10971, %__nv_exp2f.exit1239 ] + %.pn289.lcssa = phi float [ %8903, %._crit_edge1621 ], [ %10972, %__nv_exp2f.exit1239 ] + %.pn287.lcssa = phi float [ %8904, %._crit_edge1621 ], [ %10973, %__nv_exp2f.exit1239 ] + %.pn285.lcssa = phi float [ %8905, %._crit_edge1621 ], [ %10974, %__nv_exp2f.exit1239 ] + %.pn283.lcssa = phi float [ %8906, %._crit_edge1621 ], [ %10975, %__nv_exp2f.exit1239 ] + %.pn281.lcssa = phi float [ %8907, %._crit_edge1621 ], [ %10976, %__nv_exp2f.exit1239 ] + %.pn279.lcssa = phi float [ %8908, %._crit_edge1621 ], [ %10977, %__nv_exp2f.exit1239 ] + %.pn277.lcssa = phi float [ %8909, %._crit_edge1621 ], [ %10978, %__nv_exp2f.exit1239 ] + %.pn275.lcssa = phi float [ %8910, %._crit_edge1621 ], [ %10979, %__nv_exp2f.exit1239 ] + %.pn273.lcssa = phi float [ %8911, %._crit_edge1621 ], [ %10980, %__nv_exp2f.exit1239 ] + %.pn271.lcssa = phi float [ %8912, %._crit_edge1621 ], [ %10981, %__nv_exp2f.exit1239 ] + %.pn269.lcssa = phi float [ %8913, %._crit_edge1621 ], [ %10982, %__nv_exp2f.exit1239 ] + %.pn267.lcssa = phi float [ %8914, %._crit_edge1621 ], [ %10983, %__nv_exp2f.exit1239 ] + %.pn265.lcssa = phi float [ %8915, %._crit_edge1621 ], [ %10984, %__nv_exp2f.exit1239 ] + %.pn263.lcssa = phi float [ %8916, %._crit_edge1621 ], [ %10985, %__nv_exp2f.exit1239 ] + %.pn261.lcssa = phi float [ %8917, %._crit_edge1621 ], [ %10986, %__nv_exp2f.exit1239 ] + %.pn259.lcssa = phi float [ %8918, %._crit_edge1621 ], [ %10987, %__nv_exp2f.exit1239 ] + %.pn257.lcssa = phi float [ %8919, %._crit_edge1621 ], [ %10988, %__nv_exp2f.exit1239 ] + %.pn255.lcssa = phi float [ %8920, %._crit_edge1621 ], [ %10989, %__nv_exp2f.exit1239 ] + %.pn253.lcssa = phi float [ %8921, %._crit_edge1621 ], [ %10990, %__nv_exp2f.exit1239 ] + %.pn251.lcssa = phi float [ %8922, %._crit_edge1621 ], [ %10991, %__nv_exp2f.exit1239 ] + %.pn249.lcssa = phi float [ %8923, %._crit_edge1621 ], [ %10992, %__nv_exp2f.exit1239 ] + %.pn247.lcssa = phi float [ %8924, %._crit_edge1621 ], [ %10993, %__nv_exp2f.exit1239 ] + %.pn245.lcssa = phi float [ %8925, %._crit_edge1621 ], [ %10994, %__nv_exp2f.exit1239 ] + %.pn243.lcssa = phi float [ %8926, %._crit_edge1621 ], [ %10995, %__nv_exp2f.exit1239 ] + %.pn241.lcssa = phi float [ %8927, %._crit_edge1621 ], [ %10996, %__nv_exp2f.exit1239 ] + %.pn239.lcssa = phi float [ %8928, %._crit_edge1621 ], [ %10997, %__nv_exp2f.exit1239 ] + %.pn237.lcssa = phi float [ %8929, %._crit_edge1621 ], [ %10998, %__nv_exp2f.exit1239 ] + %.pn235.lcssa = phi float [ %8930, %._crit_edge1621 ], [ %10999, %__nv_exp2f.exit1239 ] + %.pn233.lcssa = phi float [ %8931, %._crit_edge1621 ], [ %11000, %__nv_exp2f.exit1239 ] + %.pn231.lcssa = phi float [ %8932, %._crit_edge1621 ], [ %11001, %__nv_exp2f.exit1239 ] + %.pn229.lcssa = phi float [ %8933, %._crit_edge1621 ], [ %11002, %__nv_exp2f.exit1239 ] + %.pn227.lcssa = phi float [ %8934, %._crit_edge1621 ], [ %11003, %__nv_exp2f.exit1239 ] + %.pn225.lcssa = phi float [ %8935, %._crit_edge1621 ], [ %11004, %__nv_exp2f.exit1239 ] + %.pn223.lcssa = phi float [ %8936, %._crit_edge1621 ], [ %11005, %__nv_exp2f.exit1239 ] + %.pn221.lcssa = phi float [ %8937, %._crit_edge1621 ], [ %11006, %__nv_exp2f.exit1239 ] + %.pn475.lcssa = phi float [ %8810, %._crit_edge1621 ], [ %10087, %__nv_exp2f.exit1239 ] + %.pn473.lcssa = phi float [ %8811, %._crit_edge1621 ], [ %10088, %__nv_exp2f.exit1239 ] + %.pn471.lcssa = phi float [ %8812, %._crit_edge1621 ], [ %10089, %__nv_exp2f.exit1239 ] + %.pn469.lcssa = phi float [ %8813, %._crit_edge1621 ], [ %10090, %__nv_exp2f.exit1239 ] + %.pn467.lcssa = phi float [ %8814, %._crit_edge1621 ], [ %10091, %__nv_exp2f.exit1239 ] + %.pn465.lcssa = phi float [ %8815, %._crit_edge1621 ], [ %10092, %__nv_exp2f.exit1239 ] + %.pn463.lcssa = phi float [ %8816, %._crit_edge1621 ], [ %10093, %__nv_exp2f.exit1239 ] + %.pn461.lcssa = phi float [ %8817, %._crit_edge1621 ], [ %10094, %__nv_exp2f.exit1239 ] + %.pn459.lcssa = phi float [ %8818, %._crit_edge1621 ], [ %10095, %__nv_exp2f.exit1239 ] + %.pn457.lcssa = phi float [ %8819, %._crit_edge1621 ], [ %10096, %__nv_exp2f.exit1239 ] + %.pn455.lcssa = phi float [ %8820, %._crit_edge1621 ], [ %10097, %__nv_exp2f.exit1239 ] + %.pn453.lcssa = phi float [ %8821, %._crit_edge1621 ], [ %10098, %__nv_exp2f.exit1239 ] + %.pn451.lcssa = phi float [ %8822, %._crit_edge1621 ], [ %10099, %__nv_exp2f.exit1239 ] + %.pn449.lcssa = phi float [ %8823, %._crit_edge1621 ], [ %10100, %__nv_exp2f.exit1239 ] + %.pn447.lcssa = phi float [ %8824, %._crit_edge1621 ], [ %10101, %__nv_exp2f.exit1239 ] + %.pn445.lcssa = phi float [ %8825, %._crit_edge1621 ], [ %10102, %__nv_exp2f.exit1239 ] + %.pn443.lcssa = phi float [ %8826, %._crit_edge1621 ], [ %10103, %__nv_exp2f.exit1239 ] + %.pn441.lcssa = phi float [ %8827, %._crit_edge1621 ], [ %10104, %__nv_exp2f.exit1239 ] + %.pn439.lcssa = phi float [ %8828, %._crit_edge1621 ], [ %10105, %__nv_exp2f.exit1239 ] + %.pn437.lcssa = phi float [ %8829, %._crit_edge1621 ], [ %10106, %__nv_exp2f.exit1239 ] + %.pn435.lcssa = phi float [ %8830, %._crit_edge1621 ], [ %10107, %__nv_exp2f.exit1239 ] + %.pn433.lcssa = phi float [ %8831, %._crit_edge1621 ], [ %10108, %__nv_exp2f.exit1239 ] + %.pn431.lcssa = phi float [ %8832, %._crit_edge1621 ], [ %10109, %__nv_exp2f.exit1239 ] + %.pn429.lcssa = phi float [ %8833, %._crit_edge1621 ], [ %10110, %__nv_exp2f.exit1239 ] + %.pn427.lcssa = phi float [ %8834, %._crit_edge1621 ], [ %10111, %__nv_exp2f.exit1239 ] + %.pn425.lcssa = phi float [ %8835, %._crit_edge1621 ], [ %10112, %__nv_exp2f.exit1239 ] + %.pn423.lcssa = phi float [ %8836, %._crit_edge1621 ], [ %10113, %__nv_exp2f.exit1239 ] + %.pn421.lcssa = phi float [ %8837, %._crit_edge1621 ], [ %10114, %__nv_exp2f.exit1239 ] + %.pn419.lcssa = phi float [ %8838, %._crit_edge1621 ], [ %10115, %__nv_exp2f.exit1239 ] + %.pn417.lcssa = phi float [ %8839, %._crit_edge1621 ], [ %10116, %__nv_exp2f.exit1239 ] + %.pn415.lcssa = phi float [ %8840, %._crit_edge1621 ], [ %10117, %__nv_exp2f.exit1239 ] + %.pn413.lcssa = phi float [ %8841, %._crit_edge1621 ], [ %10118, %__nv_exp2f.exit1239 ] + %.pn411.lcssa = phi float [ %8842, %._crit_edge1621 ], [ %10119, %__nv_exp2f.exit1239 ] + %.pn409.lcssa = phi float [ %8843, %._crit_edge1621 ], [ %10120, %__nv_exp2f.exit1239 ] + %.pn407.lcssa = phi float [ %8844, %._crit_edge1621 ], [ %10121, %__nv_exp2f.exit1239 ] + %.pn405.lcssa = phi float [ %8845, %._crit_edge1621 ], [ %10122, %__nv_exp2f.exit1239 ] + %.pn403.lcssa = phi float [ %8846, %._crit_edge1621 ], [ %10123, %__nv_exp2f.exit1239 ] + %.pn401.lcssa = phi float [ %8847, %._crit_edge1621 ], [ %10124, %__nv_exp2f.exit1239 ] + %.pn399.lcssa = phi float [ %8848, %._crit_edge1621 ], [ %10125, %__nv_exp2f.exit1239 ] + %.pn397.lcssa = phi float [ %8849, %._crit_edge1621 ], [ %10126, %__nv_exp2f.exit1239 ] + %.pn395.lcssa = phi float [ %8850, %._crit_edge1621 ], [ %10127, %__nv_exp2f.exit1239 ] + %.pn393.lcssa = phi float [ %8851, %._crit_edge1621 ], [ %10128, %__nv_exp2f.exit1239 ] + %.pn391.lcssa = phi float [ %8852, %._crit_edge1621 ], [ %10129, %__nv_exp2f.exit1239 ] + %.pn389.lcssa = phi float [ %8853, %._crit_edge1621 ], [ %10130, %__nv_exp2f.exit1239 ] + %.pn387.lcssa = phi float [ %8854, %._crit_edge1621 ], [ %10131, %__nv_exp2f.exit1239 ] + %.pn385.lcssa = phi float [ %8855, %._crit_edge1621 ], [ %10132, %__nv_exp2f.exit1239 ] + %.pn383.lcssa = phi float [ %8856, %._crit_edge1621 ], [ %10133, %__nv_exp2f.exit1239 ] + %.pn381.lcssa = phi float [ %8857, %._crit_edge1621 ], [ %10134, %__nv_exp2f.exit1239 ] + %.pn379.lcssa = phi float [ %8858, %._crit_edge1621 ], [ %10135, %__nv_exp2f.exit1239 ] + %.pn377.lcssa = phi float [ %8859, %._crit_edge1621 ], [ %10136, %__nv_exp2f.exit1239 ] + %.pn375.lcssa = phi float [ %8860, %._crit_edge1621 ], [ %10137, %__nv_exp2f.exit1239 ] + %.pn373.lcssa = phi float [ %8861, %._crit_edge1621 ], [ %10138, %__nv_exp2f.exit1239 ] + %.pn371.lcssa = phi float [ %8862, %._crit_edge1621 ], [ %10139, %__nv_exp2f.exit1239 ] + %.pn369.lcssa = phi float [ %8863, %._crit_edge1621 ], [ %10140, %__nv_exp2f.exit1239 ] + %.pn367.lcssa = phi float [ %8864, %._crit_edge1621 ], [ %10141, %__nv_exp2f.exit1239 ] + %.pn365.lcssa = phi float [ %8865, %._crit_edge1621 ], [ %10142, %__nv_exp2f.exit1239 ] + %.pn363.lcssa = phi float [ %8866, %._crit_edge1621 ], [ %10143, %__nv_exp2f.exit1239 ] + %.pn361.lcssa = phi float [ %8867, %._crit_edge1621 ], [ %10144, %__nv_exp2f.exit1239 ] + %.pn359.lcssa = phi float [ %8868, %._crit_edge1621 ], [ %10145, %__nv_exp2f.exit1239 ] + %.pn357.lcssa = phi float [ %8869, %._crit_edge1621 ], [ %10146, %__nv_exp2f.exit1239 ] + %.pn355.lcssa = phi float [ %8870, %._crit_edge1621 ], [ %10147, %__nv_exp2f.exit1239 ] + %.pn353.lcssa = phi float [ %8871, %._crit_edge1621 ], [ %10148, %__nv_exp2f.exit1239 ] + %.pn351.lcssa = phi float [ %8872, %._crit_edge1621 ], [ %10149, %__nv_exp2f.exit1239 ] + %.pn349.lcssa = phi float [ %8873, %._crit_edge1621 ], [ %10150, %__nv_exp2f.exit1239 ] + %11235 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %.pn475.lcssa, float %.pn473.lcssa, float %.pn471.lcssa, float %.pn469.lcssa, float %.pn467.lcssa, float %.pn465.lcssa, float %.pn463.lcssa, float %.pn461.lcssa, float %.pn459.lcssa, float %.pn457.lcssa, float %.pn455.lcssa, float %.pn453.lcssa, float %.pn451.lcssa, float %.pn449.lcssa, float %.pn447.lcssa, float %.pn445.lcssa, float %.pn443.lcssa, float %.pn441.lcssa, float %.pn439.lcssa, float %.pn437.lcssa, float %.pn435.lcssa, float %.pn433.lcssa, float %.pn431.lcssa, float %.pn429.lcssa, float %.pn427.lcssa, float %.pn425.lcssa, float %.pn423.lcssa, float %.pn421.lcssa, float %.pn419.lcssa, float %.pn417.lcssa, float %.pn415.lcssa, float %.pn413.lcssa, float %.pn411.lcssa, float %.pn409.lcssa, float %.pn407.lcssa, float %.pn405.lcssa, float %.pn403.lcssa, float %.pn401.lcssa, float %.pn399.lcssa, float %.pn397.lcssa, float %.pn395.lcssa, float %.pn393.lcssa, float %.pn391.lcssa, float %.pn389.lcssa, float %.pn387.lcssa, float %.pn385.lcssa, float %.pn383.lcssa, float %.pn381.lcssa, float %.pn379.lcssa, float %.pn377.lcssa, float %.pn375.lcssa, float %.pn373.lcssa, float %.pn371.lcssa, float %.pn369.lcssa, float %.pn367.lcssa, float %.pn365.lcssa, float %.pn363.lcssa, float %.pn361.lcssa, float %.pn359.lcssa, float %.pn357.lcssa, float %.pn355.lcssa, float %.pn353.lcssa, float %.pn351.lcssa, float %.pn349.lcssa, float %.pn347.lcssa, float %.pn345.lcssa, float %.pn343.lcssa, float %.pn341.lcssa, float %.pn339.lcssa, float %.pn337.lcssa, float %.pn335.lcssa, float %.pn333.lcssa, float %.pn331.lcssa, float %.pn329.lcssa, float %.pn327.lcssa, float %.pn325.lcssa, float %.pn323.lcssa, float %.pn321.lcssa, float %.pn319.lcssa, float %.pn317.lcssa, float %.pn315.lcssa, float %.pn313.lcssa, float %.pn311.lcssa, float %.pn309.lcssa, float %.pn307.lcssa, float %.pn305.lcssa, float %.pn303.lcssa, float %.pn301.lcssa, float %.pn299.lcssa, float %.pn297.lcssa, float %.pn295.lcssa, float %.pn293.lcssa, float %.pn291.lcssa, float %.pn289.lcssa, float %.pn287.lcssa, float %.pn285.lcssa, float %.pn283.lcssa, float %.pn281.lcssa, float %.pn279.lcssa, float %.pn277.lcssa, float %.pn275.lcssa, float %.pn273.lcssa, float %.pn271.lcssa, float %.pn269.lcssa, float %.pn267.lcssa, float %.pn265.lcssa, float %.pn263.lcssa, float %.pn261.lcssa, float %.pn259.lcssa, float %.pn257.lcssa, float %.pn255.lcssa, float %.pn253.lcssa, float %.pn251.lcssa, float %.pn249.lcssa, float %.pn247.lcssa, float %.pn245.lcssa, float %.pn243.lcssa, float %.pn241.lcssa, float %.pn239.lcssa, float %.pn237.lcssa, float %.pn235.lcssa, float %.pn233.lcssa, float %.pn231.lcssa, float %.pn229.lcssa, float %.pn227.lcssa, float %.pn225.lcssa, float %.pn223.lcssa, float %.pn221.lcssa) #3, !dbg !361 + %11236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 0, !dbg !361 + %11237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 1, !dbg !361 + %11238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 2, !dbg !361 + %11239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 3, !dbg !361 + %11240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 4, !dbg !361 + %11241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 5, !dbg !361 + %11242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 6, !dbg !361 + %11243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 7, !dbg !361 + %11244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 8, !dbg !361 + %11245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 9, !dbg !361 + %11246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 10, !dbg !361 + %11247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 11, !dbg !361 + %11248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 12, !dbg !361 + %11249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 13, !dbg !361 + %11250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 14, !dbg !361 + %11251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 15, !dbg !361 + %11252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 16, !dbg !361 + %11253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 17, !dbg !361 + %11254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 18, !dbg !361 + %11255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 19, !dbg !361 + %11256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 20, !dbg !361 + %11257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 21, !dbg !361 + %11258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 22, !dbg !361 + %11259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 23, !dbg !361 + %11260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 24, !dbg !361 + %11261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 25, !dbg !361 + %11262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 26, !dbg !361 + %11263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 27, !dbg !361 + %11264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 28, !dbg !361 + %11265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 29, !dbg !361 + %11266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 30, !dbg !361 + %11267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 31, !dbg !361 + %11268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 32, !dbg !361 + %11269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 33, !dbg !361 + %11270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 34, !dbg !361 + %11271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 35, !dbg !361 + %11272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 36, !dbg !361 + %11273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 37, !dbg !361 + %11274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 38, !dbg !361 + %11275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 39, !dbg !361 + %11276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 40, !dbg !361 + %11277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 41, !dbg !361 + %11278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 42, !dbg !361 + %11279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 43, !dbg !361 + %11280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 44, !dbg !361 + %11281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 45, !dbg !361 + %11282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 46, !dbg !361 + %11283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 47, !dbg !361 + %11284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 48, !dbg !361 + %11285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 49, !dbg !361 + %11286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 50, !dbg !361 + %11287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 51, !dbg !361 + %11288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 52, !dbg !361 + %11289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 53, !dbg !361 + %11290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 54, !dbg !361 + %11291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 55, !dbg !361 + %11292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 56, !dbg !361 + %11293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 57, !dbg !361 + %11294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 58, !dbg !361 + %11295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 59, !dbg !361 + %11296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 60, !dbg !361 + %11297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 61, !dbg !361 + %11298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 62, !dbg !361 + %11299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 63, !dbg !361 + %11300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 64, !dbg !361 + %11301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 65, !dbg !361 + %11302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 66, !dbg !361 + %11303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 67, !dbg !361 + %11304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 68, !dbg !361 + %11305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 69, !dbg !361 + %11306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 70, !dbg !361 + %11307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 71, !dbg !361 + %11308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 72, !dbg !361 + %11309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 73, !dbg !361 + %11310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 74, !dbg !361 + %11311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 75, !dbg !361 + %11312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 76, !dbg !361 + %11313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 77, !dbg !361 + %11314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 78, !dbg !361 + %11315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 79, !dbg !361 + %11316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 80, !dbg !361 + %11317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 81, !dbg !361 + %11318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 82, !dbg !361 + %11319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 83, !dbg !361 + %11320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 84, !dbg !361 + %11321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 85, !dbg !361 + %11322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 86, !dbg !361 + %11323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 87, !dbg !361 + %11324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 88, !dbg !361 + %11325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 89, !dbg !361 + %11326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 90, !dbg !361 + %11327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 91, !dbg !361 + %11328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 92, !dbg !361 + %11329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 93, !dbg !361 + %11330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 94, !dbg !361 + %11331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 95, !dbg !361 + %11332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 96, !dbg !361 + %11333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 97, !dbg !361 + %11334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 98, !dbg !361 + %11335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 99, !dbg !361 + %11336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 100, !dbg !361 + %11337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 101, !dbg !361 + %11338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 102, !dbg !361 + %11339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 103, !dbg !361 + %11340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 104, !dbg !361 + %11341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 105, !dbg !361 + %11342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 106, !dbg !361 + %11343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 107, !dbg !361 + %11344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 108, !dbg !361 + %11345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 109, !dbg !361 + %11346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 110, !dbg !361 + %11347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 111, !dbg !361 + %11348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 112, !dbg !361 + %11349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 113, !dbg !361 + %11350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 114, !dbg !361 + %11351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 115, !dbg !361 + %11352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 116, !dbg !361 + %11353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 117, !dbg !361 + %11354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 118, !dbg !361 + %11355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 119, !dbg !361 + %11356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 120, !dbg !361 + %11357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 121, !dbg !361 + %11358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 122, !dbg !361 + %11359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 123, !dbg !361 + %11360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 124, !dbg !361 + %11361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 125, !dbg !361 + %11362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 126, !dbg !361 + %11363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %11235, 127, !dbg !361 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !361 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !361 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !276 + %exitcond2189.not = icmp eq i64 %indvars.iv.next, 4, !dbg !276 + br i1 %exitcond2189.not, label %11364, label %5761, !dbg !276 + +11364: ; preds = %._crit_edge1794 + %11365 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4893, !dbg !398 + %11366 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4895, !dbg !398 + %11367 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4897, !dbg !398 + %11368 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4899, !dbg !398 + %11369 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4901, !dbg !398 + %11370 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4903, !dbg !398 + %11371 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4905, !dbg !398 + %11372 = getelementptr bfloat, ptr addrspace(1) %55, i64 %4907, !dbg !398 + %11373 = getelementptr bfloat, ptr addrspace(1) %11365, i64 %4911, !dbg !399 + %11374 = getelementptr bfloat, ptr addrspace(1) %11366, i64 %4911, !dbg !399 + %11375 = getelementptr bfloat, ptr addrspace(1) %11367, i64 %4911, !dbg !399 + %11376 = getelementptr bfloat, ptr addrspace(1) %11368, i64 %4911, !dbg !399 + %11377 = getelementptr bfloat, ptr addrspace(1) %11369, i64 %4911, !dbg !399 + %11378 = getelementptr bfloat, ptr addrspace(1) %11370, i64 %4911, !dbg !399 + %11379 = getelementptr bfloat, ptr addrspace(1) %11371, i64 %4911, !dbg !399 + %11380 = getelementptr bfloat, ptr addrspace(1) %11372, i64 %4911, !dbg !399 + %11381 = insertelement <2 x float> poison, float %11236, i64 0, !dbg !400 + %11382 = insertelement <2 x float> %11381, float %11237, i64 1, !dbg !400 + %11383 = fptrunc <2 x float> %11382 to <2 x bfloat>, !dbg !400 + %11384 = insertelement <2 x float> poison, float %11238, i64 0, !dbg !400 + %11385 = insertelement <2 x float> %11384, float %11239, i64 1, !dbg !400 + %11386 = fptrunc <2 x float> %11385 to <2 x bfloat>, !dbg !400 + %11387 = insertelement <2 x float> poison, float %11240, i64 0, !dbg !400 + %11388 = insertelement <2 x float> %11387, float %11241, i64 1, !dbg !400 + %11389 = fptrunc <2 x float> %11388 to <2 x bfloat>, !dbg !400 + %11390 = insertelement <2 x float> poison, float %11242, i64 0, !dbg !400 + %11391 = insertelement <2 x float> %11390, float %11243, i64 1, !dbg !400 + %11392 = fptrunc <2 x float> %11391 to <2 x bfloat>, !dbg !400 + %11393 = insertelement <2 x float> poison, float %11244, i64 0, !dbg !400 + %11394 = insertelement <2 x float> %11393, float %11245, i64 1, !dbg !400 + %11395 = fptrunc <2 x float> %11394 to <2 x bfloat>, !dbg !400 + %11396 = insertelement <2 x float> poison, float %11246, i64 0, !dbg !400 + %11397 = insertelement <2 x float> %11396, float %11247, i64 1, !dbg !400 + %11398 = fptrunc <2 x float> %11397 to <2 x bfloat>, !dbg !400 + %11399 = insertelement <2 x float> poison, float %11248, i64 0, !dbg !400 + %11400 = insertelement <2 x float> %11399, float %11249, i64 1, !dbg !400 + %11401 = fptrunc <2 x float> %11400 to <2 x bfloat>, !dbg !400 + %11402 = insertelement <2 x float> poison, float %11250, i64 0, !dbg !400 + %11403 = insertelement <2 x float> %11402, float %11251, i64 1, !dbg !400 + %11404 = fptrunc <2 x float> %11403 to <2 x bfloat>, !dbg !400 + %11405 = insertelement <2 x float> poison, float %11252, i64 0, !dbg !400 + %11406 = insertelement <2 x float> %11405, float %11253, i64 1, !dbg !400 + %11407 = fptrunc <2 x float> %11406 to <2 x bfloat>, !dbg !400 + %11408 = insertelement <2 x float> poison, float %11254, i64 0, !dbg !400 + %11409 = insertelement <2 x float> %11408, float %11255, i64 1, !dbg !400 + %11410 = fptrunc <2 x float> %11409 to <2 x bfloat>, !dbg !400 + %11411 = insertelement <2 x float> poison, float %11256, i64 0, !dbg !400 + %11412 = insertelement <2 x float> %11411, float %11257, i64 1, !dbg !400 + %11413 = fptrunc <2 x float> %11412 to <2 x bfloat>, !dbg !400 + %11414 = insertelement <2 x float> poison, float %11258, i64 0, !dbg !400 + %11415 = insertelement <2 x float> %11414, float %11259, i64 1, !dbg !400 + %11416 = fptrunc <2 x float> %11415 to <2 x bfloat>, !dbg !400 + %11417 = insertelement <2 x float> poison, float %11260, i64 0, !dbg !400 + %11418 = insertelement <2 x float> %11417, float %11261, i64 1, !dbg !400 + %11419 = fptrunc <2 x float> %11418 to <2 x bfloat>, !dbg !400 + %11420 = insertelement <2 x float> poison, float %11262, i64 0, !dbg !400 + %11421 = insertelement <2 x float> %11420, float %11263, i64 1, !dbg !400 + %11422 = fptrunc <2 x float> %11421 to <2 x bfloat>, !dbg !400 + %11423 = insertelement <2 x float> poison, float %11264, i64 0, !dbg !400 + %11424 = insertelement <2 x float> %11423, float %11265, i64 1, !dbg !400 + %11425 = fptrunc <2 x float> %11424 to <2 x bfloat>, !dbg !400 + %11426 = insertelement <2 x float> poison, float %11266, i64 0, !dbg !400 + %11427 = insertelement <2 x float> %11426, float %11267, i64 1, !dbg !400 + %11428 = fptrunc <2 x float> %11427 to <2 x bfloat>, !dbg !400 + %11429 = insertelement <2 x float> poison, float %11268, i64 0, !dbg !400 + %11430 = insertelement <2 x float> %11429, float %11269, i64 1, !dbg !400 + %11431 = fptrunc <2 x float> %11430 to <2 x bfloat>, !dbg !400 + %11432 = insertelement <2 x float> poison, float %11270, i64 0, !dbg !400 + %11433 = insertelement <2 x float> %11432, float %11271, i64 1, !dbg !400 + %11434 = fptrunc <2 x float> %11433 to <2 x bfloat>, !dbg !400 + %11435 = insertelement <2 x float> poison, float %11272, i64 0, !dbg !400 + %11436 = insertelement <2 x float> %11435, float %11273, i64 1, !dbg !400 + %11437 = fptrunc <2 x float> %11436 to <2 x bfloat>, !dbg !400 + %11438 = insertelement <2 x float> poison, float %11274, i64 0, !dbg !400 + %11439 = insertelement <2 x float> %11438, float %11275, i64 1, !dbg !400 + %11440 = fptrunc <2 x float> %11439 to <2 x bfloat>, !dbg !400 + %11441 = insertelement <2 x float> poison, float %11276, i64 0, !dbg !400 + %11442 = insertelement <2 x float> %11441, float %11277, i64 1, !dbg !400 + %11443 = fptrunc <2 x float> %11442 to <2 x bfloat>, !dbg !400 + %11444 = insertelement <2 x float> poison, float %11278, i64 0, !dbg !400 + %11445 = insertelement <2 x float> %11444, float %11279, i64 1, !dbg !400 + %11446 = fptrunc <2 x float> %11445 to <2 x bfloat>, !dbg !400 + %11447 = insertelement <2 x float> poison, float %11280, i64 0, !dbg !400 + %11448 = insertelement <2 x float> %11447, float %11281, i64 1, !dbg !400 + %11449 = fptrunc <2 x float> %11448 to <2 x bfloat>, !dbg !400 + %11450 = insertelement <2 x float> poison, float %11282, i64 0, !dbg !400 + %11451 = insertelement <2 x float> %11450, float %11283, i64 1, !dbg !400 + %11452 = fptrunc <2 x float> %11451 to <2 x bfloat>, !dbg !400 + %11453 = insertelement <2 x float> poison, float %11284, i64 0, !dbg !400 + %11454 = insertelement <2 x float> %11453, float %11285, i64 1, !dbg !400 + %11455 = fptrunc <2 x float> %11454 to <2 x bfloat>, !dbg !400 + %11456 = insertelement <2 x float> poison, float %11286, i64 0, !dbg !400 + %11457 = insertelement <2 x float> %11456, float %11287, i64 1, !dbg !400 + %11458 = fptrunc <2 x float> %11457 to <2 x bfloat>, !dbg !400 + %11459 = insertelement <2 x float> poison, float %11288, i64 0, !dbg !400 + %11460 = insertelement <2 x float> %11459, float %11289, i64 1, !dbg !400 + %11461 = fptrunc <2 x float> %11460 to <2 x bfloat>, !dbg !400 + %11462 = insertelement <2 x float> poison, float %11290, i64 0, !dbg !400 + %11463 = insertelement <2 x float> %11462, float %11291, i64 1, !dbg !400 + %11464 = fptrunc <2 x float> %11463 to <2 x bfloat>, !dbg !400 + %11465 = insertelement <2 x float> poison, float %11292, i64 0, !dbg !400 + %11466 = insertelement <2 x float> %11465, float %11293, i64 1, !dbg !400 + %11467 = fptrunc <2 x float> %11466 to <2 x bfloat>, !dbg !400 + %11468 = insertelement <2 x float> poison, float %11294, i64 0, !dbg !400 + %11469 = insertelement <2 x float> %11468, float %11295, i64 1, !dbg !400 + %11470 = fptrunc <2 x float> %11469 to <2 x bfloat>, !dbg !400 + %11471 = insertelement <2 x float> poison, float %11296, i64 0, !dbg !400 + %11472 = insertelement <2 x float> %11471, float %11297, i64 1, !dbg !400 + %11473 = fptrunc <2 x float> %11472 to <2 x bfloat>, !dbg !400 + %11474 = insertelement <2 x float> poison, float %11298, i64 0, !dbg !400 + %11475 = insertelement <2 x float> %11474, float %11299, i64 1, !dbg !400 + %11476 = fptrunc <2 x float> %11475 to <2 x bfloat>, !dbg !400 + %11477 = shl nuw nsw i32 %5134, 13, !dbg !400 + %11478 = shl nuw nsw i32 %56, 5, !dbg !400 + %11479 = and i32 %11478, 7264, !dbg !400 + %11480 = and i32 %56, 24, !dbg !400 + %11481 = shl nuw nsw i32 %11480, 4, !dbg !400 + %11482 = shl nuw nsw i32 %56, 2, !dbg !400 + %11483 = and i32 %11482, 16, !dbg !400 + %11484 = or disjoint i32 %11477, %11483, !dbg !400 + %11485 = or disjoint i32 %11479, %11481, !dbg !400 + %11486 = or disjoint i32 %11484, %11485, !dbg !400 + %11487 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11486, !dbg !400 + %11488 = bitcast <2 x bfloat> %11383 to i32, !dbg !400 + %11489 = bitcast <2 x bfloat> %11389 to i32, !dbg !400 + %11490 = bitcast <2 x bfloat> %11395 to i32, !dbg !400 + %11491 = bitcast <2 x bfloat> %11401 to i32, !dbg !400 + %11492 = insertelement <4 x i32> poison, i32 %11488, i64 0, !dbg !400 + %11493 = insertelement <4 x i32> %11492, i32 %11489, i64 1, !dbg !400 + %11494 = insertelement <4 x i32> %11493, i32 %11490, i64 2, !dbg !400 + %11495 = insertelement <4 x i32> %11494, i32 %11491, i64 3, !dbg !400 + store <4 x i32> %11495, ptr addrspace(3) %11487, align 16, !dbg !400 + %11496 = getelementptr inbounds nuw i8, ptr addrspace(3) %11487, i32 512, !dbg !400 + %11497 = bitcast <2 x bfloat> %11386 to i32, !dbg !400 + %11498 = bitcast <2 x bfloat> %11392 to i32, !dbg !400 + %11499 = bitcast <2 x bfloat> %11398 to i32, !dbg !400 + %11500 = bitcast <2 x bfloat> %11404 to i32, !dbg !400 + %11501 = insertelement <4 x i32> poison, i32 %11497, i64 0, !dbg !400 + %11502 = insertelement <4 x i32> %11501, i32 %11498, i64 1, !dbg !400 + %11503 = insertelement <4 x i32> %11502, i32 %11499, i64 2, !dbg !400 + %11504 = insertelement <4 x i32> %11503, i32 %11500, i64 3, !dbg !400 + store <4 x i32> %11504, ptr addrspace(3) %11496, align 16, !dbg !400 + %11505 = xor i32 %11486, 32, !dbg !400 + %11506 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11505, !dbg !400 + %11507 = bitcast <2 x bfloat> %11407 to i32, !dbg !400 + %11508 = bitcast <2 x bfloat> %11413 to i32, !dbg !400 + %11509 = bitcast <2 x bfloat> %11419 to i32, !dbg !400 + %11510 = bitcast <2 x bfloat> %11425 to i32, !dbg !400 + %11511 = insertelement <4 x i32> poison, i32 %11507, i64 0, !dbg !400 + %11512 = insertelement <4 x i32> %11511, i32 %11508, i64 1, !dbg !400 + %11513 = insertelement <4 x i32> %11512, i32 %11509, i64 2, !dbg !400 + %11514 = insertelement <4 x i32> %11513, i32 %11510, i64 3, !dbg !400 + store <4 x i32> %11514, ptr addrspace(3) %11506, align 16, !dbg !400 + %11515 = getelementptr inbounds nuw i8, ptr addrspace(3) %11506, i32 512, !dbg !400 + %11516 = bitcast <2 x bfloat> %11410 to i32, !dbg !400 + %11517 = bitcast <2 x bfloat> %11416 to i32, !dbg !400 + %11518 = bitcast <2 x bfloat> %11422 to i32, !dbg !400 + %11519 = bitcast <2 x bfloat> %11428 to i32, !dbg !400 + %11520 = insertelement <4 x i32> poison, i32 %11516, i64 0, !dbg !400 + %11521 = insertelement <4 x i32> %11520, i32 %11517, i64 1, !dbg !400 + %11522 = insertelement <4 x i32> %11521, i32 %11518, i64 2, !dbg !400 + %11523 = insertelement <4 x i32> %11522, i32 %11519, i64 3, !dbg !400 + store <4 x i32> %11523, ptr addrspace(3) %11515, align 16, !dbg !400 + %11524 = xor i32 %11486, 64, !dbg !400 + %11525 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11524, !dbg !400 + %11526 = bitcast <2 x bfloat> %11431 to i32, !dbg !400 + %11527 = bitcast <2 x bfloat> %11437 to i32, !dbg !400 + %11528 = bitcast <2 x bfloat> %11443 to i32, !dbg !400 + %11529 = bitcast <2 x bfloat> %11449 to i32, !dbg !400 + %11530 = insertelement <4 x i32> poison, i32 %11526, i64 0, !dbg !400 + %11531 = insertelement <4 x i32> %11530, i32 %11527, i64 1, !dbg !400 + %11532 = insertelement <4 x i32> %11531, i32 %11528, i64 2, !dbg !400 + %11533 = insertelement <4 x i32> %11532, i32 %11529, i64 3, !dbg !400 + store <4 x i32> %11533, ptr addrspace(3) %11525, align 16, !dbg !400 + %11534 = getelementptr inbounds nuw i8, ptr addrspace(3) %11525, i32 512, !dbg !400 + %11535 = bitcast <2 x bfloat> %11434 to i32, !dbg !400 + %11536 = bitcast <2 x bfloat> %11440 to i32, !dbg !400 + %11537 = bitcast <2 x bfloat> %11446 to i32, !dbg !400 + %11538 = bitcast <2 x bfloat> %11452 to i32, !dbg !400 + %11539 = insertelement <4 x i32> poison, i32 %11535, i64 0, !dbg !400 + %11540 = insertelement <4 x i32> %11539, i32 %11536, i64 1, !dbg !400 + %11541 = insertelement <4 x i32> %11540, i32 %11537, i64 2, !dbg !400 + %11542 = insertelement <4 x i32> %11541, i32 %11538, i64 3, !dbg !400 + store <4 x i32> %11542, ptr addrspace(3) %11534, align 16, !dbg !400 + %11543 = xor i32 %11486, 96, !dbg !400 + %11544 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11543, !dbg !400 + %11545 = bitcast <2 x bfloat> %11455 to i32, !dbg !400 + %11546 = bitcast <2 x bfloat> %11461 to i32, !dbg !400 + %11547 = bitcast <2 x bfloat> %11467 to i32, !dbg !400 + %11548 = bitcast <2 x bfloat> %11473 to i32, !dbg !400 + %11549 = insertelement <4 x i32> poison, i32 %11545, i64 0, !dbg !400 + %11550 = insertelement <4 x i32> %11549, i32 %11546, i64 1, !dbg !400 + %11551 = insertelement <4 x i32> %11550, i32 %11547, i64 2, !dbg !400 + %11552 = insertelement <4 x i32> %11551, i32 %11548, i64 3, !dbg !400 + store <4 x i32> %11552, ptr addrspace(3) %11544, align 16, !dbg !400 + %11553 = getelementptr inbounds nuw i8, ptr addrspace(3) %11544, i32 512, !dbg !400 + %11554 = bitcast <2 x bfloat> %11458 to i32, !dbg !400 + %11555 = bitcast <2 x bfloat> %11464 to i32, !dbg !400 + %11556 = bitcast <2 x bfloat> %11470 to i32, !dbg !400 + %11557 = bitcast <2 x bfloat> %11476 to i32, !dbg !400 + %11558 = insertelement <4 x i32> poison, i32 %11554, i64 0, !dbg !400 + %11559 = insertelement <4 x i32> %11558, i32 %11555, i64 1, !dbg !400 + %11560 = insertelement <4 x i32> %11559, i32 %11556, i64 2, !dbg !400 + %11561 = insertelement <4 x i32> %11560, i32 %11557, i64 3, !dbg !400 + store <4 x i32> %11561, ptr addrspace(3) %11553, align 16, !dbg !400 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !400 + %11562 = shl nuw nsw i32 %11480, 10, !dbg !400 + %11563 = shl nuw nsw i32 %5134, 5, !dbg !400 + %11564 = and i32 %11482, 1008, !dbg !400 + %11565 = or disjoint i32 %11562, %11563, !dbg !400 + %11566 = xor i32 %11565, %11564, !dbg !400 + %11567 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11566, !dbg !400 + %11568 = ptrtoint ptr addrspace(3) %11567 to i32, !dbg !400 + %11569 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11568) #3, !dbg !400 + %11570 = extractvalue { i32, i32, i32, i32 } %11569, 0, !dbg !400 + %11571 = extractvalue { i32, i32, i32, i32 } %11569, 1, !dbg !400 + %11572 = extractvalue { i32, i32, i32, i32 } %11569, 2, !dbg !400 + %11573 = extractvalue { i32, i32, i32, i32 } %11569, 3, !dbg !400 + %11574 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 1024, !dbg !400 + %11575 = ptrtoint ptr addrspace(3) %11574 to i32, !dbg !400 + %11576 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11575) #3, !dbg !400 + %11577 = extractvalue { i32, i32, i32, i32 } %11576, 0, !dbg !400 + %11578 = extractvalue { i32, i32, i32, i32 } %11576, 1, !dbg !400 + %11579 = extractvalue { i32, i32, i32, i32 } %11576, 2, !dbg !400 + %11580 = extractvalue { i32, i32, i32, i32 } %11576, 3, !dbg !400 + %11581 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 2048, !dbg !400 + %11582 = ptrtoint ptr addrspace(3) %11581 to i32, !dbg !400 + %11583 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11582) #3, !dbg !400 + %11584 = extractvalue { i32, i32, i32, i32 } %11583, 0, !dbg !400 + %11585 = extractvalue { i32, i32, i32, i32 } %11583, 1, !dbg !400 + %11586 = extractvalue { i32, i32, i32, i32 } %11583, 2, !dbg !400 + %11587 = extractvalue { i32, i32, i32, i32 } %11583, 3, !dbg !400 + %11588 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 3072, !dbg !400 + %11589 = ptrtoint ptr addrspace(3) %11588 to i32, !dbg !400 + %11590 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11589) #3, !dbg !400 + %11591 = extractvalue { i32, i32, i32, i32 } %11590, 0, !dbg !400 + %11592 = extractvalue { i32, i32, i32, i32 } %11590, 1, !dbg !400 + %11593 = extractvalue { i32, i32, i32, i32 } %11590, 2, !dbg !400 + %11594 = extractvalue { i32, i32, i32, i32 } %11590, 3, !dbg !400 + %11595 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 4096, !dbg !400 + %11596 = ptrtoint ptr addrspace(3) %11595 to i32, !dbg !400 + %11597 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11596) #3, !dbg !400 + %11598 = extractvalue { i32, i32, i32, i32 } %11597, 0, !dbg !400 + %11599 = extractvalue { i32, i32, i32, i32 } %11597, 1, !dbg !400 + %11600 = extractvalue { i32, i32, i32, i32 } %11597, 2, !dbg !400 + %11601 = extractvalue { i32, i32, i32, i32 } %11597, 3, !dbg !400 + %11602 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 5120, !dbg !400 + %11603 = ptrtoint ptr addrspace(3) %11602 to i32, !dbg !400 + %11604 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11603) #3, !dbg !400 + %11605 = extractvalue { i32, i32, i32, i32 } %11604, 0, !dbg !400 + %11606 = extractvalue { i32, i32, i32, i32 } %11604, 1, !dbg !400 + %11607 = extractvalue { i32, i32, i32, i32 } %11604, 2, !dbg !400 + %11608 = extractvalue { i32, i32, i32, i32 } %11604, 3, !dbg !400 + %11609 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 6144, !dbg !400 + %11610 = ptrtoint ptr addrspace(3) %11609 to i32, !dbg !400 + %11611 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11610) #3, !dbg !400 + %11612 = extractvalue { i32, i32, i32, i32 } %11611, 0, !dbg !400 + %11613 = extractvalue { i32, i32, i32, i32 } %11611, 1, !dbg !400 + %11614 = extractvalue { i32, i32, i32, i32 } %11611, 2, !dbg !400 + %11615 = extractvalue { i32, i32, i32, i32 } %11611, 3, !dbg !400 + %11616 = getelementptr inbounds nuw i8, ptr addrspace(3) %11567, i32 7168, !dbg !400 + %11617 = ptrtoint ptr addrspace(3) %11616 to i32, !dbg !400 + %11618 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11617) #3, !dbg !400 + %11619 = extractvalue { i32, i32, i32, i32 } %11618, 0, !dbg !400 + %11620 = extractvalue { i32, i32, i32, i32 } %11618, 1, !dbg !400 + %11621 = extractvalue { i32, i32, i32, i32 } %11618, 2, !dbg !400 + %11622 = extractvalue { i32, i32, i32, i32 } %11618, 3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11570, i32 %11571, i32 %11572, i32 %11573, ptr addrspace(1) %11373, i1 %4920) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11577, i32 %11578, i32 %11579, i32 %11580, ptr addrspace(1) %11374, i1 %4921) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11584, i32 %11585, i32 %11586, i32 %11587, ptr addrspace(1) %11375, i1 %4922) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11591, i32 %11592, i32 %11593, i32 %11594, ptr addrspace(1) %11376, i1 %4923) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11598, i32 %11599, i32 %11600, i32 %11601, ptr addrspace(1) %11377, i1 %4924) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11605, i32 %11606, i32 %11607, i32 %11608, ptr addrspace(1) %11378, i1 %4925) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11612, i32 %11613, i32 %11614, i32 %11615, ptr addrspace(1) %11379, i1 %4926) #3, !dbg !400 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11619, i32 %11620, i32 %11621, i32 %11622, ptr addrspace(1) %11380, i1 %4927) #3, !dbg !400 + %11623 = insertelement <2 x float> poison, float %11300, i64 0, !dbg !401 + %11624 = insertelement <2 x float> %11623, float %11301, i64 1, !dbg !401 + %11625 = fmul <2 x float> %11624, splat (float 0x3FB6A09E60000000), !dbg !401 + %11626 = insertelement <2 x float> poison, float %11302, i64 0, !dbg !401 + %11627 = insertelement <2 x float> %11626, float %11303, i64 1, !dbg !401 + %11628 = fmul <2 x float> %11627, splat (float 0x3FB6A09E60000000), !dbg !401 + %11629 = insertelement <2 x float> poison, float %11304, i64 0, !dbg !401 + %11630 = insertelement <2 x float> %11629, float %11305, i64 1, !dbg !401 + %11631 = fmul <2 x float> %11630, splat (float 0x3FB6A09E60000000), !dbg !401 + %11632 = insertelement <2 x float> poison, float %11306, i64 0, !dbg !401 + %11633 = insertelement <2 x float> %11632, float %11307, i64 1, !dbg !401 + %11634 = fmul <2 x float> %11633, splat (float 0x3FB6A09E60000000), !dbg !401 + %11635 = insertelement <2 x float> poison, float %11308, i64 0, !dbg !401 + %11636 = insertelement <2 x float> %11635, float %11309, i64 1, !dbg !401 + %11637 = fmul <2 x float> %11636, splat (float 0x3FB6A09E60000000), !dbg !401 + %11638 = insertelement <2 x float> poison, float %11310, i64 0, !dbg !401 + %11639 = insertelement <2 x float> %11638, float %11311, i64 1, !dbg !401 + %11640 = fmul <2 x float> %11639, splat (float 0x3FB6A09E60000000), !dbg !401 + %11641 = insertelement <2 x float> poison, float %11312, i64 0, !dbg !401 + %11642 = insertelement <2 x float> %11641, float %11313, i64 1, !dbg !401 + %11643 = fmul <2 x float> %11642, splat (float 0x3FB6A09E60000000), !dbg !401 + %11644 = insertelement <2 x float> poison, float %11314, i64 0, !dbg !401 + %11645 = insertelement <2 x float> %11644, float %11315, i64 1, !dbg !401 + %11646 = fmul <2 x float> %11645, splat (float 0x3FB6A09E60000000), !dbg !401 + %11647 = insertelement <2 x float> poison, float %11316, i64 0, !dbg !401 + %11648 = insertelement <2 x float> %11647, float %11317, i64 1, !dbg !401 + %11649 = fmul <2 x float> %11648, splat (float 0x3FB6A09E60000000), !dbg !401 + %11650 = insertelement <2 x float> poison, float %11318, i64 0, !dbg !401 + %11651 = insertelement <2 x float> %11650, float %11319, i64 1, !dbg !401 + %11652 = fmul <2 x float> %11651, splat (float 0x3FB6A09E60000000), !dbg !401 + %11653 = insertelement <2 x float> poison, float %11320, i64 0, !dbg !401 + %11654 = insertelement <2 x float> %11653, float %11321, i64 1, !dbg !401 + %11655 = fmul <2 x float> %11654, splat (float 0x3FB6A09E60000000), !dbg !401 + %11656 = insertelement <2 x float> poison, float %11322, i64 0, !dbg !401 + %11657 = insertelement <2 x float> %11656, float %11323, i64 1, !dbg !401 + %11658 = fmul <2 x float> %11657, splat (float 0x3FB6A09E60000000), !dbg !401 + %11659 = insertelement <2 x float> poison, float %11324, i64 0, !dbg !401 + %11660 = insertelement <2 x float> %11659, float %11325, i64 1, !dbg !401 + %11661 = fmul <2 x float> %11660, splat (float 0x3FB6A09E60000000), !dbg !401 + %11662 = insertelement <2 x float> poison, float %11326, i64 0, !dbg !401 + %11663 = insertelement <2 x float> %11662, float %11327, i64 1, !dbg !401 + %11664 = fmul <2 x float> %11663, splat (float 0x3FB6A09E60000000), !dbg !401 + %11665 = insertelement <2 x float> poison, float %11328, i64 0, !dbg !401 + %11666 = insertelement <2 x float> %11665, float %11329, i64 1, !dbg !401 + %11667 = fmul <2 x float> %11666, splat (float 0x3FB6A09E60000000), !dbg !401 + %11668 = insertelement <2 x float> poison, float %11330, i64 0, !dbg !401 + %11669 = insertelement <2 x float> %11668, float %11331, i64 1, !dbg !401 + %11670 = fmul <2 x float> %11669, splat (float 0x3FB6A09E60000000), !dbg !401 + %11671 = insertelement <2 x float> poison, float %11332, i64 0, !dbg !401 + %11672 = insertelement <2 x float> %11671, float %11333, i64 1, !dbg !401 + %11673 = fmul <2 x float> %11672, splat (float 0x3FB6A09E60000000), !dbg !401 + %11674 = insertelement <2 x float> poison, float %11334, i64 0, !dbg !401 + %11675 = insertelement <2 x float> %11674, float %11335, i64 1, !dbg !401 + %11676 = fmul <2 x float> %11675, splat (float 0x3FB6A09E60000000), !dbg !401 + %11677 = insertelement <2 x float> poison, float %11336, i64 0, !dbg !401 + %11678 = insertelement <2 x float> %11677, float %11337, i64 1, !dbg !401 + %11679 = fmul <2 x float> %11678, splat (float 0x3FB6A09E60000000), !dbg !401 + %11680 = insertelement <2 x float> poison, float %11338, i64 0, !dbg !401 + %11681 = insertelement <2 x float> %11680, float %11339, i64 1, !dbg !401 + %11682 = fmul <2 x float> %11681, splat (float 0x3FB6A09E60000000), !dbg !401 + %11683 = insertelement <2 x float> poison, float %11340, i64 0, !dbg !401 + %11684 = insertelement <2 x float> %11683, float %11341, i64 1, !dbg !401 + %11685 = fmul <2 x float> %11684, splat (float 0x3FB6A09E60000000), !dbg !401 + %11686 = insertelement <2 x float> poison, float %11342, i64 0, !dbg !401 + %11687 = insertelement <2 x float> %11686, float %11343, i64 1, !dbg !401 + %11688 = fmul <2 x float> %11687, splat (float 0x3FB6A09E60000000), !dbg !401 + %11689 = insertelement <2 x float> poison, float %11344, i64 0, !dbg !401 + %11690 = insertelement <2 x float> %11689, float %11345, i64 1, !dbg !401 + %11691 = fmul <2 x float> %11690, splat (float 0x3FB6A09E60000000), !dbg !401 + %11692 = insertelement <2 x float> poison, float %11346, i64 0, !dbg !401 + %11693 = insertelement <2 x float> %11692, float %11347, i64 1, !dbg !401 + %11694 = fmul <2 x float> %11693, splat (float 0x3FB6A09E60000000), !dbg !401 + %11695 = insertelement <2 x float> poison, float %11348, i64 0, !dbg !401 + %11696 = insertelement <2 x float> %11695, float %11349, i64 1, !dbg !401 + %11697 = fmul <2 x float> %11696, splat (float 0x3FB6A09E60000000), !dbg !401 + %11698 = insertelement <2 x float> poison, float %11350, i64 0, !dbg !401 + %11699 = insertelement <2 x float> %11698, float %11351, i64 1, !dbg !401 + %11700 = fmul <2 x float> %11699, splat (float 0x3FB6A09E60000000), !dbg !401 + %11701 = insertelement <2 x float> poison, float %11352, i64 0, !dbg !401 + %11702 = insertelement <2 x float> %11701, float %11353, i64 1, !dbg !401 + %11703 = fmul <2 x float> %11702, splat (float 0x3FB6A09E60000000), !dbg !401 + %11704 = insertelement <2 x float> poison, float %11354, i64 0, !dbg !401 + %11705 = insertelement <2 x float> %11704, float %11355, i64 1, !dbg !401 + %11706 = fmul <2 x float> %11705, splat (float 0x3FB6A09E60000000), !dbg !401 + %11707 = insertelement <2 x float> poison, float %11356, i64 0, !dbg !401 + %11708 = insertelement <2 x float> %11707, float %11357, i64 1, !dbg !401 + %11709 = fmul <2 x float> %11708, splat (float 0x3FB6A09E60000000), !dbg !401 + %11710 = insertelement <2 x float> poison, float %11358, i64 0, !dbg !401 + %11711 = insertelement <2 x float> %11710, float %11359, i64 1, !dbg !401 + %11712 = fmul <2 x float> %11711, splat (float 0x3FB6A09E60000000), !dbg !401 + %11713 = insertelement <2 x float> poison, float %11360, i64 0, !dbg !401 + %11714 = insertelement <2 x float> %11713, float %11361, i64 1, !dbg !401 + %11715 = fmul <2 x float> %11714, splat (float 0x3FB6A09E60000000), !dbg !401 + %11716 = insertelement <2 x float> poison, float %11362, i64 0, !dbg !401 + %11717 = insertelement <2 x float> %11716, float %11363, i64 1, !dbg !401 + %11718 = fmul <2 x float> %11717, splat (float 0x3FB6A09E60000000), !dbg !401 + %11719 = or disjoint i32 %4885, %4910, !dbg !402 + %11720 = or disjoint i32 %4886, %4910, !dbg !402 + %11721 = or disjoint i32 %4887, %4910, !dbg !402 + %11722 = or disjoint i32 %4888, %4910, !dbg !402 + %11723 = or disjoint i32 %4889, %4910, !dbg !402 + %11724 = or disjoint i32 %4890, %4910, !dbg !402 + %11725 = or disjoint i32 %4891, %4910, !dbg !402 + %11726 = or disjoint i32 %4892, %4910, !dbg !402 + %11727 = shl nuw nsw i32 %44, 7, !dbg !403 + %11728 = mul i32 %19, %11727, !dbg !404 + %11729 = add i32 %11719, %11728, !dbg !405 + %11730 = add i32 %11720, %11728, !dbg !405 + %11731 = add i32 %11721, %11728, !dbg !405 + %11732 = add i32 %11722, %11728, !dbg !405 + %11733 = add i32 %11723, %11728, !dbg !405 + %11734 = add i32 %11724, %11728, !dbg !405 + %11735 = add i32 %11725, %11728, !dbg !405 + %11736 = add i32 %11726, %11728, !dbg !405 + %11737 = shl nuw nsw i32 %43, 10, !dbg !406 + %11738 = mul i32 %19, %11737, !dbg !407 + %11739 = add i32 %11729, %11738, !dbg !408 + %11740 = add i32 %11730, %11738, !dbg !408 + %11741 = add i32 %11731, %11738, !dbg !408 + %11742 = add i32 %11732, %11738, !dbg !408 + %11743 = add i32 %11733, %11738, !dbg !408 + %11744 = add i32 %11734, %11738, !dbg !408 + %11745 = add i32 %11735, %11738, !dbg !408 + %11746 = add i32 %11736, %11738, !dbg !408 + %11747 = sext i32 %11739 to i64, !dbg !409 + %11748 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11747, !dbg !409 + %11749 = sext i32 %11740 to i64, !dbg !409 + %11750 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11749, !dbg !409 + %11751 = sext i32 %11741 to i64, !dbg !409 + %11752 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11751, !dbg !409 + %11753 = sext i32 %11742 to i64, !dbg !409 + %11754 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11753, !dbg !409 + %11755 = sext i32 %11743 to i64, !dbg !409 + %11756 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11755, !dbg !409 + %11757 = sext i32 %11744 to i64, !dbg !409 + %11758 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11757, !dbg !409 + %11759 = sext i32 %11745 to i64, !dbg !409 + %11760 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11759, !dbg !409 + %11761 = sext i32 %11746 to i64, !dbg !409 + %11762 = getelementptr bfloat, ptr addrspace(1) %17, i64 %11761, !dbg !409 + %11763 = fptrunc <2 x float> %11625 to <2 x bfloat>, !dbg !410 + %11764 = fptrunc <2 x float> %11628 to <2 x bfloat>, !dbg !410 + %11765 = fptrunc <2 x float> %11631 to <2 x bfloat>, !dbg !410 + %11766 = fptrunc <2 x float> %11634 to <2 x bfloat>, !dbg !410 + %11767 = fptrunc <2 x float> %11637 to <2 x bfloat>, !dbg !410 + %11768 = fptrunc <2 x float> %11640 to <2 x bfloat>, !dbg !410 + %11769 = fptrunc <2 x float> %11643 to <2 x bfloat>, !dbg !410 + %11770 = fptrunc <2 x float> %11646 to <2 x bfloat>, !dbg !410 + %11771 = fptrunc <2 x float> %11649 to <2 x bfloat>, !dbg !410 + %11772 = fptrunc <2 x float> %11652 to <2 x bfloat>, !dbg !410 + %11773 = fptrunc <2 x float> %11655 to <2 x bfloat>, !dbg !410 + %11774 = fptrunc <2 x float> %11658 to <2 x bfloat>, !dbg !410 + %11775 = fptrunc <2 x float> %11661 to <2 x bfloat>, !dbg !410 + %11776 = fptrunc <2 x float> %11664 to <2 x bfloat>, !dbg !410 + %11777 = fptrunc <2 x float> %11667 to <2 x bfloat>, !dbg !410 + %11778 = fptrunc <2 x float> %11670 to <2 x bfloat>, !dbg !410 + %11779 = fptrunc <2 x float> %11673 to <2 x bfloat>, !dbg !410 + %11780 = fptrunc <2 x float> %11676 to <2 x bfloat>, !dbg !410 + %11781 = fptrunc <2 x float> %11679 to <2 x bfloat>, !dbg !410 + %11782 = fptrunc <2 x float> %11682 to <2 x bfloat>, !dbg !410 + %11783 = fptrunc <2 x float> %11685 to <2 x bfloat>, !dbg !410 + %11784 = fptrunc <2 x float> %11688 to <2 x bfloat>, !dbg !410 + %11785 = fptrunc <2 x float> %11691 to <2 x bfloat>, !dbg !410 + %11786 = fptrunc <2 x float> %11694 to <2 x bfloat>, !dbg !410 + %11787 = fptrunc <2 x float> %11697 to <2 x bfloat>, !dbg !410 + %11788 = fptrunc <2 x float> %11700 to <2 x bfloat>, !dbg !410 + %11789 = fptrunc <2 x float> %11703 to <2 x bfloat>, !dbg !410 + %11790 = fptrunc <2 x float> %11706 to <2 x bfloat>, !dbg !410 + %11791 = fptrunc <2 x float> %11709 to <2 x bfloat>, !dbg !410 + %11792 = fptrunc <2 x float> %11712 to <2 x bfloat>, !dbg !410 + %11793 = fptrunc <2 x float> %11715 to <2 x bfloat>, !dbg !410 + %11794 = fptrunc <2 x float> %11718 to <2 x bfloat>, !dbg !410 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !410 + %11795 = bitcast <2 x bfloat> %11763 to i32, !dbg !410 + %11796 = bitcast <2 x bfloat> %11765 to i32, !dbg !410 + %11797 = bitcast <2 x bfloat> %11767 to i32, !dbg !410 + %11798 = bitcast <2 x bfloat> %11769 to i32, !dbg !410 + %11799 = insertelement <4 x i32> poison, i32 %11795, i64 0, !dbg !410 + %11800 = insertelement <4 x i32> %11799, i32 %11796, i64 1, !dbg !410 + %11801 = insertelement <4 x i32> %11800, i32 %11797, i64 2, !dbg !410 + %11802 = insertelement <4 x i32> %11801, i32 %11798, i64 3, !dbg !410 + store <4 x i32> %11802, ptr addrspace(3) %11487, align 16, !dbg !410 + %11803 = bitcast <2 x bfloat> %11764 to i32, !dbg !410 + %11804 = bitcast <2 x bfloat> %11766 to i32, !dbg !410 + %11805 = bitcast <2 x bfloat> %11768 to i32, !dbg !410 + %11806 = bitcast <2 x bfloat> %11770 to i32, !dbg !410 + %11807 = insertelement <4 x i32> poison, i32 %11803, i64 0, !dbg !410 + %11808 = insertelement <4 x i32> %11807, i32 %11804, i64 1, !dbg !410 + %11809 = insertelement <4 x i32> %11808, i32 %11805, i64 2, !dbg !410 + %11810 = insertelement <4 x i32> %11809, i32 %11806, i64 3, !dbg !410 + store <4 x i32> %11810, ptr addrspace(3) %11496, align 16, !dbg !410 + %11811 = bitcast <2 x bfloat> %11771 to i32, !dbg !410 + %11812 = bitcast <2 x bfloat> %11773 to i32, !dbg !410 + %11813 = bitcast <2 x bfloat> %11775 to i32, !dbg !410 + %11814 = bitcast <2 x bfloat> %11777 to i32, !dbg !410 + %11815 = insertelement <4 x i32> poison, i32 %11811, i64 0, !dbg !410 + %11816 = insertelement <4 x i32> %11815, i32 %11812, i64 1, !dbg !410 + %11817 = insertelement <4 x i32> %11816, i32 %11813, i64 2, !dbg !410 + %11818 = insertelement <4 x i32> %11817, i32 %11814, i64 3, !dbg !410 + store <4 x i32> %11818, ptr addrspace(3) %11506, align 16, !dbg !410 + %11819 = bitcast <2 x bfloat> %11772 to i32, !dbg !410 + %11820 = bitcast <2 x bfloat> %11774 to i32, !dbg !410 + %11821 = bitcast <2 x bfloat> %11776 to i32, !dbg !410 + %11822 = bitcast <2 x bfloat> %11778 to i32, !dbg !410 + %11823 = insertelement <4 x i32> poison, i32 %11819, i64 0, !dbg !410 + %11824 = insertelement <4 x i32> %11823, i32 %11820, i64 1, !dbg !410 + %11825 = insertelement <4 x i32> %11824, i32 %11821, i64 2, !dbg !410 + %11826 = insertelement <4 x i32> %11825, i32 %11822, i64 3, !dbg !410 + store <4 x i32> %11826, ptr addrspace(3) %11515, align 16, !dbg !410 + %11827 = bitcast <2 x bfloat> %11779 to i32, !dbg !410 + %11828 = bitcast <2 x bfloat> %11781 to i32, !dbg !410 + %11829 = bitcast <2 x bfloat> %11783 to i32, !dbg !410 + %11830 = bitcast <2 x bfloat> %11785 to i32, !dbg !410 + %11831 = insertelement <4 x i32> poison, i32 %11827, i64 0, !dbg !410 + %11832 = insertelement <4 x i32> %11831, i32 %11828, i64 1, !dbg !410 + %11833 = insertelement <4 x i32> %11832, i32 %11829, i64 2, !dbg !410 + %11834 = insertelement <4 x i32> %11833, i32 %11830, i64 3, !dbg !410 + store <4 x i32> %11834, ptr addrspace(3) %11525, align 16, !dbg !410 + %11835 = bitcast <2 x bfloat> %11780 to i32, !dbg !410 + %11836 = bitcast <2 x bfloat> %11782 to i32, !dbg !410 + %11837 = bitcast <2 x bfloat> %11784 to i32, !dbg !410 + %11838 = bitcast <2 x bfloat> %11786 to i32, !dbg !410 + %11839 = insertelement <4 x i32> poison, i32 %11835, i64 0, !dbg !410 + %11840 = insertelement <4 x i32> %11839, i32 %11836, i64 1, !dbg !410 + %11841 = insertelement <4 x i32> %11840, i32 %11837, i64 2, !dbg !410 + %11842 = insertelement <4 x i32> %11841, i32 %11838, i64 3, !dbg !410 + store <4 x i32> %11842, ptr addrspace(3) %11534, align 16, !dbg !410 + %11843 = bitcast <2 x bfloat> %11787 to i32, !dbg !410 + %11844 = bitcast <2 x bfloat> %11789 to i32, !dbg !410 + %11845 = bitcast <2 x bfloat> %11791 to i32, !dbg !410 + %11846 = bitcast <2 x bfloat> %11793 to i32, !dbg !410 + %11847 = insertelement <4 x i32> poison, i32 %11843, i64 0, !dbg !410 + %11848 = insertelement <4 x i32> %11847, i32 %11844, i64 1, !dbg !410 + %11849 = insertelement <4 x i32> %11848, i32 %11845, i64 2, !dbg !410 + %11850 = insertelement <4 x i32> %11849, i32 %11846, i64 3, !dbg !410 + store <4 x i32> %11850, ptr addrspace(3) %11544, align 16, !dbg !410 + %11851 = bitcast <2 x bfloat> %11788 to i32, !dbg !410 + %11852 = bitcast <2 x bfloat> %11790 to i32, !dbg !410 + %11853 = bitcast <2 x bfloat> %11792 to i32, !dbg !410 + %11854 = bitcast <2 x bfloat> %11794 to i32, !dbg !410 + %11855 = insertelement <4 x i32> poison, i32 %11851, i64 0, !dbg !410 + %11856 = insertelement <4 x i32> %11855, i32 %11852, i64 1, !dbg !410 + %11857 = insertelement <4 x i32> %11856, i32 %11853, i64 2, !dbg !410 + %11858 = insertelement <4 x i32> %11857, i32 %11854, i64 3, !dbg !410 + store <4 x i32> %11858, ptr addrspace(3) %11553, align 16, !dbg !410 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !410 + %11859 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11568) #3, !dbg !410 + %11860 = extractvalue { i32, i32, i32, i32 } %11859, 0, !dbg !410 + %11861 = extractvalue { i32, i32, i32, i32 } %11859, 1, !dbg !410 + %11862 = extractvalue { i32, i32, i32, i32 } %11859, 2, !dbg !410 + %11863 = extractvalue { i32, i32, i32, i32 } %11859, 3, !dbg !410 + %11864 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11575) #3, !dbg !410 + %11865 = extractvalue { i32, i32, i32, i32 } %11864, 0, !dbg !410 + %11866 = extractvalue { i32, i32, i32, i32 } %11864, 1, !dbg !410 + %11867 = extractvalue { i32, i32, i32, i32 } %11864, 2, !dbg !410 + %11868 = extractvalue { i32, i32, i32, i32 } %11864, 3, !dbg !410 + %11869 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11582) #3, !dbg !410 + %11870 = extractvalue { i32, i32, i32, i32 } %11869, 0, !dbg !410 + %11871 = extractvalue { i32, i32, i32, i32 } %11869, 1, !dbg !410 + %11872 = extractvalue { i32, i32, i32, i32 } %11869, 2, !dbg !410 + %11873 = extractvalue { i32, i32, i32, i32 } %11869, 3, !dbg !410 + %11874 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11589) #3, !dbg !410 + %11875 = extractvalue { i32, i32, i32, i32 } %11874, 0, !dbg !410 + %11876 = extractvalue { i32, i32, i32, i32 } %11874, 1, !dbg !410 + %11877 = extractvalue { i32, i32, i32, i32 } %11874, 2, !dbg !410 + %11878 = extractvalue { i32, i32, i32, i32 } %11874, 3, !dbg !410 + %11879 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11596) #3, !dbg !410 + %11880 = extractvalue { i32, i32, i32, i32 } %11879, 0, !dbg !410 + %11881 = extractvalue { i32, i32, i32, i32 } %11879, 1, !dbg !410 + %11882 = extractvalue { i32, i32, i32, i32 } %11879, 2, !dbg !410 + %11883 = extractvalue { i32, i32, i32, i32 } %11879, 3, !dbg !410 + %11884 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11603) #3, !dbg !410 + %11885 = extractvalue { i32, i32, i32, i32 } %11884, 0, !dbg !410 + %11886 = extractvalue { i32, i32, i32, i32 } %11884, 1, !dbg !410 + %11887 = extractvalue { i32, i32, i32, i32 } %11884, 2, !dbg !410 + %11888 = extractvalue { i32, i32, i32, i32 } %11884, 3, !dbg !410 + %11889 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11610) #3, !dbg !410 + %11890 = extractvalue { i32, i32, i32, i32 } %11889, 0, !dbg !410 + %11891 = extractvalue { i32, i32, i32, i32 } %11889, 1, !dbg !410 + %11892 = extractvalue { i32, i32, i32, i32 } %11889, 2, !dbg !410 + %11893 = extractvalue { i32, i32, i32, i32 } %11889, 3, !dbg !410 + %11894 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %11617) #3, !dbg !410 + %11895 = extractvalue { i32, i32, i32, i32 } %11894, 0, !dbg !410 + %11896 = extractvalue { i32, i32, i32, i32 } %11894, 1, !dbg !410 + %11897 = extractvalue { i32, i32, i32, i32 } %11894, 2, !dbg !410 + %11898 = extractvalue { i32, i32, i32, i32 } %11894, 3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11860, i32 %11861, i32 %11862, i32 %11863, ptr addrspace(1) %11748, i1 %4920) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11865, i32 %11866, i32 %11867, i32 %11868, ptr addrspace(1) %11750, i1 %4921) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11870, i32 %11871, i32 %11872, i32 %11873, ptr addrspace(1) %11752, i1 %4922) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11875, i32 %11876, i32 %11877, i32 %11878, ptr addrspace(1) %11754, i1 %4923) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11880, i32 %11881, i32 %11882, i32 %11883, ptr addrspace(1) %11756, i1 %4924) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11885, i32 %11886, i32 %11887, i32 %11888, ptr addrspace(1) %11758, i1 %4925) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11890, i32 %11891, i32 %11892, i32 %11893, ptr addrspace(1) %11760, i1 %4926) #3, !dbg !410 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11895, i32 %11896, i32 %11897, i32 %11898, ptr addrspace(1) %11762, i1 %4927) #3, !dbg !410 + br label %11899, !dbg !39 + +11899: ; preds = %._crit_edge1593, %11364 + ret void, !dbg !411 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #7 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { convergent nounwind } +attributes #6 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_zeros_1", linkageName: "triton_tem_fused_zeros_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 94, column: 54, scope: !5) +!9 = !DILocation(line: 95, column: 54, scope: !5) +!10 = !DILocation(line: 95, column: 63, scope: !5) +!11 = !DILocation(line: 97, column: 74, scope: !5) +!12 = !DILocation(line: 97, column: 66, scope: !5) +!13 = !DILocation(line: 97, column: 100, scope: !5) +!14 = !DILocation(line: 97, column: 91, scope: !5) +!15 = !DILocation(line: 97, column: 82, scope: !5) +!16 = !DILocation(line: 97, column: 59, scope: !5) +!17 = !DILocation(line: 97, column: 111, scope: !5) +!18 = !DILocation(line: 111, column: 24, scope: !5) +!19 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !5, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 112, column: 36, scope: !5) +!23 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 115, column: 27, scope: !5) +!25 = !DILocation(line: 116, column: 28, scope: !5) +!26 = !DILocation(line: 117, column: 23, scope: !5) +!27 = !DILocation(line: 124, column: 25, scope: !5) +!28 = !DILocation(line: 124, column: 47, scope: !5) +!29 = !DILocation(line: 124, column: 35, scope: !5) +!30 = !DILocation(line: 124, column: 59, scope: !5) +!31 = !DILocation(line: 128, column: 50, scope: !5) +!32 = !DILocation(line: 128, column: 37, scope: !5) +!33 = !DILocation(line: 128, column: 61, scope: !5) +!34 = !DILocation(line: 131, column: 9, scope: !5) +!35 = !DILocation(line: 132, column: 9, scope: !5) +!36 = !DILocation(line: 133, column: 10, scope: !5) +!37 = !DILocation(line: 136, column: 26, scope: !5) +!38 = !DILocation(line: 139, column: 14, scope: !5) +!39 = !DILocation(line: 139, column: 7, scope: !5) +!40 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !41) +!41 = !DILocation(line: 113, column: 34, scope: !5) +!42 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !41) +!43 = !DILocation(line: 140, column: 24, scope: !5) +!44 = !DILocation(line: 144, column: 29, scope: !5) +!45 = !DILocation(line: 144, column: 54, scope: !5) +!46 = !DILocation(line: 144, column: 44, scope: !5) +!47 = !DILocation(line: 145, column: 35, scope: !5) +!48 = !DILocation(line: 154, column: 55, scope: !5) +!49 = !DILocation(line: 154, column: 78, scope: !5) +!50 = !DILocation(line: 155, column: 50, scope: !5) +!51 = !DILocation(line: 155, column: 68, scope: !5) +!52 = !DILocation(line: 158, column: 30, scope: !5) +!53 = !DILocation(line: 158, column: 52, scope: !5) +!54 = !DILocation(line: 158, column: 40, scope: !5) +!55 = !DILocation(line: 158, column: 63, scope: !5) +!56 = !DILocation(line: 159, column: 32, scope: !5) +!57 = !DILocation(line: 159, column: 55, scope: !5) +!58 = !DILocation(line: 159, column: 42, scope: !5) +!59 = !DILocation(line: 159, column: 66, scope: !5) +!60 = !DILocation(line: 161, column: 30, scope: !5) +!61 = !DILocation(line: 161, column: 35, scope: !5) +!62 = !DILocation(line: 161, column: 46, scope: !5) +!63 = !DILocation(line: 161, column: 56, scope: !5) +!64 = !DILocation(line: 163, column: 17, scope: !5) +!65 = !DILocation(line: 164, column: 19, scope: !5) +!66 = !DILocation(line: 167, column: 19, scope: !5) +!67 = !DILocation(line: 168, column: 21, scope: !5) +!68 = !DILocation(line: 169, column: 25, scope: !5) +!69 = !DILocation(line: 174, column: 36, scope: !5) +!70 = !DILocation(line: 175, column: 29, scope: !5) +!71 = !DILocation(line: 825, column: 38, scope: !72, inlinedAt: !73) +!72 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!73 = !DILocation(line: 178, column: 107, scope: !5) +!74 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !73) +!75 = !DILocation(line: 825, column: 56, scope: !72, inlinedAt: !73) +!76 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !73) +!77 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !73) +!78 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !73) +!79 = !DILocation(line: 825, column: 38, scope: !72, inlinedAt: !80) +!80 = !DILocation(line: 179, column: 111, scope: !5) +!81 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !80) +!82 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !80) +!83 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !80) +!84 = !DILocation(line: 188, column: 58, scope: !5) +!85 = !DILocation(line: 188, column: 34, scope: !5) +!86 = !DILocation(line: 188, column: 25, scope: !5) +!87 = !DILocation(line: 189, column: 33, scope: !5) +!88 = !DILocation(line: 189, column: 26, scope: !5) +!89 = !DILocation(line: 190, column: 30, scope: !5) +!90 = !DILocation(line: 190, column: 50, scope: !5) +!91 = !DILocation(line: 195, column: 30, scope: !5) +!92 = !DILocation(line: 196, column: 27, scope: !5) +!93 = !DILocation(line: 196, column: 41, scope: !5) +!94 = !DILocation(line: 197, column: 53, scope: !5) +!95 = !DILocation(line: 197, column: 39, scope: !5) +!96 = !DILocation(line: 199, column: 42, scope: !5) +!97 = !DILocation(line: 199, column: 29, scope: !5) +!98 = !DILocation(line: 390, column: 37, scope: !72, inlinedAt: !99) +!99 = !DILocation(line: 207, column: 12, scope: !5) +!100 = !DILocation(line: 390, column: 18, scope: !72, inlinedAt: !99) +!101 = !DILocation(line: 390, column: 49, scope: !72, inlinedAt: !99) +!102 = !DILocation(line: 391, column: 18, scope: !72, inlinedAt: !99) +!103 = !DILocation(line: 391, column: 49, scope: !72, inlinedAt: !99) +!104 = !DILocation(line: 395, column: 43, scope: !72, inlinedAt: !99) +!105 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !99) +!106 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !99) +!107 = !DILocation(line: 395, column: 101, scope: !72, inlinedAt: !99) +!108 = !DILocation(line: 395, column: 63, scope: !72, inlinedAt: !99) +!109 = !DILocation(line: 485, column: 34, scope: !72, inlinedAt: !99) +!110 = !DILocation(line: 397, column: 28, scope: !72, inlinedAt: !99) +!111 = !DILocation(line: 485, column: 23, scope: !72, inlinedAt: !99) +!112 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !99) +!113 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !99) +!114 = !DILocation(line: 414, column: 19, scope: !72, inlinedAt: !99) +!115 = !DILocation(line: 415, column: 19, scope: !72, inlinedAt: !99) +!116 = !DILocation(line: 417, column: 19, scope: !72, inlinedAt: !99) +!117 = !DILocation(line: 459, column: 19, scope: !72, inlinedAt: !99) +!118 = !DILocation(line: 798, column: 21, scope: !72, inlinedAt: !99) +!119 = !DILocation(line: 487, column: 23, scope: !72, inlinedAt: !99) +!120 = !DILocation(line: 488, column: 23, scope: !72, inlinedAt: !99) +!121 = !DILocation(line: 490, column: 23, scope: !72, inlinedAt: !99) +!122 = !DILocation(line: 482, column: 23, scope: !72, inlinedAt: !99) +!123 = !DILocation(line: 493, column: 24, scope: !72, inlinedAt: !99) +!124 = !DILocation(line: 504, column: 24, scope: !72, inlinedAt: !99) +!125 = !DILocation(line: 531, column: 19, scope: !72, inlinedAt: !99) +!126 = !DILocation(line: 461, column: 14, scope: !72, inlinedAt: !99) +!127 = !DILocation(line: 494, column: 24, scope: !72, inlinedAt: !99) +!128 = !DILocation(line: 496, column: 25, scope: !72, inlinedAt: !99) +!129 = !DILocation(line: 499, column: 25, scope: !72, inlinedAt: !99) +!130 = !DILocation(line: 500, column: 24, scope: !72, inlinedAt: !99) +!131 = !DILocation(line: 502, column: 39, scope: !72, inlinedAt: !99) +!132 = !DILocation(line: 505, column: 24, scope: !72, inlinedAt: !99) +!133 = !DILocation(line: 506, column: 23, scope: !72, inlinedAt: !99) +!134 = !DILocation(line: 507, column: 25, scope: !72, inlinedAt: !99) +!135 = !DILocation(line: 508, column: 25, scope: !72, inlinedAt: !99) +!136 = !DILocation(line: 510, column: 25, scope: !72, inlinedAt: !99) +!137 = !DILocation(line: 511, column: 24, scope: !72, inlinedAt: !99) +!138 = !DILocation(line: 513, column: 39, scope: !72, inlinedAt: !99) +!139 = !DILocation(line: 514, column: 25, scope: !72, inlinedAt: !99) +!140 = !DILocation(line: 515, column: 24, scope: !72, inlinedAt: !99) +!141 = !DILocation(line: 516, column: 24, scope: !72, inlinedAt: !99) +!142 = !DILocation(line: 521, column: 69, scope: !72, inlinedAt: !99) +!143 = !DILocation(line: 524, column: 27, scope: !72, inlinedAt: !99) +!144 = !DILocation(line: 525, column: 39, scope: !72, inlinedAt: !99) +!145 = !DILocation(line: 525, column: 21, scope: !72, inlinedAt: !99) +!146 = !DILocation(line: 530, column: 20, scope: !72, inlinedAt: !99) +!147 = !DILocation(line: 531, column: 14, scope: !72, inlinedAt: !99) +!148 = !DILocation(line: 551, column: 15, scope: !72, inlinedAt: !99) +!149 = !DILocation(line: 549, column: 43, scope: !72, inlinedAt: !99) +!150 = !DILocation(line: 553, column: 21, scope: !72, inlinedAt: !99) +!151 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !99) +!152 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !99) +!153 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !99) +!154 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !99) +!155 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !99) +!156 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !99) +!157 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !99) +!158 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !99) +!159 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !99) +!160 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !99) +!161 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !99) +!162 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !99) +!163 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !99) +!164 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !99) +!165 = !DILocation(line: 414, column: 28, scope: !72, inlinedAt: !99) +!166 = !DILocation(line: 214, column: 39, scope: !5) +!167 = !DILocation(line: 215, column: 31, scope: !5) +!168 = !DILocation(line: 215, column: 45, scope: !5) +!169 = !DILocation(line: 216, column: 62, scope: !5) +!170 = !DILocation(line: 216, column: 43, scope: !5) +!171 = !DILocation(line: 218, column: 33, scope: !5) +!172 = !DILocation(line: 390, column: 37, scope: !72, inlinedAt: !173) +!173 = !DILocation(line: 226, column: 16, scope: !5) +!174 = !DILocation(line: 390, column: 18, scope: !72, inlinedAt: !173) +!175 = !DILocation(line: 390, column: 49, scope: !72, inlinedAt: !173) +!176 = !DILocation(line: 391, column: 18, scope: !72, inlinedAt: !173) +!177 = !DILocation(line: 391, column: 49, scope: !72, inlinedAt: !173) +!178 = !DILocation(line: 395, column: 43, scope: !72, inlinedAt: !173) +!179 = !DILocation(line: 395, column: 63, scope: !72, inlinedAt: !173) +!180 = !DILocation(line: 397, column: 28, scope: !72, inlinedAt: !173) +!181 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !173) +!182 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !173) +!183 = !DILocation(line: 414, column: 19, scope: !72, inlinedAt: !173) +!184 = !DILocation(line: 415, column: 19, scope: !72, inlinedAt: !173) +!185 = !DILocation(line: 417, column: 19, scope: !72, inlinedAt: !173) +!186 = !DILocation(line: 459, column: 19, scope: !72, inlinedAt: !173) +!187 = !DILocation(line: 461, column: 14, scope: !72, inlinedAt: !173) +!188 = !DILocation(line: 524, column: 27, scope: !72, inlinedAt: !173) +!189 = !DILocation(line: 476, column: 79, scope: !72, inlinedAt: !173) +!190 = !DILocation(line: 525, column: 39, scope: !72, inlinedAt: !173) +!191 = !DILocation(line: 525, column: 21, scope: !72, inlinedAt: !173) +!192 = !DILocation(line: 530, column: 20, scope: !72, inlinedAt: !173) +!193 = !DILocation(line: 531, column: 19, scope: !72, inlinedAt: !173) +!194 = !DILocation(line: 531, column: 14, scope: !72, inlinedAt: !173) +!195 = !DILocation(line: 551, column: 15, scope: !72, inlinedAt: !173) +!196 = !DILocation(line: 538, column: 71, scope: !72, inlinedAt: !173) +!197 = !DILocation(line: 553, column: 21, scope: !72, inlinedAt: !173) +!198 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !173) +!199 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !173) +!200 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !173) +!201 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !173) +!202 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !173) +!203 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !173) +!204 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !173) +!205 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !173) +!206 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !173) +!207 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !173) +!208 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !173) +!209 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !173) +!210 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !173) +!211 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !173) +!212 = !DILocation(line: 414, column: 28, scope: !72, inlinedAt: !173) +!213 = !DILocation(line: 231, column: 24, scope: !5) +!214 = !DILocation(line: 231, column: 56, scope: !5) +!215 = !DILocation(line: 232, column: 14, scope: !5) +!216 = !DILocation(line: 236, column: 30, scope: !5) +!217 = !DILocation(line: 252, column: 25, scope: !5) +!218 = !DILocation(line: 253, column: 29, scope: !5) +!219 = !DILocation(line: 825, column: 38, scope: !72, inlinedAt: !220) +!220 = !DILocation(line: 256, column: 107, scope: !5) +!221 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !220) +!222 = !DILocation(line: 825, column: 56, scope: !72, inlinedAt: !220) +!223 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !220) +!224 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !220) +!225 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !220) +!226 = !DILocation(line: 825, column: 20, scope: !72, inlinedAt: !227) +!227 = !DILocation(line: 257, column: 107, scope: !5) +!228 = !DILocation(line: 825, column: 49, scope: !72, inlinedAt: !227) +!229 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !227) +!230 = !DILocation(line: 263, column: 32, scope: !5) +!231 = !DILocation(line: 266, column: 56, scope: !5) +!232 = !DILocation(line: 267, column: 59, scope: !5) +!233 = !DILocation(line: 269, column: 34, scope: !5) +!234 = !DILocation(line: 281, column: 58, scope: !5) +!235 = !DILocation(line: 281, column: 80, scope: !5) +!236 = !DILocation(line: 282, column: 53, scope: !5) +!237 = !DILocation(line: 282, column: 70, scope: !5) +!238 = !DILocation(line: 286, column: 32, scope: !5) +!239 = !DILocation(line: 287, column: 30, scope: !5) +!240 = !DILocation(line: 287, column: 43, scope: !5) +!241 = !DILocation(line: 288, column: 55, scope: !5) +!242 = !DILocation(line: 288, column: 42, scope: !5) +!243 = !DILocation(line: 290, column: 45, scope: !5) +!244 = !DILocation(line: 290, column: 32, scope: !5) +!245 = !DILocation(line: 601, column: 37, scope: !72, inlinedAt: !246) +!246 = !DILocation(line: 298, column: 16, scope: !5) +!247 = !DILocation(line: 602, column: 38, scope: !72, inlinedAt: !246) +!248 = !DILocation(line: 608, column: 42, scope: !72, inlinedAt: !246) +!249 = !DILocation(line: 41, column: 22, scope: !20, inlinedAt: !246) +!250 = !DILocation(line: 41, column: 28, scope: !20, inlinedAt: !246) +!251 = !DILocation(line: 608, column: 98, scope: !72, inlinedAt: !246) +!252 = !DILocation(line: 608, column: 61, scope: !72, inlinedAt: !246) +!253 = !DILocation(line: 798, column: 21, scope: !72, inlinedAt: !246) +!254 = !DILocation(line: 701, column: 35, scope: !72, inlinedAt: !246) +!255 = !DILocation(line: 610, column: 28, scope: !72, inlinedAt: !246) +!256 = !DILocation(line: 701, column: 24, scope: !72, inlinedAt: !246) +!257 = !DILocation(line: 710, column: 25, scope: !72, inlinedAt: !246) +!258 = !DILocation(line: 709, column: 25, scope: !72, inlinedAt: !246) +!259 = !DILocation(line: 712, column: 25, scope: !72, inlinedAt: !246) +!260 = !DILocation(line: 718, column: 39, scope: !72, inlinedAt: !246) +!261 = !DILocation(line: 719, column: 25, scope: !72, inlinedAt: !246) +!262 = !DILocation(line: 720, column: 24, scope: !72, inlinedAt: !246) +!263 = !DILocation(line: 721, column: 24, scope: !72, inlinedAt: !246) +!264 = !DILocation(line: 306, column: 41, scope: !5) +!265 = !DILocation(line: 307, column: 34, scope: !5) +!266 = !DILocation(line: 307, column: 47, scope: !5) +!267 = !DILocation(line: 308, column: 64, scope: !5) +!268 = !DILocation(line: 308, column: 46, scope: !5) +!269 = !DILocation(line: 310, column: 36, scope: !5) +!270 = !DILocation(line: 601, column: 37, scope: !72, inlinedAt: !271) +!271 = !DILocation(line: 318, column: 20, scope: !5) +!272 = !DILocation(line: 602, column: 38, scope: !72, inlinedAt: !271) +!273 = !DILocation(line: 608, column: 42, scope: !72, inlinedAt: !271) +!274 = !DILocation(line: 608, column: 61, scope: !72, inlinedAt: !271) +!275 = !DILocation(line: 676, column: 20, scope: !72, inlinedAt: !246) +!276 = !DILocation(line: 262, column: 30, scope: !5) +!277 = !DILocation(line: 263, column: 51, scope: !5) +!278 = !DILocation(line: 266, column: 44, scope: !5) +!279 = !DILocation(line: 266, column: 67, scope: !5) +!280 = !DILocation(line: 267, column: 36, scope: !5) +!281 = !DILocation(line: 267, column: 46, scope: !5) +!282 = !DILocation(line: 267, column: 70, scope: !5) +!283 = !DILocation(line: 269, column: 50, scope: !5) +!284 = !DILocation(line: 269, column: 60, scope: !5) +!285 = !DILocation(line: 271, column: 21, scope: !5) +!286 = !DILocation(line: 272, column: 23, scope: !5) +!287 = !DILocation(line: 275, column: 25, scope: !5) +!288 = !DILocation(line: 276, column: 29, scope: !5) +!289 = !DILocation(line: 601, column: 18, scope: !72, inlinedAt: !246) +!290 = !DILocation(line: 601, column: 49, scope: !72, inlinedAt: !246) +!291 = !DILocation(line: 602, column: 19, scope: !72, inlinedAt: !246) +!292 = !DILocation(line: 602, column: 51, scope: !72, inlinedAt: !246) +!293 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !246) +!294 = !DILocation(line: 674, column: 28, scope: !72, inlinedAt: !246) +!295 = !DILocation(line: 674, column: 22, scope: !72, inlinedAt: !246) +!296 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !246) +!297 = !DILocation(line: 748, column: 29, scope: !72, inlinedAt: !246) +!298 = !DILocation(line: 748, column: 21, scope: !72, inlinedAt: !246) +!299 = !DILocation(line: 626, column: 19, scope: !72, inlinedAt: !246) +!300 = !DILocation(line: 627, column: 19, scope: !72, inlinedAt: !246) +!301 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !246) +!302 = !DILocation(line: 675, column: 26, scope: !72, inlinedAt: !246) +!303 = !DILocation(line: 675, column: 46, scope: !72, inlinedAt: !246) +!304 = !DILocation(line: 678, column: 15, scope: !72, inlinedAt: !246) +!305 = !DILocation(line: 698, column: 25, scope: !72, inlinedAt: !246) +!306 = !DILocation(line: 703, column: 25, scope: !72, inlinedAt: !246) +!307 = !DILocation(line: 704, column: 24, scope: !72, inlinedAt: !246) +!308 = !DILocation(line: 706, column: 24, scope: !72, inlinedAt: !246) +!309 = !DILocation(line: 722, column: 24, scope: !72, inlinedAt: !246) +!310 = !DILocation(line: 723, column: 25, scope: !72, inlinedAt: !246) +!311 = !DILocation(line: 724, column: 25, scope: !72, inlinedAt: !246) +!312 = !DILocation(line: 726, column: 25, scope: !72, inlinedAt: !246) +!313 = !DILocation(line: 730, column: 25, scope: !72, inlinedAt: !246) +!314 = !DILocation(line: 727, column: 24, scope: !72, inlinedAt: !246) +!315 = !DILocation(line: 729, column: 39, scope: !72, inlinedAt: !246) +!316 = !DILocation(line: 731, column: 24, scope: !72, inlinedAt: !246) +!317 = !DILocation(line: 732, column: 24, scope: !72, inlinedAt: !246) +!318 = !DILocation(line: 736, column: 69, scope: !72, inlinedAt: !246) +!319 = !DILocation(line: 739, column: 27, scope: !72, inlinedAt: !246) +!320 = !DILocation(line: 740, column: 40, scope: !72, inlinedAt: !246) +!321 = !DILocation(line: 740, column: 22, scope: !72, inlinedAt: !246) +!322 = !DILocation(line: 744, column: 24, scope: !72, inlinedAt: !246) +!323 = !DILocation(line: 744, column: 43, scope: !72, inlinedAt: !246) +!324 = !DILocation(line: 750, column: 20, scope: !72, inlinedAt: !246) +!325 = !DILocation(line: 775, column: 43, scope: !72, inlinedAt: !246) +!326 = !DILocation(line: 628, column: 19, scope: !72, inlinedAt: !246) +!327 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !246) +!328 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !246) +!329 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !246) +!330 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !246) +!331 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !246) +!332 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !246) +!333 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !246) +!334 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !246) +!335 = !DILocation(line: 751, column: 22, scope: !72, inlinedAt: !246) +!336 = !DILocation(line: 751, column: 16, scope: !72, inlinedAt: !246) +!337 = !DILocation(line: 775, column: 24, scope: !72, inlinedAt: !246) +!338 = !DILocation(line: 773, column: 45, scope: !72, inlinedAt: !246) +!339 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !246) +!340 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !246) +!341 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !246) +!342 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !246) +!343 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !246) +!344 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !246) +!345 = !DILocation(line: 626, column: 28, scope: !72, inlinedAt: !246) +!346 = !DILocation(line: 627, column: 28, scope: !72, inlinedAt: !246) +!347 = !DILocation(line: 674, column: 52, scope: !72, inlinedAt: !246) +!348 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !246) +!349 = !DILocation(line: 601, column: 18, scope: !72, inlinedAt: !271) +!350 = !DILocation(line: 601, column: 49, scope: !72, inlinedAt: !271) +!351 = !DILocation(line: 602, column: 19, scope: !72, inlinedAt: !271) +!352 = !DILocation(line: 602, column: 51, scope: !72, inlinedAt: !271) +!353 = !DILocation(line: 831, column: 23, scope: !72, inlinedAt: !271) +!354 = !DILocation(line: 674, column: 28, scope: !72, inlinedAt: !271) +!355 = !DILocation(line: 674, column: 22, scope: !72, inlinedAt: !271) +!356 = !DILocation(line: 833, column: 23, scope: !72, inlinedAt: !271) +!357 = !DILocation(line: 748, column: 29, scope: !72, inlinedAt: !271) +!358 = !DILocation(line: 748, column: 21, scope: !72, inlinedAt: !271) +!359 = !DILocation(line: 626, column: 19, scope: !72, inlinedAt: !271) +!360 = !DILocation(line: 627, column: 19, scope: !72, inlinedAt: !271) +!361 = !DILocation(line: 610, column: 28, scope: !72, inlinedAt: !271) +!362 = !DILocation(line: 831, column: 52, scope: !72, inlinedAt: !271) +!363 = !DILocation(line: 675, column: 26, scope: !72, inlinedAt: !271) +!364 = !DILocation(line: 675, column: 46, scope: !72, inlinedAt: !271) +!365 = !DILocation(line: 676, column: 20, scope: !72, inlinedAt: !271) +!366 = !DILocation(line: 678, column: 15, scope: !72, inlinedAt: !271) +!367 = !DILocation(line: 739, column: 27, scope: !72, inlinedAt: !271) +!368 = !DILocation(line: 692, column: 78, scope: !72, inlinedAt: !271) +!369 = !DILocation(line: 740, column: 40, scope: !72, inlinedAt: !271) +!370 = !DILocation(line: 740, column: 22, scope: !72, inlinedAt: !271) +!371 = !DILocation(line: 744, column: 24, scope: !72, inlinedAt: !271) +!372 = !DILocation(line: 744, column: 43, scope: !72, inlinedAt: !271) +!373 = !DILocation(line: 750, column: 20, scope: !72, inlinedAt: !271) +!374 = !DILocation(line: 751, column: 22, scope: !72, inlinedAt: !271) +!375 = !DILocation(line: 751, column: 16, scope: !72, inlinedAt: !271) +!376 = !DILocation(line: 775, column: 24, scope: !72, inlinedAt: !271) +!377 = !DILocation(line: 759, column: 70, scope: !72, inlinedAt: !271) +!378 = !DILocation(line: 775, column: 43, scope: !72, inlinedAt: !271) +!379 = !DILocation(line: 788, column: 33, scope: !72, inlinedAt: !271) +!380 = !DILocation(line: 789, column: 38, scope: !72, inlinedAt: !271) +!381 = !DILocation(line: 789, column: 24, scope: !72, inlinedAt: !271) +!382 = !DILocation(line: 790, column: 109, scope: !72, inlinedAt: !271) +!383 = !DILocation(line: 790, column: 113, scope: !72, inlinedAt: !271) +!384 = !DILocation(line: 790, column: 55, scope: !72, inlinedAt: !271) +!385 = !DILocation(line: 790, column: 25, scope: !72, inlinedAt: !271) +!386 = !DILocation(line: 791, column: 35, scope: !72, inlinedAt: !271) +!387 = !DILocation(line: 792, column: 34, scope: !72, inlinedAt: !271) +!388 = !DILocation(line: 792, column: 48, scope: !72, inlinedAt: !271) +!389 = !DILocation(line: 792, column: 63, scope: !72, inlinedAt: !271) +!390 = !DILocation(line: 793, column: 29, scope: !72, inlinedAt: !271) +!391 = !DILocation(line: 793, column: 61, scope: !72, inlinedAt: !271) +!392 = !DILocation(line: 793, column: 42, scope: !72, inlinedAt: !271) +!393 = !DILocation(line: 626, column: 28, scope: !72, inlinedAt: !271) +!394 = !DILocation(line: 627, column: 28, scope: !72, inlinedAt: !271) +!395 = !DILocation(line: 628, column: 19, scope: !72, inlinedAt: !271) +!396 = !DILocation(line: 674, column: 52, scope: !72, inlinedAt: !271) +!397 = !DILocation(line: 833, column: 52, scope: !72, inlinedAt: !271) +!398 = !DILocation(line: 323, column: 23, scope: !5) +!399 = !DILocation(line: 323, column: 55, scope: !5) +!400 = !DILocation(line: 332, column: 30, scope: !5) +!401 = !DILocation(line: 334, column: 14, scope: !5) +!402 = !DILocation(line: 344, column: 27, scope: !5) +!403 = !DILocation(line: 344, column: 45, scope: !5) +!404 = !DILocation(line: 344, column: 53, scope: !5) +!405 = !DILocation(line: 344, column: 41, scope: !5) +!406 = !DILocation(line: 344, column: 64, scope: !5) +!407 = !DILocation(line: 344, column: 71, scope: !5) +!408 = !DILocation(line: 344, column: 59, scope: !5) +!409 = !DILocation(line: 345, column: 29, scope: !5) +!410 = !DILocation(line: 345, column: 69, scope: !5) +!411 = !DILocation(line: 139, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..14c2241b63c28e4b7c054d2a2e52fa39cd4a9406 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ptx @@ -0,0 +1,9592 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_zeros_1 // -- Begin function triton_tem_fused_zeros_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_zeros_1 +.visible .entry triton_tem_fused_zeros_1( + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_14, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_15, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_17, + .param .u32 triton_tem_fused_zeros_1_param_18, + .param .u32 triton_tem_fused_zeros_1_param_19, + .param .u32 triton_tem_fused_zeros_1_param_20, + .param .u32 triton_tem_fused_zeros_1_param_21, + .param .u32 triton_tem_fused_zeros_1_param_22, + .param .u32 triton_tem_fused_zeros_1_param_23, + .param .u32 triton_tem_fused_zeros_1_param_24, + .param .u32 triton_tem_fused_zeros_1_param_25, + .param .u32 triton_tem_fused_zeros_1_param_26, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_27, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_28 +) +.reqntid 256 +{ + .reg .pred %p<1179>; + .reg .b16 %rs<257>; + .reg .b32 %r<15500>; + .reg .b64 %rd<1210>; + .loc 1 18 0 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:18:0 + +// %bb.0: + ld.param.b32 %r2366, [triton_tem_fused_zeros_1_param_26]; + ld.param.b32 %r2359, [triton_tem_fused_zeros_1_param_19]; + ld.param.b32 %r2358, [triton_tem_fused_zeros_1_param_18]; + ld.param.b64 %rd207, [triton_tem_fused_zeros_1_param_16]; + ld.param.b64 %rd197, [triton_tem_fused_zeros_1_param_5]; + ld.param.b64 %rd196, [triton_tem_fused_zeros_1_param_4]; + ld.param.b64 %rd195, [triton_tem_fused_zeros_1_param_3]; + ld.param.b64 %rd194, [triton_tem_fused_zeros_1_param_0]; +$L__tmp0: + .loc 1 94 54 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:94:54 + shl.b32 %r1, %r2358, 12; + ld.param.b64 %rd209, [triton_tem_fused_zeros_1_param_1]; + .loc 1 95 54 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:95:54 + shl.b32 %r2367, %r2359, 10; + ld.param.b64 %rd210, [triton_tem_fused_zeros_1_param_2]; + .loc 1 97 74 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:74 + setp.lt.s32 %p11, %r2358, 2; + .loc 1 97 66 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:66 + selp.b32 %r2368, 1, 0, %p11; + .loc 1 97 100 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:100 + setp.gt.s32 %p12, %r2358, 1; + .loc 1 97 91 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:91 + selp.b32 %r2369, %r2358, 0, %p12; + .loc 1 97 82 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:82 + add.s32 %r2370, %r2369, %r2368; + .loc 1 97 59 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:59 + shl.b32 %r2, %r2370, 12; + .loc 1 97 111 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:97:111 + shl.b32 %r3, %r2370, 7; + .loc 1 111 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:111:24 + mov.u32 %r4, %ctaid.x; +$L__tmp1: + .loc 2 41 22 // standard.py:41:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:112:36 ] + add.s32 %r2371, %r2359, 127; + .loc 2 41 28 // standard.py:41:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:112:36 ] + shr.s32 %r2372, %r2371, 31; + shr.u32 %r2373, %r2372, 25; + add.s32 %r2374, %r2371, %r2373; + shr.s32 %r5, %r2374, 7; +$L__tmp2: + .loc 1 115 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:115:27 + mov.u32 %r6, %ctaid.y; + .loc 1 116 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:116:28 + mov.u32 %r7, %ctaid.z; + .loc 1 117 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:117:23 + and.b32 %r8, %r6, 1; + .loc 1 124 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:124:25 + mul.lo.s32 %r2375, %r2359, %r7; + shl.b32 %r2376, %r2375, 7; + .loc 1 124 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:124:35 + mad.lo.s32 %r2377, %r2367, %r8, %r2376; + .loc 1 131 9 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:131:9 + mul.wide.s32 %rd212, %r2377, 2; + add.s64 %rd1, %rd209, %rd212; + .loc 1 132 9 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:132:9 + add.s64 %rd2, %rd210, %rd212; + .loc 1 136 26 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:136:26 + mov.u32 %r10, %tid.x; + shr.u32 %r11, %r10, 5; + and.b32 %r12, %r10, 240; + bfe.u32 %r13, %r10, 4, 4; + or.b32 %r14, %r13, 16; + or.b32 %r15, %r13, 32; + or.b32 %r16, %r13, 48; + or.b32 %r17, %r13, 64; + or.b32 %r18, %r13, 80; + or.b32 %r19, %r13, 96; + or.b32 %r20, %r13, 112; + shr.u32 %r2378, %r10, 1; + and.b32 %r2379, %r2378, 112; + bfe.u32 %r2380, %r10, 2, 3; + or.b32 %r21, %r2379, %r2380; + or.b32 %r22, %r21, 8; + .loc 1 139 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:139:14 + setp.lt.s32 %p13, %r4, %r5; + .loc 1 139 7 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:139:7 + @%p13 bra $L__BB0_8; + bra.uni $L__BB0_1; +$L__BB0_8: + .loc 1 0 7 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:7 + ld.param.b32 %r2365, [triton_tem_fused_zeros_1_param_25]; + ld.param.b32 %r2364, [triton_tem_fused_zeros_1_param_24]; + ld.param.b32 %r2363, [triton_tem_fused_zeros_1_param_23]; + ld.param.b64 %rd208, [triton_tem_fused_zeros_1_param_17]; + ld.param.b64 %rd206, [triton_tem_fused_zeros_1_param_15]; + ld.param.b64 %rd205, [triton_tem_fused_zeros_1_param_14]; + ld.param.b64 %rd202, [triton_tem_fused_zeros_1_param_11]; + ld.param.b64 %rd201, [triton_tem_fused_zeros_1_param_10]; + ld.param.b64 %rd211, [triton_tem_fused_zeros_1_param_7]; + mad.lo.s32 %r9, %r2367, %r6, %r2376; + mad.wide.s32 %rd3, %r9, 2, %rd211; + .loc 1 252 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:252:25 + shl.b32 %r7565, %r4, 7; + .loc 1 253 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:253:29 + or.b32 %r678, %r13, %r7565; + or.b32 %r679, %r14, %r7565; + or.b32 %r680, %r15, %r7565; + or.b32 %r681, %r16, %r7565; + or.b32 %r682, %r17, %r7565; + or.b32 %r683, %r18, %r7565; + or.b32 %r684, %r19, %r7565; + or.b32 %r685, %r20, %r7565; +$L__tmp3: + .loc 1 825 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + shl.b32 %r7566, %r678, 7; + shl.b32 %r7567, %r679, 7; + shl.b32 %r7568, %r680, 7; + shl.b32 %r7569, %r681, 7; + shl.b32 %r7570, %r682, 7; + shl.b32 %r7571, %r683, 7; + shl.b32 %r7572, %r684, 7; + shl.b32 %r7573, %r685, 7; + .loc 1 825 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + cvt.s64.s32 %rd71, %r7566; + mul.wide.s32 %rd566, %r7566, 2; + add.s64 %rd567, %rd1, %rd566; + cvt.s64.s32 %rd72, %r7567; + mul.wide.s32 %rd568, %r7567, 2; + add.s64 %rd569, %rd1, %rd568; + cvt.s64.s32 %rd73, %r7568; + mul.wide.s32 %rd570, %r7568, 2; + add.s64 %rd571, %rd1, %rd570; + cvt.s64.s32 %rd74, %r7569; + mul.wide.s32 %rd572, %r7569, 2; + add.s64 %rd573, %rd1, %rd572; + cvt.s64.s32 %rd75, %r7570; + mul.wide.s32 %rd574, %r7570, 2; + add.s64 %rd575, %rd1, %rd574; + cvt.s64.s32 %rd76, %r7571; + mul.wide.s32 %rd576, %r7571, 2; + add.s64 %rd577, %rd1, %rd576; + cvt.s64.s32 %rd77, %r7572; + mul.wide.s32 %rd578, %r7572, 2; + add.s64 %rd579, %rd1, %rd578; + cvt.s64.s32 %rd78, %r7573; + mul.wide.s32 %rd580, %r7573, 2; + add.s64 %rd581, %rd1, %rd580; + .loc 1 825 56 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:56 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + shl.b32 %r7574, %r10, 3; + and.b32 %r7575, %r7574, 120; + .loc 1 825 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + cvt.u64.u32 %rd79, %r7575; + mul.wide.u32 %rd582, %r7575, 2; + add.s64 %rd543, %rd567, %rd582; + add.s64 %rd544, %rd569, %rd582; + add.s64 %rd545, %rd571, %rd582; + add.s64 %rd546, %rd573, %rd582; + add.s64 %rd547, %rd575, %rd582; + add.s64 %rd548, %rd577, %rd582; + add.s64 %rd549, %rd579, %rd582; + add.s64 %rd550, %rd581, %rd582; + .loc 1 833 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + setp.lt.s32 %p1163, %r678, %r2359; + setp.lt.s32 %p1164, %r679, %r2359; + setp.lt.s32 %p1165, %r680, %r2359; + setp.lt.s32 %p1166, %r681, %r2359; + setp.lt.s32 %p1167, %r682, %r2359; + setp.lt.s32 %p1168, %r683, %r2359; + setp.lt.s32 %p1169, %r684, %r2359; + setp.lt.s32 %p1170, %r685, %r2359; + mov.b32 %r7436, 0; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:256:107 ] + // begin inline asm + mov.u32 %r7432, %r7436; + mov.u32 %r7433, %r7436; + mov.u32 %r7434, %r7436; + mov.u32 %r7435, %r7436; + @%p1163 ld.global.v4.b32 { %r7432, %r7433, %r7434, %r7435 }, [ %rd543 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7440, %r7436; + mov.u32 %r7441, %r7436; + mov.u32 %r7442, %r7436; + mov.u32 %r7443, %r7436; + @%p1164 ld.global.v4.b32 { %r7440, %r7441, %r7442, %r7443 }, [ %rd544 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7448, %r7436; + mov.u32 %r7449, %r7436; + mov.u32 %r7450, %r7436; + mov.u32 %r7451, %r7436; + @%p1165 ld.global.v4.b32 { %r7448, %r7449, %r7450, %r7451 }, [ %rd545 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7456, %r7436; + mov.u32 %r7457, %r7436; + mov.u32 %r7458, %r7436; + mov.u32 %r7459, %r7436; + @%p1166 ld.global.v4.b32 { %r7456, %r7457, %r7458, %r7459 }, [ %rd546 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7464, %r7436; + mov.u32 %r7465, %r7436; + mov.u32 %r7466, %r7436; + mov.u32 %r7467, %r7436; + @%p1167 ld.global.v4.b32 { %r7464, %r7465, %r7466, %r7467 }, [ %rd547 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7472, %r7436; + mov.u32 %r7473, %r7436; + mov.u32 %r7474, %r7436; + mov.u32 %r7475, %r7436; + @%p1168 ld.global.v4.b32 { %r7472, %r7473, %r7474, %r7475 }, [ %rd548 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7480, %r7436; + mov.u32 %r7481, %r7436; + mov.u32 %r7482, %r7436; + mov.u32 %r7483, %r7436; + @%p1169 ld.global.v4.b32 { %r7480, %r7481, %r7482, %r7483 }, [ %rd549 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7488, %r7436; + mov.u32 %r7489, %r7436; + mov.u32 %r7490, %r7436; + mov.u32 %r7491, %r7436; + @%p1170 ld.global.v4.b32 { %r7488, %r7489, %r7490, %r7491 }, [ %rd550 + 0 ]; + // end inline asm + shl.b32 %r7576, %r10, 4; + and.b32 %r7577, %r7576, 112; + shl.b32 %r7578, %r12, 3; + and.b32 %r7579, %r10, 112; + and.b32 %r7580, %r10, 8; + shl.b32 %r7581, %r7580, 11; + or.b32 %r7582, %r7577, %r7578; + xor.b32 %r7583, %r7582, %r7579; + or.b32 %r7584, %r7583, %r7581; + mov.b32 %r7585, global_smem; + add.s32 %r7586, %r7585, %r7584; + st.shared.v4.b32 [%r7586+99328], {%r7432, %r7433, %r7434, %r7435}; + st.shared.v4.b32 [%r7586+101376], {%r7440, %r7441, %r7442, %r7443}; + st.shared.v4.b32 [%r7586+103424], {%r7448, %r7449, %r7450, %r7451}; + st.shared.v4.b32 [%r7586+105472], {%r7456, %r7457, %r7458, %r7459}; + st.shared.v4.b32 [%r7586+107520], {%r7464, %r7465, %r7466, %r7467}; + st.shared.v4.b32 [%r7586+109568], {%r7472, %r7473, %r7474, %r7475}; + st.shared.v4.b32 [%r7586+111616], {%r7480, %r7481, %r7482, %r7483}; + st.shared.v4.b32 [%r7586+113664], {%r7488, %r7489, %r7490, %r7491}; +$L__tmp4: + .loc 1 825 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:257:107 ] + add.s64 %rd583, %rd2, %rd566; + add.s64 %rd584, %rd2, %rd568; + add.s64 %rd585, %rd2, %rd570; + add.s64 %rd586, %rd2, %rd572; + add.s64 %rd587, %rd2, %rd574; + add.s64 %rd588, %rd2, %rd576; + add.s64 %rd589, %rd2, %rd578; + add.s64 %rd590, %rd2, %rd580; + .loc 1 825 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:257:107 ] + add.s64 %rd551, %rd583, %rd582; + add.s64 %rd552, %rd584, %rd582; + add.s64 %rd553, %rd585, %rd582; + add.s64 %rd554, %rd586, %rd582; + add.s64 %rd555, %rd587, %rd582; + add.s64 %rd556, %rd588, %rd582; + add.s64 %rd557, %rd589, %rd582; + add.s64 %rd558, %rd590, %rd582; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:257:107 ] + // begin inline asm + mov.u32 %r7496, %r7436; + mov.u32 %r7497, %r7436; + mov.u32 %r7498, %r7436; + mov.u32 %r7499, %r7436; + @%p1163 ld.global.v4.b32 { %r7496, %r7497, %r7498, %r7499 }, [ %rd551 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7504, %r7436; + mov.u32 %r7505, %r7436; + mov.u32 %r7506, %r7436; + mov.u32 %r7507, %r7436; + @%p1164 ld.global.v4.b32 { %r7504, %r7505, %r7506, %r7507 }, [ %rd552 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7512, %r7436; + mov.u32 %r7513, %r7436; + mov.u32 %r7514, %r7436; + mov.u32 %r7515, %r7436; + @%p1165 ld.global.v4.b32 { %r7512, %r7513, %r7514, %r7515 }, [ %rd553 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7520, %r7436; + mov.u32 %r7521, %r7436; + mov.u32 %r7522, %r7436; + mov.u32 %r7523, %r7436; + @%p1166 ld.global.v4.b32 { %r7520, %r7521, %r7522, %r7523 }, [ %rd554 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7528, %r7436; + mov.u32 %r7529, %r7436; + mov.u32 %r7530, %r7436; + mov.u32 %r7531, %r7436; + @%p1167 ld.global.v4.b32 { %r7528, %r7529, %r7530, %r7531 }, [ %rd555 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7536, %r7436; + mov.u32 %r7537, %r7436; + mov.u32 %r7538, %r7436; + mov.u32 %r7539, %r7436; + @%p1168 ld.global.v4.b32 { %r7536, %r7537, %r7538, %r7539 }, [ %rd556 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7544, %r7436; + mov.u32 %r7545, %r7436; + mov.u32 %r7546, %r7436; + mov.u32 %r7547, %r7436; + @%p1169 ld.global.v4.b32 { %r7544, %r7545, %r7546, %r7547 }, [ %rd557 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7552, %r7436; + mov.u32 %r7553, %r7436; + mov.u32 %r7554, %r7436; + mov.u32 %r7555, %r7436; + @%p1170 ld.global.v4.b32 { %r7552, %r7553, %r7554, %r7555 }, [ %rd558 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r7586+132096], {%r7496, %r7497, %r7498, %r7499}; + st.shared.v4.b32 [%r7586+134144], {%r7504, %r7505, %r7506, %r7507}; + st.shared.v4.b32 [%r7586+136192], {%r7512, %r7513, %r7514, %r7515}; + st.shared.v4.b32 [%r7586+138240], {%r7520, %r7521, %r7522, %r7523}; + st.shared.v4.b32 [%r7586+140288], {%r7528, %r7529, %r7530, %r7531}; + st.shared.v4.b32 [%r7586+142336], {%r7536, %r7537, %r7538, %r7539}; + st.shared.v4.b32 [%r7586+144384], {%r7544, %r7545, %r7546, %r7547}; + st.shared.v4.b32 [%r7586+146432], {%r7552, %r7553, %r7554, %r7555}; +$L__tmp5: + .loc 1 266 56 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:266:56 + mul.lo.s32 %r686, %r1, %r6; + .loc 1 267 59 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:267:59 + mul.lo.s32 %r687, %r2, %r6; + .loc 1 269 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:269:34 + shl.b32 %r688, %r6, 5; + .loc 1 281 80 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:281:80 + mad.lo.s32 %r7587, %r2363, %r8, %r4; + .loc 1 282 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:282:70 + mad.lo.s32 %r7588, %r2365, %r8, %r4; + mul.lo.s32 %r7589, %r7588, %r2364; + .loc 1 286 32 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:286:32 + mul.wide.s32 %rd591, %r7589, 4; + add.s64 %rd559, %rd202, %rd591; + mov.pred %p460, -1; + .loc 1 287 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:287:30 + // begin inline asm + mov.u32 %r7560, 0x0; + @%p460 ld.global.b32 { %r7560 }, [ %rd559 + 0 ]; + // end inline asm + .loc 1 287 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:287:43 + shl.b32 %r689, %r7560, 7; + .loc 1 288 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:288:55 + mul.wide.s32 %rd592, %r7587, 4; + add.s64 %rd560, %rd201, %rd592; + .loc 1 288 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:288:42 + // begin inline asm + mov.u32 %r7561, 0x0; + @%p460 ld.global.b32 { %r7561 }, [ %rd560 + 0 ]; + // end inline asm + .loc 1 290 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:45 + and.b32 %r691, %r10, 3; + shl.b32 %r692, %r691, 1; + or.b32 %r7590, %r692, 1; + or.b32 %r7591, %r692, 8; + or.b32 %r7592, %r692, 9; + .loc 1 290 32 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:32 + or.b32 %r693, %r689, %r7591; + or.b32 %r694, %r689, %r7592; + .loc 1 290 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:45 + or.b32 %r7593, %r692, 16; + or.b32 %r7594, %r692, 17; + or.b32 %r7595, %r692, 24; + or.b32 %r7596, %r692, 25; + .loc 1 290 32 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:32 + or.b32 %r757, %r689, %r7596; + or.b32 %r756, %r689, %r7595; + or.b32 %r755, %r689, %r7594; + or.b32 %r754, %r689, %r7593; + or.b32 %r7597, %r689, %r13; + or.b32 %r7598, %r689, %r14; + or.b32 %r7599, %r689, %r15; + or.b32 %r7600, %r689, %r16; +$L__tmp6: + .loc 1 601 37 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:37 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r7601, %r7597, 12; + shl.b32 %r7602, %r7598, 12; + shl.b32 %r7603, %r7599, 12; + shl.b32 %r7604, %r7600, 12; + .loc 1 602 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r7605, %r7597, 7; + shl.b32 %r7606, %r7598, 7; + shl.b32 %r7607, %r7599, 7; + shl.b32 %r7608, %r7600, 7; + .loc 1 608 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:608:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r699, %r7561, 1; + .loc 2 41 22 // standard.py:41:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r7609, %r2358, 63; + .loc 2 41 28 // standard.py:41:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.s32 %r7610, %r7609, 31; + shr.u32 %r7611, %r7610, 26; + add.s32 %r7612, %r7609, %r7611; + shr.s32 %r7613, %r7612, 6; + .loc 1 608 98 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:608:98 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + max.s32 %r7614, %r7613, 1; + .loc 1 608 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:608:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + min.s32 %r7615, %r699, %r7614; +$L__tmp7: + .loc 1 253 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:253:29 + or.b32 %r7616, %r21, %r7565; + or.b32 %r7617, %r22, %r7565; + .loc 1 290 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:45 + or.b32 %r7618, %r692, 32; + or.b32 %r7619, %r692, 33; + or.b32 %r7620, %r692, 40; + or.b32 %r7621, %r692, 41; + or.b32 %r7622, %r692, 48; + or.b32 %r7623, %r692, 49; + or.b32 %r7624, %r692, 56; + or.b32 %r7625, %r692, 57; + .loc 1 290 32 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:32 + or.b32 %r765, %r689, %r7625; + or.b32 %r764, %r689, %r7624; + or.b32 %r763, %r689, %r7623; + or.b32 %r762, %r689, %r7622; + or.b32 %r761, %r689, %r7621; + or.b32 %r760, %r689, %r7620; + or.b32 %r759, %r689, %r7619; + or.b32 %r758, %r689, %r7618; +$L__tmp8: + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r725, %r7617, %r2359; + rem.s32 %r980, %r7616, %r2359; + .loc 1 701 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:701:35 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mad.wide.u32 %rd562, %r6, 8, %rd207; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.gt.s32 %p462, %r699, 0; + .loc 1 701 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:701:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + mov.u64 %rd561, 0x0; + @%p462 ld.global.b64 { %rd561 }, [ %rd562 + 0 ]; + // end inline asm + .loc 1 709 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:709:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ge.s32 %p465, %r980, %r2366; + setp.ge.s32 %p466, %r725, %r2366; + .loc 1 710 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:710:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r7626, %r980, %r2366; + rem.s32 %r7627, %r725, %r2366; + .loc 1 712 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:712:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.eq.b32 %p467, %r7626, 0; + setp.eq.b32 %p468, %r7627, 0; + .loc 1 718 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:718:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + min.s32 %r7628, %r2366, 0; + selp.b32 %r7629, 0, %r7628, %p467; + add.s32 %r7630, %r7629, %r7626; + selp.b32 %r7631, 0, %r7628, %p468; + add.s32 %r7632, %r7631, %r7627; + .loc 1 719 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:719:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.s64.s32 %rd593, %r7630; + cvt.s64.s32 %rd594, %r7632; + .loc 1 720 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:720:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.gt.s64 %p469, %rd561, %rd593; + setp.gt.s64 %p470, %rd561, %rd594; + .loc 1 721 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:721:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p9, %p465, %p469; + and.pred %p7, %p466, %p470; +$L__tmp9: + .loc 1 306 41 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:306:41 + add.s64 %rd563, %rd206, %rd591; + .loc 1 307 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:307:34 + // begin inline asm + mov.u32 %r7562, 0x0; + @%p460 ld.global.b32 { %r7562 }, [ %rd563 + 0 ]; + // end inline asm + .loc 1 307 47 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:307:47 + shl.b32 %r726, %r7562, 7; + .loc 1 308 64 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:308:64 + add.s64 %rd564, %rd205, %rd592; + .loc 1 308 46 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:308:46 + // begin inline asm + mov.u32 %r7563, 0x0; + @%p460 ld.global.b32 { %r7563 }, [ %rd564 + 0 ]; + // end inline asm + .loc 1 310 36 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:310:36 + or.b32 %r728, %r726, %r692; + or.b32 %r729, %r726, %r7590; + or.b32 %r730, %r726, %r7591; + or.b32 %r731, %r726, %r7592; + or.b32 %r732, %r726, %r7593; + or.b32 %r733, %r726, %r7594; + or.b32 %r734, %r726, %r7595; + or.b32 %r735, %r726, %r7596; + or.b32 %r736, %r726, %r7618; + or.b32 %r737, %r726, %r7619; + or.b32 %r738, %r726, %r7620; + or.b32 %r739, %r726, %r7621; + or.b32 %r740, %r726, %r7622; + or.b32 %r741, %r726, %r7623; + or.b32 %r742, %r726, %r7624; + or.b32 %r743, %r726, %r7625; + or.b32 %r7633, %r726, %r13; + or.b32 %r7634, %r726, %r14; + or.b32 %r7635, %r726, %r15; + or.b32 %r7636, %r726, %r16; +$L__tmp10: + .loc 1 601 37 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:37 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r7637, %r7633, 12; + shl.b32 %r7638, %r7634, 12; + shl.b32 %r7639, %r7635, 12; + shl.b32 %r7640, %r7636, 12; + .loc 1 602 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r7641, %r7633, 7; + shl.b32 %r7642, %r7634, 7; + shl.b32 %r7643, %r7635, 7; + shl.b32 %r7644, %r7636, 7; + .loc 1 608 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:608:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r744, %r7563, 1; + .loc 1 608 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:608:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + min.s32 %r7645, %r744, %r7614; +$L__tmp11: + .loc 1 676 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:676:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + cvt.s64.s32 %rd83, %r7601; + cvt.s64.s32 %rd84, %r7602; + cvt.s64.s32 %rd85, %r7603; + cvt.s64.s32 %rd86, %r7604; + cvt.s64.s32 %rd87, %r7605; + cvt.s64.s32 %rd88, %r7606; + cvt.s64.s32 %rd89, %r7607; + cvt.s64.s32 %rd90, %r7608; + setp.lt.s32 %p471, %r7597, %r2358; + setp.lt.s32 %p472, %r7598, %r2358; + setp.lt.s32 %p473, %r7599, %r2358; + setp.lt.s32 %p474, %r7600, %r2358; + shl.b32 %r7646, %r7580, 10; + or.b32 %r745, %r7583, %r7646; + add.s32 %r11058, %r7585, %r745; + selp.b32 %r7647, 16, 0, %p471; + selp.b32 %r7729, %r7647, 0, %p462; + add.s32 %r11060, %r11058, 2048; + selp.b32 %r7648, 16, 0, %p472; + selp.b32 %r7731, %r7648, 0, %p462; + add.s32 %r11062, %r11058, 4096; + selp.b32 %r7649, 16, 0, %p473; + selp.b32 %r7733, %r7649, 0, %p462; + add.s32 %r11064, %r11058, 6144; + selp.b32 %r7650, 16, 0, %p474; + selp.b32 %r7735, %r7650, 0, %p462; + setp.lt.s32 %p475, %r693, %r2358; + setp.lt.s32 %p476, %r694, %r2358; + setp.lt.s32 %p477, %r754, %r2358; + setp.lt.s32 %p478, %r755, %r2358; + setp.lt.s32 %p479, %r756, %r2358; + setp.lt.s32 %p480, %r757, %r2358; + setp.lt.s32 %p481, %r758, %r2358; + setp.lt.s32 %p482, %r759, %r2358; + setp.lt.s32 %p483, %r760, %r2358; + setp.lt.s32 %p484, %r761, %r2358; + setp.lt.s32 %p485, %r762, %r2358; + setp.lt.s32 %p486, %r763, %r2358; + setp.lt.s32 %p487, %r764, %r2358; + setp.lt.s32 %p488, %r765, %r2358; + cvt.s64.s32 %rd91, %r754; + cvt.s64.s32 %rd92, %r755; + cvt.s64.s32 %rd93, %r756; + cvt.s64.s32 %rd94, %r757; + cvt.s64.s32 %rd95, %r758; + cvt.s64.s32 %rd96, %r759; + cvt.s64.s32 %rd97, %r760; + cvt.s64.s32 %rd98, %r761; + cvt.s64.s32 %rd99, %r762; + cvt.s64.s32 %rd100, %r763; + cvt.s64.s32 %rd101, %r764; + cvt.s64.s32 %rd102, %r765; + and.b32 %r766, %r10, 252; + shl.b32 %r767, %r691, 3; + add.s32 %r7651, %r7585, %r767; + add.s32 %r11066, %r7651, 98304; + add.s32 %r11068, %r7651, 98308; + add.s32 %r11070, %r7651, 98336; + selp.b32 %r7652, 4, 0, %p475; + selp.b32 %r7741, %r7652, 0, %p462; + add.s32 %r11072, %r7651, 98340; + selp.b32 %r7653, 4, 0, %p476; + selp.b32 %r7743, %r7653, 0, %p462; + add.s32 %r11074, %r7651, 98368; + selp.b32 %r7654, 4, 0, %p477; + selp.b32 %r7745, %r7654, 0, %p462; + add.s32 %r11076, %r7651, 98372; + selp.b32 %r7655, 4, 0, %p478; + selp.b32 %r7747, %r7655, 0, %p462; + add.s32 %r11078, %r7651, 98400; + selp.b32 %r7656, 4, 0, %p479; + selp.b32 %r7749, %r7656, 0, %p462; + add.s32 %r11080, %r7651, 98404; + selp.b32 %r7657, 4, 0, %p480; + selp.b32 %r7751, %r7657, 0, %p462; + add.s32 %r11082, %r7651, 98432; + selp.b32 %r7658, 4, 0, %p481; + selp.b32 %r7753, %r7658, 0, %p462; + add.s32 %r11084, %r7651, 98436; + selp.b32 %r7659, 4, 0, %p482; + selp.b32 %r7755, %r7659, 0, %p462; + add.s32 %r11086, %r7651, 98464; + selp.b32 %r7660, 4, 0, %p483; + selp.b32 %r7757, %r7660, 0, %p462; + add.s32 %r11088, %r7651, 98468; + selp.b32 %r7661, 4, 0, %p484; + selp.b32 %r7759, %r7661, 0, %p462; + add.s32 %r11090, %r7651, 98496; + selp.b32 %r7662, 4, 0, %p485; + selp.b32 %r7761, %r7662, 0, %p462; + add.s32 %r11092, %r7651, 98500; + selp.b32 %r7663, 4, 0, %p486; + selp.b32 %r7763, %r7663, 0, %p462; + add.s32 %r11094, %r7651, 98528; + selp.b32 %r7664, 4, 0, %p487; + selp.b32 %r7765, %r7664, 0, %p462; + add.s32 %r11096, %r7651, 98532; + selp.b32 %r7665, 4, 0, %p488; + selp.b32 %r7767, %r7665, 0, %p462; + add.s32 %r11098, %r11058, 49152; + add.s32 %r11100, %r11058, 51200; + add.s32 %r11102, %r11058, 53248; + add.s32 %r11104, %r11058, 55296; + add.s32 %r11106, %r7651, 98816; + add.s32 %r11108, %r7651, 98820; + add.s32 %r11110, %r7651, 98848; + add.s32 %r11112, %r7651, 98852; + add.s32 %r11114, %r7651, 98880; + add.s32 %r11116, %r7651, 98884; + add.s32 %r11118, %r7651, 98912; + add.s32 %r11120, %r7651, 98916; + add.s32 %r11122, %r7651, 98944; + add.s32 %r11124, %r7651, 98948; + add.s32 %r11126, %r7651, 98976; + add.s32 %r11128, %r7651, 98980; + add.s32 %r11130, %r7651, 99008; + add.s32 %r11132, %r7651, 99012; + add.s32 %r11134, %r7651, 99040; + add.s32 %r11136, %r7651, 99044; + setp.gt.s32 %p489, %r7615, 1; +$L__tmp12: + .loc 1 290 32 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:290:32 + or.b32 %r819, %r689, %r7590; + or.b32 %r818, %r689, %r692; + setp.lt.s32 %p490, %r818, %r2358; + setp.lt.s32 %p491, %r819, %r2358; + cvt.s64.s32 %rd103, %r818; + cvt.s64.s32 %rd104, %r819; + selp.b32 %r7666, 4, 0, %p490; + selp.b32 %r7737, %r7666, 0, %p462; + selp.b32 %r7667, 4, 0, %p491; + selp.b32 %r7739, %r7667, 0, %p462; + or.b32 %r823, %r819, 64; + or.b32 %r822, %r818, 64; + or.b32 %r824, %r693, 64; + or.b32 %r825, %r694, 64; + or.b32 %r826, %r754, 64; + or.b32 %r827, %r755, 64; + or.b32 %r828, %r756, 64; + or.b32 %r829, %r757, 64; + or.b32 %r830, %r758, 64; + or.b32 %r831, %r759, 64; + or.b32 %r832, %r760, 64; + or.b32 %r833, %r761, 64; + or.b32 %r834, %r762, 64; + or.b32 %r835, %r763, 64; + or.b32 %r836, %r764, 64; + or.b32 %r837, %r765, 64; + or.b32 %r838, %r7597, 64; + or.b32 %r839, %r7598, 64; + or.b32 %r840, %r7599, 64; + or.b32 %r841, %r7600, 64; + setp.lt.s32 %p492, %r838, %r2358; + setp.lt.s32 %p493, %r839, %r2358; + setp.lt.s32 %p494, %r840, %r2358; + setp.lt.s32 %p495, %r841, %r2358; + add.s32 %r11138, %r11058, 16384; + selp.b32 %r7668, 16, 0, %p492; + selp.b32 %r7809, %r7668, 0, %p489; + add.s32 %r11140, %r11058, 18432; + selp.b32 %r7669, 16, 0, %p493; + selp.b32 %r7811, %r7669, 0, %p489; + add.s32 %r11142, %r11058, 20480; + selp.b32 %r7670, 16, 0, %p494; + selp.b32 %r7813, %r7670, 0, %p489; + add.s32 %r11144, %r11058, 22528; + selp.b32 %r7671, 16, 0, %p495; + selp.b32 %r7815, %r7671, 0, %p489; + setp.lt.s32 %p496, %r822, %r2358; + setp.lt.s32 %p497, %r823, %r2358; + setp.lt.s32 %p498, %r824, %r2358; + setp.lt.s32 %p499, %r825, %r2358; + setp.lt.s32 %p500, %r826, %r2358; + setp.lt.s32 %p501, %r827, %r2358; + setp.lt.s32 %p502, %r828, %r2358; + setp.lt.s32 %p503, %r829, %r2358; + setp.lt.s32 %p504, %r830, %r2358; + setp.lt.s32 %p505, %r831, %r2358; + setp.lt.s32 %p506, %r832, %r2358; + setp.lt.s32 %p507, %r833, %r2358; + setp.lt.s32 %p508, %r834, %r2358; + setp.lt.s32 %p509, %r835, %r2358; + setp.lt.s32 %p510, %r836, %r2358; + setp.lt.s32 %p511, %r837, %r2358; + cvt.s64.s32 %rd105, %r822; + cvt.s64.s32 %rd106, %r823; + add.s32 %r11146, %r7651, 98560; + selp.b32 %r7672, 4, 0, %p496; + selp.b32 %r7817, %r7672, 0, %p489; + add.s32 %r11148, %r7651, 98564; + selp.b32 %r7673, 4, 0, %p497; + selp.b32 %r7819, %r7673, 0, %p489; + add.s32 %r11150, %r7651, 98592; + selp.b32 %r7674, 4, 0, %p498; + selp.b32 %r7821, %r7674, 0, %p489; + add.s32 %r11152, %r7651, 98596; + selp.b32 %r7675, 4, 0, %p499; + selp.b32 %r7823, %r7675, 0, %p489; + add.s32 %r11154, %r7651, 98624; + selp.b32 %r7676, 4, 0, %p500; + selp.b32 %r7825, %r7676, 0, %p489; + add.s32 %r11156, %r7651, 98628; + selp.b32 %r7677, 4, 0, %p501; + selp.b32 %r7827, %r7677, 0, %p489; + add.s32 %r11158, %r7651, 98656; + selp.b32 %r7678, 4, 0, %p502; + selp.b32 %r7829, %r7678, 0, %p489; + add.s32 %r11160, %r7651, 98660; + selp.b32 %r7679, 4, 0, %p503; + selp.b32 %r7831, %r7679, 0, %p489; + add.s32 %r11162, %r7651, 98688; + selp.b32 %r7680, 4, 0, %p504; + selp.b32 %r7833, %r7680, 0, %p489; + add.s32 %r11164, %r7651, 98692; + selp.b32 %r7681, 4, 0, %p505; + selp.b32 %r7835, %r7681, 0, %p489; + add.s32 %r11166, %r7651, 98720; + selp.b32 %r7682, 4, 0, %p506; + selp.b32 %r7837, %r7682, 0, %p489; + add.s32 %r11168, %r7651, 98724; + selp.b32 %r7683, 4, 0, %p507; + selp.b32 %r7839, %r7683, 0, %p489; + add.s32 %r11170, %r7651, 98752; + selp.b32 %r7684, 4, 0, %p508; + selp.b32 %r7841, %r7684, 0, %p489; + add.s32 %r11172, %r7651, 98756; + selp.b32 %r7685, 4, 0, %p509; + selp.b32 %r7843, %r7685, 0, %p489; + add.s32 %r11174, %r7651, 98784; + selp.b32 %r7686, 4, 0, %p510; + selp.b32 %r7845, %r7686, 0, %p489; + add.s32 %r11176, %r7651, 98788; + selp.b32 %r7687, 4, 0, %p511; + selp.b32 %r7847, %r7687, 0, %p489; + add.s32 %r11178, %r11058, 65536; + add.s32 %r11180, %r11058, 67584; + add.s32 %r11182, %r11058, 69632; + add.s32 %r11184, %r11058, 71680; + add.s32 %r11186, %r7651, 99072; + add.s32 %r11188, %r7651, 99076; + add.s32 %r11190, %r7651, 99104; + add.s32 %r11192, %r7651, 99108; + add.s32 %r11194, %r7651, 99136; + add.s32 %r11196, %r7651, 99140; + add.s32 %r11198, %r7651, 99168; + add.s32 %r11200, %r7651, 99172; + add.s32 %r11202, %r7651, 99200; + add.s32 %r11204, %r7651, 99204; + add.s32 %r11206, %r7651, 99232; + add.s32 %r11208, %r7651, 99236; + add.s32 %r11210, %r7651, 99264; + add.s32 %r11212, %r7651, 99268; + add.s32 %r11214, %r7651, 99296; + add.s32 %r11216, %r7651, 99300; + add.s32 %r902, %r7615, -2; + add.s32 %r903, %r7615, -1; + cvt.s64.s32 %rd107, %r7637; + cvt.s64.s32 %rd108, %r7638; + cvt.s64.s32 %rd109, %r7639; + cvt.s64.s32 %rd110, %r7640; + cvt.s64.s32 %rd111, %r7641; + cvt.s64.s32 %rd112, %r7642; + cvt.s64.s32 %rd113, %r7643; + cvt.s64.s32 %rd114, %r7644; + setp.gt.s32 %p512, %r744, 0; + setp.lt.s32 %p513, %r7633, %r2358; + setp.lt.s32 %p514, %r7634, %r2358; + setp.lt.s32 %p515, %r7635, %r2358; + setp.lt.s32 %p516, %r7636, %r2358; + selp.b32 %r7688, 16, 0, %p513; + selp.b32 %r11059, %r7688, 0, %p512; + selp.b32 %r7689, 16, 0, %p514; + selp.b32 %r11061, %r7689, 0, %p512; + selp.b32 %r7690, 16, 0, %p515; + selp.b32 %r11063, %r7690, 0, %p512; + selp.b32 %r7691, 16, 0, %p516; + selp.b32 %r11065, %r7691, 0, %p512; + setp.lt.s32 %p517, %r728, %r2358; + setp.lt.s32 %p518, %r729, %r2358; + setp.lt.s32 %p519, %r730, %r2358; + setp.lt.s32 %p520, %r731, %r2358; + setp.lt.s32 %p521, %r732, %r2358; + setp.lt.s32 %p522, %r733, %r2358; + setp.lt.s32 %p523, %r734, %r2358; + setp.lt.s32 %p524, %r735, %r2358; + setp.lt.s32 %p525, %r736, %r2358; + setp.lt.s32 %p526, %r737, %r2358; + setp.lt.s32 %p527, %r738, %r2358; + setp.lt.s32 %p528, %r739, %r2358; + setp.lt.s32 %p529, %r740, %r2358; + setp.lt.s32 %p530, %r741, %r2358; + setp.lt.s32 %p531, %r742, %r2358; + setp.lt.s32 %p532, %r743, %r2358; + cvt.s64.s32 %rd115, %r728; + cvt.s64.s32 %rd116, %r732; + cvt.s64.s32 %rd117, %r733; + cvt.s64.s32 %rd118, %r734; + cvt.s64.s32 %rd119, %r735; + cvt.s64.s32 %rd120, %r736; + cvt.s64.s32 %rd121, %r737; + cvt.s64.s32 %rd122, %r738; + cvt.s64.s32 %rd123, %r739; + cvt.s64.s32 %rd124, %r740; + cvt.s64.s32 %rd125, %r741; + cvt.s64.s32 %rd126, %r742; + cvt.s64.s32 %rd127, %r743; + selp.b32 %r7692, 4, 0, %p517; + selp.b32 %r11067, %r7692, 0, %p512; + selp.b32 %r7693, 4, 0, %p518; + selp.b32 %r11069, %r7693, 0, %p512; + selp.b32 %r7694, 4, 0, %p519; + selp.b32 %r11071, %r7694, 0, %p512; + selp.b32 %r7695, 4, 0, %p520; + selp.b32 %r11073, %r7695, 0, %p512; + selp.b32 %r7696, 4, 0, %p521; + selp.b32 %r11075, %r7696, 0, %p512; + selp.b32 %r7697, 4, 0, %p522; + selp.b32 %r11077, %r7697, 0, %p512; + selp.b32 %r7698, 4, 0, %p523; + selp.b32 %r11079, %r7698, 0, %p512; + selp.b32 %r7699, 4, 0, %p524; + selp.b32 %r11081, %r7699, 0, %p512; + selp.b32 %r7700, 4, 0, %p525; + selp.b32 %r11083, %r7700, 0, %p512; + selp.b32 %r7701, 4, 0, %p526; + selp.b32 %r11085, %r7701, 0, %p512; + selp.b32 %r7702, 4, 0, %p527; + selp.b32 %r11087, %r7702, 0, %p512; + selp.b32 %r7703, 4, 0, %p528; + selp.b32 %r11089, %r7703, 0, %p512; + selp.b32 %r7704, 4, 0, %p529; + selp.b32 %r11091, %r7704, 0, %p512; + selp.b32 %r7705, 4, 0, %p530; + selp.b32 %r11093, %r7705, 0, %p512; + selp.b32 %r7706, 4, 0, %p531; + selp.b32 %r11095, %r7706, 0, %p512; + selp.b32 %r7707, 4, 0, %p532; + selp.b32 %r11097, %r7707, 0, %p512; + setp.gt.s32 %p533, %r7645, 1; + or.b32 %r924, %r728, 64; + or.b32 %r925, %r729, 64; + or.b32 %r926, %r730, 64; + or.b32 %r927, %r731, 64; + or.b32 %r928, %r732, 64; + or.b32 %r929, %r733, 64; + or.b32 %r930, %r734, 64; + or.b32 %r931, %r735, 64; + or.b32 %r932, %r736, 64; + or.b32 %r933, %r737, 64; + or.b32 %r934, %r738, 64; + or.b32 %r935, %r739, 64; + or.b32 %r936, %r740, 64; + or.b32 %r937, %r741, 64; + or.b32 %r938, %r742, 64; + or.b32 %r939, %r743, 64; + or.b32 %r940, %r7633, 64; + or.b32 %r941, %r7634, 64; + or.b32 %r942, %r7635, 64; + or.b32 %r943, %r7636, 64; + setp.lt.s32 %p534, %r940, %r2358; + setp.lt.s32 %p535, %r941, %r2358; + setp.lt.s32 %p536, %r942, %r2358; + setp.lt.s32 %p537, %r943, %r2358; + selp.b32 %r7708, 16, 0, %p534; + selp.b32 %r11139, %r7708, 0, %p533; + selp.b32 %r7709, 16, 0, %p535; + selp.b32 %r11141, %r7709, 0, %p533; + selp.b32 %r7710, 16, 0, %p536; + selp.b32 %r11143, %r7710, 0, %p533; + selp.b32 %r7711, 16, 0, %p537; + selp.b32 %r11145, %r7711, 0, %p533; + setp.lt.s32 %p538, %r924, %r2358; + setp.lt.s32 %p539, %r925, %r2358; + setp.lt.s32 %p540, %r926, %r2358; + setp.lt.s32 %p541, %r927, %r2358; + setp.lt.s32 %p542, %r928, %r2358; + setp.lt.s32 %p543, %r929, %r2358; + setp.lt.s32 %p544, %r930, %r2358; + setp.lt.s32 %p545, %r931, %r2358; + setp.lt.s32 %p546, %r932, %r2358; + setp.lt.s32 %p547, %r933, %r2358; + setp.lt.s32 %p548, %r934, %r2358; + setp.lt.s32 %p549, %r935, %r2358; + setp.lt.s32 %p550, %r936, %r2358; + setp.lt.s32 %p551, %r937, %r2358; + setp.lt.s32 %p552, %r938, %r2358; + setp.lt.s32 %p553, %r939, %r2358; + selp.b32 %r7712, 4, 0, %p538; + selp.b32 %r11147, %r7712, 0, %p533; + selp.b32 %r7713, 4, 0, %p539; + selp.b32 %r11149, %r7713, 0, %p533; + selp.b32 %r7714, 4, 0, %p540; + selp.b32 %r11151, %r7714, 0, %p533; + selp.b32 %r7715, 4, 0, %p541; + selp.b32 %r11153, %r7715, 0, %p533; + selp.b32 %r7716, 4, 0, %p542; + selp.b32 %r11155, %r7716, 0, %p533; + selp.b32 %r7717, 4, 0, %p543; + selp.b32 %r11157, %r7717, 0, %p533; + selp.b32 %r7718, 4, 0, %p544; + selp.b32 %r11159, %r7718, 0, %p533; + selp.b32 %r7719, 4, 0, %p545; + selp.b32 %r11161, %r7719, 0, %p533; + selp.b32 %r7720, 4, 0, %p546; + selp.b32 %r11163, %r7720, 0, %p533; + selp.b32 %r7721, 4, 0, %p547; + selp.b32 %r11165, %r7721, 0, %p533; + selp.b32 %r7722, 4, 0, %p548; + selp.b32 %r11167, %r7722, 0, %p533; + selp.b32 %r7723, 4, 0, %p549; + selp.b32 %r11169, %r7723, 0, %p533; + selp.b32 %r7724, 4, 0, %p550; + selp.b32 %r11171, %r7724, 0, %p533; + selp.b32 %r7725, 4, 0, %p551; + selp.b32 %r11173, %r7725, 0, %p533; + selp.b32 %r7726, 4, 0, %p552; + selp.b32 %r11175, %r7726, 0, %p533; + selp.b32 %r7727, 4, 0, %p553; + selp.b32 %r11177, %r7727, 0, %p533; + add.s32 %r964, %r7645, -2; + add.s32 %r965, %r7645, -1; + .loc 1 262 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:262:30 + max.s32 %r966, %r7615, 1; + max.s32 %r967, %r7645, 1; + mul.wide.u32 %rd128, %r7, 4; + mov.b32 %r14988, 0f00000000; + mov.b64 %rd1193, 0; + shl.b64 %rd677, %rd83, 1; + shl.b64 %rd679, %rd84, 1; + shl.b64 %rd681, %rd85, 1; + shl.b64 %rd683, %rd86, 1; + shl.b64 %rd686, %rd87, 1; + shl.b64 %rd688, %rd88, 1; + shl.b64 %rd690, %rd89, 1; + shl.b64 %rd692, %rd90, 1; + shl.b64 %rd694, %rd103, 2; + shl.b64 %rd695, %rd104, 2; + shl.b64 %rd700, %rd91, 2; + shl.b64 %rd701, %rd92, 2; + shl.b64 %rd702, %rd93, 2; + shl.b64 %rd703, %rd94, 2; + shl.b64 %rd704, %rd95, 2; + shl.b64 %rd705, %rd96, 2; + shl.b64 %rd706, %rd97, 2; + shl.b64 %rd707, %rd98, 2; + shl.b64 %rd708, %rd99, 2; + shl.b64 %rd709, %rd100, 2; + shl.b64 %rd710, %rd101, 2; + shl.b64 %rd711, %rd102, 2; + shl.b64 %rd713, %rd105, 2; + shl.b64 %rd714, %rd106, 2; + shl.b64 %rd956, %rd107, 1; + shl.b64 %rd958, %rd108, 1; + shl.b64 %rd960, %rd109, 1; + shl.b64 %rd962, %rd110, 1; + shl.b64 %rd965, %rd111, 1; + shl.b64 %rd967, %rd112, 1; + shl.b64 %rd969, %rd113, 1; + shl.b64 %rd971, %rd114, 1; + shl.b64 %rd973, %rd115, 2; + shl.b64 %rd978, %rd116, 2; + shl.b64 %rd979, %rd117, 2; + shl.b64 %rd980, %rd118, 2; + shl.b64 %rd981, %rd119, 2; + shl.b64 %rd982, %rd120, 2; + shl.b64 %rd983, %rd121, 2; + shl.b64 %rd984, %rd122, 2; + shl.b64 %rd985, %rd123, 2; + shl.b64 %rd986, %rd124, 2; + shl.b64 %rd987, %rd125, 2; + shl.b64 %rd988, %rd126, 2; + shl.b64 %rd989, %rd127, 2; + mov.b32 %r14989, %r14988; + mov.b32 %r14990, %r14988; + mov.b32 %r14991, %r14988; + mov.b32 %r14992, %r14988; + mov.b32 %r14993, %r14988; + mov.b32 %r14994, %r14988; + mov.b32 %r14995, %r14988; + mov.b32 %r14996, %r14988; + mov.b32 %r14997, %r14988; + mov.b32 %r14998, %r14988; + mov.b32 %r14999, %r14988; + mov.b32 %r15000, %r14988; + mov.b32 %r15001, %r14988; + mov.b32 %r15002, %r14988; + mov.b32 %r15003, %r14988; + mov.b32 %r15004, %r14988; + mov.b32 %r15005, %r14988; + mov.b32 %r15006, %r14988; + mov.b32 %r15007, %r14988; + mov.b32 %r15008, %r14988; + mov.b32 %r15009, %r14988; + mov.b32 %r15010, %r14988; + mov.b32 %r15011, %r14988; + mov.b32 %r15012, %r14988; + mov.b32 %r15013, %r14988; + mov.b32 %r15014, %r14988; + mov.b32 %r15015, %r14988; + mov.b32 %r15016, %r14988; + mov.b32 %r15017, %r14988; + mov.b32 %r15018, %r14988; + mov.b32 %r15019, %r14988; + mov.b32 %r15020, %r14988; + mov.b32 %r15021, %r14988; + mov.b32 %r15022, %r14988; + mov.b32 %r15023, %r14988; + mov.b32 %r15024, %r14988; + mov.b32 %r15025, %r14988; + mov.b32 %r15026, %r14988; + mov.b32 %r15027, %r14988; + mov.b32 %r15028, %r14988; + mov.b32 %r15029, %r14988; + mov.b32 %r15030, %r14988; + mov.b32 %r15031, %r14988; + mov.b32 %r15032, %r14988; + mov.b32 %r15033, %r14988; + mov.b32 %r15034, %r14988; + mov.b32 %r15035, %r14988; + mov.b32 %r15036, %r14988; + mov.b32 %r15037, %r14988; + mov.b32 %r15038, %r14988; + mov.b32 %r15039, %r14988; + mov.b32 %r15040, %r14988; + mov.b32 %r15041, %r14988; + mov.b32 %r15042, %r14988; + mov.b32 %r15043, %r14988; + mov.b32 %r15044, %r14988; + mov.b32 %r15045, %r14988; + mov.b32 %r15046, %r14988; + mov.b32 %r15047, %r14988; + mov.b32 %r15048, %r14988; + mov.b32 %r15049, %r14988; + mov.b32 %r15050, %r14988; + mov.b32 %r15051, %r14988; + mov.b32 %r14924, %r14988; + mov.b32 %r14925, %r14988; + mov.b32 %r14926, %r14988; + mov.b32 %r14927, %r14988; + mov.b32 %r14928, %r14988; + mov.b32 %r14929, %r14988; + mov.b32 %r14930, %r14988; + mov.b32 %r14931, %r14988; + mov.b32 %r14932, %r14988; + mov.b32 %r14933, %r14988; + mov.b32 %r14934, %r14988; + mov.b32 %r14935, %r14988; + mov.b32 %r14936, %r14988; + mov.b32 %r14937, %r14988; + mov.b32 %r14938, %r14988; + mov.b32 %r14939, %r14988; + mov.b32 %r14940, %r14988; + mov.b32 %r14941, %r14988; + mov.b32 %r14942, %r14988; + mov.b32 %r14943, %r14988; + mov.b32 %r14944, %r14988; + mov.b32 %r14945, %r14988; + mov.b32 %r14946, %r14988; + mov.b32 %r14947, %r14988; + mov.b32 %r14948, %r14988; + mov.b32 %r14949, %r14988; + mov.b32 %r14950, %r14988; + mov.b32 %r14951, %r14988; + mov.b32 %r14952, %r14988; + mov.b32 %r14953, %r14988; + mov.b32 %r14954, %r14988; + mov.b32 %r14955, %r14988; + mov.b32 %r14956, %r14988; + mov.b32 %r14957, %r14988; + mov.b32 %r14958, %r14988; + mov.b32 %r14959, %r14988; + mov.b32 %r14960, %r14988; + mov.b32 %r14961, %r14988; + mov.b32 %r14962, %r14988; + mov.b32 %r14963, %r14988; + mov.b32 %r14964, %r14988; + mov.b32 %r14965, %r14988; + mov.b32 %r14966, %r14988; + mov.b32 %r14967, %r14988; + mov.b32 %r14968, %r14988; + mov.b32 %r14969, %r14988; + mov.b32 %r14970, %r14988; + mov.b32 %r14971, %r14988; + mov.b32 %r14972, %r14988; + mov.b32 %r14973, %r14988; + mov.b32 %r14974, %r14988; + mov.b32 %r14975, %r14988; + mov.b32 %r14976, %r14988; + mov.b32 %r14977, %r14988; + mov.b32 %r14978, %r14988; + mov.b32 %r14979, %r14988; + mov.b32 %r14980, %r14988; + mov.b32 %r14981, %r14988; + mov.b32 %r14982, %r14988; + mov.b32 %r14983, %r14988; + mov.b32 %r14984, %r14988; + mov.b32 %r14985, %r14988; + mov.b32 %r14986, %r14988; + mov.b32 %r14987, %r14988; + bra.uni $L__BB0_9; +$L__BB0_15: // %._crit_edge1794 + // in Loop: Header=BB0_9 Depth=1 +$L__tmp13: + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + // wait for regs: %r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987,%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp14: + .loc 1 262 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:262:30 + add.s64 %rd1193, %rd1193, 1; + setp.ne.b64 %p1162, %rd1193, 4; + @%p1162 bra $L__BB0_9; + bra.uni $L__BB0_16; +$L__BB0_9: // =>This Loop Header: Depth=1 + // Child Loop BB0_11 Depth 2 + // Child Loop BB0_14 Depth 2 + .loc 1 0 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:30 + setp.eq.b32 %p978, %r766, 0; +$L__tmp15: + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p618, %r699, 1; +$L__tmp16: + .loc 1 263 51 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:263:51 + add.s64 %rd675, %rd1193, %rd128; + .loc 1 266 44 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:266:44 + cvt.u32.u64 %r7888, %rd675; + shl.b32 %r7889, %r7888, 7; + add.s32 %r7890, %r7889, %r686; + .loc 1 267 46 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:267:46 + mad.lo.s32 %r7891, %r3, %r7888, %r687; + .loc 1 269 50 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:269:50 + add.s32 %r7892, %r688, %r7888; + mul.lo.s32 %r7893, %r7892, %r2358; + .loc 1 271 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:271:21 + mad.wide.s32 %rd140, %r7890, 2, %rd194; + .loc 1 272 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:272:23 + mad.wide.s32 %rd141, %r7891, 2, %rd197; + .loc 1 275 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:275:25 + mul.wide.s32 %rd676, %r7893, 4; + add.s64 %rd142, %rd195, %rd676; + .loc 1 276 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:276:29 + add.s64 %rd143, %rd196, %rd676; +$L__tmp17: + .loc 1 601 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd678, %rd140, %rd677; + add.s64 %rd680, %rd140, %rd679; + add.s64 %rd682, %rd140, %rd681; + add.s64 %rd684, %rd140, %rd683; + .loc 1 601 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b64 %rd685, %rd79, 1; + add.s64 %rd595, %rd678, %rd685; + add.s64 %rd596, %rd680, %rd685; + add.s64 %rd597, %rd682, %rd685; + add.s64 %rd598, %rd684, %rd685; + .loc 1 602 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd687, %rd141, %rd686; + add.s64 %rd689, %rd141, %rd688; + add.s64 %rd691, %rd141, %rd690; + add.s64 %rd693, %rd141, %rd692; + .loc 1 602 51 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:51 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd615, %rd687, %rd685; + add.s64 %rd616, %rd689, %rd685; + add.s64 %rd617, %rd691, %rd685; + add.s64 %rd618, %rd693, %rd685; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11058 + 0 ], [ %rd595 + 0 ], 0x10, %r7729; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11060 + 0 ], [ %rd596 + 0 ], 0x10, %r7731; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11062 + 0 ], [ %rd597 + 0 ], 0x10, %r7733; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11064 + 0 ], [ %rd598 + 0 ], 0x10, %r7735; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd599, %rd142, %rd694; + add.s64 %rd600, %rd142, %rd695; + cvt.s64.s32 %rd696, %r689; + cvt.u64.u32 %rd144, %r692; + add.s64 %rd697, %rd696, %rd144; + shl.b64 %rd698, %rd697, 2; + add.s64 %rd699, %rd142, %rd698; + add.s64 %rd601, %rd699, 32; + add.s64 %rd602, %rd699, 36; + add.s64 %rd603, %rd142, %rd700; + add.s64 %rd604, %rd142, %rd701; + add.s64 %rd605, %rd142, %rd702; + add.s64 %rd606, %rd142, %rd703; + add.s64 %rd607, %rd142, %rd704; + add.s64 %rd608, %rd142, %rd705; + add.s64 %rd609, %rd142, %rd706; + add.s64 %rd610, %rd142, %rd707; + add.s64 %rd611, %rd142, %rd708; + add.s64 %rd612, %rd142, %rd709; + add.s64 %rd613, %rd142, %rd710; + add.s64 %rd614, %rd142, %rd711; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11066 + 0 ], [ %rd599 + 0 ], 0x4, %r7737; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11068 + 0 ], [ %rd600 + 0 ], 0x4, %r7739; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11070 + 0 ], [ %rd601 + 0 ], 0x4, %r7741; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11072 + 0 ], [ %rd602 + 0 ], 0x4, %r7743; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11074 + 0 ], [ %rd603 + 0 ], 0x4, %r7745; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11076 + 0 ], [ %rd604 + 0 ], 0x4, %r7747; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11078 + 0 ], [ %rd605 + 0 ], 0x4, %r7749; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11080 + 0 ], [ %rd606 + 0 ], 0x4, %r7751; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11082 + 0 ], [ %rd607 + 0 ], 0x4, %r7753; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11084 + 0 ], [ %rd608 + 0 ], 0x4, %r7755; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11086 + 0 ], [ %rd609 + 0 ], 0x4, %r7757; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11088 + 0 ], [ %rd610 + 0 ], 0x4, %r7759; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11090 + 0 ], [ %rd611 + 0 ], 0x4, %r7761; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11092 + 0 ], [ %rd612 + 0 ], 0x4, %r7763; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11094 + 0 ], [ %rd613 + 0 ], 0x4, %r7765; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11096 + 0 ], [ %rd614 + 0 ], 0x4, %r7767; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11098 + 0 ], [ %rd615 + 0 ], 0x10, %r7729; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11100 + 0 ], [ %rd616 + 0 ], 0x10, %r7731; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11102 + 0 ], [ %rd617 + 0 ], 0x10, %r7733; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11104 + 0 ], [ %rd618 + 0 ], 0x10, %r7735; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd619, %rd143, %rd694; + add.s64 %rd620, %rd143, %rd695; + add.s64 %rd712, %rd143, %rd698; + add.s64 %rd621, %rd712, 32; + add.s64 %rd622, %rd712, 36; + add.s64 %rd623, %rd143, %rd700; + add.s64 %rd624, %rd143, %rd701; + add.s64 %rd625, %rd143, %rd702; + add.s64 %rd626, %rd143, %rd703; + add.s64 %rd627, %rd143, %rd704; + add.s64 %rd628, %rd143, %rd705; + add.s64 %rd629, %rd143, %rd706; + add.s64 %rd630, %rd143, %rd707; + add.s64 %rd631, %rd143, %rd708; + add.s64 %rd632, %rd143, %rd709; + add.s64 %rd633, %rd143, %rd710; + add.s64 %rd634, %rd143, %rd711; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11106 + 0 ], [ %rd619 + 0 ], 0x4, %r7737; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11108 + 0 ], [ %rd620 + 0 ], 0x4, %r7739; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11110 + 0 ], [ %rd621 + 0 ], 0x4, %r7741; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11112 + 0 ], [ %rd622 + 0 ], 0x4, %r7743; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11114 + 0 ], [ %rd623 + 0 ], 0x4, %r7745; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11116 + 0 ], [ %rd624 + 0 ], 0x4, %r7747; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11118 + 0 ], [ %rd625 + 0 ], 0x4, %r7749; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11120 + 0 ], [ %rd626 + 0 ], 0x4, %r7751; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11122 + 0 ], [ %rd627 + 0 ], 0x4, %r7753; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11124 + 0 ], [ %rd628 + 0 ], 0x4, %r7755; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11126 + 0 ], [ %rd629 + 0 ], 0x4, %r7757; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11128 + 0 ], [ %rd630 + 0 ], 0x4, %r7759; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11130 + 0 ], [ %rd631 + 0 ], 0x4, %r7761; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11132 + 0 ], [ %rd632 + 0 ], 0x4, %r7763; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11134 + 0 ], [ %rd633 + 0 ], 0x4, %r7765; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11136 + 0 ], [ %rd634 + 0 ], 0x4, %r7767; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd1201, %rd595, 524288; + add.s64 %rd1200, %rd596, 524288; + add.s64 %rd1199, %rd597, 524288; + add.s64 %rd1198, %rd598, 524288; + .loc 1 627 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd1197, %rd615, 16384; + add.s64 %rd1196, %rd616, 16384; + add.s64 %rd1195, %rd617, 16384; + add.s64 %rd1194, %rd618, 16384; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r11138 + 0 ], [ %rd1201 + 0 ], 0x10, %r7809; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11140 + 0 ], [ %rd1200 + 0 ], 0x10, %r7811; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11142 + 0 ], [ %rd1199 + 0 ], 0x10, %r7813; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11144 + 0 ], [ %rd1198 + 0 ], 0x10, %r7815; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd639, %rd142, %rd713; + add.s64 %rd640, %rd142, %rd714; + add.s64 %rd715, %rd698, 256; + add.s64 %rd716, %rd142, %rd715; + add.s64 %rd641, %rd716, 32; + add.s64 %rd642, %rd716, 36; + add.s64 %rd643, %rd603, 256; + add.s64 %rd644, %rd604, 256; + add.s64 %rd645, %rd605, 256; + add.s64 %rd646, %rd606, 256; + add.s64 %rd647, %rd607, 256; + add.s64 %rd648, %rd608, 256; + add.s64 %rd649, %rd609, 256; + add.s64 %rd650, %rd610, 256; + add.s64 %rd651, %rd611, 256; + add.s64 %rd652, %rd612, 256; + add.s64 %rd653, %rd613, 256; + add.s64 %rd654, %rd614, 256; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11146 + 0 ], [ %rd639 + 0 ], 0x4, %r7817; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11148 + 0 ], [ %rd640 + 0 ], 0x4, %r7819; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11150 + 0 ], [ %rd641 + 0 ], 0x4, %r7821; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11152 + 0 ], [ %rd642 + 0 ], 0x4, %r7823; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11154 + 0 ], [ %rd643 + 0 ], 0x4, %r7825; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11156 + 0 ], [ %rd644 + 0 ], 0x4, %r7827; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11158 + 0 ], [ %rd645 + 0 ], 0x4, %r7829; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11160 + 0 ], [ %rd646 + 0 ], 0x4, %r7831; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11162 + 0 ], [ %rd647 + 0 ], 0x4, %r7833; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11164 + 0 ], [ %rd648 + 0 ], 0x4, %r7835; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11166 + 0 ], [ %rd649 + 0 ], 0x4, %r7837; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11168 + 0 ], [ %rd650 + 0 ], 0x4, %r7839; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11170 + 0 ], [ %rd651 + 0 ], 0x4, %r7841; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11172 + 0 ], [ %rd652 + 0 ], 0x4, %r7843; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11174 + 0 ], [ %rd653 + 0 ], 0x4, %r7845; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11176 + 0 ], [ %rd654 + 0 ], 0x4, %r7847; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r11178 + 0 ], [ %rd1197 + 0 ], 0x10, %r7809; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11180 + 0 ], [ %rd1196 + 0 ], 0x10, %r7811; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11182 + 0 ], [ %rd1195 + 0 ], 0x10, %r7813; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11184 + 0 ], [ %rd1194 + 0 ], 0x10, %r7815; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd659, %rd143, %rd713; + add.s64 %rd660, %rd143, %rd714; + add.s64 %rd717, %rd143, %rd715; + add.s64 %rd661, %rd717, 32; + add.s64 %rd662, %rd717, 36; + add.s64 %rd663, %rd623, 256; + add.s64 %rd664, %rd624, 256; + add.s64 %rd665, %rd625, 256; + add.s64 %rd666, %rd626, 256; + add.s64 %rd667, %rd627, 256; + add.s64 %rd668, %rd628, 256; + add.s64 %rd669, %rd629, 256; + add.s64 %rd670, %rd630, 256; + add.s64 %rd671, %rd631, 256; + add.s64 %rd672, %rd632, 256; + add.s64 %rd673, %rd633, 256; + add.s64 %rd674, %rd634, 256; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11186 + 0 ], [ %rd659 + 0 ], 0x4, %r7817; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11188 + 0 ], [ %rd660 + 0 ], 0x4, %r7819; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11190 + 0 ], [ %rd661 + 0 ], 0x4, %r7821; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11192 + 0 ], [ %rd662 + 0 ], 0x4, %r7823; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11194 + 0 ], [ %rd663 + 0 ], 0x4, %r7825; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11196 + 0 ], [ %rd664 + 0 ], 0x4, %r7827; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11198 + 0 ], [ %rd665 + 0 ], 0x4, %r7829; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11200 + 0 ], [ %rd666 + 0 ], 0x4, %r7831; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11202 + 0 ], [ %rd667 + 0 ], 0x4, %r7833; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11204 + 0 ], [ %rd668 + 0 ], 0x4, %r7835; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11206 + 0 ], [ %rd669 + 0 ], 0x4, %r7837; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11208 + 0 ], [ %rd670 + 0 ], 0x4, %r7839; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11210 + 0 ], [ %rd671 + 0 ], 0x4, %r7841; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11212 + 0 ], [ %rd672 + 0 ], 0x4, %r7843; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11214 + 0 ], [ %rd673 + 0 ], 0x4, %r7845; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11216 + 0 ], [ %rd674 + 0 ], 0x4, %r7847; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + @%p618 bra $L__BB0_12; +// %bb.10: // %.lr.ph1620.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:28 + mov.b32 %r8449, 0; + mov.b32 %r14899, 1; + mov.b32 %r14898, -1; + mov.b32 %r14883, 64; + mov.b32 %r14881, %r693; + mov.b32 %r14882, %r694; + mov.b32 %r14884, %r693; + mov.b32 %r14885, %r694; + mov.b32 %r14886, %r754; + mov.b32 %r14887, %r755; + mov.b32 %r14888, %r756; + mov.b32 %r14889, %r757; + mov.b32 %r14890, %r758; + mov.b32 %r14891, %r759; + mov.b32 %r14892, %r760; + mov.b32 %r14893, %r761; + mov.b32 %r14894, %r762; + mov.b32 %r14895, %r763; + mov.b32 %r14896, %r764; + mov.b32 %r14897, %r765; + mov.b32 %r14900, %r14898; + mov.b32 %r14901, %r14899; + mov.b32 %r14902, %r837; + mov.b32 %r14903, %r836; + mov.b32 %r14904, %r835; + mov.b32 %r14905, %r834; + mov.b32 %r14906, %r833; + mov.b32 %r14907, %r832; + mov.b32 %r14908, %r831; + mov.b32 %r14909, %r830; + mov.b32 %r14910, %r829; + mov.b32 %r14911, %r828; + mov.b32 %r14912, %r827; + mov.b32 %r14913, %r826; + mov.b32 %r14914, %r825; + mov.b32 %r14915, %r824; + mov.b32 %r14916, %r838; + mov.b32 %r14917, %r839; + mov.b32 %r14918, %r840; + mov.b32 %r14919, %r841; + mov.b32 %r14920, %r838; + mov.b32 %r14921, %r839; + mov.b32 %r14922, %r840; + mov.b32 %r14923, %r841; + mov.b32 %r15052, %r8449; + mov.b32 %r15053, %r818; + mov.b32 %r15054, %r819; + mov.b32 %r15055, %r818; + mov.b32 %r15056, %r819; + mov.b32 %r15057, %r822; + mov.b32 %r15058, %r823; + mov.b32 %r15059, %r765; + mov.b32 %r15060, %r764; + mov.b32 %r15061, %r763; + mov.b32 %r15062, %r762; + mov.b32 %r15063, %r761; + mov.b32 %r15064, %r760; + mov.b32 %r15065, %r759; + mov.b32 %r15066, %r758; + mov.b32 %r15067, %r757; + mov.b32 %r15068, %r756; + mov.b32 %r15069, %r755; + mov.b32 %r15070, %r754; +$L__BB0_11: // %.lr.ph1620 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p675, %r15052, %r902; + setp.lt.s32 %p641, %r15052, %r903; + add.s32 %r10148, %r14898, 1; + setp.gt.s32 %p676, %r10148, 1; + selp.b32 %r14898, 0, %r10148, %p676; + add.s32 %r10149, %r14900, 1; + setp.gt.s32 %p677, %r10149, 2; + selp.b32 %r14900, 0, %r10149, %p677; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p678, %r15055, %r2358; + setp.lt.s32 %p679, %r15056, %r2358; + setp.lt.s32 %p680, %r14884, %r2358; + setp.lt.s32 %p681, %r14885, %r2358; + setp.lt.s32 %p682, %r14886, %r2358; + setp.lt.s32 %p683, %r14887, %r2358; + setp.lt.s32 %p684, %r14888, %r2358; + setp.lt.s32 %p685, %r14889, %r2358; + setp.lt.s32 %p686, %r14890, %r2358; + setp.lt.s32 %p687, %r14891, %r2358; + setp.lt.s32 %p688, %r14892, %r2358; + setp.lt.s32 %p689, %r14893, %r2358; + setp.lt.s32 %p690, %r14894, %r2358; + setp.lt.s32 %p691, %r14895, %r2358; + setp.lt.s32 %p692, %r14896, %r2358; + setp.lt.s32 %p693, %r14897, %r2358; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r10150, %r14900, 14; + add.s32 %r8451, %r7585, %r10150; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10152, %r14898, 8; + add.s32 %r10153, %r7585, 98304; + add.s32 %r10154, %r10153, %r10152; + add.s32 %r10155, %r10154, %r767; + ld.shared.v2.b32 {%r10156, %r10157}, [%r10155]; + ld.shared.v2.b32 {%r10158, %r10159}, [%r10155+32]; + ld.shared.v2.b32 {%r10160, %r10161}, [%r10155+64]; + ld.shared.v2.b32 {%r10162, %r10163}, [%r10155+96]; + ld.shared.v2.b32 {%r10164, %r10165}, [%r10155+128]; + ld.shared.v2.b32 {%r10166, %r10167}, [%r10155+160]; + ld.shared.v2.b32 {%r10168, %r10169}, [%r10155+192]; + ld.shared.v2.b32 {%r10170, %r10171}, [%r10155+224]; + .loc 1 675 26 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:675:26 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.eq.f32 %p694, %r10156, 0fFF800000; + setp.eq.f32 %p695, %r10157, 0fFF800000; + setp.eq.f32 %p696, %r10158, 0fFF800000; + setp.eq.f32 %p697, %r10159, 0fFF800000; + setp.eq.f32 %p698, %r10160, 0fFF800000; + setp.eq.f32 %p699, %r10161, 0fFF800000; + setp.eq.f32 %p700, %r10162, 0fFF800000; + setp.eq.f32 %p701, %r10163, 0fFF800000; + setp.eq.f32 %p702, %r10164, 0fFF800000; + setp.eq.f32 %p703, %r10165, 0fFF800000; + setp.eq.f32 %p704, %r10166, 0fFF800000; + setp.eq.f32 %p705, %r10167, 0fFF800000; + setp.eq.f32 %p706, %r10168, 0fFF800000; + setp.eq.f32 %p707, %r10169, 0fFF800000; + setp.eq.f32 %p708, %r10170, 0fFF800000; + setp.eq.f32 %p709, %r10171, 0fFF800000; + .loc 1 675 46 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:675:46 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10172, 0f00000000, %r10156, %p694; + selp.f32 %r10173, 0f00000000, %r10157, %p695; + selp.f32 %r10174, 0f00000000, %r10158, %p696; + selp.f32 %r10175, 0f00000000, %r10159, %p697; + selp.f32 %r10176, 0f00000000, %r10160, %p698; + selp.f32 %r10177, 0f00000000, %r10161, %p699; + selp.f32 %r10178, 0f00000000, %r10162, %p700; + selp.f32 %r10179, 0f00000000, %r10163, %p701; + selp.f32 %r10180, 0f00000000, %r10164, %p702; + selp.f32 %r10181, 0f00000000, %r10165, %p703; + selp.f32 %r10182, 0f00000000, %r10166, %p704; + selp.f32 %r10183, 0f00000000, %r10167, %p705; + selp.f32 %r10184, 0f00000000, %r10168, %p706; + selp.f32 %r10185, 0f00000000, %r10169, %p707; + selp.f32 %r10186, 0f00000000, %r10170, %p708; + selp.f32 %r10187, 0f00000000, %r10171, %p709; + .loc 1 676 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:676:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shfl.sync.idx.b32 %r10188, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r10189, %r10188, 11; + and.b32 %r10190, %r10189, 8192; + add.s32 %r8410, %r7585, 99328; + add.s32 %r10191, %r10190, %r8410; + bfe.u32 %r10192, %r10191, 4, 14; + cvt.u64.u32 %rd804, %r10192; + or.b64 %rd718, %rd804, 4611686293372403712; + bfe.u32 %r10193, %r8451, 4, 14; + cvt.u64.u32 %rd805, %r10193; + or.b64 %rd719, %rd805, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd718, %rd719, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r10194, %r10190, 32; + add.s32 %r10195, %r10194, %r8410; + bfe.u32 %r10196, %r10195, 4, 14; + cvt.u64.u32 %rd806, %r10196; + or.b64 %rd720, %rd806, 4611686293372403712; + add.s32 %r10197, %r8451, 32; + bfe.u32 %r10198, %r10197, 4, 14; + cvt.u64.u32 %rd807, %r10198; + or.b64 %rd721, %rd807, 4611686293338849280; + mov.pred %p619, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd720, %rd721, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10199, %r10190, 64; + add.s32 %r10200, %r10199, %r8410; + bfe.u32 %r10201, %r10200, 4, 14; + cvt.u64.u32 %rd808, %r10201; + or.b64 %rd722, %rd808, 4611686293372403712; + add.s32 %r10202, %r8451, 64; + bfe.u32 %r10203, %r10202, 4, 14; + cvt.u64.u32 %rd809, %r10203; + or.b64 %rd723, %rd809, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd722, %rd723, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10204, %r10190, 96; + add.s32 %r10205, %r10204, %r8410; + bfe.u32 %r10206, %r10205, 4, 14; + cvt.u64.u32 %rd810, %r10206; + or.b64 %rd724, %rd810, 4611686293372403712; + add.s32 %r10207, %r8451, 96; + bfe.u32 %r10208, %r10207, 4, 14; + cvt.u64.u32 %rd811, %r10208; + or.b64 %rd725, %rd811, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd724, %rd725, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10209, %r10190, 16384; + add.s32 %r10210, %r10209, %r8410; + bfe.u32 %r10211, %r10210, 4, 14; + cvt.u64.u32 %rd812, %r10211; + or.b64 %rd726, %rd812, 4611686293372403712; + add.s32 %r10212, %r8451, 8192; + bfe.u32 %r10213, %r10212, 4, 14; + cvt.u64.u32 %rd813, %r10213; + or.b64 %rd727, %rd813, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd726, %rd727, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10214, %r10190, 16416; + add.s32 %r10215, %r10214, %r8410; + bfe.u32 %r10216, %r10215, 4, 14; + cvt.u64.u32 %rd814, %r10216; + or.b64 %rd728, %rd814, 4611686293372403712; + add.s32 %r10217, %r8451, 8224; + bfe.u32 %r10218, %r10217, 4, 14; + cvt.u64.u32 %rd815, %r10218; + or.b64 %rd729, %rd815, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd728, %rd729, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10219, %r10190, 16448; + add.s32 %r10220, %r10219, %r8410; + bfe.u32 %r10221, %r10220, 4, 14; + cvt.u64.u32 %rd816, %r10221; + or.b64 %rd730, %rd816, 4611686293372403712; + add.s32 %r10222, %r8451, 8256; + bfe.u32 %r10223, %r10222, 4, 14; + cvt.u64.u32 %rd817, %r10223; + or.b64 %rd731, %rd817, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd730, %rd731, %p619, 1, 1, 0, 0; + // end inline asm + or.b32 %r10224, %r10190, 16480; + add.s32 %r10225, %r10224, %r8410; + bfe.u32 %r10226, %r10225, 4, 14; + cvt.u64.u32 %rd818, %r10226; + or.b64 %rd732, %rd818, 4611686293372403712; + add.s32 %r10227, %r8451, 8288; + bfe.u32 %r10228, %r10227, 4, 14; + cvt.u64.u32 %rd819, %r10228; + or.b64 %rd733, %rd819, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025}, %rd732, %rd733, %p619, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r8413, %r8451; + mov.b32 %r8411, %r8449; + mov.b32 %r8412, %r8449; + mov.b32 %r8414, %r8449; + mov.b32 %r8415, %r8449; + // begin inline asm + // wait for regs: %r7994,%r7995,%r7996,%r7997,%r7998,%r7999,%r8000,%r8001,%r8002,%r8003,%r8004,%r8005,%r8006,%r8007,%r8008,%r8009,%r8010,%r8011,%r8012,%r8013,%r8014,%r8015,%r8016,%r8017,%r8018,%r8019,%r8020,%r8021,%r8022,%r8023,%r8024,%r8025,%r8410,%r8411,%r8412,%r8413,%r8414,%r8415 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:678:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10229, %r7994, 0f3DB504F3; + mul.f32 %r10230, %r7995, 0f3DB504F3; + mul.f32 %r10231, %r7996, 0f3DB504F3; + mul.f32 %r10232, %r7997, 0f3DB504F3; + mul.f32 %r10233, %r7998, 0f3DB504F3; + mul.f32 %r10234, %r7999, 0f3DB504F3; + mul.f32 %r10235, %r8000, 0f3DB504F3; + mul.f32 %r10236, %r8001, 0f3DB504F3; + mul.f32 %r10237, %r8002, 0f3DB504F3; + mul.f32 %r10238, %r8003, 0f3DB504F3; + mul.f32 %r10239, %r8004, 0f3DB504F3; + mul.f32 %r10240, %r8005, 0f3DB504F3; + mul.f32 %r10241, %r8006, 0f3DB504F3; + mul.f32 %r10242, %r8007, 0f3DB504F3; + mul.f32 %r10243, %r8008, 0f3DB504F3; + mul.f32 %r10244, %r8009, 0f3DB504F3; + mul.f32 %r10245, %r8010, 0f3DB504F3; + mul.f32 %r10246, %r8011, 0f3DB504F3; + mul.f32 %r10247, %r8012, 0f3DB504F3; + mul.f32 %r10248, %r8013, 0f3DB504F3; + mul.f32 %r10249, %r8014, 0f3DB504F3; + mul.f32 %r10250, %r8015, 0f3DB504F3; + mul.f32 %r10251, %r8016, 0f3DB504F3; + mul.f32 %r10252, %r8017, 0f3DB504F3; + mul.f32 %r10253, %r8018, 0f3DB504F3; + mul.f32 %r10254, %r8019, 0f3DB504F3; + mul.f32 %r10255, %r8020, 0f3DB504F3; + mul.f32 %r10256, %r8021, 0f3DB504F3; + mul.f32 %r10257, %r8022, 0f3DB504F3; + mul.f32 %r10258, %r8023, 0f3DB504F3; + mul.f32 %r10259, %r8024, 0f3DB504F3; + mul.f32 %r10260, %r8025, 0f3DB504F3; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10261, %r14881, %r2358; + rem.s32 %r10262, %r14882, %r2358; + .loc 1 698 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:698:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.le.s32 %p710, %r980, %r10261; + setp.le.s32 %p711, %r980, %r10262; + setp.le.s32 %p712, %r725, %r10261; + setp.le.s32 %p713, %r725, %r10262; + .loc 1 703 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:703:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.s64.s32 %rd820, %r10261; + cvt.s64.s32 %rd821, %r10262; + .loc 1 704 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:704:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.gt.s64 %p714, %rd561, %rd820; + setp.gt.s64 %p715, %rd561, %rd821; + .loc 1 706 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:706:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p716, %p710, %p714; + and.pred %p717, %p711, %p715; + and.pred %p718, %p712, %p714; + and.pred %p719, %p713, %p715; + .loc 1 722 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:722:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10263, %r980, %r10261; + sub.s32 %r10264, %r980, %r10262; + sub.s32 %r10265, %r725, %r10261; + sub.s32 %r10266, %r725, %r10262; + .loc 1 723 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:723:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10267, %r10263, %r2366; + rem.s32 %r10268, %r10264, %r2366; + rem.s32 %r10269, %r10265, %r2366; + rem.s32 %r10270, %r10266, %r2366; + .loc 1 724 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:724:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p720, %r10267, 0; + setp.ne.b32 %p721, %r10268, 0; + setp.ne.b32 %p722, %r10269, 0; + setp.ne.b32 %p723, %r10270, 0; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10271, %r15070, %r2358; + rem.s32 %r10272, %r15069, %r2358; + rem.s32 %r10273, %r15068, %r2358; + rem.s32 %r10274, %r15067, %r2358; + .loc 1 698 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:698:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ge.s32 %p724, %r10271, %r980; + setp.ge.s32 %p725, %r10272, %r980; + setp.ge.s32 %p726, %r10271, %r725; + setp.ge.s32 %p727, %r10272, %r725; + setp.ge.s32 %p728, %r10273, %r980; + setp.ge.s32 %p729, %r10274, %r980; + setp.ge.s32 %p730, %r10273, %r725; + setp.ge.s32 %p731, %r10274, %r725; + .loc 1 703 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:703:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.s64.s32 %rd822, %r10271; + cvt.s64.s32 %rd823, %r10272; + cvt.s64.s32 %rd824, %r10273; + cvt.s64.s32 %rd825, %r10274; + .loc 1 704 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:704:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.gt.s64 %p732, %rd561, %rd822; + setp.gt.s64 %p733, %rd561, %rd823; + setp.gt.s64 %p734, %rd561, %rd824; + setp.gt.s64 %p735, %rd561, %rd825; + .loc 1 706 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:706:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p736, %p724, %p732; + and.pred %p737, %p725, %p733; + and.pred %p738, %p726, %p732; + and.pred %p739, %p727, %p733; + and.pred %p740, %p728, %p734; + and.pred %p741, %p729, %p735; + and.pred %p742, %p730, %p734; + and.pred %p743, %p731, %p735; + .loc 1 722 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:722:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10279, %r725, %r10274; + sub.s32 %r10280, %r725, %r10273; + sub.s32 %r10281, %r980, %r10274; + sub.s32 %r10282, %r980, %r10273; + sub.s32 %r10283, %r725, %r10272; + sub.s32 %r10284, %r725, %r10271; + sub.s32 %r10285, %r980, %r10272; + sub.s32 %r10286, %r980, %r10271; + .loc 1 723 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:723:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10287, %r10286, %r2366; + rem.s32 %r10288, %r10285, %r2366; + rem.s32 %r10289, %r10284, %r2366; + rem.s32 %r10290, %r10283, %r2366; + rem.s32 %r10291, %r10282, %r2366; + rem.s32 %r10292, %r10281, %r2366; + rem.s32 %r10293, %r10280, %r2366; + rem.s32 %r10294, %r10279, %r2366; + .loc 1 724 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:724:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p744, %r10294, 0; + setp.ne.b32 %p745, %r10293, 0; + setp.ne.b32 %p746, %r10292, 0; + setp.ne.b32 %p747, %r10291, 0; + setp.ne.b32 %p748, %r10290, 0; + setp.ne.b32 %p749, %r10289, 0; + setp.ne.b32 %p750, %r10288, 0; + setp.ne.b32 %p751, %r10287, 0; + .loc 1 726 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:726:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + xor.b32 %r10295, %r10267, %r2366; + xor.b32 %r10296, %r10268, %r2366; + xor.b32 %r10297, %r10269, %r2366; + xor.b32 %r10298, %r10293, %r2366; + xor.b32 %r10299, %r10292, %r2366; + xor.b32 %r10300, %r10291, %r2366; + xor.b32 %r10301, %r10290, %r2366; + xor.b32 %r10302, %r10289, %r2366; + xor.b32 %r10303, %r10288, %r2366; + xor.b32 %r10304, %r10287, %r2366; + xor.b32 %r10305, %r10270, %r2366; + xor.b32 %r10306, %r10294, %r2366; + .loc 1 729 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:729:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.s32 %r10307, %r10295, 31; + and.b32 %r10308, %r10307, %r2366; + selp.b32 %r10309, %r10308, 0, %p720; + shr.s32 %r10310, %r10296, 31; + and.b32 %r10311, %r10310, %r2366; + selp.b32 %r10312, %r10311, 0, %p721; + shr.s32 %r10313, %r10297, 31; + and.b32 %r10314, %r10313, %r2366; + selp.b32 %r10315, %r10314, 0, %p722; + shr.s32 %r10316, %r10305, 31; + and.b32 %r10317, %r10316, %r2366; + selp.b32 %r10318, %r10317, 0, %p723; + shr.s32 %r10319, %r10304, 31; + and.b32 %r10320, %r10319, %r2366; + selp.b32 %r10321, %r10320, 0, %p751; + shr.s32 %r10322, %r10303, 31; + and.b32 %r10323, %r10322, %r2366; + selp.b32 %r10324, %r10323, 0, %p750; + shr.s32 %r10325, %r10302, 31; + and.b32 %r10326, %r10325, %r2366; + selp.b32 %r10327, %r10326, 0, %p749; + shr.s32 %r10328, %r10301, 31; + and.b32 %r10329, %r10328, %r2366; + selp.b32 %r10330, %r10329, 0, %p748; + shr.s32 %r10331, %r10300, 31; + and.b32 %r10332, %r10331, %r2366; + selp.b32 %r10333, %r10332, 0, %p747; + shr.s32 %r10334, %r10299, 31; + and.b32 %r10335, %r10334, %r2366; + selp.b32 %r10336, %r10335, 0, %p746; + shr.s32 %r10337, %r10298, 31; + and.b32 %r10338, %r10337, %r2366; + selp.b32 %r10339, %r10338, 0, %p745; + shr.s32 %r10340, %r10306, 31; + and.b32 %r10341, %r10340, %r2366; + selp.b32 %r10342, %r10341, 0, %p744; + .loc 1 730 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:730:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + neg.s32 %r10343, %r10309; + setp.eq.b32 %p752, %r10267, %r10343; + neg.s32 %r10344, %r10312; + setp.eq.b32 %p753, %r10268, %r10344; + neg.s32 %r10345, %r10315; + setp.eq.b32 %p754, %r10269, %r10345; + neg.s32 %r10346, %r10318; + setp.eq.b32 %p755, %r10270, %r10346; + neg.s32 %r10347, %r10321; + setp.eq.b32 %p756, %r10287, %r10347; + neg.s32 %r10348, %r10324; + setp.eq.b32 %p757, %r10288, %r10348; + neg.s32 %r10349, %r10327; + setp.eq.b32 %p758, %r10289, %r10349; + neg.s32 %r10350, %r10330; + setp.eq.b32 %p759, %r10290, %r10350; + neg.s32 %r10351, %r10333; + setp.eq.b32 %p760, %r10291, %r10351; + neg.s32 %r10352, %r10336; + setp.eq.b32 %p761, %r10292, %r10352; + neg.s32 %r10353, %r10339; + setp.eq.b32 %p762, %r10293, %r10353; + neg.s32 %r10354, %r10342; + setp.eq.b32 %p763, %r10294, %r10354; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10355, %r15059, %r2358; + rem.s32 %r10356, %r15060, %r2358; + rem.s32 %r10357, %r15061, %r2358; + rem.s32 %r10358, %r15062, %r2358; + rem.s32 %r10359, %r15063, %r2358; + rem.s32 %r10360, %r15064, %r2358; + rem.s32 %r10361, %r15065, %r2358; + rem.s32 %r10362, %r15066, %r2358; + .loc 1 698 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:698:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ge.s32 %p764, %r10362, %r980; + setp.ge.s32 %p765, %r10361, %r980; + setp.ge.s32 %p766, %r10362, %r725; + setp.ge.s32 %p767, %r10361, %r725; + setp.ge.s32 %p768, %r10360, %r980; + setp.ge.s32 %p769, %r10359, %r980; + setp.ge.s32 %p770, %r10360, %r725; + setp.ge.s32 %p771, %r10359, %r725; + setp.ge.s32 %p772, %r10358, %r980; + setp.ge.s32 %p773, %r10357, %r980; + setp.ge.s32 %p774, %r10358, %r725; + setp.ge.s32 %p775, %r10357, %r725; + setp.ge.s32 %p776, %r10356, %r980; + setp.ge.s32 %p777, %r10355, %r980; + setp.ge.s32 %p778, %r10356, %r725; + setp.ge.s32 %p779, %r10355, %r725; + .loc 1 704 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:704:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.s64.s32 %rd826, %r10362; + cvt.s64.s32 %rd827, %r10361; + cvt.s64.s32 %rd828, %r10360; + cvt.s64.s32 %rd829, %r10359; + cvt.s64.s32 %rd830, %r10358; + cvt.s64.s32 %rd831, %r10357; + cvt.s64.s32 %rd832, %r10356; + cvt.s64.s32 %rd833, %r10355; + setp.gt.s64 %p780, %rd561, %rd833; + setp.gt.s64 %p781, %rd561, %rd832; + setp.gt.s64 %p782, %rd561, %rd831; + setp.gt.s64 %p783, %rd561, %rd830; + setp.gt.s64 %p784, %rd561, %rd829; + setp.gt.s64 %p785, %rd561, %rd828; + setp.gt.s64 %p786, %rd561, %rd827; + setp.gt.s64 %p787, %rd561, %rd826; + .loc 1 706 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:706:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p788, %p764, %p787; + and.pred %p789, %p765, %p786; + and.pred %p790, %p766, %p787; + and.pred %p791, %p767, %p786; + and.pred %p792, %p768, %p785; + and.pred %p793, %p769, %p784; + and.pred %p794, %p770, %p785; + and.pred %p795, %p771, %p784; + and.pred %p796, %p772, %p783; + and.pred %p797, %p773, %p782; + and.pred %p798, %p774, %p783; + and.pred %p799, %p775, %p782; + and.pred %p800, %p776, %p781; + and.pred %p801, %p777, %p780; + and.pred %p802, %p778, %p781; + and.pred %p803, %p779, %p780; + .loc 1 722 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:722:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10363, %r980, %r10362; + sub.s32 %r10364, %r980, %r10361; + sub.s32 %r10365, %r725, %r10362; + sub.s32 %r10366, %r725, %r10361; + sub.s32 %r10367, %r980, %r10360; + sub.s32 %r10368, %r980, %r10359; + sub.s32 %r10369, %r725, %r10360; + sub.s32 %r10370, %r725, %r10359; + sub.s32 %r10371, %r980, %r10358; + sub.s32 %r10372, %r980, %r10357; + sub.s32 %r10373, %r725, %r10358; + sub.s32 %r10374, %r725, %r10357; + sub.s32 %r10375, %r980, %r10356; + sub.s32 %r10376, %r980, %r10355; + sub.s32 %r10377, %r725, %r10356; + sub.s32 %r10378, %r725, %r10355; + .loc 1 723 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:723:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10379, %r10378, %r2366; + rem.s32 %r10380, %r10377, %r2366; + rem.s32 %r10381, %r10376, %r2366; + rem.s32 %r10382, %r10375, %r2366; + rem.s32 %r10383, %r10374, %r2366; + rem.s32 %r10384, %r10373, %r2366; + rem.s32 %r10385, %r10372, %r2366; + rem.s32 %r10386, %r10371, %r2366; + rem.s32 %r10387, %r10370, %r2366; + rem.s32 %r10388, %r10369, %r2366; + rem.s32 %r10389, %r10368, %r2366; + rem.s32 %r10390, %r10367, %r2366; + rem.s32 %r10391, %r10366, %r2366; + rem.s32 %r10392, %r10365, %r2366; + rem.s32 %r10393, %r10364, %r2366; + rem.s32 %r10394, %r10363, %r2366; + .loc 1 724 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:724:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p804, %r10394, 0; + setp.ne.b32 %p805, %r10393, 0; + setp.ne.b32 %p806, %r10392, 0; + setp.ne.b32 %p807, %r10391, 0; + setp.ne.b32 %p808, %r10390, 0; + setp.ne.b32 %p809, %r10389, 0; + setp.ne.b32 %p810, %r10388, 0; + setp.ne.b32 %p811, %r10387, 0; + setp.ne.b32 %p812, %r10386, 0; + setp.ne.b32 %p813, %r10385, 0; + setp.ne.b32 %p814, %r10384, 0; + setp.ne.b32 %p815, %r10383, 0; + setp.ne.b32 %p816, %r10382, 0; + setp.ne.b32 %p817, %r10381, 0; + setp.ne.b32 %p818, %r10380, 0; + setp.ne.b32 %p819, %r10379, 0; + .loc 1 726 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:726:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + xor.b32 %r10395, %r10394, %r2366; + xor.b32 %r10396, %r10393, %r2366; + xor.b32 %r10397, %r10392, %r2366; + xor.b32 %r10398, %r10391, %r2366; + xor.b32 %r10399, %r10390, %r2366; + xor.b32 %r10400, %r10389, %r2366; + xor.b32 %r10401, %r10388, %r2366; + xor.b32 %r10402, %r10387, %r2366; + xor.b32 %r10403, %r10386, %r2366; + xor.b32 %r10404, %r10385, %r2366; + xor.b32 %r10405, %r10384, %r2366; + xor.b32 %r10406, %r10383, %r2366; + xor.b32 %r10407, %r10382, %r2366; + xor.b32 %r10408, %r10381, %r2366; + xor.b32 %r10409, %r10380, %r2366; + xor.b32 %r10410, %r10379, %r2366; + .loc 1 729 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:729:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.s32 %r10411, %r10410, 31; + and.b32 %r10412, %r10411, %r2366; + selp.b32 %r10413, %r10412, 0, %p819; + shr.s32 %r10414, %r10409, 31; + and.b32 %r10415, %r10414, %r2366; + selp.b32 %r10416, %r10415, 0, %p818; + shr.s32 %r10417, %r10408, 31; + and.b32 %r10418, %r10417, %r2366; + selp.b32 %r10419, %r10418, 0, %p817; + shr.s32 %r10420, %r10407, 31; + and.b32 %r10421, %r10420, %r2366; + selp.b32 %r10422, %r10421, 0, %p816; + shr.s32 %r10423, %r10406, 31; + and.b32 %r10424, %r10423, %r2366; + selp.b32 %r10425, %r10424, 0, %p815; + shr.s32 %r10426, %r10405, 31; + and.b32 %r10427, %r10426, %r2366; + selp.b32 %r10428, %r10427, 0, %p814; + shr.s32 %r10429, %r10404, 31; + and.b32 %r10430, %r10429, %r2366; + selp.b32 %r10431, %r10430, 0, %p813; + shr.s32 %r10432, %r10403, 31; + and.b32 %r10433, %r10432, %r2366; + selp.b32 %r10434, %r10433, 0, %p812; + shr.s32 %r10435, %r10402, 31; + and.b32 %r10436, %r10435, %r2366; + selp.b32 %r10437, %r10436, 0, %p811; + shr.s32 %r10438, %r10401, 31; + and.b32 %r10439, %r10438, %r2366; + selp.b32 %r10440, %r10439, 0, %p810; + shr.s32 %r10441, %r10400, 31; + and.b32 %r10442, %r10441, %r2366; + selp.b32 %r10443, %r10442, 0, %p809; + shr.s32 %r10444, %r10399, 31; + and.b32 %r10445, %r10444, %r2366; + selp.b32 %r10446, %r10445, 0, %p808; + shr.s32 %r10447, %r10398, 31; + and.b32 %r10448, %r10447, %r2366; + selp.b32 %r10449, %r10448, 0, %p807; + shr.s32 %r10450, %r10397, 31; + and.b32 %r10451, %r10450, %r2366; + selp.b32 %r10452, %r10451, 0, %p806; + shr.s32 %r10453, %r10396, 31; + and.b32 %r10454, %r10453, %r2366; + selp.b32 %r10455, %r10454, 0, %p805; + shr.s32 %r10456, %r10395, 31; + and.b32 %r10457, %r10456, %r2366; + selp.b32 %r10458, %r10457, 0, %p804; + .loc 1 730 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:730:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + neg.s32 %r10459, %r10458; + neg.s32 %r10460, %r10455; + neg.s32 %r10461, %r10452; + neg.s32 %r10462, %r10449; + neg.s32 %r10463, %r10446; + neg.s32 %r10464, %r10443; + neg.s32 %r10465, %r10440; + neg.s32 %r10466, %r10437; + neg.s32 %r10467, %r10434; + neg.s32 %r10468, %r10431; + neg.s32 %r10469, %r10428; + neg.s32 %r10470, %r10425; + neg.s32 %r10471, %r10422; + neg.s32 %r10472, %r10419; + neg.s32 %r10473, %r10416; + neg.s32 %r10474, %r10413; + setp.eq.b32 %p820, %r10379, %r10474; + setp.eq.b32 %p821, %r10380, %r10473; + setp.eq.b32 %p822, %r10381, %r10472; + setp.eq.b32 %p823, %r10382, %r10471; + setp.eq.b32 %p824, %r10383, %r10470; + setp.eq.b32 %p825, %r10384, %r10469; + setp.eq.b32 %p826, %r10385, %r10468; + setp.eq.b32 %p827, %r10386, %r10467; + setp.eq.b32 %p828, %r10387, %r10466; + setp.eq.b32 %p829, %r10388, %r10465; + setp.eq.b32 %p830, %r10389, %r10464; + setp.eq.b32 %p831, %r10390, %r10463; + setp.eq.b32 %p832, %r10391, %r10462; + setp.eq.b32 %p833, %r10392, %r10461; + setp.eq.b32 %p834, %r10393, %r10460; + setp.eq.b32 %p835, %r10394, %r10459; + .loc 1 731 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:731:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p836, %p9, %p752; + and.pred %p837, %p9, %p753; + and.pred %p838, %p7, %p754; + and.pred %p839, %p7, %p755; + and.pred %p840, %p9, %p756; + and.pred %p841, %p9, %p757; + and.pred %p842, %p7, %p758; + and.pred %p843, %p7, %p759; + and.pred %p844, %p9, %p760; + and.pred %p845, %p9, %p761; + and.pred %p846, %p7, %p762; + and.pred %p847, %p7, %p763; + and.pred %p848, %p9, %p835; + and.pred %p849, %p9, %p834; + and.pred %p850, %p7, %p833; + and.pred %p851, %p7, %p832; + and.pred %p852, %p9, %p831; + and.pred %p853, %p9, %p830; + and.pred %p854, %p7, %p829; + and.pred %p855, %p7, %p828; + and.pred %p856, %p9, %p827; + and.pred %p857, %p9, %p826; + and.pred %p858, %p7, %p825; + and.pred %p859, %p7, %p824; + and.pred %p860, %p9, %p823; + and.pred %p861, %p9, %p822; + and.pred %p862, %p7, %p821; + and.pred %p863, %p7, %p820; + .loc 1 732 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:732:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + or.pred %p864, %p716, %p836; + or.pred %p865, %p717, %p837; + or.pred %p866, %p718, %p838; + or.pred %p867, %p719, %p839; + or.pred %p868, %p736, %p840; + or.pred %p869, %p737, %p841; + or.pred %p870, %p738, %p842; + or.pred %p871, %p739, %p843; + or.pred %p872, %p740, %p844; + or.pred %p873, %p741, %p845; + or.pred %p874, %p742, %p846; + or.pred %p875, %p743, %p847; + or.pred %p876, %p788, %p848; + or.pred %p877, %p789, %p849; + or.pred %p878, %p790, %p850; + or.pred %p879, %p791, %p851; + or.pred %p880, %p792, %p852; + or.pred %p881, %p793, %p853; + or.pred %p882, %p794, %p854; + or.pred %p883, %p795, %p855; + or.pred %p884, %p796, %p856; + or.pred %p885, %p797, %p857; + or.pred %p886, %p798, %p858; + or.pred %p887, %p799, %p859; + or.pred %p888, %p800, %p860; + or.pred %p889, %p801, %p861; + or.pred %p890, %p802, %p862; + or.pred %p891, %p803, %p863; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10475, %r15054, %r2358; + rem.s32 %r10476, %r15053, %r2358; + .loc 1 698 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:698:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.le.s32 %p892, %r725, %r10476; + setp.le.s32 %p893, %r725, %r10475; + .loc 1 704 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:704:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.s64.s32 %rd834, %r10475; + cvt.s64.s32 %rd835, %r10476; + setp.gt.s64 %p894, %rd561, %rd835; + setp.gt.s64 %p895, %rd561, %rd834; + .loc 1 706 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:706:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p896, %p893, %p895; + and.pred %p897, %p892, %p894; + .loc 1 722 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:722:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10477, %r725, %r10475; + sub.s32 %r10478, %r725, %r10476; + .loc 1 723 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:723:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10479, %r10478, %r2366; + rem.s32 %r10480, %r10477, %r2366; + .loc 1 724 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:724:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p898, %r10480, 0; + setp.ne.b32 %p899, %r10479, 0; + .loc 1 726 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:726:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + xor.b32 %r10481, %r10480, %r2366; + xor.b32 %r10482, %r10479, %r2366; + .loc 1 729 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:729:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.s32 %r10483, %r10482, 31; + and.b32 %r10484, %r10483, %r2366; + selp.b32 %r10485, %r10484, 0, %p899; + shr.s32 %r10486, %r10481, 31; + and.b32 %r10487, %r10486, %r2366; + selp.b32 %r10488, %r10487, 0, %p898; + .loc 1 730 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:730:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + neg.s32 %r10489, %r10488; + neg.s32 %r10490, %r10485; + setp.eq.b32 %p900, %r10479, %r10490; + setp.eq.b32 %p901, %r10480, %r10489; + .loc 1 731 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:731:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p902, %p7, %p901; + and.pred %p903, %p7, %p900; + .loc 1 732 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:732:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + or.pred %p904, %p897, %p903; + or.pred %p905, %p896, %p902; + .loc 1 698 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:698:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.le.s32 %p906, %r980, %r10476; + setp.le.s32 %p907, %r980, %r10475; + .loc 1 706 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:706:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p908, %p907, %p895; + and.pred %p909, %p906, %p894; + .loc 1 722 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:722:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10491, %r980, %r10475; + sub.s32 %r10492, %r980, %r10476; + .loc 1 723 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:723:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + rem.s32 %r10493, %r10492, %r2366; + rem.s32 %r10494, %r10491, %r2366; + .loc 1 724 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:724:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p910, %r10494, 0; + setp.ne.b32 %p911, %r10493, 0; + .loc 1 726 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:726:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + xor.b32 %r10495, %r10494, %r2366; + xor.b32 %r10496, %r10493, %r2366; + .loc 1 729 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:729:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.s32 %r10497, %r10496, 31; + and.b32 %r10498, %r10497, %r2366; + selp.b32 %r10499, %r10498, 0, %p911; + shr.s32 %r10500, %r10495, 31; + and.b32 %r10501, %r10500, %r2366; + selp.b32 %r10502, %r10501, 0, %p910; + .loc 1 730 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:730:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + neg.s32 %r10503, %r10502; + neg.s32 %r10504, %r10499; + setp.eq.b32 %p912, %r10493, %r10504; + setp.eq.b32 %p913, %r10494, %r10503; + .loc 1 731 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:731:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p914, %p9, %p913; + and.pred %p915, %p9, %p912; + .loc 1 732 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:732:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + or.pred %p916, %p909, %p915; + or.pred %p917, %p908, %p914; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p918, %p917, %p679; + and.pred %p919, %p916, %p678; + and.pred %p920, %p905, %p679; + and.pred %p921, %p904, %p678; + and.pred %p922, %p864, %p680; + and.pred %p923, %p865, %p681; + and.pred %p924, %p866, %p680; + and.pred %p925, %p867, %p681; + and.pred %p926, %p868, %p682; + and.pred %p927, %p869, %p683; + and.pred %p928, %p870, %p682; + and.pred %p929, %p871, %p683; + and.pred %p930, %p872, %p684; + and.pred %p931, %p873, %p685; + and.pred %p932, %p874, %p684; + and.pred %p933, %p875, %p685; + and.pred %p934, %p876, %p686; + and.pred %p935, %p877, %p687; + and.pred %p936, %p878, %p686; + and.pred %p937, %p879, %p687; + and.pred %p938, %p880, %p688; + and.pred %p939, %p881, %p689; + and.pred %p940, %p882, %p688; + and.pred %p941, %p883, %p689; + and.pred %p942, %p884, %p690; + and.pred %p943, %p885, %p691; + and.pred %p944, %p886, %p690; + and.pred %p945, %p887, %p691; + and.pred %p946, %p888, %p692; + and.pred %p947, %p889, %p693; + and.pred %p948, %p890, %p692; + and.pred %p949, %p891, %p693; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10505, %r10229, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10506, %r10505, 0fFF800000, %p919; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10507, %r10230, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10508, %r10507, 0fFF800000, %p918; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10509, %r10231, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10510, %r10509, 0fFF800000, %p921; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10511, %r10232, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10512, %r10511, 0fFF800000, %p920; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10513, %r10233, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10514, %r10513, 0fFF800000, %p922; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10515, %r10234, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10516, %r10515, 0fFF800000, %p923; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10517, %r10235, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10518, %r10517, 0fFF800000, %p924; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10519, %r10236, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10520, %r10519, 0fFF800000, %p925; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10521, %r10237, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10522, %r10521, 0fFF800000, %p926; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10523, %r10238, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10524, %r10523, 0fFF800000, %p927; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10525, %r10239, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10526, %r10525, 0fFF800000, %p928; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10527, %r10240, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10528, %r10527, 0fFF800000, %p929; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10529, %r10241, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10530, %r10529, 0fFF800000, %p930; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10531, %r10242, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10532, %r10531, 0fFF800000, %p931; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10533, %r10243, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10534, %r10533, 0fFF800000, %p932; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10535, %r10244, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10536, %r10535, 0fFF800000, %p933; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10537, %r10245, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10538, %r10537, 0fFF800000, %p934; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10539, %r10246, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10540, %r10539, 0fFF800000, %p935; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10541, %r10247, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10542, %r10541, 0fFF800000, %p936; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10543, %r10248, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10544, %r10543, 0fFF800000, %p937; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10545, %r10249, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10546, %r10545, 0fFF800000, %p938; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10547, %r10250, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10548, %r10547, 0fFF800000, %p939; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10549, %r10251, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10550, %r10549, 0fFF800000, %p940; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10551, %r10252, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10552, %r10551, 0fFF800000, %p941; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10553, %r10253, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10554, %r10553, 0fFF800000, %p942; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10555, %r10254, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10556, %r10555, 0fFF800000, %p943; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10557, %r10255, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10558, %r10557, 0fFF800000, %p944; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10559, %r10256, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10560, %r10559, 0fFF800000, %p945; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10561, %r10257, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10562, %r10561, 0fFF800000, %p946; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10563, %r10258, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10564, %r10563, 0fFF800000, %p947; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10565, %r10259, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10566, %r10565, 0fFF800000, %p948; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10567, %r10260, 0f3FB8AA3B; + .loc 1 736 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:736:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.f32 %r10568, %r10567, 0fFF800000, %p949; + .loc 1 740 40 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:740:40 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.f32 %r10569, %r10506, %r10172; + sub.f32 %r10570, %r10508, %r10173; + sub.f32 %r10571, %r10510, %r10172; + sub.f32 %r10572, %r10512, %r10173; + sub.f32 %r10573, %r10514, %r10174; + sub.f32 %r10574, %r10516, %r10175; + sub.f32 %r10575, %r10518, %r10174; + sub.f32 %r10576, %r10520, %r10175; + sub.f32 %r10577, %r10522, %r10176; + sub.f32 %r10578, %r10524, %r10177; + sub.f32 %r10579, %r10526, %r10176; + sub.f32 %r10580, %r10528, %r10177; + sub.f32 %r10581, %r10530, %r10178; + sub.f32 %r10582, %r10532, %r10179; + sub.f32 %r10583, %r10534, %r10178; + sub.f32 %r10584, %r10536, %r10179; + sub.f32 %r10585, %r10538, %r10180; + sub.f32 %r10586, %r10540, %r10181; + sub.f32 %r10587, %r10542, %r10180; + sub.f32 %r10588, %r10544, %r10181; + sub.f32 %r10589, %r10546, %r10182; + sub.f32 %r10590, %r10548, %r10183; + sub.f32 %r10591, %r10550, %r10182; + sub.f32 %r10592, %r10552, %r10183; + sub.f32 %r10593, %r10554, %r10184; + sub.f32 %r10594, %r10556, %r10185; + sub.f32 %r10595, %r10558, %r10184; + sub.f32 %r10596, %r10560, %r10185; + sub.f32 %r10597, %r10562, %r10186; + sub.f32 %r10598, %r10564, %r10187; + sub.f32 %r10599, %r10566, %r10186; + sub.f32 %r10600, %r10568, %r10187; + .loc 1 740 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:740:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + ex2.approx.ftz.f32 %r10601, %r10569; + ex2.approx.ftz.f32 %r10602, %r10570; + ex2.approx.ftz.f32 %r10603, %r10571; + ex2.approx.ftz.f32 %r10604, %r10572; + ex2.approx.ftz.f32 %r10605, %r10573; + ex2.approx.ftz.f32 %r10606, %r10574; + ex2.approx.ftz.f32 %r10607, %r10575; + ex2.approx.ftz.f32 %r10608, %r10576; + ex2.approx.ftz.f32 %r10609, %r10577; + ex2.approx.ftz.f32 %r10610, %r10578; + ex2.approx.ftz.f32 %r10611, %r10579; + ex2.approx.ftz.f32 %r10612, %r10580; + ex2.approx.ftz.f32 %r10613, %r10581; + ex2.approx.ftz.f32 %r10614, %r10582; + ex2.approx.ftz.f32 %r10615, %r10583; + ex2.approx.ftz.f32 %r10616, %r10584; + ex2.approx.ftz.f32 %r10617, %r10585; + ex2.approx.ftz.f32 %r10618, %r10586; + ex2.approx.ftz.f32 %r10619, %r10587; + ex2.approx.ftz.f32 %r10620, %r10588; + ex2.approx.ftz.f32 %r10621, %r10589; + ex2.approx.ftz.f32 %r10622, %r10590; + ex2.approx.ftz.f32 %r10623, %r10591; + ex2.approx.ftz.f32 %r10624, %r10592; + ex2.approx.ftz.f32 %r10625, %r10593; + ex2.approx.ftz.f32 %r10626, %r10594; + ex2.approx.ftz.f32 %r10627, %r10595; + ex2.approx.ftz.f32 %r10628, %r10596; + ex2.approx.ftz.f32 %r10629, %r10597; + ex2.approx.ftz.f32 %r10630, %r10598; + ex2.approx.ftz.f32 %r10631, %r10599; + ex2.approx.ftz.f32 %r10632, %r10600; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10633, %r7585, 49152; + add.s32 %r9497, %r10633, %r10150; + .loc 1 744 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:744:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16x2.f32 %r8582, %r10602, %r10601; + cvt.rn.bf16x2.f32 %r8583, %r10604, %r10603; + cvt.rn.bf16x2.f32 %r8584, %r10606, %r10605; + cvt.rn.bf16x2.f32 %r8585, %r10608, %r10607; + cvt.rn.bf16x2.f32 %r8714, %r10610, %r10609; + cvt.rn.bf16x2.f32 %r8715, %r10612, %r10611; + cvt.rn.bf16x2.f32 %r8716, %r10614, %r10613; + cvt.rn.bf16x2.f32 %r8717, %r10616, %r10615; + cvt.rn.bf16x2.f32 %r8846, %r10618, %r10617; + cvt.rn.bf16x2.f32 %r8847, %r10620, %r10619; + cvt.rn.bf16x2.f32 %r8848, %r10622, %r10621; + cvt.rn.bf16x2.f32 %r8849, %r10624, %r10623; + cvt.rn.bf16x2.f32 %r8978, %r10626, %r10625; + cvt.rn.bf16x2.f32 %r8979, %r10628, %r10627; + cvt.rn.bf16x2.f32 %r8980, %r10630, %r10629; + cvt.rn.bf16x2.f32 %r8981, %r10632, %r10631; + .loc 1 744 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:744:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + wgmma.fence.sync.aligned; + bfe.u32 %r10634, %r9497, 4, 14; + cvt.u64.u32 %rd836, %r10634; + or.b64 %rd734, %rd836, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8582,%r8583,%r8584,%r8585}, %rd734, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10635, %r9497, 2048; + bfe.u32 %r10636, %r10635, 4, 14; + cvt.u64.u32 %rd837, %r10636; + or.b64 %rd735, %rd837, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8714,%r8715,%r8716,%r8717}, %rd735, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10637, %r9497, 4096; + bfe.u32 %r10638, %r10637, 4, 14; + cvt.u64.u32 %rd838, %r10638; + or.b64 %rd736, %rd838, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8846,%r8847,%r8848,%r8849}, %rd736, %p619, 1, 1, 1; + // end inline asm + add.s32 %r10639, %r9497, 6144; + bfe.u32 %r10640, %r10639, 4, 14; + cvt.u64.u32 %rd839, %r10640; + or.b64 %rd737, %rd839, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r8978,%r8979,%r8980,%r8981}, %rd737, %p619, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10641, %r7585, 98816; + add.s32 %r10642, %r10641, %r10152; + add.s32 %r10643, %r10642, %r767; + ld.shared.v2.b32 {%r10644, %r10645}, [%r10643+32]; + ld.shared.v2.b32 {%r10646, %r10647}, [%r10643+64]; + ld.shared.v2.b32 {%r10648, %r10649}, [%r10643+96]; + ld.shared.v2.b32 {%r10650, %r10651}, [%r10643+128]; + ld.shared.v2.b32 {%r10652, %r10653}, [%r10643+160]; + ld.shared.v2.b32 {%r10654, %r10655}, [%r10643+192]; + ld.shared.v2.b32 {%r10656, %r10657}, [%r10643+224]; + .loc 1 750 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:750:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r9494, %r7585, 132096; + add.s32 %r10658, %r10190, %r9494; + bfe.u32 %r10659, %r10658, 4, 14; + cvt.u64.u32 %rd840, %r10659; + or.b64 %rd738, %rd840, 4611686293372403712; + add.s32 %r10660, %r10194, %r9494; + bfe.u32 %r10661, %r10660, 4, 14; + cvt.u64.u32 %rd841, %r10661; + or.b64 %rd740, %rd841, 4611686293372403712; + add.s32 %r10662, %r9497, 32; + bfe.u32 %r10663, %r10662, 4, 14; + cvt.u64.u32 %rd842, %r10663; + or.b64 %rd741, %rd842, 4611686293338849280; + add.s32 %r10664, %r10199, %r9494; + bfe.u32 %r10665, %r10664, 4, 14; + cvt.u64.u32 %rd843, %r10665; + or.b64 %rd742, %rd843, 4611686293372403712; + add.s32 %r10666, %r9497, 64; + bfe.u32 %r10667, %r10666, 4, 14; + cvt.u64.u32 %rd844, %r10667; + or.b64 %rd743, %rd844, 4611686293338849280; + add.s32 %r10668, %r10204, %r9494; + bfe.u32 %r10669, %r10668, 4, 14; + cvt.u64.u32 %rd845, %r10669; + or.b64 %rd744, %rd845, 4611686293372403712; + add.s32 %r10670, %r9497, 96; + bfe.u32 %r10671, %r10670, 4, 14; + cvt.u64.u32 %rd846, %r10671; + or.b64 %rd745, %rd846, 4611686293338849280; + add.s32 %r10672, %r10209, %r9494; + bfe.u32 %r10673, %r10672, 4, 14; + cvt.u64.u32 %rd847, %r10673; + or.b64 %rd746, %rd847, 4611686293372403712; + add.s32 %r10674, %r9497, 8192; + bfe.u32 %r10675, %r10674, 4, 14; + cvt.u64.u32 %rd848, %r10675; + or.b64 %rd747, %rd848, 4611686293338849280; + add.s32 %r10676, %r10214, %r9494; + bfe.u32 %r10677, %r10676, 4, 14; + cvt.u64.u32 %rd849, %r10677; + or.b64 %rd748, %rd849, 4611686293372403712; + add.s32 %r10678, %r9497, 8224; + bfe.u32 %r10679, %r10678, 4, 14; + cvt.u64.u32 %rd850, %r10679; + or.b64 %rd749, %rd850, 4611686293338849280; + add.s32 %r10680, %r10219, %r9494; + bfe.u32 %r10681, %r10680, 4, 14; + cvt.u64.u32 %rd851, %r10681; + or.b64 %rd750, %rd851, 4611686293372403712; + add.s32 %r10682, %r9497, 8256; + bfe.u32 %r10683, %r10682, 4, 14; + cvt.u64.u32 %rd852, %r10683; + or.b64 %rd751, %rd852, 4611686293338849280; + add.s32 %r10684, %r10224, %r9494; + bfe.u32 %r10685, %r10684, 4, 14; + cvt.u64.u32 %rd853, %r10685; + or.b64 %rd752, %rd853, 4611686293372403712; + add.s32 %r10686, %r9497, 8288; + bfe.u32 %r10687, %r10686, 4, 14; + cvt.u64.u32 %rd854, %r10687; + or.b64 %rd753, %rd854, 4611686293338849280; + .loc 1 775 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10688, %r8451, 2048; + bfe.u32 %r10689, %r10688, 4, 14; + cvt.u64.u32 %rd855, %r10689; + or.b64 %rd755, %rd855, 4611686293338849280; + add.s32 %r10690, %r8451, 4096; + bfe.u32 %r10691, %r10690, 4, 14; + cvt.u64.u32 %rd856, %r10691; + or.b64 %rd756, %rd856, 4611686293338849280; + add.s32 %r10692, %r8451, 6144; + bfe.u32 %r10693, %r10692, 4, 14; + cvt.u64.u32 %rd857, %r10693; + or.b64 %rd757, %rd857, 4611686293338849280; + .loc 1 628 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:628:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r15053, %r15053, %r14883; + add.s32 %r15054, %r15054, %r14883; + add.s32 %r14881, %r14881, %r14883; + add.s32 %r14882, %r14882, %r14883; + add.s32 %r15067, %r15067, %r14883; + add.s32 %r15068, %r15068, %r14883; + add.s32 %r15069, %r15069, %r14883; + add.s32 %r15070, %r15070, %r14883; + add.s32 %r15059, %r15059, %r14883; + add.s32 %r15060, %r15060, %r14883; + add.s32 %r15061, %r15061, %r14883; + add.s32 %r15062, %r15062, %r14883; + add.s32 %r15063, %r15063, %r14883; + add.s32 %r15064, %r15064, %r14883; + add.s32 %r15065, %r15065, %r14883; + add.s32 %r15066, %r15066, %r14883; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r1424, %r15052, 1; + .loc 1 788 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:788:33 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shr.u32 %r10694, %r1424, 1; + .loc 1 789 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mad.wide.u32 %rd759, %r10694, 4, %rd559; + .loc 1 790 109 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:109 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10695, %r10694, 1; + .loc 1 790 113 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:113 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p950, %r10695, %r7561; + .loc 1 790 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:55 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd762, %rd759, 4; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.pred %p642, %p641, %p950; + .loc 1 791 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:791:35 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + and.b32 %r10696, %r15052, 1; + .loc 1 793 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + xor.b32 %r10697, %r10696, 1; + .loc 1 793 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10698, %r10696, 6; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + ld.shared.v2.b32 {%r10699, %r10700}, [%r10643]; + .loc 1 750 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:750:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd738, %rd734, 0, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd740, %rd741, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd742, %rd743, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd744, %rd745, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd746, %rd747, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd748, %rd749, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd750, %rd751, %p619, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109}, %rd752, %rd753, %p619, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r9499, %r8449; + mov.b32 %r9495, %r8449; + mov.b32 %r9496, %r8449; + mov.b32 %r9498, %r8449; + // begin inline asm + // wait for regs: %r9078,%r9079,%r9080,%r9081,%r9082,%r9083,%r9084,%r9085,%r9086,%r9087,%r9088,%r9089,%r9090,%r9091,%r9092,%r9093,%r9094,%r9095,%r9096,%r9097,%r9098,%r9099,%r9100,%r9101,%r9102,%r9103,%r9104,%r9105,%r9106,%r9107,%r9108,%r9109,%r9494,%r9495,%r9496,%r9497,%r9498,%r9499 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.f32 %r10701, %r9080, %r10699; + sub.f32 %r10702, %r9081, %r10700; + sub.f32 %r10703, %r9082, %r10644; + sub.f32 %r10704, %r9083, %r10645; + sub.f32 %r10705, %r9084, %r10644; + sub.f32 %r10706, %r9085, %r10645; + sub.f32 %r10707, %r9086, %r10646; + sub.f32 %r10708, %r9087, %r10647; + sub.f32 %r10709, %r9088, %r10646; + sub.f32 %r10710, %r9089, %r10647; + sub.f32 %r10711, %r9090, %r10648; + sub.f32 %r10712, %r9091, %r10649; + sub.f32 %r10713, %r9092, %r10648; + sub.f32 %r10714, %r9093, %r10649; + sub.f32 %r10715, %r9094, %r10650; + sub.f32 %r10716, %r9095, %r10651; + sub.f32 %r10717, %r9096, %r10650; + sub.f32 %r10718, %r9097, %r10651; + sub.f32 %r10719, %r9098, %r10652; + sub.f32 %r10720, %r9099, %r10653; + sub.f32 %r10721, %r9100, %r10652; + sub.f32 %r10722, %r9101, %r10653; + sub.f32 %r10723, %r9102, %r10654; + sub.f32 %r10724, %r9103, %r10655; + sub.f32 %r10725, %r9104, %r10654; + sub.f32 %r10726, %r9105, %r10655; + sub.f32 %r10727, %r9106, %r10656; + sub.f32 %r10728, %r9107, %r10657; + sub.f32 %r10729, %r9108, %r10656; + sub.f32 %r10730, %r9109, %r10657; + .loc 1 751 16 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:16 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10731, %r10604, %r10702; + mul.f32 %r10732, %r10603, %r10701; + mul.f32 %r10733, %r10605, %r10703; + mul.f32 %r10734, %r10606, %r10704; + mul.f32 %r10735, %r10607, %r10705; + mul.f32 %r10736, %r10608, %r10706; + mul.f32 %r10737, %r10609, %r10707; + mul.f32 %r10738, %r10610, %r10708; + mul.f32 %r10739, %r10611, %r10709; + mul.f32 %r10740, %r10612, %r10710; + mul.f32 %r10741, %r10613, %r10711; + mul.f32 %r10742, %r10614, %r10712; + mul.f32 %r10743, %r10615, %r10713; + mul.f32 %r10744, %r10616, %r10714; + mul.f32 %r10745, %r10617, %r10715; + mul.f32 %r10746, %r10618, %r10716; + mul.f32 %r10747, %r10619, %r10717; + mul.f32 %r10748, %r10620, %r10718; + mul.f32 %r10749, %r10621, %r10719; + mul.f32 %r10750, %r10622, %r10720; + mul.f32 %r10751, %r10623, %r10721; + mul.f32 %r10752, %r10624, %r10722; + mul.f32 %r10753, %r10625, %r10723; + mul.f32 %r10754, %r10626, %r10724; + mul.f32 %r10755, %r10627, %r10725; + mul.f32 %r10756, %r10628, %r10726; + mul.f32 %r10757, %r10629, %r10727; + mul.f32 %r10758, %r10630, %r10728; + mul.f32 %r10759, %r10631, %r10729; + mul.f32 %r10760, %r10632, %r10730; + .loc 1 751 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.f32 %r10761, %r9078, %r10699; + sub.f32 %r10762, %r9079, %r10700; + .loc 1 751 16 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:16 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.f32 %r10763, %r10602, %r10762; + mul.f32 %r10764, %r10601, %r10761; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs129, %r10764; + cvt.rn.bf16.f32 %rs130, %r10763; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs131, %rs130, 0x0000, %p918; + selp.b16 %rs132, %rs129, 0x0000, %p919; + mov.b32 %r9666, {%rs132, %rs131}; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs133, %r10732; + cvt.rn.bf16.f32 %rs134, %r10731; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs135, %rs134, 0x0000, %p920; + selp.b16 %rs136, %rs133, 0x0000, %p921; + mov.b32 %r9667, {%rs136, %rs135}; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs137, %r10733; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs138, %rs137, 0x0000, %p922; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs139, %r10734; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs140, %rs139, 0x0000, %p923; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs141, %r10735; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs142, %rs141, 0x0000, %p924; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs143, %r10736; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs144, %rs143, 0x0000, %p925; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs145, %r10737; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs146, %rs145, 0x0000, %p926; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs147, %r10738; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs148, %rs147, 0x0000, %p927; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs149, %r10739; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs150, %rs149, 0x0000, %p928; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs151, %r10740; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs152, %rs151, 0x0000, %p929; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs153, %r10741; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs154, %rs153, 0x0000, %p930; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs155, %r10742; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs156, %rs155, 0x0000, %p931; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs157, %r10743; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs158, %rs157, 0x0000, %p932; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs159, %r10744; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs160, %rs159, 0x0000, %p933; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs161, %r10745; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs162, %rs161, 0x0000, %p934; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs163, %r10746; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs164, %rs163, 0x0000, %p935; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs165, %r10747; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs166, %rs165, 0x0000, %p936; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs167, %r10748; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs168, %rs167, 0x0000, %p937; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs169, %r10749; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs170, %rs169, 0x0000, %p938; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs171, %r10750; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs172, %rs171, 0x0000, %p939; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs173, %r10751; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs174, %rs173, 0x0000, %p940; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs175, %r10752; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs176, %rs175, 0x0000, %p941; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs177, %r10753; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs178, %rs177, 0x0000, %p942; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs179, %r10754; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs180, %rs179, 0x0000, %p943; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs181, %r10755; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs182, %rs181, 0x0000, %p944; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs183, %r10756; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs184, %rs183, 0x0000, %p945; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs185, %r10757; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs186, %rs185, 0x0000, %p946; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs187, %r10758; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs188, %rs187, 0x0000, %p947; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs189, %r10759; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs190, %rs189, 0x0000, %p948; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + cvt.rn.bf16.f32 %rs191, %r10760; + .loc 1 773 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:773:45 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + selp.b16 %rs192, %rs191, 0x0000, %p949; + .loc 1 775 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mov.b32 %r9668, {%rs138, %rs140}; + mov.b32 %r9669, {%rs142, %rs144}; + mov.b32 %r9798, {%rs146, %rs148}; + mov.b32 %r9799, {%rs150, %rs152}; + mov.b32 %r9800, {%rs154, %rs156}; + mov.b32 %r9801, {%rs158, %rs160}; + mov.b32 %r9930, {%rs162, %rs164}; + mov.b32 %r9931, {%rs166, %rs168}; + mov.b32 %r9932, {%rs170, %rs172}; + mov.b32 %r9933, {%rs174, %rs176}; + mov.b32 %r10062, {%rs178, %rs180}; + mov.b32 %r10063, {%rs182, %rs184}; + mov.b32 %r10064, {%rs186, %rs188}; + mov.b32 %r10065, {%rs190, %rs192}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9666,%r9667,%r9668,%r9669}, %rd719, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9798,%r9799,%r9800,%r9801}, %rd755, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r9930,%r9931,%r9932,%r9933}, %rd756, %p619, 1, 1, 1; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r10062,%r10063,%r10064,%r10065}, %rd757, %p619, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 789 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + mov.u64 %rd758, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd758, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10066, 0x0; + @%p641 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10066 }, [ %rd759 + 0 ], %rd758; + // end inline asm + .loc 1 790 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + mov.u64 %rd761, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd761, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10067, 0x0; + @%p642 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10067 }, [ %rd762 + 0 ], %rd761; + // end inline asm + .loc 1 792 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:34 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + sub.s32 %r10765, %r10067, %r10066; + .loc 1 792 48 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:48 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10766, %r10765, 7; + .loc 1 792 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10767, %r10766, -64; + .loc 1 793 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mad.lo.s32 %r14883, %r10767, %r10697, %r10698; + .loc 1 626 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10768, %r14883, 12; + .loc 1 626 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.wide.s32 %rd858, %r10768, 2; + add.s64 %rd1201, %rd1201, %rd858; + add.s64 %rd1200, %rd1200, %rd858; + add.s64 %rd1199, %rd1199, %rd858; + add.s64 %rd1198, %rd1198, %rd858; + .loc 1 627 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10769, %r14883, 7; + .loc 1 627 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.wide.s32 %rd859, %r10769, 2; + add.s64 %rd1197, %rd1197, %rd859; + add.s64 %rd1196, %rd1196, %rd859; + add.s64 %rd1195, %rd1195, %rd859; + add.s64 %rd1194, %rd1194, %rd859; + .loc 1 628 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:628:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r1491, %r14883, %r15058; + add.s32 %r1490, %r14883, %r15057; + add.s32 %r1492, %r14883, %r14915; + add.s32 %r1493, %r14883, %r14914; + add.s32 %r1494, %r14883, %r14913; + add.s32 %r1495, %r14883, %r14912; + add.s32 %r1496, %r14883, %r14911; + add.s32 %r1497, %r14883, %r14910; + add.s32 %r1498, %r14883, %r14909; + add.s32 %r1499, %r14883, %r14908; + add.s32 %r1500, %r14883, %r14907; + add.s32 %r1501, %r14883, %r14906; + add.s32 %r1502, %r14883, %r14905; + add.s32 %r1503, %r14883, %r14904; + add.s32 %r1504, %r14883, %r14903; + add.s32 %r1505, %r14883, %r14902; + add.s32 %r14920, %r14883, %r14920; + add.s32 %r14921, %r14883, %r14921; + add.s32 %r14922, %r14883, %r14922; + add.s32 %r14923, %r14883, %r14923; + add.s32 %r14916, %r14883, %r14916; + add.s32 %r14917, %r14883, %r14917; + add.s32 %r14918, %r14883, %r14918; + add.s32 %r14919, %r14883, %r14919; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10770, %r14899, 1; + setp.gt.s32 %p951, %r10770, 1; + selp.b32 %r14899, 0, %r10770, %p951; + add.s32 %r10771, %r14901, 1; + setp.gt.s32 %p952, %r10771, 2; + selp.b32 %r14901, 0, %r10771, %p952; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p953, %r14920, %r2358; + setp.lt.s32 %p954, %r14921, %r2358; + setp.lt.s32 %p955, %r14922, %r2358; + setp.lt.s32 %p956, %r14923, %r2358; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10772, %r14901, 14; + add.s32 %r10773, %r7585, %r10772; + bar.sync 0; + add.s32 %r10068, %r10773, %r745; + selp.b32 %r10774, 16, 0, %p953; + selp.b32 %r10069, %r10774, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10068 + 0 ], [ %rd1201 + 0 ], 0x10, %r10069; + // end inline asm + add.s32 %r10070, %r10068, 2048; + selp.b32 %r10775, 16, 0, %p954; + selp.b32 %r10071, %r10775, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10070 + 0 ], [ %rd1200 + 0 ], 0x10, %r10071; + // end inline asm + add.s32 %r10072, %r10068, 4096; + selp.b32 %r10776, 16, 0, %p955; + selp.b32 %r10073, %r10776, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10072 + 0 ], [ %rd1199 + 0 ], 0x10, %r10073; + // end inline asm + add.s32 %r10074, %r10068, 6144; + selp.b32 %r10777, 16, 0, %p956; + selp.b32 %r10075, %r10777, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10074 + 0 ], [ %rd1198 + 0 ], 0x10, %r10075; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p957, %r1490, %r2358; + setp.lt.s32 %p958, %r1491, %r2358; + setp.lt.s32 %p959, %r1492, %r2358; + setp.lt.s32 %p960, %r1493, %r2358; + setp.lt.s32 %p961, %r1494, %r2358; + setp.lt.s32 %p962, %r1495, %r2358; + setp.lt.s32 %p963, %r1496, %r2358; + setp.lt.s32 %p964, %r1497, %r2358; + setp.lt.s32 %p965, %r1498, %r2358; + setp.lt.s32 %p966, %r1499, %r2358; + setp.lt.s32 %p967, %r1500, %r2358; + setp.lt.s32 %p968, %r1501, %r2358; + setp.lt.s32 %p969, %r1502, %r2358; + setp.lt.s32 %p970, %r1503, %r2358; + setp.lt.s32 %p971, %r1504, %r2358; + setp.lt.s32 %p972, %r1505, %r2358; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + mul.wide.s32 %rd860, %r1490, 4; + add.s64 %rd768, %rd142, %rd860; + mul.wide.s32 %rd861, %r1491, 4; + add.s64 %rd769, %rd142, %rd861; + mul.wide.s32 %rd862, %r1492, 4; + add.s64 %rd770, %rd142, %rd862; + mul.wide.s32 %rd863, %r1493, 4; + add.s64 %rd771, %rd142, %rd863; + mul.wide.s32 %rd864, %r1494, 4; + add.s64 %rd772, %rd142, %rd864; + mul.wide.s32 %rd865, %r1495, 4; + add.s64 %rd773, %rd142, %rd865; + mul.wide.s32 %rd866, %r1496, 4; + add.s64 %rd774, %rd142, %rd866; + mul.wide.s32 %rd867, %r1497, 4; + add.s64 %rd775, %rd142, %rd867; + mul.wide.s32 %rd868, %r1498, 4; + add.s64 %rd776, %rd142, %rd868; + mul.wide.s32 %rd869, %r1499, 4; + add.s64 %rd777, %rd142, %rd869; + mul.wide.s32 %rd870, %r1500, 4; + add.s64 %rd778, %rd142, %rd870; + mul.wide.s32 %rd871, %r1501, 4; + add.s64 %rd779, %rd142, %rd871; + mul.wide.s32 %rd872, %r1502, 4; + add.s64 %rd780, %rd142, %rd872; + mul.wide.s32 %rd873, %r1503, 4; + add.s64 %rd781, %rd142, %rd873; + mul.wide.s32 %rd874, %r1504, 4; + add.s64 %rd782, %rd142, %rd874; + mul.wide.s32 %rd875, %r1505, 4; + add.s64 %rd783, %rd142, %rd875; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + shl.b32 %r10778, %r14899, 8; + add.s32 %r10779, %r10153, %r10778; + add.s32 %r10076, %r10779, %r767; + selp.b32 %r10780, 4, 0, %p957; + selp.b32 %r10117, %r10780, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10076 + 0 ], [ %rd768 + 0 ], 0x4, %r10117; + // end inline asm + add.s32 %r10078, %r10076, 4; + selp.b32 %r10781, 4, 0, %p958; + selp.b32 %r10119, %r10781, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10078 + 0 ], [ %rd769 + 0 ], 0x4, %r10119; + // end inline asm + add.s32 %r10080, %r10076, 32; + selp.b32 %r10782, 4, 0, %p959; + selp.b32 %r10121, %r10782, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10080 + 0 ], [ %rd770 + 0 ], 0x4, %r10121; + // end inline asm + add.s32 %r10082, %r10076, 36; + selp.b32 %r10783, 4, 0, %p960; + selp.b32 %r10123, %r10783, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10082 + 0 ], [ %rd771 + 0 ], 0x4, %r10123; + // end inline asm + add.s32 %r10084, %r10076, 64; + selp.b32 %r10784, 4, 0, %p961; + selp.b32 %r10125, %r10784, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10084 + 0 ], [ %rd772 + 0 ], 0x4, %r10125; + // end inline asm + add.s32 %r10086, %r10076, 68; + selp.b32 %r10785, 4, 0, %p962; + selp.b32 %r10127, %r10785, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10086 + 0 ], [ %rd773 + 0 ], 0x4, %r10127; + // end inline asm + add.s32 %r10088, %r10076, 96; + selp.b32 %r10786, 4, 0, %p963; + selp.b32 %r10129, %r10786, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10088 + 0 ], [ %rd774 + 0 ], 0x4, %r10129; + // end inline asm + add.s32 %r10090, %r10076, 100; + selp.b32 %r10787, 4, 0, %p964; + selp.b32 %r10131, %r10787, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10090 + 0 ], [ %rd775 + 0 ], 0x4, %r10131; + // end inline asm + add.s32 %r10092, %r10076, 128; + selp.b32 %r10788, 4, 0, %p965; + selp.b32 %r10133, %r10788, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10092 + 0 ], [ %rd776 + 0 ], 0x4, %r10133; + // end inline asm + add.s32 %r10094, %r10076, 132; + selp.b32 %r10789, 4, 0, %p966; + selp.b32 %r10135, %r10789, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10094 + 0 ], [ %rd777 + 0 ], 0x4, %r10135; + // end inline asm + add.s32 %r10096, %r10076, 160; + selp.b32 %r10790, 4, 0, %p967; + selp.b32 %r10137, %r10790, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10096 + 0 ], [ %rd778 + 0 ], 0x4, %r10137; + // end inline asm + add.s32 %r10098, %r10076, 164; + selp.b32 %r10791, 4, 0, %p968; + selp.b32 %r10139, %r10791, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10098 + 0 ], [ %rd779 + 0 ], 0x4, %r10139; + // end inline asm + add.s32 %r10100, %r10076, 192; + selp.b32 %r10792, 4, 0, %p969; + selp.b32 %r10141, %r10792, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10100 + 0 ], [ %rd780 + 0 ], 0x4, %r10141; + // end inline asm + add.s32 %r10102, %r10076, 196; + selp.b32 %r10793, 4, 0, %p970; + selp.b32 %r10143, %r10793, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10102 + 0 ], [ %rd781 + 0 ], 0x4, %r10143; + // end inline asm + add.s32 %r10104, %r10076, 224; + selp.b32 %r10794, 4, 0, %p971; + selp.b32 %r10145, %r10794, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10104 + 0 ], [ %rd782 + 0 ], 0x4, %r10145; + // end inline asm + add.s32 %r10106, %r10076, 228; + selp.b32 %r10795, 4, 0, %p972; + selp.b32 %r10147, %r10795, 0, %p675; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10106 + 0 ], [ %rd783 + 0 ], 0x4, %r10147; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.lt.s32 %p973, %r14916, %r2358; + setp.lt.s32 %p974, %r14917, %r2358; + setp.lt.s32 %p975, %r14918, %r2358; + setp.lt.s32 %p976, %r14919, %r2358; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10796, %r10633, %r10772; + add.s32 %r10108, %r10796, %r745; + selp.b32 %r10797, 16, 0, %p973; + selp.b32 %r10109, %r10797, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10108 + 0 ], [ %rd1197 + 0 ], 0x10, %r10109; + // end inline asm + add.s32 %r10110, %r10108, 2048; + selp.b32 %r10798, 16, 0, %p974; + selp.b32 %r10111, %r10798, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10110 + 0 ], [ %rd1196 + 0 ], 0x10, %r10111; + // end inline asm + add.s32 %r10112, %r10108, 4096; + selp.b32 %r10799, 16, 0, %p975; + selp.b32 %r10113, %r10799, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10112 + 0 ], [ %rd1195 + 0 ], 0x10, %r10113; + // end inline asm + add.s32 %r10114, %r10108, 6144; + selp.b32 %r10800, 16, 0, %p976; + selp.b32 %r10115, %r10800, 0, %p675; + // begin inline asm + cp.async.cg.shared.global [ %r10114 + 0 ], [ %rd1194 + 0 ], 0x10, %r10115; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s64 %rd788, %rd143, %rd860; + add.s64 %rd789, %rd143, %rd861; + add.s64 %rd790, %rd143, %rd862; + add.s64 %rd791, %rd143, %rd863; + add.s64 %rd792, %rd143, %rd864; + add.s64 %rd793, %rd143, %rd865; + add.s64 %rd794, %rd143, %rd866; + add.s64 %rd795, %rd143, %rd867; + add.s64 %rd796, %rd143, %rd868; + add.s64 %rd797, %rd143, %rd869; + add.s64 %rd798, %rd143, %rd870; + add.s64 %rd799, %rd143, %rd871; + add.s64 %rd800, %rd143, %rd872; + add.s64 %rd801, %rd143, %rd873; + add.s64 %rd802, %rd143, %rd874; + add.s64 %rd803, %rd143, %rd875; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + add.s32 %r10801, %r10641, %r10778; + add.s32 %r10116, %r10801, %r767; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10116 + 0 ], [ %rd788 + 0 ], 0x4, %r10117; + // end inline asm + add.s32 %r10118, %r10116, 4; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10118 + 0 ], [ %rd789 + 0 ], 0x4, %r10119; + // end inline asm + add.s32 %r10120, %r10116, 32; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10120 + 0 ], [ %rd790 + 0 ], 0x4, %r10121; + // end inline asm + add.s32 %r10122, %r10116, 36; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10122 + 0 ], [ %rd791 + 0 ], 0x4, %r10123; + // end inline asm + add.s32 %r10124, %r10116, 64; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10124 + 0 ], [ %rd792 + 0 ], 0x4, %r10125; + // end inline asm + add.s32 %r10126, %r10116, 68; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10126 + 0 ], [ %rd793 + 0 ], 0x4, %r10127; + // end inline asm + add.s32 %r10128, %r10116, 96; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10128 + 0 ], [ %rd794 + 0 ], 0x4, %r10129; + // end inline asm + add.s32 %r10130, %r10116, 100; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10130 + 0 ], [ %rd795 + 0 ], 0x4, %r10131; + // end inline asm + add.s32 %r10132, %r10116, 128; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10132 + 0 ], [ %rd796 + 0 ], 0x4, %r10133; + // end inline asm + add.s32 %r10134, %r10116, 132; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10134 + 0 ], [ %rd797 + 0 ], 0x4, %r10135; + // end inline asm + add.s32 %r10136, %r10116, 160; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10136 + 0 ], [ %rd798 + 0 ], 0x4, %r10137; + // end inline asm + add.s32 %r10138, %r10116, 164; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10138 + 0 ], [ %rd799 + 0 ], 0x4, %r10139; + // end inline asm + add.s32 %r10140, %r10116, 192; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10140 + 0 ], [ %rd800 + 0 ], 0x4, %r10141; + // end inline asm + add.s32 %r10142, %r10116, 196; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10142 + 0 ], [ %rd801 + 0 ], 0x4, %r10143; + // end inline asm + add.s32 %r10144, %r10116, 224; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10144 + 0 ], [ %rd802 + 0 ], 0x4, %r10145; + // end inline asm + add.s32 %r10146, %r10116, 228; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r10146 + 0 ], [ %rd803 + 0 ], 0x4, %r10147; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + setp.ne.b32 %p977, %r966, %r1424; + mov.b32 %r14884, %r14915; + mov.b32 %r14885, %r14914; + mov.b32 %r14886, %r14913; + mov.b32 %r14887, %r14912; + mov.b32 %r14888, %r14911; + mov.b32 %r14889, %r14910; + mov.b32 %r14890, %r14909; + mov.b32 %r14891, %r14908; + mov.b32 %r14892, %r14907; + mov.b32 %r14893, %r14906; + mov.b32 %r14894, %r14905; + mov.b32 %r14895, %r14904; + mov.b32 %r14896, %r14903; + mov.b32 %r14897, %r14902; + mov.b32 %r14902, %r1505; + mov.b32 %r14903, %r1504; + mov.b32 %r14904, %r1503; + mov.b32 %r14905, %r1502; + mov.b32 %r14906, %r1501; + mov.b32 %r14907, %r1500; + mov.b32 %r14908, %r1499; + mov.b32 %r14909, %r1498; + mov.b32 %r14910, %r1497; + mov.b32 %r14911, %r1496; + mov.b32 %r14912, %r1495; + mov.b32 %r14913, %r1494; + mov.b32 %r14914, %r1493; + mov.b32 %r14915, %r1492; + mov.b32 %r15052, %r1424; + mov.b32 %r15055, %r15057; + mov.b32 %r15056, %r15058; + mov.b32 %r15057, %r1490; + mov.b32 %r15058, %r1491; + @%p977 bra $L__BB0_11; +$L__BB0_12: // %._crit_edge1621 + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:28 + setp.lt.s32 %p1042, %r744, 1; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:298:16 ] + // begin inline asm + // wait for regs: %r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987,%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp18: + .loc 1 601 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd957, %rd140, %rd956; + add.s64 %rd959, %rd140, %rd958; + add.s64 %rd961, %rd140, %rd960; + add.s64 %rd963, %rd140, %rd962; + .loc 1 601 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:601:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd876, %rd957, %rd685; + add.s64 %rd877, %rd959, %rd685; + add.s64 %rd878, %rd961, %rd685; + add.s64 %rd879, %rd963, %rd685; + .loc 1 602 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd966, %rd141, %rd965; + add.s64 %rd968, %rd141, %rd967; + add.s64 %rd970, %rd141, %rd969; + add.s64 %rd972, %rd141, %rd971; + .loc 1 602 51 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:602:51 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd896, %rd966, %rd685; + add.s64 %rd897, %rd968, %rd685; + add.s64 %rd898, %rd970, %rd685; + add.s64 %rd899, %rd972, %rd685; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11058 + 0 ], [ %rd876 + 0 ], 0x10, %r11059; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11060 + 0 ], [ %rd877 + 0 ], 0x10, %r11061; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11062 + 0 ], [ %rd878 + 0 ], 0x10, %r11063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11064 + 0 ], [ %rd879 + 0 ], 0x10, %r11065; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd880, %rd142, %rd973; + cvt.s64.s32 %rd974, %r726; + add.s64 %rd975, %rd974, %rd144; + shl.b64 %rd976, %rd975, 2; + add.s64 %rd977, %rd142, %rd976; + add.s64 %rd881, %rd977, 4; + add.s64 %rd882, %rd977, 32; + add.s64 %rd883, %rd977, 36; + add.s64 %rd884, %rd142, %rd978; + add.s64 %rd885, %rd142, %rd979; + add.s64 %rd886, %rd142, %rd980; + add.s64 %rd887, %rd142, %rd981; + add.s64 %rd888, %rd142, %rd982; + add.s64 %rd889, %rd142, %rd983; + add.s64 %rd890, %rd142, %rd984; + add.s64 %rd891, %rd142, %rd985; + add.s64 %rd892, %rd142, %rd986; + add.s64 %rd893, %rd142, %rd987; + add.s64 %rd894, %rd142, %rd988; + add.s64 %rd895, %rd142, %rd989; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11066 + 0 ], [ %rd880 + 0 ], 0x4, %r11067; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11068 + 0 ], [ %rd881 + 0 ], 0x4, %r11069; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11070 + 0 ], [ %rd882 + 0 ], 0x4, %r11071; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11072 + 0 ], [ %rd883 + 0 ], 0x4, %r11073; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11074 + 0 ], [ %rd884 + 0 ], 0x4, %r11075; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11076 + 0 ], [ %rd885 + 0 ], 0x4, %r11077; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11078 + 0 ], [ %rd886 + 0 ], 0x4, %r11079; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11080 + 0 ], [ %rd887 + 0 ], 0x4, %r11081; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11082 + 0 ], [ %rd888 + 0 ], 0x4, %r11083; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11084 + 0 ], [ %rd889 + 0 ], 0x4, %r11085; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11086 + 0 ], [ %rd890 + 0 ], 0x4, %r11087; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11088 + 0 ], [ %rd891 + 0 ], 0x4, %r11089; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11090 + 0 ], [ %rd892 + 0 ], 0x4, %r11091; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11092 + 0 ], [ %rd893 + 0 ], 0x4, %r11093; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11094 + 0 ], [ %rd894 + 0 ], 0x4, %r11095; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11096 + 0 ], [ %rd895 + 0 ], 0x4, %r11097; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11098 + 0 ], [ %rd896 + 0 ], 0x10, %r11059; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11100 + 0 ], [ %rd897 + 0 ], 0x10, %r11061; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11102 + 0 ], [ %rd898 + 0 ], 0x10, %r11063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11104 + 0 ], [ %rd899 + 0 ], 0x10, %r11065; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd900, %rd143, %rd973; + add.s64 %rd990, %rd143, %rd976; + add.s64 %rd901, %rd990, 4; + add.s64 %rd902, %rd990, 32; + add.s64 %rd903, %rd990, 36; + add.s64 %rd904, %rd143, %rd978; + add.s64 %rd905, %rd143, %rd979; + add.s64 %rd906, %rd143, %rd980; + add.s64 %rd907, %rd143, %rd981; + add.s64 %rd908, %rd143, %rd982; + add.s64 %rd909, %rd143, %rd983; + add.s64 %rd910, %rd143, %rd984; + add.s64 %rd911, %rd143, %rd985; + add.s64 %rd912, %rd143, %rd986; + add.s64 %rd913, %rd143, %rd987; + add.s64 %rd914, %rd143, %rd988; + add.s64 %rd915, %rd143, %rd989; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11106 + 0 ], [ %rd900 + 0 ], 0x4, %r11067; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11108 + 0 ], [ %rd901 + 0 ], 0x4, %r11069; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11110 + 0 ], [ %rd902 + 0 ], 0x4, %r11071; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11112 + 0 ], [ %rd903 + 0 ], 0x4, %r11073; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11114 + 0 ], [ %rd904 + 0 ], 0x4, %r11075; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11116 + 0 ], [ %rd905 + 0 ], 0x4, %r11077; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11118 + 0 ], [ %rd906 + 0 ], 0x4, %r11079; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11120 + 0 ], [ %rd907 + 0 ], 0x4, %r11081; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11122 + 0 ], [ %rd908 + 0 ], 0x4, %r11083; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11124 + 0 ], [ %rd909 + 0 ], 0x4, %r11085; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11126 + 0 ], [ %rd910 + 0 ], 0x4, %r11087; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11128 + 0 ], [ %rd911 + 0 ], 0x4, %r11089; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11130 + 0 ], [ %rd912 + 0 ], 0x4, %r11091; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11132 + 0 ], [ %rd913 + 0 ], 0x4, %r11093; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11134 + 0 ], [ %rd914 + 0 ], 0x4, %r11095; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11136 + 0 ], [ %rd915 + 0 ], 0x4, %r11097; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd1209, %rd876, 524288; + add.s64 %rd1208, %rd877, 524288; + add.s64 %rd1207, %rd878, 524288; + add.s64 %rd1206, %rd879, 524288; + .loc 1 627 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd1205, %rd896, 16384; + add.s64 %rd1204, %rd897, 16384; + add.s64 %rd1203, %rd898, 16384; + add.s64 %rd1202, %rd899, 16384; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r11138 + 0 ], [ %rd1209 + 0 ], 0x10, %r11139; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11140 + 0 ], [ %rd1208 + 0 ], 0x10, %r11141; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11142 + 0 ], [ %rd1207 + 0 ], 0x10, %r11143; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11144 + 0 ], [ %rd1206 + 0 ], 0x10, %r11145; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd920, %rd880, 256; + add.s64 %rd991, %rd976, 256; + add.s64 %rd992, %rd142, %rd991; + add.s64 %rd921, %rd992, 4; + add.s64 %rd922, %rd992, 32; + add.s64 %rd923, %rd992, 36; + add.s64 %rd924, %rd884, 256; + add.s64 %rd925, %rd885, 256; + add.s64 %rd926, %rd886, 256; + add.s64 %rd927, %rd887, 256; + add.s64 %rd928, %rd888, 256; + add.s64 %rd929, %rd889, 256; + add.s64 %rd930, %rd890, 256; + add.s64 %rd931, %rd891, 256; + add.s64 %rd932, %rd892, 256; + add.s64 %rd933, %rd893, 256; + add.s64 %rd934, %rd894, 256; + add.s64 %rd935, %rd895, 256; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11146 + 0 ], [ %rd920 + 0 ], 0x4, %r11147; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11148 + 0 ], [ %rd921 + 0 ], 0x4, %r11149; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11150 + 0 ], [ %rd922 + 0 ], 0x4, %r11151; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11152 + 0 ], [ %rd923 + 0 ], 0x4, %r11153; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11154 + 0 ], [ %rd924 + 0 ], 0x4, %r11155; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11156 + 0 ], [ %rd925 + 0 ], 0x4, %r11157; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11158 + 0 ], [ %rd926 + 0 ], 0x4, %r11159; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11160 + 0 ], [ %rd927 + 0 ], 0x4, %r11161; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11162 + 0 ], [ %rd928 + 0 ], 0x4, %r11163; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11164 + 0 ], [ %rd929 + 0 ], 0x4, %r11165; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11166 + 0 ], [ %rd930 + 0 ], 0x4, %r11167; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11168 + 0 ], [ %rd931 + 0 ], 0x4, %r11169; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11170 + 0 ], [ %rd932 + 0 ], 0x4, %r11171; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11172 + 0 ], [ %rd933 + 0 ], 0x4, %r11173; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11174 + 0 ], [ %rd934 + 0 ], 0x4, %r11175; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11176 + 0 ], [ %rd935 + 0 ], 0x4, %r11177; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r11178 + 0 ], [ %rd1205 + 0 ], 0x10, %r11139; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11180 + 0 ], [ %rd1204 + 0 ], 0x10, %r11141; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11182 + 0 ], [ %rd1203 + 0 ], 0x10, %r11143; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r11184 + 0 ], [ %rd1202 + 0 ], 0x10, %r11145; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd940, %rd900, 256; + add.s64 %rd993, %rd143, %rd991; + add.s64 %rd941, %rd993, 4; + add.s64 %rd942, %rd993, 32; + add.s64 %rd943, %rd993, 36; + add.s64 %rd944, %rd904, 256; + add.s64 %rd945, %rd905, 256; + add.s64 %rd946, %rd906, 256; + add.s64 %rd947, %rd907, 256; + add.s64 %rd948, %rd908, 256; + add.s64 %rd949, %rd909, 256; + add.s64 %rd950, %rd910, 256; + add.s64 %rd951, %rd911, 256; + add.s64 %rd952, %rd912, 256; + add.s64 %rd953, %rd913, 256; + add.s64 %rd954, %rd914, 256; + add.s64 %rd955, %rd915, 256; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11186 + 0 ], [ %rd940 + 0 ], 0x4, %r11147; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11188 + 0 ], [ %rd941 + 0 ], 0x4, %r11149; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11190 + 0 ], [ %rd942 + 0 ], 0x4, %r11151; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11192 + 0 ], [ %rd943 + 0 ], 0x4, %r11153; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11194 + 0 ], [ %rd944 + 0 ], 0x4, %r11155; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11196 + 0 ], [ %rd945 + 0 ], 0x4, %r11157; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11198 + 0 ], [ %rd946 + 0 ], 0x4, %r11159; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11200 + 0 ], [ %rd947 + 0 ], 0x4, %r11161; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11202 + 0 ], [ %rd948 + 0 ], 0x4, %r11163; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11204 + 0 ], [ %rd949 + 0 ], 0x4, %r11165; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11206 + 0 ], [ %rd950 + 0 ], 0x4, %r11167; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11208 + 0 ], [ %rd951 + 0 ], 0x4, %r11169; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11210 + 0 ], [ %rd952 + 0 ], 0x4, %r11171; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11212 + 0 ], [ %rd953 + 0 ], 0x4, %r11173; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11214 + 0 ], [ %rd954 + 0 ], 0x4, %r11175; + // end inline asm + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r11216 + 0 ], [ %rd955 + 0 ], 0x4, %r11177; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + @%p1042 bra $L__BB0_15; +// %bb.13: // %.lr.ph1793.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:28 + mov.b32 %r11772, 0; + mov.b32 %r15216, 1; + mov.b32 %r15215, -1; + mov.b32 %r15199, %r743; + mov.b32 %r15200, %r742; + mov.b32 %r15201, %r741; + mov.b32 %r15202, %r740; + mov.b32 %r15203, %r739; + mov.b32 %r15204, %r738; + mov.b32 %r15205, %r737; + mov.b32 %r15206, %r736; + mov.b32 %r15207, %r735; + mov.b32 %r15208, %r734; + mov.b32 %r15209, %r733; + mov.b32 %r15210, %r732; + mov.b32 %r15211, %r731; + mov.b32 %r15212, %r730; + mov.b32 %r15213, %r729; + mov.b32 %r15214, %r728; + mov.b32 %r15217, %r15215; + mov.b32 %r15218, %r15216; + mov.b32 %r15219, %r939; + mov.b32 %r15220, %r938; + mov.b32 %r15221, %r937; + mov.b32 %r15222, %r936; + mov.b32 %r15223, %r935; + mov.b32 %r15224, %r934; + mov.b32 %r15225, %r933; + mov.b32 %r15226, %r932; + mov.b32 %r15227, %r931; + mov.b32 %r15228, %r930; + mov.b32 %r15229, %r929; + mov.b32 %r15230, %r928; + mov.b32 %r15231, %r927; + mov.b32 %r15232, %r926; + mov.b32 %r15233, %r925; + mov.b32 %r15234, %r924; + mov.b32 %r15235, %r940; + mov.b32 %r15236, %r941; + mov.b32 %r15237, %r942; + mov.b32 %r15238, %r943; + mov.b32 %r15239, %r940; + mov.b32 %r15240, %r941; + mov.b32 %r15241, %r942; + mov.b32 %r15242, %r943; + mov.b32 %r15371, %r11772; +$L__BB0_14: // %.lr.ph1793 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1099, %r15371, %r964; + setp.lt.s32 %p1065, %r15371, %r965; + add.s32 %r13471, %r15215, 1; + setp.gt.s32 %p1100, %r13471, 1; + selp.b32 %r15215, 0, %r13471, %p1100; + add.s32 %r13472, %r15217, 1; + setp.gt.s32 %p1101, %r13472, 2; + selp.b32 %r15217, 0, %r13472, %p1101; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1102, %r15214, %r2358; + setp.lt.s32 %p1103, %r15213, %r2358; + setp.lt.s32 %p1104, %r15212, %r2358; + setp.lt.s32 %p1105, %r15211, %r2358; + setp.lt.s32 %p1106, %r15210, %r2358; + setp.lt.s32 %p1107, %r15209, %r2358; + setp.lt.s32 %p1108, %r15208, %r2358; + setp.lt.s32 %p1109, %r15207, %r2358; + setp.lt.s32 %p1110, %r15206, %r2358; + setp.lt.s32 %p1111, %r15205, %r2358; + setp.lt.s32 %p1112, %r15204, %r2358; + setp.lt.s32 %p1113, %r15203, %r2358; + setp.lt.s32 %p1114, %r15202, %r2358; + setp.lt.s32 %p1115, %r15201, %r2358; + setp.lt.s32 %p1116, %r15200, %r2358; + setp.lt.s32 %p1117, %r15199, %r2358; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r13473, %r15217, 14; + add.s32 %r11774, %r7585, %r13473; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13475, %r15215, 8; + add.s32 %r13476, %r7585, 98304; + add.s32 %r13477, %r13476, %r13475; + add.s32 %r13478, %r13477, %r767; + ld.shared.v2.b32 {%r13479, %r13480}, [%r13478]; + ld.shared.v2.b32 {%r13481, %r13482}, [%r13478+32]; + ld.shared.v2.b32 {%r13483, %r13484}, [%r13478+64]; + ld.shared.v2.b32 {%r13485, %r13486}, [%r13478+96]; + ld.shared.v2.b32 {%r13487, %r13488}, [%r13478+128]; + ld.shared.v2.b32 {%r13489, %r13490}, [%r13478+160]; + ld.shared.v2.b32 {%r13491, %r13492}, [%r13478+192]; + ld.shared.v2.b32 {%r13493, %r13494}, [%r13478+224]; + .loc 1 675 26 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:675:26 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.eq.f32 %p1118, %r13479, 0fFF800000; + setp.eq.f32 %p1119, %r13480, 0fFF800000; + setp.eq.f32 %p1120, %r13481, 0fFF800000; + setp.eq.f32 %p1121, %r13482, 0fFF800000; + setp.eq.f32 %p1122, %r13483, 0fFF800000; + setp.eq.f32 %p1123, %r13484, 0fFF800000; + setp.eq.f32 %p1124, %r13485, 0fFF800000; + setp.eq.f32 %p1125, %r13486, 0fFF800000; + setp.eq.f32 %p1126, %r13487, 0fFF800000; + setp.eq.f32 %p1127, %r13488, 0fFF800000; + setp.eq.f32 %p1128, %r13489, 0fFF800000; + setp.eq.f32 %p1129, %r13490, 0fFF800000; + setp.eq.f32 %p1130, %r13491, 0fFF800000; + setp.eq.f32 %p1131, %r13492, 0fFF800000; + setp.eq.f32 %p1132, %r13493, 0fFF800000; + setp.eq.f32 %p1133, %r13494, 0fFF800000; + .loc 1 675 46 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:675:46 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13495, 0f00000000, %r13479, %p1118; + selp.f32 %r13496, 0f00000000, %r13480, %p1119; + selp.f32 %r13497, 0f00000000, %r13481, %p1120; + selp.f32 %r13498, 0f00000000, %r13482, %p1121; + selp.f32 %r13499, 0f00000000, %r13483, %p1122; + selp.f32 %r13500, 0f00000000, %r13484, %p1123; + selp.f32 %r13501, 0f00000000, %r13485, %p1124; + selp.f32 %r13502, 0f00000000, %r13486, %p1125; + selp.f32 %r13503, 0f00000000, %r13487, %p1126; + selp.f32 %r13504, 0f00000000, %r13488, %p1127; + selp.f32 %r13505, 0f00000000, %r13489, %p1128; + selp.f32 %r13506, 0f00000000, %r13490, %p1129; + selp.f32 %r13507, 0f00000000, %r13491, %p1130; + selp.f32 %r13508, 0f00000000, %r13492, %p1131; + selp.f32 %r13509, 0f00000000, %r13493, %p1132; + selp.f32 %r13510, 0f00000000, %r13494, %p1133; + .loc 1 676 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:676:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shfl.sync.idx.b32 %r13511, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r13512, %r13511, 11; + and.b32 %r13513, %r13512, 8192; + add.s32 %r11733, %r7585, 99328; + add.s32 %r13514, %r13513, %r11733; + bfe.u32 %r13515, %r13514, 4, 14; + cvt.u64.u32 %rd1080, %r13515; + or.b64 %rd994, %rd1080, 4611686293372403712; + bfe.u32 %r13516, %r11774, 4, 14; + cvt.u64.u32 %rd1081, %r13516; + or.b64 %rd995, %rd1081, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd994, %rd995, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r13517, %r13513, 32; + add.s32 %r13518, %r13517, %r11733; + bfe.u32 %r13519, %r13518, 4, 14; + cvt.u64.u32 %rd1082, %r13519; + or.b64 %rd996, %rd1082, 4611686293372403712; + add.s32 %r13520, %r11774, 32; + bfe.u32 %r13521, %r13520, 4, 14; + cvt.u64.u32 %rd1083, %r13521; + or.b64 %rd997, %rd1083, 4611686293338849280; + mov.pred %p1043, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd996, %rd997, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13522, %r13513, 64; + add.s32 %r13523, %r13522, %r11733; + bfe.u32 %r13524, %r13523, 4, 14; + cvt.u64.u32 %rd1084, %r13524; + or.b64 %rd998, %rd1084, 4611686293372403712; + add.s32 %r13525, %r11774, 64; + bfe.u32 %r13526, %r13525, 4, 14; + cvt.u64.u32 %rd1085, %r13526; + or.b64 %rd999, %rd1085, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd998, %rd999, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13527, %r13513, 96; + add.s32 %r13528, %r13527, %r11733; + bfe.u32 %r13529, %r13528, 4, 14; + cvt.u64.u32 %rd1086, %r13529; + or.b64 %rd1000, %rd1086, 4611686293372403712; + add.s32 %r13530, %r11774, 96; + bfe.u32 %r13531, %r13530, 4, 14; + cvt.u64.u32 %rd1087, %r13531; + or.b64 %rd1001, %rd1087, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1000, %rd1001, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13532, %r13513, 16384; + add.s32 %r13533, %r13532, %r11733; + bfe.u32 %r13534, %r13533, 4, 14; + cvt.u64.u32 %rd1088, %r13534; + or.b64 %rd1002, %rd1088, 4611686293372403712; + add.s32 %r13535, %r11774, 8192; + bfe.u32 %r13536, %r13535, 4, 14; + cvt.u64.u32 %rd1089, %r13536; + or.b64 %rd1003, %rd1089, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1002, %rd1003, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13537, %r13513, 16416; + add.s32 %r13538, %r13537, %r11733; + bfe.u32 %r13539, %r13538, 4, 14; + cvt.u64.u32 %rd1090, %r13539; + or.b64 %rd1004, %rd1090, 4611686293372403712; + add.s32 %r13540, %r11774, 8224; + bfe.u32 %r13541, %r13540, 4, 14; + cvt.u64.u32 %rd1091, %r13541; + or.b64 %rd1005, %rd1091, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1004, %rd1005, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13542, %r13513, 16448; + add.s32 %r13543, %r13542, %r11733; + bfe.u32 %r13544, %r13543, 4, 14; + cvt.u64.u32 %rd1092, %r13544; + or.b64 %rd1006, %rd1092, 4611686293372403712; + add.s32 %r13545, %r11774, 8256; + bfe.u32 %r13546, %r13545, 4, 14; + cvt.u64.u32 %rd1093, %r13546; + or.b64 %rd1007, %rd1093, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1006, %rd1007, %p1043, 1, 1, 0, 0; + // end inline asm + or.b32 %r13547, %r13513, 16480; + add.s32 %r13548, %r13547, %r11733; + bfe.u32 %r13549, %r13548, 4, 14; + cvt.u64.u32 %rd1094, %r13549; + or.b64 %rd1008, %rd1094, 4611686293372403712; + add.s32 %r13550, %r11774, 8288; + bfe.u32 %r13551, %r13550, 4, 14; + cvt.u64.u32 %rd1095, %r13551; + or.b64 %rd1009, %rd1095, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348}, %rd1008, %rd1009, %p1043, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11736, %r11774; + mov.b32 %r11734, %r11772; + mov.b32 %r11735, %r11772; + mov.b32 %r11737, %r11772; + mov.b32 %r11738, %r11772; + // begin inline asm + // wait for regs: %r11317,%r11318,%r11319,%r11320,%r11321,%r11322,%r11323,%r11324,%r11325,%r11326,%r11327,%r11328,%r11329,%r11330,%r11331,%r11332,%r11333,%r11334,%r11335,%r11336,%r11337,%r11338,%r11339,%r11340,%r11341,%r11342,%r11343,%r11344,%r11345,%r11346,%r11347,%r11348,%r11733,%r11734,%r11735,%r11736,%r11737,%r11738 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:678:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13552, %r11317, 0f3DB504F3; + mul.f32 %r13553, %r11318, 0f3DB504F3; + mul.f32 %r13554, %r11319, 0f3DB504F3; + mul.f32 %r13555, %r11320, 0f3DB504F3; + mul.f32 %r13556, %r11321, 0f3DB504F3; + mul.f32 %r13557, %r11322, 0f3DB504F3; + mul.f32 %r13558, %r11323, 0f3DB504F3; + mul.f32 %r13559, %r11324, 0f3DB504F3; + mul.f32 %r13560, %r11325, 0f3DB504F3; + mul.f32 %r13561, %r11326, 0f3DB504F3; + mul.f32 %r13562, %r11327, 0f3DB504F3; + mul.f32 %r13563, %r11328, 0f3DB504F3; + mul.f32 %r13564, %r11329, 0f3DB504F3; + mul.f32 %r13565, %r11330, 0f3DB504F3; + mul.f32 %r13566, %r11331, 0f3DB504F3; + mul.f32 %r13567, %r11332, 0f3DB504F3; + mul.f32 %r13568, %r11333, 0f3DB504F3; + mul.f32 %r13569, %r11334, 0f3DB504F3; + mul.f32 %r13570, %r11335, 0f3DB504F3; + mul.f32 %r13571, %r11336, 0f3DB504F3; + mul.f32 %r13572, %r11337, 0f3DB504F3; + mul.f32 %r13573, %r11338, 0f3DB504F3; + mul.f32 %r13574, %r11339, 0f3DB504F3; + mul.f32 %r13575, %r11340, 0f3DB504F3; + mul.f32 %r13576, %r11341, 0f3DB504F3; + mul.f32 %r13577, %r11342, 0f3DB504F3; + mul.f32 %r13578, %r11343, 0f3DB504F3; + mul.f32 %r13579, %r11344, 0f3DB504F3; + mul.f32 %r13580, %r11345, 0f3DB504F3; + mul.f32 %r13581, %r11346, 0f3DB504F3; + mul.f32 %r13582, %r11347, 0f3DB504F3; + mul.f32 %r13583, %r11348, 0f3DB504F3; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13584, %r13552, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13585, %r13584, 0fFF800000, %p1102; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13586, %r13553, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13587, %r13586, 0fFF800000, %p1103; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13588, %r13554, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13589, %r13588, 0fFF800000, %p1102; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13590, %r13555, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13591, %r13590, 0fFF800000, %p1103; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13592, %r13556, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13593, %r13592, 0fFF800000, %p1104; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13594, %r13557, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13595, %r13594, 0fFF800000, %p1105; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13596, %r13558, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13597, %r13596, 0fFF800000, %p1104; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13598, %r13559, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13599, %r13598, 0fFF800000, %p1105; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13600, %r13560, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13601, %r13600, 0fFF800000, %p1106; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13602, %r13561, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13603, %r13602, 0fFF800000, %p1107; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13604, %r13562, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13605, %r13604, 0fFF800000, %p1106; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13606, %r13563, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13607, %r13606, 0fFF800000, %p1107; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13608, %r13564, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13609, %r13608, 0fFF800000, %p1108; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13610, %r13565, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13611, %r13610, 0fFF800000, %p1109; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13612, %r13566, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13613, %r13612, 0fFF800000, %p1108; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13614, %r13567, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13615, %r13614, 0fFF800000, %p1109; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13616, %r13568, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13617, %r13616, 0fFF800000, %p1110; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13618, %r13569, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13619, %r13618, 0fFF800000, %p1111; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13620, %r13570, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13621, %r13620, 0fFF800000, %p1110; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13622, %r13571, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13623, %r13622, 0fFF800000, %p1111; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13624, %r13572, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13625, %r13624, 0fFF800000, %p1112; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13626, %r13573, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13627, %r13626, 0fFF800000, %p1113; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13628, %r13574, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13629, %r13628, 0fFF800000, %p1112; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13630, %r13575, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13631, %r13630, 0fFF800000, %p1113; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13632, %r13576, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13633, %r13632, 0fFF800000, %p1114; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13634, %r13577, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13635, %r13634, 0fFF800000, %p1115; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13636, %r13578, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13637, %r13636, 0fFF800000, %p1114; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13638, %r13579, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13639, %r13638, 0fFF800000, %p1115; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13640, %r13580, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13641, %r13640, 0fFF800000, %p1116; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13642, %r13581, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13643, %r13642, 0fFF800000, %p1117; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13644, %r13582, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13645, %r13644, 0fFF800000, %p1116; + .loc 1 739 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:739:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13646, %r13583, 0f3FB8AA3B; + .loc 1 692 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:692:78 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.f32 %r13647, %r13646, 0fFF800000, %p1117; + .loc 1 740 40 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:740:40 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + sub.f32 %r13648, %r13585, %r13495; + sub.f32 %r13649, %r13587, %r13496; + sub.f32 %r13650, %r13589, %r13495; + sub.f32 %r13651, %r13591, %r13496; + sub.f32 %r13652, %r13593, %r13497; + sub.f32 %r13653, %r13595, %r13498; + sub.f32 %r13654, %r13597, %r13497; + sub.f32 %r13655, %r13599, %r13498; + sub.f32 %r13656, %r13601, %r13499; + sub.f32 %r13657, %r13603, %r13500; + sub.f32 %r13658, %r13605, %r13499; + sub.f32 %r13659, %r13607, %r13500; + sub.f32 %r13660, %r13609, %r13501; + sub.f32 %r13661, %r13611, %r13502; + sub.f32 %r13662, %r13613, %r13501; + sub.f32 %r13663, %r13615, %r13502; + sub.f32 %r13664, %r13617, %r13503; + sub.f32 %r13665, %r13619, %r13504; + sub.f32 %r13666, %r13621, %r13503; + sub.f32 %r13667, %r13623, %r13504; + sub.f32 %r13668, %r13625, %r13505; + sub.f32 %r13669, %r13627, %r13506; + sub.f32 %r13670, %r13629, %r13505; + sub.f32 %r13671, %r13631, %r13506; + sub.f32 %r13672, %r13633, %r13507; + sub.f32 %r13673, %r13635, %r13508; + sub.f32 %r13674, %r13637, %r13507; + sub.f32 %r13675, %r13639, %r13508; + sub.f32 %r13676, %r13641, %r13509; + sub.f32 %r13677, %r13643, %r13510; + sub.f32 %r13678, %r13645, %r13509; + sub.f32 %r13679, %r13647, %r13510; + .loc 1 740 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:740:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + ex2.approx.ftz.f32 %r13680, %r13648; + ex2.approx.ftz.f32 %r13681, %r13649; + ex2.approx.ftz.f32 %r13682, %r13650; + ex2.approx.ftz.f32 %r13683, %r13651; + ex2.approx.ftz.f32 %r13684, %r13652; + ex2.approx.ftz.f32 %r13685, %r13653; + ex2.approx.ftz.f32 %r13686, %r13654; + ex2.approx.ftz.f32 %r13687, %r13655; + ex2.approx.ftz.f32 %r13688, %r13656; + ex2.approx.ftz.f32 %r13689, %r13657; + ex2.approx.ftz.f32 %r13690, %r13658; + ex2.approx.ftz.f32 %r13691, %r13659; + ex2.approx.ftz.f32 %r13692, %r13660; + ex2.approx.ftz.f32 %r13693, %r13661; + ex2.approx.ftz.f32 %r13694, %r13662; + ex2.approx.ftz.f32 %r13695, %r13663; + ex2.approx.ftz.f32 %r13696, %r13664; + ex2.approx.ftz.f32 %r13697, %r13665; + ex2.approx.ftz.f32 %r13698, %r13666; + ex2.approx.ftz.f32 %r13699, %r13667; + ex2.approx.ftz.f32 %r13700, %r13668; + ex2.approx.ftz.f32 %r13701, %r13669; + ex2.approx.ftz.f32 %r13702, %r13670; + ex2.approx.ftz.f32 %r13703, %r13671; + ex2.approx.ftz.f32 %r13704, %r13672; + ex2.approx.ftz.f32 %r13705, %r13673; + ex2.approx.ftz.f32 %r13706, %r13674; + ex2.approx.ftz.f32 %r13707, %r13675; + ex2.approx.ftz.f32 %r13708, %r13676; + ex2.approx.ftz.f32 %r13709, %r13677; + ex2.approx.ftz.f32 %r13710, %r13678; + ex2.approx.ftz.f32 %r13711, %r13679; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13712, %r7585, 49152; + add.s32 %r12820, %r13712, %r13473; + .loc 1 744 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:744:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16x2.f32 %r11905, %r13681, %r13680; + cvt.rn.bf16x2.f32 %r11906, %r13683, %r13682; + cvt.rn.bf16x2.f32 %r11907, %r13685, %r13684; + cvt.rn.bf16x2.f32 %r11908, %r13687, %r13686; + cvt.rn.bf16x2.f32 %r12037, %r13689, %r13688; + cvt.rn.bf16x2.f32 %r12038, %r13691, %r13690; + cvt.rn.bf16x2.f32 %r12039, %r13693, %r13692; + cvt.rn.bf16x2.f32 %r12040, %r13695, %r13694; + cvt.rn.bf16x2.f32 %r12169, %r13697, %r13696; + cvt.rn.bf16x2.f32 %r12170, %r13699, %r13698; + cvt.rn.bf16x2.f32 %r12171, %r13701, %r13700; + cvt.rn.bf16x2.f32 %r12172, %r13703, %r13702; + cvt.rn.bf16x2.f32 %r12301, %r13705, %r13704; + cvt.rn.bf16x2.f32 %r12302, %r13707, %r13706; + cvt.rn.bf16x2.f32 %r12303, %r13709, %r13708; + cvt.rn.bf16x2.f32 %r12304, %r13711, %r13710; + .loc 1 744 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:744:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + wgmma.fence.sync.aligned; + bfe.u32 %r13713, %r12820, 4, 14; + cvt.u64.u32 %rd1096, %r13713; + or.b64 %rd1010, %rd1096, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r11905,%r11906,%r11907,%r11908}, %rd1010, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13714, %r12820, 2048; + bfe.u32 %r13715, %r13714, 4, 14; + cvt.u64.u32 %rd1097, %r13715; + or.b64 %rd1011, %rd1097, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12037,%r12038,%r12039,%r12040}, %rd1011, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13716, %r12820, 4096; + bfe.u32 %r13717, %r13716, 4, 14; + cvt.u64.u32 %rd1098, %r13717; + or.b64 %rd1012, %rd1098, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12169,%r12170,%r12171,%r12172}, %rd1012, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13718, %r12820, 6144; + bfe.u32 %r13719, %r13718, 4, 14; + cvt.u64.u32 %rd1099, %r13719; + or.b64 %rd1013, %rd1099, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14924,%r14925,%r14926,%r14927,%r14928,%r14929,%r14930,%r14931,%r14932,%r14933,%r14934,%r14935,%r14936,%r14937,%r14938,%r14939,%r14940,%r14941,%r14942,%r14943,%r14944,%r14945,%r14946,%r14947,%r14948,%r14949,%r14950,%r14951,%r14952,%r14953,%r14954,%r14955,%r14956,%r14957,%r14958,%r14959,%r14960,%r14961,%r14962,%r14963,%r14964,%r14965,%r14966,%r14967,%r14968,%r14969,%r14970,%r14971,%r14972,%r14973,%r14974,%r14975,%r14976,%r14977,%r14978,%r14979,%r14980,%r14981,%r14982,%r14983,%r14984,%r14985,%r14986,%r14987}, {%r12301,%r12302,%r12303,%r12304}, %rd1013, %p1043, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13720, %r7585, 98816; + add.s32 %r13721, %r13720, %r13475; + add.s32 %r13722, %r13721, %r767; + ld.shared.v2.b32 {%r13723, %r13724}, [%r13722]; + ld.shared.v2.b32 {%r13725, %r13726}, [%r13722+32]; + ld.shared.v2.b32 {%r13727, %r13728}, [%r13722+64]; + ld.shared.v2.b32 {%r13729, %r13730}, [%r13722+96]; + ld.shared.v2.b32 {%r13731, %r13732}, [%r13722+128]; + ld.shared.v2.b32 {%r13733, %r13734}, [%r13722+160]; + ld.shared.v2.b32 {%r13735, %r13736}, [%r13722+192]; + ld.shared.v2.b32 {%r13737, %r13738}, [%r13722+224]; + .loc 1 750 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:750:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + wgmma.fence.sync.aligned; + add.s32 %r12817, %r7585, 132096; + add.s32 %r13739, %r13513, %r12817; + bfe.u32 %r13740, %r13739, 4, 14; + cvt.u64.u32 %rd1100, %r13740; + or.b64 %rd1014, %rd1100, 4611686293372403712; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1014, %rd1010, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r13741, %r13517, %r12817; + bfe.u32 %r13742, %r13741, 4, 14; + cvt.u64.u32 %rd1101, %r13742; + or.b64 %rd1016, %rd1101, 4611686293372403712; + add.s32 %r13743, %r12820, 32; + bfe.u32 %r13744, %r13743, 4, 14; + cvt.u64.u32 %rd1102, %r13744; + or.b64 %rd1017, %rd1102, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1016, %rd1017, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13745, %r13522, %r12817; + bfe.u32 %r13746, %r13745, 4, 14; + cvt.u64.u32 %rd1103, %r13746; + or.b64 %rd1018, %rd1103, 4611686293372403712; + add.s32 %r13747, %r12820, 64; + bfe.u32 %r13748, %r13747, 4, 14; + cvt.u64.u32 %rd1104, %r13748; + or.b64 %rd1019, %rd1104, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1018, %rd1019, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13749, %r13527, %r12817; + bfe.u32 %r13750, %r13749, 4, 14; + cvt.u64.u32 %rd1105, %r13750; + or.b64 %rd1020, %rd1105, 4611686293372403712; + add.s32 %r13751, %r12820, 96; + bfe.u32 %r13752, %r13751, 4, 14; + cvt.u64.u32 %rd1106, %r13752; + or.b64 %rd1021, %rd1106, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1020, %rd1021, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13753, %r13532, %r12817; + bfe.u32 %r13754, %r13753, 4, 14; + cvt.u64.u32 %rd1107, %r13754; + or.b64 %rd1022, %rd1107, 4611686293372403712; + add.s32 %r13755, %r12820, 8192; + bfe.u32 %r13756, %r13755, 4, 14; + cvt.u64.u32 %rd1108, %r13756; + or.b64 %rd1023, %rd1108, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1022, %rd1023, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13757, %r13537, %r12817; + bfe.u32 %r13758, %r13757, 4, 14; + cvt.u64.u32 %rd1109, %r13758; + or.b64 %rd1024, %rd1109, 4611686293372403712; + add.s32 %r13759, %r12820, 8224; + bfe.u32 %r13760, %r13759, 4, 14; + cvt.u64.u32 %rd1110, %r13760; + or.b64 %rd1025, %rd1110, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1024, %rd1025, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13761, %r13542, %r12817; + bfe.u32 %r13762, %r13761, 4, 14; + cvt.u64.u32 %rd1111, %r13762; + or.b64 %rd1026, %rd1111, 4611686293372403712; + add.s32 %r13763, %r12820, 8256; + bfe.u32 %r13764, %r13763, 4, 14; + cvt.u64.u32 %rd1112, %r13764; + or.b64 %rd1027, %rd1112, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1026, %rd1027, %p1043, 1, 1, 0, 0; + // end inline asm + add.s32 %r13765, %r13547, %r12817; + bfe.u32 %r13766, %r13765, 4, 14; + cvt.u64.u32 %rd1113, %r13766; + or.b64 %rd1028, %rd1113, 4611686293372403712; + add.s32 %r13767, %r12820, 8288; + bfe.u32 %r13768, %r13767, 4, 14; + cvt.u64.u32 %rd1114, %r13768; + or.b64 %rd1029, %rd1114, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432}, %rd1028, %rd1029, %p1043, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r12822, %r11772; + mov.b32 %r12818, %r11772; + mov.b32 %r12819, %r11772; + mov.b32 %r12821, %r11772; + // begin inline asm + // wait for regs: %r12401,%r12402,%r12403,%r12404,%r12405,%r12406,%r12407,%r12408,%r12409,%r12410,%r12411,%r12412,%r12413,%r12414,%r12415,%r12416,%r12417,%r12418,%r12419,%r12420,%r12421,%r12422,%r12423,%r12424,%r12425,%r12426,%r12427,%r12428,%r12429,%r12430,%r12431,%r12432,%r12817,%r12818,%r12819,%r12820,%r12821,%r12822 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + sub.f32 %r13769, %r12401, %r13723; + sub.f32 %r13770, %r12402, %r13724; + sub.f32 %r13771, %r12403, %r13723; + sub.f32 %r13772, %r12404, %r13724; + sub.f32 %r13773, %r12405, %r13725; + sub.f32 %r13774, %r12406, %r13726; + sub.f32 %r13775, %r12407, %r13725; + sub.f32 %r13776, %r12408, %r13726; + sub.f32 %r13777, %r12409, %r13727; + sub.f32 %r13778, %r12410, %r13728; + sub.f32 %r13779, %r12411, %r13727; + sub.f32 %r13780, %r12412, %r13728; + sub.f32 %r13781, %r12413, %r13729; + sub.f32 %r13782, %r12414, %r13730; + sub.f32 %r13783, %r12415, %r13729; + sub.f32 %r13784, %r12416, %r13730; + sub.f32 %r13785, %r12417, %r13731; + sub.f32 %r13786, %r12418, %r13732; + sub.f32 %r13787, %r12419, %r13731; + sub.f32 %r13788, %r12420, %r13732; + sub.f32 %r13789, %r12421, %r13733; + sub.f32 %r13790, %r12422, %r13734; + sub.f32 %r13791, %r12423, %r13733; + sub.f32 %r13792, %r12424, %r13734; + sub.f32 %r13793, %r12425, %r13735; + sub.f32 %r13794, %r12426, %r13736; + sub.f32 %r13795, %r12427, %r13735; + sub.f32 %r13796, %r12428, %r13736; + sub.f32 %r13797, %r12429, %r13737; + sub.f32 %r13798, %r12430, %r13738; + sub.f32 %r13799, %r12431, %r13737; + sub.f32 %r13800, %r12432, %r13738; + .loc 1 751 16 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:751:16 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.f32 %r13801, %r13680, %r13769; + mul.f32 %r13802, %r13681, %r13770; + mul.f32 %r13803, %r13682, %r13771; + mul.f32 %r13804, %r13683, %r13772; + mul.f32 %r13805, %r13684, %r13773; + mul.f32 %r13806, %r13685, %r13774; + mul.f32 %r13807, %r13686, %r13775; + mul.f32 %r13808, %r13687, %r13776; + mul.f32 %r13809, %r13688, %r13777; + mul.f32 %r13810, %r13689, %r13778; + mul.f32 %r13811, %r13690, %r13779; + mul.f32 %r13812, %r13691, %r13780; + mul.f32 %r13813, %r13692, %r13781; + mul.f32 %r13814, %r13693, %r13782; + mul.f32 %r13815, %r13694, %r13783; + mul.f32 %r13816, %r13695, %r13784; + mul.f32 %r13817, %r13696, %r13785; + mul.f32 %r13818, %r13697, %r13786; + mul.f32 %r13819, %r13698, %r13787; + mul.f32 %r13820, %r13699, %r13788; + mul.f32 %r13821, %r13700, %r13789; + mul.f32 %r13822, %r13701, %r13790; + mul.f32 %r13823, %r13702, %r13791; + mul.f32 %r13824, %r13703, %r13792; + mul.f32 %r13825, %r13704, %r13793; + mul.f32 %r13826, %r13705, %r13794; + mul.f32 %r13827, %r13706, %r13795; + mul.f32 %r13828, %r13707, %r13796; + mul.f32 %r13829, %r13708, %r13797; + mul.f32 %r13830, %r13709, %r13798; + mul.f32 %r13831, %r13710, %r13799; + mul.f32 %r13832, %r13711, %r13800; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs193, %r13801; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs194, %rs193, 0x0000, %p1102; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs195, %r13802; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs196, %rs195, 0x0000, %p1103; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs197, %r13803; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs198, %rs197, 0x0000, %p1102; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs199, %r13804; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs200, %rs199, 0x0000, %p1103; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs201, %r13805; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs202, %rs201, 0x0000, %p1104; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs203, %r13806; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs204, %rs203, 0x0000, %p1105; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs205, %r13807; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs206, %rs205, 0x0000, %p1104; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs207, %r13808; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs208, %rs207, 0x0000, %p1105; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs209, %r13809; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs210, %rs209, 0x0000, %p1106; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs211, %r13810; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs212, %rs211, 0x0000, %p1107; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs213, %r13811; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs214, %rs213, 0x0000, %p1106; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs215, %r13812; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs216, %rs215, 0x0000, %p1107; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs217, %r13813; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs218, %rs217, 0x0000, %p1108; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs219, %r13814; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs220, %rs219, 0x0000, %p1109; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs221, %r13815; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs222, %rs221, 0x0000, %p1108; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs223, %r13816; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs224, %rs223, 0x0000, %p1109; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs225, %r13817; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs226, %rs225, 0x0000, %p1110; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs227, %r13818; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs228, %rs227, 0x0000, %p1111; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs229, %r13819; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs230, %rs229, 0x0000, %p1110; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs231, %r13820; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs232, %rs231, 0x0000, %p1111; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs233, %r13821; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs234, %rs233, 0x0000, %p1112; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs235, %r13822; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs236, %rs235, 0x0000, %p1113; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs237, %r13823; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs238, %rs237, 0x0000, %p1112; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs239, %r13824; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs240, %rs239, 0x0000, %p1113; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs241, %r13825; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs242, %rs241, 0x0000, %p1114; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs243, %r13826; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs244, %rs243, 0x0000, %p1115; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs245, %r13827; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs246, %rs245, 0x0000, %p1114; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs247, %r13828; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs248, %rs247, 0x0000, %p1115; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs249, %r13829; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs250, %rs249, 0x0000, %p1116; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs251, %r13830; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs252, %rs251, 0x0000, %p1117; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs253, %r13831; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs254, %rs253, 0x0000, %p1116; + .loc 1 775 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + cvt.rn.bf16.f32 %rs255, %r13832; + .loc 1 759 70 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:759:70 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + selp.b16 %rs256, %rs255, 0x0000, %p1117; + .loc 1 775 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:775:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mov.b32 %r12989, {%rs194, %rs196}; + mov.b32 %r12990, {%rs198, %rs200}; + mov.b32 %r12991, {%rs202, %rs204}; + mov.b32 %r12992, {%rs206, %rs208}; + mov.b32 %r13121, {%rs210, %rs212}; + mov.b32 %r13122, {%rs214, %rs216}; + mov.b32 %r13123, {%rs218, %rs220}; + mov.b32 %r13124, {%rs222, %rs224}; + mov.b32 %r13253, {%rs226, %rs228}; + mov.b32 %r13254, {%rs230, %rs232}; + mov.b32 %r13255, {%rs234, %rs236}; + mov.b32 %r13256, {%rs238, %rs240}; + mov.b32 %r13385, {%rs242, %rs244}; + mov.b32 %r13386, {%rs246, %rs248}; + mov.b32 %r13387, {%rs250, %rs252}; + mov.b32 %r13388, {%rs254, %rs256}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r12989,%r12990,%r12991,%r12992}, %rd995, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13833, %r11774, 2048; + bfe.u32 %r13834, %r13833, 4, 14; + cvt.u64.u32 %rd1115, %r13834; + or.b64 %rd1031, %rd1115, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13121,%r13122,%r13123,%r13124}, %rd1031, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13835, %r11774, 4096; + bfe.u32 %r13836, %r13835, 4, 14; + cvt.u64.u32 %rd1116, %r13836; + or.b64 %rd1032, %rd1116, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13253,%r13254,%r13255,%r13256}, %rd1032, %p1043, 1, 1, 1; + // end inline asm + add.s32 %r13837, %r11774, 6144; + bfe.u32 %r13838, %r13837, 4, 14; + cvt.u64.u32 %rd1117, %r13838; + or.b64 %rd1033, %rd1117, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14988,%r14989,%r14990,%r14991,%r14992,%r14993,%r14994,%r14995,%r14996,%r14997,%r14998,%r14999,%r15000,%r15001,%r15002,%r15003,%r15004,%r15005,%r15006,%r15007,%r15008,%r15009,%r15010,%r15011,%r15012,%r15013,%r15014,%r15015,%r15016,%r15017,%r15018,%r15019,%r15020,%r15021,%r15022,%r15023,%r15024,%r15025,%r15026,%r15027,%r15028,%r15029,%r15030,%r15031,%r15032,%r15033,%r15034,%r15035,%r15036,%r15037,%r15038,%r15039,%r15040,%r15041,%r15042,%r15043,%r15044,%r15045,%r15046,%r15047,%r15048,%r15049,%r15050,%r15051}, {%r13385,%r13386,%r13387,%r13388}, %rd1033, %p1043, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r2075, %r15371, 1; + .loc 1 788 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:788:33 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shr.u32 %r13839, %r2075, 1; + .loc 1 789 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mad.wide.u32 %rd1035, %r13839, 4, %rd563; + .loc 1 789 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + mov.u64 %rd1034, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd1034, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r13389, 0x0; + @%p1065 ld.global.L1::evict_last.L2::cache_hint.b32 { %r13389 }, [ %rd1035 + 0 ], %rd1034; + // end inline asm + .loc 1 790 109 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:109 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13840, %r13839, 1; + .loc 1 790 113 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:113 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1134, %r13840, %r7563; + .loc 1 790 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:55 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd1038, %rd1035, 4; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + and.pred %p1066, %p1065, %p1134; + .loc 1 790 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + // begin inline asm + mov.u64 %rd1037, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd1037, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r13390, 0x0; + @%p1066 ld.global.L1::evict_last.L2::cache_hint.b32 { %r13390 }, [ %rd1038 + 0 ], %rd1037; + // end inline asm + .loc 1 791 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:791:35 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + and.b32 %r13841, %r15371, 1; + .loc 1 792 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:34 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + sub.s32 %r13842, %r13390, %r13389; + .loc 1 792 48 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:48 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13843, %r13842, 7; + .loc 1 792 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13844, %r13843, -64; + .loc 1 793 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + xor.b32 %r13845, %r13841, 1; + .loc 1 793 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13846, %r13841, 6; + .loc 1 793 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mad.lo.s32 %r13847, %r13844, %r13845, %r13846; + .loc 1 626 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13848, %r13847, 12; + .loc 1 626 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:626:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.wide.s32 %rd1118, %r13848, 2; + add.s64 %rd1209, %rd1209, %rd1118; + add.s64 %rd1208, %rd1208, %rd1118; + add.s64 %rd1207, %rd1207, %rd1118; + add.s64 %rd1206, %rd1206, %rd1118; + .loc 1 627 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13849, %r13847, 7; + .loc 1 627 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:627:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.wide.s32 %rd1119, %r13849, 2; + add.s64 %rd1205, %rd1205, %rd1119; + add.s64 %rd1204, %rd1204, %rd1119; + add.s64 %rd1203, %rd1203, %rd1119; + add.s64 %rd1202, %rd1202, %rd1119; + .loc 1 628 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:628:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r2076, %r13847, %r15234; + add.s32 %r2077, %r13847, %r15233; + add.s32 %r2078, %r13847, %r15232; + add.s32 %r2079, %r13847, %r15231; + add.s32 %r2080, %r13847, %r15230; + add.s32 %r2081, %r13847, %r15229; + add.s32 %r2082, %r13847, %r15228; + add.s32 %r2083, %r13847, %r15227; + add.s32 %r2084, %r13847, %r15226; + add.s32 %r2085, %r13847, %r15225; + add.s32 %r2086, %r13847, %r15224; + add.s32 %r2087, %r13847, %r15223; + add.s32 %r2088, %r13847, %r15222; + add.s32 %r2089, %r13847, %r15221; + add.s32 %r2090, %r13847, %r15220; + add.s32 %r2091, %r13847, %r15219; + add.s32 %r15239, %r13847, %r15239; + add.s32 %r15240, %r13847, %r15240; + add.s32 %r15241, %r13847, %r15241; + add.s32 %r15242, %r13847, %r15242; + add.s32 %r15235, %r13847, %r15235; + add.s32 %r15236, %r13847, %r15236; + add.s32 %r15237, %r13847, %r15237; + add.s32 %r15238, %r13847, %r15238; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13850, %r15216, 1; + setp.gt.s32 %p1135, %r13850, 1; + selp.b32 %r15216, 0, %r13850, %p1135; + add.s32 %r13851, %r15218, 1; + setp.gt.s32 %p1136, %r13851, 2; + selp.b32 %r15218, 0, %r13851, %p1136; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1137, %r15239, %r2358; + setp.lt.s32 %p1138, %r15240, %r2358; + setp.lt.s32 %p1139, %r15241, %r2358; + setp.lt.s32 %p1140, %r15242, %r2358; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13852, %r15218, 14; + add.s32 %r13853, %r7585, %r13852; + bar.sync 0; + add.s32 %r13391, %r13853, %r745; + selp.b32 %r13854, 16, 0, %p1137; + selp.b32 %r13392, %r13854, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13391 + 0 ], [ %rd1209 + 0 ], 0x10, %r13392; + // end inline asm + add.s32 %r13393, %r13391, 2048; + selp.b32 %r13855, 16, 0, %p1138; + selp.b32 %r13394, %r13855, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13393 + 0 ], [ %rd1208 + 0 ], 0x10, %r13394; + // end inline asm + add.s32 %r13395, %r13391, 4096; + selp.b32 %r13856, 16, 0, %p1139; + selp.b32 %r13396, %r13856, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13395 + 0 ], [ %rd1207 + 0 ], 0x10, %r13396; + // end inline asm + add.s32 %r13397, %r13391, 6144; + selp.b32 %r13857, 16, 0, %p1140; + selp.b32 %r13398, %r13857, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13397 + 0 ], [ %rd1206 + 0 ], 0x10, %r13398; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1141, %r2076, %r2358; + setp.lt.s32 %p1142, %r2077, %r2358; + setp.lt.s32 %p1143, %r2078, %r2358; + setp.lt.s32 %p1144, %r2079, %r2358; + setp.lt.s32 %p1145, %r2080, %r2358; + setp.lt.s32 %p1146, %r2081, %r2358; + setp.lt.s32 %p1147, %r2082, %r2358; + setp.lt.s32 %p1148, %r2083, %r2358; + setp.lt.s32 %p1149, %r2084, %r2358; + setp.lt.s32 %p1150, %r2085, %r2358; + setp.lt.s32 %p1151, %r2086, %r2358; + setp.lt.s32 %p1152, %r2087, %r2358; + setp.lt.s32 %p1153, %r2088, %r2358; + setp.lt.s32 %p1154, %r2089, %r2358; + setp.lt.s32 %p1155, %r2090, %r2358; + setp.lt.s32 %p1156, %r2091, %r2358; + .loc 1 674 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + mul.wide.s32 %rd1120, %r2076, 4; + add.s64 %rd1044, %rd142, %rd1120; + mul.wide.s32 %rd1121, %r2077, 4; + add.s64 %rd1045, %rd142, %rd1121; + mul.wide.s32 %rd1122, %r2078, 4; + add.s64 %rd1046, %rd142, %rd1122; + mul.wide.s32 %rd1123, %r2079, 4; + add.s64 %rd1047, %rd142, %rd1123; + mul.wide.s32 %rd1124, %r2080, 4; + add.s64 %rd1048, %rd142, %rd1124; + mul.wide.s32 %rd1125, %r2081, 4; + add.s64 %rd1049, %rd142, %rd1125; + mul.wide.s32 %rd1126, %r2082, 4; + add.s64 %rd1050, %rd142, %rd1126; + mul.wide.s32 %rd1127, %r2083, 4; + add.s64 %rd1051, %rd142, %rd1127; + mul.wide.s32 %rd1128, %r2084, 4; + add.s64 %rd1052, %rd142, %rd1128; + mul.wide.s32 %rd1129, %r2085, 4; + add.s64 %rd1053, %rd142, %rd1129; + mul.wide.s32 %rd1130, %r2086, 4; + add.s64 %rd1054, %rd142, %rd1130; + mul.wide.s32 %rd1131, %r2087, 4; + add.s64 %rd1055, %rd142, %rd1131; + mul.wide.s32 %rd1132, %r2088, 4; + add.s64 %rd1056, %rd142, %rd1132; + mul.wide.s32 %rd1133, %r2089, 4; + add.s64 %rd1057, %rd142, %rd1133; + mul.wide.s32 %rd1134, %r2090, 4; + add.s64 %rd1058, %rd142, %rd1134; + mul.wide.s32 %rd1135, %r2091, 4; + add.s64 %rd1059, %rd142, %rd1135; + .loc 1 674 22 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:674:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + shl.b32 %r13858, %r15216, 8; + add.s32 %r13859, %r13476, %r13858; + add.s32 %r13399, %r13859, %r767; + selp.b32 %r13860, 4, 0, %p1141; + selp.b32 %r13440, %r13860, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13399 + 0 ], [ %rd1044 + 0 ], 0x4, %r13440; + // end inline asm + add.s32 %r13401, %r13399, 4; + selp.b32 %r13861, 4, 0, %p1142; + selp.b32 %r13442, %r13861, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13401 + 0 ], [ %rd1045 + 0 ], 0x4, %r13442; + // end inline asm + add.s32 %r13403, %r13399, 32; + selp.b32 %r13862, 4, 0, %p1143; + selp.b32 %r13444, %r13862, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13403 + 0 ], [ %rd1046 + 0 ], 0x4, %r13444; + // end inline asm + add.s32 %r13405, %r13399, 36; + selp.b32 %r13863, 4, 0, %p1144; + selp.b32 %r13446, %r13863, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13405 + 0 ], [ %rd1047 + 0 ], 0x4, %r13446; + // end inline asm + add.s32 %r13407, %r13399, 64; + selp.b32 %r13864, 4, 0, %p1145; + selp.b32 %r13448, %r13864, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13407 + 0 ], [ %rd1048 + 0 ], 0x4, %r13448; + // end inline asm + add.s32 %r13409, %r13399, 68; + selp.b32 %r13865, 4, 0, %p1146; + selp.b32 %r13450, %r13865, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13409 + 0 ], [ %rd1049 + 0 ], 0x4, %r13450; + // end inline asm + add.s32 %r13411, %r13399, 96; + selp.b32 %r13866, 4, 0, %p1147; + selp.b32 %r13452, %r13866, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13411 + 0 ], [ %rd1050 + 0 ], 0x4, %r13452; + // end inline asm + add.s32 %r13413, %r13399, 100; + selp.b32 %r13867, 4, 0, %p1148; + selp.b32 %r13454, %r13867, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13413 + 0 ], [ %rd1051 + 0 ], 0x4, %r13454; + // end inline asm + add.s32 %r13415, %r13399, 128; + selp.b32 %r13868, 4, 0, %p1149; + selp.b32 %r13456, %r13868, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13415 + 0 ], [ %rd1052 + 0 ], 0x4, %r13456; + // end inline asm + add.s32 %r13417, %r13399, 132; + selp.b32 %r13869, 4, 0, %p1150; + selp.b32 %r13458, %r13869, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13417 + 0 ], [ %rd1053 + 0 ], 0x4, %r13458; + // end inline asm + add.s32 %r13419, %r13399, 160; + selp.b32 %r13870, 4, 0, %p1151; + selp.b32 %r13460, %r13870, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13419 + 0 ], [ %rd1054 + 0 ], 0x4, %r13460; + // end inline asm + add.s32 %r13421, %r13399, 164; + selp.b32 %r13871, 4, 0, %p1152; + selp.b32 %r13462, %r13871, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13421 + 0 ], [ %rd1055 + 0 ], 0x4, %r13462; + // end inline asm + add.s32 %r13423, %r13399, 192; + selp.b32 %r13872, 4, 0, %p1153; + selp.b32 %r13464, %r13872, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13423 + 0 ], [ %rd1056 + 0 ], 0x4, %r13464; + // end inline asm + add.s32 %r13425, %r13399, 196; + selp.b32 %r13873, 4, 0, %p1154; + selp.b32 %r13466, %r13873, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13425 + 0 ], [ %rd1057 + 0 ], 0x4, %r13466; + // end inline asm + add.s32 %r13427, %r13399, 224; + selp.b32 %r13874, 4, 0, %p1155; + selp.b32 %r13468, %r13874, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13427 + 0 ], [ %rd1058 + 0 ], 0x4, %r13468; + // end inline asm + add.s32 %r13429, %r13399, 228; + selp.b32 %r13875, 4, 0, %p1156; + selp.b32 %r13470, %r13875, 0, %p1099; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13429 + 0 ], [ %rd1059 + 0 ], 0x4, %r13470; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.lt.s32 %p1157, %r15235, %r2358; + setp.lt.s32 %p1158, %r15236, %r2358; + setp.lt.s32 %p1159, %r15237, %r2358; + setp.lt.s32 %p1160, %r15238, %r2358; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13876, %r13712, %r13852; + add.s32 %r13431, %r13876, %r745; + selp.b32 %r13877, 16, 0, %p1157; + selp.b32 %r13432, %r13877, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13431 + 0 ], [ %rd1205 + 0 ], 0x10, %r13432; + // end inline asm + add.s32 %r13433, %r13431, 2048; + selp.b32 %r13878, 16, 0, %p1158; + selp.b32 %r13434, %r13878, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13433 + 0 ], [ %rd1204 + 0 ], 0x10, %r13434; + // end inline asm + add.s32 %r13435, %r13431, 4096; + selp.b32 %r13879, 16, 0, %p1159; + selp.b32 %r13436, %r13879, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13435 + 0 ], [ %rd1203 + 0 ], 0x10, %r13436; + // end inline asm + add.s32 %r13437, %r13431, 6144; + selp.b32 %r13880, 16, 0, %p1160; + selp.b32 %r13438, %r13880, 0, %p1099; + // begin inline asm + cp.async.cg.shared.global [ %r13437 + 0 ], [ %rd1202 + 0 ], 0x10, %r13438; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s64 %rd1064, %rd143, %rd1120; + add.s64 %rd1065, %rd143, %rd1121; + add.s64 %rd1066, %rd143, %rd1122; + add.s64 %rd1067, %rd143, %rd1123; + add.s64 %rd1068, %rd143, %rd1124; + add.s64 %rd1069, %rd143, %rd1125; + add.s64 %rd1070, %rd143, %rd1126; + add.s64 %rd1071, %rd143, %rd1127; + add.s64 %rd1072, %rd143, %rd1128; + add.s64 %rd1073, %rd143, %rd1129; + add.s64 %rd1074, %rd143, %rd1130; + add.s64 %rd1075, %rd143, %rd1131; + add.s64 %rd1076, %rd143, %rd1132; + add.s64 %rd1077, %rd143, %rd1133; + add.s64 %rd1078, %rd143, %rd1134; + add.s64 %rd1079, %rd143, %rd1135; + .loc 1 748 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:748:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + add.s32 %r13881, %r13720, %r13858; + add.s32 %r13439, %r13881, %r767; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13439 + 0 ], [ %rd1064 + 0 ], 0x4, %r13440; + // end inline asm + add.s32 %r13441, %r13439, 4; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13441 + 0 ], [ %rd1065 + 0 ], 0x4, %r13442; + // end inline asm + add.s32 %r13443, %r13439, 32; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13443 + 0 ], [ %rd1066 + 0 ], 0x4, %r13444; + // end inline asm + add.s32 %r13445, %r13439, 36; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13445 + 0 ], [ %rd1067 + 0 ], 0x4, %r13446; + // end inline asm + add.s32 %r13447, %r13439, 64; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13447 + 0 ], [ %rd1068 + 0 ], 0x4, %r13448; + // end inline asm + add.s32 %r13449, %r13439, 68; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13449 + 0 ], [ %rd1069 + 0 ], 0x4, %r13450; + // end inline asm + add.s32 %r13451, %r13439, 96; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13451 + 0 ], [ %rd1070 + 0 ], 0x4, %r13452; + // end inline asm + add.s32 %r13453, %r13439, 100; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13453 + 0 ], [ %rd1071 + 0 ], 0x4, %r13454; + // end inline asm + add.s32 %r13455, %r13439, 128; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13455 + 0 ], [ %rd1072 + 0 ], 0x4, %r13456; + // end inline asm + add.s32 %r13457, %r13439, 132; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13457 + 0 ], [ %rd1073 + 0 ], 0x4, %r13458; + // end inline asm + add.s32 %r13459, %r13439, 160; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13459 + 0 ], [ %rd1074 + 0 ], 0x4, %r13460; + // end inline asm + add.s32 %r13461, %r13439, 164; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13461 + 0 ], [ %rd1075 + 0 ], 0x4, %r13462; + // end inline asm + add.s32 %r13463, %r13439, 192; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13463 + 0 ], [ %rd1076 + 0 ], 0x4, %r13464; + // end inline asm + add.s32 %r13465, %r13439, 196; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13465 + 0 ], [ %rd1077 + 0 ], 0x4, %r13466; + // end inline asm + add.s32 %r13467, %r13439, 224; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13467 + 0 ], [ %rd1078 + 0 ], 0x4, %r13468; + // end inline asm + add.s32 %r13469, %r13439, 228; + // begin inline asm + @%p978 cp.async.ca.shared.global [ %r13469 + 0 ], [ %rd1079 + 0 ], 0x4, %r13470; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:610:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:318:20 ] + setp.ne.b32 %p1161, %r967, %r2075; + mov.b32 %r15199, %r15219; + mov.b32 %r15200, %r15220; + mov.b32 %r15201, %r15221; + mov.b32 %r15202, %r15222; + mov.b32 %r15203, %r15223; + mov.b32 %r15204, %r15224; + mov.b32 %r15205, %r15225; + mov.b32 %r15206, %r15226; + mov.b32 %r15207, %r15227; + mov.b32 %r15208, %r15228; + mov.b32 %r15209, %r15229; + mov.b32 %r15210, %r15230; + mov.b32 %r15211, %r15231; + mov.b32 %r15212, %r15232; + mov.b32 %r15213, %r15233; + mov.b32 %r15214, %r15234; + mov.b32 %r15219, %r2091; + mov.b32 %r15220, %r2090; + mov.b32 %r15221, %r2089; + mov.b32 %r15222, %r2088; + mov.b32 %r15223, %r2087; + mov.b32 %r15224, %r2086; + mov.b32 %r15225, %r2085; + mov.b32 %r15226, %r2084; + mov.b32 %r15227, %r2083; + mov.b32 %r15228, %r2082; + mov.b32 %r15229, %r2081; + mov.b32 %r15230, %r2080; + mov.b32 %r15231, %r2079; + mov.b32 %r15232, %r2078; + mov.b32 %r15233, %r2077; + mov.b32 %r15234, %r2076; + mov.b32 %r15371, %r2075; + @%p1161 bra $L__BB0_14; + bra.uni $L__BB0_15; +$L__tmp19: +$L__BB0_1: + .loc 1 0 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0:28 + ld.param.b32 %r2362, [triton_tem_fused_zeros_1_param_22]; + ld.param.b32 %r2361, [triton_tem_fused_zeros_1_param_21]; + ld.param.b32 %r2360, [triton_tem_fused_zeros_1_param_20]; + ld.param.b64 %rd204, [triton_tem_fused_zeros_1_param_13]; + ld.param.b64 %rd203, [triton_tem_fused_zeros_1_param_12]; + ld.param.b64 %rd200, [triton_tem_fused_zeros_1_param_9]; + ld.param.b64 %rd199, [triton_tem_fused_zeros_1_param_8]; + ld.param.b64 %rd198, [triton_tem_fused_zeros_1_param_6]; +$L__tmp20: + .loc 2 41 22 // standard.py:41:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:113:34 ] + add.s32 %r2548, %r2358, 127; + .loc 2 41 28 // standard.py:41:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:113:34 ] + shr.s32 %r2549, %r2548, 31; + shr.u32 %r2550, %r2549, 25; + add.s32 %r2551, %r2548, %r2550; + shr.s32 %r2552, %r2551, 7; +$L__tmp21: + .loc 1 140 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:140:24 + sub.s32 %r2553, %r4, %r5; + .loc 1 144 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:144:29 + div.s32 %r2555, %r2553, %r2552; + .loc 1 144 54 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:144:54 + shl.b32 %r2556, %r7, 2; + .loc 1 144 44 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:144:44 + add.s32 %r2557, %r2555, %r2556; + .loc 1 145 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:145:35 + mul.lo.s32 %r2558, %r2555, %r2552; + sub.s32 %r2559, %r2553, %r2558; + .loc 1 154 78 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:154:78 + mad.lo.s32 %r2560, %r2360, %r8, %r2559; + .loc 1 155 68 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:155:68 + mad.lo.s32 %r2561, %r2361, %r8, %r2559; + mul.lo.s32 %r2562, %r2561, %r2362; + .loc 1 158 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:158:30 + shl.b32 %r2563, %r2557, 7; + .loc 1 158 40 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:158:40 + mad.lo.s32 %r2564, %r1, %r6, %r2563; + .loc 1 159 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:159:55 + mul.lo.s32 %r2565, %r2, %r6; + .loc 1 159 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:159:42 + mad.lo.s32 %r2566, %r2557, %r3, %r2565; + .loc 1 161 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:161:30 + shl.b32 %r2567, %r6, 5; + .loc 1 161 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:161:35 + add.s32 %r2568, %r2557, %r2567; + .loc 1 161 46 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:161:46 + mul.lo.s32 %r2569, %r2568, %r2358; + .loc 1 163 17 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:163:17 + mul.wide.s32 %rd253, %r2564, 2; + add.s64 %rd254, %rd194, %rd253; + .loc 1 164 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:164:19 + mad.wide.s32 %rd255, %r2566, 2, %rd197; + .loc 1 168 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:168:21 + mul.wide.s32 %rd256, %r2569, 4; + add.s64 %rd257, %rd195, %rd256; + .loc 1 169 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:169:25 + add.s64 %rd258, %rd196, %rd256; + .loc 1 174 36 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:174:36 + shl.b32 %r2570, %r2559, 7; + .loc 1 175 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:175:29 + or.b32 %r23, %r2570, %r13; + or.b32 %r24, %r2570, %r14; + or.b32 %r25, %r2570, %r15; + or.b32 %r26, %r2570, %r16; + or.b32 %r27, %r2570, %r17; + or.b32 %r28, %r2570, %r18; + or.b32 %r29, %r2570, %r19; + or.b32 %r30, %r2570, %r20; + or.b32 %r31, %r2570, %r21; + or.b32 %r32, %r2570, %r22; +$L__tmp22: + .loc 1 825 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + shl.b32 %r2571, %r23, 12; + shl.b32 %r2572, %r24, 12; + shl.b32 %r2573, %r25, 12; + shl.b32 %r2574, %r26, 12; + shl.b32 %r2575, %r27, 12; + shl.b32 %r2576, %r28, 12; + shl.b32 %r2577, %r29, 12; + shl.b32 %r2578, %r30, 12; + .loc 1 825 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + mad.wide.s32 %rd259, %r2571, 2, %rd254; + mad.wide.s32 %rd260, %r2572, 2, %rd254; + mad.wide.s32 %rd261, %r2573, 2, %rd254; + mad.wide.s32 %rd262, %r2574, 2, %rd254; + mad.wide.s32 %rd263, %r2575, 2, %rd254; + mad.wide.s32 %rd264, %r2576, 2, %rd254; + mad.wide.s32 %rd265, %r2577, 2, %rd254; + mad.wide.s32 %rd266, %r2578, 2, %rd254; + .loc 1 825 56 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:56 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + shl.b32 %r2579, %r10, 3; + and.b32 %r2580, %r2579, 120; + .loc 1 825 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + cvt.u64.u32 %rd13, %r2580; + mul.wide.u32 %rd267, %r2580, 2; + add.s64 %rd213, %rd259, %rd267; + add.s64 %rd214, %rd260, %rd267; + add.s64 %rd215, %rd261, %rd267; + add.s64 %rd216, %rd262, %rd267; + add.s64 %rd217, %rd263, %rd267; + add.s64 %rd218, %rd264, %rd267; + add.s64 %rd219, %rd265, %rd267; + add.s64 %rd220, %rd266, %rd267; + .loc 1 833 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + setp.lt.s32 %p14, %r23, %r2358; + setp.lt.s32 %p15, %r24, %r2358; + setp.lt.s32 %p16, %r25, %r2358; + setp.lt.s32 %p17, %r26, %r2358; + setp.lt.s32 %p18, %r27, %r2358; + setp.lt.s32 %p19, %r28, %r2358; + setp.lt.s32 %p20, %r29, %r2358; + setp.lt.s32 %p21, %r30, %r2358; + mov.b32 %r14520, 0; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:178:107 ] + // begin inline asm + mov.u32 %r2381, %r14520; + mov.u32 %r2382, %r14520; + mov.u32 %r2383, %r14520; + mov.u32 %r2384, %r14520; + @%p14 ld.global.v4.b32 { %r2381, %r2382, %r2383, %r2384 }, [ %rd213 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2389, %r14520; + mov.u32 %r2390, %r14520; + mov.u32 %r2391, %r14520; + mov.u32 %r2392, %r14520; + @%p15 ld.global.v4.b32 { %r2389, %r2390, %r2391, %r2392 }, [ %rd214 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2397, %r14520; + mov.u32 %r2398, %r14520; + mov.u32 %r2399, %r14520; + mov.u32 %r2400, %r14520; + @%p16 ld.global.v4.b32 { %r2397, %r2398, %r2399, %r2400 }, [ %rd215 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2405, %r14520; + mov.u32 %r2406, %r14520; + mov.u32 %r2407, %r14520; + mov.u32 %r2408, %r14520; + @%p17 ld.global.v4.b32 { %r2405, %r2406, %r2407, %r2408 }, [ %rd216 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2413, %r14520; + mov.u32 %r2414, %r14520; + mov.u32 %r2415, %r14520; + mov.u32 %r2416, %r14520; + @%p18 ld.global.v4.b32 { %r2413, %r2414, %r2415, %r2416 }, [ %rd217 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2421, %r14520; + mov.u32 %r2422, %r14520; + mov.u32 %r2423, %r14520; + mov.u32 %r2424, %r14520; + @%p19 ld.global.v4.b32 { %r2421, %r2422, %r2423, %r2424 }, [ %rd218 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2429, %r14520; + mov.u32 %r2430, %r14520; + mov.u32 %r2431, %r14520; + mov.u32 %r2432, %r14520; + @%p20 ld.global.v4.b32 { %r2429, %r2430, %r2431, %r2432 }, [ %rd219 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2437, %r14520; + mov.u32 %r2438, %r14520; + mov.u32 %r2439, %r14520; + mov.u32 %r2440, %r14520; + @%p21 ld.global.v4.b32 { %r2437, %r2438, %r2439, %r2440 }, [ %rd220 + 0 ]; + // end inline asm + shl.b32 %r2581, %r10, 4; + and.b32 %r2582, %r2581, 112; + shl.b32 %r2583, %r12, 3; + and.b32 %r2584, %r10, 112; + and.b32 %r2585, %r10, 8; + shl.b32 %r2586, %r2585, 11; + or.b32 %r2587, %r2582, %r2583; + xor.b32 %r2588, %r2587, %r2584; + or.b32 %r2589, %r2588, %r2586; + mov.b32 %r2590, global_smem; + add.s32 %r2591, %r2590, %r2589; + st.shared.v4.b32 [%r2591+98304], {%r2381, %r2382, %r2383, %r2384}; + st.shared.v4.b32 [%r2591+100352], {%r2389, %r2390, %r2391, %r2392}; + st.shared.v4.b32 [%r2591+102400], {%r2397, %r2398, %r2399, %r2400}; + st.shared.v4.b32 [%r2591+104448], {%r2405, %r2406, %r2407, %r2408}; + st.shared.v4.b32 [%r2591+106496], {%r2413, %r2414, %r2415, %r2416}; + st.shared.v4.b32 [%r2591+108544], {%r2421, %r2422, %r2423, %r2424}; + st.shared.v4.b32 [%r2591+110592], {%r2429, %r2430, %r2431, %r2432}; + st.shared.v4.b32 [%r2591+112640], {%r2437, %r2438, %r2439, %r2440}; +$L__tmp23: + .loc 1 825 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:179:111 ] + shl.b32 %r2592, %r23, 7; + shl.b32 %r2593, %r24, 7; + shl.b32 %r2594, %r25, 7; + shl.b32 %r2595, %r26, 7; + shl.b32 %r2596, %r27, 7; + shl.b32 %r2597, %r28, 7; + shl.b32 %r2598, %r29, 7; + shl.b32 %r2599, %r30, 7; + .loc 1 825 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:179:111 ] + mad.wide.s32 %rd268, %r2592, 2, %rd255; + mad.wide.s32 %rd269, %r2593, 2, %rd255; + mad.wide.s32 %rd270, %r2594, 2, %rd255; + mad.wide.s32 %rd271, %r2595, 2, %rd255; + mad.wide.s32 %rd272, %r2596, 2, %rd255; + mad.wide.s32 %rd273, %r2597, 2, %rd255; + mad.wide.s32 %rd274, %r2598, 2, %rd255; + mad.wide.s32 %rd275, %r2599, 2, %rd255; + .loc 1 825 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:825:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:179:111 ] + add.s64 %rd221, %rd268, %rd267; + add.s64 %rd222, %rd269, %rd267; + add.s64 %rd223, %rd270, %rd267; + add.s64 %rd224, %rd271, %rd267; + add.s64 %rd225, %rd272, %rd267; + add.s64 %rd226, %rd273, %rd267; + add.s64 %rd227, %rd274, %rd267; + add.s64 %rd228, %rd275, %rd267; + .loc 1 833 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:833:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:179:111 ] + // begin inline asm + mov.u32 %r2445, %r14520; + mov.u32 %r2446, %r14520; + mov.u32 %r2447, %r14520; + mov.u32 %r2448, %r14520; + @%p14 ld.global.v4.b32 { %r2445, %r2446, %r2447, %r2448 }, [ %rd221 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2453, %r14520; + mov.u32 %r2454, %r14520; + mov.u32 %r2455, %r14520; + mov.u32 %r2456, %r14520; + @%p15 ld.global.v4.b32 { %r2453, %r2454, %r2455, %r2456 }, [ %rd222 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2461, %r14520; + mov.u32 %r2462, %r14520; + mov.u32 %r2463, %r14520; + mov.u32 %r2464, %r14520; + @%p16 ld.global.v4.b32 { %r2461, %r2462, %r2463, %r2464 }, [ %rd223 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2469, %r14520; + mov.u32 %r2470, %r14520; + mov.u32 %r2471, %r14520; + mov.u32 %r2472, %r14520; + @%p17 ld.global.v4.b32 { %r2469, %r2470, %r2471, %r2472 }, [ %rd224 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2477, %r14520; + mov.u32 %r2478, %r14520; + mov.u32 %r2479, %r14520; + mov.u32 %r2480, %r14520; + @%p18 ld.global.v4.b32 { %r2477, %r2478, %r2479, %r2480 }, [ %rd225 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2485, %r14520; + mov.u32 %r2486, %r14520; + mov.u32 %r2487, %r14520; + mov.u32 %r2488, %r14520; + @%p19 ld.global.v4.b32 { %r2485, %r2486, %r2487, %r2488 }, [ %rd226 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2493, %r14520; + mov.u32 %r2494, %r14520; + mov.u32 %r2495, %r14520; + mov.u32 %r2496, %r14520; + @%p20 ld.global.v4.b32 { %r2493, %r2494, %r2495, %r2496 }, [ %rd227 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2501, %r14520; + mov.u32 %r2502, %r14520; + mov.u32 %r2503, %r14520; + mov.u32 %r2504, %r14520; + @%p21 ld.global.v4.b32 { %r2501, %r2502, %r2503, %r2504 }, [ %rd228 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r2591+131072], {%r2445, %r2446, %r2447, %r2448}; + st.shared.v4.b32 [%r2591+133120], {%r2453, %r2454, %r2455, %r2456}; + st.shared.v4.b32 [%r2591+135168], {%r2461, %r2462, %r2463, %r2464}; + st.shared.v4.b32 [%r2591+137216], {%r2469, %r2470, %r2471, %r2472}; + st.shared.v4.b32 [%r2591+139264], {%r2477, %r2478, %r2479, %r2480}; + st.shared.v4.b32 [%r2591+141312], {%r2485, %r2486, %r2487, %r2488}; + st.shared.v4.b32 [%r2591+143360], {%r2493, %r2494, %r2495, %r2496}; + st.shared.v4.b32 [%r2591+145408], {%r2501, %r2502, %r2503, %r2504}; +$L__tmp24: + .loc 1 188 58 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:188:58 + setp.lt.s32 %p30, %r31, %r2358; + setp.lt.s32 %p31, %r32, %r2358; + .loc 1 188 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:188:34 + mul.wide.s32 %rd276, %r31, 4; + add.s64 %rd229, %rd258, %rd276; + cvt.s64.s32 %rd277, %r2570; + cvt.u64.u32 %rd278, %r21; + or.b64 %rd279, %rd277, %rd278; + shl.b64 %rd280, %rd279, 2; + add.s64 %rd281, %rd258, %rd280; + add.s64 %rd230, %rd281, 32; + .loc 1 188 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:188:25 + // begin inline asm + mov.u32 %r2509, 0x0; + @%p30 ld.global.b32 { %r2509 }, [ %rd229 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2510, 0x0; + @%p31 ld.global.b32 { %r2510 }, [ %rd230 + 0 ]; + // end inline asm + .loc 1 189 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:189:33 + add.s64 %rd231, %rd257, %rd276; + add.s64 %rd282, %rd257, %rd280; + add.s64 %rd232, %rd282, 32; + .loc 1 189 26 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:189:26 + // begin inline asm + mov.u32 %r2511, 0x0; + @%p30 ld.global.b32 { %r2511 }, [ %rd231 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2512, 0x0; + @%p31 ld.global.b32 { %r2512 }, [ %rd232 + 0 ]; + // end inline asm + .loc 1 190 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:190:30 + setp.eq.f32 %p35, %r2511, 0fFF800000; + setp.eq.f32 %p36, %r2512, 0fFF800000; + .loc 1 190 50 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:190:50 + selp.f32 %r35, 0f00000000, %r2511, %p35; + selp.f32 %r36, 0f00000000, %r2512, %p36; + .loc 1 195 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:195:30 + cvt.s64.s32 %rd14, %r2562; + mad.wide.s32 %rd233, %r2562, 4, %rd200; + .loc 1 196 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:196:27 + // begin inline asm + mov.u32 %r2513, 0x0; + ld.global.b32 { %r2513 }, [ %rd233 + 0 ]; + // end inline asm + .loc 1 196 41 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:196:41 + shl.b32 %r37, %r2513, 7; + .loc 1 197 53 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:197:53 + cvt.s64.s32 %rd16, %r2560; + mad.wide.s32 %rd234, %r2560, 4, %rd199; + .loc 1 197 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:197:39 + // begin inline asm + mov.u32 %r2514, 0x0; + ld.global.b32 { %r2514 }, [ %rd234 + 0 ]; + // end inline asm + .loc 1 199 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:199:42 + and.b32 %r39, %r10, 3; + shl.b32 %r42, %r39, 1; + or.b32 %r41, %r42, 1; + or.b32 %r45, %r42, 9; + or.b32 %r44, %r42, 8; + or.b32 %r49, %r42, 16; + or.b32 %r48, %r42, 17; + or.b32 %r47, %r42, 24; + or.b32 %r46, %r42, 25; + or.b32 %r57, %r42, 32; + or.b32 %r56, %r42, 33; + or.b32 %r55, %r42, 40; + or.b32 %r54, %r42, 41; + or.b32 %r53, %r42, 48; + or.b32 %r52, %r42, 49; + or.b32 %r51, %r42, 56; + or.b32 %r50, %r42, 57; + .loc 1 199 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:199:29 + or.b32 %r2600, %r37, %r13; + or.b32 %r2601, %r37, %r14; + or.b32 %r2602, %r37, %r15; + or.b32 %r2603, %r37, %r16; +$L__tmp25: + .loc 1 390 37 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:37 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r2604, %r2600, 7; + shl.b32 %r2605, %r2601, 7; + shl.b32 %r2606, %r2602, 7; + shl.b32 %r2607, %r2603, 7; + .loc 1 390 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.wide.s32 %rd283, %r2604, 2; + add.s64 %rd284, %rd1, %rd283; + mul.wide.s32 %rd285, %r2605, 2; + add.s64 %rd286, %rd1, %rd285; + mul.wide.s32 %rd287, %r2606, 2; + add.s64 %rd288, %rd1, %rd287; + mul.wide.s32 %rd289, %r2607, 2; + add.s64 %rd290, %rd1, %rd289; + .loc 1 390 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd237, %rd284, %rd267; + add.s64 %rd238, %rd286, %rd267; + add.s64 %rd239, %rd288, %rd267; + add.s64 %rd240, %rd290, %rd267; + .loc 1 391 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:391:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd291, %rd2, %rd283; + add.s64 %rd292, %rd2, %rd285; + add.s64 %rd293, %rd2, %rd287; + add.s64 %rd294, %rd2, %rd289; + .loc 1 391 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:391:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd241, %rd291, %rd267; + add.s64 %rd242, %rd292, %rd267; + add.s64 %rd243, %rd293, %rd267; + add.s64 %rd244, %rd294, %rd267; + .loc 1 395 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:395:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r2608, %r2514, 1; + .loc 2 41 22 // standard.py:41:22 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r2609, %r2359, 63; + .loc 2 41 28 // standard.py:41:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r2610, %r2609, 31; + shr.u32 %r2611, %r2610, 26; + add.s32 %r2612, %r2609, %r2611; + shr.s32 %r2613, %r2612, 6; + .loc 1 395 101 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:395:101 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + max.s32 %r58, %r2613, 1; + .loc 1 395 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:395:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + min.s32 %r59, %r2608, %r58; + .loc 1 485 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:485:34 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mad.wide.u32 %rd236, %r6, 8, %rd207; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p37, %r2608, 1; + setp.gt.s32 %p34, %r2608, 0; + .loc 1 485 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:485:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + // begin inline asm + mov.u64 %rd17, 0x0; + @%p34 ld.global.b64 { %rd17 }, [ %rd236 + 0 ]; + // end inline asm + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p38, %r2600, %r2359; + setp.lt.s32 %p39, %r2601, %r2359; + setp.lt.s32 %p40, %r2602, %r2359; + setp.lt.s32 %p41, %r2603, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r2614, %r2585, 10; + or.b32 %r60, %r2588, %r2614; + add.s32 %r5079, %r2590, %r60; + selp.b32 %r2615, 16, 0, %p38; + selp.b32 %r2524, %r2615, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5079 + 0 ], [ %rd237 + 0 ], 0x10, %r2524; + // end inline asm + add.s32 %r5081, %r5079, 2048; + selp.b32 %r2616, 16, 0, %p39; + selp.b32 %r2526, %r2616, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5081 + 0 ], [ %rd238 + 0 ], 0x10, %r2526; + // end inline asm + add.s32 %r5083, %r5079, 4096; + selp.b32 %r2617, 16, 0, %p40; + selp.b32 %r2528, %r2617, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5083 + 0 ], [ %rd239 + 0 ], 0x10, %r2528; + // end inline asm + add.s32 %r5085, %r5079, 6144; + selp.b32 %r2618, 16, 0, %p41; + selp.b32 %r2530, %r2618, 0, %p34; + // begin inline asm + cp.async.cg.shared.global [ %r5085 + 0 ], [ %rd240 + 0 ], 0x10, %r2530; + // end inline asm + cp.async.commit_group; + add.s32 %r2523, %r5079, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r2523 + 0 ], [ %rd241 + 0 ], 0x10, %r2524; + // end inline asm + add.s32 %r2525, %r5079, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r2525 + 0 ], [ %rd242 + 0 ], 0x10, %r2526; + // end inline asm + add.s32 %r2527, %r5079, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r2527 + 0 ], [ %rd243 + 0 ], 0x10, %r2528; + // end inline asm + add.s32 %r2529, %r5079, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r2529 + 0 ], [ %rd244 + 0 ], 0x10, %r2530; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.gt.s32 %p42, %r59, 1; + .loc 1 414 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd1184, %rd237, 16384; + add.s64 %rd1183, %rd238, 16384; + add.s64 %rd1182, %rd239, 16384; + add.s64 %rd1181, %rd240, 16384; + .loc 1 415 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:415:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd1180, %rd241, 16384; + add.s64 %rd1179, %rd242, 16384; + add.s64 %rd1178, %rd243, 16384; + add.s64 %rd1177, %rd244, 16384; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.b32 %r14455, %r2600, 64; + or.b32 %r14454, %r2601, 64; + or.b32 %r14453, %r2602, 64; + or.b32 %r14452, %r2603, 64; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p43, %r14455, %r2359; + setp.lt.s32 %p44, %r14454, %r2359; + setp.lt.s32 %p45, %r14453, %r2359; + setp.lt.s32 %p46, %r14452, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + bar.sync 0; + add.s32 %r2531, %r5079, 16384; + selp.b32 %r2619, 16, 0, %p43; + selp.b32 %r2540, %r2619, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2531 + 0 ], [ %rd1184 + 0 ], 0x10, %r2540; + // end inline asm + add.s32 %r2533, %r5079, 18432; + selp.b32 %r2620, 16, 0, %p44; + selp.b32 %r2542, %r2620, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2533 + 0 ], [ %rd1183 + 0 ], 0x10, %r2542; + // end inline asm + add.s32 %r2535, %r5079, 20480; + selp.b32 %r2621, 16, 0, %p45; + selp.b32 %r2544, %r2621, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2535 + 0 ], [ %rd1182 + 0 ], 0x10, %r2544; + // end inline asm + add.s32 %r2537, %r5079, 22528; + selp.b32 %r2622, 16, 0, %p46; + selp.b32 %r2546, %r2622, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r2537 + 0 ], [ %rd1181 + 0 ], 0x10, %r2546; + // end inline asm + cp.async.commit_group; + add.s32 %r2539, %r5079, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r2539 + 0 ], [ %rd1180 + 0 ], 0x10, %r2540; + // end inline asm + add.s32 %r2541, %r5079, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r2541 + 0 ], [ %rd1179 + 0 ], 0x10, %r2542; + // end inline asm + add.s32 %r2543, %r5079, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r2543 + 0 ], [ %rd1178 + 0 ], 0x10, %r2544; + // end inline asm + add.s32 %r2545, %r5079, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r2545 + 0 ], [ %rd1177 + 0 ], 0x10, %r2546; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:459:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r14456, 0f00000000; + mov.b32 %r14457, %r14456; + mov.b32 %r14458, %r14456; + mov.b32 %r14459, %r14456; + mov.b32 %r14460, %r14456; + mov.b32 %r14461, %r14456; + mov.b32 %r14462, %r14456; + mov.b32 %r14463, %r14456; + mov.b32 %r14464, %r14456; + mov.b32 %r14465, %r14456; + mov.b32 %r14466, %r14456; + mov.b32 %r14467, %r14456; + mov.b32 %r14468, %r14456; + mov.b32 %r14469, %r14456; + mov.b32 %r14470, %r14456; + mov.b32 %r14471, %r14456; + mov.b32 %r14472, %r14456; + mov.b32 %r14473, %r14456; + mov.b32 %r14474, %r14456; + mov.b32 %r14475, %r14456; + mov.b32 %r14476, %r14456; + mov.b32 %r14477, %r14456; + mov.b32 %r14478, %r14456; + mov.b32 %r14479, %r14456; + mov.b32 %r14480, %r14456; + mov.b32 %r14481, %r14456; + mov.b32 %r14482, %r14456; + mov.b32 %r14483, %r14456; + mov.b32 %r14484, %r14456; + mov.b32 %r14485, %r14456; + mov.b32 %r14486, %r14456; + mov.b32 %r14487, %r14456; + mov.b32 %r14488, %r14456; + mov.b32 %r14489, %r14456; + mov.b32 %r14490, %r14456; + mov.b32 %r14491, %r14456; + mov.b32 %r14492, %r14456; + mov.b32 %r14493, %r14456; + mov.b32 %r14494, %r14456; + mov.b32 %r14495, %r14456; + mov.b32 %r14496, %r14456; + mov.b32 %r14497, %r14456; + mov.b32 %r14498, %r14456; + mov.b32 %r14499, %r14456; + mov.b32 %r14500, %r14456; + mov.b32 %r14501, %r14456; + mov.b32 %r14502, %r14456; + mov.b32 %r14503, %r14456; + mov.b32 %r14504, %r14456; + mov.b32 %r14505, %r14456; + mov.b32 %r14506, %r14456; + mov.b32 %r14507, %r14456; + mov.b32 %r14508, %r14456; + mov.b32 %r14509, %r14456; + mov.b32 %r14510, %r14456; + mov.b32 %r14511, %r14456; + mov.b32 %r14512, %r14456; + mov.b32 %r14513, %r14456; + mov.b32 %r14514, %r14456; + mov.b32 %r14515, %r14456; + mov.b32 %r14516, %r14456; + mov.b32 %r14517, %r14456; + mov.b32 %r14518, %r14456; + mov.b32 %r14519, %r14456; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + @%p37 bra $L__BB0_4; +// %bb.2: // %.lr.ph + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r100, %r32, %r2358; + .loc 1 487 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:487:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd295, %r100; + .loc 1 488 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:488:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.gt.s64 %p1, %rd17, %rd295; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r106, %r31, %r2358; + .loc 1 487 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:487:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd296, %r106; + .loc 1 488 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:488:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.gt.s64 %p3, %rd17, %rd296; +$L__tmp26: + .loc 1 199 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:199:29 + or.b32 %r14522, %r37, %r50; + or.b32 %r14521, %r37, %r51; + or.b32 %r14524, %r37, %r52; + or.b32 %r14523, %r37, %r53; + or.b32 %r14526, %r37, %r54; + or.b32 %r14525, %r37, %r55; + or.b32 %r14528, %r37, %r56; + or.b32 %r14527, %r37, %r57; + or.b32 %r14530, %r37, %r46; + or.b32 %r14529, %r37, %r47; + or.b32 %r14532, %r37, %r48; + or.b32 %r14531, %r37, %r49; + or.b32 %r14533, %r37, %r44; + or.b32 %r14534, %r37, %r45; + or.b32 %r14536, %r37, %r41; + or.b32 %r14535, %r37, %r42; + add.s32 %r97, %r59, -2; + add.s32 %r98, %r59, -1; +$L__tmp27: + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + max.s32 %r99, %r59, 1; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mov.b64 %rd28, {%r2510, %r2510}; + mov.b64 %rd29, {%r2509, %r2509}; + mov.b32 %r14456, 0f00000000; + mov.b32 %r14451, 1; + mov.b32 %r14450, -1; + mov.b32 %r14449, 64; + mov.b32 %r14457, %r14456; + mov.b32 %r14458, %r14456; + mov.b32 %r14459, %r14456; + mov.b32 %r14460, %r14456; + mov.b32 %r14461, %r14456; + mov.b32 %r14462, %r14456; + mov.b32 %r14463, %r14456; + mov.b32 %r14464, %r14456; + mov.b32 %r14465, %r14456; + mov.b32 %r14466, %r14456; + mov.b32 %r14467, %r14456; + mov.b32 %r14468, %r14456; + mov.b32 %r14469, %r14456; + mov.b32 %r14470, %r14456; + mov.b32 %r14471, %r14456; + mov.b32 %r14472, %r14456; + mov.b32 %r14473, %r14456; + mov.b32 %r14474, %r14456; + mov.b32 %r14475, %r14456; + mov.b32 %r14476, %r14456; + mov.b32 %r14477, %r14456; + mov.b32 %r14478, %r14456; + mov.b32 %r14479, %r14456; + mov.b32 %r14480, %r14456; + mov.b32 %r14481, %r14456; + mov.b32 %r14482, %r14456; + mov.b32 %r14483, %r14456; + mov.b32 %r14484, %r14456; + mov.b32 %r14485, %r14456; + mov.b32 %r14486, %r14456; + mov.b32 %r14487, %r14456; + mov.b32 %r14488, %r14456; + mov.b32 %r14489, %r14456; + mov.b32 %r14490, %r14456; + mov.b32 %r14491, %r14456; + mov.b32 %r14492, %r14456; + mov.b32 %r14493, %r14456; + mov.b32 %r14494, %r14456; + mov.b32 %r14495, %r14456; + mov.b32 %r14496, %r14456; + mov.b32 %r14497, %r14456; + mov.b32 %r14498, %r14456; + mov.b32 %r14499, %r14456; + mov.b32 %r14500, %r14456; + mov.b32 %r14501, %r14456; + mov.b32 %r14502, %r14456; + mov.b32 %r14503, %r14456; + mov.b32 %r14504, %r14456; + mov.b32 %r14505, %r14456; + mov.b32 %r14506, %r14456; + mov.b32 %r14507, %r14456; + mov.b32 %r14508, %r14456; + mov.b32 %r14509, %r14456; + mov.b32 %r14510, %r14456; + mov.b32 %r14511, %r14456; + mov.b32 %r14512, %r14456; + mov.b32 %r14513, %r14456; + mov.b32 %r14514, %r14456; + mov.b32 %r14515, %r14456; + mov.b32 %r14516, %r14456; + mov.b32 %r14517, %r14456; + mov.b32 %r14518, %r14456; + mov.b32 %r14519, %r14456; +$L__BB0_3: // %__nv_exp2f.exit1434 + // =>This Inner Loop Header: Depth=1 + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p67, %r14520, %r97; + setp.lt.s32 %p65, %r14520, %r98; + add.s32 %r4286, %r14450, 1; + setp.gt.s32 %p68, %r4286, 2; + selp.b32 %r14450, 0, %r4286, %p68; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p69, %r14535, %r2359; + setp.lt.s32 %p70, %r14536, %r2359; + setp.lt.s32 %p71, %r14533, %r2359; + setp.lt.s32 %p72, %r14534, %r2359; + setp.lt.s32 %p73, %r14531, %r2359; + setp.lt.s32 %p74, %r14532, %r2359; + setp.lt.s32 %p75, %r14529, %r2359; + setp.lt.s32 %p76, %r14530, %r2359; + setp.lt.s32 %p77, %r14527, %r2359; + setp.lt.s32 %p78, %r14528, %r2359; + setp.lt.s32 %p79, %r14525, %r2359; + setp.lt.s32 %p80, %r14526, %r2359; + setp.lt.s32 %p81, %r14523, %r2359; + setp.lt.s32 %p82, %r14524, %r2359; + setp.lt.s32 %p83, %r14521, %r2359; + setp.lt.s32 %p84, %r14522, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4287, %r14450, 14; + add.s32 %r3181, %r2590, %r4287; + .loc 1 459 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:459:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shfl.sync.idx.b32 %r4289, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4290, %r4289, 11; + and.b32 %r4291, %r4290, 8192; + add.s32 %r3140, %r2590, 98304; + add.s32 %r4292, %r4291, %r3140; + bfe.u32 %r4293, %r4292, 4, 14; + cvt.u64.u32 %rd347, %r4293; + or.b64 %rd297, %rd347, 4611686293372403712; + bfe.u32 %r4294, %r3181, 4, 14; + cvt.u64.u32 %rd348, %r4294; + or.b64 %rd298, %rd348, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd297, %rd298, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r4295, %r4291, 32; + add.s32 %r4296, %r4295, %r3140; + bfe.u32 %r4297, %r4296, 4, 14; + cvt.u64.u32 %rd349, %r4297; + or.b64 %rd299, %rd349, 4611686293372403712; + add.s32 %r4298, %r3181, 32; + bfe.u32 %r4299, %r4298, 4, 14; + cvt.u64.u32 %rd350, %r4299; + or.b64 %rd300, %rd350, 4611686293338849280; + mov.pred %p47, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd299, %rd300, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4300, %r4291, 64; + add.s32 %r4301, %r4300, %r3140; + bfe.u32 %r4302, %r4301, 4, 14; + cvt.u64.u32 %rd351, %r4302; + or.b64 %rd301, %rd351, 4611686293372403712; + add.s32 %r4303, %r3181, 64; + bfe.u32 %r4304, %r4303, 4, 14; + cvt.u64.u32 %rd352, %r4304; + or.b64 %rd302, %rd352, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd301, %rd302, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4305, %r4291, 96; + add.s32 %r4306, %r4305, %r3140; + bfe.u32 %r4307, %r4306, 4, 14; + cvt.u64.u32 %rd353, %r4307; + or.b64 %rd303, %rd353, 4611686293372403712; + add.s32 %r4308, %r3181, 96; + bfe.u32 %r4309, %r4308, 4, 14; + cvt.u64.u32 %rd354, %r4309; + or.b64 %rd304, %rd354, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd303, %rd304, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4310, %r4291, 16384; + add.s32 %r4311, %r4310, %r3140; + bfe.u32 %r4312, %r4311, 4, 14; + cvt.u64.u32 %rd355, %r4312; + or.b64 %rd305, %rd355, 4611686293372403712; + add.s32 %r4313, %r3181, 8192; + bfe.u32 %r4314, %r4313, 4, 14; + cvt.u64.u32 %rd356, %r4314; + or.b64 %rd306, %rd356, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd305, %rd306, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4315, %r4291, 16416; + add.s32 %r4316, %r4315, %r3140; + bfe.u32 %r4317, %r4316, 4, 14; + cvt.u64.u32 %rd357, %r4317; + or.b64 %rd307, %rd357, 4611686293372403712; + add.s32 %r4318, %r3181, 8224; + bfe.u32 %r4319, %r4318, 4, 14; + cvt.u64.u32 %rd358, %r4319; + or.b64 %rd308, %rd358, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd307, %rd308, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4320, %r4291, 16448; + add.s32 %r4321, %r4320, %r3140; + bfe.u32 %r4322, %r4321, 4, 14; + cvt.u64.u32 %rd359, %r4322; + or.b64 %rd309, %rd359, 4611686293372403712; + add.s32 %r4323, %r3181, 8256; + bfe.u32 %r4324, %r4323, 4, 14; + cvt.u64.u32 %rd360, %r4324; + or.b64 %rd310, %rd360, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd309, %rd310, %p47, 1, 1, 0, 0; + // end inline asm + or.b32 %r4325, %r4291, 16480; + add.s32 %r4326, %r4325, %r3140; + bfe.u32 %r4327, %r4326, 4, 14; + cvt.u64.u32 %rd361, %r4327; + or.b64 %rd311, %rd361, 4611686293372403712; + add.s32 %r4328, %r3181, 8288; + bfe.u32 %r4329, %r4328, 4, 14; + cvt.u64.u32 %rd362, %r4329; + or.b64 %rd312, %rd362, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755}, %rd311, %rd312, %p47, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3697, 0; + mov.b32 %r3141, %r3697; + mov.b32 %r3142, %r3697; + mov.b32 %r3144, %r3697; + mov.b32 %r3145, %r3697; + mov.b32 %r3143, %r3181; + // begin inline asm + // wait for regs: %r2724,%r2725,%r2726,%r2727,%r2728,%r2729,%r2730,%r2731,%r2732,%r2733,%r2734,%r2735,%r2736,%r2737,%r2738,%r2739,%r2740,%r2741,%r2742,%r2743,%r2744,%r2745,%r2746,%r2747,%r2748,%r2749,%r2750,%r2751,%r2752,%r2753,%r2754,%r2755,%r3140,%r3141,%r3142,%r3143,%r3144,%r3145 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:461:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4330, %r2724, 0f3DB504F3; + mul.f32 %r4331, %r2725, 0f3DB504F3; + mul.f32 %r4332, %r2726, 0f3DB504F3; + mul.f32 %r4333, %r2727, 0f3DB504F3; + mul.f32 %r4334, %r2728, 0f3DB504F3; + mul.f32 %r4335, %r2729, 0f3DB504F3; + mul.f32 %r4336, %r2730, 0f3DB504F3; + mul.f32 %r4337, %r2731, 0f3DB504F3; + mul.f32 %r4338, %r2732, 0f3DB504F3; + mul.f32 %r4339, %r2733, 0f3DB504F3; + mul.f32 %r4340, %r2734, 0f3DB504F3; + mul.f32 %r4341, %r2735, 0f3DB504F3; + mul.f32 %r4342, %r2736, 0f3DB504F3; + mul.f32 %r4343, %r2737, 0f3DB504F3; + mul.f32 %r4344, %r2738, 0f3DB504F3; + mul.f32 %r4345, %r2739, 0f3DB504F3; + mul.f32 %r4346, %r2740, 0f3DB504F3; + mul.f32 %r4347, %r2741, 0f3DB504F3; + mul.f32 %r4348, %r2742, 0f3DB504F3; + mul.f32 %r4349, %r2743, 0f3DB504F3; + mul.f32 %r4350, %r2744, 0f3DB504F3; + mul.f32 %r4351, %r2745, 0f3DB504F3; + mul.f32 %r4352, %r2746, 0f3DB504F3; + mul.f32 %r4353, %r2747, 0f3DB504F3; + mul.f32 %r4354, %r2748, 0f3DB504F3; + mul.f32 %r4355, %r2749, 0f3DB504F3; + mul.f32 %r4356, %r2750, 0f3DB504F3; + mul.f32 %r4357, %r2751, 0f3DB504F3; + mul.f32 %r4358, %r2752, 0f3DB504F3; + mul.f32 %r4359, %r2753, 0f3DB504F3; + mul.f32 %r4360, %r2754, 0f3DB504F3; + mul.f32 %r4361, %r2755, 0f3DB504F3; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4362, %r14536, %r2359; + rem.s32 %r4363, %r14535, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p85, %r4363, %r100; + setp.le.s32 %p86, %r4362, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p87, %p1, %p86; + and.pred %p88, %p1, %p85; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p89, %r4362, %r2366; + setp.ge.s32 %p90, %r4363, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4364, %r4363, %r2366; + rem.s32 %r4365, %r4362, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p91, %r4365, 0; + setp.ne.b32 %p92, %r4364, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4366, %r4364, %r2366; + xor.b32 %r4367, %r4365, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4368, %r4366, 31; + and.b32 %r4369, %r4368, %r2366; + selp.b32 %r4370, %r4369, 0, %p92; + shr.s32 %r4371, %r4367, 31; + and.b32 %r4372, %r4371, %r2366; + selp.b32 %r4373, %r4372, 0, %p91; + add.s32 %r4374, %r4373, %r4365; + add.s32 %r4375, %r4370, %r4364; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd363, %r4375; + cvt.s64.s32 %rd364, %r4374; + setp.gt.s64 %p93, %rd17, %rd364; + setp.gt.s64 %p94, %rd17, %rd363; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p95, %p90, %p94; + and.pred %p96, %p89, %p93; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4376, %r4362, %r100; + sub.s32 %r4377, %r4363, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4378, %r4377, %r2366; + rem.s32 %r4379, %r4376, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p97, %r4379, 0; + setp.ne.b32 %p98, %r4378, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4380, %r4379, %r2366; + xor.b32 %r4381, %r4378, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4382, %r4381, 31; + and.b32 %r4383, %r4382, %r2366; + selp.b32 %r4384, %r4383, 0, %p98; + shr.s32 %r4385, %r4380, 31; + and.b32 %r4386, %r4385, %r2366; + selp.b32 %r4387, %r4386, 0, %p97; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4388, %r4387; + neg.s32 %r4389, %r4384; + setp.eq.b32 %p99, %r4378, %r4389; + setp.eq.b32 %p100, %r4379, %r4388; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p101, %p96, %p100; + and.pred %p102, %p95, %p99; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p103, %p88, %p102; + or.pred %p104, %p87, %p101; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p105, %r4363, %r106; + setp.le.s32 %p106, %r4362, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p107, %p3, %p106; + and.pred %p108, %p3, %p105; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4390, %r4362, %r106; + sub.s32 %r4391, %r4363, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4392, %r4391, %r2366; + rem.s32 %r4393, %r4390, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p109, %r4393, 0; + setp.ne.b32 %p110, %r4392, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4394, %r4393, %r2366; + xor.b32 %r4395, %r4392, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4396, %r4395, 31; + and.b32 %r4397, %r4396, %r2366; + selp.b32 %r4398, %r4397, 0, %p110; + shr.s32 %r4399, %r4394, 31; + and.b32 %r4400, %r4399, %r2366; + selp.b32 %r4401, %r4400, 0, %p109; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4402, %r4401; + neg.s32 %r4403, %r4398; + setp.eq.b32 %p111, %r4392, %r4403; + setp.eq.b32 %p112, %r4393, %r4402; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p113, %p96, %p112; + and.pred %p114, %p95, %p111; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p115, %p108, %p114; + or.pred %p116, %p107, %p113; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p117, %p116, %p70; + and.pred %p118, %p115, %p69; + and.pred %p119, %p104, %p70; + and.pred %p120, %p103, %p69; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4404, %r14534, %r2359; + rem.s32 %r4405, %r14533, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p121, %r4405, %r100; + setp.le.s32 %p122, %r4404, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p123, %p1, %p122; + and.pred %p124, %p1, %p121; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p125, %r4404, %r2366; + setp.ge.s32 %p126, %r4405, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4406, %r4405, %r2366; + rem.s32 %r4407, %r4404, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p127, %r4407, 0; + setp.ne.b32 %p128, %r4406, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4408, %r4406, %r2366; + xor.b32 %r4409, %r4407, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4410, %r4408, 31; + and.b32 %r4411, %r4410, %r2366; + selp.b32 %r4412, %r4411, 0, %p128; + shr.s32 %r4413, %r4409, 31; + and.b32 %r4414, %r4413, %r2366; + selp.b32 %r4415, %r4414, 0, %p127; + add.s32 %r4416, %r4415, %r4407; + add.s32 %r4417, %r4412, %r4406; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd365, %r4417; + cvt.s64.s32 %rd366, %r4416; + setp.gt.s64 %p129, %rd17, %rd366; + setp.gt.s64 %p130, %rd17, %rd365; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p131, %p126, %p130; + and.pred %p132, %p125, %p129; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4418, %r4404, %r100; + sub.s32 %r4419, %r4405, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4420, %r4419, %r2366; + rem.s32 %r4421, %r4418, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p133, %r4421, 0; + setp.ne.b32 %p134, %r4420, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4422, %r4421, %r2366; + xor.b32 %r4423, %r4420, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4424, %r4423, 31; + and.b32 %r4425, %r4424, %r2366; + selp.b32 %r4426, %r4425, 0, %p134; + shr.s32 %r4427, %r4422, 31; + and.b32 %r4428, %r4427, %r2366; + selp.b32 %r4429, %r4428, 0, %p133; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4430, %r4429; + neg.s32 %r4431, %r4426; + setp.eq.b32 %p135, %r4420, %r4431; + setp.eq.b32 %p136, %r4421, %r4430; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p137, %p132, %p136; + and.pred %p138, %p131, %p135; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p139, %p124, %p138; + or.pred %p140, %p123, %p137; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p141, %r4405, %r106; + setp.le.s32 %p142, %r4404, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p143, %p3, %p142; + and.pred %p144, %p3, %p141; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4432, %r4404, %r106; + sub.s32 %r4433, %r4405, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4434, %r4433, %r2366; + rem.s32 %r4435, %r4432, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p145, %r4435, 0; + setp.ne.b32 %p146, %r4434, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4436, %r4435, %r2366; + xor.b32 %r4437, %r4434, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4438, %r4437, 31; + and.b32 %r4439, %r4438, %r2366; + selp.b32 %r4440, %r4439, 0, %p146; + shr.s32 %r4441, %r4436, 31; + and.b32 %r4442, %r4441, %r2366; + selp.b32 %r4443, %r4442, 0, %p145; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4444, %r4443; + neg.s32 %r4445, %r4440; + setp.eq.b32 %p147, %r4434, %r4445; + setp.eq.b32 %p148, %r4435, %r4444; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p149, %p132, %p148; + and.pred %p150, %p131, %p147; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p151, %p144, %p150; + or.pred %p152, %p143, %p149; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p153, %p152, %p72; + and.pred %p154, %p151, %p71; + and.pred %p155, %p140, %p72; + and.pred %p156, %p139, %p71; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4446, %r14532, %r2359; + rem.s32 %r4447, %r14531, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p157, %r4447, %r100; + setp.le.s32 %p158, %r4446, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p159, %p1, %p158; + and.pred %p160, %p1, %p157; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p161, %r4446, %r2366; + setp.ge.s32 %p162, %r4447, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4448, %r4447, %r2366; + rem.s32 %r4449, %r4446, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p163, %r4449, 0; + setp.ne.b32 %p164, %r4448, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4450, %r4448, %r2366; + xor.b32 %r4451, %r4449, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4452, %r4450, 31; + and.b32 %r4453, %r4452, %r2366; + selp.b32 %r4454, %r4453, 0, %p164; + shr.s32 %r4455, %r4451, 31; + and.b32 %r4456, %r4455, %r2366; + selp.b32 %r4457, %r4456, 0, %p163; + add.s32 %r4458, %r4457, %r4449; + add.s32 %r4459, %r4454, %r4448; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd367, %r4459; + cvt.s64.s32 %rd368, %r4458; + setp.gt.s64 %p165, %rd17, %rd368; + setp.gt.s64 %p166, %rd17, %rd367; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p167, %p162, %p166; + and.pred %p168, %p161, %p165; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4460, %r4446, %r100; + sub.s32 %r4461, %r4447, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4462, %r4461, %r2366; + rem.s32 %r4463, %r4460, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p169, %r4463, 0; + setp.ne.b32 %p170, %r4462, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4464, %r4463, %r2366; + xor.b32 %r4465, %r4462, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4466, %r4465, 31; + and.b32 %r4467, %r4466, %r2366; + selp.b32 %r4468, %r4467, 0, %p170; + shr.s32 %r4469, %r4464, 31; + and.b32 %r4470, %r4469, %r2366; + selp.b32 %r4471, %r4470, 0, %p169; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4472, %r4471; + neg.s32 %r4473, %r4468; + setp.eq.b32 %p171, %r4462, %r4473; + setp.eq.b32 %p172, %r4463, %r4472; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p173, %p168, %p172; + and.pred %p174, %p167, %p171; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p175, %p160, %p174; + or.pred %p176, %p159, %p173; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p177, %r4447, %r106; + setp.le.s32 %p178, %r4446, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p179, %p3, %p178; + and.pred %p180, %p3, %p177; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4474, %r4446, %r106; + sub.s32 %r4475, %r4447, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4476, %r4475, %r2366; + rem.s32 %r4477, %r4474, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p181, %r4477, 0; + setp.ne.b32 %p182, %r4476, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4478, %r4477, %r2366; + xor.b32 %r4479, %r4476, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4480, %r4479, 31; + and.b32 %r4481, %r4480, %r2366; + selp.b32 %r4482, %r4481, 0, %p182; + shr.s32 %r4483, %r4478, 31; + and.b32 %r4484, %r4483, %r2366; + selp.b32 %r4485, %r4484, 0, %p181; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4486, %r4485; + neg.s32 %r4487, %r4482; + setp.eq.b32 %p183, %r4476, %r4487; + setp.eq.b32 %p184, %r4477, %r4486; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p185, %p168, %p184; + and.pred %p186, %p167, %p183; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p187, %p180, %p186; + or.pred %p188, %p179, %p185; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p189, %p188, %p74; + and.pred %p190, %p187, %p73; + and.pred %p191, %p176, %p74; + and.pred %p192, %p175, %p73; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4488, %r14530, %r2359; + rem.s32 %r4489, %r14529, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p193, %r4489, %r100; + setp.le.s32 %p194, %r4488, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p195, %p1, %p194; + and.pred %p196, %p1, %p193; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p197, %r4488, %r2366; + setp.ge.s32 %p198, %r4489, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4490, %r4489, %r2366; + rem.s32 %r4491, %r4488, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p199, %r4491, 0; + setp.ne.b32 %p200, %r4490, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4492, %r4490, %r2366; + xor.b32 %r4493, %r4491, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4494, %r4492, 31; + and.b32 %r4495, %r4494, %r2366; + selp.b32 %r4496, %r4495, 0, %p200; + shr.s32 %r4497, %r4493, 31; + and.b32 %r4498, %r4497, %r2366; + selp.b32 %r4499, %r4498, 0, %p199; + add.s32 %r4500, %r4499, %r4491; + add.s32 %r4501, %r4496, %r4490; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd369, %r4501; + cvt.s64.s32 %rd370, %r4500; + setp.gt.s64 %p201, %rd17, %rd370; + setp.gt.s64 %p202, %rd17, %rd369; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p203, %p198, %p202; + and.pred %p204, %p197, %p201; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4502, %r4488, %r100; + sub.s32 %r4503, %r4489, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4504, %r4503, %r2366; + rem.s32 %r4505, %r4502, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p205, %r4505, 0; + setp.ne.b32 %p206, %r4504, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4506, %r4505, %r2366; + xor.b32 %r4507, %r4504, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4508, %r4507, 31; + and.b32 %r4509, %r4508, %r2366; + selp.b32 %r4510, %r4509, 0, %p206; + shr.s32 %r4511, %r4506, 31; + and.b32 %r4512, %r4511, %r2366; + selp.b32 %r4513, %r4512, 0, %p205; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4514, %r4513; + neg.s32 %r4515, %r4510; + setp.eq.b32 %p207, %r4504, %r4515; + setp.eq.b32 %p208, %r4505, %r4514; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p209, %p204, %p208; + and.pred %p210, %p203, %p207; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p211, %p196, %p210; + or.pred %p212, %p195, %p209; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p213, %r4489, %r106; + setp.le.s32 %p214, %r4488, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p215, %p3, %p214; + and.pred %p216, %p3, %p213; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4516, %r4488, %r106; + sub.s32 %r4517, %r4489, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4518, %r4517, %r2366; + rem.s32 %r4519, %r4516, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p217, %r4519, 0; + setp.ne.b32 %p218, %r4518, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4520, %r4519, %r2366; + xor.b32 %r4521, %r4518, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4522, %r4521, 31; + and.b32 %r4523, %r4522, %r2366; + selp.b32 %r4524, %r4523, 0, %p218; + shr.s32 %r4525, %r4520, 31; + and.b32 %r4526, %r4525, %r2366; + selp.b32 %r4527, %r4526, 0, %p217; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4528, %r4527; + neg.s32 %r4529, %r4524; + setp.eq.b32 %p219, %r4518, %r4529; + setp.eq.b32 %p220, %r4519, %r4528; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p221, %p204, %p220; + and.pred %p222, %p203, %p219; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p223, %p216, %p222; + or.pred %p224, %p215, %p221; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p225, %p224, %p76; + and.pred %p226, %p223, %p75; + and.pred %p227, %p212, %p76; + and.pred %p228, %p211, %p75; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4530, %r14528, %r2359; + rem.s32 %r4531, %r14527, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p229, %r4531, %r100; + setp.le.s32 %p230, %r4530, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p231, %p1, %p230; + and.pred %p232, %p1, %p229; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p233, %r4530, %r2366; + setp.ge.s32 %p234, %r4531, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4532, %r4531, %r2366; + rem.s32 %r4533, %r4530, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p235, %r4533, 0; + setp.ne.b32 %p236, %r4532, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4534, %r4532, %r2366; + xor.b32 %r4535, %r4533, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4536, %r4534, 31; + and.b32 %r4537, %r4536, %r2366; + selp.b32 %r4538, %r4537, 0, %p236; + shr.s32 %r4539, %r4535, 31; + and.b32 %r4540, %r4539, %r2366; + selp.b32 %r4541, %r4540, 0, %p235; + add.s32 %r4542, %r4541, %r4533; + add.s32 %r4543, %r4538, %r4532; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd371, %r4543; + cvt.s64.s32 %rd372, %r4542; + setp.gt.s64 %p237, %rd17, %rd372; + setp.gt.s64 %p238, %rd17, %rd371; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p239, %p234, %p238; + and.pred %p240, %p233, %p237; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4544, %r4530, %r100; + sub.s32 %r4545, %r4531, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4546, %r4545, %r2366; + rem.s32 %r4547, %r4544, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p241, %r4547, 0; + setp.ne.b32 %p242, %r4546, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4548, %r4547, %r2366; + xor.b32 %r4549, %r4546, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4550, %r4549, 31; + and.b32 %r4551, %r4550, %r2366; + selp.b32 %r4552, %r4551, 0, %p242; + shr.s32 %r4553, %r4548, 31; + and.b32 %r4554, %r4553, %r2366; + selp.b32 %r4555, %r4554, 0, %p241; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4556, %r4555; + neg.s32 %r4557, %r4552; + setp.eq.b32 %p243, %r4546, %r4557; + setp.eq.b32 %p244, %r4547, %r4556; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p245, %p240, %p244; + and.pred %p246, %p239, %p243; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p247, %p232, %p246; + or.pred %p248, %p231, %p245; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p249, %r4531, %r106; + setp.le.s32 %p250, %r4530, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p251, %p3, %p250; + and.pred %p252, %p3, %p249; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4558, %r4530, %r106; + sub.s32 %r4559, %r4531, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4560, %r4559, %r2366; + rem.s32 %r4561, %r4558, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p253, %r4561, 0; + setp.ne.b32 %p254, %r4560, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4562, %r4561, %r2366; + xor.b32 %r4563, %r4560, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4564, %r4563, 31; + and.b32 %r4565, %r4564, %r2366; + selp.b32 %r4566, %r4565, 0, %p254; + shr.s32 %r4567, %r4562, 31; + and.b32 %r4568, %r4567, %r2366; + selp.b32 %r4569, %r4568, 0, %p253; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4570, %r4569; + neg.s32 %r4571, %r4566; + setp.eq.b32 %p255, %r4560, %r4571; + setp.eq.b32 %p256, %r4561, %r4570; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p257, %p240, %p256; + and.pred %p258, %p239, %p255; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p259, %p252, %p258; + or.pred %p260, %p251, %p257; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p261, %p260, %p78; + and.pred %p262, %p259, %p77; + and.pred %p263, %p248, %p78; + and.pred %p264, %p247, %p77; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4572, %r14526, %r2359; + rem.s32 %r4573, %r14525, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p265, %r4573, %r100; + setp.le.s32 %p266, %r4572, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p267, %p1, %p266; + and.pred %p268, %p1, %p265; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p269, %r4572, %r2366; + setp.ge.s32 %p270, %r4573, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4574, %r4573, %r2366; + rem.s32 %r4575, %r4572, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p271, %r4575, 0; + setp.ne.b32 %p272, %r4574, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4576, %r4574, %r2366; + xor.b32 %r4577, %r4575, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4578, %r4576, 31; + and.b32 %r4579, %r4578, %r2366; + selp.b32 %r4580, %r4579, 0, %p272; + shr.s32 %r4581, %r4577, 31; + and.b32 %r4582, %r4581, %r2366; + selp.b32 %r4583, %r4582, 0, %p271; + add.s32 %r4584, %r4583, %r4575; + add.s32 %r4585, %r4580, %r4574; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd373, %r4585; + cvt.s64.s32 %rd374, %r4584; + setp.gt.s64 %p273, %rd17, %rd374; + setp.gt.s64 %p274, %rd17, %rd373; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p275, %p270, %p274; + and.pred %p276, %p269, %p273; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4586, %r4572, %r100; + sub.s32 %r4587, %r4573, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4588, %r4587, %r2366; + rem.s32 %r4589, %r4586, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p277, %r4589, 0; + setp.ne.b32 %p278, %r4588, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4590, %r4589, %r2366; + xor.b32 %r4591, %r4588, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4592, %r4591, 31; + and.b32 %r4593, %r4592, %r2366; + selp.b32 %r4594, %r4593, 0, %p278; + shr.s32 %r4595, %r4590, 31; + and.b32 %r4596, %r4595, %r2366; + selp.b32 %r4597, %r4596, 0, %p277; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4598, %r4597; + neg.s32 %r4599, %r4594; + setp.eq.b32 %p279, %r4588, %r4599; + setp.eq.b32 %p280, %r4589, %r4598; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p281, %p276, %p280; + and.pred %p282, %p275, %p279; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p283, %p268, %p282; + or.pred %p284, %p267, %p281; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p285, %r4573, %r106; + setp.le.s32 %p286, %r4572, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p287, %p3, %p286; + and.pred %p288, %p3, %p285; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4600, %r4572, %r106; + sub.s32 %r4601, %r4573, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4602, %r4601, %r2366; + rem.s32 %r4603, %r4600, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p289, %r4603, 0; + setp.ne.b32 %p290, %r4602, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4604, %r4603, %r2366; + xor.b32 %r4605, %r4602, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4606, %r4605, 31; + and.b32 %r4607, %r4606, %r2366; + selp.b32 %r4608, %r4607, 0, %p290; + shr.s32 %r4609, %r4604, 31; + and.b32 %r4610, %r4609, %r2366; + selp.b32 %r4611, %r4610, 0, %p289; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4612, %r4611; + neg.s32 %r4613, %r4608; + setp.eq.b32 %p291, %r4602, %r4613; + setp.eq.b32 %p292, %r4603, %r4612; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p293, %p276, %p292; + and.pred %p294, %p275, %p291; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p295, %p288, %p294; + or.pred %p296, %p287, %p293; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p297, %p296, %p80; + and.pred %p298, %p295, %p79; + and.pred %p299, %p284, %p80; + and.pred %p300, %p283, %p79; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4614, %r14524, %r2359; + rem.s32 %r4615, %r14523, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p301, %r4615, %r100; + setp.le.s32 %p302, %r4614, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p303, %p1, %p302; + and.pred %p304, %p1, %p301; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p305, %r4614, %r2366; + setp.ge.s32 %p306, %r4615, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4616, %r4615, %r2366; + rem.s32 %r4617, %r4614, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p307, %r4617, 0; + setp.ne.b32 %p308, %r4616, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4618, %r4616, %r2366; + xor.b32 %r4619, %r4617, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4620, %r4618, 31; + and.b32 %r4621, %r4620, %r2366; + selp.b32 %r4622, %r4621, 0, %p308; + shr.s32 %r4623, %r4619, 31; + and.b32 %r4624, %r4623, %r2366; + selp.b32 %r4625, %r4624, 0, %p307; + add.s32 %r4626, %r4625, %r4617; + add.s32 %r4627, %r4622, %r4616; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd375, %r4627; + cvt.s64.s32 %rd376, %r4626; + setp.gt.s64 %p309, %rd17, %rd376; + setp.gt.s64 %p310, %rd17, %rd375; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p311, %p306, %p310; + and.pred %p312, %p305, %p309; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4628, %r4614, %r100; + sub.s32 %r4629, %r4615, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4630, %r4629, %r2366; + rem.s32 %r4631, %r4628, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p313, %r4631, 0; + setp.ne.b32 %p314, %r4630, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4632, %r4631, %r2366; + xor.b32 %r4633, %r4630, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4634, %r4633, 31; + and.b32 %r4635, %r4634, %r2366; + selp.b32 %r4636, %r4635, 0, %p314; + shr.s32 %r4637, %r4632, 31; + and.b32 %r4638, %r4637, %r2366; + selp.b32 %r4639, %r4638, 0, %p313; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4640, %r4639; + neg.s32 %r4641, %r4636; + setp.eq.b32 %p315, %r4630, %r4641; + setp.eq.b32 %p316, %r4631, %r4640; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p317, %p312, %p316; + and.pred %p318, %p311, %p315; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p319, %p304, %p318; + or.pred %p320, %p303, %p317; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p321, %r4615, %r106; + setp.le.s32 %p322, %r4614, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p323, %p3, %p322; + and.pred %p324, %p3, %p321; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4642, %r4614, %r106; + sub.s32 %r4643, %r4615, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4644, %r4643, %r2366; + rem.s32 %r4645, %r4642, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p325, %r4645, 0; + setp.ne.b32 %p326, %r4644, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4646, %r4645, %r2366; + xor.b32 %r4647, %r4644, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4648, %r4647, 31; + and.b32 %r4649, %r4648, %r2366; + selp.b32 %r4650, %r4649, 0, %p326; + shr.s32 %r4651, %r4646, 31; + and.b32 %r4652, %r4651, %r2366; + selp.b32 %r4653, %r4652, 0, %p325; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4654, %r4653; + neg.s32 %r4655, %r4650; + setp.eq.b32 %p327, %r4644, %r4655; + setp.eq.b32 %p328, %r4645, %r4654; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p329, %p312, %p328; + and.pred %p330, %p311, %p327; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p331, %p324, %p330; + or.pred %p332, %p323, %p329; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p333, %p332, %p82; + and.pred %p334, %p331, %p81; + and.pred %p335, %p320, %p82; + and.pred %p336, %p319, %p81; + .loc 1 798 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:798:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4656, %r14522, %r2359; + rem.s32 %r4657, %r14521, %r2359; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p337, %r4657, %r100; + setp.le.s32 %p338, %r4656, %r100; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p339, %p1, %p338; + and.pred %p340, %p1, %p337; + .loc 1 493 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:493:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ge.s32 %p341, %r4656, %r2366; + setp.ge.s32 %p342, %r4657, %r2366; + .loc 1 494 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:494:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4658, %r4657, %r2366; + rem.s32 %r4659, %r4656, %r2366; + .loc 1 496 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:496:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p343, %r4659, 0; + setp.ne.b32 %p344, %r4658, 0; + .loc 1 499 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:499:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4660, %r4658, %r2366; + xor.b32 %r4661, %r4659, %r2366; + .loc 1 502 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:502:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4662, %r4660, 31; + and.b32 %r4663, %r4662, %r2366; + selp.b32 %r4664, %r4663, 0, %p344; + shr.s32 %r4665, %r4661, 31; + and.b32 %r4666, %r4665, %r2366; + selp.b32 %r4667, %r4666, 0, %p343; + add.s32 %r4668, %r4667, %r4659; + add.s32 %r4669, %r4664, %r4658; + .loc 1 504 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:504:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.s64.s32 %rd377, %r4669; + cvt.s64.s32 %rd378, %r4668; + setp.gt.s64 %p345, %rd17, %rd378; + setp.gt.s64 %p346, %rd17, %rd377; + .loc 1 505 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:505:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p347, %p342, %p346; + and.pred %p348, %p341, %p345; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4670, %r4656, %r100; + sub.s32 %r4671, %r4657, %r100; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4672, %r4671, %r2366; + rem.s32 %r4673, %r4670, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p349, %r4673, 0; + setp.ne.b32 %p350, %r4672, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4674, %r4673, %r2366; + xor.b32 %r4675, %r4672, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4676, %r4675, 31; + and.b32 %r4677, %r4676, %r2366; + selp.b32 %r4678, %r4677, 0, %p350; + shr.s32 %r4679, %r4674, 31; + and.b32 %r4680, %r4679, %r2366; + selp.b32 %r4681, %r4680, 0, %p349; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4682, %r4681; + neg.s32 %r4683, %r4678; + setp.eq.b32 %p351, %r4672, %r4683; + setp.eq.b32 %p352, %r4673, %r4682; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p353, %p348, %p352; + and.pred %p354, %p347, %p351; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p355, %p340, %p354; + or.pred %p356, %p339, %p353; + .loc 1 482 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:482:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.le.s32 %p357, %r4657, %r106; + setp.le.s32 %p358, %r4656, %r106; + .loc 1 490 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:490:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p359, %p3, %p358; + and.pred %p360, %p3, %p357; + .loc 1 506 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:506:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4684, %r4656, %r106; + sub.s32 %r4685, %r4657, %r106; + .loc 1 507 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:507:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + rem.s32 %r4686, %r4685, %r2366; + rem.s32 %r4687, %r4684, %r2366; + .loc 1 508 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:508:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p361, %r4687, 0; + setp.ne.b32 %p362, %r4686, 0; + .loc 1 510 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:510:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4688, %r4687, %r2366; + xor.b32 %r4689, %r4686, %r2366; + .loc 1 513 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:513:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.s32 %r4690, %r4689, 31; + and.b32 %r4691, %r4690, %r2366; + selp.b32 %r4692, %r4691, 0, %p362; + shr.s32 %r4693, %r4688, 31; + and.b32 %r4694, %r4693, %r2366; + selp.b32 %r4695, %r4694, 0, %p361; + .loc 1 514 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:514:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + neg.s32 %r4696, %r4695; + neg.s32 %r4697, %r4692; + setp.eq.b32 %p363, %r4686, %r4697; + setp.eq.b32 %p364, %r4687, %r4696; + .loc 1 515 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:515:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p365, %p348, %p364; + and.pred %p366, %p347, %p363; + .loc 1 516 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:516:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + or.pred %p367, %p360, %p366; + or.pred %p368, %p359, %p365; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p369, %p368, %p84; + and.pred %p370, %p367, %p83; + and.pred %p371, %p356, %p84; + and.pred %p372, %p355, %p83; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4698, %r4330, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4699, %r4698, 0fFF800000, %p118; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4700, %r4331, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4701, %r4700, 0fFF800000, %p117; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4702, %r4332, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4703, %r4702, 0fFF800000, %p120; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4704, %r4333, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4705, %r4704, 0fFF800000, %p119; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4706, %r4334, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4707, %r4706, 0fFF800000, %p154; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4708, %r4335, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4709, %r4708, 0fFF800000, %p153; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4710, %r4336, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4711, %r4710, 0fFF800000, %p156; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4712, %r4337, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4713, %r4712, 0fFF800000, %p155; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4714, %r4338, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4715, %r4714, 0fFF800000, %p190; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4716, %r4339, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4717, %r4716, 0fFF800000, %p189; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4718, %r4340, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4719, %r4718, 0fFF800000, %p192; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4720, %r4341, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4721, %r4720, 0fFF800000, %p191; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4722, %r4342, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4723, %r4722, 0fFF800000, %p226; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4724, %r4343, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4725, %r4724, 0fFF800000, %p225; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4726, %r4344, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4727, %r4726, 0fFF800000, %p228; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4728, %r4345, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4729, %r4728, 0fFF800000, %p227; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4730, %r4346, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4731, %r4730, 0fFF800000, %p262; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4732, %r4347, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4733, %r4732, 0fFF800000, %p261; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4734, %r4348, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4735, %r4734, 0fFF800000, %p264; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4736, %r4349, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4737, %r4736, 0fFF800000, %p263; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4738, %r4350, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4739, %r4738, 0fFF800000, %p298; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4740, %r4351, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4741, %r4740, 0fFF800000, %p297; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4742, %r4352, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4743, %r4742, 0fFF800000, %p300; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4744, %r4353, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4745, %r4744, 0fFF800000, %p299; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4746, %r4354, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4747, %r4746, 0fFF800000, %p334; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4748, %r4355, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4749, %r4748, 0fFF800000, %p333; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4750, %r4356, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4751, %r4750, 0fFF800000, %p336; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4752, %r4357, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4753, %r4752, 0fFF800000, %p335; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4754, %r4358, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4755, %r4754, 0fFF800000, %p370; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4756, %r4359, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4757, %r4756, 0fFF800000, %p369; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4758, %r4360, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4759, %r4758, 0fFF800000, %p372; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4760, %r4361, 0f3FB8AA3B; + .loc 1 521 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:521:69 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.f32 %r4761, %r4760, 0fFF800000, %p371; + .loc 1 525 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:525:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4762, %r4699, %r35; + sub.f32 %r4763, %r4701, %r35; + sub.f32 %r4764, %r4703, %r36; + sub.f32 %r4765, %r4705, %r36; + sub.f32 %r4766, %r4707, %r35; + sub.f32 %r4767, %r4709, %r35; + sub.f32 %r4768, %r4711, %r36; + sub.f32 %r4769, %r4713, %r36; + sub.f32 %r4770, %r4715, %r35; + sub.f32 %r4771, %r4717, %r35; + sub.f32 %r4772, %r4719, %r36; + sub.f32 %r4773, %r4721, %r36; + sub.f32 %r4774, %r4723, %r35; + sub.f32 %r4775, %r4725, %r35; + sub.f32 %r4776, %r4727, %r36; + sub.f32 %r4777, %r4729, %r36; + sub.f32 %r4778, %r4731, %r35; + sub.f32 %r4779, %r4733, %r35; + sub.f32 %r4780, %r4735, %r36; + sub.f32 %r4781, %r4737, %r36; + sub.f32 %r4782, %r4739, %r35; + sub.f32 %r4783, %r4741, %r35; + sub.f32 %r4784, %r4743, %r36; + sub.f32 %r4785, %r4745, %r36; + sub.f32 %r4786, %r4747, %r35; + sub.f32 %r4787, %r4749, %r35; + sub.f32 %r4788, %r4751, %r36; + sub.f32 %r4789, %r4753, %r36; + sub.f32 %r4790, %r4755, %r35; + sub.f32 %r4791, %r4757, %r35; + sub.f32 %r4792, %r4759, %r36; + sub.f32 %r4793, %r4761, %r36; + .loc 1 525 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:525:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + ex2.approx.ftz.f32 %r4794, %r4762; + ex2.approx.ftz.f32 %r4795, %r4763; + ex2.approx.ftz.f32 %r4796, %r4764; + ex2.approx.ftz.f32 %r4797, %r4765; + ex2.approx.ftz.f32 %r4798, %r4766; + ex2.approx.ftz.f32 %r4799, %r4767; + ex2.approx.ftz.f32 %r4800, %r4768; + ex2.approx.ftz.f32 %r4801, %r4769; + ex2.approx.ftz.f32 %r4802, %r4770; + ex2.approx.ftz.f32 %r4803, %r4771; + ex2.approx.ftz.f32 %r4804, %r4772; + ex2.approx.ftz.f32 %r4805, %r4773; + ex2.approx.ftz.f32 %r4806, %r4774; + ex2.approx.ftz.f32 %r4807, %r4775; + ex2.approx.ftz.f32 %r4808, %r4776; + ex2.approx.ftz.f32 %r4809, %r4777; + ex2.approx.ftz.f32 %r4810, %r4778; + ex2.approx.ftz.f32 %r4811, %r4779; + ex2.approx.ftz.f32 %r4812, %r4780; + ex2.approx.ftz.f32 %r4813, %r4781; + ex2.approx.ftz.f32 %r4814, %r4782; + ex2.approx.ftz.f32 %r4815, %r4783; + ex2.approx.ftz.f32 %r4816, %r4784; + ex2.approx.ftz.f32 %r4817, %r4785; + ex2.approx.ftz.f32 %r4818, %r4786; + ex2.approx.ftz.f32 %r4819, %r4787; + ex2.approx.ftz.f32 %r4820, %r4788; + ex2.approx.ftz.f32 %r4821, %r4789; + ex2.approx.ftz.f32 %r4822, %r4790; + ex2.approx.ftz.f32 %r4823, %r4791; + ex2.approx.ftz.f32 %r4824, %r4792; + ex2.approx.ftz.f32 %r4825, %r4793; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r4826, %r2590, 49152; + add.s32 %r3699, %r4826, %r4287; + .loc 1 530 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:530:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + wgmma.fence.sync.aligned; + add.s32 %r3696, %r2590, 131072; + add.s32 %r4827, %r4291, %r3696; + bfe.u32 %r4828, %r4827, 4, 14; + cvt.u64.u32 %rd379, %r4828; + or.b64 %rd313, %rd379, 4611686293372403712; + bfe.u32 %r4829, %r3699, 4, 14; + cvt.u64.u32 %rd380, %r4829; + or.b64 %rd314, %rd380, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd313, %rd314, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4830, %r4295, %r3696; + bfe.u32 %r4831, %r4830, 4, 14; + cvt.u64.u32 %rd381, %r4831; + or.b64 %rd315, %rd381, 4611686293372403712; + add.s32 %r4832, %r3699, 32; + bfe.u32 %r4833, %r4832, 4, 14; + cvt.u64.u32 %rd382, %r4833; + or.b64 %rd316, %rd382, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd315, %rd316, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4834, %r4300, %r3696; + bfe.u32 %r4835, %r4834, 4, 14; + cvt.u64.u32 %rd383, %r4835; + or.b64 %rd317, %rd383, 4611686293372403712; + add.s32 %r4836, %r3699, 64; + bfe.u32 %r4837, %r4836, 4, 14; + cvt.u64.u32 %rd384, %r4837; + or.b64 %rd318, %rd384, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd317, %rd318, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4838, %r4305, %r3696; + bfe.u32 %r4839, %r4838, 4, 14; + cvt.u64.u32 %rd385, %r4839; + or.b64 %rd319, %rd385, 4611686293372403712; + add.s32 %r4840, %r3699, 96; + bfe.u32 %r4841, %r4840, 4, 14; + cvt.u64.u32 %rd386, %r4841; + or.b64 %rd320, %rd386, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd319, %rd320, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4842, %r4310, %r3696; + bfe.u32 %r4843, %r4842, 4, 14; + cvt.u64.u32 %rd387, %r4843; + or.b64 %rd321, %rd387, 4611686293372403712; + add.s32 %r4844, %r3699, 8192; + bfe.u32 %r4845, %r4844, 4, 14; + cvt.u64.u32 %rd388, %r4845; + or.b64 %rd322, %rd388, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd321, %rd322, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4846, %r4315, %r3696; + bfe.u32 %r4847, %r4846, 4, 14; + cvt.u64.u32 %rd389, %r4847; + or.b64 %rd323, %rd389, 4611686293372403712; + add.s32 %r4848, %r3699, 8224; + bfe.u32 %r4849, %r4848, 4, 14; + cvt.u64.u32 %rd390, %r4849; + or.b64 %rd324, %rd390, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd323, %rd324, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4850, %r4320, %r3696; + bfe.u32 %r4851, %r4850, 4, 14; + cvt.u64.u32 %rd391, %r4851; + or.b64 %rd325, %rd391, 4611686293372403712; + add.s32 %r4852, %r3699, 8256; + bfe.u32 %r4853, %r4852, 4, 14; + cvt.u64.u32 %rd392, %r4853; + or.b64 %rd326, %rd392, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd325, %rd326, %p47, 1, 1, 0, 0; + // end inline asm + add.s32 %r4854, %r4325, %r3696; + bfe.u32 %r4855, %r4854, 4, 14; + cvt.u64.u32 %rd393, %r4855; + or.b64 %rd327, %rd393, 4611686293372403712; + add.s32 %r4856, %r3699, 8288; + bfe.u32 %r4857, %r4856, 4, 14; + cvt.u64.u32 %rd394, %r4857; + or.b64 %rd328, %rd394, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311}, %rd327, %rd328, %p47, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3698, %r3697; + mov.b32 %r3700, %r3697; + mov.b32 %r3701, %r3697; + // begin inline asm + // wait for regs: %r3280,%r3281,%r3282,%r3283,%r3284,%r3285,%r3286,%r3287,%r3288,%r3289,%r3290,%r3291,%r3292,%r3293,%r3294,%r3295,%r3296,%r3297,%r3298,%r3299,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r3306,%r3307,%r3308,%r3309,%r3310,%r3311,%r3696,%r3697,%r3698,%r3699,%r3700,%r3701 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mov.b64 {%r4858, %r4859}, %rd29; + sub.f32 %r4860, %r3280, %r4858; + sub.f32 %r4861, %r3281, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4862, %r4795, %r4861; + mul.f32 %r4863, %r4794, %r4860; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs1, %r4863; + cvt.rn.bf16.f32 %rs2, %r4862; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs3, %rs2, 0x0000, %p117; + selp.b16 %rs4, %rs1, 0x0000, %p118; + mov.b32 %r3868, {%rs4, %rs3}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mov.b64 {%r4864, %r4865}, %rd28; + sub.f32 %r4866, %r3282, %r4864; + sub.f32 %r4867, %r3283, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4868, %r4797, %r4867; + mul.f32 %r4869, %r4796, %r4866; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs5, %r4869; + cvt.rn.bf16.f32 %rs6, %r4868; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs7, %rs6, 0x0000, %p119; + selp.b16 %rs8, %rs5, 0x0000, %p120; + mov.b32 %r3869, {%rs8, %rs7}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4870, %r3284, %r4858; + sub.f32 %r4871, %r3285, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4872, %r4799, %r4871; + mul.f32 %r4873, %r4798, %r4870; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs9, %r4873; + cvt.rn.bf16.f32 %rs10, %r4872; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs11, %rs10, 0x0000, %p153; + selp.b16 %rs12, %rs9, 0x0000, %p154; + mov.b32 %r3870, {%rs12, %rs11}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4874, %r3286, %r4864; + sub.f32 %r4875, %r3287, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4876, %r4801, %r4875; + mul.f32 %r4877, %r4800, %r4874; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs13, %r4877; + cvt.rn.bf16.f32 %rs14, %r4876; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs15, %rs14, 0x0000, %p155; + selp.b16 %rs16, %rs13, 0x0000, %p156; + mov.b32 %r3871, {%rs16, %rs15}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4878, %r3288, %r4858; + sub.f32 %r4879, %r3289, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4880, %r4803, %r4879; + mul.f32 %r4881, %r4802, %r4878; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs17, %r4881; + cvt.rn.bf16.f32 %rs18, %r4880; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs19, %rs18, 0x0000, %p189; + selp.b16 %rs20, %rs17, 0x0000, %p190; + mov.b32 %r4000, {%rs20, %rs19}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4882, %r3290, %r4864; + sub.f32 %r4883, %r3291, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4884, %r4805, %r4883; + mul.f32 %r4885, %r4804, %r4882; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs21, %r4885; + cvt.rn.bf16.f32 %rs22, %r4884; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs23, %rs22, 0x0000, %p191; + selp.b16 %rs24, %rs21, 0x0000, %p192; + mov.b32 %r4001, {%rs24, %rs23}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4886, %r3292, %r4858; + sub.f32 %r4887, %r3293, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4888, %r4807, %r4887; + mul.f32 %r4889, %r4806, %r4886; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs25, %r4889; + cvt.rn.bf16.f32 %rs26, %r4888; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs27, %rs26, 0x0000, %p225; + selp.b16 %rs28, %rs25, 0x0000, %p226; + mov.b32 %r4002, {%rs28, %rs27}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4890, %r3294, %r4864; + sub.f32 %r4891, %r3295, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4892, %r4809, %r4891; + mul.f32 %r4893, %r4808, %r4890; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs29, %r4893; + cvt.rn.bf16.f32 %rs30, %r4892; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs31, %rs30, 0x0000, %p227; + selp.b16 %rs32, %rs29, 0x0000, %p228; + mov.b32 %r4003, {%rs32, %rs31}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4894, %r3296, %r4858; + sub.f32 %r4895, %r3297, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4896, %r4811, %r4895; + mul.f32 %r4897, %r4810, %r4894; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs33, %r4897; + cvt.rn.bf16.f32 %rs34, %r4896; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs35, %rs34, 0x0000, %p261; + selp.b16 %rs36, %rs33, 0x0000, %p262; + mov.b32 %r4132, {%rs36, %rs35}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4898, %r3298, %r4864; + sub.f32 %r4899, %r3299, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4900, %r4813, %r4899; + mul.f32 %r4901, %r4812, %r4898; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs37, %r4901; + cvt.rn.bf16.f32 %rs38, %r4900; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs39, %rs38, 0x0000, %p263; + selp.b16 %rs40, %rs37, 0x0000, %p264; + mov.b32 %r4133, {%rs40, %rs39}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4902, %r3300, %r4858; + sub.f32 %r4903, %r3301, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4904, %r4815, %r4903; + mul.f32 %r4905, %r4814, %r4902; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs41, %r4905; + cvt.rn.bf16.f32 %rs42, %r4904; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs43, %rs42, 0x0000, %p297; + selp.b16 %rs44, %rs41, 0x0000, %p298; + mov.b32 %r4134, {%rs44, %rs43}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4906, %r3302, %r4864; + sub.f32 %r4907, %r3303, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4908, %r4817, %r4907; + mul.f32 %r4909, %r4816, %r4906; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs45, %r4909; + cvt.rn.bf16.f32 %rs46, %r4908; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs47, %rs46, 0x0000, %p299; + selp.b16 %rs48, %rs45, 0x0000, %p300; + mov.b32 %r4135, {%rs48, %rs47}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4910, %r3304, %r4858; + sub.f32 %r4911, %r3305, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4912, %r4819, %r4911; + mul.f32 %r4913, %r4818, %r4910; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs49, %r4913; + cvt.rn.bf16.f32 %rs50, %r4912; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs51, %rs50, 0x0000, %p333; + selp.b16 %rs52, %rs49, 0x0000, %p334; + mov.b32 %r4264, {%rs52, %rs51}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4914, %r3306, %r4864; + sub.f32 %r4915, %r3307, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4916, %r4821, %r4915; + mul.f32 %r4917, %r4820, %r4914; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs53, %r4917; + cvt.rn.bf16.f32 %rs54, %r4916; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs55, %rs54, 0x0000, %p335; + selp.b16 %rs56, %rs53, 0x0000, %p336; + mov.b32 %r4265, {%rs56, %rs55}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4918, %r3308, %r4858; + sub.f32 %r4919, %r3309, %r4859; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4920, %r4823, %r4919; + mul.f32 %r4921, %r4822, %r4918; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs57, %r4921; + cvt.rn.bf16.f32 %rs58, %r4920; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs59, %rs58, 0x0000, %p369; + selp.b16 %rs60, %rs57, 0x0000, %p370; + mov.b32 %r4266, {%rs60, %rs59}; + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.f32 %r4922, %r3310, %r4864; + sub.f32 %r4923, %r3311, %r4865; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.f32 %r4924, %r4825, %r4923; + mul.f32 %r4925, %r4824, %r4922; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + cvt.rn.bf16.f32 %rs61, %r4925; + cvt.rn.bf16.f32 %rs62, %r4924; + .loc 1 549 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:549:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + selp.b16 %rs63, %rs62, 0x0000, %p371; + selp.b16 %rs64, %rs61, 0x0000, %p372; + mov.b32 %r4267, {%rs64, %rs63}; + .loc 1 553 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:553:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r3868,%r3869,%r3870,%r3871}, %rd298, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4926, %r3181, 2048; + bfe.u32 %r4927, %r4926, 4, 14; + cvt.u64.u32 %rd395, %r4927; + or.b64 %rd330, %rd395, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4000,%r4001,%r4002,%r4003}, %rd330, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4928, %r3181, 4096; + bfe.u32 %r4929, %r4928, 4, 14; + cvt.u64.u32 %rd396, %r4929; + or.b64 %rd331, %rd396, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4132,%r4133,%r4134,%r4135}, %rd331, %p47, 1, 1, 1; + // end inline asm + add.s32 %r4930, %r3181, 6144; + bfe.u32 %r4931, %r4930, 4, 14; + cvt.u64.u32 %rd397, %r4931; + or.b64 %rd332, %rd397, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r4264,%r4265,%r4266,%r4267}, %rd332, %p47, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r14535, %r14449, %r14535; + add.s32 %r14536, %r14449, %r14536; + add.s32 %r14533, %r14449, %r14533; + add.s32 %r14534, %r14449, %r14534; + add.s32 %r14531, %r14449, %r14531; + add.s32 %r14532, %r14449, %r14532; + add.s32 %r14529, %r14449, %r14529; + add.s32 %r14530, %r14449, %r14530; + add.s32 %r14527, %r14449, %r14527; + add.s32 %r14528, %r14449, %r14528; + add.s32 %r14525, %r14449, %r14525; + add.s32 %r14526, %r14449, %r14526; + add.s32 %r14523, %r14449, %r14523; + add.s32 %r14524, %r14449, %r14524; + add.s32 %r14521, %r14449, %r14521; + add.s32 %r14522, %r14449, %r14522; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r277, %r14520, 1; + .loc 1 788 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:788:33 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shr.u32 %r4932, %r277, 1; + .loc 1 789 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mad.wide.u32 %rd334, %r4932, 4, %rd233; + .loc 1 789 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + // begin inline asm + mov.u64 %rd333, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd333, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4268, 0x0; + @%p65 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4268 }, [ %rd334 + 0 ], %rd333; + // end inline asm + .loc 1 790 109 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:109 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r4933, %r4932, 1; + .loc 1 790 113 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:113 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p373, %r4933, %r2514; + .loc 1 790 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:55 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd337, %rd334, 4; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.pred %p66, %p65, %p373; + .loc 1 790 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + // begin inline asm + mov.u64 %rd336, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd336, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4269, 0x0; + @%p66 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4269 }, [ %rd337 + 0 ], %rd336; + // end inline asm + .loc 1 791 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:791:35 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + and.b32 %r4934, %r14520, 1; + .loc 1 792 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:34 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + sub.s32 %r4935, %r4269, %r4268; + .loc 1 792 48 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:48 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r4936, %r4935, 7; + .loc 1 792 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r4937, %r4936, -64; + .loc 1 793 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + xor.b32 %r4938, %r4934, 1; + .loc 1 793 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r4939, %r4934, 6; + .loc 1 793 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mad.lo.s32 %r14449, %r4937, %r4938, %r4939; + .loc 1 414 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r4940, %r14449, 7; + .loc 1 414 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + mul.wide.s32 %rd398, %r4940, 2; + add.s64 %rd1184, %rd1184, %rd398; + add.s64 %rd1183, %rd1183, %rd398; + add.s64 %rd1182, %rd1182, %rd398; + add.s64 %rd1181, %rd1181, %rd398; + .loc 1 415 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:415:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s64 %rd1180, %rd1180, %rd398; + add.s64 %rd1179, %rd1179, %rd398; + add.s64 %rd1178, %rd1178, %rd398; + add.s64 %rd1177, %rd1177, %rd398; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r14455, %r14449, %r14455; + add.s32 %r14454, %r14449, %r14454; + add.s32 %r14453, %r14449, %r14453; + add.s32 %r14452, %r14449, %r14452; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + add.s32 %r4941, %r14451, 1; + setp.gt.s32 %p374, %r4941, 2; + selp.b32 %r14451, 0, %r4941, %p374; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.lt.s32 %p375, %r14455, %r2359; + setp.lt.s32 %p376, %r14454, %r2359; + setp.lt.s32 %p377, %r14453, %r2359; + setp.lt.s32 %p378, %r14452, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + shl.b32 %r4942, %r14451, 14; + add.s32 %r4943, %r2590, %r4942; + bar.sync 0; + add.s32 %r4270, %r4943, %r60; + selp.b32 %r4944, 16, 0, %p375; + selp.b32 %r4279, %r4944, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4270 + 0 ], [ %rd1184 + 0 ], 0x10, %r4279; + // end inline asm + add.s32 %r4272, %r4270, 2048; + selp.b32 %r4945, 16, 0, %p376; + selp.b32 %r4281, %r4945, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4272 + 0 ], [ %rd1183 + 0 ], 0x10, %r4281; + // end inline asm + add.s32 %r4274, %r4270, 4096; + selp.b32 %r4946, 16, 0, %p377; + selp.b32 %r4283, %r4946, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4274 + 0 ], [ %rd1182 + 0 ], 0x10, %r4283; + // end inline asm + add.s32 %r4276, %r4270, 6144; + selp.b32 %r4947, 16, 0, %p378; + selp.b32 %r4285, %r4947, 0, %p67; + // begin inline asm + cp.async.cg.shared.global [ %r4276 + 0 ], [ %rd1181 + 0 ], 0x10, %r4285; + // end inline asm + cp.async.commit_group; + add.s32 %r4948, %r4826, %r4942; + add.s32 %r4278, %r4948, %r60; + // begin inline asm + cp.async.cg.shared.global [ %r4278 + 0 ], [ %rd1180 + 0 ], 0x10, %r4279; + // end inline asm + add.s32 %r4280, %r4278, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4280 + 0 ], [ %rd1179 + 0 ], 0x10, %r4281; + // end inline asm + add.s32 %r4282, %r4278, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4282 + 0 ], [ %rd1178 + 0 ], 0x10, %r4283; + // end inline asm + add.s32 %r4284, %r4278, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4284 + 0 ], [ %rd1177 + 0 ], 0x10, %r4285; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + setp.ne.b32 %p379, %r99, %r277; + mov.b32 %r14520, %r277; + @%p379 bra $L__BB0_3; +$L__tmp28: +$L__BB0_4: // %._crit_edge + .loc 1 0 0 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:0 + add.s64 %rd4, %rd198, %rd253; +$L__tmp29: + cvt.s64.s32 %rd5, %r2571; + cvt.s64.s32 %rd6, %r2572; + cvt.s64.s32 %rd7, %r2573; + cvt.s64.s32 %rd8, %r2574; + cvt.s64.s32 %rd9, %r2575; + cvt.s64.s32 %rd10, %r2576; + cvt.s64.s32 %rd11, %r2577; + cvt.s64.s32 %rd12, %r2578; +$L__tmp30: + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:207:12 ] + // begin inline asm + // wait for regs: %r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp31: + .loc 1 214 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:214:39 + shl.b64 %rd417, %rd14, 2; + add.s64 %rd399, %rd204, %rd417; + .loc 1 215 31 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:215:31 + // begin inline asm + mov.u32 %r5077, 0x0; + ld.global.b32 { %r5077 }, [ %rd399 + 0 ]; + // end inline asm + .loc 1 215 45 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:215:45 + shl.b32 %r348, %r5077, 7; + .loc 1 216 62 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:216:62 + shl.b64 %rd418, %rd16, 2; + add.s64 %rd400, %rd203, %rd418; + .loc 1 216 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:216:43 + // begin inline asm + mov.u32 %r5078, 0x0; + ld.global.b32 { %r5078 }, [ %rd400 + 0 ]; + // end inline asm + .loc 1 218 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:218:33 + or.b32 %r5111, %r348, %r13; + or.b32 %r5112, %r348, %r14; + or.b32 %r5113, %r348, %r15; + or.b32 %r5114, %r348, %r16; +$L__tmp32: + .loc 1 390 37 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:37 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r5115, %r5111, 7; + shl.b32 %r5116, %r5112, 7; + shl.b32 %r5117, %r5113, 7; + shl.b32 %r5118, %r5114, 7; + .loc 1 390 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.wide.s32 %rd419, %r5115, 2; + add.s64 %rd420, %rd1, %rd419; + mul.wide.s32 %rd421, %r5116, 2; + add.s64 %rd422, %rd1, %rd421; + mul.wide.s32 %rd423, %r5117, 2; + add.s64 %rd424, %rd1, %rd423; + mul.wide.s32 %rd425, %r5118, 2; + add.s64 %rd426, %rd1, %rd425; + .loc 1 390 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:390:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b64 %rd427, %rd13, 1; + add.s64 %rd401, %rd420, %rd427; + add.s64 %rd402, %rd422, %rd427; + add.s64 %rd403, %rd424, %rd427; + add.s64 %rd404, %rd426, %rd427; + .loc 1 391 18 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:391:18 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd428, %rd2, %rd419; + add.s64 %rd429, %rd2, %rd421; + add.s64 %rd430, %rd2, %rd423; + add.s64 %rd431, %rd2, %rd425; + .loc 1 391 49 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:391:49 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd405, %rd428, %rd427; + add.s64 %rd406, %rd429, %rd427; + add.s64 %rd407, %rd430, %rd427; + add.s64 %rd408, %rd431, %rd427; + .loc 1 395 43 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:395:43 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r5119, %r5078, 1; + .loc 1 395 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:395:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + min.s32 %r350, %r5119, %r58; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p380, %r5119, 1; + setp.gt.s32 %p381, %r5119, 0; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p382, %r5111, %r2359; + setp.lt.s32 %p383, %r5112, %r2359; + setp.lt.s32 %p384, %r5113, %r2359; + setp.lt.s32 %p385, %r5114, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b32 %r5120, 16, 0, %p382; + selp.b32 %r5088, %r5120, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5079 + 0 ], [ %rd401 + 0 ], 0x10, %r5088; + // end inline asm + selp.b32 %r5121, 16, 0, %p383; + selp.b32 %r5090, %r5121, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5081 + 0 ], [ %rd402 + 0 ], 0x10, %r5090; + // end inline asm + selp.b32 %r5122, 16, 0, %p384; + selp.b32 %r5092, %r5122, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5083 + 0 ], [ %rd403 + 0 ], 0x10, %r5092; + // end inline asm + selp.b32 %r5123, 16, 0, %p385; + selp.b32 %r5094, %r5123, 0, %p381; + // begin inline asm + cp.async.cg.shared.global [ %r5085 + 0 ], [ %rd404 + 0 ], 0x10, %r5094; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2523 + 0 ], [ %rd405 + 0 ], 0x10, %r5088; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2525 + 0 ], [ %rd406 + 0 ], 0x10, %r5090; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2527 + 0 ], [ %rd407 + 0 ], 0x10, %r5092; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2529 + 0 ], [ %rd408 + 0 ], 0x10, %r5094; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.gt.s32 %p386, %r350, 1; + .loc 1 414 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd1192, %rd401, 16384; + add.s64 %rd1191, %rd402, 16384; + add.s64 %rd1190, %rd403, 16384; + add.s64 %rd1189, %rd404, 16384; + .loc 1 415 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:415:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd1188, %rd405, 16384; + add.s64 %rd1187, %rd406, 16384; + add.s64 %rd1186, %rd407, 16384; + add.s64 %rd1185, %rd408, 16384; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + or.b32 %r14607, %r5111, 64; + or.b32 %r14606, %r5112, 64; + or.b32 %r14605, %r5113, 64; + or.b32 %r14604, %r5114, 64; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p387, %r14607, %r2359; + setp.lt.s32 %p388, %r14606, %r2359; + setp.lt.s32 %p389, %r14605, %r2359; + setp.lt.s32 %p390, %r14604, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + bar.sync 0; + selp.b32 %r5124, 16, 0, %p387; + selp.b32 %r5104, %r5124, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2531 + 0 ], [ %rd1192 + 0 ], 0x10, %r5104; + // end inline asm + selp.b32 %r5125, 16, 0, %p388; + selp.b32 %r5106, %r5125, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2533 + 0 ], [ %rd1191 + 0 ], 0x10, %r5106; + // end inline asm + selp.b32 %r5126, 16, 0, %p389; + selp.b32 %r5108, %r5126, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2535 + 0 ], [ %rd1190 + 0 ], 0x10, %r5108; + // end inline asm + selp.b32 %r5127, 16, 0, %p390; + selp.b32 %r5110, %r5127, 0, %p386; + // begin inline asm + cp.async.cg.shared.global [ %r2537 + 0 ], [ %rd1189 + 0 ], 0x10, %r5110; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2539 + 0 ], [ %rd1188 + 0 ], 0x10, %r5104; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2541 + 0 ], [ %rd1187 + 0 ], 0x10, %r5106; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2543 + 0 ], [ %rd1186 + 0 ], 0x10, %r5108; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2545 + 0 ], [ %rd1185 + 0 ], 0x10, %r5110; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:459:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + @%p380 bra $L__BB0_7; +$L__tmp33: +// %bb.5: // %.lr.ph1592 + .loc 1 218 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:218:33 + or.b32 %r14688, %r348, %r42; + or.b32 %r14687, %r348, %r41; + or.b32 %r14686, %r348, %r44; + or.b32 %r14681, %r348, %r46; + or.b32 %r14673, %r348, %r50; + or.b32 %r14685, %r348, %r45; + or.b32 %r14682, %r348, %r47; + or.b32 %r14674, %r348, %r51; + or.b32 %r14683, %r348, %r48; + or.b32 %r14675, %r348, %r52; + or.b32 %r14684, %r348, %r49; + or.b32 %r14676, %r348, %r53; + or.b32 %r14677, %r348, %r54; + or.b32 %r14678, %r348, %r55; + or.b32 %r14679, %r348, %r56; + or.b32 %r14680, %r348, %r57; + add.s32 %r435, %r350, -2; + add.s32 %r436, %r350, -1; +$L__tmp34: + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + max.s32 %r437, %r350, 1; + mov.b32 %r5683, 0; + mov.b32 %r14603, 1; + mov.b32 %r14602, -1; + mov.b32 %r14601, 64; + mov.b32 %r14672, %r5683; +$L__BB0_6: // %__nv_exp2f.exit1338 + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p411, %r14672, %r435; + setp.lt.s32 %p409, %r14672, %r436; + add.s32 %r6790, %r14602, 1; + setp.gt.s32 %p412, %r6790, 2; + selp.b32 %r14602, 0, %r6790, %p412; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p413, %r14688, %r2359; + setp.lt.s32 %p414, %r14687, %r2359; + setp.lt.s32 %p415, %r14686, %r2359; + setp.lt.s32 %p416, %r14685, %r2359; + setp.lt.s32 %p417, %r14684, %r2359; + setp.lt.s32 %p418, %r14683, %r2359; + setp.lt.s32 %p419, %r14682, %r2359; + setp.lt.s32 %p420, %r14681, %r2359; + setp.lt.s32 %p421, %r14680, %r2359; + setp.lt.s32 %p422, %r14679, %r2359; + setp.lt.s32 %p423, %r14678, %r2359; + setp.lt.s32 %p424, %r14677, %r2359; + setp.lt.s32 %p425, %r14676, %r2359; + setp.lt.s32 %p426, %r14675, %r2359; + setp.lt.s32 %p427, %r14674, %r2359; + setp.lt.s32 %p428, %r14673, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r6791, %r14602, 14; + add.s32 %r5685, %r2590, %r6791; + .loc 1 459 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:459:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shfl.sync.idx.b32 %r6793, %r11, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r6794, %r6793, 11; + and.b32 %r6795, %r6794, 8192; + add.s32 %r5644, %r2590, 98304; + add.s32 %r6796, %r6795, %r5644; + bfe.u32 %r6797, %r6796, 4, 14; + cvt.u64.u32 %rd482, %r6797; + or.b64 %rd432, %rd482, 4611686293372403712; + bfe.u32 %r6798, %r5685, 4, 14; + cvt.u64.u32 %rd483, %r6798; + or.b64 %rd433, %rd483, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd432, %rd433, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r6799, %r6795, 32; + add.s32 %r6800, %r6799, %r5644; + bfe.u32 %r6801, %r6800, 4, 14; + cvt.u64.u32 %rd484, %r6801; + or.b64 %rd434, %rd484, 4611686293372403712; + add.s32 %r6802, %r5685, 32; + bfe.u32 %r6803, %r6802, 4, 14; + cvt.u64.u32 %rd485, %r6803; + or.b64 %rd435, %rd485, 4611686293338849280; + mov.pred %p391, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd434, %rd435, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6804, %r6795, 64; + add.s32 %r6805, %r6804, %r5644; + bfe.u32 %r6806, %r6805, 4, 14; + cvt.u64.u32 %rd486, %r6806; + or.b64 %rd436, %rd486, 4611686293372403712; + add.s32 %r6807, %r5685, 64; + bfe.u32 %r6808, %r6807, 4, 14; + cvt.u64.u32 %rd487, %r6808; + or.b64 %rd437, %rd487, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd436, %rd437, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6809, %r6795, 96; + add.s32 %r6810, %r6809, %r5644; + bfe.u32 %r6811, %r6810, 4, 14; + cvt.u64.u32 %rd488, %r6811; + or.b64 %rd438, %rd488, 4611686293372403712; + add.s32 %r6812, %r5685, 96; + bfe.u32 %r6813, %r6812, 4, 14; + cvt.u64.u32 %rd489, %r6813; + or.b64 %rd439, %rd489, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd438, %rd439, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6814, %r6795, 16384; + add.s32 %r6815, %r6814, %r5644; + bfe.u32 %r6816, %r6815, 4, 14; + cvt.u64.u32 %rd490, %r6816; + or.b64 %rd440, %rd490, 4611686293372403712; + add.s32 %r6817, %r5685, 8192; + bfe.u32 %r6818, %r6817, 4, 14; + cvt.u64.u32 %rd491, %r6818; + or.b64 %rd441, %rd491, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd440, %rd441, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6819, %r6795, 16416; + add.s32 %r6820, %r6819, %r5644; + bfe.u32 %r6821, %r6820, 4, 14; + cvt.u64.u32 %rd492, %r6821; + or.b64 %rd442, %rd492, 4611686293372403712; + add.s32 %r6822, %r5685, 8224; + bfe.u32 %r6823, %r6822, 4, 14; + cvt.u64.u32 %rd493, %r6823; + or.b64 %rd443, %rd493, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd442, %rd443, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6824, %r6795, 16448; + add.s32 %r6825, %r6824, %r5644; + bfe.u32 %r6826, %r6825, 4, 14; + cvt.u64.u32 %rd494, %r6826; + or.b64 %rd444, %rd494, 4611686293372403712; + add.s32 %r6827, %r5685, 8256; + bfe.u32 %r6828, %r6827, 4, 14; + cvt.u64.u32 %rd495, %r6828; + or.b64 %rd445, %rd495, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd444, %rd445, %p391, 1, 1, 0, 0; + // end inline asm + or.b32 %r6829, %r6795, 16480; + add.s32 %r6830, %r6829, %r5644; + bfe.u32 %r6831, %r6830, 4, 14; + cvt.u64.u32 %rd496, %r6831; + or.b64 %rd446, %rd496, 4611686293372403712; + add.s32 %r6832, %r5685, 8288; + bfe.u32 %r6833, %r6832, 4, 14; + cvt.u64.u32 %rd497, %r6833; + or.b64 %rd447, %rd497, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259}, %rd446, %rd447, %p391, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5647, %r5685; + mov.b32 %r5645, %r5683; + mov.b32 %r5646, %r5683; + mov.b32 %r5648, %r5683; + mov.b32 %r5649, %r5683; + // begin inline asm + // wait for regs: %r5228,%r5229,%r5230,%r5231,%r5232,%r5233,%r5234,%r5235,%r5236,%r5237,%r5238,%r5239,%r5240,%r5241,%r5242,%r5243,%r5244,%r5245,%r5246,%r5247,%r5248,%r5249,%r5250,%r5251,%r5252,%r5253,%r5254,%r5255,%r5256,%r5257,%r5258,%r5259,%r5644,%r5645,%r5646,%r5647,%r5648,%r5649 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:461:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6834, %r5228, 0f3DB504F3; + mul.f32 %r6835, %r5229, 0f3DB504F3; + mul.f32 %r6836, %r5230, 0f3DB504F3; + mul.f32 %r6837, %r5231, 0f3DB504F3; + mul.f32 %r6838, %r5232, 0f3DB504F3; + mul.f32 %r6839, %r5233, 0f3DB504F3; + mul.f32 %r6840, %r5234, 0f3DB504F3; + mul.f32 %r6841, %r5235, 0f3DB504F3; + mul.f32 %r6842, %r5236, 0f3DB504F3; + mul.f32 %r6843, %r5237, 0f3DB504F3; + mul.f32 %r6844, %r5238, 0f3DB504F3; + mul.f32 %r6845, %r5239, 0f3DB504F3; + mul.f32 %r6846, %r5240, 0f3DB504F3; + mul.f32 %r6847, %r5241, 0f3DB504F3; + mul.f32 %r6848, %r5242, 0f3DB504F3; + mul.f32 %r6849, %r5243, 0f3DB504F3; + mul.f32 %r6850, %r5244, 0f3DB504F3; + mul.f32 %r6851, %r5245, 0f3DB504F3; + mul.f32 %r6852, %r5246, 0f3DB504F3; + mul.f32 %r6853, %r5247, 0f3DB504F3; + mul.f32 %r6854, %r5248, 0f3DB504F3; + mul.f32 %r6855, %r5249, 0f3DB504F3; + mul.f32 %r6856, %r5250, 0f3DB504F3; + mul.f32 %r6857, %r5251, 0f3DB504F3; + mul.f32 %r6858, %r5252, 0f3DB504F3; + mul.f32 %r6859, %r5253, 0f3DB504F3; + mul.f32 %r6860, %r5254, 0f3DB504F3; + mul.f32 %r6861, %r5255, 0f3DB504F3; + mul.f32 %r6862, %r5256, 0f3DB504F3; + mul.f32 %r6863, %r5257, 0f3DB504F3; + mul.f32 %r6864, %r5258, 0f3DB504F3; + mul.f32 %r6865, %r5259, 0f3DB504F3; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6866, %r6834, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6867, %r6866, 0fFF800000, %p413; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6868, %r6835, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6869, %r6868, 0fFF800000, %p414; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6870, %r6836, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6871, %r6870, 0fFF800000, %p413; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6872, %r6837, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6873, %r6872, 0fFF800000, %p414; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6874, %r6838, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6875, %r6874, 0fFF800000, %p415; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6876, %r6839, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6877, %r6876, 0fFF800000, %p416; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6878, %r6840, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6879, %r6878, 0fFF800000, %p415; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6880, %r6841, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6881, %r6880, 0fFF800000, %p416; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6882, %r6842, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6883, %r6882, 0fFF800000, %p417; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6884, %r6843, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6885, %r6884, 0fFF800000, %p418; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6886, %r6844, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6887, %r6886, 0fFF800000, %p417; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6888, %r6845, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6889, %r6888, 0fFF800000, %p418; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6890, %r6846, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6891, %r6890, 0fFF800000, %p419; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6892, %r6847, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6893, %r6892, 0fFF800000, %p420; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6894, %r6848, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6895, %r6894, 0fFF800000, %p419; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6896, %r6849, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6897, %r6896, 0fFF800000, %p420; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6898, %r6850, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6899, %r6898, 0fFF800000, %p421; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6900, %r6851, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6901, %r6900, 0fFF800000, %p422; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6902, %r6852, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6903, %r6902, 0fFF800000, %p421; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6904, %r6853, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6905, %r6904, 0fFF800000, %p422; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6906, %r6854, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6907, %r6906, 0fFF800000, %p423; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6908, %r6855, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6909, %r6908, 0fFF800000, %p424; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6910, %r6856, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6911, %r6910, 0fFF800000, %p423; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6912, %r6857, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6913, %r6912, 0fFF800000, %p424; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6914, %r6858, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6915, %r6914, 0fFF800000, %p425; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6916, %r6859, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6917, %r6916, 0fFF800000, %p426; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6918, %r6860, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6919, %r6918, 0fFF800000, %p425; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6920, %r6861, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6921, %r6920, 0fFF800000, %p426; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6922, %r6862, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6923, %r6922, 0fFF800000, %p427; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6924, %r6863, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6925, %r6924, 0fFF800000, %p428; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6926, %r6864, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6927, %r6926, 0fFF800000, %p427; + .loc 1 524 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:524:27 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r6928, %r6865, 0f3FB8AA3B; + .loc 1 476 79 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:476:79 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.f32 %r6929, %r6928, 0fFF800000, %p428; + .loc 1 525 39 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:525:39 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + sub.f32 %r6930, %r6867, %r35; + sub.f32 %r6931, %r6869, %r35; + sub.f32 %r6932, %r6871, %r36; + sub.f32 %r6933, %r6873, %r36; + sub.f32 %r6934, %r6875, %r35; + sub.f32 %r6935, %r6877, %r35; + sub.f32 %r6936, %r6879, %r36; + sub.f32 %r6937, %r6881, %r36; + sub.f32 %r6938, %r6883, %r35; + sub.f32 %r6939, %r6885, %r35; + sub.f32 %r6940, %r6887, %r36; + sub.f32 %r6941, %r6889, %r36; + sub.f32 %r6942, %r6891, %r35; + sub.f32 %r6943, %r6893, %r35; + sub.f32 %r6944, %r6895, %r36; + sub.f32 %r6945, %r6897, %r36; + sub.f32 %r6946, %r6899, %r35; + sub.f32 %r6947, %r6901, %r35; + sub.f32 %r6948, %r6903, %r36; + sub.f32 %r6949, %r6905, %r36; + sub.f32 %r6950, %r6907, %r35; + sub.f32 %r6951, %r6909, %r35; + sub.f32 %r6952, %r6911, %r36; + sub.f32 %r6953, %r6913, %r36; + sub.f32 %r6954, %r6915, %r35; + sub.f32 %r6955, %r6917, %r35; + sub.f32 %r6956, %r6919, %r36; + sub.f32 %r6957, %r6921, %r36; + sub.f32 %r6958, %r6923, %r35; + sub.f32 %r6959, %r6925, %r35; + sub.f32 %r6960, %r6927, %r36; + sub.f32 %r6961, %r6929, %r36; + .loc 1 525 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:525:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + ex2.approx.ftz.f32 %r6962, %r6930; + ex2.approx.ftz.f32 %r6963, %r6931; + ex2.approx.ftz.f32 %r6964, %r6932; + ex2.approx.ftz.f32 %r6965, %r6933; + ex2.approx.ftz.f32 %r6966, %r6934; + ex2.approx.ftz.f32 %r6967, %r6935; + ex2.approx.ftz.f32 %r6968, %r6936; + ex2.approx.ftz.f32 %r6969, %r6937; + ex2.approx.ftz.f32 %r6970, %r6938; + ex2.approx.ftz.f32 %r6971, %r6939; + ex2.approx.ftz.f32 %r6972, %r6940; + ex2.approx.ftz.f32 %r6973, %r6941; + ex2.approx.ftz.f32 %r6974, %r6942; + ex2.approx.ftz.f32 %r6975, %r6943; + ex2.approx.ftz.f32 %r6976, %r6944; + ex2.approx.ftz.f32 %r6977, %r6945; + ex2.approx.ftz.f32 %r6978, %r6946; + ex2.approx.ftz.f32 %r6979, %r6947; + ex2.approx.ftz.f32 %r6980, %r6948; + ex2.approx.ftz.f32 %r6981, %r6949; + ex2.approx.ftz.f32 %r6982, %r6950; + ex2.approx.ftz.f32 %r6983, %r6951; + ex2.approx.ftz.f32 %r6984, %r6952; + ex2.approx.ftz.f32 %r6985, %r6953; + ex2.approx.ftz.f32 %r6986, %r6954; + ex2.approx.ftz.f32 %r6987, %r6955; + ex2.approx.ftz.f32 %r6988, %r6956; + ex2.approx.ftz.f32 %r6989, %r6957; + ex2.approx.ftz.f32 %r6990, %r6958; + ex2.approx.ftz.f32 %r6991, %r6959; + ex2.approx.ftz.f32 %r6992, %r6960; + ex2.approx.ftz.f32 %r6993, %r6961; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r6994, %r2590, 49152; + add.s32 %r6203, %r6994, %r6791; + .loc 1 530 20 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:530:20 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + wgmma.fence.sync.aligned; + add.s32 %r6200, %r2590, 131072; + add.s32 %r6995, %r6795, %r6200; + bfe.u32 %r6996, %r6995, 4, 14; + cvt.u64.u32 %rd498, %r6996; + or.b64 %rd448, %rd498, 4611686293372403712; + bfe.u32 %r6997, %r6203, 4, 14; + cvt.u64.u32 %rd499, %r6997; + or.b64 %rd449, %rd499, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd448, %rd449, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r6998, %r6799, %r6200; + bfe.u32 %r6999, %r6998, 4, 14; + cvt.u64.u32 %rd500, %r6999; + or.b64 %rd450, %rd500, 4611686293372403712; + add.s32 %r7000, %r6203, 32; + bfe.u32 %r7001, %r7000, 4, 14; + cvt.u64.u32 %rd501, %r7001; + or.b64 %rd451, %rd501, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd450, %rd451, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7002, %r6804, %r6200; + bfe.u32 %r7003, %r7002, 4, 14; + cvt.u64.u32 %rd502, %r7003; + or.b64 %rd452, %rd502, 4611686293372403712; + add.s32 %r7004, %r6203, 64; + bfe.u32 %r7005, %r7004, 4, 14; + cvt.u64.u32 %rd503, %r7005; + or.b64 %rd453, %rd503, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd452, %rd453, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7006, %r6809, %r6200; + bfe.u32 %r7007, %r7006, 4, 14; + cvt.u64.u32 %rd504, %r7007; + or.b64 %rd454, %rd504, 4611686293372403712; + add.s32 %r7008, %r6203, 96; + bfe.u32 %r7009, %r7008, 4, 14; + cvt.u64.u32 %rd505, %r7009; + or.b64 %rd455, %rd505, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd454, %rd455, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7010, %r6814, %r6200; + bfe.u32 %r7011, %r7010, 4, 14; + cvt.u64.u32 %rd506, %r7011; + or.b64 %rd456, %rd506, 4611686293372403712; + add.s32 %r7012, %r6203, 8192; + bfe.u32 %r7013, %r7012, 4, 14; + cvt.u64.u32 %rd507, %r7013; + or.b64 %rd457, %rd507, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd456, %rd457, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7014, %r6819, %r6200; + bfe.u32 %r7015, %r7014, 4, 14; + cvt.u64.u32 %rd508, %r7015; + or.b64 %rd458, %rd508, 4611686293372403712; + add.s32 %r7016, %r6203, 8224; + bfe.u32 %r7017, %r7016, 4, 14; + cvt.u64.u32 %rd509, %r7017; + or.b64 %rd459, %rd509, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd458, %rd459, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7018, %r6824, %r6200; + bfe.u32 %r7019, %r7018, 4, 14; + cvt.u64.u32 %rd510, %r7019; + or.b64 %rd460, %rd510, 4611686293372403712; + add.s32 %r7020, %r6203, 8256; + bfe.u32 %r7021, %r7020, 4, 14; + cvt.u64.u32 %rd511, %r7021; + or.b64 %rd461, %rd511, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd460, %rd461, %p391, 1, 1, 0, 0; + // end inline asm + add.s32 %r7022, %r6829, %r6200; + bfe.u32 %r7023, %r7022, 4, 14; + cvt.u64.u32 %rd512, %r7023; + or.b64 %rd462, %rd512, 4611686293372403712; + add.s32 %r7024, %r6203, 8288; + bfe.u32 %r7025, %r7024, 4, 14; + cvt.u64.u32 %rd513, %r7025; + or.b64 %rd463, %rd513, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815}, %rd462, %rd463, %p391, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r6205, %r5683; + mov.b32 %r6201, %r5683; + mov.b32 %r6202, %r5683; + mov.b32 %r6204, %r5683; + // begin inline asm + // wait for regs: %r5784,%r5785,%r5786,%r5787,%r5788,%r5789,%r5790,%r5791,%r5792,%r5793,%r5794,%r5795,%r5796,%r5797,%r5798,%r5799,%r5800,%r5801,%r5802,%r5803,%r5804,%r5805,%r5806,%r5807,%r5808,%r5809,%r5810,%r5811,%r5812,%r5813,%r5814,%r5815,%r6200,%r6201,%r6202,%r6203,%r6204,%r6205 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + sub.f32 %r7026, %r5784, %r2509; + sub.f32 %r7027, %r5785, %r2509; + sub.f32 %r7028, %r5786, %r2510; + sub.f32 %r7029, %r5787, %r2510; + sub.f32 %r7030, %r5788, %r2509; + sub.f32 %r7031, %r5789, %r2509; + sub.f32 %r7032, %r5790, %r2510; + sub.f32 %r7033, %r5791, %r2510; + sub.f32 %r7034, %r5792, %r2509; + sub.f32 %r7035, %r5793, %r2509; + sub.f32 %r7036, %r5794, %r2510; + sub.f32 %r7037, %r5795, %r2510; + sub.f32 %r7038, %r5796, %r2509; + sub.f32 %r7039, %r5797, %r2509; + sub.f32 %r7040, %r5798, %r2510; + sub.f32 %r7041, %r5799, %r2510; + sub.f32 %r7042, %r5800, %r2509; + sub.f32 %r7043, %r5801, %r2509; + sub.f32 %r7044, %r5802, %r2510; + sub.f32 %r7045, %r5803, %r2510; + sub.f32 %r7046, %r5804, %r2509; + sub.f32 %r7047, %r5805, %r2509; + sub.f32 %r7048, %r5806, %r2510; + sub.f32 %r7049, %r5807, %r2510; + sub.f32 %r7050, %r5808, %r2509; + sub.f32 %r7051, %r5809, %r2509; + sub.f32 %r7052, %r5810, %r2510; + sub.f32 %r7053, %r5811, %r2510; + sub.f32 %r7054, %r5812, %r2509; + sub.f32 %r7055, %r5813, %r2509; + sub.f32 %r7056, %r5814, %r2510; + sub.f32 %r7057, %r5815, %r2510; + .loc 1 531 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:531:14 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.f32 %r7058, %r6962, %r7026; + mul.f32 %r7059, %r6963, %r7027; + mul.f32 %r7060, %r6964, %r7028; + mul.f32 %r7061, %r6965, %r7029; + mul.f32 %r7062, %r6966, %r7030; + mul.f32 %r7063, %r6967, %r7031; + mul.f32 %r7064, %r6968, %r7032; + mul.f32 %r7065, %r6969, %r7033; + mul.f32 %r7066, %r6970, %r7034; + mul.f32 %r7067, %r6971, %r7035; + mul.f32 %r7068, %r6972, %r7036; + mul.f32 %r7069, %r6973, %r7037; + mul.f32 %r7070, %r6974, %r7038; + mul.f32 %r7071, %r6975, %r7039; + mul.f32 %r7072, %r6976, %r7040; + mul.f32 %r7073, %r6977, %r7041; + mul.f32 %r7074, %r6978, %r7042; + mul.f32 %r7075, %r6979, %r7043; + mul.f32 %r7076, %r6980, %r7044; + mul.f32 %r7077, %r6981, %r7045; + mul.f32 %r7078, %r6982, %r7046; + mul.f32 %r7079, %r6983, %r7047; + mul.f32 %r7080, %r6984, %r7048; + mul.f32 %r7081, %r6985, %r7049; + mul.f32 %r7082, %r6986, %r7050; + mul.f32 %r7083, %r6987, %r7051; + mul.f32 %r7084, %r6988, %r7052; + mul.f32 %r7085, %r6989, %r7053; + mul.f32 %r7086, %r6990, %r7054; + mul.f32 %r7087, %r6991, %r7055; + mul.f32 %r7088, %r6992, %r7056; + mul.f32 %r7089, %r6993, %r7057; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs65, %r7058; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs66, %rs65, 0x0000, %p413; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs67, %r7059; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs68, %rs67, 0x0000, %p414; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs69, %r7060; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs70, %rs69, 0x0000, %p413; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs71, %r7061; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs72, %rs71, 0x0000, %p414; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs73, %r7062; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs74, %rs73, 0x0000, %p415; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs75, %r7063; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs76, %rs75, 0x0000, %p416; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs77, %r7064; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs78, %rs77, 0x0000, %p415; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs79, %r7065; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs80, %rs79, 0x0000, %p416; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs81, %r7066; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs82, %rs81, 0x0000, %p417; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs83, %r7067; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs84, %rs83, 0x0000, %p418; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs85, %r7068; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs86, %rs85, 0x0000, %p417; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs87, %r7069; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs88, %rs87, 0x0000, %p418; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs89, %r7070; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs90, %rs89, 0x0000, %p419; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs91, %r7071; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs92, %rs91, 0x0000, %p420; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs93, %r7072; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs94, %rs93, 0x0000, %p419; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs95, %r7073; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs96, %rs95, 0x0000, %p420; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs97, %r7074; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs98, %rs97, 0x0000, %p421; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs99, %r7075; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs100, %rs99, 0x0000, %p422; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs101, %r7076; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs102, %rs101, 0x0000, %p421; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs103, %r7077; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs104, %rs103, 0x0000, %p422; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs105, %r7078; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs106, %rs105, 0x0000, %p423; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs107, %r7079; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs108, %rs107, 0x0000, %p424; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs109, %r7080; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs110, %rs109, 0x0000, %p423; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs111, %r7081; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs112, %rs111, 0x0000, %p424; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs113, %r7082; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs114, %rs113, 0x0000, %p425; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs115, %r7083; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs116, %rs115, 0x0000, %p426; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs117, %r7084; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs118, %rs117, 0x0000, %p425; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs119, %r7085; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs120, %rs119, 0x0000, %p426; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs121, %r7086; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs122, %rs121, 0x0000, %p427; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs123, %r7087; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs124, %rs123, 0x0000, %p428; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs125, %r7088; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs126, %rs125, 0x0000, %p427; + .loc 1 551 15 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:551:15 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + cvt.rn.bf16.f32 %rs127, %r7089; + .loc 1 538 71 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:538:71 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + selp.b16 %rs128, %rs127, 0x0000, %p428; + .loc 1 553 21 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:553:21 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mov.b32 %r6372, {%rs66, %rs68}; + mov.b32 %r6373, {%rs70, %rs72}; + mov.b32 %r6374, {%rs74, %rs76}; + mov.b32 %r6375, {%rs78, %rs80}; + mov.b32 %r6504, {%rs82, %rs84}; + mov.b32 %r6505, {%rs86, %rs88}; + mov.b32 %r6506, {%rs90, %rs92}; + mov.b32 %r6507, {%rs94, %rs96}; + mov.b32 %r6636, {%rs98, %rs100}; + mov.b32 %r6637, {%rs102, %rs104}; + mov.b32 %r6638, {%rs106, %rs108}; + mov.b32 %r6639, {%rs110, %rs112}; + mov.b32 %r6768, {%rs114, %rs116}; + mov.b32 %r6769, {%rs118, %rs120}; + mov.b32 %r6770, {%rs122, %rs124}; + mov.b32 %r6771, {%rs126, %rs128}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6372,%r6373,%r6374,%r6375}, %rd433, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7090, %r5685, 2048; + bfe.u32 %r7091, %r7090, 4, 14; + cvt.u64.u32 %rd514, %r7091; + or.b64 %rd465, %rd514, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6504,%r6505,%r6506,%r6507}, %rd465, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7092, %r5685, 4096; + bfe.u32 %r7093, %r7092, 4, 14; + cvt.u64.u32 %rd515, %r7093; + or.b64 %rd466, %rd515, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6636,%r6637,%r6638,%r6639}, %rd466, %p391, 1, 1, 1; + // end inline asm + add.s32 %r7094, %r5685, 6144; + bfe.u32 %r7095, %r7094, 4, 14; + cvt.u64.u32 %rd516, %r7095; + or.b64 %rd467, %rd516, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519}, {%r6768,%r6769,%r6770,%r6771}, %rd467, %p391, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r14673, %r14601, %r14673; + add.s32 %r14674, %r14601, %r14674; + add.s32 %r14675, %r14601, %r14675; + add.s32 %r14676, %r14601, %r14676; + add.s32 %r14677, %r14601, %r14677; + add.s32 %r14678, %r14601, %r14678; + add.s32 %r14679, %r14601, %r14679; + add.s32 %r14680, %r14601, %r14680; + add.s32 %r14681, %r14601, %r14681; + add.s32 %r14682, %r14601, %r14682; + add.s32 %r14683, %r14601, %r14683; + add.s32 %r14684, %r14601, %r14684; + add.s32 %r14685, %r14601, %r14685; + add.s32 %r14686, %r14601, %r14686; + add.s32 %r14687, %r14601, %r14687; + add.s32 %r14688, %r14601, %r14688; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r607, %r14672, 1; + .loc 1 788 33 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:788:33 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shr.u32 %r7096, %r607, 1; + .loc 1 789 38 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:38 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mad.wide.u32 %rd469, %r7096, 4, %rd399; + .loc 1 789 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:789:24 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + // begin inline asm + mov.u64 %rd468, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd468, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6772, 0x0; + @%p409 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6772 }, [ %rd469 + 0 ], %rd468; + // end inline asm + .loc 1 790 109 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:109 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r7097, %r7096, 1; + .loc 1 790 113 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:113 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p429, %r7097, %r5078; + .loc 1 790 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:55 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd472, %rd469, 4; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + and.pred %p410, %p409, %p429; + .loc 1 790 25 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:790:25 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + // begin inline asm + mov.u64 %rd471, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd471, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6773, 0x0; + @%p410 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6773 }, [ %rd472 + 0 ], %rd471; + // end inline asm + .loc 1 791 35 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:791:35 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + and.b32 %r7098, %r14672, 1; + .loc 1 792 34 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:34 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + sub.s32 %r7099, %r6773, %r6772; + .loc 1 792 48 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:48 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r7100, %r7099, 7; + .loc 1 792 63 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:792:63 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r7101, %r7100, -64; + .loc 1 793 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:29 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + xor.b32 %r7102, %r7098, 1; + .loc 1 793 61 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:61 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r7103, %r7098, 6; + .loc 1 793 42 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:793:42 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mad.lo.s32 %r14601, %r7101, %r7102, %r7103; + .loc 1 414 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r7104, %r14601, 7; + .loc 1 414 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:414:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + mul.wide.s32 %rd517, %r7104, 2; + add.s64 %rd1192, %rd1192, %rd517; + add.s64 %rd1191, %rd1191, %rd517; + add.s64 %rd1190, %rd1190, %rd517; + add.s64 %rd1189, %rd1189, %rd517; + .loc 1 415 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:415:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s64 %rd1188, %rd1188, %rd517; + add.s64 %rd1187, %rd1187, %rd517; + add.s64 %rd1186, %rd1186, %rd517; + add.s64 %rd1185, %rd1185, %rd517; + .loc 1 417 19 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:417:19 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r14607, %r14601, %r14607; + add.s32 %r14606, %r14601, %r14606; + add.s32 %r14605, %r14601, %r14605; + add.s32 %r14604, %r14601, %r14604; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + add.s32 %r7105, %r14603, 1; + setp.gt.s32 %p430, %r7105, 2; + selp.b32 %r14603, 0, %r7105, %p430; + .loc 1 831 52 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:52 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.lt.s32 %p431, %r14607, %r2359; + setp.lt.s32 %p432, %r14606, %r2359; + setp.lt.s32 %p433, %r14605, %r2359; + setp.lt.s32 %p434, %r14604, %r2359; + .loc 1 831 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:831:23 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + shl.b32 %r7106, %r14603, 14; + add.s32 %r7107, %r2590, %r7106; + bar.sync 0; + add.s32 %r6774, %r7107, %r60; + selp.b32 %r7108, 16, 0, %p431; + selp.b32 %r6783, %r7108, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6774 + 0 ], [ %rd1192 + 0 ], 0x10, %r6783; + // end inline asm + add.s32 %r6776, %r6774, 2048; + selp.b32 %r7109, 16, 0, %p432; + selp.b32 %r6785, %r7109, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6776 + 0 ], [ %rd1191 + 0 ], 0x10, %r6785; + // end inline asm + add.s32 %r6778, %r6774, 4096; + selp.b32 %r7110, 16, 0, %p433; + selp.b32 %r6787, %r7110, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6778 + 0 ], [ %rd1190 + 0 ], 0x10, %r6787; + // end inline asm + add.s32 %r6780, %r6774, 6144; + selp.b32 %r7111, 16, 0, %p434; + selp.b32 %r6789, %r7111, 0, %p411; + // begin inline asm + cp.async.cg.shared.global [ %r6780 + 0 ], [ %rd1189 + 0 ], 0x10, %r6789; + // end inline asm + cp.async.commit_group; + add.s32 %r7112, %r6994, %r7106; + add.s32 %r6782, %r7112, %r60; + // begin inline asm + cp.async.cg.shared.global [ %r6782 + 0 ], [ %rd1188 + 0 ], 0x10, %r6783; + // end inline asm + add.s32 %r6784, %r6782, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r6784 + 0 ], [ %rd1187 + 0 ], 0x10, %r6785; + // end inline asm + add.s32 %r6786, %r6782, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r6786 + 0 ], [ %rd1186 + 0 ], 0x10, %r6787; + // end inline asm + add.s32 %r6788, %r6782, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r6788 + 0 ], [ %rd1185 + 0 ], 0x10, %r6789; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:397:28 @[ ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:226:16 ] + setp.ne.b32 %p435, %r437, %r607; + mov.b32 %r14672, %r607; + @%p435 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge1593 + // begin inline asm + // wait for regs: %r14456,%r14457,%r14458,%r14459,%r14460,%r14461,%r14462,%r14463,%r14464,%r14465,%r14466,%r14467,%r14468,%r14469,%r14470,%r14471,%r14472,%r14473,%r14474,%r14475,%r14476,%r14477,%r14478,%r14479,%r14480,%r14481,%r14482,%r14483,%r14484,%r14485,%r14486,%r14487,%r14488,%r14489,%r14490,%r14491,%r14492,%r14493,%r14494,%r14495,%r14496,%r14497,%r14498,%r14499,%r14500,%r14501,%r14502,%r14503,%r14504,%r14505,%r14506,%r14507,%r14508,%r14509,%r14510,%r14511,%r14512,%r14513,%r14514,%r14515,%r14516,%r14517,%r14518,%r14519 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp35: + .loc 1 231 24 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:231:24 + shl.b64 %rd526, %rd5, 1; + add.s64 %rd527, %rd4, %rd526; + shl.b64 %rd528, %rd6, 1; + add.s64 %rd529, %rd4, %rd528; + shl.b64 %rd530, %rd7, 1; + add.s64 %rd531, %rd4, %rd530; + shl.b64 %rd532, %rd8, 1; + add.s64 %rd533, %rd4, %rd532; + shl.b64 %rd534, %rd9, 1; + add.s64 %rd535, %rd4, %rd534; + shl.b64 %rd536, %rd10, 1; + add.s64 %rd537, %rd4, %rd536; + shl.b64 %rd538, %rd11, 1; + add.s64 %rd539, %rd4, %rd538; + shl.b64 %rd540, %rd12, 1; + add.s64 %rd541, %rd4, %rd540; + .loc 1 231 56 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:231:56 + add.s64 %rd518, %rd527, %rd427; + add.s64 %rd519, %rd529, %rd427; + add.s64 %rd520, %rd531, %rd427; + add.s64 %rd521, %rd533, %rd427; + add.s64 %rd522, %rd535, %rd427; + add.s64 %rd523, %rd537, %rd427; + add.s64 %rd524, %rd539, %rd427; + add.s64 %rd525, %rd541, %rd427; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7313, %r14456, 0f3DB504F3; + mul.f32 %r7314, %r14457, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7315, %r7314, %r7313; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7316, %r14458, 0f3DB504F3; + mul.f32 %r7317, %r14459, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7318, %r7317, %r7316; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7319, %r14460, 0f3DB504F3; + mul.f32 %r7320, %r14461, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7321, %r7320, %r7319; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7322, %r14462, 0f3DB504F3; + mul.f32 %r7323, %r14463, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7324, %r7323, %r7322; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7325, %r14464, 0f3DB504F3; + mul.f32 %r7326, %r14465, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7327, %r7326, %r7325; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7328, %r14466, 0f3DB504F3; + mul.f32 %r7329, %r14467, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7330, %r7329, %r7328; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7331, %r14468, 0f3DB504F3; + mul.f32 %r7332, %r14469, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7333, %r7332, %r7331; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7334, %r14470, 0f3DB504F3; + mul.f32 %r7335, %r14471, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7336, %r7335, %r7334; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7337, %r14472, 0f3DB504F3; + mul.f32 %r7338, %r14473, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7339, %r7338, %r7337; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7340, %r14474, 0f3DB504F3; + mul.f32 %r7341, %r14475, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7342, %r7341, %r7340; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7343, %r14476, 0f3DB504F3; + mul.f32 %r7344, %r14477, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7345, %r7344, %r7343; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7346, %r14478, 0f3DB504F3; + mul.f32 %r7347, %r14479, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7348, %r7347, %r7346; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7349, %r14480, 0f3DB504F3; + mul.f32 %r7350, %r14481, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7351, %r7350, %r7349; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7352, %r14482, 0f3DB504F3; + mul.f32 %r7353, %r14483, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7354, %r7353, %r7352; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7355, %r14484, 0f3DB504F3; + mul.f32 %r7356, %r14485, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7357, %r7356, %r7355; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7358, %r14486, 0f3DB504F3; + mul.f32 %r7359, %r14487, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7360, %r7359, %r7358; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7361, %r14488, 0f3DB504F3; + mul.f32 %r7362, %r14489, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7363, %r7362, %r7361; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7364, %r14490, 0f3DB504F3; + mul.f32 %r7365, %r14491, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7366, %r7365, %r7364; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7367, %r14492, 0f3DB504F3; + mul.f32 %r7368, %r14493, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7369, %r7368, %r7367; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7370, %r14494, 0f3DB504F3; + mul.f32 %r7371, %r14495, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7372, %r7371, %r7370; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7373, %r14496, 0f3DB504F3; + mul.f32 %r7374, %r14497, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7375, %r7374, %r7373; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7376, %r14498, 0f3DB504F3; + mul.f32 %r7377, %r14499, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7378, %r7377, %r7376; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7379, %r14500, 0f3DB504F3; + mul.f32 %r7380, %r14501, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7381, %r7380, %r7379; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7382, %r14502, 0f3DB504F3; + mul.f32 %r7383, %r14503, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7384, %r7383, %r7382; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7385, %r14504, 0f3DB504F3; + mul.f32 %r7386, %r14505, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7387, %r7386, %r7385; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7388, %r14506, 0f3DB504F3; + mul.f32 %r7389, %r14507, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7390, %r7389, %r7388; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7391, %r14508, 0f3DB504F3; + mul.f32 %r7392, %r14509, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7393, %r7392, %r7391; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7394, %r14510, 0f3DB504F3; + mul.f32 %r7395, %r14511, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7396, %r7395, %r7394; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7397, %r14512, 0f3DB504F3; + mul.f32 %r7398, %r14513, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7399, %r7398, %r7397; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7400, %r14514, 0f3DB504F3; + mul.f32 %r7401, %r14515, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7402, %r7401, %r7400; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7403, %r14516, 0f3DB504F3; + mul.f32 %r7404, %r14517, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7405, %r7404, %r7403; + .loc 1 232 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:232:14 + mul.f32 %r7406, %r14518, 0f3DB504F3; + mul.f32 %r7407, %r14519, 0f3DB504F3; + .loc 1 236 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:236:30 + cvt.rn.bf16x2.f32 %r7408, %r7407, %r7406; + shl.b32 %r7409, %r39, 13; + shl.b32 %r7410, %r10, 5; + and.b32 %r7411, %r7410, 7264; + and.b32 %r7412, %r10, 24; + shl.b32 %r7413, %r7412, 4; + shl.b32 %r7414, %r10, 2; + and.b32 %r7415, %r7414, 16; + or.b32 %r7416, %r7409, %r7415; + or.b32 %r7417, %r7411, %r7413; + or.b32 %r7418, %r7416, %r7417; + add.s32 %r7420, %r2590, %r7418; + st.shared.v4.b32 [%r7420], {%r7315, %r7321, %r7327, %r7333}; + st.shared.v4.b32 [%r7420+512], {%r7318, %r7324, %r7330, %r7336}; + xor.b32 %r7421, %r7418, 32; + add.s32 %r7422, %r2590, %r7421; + st.shared.v4.b32 [%r7422], {%r7339, %r7345, %r7351, %r7357}; + st.shared.v4.b32 [%r7422+512], {%r7342, %r7348, %r7354, %r7360}; + xor.b32 %r7423, %r7418, 64; + add.s32 %r7424, %r2590, %r7423; + st.shared.v4.b32 [%r7424], {%r7363, %r7369, %r7375, %r7381}; + st.shared.v4.b32 [%r7424+512], {%r7366, %r7372, %r7378, %r7384}; + xor.b32 %r7425, %r7418, 96; + add.s32 %r7426, %r2590, %r7425; + st.shared.v4.b32 [%r7426], {%r7387, %r7393, %r7399, %r7405}; + st.shared.v4.b32 [%r7426+512], {%r7390, %r7396, %r7402, %r7408}; + bar.sync 0; + shl.b32 %r7427, %r7412, 10; + shl.b32 %r7428, %r39, 5; + and.b32 %r7429, %r7414, 1008; + or.b32 %r7430, %r7427, %r7428; + xor.b32 %r7431, %r7430, %r7429; + add.s32 %r7245, %r2590, %r7431; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7281, %r7282, %r7283, %r7284}, [%r7245]; + // end inline asm + add.s32 %r7250, %r7245, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7285, %r7286, %r7287, %r7288}, [%r7250]; + // end inline asm + add.s32 %r7255, %r7245, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7289, %r7290, %r7291, %r7292}, [%r7255]; + // end inline asm + add.s32 %r7260, %r7245, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7293, %r7294, %r7295, %r7296}, [%r7260]; + // end inline asm + add.s32 %r7265, %r7245, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7297, %r7298, %r7299, %r7300}, [%r7265]; + // end inline asm + add.s32 %r7270, %r7245, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7301, %r7302, %r7303, %r7304}, [%r7270]; + // end inline asm + add.s32 %r7275, %r7245, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7305, %r7306, %r7307, %r7308}, [%r7275]; + // end inline asm + add.s32 %r7280, %r7245, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r7309, %r7310, %r7311, %r7312}, [%r7280]; + // end inline asm + // begin inline asm + @%p14 st.global.v4.b32 [ %rd518 + 0 ], { %r7281, %r7282, %r7283, %r7284 }; + // end inline asm + // begin inline asm + @%p15 st.global.v4.b32 [ %rd519 + 0 ], { %r7285, %r7286, %r7287, %r7288 }; + // end inline asm + // begin inline asm + @%p16 st.global.v4.b32 [ %rd520 + 0 ], { %r7289, %r7290, %r7291, %r7292 }; + // end inline asm + // begin inline asm + @%p17 st.global.v4.b32 [ %rd521 + 0 ], { %r7293, %r7294, %r7295, %r7296 }; + // end inline asm + // begin inline asm + @%p18 st.global.v4.b32 [ %rd522 + 0 ], { %r7297, %r7298, %r7299, %r7300 }; + // end inline asm + // begin inline asm + @%p19 st.global.v4.b32 [ %rd523 + 0 ], { %r7301, %r7302, %r7303, %r7304 }; + // end inline asm + // begin inline asm + @%p20 st.global.v4.b32 [ %rd524 + 0 ], { %r7305, %r7306, %r7307, %r7308 }; + // end inline asm + // begin inline asm + @%p21 st.global.v4.b32 [ %rd525 + 0 ], { %r7309, %r7310, %r7311, %r7312 }; + // end inline asm + .loc 1 139 7 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:139:7 + bra.uni $L__BB0_17; +$L__BB0_16: + .loc 1 323 23 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:323:23 + shl.b64 %rd1152, %rd71, 1; + add.s64 %rd1153, %rd3, %rd1152; + shl.b64 %rd1154, %rd72, 1; + add.s64 %rd1155, %rd3, %rd1154; + shl.b64 %rd1156, %rd73, 1; + add.s64 %rd1157, %rd3, %rd1156; + shl.b64 %rd1158, %rd74, 1; + add.s64 %rd1159, %rd3, %rd1158; + shl.b64 %rd1160, %rd75, 1; + add.s64 %rd1161, %rd3, %rd1160; + shl.b64 %rd1162, %rd76, 1; + add.s64 %rd1163, %rd3, %rd1162; + shl.b64 %rd1164, %rd77, 1; + add.s64 %rd1165, %rd3, %rd1164; + shl.b64 %rd1166, %rd78, 1; + add.s64 %rd1167, %rd3, %rd1166; + .loc 1 323 55 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:323:55 + add.s64 %rd1136, %rd1153, %rd685; + add.s64 %rd1137, %rd1155, %rd685; + add.s64 %rd1138, %rd1157, %rd685; + add.s64 %rd1139, %rd1159, %rd685; + add.s64 %rd1140, %rd1161, %rd685; + add.s64 %rd1141, %rd1163, %rd685; + add.s64 %rd1142, %rd1165, %rd685; + add.s64 %rd1143, %rd1167, %rd685; + .loc 1 332 30 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:332:30 + cvt.rn.bf16x2.f32 %r14282, %r14925, %r14924; + cvt.rn.bf16x2.f32 %r14283, %r14927, %r14926; + cvt.rn.bf16x2.f32 %r14284, %r14929, %r14928; + cvt.rn.bf16x2.f32 %r14285, %r14931, %r14930; + cvt.rn.bf16x2.f32 %r14286, %r14933, %r14932; + cvt.rn.bf16x2.f32 %r14287, %r14935, %r14934; + cvt.rn.bf16x2.f32 %r14288, %r14937, %r14936; + cvt.rn.bf16x2.f32 %r14289, %r14939, %r14938; + cvt.rn.bf16x2.f32 %r14290, %r14941, %r14940; + cvt.rn.bf16x2.f32 %r14291, %r14943, %r14942; + cvt.rn.bf16x2.f32 %r14292, %r14945, %r14944; + cvt.rn.bf16x2.f32 %r14293, %r14947, %r14946; + cvt.rn.bf16x2.f32 %r14294, %r14949, %r14948; + cvt.rn.bf16x2.f32 %r14295, %r14951, %r14950; + cvt.rn.bf16x2.f32 %r14296, %r14953, %r14952; + cvt.rn.bf16x2.f32 %r14297, %r14955, %r14954; + cvt.rn.bf16x2.f32 %r14298, %r14957, %r14956; + cvt.rn.bf16x2.f32 %r14299, %r14959, %r14958; + cvt.rn.bf16x2.f32 %r14300, %r14961, %r14960; + cvt.rn.bf16x2.f32 %r14301, %r14963, %r14962; + cvt.rn.bf16x2.f32 %r14302, %r14965, %r14964; + cvt.rn.bf16x2.f32 %r14303, %r14967, %r14966; + cvt.rn.bf16x2.f32 %r14304, %r14969, %r14968; + cvt.rn.bf16x2.f32 %r14305, %r14971, %r14970; + cvt.rn.bf16x2.f32 %r14306, %r14973, %r14972; + cvt.rn.bf16x2.f32 %r14307, %r14975, %r14974; + cvt.rn.bf16x2.f32 %r14308, %r14977, %r14976; + cvt.rn.bf16x2.f32 %r14309, %r14979, %r14978; + cvt.rn.bf16x2.f32 %r14310, %r14981, %r14980; + cvt.rn.bf16x2.f32 %r14311, %r14983, %r14982; + cvt.rn.bf16x2.f32 %r14312, %r14985, %r14984; + cvt.rn.bf16x2.f32 %r14313, %r14987, %r14986; + shl.b32 %r14314, %r691, 13; + shl.b32 %r14315, %r10, 5; + and.b32 %r14316, %r14315, 7264; + and.b32 %r14317, %r10, 24; + shl.b32 %r14318, %r14317, 4; + shl.b32 %r14319, %r10, 2; + and.b32 %r14320, %r14319, 16; + or.b32 %r14321, %r14314, %r14320; + or.b32 %r14322, %r14316, %r14318; + or.b32 %r14323, %r14321, %r14322; + add.s32 %r14325, %r7585, %r14323; + st.shared.v4.b32 [%r14325], {%r14282, %r14284, %r14286, %r14288}; + st.shared.v4.b32 [%r14325+512], {%r14283, %r14285, %r14287, %r14289}; + xor.b32 %r14326, %r14323, 32; + add.s32 %r14327, %r7585, %r14326; + st.shared.v4.b32 [%r14327], {%r14290, %r14292, %r14294, %r14296}; + st.shared.v4.b32 [%r14327+512], {%r14291, %r14293, %r14295, %r14297}; + xor.b32 %r14328, %r14323, 64; + add.s32 %r14329, %r7585, %r14328; + st.shared.v4.b32 [%r14329], {%r14298, %r14300, %r14302, %r14304}; + st.shared.v4.b32 [%r14329+512], {%r14299, %r14301, %r14303, %r14305}; + xor.b32 %r14330, %r14323, 96; + add.s32 %r14331, %r7585, %r14330; + st.shared.v4.b32 [%r14331], {%r14306, %r14308, %r14310, %r14312}; + st.shared.v4.b32 [%r14331+512], {%r14307, %r14309, %r14311, %r14313}; + bar.sync 0; + shl.b32 %r14332, %r14317, 10; + shl.b32 %r14333, %r691, 5; + and.b32 %r14334, %r14319, 1008; + or.b32 %r14335, %r14332, %r14333; + xor.b32 %r14336, %r14335, %r14334; + add.s32 %r14142, %r7585, %r14336; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14178, %r14179, %r14180, %r14181}, [%r14142]; + // end inline asm + add.s32 %r14147, %r14142, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14182, %r14183, %r14184, %r14185}, [%r14147]; + // end inline asm + add.s32 %r14152, %r14142, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14186, %r14187, %r14188, %r14189}, [%r14152]; + // end inline asm + add.s32 %r14157, %r14142, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14190, %r14191, %r14192, %r14193}, [%r14157]; + // end inline asm + add.s32 %r14162, %r14142, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14194, %r14195, %r14196, %r14197}, [%r14162]; + // end inline asm + add.s32 %r14167, %r14142, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14198, %r14199, %r14200, %r14201}, [%r14167]; + // end inline asm + add.s32 %r14172, %r14142, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14202, %r14203, %r14204, %r14205}, [%r14172]; + // end inline asm + add.s32 %r14177, %r14142, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14206, %r14207, %r14208, %r14209}, [%r14177]; + // end inline asm + // begin inline asm + @%p1163 st.global.v4.b32 [ %rd1136 + 0 ], { %r14178, %r14179, %r14180, %r14181 }; + // end inline asm + // begin inline asm + @%p1164 st.global.v4.b32 [ %rd1137 + 0 ], { %r14182, %r14183, %r14184, %r14185 }; + // end inline asm + // begin inline asm + @%p1165 st.global.v4.b32 [ %rd1138 + 0 ], { %r14186, %r14187, %r14188, %r14189 }; + // end inline asm + // begin inline asm + @%p1166 st.global.v4.b32 [ %rd1139 + 0 ], { %r14190, %r14191, %r14192, %r14193 }; + // end inline asm + // begin inline asm + @%p1167 st.global.v4.b32 [ %rd1140 + 0 ], { %r14194, %r14195, %r14196, %r14197 }; + // end inline asm + // begin inline asm + @%p1168 st.global.v4.b32 [ %rd1141 + 0 ], { %r14198, %r14199, %r14200, %r14201 }; + // end inline asm + // begin inline asm + @%p1169 st.global.v4.b32 [ %rd1142 + 0 ], { %r14202, %r14203, %r14204, %r14205 }; + // end inline asm + // begin inline asm + @%p1170 st.global.v4.b32 [ %rd1143 + 0 ], { %r14206, %r14207, %r14208, %r14209 }; + // end inline asm + .loc 1 334 14 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:334:14 + mul.f32 %r14337, %r14988, 0f3DB504F3; + mul.f32 %r14338, %r14989, 0f3DB504F3; + mul.f32 %r14339, %r14990, 0f3DB504F3; + mul.f32 %r14340, %r14991, 0f3DB504F3; + mul.f32 %r14341, %r14992, 0f3DB504F3; + mul.f32 %r14342, %r14993, 0f3DB504F3; + mul.f32 %r14343, %r14994, 0f3DB504F3; + mul.f32 %r14344, %r14995, 0f3DB504F3; + mul.f32 %r14345, %r14996, 0f3DB504F3; + mul.f32 %r14346, %r14997, 0f3DB504F3; + mul.f32 %r14347, %r14998, 0f3DB504F3; + mul.f32 %r14348, %r14999, 0f3DB504F3; + mul.f32 %r14349, %r15000, 0f3DB504F3; + mul.f32 %r14350, %r15001, 0f3DB504F3; + mul.f32 %r14351, %r15002, 0f3DB504F3; + mul.f32 %r14352, %r15003, 0f3DB504F3; + mul.f32 %r14353, %r15004, 0f3DB504F3; + mul.f32 %r14354, %r15005, 0f3DB504F3; + mul.f32 %r14355, %r15006, 0f3DB504F3; + mul.f32 %r14356, %r15007, 0f3DB504F3; + mul.f32 %r14357, %r15008, 0f3DB504F3; + mul.f32 %r14358, %r15009, 0f3DB504F3; + mul.f32 %r14359, %r15010, 0f3DB504F3; + mul.f32 %r14360, %r15011, 0f3DB504F3; + mul.f32 %r14361, %r15012, 0f3DB504F3; + mul.f32 %r14362, %r15013, 0f3DB504F3; + mul.f32 %r14363, %r15014, 0f3DB504F3; + mul.f32 %r14364, %r15015, 0f3DB504F3; + mul.f32 %r14365, %r15016, 0f3DB504F3; + mul.f32 %r14366, %r15017, 0f3DB504F3; + mul.f32 %r14367, %r15018, 0f3DB504F3; + mul.f32 %r14368, %r15019, 0f3DB504F3; + mul.f32 %r14369, %r15020, 0f3DB504F3; + mul.f32 %r14370, %r15021, 0f3DB504F3; + mul.f32 %r14371, %r15022, 0f3DB504F3; + mul.f32 %r14372, %r15023, 0f3DB504F3; + mul.f32 %r14373, %r15024, 0f3DB504F3; + mul.f32 %r14374, %r15025, 0f3DB504F3; + mul.f32 %r14375, %r15026, 0f3DB504F3; + mul.f32 %r14376, %r15027, 0f3DB504F3; + mul.f32 %r14377, %r15028, 0f3DB504F3; + mul.f32 %r14378, %r15029, 0f3DB504F3; + mul.f32 %r14379, %r15030, 0f3DB504F3; + mul.f32 %r14380, %r15031, 0f3DB504F3; + mul.f32 %r14381, %r15032, 0f3DB504F3; + mul.f32 %r14382, %r15033, 0f3DB504F3; + mul.f32 %r14383, %r15034, 0f3DB504F3; + mul.f32 %r14384, %r15035, 0f3DB504F3; + mul.f32 %r14385, %r15036, 0f3DB504F3; + mul.f32 %r14386, %r15037, 0f3DB504F3; + mul.f32 %r14387, %r15038, 0f3DB504F3; + mul.f32 %r14388, %r15039, 0f3DB504F3; + mul.f32 %r14389, %r15040, 0f3DB504F3; + mul.f32 %r14390, %r15041, 0f3DB504F3; + mul.f32 %r14391, %r15042, 0f3DB504F3; + mul.f32 %r14392, %r15043, 0f3DB504F3; + mul.f32 %r14393, %r15044, 0f3DB504F3; + mul.f32 %r14394, %r15045, 0f3DB504F3; + mul.f32 %r14395, %r15046, 0f3DB504F3; + mul.f32 %r14396, %r15047, 0f3DB504F3; + mul.f32 %r14397, %r15048, 0f3DB504F3; + mul.f32 %r14398, %r15049, 0f3DB504F3; + mul.f32 %r14399, %r15050, 0f3DB504F3; + mul.f32 %r14400, %r15051, 0f3DB504F3; + .loc 1 344 27 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:344:27 + or.b64 %rd1169, %rd71, %rd79; + cvt.u32.u64 %r14401, %rd1169; + or.b64 %rd1170, %rd72, %rd79; + cvt.u32.u64 %r14402, %rd1170; + or.b64 %rd1171, %rd73, %rd79; + cvt.u32.u64 %r14403, %rd1171; + or.b64 %rd1172, %rd74, %rd79; + cvt.u32.u64 %r14404, %rd1172; + or.b64 %rd1173, %rd75, %rd79; + cvt.u32.u64 %r14405, %rd1173; + or.b64 %rd1174, %rd76, %rd79; + cvt.u32.u64 %r14406, %rd1174; + or.b64 %rd1175, %rd77, %rd79; + cvt.u32.u64 %r14407, %rd1175; + or.b64 %rd1176, %rd78, %rd79; + cvt.u32.u64 %r14408, %rd1176; + .loc 1 344 59 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:344:59 + add.s32 %r14409, %r9, %r14401; + add.s32 %r14410, %r9, %r14402; + add.s32 %r14411, %r9, %r14403; + add.s32 %r14412, %r9, %r14404; + add.s32 %r14413, %r9, %r14405; + add.s32 %r14414, %r9, %r14406; + add.s32 %r14415, %r9, %r14407; + add.s32 %r14416, %r9, %r14408; + .loc 1 345 29 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:345:29 + mad.wide.s32 %rd1144, %r14409, 2, %rd208; + mad.wide.s32 %rd1145, %r14410, 2, %rd208; + mad.wide.s32 %rd1146, %r14411, 2, %rd208; + mad.wide.s32 %rd1147, %r14412, 2, %rd208; + mad.wide.s32 %rd1148, %r14413, 2, %rd208; + mad.wide.s32 %rd1149, %r14414, 2, %rd208; + mad.wide.s32 %rd1150, %r14415, 2, %rd208; + mad.wide.s32 %rd1151, %r14416, 2, %rd208; + .loc 1 345 69 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:345:69 + cvt.rn.bf16x2.f32 %r14417, %r14338, %r14337; + cvt.rn.bf16x2.f32 %r14418, %r14340, %r14339; + cvt.rn.bf16x2.f32 %r14419, %r14342, %r14341; + cvt.rn.bf16x2.f32 %r14420, %r14344, %r14343; + cvt.rn.bf16x2.f32 %r14421, %r14346, %r14345; + cvt.rn.bf16x2.f32 %r14422, %r14348, %r14347; + cvt.rn.bf16x2.f32 %r14423, %r14350, %r14349; + cvt.rn.bf16x2.f32 %r14424, %r14352, %r14351; + cvt.rn.bf16x2.f32 %r14425, %r14354, %r14353; + cvt.rn.bf16x2.f32 %r14426, %r14356, %r14355; + cvt.rn.bf16x2.f32 %r14427, %r14358, %r14357; + cvt.rn.bf16x2.f32 %r14428, %r14360, %r14359; + cvt.rn.bf16x2.f32 %r14429, %r14362, %r14361; + cvt.rn.bf16x2.f32 %r14430, %r14364, %r14363; + cvt.rn.bf16x2.f32 %r14431, %r14366, %r14365; + cvt.rn.bf16x2.f32 %r14432, %r14368, %r14367; + cvt.rn.bf16x2.f32 %r14433, %r14370, %r14369; + cvt.rn.bf16x2.f32 %r14434, %r14372, %r14371; + cvt.rn.bf16x2.f32 %r14435, %r14374, %r14373; + cvt.rn.bf16x2.f32 %r14436, %r14376, %r14375; + cvt.rn.bf16x2.f32 %r14437, %r14378, %r14377; + cvt.rn.bf16x2.f32 %r14438, %r14380, %r14379; + cvt.rn.bf16x2.f32 %r14439, %r14382, %r14381; + cvt.rn.bf16x2.f32 %r14440, %r14384, %r14383; + cvt.rn.bf16x2.f32 %r14441, %r14386, %r14385; + cvt.rn.bf16x2.f32 %r14442, %r14388, %r14387; + cvt.rn.bf16x2.f32 %r14443, %r14390, %r14389; + cvt.rn.bf16x2.f32 %r14444, %r14392, %r14391; + cvt.rn.bf16x2.f32 %r14445, %r14394, %r14393; + cvt.rn.bf16x2.f32 %r14446, %r14396, %r14395; + cvt.rn.bf16x2.f32 %r14447, %r14398, %r14397; + cvt.rn.bf16x2.f32 %r14448, %r14400, %r14399; + bar.sync 0; + st.shared.v4.b32 [%r14325], {%r14417, %r14419, %r14421, %r14423}; + st.shared.v4.b32 [%r14325+512], {%r14418, %r14420, %r14422, %r14424}; + st.shared.v4.b32 [%r14327], {%r14425, %r14427, %r14429, %r14431}; + st.shared.v4.b32 [%r14327+512], {%r14426, %r14428, %r14430, %r14432}; + st.shared.v4.b32 [%r14329], {%r14433, %r14435, %r14437, %r14439}; + st.shared.v4.b32 [%r14329+512], {%r14434, %r14436, %r14438, %r14440}; + st.shared.v4.b32 [%r14331], {%r14441, %r14443, %r14445, %r14447}; + st.shared.v4.b32 [%r14331+512], {%r14442, %r14444, %r14446, %r14448}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14250, %r14251, %r14252, %r14253}, [%r14142]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14254, %r14255, %r14256, %r14257}, [%r14147]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14258, %r14259, %r14260, %r14261}, [%r14152]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14262, %r14263, %r14264, %r14265}, [%r14157]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14266, %r14267, %r14268, %r14269}, [%r14162]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14270, %r14271, %r14272, %r14273}, [%r14167]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14274, %r14275, %r14276, %r14277}, [%r14172]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r14278, %r14279, %r14280, %r14281}, [%r14177]; + // end inline asm + // begin inline asm + @%p1163 st.global.v4.b32 [ %rd1144 + 0 ], { %r14250, %r14251, %r14252, %r14253 }; + // end inline asm + // begin inline asm + @%p1164 st.global.v4.b32 [ %rd1145 + 0 ], { %r14254, %r14255, %r14256, %r14257 }; + // end inline asm + // begin inline asm + @%p1165 st.global.v4.b32 [ %rd1146 + 0 ], { %r14258, %r14259, %r14260, %r14261 }; + // end inline asm + // begin inline asm + @%p1166 st.global.v4.b32 [ %rd1147 + 0 ], { %r14262, %r14263, %r14264, %r14265 }; + // end inline asm + // begin inline asm + @%p1167 st.global.v4.b32 [ %rd1148 + 0 ], { %r14266, %r14267, %r14268, %r14269 }; + // end inline asm + // begin inline asm + @%p1168 st.global.v4.b32 [ %rd1149 + 0 ], { %r14270, %r14271, %r14272, %r14273 }; + // end inline asm + // begin inline asm + @%p1169 st.global.v4.b32 [ %rd1150 + 0 ], { %r14274, %r14275, %r14276, %r14277 }; + // end inline asm + // begin inline asm + @%p1170 st.global.v4.b32 [ %rd1151 + 0 ], { %r14278, %r14279, %r14280, %r14281 }; + // end inline asm +$L__BB0_17: + .loc 1 139 4 // ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py:139:4 + ret; +$L__tmp36: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 6 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 452 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1bd DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 116 +.b8 51 +.b8 102 +.b8 113 +.b8 53 +.b8 120 +.b8 116 +.b8 104 +.b8 122 +.b8 100 +.b8 101 +.b8 99 +.b8 51 +.b8 120 +.b8 50 +.b8 122 +.b8 121 +.b8 50 +.b8 119 +.b8 108 +.b8 109 +.b8 97 +.b8 50 +.b8 54 +.b8 109 +.b8 97 +.b8 97 +.b8 54 +.b8 114 +.b8 117 +.b8 102 +.b8 50 +.b8 118 +.b8 50 +.b8 116 +.b8 122 +.b8 115 +.b8 119 +.b8 104 +.b8 111 +.b8 111 +.b8 115 +.b8 51 +.b8 52 +.b8 50 +.b8 106 +.b8 115 +.b8 110 +.b8 55 +.b8 101 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 116 +.b8 51 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x121 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 112 // DW_AT_call_line +.b8 36 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xd3:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xec:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 1 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x105:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp6 // DW_AT_low_pc +.b64 $L__tmp18 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 1 +.b8 16 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x11e:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp19 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 62 // DW_AT_call_line +.b8 1 +.b8 20 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x137:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp20 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 113 // DW_AT_call_line +.b8 34 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14f:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp22 // DW_AT_low_pc +.b64 $L__tmp23 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 178 // DW_AT_call_line +.b8 107 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x167:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp23 // DW_AT_low_pc +.b64 $L__tmp24 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 179 // DW_AT_call_line +.b8 111 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x17f:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp25 // DW_AT_low_pc +.b64 $L__tmp31 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 207 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x197:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp29 // DW_AT_low_pc +.b64 $L__tmp30 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 4 // Abbrev [4] 0x1ae:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp32 // DW_AT_low_pc +.b64 $L__tmp35 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 226 // DW_AT_call_line +.b8 16 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..0afac81810657ff77ec4382c142ac325e7e31c01 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.source @@ -0,0 +1,2294 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":18:0) +#loc227 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":812:0) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":348:0) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":423:0) +#loc353 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":797:0) +#loc357 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":781:0) +#loc378 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":559:0) +#loc408 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":634:0) +#loc486 = loc("arg_Q"(#loc)) +#loc487 = loc("arg_K"(#loc)) +#loc488 = loc("arg_V"(#loc)) +#loc489 = loc("arg_LSE"(#loc)) +#loc490 = loc("arg_DELTA"(#loc)) +#loc491 = loc("arg_DO"(#loc)) +#loc492 = loc("arg_DQ"(#loc)) +#loc493 = loc("arg_DV"(#loc)) +#loc494 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc495 = loc("arg_KV_IDX"(#loc)) +#loc496 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc497 = loc("arg_Q_IDX"(#loc)) +#loc498 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc499 = loc("arg_FULL_KV_IDX"(#loc)) +#loc500 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc501 = loc("arg_FULL_Q_IDX"(#loc)) +#loc502 = loc("in_ptr16"(#loc)) +#loc503 = loc("out_ptr0"(#loc)) +#loc504 = loc("ks0"(#loc)) +#loc505 = loc("ks1"(#loc)) +#loc506 = loc("ks2"(#loc)) +#loc507 = loc("ks3"(#loc)) +#loc508 = loc("ks4"(#loc)) +#loc509 = loc("ks5"(#loc)) +#loc510 = loc("ks6"(#loc)) +#loc511 = loc("ks7"(#loc)) +#loc512 = loc("ks8"(#loc)) +#loc695 = loc("x"(#loc227)) +#loc696 = loc("ptr"(#loc237)) +#loc697 = loc("offs_m"(#loc237)) +#loc698 = loc("offs_n"(#loc237)) +#loc699 = loc("stride_m"(#loc237)) +#loc700 = loc("stride_n"(#loc237)) +#loc701 = loc("M_LEN"(#loc237)) +#loc708 = loc("arg_Q"(#loc249)) +#loc709 = loc("arg_K"(#loc249)) +#loc710 = loc("arg_V"(#loc249)) +#loc711 = loc("arg_LSE"(#loc249)) +#loc712 = loc("arg_DELTA"(#loc249)) +#loc713 = loc("arg_DO"(#loc249)) +#loc714 = loc("arg_DQ"(#loc249)) +#loc715 = loc("arg_DV"(#loc249)) +#loc716 = loc("arg_KV_NUM_BLKS"(#loc249)) +#loc717 = loc("arg_KV_IDX"(#loc249)) +#loc718 = loc("arg_Q_NUM_BLKS"(#loc249)) +#loc719 = loc("arg_Q_IDX"(#loc249)) +#loc720 = loc("arg_FULL_KV_NUM_BLKS"(#loc249)) +#loc721 = loc("arg_FULL_KV_IDX"(#loc249)) +#loc722 = loc("arg_FULL_Q_NUM_BLKS"(#loc249)) +#loc723 = loc("arg_FULL_Q_IDX"(#loc249)) +#loc724 = loc("in_ptr16"(#loc249)) +#loc725 = loc("out_ptr0"(#loc249)) +#loc726 = loc("ks0"(#loc249)) +#loc727 = loc("ks1"(#loc249)) +#loc728 = loc("ks2"(#loc249)) +#loc729 = loc("ks3"(#loc249)) +#loc730 = loc("ks4"(#loc249)) +#loc731 = loc("ks5"(#loc249)) +#loc732 = loc("ks6"(#loc249)) +#loc733 = loc("ks7"(#loc249)) +#loc734 = loc("ks8"(#loc249)) +#loc735 = loc("K"(#loc249)) +#loc736 = loc("V"(#loc249)) +#loc737 = loc("dq"(#loc249)) +#loc738 = loc("q"(#loc249)) +#loc739 = loc("do"(#loc249)) +#loc740 = loc("Di"(#loc249)) +#loc741 = loc("lse"(#loc249)) +#loc742 = loc("off_z"(#loc249)) +#loc743 = loc("off_hq"(#loc249)) +#loc744 = loc("offs_m2"(#loc249)) +#loc745 = loc("offs_n2"(#loc249)) +#loc746 = loc("stride_kn"(#loc249)) +#loc747 = loc("stride_kd"(#loc249)) +#loc748 = loc("stride_vn"(#loc249)) +#loc749 = loc("stride_vd"(#loc249)) +#loc750 = loc("kv_indices"(#loc249)) +#loc751 = loc("sparse_kv_num_blocks"(#loc249)) +#loc778 = loc("arg_Q"(#loc279)) +#loc779 = loc("arg_K"(#loc279)) +#loc780 = loc("arg_V"(#loc279)) +#loc781 = loc("arg_LSE"(#loc279)) +#loc782 = loc("arg_DELTA"(#loc279)) +#loc783 = loc("arg_DO"(#loc279)) +#loc784 = loc("arg_DQ"(#loc279)) +#loc785 = loc("arg_DV"(#loc279)) +#loc786 = loc("arg_KV_NUM_BLKS"(#loc279)) +#loc787 = loc("arg_KV_IDX"(#loc279)) +#loc788 = loc("arg_Q_NUM_BLKS"(#loc279)) +#loc789 = loc("arg_Q_IDX"(#loc279)) +#loc790 = loc("arg_FULL_KV_NUM_BLKS"(#loc279)) +#loc791 = loc("arg_FULL_KV_IDX"(#loc279)) +#loc792 = loc("arg_FULL_Q_NUM_BLKS"(#loc279)) +#loc793 = loc("arg_FULL_Q_IDX"(#loc279)) +#loc794 = loc("in_ptr16"(#loc279)) +#loc795 = loc("out_ptr0"(#loc279)) +#loc796 = loc("ks0"(#loc279)) +#loc797 = loc("ks1"(#loc279)) +#loc798 = loc("ks2"(#loc279)) +#loc799 = loc("ks3"(#loc279)) +#loc800 = loc("ks4"(#loc279)) +#loc801 = loc("ks5"(#loc279)) +#loc802 = loc("ks6"(#loc279)) +#loc803 = loc("ks7"(#loc279)) +#loc804 = loc("ks8"(#loc279)) +#loc805 = loc("dq"(#loc279)) +#loc806 = loc("q"(#loc279)) +#loc807 = loc("kT_ptrs"(#loc279)) +#loc808 = loc("vT_ptrs"(#loc279)) +#loc809 = loc("do"(#loc279)) +#loc810 = loc("Di"(#loc279)) +#loc811 = loc("lse"(#loc279)) +#loc812 = loc("Q_LEN"(#loc279)) +#loc813 = loc("KV_LEN"(#loc279)) +#loc814 = loc("off_z"(#loc279)) +#loc815 = loc("off_hq"(#loc279)) +#loc816 = loc("offs_m2"(#loc279)) +#loc817 = loc("offs_n2"(#loc279)) +#loc818 = loc("offs_k"(#loc279)) +#loc819 = loc("offs_v"(#loc279)) +#loc820 = loc("stride_kn"(#loc279)) +#loc821 = loc("stride_kd"(#loc279)) +#loc822 = loc("stride_vn"(#loc279)) +#loc823 = loc("stride_vd"(#loc279)) +#loc824 = loc("kv_indices"(#loc279)) +#loc825 = loc("sparse_kv_num_blocks"(#loc279)) +#loc893 = loc("N_LEN"(#loc237)) +#loc894 = loc("indices"(#loc353)) +#loc895 = loc("max_len"(#loc353)) +#loc896 = loc("loop_iter"(#loc357)) +#loc897 = loc("col_indices"(#loc357)) +#loc898 = loc("total_blocks"(#loc357)) +#loc917 = loc("arg_Q"(#loc378)) +#loc918 = loc("arg_K"(#loc378)) +#loc919 = loc("arg_V"(#loc378)) +#loc920 = loc("arg_LSE"(#loc378)) +#loc921 = loc("arg_DELTA"(#loc378)) +#loc922 = loc("arg_DO"(#loc378)) +#loc923 = loc("arg_DQ"(#loc378)) +#loc924 = loc("arg_DV"(#loc378)) +#loc925 = loc("arg_KV_NUM_BLKS"(#loc378)) +#loc926 = loc("arg_KV_IDX"(#loc378)) +#loc927 = loc("arg_Q_NUM_BLKS"(#loc378)) +#loc928 = loc("arg_Q_IDX"(#loc378)) +#loc929 = loc("arg_FULL_KV_NUM_BLKS"(#loc378)) +#loc930 = loc("arg_FULL_KV_IDX"(#loc378)) +#loc931 = loc("arg_FULL_Q_NUM_BLKS"(#loc378)) +#loc932 = loc("arg_FULL_Q_IDX"(#loc378)) +#loc933 = loc("in_ptr16"(#loc378)) +#loc934 = loc("out_ptr0"(#loc378)) +#loc935 = loc("ks0"(#loc378)) +#loc936 = loc("ks1"(#loc378)) +#loc937 = loc("ks2"(#loc378)) +#loc938 = loc("ks3"(#loc378)) +#loc939 = loc("ks4"(#loc378)) +#loc940 = loc("ks5"(#loc378)) +#loc941 = loc("ks6"(#loc378)) +#loc942 = loc("ks7"(#loc378)) +#loc943 = loc("ks8"(#loc378)) +#loc944 = loc("Q"(#loc378)) +#loc945 = loc("DO"(#loc378)) +#loc946 = loc("DELTA"(#loc378)) +#loc947 = loc("LSE"(#loc378)) +#loc948 = loc("dk"(#loc378)) +#loc949 = loc("dv"(#loc378)) +#loc950 = loc("k"(#loc378)) +#loc951 = loc("v"(#loc378)) +#loc952 = loc("off_z"(#loc378)) +#loc953 = loc("off_hq"(#loc378)) +#loc954 = loc("offs_n1"(#loc378)) +#loc955 = loc("offs_m1"(#loc378)) +#loc956 = loc("stride_qm"(#loc378)) +#loc957 = loc("stride_qd"(#loc378)) +#loc958 = loc("stride_dom"(#loc378)) +#loc959 = loc("stride_dod"(#loc378)) +#loc960 = loc("q_indices"(#loc378)) +#loc961 = loc("sparse_q_num_blocks"(#loc378)) +#loc987 = loc("arg_Q"(#loc408)) +#loc988 = loc("arg_K"(#loc408)) +#loc989 = loc("arg_V"(#loc408)) +#loc990 = loc("arg_LSE"(#loc408)) +#loc991 = loc("arg_DELTA"(#loc408)) +#loc992 = loc("arg_DO"(#loc408)) +#loc993 = loc("arg_DQ"(#loc408)) +#loc994 = loc("arg_DV"(#loc408)) +#loc995 = loc("arg_KV_NUM_BLKS"(#loc408)) +#loc996 = loc("arg_KV_IDX"(#loc408)) +#loc997 = loc("arg_Q_NUM_BLKS"(#loc408)) +#loc998 = loc("arg_Q_IDX"(#loc408)) +#loc999 = loc("arg_FULL_KV_NUM_BLKS"(#loc408)) +#loc1000 = loc("arg_FULL_KV_IDX"(#loc408)) +#loc1001 = loc("arg_FULL_Q_NUM_BLKS"(#loc408)) +#loc1002 = loc("arg_FULL_Q_IDX"(#loc408)) +#loc1003 = loc("in_ptr16"(#loc408)) +#loc1004 = loc("out_ptr0"(#loc408)) +#loc1005 = loc("ks0"(#loc408)) +#loc1006 = loc("ks1"(#loc408)) +#loc1007 = loc("ks2"(#loc408)) +#loc1008 = loc("ks3"(#loc408)) +#loc1009 = loc("ks4"(#loc408)) +#loc1010 = loc("ks5"(#loc408)) +#loc1011 = loc("ks6"(#loc408)) +#loc1012 = loc("ks7"(#loc408)) +#loc1013 = loc("ks8"(#loc408)) +#loc1014 = loc("dk"(#loc408)) +#loc1015 = loc("dv"(#loc408)) +#loc1016 = loc("qT_ptrs"(#loc408)) +#loc1017 = loc("k"(#loc408)) +#loc1018 = loc("v"(#loc408)) +#loc1019 = loc("do_ptrs"(#loc408)) +#loc1020 = loc("DELTA"(#loc408)) +#loc1021 = loc("LSE"(#loc408)) +#loc1022 = loc("Q_LEN"(#loc408)) +#loc1023 = loc("KV_LEN"(#loc408)) +#loc1024 = loc("off_z"(#loc408)) +#loc1025 = loc("off_hq"(#loc408)) +#loc1026 = loc("offs_n1"(#loc408)) +#loc1027 = loc("offs_m1"(#loc408)) +#loc1028 = loc("offs_k"(#loc408)) +#loc1029 = loc("offs_v"(#loc408)) +#loc1030 = loc("stride_qm"(#loc408)) +#loc1031 = loc("stride_qd"(#loc408)) +#loc1032 = loc("stride_dom"(#loc408)) +#loc1033 = loc("stride_dod"(#loc408)) +#loc1034 = loc("q_indices"(#loc408)) +#loc1035 = loc("sparse_q_num_blocks"(#loc408)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c4096_i32_0 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %c4096_i32_0, %ks0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c4096_i32_1 = arith.constant 4096 : i32 loc(#loc2) + %c1_i32 = arith.constant 1 : i32 loc(#loc2) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %c1024_i32_2 = arith.constant 1024 : i32 loc(#loc3) + %1 = arith.muli %c1024_i32_2, %ks1 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc4) + %2 = arith.muli %c128_i32_4, %ks1 : i32 loc(#loc4) + %c128_i32_5 = arith.constant 128 : i32 loc(#loc5) + %c1_i32_6 = arith.constant 1 : i32 loc(#loc5) + %c1024_i32_7 = arith.constant 1024 : i32 loc(#loc6) + %c1024_i32_8 = arith.constant 1024 : i32 loc(#loc6) + %3 = arith.muli %c1024_i32_8, %ks1 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c128_i32_10 = arith.constant 128 : i32 loc(#loc7) + %4 = arith.muli %c128_i32_10, %ks1 : i32 loc(#loc7) + %c128_i32_11 = arith.constant 128 : i32 loc(#loc8) + %c1_i32_12 = arith.constant 1 : i32 loc(#loc8) + %c1_i32_13 = arith.constant 1 : i32 loc(#loc9) + %5 = arith.cmpi sge, %c1_i32_13, %ks0 : i32 loc(#loc9) + %c1_i32_14 = arith.constant 1 : i32 loc(#loc10) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc10) + %6 = arith.extui %5 : i1 to i32 loc(#loc10) + %7 = arith.muli %c1_i32_15, %6 : i32 loc(#loc10) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc11) + %8 = arith.cmpi sgt, %ks0, %c1_i32_16 : i32 loc(#loc11) + %9 = arith.extui %8 : i1 to i32 loc(#loc12) + %10 = arith.muli %ks0, %9 : i32 loc(#loc12) + %11 = arith.addi %7, %10 : i32 loc(#loc13) + %c4096_i32_17 = arith.constant 4096 : i32 loc(#loc14) + %c4096_i32_18 = arith.constant 4096 : i32 loc(#loc14) + %12 = arith.muli %c4096_i32_18, %11 : i32 loc(#loc14) + %c1_i32_19 = arith.constant 1 : i32 loc(#loc15) + %13 = arith.cmpi sge, %c1_i32_19, %ks0 : i32 loc(#loc15) + %c1_i32_20 = arith.constant 1 : i32 loc(#loc16) + %c1_i32_21 = arith.constant 1 : i32 loc(#loc16) + %14 = arith.extui %13 : i1 to i32 loc(#loc16) + %15 = arith.muli %c1_i32_21, %14 : i32 loc(#loc16) + %c1_i32_22 = arith.constant 1 : i32 loc(#loc17) + %16 = arith.cmpi sgt, %ks0, %c1_i32_22 : i32 loc(#loc17) + %17 = arith.extui %16 : i1 to i32 loc(#loc18) + %18 = arith.muli %ks0, %17 : i32 loc(#loc18) + %19 = arith.addi %15, %18 : i32 loc(#loc19) + %c128_i32_23 = arith.constant 128 : i32 loc(#loc20) + %c128_i32_24 = arith.constant 128 : i32 loc(#loc20) + %20 = arith.muli %c128_i32_24, %19 : i32 loc(#loc20) + %c128_i32_25 = arith.constant 128 : i32 loc(#loc21) + %c1_i32_26 = arith.constant 1 : i32 loc(#loc21) + %c4096_i32_27 = arith.constant 4096 : i32 loc(#loc22) + %c4096_i32_28 = arith.constant 4096 : i32 loc(#loc22) + %21 = arith.muli %c4096_i32_28, %ks0 : i32 loc(#loc22) + %c128_i32_29 = arith.constant 128 : i32 loc(#loc23) + %c4096_i32_30 = arith.constant 4096 : i32 loc(#loc23) + %c1_i32_31 = arith.constant 1 : i32 loc(#loc23) + %c1024_i32_32 = arith.constant 1024 : i32 loc(#loc24) + %c1024_i32_33 = arith.constant 1024 : i32 loc(#loc24) + %22 = arith.muli %c1024_i32_33, %ks1 : i32 loc(#loc24) + %c128_i32_34 = arith.constant 128 : i32 loc(#loc25) + %c128_i32_35 = arith.constant 128 : i32 loc(#loc25) + %23 = arith.muli %c128_i32_35, %ks1 : i32 loc(#loc25) + %c128_i32_36 = arith.constant 128 : i32 loc(#loc26) + %c1_i32_37 = arith.constant 1 : i32 loc(#loc26) + %ZQ = arith.constant 2 : i32 loc(#loc513) + %HQ = arith.constant 32 : i32 loc(#loc514) + %HKV = arith.constant 8 : i32 loc(#loc515) + %ZKV = arith.constant 2 : i32 loc(#loc516) + %pid = tt.get_program_id x : i32 loc(#loc517) + %NUM_KV_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%ks1) : (i32) -> i32 loc(#loc518) + %NUM_Q_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%ks0) : (i32) -> i32 loc(#loc519) + %off_zq = tt.get_program_id y : i32 loc(#loc520) + %off_hkv = tt.get_program_id z : i32 loc(#loc521) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc522) + %SPARSE_Z = arith.constant 2 : i32 loc(#loc523) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc524) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc525) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc526) + %k_adj_38 = arith.muli %1, %off_zkv : i32 loc(#loc527) + %k_adj_39 = arith.addi %k_adj, %k_adj_38 : i32 loc(#loc528) + %k_adj_40 = arith.extsi %k_adj_39 : i32 to i64 loc(#loc529) + %v_adj = arith.muli %4, %off_hkv : i32 loc(#loc530) + %v_adj_41 = arith.muli %3, %off_zkv : i32 loc(#loc531) + %v_adj_42 = arith.addi %v_adj, %v_adj_41 : i32 loc(#loc532) + %v_adj_43 = arith.extsi %v_adj_42 : i32 to i64 loc(#loc533) + %dv_adj = arith.muli %23, %off_hkv : i32 loc(#loc534) + %dv_adj_44 = arith.muli %22, %off_zq : i32 loc(#loc535) + %dv_adj_45 = arith.addi %dv_adj, %dv_adj_44 : i32 loc(#loc536) + %dv_adj_46 = arith.extsi %dv_adj_45 : i32 to i64 loc(#loc537) + %K = tt.addptr %arg_K, %k_adj_40 : !tt.ptr, i64 loc(#loc538) + %V = tt.addptr %arg_V, %v_adj_43 : !tt.ptr, i64 loc(#loc539) + %DV = tt.addptr %arg_DV, %dv_adj_46 : !tt.ptr, i64 loc(#loc540) + %RCP_LN2 = arith.constant 1.44269502 : f32 loc(#loc541) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc542) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc543) + %24 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS : i32 loc(#loc58) + %25:2 = scf.if %24 -> (i32, i32) { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS : i32 loc(#loc544) + %SPARSE_Q_MULTIPLE = arith.constant 1 : i32 loc(#loc1111) + %SPARSE_KV_MULTIPLE = arith.constant 2 : i32 loc(#loc1112) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc547) + %off_hq2_47 = arith.constant 4 : i32 loc(#loc548) + %off_hq2_48 = arith.constant 4 : i32 loc(#loc548) + %off_hq2_49 = arith.muli %off_hkv, %off_hq2_48 : i32 loc(#loc548) + %off_hq2_50 = arith.addi %off_hq2, %off_hq2_49 : i32 loc(#loc549) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc550) + %off_pid_mask = arith.divsi %start_m2_block, %SPARSE_Q_MULTIPLE : i32 loc(#loc551) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc552) + %sparse_idx_hq2 = arith.remsi %off_hq2_50, %SPARSE_HQ : i32 loc(#loc553) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc554) + %sparse_hz_offset_51 = arith.addi %sparse_hz_offset, %sparse_idx_hq2 : i32 loc(#loc555) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_51, %ks2 : i32 loc(#loc556) + %sparse_kv_num_blks_offset_52 = arith.addi %sparse_kv_num_blks_offset, %off_pid_mask : i32 loc(#loc557) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_51, %stride_kv_idx_h : i32 loc(#loc558) + %sparse_kv_idx_offset_53 = arith.muli %off_pid_mask, %ks4 : i32 loc(#loc559) + %sparse_kv_idx_offset_54 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_53 : i32 loc(#loc560) + %q_adj2 = arith.muli %c128_i32, %off_hq2_50 : i32 loc(#loc561) + %q_adj2_55 = arith.muli %0, %off_zq : i32 loc(#loc562) + %q_adj2_56 = arith.addi %q_adj2, %q_adj2_55 : i32 loc(#loc563) + %q_adj2_57 = arith.extsi %q_adj2_56 : i32 to i64 loc(#loc564) + %do_adj2 = arith.muli %20, %off_hq2_50 : i32 loc(#loc565) + %do_adj2_58 = arith.muli %12, %off_zq : i32 loc(#loc566) + %do_adj2_59 = arith.addi %do_adj2, %do_adj2_58 : i32 loc(#loc567) + %do_adj2_60 = arith.extsi %do_adj2_59 : i32 to i64 loc(#loc568) + %dq_adj2 = arith.muli %c128_i32_29, %off_hq2_50 : i32 loc(#loc569) + %dq_adj2_61 = arith.muli %21, %off_zq : i32 loc(#loc570) + %dq_adj2_62 = arith.addi %dq_adj2, %dq_adj2_61 : i32 loc(#loc571) + %dq_adj2_63 = arith.extsi %dq_adj2_62 : i32 to i64 loc(#loc572) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc573) + %off_chz2_64 = arith.addi %off_chz2, %off_hq2_50 : i32 loc(#loc574) + %off_chz2_65 = arith.muli %off_chz2_64, %ks0 : i32 loc(#loc575) + %off_chz2_66 = arith.extsi %off_chz2_65 : i32 to i64 loc(#loc576) + %Q2 = tt.addptr %arg_Q, %q_adj2_57 : !tt.ptr, i64 loc(#loc577) + %DO2 = tt.addptr %arg_DO, %do_adj2_60 : !tt.ptr, i64 loc(#loc578) + %DQ2 = tt.addptr %arg_DQ, %dq_adj2_63 : !tt.ptr, i64 loc(#loc579) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_66 : !tt.ptr, i64 loc(#loc580) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_66 : !tt.ptr, i64 loc(#loc581) + %dq = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc582) + %start_m2 = arith.constant 128 : i32 loc(#loc583) + %start_m2_67 = arith.constant 128 : i32 loc(#loc583) + %start_m2_68 = arith.muli %start_m2_block, %start_m2_67 : i32 loc(#loc583) + %offs_m2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc584) + %offs_m2_69 = tt.splat %start_m2_68 : i32 -> tensor<128xi32> loc(#loc585) + %offs_m2_70 = arith.addi %offs_m2_69, %offs_m2 : tensor<128xi32> loc(#loc585) + %q = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q2, %offs_m2_70, %offs_k, %c4096_i32_1, %c1_i32, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc586) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%DO2, %offs_m2_70, %offs_v, %c128_i32_25, %c1_i32_26, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc587) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc588) + %Di_71 = arith.cmpi slt, %offs_m2_70, %Di : tensor<128xi32> loc(#loc588) + %Di_72 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc589) + %Di_73 = tt.addptr %Di_72, %offs_m2_70 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc589) + %Di_74 = tt.load %Di_73, %Di_71 : tensor<128x!tt.ptr> loc(#loc590) + %lse = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc591) + %lse_75 = arith.cmpi slt, %offs_m2_70, %lse : tensor<128xi32> loc(#loc591) + %lse_76 = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc592) + %lse_77 = tt.addptr %lse_76, %offs_m2_70 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc592) + %lse_78 = tt.load %lse_77, %lse_75 : tensor<128x!tt.ptr> loc(#loc593) + %lse_79 = arith.constant 0xFF800000 : f32 loc(#loc594) + %lse_80 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc594) + %lse_81 = arith.cmpf oeq, %lse_78, %lse_80 : tensor<128xf32> loc(#loc594) + %lse_82 = arith.constant 0.000000e+00 : f32 loc(#loc595) + %lse_83 = arith.constant 0.000000e+00 : f32 loc(#loc595) + %lse_84 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc595) + %lse_85 = arith.select %lse_81, %lse_84, %lse_78 : tensor<128xi1>, tensor<128xf32> loc(#loc595) + %lse_86 = tt.expand_dims %lse_85 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc596) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_54 : !tt.ptr, i32 loc(#loc597) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc598) + %kv_start_87 = arith.constant 128 : i32 loc(#loc599) + %kv_start_88 = arith.constant 128 : i32 loc(#loc599) + %kv_start_89 = arith.muli %kv_start, %kv_start_88 : i32 loc(#loc599) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_52 : !tt.ptr, i32 loc(#loc600) + %sparse_kv_num_blocks_90 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc601) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc602) + %offs_n2_91 = tt.splat %kv_start_89 : i32 -> tensor<64xi32> loc(#loc603) + %offs_n2_92 = arith.addi %offs_n2_91, %offs_n2 : tensor<64xi32> loc(#loc603) + %dq_93 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %K, %V, %dq, %q, %do, %Di_74, %lse_86, %off_zq, %off_hq2_50, %offs_m2_70, %offs_n2_92, %c128_i32_5, %c1_i32_6, %c128_i32_11, %c1_i32_12, %kv_indices, %sparse_kv_num_blocks_90) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc604) + %kv_indices_94 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_54 : !tt.ptr, i32 loc(#loc605) + %kv_start_95 = tt.load %kv_indices_94 : !tt.ptr loc(#loc606) + %kv_start_96 = arith.constant 128 : i32 loc(#loc607) + %kv_start_97 = arith.constant 128 : i32 loc(#loc607) + %kv_start_98 = arith.muli %kv_start_95, %kv_start_97 : i32 loc(#loc607) + %sparse_kv_num_blocks_99 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_52 : !tt.ptr, i32 loc(#loc608) + %sparse_kv_num_blocks_100 = tt.load %sparse_kv_num_blocks_99 : !tt.ptr loc(#loc609) + %offs_n2_101 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc610) + %offs_n2_102 = tt.splat %kv_start_98 : i32 -> tensor<64xi32> loc(#loc611) + %offs_n2_103 = arith.addi %offs_n2_102, %offs_n2_101 : tensor<64xi32> loc(#loc611) + %dq_104 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %K, %V, %dq_93, %q, %do, %Di_74, %lse_86, %off_zq, %off_hq2_50, %offs_m2_70, %offs_n2_103, %c128_i32_5, %c1_i32_6, %c128_i32_11, %c1_i32_12, %kv_indices_94, %sparse_kv_num_blocks_100) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc612) + %dq_ptrs = tt.expand_dims %offs_m2_70 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc613) + %dq_ptrs_105 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc614) + %dq_ptrs_106 = arith.muli %dq_ptrs, %dq_ptrs_105 : tensor<128x1xi32> loc(#loc614) + %dq_ptrs_107 = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc615) + %dq_ptrs_108 = tt.addptr %dq_ptrs_107, %dq_ptrs_106 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc615) + %dq_ptrs_109 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc616) + %dq_ptrs_110 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc617) + %dq_ptrs_111 = arith.muli %dq_ptrs_109, %dq_ptrs_110 : tensor<1x128xi32> loc(#loc617) + %dq_ptrs_112 = tt.broadcast %dq_ptrs_108 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc618) + %dq_ptrs_113 = tt.broadcast %dq_ptrs_111 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc618) + %dq_ptrs_114 = tt.addptr %dq_ptrs_112, %dq_ptrs_113 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc618) + %dq_115 = arith.constant 0.0883883461 : f32 loc(#loc619) + %dq_116 = arith.constant 0.0883883461 : f32 loc(#loc619) + %dq_117 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc619) + %dq_118 = arith.mulf %dq_104, %dq_117 : tensor<128x128xf32> loc(#loc619) + %26 = tt.expand_dims %offs_m2_70 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc136) + %27 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc137) + %28 = arith.cmpi slt, %26, %27 : tensor<128x1xi32> loc(#loc137) + %29 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc138) + %c128_i32_119 = arith.constant 128 : i32 loc(#loc139) + %cst = arith.constant dense<128> : tensor<1x128xi32> loc(#loc139) + %30 = arith.cmpi slt, %29, %cst : tensor<1x128xi32> loc(#loc139) + %31 = tt.broadcast %28 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc140) + %32 = tt.broadcast %30 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc140) + %33 = arith.andi %31, %32 : tensor<128x128xi1> loc(#loc140) + %34 = arith.truncf %dq_118 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc141) + tt.store %dq_ptrs_114, %34, %33 : tensor<128x128x!tt.ptr> loc(#loc141) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc141) + } else { + %SPARSE_Q_MULTIPLE = arith.constant 2 : i32 loc(#loc1113) + %SPARSE_KV_MULTIPLE = arith.constant 1 : i32 loc(#loc1114) + %pid_mask = arith.divsi %pid, %SPARSE_KV_MULTIPLE : i32 loc(#loc622) + %stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc623) + %dv = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc624) + %dk = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc625) + %start_n1 = arith.constant 128 : i32 loc(#loc626) + %start_n1_47 = arith.constant 128 : i32 loc(#loc626) + %start_n1_48 = arith.muli %pid, %start_n1_47 : i32 loc(#loc626) + %offs_n1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc627) + %offs_n1_49 = tt.splat %start_n1_48 : i32 -> tensor<128xi32> loc(#loc628) + %offs_n1_50 = arith.addi %offs_n1_49, %offs_n1 : tensor<128xi32> loc(#loc628) + %k = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n1_50, %offs_k, %c128_i32_5, %c1_i32_6, %ks1) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc629) + %v = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n1_50, %offs_v, %c128_i32_11, %c1_i32_12, %ks1) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc630) + %c0_i32 = arith.constant 0 : i32 loc(#loc153) + %c4_i32 = arith.constant 4 : i32 loc(#loc153) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc153) + %26 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc153) + %27 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc153) + %28 = arith.bitcast %c1_i32_51 : i32 to i32 loc(#loc153) + %29 = ub.poison : i32 loc(#loc153) + %dk_52:2 = scf.for %off_g = %26 to %27 step %28 iter_args(%dv_87 = %dv, %dk_88 = %dk) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.constant 4 : i32 loc(#loc632) + %off_hq1_89 = arith.constant 4 : i32 loc(#loc632) + %off_hq1_90 = arith.muli %off_hkv, %off_hq1_89 : i32 loc(#loc632) + %off_hq1_91 = arith.addi %off_hq1_90, %off_g : i32 loc(#loc633) + %q_adj1 = arith.muli %c128_i32, %off_hq1_91 : i32 loc(#loc634) + %q_adj1_92 = arith.muli %0, %off_zq : i32 loc(#loc635) + %q_adj1_93 = arith.addi %q_adj1, %q_adj1_92 : i32 loc(#loc636) + %q_adj1_94 = arith.extsi %q_adj1_93 : i32 to i64 loc(#loc637) + %do_adj1 = arith.muli %20, %off_hq1_91 : i32 loc(#loc638) + %do_adj1_95 = arith.muli %12, %off_zq : i32 loc(#loc639) + %do_adj1_96 = arith.addi %do_adj1, %do_adj1_95 : i32 loc(#loc640) + %do_adj1_97 = arith.extsi %do_adj1_96 : i32 to i64 loc(#loc641) + %dq_adj1 = arith.muli %c128_i32_29, %off_hq1_91 : i32 loc(#loc642) + %dq_adj1_98 = arith.muli %21, %off_zq : i32 loc(#loc643) + %dq_adj1_99 = arith.addi %dq_adj1, %dq_adj1_98 : i32 loc(#loc644) + %dq_adj1_100 = arith.extsi %dq_adj1_99 : i32 to i64 loc(#loc645) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc646) + %off_chz1_101 = arith.addi %off_chz1, %off_hq1_91 : i32 loc(#loc647) + %off_chz1_102 = arith.muli %off_chz1_101, %ks0 : i32 loc(#loc648) + %off_chz1_103 = arith.extsi %off_chz1_102 : i32 to i64 loc(#loc649) + %Q1 = tt.addptr %arg_Q, %q_adj1_94 : !tt.ptr, i64 loc(#loc650) + %DO1 = tt.addptr %arg_DO, %do_adj1_97 : !tt.ptr, i64 loc(#loc651) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_103 : !tt.ptr, i64 loc(#loc652) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_103 : !tt.ptr, i64 loc(#loc653) + %sparse_idx_hq1 = arith.remsi %off_hq1_91, %SPARSE_HQ : i32 loc(#loc654) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc655) + %sparse_hz_offset_104 = arith.addi %sparse_hz_offset, %sparse_idx_hq1 : i32 loc(#loc656) + %sparse_q_num_blks_offset = arith.muli %sparse_hz_offset_104, %ks5 : i32 loc(#loc657) + %sparse_q_num_blks_offset_105 = arith.addi %sparse_q_num_blks_offset, %pid_mask : i32 loc(#loc658) + %sparse_q_idx_offset = arith.muli %sparse_hz_offset_104, %stride_q_idx_h : i32 loc(#loc659) + %sparse_q_idx_offset_106 = arith.muli %pid_mask, %ks6 : i32 loc(#loc660) + %sparse_q_idx_offset_107 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_106 : i32 loc(#loc661) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_107 : !tt.ptr, i32 loc(#loc662) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc663) + %q_start_108 = arith.constant 128 : i32 loc(#loc664) + %q_start_109 = arith.constant 128 : i32 loc(#loc664) + %q_start_110 = arith.muli %q_start, %q_start_109 : i32 loc(#loc664) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_105 : !tt.ptr, i32 loc(#loc665) + %sparse_q_num_blocks_111 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc666) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc667) + %offs_m1_112 = tt.splat %q_start_110 : i32 -> tensor<64xi32> loc(#loc668) + %offs_m1_113 = arith.addi %offs_m1_112, %offs_m1 : tensor<64xi32> loc(#loc668) + %41:2 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %Q1, %DO1, %DELTA1, %LSE1, %dk_88, %dv_87, %k, %v, %off_zq, %off_hq1_91, %offs_n1_50, %offs_m1_113, %c4096_i32_1, %c1_i32, %c128_i32_25, %c1_i32_26, %q_indices, %sparse_q_num_blocks_111) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc191) + %q_indices_114 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_107 : !tt.ptr, i32 loc(#loc669) + %q_start_115 = tt.load %q_indices_114 : !tt.ptr loc(#loc670) + %q_start_116 = arith.constant 128 : i32 loc(#loc671) + %q_start_117 = arith.constant 128 : i32 loc(#loc671) + %q_start_118 = arith.muli %q_start_115, %q_start_117 : i32 loc(#loc671) + %sparse_q_num_blocks_119 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_105 : !tt.ptr, i32 loc(#loc672) + %sparse_q_num_blocks_120 = tt.load %sparse_q_num_blocks_119 : !tt.ptr loc(#loc673) + %offs_m1_121 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc674) + %offs_m1_122 = tt.splat %q_start_118 : i32 -> tensor<64xi32> loc(#loc675) + %offs_m1_123 = arith.addi %offs_m1_122, %offs_m1_121 : tensor<64xi32> loc(#loc675) + %42:2 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %Q1, %DO1, %DELTA1, %LSE1, %41#0, %41#1, %k, %v, %off_zq, %off_hq1_91, %offs_n1_50, %offs_m1_123, %c4096_i32_1, %c1_i32, %c128_i32_25, %c1_i32_26, %q_indices_114, %sparse_q_num_blocks_120) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc199) + scf.yield %42#1, %42#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc200) + } loc(#loc1115) + %dv_ptrs = tt.expand_dims %offs_n1_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc676) + %dv_ptrs_53 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc677) + %dv_ptrs_54 = arith.muli %dv_ptrs, %dv_ptrs_53 : tensor<128x1xi32> loc(#loc677) + %dv_ptrs_55 = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc678) + %dv_ptrs_56 = tt.addptr %dv_ptrs_55, %dv_ptrs_54 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc678) + %dv_ptrs_57 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc679) + %dv_ptrs_58 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc680) + %dv_ptrs_59 = arith.muli %dv_ptrs_57, %dv_ptrs_58 : tensor<1x128xi32> loc(#loc680) + %dv_ptrs_60 = tt.broadcast %dv_ptrs_56 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc681) + %dv_ptrs_61 = tt.broadcast %dv_ptrs_59 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc681) + %dv_ptrs_62 = tt.addptr %dv_ptrs_60, %dv_ptrs_61 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc681) + %index_n = tt.expand_dims %offs_n1_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc682) + %index_k = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc683) + %index_v = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc684) + %30 = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc210) + %31 = arith.cmpi slt, %index_n, %30 : tensor<128x1xi32> loc(#loc210) + %c128_i32_63 = arith.constant 128 : i32 loc(#loc211) + %cst = arith.constant dense<128> : tensor<1x128xi32> loc(#loc211) + %32 = arith.cmpi slt, %index_v, %cst : tensor<1x128xi32> loc(#loc211) + %33 = tt.broadcast %31 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc212) + %34 = tt.broadcast %32 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc212) + %35 = arith.andi %33, %34 : tensor<128x128xi1> loc(#loc212) + %36 = arith.truncf %dk_52#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc213) + tt.store %dv_ptrs_62, %36, %35 : tensor<128x128x!tt.ptr> loc(#loc213) + %dk_64 = arith.constant 0.0883883461 : f32 loc(#loc685) + %dk_65 = arith.constant 0.0883883461 : f32 loc(#loc685) + %dk_66 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc685) + %dk_67 = arith.mulf %dk_52#1, %dk_66 : tensor<128x128xf32> loc(#loc685) + %mask = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc686) + %mask_68 = arith.cmpi slt, %index_n, %mask : tensor<128x1xi32> loc(#loc686) + %xindex = arith.constant 128 : i32 loc(#loc687) + %xindex_69 = arith.constant 128 : i32 loc(#loc687) + %xindex_70 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc687) + %xindex_71 = arith.muli %xindex_70, %index_n : tensor<128x1xi32> loc(#loc687) + %xindex_72 = tt.broadcast %index_k : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc688) + %xindex_73 = tt.broadcast %xindex_71 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc688) + %xindex_74 = arith.addi %xindex_72, %xindex_73 : tensor<128x128xi32> loc(#loc688) + %xindex_75 = arith.constant 128 : i32 loc(#loc689) + %xindex_76 = arith.constant 128 : i32 loc(#loc689) + %xindex_77 = arith.muli %xindex_76, %off_hkv : i32 loc(#loc689) + %xindex_78 = arith.muli %xindex_77, %ks1 : i32 loc(#loc690) + %xindex_79 = tt.splat %xindex_78 : i32 -> tensor<128x128xi32> loc(#loc691) + %xindex_80 = arith.addi %xindex_74, %xindex_79 : tensor<128x128xi32> loc(#loc691) + %xindex_81 = arith.constant 1024 : i32 loc(#loc692) + %xindex_82 = arith.constant 1024 : i32 loc(#loc692) + %xindex_83 = arith.muli %xindex_82, %off_zq : i32 loc(#loc692) + %xindex_84 = arith.muli %xindex_83, %ks1 : i32 loc(#loc693) + %xindex_85 = tt.splat %xindex_84 : i32 -> tensor<128x128xi32> loc(#loc694) + %xindex_86 = arith.addi %xindex_80, %xindex_85 : tensor<128x128xi32> loc(#loc694) + %37 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc224) + %38 = tt.addptr %37, %xindex_86 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc224) + %39 = tt.broadcast %mask_68 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc225) + %40 = arith.truncf %dk_67 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc225) + tt.store %38, %40, %39 : tensor<128x128x!tt.ptr> loc(#loc225) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc225) + } loc(#loc59) + tt.return loc(#loc226) + } loc(#loc) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%x: i32 loc("x"(#loc227))) -> i32 attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc228) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc228) + %0 = arith.addi %x, %c128_i32_0 : i32 loc(#loc228) + %c1_i32 = arith.constant 1 : i32 loc(#loc229) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc229) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc229) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc230) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc230) + %2 = arith.divsi %1, %c128_i32_3 : i32 loc(#loc230) + tt.return %2 : i32 loc(#loc231) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc232) + tt.return %3 : i32 loc(#loc232) + } loc(#loc227) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc234) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc234) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc235) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc236) + tt.return %0 : tensor<128x128xf32> loc(#loc236) + } loc(#loc233) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc237)), %offs_m: tensor<128xi32> loc("offs_m"(#loc237)), %offs_n: tensor<128xi32> loc("offs_n"(#loc237)), %stride_m: i32 loc("stride_m"(#loc237)), %stride_n: i32 loc("stride_n"(#loc237)), %M_LEN: i32 loc("M_LEN"(#loc237))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc703) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc703) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc704) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc704) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc705) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc706) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc706) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc707) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc707) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc707) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc244) + %1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc245) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc245) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc246) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc246) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc246) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc246) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc246) + tt.return %5 : tensor<128x128xbf16> loc(#loc247) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc248) + tt.return %6 : tensor<128x128xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc249)), %arg_K: !tt.ptr loc("arg_K"(#loc249)), %arg_V: !tt.ptr loc("arg_V"(#loc249)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc249)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc249)), %arg_DO: !tt.ptr loc("arg_DO"(#loc249)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc249)), %arg_DV: !tt.ptr loc("arg_DV"(#loc249)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc249)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc249)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc249)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc249)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc249)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc249)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc249)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc249)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc249)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc249)), %ks0: i32 loc("ks0"(#loc249)), %ks1: i32 loc("ks1"(#loc249)), %ks2: i32 loc("ks2"(#loc249)), %ks3: i32 loc("ks3"(#loc249)), %ks4: i32 loc("ks4"(#loc249)), %ks5: i32 loc("ks5"(#loc249)), %ks6: i32 loc("ks6"(#loc249)), %ks7: i32 loc("ks7"(#loc249)), %ks8: i32 loc("ks8"(#loc249)), %K: !tt.ptr loc("K"(#loc249)), %V: !tt.ptr loc("V"(#loc249)), %dq: tensor<128x128xf32> loc("dq"(#loc249)), %q: tensor<128x128xbf16> loc("q"(#loc249)), %do: tensor<128x128xbf16> loc("do"(#loc249)), %Di: tensor<128xf32> loc("Di"(#loc249)), %lse: tensor<128x1xf32> loc("lse"(#loc249)), %off_z: i32 loc("off_z"(#loc249)), %off_hq: i32 loc("off_hq"(#loc249)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc249)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc249)), %stride_kn: i32 loc("stride_kn"(#loc249)), %stride_kd: i32 loc("stride_kd"(#loc249)), %stride_vn: i32 loc("stride_vn"(#loc249)), %stride_vd: i32 loc("stride_vd"(#loc249)), %kv_indices: !tt.ptr loc("kv_indices"(#loc249)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc249))) -> tensor<128x128xf32> attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc752) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc753) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc754) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc755) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc755) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc756) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc756) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc757) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc758) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc758) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc759) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc759) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc760) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc761) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc761) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc762) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc762) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc763) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc764) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc764) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc765) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc765) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc765) + %hi = arith.constant 2 : i32 loc(#loc766) + %hi_20 = arith.constant 2 : i32 loc(#loc766) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc766) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc767) + %hi_23 = arith.constant 1 : i32 loc(#loc768) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc768) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc769) + %c0_i32 = arith.constant 0 : i32 loc(#loc268) + %c1_i32 = arith.constant 1 : i32 loc(#loc268) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc268) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc268) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc268) + %3 = ub.poison : i32 loc(#loc268) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %ks0, %ks1, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc771) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc772) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc773) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc774) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc774) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc775) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc776) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc776) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc777) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc777) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc276) + } loc(#loc1120) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc277) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc278) + tt.return %4 : tensor<128x128xf32> loc(#loc278) + } loc(#loc249) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc227))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc228) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc228) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc228) + %c1_i32 = arith.constant 1 : i32 loc(#loc229) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc229) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc229) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc230) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc230) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc230) + tt.return %2 : i32 loc(#loc231) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc232) + tt.return %3 : i32 loc(#loc232) + } loc(#loc227) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc279)), %arg_K: !tt.ptr loc("arg_K"(#loc279)), %arg_V: !tt.ptr loc("arg_V"(#loc279)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc279)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc279)), %arg_DO: !tt.ptr loc("arg_DO"(#loc279)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc279)), %arg_DV: !tt.ptr loc("arg_DV"(#loc279)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc279)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc279)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc279)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc279)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc279)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc279)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc279)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc279)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc279)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc279)), %ks0: i32 loc("ks0"(#loc279)), %ks1: i32 loc("ks1"(#loc279)), %ks2: i32 loc("ks2"(#loc279)), %ks3: i32 loc("ks3"(#loc279)), %ks4: i32 loc("ks4"(#loc279)), %ks5: i32 loc("ks5"(#loc279)), %ks6: i32 loc("ks6"(#loc279)), %ks7: i32 loc("ks7"(#loc279)), %ks8: i32 loc("ks8"(#loc279)), %dq: tensor<128x128xf32> loc("dq"(#loc279)), %q: tensor<128x128xbf16> loc("q"(#loc279)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc279)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc279)), %do: tensor<128x128xbf16> loc("do"(#loc279)), %Di: tensor<128xf32> loc("Di"(#loc279)), %lse: tensor<128x1xf32> loc("lse"(#loc279)), %Q_LEN: i32 loc("Q_LEN"(#loc279)), %KV_LEN: i32 loc("KV_LEN"(#loc279)), %off_z: i32 loc("off_z"(#loc279)), %off_hq: i32 loc("off_hq"(#loc279)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc279)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc279)), %offs_k: tensor<128xi32> loc("offs_k"(#loc279)), %offs_v: tensor<128xi32> loc("offs_v"(#loc279)), %stride_kn: i32 loc("stride_kn"(#loc279)), %stride_kd: i32 loc("stride_kd"(#loc279)), %stride_vn: i32 loc("stride_vn"(#loc279)), %stride_vd: i32 loc("stride_vd"(#loc279)), %kv_indices: !tt.ptr loc("kv_indices"(#loc279)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc279))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc826) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc827) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc827) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc827) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc828) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc828) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc829) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S1_64S_i32__(%n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc830) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc831) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc832) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc833) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc834) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc834) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc835) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc835) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc835) + %tmp1 = arith.constant false loc(#loc836) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc836) + %tmp4 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc837) + %tmp4_16 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc837) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc837) + %tmp5 = arith.extsi %n_6 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc838) + %tmp7 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc839) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc840) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc841) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc841) + %tmp9 = arith.extsi %m_7 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc842) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> loc(#loc843) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc843) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc844) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc844) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc844) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc845) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc846) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc846) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc846) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc847) + %tmp15_25 = arith.cmpi sge, %n_6, %tmp15 : tensor<1x64xi32> loc(#loc847) + %tmp16 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc848) + %tmp16_26 = arith.remsi %n_6, %tmp16 : tensor<1x64xi32> loc(#loc848) + %tmp17 = arith.constant 0 : i32 loc(#loc849) + %tmp17_27 = arith.constant dense<0> : tensor<1xi32> loc(#loc849) + %tmp18 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc850) + %tmp18_28 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc850) + %tmp18_29 = arith.cmpi ne, %tmp16_26, %tmp18_28 : tensor<1x64xi32> loc(#loc850) + %tmp19 = arith.constant 0 : i32 loc(#loc851) + %tmp19_30 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc851) + %tmp19_31 = arith.cmpi slt, %tmp16_26, %tmp19_30 : tensor<1x64xi32> loc(#loc851) + %tmp20 = arith.constant 0 : i32 loc(#loc852) + %tmp20_32 = arith.cmpi slt, %ks8, %tmp20 : i32 loc(#loc852) + %tmp21 = tt.splat %tmp20_32 : i1 -> tensor<1x64xi1> loc(#loc853) + %tmp21_33 = arith.cmpi ne, %tmp19_31, %tmp21 : tensor<1x64xi1> loc(#loc853) + %tmp22 = arith.andi %tmp18_29, %tmp21_33 : tensor<1x64xi1> loc(#loc854) + %tmp23 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc855) + %tmp23_34 = arith.addi %tmp16_26, %tmp23 : tensor<1x64xi32> loc(#loc855) + %tmp24 = arith.select %tmp22, %tmp23_34, %tmp16_26 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc856) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc857) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc858) + %tmp26_35 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc858) + %tmp27 = arith.andi %tmp15_25, %tmp26_35 : tensor<1x64xi1> loc(#loc859) + %tmp28 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc860) + %tmp28_36 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc860) + %tmp28_37 = arith.subi %tmp28, %tmp28_36 : tensor<128x64xi32> loc(#loc860) + %tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc861) + %tmp29_38 = arith.remsi %tmp28_37, %tmp29 : tensor<128x64xi32> loc(#loc861) + %tmp30 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc862) + %tmp30_39 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc862) + %tmp30_40 = arith.cmpi ne, %tmp29_38, %tmp30_39 : tensor<128x64xi32> loc(#loc862) + %tmp31 = arith.constant 0 : i32 loc(#loc863) + %tmp31_41 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc863) + %tmp31_42 = arith.cmpi slt, %tmp29_38, %tmp31_41 : tensor<128x64xi32> loc(#loc863) + %tmp32 = tt.splat %tmp20_32 : i1 -> tensor<128x64xi1> loc(#loc864) + %tmp32_43 = arith.cmpi ne, %tmp31_42, %tmp32 : tensor<128x64xi1> loc(#loc864) + %tmp33 = arith.andi %tmp30_40, %tmp32_43 : tensor<128x64xi1> loc(#loc865) + %tmp34 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc866) + %tmp34_44 = arith.addi %tmp29_38, %tmp34 : tensor<128x64xi32> loc(#loc866) + %tmp35 = arith.select %tmp33, %tmp34_44, %tmp29_38 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc867) + %tmp36 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc868) + %tmp36_45 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc868) + %tmp36_46 = arith.cmpi eq, %tmp35, %tmp36_45 : tensor<128x64xi32> loc(#loc868) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc869) + %tmp37_47 = arith.andi %tmp37, %tmp36_46 : tensor<128x64xi1> loc(#loc869) + %tmp38 = arith.ori %tmp13_24, %tmp37_47 : tensor<128x64xi1> loc(#loc870) + %post_mod_scores_48 = arith.constant 0xFF800000 : f32 loc(#loc871) + %post_mod_scores_49 = arith.constant 0xFF800000 : f32 loc(#loc871) + %post_mod_scores_50 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc871) + %post_mod_scores_51 = arith.select %tmp38, %post_mod_scores_14, %post_mod_scores_50 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc871) + %post_mod_scores_52 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_53 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_54 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc872) + %post_mod_scores_55 = arith.mulf %post_mod_scores_51, %post_mod_scores_54 : tensor<128x64xf32> loc(#loc872) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc873) + %p_56 = arith.subf %post_mod_scores_55, %p : tensor<128x64xf32> loc(#loc873) + %p_57 = math.exp2 %p_56 : tensor<128x64xf32> loc(#loc874) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc875) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc876) + %dp_58 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc876) + %dp_59 = tt.dot %do, %vT, %dp_58, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc876) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc877) + %ds_60 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc878) + %ds_61 = arith.subf %dp_59, %ds_60 : tensor<128x64xf32> loc(#loc878) + %ds_62 = arith.mulf %p_57, %ds_61 : tensor<128x64xf32> loc(#loc879) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc880) + %grad_scores_63 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc881) + %grad_scores_64 = arith.cmpi slt, %grad_scores, %grad_scores_63 : tensor<1x64xi32> loc(#loc881) + %grad_scores_65 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_66 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_67 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc882) + %grad_scores_68 = tt.broadcast %grad_scores_64 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc882) + %grad_scores_69 = arith.select %grad_scores_68, %ds_62, %grad_scores_67 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc882) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc883) + %scatter_mask_70 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc884) + %scatter_mask_71 = arith.cmpi slt, %scatter_mask, %scatter_mask_70 : tensor<128x1xi32> loc(#loc884) + %scatter_mask_72 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc885) + %scatter_mask_73 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc886) + %scatter_mask_74 = arith.cmpi slt, %scatter_mask_72, %scatter_mask_73 : tensor<1x64xi32> loc(#loc886) + %scatter_mask_75 = tt.broadcast %scatter_mask_71 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_76 = tt.broadcast %scatter_mask_74 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_77 = arith.andi %scatter_mask_75, %scatter_mask_76 : tensor<128x64xi1> loc(#loc887) + %ds_78 = arith.constant 0.000000e+00 : f32 loc(#loc888) + %ds_79 = arith.constant 0.000000e+00 : f32 loc(#loc888) + %ds_80 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc888) + %ds_81 = arith.select %tmp38, %grad_scores_69, %ds_80 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc888) + %ds_82 = arith.truncf %ds_81 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc889) + %dq_83 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc890) + %dq_84 = arith.constant 0.000000e+00 : f32 loc(#loc891) + %dq_85 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc891) + %dq_86 = tt.dot %ds_82, %dq_83, %dq_85, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc891) + %dq_87 = arith.addf %dq, %dq_86 : tensor<128x128xf32> loc(#loc892) + tt.return %dq_87 : tensor<128x128xf32> loc(#loc347) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc348) + tt.return %0 : tensor<128x128xf32> loc(#loc348) + } loc(#loc279) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%ptr: tensor<128x64x!tt.ptr> loc("ptr"(#loc237)), %offs_m: tensor<128xi32> loc("offs_m"(#loc237)), %offs_n: tensor<64xi32> loc("offs_n"(#loc237)), %N_LEN: i32 loc("N_LEN"(#loc237))) -> tensor<128x64xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc349) + %1 = tt.splat %N_LEN : i32 -> tensor<1x64xi32> loc(#loc350) + %2 = arith.cmpi slt, %0, %1 : tensor<1x64xi32> loc(#loc350) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc351) + %3 = tt.broadcast %2 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc351) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc351) + %4 = arith.truncf %cst_0 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc351) + %5 = tt.load %ptr, %3, %4 : tensor<128x64x!tt.ptr> loc(#loc351) + tt.return %5 : tensor<128x64xbf16> loc(#loc352) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x64xbf16> loc(#loc248) + tt.return %6 : tensor<128x64xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc353)), %max_len: i32 loc("max_len"(#loc353))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc354) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc354) + tt.return %1 : tensor<1x64xi32> loc(#loc355) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc356) + tt.return %2 : tensor<1x64xi32> loc(#loc356) + } loc(#loc353) + tt.func private @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc353)), %max_len: i32 loc("max_len"(#loc353))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc354) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc354) + tt.return %1 : tensor<128x1xi32> loc(#loc355) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc356) + tt.return %2 : tensor<128x1xi32> loc(#loc356) + } loc(#loc353) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc357)), %col_indices: !tt.ptr loc("col_indices"(#loc357)), %total_blocks: i32 loc("total_blocks"(#loc357))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc899) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc899) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc899) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc900) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc901) + %next_block = arith.constant 1 : i32 loc(#loc902) + %next_block_3 = arith.constant 1 : i32 loc(#loc902) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc902) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc903) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc904) + %next_block_7 = arith.constant 1 : i32 loc(#loc905) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc905) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc906) + %needs_jump = arith.constant 1 : i32 loc(#loc907) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc907) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc907) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc908) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc908) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc908) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc909) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc909) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc910) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc911) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc911) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc911) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc912) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc912) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc912) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc913) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc913) + %offset_24 = arith.constant 1 : i32 loc(#loc914) + %offset_25 = arith.constant 1 : i32 loc(#loc914) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc914) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc914) + %offset_28 = arith.constant 64 : i32 loc(#loc915) + %offset_29 = arith.constant 64 : i32 loc(#loc915) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc915) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc916) + tt.return %offset_31 : i32 loc(#loc376) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc377) + tt.return %0 : i32 loc(#loc377) + } loc(#loc357) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc249)), %arg_K: !tt.ptr loc("arg_K"(#loc249)), %arg_V: !tt.ptr loc("arg_V"(#loc249)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc249)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc249)), %arg_DO: !tt.ptr loc("arg_DO"(#loc249)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc249)), %arg_DV: !tt.ptr loc("arg_DV"(#loc249)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc249)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc249)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc249)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc249)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc249)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc249)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc249)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc249)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc249)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc249)), %ks0: i32 loc("ks0"(#loc249)), %ks1: i32 loc("ks1"(#loc249)), %ks2: i32 loc("ks2"(#loc249)), %ks3: i32 loc("ks3"(#loc249)), %ks4: i32 loc("ks4"(#loc249)), %ks5: i32 loc("ks5"(#loc249)), %ks6: i32 loc("ks6"(#loc249)), %ks7: i32 loc("ks7"(#loc249)), %ks8: i32 loc("ks8"(#loc249)), %K: !tt.ptr loc("K"(#loc249)), %V: !tt.ptr loc("V"(#loc249)), %dq: tensor<128x128xf32> loc("dq"(#loc249)), %q: tensor<128x128xbf16> loc("q"(#loc249)), %do: tensor<128x128xbf16> loc("do"(#loc249)), %Di: tensor<128xf32> loc("Di"(#loc249)), %lse: tensor<128x1xf32> loc("lse"(#loc249)), %off_z: i32 loc("off_z"(#loc249)), %off_hq: i32 loc("off_hq"(#loc249)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc249)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc249)), %stride_kn: i32 loc("stride_kn"(#loc249)), %stride_kd: i32 loc("stride_kd"(#loc249)), %stride_vn: i32 loc("stride_vn"(#loc249)), %stride_vd: i32 loc("stride_vd"(#loc249)), %kv_indices: !tt.ptr loc("kv_indices"(#loc249)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc249))) -> tensor<128x128xf32> attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc752) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc753) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc754) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc755) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc755) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc756) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc756) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc757) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc758) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc758) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc759) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc759) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc760) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc761) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc761) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc762) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc762) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc763) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc764) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc764) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc765) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc765) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc765) + %hi = arith.constant 2 : i32 loc(#loc766) + %hi_20 = arith.constant 2 : i32 loc(#loc766) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc766) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc767) + %hi_23 = arith.constant 1 : i32 loc(#loc768) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc768) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc769) + %c0_i32 = arith.constant 0 : i32 loc(#loc268) + %c1_i32 = arith.constant 1 : i32 loc(#loc268) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc268) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc268) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc268) + %3 = ub.poison : i32 loc(#loc268) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %ks0, %ks1, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc771) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc772) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc773) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc774) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc774) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc775) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc776) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc776) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc777) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc777) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc276) + } loc(#loc1120) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc277) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc278) + tt.return %4 : tensor<128x128xf32> loc(#loc278) + } loc(#loc249) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(48,)cconstexpr_bf16__(49,)cconstexpr_1_d_44269504__(50,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc279)), %arg_K: !tt.ptr loc("arg_K"(#loc279)), %arg_V: !tt.ptr loc("arg_V"(#loc279)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc279)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc279)), %arg_DO: !tt.ptr loc("arg_DO"(#loc279)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc279)), %arg_DV: !tt.ptr loc("arg_DV"(#loc279)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc279)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc279)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc279)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc279)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc279)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc279)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc279)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc279)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc279)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc279)), %ks0: i32 loc("ks0"(#loc279)), %ks1: i32 loc("ks1"(#loc279)), %ks2: i32 loc("ks2"(#loc279)), %ks3: i32 loc("ks3"(#loc279)), %ks4: i32 loc("ks4"(#loc279)), %ks5: i32 loc("ks5"(#loc279)), %ks6: i32 loc("ks6"(#loc279)), %ks7: i32 loc("ks7"(#loc279)), %ks8: i32 loc("ks8"(#loc279)), %dq: tensor<128x128xf32> loc("dq"(#loc279)), %q: tensor<128x128xbf16> loc("q"(#loc279)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc279)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc279)), %do: tensor<128x128xbf16> loc("do"(#loc279)), %Di: tensor<128xf32> loc("Di"(#loc279)), %lse: tensor<128x1xf32> loc("lse"(#loc279)), %Q_LEN: i32 loc("Q_LEN"(#loc279)), %KV_LEN: i32 loc("KV_LEN"(#loc279)), %off_z: i32 loc("off_z"(#loc279)), %off_hq: i32 loc("off_hq"(#loc279)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc279)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc279)), %offs_k: tensor<128xi32> loc("offs_k"(#loc279)), %offs_v: tensor<128xi32> loc("offs_v"(#loc279)), %stride_kn: i32 loc("stride_kn"(#loc279)), %stride_kd: i32 loc("stride_kd"(#loc279)), %stride_vn: i32 loc("stride_vn"(#loc279)), %stride_vd: i32 loc("stride_vd"(#loc279)), %kv_indices: !tt.ptr loc("kv_indices"(#loc279)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc279))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc826) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc827) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc827) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc827) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc828) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc828) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc828) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc829) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S1_64S_i32__(%n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc830) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc831) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc832) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc833) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc834) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc834) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc835) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc835) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc835) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc835) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc872) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc872) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc872) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc873) + %p_19 = arith.subf %post_mod_scores_18, %p : tensor<128x64xf32> loc(#loc873) + %p_20 = math.exp2 %p_19 : tensor<128x64xf32> loc(#loc874) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc875) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc876) + %dp_21 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc876) + %dp_22 = tt.dot %do, %vT, %dp_21, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc876) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc877) + %ds_23 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc878) + %ds_24 = arith.subf %dp_22, %ds_23 : tensor<128x64xf32> loc(#loc878) + %ds_25 = arith.mulf %p_20, %ds_24 : tensor<128x64xf32> loc(#loc879) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc880) + %grad_scores_26 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc881) + %grad_scores_27 = arith.cmpi slt, %grad_scores, %grad_scores_26 : tensor<1x64xi32> loc(#loc881) + %grad_scores_28 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_29 = arith.constant 0.000000e+00 : f32 loc(#loc882) + %grad_scores_30 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc882) + %grad_scores_31 = tt.broadcast %grad_scores_27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc882) + %grad_scores_32 = arith.select %grad_scores_31, %ds_25, %grad_scores_30 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc882) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc883) + %scatter_mask_33 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc884) + %scatter_mask_34 = arith.cmpi slt, %scatter_mask, %scatter_mask_33 : tensor<128x1xi32> loc(#loc884) + %scatter_mask_35 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc885) + %scatter_mask_36 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc886) + %scatter_mask_37 = arith.cmpi slt, %scatter_mask_35, %scatter_mask_36 : tensor<1x64xi32> loc(#loc886) + %scatter_mask_38 = tt.broadcast %scatter_mask_34 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_39 = tt.broadcast %scatter_mask_37 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc887) + %scatter_mask_40 = arith.andi %scatter_mask_38, %scatter_mask_39 : tensor<128x64xi1> loc(#loc887) + %ds_41 = arith.truncf %grad_scores_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc889) + %dq_42 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc890) + %dq_43 = arith.constant 0.000000e+00 : f32 loc(#loc891) + %dq_44 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc891) + %dq_45 = tt.dot %ds_41, %dq_42, %dq_44, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc891) + %dq_46 = arith.addf %dq, %dq_45 : tensor<128x128xf32> loc(#loc892) + tt.return %dq_46 : tensor<128x128xf32> loc(#loc347) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc348) + tt.return %0 : tensor<128x128xf32> loc(#loc348) + } loc(#loc279) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc378)), %arg_K: !tt.ptr loc("arg_K"(#loc378)), %arg_V: !tt.ptr loc("arg_V"(#loc378)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc378)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc378)), %arg_DO: !tt.ptr loc("arg_DO"(#loc378)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc378)), %arg_DV: !tt.ptr loc("arg_DV"(#loc378)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc378)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc378)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc378)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc378)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc378)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc378)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc378)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc378)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc378)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc378)), %ks0: i32 loc("ks0"(#loc378)), %ks1: i32 loc("ks1"(#loc378)), %ks2: i32 loc("ks2"(#loc378)), %ks3: i32 loc("ks3"(#loc378)), %ks4: i32 loc("ks4"(#loc378)), %ks5: i32 loc("ks5"(#loc378)), %ks6: i32 loc("ks6"(#loc378)), %ks7: i32 loc("ks7"(#loc378)), %ks8: i32 loc("ks8"(#loc378)), %Q: !tt.ptr loc("Q"(#loc378)), %DO: !tt.ptr loc("DO"(#loc378)), %DELTA: !tt.ptr loc("DELTA"(#loc378)), %LSE: !tt.ptr loc("LSE"(#loc378)), %dk: tensor<128x128xf32> loc("dk"(#loc378)), %dv: tensor<128x128xf32> loc("dv"(#loc378)), %k: tensor<128x128xbf16> loc("k"(#loc378)), %v: tensor<128x128xbf16> loc("v"(#loc378)), %off_z: i32 loc("off_z"(#loc378)), %off_hq: i32 loc("off_hq"(#loc378)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc378)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc378)), %stride_qm: i32 loc("stride_qm"(#loc378)), %stride_qd: i32 loc("stride_qd"(#loc378)), %stride_dom: i32 loc("stride_dom"(#loc378)), %stride_dod: i32 loc("stride_dod"(#loc378)), %q_indices: !tt.ptr loc("q_indices"(#loc378)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc378))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc962) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc963) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc964) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc965) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc965) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc966) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc966) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc967) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc968) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc968) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc969) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc969) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc969) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc970) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc971) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc971) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc972) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc972) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc973) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc974) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc974) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc975) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc975) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc975) + %hi = arith.constant 2 : i32 loc(#loc976) + %hi_20 = arith.constant 2 : i32 loc(#loc976) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc976) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc977) + %hi_23 = arith.constant 1 : i32 loc(#loc978) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc978) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc979) + %c0_i32 = arith.constant 0 : i32 loc(#loc397) + %c1_i32 = arith.constant 1 : i32 loc(#loc397) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc397) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc397) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc397) + %3 = ub.poison : i32 loc(#loc397) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %ks0, %ks1, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc398) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc981) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc982) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc983) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc983) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc984) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc985) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc985) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc986) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc986) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc405) + } loc(#loc1122) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc406) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc407) + %5 = ub.poison : tensor<128x128xf32> loc(#loc407) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc407) + } loc(#loc378) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc408)), %arg_K: !tt.ptr loc("arg_K"(#loc408)), %arg_V: !tt.ptr loc("arg_V"(#loc408)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc408)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc408)), %arg_DO: !tt.ptr loc("arg_DO"(#loc408)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc408)), %arg_DV: !tt.ptr loc("arg_DV"(#loc408)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc408)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc408)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc408)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc408)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc408)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc408)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc408)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc408)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc408)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc408)), %ks0: i32 loc("ks0"(#loc408)), %ks1: i32 loc("ks1"(#loc408)), %ks2: i32 loc("ks2"(#loc408)), %ks3: i32 loc("ks3"(#loc408)), %ks4: i32 loc("ks4"(#loc408)), %ks5: i32 loc("ks5"(#loc408)), %ks6: i32 loc("ks6"(#loc408)), %ks7: i32 loc("ks7"(#loc408)), %ks8: i32 loc("ks8"(#loc408)), %dk: tensor<128x128xf32> loc("dk"(#loc408)), %dv: tensor<128x128xf32> loc("dv"(#loc408)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc408)), %k: tensor<128x128xbf16> loc("k"(#loc408)), %v: tensor<128x128xbf16> loc("v"(#loc408)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc408)), %DELTA: !tt.ptr loc("DELTA"(#loc408)), %LSE: !tt.ptr loc("LSE"(#loc408)), %Q_LEN: i32 loc("Q_LEN"(#loc408)), %KV_LEN: i32 loc("KV_LEN"(#loc408)), %off_z: i32 loc("off_z"(#loc408)), %off_hq: i32 loc("off_hq"(#loc408)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc408)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc408)), %offs_k: tensor<128xi32> loc("offs_k"(#loc408)), %offs_v: tensor<128xi32> loc("offs_v"(#loc408)), %stride_qm: i32 loc("stride_qm"(#loc408)), %stride_qd: i32 loc("stride_qd"(#loc408)), %stride_dom: i32 loc("stride_dom"(#loc408)), %stride_dod: i32 loc("stride_dod"(#loc408)), %q_indices: !tt.ptr loc("q_indices"(#loc408)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc408))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1036) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1037) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1037) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1038) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1038) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1039) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1040) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1040) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1040) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1041) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1041) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1042) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1042) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1042) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1043) + %qkT_16 = arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1043) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1044) + %m_17 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1045) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1046) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1047) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1048) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1050) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1050) + %tmp41 = arith.constant false loc(#loc1051) + %tmp41_26 = arith.constant dense : tensor<1xi1> loc(#loc1051) + %tmp44 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1052) + %tmp44_27 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1052) + %tmp44_28 = arith.cmpi sge, %tmp44, %tmp44_27 : tensor<128x64xi32> loc(#loc1052) + %tmp45 = arith.extsi %n_18 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1053) + %tmp47 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc1054) + %tmp47_29 = tt.load %tmp47 : !tt.ptr loc(#loc1055) + %tmp48 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1056) + %tmp48_30 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc1056) + %tmp49 = arith.extsi %m_17 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc1057) + %tmp50 = tt.splat %tmp47_29 : i64 -> tensor<1x64xi64> loc(#loc1058) + %tmp50_31 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc1058) + %tmp51 = tt.broadcast %tmp48_30 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1059) + %tmp51_32 = tt.broadcast %tmp50_31 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1059) + %tmp51_33 = arith.andi %tmp51, %tmp51_32 : tensor<128x64xi1> loc(#loc1059) + %tmp52 = arith.andi %tmp44_28, %tmp51_33 : tensor<128x64xi1> loc(#loc1060) + %tmp53 = tt.expand_dims %tmp41_26 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc1061) + %tmp53_34 = tt.broadcast %tmp53 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc1061) + %tmp53_35 = arith.ori %tmp53_34, %tmp52 : tensor<128x64xi1> loc(#loc1061) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1062) + %tmp55_36 = arith.cmpi sge, %n_18, %tmp55 : tensor<128x1xi32> loc(#loc1062) + %tmp56 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1063) + %tmp56_37 = arith.remsi %n_18, %tmp56 : tensor<128x1xi32> loc(#loc1063) + %tmp57 = arith.constant 0 : i32 loc(#loc1064) + %tmp57_38 = arith.constant dense<0> : tensor<1xi32> loc(#loc1064) + %tmp58 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1065) + %tmp58_39 = tt.broadcast %tmp58 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1065) + %tmp58_40 = arith.cmpi ne, %tmp56_37, %tmp58_39 : tensor<128x1xi32> loc(#loc1065) + %tmp59 = arith.constant 0 : i32 loc(#loc1066) + %tmp59_41 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1066) + %tmp59_42 = arith.cmpi slt, %tmp56_37, %tmp59_41 : tensor<128x1xi32> loc(#loc1066) + %tmp60 = arith.constant 0 : i32 loc(#loc1067) + %tmp60_43 = arith.cmpi slt, %ks8, %tmp60 : i32 loc(#loc1067) + %tmp61 = tt.splat %tmp60_43 : i1 -> tensor<128x1xi1> loc(#loc1068) + %tmp61_44 = arith.cmpi ne, %tmp59_42, %tmp61 : tensor<128x1xi1> loc(#loc1068) + %tmp62 = arith.andi %tmp58_40, %tmp61_44 : tensor<128x1xi1> loc(#loc1069) + %tmp63 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc1070) + %tmp63_45 = arith.addi %tmp56_37, %tmp63 : tensor<128x1xi32> loc(#loc1070) + %tmp64 = arith.select %tmp62, %tmp63_45, %tmp56_37 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc1071) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1072) + %tmp66 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1073) + %tmp66_46 = arith.cmpi slt, %tmp65, %tmp66 : tensor<128x1xi64> loc(#loc1073) + %tmp67 = arith.andi %tmp55_36, %tmp66_46 : tensor<128x1xi1> loc(#loc1074) + %tmp68 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1075) + %tmp68_47 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1075) + %tmp68_48 = arith.subi %tmp68, %tmp68_47 : tensor<128x64xi32> loc(#loc1075) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc1076) + %tmp69_49 = arith.remsi %tmp68_48, %tmp69 : tensor<128x64xi32> loc(#loc1076) + %tmp70 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1077) + %tmp70_50 = tt.broadcast %tmp70 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1077) + %tmp70_51 = arith.cmpi ne, %tmp69_49, %tmp70_50 : tensor<128x64xi32> loc(#loc1077) + %tmp71 = arith.constant 0 : i32 loc(#loc1078) + %tmp71_52 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1078) + %tmp71_53 = arith.cmpi slt, %tmp69_49, %tmp71_52 : tensor<128x64xi32> loc(#loc1078) + %tmp72 = tt.splat %tmp60_43 : i1 -> tensor<128x64xi1> loc(#loc1079) + %tmp72_54 = arith.cmpi ne, %tmp71_53, %tmp72 : tensor<128x64xi1> loc(#loc1079) + %tmp73 = arith.andi %tmp70_51, %tmp72_54 : tensor<128x64xi1> loc(#loc1080) + %tmp74 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc1081) + %tmp74_55 = arith.addi %tmp69_49, %tmp74 : tensor<128x64xi32> loc(#loc1081) + %tmp75 = arith.select %tmp73, %tmp74_55, %tmp69_49 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc1082) + %tmp76 = tt.expand_dims %tmp57_38 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1083) + %tmp76_56 = tt.broadcast %tmp76 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1083) + %tmp76_57 = arith.cmpi eq, %tmp75, %tmp76_56 : tensor<128x64xi32> loc(#loc1083) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1084) + %tmp77_58 = arith.andi %tmp77, %tmp76_57 : tensor<128x64xi1> loc(#loc1084) + %tmp78 = arith.ori %tmp53_35, %tmp77_58 : tensor<128x64xi1> loc(#loc1085) + %post_mod_scores_59 = arith.constant 0xFF800000 : f32 loc(#loc1086) + %post_mod_scores_60 = arith.constant 0xFF800000 : f32 loc(#loc1086) + %post_mod_scores_61 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1086) + %post_mod_scores_62 = arith.select %tmp78, %post_mod_scores_25, %post_mod_scores_61 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1086) + %post_mod_scores_63 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_64 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_65 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1087) + %post_mod_scores_66 = arith.mulf %post_mod_scores_62, %post_mod_scores_65 : tensor<128x64xf32> loc(#loc1087) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1088) + %pT_67 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1089) + %pT_68 = arith.subf %post_mod_scores_66, %pT_67 : tensor<128x64xf32> loc(#loc1089) + %pT_69 = math.exp2 %pT_68 : tensor<128x64xf32> loc(#loc1090) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1091) + %dv_70 = arith.truncf %pT_69 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1092) + %dv_71 = arith.constant 0.000000e+00 : f32 loc(#loc1093) + %dv_72 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1093) + %dv_73 = tt.dot %dv_70, %do, %dv_72, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1093) + %dv_74 = arith.addf %dv, %dv_73 : tensor<128x128xf32> loc(#loc1094) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1095) + %Di_75 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1095) + %Di_76 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1096) + %Di_77 = tt.addptr %Di_76, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1096) + %Di_78 = tt.load %Di_77, %Di_75 : tensor<64x!tt.ptr> loc(#loc1097) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1098) + %dpT_79 = arith.constant 0.000000e+00 : f32 loc(#loc1099) + %dpT_80 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1099) + %dpT_81 = tt.dot %v, %dpT, %dpT_80, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1099) + %dsT = tt.expand_dims %Di_78 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1100) + %dsT_82 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1101) + %dsT_83 = arith.subf %dpT_81, %dsT_82 : tensor<128x64xf32> loc(#loc1101) + %dsT_84 = arith.mulf %pT_69, %dsT_83 : tensor<128x64xf32> loc(#loc1102) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1103) + %grad_scores_85 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1104) + %grad_scores_86 = arith.cmpi slt, %grad_scores, %grad_scores_85 : tensor<1x64xi32> loc(#loc1104) + %grad_scores_87 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_88 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_89 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1105) + %grad_scores_90 = tt.broadcast %grad_scores_86 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1105) + %grad_scores_91 = arith.select %grad_scores_90, %dsT_84, %grad_scores_89 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1105) + %dsT_92 = arith.constant 0.000000e+00 : f32 loc(#loc1106) + %dsT_93 = arith.constant 0.000000e+00 : f32 loc(#loc1106) + %dsT_94 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1106) + %dsT_95 = arith.select %tmp78, %grad_scores_91, %dsT_94 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1106) + %dk_96 = arith.truncf %dsT_95 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1107) + %dk_97 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1108) + %dk_98 = arith.constant 0.000000e+00 : f32 loc(#loc1109) + %dk_99 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1109) + %dk_100 = tt.dot %dk_96, %dk_97, %dk_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1109) + %dk_101 = arith.addf %dk, %dk_100 : tensor<128x128xf32> loc(#loc1110) + tt.return %dk_101, %dv_74 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc484) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc485) + %1 = ub.poison : tensor<128x128xf32> loc(#loc485) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc485) + } loc(#loc408) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: tensor<64x128x!tt.ptr> loc("ptr"(#loc237)), %offs_m: tensor<64xi32> loc("offs_m"(#loc237)), %offs_n: tensor<128xi32> loc("offs_n"(#loc237)), %M_LEN: i32 loc("M_LEN"(#loc237))) -> tensor<64x128xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc244) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc245) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc245) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc246) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc246) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc246) + %4 = arith.truncf %cst_0 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc246) + %5 = tt.load %ptr, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc246) + tt.return %5 : tensor<64x128xbf16> loc(#loc247) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc248) + tt.return %6 : tensor<64x128xbf16> loc(#loc248) + } loc(#loc237) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(45,)cconstexpr_bf16__(46,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc378)), %arg_K: !tt.ptr loc("arg_K"(#loc378)), %arg_V: !tt.ptr loc("arg_V"(#loc378)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc378)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc378)), %arg_DO: !tt.ptr loc("arg_DO"(#loc378)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc378)), %arg_DV: !tt.ptr loc("arg_DV"(#loc378)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc378)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc378)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc378)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc378)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc378)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc378)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc378)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc378)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc378)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc378)), %ks0: i32 loc("ks0"(#loc378)), %ks1: i32 loc("ks1"(#loc378)), %ks2: i32 loc("ks2"(#loc378)), %ks3: i32 loc("ks3"(#loc378)), %ks4: i32 loc("ks4"(#loc378)), %ks5: i32 loc("ks5"(#loc378)), %ks6: i32 loc("ks6"(#loc378)), %ks7: i32 loc("ks7"(#loc378)), %ks8: i32 loc("ks8"(#loc378)), %Q: !tt.ptr loc("Q"(#loc378)), %DO: !tt.ptr loc("DO"(#loc378)), %DELTA: !tt.ptr loc("DELTA"(#loc378)), %LSE: !tt.ptr loc("LSE"(#loc378)), %dk: tensor<128x128xf32> loc("dk"(#loc378)), %dv: tensor<128x128xf32> loc("dv"(#loc378)), %k: tensor<128x128xbf16> loc("k"(#loc378)), %v: tensor<128x128xbf16> loc("v"(#loc378)), %off_z: i32 loc("off_z"(#loc378)), %off_hq: i32 loc("off_hq"(#loc378)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc378)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc378)), %stride_qm: i32 loc("stride_qm"(#loc378)), %stride_qd: i32 loc("stride_qd"(#loc378)), %stride_dom: i32 loc("stride_dom"(#loc378)), %stride_dod: i32 loc("stride_dod"(#loc378)), %q_indices: !tt.ptr loc("q_indices"(#loc378)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc378))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc962) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc963) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc964) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc965) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc965) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc966) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc966) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc967) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc968) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc968) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc969) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc969) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc969) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc970) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc971) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc971) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc972) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc972) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc973) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc974) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc974) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc975) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc975) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc975) + %hi = arith.constant 2 : i32 loc(#loc976) + %hi_20 = arith.constant 2 : i32 loc(#loc976) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc976) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc977) + %hi_23 = arith.constant 1 : i32 loc(#loc978) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc978) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc979) + %c0_i32 = arith.constant 0 : i32 loc(#loc397) + %c1_i32 = arith.constant 1 : i32 loc(#loc397) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc397) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc397) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc397) + %3 = ub.poison : i32 loc(#loc397) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %ks6, %ks7, %ks8, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %ks0, %ks1, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc398) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc981) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc982) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc983) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc983) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc984) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc985) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc985) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc986) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc986) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc405) + } loc(#loc1122) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc406) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc407) + %5 = ub.poison : tensor<128x128xf32> loc(#loc407) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc407) + } loc(#loc378) + tt.func private @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(49,)cconstexpr_bf16__(50,)cconstexpr_1_d_44269504__(51,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc408)), %arg_K: !tt.ptr loc("arg_K"(#loc408)), %arg_V: !tt.ptr loc("arg_V"(#loc408)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc408)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc408)), %arg_DO: !tt.ptr loc("arg_DO"(#loc408)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc408)), %arg_DV: !tt.ptr loc("arg_DV"(#loc408)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc408)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc408)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc408)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc408)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc408)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc408)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc408)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc408)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc408)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc408)), %ks0: i32 loc("ks0"(#loc408)), %ks1: i32 loc("ks1"(#loc408)), %ks2: i32 loc("ks2"(#loc408)), %ks3: i32 loc("ks3"(#loc408)), %ks4: i32 loc("ks4"(#loc408)), %ks5: i32 loc("ks5"(#loc408)), %ks6: i32 loc("ks6"(#loc408)), %ks7: i32 loc("ks7"(#loc408)), %ks8: i32 loc("ks8"(#loc408)), %dk: tensor<128x128xf32> loc("dk"(#loc408)), %dv: tensor<128x128xf32> loc("dv"(#loc408)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc408)), %k: tensor<128x128xbf16> loc("k"(#loc408)), %v: tensor<128x128xbf16> loc("v"(#loc408)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc408)), %DELTA: !tt.ptr loc("DELTA"(#loc408)), %LSE: !tt.ptr loc("LSE"(#loc408)), %Q_LEN: i32 loc("Q_LEN"(#loc408)), %KV_LEN: i32 loc("KV_LEN"(#loc408)), %off_z: i32 loc("off_z"(#loc408)), %off_hq: i32 loc("off_hq"(#loc408)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc408)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc408)), %offs_k: tensor<128xi32> loc("offs_k"(#loc408)), %offs_v: tensor<128xi32> loc("offs_v"(#loc408)), %stride_qm: i32 loc("stride_qm"(#loc408)), %stride_qd: i32 loc("stride_qd"(#loc408)), %stride_dom: i32 loc("stride_dom"(#loc408)), %stride_dod: i32 loc("stride_dod"(#loc408)), %q_indices: !tt.ptr loc("q_indices"(#loc408)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc408))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1036) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1037) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1037) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1038) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1038) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1039) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1040) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1040) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1040) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1041) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1041) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1041) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1042) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1042) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1042) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1043) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1043) + %qkT_16 = arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1043) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1044) + %m_17 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1045) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1046) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1047) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1048) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1049) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1050) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1050) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1050) + %post_mod_scores_26 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_27 = arith.constant 1.44269502 : f32 loc(#loc1087) + %post_mod_scores_28 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1087) + %post_mod_scores_29 = arith.mulf %post_mod_scores_25, %post_mod_scores_28 : tensor<128x64xf32> loc(#loc1087) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1088) + %pT_30 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1089) + %pT_31 = arith.subf %post_mod_scores_29, %pT_30 : tensor<128x64xf32> loc(#loc1089) + %pT_32 = math.exp2 %pT_31 : tensor<128x64xf32> loc(#loc1090) + %do = tt.call @"torch._inductor.runtime.compile_tasks.ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1091) + %dv_33 = arith.truncf %pT_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1092) + %dv_34 = arith.constant 0.000000e+00 : f32 loc(#loc1093) + %dv_35 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1093) + %dv_36 = tt.dot %dv_33, %do, %dv_35, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1093) + %dv_37 = arith.addf %dv, %dv_36 : tensor<128x128xf32> loc(#loc1094) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1095) + %Di_38 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1095) + %Di_39 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1096) + %Di_40 = tt.addptr %Di_39, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1096) + %Di_41 = tt.load %Di_40, %Di_38 : tensor<64x!tt.ptr> loc(#loc1097) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1098) + %dpT_42 = arith.constant 0.000000e+00 : f32 loc(#loc1099) + %dpT_43 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1099) + %dpT_44 = tt.dot %v, %dpT, %dpT_43, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1099) + %dsT = tt.expand_dims %Di_41 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1100) + %dsT_45 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1101) + %dsT_46 = arith.subf %dpT_44, %dsT_45 : tensor<128x64xf32> loc(#loc1101) + %dsT_47 = arith.mulf %pT_32, %dsT_46 : tensor<128x64xf32> loc(#loc1102) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1103) + %grad_scores_48 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1104) + %grad_scores_49 = arith.cmpi slt, %grad_scores, %grad_scores_48 : tensor<1x64xi32> loc(#loc1104) + %grad_scores_50 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_51 = arith.constant 0.000000e+00 : f32 loc(#loc1105) + %grad_scores_52 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1105) + %grad_scores_53 = tt.broadcast %grad_scores_49 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1105) + %grad_scores_54 = arith.select %grad_scores_53, %dsT_47, %grad_scores_52 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1105) + %dk_55 = arith.truncf %grad_scores_54 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1107) + %dk_56 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1108) + %dk_57 = arith.constant 0.000000e+00 : f32 loc(#loc1109) + %dk_58 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1109) + %dk_59 = tt.dot %dk_55, %dk_56, %dk_58, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1109) + %dk_60 = arith.addf %dk, %dk_59 : tensor<128x128xf32> loc(#loc1110) + tt.return %dk_60, %dv_37 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc484) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc485) + %1 = ub.poison : tensor<128x128xf32> loc(#loc485) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc485) + } loc(#loc408) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":94:54) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":94:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:49) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":96:54) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":96:63) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":96:49) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:74) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:66) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:100) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:91) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:82) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:59) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:126) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:118) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:152) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:143) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:134) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:111) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":99:58) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":99:53) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":100:58) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":100:67) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":100:53) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":102:9) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":103:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":104:10) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":106:10) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":111:24) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":112:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":113:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":115:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":116:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":117:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":119:15) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":120:16) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":122:28) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:47) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:35) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:59) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":125:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":125:47) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":125:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":125:59) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:27) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:37) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:61) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":131:9) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":132:9) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":133:10) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":135:14) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":136:26) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":137:26) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:14) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:7) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":140:24) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":142:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":143:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:29) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:54) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:44) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":145:35) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":146:41) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":148:30) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":151:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":152:42) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":152:54) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:55) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:78) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:50) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:83) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:68) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:30) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:40) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:63) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:32) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:55) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:42) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:66) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":160:32) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":160:55) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":160:42) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":160:66) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:30) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:35) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:46) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:56) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":163:17) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":164:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":167:19) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":168:21) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":169:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":172:22) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":174:36) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":175:42) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":175:29) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":178:107) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":179:111) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:58) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:34) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:57) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:33) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:26) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:30) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:50) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":191:18) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":195:30) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:27) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:41) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:53) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:42) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":207:12) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":214:39) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:31) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:45) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:62) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:43) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":218:46) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":218:33) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":226:16) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:32) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:43) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:24) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:63) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:74) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:56) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":232:14) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:48) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:59) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:76) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:87) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:69) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:30) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":239:29) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":240:30) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":242:26) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":245:29) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":249:22) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":250:22) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":252:25) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":253:42) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":253:29) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":256:107) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":257:107) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":262:30) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:32) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:51) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:34) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:56) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:44) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:67) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:36) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:59) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:46) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:70) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":268:36) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":268:59) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":268:46) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":268:70) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:34) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:39) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:50) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:60) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":271:21) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":272:23) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":275:25) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":276:29) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":278:39) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":279:46) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":279:58) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:58) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:80) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:53) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:81) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:70) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":286:32) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:30) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:43) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:55) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:42) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:45) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:32) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":298:16) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":306:41) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:34) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:47) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:64) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:46) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":310:49) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":310:36) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":318:20) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":303:12) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:31) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:42) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:23) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:62) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:73) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:55) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":325:26) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":326:25) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":327:25) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:50) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:71) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:61) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:30) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":334:14) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":337:29) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:31) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:27) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:45) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:53) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:41) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:64) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:71) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:59) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:29) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:69) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:4) +#loc228 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc229 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc232 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:27) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:38) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:20) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:56) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:67) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:49) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:41) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:52) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:23) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:15) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":828:4) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":387:26) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":388:26) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:26) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:37) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:18) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:56) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:67) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:49) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:26) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:37) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:18) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:56) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:67) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:49) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:43) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:90) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:101) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:63) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":397:28) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":405:12) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":411:64) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:28) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:19) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":415:28) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":415:19) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":417:19) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":417:8) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":419:11) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":419:4) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":458:105) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":459:19) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":461:14) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":464:36) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":464:46) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":467:36) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":467:46) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":476:43) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":476:54) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":476:79) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":479:35) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":482:23) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":483:23) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:34) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:23) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":486:22) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":487:23) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":488:23) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":489:23) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":490:23) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":491:23) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":493:24) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":494:24) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":495:32) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":496:25) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":497:92) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":498:92) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":499:25) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":500:24) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":501:24) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":502:39) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":503:25) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":504:24) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":505:24) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":506:23) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":507:25) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":508:25) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":509:92) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":510:25) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":511:24) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":512:24) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":513:39) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":514:25) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":515:24) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":516:24) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":521:69) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":524:27) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:39) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:21) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":528:104) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":530:20) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:22) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:19) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:14) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":538:39) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":538:50) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":538:71) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":542:32) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":542:43) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":542:62) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":542:73) +#loc341 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":542:54) +#loc342 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":549:43) +#loc343 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":551:15) +#loc344 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:30) +#loc345 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:21) +#loc346 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:10) +#loc347 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":555:11) +#loc348 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":555:4) +#loc349 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:41) +#loc350 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:52) +#loc351 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:23) +#loc352 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:15) +#loc354 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":798:21) +#loc355 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":798:11) +#loc356 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":798:4) +#loc358 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":788:33) +#loc359 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:38) +#loc360 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:24) +#loc361 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:109) +#loc362 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:113) +#loc363 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:39) +#loc364 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:55) +#loc365 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:25) +#loc366 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:30) +#loc367 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:35) +#loc368 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:60) +#loc369 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:34) +#loc370 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:48) +#loc371 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:63) +#loc372 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:29) +#loc373 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:47) +#loc374 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:61) +#loc375 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:42) +#loc376 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":794:11) +#loc377 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":794:4) +#loc379 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":598:26) +#loc380 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":599:26) +#loc381 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:26) +#loc382 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:37) +#loc383 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:18) +#loc384 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:56) +#loc385 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:67) +#loc386 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:49) +#loc387 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:27) +#loc388 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:38) +#loc389 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:19) +#loc390 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:58) +#loc391 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:69) +#loc392 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:51) +#loc393 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:42) +#loc394 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:87) +#loc395 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:98) +#loc396 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:61) +#loc397 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":610:28) +#loc398 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":618:12) +#loc399 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":623:62) +#loc400 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:28) +#loc401 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:19) +#loc402 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:28) +#loc403 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:19) +#loc404 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":628:19) +#loc405 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":628:8) +#loc406 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":630:11) +#loc407 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":630:4) +#loc409 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":669:105) +#loc410 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:52) +#loc411 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:28) +#loc412 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:22) +#loc413 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:26) +#loc414 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:46) +#loc415 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":676:20) +#loc416 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":678:15) +#loc417 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":680:36) +#loc418 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":680:46) +#loc419 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":683:36) +#loc420 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":683:46) +#loc421 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":692:43) +#loc422 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":692:54) +#loc423 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":692:78) +#loc424 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":695:36) +#loc425 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":698:25) +#loc426 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":699:25) +#loc427 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:35) +#loc428 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:24) +#loc429 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":702:24) +#loc430 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":703:25) +#loc431 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":704:24) +#loc432 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":705:24) +#loc433 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":706:24) +#loc434 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":707:24) +#loc435 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":709:25) +#loc436 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":710:25) +#loc437 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":711:32) +#loc438 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":712:25) +#loc439 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":713:92) +#loc440 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":714:92) +#loc441 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":715:25) +#loc442 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":716:24) +#loc443 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":717:24) +#loc444 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":718:39) +#loc445 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":719:25) +#loc446 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":720:24) +#loc447 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":721:24) +#loc448 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":722:24) +#loc449 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":723:25) +#loc450 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":724:25) +#loc451 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":725:92) +#loc452 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":726:25) +#loc453 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":727:24) +#loc454 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":728:24) +#loc455 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":729:39) +#loc456 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":730:25) +#loc457 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":731:24) +#loc458 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":732:24) +#loc459 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":736:69) +#loc460 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":739:27) +#loc461 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:44) +#loc462 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:40) +#loc463 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:22) +#loc464 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":741:99) +#loc465 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:24) +#loc466 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:43) +#loc467 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:10) +#loc468 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:53) +#loc469 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:29) +#loc470 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:21) +#loc471 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:29) +#loc472 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:20) +#loc473 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:25) +#loc474 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:22) +#loc475 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:16) +#loc476 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":759:39) +#loc477 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":759:50) +#loc478 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":759:70) +#loc479 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":773:45) +#loc480 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:24) +#loc481 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:52) +#loc482 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:43) +#loc483 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:10) +#loc484 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":777:11) +#loc485 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":777:4) +#loc513 = loc("ZQ"(#loc27)) +#loc514 = loc("HQ"(#loc28)) +#loc515 = loc("HKV"(#loc29)) +#loc516 = loc("ZKV"(#loc30)) +#loc517 = loc("pid"(#loc31)) +#loc518 = loc("NUM_KV_BLOCKS"(#loc32)) +#loc519 = loc("NUM_Q_BLOCKS"(#loc33)) +#loc520 = loc("off_zq"(#loc34)) +#loc521 = loc("off_hkv"(#loc35)) +#loc522 = loc("off_zkv"(#loc36)) +#loc523 = loc("SPARSE_Z"(#loc37)) +#loc524 = loc("SPARSE_HQ"(#loc38)) +#loc525 = loc("sparse_idx_z"(#loc39)) +#loc526 = loc("k_adj"(#loc40)) +#loc527 = loc("k_adj"(#loc41)) +#loc528 = loc("k_adj"(#loc42)) +#loc529 = loc("k_adj"(#loc43)) +#loc530 = loc("v_adj"(#loc44)) +#loc531 = loc("v_adj"(#loc45)) +#loc532 = loc("v_adj"(#loc46)) +#loc533 = loc("v_adj"(#loc47)) +#loc534 = loc("dv_adj"(#loc48)) +#loc535 = loc("dv_adj"(#loc49)) +#loc536 = loc("dv_adj"(#loc50)) +#loc537 = loc("dv_adj"(#loc51)) +#loc538 = loc("K"(#loc52)) +#loc539 = loc("V"(#loc53)) +#loc540 = loc("DV"(#loc54)) +#loc541 = loc("RCP_LN2"(#loc55)) +#loc542 = loc("offs_k"(#loc56)) +#loc543 = loc("offs_v"(#loc57)) +#loc544 = loc("off_pid"(#loc60)) +#loc545 = loc("SPARSE_Q_MULTIPLE"(#loc61)) +#loc546 = loc("SPARSE_KV_MULTIPLE"(#loc62)) +#loc547 = loc("off_hq2"(#loc63)) +#loc548 = loc("off_hq2"(#loc64)) +#loc549 = loc("off_hq2"(#loc65)) +#loc550 = loc("start_m2_block"(#loc66)) +#loc551 = loc("off_pid_mask"(#loc67)) +#loc552 = loc("stride_kv_idx_h"(#loc68)) +#loc553 = loc("sparse_idx_hq2"(#loc69)) +#loc554 = loc("sparse_hz_offset"(#loc70)) +#loc555 = loc("sparse_hz_offset"(#loc71)) +#loc556 = loc("sparse_kv_num_blks_offset"(#loc72)) +#loc557 = loc("sparse_kv_num_blks_offset"(#loc73)) +#loc558 = loc("sparse_kv_idx_offset"(#loc74)) +#loc559 = loc("sparse_kv_idx_offset"(#loc75)) +#loc560 = loc("sparse_kv_idx_offset"(#loc76)) +#loc561 = loc("q_adj2"(#loc77)) +#loc562 = loc("q_adj2"(#loc78)) +#loc563 = loc("q_adj2"(#loc79)) +#loc564 = loc("q_adj2"(#loc80)) +#loc565 = loc("do_adj2"(#loc81)) +#loc566 = loc("do_adj2"(#loc82)) +#loc567 = loc("do_adj2"(#loc83)) +#loc568 = loc("do_adj2"(#loc84)) +#loc569 = loc("dq_adj2"(#loc85)) +#loc570 = loc("dq_adj2"(#loc86)) +#loc571 = loc("dq_adj2"(#loc87)) +#loc572 = loc("dq_adj2"(#loc88)) +#loc573 = loc("off_chz2"(#loc89)) +#loc574 = loc("off_chz2"(#loc90)) +#loc575 = loc("off_chz2"(#loc91)) +#loc576 = loc("off_chz2"(#loc92)) +#loc577 = loc("Q2"(#loc93)) +#loc578 = loc("DO2"(#loc94)) +#loc579 = loc("DQ2"(#loc95)) +#loc580 = loc("LSE2"(#loc96)) +#loc581 = loc("DELTA2"(#loc97)) +#loc582 = loc("dq"(#loc98)) +#loc583 = loc("start_m2"(#loc99)) +#loc584 = loc("offs_m2"(#loc100)) +#loc585 = loc("offs_m2"(#loc101)) +#loc586 = loc("q"(#loc102)) +#loc587 = loc("do"(#loc103)) +#loc588 = loc("Di"(#loc104)) +#loc589 = loc("Di"(#loc105)) +#loc590 = loc("Di"(#loc106)) +#loc591 = loc("lse"(#loc107)) +#loc592 = loc("lse"(#loc108)) +#loc593 = loc("lse"(#loc109)) +#loc594 = loc("lse"(#loc110)) +#loc595 = loc("lse"(#loc111)) +#loc596 = loc("lse"(#loc112)) +#loc597 = loc("kv_indices"(#loc113)) +#loc598 = loc("kv_start"(#loc114)) +#loc599 = loc("kv_start"(#loc115)) +#loc600 = loc("sparse_kv_num_blocks"(#loc116)) +#loc601 = loc("sparse_kv_num_blocks"(#loc117)) +#loc602 = loc("offs_n2"(#loc118)) +#loc603 = loc("offs_n2"(#loc119)) +#loc604 = loc("dq"(#loc120)) +#loc605 = loc("kv_indices"(#loc121)) +#loc606 = loc("kv_start"(#loc122)) +#loc607 = loc("kv_start"(#loc123)) +#loc608 = loc("sparse_kv_num_blocks"(#loc124)) +#loc609 = loc("sparse_kv_num_blocks"(#loc125)) +#loc610 = loc("offs_n2"(#loc126)) +#loc611 = loc("offs_n2"(#loc127)) +#loc612 = loc("dq"(#loc128)) +#loc613 = loc("dq_ptrs"(#loc129)) +#loc614 = loc("dq_ptrs"(#loc130)) +#loc615 = loc("dq_ptrs"(#loc131)) +#loc616 = loc("dq_ptrs"(#loc132)) +#loc617 = loc("dq_ptrs"(#loc133)) +#loc618 = loc("dq_ptrs"(#loc134)) +#loc619 = loc("dq"(#loc135)) +#loc620 = loc("SPARSE_Q_MULTIPLE"(#loc142)) +#loc621 = loc("SPARSE_KV_MULTIPLE"(#loc143)) +#loc622 = loc("pid_mask"(#loc144)) +#loc623 = loc("stride_q_idx_h"(#loc145)) +#loc624 = loc("dv"(#loc146)) +#loc625 = loc("dk"(#loc147)) +#loc626 = loc("start_n1"(#loc148)) +#loc627 = loc("offs_n1"(#loc149)) +#loc628 = loc("offs_n1"(#loc150)) +#loc629 = loc("k"(#loc151)) +#loc630 = loc("v"(#loc152)) +#loc631 = loc("dv"(#loc153)) +#loc632 = loc("off_hq1"(#loc154)) +#loc633 = loc("off_hq1"(#loc155)) +#loc634 = loc("q_adj1"(#loc156)) +#loc635 = loc("q_adj1"(#loc157)) +#loc636 = loc("q_adj1"(#loc158)) +#loc637 = loc("q_adj1"(#loc159)) +#loc638 = loc("do_adj1"(#loc160)) +#loc639 = loc("do_adj1"(#loc161)) +#loc640 = loc("do_adj1"(#loc162)) +#loc641 = loc("do_adj1"(#loc163)) +#loc642 = loc("dq_adj1"(#loc164)) +#loc643 = loc("dq_adj1"(#loc165)) +#loc644 = loc("dq_adj1"(#loc166)) +#loc645 = loc("dq_adj1"(#loc167)) +#loc646 = loc("off_chz1"(#loc168)) +#loc647 = loc("off_chz1"(#loc169)) +#loc648 = loc("off_chz1"(#loc170)) +#loc649 = loc("off_chz1"(#loc171)) +#loc650 = loc("Q1"(#loc172)) +#loc651 = loc("DO1"(#loc173)) +#loc652 = loc("LSE1"(#loc174)) +#loc653 = loc("DELTA1"(#loc175)) +#loc654 = loc("sparse_idx_hq1"(#loc176)) +#loc655 = loc("sparse_hz_offset"(#loc177)) +#loc656 = loc("sparse_hz_offset"(#loc178)) +#loc657 = loc("sparse_q_num_blks_offset"(#loc179)) +#loc658 = loc("sparse_q_num_blks_offset"(#loc180)) +#loc659 = loc("sparse_q_idx_offset"(#loc181)) +#loc660 = loc("sparse_q_idx_offset"(#loc182)) +#loc661 = loc("sparse_q_idx_offset"(#loc183)) +#loc662 = loc("q_indices"(#loc184)) +#loc663 = loc("q_start"(#loc185)) +#loc664 = loc("q_start"(#loc186)) +#loc665 = loc("sparse_q_num_blocks"(#loc187)) +#loc666 = loc("sparse_q_num_blocks"(#loc188)) +#loc667 = loc("offs_m1"(#loc189)) +#loc668 = loc("offs_m1"(#loc190)) +#loc669 = loc("q_indices"(#loc192)) +#loc670 = loc("q_start"(#loc193)) +#loc671 = loc("q_start"(#loc194)) +#loc672 = loc("sparse_q_num_blocks"(#loc195)) +#loc673 = loc("sparse_q_num_blocks"(#loc196)) +#loc674 = loc("offs_m1"(#loc197)) +#loc675 = loc("offs_m1"(#loc198)) +#loc676 = loc("dv_ptrs"(#loc201)) +#loc677 = loc("dv_ptrs"(#loc202)) +#loc678 = loc("dv_ptrs"(#loc203)) +#loc679 = loc("dv_ptrs"(#loc204)) +#loc680 = loc("dv_ptrs"(#loc205)) +#loc681 = loc("dv_ptrs"(#loc206)) +#loc682 = loc("index_n"(#loc207)) +#loc683 = loc("index_k"(#loc208)) +#loc684 = loc("index_v"(#loc209)) +#loc685 = loc("dk"(#loc214)) +#loc686 = loc("mask"(#loc215)) +#loc687 = loc("xindex"(#loc216)) +#loc688 = loc("xindex"(#loc217)) +#loc689 = loc("xindex"(#loc218)) +#loc690 = loc("xindex"(#loc219)) +#loc691 = loc("xindex"(#loc220)) +#loc692 = loc("xindex"(#loc221)) +#loc693 = loc("xindex"(#loc222)) +#loc694 = loc("xindex"(#loc223)) +#loc702 = loc("ptr"(#loc238)) +#loc703 = loc("ptr"(#loc239)) +#loc704 = loc("ptr"(#loc240)) +#loc705 = loc("ptr"(#loc241)) +#loc706 = loc("ptr"(#loc242)) +#loc707 = loc("ptr"(#loc243)) +#loc752 = loc("offs_k"(#loc250)) +#loc753 = loc("offs_v"(#loc251)) +#loc754 = loc("kT_ptrs"(#loc252)) +#loc755 = loc("kT_ptrs"(#loc253)) +#loc756 = loc("kT_ptrs"(#loc254)) +#loc757 = loc("kT_ptrs"(#loc255)) +#loc758 = loc("kT_ptrs"(#loc256)) +#loc759 = loc("kT_ptrs"(#loc257)) +#loc760 = loc("vT_ptrs"(#loc258)) +#loc761 = loc("vT_ptrs"(#loc259)) +#loc762 = loc("vT_ptrs"(#loc260)) +#loc763 = loc("vT_ptrs"(#loc261)) +#loc764 = loc("vT_ptrs"(#loc262)) +#loc765 = loc("vT_ptrs"(#loc263)) +#loc766 = loc("hi"(#loc264)) +#loc767 = loc("hi"(#loc265)) +#loc768 = loc("hi"(#loc266)) +#loc769 = loc("hi"(#loc267)) +#loc770 = loc("dq"(#loc268)) +#loc771 = loc("dq"(#loc269)) +#loc772 = loc("offset"(#loc270)) +#loc773 = loc("kT_ptrs"(#loc271)) +#loc774 = loc("kT_ptrs"(#loc272)) +#loc775 = loc("vT_ptrs"(#loc273)) +#loc776 = loc("vT_ptrs"(#loc274)) +#loc777 = loc("offs_n2"(#loc275)) +#loc826 = loc("kT"(#loc280)) +#loc827 = loc("qk"(#loc281)) +#loc828 = loc("qk"(#loc282)) +#loc829 = loc("n"(#loc283)) +#loc830 = loc("n"(#loc284)) +#loc831 = loc("m"(#loc285)) +#loc832 = loc("m"(#loc286)) +#loc833 = loc("post_mod_scores"(#loc287)) +#loc834 = loc("post_mod_scores"(#loc288)) +#loc835 = loc("post_mod_scores"(#loc289)) +#loc836 = loc("tmp1"(#loc290)) +#loc837 = loc("tmp4"(#loc291)) +#loc838 = loc("tmp5"(#loc292)) +#loc839 = loc("tmp7"(#loc293)) +#loc840 = loc("tmp7"(#loc294)) +#loc841 = loc("tmp8"(#loc295)) +#loc842 = loc("tmp9"(#loc296)) +#loc843 = loc("tmp10"(#loc297)) +#loc844 = loc("tmp11"(#loc298)) +#loc845 = loc("tmp12"(#loc299)) +#loc846 = loc("tmp13"(#loc300)) +#loc847 = loc("tmp15"(#loc301)) +#loc848 = loc("tmp16"(#loc302)) +#loc849 = loc("tmp17"(#loc303)) +#loc850 = loc("tmp18"(#loc304)) +#loc851 = loc("tmp19"(#loc305)) +#loc852 = loc("tmp20"(#loc306)) +#loc853 = loc("tmp21"(#loc307)) +#loc854 = loc("tmp22"(#loc308)) +#loc855 = loc("tmp23"(#loc309)) +#loc856 = loc("tmp24"(#loc310)) +#loc857 = loc("tmp25"(#loc311)) +#loc858 = loc("tmp26"(#loc312)) +#loc859 = loc("tmp27"(#loc313)) +#loc860 = loc("tmp28"(#loc314)) +#loc861 = loc("tmp29"(#loc315)) +#loc862 = loc("tmp30"(#loc316)) +#loc863 = loc("tmp31"(#loc317)) +#loc864 = loc("tmp32"(#loc318)) +#loc865 = loc("tmp33"(#loc319)) +#loc866 = loc("tmp34"(#loc320)) +#loc867 = loc("tmp35"(#loc321)) +#loc868 = loc("tmp36"(#loc322)) +#loc869 = loc("tmp37"(#loc323)) +#loc870 = loc("tmp38"(#loc324)) +#loc871 = loc("post_mod_scores"(#loc325)) +#loc872 = loc("post_mod_scores"(#loc326)) +#loc873 = loc("p"(#loc327)) +#loc874 = loc("p"(#loc328)) +#loc875 = loc("vT"(#loc329)) +#loc876 = loc("dp"(#loc330)) +#loc877 = loc("ds"(#loc331)) +#loc878 = loc("ds"(#loc332)) +#loc879 = loc("ds"(#loc333)) +#loc880 = loc("grad_scores"(#loc334)) +#loc881 = loc("grad_scores"(#loc335)) +#loc882 = loc("grad_scores"(#loc336)) +#loc883 = loc("scatter_mask"(#loc337)) +#loc884 = loc("scatter_mask"(#loc338)) +#loc885 = loc("scatter_mask"(#loc339)) +#loc886 = loc("scatter_mask"(#loc340)) +#loc887 = loc("scatter_mask"(#loc341)) +#loc888 = loc("ds"(#loc342)) +#loc889 = loc("ds"(#loc343)) +#loc890 = loc("dq"(#loc344)) +#loc891 = loc("dq"(#loc345)) +#loc892 = loc("dq"(#loc346)) +#loc899 = loc("cur_block_idx"(#loc358)) +#loc900 = loc("cur_block"(#loc359)) +#loc901 = loc("cur_block"(#loc360)) +#loc902 = loc("next_block"(#loc361)) +#loc903 = loc("next_block"(#loc362)) +#loc904 = loc("next_block"(#loc363)) +#loc905 = loc("next_block"(#loc364)) +#loc906 = loc("next_block"(#loc365)) +#loc907 = loc("needs_jump"(#loc366)) +#loc908 = loc("needs_jump"(#loc367)) +#loc909 = loc("needs_jump"(#loc368)) +#loc910 = loc("jump_to_block"(#loc369)) +#loc911 = loc("jump_to_block"(#loc370)) +#loc912 = loc("jump_to_block"(#loc371)) +#loc913 = loc("offset"(#loc372)) +#loc914 = loc("offset"(#loc373)) +#loc915 = loc("offset"(#loc374)) +#loc916 = loc("offset"(#loc375)) +#loc962 = loc("offs_k"(#loc379)) +#loc963 = loc("offs_v"(#loc380)) +#loc964 = loc("qT_ptrs"(#loc381)) +#loc965 = loc("qT_ptrs"(#loc382)) +#loc966 = loc("qT_ptrs"(#loc383)) +#loc967 = loc("qT_ptrs"(#loc384)) +#loc968 = loc("qT_ptrs"(#loc385)) +#loc969 = loc("qT_ptrs"(#loc386)) +#loc970 = loc("do_ptrs"(#loc387)) +#loc971 = loc("do_ptrs"(#loc388)) +#loc972 = loc("do_ptrs"(#loc389)) +#loc973 = loc("do_ptrs"(#loc390)) +#loc974 = loc("do_ptrs"(#loc391)) +#loc975 = loc("do_ptrs"(#loc392)) +#loc976 = loc("hi"(#loc393)) +#loc977 = loc("hi"(#loc394)) +#loc978 = loc("hi"(#loc395)) +#loc979 = loc("hi"(#loc396)) +#loc980 = loc("dk"(#loc397)) +#loc981 = loc("offset"(#loc399)) +#loc982 = loc("qT_ptrs"(#loc400)) +#loc983 = loc("qT_ptrs"(#loc401)) +#loc984 = loc("do_ptrs"(#loc402)) +#loc985 = loc("do_ptrs"(#loc403)) +#loc986 = loc("offs_m1"(#loc404)) +#loc1036 = loc("qT"(#loc409)) +#loc1037 = loc("lse"(#loc410)) +#loc1038 = loc("lse"(#loc411)) +#loc1039 = loc("lse"(#loc412)) +#loc1040 = loc("lse"(#loc413)) +#loc1041 = loc("lse"(#loc414)) +#loc1042 = loc("qkT"(#loc415)) +#loc1043 = loc("qkT"(#loc416)) +#loc1044 = loc("m"(#loc417)) +#loc1045 = loc("m"(#loc418)) +#loc1046 = loc("n"(#loc419)) +#loc1047 = loc("n"(#loc420)) +#loc1048 = loc("post_mod_scores"(#loc421)) +#loc1049 = loc("post_mod_scores"(#loc422)) +#loc1050 = loc("post_mod_scores"(#loc423)) +#loc1051 = loc("tmp41"(#loc424)) +#loc1052 = loc("tmp44"(#loc425)) +#loc1053 = loc("tmp45"(#loc426)) +#loc1054 = loc("tmp47"(#loc427)) +#loc1055 = loc("tmp47"(#loc428)) +#loc1056 = loc("tmp48"(#loc429)) +#loc1057 = loc("tmp49"(#loc430)) +#loc1058 = loc("tmp50"(#loc431)) +#loc1059 = loc("tmp51"(#loc432)) +#loc1060 = loc("tmp52"(#loc433)) +#loc1061 = loc("tmp53"(#loc434)) +#loc1062 = loc("tmp55"(#loc435)) +#loc1063 = loc("tmp56"(#loc436)) +#loc1064 = loc("tmp57"(#loc437)) +#loc1065 = loc("tmp58"(#loc438)) +#loc1066 = loc("tmp59"(#loc439)) +#loc1067 = loc("tmp60"(#loc440)) +#loc1068 = loc("tmp61"(#loc441)) +#loc1069 = loc("tmp62"(#loc442)) +#loc1070 = loc("tmp63"(#loc443)) +#loc1071 = loc("tmp64"(#loc444)) +#loc1072 = loc("tmp65"(#loc445)) +#loc1073 = loc("tmp66"(#loc446)) +#loc1074 = loc("tmp67"(#loc447)) +#loc1075 = loc("tmp68"(#loc448)) +#loc1076 = loc("tmp69"(#loc449)) +#loc1077 = loc("tmp70"(#loc450)) +#loc1078 = loc("tmp71"(#loc451)) +#loc1079 = loc("tmp72"(#loc452)) +#loc1080 = loc("tmp73"(#loc453)) +#loc1081 = loc("tmp74"(#loc454)) +#loc1082 = loc("tmp75"(#loc455)) +#loc1083 = loc("tmp76"(#loc456)) +#loc1084 = loc("tmp77"(#loc457)) +#loc1085 = loc("tmp78"(#loc458)) +#loc1086 = loc("post_mod_scores"(#loc459)) +#loc1087 = loc("post_mod_scores"(#loc460)) +#loc1088 = loc("pT"(#loc461)) +#loc1089 = loc("pT"(#loc462)) +#loc1090 = loc("pT"(#loc463)) +#loc1091 = loc("do"(#loc464)) +#loc1092 = loc("dv"(#loc465)) +#loc1093 = loc("dv"(#loc466)) +#loc1094 = loc("dv"(#loc467)) +#loc1095 = loc("Di"(#loc468)) +#loc1096 = loc("Di"(#loc469)) +#loc1097 = loc("Di"(#loc470)) +#loc1098 = loc("dpT"(#loc471)) +#loc1099 = loc("dpT"(#loc472)) +#loc1100 = loc("dsT"(#loc473)) +#loc1101 = loc("dsT"(#loc474)) +#loc1102 = loc("dsT"(#loc475)) +#loc1103 = loc("grad_scores"(#loc476)) +#loc1104 = loc("grad_scores"(#loc477)) +#loc1105 = loc("grad_scores"(#loc478)) +#loc1106 = loc("dsT"(#loc479)) +#loc1107 = loc("dk"(#loc480)) +#loc1108 = loc("dk"(#loc481)) +#loc1109 = loc("dk"(#loc482)) +#loc1110 = loc("dk"(#loc483)) +#loc1111 = loc("SPARSE_Q_MULTIPLE"(#loc545)) +#loc1112 = loc("SPARSE_KV_MULTIPLE"(#loc546)) +#loc1113 = loc("SPARSE_Q_MULTIPLE"(#loc620)) +#loc1114 = loc("SPARSE_KV_MULTIPLE"(#loc621)) +#loc1115 = loc("dk"(#loc631)) +#loc1116 = loc("offs_n2"(#loc770)) +#loc1117 = loc("dv"(#loc980)) +#loc1118 = loc("kT_ptrs"(#loc1116)) +#loc1119 = loc("offs_m1"(#loc1117)) +#loc1120 = loc("vT_ptrs"(#loc1118)) +#loc1121 = loc("qT_ptrs"(#loc1119)) +#loc1122 = loc("do_ptrs"(#loc1121)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e7dc252ecacc712e5dde592b5b3c68ad548849e8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttgir @@ -0,0 +1,2068 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":18:0) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> +#smem = #ttg.shared_memory +#loc338 = loc("arg_Q"(#loc)) +#loc339 = loc("arg_K"(#loc)) +#loc340 = loc("arg_V"(#loc)) +#loc341 = loc("arg_LSE"(#loc)) +#loc342 = loc("arg_DELTA"(#loc)) +#loc343 = loc("arg_DO"(#loc)) +#loc344 = loc("arg_DQ"(#loc)) +#loc345 = loc("arg_DV"(#loc)) +#loc346 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc347 = loc("arg_KV_IDX"(#loc)) +#loc348 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc349 = loc("arg_Q_IDX"(#loc)) +#loc350 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc351 = loc("arg_FULL_KV_IDX"(#loc)) +#loc352 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc353 = loc("arg_FULL_Q_IDX"(#loc)) +#loc354 = loc("in_ptr16"(#loc)) +#loc355 = loc("out_ptr0"(#loc)) +#loc356 = loc("ks0"(#loc)) +#loc357 = loc("ks1"(#loc)) +#loc358 = loc("ks2"(#loc)) +#loc359 = loc("ks3"(#loc)) +#loc360 = loc("ks4"(#loc)) +#loc361 = loc("ks5"(#loc)) +#loc362 = loc("ks6"(#loc)) +#loc363 = loc("ks7"(#loc)) +#loc364 = loc("ks8"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.0883883461> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_9 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_10 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_11 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %cst_12 = arith.constant dense<8192> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_13 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc1) + %cst_14 = arith.constant dense<262144> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<8192> : tensor<64x128xi32, #blocked> loc(#loc1) + %cst_16 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_17 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc1) + %cst_18 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_19 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_20 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_21 = arith.constant dense<0xFF800000> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_22 = arith.constant dense<0.000000e+00> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_23 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_24 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_25 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_26 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc3) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc4) + %3 = arith.cmpi sle, %ks0, %c1_i32 : i32 loc(#loc5) + %4 = arith.extui %3 : i1 to i32 loc(#loc6) + %5 = arith.cmpi sgt, %ks0, %c1_i32 : i32 loc(#loc7) + %6 = arith.extui %5 : i1 to i32 loc(#loc8) + %7 = arith.muli %ks0, %6 : i32 loc(#loc8) + %8 = arith.addi %4, %7 : i32 loc(#loc9) + %9 = arith.muli %8, %c4096_i32 : i32 loc(#loc10) + %10 = arith.muli %8, %c128_i32 : i32 loc(#loc11) + %pid = tt.get_program_id x : i32 loc(#loc365) + %NUM_KV_BLOCKS = arith.addi %ks1, %c127_i32 : i32 loc(#loc669) + %NUM_KV_BLOCKS_27 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc670) + %NUM_Q_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc671) + %NUM_Q_BLOCKS_28 = arith.divsi %NUM_Q_BLOCKS, %c128_i32 : i32 loc(#loc672) + %off_zq = tt.get_program_id y : i32 loc(#loc368) + %off_hkv = tt.get_program_id z : i32 loc(#loc369) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc370) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc371) + %k_adj_29 = arith.muli %1, %off_zkv : i32 loc(#loc372) + %k_adj_30 = arith.addi %k_adj, %k_adj_29 : i32 loc(#loc373) + %k_adj_31 = arith.extsi %k_adj_30 : i32 to i64 loc(#loc374) + %dv_adj = arith.muli %1, %off_zq : i32 loc(#loc375) + %dv_adj_32 = arith.addi %k_adj, %dv_adj : i32 loc(#loc376) + %dv_adj_33 = arith.extsi %dv_adj_32 : i32 to i64 loc(#loc377) + %K = tt.addptr %arg_K, %k_adj_31 : !tt.ptr, i64 loc(#loc378) + %V = tt.addptr %arg_V, %k_adj_31 : !tt.ptr, i64 loc(#loc379) + %DV = tt.addptr %arg_DV, %dv_adj_33 : !tt.ptr, i64 loc(#loc380) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc381) + %offs_k_34 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc381) + %11 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc31) + scf.if %11 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc382) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS_28 : i32 loc(#loc383) + %off_hq2_35 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc384) + %off_hq2_36 = arith.addi %off_hq2, %off_hq2_35 : i32 loc(#loc385) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS_28 : i32 loc(#loc386) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc387) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc388) + %sparse_kv_num_blks_offset_37 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc389) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc390) + %sparse_kv_idx_offset_38 = arith.muli %start_m2_block, %ks4 : i32 loc(#loc391) + %sparse_kv_idx_offset_39 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_38 : i32 loc(#loc392) + %q_adj2 = arith.muli %off_hq2_36, %c128_i32 : i32 loc(#loc393) + %q_adj2_40 = arith.muli %0, %off_zq : i32 loc(#loc394) + %q_adj2_41 = arith.addi %q_adj2, %q_adj2_40 : i32 loc(#loc395) + %q_adj2_42 = arith.extsi %q_adj2_41 : i32 to i64 loc(#loc396) + %do_adj2 = arith.muli %10, %off_hq2_36 : i32 loc(#loc397) + %do_adj2_43 = arith.muli %9, %off_zq : i32 loc(#loc398) + %do_adj2_44 = arith.addi %do_adj2, %do_adj2_43 : i32 loc(#loc399) + %do_adj2_45 = arith.extsi %do_adj2_44 : i32 to i64 loc(#loc400) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc401) + %off_chz2_46 = arith.addi %off_chz2, %off_hq2_36 : i32 loc(#loc402) + %off_chz2_47 = arith.muli %off_chz2_46, %ks0 : i32 loc(#loc403) + %off_chz2_48 = arith.extsi %off_chz2_47 : i32 to i64 loc(#loc404) + %Q2 = tt.addptr %arg_Q, %q_adj2_42 : !tt.ptr, i64 loc(#loc405) + %DO2 = tt.addptr %arg_DO, %do_adj2_45 : !tt.ptr, i64 loc(#loc406) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_42 : !tt.ptr, i64 loc(#loc407) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_48 : !tt.ptr, i64 loc(#loc408) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_48 : !tt.ptr, i64 loc(#loc409) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc410) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc411) + %offs_m2_49 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc411) + %offs_m2_50 = arith.addi %offs_m2, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc411) + %offs_m2_51 = arith.addi %offs_m2_49, %offs_k_34 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc411) + %ptr = tt.expand_dims %offs_m2_50 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc673) + %ptr_52 = tt.expand_dims %offs_m2_51 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc673) + %ptr_53 = arith.muli %ptr, %cst_2 : tensor<128x1xi32, #blocked> loc(#loc674) + %ptr_54 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc675) + %ptr_55 = tt.addptr %ptr_54, %ptr_53 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc675) + %ptr_56 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc676) + %ptr_57 = tt.expand_dims %ptr_56 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc676) + %ptr_58 = tt.broadcast %ptr_55 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc677) + %ptr_59 = tt.broadcast %ptr_57 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc677) + %ptr_60 = tt.addptr %ptr_58, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc677) + %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc678) + %q_61 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc678) + %q_62 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc678) + %q_63 = tt.broadcast %q_62 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc679) + %q_64 = tt.load %ptr_60, %q_63, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc679) + %q_65 = ttg.local_alloc %q_64 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc679) + %ptr_66 = arith.muli %ptr, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc680) + %ptr_67 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc681) + %ptr_68 = tt.addptr %ptr_67, %ptr_66 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc681) + %ptr_69 = tt.broadcast %ptr_68 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc682) + %ptr_70 = tt.addptr %ptr_69, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc682) + %do = tt.load %ptr_70, %q_63, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc683) + %do_71 = ttg.local_alloc %do : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc683) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc419) + %Di_72 = arith.cmpi slt, %offs_m2_51, %Di : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc419) + %Di_73 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc420) + %Di_74 = tt.addptr %Di_73, %offs_m2_51 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc420) + %Di_75 = tt.load %Di_74, %Di_72 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc421) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc422) + %lse_76 = tt.addptr %lse, %offs_m2_51 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc422) + %lse_77 = tt.load %lse_76, %Di_72 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc423) + %lse_78 = arith.cmpf oeq, %lse_77, %cst_26 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc424) + %lse_79 = arith.select %lse_78, %cst_25, %lse_77 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc425) + %lse_80 = tt.expand_dims %lse_79 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc426) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_39 : !tt.ptr, i32 loc(#loc427) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc428) + %kv_start_81 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc429) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_37 : !tt.ptr, i32 loc(#loc430) + %sparse_kv_num_blocks_82 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc431) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc432) + %offs_n2_83 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc432) + %offs_n2_84 = tt.splat %kv_start_81 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc433) + %offs_n2_85 = tt.splat %kv_start_81 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc433) + %offs_n2_86 = arith.addi %offs_n2_84, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc433) + %offs_n2_87 = arith.addi %offs_n2_85, %offs_n2_83 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc433) + %kT_ptrs = tt.expand_dims %offs_n2_87 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc684) + %kT_ptrs_88 = arith.muli %kT_ptrs, %cst_20 : tensor<1x64xi32, #blocked1> loc(#loc685) + %kT_ptrs_89 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc686) + %kT_ptrs_90 = tt.addptr %kT_ptrs_89, %kT_ptrs_88 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc686) + %kT_ptrs_91 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc687) + %kT_ptrs_92 = tt.expand_dims %kT_ptrs_91 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc687) + %kT_ptrs_93 = tt.broadcast %kT_ptrs_90 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc688) + %kT_ptrs_94 = tt.broadcast %kT_ptrs_92 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc688) + %kT_ptrs_95 = tt.addptr %kT_ptrs_93, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc688) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc689) + %vT_ptrs_96 = tt.addptr %vT_ptrs, %kT_ptrs_88 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc689) + %vT_ptrs_97 = tt.broadcast %vT_ptrs_96 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc690) + %vT_ptrs_98 = tt.addptr %vT_ptrs_97, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc690) + %hi = arith.muli %sparse_kv_num_blocks_82, %c2_i32 : i32 loc(#loc691) + %hi_99 = arith.addi %ks1, %c63_i32 : i32 loc(#loc853) + %hi_100 = arith.divsi %hi_99, %c64_i32 : i32 loc(#loc854) + %hi_101 = arith.maxsi %hi_100, %c1_i32 : i32 loc(#loc693) + %hi_102 = arith.minsi %hi, %hi_101 : i32 loc(#loc694) + %kT = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc1008) + %kT_103 = tt.splat %ks1 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc1008) + %m = arith.remsi %ptr_52, %q_61 : tensor<128x1xi32, #mma> loc(#loc1009) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc857) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc858) + %vT_ptrs_104 = arith.cmpi sgt, %hi_102, %c0_i32 : i32 loc(#loc1020) + %tmp7_105 = tt.load %tmp7, %vT_ptrs_104 : !tt.ptr loc(#loc860) + %tmp8 = tt.splat %tmp7_105 : i64 -> tensor<1x64xi64, #mma> loc(#loc861) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc862) + %tmp10 = tt.splat %tmp7_105 : i64 -> tensor<128x1xi64, #mma> loc(#loc863) + %tmp10_106 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc863) + %tmp11 = tt.broadcast %tmp10_106 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc864) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32, #mma> loc(#loc865) + %tmp20 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc866) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1, #mma> loc(#loc867) + %tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32, #mma> loc(#loc868) + %tmp32 = tt.splat %tmp20 : i1 -> tensor<128x64xi1, #mma> loc(#loc869) + %p = tt.broadcast %lse_80 : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc870) + %ds = tt.expand_dims %Di_75 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc871) + %ds_107 = tt.broadcast %ds : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc872) + %kT_108 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1011) + %vT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1012) + %kT_109 = arith.cmpi slt, %kT_ptrs, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_110 = tt.broadcast %kT_109 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_111 = ttg.memdesc_index %kT_108[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_112 = tt.splat %vT_ptrs_104 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_113 = arith.andi %vT_ptrs_112, %kT_110 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_114 = ttg.async_copy_global_to_local %kT_ptrs_95, %kT_111 mask %vT_ptrs_113 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_115 = ttg.async_commit_group tokens %kT_114 loc(#loc1011) + %vT_116 = ttg.memdesc_index %vT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_117 = ttg.async_copy_global_to_local %vT_ptrs_98, %vT_116 mask %vT_ptrs_113 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_118 = ttg.async_commit_group tokens %vT_117 loc(#loc1012) + %vT_ptrs_119 = arith.cmpi sgt, %hi_102, %c1_i32 : i32 loc(#loc1020) + %kT_ptrs_120 = tt.addptr %kT_ptrs_95, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc697) + %vT_ptrs_121 = tt.addptr %vT_ptrs_98, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc698) + %offs_n2_122 = arith.addi %offs_n2_87, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %kT_123 = tt.expand_dims %offs_n2_122 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1013) + %kT_124 = arith.cmpi slt, %kT_123, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_125 = tt.broadcast %kT_124 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_126 = ttg.memdesc_index %kT_108[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_127 = tt.splat %vT_ptrs_119 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_128 = arith.andi %vT_ptrs_127, %kT_125 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_129 = ttg.async_copy_global_to_local %kT_ptrs_120, %kT_126 mask %vT_ptrs_128 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_130 = ttg.async_commit_group tokens %kT_129 loc(#loc1011) + %vT_131 = ttg.memdesc_index %vT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_132 = ttg.async_copy_global_to_local %vT_ptrs_121, %vT_131 mask %vT_ptrs_128 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_133 = ttg.async_commit_group tokens %vT_132 loc(#loc1012) + ttng.fence_async_shared {bCluster = false} loc(#loc874) + %vT_ptrs_134:12 = scf.for %vT_ptrs_190 = %c0_i32 to %hi_102 step %c1_i32 iter_args(%arg28 = %cst_7, %kT_ptrs_191 = %kT_ptrs_120, %offs_n2_192 = %offs_n2_122, %vT_ptrs_193 = %vT_ptrs_121, %offs_n2_194 = %offs_n2_86, %arg33 = %c1_i32, %arg34 = %c-1_i32, %kT_195 = %kT_115, %kT_196 = %kT_130, %vT_197 = %vT_118, %vT_198 = %vT_133, %arg39 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_199 = arith.subi %hi_102, %c2_i32 : i32 loc(#loc1020) + %vT_ptrs_200 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_199 : i32 loc(#loc1020) + %vT_ptrs_201 = arith.subi %hi_102, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_202 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_201 : i32 loc(#loc1020) + %vT_ptrs_203 = arith.addi %arg34, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_204 = arith.cmpi sge, %vT_ptrs_203, %c3_i32 : i32 loc(#loc1020) + %vT_ptrs_205 = arith.select %vT_ptrs_204, %c0_i32, %vT_ptrs_203 : i32 loc(#loc1020) + %kT_206 = tt.expand_dims %offs_n2_194 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc1013) + %kT_207 = arith.cmpi slt, %kT_206, %kT : tensor<1x64xi32, #mma> loc(#loc1008) + %kT_208 = tt.broadcast %kT_207 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc1011) + %kT_209 = ttg.async_wait %kT_195, %vT_197 {num = 2 : i32} loc(#loc1011) + %kT_210 = ttg.memdesc_index %kT_108[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %dq_211 = ttg.memdesc_trans %kT_210 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc875) + %qk = ttng.warp_group_dot %q_65, %kT_210, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc874) + %qk_212:3 = ttng.warp_group_dot_wait %qk, %q_65, %kT_210 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc874) + %qk_213 = arith.mulf %qk_212#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc876) + %n = arith.remsi %kT_206, %kT : tensor<1x64xi32, #mma> loc(#loc1014) + %post_mod_scores = arith.select %kT_208, %qk_213, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc878) + %tmp4_214 = tt.broadcast %n : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc857) + %tmp4_215 = arith.cmpi sge, %tmp4, %tmp4_214 : tensor<128x64xi32, #mma> loc(#loc857) + %tmp5 = arith.extsi %n : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc879) + %tmp8_216 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc861) + %tmp11_217 = tt.broadcast %tmp8_216 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc864) + %tmp11_218 = arith.andi %tmp11_217, %tmp11 : tensor<128x64xi1, #mma> loc(#loc864) + %tmp12 = arith.andi %tmp4_215, %tmp11_218 : tensor<128x64xi1, #mma> loc(#loc880) + %tmp15_219 = arith.cmpi sge, %n, %tmp15 : tensor<1x64xi32, #mma> loc(#loc865) + %tmp16 = arith.remsi %n, %tmp15 : tensor<1x64xi32, #mma> loc(#loc881) + %tmp18 = arith.cmpi ne, %tmp16, %cst_24 : tensor<1x64xi32, #mma> loc(#loc882) + %tmp19 = arith.cmpi slt, %tmp16, %cst_24 : tensor<1x64xi32, #mma> loc(#loc883) + %tmp21_220 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1, #mma> loc(#loc867) + %tmp22 = arith.andi %tmp18, %tmp21_220 : tensor<1x64xi1, #mma> loc(#loc884) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32, #mma> loc(#loc885) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc886) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc887) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc888) + %tmp27 = arith.andi %tmp15_219, %tmp26 : tensor<1x64xi1, #mma> loc(#loc889) + %tmp28 = arith.subi %tmp4_214, %tmp4 : tensor<128x64xi32, #mma> loc(#loc890) + %tmp29_221 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32, #mma> loc(#loc868) + %tmp30 = arith.cmpi ne, %tmp29_221, %cst_23 : tensor<128x64xi32, #mma> loc(#loc891) + %tmp31 = arith.cmpi slt, %tmp29_221, %cst_23 : tensor<128x64xi32, #mma> loc(#loc892) + %tmp32_222 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1, #mma> loc(#loc869) + %tmp33 = arith.andi %tmp30, %tmp32_222 : tensor<128x64xi1, #mma> loc(#loc893) + %tmp34 = arith.addi %tmp29_221, %tmp29 : tensor<128x64xi32, #mma> loc(#loc894) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_221 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc895) + %tmp36 = arith.cmpi eq, %tmp35, %cst_23 : tensor<128x64xi32, #mma> loc(#loc896) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc897) + %tmp37_223 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc897) + %tmp38 = arith.ori %tmp12, %tmp37_223 : tensor<128x64xi1, #mma> loc(#loc898) + %post_mod_scores_224 = arith.select %tmp38, %post_mod_scores, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc899) + %post_mod_scores_225 = arith.mulf %post_mod_scores_224, %cst_11 : tensor<128x64xf32, #mma> loc(#loc900) + %p_226 = arith.subf %post_mod_scores_225, %p : tensor<128x64xf32, #mma> loc(#loc870) + %p_227 = math.exp2 %p_226 : tensor<128x64xf32, #mma> loc(#loc901) + %vT_228 = ttg.memdesc_index %vT[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %dp = ttng.warp_group_dot %do_71, %vT_228, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc902) + %dp_229:3 = ttng.warp_group_dot_wait %dp, %do_71, %vT_228 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc902) + %ds_230 = arith.subf %dp_229#0, %ds_107 : tensor<128x64xf32, #mma> loc(#loc872) + %ds_231 = arith.mulf %p_227, %ds_230 : tensor<128x64xf32, #mma> loc(#loc903) + %grad_scores = arith.select %kT_208, %ds_231, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc904) + %ds_232 = arith.select %tmp38, %grad_scores, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc905) + %ds_233 = arith.truncf %ds_232 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc906) + %ds_234 = ttg.convert_layout %ds_233 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc906) + %dq_235 = ttng.warp_group_dot %ds_234, %dq_211, %arg28 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc907) + %offs_n2_236 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc699) + %offs_n2_237 = arith.addi %offs_n2_194, %offs_n2_236 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc699) + %vT_ptrs_238 = arith.addi %vT_ptrs_190, %c1_i32 : i32 loc(#loc1020) + %cur_block_idx = arith.divsi %vT_ptrs_238, %c2_i32 : i32 loc(#loc908) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc909) + %cur_block_239 = tt.load %cur_block, %vT_ptrs_202 evictionPolicy = evict_last : !tt.ptr loc(#loc910) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc911) + %next_block_240 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_82 : i32 loc(#loc912) + %next_block_241 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc913) + %vT_ptrs_242 = arith.andi %vT_ptrs_202, %next_block_240 : i1 loc(#loc1020) + %next_block_243 = tt.load %next_block_241, %vT_ptrs_242 evictionPolicy = evict_last : !tt.ptr loc(#loc914) + %needs_jump = arith.addi %vT_ptrs_190, %c2_i32 : i32 loc(#loc915) + %needs_jump_244 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc916) + %needs_jump_245 = arith.cmpi eq, %needs_jump_244, %c0_i32 : i32 loc(#loc917) + %jump_to_block = arith.subi %next_block_243, %cur_block_239 : i32 loc(#loc918) + %jump_to_block_246 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc919) + %jump_to_block_247 = arith.subi %jump_to_block_246, %c64_i32 : i32 loc(#loc920) + %offset = arith.extui %needs_jump_245 : i1 to i32 loc(#loc921) + %offset_248 = arith.muli %jump_to_block_247, %offset : i32 loc(#loc921) + %offset_249 = arith.subi %c1_i32, %offset : i32 loc(#loc922) + %offset_250 = arith.muli %offset_249, %c64_i32 : i32 loc(#loc923) + %offset_251 = arith.addi %offset_248, %offset_250 : i32 loc(#loc924) + %kT_ptrs_252 = arith.muli %offset_251, %c128_i32 : i32 loc(#loc701) + %kT_ptrs_253 = tt.splat %kT_ptrs_252 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc697) + %kT_ptrs_254 = tt.addptr %kT_ptrs_191, %kT_ptrs_253 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc697) + %vT_ptrs_255 = tt.addptr %vT_ptrs_193, %kT_ptrs_253 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc698) + %offs_n2_256 = tt.splat %offset_251 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %offs_n2_257 = arith.addi %offs_n2_192, %offs_n2_256 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc699) + %vT_ptrs_258 = arith.addi %arg33, %c1_i32 : i32 loc(#loc1020) + %vT_ptrs_259 = arith.cmpi sge, %vT_ptrs_258, %c3_i32 : i32 loc(#loc1020) + %vT_ptrs_260 = arith.select %vT_ptrs_259, %c0_i32, %vT_ptrs_258 : i32 loc(#loc1020) + %kT_261 = tt.expand_dims %offs_n2_257 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1013) + %kT_262 = arith.cmpi slt, %kT_261, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1008) + %kT_263 = tt.broadcast %kT_262 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1011) + %kT_264 = ttg.memdesc_index %kT_108[%vT_ptrs_260] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %vT_ptrs_265 = tt.splat %vT_ptrs_200 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1020) + %vT_ptrs_266 = arith.andi %vT_ptrs_265, %kT_263 : tensor<128x64xi1, #blocked1> loc(#loc1020) + %kT_267 = ttg.async_copy_global_to_local %kT_ptrs_254, %kT_264 mask %vT_ptrs_266 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1011) + %kT_268 = ttg.async_commit_group tokens %kT_267 loc(#loc1011) + %vT_269 = ttg.memdesc_index %vT[%vT_ptrs_260] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_270 = ttg.async_copy_global_to_local %vT_ptrs_255, %vT_269 mask %vT_ptrs_266 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1012) + %vT_271 = ttg.async_commit_group tokens %vT_270 loc(#loc1012) + scf.yield %dq_235, %kT_ptrs_254, %offs_n2_257, %vT_ptrs_255, %offs_n2_237, %vT_ptrs_260, %vT_ptrs_205, %kT_196, %kT_268, %vT_198, %vT_271, %offset_251 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc1020) + } loc(#loc1020) + %vT_ptrs_135 = ttng.warp_group_dot_wait %vT_ptrs_134#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc1020) + %vT_ptrs_136 = ttg.async_wait {num = 0 : i32} loc(#loc1020) + ttg.local_dealloc %vT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1020) + ttg.local_dealloc %kT_108 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1020) + %kv_indices_137 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_39 : !tt.ptr, i32 loc(#loc522) + %kv_start_138 = tt.load %kv_indices_137 : !tt.ptr loc(#loc523) + %kv_start_139 = arith.muli %kv_start_138, %c128_i32 : i32 loc(#loc524) + %sparse_kv_num_blocks_140 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_37 : !tt.ptr, i32 loc(#loc525) + %sparse_kv_num_blocks_141 = tt.load %sparse_kv_num_blocks_140 : !tt.ptr loc(#loc526) + %offs_n2_142 = tt.splat %kv_start_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc527) + %offs_n2_143 = tt.splat %kv_start_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc527) + %offs_n2_144 = arith.addi %offs_n2_142, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc527) + %offs_n2_145 = arith.addi %offs_n2_143, %offs_n2_83 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc527) + %kT_ptrs_146 = tt.expand_dims %offs_n2_145 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc702) + %kT_ptrs_147 = arith.muli %kT_ptrs_146, %cst_20 : tensor<1x64xi32, #blocked1> loc(#loc703) + %kT_ptrs_148 = tt.addptr %kT_ptrs_89, %kT_ptrs_147 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc704) + %kT_ptrs_149 = tt.broadcast %kT_ptrs_148 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc705) + %kT_ptrs_150 = tt.addptr %kT_ptrs_149, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc705) + %vT_ptrs_151 = tt.addptr %vT_ptrs, %kT_ptrs_147 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc706) + %vT_ptrs_152 = tt.broadcast %vT_ptrs_151 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc707) + %vT_ptrs_153 = tt.addptr %vT_ptrs_152, %kT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc707) + %hi_154 = arith.muli %sparse_kv_num_blocks_141, %c2_i32 : i32 loc(#loc708) + %hi_155 = arith.minsi %hi_154, %hi_101 : i32 loc(#loc709) + %kT_156 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1015) + %vT_157 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1016) + %vT_ptrs_158 = arith.cmpi sgt, %hi_155, %c0_i32 : i32 loc(#loc1021) + %kT_159 = arith.cmpi slt, %kT_ptrs_146, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_160 = tt.broadcast %kT_159 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_161 = ttg.memdesc_index %kT_156[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_162 = tt.splat %vT_ptrs_158 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_163 = arith.andi %vT_ptrs_162, %kT_160 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_164 = ttg.async_copy_global_to_local %kT_ptrs_150, %kT_161 mask %vT_ptrs_163 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_165 = ttg.async_commit_group tokens %kT_164 loc(#loc1015) + %vT_166 = ttg.memdesc_index %vT_157[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_167 = ttg.async_copy_global_to_local %vT_ptrs_153, %vT_166 mask %vT_ptrs_163 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_168 = ttg.async_commit_group tokens %vT_167 loc(#loc1016) + %vT_ptrs_169 = arith.cmpi sgt, %hi_155, %c1_i32 : i32 loc(#loc1021) + %kT_ptrs_170 = tt.addptr %kT_ptrs_150, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc711) + %vT_ptrs_171 = tt.addptr %vT_ptrs_153, %cst_12 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc712) + %offs_n2_172 = arith.addi %offs_n2_145, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %kT_173 = tt.expand_dims %offs_n2_172 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1018) + %kT_174 = arith.cmpi slt, %kT_173, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_175 = tt.broadcast %kT_174 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_176 = ttg.memdesc_index %kT_156[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_177 = tt.splat %vT_ptrs_169 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_178 = arith.andi %vT_ptrs_177, %kT_175 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_179 = ttg.async_copy_global_to_local %kT_ptrs_170, %kT_176 mask %vT_ptrs_178 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_180 = ttg.async_commit_group tokens %kT_179 loc(#loc1015) + %vT_181 = ttg.memdesc_index %vT_157[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_182 = ttg.async_copy_global_to_local %vT_ptrs_171, %vT_181 mask %vT_ptrs_178 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_183 = ttg.async_commit_group tokens %vT_182 loc(#loc1016) + ttng.fence_async_shared {bCluster = false} loc(#loc927) + %vT_ptrs_184:12 = scf.for %vT_ptrs_190 = %c0_i32 to %hi_155 step %c1_i32 iter_args(%vT_ptrs_191 = %vT_ptrs_135, %kT_ptrs_192 = %kT_ptrs_170, %offs_n2_193 = %offs_n2_172, %vT_ptrs_194 = %vT_ptrs_171, %offs_n2_195 = %offs_n2_144, %arg33 = %c1_i32, %arg34 = %c-1_i32, %kT_196 = %kT_165, %kT_197 = %kT_180, %vT_198 = %vT_168, %vT_199 = %vT_183, %arg39 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_200 = arith.subi %hi_155, %c2_i32 : i32 loc(#loc1021) + %vT_ptrs_201 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_200 : i32 loc(#loc1021) + %vT_ptrs_202 = arith.subi %hi_155, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_203 = arith.cmpi slt, %vT_ptrs_190, %vT_ptrs_202 : i32 loc(#loc1021) + %vT_ptrs_204 = arith.addi %arg34, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_205 = arith.cmpi sge, %vT_ptrs_204, %c3_i32 : i32 loc(#loc1021) + %vT_ptrs_206 = arith.select %vT_ptrs_205, %c0_i32, %vT_ptrs_204 : i32 loc(#loc1021) + %kT_207 = tt.expand_dims %offs_n2_195 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc1018) + %kT_208 = arith.cmpi slt, %kT_207, %kT : tensor<1x64xi32, #mma> loc(#loc1017) + %kT_209 = tt.broadcast %kT_208 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc1015) + %kT_210 = ttg.async_wait %kT_196, %vT_198 {num = 2 : i32} loc(#loc1015) + %kT_211 = ttg.memdesc_index %kT_156[%vT_ptrs_206] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %dq_212 = ttg.memdesc_trans %kT_211 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc928) + %qk = ttng.warp_group_dot %q_65, %kT_211, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc927) + %qk_213:3 = ttng.warp_group_dot_wait %qk, %q_65, %kT_211 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc927) + %qk_214 = arith.mulf %qk_213#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc929) + %post_mod_scores = arith.select %kT_209, %qk_214, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc930) + %post_mod_scores_215 = arith.mulf %post_mod_scores, %cst_11 : tensor<128x64xf32, #mma> loc(#loc931) + %p_216 = arith.subf %post_mod_scores_215, %p : tensor<128x64xf32, #mma> loc(#loc932) + %p_217 = math.exp2 %p_216 : tensor<128x64xf32, #mma> loc(#loc933) + %vT_218 = ttg.memdesc_index %vT_157[%vT_ptrs_206] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %dp = ttng.warp_group_dot %do_71, %vT_218, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc934) + %dp_219:3 = ttng.warp_group_dot_wait %dp, %do_71, %vT_218 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc934) + %ds_220 = arith.subf %dp_219#0, %ds_107 : tensor<128x64xf32, #mma> loc(#loc935) + %ds_221 = arith.mulf %p_217, %ds_220 : tensor<128x64xf32, #mma> loc(#loc936) + %grad_scores = arith.select %kT_209, %ds_221, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc937) + %ds_222 = arith.truncf %grad_scores : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc938) + %ds_223 = ttg.convert_layout %ds_222 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc938) + %dq_224 = ttng.warp_group_dot %ds_223, %dq_212, %vT_ptrs_191 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc939) + %offs_n2_225 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc713) + %offs_n2_226 = arith.addi %offs_n2_195, %offs_n2_225 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc713) + %vT_ptrs_227 = arith.addi %vT_ptrs_190, %c1_i32 : i32 loc(#loc1021) + %cur_block_idx = arith.divsi %vT_ptrs_227, %c2_i32 : i32 loc(#loc940) + %cur_block = tt.addptr %kv_indices_137, %cur_block_idx : !tt.ptr, i32 loc(#loc941) + %cur_block_228 = tt.load %cur_block, %vT_ptrs_203 evictionPolicy = evict_last : !tt.ptr loc(#loc942) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc943) + %next_block_229 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_141 : i32 loc(#loc944) + %next_block_230 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc945) + %vT_ptrs_231 = arith.andi %vT_ptrs_203, %next_block_229 : i1 loc(#loc1021) + %next_block_232 = tt.load %next_block_230, %vT_ptrs_231 evictionPolicy = evict_last : !tt.ptr loc(#loc946) + %needs_jump = arith.addi %vT_ptrs_190, %c2_i32 : i32 loc(#loc947) + %needs_jump_233 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc948) + %needs_jump_234 = arith.cmpi eq, %needs_jump_233, %c0_i32 : i32 loc(#loc949) + %jump_to_block = arith.subi %next_block_232, %cur_block_228 : i32 loc(#loc950) + %jump_to_block_235 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc951) + %jump_to_block_236 = arith.subi %jump_to_block_235, %c64_i32 : i32 loc(#loc952) + %offset = arith.extui %needs_jump_234 : i1 to i32 loc(#loc953) + %offset_237 = arith.muli %jump_to_block_236, %offset : i32 loc(#loc953) + %offset_238 = arith.subi %c1_i32, %offset : i32 loc(#loc954) + %offset_239 = arith.muli %offset_238, %c64_i32 : i32 loc(#loc955) + %offset_240 = arith.addi %offset_237, %offset_239 : i32 loc(#loc956) + %kT_ptrs_241 = arith.muli %offset_240, %c128_i32 : i32 loc(#loc715) + %kT_ptrs_242 = tt.splat %kT_ptrs_241 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc711) + %kT_ptrs_243 = tt.addptr %kT_ptrs_192, %kT_ptrs_242 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc711) + %vT_ptrs_244 = tt.addptr %vT_ptrs_194, %kT_ptrs_242 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc712) + %offs_n2_245 = tt.splat %offset_240 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %offs_n2_246 = arith.addi %offs_n2_193, %offs_n2_245 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc713) + %vT_ptrs_247 = arith.addi %arg33, %c1_i32 : i32 loc(#loc1021) + %vT_ptrs_248 = arith.cmpi sge, %vT_ptrs_247, %c3_i32 : i32 loc(#loc1021) + %vT_ptrs_249 = arith.select %vT_ptrs_248, %c0_i32, %vT_ptrs_247 : i32 loc(#loc1021) + %kT_250 = tt.expand_dims %offs_n2_246 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc1018) + %kT_251 = arith.cmpi slt, %kT_250, %kT_103 : tensor<1x64xi32, #blocked1> loc(#loc1017) + %kT_252 = tt.broadcast %kT_251 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc1015) + %kT_253 = ttg.memdesc_index %kT_156[%vT_ptrs_249] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %vT_ptrs_254 = tt.splat %vT_ptrs_201 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1021) + %vT_ptrs_255 = arith.andi %vT_ptrs_254, %kT_252 : tensor<128x64xi1, #blocked1> loc(#loc1021) + %kT_256 = ttg.async_copy_global_to_local %kT_ptrs_243, %kT_253 mask %vT_ptrs_255 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1015) + %kT_257 = ttg.async_commit_group tokens %kT_256 loc(#loc1015) + %vT_258 = ttg.memdesc_index %vT_157[%vT_ptrs_249] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_259 = ttg.async_copy_global_to_local %vT_ptrs_244, %vT_258 mask %vT_ptrs_255 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc1016) + %vT_260 = ttg.async_commit_group tokens %vT_259 loc(#loc1016) + scf.yield %dq_224, %kT_ptrs_243, %offs_n2_246, %vT_ptrs_244, %offs_n2_226, %vT_ptrs_249, %vT_ptrs_206, %kT_197, %kT_257, %vT_199, %vT_260, %offset_240 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc1021) + } loc(#loc1021) + %vT_ptrs_185 = ttng.warp_group_dot_wait %vT_ptrs_184#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc1021) + %vT_ptrs_186 = ttg.async_wait {num = 0 : i32} loc(#loc1021) + ttg.local_dealloc %vT_157 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1021) + ttg.local_dealloc %kT_156 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1021) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc529) + %dq_ptrs_187 = tt.addptr %dq_ptrs, %ptr_53 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc529) + %dq_ptrs_188 = tt.broadcast %dq_ptrs_187 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc530) + %dq_ptrs_189 = tt.addptr %dq_ptrs_188, %ptr_59 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc530) + %dq = arith.mulf %vT_ptrs_185, %cst_6 : tensor<128x128xf32, #mma1> loc(#loc531) + %12 = arith.cmpi slt, %ptr_57, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc189) + %13 = tt.broadcast %12 : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc190) + %14 = arith.andi %q_63, %13 : tensor<128x128xi1, #blocked> loc(#loc190) + %15 = arith.truncf %dq : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc191) + %16 = ttg.convert_layout %15 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc191) + tt.store %dq_ptrs_189, %16, %14 : tensor<128x128x!tt.ptr, #blocked> loc(#loc191) + } else { + %stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc532) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc533) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc534) + %offs_n1_35 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc534) + %offs_n1_36 = arith.addi %offs_n1, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc534) + %offs_n1_37 = arith.addi %offs_n1_35, %offs_k_34 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc534) + %ptr = tt.expand_dims %offs_n1_36 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc716) + %ptr_38 = tt.expand_dims %offs_n1_37 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc716) + %ptr_39 = arith.muli %ptr, %cst_0 : tensor<128x1xi32, #blocked> loc(#loc717) + %ptr_40 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc718) + %ptr_41 = tt.addptr %ptr_40, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc718) + %ptr_42 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc719) + %ptr_43 = tt.expand_dims %ptr_42 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc719) + %ptr_44 = tt.broadcast %ptr_41 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc720) + %ptr_45 = tt.broadcast %ptr_43 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc720) + %ptr_46 = tt.addptr %ptr_44, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc720) + %k = tt.splat %ks1 : i32 -> tensor<128x1xi32, #blocked> loc(#loc721) + %k_47 = tt.splat %ks1 : i32 -> tensor<128x1xi32, #mma> loc(#loc721) + %k_48 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32, #blocked> loc(#loc721) + %k_49 = tt.broadcast %k_48 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc722) + %k_50 = tt.load %ptr_46, %k_49, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc722) + %k_51 = ttg.local_alloc %k_50 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc722) + %ptr_52 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc723) + %ptr_53 = tt.addptr %ptr_52, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc723) + %ptr_54 = tt.broadcast %ptr_53 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc724) + %ptr_55 = tt.addptr %ptr_54, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc724) + %v = tt.load %ptr_55, %k_49, %cst_3 : tensor<128x128x!tt.ptr, #blocked> loc(#loc725) + %v_56 = ttg.local_alloc %v : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc725) + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc537) + %q_adj1 = arith.muli %0, %off_zq : i32 loc(#loc538) + %do_adj1 = arith.muli %9, %off_zq : i32 loc(#loc539) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc540) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks5 : i32 loc(#loc541) + %sparse_q_num_blks_offset_57 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc542) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc543) + %sparse_q_idx_offset_58 = arith.muli %pid, %ks6 : i32 loc(#loc544) + %sparse_q_idx_offset_59 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_58 : i32 loc(#loc545) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_59 : !tt.ptr, i32 loc(#loc546) + %q_start = tt.load %q_indices, %true : !tt.ptr loc(#loc547) + %q_start_60 = arith.muli %q_start, %c128_i32 : i32 loc(#loc548) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_57 : !tt.ptr, i32 loc(#loc549) + %sparse_q_num_blocks_61 = tt.load %sparse_q_num_blocks, %true : !tt.ptr loc(#loc550) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc551) + %offs_m1_62 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc551) + %offs_m1_63 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc551) + %offs_m1_64 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc552) + %offs_m1_65 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc552) + %offs_m1_66 = tt.splat %q_start_60 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc552) + %offs_m1_67 = arith.addi %offs_m1_64, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc552) + %offs_m1_68 = arith.addi %offs_m1_65, %offs_m1_62 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc552) + %offs_m1_69 = arith.addi %offs_m1_66, %offs_m1_63 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc552) + %qT_ptrs = tt.expand_dims %offs_m1_68 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc726) + %qT_ptrs_70 = arith.muli %qT_ptrs, %cst_19 : tensor<1x64xi32, #blocked1> loc(#loc727) + %qT_ptrs_71 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc728) + %qT_ptrs_72 = tt.expand_dims %qT_ptrs_71 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc728) + %qT_ptrs_73 = tt.broadcast %qT_ptrs_72 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc729) + %do_ptrs = tt.expand_dims %offs_m1_69 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc730) + %do_ptrs_74 = arith.muli %do_ptrs, %cst_18 : tensor<64x1xi32, #blocked> loc(#loc731) + %do_ptrs_75 = tt.broadcast %ptr_43 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc732) + %hi = arith.muli %sparse_q_num_blocks_61, %c2_i32 : i32 loc(#loc733) + %hi_76 = arith.addi %ks0, %c63_i32 : i32 loc(#loc957) + %hi_77 = arith.divsi %hi_76, %c64_i32 : i32 loc(#loc958) + %hi_78 = arith.maxsi %hi_77, %c1_i32 : i32 loc(#loc735) + %hi_79 = arith.minsi %hi, %hi_78 : i32 loc(#loc736) + %qT = tt.splat %ks0 : i32 -> tensor<1x64xi32, #mma> loc(#loc959) + %qT_80 = tt.splat %ks0 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc959) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %n = arith.remsi %ptr_38, %k_47 : tensor<128x1xi32, #mma> loc(#loc960) + %tmp44 = tt.broadcast %n : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc740) + %tmp45 = arith.extsi %n : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc741) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc742) + %do_ptrs_81 = arith.cmpi sgt, %hi_79, %c0_i32 : i32 loc(#loc1023) + %tmp47_82 = tt.load %tmp47, %do_ptrs_81 : !tt.ptr loc(#loc744) + %tmp48 = tt.splat %tmp47_82 : i64 -> tensor<128x1xi64, #mma> loc(#loc745) + %tmp48_83 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64, #mma> loc(#loc745) + %tmp50 = tt.splat %tmp47_82 : i64 -> tensor<1x64xi64, #mma> loc(#loc746) + %tmp51 = tt.broadcast %tmp48_83 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc747) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32, #mma> loc(#loc748) + %tmp55_84 = arith.cmpi sge, %n, %tmp55 : tensor<128x1xi32, #mma> loc(#loc748) + %tmp56 = arith.remsi %n, %tmp55 : tensor<128x1xi32, #mma> loc(#loc749) + %tmp58 = arith.cmpi ne, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc750) + %tmp59 = arith.cmpi slt, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc751) + %tmp60 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc752) + %tmp61 = tt.splat %tmp60 : i1 -> tensor<128x1xi1, #mma> loc(#loc753) + %tmp61_85 = arith.cmpi ne, %tmp59, %tmp61 : tensor<128x1xi1, #mma> loc(#loc753) + %tmp62 = arith.andi %tmp58, %tmp61_85 : tensor<128x1xi1, #mma> loc(#loc754) + %tmp63 = arith.addi %tmp56, %tmp55 : tensor<128x1xi32, #mma> loc(#loc755) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc756) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc757) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64, #mma> loc(#loc758) + %tmp67 = arith.andi %tmp55_84, %tmp66 : tensor<128x1xi1, #mma> loc(#loc759) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32, #mma> loc(#loc760) + %tmp72 = tt.splat %tmp60 : i1 -> tensor<128x64xi1, #mma> loc(#loc761) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc762) + %do = tt.splat %ks0 : i32 -> tensor<64x1xi32, #blocked> loc(#loc962) + %q_indices_86 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_59 : !tt.ptr, i32 loc(#loc592) + %q_start_87 = tt.load %q_indices_86, %true : !tt.ptr loc(#loc593) + %q_start_88 = arith.muli %q_start_87, %c128_i32 : i32 loc(#loc594) + %sparse_q_num_blocks_89 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_57 : !tt.ptr, i32 loc(#loc595) + %sparse_q_num_blocks_90 = tt.load %sparse_q_num_blocks_89, %true : !tt.ptr loc(#loc596) + %offs_m1_91 = tt.splat %q_start_88 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc597) + %offs_m1_92 = tt.splat %q_start_88 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc597) + %offs_m1_93 = tt.splat %q_start_88 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc597) + %offs_m1_94 = arith.addi %offs_m1_91, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc597) + %offs_m1_95 = arith.addi %offs_m1_92, %offs_m1_62 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc597) + %offs_m1_96 = arith.addi %offs_m1_93, %offs_m1_63 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc597) + %qT_ptrs_97 = tt.expand_dims %offs_m1_95 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc764) + %qT_ptrs_98 = arith.muli %qT_ptrs_97, %cst_19 : tensor<1x64xi32, #blocked1> loc(#loc765) + %do_ptrs_99 = tt.expand_dims %offs_m1_96 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc766) + %do_ptrs_100 = arith.muli %do_ptrs_99, %cst_18 : tensor<64x1xi32, #blocked> loc(#loc767) + %hi_101 = arith.muli %sparse_q_num_blocks_90, %c2_i32 : i32 loc(#loc768) + %hi_102 = arith.minsi %hi_101, %hi_78 : i32 loc(#loc769) + ttng.fence_async_shared {bCluster = false} loc(#loc770) + %dk:2 = scf.for %dk_116 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg28 = %cst_7, %arg29 = %cst_7) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>) : i32 { + %off_hq1_117 = arith.addi %off_hq1, %dk_116 : i32 loc(#loc600) + %q_adj1_118 = arith.muli %off_hq1_117, %c128_i32 : i32 loc(#loc601) + %q_adj1_119 = arith.addi %q_adj1_118, %q_adj1 : i32 loc(#loc602) + %q_adj1_120 = arith.extsi %q_adj1_119 : i32 to i64 loc(#loc603) + %do_adj1_121 = arith.muli %10, %off_hq1_117 : i32 loc(#loc604) + %do_adj1_122 = arith.addi %do_adj1_121, %do_adj1 : i32 loc(#loc605) + %do_adj1_123 = arith.extsi %do_adj1_122 : i32 to i64 loc(#loc606) + %off_chz1_124 = arith.addi %off_chz1, %off_hq1_117 : i32 loc(#loc607) + %off_chz1_125 = arith.muli %off_chz1_124, %ks0 : i32 loc(#loc608) + %off_chz1_126 = arith.extsi %off_chz1_125 : i32 to i64 loc(#loc609) + %Q1 = tt.addptr %arg_Q, %q_adj1_120 : !tt.ptr, i64 loc(#loc610) + %DO1 = tt.addptr %arg_DO, %do_adj1_123 : !tt.ptr, i64 loc(#loc611) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_126 : !tt.ptr, i64 loc(#loc612) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_126 : !tt.ptr, i64 loc(#loc613) + %qT_ptrs_127 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc772) + %qT_ptrs_128 = tt.addptr %qT_ptrs_127, %qT_ptrs_70 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc772) + %qT_ptrs_129 = tt.broadcast %qT_ptrs_128 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc729) + %qT_ptrs_130 = tt.addptr %qT_ptrs_129, %qT_ptrs_73 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc729) + %do_ptrs_131 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc773) + %do_ptrs_132 = tt.addptr %do_ptrs_131, %do_ptrs_74 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc773) + %do_ptrs_133 = tt.broadcast %do_ptrs_132 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc732) + %do_ptrs_134 = tt.addptr %do_ptrs_133, %do_ptrs_75 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc732) + %lse_135 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %qT_136 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc963) + %lse_137 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc776) + %do_138 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc964) + %Di_139 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc777) + %qT_140 = arith.cmpi slt, %qT_ptrs, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_141 = tt.broadcast %qT_140 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_142 = ttg.memdesc_index %qT_136[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_143 = tt.splat %do_ptrs_81 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_144 = arith.andi %do_ptrs_143, %qT_141 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_145 = ttg.async_copy_global_to_local %qT_ptrs_130, %qT_142 mask %do_ptrs_144 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_146 = ttg.async_commit_group tokens %qT_145 loc(#loc963) + %lse_147 = arith.cmpi slt, %offs_m1_67, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_148 = tt.addptr %lse_135, %offs_m1_67 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_149 = ttg.memdesc_index %lse_137[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_150 = tt.splat %do_ptrs_81 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_151 = arith.andi %do_ptrs_150, %lse_147 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_152 = ttg.async_copy_global_to_local %lse_148, %lse_149 mask %do_ptrs_151 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_153 = ttg.async_commit_group tokens %lse_152 loc(#loc776) + %do_154 = arith.cmpi slt, %do_ptrs, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_155 = tt.broadcast %do_154 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_156 = ttg.memdesc_index %do_138[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_157 = tt.splat %do_ptrs_81 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_158 = arith.andi %do_ptrs_157, %do_155 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_159 = ttg.async_copy_global_to_local %do_ptrs_134, %do_156 mask %do_ptrs_158 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_160 = ttg.async_commit_group tokens %do_159 loc(#loc964) + %Di_161 = tt.addptr %Di, %offs_m1_67 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_162 = ttg.memdesc_index %Di_139[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_163 = ttg.async_copy_global_to_local %Di_161, %Di_162 mask %do_ptrs_151 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_164 = ttg.async_commit_group tokens %Di_163 loc(#loc777) + %do_ptrs_165 = arith.cmpi sgt, %hi_79, %c1_i32 : i32 loc(#loc1023) + %qT_ptrs_166 = tt.addptr %qT_ptrs_130, %cst_14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc778) + %do_ptrs_167 = tt.addptr %do_ptrs_134, %cst_15 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc779) + %offs_m1_168 = arith.addi %offs_m1_67, %cst_16 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_169 = arith.addi %offs_m1_68, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_170 = arith.addi %offs_m1_69, %cst_17 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %qT_171 = tt.expand_dims %offs_m1_169 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc965) + %qT_172 = arith.cmpi slt, %qT_171, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_173 = tt.broadcast %qT_172 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_174 = ttg.memdesc_index %qT_136[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_175 = tt.splat %do_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_176 = arith.andi %do_ptrs_175, %qT_173 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_177 = ttg.async_copy_global_to_local %qT_ptrs_166, %qT_174 mask %do_ptrs_176 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_178 = ttg.async_commit_group tokens %qT_177 loc(#loc963) + %lse_179 = arith.cmpi slt, %offs_m1_168, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_180 = tt.addptr %lse_135, %offs_m1_168 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_181 = ttg.memdesc_index %lse_137[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_182 = tt.splat %do_ptrs_165 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_183 = arith.andi %do_ptrs_182, %lse_179 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_184 = ttg.async_copy_global_to_local %lse_180, %lse_181 mask %do_ptrs_183 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_185 = ttg.async_commit_group tokens %lse_184 loc(#loc776) + %do_186 = tt.expand_dims %offs_m1_170 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc966) + %do_187 = arith.cmpi slt, %do_186, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_188 = tt.broadcast %do_187 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_189 = ttg.memdesc_index %do_138[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_190 = tt.splat %do_ptrs_165 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_191 = arith.andi %do_ptrs_190, %do_188 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_192 = ttg.async_copy_global_to_local %do_ptrs_167, %do_189 mask %do_ptrs_191 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_193 = ttg.async_commit_group tokens %do_192 loc(#loc964) + %Di_194 = tt.addptr %Di, %offs_m1_168 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_195 = ttg.memdesc_index %Di_139[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_196 = ttg.async_copy_global_to_local %Di_194, %Di_195 mask %do_ptrs_183 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_197 = ttg.async_commit_group tokens %Di_196 loc(#loc777) + %do_ptrs_198:22 = scf.for %do_ptrs_273 = %c0_i32 to %hi_79 step %c1_i32 iter_args(%arg31 = %arg29, %arg32 = %arg28, %qT_ptrs_274 = %qT_ptrs_166, %offs_m1_275 = %offs_m1_169, %do_ptrs_276 = %do_ptrs_167, %offs_m1_277 = %offs_m1_170, %offs_m1_278 = %offs_m1_168, %arg38 = %c1_i32, %arg39 = %c-1_i32, %arg40 = %c1_i32, %arg41 = %c-1_i32, %offs_m1_279 = %offs_m1_67, %qT_280 = %qT_146, %qT_281 = %qT_178, %lse_282 = %lse_153, %lse_283 = %lse_185, %do_284 = %do_160, %do_285 = %do_193, %Di_286 = %Di_164, %Di_287 = %Di_197, %arg51 = %c64_i32, %offs_m1_288 = %offs_m1_67) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>) : i32 { + %do_ptrs_289 = arith.subi %hi_79, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_290 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_289 : i32 loc(#loc1023) + %do_ptrs_291 = arith.subi %hi_79, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_292 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_291 : i32 loc(#loc1023) + %do_ptrs_293 = arith.addi %arg41, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_294 = arith.cmpi sge, %do_ptrs_293, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_295 = arith.select %do_ptrs_294, %c0_i32, %do_ptrs_293 : i32 loc(#loc1023) + %do_ptrs_296 = arith.addi %arg39, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_297 = arith.cmpi sge, %do_ptrs_296, %c3_i32 : i32 loc(#loc1023) + %do_ptrs_298 = arith.select %do_ptrs_297, %c0_i32, %do_ptrs_296 : i32 loc(#loc1023) + %qT_299 = tt.expand_dims %offs_m1_279 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc965) + %qT_300 = tt.expand_dims %offs_m1_288 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc965) + %qT_301 = arith.cmpi slt, %qT_299, %qT : tensor<1x64xi32, #mma> loc(#loc959) + %qT_302 = tt.broadcast %qT_301 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc963) + %qT_303 = ttg.async_wait %qT_280, %lse_282, %do_284, %Di_286 {num = 4 : i32} loc(#loc963) + %qT_304 = ttg.memdesc_index %qT_136[%do_ptrs_298] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %dk_305 = ttg.memdesc_trans %qT_304 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc781) + %lse_306 = ttg.memdesc_index %lse_137[%do_ptrs_295] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_307 = ttg.local_load %lse_306 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc776) + %lse_308 = arith.cmpf oeq, %lse_307, %cst_21 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc782) + %lse_309 = arith.select %lse_308, %cst_22, %lse_307 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc783) + %qkT = ttng.warp_group_dot %k_51, %qT_304, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc770) + %qkT_310:3 = ttng.warp_group_dot_wait %qkT, %k_51, %qT_304 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc770) + %qkT_311 = arith.mulf %qkT_310#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc784) + %m = arith.remsi %qT_300, %qT : tensor<1x64xi32, #mma> loc(#loc967) + %post_mod_scores = arith.select %qT_302, %qkT_311, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc786) + %tmp44_312 = tt.broadcast %m : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc740) + %tmp44_313 = arith.cmpi sge, %tmp44_312, %tmp44 : tensor<128x64xi32, #mma> loc(#loc740) + %tmp49 = arith.extsi %m : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc787) + %tmp50_314 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64, #mma> loc(#loc746) + %tmp51_315 = tt.broadcast %tmp50_314 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc747) + %tmp51_316 = arith.andi %tmp51, %tmp51_315 : tensor<128x64xi1, #mma> loc(#loc747) + %tmp52 = arith.andi %tmp44_313, %tmp51_316 : tensor<128x64xi1, #mma> loc(#loc788) + %tmp68 = arith.subi %tmp44, %tmp44_312 : tensor<128x64xi32, #mma> loc(#loc789) + %tmp69_317 = arith.remsi %tmp68, %tmp69 : tensor<128x64xi32, #mma> loc(#loc760) + %tmp70 = arith.cmpi ne, %tmp69_317, %cst_23 : tensor<128x64xi32, #mma> loc(#loc790) + %tmp71 = arith.cmpi slt, %tmp69_317, %cst_23 : tensor<128x64xi32, #mma> loc(#loc791) + %tmp72_318 = arith.cmpi ne, %tmp71, %tmp72 : tensor<128x64xi1, #mma> loc(#loc761) + %tmp73 = arith.andi %tmp70, %tmp72_318 : tensor<128x64xi1, #mma> loc(#loc792) + %tmp74 = arith.addi %tmp69_317, %tmp69 : tensor<128x64xi32, #mma> loc(#loc793) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69_317 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc794) + %tmp76 = arith.cmpi eq, %tmp75, %cst_23 : tensor<128x64xi32, #mma> loc(#loc795) + %tmp77_319 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1, #mma> loc(#loc762) + %tmp78 = arith.ori %tmp52, %tmp77_319 : tensor<128x64xi1, #mma> loc(#loc796) + %post_mod_scores_320 = arith.select %tmp78, %post_mod_scores, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc797) + %post_mod_scores_321 = arith.mulf %post_mod_scores_320, %cst_11 : tensor<128x64xf32, #mma> loc(#loc798) + %pT = tt.expand_dims %lse_309 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc799) + %pT_322 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc800) + %pT_323 = arith.subf %post_mod_scores_321, %pT_322 : tensor<128x64xf32, #mma> loc(#loc800) + %pT_324 = math.exp2 %pT_323 : tensor<128x64xf32, #mma> loc(#loc801) + %do_325 = ttg.memdesc_index %do_138[%do_ptrs_298] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %dpT = ttg.memdesc_trans %do_325 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc802) + %dv = arith.truncf %pT_324 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc803) + %dv_326 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc803) + %dv_327 = ttng.warp_group_dot %dv_326, %do_325, %arg32 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc804) + %Di_328 = ttg.memdesc_index %Di_139[%do_ptrs_295] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_329 = ttg.local_load %Di_328 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc777) + %dpT_330 = ttng.warp_group_dot %v_56, %dpT, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc805) + %dpT_331:3 = ttng.warp_group_dot_wait %dpT_330, %v_56, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc805) + %dsT = tt.expand_dims %Di_329 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc806) + %dsT_332 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc807) + %dsT_333 = arith.subf %dpT_331#0, %dsT_332 : tensor<128x64xf32, #mma> loc(#loc807) + %dsT_334 = arith.mulf %pT_324, %dsT_333 : tensor<128x64xf32, #mma> loc(#loc808) + %grad_scores = arith.select %qT_302, %dsT_334, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc809) + %dsT_335 = arith.select %tmp78, %grad_scores, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc810) + %dk_336 = arith.truncf %dsT_335 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc811) + %dk_337 = ttg.convert_layout %dk_336 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc811) + %dk_338 = ttng.warp_group_dot %dk_337, %dk_305, %arg31 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc812) + %offs_m1_339 = tt.splat %arg51 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_340 = arith.addi %offs_m1_288, %offs_m1_339 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %do_ptrs_341 = arith.addi %do_ptrs_273, %c1_i32 : i32 loc(#loc1023) + %cur_block_idx = arith.divsi %do_ptrs_341, %c2_i32 : i32 loc(#loc968) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc969) + %cur_block_342 = tt.load %cur_block, %do_ptrs_292 evictionPolicy = evict_last : !tt.ptr loc(#loc970) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc971) + %next_block_343 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_61 : i32 loc(#loc972) + %next_block_344 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc973) + %do_ptrs_345 = arith.andi %do_ptrs_292, %next_block_343 : i1 loc(#loc1023) + %next_block_346 = tt.load %next_block_344, %do_ptrs_345 evictionPolicy = evict_last : !tt.ptr loc(#loc974) + %needs_jump = arith.addi %do_ptrs_273, %c2_i32 : i32 loc(#loc975) + %needs_jump_347 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc976) + %needs_jump_348 = arith.cmpi eq, %needs_jump_347, %c0_i32 : i32 loc(#loc977) + %jump_to_block = arith.subi %next_block_346, %cur_block_342 : i32 loc(#loc978) + %jump_to_block_349 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc979) + %jump_to_block_350 = arith.subi %jump_to_block_349, %c64_i32 : i32 loc(#loc980) + %offset = arith.extui %needs_jump_348 : i1 to i32 loc(#loc981) + %offset_351 = arith.muli %jump_to_block_350, %offset : i32 loc(#loc981) + %offset_352 = arith.subi %c1_i32, %offset : i32 loc(#loc982) + %offset_353 = arith.muli %offset_352, %c64_i32 : i32 loc(#loc983) + %offset_354 = arith.addi %offset_351, %offset_353 : i32 loc(#loc984) + %qT_ptrs_355 = arith.muli %offset_354, %c4096_i32 : i32 loc(#loc814) + %qT_ptrs_356 = tt.splat %qT_ptrs_355 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc778) + %qT_ptrs_357 = tt.addptr %qT_ptrs_274, %qT_ptrs_356 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc778) + %do_ptrs_358 = arith.muli %offset_354, %c128_i32 : i32 loc(#loc815) + %do_ptrs_359 = tt.splat %do_ptrs_358 : i32 -> tensor<64x128xi32, #blocked> loc(#loc779) + %do_ptrs_360 = tt.addptr %do_ptrs_276, %do_ptrs_359 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc779) + %offs_m1_361 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_362 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_363 = tt.splat %offset_354 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %offs_m1_364 = arith.addi %offs_m1_278, %offs_m1_361 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc780) + %offs_m1_365 = arith.addi %offs_m1_275, %offs_m1_362 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc780) + %offs_m1_366 = arith.addi %offs_m1_277, %offs_m1_363 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc780) + %do_ptrs_367 = arith.addi %arg40, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_368 = arith.cmpi sge, %do_ptrs_367, %c2_i32 : i32 loc(#loc1023) + %do_ptrs_369 = arith.select %do_ptrs_368, %c0_i32, %do_ptrs_367 : i32 loc(#loc1023) + %do_ptrs_370 = arith.addi %arg38, %c1_i32 : i32 loc(#loc1023) + %do_ptrs_371 = arith.cmpi sge, %do_ptrs_370, %c3_i32 : i32 loc(#loc1023) + %do_ptrs_372 = arith.select %do_ptrs_371, %c0_i32, %do_ptrs_370 : i32 loc(#loc1023) + %qT_373 = tt.expand_dims %offs_m1_365 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc965) + %qT_374 = arith.cmpi slt, %qT_373, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc959) + %qT_375 = tt.broadcast %qT_374 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc963) + %qT_376 = ttg.memdesc_index %qT_136[%do_ptrs_372] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %do_ptrs_377 = tt.splat %do_ptrs_290 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1023) + %do_ptrs_378 = arith.andi %do_ptrs_377, %qT_375 : tensor<128x64xi1, #blocked1> loc(#loc1023) + %qT_379 = ttg.async_copy_global_to_local %qT_ptrs_357, %qT_376 mask %do_ptrs_378 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc963) + %qT_380 = ttg.async_commit_group tokens %qT_379 loc(#loc963) + %lse_381 = arith.cmpi slt, %offs_m1_364, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc738) + %lse_382 = tt.addptr %lse_135, %offs_m1_364 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc774) + %lse_383 = ttg.memdesc_index %lse_137[%do_ptrs_369] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %do_ptrs_384 = tt.splat %do_ptrs_290 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %do_ptrs_385 = arith.andi %do_ptrs_384, %lse_381 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + %lse_386 = ttg.async_copy_global_to_local %lse_382, %lse_383 mask %do_ptrs_385 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc776) + %lse_387 = ttg.async_commit_group tokens %lse_386 loc(#loc776) + %do_388 = tt.expand_dims %offs_m1_366 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc966) + %do_389 = arith.cmpi slt, %do_388, %do : tensor<64x1xi32, #blocked> loc(#loc962) + %do_390 = tt.broadcast %do_389 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc964) + %do_391 = ttg.memdesc_index %do_138[%do_ptrs_372] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_ptrs_392 = tt.splat %do_ptrs_290 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1023) + %do_ptrs_393 = arith.andi %do_ptrs_392, %do_390 : tensor<64x128xi1, #blocked> loc(#loc1023) + %do_394 = ttg.async_copy_global_to_local %do_ptrs_360, %do_391 mask %do_ptrs_393 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc964) + %do_395 = ttg.async_commit_group tokens %do_394 loc(#loc964) + %Di_396 = tt.addptr %Di, %offs_m1_364 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc775) + %Di_397 = ttg.memdesc_index %Di_139[%do_ptrs_369] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_398 = ttg.async_copy_global_to_local %Di_396, %Di_397 mask %do_ptrs_385 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc777) + %Di_399 = ttg.async_commit_group tokens %Di_398 loc(#loc777) + scf.yield %dk_338, %dv_327, %qT_ptrs_357, %offs_m1_365, %do_ptrs_360, %offs_m1_366, %offs_m1_364, %do_ptrs_372, %do_ptrs_298, %do_ptrs_369, %do_ptrs_295, %offs_m1_278, %qT_281, %qT_380, %lse_283, %lse_387, %do_285, %do_395, %Di_287, %Di_399, %offset_354, %offs_m1_340 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1023) + } loc(#loc1023) + %do_ptrs_199:2 = ttng.warp_group_dot_wait %do_ptrs_198#1, %do_ptrs_198#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc1023) + %do_ptrs_200 = ttg.async_wait {num = 0 : i32} loc(#loc1023) + ttg.local_dealloc %Di_139 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %do_138 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %lse_137 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1023) + ttg.local_dealloc %qT_136 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1023) + %qT_ptrs_201 = tt.addptr %qT_ptrs_127, %qT_ptrs_98 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc816) + %qT_ptrs_202 = tt.broadcast %qT_ptrs_201 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc817) + %qT_ptrs_203 = tt.addptr %qT_ptrs_202, %qT_ptrs_73 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc817) + %do_ptrs_204 = tt.addptr %do_ptrs_131, %do_ptrs_100 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc818) + %do_ptrs_205 = tt.broadcast %do_ptrs_204 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc819) + %do_ptrs_206 = tt.addptr %do_ptrs_205, %do_ptrs_75 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc819) + %qT_207 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc985) + %lse_208 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc821) + %do_209 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc986) + %Di_210 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc823) + %do_ptrs_211 = arith.cmpi sgt, %hi_102, %c0_i32 : i32 loc(#loc1024) + %qT_212 = arith.cmpi slt, %qT_ptrs_97, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_213 = tt.broadcast %qT_212 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_214 = ttg.memdesc_index %qT_207[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_215 = tt.splat %do_ptrs_211 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_216 = arith.andi %do_ptrs_215, %qT_213 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_217 = ttg.async_copy_global_to_local %qT_ptrs_203, %qT_214 mask %do_ptrs_216 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_218 = ttg.async_commit_group tokens %qT_217 loc(#loc985) + %lse_219 = arith.cmpi slt, %offs_m1_94, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_220 = tt.addptr %lse_135, %offs_m1_94 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_221 = ttg.memdesc_index %lse_208[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_222 = tt.splat %do_ptrs_211 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_223 = arith.andi %do_ptrs_222, %lse_219 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_224 = ttg.async_copy_global_to_local %lse_220, %lse_221 mask %do_ptrs_223 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_225 = ttg.async_commit_group tokens %lse_224 loc(#loc821) + %do_226 = arith.cmpi slt, %do_ptrs_99, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_227 = tt.broadcast %do_226 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_228 = ttg.memdesc_index %do_209[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_229 = tt.splat %do_ptrs_211 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_230 = arith.andi %do_ptrs_229, %do_227 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_231 = ttg.async_copy_global_to_local %do_ptrs_206, %do_228 mask %do_ptrs_230 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_232 = ttg.async_commit_group tokens %do_231 loc(#loc986) + %Di_233 = tt.addptr %Di, %offs_m1_94 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_234 = ttg.memdesc_index %Di_210[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_235 = ttg.async_copy_global_to_local %Di_233, %Di_234 mask %do_ptrs_223 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_236 = ttg.async_commit_group tokens %Di_235 loc(#loc823) + %do_ptrs_237 = arith.cmpi sgt, %hi_102, %c1_i32 : i32 loc(#loc1024) + %qT_ptrs_238 = tt.addptr %qT_ptrs_203, %cst_14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc827) + %do_ptrs_239 = tt.addptr %do_ptrs_206, %cst_15 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc828) + %offs_m1_240 = arith.addi %offs_m1_94, %cst_16 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc829) + %offs_m1_241 = arith.addi %offs_m1_95, %cst_13 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_242 = arith.addi %offs_m1_96, %cst_17 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %qT_243 = tt.expand_dims %offs_m1_241 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc989) + %qT_244 = arith.cmpi slt, %qT_243, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_245 = tt.broadcast %qT_244 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_246 = ttg.memdesc_index %qT_207[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_247 = tt.splat %do_ptrs_237 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_248 = arith.andi %do_ptrs_247, %qT_245 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_249 = ttg.async_copy_global_to_local %qT_ptrs_238, %qT_246 mask %do_ptrs_248 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_250 = ttg.async_commit_group tokens %qT_249 loc(#loc985) + %lse_251 = arith.cmpi slt, %offs_m1_240, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_252 = tt.addptr %lse_135, %offs_m1_240 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_253 = ttg.memdesc_index %lse_208[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_254 = tt.splat %do_ptrs_237 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_255 = arith.andi %do_ptrs_254, %lse_251 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_256 = ttg.async_copy_global_to_local %lse_252, %lse_253 mask %do_ptrs_255 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_257 = ttg.async_commit_group tokens %lse_256 loc(#loc821) + %do_258 = tt.expand_dims %offs_m1_242 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc990) + %do_259 = arith.cmpi slt, %do_258, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_260 = tt.broadcast %do_259 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_261 = ttg.memdesc_index %do_209[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_262 = tt.splat %do_ptrs_237 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_263 = arith.andi %do_ptrs_262, %do_260 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_264 = ttg.async_copy_global_to_local %do_ptrs_239, %do_261 mask %do_ptrs_263 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_265 = ttg.async_commit_group tokens %do_264 loc(#loc986) + %Di_266 = tt.addptr %Di, %offs_m1_240 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_267 = ttg.memdesc_index %Di_210[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_268 = ttg.async_copy_global_to_local %Di_266, %Di_267 mask %do_ptrs_255 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_269 = ttg.async_commit_group tokens %Di_268 loc(#loc823) + %do_ptrs_270:20 = scf.for %do_ptrs_273 = %c0_i32 to %hi_102 step %c1_i32 iter_args(%do_ptrs_274 = %do_ptrs_199#1, %do_ptrs_275 = %do_ptrs_199#0, %qT_ptrs_276 = %qT_ptrs_238, %offs_m1_277 = %offs_m1_241, %do_ptrs_278 = %do_ptrs_239, %offs_m1_279 = %offs_m1_242, %offs_m1_280 = %offs_m1_240, %arg38 = %c1_i32, %arg39 = %c-1_i32, %arg40 = %c1_i32, %arg41 = %c-1_i32, %offs_m1_281 = %offs_m1_94, %qT_282 = %qT_218, %qT_283 = %qT_250, %lse_284 = %lse_225, %lse_285 = %lse_257, %do_286 = %do_232, %do_287 = %do_265, %Di_288 = %Di_236, %Di_289 = %Di_269) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %do_ptrs_290 = arith.subi %hi_102, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_291 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_290 : i32 loc(#loc1024) + %do_ptrs_292 = arith.subi %hi_102, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_293 = arith.cmpi slt, %do_ptrs_273, %do_ptrs_292 : i32 loc(#loc1024) + %do_ptrs_294 = arith.addi %arg41, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_295 = arith.cmpi sge, %do_ptrs_294, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_296 = arith.select %do_ptrs_295, %c0_i32, %do_ptrs_294 : i32 loc(#loc1024) + %do_ptrs_297 = arith.addi %arg39, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_298 = arith.cmpi sge, %do_ptrs_297, %c3_i32 : i32 loc(#loc1024) + %do_ptrs_299 = arith.select %do_ptrs_298, %c0_i32, %do_ptrs_297 : i32 loc(#loc1024) + %qT_300 = tt.expand_dims %offs_m1_281 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc989) + %qT_301 = arith.cmpi slt, %qT_300, %qT : tensor<1x64xi32, #mma> loc(#loc987) + %qT_302 = tt.broadcast %qT_301 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc985) + %qT_303 = ttg.async_wait %qT_282, %lse_284, %do_286, %Di_288 {num = 4 : i32} loc(#loc985) + %qT_304 = ttg.memdesc_index %qT_207[%do_ptrs_299] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %dk_305 = ttg.memdesc_trans %qT_304 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc830) + %lse_306 = ttg.memdesc_index %lse_208[%do_ptrs_296] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_307 = ttg.local_load %lse_306 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc821) + %lse_308 = arith.cmpf oeq, %lse_307, %cst_21 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc831) + %lse_309 = arith.select %lse_308, %cst_22, %lse_307 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc832) + %qkT = ttng.warp_group_dot %k_51, %qT_304, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc833) + %qkT_310:3 = ttng.warp_group_dot_wait %qkT, %k_51, %qT_304 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc833) + %qkT_311 = arith.mulf %qkT_310#0, %cst_9 : tensor<128x64xf32, #mma> loc(#loc834) + %post_mod_scores = arith.select %qT_302, %qkT_311, %cst_10 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc835) + %post_mod_scores_312 = arith.mulf %post_mod_scores, %cst_11 : tensor<128x64xf32, #mma> loc(#loc836) + %pT = tt.expand_dims %lse_309 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc837) + %pT_313 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc838) + %pT_314 = arith.subf %post_mod_scores_312, %pT_313 : tensor<128x64xf32, #mma> loc(#loc838) + %pT_315 = math.exp2 %pT_314 : tensor<128x64xf32, #mma> loc(#loc839) + %do_316 = ttg.memdesc_index %do_209[%do_ptrs_299] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %dpT = ttg.memdesc_trans %do_316 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc840) + %dv = arith.truncf %pT_315 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc841) + %dv_317 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc841) + %dv_318 = ttng.warp_group_dot %dv_317, %do_316, %do_ptrs_275 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc842) + %Di_319 = ttg.memdesc_index %Di_210[%do_ptrs_296] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_320 = ttg.local_load %Di_319 token %qT_303 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc823) + %dpT_321 = ttng.warp_group_dot %v_56, %dpT, %cst_8 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc843) + %dpT_322:3 = ttng.warp_group_dot_wait %dpT_321, %v_56, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc843) + %dsT = tt.expand_dims %Di_320 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc844) + %dsT_323 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc845) + %dsT_324 = arith.subf %dpT_322#0, %dsT_323 : tensor<128x64xf32, #mma> loc(#loc845) + %dsT_325 = arith.mulf %pT_315, %dsT_324 : tensor<128x64xf32, #mma> loc(#loc846) + %grad_scores = arith.select %qT_302, %dsT_325, %cst_8 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc847) + %dk_326 = arith.truncf %grad_scores : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc848) + %dk_327 = ttg.convert_layout %dk_326 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc848) + %dk_328 = ttng.warp_group_dot %dk_327, %dk_305, %do_ptrs_274 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc849) + %do_ptrs_329 = arith.addi %do_ptrs_273, %c1_i32 : i32 loc(#loc1024) + %cur_block_idx = arith.divsi %do_ptrs_329, %c2_i32 : i32 loc(#loc991) + %cur_block = tt.addptr %q_indices_86, %cur_block_idx : !tt.ptr, i32 loc(#loc992) + %cur_block_330 = tt.load %cur_block, %do_ptrs_293 evictionPolicy = evict_last : !tt.ptr loc(#loc993) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc994) + %next_block_331 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_90 : i32 loc(#loc995) + %next_block_332 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc996) + %do_ptrs_333 = arith.andi %do_ptrs_293, %next_block_331 : i1 loc(#loc1024) + %next_block_334 = tt.load %next_block_332, %do_ptrs_333 evictionPolicy = evict_last : !tt.ptr loc(#loc997) + %needs_jump = arith.addi %do_ptrs_273, %c2_i32 : i32 loc(#loc998) + %needs_jump_335 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc999) + %needs_jump_336 = arith.cmpi eq, %needs_jump_335, %c0_i32 : i32 loc(#loc1000) + %jump_to_block = arith.subi %next_block_334, %cur_block_330 : i32 loc(#loc1001) + %jump_to_block_337 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc1002) + %jump_to_block_338 = arith.subi %jump_to_block_337, %c64_i32 : i32 loc(#loc1003) + %offset = arith.extui %needs_jump_336 : i1 to i32 loc(#loc1004) + %offset_339 = arith.muli %jump_to_block_338, %offset : i32 loc(#loc1004) + %offset_340 = arith.subi %c1_i32, %offset : i32 loc(#loc1005) + %offset_341 = arith.muli %offset_340, %c64_i32 : i32 loc(#loc1006) + %offset_342 = arith.addi %offset_339, %offset_341 : i32 loc(#loc1007) + %qT_ptrs_343 = arith.muli %offset_342, %c4096_i32 : i32 loc(#loc851) + %qT_ptrs_344 = tt.splat %qT_ptrs_343 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc827) + %qT_ptrs_345 = tt.addptr %qT_ptrs_276, %qT_ptrs_344 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc827) + %do_ptrs_346 = arith.muli %offset_342, %c128_i32 : i32 loc(#loc852) + %do_ptrs_347 = tt.splat %do_ptrs_346 : i32 -> tensor<64x128xi32, #blocked> loc(#loc828) + %do_ptrs_348 = tt.addptr %do_ptrs_278, %do_ptrs_347 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc828) + %offs_m1_349 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc829) + %offs_m1_350 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_351 = tt.splat %offset_342 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %offs_m1_352 = arith.addi %offs_m1_280, %offs_m1_349 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc829) + %offs_m1_353 = arith.addi %offs_m1_277, %offs_m1_350 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc829) + %offs_m1_354 = arith.addi %offs_m1_279, %offs_m1_351 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc829) + %do_ptrs_355 = arith.addi %arg40, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_356 = arith.cmpi sge, %do_ptrs_355, %c2_i32 : i32 loc(#loc1024) + %do_ptrs_357 = arith.select %do_ptrs_356, %c0_i32, %do_ptrs_355 : i32 loc(#loc1024) + %do_ptrs_358 = arith.addi %arg38, %c1_i32 : i32 loc(#loc1024) + %do_ptrs_359 = arith.cmpi sge, %do_ptrs_358, %c3_i32 : i32 loc(#loc1024) + %do_ptrs_360 = arith.select %do_ptrs_359, %c0_i32, %do_ptrs_358 : i32 loc(#loc1024) + %qT_361 = tt.expand_dims %offs_m1_353 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc989) + %qT_362 = arith.cmpi slt, %qT_361, %qT_80 : tensor<1x64xi32, #blocked1> loc(#loc987) + %qT_363 = tt.broadcast %qT_362 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc985) + %qT_364 = ttg.memdesc_index %qT_207[%do_ptrs_360] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %do_ptrs_365 = tt.splat %do_ptrs_291 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc1024) + %do_ptrs_366 = arith.andi %do_ptrs_365, %qT_363 : tensor<128x64xi1, #blocked1> loc(#loc1024) + %qT_367 = ttg.async_copy_global_to_local %qT_ptrs_345, %qT_364 mask %do_ptrs_366 other %cst_4 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc985) + %qT_368 = ttg.async_commit_group tokens %qT_367 loc(#loc985) + %lse_369 = arith.cmpi slt, %offs_m1_352, %lse : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc824) + %lse_370 = tt.addptr %lse_135, %offs_m1_352 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc825) + %lse_371 = ttg.memdesc_index %lse_208[%do_ptrs_357] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %do_ptrs_372 = tt.splat %do_ptrs_291 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %do_ptrs_373 = arith.andi %do_ptrs_372, %lse_369 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1024) + %lse_374 = ttg.async_copy_global_to_local %lse_370, %lse_371 mask %do_ptrs_373 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc821) + %lse_375 = ttg.async_commit_group tokens %lse_374 loc(#loc821) + %do_376 = tt.expand_dims %offs_m1_354 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc990) + %do_377 = arith.cmpi slt, %do_376, %do : tensor<64x1xi32, #blocked> loc(#loc988) + %do_378 = tt.broadcast %do_377 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc986) + %do_379 = ttg.memdesc_index %do_209[%do_ptrs_360] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_ptrs_380 = tt.splat %do_ptrs_291 : i1 -> tensor<64x128xi1, #blocked> loc(#loc1024) + %do_ptrs_381 = arith.andi %do_ptrs_380, %do_378 : tensor<64x128xi1, #blocked> loc(#loc1024) + %do_382 = ttg.async_copy_global_to_local %do_ptrs_348, %do_379 mask %do_ptrs_381 other %cst_5 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc986) + %do_383 = ttg.async_commit_group tokens %do_382 loc(#loc986) + %Di_384 = tt.addptr %Di, %offs_m1_352 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc826) + %Di_385 = ttg.memdesc_index %Di_210[%do_ptrs_357] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_386 = ttg.async_copy_global_to_local %Di_384, %Di_385 mask %do_ptrs_373 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc823) + %Di_387 = ttg.async_commit_group tokens %Di_386 loc(#loc823) + scf.yield %dk_328, %dv_318, %qT_ptrs_345, %offs_m1_353, %do_ptrs_348, %offs_m1_354, %offs_m1_352, %do_ptrs_360, %do_ptrs_299, %do_ptrs_357, %do_ptrs_296, %offs_m1_280, %qT_283, %qT_368, %lse_285, %lse_375, %do_287, %do_383, %Di_289, %Di_387 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc1024) + } loc(#loc1024) + %do_ptrs_271:2 = ttng.warp_group_dot_wait %do_ptrs_270#1, %do_ptrs_270#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc1024) + %do_ptrs_272 = ttg.async_wait {num = 0 : i32} loc(#loc1024) + ttg.local_dealloc %Di_210 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %do_209 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %lse_208 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc1024) + ttg.local_dealloc %qT_207 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc1024) + scf.yield %do_ptrs_271#0, %do_ptrs_271#1 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc321) + } loc(#loc771) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc659) + %dv_ptrs_103 = tt.addptr %dv_ptrs, %ptr_39 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc659) + %dv_ptrs_104 = tt.broadcast %dv_ptrs_103 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc660) + %dv_ptrs_105 = tt.addptr %dv_ptrs_104, %ptr_45 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc660) + %12 = arith.cmpi slt, %ptr_43, %cst_1 : tensor<1x128xi32, #blocked> loc(#loc324) + %13 = tt.broadcast %12 : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc325) + %14 = arith.andi %k_49, %13 : tensor<128x128xi1, #blocked> loc(#loc325) + %15 = arith.truncf %dk#0 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc326) + %16 = ttg.convert_layout %15 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc326) + tt.store %dv_ptrs_105, %16, %14 : tensor<128x128x!tt.ptr, #blocked> loc(#loc326) + %dk_106 = arith.mulf %dk#1, %cst_6 : tensor<128x128xf32, #mma1> loc(#loc661) + %xindex = tt.broadcast %ptr_39 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc662) + %xindex_107 = arith.addi %ptr_45, %xindex : tensor<128x128xi32, #blocked> loc(#loc662) + %xindex_108 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc663) + %xindex_109 = arith.muli %xindex_108, %ks1 : i32 loc(#loc664) + %xindex_110 = tt.splat %xindex_109 : i32 -> tensor<128x128xi32, #blocked> loc(#loc665) + %xindex_111 = arith.addi %xindex_107, %xindex_110 : tensor<128x128xi32, #blocked> loc(#loc665) + %xindex_112 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc666) + %xindex_113 = arith.muli %xindex_112, %ks1 : i32 loc(#loc667) + %xindex_114 = tt.splat %xindex_113 : i32 -> tensor<128x128xi32, #blocked> loc(#loc668) + %xindex_115 = arith.addi %xindex_111, %xindex_114 : tensor<128x128xi32, #blocked> loc(#loc668) + %17 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc335) + %18 = tt.addptr %17, %xindex_115 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc335) + %19 = arith.truncf %dk_106 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc336) + %20 = ttg.convert_layout %19 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc336) + tt.store %18, %20, %k_49 : tensor<128x128x!tt.ptr, #blocked> loc(#loc336) + } loc(#loc32) + tt.return loc(#loc337) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":94:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:74) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:66) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:100) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:91) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:82) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:59) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:111) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":111:24) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":112:36) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":113:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":115:27) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":116:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":117:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:25) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:47) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:35) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:59) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:37) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:61) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":131:9) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":132:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":133:10) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":136:26) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:14) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:7) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":140:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:54) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:44) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":145:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":148:30) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:55) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:78) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:50) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:83) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:68) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:30) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:52) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:63) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:32) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:55) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:42) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:66) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:46) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:56) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":163:17) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":164:19) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":167:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":168:21) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":169:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":174:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":175:29) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":178:107) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:38) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:56) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:49) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:52) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":179:111) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:34) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:33) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:30) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":191:18) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":195:30) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:27) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:41) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:53) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:39) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:42) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:29) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:26) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":207:12) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:18) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:56) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:49) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:18) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:49) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:43) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:90) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:101) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:63) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:52) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":458:105) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":405:12) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":798:21) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":467:46) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":482:23) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:34) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":397:28) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:23) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":486:22) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":487:23) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":488:23) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":489:23) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":493:24) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":498:92) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":499:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":507:25) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":510:25) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:22) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:19) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:23) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":528:104) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:19) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":415:19) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":417:19) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:41) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":459:19) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:30) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":461:14) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":464:46) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":476:79) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":483:23) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":490:23) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":494:24) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":496:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":497:92) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":500:24) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":501:24) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":502:39) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":503:25) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":504:24) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":505:24) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":506:23) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":508:25) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":509:92) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":511:24) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":512:24) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":513:39) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":514:25) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":515:24) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":516:24) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":521:69) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":524:27) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:21) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":530:20) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:14) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":538:71) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":549:43) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":551:15) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:21) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":788:33) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":411:64) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:38) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:24) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:109) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:113) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:55) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:25) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:30) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:35) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:60) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:34) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:48) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:63) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:29) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:47) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:61) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:42) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:28) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":214:39) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:31) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:45) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:62) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:43) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":218:33) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":226:16) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:24) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:56) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":232:14) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:87) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:69) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:30) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":245:29) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":252:25) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":253:29) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":256:107) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":257:107) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:32) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:56) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:59) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:34) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:58) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:80) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:53) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:81) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:70) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":286:32) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:30) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:43) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:55) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:42) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:45) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:32) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:26) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":298:16) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:37) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:56) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:49) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:27) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:38) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:51) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:42) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:87) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:98) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:61) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":669:105) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":618:12) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:52) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":683:46) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":698:25) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":699:25) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:35) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":610:28) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:24) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":702:24) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":704:24) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":705:24) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":709:25) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":710:25) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":712:25) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":713:92) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":714:92) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":715:25) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":716:24) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":717:24) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":718:39) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":719:25) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":720:24) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":721:24) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":723:25) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":726:25) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":731:24) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":741:99) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":306:41) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:34) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:47) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:64) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:46) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":310:36) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":318:20) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":676:20) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":262:30) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:51) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:34) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:44) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:67) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:36) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:46) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:70) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:39) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:50) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:60) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":271:21) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":272:23) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":275:25) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":276:29) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:18) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:19) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:28) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:29) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:22) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:21) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:19) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:19) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":628:19) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:41) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:52) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:26) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:46) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":678:15) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":680:46) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":692:78) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":703:25) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":706:24) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":722:24) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":724:25) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":725:92) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":727:24) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":728:24) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":729:39) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":730:25) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":732:24) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":736:69) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":739:27) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:44) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:40) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:22) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:29) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:24) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:43) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:20) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:25) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:22) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:16) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":759:70) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":773:45) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:24) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:43) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":623:62) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:28) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:28) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":303:12) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:23) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:55) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:71) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:61) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:30) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":334:14) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:27) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:45) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:53) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:41) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:64) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:71) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:59) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:29) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:69) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:4) +#loc365 = loc("pid"(#loc12)) +#loc366 = loc("NUM_KV_BLOCKS"(#loc14)) +#loc367 = loc("NUM_Q_BLOCKS"(#loc16)) +#loc368 = loc("off_zq"(#loc17)) +#loc369 = loc("off_hkv"(#loc18)) +#loc370 = loc("off_zkv"(#loc19)) +#loc371 = loc("k_adj"(#loc20)) +#loc372 = loc("k_adj"(#loc21)) +#loc373 = loc("k_adj"(#loc22)) +#loc374 = loc("k_adj"(#loc23)) +#loc375 = loc("dv_adj"(#loc24)) +#loc376 = loc("dv_adj"(#loc25)) +#loc377 = loc("dv_adj"(#loc26)) +#loc378 = loc("K"(#loc27)) +#loc379 = loc("V"(#loc28)) +#loc380 = loc("DV"(#loc29)) +#loc381 = loc("offs_k"(#loc30)) +#loc382 = loc("off_pid"(#loc33)) +#loc383 = loc("off_hq2"(#loc34)) +#loc384 = loc("off_hq2"(#loc35)) +#loc385 = loc("off_hq2"(#loc36)) +#loc386 = loc("start_m2_block"(#loc37)) +#loc387 = loc("stride_kv_idx_h"(#loc38)) +#loc388 = loc("sparse_kv_num_blks_offset"(#loc39)) +#loc389 = loc("sparse_kv_num_blks_offset"(#loc40)) +#loc390 = loc("sparse_kv_idx_offset"(#loc41)) +#loc391 = loc("sparse_kv_idx_offset"(#loc42)) +#loc392 = loc("sparse_kv_idx_offset"(#loc43)) +#loc393 = loc("q_adj2"(#loc44)) +#loc394 = loc("q_adj2"(#loc45)) +#loc395 = loc("q_adj2"(#loc46)) +#loc396 = loc("q_adj2"(#loc47)) +#loc397 = loc("do_adj2"(#loc48)) +#loc398 = loc("do_adj2"(#loc49)) +#loc399 = loc("do_adj2"(#loc50)) +#loc400 = loc("do_adj2"(#loc51)) +#loc401 = loc("off_chz2"(#loc52)) +#loc402 = loc("off_chz2"(#loc53)) +#loc403 = loc("off_chz2"(#loc54)) +#loc404 = loc("off_chz2"(#loc55)) +#loc405 = loc("Q2"(#loc56)) +#loc406 = loc("DO2"(#loc57)) +#loc407 = loc("DQ2"(#loc58)) +#loc408 = loc("LSE2"(#loc59)) +#loc409 = loc("DELTA2"(#loc60)) +#loc410 = loc("start_m2"(#loc61)) +#loc411 = loc("offs_m2"(#loc62)) +#loc412 = loc("ptr"(#loc63)) +#loc413 = loc("q"(#loc64)) +#loc414 = loc("ptr"(#loc65)) +#loc415 = loc("ptr"(#loc66)) +#loc416 = loc("ptr"(#loc67)) +#loc417 = loc("ptr"(#loc68)) +#loc418 = loc("do"(#loc71)) +#loc419 = loc("Di"(#loc72)) +#loc420 = loc("Di"(#loc73)) +#loc421 = loc("Di"(#loc74)) +#loc422 = loc("lse"(#loc75)) +#loc423 = loc("lse"(#loc76)) +#loc424 = loc("lse"(#loc77)) +#loc425 = loc("lse"(#loc78)) +#loc426 = loc("lse"(#loc79)) +#loc427 = loc("kv_indices"(#loc80)) +#loc428 = loc("kv_start"(#loc81)) +#loc429 = loc("kv_start"(#loc82)) +#loc430 = loc("sparse_kv_num_blocks"(#loc83)) +#loc431 = loc("sparse_kv_num_blocks"(#loc84)) +#loc432 = loc("offs_n2"(#loc85)) +#loc433 = loc("offs_n2"(#loc86)) +#loc434 = loc("kT_ptrs"(#loc87)) +#loc435 = loc("dq"(#loc88)) +#loc436 = loc("kT_ptrs"(#loc89)) +#loc437 = loc("kT_ptrs"(#loc90)) +#loc438 = loc("kT_ptrs"(#loc91)) +#loc439 = loc("kT_ptrs"(#loc92)) +#loc440 = loc("vT_ptrs"(#loc93)) +#loc441 = loc("vT_ptrs"(#loc94)) +#loc442 = loc("hi"(#loc95)) +#loc443 = loc("hi"(#loc96)) +#loc444 = loc("hi"(#loc97)) +#loc445 = loc("hi"(#loc98)) +#loc446 = loc("kT"(#loc100)) +#loc447 = loc("dq"(#loc101)) +#loc448 = loc("m"(#loc103)) +#loc449 = loc("tmp4"(#loc104)) +#loc450 = loc("tmp7"(#loc105)) +#loc451 = loc("dq"(#loc106)) +#loc452 = loc("tmp7"(#loc107)) +#loc453 = loc("tmp8"(#loc108)) +#loc454 = loc("tmp9"(#loc109)) +#loc455 = loc("tmp10"(#loc110)) +#loc456 = loc("tmp11"(#loc111)) +#loc457 = loc("tmp15"(#loc112)) +#loc458 = loc("tmp20"(#loc113)) +#loc459 = loc("tmp21"(#loc114)) +#loc460 = loc("tmp29"(#loc115)) +#loc461 = loc("tmp32"(#loc116)) +#loc462 = loc("p"(#loc117)) +#loc463 = loc("ds"(#loc118)) +#loc464 = loc("ds"(#loc119)) +#loc465 = loc("vT"(#loc121)) +#loc466 = loc("kT_ptrs"(#loc122)) +#loc467 = loc("vT_ptrs"(#loc123)) +#loc468 = loc("offs_n2"(#loc124)) +#loc469 = loc("qk"(#loc126)) +#loc470 = loc("dq"(#loc127)) +#loc471 = loc("qk"(#loc128)) +#loc472 = loc("n"(#loc129)) +#loc473 = loc("post_mod_scores"(#loc130)) +#loc474 = loc("tmp5"(#loc131)) +#loc475 = loc("tmp12"(#loc132)) +#loc476 = loc("tmp16"(#loc133)) +#loc477 = loc("tmp18"(#loc134)) +#loc478 = loc("tmp19"(#loc135)) +#loc479 = loc("tmp22"(#loc136)) +#loc480 = loc("tmp23"(#loc137)) +#loc481 = loc("tmp24"(#loc138)) +#loc482 = loc("tmp25"(#loc139)) +#loc483 = loc("tmp26"(#loc140)) +#loc484 = loc("tmp27"(#loc141)) +#loc485 = loc("tmp28"(#loc142)) +#loc486 = loc("tmp30"(#loc143)) +#loc487 = loc("tmp31"(#loc144)) +#loc488 = loc("tmp33"(#loc145)) +#loc489 = loc("tmp34"(#loc146)) +#loc490 = loc("tmp35"(#loc147)) +#loc491 = loc("tmp36"(#loc148)) +#loc492 = loc("tmp37"(#loc149)) +#loc493 = loc("tmp38"(#loc150)) +#loc494 = loc("post_mod_scores"(#loc151)) +#loc495 = loc("post_mod_scores"(#loc152)) +#loc496 = loc("p"(#loc153)) +#loc497 = loc("dp"(#loc154)) +#loc498 = loc("ds"(#loc155)) +#loc499 = loc("grad_scores"(#loc156)) +#loc500 = loc("ds"(#loc157)) +#loc501 = loc("ds"(#loc158)) +#loc502 = loc("dq"(#loc159)) +#loc503 = loc("cur_block_idx"(#loc160)) +#loc504 = loc("offset"(#loc161)) +#loc505 = loc("cur_block"(#loc162)) +#loc506 = loc("cur_block"(#loc163)) +#loc507 = loc("next_block"(#loc164)) +#loc508 = loc("next_block"(#loc165)) +#loc509 = loc("next_block"(#loc166)) +#loc510 = loc("next_block"(#loc167)) +#loc511 = loc("needs_jump"(#loc168)) +#loc512 = loc("needs_jump"(#loc169)) +#loc513 = loc("needs_jump"(#loc170)) +#loc514 = loc("jump_to_block"(#loc171)) +#loc515 = loc("jump_to_block"(#loc172)) +#loc516 = loc("jump_to_block"(#loc173)) +#loc517 = loc("offset"(#loc174)) +#loc518 = loc("offset"(#loc175)) +#loc519 = loc("offset"(#loc176)) +#loc520 = loc("offset"(#loc177)) +#loc521 = loc("kT_ptrs"(#loc178)) +#loc522 = loc("kv_indices"(#loc179)) +#loc523 = loc("kv_start"(#loc180)) +#loc524 = loc("kv_start"(#loc181)) +#loc525 = loc("sparse_kv_num_blocks"(#loc182)) +#loc526 = loc("sparse_kv_num_blocks"(#loc183)) +#loc527 = loc("offs_n2"(#loc184)) +#loc528 = loc("dq"(#loc185)) +#loc529 = loc("dq_ptrs"(#loc186)) +#loc530 = loc("dq_ptrs"(#loc187)) +#loc531 = loc("dq"(#loc188)) +#loc532 = loc("stride_q_idx_h"(#loc192)) +#loc533 = loc("start_n1"(#loc193)) +#loc534 = loc("offs_n1"(#loc194)) +#loc535 = loc("k"(#loc195)) +#loc536 = loc("v"(#loc196)) +#loc537 = loc("off_hq1"(#loc197)) +#loc538 = loc("q_adj1"(#loc198)) +#loc539 = loc("do_adj1"(#loc199)) +#loc540 = loc("off_chz1"(#loc200)) +#loc541 = loc("sparse_q_num_blks_offset"(#loc201)) +#loc542 = loc("sparse_q_num_blks_offset"(#loc202)) +#loc543 = loc("sparse_q_idx_offset"(#loc203)) +#loc544 = loc("sparse_q_idx_offset"(#loc204)) +#loc545 = loc("sparse_q_idx_offset"(#loc205)) +#loc546 = loc("q_indices"(#loc206)) +#loc547 = loc("q_start"(#loc207)) +#loc548 = loc("q_start"(#loc208)) +#loc549 = loc("sparse_q_num_blocks"(#loc209)) +#loc550 = loc("sparse_q_num_blocks"(#loc210)) +#loc551 = loc("offs_m1"(#loc211)) +#loc552 = loc("offs_m1"(#loc212)) +#loc553 = loc("qT_ptrs"(#loc213)) +#loc554 = loc("qT_ptrs"(#loc215)) +#loc555 = loc("qT_ptrs"(#loc216)) +#loc556 = loc("qT_ptrs"(#loc217)) +#loc557 = loc("do_ptrs"(#loc218)) +#loc558 = loc("do_ptrs"(#loc219)) +#loc559 = loc("do_ptrs"(#loc220)) +#loc560 = loc("hi"(#loc221)) +#loc561 = loc("hi"(#loc222)) +#loc562 = loc("hi"(#loc223)) +#loc563 = loc("hi"(#loc224)) +#loc564 = loc("qT"(#loc225)) +#loc565 = loc(callsite(#loc226 at #loc214)) +#loc566 = loc("lse"(#loc227)) +#loc567 = loc("n"(#loc228)) +#loc568 = loc("tmp44"(#loc229)) +#loc569 = loc("tmp45"(#loc230)) +#loc570 = loc("tmp47"(#loc231)) +#loc571 = loc("dk"(#loc232)) +#loc572 = loc("tmp47"(#loc233)) +#loc573 = loc("tmp48"(#loc234)) +#loc574 = loc("tmp50"(#loc235)) +#loc575 = loc("tmp51"(#loc236)) +#loc576 = loc("tmp55"(#loc237)) +#loc577 = loc("tmp56"(#loc238)) +#loc578 = loc("tmp58"(#loc239)) +#loc579 = loc("tmp59"(#loc240)) +#loc580 = loc("tmp60"(#loc241)) +#loc581 = loc("tmp61"(#loc242)) +#loc582 = loc("tmp62"(#loc243)) +#loc583 = loc("tmp63"(#loc244)) +#loc584 = loc("tmp64"(#loc245)) +#loc585 = loc("tmp65"(#loc246)) +#loc586 = loc("tmp66"(#loc247)) +#loc587 = loc("tmp67"(#loc248)) +#loc588 = loc("tmp69"(#loc249)) +#loc589 = loc("tmp72"(#loc250)) +#loc590 = loc("tmp77"(#loc251)) +#loc591 = loc("do"(#loc252)) +#loc592 = loc("q_indices"(#loc253)) +#loc593 = loc("q_start"(#loc254)) +#loc594 = loc("q_start"(#loc255)) +#loc595 = loc("sparse_q_num_blocks"(#loc256)) +#loc596 = loc("sparse_q_num_blocks"(#loc257)) +#loc597 = loc("offs_m1"(#loc258)) +#loc598 = loc("qkT"(#loc260)) +#loc599 = loc("dv"(#loc261)) +#loc600 = loc("off_hq1"(#loc262)) +#loc601 = loc("q_adj1"(#loc263)) +#loc602 = loc("q_adj1"(#loc264)) +#loc603 = loc("q_adj1"(#loc265)) +#loc604 = loc("do_adj1"(#loc266)) +#loc605 = loc("do_adj1"(#loc267)) +#loc606 = loc("do_adj1"(#loc268)) +#loc607 = loc("off_chz1"(#loc269)) +#loc608 = loc("off_chz1"(#loc270)) +#loc609 = loc("off_chz1"(#loc271)) +#loc610 = loc("Q1"(#loc272)) +#loc611 = loc("DO1"(#loc273)) +#loc612 = loc("LSE1"(#loc274)) +#loc613 = loc("DELTA1"(#loc275)) +#loc614 = loc("qT_ptrs"(#loc276)) +#loc615 = loc("do_ptrs"(#loc277)) +#loc616 = loc("lse"(#loc278)) +#loc617 = loc("Di"(#loc279)) +#loc618 = loc("lse"(#loc280)) +#loc619 = loc("Di"(#loc281)) +#loc620 = loc("qT_ptrs"(#loc282)) +#loc621 = loc("do_ptrs"(#loc283)) +#loc622 = loc("offs_m1"(#loc284)) +#loc623 = loc("dk"(#loc286)) +#loc624 = loc("lse"(#loc287)) +#loc625 = loc("lse"(#loc288)) +#loc626 = loc("qkT"(#loc289)) +#loc627 = loc("m"(#loc290)) +#loc628 = loc("post_mod_scores"(#loc291)) +#loc629 = loc("tmp49"(#loc292)) +#loc630 = loc("tmp52"(#loc293)) +#loc631 = loc("tmp68"(#loc294)) +#loc632 = loc("tmp70"(#loc295)) +#loc633 = loc("tmp71"(#loc296)) +#loc634 = loc("tmp73"(#loc297)) +#loc635 = loc("tmp74"(#loc298)) +#loc636 = loc("tmp75"(#loc299)) +#loc637 = loc("tmp76"(#loc300)) +#loc638 = loc("tmp78"(#loc301)) +#loc639 = loc("post_mod_scores"(#loc302)) +#loc640 = loc("post_mod_scores"(#loc303)) +#loc641 = loc("pT"(#loc304)) +#loc642 = loc("pT"(#loc305)) +#loc643 = loc("pT"(#loc306)) +#loc644 = loc("dpT"(#loc307)) +#loc645 = loc("dv"(#loc308)) +#loc646 = loc("dv"(#loc309)) +#loc647 = loc("dpT"(#loc310)) +#loc648 = loc("dsT"(#loc311)) +#loc649 = loc("dsT"(#loc312)) +#loc650 = loc("dsT"(#loc313)) +#loc651 = loc("grad_scores"(#loc314)) +#loc652 = loc("dsT"(#loc315)) +#loc653 = loc("dk"(#loc316)) +#loc654 = loc("dk"(#loc317)) +#loc655 = loc("offset"(#loc318)) +#loc656 = loc("qT_ptrs"(#loc319)) +#loc657 = loc("do_ptrs"(#loc320)) +#loc658 = loc(callsite(#loc226 at #loc259)) +#loc659 = loc("dv_ptrs"(#loc322)) +#loc660 = loc("dv_ptrs"(#loc323)) +#loc661 = loc("dk"(#loc327)) +#loc662 = loc("xindex"(#loc328)) +#loc663 = loc("xindex"(#loc329)) +#loc664 = loc("xindex"(#loc330)) +#loc665 = loc("xindex"(#loc331)) +#loc666 = loc("xindex"(#loc332)) +#loc667 = loc("xindex"(#loc333)) +#loc668 = loc("xindex"(#loc334)) +#loc669 = loc(callsite(#loc13 at #loc366)) +#loc670 = loc(callsite(#loc15 at #loc366)) +#loc671 = loc(callsite(#loc13 at #loc367)) +#loc672 = loc(callsite(#loc15 at #loc367)) +#loc673 = loc(callsite(#loc412 at #loc413)) +#loc674 = loc(callsite(#loc414 at #loc413)) +#loc675 = loc(callsite(#loc415 at #loc413)) +#loc676 = loc(callsite(#loc416 at #loc413)) +#loc677 = loc(callsite(#loc417 at #loc413)) +#loc678 = loc(callsite(#loc69 at #loc413)) +#loc679 = loc(callsite(#loc70 at #loc413)) +#loc680 = loc(callsite(#loc414 at #loc418)) +#loc681 = loc(callsite(#loc415 at #loc418)) +#loc682 = loc(callsite(#loc417 at #loc418)) +#loc683 = loc(callsite(#loc70 at #loc418)) +#loc684 = loc(callsite(#loc434 at #loc435)) +#loc685 = loc(callsite(#loc436 at #loc435)) +#loc686 = loc(callsite(#loc437 at #loc435)) +#loc687 = loc(callsite(#loc438 at #loc435)) +#loc688 = loc(callsite(#loc439 at #loc435)) +#loc689 = loc(callsite(#loc440 at #loc435)) +#loc690 = loc(callsite(#loc441 at #loc435)) +#loc691 = loc(callsite(#loc442 at #loc435)) +#loc692 = loc(callsite(#loc443 at #loc435)) +#loc693 = loc(callsite(#loc444 at #loc435)) +#loc694 = loc(callsite(#loc445 at #loc435)) +#loc695 = loc(callsite(#loc447 at #loc435)) +#loc696 = loc("offs_n2"(#loc451)) +#loc697 = loc(callsite(#loc466 at #loc435)) +#loc698 = loc(callsite(#loc467 at #loc435)) +#loc699 = loc(callsite(#loc468 at #loc435)) +#loc700 = loc(callsite(#loc504 at #loc435)) +#loc701 = loc(callsite(#loc521 at #loc435)) +#loc702 = loc(callsite(#loc434 at #loc528)) +#loc703 = loc(callsite(#loc436 at #loc528)) +#loc704 = loc(callsite(#loc437 at #loc528)) +#loc705 = loc(callsite(#loc439 at #loc528)) +#loc706 = loc(callsite(#loc440 at #loc528)) +#loc707 = loc(callsite(#loc441 at #loc528)) +#loc708 = loc(callsite(#loc442 at #loc528)) +#loc709 = loc(callsite(#loc445 at #loc528)) +#loc710 = loc(callsite(#loc447 at #loc528)) +#loc711 = loc(callsite(#loc466 at #loc528)) +#loc712 = loc(callsite(#loc467 at #loc528)) +#loc713 = loc(callsite(#loc468 at #loc528)) +#loc714 = loc(callsite(#loc504 at #loc528)) +#loc715 = loc(callsite(#loc521 at #loc528)) +#loc716 = loc(callsite(#loc412 at #loc535)) +#loc717 = loc(callsite(#loc414 at #loc535)) +#loc718 = loc(callsite(#loc415 at #loc535)) +#loc719 = loc(callsite(#loc416 at #loc535)) +#loc720 = loc(callsite(#loc417 at #loc535)) +#loc721 = loc(callsite(#loc69 at #loc535)) +#loc722 = loc(callsite(#loc70 at #loc535)) +#loc723 = loc(callsite(#loc415 at #loc536)) +#loc724 = loc(callsite(#loc417 at #loc536)) +#loc725 = loc(callsite(#loc70 at #loc536)) +#loc726 = loc(callsite(#loc553 at #loc214)) +#loc727 = loc(callsite(#loc554 at #loc214)) +#loc728 = loc(callsite(#loc555 at #loc214)) +#loc729 = loc(callsite(#loc556 at #loc214)) +#loc730 = loc(callsite(#loc557 at #loc214)) +#loc731 = loc(callsite(#loc558 at #loc214)) +#loc732 = loc(callsite(#loc559 at #loc214)) +#loc733 = loc(callsite(#loc560 at #loc214)) +#loc734 = loc(callsite(#loc561 at #loc214)) +#loc735 = loc(callsite(#loc562 at #loc214)) +#loc736 = loc(callsite(#loc563 at #loc214)) +#loc737 = loc(callsite(#loc564 at #loc565)) +#loc738 = loc(callsite(#loc566 at #loc565)) +#loc739 = loc(callsite(#loc567 at #loc565)) +#loc740 = loc(callsite(#loc568 at #loc565)) +#loc741 = loc(callsite(#loc569 at #loc565)) +#loc742 = loc(callsite(#loc570 at #loc565)) +#loc743 = loc("dv"(#loc571)) +#loc744 = loc(callsite(#loc572 at #loc565)) +#loc745 = loc(callsite(#loc573 at #loc565)) +#loc746 = loc(callsite(#loc574 at #loc565)) +#loc747 = loc(callsite(#loc575 at #loc565)) +#loc748 = loc(callsite(#loc576 at #loc565)) +#loc749 = loc(callsite(#loc577 at #loc565)) +#loc750 = loc(callsite(#loc578 at #loc565)) +#loc751 = loc(callsite(#loc579 at #loc565)) +#loc752 = loc(callsite(#loc580 at #loc565)) +#loc753 = loc(callsite(#loc581 at #loc565)) +#loc754 = loc(callsite(#loc582 at #loc565)) +#loc755 = loc(callsite(#loc583 at #loc565)) +#loc756 = loc(callsite(#loc584 at #loc565)) +#loc757 = loc(callsite(#loc585 at #loc565)) +#loc758 = loc(callsite(#loc586 at #loc565)) +#loc759 = loc(callsite(#loc587 at #loc565)) +#loc760 = loc(callsite(#loc588 at #loc565)) +#loc761 = loc(callsite(#loc589 at #loc565)) +#loc762 = loc(callsite(#loc590 at #loc565)) +#loc763 = loc(callsite(#loc591 at #loc565)) +#loc764 = loc(callsite(#loc553 at #loc259)) +#loc765 = loc(callsite(#loc554 at #loc259)) +#loc766 = loc(callsite(#loc557 at #loc259)) +#loc767 = loc(callsite(#loc558 at #loc259)) +#loc768 = loc(callsite(#loc560 at #loc259)) +#loc769 = loc(callsite(#loc563 at #loc259)) +#loc770 = loc(callsite(#loc598 at #loc565)) +#loc771 = loc("dk"(#loc599)) +#loc772 = loc(callsite(#loc614 at #loc214)) +#loc773 = loc(callsite(#loc615 at #loc214)) +#loc774 = loc(callsite(#loc616 at #loc565)) +#loc775 = loc(callsite(#loc617 at #loc565)) +#loc776 = loc(callsite(#loc618 at #loc565)) +#loc777 = loc(callsite(#loc619 at #loc565)) +#loc778 = loc(callsite(#loc620 at #loc214)) +#loc779 = loc(callsite(#loc621 at #loc214)) +#loc780 = loc(callsite(#loc622 at #loc214)) +#loc781 = loc(callsite(#loc623 at #loc565)) +#loc782 = loc(callsite(#loc624 at #loc565)) +#loc783 = loc(callsite(#loc625 at #loc565)) +#loc784 = loc(callsite(#loc626 at #loc565)) +#loc785 = loc(callsite(#loc627 at #loc565)) +#loc786 = loc(callsite(#loc628 at #loc565)) +#loc787 = loc(callsite(#loc629 at #loc565)) +#loc788 = loc(callsite(#loc630 at #loc565)) +#loc789 = loc(callsite(#loc631 at #loc565)) +#loc790 = loc(callsite(#loc632 at #loc565)) +#loc791 = loc(callsite(#loc633 at #loc565)) +#loc792 = loc(callsite(#loc634 at #loc565)) +#loc793 = loc(callsite(#loc635 at #loc565)) +#loc794 = loc(callsite(#loc636 at #loc565)) +#loc795 = loc(callsite(#loc637 at #loc565)) +#loc796 = loc(callsite(#loc638 at #loc565)) +#loc797 = loc(callsite(#loc639 at #loc565)) +#loc798 = loc(callsite(#loc640 at #loc565)) +#loc799 = loc(callsite(#loc641 at #loc565)) +#loc800 = loc(callsite(#loc642 at #loc565)) +#loc801 = loc(callsite(#loc643 at #loc565)) +#loc802 = loc(callsite(#loc644 at #loc565)) +#loc803 = loc(callsite(#loc645 at #loc565)) +#loc804 = loc(callsite(#loc646 at #loc565)) +#loc805 = loc(callsite(#loc647 at #loc565)) +#loc806 = loc(callsite(#loc648 at #loc565)) +#loc807 = loc(callsite(#loc649 at #loc565)) +#loc808 = loc(callsite(#loc650 at #loc565)) +#loc809 = loc(callsite(#loc651 at #loc565)) +#loc810 = loc(callsite(#loc652 at #loc565)) +#loc811 = loc(callsite(#loc653 at #loc565)) +#loc812 = loc(callsite(#loc654 at #loc565)) +#loc813 = loc(callsite(#loc655 at #loc214)) +#loc814 = loc(callsite(#loc656 at #loc214)) +#loc815 = loc(callsite(#loc657 at #loc214)) +#loc816 = loc(callsite(#loc614 at #loc259)) +#loc817 = loc(callsite(#loc556 at #loc259)) +#loc818 = loc(callsite(#loc615 at #loc259)) +#loc819 = loc(callsite(#loc559 at #loc259)) +#loc820 = loc(callsite(#loc564 at #loc658)) +#loc821 = loc(callsite(#loc618 at #loc658)) +#loc822 = loc(callsite(#loc591 at #loc658)) +#loc823 = loc(callsite(#loc619 at #loc658)) +#loc824 = loc(callsite(#loc566 at #loc658)) +#loc825 = loc(callsite(#loc616 at #loc658)) +#loc826 = loc(callsite(#loc617 at #loc658)) +#loc827 = loc(callsite(#loc620 at #loc259)) +#loc828 = loc(callsite(#loc621 at #loc259)) +#loc829 = loc(callsite(#loc622 at #loc259)) +#loc830 = loc(callsite(#loc623 at #loc658)) +#loc831 = loc(callsite(#loc624 at #loc658)) +#loc832 = loc(callsite(#loc625 at #loc658)) +#loc833 = loc(callsite(#loc598 at #loc658)) +#loc834 = loc(callsite(#loc626 at #loc658)) +#loc835 = loc(callsite(#loc628 at #loc658)) +#loc836 = loc(callsite(#loc640 at #loc658)) +#loc837 = loc(callsite(#loc641 at #loc658)) +#loc838 = loc(callsite(#loc642 at #loc658)) +#loc839 = loc(callsite(#loc643 at #loc658)) +#loc840 = loc(callsite(#loc644 at #loc658)) +#loc841 = loc(callsite(#loc645 at #loc658)) +#loc842 = loc(callsite(#loc646 at #loc658)) +#loc843 = loc(callsite(#loc647 at #loc658)) +#loc844 = loc(callsite(#loc648 at #loc658)) +#loc845 = loc(callsite(#loc649 at #loc658)) +#loc846 = loc(callsite(#loc650 at #loc658)) +#loc847 = loc(callsite(#loc651 at #loc658)) +#loc848 = loc(callsite(#loc653 at #loc658)) +#loc849 = loc(callsite(#loc654 at #loc658)) +#loc850 = loc(callsite(#loc655 at #loc259)) +#loc851 = loc(callsite(#loc656 at #loc259)) +#loc852 = loc(callsite(#loc657 at #loc259)) +#loc853 = loc(callsite(#loc13 at #loc692)) +#loc854 = loc(callsite(#loc15 at #loc692)) +#loc855 = loc(callsite(#loc446 at #loc695)) +#loc856 = loc(callsite(#loc448 at #loc695)) +#loc857 = loc(callsite(#loc449 at #loc695)) +#loc858 = loc(callsite(#loc450 at #loc695)) +#loc859 = loc("kT_ptrs"(#loc696)) +#loc860 = loc(callsite(#loc452 at #loc695)) +#loc861 = loc(callsite(#loc453 at #loc695)) +#loc862 = loc(callsite(#loc454 at #loc695)) +#loc863 = loc(callsite(#loc455 at #loc695)) +#loc864 = loc(callsite(#loc456 at #loc695)) +#loc865 = loc(callsite(#loc457 at #loc695)) +#loc866 = loc(callsite(#loc458 at #loc695)) +#loc867 = loc(callsite(#loc459 at #loc695)) +#loc868 = loc(callsite(#loc460 at #loc695)) +#loc869 = loc(callsite(#loc461 at #loc695)) +#loc870 = loc(callsite(#loc462 at #loc695)) +#loc871 = loc(callsite(#loc463 at #loc695)) +#loc872 = loc(callsite(#loc464 at #loc695)) +#loc873 = loc(callsite(#loc465 at #loc695)) +#loc874 = loc(callsite(#loc469 at #loc695)) +#loc875 = loc(callsite(#loc470 at #loc695)) +#loc876 = loc(callsite(#loc471 at #loc695)) +#loc877 = loc(callsite(#loc472 at #loc695)) +#loc878 = loc(callsite(#loc473 at #loc695)) +#loc879 = loc(callsite(#loc474 at #loc695)) +#loc880 = loc(callsite(#loc475 at #loc695)) +#loc881 = loc(callsite(#loc476 at #loc695)) +#loc882 = loc(callsite(#loc477 at #loc695)) +#loc883 = loc(callsite(#loc478 at #loc695)) +#loc884 = loc(callsite(#loc479 at #loc695)) +#loc885 = loc(callsite(#loc480 at #loc695)) +#loc886 = loc(callsite(#loc481 at #loc695)) +#loc887 = loc(callsite(#loc482 at #loc695)) +#loc888 = loc(callsite(#loc483 at #loc695)) +#loc889 = loc(callsite(#loc484 at #loc695)) +#loc890 = loc(callsite(#loc485 at #loc695)) +#loc891 = loc(callsite(#loc486 at #loc695)) +#loc892 = loc(callsite(#loc487 at #loc695)) +#loc893 = loc(callsite(#loc488 at #loc695)) +#loc894 = loc(callsite(#loc489 at #loc695)) +#loc895 = loc(callsite(#loc490 at #loc695)) +#loc896 = loc(callsite(#loc491 at #loc695)) +#loc897 = loc(callsite(#loc492 at #loc695)) +#loc898 = loc(callsite(#loc493 at #loc695)) +#loc899 = loc(callsite(#loc494 at #loc695)) +#loc900 = loc(callsite(#loc495 at #loc695)) +#loc901 = loc(callsite(#loc496 at #loc695)) +#loc902 = loc(callsite(#loc497 at #loc695)) +#loc903 = loc(callsite(#loc498 at #loc695)) +#loc904 = loc(callsite(#loc499 at #loc695)) +#loc905 = loc(callsite(#loc500 at #loc695)) +#loc906 = loc(callsite(#loc501 at #loc695)) +#loc907 = loc(callsite(#loc502 at #loc695)) +#loc908 = loc(callsite(#loc503 at #loc700)) +#loc909 = loc(callsite(#loc505 at #loc700)) +#loc910 = loc(callsite(#loc506 at #loc700)) +#loc911 = loc(callsite(#loc507 at #loc700)) +#loc912 = loc(callsite(#loc508 at #loc700)) +#loc913 = loc(callsite(#loc509 at #loc700)) +#loc914 = loc(callsite(#loc510 at #loc700)) +#loc915 = loc(callsite(#loc511 at #loc700)) +#loc916 = loc(callsite(#loc512 at #loc700)) +#loc917 = loc(callsite(#loc513 at #loc700)) +#loc918 = loc(callsite(#loc514 at #loc700)) +#loc919 = loc(callsite(#loc515 at #loc700)) +#loc920 = loc(callsite(#loc516 at #loc700)) +#loc921 = loc(callsite(#loc517 at #loc700)) +#loc922 = loc(callsite(#loc518 at #loc700)) +#loc923 = loc(callsite(#loc519 at #loc700)) +#loc924 = loc(callsite(#loc520 at #loc700)) +#loc925 = loc(callsite(#loc446 at #loc710)) +#loc926 = loc(callsite(#loc465 at #loc710)) +#loc927 = loc(callsite(#loc469 at #loc710)) +#loc928 = loc(callsite(#loc470 at #loc710)) +#loc929 = loc(callsite(#loc471 at #loc710)) +#loc930 = loc(callsite(#loc473 at #loc710)) +#loc931 = loc(callsite(#loc495 at #loc710)) +#loc932 = loc(callsite(#loc462 at #loc710)) +#loc933 = loc(callsite(#loc496 at #loc710)) +#loc934 = loc(callsite(#loc497 at #loc710)) +#loc935 = loc(callsite(#loc464 at #loc710)) +#loc936 = loc(callsite(#loc498 at #loc710)) +#loc937 = loc(callsite(#loc499 at #loc710)) +#loc938 = loc(callsite(#loc501 at #loc710)) +#loc939 = loc(callsite(#loc502 at #loc710)) +#loc940 = loc(callsite(#loc503 at #loc714)) +#loc941 = loc(callsite(#loc505 at #loc714)) +#loc942 = loc(callsite(#loc506 at #loc714)) +#loc943 = loc(callsite(#loc507 at #loc714)) +#loc944 = loc(callsite(#loc508 at #loc714)) +#loc945 = loc(callsite(#loc509 at #loc714)) +#loc946 = loc(callsite(#loc510 at #loc714)) +#loc947 = loc(callsite(#loc511 at #loc714)) +#loc948 = loc(callsite(#loc512 at #loc714)) +#loc949 = loc(callsite(#loc513 at #loc714)) +#loc950 = loc(callsite(#loc514 at #loc714)) +#loc951 = loc(callsite(#loc515 at #loc714)) +#loc952 = loc(callsite(#loc516 at #loc714)) +#loc953 = loc(callsite(#loc517 at #loc714)) +#loc954 = loc(callsite(#loc518 at #loc714)) +#loc955 = loc(callsite(#loc519 at #loc714)) +#loc956 = loc(callsite(#loc520 at #loc714)) +#loc957 = loc(callsite(#loc13 at #loc734)) +#loc958 = loc(callsite(#loc15 at #loc734)) +#loc959 = loc(callsite(#loc99 at #loc737)) +#loc960 = loc(callsite(#loc102 at #loc739)) +#loc961 = loc("offs_m1"(#loc743)) +#loc962 = loc(callsite(#loc69 at #loc763)) +#loc963 = loc(callsite(#loc120 at #loc737)) +#loc964 = loc(callsite(#loc70 at #loc763)) +#loc965 = loc(callsite(#loc125 at #loc737)) +#loc966 = loc(callsite(#loc285 at #loc763)) +#loc967 = loc(callsite(#loc102 at #loc785)) +#loc968 = loc(callsite(#loc503 at #loc813)) +#loc969 = loc(callsite(#loc505 at #loc813)) +#loc970 = loc(callsite(#loc506 at #loc813)) +#loc971 = loc(callsite(#loc507 at #loc813)) +#loc972 = loc(callsite(#loc508 at #loc813)) +#loc973 = loc(callsite(#loc509 at #loc813)) +#loc974 = loc(callsite(#loc510 at #loc813)) +#loc975 = loc(callsite(#loc511 at #loc813)) +#loc976 = loc(callsite(#loc512 at #loc813)) +#loc977 = loc(callsite(#loc513 at #loc813)) +#loc978 = loc(callsite(#loc514 at #loc813)) +#loc979 = loc(callsite(#loc515 at #loc813)) +#loc980 = loc(callsite(#loc516 at #loc813)) +#loc981 = loc(callsite(#loc517 at #loc813)) +#loc982 = loc(callsite(#loc518 at #loc813)) +#loc983 = loc(callsite(#loc519 at #loc813)) +#loc984 = loc(callsite(#loc520 at #loc813)) +#loc985 = loc(callsite(#loc120 at #loc820)) +#loc986 = loc(callsite(#loc70 at #loc822)) +#loc987 = loc(callsite(#loc99 at #loc820)) +#loc988 = loc(callsite(#loc69 at #loc822)) +#loc989 = loc(callsite(#loc125 at #loc820)) +#loc990 = loc(callsite(#loc285 at #loc822)) +#loc991 = loc(callsite(#loc503 at #loc850)) +#loc992 = loc(callsite(#loc505 at #loc850)) +#loc993 = loc(callsite(#loc506 at #loc850)) +#loc994 = loc(callsite(#loc507 at #loc850)) +#loc995 = loc(callsite(#loc508 at #loc850)) +#loc996 = loc(callsite(#loc509 at #loc850)) +#loc997 = loc(callsite(#loc510 at #loc850)) +#loc998 = loc(callsite(#loc511 at #loc850)) +#loc999 = loc(callsite(#loc512 at #loc850)) +#loc1000 = loc(callsite(#loc513 at #loc850)) +#loc1001 = loc(callsite(#loc514 at #loc850)) +#loc1002 = loc(callsite(#loc515 at #loc850)) +#loc1003 = loc(callsite(#loc516 at #loc850)) +#loc1004 = loc(callsite(#loc517 at #loc850)) +#loc1005 = loc(callsite(#loc518 at #loc850)) +#loc1006 = loc(callsite(#loc519 at #loc850)) +#loc1007 = loc(callsite(#loc520 at #loc850)) +#loc1008 = loc(callsite(#loc99 at #loc855)) +#loc1009 = loc(callsite(#loc102 at #loc856)) +#loc1010 = loc("vT_ptrs"(#loc859)) +#loc1011 = loc(callsite(#loc120 at #loc855)) +#loc1012 = loc(callsite(#loc120 at #loc873)) +#loc1013 = loc(callsite(#loc125 at #loc855)) +#loc1014 = loc(callsite(#loc102 at #loc877)) +#loc1015 = loc(callsite(#loc120 at #loc925)) +#loc1016 = loc(callsite(#loc120 at #loc926)) +#loc1017 = loc(callsite(#loc99 at #loc925)) +#loc1018 = loc(callsite(#loc125 at #loc925)) +#loc1019 = loc("qT_ptrs"(#loc961)) +#loc1020 = loc(callsite(#loc1010 at #loc435)) +#loc1021 = loc(callsite(#loc1010 at #loc528)) +#loc1022 = loc("do_ptrs"(#loc1019)) +#loc1023 = loc(callsite(#loc1022 at #loc214)) +#loc1024 = loc(callsite(#loc1022 at #loc259)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fc69a0e1b63ed9b9a931577a565f24fb75af540c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CFBZ4SDWMXKKOGH4P7QHCRICB77JAJO5VDZAFPNRVWGV7WGXHPXQ/triton_tem_fused_zeros_1.ttir @@ -0,0 +1,1666 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":18:0) +#loc341 = loc("arg_Q"(#loc)) +#loc342 = loc("arg_K"(#loc)) +#loc343 = loc("arg_V"(#loc)) +#loc344 = loc("arg_LSE"(#loc)) +#loc345 = loc("arg_DELTA"(#loc)) +#loc346 = loc("arg_DO"(#loc)) +#loc347 = loc("arg_DQ"(#loc)) +#loc348 = loc("arg_DV"(#loc)) +#loc349 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc350 = loc("arg_KV_IDX"(#loc)) +#loc351 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc352 = loc("arg_Q_IDX"(#loc)) +#loc353 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc354 = loc("arg_FULL_KV_IDX"(#loc)) +#loc355 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc356 = loc("arg_FULL_Q_IDX"(#loc)) +#loc357 = loc("in_ptr16"(#loc)) +#loc358 = loc("out_ptr0"(#loc)) +#loc359 = loc("ks0"(#loc)) +#loc360 = loc("ks1"(#loc)) +#loc361 = loc("ks2"(#loc)) +#loc362 = loc("ks3"(#loc)) +#loc363 = loc("ks4"(#loc)) +#loc364 = loc("ks5"(#loc)) +#loc365 = loc("ks6"(#loc)) +#loc366 = loc("ks7"(#loc)) +#loc367 = loc("ks8"(#loc)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc)), %ks6: i32 loc("ks6"(#loc)), %ks7: i32 loc("ks7"(#loc)), %ks8: i32 loc("ks8"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_6 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1) + %cst_8 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc1) + %cst_9 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_10 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16> loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_16 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_17 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc1) + %cst_18 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_19 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %cst_20 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %HQ = arith.constant 32 : i32 loc(#loc368) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc3) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc4) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc5) + %3 = arith.cmpi sle, %ks0, %c1_i32 : i32 loc(#loc6) + %4 = arith.extui %3 : i1 to i32 loc(#loc7) + %5 = arith.cmpi sgt, %ks0, %c1_i32 : i32 loc(#loc8) + %6 = arith.extui %5 : i1 to i32 loc(#loc9) + %7 = arith.muli %ks0, %6 : i32 loc(#loc9) + %8 = arith.addi %4, %7 : i32 loc(#loc10) + %9 = arith.muli %8, %c4096_i32 : i32 loc(#loc11) + %10 = arith.muli %8, %c128_i32 : i32 loc(#loc12) + %pid = tt.get_program_id x : i32 loc(#loc369) + %NUM_KV_BLOCKS = arith.addi %ks1, %c127_i32 : i32 loc(#loc675) + %NUM_KV_BLOCKS_21 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc676) + %NUM_Q_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc677) + %NUM_Q_BLOCKS_22 = arith.divsi %NUM_Q_BLOCKS, %c128_i32 : i32 loc(#loc678) + %off_zq = tt.get_program_id y : i32 loc(#loc372) + %off_hkv = tt.get_program_id z : i32 loc(#loc373) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc374) + %k_adj = arith.muli %2, %off_hkv : i32 loc(#loc375) + %k_adj_23 = arith.muli %1, %off_zkv : i32 loc(#loc376) + %k_adj_24 = arith.addi %k_adj, %k_adj_23 : i32 loc(#loc377) + %k_adj_25 = arith.extsi %k_adj_24 : i32 to i64 loc(#loc378) + %dv_adj = arith.muli %1, %off_zq : i32 loc(#loc379) + %dv_adj_26 = arith.addi %k_adj, %dv_adj : i32 loc(#loc380) + %dv_adj_27 = arith.extsi %dv_adj_26 : i32 to i64 loc(#loc381) + %K = tt.addptr %arg_K, %k_adj_25 : !tt.ptr, i64 loc(#loc382) + %V = tt.addptr %arg_V, %k_adj_25 : !tt.ptr, i64 loc(#loc383) + %DV = tt.addptr %arg_DV, %dv_adj_27 : !tt.ptr, i64 loc(#loc384) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc385) + %11 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_21 : i32 loc(#loc32) + scf.if %11 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_21 : i32 loc(#loc386) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS_22 : i32 loc(#loc387) + %off_hq2_28 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc388) + %off_hq2_29 = arith.addi %off_hq2, %off_hq2_28 : i32 loc(#loc389) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS_22 : i32 loc(#loc390) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc391) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc392) + %sparse_kv_num_blks_offset_30 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc393) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc394) + %sparse_kv_idx_offset_31 = arith.muli %start_m2_block, %ks4 : i32 loc(#loc395) + %sparse_kv_idx_offset_32 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_31 : i32 loc(#loc396) + %q_adj2 = arith.muli %off_hq2_29, %c128_i32 : i32 loc(#loc397) + %q_adj2_33 = arith.muli %0, %off_zq : i32 loc(#loc398) + %q_adj2_34 = arith.addi %q_adj2, %q_adj2_33 : i32 loc(#loc399) + %q_adj2_35 = arith.extsi %q_adj2_34 : i32 to i64 loc(#loc400) + %do_adj2 = arith.muli %10, %off_hq2_29 : i32 loc(#loc401) + %do_adj2_36 = arith.muli %9, %off_zq : i32 loc(#loc402) + %do_adj2_37 = arith.addi %do_adj2, %do_adj2_36 : i32 loc(#loc403) + %do_adj2_38 = arith.extsi %do_adj2_37 : i32 to i64 loc(#loc404) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc405) + %off_chz2_39 = arith.addi %off_chz2, %off_hq2_29 : i32 loc(#loc406) + %off_chz2_40 = arith.muli %off_chz2_39, %ks0 : i32 loc(#loc407) + %off_chz2_41 = arith.extsi %off_chz2_40 : i32 to i64 loc(#loc408) + %Q2 = tt.addptr %arg_Q, %q_adj2_35 : !tt.ptr, i64 loc(#loc409) + %DO2 = tt.addptr %arg_DO, %do_adj2_38 : !tt.ptr, i64 loc(#loc410) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_35 : !tt.ptr, i64 loc(#loc411) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_41 : !tt.ptr, i64 loc(#loc412) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_41 : !tt.ptr, i64 loc(#loc413) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc414) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32> loc(#loc415) + %offs_m2_42 = arith.addi %offs_m2, %offs_k : tensor<128xi32> loc(#loc415) + %ptr = tt.expand_dims %offs_m2_42 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc679) + %ptr_43 = arith.muli %ptr, %cst_18 : tensor<128x1xi32> loc(#loc680) + %ptr_44 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc681) + %ptr_45 = tt.addptr %ptr_44, %ptr_43 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc681) + %ptr_46 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc682) + %ptr_47 = tt.broadcast %ptr_45 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc683) + %ptr_48 = tt.broadcast %ptr_46 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc683) + %ptr_49 = tt.addptr %ptr_47, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc683) + %q = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc684) + %q_50 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32> loc(#loc684) + %q_51 = tt.broadcast %q_50 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc685) + %q_52 = tt.load %ptr_49, %q_51, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc685) + %ptr_53 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc686) + %ptr_54 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc687) + %ptr_55 = tt.addptr %ptr_54, %ptr_53 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc687) + %ptr_56 = tt.broadcast %ptr_55 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc688) + %ptr_57 = tt.addptr %ptr_56, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc688) + %do = tt.load %ptr_57, %q_51, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc689) + %Di = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc423) + %Di_58 = arith.cmpi slt, %offs_m2_42, %Di : tensor<128xi32> loc(#loc423) + %Di_59 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc424) + %Di_60 = tt.addptr %Di_59, %offs_m2_42 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc424) + %Di_61 = tt.load %Di_60, %Di_58 : tensor<128x!tt.ptr> loc(#loc425) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc426) + %lse_62 = tt.addptr %lse, %offs_m2_42 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc426) + %lse_63 = tt.load %lse_62, %Di_58 : tensor<128x!tt.ptr> loc(#loc427) + %lse_64 = arith.cmpf oeq, %lse_63, %cst_20 : tensor<128xf32> loc(#loc428) + %lse_65 = arith.select %lse_64, %cst_19, %lse_63 : tensor<128xi1>, tensor<128xf32> loc(#loc429) + %lse_66 = tt.expand_dims %lse_65 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc430) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_32 : !tt.ptr, i32 loc(#loc431) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc432) + %kv_start_67 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc433) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_30 : !tt.ptr, i32 loc(#loc434) + %sparse_kv_num_blocks_68 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc435) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc436) + %offs_n2_69 = tt.splat %kv_start_67 : i32 -> tensor<64xi32> loc(#loc437) + %offs_n2_70 = arith.addi %offs_n2_69, %offs_n2 : tensor<64xi32> loc(#loc437) + %kT_ptrs = tt.expand_dims %offs_n2_70 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc690) + %kT_ptrs_71 = arith.muli %kT_ptrs, %cst_1 : tensor<1x64xi32> loc(#loc691) + %kT_ptrs_72 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc692) + %kT_ptrs_73 = tt.addptr %kT_ptrs_72, %kT_ptrs_71 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc692) + %kT_ptrs_74 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc693) + %kT_ptrs_75 = tt.broadcast %kT_ptrs_73 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc694) + %kT_ptrs_76 = tt.broadcast %kT_ptrs_74 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc694) + %kT_ptrs_77 = tt.addptr %kT_ptrs_75, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc694) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc695) + %vT_ptrs_78 = tt.addptr %vT_ptrs, %kT_ptrs_71 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc695) + %vT_ptrs_79 = tt.broadcast %vT_ptrs_78 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc696) + %vT_ptrs_80 = tt.addptr %vT_ptrs_79, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc696) + %hi = arith.muli %sparse_kv_num_blocks_68, %c2_i32 : i32 loc(#loc697) + %hi_81 = arith.addi %ks1, %c63_i32 : i32 loc(#loc861) + %hi_82 = arith.divsi %hi_81, %c64_i32 : i32 loc(#loc862) + %hi_83 = arith.maxsi %hi_82, %c1_i32 : i32 loc(#loc699) + %hi_84 = arith.minsi %hi, %hi_83 : i32 loc(#loc700) + %vT_ptrs_85:4 = scf.for %start_n = %c0_i32 to %hi_84 step %c1_i32 iter_args(%dq_107 = %cst_14, %offs_n2_108 = %offs_n2_70, %kT_ptrs_109 = %kT_ptrs_77, %vT_ptrs_110 = %vT_ptrs_80) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_108 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1018) + %kT_111 = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc1019) + %kT_112 = arith.cmpi slt, %kT, %kT_111 : tensor<1x64xi32> loc(#loc1019) + %kT_113 = tt.broadcast %kT_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1020) + %kT_114 = tt.load %kT_ptrs_109, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1020) + %qk = tt.dot %q_52, %kT_114, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc865) + %qk_115 = arith.mulf %qk, %cst_10 : tensor<128x64xf32> loc(#loc866) + %n = arith.remsi %kT, %kT_111 : tensor<1x64xi32> loc(#loc1021) + %m = arith.remsi %ptr, %q : tensor<128x1xi32> loc(#loc1022) + %post_mod_scores = arith.select %kT_113, %qk_115, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc869) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc870) + %tmp4_116 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc870) + %tmp4_117 = arith.cmpi sge, %tmp4, %tmp4_116 : tensor<128x64xi32> loc(#loc870) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc871) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc872) + %tmp7_118 = tt.load %tmp7 : !tt.ptr loc(#loc873) + %tmp8 = tt.splat %tmp7_118 : i64 -> tensor<1x64xi64> loc(#loc874) + %tmp8_119 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc874) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc875) + %tmp10 = tt.splat %tmp7_118 : i64 -> tensor<128x1xi64> loc(#loc876) + %tmp10_120 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc876) + %tmp11 = tt.broadcast %tmp8_119 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc877) + %tmp11_121 = tt.broadcast %tmp10_120 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc877) + %tmp11_122 = arith.andi %tmp11, %tmp11_121 : tensor<128x64xi1> loc(#loc877) + %tmp12 = arith.andi %tmp4_117, %tmp11_122 : tensor<128x64xi1> loc(#loc878) + %tmp15 = tt.splat %ks8 : i32 -> tensor<1x64xi32> loc(#loc879) + %tmp15_123 = arith.cmpi sge, %n, %tmp15 : tensor<1x64xi32> loc(#loc879) + %tmp16 = arith.remsi %n, %tmp15 : tensor<1x64xi32> loc(#loc880) + %tmp18 = arith.cmpi ne, %tmp16, %cst_8 : tensor<1x64xi32> loc(#loc881) + %tmp19 = arith.cmpi slt, %tmp16, %cst_8 : tensor<1x64xi32> loc(#loc882) + %tmp20 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc883) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1> loc(#loc884) + %tmp21_124 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1> loc(#loc884) + %tmp22 = arith.andi %tmp18, %tmp21_124 : tensor<1x64xi1> loc(#loc885) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32> loc(#loc886) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc887) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc888) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc889) + %tmp27 = arith.andi %tmp15_123, %tmp26 : tensor<1x64xi1> loc(#loc890) + %tmp28 = arith.subi %tmp4_116, %tmp4 : tensor<128x64xi32> loc(#loc891) + %tmp29 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc892) + %tmp29_125 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32> loc(#loc892) + %tmp30 = arith.cmpi ne, %tmp29_125, %cst_7 : tensor<128x64xi32> loc(#loc893) + %tmp31 = arith.cmpi slt, %tmp29_125, %cst_7 : tensor<128x64xi32> loc(#loc894) + %tmp32 = tt.splat %tmp20 : i1 -> tensor<128x64xi1> loc(#loc895) + %tmp32_126 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1> loc(#loc895) + %tmp33 = arith.andi %tmp30, %tmp32_126 : tensor<128x64xi1> loc(#loc896) + %tmp34 = arith.addi %tmp29_125, %tmp29 : tensor<128x64xi32> loc(#loc897) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_125 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc898) + %tmp36 = arith.cmpi eq, %tmp35, %cst_7 : tensor<128x64xi32> loc(#loc899) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc900) + %tmp37_127 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc900) + %tmp38 = arith.ori %tmp12, %tmp37_127 : tensor<128x64xi1> loc(#loc901) + %post_mod_scores_128 = arith.select %tmp38, %post_mod_scores, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc902) + %post_mod_scores_129 = arith.mulf %post_mod_scores_128, %cst_6 : tensor<128x64xf32> loc(#loc903) + %p = tt.broadcast %lse_66 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc904) + %p_130 = arith.subf %post_mod_scores_129, %p : tensor<128x64xf32> loc(#loc904) + %p_131 = math.exp2 %p_130 : tensor<128x64xf32> loc(#loc905) + %vT = tt.load %vT_ptrs_110, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1023) + %dp = tt.dot %do, %vT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc907) + %ds = tt.expand_dims %Di_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc908) + %ds_132 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc909) + %ds_133 = arith.subf %dp, %ds_132 : tensor<128x64xf32> loc(#loc909) + %ds_134 = arith.mulf %p_131, %ds_133 : tensor<128x64xf32> loc(#loc910) + %grad_scores = arith.select %kT_113, %ds_134, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc911) + %ds_135 = arith.select %tmp38, %grad_scores, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc912) + %ds_136 = arith.truncf %ds_135 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc913) + %dq_137 = tt.trans %kT_114 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc914) + %dq_138 = tt.dot %ds_136, %dq_137, %dq_107, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc915) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc916) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc917) + %cur_block_139 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc918) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc919) + %next_block_140 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_68 : i32 loc(#loc920) + %next_block_141 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc921) + %next_block_142 = tt.load %next_block_141, %next_block_140 evictionPolicy = evict_last : !tt.ptr loc(#loc922) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc923) + %needs_jump_143 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc924) + %needs_jump_144 = arith.cmpi eq, %needs_jump_143, %c0_i32 : i32 loc(#loc925) + %jump_to_block = arith.subi %next_block_142, %cur_block_139 : i32 loc(#loc926) + %jump_to_block_145 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc927) + %jump_to_block_146 = arith.subi %jump_to_block_145, %c64_i32 : i32 loc(#loc928) + %offset = arith.extui %needs_jump_144 : i1 to i32 loc(#loc929) + %offset_147 = arith.muli %jump_to_block_146, %offset : i32 loc(#loc929) + %offset_148 = arith.subi %c1_i32, %offset : i32 loc(#loc930) + %offset_149 = arith.muli %offset_148, %c64_i32 : i32 loc(#loc931) + %offset_150 = arith.addi %offset_147, %offset_149 : i32 loc(#loc932) + %kT_ptrs_151 = arith.muli %offset_150, %c128_i32 : i32 loc(#loc704) + %kT_ptrs_152 = tt.splat %kT_ptrs_151 : i32 -> tensor<128x64xi32> loc(#loc705) + %kT_ptrs_153 = tt.addptr %kT_ptrs_109, %kT_ptrs_152 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc705) + %vT_ptrs_154 = tt.addptr %vT_ptrs_110, %kT_ptrs_152 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc706) + %offs_n2_155 = tt.splat %offset_150 : i32 -> tensor<64xi32> loc(#loc707) + %offs_n2_156 = arith.addi %offs_n2_108, %offs_n2_155 : tensor<64xi32> loc(#loc707) + scf.yield %dq_138, %offs_n2_156, %kT_ptrs_153, %vT_ptrs_154 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc708) + } loc(#loc1029) + %kv_indices_86 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_32 : !tt.ptr, i32 loc(#loc526) + %kv_start_87 = tt.load %kv_indices_86 : !tt.ptr loc(#loc527) + %kv_start_88 = arith.muli %kv_start_87, %c128_i32 : i32 loc(#loc528) + %sparse_kv_num_blocks_89 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_30 : !tt.ptr, i32 loc(#loc529) + %sparse_kv_num_blocks_90 = tt.load %sparse_kv_num_blocks_89 : !tt.ptr loc(#loc530) + %offs_n2_91 = tt.splat %kv_start_88 : i32 -> tensor<64xi32> loc(#loc531) + %offs_n2_92 = arith.addi %offs_n2_91, %offs_n2 : tensor<64xi32> loc(#loc531) + %kT_ptrs_93 = tt.expand_dims %offs_n2_92 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc709) + %kT_ptrs_94 = arith.muli %kT_ptrs_93, %cst_1 : tensor<1x64xi32> loc(#loc710) + %kT_ptrs_95 = tt.addptr %kT_ptrs_72, %kT_ptrs_94 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc711) + %kT_ptrs_96 = tt.broadcast %kT_ptrs_95 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc712) + %kT_ptrs_97 = tt.addptr %kT_ptrs_96, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc712) + %vT_ptrs_98 = tt.addptr %vT_ptrs, %kT_ptrs_94 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc713) + %vT_ptrs_99 = tt.broadcast %vT_ptrs_98 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc714) + %vT_ptrs_100 = tt.addptr %vT_ptrs_99, %kT_ptrs_76 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc714) + %hi_101 = arith.muli %sparse_kv_num_blocks_90, %c2_i32 : i32 loc(#loc715) + %hi_102 = arith.minsi %hi_101, %hi_83 : i32 loc(#loc716) + %vT_ptrs_103:4 = scf.for %start_n = %c0_i32 to %hi_102 step %c1_i32 iter_args(%dq_107 = %vT_ptrs_85#0, %offs_n2_108 = %offs_n2_92, %kT_ptrs_109 = %kT_ptrs_97, %vT_ptrs_110 = %vT_ptrs_100) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_108 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1024) + %kT_111 = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc1025) + %kT_112 = arith.cmpi slt, %kT, %kT_111 : tensor<1x64xi32> loc(#loc1025) + %kT_113 = tt.broadcast %kT_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1026) + %kT_114 = tt.load %kT_ptrs_109, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1026) + %qk = tt.dot %q_52, %kT_114, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc934) + %qk_115 = arith.mulf %qk, %cst_10 : tensor<128x64xf32> loc(#loc935) + %post_mod_scores = arith.select %kT_113, %qk_115, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc936) + %post_mod_scores_116 = arith.mulf %post_mod_scores, %cst_6 : tensor<128x64xf32> loc(#loc937) + %p = tt.broadcast %lse_66 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc938) + %p_117 = arith.subf %post_mod_scores_116, %p : tensor<128x64xf32> loc(#loc938) + %p_118 = math.exp2 %p_117 : tensor<128x64xf32> loc(#loc939) + %vT = tt.load %vT_ptrs_110, %kT_113, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc1027) + %dp = tt.dot %do, %vT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc941) + %ds = tt.expand_dims %Di_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc942) + %ds_119 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc943) + %ds_120 = arith.subf %dp, %ds_119 : tensor<128x64xf32> loc(#loc943) + %ds_121 = arith.mulf %p_118, %ds_120 : tensor<128x64xf32> loc(#loc944) + %grad_scores = arith.select %kT_113, %ds_121, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc945) + %ds_122 = arith.truncf %grad_scores : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc946) + %dq_123 = tt.trans %kT_114 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc947) + %dq_124 = tt.dot %ds_122, %dq_123, %dq_107, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc948) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc949) + %cur_block = tt.addptr %kv_indices_86, %cur_block_idx : !tt.ptr, i32 loc(#loc950) + %cur_block_125 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc951) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc952) + %next_block_126 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_90 : i32 loc(#loc953) + %next_block_127 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc954) + %next_block_128 = tt.load %next_block_127, %next_block_126 evictionPolicy = evict_last : !tt.ptr loc(#loc955) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc956) + %needs_jump_129 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc957) + %needs_jump_130 = arith.cmpi eq, %needs_jump_129, %c0_i32 : i32 loc(#loc958) + %jump_to_block = arith.subi %next_block_128, %cur_block_125 : i32 loc(#loc959) + %jump_to_block_131 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc960) + %jump_to_block_132 = arith.subi %jump_to_block_131, %c64_i32 : i32 loc(#loc961) + %offset = arith.extui %needs_jump_130 : i1 to i32 loc(#loc962) + %offset_133 = arith.muli %jump_to_block_132, %offset : i32 loc(#loc962) + %offset_134 = arith.subi %c1_i32, %offset : i32 loc(#loc963) + %offset_135 = arith.muli %offset_134, %c64_i32 : i32 loc(#loc964) + %offset_136 = arith.addi %offset_133, %offset_135 : i32 loc(#loc965) + %kT_ptrs_137 = arith.muli %offset_136, %c128_i32 : i32 loc(#loc719) + %kT_ptrs_138 = tt.splat %kT_ptrs_137 : i32 -> tensor<128x64xi32> loc(#loc720) + %kT_ptrs_139 = tt.addptr %kT_ptrs_109, %kT_ptrs_138 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc720) + %vT_ptrs_140 = tt.addptr %vT_ptrs_110, %kT_ptrs_138 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_141 = tt.splat %offset_136 : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_142 = arith.addi %offs_n2_108, %offs_n2_141 : tensor<64xi32> loc(#loc722) + scf.yield %dq_124, %offs_n2_142, %kT_ptrs_139, %vT_ptrs_140 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc723) + } loc(#loc1030) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc533) + %dq_ptrs_104 = tt.addptr %dq_ptrs, %ptr_43 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc533) + %dq_ptrs_105 = tt.broadcast %dq_ptrs_104 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc534) + %dq_ptrs_106 = tt.addptr %dq_ptrs_105, %ptr_48 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc534) + %dq = arith.mulf %vT_ptrs_103#0, %cst_17 : tensor<128x128xf32> loc(#loc535) + %12 = arith.cmpi slt, %ptr_46, %cst_16 : tensor<1x128xi32> loc(#loc191) + %13 = tt.broadcast %12 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc192) + %14 = arith.andi %q_51, %13 : tensor<128x128xi1> loc(#loc192) + %15 = arith.truncf %dq : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc193) + tt.store %dq_ptrs_106, %15, %14 : tensor<128x128x!tt.ptr> loc(#loc193) + } else { + %stride_q_idx_h = arith.muli %ks6, %ks7 : i32 loc(#loc536) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc537) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32> loc(#loc538) + %offs_n1_28 = arith.addi %offs_n1, %offs_k : tensor<128xi32> loc(#loc538) + %ptr = tt.expand_dims %offs_n1_28 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc724) + %ptr_29 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc725) + %ptr_30 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc726) + %ptr_31 = tt.addptr %ptr_30, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc726) + %ptr_32 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc727) + %ptr_33 = tt.broadcast %ptr_31 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc728) + %ptr_34 = tt.broadcast %ptr_32 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc728) + %ptr_35 = tt.addptr %ptr_33, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc728) + %k = tt.splat %ks1 : i32 -> tensor<128x1xi32> loc(#loc729) + %k_36 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32> loc(#loc729) + %k_37 = tt.broadcast %k_36 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc730) + %k_38 = tt.load %ptr_35, %k_37, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc730) + %ptr_39 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc731) + %ptr_40 = tt.addptr %ptr_39, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc731) + %ptr_41 = tt.broadcast %ptr_40 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc732) + %ptr_42 = tt.addptr %ptr_41, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc732) + %v = tt.load %ptr_42, %k_37, %cst_13 : tensor<128x128x!tt.ptr> loc(#loc733) + %dk:2 = scf.for %off_g = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%dv = %cst_14, %dk_56 = %cst_14) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc542) + %off_hq1_57 = arith.addi %off_hq1, %off_g : i32 loc(#loc543) + %q_adj1 = arith.muli %off_hq1_57, %c128_i32 : i32 loc(#loc544) + %q_adj1_58 = arith.muli %0, %off_zq : i32 loc(#loc545) + %q_adj1_59 = arith.addi %q_adj1, %q_adj1_58 : i32 loc(#loc546) + %q_adj1_60 = arith.extsi %q_adj1_59 : i32 to i64 loc(#loc547) + %do_adj1 = arith.muli %10, %off_hq1_57 : i32 loc(#loc548) + %do_adj1_61 = arith.muli %9, %off_zq : i32 loc(#loc549) + %do_adj1_62 = arith.addi %do_adj1, %do_adj1_61 : i32 loc(#loc550) + %do_adj1_63 = arith.extsi %do_adj1_62 : i32 to i64 loc(#loc551) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc552) + %off_chz1_64 = arith.addi %off_chz1, %off_hq1_57 : i32 loc(#loc553) + %off_chz1_65 = arith.muli %off_chz1_64, %ks0 : i32 loc(#loc554) + %off_chz1_66 = arith.extsi %off_chz1_65 : i32 to i64 loc(#loc555) + %Q1 = tt.addptr %arg_Q, %q_adj1_60 : !tt.ptr, i64 loc(#loc556) + %DO1 = tt.addptr %arg_DO, %do_adj1_63 : !tt.ptr, i64 loc(#loc557) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_66 : !tt.ptr, i64 loc(#loc558) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_66 : !tt.ptr, i64 loc(#loc559) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks5 : i32 loc(#loc560) + %sparse_q_num_blks_offset_67 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc561) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc562) + %sparse_q_idx_offset_68 = arith.muli %pid, %ks6 : i32 loc(#loc563) + %sparse_q_idx_offset_69 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_68 : i32 loc(#loc564) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_69 : !tt.ptr, i32 loc(#loc565) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc566) + %q_start_70 = arith.muli %q_start, %c128_i32 : i32 loc(#loc567) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_67 : !tt.ptr, i32 loc(#loc568) + %sparse_q_num_blocks_71 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc569) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc570) + %offs_m1_72 = tt.splat %q_start_70 : i32 -> tensor<64xi32> loc(#loc571) + %offs_m1_73 = arith.addi %offs_m1_72, %offs_m1 : tensor<64xi32> loc(#loc571) + %qT_ptrs = tt.expand_dims %offs_m1_73 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc735) + %qT_ptrs_74 = arith.muli %qT_ptrs, %cst_0 : tensor<1x64xi32> loc(#loc736) + %qT_ptrs_75 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc737) + %qT_ptrs_76 = tt.addptr %qT_ptrs_75, %qT_ptrs_74 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc737) + %qT_ptrs_77 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc738) + %qT_ptrs_78 = tt.broadcast %qT_ptrs_76 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc739) + %qT_ptrs_79 = tt.broadcast %qT_ptrs_77 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc739) + %qT_ptrs_80 = tt.addptr %qT_ptrs_78, %qT_ptrs_79 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc739) + %do_ptrs = tt.expand_dims %offs_m1_73 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc740) + %do_ptrs_81 = arith.muli %do_ptrs, %cst : tensor<64x1xi32> loc(#loc741) + %do_ptrs_82 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc742) + %do_ptrs_83 = tt.addptr %do_ptrs_82, %do_ptrs_81 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc742) + %do_ptrs_84 = tt.broadcast %do_ptrs_83 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc743) + %do_ptrs_85 = tt.broadcast %ptr_32 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc743) + %do_ptrs_86 = tt.addptr %do_ptrs_84, %do_ptrs_85 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc743) + %hi = arith.muli %sparse_q_num_blocks_71, %c2_i32 : i32 loc(#loc744) + %hi_87 = arith.addi %ks0, %c63_i32 : i32 loc(#loc966) + %hi_88 = arith.divsi %hi_87, %c64_i32 : i32 loc(#loc967) + %hi_89 = arith.maxsi %hi_88, %c1_i32 : i32 loc(#loc746) + %hi_90 = arith.minsi %hi, %hi_89 : i32 loc(#loc747) + %do_ptrs_91:5 = scf.for %start_m = %c0_i32 to %hi_90 step %c1_i32 iter_args(%dk_112 = %dk_56, %dv_113 = %dv, %offs_m1_114 = %offs_m1_73, %qT_ptrs_115 = %qT_ptrs_80, %do_ptrs_116 = %do_ptrs_86) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_114 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc969) + %qT_117 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc970) + %qT_118 = arith.cmpi slt, %qT, %qT_117 : tensor<1x64xi32> loc(#loc970) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc971) + %qT_120 = tt.load %qT_ptrs_115, %qT_119, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc971) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32> loc(#loc750) + %lse_121 = arith.cmpi slt, %offs_m1_114, %lse : tensor<64xi32> loc(#loc750) + %lse_122 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc751) + %lse_123 = tt.addptr %lse_122, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc751) + %lse_124 = tt.load %lse_123, %lse_121 : tensor<64x!tt.ptr> loc(#loc752) + %lse_125 = arith.cmpf oeq, %lse_124, %cst_5 : tensor<64xf32> loc(#loc753) + %lse_126 = arith.select %lse_125, %cst_4, %lse_124 : tensor<64xi1>, tensor<64xf32> loc(#loc754) + %qkT = tt.dot %k_38, %qT_120, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc755) + %qkT_127 = arith.mulf %qkT, %cst_10 : tensor<128x64xf32> loc(#loc756) + %m = arith.remsi %qT, %qT_117 : tensor<1x64xi32> loc(#loc972) + %n = arith.remsi %ptr, %k : tensor<128x1xi32> loc(#loc973) + %post_mod_scores = arith.select %qT_119, %qkT_127, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc759) + %tmp44 = tt.broadcast %m : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc760) + %tmp44_128 = tt.broadcast %n : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc760) + %tmp44_129 = arith.cmpi sge, %tmp44, %tmp44_128 : tensor<128x64xi32> loc(#loc760) + %tmp45 = arith.extsi %n : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc761) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc762) + %tmp47_130 = tt.load %tmp47 : !tt.ptr loc(#loc763) + %tmp48 = tt.splat %tmp47_130 : i64 -> tensor<128x1xi64> loc(#loc764) + %tmp48_131 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc764) + %tmp49 = arith.extsi %m : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc765) + %tmp50 = tt.splat %tmp47_130 : i64 -> tensor<1x64xi64> loc(#loc766) + %tmp50_132 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc766) + %tmp51 = tt.broadcast %tmp48_131 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc767) + %tmp51_133 = tt.broadcast %tmp50_132 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc767) + %tmp51_134 = arith.andi %tmp51, %tmp51_133 : tensor<128x64xi1> loc(#loc767) + %tmp52 = arith.andi %tmp44_129, %tmp51_134 : tensor<128x64xi1> loc(#loc768) + %tmp55 = tt.splat %ks8 : i32 -> tensor<128x1xi32> loc(#loc769) + %tmp55_135 = arith.cmpi sge, %n, %tmp55 : tensor<128x1xi32> loc(#loc769) + %tmp56 = arith.remsi %n, %tmp55 : tensor<128x1xi32> loc(#loc770) + %tmp58 = arith.cmpi ne, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc771) + %tmp59 = arith.cmpi slt, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc772) + %tmp60 = arith.cmpi slt, %ks8, %c0_i32 : i32 loc(#loc773) + %tmp61 = tt.splat %tmp60 : i1 -> tensor<128x1xi1> loc(#loc774) + %tmp61_136 = arith.cmpi ne, %tmp59, %tmp61 : tensor<128x1xi1> loc(#loc774) + %tmp62 = arith.andi %tmp58, %tmp61_136 : tensor<128x1xi1> loc(#loc775) + %tmp63 = arith.addi %tmp56, %tmp55 : tensor<128x1xi32> loc(#loc776) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc777) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc778) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64> loc(#loc779) + %tmp67 = arith.andi %tmp55_135, %tmp66 : tensor<128x1xi1> loc(#loc780) + %tmp68 = arith.subi %tmp44_128, %tmp44 : tensor<128x64xi32> loc(#loc781) + %tmp69 = tt.splat %ks8 : i32 -> tensor<128x64xi32> loc(#loc782) + %tmp69_137 = arith.remsi %tmp68, %tmp69 : tensor<128x64xi32> loc(#loc782) + %tmp70 = arith.cmpi ne, %tmp69_137, %cst_7 : tensor<128x64xi32> loc(#loc783) + %tmp71 = arith.cmpi slt, %tmp69_137, %cst_7 : tensor<128x64xi32> loc(#loc784) + %tmp72 = tt.splat %tmp60 : i1 -> tensor<128x64xi1> loc(#loc785) + %tmp72_138 = arith.cmpi ne, %tmp71, %tmp72 : tensor<128x64xi1> loc(#loc785) + %tmp73 = arith.andi %tmp70, %tmp72_138 : tensor<128x64xi1> loc(#loc786) + %tmp74 = arith.addi %tmp69_137, %tmp69 : tensor<128x64xi32> loc(#loc787) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69_137 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc788) + %tmp76 = arith.cmpi eq, %tmp75, %cst_7 : tensor<128x64xi32> loc(#loc789) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc790) + %tmp77_139 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1> loc(#loc790) + %tmp78 = arith.ori %tmp52, %tmp77_139 : tensor<128x64xi1> loc(#loc791) + %post_mod_scores_140 = arith.select %tmp78, %post_mod_scores, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc792) + %post_mod_scores_141 = arith.mulf %post_mod_scores_140, %cst_6 : tensor<128x64xf32> loc(#loc793) + %pT = tt.expand_dims %lse_126 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc794) + %pT_142 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc795) + %pT_143 = arith.subf %post_mod_scores_141, %pT_142 : tensor<128x64xf32> loc(#loc795) + %pT_144 = math.exp2 %pT_143 : tensor<128x64xf32> loc(#loc796) + %do = tt.expand_dims %offs_m1_114 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc974) + %do_145 = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc975) + %do_146 = arith.cmpi slt, %do, %do_145 : tensor<64x1xi32> loc(#loc975) + %do_147 = tt.broadcast %do_146 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc976) + %do_148 = tt.load %do_ptrs_116, %do_147, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc976) + %dv_149 = arith.truncf %pT_144 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc798) + %dv_150 = tt.dot %dv_149, %do_148, %dv_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc799) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc800) + %Di_151 = tt.addptr %Di, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc800) + %Di_152 = tt.load %Di_151, %lse_121 : tensor<64x!tt.ptr> loc(#loc801) + %dpT = tt.trans %do_148 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc802) + %dpT_153 = tt.dot %v, %dpT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc803) + %dsT = tt.expand_dims %Di_152 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc804) + %dsT_154 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc805) + %dsT_155 = arith.subf %dpT_153, %dsT_154 : tensor<128x64xf32> loc(#loc805) + %dsT_156 = arith.mulf %pT_144, %dsT_155 : tensor<128x64xf32> loc(#loc806) + %grad_scores = arith.select %qT_119, %dsT_156, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc807) + %dsT_157 = arith.select %tmp78, %grad_scores, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc808) + %dk_158 = arith.truncf %dsT_157 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc809) + %dk_159 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc810) + %dk_160 = tt.dot %dk_158, %dk_159, %dk_112, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc811) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc977) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc978) + %cur_block_161 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc979) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc980) + %next_block_162 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_71 : i32 loc(#loc981) + %next_block_163 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc982) + %next_block_164 = tt.load %next_block_163, %next_block_162 evictionPolicy = evict_last : !tt.ptr loc(#loc983) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc984) + %needs_jump_165 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc985) + %needs_jump_166 = arith.cmpi eq, %needs_jump_165, %c0_i32 : i32 loc(#loc986) + %jump_to_block = arith.subi %next_block_164, %cur_block_161 : i32 loc(#loc987) + %jump_to_block_167 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc988) + %jump_to_block_168 = arith.subi %jump_to_block_167, %c64_i32 : i32 loc(#loc989) + %offset = arith.extui %needs_jump_166 : i1 to i32 loc(#loc990) + %offset_169 = arith.muli %jump_to_block_168, %offset : i32 loc(#loc990) + %offset_170 = arith.subi %c1_i32, %offset : i32 loc(#loc991) + %offset_171 = arith.muli %offset_170, %c64_i32 : i32 loc(#loc992) + %offset_172 = arith.addi %offset_169, %offset_171 : i32 loc(#loc993) + %qT_ptrs_173 = arith.muli %offset_172, %c4096_i32 : i32 loc(#loc813) + %qT_ptrs_174 = tt.splat %qT_ptrs_173 : i32 -> tensor<128x64xi32> loc(#loc814) + %qT_ptrs_175 = tt.addptr %qT_ptrs_115, %qT_ptrs_174 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc814) + %do_ptrs_176 = arith.muli %offset_172, %c128_i32 : i32 loc(#loc815) + %do_ptrs_177 = tt.splat %do_ptrs_176 : i32 -> tensor<64x128xi32> loc(#loc816) + %do_ptrs_178 = tt.addptr %do_ptrs_116, %do_ptrs_177 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc816) + %offs_m1_179 = tt.splat %offset_172 : i32 -> tensor<64xi32> loc(#loc817) + %offs_m1_180 = arith.addi %offs_m1_114, %offs_m1_179 : tensor<64xi32> loc(#loc817) + scf.yield %dk_160, %dv_150, %offs_m1_180, %qT_ptrs_175, %do_ptrs_178 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc656) + } loc(#loc1032) + %q_indices_92 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_69 : !tt.ptr, i32 loc(#loc657) + %q_start_93 = tt.load %q_indices_92 : !tt.ptr loc(#loc658) + %q_start_94 = arith.muli %q_start_93, %c128_i32 : i32 loc(#loc659) + %sparse_q_num_blocks_95 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_67 : !tt.ptr, i32 loc(#loc660) + %sparse_q_num_blocks_96 = tt.load %sparse_q_num_blocks_95 : !tt.ptr loc(#loc661) + %offs_m1_97 = tt.splat %q_start_94 : i32 -> tensor<64xi32> loc(#loc662) + %offs_m1_98 = arith.addi %offs_m1_97, %offs_m1 : tensor<64xi32> loc(#loc662) + %qT_ptrs_99 = tt.expand_dims %offs_m1_98 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc818) + %qT_ptrs_100 = arith.muli %qT_ptrs_99, %cst_0 : tensor<1x64xi32> loc(#loc819) + %qT_ptrs_101 = tt.addptr %qT_ptrs_75, %qT_ptrs_100 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc820) + %qT_ptrs_102 = tt.broadcast %qT_ptrs_101 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc821) + %qT_ptrs_103 = tt.addptr %qT_ptrs_102, %qT_ptrs_79 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc821) + %do_ptrs_104 = tt.expand_dims %offs_m1_98 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc822) + %do_ptrs_105 = arith.muli %do_ptrs_104, %cst : tensor<64x1xi32> loc(#loc823) + %do_ptrs_106 = tt.addptr %do_ptrs_82, %do_ptrs_105 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc824) + %do_ptrs_107 = tt.broadcast %do_ptrs_106 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc825) + %do_ptrs_108 = tt.addptr %do_ptrs_107, %do_ptrs_85 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc825) + %hi_109 = arith.muli %sparse_q_num_blocks_96, %c2_i32 : i32 loc(#loc826) + %hi_110 = arith.minsi %hi_109, %hi_89 : i32 loc(#loc827) + %do_ptrs_111:5 = scf.for %start_m = %c0_i32 to %hi_110 step %c1_i32 iter_args(%dk_112 = %do_ptrs_91#0, %dv_113 = %do_ptrs_91#1, %offs_m1_114 = %offs_m1_98, %qT_ptrs_115 = %qT_ptrs_103, %do_ptrs_116 = %do_ptrs_108) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_114 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc994) + %qT_117 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc995) + %qT_118 = arith.cmpi slt, %qT, %qT_117 : tensor<1x64xi32> loc(#loc995) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc996) + %qT_120 = tt.load %qT_ptrs_115, %qT_119, %cst_12 : tensor<128x64x!tt.ptr> loc(#loc996) + %lse = tt.splat %ks0 : i32 -> tensor<64xi32> loc(#loc829) + %lse_121 = arith.cmpi slt, %offs_m1_114, %lse : tensor<64xi32> loc(#loc829) + %lse_122 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc830) + %lse_123 = tt.addptr %lse_122, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc830) + %lse_124 = tt.load %lse_123, %lse_121 : tensor<64x!tt.ptr> loc(#loc831) + %lse_125 = arith.cmpf oeq, %lse_124, %cst_5 : tensor<64xf32> loc(#loc832) + %lse_126 = arith.select %lse_125, %cst_4, %lse_124 : tensor<64xi1>, tensor<64xf32> loc(#loc833) + %qkT = tt.dot %k_38, %qT_120, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc834) + %qkT_127 = arith.mulf %qkT, %cst_10 : tensor<128x64xf32> loc(#loc835) + %post_mod_scores = arith.select %qT_119, %qkT_127, %cst_9 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc836) + %post_mod_scores_128 = arith.mulf %post_mod_scores, %cst_6 : tensor<128x64xf32> loc(#loc837) + %pT = tt.expand_dims %lse_126 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc838) + %pT_129 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc839) + %pT_130 = arith.subf %post_mod_scores_128, %pT_129 : tensor<128x64xf32> loc(#loc839) + %pT_131 = math.exp2 %pT_130 : tensor<128x64xf32> loc(#loc840) + %do = tt.expand_dims %offs_m1_114 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc997) + %do_132 = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc998) + %do_133 = arith.cmpi slt, %do, %do_132 : tensor<64x1xi32> loc(#loc998) + %do_134 = tt.broadcast %do_133 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc999) + %do_135 = tt.load %do_ptrs_116, %do_134, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc999) + %dv_136 = arith.truncf %pT_131 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc842) + %dv_137 = tt.dot %dv_136, %do_135, %dv_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc843) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc844) + %Di_138 = tt.addptr %Di, %offs_m1_114 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc844) + %Di_139 = tt.load %Di_138, %lse_121 : tensor<64x!tt.ptr> loc(#loc845) + %dpT = tt.trans %do_135 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc846) + %dpT_140 = tt.dot %v, %dpT, %cst_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc847) + %dsT = tt.expand_dims %Di_139 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc848) + %dsT_141 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc849) + %dsT_142 = arith.subf %dpT_140, %dsT_141 : tensor<128x64xf32> loc(#loc849) + %dsT_143 = arith.mulf %pT_131, %dsT_142 : tensor<128x64xf32> loc(#loc850) + %grad_scores = arith.select %qT_119, %dsT_143, %cst_11 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc851) + %dk_144 = arith.truncf %grad_scores : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc852) + %dk_145 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc853) + %dk_146 = tt.dot %dk_144, %dk_145, %dk_112, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc854) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc1000) + %cur_block = tt.addptr %q_indices_92, %cur_block_idx : !tt.ptr, i32 loc(#loc1001) + %cur_block_147 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc1002) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc1003) + %next_block_148 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_96 : i32 loc(#loc1004) + %next_block_149 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc1005) + %next_block_150 = tt.load %next_block_149, %next_block_148 evictionPolicy = evict_last : !tt.ptr loc(#loc1006) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc1007) + %needs_jump_151 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc1008) + %needs_jump_152 = arith.cmpi eq, %needs_jump_151, %c0_i32 : i32 loc(#loc1009) + %jump_to_block = arith.subi %next_block_150, %cur_block_147 : i32 loc(#loc1010) + %jump_to_block_153 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc1011) + %jump_to_block_154 = arith.subi %jump_to_block_153, %c64_i32 : i32 loc(#loc1012) + %offset = arith.extui %needs_jump_152 : i1 to i32 loc(#loc1013) + %offset_155 = arith.muli %jump_to_block_154, %offset : i32 loc(#loc1013) + %offset_156 = arith.subi %c1_i32, %offset : i32 loc(#loc1014) + %offset_157 = arith.muli %offset_156, %c64_i32 : i32 loc(#loc1015) + %offset_158 = arith.addi %offset_155, %offset_157 : i32 loc(#loc1016) + %qT_ptrs_159 = arith.muli %offset_158, %c4096_i32 : i32 loc(#loc856) + %qT_ptrs_160 = tt.splat %qT_ptrs_159 : i32 -> tensor<128x64xi32> loc(#loc857) + %qT_ptrs_161 = tt.addptr %qT_ptrs_115, %qT_ptrs_160 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc857) + %do_ptrs_162 = arith.muli %offset_158, %c128_i32 : i32 loc(#loc858) + %do_ptrs_163 = tt.splat %do_ptrs_162 : i32 -> tensor<64x128xi32> loc(#loc859) + %do_ptrs_164 = tt.addptr %do_ptrs_116, %do_ptrs_163 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc859) + %offs_m1_165 = tt.splat %offset_158 : i32 -> tensor<64xi32> loc(#loc860) + %offs_m1_166 = arith.addi %offs_m1_114, %offs_m1_165 : tensor<64xi32> loc(#loc860) + scf.yield %dk_146, %dv_137, %offs_m1_166, %qT_ptrs_161, %do_ptrs_164 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc664) + } loc(#loc1033) + scf.yield %do_ptrs_111#1, %do_ptrs_111#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc324) + } loc(#loc734) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc665) + %dv_ptrs_43 = tt.addptr %dv_ptrs, %ptr_29 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc665) + %dv_ptrs_44 = tt.broadcast %dv_ptrs_43 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc666) + %dv_ptrs_45 = tt.addptr %dv_ptrs_44, %ptr_34 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc666) + %12 = arith.cmpi slt, %ptr_32, %cst_16 : tensor<1x128xi32> loc(#loc327) + %13 = tt.broadcast %12 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc328) + %14 = arith.andi %k_37, %13 : tensor<128x128xi1> loc(#loc328) + %15 = arith.truncf %dk#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc329) + tt.store %dv_ptrs_45, %15, %14 : tensor<128x128x!tt.ptr> loc(#loc329) + %dk_46 = arith.mulf %dk#1, %cst_17 : tensor<128x128xf32> loc(#loc667) + %xindex = tt.broadcast %ptr_29 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc668) + %xindex_47 = arith.addi %ptr_34, %xindex : tensor<128x128xi32> loc(#loc668) + %xindex_48 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc669) + %xindex_49 = arith.muli %xindex_48, %ks1 : i32 loc(#loc670) + %xindex_50 = tt.splat %xindex_49 : i32 -> tensor<128x128xi32> loc(#loc671) + %xindex_51 = arith.addi %xindex_47, %xindex_50 : tensor<128x128xi32> loc(#loc671) + %xindex_52 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc672) + %xindex_53 = arith.muli %xindex_52, %ks1 : i32 loc(#loc673) + %xindex_54 = tt.splat %xindex_53 : i32 -> tensor<128x128xi32> loc(#loc674) + %xindex_55 = arith.addi %xindex_51, %xindex_54 : tensor<128x128xi32> loc(#loc674) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc338) + %17 = tt.addptr %16, %xindex_55 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc338) + %18 = arith.truncf %dk_46 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc339) + tt.store %17, %18, %k_37 : tensor<128x128x!tt.ptr> loc(#loc339) + } loc(#loc33) + tt.return loc(#loc340) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":103:9) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":94:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:54) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":95:63) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:74) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:66) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:100) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:91) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:82) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":97:111) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":111:24) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":112:36) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":113:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":115:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":116:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":117:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":124:59) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:50) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:37) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":128:61) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":131:9) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":132:9) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":133:10) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":136:26) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:14) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:7) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":140:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:29) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":144:44) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":145:35) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":148:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:55) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":154:78) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:50) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:83) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":155:68) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:30) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:52) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:40) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":158:63) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:55) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:42) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":159:66) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:30) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:46) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":161:56) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":163:17) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":164:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":167:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":168:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":169:25) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":174:36) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":175:29) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:27) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":178:107) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:56) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":825:49) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:52) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":179:111) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:58) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:34) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":188:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:33) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":189:26) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:30) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":190:50) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":191:18) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":195:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:27) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":196:41) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:53) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":197:39) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:42) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":199:29) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:26) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":207:12) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:37) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:18) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:56) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":390:49) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:18) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":391:49) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:43) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:90) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:101) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":395:63) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":397:28) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:41) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":458:105) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":405:12) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:52) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":831:23) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":459:19) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":461:14) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":798:21) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":464:46) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":467:46) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":476:79) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":482:23) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":483:23) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:34) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":485:23) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":486:22) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":487:23) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":488:23) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":489:23) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":490:23) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":493:24) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":494:24) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":496:25) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":497:92) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":498:92) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":499:25) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":500:24) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":501:24) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":502:39) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":503:25) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":504:24) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":505:24) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":506:23) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":507:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":508:25) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":509:92) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":510:25) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":511:24) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":512:24) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":513:39) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":514:25) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":515:24) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":516:24) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":521:69) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":524:27) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:39) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":525:21) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":528:104) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":530:20) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:22) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:19) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":531:14) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":538:71) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":549:43) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":551:15) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:30) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":553:21) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":788:33) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":411:64) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:38) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":789:24) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:109) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:113) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:55) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":790:25) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:30) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:35) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":791:60) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:34) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:48) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":792:63) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:29) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:47) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:61) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":793:42) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:28) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":414:19) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":415:19) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":417:19) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":417:8) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":214:39) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:31) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":215:45) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:62) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":216:43) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":218:33) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":226:16) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:24) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":231:56) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":232:14) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:87) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:69) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":236:30) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":245:29) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":252:25) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":253:29) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":256:107) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":257:107) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":262:30) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:32) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":263:51) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:34) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:56) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:44) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":266:67) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:36) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:59) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:46) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":267:70) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:34) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:39) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:50) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":269:60) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":271:21) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":272:23) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":275:25) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":276:29) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:58) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":281:80) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:53) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:81) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":282:70) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":286:32) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:30) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":287:43) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:55) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":288:42) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:45) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":290:32) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:26) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":298:16) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:37) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:18) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:56) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":601:49) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:27) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:38) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:19) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":602:51) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:42) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:87) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:98) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":608:61) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":610:28) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":669:105) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":618:12) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:52) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:28) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":674:22) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:26) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":675:46) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":676:20) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":678:15) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":680:46) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":683:46) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":692:78) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":698:25) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":699:25) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:35) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":701:24) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":702:24) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":703:25) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":704:24) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":705:24) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":706:24) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":709:25) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":710:25) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":712:25) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":713:92) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":714:92) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":715:25) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":716:24) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":717:24) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":718:39) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":719:25) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":720:24) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":721:24) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":722:24) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":723:25) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":724:25) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":725:92) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":726:25) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":727:24) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":728:24) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":729:39) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":730:25) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":731:24) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":732:24) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":736:69) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":739:27) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:44) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:40) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":740:22) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":833:41) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":741:99) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:24) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":744:43) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:29) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":748:21) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:29) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":750:20) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:25) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:22) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":751:16) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":759:70) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":773:45) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:24) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:52) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":775:43) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":623:62) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:28) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":626:19) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:28) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":627:19) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":628:19) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":628:8) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":306:41) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:34) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":307:47) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:64) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":308:46) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":310:36) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":318:20) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":303:12) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:23) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":323:55) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:71) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:61) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":332:30) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":334:14) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:27) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:45) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:53) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:41) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:64) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:71) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":344:59) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:29) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":345:69) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t3/ct3fq5xthzdec3x2zy2wlma26maa6ruf2v2tzswhoos342jsn7ev.py":139:4) +#loc368 = loc("HQ"(#loc2)) +#loc369 = loc("pid"(#loc13)) +#loc370 = loc("NUM_KV_BLOCKS"(#loc15)) +#loc371 = loc("NUM_Q_BLOCKS"(#loc17)) +#loc372 = loc("off_zq"(#loc18)) +#loc373 = loc("off_hkv"(#loc19)) +#loc374 = loc("off_zkv"(#loc20)) +#loc375 = loc("k_adj"(#loc21)) +#loc376 = loc("k_adj"(#loc22)) +#loc377 = loc("k_adj"(#loc23)) +#loc378 = loc("k_adj"(#loc24)) +#loc379 = loc("dv_adj"(#loc25)) +#loc380 = loc("dv_adj"(#loc26)) +#loc381 = loc("dv_adj"(#loc27)) +#loc382 = loc("K"(#loc28)) +#loc383 = loc("V"(#loc29)) +#loc384 = loc("DV"(#loc30)) +#loc385 = loc("offs_k"(#loc31)) +#loc386 = loc("off_pid"(#loc34)) +#loc387 = loc("off_hq2"(#loc35)) +#loc388 = loc("off_hq2"(#loc36)) +#loc389 = loc("off_hq2"(#loc37)) +#loc390 = loc("start_m2_block"(#loc38)) +#loc391 = loc("stride_kv_idx_h"(#loc39)) +#loc392 = loc("sparse_kv_num_blks_offset"(#loc40)) +#loc393 = loc("sparse_kv_num_blks_offset"(#loc41)) +#loc394 = loc("sparse_kv_idx_offset"(#loc42)) +#loc395 = loc("sparse_kv_idx_offset"(#loc43)) +#loc396 = loc("sparse_kv_idx_offset"(#loc44)) +#loc397 = loc("q_adj2"(#loc45)) +#loc398 = loc("q_adj2"(#loc46)) +#loc399 = loc("q_adj2"(#loc47)) +#loc400 = loc("q_adj2"(#loc48)) +#loc401 = loc("do_adj2"(#loc49)) +#loc402 = loc("do_adj2"(#loc50)) +#loc403 = loc("do_adj2"(#loc51)) +#loc404 = loc("do_adj2"(#loc52)) +#loc405 = loc("off_chz2"(#loc53)) +#loc406 = loc("off_chz2"(#loc54)) +#loc407 = loc("off_chz2"(#loc55)) +#loc408 = loc("off_chz2"(#loc56)) +#loc409 = loc("Q2"(#loc57)) +#loc410 = loc("DO2"(#loc58)) +#loc411 = loc("DQ2"(#loc59)) +#loc412 = loc("LSE2"(#loc60)) +#loc413 = loc("DELTA2"(#loc61)) +#loc414 = loc("start_m2"(#loc62)) +#loc415 = loc("offs_m2"(#loc63)) +#loc416 = loc("ptr"(#loc64)) +#loc417 = loc("q"(#loc65)) +#loc418 = loc("ptr"(#loc66)) +#loc419 = loc("ptr"(#loc67)) +#loc420 = loc("ptr"(#loc68)) +#loc421 = loc("ptr"(#loc69)) +#loc422 = loc("do"(#loc72)) +#loc423 = loc("Di"(#loc73)) +#loc424 = loc("Di"(#loc74)) +#loc425 = loc("Di"(#loc75)) +#loc426 = loc("lse"(#loc76)) +#loc427 = loc("lse"(#loc77)) +#loc428 = loc("lse"(#loc78)) +#loc429 = loc("lse"(#loc79)) +#loc430 = loc("lse"(#loc80)) +#loc431 = loc("kv_indices"(#loc81)) +#loc432 = loc("kv_start"(#loc82)) +#loc433 = loc("kv_start"(#loc83)) +#loc434 = loc("sparse_kv_num_blocks"(#loc84)) +#loc435 = loc("sparse_kv_num_blocks"(#loc85)) +#loc436 = loc("offs_n2"(#loc86)) +#loc437 = loc("offs_n2"(#loc87)) +#loc438 = loc("kT_ptrs"(#loc88)) +#loc439 = loc("dq"(#loc89)) +#loc440 = loc("kT_ptrs"(#loc90)) +#loc441 = loc("kT_ptrs"(#loc91)) +#loc442 = loc("kT_ptrs"(#loc92)) +#loc443 = loc("kT_ptrs"(#loc93)) +#loc444 = loc("vT_ptrs"(#loc94)) +#loc445 = loc("vT_ptrs"(#loc95)) +#loc446 = loc("hi"(#loc96)) +#loc447 = loc("hi"(#loc97)) +#loc448 = loc("hi"(#loc98)) +#loc449 = loc("hi"(#loc99)) +#loc450 = loc("dq"(#loc100)) +#loc451 = loc("kT"(#loc102)) +#loc452 = loc("dq"(#loc103)) +#loc453 = loc("qk"(#loc106)) +#loc454 = loc("qk"(#loc107)) +#loc455 = loc("n"(#loc109)) +#loc456 = loc("m"(#loc110)) +#loc457 = loc("post_mod_scores"(#loc111)) +#loc458 = loc("tmp4"(#loc112)) +#loc459 = loc("tmp5"(#loc113)) +#loc460 = loc("tmp7"(#loc114)) +#loc461 = loc("tmp7"(#loc115)) +#loc462 = loc("tmp8"(#loc116)) +#loc463 = loc("tmp9"(#loc117)) +#loc464 = loc("tmp10"(#loc118)) +#loc465 = loc("tmp11"(#loc119)) +#loc466 = loc("tmp12"(#loc120)) +#loc467 = loc("tmp15"(#loc121)) +#loc468 = loc("tmp16"(#loc122)) +#loc469 = loc("tmp18"(#loc123)) +#loc470 = loc("tmp19"(#loc124)) +#loc471 = loc("tmp20"(#loc125)) +#loc472 = loc("tmp21"(#loc126)) +#loc473 = loc("tmp22"(#loc127)) +#loc474 = loc("tmp23"(#loc128)) +#loc475 = loc("tmp24"(#loc129)) +#loc476 = loc("tmp25"(#loc130)) +#loc477 = loc("tmp26"(#loc131)) +#loc478 = loc("tmp27"(#loc132)) +#loc479 = loc("tmp28"(#loc133)) +#loc480 = loc("tmp29"(#loc134)) +#loc481 = loc("tmp30"(#loc135)) +#loc482 = loc("tmp31"(#loc136)) +#loc483 = loc("tmp32"(#loc137)) +#loc484 = loc("tmp33"(#loc138)) +#loc485 = loc("tmp34"(#loc139)) +#loc486 = loc("tmp35"(#loc140)) +#loc487 = loc("tmp36"(#loc141)) +#loc488 = loc("tmp37"(#loc142)) +#loc489 = loc("tmp38"(#loc143)) +#loc490 = loc("post_mod_scores"(#loc144)) +#loc491 = loc("post_mod_scores"(#loc145)) +#loc492 = loc("p"(#loc146)) +#loc493 = loc("p"(#loc147)) +#loc494 = loc("vT"(#loc148)) +#loc495 = loc("dp"(#loc149)) +#loc496 = loc("ds"(#loc150)) +#loc497 = loc("ds"(#loc151)) +#loc498 = loc("ds"(#loc152)) +#loc499 = loc("grad_scores"(#loc153)) +#loc500 = loc("ds"(#loc154)) +#loc501 = loc("ds"(#loc155)) +#loc502 = loc("dq"(#loc156)) +#loc503 = loc("dq"(#loc157)) +#loc504 = loc("cur_block_idx"(#loc158)) +#loc505 = loc("offset"(#loc159)) +#loc506 = loc("cur_block"(#loc160)) +#loc507 = loc("cur_block"(#loc161)) +#loc508 = loc("next_block"(#loc162)) +#loc509 = loc("next_block"(#loc163)) +#loc510 = loc("next_block"(#loc164)) +#loc511 = loc("next_block"(#loc165)) +#loc512 = loc("needs_jump"(#loc166)) +#loc513 = loc("needs_jump"(#loc167)) +#loc514 = loc("needs_jump"(#loc168)) +#loc515 = loc("jump_to_block"(#loc169)) +#loc516 = loc("jump_to_block"(#loc170)) +#loc517 = loc("jump_to_block"(#loc171)) +#loc518 = loc("offset"(#loc172)) +#loc519 = loc("offset"(#loc173)) +#loc520 = loc("offset"(#loc174)) +#loc521 = loc("offset"(#loc175)) +#loc522 = loc("kT_ptrs"(#loc176)) +#loc523 = loc("kT_ptrs"(#loc177)) +#loc524 = loc("vT_ptrs"(#loc178)) +#loc525 = loc("offs_n2"(#loc179)) +#loc526 = loc("kv_indices"(#loc181)) +#loc527 = loc("kv_start"(#loc182)) +#loc528 = loc("kv_start"(#loc183)) +#loc529 = loc("sparse_kv_num_blocks"(#loc184)) +#loc530 = loc("sparse_kv_num_blocks"(#loc185)) +#loc531 = loc("offs_n2"(#loc186)) +#loc532 = loc("dq"(#loc187)) +#loc533 = loc("dq_ptrs"(#loc188)) +#loc534 = loc("dq_ptrs"(#loc189)) +#loc535 = loc("dq"(#loc190)) +#loc536 = loc("stride_q_idx_h"(#loc194)) +#loc537 = loc("start_n1"(#loc195)) +#loc538 = loc("offs_n1"(#loc196)) +#loc539 = loc("k"(#loc197)) +#loc540 = loc("v"(#loc198)) +#loc541 = loc("dv"(#loc199)) +#loc542 = loc("off_hq1"(#loc200)) +#loc543 = loc("off_hq1"(#loc201)) +#loc544 = loc("q_adj1"(#loc202)) +#loc545 = loc("q_adj1"(#loc203)) +#loc546 = loc("q_adj1"(#loc204)) +#loc547 = loc("q_adj1"(#loc205)) +#loc548 = loc("do_adj1"(#loc206)) +#loc549 = loc("do_adj1"(#loc207)) +#loc550 = loc("do_adj1"(#loc208)) +#loc551 = loc("do_adj1"(#loc209)) +#loc552 = loc("off_chz1"(#loc210)) +#loc553 = loc("off_chz1"(#loc211)) +#loc554 = loc("off_chz1"(#loc212)) +#loc555 = loc("off_chz1"(#loc213)) +#loc556 = loc("Q1"(#loc214)) +#loc557 = loc("DO1"(#loc215)) +#loc558 = loc("LSE1"(#loc216)) +#loc559 = loc("DELTA1"(#loc217)) +#loc560 = loc("sparse_q_num_blks_offset"(#loc218)) +#loc561 = loc("sparse_q_num_blks_offset"(#loc219)) +#loc562 = loc("sparse_q_idx_offset"(#loc220)) +#loc563 = loc("sparse_q_idx_offset"(#loc221)) +#loc564 = loc("sparse_q_idx_offset"(#loc222)) +#loc565 = loc("q_indices"(#loc223)) +#loc566 = loc("q_start"(#loc224)) +#loc567 = loc("q_start"(#loc225)) +#loc568 = loc("sparse_q_num_blocks"(#loc226)) +#loc569 = loc("sparse_q_num_blocks"(#loc227)) +#loc570 = loc("offs_m1"(#loc228)) +#loc571 = loc("offs_m1"(#loc229)) +#loc572 = loc("qT_ptrs"(#loc230)) +#loc573 = loc("qT_ptrs"(#loc232)) +#loc574 = loc("qT_ptrs"(#loc233)) +#loc575 = loc("qT_ptrs"(#loc234)) +#loc576 = loc("qT_ptrs"(#loc235)) +#loc577 = loc("do_ptrs"(#loc236)) +#loc578 = loc("do_ptrs"(#loc237)) +#loc579 = loc("do_ptrs"(#loc238)) +#loc580 = loc("do_ptrs"(#loc239)) +#loc581 = loc("hi"(#loc240)) +#loc582 = loc("hi"(#loc241)) +#loc583 = loc("hi"(#loc242)) +#loc584 = loc("hi"(#loc243)) +#loc585 = loc("dk"(#loc244)) +#loc586 = loc("qT"(#loc245)) +#loc587 = loc(callsite(#loc246 at #loc231)) +#loc588 = loc("lse"(#loc247)) +#loc589 = loc("lse"(#loc248)) +#loc590 = loc("lse"(#loc249)) +#loc591 = loc("lse"(#loc250)) +#loc592 = loc("lse"(#loc251)) +#loc593 = loc("qkT"(#loc252)) +#loc594 = loc("qkT"(#loc253)) +#loc595 = loc("m"(#loc254)) +#loc596 = loc("n"(#loc255)) +#loc597 = loc("post_mod_scores"(#loc256)) +#loc598 = loc("tmp44"(#loc257)) +#loc599 = loc("tmp45"(#loc258)) +#loc600 = loc("tmp47"(#loc259)) +#loc601 = loc("tmp47"(#loc260)) +#loc602 = loc("tmp48"(#loc261)) +#loc603 = loc("tmp49"(#loc262)) +#loc604 = loc("tmp50"(#loc263)) +#loc605 = loc("tmp51"(#loc264)) +#loc606 = loc("tmp52"(#loc265)) +#loc607 = loc("tmp55"(#loc266)) +#loc608 = loc("tmp56"(#loc267)) +#loc609 = loc("tmp58"(#loc268)) +#loc610 = loc("tmp59"(#loc269)) +#loc611 = loc("tmp60"(#loc270)) +#loc612 = loc("tmp61"(#loc271)) +#loc613 = loc("tmp62"(#loc272)) +#loc614 = loc("tmp63"(#loc273)) +#loc615 = loc("tmp64"(#loc274)) +#loc616 = loc("tmp65"(#loc275)) +#loc617 = loc("tmp66"(#loc276)) +#loc618 = loc("tmp67"(#loc277)) +#loc619 = loc("tmp68"(#loc278)) +#loc620 = loc("tmp69"(#loc279)) +#loc621 = loc("tmp70"(#loc280)) +#loc622 = loc("tmp71"(#loc281)) +#loc623 = loc("tmp72"(#loc282)) +#loc624 = loc("tmp73"(#loc283)) +#loc625 = loc("tmp74"(#loc284)) +#loc626 = loc("tmp75"(#loc285)) +#loc627 = loc("tmp76"(#loc286)) +#loc628 = loc("tmp77"(#loc287)) +#loc629 = loc("tmp78"(#loc288)) +#loc630 = loc("post_mod_scores"(#loc289)) +#loc631 = loc("post_mod_scores"(#loc290)) +#loc632 = loc("pT"(#loc291)) +#loc633 = loc("pT"(#loc292)) +#loc634 = loc("pT"(#loc293)) +#loc635 = loc("do"(#loc295)) +#loc636 = loc("dv"(#loc296)) +#loc637 = loc("dv"(#loc297)) +#loc638 = loc("Di"(#loc298)) +#loc639 = loc("Di"(#loc299)) +#loc640 = loc("dpT"(#loc300)) +#loc641 = loc("dpT"(#loc301)) +#loc642 = loc("dsT"(#loc302)) +#loc643 = loc("dsT"(#loc303)) +#loc644 = loc("dsT"(#loc304)) +#loc645 = loc("grad_scores"(#loc305)) +#loc646 = loc("dsT"(#loc306)) +#loc647 = loc("dk"(#loc307)) +#loc648 = loc("dk"(#loc308)) +#loc649 = loc("dk"(#loc309)) +#loc650 = loc("offset"(#loc310)) +#loc651 = loc("qT_ptrs"(#loc311)) +#loc652 = loc("qT_ptrs"(#loc312)) +#loc653 = loc("do_ptrs"(#loc313)) +#loc654 = loc("do_ptrs"(#loc314)) +#loc655 = loc("offs_m1"(#loc315)) +#loc656 = loc(callsite(#loc316 at #loc231)) +#loc657 = loc("q_indices"(#loc317)) +#loc658 = loc("q_start"(#loc318)) +#loc659 = loc("q_start"(#loc319)) +#loc660 = loc("sparse_q_num_blocks"(#loc320)) +#loc661 = loc("sparse_q_num_blocks"(#loc321)) +#loc662 = loc("offs_m1"(#loc322)) +#loc663 = loc(callsite(#loc246 at #loc323)) +#loc664 = loc(callsite(#loc316 at #loc323)) +#loc665 = loc("dv_ptrs"(#loc325)) +#loc666 = loc("dv_ptrs"(#loc326)) +#loc667 = loc("dk"(#loc330)) +#loc668 = loc("xindex"(#loc331)) +#loc669 = loc("xindex"(#loc332)) +#loc670 = loc("xindex"(#loc333)) +#loc671 = loc("xindex"(#loc334)) +#loc672 = loc("xindex"(#loc335)) +#loc673 = loc("xindex"(#loc336)) +#loc674 = loc("xindex"(#loc337)) +#loc675 = loc(callsite(#loc14 at #loc370)) +#loc676 = loc(callsite(#loc16 at #loc370)) +#loc677 = loc(callsite(#loc14 at #loc371)) +#loc678 = loc(callsite(#loc16 at #loc371)) +#loc679 = loc(callsite(#loc416 at #loc417)) +#loc680 = loc(callsite(#loc418 at #loc417)) +#loc681 = loc(callsite(#loc419 at #loc417)) +#loc682 = loc(callsite(#loc420 at #loc417)) +#loc683 = loc(callsite(#loc421 at #loc417)) +#loc684 = loc(callsite(#loc70 at #loc417)) +#loc685 = loc(callsite(#loc71 at #loc417)) +#loc686 = loc(callsite(#loc418 at #loc422)) +#loc687 = loc(callsite(#loc419 at #loc422)) +#loc688 = loc(callsite(#loc421 at #loc422)) +#loc689 = loc(callsite(#loc71 at #loc422)) +#loc690 = loc(callsite(#loc438 at #loc439)) +#loc691 = loc(callsite(#loc440 at #loc439)) +#loc692 = loc(callsite(#loc441 at #loc439)) +#loc693 = loc(callsite(#loc442 at #loc439)) +#loc694 = loc(callsite(#loc443 at #loc439)) +#loc695 = loc(callsite(#loc444 at #loc439)) +#loc696 = loc(callsite(#loc445 at #loc439)) +#loc697 = loc(callsite(#loc446 at #loc439)) +#loc698 = loc(callsite(#loc447 at #loc439)) +#loc699 = loc(callsite(#loc448 at #loc439)) +#loc700 = loc(callsite(#loc449 at #loc439)) +#loc701 = loc("offs_n2"(#loc450)) +#loc702 = loc(callsite(#loc452 at #loc439)) +#loc703 = loc(callsite(#loc505 at #loc439)) +#loc704 = loc(callsite(#loc522 at #loc439)) +#loc705 = loc(callsite(#loc523 at #loc439)) +#loc706 = loc(callsite(#loc524 at #loc439)) +#loc707 = loc(callsite(#loc525 at #loc439)) +#loc708 = loc(callsite(#loc180 at #loc439)) +#loc709 = loc(callsite(#loc438 at #loc532)) +#loc710 = loc(callsite(#loc440 at #loc532)) +#loc711 = loc(callsite(#loc441 at #loc532)) +#loc712 = loc(callsite(#loc443 at #loc532)) +#loc713 = loc(callsite(#loc444 at #loc532)) +#loc714 = loc(callsite(#loc445 at #loc532)) +#loc715 = loc(callsite(#loc446 at #loc532)) +#loc716 = loc(callsite(#loc449 at #loc532)) +#loc717 = loc(callsite(#loc452 at #loc532)) +#loc718 = loc(callsite(#loc505 at #loc532)) +#loc719 = loc(callsite(#loc522 at #loc532)) +#loc720 = loc(callsite(#loc523 at #loc532)) +#loc721 = loc(callsite(#loc524 at #loc532)) +#loc722 = loc(callsite(#loc525 at #loc532)) +#loc723 = loc(callsite(#loc180 at #loc532)) +#loc724 = loc(callsite(#loc416 at #loc539)) +#loc725 = loc(callsite(#loc418 at #loc539)) +#loc726 = loc(callsite(#loc419 at #loc539)) +#loc727 = loc(callsite(#loc420 at #loc539)) +#loc728 = loc(callsite(#loc421 at #loc539)) +#loc729 = loc(callsite(#loc70 at #loc539)) +#loc730 = loc(callsite(#loc71 at #loc539)) +#loc731 = loc(callsite(#loc419 at #loc540)) +#loc732 = loc(callsite(#loc421 at #loc540)) +#loc733 = loc(callsite(#loc71 at #loc540)) +#loc734 = loc("dk"(#loc541)) +#loc735 = loc(callsite(#loc572 at #loc231)) +#loc736 = loc(callsite(#loc573 at #loc231)) +#loc737 = loc(callsite(#loc574 at #loc231)) +#loc738 = loc(callsite(#loc575 at #loc231)) +#loc739 = loc(callsite(#loc576 at #loc231)) +#loc740 = loc(callsite(#loc577 at #loc231)) +#loc741 = loc(callsite(#loc578 at #loc231)) +#loc742 = loc(callsite(#loc579 at #loc231)) +#loc743 = loc(callsite(#loc580 at #loc231)) +#loc744 = loc(callsite(#loc581 at #loc231)) +#loc745 = loc(callsite(#loc582 at #loc231)) +#loc746 = loc(callsite(#loc583 at #loc231)) +#loc747 = loc(callsite(#loc584 at #loc231)) +#loc748 = loc("dv"(#loc585)) +#loc749 = loc(callsite(#loc586 at #loc587)) +#loc750 = loc(callsite(#loc588 at #loc587)) +#loc751 = loc(callsite(#loc589 at #loc587)) +#loc752 = loc(callsite(#loc590 at #loc587)) +#loc753 = loc(callsite(#loc591 at #loc587)) +#loc754 = loc(callsite(#loc592 at #loc587)) +#loc755 = loc(callsite(#loc593 at #loc587)) +#loc756 = loc(callsite(#loc594 at #loc587)) +#loc757 = loc(callsite(#loc595 at #loc587)) +#loc758 = loc(callsite(#loc596 at #loc587)) +#loc759 = loc(callsite(#loc597 at #loc587)) +#loc760 = loc(callsite(#loc598 at #loc587)) +#loc761 = loc(callsite(#loc599 at #loc587)) +#loc762 = loc(callsite(#loc600 at #loc587)) +#loc763 = loc(callsite(#loc601 at #loc587)) +#loc764 = loc(callsite(#loc602 at #loc587)) +#loc765 = loc(callsite(#loc603 at #loc587)) +#loc766 = loc(callsite(#loc604 at #loc587)) +#loc767 = loc(callsite(#loc605 at #loc587)) +#loc768 = loc(callsite(#loc606 at #loc587)) +#loc769 = loc(callsite(#loc607 at #loc587)) +#loc770 = loc(callsite(#loc608 at #loc587)) +#loc771 = loc(callsite(#loc609 at #loc587)) +#loc772 = loc(callsite(#loc610 at #loc587)) +#loc773 = loc(callsite(#loc611 at #loc587)) +#loc774 = loc(callsite(#loc612 at #loc587)) +#loc775 = loc(callsite(#loc613 at #loc587)) +#loc776 = loc(callsite(#loc614 at #loc587)) +#loc777 = loc(callsite(#loc615 at #loc587)) +#loc778 = loc(callsite(#loc616 at #loc587)) +#loc779 = loc(callsite(#loc617 at #loc587)) +#loc780 = loc(callsite(#loc618 at #loc587)) +#loc781 = loc(callsite(#loc619 at #loc587)) +#loc782 = loc(callsite(#loc620 at #loc587)) +#loc783 = loc(callsite(#loc621 at #loc587)) +#loc784 = loc(callsite(#loc622 at #loc587)) +#loc785 = loc(callsite(#loc623 at #loc587)) +#loc786 = loc(callsite(#loc624 at #loc587)) +#loc787 = loc(callsite(#loc625 at #loc587)) +#loc788 = loc(callsite(#loc626 at #loc587)) +#loc789 = loc(callsite(#loc627 at #loc587)) +#loc790 = loc(callsite(#loc628 at #loc587)) +#loc791 = loc(callsite(#loc629 at #loc587)) +#loc792 = loc(callsite(#loc630 at #loc587)) +#loc793 = loc(callsite(#loc631 at #loc587)) +#loc794 = loc(callsite(#loc632 at #loc587)) +#loc795 = loc(callsite(#loc633 at #loc587)) +#loc796 = loc(callsite(#loc634 at #loc587)) +#loc797 = loc(callsite(#loc635 at #loc587)) +#loc798 = loc(callsite(#loc636 at #loc587)) +#loc799 = loc(callsite(#loc637 at #loc587)) +#loc800 = loc(callsite(#loc638 at #loc587)) +#loc801 = loc(callsite(#loc639 at #loc587)) +#loc802 = loc(callsite(#loc640 at #loc587)) +#loc803 = loc(callsite(#loc641 at #loc587)) +#loc804 = loc(callsite(#loc642 at #loc587)) +#loc805 = loc(callsite(#loc643 at #loc587)) +#loc806 = loc(callsite(#loc644 at #loc587)) +#loc807 = loc(callsite(#loc645 at #loc587)) +#loc808 = loc(callsite(#loc646 at #loc587)) +#loc809 = loc(callsite(#loc647 at #loc587)) +#loc810 = loc(callsite(#loc648 at #loc587)) +#loc811 = loc(callsite(#loc649 at #loc587)) +#loc812 = loc(callsite(#loc650 at #loc231)) +#loc813 = loc(callsite(#loc651 at #loc231)) +#loc814 = loc(callsite(#loc652 at #loc231)) +#loc815 = loc(callsite(#loc653 at #loc231)) +#loc816 = loc(callsite(#loc654 at #loc231)) +#loc817 = loc(callsite(#loc655 at #loc231)) +#loc818 = loc(callsite(#loc572 at #loc323)) +#loc819 = loc(callsite(#loc573 at #loc323)) +#loc820 = loc(callsite(#loc574 at #loc323)) +#loc821 = loc(callsite(#loc576 at #loc323)) +#loc822 = loc(callsite(#loc577 at #loc323)) +#loc823 = loc(callsite(#loc578 at #loc323)) +#loc824 = loc(callsite(#loc579 at #loc323)) +#loc825 = loc(callsite(#loc580 at #loc323)) +#loc826 = loc(callsite(#loc581 at #loc323)) +#loc827 = loc(callsite(#loc584 at #loc323)) +#loc828 = loc(callsite(#loc586 at #loc663)) +#loc829 = loc(callsite(#loc588 at #loc663)) +#loc830 = loc(callsite(#loc589 at #loc663)) +#loc831 = loc(callsite(#loc590 at #loc663)) +#loc832 = loc(callsite(#loc591 at #loc663)) +#loc833 = loc(callsite(#loc592 at #loc663)) +#loc834 = loc(callsite(#loc593 at #loc663)) +#loc835 = loc(callsite(#loc594 at #loc663)) +#loc836 = loc(callsite(#loc597 at #loc663)) +#loc837 = loc(callsite(#loc631 at #loc663)) +#loc838 = loc(callsite(#loc632 at #loc663)) +#loc839 = loc(callsite(#loc633 at #loc663)) +#loc840 = loc(callsite(#loc634 at #loc663)) +#loc841 = loc(callsite(#loc635 at #loc663)) +#loc842 = loc(callsite(#loc636 at #loc663)) +#loc843 = loc(callsite(#loc637 at #loc663)) +#loc844 = loc(callsite(#loc638 at #loc663)) +#loc845 = loc(callsite(#loc639 at #loc663)) +#loc846 = loc(callsite(#loc640 at #loc663)) +#loc847 = loc(callsite(#loc641 at #loc663)) +#loc848 = loc(callsite(#loc642 at #loc663)) +#loc849 = loc(callsite(#loc643 at #loc663)) +#loc850 = loc(callsite(#loc644 at #loc663)) +#loc851 = loc(callsite(#loc645 at #loc663)) +#loc852 = loc(callsite(#loc647 at #loc663)) +#loc853 = loc(callsite(#loc648 at #loc663)) +#loc854 = loc(callsite(#loc649 at #loc663)) +#loc855 = loc(callsite(#loc650 at #loc323)) +#loc856 = loc(callsite(#loc651 at #loc323)) +#loc857 = loc(callsite(#loc652 at #loc323)) +#loc858 = loc(callsite(#loc653 at #loc323)) +#loc859 = loc(callsite(#loc654 at #loc323)) +#loc860 = loc(callsite(#loc655 at #loc323)) +#loc861 = loc(callsite(#loc14 at #loc698)) +#loc862 = loc(callsite(#loc16 at #loc698)) +#loc863 = loc("kT_ptrs"(#loc701)) +#loc864 = loc(callsite(#loc451 at #loc702)) +#loc865 = loc(callsite(#loc453 at #loc702)) +#loc866 = loc(callsite(#loc454 at #loc702)) +#loc867 = loc(callsite(#loc455 at #loc702)) +#loc868 = loc(callsite(#loc456 at #loc702)) +#loc869 = loc(callsite(#loc457 at #loc702)) +#loc870 = loc(callsite(#loc458 at #loc702)) +#loc871 = loc(callsite(#loc459 at #loc702)) +#loc872 = loc(callsite(#loc460 at #loc702)) +#loc873 = loc(callsite(#loc461 at #loc702)) +#loc874 = loc(callsite(#loc462 at #loc702)) +#loc875 = loc(callsite(#loc463 at #loc702)) +#loc876 = loc(callsite(#loc464 at #loc702)) +#loc877 = loc(callsite(#loc465 at #loc702)) +#loc878 = loc(callsite(#loc466 at #loc702)) +#loc879 = loc(callsite(#loc467 at #loc702)) +#loc880 = loc(callsite(#loc468 at #loc702)) +#loc881 = loc(callsite(#loc469 at #loc702)) +#loc882 = loc(callsite(#loc470 at #loc702)) +#loc883 = loc(callsite(#loc471 at #loc702)) +#loc884 = loc(callsite(#loc472 at #loc702)) +#loc885 = loc(callsite(#loc473 at #loc702)) +#loc886 = loc(callsite(#loc474 at #loc702)) +#loc887 = loc(callsite(#loc475 at #loc702)) +#loc888 = loc(callsite(#loc476 at #loc702)) +#loc889 = loc(callsite(#loc477 at #loc702)) +#loc890 = loc(callsite(#loc478 at #loc702)) +#loc891 = loc(callsite(#loc479 at #loc702)) +#loc892 = loc(callsite(#loc480 at #loc702)) +#loc893 = loc(callsite(#loc481 at #loc702)) +#loc894 = loc(callsite(#loc482 at #loc702)) +#loc895 = loc(callsite(#loc483 at #loc702)) +#loc896 = loc(callsite(#loc484 at #loc702)) +#loc897 = loc(callsite(#loc485 at #loc702)) +#loc898 = loc(callsite(#loc486 at #loc702)) +#loc899 = loc(callsite(#loc487 at #loc702)) +#loc900 = loc(callsite(#loc488 at #loc702)) +#loc901 = loc(callsite(#loc489 at #loc702)) +#loc902 = loc(callsite(#loc490 at #loc702)) +#loc903 = loc(callsite(#loc491 at #loc702)) +#loc904 = loc(callsite(#loc492 at #loc702)) +#loc905 = loc(callsite(#loc493 at #loc702)) +#loc906 = loc(callsite(#loc494 at #loc702)) +#loc907 = loc(callsite(#loc495 at #loc702)) +#loc908 = loc(callsite(#loc496 at #loc702)) +#loc909 = loc(callsite(#loc497 at #loc702)) +#loc910 = loc(callsite(#loc498 at #loc702)) +#loc911 = loc(callsite(#loc499 at #loc702)) +#loc912 = loc(callsite(#loc500 at #loc702)) +#loc913 = loc(callsite(#loc501 at #loc702)) +#loc914 = loc(callsite(#loc502 at #loc702)) +#loc915 = loc(callsite(#loc503 at #loc702)) +#loc916 = loc(callsite(#loc504 at #loc703)) +#loc917 = loc(callsite(#loc506 at #loc703)) +#loc918 = loc(callsite(#loc507 at #loc703)) +#loc919 = loc(callsite(#loc508 at #loc703)) +#loc920 = loc(callsite(#loc509 at #loc703)) +#loc921 = loc(callsite(#loc510 at #loc703)) +#loc922 = loc(callsite(#loc511 at #loc703)) +#loc923 = loc(callsite(#loc512 at #loc703)) +#loc924 = loc(callsite(#loc513 at #loc703)) +#loc925 = loc(callsite(#loc514 at #loc703)) +#loc926 = loc(callsite(#loc515 at #loc703)) +#loc927 = loc(callsite(#loc516 at #loc703)) +#loc928 = loc(callsite(#loc517 at #loc703)) +#loc929 = loc(callsite(#loc518 at #loc703)) +#loc930 = loc(callsite(#loc519 at #loc703)) +#loc931 = loc(callsite(#loc520 at #loc703)) +#loc932 = loc(callsite(#loc521 at #loc703)) +#loc933 = loc(callsite(#loc451 at #loc717)) +#loc934 = loc(callsite(#loc453 at #loc717)) +#loc935 = loc(callsite(#loc454 at #loc717)) +#loc936 = loc(callsite(#loc457 at #loc717)) +#loc937 = loc(callsite(#loc491 at #loc717)) +#loc938 = loc(callsite(#loc492 at #loc717)) +#loc939 = loc(callsite(#loc493 at #loc717)) +#loc940 = loc(callsite(#loc494 at #loc717)) +#loc941 = loc(callsite(#loc495 at #loc717)) +#loc942 = loc(callsite(#loc496 at #loc717)) +#loc943 = loc(callsite(#loc497 at #loc717)) +#loc944 = loc(callsite(#loc498 at #loc717)) +#loc945 = loc(callsite(#loc499 at #loc717)) +#loc946 = loc(callsite(#loc501 at #loc717)) +#loc947 = loc(callsite(#loc502 at #loc717)) +#loc948 = loc(callsite(#loc503 at #loc717)) +#loc949 = loc(callsite(#loc504 at #loc718)) +#loc950 = loc(callsite(#loc506 at #loc718)) +#loc951 = loc(callsite(#loc507 at #loc718)) +#loc952 = loc(callsite(#loc508 at #loc718)) +#loc953 = loc(callsite(#loc509 at #loc718)) +#loc954 = loc(callsite(#loc510 at #loc718)) +#loc955 = loc(callsite(#loc511 at #loc718)) +#loc956 = loc(callsite(#loc512 at #loc718)) +#loc957 = loc(callsite(#loc513 at #loc718)) +#loc958 = loc(callsite(#loc514 at #loc718)) +#loc959 = loc(callsite(#loc515 at #loc718)) +#loc960 = loc(callsite(#loc516 at #loc718)) +#loc961 = loc(callsite(#loc517 at #loc718)) +#loc962 = loc(callsite(#loc518 at #loc718)) +#loc963 = loc(callsite(#loc519 at #loc718)) +#loc964 = loc(callsite(#loc520 at #loc718)) +#loc965 = loc(callsite(#loc521 at #loc718)) +#loc966 = loc(callsite(#loc14 at #loc745)) +#loc967 = loc(callsite(#loc16 at #loc745)) +#loc968 = loc("offs_m1"(#loc748)) +#loc969 = loc(callsite(#loc101 at #loc749)) +#loc970 = loc(callsite(#loc104 at #loc749)) +#loc971 = loc(callsite(#loc105 at #loc749)) +#loc972 = loc(callsite(#loc108 at #loc757)) +#loc973 = loc(callsite(#loc108 at #loc758)) +#loc974 = loc(callsite(#loc294 at #loc797)) +#loc975 = loc(callsite(#loc70 at #loc797)) +#loc976 = loc(callsite(#loc71 at #loc797)) +#loc977 = loc(callsite(#loc504 at #loc812)) +#loc978 = loc(callsite(#loc506 at #loc812)) +#loc979 = loc(callsite(#loc507 at #loc812)) +#loc980 = loc(callsite(#loc508 at #loc812)) +#loc981 = loc(callsite(#loc509 at #loc812)) +#loc982 = loc(callsite(#loc510 at #loc812)) +#loc983 = loc(callsite(#loc511 at #loc812)) +#loc984 = loc(callsite(#loc512 at #loc812)) +#loc985 = loc(callsite(#loc513 at #loc812)) +#loc986 = loc(callsite(#loc514 at #loc812)) +#loc987 = loc(callsite(#loc515 at #loc812)) +#loc988 = loc(callsite(#loc516 at #loc812)) +#loc989 = loc(callsite(#loc517 at #loc812)) +#loc990 = loc(callsite(#loc518 at #loc812)) +#loc991 = loc(callsite(#loc519 at #loc812)) +#loc992 = loc(callsite(#loc520 at #loc812)) +#loc993 = loc(callsite(#loc521 at #loc812)) +#loc994 = loc(callsite(#loc101 at #loc828)) +#loc995 = loc(callsite(#loc104 at #loc828)) +#loc996 = loc(callsite(#loc105 at #loc828)) +#loc997 = loc(callsite(#loc294 at #loc841)) +#loc998 = loc(callsite(#loc70 at #loc841)) +#loc999 = loc(callsite(#loc71 at #loc841)) +#loc1000 = loc(callsite(#loc504 at #loc855)) +#loc1001 = loc(callsite(#loc506 at #loc855)) +#loc1002 = loc(callsite(#loc507 at #loc855)) +#loc1003 = loc(callsite(#loc508 at #loc855)) +#loc1004 = loc(callsite(#loc509 at #loc855)) +#loc1005 = loc(callsite(#loc510 at #loc855)) +#loc1006 = loc(callsite(#loc511 at #loc855)) +#loc1007 = loc(callsite(#loc512 at #loc855)) +#loc1008 = loc(callsite(#loc513 at #loc855)) +#loc1009 = loc(callsite(#loc514 at #loc855)) +#loc1010 = loc(callsite(#loc515 at #loc855)) +#loc1011 = loc(callsite(#loc516 at #loc855)) +#loc1012 = loc(callsite(#loc517 at #loc855)) +#loc1013 = loc(callsite(#loc518 at #loc855)) +#loc1014 = loc(callsite(#loc519 at #loc855)) +#loc1015 = loc(callsite(#loc520 at #loc855)) +#loc1016 = loc(callsite(#loc521 at #loc855)) +#loc1017 = loc("vT_ptrs"(#loc863)) +#loc1018 = loc(callsite(#loc101 at #loc864)) +#loc1019 = loc(callsite(#loc104 at #loc864)) +#loc1020 = loc(callsite(#loc105 at #loc864)) +#loc1021 = loc(callsite(#loc108 at #loc867)) +#loc1022 = loc(callsite(#loc108 at #loc868)) +#loc1023 = loc(callsite(#loc105 at #loc906)) +#loc1024 = loc(callsite(#loc101 at #loc933)) +#loc1025 = loc(callsite(#loc104 at #loc933)) +#loc1026 = loc(callsite(#loc105 at #loc933)) +#loc1027 = loc(callsite(#loc105 at #loc940)) +#loc1028 = loc("qT_ptrs"(#loc968)) +#loc1029 = loc(callsite(#loc1017 at #loc439)) +#loc1030 = loc(callsite(#loc1017 at #loc532)) +#loc1031 = loc("do_ptrs"(#loc1028)) +#loc1032 = loc(callsite(#loc1031 at #loc231)) +#loc1033 = loc(callsite(#loc1031 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/__grp__triton_poi_fused_clone_slice_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/__grp__triton_poi_fused_clone_slice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e10dfaa047112e6d8207bd4b3a4efc4b66e66df2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/__grp__triton_poi_fused_clone_slice_4.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_slice_4.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.source", "triton_poi_fused_clone_slice_4.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttir", "triton_poi_fused_clone_slice_4.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttgir", "triton_poi_fused_clone_slice_4.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.llir", "triton_poi_fused_clone_slice_4.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ptx", "triton_poi_fused_clone_slice_4.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.cubin", "triton_poi_fused_clone_slice_4.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.cubin new file mode 100644 index 0000000000000000000000000000000000000000..745ea83fd48f83ad1507e8788061521b77e1599d Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6bd2da96ee96a49b2b4d1cf699e9a50d89695e80 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.json @@ -0,0 +1 @@ +{"hash": "12e71bb49d61defdbba0edb99e12139e6b8639fd43d33f379c778b30e4b75b1a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_slice_4"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.llir new file mode 100644 index 0000000000000000000000000000000000000000..974393bffbeefd63819fbc59d2317c29f36a8bbd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.llir @@ -0,0 +1,61 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_slice_4(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 7, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 127, !dbg !9 + %11 = or disjoint i32 %8, %10, !dbg !10 + %12 = icmp slt i32 %11, %3, !dbg !11 + %13 = sext i32 %11 to i64, !dbg !12 + %.frozen = freeze i64 %2, !dbg !13 + %14 = sdiv i64 %13, %.frozen, !dbg !13 + %15 = mul i64 %14, %.frozen, !dbg !12 + %.decomposed = sub i64 %13, %15, !dbg !12 + %16 = mul i64 %14, %2, !dbg !14 + %17 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !15 + %18 = getelementptr i32, ptr addrspace(1) %17, i64 %14, !dbg !15 + %19 = getelementptr i32, ptr addrspace(1) %18, i64 %16, !dbg !15 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !16 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %19, i64 %20, i1 %12) #2, !dbg !16 + %22 = getelementptr i32, ptr addrspace(1) %1, i64 %13, !dbg !17 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %21, ptr addrspace(1) %22, i1 %12) #2, !dbg !18 + ret void, !dbg !19 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_slice_4", linkageName: "triton_poi_fused_clone_slice_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 22, column: 19, scope: !4) +!13 = !DILocation(line: 23, column: 19, scope: !4) +!14 = !DILocation(line: 25, column: 44, scope: !4) +!15 = !DILocation(line: 25, column: 30, scope: !4) +!16 = !DILocation(line: 25, column: 49, scope: !4) +!17 = !DILocation(line: 26, column: 25, scope: !4) +!18 = !DILocation(line: 26, column: 36, scope: !4) +!19 = !DILocation(line: 26, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fd3a3bb004985c57e90bc6524c33d9edce944b51 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ptx @@ -0,0 +1,247 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_clone_slice_4 // -- Begin function triton_poi_fused_clone_slice_4 + // @triton_poi_fused_clone_slice_4 +.visible .entry triton_poi_fused_clone_slice_4( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_1, + .param .u64 triton_poi_fused_clone_slice_4_param_2, + .param .u32 triton_poi_fused_clone_slice_4_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_5 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b32 %r<14>; + .reg .b64 %rd<26>; + .loc 1 18 0 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:18:0 + +// %bb.0: + ld.param.b32 %r1, [triton_poi_fused_clone_slice_4_param_3]; + ld.param.b64 %rd7, [triton_poi_fused_clone_slice_4_param_1]; + ld.param.b64 %rd6, [triton_poi_fused_clone_slice_4_param_0]; + ld.param.b64 %rd8, [triton_poi_fused_clone_slice_4_param_2]; +$L__tmp0: + .loc 1 19 28 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:19:33 + shl.b32 %r3, %r2, 7; + .loc 1 20 36 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:20:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 20 23 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:20:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:22:19 + cvt.s64.s32 %rd1, %r6; + .loc 1 23 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:23:19 + or.b64 %rd10, %rd1, %rd8; + and.b64 %rd11, %rd10, -4294967296; + setp.ne.b64 %p1, %rd11, 0; + cvt.u32.u64 %r13, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd25, %rd1, %rd8; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r7, %rd8; + div.u32 %r9, %r13, %r7; + cvt.u64.u32 %rd25, %r9; +$L__BB0_3: + .loc 1 21 21 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:21:21 + setp.lt.s32 %p2, %r13, %r1; + .loc 1 22 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:22:19 + mul.lo.s64 %rd16, %rd25, %rd8; + sub.s64 %rd17, %rd1, %rd16; + .loc 1 25 30 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:25:30 + shl.b64 %rd19, %rd17, 2; + add.s64 %rd20, %rd6, %rd19; + shl.b64 %rd21, %rd25, 2; + add.s64 %rd22, %rd20, %rd21; + shl.b64 %rd23, %rd16, 2; + add.s64 %rd13, %rd22, %rd23; + .loc 1 25 49 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:25:49 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r11, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r11 }, [ %rd13 + 0 ], %rd14; + // end inline asm + .loc 1 26 25 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:25 + shl.b64 %rd24, %rd1, 2; + add.s64 %rd15, %rd7, %rd24; + .loc 1 26 36 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:36 + // begin inline asm + @%p2 st.global.b32 [ %rd15 + 0 ], { %r11 }; + // end inline asm + .loc 1 26 4 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 110 +.b8 52 +.b8 52 +.b8 122 +.b8 104 +.b8 102 +.b8 115 +.b8 113 +.b8 115 +.b8 119 +.b8 108 +.b8 102 +.b8 99 +.b8 97 +.b8 119 +.b8 101 +.b8 111 +.b8 100 +.b8 116 +.b8 111 +.b8 104 +.b8 113 +.b8 116 +.b8 55 +.b8 115 +.b8 116 +.b8 100 +.b8 110 +.b8 118 +.b8 107 +.b8 105 +.b8 99 +.b8 52 +.b8 115 +.b8 113 +.b8 120 +.b8 111 +.b8 52 +.b8 107 +.b8 97 +.b8 116 +.b8 109 +.b8 97 +.b8 119 +.b8 118 +.b8 51 +.b8 117 +.b8 114 +.b8 107 +.b8 101 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 110 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.source new file mode 100644 index 0000000000000000000000000000000000000000..d877ca1b493aafe6f756fb8d9efb219ff3711b21 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.source @@ -0,0 +1,62 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc16 = loc("in_ptr0"(#loc)) +#loc17 = loc("out_ptr0"(#loc)) +#loc18 = loc("ks0"(#loc)) +#loc19 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc20) + %xoffset_0 = arith.constant 128 : i32 loc(#loc21) + %xoffset_1 = arith.constant 128 : i32 loc(#loc21) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc21) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc22) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc23) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc23) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc24) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc24) + %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc25) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc25) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc25) + %x1 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc26) + %x1_8 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc26) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<128xi64> loc(#loc26) + %tmp0 = arith.addi %x0_7, %x1_9 : tensor<128xi64> loc(#loc27) + %tmp0_10 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc28) + %tmp0_11 = arith.muli %tmp0_10, %x1_9 : tensor<128xi64> loc(#loc28) + %tmp0_12 = arith.addi %tmp0, %tmp0_11 : tensor<128xi64> loc(#loc29) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc30) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc30) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc31) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc13) + %1 = tt.addptr %0, %xindex_4 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc13) + tt.store %1, %tmp0_15, %xmask_5 : tensor<128x!tt.ptr> loc(#loc14) + tt.return loc(#loc15) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc20 = loc("xoffset"(#loc1)) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xindex"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xmask"(#loc5)) +#loc25 = loc("x0"(#loc6)) +#loc26 = loc("x1"(#loc7)) +#loc27 = loc("tmp0"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ebf9caa423fbd97d6cbac188aa686d4ef5b89ba9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttgir @@ -0,0 +1,60 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc17 = loc("in_ptr0"(#loc)) +#loc18 = loc("out_ptr0"(#loc)) +#loc19 = loc("ks0"(#loc)) +#loc20 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc21) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc22) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc23) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc24) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc24) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc25) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc25) + %x0 = arith.extsi %xindex_2 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc26) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc26) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc26) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64, #blocked> loc(#loc27) + %tmp0 = arith.addi %x0_5, %x1 : tensor<128xi64, #blocked> loc(#loc28) + %tmp0_6 = arith.muli %x0_4, %x1 : tensor<128xi64, #blocked> loc(#loc29) + %tmp0_7 = arith.addi %tmp0, %tmp0_6 : tensor<128xi64, #blocked> loc(#loc30) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc31) + %tmp0_9 = tt.addptr %tmp0_8, %tmp0_7 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc31) + %tmp0_10 = tt.load %tmp0_9, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr, #blocked> loc(#loc32) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc14) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> loc(#loc14) + tt.store %1, %tmp0_10, %xmask_3 : tensor<128x!tt.ptr, #blocked> loc(#loc15) + tt.return loc(#loc16) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xoffset"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xindex"(#loc5)) +#loc25 = loc("xmask"(#loc6)) +#loc26 = loc("x0"(#loc7)) +#loc27 = loc("x1"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) +#loc32 = loc("tmp0"(#loc13)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttir new file mode 100644 index 0000000000000000000000000000000000000000..4962e067c36750eb1afff189650a8ea52719e9bb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CLTRXNE5MHPP3O5A5W4Z4EQTTZVYMOP5IPJT6N44O6FTBZFXLMNA/triton_poi_fused_clone_slice_4.ttir @@ -0,0 +1,59 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc17 = loc("in_ptr0"(#loc)) +#loc18 = loc("out_ptr0"(#loc)) +#loc19 = loc("ks0"(#loc)) +#loc20 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc21) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc22) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc23) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc24) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc24) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc25) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc25) + %x0 = arith.extsi %xindex_2 : tensor<128xi32> to tensor<128xi64> loc(#loc26) + %x0_4 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc26) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<128xi64> loc(#loc26) + %x1 = arith.divsi %x0, %x0_4 : tensor<128xi64> loc(#loc27) + %tmp0 = arith.addi %x0_5, %x1 : tensor<128xi64> loc(#loc28) + %tmp0_6 = arith.muli %x0_4, %x1 : tensor<128xi64> loc(#loc29) + %tmp0_7 = arith.addi %tmp0, %tmp0_6 : tensor<128xi64> loc(#loc30) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc31) + %tmp0_9 = tt.addptr %tmp0_8, %tmp0_7 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc31) + %tmp0_10 = tt.load %tmp0_9, %xmask_3 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc32) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc14) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc14) + tt.store %1, %tmp0_10, %xmask_3 : tensor<128x!tt.ptr> loc(#loc15) + tt.return loc(#loc16) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xoffset"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xindex"(#loc5)) +#loc25 = loc("xmask"(#loc6)) +#loc26 = loc("x0"(#loc7)) +#loc27 = loc("x1"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) +#loc32 = loc("tmp0"(#loc13)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..28729a26cac57790e3545cfb086e73dc79dd45aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..23870b5e84d8a457153d0973f024e79363e97ce7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f36bf0e114c2c29bc084338e13e17a54e290b439 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "14179d4e76ada4b678859d686d85aa269c9b15a40363dbc335fc0531bbbee886", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..7d778bb3e98ec221df540a9cf173b68d22088d59 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir @@ -0,0 +1,527 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py\00" +@assertMessage_0 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp6 < 151936\00" +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i64 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #1 !dbg !9 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %10 = zext nneg i32 %9 to i64, !dbg !11 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %12 = shl nuw nsw i32 %11, 2, !dbg !12 + %13 = and i32 %12, 2044, !dbg !12 + %.idx = mul nuw nsw i64 %10, 303872 + %14 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx + %15 = zext nneg i32 %13 to i64, !dbg !13 + br label %16, !dbg !13 + +16: ; preds = %8, %16 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %16 ] + %17 = phi i64 [ 9223372036854775807, %8 ], [ %93, %16 ] + %18 = phi i64 [ 9223372036854775807, %8 ], [ %94, %16 ] + %19 = phi float [ 0xFFF0000000000000, %8 ], [ %90, %16 ] + %20 = phi float [ 0xFFF0000000000000, %8 ], [ %91, %16 ] + %21 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %89, %16 ] + %22 = phi <2 x i64> [ splat (i64 9223372036854775807), %8 ], [ %92, %16 ] + %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %24 = fcmp uno <2 x float> %21, zeroinitializer, !dbg !15 + %25 = fcmp uno float %19, 0.000000e+00, !dbg !15 + %26 = fcmp uno float %20, 0.000000e+00, !dbg !15 + %27 = or disjoint i64 %indvars.iv, %15, !dbg !19 + %28 = or disjoint i64 %27, 1, !dbg !19 + %29 = or disjoint i64 %27, 2, !dbg !19 + %30 = or disjoint i64 %27, 3, !dbg !19 + %31 = icmp samesign ult i64 %27, 151936, !dbg !20 + %32 = getelementptr bfloat, ptr addrspace(1) %14, i64 %27, !dbg !21 + %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %32, i64 %23, i1 %31) #5, !dbg !14 + %34 = extractvalue { i32, i32 } %33, 0, !dbg !14 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !14 + %36 = extractvalue { i32, i32 } %33, 1, !dbg !14 + %37 = bitcast i32 %36 to <2 x bfloat>, !dbg !14 + %38 = extractelement <2 x bfloat> %37, i64 0, !dbg !14 + %39 = extractelement <2 x bfloat> %37, i64 1, !dbg !14 + %40 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !22 + %41 = fpext bfloat %38 to float, !dbg !22 + %42 = fpext bfloat %39 to float, !dbg !22 + %43 = fcmp ogt <2 x float> %21, %40, !dbg !23 + %44 = fcmp ogt float %19, %41, !dbg !23 + %45 = fcmp ogt float %20, %42, !dbg !23 + %46 = fcmp oeq <2 x float> %21, %40, !dbg !24 + %47 = fcmp oeq float %19, %41, !dbg !24 + %48 = fcmp oeq float %20, %42, !dbg !24 + %49 = fcmp uno <2 x bfloat> %35, zeroinitializer, !dbg !25 + %50 = fcmp uno bfloat %38, 0xR0000, !dbg !25 + %51 = fcmp uno bfloat %39, 0xR0000, !dbg !25 + %52 = xor <2 x i1> %49, splat (i1 true), !dbg !26 + %53 = xor i1 %50, true, !dbg !26 + %54 = xor i1 %51, true, !dbg !26 + %55 = and <2 x i1> %24, %52, !dbg !27 + %56 = and i1 %25, %53, !dbg !27 + %57 = and i1 %26, %54, !dbg !27 + %58 = or <2 x i1> %43, %55, !dbg !28 + %59 = or i1 %44, %56, !dbg !28 + %60 = or i1 %45, %57, !dbg !28 + %61 = and <2 x i1> %24, %49, !dbg !29 + %62 = and i1 %25, %50, !dbg !29 + %63 = and i1 %26, %51, !dbg !29 + %64 = or <2 x i1> %46, %61, !dbg !30 + %65 = or i1 %47, %62, !dbg !30 + %66 = or i1 %48, %63, !dbg !30 + %67 = insertelement <2 x i64> poison, i64 %27, i64 0, !dbg !31 + %68 = shufflevector <2 x i64> %67, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !31 + %69 = icmp slt <2 x i64> %22, %68, !dbg !31 + %70 = icmp sle <2 x i64> %22, %68, !dbg !31 + %71 = shufflevector <2 x i1> %69, <2 x i1> %70, <2 x i32> , !dbg !31 + %72 = icmp slt i64 %17, %29, !dbg !31 + %73 = icmp slt i64 %18, %30, !dbg !31 + %74 = and <2 x i1> %71, %64, !dbg !32 + %75 = and i1 %72, %65, !dbg !32 + %76 = and i1 %73, %66, !dbg !32 + %77 = or <2 x i1> %58, %74, !dbg !33 + %78 = or i1 %59, %75, !dbg !33 + %79 = or i1 %60, %76, !dbg !33 + %80 = select <2 x i1> %77, <2 x float> %21, <2 x float> %40, !dbg !34 + %81 = select i1 %78, float %19, float %41, !dbg !34 + %82 = select i1 %79, float %20, float %42, !dbg !34 + %83 = insertelement <2 x i64> %68, i64 %28, i64 1, !dbg !35 + %84 = select <2 x i1> %77, <2 x i64> %22, <2 x i64> %83, !dbg !35 + %85 = select i1 %78, i64 %17, i64 %29, !dbg !35 + %86 = select i1 %79, i64 %18, i64 %30, !dbg !35 + %87 = insertelement <2 x i1> poison, i1 %31, i64 0, !dbg !36 + %88 = shufflevector <2 x i1> %87, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !36 + %89 = select <2 x i1> %88, <2 x float> %80, <2 x float> %21, !dbg !36 + %90 = select i1 %31, float %81, float %19, !dbg !36 + %91 = select i1 %31, float %82, float %20, !dbg !36 + %92 = select <2 x i1> %88, <2 x i64> %84, <2 x i64> %22, !dbg !37 + %93 = select i1 %31, i64 %85, i64 %17, !dbg !37 + %94 = select i1 %31, i64 %86, i64 %18, !dbg !37 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %95 = icmp samesign ult i64 %indvars.iv, 149888, !dbg !13 + br i1 %95, label %16, label %96, !dbg !13 + +96: ; preds = %16 + %97 = and i32 %11, 31, !dbg !12 + %98 = lshr i32 %11, 5, !dbg !12 + %99 = shufflevector <2 x float> %89, <2 x float> poison, <2 x i32> , !dbg !38 + %100 = fcmp ogt <2 x float> %89, %99, !dbg !38 + %101 = fcmp oeq <2 x float> %89, %99, !dbg !38 + %102 = shufflevector <2 x i1> %100, <2 x i1> %101, <2 x i32> , !dbg !38 + %103 = extractelement <2 x float> %89, i64 0, !dbg !40 + %104 = fcmp uno float %103, 0.000000e+00, !dbg !40 + %105 = extractelement <2 x float> %89, i64 1, !dbg !41 + %106 = fcmp uno float %105, 0.000000e+00, !dbg !41 + %107 = xor i1 %106, true, !dbg !42 + %108 = insertelement <2 x i1> poison, i1 %104, i64 0, !dbg !43 + %109 = shufflevector <2 x i1> %108, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43 + %110 = insertelement <2 x i1> poison, i1 %107, i64 0, !dbg !43 + %111 = insertelement <2 x i1> %110, i1 %106, i64 1, !dbg !43 + %112 = and <2 x i1> %109, %111, !dbg !43 + %113 = or <2 x i1> %102, %112, !dbg !44 + %114 = extractelement <2 x i64> %92, i64 0, !dbg !45 + %115 = extractelement <2 x i64> %92, i64 1, !dbg !45 + %116 = icmp slt i64 %114, %115, !dbg !45 + %117 = extractelement <2 x i1> %113, i64 1, !dbg !46 + %118 = and i1 %116, %117, !dbg !46 + %119 = extractelement <2 x i1> %113, i64 0, !dbg !47 + %120 = or i1 %119, %118, !dbg !47 + %121 = select i1 %120, float %103, float %105, !dbg !48 + %122 = select i1 %120, i64 %114, i64 %115, !dbg !49 + %123 = fcmp ogt float %121, %90, !dbg !38 + %124 = fcmp oeq float %121, %90, !dbg !50 + %125 = fcmp uno float %121, 0.000000e+00, !dbg !40 + %126 = fcmp uno float %90, 0.000000e+00, !dbg !41 + %127 = xor i1 %126, true, !dbg !42 + %128 = and i1 %125, %127, !dbg !43 + %129 = or i1 %123, %128, !dbg !44 + %130 = and i1 %126, %125, !dbg !51 + %131 = or i1 %124, %130, !dbg !52 + %132 = icmp slt i64 %122, %93, !dbg !45 + %133 = and i1 %132, %131, !dbg !46 + %134 = or i1 %129, %133, !dbg !47 + %135 = select i1 %134, float %121, float %90, !dbg !48 + %136 = select i1 %134, i64 %122, i64 %93, !dbg !49 + %137 = fcmp ogt float %135, %91, !dbg !38 + %138 = fcmp oeq float %135, %91, !dbg !50 + %139 = fcmp uno float %135, 0.000000e+00, !dbg !40 + %140 = fcmp uno float %91, 0.000000e+00, !dbg !41 + %141 = xor i1 %140, true, !dbg !42 + %142 = and i1 %139, %141, !dbg !43 + %143 = or i1 %137, %142, !dbg !44 + %144 = and i1 %140, %139, !dbg !51 + %145 = or i1 %138, %144, !dbg !52 + %146 = icmp slt i64 %136, %94, !dbg !45 + %147 = and i1 %146, %145, !dbg !46 + %148 = or i1 %143, %147, !dbg !47 + %149 = select i1 %148, float %135, float %91, !dbg !48 + %150 = select i1 %148, i64 %136, i64 %94, !dbg !49 + %151 = bitcast float %149 to i32, !dbg !53 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 16, i32 31), !dbg !53 + %153 = bitcast i32 %152 to float, !dbg !53 + %extelt.offset = lshr i64 %150, 32, !dbg !53 + %154 = trunc nuw i64 %extelt.offset to i32, !dbg !53 + %155 = trunc i64 %150 to i32, !dbg !53 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 16, i32 31), !dbg !53 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 16, i32 31), !dbg !53 + %158 = insertelement <2 x i32> poison, i32 %156, i64 0, !dbg !53 + %159 = insertelement <2 x i32> %158, i32 %157, i64 1, !dbg !53 + %160 = bitcast <2 x i32> %159 to i64, !dbg !53 + %161 = fcmp ogt float %149, %153, !dbg !38 + %162 = fcmp oeq float %149, %153, !dbg !50 + %163 = fcmp uno float %149, 0.000000e+00, !dbg !40 + %164 = fcmp uno float %153, 0.000000e+00, !dbg !41 + %165 = xor i1 %164, true, !dbg !42 + %166 = and i1 %163, %165, !dbg !43 + %167 = or i1 %161, %166, !dbg !44 + %168 = and i1 %163, %164, !dbg !51 + %169 = or i1 %162, %168, !dbg !52 + %170 = icmp slt i64 %150, %160, !dbg !45 + %171 = and i1 %169, %170, !dbg !46 + %172 = or i1 %167, %171, !dbg !47 + %173 = select i1 %172, float %149, float %153, !dbg !48 + %174 = select i1 %172, i64 %150, i64 %160, !dbg !49 + %175 = bitcast float %173 to i32, !dbg !53 + %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 8, i32 31), !dbg !53 + %177 = bitcast i32 %176 to float, !dbg !53 + %extelt.offset1 = lshr i64 %174, 32, !dbg !53 + %178 = trunc nuw i64 %extelt.offset1 to i32, !dbg !53 + %179 = trunc i64 %174 to i32, !dbg !53 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !53 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 8, i32 31), !dbg !53 + %182 = insertelement <2 x i32> poison, i32 %180, i64 0, !dbg !53 + %183 = insertelement <2 x i32> %182, i32 %181, i64 1, !dbg !53 + %184 = bitcast <2 x i32> %183 to i64, !dbg !53 + %185 = fcmp ogt float %173, %177, !dbg !38 + %186 = fcmp oeq float %173, %177, !dbg !50 + %187 = fcmp uno float %173, 0.000000e+00, !dbg !40 + %188 = fcmp uno float %177, 0.000000e+00, !dbg !41 + %189 = xor i1 %188, true, !dbg !42 + %190 = and i1 %187, %189, !dbg !43 + %191 = or i1 %185, %190, !dbg !44 + %192 = and i1 %188, %187, !dbg !51 + %193 = or i1 %186, %192, !dbg !52 + %194 = icmp slt i64 %174, %184, !dbg !45 + %195 = and i1 %194, %193, !dbg !46 + %196 = or i1 %191, %195, !dbg !47 + %197 = select i1 %196, float %173, float %177, !dbg !48 + %198 = select i1 %196, i64 %174, i64 %184, !dbg !49 + %199 = bitcast float %197 to i32, !dbg !53 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 4, i32 31), !dbg !53 + %201 = bitcast i32 %200 to float, !dbg !53 + %extelt.offset2 = lshr i64 %198, 32, !dbg !53 + %202 = trunc nuw i64 %extelt.offset2 to i32, !dbg !53 + %203 = trunc i64 %198 to i32, !dbg !53 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !53 + %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !53 + %206 = insertelement <2 x i32> poison, i32 %204, i64 0, !dbg !53 + %207 = insertelement <2 x i32> %206, i32 %205, i64 1, !dbg !53 + %208 = bitcast <2 x i32> %207 to i64, !dbg !53 + %209 = fcmp ogt float %197, %201, !dbg !38 + %210 = fcmp oeq float %197, %201, !dbg !50 + %211 = fcmp uno float %197, 0.000000e+00, !dbg !40 + %212 = fcmp uno float %201, 0.000000e+00, !dbg !41 + %213 = xor i1 %212, true, !dbg !42 + %214 = and i1 %211, %213, !dbg !43 + %215 = or i1 %209, %214, !dbg !44 + %216 = and i1 %212, %211, !dbg !51 + %217 = or i1 %210, %216, !dbg !52 + %218 = icmp slt i64 %198, %208, !dbg !45 + %219 = and i1 %218, %217, !dbg !46 + %220 = or i1 %215, %219, !dbg !47 + %221 = select i1 %220, float %197, float %201, !dbg !48 + %222 = select i1 %220, i64 %198, i64 %208, !dbg !49 + %223 = bitcast float %221 to i32, !dbg !53 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !53 + %225 = bitcast i32 %224 to float, !dbg !53 + %extelt.offset3 = lshr i64 %222, 32, !dbg !53 + %226 = trunc nuw i64 %extelt.offset3 to i32, !dbg !53 + %227 = trunc i64 %222 to i32, !dbg !53 + %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 2, i32 31), !dbg !53 + %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !53 + %230 = insertelement <2 x i32> poison, i32 %228, i64 0, !dbg !53 + %231 = insertelement <2 x i32> %230, i32 %229, i64 1, !dbg !53 + %232 = bitcast <2 x i32> %231 to i64, !dbg !53 + %233 = fcmp ogt float %221, %225, !dbg !38 + %234 = fcmp oeq float %221, %225, !dbg !50 + %235 = fcmp uno float %221, 0.000000e+00, !dbg !40 + %236 = fcmp uno float %225, 0.000000e+00, !dbg !41 + %237 = xor i1 %236, true, !dbg !42 + %238 = and i1 %235, %237, !dbg !43 + %239 = or i1 %233, %238, !dbg !44 + %240 = and i1 %236, %235, !dbg !51 + %241 = or i1 %234, %240, !dbg !52 + %242 = icmp slt i64 %222, %232, !dbg !45 + %243 = and i1 %242, %241, !dbg !46 + %244 = or i1 %239, %243, !dbg !47 + %245 = select i1 %244, float %221, float %225, !dbg !48 + %246 = select i1 %244, i64 %222, i64 %232, !dbg !49 + %247 = bitcast float %245 to i32, !dbg !53 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 1, i32 31), !dbg !53 + %249 = bitcast i32 %248 to float, !dbg !53 + %extelt.offset4 = lshr i64 %246, 32, !dbg !53 + %250 = trunc nuw i64 %extelt.offset4 to i32, !dbg !53 + %251 = trunc i64 %246 to i32, !dbg !53 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !53 + %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %250, i32 1, i32 31), !dbg !53 + %254 = insertelement <2 x i32> poison, i32 %252, i64 0, !dbg !53 + %255 = insertelement <2 x i32> %254, i32 %253, i64 1, !dbg !53 + %256 = bitcast <2 x i32> %255 to i64, !dbg !53 + %257 = fcmp ogt float %245, %249, !dbg !38 + %258 = fcmp oeq float %245, %249, !dbg !50 + %259 = fcmp uno float %245, 0.000000e+00, !dbg !40 + %260 = fcmp uno float %249, 0.000000e+00, !dbg !41 + %261 = xor i1 %260, true, !dbg !42 + %262 = and i1 %259, %261, !dbg !43 + %263 = or i1 %257, %262, !dbg !44 + %264 = and i1 %260, %259, !dbg !51 + %265 = or i1 %258, %264, !dbg !52 + %266 = icmp slt i64 %246, %256, !dbg !45 + %267 = and i1 %266, %265, !dbg !46 + %268 = or i1 %263, %267, !dbg !47 + %269 = select i1 %268, i64 %246, i64 %256, !dbg !49 + %270 = and i32 %98, 15, !dbg !53 + %271 = icmp eq i32 %97, 0, !dbg !53 + %272 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %270, !dbg !53 + %273 = select i1 %268, i32 %247, i32 %248, !dbg !48 + %274 = insertelement <1 x i32> poison, i32 %273, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %272, <1 x i32> %274, i1 %271) #5, !dbg !53 + %275 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %270, !dbg !53 + %276 = insertelement <1 x i64> poison, i64 %269, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %275, <1 x i64> %276, i1 %271) #5, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %277 = icmp samesign ult i32 %11, 16, !dbg !53 + %278 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 128), i32 %11, !dbg !53 + %279 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %278, i1 %277) #5, !dbg !53 + %280 = bitcast i32 %279 to float, !dbg !53 + %281 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !53 + %282 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %281, i1 %277) #5, !dbg !53 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 8, i32 31), !dbg !53 + %284 = bitcast i32 %283 to float, !dbg !53 + %extelt.offset5 = lshr i64 %282, 32, !dbg !53 + %285 = trunc nuw i64 %extelt.offset5 to i32, !dbg !53 + %286 = trunc i64 %282 to i32, !dbg !53 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 8, i32 31), !dbg !53 + %288 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 8, i32 31), !dbg !53 + %289 = insertelement <2 x i32> poison, i32 %287, i64 0, !dbg !53 + %290 = insertelement <2 x i32> %289, i32 %288, i64 1, !dbg !53 + %291 = bitcast <2 x i32> %290 to i64, !dbg !53 + %292 = fcmp ogt float %280, %284, !dbg !38 + %293 = fcmp oeq float %280, %284, !dbg !50 + %294 = fcmp uno float %280, 0.000000e+00, !dbg !40 + %295 = fcmp uno float %284, 0.000000e+00, !dbg !41 + %296 = xor i1 %295, true, !dbg !42 + %297 = and i1 %294, %296, !dbg !43 + %298 = or i1 %292, %297, !dbg !44 + %299 = and i1 %294, %295, !dbg !51 + %300 = or i1 %293, %299, !dbg !52 + %301 = icmp slt i64 %282, %291, !dbg !45 + %302 = and i1 %300, %301, !dbg !46 + %303 = or i1 %298, %302, !dbg !47 + %304 = select i1 %303, float %280, float %284, !dbg !48 + %305 = select i1 %303, i64 %282, i64 %291, !dbg !49 + %306 = bitcast float %304 to i32, !dbg !53 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 4, i32 31), !dbg !53 + %308 = bitcast i32 %307 to float, !dbg !53 + %extelt.offset6 = lshr i64 %305, 32, !dbg !53 + %309 = trunc nuw i64 %extelt.offset6 to i32, !dbg !53 + %310 = trunc i64 %305 to i32, !dbg !53 + %311 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %310, i32 4, i32 31), !dbg !53 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 4, i32 31), !dbg !53 + %313 = insertelement <2 x i32> poison, i32 %311, i64 0, !dbg !53 + %314 = insertelement <2 x i32> %313, i32 %312, i64 1, !dbg !53 + %315 = bitcast <2 x i32> %314 to i64, !dbg !53 + %316 = fcmp ogt float %304, %308, !dbg !38 + %317 = fcmp oeq float %304, %308, !dbg !50 + %318 = fcmp uno float %304, 0.000000e+00, !dbg !40 + %319 = fcmp uno float %308, 0.000000e+00, !dbg !41 + %320 = xor i1 %319, true, !dbg !42 + %321 = and i1 %318, %320, !dbg !43 + %322 = or i1 %316, %321, !dbg !44 + %323 = and i1 %319, %318, !dbg !51 + %324 = or i1 %317, %323, !dbg !52 + %325 = icmp slt i64 %305, %315, !dbg !45 + %326 = and i1 %325, %324, !dbg !46 + %327 = or i1 %322, %326, !dbg !47 + %328 = select i1 %327, float %304, float %308, !dbg !48 + %329 = select i1 %327, i64 %305, i64 %315, !dbg !49 + %330 = bitcast float %328 to i32, !dbg !53 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %330, i32 2, i32 31), !dbg !53 + %332 = bitcast i32 %331 to float, !dbg !53 + %extelt.offset7 = lshr i64 %329, 32, !dbg !53 + %333 = trunc nuw i64 %extelt.offset7 to i32, !dbg !53 + %334 = trunc i64 %329 to i32, !dbg !53 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 2, i32 31), !dbg !53 + %336 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 2, i32 31), !dbg !53 + %337 = insertelement <2 x i32> poison, i32 %335, i64 0, !dbg !53 + %338 = insertelement <2 x i32> %337, i32 %336, i64 1, !dbg !53 + %339 = bitcast <2 x i32> %338 to i64, !dbg !53 + %340 = fcmp ogt float %328, %332, !dbg !38 + %341 = fcmp oeq float %328, %332, !dbg !50 + %342 = fcmp uno float %328, 0.000000e+00, !dbg !40 + %343 = fcmp uno float %332, 0.000000e+00, !dbg !41 + %344 = xor i1 %343, true, !dbg !42 + %345 = and i1 %342, %344, !dbg !43 + %346 = or i1 %340, %345, !dbg !44 + %347 = and i1 %343, %342, !dbg !51 + %348 = or i1 %341, %347, !dbg !52 + %349 = icmp slt i64 %329, %339, !dbg !45 + %350 = and i1 %349, %348, !dbg !46 + %351 = or i1 %346, %350, !dbg !47 + %352 = select i1 %351, float %328, float %332, !dbg !48 + %353 = select i1 %351, i64 %329, i64 %339, !dbg !49 + %354 = bitcast float %352 to i32, !dbg !53 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 1, i32 31), !dbg !53 + %356 = bitcast i32 %355 to float, !dbg !53 + %extelt.offset8 = lshr i64 %353, 32, !dbg !53 + %357 = trunc nuw i64 %extelt.offset8 to i32, !dbg !53 + %358 = trunc i64 %353 to i32, !dbg !53 + %359 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %358, i32 1, i32 31), !dbg !53 + %360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 1, i32 31), !dbg !53 + %361 = insertelement <2 x i32> poison, i32 %359, i64 0, !dbg !53 + %362 = insertelement <2 x i32> %361, i32 %360, i64 1, !dbg !53 + %363 = bitcast <2 x i32> %362 to i64, !dbg !53 + %364 = fcmp ogt float %352, %356, !dbg !38 + %365 = fcmp oeq float %352, %356, !dbg !50 + %366 = fcmp uno float %352, 0.000000e+00, !dbg !40 + %367 = fcmp uno float %356, 0.000000e+00, !dbg !41 + %368 = xor i1 %367, true, !dbg !42 + %369 = and i1 %366, %368, !dbg !43 + %370 = or i1 %364, %369, !dbg !44 + %371 = and i1 %367, %366, !dbg !51 + %372 = or i1 %365, %371, !dbg !52 + %373 = icmp slt i64 %353, %363, !dbg !45 + %374 = and i1 %373, %372, !dbg !46 + %375 = or i1 %370, %374, !dbg !47 + %376 = select i1 %375, i64 %353, i64 %363, !dbg !49 + %377 = icmp eq i32 %11, 0, !dbg !53 + %378 = select i1 %375, i32 %354, i32 %355, !dbg !48 + %379 = insertelement <1 x i32> poison, i32 %378, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %278, <1 x i32> %379, i1 %377) #5, !dbg !53 + %380 = insertelement <1 x i64> poison, i64 %376, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %281, <1 x i64> %380, i1 %377) #5, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %381 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !53 + %382 = getelementptr i64, ptr addrspace(1) %3, i64 %10, !dbg !54 + %383 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !55 + %384 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l"(ptr addrspace(1) %382, i64 %383) #5, !dbg !55 + %385 = add i64 %381, 151936, !dbg !56 + %386 = icmp slt i64 %381, 0, !dbg !57 + %387 = select i1 %386, i64 %385, i64 %381, !dbg !58 + %388 = icmp ugt i64 %387, 151935, !dbg !59 + br i1 %388, label %389, label %390, !dbg !60 + +389: ; preds = %96 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 51, ptr nonnull @assertFunc_0, i64 1), !dbg !60 + unreachable, !dbg !60 + +390: ; preds = %96 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %391 = getelementptr i1, ptr addrspace(1) %2, i64 %387, !dbg !61 + %392 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !62 + %393 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b8 { $0 }, [ $1 + 0 ], $2;", "=c,l,l"(ptr addrspace(1) %391, i64 %392) #5, !dbg !62 + %.not = icmp eq i8 %393, 0, !dbg !62 + %394 = select i1 %.not, i64 0, i64 %384, !dbg !63 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !64 + %395 = getelementptr i64, ptr addrspace(1) %0, i64 %10, !dbg !65 + %396 = and i32 %11, 511, !dbg !66 + %397 = icmp eq i32 %396, 0, !dbg !66 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %394, ptr addrspace(1) %395, i1 %397) #5, !dbg !66 + ret void, !dbg !67 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="512" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", linkageName: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 23, column: 28, scope: !9) +!11 = !DILocation(line: 23, column: 34, scope: !9) +!12 = !DILocation(line: 26, column: 37, scope: !9) +!13 = !DILocation(line: 31, column: 40, scope: !9) +!14 = !DILocation(line: 37, column: 53, scope: !9) +!15 = !DILocation(line: 147, column: 29, scope: !16, inlinedAt: !18) +!16 = distinct !DILexicalBlockFile(scope: !9, file: !17, discriminator: 0) +!17 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!18 = !DILocation(line: 40, column: 38, scope: !9) +!19 = !DILocation(line: 32, column: 31, scope: !9) +!20 = !DILocation(line: 33, column: 29, scope: !9) +!21 = !DILocation(line: 37, column: 34, scope: !9) +!22 = !DILocation(line: 37, column: 107, scope: !9) +!23 = !DILocation(line: 144, column: 21, scope: !16, inlinedAt: !18) +!24 = !DILocation(line: 145, column: 23, scope: !16, inlinedAt: !18) +!25 = !DILocation(line: 148, column: 29, scope: !16, inlinedAt: !18) +!26 = !DILocation(line: 149, column: 31, scope: !16, inlinedAt: !18) +!27 = !DILocation(line: 149, column: 27, scope: !16, inlinedAt: !18) +!28 = !DILocation(line: 149, column: 16, scope: !16, inlinedAt: !18) +!29 = !DILocation(line: 151, column: 27, scope: !16, inlinedAt: !18) +!30 = !DILocation(line: 151, column: 17, scope: !16, inlinedAt: !18) +!31 = !DILocation(line: 154, column: 31, scope: !16, inlinedAt: !18) +!32 = !DILocation(line: 154, column: 21, scope: !16, inlinedAt: !18) +!33 = !DILocation(line: 154, column: 12, scope: !16, inlinedAt: !18) +!34 = !DILocation(line: 155, column: 35, scope: !16, inlinedAt: !18) +!35 = !DILocation(line: 155, column: 69, scope: !16, inlinedAt: !18) +!36 = !DILocation(line: 42, column: 46, scope: !9) +!37 = !DILocation(line: 43, column: 58, scope: !9) +!38 = !DILocation(line: 144, column: 21, scope: !16, inlinedAt: !39) +!39 = !DILocation(line: 44, column: 75, scope: !9) +!40 = !DILocation(line: 147, column: 29, scope: !16, inlinedAt: !39) +!41 = !DILocation(line: 148, column: 29, scope: !16, inlinedAt: !39) +!42 = !DILocation(line: 149, column: 31, scope: !16, inlinedAt: !39) +!43 = !DILocation(line: 149, column: 27, scope: !16, inlinedAt: !39) +!44 = !DILocation(line: 149, column: 16, scope: !16, inlinedAt: !39) +!45 = !DILocation(line: 154, column: 31, scope: !16, inlinedAt: !39) +!46 = !DILocation(line: 154, column: 21, scope: !16, inlinedAt: !39) +!47 = !DILocation(line: 154, column: 12, scope: !16, inlinedAt: !39) +!48 = !DILocation(line: 155, column: 35, scope: !16, inlinedAt: !39) +!49 = !DILocation(line: 155, column: 69, scope: !16, inlinedAt: !39) +!50 = !DILocation(line: 145, column: 23, scope: !16, inlinedAt: !39) +!51 = !DILocation(line: 151, column: 27, scope: !16, inlinedAt: !39) +!52 = !DILocation(line: 151, column: 17, scope: !16, inlinedAt: !39) +!53 = !DILocation(line: 165, column: 42, scope: !16, inlinedAt: !39) +!54 = !DILocation(line: 46, column: 31, scope: !9) +!55 = !DILocation(line: 46, column: 36, scope: !9) +!56 = !DILocation(line: 48, column: 18, scope: !9) +!57 = !DILocation(line: 49, column: 18, scope: !9) +!58 = !DILocation(line: 50, column: 32, scope: !9) +!59 = !DILocation(line: 51, column: 36, scope: !9) +!60 = !DILocation(line: 51, column: 52, scope: !9) +!61 = !DILocation(line: 52, column: 30, scope: !9) +!62 = !DILocation(line: 52, column: 37, scope: !9) +!63 = !DILocation(line: 55, column: 20, scope: !9) +!64 = !DILocation(line: 56, column: 4, scope: !9) +!65 = !DILocation(line: 57, column: 28, scope: !9) +!66 = !DILocation(line: 57, column: 40, scope: !9) +!67 = !DILocation(line: 57, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..215e3a42147008edd830fc8c1eb7dc23543c20f5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx @@ -0,0 +1,987 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 // -- Begin function triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 122, 119, 47, 99, 122, 119, 104, 54, 116, 103, 107, 113, 54, 115, 99, 100, 115, 116, 103, 122, 117, 101, 98, 51, 103, 111, 113, 113, 110, 108, 108, 110, 100, 105, 107, 111, 97, 115, 106, 50, 105, 50, 105, 101, 104, 117, 50, 113, 121, 118, 111, 99, 99, 119, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 54, 32, 60, 32, 49, 53, 49, 57, 51, 54}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.visible .entry triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3, + .param .u64 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_4, + .param .u64 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<205>; + .reg .b16 %rs<7>; + .reg .b32 %r<104>; + .reg .b64 %rd<118>; + .loc 1 18 0 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:18:0 + +// %bb.0: + ld.param.b64 %rd22, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3]; + ld.param.b64 %rd21, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2]; + ld.param.b64 %rd20, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0]; + ld.param.b64 %rd28, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1]; +$L__tmp0: + .loc 1 23 28 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:23:28 + mov.u32 %r7, %ctaid.x; + .loc 1 23 34 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:23:34 + cvt.u64.u32 %rd1, %r7; + .loc 1 26 37 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:26:37 + mov.u32 %r1, %tid.x; + shl.b32 %r8, %r1, 2; + and.b32 %r9, %r8, 2044; + .loc 1 31 40 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:31:40 + cvt.u64.u32 %rd2, %r9; + and.b32 %r10, %r1, 511; + mul.wide.u32 %rd29, %r10, 8; + mad.wide.u32 %rd30, %r7, 303872, %rd29; + add.s64 %rd111, %rd28, %rd30; + mov.b32 %r102, 0fFF800000; + mov.b64 %rd115, {%r102, %r102}; + mov.b64 %rd113, 9223372036854775807; + mov.b64 %rd112, -2048; + mov.b64 %rd114, %rd113; + mov.b32 %r103, %r102; + mov.b64 %rd116, %rd113; + mov.b64 %rd117, %rd113; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 37 53 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:37:53 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + mov.b64 {%r15, %r16}, %rd115; + setp.nan.f32 %p2, %r15, %r15; + setp.nan.f32 %p3, %r16, %r16; + setp.nan.f32 %p4, %r102, %r102; + setp.nan.f32 %p5, %r103, %r103; +$L__tmp2: + .loc 1 32 31 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:32:31 + add.s64 %rd34, %rd2, %rd112; + add.s64 %rd35, %rd34, 2048; + add.s64 %rd36, %rd34, 2049; + add.s64 %rd37, %rd34, 2050; + .loc 1 33 29 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:33:29 + add.s64 %rd38, %rd34, 2051; + setp.lt.u64 %p1, %rd35, 151936; + mov.b32 %r13, 0; + .loc 1 37 53 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:37:53 + // begin inline asm + mov.u32 %r11, %r13; + mov.u32 %r12, %r13; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r11, %r12 }, [ %rd111 + 0 ], %rd31; + // end inline asm + mov.b32 {%rs1, %rs2}, %r12; + .loc 1 37 107 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:37:107 + mov.b32 {%rs3, %rs4}, %r11; + cvt.f32.bf16 %r17, %rs3; + cvt.f32.bf16 %r18, %rs4; + cvt.f32.bf16 %r19, %rs1; + cvt.f32.bf16 %r20, %rs2; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + setp.gt.f32 %p6, %r16, %r18; + setp.gt.f32 %p7, %r15, %r17; + setp.gt.f32 %p8, %r102, %r19; + setp.gt.f32 %p9, %r103, %r20; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + setp.eq.f32 %p10, %r15, %r17; + setp.eq.f32 %p11, %r16, %r18; + setp.eq.f32 %p12, %r102, %r19; + setp.eq.f32 %p13, %r103, %r20; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + setp.nan.bf16x2 %p14|%p15, %r11, %r13; + setp.num.bf16x2 %p16|%p17, %r11, %r13; + setp.nan.bf16 %p18, %rs1, %rs1; + setp.num.bf16 %p19, %rs1, %rs1; + setp.nan.bf16 %p20, %rs2, %rs2; + setp.num.bf16 %p21, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + and.pred %p22, %p3, %p17; + and.pred %p23, %p2, %p16; + and.pred %p24, %p4, %p19; + and.pred %p25, %p5, %p21; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + or.pred %p26, %p7, %p23; + or.pred %p27, %p6, %p22; + or.pred %p28, %p8, %p24; + or.pred %p29, %p9, %p25; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + and.pred %p30, %p2, %p14; + and.pred %p31, %p3, %p15; + and.pred %p32, %p4, %p18; + and.pred %p33, %p5, %p20; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + or.pred %p34, %p11, %p31; + or.pred %p35, %p10, %p30; + or.pred %p36, %p12, %p32; + or.pred %p37, %p13, %p33; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + setp.lt.s64 %p38, %rd116, %rd35; + setp.le.s64 %p39, %rd117, %rd35; + setp.lt.s64 %p40, %rd113, %rd37; + setp.lt.s64 %p41, %rd114, %rd38; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + and.pred %p42, %p38, %p35; + and.pred %p43, %p39, %p34; + and.pred %p44, %p40, %p36; + and.pred %p45, %p41, %p37; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + or.pred %p46, %p27, %p43; + or.pred %p47, %p26, %p42; + or.pred %p48, %p28, %p44; + or.pred %p49, %p29, %p45; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + selp.f32 %r21, %r15, %r17, %p47; + selp.f32 %r22, %r16, %r18, %p46; + selp.f32 %r23, %r102, %r19, %p48; + selp.f32 %r24, %r103, %r20, %p49; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:40:38 ] + selp.b64 %rd39, %rd116, %rd35, %p47; + selp.b64 %rd40, %rd117, %rd36, %p46; + selp.b64 %rd41, %rd113, %rd37, %p48; + selp.b64 %rd42, %rd114, %rd38, %p49; +$L__tmp4: + .loc 1 42 46 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:42:46 + selp.f32 %r25, %r22, %r16, %p1; + selp.f32 %r26, %r21, %r15, %p1; + mov.b64 %rd115, {%r26, %r25}; + selp.f32 %r102, %r23, %r102, %p1; + selp.f32 %r103, %r24, %r103, %p1; + .loc 1 43 58 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:43:58 + selp.b64 %rd117, %rd40, %rd117, %p1; + selp.b64 %rd116, %rd39, %rd116, %p1; + selp.b64 %rd113, %rd41, %rd113, %p1; + selp.b64 %rd114, %rd42, %rd114, %p1; + .loc 1 31 40 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:31:40 + add.s64 %rd112, %rd112, 2048; + add.s64 %rd111, %rd111, 4096; + setp.lt.u64 %p50, %rd112, 149888; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:26:37 + and.b32 %r36, %r1, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + mov.b64 {%r37, %r38}, %rd115; + setp.gt.f32 %p57, %r37, %r38; + setp.eq.f32 %p58, %r38, %r37; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p59, %r37, %r37; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.num.f32 %p60, %r38, %r38; + setp.nan.f32 %p61, %r38, %r38; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p62, %p59, %p61; + and.pred %p63, %p59, %p60; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p64, %p57, %p63; + or.pred %p65, %p58, %p62; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p66, %rd116, %rd117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p67, %p66, %p65; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p68, %p64, %p67; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r39, %r37, %r38, %p68; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd50, %rd116, %rd117, %p68; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p69, %r39, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p70, %r39, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p71, %r39, %r39; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p72, %r102, %r102; + setp.num.f32 %p73, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p74, %p71, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p75, %p69, %p74; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p76, %p72, %p71; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p77, %p70, %p76; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p78, %rd50, %rd113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p79, %p78, %p77; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p80, %p75, %p79; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r40, %r39, %r102, %p80; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd51, %rd50, %rd113, %p80; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p81, %r40, %r103; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p82, %r40, %r103; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p83, %r40, %r40; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p84, %r103, %r103; + setp.num.f32 %p85, %r103, %r103; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p86, %p83, %p85; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p87, %p81, %p86; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p88, %p84, %p83; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p89, %p82, %p88; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p90, %rd51, %rd114; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p91, %p90, %p89; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p92, %p87, %p91; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r41, %r40, %r103, %p92; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd52, %rd51, %rd114, %p92; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r42, %r41, 16, 31, -1; + mov.b64 {_, %r43}, %rd52; + cvt.u32.u64 %r44, %rd52; + shfl.sync.bfly.b32 %r45, %r44, 16, 31, -1; + shfl.sync.bfly.b32 %r46, %r43, 16, 31, -1; + cvt.u64.u32 %rd53, %r45; + cvt.u64.u32 %rd54, %r46; + shl.b64 %rd55, %rd54, 32; + or.b64 %rd56, %rd53, %rd55; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p93, %r41, %r42; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p94, %r41, %r42; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p95, %r41, %r41; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p96, %r42, %r42; + setp.num.f32 %p97, %r42, %r42; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p98, %p95, %p97; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p99, %p93, %p98; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p100, %p95, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p101, %p94, %p100; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p102, %rd52, %rd56; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p103, %p101, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p104, %p99, %p103; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r47, %r41, %r42, %p104; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd57, %rd52, %rd56, %p104; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r48, %r47, 8, 31, -1; + mov.b64 {_, %r49}, %rd57; + cvt.u32.u64 %r50, %rd57; + shfl.sync.bfly.b32 %r51, %r50, 8, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 8, 31, -1; + cvt.u64.u32 %rd58, %r51; + cvt.u64.u32 %rd59, %r52; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p105, %r47, %r48; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p106, %r47, %r48; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p107, %r47, %r47; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p108, %r48, %r48; + setp.num.f32 %p109, %r48, %r48; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p110, %p107, %p109; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p111, %p105, %p110; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p112, %p108, %p107; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p113, %p106, %p112; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p114, %rd57, %rd61; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p115, %p114, %p113; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p116, %p111, %p115; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r53, %r47, %r48, %p116; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd62, %rd57, %rd61, %p116; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r54, %r53, 4, 31, -1; + mov.b64 {_, %r55}, %rd62; + cvt.u32.u64 %r56, %rd62; + shfl.sync.bfly.b32 %r57, %r56, 4, 31, -1; + shfl.sync.bfly.b32 %r58, %r55, 4, 31, -1; + cvt.u64.u32 %rd63, %r57; + cvt.u64.u32 %rd64, %r58; + shl.b64 %rd65, %rd64, 32; + or.b64 %rd66, %rd63, %rd65; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p117, %r53, %r54; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p118, %r53, %r54; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p119, %r53, %r53; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p120, %r54, %r54; + setp.num.f32 %p121, %r54, %r54; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p122, %p119, %p121; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p123, %p117, %p122; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p124, %p120, %p119; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p125, %p118, %p124; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p126, %rd62, %rd66; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p127, %p126, %p125; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p128, %p123, %p127; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r59, %r53, %r54, %p128; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd67, %rd62, %rd66, %p128; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r60, %r59, 2, 31, -1; + mov.b64 {_, %r61}, %rd67; + cvt.u32.u64 %r62, %rd67; + shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1; + shfl.sync.bfly.b32 %r64, %r61, 2, 31, -1; + cvt.u64.u32 %rd68, %r63; + cvt.u64.u32 %rd69, %r64; + shl.b64 %rd70, %rd69, 32; + or.b64 %rd71, %rd68, %rd70; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p129, %r59, %r60; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p130, %r59, %r60; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p131, %r59, %r59; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p132, %r60, %r60; + setp.num.f32 %p133, %r60, %r60; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p134, %p131, %p133; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p135, %p129, %p134; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p136, %p132, %p131; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p137, %p130, %p136; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p138, %rd67, %rd71; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p139, %p138, %p137; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p140, %p135, %p139; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r65, %r59, %r60, %p140; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd72, %rd67, %rd71, %p140; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r66, %r65, 1, 31, -1; + mov.b64 {_, %r67}, %rd72; + cvt.u32.u64 %r68, %rd72; + shfl.sync.bfly.b32 %r69, %r68, 1, 31, -1; + shfl.sync.bfly.b32 %r70, %r67, 1, 31, -1; + cvt.u64.u32 %rd73, %r69; + cvt.u64.u32 %rd74, %r70; + shl.b64 %rd75, %rd74, 32; + or.b64 %rd76, %rd73, %rd75; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p141, %r65, %r66; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p142, %r65, %r66; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p143, %r65, %r65; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p144, %r66, %r66; + setp.num.f32 %p145, %r66, %r66; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p146, %p143, %p145; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p147, %p141, %p146; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p148, %p144, %p143; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p149, %p142, %p148; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p150, %rd72, %rd76; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p151, %p150, %p149; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p152, %p147, %p151; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd43, %rd72, %rd76, %p152; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + bfe.u32 %r71, %r1, 5, 4; + setp.eq.b32 %p51, %r36, 0; + mov.b32 %r72, global_smem; + add.s32 %r73, %r72, 128; + shl.b32 %r74, %r71, 2; + add.s32 %r27, %r73, %r74; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b32 %r28, %r65, %r66, %p152; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r27 + 0 ], %r28; + // end inline asm + shl.b32 %r75, %r71, 3; + add.s32 %r29, %r72, %r75; + // begin inline asm + @%p51 st.shared.b64 [ %r29 + 0 ], %rd43; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r1, 16; + add.s32 %r31, %r73, %r8; + // begin inline asm + @%p53 ld.shared.b32 %r30, [ %r31 + 0 ]; + // end inline asm + shl.b32 %r77, %r1, 3; + add.s32 %r32, %r72, %r77; + // begin inline asm + @%p53 ld.shared.b64 %rd44, [ %r32 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r78, %r30, 8, 31, -1; + mov.b64 {_, %r79}, %rd44; + cvt.u32.u64 %r80, %rd44; + shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1; + shfl.sync.bfly.b32 %r82, %r79, 8, 31, -1; + cvt.u64.u32 %rd77, %r81; + cvt.u64.u32 %rd78, %r82; + shl.b64 %rd79, %rd78, 32; + or.b64 %rd80, %rd77, %rd79; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p153, %r30, %r78; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p154, %r30, %r78; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p155, %r30, %r30; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p156, %r78, %r78; + setp.num.f32 %p157, %r78, %r78; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p158, %p155, %p157; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p159, %p153, %p158; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p160, %p155, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p161, %p154, %p160; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p162, %rd44, %rd80; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p163, %p161, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p164, %p159, %p163; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r83, %r30, %r78, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd81, %rd44, %rd80, %p164; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1; + mov.b64 {_, %r85}, %rd81; + cvt.u32.u64 %r86, %rd81; + shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1; + shfl.sync.bfly.b32 %r88, %r85, 4, 31, -1; + cvt.u64.u32 %rd82, %r87; + cvt.u64.u32 %rd83, %r88; + shl.b64 %rd84, %rd83, 32; + or.b64 %rd85, %rd82, %rd84; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p165, %r83, %r84; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p166, %r83, %r84; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p167, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p168, %r84, %r84; + setp.num.f32 %p169, %r84, %r84; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p170, %p167, %p169; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p171, %p165, %p170; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p172, %p168, %p167; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p173, %p166, %p172; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p174, %rd81, %rd85; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p175, %p174, %p173; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p176, %p171, %p175; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r89, %r83, %r84, %p176; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd86, %rd81, %rd85, %p176; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r90, %r89, 2, 31, -1; + mov.b64 {_, %r91}, %rd86; + cvt.u32.u64 %r92, %rd86; + shfl.sync.bfly.b32 %r93, %r92, 2, 31, -1; + shfl.sync.bfly.b32 %r94, %r91, 2, 31, -1; + cvt.u64.u32 %rd87, %r93; + cvt.u64.u32 %rd88, %r94; + shl.b64 %rd89, %rd88, 32; + or.b64 %rd90, %rd87, %rd89; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p177, %r89, %r90; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p178, %r89, %r90; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p179, %r89, %r89; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p180, %r90, %r90; + setp.num.f32 %p181, %r90, %r90; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p182, %p179, %p181; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p183, %p177, %p182; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p184, %p180, %p179; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p185, %p178, %p184; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p186, %rd86, %rd90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p187, %p186, %p185; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p188, %p183, %p187; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.f32 %r95, %r89, %r90, %p188; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd91, %rd86, %rd90, %p188; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1; + mov.b64 {_, %r97}, %rd91; + cvt.u32.u64 %r98, %rd91; + shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1; + shfl.sync.bfly.b32 %r100, %r97, 1, 31, -1; + cvt.u64.u32 %rd92, %r99; + cvt.u64.u32 %rd93, %r100; + shl.b64 %rd94, %rd93, 32; + or.b64 %rd95, %rd92, %rd94; + .loc 2 144 21 // triton_helpers.py:144:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.gt.f32 %p189, %r95, %r96; + .loc 2 145 23 // triton_helpers.py:145:23 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.f32 %p190, %r95, %r96; + .loc 2 147 29 // triton_helpers.py:147:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p191, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.nan.f32 %p192, %r96, %r96; + setp.num.f32 %p193, %r96, %r96; + .loc 2 149 27 // triton_helpers.py:149:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p194, %p191, %p193; + .loc 2 149 16 // triton_helpers.py:149:16 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p195, %p189, %p194; + .loc 2 151 27 // triton_helpers.py:151:27 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p196, %p192, %p191; + .loc 2 151 17 // triton_helpers.py:151:17 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p197, %p190, %p196; + .loc 2 154 31 // triton_helpers.py:154:31 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.lt.s64 %p198, %rd91, %rd95; + .loc 2 154 21 // triton_helpers.py:154:21 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + and.pred %p199, %p198, %p197; + .loc 2 154 12 // triton_helpers.py:154:12 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + or.pred %p200, %p195, %p199; + .loc 2 155 69 // triton_helpers.py:155:69 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b64 %rd45, %rd91, %rd95, %p200; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + setp.eq.b32 %p55, %r1, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + selp.b32 %r34, %r95, %r96, %p200; + .loc 2 165 42 // triton_helpers.py:165:42 @[ czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:44:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r31 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p55 st.shared.b64 [ %r32 + 0 ], %rd45; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd96, [global_smem]; +$L__tmp6: + .loc 1 46 31 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:46:31 + shl.b64 %rd97, %rd1, 3; + add.s64 %rd48, %rd22, %rd97; + .loc 1 46 36 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:46:36 + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd47, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b64 { %rd47 }, [ %rd48 + 0 ], %rd46; + // end inline asm + .loc 1 48 18 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:48:18 + add.s64 %rd98, %rd96, 151936; + .loc 1 49 18 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:49:18 + setp.lt.s64 %p201, %rd96, 0; + .loc 1 50 32 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:50:32 + selp.b64 %rd19, %rd98, %rd96, %p201; + .loc 1 51 36 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:51:36 + setp.lt.u64 %p202, %rd19, 151936; + .loc 1 51 52 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:51:52 + @%p202 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + bar.sync 0; + .loc 1 52 30 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:52:30 + add.s64 %rd100, %rd21, %rd19; + .loc 1 52 37 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:52:37 + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b8 { %rs5 }, [ %rd100 + 0 ], %rd101; + // end inline asm + and.b16 %rs6, %rs5, 255; + setp.eq.b16 %p204, %rs6, 0; + .loc 1 55 20 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:55:20 + selp.b64 %rd102, 0, %rd47, %p204; + .loc 1 56 4 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:56:4 + bar.sync 0; + .loc 1 57 28 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:57:28 + add.s64 %rd103, %rd20, %rd97; + .loc 1 57 40 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:57:40 + setp.eq.b32 %p203, %r10, 0; + // begin inline asm + @%p203 st.global.b64 [ %rd103 + 0 ], { %rd102 }; + // end inline asm + .loc 1 57 4 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:57:4 + ret; +$L__BB0_3: + .loc 1 51 52 // czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py:51:52 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd105, assertFunc_0; + cvta.global.u64 %rd106, %rd105; + st.param.b64 [param3], %rd106; + mov.b64 %rd107, assertFile_0; + cvta.global.u64 %rd108, %rd107; + st.param.b64 [param1], %rd108; + mov.b64 %rd109, assertMessage_0; + cvta.global.u64 %rd110, %rd109; + st.param.b64 [param0], %rd110; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 263 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x100 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 122 +.b8 119 +.b8 104 +.b8 54 +.b8 116 +.b8 103 +.b8 107 +.b8 113 +.b8 54 +.b8 115 +.b8 99 +.b8 100 +.b8 115 +.b8 116 +.b8 103 +.b8 122 +.b8 117 +.b8 101 +.b8 98 +.b8 51 +.b8 103 +.b8 111 +.b8 113 +.b8 113 +.b8 110 +.b8 108 +.b8 108 +.b8 110 +.b8 100 +.b8 105 +.b8 107 +.b8 111 +.b8 97 +.b8 115 +.b8 106 +.b8 50 +.b8 105 +.b8 50 +.b8 105 +.b8 101 +.b8 104 +.b8 117 +.b8 50 +.b8 113 +.b8 121 +.b8 118 +.b8 111 +.b8 99 +.b8 99 +.b8 119 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 122 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x39 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc4:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd9:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf1:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..d5e81441b36e2213b0268071ae89ae4f5c960ffe --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source @@ -0,0 +1,363 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":18:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc61 = loc(unknown) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc86 = loc("in_out_ptr0"(#loc)) +#loc87 = loc("in_ptr0"(#loc)) +#loc88 = loc("in_ptr1"(#loc)) +#loc89 = loc("in_ptr2"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc129 = loc("a_value"(#loc49)) +#loc130 = loc("a_index"(#loc49)) +#loc131 = loc("b_value"(#loc49)) +#loc132 = loc("b_index"(#loc49)) +#loc145 = loc("x"(#loc69)) +#loc146 = loc("x"(#loc73)) +#loc147 = loc("value"(#loc82)) +#loc148 = loc("index"(#loc82)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i64 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i64 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc92) + %r0_numel_1 = arith.constant 151936 : i32 loc(#loc93) + %xoffset = tt.get_program_id x : i32 loc(#loc94) + %xoffset_2 = arith.extsi %xoffset : i32 to i64 loc(#loc95) + %xoffset_3 = arith.constant 1 : i32 loc(#loc96) + %xoffset_4 = arith.constant 1 : i64 loc(#loc96) + %xoffset_5 = arith.muli %xoffset_2, %xoffset_4 : i64 loc(#loc96) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc97) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc98) + %xindex_7 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc99) + %xindex_8 = tt.splat %xoffset_5 : i64 -> tensor<1x1xi64> loc(#loc100) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<1x1xi64> loc(#loc100) + %xmask = arith.constant true loc(#loc101) + %xmask_10 = arith.constant dense : tensor<1x2048xi1> loc(#loc101) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc102) + %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc103) + %r0_base_12 = arith.extsi %r0_base_11 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc104) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc105) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc105) + %_tmp2_index = arith.constant 9223372036854775807 : i64 loc(#loc106) + %_tmp2_index_14 = arith.constant dense<9223372036854775807> : tensor<1x2048xi64> loc(#loc106) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_30 = %_tmp2_13, %_tmp2_index_31 = %_tmp2_index_14) -> (tensor<1x2048xf32>, tensor<1x2048xi64>) : i32 { + %r0_index = arith.extsi %r0_offset : i32 to i64 loc(#loc108) + %r0_index_32 = tt.splat %r0_index : i64 -> tensor<1x2048xi64> loc(#loc108) + %r0_index_33 = arith.addi %r0_index_32, %r0_base_12 : tensor<1x2048xi64> loc(#loc108) + %r0_mask = arith.extsi %r0_numel_1 : i32 to i64 loc(#loc109) + %r0_mask_34 = tt.splat %r0_mask : i64 -> tensor<1x2048xi64> loc(#loc109) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask_34 : tensor<1x2048xi64> loc(#loc109) + %tmp0 = arith.constant 151936 : i32 loc(#loc110) + %tmp0_36 = arith.constant 151936 : i64 loc(#loc110) + %tmp0_37 = arith.constant dense<151936> : tensor<1x1xi64> loc(#loc110) + %tmp0_38 = arith.muli %tmp0_37, %xindex_9 : tensor<1x1xi64> loc(#loc110) + %tmp0_39 = tt.broadcast %tmp0_38 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc111) + %tmp0_40 = arith.addi %r0_index_33, %tmp0_39 : tensor<1x2048xi64> loc(#loc111) + %tmp0_41 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc112) + %tmp0_42 = tt.addptr %tmp0_41, %tmp0_40 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc112) + %tmp0_43 = arith.constant 0.000000e+00 : f32 loc(#loc113) + %tmp0_44 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc113) + %tmp0_45 = arith.truncf %tmp0_44 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc113) + %tmp0_46 = tt.load %tmp0_42, %r0_mask_35, %tmp0_45 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc113) + %tmp0_47 = arith.extf %tmp0_46 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc114) + %14:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i64S1_2048S_fp32S1_2048S_i64S1_2048S__(%_tmp2_30, %_tmp2_index_31, %tmp0_47, %r0_index_33) : (tensor<1x2048xf32>, tensor<1x2048xi64>, tensor<1x2048xf32>, tensor<1x2048xi64>) -> (tensor<1x2048xf32>, tensor<1x2048xi64>) loc(#loc24) + %_tmp2_48 = arith.select %r0_mask_35, %14#0, %_tmp2_30 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc115) + %_tmp2_index_49 = arith.select %r0_mask_35, %14#1, %_tmp2_index_31 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc116) + scf.yield %_tmp2_48, %_tmp2_index_49 : tensor<1x2048xf32>, tensor<1x2048xi64> loc(#loc27) + } loc(#loc149) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i64S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<1x2048xf32>, tensor<1x2048xi64>) -> (tensor<1xf32>, tensor<1xi64>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc117) + %tmp11 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc118) + %tmp11_16 = tt.addptr %tmp11, %xindex_9 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc118) + %tmp11_17 = tt.load %tmp11_16 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc119) + %tmp3 = arith.constant 151936 : i32 loc(#loc120) + %tmp3_18 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc120) + %tmp4 = arith.extsi %tmp3_18 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc121) + %tmp4_19 = arith.addi %tmp2, %tmp4 : tensor<1x1xi64> loc(#loc121) + %tmp5 = arith.constant 0 : i32 loc(#loc122) + %tmp5_20 = arith.extsi %tmp5 : i32 to i64 loc(#loc122) + %tmp5_21 = tt.splat %tmp5_20 : i64 -> tensor<1x1xi64> loc(#loc122) + %tmp5_22 = arith.cmpi slt, %tmp2, %tmp5_21 : tensor<1x1xi64> loc(#loc122) + %tmp6 = arith.select %tmp5_22, %tmp4_19, %tmp2 : tensor<1x1xi1>, tensor<1x1xi64> loc(#loc123) + %c0_i32_23 = arith.constant 0 : i32 loc(#loc36) + %5 = arith.extsi %c0_i32_23 : i32 to i64 loc(#loc36) + %6 = tt.splat %5 : i64 -> tensor<1x1xi64> loc(#loc36) + %7 = arith.cmpi sle, %6, %tmp6 : tensor<1x1xi64> loc(#loc36) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc37) + %8 = arith.extsi %c151936_i32 : i32 to i64 loc(#loc37) + %9 = tt.splat %8 : i64 -> tensor<1x1xi64> loc(#loc37) + %10 = arith.cmpi slt, %tmp6, %9 : tensor<1x1xi64> loc(#loc37) + %11 = arith.andi %7, %10 : tensor<1x1xi1> loc(#loc38) + tt.assert %11, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc39) + %tmp8 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc124) + %tmp8_24 = tt.addptr %tmp8, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc124) + %tmp8_25 = tt.bitcast %tmp8_24 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc125) + %tmp8_26 = tt.load %tmp8_25 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc125) + %tmp8_27 = arith.constant 0 : i8 loc(#loc125) + %tmp8_28 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc125) + %tmp8_29 = arith.cmpi ne, %tmp8_26, %tmp8_28 : tensor<1x1xi8> loc(#loc125) + %tmp9 = arith.extui %tmp8_29 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc126) + %tmp10 = arith.extsi %tmp9 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc127) + %tmp12 = arith.muli %tmp10, %tmp11_17 : tensor<1x1xi64> loc(#loc128) + gpu.barrier loc(#loc45) + %12 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc46) + %13 = tt.addptr %12, %xindex_9 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc46) + tt.store %13, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i64S1_2048S_fp32S1_2048S_i64S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc49)), %a_index: tensor<1x2048xi64> loc("a_index"(#loc49)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc49)), %b_index: tensor<1x2048xi64> loc("b_index"(#loc49))) -> (tensor<1x2048xf32>, tensor<1x2048xi64>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc150) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc151) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc52) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc135) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc136) + %mask_3 = arith.constant true loc(#loc137) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc137) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc137) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc138) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc152) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc140) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc153) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc153) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc61) + } loc(#loc53) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi64> loc(#loc142) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc143) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc144) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc65) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc66) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi64> loc(#loc67) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc68) + %5 = ub.poison : tensor<1x2048xi64> loc(#loc68) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi64> loc(#loc68) + } loc(#loc49) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc69))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc70) + %true = arith.constant true loc(#loc71) + tt.return %true : i1 loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc72) + tt.return %1 : i1 loc(#loc72) + } loc(#loc69) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc73))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc74) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc75) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc75) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc75) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc75) + tt.return %4 : tensor<1x2048xf32> loc(#loc76) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc77) + tt.return %5 : tensor<1x2048xf32> loc(#loc77) + } loc(#loc73) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc79) + %cst = arith.constant dense : tensor<1xi1> loc(#loc79) + tt.return %cst : tensor<1xi1> loc(#loc80) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc81) + tt.return %0 : tensor<1xi1> loc(#loc81) + } loc(#loc78) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i64S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc82)), %index: tensor<1x2048xi64> loc("index"(#loc82))) -> (tensor<1xf32>, tensor<1xi64>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i64 loc(unknown), %arg4: f32 loc(unknown), %arg5: i64 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i64_fp32_i64__(%arg2, %arg3, %arg4, %arg5) : (f32, i64, f32, i64) -> (f32, i64) loc(#loc83) + tt.reduce.return %3#0, %3#1 : f32, i64 loc(#loc83) + }) : (tensor<1x2048xf32>, tensor<1x2048xi64>) -> (tensor<1xf32>, tensor<1xi64>) loc(#loc83) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi64> loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc85) + %2 = ub.poison : tensor<1xi64> loc(#loc85) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi64> loc(#loc85) + } loc(#loc82) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i64_fp32_i64__(%a_value: f32 loc("a_value"(#loc49)), %a_index: i64 loc("a_index"(#loc49)), %b_value: f32 loc("b_value"(#loc49)), %b_index: i64 loc("b_index"(#loc49))) -> (f32, i64) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc150) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc151) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc52) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc135) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc136) + %mask_3 = arith.constant true loc(#loc137) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc137) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc138) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc152) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc140) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc153) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc153) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc61) + } loc(#loc53) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i64 loc(#loc142) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc143) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc144) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc65) + %3 = arith.select %mask_2, %a_index, %b_index : i64 loc(#loc66) + tt.return %2, %3 : f32, i64 loc(#loc67) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc68) + %5 = ub.poison : i64 loc(#loc68) + tt.return %4, %5 : f32, i64 loc(#loc68) + } loc(#loc49) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc69))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc70) + %true = arith.constant true loc(#loc71) + tt.return %true : i1 loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc72) + tt.return %1 : i1 loc(#loc72) + } loc(#loc69) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc73))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc74) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc75) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc75) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc75) + tt.return %3 : tensor<1xf32> loc(#loc76) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc77) + tt.return %4 : tensor<1xf32> loc(#loc77) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:34) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:46) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":24:56) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":25:46) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:49) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":29:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":30:67) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":31:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":32:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":33:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:48) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:53) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:107) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":40:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":42:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":44:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":45:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":47:40) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":48:18) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":49:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":50:32) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:27) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:43) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:52) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:30) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:37) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":53:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":54:20) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":55:20) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":56:4) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:28) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:40) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc92 = loc("xnumel"(#loc1)) +#loc93 = loc("r0_numel"(#loc2)) +#loc94 = loc("xoffset"(#loc3)) +#loc95 = loc("xoffset"(#loc4)) +#loc96 = loc("xoffset"(#loc5)) +#loc97 = loc("xindex"(#loc6)) +#loc98 = loc("xindex"(#loc7)) +#loc99 = loc("xindex"(#loc8)) +#loc100 = loc("xindex"(#loc9)) +#loc101 = loc("xmask"(#loc10)) +#loc102 = loc("r0_base"(#loc11)) +#loc103 = loc("r0_base"(#loc12)) +#loc104 = loc("r0_base"(#loc13)) +#loc105 = loc("_tmp2"(#loc14)) +#loc106 = loc("_tmp2_index"(#loc15)) +#loc107 = loc("_tmp2"(#loc16)) +#loc108 = loc("r0_index"(#loc17)) +#loc109 = loc("r0_mask"(#loc18)) +#loc110 = loc("tmp0"(#loc19)) +#loc111 = loc("tmp0"(#loc20)) +#loc112 = loc("tmp0"(#loc21)) +#loc113 = loc("tmp0"(#loc22)) +#loc114 = loc("tmp0"(#loc23)) +#loc115 = loc("_tmp2"(#loc25)) +#loc116 = loc("_tmp2_index"(#loc26)) +#loc117 = loc("tmp2"(#loc29)) +#loc118 = loc("tmp11"(#loc30)) +#loc119 = loc("tmp11"(#loc31)) +#loc120 = loc("tmp3"(#loc32)) +#loc121 = loc("tmp4"(#loc33)) +#loc122 = loc("tmp5"(#loc34)) +#loc123 = loc("tmp6"(#loc35)) +#loc124 = loc("tmp8"(#loc40)) +#loc125 = loc("tmp8"(#loc41)) +#loc126 = loc("tmp9"(#loc42)) +#loc127 = loc("tmp10"(#loc43)) +#loc128 = loc("tmp12"(#loc44)) +#loc133 = loc("mask"(#loc50)) +#loc134 = loc("equal"(#loc51)) +#loc135 = loc("a_isnan"(#loc54)) +#loc136 = loc("b_isnan"(#loc55)) +#loc137 = loc("mask"(#loc56)) +#loc138 = loc("mask"(#loc57)) +#loc139 = loc("mask"(#loc58)) +#loc140 = loc("equal"(#loc59)) +#loc141 = loc("equal"(#loc60)) +#loc142 = loc("mask"(#loc62)) +#loc143 = loc("mask"(#loc63)) +#loc144 = loc("mask"(#loc64)) +#loc149 = loc("_tmp2_index"(#loc107)) +#loc150 = loc("mask"(#loc133)) +#loc151 = loc("equal"(#loc134)) +#loc152 = loc("mask"(#loc139)) +#loc153 = loc("equal"(#loc141)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0a9125a52520501775fd2fa864b029a21046dc72 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir @@ -0,0 +1,235 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":44:75) +#loc53 = loc("in_out_ptr0"(#loc)) +#loc54 = loc("in_ptr0"(#loc)) +#loc55 = loc("in_ptr1"(#loc)) +#loc56 = loc("in_ptr2"(#loc)) +#loc57 = loc("xnumel"(#loc)) +#loc58 = loc("r0_numel"(#loc)) +#loc88 = loc(callsite(#loc1 at #loc33)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i64 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i64 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<151936> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<151936> : tensor<1x1xi64, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<9223372036854775807> : tensor<1x2048xi64, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<151936> : tensor<1x2048xi64, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x1xi8, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c151936_i64 = arith.constant 151936 : i64 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_7 = arith.constant dense : tensor<1x2048xi1, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<0> : tensor<1x1xi64, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc59) + %xoffset_9 = arith.extsi %xoffset : i32 to i64 loc(#loc60) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc61) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc61) + %r0_base_11 = arith.extsi %r0_base_10 : tensor<1x2048xi32, #blocked1> to tensor<1x2048xi64, #blocked1> loc(#loc62) + %tmp0 = arith.muli %xoffset_9, %c151936_i64 : i64 loc(#loc63) + %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked1> loc(#loc100) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked1> loc(#loc65) + %_tmp2_index:2 = scf.for %_tmp2_index_25 = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_26 = %cst_2) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked1>) : i32 { + %r0_index = arith.extsi %_tmp2_index_25 : i32 to i64 loc(#loc67) + %r0_index_27 = tt.splat %r0_index : i64 -> tensor<1x2048xi64, #blocked1> loc(#loc67) + %r0_index_28 = arith.addi %r0_index_27, %r0_base_11 : tensor<1x2048xi64, #blocked1> loc(#loc67) + %r0_mask = arith.cmpi slt, %r0_index_28, %cst_4 : tensor<1x2048xi64, #blocked1> loc(#loc68) + %tmp0_29 = arith.addi %r0_index_28, %tmp0_12 : tensor<1x2048xi64, #blocked1> loc(#loc64) + %tmp0_30 = tt.addptr %tmp0_13, %tmp0_29 : tensor<1x2048x!tt.ptr, #blocked1>, tensor<1x2048xi64, #blocked1> loc(#loc65) + %tmp0_31 = tt.load %tmp0_30, %r0_mask, %cst_6 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked1> loc(#loc69) + %tmp0_32 = arith.extf %tmp0_31 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1> loc(#loc70) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_32 : tensor<1x2048xf32, #blocked1> loc(#loc125) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_32 : tensor<1x2048xf32, #blocked1> loc(#loc126) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked1> loc(#loc104) + %b_isnan = arith.cmpf une, %tmp0_32, %tmp0_32 : tensor<1x2048xf32, #blocked1> loc(#loc105) + %mask_33 = arith.xori %b_isnan, %cst_7 : tensor<1x2048xi1, #blocked1> loc(#loc106) + %mask_34 = arith.andi %a_isnan, %mask_33 : tensor<1x2048xi1, #blocked1> loc(#loc107) + %mask_35 = arith.ori %mask, %mask_34 : tensor<1x2048xi1, #blocked1> loc(#loc127) + %equal_36 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked1> loc(#loc109) + %equal_37 = arith.ori %equal, %equal_36 : tensor<1x2048xi1, #blocked1> loc(#loc128) + %mask_38 = arith.cmpi slt, %_tmp2_index_26, %r0_index_28 : tensor<1x2048xi64, #blocked1> loc(#loc111) + %mask_39 = arith.andi %equal_37, %mask_38 : tensor<1x2048xi1, #blocked1> loc(#loc112) + %mask_40 = arith.ori %mask_35, %mask_39 : tensor<1x2048xi1, #blocked1> loc(#loc113) + %6 = arith.select %mask_40, %_tmp2, %tmp0_32 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc83) + %7 = arith.select %mask_40, %_tmp2_index_26, %r0_index_28 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi64, #blocked1> loc(#loc84) + %_tmp2_41 = arith.select %r0_mask, %6, %_tmp2 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc85) + %_tmp2_index_42 = arith.select %r0_mask, %7, %_tmp2_index_26 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi64, #blocked1> loc(#loc86) + scf.yield %_tmp2_41, %_tmp2_index_42 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked1> loc(#loc31) + } loc(#loc101) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc33)), %arg7: i64 loc(callsite(#loc1 at #loc33)), %arg8: f32 loc(callsite(#loc1 at #loc33)), %arg9: i64 loc(callsite(#loc1 at #loc33))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc129) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc130) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc114) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc115) + %mask_25 = arith.xori %b_isnan, %true : i1 loc(#loc116) + %mask_26 = arith.andi %a_isnan, %mask_25 : i1 loc(#loc117) + %mask_27 = arith.ori %mask, %mask_26 : i1 loc(#loc131) + %equal_28 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc118) + %equal_29 = arith.ori %equal, %equal_28 : i1 loc(#loc132) + %mask_30 = arith.cmpi slt, %arg7, %arg9 : i64 loc(#loc119) + %mask_31 = arith.andi %equal_29, %mask_30 : i1 loc(#loc120) + %mask_32 = arith.ori %mask_27, %mask_31 : i1 loc(#loc121) + %6 = arith.select %mask_32, %arg6, %arg8 : f32 loc(#loc122) + %7 = arith.select %mask_32, %arg7, %arg9 : i64 loc(#loc123) + tt.reduce.return %6, %7 : f32, i64 loc(#loc87) + }) : (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked1>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>, tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>>) loc(#loc87) + %tmp8 = ttg.convert_layout %0#1 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc89) + %tmp2 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc90) + %tmp2_14 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc90) + %tmp11 = tt.addptr %in_ptr2, %xoffset_9 : !tt.ptr, i64 loc(#loc91) + %tmp11_15 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc92) + %tmp11_16 = tt.load %tmp11_15 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc92) + %tmp4 = arith.addi %tmp2, %cst_0 : tensor<1x1xi64, #blocked> loc(#loc93) + %tmp4_17 = arith.addi %tmp2_14, %cst_1 : tensor<1x1xi64, #blocked1> loc(#loc93) + %tmp5 = arith.cmpi slt, %tmp2, %cst : tensor<1x1xi64, #blocked> loc(#loc94) + %tmp5_18 = arith.cmpi slt, %tmp2_14, %cst_8 : tensor<1x1xi64, #blocked1> loc(#loc94) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1, #blocked>, tensor<1x1xi64, #blocked> loc(#loc95) + %tmp6_19 = arith.select %tmp5_18, %tmp4_17, %tmp2_14 : tensor<1x1xi1, #blocked1>, tensor<1x1xi64, #blocked1> loc(#loc95) + %1 = arith.cmpi sge, %tmp6_19, %cst_8 : tensor<1x1xi64, #blocked1> loc(#loc41) + %2 = arith.cmpi slt, %tmp6_19, %cst_1 : tensor<1x1xi64, #blocked1> loc(#loc42) + %3 = arith.andi %1, %2 : tensor<1x1xi1, #blocked1> loc(#loc43) + tt.assert %3, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1, #blocked1> loc(#loc44) + %tmp8_20 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc96) + %tmp8_21 = tt.addptr %tmp8_20, %tmp6 : tensor<1x1x!tt.ptr, #blocked>, tensor<1x1xi64, #blocked> loc(#loc96) + %tmp8_22 = tt.bitcast %tmp8_21 : tensor<1x1x!tt.ptr, #blocked> -> tensor<1x1x!tt.ptr, #blocked> loc(#loc89) + %tmp8_23 = tt.load %tmp8_22 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc89) + %tmp8_24 = arith.cmpi ne, %tmp8_23, %cst_5 : tensor<1x1xi8, #blocked> loc(#loc89) + %tmp10 = arith.extui %tmp8_24 : tensor<1x1xi1, #blocked> to tensor<1x1xi64, #blocked> loc(#loc124) + %tmp12 = arith.muli %tmp10, %tmp11_16 : tensor<1x1xi64, #blocked> loc(#loc99) + gpu.barrier loc(#loc49) + %4 = tt.addptr %in_out_ptr0, %xoffset_9 : !tt.ptr, i64 loc(#loc50) + %5 = tt.splat %4 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc51) + tt.store %5, %tmp12 : tensor<1x1x!tt.ptr, #blocked> loc(#loc51) + tt.return loc(#loc52) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:49) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:48) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":31:40) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":32:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":33:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:53) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:107) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":40:38) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":42:46) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:58) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:37) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":45:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:31) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":48:18) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":49:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":50:32) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:27) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:43) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:36) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:52) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:30) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":54:20) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":53:19) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":55:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":56:4) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:40) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:4) +#loc59 = loc("xoffset"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("r0_base"(#loc4)) +#loc62 = loc("r0_base"(#loc5)) +#loc63 = loc("tmp0"(#loc6)) +#loc64 = loc("tmp0"(#loc7)) +#loc65 = loc("tmp0"(#loc8)) +#loc66 = loc("_tmp2"(#loc9)) +#loc67 = loc("r0_index"(#loc10)) +#loc68 = loc("r0_mask"(#loc11)) +#loc69 = loc("tmp0"(#loc12)) +#loc70 = loc("tmp0"(#loc13)) +#loc71 = loc("mask"(#loc14)) +#loc72 = loc("equal"(#loc16)) +#loc73 = loc("a_isnan"(#loc17)) +#loc74 = loc("b_isnan"(#loc18)) +#loc75 = loc("mask"(#loc19)) +#loc76 = loc("mask"(#loc20)) +#loc77 = loc("mask"(#loc21)) +#loc78 = loc("equal"(#loc22)) +#loc79 = loc("equal"(#loc23)) +#loc80 = loc("mask"(#loc24)) +#loc81 = loc("mask"(#loc25)) +#loc82 = loc("mask"(#loc26)) +#loc83 = loc(callsite(#loc27 at #loc15)) +#loc84 = loc(callsite(#loc28 at #loc15)) +#loc85 = loc("_tmp2"(#loc29)) +#loc86 = loc("_tmp2_index"(#loc30)) +#loc87 = loc(callsite(#loc32 at #loc33)) +#loc89 = loc("tmp8"(#loc34)) +#loc90 = loc("tmp2"(#loc35)) +#loc91 = loc("tmp11"(#loc36)) +#loc92 = loc("tmp11"(#loc37)) +#loc93 = loc("tmp4"(#loc38)) +#loc94 = loc("tmp5"(#loc39)) +#loc95 = loc("tmp6"(#loc40)) +#loc96 = loc("tmp8"(#loc45)) +#loc97 = loc("tmp10"(#loc46)) +#loc98 = loc("tmp9"(#loc47)) +#loc99 = loc("tmp12"(#loc48)) +#loc100 = loc(fused[#loc64, #loc63]) +#loc101 = loc("_tmp2_index"(#loc66)) +#loc102 = loc("mask"(#loc71)) +#loc103 = loc("equal"(#loc72)) +#loc104 = loc(callsite(#loc73 at #loc15)) +#loc105 = loc(callsite(#loc74 at #loc15)) +#loc106 = loc(callsite(#loc75 at #loc15)) +#loc107 = loc(callsite(#loc76 at #loc15)) +#loc108 = loc("mask"(#loc77)) +#loc109 = loc(callsite(#loc78 at #loc15)) +#loc110 = loc("equal"(#loc79)) +#loc111 = loc(callsite(#loc80 at #loc15)) +#loc112 = loc(callsite(#loc81 at #loc15)) +#loc113 = loc(callsite(#loc82 at #loc15)) +#loc114 = loc(callsite(#loc73 at #loc87)) +#loc115 = loc(callsite(#loc74 at #loc87)) +#loc116 = loc(callsite(#loc75 at #loc87)) +#loc117 = loc(callsite(#loc76 at #loc87)) +#loc118 = loc(callsite(#loc78 at #loc87)) +#loc119 = loc(callsite(#loc80 at #loc87)) +#loc120 = loc(callsite(#loc81 at #loc87)) +#loc121 = loc(callsite(#loc82 at #loc87)) +#loc122 = loc(callsite(#loc27 at #loc87)) +#loc123 = loc(callsite(#loc28 at #loc87)) +#loc124 = loc(fused[#loc97, #loc98]) +#loc125 = loc(callsite(#loc102 at #loc15)) +#loc126 = loc(callsite(#loc103 at #loc15)) +#loc127 = loc(callsite(#loc108 at #loc15)) +#loc128 = loc(callsite(#loc110 at #loc15)) +#loc129 = loc(callsite(#loc102 at #loc87)) +#loc130 = loc(callsite(#loc103 at #loc87)) +#loc131 = loc(callsite(#loc108 at #loc87)) +#loc132 = loc(callsite(#loc110 at #loc87)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..080c6d7cdd53d1906efb47c314f862b506b51b5e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CQLZ2TTWVWSLM6EFTVUG3BNKE2OJWFNEANR5XQZV7QCTDO565CDA/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir @@ -0,0 +1,232 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":44:75) +#loc56 = loc("in_out_ptr0"(#loc)) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("in_ptr1"(#loc)) +#loc59 = loc("in_ptr2"(#loc)) +#loc60 = loc("xnumel"(#loc)) +#loc61 = loc("r0_numel"(#loc)) +#loc62 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i64 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i64 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c151936_i64 = arith.constant 151936 : i64 loc(#loc1) + %true = arith.constant true loc(#loc62) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %cst_2 = arith.constant dense<151936> : tensor<1x2048xi64> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %tmp8 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc63) + %cst_3 = arith.constant dense<151936> : tensor<1x1xi64> loc(#loc1) + %_tmp2_index = arith.constant dense<9223372036854775807> : tensor<1x2048xi64> loc(#loc64) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc65) + %xoffset = tt.get_program_id x : i32 loc(#loc66) + %xoffset_4 = arith.extsi %xoffset : i32 to i64 loc(#loc67) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc68) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc69) + %r0_base_6 = arith.extsi %r0_base_5 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc70) + %_tmp2_index_7:2 = scf.for %r0_offset = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2_15 = %_tmp2, %_tmp2_index_16 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi64>) : i32 { + %r0_index = arith.extsi %r0_offset : i32 to i64 loc(#loc72) + %r0_index_17 = tt.splat %r0_index : i64 -> tensor<1x2048xi64> loc(#loc72) + %r0_index_18 = arith.addi %r0_index_17, %r0_base_6 : tensor<1x2048xi64> loc(#loc72) + %r0_mask = arith.cmpi slt, %r0_index_18, %cst_2 : tensor<1x2048xi64> loc(#loc73) + %tmp0 = arith.muli %xoffset_4, %c151936_i64 : i64 loc(#loc74) + %tmp0_19 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc107) + %tmp0_20 = arith.addi %r0_index_18, %tmp0_19 : tensor<1x2048xi64> loc(#loc75) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc76) + %tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc76) + %tmp0_23 = tt.load %tmp0_22, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc77) + %tmp0_24 = arith.extf %tmp0_23 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc78) + %mask = arith.cmpf ogt, %_tmp2_15, %tmp0_24 : tensor<1x2048xf32> loc(#loc131) + %equal = arith.cmpf oeq, %_tmp2_15, %tmp0_24 : tensor<1x2048xf32> loc(#loc132) + %a_isnan = arith.cmpf une, %_tmp2_15, %_tmp2_15 : tensor<1x2048xf32> loc(#loc110) + %b_isnan = arith.cmpf une, %tmp0_24, %tmp0_24 : tensor<1x2048xf32> loc(#loc111) + %mask_25 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc112) + %mask_26 = arith.andi %a_isnan, %mask_25 : tensor<1x2048xi1> loc(#loc113) + %mask_27 = arith.ori %mask, %mask_26 : tensor<1x2048xi1> loc(#loc133) + %equal_28 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc115) + %equal_29 = arith.ori %equal, %equal_28 : tensor<1x2048xi1> loc(#loc134) + %mask_30 = arith.cmpi slt, %_tmp2_index_16, %r0_index_18 : tensor<1x2048xi64> loc(#loc117) + %mask_31 = arith.andi %equal_29, %mask_30 : tensor<1x2048xi1> loc(#loc118) + %mask_32 = arith.ori %mask_27, %mask_31 : tensor<1x2048xi1> loc(#loc119) + %6 = arith.select %mask_32, %_tmp2_15, %tmp0_24 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc91) + %7 = arith.select %mask_32, %_tmp2_index_16, %r0_index_18 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc92) + %_tmp2_33 = arith.select %r0_mask, %6, %_tmp2_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc93) + %_tmp2_index_34 = arith.select %r0_mask, %7, %_tmp2_index_16 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc94) + scf.yield %_tmp2_33, %_tmp2_index_34 : tensor<1x2048xf32>, tensor<1x2048xi64> loc(#loc36) + } loc(#loc106) + %0:2 = "tt.reduce"(%_tmp2_index_7#0, %_tmp2_index_7#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i64 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i64 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc135) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc136) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc120) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc121) + %mask_15 = arith.xori %b_isnan, %true : i1 loc(#loc122) + %mask_16 = arith.andi %a_isnan, %mask_15 : i1 loc(#loc123) + %mask_17 = arith.ori %mask, %mask_16 : i1 loc(#loc137) + %equal_18 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc124) + %equal_19 = arith.ori %equal, %equal_18 : i1 loc(#loc138) + %mask_20 = arith.cmpi slt, %arg7, %arg9 : i64 loc(#loc125) + %mask_21 = arith.andi %equal_19, %mask_20 : i1 loc(#loc126) + %mask_22 = arith.ori %mask_17, %mask_21 : i1 loc(#loc127) + %6 = arith.select %mask_22, %arg6, %arg8 : f32 loc(#loc128) + %7 = arith.select %mask_22, %arg7, %arg9 : i64 loc(#loc129) + tt.reduce.return %6, %7 : f32, i64 loc(#loc95) + }) : (tensor<1x2048xf32>, tensor<1x2048xi64>) -> (tensor<1xf32>, tensor<1xi64>) loc(#loc95) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc96) + %tmp11 = tt.addptr %in_ptr2, %xoffset_4 : !tt.ptr, i64 loc(#loc97) + %tmp11_8 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc97) + %tmp11_9 = tt.load %tmp11_8 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc98) + %tmp4 = arith.addi %tmp2, %cst_3 : tensor<1x1xi64> loc(#loc99) + %tmp5 = arith.cmpi slt, %tmp2, %cst_0 : tensor<1x1xi64> loc(#loc100) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1>, tensor<1x1xi64> loc(#loc101) + %1 = arith.cmpi sge, %tmp6, %cst_0 : tensor<1x1xi64> loc(#loc44) + %2 = arith.cmpi slt, %tmp6, %cst_3 : tensor<1x1xi64> loc(#loc45) + %3 = arith.andi %1, %2 : tensor<1x1xi1> loc(#loc46) + tt.assert %3, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc47) + %tmp8_10 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc102) + %tmp8_11 = tt.addptr %tmp8_10, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc102) + %tmp8_12 = tt.bitcast %tmp8_11 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc63) + %tmp8_13 = tt.load %tmp8_12 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc63) + %tmp8_14 = arith.cmpi ne, %tmp8_13, %tmp8 : tensor<1x1xi8> loc(#loc63) + %tmp10 = arith.extui %tmp8_14 : tensor<1x1xi1> to tensor<1x1xi64> loc(#loc130) + %tmp12 = arith.muli %tmp10, %tmp11_9 : tensor<1x1xi64> loc(#loc105) + gpu.barrier loc(#loc52) + %4 = tt.addptr %in_out_ptr0, %xoffset_4 : !tt.ptr, i64 loc(#loc53) + %5 = tt.splat %4 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc53) + tt.store %5, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":31:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":30:67) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":29:55) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":23:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":26:49) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":32:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":33:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:48) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:53) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":37:107) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":40:38) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":42:46) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":43:8) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":45:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:31) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":46:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":48:18) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":49:18) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":50:32) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:27) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:43) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:36) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":51:52) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":52:30) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":54:20) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":53:19) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":55:20) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":56:4) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:28) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:40) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zw/czwh6tgkq6scdstgzueb3goqqnllndikoasj2i2iehu2qyvoccwt.py":57:4) +#loc63 = loc("tmp8"(#loc4)) +#loc64 = loc("_tmp2_index"(#loc5)) +#loc65 = loc("_tmp2"(#loc6)) +#loc66 = loc("xoffset"(#loc7)) +#loc67 = loc("xoffset"(#loc8)) +#loc68 = loc("r0_base"(#loc9)) +#loc69 = loc("r0_base"(#loc10)) +#loc70 = loc("r0_base"(#loc11)) +#loc71 = loc("_tmp2"(#loc3)) +#loc72 = loc("r0_index"(#loc12)) +#loc73 = loc("r0_mask"(#loc13)) +#loc74 = loc("tmp0"(#loc14)) +#loc75 = loc("tmp0"(#loc15)) +#loc76 = loc("tmp0"(#loc16)) +#loc77 = loc("tmp0"(#loc17)) +#loc78 = loc("tmp0"(#loc18)) +#loc79 = loc("mask"(#loc19)) +#loc80 = loc("equal"(#loc21)) +#loc81 = loc("a_isnan"(#loc22)) +#loc82 = loc("b_isnan"(#loc23)) +#loc83 = loc("mask"(#loc24)) +#loc84 = loc("mask"(#loc25)) +#loc85 = loc("mask"(#loc26)) +#loc86 = loc("equal"(#loc27)) +#loc87 = loc("equal"(#loc28)) +#loc88 = loc("mask"(#loc29)) +#loc89 = loc("mask"(#loc30)) +#loc90 = loc("mask"(#loc31)) +#loc91 = loc(callsite(#loc32 at #loc20)) +#loc92 = loc(callsite(#loc33 at #loc20)) +#loc93 = loc("_tmp2"(#loc34)) +#loc94 = loc("_tmp2_index"(#loc35)) +#loc95 = loc(callsite(#loc37 at #loc2)) +#loc96 = loc("tmp2"(#loc38)) +#loc97 = loc("tmp11"(#loc39)) +#loc98 = loc("tmp11"(#loc40)) +#loc99 = loc("tmp4"(#loc41)) +#loc100 = loc("tmp5"(#loc42)) +#loc101 = loc("tmp6"(#loc43)) +#loc102 = loc("tmp8"(#loc48)) +#loc103 = loc("tmp10"(#loc49)) +#loc104 = loc("tmp9"(#loc50)) +#loc105 = loc("tmp12"(#loc51)) +#loc106 = loc("_tmp2_index"(#loc71)) +#loc107 = loc(fused[#loc75, #loc74]) +#loc108 = loc("mask"(#loc79)) +#loc109 = loc("equal"(#loc80)) +#loc110 = loc(callsite(#loc81 at #loc20)) +#loc111 = loc(callsite(#loc82 at #loc20)) +#loc112 = loc(callsite(#loc83 at #loc20)) +#loc113 = loc(callsite(#loc84 at #loc20)) +#loc114 = loc("mask"(#loc85)) +#loc115 = loc(callsite(#loc86 at #loc20)) +#loc116 = loc("equal"(#loc87)) +#loc117 = loc(callsite(#loc88 at #loc20)) +#loc118 = loc(callsite(#loc89 at #loc20)) +#loc119 = loc(callsite(#loc90 at #loc20)) +#loc120 = loc(callsite(#loc81 at #loc95)) +#loc121 = loc(callsite(#loc82 at #loc95)) +#loc122 = loc(callsite(#loc83 at #loc95)) +#loc123 = loc(callsite(#loc84 at #loc95)) +#loc124 = loc(callsite(#loc86 at #loc95)) +#loc125 = loc(callsite(#loc88 at #loc95)) +#loc126 = loc(callsite(#loc89 at #loc95)) +#loc127 = loc(callsite(#loc90 at #loc95)) +#loc128 = loc(callsite(#loc32 at #loc95)) +#loc129 = loc(callsite(#loc33 at #loc95)) +#loc130 = loc(fused[#loc103, #loc104]) +#loc131 = loc(callsite(#loc108 at #loc20)) +#loc132 = loc(callsite(#loc109 at #loc20)) +#loc133 = loc(callsite(#loc114 at #loc20)) +#loc134 = loc(callsite(#loc116 at #loc20)) +#loc135 = loc(callsite(#loc108 at #loc95)) +#loc136 = loc(callsite(#loc109 at #loc95)) +#loc137 = loc(callsite(#loc114 at #loc95)) +#loc138 = loc(callsite(#loc116 at #loc95)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..81adf78cc0a8ad299052dca57914b48bcbf13e73 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..038ca881ed4e14e5e4b1c472c328b3cec3da4163 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf188ae53aeac10f7fae99021a59e720cbc52a1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"hash": "148341bcbd36b461a244d5d1a1b61032d6536037b28ffd7bcdc97e4c9d896e4c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..dd17a0961c3e1a4ef9c148aa7cff90daa2bbfdb3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,163 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %.fr5 = freeze i32 %4, !dbg !8 + %10 = icmp slt i32 %9, %.fr5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 15, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %2, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = icmp sgt i32 %5, 0, !dbg !12 + br i1 %16, label %.lr.ph, label %._crit_edge, !dbg !12 + +.lr.ph: ; preds = %8 + %17 = mul i64 %3, %2, !dbg !13 + %18 = mul i64 %17, %14, !dbg !14 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %19, i64 %18, !dbg !12 + br i1 %10, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %12, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = mul i64 %2, %22, !dbg !16 + %gep.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %23, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep.us, i64 %24, i1 false) #3, !dbg !18 + %26 = add i32 %20, 16, !dbg !12 + %27 = icmp slt i32 %26, %5, !dbg !12 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !12 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %12, !dbg !15 + %31 = icmp slt i32 %30, %5, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = mul i64 %2, %32, !dbg !16 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %33, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %34, i1 %31) #3, !dbg !18 + %narrow = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 16, !dbg !12 + %38 = icmp slt i32 %37, %5, !dbg !12 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !12 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 8, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset2 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset2 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 4, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 4, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset3 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 2, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 2, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset4 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 1, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %71 = trunc i64 %70 to i32, !dbg !27 + %72 = icmp slt i64 %2, 2, !dbg !28 + %73 = icmp sgt i64 %2, 1, !dbg !29 + %74 = select i1 %73, i64 %2, i64 0, !dbg !30 + %75 = zext i1 %72 to i64, !dbg !31 + %76 = add i64 %74, %75, !dbg !32 + %77 = mul i64 %14, %76, !dbg !33 + %78 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !34 + %79 = getelementptr i32, ptr addrspace(1) %78, i64 %77, !dbg !34 + %80 = and i32 %11, 63, !dbg !35 + %81 = icmp eq i32 %80, 0, !dbg !35 + %82 = and i1 %81, %10, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %71, ptr addrspace(1) %79, i1 %82) #3, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 21, scope: !4) +!9 = !DILocation(line: 24, column: 37, scope: !4) +!10 = !DILocation(line: 26, column: 19, scope: !4) +!11 = !DILocation(line: 27, column: 19, scope: !4) +!12 = !DILocation(line: 30, column: 40, scope: !4) +!13 = !DILocation(line: 36, column: 54, scope: !4) +!14 = !DILocation(line: 36, column: 58, scope: !4) +!15 = !DILocation(line: 31, column: 31, scope: !4) +!16 = !DILocation(line: 36, column: 43, scope: !4) +!17 = !DILocation(line: 36, column: 34, scope: !4) +!18 = !DILocation(line: 36, column: 63, scope: !4) +!19 = !DILocation(line: 32, column: 29, scope: !4) +!20 = !DILocation(line: 40, column: 48, scope: !4) +!21 = !DILocation(line: 28, column: 43, scope: !4) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 41, column: 25, scope: !4) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 42, column: 19, scope: !4) +!28 = !DILocation(line: 43, column: 49, scope: !4) +!29 = !DILocation(line: 43, column: 75, scope: !4) +!30 = !DILocation(line: 43, column: 66, scope: !4) +!31 = !DILocation(line: 43, scope: !4) +!32 = !DILocation(line: 43, column: 57, scope: !4) +!33 = !DILocation(line: 43, column: 34, scope: !4) +!34 = !DILocation(line: 43, column: 25, scope: !4) +!35 = !DILocation(line: 43, column: 88, scope: !4) +!36 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0ed8f9c20d11296d47228a3d93459a53d4954dbd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx @@ -0,0 +1,477 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5 + // @triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7 +) +.reqntid 64 +{ + .reg .pred %p<13>; + .reg .b32 %r<45>; + .reg .b64 %rd<68>; + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5]; + ld.param.b64 %rd13, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2]; +$L__tmp0: + .loc 1 21 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:28 + mov.u32 %r9, %ctaid.x; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.u64.u32 %rd1, %r9; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + and.b64 %rd16, %rd13, -4294967296; + setp.ne.b64 %p1, %rd16, 0; + cvt.u32.u64 %r42, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd65, %rd1, %rd13; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r11, %rd13; + div.u32 %r13, %r42, %r11; + cvt.u64.u32 %rd65, %r13; +$L__BB0_3: + .loc 1 0 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:19 + mov.u32 %r2, %tid.x; + ld.param.b64 %rd12, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1]; + ld.param.b32 %r10, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4]; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd18, %rd65, %rd13; + sub.s64 %rd6, %rd1, %rd18; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + setp.lt.s32 %p2, %r8, 1; + mov.b64 %rd67, 0; + shl.b64 %rd64, %rd6, 2; + @%p2 bra $L__BB0_9; +// %bb.4: // %.lr.ph + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd14, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0]; + and.b32 %r3, %r2, 15; + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p3, %r42, %r10; + .loc 1 36 54 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:54 + mul.lo.s64 %rd19, %rd14, %rd13; + .loc 1 36 58 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:58 + mul.lo.s64 %rd20, %rd19, %rd65; + add.s64 %rd22, %rd11, %rd64; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + shl.b64 %rd23, %rd20, 2; + add.s64 %rd7, %rd22, %rd23; + @%p3 bra $L__BB0_7; + bra.uni $L__BB0_5; +$L__BB0_7: // %.lr.ph.split.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r44, 0; + mov.b64 %rd67, 0; +$L__BB0_8: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 32 29 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:32:29 + add.s32 %r20, %r3, %r44; + setp.lt.s32 %p6, %r20, %r8; + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + cvt.s64.s32 %rd35, %r20; + mul.lo.s64 %rd36, %rd13, %rd35; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd37, %rd36, 2; + add.s64 %rd33, %rd7, %rd37; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b32 { %r19 }, [ %rd33 + 0 ], %rd32; + // end inline asm + .loc 1 40 48 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:40:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd38, %r21; + add.s64 %rd67, %rd67, %rd38; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r44, %r44, 16; + setp.lt.s32 %p7, %r44, %r8; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_9; +$L__BB0_5: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r43, 0; +$L__BB0_6: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + add.s32 %r17, %r3, %r43; + cvt.s64.s32 %rd28, %r17; + mul.lo.s64 %rd29, %rd13, %rd28; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd30, %rd29, 2; + add.s64 %rd25, %rd7, %rd30; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r16 }, [ %rd25 + 0 ], %rd24; + // end inline asm + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r43, %r43, 16; + setp.lt.s32 %p5, %r43, %r8; + @%p5 bra $L__BB0_6; +$L__BB0_9: // %._crit_edge + .loc 1 23 21 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:23:21 + setp.lt.s32 %p9, %r42, %r10; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r24}, %rd67; + cvt.u32.u64 %r25, %rd67; + shfl.sync.bfly.b32 %r26, %r25, 8, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 8, 31, -1; + cvt.u64.u32 %rd40, %r26; + cvt.u64.u32 %rd41, %r27; + shl.b64 %rd42, %rd41, 32; + or.b64 %rd43, %rd40, %rd42; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd44, %rd67, %rd43; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r28}, %rd44; + cvt.u32.u64 %r29, %rd44; + shfl.sync.bfly.b32 %r30, %r29, 4, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 4, 31, -1; + cvt.u64.u32 %rd45, %r30; + cvt.u64.u32 %rd46, %r31; + shl.b64 %rd47, %rd46, 32; + or.b64 %rd48, %rd45, %rd47; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd49, %rd44, %rd48; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r32}, %rd49; + cvt.u32.u64 %r33, %rd49; + shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 2, 31, -1; + cvt.u64.u32 %rd50, %r34; + cvt.u64.u32 %rd51, %r35; + shl.b64 %rd52, %rd51, 32; + or.b64 %rd53, %rd50, %rd52; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd54, %rd49, %rd53; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r36}, %rd54; + cvt.u32.u64 %r37, %rd54; + shfl.sync.bfly.b32 %r38, %r37, 1, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 1, 31, -1; + cvt.u64.u32 %rd55, %r38; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd56, %rd54, %rd55; +$L__tmp2: + .loc 1 42 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:42:19 + cvt.u32.u64 %r22, %rd56; + .loc 1 43 49 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:49 + setp.lt.s64 %p10, %rd13, 2; + .loc 1 43 75 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:75 + setp.gt.s64 %p11, %rd13, 1; + .loc 1 43 66 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:66 + selp.b64 %rd57, %rd13, 0, %p11; + .loc 1 43 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43 + selp.b64 %rd58, 1, 0, %p10; + .loc 1 43 57 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:57 + add.s64 %rd59, %rd57, %rd58; + .loc 1 43 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:34 + mul.lo.s64 %rd60, %rd65, %rd59; + .loc 1 43 25 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:25 + add.s64 %rd62, %rd12, %rd64; + shl.b64 %rd63, %rd60, 2; + add.s64 %rd39, %rd62, %rd63; + .loc 1 43 88 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:88 + and.b32 %r40, %r2, 63; + setp.eq.b32 %p12, %r40, 0; + and.pred %p8, %p12, %p9; + // begin inline asm + @%p8 st.global.b32 [ %rd39 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 4 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 238 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 117 +.b8 99 +.b8 102 +.b8 50 +.b8 105 +.b8 116 +.b8 54 +.b8 102 +.b8 119 +.b8 104 +.b8 109 +.b8 114 +.b8 117 +.b8 51 +.b8 50 +.b8 118 +.b8 112 +.b8 121 +.b8 50 +.b8 120 +.b8 109 +.b8 119 +.b8 107 +.b8 107 +.b8 52 +.b8 118 +.b8 103 +.b8 55 +.b8 112 +.b8 112 +.b8 110 +.b8 111 +.b8 120 +.b8 109 +.b8 104 +.b8 110 +.b8 107 +.b8 102 +.b8 54 +.b8 105 +.b8 102 +.b8 112 +.b8 98 +.b8 50 +.b8 104 +.b8 50 +.b8 116 +.b8 106 +.b8 115 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 117 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 53 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source new file mode 100644 index 0000000000000000000000000000000000000000..5a8f6740fdb5ff608f5737a38877a588abc7a3e5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source @@ -0,0 +1,190 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc43 = loc(unknown) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr1"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc85 = loc("input"(#loc41)) +#loc86 = loc("a"(#loc46)) +#loc87 = loc("b"(#loc46)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_0 = arith.constant 1 : i32 loc(#loc57) + %xoffset_1 = arith.constant 1 : i32 loc(#loc57) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc58) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc59) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc60) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc61) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc61) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63) + %x0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc64) + %x0_8 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc64) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<1x1xi64> loc(#loc64) + %x1 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %x1_10 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc65) + %x1_11 = arith.divsi %x1, %x1_10 : tensor<1x1xi64> loc(#loc65) + %_tmp3 = arith.constant 0 : i64 loc(#loc66) + %_tmp3_12 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc66) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c16_i32 = arith.constant 16 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69) + %tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70) + %tmp0_23 = tt.broadcast %x0_9 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc71) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_22 : tensor<1x16xi64> loc(#loc71) + %tmp0_25 = arith.muli %ks0, %ks1 : i64 loc(#loc72) + %tmp0_26 = tt.splat %tmp0_25 : i64 -> tensor<1x1xi64> loc(#loc73) + %tmp0_27 = arith.muli %tmp0_26, %x1_11 : tensor<1x1xi64> loc(#loc73) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc74) + %tmp0_29 = arith.addi %tmp0_24, %tmp0_28 : tensor<1x16xi64> loc(#loc74) + %tmp0_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc75) + %tmp0_31 = tt.addptr %tmp0_30, %tmp0_29 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc75) + %tmp0_32 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc76) + %tmp0_33 = arith.andi %r0_mask_20, %tmp0_32 : tensor<1x16xi1> loc(#loc76) + %tmp0_34 = arith.constant 0.000000e+00 : f32 loc(#loc77) + %tmp0_35 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc77) + %tmp0_36 = arith.fptosi %tmp0_35 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc77) + %tmp0_37 = tt.load %tmp0_31, %tmp0_33, %tmp0_36 evictionPolicy = evict_last : tensor<1x16x!tt.ptr> loc(#loc77) + %tmp1 = arith.extsi %tmp0_37 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc78) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<1x16xi64> loc(#loc79) + %_tmp3_38 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc80) + %_tmp3_39 = arith.andi %r0_mask_20, %_tmp3_38 : tensor<1x16xi1> loc(#loc80) + %_tmp3_40 = arith.select %_tmp3_39, %tmp4, %_tmp3_18 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc81) + scf.yield %_tmp3_40 : tensor<1x16xi64> loc(#loc27) + } loc(#loc67) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc82) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc83) + %tmp5 = arith.trunci %tmp3_14 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc84) + %c1_i32 = arith.constant 1 : i32 loc(#loc31) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31) + %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc32) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc32) + %6 = arith.extui %5 : i1 to i32 loc(#loc32) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc33) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33) + %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33) + %10 = arith.extui %9 : i1 to i64 loc(#loc34) + %11 = arith.muli %ks0, %10 : i64 loc(#loc34) + %12 = arith.extsi %7 : i32 to i64 loc(#loc35) + %13 = arith.addi %12, %11 : i64 loc(#loc35) + %14 = tt.splat %13 : i64 -> tensor<1x1xi64> loc(#loc36) + %15 = arith.muli %x1_11, %14 : tensor<1x1xi64> loc(#loc36) + %16 = arith.addi %x0_9, %15 : tensor<1x1xi64> loc(#loc37) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc38) + %18 = tt.addptr %17, %16 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc38) + tt.store %18, %tmp5, %xmask_6 : tensor<1x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc41))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42) + tt.reduce.return %2 : i64 loc(#loc42) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc42) + tt.return %0 : tensor<1xi64> loc(#loc44) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc45) + tt.return %1 : tensor<1xi64> loc(#loc45) + } loc(#loc41) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc47) + tt.return %0 : i64 loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc49) + tt.return %1 : i64 loc(#loc49) + } loc(#loc46) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc56 = loc("xoffset"(#loc1)) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("xindex"(#loc3)) +#loc59 = loc("xindex"(#loc4)) +#loc60 = loc("xindex"(#loc5)) +#loc61 = loc("xmask"(#loc6)) +#loc62 = loc("r0_base"(#loc7)) +#loc63 = loc("r0_base"(#loc8)) +#loc64 = loc("x0"(#loc9)) +#loc65 = loc("x1"(#loc10)) +#loc66 = loc("_tmp3"(#loc11)) +#loc67 = loc("_tmp3"(#loc12)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp0"(#loc20)) +#loc76 = loc("tmp0"(#loc21)) +#loc77 = loc("tmp0"(#loc22)) +#loc78 = loc("tmp1"(#loc23)) +#loc79 = loc("tmp4"(#loc24)) +#loc80 = loc("_tmp3"(#loc25)) +#loc81 = loc("_tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc28)) +#loc83 = loc("tmp3"(#loc29)) +#loc84 = loc("tmp5"(#loc30)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fa240e1c4f18e8cd98ada63335476de7a8611fc5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir @@ -0,0 +1,135 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc37 = loc("in_ptr0"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("ks0"(#loc)) +#loc40 = loc("ks1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc62 = loc("tmp3"(#loc23)) +#loc70 = loc(callsite(#loc1 at #loc62)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc44) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc45) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc46) + %x0_2 = arith.remsi %x0, %ks0 : i64 loc(#loc46) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc47) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc48) + %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc49) + %tmp0_3 = tt.splat %x0_2 : i64 -> tensor<1x16xi64, #blocked> loc(#loc66) + %tmp0_4 = arith.muli %ks0, %ks1 : i64 loc(#loc51) + %tmp0_5 = arith.muli %tmp0_4, %x1 : i64 loc(#loc52) + %tmp0_6 = tt.splat %tmp0_5 : i64 -> tensor<1x16xi64, #blocked> loc(#loc67) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc54) + %tmp0_8 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc68) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %cst_0) -> (tensor<1x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32, #blocked> loc(#loc57) + %r0_index_11 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32, #blocked> loc(#loc57) + %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc48) + %tmp0_13 = arith.extsi %r0_index_11 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc49) + %tmp0_14 = arith.muli %tmp0, %tmp0_13 : tensor<1x16xi64, #blocked> loc(#loc49) + %tmp0_15 = arith.addi %tmp0_3, %tmp0_14 : tensor<1x16xi64, #blocked> loc(#loc50) + %tmp0_16 = arith.addi %tmp0_15, %tmp0_6 : tensor<1x16xi64, #blocked> loc(#loc53) + %tmp0_17 = tt.addptr %tmp0_7, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi64, #blocked> loc(#loc54) + %tmp0_18 = arith.andi %r0_mask_12, %tmp0_8 : tensor<1x16xi1, #blocked> loc(#loc55) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst evictionPolicy = evict_last : tensor<1x16x!tt.ptr, #blocked> loc(#loc58) + %tmp1 = arith.extsi %tmp0_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc59) + %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<1x16xi64, #blocked> loc(#loc60) + %_tmp3_20 = arith.select %tmp0_18, %tmp4, %_tmp3_10 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc61) + scf.yield %_tmp3_20 : tensor<1x16xi64, #blocked> loc(#loc21) + } loc(#loc56) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc62)), %tmp3_11: i64 loc(callsite(#loc1 at #loc62))): + %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc71) + tt.reduce.return %tmp3_12 : i64 loc(#loc69) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc69) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc25) + %tmp3_9 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc63) + %tmp5 = arith.trunci %tmp3_9 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc64) + %1 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc28) + %2 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc29) + %3 = arith.extui %2 : i1 to i64 loc(#loc30) + %4 = arith.muli %ks0, %3 : i64 loc(#loc30) + %5 = arith.extui %1 : i1 to i64 loc(#loc65) + %6 = arith.addi %5, %4 : i64 loc(#loc31) + %7 = arith.muli %x1, %6 : i64 loc(#loc33) + %8 = arith.addi %x0_2, %7 : i64 loc(#loc34) + %9 = tt.addptr %out_ptr1, %8 : !tt.ptr, i64 loc(#loc35) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc25) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc25) + tt.store %10, %tmp5, %11 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc25) + tt.return loc(#loc36) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc43 = loc("xoffset"(#loc2)) +#loc44 = loc("xmask"(#loc3)) +#loc45 = loc("r0_base"(#loc4)) +#loc46 = loc("x0"(#loc5)) +#loc47 = loc("x1"(#loc6)) +#loc48 = loc("r0_mask"(#loc7)) +#loc49 = loc("tmp0"(#loc8)) +#loc50 = loc("tmp0"(#loc9)) +#loc51 = loc("tmp0"(#loc10)) +#loc52 = loc("tmp0"(#loc11)) +#loc53 = loc("tmp0"(#loc12)) +#loc54 = loc("tmp0"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("_tmp3"(#loc15)) +#loc57 = loc("r0_index"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp1"(#loc18)) +#loc60 = loc("tmp4"(#loc19)) +#loc61 = loc("_tmp3"(#loc20)) +#loc63 = loc("tmp3"(#loc26)) +#loc64 = loc("tmp5"(#loc27)) +#loc65 = loc(fused[#loc31, #loc32]) +#loc66 = loc(fused[#loc50, #loc46]) +#loc67 = loc(fused[#loc53, #loc52]) +#loc68 = loc(fused[#loc55, #loc44]) +#loc69 = loc(callsite(#loc22 at #loc62)) +#loc71 = loc(callsite(#loc24 at #loc69)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir new file mode 100644 index 0000000000000000000000000000000000000000..768ad156deb687cab551f51e34e1113203abdf4f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CSBUDPF5G22GDISE2XI2DNQQGLLFGYBXWKH7266NZF7EZHMJNZGA/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir @@ -0,0 +1,136 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("ks0"(#loc)) +#loc42 = loc("ks1"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc66 = loc("tmp3"(#loc25)) +#loc74 = loc(callsite(#loc1 at #loc66)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc45) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc47) + %xmask_0 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc47) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc48) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc49) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %x0_2 = arith.remsi %x0, %ks0 : i64 loc(#loc50) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc51) + %_tmp3_3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_5 = %_tmp3) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc53) + %r0_index_6 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32> loc(#loc53) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc54) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x16xi32> loc(#loc54) + %tmp0 = arith.extsi %r0_index_6 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc55) + %tmp0_8 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc55) + %tmp0_9 = arith.muli %tmp0_8, %tmp0 : tensor<1x16xi64> loc(#loc55) + %tmp0_10 = tt.splat %x0_2 : i64 -> tensor<1x16xi64> loc(#loc70) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_9 : tensor<1x16xi64> loc(#loc56) + %tmp0_12 = arith.muli %ks0, %ks1 : i64 loc(#loc57) + %tmp0_13 = arith.muli %tmp0_12, %x1 : i64 loc(#loc58) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<1x16xi64> loc(#loc71) + %tmp0_15 = arith.addi %tmp0_11, %tmp0_14 : tensor<1x16xi64> loc(#loc59) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc60) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc60) + %tmp0_18 = tt.splat %xmask : i1 -> tensor<1x16xi1> loc(#loc72) + %tmp0_19 = arith.andi %r0_mask_7, %tmp0_18 : tensor<1x16xi1> loc(#loc61) + %tmp0_20 = tt.load %tmp0_17, %tmp0_19, %cst evictionPolicy = evict_last : tensor<1x16x!tt.ptr> loc(#loc62) + %tmp1 = arith.extsi %tmp0_20 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63) + %tmp4 = arith.addi %_tmp3_5, %tmp1 : tensor<1x16xi64> loc(#loc64) + %_tmp3_21 = arith.select %tmp0_19, %tmp4, %_tmp3_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc65) + scf.yield %_tmp3_21 : tensor<1x16xi64> loc(#loc23) + } loc(#loc52) + %tmp3 = "tt.reduce"(%_tmp3_3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_5: i64 loc(callsite(#loc1 at #loc66)), %tmp3_6: i64 loc(callsite(#loc1 at #loc66))): + %tmp3_7 = arith.addi %tmp3_5, %tmp3_6 : i64 loc(#loc75) + tt.reduce.return %tmp3_7 : i64 loc(#loc73) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc73) + %tmp3_4 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc67) + %tmp5 = arith.trunci %tmp3_4 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc68) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc29) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc30) + %2 = arith.extui %1 : i1 to i64 loc(#loc31) + %3 = arith.muli %ks0, %2 : i64 loc(#loc31) + %4 = arith.extui %0 : i1 to i64 loc(#loc69) + %5 = arith.addi %4, %3 : i64 loc(#loc32) + %6 = arith.muli %x1, %5 : i64 loc(#loc34) + %7 = arith.addi %x0_2, %6 : i64 loc(#loc35) + %8 = tt.addptr %out_ptr1, %7 : !tt.ptr, i64 loc(#loc36) + %9 = tt.splat %8 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc36) + tt.store %9, %tmp5, %xmask_0 : tensor<1x1x!tt.ptr> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc45 = loc("_tmp3"(#loc3)) +#loc46 = loc("xoffset"(#loc4)) +#loc47 = loc("xmask"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("r0_base"(#loc7)) +#loc50 = loc("x0"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("_tmp3"(#loc2)) +#loc53 = loc("r0_index"(#loc10)) +#loc54 = loc("r0_mask"(#loc11)) +#loc55 = loc("tmp0"(#loc12)) +#loc56 = loc("tmp0"(#loc13)) +#loc57 = loc("tmp0"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("tmp0"(#loc18)) +#loc62 = loc("tmp0"(#loc19)) +#loc63 = loc("tmp1"(#loc20)) +#loc64 = loc("tmp4"(#loc21)) +#loc65 = loc("_tmp3"(#loc22)) +#loc67 = loc("tmp3"(#loc27)) +#loc68 = loc("tmp5"(#loc28)) +#loc69 = loc(fused[#loc32, #loc33]) +#loc70 = loc(fused[#loc56, #loc50]) +#loc71 = loc(fused[#loc59, #loc58]) +#loc72 = loc(fused[#loc61, #loc47]) +#loc73 = loc(callsite(#loc24 at #loc66)) +#loc75 = loc(callsite(#loc26 at #loc73)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64455c308b2aae8c3a51ad6ad4fe03f2399fb014 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c9a28b738846c28531b21338512563c6b6fac84a Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fe8a9abf7640ac1e5cc940996bace35855ab2be3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "160db31d279b0204a932128afe88b2b0aceee14ec224ef05ecf948d32232fd5f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..cd936d7a09589b8f755f9a0e64031c0212b0a8a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,139 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %10 = and i32 %9, 31, !dbg !8 + %11 = lshr i32 %9, 5, !dbg !8 + %12 = shl nuw nsw i32 %9, 1, !dbg !8 + %13 = and i32 %12, 126, !dbg !8 + %14 = lshr i32 %8, 4, !dbg !9 + %15 = and i32 %14, 3968, !dbg !9 + %16 = shl i32 %8, 12, !dbg !10 + %17 = and i32 %16, 8384512, !dbg !10 + %18 = or disjoint i32 %15, %17, !dbg !11 + %19 = shl i32 %8, 7, !dbg !12 + %20 = and i32 %19, -8388608, !dbg !12 + %21 = or disjoint i32 %18, %20, !dbg !13 + %22 = or disjoint i32 %21, %13, !dbg !13 + %23 = sext i32 %22 to i64, !dbg !14 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !14 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %26 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !15 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !15 + %28 = or disjoint i32 %13, %19, !dbg !16 + %29 = sext i32 %28 to i64, !dbg !17 + %30 = getelementptr bfloat, ptr addrspace(1) %1, i64 %29, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18 + %32 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !18 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !18 + %34 = fpext <2 x bfloat> %27 to <2 x float>, !dbg !19 + %35 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20 + %36 = fmul <2 x float> %34, %35, !dbg !21 + %37 = fadd <2 x float> %36, zeroinitializer, !dbg !22 + %shift = shufflevector <2 x float> %37, <2 x float> poison, <2 x i32> , !dbg !23 + %foldExtExtBinop = fadd <2 x float> %37, %shift, !dbg !23 + %38 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !23 + %39 = bitcast float %38 to i32, !dbg !27 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !27 + %41 = bitcast i32 %40 to float, !dbg !27 + %42 = fadd float %38, %41, !dbg !23 + %43 = bitcast float %42 to i32, !dbg !27 + %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 8, i32 31), !dbg !27 + %45 = bitcast i32 %44 to float, !dbg !27 + %46 = fadd float %42, %45, !dbg !23 + %47 = bitcast float %46 to i32, !dbg !27 + %48 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 4, i32 31), !dbg !27 + %49 = bitcast i32 %48 to float, !dbg !27 + %50 = fadd float %46, %49, !dbg !23 + %51 = bitcast float %50 to i32, !dbg !27 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 2, i32 31), !dbg !27 + %53 = bitcast i32 %52 to float, !dbg !27 + %54 = fadd float %50, %53, !dbg !23 + %55 = bitcast float %54 to i32, !dbg !27 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 1, i32 31), !dbg !27 + %57 = bitcast i32 %56 to float, !dbg !27 + %58 = fadd float %54, %57, !dbg !23 + %59 = and i32 %11, 1, !dbg !27 + %60 = icmp eq i32 %10, 0, !dbg !27 + %61 = getelementptr float, ptr addrspace(3) @global_smem, i32 %59, !dbg !27 + %62 = bitcast float %58 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %62, i1 %60) #4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %63 = icmp samesign ult i32 %9, 2, !dbg !27 + %64 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !27 + %65 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %64, i1 %63) #4, !dbg !27 + %66 = bitcast i32 %65 to float, !dbg !27 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 1, i32 31), !dbg !27 + %68 = bitcast i32 %67 to float, !dbg !27 + %69 = fadd float %66, %68, !dbg !23 + %70 = icmp eq i32 %9, 0, !dbg !27 + %71 = bitcast float %69 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %71, i1 %70) #4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %72 = load i32, ptr addrspace(3) @global_smem, align 16, !dbg !27 + %73 = zext nneg i32 %8 to i64, !dbg !28 + %74 = getelementptr float, ptr addrspace(1) %2, i64 %73, !dbg !28 + %75 = and i32 %9, 63, !dbg !29 + %76 = icmp eq i32 %75, 0, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %72, ptr addrspace(1) %74, i1 %76) #4, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 39, column: 45, scope: !4) +!10 = !DILocation(line: 39, column: 55, scope: !4) +!11 = !DILocation(line: 39, column: 50, scope: !4) +!12 = !DILocation(line: 39, column: 68, scope: !4) +!13 = !DILocation(line: 39, column: 60, scope: !4) +!14 = !DILocation(line: 39, column: 34, scope: !4) +!15 = !DILocation(line: 39, column: 73, scope: !4) +!16 = !DILocation(line: 40, column: 41, scope: !4) +!17 = !DILocation(line: 40, column: 34, scope: !4) +!18 = !DILocation(line: 40, column: 50, scope: !4) +!19 = !DILocation(line: 39, column: 127, scope: !4) +!20 = !DILocation(line: 40, column: 104, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 43, column: 23, scope: !4) +!23 = !DILocation(line: 261, column: 15, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!26 = !DILocation(line: 45, column: 25, scope: !4) +!27 = !DILocation(line: 291, column: 36, scope: !24, inlinedAt: !26) +!28 = !DILocation(line: 49, column: 25, scope: !4) +!29 = !DILocation(line: 49, column: 36, scope: !4) +!30 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1a9a7d26d9fc947582450cb0f7e20fca71fd69bd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,389 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 64 +{ + .reg .pred %p<7>; + .reg .b16 %rs<5>; + .reg .b32 %r<49>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28 + mov.u32 %r12, %ctaid.x; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 31; + shl.b32 %r15, %r13, 1; + and.b32 %r16, %r15, 126; + .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45 + shr.u32 %r17, %r12, 4; + and.b32 %r18, %r17, 3968; + .loc 1 39 55 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:55 + shl.b32 %r19, %r12, 12; + and.b32 %r20, %r19, 8384512; + .loc 1 39 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:50 + or.b32 %r21, %r18, %r20; + .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68 + shl.b32 %r22, %r12, 7; + and.b32 %r23, %r22, -8388608; + .loc 1 39 60 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:60 + or.b32 %r24, %r21, %r23; + or.b32 %r25, %r24, %r16; + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + mad.wide.s32 %rd2, %r25, 2, %rd8; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b32 { %r1 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 41 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:41 + or.b32 %r26, %r16, %r22; + .loc 1 40 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:34 + mad.wide.s32 %rd5, %r26, 2, %rd9; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3, %r2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b32 { %r3 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r27, %rs1; + cvt.f32.bf16 %r28, %rs2; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs3, %rs4}, %r3; + cvt.f32.bf16 %r29, %rs3; + cvt.f32.bf16 %r30, %rs4; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r31, %r28, %r30, 0f00000000; + fma.rn.f32 %r32, %r27, %r29, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r33, %r32, %r31; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r35, %r33, %r34; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r37, %r35, %r36; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r39, %r37, %r38; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r41, %r39, %r40; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r6, %r41, %r42; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + setp.eq.b32 %p3, %r14, 0; + shr.u32 %r43, %r13, 3; + and.b32 %r44, %r43, 4; + mov.b32 %r45, global_smem; + add.s32 %r5, %r45, %r44; + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r13, 2; + shl.b32 %r46, %r13, 2; + add.s32 %r8, %r45, %r46; + // begin inline asm + @%p4 ld.shared.b32 %r7, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r47, %r7, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r10, %r7, %r47; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + setp.eq.b32 %p5, %r13, 0; + // begin inline asm + @%p5 st.shared.b32 [ %r8 + 0 ], %r10; + // end inline asm + bar.sync 0; + ld.shared.b32 %r11, [global_smem]; +$L__tmp2: + .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25 + mad.wide.u32 %rd7, %r12, 4, %rd10; + .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36 + and.b32 %r48, %r13, 63; + setp.eq.b32 %p6, %r48, 0; + // begin inline asm + @%p6 st.global.b32 [ %rd7 + 0 ], { %r11 }; + // end inline asm + .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 103 +.b8 111 +.b8 108 +.b8 53 +.b8 53 +.b8 99 +.b8 116 +.b8 104 +.b8 107 +.b8 52 +.b8 122 +.b8 101 +.b8 118 +.b8 115 +.b8 121 +.b8 51 +.b8 100 +.b8 108 +.b8 113 +.b8 105 +.b8 121 +.b8 122 +.b8 105 +.b8 112 +.b8 101 +.b8 102 +.b8 118 +.b8 55 +.b8 51 +.b8 53 +.b8 103 +.b8 101 +.b8 50 +.b8 119 +.b8 116 +.b8 97 +.b8 100 +.b8 100 +.b8 118 +.b8 107 +.b8 52 +.b8 51 +.b8 54 +.b8 113 +.b8 104 +.b8 116 +.b8 53 +.b8 110 +.b8 111 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..61a3ccca1c29b41678f411f3c0e2104326854434 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.source @@ -0,0 +1,217 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 1 : i32 loc(#loc61) + %xoffset_3 = arith.constant 1 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<1x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<1x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<1x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<1x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<1x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<1x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<1x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<1x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<1x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc77) + %tmp0_34 = arith.addi %r0_index_28, %tmp0_33 : tensor<1x128xi32> loc(#loc77) + %tmp0_35 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %tmp0_38 = arith.muli %tmp0_37, %x0_12 : tensor<1x1xi32> loc(#loc78) + %tmp0_39 = tt.broadcast %tmp0_38 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc79) + %tmp0_40 = arith.addi %tmp0_34, %tmp0_39 : tensor<1x128xi32> loc(#loc79) + %tmp0_41 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant dense<8388608> : tensor<1x1xi32> loc(#loc80) + %tmp0_44 = arith.muli %tmp0_43, %x2_22 : tensor<1x1xi32> loc(#loc80) + %tmp0_45 = tt.broadcast %tmp0_44 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc81) + %tmp0_46 = arith.addi %tmp0_40, %tmp0_45 : tensor<1x128xi32> loc(#loc81) + %tmp0_47 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc82) + %tmp0_48 = tt.addptr %tmp0_47, %tmp0_46 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc82) + %tmp0_49 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_50 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc83) + %tmp0_51 = arith.truncf %tmp0_50 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc83) + %tmp0_52 = tt.load %tmp0_48, %r0_mask_29, %tmp0_51 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc83) + %tmp0_53 = arith.extf %tmp0_52 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_54 = arith.constant 128 : i32 loc(#loc85) + %tmp1_55 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc85) + %tmp1_56 = arith.muli %tmp1_55, %xindex_7 : tensor<1x1xi32> loc(#loc85) + %tmp1_57 = tt.broadcast %tmp1_56 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc86) + %tmp1_58 = arith.addi %r0_index_28, %tmp1_57 : tensor<1x128xi32> loc(#loc86) + %tmp1_59 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc87) + %tmp1_60 = tt.addptr %tmp1_59, %tmp1_58 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc87) + %tmp1_61 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_62 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc88) + %tmp1_63 = arith.truncf %tmp1_62 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc88) + %tmp1_64 = tt.load %tmp1_60, %r0_mask_29, %tmp1_63 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc88) + %tmp1_65 = arith.extf %tmp1_64 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_53, %tmp1_65 : tensor<1x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<1x128xf32> loc(#loc91) + %_tmp4_66 = arith.select %r0_mask_29, %tmp5, %_tmp4_27 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc92) + scf.yield %_tmp4_66 : tensor<1x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<1x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<1x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x128xf32> loc("input"(#loc44))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc45) + tt.return %0 : tensor<1xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc48) + tt.return %1 : tensor<1xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..91f039e59f64256a1fc4d2a6cf4d05977cbde69d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,128 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc33 = loc("in_ptr0"(#loc)) +#loc34 = loc("in_ptr1"(#loc)) +#loc35 = loc("out_ptr1"(#loc)) +#loc36 = loc("xnumel"(#loc)) +#loc37 = loc("r0_numel"(#loc)) +#loc62 = loc("tmp4"(#loc27)) +#loc69 = loc(callsite(#loc1 at #loc62)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c65536_i32 = arith.constant 65536 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc39) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc39) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc40) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc41) + %x1_3 = arith.remsi %x1, %c32_i32 : i32 loc(#loc42) + %x2 = arith.divsi %xoffset, %c65536_i32 : i32 loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_2, %cst : tensor<1x128xi32, #blocked> loc(#loc44) + %tmp0 = arith.muli %x1_3, %c128_i32 : i32 loc(#loc45) + %tmp0_4 = tt.splat %tmp0 : i32 -> tensor<1x128xi32, #blocked> loc(#loc64) + %tmp0_5 = arith.addi %r0_base_2, %tmp0_4 : tensor<1x128xi32, #blocked> loc(#loc46) + %tmp0_6 = arith.muli %x0, %c4096_i32 : i32 loc(#loc47) + %tmp0_7 = tt.splat %tmp0_6 : i32 -> tensor<1x128xi32, #blocked> loc(#loc65) + %tmp0_8 = arith.addi %tmp0_5, %tmp0_7 : tensor<1x128xi32, #blocked> loc(#loc48) + %tmp0_9 = arith.muli %x2, %c8388608_i32 : i32 loc(#loc49) + %tmp0_10 = tt.splat %tmp0_9 : i32 -> tensor<1x128xi32, #blocked> loc(#loc66) + %tmp0_11 = arith.addi %tmp0_8, %tmp0_10 : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc51) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc51) + %tmp0_14 = tt.load %tmp0_13, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc52) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc53) + %tmp1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc54) + %tmp1_16 = tt.splat %tmp1 : i32 -> tensor<1x128xi32, #blocked> loc(#loc67) + %tmp1_17 = arith.addi %r0_base_2, %tmp1_16 : tensor<1x128xi32, #blocked> loc(#loc55) + %tmp1_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc56) + %tmp1_19 = tt.addptr %tmp1_18, %tmp1_17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc56) + %tmp1_20 = tt.load %tmp1_19, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc57) + %tmp1_21 = arith.extf %tmp1_20 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc58) + %tmp2 = arith.mulf %tmp0_15, %tmp1_21 : tensor<1x128xf32, #blocked> loc(#loc59) + %tmp5 = arith.addf %tmp2, %cst_1 : tensor<1x128xf32, #blocked> loc(#loc60) + %_tmp4 = arith.select %r0_mask, %tmp5, %cst_1 : tensor<1x128xi1, #blocked>, tensor<1x128xf32, #blocked> loc(#loc61) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_24: f32 loc(callsite(#loc1 at #loc62)), %tmp4_25: f32 loc(callsite(#loc1 at #loc62))): + %tmp4_26 = arith.addf %tmp4_24, %tmp4_25 : f32 loc(#loc70) + tt.reduce.return %tmp4_26 : f32 loc(#loc68) + }) : (tensor<1x128xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc68) + %tmp4_22 = ttg.convert_layout %tmp4 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc63) + %tmp4_23 = tt.expand_dims %tmp4_22 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc63) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc30) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc31) + tt.store %1, %tmp4_23 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc38 = loc("xoffset"(#loc2)) +#loc39 = loc("r0_base"(#loc3)) +#loc40 = loc("x0"(#loc4)) +#loc41 = loc("x1"(#loc5)) +#loc42 = loc("x1"(#loc6)) +#loc43 = loc("x2"(#loc7)) +#loc44 = loc("r0_mask"(#loc8)) +#loc45 = loc("tmp0"(#loc9)) +#loc46 = loc("tmp0"(#loc10)) +#loc47 = loc("tmp0"(#loc11)) +#loc48 = loc("tmp0"(#loc12)) +#loc49 = loc("tmp0"(#loc13)) +#loc50 = loc("tmp0"(#loc14)) +#loc51 = loc("tmp0"(#loc15)) +#loc52 = loc("tmp0"(#loc16)) +#loc53 = loc("tmp0"(#loc17)) +#loc54 = loc("tmp1"(#loc18)) +#loc55 = loc("tmp1"(#loc19)) +#loc56 = loc("tmp1"(#loc20)) +#loc57 = loc("tmp1"(#loc21)) +#loc58 = loc("tmp1"(#loc22)) +#loc59 = loc("tmp2"(#loc23)) +#loc60 = loc("tmp5"(#loc24)) +#loc61 = loc("_tmp4"(#loc25)) +#loc63 = loc("tmp4"(#loc29)) +#loc64 = loc(fused[#loc46, #loc45]) +#loc65 = loc(fused[#loc48, #loc47]) +#loc66 = loc(fused[#loc50, #loc49]) +#loc67 = loc(fused[#loc55, #loc54]) +#loc68 = loc(callsite(#loc26 at #loc62)) +#loc70 = loc(callsite(#loc28 at #loc68)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..191d3879e32e4035546386aebdb563994081670a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYG3GHJHTMBAJKJSCKFP5CFSWCWO5YKOYISO6BPM7FENGIRS7VPQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,127 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc3 = loc(unknown) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc34 = loc("in_ptr0"(#loc)) +#loc35 = loc("in_ptr1"(#loc)) +#loc36 = loc("out_ptr1"(#loc)) +#loc37 = loc("xnumel"(#loc)) +#loc38 = loc("r0_numel"(#loc)) +#loc64 = loc("tmp4"(#loc28)) +#loc71 = loc(callsite(#loc3 at #loc64)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant 8388608 : i32 loc(#loc39) + %tmp0_0 = arith.constant 4096 : i32 loc(#loc40) + %c128_i32 = arith.constant 128 : i32 loc(#loc3) + %x2 = arith.constant 65536 : i32 loc(#loc41) + %x1 = arith.constant 32 : i32 loc(#loc42) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc3) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc3) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc44) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc45) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc46) + %x1_4 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc47) + %x1_5 = arith.remsi %x1_4, %x1 : i32 loc(#loc42) + %x2_6 = arith.divsi %xoffset, %x2 : i32 loc(#loc41) + %r0_mask = arith.cmpi slt, %r0_base_3, %cst_1 : tensor<1x128xi32> loc(#loc48) + %tmp0_7 = arith.muli %x1_5, %c128_i32 : i32 loc(#loc49) + %tmp0_8 = tt.splat %tmp0_7 : i32 -> tensor<1x128xi32> loc(#loc66) + %tmp0_9 = arith.addi %r0_base_3, %tmp0_8 : tensor<1x128xi32> loc(#loc50) + %tmp0_10 = arith.muli %x0, %tmp0_0 : i32 loc(#loc40) + %tmp0_11 = tt.splat %tmp0_10 : i32 -> tensor<1x128xi32> loc(#loc67) + %tmp0_12 = arith.addi %tmp0_9, %tmp0_11 : tensor<1x128xi32> loc(#loc51) + %tmp0_13 = arith.muli %x2_6, %tmp0 : i32 loc(#loc39) + %tmp0_14 = tt.splat %tmp0_13 : i32 -> tensor<1x128xi32> loc(#loc68) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<1x128xi32> loc(#loc52) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc53) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc53) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc54) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc55) + %tmp1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc56) + %tmp1_20 = tt.splat %tmp1 : i32 -> tensor<1x128xi32> loc(#loc69) + %tmp1_21 = arith.addi %r0_base_3, %tmp1_20 : tensor<1x128xi32> loc(#loc57) + %tmp1_22 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc58) + %tmp1_23 = tt.addptr %tmp1_22, %tmp1_21 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc58) + %tmp1_24 = tt.load %tmp1_23, %r0_mask, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc59) + %tmp1_25 = arith.extf %tmp1_24 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc60) + %tmp2 = arith.mulf %tmp0_19, %tmp1_25 : tensor<1x128xf32> loc(#loc61) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<1x128xf32> loc(#loc62) + %_tmp4 = arith.select %r0_mask, %tmp5, %cst_2 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc3 at #loc64)), %tmp4_28: f32 loc(callsite(#loc3 at #loc64))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc72) + tt.reduce.return %tmp4_29 : f32 loc(#loc70) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc70) + %tmp4_26 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc65) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc31) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc31) + tt.store %1, %tmp4_26 : tensor<1x1x!tt.ptr> loc(#loc32) + tt.return loc(#loc33) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc39 = loc("tmp0"(#loc1)) +#loc40 = loc("tmp0"(#loc2)) +#loc41 = loc("x2"(#loc4)) +#loc42 = loc("x1"(#loc5)) +#loc43 = loc("xoffset"(#loc6)) +#loc44 = loc("r0_base"(#loc7)) +#loc45 = loc("r0_base"(#loc8)) +#loc46 = loc("x0"(#loc9)) +#loc47 = loc("x1"(#loc10)) +#loc48 = loc("r0_mask"(#loc11)) +#loc49 = loc("tmp0"(#loc12)) +#loc50 = loc("tmp0"(#loc13)) +#loc51 = loc("tmp0"(#loc14)) +#loc52 = loc("tmp0"(#loc15)) +#loc53 = loc("tmp0"(#loc16)) +#loc54 = loc("tmp0"(#loc17)) +#loc55 = loc("tmp0"(#loc18)) +#loc56 = loc("tmp1"(#loc19)) +#loc57 = loc("tmp1"(#loc20)) +#loc58 = loc("tmp1"(#loc21)) +#loc59 = loc("tmp1"(#loc22)) +#loc60 = loc("tmp1"(#loc23)) +#loc61 = loc("tmp2"(#loc24)) +#loc62 = loc("tmp5"(#loc25)) +#loc63 = loc("_tmp4"(#loc26)) +#loc65 = loc("tmp4"(#loc30)) +#loc66 = loc(fused[#loc50, #loc49]) +#loc67 = loc(fused[#loc51, #loc40]) +#loc68 = loc(fused[#loc52, #loc39]) +#loc69 = loc(fused[#loc57, #loc56]) +#loc70 = loc(callsite(#loc27 at #loc64)) +#loc72 = loc(callsite(#loc29 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/__grp__triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/__grp__triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8e1c9abc860f4528e934554f784f02bd81c35ea5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/__grp__triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..81854f9e456ed5903e51ed4b620c74059cc3596c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..95eca9117103f70b4665bebe53f053443359942b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"hash": "1617032f8bc50f6a1ef6a39e9ea0ef80a8015aec777ffa20dde29ccc3a0f0791", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..964988f233d499d3c84b3ec319ffe37421900461 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.llir @@ -0,0 +1,46 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 7, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = and i32 %7, 127, !dbg !9 + %9 = or disjoint i32 %6, %8, !dbg !10 + %10 = icmp slt i32 %9, %1, !dbg !11 + %11 = sext i32 %9 to i64, !dbg !12 + %12 = getelementptr i32, ptr addrspace(1) %0, i64 %11, !dbg !12 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %12, i1 %10) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 25, scope: !4) +!13 = !DILocation(line: 24, column: 36, scope: !4) +!14 = !DILocation(line: 24, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9a0134700dd0c6750420e554a66c211b15030809 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ptx @@ -0,0 +1,207 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0 + // @triton_poi_fused_new_zeros_0 +.visible .entry triton_poi_fused_new_zeros_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0, + .param .u32 triton_poi_fused_new_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<8>; + .reg .b64 %rd<3>; + .loc 1 18 0 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_0_param_0]; + ld.param.b32 %r2, [triton_poi_fused_new_zeros_0_param_1]; +$L__tmp0: + .loc 1 19 28 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:19:28 + mov.u32 %r3, %ctaid.x; + .loc 1 19 33 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:19:33 + shl.b32 %r4, %r3, 7; + .loc 1 20 36 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:20:36 + mov.u32 %r5, %tid.x; + and.b32 %r6, %r5, 127; + .loc 1 20 23 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:20:23 + or.b32 %r7, %r4, %r6; + .loc 1 21 21 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:21:21 + setp.lt.s32 %p1, %r7, %r2; + .loc 1 24 25 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:25 + mad.wide.s32 %rd1, %r7, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 24 36 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:36 + // begin inline asm + @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 }; + // end inline asm + .loc 1 24 4 // c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py:24:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 106 +.b8 122 +.b8 120 +.b8 122 +.b8 116 +.b8 100 +.b8 120 +.b8 98 +.b8 106 +.b8 118 +.b8 53 +.b8 98 +.b8 50 +.b8 51 +.b8 110 +.b8 102 +.b8 109 +.b8 103 +.b8 122 +.b8 103 +.b8 116 +.b8 105 +.b8 122 +.b8 113 +.b8 112 +.b8 55 +.b8 55 +.b8 104 +.b8 55 +.b8 97 +.b8 101 +.b8 97 +.b8 107 +.b8 53 +.b8 106 +.b8 50 +.b8 106 +.b8 117 +.b8 107 +.b8 109 +.b8 122 +.b8 51 +.b8 114 +.b8 111 +.b8 113 +.b8 101 +.b8 105 +.b8 119 +.b8 51 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 106 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..72929f390a3d069d088fee80fe18cb0721bbf707 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.source @@ -0,0 +1,38 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.constant 128 : i32 loc(#loc13) + %xoffset_1 = arith.constant 128 : i32 loc(#loc13) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc14) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc15) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc16) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc16) + %tmp0 = arith.constant 0 : i32 loc(#loc17) + %tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc7) + %1 = tt.addptr %0, %xindex_4 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc7) + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_5 : tensor<128x!tt.ptr> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":23:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc1)) +#loc13 = loc("xoffset"(#loc2)) +#loc14 = loc("xindex"(#loc3)) +#loc15 = loc("xindex"(#loc4)) +#loc16 = loc("xmask"(#loc5)) +#loc17 = loc("tmp0"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..17f42ff163a2a5ee66c39ce6cb8e4d5a18bdd558 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32, #blocked> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32, #blocked> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> loc(#loc7) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2593868e4664d2ccc78b5a55a8778ce36bb6d904 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/CYLQGL4LYUHWUHXWUOPJ5IHPQCUACWXMO577UIG54KOMYOQPA6IQ/triton_poi_fused_new_zeros_0.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:36) +#loc2 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":19:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":20:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":21:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6j/c6jzxztdxbjv5b23nfmgzgtizqp77h7aeak5j2jukmz3roqeiw3k.py":24:4) +#loc12 = loc("xoffset"(#loc3)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xindex"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xmask"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0f78ed2da3aa1b41604267591cc738b525f5442d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..85d8945f1e33de9bfcfe6043fc67f8923e22770e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..681b97131d893e6cbd7b2284882a5f397f41490b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"hash": "193d794b0b97c3631a5ca3b76996d9137ceae2e83b0d9eaa3f7ef16cdd7504c7", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..a89983a9ca733af5086f9b7ee9b808a5c708f969 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,318 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #0 !dbg !4 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %15 = icmp slt i32 %14, %9, !dbg !8 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %17 = and i32 %16, 384, !dbg !9 + %18 = zext nneg i32 %14 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !10 + %19 = sdiv i64 %18, %.frozen, !dbg !10 + %20 = srem i64 %19, %4, !dbg !11 + %21 = mul i64 %19, %.frozen, !dbg !12 + %.decomposed = sub i64 %18, %21, !dbg !12 + %22 = sdiv i64 %18, %7, !dbg !13 + %23 = shl nsw i64 %20, 7, !dbg !14 + %24 = shl nuw nsw i64 %.decomposed, 7, !dbg !15 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16 + %26 = and i32 %16, 127 + %27 = zext nneg i32 %26 to i64 + %28 = or disjoint i64 %24, %27 + %29 = icmp slt i64 %28, %6 + %30 = icmp sge i64 %28, %8 + %31 = tail call i64 @llvm.smin.i64(i64 %8, i64 0) + %32 = sub nsw i64 %.decomposed, %20 + %33 = shl nsw i64 %32, 7 + %34 = zext nneg i32 %17 to i64, !dbg !17 + %35 = zext nneg i32 %26 to i64, !dbg !17 + %36 = zext nneg i32 %16 to i64, !dbg !17 + %37 = insertelement <2 x i1> poison, i1 %15, i64 0, !dbg !18 + %38 = shufflevector <2 x i1> %37, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !18 + %39 = insertelement <2 x i1> poison, i1 %29, i64 0, !dbg !19 + %40 = shufflevector <2 x i1> %39, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !19 + %41 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !20 + %42 = shufflevector <2 x i64> %41, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !20 + %43 = insertelement <2 x i64> poison, i64 %5, i64 0, !dbg !21 + %44 = shufflevector <2 x i64> %43, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !21 + %45 = insertelement <2 x i64> poison, i64 %28, i64 0, !dbg !22 + %46 = shufflevector <2 x i64> %45, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22 + %47 = insertelement <2 x i1> poison, i1 %30, i64 0, !dbg !23 + %48 = shufflevector <2 x i1> %47, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %49 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !24 + %50 = shufflevector <2 x i64> %49, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !24 + %51 = insertelement <2 x i64> poison, i64 %8, i64 0, !dbg !25 + %52 = shufflevector <2 x i64> %51, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !25 + br label %53, !dbg !17 + +53: ; preds = %13, %53 + %indvars.iv = phi i64 [ 0, %13 ], [ %indvars.iv.next, %53 ] + %54 = phi <2 x i64> [ zeroinitializer, %13 ], [ %113, %53 ] + %55 = or disjoint i64 %indvars.iv, %34, !dbg !26 + %56 = or disjoint i64 %indvars.iv, %36, !dbg !26 + %57 = lshr exact i64 %55, 7, !dbg !27 + %58 = lshr i64 %56, 7, !dbg !27 + %59 = trunc nuw nsw i64 %58 to i32, !dbg !27 + %60 = or i32 %59, 4, !dbg !27 + %61 = zext nneg i32 %60 to i64, !dbg !20 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %63 = sub nsw i64 %35, %57, !dbg !29 + %64 = sub nsw i32 %26, %60, !dbg !29 + %65 = sext i32 %64 to i64, !dbg !30 + %66 = insertelement <2 x i64> poison, i64 %57, i64 0, !dbg !20 + %67 = insertelement <2 x i64> %66, i64 %61, i64 1, !dbg !20 + %68 = or disjoint <2 x i64> %42, %67, !dbg !20 + %69 = icmp slt <2 x i64> %68, %44, !dbg !21 + %70 = and <2 x i1> %40, %69, !dbg !19 + %71 = icmp sge <2 x i64> %68, %46, !dbg !22 + %72 = extractelement <2 x i1> %70, i64 0, !dbg !31 + %73 = and i1 %15, %72, !dbg !31 + %74 = extractelement <2 x i1> %70, i64 1, !dbg !31 + %75 = and i1 %15, %74, !dbg !31 + %76 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %62, i1 %73) #5, !dbg !28 + %77 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !28 + %78 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %25, i64 %77, i1 %75) #5, !dbg !28 + %79 = insertelement <2 x i64> poison, i64 %76, i64 0, !dbg !32 + %80 = insertelement <2 x i64> %79, i64 %78, i64 1, !dbg !32 + %81 = icmp slt <2 x i64> %46, %80, !dbg !32 + %82 = icmp slt <2 x i64> %68, %80, !dbg !33 + %83 = and <2 x i1> %81, %82, !dbg !34 + %84 = and <2 x i1> %71, %83, !dbg !35 + %85 = srem i64 %28, %8, !dbg !36 + %.not = icmp eq i64 %85, 0, !dbg !37 + %86 = select i1 %.not, i64 0, i64 %31, !dbg !38 + %87 = add nsw i64 %86, %85, !dbg !38 + %88 = insertelement <2 x i64> poison, i64 %87, i64 0, !dbg !39 + %89 = shufflevector <2 x i64> %88, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !39 + %90 = icmp slt <2 x i64> %89, %80, !dbg !39 + %91 = insertelement <2 x i64> poison, i64 %63, i64 0, !dbg !24 + %92 = insertelement <2 x i64> %91, i64 %65, i64 1, !dbg !24 + %93 = add nsw <2 x i64> %50, %92, !dbg !24 + %94 = srem <2 x i64> %93, %52, !dbg !25 + %95 = icmp ne <2 x i64> %94, zeroinitializer, !dbg !40 + %96 = extractelement <2 x i64> %94, i64 0, !dbg !41 + %97 = xor i64 %96, %8, !dbg !41 + %98 = extractelement <2 x i64> %94, i64 1, !dbg !41 + %99 = xor i64 %98, %8, !dbg !41 + %100 = insertelement <2 x i64> poison, i64 %97, i64 0, !dbg !41 + %101 = insertelement <2 x i64> %100, i64 %99, i64 1, !dbg !41 + %102 = icmp slt <2 x i64> %101, zeroinitializer, !dbg !41 + %103 = and <2 x i1> %95, %102, !dbg !42 + %104 = select <2 x i1> %103, <2 x i64> %52, <2 x i64> zeroinitializer, !dbg !43 + %105 = sub <2 x i64> zeroinitializer, %104, !dbg !44 + %106 = icmp eq <2 x i64> %94, %105, !dbg !44 + %107 = and <2 x i1> %90, %106, !dbg !23 + %108 = and <2 x i1> %48, %107, !dbg !23 + %109 = or <2 x i1> %84, %108, !dbg !45 + %110 = select <2 x i1> %38, <2 x i1> %70, <2 x i1> zeroinitializer, !dbg !18 + %111 = select <2 x i1> %110, <2 x i1> %109, <2 x i1> zeroinitializer, !dbg !18 + %112 = zext <2 x i1> %111 to <2 x i64>, !dbg !18 + %113 = add <2 x i64> %54, %112, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1024, !dbg !17 + %114 = icmp samesign ult i64 %indvars.iv, 15360, !dbg !17 + br i1 %114, label %53, label %115, !dbg !17 + +115: ; preds = %53 + %116 = and i32 %16, 31, !dbg !9 + %117 = lshr i32 %16, 5, !dbg !9 + %shift = shufflevector <2 x i64> %113, <2 x i64> poison, <2 x i32> , !dbg !46 + %foldExtExtBinop = add <2 x i64> %113, %shift, !dbg !46 + %118 = extractelement <2 x i64> %foldExtExtBinop, i64 0, !dbg !46 + %119 = bitcast <2 x i64> %foldExtExtBinop to <4 x i32>, !dbg !50 + %120 = extractelement <4 x i32> %119, i64 1, !dbg !50 + %121 = trunc i64 %118 to i32, !dbg !50 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !50 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !50 + %124 = insertelement <2 x i32> poison, i32 %122, i64 0, !dbg !50 + %125 = insertelement <2 x i32> %124, i32 %123, i64 1, !dbg !50 + %126 = bitcast <2 x i32> %125 to i64, !dbg !50 + %127 = add i64 %118, %126, !dbg !46 + %extelt.offset1 = lshr i64 %127, 32, !dbg !50 + %128 = trunc nuw i64 %extelt.offset1 to i32, !dbg !50 + %129 = trunc i64 %127 to i32, !dbg !50 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !50 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !50 + %132 = insertelement <2 x i32> poison, i32 %130, i64 0, !dbg !50 + %133 = insertelement <2 x i32> %132, i32 %131, i64 1, !dbg !50 + %134 = bitcast <2 x i32> %133 to i64, !dbg !50 + %135 = add i64 %127, %134, !dbg !46 + %extelt.offset2 = lshr i64 %135, 32, !dbg !50 + %136 = trunc nuw i64 %extelt.offset2 to i32, !dbg !50 + %137 = trunc i64 %135 to i32, !dbg !50 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 4, i32 31), !dbg !50 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !50 + %140 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !50 + %141 = insertelement <2 x i32> %140, i32 %139, i64 1, !dbg !50 + %142 = bitcast <2 x i32> %141 to i64, !dbg !50 + %143 = add i64 %135, %142, !dbg !46 + %extelt.offset3 = lshr i64 %143, 32, !dbg !50 + %144 = trunc nuw i64 %extelt.offset3 to i32, !dbg !50 + %145 = trunc i64 %143 to i32, !dbg !50 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %145, i32 2, i32 31), !dbg !50 + %147 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 2, i32 31), !dbg !50 + %148 = insertelement <2 x i32> poison, i32 %146, i64 0, !dbg !50 + %149 = insertelement <2 x i32> %148, i32 %147, i64 1, !dbg !50 + %150 = bitcast <2 x i32> %149 to i64, !dbg !50 + %151 = add i64 %143, %150, !dbg !46 + %extelt.offset4 = lshr i64 %151, 32, !dbg !50 + %152 = trunc nuw i64 %extelt.offset4 to i32, !dbg !50 + %153 = trunc i64 %151 to i32, !dbg !50 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !50 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !50 + %156 = insertelement <2 x i32> poison, i32 %154, i64 0, !dbg !50 + %157 = insertelement <2 x i32> %156, i32 %155, i64 1, !dbg !50 + %158 = bitcast <2 x i32> %157 to i64, !dbg !50 + %159 = add i64 %151, %158, !dbg !46 + %160 = and i32 %117, 15, !dbg !50 + %161 = icmp eq i32 %116, 0, !dbg !50 + %162 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %160, !dbg !50 + %163 = insertelement <1 x i64> poison, i64 %159, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %162, <1 x i64> %163, i1 %161) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %164 = icmp samesign ult i32 %16, 16, !dbg !50 + %165 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %16, !dbg !50 + %166 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %165, i1 %164) #5, !dbg !50 + %extelt.offset5 = lshr i64 %166, 32, !dbg !50 + %167 = trunc nuw i64 %extelt.offset5 to i32, !dbg !50 + %168 = trunc i64 %166 to i32, !dbg !50 + %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 8, i32 31), !dbg !50 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 8, i32 31), !dbg !50 + %171 = insertelement <2 x i32> poison, i32 %169, i64 0, !dbg !50 + %172 = insertelement <2 x i32> %171, i32 %170, i64 1, !dbg !50 + %173 = bitcast <2 x i32> %172 to i64, !dbg !50 + %174 = add i64 %166, %173, !dbg !46 + %extelt.offset6 = lshr i64 %174, 32, !dbg !50 + %175 = trunc nuw i64 %extelt.offset6 to i32, !dbg !50 + %176 = trunc i64 %174 to i32, !dbg !50 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 4, i32 31), !dbg !50 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 4, i32 31), !dbg !50 + %179 = insertelement <2 x i32> poison, i32 %177, i64 0, !dbg !50 + %180 = insertelement <2 x i32> %179, i32 %178, i64 1, !dbg !50 + %181 = bitcast <2 x i32> %180 to i64, !dbg !50 + %182 = add i64 %174, %181, !dbg !46 + %extelt.offset7 = lshr i64 %182, 32, !dbg !50 + %183 = trunc nuw i64 %extelt.offset7 to i32, !dbg !50 + %184 = trunc i64 %182 to i32, !dbg !50 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 2, i32 31), !dbg !50 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 2, i32 31), !dbg !50 + %187 = insertelement <2 x i32> poison, i32 %185, i64 0, !dbg !50 + %188 = insertelement <2 x i32> %187, i32 %186, i64 1, !dbg !50 + %189 = bitcast <2 x i32> %188 to i64, !dbg !50 + %190 = add i64 %182, %189, !dbg !46 + %extelt.offset8 = lshr i64 %190, 32, !dbg !50 + %191 = trunc nuw i64 %extelt.offset8 to i32, !dbg !50 + %192 = trunc i64 %190 to i32, !dbg !50 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !50 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 1, i32 31), !dbg !50 + %195 = insertelement <2 x i32> poison, i32 %193, i64 0, !dbg !50 + %196 = insertelement <2 x i32> %195, i32 %194, i64 1, !dbg !50 + %197 = bitcast <2 x i32> %196 to i64, !dbg !50 + %198 = add i64 %190, %197, !dbg !46 + %199 = icmp eq i32 %16, 0, !dbg !50 + %200 = insertelement <1 x i64> poison, i64 %198, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %165, <1 x i64> %200, i1 %199) #5, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %201 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !50 + %202 = add i64 %201, -1, !dbg !51 + %203 = icmp ult i64 %202, 16383, !dbg !51 + %204 = zext i1 %203 to i32, !dbg !52 + %205 = icmp eq i64 %201, 16384, !dbg !53 + %206 = zext i1 %205 to i32, !dbg !52 + %207 = getelementptr i32, ptr addrspace(1) %1, i64 %18, !dbg !54 + %208 = and i32 %16, 511, !dbg !55 + %209 = icmp eq i32 %208, 0, !dbg !55 + %210 = and i1 %209, %15, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %204, ptr addrspace(1) %207, i1 %210) #5, !dbg !55 + %211 = getelementptr i32, ptr addrspace(1) %2, i64 %18, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %206, ptr addrspace(1) %211, i1 %210) #5, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.smin.i64(i64, i64) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 21, scope: !4) +!11 = !DILocation(line: 27, column: 28, scope: !4) +!12 = !DILocation(line: 28, column: 19, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 49, column: 35, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 86, column: 50, scope: !4) +!19 = !DILocation(line: 45, column: 22, scope: !4) +!20 = !DILocation(line: 39, column: 22, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 48, column: 23, scope: !4) +!23 = !DILocation(line: 79, column: 24, scope: !4) +!24 = !DILocation(line: 69, column: 51, scope: !4) +!25 = !DILocation(line: 70, column: 25, scope: !4) +!26 = !DILocation(line: 33, column: 31, scope: !4) +!27 = !DILocation(line: 37, column: 27, scope: !4) +!28 = !DILocation(line: 49, column: 77, scope: !4) +!29 = !DILocation(line: 69, column: 24, scope: !4) +!30 = !DILocation(line: 69, column: 38, scope: !4) +!31 = !DILocation(line: 49, column: 94, scope: !4) +!32 = !DILocation(line: 50, column: 23, scope: !4) +!33 = !DILocation(line: 51, column: 23, scope: !4) +!34 = !DILocation(line: 52, column: 24, scope: !4) +!35 = !DILocation(line: 53, column: 23, scope: !4) +!36 = !DILocation(line: 58, column: 24, scope: !4) +!37 = !DILocation(line: 60, column: 25, scope: !4) +!38 = !DILocation(line: 66, column: 39, scope: !4) +!39 = !DILocation(line: 67, column: 24, scope: !4) +!40 = !DILocation(line: 71, column: 25, scope: !4) +!41 = !DILocation(line: 73, column: 25, scope: !4) +!42 = !DILocation(line: 74, column: 24, scope: !4) +!43 = !DILocation(line: 76, column: 39, scope: !4) +!44 = !DILocation(line: 78, column: 25, scope: !4) +!45 = !DILocation(line: 80, column: 24, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !49) +!47 = distinct !DILexicalBlockFile(scope: !4, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 87, column: 27, scope: !4) +!50 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !49) +!51 = !DILocation(line: 92, column: 20, scope: !4) +!52 = !DILocation(line: 0, scope: !4) +!53 = !DILocation(line: 95, column: 21, scope: !4) +!54 = !DILocation(line: 98, column: 25, scope: !4) +!55 = !DILocation(line: 98, column: 37, scope: !4) +!56 = !DILocation(line: 99, column: 25, scope: !4) +!57 = !DILocation(line: 99, column: 37, scope: !4) +!58 = !DILocation(line: 99, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..526e5ebd957e3d32772ba5244dcbe4511b6b08c4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,736 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_10, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_11, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_12 +) +.reqntid 512 +{ + .reg .pred %p<53>; + .reg .b32 %r<76>; + .reg .b64 %rd<162>; + .loc 1 18 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:18:0 +$L__func_begin0: + .loc 1 18 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:18:0 + +// %bb.0: + ld.param.b64 %rd47, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; +$L__tmp0: + .loc 1 22 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:22:28 + mov.u32 %r7, %ctaid.x; + .loc 1 27 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:27:21 + cvt.u64.u32 %rd1, %r7; + ld.param.b64 %rd52, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + and.b64 %rd53, %rd52, -4294967296; + setp.ne.b64 %p11, %rd53, 0; + cvt.u32.u64 %r74, %rd1; + @%p11 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd153, %rd1, %rd52; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r8, %rd52; + div.u32 %r10, %r74, %r8; + cvt.u64.u32 %rd153, %r10; +$L__BB0_3: + .loc 1 0 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:21 + ld.param.b64 %rd50, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7]; + .loc 1 27 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:27:28 + or.b64 %rd54, %rd153, %rd47; + and.b64 %rd55, %rd54, -4294967296; + setp.ne.b64 %p12, %rd55, 0; + @%p12 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd154, %rd153, %rd47; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r11, %rd47; + cvt.u32.u64 %r12, %rd153; + rem.u32 %r13, %r12, %r11; + cvt.u64.u32 %rd154, %r13; +$L__BB0_6: + .loc 1 0 28 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:28 + ld.param.b32 %r6, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9]; + ld.param.b64 %rd51, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8]; + ld.param.b64 %rd49, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd44, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + mov.u32 %r1, %tid.x; + .loc 1 28 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:28:19 + mul.lo.s64 %rd56, %rd153, %rd52; + sub.s64 %rd9, %rd1, %rd56; + .loc 1 29 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:29:19 + and.b64 %rd57, %rd50, -4294967296; + setp.ne.b64 %p13, %rd57, 0; + @%p13 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd155, %rd1, %rd50; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r14, %rd50; + div.u32 %r16, %r74, %r14; + cvt.u64.u32 %rd155, %r16; +$L__BB0_9: + .loc 1 0 19 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0:19 + ld.param.b64 %rd48, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; + ld.param.b64 %rd46, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd45, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:24:21 + setp.lt.s32 %p1, %r74, %r6; + .loc 1 39 26 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:26 + shl.b64 %rd16, %rd154, 7; + .loc 1 42 26 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:42:26 + shl.b64 %rd61, %rd9, 7; + .loc 1 49 35 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:35 + shl.b64 %rd62, %rd155, 3; + add.s64 %rd70, %rd44, %rd62; + and.b32 %r2, %r1, 127; + cvt.u64.u32 %rd63, %r2; + or.b64 %rd20, %rd61, %rd63; + setp.lt.s64 %p3, %rd20, %rd49; + setp.ge.s64 %p5, %rd20, %rd51; + min.s64 %rd15, %rd51, 0; + sub.s64 %rd64, %rd9, %rd154; + shl.b64 %rd22, %rd64, 7; + .loc 1 32 40 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:32:40 + cvt.u64.u32 %rd65, %r1; + shr.u64 %rd66, %rd65, 7; + cvt.u32.u64 %r75, %rd66; + shr.u32 %r18, %r1, 7; + cvt.u64.u32 %rd67, %r18; + and.b64 %rd157, %rd67, 3; + sub.s64 %rd156, %rd63, %rd157; + mov.b64 %rd159, 0; + mov.b64 %rd158, -1024; + mov.b64 %rd160, %rd159; + bra.uni $L__BB0_10; +$L__BB0_12: // in Loop: Header=BB0_10 Depth=1 + .loc 1 58 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:58:24 + rem.s64 %rd161, %rd20, %rd51; +$L__BB0_13: // in Loop: Header=BB0_10 Depth=1 + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + sub.s32 %r21, %r2, %r20; + cvt.s64.s32 %rd33, %r21; + .loc 1 60 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:60:25 + setp.eq.b64 %p24, %rd161, 0; + .loc 1 66 39 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:66:39 + selp.b64 %rd83, 0, %rd15, %p24; + add.s64 %rd84, %rd83, %rd161; + .loc 1 67 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:67:24 + setp.lt.s64 %p25, %rd84, %rd69; + setp.lt.s64 %p26, %rd84, %rd73; + .loc 1 69 51 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:69:51 + add.s64 %rd85, %rd22, %rd33; + add.s64 %rd86, %rd22, %rd156; + .loc 1 70 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:70:25 + rem.s64 %rd87, %rd86, %rd51; + rem.s64 %rd88, %rd85, %rd51; + .loc 1 71 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:71:25 + setp.ne.b64 %p27, %rd88, 0; + setp.ne.b64 %p28, %rd87, 0; + .loc 1 73 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:73:25 + xor.b64 %rd89, %rd87, %rd51; + xor.b64 %rd90, %rd88, %rd51; + .loc 1 76 39 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:76:39 + shr.s64 %rd91, %rd89, 63; + and.b64 %rd92, %rd91, %rd51; + selp.b64 %rd93, %rd92, 0, %p28; + shr.s64 %rd94, %rd90, 63; + and.b64 %rd95, %rd94, %rd51; + selp.b64 %rd96, %rd95, 0, %p27; + .loc 1 78 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:78:25 + neg.s64 %rd97, %rd96; + neg.s64 %rd98, %rd93; + setp.eq.b64 %p29, %rd87, %rd98; + setp.eq.b64 %p30, %rd88, %rd97; + .loc 1 79 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:79:24 + and.pred %p31, %p26, %p30; + and.pred %p33, %p25, %p29; + and.pred %p35, %p5, %p33; + and.pred %p36, %p5, %p31; + .loc 1 80 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:80:24 + or.pred %p37, %p10, %p36; + or.pred %p38, %p9, %p35; + .loc 1 86 50 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:86:50 + and.pred %p41, %p14, %p38; + and.pred %p42, %p15, %p37; + selp.b64 %rd99, 1, 0, %p42; + selp.b64 %rd100, 1, 0, %p41; + add.s64 %rd159, %rd159, %rd100; + add.s64 %rd160, %rd160, %rd99; + .loc 1 32 40 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:32:40 + add.s64 %rd158, %rd158, 1024; + add.s32 %r75, %r75, 8; + add.s64 %rd157, %rd157, 8; + add.s64 %rd156, %rd156, -8; + setp.lt.u64 %p43, %rd158, 15360; + @%p43 bra $L__BB0_10; + bra.uni $L__BB0_14; +$L__BB0_10: // =>This Inner Loop Header: Depth=1 + .loc 1 37 27 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:37:27 + or.b32 %r20, %r75, 4; + .loc 1 39 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:22 + cvt.u64.u32 %rd76, %r20; + .loc 1 49 77 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:77 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd68, 1.0; + // end inline asm + .loc 1 39 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:39:22 + or.b64 %rd77, %rd16, %rd76; + or.b64 %rd78, %rd16, %rd157; + .loc 1 41 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:41:22 + setp.lt.s64 %p17, %rd78, %rd48; + setp.lt.s64 %p18, %rd77, %rd48; + .loc 1 45 22 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:45:22 + and.pred %p8, %p3, %p18; + and.pred %p7, %p3, %p17; + .loc 1 48 23 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:48:23 + setp.ge.s64 %p19, %rd77, %rd20; + setp.ge.s64 %p20, %rd78, %rd20; + .loc 1 49 94 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:94 + and.pred %p14, %p1, %p7; + and.pred %p15, %p1, %p8; + .loc 1 49 77 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:49:77 + // begin inline asm + mov.u64 %rd69, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd69 }, [ %rd70 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd72, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd73 }, [ %rd70 + 0 ], %rd72; + // end inline asm + .loc 1 52 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:52:24 + max.s64 %rd79, %rd20, %rd77; + setp.lt.s64 %p21, %rd79, %rd73; + max.s64 %rd80, %rd20, %rd78; + setp.lt.s64 %p22, %rd80, %rd69; + .loc 1 53 23 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:53:23 + and.pred %p9, %p20, %p22; + and.pred %p10, %p19, %p21; + .loc 1 58 24 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:58:24 + or.b64 %rd81, %rd20, %rd51; + and.b64 %rd82, %rd81, -4294967296; + setp.ne.b64 %p23, %rd82, 0; + @%p23 bra $L__BB0_12; +// %bb.11: // in Loop: Header=BB0_10 Depth=1 + cvt.u32.u64 %r22, %rd51; + cvt.u32.u64 %r23, %rd20; + rem.u32 %r24, %r23, %r22; + cvt.u64.u32 %rd161, %r24; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 25 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:25:37 + and.b32 %r31, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd106, %rd159, %rd160; + mov.b64 {_, %r32}, %rd106; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + cvt.u32.u64 %r33, %rd106; + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 16, 31, -1; + cvt.u64.u32 %rd107, %r34; + cvt.u64.u32 %rd108, %r35; + shl.b64 %rd109, %rd108, 32; + or.b64 %rd110, %rd107, %rd109; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd111, %rd106, %rd110; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r36}, %rd111; + cvt.u32.u64 %r37, %rd111; + shfl.sync.bfly.b32 %r38, %r37, 8, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 8, 31, -1; + cvt.u64.u32 %rd112, %r38; + cvt.u64.u32 %rd113, %r39; + shl.b64 %rd114, %rd113, 32; + or.b64 %rd115, %rd112, %rd114; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd116, %rd111, %rd115; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r40}, %rd116; + cvt.u32.u64 %r41, %rd116; + shfl.sync.bfly.b32 %r42, %r41, 4, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 4, 31, -1; + cvt.u64.u32 %rd117, %r42; + cvt.u64.u32 %rd118, %r43; + shl.b64 %rd119, %rd118, 32; + or.b64 %rd120, %rd117, %rd119; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd121, %rd116, %rd120; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r44}, %rd121; + cvt.u32.u64 %r45, %rd121; + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + shfl.sync.bfly.b32 %r47, %r44, 2, 31, -1; + cvt.u64.u32 %rd122, %r46; + cvt.u64.u32 %rd123, %r47; + shl.b64 %rd124, %rd123, 32; + or.b64 %rd125, %rd122, %rd124; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd126, %rd121, %rd125; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r48}, %rd126; + cvt.u32.u64 %r49, %rd126; + shfl.sync.bfly.b32 %r50, %r49, 1, 31, -1; + shfl.sync.bfly.b32 %r51, %r48, 1, 31, -1; + cvt.u64.u32 %rd127, %r50; + cvt.u64.u32 %rd128, %r51; + shl.b64 %rd129, %rd128, 32; + or.b64 %rd130, %rd127, %rd129; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd101, %rd126, %rd130; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + setp.eq.b32 %p44, %r31, 0; + shr.u32 %r52, %r1, 2; + and.b32 %r53, %r52, 120; + mov.b32 %r54, global_smem; + add.s32 %r25, %r54, %r53; + // begin inline asm + @%p44 st.shared.b64 [ %r25 + 0 ], %rd101; + // end inline asm + bar.sync 0; + setp.lt.u32 %p45, %r1, 16; + shl.b32 %r55, %r1, 3; + add.s32 %r26, %r54, %r55; + // begin inline asm + @%p45 ld.shared.b64 %rd102, [ %r26 + 0 ]; + // end inline asm + mov.b64 {_, %r56}, %rd102; + cvt.u32.u64 %r57, %rd102; + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + shfl.sync.bfly.b32 %r59, %r56, 8, 31, -1; + cvt.u64.u32 %rd131, %r58; + cvt.u64.u32 %rd132, %r59; + shl.b64 %rd133, %rd132, 32; + or.b64 %rd134, %rd131, %rd133; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd135, %rd102, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r60}, %rd135; + cvt.u32.u64 %r61, %rd135; + shfl.sync.bfly.b32 %r62, %r61, 4, 31, -1; + shfl.sync.bfly.b32 %r63, %r60, 4, 31, -1; + cvt.u64.u32 %rd136, %r62; + cvt.u64.u32 %rd137, %r63; + shl.b64 %rd138, %rd137, 32; + or.b64 %rd139, %rd136, %rd138; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd140, %rd135, %rd139; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r64}, %rd140; + cvt.u32.u64 %r65, %rd140; + shfl.sync.bfly.b32 %r66, %r65, 2, 31, -1; + shfl.sync.bfly.b32 %r67, %r64, 2, 31, -1; + cvt.u64.u32 %rd141, %r66; + cvt.u64.u32 %rd142, %r67; + shl.b64 %rd143, %rd142, 32; + or.b64 %rd144, %rd141, %rd143; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd145, %rd140, %rd144; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + mov.b64 {_, %r68}, %rd145; + cvt.u32.u64 %r69, %rd145; + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + shfl.sync.bfly.b32 %r71, %r68, 1, 31, -1; + cvt.u64.u32 %rd146, %r70; + cvt.u64.u32 %rd147, %r71; + shl.b64 %rd148, %rd147, 32; + or.b64 %rd149, %rd146, %rd148; + .loc 2 261 15 // standard.py:261:15 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + add.s64 %rd103, %rd145, %rd149; + .loc 2 291 36 // standard.py:291:36 @[ coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:87:27 ] + setp.eq.b32 %p46, %r1, 0; + // begin inline asm + @%p46 st.shared.b64 [ %r26 + 0 ], %rd103; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd150, [global_smem]; +$L__tmp2: + .loc 1 92 20 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:92:20 + add.s64 %rd151, %rd150, -1; + setp.lt.u64 %p50, %rd151, 16383; + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + selp.b32 %r28, 1, 0, %p50; + .loc 1 95 21 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:95:21 + setp.eq.b64 %p51, %rd150, 16384; + .loc 1 0 0 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:0 + selp.b32 %r29, 1, 0, %p51; + .loc 1 98 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:98:25 + shl.b64 %rd152, %rd1, 2; + add.s64 %rd104, %rd45, %rd152; + .loc 1 98 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:98:37 + and.b32 %r72, %r1, 511; + setp.eq.b32 %p52, %r72, 0; + and.pred %p47, %p52, %p1; + // begin inline asm + @%p47 st.global.b32 [ %rd104 + 0 ], { %r28 }; + // end inline asm + .loc 1 99 25 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:25 + add.s64 %rd105, %rd46, %rd152; + .loc 1 99 37 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:37 + // begin inline asm + @%p47 st.global.b32 [ %rd105 + 0 ], { %r29 }; + // end inline asm + .loc 1 99 4 // coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py:99:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 113 +.b8 52 +.b8 118 +.b8 112 +.b8 109 +.b8 106 +.b8 121 +.b8 110 +.b8 104 +.b8 115 +.b8 98 +.b8 114 +.b8 51 +.b8 101 +.b8 104 +.b8 55 +.b8 51 +.b8 105 +.b8 55 +.b8 115 +.b8 116 +.b8 106 +.b8 105 +.b8 51 +.b8 110 +.b8 101 +.b8 115 +.b8 114 +.b8 115 +.b8 52 +.b8 107 +.b8 113 +.b8 55 +.b8 110 +.b8 120 +.b8 111 +.b8 113 +.b8 111 +.b8 53 +.b8 53 +.b8 103 +.b8 117 +.b8 120 +.b8 51 +.b8 50 +.b8 50 +.b8 97 +.b8 55 +.b8 113 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 87 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..f077d156bf4d4da3164502ddf40981ab0bbcac0a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,418 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc99 = loc(unknown) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc106 = loc("in_ptr0"(#loc)) +#loc107 = loc("out_ptr1"(#loc)) +#loc108 = loc("out_ptr2"(#loc)) +#loc109 = loc("ks0"(#loc)) +#loc110 = loc("ks1"(#loc)) +#loc111 = loc("ks2"(#loc)) +#loc112 = loc("ks3"(#loc)) +#loc113 = loc("ks4"(#loc)) +#loc114 = loc("ks5"(#loc)) +#loc115 = loc("xnumel"(#loc)) +#loc116 = loc("r0_numel"(#loc)) +#loc207 = loc("input"(#loc97)) +#loc208 = loc("a"(#loc102)) +#loc209 = loc("b"(#loc102)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc117) + %xoffset = tt.get_program_id x : i32 loc(#loc118) + %xoffset_1 = arith.constant 1 : i32 loc(#loc119) + %xoffset_2 = arith.constant 1 : i32 loc(#loc119) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc119) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc120) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc121) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc122) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc123) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc123) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc124) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc125) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc126) + %x1_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc126) + %x1_10 = arith.divsi %x1, %x1_9 : tensor<1x1xi64> loc(#loc126) + %x1_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc127) + %x1_12 = arith.remsi %x1_10, %x1_11 : tensor<1x1xi64> loc(#loc127) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc128) + %x0_13 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc128) + %x0_14 = arith.remsi %x0, %x0_13 : tensor<1x1xi64> loc(#loc128) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc129) + %x2_15 = tt.splat %ks4 : i64 -> tensor<1x1xi64> loc(#loc129) + %x2_16 = arith.divsi %x2, %x2_15 : tensor<1x1xi64> loc(#loc129) + %_tmp46 = arith.constant 0 : i64 loc(#loc130) + %_tmp46_17 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc130) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp46_18 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp46_22 = %_tmp46_17) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc132) + %r0_index_23 = arith.addi %r0_index, %r0_base_8 : tensor<1x1024xi32> loc(#loc132) + %r0_mask = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc133) + %r0_mask_24 = arith.cmpi slt, %r0_index_23, %r0_mask : tensor<1x1024xi32> loc(#loc133) + %r0_4 = arith.constant 128 : i32 loc(#loc134) + %r0_4_25 = arith.constant 128 : i32 loc(#loc134) + %r0_4_26 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc134) + %r0_4_27 = arith.divsi %r0_index_23, %r0_4_26 : tensor<1x1024xi32> loc(#loc134) + %r0_3 = arith.constant 128 : i32 loc(#loc135) + %r0_3_28 = arith.constant 128 : i32 loc(#loc135) + %r0_3_29 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc135) + %r0_3_30 = arith.remsi %r0_index_23, %r0_3_29 : tensor<1x1024xi32> loc(#loc135) + %tmp0 = arith.constant 128 : i32 loc(#loc136) + %tmp0_31 = arith.constant 128 : i64 loc(#loc136) + %tmp0_32 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc136) + %tmp0_33 = arith.muli %tmp0_32, %x1_12 : tensor<1x1xi64> loc(#loc136) + %tmp0_34 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc137) + %tmp0_35 = tt.broadcast %tmp0_33 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc137) + %tmp0_36 = arith.addi %tmp0_34, %tmp0_35 : tensor<1x1024xi64> loc(#loc137) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc138) + %tmp2_37 = arith.cmpi slt, %tmp0_36, %tmp2 : tensor<1x1024xi64> loc(#loc138) + %tmp3 = arith.constant 128 : i32 loc(#loc139) + %tmp3_38 = arith.constant 128 : i64 loc(#loc139) + %tmp3_39 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc139) + %tmp3_40 = arith.muli %tmp3_39, %x0_14 : tensor<1x1xi64> loc(#loc139) + %tmp3_41 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc140) + %tmp3_42 = tt.broadcast %tmp3_40 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc140) + %tmp3_43 = arith.addi %tmp3_41, %tmp3_42 : tensor<1x1024xi64> loc(#loc140) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc141) + %tmp5_44 = arith.cmpi slt, %tmp3_43, %tmp5 : tensor<1x1024xi64> loc(#loc141) + %tmp6 = arith.andi %tmp2_37, %tmp5_44 : tensor<1x1024xi1> loc(#loc142) + %tmp7 = arith.constant 128 : i32 loc(#loc143) + %tmp7_45 = arith.constant 128 : i64 loc(#loc143) + %tmp7_46 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc143) + %tmp7_47 = arith.muli %tmp7_46, %x1_12 : tensor<1x1xi64> loc(#loc143) + %tmp7_48 = arith.extsi %r0_4_27 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc144) + %tmp7_49 = tt.broadcast %tmp7_47 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc144) + %tmp7_50 = arith.addi %tmp7_48, %tmp7_49 : tensor<1x1024xi64> loc(#loc144) + %tmp8 = arith.constant 128 : i32 loc(#loc145) + %tmp8_51 = arith.constant 128 : i64 loc(#loc145) + %tmp8_52 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc145) + %tmp8_53 = arith.muli %tmp8_52, %x0_14 : tensor<1x1xi64> loc(#loc145) + %tmp8_54 = arith.extsi %r0_3_30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc146) + %tmp8_55 = tt.broadcast %tmp8_53 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc146) + %tmp8_56 = arith.addi %tmp8_54, %tmp8_55 : tensor<1x1024xi64> loc(#loc146) + %tmp9 = arith.cmpi sge, %tmp7_50, %tmp8_56 : tensor<1x1024xi64> loc(#loc147) + %tmp10 = tt.broadcast %x2_16 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc148) + %tmp10_57 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc149) + %tmp10_58 = tt.addptr %tmp10_57, %tmp10 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> loc(#loc149) + %tmp10_59 = arith.andi %r0_mask_24, %tmp6 : tensor<1x1024xi1> loc(#loc150) + %tmp10_60 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc151) + %tmp10_61 = arith.andi %tmp10_59, %tmp10_60 : tensor<1x1024xi1> loc(#loc151) + %tmp10_62 = arith.constant 0.000000e+00 : f32 loc(#loc152) + %tmp10_63 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> loc(#loc152) + %tmp10_64 = arith.fptosi %tmp10_63 : tensor<1x1024xf32> to tensor<1x1024xi64> loc(#loc152) + %tmp10_65 = tt.load %tmp10_58, %tmp10_61, %tmp10_64 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc152) + %tmp11 = arith.cmpi slt, %tmp8_56, %tmp10_65 : tensor<1x1024xi64> loc(#loc153) + %tmp12 = arith.cmpi slt, %tmp7_50, %tmp10_65 : tensor<1x1024xi64> loc(#loc154) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc155) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc156) + %tmp15 = arith.constant false loc(#loc157) + %tmp15_66 = arith.constant dense : tensor<1x1xi1> loc(#loc157) + %tmp16 = arith.constant dense : tensor<1x1024xi1> loc(#loc158) + %tmp16_67 = arith.ori %tmp16, %tmp14 : tensor<1x1024xi1> loc(#loc158) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc159) + %tmp18 = arith.cmpi sge, %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc160) + %tmp19 = arith.remsi %tmp8_56, %tmp17 : tensor<1x1024xi64> loc(#loc161) + %tmp20 = arith.constant 0 : i32 loc(#loc162) + %tmp20_68 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc162) + %tmp21 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc163) + %tmp21_69 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc163) + %tmp21_70 = arith.cmpi ne, %tmp19, %tmp21_69 : tensor<1x1024xi64> loc(#loc163) + %tmp22 = arith.constant 0 : i32 loc(#loc164) + %tmp22_71 = arith.extsi %tmp22 : i32 to i64 loc(#loc164) + %tmp22_72 = tt.splat %tmp22_71 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp22_73 = arith.cmpi slt, %tmp19, %tmp22_72 : tensor<1x1024xi64> loc(#loc164) + %tmp23 = arith.constant 0 : i32 loc(#loc165) + %tmp23_74 = arith.extsi %tmp23 : i32 to i64 loc(#loc165) + %tmp23_75 = tt.splat %tmp23_74 : i64 -> tensor<1x1024xi64> loc(#loc165) + %tmp23_76 = arith.cmpi slt, %tmp17, %tmp23_75 : tensor<1x1024xi64> loc(#loc165) + %tmp24 = arith.cmpi ne, %tmp22_73, %tmp23_76 : tensor<1x1024xi1> loc(#loc166) + %tmp25 = arith.andi %tmp21_70, %tmp24 : tensor<1x1024xi1> loc(#loc167) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc168) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc169) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_65 : tensor<1x1024xi64> loc(#loc170) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc171) + %tmp30 = arith.constant -1 : i32 loc(#loc172) + %tmp30_77 = arith.constant -1 : i32 loc(#loc172) + %tmp30_78 = arith.constant dense<-1> : tensor<1x1024xi32> loc(#loc172) + %tmp30_79 = arith.muli %tmp30_78, %r0_4_27 : tensor<1x1024xi32> loc(#loc172) + %tmp30_80 = arith.addi %r0_3_30, %tmp30_79 : tensor<1x1024xi32> loc(#loc173) + %tmp30_81 = arith.constant -128 : i32 loc(#loc174) + %tmp30_82 = arith.constant -128 : i64 loc(#loc174) + %tmp30_83 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc174) + %tmp30_84 = arith.muli %tmp30_83, %x1_12 : tensor<1x1xi64> loc(#loc174) + %tmp30_85 = arith.extsi %tmp30_80 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc175) + %tmp30_86 = tt.broadcast %tmp30_84 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc175) + %tmp30_87 = arith.addi %tmp30_85, %tmp30_86 : tensor<1x1024xi64> loc(#loc175) + %tmp30_88 = arith.constant 128 : i32 loc(#loc176) + %tmp30_89 = arith.constant 128 : i64 loc(#loc176) + %tmp30_90 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc176) + %tmp30_91 = arith.muli %tmp30_90, %x0_14 : tensor<1x1xi64> loc(#loc176) + %tmp30_92 = tt.broadcast %tmp30_91 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc177) + %tmp30_93 = arith.addi %tmp30_87, %tmp30_92 : tensor<1x1024xi64> loc(#loc177) + %tmp31 = arith.remsi %tmp30_93, %tmp17 : tensor<1x1024xi64> loc(#loc178) + %tmp32 = arith.extsi %tmp20_68 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc179) + %tmp32_94 = tt.broadcast %tmp32 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc179) + %tmp32_95 = arith.cmpi ne, %tmp31, %tmp32_94 : tensor<1x1024xi64> loc(#loc179) + %tmp33 = arith.constant 0 : i32 loc(#loc180) + %tmp33_96 = arith.extsi %tmp33 : i32 to i64 loc(#loc180) + %tmp33_97 = tt.splat %tmp33_96 : i64 -> tensor<1x1024xi64> loc(#loc180) + %tmp33_98 = arith.cmpi slt, %tmp31, %tmp33_97 : tensor<1x1024xi64> loc(#loc180) + %tmp34 = arith.cmpi ne, %tmp33_98, %tmp23_76 : tensor<1x1024xi1> loc(#loc181) + %tmp35 = arith.andi %tmp32_95, %tmp34 : tensor<1x1024xi1> loc(#loc182) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc183) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc184) + %tmp38 = arith.constant 0 : i64 loc(#loc185) + %tmp38_99 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc185) + %tmp39 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc186) + %tmp39_100 = arith.cmpi eq, %tmp37, %tmp39 : tensor<1x1024xi64> loc(#loc186) + %tmp40 = arith.andi %tmp29, %tmp39_100 : tensor<1x1024xi1> loc(#loc187) + %tmp41 = arith.ori %tmp16_67, %tmp40 : tensor<1x1024xi1> loc(#loc188) + %tmp42 = arith.constant false loc(#loc189) + %tmp42_101 = arith.constant dense : tensor<1x1024xi1> loc(#loc189) + %tmp43 = arith.select %tmp6, %tmp41, %tmp42_101 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc190) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc191) + %tmp47 = arith.addi %_tmp46_22, %tmp44 : tensor<1x1024xi64> loc(#loc192) + %_tmp46_102 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc193) + %_tmp46_103 = arith.andi %r0_mask_24, %_tmp46_102 : tensor<1x1024xi1> loc(#loc193) + %_tmp46_104 = arith.select %_tmp46_103, %tmp47, %_tmp46_22 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc194) + scf.yield %_tmp46_104 : tensor<1x1024xi64> loc(#loc79) + } loc(#loc131) + %tmp46 = tt.call @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp46_18) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc195) + %tmp46_19 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc196) + %tmp48 = arith.constant 0 : i64 loc(#loc197) + %tmp48_20 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc197) + %tmp49 = arith.cmpi sgt, %tmp46_19, %tmp48_20 : tensor<1x1xi64> loc(#loc198) + %tmp50 = arith.constant 16384 : i64 loc(#loc199) + %tmp50_21 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc199) + %tmp51 = arith.cmpi slt, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc200) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc201) + %tmp53 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc202) + %tmp54 = arith.extsi %tmp53 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc203) + %tmp55 = arith.cmpi eq, %tmp46_19, %tmp50_21 : tensor<1x1xi64> loc(#loc204) + %tmp56 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc205) + %tmp57 = arith.extsi %tmp56 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc206) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc92) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc92) + tt.store %5, %tmp54, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc93) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc94) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc94) + tt.store %7, %tmp57, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc95) + tt.return loc(#loc96) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x1024xi64> loc("input"(#loc97))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc98) + tt.reduce.return %2 : i64 loc(#loc98) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc98) + tt.return %0 : tensor<1xi64> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc101) + tt.return %1 : tensor<1xi64> loc(#loc101) + } loc(#loc97) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc102)), %b: i64 loc("b"(#loc102))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc103) + tt.return %0 : i64 loc(#loc104) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc105) + tt.return %1 : i64 loc(#loc105) + } loc(#loc102) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":46:26) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":46:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":47:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":47:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:55) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":54:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":55:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":59:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":77:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":81:44) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":88:31) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":90:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc117 = loc("r0_numel"(#loc1)) +#loc118 = loc("xoffset"(#loc2)) +#loc119 = loc("xoffset"(#loc3)) +#loc120 = loc("xindex"(#loc4)) +#loc121 = loc("xindex"(#loc5)) +#loc122 = loc("xindex"(#loc6)) +#loc123 = loc("xmask"(#loc7)) +#loc124 = loc("r0_base"(#loc8)) +#loc125 = loc("r0_base"(#loc9)) +#loc126 = loc("x1"(#loc10)) +#loc127 = loc("x1"(#loc11)) +#loc128 = loc("x0"(#loc12)) +#loc129 = loc("x2"(#loc13)) +#loc130 = loc("_tmp46"(#loc14)) +#loc131 = loc("_tmp46"(#loc15)) +#loc132 = loc("r0_index"(#loc16)) +#loc133 = loc("r0_mask"(#loc17)) +#loc134 = loc("r0_4"(#loc18)) +#loc135 = loc("r0_3"(#loc19)) +#loc136 = loc("tmp0"(#loc20)) +#loc137 = loc("tmp0"(#loc21)) +#loc138 = loc("tmp2"(#loc22)) +#loc139 = loc("tmp3"(#loc23)) +#loc140 = loc("tmp3"(#loc24)) +#loc141 = loc("tmp5"(#loc25)) +#loc142 = loc("tmp6"(#loc26)) +#loc143 = loc("tmp7"(#loc27)) +#loc144 = loc("tmp7"(#loc28)) +#loc145 = loc("tmp8"(#loc29)) +#loc146 = loc("tmp8"(#loc30)) +#loc147 = loc("tmp9"(#loc31)) +#loc148 = loc("tmp10"(#loc32)) +#loc149 = loc("tmp10"(#loc33)) +#loc150 = loc("tmp10"(#loc34)) +#loc151 = loc("tmp10"(#loc35)) +#loc152 = loc("tmp10"(#loc36)) +#loc153 = loc("tmp11"(#loc37)) +#loc154 = loc("tmp12"(#loc38)) +#loc155 = loc("tmp13"(#loc39)) +#loc156 = loc("tmp14"(#loc40)) +#loc157 = loc("tmp15"(#loc41)) +#loc158 = loc("tmp16"(#loc42)) +#loc159 = loc("tmp17"(#loc43)) +#loc160 = loc("tmp18"(#loc44)) +#loc161 = loc("tmp19"(#loc45)) +#loc162 = loc("tmp20"(#loc46)) +#loc163 = loc("tmp21"(#loc47)) +#loc164 = loc("tmp22"(#loc48)) +#loc165 = loc("tmp23"(#loc49)) +#loc166 = loc("tmp24"(#loc50)) +#loc167 = loc("tmp25"(#loc51)) +#loc168 = loc("tmp26"(#loc52)) +#loc169 = loc("tmp27"(#loc53)) +#loc170 = loc("tmp28"(#loc54)) +#loc171 = loc("tmp29"(#loc55)) +#loc172 = loc("tmp30"(#loc56)) +#loc173 = loc("tmp30"(#loc57)) +#loc174 = loc("tmp30"(#loc58)) +#loc175 = loc("tmp30"(#loc59)) +#loc176 = loc("tmp30"(#loc60)) +#loc177 = loc("tmp30"(#loc61)) +#loc178 = loc("tmp31"(#loc62)) +#loc179 = loc("tmp32"(#loc63)) +#loc180 = loc("tmp33"(#loc64)) +#loc181 = loc("tmp34"(#loc65)) +#loc182 = loc("tmp35"(#loc66)) +#loc183 = loc("tmp36"(#loc67)) +#loc184 = loc("tmp37"(#loc68)) +#loc185 = loc("tmp38"(#loc69)) +#loc186 = loc("tmp39"(#loc70)) +#loc187 = loc("tmp40"(#loc71)) +#loc188 = loc("tmp41"(#loc72)) +#loc189 = loc("tmp42"(#loc73)) +#loc190 = loc("tmp43"(#loc74)) +#loc191 = loc("tmp44"(#loc75)) +#loc192 = loc("tmp47"(#loc76)) +#loc193 = loc("_tmp46"(#loc77)) +#loc194 = loc("_tmp46"(#loc78)) +#loc195 = loc("tmp46"(#loc80)) +#loc196 = loc("tmp46"(#loc81)) +#loc197 = loc("tmp48"(#loc82)) +#loc198 = loc("tmp49"(#loc83)) +#loc199 = loc("tmp50"(#loc84)) +#loc200 = loc("tmp51"(#loc85)) +#loc201 = loc("tmp52"(#loc86)) +#loc202 = loc("tmp53"(#loc87)) +#loc203 = loc("tmp54"(#loc88)) +#loc204 = loc("tmp55"(#loc89)) +#loc205 = loc("tmp56"(#loc90)) +#loc206 = loc("tmp57"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..85a9e2f057858bb3f205256872e1332cc3a20e14 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,280 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc1 = loc(unknown) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc79 = loc("in_ptr0"(#loc)) +#loc80 = loc("out_ptr1"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("ks0"(#loc)) +#loc83 = loc("ks1"(#loc)) +#loc84 = loc("ks2"(#loc)) +#loc85 = loc("ks3"(#loc)) +#loc86 = loc("ks4"(#loc)) +#loc87 = loc("ks5"(#loc)) +#loc88 = loc("xnumel"(#loc)) +#loc89 = loc("r0_numel"(#loc)) +#loc149 = loc("tmp46"(#loc63)) +#loc164 = loc(callsite(#loc1 at #loc149)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x1024xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x1024xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x1024xi1, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1024xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc91) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc92) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked> loc(#loc92) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc93) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc93) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc94) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc95) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc96) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc97) + %tmp0_8 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc159) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc100) + %tmp3_9 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc160) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc103) + %tmp10_10 = tt.splat %xmask : i1 -> tensor<1x1024xi1, #blocked> loc(#loc161) + %tmp10_11 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc106) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc107) + %tmp23_12 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1, #blocked> loc(#loc107) + %tmp30 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc108) + %tmp30_13 = tt.splat %tmp30 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc162) + %_tmp46 = scf.for %_tmp46_15 = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%arg12 = %cst_4) -> (tensor<1x1024xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp46_15 : i32 -> tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32, #blocked> loc(#loc111) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc112) + %r0_4 = arith.divsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc113) + %r0_3 = arith.remsi %r0_index_16, %cst : tensor<1x1024xi32, #blocked> loc(#loc114) + %tmp0_17 = arith.extsi %r0_4 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_8 : tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x1024xi64, #blocked> loc(#loc99) + %tmp3_20 = arith.extsi %r0_3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp5_22 = arith.cmpi slt, %tmp3_21, %tmp5 : tensor<1x1024xi64, #blocked> loc(#loc102) + %tmp6 = arith.andi %tmp2_19, %tmp5_22 : tensor<1x1024xi1, #blocked> loc(#loc115) + %tmp9 = arith.cmpi sge, %tmp0_18, %tmp3_21 : tensor<1x1024xi64, #blocked> loc(#loc116) + %tmp10_23 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1, #blocked> loc(#loc117) + %tmp10_24 = arith.andi %tmp10_23, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc104) + %tmp10_25 = tt.load %tmp10_11, %tmp10_24, %cst_4 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr, #blocked> loc(#loc105) + %tmp11 = arith.cmpi slt, %tmp3_21, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_18, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1, #blocked> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1, #blocked> loc(#loc121) + %tmp18 = arith.cmpi sge, %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc122) + %tmp19 = arith.remsi %tmp3_21, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc123) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc124) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc125) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc126) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1, #blocked> loc(#loc127) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc128) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc129) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_25 : tensor<1x1024xi64, #blocked> loc(#loc130) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1, #blocked> loc(#loc131) + %tmp30_26 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32, #blocked> loc(#loc132) + %tmp30_27 = arith.extsi %tmp30_26 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_28 = arith.addi %tmp30_27, %tmp30_13 : tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_9 : tensor<1x1024xi64, #blocked> loc(#loc133) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc134) + %tmp32 = arith.cmpi ne, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc135) + %tmp33 = arith.cmpi slt, %tmp31, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc136) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_12 : tensor<1x1024xi1, #blocked> loc(#loc137) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1, #blocked> loc(#loc138) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64, #blocked> loc(#loc139) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc140) + %tmp39 = arith.cmpi eq, %tmp37, %cst_4 : tensor<1x1024xi64, #blocked> loc(#loc141) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1, #blocked> loc(#loc142) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1, #blocked> loc(#loc143) + %tmp43 = arith.select %tmp6, %tmp41, %cst_3 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi1, #blocked> loc(#loc144) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc145) + %tmp47 = arith.addi %arg12, %tmp44 : tensor<1x1024xi64, #blocked> loc(#loc146) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_10 : tensor<1x1024xi1, #blocked> loc(#loc147) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %arg12 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc148) + scf.yield %_tmp46_31 : tensor<1x1024xi64, #blocked> loc(#loc61) + } loc(#loc110) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_15: i64 loc(callsite(#loc1 at #loc149)), %tmp46_16: i64 loc(callsite(#loc1 at #loc149))): + %tmp46_17 = arith.addi %tmp46_15, %tmp46_16 : i64 loc(#loc167) + tt.reduce.return %tmp46_17 : i64 loc(#loc163) + }) : (tensor<1x1024xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc163) + %tmp46_14 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc150) + %tmp49 = arith.cmpi sgt, %tmp46_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc151) + %tmp51 = arith.cmpi slt, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc152) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1, #blocked> loc(#loc153) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc165) + %tmp55 = arith.cmpi eq, %tmp46_14, %cst_1 : tensor<1x1xi64, #blocked> loc(#loc156) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc166) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc74) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc75) + tt.store %1, %tmp54, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc75) + %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc76) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.store %4, %tmp57, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc77) + tt.return loc(#loc78) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xmask"(#loc3)) +#loc92 = loc("r0_base"(#loc4)) +#loc93 = loc("x1"(#loc5)) +#loc94 = loc("x1"(#loc6)) +#loc95 = loc("x0"(#loc7)) +#loc96 = loc("x2"(#loc8)) +#loc97 = loc("tmp0"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp2"(#loc11)) +#loc100 = loc("tmp3"(#loc12)) +#loc101 = loc("tmp3"(#loc13)) +#loc102 = loc("tmp5"(#loc14)) +#loc103 = loc("tmp10"(#loc15)) +#loc104 = loc("tmp10"(#loc16)) +#loc105 = loc("tmp10"(#loc17)) +#loc106 = loc("tmp17"(#loc18)) +#loc107 = loc("tmp23"(#loc19)) +#loc108 = loc("tmp30"(#loc20)) +#loc109 = loc("tmp30"(#loc21)) +#loc110 = loc("_tmp46"(#loc22)) +#loc111 = loc("r0_index"(#loc23)) +#loc112 = loc("r0_mask"(#loc24)) +#loc113 = loc("r0_4"(#loc25)) +#loc114 = loc("r0_3"(#loc26)) +#loc115 = loc("tmp6"(#loc27)) +#loc116 = loc("tmp9"(#loc28)) +#loc117 = loc("tmp10"(#loc29)) +#loc118 = loc("tmp11"(#loc30)) +#loc119 = loc("tmp12"(#loc31)) +#loc120 = loc("tmp13"(#loc32)) +#loc121 = loc("tmp14"(#loc33)) +#loc122 = loc("tmp18"(#loc34)) +#loc123 = loc("tmp19"(#loc35)) +#loc124 = loc("tmp21"(#loc36)) +#loc125 = loc("tmp22"(#loc37)) +#loc126 = loc("tmp24"(#loc38)) +#loc127 = loc("tmp25"(#loc39)) +#loc128 = loc("tmp26"(#loc40)) +#loc129 = loc("tmp27"(#loc41)) +#loc130 = loc("tmp28"(#loc42)) +#loc131 = loc("tmp29"(#loc43)) +#loc132 = loc("tmp30"(#loc44)) +#loc133 = loc("tmp30"(#loc45)) +#loc134 = loc("tmp31"(#loc46)) +#loc135 = loc("tmp32"(#loc47)) +#loc136 = loc("tmp33"(#loc48)) +#loc137 = loc("tmp34"(#loc49)) +#loc138 = loc("tmp35"(#loc50)) +#loc139 = loc("tmp36"(#loc51)) +#loc140 = loc("tmp37"(#loc52)) +#loc141 = loc("tmp39"(#loc53)) +#loc142 = loc("tmp40"(#loc54)) +#loc143 = loc("tmp41"(#loc55)) +#loc144 = loc("tmp43"(#loc56)) +#loc145 = loc("tmp44"(#loc57)) +#loc146 = loc("tmp47"(#loc58)) +#loc147 = loc("_tmp46"(#loc59)) +#loc148 = loc("_tmp46"(#loc60)) +#loc150 = loc("tmp46"(#loc65)) +#loc151 = loc("tmp49"(#loc66)) +#loc152 = loc("tmp51"(#loc67)) +#loc153 = loc("tmp52"(#loc68)) +#loc154 = loc("tmp54"(#loc69)) +#loc155 = loc("tmp53"(#loc70)) +#loc156 = loc("tmp55"(#loc71)) +#loc157 = loc("tmp57"(#loc72)) +#loc158 = loc("tmp56"(#loc73)) +#loc159 = loc(fused[#loc98, #loc97]) +#loc160 = loc(fused[#loc101, #loc100]) +#loc161 = loc(fused[#loc104, #loc91]) +#loc162 = loc(fused[#loc109, #loc108]) +#loc163 = loc(callsite(#loc62 at #loc149)) +#loc165 = loc(fused[#loc154, #loc155]) +#loc166 = loc(fused[#loc157, #loc158]) +#loc167 = loc(callsite(#loc64 at #loc163)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5664136587095c90eecff73a5331415cb8b56296 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DE6XSSYLS7BWGGS4UO3WTFWZCN6OVYXIHMGZ5KR7P3YWZXLVATDQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":18:0) +#loc1 = loc(unknown) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:27) +#loc81 = loc("in_ptr0"(#loc)) +#loc82 = loc("out_ptr1"(#loc)) +#loc83 = loc("out_ptr2"(#loc)) +#loc84 = loc("ks0"(#loc)) +#loc85 = loc("ks1"(#loc)) +#loc86 = loc("ks2"(#loc)) +#loc87 = loc("ks3"(#loc)) +#loc88 = loc("ks4"(#loc)) +#loc89 = loc("ks5"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc153 = loc("tmp46"(#loc65)) +#loc168 = loc(callsite(#loc1 at #loc153)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %ks5: i64 loc("ks5"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %tmp50 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc92) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense : tensor<1x1024xi1> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc93) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc94) + %xmask_4 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc94) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc95) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc96) + %x1 = arith.extsi %xoffset : i32 to i64 loc(#loc97) + %x1_6 = arith.divsi %x1, %ks0 : i64 loc(#loc97) + %x1_7 = arith.remsi %x1_6, %ks1 : i64 loc(#loc98) + %x0 = arith.remsi %x1, %ks0 : i64 loc(#loc99) + %x2 = arith.divsi %x1, %ks4 : i64 loc(#loc100) + %_tmp46 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%_tmp46_9 = %cst_3) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc102) + %r0_index_10 = arith.addi %r0_index, %r0_base_5 : tensor<1x1024xi32> loc(#loc102) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst_2 : tensor<1x1024xi32> loc(#loc103) + %r0_4 = arith.divsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc104) + %r0_3 = arith.remsi %r0_index_10, %cst_1 : tensor<1x1024xi32> loc(#loc105) + %tmp0 = arith.muli %x1_7, %c128_i64 : i64 loc(#loc106) + %tmp0_11 = arith.extsi %r0_4 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc107) + %tmp0_12 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64> loc(#loc163) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x1024xi64> loc(#loc107) + %tmp2 = tt.splat %ks2 : i64 -> tensor<1x1024xi64> loc(#loc108) + %tmp2_14 = arith.cmpi slt, %tmp0_13, %tmp2 : tensor<1x1024xi64> loc(#loc108) + %tmp3 = arith.muli %x0, %c128_i64 : i64 loc(#loc109) + %tmp3_15 = arith.extsi %r0_3 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc110) + %tmp3_16 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64> loc(#loc164) + %tmp3_17 = arith.addi %tmp3_15, %tmp3_16 : tensor<1x1024xi64> loc(#loc110) + %tmp5 = tt.splat %ks3 : i64 -> tensor<1x1024xi64> loc(#loc111) + %tmp5_18 = arith.cmpi slt, %tmp3_17, %tmp5 : tensor<1x1024xi64> loc(#loc111) + %tmp6 = arith.andi %tmp2_14, %tmp5_18 : tensor<1x1024xi1> loc(#loc112) + %tmp9 = arith.cmpi sge, %tmp0_13, %tmp3_17 : tensor<1x1024xi64> loc(#loc113) + %tmp10 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc114) + %tmp10_19 = tt.splat %tmp10 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc114) + %tmp10_20 = arith.andi %r0_mask, %tmp6 : tensor<1x1024xi1> loc(#loc115) + %tmp10_21 = tt.splat %xmask : i1 -> tensor<1x1024xi1> loc(#loc165) + %tmp10_22 = arith.andi %tmp10_20, %tmp10_21 : tensor<1x1024xi1> loc(#loc116) + %tmp10_23 = tt.load %tmp10_19, %tmp10_22, %cst_3 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc117) + %tmp11 = arith.cmpi slt, %tmp3_17, %tmp10_23 : tensor<1x1024xi64> loc(#loc118) + %tmp12 = arith.cmpi slt, %tmp0_13, %tmp10_23 : tensor<1x1024xi64> loc(#loc119) + %tmp13 = arith.andi %tmp11, %tmp12 : tensor<1x1024xi1> loc(#loc120) + %tmp14 = arith.andi %tmp9, %tmp13 : tensor<1x1024xi1> loc(#loc121) + %tmp17 = tt.splat %ks5 : i64 -> tensor<1x1024xi64> loc(#loc122) + %tmp18 = arith.cmpi sge, %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc123) + %tmp19 = arith.remsi %tmp3_17, %tmp17 : tensor<1x1024xi64> loc(#loc124) + %tmp21 = arith.cmpi ne, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc125) + %tmp22 = arith.cmpi slt, %tmp19, %cst_3 : tensor<1x1024xi64> loc(#loc126) + %tmp23 = arith.cmpi slt, %ks5, %c0_i64 : i64 loc(#loc127) + %tmp23_24 = tt.splat %tmp23 : i1 -> tensor<1x1024xi1> loc(#loc127) + %tmp24 = arith.cmpi ne, %tmp22, %tmp23_24 : tensor<1x1024xi1> loc(#loc128) + %tmp25 = arith.andi %tmp21, %tmp24 : tensor<1x1024xi1> loc(#loc129) + %tmp26 = arith.addi %tmp19, %tmp17 : tensor<1x1024xi64> loc(#loc130) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc131) + %tmp28 = arith.cmpi slt, %tmp27, %tmp10_23 : tensor<1x1024xi64> loc(#loc132) + %tmp29 = arith.andi %tmp18, %tmp28 : tensor<1x1024xi1> loc(#loc133) + %tmp30 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32> loc(#loc134) + %tmp30_25 = arith.muli %x1_7, %c-128_i64 : i64 loc(#loc135) + %tmp30_26 = arith.extsi %tmp30 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc136) + %tmp30_27 = tt.splat %tmp30_25 : i64 -> tensor<1x1024xi64> loc(#loc166) + %tmp30_28 = arith.addi %tmp30_26, %tmp30_27 : tensor<1x1024xi64> loc(#loc136) + %tmp30_29 = arith.addi %tmp30_28, %tmp3_16 : tensor<1x1024xi64> loc(#loc137) + %tmp31 = arith.remsi %tmp30_29, %tmp17 : tensor<1x1024xi64> loc(#loc138) + %tmp32 = arith.cmpi ne, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc139) + %tmp33 = arith.cmpi slt, %tmp31, %cst_3 : tensor<1x1024xi64> loc(#loc140) + %tmp34 = arith.cmpi ne, %tmp33, %tmp23_24 : tensor<1x1024xi1> loc(#loc141) + %tmp35 = arith.andi %tmp32, %tmp34 : tensor<1x1024xi1> loc(#loc142) + %tmp36 = arith.addi %tmp31, %tmp17 : tensor<1x1024xi64> loc(#loc143) + %tmp37 = arith.select %tmp35, %tmp36, %tmp31 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc144) + %tmp39 = arith.cmpi eq, %tmp37, %cst_3 : tensor<1x1024xi64> loc(#loc145) + %tmp40 = arith.andi %tmp29, %tmp39 : tensor<1x1024xi1> loc(#loc146) + %tmp41 = arith.ori %tmp14, %tmp40 : tensor<1x1024xi1> loc(#loc147) + %tmp43 = arith.select %tmp6, %tmp41, %cst_0 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc148) + %tmp44 = arith.extui %tmp43 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc149) + %tmp47 = arith.addi %_tmp46_9, %tmp44 : tensor<1x1024xi64> loc(#loc150) + %_tmp46_30 = arith.andi %r0_mask, %tmp10_21 : tensor<1x1024xi1> loc(#loc151) + %_tmp46_31 = arith.select %_tmp46_30, %tmp47, %_tmp46_9 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc152) + scf.yield %_tmp46_31 : tensor<1x1024xi64> loc(#loc63) + } loc(#loc101) + %tmp46 = "tt.reduce"(%_tmp46) <{axis = 1 : i32}> ({ + ^bb0(%tmp46_9: i64 loc(callsite(#loc1 at #loc153)), %tmp46_10: i64 loc(callsite(#loc1 at #loc153))): + %tmp46_11 = arith.addi %tmp46_9, %tmp46_10 : i64 loc(#loc171) + tt.reduce.return %tmp46_11 : i64 loc(#loc167) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc167) + %tmp46_8 = tt.expand_dims %tmp46 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc154) + %tmp49 = arith.cmpi sgt, %tmp46_8, %cst : tensor<1x1xi64> loc(#loc155) + %tmp51 = arith.cmpi slt, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc156) + %tmp52 = arith.andi %tmp49, %tmp51 : tensor<1x1xi1> loc(#loc157) + %tmp54 = arith.extui %tmp52 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc169) + %tmp55 = arith.cmpi eq, %tmp46_8, %tmp50 : tensor<1x1xi64> loc(#loc160) + %tmp57 = arith.extui %tmp55 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc170) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc76) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc76) + tt.store %1, %tmp54, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc77) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc78) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc78) + tt.store %3, %tmp57, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc79) + tt.return loc(#loc80) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":32:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":90:35) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":24:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":28:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":44:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":45:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":48:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:94) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":49:77) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":50:23) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":51:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":52:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":53:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":56:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":57:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":58:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":60:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":61:92) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":62:92) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":63:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":67:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":68:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:38) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":69:51) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":70:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":71:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":72:92) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":73:25) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":74:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":75:24) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":76:39) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":78:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":79:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":80:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":82:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":83:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":85:25) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:50) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":86:8) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":87:30) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":89:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":91:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":92:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":94:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":93:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":95:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":97:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":96:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":98:37) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:25) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:37) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/oq/coq4vpmjynhsbr3eh73i7stji3nesrs4kq7nxoqo55gux322a7qi.py":99:4) +#loc92 = loc("tmp50"(#loc3)) +#loc93 = loc("xoffset"(#loc4)) +#loc94 = loc("xmask"(#loc5)) +#loc95 = loc("r0_base"(#loc6)) +#loc96 = loc("r0_base"(#loc7)) +#loc97 = loc("x1"(#loc8)) +#loc98 = loc("x1"(#loc9)) +#loc99 = loc("x0"(#loc10)) +#loc100 = loc("x2"(#loc11)) +#loc101 = loc("_tmp46"(#loc2)) +#loc102 = loc("r0_index"(#loc12)) +#loc103 = loc("r0_mask"(#loc13)) +#loc104 = loc("r0_4"(#loc14)) +#loc105 = loc("r0_3"(#loc15)) +#loc106 = loc("tmp0"(#loc16)) +#loc107 = loc("tmp0"(#loc17)) +#loc108 = loc("tmp2"(#loc18)) +#loc109 = loc("tmp3"(#loc19)) +#loc110 = loc("tmp3"(#loc20)) +#loc111 = loc("tmp5"(#loc21)) +#loc112 = loc("tmp6"(#loc22)) +#loc113 = loc("tmp9"(#loc23)) +#loc114 = loc("tmp10"(#loc24)) +#loc115 = loc("tmp10"(#loc25)) +#loc116 = loc("tmp10"(#loc26)) +#loc117 = loc("tmp10"(#loc27)) +#loc118 = loc("tmp11"(#loc28)) +#loc119 = loc("tmp12"(#loc29)) +#loc120 = loc("tmp13"(#loc30)) +#loc121 = loc("tmp14"(#loc31)) +#loc122 = loc("tmp17"(#loc32)) +#loc123 = loc("tmp18"(#loc33)) +#loc124 = loc("tmp19"(#loc34)) +#loc125 = loc("tmp21"(#loc35)) +#loc126 = loc("tmp22"(#loc36)) +#loc127 = loc("tmp23"(#loc37)) +#loc128 = loc("tmp24"(#loc38)) +#loc129 = loc("tmp25"(#loc39)) +#loc130 = loc("tmp26"(#loc40)) +#loc131 = loc("tmp27"(#loc41)) +#loc132 = loc("tmp28"(#loc42)) +#loc133 = loc("tmp29"(#loc43)) +#loc134 = loc("tmp30"(#loc44)) +#loc135 = loc("tmp30"(#loc45)) +#loc136 = loc("tmp30"(#loc46)) +#loc137 = loc("tmp30"(#loc47)) +#loc138 = loc("tmp31"(#loc48)) +#loc139 = loc("tmp32"(#loc49)) +#loc140 = loc("tmp33"(#loc50)) +#loc141 = loc("tmp34"(#loc51)) +#loc142 = loc("tmp35"(#loc52)) +#loc143 = loc("tmp36"(#loc53)) +#loc144 = loc("tmp37"(#loc54)) +#loc145 = loc("tmp39"(#loc55)) +#loc146 = loc("tmp40"(#loc56)) +#loc147 = loc("tmp41"(#loc57)) +#loc148 = loc("tmp43"(#loc58)) +#loc149 = loc("tmp44"(#loc59)) +#loc150 = loc("tmp47"(#loc60)) +#loc151 = loc("_tmp46"(#loc61)) +#loc152 = loc("_tmp46"(#loc62)) +#loc154 = loc("tmp46"(#loc67)) +#loc155 = loc("tmp49"(#loc68)) +#loc156 = loc("tmp51"(#loc69)) +#loc157 = loc("tmp52"(#loc70)) +#loc158 = loc("tmp54"(#loc71)) +#loc159 = loc("tmp53"(#loc72)) +#loc160 = loc("tmp55"(#loc73)) +#loc161 = loc("tmp57"(#loc74)) +#loc162 = loc("tmp56"(#loc75)) +#loc163 = loc(fused[#loc107, #loc106]) +#loc164 = loc(fused[#loc110, #loc109]) +#loc165 = loc(fused[#loc116, #loc94]) +#loc166 = loc(fused[#loc136, #loc135]) +#loc167 = loc(callsite(#loc64 at #loc153)) +#loc169 = loc(fused[#loc158, #loc159]) +#loc170 = loc(fused[#loc161, #loc162]) +#loc171 = loc(callsite(#loc66 at #loc167)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e36c559b176ff59759fe891f578226927d429548 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..e2fd3151d776cb6bc1444eae3b71aa22f9332818 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "194cc6b63e58aa469567f2b6707c154639a26894fa3233f9b1a0193fd80ea4f3", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 8192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..ceee780817d742fc8f38a69669f3303f2b8208f1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,2212 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 7, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 127, !dbg !9 + %13 = lshr i32 %11, 2, !dbg !9 + %14 = and i32 %13, 63, !dbg !9 + %15 = or disjoint i32 %10, %12, !dbg !10 + %16 = or disjoint i32 %14, %10, !dbg !10 + %17 = or disjoint i32 %16, 64, !dbg !10 + %18 = icmp slt i32 %15, %4, !dbg !11 + %19 = icmp slt i32 %16, %4, !dbg !11 + %20 = icmp slt i32 %17, %4, !dbg !11 + %21 = lshr i32 %11, 7, !dbg !12 + %22 = shl nuw nsw i32 %11, 2, !dbg !12 + %23 = and i32 %22, 12, !dbg !12 + %24 = sext i32 %15 to i64, !dbg !13 + %25 = sext i32 %16 to i64, !dbg !13 + %26 = sext i32 %17 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %27 = sdiv i64 %24, %.frozen, !dbg !14 + %28 = mul i64 %27, %.frozen, !dbg !13 + %.decomposed = sub i64 %24, %28, !dbg !13 + %.frozen89 = freeze i64 %3, !dbg !14 + %29 = sdiv i64 %25, %.frozen89, !dbg !14 + %30 = mul i64 %29, %.frozen89, !dbg !13 + %.decomposed90 = sub i64 %25, %30, !dbg !13 + %.frozen91 = freeze i64 %3, !dbg !14 + %31 = sdiv i64 %26, %.frozen91, !dbg !14 + %32 = mul i64 %31, %.frozen91, !dbg !13 + %.decomposed92 = sub i64 %26, %32, !dbg !13 + %33 = zext nneg i32 %23 to i64, !dbg !15 + %34 = shl nsw i64 %27, 4, !dbg !16 + %35 = shl i64 %3, 4, !dbg !17 + %36 = mul i64 %27, %35, !dbg !18 + %37 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !19 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %39 = shl nuw nsw i32 %12, 4, !dbg !21 + %40 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %39, !dbg !21 + %41 = getelementptr i8, ptr addrspace(3) %40, i32 8, !dbg !21 + %42 = getelementptr i8, ptr addrspace(3) %40, i32 16, !dbg !21 + %43 = getelementptr i8, ptr addrspace(3) %40, i32 24, !dbg !21 + %44 = getelementptr i8, ptr addrspace(3) %40, i32 32, !dbg !21 + %45 = getelementptr i8, ptr addrspace(3) %40, i32 40, !dbg !21 + %46 = getelementptr i8, ptr addrspace(3) %40, i32 48, !dbg !21 + %47 = getelementptr i8, ptr addrspace(3) %40, i32 56, !dbg !21 + %48 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %11, !dbg !21 + %49 = and i32 %11, 1, !dbg !21 + %50 = icmp eq i32 %49, 0, !dbg !21 + %51 = getelementptr i8, ptr addrspace(3) %48, i32 1024, !dbg !21 + %52 = getelementptr i8, ptr addrspace(3) %48, i32 2048, !dbg !21 + %53 = getelementptr i8, ptr addrspace(3) %48, i32 3072, !dbg !21 + %54 = getelementptr i8, ptr addrspace(3) %48, i32 4096, !dbg !21 + %55 = getelementptr i8, ptr addrspace(3) %48, i32 5120, !dbg !21 + %56 = getelementptr i8, ptr addrspace(3) %48, i32 6144, !dbg !21 + %57 = getelementptr i8, ptr addrspace(3) %48, i32 7168, !dbg !21 + %.lobit = and i32 %21, 1, !dbg !12 + %58 = or disjoint i32 %.lobit, 4, !dbg !12 + %59 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !12 + %60 = insertelement <2 x i32> %59, i32 %.lobit, i64 1, !dbg !12 + %61 = or <2 x i32> %60, , !dbg !12 + %62 = zext nneg i32 %.lobit to i64, !dbg !15 + %63 = extractelement <2 x i32> %61, i64 1, !dbg !25 + %64 = zext nneg i32 %63 to i64, !dbg !15 + %65 = zext nneg i32 %58 to i64, !dbg !15 + %66 = extractelement <2 x i32> %61, i64 0, !dbg !25 + %67 = zext nneg i32 %66 to i64, !dbg !15 + %68 = mul nuw nsw i64 %3, %62, !dbg !28 + %69 = mul i64 %3, %64, !dbg !28 + %70 = mul i64 %3, %65, !dbg !28 + %71 = mul i64 %3, %67, !dbg !28 + %72 = getelementptr i32, ptr addrspace(1) %37, i64 %62, !dbg !19 + %73 = getelementptr i32, ptr addrspace(1) %72, i64 %34, !dbg !19 + %74 = getelementptr i32, ptr addrspace(1) %73, i64 %68, !dbg !19 + %75 = getelementptr i32, ptr addrspace(1) %74, i64 %36, !dbg !19 + %76 = getelementptr i32, ptr addrspace(1) %37, i64 %64, !dbg !19 + %77 = getelementptr i32, ptr addrspace(1) %76, i64 %34, !dbg !19 + %78 = getelementptr i32, ptr addrspace(1) %77, i64 %69, !dbg !19 + %79 = getelementptr i32, ptr addrspace(1) %78, i64 %36, !dbg !19 + %80 = getelementptr i32, ptr addrspace(1) %37, i64 %65, !dbg !19 + %81 = getelementptr i32, ptr addrspace(1) %80, i64 %34, !dbg !19 + %82 = getelementptr i32, ptr addrspace(1) %81, i64 %70, !dbg !19 + %83 = getelementptr i32, ptr addrspace(1) %82, i64 %36, !dbg !19 + %84 = getelementptr i32, ptr addrspace(1) %37, i64 %67, !dbg !19 + %85 = getelementptr i32, ptr addrspace(1) %84, i64 %34, !dbg !19 + %86 = getelementptr i32, ptr addrspace(1) %85, i64 %71, !dbg !19 + %87 = getelementptr i32, ptr addrspace(1) %86, i64 %36, !dbg !19 + %88 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %75, i64 %38, i1 %18) #5, !dbg !20 + %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %90 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %79, i64 %89, i1 %18) #5, !dbg !20 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %92 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %83, i64 %91, i1 %18) #5, !dbg !20 + %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %94 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %87, i64 %93, i1 %18) #5, !dbg !20 + %95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %96 = xor i32 %.lobit, 1, !dbg !29 + %97 = mul nuw nsw i32 %88, %96, !dbg !30 + %98 = mul nuw nsw i32 %90, %96, !dbg !30 + %99 = mul nuw nsw i32 %92, %96, !dbg !30 + %100 = mul nuw nsw i32 %94, %96, !dbg !30 + %101 = getelementptr i32, ptr addrspace(3) %40, i32 %.lobit, !dbg !21 + %102 = insertelement <1 x i32> poison, i32 %97, i64 0, !dbg !21 + %103 = getelementptr i32, ptr addrspace(3) %41, i32 %.lobit, !dbg !21 + %104 = insertelement <1 x i32> poison, i32 %98, i64 0, !dbg !21 + %105 = getelementptr i32, ptr addrspace(3) %42, i32 %.lobit, !dbg !21 + %106 = insertelement <1 x i32> poison, i32 %99, i64 0, !dbg !21 + %107 = getelementptr i32, ptr addrspace(3) %43, i32 %.lobit, !dbg !21 + %108 = insertelement <1 x i32> poison, i32 %100, i64 0, !dbg !21 + %109 = getelementptr i32, ptr addrspace(3) %44, i32 %.lobit, !dbg !21 + %110 = getelementptr i32, ptr addrspace(3) %45, i32 %.lobit, !dbg !21 + %111 = getelementptr i32, ptr addrspace(3) %46, i32 %.lobit, !dbg !21 + %112 = getelementptr i32, ptr addrspace(3) %47, i32 %.lobit, !dbg !21 + %113 = mul nuw nsw i32 %88, %.lobit, !dbg !31 + %114 = mul nuw nsw i32 %90, %.lobit, !dbg !31 + %115 = mul nuw nsw i32 %92, %.lobit, !dbg !31 + %116 = mul nuw nsw i32 %94, %.lobit, !dbg !31 + %117 = insertelement <1 x i32> poison, i32 %113, i64 0, !dbg !21 + %118 = insertelement <1 x i32> poison, i32 %114, i64 0, !dbg !21 + %119 = insertelement <1 x i32> poison, i32 %115, i64 0, !dbg !21 + %120 = insertelement <1 x i32> poison, i32 %116, i64 0, !dbg !21 + %121 = mul nuw nsw i32 %96, %.lobit, !dbg !32 + %122 = mul nuw nsw i32 %63, %96, !dbg !32 + %123 = mul nuw nsw i32 %58, %96, !dbg !32 + %124 = mul nuw nsw i32 %96, %66, !dbg !32 + %125 = insertelement <1 x i32> poison, i32 %121, i64 0, !dbg !21 + %126 = insertelement <1 x i32> poison, i32 %122, i64 0, !dbg !21 + %127 = insertelement <1 x i32> poison, i32 %123, i64 0, !dbg !21 + %128 = insertelement <1 x i32> poison, i32 %124, i64 0, !dbg !21 + %129 = mul nuw nsw i32 %63, %.lobit, !dbg !25 + %130 = mul nuw nsw i32 %58, %.lobit, !dbg !25 + %131 = mul nuw nsw i32 %66, %.lobit, !dbg !25 + %132 = insertelement <1 x i32> poison, i32 %.lobit, i64 0, !dbg !21 + %133 = insertelement <1 x i32> poison, i32 %129, i64 0, !dbg !21 + %134 = insertelement <1 x i32> poison, i32 %130, i64 0, !dbg !21 + %135 = insertelement <1 x i32> poison, i32 %131, i64 0, !dbg !21 + %136 = insertelement <2 x i32> poison, i32 %92, i64 0, !dbg !33 + %137 = insertelement <2 x i32> %136, i32 %88, i64 1, !dbg !33 + %138 = insertelement <2 x i32> poison, i32 %94, i64 0, !dbg !33 + %139 = insertelement <2 x i32> %138, i32 %90, i64 1, !dbg !33 + %140 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !34 + %141 = insertelement <2 x i32> %140, i32 %.lobit, i64 1, !dbg !34 + %142 = or <2 x i32> %60, , !dbg !12 + %143 = extractelement <2 x i32> %142, i64 0, !dbg !25 + %144 = zext nneg i32 %143 to i64, !dbg !15 + %145 = mul i64 %3, %144, !dbg !28 + %146 = getelementptr i32, ptr addrspace(1) %37, i64 %144, !dbg !19 + %147 = getelementptr i32, ptr addrspace(1) %146, i64 %34, !dbg !19 + %148 = getelementptr i32, ptr addrspace(1) %147, i64 %145, !dbg !19 + %149 = getelementptr i32, ptr addrspace(1) %148, i64 %36, !dbg !19 + %150 = insertelement <2 x i32> poison, i32 %.lobit, i64 0, !dbg !12 + %151 = shufflevector <2 x i32> %150, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !12 + %152 = or disjoint <2 x i32> %151, , !dbg !12 + %153 = extractelement <2 x i32> %152, i64 1, !dbg !25 + %154 = zext nneg i32 %153 to i64, !dbg !15 + %155 = extractelement <2 x i32> %142, i64 1, !dbg !25 + %156 = zext nneg i32 %155 to i64, !dbg !15 + %157 = extractelement <2 x i32> %152, i64 0, !dbg !25 + %158 = zext nneg i32 %157 to i64, !dbg !15 + %159 = mul i64 %3, %154, !dbg !28 + %160 = mul i64 %3, %156, !dbg !28 + %161 = mul i64 %3, %158, !dbg !28 + %162 = getelementptr i32, ptr addrspace(1) %37, i64 %154, !dbg !19 + %163 = getelementptr i32, ptr addrspace(1) %162, i64 %34, !dbg !19 + %164 = getelementptr i32, ptr addrspace(1) %163, i64 %159, !dbg !19 + %165 = getelementptr i32, ptr addrspace(1) %164, i64 %36, !dbg !19 + %166 = getelementptr i32, ptr addrspace(1) %37, i64 %156, !dbg !19 + %167 = getelementptr i32, ptr addrspace(1) %166, i64 %34, !dbg !19 + %168 = getelementptr i32, ptr addrspace(1) %167, i64 %160, !dbg !19 + %169 = getelementptr i32, ptr addrspace(1) %168, i64 %36, !dbg !19 + %170 = getelementptr i32, ptr addrspace(1) %37, i64 %158, !dbg !19 + %171 = getelementptr i32, ptr addrspace(1) %170, i64 %34, !dbg !19 + %172 = getelementptr i32, ptr addrspace(1) %171, i64 %161, !dbg !19 + %173 = getelementptr i32, ptr addrspace(1) %172, i64 %36, !dbg !19 + %174 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %165, i64 %95, i1 %18) #5, !dbg !20 + %175 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %169, i64 %175, i1 %18) #5, !dbg !20 + %177 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %178 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %173, i64 %177, i1 %18) #5, !dbg !20 + %179 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %180 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %149, i64 %179, i1 %18) #5, !dbg !20 + %181 = mul nuw nsw i32 %174, %96, !dbg !30 + %182 = mul nuw nsw i32 %176, %96, !dbg !30 + %183 = mul nuw nsw i32 %178, %96, !dbg !30 + %184 = mul nuw nsw i32 %180, %96, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %102, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %104, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %106, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %108, i1 true) #5, !dbg !21 + %185 = insertelement <1 x i32> poison, i32 %181, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %185, i1 true) #5, !dbg !21 + %186 = insertelement <1 x i32> poison, i32 %182, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %186, i1 true) #5, !dbg !21 + %187 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %187, i1 true) #5, !dbg !21 + %188 = insertelement <1 x i32> poison, i32 %184, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %188, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %189 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !21 + %191 = add i32 %190, %189, !dbg !35 + %192 = insertelement <1 x i32> poison, i32 %191, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %192, i1 %50) #5, !dbg !21 + %193 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 1, i32 31), !dbg !21 + %195 = add i32 %194, %193, !dbg !35 + %196 = insertelement <1 x i32> poison, i32 %195, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %196, i1 %50) #5, !dbg !21 + %197 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 1, i32 31), !dbg !21 + %199 = add i32 %198, %197, !dbg !35 + %200 = insertelement <1 x i32> poison, i32 %199, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %200, i1 %50) #5, !dbg !21 + %201 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 1, i32 31), !dbg !21 + %203 = add i32 %202, %201, !dbg !35 + %204 = insertelement <1 x i32> poison, i32 %203, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %204, i1 %50) #5, !dbg !21 + %205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !21 + %207 = add i32 %206, %205, !dbg !35 + %208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %208, i1 %50) #5, !dbg !21 + %209 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !21 + %211 = add i32 %210, %209, !dbg !35 + %212 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %212, i1 %50) #5, !dbg !21 + %213 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !21 + %215 = add i32 %214, %213, !dbg !35 + %216 = insertelement <1 x i32> poison, i32 %215, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %216, i1 %50) #5, !dbg !21 + %217 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !21 + %219 = add i32 %218, %217, !dbg !35 + %220 = insertelement <1 x i32> poison, i32 %219, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %220, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %221 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %222 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %223 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %224 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %225 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %226 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %227 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %228 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %229 = mul nuw nsw i32 %174, %.lobit, !dbg !31 + %230 = mul nuw nsw i32 %176, %.lobit, !dbg !31 + %231 = mul nuw nsw i32 %178, %.lobit, !dbg !31 + %232 = mul nuw nsw i32 %180, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %117, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %118, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %119, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %120, i1 true) #5, !dbg !21 + %233 = insertelement <1 x i32> poison, i32 %229, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %233, i1 true) #5, !dbg !21 + %234 = insertelement <1 x i32> poison, i32 %230, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %234, i1 true) #5, !dbg !21 + %235 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %235, i1 true) #5, !dbg !21 + %236 = insertelement <1 x i32> poison, i32 %232, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %236, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %237 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %238 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %237, i32 1, i32 31), !dbg !21 + %239 = add i32 %238, %237, !dbg !35 + %240 = insertelement <1 x i32> poison, i32 %239, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %240, i1 %50) #5, !dbg !21 + %241 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %241, i32 1, i32 31), !dbg !21 + %243 = add i32 %242, %241, !dbg !35 + %244 = insertelement <1 x i32> poison, i32 %243, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %244, i1 %50) #5, !dbg !21 + %245 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 1, i32 31), !dbg !21 + %247 = add i32 %246, %245, !dbg !35 + %248 = insertelement <1 x i32> poison, i32 %247, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %248, i1 %50) #5, !dbg !21 + %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !21 + %251 = add i32 %250, %249, !dbg !35 + %252 = insertelement <1 x i32> poison, i32 %251, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %252, i1 %50) #5, !dbg !21 + %253 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 1, i32 31), !dbg !21 + %255 = add i32 %254, %253, !dbg !35 + %256 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %256, i1 %50) #5, !dbg !21 + %257 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !21 + %259 = add i32 %258, %257, !dbg !35 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %260, i1 %50) #5, !dbg !21 + %261 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 1, i32 31), !dbg !21 + %263 = add i32 %262, %261, !dbg !35 + %264 = insertelement <1 x i32> poison, i32 %263, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %264, i1 %50) #5, !dbg !21 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !21 + %267 = add i32 %266, %265, !dbg !35 + %268 = insertelement <1 x i32> poison, i32 %267, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %268, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %269 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %270 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %271 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %272 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %273 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %274 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %275 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %276 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %277 = mul nuw nsw i32 %153, %96, !dbg !32 + %278 = mul nuw nsw i32 %155, %96, !dbg !32 + %279 = mul nuw nsw i32 %157, %96, !dbg !32 + %280 = mul nuw nsw i32 %96, %143, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %125, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %126, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %127, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %128, i1 true) #5, !dbg !21 + %281 = insertelement <1 x i32> poison, i32 %277, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %281, i1 true) #5, !dbg !21 + %282 = insertelement <1 x i32> poison, i32 %278, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %282, i1 true) #5, !dbg !21 + %283 = insertelement <1 x i32> poison, i32 %279, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %283, i1 true) #5, !dbg !21 + %284 = insertelement <1 x i32> poison, i32 %280, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %284, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %285 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 1, i32 31), !dbg !21 + %287 = add i32 %286, %285, !dbg !35 + %288 = insertelement <1 x i32> poison, i32 %287, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %288, i1 %50) #5, !dbg !21 + %289 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 1, i32 31), !dbg !21 + %291 = add i32 %290, %289, !dbg !35 + %292 = insertelement <1 x i32> poison, i32 %291, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %292, i1 %50) #5, !dbg !21 + %293 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !21 + %295 = add i32 %294, %293, !dbg !35 + %296 = insertelement <1 x i32> poison, i32 %295, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %296, i1 %50) #5, !dbg !21 + %297 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 1, i32 31), !dbg !21 + %299 = add i32 %298, %297, !dbg !35 + %300 = insertelement <1 x i32> poison, i32 %299, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %300, i1 %50) #5, !dbg !21 + %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 1, i32 31), !dbg !21 + %303 = add i32 %302, %301, !dbg !35 + %304 = insertelement <1 x i32> poison, i32 %303, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %304, i1 %50) #5, !dbg !21 + %305 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %306 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %305, i32 1, i32 31), !dbg !21 + %307 = add i32 %306, %305, !dbg !35 + %308 = insertelement <1 x i32> poison, i32 %307, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %308, i1 %50) #5, !dbg !21 + %309 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 1, i32 31), !dbg !21 + %311 = add i32 %310, %309, !dbg !35 + %312 = insertelement <1 x i32> poison, i32 %311, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %312, i1 %50) #5, !dbg !21 + %313 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !21 + %315 = add i32 %314, %313, !dbg !35 + %316 = insertelement <1 x i32> poison, i32 %315, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %316, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %317 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %318 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %319 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %320 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %321 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %322 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %323 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %324 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %325 = mul nuw nsw i32 %153, %.lobit, !dbg !25 + %326 = mul nuw nsw i32 %155, %.lobit, !dbg !25 + %327 = mul nuw nsw i32 %157, %.lobit, !dbg !25 + %328 = mul nuw nsw i32 %143, %.lobit, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %132, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %133, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %134, i1 true) #5, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %135, i1 true) #5, !dbg !21 + %329 = insertelement <1 x i32> poison, i32 %325, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %329, i1 true) #5, !dbg !21 + %330 = insertelement <1 x i32> poison, i32 %326, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %330, i1 true) #5, !dbg !21 + %331 = insertelement <1 x i32> poison, i32 %327, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %331, i1 true) #5, !dbg !21 + %332 = insertelement <1 x i32> poison, i32 %328, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %332, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %333 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 1, i32 31), !dbg !21 + %335 = add i32 %334, %333, !dbg !35 + %336 = insertelement <1 x i32> poison, i32 %335, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %336, i1 %50) #5, !dbg !21 + %337 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 1, i32 31), !dbg !21 + %339 = add i32 %338, %337, !dbg !35 + %340 = insertelement <1 x i32> poison, i32 %339, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %340, i1 %50) #5, !dbg !21 + %341 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %341, i32 1, i32 31), !dbg !21 + %343 = add i32 %342, %341, !dbg !35 + %344 = insertelement <1 x i32> poison, i32 %343, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %344, i1 %50) #5, !dbg !21 + %345 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 1, i32 31), !dbg !21 + %347 = add i32 %346, %345, !dbg !35 + %348 = insertelement <1 x i32> poison, i32 %347, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %348, i1 %50) #5, !dbg !21 + %349 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 1, i32 31), !dbg !21 + %351 = add i32 %350, %349, !dbg !35 + %352 = insertelement <1 x i32> poison, i32 %351, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %352, i1 %50) #5, !dbg !21 + %353 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %354 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %353, i32 1, i32 31), !dbg !21 + %355 = add i32 %354, %353, !dbg !35 + %356 = insertelement <1 x i32> poison, i32 %355, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %356, i1 %50) #5, !dbg !21 + %357 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 1, i32 31), !dbg !21 + %359 = add i32 %358, %357, !dbg !35 + %360 = insertelement <1 x i32> poison, i32 %359, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %360, i1 %50) #5, !dbg !21 + %361 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !21 + %363 = add i32 %362, %361, !dbg !35 + %364 = insertelement <1 x i32> poison, i32 %363, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %364, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %365 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %366 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %367 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %368 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %369 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %370 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %371 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %372 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %373 = insertelement <2 x i32> poison, i32 %223, i64 0, !dbg !36 + %374 = insertelement <2 x i32> %373, i32 %221, i64 1, !dbg !36 + %375 = insertelement <2 x i32> poison, i32 %271, i64 0, !dbg !36 + %376 = insertelement <2 x i32> %375, i32 %269, i64 1, !dbg !36 + %377 = icmp slt <2 x i32> %374, %376, !dbg !36 + %378 = insertelement <2 x i32> poison, i32 %224, i64 0, !dbg !36 + %379 = insertelement <2 x i32> %378, i32 %222, i64 1, !dbg !36 + %380 = insertelement <2 x i32> poison, i32 %272, i64 0, !dbg !36 + %381 = insertelement <2 x i32> %380, i32 %270, i64 1, !dbg !36 + %382 = icmp sge <2 x i32> %379, %381, !dbg !36 + %383 = insertelement <2 x i32> poison, i32 %227, i64 0, !dbg !36 + %384 = insertelement <2 x i32> %383, i32 %225, i64 1, !dbg !36 + %385 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !36 + %386 = insertelement <2 x i32> %385, i32 %273, i64 1, !dbg !36 + %387 = icmp slt <2 x i32> %384, %386, !dbg !36 + %388 = insertelement <2 x i32> poison, i32 %228, i64 0, !dbg !36 + %389 = insertelement <2 x i32> %388, i32 %226, i64 1, !dbg !36 + %390 = insertelement <2 x i32> poison, i32 %276, i64 0, !dbg !36 + %391 = insertelement <2 x i32> %390, i32 %274, i64 1, !dbg !36 + %392 = icmp sge <2 x i32> %389, %391, !dbg !36 + %393 = icmp eq <2 x i32> %374, %376, !dbg !37 + %394 = icmp ne <2 x i32> %379, %381, !dbg !37 + %395 = icmp eq <2 x i32> %384, %386, !dbg !37 + %396 = icmp ne <2 x i32> %389, %391, !dbg !37 + %397 = insertelement <2 x i32> poison, i32 %319, i64 0, !dbg !38 + %398 = insertelement <2 x i32> %397, i32 %317, i64 1, !dbg !38 + %399 = insertelement <2 x i32> poison, i32 %367, i64 0, !dbg !38 + %400 = insertelement <2 x i32> %399, i32 %365, i64 1, !dbg !38 + %401 = icmp sgt <2 x i32> %398, %400, !dbg !38 + %402 = insertelement <2 x i32> poison, i32 %320, i64 0, !dbg !38 + %403 = insertelement <2 x i32> %402, i32 %318, i64 1, !dbg !38 + %404 = insertelement <2 x i32> poison, i32 %368, i64 0, !dbg !38 + %405 = insertelement <2 x i32> %404, i32 %366, i64 1, !dbg !38 + %406 = icmp sle <2 x i32> %403, %405, !dbg !38 + %407 = insertelement <2 x i32> poison, i32 %323, i64 0, !dbg !38 + %408 = insertelement <2 x i32> %407, i32 %321, i64 1, !dbg !38 + %409 = insertelement <2 x i32> poison, i32 %371, i64 0, !dbg !38 + %410 = insertelement <2 x i32> %409, i32 %369, i64 1, !dbg !38 + %411 = icmp sgt <2 x i32> %408, %410, !dbg !38 + %412 = insertelement <2 x i32> poison, i32 %324, i64 0, !dbg !38 + %413 = insertelement <2 x i32> %412, i32 %322, i64 1, !dbg !38 + %414 = insertelement <2 x i32> poison, i32 %372, i64 0, !dbg !38 + %415 = insertelement <2 x i32> %414, i32 %370, i64 1, !dbg !38 + %416 = icmp sle <2 x i32> %413, %415, !dbg !38 + %417 = and <2 x i1> %393, %401, !dbg !39 + %418 = or <2 x i1> %394, %406, !dbg !39 + %419 = and <2 x i1> %395, %411, !dbg !39 + %420 = or <2 x i1> %396, %416, !dbg !40 + %421 = or <2 x i1> %377, %417, !dbg !40 + %422 = and <2 x i1> %382, %418, !dbg !40 + %423 = or <2 x i1> %387, %419, !dbg !40 + %424 = and <2 x i1> %392, %420, !dbg !41 + %425 = xor <2 x i32> %376, %374, !dbg !42 + %426 = xor <2 x i32> %381, %379, !dbg !42 + %427 = xor <2 x i32> %386, %384, !dbg !42 + %428 = xor <2 x i32> %391, %389, !dbg !42 + %429 = select <2 x i1> %421, <2 x i32> %425, <2 x i32> zeroinitializer, !dbg !43 + %430 = select <2 x i1> %422, <2 x i32> %426, <2 x i32> zeroinitializer, !dbg !43 + %431 = select <2 x i1> %423, <2 x i32> %427, <2 x i32> zeroinitializer, !dbg !43 + %432 = select <2 x i1> %424, <2 x i32> %428, <2 x i32> zeroinitializer, !dbg !43 + %433 = xor <2 x i32> %429, %137, !dbg !33 + %434 = xor <2 x i32> %430, %139, !dbg !33 + %435 = insertelement <2 x i32> poison, i32 %178, i64 0, !dbg !33 + %436 = insertelement <2 x i32> %435, i32 %174, i64 1, !dbg !33 + %437 = xor <2 x i32> %431, %436, !dbg !33 + %438 = insertelement <2 x i32> poison, i32 %180, i64 0, !dbg !33 + %439 = insertelement <2 x i32> %438, i32 %176, i64 1, !dbg !33 + %440 = xor <2 x i32> %432, %439, !dbg !33 + %441 = xor <2 x i32> %400, %398, !dbg !44 + %442 = xor <2 x i32> %405, %403, !dbg !44 + %443 = xor <2 x i32> %410, %408, !dbg !44 + %444 = xor <2 x i32> %415, %413, !dbg !44 + %445 = select <2 x i1> %421, <2 x i32> %441, <2 x i32> zeroinitializer, !dbg !45 + %446 = select <2 x i1> %422, <2 x i32> %442, <2 x i32> zeroinitializer, !dbg !45 + %447 = select <2 x i1> %423, <2 x i32> %443, <2 x i32> zeroinitializer, !dbg !45 + %448 = select <2 x i1> %424, <2 x i32> %444, <2 x i32> zeroinitializer, !dbg !45 + %449 = xor <2 x i32> %445, %141, !dbg !34 + %450 = xor <2 x i32> %446, %61, !dbg !34 + %451 = xor <2 x i32> %447, %152, !dbg !34 + %452 = xor <2 x i32> %448, %142, !dbg !34 + %453 = icmp sge <2 x i32> %433, %434, !dbg !36 + %454 = icmp slt <2 x i32> %433, %434, !dbg !36 + %455 = shufflevector <2 x i1> %453, <2 x i1> %454, <2 x i32> , !dbg !36 + %456 = icmp sge <2 x i32> %437, %440, !dbg !36 + %457 = icmp slt <2 x i32> %437, %440, !dbg !36 + %458 = shufflevector <2 x i1> %456, <2 x i1> %457, <2 x i32> , !dbg !36 + %459 = icmp ne <2 x i32> %433, %434, !dbg !37 + %460 = icmp eq <2 x i32> %433, %434, !dbg !37 + %461 = shufflevector <2 x i1> %459, <2 x i1> %460, <2 x i32> , !dbg !37 + %462 = icmp ne <2 x i32> %437, %440, !dbg !37 + %463 = icmp eq <2 x i32> %437, %440, !dbg !37 + %464 = shufflevector <2 x i1> %462, <2 x i1> %463, <2 x i32> , !dbg !37 + %465 = icmp sle <2 x i32> %449, %450, !dbg !38 + %466 = icmp sgt <2 x i32> %449, %450, !dbg !38 + %467 = shufflevector <2 x i1> %465, <2 x i1> %466, <2 x i32> , !dbg !38 + %468 = icmp sle <2 x i32> %451, %452, !dbg !38 + %469 = icmp sgt <2 x i32> %451, %452, !dbg !38 + %470 = shufflevector <2 x i1> %468, <2 x i1> %469, <2 x i32> , !dbg !38 + %471 = or <2 x i1> %461, %467, !dbg !39 + %472 = and <2 x i1> %461, %467, !dbg !39 + %473 = shufflevector <2 x i1> %471, <2 x i1> %472, <2 x i32> , !dbg !39 + %474 = or <2 x i1> %464, %470, !dbg !40 + %475 = and <2 x i1> %464, %470, !dbg !40 + %476 = shufflevector <2 x i1> %474, <2 x i1> %475, <2 x i32> , !dbg !40 + %477 = and <2 x i1> %455, %473, !dbg !40 + %478 = or <2 x i1> %455, %473, !dbg !40 + %479 = shufflevector <2 x i1> %477, <2 x i1> %478, <2 x i32> , !dbg !40 + %480 = and <2 x i1> %458, %476, !dbg !41 + %481 = or <2 x i1> %458, %476, !dbg !41 + %482 = shufflevector <2 x i1> %480, <2 x i1> %481, <2 x i32> , !dbg !41 + %483 = xor <2 x i32> %434, %433, !dbg !42 + %484 = xor <2 x i32> %440, %437, !dbg !42 + %485 = select <2 x i1> %479, <2 x i32> %483, <2 x i32> zeroinitializer, !dbg !43 + %486 = select <2 x i1> %482, <2 x i32> %484, <2 x i32> zeroinitializer, !dbg !43 + %foldExtExtBinop = xor <2 x i32> %485, %433, !dbg !33 + %487 = extractelement <2 x i32> %foldExtExtBinop, i64 1, !dbg !33 + %foldExtExtBinop83 = xor <2 x i32> %485, %433, !dbg !33 + %488 = extractelement <2 x i32> %foldExtExtBinop83, i64 0, !dbg !33 + %489 = xor <2 x i32> %485, %434, !dbg !33 + %foldExtExtBinop85 = xor <2 x i32> %486, %437, !dbg !33 + %490 = extractelement <2 x i32> %foldExtExtBinop85, i64 1, !dbg !33 + %foldExtExtBinop87 = xor <2 x i32> %486, %437, !dbg !33 + %491 = extractelement <2 x i32> %foldExtExtBinop87, i64 0, !dbg !33 + %492 = xor <2 x i32> %486, %440, !dbg !33 + %493 = extractelement <2 x i32> %449, i64 1, !dbg !44 + %494 = extractelement <2 x i32> %450, i64 1, !dbg !44 + %495 = xor i32 %494, %493, !dbg !44 + %496 = extractelement <2 x i32> %449, i64 0, !dbg !44 + %497 = extractelement <2 x i32> %450, i64 0, !dbg !44 + %498 = xor i32 %497, %496, !dbg !44 + %499 = extractelement <2 x i32> %451, i64 1, !dbg !44 + %500 = extractelement <2 x i32> %452, i64 1, !dbg !44 + %501 = xor i32 %500, %499, !dbg !44 + %502 = extractelement <2 x i32> %451, i64 0, !dbg !44 + %503 = extractelement <2 x i32> %452, i64 0, !dbg !44 + %504 = xor i32 %503, %502, !dbg !44 + %505 = extractelement <2 x i1> %478, i64 1, !dbg !45 + %506 = select i1 %505, i32 %495, i32 0, !dbg !45 + %507 = extractelement <2 x i1> %477, i64 0, !dbg !45 + %508 = select i1 %507, i32 %498, i32 0, !dbg !45 + %509 = extractelement <2 x i1> %481, i64 1, !dbg !45 + %510 = select i1 %509, i32 %501, i32 0, !dbg !45 + %511 = extractelement <2 x i1> %480, i64 0, !dbg !45 + %512 = select i1 %511, i32 %504, i32 0, !dbg !45 + %513 = xor i32 %506, %493, !dbg !34 + %514 = xor i32 %506, %494, !dbg !34 + %515 = xor i32 %508, %496, !dbg !34 + %516 = xor i32 %508, %497, !dbg !34 + %517 = xor i32 %510, %499, !dbg !34 + %518 = xor i32 %510, %500, !dbg !34 + %519 = xor i32 %512, %502, !dbg !34 + %520 = xor i32 %512, %503, !dbg !34 + %521 = mul nuw nsw i32 %487, %96, !dbg !30 + %522 = extractelement <2 x i32> %489, i64 1, !dbg !31 + %523 = mul nuw nsw i32 %522, %96, !dbg !30 + %524 = mul nuw nsw i32 %488, %96, !dbg !30 + %525 = extractelement <2 x i32> %489, i64 0, !dbg !31 + %526 = mul nuw nsw i32 %525, %96, !dbg !30 + %527 = mul nuw nsw i32 %490, %96, !dbg !30 + %528 = extractelement <2 x i32> %492, i64 1, !dbg !31 + %529 = mul nuw nsw i32 %528, %96, !dbg !30 + %530 = mul nuw nsw i32 %491, %96, !dbg !30 + %531 = extractelement <2 x i32> %492, i64 0, !dbg !31 + %532 = mul nuw nsw i32 %531, %96, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %533 = insertelement <1 x i32> poison, i32 %521, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %533, i1 true) #5, !dbg !21 + %534 = insertelement <1 x i32> poison, i32 %523, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %534, i1 true) #5, !dbg !21 + %535 = insertelement <1 x i32> poison, i32 %524, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %535, i1 true) #5, !dbg !21 + %536 = insertelement <1 x i32> poison, i32 %526, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %536, i1 true) #5, !dbg !21 + %537 = insertelement <1 x i32> poison, i32 %527, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %537, i1 true) #5, !dbg !21 + %538 = insertelement <1 x i32> poison, i32 %529, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %538, i1 true) #5, !dbg !21 + %539 = insertelement <1 x i32> poison, i32 %530, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %539, i1 true) #5, !dbg !21 + %540 = insertelement <1 x i32> poison, i32 %532, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %540, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %541 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 1, i32 31), !dbg !21 + %543 = add i32 %542, %541, !dbg !35 + %544 = insertelement <1 x i32> poison, i32 %543, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %544, i1 %50) #5, !dbg !21 + %545 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %546 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 1, i32 31), !dbg !21 + %547 = add i32 %546, %545, !dbg !35 + %548 = insertelement <1 x i32> poison, i32 %547, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %548, i1 %50) #5, !dbg !21 + %549 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 1, i32 31), !dbg !21 + %551 = add i32 %550, %549, !dbg !35 + %552 = insertelement <1 x i32> poison, i32 %551, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %552, i1 %50) #5, !dbg !21 + %553 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %554 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %553, i32 1, i32 31), !dbg !21 + %555 = add i32 %554, %553, !dbg !35 + %556 = insertelement <1 x i32> poison, i32 %555, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %556, i1 %50) #5, !dbg !21 + %557 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %558 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %557, i32 1, i32 31), !dbg !21 + %559 = add i32 %558, %557, !dbg !35 + %560 = insertelement <1 x i32> poison, i32 %559, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %560, i1 %50) #5, !dbg !21 + %561 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %562 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %561, i32 1, i32 31), !dbg !21 + %563 = add i32 %562, %561, !dbg !35 + %564 = insertelement <1 x i32> poison, i32 %563, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %564, i1 %50) #5, !dbg !21 + %565 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !21 + %567 = add i32 %566, %565, !dbg !35 + %568 = insertelement <1 x i32> poison, i32 %567, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %568, i1 %50) #5, !dbg !21 + %569 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %569, i32 1, i32 31), !dbg !21 + %571 = add i32 %570, %569, !dbg !35 + %572 = insertelement <1 x i32> poison, i32 %571, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %572, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %573 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %574 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %575 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %576 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %577 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %578 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %579 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %580 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %581 = mul nuw nsw i32 %487, %.lobit, !dbg !31 + %582 = mul nuw nsw i32 %522, %.lobit, !dbg !31 + %583 = mul nuw nsw i32 %488, %.lobit, !dbg !31 + %584 = mul nuw nsw i32 %525, %.lobit, !dbg !31 + %585 = mul nuw nsw i32 %490, %.lobit, !dbg !31 + %586 = mul nuw nsw i32 %528, %.lobit, !dbg !31 + %587 = mul nuw nsw i32 %491, %.lobit, !dbg !31 + %588 = mul nuw nsw i32 %531, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %589 = insertelement <1 x i32> poison, i32 %581, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %589, i1 true) #5, !dbg !21 + %590 = insertelement <1 x i32> poison, i32 %582, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %590, i1 true) #5, !dbg !21 + %591 = insertelement <1 x i32> poison, i32 %583, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %591, i1 true) #5, !dbg !21 + %592 = insertelement <1 x i32> poison, i32 %584, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %592, i1 true) #5, !dbg !21 + %593 = insertelement <1 x i32> poison, i32 %585, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %593, i1 true) #5, !dbg !21 + %594 = insertelement <1 x i32> poison, i32 %586, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %594, i1 true) #5, !dbg !21 + %595 = insertelement <1 x i32> poison, i32 %587, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %595, i1 true) #5, !dbg !21 + %596 = insertelement <1 x i32> poison, i32 %588, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %596, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %597 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %598 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %597, i32 1, i32 31), !dbg !21 + %599 = add i32 %598, %597, !dbg !35 + %600 = insertelement <1 x i32> poison, i32 %599, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %600, i1 %50) #5, !dbg !21 + %601 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !21 + %603 = add i32 %602, %601, !dbg !35 + %604 = insertelement <1 x i32> poison, i32 %603, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %604, i1 %50) #5, !dbg !21 + %605 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %606 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %605, i32 1, i32 31), !dbg !21 + %607 = add i32 %606, %605, !dbg !35 + %608 = insertelement <1 x i32> poison, i32 %607, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %608, i1 %50) #5, !dbg !21 + %609 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %610 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %609, i32 1, i32 31), !dbg !21 + %611 = add i32 %610, %609, !dbg !35 + %612 = insertelement <1 x i32> poison, i32 %611, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %612, i1 %50) #5, !dbg !21 + %613 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 1, i32 31), !dbg !21 + %615 = add i32 %614, %613, !dbg !35 + %616 = insertelement <1 x i32> poison, i32 %615, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %616, i1 %50) #5, !dbg !21 + %617 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %618 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %617, i32 1, i32 31), !dbg !21 + %619 = add i32 %618, %617, !dbg !35 + %620 = insertelement <1 x i32> poison, i32 %619, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %620, i1 %50) #5, !dbg !21 + %621 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %622 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %621, i32 1, i32 31), !dbg !21 + %623 = add i32 %622, %621, !dbg !35 + %624 = insertelement <1 x i32> poison, i32 %623, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %624, i1 %50) #5, !dbg !21 + %625 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 1, i32 31), !dbg !21 + %627 = add i32 %626, %625, !dbg !35 + %628 = insertelement <1 x i32> poison, i32 %627, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %628, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %629 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %630 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %631 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %632 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %633 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %634 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %635 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %636 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %637 = mul nuw nsw i32 %513, %96, !dbg !32 + %638 = mul nuw nsw i32 %514, %96, !dbg !32 + %639 = mul nuw nsw i32 %515, %96, !dbg !32 + %640 = mul nuw nsw i32 %516, %96, !dbg !32 + %641 = mul nuw nsw i32 %517, %96, !dbg !32 + %642 = mul nuw nsw i32 %518, %96, !dbg !32 + %643 = mul nuw nsw i32 %519, %96, !dbg !32 + %644 = mul nuw nsw i32 %520, %96, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %645 = insertelement <1 x i32> poison, i32 %637, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %645, i1 true) #5, !dbg !21 + %646 = insertelement <1 x i32> poison, i32 %638, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %646, i1 true) #5, !dbg !21 + %647 = insertelement <1 x i32> poison, i32 %639, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %647, i1 true) #5, !dbg !21 + %648 = insertelement <1 x i32> poison, i32 %640, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %648, i1 true) #5, !dbg !21 + %649 = insertelement <1 x i32> poison, i32 %641, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %649, i1 true) #5, !dbg !21 + %650 = insertelement <1 x i32> poison, i32 %642, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %650, i1 true) #5, !dbg !21 + %651 = insertelement <1 x i32> poison, i32 %643, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %651, i1 true) #5, !dbg !21 + %652 = insertelement <1 x i32> poison, i32 %644, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %652, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %653 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %654 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %653, i32 1, i32 31), !dbg !21 + %655 = add i32 %654, %653, !dbg !35 + %656 = insertelement <1 x i32> poison, i32 %655, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %656, i1 %50) #5, !dbg !21 + %657 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %658 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %657, i32 1, i32 31), !dbg !21 + %659 = add i32 %658, %657, !dbg !35 + %660 = insertelement <1 x i32> poison, i32 %659, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %660, i1 %50) #5, !dbg !21 + %661 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !21 + %663 = add i32 %662, %661, !dbg !35 + %664 = insertelement <1 x i32> poison, i32 %663, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %664, i1 %50) #5, !dbg !21 + %665 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %666 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %665, i32 1, i32 31), !dbg !21 + %667 = add i32 %666, %665, !dbg !35 + %668 = insertelement <1 x i32> poison, i32 %667, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %668, i1 %50) #5, !dbg !21 + %669 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %670 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %669, i32 1, i32 31), !dbg !21 + %671 = add i32 %670, %669, !dbg !35 + %672 = insertelement <1 x i32> poison, i32 %671, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %672, i1 %50) #5, !dbg !21 + %673 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %674 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %673, i32 1, i32 31), !dbg !21 + %675 = add i32 %674, %673, !dbg !35 + %676 = insertelement <1 x i32> poison, i32 %675, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %676, i1 %50) #5, !dbg !21 + %677 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 1, i32 31), !dbg !21 + %679 = add i32 %678, %677, !dbg !35 + %680 = insertelement <1 x i32> poison, i32 %679, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %680, i1 %50) #5, !dbg !21 + %681 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %682 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %681, i32 1, i32 31), !dbg !21 + %683 = add i32 %682, %681, !dbg !35 + %684 = insertelement <1 x i32> poison, i32 %683, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %684, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %685 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %686 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %687 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %688 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %689 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %690 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %691 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %692 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %693 = mul nuw nsw i32 %513, %.lobit, !dbg !25 + %694 = mul nuw nsw i32 %514, %.lobit, !dbg !25 + %695 = mul nuw nsw i32 %515, %.lobit, !dbg !25 + %696 = mul nuw nsw i32 %516, %.lobit, !dbg !25 + %697 = mul nuw nsw i32 %517, %.lobit, !dbg !25 + %698 = mul nuw nsw i32 %518, %.lobit, !dbg !25 + %699 = mul nuw nsw i32 %519, %.lobit, !dbg !25 + %700 = mul nuw nsw i32 %520, %.lobit, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %701 = insertelement <1 x i32> poison, i32 %693, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %701, i1 true) #5, !dbg !21 + %702 = insertelement <1 x i32> poison, i32 %694, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %702, i1 true) #5, !dbg !21 + %703 = insertelement <1 x i32> poison, i32 %695, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %703, i1 true) #5, !dbg !21 + %704 = insertelement <1 x i32> poison, i32 %696, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %704, i1 true) #5, !dbg !21 + %705 = insertelement <1 x i32> poison, i32 %697, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %705, i1 true) #5, !dbg !21 + %706 = insertelement <1 x i32> poison, i32 %698, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %706, i1 true) #5, !dbg !21 + %707 = insertelement <1 x i32> poison, i32 %699, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %707, i1 true) #5, !dbg !21 + %708 = insertelement <1 x i32> poison, i32 %700, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %708, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %709 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %710 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %709, i32 1, i32 31), !dbg !21 + %711 = add i32 %710, %709, !dbg !35 + %712 = insertelement <1 x i32> poison, i32 %711, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %712, i1 %50) #5, !dbg !21 + %713 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %714 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %713, i32 1, i32 31), !dbg !21 + %715 = add i32 %714, %713, !dbg !35 + %716 = insertelement <1 x i32> poison, i32 %715, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %716, i1 %50) #5, !dbg !21 + %717 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %718 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %717, i32 1, i32 31), !dbg !21 + %719 = add i32 %718, %717, !dbg !35 + %720 = insertelement <1 x i32> poison, i32 %719, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %720, i1 %50) #5, !dbg !21 + %721 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !21 + %723 = add i32 %722, %721, !dbg !35 + %724 = insertelement <1 x i32> poison, i32 %723, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %724, i1 %50) #5, !dbg !21 + %725 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %726 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %725, i32 1, i32 31), !dbg !21 + %727 = add i32 %726, %725, !dbg !35 + %728 = insertelement <1 x i32> poison, i32 %727, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %728, i1 %50) #5, !dbg !21 + %729 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %730 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %729, i32 1, i32 31), !dbg !21 + %731 = add i32 %730, %729, !dbg !35 + %732 = insertelement <1 x i32> poison, i32 %731, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %732, i1 %50) #5, !dbg !21 + %733 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %734 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %733, i32 1, i32 31), !dbg !21 + %735 = add i32 %734, %733, !dbg !35 + %736 = insertelement <1 x i32> poison, i32 %735, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %736, i1 %50) #5, !dbg !21 + %737 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %738 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %737, i32 1, i32 31), !dbg !21 + %739 = add i32 %738, %737, !dbg !35 + %740 = insertelement <1 x i32> poison, i32 %739, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %740, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %741 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %742 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %743 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %744 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %745 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %746 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %747 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %748 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %749 = icmp slt i32 %573, %629, !dbg !36 + %750 = icmp slt i32 %574, %630, !dbg !36 + %751 = icmp sge i32 %575, %631, !dbg !36 + %752 = icmp sge i32 %576, %632, !dbg !36 + %753 = icmp slt i32 %577, %633, !dbg !36 + %754 = icmp slt i32 %578, %634, !dbg !36 + %755 = icmp sge i32 %579, %635, !dbg !36 + %756 = icmp sge i32 %580, %636, !dbg !36 + %757 = icmp eq i32 %573, %629, !dbg !37 + %758 = icmp eq i32 %574, %630, !dbg !37 + %759 = icmp ne i32 %575, %631, !dbg !37 + %760 = icmp ne i32 %576, %632, !dbg !37 + %761 = icmp eq i32 %577, %633, !dbg !37 + %762 = icmp eq i32 %578, %634, !dbg !37 + %763 = icmp ne i32 %579, %635, !dbg !37 + %764 = icmp ne i32 %580, %636, !dbg !37 + %765 = icmp sgt i32 %685, %741, !dbg !38 + %766 = icmp sgt i32 %686, %742, !dbg !38 + %767 = icmp sle i32 %687, %743, !dbg !38 + %768 = icmp sle i32 %688, %744, !dbg !38 + %769 = icmp sgt i32 %689, %745, !dbg !38 + %770 = icmp sgt i32 %690, %746, !dbg !38 + %771 = icmp sle i32 %691, %747, !dbg !38 + %772 = icmp sle i32 %692, %748, !dbg !38 + %773 = and i1 %757, %765, !dbg !39 + %.not28 = or i1 %759, %767, !dbg !39 + %774 = insertelement <2 x i1> poison, i1 %760, i64 0, !dbg !39 + %775 = insertelement <2 x i1> %774, i1 %758, i64 1, !dbg !39 + %776 = insertelement <2 x i1> poison, i1 %768, i64 0, !dbg !39 + %777 = insertelement <2 x i1> %776, i1 %766, i64 1, !dbg !39 + %778 = or <2 x i1> %775, %777, !dbg !39 + %779 = and <2 x i1> %775, %777, !dbg !39 + %780 = shufflevector <2 x i1> %778, <2 x i1> %779, <2 x i32> , !dbg !39 + %781 = and i1 %761, %769, !dbg !39 + %.not36 = or i1 %763, %771, !dbg !39 + %782 = insertelement <2 x i1> poison, i1 %764, i64 0, !dbg !40 + %783 = insertelement <2 x i1> %782, i1 %762, i64 1, !dbg !40 + %784 = insertelement <2 x i1> poison, i1 %772, i64 0, !dbg !40 + %785 = insertelement <2 x i1> %784, i1 %770, i64 1, !dbg !40 + %786 = or <2 x i1> %783, %785, !dbg !40 + %787 = and <2 x i1> %783, %785, !dbg !40 + %788 = shufflevector <2 x i1> %786, <2 x i1> %787, <2 x i32> , !dbg !40 + %789 = or i1 %749, %773, !dbg !40 + %.not26 = and i1 %751, %.not28, !dbg !40 + %790 = insertelement <2 x i1> poison, i1 %752, i64 0, !dbg !40 + %791 = insertelement <2 x i1> %790, i1 %750, i64 1, !dbg !40 + %792 = and <2 x i1> %791, %780, !dbg !40 + %793 = or <2 x i1> %791, %780, !dbg !40 + %794 = shufflevector <2 x i1> %792, <2 x i1> %793, <2 x i32> , !dbg !40 + %795 = or i1 %753, %781, !dbg !40 + %.not34 = and i1 %755, %.not36, !dbg !40 + %796 = insertelement <2 x i1> poison, i1 %756, i64 0, !dbg !41 + %797 = insertelement <2 x i1> %796, i1 %754, i64 1, !dbg !41 + %798 = and <2 x i1> %797, %788, !dbg !41 + %799 = or <2 x i1> %797, %788, !dbg !41 + %800 = shufflevector <2 x i1> %798, <2 x i1> %799, <2 x i32> , !dbg !41 + %801 = xor i32 %629, %573, !dbg !42 + %802 = xor i32 %631, %575, !dbg !42 + %803 = insertelement <2 x i32> poison, i32 %632, i64 0, !dbg !42 + %804 = insertelement <2 x i32> %803, i32 %630, i64 1, !dbg !42 + %805 = insertelement <2 x i32> poison, i32 %576, i64 0, !dbg !42 + %806 = insertelement <2 x i32> %805, i32 %574, i64 1, !dbg !42 + %807 = xor <2 x i32> %804, %806, !dbg !42 + %808 = xor i32 %633, %577, !dbg !42 + %809 = xor i32 %635, %579, !dbg !42 + %810 = insertelement <2 x i32> poison, i32 %636, i64 0, !dbg !42 + %811 = insertelement <2 x i32> %810, i32 %634, i64 1, !dbg !42 + %812 = insertelement <2 x i32> poison, i32 %580, i64 0, !dbg !42 + %813 = insertelement <2 x i32> %812, i32 %578, i64 1, !dbg !42 + %814 = xor <2 x i32> %811, %813, !dbg !42 + %815 = select i1 %789, i32 %801, i32 0, !dbg !43 + %816 = select i1 %.not26, i32 %802, i32 0, !dbg !43 + %817 = select <2 x i1> %794, <2 x i32> %807, <2 x i32> zeroinitializer, !dbg !43 + %818 = select i1 %795, i32 %808, i32 0, !dbg !43 + %819 = select i1 %.not34, i32 %809, i32 0, !dbg !43 + %820 = select <2 x i1> %800, <2 x i32> %814, <2 x i32> zeroinitializer, !dbg !43 + %821 = xor i32 %815, %487, !dbg !33 + %822 = xor i32 %816, %488, !dbg !33 + %823 = xor <2 x i32> %817, %489, !dbg !33 + %824 = xor i32 %818, %490, !dbg !33 + %825 = xor i32 %819, %491, !dbg !33 + %826 = xor <2 x i32> %820, %492, !dbg !33 + %827 = xor i32 %741, %685, !dbg !44 + %828 = xor i32 %742, %686, !dbg !44 + %829 = xor i32 %743, %687, !dbg !44 + %830 = xor i32 %744, %688, !dbg !44 + %831 = xor i32 %745, %689, !dbg !44 + %832 = xor i32 %746, %690, !dbg !44 + %833 = xor i32 %747, %691, !dbg !44 + %834 = xor i32 %748, %692, !dbg !44 + %835 = select i1 %789, i32 %827, i32 0, !dbg !45 + %836 = extractelement <2 x i1> %793, i64 1, !dbg !45 + %837 = select i1 %836, i32 %828, i32 0, !dbg !45 + %838 = select i1 %.not26, i32 %829, i32 0, !dbg !45 + %839 = extractelement <2 x i1> %792, i64 0, !dbg !45 + %840 = select i1 %839, i32 %830, i32 0, !dbg !45 + %841 = select i1 %795, i32 %831, i32 0, !dbg !45 + %842 = extractelement <2 x i1> %799, i64 1, !dbg !45 + %843 = select i1 %842, i32 %832, i32 0, !dbg !45 + %844 = select i1 %.not34, i32 %833, i32 0, !dbg !45 + %845 = extractelement <2 x i1> %798, i64 0, !dbg !45 + %846 = select i1 %845, i32 %834, i32 0, !dbg !45 + %847 = xor i32 %835, %513, !dbg !34 + %848 = xor i32 %837, %514, !dbg !34 + %849 = xor i32 %838, %515, !dbg !34 + %850 = xor i32 %840, %516, !dbg !34 + %851 = xor i32 %841, %517, !dbg !34 + %852 = xor i32 %843, %518, !dbg !34 + %853 = xor i32 %844, %519, !dbg !34 + %854 = xor i32 %846, %520, !dbg !34 + %855 = icmp slt i32 %821, %822, !dbg !36 + %856 = extractelement <2 x i32> %823, i64 0, !dbg !36 + %857 = extractelement <2 x i32> %823, i64 1, !dbg !36 + %858 = icmp slt i32 %857, %856, !dbg !36 + %859 = icmp sge i32 %824, %825, !dbg !36 + %860 = extractelement <2 x i32> %826, i64 0, !dbg !36 + %861 = extractelement <2 x i32> %826, i64 1, !dbg !36 + %862 = icmp sge i32 %861, %860, !dbg !36 + %863 = icmp eq i32 %821, %822, !dbg !37 + %864 = icmp eq i32 %857, %856, !dbg !37 + %865 = icmp ne i32 %824, %825, !dbg !37 + %866 = icmp ne i32 %861, %860, !dbg !37 + %867 = icmp sgt i32 %847, %849, !dbg !38 + %868 = icmp sgt i32 %848, %850, !dbg !38 + %869 = icmp sle i32 %851, %853, !dbg !38 + %870 = icmp sle i32 %852, %854, !dbg !38 + %871 = and i1 %863, %867, !dbg !39 + %872 = and i1 %864, %868, !dbg !39 + %.not44 = or i1 %865, %869, !dbg !39 + %.not48 = or i1 %866, %870, !dbg !40 + %873 = or i1 %855, %871, !dbg !40 + %874 = or i1 %858, %872, !dbg !40 + %.not42 = and i1 %859, %.not44, !dbg !40 + %.not46 = and i1 %862, %.not48, !dbg !41 + %875 = xor i32 %822, %821, !dbg !42 + %876 = xor i32 %856, %857, !dbg !42 + %877 = xor i32 %825, %824, !dbg !42 + %878 = xor i32 %860, %861, !dbg !42 + %879 = select i1 %873, i32 %875, i32 0, !dbg !43 + %880 = select i1 %874, i32 %876, i32 0, !dbg !43 + %881 = select i1 %.not42, i32 %877, i32 0, !dbg !43 + %882 = select i1 %.not46, i32 %878, i32 0, !dbg !43 + %883 = xor i32 %879, %821, !dbg !33 + %884 = xor i32 %880, %857, !dbg !33 + %885 = xor i32 %879, %822, !dbg !33 + %886 = xor i32 %880, %856, !dbg !33 + %887 = xor i32 %881, %824, !dbg !33 + %888 = xor i32 %882, %861, !dbg !33 + %889 = xor i32 %881, %825, !dbg !33 + %890 = xor i32 %882, %860, !dbg !33 + %891 = xor i32 %849, %847, !dbg !44 + %892 = xor i32 %850, %848, !dbg !44 + %893 = xor i32 %853, %851, !dbg !44 + %894 = xor i32 %854, %852, !dbg !44 + %895 = select i1 %873, i32 %891, i32 0, !dbg !45 + %896 = select i1 %874, i32 %892, i32 0, !dbg !45 + %897 = select i1 %.not42, i32 %893, i32 0, !dbg !45 + %898 = select i1 %.not46, i32 %894, i32 0, !dbg !45 + %899 = xor i32 %895, %847, !dbg !34 + %900 = xor i32 %896, %848, !dbg !34 + %901 = xor i32 %895, %849, !dbg !34 + %902 = xor i32 %896, %850, !dbg !34 + %903 = xor i32 %897, %851, !dbg !34 + %904 = xor i32 %898, %852, !dbg !34 + %905 = xor i32 %897, %853, !dbg !34 + %906 = xor i32 %898, %854, !dbg !34 + %907 = icmp slt i32 %883, %884, !dbg !36 + %908 = icmp slt i32 %885, %886, !dbg !36 + %909 = icmp sge i32 %887, %888, !dbg !36 + %910 = icmp sge i32 %889, %890, !dbg !36 + %911 = icmp eq i32 %883, %884, !dbg !37 + %912 = icmp eq i32 %885, %886, !dbg !37 + %913 = icmp ne i32 %887, %888, !dbg !37 + %914 = icmp ne i32 %889, %890, !dbg !37 + %915 = icmp sgt i32 %899, %900, !dbg !38 + %916 = icmp sgt i32 %901, %902, !dbg !38 + %917 = icmp sle i32 %903, %904, !dbg !38 + %918 = icmp sle i32 %905, %906, !dbg !38 + %919 = and i1 %911, %915, !dbg !39 + %920 = and i1 %912, %916, !dbg !39 + %.not52 = or i1 %913, %917, !dbg !39 + %.not56 = or i1 %914, %918, !dbg !40 + %921 = or i1 %907, %919, !dbg !40 + %922 = or i1 %908, %920, !dbg !40 + %.not50 = and i1 %909, %.not52, !dbg !40 + %.not54 = and i1 %910, %.not56, !dbg !41 + %923 = xor i32 %884, %883, !dbg !42 + %924 = xor i32 %886, %885, !dbg !42 + %925 = xor i32 %888, %887, !dbg !42 + %926 = xor i32 %890, %889, !dbg !42 + %927 = select i1 %921, i32 %923, i32 0, !dbg !43 + %928 = select i1 %922, i32 %924, i32 0, !dbg !43 + %929 = select i1 %.not50, i32 %925, i32 0, !dbg !43 + %930 = select i1 %.not54, i32 %926, i32 0, !dbg !43 + %931 = xor i32 %927, %883, !dbg !33 + %932 = xor i32 %927, %884, !dbg !33 + %933 = xor i32 %928, %885, !dbg !33 + %934 = xor i32 %928, %886, !dbg !33 + %935 = xor i32 %929, %887, !dbg !33 + %936 = xor i32 %929, %888, !dbg !33 + %937 = xor i32 %930, %889, !dbg !33 + %938 = xor i32 %930, %890, !dbg !33 + %939 = xor i32 %900, %899, !dbg !44 + %940 = xor i32 %902, %901, !dbg !44 + %941 = xor i32 %904, %903, !dbg !44 + %942 = xor i32 %906, %905, !dbg !44 + %943 = select i1 %921, i32 %939, i32 0, !dbg !45 + %944 = select i1 %922, i32 %940, i32 0, !dbg !45 + %945 = select i1 %.not50, i32 %941, i32 0, !dbg !45 + %946 = select i1 %.not54, i32 %942, i32 0, !dbg !45 + %947 = xor i32 %943, %899, !dbg !34 + %948 = xor i32 %943, %900, !dbg !34 + %949 = xor i32 %944, %901, !dbg !34 + %950 = xor i32 %944, %902, !dbg !34 + %951 = xor i32 %945, %903, !dbg !34 + %952 = xor i32 %945, %904, !dbg !34 + %953 = xor i32 %946, %905, !dbg !34 + %954 = xor i32 %946, %906, !dbg !34 + %955 = mul nuw nsw i32 %931, %96, !dbg !30 + %956 = mul nuw nsw i32 %932, %96, !dbg !30 + %957 = mul nuw nsw i32 %933, %96, !dbg !30 + %958 = mul nuw nsw i32 %934, %96, !dbg !30 + %959 = mul nuw nsw i32 %935, %96, !dbg !30 + %960 = mul nuw nsw i32 %936, %96, !dbg !30 + %961 = mul nuw nsw i32 %937, %96, !dbg !30 + %962 = mul nuw nsw i32 %938, %96, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %963 = insertelement <1 x i32> poison, i32 %955, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %963, i1 true) #5, !dbg !21 + %964 = insertelement <1 x i32> poison, i32 %956, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %964, i1 true) #5, !dbg !21 + %965 = insertelement <1 x i32> poison, i32 %957, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %965, i1 true) #5, !dbg !21 + %966 = insertelement <1 x i32> poison, i32 %958, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %966, i1 true) #5, !dbg !21 + %967 = insertelement <1 x i32> poison, i32 %959, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %967, i1 true) #5, !dbg !21 + %968 = insertelement <1 x i32> poison, i32 %960, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %968, i1 true) #5, !dbg !21 + %969 = insertelement <1 x i32> poison, i32 %961, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %969, i1 true) #5, !dbg !21 + %970 = insertelement <1 x i32> poison, i32 %962, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %970, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %971 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %972 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %971, i32 1, i32 31), !dbg !21 + %973 = add i32 %972, %971, !dbg !35 + %974 = insertelement <1 x i32> poison, i32 %973, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %974, i1 %50) #5, !dbg !21 + %975 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %976 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %975, i32 1, i32 31), !dbg !21 + %977 = add i32 %976, %975, !dbg !35 + %978 = insertelement <1 x i32> poison, i32 %977, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %978, i1 %50) #5, !dbg !21 + %979 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %980 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %979, i32 1, i32 31), !dbg !21 + %981 = add i32 %980, %979, !dbg !35 + %982 = insertelement <1 x i32> poison, i32 %981, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %982, i1 %50) #5, !dbg !21 + %983 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %984 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %983, i32 1, i32 31), !dbg !21 + %985 = add i32 %984, %983, !dbg !35 + %986 = insertelement <1 x i32> poison, i32 %985, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %986, i1 %50) #5, !dbg !21 + %987 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %988 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !21 + %989 = add i32 %988, %987, !dbg !35 + %990 = insertelement <1 x i32> poison, i32 %989, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %990, i1 %50) #5, !dbg !21 + %991 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %991, i32 1, i32 31), !dbg !21 + %993 = add i32 %992, %991, !dbg !35 + %994 = insertelement <1 x i32> poison, i32 %993, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %994, i1 %50) #5, !dbg !21 + %995 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %996 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %995, i32 1, i32 31), !dbg !21 + %997 = add i32 %996, %995, !dbg !35 + %998 = insertelement <1 x i32> poison, i32 %997, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %998, i1 %50) #5, !dbg !21 + %999 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1000 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %999, i32 1, i32 31), !dbg !21 + %1001 = add i32 %1000, %999, !dbg !35 + %1002 = insertelement <1 x i32> poison, i32 %1001, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1002, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1003 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1004 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1005 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1006 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1007 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1008 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1009 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1010 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1011 = mul nuw nsw i32 %931, %.lobit, !dbg !31 + %1012 = mul nuw nsw i32 %932, %.lobit, !dbg !31 + %1013 = mul nuw nsw i32 %933, %.lobit, !dbg !31 + %1014 = mul nuw nsw i32 %934, %.lobit, !dbg !31 + %1015 = mul nuw nsw i32 %935, %.lobit, !dbg !31 + %1016 = mul nuw nsw i32 %936, %.lobit, !dbg !31 + %1017 = mul nuw nsw i32 %937, %.lobit, !dbg !31 + %1018 = mul nuw nsw i32 %938, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1019 = insertelement <1 x i32> poison, i32 %1011, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1019, i1 true) #5, !dbg !21 + %1020 = insertelement <1 x i32> poison, i32 %1012, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1020, i1 true) #5, !dbg !21 + %1021 = insertelement <1 x i32> poison, i32 %1013, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1021, i1 true) #5, !dbg !21 + %1022 = insertelement <1 x i32> poison, i32 %1014, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1022, i1 true) #5, !dbg !21 + %1023 = insertelement <1 x i32> poison, i32 %1015, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1023, i1 true) #5, !dbg !21 + %1024 = insertelement <1 x i32> poison, i32 %1016, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1024, i1 true) #5, !dbg !21 + %1025 = insertelement <1 x i32> poison, i32 %1017, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1025, i1 true) #5, !dbg !21 + %1026 = insertelement <1 x i32> poison, i32 %1018, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1026, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1027 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1028 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1027, i32 1, i32 31), !dbg !21 + %1029 = add i32 %1028, %1027, !dbg !35 + %1030 = insertelement <1 x i32> poison, i32 %1029, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1030, i1 %50) #5, !dbg !21 + %1031 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1032 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1031, i32 1, i32 31), !dbg !21 + %1033 = add i32 %1032, %1031, !dbg !35 + %1034 = insertelement <1 x i32> poison, i32 %1033, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1034, i1 %50) #5, !dbg !21 + %1035 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1036 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1035, i32 1, i32 31), !dbg !21 + %1037 = add i32 %1036, %1035, !dbg !35 + %1038 = insertelement <1 x i32> poison, i32 %1037, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1038, i1 %50) #5, !dbg !21 + %1039 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1040 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1039, i32 1, i32 31), !dbg !21 + %1041 = add i32 %1040, %1039, !dbg !35 + %1042 = insertelement <1 x i32> poison, i32 %1041, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1042, i1 %50) #5, !dbg !21 + %1043 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1044 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1043, i32 1, i32 31), !dbg !21 + %1045 = add i32 %1044, %1043, !dbg !35 + %1046 = insertelement <1 x i32> poison, i32 %1045, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1046, i1 %50) #5, !dbg !21 + %1047 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1048 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1047, i32 1, i32 31), !dbg !21 + %1049 = add i32 %1048, %1047, !dbg !35 + %1050 = insertelement <1 x i32> poison, i32 %1049, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1050, i1 %50) #5, !dbg !21 + %1051 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1052 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1051, i32 1, i32 31), !dbg !21 + %1053 = add i32 %1052, %1051, !dbg !35 + %1054 = insertelement <1 x i32> poison, i32 %1053, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1054, i1 %50) #5, !dbg !21 + %1055 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1056 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1055, i32 1, i32 31), !dbg !21 + %1057 = add i32 %1056, %1055, !dbg !35 + %1058 = insertelement <1 x i32> poison, i32 %1057, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1058, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1059 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1060 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1061 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1062 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1063 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1064 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1065 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1066 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1067 = mul nuw nsw i32 %947, %96, !dbg !32 + %1068 = mul nuw nsw i32 %948, %96, !dbg !32 + %1069 = mul nuw nsw i32 %949, %96, !dbg !32 + %1070 = mul nuw nsw i32 %950, %96, !dbg !32 + %1071 = mul nuw nsw i32 %951, %96, !dbg !32 + %1072 = mul nuw nsw i32 %952, %96, !dbg !32 + %1073 = mul nuw nsw i32 %953, %96, !dbg !32 + %1074 = mul nuw nsw i32 %954, %96, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1075 = insertelement <1 x i32> poison, i32 %1067, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1075, i1 true) #5, !dbg !21 + %1076 = insertelement <1 x i32> poison, i32 %1068, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1076, i1 true) #5, !dbg !21 + %1077 = insertelement <1 x i32> poison, i32 %1069, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1077, i1 true) #5, !dbg !21 + %1078 = insertelement <1 x i32> poison, i32 %1070, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1078, i1 true) #5, !dbg !21 + %1079 = insertelement <1 x i32> poison, i32 %1071, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1079, i1 true) #5, !dbg !21 + %1080 = insertelement <1 x i32> poison, i32 %1072, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1080, i1 true) #5, !dbg !21 + %1081 = insertelement <1 x i32> poison, i32 %1073, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1081, i1 true) #5, !dbg !21 + %1082 = insertelement <1 x i32> poison, i32 %1074, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1082, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1083 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1084 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1083, i32 1, i32 31), !dbg !21 + %1085 = add i32 %1084, %1083, !dbg !35 + %1086 = insertelement <1 x i32> poison, i32 %1085, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1086, i1 %50) #5, !dbg !21 + %1087 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1088 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1087, i32 1, i32 31), !dbg !21 + %1089 = add i32 %1088, %1087, !dbg !35 + %1090 = insertelement <1 x i32> poison, i32 %1089, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1090, i1 %50) #5, !dbg !21 + %1091 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1092 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1091, i32 1, i32 31), !dbg !21 + %1093 = add i32 %1092, %1091, !dbg !35 + %1094 = insertelement <1 x i32> poison, i32 %1093, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1094, i1 %50) #5, !dbg !21 + %1095 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1096 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1095, i32 1, i32 31), !dbg !21 + %1097 = add i32 %1096, %1095, !dbg !35 + %1098 = insertelement <1 x i32> poison, i32 %1097, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1098, i1 %50) #5, !dbg !21 + %1099 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1099, i32 1, i32 31), !dbg !21 + %1101 = add i32 %1100, %1099, !dbg !35 + %1102 = insertelement <1 x i32> poison, i32 %1101, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1102, i1 %50) #5, !dbg !21 + %1103 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1103, i32 1, i32 31), !dbg !21 + %1105 = add i32 %1104, %1103, !dbg !35 + %1106 = insertelement <1 x i32> poison, i32 %1105, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1106, i1 %50) #5, !dbg !21 + %1107 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1107, i32 1, i32 31), !dbg !21 + %1109 = add i32 %1108, %1107, !dbg !35 + %1110 = insertelement <1 x i32> poison, i32 %1109, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1110, i1 %50) #5, !dbg !21 + %1111 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1111, i32 1, i32 31), !dbg !21 + %1113 = add i32 %1112, %1111, !dbg !35 + %1114 = insertelement <1 x i32> poison, i32 %1113, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1114, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1115 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1116 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1117 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1118 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1119 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1120 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1121 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1122 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1123 = mul nuw nsw i32 %947, %.lobit, !dbg !25 + %1124 = mul nuw nsw i32 %948, %.lobit, !dbg !25 + %1125 = mul nuw nsw i32 %949, %.lobit, !dbg !25 + %1126 = mul nuw nsw i32 %950, %.lobit, !dbg !25 + %1127 = mul nuw nsw i32 %951, %.lobit, !dbg !25 + %1128 = mul nuw nsw i32 %952, %.lobit, !dbg !25 + %1129 = mul nuw nsw i32 %953, %.lobit, !dbg !25 + %1130 = mul nuw nsw i32 %954, %.lobit, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1131 = insertelement <1 x i32> poison, i32 %1123, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1131, i1 true) #5, !dbg !21 + %1132 = insertelement <1 x i32> poison, i32 %1124, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1132, i1 true) #5, !dbg !21 + %1133 = insertelement <1 x i32> poison, i32 %1125, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1133, i1 true) #5, !dbg !21 + %1134 = insertelement <1 x i32> poison, i32 %1126, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1134, i1 true) #5, !dbg !21 + %1135 = insertelement <1 x i32> poison, i32 %1127, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1135, i1 true) #5, !dbg !21 + %1136 = insertelement <1 x i32> poison, i32 %1128, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1136, i1 true) #5, !dbg !21 + %1137 = insertelement <1 x i32> poison, i32 %1129, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1137, i1 true) #5, !dbg !21 + %1138 = insertelement <1 x i32> poison, i32 %1130, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1138, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1139 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1139, i32 1, i32 31), !dbg !21 + %1141 = add i32 %1140, %1139, !dbg !35 + %1142 = insertelement <1 x i32> poison, i32 %1141, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1142, i1 %50) #5, !dbg !21 + %1143 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1144 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1143, i32 1, i32 31), !dbg !21 + %1145 = add i32 %1144, %1143, !dbg !35 + %1146 = insertelement <1 x i32> poison, i32 %1145, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1146, i1 %50) #5, !dbg !21 + %1147 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1147, i32 1, i32 31), !dbg !21 + %1149 = add i32 %1148, %1147, !dbg !35 + %1150 = insertelement <1 x i32> poison, i32 %1149, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1150, i1 %50) #5, !dbg !21 + %1151 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1151, i32 1, i32 31), !dbg !21 + %1153 = add i32 %1152, %1151, !dbg !35 + %1154 = insertelement <1 x i32> poison, i32 %1153, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1154, i1 %50) #5, !dbg !21 + %1155 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1155, i32 1, i32 31), !dbg !21 + %1157 = add i32 %1156, %1155, !dbg !35 + %1158 = insertelement <1 x i32> poison, i32 %1157, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1158, i1 %50) #5, !dbg !21 + %1159 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1159, i32 1, i32 31), !dbg !21 + %1161 = add i32 %1160, %1159, !dbg !35 + %1162 = insertelement <1 x i32> poison, i32 %1161, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1162, i1 %50) #5, !dbg !21 + %1163 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1163, i32 1, i32 31), !dbg !21 + %1165 = add i32 %1164, %1163, !dbg !35 + %1166 = insertelement <1 x i32> poison, i32 %1165, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1166, i1 %50) #5, !dbg !21 + %1167 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1167, i32 1, i32 31), !dbg !21 + %1169 = add i32 %1168, %1167, !dbg !35 + %1170 = insertelement <1 x i32> poison, i32 %1169, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1170, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1171 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1172 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1173 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1174 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1175 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1176 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1177 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1178 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1179 = icmp slt i32 %1003, %1059, !dbg !36 + %1180 = icmp slt i32 %1004, %1060, !dbg !36 + %1181 = icmp slt i32 %1005, %1061, !dbg !36 + %1182 = icmp slt i32 %1006, %1062, !dbg !36 + %1183 = icmp sge i32 %1007, %1063, !dbg !36 + %1184 = icmp sge i32 %1008, %1064, !dbg !36 + %1185 = icmp sge i32 %1009, %1065, !dbg !36 + %1186 = icmp sge i32 %1010, %1066, !dbg !36 + %1187 = icmp eq i32 %1003, %1059, !dbg !37 + %1188 = icmp eq i32 %1004, %1060, !dbg !37 + %1189 = icmp eq i32 %1005, %1061, !dbg !37 + %1190 = icmp eq i32 %1006, %1062, !dbg !37 + %1191 = icmp ne i32 %1007, %1063, !dbg !37 + %1192 = icmp ne i32 %1008, %1064, !dbg !37 + %1193 = icmp ne i32 %1009, %1065, !dbg !37 + %1194 = icmp ne i32 %1010, %1066, !dbg !37 + %1195 = icmp sgt i32 %1115, %1171, !dbg !38 + %1196 = icmp sgt i32 %1116, %1172, !dbg !38 + %1197 = icmp sgt i32 %1117, %1173, !dbg !38 + %1198 = icmp sgt i32 %1118, %1174, !dbg !38 + %1199 = icmp sle i32 %1119, %1175, !dbg !38 + %1200 = icmp sle i32 %1120, %1176, !dbg !38 + %1201 = icmp sle i32 %1121, %1177, !dbg !38 + %1202 = icmp sle i32 %1122, %1178, !dbg !38 + %1203 = and i1 %1187, %1195, !dbg !39 + %1204 = and i1 %1188, %1196, !dbg !39 + %1205 = and i1 %1189, %1197, !dbg !39 + %1206 = and i1 %1190, %1198, !dbg !39 + %.not60 = or i1 %1191, %1199, !dbg !39 + %.not64 = or i1 %1192, %1200, !dbg !39 + %.not68 = or i1 %1193, %1201, !dbg !39 + %.not72 = or i1 %1194, %1202, !dbg !40 + %1207 = or i1 %1179, %1203, !dbg !40 + %1208 = or i1 %1180, %1204, !dbg !40 + %1209 = or i1 %1181, %1205, !dbg !40 + %1210 = or i1 %1182, %1206, !dbg !40 + %.not58 = and i1 %1183, %.not60, !dbg !40 + %.not62 = and i1 %1184, %.not64, !dbg !40 + %.not66 = and i1 %1185, %.not68, !dbg !40 + %.not70 = and i1 %1186, %.not72, !dbg !41 + %1211 = xor i32 %1059, %1003, !dbg !42 + %1212 = xor i32 %1060, %1004, !dbg !42 + %1213 = xor i32 %1061, %1005, !dbg !42 + %1214 = xor i32 %1062, %1006, !dbg !42 + %1215 = xor i32 %1063, %1007, !dbg !42 + %1216 = xor i32 %1064, %1008, !dbg !42 + %1217 = xor i32 %1065, %1009, !dbg !42 + %1218 = xor i32 %1066, %1010, !dbg !42 + %1219 = select i1 %1207, i32 %1211, i32 0, !dbg !43 + %1220 = select i1 %1208, i32 %1212, i32 0, !dbg !43 + %1221 = select i1 %1209, i32 %1213, i32 0, !dbg !43 + %1222 = select i1 %1210, i32 %1214, i32 0, !dbg !43 + %1223 = select i1 %.not58, i32 %1215, i32 0, !dbg !43 + %1224 = select i1 %.not62, i32 %1216, i32 0, !dbg !43 + %1225 = select i1 %.not66, i32 %1217, i32 0, !dbg !43 + %1226 = select i1 %.not70, i32 %1218, i32 0, !dbg !43 + %1227 = xor i32 %1219, %931, !dbg !33 + %1228 = xor i32 %1220, %932, !dbg !33 + %1229 = xor i32 %1221, %933, !dbg !33 + %1230 = xor i32 %1222, %934, !dbg !33 + %1231 = xor i32 %1223, %935, !dbg !33 + %1232 = xor i32 %1224, %936, !dbg !33 + %1233 = xor i32 %1225, %937, !dbg !33 + %1234 = xor i32 %1226, %938, !dbg !33 + %1235 = xor i32 %1171, %1115, !dbg !44 + %1236 = xor i32 %1172, %1116, !dbg !44 + %1237 = xor i32 %1173, %1117, !dbg !44 + %1238 = xor i32 %1174, %1118, !dbg !44 + %1239 = xor i32 %1175, %1119, !dbg !44 + %1240 = xor i32 %1176, %1120, !dbg !44 + %1241 = xor i32 %1177, %1121, !dbg !44 + %1242 = xor i32 %1178, %1122, !dbg !44 + %1243 = select i1 %1207, i32 %1235, i32 0, !dbg !45 + %1244 = select i1 %1208, i32 %1236, i32 0, !dbg !45 + %1245 = select i1 %1209, i32 %1237, i32 0, !dbg !45 + %1246 = select i1 %1210, i32 %1238, i32 0, !dbg !45 + %1247 = select i1 %.not58, i32 %1239, i32 0, !dbg !45 + %1248 = select i1 %.not62, i32 %1240, i32 0, !dbg !45 + %1249 = select i1 %.not66, i32 %1241, i32 0, !dbg !45 + %1250 = select i1 %.not70, i32 %1242, i32 0, !dbg !45 + %1251 = xor i32 %1243, %947, !dbg !34 + %1252 = xor i32 %1244, %948, !dbg !34 + %1253 = xor i32 %1245, %949, !dbg !34 + %1254 = xor i32 %1246, %950, !dbg !34 + %1255 = xor i32 %1247, %951, !dbg !34 + %1256 = xor i32 %1248, %952, !dbg !34 + %1257 = xor i32 %1249, %953, !dbg !34 + %1258 = xor i32 %1250, %954, !dbg !34 + %1259 = icmp slt i32 %1227, %1231, !dbg !36 + %1260 = icmp slt i32 %1228, %1232, !dbg !36 + %1261 = icmp slt i32 %1229, %1233, !dbg !36 + %1262 = icmp slt i32 %1230, %1234, !dbg !36 + %1263 = icmp eq i32 %1227, %1231, !dbg !37 + %1264 = icmp eq i32 %1228, %1232, !dbg !37 + %1265 = icmp eq i32 %1229, %1233, !dbg !37 + %1266 = icmp eq i32 %1230, %1234, !dbg !37 + %1267 = icmp sgt i32 %1251, %1255, !dbg !38 + %1268 = icmp sgt i32 %1252, %1256, !dbg !38 + %1269 = icmp sgt i32 %1253, %1257, !dbg !38 + %1270 = icmp sgt i32 %1254, %1258, !dbg !38 + %1271 = and i1 %1263, %1267, !dbg !39 + %1272 = and i1 %1264, %1268, !dbg !39 + %1273 = and i1 %1265, %1269, !dbg !39 + %1274 = and i1 %1266, %1270, !dbg !39 + %1275 = or i1 %1259, %1271, !dbg !40 + %1276 = or i1 %1260, %1272, !dbg !40 + %1277 = or i1 %1261, %1273, !dbg !40 + %1278 = or i1 %1262, %1274, !dbg !40 + %1279 = xor i32 %1231, %1227, !dbg !42 + %1280 = xor i32 %1232, %1228, !dbg !42 + %1281 = xor i32 %1233, %1229, !dbg !42 + %1282 = xor i32 %1234, %1230, !dbg !42 + %1283 = select i1 %1275, i32 %1279, i32 0, !dbg !43 + %1284 = select i1 %1276, i32 %1280, i32 0, !dbg !43 + %1285 = select i1 %1277, i32 %1281, i32 0, !dbg !43 + %1286 = select i1 %1278, i32 %1282, i32 0, !dbg !43 + %1287 = xor i32 %1283, %1227, !dbg !33 + %1288 = xor i32 %1284, %1228, !dbg !33 + %1289 = xor i32 %1285, %1229, !dbg !33 + %1290 = xor i32 %1286, %1230, !dbg !33 + %1291 = xor i32 %1283, %1231, !dbg !33 + %1292 = xor i32 %1284, %1232, !dbg !33 + %1293 = xor i32 %1285, %1233, !dbg !33 + %1294 = xor i32 %1286, %1234, !dbg !33 + %1295 = xor i32 %1255, %1251, !dbg !44 + %1296 = xor i32 %1256, %1252, !dbg !44 + %1297 = xor i32 %1257, %1253, !dbg !44 + %1298 = xor i32 %1258, %1254, !dbg !44 + %1299 = select i1 %1275, i32 %1295, i32 0, !dbg !45 + %1300 = select i1 %1276, i32 %1296, i32 0, !dbg !45 + %1301 = select i1 %1277, i32 %1297, i32 0, !dbg !45 + %1302 = select i1 %1278, i32 %1298, i32 0, !dbg !45 + %1303 = xor i32 %1299, %1251, !dbg !34 + %1304 = xor i32 %1300, %1252, !dbg !34 + %1305 = xor i32 %1301, %1253, !dbg !34 + %1306 = xor i32 %1302, %1254, !dbg !34 + %1307 = xor i32 %1299, %1255, !dbg !34 + %1308 = xor i32 %1300, %1256, !dbg !34 + %1309 = xor i32 %1301, %1257, !dbg !34 + %1310 = xor i32 %1302, %1258, !dbg !34 + %1311 = icmp slt i32 %1287, %1289, !dbg !36 + %1312 = icmp slt i32 %1288, %1290, !dbg !36 + %1313 = icmp slt i32 %1291, %1293, !dbg !36 + %1314 = icmp slt i32 %1292, %1294, !dbg !36 + %1315 = icmp eq i32 %1287, %1289, !dbg !37 + %1316 = icmp eq i32 %1288, %1290, !dbg !37 + %1317 = icmp eq i32 %1291, %1293, !dbg !37 + %1318 = icmp eq i32 %1292, %1294, !dbg !37 + %1319 = icmp sgt i32 %1303, %1305, !dbg !38 + %1320 = icmp sgt i32 %1304, %1306, !dbg !38 + %1321 = icmp sgt i32 %1307, %1309, !dbg !38 + %1322 = icmp sgt i32 %1308, %1310, !dbg !38 + %1323 = and i1 %1315, %1319, !dbg !39 + %1324 = and i1 %1316, %1320, !dbg !39 + %1325 = and i1 %1317, %1321, !dbg !39 + %1326 = and i1 %1318, %1322, !dbg !39 + %1327 = or i1 %1311, %1323, !dbg !40 + %1328 = or i1 %1312, %1324, !dbg !40 + %1329 = or i1 %1313, %1325, !dbg !40 + %1330 = or i1 %1314, %1326, !dbg !40 + %1331 = xor i32 %1289, %1287, !dbg !42 + %1332 = xor i32 %1290, %1288, !dbg !42 + %1333 = xor i32 %1293, %1291, !dbg !42 + %1334 = xor i32 %1294, %1292, !dbg !42 + %1335 = select i1 %1327, i32 %1331, i32 0, !dbg !43 + %1336 = select i1 %1328, i32 %1332, i32 0, !dbg !43 + %1337 = select i1 %1329, i32 %1333, i32 0, !dbg !43 + %1338 = select i1 %1330, i32 %1334, i32 0, !dbg !43 + %1339 = xor i32 %1335, %1287, !dbg !33 + %1340 = xor i32 %1336, %1288, !dbg !33 + %1341 = xor i32 %1335, %1289, !dbg !33 + %1342 = xor i32 %1336, %1290, !dbg !33 + %1343 = xor i32 %1337, %1291, !dbg !33 + %1344 = xor i32 %1338, %1292, !dbg !33 + %1345 = xor i32 %1337, %1293, !dbg !33 + %1346 = xor i32 %1338, %1294, !dbg !33 + %1347 = xor i32 %1305, %1303, !dbg !44 + %1348 = xor i32 %1306, %1304, !dbg !44 + %1349 = xor i32 %1309, %1307, !dbg !44 + %1350 = xor i32 %1310, %1308, !dbg !44 + %1351 = select i1 %1327, i32 %1347, i32 0, !dbg !45 + %1352 = select i1 %1328, i32 %1348, i32 0, !dbg !45 + %1353 = select i1 %1329, i32 %1349, i32 0, !dbg !45 + %1354 = select i1 %1330, i32 %1350, i32 0, !dbg !45 + %1355 = xor i32 %1351, %1303, !dbg !34 + %1356 = xor i32 %1352, %1304, !dbg !34 + %1357 = xor i32 %1351, %1305, !dbg !34 + %1358 = xor i32 %1352, %1306, !dbg !34 + %1359 = xor i32 %1353, %1307, !dbg !34 + %1360 = xor i32 %1354, %1308, !dbg !34 + %1361 = xor i32 %1353, %1309, !dbg !34 + %1362 = xor i32 %1354, %1310, !dbg !34 + %1363 = icmp slt i32 %1339, %1340, !dbg !36 + %1364 = icmp slt i32 %1341, %1342, !dbg !36 + %1365 = icmp slt i32 %1343, %1344, !dbg !36 + %1366 = icmp slt i32 %1345, %1346, !dbg !36 + %1367 = icmp eq i32 %1339, %1340, !dbg !37 + %1368 = icmp eq i32 %1341, %1342, !dbg !37 + %1369 = icmp eq i32 %1343, %1344, !dbg !37 + %1370 = icmp eq i32 %1345, %1346, !dbg !37 + %1371 = icmp sgt i32 %1355, %1356, !dbg !38 + %1372 = icmp sgt i32 %1357, %1358, !dbg !38 + %1373 = icmp sgt i32 %1359, %1360, !dbg !38 + %1374 = icmp sgt i32 %1361, %1362, !dbg !38 + %1375 = and i1 %1367, %1371, !dbg !39 + %1376 = and i1 %1368, %1372, !dbg !39 + %1377 = and i1 %1369, %1373, !dbg !39 + %1378 = and i1 %1370, %1374, !dbg !39 + %1379 = or i1 %1363, %1375, !dbg !40 + %1380 = or i1 %1364, %1376, !dbg !40 + %1381 = or i1 %1365, %1377, !dbg !40 + %1382 = or i1 %1366, %1378, !dbg !40 + %1383 = xor i32 %1340, %1339, !dbg !42 + %1384 = xor i32 %1342, %1341, !dbg !42 + %1385 = xor i32 %1344, %1343, !dbg !42 + %1386 = xor i32 %1346, %1345, !dbg !42 + %1387 = select i1 %1379, i32 %1383, i32 0, !dbg !43 + %1388 = select i1 %1380, i32 %1384, i32 0, !dbg !43 + %1389 = select i1 %1381, i32 %1385, i32 0, !dbg !43 + %1390 = select i1 %1382, i32 %1386, i32 0, !dbg !43 + %1391 = xor i32 %1387, %1339, !dbg !33 + %1392 = xor i32 %1387, %1340, !dbg !33 + %1393 = xor i32 %1388, %1341, !dbg !33 + %1394 = xor i32 %1388, %1342, !dbg !33 + %1395 = xor i32 %1389, %1343, !dbg !33 + %1396 = xor i32 %1389, %1344, !dbg !33 + %1397 = xor i32 %1390, %1345, !dbg !33 + %1398 = xor i32 %1390, %1346, !dbg !33 + %1399 = xor i32 %1356, %1355, !dbg !44 + %1400 = xor i32 %1358, %1357, !dbg !44 + %1401 = xor i32 %1360, %1359, !dbg !44 + %1402 = xor i32 %1362, %1361, !dbg !44 + %1403 = select i1 %1379, i32 %1399, i32 0, !dbg !45 + %1404 = select i1 %1380, i32 %1400, i32 0, !dbg !45 + %1405 = select i1 %1381, i32 %1401, i32 0, !dbg !45 + %1406 = select i1 %1382, i32 %1402, i32 0, !dbg !45 + %1407 = xor i32 %1403, %1355, !dbg !34 + %1408 = xor i32 %1403, %1356, !dbg !34 + %1409 = xor i32 %1404, %1357, !dbg !34 + %1410 = xor i32 %1404, %1358, !dbg !34 + %1411 = xor i32 %1405, %1359, !dbg !34 + %1412 = xor i32 %1405, %1360, !dbg !34 + %1413 = xor i32 %1406, %1361, !dbg !34 + %1414 = xor i32 %1406, %1362, !dbg !34 + %1415 = mul nuw nsw i32 %1391, %96, !dbg !30 + %1416 = mul nuw nsw i32 %1392, %96, !dbg !30 + %1417 = mul nuw nsw i32 %1393, %96, !dbg !30 + %1418 = mul nuw nsw i32 %1394, %96, !dbg !30 + %1419 = mul nuw nsw i32 %1395, %96, !dbg !30 + %1420 = mul nuw nsw i32 %1396, %96, !dbg !30 + %1421 = mul nuw nsw i32 %1397, %96, !dbg !30 + %1422 = mul nuw nsw i32 %1398, %96, !dbg !30 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1423 = insertelement <1 x i32> poison, i32 %1415, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1423, i1 true) #5, !dbg !21 + %1424 = insertelement <1 x i32> poison, i32 %1416, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1424, i1 true) #5, !dbg !21 + %1425 = insertelement <1 x i32> poison, i32 %1417, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1425, i1 true) #5, !dbg !21 + %1426 = insertelement <1 x i32> poison, i32 %1418, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1426, i1 true) #5, !dbg !21 + %1427 = insertelement <1 x i32> poison, i32 %1419, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1427, i1 true) #5, !dbg !21 + %1428 = insertelement <1 x i32> poison, i32 %1420, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1428, i1 true) #5, !dbg !21 + %1429 = insertelement <1 x i32> poison, i32 %1421, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1429, i1 true) #5, !dbg !21 + %1430 = insertelement <1 x i32> poison, i32 %1422, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1430, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1431 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1431, i32 1, i32 31), !dbg !21 + %1433 = add i32 %1432, %1431, !dbg !35 + %1434 = insertelement <1 x i32> poison, i32 %1433, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1434, i1 %50) #5, !dbg !21 + %1435 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1436 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1435, i32 1, i32 31), !dbg !21 + %1437 = add i32 %1436, %1435, !dbg !35 + %1438 = insertelement <1 x i32> poison, i32 %1437, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1438, i1 %50) #5, !dbg !21 + %1439 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1440 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1439, i32 1, i32 31), !dbg !21 + %1441 = add i32 %1440, %1439, !dbg !35 + %1442 = insertelement <1 x i32> poison, i32 %1441, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1442, i1 %50) #5, !dbg !21 + %1443 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1443, i32 1, i32 31), !dbg !21 + %1445 = add i32 %1444, %1443, !dbg !35 + %1446 = insertelement <1 x i32> poison, i32 %1445, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1446, i1 %50) #5, !dbg !21 + %1447 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1448 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1447, i32 1, i32 31), !dbg !21 + %1449 = add i32 %1448, %1447, !dbg !35 + %1450 = insertelement <1 x i32> poison, i32 %1449, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1450, i1 %50) #5, !dbg !21 + %1451 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1452 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1451, i32 1, i32 31), !dbg !21 + %1453 = add i32 %1452, %1451, !dbg !35 + %1454 = insertelement <1 x i32> poison, i32 %1453, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1454, i1 %50) #5, !dbg !21 + %1455 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1455, i32 1, i32 31), !dbg !21 + %1457 = add i32 %1456, %1455, !dbg !35 + %1458 = insertelement <1 x i32> poison, i32 %1457, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1458, i1 %50) #5, !dbg !21 + %1459 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1459, i32 1, i32 31), !dbg !21 + %1461 = add i32 %1460, %1459, !dbg !35 + %1462 = insertelement <1 x i32> poison, i32 %1461, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1462, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1463 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1464 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1465 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1466 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1467 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1468 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1469 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1470 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1471 = mul nuw nsw i32 %1391, %.lobit, !dbg !31 + %1472 = mul nuw nsw i32 %1392, %.lobit, !dbg !31 + %1473 = mul nuw nsw i32 %1393, %.lobit, !dbg !31 + %1474 = mul nuw nsw i32 %1394, %.lobit, !dbg !31 + %1475 = mul nuw nsw i32 %1395, %.lobit, !dbg !31 + %1476 = mul nuw nsw i32 %1396, %.lobit, !dbg !31 + %1477 = mul nuw nsw i32 %1397, %.lobit, !dbg !31 + %1478 = mul nuw nsw i32 %1398, %.lobit, !dbg !31 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1479 = insertelement <1 x i32> poison, i32 %1471, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1479, i1 true) #5, !dbg !21 + %1480 = insertelement <1 x i32> poison, i32 %1472, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1480, i1 true) #5, !dbg !21 + %1481 = insertelement <1 x i32> poison, i32 %1473, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1481, i1 true) #5, !dbg !21 + %1482 = insertelement <1 x i32> poison, i32 %1474, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1482, i1 true) #5, !dbg !21 + %1483 = insertelement <1 x i32> poison, i32 %1475, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1483, i1 true) #5, !dbg !21 + %1484 = insertelement <1 x i32> poison, i32 %1476, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1484, i1 true) #5, !dbg !21 + %1485 = insertelement <1 x i32> poison, i32 %1477, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1485, i1 true) #5, !dbg !21 + %1486 = insertelement <1 x i32> poison, i32 %1478, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1486, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1487 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1487, i32 1, i32 31), !dbg !21 + %1489 = add i32 %1488, %1487, !dbg !35 + %1490 = insertelement <1 x i32> poison, i32 %1489, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1490, i1 %50) #5, !dbg !21 + %1491 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1491, i32 1, i32 31), !dbg !21 + %1493 = add i32 %1492, %1491, !dbg !35 + %1494 = insertelement <1 x i32> poison, i32 %1493, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1494, i1 %50) #5, !dbg !21 + %1495 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1495, i32 1, i32 31), !dbg !21 + %1497 = add i32 %1496, %1495, !dbg !35 + %1498 = insertelement <1 x i32> poison, i32 %1497, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1498, i1 %50) #5, !dbg !21 + %1499 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1499, i32 1, i32 31), !dbg !21 + %1501 = add i32 %1500, %1499, !dbg !35 + %1502 = insertelement <1 x i32> poison, i32 %1501, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1502, i1 %50) #5, !dbg !21 + %1503 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1503, i32 1, i32 31), !dbg !21 + %1505 = add i32 %1504, %1503, !dbg !35 + %1506 = insertelement <1 x i32> poison, i32 %1505, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1506, i1 %50) #5, !dbg !21 + %1507 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1507, i32 1, i32 31), !dbg !21 + %1509 = add i32 %1508, %1507, !dbg !35 + %1510 = insertelement <1 x i32> poison, i32 %1509, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1510, i1 %50) #5, !dbg !21 + %1511 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1511, i32 1, i32 31), !dbg !21 + %1513 = add i32 %1512, %1511, !dbg !35 + %1514 = insertelement <1 x i32> poison, i32 %1513, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1514, i1 %50) #5, !dbg !21 + %1515 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1515, i32 1, i32 31), !dbg !21 + %1517 = add i32 %1516, %1515, !dbg !35 + %1518 = insertelement <1 x i32> poison, i32 %1517, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1518, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1519 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1520 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1521 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1522 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1523 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1524 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1525 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1526 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1527 = mul nuw nsw i32 %1407, %96, !dbg !32 + %1528 = mul nuw nsw i32 %1408, %96, !dbg !32 + %1529 = mul nuw nsw i32 %1409, %96, !dbg !32 + %1530 = mul nuw nsw i32 %1410, %96, !dbg !32 + %1531 = mul nuw nsw i32 %1411, %96, !dbg !32 + %1532 = mul nuw nsw i32 %1412, %96, !dbg !32 + %1533 = mul nuw nsw i32 %1413, %96, !dbg !32 + %1534 = mul nuw nsw i32 %1414, %96, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1535 = insertelement <1 x i32> poison, i32 %1527, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1535, i1 true) #5, !dbg !21 + %1536 = insertelement <1 x i32> poison, i32 %1528, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1536, i1 true) #5, !dbg !21 + %1537 = insertelement <1 x i32> poison, i32 %1529, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1537, i1 true) #5, !dbg !21 + %1538 = insertelement <1 x i32> poison, i32 %1530, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1538, i1 true) #5, !dbg !21 + %1539 = insertelement <1 x i32> poison, i32 %1531, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1539, i1 true) #5, !dbg !21 + %1540 = insertelement <1 x i32> poison, i32 %1532, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1540, i1 true) #5, !dbg !21 + %1541 = insertelement <1 x i32> poison, i32 %1533, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1541, i1 true) #5, !dbg !21 + %1542 = insertelement <1 x i32> poison, i32 %1534, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1542, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1543 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1544 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1543, i32 1, i32 31), !dbg !21 + %1545 = add i32 %1544, %1543, !dbg !35 + %1546 = insertelement <1 x i32> poison, i32 %1545, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1546, i1 %50) #5, !dbg !21 + %1547 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1547, i32 1, i32 31), !dbg !21 + %1549 = add i32 %1548, %1547, !dbg !35 + %1550 = insertelement <1 x i32> poison, i32 %1549, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1550, i1 %50) #5, !dbg !21 + %1551 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1551, i32 1, i32 31), !dbg !21 + %1553 = add i32 %1552, %1551, !dbg !35 + %1554 = insertelement <1 x i32> poison, i32 %1553, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1554, i1 %50) #5, !dbg !21 + %1555 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1556 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1555, i32 1, i32 31), !dbg !21 + %1557 = add i32 %1556, %1555, !dbg !35 + %1558 = insertelement <1 x i32> poison, i32 %1557, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1558, i1 %50) #5, !dbg !21 + %1559 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1559, i32 1, i32 31), !dbg !21 + %1561 = add i32 %1560, %1559, !dbg !35 + %1562 = insertelement <1 x i32> poison, i32 %1561, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1562, i1 %50) #5, !dbg !21 + %1563 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1564 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1563, i32 1, i32 31), !dbg !21 + %1565 = add i32 %1564, %1563, !dbg !35 + %1566 = insertelement <1 x i32> poison, i32 %1565, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1566, i1 %50) #5, !dbg !21 + %1567 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1568 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1567, i32 1, i32 31), !dbg !21 + %1569 = add i32 %1568, %1567, !dbg !35 + %1570 = insertelement <1 x i32> poison, i32 %1569, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1570, i1 %50) #5, !dbg !21 + %1571 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1572 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1571, i32 1, i32 31), !dbg !21 + %1573 = add i32 %1572, %1571, !dbg !35 + %1574 = insertelement <1 x i32> poison, i32 %1573, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1574, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1575 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1576 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1577 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1578 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1579 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1580 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1581 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1582 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1583 = mul nuw nsw i32 %1407, %.lobit, !dbg !25 + %1584 = mul nuw nsw i32 %1408, %.lobit, !dbg !25 + %1585 = mul nuw nsw i32 %1409, %.lobit, !dbg !25 + %1586 = mul nuw nsw i32 %1410, %.lobit, !dbg !25 + %1587 = mul nuw nsw i32 %1411, %.lobit, !dbg !25 + %1588 = mul nuw nsw i32 %1412, %.lobit, !dbg !25 + %1589 = mul nuw nsw i32 %1413, %.lobit, !dbg !25 + %1590 = mul nuw nsw i32 %1414, %.lobit, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1591 = insertelement <1 x i32> poison, i32 %1583, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %101, <1 x i32> %1591, i1 true) #5, !dbg !21 + %1592 = insertelement <1 x i32> poison, i32 %1584, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1592, i1 true) #5, !dbg !21 + %1593 = insertelement <1 x i32> poison, i32 %1585, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, <1 x i32> %1593, i1 true) #5, !dbg !21 + %1594 = insertelement <1 x i32> poison, i32 %1586, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, <1 x i32> %1594, i1 true) #5, !dbg !21 + %1595 = insertelement <1 x i32> poison, i32 %1587, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %109, <1 x i32> %1595, i1 true) #5, !dbg !21 + %1596 = insertelement <1 x i32> poison, i32 %1588, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, <1 x i32> %1596, i1 true) #5, !dbg !21 + %1597 = insertelement <1 x i32> poison, i32 %1589, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %111, <1 x i32> %1597, i1 true) #5, !dbg !21 + %1598 = insertelement <1 x i32> poison, i32 %1590, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %112, <1 x i32> %1598, i1 true) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1599 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %48, i1 true) #5, !dbg !21 + %1600 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1599, i32 1, i32 31), !dbg !21 + %1601 = add i32 %1600, %1599, !dbg !35 + %1602 = insertelement <1 x i32> poison, i32 %1601, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %48, <1 x i32> %1602, i1 %50) #5, !dbg !21 + %1603 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %51, i1 true) #5, !dbg !21 + %1604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1603, i32 1, i32 31), !dbg !21 + %1605 = add i32 %1604, %1603, !dbg !35 + %1606 = insertelement <1 x i32> poison, i32 %1605, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %51, <1 x i32> %1606, i1 %50) #5, !dbg !21 + %1607 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %52, i1 true) #5, !dbg !21 + %1608 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1607, i32 1, i32 31), !dbg !21 + %1609 = add i32 %1608, %1607, !dbg !35 + %1610 = insertelement <1 x i32> poison, i32 %1609, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %52, <1 x i32> %1610, i1 %50) #5, !dbg !21 + %1611 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %53, i1 true) #5, !dbg !21 + %1612 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1611, i32 1, i32 31), !dbg !21 + %1613 = add i32 %1612, %1611, !dbg !35 + %1614 = insertelement <1 x i32> poison, i32 %1613, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %53, <1 x i32> %1614, i1 %50) #5, !dbg !21 + %1615 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %54, i1 true) #5, !dbg !21 + %1616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1615, i32 1, i32 31), !dbg !21 + %1617 = add i32 %1616, %1615, !dbg !35 + %1618 = insertelement <1 x i32> poison, i32 %1617, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %54, <1 x i32> %1618, i1 %50) #5, !dbg !21 + %1619 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %55, i1 true) #5, !dbg !21 + %1620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1619, i32 1, i32 31), !dbg !21 + %1621 = add i32 %1620, %1619, !dbg !35 + %1622 = insertelement <1 x i32> poison, i32 %1621, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %55, <1 x i32> %1622, i1 %50) #5, !dbg !21 + %1623 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %56, i1 true) #5, !dbg !21 + %1624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1623, i32 1, i32 31), !dbg !21 + %1625 = add i32 %1624, %1623, !dbg !35 + %1626 = insertelement <1 x i32> poison, i32 %1625, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %56, <1 x i32> %1626, i1 %50) #5, !dbg !21 + %1627 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %57, i1 true) #5, !dbg !21 + %1628 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1627, i32 1, i32 31), !dbg !21 + %1629 = add i32 %1628, %1627, !dbg !35 + %1630 = insertelement <1 x i32> poison, i32 %1629, i64 0, !dbg !21 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %57, <1 x i32> %1630, i1 %50) #5, !dbg !21 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !21 + %1631 = load i32, ptr addrspace(3) %40, align 16, !dbg !21 + %1632 = load i32, ptr addrspace(3) %41, align 8, !dbg !21 + %1633 = load i32, ptr addrspace(3) %42, align 16, !dbg !21 + %1634 = load i32, ptr addrspace(3) %43, align 8, !dbg !21 + %1635 = load i32, ptr addrspace(3) %44, align 16, !dbg !21 + %1636 = load i32, ptr addrspace(3) %45, align 8, !dbg !21 + %1637 = load i32, ptr addrspace(3) %46, align 16, !dbg !21 + %1638 = load i32, ptr addrspace(3) %47, align 8, !dbg !21 + %1639 = insertelement <4 x i32> poison, i32 %1463, i64 0, !dbg !36 + %1640 = insertelement <4 x i32> %1639, i32 %1465, i64 1, !dbg !36 + %1641 = insertelement <4 x i32> %1640, i32 %1467, i64 2, !dbg !36 + %1642 = insertelement <4 x i32> %1641, i32 %1469, i64 3, !dbg !36 + %1643 = insertelement <4 x i32> poison, i32 %1519, i64 0, !dbg !36 + %1644 = insertelement <4 x i32> %1643, i32 %1521, i64 1, !dbg !36 + %1645 = insertelement <4 x i32> %1644, i32 %1523, i64 2, !dbg !36 + %1646 = insertelement <4 x i32> %1645, i32 %1525, i64 3, !dbg !36 + %1647 = icmp slt <4 x i32> %1642, %1646, !dbg !36 + %1648 = insertelement <4 x i32> poison, i32 %1464, i64 0, !dbg !36 + %1649 = insertelement <4 x i32> %1648, i32 %1466, i64 1, !dbg !36 + %1650 = insertelement <4 x i32> %1649, i32 %1468, i64 2, !dbg !36 + %1651 = insertelement <4 x i32> %1650, i32 %1470, i64 3, !dbg !36 + %1652 = insertelement <4 x i32> poison, i32 %1520, i64 0, !dbg !36 + %1653 = insertelement <4 x i32> %1652, i32 %1522, i64 1, !dbg !36 + %1654 = insertelement <4 x i32> %1653, i32 %1524, i64 2, !dbg !36 + %1655 = insertelement <4 x i32> %1654, i32 %1526, i64 3, !dbg !36 + %1656 = icmp slt <4 x i32> %1651, %1655, !dbg !36 + %1657 = icmp eq <4 x i32> %1642, %1646, !dbg !37 + %1658 = icmp eq <4 x i32> %1651, %1655, !dbg !37 + %1659 = insertelement <4 x i32> poison, i32 %1575, i64 0, !dbg !38 + %1660 = insertelement <4 x i32> %1659, i32 %1577, i64 1, !dbg !38 + %1661 = insertelement <4 x i32> %1660, i32 %1579, i64 2, !dbg !38 + %1662 = insertelement <4 x i32> %1661, i32 %1581, i64 3, !dbg !38 + %1663 = insertelement <4 x i32> poison, i32 %1631, i64 0, !dbg !38 + %1664 = insertelement <4 x i32> %1663, i32 %1633, i64 1, !dbg !38 + %1665 = insertelement <4 x i32> %1664, i32 %1635, i64 2, !dbg !38 + %1666 = insertelement <4 x i32> %1665, i32 %1637, i64 3, !dbg !38 + %1667 = icmp sgt <4 x i32> %1662, %1666, !dbg !38 + %1668 = insertelement <4 x i32> poison, i32 %1576, i64 0, !dbg !38 + %1669 = insertelement <4 x i32> %1668, i32 %1578, i64 1, !dbg !38 + %1670 = insertelement <4 x i32> %1669, i32 %1580, i64 2, !dbg !38 + %1671 = insertelement <4 x i32> %1670, i32 %1582, i64 3, !dbg !38 + %1672 = insertelement <4 x i32> poison, i32 %1632, i64 0, !dbg !38 + %1673 = insertelement <4 x i32> %1672, i32 %1634, i64 1, !dbg !38 + %1674 = insertelement <4 x i32> %1673, i32 %1636, i64 2, !dbg !38 + %1675 = insertelement <4 x i32> %1674, i32 %1638, i64 3, !dbg !38 + %1676 = icmp sgt <4 x i32> %1671, %1675, !dbg !38 + %1677 = and <4 x i1> %1657, %1667, !dbg !39 + %1678 = and <4 x i1> %1658, %1676, !dbg !39 + %1679 = or <4 x i1> %1647, %1677, !dbg !40 + %1680 = or <4 x i1> %1656, %1678, !dbg !40 + %1681 = xor <4 x i32> %1666, %1662, !dbg !44 + %1682 = xor <4 x i32> %1675, %1671, !dbg !44 + %1683 = select <4 x i1> %1679, <4 x i32> %1681, <4 x i32> zeroinitializer, !dbg !45 + %1684 = insertelement <4 x i32> poison, i32 %1407, i64 0, !dbg !34 + %1685 = insertelement <4 x i32> %1684, i32 %1409, i64 1, !dbg !34 + %1686 = insertelement <4 x i32> %1685, i32 %1411, i64 2, !dbg !34 + %1687 = insertelement <4 x i32> %1686, i32 %1413, i64 3, !dbg !34 + %1688 = xor <4 x i32> %1683, %1687, !dbg !34 + %1689 = select <4 x i1> %1680, <4 x i32> %1682, <4 x i32> zeroinitializer, !dbg !45 + %1690 = insertelement <4 x i32> poison, i32 %1408, i64 0, !dbg !34 + %1691 = insertelement <4 x i32> %1690, i32 %1410, i64 1, !dbg !34 + %1692 = insertelement <4 x i32> %1691, i32 %1412, i64 2, !dbg !34 + %1693 = insertelement <4 x i32> %1692, i32 %1414, i64 3, !dbg !34 + %1694 = xor <4 x i32> %1689, %1693, !dbg !34 + %1695 = insertelement <8 x i1> poison, i1 %18, i64 0, !dbg !46 + %1696 = shufflevector <8 x i1> %1695, <8 x i1> poison, <8 x i32> zeroinitializer, !dbg !46 + %1697 = insertelement <8 x i32> poison, i32 %90, i64 0, !dbg !46 + %1698 = insertelement <8 x i32> %1697, i32 %88, i64 1, !dbg !46 + %1699 = insertelement <8 x i32> %1698, i32 %92, i64 2, !dbg !46 + %1700 = insertelement <8 x i32> %1699, i32 %94, i64 3, !dbg !46 + %1701 = insertelement <8 x i32> %1700, i32 %174, i64 4, !dbg !46 + %1702 = insertelement <8 x i32> %1701, i32 %176, i64 5, !dbg !46 + %1703 = insertelement <8 x i32> %1702, i32 %178, i64 6, !dbg !46 + %1704 = insertelement <8 x i32> %1703, i32 %180, i64 7, !dbg !46 + %1705 = sext <8 x i32> %1704 to <8 x i64>, !dbg !46 + %1706 = select <8 x i1> %1696, <8 x i64> %1705, <8 x i64> zeroinitializer, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !47 + %1707 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1706), !dbg !49 + %1708 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %39, !dbg !47 + %1709 = getelementptr i64, ptr addrspace(3) %1708, i32 %.lobit, !dbg !47 + %1710 = insertelement <1 x i64> poison, i64 %1707, i64 0, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1709, <1 x i64> %1710, i1 true) #5, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !47 + %1711 = icmp samesign ult i32 %11, 256, !dbg !47 + %1712 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !47 + %1713 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %1712, i1 %1711) #5, !dbg !47 + %extelt.offset = lshr i64 %1713, 32, !dbg !47 + %1714 = trunc nuw i64 %extelt.offset to i32, !dbg !47 + %1715 = trunc i64 %1713 to i32, !dbg !47 + %1716 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1715, i32 1, i32 31), !dbg !47 + %1717 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1714, i32 1, i32 31), !dbg !47 + %1718 = insertelement <2 x i32> poison, i32 %1716, i64 0, !dbg !47 + %1719 = insertelement <2 x i32> %1718, i32 %1717, i64 1, !dbg !47 + %1720 = bitcast <2 x i32> %1719 to i64, !dbg !47 + %1721 = add i64 %1713, %1720, !dbg !49 + %1722 = and i32 %11, 769, !dbg !47 + %1723 = icmp eq i32 %1722, 0, !dbg !47 + %1724 = insertelement <1 x i64> poison, i64 %1721, i64 0, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1712, <1 x i64> %1724, i1 %1723) #5, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !47 + %1725 = load i64, ptr addrspace(3) %1708, align 16, !dbg !47 + %1726 = trunc i64 %1725 to i32, !dbg !50 + %1727 = icmp slt i64 %3, 2, !dbg !51 + %1728 = icmp sgt i64 %3, 1, !dbg !52 + %1729 = select i1 %1728, i64 %3, i64 0, !dbg !53 + %1730 = zext i1 %1727 to i64, !dbg !54 + %1731 = add i64 %1729, %1730, !dbg !55 + %1732 = shl i64 %1731, 4, !dbg !16 + %1733 = mul i64 %1732, %29, !dbg !56 + %1734 = mul i64 %1732, %31, !dbg !56 + %.idx = shl nsw i64 %.decomposed90, 6, !dbg !57 + %1735 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx, !dbg !57 + %1736 = getelementptr i32, ptr addrspace(1) %1735, i64 %33, !dbg !57 + %1737 = getelementptr i32, ptr addrspace(1) %1736, i64 %1733, !dbg !57 + %.idx80 = shl nsw i64 %.decomposed92, 6, !dbg !57 + %1738 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx80, !dbg !57 + %1739 = getelementptr i32, ptr addrspace(1) %1738, i64 %33, !dbg !57 + %1740 = getelementptr i32, ptr addrspace(1) %1739, i64 %1734, !dbg !57 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %1741 = shl nuw nsw i32 %11, 4, !dbg !58 + %1742 = and i32 %1741, 4080, !dbg !58 + %1743 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1742, !dbg !58 + store <4 x i32> %1688, ptr addrspace(3) %1743, align 16, !dbg !58 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %1744 = and i32 %1741, 112, !dbg !58 + %1745 = and i32 %22, 896, !dbg !58 + %1746 = shl nuw nsw i32 %11, 8, !dbg !58 + %1747 = and i32 %1746, 2048, !dbg !58 + %1748 = shl nuw nsw i32 %11, 6, !dbg !58 + %1749 = and i32 %1748, 1024, !dbg !58 + %1750 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1744, !dbg !58 + %1751 = getelementptr inbounds nuw i8, ptr addrspace(3) %1750, i32 %1747, !dbg !58 + %1752 = getelementptr inbounds nuw i8, ptr addrspace(3) %1751, i32 %1749, !dbg !58 + %1753 = getelementptr inbounds nuw i8, ptr addrspace(3) %1752, i32 %1745, !dbg !58 + %1754 = ptrtoint ptr addrspace(3) %1753 to i32, !dbg !58 + %1755 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1754) #5, !dbg !58 + %1756 = extractvalue { i32, i32, i32, i32 } %1755, 0, !dbg !58 + %1757 = extractvalue { i32, i32, i32, i32 } %1755, 1, !dbg !58 + %1758 = extractvalue { i32, i32, i32, i32 } %1755, 2, !dbg !58 + %1759 = extractvalue { i32, i32, i32, i32 } %1755, 3, !dbg !58 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + store <4 x i32> %1694, ptr addrspace(3) %1743, align 16, !dbg !58 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !58 + %1760 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1754) #5, !dbg !58 + %1761 = extractvalue { i32, i32, i32, i32 } %1760, 0, !dbg !58 + %1762 = extractvalue { i32, i32, i32, i32 } %1760, 1, !dbg !58 + %1763 = extractvalue { i32, i32, i32, i32 } %1760, 2, !dbg !58 + %1764 = extractvalue { i32, i32, i32, i32 } %1760, 3, !dbg !58 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1756, i32 %1757, i32 %1761, i32 %1762, ptr addrspace(1) %1737, i1 %19) #5, !dbg !58 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1758, i32 %1759, i32 %1763, i32 %1764, ptr addrspace(1) %1740, i1 %20) #5, !dbg !58 + %1765 = mul i64 %27, %1731, !dbg !59 + %1766 = getelementptr i32, ptr addrspace(1) %2, i64 %.decomposed, !dbg !60 + %1767 = getelementptr i32, ptr addrspace(1) %1766, i64 %1765, !dbg !60 + %1768 = and i32 %11, 128, !dbg !61 + %1769 = icmp eq i32 %1768, 0, !dbg !61 + %1770 = and i1 %1769, %18, !dbg !61 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1726, ptr addrspace(1) %1767, i1 %1770) #5, !dbg !61 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 25, column: 21, scope: !4) +!12 = !DILocation(line: 26, column: 38, scope: !4) +!13 = !DILocation(line: 32, column: 19, scope: !4) +!14 = !DILocation(line: 33, column: 19, scope: !4) +!15 = !DILocation(line: 35, column: 37, scope: !4) +!16 = !DILocation(line: 35, column: 45, scope: !4) +!17 = !DILocation(line: 35, column: 64, scope: !4) +!18 = !DILocation(line: 35, column: 68, scope: !4) +!19 = !DILocation(line: 35, column: 30, scope: !4) +!20 = !DILocation(line: 35, column: 73, scope: !4) +!21 = !DILocation(line: 291, column: 36, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!24 = !DILocation(line: 40, column: 67, scope: !4) +!25 = !DILocation(line: 551, column: 23, scope: !26, inlinedAt: !24) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!28 = !DILocation(line: 35, column: 54, scope: !4) +!29 = !DILocation(line: 537, column: 21, scope: !26, inlinedAt: !24) +!30 = !DILocation(line: 538, column: 40, scope: !26, inlinedAt: !24) +!31 = !DILocation(line: 539, column: 41, scope: !26, inlinedAt: !24) +!32 = !DILocation(line: 548, column: 23, scope: !26, inlinedAt: !24) +!33 = !DILocation(line: 600, column: 15, scope: !26, inlinedAt: !24) +!34 = !DILocation(line: 601, column: 22, scope: !26, inlinedAt: !24) +!35 = !DILocation(line: 261, column: 15, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 574, column: 22, scope: !26, inlinedAt: !24) +!37 = !DILocation(line: 591, column: 21, scope: !26, inlinedAt: !24) +!38 = !DILocation(line: 594, column: 40, scope: !26, inlinedAt: !24) +!39 = !DILocation(line: 594, column: 29, scope: !26, inlinedAt: !24) +!40 = !DILocation(line: 594, column: 23, scope: !26, inlinedAt: !24) +!41 = !DILocation(line: 599, column: 19, scope: !26, inlinedAt: !24) +!42 = !DILocation(line: 600, column: 38, scope: !26, inlinedAt: !24) +!43 = !DILocation(line: 600, column: 46, scope: !26, inlinedAt: !24) +!44 = !DILocation(line: 601, column: 48, scope: !26, inlinedAt: !24) +!45 = !DILocation(line: 601, column: 59, scope: !26, inlinedAt: !24) +!46 = !DILocation(line: 43, column: 34, scope: !4) +!47 = !DILocation(line: 291, column: 36, scope: !22, inlinedAt: !48) +!48 = !DILocation(line: 44, column: 26, scope: !4) +!49 = !DILocation(line: 261, column: 15, scope: !22, inlinedAt: !48) +!50 = !DILocation(line: 47, column: 21, scope: !4) +!51 = !DILocation(line: 48, column: 62, scope: !4) +!52 = !DILocation(line: 48, column: 88, scope: !4) +!53 = !DILocation(line: 48, column: 79, scope: !4) +!54 = !DILocation(line: 48, scope: !4) +!55 = !DILocation(line: 48, column: 70, scope: !4) +!56 = !DILocation(line: 48, column: 47, scope: !4) +!57 = !DILocation(line: 48, column: 25, scope: !4) +!58 = !DILocation(line: 48, column: 102, scope: !4) +!59 = !DILocation(line: 49, column: 34, scope: !4) +!60 = !DILocation(line: 49, column: 25, scope: !4) +!61 = !DILocation(line: 49, column: 89, scope: !4) +!62 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6595da22e644f930f3200854bf7a46d6ba5259f8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,3476 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u64 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_7 +) +.reqntid 256 +{ + .reg .pred %p<670>; + .reg .b32 %r<1495>; + .reg .b64 %rd<160>; + .loc 1 18 0 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:18:0 + +// %bb.0: + ld.param.b64 %rd19, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3]; +$L__tmp0: + .loc 1 23 28 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:23:28 + mov.u32 %r7, %ctaid.x; + .loc 1 23 33 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:23:33 + shl.b32 %r8, %r7, 7; + .loc 1 24 44 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:24:44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 127; + bfe.u32 %r9, %r1, 2, 6; + .loc 1 24 23 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:24:23 + or.b32 %r10, %r8, %r2; + or.b32 %r11, %r9, %r8; + or.b32 %r12, %r11, 64; + .loc 1 32 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:32:19 + cvt.s64.s32 %rd1, %r10; + cvt.s64.s32 %rd2, %r11; + .loc 1 33 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:33:19 + or.b64 %rd21, %rd1, %rd19; + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p1, %rd22, 0; + cvt.u32.u64 %r1494, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd157, %rd1, %rd19; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd19; + div.u32 %r15, %r1494, %r13; + cvt.u64.u32 %rd157, %r15; +$L__BB0_3: + .loc 1 0 0 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:0 + shl.b32 %r4, %r1, 2; + .loc 1 32 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:32:19 + cvt.s64.s32 %rd3, %r12; + mul.lo.s64 %rd23, %rd157, %rd19; + .loc 1 33 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:33:19 + or.b64 %rd24, %rd2, %rd19; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p2, %rd25, 0; + cvt.u32.u64 %r1493, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd158, %rd2, %rd19; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r16, %rd19; + div.u32 %r18, %r1493, %r16; + cvt.u64.u32 %rd158, %r18; +$L__BB0_6: + .loc 1 0 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:0:19 + ld.param.b32 %r6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4]; + ld.param.b64 %rd18, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + ld.param.b64 %rd17, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; + ld.param.b64 %rd16, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + shr.u32 %r3, %r1, 7; + and.b32 %r5, %r4, 12; + .loc 1 32 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:32:19 + sub.s64 %rd8, %rd1, %rd23; + mul.lo.s64 %rd26, %rd158, %rd19; + sub.s64 %rd12, %rd2, %rd26; + .loc 1 33 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:33:19 + or.b64 %rd27, %rd3, %rd19; + and.b64 %rd28, %rd27, -4294967296; + setp.ne.b64 %p3, %rd28, 0; + cvt.u32.u64 %r1492, %rd3; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd159, %rd3, %rd19; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r19, %rd19; + div.u32 %r21, %r1492, %r19; + cvt.u64.u32 %rd159, %r21; +$L__BB0_9: + .loc 1 25 21 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:25:21 + setp.lt.s32 %p400, %r1492, %r6; + setp.lt.s32 %p399, %r1493, %r6; + setp.lt.s32 %p4, %r1494, %r6; + .loc 1 32 19 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:32:19 + mul.lo.s64 %rd59, %rd159, %rd19; + sub.s64 %rd60, %rd3, %rd59; + .loc 1 35 68 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:68 + mul.lo.s64 %rd61, %rd19, %rd157; + .loc 1 35 30 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:30 + shl.b64 %rd62, %rd8, 2; + add.s64 %rd63, %rd16, %rd62; + .loc 1 35 73 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:73 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + shl.b32 %r823, %r2, 4; + shl.b32 %r824, %r2, 6; + mov.b32 %r825, global_smem; + add.s32 %r826, %r825, %r824; + add.s32 %r47, %r825, %r4; + and.b32 %r828, %r1, 1; + setp.ne.b32 %p402, %r828, 0; + not.pred %p21, %p402; + add.s32 %r51, %r47, 1024; + add.s32 %r55, %r47, 2048; + add.s32 %r59, %r47, 3072; + add.s32 %r63, %r47, 4096; + add.s32 %r67, %r47, 5120; + add.s32 %r71, %r47, 6144; + add.s32 %r75, %r47, 7168; +$L__tmp2: + .loc 1 26 38 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:26:38 + and.b32 %r175, %r3, 1; + or.b32 %r829, %r175, 4; + or.b32 %r830, %r3, 6; + or.b32 %r831, %r175, 2; + .loc 1 35 37 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:37 + cvt.u64.u32 %rd64, %r175; +$L__tmp3: + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + cvt.u64.u32 %rd65, %r831; +$L__tmp4: + .loc 1 35 37 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:37 + cvt.u64.u32 %rd66, %r829; + cvt.u64.u32 %rd67, %r830; + .loc 1 35 54 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:54 + mul.lo.s64 %rd68, %rd19, %rd64; + mul.lo.s64 %rd69, %rd19, %rd65; + mul.lo.s64 %rd70, %rd19, %rd66; + mul.lo.s64 %rd71, %rd19, %rd67; + .loc 1 35 30 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:30 + mad.wide.u32 %rd72, %r175, 4, %rd63; + shl.b64 %rd73, %rd157, 6; + add.s64 %rd74, %rd72, %rd73; + shl.b64 %rd75, %rd68, 2; + add.s64 %rd76, %rd74, %rd75; + shl.b64 %rd77, %rd61, 6; + add.s64 %rd30, %rd76, %rd77; + mad.wide.u32 %rd78, %r831, 4, %rd63; + add.s64 %rd79, %rd78, %rd73; + shl.b64 %rd80, %rd69, 2; + add.s64 %rd81, %rd79, %rd80; + add.s64 %rd33, %rd81, %rd77; + shl.b64 %rd82, %rd70, 2; + add.s64 %rd83, %rd74, %rd82; + add.s64 %rd84, %rd83, %rd77; + add.s64 %rd36, %rd84, 16; + mad.wide.u32 %rd85, %r830, 4, %rd63; + add.s64 %rd86, %rd85, %rd73; + shl.b64 %rd87, %rd71, 2; + add.s64 %rd88, %rd86, %rd87; + add.s64 %rd39, %rd88, %rd77; + .loc 1 35 73 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:73 + // begin inline asm + mov.u32 %r22, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r22 }, [ %rd30 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd33 + 0 ], %rd34; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r24, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r24 }, [ %rd36 + 0 ], %rd37; + // end inline asm + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd39 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm +$L__tmp5: + .loc 3 537 21 // triton_helpers.py:537:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r832, %r175, 1; + .loc 3 538 40 // triton_helpers.py:538:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r31, %r22, %r832; + mul.lo.s32 %r33, %r23, %r832; + mul.lo.s32 %r35, %r24, %r832; + mul.lo.s32 %r37, %r25, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + shl.b32 %r833, %r175, 2; + add.s32 %r30, %r826, %r833; + add.s32 %r32, %r30, 8; + add.s32 %r34, %r30, 16; + add.s32 %r36, %r30, 24; + add.s32 %r38, %r30, 32; + add.s32 %r40, %r30, 40; + add.s32 %r42, %r30, 48; + add.s32 %r44, %r30, 56; + .loc 3 539 41 // triton_helpers.py:539:41 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r79, %r22, %r175; + mul.lo.s32 %r81, %r23, %r175; + mul.lo.s32 %r83, %r24, %r175; + mul.lo.s32 %r85, %r25, %r175; + .loc 3 548 23 // triton_helpers.py:548:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r127, %r832, %r175; + mul.lo.s32 %r129, %r831, %r832; + shl.b32 %r834, %r832, 2; + or.b32 %r131, %r127, %r834; + mul.lo.s32 %r133, %r832, %r830; + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r177, %r831, %r175; + mul.lo.s32 %r179, %r829, %r175; + mul.lo.s32 %r181, %r830, %r175; +$L__tmp6: + .loc 1 26 38 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:26:38 + or.b32 %r835, %r175, 10; + or.b32 %r836, %r3, 14; + .loc 1 35 37 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:37 + cvt.u64.u32 %rd89, %r836; + .loc 1 35 54 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:54 + mul.lo.s64 %rd90, %rd19, %rd89; + .loc 1 35 30 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:30 + mad.wide.u32 %rd91, %r836, 4, %rd63; + add.s64 %rd92, %rd91, %rd73; + shl.b64 %rd93, %rd90, 2; + add.s64 %rd94, %rd92, %rd93; + add.s64 %rd51, %rd94, %rd77; + .loc 1 26 38 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:26:38 + or.b32 %r837, %r175, 12; + or.b32 %r838, %r175, 8; +$L__tmp7: + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + cvt.u64.u32 %rd95, %r838; + cvt.u64.u32 %rd96, %r835; +$L__tmp8: + .loc 1 35 37 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:37 + cvt.u64.u32 %rd97, %r837; + .loc 1 35 54 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:54 + mul.lo.s64 %rd98, %rd19, %rd95; + mul.lo.s64 %rd99, %rd19, %rd96; + mul.lo.s64 %rd100, %rd19, %rd97; + .loc 1 35 30 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:30 + mad.wide.u32 %rd101, %r838, 4, %rd63; + add.s64 %rd102, %rd101, %rd73; + shl.b64 %rd103, %rd98, 2; + add.s64 %rd104, %rd102, %rd103; + add.s64 %rd42, %rd104, %rd77; + mad.wide.u32 %rd105, %r835, 4, %rd63; + add.s64 %rd106, %rd105, %rd73; + shl.b64 %rd107, %rd99, 2; + add.s64 %rd108, %rd106, %rd107; + add.s64 %rd45, %rd108, %rd77; + mad.wide.u32 %rd109, %r837, 4, %rd63; + add.s64 %rd110, %rd109, %rd73; + shl.b64 %rd111, %rd100, 2; + add.s64 %rd112, %rd110, %rd111; + add.s64 %rd48, %rd112, %rd77; + .loc 1 35 73 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:73 + // begin inline asm + mov.u32 %r26, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r26 }, [ %rd42 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r27 }, [ %rd45 + 0 ], %rd46; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r28, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r28 }, [ %rd48 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r29, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r29 }, [ %rd51 + 0 ], %rd52; + // end inline asm +$L__tmp9: + .loc 3 538 40 // triton_helpers.py:538:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r39, %r26, %r832; + mul.lo.s32 %r41, %r27, %r832; + mul.lo.s32 %r43, %r28, %r832; + mul.lo.s32 %r45, %r29, %r832; + mov.pred %p12, -1; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r31; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r33; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r35; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r37; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r39; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r41; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r43; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r45; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r46, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r839, %r46, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r49, %r839, %r46; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r49; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r50, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r840, %r50, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r53, %r840, %r50; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r53; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r54, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r841, %r54, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r57, %r841, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r57; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r58, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r842, %r58, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r61, %r842, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r61; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r62, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r843, %r62, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r65, %r843, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r65; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r66, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r844, %r66, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r69, %r844, %r66; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r69; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r70, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r845, %r70, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r73, %r845, %r70; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r73; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r74, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r846, %r74, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r77, %r846, %r74; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r77; + // end inline asm + bar.sync 0; + ld.shared.b32 %r847, [%r826]; + ld.shared.b32 %r848, [%r826+8]; + ld.shared.b32 %r849, [%r826+16]; + ld.shared.b32 %r850, [%r826+24]; + ld.shared.b32 %r851, [%r826+32]; + ld.shared.b32 %r852, [%r826+40]; + ld.shared.b32 %r853, [%r826+48]; + ld.shared.b32 %r854, [%r826+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r87, %r26, %r175; + mul.lo.s32 %r89, %r27, %r175; + mul.lo.s32 %r91, %r28, %r175; + mul.lo.s32 %r93, %r29, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r79; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r81; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r83; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r85; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r87; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r89; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r91; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r93; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r94, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r855, %r94, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r97, %r855, %r94; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r97; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r98, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r856, %r98, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r101, %r856, %r98; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r101; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r102, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r857, %r102, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r105, %r857, %r102; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r105; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r106, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r858, %r106, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r109, %r858, %r106; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r109; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r110, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r859, %r110, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r113, %r859, %r110; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r113; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r114, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r860, %r114, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r117, %r860, %r114; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r117; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r118, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r861, %r118, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r121, %r861, %r118; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r121; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r122, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r862, %r122, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r125, %r862, %r122; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r125; + // end inline asm + bar.sync 0; + ld.shared.b32 %r863, [%r826]; + ld.shared.b32 %r864, [%r826+8]; + ld.shared.b32 %r865, [%r826+16]; + ld.shared.b32 %r866, [%r826+24]; + ld.shared.b32 %r867, [%r826+32]; + ld.shared.b32 %r868, [%r826+40]; + ld.shared.b32 %r869, [%r826+48]; + ld.shared.b32 %r870, [%r826+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r135, %r838, %r832; + mul.lo.s32 %r137, %r835, %r832; + mul.lo.s32 %r139, %r837, %r832; + mul.lo.s32 %r141, %r832, %r836; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r127; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r129; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r131; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r133; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r135; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r137; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r139; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r141; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r142, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r871, %r142, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r145, %r871, %r142; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r145; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r146, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r872, %r146, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r149, %r872, %r146; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r149; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r150, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r873, %r150, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r153, %r873, %r150; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r153; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r154, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r874, %r154, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r157, %r874, %r154; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r157; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r158, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r875, %r158, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r161, %r875, %r158; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r161; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r162, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r876, %r162, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r165, %r876, %r162; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r165; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r166, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r877, %r166, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r169, %r877, %r166; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r169; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r170, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r878, %r170, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r173, %r878, %r170; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r173; + // end inline asm + bar.sync 0; + ld.shared.b32 %r879, [%r826]; + ld.shared.b32 %r880, [%r826+8]; + ld.shared.b32 %r881, [%r826+16]; + ld.shared.b32 %r882, [%r826+24]; + ld.shared.b32 %r883, [%r826+32]; + ld.shared.b32 %r884, [%r826+40]; + ld.shared.b32 %r885, [%r826+48]; + ld.shared.b32 %r886, [%r826+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r183, %r838, %r175; + mul.lo.s32 %r185, %r835, %r175; + mul.lo.s32 %r187, %r837, %r175; + mul.lo.s32 %r189, %r836, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r175; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r177; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r179; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r181; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r183; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r185; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r187; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r189; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r190, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r887, %r190, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r193, %r887, %r190; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r193; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r194, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r888, %r194, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r197, %r888, %r194; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r197; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r198, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r889, %r198, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r201, %r889, %r198; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r201; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r202, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r890, %r202, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r205, %r890, %r202; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r205; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r206, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r891, %r206, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r209, %r891, %r206; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r209; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r210, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r892, %r210, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r213, %r892, %r210; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r213; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r214, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r893, %r214, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r217, %r893, %r214; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r217; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r218, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r894, %r218, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r221, %r894, %r218; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r221; + // end inline asm + bar.sync 0; + ld.shared.b32 %r895, [%r826]; + ld.shared.b32 %r896, [%r826+8]; + ld.shared.b32 %r897, [%r826+16]; + ld.shared.b32 %r898, [%r826+24]; + ld.shared.b32 %r899, [%r826+32]; + ld.shared.b32 %r900, [%r826+40]; + ld.shared.b32 %r901, [%r826+48]; + ld.shared.b32 %r902, [%r826+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p403, %r849, %r865; + setp.lt.s32 %p404, %r847, %r863; + setp.ge.s32 %p405, %r850, %r866; + setp.ge.s32 %p406, %r848, %r864; + setp.lt.s32 %p407, %r853, %r869; + setp.lt.s32 %p408, %r851, %r867; + setp.ge.s32 %p409, %r854, %r870; + setp.ge.s32 %p410, %r852, %r868; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p411, %r847, %r863; + setp.eq.b32 %p412, %r849, %r865; + setp.ne.b32 %p413, %r848, %r864; + setp.ne.b32 %p414, %r850, %r866; + setp.eq.b32 %p415, %r851, %r867; + setp.eq.b32 %p416, %r853, %r869; + setp.ne.b32 %p417, %r852, %r868; + setp.ne.b32 %p418, %r854, %r870; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p419, %r879, %r895; + setp.gt.s32 %p420, %r881, %r897; + setp.le.s32 %p421, %r880, %r896; + setp.le.s32 %p422, %r882, %r898; + setp.gt.s32 %p423, %r883, %r899; + setp.gt.s32 %p424, %r885, %r901; + setp.le.s32 %p425, %r884, %r900; + setp.le.s32 %p426, %r886, %r902; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p427, %p412, %p420; + and.pred %p428, %p411, %p419; + or.pred %p429, %p414, %p422; + or.pred %p430, %p413, %p421; + and.pred %p431, %p416, %p424; + and.pred %p432, %p415, %p423; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p433, %p418, %p426; + or.pred %p434, %p417, %p425; + or.pred %p435, %p404, %p428; + or.pred %p436, %p403, %p427; + and.pred %p437, %p406, %p430; + and.pred %p438, %p405, %p429; + or.pred %p439, %p408, %p432; + or.pred %p440, %p407, %p431; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p441, %p410, %p434; + and.pred %p442, %p409, %p433; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r903, %r863, %r847; + xor.b32 %r904, %r865, %r849; + xor.b32 %r905, %r864, %r848; + xor.b32 %r906, %r866, %r850; + xor.b32 %r907, %r867, %r851; + xor.b32 %r908, %r869, %r853; + xor.b32 %r909, %r868, %r852; + xor.b32 %r910, %r870, %r854; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r911, %r904, 0, %p436; + selp.b32 %r912, %r903, 0, %p435; + selp.b32 %r913, %r906, 0, %p438; + selp.b32 %r914, %r905, 0, %p437; + selp.b32 %r915, %r908, 0, %p440; + selp.b32 %r916, %r907, 0, %p439; + selp.b32 %r917, %r910, 0, %p442; + selp.b32 %r918, %r909, 0, %p441; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r919, %r912, %r22; + xor.b32 %r920, %r911, %r24; + xor.b32 %r921, %r914, %r23; + xor.b32 %r922, %r913, %r25; + xor.b32 %r923, %r916, %r26; + xor.b32 %r924, %r915, %r28; + xor.b32 %r925, %r918, %r27; + xor.b32 %r926, %r917, %r29; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r927, %r895, %r879; + xor.b32 %r928, %r897, %r881; + xor.b32 %r929, %r896, %r880; + xor.b32 %r930, %r898, %r882; + xor.b32 %r931, %r899, %r883; + xor.b32 %r932, %r901, %r885; + xor.b32 %r933, %r900, %r884; + xor.b32 %r934, %r902, %r886; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r935, %r928, 0, %p436; + selp.b32 %r936, %r927, 0, %p435; + selp.b32 %r937, %r930, 0, %p438; + selp.b32 %r938, %r929, 0, %p437; + selp.b32 %r939, %r932, 0, %p440; + selp.b32 %r940, %r931, 0, %p439; + selp.b32 %r941, %r934, 0, %p442; + selp.b32 %r942, %r933, 0, %p441; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r943, %r936, %r175; + xor.b32 %r944, %r935, %r829; + xor.b32 %r945, %r938, %r831; + xor.b32 %r946, %r937, %r830; + xor.b32 %r947, %r940, %r838; + xor.b32 %r948, %r939, %r837; + xor.b32 %r949, %r942, %r835; + xor.b32 %r950, %r941, %r836; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.ge.s32 %p443, %r920, %r922; + setp.lt.s32 %p444, %r919, %r921; + setp.ge.s32 %p445, %r924, %r926; + setp.lt.s32 %p446, %r923, %r925; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.ne.b32 %p447, %r920, %r922; + setp.eq.b32 %p448, %r919, %r921; + setp.ne.b32 %p449, %r924, %r926; + setp.eq.b32 %p450, %r923, %r925; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.le.s32 %p451, %r944, %r946; + setp.gt.s32 %p452, %r943, %r945; + setp.le.s32 %p453, %r948, %r950; + setp.gt.s32 %p454, %r947, %r949; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p455, %p447, %p451; + and.pred %p456, %p448, %p452; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p457, %p449, %p453; + and.pred %p458, %p450, %p454; + and.pred %p459, %p443, %p455; + or.pred %p460, %p444, %p456; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p461, %p445, %p457; + or.pred %p462, %p446, %p458; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r951, %r922, %r920; + xor.b32 %r952, %r921, %r919; + xor.b32 %r953, %r926, %r924; + xor.b32 %r954, %r925, %r923; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r955, %r952, 0, %p460; + selp.b32 %r956, %r951, 0, %p459; + selp.b32 %r957, %r954, 0, %p462; + selp.b32 %r958, %r953, 0, %p461; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r959, %r956, %r920; + xor.b32 %r960, %r955, %r919; + xor.b32 %r961, %r956, %r922; + xor.b32 %r962, %r955, %r921; + xor.b32 %r963, %r958, %r924; + xor.b32 %r964, %r957, %r923; + xor.b32 %r965, %r958, %r926; + xor.b32 %r966, %r957, %r925; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r967, %r945, %r943; + xor.b32 %r968, %r946, %r944; + xor.b32 %r969, %r949, %r947; + xor.b32 %r970, %r950, %r948; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r971, %r967, 0, %p460; + selp.b32 %r972, %r968, 0, %p459; + selp.b32 %r973, %r969, 0, %p462; + selp.b32 %r974, %r970, 0, %p461; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r975, %r971, %r943; + xor.b32 %r976, %r971, %r945; + xor.b32 %r977, %r972, %r944; + xor.b32 %r978, %r972, %r946; + xor.b32 %r979, %r973, %r947; + xor.b32 %r980, %r973, %r949; + xor.b32 %r981, %r974, %r948; + xor.b32 %r982, %r974, %r950; + .loc 3 538 40 // triton_helpers.py:538:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r223, %r960, %r832; + mul.lo.s32 %r225, %r962, %r832; + mul.lo.s32 %r227, %r959, %r832; + mul.lo.s32 %r229, %r961, %r832; + mul.lo.s32 %r231, %r964, %r832; + mul.lo.s32 %r233, %r966, %r832; + mul.lo.s32 %r235, %r963, %r832; + mul.lo.s32 %r237, %r965, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r223; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r225; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r227; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r229; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r231; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r233; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r235; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r237; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r238, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r983, %r238, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r241, %r983, %r238; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r241; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r242, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r984, %r242, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r245, %r984, %r242; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r245; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r246, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r985, %r246, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r249, %r985, %r246; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r249; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r250, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r986, %r250, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r253, %r986, %r250; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r253; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r254, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r987, %r254, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r257, %r987, %r254; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r257; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r258, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r988, %r258, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r261, %r988, %r258; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r261; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r262, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r989, %r262, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r265, %r989, %r262; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r265; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r266, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r990, %r266, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r269, %r990, %r266; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r269; + // end inline asm + bar.sync 0; + ld.shared.b32 %r991, [%r826]; + ld.shared.b32 %r992, [%r826+8]; + ld.shared.b32 %r993, [%r826+16]; + ld.shared.b32 %r994, [%r826+24]; + ld.shared.b32 %r995, [%r826+32]; + ld.shared.b32 %r996, [%r826+40]; + ld.shared.b32 %r997, [%r826+48]; + ld.shared.b32 %r998, [%r826+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r271, %r960, %r175; + mul.lo.s32 %r273, %r962, %r175; + mul.lo.s32 %r275, %r959, %r175; + mul.lo.s32 %r277, %r961, %r175; + mul.lo.s32 %r279, %r964, %r175; + mul.lo.s32 %r281, %r966, %r175; + mul.lo.s32 %r283, %r963, %r175; + mul.lo.s32 %r285, %r965, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r271; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r273; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r275; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r277; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r279; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r281; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r283; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r285; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r286, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r999, %r286, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r289, %r999, %r286; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r289; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r290, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1000, %r290, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r293, %r1000, %r290; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r293; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r294, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1001, %r294, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r297, %r1001, %r294; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r297; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r298, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1002, %r298, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r301, %r1002, %r298; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r301; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r302, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1003, %r302, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r305, %r1003, %r302; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r305; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r306, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1004, %r306, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r309, %r1004, %r306; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r309; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r310, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1005, %r310, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r313, %r1005, %r310; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r313; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r314, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1006, %r314, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r317, %r1006, %r314; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r317; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1007, [%r826]; + ld.shared.b32 %r1008, [%r826+8]; + ld.shared.b32 %r1009, [%r826+16]; + ld.shared.b32 %r1010, [%r826+24]; + ld.shared.b32 %r1011, [%r826+32]; + ld.shared.b32 %r1012, [%r826+40]; + ld.shared.b32 %r1013, [%r826+48]; + ld.shared.b32 %r1014, [%r826+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r319, %r975, %r832; + mul.lo.s32 %r321, %r976, %r832; + mul.lo.s32 %r323, %r977, %r832; + mul.lo.s32 %r325, %r978, %r832; + mul.lo.s32 %r327, %r979, %r832; + mul.lo.s32 %r329, %r980, %r832; + mul.lo.s32 %r331, %r981, %r832; + mul.lo.s32 %r333, %r982, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r319; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r321; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r323; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r325; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r327; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r329; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r331; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r333; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r334, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1015, %r334, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r337, %r1015, %r334; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r337; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r338, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1016, %r338, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r341, %r1016, %r338; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r341; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r342, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1017, %r342, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r345, %r1017, %r342; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r345; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r346, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1018, %r346, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r349, %r1018, %r346; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r349; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r350, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1019, %r350, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r353, %r1019, %r350; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r353; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r354, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1020, %r354, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r357, %r1020, %r354; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r357; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r358, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1021, %r358, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r361, %r1021, %r358; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r361; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r362, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1022, %r362, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r365, %r1022, %r362; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r365; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1023, [%r826]; + ld.shared.b32 %r1024, [%r826+8]; + ld.shared.b32 %r1025, [%r826+16]; + ld.shared.b32 %r1026, [%r826+24]; + ld.shared.b32 %r1027, [%r826+32]; + ld.shared.b32 %r1028, [%r826+40]; + ld.shared.b32 %r1029, [%r826+48]; + ld.shared.b32 %r1030, [%r826+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r367, %r975, %r175; + mul.lo.s32 %r369, %r976, %r175; + mul.lo.s32 %r371, %r977, %r175; + mul.lo.s32 %r373, %r978, %r175; + mul.lo.s32 %r375, %r979, %r175; + mul.lo.s32 %r377, %r980, %r175; + mul.lo.s32 %r379, %r981, %r175; + mul.lo.s32 %r381, %r982, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r367; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r369; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r371; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r373; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r375; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r377; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r379; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r381; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r382, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1031, %r382, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r385, %r1031, %r382; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r385; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r386, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1032, %r386, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r389, %r1032, %r386; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r389; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r390, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1033, %r390, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r393, %r1033, %r390; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r393; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r394, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1034, %r394, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r397, %r1034, %r394; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r397; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r398, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1035, %r398, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r401, %r1035, %r398; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r401; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r402, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1036, %r402, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r405, %r1036, %r402; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r405; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r406, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1037, %r406, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r409, %r1037, %r406; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r409; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r410, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1038, %r410, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r413, %r1038, %r410; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r413; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1039, [%r826]; + ld.shared.b32 %r1040, [%r826+8]; + ld.shared.b32 %r1041, [%r826+16]; + ld.shared.b32 %r1042, [%r826+24]; + ld.shared.b32 %r1043, [%r826+32]; + ld.shared.b32 %r1044, [%r826+40]; + ld.shared.b32 %r1045, [%r826+48]; + ld.shared.b32 %r1046, [%r826+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p463, %r991, %r1007; + setp.lt.s32 %p464, %r992, %r1008; + setp.ge.s32 %p465, %r993, %r1009; + setp.ge.s32 %p466, %r994, %r1010; + setp.lt.s32 %p467, %r995, %r1011; + setp.lt.s32 %p468, %r996, %r1012; + setp.ge.s32 %p469, %r997, %r1013; + setp.ge.s32 %p470, %r998, %r1014; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p471, %r991, %r1007; + setp.eq.b32 %p472, %r992, %r1008; + setp.ne.b32 %p473, %r993, %r1009; + setp.ne.b32 %p474, %r994, %r1010; + setp.eq.b32 %p475, %r995, %r1011; + setp.eq.b32 %p476, %r996, %r1012; + setp.ne.b32 %p477, %r997, %r1013; + setp.ne.b32 %p478, %r998, %r1014; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p479, %r1023, %r1039; + setp.gt.s32 %p480, %r1024, %r1040; + setp.le.s32 %p481, %r1025, %r1041; + setp.le.s32 %p482, %r1026, %r1042; + setp.gt.s32 %p483, %r1027, %r1043; + setp.gt.s32 %p484, %r1028, %r1044; + setp.le.s32 %p485, %r1029, %r1045; + setp.le.s32 %p486, %r1030, %r1046; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p487, %p471, %p479; + or.pred %p488, %p473, %p481; + or.pred %p489, %p474, %p482; + and.pred %p490, %p472, %p480; + and.pred %p491, %p475, %p483; + or.pred %p492, %p477, %p485; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p493, %p478, %p486; + and.pred %p494, %p476, %p484; + or.pred %p495, %p463, %p487; + and.pred %p496, %p465, %p488; + and.pred %p497, %p466, %p489; + or.pred %p498, %p464, %p490; + or.pred %p499, %p467, %p491; + and.pred %p500, %p469, %p492; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p501, %p470, %p493; + or.pred %p502, %p468, %p494; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1047, %r1007, %r991; + xor.b32 %r1048, %r1009, %r993; + xor.b32 %r1049, %r1010, %r994; + xor.b32 %r1050, %r1008, %r992; + xor.b32 %r1051, %r1011, %r995; + xor.b32 %r1052, %r1013, %r997; + xor.b32 %r1053, %r1014, %r998; + xor.b32 %r1054, %r1012, %r996; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1055, %r1047, 0, %p495; + selp.b32 %r1056, %r1048, 0, %p496; + selp.b32 %r1057, %r1050, 0, %p498; + selp.b32 %r1058, %r1049, 0, %p497; + selp.b32 %r1059, %r1051, 0, %p499; + selp.b32 %r1060, %r1052, 0, %p500; + selp.b32 %r1061, %r1054, 0, %p502; + selp.b32 %r1062, %r1053, 0, %p501; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1063, %r1055, %r960; + xor.b32 %r1064, %r1056, %r959; + xor.b32 %r1065, %r1058, %r961; + xor.b32 %r1066, %r1057, %r962; + xor.b32 %r1067, %r1059, %r964; + xor.b32 %r1068, %r1060, %r963; + xor.b32 %r1069, %r1062, %r965; + xor.b32 %r1070, %r1061, %r966; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1071, %r1039, %r1023; + xor.b32 %r1072, %r1040, %r1024; + xor.b32 %r1073, %r1041, %r1025; + xor.b32 %r1074, %r1042, %r1026; + xor.b32 %r1075, %r1043, %r1027; + xor.b32 %r1076, %r1044, %r1028; + xor.b32 %r1077, %r1045, %r1029; + xor.b32 %r1078, %r1046, %r1030; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1079, %r1071, 0, %p495; + selp.b32 %r1080, %r1072, 0, %p498; + selp.b32 %r1081, %r1073, 0, %p496; + selp.b32 %r1082, %r1074, 0, %p497; + selp.b32 %r1083, %r1075, 0, %p499; + selp.b32 %r1084, %r1076, 0, %p502; + selp.b32 %r1085, %r1077, 0, %p500; + selp.b32 %r1086, %r1078, 0, %p501; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1087, %r1079, %r975; + xor.b32 %r1088, %r1080, %r976; + xor.b32 %r1089, %r1081, %r977; + xor.b32 %r1090, %r1082, %r978; + xor.b32 %r1091, %r1083, %r979; + xor.b32 %r1092, %r1084, %r980; + xor.b32 %r1093, %r1085, %r981; + xor.b32 %r1094, %r1086, %r982; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p503, %r1063, %r1064; + setp.lt.s32 %p504, %r1066, %r1065; + setp.ge.s32 %p505, %r1067, %r1068; + setp.ge.s32 %p506, %r1070, %r1069; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p507, %r1063, %r1064; + setp.eq.b32 %p508, %r1066, %r1065; + setp.ne.b32 %p509, %r1067, %r1068; + setp.ne.b32 %p510, %r1070, %r1069; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p511, %r1087, %r1089; + setp.gt.s32 %p512, %r1088, %r1090; + setp.le.s32 %p513, %r1091, %r1093; + setp.le.s32 %p514, %r1092, %r1094; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p515, %p507, %p511; + and.pred %p516, %p508, %p512; + or.pred %p517, %p509, %p513; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p518, %p510, %p514; + or.pred %p519, %p503, %p515; + or.pred %p520, %p504, %p516; + and.pred %p521, %p505, %p517; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p522, %p506, %p518; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1095, %r1064, %r1063; + xor.b32 %r1096, %r1065, %r1066; + xor.b32 %r1097, %r1068, %r1067; + xor.b32 %r1098, %r1069, %r1070; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1099, %r1095, 0, %p519; + selp.b32 %r1100, %r1096, 0, %p520; + selp.b32 %r1101, %r1097, 0, %p521; + selp.b32 %r1102, %r1098, 0, %p522; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1103, %r1099, %r1063; + xor.b32 %r1104, %r1100, %r1066; + xor.b32 %r1105, %r1099, %r1064; + xor.b32 %r1106, %r1100, %r1065; + xor.b32 %r1107, %r1101, %r1067; + xor.b32 %r1108, %r1102, %r1070; + xor.b32 %r1109, %r1101, %r1068; + xor.b32 %r1110, %r1102, %r1069; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1111, %r1089, %r1087; + xor.b32 %r1112, %r1090, %r1088; + xor.b32 %r1113, %r1093, %r1091; + xor.b32 %r1114, %r1094, %r1092; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1115, %r1111, 0, %p519; + selp.b32 %r1116, %r1112, 0, %p520; + selp.b32 %r1117, %r1113, 0, %p521; + selp.b32 %r1118, %r1114, 0, %p522; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1119, %r1115, %r1087; + xor.b32 %r1120, %r1116, %r1088; + xor.b32 %r1121, %r1115, %r1089; + xor.b32 %r1122, %r1116, %r1090; + xor.b32 %r1123, %r1117, %r1091; + xor.b32 %r1124, %r1118, %r1092; + xor.b32 %r1125, %r1117, %r1093; + xor.b32 %r1126, %r1118, %r1094; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p523, %r1103, %r1104; + setp.lt.s32 %p524, %r1105, %r1106; + setp.ge.s32 %p525, %r1107, %r1108; + setp.ge.s32 %p526, %r1109, %r1110; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p527, %r1103, %r1104; + setp.eq.b32 %p528, %r1105, %r1106; + setp.ne.b32 %p529, %r1107, %r1108; + setp.ne.b32 %p530, %r1109, %r1110; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p531, %r1119, %r1120; + setp.gt.s32 %p532, %r1121, %r1122; + setp.le.s32 %p533, %r1123, %r1124; + setp.le.s32 %p534, %r1125, %r1126; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p535, %p527, %p531; + and.pred %p536, %p528, %p532; + or.pred %p537, %p529, %p533; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p538, %p530, %p534; + or.pred %p539, %p523, %p535; + or.pred %p540, %p524, %p536; + and.pred %p541, %p525, %p537; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p542, %p526, %p538; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1127, %r1104, %r1103; + xor.b32 %r1128, %r1106, %r1105; + xor.b32 %r1129, %r1108, %r1107; + xor.b32 %r1130, %r1110, %r1109; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1131, %r1127, 0, %p539; + selp.b32 %r1132, %r1128, 0, %p540; + selp.b32 %r1133, %r1129, 0, %p541; + selp.b32 %r1134, %r1130, 0, %p542; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1135, %r1131, %r1103; + xor.b32 %r1136, %r1131, %r1104; + xor.b32 %r1137, %r1132, %r1105; + xor.b32 %r1138, %r1132, %r1106; + xor.b32 %r1139, %r1133, %r1107; + xor.b32 %r1140, %r1133, %r1108; + xor.b32 %r1141, %r1134, %r1109; + xor.b32 %r1142, %r1134, %r1110; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1143, %r1120, %r1119; + xor.b32 %r1144, %r1122, %r1121; + xor.b32 %r1145, %r1124, %r1123; + xor.b32 %r1146, %r1126, %r1125; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1147, %r1143, 0, %p539; + selp.b32 %r1148, %r1144, 0, %p540; + selp.b32 %r1149, %r1145, 0, %p541; + selp.b32 %r1150, %r1146, 0, %p542; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1151, %r1147, %r1119; + xor.b32 %r1152, %r1147, %r1120; + xor.b32 %r1153, %r1148, %r1121; + xor.b32 %r1154, %r1148, %r1122; + xor.b32 %r1155, %r1149, %r1123; + xor.b32 %r1156, %r1149, %r1124; + xor.b32 %r1157, %r1150, %r1125; + xor.b32 %r1158, %r1150, %r1126; + .loc 3 538 40 // triton_helpers.py:538:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r415, %r1135, %r832; + mul.lo.s32 %r417, %r1136, %r832; + mul.lo.s32 %r419, %r1137, %r832; + mul.lo.s32 %r421, %r1138, %r832; + mul.lo.s32 %r423, %r1139, %r832; + mul.lo.s32 %r425, %r1140, %r832; + mul.lo.s32 %r427, %r1141, %r832; + mul.lo.s32 %r429, %r1142, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r415; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r417; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r419; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r421; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r423; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r425; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r427; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r429; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r430, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1159, %r430, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r433, %r1159, %r430; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r433; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r434, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1160, %r434, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r437, %r1160, %r434; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r437; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r438, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1161, %r438, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r441, %r1161, %r438; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r441; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r442, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1162, %r442, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r445, %r1162, %r442; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r445; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r446, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1163, %r446, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r449, %r1163, %r446; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r449; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r450, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1164, %r450, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r453, %r1164, %r450; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r453; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r454, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1165, %r454, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r457, %r1165, %r454; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r457; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r458, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1166, %r458, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r461, %r1166, %r458; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r461; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1167, [%r826]; + ld.shared.b32 %r1168, [%r826+8]; + ld.shared.b32 %r1169, [%r826+16]; + ld.shared.b32 %r1170, [%r826+24]; + ld.shared.b32 %r1171, [%r826+32]; + ld.shared.b32 %r1172, [%r826+40]; + ld.shared.b32 %r1173, [%r826+48]; + ld.shared.b32 %r1174, [%r826+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r463, %r1135, %r175; + mul.lo.s32 %r465, %r1136, %r175; + mul.lo.s32 %r467, %r1137, %r175; + mul.lo.s32 %r469, %r1138, %r175; + mul.lo.s32 %r471, %r1139, %r175; + mul.lo.s32 %r473, %r1140, %r175; + mul.lo.s32 %r475, %r1141, %r175; + mul.lo.s32 %r477, %r1142, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r463; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r465; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r467; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r469; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r471; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r473; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r475; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r477; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r478, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1175, %r478, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r481, %r1175, %r478; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r481; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r482, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1176, %r482, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r485, %r1176, %r482; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r485; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r486, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1177, %r486, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r489, %r1177, %r486; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r489; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r490, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1178, %r490, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r493, %r1178, %r490; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r493; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r494, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1179, %r494, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r497, %r1179, %r494; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r497; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r498, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1180, %r498, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r501, %r1180, %r498; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r501; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r502, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1181, %r502, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r505, %r1181, %r502; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r505; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r506, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1182, %r506, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r509, %r1182, %r506; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r509; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1183, [%r826]; + ld.shared.b32 %r1184, [%r826+8]; + ld.shared.b32 %r1185, [%r826+16]; + ld.shared.b32 %r1186, [%r826+24]; + ld.shared.b32 %r1187, [%r826+32]; + ld.shared.b32 %r1188, [%r826+40]; + ld.shared.b32 %r1189, [%r826+48]; + ld.shared.b32 %r1190, [%r826+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r511, %r1151, %r832; + mul.lo.s32 %r513, %r1152, %r832; + mul.lo.s32 %r515, %r1153, %r832; + mul.lo.s32 %r517, %r1154, %r832; + mul.lo.s32 %r519, %r1155, %r832; + mul.lo.s32 %r521, %r1156, %r832; + mul.lo.s32 %r523, %r1157, %r832; + mul.lo.s32 %r525, %r1158, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r511; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r513; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r515; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r517; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r519; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r521; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r523; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r525; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r526, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1191, %r526, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r529, %r1191, %r526; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r529; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r530, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1192, %r530, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r533, %r1192, %r530; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r533; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r534, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1193, %r534, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r537, %r1193, %r534; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r537; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r538, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1194, %r538, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r541, %r1194, %r538; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r541; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r542, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1195, %r542, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r545, %r1195, %r542; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r545; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r546, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1196, %r546, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r549, %r1196, %r546; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r549; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r550, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1197, %r550, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r553, %r1197, %r550; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r553; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r554, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1198, %r554, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r557, %r1198, %r554; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r557; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1199, [%r826]; + ld.shared.b32 %r1200, [%r826+8]; + ld.shared.b32 %r1201, [%r826+16]; + ld.shared.b32 %r1202, [%r826+24]; + ld.shared.b32 %r1203, [%r826+32]; + ld.shared.b32 %r1204, [%r826+40]; + ld.shared.b32 %r1205, [%r826+48]; + ld.shared.b32 %r1206, [%r826+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r559, %r1151, %r175; + mul.lo.s32 %r561, %r1152, %r175; + mul.lo.s32 %r563, %r1153, %r175; + mul.lo.s32 %r565, %r1154, %r175; + mul.lo.s32 %r567, %r1155, %r175; + mul.lo.s32 %r569, %r1156, %r175; + mul.lo.s32 %r571, %r1157, %r175; + mul.lo.s32 %r573, %r1158, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r559; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r561; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r563; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r565; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r567; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r569; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r571; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r573; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r574, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1207, %r574, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r577, %r1207, %r574; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r577; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r578, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1208, %r578, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r581, %r1208, %r578; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r581; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r582, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1209, %r582, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r585, %r1209, %r582; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r585; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r586, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1210, %r586, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r589, %r1210, %r586; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r589; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r590, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1211, %r590, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r593, %r1211, %r590; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r593; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r594, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1212, %r594, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r597, %r1212, %r594; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r597; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r598, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1213, %r598, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r601, %r1213, %r598; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r601; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r602, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1214, %r602, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r605, %r1214, %r602; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r605; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1215, [%r826]; + ld.shared.b32 %r1216, [%r826+8]; + ld.shared.b32 %r1217, [%r826+16]; + ld.shared.b32 %r1218, [%r826+24]; + ld.shared.b32 %r1219, [%r826+32]; + ld.shared.b32 %r1220, [%r826+40]; + ld.shared.b32 %r1221, [%r826+48]; + ld.shared.b32 %r1222, [%r826+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p543, %r1167, %r1183; + setp.lt.s32 %p544, %r1168, %r1184; + setp.lt.s32 %p545, %r1169, %r1185; + setp.lt.s32 %p546, %r1170, %r1186; + setp.ge.s32 %p547, %r1171, %r1187; + setp.ge.s32 %p548, %r1172, %r1188; + setp.ge.s32 %p549, %r1173, %r1189; + setp.ge.s32 %p550, %r1174, %r1190; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p551, %r1167, %r1183; + setp.eq.b32 %p552, %r1168, %r1184; + setp.eq.b32 %p553, %r1169, %r1185; + setp.eq.b32 %p554, %r1170, %r1186; + setp.ne.b32 %p555, %r1171, %r1187; + setp.ne.b32 %p556, %r1172, %r1188; + setp.ne.b32 %p557, %r1173, %r1189; + setp.ne.b32 %p558, %r1174, %r1190; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p559, %r1199, %r1215; + setp.gt.s32 %p560, %r1200, %r1216; + setp.gt.s32 %p561, %r1201, %r1217; + setp.gt.s32 %p562, %r1202, %r1218; + setp.le.s32 %p563, %r1203, %r1219; + setp.le.s32 %p564, %r1204, %r1220; + setp.le.s32 %p565, %r1205, %r1221; + setp.le.s32 %p566, %r1206, %r1222; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p567, %p551, %p559; + and.pred %p568, %p552, %p560; + and.pred %p569, %p553, %p561; + and.pred %p570, %p554, %p562; + or.pred %p571, %p555, %p563; + or.pred %p572, %p556, %p564; + or.pred %p573, %p557, %p565; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p574, %p558, %p566; + or.pred %p575, %p543, %p567; + or.pred %p576, %p544, %p568; + or.pred %p577, %p545, %p569; + or.pred %p578, %p546, %p570; + and.pred %p579, %p547, %p571; + and.pred %p580, %p548, %p572; + and.pred %p581, %p549, %p573; + .loc 3 599 19 // triton_helpers.py:599:19 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p582, %p550, %p574; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1223, %r1183, %r1167; + xor.b32 %r1224, %r1184, %r1168; + xor.b32 %r1225, %r1185, %r1169; + xor.b32 %r1226, %r1186, %r1170; + xor.b32 %r1227, %r1187, %r1171; + xor.b32 %r1228, %r1188, %r1172; + xor.b32 %r1229, %r1189, %r1173; + xor.b32 %r1230, %r1190, %r1174; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1231, %r1223, 0, %p575; + selp.b32 %r1232, %r1224, 0, %p576; + selp.b32 %r1233, %r1225, 0, %p577; + selp.b32 %r1234, %r1226, 0, %p578; + selp.b32 %r1235, %r1227, 0, %p579; + selp.b32 %r1236, %r1228, 0, %p580; + selp.b32 %r1237, %r1229, 0, %p581; + selp.b32 %r1238, %r1230, 0, %p582; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1239, %r1231, %r1135; + xor.b32 %r1240, %r1232, %r1136; + xor.b32 %r1241, %r1233, %r1137; + xor.b32 %r1242, %r1234, %r1138; + xor.b32 %r1243, %r1235, %r1139; + xor.b32 %r1244, %r1236, %r1140; + xor.b32 %r1245, %r1237, %r1141; + xor.b32 %r1246, %r1238, %r1142; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1247, %r1215, %r1199; + xor.b32 %r1248, %r1216, %r1200; + xor.b32 %r1249, %r1217, %r1201; + xor.b32 %r1250, %r1218, %r1202; + xor.b32 %r1251, %r1219, %r1203; + xor.b32 %r1252, %r1220, %r1204; + xor.b32 %r1253, %r1221, %r1205; + xor.b32 %r1254, %r1222, %r1206; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1255, %r1247, 0, %p575; + selp.b32 %r1256, %r1248, 0, %p576; + selp.b32 %r1257, %r1249, 0, %p577; + selp.b32 %r1258, %r1250, 0, %p578; + selp.b32 %r1259, %r1251, 0, %p579; + selp.b32 %r1260, %r1252, 0, %p580; + selp.b32 %r1261, %r1253, 0, %p581; + selp.b32 %r1262, %r1254, 0, %p582; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1263, %r1255, %r1151; + xor.b32 %r1264, %r1256, %r1152; + xor.b32 %r1265, %r1257, %r1153; + xor.b32 %r1266, %r1258, %r1154; + xor.b32 %r1267, %r1259, %r1155; + xor.b32 %r1268, %r1260, %r1156; + xor.b32 %r1269, %r1261, %r1157; + xor.b32 %r1270, %r1262, %r1158; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p583, %r1239, %r1243; + setp.lt.s32 %p584, %r1240, %r1244; + setp.lt.s32 %p585, %r1241, %r1245; + setp.lt.s32 %p586, %r1242, %r1246; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p587, %r1239, %r1243; + setp.eq.b32 %p588, %r1240, %r1244; + setp.eq.b32 %p589, %r1241, %r1245; + setp.eq.b32 %p590, %r1242, %r1246; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p591, %r1263, %r1267; + setp.gt.s32 %p592, %r1264, %r1268; + setp.gt.s32 %p593, %r1265, %r1269; + setp.gt.s32 %p594, %r1266, %r1270; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p595, %p587, %p591; + and.pred %p596, %p588, %p592; + and.pred %p597, %p589, %p593; + and.pred %p598, %p590, %p594; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p599, %p583, %p595; + or.pred %p600, %p584, %p596; + or.pred %p601, %p585, %p597; + or.pred %p602, %p586, %p598; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1271, %r1243, %r1239; + xor.b32 %r1272, %r1244, %r1240; + xor.b32 %r1273, %r1245, %r1241; + xor.b32 %r1274, %r1246, %r1242; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1275, %r1271, 0, %p599; + selp.b32 %r1276, %r1272, 0, %p600; + selp.b32 %r1277, %r1273, 0, %p601; + selp.b32 %r1278, %r1274, 0, %p602; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1279, %r1275, %r1239; + xor.b32 %r1280, %r1276, %r1240; + xor.b32 %r1281, %r1277, %r1241; + xor.b32 %r1282, %r1278, %r1242; + xor.b32 %r1283, %r1275, %r1243; + xor.b32 %r1284, %r1276, %r1244; + xor.b32 %r1285, %r1277, %r1245; + xor.b32 %r1286, %r1278, %r1246; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1287, %r1267, %r1263; + xor.b32 %r1288, %r1268, %r1264; + xor.b32 %r1289, %r1269, %r1265; + xor.b32 %r1290, %r1270, %r1266; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1291, %r1287, 0, %p599; + selp.b32 %r1292, %r1288, 0, %p600; + selp.b32 %r1293, %r1289, 0, %p601; + selp.b32 %r1294, %r1290, 0, %p602; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1295, %r1291, %r1263; + xor.b32 %r1296, %r1292, %r1264; + xor.b32 %r1297, %r1293, %r1265; + xor.b32 %r1298, %r1294, %r1266; + xor.b32 %r1299, %r1291, %r1267; + xor.b32 %r1300, %r1292, %r1268; + xor.b32 %r1301, %r1293, %r1269; + xor.b32 %r1302, %r1294, %r1270; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p603, %r1279, %r1281; + setp.lt.s32 %p604, %r1280, %r1282; + setp.lt.s32 %p605, %r1283, %r1285; + setp.lt.s32 %p606, %r1284, %r1286; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p607, %r1279, %r1281; + setp.eq.b32 %p608, %r1280, %r1282; + setp.eq.b32 %p609, %r1283, %r1285; + setp.eq.b32 %p610, %r1284, %r1286; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p611, %r1295, %r1297; + setp.gt.s32 %p612, %r1296, %r1298; + setp.gt.s32 %p613, %r1299, %r1301; + setp.gt.s32 %p614, %r1300, %r1302; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p615, %p607, %p611; + and.pred %p616, %p608, %p612; + and.pred %p617, %p609, %p613; + and.pred %p618, %p610, %p614; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p619, %p603, %p615; + or.pred %p620, %p604, %p616; + or.pred %p621, %p605, %p617; + or.pred %p622, %p606, %p618; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1303, %r1281, %r1279; + xor.b32 %r1304, %r1282, %r1280; + xor.b32 %r1305, %r1285, %r1283; + xor.b32 %r1306, %r1286, %r1284; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1307, %r1303, 0, %p619; + selp.b32 %r1308, %r1304, 0, %p620; + selp.b32 %r1309, %r1305, 0, %p621; + selp.b32 %r1310, %r1306, 0, %p622; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1311, %r1307, %r1279; + xor.b32 %r1312, %r1308, %r1280; + xor.b32 %r1313, %r1307, %r1281; + xor.b32 %r1314, %r1308, %r1282; + xor.b32 %r1315, %r1309, %r1283; + xor.b32 %r1316, %r1310, %r1284; + xor.b32 %r1317, %r1309, %r1285; + xor.b32 %r1318, %r1310, %r1286; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1319, %r1297, %r1295; + xor.b32 %r1320, %r1298, %r1296; + xor.b32 %r1321, %r1301, %r1299; + xor.b32 %r1322, %r1302, %r1300; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1323, %r1319, 0, %p619; + selp.b32 %r1324, %r1320, 0, %p620; + selp.b32 %r1325, %r1321, 0, %p621; + selp.b32 %r1326, %r1322, 0, %p622; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1327, %r1323, %r1295; + xor.b32 %r1328, %r1324, %r1296; + xor.b32 %r1329, %r1323, %r1297; + xor.b32 %r1330, %r1324, %r1298; + xor.b32 %r1331, %r1325, %r1299; + xor.b32 %r1332, %r1326, %r1300; + xor.b32 %r1333, %r1325, %r1301; + xor.b32 %r1334, %r1326, %r1302; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p623, %r1311, %r1312; + setp.lt.s32 %p624, %r1313, %r1314; + setp.lt.s32 %p625, %r1315, %r1316; + setp.lt.s32 %p626, %r1317, %r1318; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p627, %r1311, %r1312; + setp.eq.b32 %p628, %r1313, %r1314; + setp.eq.b32 %p629, %r1315, %r1316; + setp.eq.b32 %p630, %r1317, %r1318; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p631, %r1327, %r1328; + setp.gt.s32 %p632, %r1329, %r1330; + setp.gt.s32 %p633, %r1331, %r1332; + setp.gt.s32 %p634, %r1333, %r1334; + .loc 3 594 29 // triton_helpers.py:594:29 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + and.pred %p635, %p627, %p631; + and.pred %p636, %p628, %p632; + and.pred %p637, %p629, %p633; + and.pred %p638, %p630, %p634; + .loc 3 594 23 // triton_helpers.py:594:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + or.pred %p639, %p623, %p635; + or.pred %p640, %p624, %p636; + or.pred %p641, %p625, %p637; + or.pred %p642, %p626, %p638; + .loc 3 600 38 // triton_helpers.py:600:38 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1335, %r1312, %r1311; + xor.b32 %r1336, %r1314, %r1313; + xor.b32 %r1337, %r1316, %r1315; + xor.b32 %r1338, %r1318, %r1317; + .loc 3 600 46 // triton_helpers.py:600:46 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1339, %r1335, 0, %p639; + selp.b32 %r1340, %r1336, 0, %p640; + selp.b32 %r1341, %r1337, 0, %p641; + selp.b32 %r1342, %r1338, 0, %p642; + .loc 3 600 15 // triton_helpers.py:600:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1343, %r1339, %r1311; + xor.b32 %r1344, %r1339, %r1312; + xor.b32 %r1345, %r1340, %r1313; + xor.b32 %r1346, %r1340, %r1314; + xor.b32 %r1347, %r1341, %r1315; + xor.b32 %r1348, %r1341, %r1316; + xor.b32 %r1349, %r1342, %r1317; + xor.b32 %r1350, %r1342, %r1318; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1351, %r1328, %r1327; + xor.b32 %r1352, %r1330, %r1329; + xor.b32 %r1353, %r1332, %r1331; + xor.b32 %r1354, %r1334, %r1333; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1355, %r1351, 0, %p639; + selp.b32 %r1356, %r1352, 0, %p640; + selp.b32 %r1357, %r1353, 0, %p641; + selp.b32 %r1358, %r1354, 0, %p642; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1359, %r1355, %r1327; + xor.b32 %r1360, %r1355, %r1328; + xor.b32 %r1361, %r1356, %r1329; + xor.b32 %r1362, %r1356, %r1330; + xor.b32 %r1363, %r1357, %r1331; + xor.b32 %r1364, %r1357, %r1332; + xor.b32 %r1365, %r1358, %r1333; + xor.b32 %r1366, %r1358, %r1334; + .loc 3 538 40 // triton_helpers.py:538:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r607, %r1343, %r832; + mul.lo.s32 %r609, %r1344, %r832; + mul.lo.s32 %r611, %r1345, %r832; + mul.lo.s32 %r613, %r1346, %r832; + mul.lo.s32 %r615, %r1347, %r832; + mul.lo.s32 %r617, %r1348, %r832; + mul.lo.s32 %r619, %r1349, %r832; + mul.lo.s32 %r621, %r1350, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r607; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r609; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r611; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r613; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r615; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r617; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r619; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r621; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r622, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1367, %r622, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r625, %r1367, %r622; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r625; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r626, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1368, %r626, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r629, %r1368, %r626; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r629; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r630, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1369, %r630, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r633, %r1369, %r630; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r633; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r634, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1370, %r634, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r637, %r1370, %r634; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r637; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r638, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1371, %r638, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r641, %r1371, %r638; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r641; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r642, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1372, %r642, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r645, %r1372, %r642; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r645; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r646, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1373, %r646, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r649, %r1373, %r646; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r649; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r650, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1374, %r650, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r653, %r1374, %r650; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r653; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1375, [%r826]; + ld.shared.b32 %r1376, [%r826+8]; + ld.shared.b32 %r1377, [%r826+16]; + ld.shared.b32 %r1378, [%r826+24]; + ld.shared.b32 %r1379, [%r826+32]; + ld.shared.b32 %r1380, [%r826+40]; + ld.shared.b32 %r1381, [%r826+48]; + ld.shared.b32 %r1382, [%r826+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r655, %r1343, %r175; + mul.lo.s32 %r657, %r1344, %r175; + mul.lo.s32 %r659, %r1345, %r175; + mul.lo.s32 %r661, %r1346, %r175; + mul.lo.s32 %r663, %r1347, %r175; + mul.lo.s32 %r665, %r1348, %r175; + mul.lo.s32 %r667, %r1349, %r175; + mul.lo.s32 %r669, %r1350, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r655; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r657; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r659; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r661; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r663; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r665; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r667; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r669; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r670, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1383, %r670, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r673, %r1383, %r670; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r673; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r674, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1384, %r674, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r677, %r1384, %r674; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r677; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r678, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1385, %r678, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r681, %r1385, %r678; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r681; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r682, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1386, %r682, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r685, %r1386, %r682; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r685; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r686, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1387, %r686, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r689, %r1387, %r686; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r689; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r690, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1388, %r690, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r693, %r1388, %r690; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r693; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r694, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1389, %r694, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r697, %r1389, %r694; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r697; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r698, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1390, %r698, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r701, %r1390, %r698; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r701; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1391, [%r826]; + ld.shared.b32 %r1392, [%r826+8]; + ld.shared.b32 %r1393, [%r826+16]; + ld.shared.b32 %r1394, [%r826+24]; + ld.shared.b32 %r1395, [%r826+32]; + ld.shared.b32 %r1396, [%r826+40]; + ld.shared.b32 %r1397, [%r826+48]; + ld.shared.b32 %r1398, [%r826+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r703, %r1359, %r832; + mul.lo.s32 %r705, %r1360, %r832; + mul.lo.s32 %r707, %r1361, %r832; + mul.lo.s32 %r709, %r1362, %r832; + mul.lo.s32 %r711, %r1363, %r832; + mul.lo.s32 %r713, %r1364, %r832; + mul.lo.s32 %r715, %r1365, %r832; + mul.lo.s32 %r717, %r1366, %r832; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r703; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r705; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r707; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r709; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r711; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r713; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r715; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r717; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r718, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1399, %r718, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r721, %r1399, %r718; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r721; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r722, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1400, %r722, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r725, %r1400, %r722; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r725; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r726, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1401, %r726, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r729, %r1401, %r726; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r729; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r730, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1402, %r730, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r733, %r1402, %r730; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r733; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r734, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1403, %r734, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r737, %r1403, %r734; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r737; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r738, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1404, %r738, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r741, %r1404, %r738; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r741; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r742, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1405, %r742, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r745, %r1405, %r742; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r745; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r746, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1406, %r746, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r749, %r1406, %r746; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r749; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1407, [%r826]; + ld.shared.b32 %r1408, [%r826+8]; + ld.shared.b32 %r1409, [%r826+16]; + ld.shared.b32 %r1410, [%r826+24]; + ld.shared.b32 %r1411, [%r826+32]; + ld.shared.b32 %r1412, [%r826+40]; + ld.shared.b32 %r1413, [%r826+48]; + ld.shared.b32 %r1414, [%r826+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + mul.lo.s32 %r751, %r1359, %r175; + mul.lo.s32 %r753, %r1360, %r175; + mul.lo.s32 %r755, %r1361, %r175; + mul.lo.s32 %r757, %r1362, %r175; + mul.lo.s32 %r759, %r1363, %r175; + mul.lo.s32 %r761, %r1364, %r175; + mul.lo.s32 %r763, %r1365, %r175; + mul.lo.s32 %r765, %r1366, %r175; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p12 st.shared.b32 [ %r30 + 0 ], %r751; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r32 + 0 ], %r753; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r34 + 0 ], %r755; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r757; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r38 + 0 ], %r759; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r761; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r42 + 0 ], %r763; + // end inline asm + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r765; + // end inline asm + bar.sync 0; + // begin inline asm + @%p12 ld.shared.b32 %r766, [ %r47 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1415, %r766, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r769, %r1415, %r766; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r47 + 0 ], %r769; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r770, [ %r51 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1416, %r770, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r773, %r1416, %r770; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r51 + 0 ], %r773; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r774, [ %r55 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1417, %r774, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r777, %r1417, %r774; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r55 + 0 ], %r777; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r778, [ %r59 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1418, %r778, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r781, %r1418, %r778; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r59 + 0 ], %r781; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r782, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1419, %r782, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r785, %r1419, %r782; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r63 + 0 ], %r785; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r786, [ %r67 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1420, %r786, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r789, %r1420, %r786; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r67 + 0 ], %r789; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r790, [ %r71 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1421, %r790, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r793, %r1421, %r790; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r71 + 0 ], %r793; + // end inline asm + // begin inline asm + @%p12 ld.shared.b32 %r794, [ %r75 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1422, %r794, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + add.s32 %r797, %r1422, %r794; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + // begin inline asm + @%p21 st.shared.b32 [ %r75 + 0 ], %r797; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1423, [%r826]; + ld.shared.b32 %r1424, [%r826+8]; + ld.shared.b32 %r1425, [%r826+16]; + ld.shared.b32 %r1426, [%r826+24]; + ld.shared.b32 %r1427, [%r826+32]; + ld.shared.b32 %r1428, [%r826+40]; + ld.shared.b32 %r1429, [%r826+48]; + ld.shared.b32 %r1430, [%r826+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.lt.s32 %p643, %r1381, %r1397; + setp.lt.s32 %p644, %r1379, %r1395; + setp.lt.s32 %p645, %r1377, %r1393; + setp.lt.s32 %p646, %r1375, %r1391; + setp.lt.s32 %p647, %r1382, %r1398; + setp.lt.s32 %p648, %r1380, %r1396; + setp.lt.s32 %p649, %r1378, %r1394; + setp.lt.s32 %p650, %r1376, %r1392; + .loc 3 591 21 // triton_helpers.py:591:21 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.eq.b32 %p651, %r1381, %r1397; + setp.eq.b32 %p652, %r1379, %r1395; + setp.eq.b32 %p653, %r1377, %r1393; + setp.eq.b32 %p654, %r1375, %r1391; + setp.eq.b32 %p655, %r1382, %r1398; + setp.eq.b32 %p656, %r1380, %r1396; + setp.eq.b32 %p657, %r1378, %r1394; + setp.eq.b32 %p658, %r1376, %r1392; + .loc 3 594 40 // triton_helpers.py:594:40 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + setp.gt.s32 %p659, %r1413, %r1429; + setp.gt.s32 %p660, %r1411, %r1427; + setp.gt.s32 %p661, %r1409, %r1425; + setp.gt.s32 %p662, %r1407, %r1423; + setp.gt.s32 %p663, %r1414, %r1430; + setp.gt.s32 %p664, %r1412, %r1428; + setp.gt.s32 %p665, %r1410, %r1426; + setp.gt.s32 %p666, %r1408, %r1424; + .loc 3 601 48 // triton_helpers.py:601:48 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1431, %r1429, %r1413; + xor.b32 %r1432, %r1427, %r1411; + xor.b32 %r1433, %r1425, %r1409; + xor.b32 %r1434, %r1423, %r1407; + xor.b32 %r1435, %r1430, %r1414; + xor.b32 %r1436, %r1428, %r1412; + xor.b32 %r1437, %r1426, %r1410; + xor.b32 %r1438, %r1424, %r1408; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1439, %r1434, 0, %p662; + selp.b32 %r1440, %r1439, 0, %p654; + selp.b32 %r1441, %r1434, %r1440, %p646; + selp.b32 %r1442, %r1433, 0, %p661; + selp.b32 %r1443, %r1442, 0, %p653; + selp.b32 %r1444, %r1433, %r1443, %p645; + selp.b32 %r1445, %r1432, 0, %p660; + selp.b32 %r1446, %r1445, 0, %p652; + selp.b32 %r1447, %r1432, %r1446, %p644; + selp.b32 %r1448, %r1431, 0, %p659; + selp.b32 %r1449, %r1448, 0, %p651; + selp.b32 %r1450, %r1431, %r1449, %p643; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1451, %r1450, %r1365; + xor.b32 %r1452, %r1447, %r1363; + xor.b32 %r1453, %r1444, %r1361; + xor.b32 %r1454, %r1441, %r1359; + .loc 3 601 59 // triton_helpers.py:601:59 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + selp.b32 %r1455, %r1438, 0, %p666; + selp.b32 %r1456, %r1455, 0, %p658; + selp.b32 %r1457, %r1438, %r1456, %p650; + selp.b32 %r1458, %r1437, 0, %p665; + selp.b32 %r1459, %r1458, 0, %p657; + selp.b32 %r1460, %r1437, %r1459, %p649; + selp.b32 %r1461, %r1436, 0, %p664; + selp.b32 %r1462, %r1461, 0, %p656; + selp.b32 %r1463, %r1436, %r1462, %p648; + selp.b32 %r1464, %r1435, 0, %p663; + selp.b32 %r1465, %r1464, 0, %p655; + selp.b32 %r1466, %r1435, %r1465, %p647; + .loc 3 601 22 // triton_helpers.py:601:22 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:40:67 ] + xor.b32 %r1467, %r1466, %r1366; + xor.b32 %r1468, %r1463, %r1364; + xor.b32 %r1469, %r1460, %r1362; + xor.b32 %r1470, %r1457, %r1360; +$L__tmp10: + .loc 1 43 34 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:43:34 + cvt.s64.s32 %rd113, %r26; + cvt.s64.s32 %rd114, %r23; + cvt.s64.s32 %rd115, %r28; + cvt.s64.s32 %rd116, %r24; + cvt.s64.s32 %rd117, %r27; + cvt.s64.s32 %rd118, %r22; + cvt.s64.s32 %rd119, %r29; + cvt.s64.s32 %rd120, %r25; + selp.b64 %rd121, %rd120, 0, %p4; + selp.b64 %rd122, %rd119, 0, %p4; + selp.b64 %rd123, %rd118, 0, %p4; + selp.b64 %rd124, %rd117, 0, %p4; + selp.b64 %rd125, %rd116, 0, %p4; + selp.b64 %rd126, %rd115, 0, %p4; + selp.b64 %rd127, %rd114, 0, %p4; + selp.b64 %rd128, %rd113, 0, %p4; +$L__tmp11: + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:44:26 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:44:26 ] + add.s64 %rd129, %rd127, %rd128; + add.s64 %rd130, %rd125, %rd126; + add.s64 %rd131, %rd129, %rd130; + add.s64 %rd132, %rd123, %rd124; + add.s64 %rd133, %rd121, %rd122; + add.s64 %rd134, %rd132, %rd133; + add.s64 %rd53, %rd131, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:44:26 ] + add.s32 %r1471, %r825, %r823; + shl.b32 %r1472, %r175, 3; + add.s32 %r798, %r1471, %r1472; + // begin inline asm + @%p12 st.shared.b64 [ %r798 + 0 ], %rd53; + // end inline asm + bar.sync 0; + setp.lt.u32 %p397, %r1, 256; + shl.b32 %r1473, %r1, 3; + add.s32 %r799, %r825, %r1473; + // begin inline asm + @%p397 ld.shared.b64 %rd54, [ %r799 + 0 ]; + // end inline asm + mov.b64 {_, %r1474}, %rd54; + cvt.u32.u64 %r1475, %rd54; + shfl.sync.bfly.b32 %r1476, %r1475, 1, 31, -1; + shfl.sync.bfly.b32 %r1477, %r1474, 1, 31, -1; + cvt.u64.u32 %rd135, %r1476; + cvt.u64.u32 %rd136, %r1477; + shl.b64 %rd137, %rd136, 32; + or.b64 %rd138, %rd135, %rd137; + .loc 2 261 15 // standard.py:261:15 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:44:26 ] + add.s64 %rd55, %rd54, %rd138; + .loc 2 291 36 // standard.py:291:36 @[ cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:44:26 ] + and.b32 %r1478, %r1, 769; + setp.eq.b32 %p398, %r1478, 0; + // begin inline asm + @%p398 st.shared.b64 [ %r799 + 0 ], %rd55; + // end inline asm + bar.sync 0; + ld.shared.b32 %r819, [%r1471]; +$L__tmp12: + .loc 1 48 62 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:62 + setp.lt.s64 %p667, %rd19, 2; + .loc 1 48 88 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:88 + setp.gt.s64 %p668, %rd19, 1; + .loc 1 48 79 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:79 + selp.b64 %rd139, %rd19, 0, %p668; + .loc 1 48 0 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48 + selp.b64 %rd140, 1, 0, %p667; + .loc 1 48 70 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:70 + add.s64 %rd141, %rd139, %rd140; + .loc 1 35 45 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:35:45 + shl.b64 %rd142, %rd141, 4; + .loc 1 48 47 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:47 + mul.lo.s64 %rd143, %rd142, %rd158; + mul.lo.s64 %rd144, %rd142, %rd159; + .loc 1 48 25 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:25 + shl.b64 %rd145, %rd12, 6; + add.s64 %rd146, %rd17, %rd145; + mul.wide.u32 %rd147, %r5, 4; + add.s64 %rd148, %rd146, %rd147; + shl.b64 %rd149, %rd143, 2; + add.s64 %rd56, %rd148, %rd149; + shl.b64 %rd150, %rd60, 6; + add.s64 %rd151, %rd17, %rd150; + add.s64 %rd152, %rd151, %rd147; + shl.b64 %rd153, %rd144, 2; + add.s64 %rd57, %rd152, %rd153; + .loc 1 48 102 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:48:102 + bar.sync 0; + shl.b32 %r1479, %r1, 4; + and.b32 %r1480, %r1479, 4080; + add.s32 %r1481, %r825, %r1480; + st.shared.v4.b32 [%r1481], {%r1454, %r1453, %r1452, %r1451}; + bar.sync 0; + and.b32 %r1482, %r1479, 112; + and.b32 %r1483, %r4, 896; + shl.b32 %r1484, %r1, 8; + and.b32 %r1485, %r1484, 2048; + shl.b32 %r1486, %r1, 6; + and.b32 %r1487, %r1486, 1024; + add.s32 %r1488, %r825, %r1482; + add.s32 %r1489, %r1488, %r1485; + add.s32 %r1490, %r1489, %r1487; + add.s32 %r805, %r1490, %r1483; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r811, %r812, %r815, %r816}, [%r805]; + // end inline asm + bar.sync 0; + st.shared.v4.b32 [%r1481], {%r1470, %r1469, %r1468, %r1467}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r813, %r814, %r817, %r818}, [%r805]; + // end inline asm + // begin inline asm + @%p399 st.global.v4.b32 [ %rd56 + 0 ], { %r811, %r812, %r813, %r814 }; + // end inline asm + // begin inline asm + @%p400 st.global.v4.b32 [ %rd57 + 0 ], { %r815, %r816, %r817, %r818 }; + // end inline asm + .loc 1 49 34 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:49:34 + mul.lo.s64 %rd154, %rd157, %rd141; + .loc 1 49 25 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:49:25 + add.s64 %rd155, %rd18, %rd62; + shl.b64 %rd156, %rd154, 2; + add.s64 %rd58, %rd155, %rd156; + .loc 1 49 89 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:49:89 + and.b32 %r1491, %r1, 128; + setp.eq.b32 %p669, %r1491, 0; + and.pred %p401, %p669, %p4; + // begin inline asm + @%p401 st.global.b32 [ %rd58 + 0 ], { %r819 }; + // end inline asm + .loc 1 49 4 // cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py:49:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 112 +.b8 102 +.b8 104 +.b8 114 +.b8 110 +.b8 120 +.b8 116 +.b8 114 +.b8 116 +.b8 53 +.b8 102 +.b8 52 +.b8 111 +.b8 102 +.b8 100 +.b8 101 +.b8 53 +.b8 115 +.b8 118 +.b8 52 +.b8 122 +.b8 100 +.b8 114 +.b8 54 +.b8 50 +.b8 53 +.b8 118 +.b8 103 +.b8 119 +.b8 98 +.b8 55 +.b8 119 +.b8 50 +.b8 116 +.b8 106 +.b8 55 +.b8 116 +.b8 118 +.b8 105 +.b8 122 +.b8 51 +.b8 106 +.b8 51 +.b8 114 +.b8 117 +.b8 101 +.b8 120 +.b8 111 +.b8 117 +.b8 116 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 112 +.b8 102 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp11 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..dfb943384f53f8983cb5e2ae59c450d6e40c22dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1289 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc106 = loc(unknown) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc158 = loc("in_ptr0"(#loc)) +#loc159 = loc("out_ptr2"(#loc)) +#loc160 = loc("out_ptr3"(#loc)) +#loc161 = loc("ks0"(#loc)) +#loc162 = loc("xnumel"(#loc)) +#loc163 = loc("r0_numel"(#loc)) +#loc196 = loc("x"(#loc56)) +#loc197 = loc("idxs"(#loc56)) +#loc198 = loc("x"(#loc60)) +#loc199 = loc("idxs"(#loc60)) +#loc204 = loc("x"(#loc68)) +#loc205 = loc("idxs"(#loc68)) +#loc206 = loc("flip"(#loc68)) +#loc262 = loc("input"(#loc131)) +#loc263 = loc("a"(#loc135)) +#loc264 = loc("b"(#loc135)) +#loc266 = loc("x"(#loc140)) +#loc267 = loc("x"(#loc144)) +#loc268 = loc("input"(#loc153)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16 : i32 loc(#loc164) + %xoffset = tt.get_program_id x : i32 loc(#loc165) + %xoffset_1 = arith.constant 128 : i32 loc(#loc166) + %xoffset_2 = arith.constant 128 : i32 loc(#loc166) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc166) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc167) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc168) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<128x1xi32> loc(#loc169) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<128x1xi32> loc(#loc169) + %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc170) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<128x1xi32> loc(#loc170) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc171) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc172) + %r0_offset = arith.constant 0 : i32 loc(#loc173) + %r0_mask = arith.constant true loc(#loc174) + %r0_mask_9 = arith.constant dense : tensor<128x16xi1> loc(#loc174) + %x0 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc175) + %x0_10 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc175) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<128x1xi64> loc(#loc175) + %x1 = arith.extsi %xindex_6 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc176) + %x1_12 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc176) + %x1_13 = arith.divsi %x1, %x1_12 : tensor<128x1xi64> loc(#loc176) + %tmp0 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc177) + %tmp0_14 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<128x16xi64> loc(#loc177) + %tmp0_15 = tt.broadcast %x0_11 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc177) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<128x16xi64> loc(#loc177) + %tmp0_17 = arith.constant 16 : i32 loc(#loc178) + %tmp0_18 = arith.constant 16 : i64 loc(#loc178) + %tmp0_19 = arith.constant dense<16> : tensor<128x1xi64> loc(#loc178) + %tmp0_20 = arith.muli %tmp0_19, %x1_13 : tensor<128x1xi64> loc(#loc178) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc179) + %tmp0_22 = arith.addi %tmp0_16, %tmp0_21 : tensor<128x16xi64> loc(#loc179) + %tmp0_23 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc180) + %tmp0_24 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc180) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_23 : tensor<1x16xi64> loc(#loc180) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x16xi64> -> tensor<128x16xi64> loc(#loc181) + %tmp0_27 = arith.addi %tmp0_22, %tmp0_26 : tensor<128x16xi64> loc(#loc181) + %tmp0_28 = arith.constant 16 : i32 loc(#loc182) + %tmp0_29 = arith.constant 16 : i64 loc(#loc182) + %tmp0_30 = arith.muli %tmp0_29, %ks0 : i64 loc(#loc182) + %tmp0_31 = tt.splat %tmp0_30 : i64 -> tensor<128x1xi64> loc(#loc183) + %tmp0_32 = arith.muli %tmp0_31, %x1_13 : tensor<128x1xi64> loc(#loc183) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc184) + %tmp0_34 = arith.addi %tmp0_27, %tmp0_33 : tensor<128x16xi64> loc(#loc184) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc185) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<128x16x!tt.ptr>, tensor<128x16xi64> loc(#loc185) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc186) + %tmp0_38 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc186) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<128x16xf32> loc(#loc186) + %tmp0_40 = arith.fptosi %tmp0_39 : tensor<128x16xf32> to tensor<128x16xi32> loc(#loc186) + %tmp0_41 = tt.load %tmp0_36, %tmp0_38, %tmp0_40 evictionPolicy = evict_last : tensor<128x16x!tt.ptr> loc(#loc186) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc187) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc188) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_41, %tmp4) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc26) + %tmp7 = arith.extsi %tmp0_41 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc189) + %tmp10 = arith.constant 0 : i32 loc(#loc190) + %tmp10_42 = arith.constant 0 : i64 loc(#loc190) + %tmp10_43 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc190) + %tmp10_44 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc190) + %tmp10_45 = arith.select %tmp10_44, %tmp7, %tmp10_43 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc190) + %tmp11 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_45) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc191) + %tmp11_46 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc192) + %tmp12 = arith.extsi %0#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc193) + %tmp13 = arith.trunci %tmp12 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc194) + %tmp14 = arith.trunci %tmp11_46 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc195) + %c16_i32 = arith.constant 16 : i32 loc(#loc34) + %c16_i64 = arith.constant 16 : i64 loc(#loc34) + %cst = arith.constant dense<16> : tensor<128x1xi64> loc(#loc34) + %1 = arith.muli %cst, %x0_11 : tensor<128x1xi64> loc(#loc34) + %2 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc35) + %3 = tt.broadcast %2 : tensor<1x16xi64> -> tensor<128x16xi64> loc(#loc35) + %4 = tt.broadcast %1 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc35) + %5 = arith.addi %3, %4 : tensor<128x16xi64> loc(#loc35) + %c16_i32_47 = arith.constant 16 : i32 loc(#loc36) + %c16_i64_48 = arith.constant 16 : i64 loc(#loc36) + %cst_49 = arith.constant dense<16> : tensor<128x1xi64> loc(#loc36) + %6 = arith.muli %cst_49, %x1_13 : tensor<128x1xi64> loc(#loc36) + %c1_i32 = arith.constant 1 : i32 loc(#loc37) + %7 = arith.extsi %c1_i32 : i32 to i64 loc(#loc37) + %8 = arith.cmpi sge, %7, %ks0 : i64 loc(#loc37) + %c1_i32_50 = arith.constant 1 : i32 loc(#loc38) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc38) + %9 = arith.extui %8 : i1 to i32 loc(#loc38) + %10 = arith.muli %c1_i32_51, %9 : i32 loc(#loc38) + %c1_i32_52 = arith.constant 1 : i32 loc(#loc39) + %11 = arith.extsi %c1_i32_52 : i32 to i64 loc(#loc39) + %12 = arith.cmpi sgt, %ks0, %11 : i64 loc(#loc39) + %13 = arith.extui %12 : i1 to i64 loc(#loc40) + %14 = arith.muli %ks0, %13 : i64 loc(#loc40) + %15 = arith.extsi %10 : i32 to i64 loc(#loc41) + %16 = arith.addi %15, %14 : i64 loc(#loc41) + %17 = tt.splat %16 : i64 -> tensor<128x1xi64> loc(#loc42) + %18 = arith.muli %6, %17 : tensor<128x1xi64> loc(#loc42) + %19 = tt.broadcast %18 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc43) + %20 = arith.addi %5, %19 : tensor<128x16xi64> loc(#loc43) + %21 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc44) + %22 = tt.addptr %21, %20 : tensor<128x16x!tt.ptr>, tensor<128x16xi64> loc(#loc44) + %23 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc45) + tt.store %22, %tmp13, %23 : tensor<128x16x!tt.ptr> loc(#loc45) + %c1_i32_53 = arith.constant 1 : i32 loc(#loc46) + %24 = arith.extsi %c1_i32_53 : i32 to i64 loc(#loc46) + %25 = arith.cmpi sge, %24, %ks0 : i64 loc(#loc46) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc47) + %26 = arith.extui %25 : i1 to i32 loc(#loc47) + %27 = arith.muli %c1_i32_55, %26 : i32 loc(#loc47) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc48) + %28 = arith.extsi %c1_i32_56 : i32 to i64 loc(#loc48) + %29 = arith.cmpi sgt, %ks0, %28 : i64 loc(#loc48) + %30 = arith.extui %29 : i1 to i64 loc(#loc49) + %31 = arith.muli %ks0, %30 : i64 loc(#loc49) + %32 = arith.extsi %27 : i32 to i64 loc(#loc50) + %33 = arith.addi %32, %31 : i64 loc(#loc50) + %34 = tt.splat %33 : i64 -> tensor<128x1xi64> loc(#loc51) + %35 = arith.muli %x1_13, %34 : tensor<128x1xi64> loc(#loc51) + %36 = arith.addi %x0_11, %35 : tensor<128x1xi64> loc(#loc52) + %37 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc53) + %38 = tt.addptr %37, %36 : tensor<128x1x!tt.ptr>, tensor<128x1xi64> loc(#loc53) + tt.store %38, %tmp14, %xmask_7 : tensor<128x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc56)), %idxs: tensor<128x16xi16> loc("idxs"(#loc56))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc57) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc57) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc57) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc57) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc58) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc59) + %5 = ub.poison : tensor<128x16xi32> loc(#loc59) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc59) + } loc(#loc56) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc60)), %idxs: tensor<128x16xi16> loc("idxs"(#loc60))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi16>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + tt.return %0#0, %0#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc66) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc67) + %2 = ub.poison : tensor<128x16xi32> loc(#loc67) + tt.return %1, %2 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi16> loc("idxs"(#loc68)), %flip: tensor<128x16xi32> loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc221) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc222) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc223) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<1024x2x1xi16> loc(#loc223) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc224) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc225) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc226) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc227) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc228) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<1024x2x1xi16> loc(#loc228) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc229) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc230) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc231) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_27 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_28 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_49 = arith.constant true loc(#loc239) + %cond_50 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<128x16xi1> loc(#loc239) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<128x16xi1> loc(#loc240) + %cond_53 = arith.ori %cond, %cond_52 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_53 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_50 = arith.ori %eq, %eq_49 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_50 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc245) + %cond_30 = arith.andi %3, %cond_29 : tensor<128x16xi1> loc(#loc246) + %cond_31 = arith.ori %1, %cond_30 : tensor<128x16xi1> loc(#loc247) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc248) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc249) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<128x16xi1> loc(#loc250) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<128x16xi1> loc(#loc251) + %cond_36 = arith.extui %cond_35 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc252) + %cond_37 = arith.xori %cond_36, %flip : tensor<128x16xi32> loc(#loc252) + %cond_38 = arith.constant 0 : i32 loc(#loc253) + %cond_39 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc253) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<128x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_43 = arith.xori %x, %ret_42 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc258) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S128_16S__(%idxs) : (tensor<128x16xi16>) -> tensor<128x16xi16> loc(#loc259) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc260) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_47 = arith.extsi %idxs : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc261) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_43, %new_idxs_48 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi32> loc("input"(#loc131))) -> tensor<1024x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc132) + tt.return %0 : tensor<1024x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc134) + tt.return %1 : tensor<1024x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc135)), %b: i32 loc("b"(#loc135))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc136) + tt.return %0 : i32 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc138) + tt.return %1 : i32 loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi16> loc("input"(#loc131))) -> tensor<1024x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc265) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc132) + tt.return %0 : tensor<1024x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc134) + tt.return %1 : tensor<1024x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc140))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc141) + %false = arith.constant false loc(#loc142) + tt.return %false : i1 loc(#loc142) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc143) + tt.return %1 : i1 loc(#loc143) + } loc(#loc140) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc144))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc145) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc146) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc146) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<128x16xi32> loc(#loc146) + %4 = arith.addi %x, %3 : tensor<128x16xi32> loc(#loc146) + tt.return %4 : tensor<128x16xi32> loc(#loc147) + ^bb1: // no predecessors + %5 = ub.poison : tensor<128x16xi32> loc(#loc148) + tt.return %5 : tensor<128x16xi32> loc(#loc148) + } loc(#loc144) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc150) + %cst = arith.constant dense : tensor<1xi1> loc(#loc150) + tt.return %cst : tensor<1xi1> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc152) + tt.return %0 : tensor<1xi1> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i32S128_16S__(%input: tensor<128x16xi32> loc("input"(#loc153))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<128x16xi32> loc(#loc154) + tt.return %0 : tensor<128x16xi32> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc156) + tt.return %1 : tensor<128x16xi32> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<128x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc150) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc150) + tt.return %cst : tensor<128x16xi32> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi32> loc(#loc152) + tt.return %0 : tensor<128x16xi32> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i16S128_16S__(%input: tensor<128x16xi16> loc("input"(#loc153))) -> tensor<128x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<128x16xi16> loc(#loc154) + tt.return %0 : tensor<128x16xi16> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi16> loc(#loc156) + tt.return %1 : tensor<128x16xi16> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<128x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc150) + %cst = arith.constant dense<0> : tensor<128x16xi16> loc(#loc150) + tt.return %cst : tensor<128x16xi16> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi16> loc(#loc152) + tt.return %0 : tensor<128x16xi16> loc(#loc152) + } loc(#loc149) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc60)), %idxs: tensor<128x16xi32> loc("idxs"(#loc60))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + tt.return %1#0, %1#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc66) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x16xi32> loc(#loc67) + %3 = ub.poison : tensor<128x16xi32> loc(#loc67) + tt.return %2, %3 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: tensor<128x16xi32> loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<512x2x2xi32> loc("input"(#loc131))) -> tensor<512x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc132) + tt.return %0 : tensor<512x2xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<512x2xi32> loc(#loc134) + tt.return %1 : tensor<512x2xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: tensor<128x16xi32> loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc60)), %idxs: tensor<128x16xi32> loc("idxs"(#loc60))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + tt.return %2#0, %2#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc66) + ^bb1: // no predecessors + %3 = ub.poison : tensor<128x16xi32> loc(#loc67) + %4 = ub.poison : tensor<128x16xi32> loc(#loc67) + tt.return %3, %4 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: tensor<128x16xi32> loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x4xi32> loc("input"(#loc131))) -> tensor<256x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc132) + tt.return %0 : tensor<256x4xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x4xi32> loc(#loc134) + tt.return %1 : tensor<256x4xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc60)), %idxs: tensor<128x16xi32> loc("idxs"(#loc60))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc269) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc65) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc66) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc67) + %5 = ub.poison : tensor<128x16xi32> loc(#loc67) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x8xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<128x2x8xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x8xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x8xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x8xi32> loc("input"(#loc131))) -> tensor<128x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc132) + tt.return %0 : tensor<128x8xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x8xi32> loc(#loc134) + tt.return %1 : tensor<128x8xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc68)), %idxs: tensor<128x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc130) + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x16xi64> loc("input"(#loc131))) -> tensor<128xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc132) + tt.reduce.return %2 : i64 loc(#loc132) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc132) + tt.return %0 : tensor<128xi64> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xi64> loc(#loc134) + tt.return %1 : tensor<128xi64> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc135)), %b: i64 loc("b"(#loc135))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc136) + tt.return %0 : i64 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc138) + tt.return %1 : i64 loc(#loc138) + } loc(#loc135) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":25:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":26:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":26:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":27:16) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":28:48) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":39:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":40:67) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":41:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":43:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":45:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":46:21) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":47:21) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:32) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:43) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:62) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:54) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:79) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:70) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:47) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:40) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:102) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:49) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:41) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:75) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:66) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:57) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:89) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc164 = loc("r0_numel"(#loc1)) +#loc165 = loc("xoffset"(#loc2)) +#loc166 = loc("xoffset"(#loc3)) +#loc167 = loc("xindex"(#loc4)) +#loc168 = loc("xindex"(#loc5)) +#loc169 = loc("xindex"(#loc6)) +#loc170 = loc("xmask"(#loc7)) +#loc171 = loc("r0_index"(#loc8)) +#loc172 = loc("r0_index"(#loc9)) +#loc173 = loc("r0_offset"(#loc10)) +#loc174 = loc("r0_mask"(#loc11)) +#loc175 = loc("x0"(#loc12)) +#loc176 = loc("x1"(#loc13)) +#loc177 = loc("tmp0"(#loc14)) +#loc178 = loc("tmp0"(#loc15)) +#loc179 = loc("tmp0"(#loc16)) +#loc180 = loc("tmp0"(#loc17)) +#loc181 = loc("tmp0"(#loc18)) +#loc182 = loc("tmp0"(#loc19)) +#loc183 = loc("tmp0"(#loc20)) +#loc184 = loc("tmp0"(#loc21)) +#loc185 = loc("tmp0"(#loc22)) +#loc186 = loc("tmp0"(#loc23)) +#loc187 = loc("tmp2"(#loc24)) +#loc188 = loc("tmp4"(#loc25)) +#loc189 = loc("tmp7"(#loc27)) +#loc190 = loc("tmp10"(#loc28)) +#loc191 = loc("tmp11"(#loc29)) +#loc192 = loc("tmp11"(#loc30)) +#loc193 = loc("tmp12"(#loc31)) +#loc194 = loc("tmp13"(#loc32)) +#loc195 = loc("tmp14"(#loc33)) +#loc200 = loc("flip"(#loc61)) +#loc201 = loc("flip"(#loc62)) +#loc202 = loc("flip"(#loc63)) +#loc203 = loc("flip"(#loc64)) +#loc207 = loc("y"(#loc69)) +#loc208 = loc("right_mask"(#loc70)) +#loc209 = loc("right_mask"(#loc71)) +#loc210 = loc("left_mask"(#loc72)) +#loc211 = loc("ileft"(#loc73)) +#loc212 = loc("ileft"(#loc74)) +#loc213 = loc("ileft"(#loc75)) +#loc214 = loc("ileft"(#loc76)) +#loc215 = loc("iright"(#loc77)) +#loc216 = loc("iright"(#loc78)) +#loc217 = loc("iright"(#loc79)) +#loc218 = loc("iright"(#loc80)) +#loc219 = loc("ileft"(#loc81)) +#loc220 = loc("iright"(#loc82)) +#loc221 = loc("y_idx"(#loc83)) +#loc222 = loc("left_idx"(#loc84)) +#loc223 = loc("left_idx"(#loc85)) +#loc224 = loc("left_idx"(#loc86)) +#loc225 = loc("left_idx"(#loc87)) +#loc226 = loc("left_idx"(#loc88)) +#loc227 = loc("right_idx"(#loc89)) +#loc228 = loc("right_idx"(#loc90)) +#loc229 = loc("right_idx"(#loc91)) +#loc230 = loc("right_idx"(#loc92)) +#loc231 = loc("right_idx"(#loc93)) +#loc232 = loc("left_idx"(#loc94)) +#loc233 = loc("right_idx"(#loc95)) +#loc234 = loc("left_valid_mask"(#loc96)) +#loc235 = loc("right_valid_mask"(#loc97)) +#loc236 = loc("left_isnan"(#loc98)) +#loc237 = loc("right_isnan"(#loc99)) +#loc238 = loc("cond"(#loc100)) +#loc239 = loc("cond"(#loc103)) +#loc240 = loc("cond"(#loc104)) +#loc241 = loc("cond"(#loc105)) +#loc242 = loc("eq"(#loc107)) +#loc243 = loc("eq"(#loc110)) +#loc244 = loc("eq"(#loc111)) +#loc245 = loc("cond"(#loc112)) +#loc246 = loc("cond"(#loc113)) +#loc247 = loc("cond"(#loc114)) +#loc248 = loc("cond"(#loc115)) +#loc249 = loc("cond"(#loc116)) +#loc250 = loc("cond"(#loc117)) +#loc251 = loc("cond"(#loc118)) +#loc252 = loc("cond"(#loc119)) +#loc253 = loc("cond"(#loc120)) +#loc254 = loc("ret"(#loc121)) +#loc255 = loc("ret"(#loc122)) +#loc256 = loc("ret"(#loc123)) +#loc257 = loc("ret"(#loc124)) +#loc258 = loc("new_idxs"(#loc125)) +#loc259 = loc("new_idxs"(#loc126)) +#loc260 = loc("new_idxs"(#loc127)) +#loc261 = loc("new_idxs"(#loc128)) +#loc265 = loc("input"(#loc139)) +#loc269 = loc("flip"(#loc157)) +#loc270 = loc("cond"(#loc238)) +#loc271 = loc("cond"(#loc241)) +#loc272 = loc("eq"(#loc242)) +#loc273 = loc("eq"(#loc244)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..975b4d94f857619851b470435585c783e9bcde59 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,890 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 2], [0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [0, 1]], block = []}> +#linear1 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[256, 0, 0], [512, 0, 0], [0, 1, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[128, 0, 0], [256, 0, 0], [0, 0, 1]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 0, 2], [0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[64, 0, 0], [128, 0, 0], [0, 0, 1]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 2], [0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[32, 0, 0], [64, 0, 0], [0, 0, 1]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":18:0) +#loc1 = loc(unknown) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":40:67) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:26) +#loc90 = loc("in_ptr0"(#loc)) +#loc91 = loc("out_ptr2"(#loc)) +#loc92 = loc("out_ptr3"(#loc)) +#loc93 = loc("ks0"(#loc)) +#loc94 = loc("xnumel"(#loc)) +#loc95 = loc("r0_numel"(#loc)) +#loc117 = loc(callsite(#loc23 at #loc24)) +#loc123 = loc("ileft"(#loc32)) +#loc127 = loc("iright"(#loc37)) +#loc136 = loc("left_idx"(#loc46)) +#loc141 = loc("right_idx"(#loc51)) +#loc161 = loc("tmp11"(#loc71)) +#loc168 = loc(callsite(#loc28 at #loc117)) +#loc172 = loc(callsite(#loc1 at #loc161)) +#loc176 = loc(callsite(#loc123 at #loc168)) +#loc180 = loc(callsite(#loc127 at #loc168)) +#loc188 = loc(callsite(#loc136 at #loc168)) +#loc193 = loc(callsite(#loc141 at #loc168)) +#loc213 = loc(callsite(#loc1 at #loc176)) +#loc215 = loc(callsite(#loc1 at #loc180)) +#loc218 = loc(callsite(#loc1 at #loc188)) +#loc221 = loc(callsite(#loc1 at #loc193)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x16xi64, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %cst_1 = arith.constant dense<16> : tensor<128x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<16> : tensor<128x1xi64, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_6 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc96) + %xoffset_8 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc97) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc98) + %xindex_9 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc98) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc98) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc98) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<128x1xi32, #blocked> loc(#loc99) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc99) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<128x1xi32, #blocked> loc(#loc99) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<128x1xi32, #blocked1> loc(#loc99) + %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked> loc(#loc100) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<128x1xi32, #blocked1> loc(#loc100) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<128x1xi32, #blocked> loc(#loc100) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<128x1xi32, #blocked1> loc(#loc100) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc101) + %r0_index_19 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc101) + %r0_index_20 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc101) + %r0_index_21 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc101) + %r0_index_22 = tt.expand_dims %r0_index_19 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc101) + %r0_index_23 = tt.expand_dims %r0_index_20 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc101) + %x0 = arith.extsi %xindex_14 : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc102) + %x0_24 = arith.extsi %xindex_15 : tensor<128x1xi32, #blocked1> to tensor<128x1xi64, #blocked1> loc(#loc102) + %x0_25 = tt.splat %ks0 : i64 -> tensor<128x1xi64, #blocked> loc(#loc102) + %x0_26 = tt.splat %ks0 : i64 -> tensor<128x1xi64, #blocked1> loc(#loc102) + %x0_27 = arith.remsi %x0, %x0_25 : tensor<128x1xi64, #blocked> loc(#loc102) + %x0_28 = arith.remsi %x0_24, %x0_26 : tensor<128x1xi64, #blocked1> loc(#loc102) + %x1 = arith.divsi %x0, %x0_25 : tensor<128x1xi64, #blocked> loc(#loc103) + %x1_29 = arith.divsi %x0_24, %x0_26 : tensor<128x1xi64, #blocked1> loc(#loc103) + %tmp0 = arith.extsi %r0_index_21 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc104) + %tmp0_30 = arith.extsi %r0_index_23 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1> loc(#loc104) + %tmp0_31 = tt.broadcast %tmp0 : tensor<1x16xi64, #blocked> -> tensor<128x16xi64, #blocked> loc(#loc104) + %tmp0_32 = tt.broadcast %tmp0_30 : tensor<1x16xi64, #blocked1> -> tensor<128x16xi64, #blocked1> loc(#loc104) + %tmp0_33 = tt.broadcast %x0_27 : tensor<128x1xi64, #blocked> -> tensor<128x16xi64, #blocked> loc(#loc104) + %tmp0_34 = arith.addi %tmp0_31, %tmp0_33 : tensor<128x16xi64, #blocked> loc(#loc104) + %tmp0_35 = arith.muli %x1, %cst_1 : tensor<128x1xi64, #blocked> loc(#loc105) + %tmp0_36 = arith.muli %x1_29, %cst_2 : tensor<128x1xi64, #blocked1> loc(#loc105) + %tmp0_37 = tt.broadcast %tmp0_35 : tensor<128x1xi64, #blocked> -> tensor<128x16xi64, #blocked> loc(#loc106) + %tmp0_38 = arith.addi %tmp0_34, %tmp0_37 : tensor<128x16xi64, #blocked> loc(#loc106) + %tmp0_39 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_40 = arith.muli %tmp0_39, %tmp0 : tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<1x16xi64, #blocked> -> tensor<128x16xi64, #blocked> loc(#loc108) + %tmp0_42 = arith.addi %tmp0_38, %tmp0_41 : tensor<128x16xi64, #blocked> loc(#loc108) + %tmp0_43 = arith.muli %ks0, %c16_i64 : i64 loc(#loc109) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<128x1xi64, #blocked> loc(#loc110) + %tmp0_45 = arith.muli %tmp0_44, %x1 : tensor<128x1xi64, #blocked> loc(#loc110) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<128x1xi64, #blocked> -> tensor<128x16xi64, #blocked> loc(#loc111) + %tmp0_47 = arith.addi %tmp0_42, %tmp0_46 : tensor<128x16xi64, #blocked> loc(#loc111) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc112) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi64, #blocked> loc(#loc112) + %tmp0_50 = tt.broadcast %xmask_17 : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc113) + %tmp0_51 = tt.broadcast %xmask_18 : tensor<128x1xi1, #blocked1> -> tensor<128x16xi1, #blocked1> loc(#loc113) + %tmp0_52 = tt.load %tmp0_49, %tmp0_50, %cst_7 evictionPolicy = evict_last : tensor<128x16x!tt.ptr, #blocked> loc(#loc113) + %tmp2 = arith.trunci %r0_index_22 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc114) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<128x16xi16, #linear> loc(#loc115) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc165) + %flip_53 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc165) + %flip_54 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc165) + %flip_55 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc165) + %flip_56 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc165) + %flip_57 = tt.expand_dims %flip_53 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc165) + %flip_58 = tt.expand_dims %flip_54 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc165) + %flip_59 = tt.expand_dims %flip_55 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc165) + %flip_60 = tt.expand_dims %flip_56 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc165) + %flip_61 = tt.expand_dims %flip_57 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc165) + %flip_62 = tt.expand_dims %flip_58 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc165) + %flip_63 = tt.expand_dims %flip_59 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc165) + %flip_64 = tt.broadcast %flip_60 : tensor<1x2x1xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc166) + %flip_65 = tt.reshape %flip_64 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #blocked> loc(#loc167) + %flip_66 = tt.reshape %flip_64 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc167) + %y = tt.reshape %tmp0_52 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #linear1> loc(#loc173) + %left_mask = arith.subi %cst_3, %flip_61 : tensor<1x2x1xi32, #linear1> loc(#loc174) + %left_mask_67 = arith.subi %cst_4, %flip_60 : tensor<1x2x1xi32, #linear2> loc(#loc174) + %left_mask_68 = arith.subi %cst_5, %flip_62 : tensor<1x2x1xi32, #linear3> loc(#loc174) + %left_mask_69 = arith.subi %cst_6, %flip_63 : tensor<1x2x1xi32, #linear4> loc(#loc174) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc175) + %ileft_70 = arith.muli %y, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc175) + %ileft_71 = "tt.reduce"(%ileft_70) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_72 = tt.expand_dims %ileft_71 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc177) + %ileft_73 = tt.broadcast %ileft_72 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc178) + %iright = tt.broadcast %flip_61 : tensor<1x2x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc179) + %iright_74 = arith.muli %y, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc179) + %iright_75 = "tt.reduce"(%iright_74) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_76 = tt.expand_dims %iright_75 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc181) + %iright_77 = tt.broadcast %iright_76 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc182) + %ileft_78 = tt.reshape %ileft_73 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc183) + %ileft_79 = tt.reshape %ileft_73 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_80 = tt.reshape %iright_77 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc184) + %iright_81 = tt.reshape %iright_77 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx = tt.reshape %tmp4 : tensor<128x16xi16, #linear> -> tensor<1024x2x1xi16, #linear1> loc(#loc185) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc186) + %left_idx_82 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<1024x2x1xi16, #linear1> loc(#loc187) + %left_idx_83 = arith.muli %y_idx, %left_idx_82 : tensor<1024x2x1xi16, #linear1> loc(#loc187) + %input = arith.extsi %left_idx_83 : tensor<1024x2x1xi16, #linear1> to tensor<1024x2x1xi32, #linear1> loc(#loc216) + %left_idx_84 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc189) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc190) + %right_idx = arith.trunci %flip_61 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc191) + %right_idx_87 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<1024x2x1xi16, #linear1> loc(#loc192) + %right_idx_88 = arith.muli %y_idx, %right_idx_87 : tensor<1024x2x1xi16, #linear1> loc(#loc192) + %input_89 = arith.extsi %right_idx_88 : tensor<1024x2x1xi16, #linear1> to tensor<1024x2x1xi32, #linear1> loc(#loc219) + %right_idx_90 = "tt.reduce"(%input_89) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_91 = tt.expand_dims %right_idx_90 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc194) + %right_idx_92 = tt.broadcast %right_idx_91 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc195) + %left_idx_93 = tt.reshape %left_idx_86 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc196) + %left_idx_94 = tt.reshape %left_idx_86 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_95 = tt.reshape %right_idx_92 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc197) + %right_idx_96 = tt.reshape %right_idx_92 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond = arith.cmpi slt, %ileft_78, %iright_80 : tensor<128x16xi32, #blocked> loc(#loc198) + %cond_97 = arith.cmpi slt, %ileft_79, %iright_81 : tensor<128x16xi32, #linear> loc(#loc198) + %eq = arith.cmpi eq, %ileft_78, %iright_80 : tensor<128x16xi32, #blocked> loc(#loc199) + %eq_98 = arith.cmpi eq, %ileft_79, %iright_81 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_99 = arith.cmpi sgt, %left_idx_93, %right_idx_95 : tensor<128x16xi32, #blocked> loc(#loc200) + %cond_100 = arith.cmpi sgt, %left_idx_94, %right_idx_96 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_101 = arith.andi %eq, %cond_99 : tensor<128x16xi1, #blocked> loc(#loc201) + %cond_102 = arith.andi %eq_98, %cond_100 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_103 = arith.ori %cond, %cond_101 : tensor<128x16xi1, #blocked> loc(#loc202) + %cond_104 = arith.ori %cond_97, %cond_102 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_105 = arith.extui %cond_103 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc203) + %cond_106 = arith.extui %cond_104 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_107 = arith.xori %cond_105, %flip_65 : tensor<128x16xi32, #blocked> loc(#loc203) + %cond_108 = arith.xori %cond_106, %flip_66 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_109 = arith.cmpi ne, %cond_107, %cst_7 : tensor<128x16xi32, #blocked> loc(#loc204) + %cond_110 = arith.cmpi ne, %cond_108, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret = arith.xori %ileft_78, %iright_80 : tensor<128x16xi32, #blocked> loc(#loc205) + %ret_111 = arith.select %cond_109, %ret, %cst_7 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc206) + %ret_112 = arith.xori %tmp0_52, %ret_111 : tensor<128x16xi32, #blocked> loc(#loc207) + %ret_113 = ttg.convert_layout %ret_112 : tensor<128x16xi32, #blocked> -> tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs = arith.xori %left_idx_94, %right_idx_96 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_114 = arith.select %cond_110, %new_idxs, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_115 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc210) + %new_idxs_116 = tt.broadcast %new_idxs_115 : tensor<1x16xi32, #linear> -> tensor<128x16xi32, #linear> loc(#loc210) + %new_idxs_117 = arith.xori %new_idxs_116, %new_idxs_114 : tensor<128x16xi32, #linear> loc(#loc210) + %flip_118 = tt.broadcast %flip_62 : tensor<1x2x1xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc166) + %flip_119 = tt.reshape %flip_118 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc167) + %y_120 = tt.reshape %ret_112 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #linear2> loc(#loc173) + %ileft_121 = tt.broadcast %left_mask_67 : tensor<1x2x1xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc175) + %ileft_122 = arith.muli %y_120, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc175) + %ileft_123 = "tt.reduce"(%ileft_122) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_124 = tt.expand_dims %ileft_123 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc177) + %ileft_125 = tt.broadcast %ileft_124 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc178) + %iright_126 = arith.muli %y_120, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc179) + %iright_127 = "tt.reduce"(%iright_126) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_128 = tt.expand_dims %iright_127 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc181) + %iright_129 = tt.broadcast %iright_128 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc182) + %ileft_130 = tt.reshape %ileft_125 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_131 = tt.reshape %iright_129 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_132 = tt.reshape %new_idxs_117 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc185) + %left_idx_133 = arith.muli %y_idx_132, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc187) + %left_idx_134 = "tt.reduce"(%left_idx_133) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_135 = tt.expand_dims %left_idx_134 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc189) + %left_idx_136 = tt.broadcast %left_idx_135 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc190) + %right_idx_137 = arith.muli %y_idx_132, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc192) + %right_idx_138 = "tt.reduce"(%right_idx_137) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_139 = tt.expand_dims %right_idx_138 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc194) + %right_idx_140 = tt.broadcast %right_idx_139 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc195) + %left_idx_141 = tt.reshape %left_idx_136 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_142 = tt.reshape %right_idx_140 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_143 = arith.cmpi slt, %ileft_130, %iright_131 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_144 = arith.cmpi eq, %ileft_130, %iright_131 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_145 = arith.cmpi sgt, %left_idx_141, %right_idx_142 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_146 = arith.andi %eq_144, %cond_145 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_147 = arith.ori %cond_143, %cond_146 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_148 = arith.extui %cond_147 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_149 = arith.xori %cond_148, %flip_119 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_150 = arith.cmpi ne, %cond_149, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret_151 = arith.xori %ileft_130, %iright_131 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_152 = arith.select %cond_150, %ret_151, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_153 = arith.xori %ret_113, %ret_152 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_154 = arith.xori %left_idx_141, %right_idx_142 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_155 = arith.select %cond_150, %new_idxs_154, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_156 = arith.xori %new_idxs_117, %new_idxs_155 : tensor<128x16xi32, #linear> loc(#loc210) + %y_157 = tt.reshape %ret_153 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc173) + %ileft_158 = arith.muli %y_157, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc175) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc177) + %ileft_161 = tt.broadcast %ileft_160 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc178) + %iright_162 = arith.muli %y_157, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc179) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc181) + %iright_165 = tt.broadcast %iright_164 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc182) + %ileft_166 = tt.reshape %ileft_161 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_167 = tt.reshape %iright_165 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_168 = tt.reshape %new_idxs_156 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc185) + %left_idx_169 = arith.muli %y_idx_168, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc187) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc189) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc190) + %right_idx_173 = arith.muli %y_idx_168, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc192) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc194) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc195) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_184 = arith.extui %cond_183 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_185 = arith.xori %cond_184, %flip_119 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_186 = arith.cmpi ne, %cond_185, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_188 = arith.select %cond_186, %ret_187, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_189 = arith.xori %ret_153, %ret_188 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_192 = arith.xori %new_idxs_156, %new_idxs_191 : tensor<128x16xi32, #linear> loc(#loc210) + %flip_193 = tt.broadcast %flip_63 : tensor<1x2x1xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc166) + %flip_194 = tt.reshape %flip_193 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc167) + %y_195 = tt.reshape %ret_189 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc173) + %ileft_196 = tt.broadcast %left_mask_68 : tensor<1x2x1xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc175) + %ileft_197 = arith.muli %y_195, %ileft_196 : tensor<256x2x4xi32, #linear3> loc(#loc175) + %ileft_198 = "tt.reduce"(%ileft_197) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_199 = tt.expand_dims %ileft_198 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc177) + %ileft_200 = tt.broadcast %ileft_199 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc178) + %iright_201 = arith.muli %y_195, %flip_118 : tensor<256x2x4xi32, #linear3> loc(#loc179) + %iright_202 = "tt.reduce"(%iright_201) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_203 = tt.expand_dims %iright_202 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc181) + %iright_204 = tt.broadcast %iright_203 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc182) + %ileft_205 = tt.reshape %ileft_200 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_206 = tt.reshape %iright_204 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_207 = tt.reshape %new_idxs_192 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc185) + %left_idx_208 = arith.muli %y_idx_207, %ileft_196 : tensor<256x2x4xi32, #linear3> loc(#loc187) + %left_idx_209 = "tt.reduce"(%left_idx_208) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_210 = tt.expand_dims %left_idx_209 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc189) + %left_idx_211 = tt.broadcast %left_idx_210 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc190) + %right_idx_212 = arith.muli %y_idx_207, %flip_118 : tensor<256x2x4xi32, #linear3> loc(#loc192) + %right_idx_213 = "tt.reduce"(%right_idx_212) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_214 = tt.expand_dims %right_idx_213 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc194) + %right_idx_215 = tt.broadcast %right_idx_214 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc195) + %left_idx_216 = tt.reshape %left_idx_211 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_217 = tt.reshape %right_idx_215 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_218 = arith.cmpi slt, %ileft_205, %iright_206 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_219 = arith.cmpi eq, %ileft_205, %iright_206 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_220 = arith.cmpi sgt, %left_idx_216, %right_idx_217 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_221 = arith.andi %eq_219, %cond_220 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_222 = arith.ori %cond_218, %cond_221 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_223 = arith.extui %cond_222 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_224 = arith.xori %cond_223, %flip_194 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_225 = arith.cmpi ne, %cond_224, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret_226 = arith.xori %ileft_205, %iright_206 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_227 = arith.select %cond_225, %ret_226, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_228 = arith.xori %ret_189, %ret_227 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_229 = arith.xori %left_idx_216, %right_idx_217 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_230 = arith.select %cond_225, %new_idxs_229, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_231 = arith.xori %new_idxs_192, %new_idxs_230 : tensor<128x16xi32, #linear> loc(#loc210) + %y_232 = tt.reshape %ret_228 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc173) + %ileft_233 = arith.muli %y_232, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc175) + %ileft_234 = "tt.reduce"(%ileft_233) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_235 = tt.expand_dims %ileft_234 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc177) + %ileft_236 = tt.broadcast %ileft_235 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc178) + %iright_237 = arith.muli %y_232, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc179) + %iright_238 = "tt.reduce"(%iright_237) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_239 = tt.expand_dims %iright_238 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc181) + %iright_240 = tt.broadcast %iright_239 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc182) + %ileft_241 = tt.reshape %ileft_236 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_242 = tt.reshape %iright_240 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_243 = tt.reshape %new_idxs_231 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc185) + %left_idx_244 = arith.muli %y_idx_243, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc187) + %left_idx_245 = "tt.reduce"(%left_idx_244) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_246 = tt.expand_dims %left_idx_245 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc189) + %left_idx_247 = tt.broadcast %left_idx_246 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc190) + %right_idx_248 = arith.muli %y_idx_243, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc192) + %right_idx_249 = "tt.reduce"(%right_idx_248) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_250 = tt.expand_dims %right_idx_249 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc194) + %right_idx_251 = tt.broadcast %right_idx_250 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc195) + %left_idx_252 = tt.reshape %left_idx_247 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_253 = tt.reshape %right_idx_251 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_254 = arith.cmpi slt, %ileft_241, %iright_242 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_255 = arith.cmpi eq, %ileft_241, %iright_242 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_256 = arith.cmpi sgt, %left_idx_252, %right_idx_253 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_257 = arith.andi %eq_255, %cond_256 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_258 = arith.ori %cond_254, %cond_257 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_259 = arith.extui %cond_258 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_260 = arith.xori %cond_259, %flip_194 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_261 = arith.cmpi ne, %cond_260, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret_262 = arith.xori %ileft_241, %iright_242 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_263 = arith.select %cond_261, %ret_262, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_264 = arith.xori %ret_228, %ret_263 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_265 = arith.xori %left_idx_252, %right_idx_253 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_266 = arith.select %cond_261, %new_idxs_265, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_267 = arith.xori %new_idxs_231, %new_idxs_266 : tensor<128x16xi32, #linear> loc(#loc210) + %y_268 = tt.reshape %ret_264 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc173) + %ileft_269 = arith.muli %y_268, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc175) + %ileft_270 = "tt.reduce"(%ileft_269) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_271 = tt.expand_dims %ileft_270 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc177) + %ileft_272 = tt.broadcast %ileft_271 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc178) + %iright_273 = arith.muli %y_268, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc179) + %iright_274 = "tt.reduce"(%iright_273) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_275 = tt.expand_dims %iright_274 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc181) + %iright_276 = tt.broadcast %iright_275 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc182) + %ileft_277 = tt.reshape %ileft_272 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_278 = tt.reshape %iright_276 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_279 = tt.reshape %new_idxs_267 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc185) + %left_idx_280 = arith.muli %y_idx_279, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc187) + %left_idx_281 = "tt.reduce"(%left_idx_280) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_282 = tt.expand_dims %left_idx_281 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc189) + %left_idx_283 = tt.broadcast %left_idx_282 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc190) + %right_idx_284 = arith.muli %y_idx_279, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc192) + %right_idx_285 = "tt.reduce"(%right_idx_284) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_286 = tt.expand_dims %right_idx_285 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc194) + %right_idx_287 = tt.broadcast %right_idx_286 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc195) + %left_idx_288 = tt.reshape %left_idx_283 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_289 = tt.reshape %right_idx_287 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_290 = arith.cmpi slt, %ileft_277, %iright_278 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_291 = arith.cmpi eq, %ileft_277, %iright_278 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_292 = arith.cmpi sgt, %left_idx_288, %right_idx_289 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_293 = arith.andi %eq_291, %cond_292 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_294 = arith.ori %cond_290, %cond_293 : tensor<128x16xi1, #linear> loc(#loc202) + %cond_295 = arith.extui %cond_294 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc203) + %cond_296 = arith.xori %cond_295, %flip_194 : tensor<128x16xi32, #linear> loc(#loc203) + %cond_297 = arith.cmpi ne, %cond_296, %cst : tensor<128x16xi32, #linear> loc(#loc204) + %ret_298 = arith.xori %ileft_277, %iright_278 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_299 = arith.select %cond_297, %ret_298, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_300 = arith.xori %ret_264, %ret_299 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_301 = arith.xori %left_idx_288, %right_idx_289 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_302 = arith.select %cond_297, %new_idxs_301, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_303 = arith.xori %new_idxs_267, %new_idxs_302 : tensor<128x16xi32, #linear> loc(#loc210) + %y_304 = tt.reshape %ret_300 : tensor<128x16xi32, #linear> -> tensor<128x2x8xi32, #linear4> loc(#loc173) + %ileft_305 = tt.broadcast %left_mask_69 : tensor<1x2x1xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc175) + %ileft_306 = arith.muli %y_304, %ileft_305 : tensor<128x2x8xi32, #linear4> loc(#loc175) + %ileft_307 = "tt.reduce"(%ileft_306) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc212) + %ileft_308 = tt.expand_dims %ileft_307 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc177) + %ileft_309 = tt.broadcast %ileft_308 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc178) + %iright_310 = arith.muli %y_304, %flip_193 : tensor<128x2x8xi32, #linear4> loc(#loc179) + %iright_311 = "tt.reduce"(%iright_310) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc214) + %iright_312 = tt.expand_dims %iright_311 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc181) + %iright_313 = tt.broadcast %iright_312 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc182) + %ileft_314 = tt.reshape %ileft_309 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_315 = tt.reshape %iright_313 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_316 = tt.reshape %new_idxs_303 : tensor<128x16xi32, #linear> -> tensor<128x2x8xi32, #linear4> loc(#loc185) + %left_idx_317 = arith.muli %y_idx_316, %ileft_305 : tensor<128x2x8xi32, #linear4> loc(#loc187) + %left_idx_318 = "tt.reduce"(%left_idx_317) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc217) + %left_idx_319 = tt.expand_dims %left_idx_318 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc189) + %left_idx_320 = tt.broadcast %left_idx_319 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc190) + %right_idx_321 = arith.muli %y_idx_316, %flip_193 : tensor<128x2x8xi32, #linear4> loc(#loc192) + %right_idx_322 = "tt.reduce"(%right_idx_321) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc220) + %right_idx_323 = tt.expand_dims %right_idx_322 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc194) + %right_idx_324 = tt.broadcast %right_idx_323 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc195) + %left_idx_325 = tt.reshape %left_idx_320 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_326 = tt.reshape %right_idx_324 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_327 = arith.cmpi slt, %ileft_314, %iright_315 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_328 = arith.cmpi eq, %ileft_314, %iright_315 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_329 = arith.cmpi sgt, %left_idx_325, %right_idx_326 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_330 = arith.andi %eq_328, %cond_329 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_331 = arith.ori %cond_327, %cond_330 : tensor<128x16xi1, #linear> loc(#loc202) + %ret_332 = arith.xori %ileft_314, %iright_315 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_333 = arith.select %cond_331, %ret_332, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_334 = arith.xori %ret_300, %ret_333 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_335 = arith.xori %left_idx_325, %right_idx_326 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_336 = arith.select %cond_331, %new_idxs_335, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_337 = arith.xori %new_idxs_303, %new_idxs_336 : tensor<128x16xi32, #linear> loc(#loc210) + %y_338 = tt.reshape %ret_334 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc173) + %ileft_339 = arith.muli %y_338, %ileft_196 : tensor<256x2x4xi32, #linear3> loc(#loc175) + %ileft_340 = "tt.reduce"(%ileft_339) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_341 = tt.expand_dims %ileft_340 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc177) + %ileft_342 = tt.broadcast %ileft_341 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc178) + %iright_343 = arith.muli %y_338, %flip_118 : tensor<256x2x4xi32, #linear3> loc(#loc179) + %iright_344 = "tt.reduce"(%iright_343) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_345 = tt.expand_dims %iright_344 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc181) + %iright_346 = tt.broadcast %iright_345 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc182) + %ileft_347 = tt.reshape %ileft_342 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_348 = tt.reshape %iright_346 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_349 = tt.reshape %new_idxs_337 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc185) + %left_idx_350 = arith.muli %y_idx_349, %ileft_196 : tensor<256x2x4xi32, #linear3> loc(#loc187) + %left_idx_351 = "tt.reduce"(%left_idx_350) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_352 = tt.expand_dims %left_idx_351 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc189) + %left_idx_353 = tt.broadcast %left_idx_352 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc190) + %right_idx_354 = arith.muli %y_idx_349, %flip_118 : tensor<256x2x4xi32, #linear3> loc(#loc192) + %right_idx_355 = "tt.reduce"(%right_idx_354) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_356 = tt.expand_dims %right_idx_355 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc194) + %right_idx_357 = tt.broadcast %right_idx_356 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc195) + %left_idx_358 = tt.reshape %left_idx_353 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_359 = tt.reshape %right_idx_357 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_360 = arith.cmpi slt, %ileft_347, %iright_348 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_361 = arith.cmpi eq, %ileft_347, %iright_348 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_362 = arith.cmpi sgt, %left_idx_358, %right_idx_359 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_363 = arith.andi %eq_361, %cond_362 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_364 = arith.ori %cond_360, %cond_363 : tensor<128x16xi1, #linear> loc(#loc202) + %ret_365 = arith.xori %ileft_347, %iright_348 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_366 = arith.select %cond_364, %ret_365, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_367 = arith.xori %ret_334, %ret_366 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_368 = arith.xori %left_idx_358, %right_idx_359 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_369 = arith.select %cond_364, %new_idxs_368, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_370 = arith.xori %new_idxs_337, %new_idxs_369 : tensor<128x16xi32, #linear> loc(#loc210) + %y_371 = tt.reshape %ret_367 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc173) + %ileft_372 = arith.muli %y_371, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc175) + %ileft_373 = "tt.reduce"(%ileft_372) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_374 = tt.expand_dims %ileft_373 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc177) + %ileft_375 = tt.broadcast %ileft_374 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc178) + %iright_376 = arith.muli %y_371, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc179) + %iright_377 = "tt.reduce"(%iright_376) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_378 = tt.expand_dims %iright_377 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc181) + %iright_379 = tt.broadcast %iright_378 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc182) + %ileft_380 = tt.reshape %ileft_375 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_381 = tt.reshape %iright_379 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_382 = tt.reshape %new_idxs_370 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc185) + %left_idx_383 = arith.muli %y_idx_382, %ileft_121 : tensor<512x2x2xi32, #linear2> loc(#loc187) + %left_idx_384 = "tt.reduce"(%left_idx_383) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_385 = tt.expand_dims %left_idx_384 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc189) + %left_idx_386 = tt.broadcast %left_idx_385 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc190) + %right_idx_387 = arith.muli %y_idx_382, %flip_64 : tensor<512x2x2xi32, #linear2> loc(#loc192) + %right_idx_388 = "tt.reduce"(%right_idx_387) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_389 = tt.expand_dims %right_idx_388 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc194) + %right_idx_390 = tt.broadcast %right_idx_389 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc195) + %left_idx_391 = tt.reshape %left_idx_386 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_392 = tt.reshape %right_idx_390 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_393 = arith.cmpi slt, %ileft_380, %iright_381 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_394 = arith.cmpi eq, %ileft_380, %iright_381 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_395 = arith.cmpi sgt, %left_idx_391, %right_idx_392 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_396 = arith.andi %eq_394, %cond_395 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_397 = arith.ori %cond_393, %cond_396 : tensor<128x16xi1, #linear> loc(#loc202) + %ret_398 = arith.xori %ileft_380, %iright_381 : tensor<128x16xi32, #linear> loc(#loc205) + %ret_399 = arith.select %cond_397, %ret_398, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc206) + %ret_400 = arith.xori %ret_367, %ret_399 : tensor<128x16xi32, #linear> loc(#loc207) + %new_idxs_401 = arith.xori %left_idx_391, %right_idx_392 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_402 = arith.select %cond_397, %new_idxs_401, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_403 = arith.xori %new_idxs_370, %new_idxs_402 : tensor<128x16xi32, #linear> loc(#loc210) + %y_404 = tt.reshape %ret_400 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc173) + %ileft_405 = arith.muli %y_404, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc175) + %ileft_406 = "tt.reduce"(%ileft_405) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_407 = tt.expand_dims %ileft_406 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc177) + %ileft_408 = tt.broadcast %ileft_407 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc178) + %iright_409 = arith.muli %y_404, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc179) + %iright_410 = "tt.reduce"(%iright_409) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_411 = tt.expand_dims %iright_410 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc181) + %iright_412 = tt.broadcast %iright_411 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc182) + %ileft_413 = tt.reshape %ileft_408 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc183) + %iright_414 = tt.reshape %iright_412 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc184) + %y_idx_415 = tt.reshape %new_idxs_403 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc185) + %left_idx_416 = arith.muli %y_idx_415, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc187) + %left_idx_417 = "tt.reduce"(%left_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_418 = tt.expand_dims %left_idx_417 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc189) + %left_idx_419 = tt.broadcast %left_idx_418 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc190) + %right_idx_420 = arith.muli %y_idx_415, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc192) + %right_idx_421 = "tt.reduce"(%right_idx_420) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_422 = tt.expand_dims %right_idx_421 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc194) + %right_idx_423 = tt.broadcast %right_idx_422 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc195) + %left_idx_424 = tt.reshape %left_idx_419 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc196) + %right_idx_425 = tt.reshape %right_idx_423 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc197) + %cond_426 = arith.cmpi slt, %ileft_413, %iright_414 : tensor<128x16xi32, #linear> loc(#loc198) + %eq_427 = arith.cmpi eq, %ileft_413, %iright_414 : tensor<128x16xi32, #linear> loc(#loc199) + %cond_428 = arith.cmpi sgt, %left_idx_424, %right_idx_425 : tensor<128x16xi32, #linear> loc(#loc200) + %cond_429 = arith.andi %eq_427, %cond_428 : tensor<128x16xi1, #linear> loc(#loc201) + %cond_430 = arith.ori %cond_426, %cond_429 : tensor<128x16xi1, #linear> loc(#loc202) + %new_idxs_431 = arith.xori %left_idx_424, %right_idx_425 : tensor<128x16xi32, #linear> loc(#loc208) + %new_idxs_432 = arith.select %cond_430, %new_idxs_431, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc209) + %new_idxs_433 = arith.xori %new_idxs_403, %new_idxs_432 : tensor<128x16xi32, #linear> loc(#loc210) + %tmp7 = arith.extsi %tmp0_52 : tensor<128x16xi32, #blocked> to tensor<128x16xi64, #blocked> loc(#loc159) + %tmp10 = arith.select %tmp0_50, %tmp7, %cst_0 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc160) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_435: i64 loc(callsite(#loc1 at #loc161)), %tmp11_436: i64 loc(callsite(#loc1 at #loc161))): + %tmp11_437 = arith.addi %tmp11_435, %tmp11_436 : i64 loc(#loc211) + tt.reduce.return %tmp11_437 : i64 loc(#loc171) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc171) + %tmp11_434 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc162) + %tmp14 = arith.trunci %tmp11_434 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc163) + %0 = arith.muli %x0_28, %cst_2 : tensor<128x1xi64, #blocked1> loc(#loc74) + %1 = tt.broadcast %0 : tensor<128x1xi64, #blocked1> -> tensor<128x16xi64, #blocked1> loc(#loc75) + %2 = arith.addi %tmp0_32, %1 : tensor<128x16xi64, #blocked1> loc(#loc75) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc76) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc77) + %5 = arith.extui %4 : i1 to i64 loc(#loc78) + %6 = arith.muli %ks0, %5 : i64 loc(#loc78) + %7 = arith.extui %3 : i1 to i64 loc(#loc164) + %8 = arith.addi %7, %6 : i64 loc(#loc79) + %9 = tt.splat %8 : i64 -> tensor<128x1xi64, #blocked1> loc(#loc81) + %10 = tt.splat %8 : i64 -> tensor<128x1xi64, #blocked> loc(#loc81) + %11 = arith.muli %tmp0_36, %9 : tensor<128x1xi64, #blocked1> loc(#loc81) + %12 = tt.broadcast %11 : tensor<128x1xi64, #blocked1> -> tensor<128x16xi64, #blocked1> loc(#loc82) + %13 = arith.addi %2, %12 : tensor<128x16xi64, #blocked1> loc(#loc82) + %14 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked1> loc(#loc83) + %15 = tt.addptr %14, %13 : tensor<128x16x!tt.ptr, #blocked1>, tensor<128x16xi64, #blocked1> loc(#loc83) + %16 = ttg.convert_layout %new_idxs_433 : tensor<128x16xi32, #linear> -> tensor<128x16xi32, #blocked1> loc(#loc84) + tt.store %15, %16, %tmp0_51 : tensor<128x16x!tt.ptr, #blocked1> loc(#loc84) + %17 = arith.muli %x1, %10 : tensor<128x1xi64, #blocked> loc(#loc85) + %18 = arith.addi %x0_27, %17 : tensor<128x1xi64, #blocked> loc(#loc86) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc87) + %20 = tt.addptr %19, %18 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi64, #blocked> loc(#loc87) + tt.store %20, %tmp14, %xmask_17 : tensor<128x1x!tt.ptr, #blocked> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":25:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":26:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":32:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":33:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:42) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:64) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:61) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":37:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":39:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":41:19) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":43:34) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:29) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":47:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:62) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:88) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:79) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:70) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:54) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:47) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:40) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:102) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:34) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:30) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:89) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:4) +#loc96 = loc("xoffset"(#loc2)) +#loc97 = loc("xoffset"(#loc3)) +#loc98 = loc("xindex"(#loc4)) +#loc99 = loc("xindex"(#loc5)) +#loc100 = loc("xmask"(#loc6)) +#loc101 = loc("r0_index"(#loc7)) +#loc102 = loc("x0"(#loc8)) +#loc103 = loc("x1"(#loc9)) +#loc104 = loc("tmp0"(#loc10)) +#loc105 = loc("tmp0"(#loc11)) +#loc106 = loc("tmp0"(#loc12)) +#loc107 = loc("tmp0"(#loc13)) +#loc108 = loc("tmp0"(#loc14)) +#loc109 = loc("tmp0"(#loc15)) +#loc110 = loc("tmp0"(#loc16)) +#loc111 = loc("tmp0"(#loc17)) +#loc112 = loc("tmp0"(#loc18)) +#loc113 = loc("tmp0"(#loc19)) +#loc114 = loc("tmp2"(#loc20)) +#loc115 = loc("tmp4"(#loc21)) +#loc116 = loc("flip"(#loc22)) +#loc118 = loc("flip"(#loc25)) +#loc119 = loc("flip"(#loc26)) +#loc120 = loc("y"(#loc27)) +#loc121 = loc("left_mask"(#loc29)) +#loc122 = loc("ileft"(#loc30)) +#loc124 = loc("ileft"(#loc34)) +#loc125 = loc("ileft"(#loc35)) +#loc126 = loc("iright"(#loc36)) +#loc128 = loc("iright"(#loc38)) +#loc129 = loc("iright"(#loc39)) +#loc130 = loc("ileft"(#loc40)) +#loc131 = loc("iright"(#loc41)) +#loc132 = loc("y_idx"(#loc42)) +#loc133 = loc("left_idx"(#loc43)) +#loc134 = loc("left_idx"(#loc44)) +#loc135 = loc("input"(#loc45)) +#loc137 = loc("left_idx"(#loc47)) +#loc138 = loc("left_idx"(#loc48)) +#loc139 = loc("right_idx"(#loc49)) +#loc140 = loc("right_idx"(#loc50)) +#loc142 = loc("right_idx"(#loc52)) +#loc143 = loc("right_idx"(#loc53)) +#loc144 = loc("left_idx"(#loc54)) +#loc145 = loc("right_idx"(#loc55)) +#loc146 = loc("cond"(#loc56)) +#loc147 = loc("eq"(#loc57)) +#loc148 = loc("cond"(#loc58)) +#loc149 = loc("cond"(#loc59)) +#loc150 = loc("cond"(#loc60)) +#loc151 = loc("cond"(#loc61)) +#loc152 = loc("cond"(#loc62)) +#loc153 = loc("ret"(#loc63)) +#loc154 = loc("ret"(#loc64)) +#loc155 = loc("ret"(#loc65)) +#loc156 = loc("new_idxs"(#loc66)) +#loc157 = loc("new_idxs"(#loc67)) +#loc158 = loc("new_idxs"(#loc68)) +#loc159 = loc("tmp7"(#loc69)) +#loc160 = loc("tmp10"(#loc70)) +#loc162 = loc("tmp11"(#loc72)) +#loc163 = loc("tmp14"(#loc73)) +#loc164 = loc(fused[#loc79, #loc80]) +#loc165 = loc(callsite(#loc116 at #loc117)) +#loc166 = loc(callsite(#loc118 at #loc117)) +#loc167 = loc(callsite(#loc119 at #loc117)) +#loc169 = loc("cond"(#loc146)) +#loc170 = loc("eq"(#loc147)) +#loc171 = loc(callsite(#loc31 at #loc161)) +#loc173 = loc(callsite(#loc120 at #loc168)) +#loc174 = loc(callsite(#loc121 at #loc168)) +#loc175 = loc(callsite(#loc122 at #loc168)) +#loc177 = loc(callsite(#loc124 at #loc168)) +#loc178 = loc(callsite(#loc125 at #loc168)) +#loc179 = loc(callsite(#loc126 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc168)) +#loc182 = loc(callsite(#loc129 at #loc168)) +#loc183 = loc(callsite(#loc130 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc168)) +#loc185 = loc(callsite(#loc132 at #loc168)) +#loc186 = loc(callsite(#loc133 at #loc168)) +#loc187 = loc(callsite(#loc134 at #loc168)) +#loc189 = loc(callsite(#loc137 at #loc168)) +#loc190 = loc(callsite(#loc138 at #loc168)) +#loc191 = loc(callsite(#loc139 at #loc168)) +#loc192 = loc(callsite(#loc140 at #loc168)) +#loc194 = loc(callsite(#loc142 at #loc168)) +#loc195 = loc(callsite(#loc143 at #loc168)) +#loc196 = loc(callsite(#loc144 at #loc168)) +#loc197 = loc(callsite(#loc145 at #loc168)) +#loc198 = loc(callsite(#loc169 at #loc168)) +#loc199 = loc(callsite(#loc170 at #loc168)) +#loc200 = loc(callsite(#loc148 at #loc168)) +#loc201 = loc(callsite(#loc149 at #loc168)) +#loc202 = loc(callsite(#loc150 at #loc168)) +#loc203 = loc(callsite(#loc151 at #loc168)) +#loc204 = loc(callsite(#loc152 at #loc168)) +#loc205 = loc(callsite(#loc153 at #loc168)) +#loc206 = loc(callsite(#loc154 at #loc168)) +#loc207 = loc(callsite(#loc155 at #loc168)) +#loc208 = loc(callsite(#loc156 at #loc168)) +#loc209 = loc(callsite(#loc157 at #loc168)) +#loc210 = loc(callsite(#loc158 at #loc168)) +#loc211 = loc(callsite(#loc33 at #loc171)) +#loc212 = loc(callsite(#loc31 at #loc176)) +#loc214 = loc(callsite(#loc31 at #loc180)) +#loc216 = loc(callsite(#loc135 at #loc188)) +#loc217 = loc(callsite(#loc31 at #loc188)) +#loc219 = loc(callsite(#loc135 at #loc193)) +#loc220 = loc(callsite(#loc31 at #loc193)) +#loc222 = loc(callsite(#loc33 at #loc212)) +#loc223 = loc(callsite(#loc33 at #loc214)) +#loc224 = loc(callsite(#loc33 at #loc217)) +#loc225 = loc(callsite(#loc33 at #loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a08ead60f4797cade88ee5653b2b69f4502e6f82 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DFGMNNR6LCVENFLH6K3HA7AVIY42E2EU7IZDH6NRUAMT7WAOUTZQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,840 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":40:67) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:26) +#loc93 = loc("in_ptr0"(#loc)) +#loc94 = loc("out_ptr2"(#loc)) +#loc95 = loc("out_ptr3"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("xnumel"(#loc)) +#loc98 = loc("r0_numel"(#loc)) +#loc124 = loc(callsite(#loc27 at #loc2)) +#loc131 = loc("ileft"(#loc36)) +#loc135 = loc("iright"(#loc41)) +#loc144 = loc("left_idx"(#loc50)) +#loc149 = loc("right_idx"(#loc55)) +#loc168 = loc("tmp11"(#loc74)) +#loc176 = loc(callsite(#loc32 at #loc124)) +#loc180 = loc(callsite(#loc1 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc176)) +#loc188 = loc(callsite(#loc135 at #loc176)) +#loc196 = loc(callsite(#loc144 at #loc176)) +#loc201 = loc(callsite(#loc149 at #loc176)) +#loc221 = loc(callsite(#loc1 at #loc184)) +#loc223 = loc(callsite(#loc1 at #loc188)) +#loc226 = loc(callsite(#loc1 at #loc196)) +#loc229 = loc(callsite(#loc1 at #loc201)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc99) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc100) + %cst_1 = arith.constant dense<16> : tensor<128x1xi64> loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc103) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc104) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<128x1xi32> loc(#loc105) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<128x1xi32> loc(#loc105) + %xmask = tt.splat %xnumel : i32 -> tensor<128x1xi32> loc(#loc106) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128x1xi32> loc(#loc106) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc107) + %r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc108) + %x0 = arith.extsi %xindex_5 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc109) + %x0_8 = tt.splat %ks0 : i64 -> tensor<128x1xi64> loc(#loc109) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<128x1xi64> loc(#loc109) + %x1 = arith.divsi %x0, %x0_8 : tensor<128x1xi64> loc(#loc110) + %tmp0 = arith.extsi %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc111) + %tmp0_10 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<128x16xi64> loc(#loc111) + %tmp0_11 = tt.broadcast %x0_9 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc111) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<128x16xi64> loc(#loc111) + %tmp0_13 = arith.muli %x1, %cst_1 : tensor<128x1xi64> loc(#loc112) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc113) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<128x16xi64> loc(#loc113) + %tmp0_16 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc114) + %tmp0_17 = arith.muli %tmp0_16, %tmp0 : tensor<1x16xi64> loc(#loc114) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<1x16xi64> -> tensor<128x16xi64> loc(#loc115) + %tmp0_19 = arith.addi %tmp0_15, %tmp0_18 : tensor<128x16xi64> loc(#loc115) + %tmp0_20 = arith.muli %ks0, %c16_i64 : i64 loc(#loc116) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<128x1xi64> loc(#loc117) + %tmp0_22 = arith.muli %tmp0_21, %x1 : tensor<128x1xi64> loc(#loc117) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc118) + %tmp0_24 = arith.addi %tmp0_19, %tmp0_23 : tensor<128x16xi64> loc(#loc118) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc119) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<128x16x!tt.ptr>, tensor<128x16xi64> loc(#loc119) + %tmp0_27 = tt.broadcast %xmask_6 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc120) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<128x16x!tt.ptr> loc(#loc120) + %tmp2 = arith.trunci %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc121) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc122) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc172) + %flip_29 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc173) + %flip_30 = tt.expand_dims %flip_29 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc173) + %flip_31 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc174) + %flip_32 = tt.reshape %flip_31 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc175) + %y = tt.reshape %tmp0_28 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc181) + %left_mask = arith.subi %cst, %flip_30 : tensor<1x2x1xi32> loc(#loc182) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc183) + %ileft_33 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc183) + %ileft_34 = "tt.reduce"(%ileft_33) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc220) + %ileft_35 = tt.expand_dims %ileft_34 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc185) + %ileft_36 = tt.broadcast %ileft_35 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc186) + %iright = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc187) + %iright_37 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc187) + %iright_38 = "tt.reduce"(%iright_37) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc222) + %iright_39 = tt.expand_dims %iright_38 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc189) + %iright_40 = tt.broadcast %iright_39 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc190) + %ileft_41 = tt.reshape %ileft_36 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_42 = tt.reshape %iright_40 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx = tt.reshape %tmp4 : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc193) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc194) + %left_idx_43 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc195) + %left_idx_44 = arith.muli %y_idx, %left_idx_43 : tensor<1024x2x1xi16> loc(#loc195) + %input = arith.extsi %left_idx_44 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc224) + %left_idx_45 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc225) + %left_idx_46 = tt.expand_dims %left_idx_45 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc197) + %left_idx_47 = tt.broadcast %left_idx_46 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc198) + %right_idx = arith.trunci %flip_30 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc199) + %right_idx_48 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc200) + %right_idx_49 = arith.muli %y_idx, %right_idx_48 : tensor<1024x2x1xi16> loc(#loc200) + %input_50 = arith.extsi %right_idx_49 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc227) + %right_idx_51 = "tt.reduce"(%input_50) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc228) + %right_idx_52 = tt.expand_dims %right_idx_51 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc202) + %right_idx_53 = tt.broadcast %right_idx_52 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc203) + %left_idx_54 = tt.reshape %left_idx_47 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_55 = tt.reshape %right_idx_53 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc205) + %cond = arith.cmpi slt, %ileft_41, %iright_42 : tensor<128x16xi32> loc(#loc206) + %eq = arith.cmpi eq, %ileft_41, %iright_42 : tensor<128x16xi32> loc(#loc207) + %cond_56 = arith.cmpi sgt, %left_idx_54, %right_idx_55 : tensor<128x16xi32> loc(#loc208) + %cond_57 = arith.andi %eq, %cond_56 : tensor<128x16xi1> loc(#loc209) + %cond_58 = arith.ori %cond, %cond_57 : tensor<128x16xi1> loc(#loc210) + %cond_59 = arith.extui %cond_58 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_60 = arith.xori %cond_59, %flip_32 : tensor<128x16xi32> loc(#loc211) + %cond_61 = arith.cmpi ne, %cond_60, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret = arith.xori %ileft_41, %iright_42 : tensor<128x16xi32> loc(#loc213) + %ret_62 = arith.select %cond_61, %ret, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_63 = arith.xori %tmp0_28, %ret_62 : tensor<128x16xi32> loc(#loc215) + %new_idxs = arith.xori %left_idx_54, %right_idx_55 : tensor<128x16xi32> loc(#loc216) + %new_idxs_64 = arith.select %cond_61, %new_idxs, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_65 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc218) + %new_idxs_66 = tt.broadcast %new_idxs_65 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc218) + %new_idxs_67 = arith.xori %new_idxs_66, %new_idxs_64 : tensor<128x16xi32> loc(#loc218) + %flip_68 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc174) + %flip_69 = tt.reshape %flip_68 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc175) + %y_70 = tt.reshape %ret_63 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc181) + %ileft_71 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc183) + %ileft_72 = arith.muli %y_70, %ileft_71 : tensor<512x2x2xi32> loc(#loc183) + %ileft_73 = "tt.reduce"(%ileft_72) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc220) + %ileft_74 = tt.expand_dims %ileft_73 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc185) + %ileft_75 = tt.broadcast %ileft_74 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc186) + %iright_76 = arith.muli %y_70, %flip_31 : tensor<512x2x2xi32> loc(#loc187) + %iright_77 = "tt.reduce"(%iright_76) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc222) + %iright_78 = tt.expand_dims %iright_77 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc189) + %iright_79 = tt.broadcast %iright_78 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc190) + %ileft_80 = tt.reshape %ileft_75 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_81 = tt.reshape %iright_79 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_82 = tt.reshape %new_idxs_67 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc193) + %left_idx_83 = arith.muli %y_idx_82, %ileft_71 : tensor<512x2x2xi32> loc(#loc195) + %left_idx_84 = "tt.reduce"(%left_idx_83) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc225) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc197) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc198) + %right_idx_87 = arith.muli %y_idx_82, %flip_31 : tensor<512x2x2xi32> loc(#loc200) + %right_idx_88 = "tt.reduce"(%right_idx_87) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc228) + %right_idx_89 = tt.expand_dims %right_idx_88 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc202) + %right_idx_90 = tt.broadcast %right_idx_89 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc203) + %left_idx_91 = tt.reshape %left_idx_86 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_92 = tt.reshape %right_idx_90 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_93 = arith.cmpi slt, %ileft_80, %iright_81 : tensor<128x16xi32> loc(#loc206) + %eq_94 = arith.cmpi eq, %ileft_80, %iright_81 : tensor<128x16xi32> loc(#loc207) + %cond_95 = arith.cmpi sgt, %left_idx_91, %right_idx_92 : tensor<128x16xi32> loc(#loc208) + %cond_96 = arith.andi %eq_94, %cond_95 : tensor<128x16xi1> loc(#loc209) + %cond_97 = arith.ori %cond_93, %cond_96 : tensor<128x16xi1> loc(#loc210) + %cond_98 = arith.extui %cond_97 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_99 = arith.xori %cond_98, %flip_69 : tensor<128x16xi32> loc(#loc211) + %cond_100 = arith.cmpi ne, %cond_99, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret_101 = arith.xori %ileft_80, %iright_81 : tensor<128x16xi32> loc(#loc213) + %ret_102 = arith.select %cond_100, %ret_101, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_103 = arith.xori %ret_63, %ret_102 : tensor<128x16xi32> loc(#loc215) + %new_idxs_104 = arith.xori %left_idx_91, %right_idx_92 : tensor<128x16xi32> loc(#loc216) + %new_idxs_105 = arith.select %cond_100, %new_idxs_104, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_106 = arith.xori %new_idxs_67, %new_idxs_105 : tensor<128x16xi32> loc(#loc218) + %y_107 = tt.reshape %ret_103 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc181) + %ileft_108 = arith.muli %y_107, %ileft : tensor<1024x2x1xi32> loc(#loc183) + %ileft_109 = "tt.reduce"(%ileft_108) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc220) + %ileft_110 = tt.expand_dims %ileft_109 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc185) + %ileft_111 = tt.broadcast %ileft_110 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc186) + %iright_112 = arith.muli %y_107, %iright : tensor<1024x2x1xi32> loc(#loc187) + %iright_113 = "tt.reduce"(%iright_112) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc222) + %iright_114 = tt.expand_dims %iright_113 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc189) + %iright_115 = tt.broadcast %iright_114 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc190) + %ileft_116 = tt.reshape %ileft_111 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_117 = tt.reshape %iright_115 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_118 = tt.reshape %new_idxs_106 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %left_idx_119 = arith.muli %y_idx_118, %ileft : tensor<1024x2x1xi32> loc(#loc195) + %left_idx_120 = "tt.reduce"(%left_idx_119) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc225) + %left_idx_121 = tt.expand_dims %left_idx_120 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc197) + %left_idx_122 = tt.broadcast %left_idx_121 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc198) + %right_idx_123 = arith.muli %y_idx_118, %iright : tensor<1024x2x1xi32> loc(#loc200) + %right_idx_124 = "tt.reduce"(%right_idx_123) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc228) + %right_idx_125 = tt.expand_dims %right_idx_124 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc202) + %right_idx_126 = tt.broadcast %right_idx_125 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc203) + %left_idx_127 = tt.reshape %left_idx_122 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_128 = tt.reshape %right_idx_126 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_129 = arith.cmpi slt, %ileft_116, %iright_117 : tensor<128x16xi32> loc(#loc206) + %eq_130 = arith.cmpi eq, %ileft_116, %iright_117 : tensor<128x16xi32> loc(#loc207) + %cond_131 = arith.cmpi sgt, %left_idx_127, %right_idx_128 : tensor<128x16xi32> loc(#loc208) + %cond_132 = arith.andi %eq_130, %cond_131 : tensor<128x16xi1> loc(#loc209) + %cond_133 = arith.ori %cond_129, %cond_132 : tensor<128x16xi1> loc(#loc210) + %cond_134 = arith.extui %cond_133 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_135 = arith.xori %cond_134, %flip_69 : tensor<128x16xi32> loc(#loc211) + %cond_136 = arith.cmpi ne, %cond_135, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret_137 = arith.xori %ileft_116, %iright_117 : tensor<128x16xi32> loc(#loc213) + %ret_138 = arith.select %cond_136, %ret_137, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_139 = arith.xori %ret_103, %ret_138 : tensor<128x16xi32> loc(#loc215) + %new_idxs_140 = arith.xori %left_idx_127, %right_idx_128 : tensor<128x16xi32> loc(#loc216) + %new_idxs_141 = arith.select %cond_136, %new_idxs_140, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_142 = arith.xori %new_idxs_106, %new_idxs_141 : tensor<128x16xi32> loc(#loc218) + %flip_143 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc174) + %flip_144 = tt.reshape %flip_143 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc175) + %y_145 = tt.reshape %ret_139 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc181) + %ileft_146 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc183) + %ileft_147 = arith.muli %y_145, %ileft_146 : tensor<256x2x4xi32> loc(#loc183) + %ileft_148 = "tt.reduce"(%ileft_147) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc220) + %ileft_149 = tt.expand_dims %ileft_148 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc185) + %ileft_150 = tt.broadcast %ileft_149 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc186) + %iright_151 = arith.muli %y_145, %flip_68 : tensor<256x2x4xi32> loc(#loc187) + %iright_152 = "tt.reduce"(%iright_151) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc222) + %iright_153 = tt.expand_dims %iright_152 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc189) + %iright_154 = tt.broadcast %iright_153 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc190) + %ileft_155 = tt.reshape %ileft_150 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_156 = tt.reshape %iright_154 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_157 = tt.reshape %new_idxs_142 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc193) + %left_idx_158 = arith.muli %y_idx_157, %ileft_146 : tensor<256x2x4xi32> loc(#loc195) + %left_idx_159 = "tt.reduce"(%left_idx_158) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc225) + %left_idx_160 = tt.expand_dims %left_idx_159 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc197) + %left_idx_161 = tt.broadcast %left_idx_160 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc198) + %right_idx_162 = arith.muli %y_idx_157, %flip_68 : tensor<256x2x4xi32> loc(#loc200) + %right_idx_163 = "tt.reduce"(%right_idx_162) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc228) + %right_idx_164 = tt.expand_dims %right_idx_163 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc202) + %right_idx_165 = tt.broadcast %right_idx_164 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc203) + %left_idx_166 = tt.reshape %left_idx_161 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_167 = tt.reshape %right_idx_165 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_168 = arith.cmpi slt, %ileft_155, %iright_156 : tensor<128x16xi32> loc(#loc206) + %eq_169 = arith.cmpi eq, %ileft_155, %iright_156 : tensor<128x16xi32> loc(#loc207) + %cond_170 = arith.cmpi sgt, %left_idx_166, %right_idx_167 : tensor<128x16xi32> loc(#loc208) + %cond_171 = arith.andi %eq_169, %cond_170 : tensor<128x16xi1> loc(#loc209) + %cond_172 = arith.ori %cond_168, %cond_171 : tensor<128x16xi1> loc(#loc210) + %cond_173 = arith.extui %cond_172 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_174 = arith.xori %cond_173, %flip_144 : tensor<128x16xi32> loc(#loc211) + %cond_175 = arith.cmpi ne, %cond_174, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret_176 = arith.xori %ileft_155, %iright_156 : tensor<128x16xi32> loc(#loc213) + %ret_177 = arith.select %cond_175, %ret_176, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_178 = arith.xori %ret_139, %ret_177 : tensor<128x16xi32> loc(#loc215) + %new_idxs_179 = arith.xori %left_idx_166, %right_idx_167 : tensor<128x16xi32> loc(#loc216) + %new_idxs_180 = arith.select %cond_175, %new_idxs_179, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_181 = arith.xori %new_idxs_142, %new_idxs_180 : tensor<128x16xi32> loc(#loc218) + %y_182 = tt.reshape %ret_178 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc181) + %ileft_183 = arith.muli %y_182, %ileft_71 : tensor<512x2x2xi32> loc(#loc183) + %ileft_184 = "tt.reduce"(%ileft_183) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc220) + %ileft_185 = tt.expand_dims %ileft_184 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc185) + %ileft_186 = tt.broadcast %ileft_185 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc186) + %iright_187 = arith.muli %y_182, %flip_31 : tensor<512x2x2xi32> loc(#loc187) + %iright_188 = "tt.reduce"(%iright_187) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc222) + %iright_189 = tt.expand_dims %iright_188 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc189) + %iright_190 = tt.broadcast %iright_189 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc190) + %ileft_191 = tt.reshape %ileft_186 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_192 = tt.reshape %iright_190 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_193 = tt.reshape %new_idxs_181 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc193) + %left_idx_194 = arith.muli %y_idx_193, %ileft_71 : tensor<512x2x2xi32> loc(#loc195) + %left_idx_195 = "tt.reduce"(%left_idx_194) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc225) + %left_idx_196 = tt.expand_dims %left_idx_195 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc197) + %left_idx_197 = tt.broadcast %left_idx_196 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc198) + %right_idx_198 = arith.muli %y_idx_193, %flip_31 : tensor<512x2x2xi32> loc(#loc200) + %right_idx_199 = "tt.reduce"(%right_idx_198) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc228) + %right_idx_200 = tt.expand_dims %right_idx_199 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc202) + %right_idx_201 = tt.broadcast %right_idx_200 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc203) + %left_idx_202 = tt.reshape %left_idx_197 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_203 = tt.reshape %right_idx_201 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_204 = arith.cmpi slt, %ileft_191, %iright_192 : tensor<128x16xi32> loc(#loc206) + %eq_205 = arith.cmpi eq, %ileft_191, %iright_192 : tensor<128x16xi32> loc(#loc207) + %cond_206 = arith.cmpi sgt, %left_idx_202, %right_idx_203 : tensor<128x16xi32> loc(#loc208) + %cond_207 = arith.andi %eq_205, %cond_206 : tensor<128x16xi1> loc(#loc209) + %cond_208 = arith.ori %cond_204, %cond_207 : tensor<128x16xi1> loc(#loc210) + %cond_209 = arith.extui %cond_208 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_210 = arith.xori %cond_209, %flip_144 : tensor<128x16xi32> loc(#loc211) + %cond_211 = arith.cmpi ne, %cond_210, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret_212 = arith.xori %ileft_191, %iright_192 : tensor<128x16xi32> loc(#loc213) + %ret_213 = arith.select %cond_211, %ret_212, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_214 = arith.xori %ret_178, %ret_213 : tensor<128x16xi32> loc(#loc215) + %new_idxs_215 = arith.xori %left_idx_202, %right_idx_203 : tensor<128x16xi32> loc(#loc216) + %new_idxs_216 = arith.select %cond_211, %new_idxs_215, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_217 = arith.xori %new_idxs_181, %new_idxs_216 : tensor<128x16xi32> loc(#loc218) + %y_218 = tt.reshape %ret_214 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc181) + %ileft_219 = arith.muli %y_218, %ileft : tensor<1024x2x1xi32> loc(#loc183) + %ileft_220 = "tt.reduce"(%ileft_219) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc220) + %ileft_221 = tt.expand_dims %ileft_220 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc185) + %ileft_222 = tt.broadcast %ileft_221 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc186) + %iright_223 = arith.muli %y_218, %iright : tensor<1024x2x1xi32> loc(#loc187) + %iright_224 = "tt.reduce"(%iright_223) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc222) + %iright_225 = tt.expand_dims %iright_224 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc189) + %iright_226 = tt.broadcast %iright_225 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc190) + %ileft_227 = tt.reshape %ileft_222 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_228 = tt.reshape %iright_226 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_229 = tt.reshape %new_idxs_217 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %left_idx_230 = arith.muli %y_idx_229, %ileft : tensor<1024x2x1xi32> loc(#loc195) + %left_idx_231 = "tt.reduce"(%left_idx_230) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc225) + %left_idx_232 = tt.expand_dims %left_idx_231 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc197) + %left_idx_233 = tt.broadcast %left_idx_232 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc198) + %right_idx_234 = arith.muli %y_idx_229, %iright : tensor<1024x2x1xi32> loc(#loc200) + %right_idx_235 = "tt.reduce"(%right_idx_234) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc228) + %right_idx_236 = tt.expand_dims %right_idx_235 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc202) + %right_idx_237 = tt.broadcast %right_idx_236 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc203) + %left_idx_238 = tt.reshape %left_idx_233 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_239 = tt.reshape %right_idx_237 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_240 = arith.cmpi slt, %ileft_227, %iright_228 : tensor<128x16xi32> loc(#loc206) + %eq_241 = arith.cmpi eq, %ileft_227, %iright_228 : tensor<128x16xi32> loc(#loc207) + %cond_242 = arith.cmpi sgt, %left_idx_238, %right_idx_239 : tensor<128x16xi32> loc(#loc208) + %cond_243 = arith.andi %eq_241, %cond_242 : tensor<128x16xi1> loc(#loc209) + %cond_244 = arith.ori %cond_240, %cond_243 : tensor<128x16xi1> loc(#loc210) + %cond_245 = arith.extui %cond_244 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc211) + %cond_246 = arith.xori %cond_245, %flip_144 : tensor<128x16xi32> loc(#loc211) + %cond_247 = arith.cmpi ne, %cond_246, %cst_0 : tensor<128x16xi32> loc(#loc212) + %ret_248 = arith.xori %ileft_227, %iright_228 : tensor<128x16xi32> loc(#loc213) + %ret_249 = arith.select %cond_247, %ret_248, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_250 = arith.xori %ret_214, %ret_249 : tensor<128x16xi32> loc(#loc215) + %new_idxs_251 = arith.xori %left_idx_238, %right_idx_239 : tensor<128x16xi32> loc(#loc216) + %new_idxs_252 = arith.select %cond_247, %new_idxs_251, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_253 = arith.xori %new_idxs_217, %new_idxs_252 : tensor<128x16xi32> loc(#loc218) + %y_254 = tt.reshape %ret_250 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc181) + %ileft_255 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc183) + %ileft_256 = arith.muli %y_254, %ileft_255 : tensor<128x2x8xi32> loc(#loc183) + %ileft_257 = "tt.reduce"(%ileft_256) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc220) + %ileft_258 = tt.expand_dims %ileft_257 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc185) + %ileft_259 = tt.broadcast %ileft_258 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc186) + %iright_260 = arith.muli %y_254, %flip_143 : tensor<128x2x8xi32> loc(#loc187) + %iright_261 = "tt.reduce"(%iright_260) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc222) + %iright_262 = tt.expand_dims %iright_261 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc189) + %iright_263 = tt.broadcast %iright_262 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc190) + %ileft_264 = tt.reshape %ileft_259 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_265 = tt.reshape %iright_263 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_266 = tt.reshape %new_idxs_253 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc193) + %left_idx_267 = arith.muli %y_idx_266, %ileft_255 : tensor<128x2x8xi32> loc(#loc195) + %left_idx_268 = "tt.reduce"(%left_idx_267) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc225) + %left_idx_269 = tt.expand_dims %left_idx_268 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc197) + %left_idx_270 = tt.broadcast %left_idx_269 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc198) + %right_idx_271 = arith.muli %y_idx_266, %flip_143 : tensor<128x2x8xi32> loc(#loc200) + %right_idx_272 = "tt.reduce"(%right_idx_271) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc228) + %right_idx_273 = tt.expand_dims %right_idx_272 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc202) + %right_idx_274 = tt.broadcast %right_idx_273 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc203) + %left_idx_275 = tt.reshape %left_idx_270 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_276 = tt.reshape %right_idx_274 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_277 = arith.cmpi slt, %ileft_264, %iright_265 : tensor<128x16xi32> loc(#loc206) + %eq_278 = arith.cmpi eq, %ileft_264, %iright_265 : tensor<128x16xi32> loc(#loc207) + %cond_279 = arith.cmpi sgt, %left_idx_275, %right_idx_276 : tensor<128x16xi32> loc(#loc208) + %cond_280 = arith.andi %eq_278, %cond_279 : tensor<128x16xi1> loc(#loc209) + %cond_281 = arith.ori %cond_277, %cond_280 : tensor<128x16xi1> loc(#loc210) + %ret_282 = arith.xori %ileft_264, %iright_265 : tensor<128x16xi32> loc(#loc213) + %ret_283 = arith.select %cond_281, %ret_282, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_284 = arith.xori %ret_250, %ret_283 : tensor<128x16xi32> loc(#loc215) + %new_idxs_285 = arith.xori %left_idx_275, %right_idx_276 : tensor<128x16xi32> loc(#loc216) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_287 = arith.xori %new_idxs_253, %new_idxs_286 : tensor<128x16xi32> loc(#loc218) + %y_288 = tt.reshape %ret_284 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc181) + %ileft_289 = arith.muli %y_288, %ileft_146 : tensor<256x2x4xi32> loc(#loc183) + %ileft_290 = "tt.reduce"(%ileft_289) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc220) + %ileft_291 = tt.expand_dims %ileft_290 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc185) + %ileft_292 = tt.broadcast %ileft_291 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc186) + %iright_293 = arith.muli %y_288, %flip_68 : tensor<256x2x4xi32> loc(#loc187) + %iright_294 = "tt.reduce"(%iright_293) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc222) + %iright_295 = tt.expand_dims %iright_294 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc189) + %iright_296 = tt.broadcast %iright_295 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc190) + %ileft_297 = tt.reshape %ileft_292 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_298 = tt.reshape %iright_296 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_299 = tt.reshape %new_idxs_287 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc193) + %left_idx_300 = arith.muli %y_idx_299, %ileft_146 : tensor<256x2x4xi32> loc(#loc195) + %left_idx_301 = "tt.reduce"(%left_idx_300) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc225) + %left_idx_302 = tt.expand_dims %left_idx_301 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc197) + %left_idx_303 = tt.broadcast %left_idx_302 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc198) + %right_idx_304 = arith.muli %y_idx_299, %flip_68 : tensor<256x2x4xi32> loc(#loc200) + %right_idx_305 = "tt.reduce"(%right_idx_304) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc228) + %right_idx_306 = tt.expand_dims %right_idx_305 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc202) + %right_idx_307 = tt.broadcast %right_idx_306 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc203) + %left_idx_308 = tt.reshape %left_idx_303 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_309 = tt.reshape %right_idx_307 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_310 = arith.cmpi slt, %ileft_297, %iright_298 : tensor<128x16xi32> loc(#loc206) + %eq_311 = arith.cmpi eq, %ileft_297, %iright_298 : tensor<128x16xi32> loc(#loc207) + %cond_312 = arith.cmpi sgt, %left_idx_308, %right_idx_309 : tensor<128x16xi32> loc(#loc208) + %cond_313 = arith.andi %eq_311, %cond_312 : tensor<128x16xi1> loc(#loc209) + %cond_314 = arith.ori %cond_310, %cond_313 : tensor<128x16xi1> loc(#loc210) + %ret_315 = arith.xori %ileft_297, %iright_298 : tensor<128x16xi32> loc(#loc213) + %ret_316 = arith.select %cond_314, %ret_315, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_317 = arith.xori %ret_284, %ret_316 : tensor<128x16xi32> loc(#loc215) + %new_idxs_318 = arith.xori %left_idx_308, %right_idx_309 : tensor<128x16xi32> loc(#loc216) + %new_idxs_319 = arith.select %cond_314, %new_idxs_318, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_320 = arith.xori %new_idxs_287, %new_idxs_319 : tensor<128x16xi32> loc(#loc218) + %y_321 = tt.reshape %ret_317 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc181) + %ileft_322 = arith.muli %y_321, %ileft_71 : tensor<512x2x2xi32> loc(#loc183) + %ileft_323 = "tt.reduce"(%ileft_322) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc220) + %ileft_324 = tt.expand_dims %ileft_323 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc185) + %ileft_325 = tt.broadcast %ileft_324 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc186) + %iright_326 = arith.muli %y_321, %flip_31 : tensor<512x2x2xi32> loc(#loc187) + %iright_327 = "tt.reduce"(%iright_326) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc222) + %iright_328 = tt.expand_dims %iright_327 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc189) + %iright_329 = tt.broadcast %iright_328 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc190) + %ileft_330 = tt.reshape %ileft_325 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_331 = tt.reshape %iright_329 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_332 = tt.reshape %new_idxs_320 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc193) + %left_idx_333 = arith.muli %y_idx_332, %ileft_71 : tensor<512x2x2xi32> loc(#loc195) + %left_idx_334 = "tt.reduce"(%left_idx_333) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc225) + %left_idx_335 = tt.expand_dims %left_idx_334 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc197) + %left_idx_336 = tt.broadcast %left_idx_335 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc198) + %right_idx_337 = arith.muli %y_idx_332, %flip_31 : tensor<512x2x2xi32> loc(#loc200) + %right_idx_338 = "tt.reduce"(%right_idx_337) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc228) + %right_idx_339 = tt.expand_dims %right_idx_338 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc202) + %right_idx_340 = tt.broadcast %right_idx_339 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc203) + %left_idx_341 = tt.reshape %left_idx_336 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_342 = tt.reshape %right_idx_340 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_343 = arith.cmpi slt, %ileft_330, %iright_331 : tensor<128x16xi32> loc(#loc206) + %eq_344 = arith.cmpi eq, %ileft_330, %iright_331 : tensor<128x16xi32> loc(#loc207) + %cond_345 = arith.cmpi sgt, %left_idx_341, %right_idx_342 : tensor<128x16xi32> loc(#loc208) + %cond_346 = arith.andi %eq_344, %cond_345 : tensor<128x16xi1> loc(#loc209) + %cond_347 = arith.ori %cond_343, %cond_346 : tensor<128x16xi1> loc(#loc210) + %ret_348 = arith.xori %ileft_330, %iright_331 : tensor<128x16xi32> loc(#loc213) + %ret_349 = arith.select %cond_347, %ret_348, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc214) + %ret_350 = arith.xori %ret_317, %ret_349 : tensor<128x16xi32> loc(#loc215) + %new_idxs_351 = arith.xori %left_idx_341, %right_idx_342 : tensor<128x16xi32> loc(#loc216) + %new_idxs_352 = arith.select %cond_347, %new_idxs_351, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_353 = arith.xori %new_idxs_320, %new_idxs_352 : tensor<128x16xi32> loc(#loc218) + %y_354 = tt.reshape %ret_350 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc181) + %ileft_355 = arith.muli %y_354, %ileft : tensor<1024x2x1xi32> loc(#loc183) + %ileft_356 = "tt.reduce"(%ileft_355) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc220) + %ileft_357 = tt.expand_dims %ileft_356 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc185) + %ileft_358 = tt.broadcast %ileft_357 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc186) + %iright_359 = arith.muli %y_354, %iright : tensor<1024x2x1xi32> loc(#loc187) + %iright_360 = "tt.reduce"(%iright_359) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc222) + %iright_361 = tt.expand_dims %iright_360 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc189) + %iright_362 = tt.broadcast %iright_361 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc190) + %ileft_363 = tt.reshape %ileft_358 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc191) + %iright_364 = tt.reshape %iright_362 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc192) + %y_idx_365 = tt.reshape %new_idxs_353 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %left_idx_366 = arith.muli %y_idx_365, %ileft : tensor<1024x2x1xi32> loc(#loc195) + %left_idx_367 = "tt.reduce"(%left_idx_366) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc225) + %left_idx_368 = tt.expand_dims %left_idx_367 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc197) + %left_idx_369 = tt.broadcast %left_idx_368 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc198) + %right_idx_370 = arith.muli %y_idx_365, %iright : tensor<1024x2x1xi32> loc(#loc200) + %right_idx_371 = "tt.reduce"(%right_idx_370) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc228) + %right_idx_372 = tt.expand_dims %right_idx_371 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc202) + %right_idx_373 = tt.broadcast %right_idx_372 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc203) + %left_idx_374 = tt.reshape %left_idx_369 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc204) + %right_idx_375 = tt.reshape %right_idx_373 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc205) + %cond_376 = arith.cmpi slt, %ileft_363, %iright_364 : tensor<128x16xi32> loc(#loc206) + %eq_377 = arith.cmpi eq, %ileft_363, %iright_364 : tensor<128x16xi32> loc(#loc207) + %cond_378 = arith.cmpi sgt, %left_idx_374, %right_idx_375 : tensor<128x16xi32> loc(#loc208) + %cond_379 = arith.andi %eq_377, %cond_378 : tensor<128x16xi1> loc(#loc209) + %cond_380 = arith.ori %cond_376, %cond_379 : tensor<128x16xi1> loc(#loc210) + %new_idxs_381 = arith.xori %left_idx_374, %right_idx_375 : tensor<128x16xi32> loc(#loc216) + %new_idxs_382 = arith.select %cond_380, %new_idxs_381, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc217) + %new_idxs_383 = arith.xori %new_idxs_353, %new_idxs_382 : tensor<128x16xi32> loc(#loc218) + %tmp7 = arith.extsi %tmp0_28 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc167) + %tmp10_384 = arith.select %tmp0_27, %tmp7, %tmp10 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc100) + %tmp11 = "tt.reduce"(%tmp10_384) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_386: i64 loc(callsite(#loc1 at #loc168)), %tmp11_387: i64 loc(callsite(#loc1 at #loc168))): + %tmp11_388 = arith.addi %tmp11_386, %tmp11_387 : i64 loc(#loc219) + tt.reduce.return %tmp11_388 : i64 loc(#loc179) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc179) + %tmp11_385 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc169) + %tmp14 = arith.trunci %tmp11_385 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc170) + %0 = arith.muli %x0_9, %cst_1 : tensor<128x1xi64> loc(#loc77) + %1 = tt.broadcast %0 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc78) + %2 = arith.addi %tmp0_10, %1 : tensor<128x16xi64> loc(#loc78) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc79) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc80) + %5 = arith.extui %4 : i1 to i64 loc(#loc81) + %6 = arith.muli %ks0, %5 : i64 loc(#loc81) + %7 = arith.extui %3 : i1 to i64 loc(#loc171) + %8 = arith.addi %7, %6 : i64 loc(#loc82) + %9 = tt.splat %8 : i64 -> tensor<128x1xi64> loc(#loc84) + %10 = arith.muli %tmp0_13, %9 : tensor<128x1xi64> loc(#loc84) + %11 = tt.broadcast %10 : tensor<128x1xi64> -> tensor<128x16xi64> loc(#loc85) + %12 = arith.addi %2, %11 : tensor<128x16xi64> loc(#loc85) + %13 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc86) + %14 = tt.addptr %13, %12 : tensor<128x16x!tt.ptr>, tensor<128x16xi64> loc(#loc86) + tt.store %14, %new_idxs_383, %tmp0_27 : tensor<128x16x!tt.ptr> loc(#loc87) + %15 = arith.muli %x1, %9 : tensor<128x1xi64> loc(#loc88) + %16 = arith.addi %x0_9, %15 : tensor<128x1xi64> loc(#loc89) + %17 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc90) + %18 = tt.addptr %17, %16 : tensor<128x1x!tt.ptr>, tensor<128x1xi64> loc(#loc90) + tt.store %18, %tmp14, %xmask_6 : tensor<128x1x!tt.ptr> loc(#loc91) + tt.return loc(#loc92) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":43:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":25:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":26:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":26:38) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":39:33) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":41:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":44:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":47:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:32) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:62) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:88) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:79) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:70) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:54) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:40) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":48:102) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:34) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:30) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:89) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/pf/cpfhrnxtrt5f4ofde5sv4zdr625vgwb7w2tj7tviz3j3ruexout3.py":49:4) +#loc99 = loc(callsite(#loc1 at #loc2)) +#loc100 = loc("tmp10"(#loc3)) +#loc101 = loc("xoffset"(#loc4)) +#loc102 = loc("xoffset"(#loc5)) +#loc103 = loc("xindex"(#loc6)) +#loc104 = loc("xindex"(#loc7)) +#loc105 = loc("xindex"(#loc8)) +#loc106 = loc("xmask"(#loc9)) +#loc107 = loc("r0_index"(#loc10)) +#loc108 = loc("r0_index"(#loc11)) +#loc109 = loc("x0"(#loc12)) +#loc110 = loc("x1"(#loc13)) +#loc111 = loc("tmp0"(#loc14)) +#loc112 = loc("tmp0"(#loc15)) +#loc113 = loc("tmp0"(#loc16)) +#loc114 = loc("tmp0"(#loc17)) +#loc115 = loc("tmp0"(#loc18)) +#loc116 = loc("tmp0"(#loc19)) +#loc117 = loc("tmp0"(#loc20)) +#loc118 = loc("tmp0"(#loc21)) +#loc119 = loc("tmp0"(#loc22)) +#loc120 = loc("tmp0"(#loc23)) +#loc121 = loc("tmp2"(#loc24)) +#loc122 = loc("tmp4"(#loc25)) +#loc123 = loc("flip"(#loc26)) +#loc125 = loc("flip"(#loc28)) +#loc126 = loc("flip"(#loc29)) +#loc127 = loc("flip"(#loc30)) +#loc128 = loc("y"(#loc31)) +#loc129 = loc("left_mask"(#loc33)) +#loc130 = loc("ileft"(#loc34)) +#loc132 = loc("ileft"(#loc38)) +#loc133 = loc("ileft"(#loc39)) +#loc134 = loc("iright"(#loc40)) +#loc136 = loc("iright"(#loc42)) +#loc137 = loc("iright"(#loc43)) +#loc138 = loc("ileft"(#loc44)) +#loc139 = loc("iright"(#loc45)) +#loc140 = loc("y_idx"(#loc46)) +#loc141 = loc("left_idx"(#loc47)) +#loc142 = loc("left_idx"(#loc48)) +#loc143 = loc("input"(#loc49)) +#loc145 = loc("left_idx"(#loc51)) +#loc146 = loc("left_idx"(#loc52)) +#loc147 = loc("right_idx"(#loc53)) +#loc148 = loc("right_idx"(#loc54)) +#loc150 = loc("right_idx"(#loc56)) +#loc151 = loc("right_idx"(#loc57)) +#loc152 = loc("left_idx"(#loc58)) +#loc153 = loc("right_idx"(#loc59)) +#loc154 = loc("cond"(#loc60)) +#loc155 = loc("eq"(#loc61)) +#loc156 = loc("cond"(#loc62)) +#loc157 = loc("cond"(#loc63)) +#loc158 = loc("cond"(#loc64)) +#loc159 = loc("cond"(#loc65)) +#loc160 = loc("cond"(#loc66)) +#loc161 = loc("ret"(#loc67)) +#loc162 = loc("ret"(#loc68)) +#loc163 = loc("ret"(#loc69)) +#loc164 = loc("new_idxs"(#loc70)) +#loc165 = loc("new_idxs"(#loc71)) +#loc166 = loc("new_idxs"(#loc72)) +#loc167 = loc("tmp7"(#loc73)) +#loc169 = loc("tmp11"(#loc75)) +#loc170 = loc("tmp14"(#loc76)) +#loc171 = loc(fused[#loc82, #loc83]) +#loc172 = loc(callsite(#loc123 at #loc124)) +#loc173 = loc(callsite(#loc125 at #loc124)) +#loc174 = loc(callsite(#loc126 at #loc124)) +#loc175 = loc(callsite(#loc127 at #loc124)) +#loc177 = loc("cond"(#loc154)) +#loc178 = loc("eq"(#loc155)) +#loc179 = loc(callsite(#loc35 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc176)) +#loc182 = loc(callsite(#loc129 at #loc176)) +#loc183 = loc(callsite(#loc130 at #loc176)) +#loc185 = loc(callsite(#loc132 at #loc176)) +#loc186 = loc(callsite(#loc133 at #loc176)) +#loc187 = loc(callsite(#loc134 at #loc176)) +#loc189 = loc(callsite(#loc136 at #loc176)) +#loc190 = loc(callsite(#loc137 at #loc176)) +#loc191 = loc(callsite(#loc138 at #loc176)) +#loc192 = loc(callsite(#loc139 at #loc176)) +#loc193 = loc(callsite(#loc140 at #loc176)) +#loc194 = loc(callsite(#loc141 at #loc176)) +#loc195 = loc(callsite(#loc142 at #loc176)) +#loc197 = loc(callsite(#loc145 at #loc176)) +#loc198 = loc(callsite(#loc146 at #loc176)) +#loc199 = loc(callsite(#loc147 at #loc176)) +#loc200 = loc(callsite(#loc148 at #loc176)) +#loc202 = loc(callsite(#loc150 at #loc176)) +#loc203 = loc(callsite(#loc151 at #loc176)) +#loc204 = loc(callsite(#loc152 at #loc176)) +#loc205 = loc(callsite(#loc153 at #loc176)) +#loc206 = loc(callsite(#loc177 at #loc176)) +#loc207 = loc(callsite(#loc178 at #loc176)) +#loc208 = loc(callsite(#loc156 at #loc176)) +#loc209 = loc(callsite(#loc157 at #loc176)) +#loc210 = loc(callsite(#loc158 at #loc176)) +#loc211 = loc(callsite(#loc159 at #loc176)) +#loc212 = loc(callsite(#loc160 at #loc176)) +#loc213 = loc(callsite(#loc161 at #loc176)) +#loc214 = loc(callsite(#loc162 at #loc176)) +#loc215 = loc(callsite(#loc163 at #loc176)) +#loc216 = loc(callsite(#loc164 at #loc176)) +#loc217 = loc(callsite(#loc165 at #loc176)) +#loc218 = loc(callsite(#loc166 at #loc176)) +#loc219 = loc(callsite(#loc37 at #loc179)) +#loc220 = loc(callsite(#loc35 at #loc184)) +#loc222 = loc(callsite(#loc35 at #loc188)) +#loc224 = loc(callsite(#loc143 at #loc196)) +#loc225 = loc(callsite(#loc35 at #loc196)) +#loc227 = loc(callsite(#loc143 at #loc201)) +#loc228 = loc(callsite(#loc35 at #loc201)) +#loc230 = loc(callsite(#loc37 at #loc220)) +#loc231 = loc(callsite(#loc37 at #loc222)) +#loc232 = loc(callsite(#loc37 at #loc225)) +#loc233 = loc(callsite(#loc37 at #loc228)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47794943c349238aea11a1aebe0e92dfd9915e3e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e2c4157fc716541ff71b1bf898fcb009587e37c7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ead4e7f09098cce79c47dba04873dd6b11e7c2ef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "1ab6a7361920a76f6e0ad31660448905b8bbf2ed965464a1c58646d52c267bd4", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..21f078801cb897553d44afff1b99bdaa18b5b6e1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.llir @@ -0,0 +1,435 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %9 = shl nuw nsw i32 %8, 2, !dbg !8 + %10 = and i32 %9, 2044, !dbg !8 + %11 = lshr i32 %7, 11, !dbg !9 + %12 = mul i32 %7, 32000 + %13 = mul i32 %11, 224000 + %14 = add i32 %13, %12 + %15 = zext nneg i32 %10 to i64, !dbg !10 + br label %16, !dbg !10 + +16: ; preds = %6, %16 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %16 ] + %17 = phi i32 [ 2147483647, %6 ], [ %103, %16 ] + %18 = phi i32 [ 2147483647, %6 ], [ %104, %16 ] + %19 = phi float [ 0xFFF0000000000000, %6 ], [ %100, %16 ] + %20 = phi float [ 0xFFF0000000000000, %6 ], [ %101, %16 ] + %21 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %99, %16 ] + %22 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %102, %16 ] + %23 = or disjoint i64 %indvars.iv, %15, !dbg !11 + %24 = or disjoint i64 %23, 2, !dbg !11 + %25 = or disjoint i64 %23, 3, !dbg !11 + %26 = icmp samesign ult i64 %23, 32000, !dbg !12 + %27 = trunc nuw nsw i64 %23 to i32, !dbg !13 + %28 = add i32 %14, %27, !dbg !13 + %29 = sext i32 %28 to i64, !dbg !14 + %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !14 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 %26) #4, !dbg !15 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !15 + %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !15 + %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !15 + %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !15 + %37 = bitcast i32 %35 to float, !dbg !15 + %38 = bitcast i32 %36 to float, !dbg !15 + %39 = fcmp ogt float %19, %37, !dbg !16 + %40 = fcmp ogt float %20, %38, !dbg !16 + %41 = fcmp oeq float %19, %37, !dbg !20 + %42 = fcmp oeq float %20, %38, !dbg !20 + %43 = fcmp uno <2 x float> %21, zeroinitializer, !dbg !21 + %44 = fcmp uno float %19, 0.000000e+00, !dbg !21 + %45 = fcmp uno float %20, 0.000000e+00, !dbg !21 + %46 = fcmp uno float %37, 0.000000e+00, !dbg !22 + %47 = fcmp uno float %38, 0.000000e+00, !dbg !22 + %48 = xor i1 %46, true, !dbg !23 + %49 = xor i1 %47, true, !dbg !23 + %50 = and i1 %44, %48, !dbg !24 + %51 = and i1 %45, %49, !dbg !24 + %52 = or i1 %39, %50, !dbg !25 + %53 = or i1 %40, %51, !dbg !25 + %54 = and i1 %44, %46, !dbg !26 + %55 = and i1 %45, %47, !dbg !26 + %56 = or i1 %41, %54, !dbg !27 + %57 = or i1 %42, %55, !dbg !27 + %58 = sext <2 x i32> %22 to <2 x i64>, !dbg !28 + %59 = sext i32 %17 to i64, !dbg !28 + %60 = icmp sgt i64 %24, %59, !dbg !28 + %61 = sext i32 %18 to i64, !dbg !28 + %62 = icmp sgt i64 %25, %61, !dbg !28 + %63 = and i1 %60, %56, !dbg !29 + %64 = and i1 %62, %57, !dbg !29 + %65 = or i1 %52, %63, !dbg !30 + %66 = or i1 %53, %64, !dbg !30 + %67 = select i1 %65, float %19, float %37, !dbg !31 + %68 = select i1 %66, float %20, float %38, !dbg !31 + %69 = trunc nuw nsw i64 %23 to i32, !dbg !32 + %70 = or disjoint i32 %69, 1, !dbg !32 + %71 = insertelement <2 x i32> poison, i32 %33, i64 0, !dbg !15 + %72 = insertelement <2 x i32> %71, i32 %34, i64 1, !dbg !15 + %73 = bitcast <2 x i32> %72 to <2 x float>, !dbg !15 + %74 = fcmp ogt <2 x float> %21, %73, !dbg !16 + %75 = fcmp oeq <2 x float> %21, %73, !dbg !20 + %76 = fcmp uno <2 x float> %73, zeroinitializer, !dbg !22 + %77 = xor <2 x i1> %76, splat (i1 true), !dbg !23 + %78 = and <2 x i1> %43, %77, !dbg !24 + %79 = or <2 x i1> %74, %78, !dbg !25 + %80 = and <2 x i1> %43, %76, !dbg !26 + %81 = or <2 x i1> %75, %80, !dbg !27 + %82 = insertelement <2 x i64> poison, i64 %23, i64 0, !dbg !28 + %83 = shufflevector <2 x i64> %82, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !28 + %84 = icmp sgt <2 x i64> %83, %58, !dbg !28 + %85 = icmp sge <2 x i64> %83, %58, !dbg !28 + %86 = shufflevector <2 x i1> %84, <2 x i1> %85, <2 x i32> , !dbg !28 + %87 = and <2 x i1> %86, %81, !dbg !29 + %88 = or <2 x i1> %79, %87, !dbg !30 + %89 = select <2 x i1> %88, <2 x float> %21, <2 x float> %73, !dbg !31 + %90 = insertelement <2 x i32> poison, i32 %27, i64 0, !dbg !32 + %91 = insertelement <2 x i32> %90, i32 %70, i64 1, !dbg !32 + %92 = select <2 x i1> %88, <2 x i32> %22, <2 x i32> %91, !dbg !32 + %93 = trunc nuw nsw i64 %24 to i32, !dbg !32 + %94 = select i1 %65, i32 %17, i32 %93, !dbg !32 + %95 = trunc nuw nsw i64 %25 to i32, !dbg !32 + %96 = select i1 %66, i32 %18, i32 %95, !dbg !32 + %97 = insertelement <2 x i1> poison, i1 %26, i64 0, !dbg !33 + %98 = shufflevector <2 x i1> %97, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %99 = select <2 x i1> %98, <2 x float> %89, <2 x float> %21, !dbg !33 + %100 = select i1 %26, float %67, float %19, !dbg !33 + %101 = select i1 %26, float %68, float %20, !dbg !33 + %102 = select <2 x i1> %98, <2 x i32> %92, <2 x i32> %22, !dbg !34 + %103 = select i1 %26, i32 %94, i32 %17, !dbg !34 + %104 = select i1 %26, i32 %96, i32 %18, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !10 + %105 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !10 + br i1 %105, label %16, label %106, !dbg !10 + +106: ; preds = %16 + %107 = and i32 %8, 31, !dbg !8 + %108 = lshr i32 %8, 5, !dbg !8 + %109 = shufflevector <2 x float> %99, <2 x float> poison, <2 x i32> , !dbg !35 + %110 = fcmp ogt <2 x float> %99, %109, !dbg !35 + %111 = fcmp oeq <2 x float> %99, %109, !dbg !35 + %112 = shufflevector <2 x i1> %110, <2 x i1> %111, <2 x i32> , !dbg !35 + %113 = extractelement <2 x float> %99, i64 0, !dbg !37 + %114 = fcmp uno float %113, 0.000000e+00, !dbg !37 + %115 = extractelement <2 x float> %99, i64 1, !dbg !38 + %116 = fcmp uno float %115, 0.000000e+00, !dbg !38 + %117 = xor i1 %116, true, !dbg !39 + %118 = insertelement <2 x i1> poison, i1 %114, i64 0, !dbg !40 + %119 = shufflevector <2 x i1> %118, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %120 = insertelement <2 x i1> poison, i1 %117, i64 0, !dbg !40 + %121 = insertelement <2 x i1> %120, i1 %116, i64 1, !dbg !40 + %122 = and <2 x i1> %119, %121, !dbg !40 + %123 = or <2 x i1> %112, %122, !dbg !41 + %124 = extractelement <2 x i32> %102, i64 0, !dbg !42 + %125 = extractelement <2 x i32> %102, i64 1, !dbg !42 + %126 = icmp slt i32 %124, %125, !dbg !42 + %127 = extractelement <2 x i1> %123, i64 1, !dbg !43 + %128 = and i1 %126, %127, !dbg !43 + %129 = extractelement <2 x i1> %123, i64 0, !dbg !44 + %130 = or i1 %129, %128, !dbg !44 + %131 = select i1 %130, float %113, float %115, !dbg !45 + %132 = select i1 %130, i32 %124, i32 %125, !dbg !46 + %133 = fcmp ogt float %131, %100, !dbg !35 + %134 = fcmp oeq float %131, %100, !dbg !47 + %135 = fcmp uno float %131, 0.000000e+00, !dbg !37 + %136 = fcmp uno float %100, 0.000000e+00, !dbg !38 + %137 = xor i1 %136, true, !dbg !39 + %138 = and i1 %135, %137, !dbg !40 + %139 = or i1 %133, %138, !dbg !41 + %140 = and i1 %136, %135, !dbg !48 + %141 = or i1 %134, %140, !dbg !49 + %142 = icmp slt i32 %132, %103, !dbg !42 + %143 = and i1 %142, %141, !dbg !43 + %144 = or i1 %139, %143, !dbg !44 + %145 = select i1 %144, float %131, float %100, !dbg !45 + %146 = select i1 %144, i32 %132, i32 %103, !dbg !46 + %147 = fcmp ogt float %145, %101, !dbg !35 + %148 = fcmp oeq float %145, %101, !dbg !47 + %149 = fcmp uno float %145, 0.000000e+00, !dbg !37 + %150 = fcmp uno float %101, 0.000000e+00, !dbg !38 + %151 = xor i1 %150, true, !dbg !39 + %152 = and i1 %149, %151, !dbg !40 + %153 = or i1 %147, %152, !dbg !41 + %154 = and i1 %150, %149, !dbg !48 + %155 = or i1 %148, %154, !dbg !49 + %156 = icmp slt i32 %146, %104, !dbg !42 + %157 = and i1 %156, %155, !dbg !43 + %158 = or i1 %153, %157, !dbg !44 + %159 = select i1 %158, float %145, float %101, !dbg !45 + %160 = select i1 %158, i32 %146, i32 %104, !dbg !46 + %161 = bitcast float %159 to i32, !dbg !50 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 16, i32 31), !dbg !50 + %163 = bitcast i32 %162 to float, !dbg !50 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 16, i32 31), !dbg !50 + %165 = fcmp ogt float %159, %163, !dbg !35 + %166 = fcmp oeq float %159, %163, !dbg !47 + %167 = fcmp uno float %159, 0.000000e+00, !dbg !37 + %168 = fcmp uno float %163, 0.000000e+00, !dbg !38 + %169 = xor i1 %168, true, !dbg !39 + %170 = and i1 %167, %169, !dbg !40 + %171 = or i1 %165, %170, !dbg !41 + %172 = and i1 %167, %168, !dbg !48 + %173 = or i1 %166, %172, !dbg !49 + %174 = icmp slt i32 %160, %164, !dbg !42 + %175 = and i1 %174, %173, !dbg !43 + %176 = or i1 %171, %175, !dbg !44 + %177 = select i1 %176, float %159, float %163, !dbg !45 + %178 = select i1 %176, i32 %160, i32 %164, !dbg !46 + %179 = bitcast float %177 to i32, !dbg !50 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !50 + %181 = bitcast i32 %180 to float, !dbg !50 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 8, i32 31), !dbg !50 + %183 = fcmp ogt float %177, %181, !dbg !35 + %184 = fcmp oeq float %177, %181, !dbg !47 + %185 = fcmp uno float %177, 0.000000e+00, !dbg !37 + %186 = fcmp uno float %181, 0.000000e+00, !dbg !38 + %187 = xor i1 %186, true, !dbg !39 + %188 = and i1 %185, %187, !dbg !40 + %189 = or i1 %183, %188, !dbg !41 + %190 = and i1 %186, %185, !dbg !48 + %191 = or i1 %184, %190, !dbg !49 + %192 = icmp slt i32 %178, %182, !dbg !42 + %193 = and i1 %192, %191, !dbg !43 + %194 = or i1 %189, %193, !dbg !44 + %195 = select i1 %194, float %177, float %181, !dbg !45 + %196 = select i1 %194, i32 %178, i32 %182, !dbg !46 + %197 = bitcast float %195 to i32, !dbg !50 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !50 + %199 = bitcast i32 %198 to float, !dbg !50 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 4, i32 31), !dbg !50 + %201 = fcmp ogt float %195, %199, !dbg !35 + %202 = fcmp oeq float %195, %199, !dbg !47 + %203 = fcmp uno float %195, 0.000000e+00, !dbg !37 + %204 = fcmp uno float %199, 0.000000e+00, !dbg !38 + %205 = xor i1 %204, true, !dbg !39 + %206 = and i1 %203, %205, !dbg !40 + %207 = or i1 %201, %206, !dbg !41 + %208 = and i1 %204, %203, !dbg !48 + %209 = or i1 %202, %208, !dbg !49 + %210 = icmp slt i32 %196, %200, !dbg !42 + %211 = and i1 %210, %209, !dbg !43 + %212 = or i1 %207, %211, !dbg !44 + %213 = select i1 %212, float %195, float %199, !dbg !45 + %214 = select i1 %212, i32 %196, i32 %200, !dbg !46 + %215 = bitcast float %213 to i32, !dbg !50 + %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !50 + %217 = bitcast i32 %216 to float, !dbg !50 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 2, i32 31), !dbg !50 + %219 = fcmp ogt float %213, %217, !dbg !35 + %220 = fcmp oeq float %213, %217, !dbg !47 + %221 = fcmp uno float %213, 0.000000e+00, !dbg !37 + %222 = fcmp uno float %217, 0.000000e+00, !dbg !38 + %223 = xor i1 %222, true, !dbg !39 + %224 = and i1 %221, %223, !dbg !40 + %225 = or i1 %219, %224, !dbg !41 + %226 = and i1 %222, %221, !dbg !48 + %227 = or i1 %220, %226, !dbg !49 + %228 = icmp slt i32 %214, %218, !dbg !42 + %229 = and i1 %228, %227, !dbg !43 + %230 = or i1 %225, %229, !dbg !44 + %231 = select i1 %230, float %213, float %217, !dbg !45 + %232 = select i1 %230, i32 %214, i32 %218, !dbg !46 + %233 = bitcast float %231 to i32, !dbg !50 + %234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %233, i32 1, i32 31), !dbg !50 + %235 = bitcast i32 %234 to float, !dbg !50 + %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !50 + %237 = fcmp ogt float %231, %235, !dbg !35 + %238 = fcmp oeq float %231, %235, !dbg !47 + %239 = fcmp uno float %231, 0.000000e+00, !dbg !37 + %240 = fcmp uno float %235, 0.000000e+00, !dbg !38 + %241 = xor i1 %240, true, !dbg !39 + %242 = and i1 %239, %241, !dbg !40 + %243 = or i1 %237, %242, !dbg !41 + %244 = and i1 %240, %239, !dbg !48 + %245 = or i1 %238, %244, !dbg !49 + %246 = icmp slt i32 %232, %236, !dbg !42 + %247 = and i1 %246, %245, !dbg !43 + %248 = or i1 %243, %247, !dbg !44 + %249 = select i1 %248, i32 %232, i32 %236, !dbg !46 + %250 = and i32 %108, 15, !dbg !50 + %251 = icmp eq i32 %107, 0, !dbg !50 + %252 = getelementptr float, ptr addrspace(3) @global_smem, i32 %250, !dbg !50 + %253 = select i1 %248, i32 %233, i32 %234, !dbg !45 + %254 = insertelement <1 x i32> poison, i32 %253, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %252, <1 x i32> %254, i1 %251) #4, !dbg !50 + %255 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %250, !dbg !50 + %256 = insertelement <1 x i32> poison, i32 %249, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, <1 x i32> %256, i1 %251) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %257 = icmp samesign ult i32 %8, 16, !dbg !50 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !50 + %259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %258, i1 %257) #4, !dbg !50 + %260 = bitcast i32 %259 to float, !dbg !50 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %8, !dbg !50 + %262 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %261, i1 %257) #4, !dbg !50 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !50 + %264 = bitcast i32 %263 to float, !dbg !50 + %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %262, i32 8, i32 31), !dbg !50 + %266 = fcmp ogt float %260, %264, !dbg !35 + %267 = fcmp oeq float %260, %264, !dbg !47 + %268 = fcmp uno float %260, 0.000000e+00, !dbg !37 + %269 = fcmp uno float %264, 0.000000e+00, !dbg !38 + %270 = xor i1 %269, true, !dbg !39 + %271 = and i1 %268, %270, !dbg !40 + %272 = or i1 %266, %271, !dbg !41 + %273 = and i1 %268, %269, !dbg !48 + %274 = or i1 %267, %273, !dbg !49 + %275 = icmp slt i32 %262, %265, !dbg !42 + %276 = and i1 %275, %274, !dbg !43 + %277 = or i1 %272, %276, !dbg !44 + %278 = select i1 %277, float %260, float %264, !dbg !45 + %279 = select i1 %277, i32 %262, i32 %265, !dbg !46 + %280 = bitcast float %278 to i32, !dbg !50 + %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 4, i32 31), !dbg !50 + %282 = bitcast i32 %281 to float, !dbg !50 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 4, i32 31), !dbg !50 + %284 = fcmp ogt float %278, %282, !dbg !35 + %285 = fcmp oeq float %278, %282, !dbg !47 + %286 = fcmp uno float %278, 0.000000e+00, !dbg !37 + %287 = fcmp uno float %282, 0.000000e+00, !dbg !38 + %288 = xor i1 %287, true, !dbg !39 + %289 = and i1 %286, %288, !dbg !40 + %290 = or i1 %284, %289, !dbg !41 + %291 = and i1 %287, %286, !dbg !48 + %292 = or i1 %285, %291, !dbg !49 + %293 = icmp slt i32 %279, %283, !dbg !42 + %294 = and i1 %293, %292, !dbg !43 + %295 = or i1 %290, %294, !dbg !44 + %296 = select i1 %295, float %278, float %282, !dbg !45 + %297 = select i1 %295, i32 %279, i32 %283, !dbg !46 + %298 = bitcast float %296 to i32, !dbg !50 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !50 + %300 = bitcast i32 %299 to float, !dbg !50 + %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 2, i32 31), !dbg !50 + %302 = fcmp ogt float %296, %300, !dbg !35 + %303 = fcmp oeq float %296, %300, !dbg !47 + %304 = fcmp uno float %296, 0.000000e+00, !dbg !37 + %305 = fcmp uno float %300, 0.000000e+00, !dbg !38 + %306 = xor i1 %305, true, !dbg !39 + %307 = and i1 %304, %306, !dbg !40 + %308 = or i1 %302, %307, !dbg !41 + %309 = and i1 %305, %304, !dbg !48 + %310 = or i1 %303, %309, !dbg !49 + %311 = icmp slt i32 %297, %301, !dbg !42 + %312 = and i1 %311, %310, !dbg !43 + %313 = or i1 %308, %312, !dbg !44 + %314 = select i1 %313, float %296, float %300, !dbg !45 + %315 = select i1 %313, i32 %297, i32 %301, !dbg !46 + %316 = bitcast float %314 to i32, !dbg !50 + %317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 1, i32 31), !dbg !50 + %318 = bitcast i32 %317 to float, !dbg !50 + %319 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 1, i32 31), !dbg !50 + %320 = fcmp ogt float %314, %318, !dbg !35 + %321 = fcmp oeq float %314, %318, !dbg !47 + %322 = fcmp uno float %314, 0.000000e+00, !dbg !37 + %323 = fcmp uno float %318, 0.000000e+00, !dbg !38 + %324 = xor i1 %323, true, !dbg !39 + %325 = and i1 %322, %324, !dbg !40 + %326 = or i1 %320, %325, !dbg !41 + %327 = and i1 %323, %322, !dbg !48 + %328 = or i1 %321, %327, !dbg !49 + %329 = icmp slt i32 %315, %319, !dbg !42 + %330 = and i1 %329, %328, !dbg !43 + %331 = or i1 %326, %330, !dbg !44 + %332 = select i1 %331, i32 %315, i32 %319, !dbg !46 + %333 = icmp eq i32 %8, 0, !dbg !50 + %334 = select i1 %331, i32 %316, i32 %317, !dbg !45 + %335 = insertelement <1 x i32> poison, i32 %334, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %335, i1 %333) #4, !dbg !50 + %336 = insertelement <1 x i32> poison, i32 %332, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %336, i1 %333) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %337 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !50 + %338 = zext nneg i32 %7 to i64, !dbg !51 + %339 = getelementptr i64, ptr addrspace(1) %1, i64 %338, !dbg !51 + %340 = sext i32 %337 to i64, !dbg !52 + %341 = and i32 %8, 511, !dbg !52 + %342 = icmp eq i32 %341, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %340, ptr addrspace(1) %339, i1 %342) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 29, column: 19, scope: !4) +!10 = !DILocation(line: 33, column: 40, scope: !4) +!11 = !DILocation(line: 34, column: 31, scope: !4) +!12 = !DILocation(line: 35, column: 29, scope: !4) +!13 = !DILocation(line: 39, column: 52, scope: !4) +!14 = !DILocation(line: 39, column: 34, scope: !4) +!15 = !DILocation(line: 39, column: 66, scope: !4) +!16 = !DILocation(line: 144, column: 21, scope: !17, inlinedAt: !19) +!17 = distinct !DILexicalBlockFile(scope: !4, file: !18, discriminator: 0) +!18 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!19 = !DILocation(line: 42, column: 38, scope: !4) +!20 = !DILocation(line: 145, column: 23, scope: !17, inlinedAt: !19) +!21 = !DILocation(line: 147, column: 29, scope: !17, inlinedAt: !19) +!22 = !DILocation(line: 148, column: 29, scope: !17, inlinedAt: !19) +!23 = !DILocation(line: 149, column: 31, scope: !17, inlinedAt: !19) +!24 = !DILocation(line: 149, column: 27, scope: !17, inlinedAt: !19) +!25 = !DILocation(line: 149, column: 16, scope: !17, inlinedAt: !19) +!26 = !DILocation(line: 151, column: 27, scope: !17, inlinedAt: !19) +!27 = !DILocation(line: 151, column: 17, scope: !17, inlinedAt: !19) +!28 = !DILocation(line: 154, column: 31, scope: !17, inlinedAt: !19) +!29 = !DILocation(line: 154, column: 21, scope: !17, inlinedAt: !19) +!30 = !DILocation(line: 154, column: 12, scope: !17, inlinedAt: !19) +!31 = !DILocation(line: 155, column: 35, scope: !17, inlinedAt: !19) +!32 = !DILocation(line: 155, column: 69, scope: !17, inlinedAt: !19) +!33 = !DILocation(line: 44, column: 46, scope: !4) +!34 = !DILocation(line: 45, column: 58, scope: !4) +!35 = !DILocation(line: 144, column: 21, scope: !17, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 147, column: 29, scope: !17, inlinedAt: !36) +!38 = !DILocation(line: 148, column: 29, scope: !17, inlinedAt: !36) +!39 = !DILocation(line: 149, column: 31, scope: !17, inlinedAt: !36) +!40 = !DILocation(line: 149, column: 27, scope: !17, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 16, scope: !17, inlinedAt: !36) +!42 = !DILocation(line: 154, column: 31, scope: !17, inlinedAt: !36) +!43 = !DILocation(line: 154, column: 21, scope: !17, inlinedAt: !36) +!44 = !DILocation(line: 154, column: 12, scope: !17, inlinedAt: !36) +!45 = !DILocation(line: 155, column: 35, scope: !17, inlinedAt: !36) +!46 = !DILocation(line: 155, column: 69, scope: !17, inlinedAt: !36) +!47 = !DILocation(line: 145, column: 23, scope: !17, inlinedAt: !36) +!48 = !DILocation(line: 151, column: 27, scope: !17, inlinedAt: !36) +!49 = !DILocation(line: 151, column: 17, scope: !17, inlinedAt: !36) +!50 = !DILocation(line: 165, column: 42, scope: !17, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ff8d466cb17cc14efc8c1e4846a8df4170ee6757 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,839 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<202>; + .reg .b32 %r<119>; + .reg .b64 %rd<31>; + .loc 1 18 0 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 37 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r19, %r2, 2; + and.b32 %r20, %r19, 2044; + .loc 1 29 19 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:29:19 + shr.u32 %r21, %r1, 11; + mul.lo.s32 %r22, %r1, 32000; + mad.lo.s32 %r23, %r21, 224000, %r22; + .loc 1 33 40 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:33:40 + cvt.u64.u32 %rd1, %r20; + add.s32 %r24, %r23, %r20; + cvt.u64.u32 %rd2, %r24; + mov.b32 %r115, 0fFF800000; + mov.b64 %rd30, {%r115, %r115}; + mov.b32 %r113, 2147483647; + mov.b64 %rd29, -2048; + mov.b32 %r114, %r113; + mov.b32 %r116, %r115; + mov.b32 %r117, %r113; + mov.b32 %r118, %r113; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 34 31 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:34:31 + add.s64 %rd14, %rd1, %rd29; + add.s64 %rd15, %rd14, 2048; + add.s64 %rd16, %rd14, 2050; + .loc 1 35 29 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:35:29 + add.s64 %rd17, %rd14, 2051; + setp.lt.u64 %p1, %rd15, 32000; + .loc 1 39 34 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:39:34 + add.s64 %rd18, %rd2, %rd29; + cvt.u32.u64 %r33, %rd18; + add.s32 %r34, %r33, 2048; + mad.wide.s32 %rd12, %r34, 4, %rd7; + .loc 1 39 66 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:39:66 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r29, 0; + // begin inline asm + mov.u32 %r25, %r29; + mov.u32 %r26, %r29; + mov.u32 %r27, %r29; + mov.u32 %r28, %r29; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd12 + 0 ], %rd11; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.gt.f32 %p2, %r115, %r27; + setp.gt.f32 %p3, %r116, %r28; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.eq.f32 %p4, %r115, %r27; + setp.eq.f32 %p5, %r116, %r28; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + mov.b64 {%r35, %r36}, %rd30; + setp.nan.f32 %p6, %r35, %r35; + setp.nan.f32 %p7, %r36, %r36; + setp.nan.f32 %p8, %r115, %r115; + setp.nan.f32 %p9, %r116, %r116; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.nan.f32 %p10, %r27, %r27; + setp.num.f32 %p11, %r27, %r27; + setp.nan.f32 %p12, %r28, %r28; + setp.num.f32 %p13, %r28, %r28; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + cvt.s64.s32 %rd19, %r118; + cvt.s64.s32 %rd20, %r117; + cvt.s64.s32 %rd21, %r113; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r114; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p26, %p16, %p24; + or.pred %p27, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + selp.f32 %r37, %r115, %r27, %p26; + selp.f32 %r38, %r116, %r28, %p27; +$L__tmp2: + .loc 1 39 66 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:39:66 + cvt.u64.u32 %rd23, %r25; + cvt.u64.u32 %rd24, %r26; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + mov.b64 {%r39, %r40}, %rd26; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.gt.f32 %p28, %r36, %r40; + setp.gt.f32 %p29, %r35, %r39; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.eq.f32 %p30, %r35, %r39; + setp.eq.f32 %p31, %r36, %r40; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.nan.f32 %p32, %r40, %r40; + setp.nan.f32 %p33, %r39, %r39; + setp.num.f32 %p34, %r39, %r39; + setp.num.f32 %p35, %r40, %r40; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p36, %p7, %p35; + and.pred %p37, %p6, %p34; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p38, %p29, %p37; + or.pred %p39, %p28, %p36; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p40, %p6, %p33; + and.pred %p41, %p7, %p32; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p42, %p31, %p41; + or.pred %p43, %p30, %p40; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + setp.gt.s64 %p44, %rd15, %rd20; + setp.ge.s64 %p45, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + and.pred %p46, %p44, %p43; + and.pred %p47, %p45, %p42; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + or.pred %p48, %p39, %p47; + or.pred %p49, %p38, %p46; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + selp.f32 %r41, %r35, %r39, %p49; + selp.f32 %r42, %r36, %r40, %p48; + cvt.u32.u64 %r43, %rd15; + cvt.u32.u64 %r44, %rd14; + add.s32 %r45, %r44, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:42:38 ] + selp.b32 %r46, %r117, %r43, %p49; + selp.b32 %r47, %r118, %r45, %p48; + cvt.u32.u64 %r48, %rd16; + selp.b32 %r49, %r113, %r48, %p26; + cvt.u32.u64 %r50, %rd17; + selp.b32 %r51, %r114, %r50, %p27; +$L__tmp4: + .loc 1 44 46 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:44:46 + selp.f32 %r52, %r42, %r36, %p1; + selp.f32 %r53, %r41, %r35, %p1; + mov.b64 %rd30, {%r53, %r52}; + selp.f32 %r115, %r37, %r115, %p1; + selp.f32 %r116, %r38, %r116, %p1; + .loc 1 45 58 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:45:58 + selp.b32 %r118, %r47, %r118, %p1; + selp.b32 %r117, %r46, %r117, %p1; + selp.b32 %r113, %r49, %r113, %p1; + selp.b32 %r114, %r51, %r114, %p1; + .loc 1 33 40 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:33:40 + add.s64 %rd29, %rd29, 2048; + setp.lt.u64 %p50, %rd29, 29952; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:26:37 + and.b32 %r66, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + mov.b64 {%r67, %r68}, %rd30; + setp.gt.f32 %p58, %r67, %r68; + setp.eq.f32 %p59, %r68, %r67; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p60, %r67, %r67; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.num.f32 %p61, %r68, %r68; + setp.nan.f32 %p62, %r68, %r68; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p63, %p60, %p62; + and.pred %p64, %p60, %p61; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p65, %p58, %p64; + or.pred %p66, %p59, %p63; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p67, %r117, %r118; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p68, %p67, %p66; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p69, %p65, %p68; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r69, %r67, %r68, %p69; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r70, %r117, %r118, %p69; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p70, %r69, %r115; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p71, %r69, %r115; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p72, %r69, %r69; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p73, %r115, %r115; + setp.num.f32 %p74, %r115, %r115; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p75, %p72, %p74; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p76, %p70, %p75; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p77, %p73, %p72; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p78, %p71, %p77; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p79, %r70, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p81, %p76, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r71, %r69, %r115, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r72, %r70, %r113, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p82, %r71, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p83, %r71, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p84, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p85, %r116, %r116; + setp.num.f32 %p86, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p91, %r72, %r114; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r73, %r71, %r116, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r74, %r72, %r114, %p93; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r75, %r73, 16, 31, -1; + shfl.sync.bfly.b32 %r76, %r74, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p94, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p95, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p96, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p97, %r75, %r75; + setp.num.f32 %p98, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p101, %p96, %p97; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p103, %r74, %r76; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r77, %r73, %r75, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r78, %r74, %r76, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r79, %r77, 8, 31, -1; + shfl.sync.bfly.b32 %r80, %r78, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p106, %r77, %r79; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p107, %r77, %r79; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p108, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p109, %r79, %r79; + setp.num.f32 %p110, %r79, %r79; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p113, %p109, %p108; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p115, %r78, %r80; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r81, %r77, %r79, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r82, %r78, %r80, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r83, %r81, 4, 31, -1; + shfl.sync.bfly.b32 %r84, %r82, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p118, %r81, %r83; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p119, %r81, %r83; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p120, %r81, %r81; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p121, %r83, %r83; + setp.num.f32 %p122, %r83, %r83; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p127, %r82, %r84; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r85, %r81, %r83, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r86, %r82, %r84, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r87, %r85, 2, 31, -1; + shfl.sync.bfly.b32 %r88, %r86, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p130, %r85, %r87; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p131, %r85, %r87; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p132, %r85, %r85; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p133, %r87, %r87; + setp.num.f32 %p134, %r87, %r87; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p139, %r86, %r88; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r89, %r85, %r87, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r90, %r86, %r88, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r91, %r89, 1, 31, -1; + shfl.sync.bfly.b32 %r92, %r90, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p142, %r89, %r91; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p143, %r89, %r91; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p144, %r89, %r89; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p145, %r91, %r91; + setp.num.f32 %p146, %r91, %r91; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p151, %r90, %r92; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r57, %r90, %r92, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.b32 %p51, %r66, 0; + shr.u32 %r93, %r2, 3; + and.b32 %r94, %r93, 60; + mov.b32 %r95, global_smem; + add.s32 %r54, %r95, %r94; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r55, %r89, %r91, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r54 + 0 ], %r55; + // end inline asm + add.s32 %r96, %r95, 64; + add.s32 %r56, %r96, %r94; + // begin inline asm + @%p51 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r59, %r95, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r58, [ %r59 + 0 ]; + // end inline asm + add.s32 %r61, %r96, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r98, %r58, 8, 31, -1; + shfl.sync.bfly.b32 %r99, %r60, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p154, %r58, %r98; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p155, %r58, %r98; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p156, %r58, %r58; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p157, %r98, %r98; + setp.num.f32 %p158, %r98, %r98; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p161, %p156, %p157; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p163, %r60, %r99; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r100, %r58, %r98, %p165; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r101, %r60, %r99, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r102, %r100, 4, 31, -1; + shfl.sync.bfly.b32 %r103, %r101, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p166, %r100, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p167, %r100, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p168, %r100, %r100; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p169, %r102, %r102; + setp.num.f32 %p170, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p173, %p169, %p168; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p175, %r101, %r103; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r104, %r100, %r102, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r105, %r101, %r103, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r106, %r104, 2, 31, -1; + shfl.sync.bfly.b32 %r107, %r105, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p178, %r104, %r106; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p179, %r104, %r106; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p180, %r104, %r104; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p181, %r106, %r106; + setp.num.f32 %p182, %r106, %r106; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p187, %r105, %r107; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.f32 %r108, %r104, %r106, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r109, %r105, %r107, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + shfl.sync.bfly.b32 %r110, %r108, 1, 31, -1; + shfl.sync.bfly.b32 %r111, %r109, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.gt.f32 %p190, %r108, %r110; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.f32 %p191, %r108, %r110; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p192, %r108, %r108; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.nan.f32 %p193, %r110, %r110; + setp.num.f32 %p194, %r110, %r110; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.lt.s32 %p199, %r109, %r111; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r65, %r109, %r111, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + setp.eq.b32 %p55, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + selp.b32 %r63, %r108, %r110, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:46:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r59 + 0 ], %r63; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd27, [global_smem+64]; +$L__tmp6: + .loc 1 48 25 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:48:25 + mad.wide.u32 %rd28, %r1, 8, %rd8; + .loc 1 48 36 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:48:36 + and.b32 %r112, %r2, 511; + setp.eq.b32 %p57, %r112, 0; + // begin inline asm + @%p57 st.global.b64 [ %rd28 + 0 ], { %rd27 }; + // end inline asm + .loc 1 48 4 // ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py:48:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 105 +.b8 105 +.b8 122 +.b8 55 +.b8 119 +.b8 121 +.b8 110 +.b8 106 +.b8 118 +.b8 113 +.b8 107 +.b8 110 +.b8 54 +.b8 117 +.b8 118 +.b8 53 +.b8 99 +.b8 115 +.b8 97 +.b8 104 +.b8 119 +.b8 114 +.b8 121 +.b8 116 +.b8 53 +.b8 120 +.b8 50 +.b8 100 +.b8 54 +.b8 54 +.b8 52 +.b8 117 +.b8 52 +.b8 111 +.b8 55 +.b8 117 +.b8 103 +.b8 109 +.b8 101 +.b8 112 +.b8 102 +.b8 99 +.b8 115 +.b8 102 +.b8 99 +.b8 110 +.b8 105 +.b8 117 +.b8 116 +.b8 52 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 105 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..35ac40d6037d05fb99e9e97a46d5da4b07716188 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.source @@ -0,0 +1,309 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 1 : i32 loc(#loc77) + %xoffset_3 = arith.constant 1 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc81) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<1x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<1x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x2048xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<1x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc92) + %tmp0_27 = arith.addi %r0_index_21, %tmp0_26 : tensor<1x2048xi32> loc(#loc92) + %tmp0_28 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant dense<65760000> : tensor<1x1xi32> loc(#loc93) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<1x1xi32> loc(#loc93) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc94) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<1x2048xi32> loc(#loc94) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc95) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc95) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc96) + %tmp0_38 = tt.load %tmp0_35, %r0_mask_22, %tmp0_37 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_19, %_tmp2_index_20, %tmp0_38, %r0_index_21) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_39 = arith.select %r0_mask_22, %8#0, %_tmp2_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc97) + %_tmp2_index_40 = arith.select %r0_mask_22, %8#1, %_tmp2_index_20 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc98) + scf.yield %_tmp2_39, %_tmp2_index_40 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<1x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc33)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc33)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc33)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc33))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc108) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc108) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc109) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc123) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc111) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc124) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc50) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc52) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc52) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc57))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc59) + tt.return %4 : tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %5 : tensor<1x2048xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc66)), %index: tensor<1x2048xi32> loc("index"(#loc66))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc69) + %2 = ub.poison : tensor<1xi32> loc(#loc69) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ecc3da4ebd3e029a6229459e273e6c093d2b7525 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,185 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":46:75) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc73 = loc(callsite(#loc1 at #loc34)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c65760000_i32 = arith.constant 65760000 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc44) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc44) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc45) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc46) + %tmp0 = arith.muli %x0, %c32000_i32 : i32 loc(#loc47) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc75) + %tmp0_6 = arith.muli %x1, %c65760000_i32 : i32 loc(#loc49) + %tmp0_7 = tt.splat %tmp0_6 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc76) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc51) + %_tmp2_index:2 = scf.for %_tmp2_index_9 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_10 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_9 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_index_11 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc53) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst : tensor<1x2048xi32, #blocked> loc(#loc54) + %tmp0_12 = arith.addi %r0_index_11, %tmp0_5 : tensor<1x2048xi32, #blocked> loc(#loc48) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_7 : tensor<1x2048xi32, #blocked> loc(#loc50) + %tmp0_14 = tt.addptr %tmp0_8, %tmp0_13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc51) + %tmp0_15 = tt.load %tmp0_14, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc100) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc101) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc80) + %b_isnan = arith.cmpf une, %tmp0_15, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc81) + %mask_16 = arith.xori %b_isnan, %cst_1 : tensor<1x2048xi1, #blocked> loc(#loc82) + %mask_17 = arith.andi %a_isnan, %mask_16 : tensor<1x2048xi1, #blocked> loc(#loc83) + %mask_18 = arith.ori %mask, %mask_17 : tensor<1x2048xi1, #blocked> loc(#loc102) + %equal_19 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc85) + %equal_20 = arith.ori %equal, %equal_19 : tensor<1x2048xi1, #blocked> loc(#loc103) + %mask_21 = arith.cmpi slt, %_tmp2_index_10, %r0_index_11 : tensor<1x2048xi32, #blocked> loc(#loc87) + %mask_22 = arith.andi %equal_20, %mask_21 : tensor<1x2048xi1, #blocked> loc(#loc88) + %mask_23 = arith.ori %mask_18, %mask_22 : tensor<1x2048xi1, #blocked> loc(#loc89) + %5 = arith.select %mask_23, %_tmp2, %tmp0_15 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc68) + %6 = arith.select %mask_23, %_tmp2_index_10, %r0_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc69) + %_tmp2_24 = arith.select %r0_mask, %5, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc70) + %_tmp2_index_25 = arith.select %r0_mask, %6, %_tmp2_index_10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc71) + scf.yield %_tmp2_24, %_tmp2_index_25 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc32) + } loc(#loc77) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc34)), %arg5: i32 loc(callsite(#loc1 at #loc34)), %arg6: f32 loc(callsite(#loc1 at #loc34)), %arg7: i32 loc(callsite(#loc1 at #loc34))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc104) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc105) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc90) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc91) + %mask_9 = arith.xori %b_isnan, %true : i1 loc(#loc92) + %mask_10 = arith.andi %a_isnan, %mask_9 : i1 loc(#loc93) + %mask_11 = arith.ori %mask, %mask_10 : i1 loc(#loc106) + %equal_12 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc94) + %equal_13 = arith.ori %equal, %equal_12 : i1 loc(#loc107) + %mask_14 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc95) + %mask_15 = arith.andi %equal_13, %mask_14 : i1 loc(#loc96) + %mask_16 = arith.ori %mask_11, %mask_15 : i1 loc(#loc97) + %5 = arith.select %mask_16, %arg4, %arg6 : f32 loc(#loc98) + %6 = arith.select %mask_16, %arg5, %arg7 : i32 loc(#loc99) + tt.reduce.return %5, %6 : f32, i32 loc(#loc72) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc72) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc74) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc36) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc37) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc37) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc37) + tt.store %4, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":28:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":29:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:47) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:61) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:52) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":33:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":34:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:66) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":42:38) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":44:46) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:58) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":47:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:4) +#loc43 = loc("xoffset"(#loc2)) +#loc44 = loc("r0_base"(#loc3)) +#loc45 = loc("x0"(#loc4)) +#loc46 = loc("x1"(#loc5)) +#loc47 = loc("tmp0"(#loc6)) +#loc48 = loc("tmp0"(#loc7)) +#loc49 = loc("tmp0"(#loc8)) +#loc50 = loc("tmp0"(#loc9)) +#loc51 = loc("tmp0"(#loc10)) +#loc52 = loc("_tmp2"(#loc11)) +#loc53 = loc("r0_index"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("mask"(#loc15)) +#loc57 = loc("equal"(#loc17)) +#loc58 = loc("a_isnan"(#loc18)) +#loc59 = loc("b_isnan"(#loc19)) +#loc60 = loc("mask"(#loc20)) +#loc61 = loc("mask"(#loc21)) +#loc62 = loc("mask"(#loc22)) +#loc63 = loc("equal"(#loc23)) +#loc64 = loc("equal"(#loc24)) +#loc65 = loc("mask"(#loc25)) +#loc66 = loc("mask"(#loc26)) +#loc67 = loc("mask"(#loc27)) +#loc68 = loc(callsite(#loc28 at #loc16)) +#loc69 = loc(callsite(#loc29 at #loc16)) +#loc70 = loc("_tmp2"(#loc30)) +#loc71 = loc("_tmp2_index"(#loc31)) +#loc72 = loc(callsite(#loc33 at #loc34)) +#loc74 = loc("tmp2"(#loc35)) +#loc75 = loc(fused[#loc48, #loc47]) +#loc76 = loc(fused[#loc50, #loc49]) +#loc77 = loc("_tmp2_index"(#loc52)) +#loc78 = loc("mask"(#loc56)) +#loc79 = loc("equal"(#loc57)) +#loc80 = loc(callsite(#loc58 at #loc16)) +#loc81 = loc(callsite(#loc59 at #loc16)) +#loc82 = loc(callsite(#loc60 at #loc16)) +#loc83 = loc(callsite(#loc61 at #loc16)) +#loc84 = loc("mask"(#loc62)) +#loc85 = loc(callsite(#loc63 at #loc16)) +#loc86 = loc("equal"(#loc64)) +#loc87 = loc(callsite(#loc65 at #loc16)) +#loc88 = loc(callsite(#loc66 at #loc16)) +#loc89 = loc(callsite(#loc67 at #loc16)) +#loc90 = loc(callsite(#loc58 at #loc72)) +#loc91 = loc(callsite(#loc59 at #loc72)) +#loc92 = loc(callsite(#loc60 at #loc72)) +#loc93 = loc(callsite(#loc61 at #loc72)) +#loc94 = loc(callsite(#loc63 at #loc72)) +#loc95 = loc(callsite(#loc65 at #loc72)) +#loc96 = loc(callsite(#loc66 at #loc72)) +#loc97 = loc(callsite(#loc67 at #loc72)) +#loc98 = loc(callsite(#loc28 at #loc72)) +#loc99 = loc(callsite(#loc29 at #loc72)) +#loc100 = loc(callsite(#loc78 at #loc16)) +#loc101 = loc(callsite(#loc79 at #loc16)) +#loc102 = loc(callsite(#loc84 at #loc16)) +#loc103 = loc(callsite(#loc86 at #loc16)) +#loc104 = loc(callsite(#loc78 at #loc72)) +#loc105 = loc(callsite(#loc79 at #loc72)) +#loc106 = loc(callsite(#loc84 at #loc72)) +#loc107 = loc(callsite(#loc86 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..8af1415c578a931f4a935b5b81dc26e39ba2c307 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DK3KONQZECTW63QK2MLGAREJAW4LX4XNSZKGJIOFQZDNKLBGPPKA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,188 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc46 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c65760000_i32 = arith.constant 65760000 : i32 loc(#loc1) + %true = arith.constant true loc(#loc46) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc47) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc48) + %xoffset = tt.get_program_id x : i32 loc(#loc49) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc50) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc51) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc52) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc53) + %_tmp2_index_3:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_4 = %_tmp2, %_tmp2_index_5 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc55) + %r0_index_6 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc55) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst_1 : tensor<1x2048xi32> loc(#loc56) + %tmp0 = arith.muli %x0, %c32000_i32 : i32 loc(#loc57) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc82) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc58) + %tmp0_9 = arith.muli %x1, %c65760000_i32 : i32 loc(#loc59) + %tmp0_10 = tt.splat %tmp0_9 : i32 -> tensor<1x2048xi32> loc(#loc83) + %tmp0_11 = arith.addi %tmp0_8, %tmp0_10 : tensor<1x2048xi32> loc(#loc60) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc61) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc61) + %tmp0_14 = tt.load %tmp0_13, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc62) + %mask = arith.cmpf ogt, %_tmp2_4, %tmp0_14 : tensor<1x2048xf32> loc(#loc106) + %equal = arith.cmpf oeq, %_tmp2_4, %tmp0_14 : tensor<1x2048xf32> loc(#loc107) + %a_isnan = arith.cmpf une, %_tmp2_4, %_tmp2_4 : tensor<1x2048xf32> loc(#loc86) + %b_isnan = arith.cmpf une, %tmp0_14, %tmp0_14 : tensor<1x2048xf32> loc(#loc87) + %mask_15 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc88) + %mask_16 = arith.andi %a_isnan, %mask_15 : tensor<1x2048xi1> loc(#loc89) + %mask_17 = arith.ori %mask, %mask_16 : tensor<1x2048xi1> loc(#loc108) + %equal_18 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc91) + %equal_19 = arith.ori %equal, %equal_18 : tensor<1x2048xi1> loc(#loc109) + %mask_20 = arith.cmpi slt, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi32> loc(#loc93) + %mask_21 = arith.andi %equal_19, %mask_20 : tensor<1x2048xi1> loc(#loc94) + %mask_22 = arith.ori %mask_17, %mask_21 : tensor<1x2048xi1> loc(#loc95) + %4 = arith.select %mask_22, %_tmp2_4, %tmp0_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc75) + %5 = arith.select %mask_22, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc76) + %_tmp2_23 = arith.select %r0_mask, %4, %_tmp2_4 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc77) + %_tmp2_index_24 = arith.select %r0_mask, %5, %_tmp2_index_5 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc78) + scf.yield %_tmp2_23, %_tmp2_index_24 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc36) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index_3#0, %_tmp2_index_3#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc110) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc111) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc96) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %mask_4 = arith.xori %b_isnan, %true : i1 loc(#loc98) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc99) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc112) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc100) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc113) + %mask_9 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc101) + %mask_10 = arith.andi %equal_8, %mask_9 : i1 loc(#loc102) + %mask_11 = arith.ori %mask_6, %mask_10 : i1 loc(#loc103) + %4 = arith.select %mask_11, %arg4, %arg6 : f32 loc(#loc104) + %5 = arith.select %mask_11, %arg5, %arg7 : i32 loc(#loc105) + tt.reduce.return %4, %5 : f32, i32 loc(#loc79) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc79) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc39) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc39) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc40) + tt.store %2, %3 : tensor<1x1x!tt.ptr> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":29:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":34:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":35:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:47) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:61) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:52) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":39:66) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":42:38) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":44:46) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":45:8) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ii/ciiz7wynjvqkn6uv5csahwryt5x2d664u4o7ugmepfcsfcniut4v.py":48:4) +#loc47 = loc("_tmp2_index"(#loc4)) +#loc48 = loc("_tmp2"(#loc5)) +#loc49 = loc("xoffset"(#loc6)) +#loc50 = loc("r0_base"(#loc7)) +#loc51 = loc("r0_base"(#loc8)) +#loc52 = loc("x0"(#loc9)) +#loc53 = loc("x1"(#loc10)) +#loc54 = loc("_tmp2"(#loc3)) +#loc55 = loc("r0_index"(#loc11)) +#loc56 = loc("r0_mask"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("tmp0"(#loc18)) +#loc63 = loc("mask"(#loc19)) +#loc64 = loc("equal"(#loc21)) +#loc65 = loc("a_isnan"(#loc22)) +#loc66 = loc("b_isnan"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("mask"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("equal"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc("mask"(#loc31)) +#loc75 = loc(callsite(#loc32 at #loc20)) +#loc76 = loc(callsite(#loc33 at #loc20)) +#loc77 = loc("_tmp2"(#loc34)) +#loc78 = loc("_tmp2_index"(#loc35)) +#loc79 = loc(callsite(#loc37 at #loc2)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc54)) +#loc82 = loc(fused[#loc58, #loc57]) +#loc83 = loc(fused[#loc60, #loc59]) +#loc84 = loc("mask"(#loc63)) +#loc85 = loc("equal"(#loc64)) +#loc86 = loc(callsite(#loc65 at #loc20)) +#loc87 = loc(callsite(#loc66 at #loc20)) +#loc88 = loc(callsite(#loc67 at #loc20)) +#loc89 = loc(callsite(#loc68 at #loc20)) +#loc90 = loc("mask"(#loc69)) +#loc91 = loc(callsite(#loc70 at #loc20)) +#loc92 = loc("equal"(#loc71)) +#loc93 = loc(callsite(#loc72 at #loc20)) +#loc94 = loc(callsite(#loc73 at #loc20)) +#loc95 = loc(callsite(#loc74 at #loc20)) +#loc96 = loc(callsite(#loc65 at #loc79)) +#loc97 = loc(callsite(#loc66 at #loc79)) +#loc98 = loc(callsite(#loc67 at #loc79)) +#loc99 = loc(callsite(#loc68 at #loc79)) +#loc100 = loc(callsite(#loc70 at #loc79)) +#loc101 = loc(callsite(#loc72 at #loc79)) +#loc102 = loc(callsite(#loc73 at #loc79)) +#loc103 = loc(callsite(#loc74 at #loc79)) +#loc104 = loc(callsite(#loc32 at #loc79)) +#loc105 = loc(callsite(#loc33 at #loc79)) +#loc106 = loc(callsite(#loc84 at #loc20)) +#loc107 = loc(callsite(#loc85 at #loc20)) +#loc108 = loc(callsite(#loc90 at #loc20)) +#loc109 = loc(callsite(#loc92 at #loc20)) +#loc110 = loc(callsite(#loc84 at #loc79)) +#loc111 = loc(callsite(#loc85 at #loc79)) +#loc112 = loc(callsite(#loc90 at #loc79)) +#loc113 = loc(callsite(#loc92 at #loc79)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2d77ffef6e2338394300c0582be0afe0575b833f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d1d64c1a299c4909ff182125a5f772c1657a031d Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ba053c60a955b83d36a1f6296a066d6581f27134 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "1b178f5dfe84e5bcbc00bc20de563fd4427bac43d930736557805e3c7c715574", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..2207c4dbc87fd5f68d7a9dfb63b7837a458e1af7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 2, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 96, !dbg !9 + %12 = lshr exact i32 %11, 5, !dbg !9 + %13 = and i32 %10, 3, !dbg !9 + %14 = or disjoint i32 %12, %9, !dbg !10 + %15 = or disjoint i32 %9, %13, !dbg !10 + %16 = shl nuw nsw i32 %10, 2, !dbg !11 + %17 = and i32 %16, 124, !dbg !11 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %18, 2048, !dbg !13 + %.decomposed = sub i32 %14, %19, !dbg !13 + %20 = srem i32 %18, 32, !dbg !14 + %21 = sdiv i32 %14, 65536, !dbg !15 + %22 = shl nsw i32 %20, 7, !dbg !16 + %23 = shl nsw i32 %.decomposed, 12, !dbg !17 + %24 = shl i32 %21, 23, !dbg !18 + %25 = or disjoint i32 %23, %17, !dbg !19 + %26 = add i32 %25, %24, !dbg !20 + %27 = add i32 %26, %22, !dbg !21 + %28 = sext i32 %27 to i64, !dbg !22 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !22 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !23 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !23 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !23 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !23 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !23 + %36 = shl i32 %14, 7, !dbg !24 + %37 = or disjoint i32 %36, %17, !dbg !25 + %38 = sext i32 %37 to i64, !dbg !26 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !26 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 true) #4, !dbg !27 + %42 = extractvalue { i32, i32 } %41, 0, !dbg !27 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !27 + %44 = extractvalue { i32, i32 } %41, 1, !dbg !27 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !27 + %46 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !28 + %47 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !29 + %48 = fmul <2 x float> %46, %47, !dbg !30 + %49 = fadd <2 x float> %48, zeroinitializer, !dbg !31 + %50 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !28 + %51 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !29 + %52 = fmul <2 x float> %50, %51, !dbg !30 + %53 = fadd <2 x float> %52, zeroinitializer, !dbg !31 + %shift = shufflevector <2 x float> %49, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop = fadd <2 x float> %49, %shift, !dbg !32 + %foldExtExtBinop2 = fadd <2 x float> %53, %foldExtExtBinop, !dbg !32 + %shift4 = shufflevector <2 x float> %53, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop5 = fadd <2 x float> %shift4, %foldExtExtBinop2, !dbg !32 + %54 = extractelement <2 x float> %foldExtExtBinop5, i64 0, !dbg !32 + %55 = bitcast float %54 to i32, !dbg !36 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 16, i32 31), !dbg !36 + %57 = bitcast i32 %56 to float, !dbg !36 + %58 = fadd float %54, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !36 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !36 + %61 = bitcast i32 %60 to float, !dbg !36 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %62 to i32, !dbg !36 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 4, i32 31), !dbg !36 + %65 = bitcast i32 %64 to float, !dbg !36 + %66 = fadd float %62, %65, !dbg !32 + %67 = bitcast float %66 to i32, !dbg !36 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 2, i32 31), !dbg !36 + %69 = bitcast i32 %68 to float, !dbg !36 + %70 = fadd float %66, %69, !dbg !32 + %71 = bitcast float %70 to i32, !dbg !36 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %73 = bitcast i32 %72 to float, !dbg !36 + %74 = fadd float %70, %73, !dbg !32 + %75 = lshr exact i32 %11, 3, !dbg !37 + %76 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %75, !dbg !37 + store float %74, ptr addrspace(3) %76, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %77 = shl nuw nsw i32 %13, 2, !dbg !37 + %78 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %77, !dbg !37 + %79 = load i32, ptr addrspace(3) %78, align 4, !dbg !37 + %80 = sext i32 %15 to i64, !dbg !38 + %81 = getelementptr float, ptr addrspace(1) %2, i64 %80, !dbg !38 + %82 = and i32 %10, 124, !dbg !39 + %83 = icmp eq i32 %82, 0, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %81, i1 %83) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 39, column: 41, scope: !4) +!20 = !DILocation(line: 39, column: 50, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 45, scope: !4) +!25 = !DILocation(line: 40, column: 41, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 39, column: 127, scope: !4) +!29 = !DILocation(line: 40, column: 104, scope: !4) +!30 = !DILocation(line: 41, column: 22, scope: !4) +!31 = !DILocation(line: 43, column: 23, scope: !4) +!32 = !DILocation(line: 261, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !34, discriminator: 0) +!34 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!35 = !DILocation(line: 45, column: 25, scope: !4) +!36 = !DILocation(line: 291, column: 36, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 45, column: 28, scope: !4) +!38 = !DILocation(line: 49, column: 25, scope: !4) +!39 = !DILocation(line: 49, column: 36, scope: !4) +!40 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..19d4fe1ec07f756a6410ed130e0bf37e67502a20 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<72>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:33 + shl.b32 %r11, %r10, 2; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + mov.u32 %r12, %tid.x; + and.b32 %r13, %r12, 96; + bfe.u32 %r14, %r12, 5, 2; + and.b32 %r15, %r12, 3; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r16, %r14, %r11; + or.b32 %r17, %r11, %r15; + .loc 1 26 37 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:26:37 + shl.b32 %r18, %r12, 2; + and.b32 %r19, %r18, 124; + .loc 1 29 21 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:21 + bfe.s32 %r20, %r10, 29, 1; + shr.u32 %r21, %r20, 21; + add.s32 %r22, %r16, %r21; + shr.s32 %r23, %r22, 11; + .loc 1 28 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:28:19 + and.b32 %r24, %r22, 1046528; + sub.s32 %r25, %r16, %r24; + .loc 1 29 29 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:29 + shr.u32 %r26, %r23, 27; + add.s32 %r27, %r23, %r26; + and.b32 %r28, %r27, 33554400; + sub.s32 %r29, %r23, %r28; + .loc 1 30 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:30:19 + shr.u32 %r30, %r20, 16; + add.s32 %r31, %r16, %r30; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shl.b32 %r32, %r29, 7; + .loc 1 39 55 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:55 + shl.b32 %r33, %r25, 12; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r34, %r31, 7; + and.b32 %r35, %r34, -8388608; + .loc 1 39 41 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:41 + or.b32 %r36, %r33, %r19; + .loc 1 39 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:50 + add.s32 %r37, %r36, %r35; + .loc 1 39 60 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:60 + add.s32 %r38, %r37, %r32; + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + mad.wide.s32 %rd2, %r38, 2, %rd8; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:45 + shl.b32 %r39, %r16, 7; + .loc 1 40 41 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:41 + or.b32 %r40, %r39, %r19; + .loc 1 40 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:34 + mad.wide.s32 %rd5, %r40, 2, %rd9; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, %r3; + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r5, %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r41, %rs1; + cvt.f32.bf16 %r42, %rs2; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r43, %rs3; + cvt.f32.bf16 %r44, %rs4; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r45, %r42, %r44, 0f00000000; + fma.rn.f32 %r46, %r41, %r43, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r47, %rs5; + cvt.f32.bf16 %r48, %rs6; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r49, %rs7; + cvt.f32.bf16 %r50, %rs8; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r51, %r48, %r50, 0f00000000; + fma.rn.f32 %r52, %r47, %r49, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r53, %r46, %r45; + add.f32 %r54, %r52, %r53; + add.f32 %r55, %r51, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r57, %r55, %r56; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r59, %r57, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r61, %r59, %r60; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r63, %r61, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r65, %r63, %r64; +$L__tmp2: + .loc 1 45 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:28 + shr.u32 %r66, %r13, 3; + mov.b32 %r67, global_smem; + add.s32 %r68, %r67, %r66; + st.shared.b32 [%r68], %r65; + bar.sync 0; + shl.b32 %r69, %r15, 2; + add.s32 %r70, %r67, %r69; + ld.shared.b32 %r9, [%r70]; + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.s32 %rd7, %r17, 4, %rd10; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + and.b32 %r71, %r12, 124; + setp.eq.b32 %p3, %r71, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r9 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..03455f0c1ba0cfaff4e44af5fd714fd269173305 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 4 : i32 loc(#loc61) + %xoffset_3 = arith.constant 4 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<4x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<4x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<4x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<4x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<4x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<4x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<4x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<4x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<4x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<4x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<4x128xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<4x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<4x128xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<4x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<4x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<4x128xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<4x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<4x128xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<4x128xf32> to tensor<4x128xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<4x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<4x128xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<4x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<4x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<4x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<4x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S4_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x128xf32> loc("input"(#loc44))) -> tensor<4xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc45) + tt.return %0 : tensor<4xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4xf32> loc(#loc48) + tt.return %1 : tensor<4xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..dae93f7b105d8717b5598ca26ee9b34eef3c2dba --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,142 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("xnumel"(#loc)) +#loc40 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp4"(#loc30)) +#loc71 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_8 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %xindex_9 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc43) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc43) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc43) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc44) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<4x1xi32, #blocked1> loc(#loc44) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc47) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<4x1xi32, #blocked> loc(#loc48) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc49) + %r0_mask = arith.cmpi slt, %r0_base_16, %cst : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc51) + %tmp0_18 = tt.broadcast %r0_base_16 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_19 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_21 = arith.muli %x0, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc53) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.muli %x2, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc55) + %tmp0_25 = tt.broadcast %tmp0_24 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_26 = arith.addi %tmp0_23, %tmp0_25 : tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc57) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc57) + %tmp0_29 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc58) + %tmp0_30 = tt.load %tmp0_28, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc58) + %tmp0_31 = arith.extf %tmp0_30 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc60) + %tmp1_32 = tt.broadcast %tmp1 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_33 = arith.addi %tmp0_18, %tmp1_32 : tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc62) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc62) + %tmp1_36 = tt.load %tmp1_35, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc63) + %tmp1_37 = arith.extf %tmp1_36 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc64) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<4x128xf32, #blocked> loc(#loc65) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<4x128xf32, #blocked> loc(#loc66) + %_tmp4 = arith.select %tmp0_29, %tmp5, %cst_7 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc67) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_40: f32 loc(callsite(#loc1 at #loc68)), %tmp4_41: f32 loc(callsite(#loc1 at #loc68))): + %tmp4_42 = arith.addf %tmp4_40, %tmp4_41 : f32 loc(#loc72) + tt.reduce.return %tmp4_42 : f32 loc(#loc70) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc70) + %tmp4_38 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp4_39 = tt.expand_dims %tmp4_38 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc69) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc33) + %1 = tt.addptr %0, %xindex_15 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc33) + tt.store %1, %tmp4_39 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("r0_mask"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp1"(#loc21)) +#loc61 = loc("tmp1"(#loc22)) +#loc62 = loc("tmp1"(#loc23)) +#loc63 = loc("tmp1"(#loc24)) +#loc64 = loc("tmp1"(#loc25)) +#loc65 = loc("tmp2"(#loc26)) +#loc66 = loc("tmp5"(#loc27)) +#loc67 = loc("_tmp4"(#loc28)) +#loc69 = loc("tmp4"(#loc32)) +#loc70 = loc(callsite(#loc29 at #loc68)) +#loc72 = loc(callsite(#loc31 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a226b52751ecb363981de2e609114e207724830d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir @@ -0,0 +1,139 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc32)) +#loc75 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<8388608> : tensor<4x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<4x1xi32> loc(#loc43) + %x1 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc44) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_6 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc47) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc48) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<4x1xi32> loc(#loc49) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<4x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc51) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc52) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc53) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<4x1xi32> loc(#loc44) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<4x1xi32> loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_10, %cst_3 : tensor<1x128xi32> loc(#loc54) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<4x1xi32> loc(#loc55) + %tmp0_14 = tt.broadcast %r0_base_10 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32> loc(#loc56) + %tmp0_17 = arith.muli %x0, %cst_1 : tensor<4x1xi32> loc(#loc57) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc58) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32> loc(#loc58) + %tmp0_20 = arith.muli %x2_13, %cst_0 : tensor<4x1xi32> loc(#loc59) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc60) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<4x128xi32> loc(#loc60) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc61) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc61) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc62) + %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc62) + %tmp0_27 = arith.extf %tmp0_26 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc63) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<4x1xi32> loc(#loc64) + %tmp1_28 = tt.broadcast %tmp1 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65) + %tmp1_29 = arith.addi %tmp0_14, %tmp1_28 : tensor<4x128xi32> loc(#loc65) + %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc66) + %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc66) + %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc67) + %tmp1_33 = arith.extf %tmp1_32 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<4x128xf32> loc(#loc69) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32> loc(#loc70) + %_tmp4 = arith.select %tmp0_25, %tmp5, %cst_4 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_35: f32 loc(callsite(#loc1 at #loc72)), %tmp4_36: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_37 = arith.addf %tmp4_35, %tmp4_36 : f32 loc(#loc76) + tt.reduce.return %tmp4_37 : f32 loc(#loc74) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc74) + %tmp4_34 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_9 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc35) + tt.store %1, %tmp4_34 : tensor<4x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc43 = loc("x2"(#loc2)) +#loc44 = loc("x1"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xoffset"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xindex"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("x0"(#loc11)) +#loc53 = loc("x1"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp0"(#loc21)) +#loc63 = loc("tmp0"(#loc22)) +#loc64 = loc("tmp1"(#loc23)) +#loc65 = loc("tmp1"(#loc24)) +#loc66 = loc("tmp1"(#loc25)) +#loc67 = loc("tmp1"(#loc26)) +#loc68 = loc("tmp1"(#loc27)) +#loc69 = loc("tmp2"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc("_tmp4"(#loc30)) +#loc73 = loc("tmp4"(#loc34)) +#loc74 = loc(callsite(#loc31 at #loc72)) +#loc76 = loc(callsite(#loc33 at #loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a37802557526b39d231884f5cd669d6cacdd9094 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7d41d3cd698ad31d457c5e11e75036206deb2111 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7482635e0a135773ff95fc77fbf09c0c21dfdb9a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "26988e7e0d3675248c1ba6c8a3b7e7d9311001a5f4fedec8b3d0b29c3420b09b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..fe136f8eabbe21c91db2d5711b5466c5edf70c76 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,393 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = icmp samesign ult i32 %8, 128, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 15, !dbg !9 + %12 = and i32 %8, 15, !dbg !10 + %13 = lshr i32 %8, 4, !dbg !11 + %14 = mul nuw nsw i32 %11, 17, !dbg !12 + %15 = add nuw nsw i32 %14, %12, !dbg !13 + %16 = mul i32 %13, 272, !dbg !14 + %17 = add i32 %15, %16, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %22 = lshr i32 %10, 1, !dbg !18 + %.lobit = and i32 %22, 1, !dbg !18 + %23 = and i32 %10, 1, !dbg !18 + %24 = lshr i32 %10, 2, !dbg !18 + %.lobit1 = and i32 %24, 1, !dbg !18 + %25 = lshr i32 %10, 3, !dbg !18 + %.lobit2 = and i32 %25, 1, !dbg !18 + %26 = xor i32 %23, 1, !dbg !22 + %27 = xor i32 %.lobit, 1, !dbg !22 + %28 = xor i32 %.lobit1, 1, !dbg !22 + %29 = xor i32 %.lobit2, 1, !dbg !22 + %30 = mul nuw nsw i32 %20, %26, !dbg !23 + %31 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %30, i32 1, i32 31), !dbg !24 + %32 = add i32 %31, %30, !dbg !27 + %33 = mul nuw nsw i32 %20, %23, !dbg !28 + %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 1, i32 31), !dbg !24 + %35 = add i32 %34, %33, !dbg !27 + %36 = mul nuw nsw i32 %26, %11, !dbg !29 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 1, i32 31), !dbg !24 + %38 = add i32 %37, %36, !dbg !27 + %39 = mul nuw nsw i32 %11, %23, !dbg !30 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !24 + %41 = add i32 %40, %39, !dbg !27 + %42 = trunc i32 %22 to i1, !dbg !31 + %43 = icmp sge i32 %32, %35, !dbg !31 + %44 = icmp ne i32 %32, %35, !dbg !31 + %45 = icmp sle i32 %38, %41, !dbg !31 + %46 = or i1 %44, %45, !dbg !31 + %47 = and i1 %43, %46, !dbg !31 + %.not = xor i1 %47, %42, !dbg !31 + %48 = xor i32 %35, %32, !dbg !32 + %49 = select i1 %.not, i32 0, i32 %48, !dbg !33 + %50 = xor i32 %49, %20, !dbg !34 + %51 = xor i32 %41, %38, !dbg !35 + %52 = select i1 %.not, i32 0, i32 %51, !dbg !36 + %53 = xor i32 %52, %11, !dbg !37 + %54 = mul nuw nsw i32 %50, %27, !dbg !23 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 2, i32 31), !dbg !24 + %56 = add i32 %54, %55, !dbg !27 + %57 = mul nuw nsw i32 %50, %.lobit, !dbg !28 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !24 + %59 = add i32 %57, %58, !dbg !27 + %60 = mul nuw nsw i32 %53, %27, !dbg !29 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !24 + %62 = add i32 %60, %61, !dbg !27 + %63 = mul nuw nsw i32 %53, %.lobit, !dbg !30 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !24 + %65 = add i32 %63, %64, !dbg !27 + %66 = trunc i32 %24 to i1, !dbg !31 + %67 = icmp sge i32 %56, %59, !dbg !31 + %68 = icmp ne i32 %56, %59, !dbg !31 + %69 = icmp sle i32 %62, %65, !dbg !31 + %70 = or i1 %68, %69, !dbg !31 + %71 = and i1 %67, %70, !dbg !31 + %.not3 = xor i1 %71, %66, !dbg !31 + %72 = xor i32 %56, %59, !dbg !32 + %73 = select i1 %.not3, i32 0, i32 %72, !dbg !33 + %74 = xor i32 %73, %50, !dbg !34 + %75 = xor i32 %62, %65, !dbg !35 + %76 = select i1 %.not3, i32 0, i32 %75, !dbg !36 + %77 = xor i32 %76, %53, !dbg !37 + %78 = mul nuw nsw i32 %74, %26, !dbg !23 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !24 + %80 = add i32 %78, %79, !dbg !27 + %81 = mul nuw nsw i32 %74, %23, !dbg !28 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 1, i32 31), !dbg !24 + %83 = add i32 %81, %82, !dbg !27 + %84 = mul nuw nsw i32 %77, %26, !dbg !29 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !24 + %86 = add i32 %84, %85, !dbg !27 + %87 = mul nuw nsw i32 %77, %23, !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !24 + %89 = add i32 %87, %88, !dbg !27 + %90 = icmp sge i32 %80, %83, !dbg !31 + %91 = icmp ne i32 %80, %83, !dbg !31 + %92 = icmp sle i32 %86, %89, !dbg !31 + %93 = or i1 %91, %92, !dbg !31 + %94 = and i1 %90, %93, !dbg !31 + %.not4 = xor i1 %94, %66, !dbg !31 + %95 = xor i32 %80, %83, !dbg !32 + %96 = select i1 %.not4, i32 0, i32 %95, !dbg !33 + %97 = xor i32 %96, %74, !dbg !34 + %98 = xor i32 %86, %89, !dbg !35 + %99 = select i1 %.not4, i32 0, i32 %98, !dbg !36 + %100 = xor i32 %99, %77, !dbg !37 + %101 = mul nuw nsw i32 %97, %28, !dbg !23 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 4, i32 31), !dbg !24 + %103 = add i32 %101, %102, !dbg !27 + %104 = mul nuw nsw i32 %97, %.lobit1, !dbg !28 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !24 + %106 = add i32 %104, %105, !dbg !27 + %107 = mul nuw nsw i32 %100, %28, !dbg !29 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !24 + %109 = add i32 %107, %108, !dbg !27 + %110 = mul nuw nsw i32 %100, %.lobit1, !dbg !30 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !24 + %112 = add i32 %110, %111, !dbg !27 + %113 = trunc i32 %25 to i1, !dbg !31 + %114 = icmp sge i32 %103, %106, !dbg !31 + %115 = icmp ne i32 %103, %106, !dbg !31 + %116 = icmp sle i32 %109, %112, !dbg !31 + %117 = or i1 %115, %116, !dbg !31 + %118 = and i1 %114, %117, !dbg !31 + %.not5 = xor i1 %118, %113, !dbg !31 + %119 = xor i32 %103, %106, !dbg !32 + %120 = select i1 %.not5, i32 0, i32 %119, !dbg !33 + %121 = xor i32 %120, %97, !dbg !34 + %122 = xor i32 %109, %112, !dbg !35 + %123 = select i1 %.not5, i32 0, i32 %122, !dbg !36 + %124 = xor i32 %123, %100, !dbg !37 + %125 = mul nuw nsw i32 %121, %27, !dbg !23 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 2, i32 31), !dbg !24 + %127 = add i32 %125, %126, !dbg !27 + %128 = mul nuw nsw i32 %121, %.lobit, !dbg !28 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !24 + %130 = add i32 %128, %129, !dbg !27 + %131 = mul nuw nsw i32 %124, %27, !dbg !29 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !24 + %133 = add i32 %131, %132, !dbg !27 + %134 = mul nuw nsw i32 %124, %.lobit, !dbg !30 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !24 + %136 = add i32 %134, %135, !dbg !27 + %137 = icmp sge i32 %127, %130, !dbg !31 + %138 = icmp ne i32 %127, %130, !dbg !31 + %139 = icmp sle i32 %133, %136, !dbg !31 + %140 = or i1 %138, %139, !dbg !31 + %141 = and i1 %137, %140, !dbg !31 + %.not6 = xor i1 %141, %113, !dbg !31 + %142 = xor i32 %127, %130, !dbg !32 + %143 = select i1 %.not6, i32 0, i32 %142, !dbg !33 + %144 = xor i32 %143, %121, !dbg !34 + %145 = xor i32 %133, %136, !dbg !35 + %146 = select i1 %.not6, i32 0, i32 %145, !dbg !36 + %147 = xor i32 %146, %124, !dbg !37 + %148 = mul nuw nsw i32 %144, %26, !dbg !23 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 1, i32 31), !dbg !24 + %150 = add i32 %148, %149, !dbg !27 + %151 = mul nuw nsw i32 %144, %23, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 1, i32 31), !dbg !24 + %153 = add i32 %151, %152, !dbg !27 + %154 = mul nuw nsw i32 %147, %26, !dbg !29 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !24 + %156 = add i32 %154, %155, !dbg !27 + %157 = mul nuw nsw i32 %147, %23, !dbg !30 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !24 + %159 = add i32 %157, %158, !dbg !27 + %160 = icmp sge i32 %150, %153, !dbg !31 + %161 = icmp ne i32 %150, %153, !dbg !31 + %162 = icmp sle i32 %156, %159, !dbg !31 + %163 = or i1 %161, %162, !dbg !31 + %164 = and i1 %160, %163, !dbg !31 + %.not7 = xor i1 %164, %113, !dbg !31 + %165 = xor i32 %150, %153, !dbg !32 + %166 = select i1 %.not7, i32 0, i32 %165, !dbg !33 + %167 = xor i32 %166, %144, !dbg !34 + %168 = xor i32 %156, %159, !dbg !35 + %169 = select i1 %.not7, i32 0, i32 %168, !dbg !36 + %170 = xor i32 %169, %147, !dbg !37 + %171 = mul nuw nsw i32 %167, %29, !dbg !23 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 8, i32 31), !dbg !24 + %173 = add i32 %171, %172, !dbg !27 + %174 = mul nuw nsw i32 %167, %.lobit2, !dbg !28 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 8, i32 31), !dbg !24 + %176 = add i32 %174, %175, !dbg !27 + %177 = mul nuw nsw i32 %170, %29, !dbg !29 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !24 + %179 = add i32 %177, %178, !dbg !27 + %180 = mul nuw nsw i32 %170, %.lobit2, !dbg !30 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !24 + %182 = add i32 %180, %181, !dbg !27 + %183 = icmp slt i32 %173, %176, !dbg !38 + %184 = icmp eq i32 %173, %176, !dbg !39 + %185 = icmp sgt i32 %179, %182, !dbg !40 + %186 = and i1 %184, %185, !dbg !41 + %187 = or i1 %183, %186, !dbg !42 + %188 = xor i32 %173, %176, !dbg !32 + %189 = select i1 %187, i32 %188, i32 0, !dbg !33 + %190 = xor i32 %189, %167, !dbg !34 + %191 = xor i32 %179, %182, !dbg !35 + %192 = select i1 %187, i32 %191, i32 0, !dbg !36 + %193 = xor i32 %192, %170, !dbg !37 + %194 = mul nuw nsw i32 %190, %28, !dbg !23 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !24 + %196 = add i32 %194, %195, !dbg !27 + %197 = mul nuw nsw i32 %190, %.lobit1, !dbg !28 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !24 + %199 = add i32 %197, %198, !dbg !27 + %200 = mul nuw nsw i32 %193, %28, !dbg !29 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !24 + %202 = add i32 %200, %201, !dbg !27 + %203 = mul nuw nsw i32 %193, %.lobit1, !dbg !30 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !24 + %205 = add i32 %203, %204, !dbg !27 + %206 = icmp slt i32 %196, %199, !dbg !38 + %207 = icmp eq i32 %196, %199, !dbg !39 + %208 = icmp sgt i32 %202, %205, !dbg !40 + %209 = and i1 %207, %208, !dbg !41 + %210 = or i1 %206, %209, !dbg !42 + %211 = xor i32 %196, %199, !dbg !32 + %212 = select i1 %210, i32 %211, i32 0, !dbg !33 + %213 = xor i32 %212, %190, !dbg !34 + %214 = xor i32 %202, %205, !dbg !35 + %215 = select i1 %210, i32 %214, i32 0, !dbg !36 + %216 = xor i32 %215, %193, !dbg !37 + %217 = mul nuw nsw i32 %213, %27, !dbg !23 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 2, i32 31), !dbg !24 + %219 = add i32 %217, %218, !dbg !27 + %220 = mul nuw nsw i32 %213, %.lobit, !dbg !28 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !24 + %222 = add i32 %220, %221, !dbg !27 + %223 = mul nuw nsw i32 %216, %27, !dbg !29 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !24 + %225 = add i32 %223, %224, !dbg !27 + %226 = mul nuw nsw i32 %216, %.lobit, !dbg !30 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !24 + %228 = add i32 %226, %227, !dbg !27 + %229 = icmp slt i32 %219, %222, !dbg !38 + %230 = icmp eq i32 %219, %222, !dbg !39 + %231 = icmp sgt i32 %225, %228, !dbg !40 + %232 = and i1 %230, %231, !dbg !41 + %233 = or i1 %229, %232, !dbg !42 + %234 = xor i32 %219, %222, !dbg !32 + %235 = select i1 %233, i32 %234, i32 0, !dbg !33 + %236 = xor i32 %235, %213, !dbg !34 + %237 = xor i32 %225, %228, !dbg !35 + %238 = select i1 %233, i32 %237, i32 0, !dbg !36 + %239 = xor i32 %238, %216, !dbg !37 + %240 = mul nuw nsw i32 %236, %26, !dbg !23 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !24 + %242 = add i32 %240, %241, !dbg !27 + %243 = mul nuw nsw i32 %236, %23, !dbg !28 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !24 + %245 = add i32 %243, %244, !dbg !27 + %246 = mul nuw nsw i32 %239, %26, !dbg !29 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !24 + %248 = add i32 %246, %247, !dbg !27 + %249 = mul nuw nsw i32 %239, %23, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !24 + %251 = add i32 %249, %250, !dbg !27 + %252 = icmp slt i32 %242, %245, !dbg !38 + %253 = icmp eq i32 %242, %245, !dbg !39 + %254 = icmp sgt i32 %248, %251, !dbg !40 + %255 = and i1 %253, %254, !dbg !41 + %256 = or i1 %252, %255, !dbg !42 + %257 = xor i32 %248, %251, !dbg !35 + %258 = select i1 %256, i32 %257, i32 0, !dbg !36 + %259 = xor i32 %258, %239, !dbg !37 + %narrow = select i1 %9, i32 %21, i32 0, !dbg !43 + %260 = sext i32 %narrow to i64, !dbg !43 + %261 = ashr i32 %narrow, 31, !dbg !44 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %narrow, i32 8, i32 31), !dbg !44 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !44 + %264 = insertelement <2 x i32> poison, i32 %262, i64 0, !dbg !44 + %265 = insertelement <2 x i32> %264, i32 %263, i64 1, !dbg !44 + %266 = bitcast <2 x i32> %265 to i64, !dbg !44 + %267 = add i64 %266, %260, !dbg !46 + %extelt.offset = lshr i64 %267, 32, !dbg !44 + %268 = trunc nuw i64 %extelt.offset to i32, !dbg !44 + %269 = trunc i64 %267 to i32, !dbg !44 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !44 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !44 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !44 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !44 + %274 = bitcast <2 x i32> %273 to i64, !dbg !44 + %275 = add i64 %267, %274, !dbg !46 + %extelt.offset8 = lshr i64 %275, 32, !dbg !44 + %276 = trunc nuw i64 %extelt.offset8 to i32, !dbg !44 + %277 = trunc i64 %275 to i32, !dbg !44 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 2, i32 31), !dbg !44 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !44 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !44 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !44 + %282 = bitcast <2 x i32> %281 to i64, !dbg !44 + %283 = add i64 %275, %282, !dbg !46 + %extelt.offset9 = lshr i64 %283, 32, !dbg !44 + %284 = trunc nuw i64 %extelt.offset9 to i32, !dbg !44 + %285 = trunc i64 %283 to i32, !dbg !44 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 1, i32 31), !dbg !44 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 1, i32 31), !dbg !44 + %288 = insertelement <2 x i32> poison, i32 %286, i64 0, !dbg !44 + %289 = insertelement <2 x i32> %288, i32 %287, i64 1, !dbg !44 + %290 = bitcast <2 x i32> %289 to i64, !dbg !44 + %291 = add i64 %283, %290, !dbg !46 + %292 = trunc i64 %291 to i32, !dbg !47 + %293 = shl i32 %8, 4, !dbg !48 + %294 = or disjoint i32 %11, %293, !dbg !49 + %295 = sext i32 %294 to i64, !dbg !50 + %296 = getelementptr i32, ptr addrspace(1) %1, i64 %295, !dbg !50 + %297 = and i32 %10, 48, !dbg !51 + %298 = icmp eq i32 %297, 0, !dbg !51 + %299 = and i1 %9, %298, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %259, ptr addrspace(1) %296, i1 %299) #3, !dbg !51 + %300 = zext nneg i32 %8 to i64, !dbg !52 + %301 = getelementptr i32, ptr addrspace(1) %2, i64 %300, !dbg !52 + %302 = and i32 %10, 63, !dbg !53 + %303 = icmp eq i32 %302, 0, !dbg !53 + %304 = and i1 %9, %303, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %292, ptr addrspace(1) %301, i1 %304) #3, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 21, scope: !4) +!9 = !DILocation(line: 27, column: 38, scope: !4) +!10 = !DILocation(line: 33, column: 19, scope: !4) +!11 = !DILocation(line: 34, column: 19, scope: !4) +!12 = !DILocation(line: 36, column: 38, scope: !4) +!13 = !DILocation(line: 36, column: 35, scope: !4) +!14 = !DILocation(line: 36, column: 49, scope: !4) +!15 = !DILocation(line: 36, column: 45, scope: !4) +!16 = !DILocation(line: 36, column: 30, scope: !4) +!17 = !DILocation(line: 36, column: 54, scope: !4) +!18 = !DILocation(line: 627, column: 44, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 41, column: 67, scope: !4) +!22 = !DILocation(line: 537, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 538, column: 40, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !21) +!25 = distinct !DILexicalBlockFile(scope: !4, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!27 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !21) +!28 = !DILocation(line: 539, column: 41, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 548, column: 23, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 551, column: 23, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 599, column: 28, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 600, column: 38, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 600, column: 46, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 600, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 601, column: 48, scope: !19, inlinedAt: !21) +!36 = !DILocation(line: 601, column: 59, scope: !19, inlinedAt: !21) +!37 = !DILocation(line: 601, column: 22, scope: !19, inlinedAt: !21) +!38 = !DILocation(line: 574, column: 22, scope: !19, inlinedAt: !21) +!39 = !DILocation(line: 591, column: 21, scope: !19, inlinedAt: !21) +!40 = !DILocation(line: 594, column: 40, scope: !19, inlinedAt: !21) +!41 = !DILocation(line: 594, column: 29, scope: !19, inlinedAt: !21) +!42 = !DILocation(line: 594, column: 23, scope: !19, inlinedAt: !21) +!43 = !DILocation(line: 44, column: 34, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !45) +!45 = !DILocation(line: 45, column: 26, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !45) +!47 = !DILocation(line: 48, column: 21, scope: !4) +!48 = !DILocation(line: 49, column: 35, scope: !4) +!49 = !DILocation(line: 49, column: 32, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 47, scope: !4) +!52 = !DILocation(line: 50, column: 25, scope: !4) +!53 = !DILocation(line: 50, column: 37, scope: !4) +!54 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..03eceaee442f42a50f150dcb5089c7f5b2deeb09 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,863 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<64>; + .reg .b32 %r<224>; + .reg .b64 %rd<26>; + .loc 1 18 0 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:18:0 + +// %bb.0: + ld.param.b64 %rd5, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:24:28 + mov.u32 %r5, %ctaid.x; + .loc 1 26 21 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:26:21 + setp.lt.u32 %p1, %r5, 128; + ld.param.b64 %rd7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 27 38 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:27:38 + mov.u32 %r6, %tid.x; + and.b32 %r7, %r6, 15; + .loc 1 33 19 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:33:19 + and.b32 %r8, %r5, 15; + .loc 1 34 19 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:34:19 + shr.u32 %r9, %r5, 4; + .loc 1 36 35 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:35 + mad.lo.s32 %r10, %r7, 17, %r8; + .loc 1 36 45 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:45 + mad.lo.s32 %r11, %r9, 272, %r10; + .loc 1 36 30 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:30 + mad.wide.s32 %rd1, %r11, 4, %rd5; + .loc 1 36 54 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shr.u32 %r12, %r6, 1; + bfe.u32 %r13, %r6, 1, 1; + and.b32 %r14, %r6, 1; + shr.u32 %r15, %r6, 2; + bfe.u32 %r16, %r6, 2, 1; + shr.u32 %r17, %r6, 3; + bfe.u32 %r18, %r6, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r19, %r14, 1; + xor.b32 %r20, %r13, 1; + xor.b32 %r21, %r16, 1; + xor.b32 %r22, %r18, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r23, %r1, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r24, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r25, %r24, %r23; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r26, %r1, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r27, %r26, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r28, %r27, %r26; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r29, %r19, %r7; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r31, %r30, %r29; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r32, %r7, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r34, %r33, %r32; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.b32 %r35, %r12, 1; + setp.ne.b32 %p5, %r35, 0; + setp.ge.s32 %p6, %r25, %r28; + setp.ne.b32 %p7, %r25, %r28; + setp.le.s32 %p8, %r31, %r34; + or.pred %p9, %p7, %p8; + and.pred %p10, %p6, %p9; + xor.pred %p11, %p10, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r36, %r28, %r25; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r37, 0, %r36, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r38, %r37, %r1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r39, %r34, %r31; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r40, 0, %r39, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r41, %r40, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r42, %r38, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r43, %r42, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r44, %r42, %r43; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r45, %r38, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r47, %r45, %r46; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r48, %r41, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r49, %r48, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r50, %r48, %r49; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r51, %r41, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r52, %r51, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r53, %r51, %r52; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.b32 %r54, %r15, 1; + setp.ne.b32 %p12, %r54, 0; + setp.ge.s32 %p13, %r44, %r47; + setp.ne.b32 %p14, %r44, %r47; + setp.le.s32 %p15, %r50, %r53; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r55, %r44, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r56, 0, %r55, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r57, %r56, %r38; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r58, %r50, %r53; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r59, 0, %r58, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r60, %r59, %r41; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r61, %r57, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r63, %r61, %r62; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r64, %r57, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r66, %r64, %r65; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r67, %r60, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r68, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r69, %r67, %r68; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r70, %r60, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r72, %r70, %r71; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.ge.s32 %p19, %r63, %r66; + setp.ne.b32 %p20, %r63, %r66; + setp.le.s32 %p21, %r69, %r72; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r73, %r63, %r66; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r74, 0, %r73, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r75, %r74, %r57; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r76, %r69, %r72; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r77, 0, %r76, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r78, %r77, %r60; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r79, %r75, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r80, %r79, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r81, %r79, %r80; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r82, %r75, %r16; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r84, %r82, %r83; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r85, %r78, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r86, %r85, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r87, %r85, %r86; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r88, %r78, %r16; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r90, %r88, %r89; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.b32 %r91, %r17, 1; + setp.ne.b32 %p25, %r91, 0; + setp.ge.s32 %p26, %r81, %r84; + setp.ne.b32 %p27, %r81, %r84; + setp.le.s32 %p28, %r87, %r90; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r92, %r81, %r84; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r93, 0, %r92, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r94, %r93, %r75; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r95, %r87, %r90; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r96, 0, %r95, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r97, %r96, %r78; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r98, %r94, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r100, %r98, %r99; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r101, %r94, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r103, %r101, %r102; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r104, %r97, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r105, %r104, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r106, %r104, %r105; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r107, %r97, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r108, %r107, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r109, %r107, %r108; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.ge.s32 %p32, %r100, %r103; + setp.ne.b32 %p33, %r100, %r103; + setp.le.s32 %p34, %r106, %r109; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r110, %r100, %r103; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r111, 0, %r110, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r112, %r111, %r94; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r113, %r106, %r109; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r114, 0, %r113, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r115, %r114, %r97; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r116, %r112, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r118, %r116, %r117; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r119, %r112, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r121, %r119, %r120; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r122, %r115, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r124, %r122, %r123; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r125, %r115, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r127, %r125, %r126; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.ge.s32 %p38, %r118, %r121; + setp.ne.b32 %p39, %r118, %r121; + setp.le.s32 %p40, %r124, %r127; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r128, %r118, %r121; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r129, 0, %r128, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r130, %r129, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r131, %r124, %r127; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r132, 0, %r131, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r133, %r132, %r115; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r134, %r130, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r136, %r134, %r135; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r137, %r130, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r138, %r137, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r139, %r137, %r138; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r140, %r133, %r22; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r142, %r140, %r141; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r143, %r133, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r144, %r143, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r145, %r143, %r144; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p44, %r136, %r139; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p45, %r136, %r139; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p46, %r142, %r145; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r146, %r136, %r139; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r147, %r146, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r148, %r147, %r130; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r149, %r142, %r145; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r150, %r149, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r151, %r150, %r133; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r152, %r148, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r153, %r152, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r154, %r152, %r153; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r155, %r148, %r16; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r155, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r157, %r155, %r156; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r158, %r151, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r159, %r158, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r160, %r158, %r159; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r161, %r151, %r16; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r162, %r161, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r163, %r161, %r162; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p49, %r154, %r157; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p50, %r154, %r157; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p51, %r160, %r163; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r164, %r154, %r157; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r165, %r164, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r166, %r165, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r167, %r160, %r163; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r168, %r167, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r169, %r168, %r151; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r170, %r166, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r171, %r170, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r172, %r170, %r171; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r173, %r166, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r174, %r173, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r175, %r173, %r174; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r176, %r169, %r20; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r177, %r176, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r178, %r176, %r177; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r179, %r169, %r13; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r180, %r179, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r181, %r179, %r180; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p54, %r172, %r175; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p55, %r172, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p56, %r178, %r181; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r182, %r172, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r183, %r182, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r184, %r183, %r166; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r185, %r178, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r186, %r185, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r187, %r186, %r169; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r188, %r184, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r189, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r190, %r188, %r189; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r191, %r184, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r193, %r191, %r192; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r194, %r187, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r195, %r194, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r196, %r194, %r195; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r197, %r187, %r14; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shfl.sync.bfly.b32 %r198, %r197, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r199, %r197, %r198; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p59, %r190, %r193; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p60, %r190, %r193; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p61, %r196, %r199; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r200, %r196, %r199; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r201, %r200, 0, %p61; + selp.b32 %r202, %r201, 0, %p60; + selp.b32 %r203, %r200, %r202, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r3, %r203, %r187; +$L__tmp2: + .loc 1 44 34 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:44:34 + selp.b32 %r204, %r2, 0, %p1; + cvt.s64.s32 %rd8, %r204; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + shr.s32 %r205, %r204, 31; + shfl.sync.bfly.b32 %r206, %r204, 8, 31, -1; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + cvt.u64.u32 %rd9, %r206; + cvt.u64.u32 %rd10, %r207; + shl.b64 %rd11, %rd10, 32; + or.b64 %rd12, %rd9, %rd11; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd13, %rd12, %rd8; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + mov.b64 {_, %r208}, %rd13; + cvt.u32.u64 %r209, %rd13; + shfl.sync.bfly.b32 %r210, %r209, 4, 31, -1; + shfl.sync.bfly.b32 %r211, %r208, 4, 31, -1; + cvt.u64.u32 %rd14, %r210; + cvt.u64.u32 %rd15, %r211; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + mov.b64 {_, %r212}, %rd18; + cvt.u32.u64 %r213, %rd18; + shfl.sync.bfly.b32 %r214, %r213, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r212, 2, 31, -1; + cvt.u64.u32 %rd19, %r214; + cvt.u64.u32 %rd20, %r215; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd23, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + mov.b64 {_, %r216}, %rd23; + cvt.u32.u64 %r217, %rd23; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + shfl.sync.bfly.b32 %r219, %r216, 1, 31, -1; + cvt.u64.u32 %rd24, %r218; + .loc 3 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd25, %rd23, %rd24; +$L__tmp4: + .loc 1 48 21 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:48:21 + cvt.u32.u64 %r4, %rd25; + .loc 1 49 35 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:35 + shl.b32 %r220, %r5, 4; + .loc 1 49 32 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:32 + or.b32 %r221, %r7, %r220; + .loc 1 49 25 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:25 + mad.wide.s32 %rd3, %r221, 4, %rd6; + .loc 1 49 47 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:47 + and.b32 %r222, %r6, 48; + setp.eq.b32 %p62, %r222, 0; + and.pred %p3, %p1, %p62; + // begin inline asm + @%p3 st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 50 25 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:25 + mad.wide.u32 %rd4, %r5, 4, %rd7; + .loc 1 50 37 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:37 + and.b32 %r223, %r6, 63; + setp.eq.b32 %p63, %r223, 0; + and.pred %p4, %p1, %p63; + // begin inline asm + @%p4 st.global.b32 [ %rd4 + 0 ], { %r4 }; + // end inline asm + .loc 1 50 4 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 121 +.b8 120 +.b8 103 +.b8 111 +.b8 116 +.b8 105 +.b8 104 +.b8 111 +.b8 120 +.b8 112 +.b8 110 +.b8 54 +.b8 111 +.b8 53 +.b8 120 +.b8 97 +.b8 52 +.b8 106 +.b8 118 +.b8 107 +.b8 99 +.b8 121 +.b8 55 +.b8 115 +.b8 104 +.b8 108 +.b8 103 +.b8 110 +.b8 121 +.b8 118 +.b8 52 +.b8 52 +.b8 117 +.b8 54 +.b8 100 +.b8 112 +.b8 109 +.b8 53 +.b8 101 +.b8 55 +.b8 52 +.b8 54 +.b8 102 +.b8 54 +.b8 100 +.b8 119 +.b8 103 +.b8 55 +.b8 111 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 121 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..de3a5e86197377fcef690f19197d611ea7fb9237 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1216 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc87 = loc(unknown) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc139 = loc("in_ptr0"(#loc)) +#loc140 = loc("out_ptr2"(#loc)) +#loc141 = loc("out_ptr3"(#loc)) +#loc142 = loc("xnumel"(#loc)) +#loc143 = loc("r0_numel"(#loc)) +#loc172 = loc("x"(#loc37)) +#loc173 = loc("idxs"(#loc37)) +#loc174 = loc("x"(#loc41)) +#loc175 = loc("idxs"(#loc41)) +#loc180 = loc("x"(#loc49)) +#loc181 = loc("idxs"(#loc49)) +#loc182 = loc("flip"(#loc49)) +#loc238 = loc("input"(#loc112)) +#loc239 = loc("a"(#loc116)) +#loc240 = loc("b"(#loc116)) +#loc242 = loc("x"(#loc121)) +#loc243 = loc("x"(#loc125)) +#loc244 = loc("input"(#loc134)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc144) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc145) + %xoffset = tt.get_program_id x : i32 loc(#loc146) + %xoffset_2 = arith.constant 1 : i32 loc(#loc147) + %xoffset_3 = arith.constant 1 : i32 loc(#loc147) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc147) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc148) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc149) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc150) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc150) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc151) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc151) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc152) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc153) + %r0_offset = arith.constant 0 : i32 loc(#loc154) + %r0_mask = arith.constant true loc(#loc155) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc155) + %x0 = arith.constant 16 : i32 loc(#loc156) + %x0_11 = arith.constant 16 : i32 loc(#loc156) + %x0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc156) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<1x1xi32> loc(#loc156) + %x1 = arith.constant 16 : i32 loc(#loc157) + %x1_14 = arith.constant 16 : i32 loc(#loc157) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc157) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<1x1xi32> loc(#loc157) + %tmp0 = arith.constant 17 : i32 loc(#loc158) + %tmp0_17 = arith.constant 17 : i32 loc(#loc158) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc158) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc158) + %tmp0_20 = tt.broadcast %x0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc159) + %tmp0_21 = arith.addi %tmp0_20, %tmp0_19 : tensor<1x16xi32> loc(#loc159) + %tmp0_22 = arith.constant 272 : i32 loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc160) + %tmp0_24 = arith.constant dense<272> : tensor<1x1xi32> loc(#loc160) + %tmp0_25 = arith.muli %tmp0_24, %x1_16 : tensor<1x1xi32> loc(#loc160) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc161) + %tmp0_27 = arith.addi %tmp0_21, %tmp0_26 : tensor<1x16xi32> loc(#loc161) + %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc162) + %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc162) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp0_31 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc163) + %tmp0_32 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc163) + %tmp0_33 = arith.fptosi %tmp0_32 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc163) + %tmp0_34 = tt.load %tmp0_29, %tmp0_31, %tmp0_33 : tensor<1x16x!tt.ptr> loc(#loc163) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc164) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_34, %tmp2) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc22) + %tmp7 = arith.extsi %tmp0_34 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc165) + %tmp10 = arith.constant 0 : i32 loc(#loc166) + %tmp10_35 = arith.constant 0 : i64 loc(#loc166) + %tmp10_36 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc166) + %tmp10_37 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc166) + %tmp10_38 = arith.select %tmp10_37, %tmp7, %tmp10_36 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc166) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_38) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc167) + %tmp11_39 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc168) + %tmp12 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc169) + %tmp13 = arith.trunci %tmp12 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc170) + %tmp14 = arith.trunci %tmp11_39 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc171) + %c16_i32 = arith.constant 16 : i32 loc(#loc30) + %c16_i32_40 = arith.constant 16 : i32 loc(#loc30) + %cst = arith.constant dense<16> : tensor<1x1xi32> loc(#loc30) + %1 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc30) + %2 = tt.broadcast %1 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc31) + %3 = arith.addi %r0_index_9, %2 : tensor<1x16xi32> loc(#loc31) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc32) + %5 = tt.addptr %4, %3 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc32) + %6 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc33) + tt.store %5, %tmp13, %6 : tensor<1x16x!tt.ptr> loc(#loc33) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc34) + %8 = tt.addptr %7, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc34) + tt.store %8, %tmp14, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc35) + tt.return loc(#loc36) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc37)), %idxs: tensor<1x16xi16> loc("idxs"(#loc37))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc39) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc40) + %5 = ub.poison : tensor<1x16xi32> loc(#loc40) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc40) + } loc(#loc37) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi16> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc48) + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi16> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc197) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc198) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc199) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc199) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc203) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc204) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc204) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc215) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc215) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc216) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc221) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc222) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc223) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc224) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc225) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc226) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc227) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc228) + %cond_38 = arith.constant 0 : i32 loc(#loc229) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc234) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc235) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc236) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc237) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc116)), %b: i32 loc("b"(#loc116))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc117) + tt.return %0 : i32 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc119) + tt.return %1 : i32 loc(#loc119) + } loc(#loc116) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc241) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc121))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc122) + %false = arith.constant false loc(#loc123) + tt.return %false : i1 loc(#loc123) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc124) + tt.return %1 : i1 loc(#loc124) + } loc(#loc121) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc125))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc126) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc127) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc127) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc127) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc127) + tt.return %4 : tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %5 : tensor<1x16xi32> loc(#loc129) + } loc(#loc125) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc131) + %cst = arith.constant dense : tensor<1xi1> loc(#loc131) + tt.return %cst : tensor<1xi1> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc133) + tt.return %0 : tensor<1xi1> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc134))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc135) + tt.return %0 : tensor<1x16xi32> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc137) + tt.return %1 : tensor<1x16xi32> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc131) + tt.return %cst : tensor<1x16xi32> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc133) + tt.return %0 : tensor<1x16xi32> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc134))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc135) + tt.return %0 : tensor<1x16xi16> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc137) + tt.return %1 : tensor<1x16xi16> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc131) + tt.return %cst : tensor<1x16xi16> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc133) + tt.return %0 : tensor<1x16xi16> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc112))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc113) + tt.return %0 : tensor<4x2xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc115) + tt.return %1 : tensor<4x2xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc112))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc113) + tt.return %0 : tensor<2x4xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc115) + tt.return %1 : tensor<2x4xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc245) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + %5 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc112))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc113) + tt.return %0 : tensor<1x8xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc115) + tt.return %1 : tensor<1x8xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc112))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc113) + tt.reduce.return %2 : i64 loc(#loc113) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc113) + tt.return %0 : tensor<1xi64> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc115) + tt.return %1 : tensor<1xi64> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc116)), %b: i64 loc("b"(#loc116))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc117) + tt.return %0 : i64 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc119) + tt.return %1 : i64 loc(#loc119) + } loc(#loc116) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":46:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":47:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc144 = loc("xnumel"(#loc1)) +#loc145 = loc("r0_numel"(#loc2)) +#loc146 = loc("xoffset"(#loc3)) +#loc147 = loc("xoffset"(#loc4)) +#loc148 = loc("xindex"(#loc5)) +#loc149 = loc("xindex"(#loc6)) +#loc150 = loc("xindex"(#loc7)) +#loc151 = loc("xmask"(#loc8)) +#loc152 = loc("r0_index"(#loc9)) +#loc153 = loc("r0_index"(#loc10)) +#loc154 = loc("r0_offset"(#loc11)) +#loc155 = loc("r0_mask"(#loc12)) +#loc156 = loc("x0"(#loc13)) +#loc157 = loc("x1"(#loc14)) +#loc158 = loc("tmp0"(#loc15)) +#loc159 = loc("tmp0"(#loc16)) +#loc160 = loc("tmp0"(#loc17)) +#loc161 = loc("tmp0"(#loc18)) +#loc162 = loc("tmp0"(#loc19)) +#loc163 = loc("tmp0"(#loc20)) +#loc164 = loc("tmp2"(#loc21)) +#loc165 = loc("tmp7"(#loc23)) +#loc166 = loc("tmp10"(#loc24)) +#loc167 = loc("tmp11"(#loc25)) +#loc168 = loc("tmp11"(#loc26)) +#loc169 = loc("tmp12"(#loc27)) +#loc170 = loc("tmp13"(#loc28)) +#loc171 = loc("tmp14"(#loc29)) +#loc176 = loc("flip"(#loc42)) +#loc177 = loc("flip"(#loc43)) +#loc178 = loc("flip"(#loc44)) +#loc179 = loc("flip"(#loc45)) +#loc183 = loc("y"(#loc50)) +#loc184 = loc("right_mask"(#loc51)) +#loc185 = loc("right_mask"(#loc52)) +#loc186 = loc("left_mask"(#loc53)) +#loc187 = loc("ileft"(#loc54)) +#loc188 = loc("ileft"(#loc55)) +#loc189 = loc("ileft"(#loc56)) +#loc190 = loc("ileft"(#loc57)) +#loc191 = loc("iright"(#loc58)) +#loc192 = loc("iright"(#loc59)) +#loc193 = loc("iright"(#loc60)) +#loc194 = loc("iright"(#loc61)) +#loc195 = loc("ileft"(#loc62)) +#loc196 = loc("iright"(#loc63)) +#loc197 = loc("y_idx"(#loc64)) +#loc198 = loc("left_idx"(#loc65)) +#loc199 = loc("left_idx"(#loc66)) +#loc200 = loc("left_idx"(#loc67)) +#loc201 = loc("left_idx"(#loc68)) +#loc202 = loc("left_idx"(#loc69)) +#loc203 = loc("right_idx"(#loc70)) +#loc204 = loc("right_idx"(#loc71)) +#loc205 = loc("right_idx"(#loc72)) +#loc206 = loc("right_idx"(#loc73)) +#loc207 = loc("right_idx"(#loc74)) +#loc208 = loc("left_idx"(#loc75)) +#loc209 = loc("right_idx"(#loc76)) +#loc210 = loc("left_valid_mask"(#loc77)) +#loc211 = loc("right_valid_mask"(#loc78)) +#loc212 = loc("left_isnan"(#loc79)) +#loc213 = loc("right_isnan"(#loc80)) +#loc214 = loc("cond"(#loc81)) +#loc215 = loc("cond"(#loc84)) +#loc216 = loc("cond"(#loc85)) +#loc217 = loc("cond"(#loc86)) +#loc218 = loc("eq"(#loc88)) +#loc219 = loc("eq"(#loc91)) +#loc220 = loc("eq"(#loc92)) +#loc221 = loc("cond"(#loc93)) +#loc222 = loc("cond"(#loc94)) +#loc223 = loc("cond"(#loc95)) +#loc224 = loc("cond"(#loc96)) +#loc225 = loc("cond"(#loc97)) +#loc226 = loc("cond"(#loc98)) +#loc227 = loc("cond"(#loc99)) +#loc228 = loc("cond"(#loc100)) +#loc229 = loc("cond"(#loc101)) +#loc230 = loc("ret"(#loc102)) +#loc231 = loc("ret"(#loc103)) +#loc232 = loc("ret"(#loc104)) +#loc233 = loc("ret"(#loc105)) +#loc234 = loc("new_idxs"(#loc106)) +#loc235 = loc("new_idxs"(#loc107)) +#loc236 = loc("new_idxs"(#loc108)) +#loc237 = loc("new_idxs"(#loc109)) +#loc241 = loc("input"(#loc120)) +#loc245 = loc("flip"(#loc138)) +#loc246 = loc("cond"(#loc214)) +#loc247 = loc("cond"(#loc217)) +#loc248 = loc("eq"(#loc218)) +#loc249 = loc("eq"(#loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5cbb79e7973fb549781ce683396cb78de7b0425e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,812 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc73 = loc("in_ptr0"(#loc)) +#loc74 = loc("out_ptr2"(#loc)) +#loc75 = loc("out_ptr3"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc91 = loc(callsite(#loc15 at #loc16)) +#loc97 = loc("ileft"(#loc24)) +#loc101 = loc("iright"(#loc29)) +#loc110 = loc("left_idx"(#loc38)) +#loc115 = loc("right_idx"(#loc43)) +#loc135 = loc("tmp11"(#loc63)) +#loc145 = loc(callsite(#loc20 at #loc91)) +#loc149 = loc(callsite(#loc1 at #loc135)) +#loc153 = loc(callsite(#loc97 at #loc145)) +#loc157 = loc(callsite(#loc101 at #loc145)) +#loc165 = loc(callsite(#loc110 at #loc145)) +#loc170 = loc(callsite(#loc115 at #loc145)) +#loc190 = loc(callsite(#loc1 at #loc153)) +#loc192 = loc(callsite(#loc1 at #loc157)) +#loc195 = loc(callsite(#loc1 at #loc165)) +#loc198 = loc(callsite(#loc1 at #loc170)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c272_i32 = arith.constant 272 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc79) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc80) + %r0_index_8 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc80) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc80) + %r0_index_10 = tt.expand_dims %r0_index_8 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc80) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc81) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc82) + %tmp0 = arith.muli %r0_index_9, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc83) + %tmp0_11 = arith.muli %r0_index_10, %cst : tensor<1x16xi32, #blocked> loc(#loc83) + %tmp0_12 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc139) + %tmp0_13 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc139) + %tmp0_14 = arith.addi %tmp0_12, %tmp0 : tensor<1x16xi32, #blocked5> loc(#loc84) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_11 : tensor<1x16xi32, #blocked> loc(#loc84) + %tmp0_16 = arith.muli %x1, %c272_i32 : i32 loc(#loc85) + %tmp0_17 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc140) + %tmp0_18 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked> loc(#loc140) + %tmp0_19 = arith.addi %tmp0_14, %tmp0_17 : tensor<1x16xi32, #blocked5> loc(#loc86) + %tmp0_20 = arith.addi %tmp0_15, %tmp0_18 : tensor<1x16xi32, #blocked> loc(#loc86) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc87) + %tmp0_23 = tt.addptr %tmp0_21, %tmp0_19 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc87) + %tmp0_24 = tt.addptr %tmp0_22, %tmp0_20 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc87) + %tmp0_25 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc141) + %tmp0_26 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc141) + %tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_6 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc88) + %tmp0_28 = tt.load %tmp0_24, %tmp0_25, %cst_4 : tensor<1x16x!tt.ptr, #blocked> loc(#loc88) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc89) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc142) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc142) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc142) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc142) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc142) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc142) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc142) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc142) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc142) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc142) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc142) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc142) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc143) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y = tt.reshape %tmp0_27 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %left_mask = arith.subi %cst_5, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc151) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc151) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc151) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc151) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc162) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc163) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc164) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc164) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc168) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc169) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc169) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc196) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_75 = arith.xori %tmp0_27, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_77 = arith.extsi %tmp2 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc187) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc143) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc143) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc150) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc189) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc154) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc155) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc156) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc191) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc158) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc159) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc162) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc164) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc194) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc166) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc167) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc169) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc197) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc171) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc172) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc179) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc187) + %tmp7 = arith.extsi %tmp0_28 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc133) + %tmp10 = arith.select %tmp0_25, %tmp7, %cst_0 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc134) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_396: i64 loc(callsite(#loc1 at #loc135)), %tmp11_397: i64 loc(callsite(#loc1 at #loc135))): + %tmp11_398 = arith.addi %tmp11_396, %tmp11_397 : i64 loc(#loc188) + tt.reduce.return %tmp11_398 : i64 loc(#loc148) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc148) + %tmp11_395 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc136) + %tmp14 = arith.trunci %tmp11_395 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc137) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc66) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc138) + %2 = arith.addi %r0_index_9, %1 : tensor<1x16xi32, #blocked5> loc(#loc67) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc68) + tt.store %4, %new_idxs_394, %tmp0_26 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc69) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc70) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + %7 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc71) + tt.store %6, %tmp14, %7 : tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + tt.return loc(#loc72) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_index"(#loc4)) +#loc81 = loc("x0"(#loc5)) +#loc82 = loc("x1"(#loc6)) +#loc83 = loc("tmp0"(#loc7)) +#loc84 = loc("tmp0"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp2"(#loc13)) +#loc90 = loc("flip"(#loc14)) +#loc92 = loc("flip"(#loc17)) +#loc93 = loc("flip"(#loc18)) +#loc94 = loc("y"(#loc19)) +#loc95 = loc("left_mask"(#loc21)) +#loc96 = loc("ileft"(#loc22)) +#loc98 = loc("ileft"(#loc26)) +#loc99 = loc("ileft"(#loc27)) +#loc100 = loc("iright"(#loc28)) +#loc102 = loc("iright"(#loc30)) +#loc103 = loc("iright"(#loc31)) +#loc104 = loc("ileft"(#loc32)) +#loc105 = loc("iright"(#loc33)) +#loc106 = loc("y_idx"(#loc34)) +#loc107 = loc("left_idx"(#loc35)) +#loc108 = loc("left_idx"(#loc36)) +#loc109 = loc("input"(#loc37)) +#loc111 = loc("left_idx"(#loc39)) +#loc112 = loc("left_idx"(#loc40)) +#loc113 = loc("right_idx"(#loc41)) +#loc114 = loc("right_idx"(#loc42)) +#loc116 = loc("right_idx"(#loc44)) +#loc117 = loc("right_idx"(#loc45)) +#loc118 = loc("left_idx"(#loc46)) +#loc119 = loc("right_idx"(#loc47)) +#loc120 = loc("cond"(#loc48)) +#loc121 = loc("eq"(#loc49)) +#loc122 = loc("cond"(#loc50)) +#loc123 = loc("cond"(#loc51)) +#loc124 = loc("cond"(#loc52)) +#loc125 = loc("cond"(#loc53)) +#loc126 = loc("cond"(#loc54)) +#loc127 = loc("ret"(#loc55)) +#loc128 = loc("ret"(#loc56)) +#loc129 = loc("ret"(#loc57)) +#loc130 = loc("new_idxs"(#loc58)) +#loc131 = loc("new_idxs"(#loc59)) +#loc132 = loc("new_idxs"(#loc60)) +#loc133 = loc("tmp7"(#loc61)) +#loc134 = loc("tmp10"(#loc62)) +#loc136 = loc("tmp11"(#loc64)) +#loc137 = loc("tmp14"(#loc65)) +#loc138 = loc(fused[#loc67, #loc66]) +#loc139 = loc(fused[#loc84, #loc81]) +#loc140 = loc(fused[#loc86, #loc85]) +#loc141 = loc(fused[#loc88, #loc79]) +#loc142 = loc(callsite(#loc90 at #loc91)) +#loc143 = loc(callsite(#loc92 at #loc91)) +#loc144 = loc(callsite(#loc93 at #loc91)) +#loc146 = loc("cond"(#loc120)) +#loc147 = loc("eq"(#loc121)) +#loc148 = loc(callsite(#loc23 at #loc135)) +#loc150 = loc(callsite(#loc94 at #loc145)) +#loc151 = loc(callsite(#loc95 at #loc145)) +#loc152 = loc(callsite(#loc96 at #loc145)) +#loc154 = loc(callsite(#loc98 at #loc145)) +#loc155 = loc(callsite(#loc99 at #loc145)) +#loc156 = loc(callsite(#loc100 at #loc145)) +#loc158 = loc(callsite(#loc102 at #loc145)) +#loc159 = loc(callsite(#loc103 at #loc145)) +#loc160 = loc(callsite(#loc104 at #loc145)) +#loc161 = loc(callsite(#loc105 at #loc145)) +#loc162 = loc(callsite(#loc106 at #loc145)) +#loc163 = loc(callsite(#loc107 at #loc145)) +#loc164 = loc(callsite(#loc108 at #loc145)) +#loc166 = loc(callsite(#loc111 at #loc145)) +#loc167 = loc(callsite(#loc112 at #loc145)) +#loc168 = loc(callsite(#loc113 at #loc145)) +#loc169 = loc(callsite(#loc114 at #loc145)) +#loc171 = loc(callsite(#loc116 at #loc145)) +#loc172 = loc(callsite(#loc117 at #loc145)) +#loc173 = loc(callsite(#loc118 at #loc145)) +#loc174 = loc(callsite(#loc119 at #loc145)) +#loc175 = loc(callsite(#loc146 at #loc145)) +#loc176 = loc(callsite(#loc147 at #loc145)) +#loc177 = loc(callsite(#loc122 at #loc145)) +#loc178 = loc(callsite(#loc123 at #loc145)) +#loc179 = loc(callsite(#loc124 at #loc145)) +#loc180 = loc(callsite(#loc125 at #loc145)) +#loc181 = loc(callsite(#loc126 at #loc145)) +#loc182 = loc(callsite(#loc127 at #loc145)) +#loc183 = loc(callsite(#loc128 at #loc145)) +#loc184 = loc(callsite(#loc129 at #loc145)) +#loc185 = loc(callsite(#loc130 at #loc145)) +#loc186 = loc(callsite(#loc131 at #loc145)) +#loc187 = loc(callsite(#loc132 at #loc145)) +#loc188 = loc(callsite(#loc25 at #loc148)) +#loc189 = loc(callsite(#loc23 at #loc153)) +#loc191 = loc(callsite(#loc23 at #loc157)) +#loc193 = loc(callsite(#loc109 at #loc165)) +#loc194 = loc(callsite(#loc23 at #loc165)) +#loc196 = loc(callsite(#loc109 at #loc170)) +#loc197 = loc(callsite(#loc23 at #loc170)) +#loc199 = loc(callsite(#loc25 at #loc189)) +#loc200 = loc(callsite(#loc25 at #loc191)) +#loc201 = loc(callsite(#loc25 at #loc194)) +#loc202 = loc(callsite(#loc25 at #loc197)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2c7308da348d6635a34deeed89432b477d5858a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E2MI47QNGZ2SJDA3U3EKHN7H3EYRAANF6T7N5SFT2CZJYNBAWCNQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,784 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc2 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr2"(#loc)) +#loc77 = loc("out_ptr3"(#loc)) +#loc78 = loc("xnumel"(#loc)) +#loc79 = loc("r0_numel"(#loc)) +#loc96 = loc(callsite(#loc18 at #loc4)) +#loc103 = loc("ileft"(#loc27)) +#loc107 = loc("iright"(#loc32)) +#loc116 = loc("left_idx"(#loc41)) +#loc121 = loc("right_idx"(#loc46)) +#loc140 = loc("tmp11"(#loc65)) +#loc151 = loc(callsite(#loc23 at #loc96)) +#loc155 = loc(callsite(#loc2 at #loc140)) +#loc159 = loc(callsite(#loc103 at #loc151)) +#loc163 = loc(callsite(#loc107 at #loc151)) +#loc171 = loc(callsite(#loc116 at #loc151)) +#loc176 = loc(callsite(#loc121 at #loc151)) +#loc196 = loc(callsite(#loc2 at #loc159)) +#loc198 = loc(callsite(#loc2 at #loc163)) +#loc201 = loc(callsite(#loc2 at #loc171)) +#loc204 = loc(callsite(#loc2 at #loc176)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant 272 : i32 loc(#loc80) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %xmask = arith.constant 128 : i32 loc(#loc81) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc82) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc2) + %tmp10 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc83) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xmask_2 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc81) + %xmask_3 = tt.splat %xmask_2 : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc86) + %r0_index_4 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc87) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc88) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc89) + %tmp0_5 = arith.muli %r0_index_4, %tmp0_1 : tensor<1x16xi32> loc(#loc84) + %tmp0_6 = tt.splat %x0 : i32 -> tensor<1x16xi32> loc(#loc144) + %tmp0_7 = arith.addi %tmp0_6, %tmp0_5 : tensor<1x16xi32> loc(#loc90) + %tmp0_8 = arith.muli %x1, %tmp0 : i32 loc(#loc80) + %tmp0_9 = tt.splat %tmp0_8 : i32 -> tensor<1x16xi32> loc(#loc145) + %tmp0_10 = arith.addi %tmp0_7, %tmp0_9 : tensor<1x16xi32> loc(#loc91) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc92) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc92) + %tmp0_13 = tt.splat %xmask_2 : i1 -> tensor<1x16xi1> loc(#loc146) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 : tensor<1x16x!tt.ptr> loc(#loc93) + %tmp2 = arith.trunci %r0_index_4 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc94) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc147) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc148) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc148) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc149) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc150) + %y = tt.reshape %tmp0_14 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc157) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc158) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc162) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc168) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc169) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc170) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc170) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc199) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc174) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc175) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc175) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc202) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc181) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc182) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc183) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc184) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc185) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc186) + %cond_47 = arith.cmpi ne, %cond_46, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc188) + %ret_48 = arith.select %cond_47, %ret, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_49 = arith.xori %tmp0_14, %ret_48 : tensor<1x16xi32> loc(#loc190) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc191) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_51 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc193) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc193) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc149) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc150) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc158) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc181) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc182) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc183) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc184) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc185) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_85 = arith.cmpi ne, %cond_84, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc188) + %ret_87 = arith.select %cond_85, %ret_86, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc190) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc191) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc193) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc181) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc182) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc183) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc184) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc185) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_121 = arith.cmpi ne, %cond_120, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc188) + %ret_123 = arith.select %cond_121, %ret_122, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc190) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc191) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc193) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc149) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc150) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc158) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc181) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc182) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc183) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc184) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc185) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_160 = arith.cmpi ne, %cond_159, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc188) + %ret_162 = arith.select %cond_160, %ret_161, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc190) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc191) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc193) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc181) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc182) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc183) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc184) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc185) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_196 = arith.cmpi ne, %cond_195, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc188) + %ret_198 = arith.select %cond_196, %ret_197, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc190) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc191) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc193) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc181) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc182) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc183) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc184) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc185) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_232 = arith.cmpi ne, %cond_231, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc188) + %ret_234 = arith.select %cond_232, %ret_233, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc190) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc191) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc193) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc156) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc158) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc158) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc195) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc160) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc161) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc162) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc197) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc164) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc165) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc168) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc170) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc172) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc173) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc175) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc203) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc177) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc178) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc181) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc182) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc183) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc184) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc185) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc188) + %ret_268 = arith.select %cond_266, %ret_267, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc190) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc191) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc193) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc181) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc182) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc183) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc184) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc185) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc188) + %ret_301 = arith.select %cond_299, %ret_300, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc190) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc191) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc193) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc181) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc182) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc183) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc184) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc185) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc188) + %ret_334 = arith.select %cond_332, %ret_333, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc190) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc191) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc193) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc181) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc182) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc183) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc184) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc185) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc191) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc193) + %tmp7 = arith.extsi %tmp0_14 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc139) + %tmp10_369 = arith.select %tmp0_13, %tmp7, %tmp10 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc83) + %tmp11 = "tt.reduce"(%tmp10_369) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_371: i64 loc(callsite(#loc2 at #loc140)), %tmp11_372: i64 loc(callsite(#loc2 at #loc140))): + %tmp11_373 = arith.addi %tmp11_371, %tmp11_372 : i64 loc(#loc194) + tt.reduce.return %tmp11_373 : i64 loc(#loc154) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc154) + %tmp11_370 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc141) + %tmp14 = arith.trunci %tmp11_370 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc142) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc68) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32> loc(#loc143) + %2 = arith.addi %r0_index_4, %1 : tensor<1x16xi32> loc(#loc69) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc70) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc70) + tt.store %4, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc71) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc72) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc72) + tt.store %6, %tmp14, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc73) + tt.return loc(#loc74) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc80 = loc("tmp0"(#loc1)) +#loc81 = loc("xmask"(#loc3)) +#loc82 = loc(callsite(#loc2 at #loc4)) +#loc83 = loc("tmp10"(#loc5)) +#loc84 = loc("tmp0"(#loc6)) +#loc85 = loc("xoffset"(#loc7)) +#loc86 = loc("r0_index"(#loc8)) +#loc87 = loc("r0_index"(#loc9)) +#loc88 = loc("x0"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("tmp0"(#loc12)) +#loc91 = loc("tmp0"(#loc13)) +#loc92 = loc("tmp0"(#loc14)) +#loc93 = loc("tmp0"(#loc15)) +#loc94 = loc("tmp2"(#loc16)) +#loc95 = loc("flip"(#loc17)) +#loc97 = loc("flip"(#loc19)) +#loc98 = loc("flip"(#loc20)) +#loc99 = loc("flip"(#loc21)) +#loc100 = loc("y"(#loc22)) +#loc101 = loc("left_mask"(#loc24)) +#loc102 = loc("ileft"(#loc25)) +#loc104 = loc("ileft"(#loc29)) +#loc105 = loc("ileft"(#loc30)) +#loc106 = loc("iright"(#loc31)) +#loc108 = loc("iright"(#loc33)) +#loc109 = loc("iright"(#loc34)) +#loc110 = loc("ileft"(#loc35)) +#loc111 = loc("iright"(#loc36)) +#loc112 = loc("y_idx"(#loc37)) +#loc113 = loc("left_idx"(#loc38)) +#loc114 = loc("left_idx"(#loc39)) +#loc115 = loc("input"(#loc40)) +#loc117 = loc("left_idx"(#loc42)) +#loc118 = loc("left_idx"(#loc43)) +#loc119 = loc("right_idx"(#loc44)) +#loc120 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc47)) +#loc123 = loc("right_idx"(#loc48)) +#loc124 = loc("left_idx"(#loc49)) +#loc125 = loc("right_idx"(#loc50)) +#loc126 = loc("cond"(#loc51)) +#loc127 = loc("eq"(#loc52)) +#loc128 = loc("cond"(#loc53)) +#loc129 = loc("cond"(#loc54)) +#loc130 = loc("cond"(#loc55)) +#loc131 = loc("cond"(#loc56)) +#loc132 = loc("cond"(#loc57)) +#loc133 = loc("ret"(#loc58)) +#loc134 = loc("ret"(#loc59)) +#loc135 = loc("ret"(#loc60)) +#loc136 = loc("new_idxs"(#loc61)) +#loc137 = loc("new_idxs"(#loc62)) +#loc138 = loc("new_idxs"(#loc63)) +#loc139 = loc("tmp7"(#loc64)) +#loc141 = loc("tmp11"(#loc66)) +#loc142 = loc("tmp14"(#loc67)) +#loc143 = loc(fused[#loc69, #loc68]) +#loc144 = loc(fused[#loc90, #loc88]) +#loc145 = loc(fused[#loc91, #loc80]) +#loc146 = loc(fused[#loc93, #loc81]) +#loc147 = loc(callsite(#loc95 at #loc96)) +#loc148 = loc(callsite(#loc97 at #loc96)) +#loc149 = loc(callsite(#loc98 at #loc96)) +#loc150 = loc(callsite(#loc99 at #loc96)) +#loc152 = loc("cond"(#loc126)) +#loc153 = loc("eq"(#loc127)) +#loc154 = loc(callsite(#loc26 at #loc140)) +#loc156 = loc(callsite(#loc100 at #loc151)) +#loc157 = loc(callsite(#loc101 at #loc151)) +#loc158 = loc(callsite(#loc102 at #loc151)) +#loc160 = loc(callsite(#loc104 at #loc151)) +#loc161 = loc(callsite(#loc105 at #loc151)) +#loc162 = loc(callsite(#loc106 at #loc151)) +#loc164 = loc(callsite(#loc108 at #loc151)) +#loc165 = loc(callsite(#loc109 at #loc151)) +#loc166 = loc(callsite(#loc110 at #loc151)) +#loc167 = loc(callsite(#loc111 at #loc151)) +#loc168 = loc(callsite(#loc112 at #loc151)) +#loc169 = loc(callsite(#loc113 at #loc151)) +#loc170 = loc(callsite(#loc114 at #loc151)) +#loc172 = loc(callsite(#loc117 at #loc151)) +#loc173 = loc(callsite(#loc118 at #loc151)) +#loc174 = loc(callsite(#loc119 at #loc151)) +#loc175 = loc(callsite(#loc120 at #loc151)) +#loc177 = loc(callsite(#loc122 at #loc151)) +#loc178 = loc(callsite(#loc123 at #loc151)) +#loc179 = loc(callsite(#loc124 at #loc151)) +#loc180 = loc(callsite(#loc125 at #loc151)) +#loc181 = loc(callsite(#loc152 at #loc151)) +#loc182 = loc(callsite(#loc153 at #loc151)) +#loc183 = loc(callsite(#loc128 at #loc151)) +#loc184 = loc(callsite(#loc129 at #loc151)) +#loc185 = loc(callsite(#loc130 at #loc151)) +#loc186 = loc(callsite(#loc131 at #loc151)) +#loc187 = loc(callsite(#loc132 at #loc151)) +#loc188 = loc(callsite(#loc133 at #loc151)) +#loc189 = loc(callsite(#loc134 at #loc151)) +#loc190 = loc(callsite(#loc135 at #loc151)) +#loc191 = loc(callsite(#loc136 at #loc151)) +#loc192 = loc(callsite(#loc137 at #loc151)) +#loc193 = loc(callsite(#loc138 at #loc151)) +#loc194 = loc(callsite(#loc28 at #loc154)) +#loc195 = loc(callsite(#loc26 at #loc159)) +#loc197 = loc(callsite(#loc26 at #loc163)) +#loc199 = loc(callsite(#loc115 at #loc171)) +#loc200 = loc(callsite(#loc26 at #loc171)) +#loc202 = loc(callsite(#loc115 at #loc176)) +#loc203 = loc(callsite(#loc26 at #loc176)) +#loc205 = loc(callsite(#loc28 at #loc195)) +#loc206 = loc(callsite(#loc28 at #loc197)) +#loc207 = loc(callsite(#loc28 at #loc200)) +#loc208 = loc(callsite(#loc28 at #loc203)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9400e2ecb6f593f54d9cdc6e5e25269979a2c0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7017abea6b3931b9108906098ca16fb3b0dbaa58 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..eb915de1356bb9ad1711f66e34b69762aa88cb96 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"hash": "27bb8aff3b18f49fa9e29ff75f90ab73a22e8c0efd66e42e5178f47960fed60f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..41b4373aadff51320b391094958134823820271c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,280 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %12 = icmp slt i32 %11, %6, !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %14 = and i32 %13, 384, !dbg !9 + %15 = zext nneg i32 %11 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !11 + %16 = sdiv i64 %15, %.frozen, !dbg !11 + %17 = mul i64 %16, %.frozen, !dbg !10 + %.decomposed = sub i64 %15, %17, !dbg !10 + %.lhs.trunc = trunc nsw i64 %16 to i32, !dbg !12 + %18 = srem i32 %.lhs.trunc, 16, !dbg !12 + %.sext = sext i32 %18 to i64, !dbg !12 + %19 = sdiv i64 %15, %5, !dbg !13 + %20 = shl nuw nsw i64 %.decomposed, 7, !dbg !14 + %21 = shl nsw i64 %.sext, 7, !dbg !15 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !16 + %23 = and i32 %13, 127 + %24 = zext nneg i32 %23 to i64 + %25 = or disjoint i64 %20, %24 + %26 = icmp slt i64 %25, %4 + %27 = and i1 %12, %26 + %28 = icmp samesign ugt i64 %.decomposed, 15 + %29 = and i64 %25, 2047 + %30 = sub nsw i64 %.decomposed, %.sext + %31 = shl nsw i64 %30, 7 + %32 = select i1 %12, i1 %26, i1 false + %33 = zext nneg i32 %14 to i64, !dbg !17 + %34 = zext nneg i32 %13 to i64, !dbg !17 + %35 = insertelement <2 x i1> poison, i1 %32, i64 0, !dbg !18 + %36 = shufflevector <2 x i1> %35, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !18 + %37 = insertelement <2 x i64> poison, i64 %21, i64 0, !dbg !19 + %38 = shufflevector <2 x i64> %37, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !19 + %39 = insertelement <2 x i64> poison, i64 %25, i64 0, !dbg !20 + %40 = shufflevector <2 x i64> %39, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !20 + %41 = insertelement <2 x i1> poison, i1 %28, i64 0, !dbg !21 + %42 = shufflevector <2 x i1> %41, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !21 + %43 = insertelement <2 x i64> poison, i64 %31, i64 0, !dbg !22 + %44 = shufflevector <2 x i64> %43, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22 + %45 = insertelement <2 x i32> poison, i32 %23, i64 0, !dbg !23 + %46 = shufflevector <2 x i32> %45, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !23 + %47 = insertelement <2 x i64> poison, i64 %29, i64 0, !dbg !24 + %48 = shufflevector <2 x i64> %47, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !24 + br label %49, !dbg !17 + +49: ; preds = %10, %49 + %indvars.iv = phi i64 [ 0, %10 ], [ %indvars.iv.next, %49 ] + %50 = phi <2 x i64> [ zeroinitializer, %10 ], [ %86, %49 ] + %51 = or disjoint i64 %indvars.iv, %33, !dbg !25 + %52 = or disjoint i64 %indvars.iv, %34, !dbg !25 + %53 = lshr exact i64 %51, 7, !dbg !26 + %54 = lshr i64 %52, 7, !dbg !26 + %55 = trunc nuw nsw i64 %54 to i32, !dbg !26 + %56 = or i32 %55, 4, !dbg !26 + %57 = zext nneg i32 %56 to i64, !dbg !19 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %59 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %58, i1 %27) #4, !dbg !27 + %60 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %61 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %60, i1 %27) #4, !dbg !27 + %62 = trunc nuw nsw i64 %53 to i32, !dbg !23 + %63 = insertelement <2 x i64> poison, i64 %53, i64 0, !dbg !19 + %64 = insertelement <2 x i64> %63, i64 %57, i64 1, !dbg !19 + %65 = or disjoint <2 x i64> %38, %64, !dbg !19 + %66 = icmp sge <2 x i64> %65, %40, !dbg !20 + %67 = insertelement <2 x i64> poison, i64 %59, i64 0, !dbg !28 + %68 = insertelement <2 x i64> %67, i64 %61, i64 1, !dbg !28 + %69 = icmp slt <2 x i64> %40, %68, !dbg !28 + %70 = icmp slt <2 x i64> %65, %68, !dbg !29 + %71 = and <2 x i1> %69, %70, !dbg !30 + %72 = and <2 x i1> %66, %71, !dbg !31 + %73 = icmp slt <2 x i64> %48, %68, !dbg !24 + %74 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !23 + %75 = insertelement <2 x i32> %74, i32 %56, i64 1, !dbg !23 + %76 = sub nsw <2 x i32> %46, %75, !dbg !23 + %77 = zext <2 x i32> %76 to <2 x i64>, !dbg !23 + %78 = add nsw <2 x i64> %44, %77, !dbg !22 + %79 = and <2 x i64> %78, splat (i64 2047), !dbg !32 + %80 = icmp eq <2 x i64> %79, zeroinitializer, !dbg !33 + %81 = and <2 x i1> %80, %73, !dbg !21 + %82 = and <2 x i1> %42, %81, !dbg !21 + %83 = or <2 x i1> %72, %82, !dbg !34 + %84 = select <2 x i1> %36, <2 x i1> %83, <2 x i1> zeroinitializer, !dbg !18 + %85 = zext <2 x i1> %84 to <2 x i64>, !dbg !18 + %86 = add <2 x i64> %50, %85, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1024, !dbg !17 + %87 = icmp samesign ult i64 %indvars.iv, 15360, !dbg !17 + br i1 %87, label %49, label %88, !dbg !17 + +88: ; preds = %49 + %89 = and i32 %13, 31, !dbg !9 + %90 = lshr i32 %13, 5, !dbg !9 + %shift = shufflevector <2 x i64> %86, <2 x i64> poison, <2 x i32> , !dbg !35 + %foldExtExtBinop = add <2 x i64> %86, %shift, !dbg !35 + %91 = extractelement <2 x i64> %foldExtExtBinop, i64 0, !dbg !35 + %92 = bitcast <2 x i64> %foldExtExtBinop to <4 x i32>, !dbg !39 + %93 = extractelement <4 x i32> %92, i64 1, !dbg !39 + %94 = trunc i64 %91 to i32, !dbg !39 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 16, i32 31), !dbg !39 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 16, i32 31), !dbg !39 + %97 = insertelement <2 x i32> poison, i32 %95, i64 0, !dbg !39 + %98 = insertelement <2 x i32> %97, i32 %96, i64 1, !dbg !39 + %99 = bitcast <2 x i32> %98 to i64, !dbg !39 + %100 = add i64 %91, %99, !dbg !35 + %extelt.offset1 = lshr i64 %100, 32, !dbg !39 + %101 = trunc nuw i64 %extelt.offset1 to i32, !dbg !39 + %102 = trunc i64 %100 to i32, !dbg !39 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !39 + %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 8, i32 31), !dbg !39 + %105 = insertelement <2 x i32> poison, i32 %103, i64 0, !dbg !39 + %106 = insertelement <2 x i32> %105, i32 %104, i64 1, !dbg !39 + %107 = bitcast <2 x i32> %106 to i64, !dbg !39 + %108 = add i64 %100, %107, !dbg !35 + %extelt.offset2 = lshr i64 %108, 32, !dbg !39 + %109 = trunc nuw i64 %extelt.offset2 to i32, !dbg !39 + %110 = trunc i64 %108 to i32, !dbg !39 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !39 + %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !39 + %113 = insertelement <2 x i32> poison, i32 %111, i64 0, !dbg !39 + %114 = insertelement <2 x i32> %113, i32 %112, i64 1, !dbg !39 + %115 = bitcast <2 x i32> %114 to i64, !dbg !39 + %116 = add i64 %108, %115, !dbg !35 + %extelt.offset3 = lshr i64 %116, 32, !dbg !39 + %117 = trunc nuw i64 %extelt.offset3 to i32, !dbg !39 + %118 = trunc i64 %116 to i32, !dbg !39 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 2, i32 31), !dbg !39 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !39 + %121 = insertelement <2 x i32> poison, i32 %119, i64 0, !dbg !39 + %122 = insertelement <2 x i32> %121, i32 %120, i64 1, !dbg !39 + %123 = bitcast <2 x i32> %122 to i64, !dbg !39 + %124 = add i64 %116, %123, !dbg !35 + %extelt.offset4 = lshr i64 %124, 32, !dbg !39 + %125 = trunc nuw i64 %extelt.offset4 to i32, !dbg !39 + %126 = trunc i64 %124 to i32, !dbg !39 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !39 + %128 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !39 + %129 = insertelement <2 x i32> poison, i32 %127, i64 0, !dbg !39 + %130 = insertelement <2 x i32> %129, i32 %128, i64 1, !dbg !39 + %131 = bitcast <2 x i32> %130 to i64, !dbg !39 + %132 = add i64 %124, %131, !dbg !35 + %133 = and i32 %90, 15, !dbg !39 + %134 = icmp eq i32 %89, 0, !dbg !39 + %135 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %133, !dbg !39 + %136 = insertelement <1 x i64> poison, i64 %132, i64 0, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %135, <1 x i64> %136, i1 %134) #4, !dbg !39 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !39 + %137 = icmp samesign ult i32 %13, 16, !dbg !39 + %138 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %13, !dbg !39 + %139 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %138, i1 %137) #4, !dbg !39 + %extelt.offset5 = lshr i64 %139, 32, !dbg !39 + %140 = trunc nuw i64 %extelt.offset5 to i32, !dbg !39 + %141 = trunc i64 %139 to i32, !dbg !39 + %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 8, i32 31), !dbg !39 + %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !39 + %144 = insertelement <2 x i32> poison, i32 %142, i64 0, !dbg !39 + %145 = insertelement <2 x i32> %144, i32 %143, i64 1, !dbg !39 + %146 = bitcast <2 x i32> %145 to i64, !dbg !39 + %147 = add i64 %139, %146, !dbg !35 + %extelt.offset6 = lshr i64 %147, 32, !dbg !39 + %148 = trunc nuw i64 %extelt.offset6 to i32, !dbg !39 + %149 = trunc i64 %147 to i32, !dbg !39 + %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 4, i32 31), !dbg !39 + %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 4, i32 31), !dbg !39 + %152 = insertelement <2 x i32> poison, i32 %150, i64 0, !dbg !39 + %153 = insertelement <2 x i32> %152, i32 %151, i64 1, !dbg !39 + %154 = bitcast <2 x i32> %153 to i64, !dbg !39 + %155 = add i64 %147, %154, !dbg !35 + %extelt.offset7 = lshr i64 %155, 32, !dbg !39 + %156 = trunc nuw i64 %extelt.offset7 to i32, !dbg !39 + %157 = trunc i64 %155 to i32, !dbg !39 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 2, i32 31), !dbg !39 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 2, i32 31), !dbg !39 + %160 = insertelement <2 x i32> poison, i32 %158, i64 0, !dbg !39 + %161 = insertelement <2 x i32> %160, i32 %159, i64 1, !dbg !39 + %162 = bitcast <2 x i32> %161 to i64, !dbg !39 + %163 = add i64 %155, %162, !dbg !35 + %extelt.offset8 = lshr i64 %163, 32, !dbg !39 + %164 = trunc nuw i64 %extelt.offset8 to i32, !dbg !39 + %165 = trunc i64 %163 to i32, !dbg !39 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !39 + %167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !39 + %168 = insertelement <2 x i32> poison, i32 %166, i64 0, !dbg !39 + %169 = insertelement <2 x i32> %168, i32 %167, i64 1, !dbg !39 + %170 = bitcast <2 x i32> %169 to i64, !dbg !39 + %171 = add i64 %163, %170, !dbg !35 + %172 = icmp eq i32 %13, 0, !dbg !39 + %173 = insertelement <1 x i64> poison, i64 %171, i64 0, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %138, <1 x i64> %173, i1 %172) #4, !dbg !39 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !39 + %174 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !39 + %175 = add i64 %174, -1, !dbg !40 + %176 = icmp ult i64 %175, 16383, !dbg !40 + %177 = zext i1 %176 to i32, !dbg !41 + %178 = icmp eq i64 %174, 16384, !dbg !42 + %179 = zext i1 %178 to i32, !dbg !41 + %180 = getelementptr i32, ptr addrspace(1) %1, i64 %15, !dbg !43 + %181 = and i32 %13, 511, !dbg !44 + %182 = icmp eq i32 %181, 0, !dbg !44 + %183 = and i1 %182, %12, !dbg !44 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %177, ptr addrspace(1) %180, i1 %183) #4, !dbg !44 + %184 = getelementptr i32, ptr addrspace(1) %2, i64 %15, !dbg !45 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %179, ptr addrspace(1) %184, i1 %183) #4, !dbg !46 + ret void, !dbg !47 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 21, scope: !4) +!12 = !DILocation(line: 28, column: 28, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 45, column: 34, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 76, column: 50, scope: !4) +!19 = !DILocation(line: 42, column: 22, scope: !4) +!20 = !DILocation(line: 44, column: 23, scope: !4) +!21 = !DILocation(line: 69, column: 24, scope: !4) +!22 = !DILocation(line: 57, column: 51, scope: !4) +!23 = !DILocation(line: 57, column: 38, scope: !4) +!24 = !DILocation(line: 55, column: 24, scope: !4) +!25 = !DILocation(line: 33, column: 31, scope: !4) +!26 = !DILocation(line: 38, column: 27, scope: !4) +!27 = !DILocation(line: 45, column: 76, scope: !4) +!28 = !DILocation(line: 46, column: 22, scope: !4) +!29 = !DILocation(line: 47, column: 22, scope: !4) +!30 = !DILocation(line: 48, column: 22, scope: !4) +!31 = !DILocation(line: 49, column: 23, scope: !4) +!32 = !DILocation(line: 66, column: 39, scope: !4) +!33 = !DILocation(line: 68, column: 25, scope: !4) +!34 = !DILocation(line: 70, column: 24, scope: !4) +!35 = !DILocation(line: 261, column: 15, scope: !36, inlinedAt: !38) +!36 = distinct !DILexicalBlockFile(scope: !4, file: !37, discriminator: 0) +!37 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!38 = !DILocation(line: 77, column: 27, scope: !4) +!39 = !DILocation(line: 291, column: 36, scope: !36, inlinedAt: !38) +!40 = !DILocation(line: 82, column: 20, scope: !4) +!41 = !DILocation(line: 0, scope: !4) +!42 = !DILocation(line: 85, column: 21, scope: !4) +!43 = !DILocation(line: 88, column: 25, scope: !4) +!44 = !DILocation(line: 88, column: 37, scope: !4) +!45 = !DILocation(line: 89, column: 25, scope: !4) +!46 = !DILocation(line: 89, column: 37, scope: !4) +!47 = !DILocation(line: 89, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..433c09e525ea74a5677d091e52da49387580bb14 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,678 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9 +) +.reqntid 512 +{ + .reg .pred %p<42>; + .reg .b32 %r<78>; + .reg .b64 %rd<128>; + .loc 1 18 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:18:0 + +// %bb.0: + ld.param.b64 %rd33, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; +$L__tmp0: + .loc 1 22 28 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:22:28 + mov.u32 %r8, %ctaid.x; + .loc 1 27 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:27:19 + cvt.u64.u32 %rd1, %r8; + ld.param.b64 %rd34, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + .loc 1 28 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:28:21 + and.b64 %rd35, %rd34, -4294967296; + setp.ne.b64 %p6, %rd35, 0; + cvt.u32.u64 %r76, %rd1; + @%p6 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd122, %rd1, %rd34; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd34; + div.u32 %r11, %r76, %r9; + cvt.u64.u32 %rd122, %r11; +$L__BB0_3: + .loc 1 0 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0:21 + ld.param.b32 %r7, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd32, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; + ld.param.b64 %rd29, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + mov.u32 %r1, %tid.x; + .loc 1 27 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:27:19 + mul.lo.s64 %rd36, %rd122, %rd34; + sub.s64 %rd6, %rd1, %rd36; + .loc 1 28 28 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:28:28 + cvt.u32.u64 %r12, %rd122; + shr.s32 %r13, %r12, 31; + shr.u32 %r14, %r13, 28; + add.s32 %r15, %r12, %r14; + and.b32 %r16, %r15, -16; + sub.s32 %r17, %r12, %r16; + cvt.s64.s32 %rd7, %r17; + .loc 1 29 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:29:19 + and.b64 %rd37, %rd33, -4294967296; + setp.ne.b64 %p7, %rd37, 0; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd123, %rd1, %rd33; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r18, %rd33; + div.u32 %r20, %r76, %r18; + cvt.u64.u32 %rd123, %r20; +$L__BB0_6: + .loc 1 0 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0:19 + ld.param.b64 %rd31, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd30, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:24:21 + setp.lt.s32 %p8, %r76, %r7; + .loc 1 39 26 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:39:26 + shl.b64 %rd41, %rd6, 7; + .loc 1 42 26 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:26 + shl.b64 %rd12, %rd7, 7; + .loc 1 45 34 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:45:34 + shl.b64 %rd42, %rd123, 3; + add.s64 %rd51, %rd29, %rd42; + and.b32 %r2, %r1, 127; + cvt.u64.u32 %rd43, %r2; + or.b64 %rd15, %rd41, %rd43; + setp.lt.s64 %p9, %rd15, %rd32; + and.pred %p10, %p8, %p9; + setp.gt.u64 %p4, %rd6, 15; + and.b64 %rd18, %rd15, 2047; + sub.s64 %rd45, %rd6, %rd7; + shl.b64 %rd16, %rd45, 7; + .loc 1 32 40 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:32:40 + cvt.u64.u32 %rd46, %r1; + shr.u64 %rd47, %rd46, 7; + cvt.u32.u64 %r77, %rd47; + shr.u32 %r22, %r1, 7; + cvt.u64.u32 %rd48, %r22; + and.b64 %rd124, %rd48, 3; + mov.b64 %rd126, 0; + mov.b64 %rd125, -1024; + mov.b64 %rd127, %rd126; +$L__BB0_7: // =>This Inner Loop Header: Depth=1 + .loc 1 38 27 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:38:27 + or.b32 %r23, %r77, 4; + .loc 1 42 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:22 + cvt.u64.u32 %rd57, %r23; + .loc 1 45 76 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:45:76 + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd50 }, [ %rd51 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd54, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd54 }, [ %rd51 + 0 ], %rd53; + // end inline asm + .loc 1 42 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:22 + or.b64 %rd58, %rd12, %rd57; + or.b64 %rd59, %rd12, %rd124; + .loc 1 44 23 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:44:23 + setp.ge.s64 %p12, %rd59, %rd15; + setp.ge.s64 %p13, %rd58, %rd15; + .loc 1 48 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:48:22 + max.s64 %rd60, %rd15, %rd59; + setp.lt.s64 %p14, %rd60, %rd50; + max.s64 %rd61, %rd15, %rd58; + setp.lt.s64 %p15, %rd61, %rd54; + .loc 1 49 23 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:49:23 + and.pred %p16, %p13, %p15; + and.pred %p17, %p12, %p14; + .loc 1 55 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:55:24 + setp.lt.s64 %p18, %rd18, %rd54; + setp.lt.s64 %p19, %rd18, %rd50; + cvt.u32.u64 %r24, %rd124; + .loc 1 57 38 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:57:38 + sub.s32 %r25, %r2, %r23; + sub.s32 %r26, %r2, %r24; + cvt.u64.u32 %rd62, %r26; + cvt.u64.u32 %rd63, %r25; + .loc 1 57 51 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:57:51 + add.s64 %rd64, %rd16, %rd63; + add.s64 %rd65, %rd16, %rd62; + .loc 1 66 39 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:66:39 + and.b64 %rd66, %rd65, 2047; + and.b64 %rd67, %rd64, 2047; + .loc 1 68 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:68:25 + setp.eq.b64 %p20, %rd67, 0; + setp.eq.b64 %p21, %rd66, 0; + .loc 1 69 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:69:24 + and.pred %p22, %p21, %p19; + and.pred %p23, %p20, %p18; + and.pred %p24, %p4, %p23; + and.pred %p25, %p4, %p22; + .loc 1 70 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:70:24 + or.pred %p26, %p17, %p25; + or.pred %p28, %p16, %p24; + .loc 1 76 50 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:76:50 + and.pred %p30, %p10, %p28; + and.pred %p31, %p10, %p26; + selp.b64 %rd68, 1, 0, %p31; + selp.b64 %rd69, 1, 0, %p30; + add.s64 %rd127, %rd127, %rd69; + add.s64 %rd126, %rd126, %rd68; + .loc 1 32 40 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:32:40 + add.s64 %rd125, %rd125, 1024; + add.s32 %r77, %r77, 8; + add.s64 %rd124, %rd124, 8; + setp.lt.u64 %p32, %rd125, 15360; + @%p32 bra $L__BB0_7; +// %bb.8: + .loc 1 25 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:25:37 + and.b32 %r33, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd75, %rd126, %rd127; + mov.b64 {_, %r34}, %rd75; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + cvt.u32.u64 %r35, %rd75; + shfl.sync.bfly.b32 %r36, %r35, 16, 31, -1; + shfl.sync.bfly.b32 %r37, %r34, 16, 31, -1; + cvt.u64.u32 %rd76, %r36; + cvt.u64.u32 %rd77, %r37; + shl.b64 %rd78, %rd77, 32; + or.b64 %rd79, %rd76, %rd78; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd80, %rd75, %rd79; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r38}, %rd80; + cvt.u32.u64 %r39, %rd80; + shfl.sync.bfly.b32 %r40, %r39, 8, 31, -1; + shfl.sync.bfly.b32 %r41, %r38, 8, 31, -1; + cvt.u64.u32 %rd81, %r40; + cvt.u64.u32 %rd82, %r41; + shl.b64 %rd83, %rd82, 32; + or.b64 %rd84, %rd81, %rd83; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd85, %rd80, %rd84; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r42}, %rd85; + cvt.u32.u64 %r43, %rd85; + shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1; + shfl.sync.bfly.b32 %r45, %r42, 4, 31, -1; + cvt.u64.u32 %rd86, %r44; + cvt.u64.u32 %rd87, %r45; + shl.b64 %rd88, %rd87, 32; + or.b64 %rd89, %rd86, %rd88; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd90, %rd85, %rd89; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r46}, %rd90; + cvt.u32.u64 %r47, %rd90; + shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1; + shfl.sync.bfly.b32 %r49, %r46, 2, 31, -1; + cvt.u64.u32 %rd91, %r48; + cvt.u64.u32 %rd92, %r49; + shl.b64 %rd93, %rd92, 32; + or.b64 %rd94, %rd91, %rd93; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd95, %rd90, %rd94; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r50}, %rd95; + cvt.u32.u64 %r51, %rd95; + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + shfl.sync.bfly.b32 %r53, %r50, 1, 31, -1; + cvt.u64.u32 %rd96, %r52; + cvt.u64.u32 %rd97, %r53; + shl.b64 %rd98, %rd97, 32; + or.b64 %rd99, %rd96, %rd98; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd70, %rd95, %rd99; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + setp.eq.b32 %p33, %r33, 0; + shr.u32 %r54, %r1, 2; + and.b32 %r55, %r54, 120; + mov.b32 %r56, global_smem; + add.s32 %r27, %r56, %r55; + // begin inline asm + @%p33 st.shared.b64 [ %r27 + 0 ], %rd70; + // end inline asm + bar.sync 0; + setp.lt.u32 %p34, %r1, 16; + shl.b32 %r57, %r1, 3; + add.s32 %r28, %r56, %r57; + // begin inline asm + @%p34 ld.shared.b64 %rd71, [ %r28 + 0 ]; + // end inline asm + mov.b64 {_, %r58}, %rd71; + cvt.u32.u64 %r59, %rd71; + shfl.sync.bfly.b32 %r60, %r59, 8, 31, -1; + shfl.sync.bfly.b32 %r61, %r58, 8, 31, -1; + cvt.u64.u32 %rd100, %r60; + cvt.u64.u32 %rd101, %r61; + shl.b64 %rd102, %rd101, 32; + or.b64 %rd103, %rd100, %rd102; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd104, %rd71, %rd103; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r62}, %rd104; + cvt.u32.u64 %r63, %rd104; + shfl.sync.bfly.b32 %r64, %r63, 4, 31, -1; + shfl.sync.bfly.b32 %r65, %r62, 4, 31, -1; + cvt.u64.u32 %rd105, %r64; + cvt.u64.u32 %rd106, %r65; + shl.b64 %rd107, %rd106, 32; + or.b64 %rd108, %rd105, %rd107; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd109, %rd104, %rd108; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r66}, %rd109; + cvt.u32.u64 %r67, %rd109; + shfl.sync.bfly.b32 %r68, %r67, 2, 31, -1; + shfl.sync.bfly.b32 %r69, %r66, 2, 31, -1; + cvt.u64.u32 %rd110, %r68; + cvt.u64.u32 %rd111, %r69; + shl.b64 %rd112, %rd111, 32; + or.b64 %rd113, %rd110, %rd112; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd114, %rd109, %rd113; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r70}, %rd114; + cvt.u32.u64 %r71, %rd114; + shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1; + shfl.sync.bfly.b32 %r73, %r70, 1, 31, -1; + cvt.u64.u32 %rd115, %r72; + cvt.u64.u32 %rd116, %r73; + shl.b64 %rd117, %rd116, 32; + or.b64 %rd118, %rd115, %rd117; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd72, %rd114, %rd118; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + setp.eq.b32 %p35, %r1, 0; + // begin inline asm + @%p35 st.shared.b64 [ %r28 + 0 ], %rd72; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd119, [global_smem]; +$L__tmp2: + .loc 1 82 20 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:82:20 + add.s64 %rd120, %rd119, -1; + setp.lt.u64 %p39, %rd120, 16383; + .loc 1 0 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0 + selp.b32 %r30, 1, 0, %p39; + .loc 1 85 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:85:21 + setp.eq.b64 %p40, %rd119, 16384; + .loc 1 0 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0 + selp.b32 %r31, 1, 0, %p40; + .loc 1 88 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:88:25 + shl.b64 %rd121, %rd1, 2; + add.s64 %rd73, %rd30, %rd121; + .loc 1 88 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:88:37 + and.b32 %r74, %r1, 511; + setp.eq.b32 %p41, %r74, 0; + and.pred %p36, %p41, %p8; + // begin inline asm + @%p36 st.global.b32 [ %rd73 + 0 ], { %r30 }; + // end inline asm + .loc 1 89 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:25 + add.s64 %rd74, %rd31, %rd121; + .loc 1 89 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:37 + // begin inline asm + @%p36 st.global.b32 [ %rd74 + 0 ], { %r31 }; + // end inline asm + .loc 1 89 4 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 105 +.b8 103 +.b8 101 +.b8 121 +.b8 110 +.b8 107 +.b8 109 +.b8 97 +.b8 109 +.b8 105 +.b8 114 +.b8 122 +.b8 114 +.b8 97 +.b8 53 +.b8 111 +.b8 99 +.b8 100 +.b8 101 +.b8 107 +.b8 52 +.b8 118 +.b8 102 +.b8 101 +.b8 51 +.b8 105 +.b8 100 +.b8 110 +.b8 104 +.b8 50 +.b8 107 +.b8 114 +.b8 50 +.b8 98 +.b8 102 +.b8 115 +.b8 99 +.b8 98 +.b8 120 +.b8 116 +.b8 105 +.b8 105 +.b8 109 +.b8 51 +.b8 114 +.b8 113 +.b8 53 +.b8 100 +.b8 102 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 98 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 77 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..43c58e55d3f88939292b1ae8b4066b1ea9853217 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,395 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc91 = loc(unknown) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc98 = loc("in_ptr0"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("ks0"(#loc)) +#loc102 = loc("ks1"(#loc)) +#loc103 = loc("ks2"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc188 = loc("input"(#loc89)) +#loc189 = loc("a"(#loc94)) +#loc190 = loc("b"(#loc94)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc114) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc115) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc115) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc115) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc116) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc116) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc116) + %x1_13 = arith.constant 16 : i32 loc(#loc117) + %x1_14 = arith.constant 16 : i64 loc(#loc117) + %x1_15 = arith.constant dense<16> : tensor<1x1xi64> loc(#loc117) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1x1xi64> loc(#loc117) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc118) + %x2_17 = tt.splat %ks2 : i64 -> tensor<1x1xi64> loc(#loc118) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<1x1xi64> loc(#loc118) + %_tmp36 = arith.constant 0 : i64 loc(#loc119) + %_tmp36_19 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc119) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c1024_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp36_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp36_24 = %_tmp36_19) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc121) + %r0_index_25 = arith.addi %r0_index, %r0_base_8 : tensor<1x1024xi32> loc(#loc121) + %r0_mask = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc122) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x1024xi32> loc(#loc122) + %r0_3 = arith.constant 128 : i32 loc(#loc123) + %r0_3_27 = arith.constant 128 : i32 loc(#loc123) + %r0_3_28 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc123) + %r0_3_29 = arith.remsi %r0_index_25, %r0_3_28 : tensor<1x1024xi32> loc(#loc123) + %r0_4 = arith.constant 128 : i32 loc(#loc124) + %r0_4_30 = arith.constant 128 : i32 loc(#loc124) + %r0_4_31 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc124) + %r0_4_32 = arith.divsi %r0_index_25, %r0_4_31 : tensor<1x1024xi32> loc(#loc124) + %tmp0 = arith.constant 128 : i32 loc(#loc125) + %tmp0_33 = arith.constant 128 : i64 loc(#loc125) + %tmp0_34 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc125) + %tmp0_35 = arith.muli %tmp0_34, %x0_10 : tensor<1x1xi64> loc(#loc125) + %tmp0_36 = arith.extsi %r0_3_29 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc126) + %tmp0_37 = tt.broadcast %tmp0_35 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc126) + %tmp0_38 = arith.addi %tmp0_36, %tmp0_37 : tensor<1x1024xi64> loc(#loc126) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x1024xi64> loc(#loc127) + %tmp2_39 = arith.cmpi slt, %tmp0_38, %tmp2 : tensor<1x1024xi64> loc(#loc127) + %tmp3 = arith.constant 128 : i32 loc(#loc128) + %tmp3_40 = arith.constant 128 : i64 loc(#loc128) + %tmp3_41 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc128) + %tmp3_42 = arith.muli %tmp3_41, %x1_16 : tensor<1x1xi64> loc(#loc128) + %tmp3_43 = arith.extsi %r0_4_32 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc129) + %tmp3_44 = tt.broadcast %tmp3_42 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc129) + %tmp3_45 = arith.addi %tmp3_43, %tmp3_44 : tensor<1x1024xi64> loc(#loc129) + %tmp4 = arith.constant 128 : i32 loc(#loc130) + %tmp4_46 = arith.constant 128 : i64 loc(#loc130) + %tmp4_47 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc130) + %tmp4_48 = arith.muli %tmp4_47, %x0_10 : tensor<1x1xi64> loc(#loc130) + %tmp4_49 = arith.extsi %r0_3_29 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc131) + %tmp4_50 = tt.broadcast %tmp4_48 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc131) + %tmp4_51 = arith.addi %tmp4_49, %tmp4_50 : tensor<1x1024xi64> loc(#loc131) + %tmp5 = arith.cmpi sge, %tmp3_45, %tmp4_51 : tensor<1x1024xi64> loc(#loc132) + %tmp6 = tt.broadcast %x2_18 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc133) + %tmp6_52 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc134) + %tmp6_53 = tt.addptr %tmp6_52, %tmp6 : tensor<1x1024x!tt.ptr>, tensor<1x1024xi64> loc(#loc134) + %tmp6_54 = arith.andi %r0_mask_26, %tmp2_39 : tensor<1x1024xi1> loc(#loc135) + %tmp6_55 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc136) + %tmp6_56 = arith.andi %tmp6_54, %tmp6_55 : tensor<1x1024xi1> loc(#loc136) + %tmp6_57 = arith.constant 0.000000e+00 : f32 loc(#loc137) + %tmp6_58 = arith.constant dense<0.000000e+00> : tensor<1x1024xf32> loc(#loc137) + %tmp6_59 = arith.fptosi %tmp6_58 : tensor<1x1024xf32> to tensor<1x1024xi64> loc(#loc137) + %tmp6_60 = tt.load %tmp6_53, %tmp6_56, %tmp6_59 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc137) + %tmp7 = arith.cmpi slt, %tmp4_51, %tmp6_60 : tensor<1x1024xi64> loc(#loc138) + %tmp8 = arith.cmpi slt, %tmp3_45, %tmp6_60 : tensor<1x1024xi64> loc(#loc139) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x1024xi1> loc(#loc140) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x1024xi1> loc(#loc141) + %tmp11 = arith.constant false loc(#loc142) + %tmp11_61 = arith.constant dense : tensor<1x1xi1> loc(#loc142) + %tmp12 = arith.constant dense : tensor<1x1024xi1> loc(#loc143) + %tmp12_62 = arith.ori %tmp12, %tmp10 : tensor<1x1024xi1> loc(#loc143) + %tmp13 = arith.constant 2048 : i64 loc(#loc144) + %tmp13_63 = arith.constant dense<2048> : tensor<1x1xi64> loc(#loc144) + %tmp14 = arith.constant dense<2048> : tensor<1x1024xi64> loc(#loc145) + %tmp14_64 = arith.cmpi sge, %tmp4_51, %tmp14 : tensor<1x1024xi64> loc(#loc145) + %tmp15 = arith.constant 128 : i32 loc(#loc146) + %tmp15_65 = arith.constant 128 : i64 loc(#loc146) + %tmp15_66 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc146) + %tmp15_67 = arith.muli %tmp15_66, %x0_10 : tensor<1x1xi64> loc(#loc146) + %tmp15_68 = arith.extsi %r0_3_29 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc147) + %tmp15_69 = tt.broadcast %tmp15_67 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc147) + %tmp15_70 = arith.addi %tmp15_68, %tmp15_69 : tensor<1x1024xi64> loc(#loc147) + %tmp15_71 = arith.constant 2048 : i32 loc(#loc148) + %tmp15_72 = arith.constant 2048 : i64 loc(#loc148) + %tmp15_73 = arith.constant dense<2048> : tensor<1x1024xi64> loc(#loc148) + %tmp15_74 = arith.remsi %tmp15_70, %tmp15_73 : tensor<1x1024xi64> loc(#loc148) + %tmp16 = arith.cmpi slt, %tmp15_74, %tmp6_60 : tensor<1x1024xi64> loc(#loc149) + %tmp17 = arith.andi %tmp14_64, %tmp16 : tensor<1x1024xi1> loc(#loc150) + %tmp18 = arith.constant -1 : i32 loc(#loc151) + %tmp18_75 = arith.constant -1 : i32 loc(#loc151) + %tmp18_76 = arith.constant dense<-1> : tensor<1x1024xi32> loc(#loc151) + %tmp18_77 = arith.muli %tmp18_76, %r0_4_32 : tensor<1x1024xi32> loc(#loc151) + %tmp18_78 = arith.addi %r0_3_29, %tmp18_77 : tensor<1x1024xi32> loc(#loc152) + %tmp18_79 = arith.constant -128 : i32 loc(#loc153) + %tmp18_80 = arith.constant -128 : i64 loc(#loc153) + %tmp18_81 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc153) + %tmp18_82 = arith.muli %tmp18_81, %x1_16 : tensor<1x1xi64> loc(#loc153) + %tmp18_83 = arith.extsi %tmp18_78 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc154) + %tmp18_84 = tt.broadcast %tmp18_82 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc154) + %tmp18_85 = arith.addi %tmp18_83, %tmp18_84 : tensor<1x1024xi64> loc(#loc154) + %tmp18_86 = arith.constant 128 : i32 loc(#loc155) + %tmp18_87 = arith.constant 128 : i64 loc(#loc155) + %tmp18_88 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc155) + %tmp18_89 = arith.muli %tmp18_88, %x0_10 : tensor<1x1xi64> loc(#loc155) + %tmp18_90 = tt.broadcast %tmp18_89 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc156) + %tmp18_91 = arith.addi %tmp18_85, %tmp18_90 : tensor<1x1024xi64> loc(#loc156) + %tmp19 = arith.constant dense<2048> : tensor<1x1024xi64> loc(#loc157) + %tmp19_92 = arith.remsi %tmp18_91, %tmp19 : tensor<1x1024xi64> loc(#loc157) + %tmp20 = arith.constant 0 : i32 loc(#loc158) + %tmp20_93 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc158) + %tmp21 = arith.extsi %tmp20_93 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc159) + %tmp21_94 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x1024xi64> loc(#loc159) + %tmp21_95 = arith.cmpi ne, %tmp19_92, %tmp21_94 : tensor<1x1024xi64> loc(#loc159) + %tmp22 = arith.constant 0 : i32 loc(#loc160) + %tmp22_96 = arith.extsi %tmp22 : i32 to i64 loc(#loc160) + %tmp22_97 = tt.splat %tmp22_96 : i64 -> tensor<1x1024xi64> loc(#loc160) + %tmp22_98 = arith.cmpi slt, %tmp19_92, %tmp22_97 : tensor<1x1024xi64> loc(#loc160) + %tmp23 = arith.constant 0 : i32 loc(#loc161) + %tmp23_99 = arith.extsi %tmp23 : i32 to i64 loc(#loc161) + %tmp23_100 = tt.splat %tmp23_99 : i64 -> tensor<1x1xi64> loc(#loc161) + %tmp23_101 = arith.cmpi slt, %tmp13_63, %tmp23_100 : tensor<1x1xi64> loc(#loc161) + %tmp24 = tt.broadcast %tmp23_101 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc162) + %tmp24_102 = arith.cmpi ne, %tmp22_98, %tmp24 : tensor<1x1024xi1> loc(#loc162) + %tmp25 = arith.andi %tmp21_95, %tmp24_102 : tensor<1x1024xi1> loc(#loc163) + %tmp26 = arith.constant dense<2048> : tensor<1x1024xi64> loc(#loc164) + %tmp26_103 = arith.addi %tmp19_92, %tmp26 : tensor<1x1024xi64> loc(#loc164) + %tmp27 = arith.select %tmp25, %tmp26_103, %tmp19_92 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc165) + %tmp28 = arith.constant 0 : i64 loc(#loc166) + %tmp28_104 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc166) + %tmp29 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc167) + %tmp29_105 = arith.cmpi eq, %tmp27, %tmp29 : tensor<1x1024xi64> loc(#loc167) + %tmp30 = arith.andi %tmp17, %tmp29_105 : tensor<1x1024xi1> loc(#loc168) + %tmp31 = arith.ori %tmp12_62, %tmp30 : tensor<1x1024xi1> loc(#loc169) + %tmp32 = arith.constant false loc(#loc170) + %tmp32_106 = arith.constant dense : tensor<1x1024xi1> loc(#loc170) + %tmp33 = arith.select %tmp2_39, %tmp31, %tmp32_106 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc171) + %tmp34 = arith.extui %tmp33 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc172) + %tmp37 = arith.addi %_tmp36_24, %tmp34 : tensor<1x1024xi64> loc(#loc173) + %_tmp36_107 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x1024xi1> loc(#loc174) + %_tmp36_108 = arith.andi %r0_mask_26, %_tmp36_107 : tensor<1x1024xi1> loc(#loc174) + %_tmp36_109 = arith.select %_tmp36_108, %tmp37, %_tmp36_24 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc175) + scf.yield %_tmp36_109 : tensor<1x1024xi64> loc(#loc71) + } loc(#loc120) + %tmp36 = tt.call @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp36_20) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc176) + %tmp36_21 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc177) + %tmp38 = arith.constant 0 : i64 loc(#loc178) + %tmp38_22 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc178) + %tmp39 = arith.cmpi sgt, %tmp36_21, %tmp38_22 : tensor<1x1xi64> loc(#loc179) + %tmp40 = arith.constant 16384 : i64 loc(#loc180) + %tmp40_23 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc180) + %tmp41 = arith.cmpi slt, %tmp36_21, %tmp40_23 : tensor<1x1xi64> loc(#loc181) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1> loc(#loc182) + %tmp43 = arith.extui %tmp42 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc183) + %tmp44 = arith.extsi %tmp43 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc184) + %tmp45 = arith.cmpi eq, %tmp36_21, %tmp40_23 : tensor<1x1xi64> loc(#loc185) + %tmp46 = arith.extui %tmp45 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc186) + %tmp47 = arith.extsi %tmp46 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc187) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc84) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc84) + tt.store %5, %tmp44, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc85) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc86) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc86) + tt.store %7, %tmp47, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc87) + tt.return loc(#loc88) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_1024S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x1024xi64> loc("input"(#loc89))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc90) + tt.reduce.return %2 : i64 loc(#loc90) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc90) + tt.return %0 : tensor<1xi64> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc93) + tt.return %1 : tensor<1xi64> loc(#loc93) + } loc(#loc89) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc94)), %b: i64 loc("b"(#loc94))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc95) + tt.return %0 : i64 loc(#loc96) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc97) + tt.return %1 : i64 loc(#loc97) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":43:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":43:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":50:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":51:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":52:38) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:55) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":59:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":62:92) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":63:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":67:35) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":71:44) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":78:31) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":80:35) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc106 = loc("r0_numel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("x0"(#loc10)) +#loc116 = loc("x1"(#loc11)) +#loc117 = loc("x1"(#loc12)) +#loc118 = loc("x2"(#loc13)) +#loc119 = loc("_tmp36"(#loc14)) +#loc120 = loc("_tmp36"(#loc15)) +#loc121 = loc("r0_index"(#loc16)) +#loc122 = loc("r0_mask"(#loc17)) +#loc123 = loc("r0_3"(#loc18)) +#loc124 = loc("r0_4"(#loc19)) +#loc125 = loc("tmp0"(#loc20)) +#loc126 = loc("tmp0"(#loc21)) +#loc127 = loc("tmp2"(#loc22)) +#loc128 = loc("tmp3"(#loc23)) +#loc129 = loc("tmp3"(#loc24)) +#loc130 = loc("tmp4"(#loc25)) +#loc131 = loc("tmp4"(#loc26)) +#loc132 = loc("tmp5"(#loc27)) +#loc133 = loc("tmp6"(#loc28)) +#loc134 = loc("tmp6"(#loc29)) +#loc135 = loc("tmp6"(#loc30)) +#loc136 = loc("tmp6"(#loc31)) +#loc137 = loc("tmp6"(#loc32)) +#loc138 = loc("tmp7"(#loc33)) +#loc139 = loc("tmp8"(#loc34)) +#loc140 = loc("tmp9"(#loc35)) +#loc141 = loc("tmp10"(#loc36)) +#loc142 = loc("tmp11"(#loc37)) +#loc143 = loc("tmp12"(#loc38)) +#loc144 = loc("tmp13"(#loc39)) +#loc145 = loc("tmp14"(#loc40)) +#loc146 = loc("tmp15"(#loc41)) +#loc147 = loc("tmp15"(#loc42)) +#loc148 = loc("tmp15"(#loc43)) +#loc149 = loc("tmp16"(#loc44)) +#loc150 = loc("tmp17"(#loc45)) +#loc151 = loc("tmp18"(#loc46)) +#loc152 = loc("tmp18"(#loc47)) +#loc153 = loc("tmp18"(#loc48)) +#loc154 = loc("tmp18"(#loc49)) +#loc155 = loc("tmp18"(#loc50)) +#loc156 = loc("tmp18"(#loc51)) +#loc157 = loc("tmp19"(#loc52)) +#loc158 = loc("tmp20"(#loc53)) +#loc159 = loc("tmp21"(#loc54)) +#loc160 = loc("tmp22"(#loc55)) +#loc161 = loc("tmp23"(#loc56)) +#loc162 = loc("tmp24"(#loc57)) +#loc163 = loc("tmp25"(#loc58)) +#loc164 = loc("tmp26"(#loc59)) +#loc165 = loc("tmp27"(#loc60)) +#loc166 = loc("tmp28"(#loc61)) +#loc167 = loc("tmp29"(#loc62)) +#loc168 = loc("tmp30"(#loc63)) +#loc169 = loc("tmp31"(#loc64)) +#loc170 = loc("tmp32"(#loc65)) +#loc171 = loc("tmp33"(#loc66)) +#loc172 = loc("tmp34"(#loc67)) +#loc173 = loc("tmp37"(#loc68)) +#loc174 = loc("_tmp36"(#loc69)) +#loc175 = loc("_tmp36"(#loc70)) +#loc176 = loc("tmp36"(#loc72)) +#loc177 = loc("tmp36"(#loc73)) +#loc178 = loc("tmp38"(#loc74)) +#loc179 = loc("tmp39"(#loc75)) +#loc180 = loc("tmp40"(#loc76)) +#loc181 = loc("tmp41"(#loc77)) +#loc182 = loc("tmp42"(#loc78)) +#loc183 = loc("tmp43"(#loc79)) +#loc184 = loc("tmp44"(#loc80)) +#loc185 = loc("tmp45"(#loc81)) +#loc186 = loc("tmp46"(#loc82)) +#loc187 = loc("tmp47"(#loc83)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0b0ea5045619a6c2cc1c284b73591acdbd8f9841 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,243 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("out_ptr1"(#loc)) +#loc70 = loc("out_ptr2"(#loc)) +#loc71 = loc("ks0"(#loc)) +#loc72 = loc("ks1"(#loc)) +#loc73 = loc("ks2"(#loc)) +#loc74 = loc("xnumel"(#loc)) +#loc75 = loc("r0_numel"(#loc)) +#loc124 = loc("tmp36"(#loc52)) +#loc139 = loc(callsite(#loc1 at #loc124)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<1x1024xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<1x1024xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x1024xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense : tensor<1x1024xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x1024xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc77) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x1024xi32, #blocked> loc(#loc78) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc79) + %x0_7 = arith.remsi %x0, %ks0 : i64 loc(#loc79) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc80) + %x1_8 = arith.remsi %x1, %c16_i64 : i64 loc(#loc81) + %x2 = arith.divsi %x0, %ks2 : i64 loc(#loc82) + %tmp0 = arith.muli %x0_7, %c128_i64 : i64 loc(#loc83) + %tmp0_9 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc134) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc85) + %tmp3 = arith.muli %x1_8, %c128_i64 : i64 loc(#loc86) + %tmp3_10 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc135) + %tmp6 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc88) + %tmp6_11 = tt.splat %xmask : i1 -> tensor<1x1024xi1, #blocked> loc(#loc136) + %tmp6_12 = tt.splat %tmp6 : !tt.ptr -> tensor<1x1024x!tt.ptr, #blocked> loc(#loc90) + %tmp18 = arith.muli %x1_8, %c-128_i64 : i64 loc(#loc91) + %tmp18_13 = tt.splat %tmp18 : i64 -> tensor<1x1024xi64, #blocked> loc(#loc137) + %_tmp36 = scf.for %_tmp36_15 = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%arg9 = %cst_5) -> (tensor<1x1024xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp36_15 : i32 -> tensor<1x1024xi32, #blocked> loc(#loc94) + %r0_index_16 = arith.addi %r0_index, %r0_base_6 : tensor<1x1024xi32, #blocked> loc(#loc94) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_1 : tensor<1x1024xi32, #blocked> loc(#loc95) + %r0_3 = arith.remsi %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc96) + %r0_4 = arith.divsi %r0_index_16, %cst_0 : tensor<1x1024xi32, #blocked> loc(#loc97) + %tmp0_17 = arith.extsi %r0_3 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc84) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_9 : tensor<1x1024xi64, #blocked> loc(#loc84) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x1024xi64, #blocked> loc(#loc85) + %tmp3_20 = arith.extsi %r0_4 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc87) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_10 : tensor<1x1024xi64, #blocked> loc(#loc87) + %tmp5 = arith.cmpi sge, %tmp3_21, %tmp0_18 : tensor<1x1024xi64, #blocked> loc(#loc98) + %tmp6_22 = arith.andi %r0_mask, %tmp2_19 : tensor<1x1024xi1, #blocked> loc(#loc99) + %tmp6_23 = arith.andi %tmp6_22, %tmp6_11 : tensor<1x1024xi1, #blocked> loc(#loc89) + %tmp6_24 = tt.load %tmp6_12, %tmp6_23, %cst_5 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr, #blocked> loc(#loc90) + %tmp7 = arith.cmpi slt, %tmp0_18, %tmp6_24 : tensor<1x1024xi64, #blocked> loc(#loc100) + %tmp8 = arith.cmpi slt, %tmp3_21, %tmp6_24 : tensor<1x1024xi64, #blocked> loc(#loc101) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x1024xi1, #blocked> loc(#loc102) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x1024xi1, #blocked> loc(#loc103) + %tmp14 = arith.cmpi sge, %tmp0_18, %cst : tensor<1x1024xi64, #blocked> loc(#loc104) + %tmp15 = arith.remsi %tmp0_18, %cst : tensor<1x1024xi64, #blocked> loc(#loc105) + %tmp16 = arith.cmpi slt, %tmp15, %tmp6_24 : tensor<1x1024xi64, #blocked> loc(#loc106) + %tmp17 = arith.andi %tmp14, %tmp16 : tensor<1x1024xi1, #blocked> loc(#loc107) + %tmp18_25 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32, #blocked> loc(#loc108) + %tmp18_26 = arith.extsi %tmp18_25 : tensor<1x1024xi32, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc92) + %tmp18_27 = arith.addi %tmp18_26, %tmp18_13 : tensor<1x1024xi64, #blocked> loc(#loc92) + %tmp18_28 = arith.addi %tmp18_27, %tmp0_9 : tensor<1x1024xi64, #blocked> loc(#loc109) + %tmp19 = arith.remsi %tmp18_28, %cst : tensor<1x1024xi64, #blocked> loc(#loc110) + %tmp21 = arith.cmpi ne, %tmp19, %cst_5 : tensor<1x1024xi64, #blocked> loc(#loc111) + %tmp22 = arith.cmpi slt, %tmp19, %cst_5 : tensor<1x1024xi64, #blocked> loc(#loc112) + %tmp25 = arith.andi %tmp21, %tmp22 : tensor<1x1024xi1, #blocked> loc(#loc113) + %tmp26 = arith.addi %tmp19, %cst : tensor<1x1024xi64, #blocked> loc(#loc114) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc115) + %tmp29 = arith.cmpi eq, %tmp27, %cst_5 : tensor<1x1024xi64, #blocked> loc(#loc116) + %tmp30 = arith.andi %tmp17, %tmp29 : tensor<1x1024xi1, #blocked> loc(#loc117) + %tmp31 = arith.ori %tmp10, %tmp30 : tensor<1x1024xi1, #blocked> loc(#loc118) + %tmp33 = arith.select %tmp2_19, %tmp31, %cst_4 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi1, #blocked> loc(#loc119) + %tmp34 = arith.extui %tmp33 : tensor<1x1024xi1, #blocked> to tensor<1x1024xi64, #blocked> loc(#loc120) + %tmp37 = arith.addi %arg9, %tmp34 : tensor<1x1024xi64, #blocked> loc(#loc121) + %_tmp36_29 = arith.andi %r0_mask, %tmp6_11 : tensor<1x1024xi1, #blocked> loc(#loc122) + %_tmp36_30 = arith.select %_tmp36_29, %tmp37, %arg9 : tensor<1x1024xi1, #blocked>, tensor<1x1024xi64, #blocked> loc(#loc123) + scf.yield %_tmp36_30 : tensor<1x1024xi64, #blocked> loc(#loc50) + } loc(#loc93) + %tmp36 = "tt.reduce"(%_tmp36) <{axis = 1 : i32}> ({ + ^bb0(%tmp36_15: i64 loc(callsite(#loc1 at #loc124)), %tmp36_16: i64 loc(callsite(#loc1 at #loc124))): + %tmp36_17 = arith.addi %tmp36_15, %tmp36_16 : i64 loc(#loc142) + tt.reduce.return %tmp36_17 : i64 loc(#loc138) + }) : (tensor<1x1024xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc138) + %tmp36_14 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc125) + %tmp39 = arith.cmpi sgt, %tmp36_14, %cst_3 : tensor<1x1xi64, #blocked> loc(#loc126) + %tmp41 = arith.cmpi slt, %tmp36_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc127) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1, #blocked> loc(#loc128) + %tmp44 = arith.extui %tmp42 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc140) + %tmp45 = arith.cmpi eq, %tmp36_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc131) + %tmp47 = arith.extui %tmp45 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc141) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc63) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc64) + tt.store %1, %tmp44, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc65) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc66) + tt.store %4, %tmp47, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc66) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xmask"(#loc3)) +#loc78 = loc("r0_base"(#loc4)) +#loc79 = loc("x0"(#loc5)) +#loc80 = loc("x1"(#loc6)) +#loc81 = loc("x1"(#loc7)) +#loc82 = loc("x2"(#loc8)) +#loc83 = loc("tmp0"(#loc9)) +#loc84 = loc("tmp0"(#loc10)) +#loc85 = loc("tmp2"(#loc11)) +#loc86 = loc("tmp3"(#loc12)) +#loc87 = loc("tmp3"(#loc13)) +#loc88 = loc("tmp6"(#loc14)) +#loc89 = loc("tmp6"(#loc15)) +#loc90 = loc("tmp6"(#loc16)) +#loc91 = loc("tmp18"(#loc17)) +#loc92 = loc("tmp18"(#loc18)) +#loc93 = loc("_tmp36"(#loc19)) +#loc94 = loc("r0_index"(#loc20)) +#loc95 = loc("r0_mask"(#loc21)) +#loc96 = loc("r0_3"(#loc22)) +#loc97 = loc("r0_4"(#loc23)) +#loc98 = loc("tmp5"(#loc24)) +#loc99 = loc("tmp6"(#loc25)) +#loc100 = loc("tmp7"(#loc26)) +#loc101 = loc("tmp8"(#loc27)) +#loc102 = loc("tmp9"(#loc28)) +#loc103 = loc("tmp10"(#loc29)) +#loc104 = loc("tmp14"(#loc30)) +#loc105 = loc("tmp15"(#loc31)) +#loc106 = loc("tmp16"(#loc32)) +#loc107 = loc("tmp17"(#loc33)) +#loc108 = loc("tmp18"(#loc34)) +#loc109 = loc("tmp18"(#loc35)) +#loc110 = loc("tmp19"(#loc36)) +#loc111 = loc("tmp21"(#loc37)) +#loc112 = loc("tmp22"(#loc38)) +#loc113 = loc("tmp25"(#loc39)) +#loc114 = loc("tmp26"(#loc40)) +#loc115 = loc("tmp27"(#loc41)) +#loc116 = loc("tmp29"(#loc42)) +#loc117 = loc("tmp30"(#loc43)) +#loc118 = loc("tmp31"(#loc44)) +#loc119 = loc("tmp33"(#loc45)) +#loc120 = loc("tmp34"(#loc46)) +#loc121 = loc("tmp37"(#loc47)) +#loc122 = loc("_tmp36"(#loc48)) +#loc123 = loc("_tmp36"(#loc49)) +#loc125 = loc("tmp36"(#loc54)) +#loc126 = loc("tmp39"(#loc55)) +#loc127 = loc("tmp41"(#loc56)) +#loc128 = loc("tmp42"(#loc57)) +#loc129 = loc("tmp44"(#loc58)) +#loc130 = loc("tmp43"(#loc59)) +#loc131 = loc("tmp45"(#loc60)) +#loc132 = loc("tmp47"(#loc61)) +#loc133 = loc("tmp46"(#loc62)) +#loc134 = loc(fused[#loc84, #loc83]) +#loc135 = loc(fused[#loc87, #loc86]) +#loc136 = loc(fused[#loc89, #loc77]) +#loc137 = loc(fused[#loc92, #loc91]) +#loc138 = loc(callsite(#loc51 at #loc124)) +#loc140 = loc(fused[#loc129, #loc130]) +#loc141 = loc(fused[#loc132, #loc133]) +#loc142 = loc(callsite(#loc53 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fb48090353e5e0977ac2ae874fcef4e5d197a1bd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/E65YV7Z3DD2J7KPCT73V7EFLOORC5DAO7VTOILSRPD2HSYH62YHQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc1 = loc(unknown) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("ks2"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc128 = loc("tmp36"(#loc54)) +#loc143 = loc(callsite(#loc1 at #loc128)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %x1 = arith.constant 16 : i64 loc(#loc78) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %tmp40 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc79) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<1x1024xi64> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x1024xi1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x1024xi32> loc(#loc1) + %cst_3 = arith.constant dense<16384> : tensor<1x1024xi32> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1024xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc80) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc81) + %xmask_5 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_base = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc82) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<1024xi32> -> tensor<1x1024xi32> loc(#loc83) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc84) + %x0_7 = arith.remsi %x0, %ks0 : i64 loc(#loc84) + %x1_8 = arith.divsi %x0, %ks0 : i64 loc(#loc85) + %x1_9 = arith.remsi %x1_8, %x1 : i64 loc(#loc78) + %x2 = arith.divsi %x0, %ks2 : i64 loc(#loc86) + %_tmp36 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c1024_i32 iter_args(%_tmp36_11 = %cst_4) -> (tensor<1x1024xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x1024xi32> loc(#loc88) + %r0_index_12 = arith.addi %r0_index, %r0_base_6 : tensor<1x1024xi32> loc(#loc88) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_3 : tensor<1x1024xi32> loc(#loc89) + %r0_3 = arith.remsi %r0_index_12, %cst_2 : tensor<1x1024xi32> loc(#loc90) + %r0_4 = arith.divsi %r0_index_12, %cst_2 : tensor<1x1024xi32> loc(#loc91) + %tmp0 = arith.muli %x0_7, %c128_i64 : i64 loc(#loc92) + %tmp0_13 = arith.extsi %r0_3 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc93) + %tmp0_14 = tt.splat %tmp0 : i64 -> tensor<1x1024xi64> loc(#loc138) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<1x1024xi64> loc(#loc93) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x1024xi64> loc(#loc94) + %tmp2_16 = arith.cmpi slt, %tmp0_15, %tmp2 : tensor<1x1024xi64> loc(#loc94) + %tmp3 = arith.muli %x1_9, %c128_i64 : i64 loc(#loc95) + %tmp3_17 = arith.extsi %r0_4 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc96) + %tmp3_18 = tt.splat %tmp3 : i64 -> tensor<1x1024xi64> loc(#loc139) + %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : tensor<1x1024xi64> loc(#loc96) + %tmp5 = arith.cmpi sge, %tmp3_19, %tmp0_15 : tensor<1x1024xi64> loc(#loc97) + %tmp6 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc98) + %tmp6_20 = tt.splat %tmp6 : !tt.ptr -> tensor<1x1024x!tt.ptr> loc(#loc98) + %tmp6_21 = arith.andi %r0_mask, %tmp2_16 : tensor<1x1024xi1> loc(#loc99) + %tmp6_22 = tt.splat %xmask : i1 -> tensor<1x1024xi1> loc(#loc140) + %tmp6_23 = arith.andi %tmp6_21, %tmp6_22 : tensor<1x1024xi1> loc(#loc100) + %tmp6_24 = tt.load %tmp6_20, %tmp6_23, %cst_4 evictionPolicy = evict_last : tensor<1x1024x!tt.ptr> loc(#loc101) + %tmp7 = arith.cmpi slt, %tmp0_15, %tmp6_24 : tensor<1x1024xi64> loc(#loc102) + %tmp8 = arith.cmpi slt, %tmp3_19, %tmp6_24 : tensor<1x1024xi64> loc(#loc103) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x1024xi1> loc(#loc104) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x1024xi1> loc(#loc105) + %tmp14 = arith.cmpi sge, %tmp0_15, %cst_0 : tensor<1x1024xi64> loc(#loc106) + %tmp15 = arith.remsi %tmp0_15, %cst_0 : tensor<1x1024xi64> loc(#loc107) + %tmp16 = arith.cmpi slt, %tmp15, %tmp6_24 : tensor<1x1024xi64> loc(#loc108) + %tmp17 = arith.andi %tmp14, %tmp16 : tensor<1x1024xi1> loc(#loc109) + %tmp18 = arith.subi %r0_3, %r0_4 : tensor<1x1024xi32> loc(#loc110) + %tmp18_25 = arith.muli %x1_9, %c-128_i64 : i64 loc(#loc111) + %tmp18_26 = arith.extsi %tmp18 : tensor<1x1024xi32> to tensor<1x1024xi64> loc(#loc112) + %tmp18_27 = tt.splat %tmp18_25 : i64 -> tensor<1x1024xi64> loc(#loc141) + %tmp18_28 = arith.addi %tmp18_26, %tmp18_27 : tensor<1x1024xi64> loc(#loc112) + %tmp18_29 = arith.addi %tmp18_28, %tmp0_14 : tensor<1x1024xi64> loc(#loc113) + %tmp19 = arith.remsi %tmp18_29, %cst_0 : tensor<1x1024xi64> loc(#loc114) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x1024xi64> loc(#loc115) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x1024xi64> loc(#loc116) + %tmp25 = arith.andi %tmp21, %tmp22 : tensor<1x1024xi1> loc(#loc117) + %tmp26 = arith.addi %tmp19, %cst_0 : tensor<1x1024xi64> loc(#loc118) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc119) + %tmp29 = arith.cmpi eq, %tmp27, %cst_4 : tensor<1x1024xi64> loc(#loc120) + %tmp30 = arith.andi %tmp17, %tmp29 : tensor<1x1024xi1> loc(#loc121) + %tmp31 = arith.ori %tmp10, %tmp30 : tensor<1x1024xi1> loc(#loc122) + %tmp33 = arith.select %tmp2_16, %tmp31, %cst_1 : tensor<1x1024xi1>, tensor<1x1024xi1> loc(#loc123) + %tmp34 = arith.extui %tmp33 : tensor<1x1024xi1> to tensor<1x1024xi64> loc(#loc124) + %tmp37 = arith.addi %_tmp36_11, %tmp34 : tensor<1x1024xi64> loc(#loc125) + %_tmp36_30 = arith.andi %r0_mask, %tmp6_22 : tensor<1x1024xi1> loc(#loc126) + %_tmp36_31 = arith.select %_tmp36_30, %tmp37, %_tmp36_11 : tensor<1x1024xi1>, tensor<1x1024xi64> loc(#loc127) + scf.yield %_tmp36_31 : tensor<1x1024xi64> loc(#loc52) + } loc(#loc87) + %tmp36 = "tt.reduce"(%_tmp36) <{axis = 1 : i32}> ({ + ^bb0(%tmp36_11: i64 loc(callsite(#loc1 at #loc128)), %tmp36_12: i64 loc(callsite(#loc1 at #loc128))): + %tmp36_13 = arith.addi %tmp36_11, %tmp36_12 : i64 loc(#loc146) + tt.reduce.return %tmp36_13 : i64 loc(#loc142) + }) : (tensor<1x1024xi64>) -> tensor<1xi64> loc(#loc142) + %tmp36_10 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp39 = arith.cmpi sgt, %tmp36_10, %cst : tensor<1x1xi64> loc(#loc130) + %tmp41 = arith.cmpi slt, %tmp36_10, %tmp40 : tensor<1x1xi64> loc(#loc131) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1> loc(#loc132) + %tmp44 = arith.extui %tmp42 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc144) + %tmp45 = arith.cmpi eq, %tmp36_10, %tmp40 : tensor<1x1xi64> loc(#loc135) + %tmp47 = arith.extui %tmp45 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc145) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc65) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc65) + tt.store %1, %tmp44, %xmask_5 : tensor<1x1x!tt.ptr> loc(#loc66) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc67) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc67) + tt.store %3, %tmp47, %xmask_5 : tensor<1x1x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":80:35) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc78 = loc("x1"(#loc2)) +#loc79 = loc("tmp40"(#loc4)) +#loc80 = loc("xoffset"(#loc5)) +#loc81 = loc("xmask"(#loc6)) +#loc82 = loc("r0_base"(#loc7)) +#loc83 = loc("r0_base"(#loc8)) +#loc84 = loc("x0"(#loc9)) +#loc85 = loc("x1"(#loc10)) +#loc86 = loc("x2"(#loc11)) +#loc87 = loc("_tmp36"(#loc3)) +#loc88 = loc("r0_index"(#loc12)) +#loc89 = loc("r0_mask"(#loc13)) +#loc90 = loc("r0_3"(#loc14)) +#loc91 = loc("r0_4"(#loc15)) +#loc92 = loc("tmp0"(#loc16)) +#loc93 = loc("tmp0"(#loc17)) +#loc94 = loc("tmp2"(#loc18)) +#loc95 = loc("tmp3"(#loc19)) +#loc96 = loc("tmp3"(#loc20)) +#loc97 = loc("tmp5"(#loc21)) +#loc98 = loc("tmp6"(#loc22)) +#loc99 = loc("tmp6"(#loc23)) +#loc100 = loc("tmp6"(#loc24)) +#loc101 = loc("tmp6"(#loc25)) +#loc102 = loc("tmp7"(#loc26)) +#loc103 = loc("tmp8"(#loc27)) +#loc104 = loc("tmp9"(#loc28)) +#loc105 = loc("tmp10"(#loc29)) +#loc106 = loc("tmp14"(#loc30)) +#loc107 = loc("tmp15"(#loc31)) +#loc108 = loc("tmp16"(#loc32)) +#loc109 = loc("tmp17"(#loc33)) +#loc110 = loc("tmp18"(#loc34)) +#loc111 = loc("tmp18"(#loc35)) +#loc112 = loc("tmp18"(#loc36)) +#loc113 = loc("tmp18"(#loc37)) +#loc114 = loc("tmp19"(#loc38)) +#loc115 = loc("tmp21"(#loc39)) +#loc116 = loc("tmp22"(#loc40)) +#loc117 = loc("tmp25"(#loc41)) +#loc118 = loc("tmp26"(#loc42)) +#loc119 = loc("tmp27"(#loc43)) +#loc120 = loc("tmp29"(#loc44)) +#loc121 = loc("tmp30"(#loc45)) +#loc122 = loc("tmp31"(#loc46)) +#loc123 = loc("tmp33"(#loc47)) +#loc124 = loc("tmp34"(#loc48)) +#loc125 = loc("tmp37"(#loc49)) +#loc126 = loc("_tmp36"(#loc50)) +#loc127 = loc("_tmp36"(#loc51)) +#loc129 = loc("tmp36"(#loc56)) +#loc130 = loc("tmp39"(#loc57)) +#loc131 = loc("tmp41"(#loc58)) +#loc132 = loc("tmp42"(#loc59)) +#loc133 = loc("tmp44"(#loc60)) +#loc134 = loc("tmp43"(#loc61)) +#loc135 = loc("tmp45"(#loc62)) +#loc136 = loc("tmp47"(#loc63)) +#loc137 = loc("tmp46"(#loc64)) +#loc138 = loc(fused[#loc93, #loc92]) +#loc139 = loc(fused[#loc96, #loc95]) +#loc140 = loc(fused[#loc100, #loc81]) +#loc141 = loc(fused[#loc112, #loc111]) +#loc142 = loc(callsite(#loc53 at #loc128)) +#loc144 = loc(fused[#loc133, #loc134]) +#loc145 = loc(fused[#loc136, #loc137]) +#loc146 = loc(callsite(#loc55 at #loc142)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..34b00ac6984fb9c0efb80dd3beff4ebd3f71a50e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9dcf7709724501b4b5b5922a3cf4ec4e14544035 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d148fe77f7ed5dfed5ec9d0346e7b78ee43e0163 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "20789ed347536015de365783d5f81371fbd0587b607eff6d47e815134479247e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..2c6be4dee3067ebafaffb860c0c51d55c518b038 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,393 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = icmp samesign ult i32 %8, 32, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 15, !dbg !9 + %12 = and i32 %8, 15, !dbg !10 + %13 = lshr i32 %8, 4, !dbg !11 + %14 = mul nuw nsw i32 %11, 17, !dbg !12 + %15 = add nuw nsw i32 %14, %12, !dbg !13 + %16 = mul i32 %13, 272, !dbg !14 + %17 = add i32 %15, %16, !dbg !15 + %18 = sext i32 %17 to i64, !dbg !16 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %18, !dbg !16 + %20 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %19, i1 %9) #3, !dbg !17 + %22 = lshr i32 %10, 1, !dbg !18 + %.lobit = and i32 %22, 1, !dbg !18 + %23 = and i32 %10, 1, !dbg !18 + %24 = lshr i32 %10, 2, !dbg !18 + %.lobit1 = and i32 %24, 1, !dbg !18 + %25 = lshr i32 %10, 3, !dbg !18 + %.lobit2 = and i32 %25, 1, !dbg !18 + %26 = xor i32 %23, 1, !dbg !22 + %27 = xor i32 %.lobit, 1, !dbg !22 + %28 = xor i32 %.lobit1, 1, !dbg !22 + %29 = xor i32 %.lobit2, 1, !dbg !22 + %30 = mul nuw nsw i32 %20, %26, !dbg !23 + %31 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %30, i32 1, i32 31), !dbg !24 + %32 = add i32 %31, %30, !dbg !27 + %33 = mul nuw nsw i32 %20, %23, !dbg !28 + %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 1, i32 31), !dbg !24 + %35 = add i32 %34, %33, !dbg !27 + %36 = mul nuw nsw i32 %26, %11, !dbg !29 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 1, i32 31), !dbg !24 + %38 = add i32 %37, %36, !dbg !27 + %39 = mul nuw nsw i32 %11, %23, !dbg !30 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !24 + %41 = add i32 %40, %39, !dbg !27 + %42 = trunc i32 %22 to i1, !dbg !31 + %43 = icmp sge i32 %32, %35, !dbg !31 + %44 = icmp ne i32 %32, %35, !dbg !31 + %45 = icmp sle i32 %38, %41, !dbg !31 + %46 = or i1 %44, %45, !dbg !31 + %47 = and i1 %43, %46, !dbg !31 + %.not = xor i1 %47, %42, !dbg !31 + %48 = xor i32 %35, %32, !dbg !32 + %49 = select i1 %.not, i32 0, i32 %48, !dbg !33 + %50 = xor i32 %49, %20, !dbg !34 + %51 = xor i32 %41, %38, !dbg !35 + %52 = select i1 %.not, i32 0, i32 %51, !dbg !36 + %53 = xor i32 %52, %11, !dbg !37 + %54 = mul nuw nsw i32 %50, %27, !dbg !23 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 2, i32 31), !dbg !24 + %56 = add i32 %54, %55, !dbg !27 + %57 = mul nuw nsw i32 %50, %.lobit, !dbg !28 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 2, i32 31), !dbg !24 + %59 = add i32 %57, %58, !dbg !27 + %60 = mul nuw nsw i32 %53, %27, !dbg !29 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !24 + %62 = add i32 %60, %61, !dbg !27 + %63 = mul nuw nsw i32 %53, %.lobit, !dbg !30 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !24 + %65 = add i32 %63, %64, !dbg !27 + %66 = trunc i32 %24 to i1, !dbg !31 + %67 = icmp sge i32 %56, %59, !dbg !31 + %68 = icmp ne i32 %56, %59, !dbg !31 + %69 = icmp sle i32 %62, %65, !dbg !31 + %70 = or i1 %68, %69, !dbg !31 + %71 = and i1 %67, %70, !dbg !31 + %.not3 = xor i1 %71, %66, !dbg !31 + %72 = xor i32 %56, %59, !dbg !32 + %73 = select i1 %.not3, i32 0, i32 %72, !dbg !33 + %74 = xor i32 %73, %50, !dbg !34 + %75 = xor i32 %62, %65, !dbg !35 + %76 = select i1 %.not3, i32 0, i32 %75, !dbg !36 + %77 = xor i32 %76, %53, !dbg !37 + %78 = mul nuw nsw i32 %74, %26, !dbg !23 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !24 + %80 = add i32 %78, %79, !dbg !27 + %81 = mul nuw nsw i32 %74, %23, !dbg !28 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 1, i32 31), !dbg !24 + %83 = add i32 %81, %82, !dbg !27 + %84 = mul nuw nsw i32 %77, %26, !dbg !29 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !24 + %86 = add i32 %84, %85, !dbg !27 + %87 = mul nuw nsw i32 %77, %23, !dbg !30 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !24 + %89 = add i32 %87, %88, !dbg !27 + %90 = icmp sge i32 %80, %83, !dbg !31 + %91 = icmp ne i32 %80, %83, !dbg !31 + %92 = icmp sle i32 %86, %89, !dbg !31 + %93 = or i1 %91, %92, !dbg !31 + %94 = and i1 %90, %93, !dbg !31 + %.not4 = xor i1 %94, %66, !dbg !31 + %95 = xor i32 %80, %83, !dbg !32 + %96 = select i1 %.not4, i32 0, i32 %95, !dbg !33 + %97 = xor i32 %96, %74, !dbg !34 + %98 = xor i32 %86, %89, !dbg !35 + %99 = select i1 %.not4, i32 0, i32 %98, !dbg !36 + %100 = xor i32 %99, %77, !dbg !37 + %101 = mul nuw nsw i32 %97, %28, !dbg !23 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 4, i32 31), !dbg !24 + %103 = add i32 %101, %102, !dbg !27 + %104 = mul nuw nsw i32 %97, %.lobit1, !dbg !28 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !24 + %106 = add i32 %104, %105, !dbg !27 + %107 = mul nuw nsw i32 %100, %28, !dbg !29 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !24 + %109 = add i32 %107, %108, !dbg !27 + %110 = mul nuw nsw i32 %100, %.lobit1, !dbg !30 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !24 + %112 = add i32 %110, %111, !dbg !27 + %113 = trunc i32 %25 to i1, !dbg !31 + %114 = icmp sge i32 %103, %106, !dbg !31 + %115 = icmp ne i32 %103, %106, !dbg !31 + %116 = icmp sle i32 %109, %112, !dbg !31 + %117 = or i1 %115, %116, !dbg !31 + %118 = and i1 %114, %117, !dbg !31 + %.not5 = xor i1 %118, %113, !dbg !31 + %119 = xor i32 %103, %106, !dbg !32 + %120 = select i1 %.not5, i32 0, i32 %119, !dbg !33 + %121 = xor i32 %120, %97, !dbg !34 + %122 = xor i32 %109, %112, !dbg !35 + %123 = select i1 %.not5, i32 0, i32 %122, !dbg !36 + %124 = xor i32 %123, %100, !dbg !37 + %125 = mul nuw nsw i32 %121, %27, !dbg !23 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 2, i32 31), !dbg !24 + %127 = add i32 %125, %126, !dbg !27 + %128 = mul nuw nsw i32 %121, %.lobit, !dbg !28 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !24 + %130 = add i32 %128, %129, !dbg !27 + %131 = mul nuw nsw i32 %124, %27, !dbg !29 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !24 + %133 = add i32 %131, %132, !dbg !27 + %134 = mul nuw nsw i32 %124, %.lobit, !dbg !30 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !24 + %136 = add i32 %134, %135, !dbg !27 + %137 = icmp sge i32 %127, %130, !dbg !31 + %138 = icmp ne i32 %127, %130, !dbg !31 + %139 = icmp sle i32 %133, %136, !dbg !31 + %140 = or i1 %138, %139, !dbg !31 + %141 = and i1 %137, %140, !dbg !31 + %.not6 = xor i1 %141, %113, !dbg !31 + %142 = xor i32 %127, %130, !dbg !32 + %143 = select i1 %.not6, i32 0, i32 %142, !dbg !33 + %144 = xor i32 %143, %121, !dbg !34 + %145 = xor i32 %133, %136, !dbg !35 + %146 = select i1 %.not6, i32 0, i32 %145, !dbg !36 + %147 = xor i32 %146, %124, !dbg !37 + %148 = mul nuw nsw i32 %144, %26, !dbg !23 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 1, i32 31), !dbg !24 + %150 = add i32 %148, %149, !dbg !27 + %151 = mul nuw nsw i32 %144, %23, !dbg !28 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 1, i32 31), !dbg !24 + %153 = add i32 %151, %152, !dbg !27 + %154 = mul nuw nsw i32 %147, %26, !dbg !29 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !24 + %156 = add i32 %154, %155, !dbg !27 + %157 = mul nuw nsw i32 %147, %23, !dbg !30 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !24 + %159 = add i32 %157, %158, !dbg !27 + %160 = icmp sge i32 %150, %153, !dbg !31 + %161 = icmp ne i32 %150, %153, !dbg !31 + %162 = icmp sle i32 %156, %159, !dbg !31 + %163 = or i1 %161, %162, !dbg !31 + %164 = and i1 %160, %163, !dbg !31 + %.not7 = xor i1 %164, %113, !dbg !31 + %165 = xor i32 %150, %153, !dbg !32 + %166 = select i1 %.not7, i32 0, i32 %165, !dbg !33 + %167 = xor i32 %166, %144, !dbg !34 + %168 = xor i32 %156, %159, !dbg !35 + %169 = select i1 %.not7, i32 0, i32 %168, !dbg !36 + %170 = xor i32 %169, %147, !dbg !37 + %171 = mul nuw nsw i32 %167, %29, !dbg !23 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 8, i32 31), !dbg !24 + %173 = add i32 %171, %172, !dbg !27 + %174 = mul nuw nsw i32 %167, %.lobit2, !dbg !28 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 8, i32 31), !dbg !24 + %176 = add i32 %174, %175, !dbg !27 + %177 = mul nuw nsw i32 %170, %29, !dbg !29 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !24 + %179 = add i32 %177, %178, !dbg !27 + %180 = mul nuw nsw i32 %170, %.lobit2, !dbg !30 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !24 + %182 = add i32 %180, %181, !dbg !27 + %183 = icmp slt i32 %173, %176, !dbg !38 + %184 = icmp eq i32 %173, %176, !dbg !39 + %185 = icmp sgt i32 %179, %182, !dbg !40 + %186 = and i1 %184, %185, !dbg !41 + %187 = or i1 %183, %186, !dbg !42 + %188 = xor i32 %173, %176, !dbg !32 + %189 = select i1 %187, i32 %188, i32 0, !dbg !33 + %190 = xor i32 %189, %167, !dbg !34 + %191 = xor i32 %179, %182, !dbg !35 + %192 = select i1 %187, i32 %191, i32 0, !dbg !36 + %193 = xor i32 %192, %170, !dbg !37 + %194 = mul nuw nsw i32 %190, %28, !dbg !23 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !24 + %196 = add i32 %194, %195, !dbg !27 + %197 = mul nuw nsw i32 %190, %.lobit1, !dbg !28 + %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !24 + %199 = add i32 %197, %198, !dbg !27 + %200 = mul nuw nsw i32 %193, %28, !dbg !29 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !24 + %202 = add i32 %200, %201, !dbg !27 + %203 = mul nuw nsw i32 %193, %.lobit1, !dbg !30 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !24 + %205 = add i32 %203, %204, !dbg !27 + %206 = icmp slt i32 %196, %199, !dbg !38 + %207 = icmp eq i32 %196, %199, !dbg !39 + %208 = icmp sgt i32 %202, %205, !dbg !40 + %209 = and i1 %207, %208, !dbg !41 + %210 = or i1 %206, %209, !dbg !42 + %211 = xor i32 %196, %199, !dbg !32 + %212 = select i1 %210, i32 %211, i32 0, !dbg !33 + %213 = xor i32 %212, %190, !dbg !34 + %214 = xor i32 %202, %205, !dbg !35 + %215 = select i1 %210, i32 %214, i32 0, !dbg !36 + %216 = xor i32 %215, %193, !dbg !37 + %217 = mul nuw nsw i32 %213, %27, !dbg !23 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 2, i32 31), !dbg !24 + %219 = add i32 %217, %218, !dbg !27 + %220 = mul nuw nsw i32 %213, %.lobit, !dbg !28 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !24 + %222 = add i32 %220, %221, !dbg !27 + %223 = mul nuw nsw i32 %216, %27, !dbg !29 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !24 + %225 = add i32 %223, %224, !dbg !27 + %226 = mul nuw nsw i32 %216, %.lobit, !dbg !30 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !24 + %228 = add i32 %226, %227, !dbg !27 + %229 = icmp slt i32 %219, %222, !dbg !38 + %230 = icmp eq i32 %219, %222, !dbg !39 + %231 = icmp sgt i32 %225, %228, !dbg !40 + %232 = and i1 %230, %231, !dbg !41 + %233 = or i1 %229, %232, !dbg !42 + %234 = xor i32 %219, %222, !dbg !32 + %235 = select i1 %233, i32 %234, i32 0, !dbg !33 + %236 = xor i32 %235, %213, !dbg !34 + %237 = xor i32 %225, %228, !dbg !35 + %238 = select i1 %233, i32 %237, i32 0, !dbg !36 + %239 = xor i32 %238, %216, !dbg !37 + %240 = mul nuw nsw i32 %236, %26, !dbg !23 + %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 1, i32 31), !dbg !24 + %242 = add i32 %240, %241, !dbg !27 + %243 = mul nuw nsw i32 %236, %23, !dbg !28 + %244 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 1, i32 31), !dbg !24 + %245 = add i32 %243, %244, !dbg !27 + %246 = mul nuw nsw i32 %239, %26, !dbg !29 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !24 + %248 = add i32 %246, %247, !dbg !27 + %249 = mul nuw nsw i32 %239, %23, !dbg !30 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !24 + %251 = add i32 %249, %250, !dbg !27 + %252 = icmp slt i32 %242, %245, !dbg !38 + %253 = icmp eq i32 %242, %245, !dbg !39 + %254 = icmp sgt i32 %248, %251, !dbg !40 + %255 = and i1 %253, %254, !dbg !41 + %256 = or i1 %252, %255, !dbg !42 + %257 = xor i32 %248, %251, !dbg !35 + %258 = select i1 %256, i32 %257, i32 0, !dbg !36 + %259 = xor i32 %258, %239, !dbg !37 + %narrow = select i1 %9, i32 %21, i32 0, !dbg !43 + %260 = sext i32 %narrow to i64, !dbg !43 + %261 = ashr i32 %narrow, 31, !dbg !44 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %narrow, i32 8, i32 31), !dbg !44 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !44 + %264 = insertelement <2 x i32> poison, i32 %262, i64 0, !dbg !44 + %265 = insertelement <2 x i32> %264, i32 %263, i64 1, !dbg !44 + %266 = bitcast <2 x i32> %265 to i64, !dbg !44 + %267 = add i64 %266, %260, !dbg !46 + %extelt.offset = lshr i64 %267, 32, !dbg !44 + %268 = trunc nuw i64 %extelt.offset to i32, !dbg !44 + %269 = trunc i64 %267 to i32, !dbg !44 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !44 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 4, i32 31), !dbg !44 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !44 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !44 + %274 = bitcast <2 x i32> %273 to i64, !dbg !44 + %275 = add i64 %267, %274, !dbg !46 + %extelt.offset8 = lshr i64 %275, 32, !dbg !44 + %276 = trunc nuw i64 %extelt.offset8 to i32, !dbg !44 + %277 = trunc i64 %275 to i32, !dbg !44 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 2, i32 31), !dbg !44 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 2, i32 31), !dbg !44 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !44 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !44 + %282 = bitcast <2 x i32> %281 to i64, !dbg !44 + %283 = add i64 %275, %282, !dbg !46 + %extelt.offset9 = lshr i64 %283, 32, !dbg !44 + %284 = trunc nuw i64 %extelt.offset9 to i32, !dbg !44 + %285 = trunc i64 %283 to i32, !dbg !44 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 1, i32 31), !dbg !44 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 1, i32 31), !dbg !44 + %288 = insertelement <2 x i32> poison, i32 %286, i64 0, !dbg !44 + %289 = insertelement <2 x i32> %288, i32 %287, i64 1, !dbg !44 + %290 = bitcast <2 x i32> %289 to i64, !dbg !44 + %291 = add i64 %283, %290, !dbg !46 + %292 = trunc i64 %291 to i32, !dbg !47 + %293 = shl i32 %8, 4, !dbg !48 + %294 = or disjoint i32 %11, %293, !dbg !49 + %295 = sext i32 %294 to i64, !dbg !50 + %296 = getelementptr i32, ptr addrspace(1) %1, i64 %295, !dbg !50 + %297 = and i32 %10, 48, !dbg !51 + %298 = icmp eq i32 %297, 0, !dbg !51 + %299 = and i1 %9, %298, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %259, ptr addrspace(1) %296, i1 %299) #3, !dbg !51 + %300 = zext nneg i32 %8 to i64, !dbg !52 + %301 = getelementptr i32, ptr addrspace(1) %2, i64 %300, !dbg !52 + %302 = and i32 %10, 63, !dbg !53 + %303 = icmp eq i32 %302, 0, !dbg !53 + %304 = and i1 %9, %303, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %292, ptr addrspace(1) %301, i1 %304) #3, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 21, scope: !4) +!9 = !DILocation(line: 27, column: 38, scope: !4) +!10 = !DILocation(line: 33, column: 19, scope: !4) +!11 = !DILocation(line: 34, column: 19, scope: !4) +!12 = !DILocation(line: 36, column: 38, scope: !4) +!13 = !DILocation(line: 36, column: 35, scope: !4) +!14 = !DILocation(line: 36, column: 49, scope: !4) +!15 = !DILocation(line: 36, column: 45, scope: !4) +!16 = !DILocation(line: 36, column: 30, scope: !4) +!17 = !DILocation(line: 36, column: 54, scope: !4) +!18 = !DILocation(line: 627, column: 44, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 41, column: 67, scope: !4) +!22 = !DILocation(line: 537, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 538, column: 40, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !21) +!25 = distinct !DILexicalBlockFile(scope: !4, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!27 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !21) +!28 = !DILocation(line: 539, column: 41, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 548, column: 23, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 551, column: 23, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 599, column: 28, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 600, column: 38, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 600, column: 46, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 600, column: 15, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 601, column: 48, scope: !19, inlinedAt: !21) +!36 = !DILocation(line: 601, column: 59, scope: !19, inlinedAt: !21) +!37 = !DILocation(line: 601, column: 22, scope: !19, inlinedAt: !21) +!38 = !DILocation(line: 574, column: 22, scope: !19, inlinedAt: !21) +!39 = !DILocation(line: 591, column: 21, scope: !19, inlinedAt: !21) +!40 = !DILocation(line: 594, column: 40, scope: !19, inlinedAt: !21) +!41 = !DILocation(line: 594, column: 29, scope: !19, inlinedAt: !21) +!42 = !DILocation(line: 594, column: 23, scope: !19, inlinedAt: !21) +!43 = !DILocation(line: 44, column: 34, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !25, inlinedAt: !45) +!45 = !DILocation(line: 45, column: 26, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !25, inlinedAt: !45) +!47 = !DILocation(line: 48, column: 21, scope: !4) +!48 = !DILocation(line: 49, column: 35, scope: !4) +!49 = !DILocation(line: 49, column: 32, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 47, scope: !4) +!52 = !DILocation(line: 50, column: 25, scope: !4) +!53 = !DILocation(line: 50, column: 37, scope: !4) +!54 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d39e66fc4e2b88b439bbfd7b6e0a5558cc7e5065 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,863 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 64 +{ + .reg .pred %p<64>; + .reg .b32 %r<224>; + .reg .b64 %rd<26>; + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:18:0 + +// %bb.0: + ld.param.b64 %rd5, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:24:28 + mov.u32 %r5, %ctaid.x; + .loc 1 26 21 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:26:21 + setp.lt.u32 %p1, %r5, 32; + ld.param.b64 %rd7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 27 38 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:27:38 + mov.u32 %r6, %tid.x; + and.b32 %r7, %r6, 15; + .loc 1 33 19 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:33:19 + and.b32 %r8, %r5, 15; + .loc 1 34 19 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:34:19 + shr.u32 %r9, %r5, 4; + .loc 1 36 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:35 + mad.lo.s32 %r10, %r7, 17, %r8; + .loc 1 36 45 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:45 + mad.lo.s32 %r11, %r9, 272, %r10; + .loc 1 36 30 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:30 + mad.wide.s32 %rd1, %r11, 4, %rd5; + .loc 1 36 54 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shr.u32 %r12, %r6, 1; + bfe.u32 %r13, %r6, 1, 1; + and.b32 %r14, %r6, 1; + shr.u32 %r15, %r6, 2; + bfe.u32 %r16, %r6, 2, 1; + shr.u32 %r17, %r6, 3; + bfe.u32 %r18, %r6, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r19, %r14, 1; + xor.b32 %r20, %r13, 1; + xor.b32 %r21, %r16, 1; + xor.b32 %r22, %r18, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r23, %r1, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r24, %r23, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r25, %r24, %r23; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r26, %r1, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r27, %r26, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r28, %r27, %r26; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r29, %r19, %r7; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r31, %r30, %r29; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r32, %r7, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r34, %r33, %r32; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r35, %r12, 1; + setp.ne.b32 %p5, %r35, 0; + setp.ge.s32 %p6, %r25, %r28; + setp.ne.b32 %p7, %r25, %r28; + setp.le.s32 %p8, %r31, %r34; + or.pred %p9, %p7, %p8; + and.pred %p10, %p6, %p9; + xor.pred %p11, %p10, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r36, %r28, %r25; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r37, 0, %r36, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r38, %r37, %r1; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r39, %r34, %r31; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r40, 0, %r39, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r41, %r40, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r42, %r38, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r43, %r42, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r44, %r42, %r43; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r45, %r38, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r47, %r45, %r46; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r48, %r41, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r49, %r48, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r50, %r48, %r49; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r51, %r41, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r52, %r51, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r53, %r51, %r52; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r54, %r15, 1; + setp.ne.b32 %p12, %r54, 0; + setp.ge.s32 %p13, %r44, %r47; + setp.ne.b32 %p14, %r44, %r47; + setp.le.s32 %p15, %r50, %r53; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r55, %r44, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r56, 0, %r55, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r57, %r56, %r38; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r58, %r50, %r53; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r59, 0, %r58, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r60, %r59, %r41; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r61, %r57, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r63, %r61, %r62; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r64, %r57, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r66, %r64, %r65; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r67, %r60, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r68, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r69, %r67, %r68; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r70, %r60, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r72, %r70, %r71; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.ge.s32 %p19, %r63, %r66; + setp.ne.b32 %p20, %r63, %r66; + setp.le.s32 %p21, %r69, %r72; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r73, %r63, %r66; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r74, 0, %r73, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r75, %r74, %r57; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r76, %r69, %r72; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r77, 0, %r76, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r78, %r77, %r60; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r79, %r75, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r80, %r79, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r81, %r79, %r80; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r82, %r75, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r83, %r82, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r84, %r82, %r83; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r85, %r78, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r86, %r85, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r87, %r85, %r86; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r88, %r78, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r90, %r88, %r89; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.b32 %r91, %r17, 1; + setp.ne.b32 %p25, %r91, 0; + setp.ge.s32 %p26, %r81, %r84; + setp.ne.b32 %p27, %r81, %r84; + setp.le.s32 %p28, %r87, %r90; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r92, %r81, %r84; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r93, 0, %r92, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r94, %r93, %r75; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r95, %r87, %r90; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r96, 0, %r95, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r97, %r96, %r78; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r98, %r94, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r100, %r98, %r99; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r101, %r94, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r103, %r101, %r102; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r104, %r97, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r105, %r104, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r106, %r104, %r105; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r107, %r97, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r108, %r107, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r109, %r107, %r108; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.ge.s32 %p32, %r100, %r103; + setp.ne.b32 %p33, %r100, %r103; + setp.le.s32 %p34, %r106, %r109; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r110, %r100, %r103; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r111, 0, %r110, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r112, %r111, %r94; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r113, %r106, %r109; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r114, 0, %r113, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r115, %r114, %r97; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r116, %r112, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r118, %r116, %r117; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r119, %r112, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r121, %r119, %r120; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r122, %r115, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r124, %r122, %r123; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r125, %r115, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r127, %r125, %r126; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.ge.s32 %p38, %r118, %r121; + setp.ne.b32 %p39, %r118, %r121; + setp.le.s32 %p40, %r124, %r127; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r128, %r118, %r121; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r129, 0, %r128, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r130, %r129, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r131, %r124, %r127; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r132, 0, %r131, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r133, %r132, %r115; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r134, %r130, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r136, %r134, %r135; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r137, %r130, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r138, %r137, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r139, %r137, %r138; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r140, %r133, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r141, %r140, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r142, %r140, %r141; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r143, %r133, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r144, %r143, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r145, %r143, %r144; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p44, %r136, %r139; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p45, %r136, %r139; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p46, %r142, %r145; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r146, %r136, %r139; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r147, %r146, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r148, %r147, %r130; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r149, %r142, %r145; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r150, %r149, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r151, %r150, %r133; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r152, %r148, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r153, %r152, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r154, %r152, %r153; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r155, %r148, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r156, %r155, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r157, %r155, %r156; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r158, %r151, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r159, %r158, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r160, %r158, %r159; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r161, %r151, %r16; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r162, %r161, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r163, %r161, %r162; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p49, %r154, %r157; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p50, %r154, %r157; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p51, %r160, %r163; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r164, %r154, %r157; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r165, %r164, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r166, %r165, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r167, %r160, %r163; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r168, %r167, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r169, %r168, %r151; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r170, %r166, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r171, %r170, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r172, %r170, %r171; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r173, %r166, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r174, %r173, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r175, %r173, %r174; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r176, %r169, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r177, %r176, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r178, %r176, %r177; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r179, %r169, %r13; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r180, %r179, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r181, %r179, %r180; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p54, %r172, %r175; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p55, %r172, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p56, %r178, %r181; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r182, %r172, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r183, %r182, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r184, %r183, %r166; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r185, %r178, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r186, %r185, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r187, %r186, %r169; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r188, %r184, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r189, %r188, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r190, %r188, %r189; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r191, %r184, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r193, %r191, %r192; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r194, %r187, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r195, %r194, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r196, %r194, %r195; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + mul.lo.s32 %r197, %r187, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + shfl.sync.bfly.b32 %r198, %r197, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + add.s32 %r199, %r197, %r198; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.lt.s32 %p59, %r190, %r193; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.eq.b32 %p60, %r190, %r193; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + setp.gt.s32 %p61, %r196, %r199; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r200, %r196, %r199; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + selp.b32 %r201, %r200, 0, %p61; + selp.b32 %r202, %r201, 0, %p60; + selp.b32 %r203, %r200, %r202, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:41:67 ] + xor.b32 %r3, %r203, %r187; +$L__tmp2: + .loc 1 44 34 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:44:34 + selp.b32 %r204, %r2, 0, %p1; + cvt.s64.s32 %rd8, %r204; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + shr.s32 %r205, %r204, 31; + shfl.sync.bfly.b32 %r206, %r204, 8, 31, -1; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + cvt.u64.u32 %rd9, %r206; + cvt.u64.u32 %rd10, %r207; + shl.b64 %rd11, %rd10, 32; + or.b64 %rd12, %rd9, %rd11; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd13, %rd12, %rd8; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r208}, %rd13; + cvt.u32.u64 %r209, %rd13; + shfl.sync.bfly.b32 %r210, %r209, 4, 31, -1; + shfl.sync.bfly.b32 %r211, %r208, 4, 31, -1; + cvt.u64.u32 %rd14, %r210; + cvt.u64.u32 %rd15, %r211; + shl.b64 %rd16, %rd15, 32; + or.b64 %rd17, %rd14, %rd16; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd18, %rd13, %rd17; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r212}, %rd18; + cvt.u32.u64 %r213, %rd18; + shfl.sync.bfly.b32 %r214, %r213, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r212, 2, 31, -1; + cvt.u64.u32 %rd19, %r214; + cvt.u64.u32 %rd20, %r215; + shl.b64 %rd21, %rd20, 32; + or.b64 %rd22, %rd19, %rd21; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd23, %rd18, %rd22; + .loc 3 291 36 // standard.py:291:36 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + mov.b64 {_, %r216}, %rd23; + cvt.u32.u64 %r217, %rd23; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + shfl.sync.bfly.b32 %r219, %r216, 1, 31, -1; + cvt.u64.u32 %rd24, %r218; + .loc 3 261 15 // standard.py:261:15 @[ cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:45:26 ] + add.s64 %rd25, %rd23, %rd24; +$L__tmp4: + .loc 1 48 21 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:48:21 + cvt.u32.u64 %r4, %rd25; + .loc 1 49 35 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:35 + shl.b32 %r220, %r5, 4; + .loc 1 49 32 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:32 + or.b32 %r221, %r7, %r220; + .loc 1 49 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:25 + mad.wide.s32 %rd3, %r221, 4, %rd6; + .loc 1 49 47 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:49:47 + and.b32 %r222, %r6, 48; + setp.eq.b32 %p62, %r222, 0; + and.pred %p3, %p1, %p62; + // begin inline asm + @%p3 st.global.b32 [ %rd3 + 0 ], { %r3 }; + // end inline asm + .loc 1 50 25 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:25 + mad.wide.u32 %rd4, %r5, 4, %rd7; + .loc 1 50 37 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:37 + and.b32 %r223, %r6, 63; + setp.eq.b32 %p63, %r223, 0; + and.pred %p4, %p1, %p63; + // begin inline asm + @%p4 st.global.b32 [ %rd4 + 0 ], { %r4 }; + // end inline asm + .loc 1 50 4 // cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 55 +.b8 117 +.b8 54 +.b8 114 +.b8 103 +.b8 107 +.b8 111 +.b8 105 +.b8 104 +.b8 119 +.b8 105 +.b8 122 +.b8 117 +.b8 103 +.b8 101 +.b8 54 +.b8 52 +.b8 52 +.b8 109 +.b8 110 +.b8 103 +.b8 50 +.b8 100 +.b8 100 +.b8 103 +.b8 51 +.b8 98 +.b8 113 +.b8 105 +.b8 105 +.b8 120 +.b8 102 +.b8 120 +.b8 101 +.b8 103 +.b8 115 +.b8 109 +.b8 120 +.b8 117 +.b8 103 +.b8 116 +.b8 98 +.b8 108 +.b8 97 +.b8 102 +.b8 99 +.b8 54 +.b8 107 +.b8 122 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..c0b5420358093ebe0c6a4036168a6318c24f8ea8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1216 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc87 = loc(unknown) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc139 = loc("in_ptr0"(#loc)) +#loc140 = loc("out_ptr2"(#loc)) +#loc141 = loc("out_ptr3"(#loc)) +#loc142 = loc("xnumel"(#loc)) +#loc143 = loc("r0_numel"(#loc)) +#loc172 = loc("x"(#loc37)) +#loc173 = loc("idxs"(#loc37)) +#loc174 = loc("x"(#loc41)) +#loc175 = loc("idxs"(#loc41)) +#loc180 = loc("x"(#loc49)) +#loc181 = loc("idxs"(#loc49)) +#loc182 = loc("flip"(#loc49)) +#loc238 = loc("input"(#loc112)) +#loc239 = loc("a"(#loc116)) +#loc240 = loc("b"(#loc116)) +#loc242 = loc("x"(#loc121)) +#loc243 = loc("x"(#loc125)) +#loc244 = loc("input"(#loc134)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc144) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc145) + %xoffset = tt.get_program_id x : i32 loc(#loc146) + %xoffset_2 = arith.constant 1 : i32 loc(#loc147) + %xoffset_3 = arith.constant 1 : i32 loc(#loc147) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc147) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc148) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc149) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc150) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc150) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc151) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc151) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc152) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc153) + %r0_offset = arith.constant 0 : i32 loc(#loc154) + %r0_mask = arith.constant true loc(#loc155) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc155) + %x0 = arith.constant 16 : i32 loc(#loc156) + %x0_11 = arith.constant 16 : i32 loc(#loc156) + %x0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc156) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<1x1xi32> loc(#loc156) + %x1 = arith.constant 16 : i32 loc(#loc157) + %x1_14 = arith.constant 16 : i32 loc(#loc157) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc157) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<1x1xi32> loc(#loc157) + %tmp0 = arith.constant 17 : i32 loc(#loc158) + %tmp0_17 = arith.constant 17 : i32 loc(#loc158) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc158) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc158) + %tmp0_20 = tt.broadcast %x0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc159) + %tmp0_21 = arith.addi %tmp0_20, %tmp0_19 : tensor<1x16xi32> loc(#loc159) + %tmp0_22 = arith.constant 272 : i32 loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc160) + %tmp0_24 = arith.constant dense<272> : tensor<1x1xi32> loc(#loc160) + %tmp0_25 = arith.muli %tmp0_24, %x1_16 : tensor<1x1xi32> loc(#loc160) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc161) + %tmp0_27 = arith.addi %tmp0_21, %tmp0_26 : tensor<1x16xi32> loc(#loc161) + %tmp0_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc162) + %tmp0_29 = tt.addptr %tmp0_28, %tmp0_27 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc162) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc163) + %tmp0_31 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc163) + %tmp0_32 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc163) + %tmp0_33 = arith.fptosi %tmp0_32 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc163) + %tmp0_34 = tt.load %tmp0_29, %tmp0_31, %tmp0_33 : tensor<1x16x!tt.ptr> loc(#loc163) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc164) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_34, %tmp2) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc22) + %tmp7 = arith.extsi %tmp0_34 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc165) + %tmp10 = arith.constant 0 : i32 loc(#loc166) + %tmp10_35 = arith.constant 0 : i64 loc(#loc166) + %tmp10_36 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc166) + %tmp10_37 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc166) + %tmp10_38 = arith.select %tmp10_37, %tmp7, %tmp10_36 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc166) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_38) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc167) + %tmp11_39 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc168) + %tmp12 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc169) + %tmp13 = arith.trunci %tmp12 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc170) + %tmp14 = arith.trunci %tmp11_39 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc171) + %c16_i32 = arith.constant 16 : i32 loc(#loc30) + %c16_i32_40 = arith.constant 16 : i32 loc(#loc30) + %cst = arith.constant dense<16> : tensor<1x1xi32> loc(#loc30) + %1 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc30) + %2 = tt.broadcast %1 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc31) + %3 = arith.addi %r0_index_9, %2 : tensor<1x16xi32> loc(#loc31) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc32) + %5 = tt.addptr %4, %3 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc32) + %6 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc33) + tt.store %5, %tmp13, %6 : tensor<1x16x!tt.ptr> loc(#loc33) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc34) + %8 = tt.addptr %7, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc34) + tt.store %8, %tmp14, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc35) + tt.return loc(#loc36) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc37)), %idxs: tensor<1x16xi16> loc("idxs"(#loc37))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc38) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc39) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc40) + %5 = ub.poison : tensor<1x16xi32> loc(#loc40) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc40) + } loc(#loc37) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi16> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc48) + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi16> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc197) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc198) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc199) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc199) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc203) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc204) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc204) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc215) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc215) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc216) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc221) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc222) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc223) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc224) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc225) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc226) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc227) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc228) + %cond_38 = arith.constant 0 : i32 loc(#loc229) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc234) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc235) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc236) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc237) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc116)), %b: i32 loc("b"(#loc116))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc117) + tt.return %0 : i32 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc119) + tt.return %1 : i32 loc(#loc119) + } loc(#loc116) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc112))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc241) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc113) + tt.return %0 : tensor<8x1xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc115) + tt.return %1 : tensor<8x1xi32> loc(#loc115) + } loc(#loc112) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc121))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc122) + %false = arith.constant false loc(#loc123) + tt.return %false : i1 loc(#loc123) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc124) + tt.return %1 : i1 loc(#loc124) + } loc(#loc121) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc125))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc126) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc127) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc127) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc127) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc127) + tt.return %4 : tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %5 : tensor<1x16xi32> loc(#loc129) + } loc(#loc125) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc131) + %cst = arith.constant dense : tensor<1xi1> loc(#loc131) + tt.return %cst : tensor<1xi1> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc133) + tt.return %0 : tensor<1xi1> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc134))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc135) + tt.return %0 : tensor<1x16xi32> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc137) + tt.return %1 : tensor<1x16xi32> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc131) + tt.return %cst : tensor<1x16xi32> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc133) + tt.return %0 : tensor<1x16xi32> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc134))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc135) + tt.return %0 : tensor<1x16xi16> loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc137) + tt.return %1 : tensor<1x16xi16> loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc131) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc131) + tt.return %cst : tensor<1x16xi16> loc(#loc132) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc133) + tt.return %0 : tensor<1x16xi16> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc48) + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc112))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc113) + tt.return %0 : tensor<4x2xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc115) + tt.return %1 : tensor<4x2xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc176) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc177) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc177) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc178) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc48) + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: tensor<1x16xi32> loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc215) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc215) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc216) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc228) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc228) + %cond_36 = arith.constant 0 : i32 loc(#loc229) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc229) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc229) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc112))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc113) + tt.return %0 : tensor<2x4xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc115) + tt.return %1 : tensor<2x4xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc41)), %idxs: tensor<1x16xi32> loc("idxs"(#loc41))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc245) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc46) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc47) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc48) + %5 = ub.poison : tensor<1x16xi32> loc(#loc48) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc48) + } loc(#loc41) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc112))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc113) + tt.reduce.return %2 : i32 loc(#loc113) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc113) + tt.return %0 : tensor<1x8xi32> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc115) + tt.return %1 : tensor<1x8xi32> loc(#loc115) + } loc(#loc112) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc49)), %idxs: tensor<1x16xi32> loc("idxs"(#loc49)), %flip: i1 loc("flip"(#loc49))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc183) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc184) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc185) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc185) + %left_mask = arith.constant 1 : i32 loc(#loc186) + %left_mask_2 = arith.constant 1 : i32 loc(#loc186) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc186) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc186) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc187) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc187) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc188) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc189) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc190) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc191) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc191) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc192) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc195) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc196) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc197) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc199) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc201) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc202) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc204) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc204) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc205) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc206) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc207) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc208) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc209) + %left_valid_mask = arith.constant true loc(#loc210) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc210) + %right_valid_mask = arith.constant true loc(#loc211) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc211) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc212) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc213) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc246) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc82) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc215) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc215) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc215) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc216) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc247) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc247) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc87) + } loc(#loc83) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc248) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc89) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc219) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc249) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc249) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc87) + } loc(#loc90) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc221) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc222) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc223) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc224) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc225) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc226) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc227) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc228) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc228) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc230) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc231) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc232) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc233) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc234) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc235) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc236) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc237) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc110) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc111) + %5 = ub.poison : tensor<1x16xi32> loc(#loc111) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc111) + } loc(#loc49) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc112))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc113) + tt.reduce.return %2 : i64 loc(#loc113) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc113) + tt.return %0 : tensor<1xi64> loc(#loc114) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc115) + tt.return %1 : tensor<1xi64> loc(#loc115) + } loc(#loc112) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc116)), %b: i64 loc("b"(#loc116))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc117) + tt.return %0 : i64 loc(#loc118) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc119) + tt.return %1 : i64 loc(#loc119) + } loc(#loc116) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":46:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":47:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc144 = loc("xnumel"(#loc1)) +#loc145 = loc("r0_numel"(#loc2)) +#loc146 = loc("xoffset"(#loc3)) +#loc147 = loc("xoffset"(#loc4)) +#loc148 = loc("xindex"(#loc5)) +#loc149 = loc("xindex"(#loc6)) +#loc150 = loc("xindex"(#loc7)) +#loc151 = loc("xmask"(#loc8)) +#loc152 = loc("r0_index"(#loc9)) +#loc153 = loc("r0_index"(#loc10)) +#loc154 = loc("r0_offset"(#loc11)) +#loc155 = loc("r0_mask"(#loc12)) +#loc156 = loc("x0"(#loc13)) +#loc157 = loc("x1"(#loc14)) +#loc158 = loc("tmp0"(#loc15)) +#loc159 = loc("tmp0"(#loc16)) +#loc160 = loc("tmp0"(#loc17)) +#loc161 = loc("tmp0"(#loc18)) +#loc162 = loc("tmp0"(#loc19)) +#loc163 = loc("tmp0"(#loc20)) +#loc164 = loc("tmp2"(#loc21)) +#loc165 = loc("tmp7"(#loc23)) +#loc166 = loc("tmp10"(#loc24)) +#loc167 = loc("tmp11"(#loc25)) +#loc168 = loc("tmp11"(#loc26)) +#loc169 = loc("tmp12"(#loc27)) +#loc170 = loc("tmp13"(#loc28)) +#loc171 = loc("tmp14"(#loc29)) +#loc176 = loc("flip"(#loc42)) +#loc177 = loc("flip"(#loc43)) +#loc178 = loc("flip"(#loc44)) +#loc179 = loc("flip"(#loc45)) +#loc183 = loc("y"(#loc50)) +#loc184 = loc("right_mask"(#loc51)) +#loc185 = loc("right_mask"(#loc52)) +#loc186 = loc("left_mask"(#loc53)) +#loc187 = loc("ileft"(#loc54)) +#loc188 = loc("ileft"(#loc55)) +#loc189 = loc("ileft"(#loc56)) +#loc190 = loc("ileft"(#loc57)) +#loc191 = loc("iright"(#loc58)) +#loc192 = loc("iright"(#loc59)) +#loc193 = loc("iright"(#loc60)) +#loc194 = loc("iright"(#loc61)) +#loc195 = loc("ileft"(#loc62)) +#loc196 = loc("iright"(#loc63)) +#loc197 = loc("y_idx"(#loc64)) +#loc198 = loc("left_idx"(#loc65)) +#loc199 = loc("left_idx"(#loc66)) +#loc200 = loc("left_idx"(#loc67)) +#loc201 = loc("left_idx"(#loc68)) +#loc202 = loc("left_idx"(#loc69)) +#loc203 = loc("right_idx"(#loc70)) +#loc204 = loc("right_idx"(#loc71)) +#loc205 = loc("right_idx"(#loc72)) +#loc206 = loc("right_idx"(#loc73)) +#loc207 = loc("right_idx"(#loc74)) +#loc208 = loc("left_idx"(#loc75)) +#loc209 = loc("right_idx"(#loc76)) +#loc210 = loc("left_valid_mask"(#loc77)) +#loc211 = loc("right_valid_mask"(#loc78)) +#loc212 = loc("left_isnan"(#loc79)) +#loc213 = loc("right_isnan"(#loc80)) +#loc214 = loc("cond"(#loc81)) +#loc215 = loc("cond"(#loc84)) +#loc216 = loc("cond"(#loc85)) +#loc217 = loc("cond"(#loc86)) +#loc218 = loc("eq"(#loc88)) +#loc219 = loc("eq"(#loc91)) +#loc220 = loc("eq"(#loc92)) +#loc221 = loc("cond"(#loc93)) +#loc222 = loc("cond"(#loc94)) +#loc223 = loc("cond"(#loc95)) +#loc224 = loc("cond"(#loc96)) +#loc225 = loc("cond"(#loc97)) +#loc226 = loc("cond"(#loc98)) +#loc227 = loc("cond"(#loc99)) +#loc228 = loc("cond"(#loc100)) +#loc229 = loc("cond"(#loc101)) +#loc230 = loc("ret"(#loc102)) +#loc231 = loc("ret"(#loc103)) +#loc232 = loc("ret"(#loc104)) +#loc233 = loc("ret"(#loc105)) +#loc234 = loc("new_idxs"(#loc106)) +#loc235 = loc("new_idxs"(#loc107)) +#loc236 = loc("new_idxs"(#loc108)) +#loc237 = loc("new_idxs"(#loc109)) +#loc241 = loc("input"(#loc120)) +#loc245 = loc("flip"(#loc138)) +#loc246 = loc("cond"(#loc214)) +#loc247 = loc("cond"(#loc217)) +#loc248 = loc("eq"(#loc218)) +#loc249 = loc("eq"(#loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..53b9caef30f4259074f40f3b9b536664c1d7e3dd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,812 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc1 = loc(unknown) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc73 = loc("in_ptr0"(#loc)) +#loc74 = loc("out_ptr2"(#loc)) +#loc75 = loc("out_ptr3"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc91 = loc(callsite(#loc15 at #loc16)) +#loc97 = loc("ileft"(#loc24)) +#loc101 = loc("iright"(#loc29)) +#loc110 = loc("left_idx"(#loc38)) +#loc115 = loc("right_idx"(#loc43)) +#loc135 = loc("tmp11"(#loc63)) +#loc145 = loc(callsite(#loc20 at #loc91)) +#loc149 = loc(callsite(#loc1 at #loc135)) +#loc153 = loc(callsite(#loc97 at #loc145)) +#loc157 = loc(callsite(#loc101 at #loc145)) +#loc165 = loc(callsite(#loc110 at #loc145)) +#loc170 = loc(callsite(#loc115 at #loc145)) +#loc190 = loc(callsite(#loc1 at #loc153)) +#loc192 = loc(callsite(#loc1 at #loc157)) +#loc195 = loc(callsite(#loc1 at #loc165)) +#loc198 = loc(callsite(#loc1 at #loc170)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c272_i32 = arith.constant 272 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc78) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc79) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc80) + %r0_index_8 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc80) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc80) + %r0_index_10 = tt.expand_dims %r0_index_8 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc80) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc81) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc82) + %tmp0 = arith.muli %r0_index_9, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc83) + %tmp0_11 = arith.muli %r0_index_10, %cst : tensor<1x16xi32, #blocked> loc(#loc83) + %tmp0_12 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc139) + %tmp0_13 = tt.splat %x0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc139) + %tmp0_14 = arith.addi %tmp0_12, %tmp0 : tensor<1x16xi32, #blocked5> loc(#loc84) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_11 : tensor<1x16xi32, #blocked> loc(#loc84) + %tmp0_16 = arith.muli %x1, %c272_i32 : i32 loc(#loc85) + %tmp0_17 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc140) + %tmp0_18 = tt.splat %tmp0_16 : i32 -> tensor<1x16xi32, #blocked> loc(#loc140) + %tmp0_19 = arith.addi %tmp0_14, %tmp0_17 : tensor<1x16xi32, #blocked5> loc(#loc86) + %tmp0_20 = arith.addi %tmp0_15, %tmp0_18 : tensor<1x16xi32, #blocked> loc(#loc86) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc87) + %tmp0_23 = tt.addptr %tmp0_21, %tmp0_19 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc87) + %tmp0_24 = tt.addptr %tmp0_22, %tmp0_20 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc87) + %tmp0_25 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc141) + %tmp0_26 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc141) + %tmp0_27 = tt.load %tmp0_23, %tmp0_26, %cst_6 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc88) + %tmp0_28 = tt.load %tmp0_24, %tmp0_25, %cst_4 : tensor<1x16x!tt.ptr, #blocked> loc(#loc88) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc89) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc142) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc142) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc142) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc142) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc142) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc142) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc142) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc142) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc142) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc142) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc142) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc142) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc143) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y = tt.reshape %tmp0_27 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %left_mask = arith.subi %cst_5, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc151) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc151) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc151) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc151) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc162) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc163) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc164) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc164) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc168) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc169) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc169) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc196) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_75 = arith.xori %tmp0_27, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_77 = arith.extsi %tmp2 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc187) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc143) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc187) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc143) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc144) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc179) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc180) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc181) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc150) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc152) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc189) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc154) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc155) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc156) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc191) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc158) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc159) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc162) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc164) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc194) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc166) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc167) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc169) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc197) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc171) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc172) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc150) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc152) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc189) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc154) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc155) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc156) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc191) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc158) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc159) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc162) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc164) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc194) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc166) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc167) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc169) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc197) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc171) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc172) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc150) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc152) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc189) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc154) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc155) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc156) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc191) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc158) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc159) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc162) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc164) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc194) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc166) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc167) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc169) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc197) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc171) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc172) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc179) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc182) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc183) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc184) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc187) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc150) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc152) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_396: i32 loc(callsite(#loc1 at #loc153)), %ileft_397: i32 loc(callsite(#loc1 at #loc153))): + %ileft_398 = arith.addi %ileft_396, %ileft_397 : i32 loc(#loc199) + tt.reduce.return %ileft_398 : i32 loc(#loc189) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc189) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc154) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc155) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc156) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_396: i32 loc(callsite(#loc1 at #loc157)), %iright_397: i32 loc(callsite(#loc1 at #loc157))): + %iright_398 = arith.addi %iright_396, %iright_397 : i32 loc(#loc200) + tt.reduce.return %iright_398 : i32 loc(#loc191) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc191) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc158) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc159) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc160) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc161) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc162) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc164) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_396: i32 loc(callsite(#loc1 at #loc165)), %left_idx_397: i32 loc(callsite(#loc1 at #loc165))): + %left_idx_398 = arith.addi %left_idx_396, %left_idx_397 : i32 loc(#loc201) + tt.reduce.return %left_idx_398 : i32 loc(#loc194) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc194) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc166) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc167) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc169) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_396: i32 loc(callsite(#loc1 at #loc170)), %right_idx_397: i32 loc(callsite(#loc1 at #loc170))): + %right_idx_398 = arith.addi %right_idx_396, %right_idx_397 : i32 loc(#loc202) + tt.reduce.return %right_idx_398 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc197) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc171) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc172) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc173) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc174) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc175) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc176) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc177) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc178) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc179) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc185) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc186) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc187) + %tmp7 = arith.extsi %tmp0_28 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc133) + %tmp10 = arith.select %tmp0_25, %tmp7, %cst_0 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc134) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_396: i64 loc(callsite(#loc1 at #loc135)), %tmp11_397: i64 loc(callsite(#loc1 at #loc135))): + %tmp11_398 = arith.addi %tmp11_396, %tmp11_397 : i64 loc(#loc188) + tt.reduce.return %tmp11_398 : i64 loc(#loc148) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc148) + %tmp11_395 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc136) + %tmp14 = arith.trunci %tmp11_395 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc137) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc66) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc138) + %2 = arith.addi %r0_index_9, %1 : tensor<1x16xi32, #blocked5> loc(#loc67) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc68) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc68) + tt.store %4, %new_idxs_394, %tmp0_26 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc69) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc70) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + %7 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc71) + tt.store %6, %tmp14, %7 : tensor<1x1x!tt.ptr, #blocked> loc(#loc71) + tt.return loc(#loc72) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc78 = loc("xoffset"(#loc2)) +#loc79 = loc("xmask"(#loc3)) +#loc80 = loc("r0_index"(#loc4)) +#loc81 = loc("x0"(#loc5)) +#loc82 = loc("x1"(#loc6)) +#loc83 = loc("tmp0"(#loc7)) +#loc84 = loc("tmp0"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp2"(#loc13)) +#loc90 = loc("flip"(#loc14)) +#loc92 = loc("flip"(#loc17)) +#loc93 = loc("flip"(#loc18)) +#loc94 = loc("y"(#loc19)) +#loc95 = loc("left_mask"(#loc21)) +#loc96 = loc("ileft"(#loc22)) +#loc98 = loc("ileft"(#loc26)) +#loc99 = loc("ileft"(#loc27)) +#loc100 = loc("iright"(#loc28)) +#loc102 = loc("iright"(#loc30)) +#loc103 = loc("iright"(#loc31)) +#loc104 = loc("ileft"(#loc32)) +#loc105 = loc("iright"(#loc33)) +#loc106 = loc("y_idx"(#loc34)) +#loc107 = loc("left_idx"(#loc35)) +#loc108 = loc("left_idx"(#loc36)) +#loc109 = loc("input"(#loc37)) +#loc111 = loc("left_idx"(#loc39)) +#loc112 = loc("left_idx"(#loc40)) +#loc113 = loc("right_idx"(#loc41)) +#loc114 = loc("right_idx"(#loc42)) +#loc116 = loc("right_idx"(#loc44)) +#loc117 = loc("right_idx"(#loc45)) +#loc118 = loc("left_idx"(#loc46)) +#loc119 = loc("right_idx"(#loc47)) +#loc120 = loc("cond"(#loc48)) +#loc121 = loc("eq"(#loc49)) +#loc122 = loc("cond"(#loc50)) +#loc123 = loc("cond"(#loc51)) +#loc124 = loc("cond"(#loc52)) +#loc125 = loc("cond"(#loc53)) +#loc126 = loc("cond"(#loc54)) +#loc127 = loc("ret"(#loc55)) +#loc128 = loc("ret"(#loc56)) +#loc129 = loc("ret"(#loc57)) +#loc130 = loc("new_idxs"(#loc58)) +#loc131 = loc("new_idxs"(#loc59)) +#loc132 = loc("new_idxs"(#loc60)) +#loc133 = loc("tmp7"(#loc61)) +#loc134 = loc("tmp10"(#loc62)) +#loc136 = loc("tmp11"(#loc64)) +#loc137 = loc("tmp14"(#loc65)) +#loc138 = loc(fused[#loc67, #loc66]) +#loc139 = loc(fused[#loc84, #loc81]) +#loc140 = loc(fused[#loc86, #loc85]) +#loc141 = loc(fused[#loc88, #loc79]) +#loc142 = loc(callsite(#loc90 at #loc91)) +#loc143 = loc(callsite(#loc92 at #loc91)) +#loc144 = loc(callsite(#loc93 at #loc91)) +#loc146 = loc("cond"(#loc120)) +#loc147 = loc("eq"(#loc121)) +#loc148 = loc(callsite(#loc23 at #loc135)) +#loc150 = loc(callsite(#loc94 at #loc145)) +#loc151 = loc(callsite(#loc95 at #loc145)) +#loc152 = loc(callsite(#loc96 at #loc145)) +#loc154 = loc(callsite(#loc98 at #loc145)) +#loc155 = loc(callsite(#loc99 at #loc145)) +#loc156 = loc(callsite(#loc100 at #loc145)) +#loc158 = loc(callsite(#loc102 at #loc145)) +#loc159 = loc(callsite(#loc103 at #loc145)) +#loc160 = loc(callsite(#loc104 at #loc145)) +#loc161 = loc(callsite(#loc105 at #loc145)) +#loc162 = loc(callsite(#loc106 at #loc145)) +#loc163 = loc(callsite(#loc107 at #loc145)) +#loc164 = loc(callsite(#loc108 at #loc145)) +#loc166 = loc(callsite(#loc111 at #loc145)) +#loc167 = loc(callsite(#loc112 at #loc145)) +#loc168 = loc(callsite(#loc113 at #loc145)) +#loc169 = loc(callsite(#loc114 at #loc145)) +#loc171 = loc(callsite(#loc116 at #loc145)) +#loc172 = loc(callsite(#loc117 at #loc145)) +#loc173 = loc(callsite(#loc118 at #loc145)) +#loc174 = loc(callsite(#loc119 at #loc145)) +#loc175 = loc(callsite(#loc146 at #loc145)) +#loc176 = loc(callsite(#loc147 at #loc145)) +#loc177 = loc(callsite(#loc122 at #loc145)) +#loc178 = loc(callsite(#loc123 at #loc145)) +#loc179 = loc(callsite(#loc124 at #loc145)) +#loc180 = loc(callsite(#loc125 at #loc145)) +#loc181 = loc(callsite(#loc126 at #loc145)) +#loc182 = loc(callsite(#loc127 at #loc145)) +#loc183 = loc(callsite(#loc128 at #loc145)) +#loc184 = loc(callsite(#loc129 at #loc145)) +#loc185 = loc(callsite(#loc130 at #loc145)) +#loc186 = loc(callsite(#loc131 at #loc145)) +#loc187 = loc(callsite(#loc132 at #loc145)) +#loc188 = loc(callsite(#loc25 at #loc148)) +#loc189 = loc(callsite(#loc23 at #loc153)) +#loc191 = loc(callsite(#loc23 at #loc157)) +#loc193 = loc(callsite(#loc109 at #loc165)) +#loc194 = loc(callsite(#loc23 at #loc165)) +#loc196 = loc(callsite(#loc109 at #loc170)) +#loc197 = loc(callsite(#loc23 at #loc170)) +#loc199 = loc(callsite(#loc25 at #loc189)) +#loc200 = loc(callsite(#loc25 at #loc191)) +#loc201 = loc(callsite(#loc25 at #loc194)) +#loc202 = loc(callsite(#loc25 at #loc197)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c21c9d7e2ee4e23c9a306a5f34215e073e1e580f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EB4J5U2HKNQBLXRWK6B5L6ATOH55AWD3MB7P63KH5AKRGRDZER7A/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,784 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":18:0) +#loc2 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":41:67) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:26) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr2"(#loc)) +#loc77 = loc("out_ptr3"(#loc)) +#loc78 = loc("xnumel"(#loc)) +#loc79 = loc("r0_numel"(#loc)) +#loc96 = loc(callsite(#loc18 at #loc4)) +#loc103 = loc("ileft"(#loc27)) +#loc107 = loc("iright"(#loc32)) +#loc116 = loc("left_idx"(#loc41)) +#loc121 = loc("right_idx"(#loc46)) +#loc140 = loc("tmp11"(#loc65)) +#loc151 = loc(callsite(#loc23 at #loc96)) +#loc155 = loc(callsite(#loc2 at #loc140)) +#loc159 = loc(callsite(#loc103 at #loc151)) +#loc163 = loc(callsite(#loc107 at #loc151)) +#loc171 = loc(callsite(#loc116 at #loc151)) +#loc176 = loc(callsite(#loc121 at #loc151)) +#loc196 = loc(callsite(#loc2 at #loc159)) +#loc198 = loc(callsite(#loc2 at #loc163)) +#loc201 = loc(callsite(#loc2 at #loc171)) +#loc204 = loc(callsite(#loc2 at #loc176)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant 272 : i32 loc(#loc80) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %xmask = arith.constant 32 : i32 loc(#loc81) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc82) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc2) + %tmp10 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc83) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xmask_2 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc81) + %xmask_3 = tt.splat %xmask_2 : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc86) + %r0_index_4 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc87) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc88) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc89) + %tmp0_5 = arith.muli %r0_index_4, %tmp0_1 : tensor<1x16xi32> loc(#loc84) + %tmp0_6 = tt.splat %x0 : i32 -> tensor<1x16xi32> loc(#loc144) + %tmp0_7 = arith.addi %tmp0_6, %tmp0_5 : tensor<1x16xi32> loc(#loc90) + %tmp0_8 = arith.muli %x1, %tmp0 : i32 loc(#loc80) + %tmp0_9 = tt.splat %tmp0_8 : i32 -> tensor<1x16xi32> loc(#loc145) + %tmp0_10 = arith.addi %tmp0_7, %tmp0_9 : tensor<1x16xi32> loc(#loc91) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc92) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc92) + %tmp0_13 = tt.splat %xmask_2 : i1 -> tensor<1x16xi1> loc(#loc146) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 : tensor<1x16x!tt.ptr> loc(#loc93) + %tmp2 = arith.trunci %r0_index_4 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc94) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc147) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc148) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc148) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc149) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc150) + %y = tt.reshape %tmp0_14 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc157) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc158) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc162) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc168) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc169) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc170) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc170) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc199) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc174) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc175) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc175) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc202) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc181) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc182) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc183) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc184) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc185) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc186) + %cond_47 = arith.cmpi ne, %cond_46, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc188) + %ret_48 = arith.select %cond_47, %ret, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_49 = arith.xori %tmp0_14, %ret_48 : tensor<1x16xi32> loc(#loc190) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc191) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_51 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc193) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc193) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc149) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc150) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc158) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc181) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc182) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc183) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc184) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc185) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_85 = arith.cmpi ne, %cond_84, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc188) + %ret_87 = arith.select %cond_85, %ret_86, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc190) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc191) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc193) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc181) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc182) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc183) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc184) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc185) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc186) + %cond_121 = arith.cmpi ne, %cond_120, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc188) + %ret_123 = arith.select %cond_121, %ret_122, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc190) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc191) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc193) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc149) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc150) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc158) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc181) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc182) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc183) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc184) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc185) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_160 = arith.cmpi ne, %cond_159, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc188) + %ret_162 = arith.select %cond_160, %ret_161, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc190) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc191) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc193) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc181) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc182) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc183) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc184) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc185) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_196 = arith.cmpi ne, %cond_195, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc188) + %ret_198 = arith.select %cond_196, %ret_197, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc190) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc191) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc193) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc181) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc182) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc183) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc184) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc185) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc186) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc186) + %cond_232 = arith.cmpi ne, %cond_231, %cst_0 : tensor<1x16xi32> loc(#loc187) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc188) + %ret_234 = arith.select %cond_232, %ret_233, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc190) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc191) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc193) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc156) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc158) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc158) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc195) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc160) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc161) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc162) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc197) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc164) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc165) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc168) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc170) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc200) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc172) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc173) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc175) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc203) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc177) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc178) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc181) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc182) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc183) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc184) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc185) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc188) + %ret_268 = arith.select %cond_266, %ret_267, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc190) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc191) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc193) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc156) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc158) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc195) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc160) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc161) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc162) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc197) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc164) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc165) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc168) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc170) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc200) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc172) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc173) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc175) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc203) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc177) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc178) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc181) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc182) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc183) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc184) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc185) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc188) + %ret_301 = arith.select %cond_299, %ret_300, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc190) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc191) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc193) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc156) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc158) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc195) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc160) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc161) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc162) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc197) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc164) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc165) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc168) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc170) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc200) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc172) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc173) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc175) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc203) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc177) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc178) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc181) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc182) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc183) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc184) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc185) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc188) + %ret_334 = arith.select %cond_332, %ret_333, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc189) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc190) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc191) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc193) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc156) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc158) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_371: i32 loc(callsite(#loc2 at #loc159)), %ileft_372: i32 loc(callsite(#loc2 at #loc159))): + %ileft_373 = arith.addi %ileft_371, %ileft_372 : i32 loc(#loc205) + tt.reduce.return %ileft_373 : i32 loc(#loc195) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc195) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc160) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc161) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc162) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_371: i32 loc(callsite(#loc2 at #loc163)), %iright_372: i32 loc(callsite(#loc2 at #loc163))): + %iright_373 = arith.addi %iright_371, %iright_372 : i32 loc(#loc206) + tt.reduce.return %iright_373 : i32 loc(#loc197) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc197) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc164) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc165) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc166) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc167) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc168) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc170) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_371: i32 loc(callsite(#loc2 at #loc171)), %left_idx_372: i32 loc(callsite(#loc2 at #loc171))): + %left_idx_373 = arith.addi %left_idx_371, %left_idx_372 : i32 loc(#loc207) + tt.reduce.return %left_idx_373 : i32 loc(#loc200) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc200) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc172) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc173) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc175) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_371: i32 loc(callsite(#loc2 at #loc176)), %right_idx_372: i32 loc(callsite(#loc2 at #loc176))): + %right_idx_373 = arith.addi %right_idx_371, %right_idx_372 : i32 loc(#loc208) + tt.reduce.return %right_idx_373 : i32 loc(#loc203) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc203) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc177) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc178) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc179) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc180) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc181) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc182) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc183) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc184) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc185) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc191) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc192) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc193) + %tmp7 = arith.extsi %tmp0_14 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc139) + %tmp10_369 = arith.select %tmp0_13, %tmp7, %tmp10 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc83) + %tmp11 = "tt.reduce"(%tmp10_369) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_371: i64 loc(callsite(#loc2 at #loc140)), %tmp11_372: i64 loc(callsite(#loc2 at #loc140))): + %tmp11_373 = arith.addi %tmp11_371, %tmp11_372 : i64 loc(#loc194) + tt.reduce.return %tmp11_373 : i64 loc(#loc154) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc154) + %tmp11_370 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc141) + %tmp14 = arith.trunci %tmp11_370 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc142) + %0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc68) + %1 = tt.splat %0 : i32 -> tensor<1x16xi32> loc(#loc143) + %2 = arith.addi %r0_index_4, %1 : tensor<1x16xi32> loc(#loc69) + %3 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc70) + %4 = tt.addptr %3, %2 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc70) + tt.store %4, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc71) + %5 = tt.addptr %out_ptr3, %xoffset : !tt.ptr, i32 loc(#loc72) + %6 = tt.splat %5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc72) + tt.store %6, %tmp14, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc73) + tt.return loc(#loc74) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":26:21) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":44:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:38) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":33:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":34:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":38:19) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":45:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":49:47) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/w7/cw7u6rgkoihwizuge644mng2ddg3bqiixfxegsmxugtblafc6kz3.py":50:4) +#loc80 = loc("tmp0"(#loc1)) +#loc81 = loc("xmask"(#loc3)) +#loc82 = loc(callsite(#loc2 at #loc4)) +#loc83 = loc("tmp10"(#loc5)) +#loc84 = loc("tmp0"(#loc6)) +#loc85 = loc("xoffset"(#loc7)) +#loc86 = loc("r0_index"(#loc8)) +#loc87 = loc("r0_index"(#loc9)) +#loc88 = loc("x0"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("tmp0"(#loc12)) +#loc91 = loc("tmp0"(#loc13)) +#loc92 = loc("tmp0"(#loc14)) +#loc93 = loc("tmp0"(#loc15)) +#loc94 = loc("tmp2"(#loc16)) +#loc95 = loc("flip"(#loc17)) +#loc97 = loc("flip"(#loc19)) +#loc98 = loc("flip"(#loc20)) +#loc99 = loc("flip"(#loc21)) +#loc100 = loc("y"(#loc22)) +#loc101 = loc("left_mask"(#loc24)) +#loc102 = loc("ileft"(#loc25)) +#loc104 = loc("ileft"(#loc29)) +#loc105 = loc("ileft"(#loc30)) +#loc106 = loc("iright"(#loc31)) +#loc108 = loc("iright"(#loc33)) +#loc109 = loc("iright"(#loc34)) +#loc110 = loc("ileft"(#loc35)) +#loc111 = loc("iright"(#loc36)) +#loc112 = loc("y_idx"(#loc37)) +#loc113 = loc("left_idx"(#loc38)) +#loc114 = loc("left_idx"(#loc39)) +#loc115 = loc("input"(#loc40)) +#loc117 = loc("left_idx"(#loc42)) +#loc118 = loc("left_idx"(#loc43)) +#loc119 = loc("right_idx"(#loc44)) +#loc120 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc47)) +#loc123 = loc("right_idx"(#loc48)) +#loc124 = loc("left_idx"(#loc49)) +#loc125 = loc("right_idx"(#loc50)) +#loc126 = loc("cond"(#loc51)) +#loc127 = loc("eq"(#loc52)) +#loc128 = loc("cond"(#loc53)) +#loc129 = loc("cond"(#loc54)) +#loc130 = loc("cond"(#loc55)) +#loc131 = loc("cond"(#loc56)) +#loc132 = loc("cond"(#loc57)) +#loc133 = loc("ret"(#loc58)) +#loc134 = loc("ret"(#loc59)) +#loc135 = loc("ret"(#loc60)) +#loc136 = loc("new_idxs"(#loc61)) +#loc137 = loc("new_idxs"(#loc62)) +#loc138 = loc("new_idxs"(#loc63)) +#loc139 = loc("tmp7"(#loc64)) +#loc141 = loc("tmp11"(#loc66)) +#loc142 = loc("tmp14"(#loc67)) +#loc143 = loc(fused[#loc69, #loc68]) +#loc144 = loc(fused[#loc90, #loc88]) +#loc145 = loc(fused[#loc91, #loc80]) +#loc146 = loc(fused[#loc93, #loc81]) +#loc147 = loc(callsite(#loc95 at #loc96)) +#loc148 = loc(callsite(#loc97 at #loc96)) +#loc149 = loc(callsite(#loc98 at #loc96)) +#loc150 = loc(callsite(#loc99 at #loc96)) +#loc152 = loc("cond"(#loc126)) +#loc153 = loc("eq"(#loc127)) +#loc154 = loc(callsite(#loc26 at #loc140)) +#loc156 = loc(callsite(#loc100 at #loc151)) +#loc157 = loc(callsite(#loc101 at #loc151)) +#loc158 = loc(callsite(#loc102 at #loc151)) +#loc160 = loc(callsite(#loc104 at #loc151)) +#loc161 = loc(callsite(#loc105 at #loc151)) +#loc162 = loc(callsite(#loc106 at #loc151)) +#loc164 = loc(callsite(#loc108 at #loc151)) +#loc165 = loc(callsite(#loc109 at #loc151)) +#loc166 = loc(callsite(#loc110 at #loc151)) +#loc167 = loc(callsite(#loc111 at #loc151)) +#loc168 = loc(callsite(#loc112 at #loc151)) +#loc169 = loc(callsite(#loc113 at #loc151)) +#loc170 = loc(callsite(#loc114 at #loc151)) +#loc172 = loc(callsite(#loc117 at #loc151)) +#loc173 = loc(callsite(#loc118 at #loc151)) +#loc174 = loc(callsite(#loc119 at #loc151)) +#loc175 = loc(callsite(#loc120 at #loc151)) +#loc177 = loc(callsite(#loc122 at #loc151)) +#loc178 = loc(callsite(#loc123 at #loc151)) +#loc179 = loc(callsite(#loc124 at #loc151)) +#loc180 = loc(callsite(#loc125 at #loc151)) +#loc181 = loc(callsite(#loc152 at #loc151)) +#loc182 = loc(callsite(#loc153 at #loc151)) +#loc183 = loc(callsite(#loc128 at #loc151)) +#loc184 = loc(callsite(#loc129 at #loc151)) +#loc185 = loc(callsite(#loc130 at #loc151)) +#loc186 = loc(callsite(#loc131 at #loc151)) +#loc187 = loc(callsite(#loc132 at #loc151)) +#loc188 = loc(callsite(#loc133 at #loc151)) +#loc189 = loc(callsite(#loc134 at #loc151)) +#loc190 = loc(callsite(#loc135 at #loc151)) +#loc191 = loc(callsite(#loc136 at #loc151)) +#loc192 = loc(callsite(#loc137 at #loc151)) +#loc193 = loc(callsite(#loc138 at #loc151)) +#loc194 = loc(callsite(#loc28 at #loc154)) +#loc195 = loc(callsite(#loc26 at #loc159)) +#loc197 = loc(callsite(#loc26 at #loc163)) +#loc199 = loc(callsite(#loc115 at #loc171)) +#loc200 = loc(callsite(#loc26 at #loc171)) +#loc202 = loc(callsite(#loc115 at #loc176)) +#loc203 = loc(callsite(#loc26 at #loc176)) +#loc205 = loc(callsite(#loc28 at #loc195)) +#loc206 = loc(callsite(#loc28 at #loc197)) +#loc207 = loc(callsite(#loc28 at #loc200)) +#loc208 = loc(callsite(#loc28 at #loc203)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..72c909519d93b8d7fcc795997c66573202055a05 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ca562428ee8fff3695ba15019b4e403074e9f9e2 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ffc570e253857448771e8107fc02cc3a37b0af51 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "216454436a742975997ad2492b746bae3cb4daa170e2ff11d7e2252d07da3898", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..22e385b9000611bf2e162290a07c708b7a18ad31 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,1533 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 5, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 124, !dbg !12 + %16 = lshr exact i32 %15, 2, !dbg !12 + %17 = and i32 %14, 31, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = icmp slt i32 %18, 128, !dbg !14 + %20 = and i32 %14, 3, !dbg !15 + %21 = shl i32 %18, 4, !dbg !16 + %22 = and i32 %14, 1, !dbg !17 + %23 = lshr i32 %14, 1, !dbg !17 + %.lobit = and i32 %23, 1, !dbg !17 + %24 = xor i32 %22, 1, !dbg !21 + %25 = xor i32 %.lobit, 1, !dbg !21 + %26 = trunc i32 %14 to i1, !dbg !22 + %27 = trunc i32 %23 to i1, !dbg !22 + %28 = insertelement <2 x i1> poison, i1 %27, i64 0, !dbg !23 + %29 = shufflevector <2 x i1> %28, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %30 = insertelement <4 x i32> poison, i32 %25, i64 0, !dbg !25 + %31 = shufflevector <4 x i32> %30, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !25 + %32 = insertelement <4 x i32> poison, i32 %.lobit, i64 0, !dbg !26 + %33 = shufflevector <4 x i32> %32, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !26 + %34 = insertelement <4 x i32> poison, i32 %24, i64 0, !dbg !27 + %35 = shufflevector <4 x i32> %34, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !27 + %36 = insertelement <4 x i32> poison, i32 %22, i64 0, !dbg !28 + %37 = shufflevector <4 x i32> %36, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !28 + %38 = insertelement <2 x i1> poison, i1 %26, i64 0, !dbg !22 + %39 = shufflevector <2 x i1> %38, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !22 + %40 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !29 + %41 = shufflevector <2 x i32> %40, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !29 + %42 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !30 + %43 = shufflevector <2 x i32> %42, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !30 + %44 = insertelement <2 x i32> %42, i32 %24, i64 1, !dbg !31 + %45 = insertelement <2 x i32> %40, i32 %22, i64 1, !dbg !32 + %46 = insertelement <4 x i1> poison, i1 %19, i64 0, !dbg !33 + %47 = shufflevector <4 x i1> %46, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !33 + %48 = shl nuw nsw i32 %15, 1, !dbg !34 + %49 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %48, !dbg !34 + %50 = shl nuw nsw i32 %17, 3, !dbg !34 + %51 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %50, !dbg !34 + %52 = shl nuw nsw i32 %20, 2, !dbg !15 + %53 = or disjoint i32 %52, 1, !dbg !15 + %54 = or disjoint i32 %52, 2, !dbg !15 + %55 = or disjoint i32 %52, 3, !dbg !15 + %56 = or disjoint i32 %21, %52, !dbg !35 + %57 = or disjoint i32 %21, %54, !dbg !35 + %58 = sext i32 %56 to i64, !dbg !36 + %59 = getelementptr i64, ptr addrspace(1) %0, i64 %58, !dbg !36 + %60 = sext i32 %57 to i64, !dbg !36 + %61 = getelementptr i64, ptr addrspace(1) %0, i64 %60, !dbg !36 + %62 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %59, i1 %19) #6, !dbg !37 + %63 = extractvalue { i64, i64 } %62, 0, !dbg !37 + %64 = extractvalue { i64, i64 } %62, 1, !dbg !37 + %65 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %61, i1 %19) #6, !dbg !37 + %66 = extractvalue { i64, i64 } %65, 0, !dbg !37 + %67 = extractvalue { i64, i64 } %65, 1, !dbg !37 + %68 = xor i32 %53, %52, !dbg !38 + %69 = xor i32 %54, %55, !dbg !38 + %70 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !39 + %71 = insertelement <4 x i64> %70, i64 %64, i64 1, !dbg !39 + %72 = insertelement <4 x i64> %71, i64 %66, i64 2, !dbg !39 + %73 = insertelement <4 x i64> %72, i64 %67, i64 3, !dbg !39 + %74 = add <4 x i64> %73, splat (i64 -1), !dbg !39 + %75 = icmp ult <4 x i64> %74, splat (i64 16383), !dbg !39 + %76 = extractelement <4 x i1> %75, i64 0, !dbg !40 + %77 = zext i1 %76 to i32, !dbg !41 + %78 = extractelement <4 x i1> %75, i64 1, !dbg !40 + %79 = zext i1 %78 to i32, !dbg !41 + %80 = extractelement <4 x i1> %75, i64 2, !dbg !42 + %81 = zext i1 %80 to i32, !dbg !41 + %82 = extractelement <4 x i1> %75, i64 3, !dbg !42 + %83 = zext i1 %82 to i32, !dbg !41 + %84 = xor i1 %76, true, !dbg !40 + %85 = and i1 %78, %84, !dbg !40 + %.not = xor i1 %82, true, !dbg !42 + %86 = or i1 %80, %.not, !dbg !42 + %87 = xor i32 %77, %79, !dbg !43 + %88 = xor i32 %81, %83, !dbg !43 + %89 = select i1 %85, i32 %87, i32 0, !dbg !44 + %90 = select i1 %86, i32 %88, i32 0, !dbg !44 + %91 = xor i32 %89, %77, !dbg !45 + %92 = xor i32 %89, %79, !dbg !45 + %93 = xor i32 %90, %81, !dbg !45 + %94 = xor i32 %90, %83, !dbg !45 + %95 = select i1 %85, i32 %68, i32 0, !dbg !46 + %96 = select i1 %86, i32 %69, i32 0, !dbg !46 + %97 = xor i32 %95, %52, !dbg !47 + %98 = xor i32 %95, %53, !dbg !47 + %99 = xor i32 %96, %54, !dbg !47 + %100 = xor i32 %96, %55, !dbg !47 + %101 = insertelement <2 x i32> poison, i32 %91, i64 0, !dbg !22 + %102 = insertelement <2 x i32> %101, i32 %92, i64 1, !dbg !22 + %103 = insertelement <2 x i32> poison, i32 %93, i64 0, !dbg !22 + %104 = insertelement <2 x i32> %103, i32 %94, i64 1, !dbg !22 + %105 = icmp samesign uge <2 x i32> %102, %104, !dbg !22 + %106 = insertelement <2 x i32> %104, i32 %92, i64 1, !dbg !22 + %107 = insertelement <2 x i32> %102, i32 %94, i64 1, !dbg !22 + %108 = icmp ne <2 x i32> %106, %107, !dbg !22 + %109 = insertelement <2 x i32> poison, i32 %97, i64 0, !dbg !22 + %110 = insertelement <2 x i32> %109, i32 %98, i64 1, !dbg !22 + %111 = insertelement <2 x i32> poison, i32 %99, i64 0, !dbg !22 + %112 = insertelement <2 x i32> %111, i32 %100, i64 1, !dbg !22 + %113 = icmp ule <2 x i32> %110, %112, !dbg !22 + %114 = or <2 x i1> %113, %108, !dbg !22 + %115 = and <2 x i1> %105, %114, !dbg !22 + %116 = xor <2 x i1> %115, %39, !dbg !22 + %117 = xor <2 x i32> %107, %106, !dbg !43 + %118 = select <2 x i1> %116, <2 x i32> zeroinitializer, <2 x i32> %117, !dbg !44 + %119 = extractelement <2 x i32> %118, i64 0, !dbg !45 + %120 = xor i32 %119, %91, !dbg !45 + %121 = extractelement <2 x i32> %118, i64 1, !dbg !45 + %122 = xor i32 %121, %92, !dbg !45 + %123 = xor <2 x i32> %118, %104, !dbg !45 + %124 = xor i32 %99, %97, !dbg !38 + %125 = xor i32 %100, %98, !dbg !38 + %126 = extractelement <2 x i1> %116, i64 0, !dbg !46 + %127 = select i1 %126, i32 0, i32 %124, !dbg !46 + %128 = extractelement <2 x i1> %116, i64 1, !dbg !46 + %129 = select i1 %128, i32 0, i32 %125, !dbg !46 + %130 = xor i32 %127, %97, !dbg !47 + %131 = xor i32 %129, %98, !dbg !47 + %132 = xor i32 %127, %99, !dbg !47 + %133 = xor i32 %129, %100, !dbg !47 + %134 = icmp samesign uge i32 %120, %122, !dbg !22 + %135 = icmp ne i32 %120, %122, !dbg !22 + %136 = icmp samesign ule i32 %130, %131, !dbg !22 + %137 = or i1 %135, %136, !dbg !22 + %138 = and i1 %134, %137, !dbg !22 + %.not3 = xor i1 %138, %26, !dbg !22 + %139 = extractelement <2 x i32> %123, i64 0, !dbg !22 + %140 = extractelement <2 x i32> %123, i64 1, !dbg !22 + %141 = icmp samesign uge i32 %139, %140, !dbg !22 + %142 = icmp ne i32 %139, %140, !dbg !22 + %143 = icmp samesign ule i32 %132, %133, !dbg !22 + %144 = or i1 %142, %143, !dbg !22 + %145 = and i1 %141, %144, !dbg !22 + %.not4 = xor i1 %145, %26, !dbg !22 + %146 = xor i32 %120, %122, !dbg !43 + %147 = xor i32 %139, %140, !dbg !43 + %148 = select i1 %.not3, i32 0, i32 %146, !dbg !44 + %149 = select i1 %.not4, i32 0, i32 %147, !dbg !44 + %150 = xor i32 %148, %120, !dbg !45 + %151 = xor i32 %148, %122, !dbg !45 + %152 = xor i32 %149, %139, !dbg !45 + %153 = xor i32 %149, %140, !dbg !45 + %154 = xor i32 %130, %131, !dbg !38 + %155 = xor i32 %132, %133, !dbg !38 + %156 = select i1 %.not3, i32 0, i32 %154, !dbg !46 + %157 = select i1 %.not4, i32 0, i32 %155, !dbg !46 + %158 = xor i32 %156, %130, !dbg !47 + %159 = xor i32 %156, %131, !dbg !47 + %160 = xor i32 %157, %132, !dbg !47 + %161 = xor i32 %157, %133, !dbg !47 + %162 = mul nuw nsw i32 %150, %24, !dbg !29 + %163 = mul nuw nsw i32 %151, %24, !dbg !29 + %164 = mul nuw nsw i32 %152, %24, !dbg !29 + %165 = mul nuw nsw i32 %153, %24, !dbg !29 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 1, i32 31), !dbg !48 + %167 = add i32 %162, %166, !dbg !51 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !48 + %169 = add i32 %163, %168, !dbg !51 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !48 + %171 = add i32 %164, %170, !dbg !51 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !48 + %173 = add i32 %165, %172, !dbg !51 + %174 = mul nuw nsw i32 %150, %22, !dbg !30 + %175 = mul nuw nsw i32 %151, %22, !dbg !30 + %176 = mul nuw nsw i32 %152, %22, !dbg !30 + %177 = mul nuw nsw i32 %153, %22, !dbg !30 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !48 + %179 = add i32 %174, %178, !dbg !51 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 1, i32 31), !dbg !48 + %181 = add i32 %175, %180, !dbg !51 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 1, i32 31), !dbg !48 + %183 = add i32 %176, %182, !dbg !51 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !48 + %185 = add i32 %177, %184, !dbg !51 + %186 = mul nuw nsw i32 %158, %24, !dbg !32 + %187 = mul nuw nsw i32 %159, %24, !dbg !32 + %188 = mul nuw nsw i32 %160, %24, !dbg !32 + %189 = mul nuw nsw i32 %161, %24, !dbg !32 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 1, i32 31), !dbg !48 + %191 = add i32 %186, %190, !dbg !51 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 1, i32 31), !dbg !48 + %193 = add i32 %187, %192, !dbg !51 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 1, i32 31), !dbg !48 + %195 = add i32 %188, %194, !dbg !51 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !48 + %197 = add i32 %189, %196, !dbg !51 + %198 = mul nuw nsw i32 %158, %22, !dbg !31 + %199 = mul nuw nsw i32 %159, %22, !dbg !31 + %200 = mul nuw nsw i32 %160, %22, !dbg !31 + %201 = mul nuw nsw i32 %161, %22, !dbg !31 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !48 + %203 = add i32 %198, %202, !dbg !51 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !48 + %205 = add i32 %199, %204, !dbg !51 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 1, i32 31), !dbg !48 + %207 = add i32 %200, %206, !dbg !51 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 1, i32 31), !dbg !48 + %209 = add i32 %201, %208, !dbg !51 + %210 = icmp sge i32 %167, %179, !dbg !22 + %211 = icmp ne i32 %167, %179, !dbg !22 + %212 = icmp sle i32 %191, %203, !dbg !22 + %213 = or i1 %211, %212, !dbg !22 + %214 = and i1 %210, %213, !dbg !22 + %.not5 = xor i1 %214, %27, !dbg !22 + %215 = icmp sge i32 %169, %181, !dbg !22 + %216 = icmp ne i32 %169, %181, !dbg !22 + %217 = icmp sle i32 %193, %205, !dbg !22 + %218 = or i1 %216, %217, !dbg !22 + %219 = and i1 %215, %218, !dbg !22 + %.not6 = xor i1 %219, %27, !dbg !22 + %220 = icmp sge i32 %171, %183, !dbg !22 + %221 = icmp ne i32 %171, %183, !dbg !22 + %222 = icmp sle i32 %195, %207, !dbg !22 + %223 = or i1 %221, %222, !dbg !22 + %224 = and i1 %220, %223, !dbg !22 + %.not7 = xor i1 %224, %27, !dbg !22 + %225 = icmp sge i32 %173, %185, !dbg !22 + %226 = icmp ne i32 %173, %185, !dbg !22 + %227 = icmp sle i32 %197, %209, !dbg !22 + %228 = or i1 %226, %227, !dbg !22 + %229 = and i1 %225, %228, !dbg !22 + %.not8 = xor i1 %229, %27, !dbg !22 + %230 = xor i32 %167, %179, !dbg !43 + %231 = xor i32 %169, %181, !dbg !43 + %232 = xor i32 %171, %183, !dbg !43 + %233 = xor i32 %173, %185, !dbg !43 + %234 = select i1 %.not5, i32 0, i32 %230, !dbg !44 + %235 = select i1 %.not6, i32 0, i32 %231, !dbg !44 + %236 = select i1 %.not7, i32 0, i32 %232, !dbg !44 + %237 = select i1 %.not8, i32 0, i32 %233, !dbg !44 + %238 = xor i32 %234, %150, !dbg !45 + %239 = xor i32 %235, %151, !dbg !45 + %240 = xor i32 %236, %152, !dbg !45 + %241 = xor i32 %237, %153, !dbg !45 + %242 = xor i32 %191, %203, !dbg !38 + %243 = xor i32 %193, %205, !dbg !38 + %244 = xor i32 %195, %207, !dbg !38 + %245 = xor i32 %197, %209, !dbg !38 + %246 = select i1 %.not5, i32 0, i32 %242, !dbg !46 + %247 = select i1 %.not6, i32 0, i32 %243, !dbg !46 + %248 = select i1 %.not7, i32 0, i32 %244, !dbg !46 + %249 = select i1 %.not8, i32 0, i32 %245, !dbg !46 + %250 = xor i32 %246, %158, !dbg !47 + %251 = xor i32 %247, %159, !dbg !47 + %252 = xor i32 %248, %160, !dbg !47 + %253 = xor i32 %249, %161, !dbg !47 + %254 = icmp sge i32 %238, %240, !dbg !22 + %255 = icmp ne i32 %238, %240, !dbg !22 + %256 = icmp sle i32 %250, %252, !dbg !22 + %257 = or i1 %255, %256, !dbg !22 + %258 = and i1 %254, %257, !dbg !22 + %.not9 = xor i1 %258, %27, !dbg !22 + %259 = icmp sge i32 %239, %241, !dbg !22 + %260 = icmp ne i32 %239, %241, !dbg !22 + %261 = icmp sle i32 %251, %253, !dbg !22 + %262 = or i1 %260, %261, !dbg !22 + %263 = and i1 %259, %262, !dbg !22 + %.not10 = xor i1 %263, %27, !dbg !22 + %264 = xor i32 %238, %240, !dbg !43 + %265 = xor i32 %239, %241, !dbg !43 + %266 = select i1 %.not9, i32 0, i32 %264, !dbg !44 + %267 = select i1 %.not10, i32 0, i32 %265, !dbg !44 + %268 = xor i32 %266, %238, !dbg !45 + %269 = xor i32 %267, %239, !dbg !45 + %270 = xor i32 %266, %240, !dbg !45 + %271 = xor i32 %267, %241, !dbg !45 + %272 = xor i32 %250, %252, !dbg !38 + %273 = xor i32 %251, %253, !dbg !38 + %274 = select i1 %.not9, i32 0, i32 %272, !dbg !46 + %275 = select i1 %.not10, i32 0, i32 %273, !dbg !46 + %276 = xor i32 %274, %250, !dbg !47 + %277 = xor i32 %275, %251, !dbg !47 + %278 = xor i32 %274, %252, !dbg !47 + %279 = xor i32 %275, %253, !dbg !47 + %280 = icmp sge i32 %268, %269, !dbg !22 + %281 = icmp ne i32 %268, %269, !dbg !22 + %282 = icmp sle i32 %276, %277, !dbg !22 + %283 = or i1 %281, %282, !dbg !22 + %284 = and i1 %280, %283, !dbg !22 + %.not11 = xor i1 %284, %27, !dbg !22 + %285 = icmp sge i32 %270, %271, !dbg !22 + %286 = icmp ne i32 %270, %271, !dbg !22 + %287 = icmp sle i32 %278, %279, !dbg !22 + %288 = or i1 %286, %287, !dbg !22 + %289 = and i1 %285, %288, !dbg !22 + %.not12 = xor i1 %289, %27, !dbg !22 + %290 = xor i32 %268, %269, !dbg !43 + %291 = xor i32 %270, %271, !dbg !43 + %292 = select i1 %.not11, i32 0, i32 %290, !dbg !44 + %293 = select i1 %.not12, i32 0, i32 %291, !dbg !44 + %294 = xor i32 %276, %277, !dbg !38 + %295 = xor i32 %278, %279, !dbg !38 + %296 = select i1 %.not11, i32 0, i32 %294, !dbg !46 + %297 = select i1 %.not12, i32 0, i32 %295, !dbg !46 + %298 = xor i32 %296, %276, !dbg !47 + %299 = xor i32 %296, %277, !dbg !47 + %300 = xor i32 %297, %278, !dbg !47 + %301 = xor i32 %297, %279, !dbg !47 + %302 = mul nuw nsw i32 %298, %25, !dbg !32 + %303 = mul nuw nsw i32 %299, %25, !dbg !32 + %304 = mul nuw nsw i32 %300, %25, !dbg !32 + %305 = mul nuw nsw i32 %301, %25, !dbg !32 + %306 = mul nuw nsw i32 %298, %.lobit, !dbg !31 + %307 = mul nuw nsw i32 %299, %.lobit, !dbg !31 + %308 = mul nuw nsw i32 %300, %.lobit, !dbg !31 + %309 = mul nuw nsw i32 %301, %.lobit, !dbg !31 + %310 = insertelement <2 x i32> poison, i32 %292, i64 0, !dbg !45 + %311 = shufflevector <2 x i32> %310, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %312 = insertelement <2 x i32> poison, i32 %269, i64 0, !dbg !45 + %313 = insertelement <2 x i32> %312, i32 %268, i64 1, !dbg !45 + %314 = xor <2 x i32> %311, %313, !dbg !45 + %315 = insertelement <2 x i32> poison, i32 %293, i64 0, !dbg !45 + %316 = shufflevector <2 x i32> %315, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %317 = insertelement <2 x i32> poison, i32 %271, i64 0, !dbg !45 + %318 = insertelement <2 x i32> %317, i32 %270, i64 1, !dbg !45 + %319 = xor <2 x i32> %316, %318, !dbg !45 + %320 = extractelement <2 x i32> %314, i64 1, !dbg !30 + %321 = mul nuw nsw i32 %320, %25, !dbg !29 + %322 = extractelement <2 x i32> %314, i64 0, !dbg !30 + %323 = mul nuw nsw i32 %322, %25, !dbg !29 + %324 = extractelement <2 x i32> %319, i64 1, !dbg !30 + %325 = mul nuw nsw i32 %324, %25, !dbg !29 + %326 = extractelement <2 x i32> %319, i64 0, !dbg !30 + %327 = mul nuw nsw i32 %326, %25, !dbg !29 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 2, i32 31), !dbg !48 + %329 = add i32 %321, %328, !dbg !51 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 2, i32 31), !dbg !48 + %331 = add i32 %323, %330, !dbg !51 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 2, i32 31), !dbg !48 + %333 = add i32 %325, %332, !dbg !51 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 2, i32 31), !dbg !48 + %335 = add i32 %327, %334, !dbg !51 + %336 = mul nuw nsw i32 %320, %.lobit, !dbg !30 + %337 = mul nuw nsw i32 %322, %.lobit, !dbg !30 + %338 = mul nuw nsw i32 %324, %.lobit, !dbg !30 + %339 = mul nuw nsw i32 %326, %.lobit, !dbg !30 + %340 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !48 + %341 = add i32 %336, %340, !dbg !51 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 2, i32 31), !dbg !48 + %343 = add i32 %337, %342, !dbg !51 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %338, i32 2, i32 31), !dbg !48 + %345 = add i32 %338, %344, !dbg !51 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %339, i32 2, i32 31), !dbg !48 + %347 = add i32 %339, %346, !dbg !51 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !48 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !48 + %350 = insertelement <2 x i32> poison, i32 %303, i64 0, !dbg !51 + %351 = insertelement <2 x i32> %350, i32 %302, i64 1, !dbg !51 + %352 = insertelement <2 x i32> poison, i32 %349, i64 0, !dbg !51 + %353 = insertelement <2 x i32> %352, i32 %348, i64 1, !dbg !51 + %354 = add <2 x i32> %351, %353, !dbg !51 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !48 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %305, i32 2, i32 31), !dbg !48 + %357 = insertelement <2 x i32> poison, i32 %305, i64 0, !dbg !51 + %358 = insertelement <2 x i32> %357, i32 %304, i64 1, !dbg !51 + %359 = insertelement <2 x i32> poison, i32 %356, i64 0, !dbg !51 + %360 = insertelement <2 x i32> %359, i32 %355, i64 1, !dbg !51 + %361 = add <2 x i32> %358, %360, !dbg !51 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !48 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !48 + %364 = insertelement <2 x i32> poison, i32 %307, i64 0, !dbg !51 + %365 = insertelement <2 x i32> %364, i32 %306, i64 1, !dbg !51 + %366 = insertelement <2 x i32> poison, i32 %363, i64 0, !dbg !51 + %367 = insertelement <2 x i32> %366, i32 %362, i64 1, !dbg !51 + %368 = add <2 x i32> %365, %367, !dbg !51 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !48 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 2, i32 31), !dbg !48 + %371 = insertelement <2 x i32> poison, i32 %309, i64 0, !dbg !51 + %372 = insertelement <2 x i32> %371, i32 %308, i64 1, !dbg !51 + %373 = insertelement <2 x i32> poison, i32 %370, i64 0, !dbg !51 + %374 = insertelement <2 x i32> %373, i32 %369, i64 1, !dbg !51 + %375 = add <2 x i32> %372, %374, !dbg !51 + %376 = insertelement <2 x i32> poison, i32 %331, i64 0, !dbg !40 + %377 = insertelement <2 x i32> %376, i32 %329, i64 1, !dbg !40 + %378 = insertelement <2 x i32> poison, i32 %343, i64 0, !dbg !40 + %379 = insertelement <2 x i32> %378, i32 %341, i64 1, !dbg !40 + %380 = icmp slt <2 x i32> %377, %379, !dbg !40 + %381 = icmp eq <2 x i32> %377, %379, !dbg !52 + %382 = icmp sgt <2 x i32> %354, %368, !dbg !53 + %383 = and <2 x i1> %381, %382, !dbg !54 + %384 = or <2 x i1> %380, %383, !dbg !55 + %385 = insertelement <2 x i32> poison, i32 %335, i64 0, !dbg !40 + %386 = insertelement <2 x i32> %385, i32 %333, i64 1, !dbg !40 + %387 = insertelement <2 x i32> poison, i32 %347, i64 0, !dbg !40 + %388 = insertelement <2 x i32> %387, i32 %345, i64 1, !dbg !40 + %389 = icmp slt <2 x i32> %386, %388, !dbg !40 + %390 = icmp eq <2 x i32> %386, %388, !dbg !52 + %391 = icmp sgt <2 x i32> %361, %375, !dbg !53 + %392 = and <2 x i1> %390, %391, !dbg !54 + %393 = or <2 x i1> %389, %392, !dbg !55 + %394 = xor <2 x i32> %377, %379, !dbg !43 + %395 = xor <2 x i32> %386, %388, !dbg !43 + %396 = select <2 x i1> %384, <2 x i32> %394, <2 x i32> zeroinitializer, !dbg !44 + %397 = select <2 x i1> %393, <2 x i32> %395, <2 x i32> zeroinitializer, !dbg !44 + %398 = xor <2 x i32> %396, %314, !dbg !45 + %399 = xor <2 x i32> %397, %319, !dbg !45 + %400 = xor <2 x i32> %354, %368, !dbg !38 + %401 = xor <2 x i32> %361, %375, !dbg !38 + %402 = select <2 x i1> %384, <2 x i32> %400, <2 x i32> zeroinitializer, !dbg !46 + %403 = select <2 x i1> %393, <2 x i32> %401, <2 x i32> zeroinitializer, !dbg !46 + %404 = insertelement <2 x i32> poison, i32 %299, i64 0, !dbg !47 + %405 = insertelement <2 x i32> %404, i32 %298, i64 1, !dbg !47 + %406 = xor <2 x i32> %402, %405, !dbg !47 + %407 = insertelement <2 x i32> poison, i32 %301, i64 0, !dbg !47 + %408 = insertelement <2 x i32> %407, i32 %300, i64 1, !dbg !47 + %409 = xor <2 x i32> %403, %408, !dbg !47 + %410 = mul nuw nsw <2 x i32> %398, %41, !dbg !29 + %411 = mul nuw nsw <2 x i32> %399, %41, !dbg !29 + %412 = extractelement <2 x i32> %410, i64 1, !dbg !48 + %413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 1, i32 31), !dbg !48 + %414 = extractelement <2 x i32> %410, i64 0, !dbg !48 + %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !48 + %416 = insertelement <2 x i32> poison, i32 %415, i64 0, !dbg !51 + %417 = insertelement <2 x i32> %416, i32 %413, i64 1, !dbg !51 + %418 = add <2 x i32> %410, %417, !dbg !51 + %419 = extractelement <2 x i32> %411, i64 1, !dbg !48 + %420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %419, i32 1, i32 31), !dbg !48 + %421 = extractelement <2 x i32> %411, i64 0, !dbg !48 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 1, i32 31), !dbg !48 + %423 = insertelement <2 x i32> poison, i32 %422, i64 0, !dbg !51 + %424 = insertelement <2 x i32> %423, i32 %420, i64 1, !dbg !51 + %425 = add <2 x i32> %411, %424, !dbg !51 + %426 = mul nuw nsw <2 x i32> %398, %43, !dbg !30 + %427 = mul nuw nsw <2 x i32> %399, %43, !dbg !30 + %428 = extractelement <2 x i32> %426, i64 1, !dbg !48 + %429 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %428, i32 1, i32 31), !dbg !48 + %430 = extractelement <2 x i32> %426, i64 0, !dbg !48 + %431 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %430, i32 1, i32 31), !dbg !48 + %432 = insertelement <2 x i32> poison, i32 %431, i64 0, !dbg !51 + %433 = insertelement <2 x i32> %432, i32 %429, i64 1, !dbg !51 + %434 = add <2 x i32> %426, %433, !dbg !51 + %435 = extractelement <2 x i32> %427, i64 1, !dbg !48 + %436 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %435, i32 1, i32 31), !dbg !48 + %437 = extractelement <2 x i32> %427, i64 0, !dbg !48 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 1, i32 31), !dbg !48 + %439 = insertelement <2 x i32> poison, i32 %438, i64 0, !dbg !51 + %440 = insertelement <2 x i32> %439, i32 %436, i64 1, !dbg !51 + %441 = add <2 x i32> %427, %440, !dbg !51 + %442 = mul nuw nsw <2 x i32> %406, %41, !dbg !32 + %443 = extractelement <2 x i32> %442, i64 1, !dbg !48 + %444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %443, i32 1, i32 31), !dbg !48 + %445 = mul nuw nsw <2 x i32> %409, %41, !dbg !32 + %446 = extractelement <2 x i32> %442, i64 0, !dbg !48 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !48 + %448 = insertelement <2 x i32> poison, i32 %447, i64 0, !dbg !51 + %449 = insertelement <2 x i32> %448, i32 %444, i64 1, !dbg !51 + %450 = add <2 x i32> %442, %449, !dbg !51 + %451 = mul nuw nsw <2 x i32> %409, %44, !dbg !31 + %452 = extractelement <2 x i32> %445, i64 1, !dbg !48 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 1, i32 31), !dbg !48 + %454 = mul nuw nsw <2 x i32> %409, %45, !dbg !32 + %455 = extractelement <2 x i32> %445, i64 0, !dbg !48 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 1, i32 31), !dbg !48 + %457 = insertelement <2 x i32> poison, i32 %456, i64 0, !dbg !51 + %458 = insertelement <2 x i32> %457, i32 %453, i64 1, !dbg !51 + %459 = add <2 x i32> %445, %458, !dbg !51 + %460 = mul nuw nsw <2 x i32> %406, %43, !dbg !31 + %461 = mul nuw nsw <2 x i32> %409, %43, !dbg !31 + %462 = extractelement <2 x i32> %460, i64 1, !dbg !48 + %463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 1, i32 31), !dbg !48 + %464 = extractelement <2 x i32> %460, i64 0, !dbg !48 + %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 1, i32 31), !dbg !48 + %466 = insertelement <2 x i32> poison, i32 %465, i64 0, !dbg !51 + %467 = insertelement <2 x i32> %466, i32 %463, i64 1, !dbg !51 + %468 = add <2 x i32> %460, %467, !dbg !51 + %469 = extractelement <2 x i32> %461, i64 1, !dbg !48 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !48 + %471 = insertelement <2 x i32> %458, i32 %470, i64 1, !dbg !51 + %472 = add <2 x i32> %454, %471, !dbg !51 + %473 = extractelement <2 x i32> %461, i64 0, !dbg !48 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 1, i32 31), !dbg !48 + %475 = insertelement <2 x i32> %471, i32 %474, i64 0, !dbg !51 + %476 = add <2 x i32> %475, %461, !dbg !51 + %477 = insertelement <2 x i32> %475, i32 %453, i64 1, !dbg !51 + %478 = add <2 x i32> %477, %451, !dbg !51 + %479 = icmp slt <2 x i32> %418, %434, !dbg !40 + %480 = icmp slt <2 x i32> %425, %441, !dbg !40 + %481 = icmp eq <2 x i32> %418, %434, !dbg !52 + %482 = icmp eq <2 x i32> %425, %441, !dbg !52 + %483 = icmp sgt <2 x i32> %450, %468, !dbg !53 + %484 = icmp sgt <2 x i32> %459, %476, !dbg !53 + %485 = and <2 x i1> %481, %483, !dbg !54 + %486 = and <2 x i1> %482, %484, !dbg !54 + %487 = or <2 x i1> %479, %485, !dbg !55 + %488 = or <2 x i1> %480, %486, !dbg !55 + %489 = xor <2 x i32> %418, %434, !dbg !43 + %490 = xor <2 x i32> %425, %441, !dbg !43 + %491 = select <2 x i1> %487, <2 x i32> %489, <2 x i32> zeroinitializer, !dbg !44 + %492 = select <2 x i1> %488, <2 x i32> %490, <2 x i32> zeroinitializer, !dbg !44 + %493 = xor <2 x i32> %491, %398, !dbg !45 + %494 = xor <2 x i32> %492, %399, !dbg !45 + %495 = xor <2 x i32> %450, %468, !dbg !38 + %496 = xor <2 x i32> %478, %472, !dbg !38 + %497 = select <2 x i1> %487, <2 x i32> %495, <2 x i32> zeroinitializer, !dbg !46 + %498 = select <2 x i1> %488, <2 x i32> %496, <2 x i32> zeroinitializer, !dbg !46 + %499 = xor <2 x i32> %497, %406, !dbg !47 + %500 = xor <2 x i32> %498, %409, !dbg !47 + %501 = icmp slt <2 x i32> %493, %494, !dbg !40 + %502 = icmp eq <2 x i32> %493, %494, !dbg !52 + %503 = icmp sgt <2 x i32> %499, %500, !dbg !53 + %504 = and <2 x i1> %502, %503, !dbg !54 + %505 = or <2 x i1> %501, %504, !dbg !55 + %506 = xor <2 x i32> %493, %494, !dbg !43 + %507 = select <2 x i1> %505, <2 x i32> %506, <2 x i32> zeroinitializer, !dbg !44 + %508 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %509 = shufflevector <2 x i32> %493, <2 x i32> %494, <2 x i32> , !dbg !45 + %510 = xor <2 x i32> %508, %509, !dbg !45 + %511 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %512 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %513 = xor <2 x i32> %511, %512, !dbg !45 + %514 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %515 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %516 = xor <2 x i32> %514, %515, !dbg !45 + %517 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %518 = xor <2 x i32> %507, %517, !dbg !45 + %519 = xor <2 x i32> %499, %500, !dbg !38 + %520 = select <2 x i1> %505, <2 x i32> %519, <2 x i32> zeroinitializer, !dbg !46 + %521 = xor <2 x i32> %520, %499, !dbg !47 + %522 = xor <2 x i32> %520, %500, !dbg !47 + %523 = icmp slt <2 x i32> %513, %516, !dbg !40 + %524 = icmp eq <2 x i32> %518, %510, !dbg !52 + %525 = shufflevector <2 x i32> %521, <2 x i32> %522, <2 x i32> , !dbg !53 + %526 = shufflevector <2 x i32> %522, <2 x i32> %521, <2 x i32> , !dbg !53 + %527 = icmp sgt <2 x i32> %525, %526, !dbg !53 + %528 = and <2 x i1> %524, %527, !dbg !54 + %529 = or <2 x i1> %523, %528, !dbg !55 + %530 = xor <2 x i32> %526, %525, !dbg !38 + %531 = select <2 x i1> %529, <2 x i32> %530, <2 x i32> zeroinitializer, !dbg !46 + %532 = shufflevector <2 x i32> %531, <2 x i32> poison, <4 x i32> , !dbg !46 + %533 = shufflevector <2 x i32> %521, <2 x i32> %522, <4 x i32> , !dbg !47 + %534 = xor <4 x i32> %532, %533, !dbg !47 + %535 = select <4 x i1> %47, <4 x i1> %75, <4 x i1> zeroinitializer, !dbg !33 + %536 = bitcast <4 x i1> %535 to i4, !dbg !56 + %537 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %536), !dbg !56 + %538 = zext nneg i4 %537 to i64, !dbg !56 + %539 = zext nneg i4 %537 to i32, !dbg !58 + %540 = icmp eq <4 x i64> %73, splat (i64 16384), !dbg !59 + %541 = extractelement <4 x i1> %540, i64 0, !dbg !60 + %542 = zext i1 %541 to i32, !dbg !41 + %543 = extractelement <4 x i1> %540, i64 1, !dbg !60 + %544 = zext i1 %543 to i32, !dbg !41 + %545 = extractelement <4 x i1> %540, i64 2, !dbg !61 + %546 = zext i1 %545 to i32, !dbg !41 + %547 = extractelement <4 x i1> %540, i64 3, !dbg !61 + %548 = zext i1 %547 to i32, !dbg !41 + %549 = xor i1 %541, true, !dbg !60 + %550 = and i1 %543, %549, !dbg !60 + %.not13 = xor i1 %547, true, !dbg !61 + %551 = or i1 %545, %.not13, !dbg !61 + %552 = xor i32 %542, %544, !dbg !62 + %553 = xor i32 %546, %548, !dbg !62 + %554 = select i1 %550, i32 %552, i32 0, !dbg !63 + %555 = select i1 %551, i32 %553, i32 0, !dbg !63 + %556 = xor i32 %554, %542, !dbg !64 + %557 = xor i32 %554, %544, !dbg !64 + %558 = xor i32 %555, %546, !dbg !64 + %559 = xor i32 %555, %548, !dbg !64 + %560 = select i1 %550, i32 %68, i32 0, !dbg !65 + %561 = select i1 %551, i32 %69, i32 0, !dbg !65 + %562 = xor i32 %560, %52, !dbg !66 + %563 = xor i32 %560, %53, !dbg !66 + %564 = xor i32 %561, %54, !dbg !66 + %565 = xor i32 %561, %55, !dbg !66 + %566 = insertelement <2 x i32> poison, i32 %556, i64 0, !dbg !23 + %567 = insertelement <2 x i32> %566, i32 %557, i64 1, !dbg !23 + %568 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !23 + %569 = insertelement <2 x i32> %568, i32 %559, i64 1, !dbg !23 + %570 = icmp samesign uge <2 x i32> %567, %569, !dbg !23 + %571 = insertelement <2 x i32> %569, i32 %557, i64 1, !dbg !23 + %572 = insertelement <2 x i32> %567, i32 %559, i64 1, !dbg !23 + %573 = icmp ne <2 x i32> %571, %572, !dbg !23 + %574 = insertelement <2 x i32> poison, i32 %562, i64 0, !dbg !23 + %575 = insertelement <2 x i32> %574, i32 %563, i64 1, !dbg !23 + %576 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !23 + %577 = insertelement <2 x i32> %576, i32 %565, i64 1, !dbg !23 + %578 = icmp ule <2 x i32> %575, %577, !dbg !23 + %579 = or <2 x i1> %578, %573, !dbg !23 + %580 = and <2 x i1> %570, %579, !dbg !23 + %581 = xor <2 x i1> %580, %39, !dbg !23 + %582 = xor <2 x i32> %572, %571, !dbg !62 + %583 = select <2 x i1> %581, <2 x i32> zeroinitializer, <2 x i32> %582, !dbg !63 + %584 = extractelement <2 x i32> %583, i64 0, !dbg !64 + %585 = xor i32 %584, %556, !dbg !64 + %586 = extractelement <2 x i32> %583, i64 1, !dbg !64 + %587 = xor i32 %586, %557, !dbg !64 + %588 = xor <2 x i32> %583, %569, !dbg !64 + %589 = xor i32 %564, %562, !dbg !67 + %590 = xor i32 %565, %563, !dbg !67 + %591 = extractelement <2 x i1> %581, i64 0, !dbg !65 + %592 = select i1 %591, i32 0, i32 %589, !dbg !65 + %593 = extractelement <2 x i1> %581, i64 1, !dbg !65 + %594 = select i1 %593, i32 0, i32 %590, !dbg !65 + %595 = xor i32 %592, %562, !dbg !66 + %596 = xor i32 %594, %563, !dbg !66 + %597 = xor i32 %592, %564, !dbg !66 + %598 = xor i32 %594, %565, !dbg !66 + %599 = icmp samesign uge i32 %585, %587, !dbg !23 + %600 = icmp ne i32 %585, %587, !dbg !23 + %601 = icmp samesign ule i32 %595, %596, !dbg !23 + %602 = or i1 %600, %601, !dbg !23 + %603 = and i1 %599, %602, !dbg !23 + %.not16 = xor i1 %603, %26, !dbg !23 + %604 = extractelement <2 x i32> %588, i64 0, !dbg !23 + %605 = extractelement <2 x i32> %588, i64 1, !dbg !23 + %606 = icmp samesign uge i32 %604, %605, !dbg !23 + %607 = icmp ne i32 %604, %605, !dbg !23 + %608 = icmp samesign ule i32 %597, %598, !dbg !23 + %609 = or i1 %607, %608, !dbg !23 + %610 = and i1 %606, %609, !dbg !23 + %.not17 = xor i1 %610, %26, !dbg !23 + %611 = xor i32 %585, %587, !dbg !62 + %612 = xor i32 %604, %605, !dbg !62 + %613 = select i1 %.not16, i32 0, i32 %611, !dbg !63 + %614 = select i1 %.not17, i32 0, i32 %612, !dbg !63 + %615 = xor i32 %613, %585, !dbg !64 + %616 = xor i32 %613, %587, !dbg !64 + %617 = xor i32 %614, %604, !dbg !64 + %618 = xor i32 %614, %605, !dbg !64 + %619 = xor i32 %595, %596, !dbg !67 + %620 = xor i32 %597, %598, !dbg !67 + %621 = select i1 %.not16, i32 0, i32 %619, !dbg !65 + %622 = select i1 %.not17, i32 0, i32 %620, !dbg !65 + %623 = mul nuw nsw i32 %615, %24, !dbg !27 + %624 = mul nuw nsw i32 %616, %24, !dbg !27 + %625 = mul nuw nsw i32 %617, %24, !dbg !27 + %626 = mul nuw nsw i32 %618, %24, !dbg !27 + %627 = mul nuw nsw i32 %615, %22, !dbg !28 + %628 = mul nuw nsw i32 %616, %22, !dbg !28 + %629 = mul nuw nsw i32 %617, %22, !dbg !28 + %630 = mul nuw nsw i32 %618, %22, !dbg !28 + %631 = insertelement <4 x i32> poison, i32 %622, i64 0, !dbg !66 + %632 = insertelement <4 x i32> %631, i32 %621, i64 1, !dbg !66 + %633 = shufflevector <4 x i32> %632, <4 x i32> poison, <4 x i32> , !dbg !66 + %634 = insertelement <4 x i32> poison, i32 %595, i64 0, !dbg !66 + %635 = insertelement <4 x i32> %634, i32 %596, i64 1, !dbg !66 + %636 = insertelement <4 x i32> %635, i32 %597, i64 2, !dbg !66 + %637 = insertelement <4 x i32> %636, i32 %598, i64 3, !dbg !66 + %638 = xor <4 x i32> %633, %637, !dbg !66 + %639 = extractelement <4 x i32> %638, i64 0, !dbg !26 + %640 = mul nuw nsw i32 %639, %24, !dbg !25 + %641 = extractelement <4 x i32> %638, i64 1, !dbg !26 + %642 = mul nuw nsw i32 %641, %24, !dbg !25 + %643 = extractelement <4 x i32> %638, i64 2, !dbg !26 + %644 = mul nuw nsw i32 %643, %24, !dbg !25 + %645 = extractelement <4 x i32> %638, i64 3, !dbg !26 + %646 = mul nuw nsw i32 %645, %24, !dbg !25 + %647 = mul nuw nsw i32 %639, %22, !dbg !26 + %648 = mul nuw nsw i32 %641, %22, !dbg !26 + %649 = mul nuw nsw i32 %643, %22, !dbg !26 + %650 = mul nuw nsw i32 %645, %22, !dbg !26 + %651 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !68 + %652 = add i32 %651, %623, !dbg !69 + %653 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %624, i32 1, i32 31), !dbg !68 + %654 = add i32 %653, %624, !dbg !69 + %655 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 1, i32 31), !dbg !68 + %656 = add i32 %655, %625, !dbg !69 + %657 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %626, i32 1, i32 31), !dbg !68 + %658 = add i32 %657, %626, !dbg !69 + %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %627, i32 1, i32 31), !dbg !68 + %660 = add i32 %659, %627, !dbg !69 + %661 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %628, i32 1, i32 31), !dbg !68 + %662 = add i32 %661, %628, !dbg !69 + %663 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %629, i32 1, i32 31), !dbg !68 + %664 = add i32 %663, %629, !dbg !69 + %665 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %630, i32 1, i32 31), !dbg !68 + %666 = add i32 %665, %630, !dbg !69 + %667 = icmp sge i32 %654, %662, !dbg !23 + %668 = icmp ne i32 %654, %662, !dbg !23 + %669 = insertelement <2 x i32> poison, i32 %652, i64 0, !dbg !23 + %670 = insertelement <2 x i32> %669, i32 %656, i64 1, !dbg !23 + %671 = insertelement <2 x i32> poison, i32 %660, i64 0, !dbg !23 + %672 = insertelement <2 x i32> %671, i32 %664, i64 1, !dbg !23 + %673 = icmp sge <2 x i32> %670, %672, !dbg !23 + %674 = icmp ne <2 x i32> %670, %672, !dbg !23 + %675 = icmp sge i32 %658, %666, !dbg !23 + %676 = icmp ne i32 %658, %666, !dbg !23 + %677 = xor i32 %660, %652, !dbg !62 + %678 = xor i32 %662, %654, !dbg !62 + %679 = xor i32 %664, %656, !dbg !62 + %680 = xor i32 %666, %658, !dbg !62 + %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 1, i32 31), !dbg !68 + %682 = add i32 %681, %640, !dbg !69 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %642, i32 1, i32 31), !dbg !68 + %684 = add i32 %683, %642, !dbg !69 + %685 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %644, i32 1, i32 31), !dbg !68 + %686 = add i32 %685, %644, !dbg !69 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %646, i32 1, i32 31), !dbg !68 + %688 = add i32 %687, %646, !dbg !69 + %689 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %647, i32 1, i32 31), !dbg !68 + %690 = add i32 %689, %647, !dbg !69 + %691 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %648, i32 1, i32 31), !dbg !68 + %692 = add i32 %691, %648, !dbg !69 + %693 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %649, i32 1, i32 31), !dbg !68 + %694 = add i32 %693, %649, !dbg !69 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %650, i32 1, i32 31), !dbg !68 + %696 = add i32 %695, %650, !dbg !69 + %697 = insertelement <2 x i32> poison, i32 %682, i64 0, !dbg !23 + %698 = insertelement <2 x i32> %697, i32 %686, i64 1, !dbg !23 + %699 = insertelement <2 x i32> poison, i32 %690, i64 0, !dbg !23 + %700 = insertelement <2 x i32> %699, i32 %694, i64 1, !dbg !23 + %701 = icmp sle <2 x i32> %698, %700, !dbg !23 + %702 = or <2 x i1> %674, %701, !dbg !23 + %703 = and <2 x i1> %673, %702, !dbg !23 + %704 = xor <2 x i1> %703, %29, !dbg !23 + %705 = insertelement <2 x i32> poison, i32 %677, i64 0, !dbg !63 + %706 = insertelement <2 x i32> %705, i32 %679, i64 1, !dbg !63 + %707 = select <2 x i1> %704, <2 x i32> zeroinitializer, <2 x i32> %706, !dbg !63 + %708 = insertelement <2 x i32> poison, i32 %615, i64 0, !dbg !64 + %709 = insertelement <2 x i32> %708, i32 %617, i64 1, !dbg !64 + %710 = xor <2 x i32> %707, %709, !dbg !64 + %711 = insertelement <2 x i32> poison, i32 %688, i64 0, !dbg !23 + %712 = insertelement <2 x i32> %711, i32 %684, i64 1, !dbg !23 + %713 = insertelement <2 x i32> poison, i32 %696, i64 0, !dbg !23 + %714 = insertelement <2 x i32> %713, i32 %692, i64 1, !dbg !23 + %715 = icmp sle <2 x i32> %712, %714, !dbg !23 + %716 = insertelement <2 x i1> poison, i1 %676, i64 0, !dbg !23 + %717 = insertelement <2 x i1> %716, i1 %668, i64 1, !dbg !23 + %718 = or <2 x i1> %717, %715, !dbg !23 + %719 = insertelement <2 x i1> poison, i1 %675, i64 0, !dbg !23 + %720 = insertelement <2 x i1> %719, i1 %667, i64 1, !dbg !23 + %721 = and <2 x i1> %720, %718, !dbg !23 + %722 = xor <2 x i1> %721, %29, !dbg !23 + %723 = insertelement <2 x i32> poison, i32 %680, i64 0, !dbg !63 + %724 = insertelement <2 x i32> %723, i32 %678, i64 1, !dbg !63 + %725 = select <2 x i1> %722, <2 x i32> zeroinitializer, <2 x i32> %724, !dbg !63 + %726 = insertelement <2 x i32> poison, i32 %618, i64 0, !dbg !64 + %727 = insertelement <2 x i32> %726, i32 %616, i64 1, !dbg !64 + %728 = xor <2 x i32> %725, %727, !dbg !64 + %729 = xor i32 %690, %682, !dbg !67 + %730 = xor i32 %692, %684, !dbg !67 + %731 = xor i32 %694, %686, !dbg !67 + %732 = xor i32 %696, %688, !dbg !67 + %733 = shufflevector <2 x i1> %704, <2 x i1> %722, <4 x i32> , !dbg !65 + %734 = insertelement <4 x i32> poison, i32 %729, i64 0, !dbg !65 + %735 = insertelement <4 x i32> %734, i32 %730, i64 1, !dbg !65 + %736 = insertelement <4 x i32> %735, i32 %731, i64 2, !dbg !65 + %737 = insertelement <4 x i32> %736, i32 %732, i64 3, !dbg !65 + %738 = select <4 x i1> %733, <4 x i32> zeroinitializer, <4 x i32> %737, !dbg !65 + %739 = xor <4 x i32> %738, %638, !dbg !66 + %740 = extractelement <2 x i32> %710, i64 0, !dbg !23 + %741 = extractelement <2 x i32> %710, i64 1, !dbg !23 + %742 = icmp sge i32 %740, %741, !dbg !23 + %743 = icmp ne i32 %740, %741, !dbg !23 + %shift = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %744 = icmp sle <4 x i32> %739, %shift, !dbg !23 + %745 = extractelement <4 x i1> %744, i64 0, !dbg !23 + %746 = or i1 %743, %745, !dbg !23 + %747 = and i1 %742, %746, !dbg !23 + %748 = extractelement <2 x i32> %728, i64 0, !dbg !23 + %749 = extractelement <2 x i32> %728, i64 1, !dbg !23 + %750 = icmp sge i32 %749, %748, !dbg !23 + %751 = icmp ne i32 %749, %748, !dbg !23 + %shift53 = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %752 = icmp sle <4 x i32> %739, %shift53, !dbg !23 + %753 = extractelement <4 x i1> %752, i64 1, !dbg !23 + %754 = or i1 %751, %753, !dbg !23 + %755 = and i1 %750, %754, !dbg !23 + %756 = insertelement <2 x i1> poison, i1 %755, i64 0, !dbg !23 + %757 = insertelement <2 x i1> %756, i1 %747, i64 1, !dbg !23 + %758 = xor <2 x i1> %757, %29, !dbg !23 + %759 = xor i32 %741, %740, !dbg !62 + %760 = xor i32 %748, %749, !dbg !62 + %761 = extractelement <2 x i1> %758, i64 1, !dbg !63 + %762 = select i1 %761, i32 0, i32 %759, !dbg !63 + %763 = extractelement <2 x i1> %758, i64 0, !dbg !63 + %764 = select i1 %763, i32 0, i32 %760, !dbg !63 + %765 = xor i32 %762, %740, !dbg !64 + %766 = xor i32 %764, %749, !dbg !64 + %767 = xor i32 %762, %741, !dbg !64 + %768 = xor i32 %764, %748, !dbg !64 + %769 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %770 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %771 = xor <2 x i32> %769, %770, !dbg !67 + %772 = select <2 x i1> %758, <2 x i32> zeroinitializer, <2 x i32> %771, !dbg !65 + %773 = shufflevector <2 x i32> %772, <2 x i32> poison, <4 x i32> , !dbg !65 + %774 = shufflevector <2 x i32> %772, <2 x i32> poison, <2 x i32> , !dbg !66 + %775 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %776 = xor <2 x i32> %774, %775, !dbg !66 + %777 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %778 = xor <2 x i32> %772, %777, !dbg !66 + %779 = icmp sge i32 %765, %766, !dbg !23 + %780 = icmp ne i32 %765, %766, !dbg !23 + %781 = extractelement <2 x i32> %778, i64 1, !dbg !23 + %782 = extractelement <2 x i32> %776, i64 1, !dbg !23 + %783 = icmp sle i32 %781, %782, !dbg !23 + %784 = or i1 %780, %783, !dbg !23 + %785 = and i1 %779, %784, !dbg !23 + %.not24 = xor i1 %785, %27, !dbg !23 + %786 = icmp sge i32 %767, %768, !dbg !23 + %787 = icmp ne i32 %767, %768, !dbg !23 + %788 = extractelement <2 x i32> %778, i64 0, !dbg !23 + %789 = extractelement <2 x i32> %776, i64 0, !dbg !23 + %790 = icmp sle i32 %789, %788, !dbg !23 + %791 = or i1 %787, %790, !dbg !23 + %792 = and i1 %786, %791, !dbg !23 + %.not25 = xor i1 %792, %27, !dbg !23 + %793 = xor i32 %766, %765, !dbg !62 + %794 = xor i32 %768, %767, !dbg !62 + %795 = select i1 %.not24, i32 0, i32 %793, !dbg !63 + %796 = select i1 %.not25, i32 0, i32 %794, !dbg !63 + %797 = insertelement <2 x i32> poison, i32 %796, i64 0, !dbg !64 + %798 = insertelement <2 x i32> %797, i32 %795, i64 1, !dbg !64 + %799 = insertelement <2 x i32> poison, i32 %767, i64 0, !dbg !64 + %800 = insertelement <2 x i32> %799, i32 %765, i64 1, !dbg !64 + %801 = xor <2 x i32> %798, %800, !dbg !64 + %802 = insertelement <2 x i32> poison, i32 %768, i64 0, !dbg !64 + %803 = insertelement <2 x i32> %802, i32 %766, i64 1, !dbg !64 + %804 = xor <2 x i32> %798, %803, !dbg !64 + %805 = xor i32 %782, %781, !dbg !67 + %806 = xor i32 %788, %789, !dbg !67 + %807 = insertelement <2 x i1> poison, i1 %.not25, i64 0, !dbg !65 + %808 = insertelement <2 x i1> %807, i1 %.not24, i64 1, !dbg !65 + %809 = insertelement <2 x i32> poison, i32 %806, i64 0, !dbg !65 + %810 = insertelement <2 x i32> %809, i32 %805, i64 1, !dbg !65 + %811 = select <2 x i1> %808, <2 x i32> zeroinitializer, <2 x i32> %810, !dbg !65 + %812 = shufflevector <2 x i32> %811, <2 x i32> poison, <4 x i32> , !dbg !66 + %813 = xor <4 x i32> %773, %812, !dbg !66 + %814 = xor <2 x i32> %811, %776, !dbg !66 + %815 = xor <2 x i32> %811, %778, !dbg !66 + %816 = extractelement <2 x i32> %801, i64 1, !dbg !28 + %817 = mul nuw nsw i32 %816, %25, !dbg !27 + %818 = extractelement <2 x i32> %804, i64 1, !dbg !28 + %819 = mul nuw nsw i32 %818, %25, !dbg !27 + %820 = extractelement <2 x i32> %801, i64 0, !dbg !28 + %821 = mul nuw nsw i32 %820, %25, !dbg !27 + %822 = extractelement <2 x i32> %804, i64 0, !dbg !28 + %823 = mul nuw nsw i32 %822, %25, !dbg !27 + %824 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %817, i32 2, i32 31), !dbg !68 + %825 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %819, i32 2, i32 31), !dbg !68 + %826 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %821, i32 2, i32 31), !dbg !68 + %827 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 2, i32 31), !dbg !68 + %828 = insertelement <4 x i32> poison, i32 %817, i64 0, !dbg !69 + %829 = insertelement <4 x i32> %828, i32 %819, i64 1, !dbg !69 + %830 = insertelement <4 x i32> %829, i32 %821, i64 2, !dbg !69 + %831 = insertelement <4 x i32> %830, i32 %823, i64 3, !dbg !69 + %832 = insertelement <4 x i32> poison, i32 %824, i64 0, !dbg !69 + %833 = insertelement <4 x i32> %832, i32 %825, i64 1, !dbg !69 + %834 = insertelement <4 x i32> %833, i32 %826, i64 2, !dbg !69 + %835 = insertelement <4 x i32> %834, i32 %827, i64 3, !dbg !69 + %836 = add <4 x i32> %831, %835, !dbg !69 + %837 = mul nuw nsw i32 %816, %.lobit, !dbg !28 + %838 = mul nuw nsw i32 %818, %.lobit, !dbg !28 + %839 = mul nuw nsw i32 %820, %.lobit, !dbg !28 + %840 = mul nuw nsw i32 %822, %.lobit, !dbg !28 + %841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 2, i32 31), !dbg !68 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %838, i32 2, i32 31), !dbg !68 + %843 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %839, i32 2, i32 31), !dbg !68 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 2, i32 31), !dbg !68 + %845 = insertelement <4 x i32> poison, i32 %837, i64 0, !dbg !69 + %846 = insertelement <4 x i32> %845, i32 %838, i64 1, !dbg !69 + %847 = insertelement <4 x i32> %846, i32 %839, i64 2, !dbg !69 + %848 = insertelement <4 x i32> %847, i32 %840, i64 3, !dbg !69 + %849 = insertelement <4 x i32> poison, i32 %841, i64 0, !dbg !69 + %850 = insertelement <4 x i32> %849, i32 %842, i64 1, !dbg !69 + %851 = insertelement <4 x i32> %850, i32 %843, i64 2, !dbg !69 + %852 = insertelement <4 x i32> %851, i32 %844, i64 3, !dbg !69 + %853 = add <4 x i32> %848, %852, !dbg !69 + %854 = shufflevector <2 x i32> %815, <2 x i32> %814, <4 x i32> , !dbg !25 + %855 = mul nuw nsw <4 x i32> %854, %31, !dbg !25 + %856 = extractelement <4 x i32> %855, i64 0, !dbg !68 + %857 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %856, i32 2, i32 31), !dbg !68 + %858 = extractelement <4 x i32> %855, i64 1, !dbg !68 + %859 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %858, i32 2, i32 31), !dbg !68 + %860 = extractelement <4 x i32> %855, i64 2, !dbg !68 + %861 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %860, i32 2, i32 31), !dbg !68 + %862 = extractelement <4 x i32> %855, i64 3, !dbg !68 + %863 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %862, i32 2, i32 31), !dbg !68 + %864 = insertelement <4 x i32> poison, i32 %857, i64 0, !dbg !69 + %865 = insertelement <4 x i32> %864, i32 %859, i64 1, !dbg !69 + %866 = insertelement <4 x i32> %865, i32 %861, i64 2, !dbg !69 + %867 = insertelement <4 x i32> %866, i32 %863, i64 3, !dbg !69 + %868 = add <4 x i32> %855, %867, !dbg !69 + %869 = mul nuw nsw <4 x i32> %854, %33, !dbg !26 + %870 = extractelement <4 x i32> %869, i64 0, !dbg !68 + %871 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %870, i32 2, i32 31), !dbg !68 + %872 = extractelement <4 x i32> %869, i64 1, !dbg !68 + %873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %872, i32 2, i32 31), !dbg !68 + %874 = extractelement <4 x i32> %869, i64 2, !dbg !68 + %875 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %874, i32 2, i32 31), !dbg !68 + %876 = extractelement <4 x i32> %869, i64 3, !dbg !68 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 2, i32 31), !dbg !68 + %878 = insertelement <4 x i32> poison, i32 %871, i64 0, !dbg !69 + %879 = insertelement <4 x i32> %878, i32 %873, i64 1, !dbg !69 + %880 = insertelement <4 x i32> %879, i32 %875, i64 2, !dbg !69 + %881 = insertelement <4 x i32> %880, i32 %877, i64 3, !dbg !69 + %882 = add <4 x i32> %869, %881, !dbg !69 + %883 = icmp slt <4 x i32> %836, %853, !dbg !60 + %884 = icmp eq <4 x i32> %836, %853, !dbg !70 + %885 = icmp sgt <4 x i32> %868, %882, !dbg !71 + %886 = and <4 x i1> %884, %885, !dbg !72 + %887 = or <4 x i1> %883, %886, !dbg !73 + %888 = or <4 x i1> %883, %886, !dbg !73 + %889 = shufflevector <4 x i1> %888, <4 x i1> poison, <2 x i32> , !dbg !73 + %890 = or <4 x i1> %883, %886, !dbg !73 + %891 = shufflevector <4 x i1> %890, <4 x i1> poison, <2 x i32> , !dbg !73 + %foldExtExtBinop = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop55 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop57 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop59 = xor <4 x i32> %836, %853, !dbg !62 + %892 = shufflevector <2 x i1> %889, <2 x i1> %891, <2 x i32> , !dbg !63 + %893 = shufflevector <4 x i32> %foldExtExtBinop57, <4 x i32> %foldExtExtBinop, <2 x i32> , !dbg !63 + %894 = select <2 x i1> %892, <2 x i32> %893, <2 x i32> zeroinitializer, !dbg !63 + %895 = shufflevector <2 x i1> %891, <2 x i1> %889, <2 x i32> , !dbg !63 + %896 = shufflevector <4 x i32> %foldExtExtBinop59, <4 x i32> %foldExtExtBinop55, <2 x i32> , !dbg !63 + %897 = select <2 x i1> %895, <2 x i32> %896, <2 x i32> zeroinitializer, !dbg !63 + %898 = shufflevector <2 x i32> %894, <2 x i32> %897, <2 x i32> , !dbg !64 + %899 = shufflevector <2 x i32> %801, <2 x i32> %804, <2 x i32> , !dbg !64 + %900 = xor <2 x i32> %898, %899, !dbg !64 + %901 = xor <2 x i32> %894, %801, !dbg !64 + %902 = xor <2 x i32> %897, %804, !dbg !64 + %903 = shufflevector <2 x i32> %897, <2 x i32> %894, <2 x i32> , !dbg !64 + %904 = shufflevector <2 x i32> %804, <2 x i32> %801, <2 x i32> , !dbg !64 + %905 = xor <2 x i32> %903, %904, !dbg !64 + %906 = xor <4 x i32> %868, %882, !dbg !67 + %907 = xor <4 x i32> %868, %882, !dbg !67 + %908 = shufflevector <4 x i32> %907, <4 x i32> poison, <2 x i32> , !dbg !67 + %909 = xor <4 x i32> %868, %882, !dbg !67 + %910 = shufflevector <4 x i32> %909, <4 x i32> poison, <2 x i32> , !dbg !67 + %911 = select <4 x i1> %887, <4 x i32> %906, <4 x i32> zeroinitializer, !dbg !65 + %912 = select <2 x i1> %889, <2 x i32> %908, <2 x i32> zeroinitializer, !dbg !65 + %913 = select <2 x i1> %891, <2 x i32> %910, <2 x i32> zeroinitializer, !dbg !65 + %914 = xor <4 x i32> %813, %911, !dbg !66 + %915 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %916 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %917 = xor <2 x i32> %915, %916, !dbg !66 + %918 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %919 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %920 = xor <2 x i32> %918, %919, !dbg !66 + %921 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %922 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %923 = xor <2 x i32> %921, %922, !dbg !66 + %924 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %925 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %926 = xor <2 x i32> %924, %925, !dbg !66 + %927 = shufflevector <2 x i32> %901, <2 x i32> %902, <4 x i32> , !dbg !27 + %928 = mul nuw nsw <4 x i32> %927, %35, !dbg !27 + %929 = extractelement <4 x i32> %928, i64 0, !dbg !68 + %930 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %929, i32 1, i32 31), !dbg !68 + %931 = extractelement <4 x i32> %928, i64 1, !dbg !68 + %932 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %931, i32 1, i32 31), !dbg !68 + %933 = extractelement <4 x i32> %928, i64 2, !dbg !68 + %934 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %933, i32 1, i32 31), !dbg !68 + %935 = extractelement <4 x i32> %928, i64 3, !dbg !68 + %936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !68 + %937 = insertelement <4 x i32> poison, i32 %930, i64 0, !dbg !69 + %938 = insertelement <4 x i32> %937, i32 %932, i64 1, !dbg !69 + %939 = insertelement <4 x i32> %938, i32 %934, i64 2, !dbg !69 + %940 = insertelement <4 x i32> %939, i32 %936, i64 3, !dbg !69 + %941 = add <4 x i32> %928, %940, !dbg !69 + %942 = mul nuw nsw <4 x i32> %927, %37, !dbg !28 + %943 = extractelement <4 x i32> %942, i64 0, !dbg !68 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !68 + %945 = extractelement <4 x i32> %942, i64 1, !dbg !68 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %945, i32 1, i32 31), !dbg !68 + %947 = extractelement <4 x i32> %942, i64 2, !dbg !68 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %947, i32 1, i32 31), !dbg !68 + %949 = extractelement <4 x i32> %942, i64 3, !dbg !68 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %949, i32 1, i32 31), !dbg !68 + %951 = insertelement <4 x i32> poison, i32 %944, i64 0, !dbg !69 + %952 = insertelement <4 x i32> %951, i32 %946, i64 1, !dbg !69 + %953 = insertelement <4 x i32> %952, i32 %948, i64 2, !dbg !69 + %954 = insertelement <4 x i32> %953, i32 %950, i64 3, !dbg !69 + %955 = add <4 x i32> %942, %954, !dbg !69 + %956 = extractelement <2 x i32> %926, i64 1, !dbg !26 + %957 = mul nuw nsw i32 %956, %24, !dbg !25 + %958 = extractelement <2 x i32> %923, i64 1, !dbg !26 + %959 = mul nuw nsw i32 %958, %24, !dbg !25 + %960 = extractelement <2 x i32> %926, i64 0, !dbg !26 + %961 = mul nuw nsw i32 %960, %24, !dbg !25 + %962 = extractelement <2 x i32> %923, i64 0, !dbg !26 + %963 = mul nuw nsw i32 %962, %24, !dbg !25 + %964 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %957, i32 1, i32 31), !dbg !68 + %965 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !68 + %966 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !68 + %967 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !68 + %968 = insertelement <2 x i32> poison, i32 %961, i64 0, !dbg !69 + %969 = insertelement <2 x i32> %968, i32 %959, i64 1, !dbg !69 + %970 = insertelement <2 x i32> poison, i32 %966, i64 0, !dbg !69 + %971 = insertelement <2 x i32> %970, i32 %965, i64 1, !dbg !69 + %972 = add <2 x i32> %969, %971, !dbg !69 + %973 = insertelement <4 x i32> poison, i32 %964, i64 0, !dbg !69 + %974 = insertelement <4 x i32> %973, i32 %965, i64 1, !dbg !69 + %975 = insertelement <4 x i32> %974, i32 %966, i64 2, !dbg !69 + %976 = insertelement <4 x i32> %975, i32 %967, i64 3, !dbg !69 + %977 = insertelement <4 x i32> poison, i32 %957, i64 0, !dbg !69 + %978 = insertelement <4 x i32> %977, i32 %959, i64 1, !dbg !69 + %979 = insertelement <4 x i32> %978, i32 %961, i64 2, !dbg !69 + %980 = insertelement <4 x i32> %979, i32 %963, i64 3, !dbg !69 + %981 = add <4 x i32> %976, %980, !dbg !69 + %982 = insertelement <2 x i32> poison, i32 %963, i64 0, !dbg !69 + %983 = insertelement <2 x i32> %982, i32 %957, i64 1, !dbg !69 + %984 = insertelement <2 x i32> poison, i32 %967, i64 0, !dbg !69 + %985 = insertelement <2 x i32> %984, i32 %964, i64 1, !dbg !69 + %986 = add <2 x i32> %983, %985, !dbg !69 + %987 = mul nuw nsw i32 %956, %22, !dbg !26 + %988 = mul nuw nsw i32 %958, %22, !dbg !26 + %989 = mul nuw nsw i32 %960, %22, !dbg !26 + %990 = mul nuw nsw i32 %962, %22, !dbg !26 + %991 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !68 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %988, i32 1, i32 31), !dbg !68 + %993 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %989, i32 1, i32 31), !dbg !68 + %994 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %990, i32 1, i32 31), !dbg !68 + %995 = insertelement <2 x i32> poison, i32 %993, i64 0, !dbg !69 + %996 = insertelement <2 x i32> %995, i32 %992, i64 1, !dbg !69 + %997 = insertelement <2 x i32> poison, i32 %989, i64 0, !dbg !69 + %998 = insertelement <2 x i32> %997, i32 %988, i64 1, !dbg !69 + %999 = add <2 x i32> %996, %998, !dbg !69 + %1000 = insertelement <4 x i32> poison, i32 %991, i64 0, !dbg !69 + %1001 = insertelement <4 x i32> %1000, i32 %992, i64 1, !dbg !69 + %1002 = insertelement <4 x i32> %1001, i32 %993, i64 2, !dbg !69 + %1003 = insertelement <4 x i32> %1002, i32 %994, i64 3, !dbg !69 + %1004 = insertelement <4 x i32> poison, i32 %987, i64 0, !dbg !69 + %1005 = insertelement <4 x i32> %1004, i32 %988, i64 1, !dbg !69 + %1006 = insertelement <4 x i32> %1005, i32 %989, i64 2, !dbg !69 + %1007 = insertelement <4 x i32> %1006, i32 %990, i64 3, !dbg !69 + %1008 = add <4 x i32> %1003, %1007, !dbg !69 + %1009 = insertelement <2 x i32> poison, i32 %994, i64 0, !dbg !69 + %1010 = insertelement <2 x i32> %1009, i32 %991, i64 1, !dbg !69 + %1011 = insertelement <2 x i32> poison, i32 %990, i64 0, !dbg !69 + %1012 = insertelement <2 x i32> %1011, i32 %987, i64 1, !dbg !69 + %1013 = add <2 x i32> %1010, %1012, !dbg !69 + %1014 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1015 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1016 = shufflevector <4 x i1> %1015, <4 x i1> poison, <2 x i32> , !dbg !60 + %1017 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1018 = shufflevector <4 x i1> %1017, <4 x i1> poison, <2 x i32> , !dbg !60 + %1019 = icmp eq <4 x i32> %941, %955, !dbg !70 + %1020 = icmp sgt <4 x i32> %981, %1008, !dbg !71 + %1021 = and <4 x i1> %1019, %1020, !dbg !72 + %1022 = and <4 x i1> %1019, %1020, !dbg !72 + %1023 = shufflevector <4 x i1> %1022, <4 x i1> poison, <2 x i32> , !dbg !72 + %1024 = and <4 x i1> %1019, %1020, !dbg !72 + %1025 = shufflevector <4 x i1> %1024, <4 x i1> poison, <2 x i32> , !dbg !72 + %1026 = or <4 x i1> %1014, %1021, !dbg !73 + %1027 = or <2 x i1> %1016, %1023, !dbg !73 + %1028 = or <2 x i1> %1018, %1025, !dbg !73 + %1029 = shufflevector <2 x i1> %1018, <2 x i1> %1016, <2 x i32> , !dbg !73 + %1030 = shufflevector <2 x i1> %1025, <2 x i1> %1023, <2 x i32> , !dbg !73 + %1031 = or <2 x i1> %1029, %1030, !dbg !73 + %1032 = shufflevector <2 x i1> %1016, <2 x i1> %1018, <2 x i32> , !dbg !73 + %1033 = shufflevector <2 x i1> %1023, <2 x i1> %1025, <2 x i32> , !dbg !73 + %1034 = or <2 x i1> %1032, %1033, !dbg !73 + %1035 = xor <4 x i32> %941, %955, !dbg !62 + %1036 = shufflevector <4 x i32> %1035, <4 x i32> poison, <2 x i32> , !dbg !62 + %1037 = xor <4 x i32> %941, %955, !dbg !62 + %1038 = shufflevector <4 x i32> %1037, <4 x i32> poison, <2 x i32> , !dbg !62 + %1039 = shufflevector <2 x i1> %1034, <2 x i1> %1031, <2 x i32> , !dbg !63 + %1040 = shufflevector <2 x i32> %1036, <2 x i32> %1038, <2 x i32> , !dbg !63 + %1041 = select <2 x i1> %1039, <2 x i32> %1040, <2 x i32> zeroinitializer, !dbg !63 + %1042 = select <2 x i1> %1034, <2 x i32> %1036, <2 x i32> zeroinitializer, !dbg !63 + %1043 = select <2 x i1> %1031, <2 x i32> %1038, <2 x i32> zeroinitializer, !dbg !63 + %1044 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !63 + %1045 = shufflevector <2 x i32> %1038, <2 x i32> %1036, <2 x i32> , !dbg !63 + %1046 = select <2 x i1> %1044, <2 x i32> %1045, <2 x i32> zeroinitializer, !dbg !63 + %1047 = xor <2 x i32> %1041, %900, !dbg !64 + %1048 = xor <2 x i32> %1042, %901, !dbg !64 + %1049 = xor <2 x i32> %1043, %902, !dbg !64 + %1050 = xor <2 x i32> %1046, %905, !dbg !64 + %1051 = shufflevector <2 x i32> %1013, <2 x i32> %999, <4 x i32> , !dbg !67 + %1052 = shufflevector <2 x i32> %986, <2 x i32> %972, <4 x i32> , !dbg !67 + %1053 = xor <4 x i32> %1051, %1052, !dbg !67 + %1054 = xor <2 x i32> %999, %972, !dbg !67 + %1055 = xor <2 x i32> %1013, %986, !dbg !67 + %1056 = select <4 x i1> %1026, <4 x i32> %1053, <4 x i32> zeroinitializer, !dbg !65 + %1057 = select <2 x i1> %1027, <2 x i32> %1054, <2 x i32> zeroinitializer, !dbg !65 + %1058 = select <2 x i1> %1028, <2 x i32> %1055, <2 x i32> zeroinitializer, !dbg !65 + %1059 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1060 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1061 = select <2 x i1> %1059, <2 x i32> %1060, <2 x i32> zeroinitializer, !dbg !65 + %1062 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1063 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1064 = select <2 x i1> %1062, <2 x i32> %1063, <2 x i32> zeroinitializer, !dbg !65 + %1065 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1066 = select <2 x i1> %1031, <2 x i32> %1065, <2 x i32> zeroinitializer, !dbg !65 + %1067 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1068 = select <2 x i1> %1034, <2 x i32> %1067, <2 x i32> zeroinitializer, !dbg !65 + %1069 = xor <4 x i32> %914, %1056, !dbg !66 + %1070 = xor <2 x i32> %1061, %917, !dbg !66 + %1071 = xor <2 x i32> %1064, %920, !dbg !66 + %1072 = xor <2 x i32> %1066, %923, !dbg !66 + %1073 = xor <2 x i32> %1068, %926, !dbg !66 + %1074 = extractelement <2 x i32> %1048, i64 0, !dbg !60 + %1075 = extractelement <2 x i32> %1048, i64 1, !dbg !60 + %1076 = icmp slt i32 %1075, %1074, !dbg !60 + %1077 = extractelement <2 x i32> %1049, i64 0, !dbg !60 + %1078 = extractelement <2 x i32> %1049, i64 1, !dbg !60 + %1079 = icmp slt i32 %1078, %1077, !dbg !60 + %1080 = icmp eq i32 %1075, %1074, !dbg !70 + %1081 = icmp eq i32 %1078, %1077, !dbg !70 + %shift61 = shufflevector <2 x i32> %1073, <2 x i32> poison, <2 x i32> , !dbg !71 + %1082 = icmp sgt <2 x i32> %shift61, %1073, !dbg !71 + %1083 = extractelement <2 x i1> %1082, i64 0, !dbg !71 + %shift62 = shufflevector <2 x i32> %1072, <2 x i32> poison, <2 x i32> , !dbg !71 + %1084 = icmp sgt <2 x i32> %shift62, %1072, !dbg !71 + %1085 = extractelement <2 x i1> %1084, i64 0, !dbg !71 + %1086 = and i1 %1080, %1083, !dbg !72 + %1087 = and i1 %1081, %1085, !dbg !72 + %1088 = insertelement <2 x i1> poison, i1 %1076, i64 0, !dbg !73 + %1089 = insertelement <2 x i1> %1088, i1 %1079, i64 1, !dbg !73 + %1090 = insertelement <2 x i1> poison, i1 %1086, i64 0, !dbg !73 + %1091 = insertelement <2 x i1> %1090, i1 %1087, i64 1, !dbg !73 + %1092 = or <2 x i1> %1089, %1091, !dbg !73 + %1093 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1094 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1095 = xor <2 x i32> %1093, %1094, !dbg !62 + %1096 = select <2 x i1> %1092, <2 x i32> %1095, <2 x i32> zeroinitializer, !dbg !63 + %1097 = xor <2 x i32> %1096, %1047, !dbg !64 + %1098 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !64 + %1099 = xor <2 x i32> %1098, %1048, !dbg !64 + %1100 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1101 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1102 = xor <2 x i32> %1101, %1049, !dbg !64 + %1103 = xor <2 x i32> %1100, %1050, !dbg !64 + %1104 = xor <2 x i32> %1071, %1070, !dbg !67 + %1105 = shufflevector <2 x i1> %1092, <2 x i1> poison, <2 x i32> , !dbg !65 + %1106 = select <2 x i1> %1105, <2 x i32> %1104, <2 x i32> zeroinitializer, !dbg !65 + %1107 = shufflevector <2 x i32> %1106, <2 x i32> poison, <4 x i32> , !dbg !66 + %1108 = xor <4 x i32> %1069, %1107, !dbg !66 + %1109 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1110 = xor <2 x i32> %1057, %1109, !dbg !66 + %1111 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !66 + %1112 = xor <2 x i32> %1111, %1072, !dbg !66 + %1113 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1114 = xor <2 x i32> %1113, %1073, !dbg !66 + %1115 = icmp slt <2 x i32> %1099, %1102, !dbg !60 + %1116 = icmp eq <2 x i32> %1097, %1103, !dbg !70 + %1117 = icmp sgt <2 x i32> %1114, %1112, !dbg !71 + %1118 = and <2 x i1> %1116, %1117, !dbg !72 + %1119 = or <2 x i1> %1115, %1118, !dbg !73 + %1120 = xor <2 x i32> %1110, %1058, !dbg !67 + %1121 = xor <2 x i32> %1120, %814, !dbg !67 + %1122 = xor <2 x i32> %1121, %912, !dbg !67 + %1123 = xor <2 x i32> %1122, %815, !dbg !67 + %1124 = xor <2 x i32> %1123, %913, !dbg !67 + %1125 = xor <2 x i32> %1124, %1106, !dbg !67 + %1126 = select <2 x i1> %1119, <2 x i32> %1125, <2 x i32> zeroinitializer, !dbg !65 + %1127 = shufflevector <2 x i32> %1126, <2 x i32> poison, <4 x i32> , !dbg !65 + %1128 = xor <4 x i32> %1108, %1127, !dbg !66 + %1129 = xor <4 x i32> %1128, %739, !dbg !66 + %1130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %539, i32 2, i32 31), !dbg !58 + %1131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !58 + %1132 = insertelement <2 x i32> poison, i32 %1130, i64 0, !dbg !58 + %1133 = insertelement <2 x i32> %1132, i32 %1131, i64 1, !dbg !58 + %1134 = bitcast <2 x i32> %1133 to i64, !dbg !58 + %1135 = add i64 %538, %1134, !dbg !56 + %extelt.offset = lshr i64 %1135, 32, !dbg !58 + %1136 = trunc nuw i64 %extelt.offset to i32, !dbg !58 + %1137 = trunc i64 %1135 to i32, !dbg !58 + %1138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1137, i32 1, i32 31), !dbg !58 + %1139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1136, i32 1, i32 31), !dbg !58 + %1140 = insertelement <2 x i32> poison, i32 %1138, i64 0, !dbg !58 + %1141 = insertelement <2 x i32> %1140, i32 %1139, i64 1, !dbg !58 + %1142 = bitcast <2 x i32> %1141 to i64, !dbg !58 + %1143 = add i64 %1135, %1142, !dbg !56 + %1144 = insertelement <1 x i64> poison, i64 %1143, i64 0, !dbg !34 + store <1 x i64> %1144, ptr addrspace(3) %49, align 8, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !34 + %1145 = load i64, ptr addrspace(3) %51, align 8, !dbg !34 + %1146 = select <4 x i1> %47, <4 x i1> %540, <4 x i1> zeroinitializer, !dbg !74 + %1147 = bitcast <4 x i1> %1146 to i4, !dbg !75 + %1148 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %1147), !dbg !75 + %1149 = zext nneg i4 %1148 to i64, !dbg !75 + %1150 = zext nneg i4 %1148 to i32, !dbg !77 + %1151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1150, i32 2, i32 31), !dbg !77 + %1152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %1153 = insertelement <2 x i32> poison, i32 %1151, i64 0, !dbg !77 + %1154 = insertelement <2 x i32> %1153, i32 %1152, i64 1, !dbg !77 + %1155 = bitcast <2 x i32> %1154 to i64, !dbg !77 + %1156 = add i64 %1149, %1155, !dbg !75 + %extelt.offset34 = lshr i64 %1156, 32, !dbg !77 + %1157 = trunc nuw i64 %extelt.offset34 to i32, !dbg !77 + %1158 = trunc i64 %1156 to i32, !dbg !77 + %1159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1158, i32 1, i32 31), !dbg !77 + %1160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1157, i32 1, i32 31), !dbg !77 + %1161 = insertelement <2 x i32> poison, i32 %1159, i64 0, !dbg !77 + %1162 = insertelement <2 x i32> %1161, i32 %1160, i64 1, !dbg !77 + %1163 = bitcast <2 x i32> %1162 to i64, !dbg !77 + %1164 = add i64 %1156, %1163, !dbg !75 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1165 = insertelement <1 x i64> poison, i64 %1164, i64 0, !dbg !78 + store <1 x i64> %1165, ptr addrspace(3) %49, align 8, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1166 = load i64, ptr addrspace(3) %51, align 8, !dbg !78 + %1167 = trunc i64 %1143 to i32, !dbg !34 + %1168 = insertelement <4 x i32> poison, i32 %55, i64 0, !dbg !79 + %1169 = insertelement <4 x i32> %1168, i32 %54, i64 1, !dbg !79 + %1170 = insertelement <4 x i32> %1169, i32 %53, i64 2, !dbg !79 + %1171 = insertelement <4 x i32> %1170, i32 %52, i64 3, !dbg !79 + %1172 = insertelement <4 x i32> poison, i32 %1167, i64 0, !dbg !79 + %1173 = shufflevector <4 x i32> %1172, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %1174 = icmp slt <4 x i32> %1171, %1173, !dbg !79 + %1175 = select <4 x i1> %1174, <4 x i32> %534, <4 x i32> splat (i32 16), !dbg !80 + %1176 = add <4 x i32> %1175, splat (i32 17), !dbg !81 + %1177 = icmp slt <4 x i32> %1175, zeroinitializer, !dbg !82 + %1178 = select <4 x i1> %1177, <4 x i32> %1176, <4 x i32> %1175, !dbg !83 + %1179 = icmp ugt <4 x i32> %1178, splat (i32 16), !dbg !84 + %shift63 = shufflevector <4 x i1> %1179, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop64 = or <4 x i1> %shift63, %1179, !dbg !85 + %shift66 = shufflevector <4 x i1> %foldExtExtBinop64, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop67 = or <4 x i1> %1179, %shift66, !dbg !85 + %shift69 = shufflevector <4 x i1> %foldExtExtBinop67, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop70 = or <4 x i1> %1179, %shift69, !dbg !85 + %1180 = extractelement <4 x i1> %foldExtExtBinop70, i64 0, !dbg !85 + %1181 = and i1 %19, %1180, !dbg !85 + br i1 %1181, label %1182, label %1183, !dbg !85 + +1182: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !85 + unreachable, !dbg !85 + +1183: ; preds = %11 + %1184 = trunc i64 %1164 to i32, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !85 + %1185 = insertelement <4 x i32> poison, i32 %52, i64 0, !dbg !86 + %1186 = insertelement <4 x i32> %1185, i32 %53, i64 1, !dbg !86 + %1187 = insertelement <4 x i32> %1186, i32 %54, i64 2, !dbg !86 + %1188 = insertelement <4 x i32> %1187, i32 %55, i64 3, !dbg !86 + %1189 = insertelement <4 x i32> poison, i32 %1184, i64 0, !dbg !86 + %1190 = shufflevector <4 x i32> %1189, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %1191 = icmp slt <4 x i32> %1188, %1190, !dbg !86 + %1192 = select <4 x i1> %1191, <4 x i32> %1129, <4 x i32> splat (i32 16), !dbg !87 + %1193 = add <4 x i32> %1192, splat (i32 17), !dbg !88 + %1194 = icmp slt <4 x i32> %1192, zeroinitializer, !dbg !89 + %1195 = select <4 x i1> %1194, <4 x i32> %1193, <4 x i32> %1192, !dbg !90 + %1196 = icmp ugt <4 x i32> %1195, splat (i32 16), !dbg !91 + %1197 = bitcast <4 x i1> %1196 to i4, !dbg !92 + %1198 = icmp ne i4 %1197, 0, !dbg !92 + %1199 = and i1 %19, %1198, !dbg !92 + br i1 %1199, label %1200, label %1201, !dbg !92 + +1200: ; preds = %1183 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !92 + unreachable, !dbg !92 + +1201: ; preds = %1183 + %1202 = trunc i64 %1166 to i32, !dbg !78 + %1203 = trunc i64 %1145 to i32, !dbg !34 + %1204 = or disjoint i32 %13, %17, !dbg !13 + %1205 = icmp slt i32 %1204, 128, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !92 + %1206 = sext i32 %1204 to i64, !dbg !93 + %1207 = getelementptr i32, ptr addrspace(1) %1, i64 %1206, !dbg !93 + %1208 = and i32 %14, 96, !dbg !94 + %1209 = icmp eq i32 %1208, 0, !dbg !94 + %1210 = and i1 %1209, %1205, !dbg !94 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1203, ptr addrspace(1) %1207, i1 %1210) #6, !dbg !94 + %1211 = getelementptr i32, ptr addrspace(1) %2, i64 %1206, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1202, ptr addrspace(1) %1211, i1 %1210) #6, !dbg !96 + %1212 = getelementptr i32, ptr addrspace(1) %3, i64 %58, !dbg !97 + %1213 = extractelement <4 x i32> %534, i64 0, !dbg !98 + %1214 = extractelement <4 x i32> %534, i64 1, !dbg !98 + %1215 = extractelement <4 x i32> %534, i64 2, !dbg !98 + %1216 = extractelement <4 x i32> %534, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1216, i32 %1215, i32 %1214, i32 %1213, ptr addrspace(1) %1212, i1 %19) #6, !dbg !98 + %1217 = mul i32 %18, 17, !dbg !99 + %1218 = extractelement <4 x i32> %1178, i64 3, !dbg !100 + %1219 = add i32 %1218, %1217, !dbg !100 + %1220 = extractelement <4 x i32> %1178, i64 2, !dbg !100 + %1221 = add i32 %1220, %1217, !dbg !100 + %1222 = extractelement <4 x i32> %1178, i64 1, !dbg !100 + %1223 = add i32 %1222, %1217, !dbg !100 + %1224 = extractelement <4 x i32> %1178, i64 0, !dbg !100 + %1225 = add i32 %1224, %1217, !dbg !100 + %1226 = sext i32 %1219 to i64, !dbg !101 + %1227 = getelementptr i32, ptr addrspace(1) %4, i64 %1226, !dbg !101 + %1228 = sext i32 %1221 to i64, !dbg !101 + %1229 = getelementptr i32, ptr addrspace(1) %4, i64 %1228, !dbg !101 + %1230 = sext i32 %1223 to i64, !dbg !101 + %1231 = getelementptr i32, ptr addrspace(1) %4, i64 %1230, !dbg !101 + %1232 = sext i32 %1225 to i64, !dbg !101 + %1233 = getelementptr i32, ptr addrspace(1) %4, i64 %1232, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1234 = ptrtoint ptr addrspace(1) %1227 to i64, !dbg !102 + %1235 = ptrtoint ptr addrspace(1) %1229 to i64, !dbg !102 + %1236 = ptrtoint ptr addrspace(1) %1231 to i64, !dbg !102 + %1237 = ptrtoint ptr addrspace(1) %1233 to i64, !dbg !102 + %1238 = and i32 %14, 48, !dbg !102 + %1239 = shl nuw nsw i32 %1238, 6, !dbg !102 + %1240 = shl nuw nsw i32 %14, 3, !dbg !102 + %1241 = and i32 %1240, 120, !dbg !102 + %1242 = lshr exact i32 %1238, 1, !dbg !102 + %1243 = shl nuw nsw i32 %14, 1, !dbg !102 + %1244 = and i32 %1243, 128, !dbg !102 + %1245 = or disjoint i32 %1239, %1241, !dbg !102 + %1246 = xor i32 %1245, %1242, !dbg !102 + %1247 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1244, !dbg !102 + %1248 = getelementptr inbounds nuw i8, ptr addrspace(3) %1247, i32 %1246, !dbg !102 + %1249 = insertelement <1 x i64> poison, i64 %1234, i64 0, !dbg !102 + store <1 x i64> %1249, ptr addrspace(3) %1248, align 8, !dbg !102 + %1250 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 256, !dbg !102 + %1251 = insertelement <1 x i64> poison, i64 %1235, i64 0, !dbg !102 + store <1 x i64> %1251, ptr addrspace(3) %1250, align 8, !dbg !102 + %1252 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 512, !dbg !102 + %1253 = insertelement <1 x i64> poison, i64 %1236, i64 0, !dbg !102 + store <1 x i64> %1253, ptr addrspace(3) %1252, align 8, !dbg !102 + %1254 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 768, !dbg !102 + %1255 = insertelement <1 x i64> poison, i64 %1237, i64 0, !dbg !102 + store <1 x i64> %1255, ptr addrspace(3) %1254, align 8, !dbg !102 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1256 = and i32 %14, 12, !dbg !102 + %1257 = shl nuw nsw i32 %1256, 8, !dbg !102 + %1258 = shl nuw nsw i32 %20, 5, !dbg !102 + %1259 = and i32 %1240, 896, !dbg !102 + %1260 = shl nuw nsw i32 %1256, 1, !dbg !102 + %1261 = or disjoint i32 %1257, %1258, !dbg !102 + %1262 = or disjoint i32 %1261, %1259, !dbg !102 + %1263 = or disjoint i32 %1262, %1260, !dbg !102 + %1264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1263, !dbg !102 + %1265 = load i64, ptr addrspace(3) %1264, align 8, !dbg !102 + %1266 = xor i32 %1263, 8, !dbg !102 + %1267 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1266, !dbg !102 + %1268 = load i64, ptr addrspace(3) %1267, align 8, !dbg !102 + %1269 = xor i32 %1263, 16, !dbg !102 + %1270 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1269, !dbg !102 + %1271 = load i64, ptr addrspace(3) %1270, align 8, !dbg !102 + %1272 = xor i32 %1263, 24, !dbg !102 + %1273 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1272, !dbg !102 + %1274 = load i64, ptr addrspace(3) %1273, align 8, !dbg !102 + %1275 = inttoptr i64 %1265 to ptr addrspace(1), !dbg !102 + %1276 = inttoptr i64 %1268 to ptr addrspace(1), !dbg !102 + %1277 = inttoptr i64 %1271 to ptr addrspace(1), !dbg !102 + %1278 = inttoptr i64 %1274 to ptr addrspace(1), !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1275, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1276, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1277, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1278, i1 %1205) #6, !dbg !102 + %1279 = getelementptr i32, ptr addrspace(1) %5, i64 %58, !dbg !103 + %1280 = extractelement <4 x i32> %1129, i64 0, !dbg !104 + %1281 = extractelement <4 x i32> %1129, i64 1, !dbg !104 + %1282 = extractelement <4 x i32> %1129, i64 2, !dbg !104 + %1283 = extractelement <4 x i32> %1129, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1280, i32 %1281, i32 %1282, i32 %1283, ptr addrspace(1) %1279, i1 %19) #6, !dbg !104 + %1284 = extractelement <4 x i32> %1195, i64 0, !dbg !105 + %1285 = add i32 %1284, %1217, !dbg !105 + %1286 = extractelement <4 x i32> %1195, i64 1, !dbg !105 + %1287 = add i32 %1286, %1217, !dbg !105 + %1288 = extractelement <4 x i32> %1195, i64 2, !dbg !105 + %1289 = add i32 %1288, %1217, !dbg !105 + %1290 = extractelement <4 x i32> %1195, i64 3, !dbg !105 + %1291 = add i32 %1290, %1217, !dbg !105 + %1292 = sext i32 %1285 to i64, !dbg !106 + %1293 = getelementptr i32, ptr addrspace(1) %6, i64 %1292, !dbg !106 + %1294 = sext i32 %1287 to i64, !dbg !106 + %1295 = getelementptr i32, ptr addrspace(1) %6, i64 %1294, !dbg !106 + %1296 = sext i32 %1289 to i64, !dbg !106 + %1297 = getelementptr i32, ptr addrspace(1) %6, i64 %1296, !dbg !106 + %1298 = sext i32 %1291 to i64, !dbg !106 + %1299 = getelementptr i32, ptr addrspace(1) %6, i64 %1298, !dbg !106 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1300 = ptrtoint ptr addrspace(1) %1293 to i64, !dbg !107 + %1301 = ptrtoint ptr addrspace(1) %1295 to i64, !dbg !107 + %1302 = ptrtoint ptr addrspace(1) %1297 to i64, !dbg !107 + %1303 = ptrtoint ptr addrspace(1) %1299 to i64, !dbg !107 + %1304 = insertelement <1 x i64> poison, i64 %1300, i64 0, !dbg !107 + store <1 x i64> %1304, ptr addrspace(3) %1248, align 8, !dbg !107 + %1305 = insertelement <1 x i64> poison, i64 %1301, i64 0, !dbg !107 + store <1 x i64> %1305, ptr addrspace(3) %1250, align 8, !dbg !107 + %1306 = insertelement <1 x i64> poison, i64 %1302, i64 0, !dbg !107 + store <1 x i64> %1306, ptr addrspace(3) %1252, align 8, !dbg !107 + %1307 = insertelement <1 x i64> poison, i64 %1303, i64 0, !dbg !107 + store <1 x i64> %1307, ptr addrspace(3) %1254, align 8, !dbg !107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1308 = load i64, ptr addrspace(3) %1264, align 8, !dbg !107 + %1309 = load i64, ptr addrspace(3) %1267, align 8, !dbg !107 + %1310 = load i64, ptr addrspace(3) %1270, align 8, !dbg !107 + %1311 = load i64, ptr addrspace(3) %1273, align 8, !dbg !107 + %1312 = inttoptr i64 %1308 to ptr addrspace(1), !dbg !107 + %1313 = inttoptr i64 %1309 to ptr addrspace(1), !dbg !107 + %1314 = inttoptr i64 %1310 to ptr addrspace(1), !dbg !107 + %1315 = inttoptr i64 %1311 to ptr addrspace(1), !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1312, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1313, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1314, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1315, i1 %1205) #6, !dbg !107 + ret void, !dbg !108 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i4 @llvm.ctpop.i4(i4) #5 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 627, column: 44, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !9, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 46, column: 71, scope: !9) +!21 = !DILocation(line: 537, column: 21, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !24) +!24 = !DILocation(line: 51, column: 71, scope: !9) +!25 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !24) +!26 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !24) +!27 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !24) +!28 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !24) +!29 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 54, column: 35, scope: !9) +!34 = !DILocation(line: 60, column: 21, scope: !9) +!35 = !DILocation(line: 34, column: 37, scope: !9) +!36 = !DILocation(line: 34, column: 30, scope: !9) +!37 = !DILocation(line: 34, column: 45, scope: !9) +!38 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !20) +!39 = !DILocation(line: 39, column: 18, scope: !9) +!40 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !20) +!41 = !DILocation(line: 0, scope: !9) +!42 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !20) +!43 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !20) +!44 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !20) +!45 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !20) +!46 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !20) +!47 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !20) +!48 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !20) +!49 = distinct !DILexicalBlockFile(scope: !9, file: !50, discriminator: 0) +!50 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!51 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !20) +!52 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !20) +!53 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !20) +!54 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !20) +!55 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !20) +!56 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !57) +!57 = !DILocation(line: 55, column: 26, scope: !9) +!58 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !57) +!59 = !DILocation(line: 47, column: 20, scope: !9) +!60 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !24) +!61 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !24) +!62 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !24) +!63 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !24) +!64 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !24) +!65 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !24) +!66 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !24) +!67 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !24) +!68 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !24) +!69 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !24) +!70 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !24) +!71 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !24) +!72 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !24) +!73 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !24) +!74 = !DILocation(line: 58, column: 35, scope: !9) +!75 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !76) +!76 = !DILocation(line: 59, column: 26, scope: !9) +!77 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !76) +!78 = !DILocation(line: 61, column: 21, scope: !9) +!79 = !DILocation(line: 64, column: 19, scope: !9) +!80 = !DILocation(line: 66, column: 35, scope: !9) +!81 = !DILocation(line: 68, column: 20, scope: !9) +!82 = !DILocation(line: 69, column: 20, scope: !9) +!83 = !DILocation(line: 70, column: 35, scope: !9) +!84 = !DILocation(line: 71, column: 38, scope: !9) +!85 = !DILocation(line: 71, column: 63, scope: !9) +!86 = !DILocation(line: 75, column: 19, scope: !9) +!87 = !DILocation(line: 76, column: 35, scope: !9) +!88 = !DILocation(line: 77, column: 20, scope: !9) +!89 = !DILocation(line: 78, column: 20, scope: !9) +!90 = !DILocation(line: 79, column: 35, scope: !9) +!91 = !DILocation(line: 80, column: 38, scope: !9) +!92 = !DILocation(line: 80, column: 63, scope: !9) +!93 = !DILocation(line: 81, column: 25, scope: !9) +!94 = !DILocation(line: 81, column: 37, scope: !9) +!95 = !DILocation(line: 82, column: 25, scope: !9) +!96 = !DILocation(line: 82, column: 37, scope: !9) +!97 = !DILocation(line: 83, column: 25, scope: !9) +!98 = !DILocation(line: 83, column: 47, scope: !9) +!99 = !DILocation(line: 84, column: 52, scope: !9) +!100 = !DILocation(line: 84, column: 49, scope: !9) +!101 = !DILocation(line: 84, column: 25, scope: !9) +!102 = !DILocation(line: 84, column: 85, scope: !9) +!103 = !DILocation(line: 85, column: 25, scope: !9) +!104 = !DILocation(line: 85, column: 47, scope: !9) +!105 = !DILocation(line: 86, column: 49, scope: !9) +!106 = !DILocation(line: 86, column: 25, scope: !9) +!107 = !DILocation(line: 86, column: 85, scope: !9) +!108 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c36f7f25142abf018d2dbe6a4abab1c3768d56e0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,2103 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<324>; + .reg .b16 %rs<37>; + .reg .b32 %r<812>; + .reg .b64 %rd<82>; + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 + +// %bb.0: + ld.param.b64 %rd17, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:28 + mov.u32 %r26, %ctaid.x; + .loc 1 24 33 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:33 + shl.b32 %r1, %r26, 5; + .loc 1 25 44 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r27, %r2, 124; + bfe.u32 %r28, %r2, 2, 5; + and.b32 %r3, %r2, 31; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r4, %r28, %r1; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.gt.s32 %p3, %r4, 127; + setp.lt.s32 %p2, %r4, 128; + .loc 1 27 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:27:38 + and.b32 %r5, %r2, 3; + .loc 1 34 40 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:40 + shl.b32 %r29, %r4, 4; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r30, %r2, 1; + shr.u32 %r31, %r2, 1; + bfe.u32 %r32, %r2, 1, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r33, %r30, 1; + xor.b32 %r34, %r32, 1; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ne.b32 %p4, %r30, 0; + and.b32 %r35, %r31, 1; + setp.ne.b32 %p5, %r35, 0; +$L__tmp2: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + shl.b32 %r36, %r27, 1; + mov.b32 %r37, global_smem; + add.s32 %r38, %r37, %r36; + shl.b32 %r39, %r3, 3; + add.s32 %r40, %r37, %r39; + .loc 1 27 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:27:38 + shl.b32 %r6, %r5, 2; + or.b32 %r7, %r6, 1; + or.b32 %r8, %r6, 2; + or.b32 %r9, %r6, 3; + .loc 1 34 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:37 + or.b32 %r41, %r29, %r6; + .loc 1 34 30 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:30 + mad.wide.s32 %rd13, %r41, 8, %rd17; + cvt.s64.s32 %rd18, %r29; + cvt.u64.u32 %rd19, %r6; + or.b64 %rd20, %rd18, %rd19; + shl.b64 %rd21, %rd20, 3; + add.s64 %rd22, %rd17, %rd21; + add.s64 %rd16, %rd22, 16; + .loc 1 34 45 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p2 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd14, 0x0; + mov.u64 %rd15, 0x0; + @%p2 ld.global.v2.b64 { %rd14, %rd15 }, [ %rd16 + 0 ]; + // end inline asm +$L__tmp3: + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r42, %r7, %r6; + xor.b32 %r43, %r8, %r9; +$L__tmp4: + .loc 1 39 18 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:39:18 + add.s64 %rd23, %rd12, -1; + add.s64 %rd24, %rd14, -1; + add.s64 %rd25, %rd11, -1; + add.s64 %rd26, %rd15, -1; + setp.gt.u64 %p6, %rd26, 16382; + setp.gt.u64 %p7, %rd25, 16382; + setp.lt.u64 %p8, %rd26, 16383; + setp.lt.u64 %p9, %rd24, 16383; + setp.lt.u64 %p10, %rd23, 16383; + setp.lt.u64 %p11, %rd25, 16383; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r44, 1, 0, %p11; + selp.b32 %r45, 1, 0, %p10; + selp.b32 %r46, 1, 0, %p9; + selp.b32 %r47, 1, 0, %p8; +$L__tmp5: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p12, %p10, %p7; + .loc 2 599 19 // triton_helpers.py:599:19 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p13, %p9, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r48, %r44, %r45; + xor.b32 %r49, %r46, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r50, %r48, 0, %p12; + selp.b32 %r51, %r49, 0, %p13; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r52, %r50, %r44; + xor.b32 %r53, %r50, %r45; + xor.b32 %r54, %r51, %r46; + xor.b32 %r55, %r51, %r47; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r56, %r42, 0, %p12; + selp.b32 %r57, %r43, 0, %p13; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r58, %r56, %r6; + xor.b32 %r59, %r56, %r7; + xor.b32 %r60, %r57, %r8; + xor.b32 %r61, %r57, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p14, %r52, %r54; + setp.ge.u32 %p15, %r53, %r55; + setp.ne.b32 %p16, %r53, %r55; + setp.ne.b32 %p17, %r54, %r52; + setp.le.u32 %p18, %r59, %r61; + setp.le.u32 %p19, %r58, %r60; + or.pred %p20, %p19, %p17; + or.pred %p21, %p18, %p16; + and.pred %p22, %p15, %p21; + and.pred %p23, %p14, %p20; + xor.pred %p24, %p23, %p4; + xor.pred %p25, %p22, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r62, %r52, %r54; + xor.b32 %r63, %r55, %r53; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r64, 0, %r63, %p25; + selp.b32 %r65, 0, %r62, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r66, %r65, %r52; + xor.b32 %r67, %r64, %r53; + xor.b32 %r68, %r64, %r55; + xor.b32 %r69, %r65, %r54; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r70, %r60, %r58; + xor.b32 %r71, %r61, %r59; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r72, 0, %r70, %p24; + selp.b32 %r73, 0, %r71, %p25; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r74, %r72, %r58; + xor.b32 %r75, %r73, %r59; + xor.b32 %r76, %r72, %r60; + xor.b32 %r77, %r73, %r61; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.u32 %p26, %r66, %r67; + setp.ne.b32 %p27, %r66, %r67; + setp.le.u32 %p28, %r74, %r75; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p4; + setp.ge.u32 %p32, %r69, %r68; + setp.ne.b32 %p33, %r69, %r68; + setp.le.u32 %p34, %r76, %r77; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r78, %r66, %r67; + xor.b32 %r79, %r69, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r80, 0, %r78, %p31; + selp.b32 %r81, 0, %r79, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r82, %r80, %r66; + xor.b32 %r83, %r80, %r67; + xor.b32 %r84, %r81, %r69; + xor.b32 %r85, %r81, %r68; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r86, %r74, %r75; + xor.b32 %r87, %r76, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r88, 0, %r86, %p31; + selp.b32 %r89, 0, %r87, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r90, %r88, %r74; + xor.b32 %r91, %r88, %r75; + xor.b32 %r92, %r89, %r76; + xor.b32 %r93, %r89, %r77; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r94, %r82, %r33; + mul.lo.s32 %r95, %r83, %r33; + mul.lo.s32 %r96, %r84, %r33; + mul.lo.s32 %r97, %r85, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r94, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r99, %r94, %r98; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r101, %r95, %r100; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r102, %r96, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r103, %r96, %r102; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r105, %r97, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r106, %r82, %r30; + mul.lo.s32 %r107, %r83, %r30; + mul.lo.s32 %r108, %r84, %r30; + mul.lo.s32 %r109, %r85, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r106, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r111, %r106, %r110; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r112, %r107, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r113, %r107, %r112; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r114, %r108, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r115, %r108, %r114; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r116, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r117, %r109, %r116; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r118, %r90, %r33; + mul.lo.s32 %r119, %r91, %r33; + mul.lo.s32 %r120, %r92, %r33; + mul.lo.s32 %r121, %r93, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r118, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r123, %r118, %r122; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r125, %r119, %r124; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r126, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r127, %r120, %r126; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r129, %r121, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r130, %r90, %r30; + mul.lo.s32 %r131, %r91, %r30; + mul.lo.s32 %r132, %r92, %r30; + mul.lo.s32 %r133, %r93, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r135, %r130, %r134; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r131, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r137, %r131, %r136; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r138, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r139, %r132, %r138; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r141, %r133, %r140; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p38, %r99, %r111; + setp.ne.b32 %p39, %r99, %r111; + setp.le.s32 %p40, %r123, %r135; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p5; + setp.ge.s32 %p44, %r101, %r113; + setp.ne.b32 %p45, %r101, %r113; + setp.le.s32 %p46, %r125, %r137; + or.pred %p47, %p45, %p46; + and.pred %p48, %p44, %p47; + xor.pred %p49, %p48, %p5; + setp.ge.s32 %p50, %r103, %r115; + setp.ne.b32 %p51, %r103, %r115; + setp.le.s32 %p52, %r127, %r139; + or.pred %p53, %p51, %p52; + and.pred %p54, %p50, %p53; + xor.pred %p55, %p54, %p5; + setp.ge.s32 %p56, %r105, %r117; + setp.ne.b32 %p57, %r105, %r117; + setp.le.s32 %p58, %r129, %r141; + or.pred %p59, %p57, %p58; + and.pred %p60, %p56, %p59; + xor.pred %p61, %p60, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r142, %r99, %r111; + xor.b32 %r143, %r101, %r113; + xor.b32 %r144, %r103, %r115; + xor.b32 %r145, %r105, %r117; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r146, 0, %r142, %p43; + selp.b32 %r147, 0, %r143, %p49; + selp.b32 %r148, 0, %r144, %p55; + selp.b32 %r149, 0, %r145, %p61; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r150, %r146, %r82; + xor.b32 %r151, %r147, %r83; + xor.b32 %r152, %r148, %r84; + xor.b32 %r153, %r149, %r85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r154, %r123, %r135; + xor.b32 %r155, %r125, %r137; + xor.b32 %r156, %r127, %r139; + xor.b32 %r157, %r129, %r141; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r158, 0, %r154, %p43; + selp.b32 %r159, 0, %r155, %p49; + selp.b32 %r160, 0, %r156, %p55; + selp.b32 %r161, 0, %r157, %p61; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r162, %r158, %r90; + xor.b32 %r163, %r159, %r91; + xor.b32 %r164, %r160, %r92; + xor.b32 %r165, %r161, %r93; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p62, %r150, %r152; + setp.ne.b32 %p63, %r150, %r152; + setp.le.s32 %p64, %r162, %r164; + or.pred %p65, %p63, %p64; + and.pred %p66, %p62, %p65; + xor.pred %p67, %p66, %p5; + setp.ge.s32 %p68, %r151, %r153; + setp.ne.b32 %p69, %r151, %r153; + setp.le.s32 %p70, %r163, %r165; + or.pred %p71, %p69, %p70; + and.pred %p72, %p68, %p71; + xor.pred %p73, %p72, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r166, %r150, %r152; + xor.b32 %r167, %r151, %r153; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r168, 0, %r166, %p67; + selp.b32 %r169, 0, %r167, %p73; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r170, %r168, %r150; + xor.b32 %r171, %r169, %r151; + xor.b32 %r172, %r168, %r152; + xor.b32 %r173, %r169, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r174, %r162, %r164; + xor.b32 %r175, %r163, %r165; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r176, 0, %r174, %p67; + selp.b32 %r177, 0, %r175, %p73; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r178, %r176, %r162; + xor.b32 %r179, %r177, %r163; + xor.b32 %r180, %r176, %r164; + xor.b32 %r181, %r177, %r165; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p74, %r170, %r171; + setp.ne.b32 %p75, %r170, %r171; + setp.le.s32 %p76, %r178, %r179; + or.pred %p77, %p75, %p76; + and.pred %p78, %p74, %p77; + xor.pred %p79, %p78, %p5; + setp.ge.s32 %p80, %r172, %r173; + setp.ne.b32 %p81, %r172, %r173; + setp.le.s32 %p82, %r180, %r181; + or.pred %p83, %p81, %p82; + and.pred %p84, %p80, %p83; + xor.pred %p85, %p84, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r182, %r170, %r171; + xor.b32 %r183, %r172, %r173; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r184, 0, %r182, %p79; + selp.b32 %r185, 0, %r183, %p85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r186, %r178, %r179; + xor.b32 %r187, %r180, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r188, 0, %r186, %p79; + selp.b32 %r189, 0, %r187, %p85; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r190, %r188, %r178; + xor.b32 %r191, %r188, %r179; + xor.b32 %r192, %r189, %r180; + xor.b32 %r193, %r189, %r181; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r194, %r190, %r34; + mul.lo.s32 %r195, %r191, %r34; + mul.lo.s32 %r196, %r192, %r34; + mul.lo.s32 %r197, %r193, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r198, %r190, %r32; + mul.lo.s32 %r199, %r191, %r32; + mul.lo.s32 %r200, %r192, %r32; + mul.lo.s32 %r201, %r193, %r32; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r202, %r184, %r171; + xor.b32 %r203, %r184, %r170; + xor.b32 %r204, %r185, %r173; + xor.b32 %r205, %r185, %r172; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r206, %r203, %r34; + mul.lo.s32 %r207, %r202, %r34; + mul.lo.s32 %r208, %r205, %r34; + mul.lo.s32 %r209, %r204, %r34; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r210, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r211, %r206, %r210; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r212, %r207, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r213, %r207, %r212; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r208, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r215, %r208, %r214; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r216, %r209, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r217, %r209, %r216; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r218, %r203, %r32; + mul.lo.s32 %r219, %r202, %r32; + mul.lo.s32 %r220, %r205, %r32; + mul.lo.s32 %r221, %r204, %r32; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r222, %r218, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r223, %r218, %r222; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r224, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r225, %r219, %r224; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r226, %r220, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r227, %r220, %r226; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r228, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r229, %r221, %r228; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r230, %r194, 2, 31, -1; + shfl.sync.bfly.b32 %r231, %r195, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r232, %r195, %r231; + add.s32 %r233, %r194, %r230; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r234, %r196, 2, 31, -1; + shfl.sync.bfly.b32 %r235, %r197, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r236, %r197, %r235; + add.s32 %r237, %r196, %r234; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r239, %r199, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r240, %r199, %r239; + add.s32 %r241, %r198, %r238; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r242, %r200, 2, 31, -1; + shfl.sync.bfly.b32 %r243, %r201, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r244, %r201, %r243; + add.s32 %r245, %r200, %r242; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p86, %r213, %r225; + setp.lt.s32 %p87, %r211, %r223; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p88, %r211, %r223; + setp.eq.b32 %p89, %r213, %r225; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p90, %r233, %r241; + setp.gt.s32 %p91, %r232, %r240; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p92, %p89, %p91; + and.pred %p93, %p88, %p90; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p94, %p87, %p93; + or.pred %p95, %p86, %p92; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p96, %r217, %r229; + setp.lt.s32 %p97, %r215, %r227; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p98, %r215, %r227; + setp.eq.b32 %p99, %r217, %r229; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p100, %r237, %r245; + setp.gt.s32 %p101, %r236, %r244; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p102, %p99, %p101; + and.pred %p103, %p98, %p100; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p104, %p97, %p103; + or.pred %p105, %p96, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r246, %r211, %r223; + xor.b32 %r247, %r213, %r225; + xor.b32 %r248, %r215, %r227; + xor.b32 %r249, %r217, %r229; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r250, %r247, 0, %p95; + selp.b32 %r251, %r246, 0, %p94; + selp.b32 %r252, %r249, 0, %p105; + selp.b32 %r253, %r248, 0, %p104; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r254, %r251, %r203; + xor.b32 %r255, %r250, %r202; + xor.b32 %r256, %r253, %r205; + xor.b32 %r257, %r252, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r258, %r233, %r241; + xor.b32 %r259, %r232, %r240; + xor.b32 %r260, %r237, %r245; + xor.b32 %r261, %r236, %r244; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r262, %r259, 0, %p95; + selp.b32 %r263, %r258, 0, %p94; + selp.b32 %r264, %r261, 0, %p105; + selp.b32 %r265, %r260, 0, %p104; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r266, %r263, %r190; + xor.b32 %r267, %r262, %r191; + xor.b32 %r268, %r265, %r192; + xor.b32 %r269, %r264, %r193; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r270, %r255, %r33; + mul.lo.s32 %r271, %r254, %r33; + mul.lo.s32 %r272, %r257, %r33; + mul.lo.s32 %r273, %r256, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r274, %r271, 1, 31, -1; + shfl.sync.bfly.b32 %r275, %r270, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r276, %r271, %r274; + add.s32 %r277, %r270, %r275; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r278, %r273, 1, 31, -1; + shfl.sync.bfly.b32 %r279, %r272, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r280, %r273, %r278; + add.s32 %r281, %r272, %r279; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r282, %r255, %r30; + mul.lo.s32 %r283, %r254, %r30; + mul.lo.s32 %r284, %r257, %r30; + mul.lo.s32 %r285, %r256, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r286, %r283, 1, 31, -1; + shfl.sync.bfly.b32 %r287, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r288, %r283, %r286; + add.s32 %r289, %r282, %r287; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r290, %r285, 1, 31, -1; + shfl.sync.bfly.b32 %r291, %r284, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r292, %r285, %r290; + add.s32 %r293, %r284, %r291; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r294, %r267, %r33; + mul.lo.s32 %r295, %r266, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r296, %r295, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r297, %r269, %r33; + mul.lo.s32 %r298, %r268, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r299, %r294, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r300, %r294, %r299; + add.s32 %r301, %r295, %r296; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r302, %r269, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r303, %r298, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r304, %r268, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r305, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r306, %r297, %r305; + add.s32 %r307, %r298, %r303; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r308, %r267, %r30; + mul.lo.s32 %r309, %r266, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1; + shfl.sync.bfly.b32 %r311, %r308, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r312, %r308, %r311; + add.s32 %r313, %r309, %r310; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r314, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r315, %r304, %r314; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r316, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r317, %r316, %r302; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p106, %r277, %r289; + setp.lt.s32 %p107, %r276, %r288; + setp.lt.s32 %p108, %r281, %r293; + setp.lt.s32 %p109, %r280, %r292; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p110, %r276, %r288; + setp.eq.b32 %p111, %r277, %r289; + setp.eq.b32 %p112, %r280, %r292; + setp.eq.b32 %p113, %r281, %r293; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p114, %r301, %r313; + setp.gt.s32 %p115, %r300, %r312; + setp.gt.s32 %p116, %r307, %r315; + setp.gt.s32 %p117, %r306, %r317; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p118, %p111, %p115; + and.pred %p119, %p110, %p114; + and.pred %p120, %p113, %p117; + and.pred %p121, %p112, %p116; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p122, %p107, %p119; + or.pred %p123, %p106, %p118; + or.pred %p124, %p109, %p121; + or.pred %p125, %p108, %p120; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r318, %r276, %r288; + xor.b32 %r319, %r277, %r289; + xor.b32 %r320, %r280, %r292; + xor.b32 %r321, %r281, %r293; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r322, %r319, 0, %p123; + selp.b32 %r323, %r318, 0, %p122; + selp.b32 %r324, %r321, 0, %p125; + selp.b32 %r325, %r320, 0, %p124; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r326, %r323, %r254; + xor.b32 %r327, %r322, %r255; + xor.b32 %r328, %r325, %r256; + xor.b32 %r329, %r324, %r257; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r330, %r300, %r312; + xor.b32 %r331, %r301, %r313; + xor.b32 %r332, %r317, %r306; + xor.b32 %r333, %r307, %r315; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r334, %r331, 0, %p122; + selp.b32 %r335, %r330, 0, %p123; + selp.b32 %r336, %r333, 0, %p124; + selp.b32 %r337, %r332, 0, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r338, %r335, %r267; + xor.b32 %r339, %r334, %r266; + xor.b32 %r340, %r337, %r269; + xor.b32 %r341, %r336, %r268; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p126, %r327, %r329; + setp.lt.s32 %p127, %r326, %r328; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p128, %r326, %r328; + setp.eq.b32 %p129, %r327, %r329; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p130, %r339, %r341; + setp.gt.s32 %p131, %r338, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p132, %p129, %p131; + and.pred %p133, %p128, %p130; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p134, %p127, %p133; + or.pred %p135, %p126, %p132; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r342, %r326, %r328; + xor.b32 %r343, %r327, %r329; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r344, %r343, 0, %p135; + selp.b32 %r345, %r342, 0, %p134; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r346, %r345, %r328; + xor.b32 %r347, %r344, %r327; + xor.b32 %r348, %r345, %r326; + xor.b32 %r349, %r344, %r329; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r350, %r339, %r341; + xor.b32 %r351, %r338, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r352, %r351, 0, %p135; + selp.b32 %r353, %r350, 0, %p134; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r354, %r353, %r339; + xor.b32 %r355, %r352, %r338; + xor.b32 %r356, %r353, %r341; + xor.b32 %r357, %r352, %r340; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p136, %r348, %r347; + setp.lt.s32 %p137, %r346, %r349; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p138, %r348, %r347; + setp.eq.b32 %p139, %r349, %r346; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p140, %r354, %r355; + setp.gt.s32 %p141, %r356, %r357; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r358, %r355, %r354; + xor.b32 %r359, %r357, %r356; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r360, %r359, 0, %p141; + selp.b32 %r361, %r360, 0, %p139; + selp.b32 %r362, %r359, %r361, %p137; + selp.b32 %r363, %r358, 0, %p140; + selp.b32 %r364, %r363, 0, %p138; + selp.b32 %r365, %r358, %r364, %p136; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r12, %r365, %r355; + xor.b32 %r13, %r365, %r354; + xor.b32 %r11, %r362, %r356; + xor.b32 %r10, %r362, %r357; +$L__tmp6: + .loc 1 54 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:54:35 + and.pred %p142, %p2, %p9; + selp.b16 %rs1, 1, 0, %p142; + shl.b16 %rs2, %rs1, 2; + and.pred %p143, %p2, %p8; + selp.b16 %rs3, -1, 0, %p143; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + and.pred %p144, %p2, %p11; + selp.b16 %rs6, 1, 0, %p144; + and.pred %p145, %p2, %p10; + selp.b16 %rs7, -1, 0, %p145; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + and.b16 %rs12, %rs11, 15; + cvt.u32.u16 %r366, %rs12; + popc.b32 %r367, %r366; + cvt.u64.u32 %rd27, %r367; +$L__tmp8: + .loc 1 47 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:47:20 + setp.ne.b64 %p146, %rd15, 16384; + setp.ne.b64 %p147, %rd11, 16384; + setp.eq.b64 %p148, %rd15, 16384; + setp.eq.b64 %p149, %rd14, 16384; + setp.eq.b64 %p150, %rd12, 16384; + setp.eq.b64 %p151, %rd11, 16384; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r368, 1, 0, %p151; + selp.b32 %r369, 1, 0, %p150; + selp.b32 %r370, 1, 0, %p149; + selp.b32 %r371, 1, 0, %p148; +$L__tmp9: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p152, %p150, %p147; + .loc 2 599 19 // triton_helpers.py:599:19 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p153, %p149, %p146; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r372, %r368, %r369; + xor.b32 %r373, %r370, %r371; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r374, %r372, 0, %p152; + selp.b32 %r375, %r373, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r376, %r374, %r368; + xor.b32 %r377, %r374, %r369; + xor.b32 %r378, %r375, %r370; + xor.b32 %r379, %r375, %r371; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r380, %r42, 0, %p152; + selp.b32 %r381, %r43, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r382, %r380, %r6; + xor.b32 %r383, %r380, %r7; + xor.b32 %r384, %r381, %r8; + xor.b32 %r385, %r381, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.u32 %p154, %r376, %r378; + setp.ge.u32 %p155, %r377, %r379; + setp.ne.b32 %p156, %r377, %r379; + setp.ne.b32 %p157, %r378, %r376; + setp.le.u32 %p158, %r383, %r385; + setp.le.u32 %p159, %r382, %r384; + or.pred %p160, %p159, %p157; + or.pred %p161, %p158, %p156; + and.pred %p162, %p155, %p161; + and.pred %p163, %p154, %p160; + xor.pred %p164, %p163, %p4; + xor.pred %p165, %p162, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r386, %r376, %r378; + xor.b32 %r387, %r379, %r377; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r388, 0, %r387, %p165; + selp.b32 %r389, 0, %r386, %p164; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r390, %r389, %r376; + xor.b32 %r391, %r388, %r377; + xor.b32 %r392, %r388, %r379; + xor.b32 %r393, %r389, %r378; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r394, %r384, %r382; + xor.b32 %r395, %r385, %r383; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r396, 0, %r394, %p164; + selp.b32 %r397, 0, %r395, %p165; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r398, %r396, %r382; + xor.b32 %r399, %r397, %r383; + xor.b32 %r400, %r396, %r384; + xor.b32 %r401, %r397, %r385; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.u32 %p166, %r390, %r391; + setp.ne.b32 %p167, %r390, %r391; + setp.le.u32 %p168, %r398, %r399; + or.pred %p169, %p167, %p168; + and.pred %p170, %p166, %p169; + xor.pred %p171, %p170, %p4; + setp.ge.u32 %p172, %r393, %r392; + setp.ne.b32 %p173, %r393, %r392; + setp.le.u32 %p174, %r400, %r401; + or.pred %p175, %p173, %p174; + and.pred %p176, %p172, %p175; + xor.pred %p177, %p176, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r402, %r390, %r391; + xor.b32 %r403, %r393, %r392; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r404, 0, %r402, %p171; + selp.b32 %r405, 0, %r403, %p177; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r406, %r404, %r390; + xor.b32 %r407, %r404, %r391; + xor.b32 %r408, %r405, %r393; + xor.b32 %r409, %r405, %r392; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r410, %r398, %r399; + xor.b32 %r411, %r400, %r401; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r412, 0, %r410, %p171; + selp.b32 %r413, 0, %r411, %p177; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r414, %r406, %r33; + mul.lo.s32 %r415, %r407, %r33; + mul.lo.s32 %r416, %r408, %r33; + mul.lo.s32 %r417, %r409, %r33; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r418, %r406, %r30; + mul.lo.s32 %r419, %r407, %r30; + mul.lo.s32 %r420, %r408, %r30; + mul.lo.s32 %r421, %r409, %r30; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r422, %r413, %r401; + xor.b32 %r423, %r413, %r400; + xor.b32 %r424, %r412, %r399; + xor.b32 %r425, %r412, %r398; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r426, %r425, %r33; + mul.lo.s32 %r427, %r424, %r33; + mul.lo.s32 %r428, %r423, %r33; + mul.lo.s32 %r429, %r422, %r33; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r430, %r425, %r30; + mul.lo.s32 %r431, %r424, %r30; + mul.lo.s32 %r432, %r423, %r30; + mul.lo.s32 %r433, %r422, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r434, %r414, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r435, %r434, %r414; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r436, %r415, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r437, %r436, %r415; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r438, %r416, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r439, %r438, %r416; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r440, %r417, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r441, %r440, %r417; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r442, %r418, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r443, %r442, %r418; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r444, %r419, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r445, %r444, %r419; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r446, %r420, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r447, %r446, %r420; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r448, %r421, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r449, %r448, %r421; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p178, %r437, %r445; + setp.ne.b32 %p179, %r437, %r445; + setp.ge.s32 %p180, %r435, %r443; + setp.ge.s32 %p181, %r439, %r447; + setp.ne.b32 %p182, %r439, %r447; + setp.ne.b32 %p183, %r435, %r443; + setp.ge.s32 %p184, %r441, %r449; + setp.ne.b32 %p185, %r441, %r449; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r450, %r443, %r435; + xor.b32 %r451, %r445, %r437; + xor.b32 %r452, %r447, %r439; + xor.b32 %r453, %r449, %r441; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r454, %r426, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r455, %r454, %r426; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r456, %r427, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r457, %r456, %r427; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r458, %r428, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r459, %r458, %r428; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r460, %r429, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r461, %r460, %r429; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r462, %r430, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r463, %r462, %r430; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r464, %r431, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r465, %r464, %r431; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r466, %r432, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r467, %r466, %r432; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r468, %r433, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r469, %r468, %r433; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.le.s32 %p186, %r459, %r467; + setp.le.s32 %p187, %r455, %r463; + or.pred %p188, %p183, %p187; + or.pred %p189, %p182, %p186; + and.pred %p190, %p181, %p189; + and.pred %p191, %p180, %p188; + xor.pred %p192, %p191, %p5; + xor.pred %p193, %p190, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r470, 0, %r452, %p193; + selp.b32 %r471, 0, %r450, %p192; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r472, %r471, %r406; + xor.b32 %r473, %r470, %r408; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.le.s32 %p194, %r461, %r469; + setp.le.s32 %p195, %r457, %r465; + or.pred %p196, %p179, %p195; + or.pred %p197, %p185, %p194; + and.pred %p198, %p184, %p197; + and.pred %p199, %p178, %p196; + xor.pred %p200, %p199, %p5; + xor.pred %p201, %p198, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r474, 0, %r453, %p201; + selp.b32 %r475, 0, %r451, %p200; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r476, %r475, %r407; + xor.b32 %r477, %r474, %r409; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r478, %r463, %r455; + xor.b32 %r479, %r465, %r457; + xor.b32 %r480, %r467, %r459; + xor.b32 %r481, %r469, %r461; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r482, 0, %r481, %p201; + selp.b32 %r483, 0, %r479, %p200; + selp.b32 %r484, 0, %r480, %p193; + selp.b32 %r485, 0, %r478, %p192; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r486, %r485, %r425; + xor.b32 %r487, %r484, %r423; + xor.b32 %r488, %r483, %r424; + xor.b32 %r489, %r482, %r422; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p202, %r472, %r473; + setp.ne.b32 %p203, %r472, %r473; + setp.le.s32 %p204, %r488, %r489; + setp.le.s32 %p205, %r486, %r487; + or.pred %p206, %p203, %p205; + and.pred %p207, %p202, %p206; + setp.ge.s32 %p208, %r476, %r477; + setp.ne.b32 %p209, %r476, %r477; + or.pred %p210, %p209, %p204; + and.pred %p211, %p208, %p210; + xor.pred %p212, %p211, %p5; + xor.pred %p213, %p207, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r490, %r473, %r472; + xor.b32 %r491, %r477, %r476; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r492, 0, %r490, %p213; + selp.b32 %r493, 0, %r491, %p212; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r494, %r492, %r472; + xor.b32 %r495, %r493, %r476; + xor.b32 %r496, %r492, %r473; + xor.b32 %r497, %r493, %r477; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r498, %r487, %r486; + xor.b32 %r499, %r489, %r488; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r500, 0, %r499, %p212; + selp.b32 %r501, 0, %r498, %p213; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r502, %r501, %r487; + xor.b32 %r503, %r500, %r488; + xor.b32 %r504, %r500, %r489; + xor.b32 %r505, %r501, %r486; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p214, %r494, %r495; + setp.ne.b32 %p215, %r494, %r495; + setp.le.s32 %p216, %r505, %r503; + or.pred %p217, %p215, %p216; + and.pred %p218, %p214, %p217; + xor.pred %p219, %p218, %p5; + setp.ge.s32 %p220, %r496, %r497; + setp.ne.b32 %p221, %r496, %r497; + setp.le.s32 %p222, %r502, %r504; + or.pred %p223, %p221, %p222; + and.pred %p224, %p220, %p223; + xor.pred %p225, %p224, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r506, %r495, %r494; + xor.b32 %r507, %r497, %r496; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r508, 0, %r506, %p219; + selp.b32 %r509, 0, %r507, %p225; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r510, %r509, %r496; + xor.b32 %r511, %r508, %r494; + xor.b32 %r512, %r509, %r497; + xor.b32 %r513, %r508, %r495; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r514, %r503, %r505; + xor.b32 %r515, %r504, %r502; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r516, 0, %r515, %p225; + selp.b32 %r517, 0, %r514, %p219; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r522, %r517, %r503; + xor.b32 %r523, %r516, %r502; + xor.b32 %r524, %r517, %r505; + xor.b32 %r525, %r516, %r504; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r526, %r511, %r34; + mul.lo.s32 %r527, %r513, %r34; + mul.lo.s32 %r528, %r510, %r34; + mul.lo.s32 %r529, %r512, %r34; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r530, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r531, %r527, 2, 31, -1; + shfl.sync.bfly.b32 %r532, %r528, 2, 31, -1; + shfl.sync.bfly.b32 %r533, %r529, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r534, %r527, %r531; + add.s32 %r535, %r529, %r533; + add.s32 %r536, %r526, %r530; + add.s32 %r537, %r528, %r532; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r538, %r511, %r32; + mul.lo.s32 %r539, %r513, %r32; + mul.lo.s32 %r540, %r510, %r32; + mul.lo.s32 %r541, %r512, %r32; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r542, %r538, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r539, 2, 31, -1; + shfl.sync.bfly.b32 %r544, %r540, 2, 31, -1; + shfl.sync.bfly.b32 %r545, %r541, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r546, %r539, %r543; + add.s32 %r547, %r541, %r545; + add.s32 %r548, %r538, %r542; + add.s32 %r549, %r540, %r544; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r550, %r525, %r34; + mul.lo.s32 %r551, %r523, %r34; + mul.lo.s32 %r552, %r522, %r34; + mul.lo.s32 %r553, %r524, %r34; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r554, %r553, 2, 31, -1; + shfl.sync.bfly.b32 %r555, %r552, 2, 31, -1; + shfl.sync.bfly.b32 %r556, %r551, 2, 31, -1; + shfl.sync.bfly.b32 %r557, %r550, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r558, %r551, %r556; + add.s32 %r559, %r553, %r554; + add.s32 %r560, %r550, %r557; + add.s32 %r561, %r552, %r555; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r562, %r525, %r32; + mul.lo.s32 %r563, %r523, %r32; + mul.lo.s32 %r564, %r522, %r32; + mul.lo.s32 %r565, %r524, %r32; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r566, %r565, 2, 31, -1; + shfl.sync.bfly.b32 %r567, %r564, 2, 31, -1; + shfl.sync.bfly.b32 %r568, %r563, 2, 31, -1; + shfl.sync.bfly.b32 %r569, %r562, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r570, %r563, %r568; + add.s32 %r571, %r565, %r566; + add.s32 %r572, %r562, %r569; + add.s32 %r573, %r564, %r567; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p226, %r537, %r549; + setp.lt.s32 %p227, %r536, %r548; + setp.lt.s32 %p228, %r535, %r547; + setp.lt.s32 %p229, %r534, %r546; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p230, %r534, %r546; + setp.eq.b32 %p231, %r535, %r547; + setp.eq.b32 %p232, %r536, %r548; + setp.eq.b32 %p233, %r537, %r549; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p234, %r561, %r573; + setp.gt.s32 %p235, %r560, %r572; + setp.gt.s32 %p236, %r559, %r571; + setp.gt.s32 %p237, %r558, %r570; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p238, %p233, %p237; + and.pred %p239, %p232, %p236; + and.pred %p240, %p231, %p235; + and.pred %p241, %p230, %p234; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p242, %p229, %p241; + or.pred %p243, %p228, %p240; + or.pred %p244, %p227, %p239; + or.pred %p245, %p226, %p238; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r574, %r534, %r546; + xor.b32 %r575, %r535, %r547; + xor.b32 %r576, %r536, %r548; + xor.b32 %r577, %r537, %r549; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r578, %r577, 0, %p245; + selp.b32 %r579, %r576, 0, %p244; + selp.b32 %r580, %r575, 0, %p243; + selp.b32 %r581, %r574, 0, %p242; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r582, %r581, %r513; + xor.b32 %r583, %r579, %r511; + xor.b32 %r584, %r578, %r510; + xor.b32 %r585, %r580, %r512; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r586, %r561, %r573; + xor.b32 %r587, %r558, %r570; + xor.b32 %r588, %r560, %r572; + xor.b32 %r589, %r559, %r571; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r590, %r589, 0, %p244; + selp.b32 %r591, %r588, 0, %p243; + selp.b32 %r592, %r587, 0, %p245; + selp.b32 %r593, %r586, 0, %p242; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r598, %r593, %r522; + xor.b32 %r599, %r590, %r524; + xor.b32 %r600, %r591, %r525; + xor.b32 %r601, %r592, %r523; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r602, %r585, %r33; + mul.lo.s32 %r603, %r584, %r33; + mul.lo.s32 %r604, %r582, %r33; + mul.lo.s32 %r605, %r583, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r606, %r605, 1, 31, -1; + shfl.sync.bfly.b32 %r607, %r604, 1, 31, -1; + shfl.sync.bfly.b32 %r608, %r603, 1, 31, -1; + shfl.sync.bfly.b32 %r609, %r602, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r614, %r585, %r30; + mul.lo.s32 %r615, %r584, %r30; + mul.lo.s32 %r616, %r582, %r30; + mul.lo.s32 %r617, %r583, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r618, %r617, 1, 31, -1; + shfl.sync.bfly.b32 %r619, %r616, 1, 31, -1; + shfl.sync.bfly.b32 %r620, %r615, 1, 31, -1; + shfl.sync.bfly.b32 %r621, %r614, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r626, %r599, %r33; + mul.lo.s32 %r627, %r598, %r33; + mul.lo.s32 %r628, %r601, %r33; + mul.lo.s32 %r629, %r600, %r33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r630, %r626, 1, 31, -1; + shfl.sync.bfly.b32 %r631, %r627, 1, 31, -1; + shfl.sync.bfly.b32 %r632, %r628, 1, 31, -1; + shfl.sync.bfly.b32 %r633, %r629, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r638, %r599, %r30; + mul.lo.s32 %r639, %r598, %r30; + mul.lo.s32 %r640, %r601, %r30; + mul.lo.s32 %r641, %r600, %r30; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r642, %r638, 1, 31, -1; + shfl.sync.bfly.b32 %r643, %r639, 1, 31, -1; + shfl.sync.bfly.b32 %r644, %r640, 1, 31, -1; + shfl.sync.bfly.b32 %r645, %r641, 1, 31, -1; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + shfl.sync.bfly.b32 %r720, %r367, 2, 31, -1; + mov.b32 %r721, 0; + shfl.sync.bfly.b32 %r722, %r721, 2, 31, -1; + cvt.u64.u32 %rd28, %r720; + cvt.u64.u32 %rd29, %r722; + shl.b64 %rd30, %rd29, 32; + or.b64 %rd31, %rd28, %rd30; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd32, %rd27, %rd31; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r723}, %rd32; + cvt.u32.u64 %r724, %rd32; + shfl.sync.bfly.b32 %r725, %r724, 1, 31, -1; + shfl.sync.bfly.b32 %r726, %r723, 1, 31, -1; + cvt.u64.u32 %rd33, %r725; + cvt.u64.u32 %rd34, %r726; + shl.b64 %rd35, %rd34, 32; + or.b64 %rd36, %rd33, %rd35; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd37, %rd32, %rd36; +$L__tmp11: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + st.shared.b64 [%r38], %rd37; + bar.sync 0; + ld.shared.b64 %rd2, [%r40]; + .loc 1 58 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:58:35 + and.pred %p282, %p2, %p149; + selp.b16 %rs13, 1, 0, %p282; + shl.b16 %rs14, %rs13, 2; + and.pred %p283, %p2, %p148; + selp.b16 %rs15, -1, 0, %p283; + shl.b16 %rs16, %rs15, 3; + or.b16 %rs17, %rs16, %rs14; + and.pred %p284, %p2, %p151; + selp.b16 %rs18, 1, 0, %p284; + and.pred %p285, %p2, %p150; + selp.b16 %rs19, -1, 0, %p285; + shl.b16 %rs20, %rs19, 1; + or.b16 %rs21, %rs18, %rs20; + and.b16 %rs22, %rs21, 3; + or.b16 %rs23, %rs22, %rs17; +$L__tmp12: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + and.b16 %rs24, %rs23, 15; + cvt.u32.u16 %r727, %rs24; + popc.b32 %r728, %r727; + cvt.u64.u32 %rd38, %r728; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + shfl.sync.bfly.b32 %r729, %r728, 2, 31, -1; + shfl.sync.bfly.b32 %r730, %r721, 2, 31, -1; + cvt.u64.u32 %rd39, %r729; + cvt.u64.u32 %rd40, %r730; + shl.b64 %rd41, %rd40, 32; + or.b64 %rd42, %rd39, %rd41; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd43, %rd38, %rd42; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r731}, %rd43; + cvt.u32.u64 %r732, %rd43; + shfl.sync.bfly.b32 %r733, %r732, 1, 31, -1; + shfl.sync.bfly.b32 %r734, %r731, 1, 31, -1; + cvt.u64.u32 %rd44, %r733; + cvt.u64.u32 %rd45, %r734; + shl.b64 %rd46, %rd45, 32; + or.b64 %rd47, %rd44, %rd46; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd3, %rd43, %rd47; +$L__tmp13: + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + bar.sync 0; + st.shared.b64 [%r38], %rd3; + bar.sync 0; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r735, %rd37; + .loc 1 64 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:64:19 + setp.lt.s32 %p286, %r7, %r735; + setp.lt.s32 %p287, %r6, %r735; + setp.lt.s32 %p288, %r8, %r735; + setp.lt.s32 %p289, %r9, %r735; + .loc 1 66 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:66:35 + selp.b32 %r736, %r10, 16, %p289; + selp.b32 %r737, %r11, 16, %p288; + selp.b32 %r738, %r13, 16, %p287; + selp.b32 %r739, %r12, 16, %p286; + .loc 1 68 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:68:20 + add.s32 %r740, %r739, 17; + add.s32 %r741, %r738, 17; + add.s32 %r742, %r737, 17; + add.s32 %r743, %r736, 17; + .loc 1 69 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:69:20 + setp.lt.s32 %p290, %r739, 0; + setp.lt.s32 %p291, %r738, 0; + setp.lt.s32 %p292, %r737, 0; + setp.lt.s32 %p293, %r736, 0; + .loc 1 70 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:70:35 + selp.b32 %r18, %r743, %r736, %p293; + selp.b32 %r19, %r742, %r737, %p292; + selp.b32 %r21, %r741, %r738, %p291; + selp.b32 %r20, %r740, %r739, %p290; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + max.u32 %r744, %r21, %r20; + max.u32 %r745, %r19, %r744; + max.u32 %r746, %r18, %r745; + setp.lt.u32 %p294, %r746, 17; + or.pred %p295, %p3, %p294; + @%p295 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + xor.b32 %r518, %r501, %r517; + xor.b32 %r519, %r500, %r516; + xor.b32 %r520, %r501, %r516; + xor.b32 %r521, %r500, %r517; + xor.b32 %r594, %r521, %r593; + xor.b32 %r595, %r520, %r592; + xor.b32 %r596, %r519, %r591; + xor.b32 %r597, %r518, %r590; + add.s32 %r610, %r604, %r607; + add.s32 %r611, %r602, %r609; + add.s32 %r612, %r605, %r606; + add.s32 %r613, %r603, %r608; + add.s32 %r622, %r616, %r619; + add.s32 %r623, %r614, %r621; + add.s32 %r624, %r617, %r618; + add.s32 %r625, %r615, %r620; + add.s32 %r634, %r628, %r632; + add.s32 %r635, %r627, %r631; + add.s32 %r636, %r626, %r630; + add.s32 %r637, %r629, %r633; + add.s32 %r646, %r644, %r640; + add.s32 %r647, %r643, %r639; + add.s32 %r648, %r642, %r638; + add.s32 %r649, %r645, %r641; + setp.lt.s32 %p246, %r613, %r625; + setp.lt.s32 %p247, %r612, %r624; + setp.lt.s32 %p248, %r611, %r623; + setp.lt.s32 %p249, %r610, %r622; + setp.eq.b32 %p250, %r610, %r622; + setp.eq.b32 %p251, %r611, %r623; + setp.eq.b32 %p252, %r612, %r624; + setp.eq.b32 %p253, %r613, %r625; + setp.gt.s32 %p254, %r635, %r647; + setp.gt.s32 %p255, %r637, %r649; + setp.gt.s32 %p256, %r636, %r648; + setp.gt.s32 %p257, %r634, %r646; + and.pred %p258, %p253, %p257; + and.pred %p259, %p252, %p256; + and.pred %p260, %p251, %p255; + and.pred %p261, %p250, %p254; + or.pred %p262, %p249, %p261; + or.pred %p263, %p248, %p260; + or.pred %p264, %p247, %p259; + or.pred %p265, %p246, %p258; + xor.b32 %r650, %r610, %r622; + xor.b32 %r651, %r611, %r623; + xor.b32 %r652, %r612, %r624; + xor.b32 %r653, %r613, %r625; + selp.b32 %r654, %r653, 0, %p265; + selp.b32 %r655, %r652, 0, %p264; + selp.b32 %r656, %r651, 0, %p263; + selp.b32 %r657, %r650, 0, %p262; + xor.b32 %r658, %r654, %r584; + xor.b32 %r659, %r655, %r583; + xor.b32 %r660, %r657, %r582; + xor.b32 %r661, %r656, %r585; + xor.b32 %r662, %r646, %r634; + xor.b32 %r663, %r647, %r635; + xor.b32 %r664, %r648, %r636; + xor.b32 %r665, %r649, %r637; + selp.b32 %r666, %r663, 0, %p262; + selp.b32 %r667, %r662, 0, %p265; + selp.b32 %r668, %r665, 0, %p263; + selp.b32 %r669, %r664, 0, %p264; + xor.b32 %r670, %r597, %r669; + xor.b32 %r671, %r596, %r668; + xor.b32 %r672, %r669, %r599; + xor.b32 %r673, %r667, %r601; + xor.b32 %r674, %r666, %r598; + xor.b32 %r675, %r668, %r600; + setp.lt.s32 %p266, %r659, %r658; + setp.lt.s32 %p267, %r660, %r661; + setp.eq.b32 %p268, %r659, %r658; + setp.eq.b32 %p269, %r660, %r661; + setp.gt.s32 %p270, %r672, %r673; + setp.gt.s32 %p271, %r674, %r675; + and.pred %p272, %p268, %p270; + and.pred %p273, %p269, %p271; + or.pred %p274, %p266, %p272; + or.pred %p275, %p267, %p273; + xor.b32 %r676, %r658, %r659; + xor.b32 %r677, %r661, %r660; + selp.b32 %r678, %r677, 0, %p275; + selp.b32 %r679, %r676, 0, %p274; + xor.b32 %r680, %r679, %r658; + xor.b32 %r681, %r678, %r660; + xor.b32 %r682, %r679, %r659; + xor.b32 %r683, %r678, %r661; + xor.b32 %r684, %r675, %r674; + xor.b32 %r685, %r673, %r672; + selp.b32 %r686, %r685, 0, %p274; + selp.b32 %r687, %r684, 0, %p275; + xor.b32 %r688, %r671, %r687; + xor.b32 %r689, %r667, %r686; + xor.b32 %r690, %r689, %r595; + xor.b32 %r691, %r666, %r687; + xor.b32 %r692, %r691, %r594; + xor.b32 %r693, %r670, %r686; + xor.b32 %r694, %r687, %r675; + xor.b32 %r695, %r687, %r674; + xor.b32 %r696, %r686, %r673; + xor.b32 %r697, %r686, %r672; + setp.lt.s32 %p276, %r682, %r681; + setp.lt.s32 %p277, %r680, %r683; + setp.eq.b32 %p278, %r681, %r682; + setp.eq.b32 %p279, %r680, %r683; + setp.gt.s32 %p280, %r697, %r695; + setp.gt.s32 %p281, %r696, %r694; + xor.b32 %r698, %r689, %r668; + xor.b32 %r699, %r691, %r669; + xor.b32 %r700, %r699, %r522; + xor.b32 %r701, %r698, %r523; + xor.b32 %r702, %r701, %r592; + xor.b32 %r703, %r700, %r593; + xor.b32 %r704, %r703, %r524; + xor.b32 %r705, %r702, %r525; + xor.b32 %r706, %r705, %r591; + xor.b32 %r707, %r704, %r590; + xor.b32 %r708, %r707, %r686; + xor.b32 %r709, %r706, %r687; + selp.b32 %r710, %r709, 0, %p281; + selp.b32 %r711, %r710, 0, %p279; + selp.b32 %r712, %r709, %r711, %p277; + selp.b32 %r713, %r708, 0, %p280; + selp.b32 %r714, %r713, 0, %p278; + selp.b32 %r715, %r708, %r714, %p276; + xor.b32 %r716, %r693, %r715; + xor.b32 %r717, %r692, %r715; + xor.b32 %r718, %r690, %r712; + xor.b32 %r719, %r688, %r712; + xor.b32 %r17, %r719, %r489; + xor.b32 %r16, %r718, %r487; + xor.b32 %r15, %r717, %r488; + xor.b32 %r14, %r716, %r486; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + ld.shared.b64 %rd4, [%r40]; + cvt.u32.u64 %r747, %rd3; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + bar.sync 0; + .loc 1 75 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:75:19 + setp.lt.s32 %p297, %r8, %r747; + setp.lt.s32 %p298, %r9, %r747; + setp.lt.s32 %p299, %r6, %r747; + setp.lt.s32 %p300, %r7, %r747; + .loc 1 76 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:76:35 + selp.b32 %r748, %r15, 16, %p300; + selp.b32 %r749, %r14, 16, %p299; + selp.b32 %r750, %r17, 16, %p298; + selp.b32 %r751, %r16, 16, %p297; + .loc 1 77 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:77:20 + add.s32 %r752, %r751, 17; + add.s32 %r753, %r750, 17; + add.s32 %r754, %r749, 17; + add.s32 %r755, %r748, 17; + .loc 1 78 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:78:20 + setp.lt.s32 %p301, %r751, 0; + setp.lt.s32 %p302, %r750, 0; + setp.lt.s32 %p303, %r749, 0; + setp.lt.s32 %p304, %r748, 0; + .loc 1 79 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:79:35 + selp.b32 %r23, %r755, %r748, %p304; + selp.b32 %r22, %r754, %r749, %p303; + selp.b32 %r25, %r753, %r750, %p302; + selp.b32 %r24, %r752, %r751, %p301; + .loc 1 80 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:38 + setp.gt.u32 %p305, %r24, 16; + selp.b16 %rs25, 1, 0, %p305; + shl.b16 %rs26, %rs25, 2; + setp.gt.u32 %p306, %r25, 16; + selp.b16 %rs27, -1, 0, %p306; + shl.b16 %rs28, %rs27, 3; + or.b16 %rs29, %rs28, %rs26; + setp.gt.u32 %p307, %r22, 16; + selp.b16 %rs30, 1, 0, %p307; + setp.gt.u32 %p308, %r23, 16; + selp.b16 %rs31, -1, 0, %p308; + shl.b16 %rs32, %rs31, 1; + or.b16 %rs33, %rs30, %rs32; + and.b16 %rs34, %rs33, 3; + or.b16 %rs35, %rs34, %rs29; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + and.b16 %rs36, %rs35, 15; + setp.eq.b16 %p309, %rs36, 0; + or.pred %p310, %p3, %p309; + @%p310 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r41; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + cvt.u32.u64 %r757, %rd4; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r756, %rd2; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r774, %r1, %r3; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.lt.s32 %p314, %r774, 128; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + bar.sync 0; + .loc 1 81 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:25 + mul.wide.s32 %rd60, %r774, 4; + add.s64 %rd48, %rd5, %rd60; + .loc 1 81 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:37 + and.b32 %r775, %r2, 96; + setp.eq.b32 %p323, %r775, 0; + and.pred %p311, %p323, %p314; + // begin inline asm + @%p311 st.global.b32 [ %rd48 + 0 ], { %r756 }; + // end inline asm + .loc 1 82 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:25 + add.s64 %rd49, %rd6, %rd60; + .loc 1 82 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:37 + // begin inline asm + @%p311 st.global.b32 [ %rd49 + 0 ], { %r757 }; + // end inline asm + .loc 1 83 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd50, %rd7, %rd61; + .loc 1 83 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd50 + 0 ], { %r13, %r12, %r11, %r10 }; + // end inline asm + .loc 1 84 52 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:52 + mul.lo.s32 %r776, %r4, 17; + .loc 1 84 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:49 + add.s32 %r777, %r21, %r776; + add.s32 %r778, %r20, %r776; + add.s32 %r779, %r19, %r776; + add.s32 %r780, %r18, %r776; + .loc 1 84 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:25 + mad.wide.s32 %rd62, %r777, 4, %rd8; + mad.wide.s32 %rd63, %r778, 4, %rd8; + mad.wide.s32 %rd64, %r779, 4, %rd8; + mad.wide.s32 %rd65, %r780, 4, %rd8; + .loc 1 84 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:85 + bar.sync 0; + and.b32 %r781, %r2, 48; + shl.b32 %r782, %r781, 6; + shl.b32 %r783, %r2, 3; + and.b32 %r784, %r783, 120; + shr.u32 %r785, %r781, 1; + shl.b32 %r786, %r2, 1; + and.b32 %r787, %r786, 128; + or.b32 %r788, %r782, %r784; + xor.b32 %r789, %r788, %r785; + add.s32 %r791, %r37, %r787; + add.s32 %r792, %r791, %r789; + st.shared.b64 [%r792], %rd62; + st.shared.b64 [%r792+256], %rd63; + st.shared.b64 [%r792+512], %rd64; + st.shared.b64 [%r792+768], %rd65; + bar.sync 0; + and.b32 %r793, %r2, 12; + shl.b32 %r794, %r793, 8; + shl.b32 %r795, %r5, 5; + and.b32 %r796, %r783, 896; + shl.b32 %r797, %r793, 1; + or.b32 %r798, %r794, %r795; + or.b32 %r799, %r798, %r796; + or.b32 %r800, %r799, %r797; + add.s32 %r801, %r37, %r800; + ld.shared.b64 %rd51, [%r801]; + xor.b32 %r802, %r800, 8; + add.s32 %r803, %r37, %r802; + ld.shared.b64 %rd52, [%r803]; + xor.b32 %r804, %r800, 16; + add.s32 %r805, %r37, %r804; + ld.shared.b64 %rd53, [%r805]; + xor.b32 %r806, %r800, 24; + add.s32 %r807, %r37, %r806; + ld.shared.b64 %rd54, [%r807]; + mov.b32 %r762, 1; + // begin inline asm + @%p314 st.global.b32 [ %rd51 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd52 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd53 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd54 + 0 ], { %r762 }; + // end inline asm + .loc 1 85 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:25 + add.s64 %rd55, %rd9, %rd61; + .loc 1 85 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd55 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm + .loc 1 86 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:49 + add.s32 %r808, %r22, %r776; + add.s32 %r809, %r23, %r776; + add.s32 %r810, %r24, %r776; + add.s32 %r811, %r25, %r776; + .loc 1 86 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:25 + mad.wide.s32 %rd66, %r808, 4, %rd10; + mad.wide.s32 %rd67, %r809, 4, %rd10; + mad.wide.s32 %rd68, %r810, 4, %rd10; + mad.wide.s32 %rd69, %r811, 4, %rd10; + .loc 1 86 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:85 + bar.sync 0; + st.shared.b64 [%r792], %rd66; + st.shared.b64 [%r792+256], %rd67; + st.shared.b64 [%r792+512], %rd68; + st.shared.b64 [%r792+768], %rd69; + bar.sync 0; + ld.shared.b64 %rd56, [%r801]; + ld.shared.b64 %rd57, [%r803]; + ld.shared.b64 %rd58, [%r805]; + ld.shared.b64 %rd59, [%r807]; + // begin inline asm + @%p314 st.global.b32 [ %rd56 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd57 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd58 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd59 + 0 ], { %r762 }; + // end inline asm + .loc 1 86 4 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd76, assertFunc_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param3], %rd77; + mov.b64 %rd78, assertFile_0; + cvta.global.u64 %rd79, %rd78; + st.param.b64 [param1], %rd79; + mov.b64 %rd80, assertMessage_0; + cvta.global.u64 %rd81, %rd80; + st.param.b64 [param0], %rd81; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd70, assertFunc_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param3], %rd71; + mov.b64 %rd72, assertFile_1; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param1], %rd73; + mov.b64 %rd74, assertMessage_1; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param0], %rd75; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 107 +.b8 120 +.b8 112 +.b8 119 +.b8 105 +.b8 119 +.b8 104 +.b8 122 +.b8 107 +.b8 118 +.b8 117 +.b8 110 +.b8 55 +.b8 105 +.b8 53 +.b8 100 +.b8 50 +.b8 112 +.b8 101 +.b8 103 +.b8 111 +.b8 102 +.b8 116 +.b8 104 +.b8 121 +.b8 102 +.b8 105 +.b8 106 +.b8 110 +.b8 53 +.b8 119 +.b8 121 +.b8 103 +.b8 119 +.b8 110 +.b8 97 +.b8 101 +.b8 118 +.b8 51 +.b8 116 +.b8 119 +.b8 119 +.b8 108 +.b8 114 +.b8 98 +.b8 117 +.b8 111 +.b8 106 +.b8 113 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 114 +.b8 107 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..0f726572e00dfb2a1c91e600dd0e3182d9e4c1a6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 32 : i32 loc(#loc205) + %xoffset_3 = arith.constant 32 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc208) + %xmask = arith.constant dense<128> : tensor<32x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<32x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<32x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<32x16xf32> to tensor<32x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<32x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<32x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<32x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<32x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<32x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<32x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<32x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<32x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<32x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<32x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<32x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<32x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<32x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<32x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<32x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<32x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<32x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<32x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<32x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<32x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<32x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<32x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<32x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<32x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<32x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<32x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<32x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<32x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<32x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<32x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<32x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<32x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<32x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<32x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<32x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc91)), %idxs: tensor<32x16xi16> loc("idxs"(#loc91))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc94) + %5 = ub.poison : tensor<32x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi16> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc102) + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi16> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc179))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc181) + tt.return %4 : tensor<32x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc183) + tt.return %5 : tensor<32x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc188))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc189) + tt.return %0 : tensor<32x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc191) + tt.return %1 : tensor<32x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc185) + tt.return %cst : tensor<32x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc187) + tt.return %0 : tensor<32x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc188))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc189) + tt.return %0 : tensor<32x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc191) + tt.return %1 : tensor<32x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc185) + tt.return %cst : tensor<32x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc187) + tt.return %0 : tensor<32x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc166))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc167) + tt.return %0 : tensor<128x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc169) + tt.return %1 : tensor<128x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc166))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc167) + tt.return %0 : tensor<64x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc169) + tt.return %1 : tensor<64x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + %5 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc166))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc167) + tt.return %0 : tensor<32x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc169) + tt.return %1 : tensor<32x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc166))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc167) + tt.return %0 : tensor<32xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc169) + tt.return %1 : tensor<32xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fda6506a840c39ed8c424c1bad4b56d3ec359b8f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [8, 2, 2], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [16, 2, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 2, 2], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [2, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<32x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<32x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<32x1xi32, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<32x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<32x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<32x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<32x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<32x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<32x1xi1, #blocked5> -> tensor<32x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<32x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<32x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<32x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16, #blocked> -> tensor<256x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<256x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<256x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<32x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<32x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<32x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<32x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<32x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<32x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<32x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<32x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<32x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<32x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<32x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<32x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<32x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e9eea1056d868e789e45219f5a5815b629bc2642 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<32x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<32x1xi32> loc(#loc129) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<32x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<32x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<32x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<32x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<32x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<32x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<256x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<256x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<32x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<32x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<32x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<32x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<32x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<32x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<32x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<32x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<32x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<32x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<32x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<32x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<32x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<32x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<32x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<32x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<32x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<32x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<32x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<32x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<32x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<32x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<32x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<32x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<32x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<32x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<32x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<32x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<32x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<32x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<32x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<32x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<32x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<32x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<32x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<32x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<32x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<32x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<32x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<32x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<32x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<32x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<32x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<32x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<32x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<32x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<32x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<32x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<32x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<32x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<32x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<32x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<32x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<32x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<32x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<32x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<32x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<32x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<32x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<32x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<32x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<32x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<32x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<32x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<32x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<32x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<32x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<32x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<32x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<32x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<32x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<32x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<32x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<32x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<32x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<32x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<32x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<32x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<32x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<32x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<32x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<32x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<32x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<32x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<32x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<32x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<32x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<32x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<32x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<32x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<32x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<32x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<32x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<32x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<32x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<32x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<32x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<32x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<32x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<32x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<32x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<32x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<32x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<32x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<32x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b5064103bb5b6bde150a02f9a4182fce1d06f386 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9d56329f17265fa32c950215823005b4467d3926 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cadb0dbd04a96d4c8077f4b72751c65f4a90c328 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "21869c3b7e1d30656e900d0fe97e2caa8bff98c7b55852e63d8f2636233cc552", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..3268b72ef40b001b861a318dbe7ab94c63e25f3b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 3, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 224, !dbg !9 + %12 = lshr exact i32 %11, 5, !dbg !9 + %13 = and i32 %10, 7, !dbg !9 + %14 = or disjoint i32 %12, %9, !dbg !10 + %15 = or disjoint i32 %9, %13, !dbg !10 + %16 = shl nuw nsw i32 %10, 2, !dbg !11 + %17 = and i32 %16, 124, !dbg !11 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %18, 2048, !dbg !13 + %.decomposed = sub i32 %14, %19, !dbg !13 + %20 = srem i32 %18, 32, !dbg !14 + %21 = sdiv i32 %14, 65536, !dbg !15 + %22 = shl nsw i32 %20, 7, !dbg !16 + %23 = shl nsw i32 %.decomposed, 12, !dbg !17 + %24 = shl i32 %21, 23, !dbg !18 + %25 = or disjoint i32 %23, %17, !dbg !19 + %26 = add i32 %25, %24, !dbg !20 + %27 = add i32 %26, %22, !dbg !21 + %28 = sext i32 %27 to i64, !dbg !22 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !22 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !23 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !23 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !23 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !23 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !23 + %36 = shl i32 %14, 7, !dbg !24 + %37 = or disjoint i32 %36, %17, !dbg !25 + %38 = sext i32 %37 to i64, !dbg !26 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !26 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 true) #4, !dbg !27 + %42 = extractvalue { i32, i32 } %41, 0, !dbg !27 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !27 + %44 = extractvalue { i32, i32 } %41, 1, !dbg !27 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !27 + %46 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !28 + %47 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !29 + %48 = fmul <2 x float> %46, %47, !dbg !30 + %49 = fadd <2 x float> %48, zeroinitializer, !dbg !31 + %50 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !28 + %51 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !29 + %52 = fmul <2 x float> %50, %51, !dbg !30 + %53 = fadd <2 x float> %52, zeroinitializer, !dbg !31 + %shift = shufflevector <2 x float> %49, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop = fadd <2 x float> %49, %shift, !dbg !32 + %foldExtExtBinop2 = fadd <2 x float> %53, %foldExtExtBinop, !dbg !32 + %shift4 = shufflevector <2 x float> %53, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop5 = fadd <2 x float> %shift4, %foldExtExtBinop2, !dbg !32 + %54 = extractelement <2 x float> %foldExtExtBinop5, i64 0, !dbg !32 + %55 = bitcast float %54 to i32, !dbg !36 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 16, i32 31), !dbg !36 + %57 = bitcast i32 %56 to float, !dbg !36 + %58 = fadd float %54, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !36 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !36 + %61 = bitcast i32 %60 to float, !dbg !36 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %62 to i32, !dbg !36 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 4, i32 31), !dbg !36 + %65 = bitcast i32 %64 to float, !dbg !36 + %66 = fadd float %62, %65, !dbg !32 + %67 = bitcast float %66 to i32, !dbg !36 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 2, i32 31), !dbg !36 + %69 = bitcast i32 %68 to float, !dbg !36 + %70 = fadd float %66, %69, !dbg !32 + %71 = bitcast float %70 to i32, !dbg !36 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %73 = bitcast i32 %72 to float, !dbg !36 + %74 = fadd float %70, %73, !dbg !32 + %75 = lshr exact i32 %11, 3, !dbg !37 + %76 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %75, !dbg !37 + store float %74, ptr addrspace(3) %76, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %77 = shl nuw nsw i32 %13, 2, !dbg !37 + %78 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %77, !dbg !37 + %79 = load i32, ptr addrspace(3) %78, align 4, !dbg !37 + %80 = sext i32 %15 to i64, !dbg !38 + %81 = getelementptr float, ptr addrspace(1) %2, i64 %80, !dbg !38 + %82 = and i32 %10, 248, !dbg !39 + %83 = icmp eq i32 %82, 0, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %81, i1 %83) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 39, column: 41, scope: !4) +!20 = !DILocation(line: 39, column: 50, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 45, scope: !4) +!25 = !DILocation(line: 40, column: 41, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 39, column: 127, scope: !4) +!29 = !DILocation(line: 40, column: 104, scope: !4) +!30 = !DILocation(line: 41, column: 22, scope: !4) +!31 = !DILocation(line: 43, column: 23, scope: !4) +!32 = !DILocation(line: 261, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !34, discriminator: 0) +!34 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!35 = !DILocation(line: 45, column: 25, scope: !4) +!36 = !DILocation(line: 291, column: 36, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 45, column: 28, scope: !4) +!38 = !DILocation(line: 49, column: 25, scope: !4) +!39 = !DILocation(line: 49, column: 36, scope: !4) +!40 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..722b6e2932a920cfcc5ea8e955d6b1426f453c9e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 256 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<72>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:33 + shl.b32 %r11, %r10, 3; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + mov.u32 %r12, %tid.x; + and.b32 %r13, %r12, 224; + bfe.u32 %r14, %r12, 5, 3; + and.b32 %r15, %r12, 7; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r16, %r14, %r11; + or.b32 %r17, %r11, %r15; + .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37 + shl.b32 %r18, %r12, 2; + and.b32 %r19, %r18, 124; + .loc 1 29 21 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:21 + bfe.s32 %r20, %r10, 28, 1; + shr.u32 %r21, %r20, 21; + add.s32 %r22, %r16, %r21; + shr.s32 %r23, %r22, 11; + .loc 1 28 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:28:19 + and.b32 %r24, %r22, 1046528; + sub.s32 %r25, %r16, %r24; + .loc 1 29 29 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:29 + shr.u32 %r26, %r23, 27; + add.s32 %r27, %r23, %r26; + and.b32 %r28, %r27, 33554400; + sub.s32 %r29, %r23, %r28; + .loc 1 30 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:30:19 + shr.u32 %r30, %r20, 16; + add.s32 %r31, %r16, %r30; + .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45 + shl.b32 %r32, %r29, 7; + .loc 1 39 55 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:55 + shl.b32 %r33, %r25, 12; + .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68 + shl.b32 %r34, %r31, 7; + and.b32 %r35, %r34, -8388608; + .loc 1 39 41 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:41 + or.b32 %r36, %r33, %r19; + .loc 1 39 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:50 + add.s32 %r37, %r36, %r35; + .loc 1 39 60 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:60 + add.s32 %r38, %r37, %r32; + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + mad.wide.s32 %rd2, %r38, 2, %rd8; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:45 + shl.b32 %r39, %r16, 7; + .loc 1 40 41 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:41 + or.b32 %r40, %r39, %r19; + .loc 1 40 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:34 + mad.wide.s32 %rd5, %r40, 2, %rd9; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, %r3; + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r5, %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r41, %rs1; + cvt.f32.bf16 %r42, %rs2; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r43, %rs3; + cvt.f32.bf16 %r44, %rs4; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r45, %r42, %r44, 0f00000000; + fma.rn.f32 %r46, %r41, %r43, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r47, %rs5; + cvt.f32.bf16 %r48, %rs6; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r49, %rs7; + cvt.f32.bf16 %r50, %rs8; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r51, %r48, %r50, 0f00000000; + fma.rn.f32 %r52, %r47, %r49, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r53, %r46, %r45; + add.f32 %r54, %r52, %r53; + add.f32 %r55, %r51, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r57, %r55, %r56; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r59, %r57, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r61, %r59, %r60; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r63, %r61, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r65, %r63, %r64; +$L__tmp2: + .loc 1 45 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:28 + shr.u32 %r66, %r13, 3; + mov.b32 %r67, global_smem; + add.s32 %r68, %r67, %r66; + st.shared.b32 [%r68], %r65; + bar.sync 0; + shl.b32 %r69, %r15, 2; + add.s32 %r70, %r67, %r69; + ld.shared.b32 %r9, [%r70]; + .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25 + mad.wide.s32 %rd7, %r17, 4, %rd10; + .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36 + and.b32 %r71, %r12, 248; + setp.eq.b32 %p3, %r71, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r9 }; + // end inline asm + .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 103 +.b8 111 +.b8 108 +.b8 53 +.b8 53 +.b8 99 +.b8 116 +.b8 104 +.b8 107 +.b8 52 +.b8 122 +.b8 101 +.b8 118 +.b8 115 +.b8 121 +.b8 51 +.b8 100 +.b8 108 +.b8 113 +.b8 105 +.b8 121 +.b8 122 +.b8 105 +.b8 112 +.b8 101 +.b8 102 +.b8 118 +.b8 55 +.b8 51 +.b8 53 +.b8 103 +.b8 101 +.b8 50 +.b8 119 +.b8 116 +.b8 97 +.b8 100 +.b8 100 +.b8 118 +.b8 107 +.b8 52 +.b8 51 +.b8 54 +.b8 113 +.b8 104 +.b8 116 +.b8 53 +.b8 110 +.b8 111 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..aa34274d1580eb39ca84ab3be34e001024a565e1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 8 : i32 loc(#loc61) + %xoffset_3 = arith.constant 8 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<8x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<8x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<8x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<8x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<8x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<8x128xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<8x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<8x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<8x128xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<8x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<8x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<8x128xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<8x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<8x128xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<8x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<8x128xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<8x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<8x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<8x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<8x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc44))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc45) + tt.return %0 : tensor<8xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc48) + tt.return %1 : tensor<8xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..07713face4bd6d01f3cba0486c75c5be3746f48c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,142 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("xnumel"(#loc)) +#loc40 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp4"(#loc30)) +#loc71 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_8 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %xindex_9 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc43) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc43) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc43) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked> loc(#loc44) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc44) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked> loc(#loc44) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<8x1xi32, #blocked1> loc(#loc44) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc47) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<8x1xi32, #blocked> loc(#loc48) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc49) + %r0_mask = arith.cmpi slt, %r0_base_16, %cst : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc51) + %tmp0_18 = tt.broadcast %r0_base_16 : tensor<1x128xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_19 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<8x128xi32, #blocked> loc(#loc52) + %tmp0_21 = arith.muli %x0, %cst_1 : tensor<8x1xi32, #blocked> loc(#loc53) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<8x128xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.muli %x2, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc55) + %tmp0_25 = tt.broadcast %tmp0_24 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc56) + %tmp0_26 = arith.addi %tmp0_23, %tmp0_25 : tensor<8x128xi32, #blocked> loc(#loc56) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc57) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc57) + %tmp0_29 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc58) + %tmp0_30 = tt.load %tmp0_28, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc58) + %tmp0_31 = arith.extf %tmp0_30 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc60) + %tmp1_32 = tt.broadcast %tmp1 : tensor<8x1xi32, #blocked> -> tensor<8x128xi32, #blocked> loc(#loc61) + %tmp1_33 = arith.addi %tmp0_18, %tmp1_32 : tensor<8x128xi32, #blocked> loc(#loc61) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc62) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi32, #blocked> loc(#loc62) + %tmp1_36 = tt.load %tmp1_35, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc63) + %tmp1_37 = arith.extf %tmp1_36 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc64) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<8x128xf32, #blocked> loc(#loc65) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<8x128xf32, #blocked> loc(#loc66) + %_tmp4 = arith.select %tmp0_29, %tmp5, %cst_7 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc67) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_40: f32 loc(callsite(#loc1 at #loc68)), %tmp4_41: f32 loc(callsite(#loc1 at #loc68))): + %tmp4_42 = arith.addf %tmp4_40, %tmp4_41 : f32 loc(#loc72) + tt.reduce.return %tmp4_42 : f32 loc(#loc70) + }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc70) + %tmp4_38 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp4_39 = tt.expand_dims %tmp4_38 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc69) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc33) + %1 = tt.addptr %0, %xindex_15 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc33) + tt.store %1, %tmp4_39 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("r0_mask"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp1"(#loc21)) +#loc61 = loc("tmp1"(#loc22)) +#loc62 = loc("tmp1"(#loc23)) +#loc63 = loc("tmp1"(#loc24)) +#loc64 = loc("tmp1"(#loc25)) +#loc65 = loc("tmp2"(#loc26)) +#loc66 = loc("tmp5"(#loc27)) +#loc67 = loc("_tmp4"(#loc28)) +#loc69 = loc("tmp4"(#loc32)) +#loc70 = loc(callsite(#loc29 at #loc68)) +#loc72 = loc(callsite(#loc31 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..91bb250137ec4d3d783b54680af6942b85385c1f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EGDJYO36DUYGK3UQBUH6S7RMVKF77GGHWVMFFZR5R4TDMIZ4YVJA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,139 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc32)) +#loc75 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<8388608> : tensor<8x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<8x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<8x1xi32> loc(#loc43) + %x1 = arith.constant dense<32> : tensor<8x1xi32> loc(#loc44) + %cst_5 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_6 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc47) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc48) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<8x1xi32> loc(#loc49) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<8x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc51) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<8x1xi32> loc(#loc52) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<8x1xi32> loc(#loc53) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<8x1xi32> loc(#loc44) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<8x1xi32> loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_10, %cst_3 : tensor<1x128xi32> loc(#loc54) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<8x1xi32> loc(#loc55) + %tmp0_14 = tt.broadcast %r0_base_10 : tensor<1x128xi32> -> tensor<8x128xi32> loc(#loc56) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc56) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x128xi32> loc(#loc56) + %tmp0_17 = arith.muli %x0, %cst_1 : tensor<8x1xi32> loc(#loc57) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc58) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x128xi32> loc(#loc58) + %tmp0_20 = arith.muli %x2_13, %cst_0 : tensor<8x1xi32> loc(#loc59) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc60) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<8x128xi32> loc(#loc60) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc61) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc61) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc62) + %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc62) + %tmp0_27 = arith.extf %tmp0_26 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc63) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<8x1xi32> loc(#loc64) + %tmp1_28 = tt.broadcast %tmp1 : tensor<8x1xi32> -> tensor<8x128xi32> loc(#loc65) + %tmp1_29 = arith.addi %tmp0_14, %tmp1_28 : tensor<8x128xi32> loc(#loc65) + %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc66) + %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<8x128x!tt.ptr>, tensor<8x128xi32> loc(#loc66) + %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc67) + %tmp1_33 = arith.extf %tmp1_32 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<8x128xf32> loc(#loc69) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<8x128xf32> loc(#loc70) + %_tmp4 = arith.select %tmp0_25, %tmp5, %cst_4 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc71) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_35: f32 loc(callsite(#loc1 at #loc72)), %tmp4_36: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_37 = arith.addf %tmp4_35, %tmp4_36 : f32 loc(#loc76) + tt.reduce.return %tmp4_37 : f32 loc(#loc74) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc74) + %tmp4_34 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc73) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_9 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc35) + tt.store %1, %tmp4_34 : tensor<8x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc43 = loc("x2"(#loc2)) +#loc44 = loc("x1"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xoffset"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xindex"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("x0"(#loc11)) +#loc53 = loc("x1"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp0"(#loc21)) +#loc63 = loc("tmp0"(#loc22)) +#loc64 = loc("tmp1"(#loc23)) +#loc65 = loc("tmp1"(#loc24)) +#loc66 = loc("tmp1"(#loc25)) +#loc67 = loc("tmp1"(#loc26)) +#loc68 = loc("tmp1"(#loc27)) +#loc69 = loc("tmp2"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc("_tmp4"(#loc30)) +#loc73 = loc("tmp4"(#loc34)) +#loc74 = loc(callsite(#loc31 at #loc72)) +#loc76 = loc(callsite(#loc33 at #loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/__grp__log_softmax_forward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/__grp__log_softmax_forward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..527508b2905ba9e20681f36c032ddb8f7b908297 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/__grp__log_softmax_forward_kernel.json @@ -0,0 +1 @@ +{"child_paths": {"log_softmax_forward_kernel.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.source", "log_softmax_forward_kernel.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttir", "log_softmax_forward_kernel.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttgir", "log_softmax_forward_kernel.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.llir", "log_softmax_forward_kernel.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ptx", "log_softmax_forward_kernel.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.cubin", "log_softmax_forward_kernel.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b1a5222fea0110d29f2d794b7d9ccce2163ab487 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..336bb21bded466f3c96e88fc8d3800066e3a493d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.json @@ -0,0 +1 @@ +{"hash": "22dbafd5b593fe7a9facb579b28a2ac7007692a1837f9be3ae2db507d21d97bb", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "log_softmax_forward_kernel"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.llir new file mode 100644 index 0000000000000000000000000000000000000000..9e223ba1f24dd84509553ccc0458a93297fc75d8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.llir @@ -0,0 +1,1111 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @log_softmax_forward_kernel(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) %2, i32 %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #0 !dbg !5 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %13 = zext nneg i32 %12 to i64, !dbg !9 + %14 = sext i32 %1 to i64, !dbg !10 + %15 = mul nsw i64 %14, %13, !dbg !10 + %16 = getelementptr bfloat, ptr addrspace(1) %0, i64 %15, !dbg !11 + %17 = sext i32 %3 to i64, !dbg !12 + %18 = mul nsw i64 %17, %13, !dbg !12 + %19 = getelementptr float, ptr addrspace(1) %2, i64 %18, !dbg !13 + %20 = getelementptr i1, ptr addrspace(1) %4, i64 %13, !dbg !14 + %21 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.b8 { $0 }, [ $1 + 0 ];", "=c,l"(ptr addrspace(1) %20) #6, !dbg !15 + %.not = icmp eq i8 %21, 0, !dbg !15 + br i1 %.not, label %common.ret, label %22, !dbg !16 + +common.ret: ; preds = %11, %._crit_edge7 + ret void, !dbg !17 + +22: ; preds = %11 + %23 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !18 + %24 = and i32 %23, 31, !dbg !18 + %25 = lshr i32 %23, 5, !dbg !18 + %26 = shl nuw nsw i32 %23, 3, !dbg !18 + %27 = or disjoint i32 %26, 8192, !dbg !18 + %28 = or disjoint i32 %26, 16384, !dbg !18 + %29 = or disjoint i32 %26, 24576, !dbg !18 + %30 = icmp sgt i32 %8, 0, !dbg !19 + br i1 %30, label %.lr.ph, label %._crit_edge, !dbg !19 + +.lr.ph: ; preds = %22 + %31 = icmp eq i32 %24, 0 + %32 = getelementptr float, ptr addrspace(3) @global_smem, i32 %25 + %33 = icmp samesign ult i32 %23, 32 + %34 = getelementptr float, ptr addrspace(3) @global_smem, i32 %23 + %35 = icmp eq i32 %23, 0 + br label %36, !dbg !19 + +36: ; preds = %.lr.ph, %36 + %37 = phi float [ 0.000000e+00, %.lr.ph ], [ %464, %36 ] + %38 = phi float [ 0xFFF0000000000000, %.lr.ph ], [ %263, %36 ] + %39 = phi i32 [ 0, %.lr.ph ], [ %465, %36 ] + %40 = or disjoint i32 %39, %26, !dbg !20 + %41 = or disjoint i32 %39, %27, !dbg !20 + %42 = or disjoint i32 %39, %28, !dbg !20 + %43 = or disjoint i32 %39, %29, !dbg !20 + %44 = icmp slt i32 %40, %8, !dbg !21 + %45 = icmp slt i32 %41, %8, !dbg !21 + %46 = icmp slt i32 %42, %8, !dbg !21 + %47 = icmp slt i32 %43, %8, !dbg !21 + %48 = sext i32 %40 to i64, !dbg !22 + %49 = getelementptr bfloat, ptr addrspace(1) %16, i64 %48, !dbg !22 + %50 = sext i32 %41 to i64, !dbg !22 + %51 = getelementptr bfloat, ptr addrspace(1) %16, i64 %50, !dbg !22 + %52 = sext i32 %42 to i64, !dbg !22 + %53 = getelementptr bfloat, ptr addrspace(1) %16, i64 %52, !dbg !22 + %54 = sext i32 %43 to i64, !dbg !22 + %55 = getelementptr bfloat, ptr addrspace(1) %16, i64 %54, !dbg !22 + %56 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 -8323200, i32 -8323200, i32 -8323200, i32 -8323200, ptr addrspace(1) %49, i1 %44) #6, !dbg !23 + %57 = extractvalue { i32, i32, i32, i32 } %56, 0, !dbg !23 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !23 + %59 = extractvalue { i32, i32, i32, i32 } %56, 1, !dbg !23 + %60 = bitcast i32 %59 to <2 x bfloat>, !dbg !23 + %61 = extractvalue { i32, i32, i32, i32 } %56, 2, !dbg !23 + %62 = bitcast i32 %61 to <2 x bfloat>, !dbg !23 + %63 = extractvalue { i32, i32, i32, i32 } %56, 3, !dbg !23 + %64 = bitcast i32 %63 to <2 x bfloat>, !dbg !23 + %65 = extractelement <2 x bfloat> %58, i64 0, !dbg !23 + %66 = extractelement <2 x bfloat> %58, i64 1, !dbg !23 + %67 = extractelement <2 x bfloat> %60, i64 0, !dbg !23 + %68 = extractelement <2 x bfloat> %60, i64 1, !dbg !23 + %69 = extractelement <2 x bfloat> %62, i64 0, !dbg !23 + %70 = extractelement <2 x bfloat> %62, i64 1, !dbg !23 + %71 = extractelement <2 x bfloat> %64, i64 0, !dbg !23 + %72 = extractelement <2 x bfloat> %64, i64 1, !dbg !23 + %73 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 -8323200, i32 -8323200, i32 -8323200, i32 -8323200, ptr addrspace(1) %51, i1 %45) #6, !dbg !23 + %74 = extractvalue { i32, i32, i32, i32 } %73, 0, !dbg !23 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !23 + %76 = extractvalue { i32, i32, i32, i32 } %73, 1, !dbg !23 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !23 + %78 = extractvalue { i32, i32, i32, i32 } %73, 2, !dbg !23 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !23 + %80 = extractvalue { i32, i32, i32, i32 } %73, 3, !dbg !23 + %81 = bitcast i32 %80 to <2 x bfloat>, !dbg !23 + %82 = extractelement <2 x bfloat> %75, i64 0, !dbg !23 + %83 = extractelement <2 x bfloat> %75, i64 1, !dbg !23 + %84 = extractelement <2 x bfloat> %77, i64 0, !dbg !23 + %85 = extractelement <2 x bfloat> %77, i64 1, !dbg !23 + %86 = extractelement <2 x bfloat> %79, i64 0, !dbg !23 + %87 = extractelement <2 x bfloat> %79, i64 1, !dbg !23 + %88 = extractelement <2 x bfloat> %81, i64 0, !dbg !23 + %89 = extractelement <2 x bfloat> %81, i64 1, !dbg !23 + %90 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 -8323200, i32 -8323200, i32 -8323200, i32 -8323200, ptr addrspace(1) %53, i1 %46) #6, !dbg !23 + %91 = extractvalue { i32, i32, i32, i32 } %90, 0, !dbg !23 + %92 = bitcast i32 %91 to <2 x bfloat>, !dbg !23 + %93 = extractvalue { i32, i32, i32, i32 } %90, 1, !dbg !23 + %94 = bitcast i32 %93 to <2 x bfloat>, !dbg !23 + %95 = extractvalue { i32, i32, i32, i32 } %90, 2, !dbg !23 + %96 = bitcast i32 %95 to <2 x bfloat>, !dbg !23 + %97 = extractvalue { i32, i32, i32, i32 } %90, 3, !dbg !23 + %98 = bitcast i32 %97 to <2 x bfloat>, !dbg !23 + %99 = extractelement <2 x bfloat> %92, i64 0, !dbg !23 + %100 = extractelement <2 x bfloat> %92, i64 1, !dbg !23 + %101 = extractelement <2 x bfloat> %94, i64 0, !dbg !23 + %102 = extractelement <2 x bfloat> %94, i64 1, !dbg !23 + %103 = extractelement <2 x bfloat> %96, i64 0, !dbg !23 + %104 = extractelement <2 x bfloat> %96, i64 1, !dbg !23 + %105 = extractelement <2 x bfloat> %98, i64 0, !dbg !23 + %106 = extractelement <2 x bfloat> %98, i64 1, !dbg !23 + %107 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 -8323200, i32 -8323200, i32 -8323200, i32 -8323200, ptr addrspace(1) %55, i1 %47) #6, !dbg !23 + %108 = extractvalue { i32, i32, i32, i32 } %107, 0, !dbg !23 + %109 = bitcast i32 %108 to <2 x bfloat>, !dbg !23 + %110 = extractvalue { i32, i32, i32, i32 } %107, 1, !dbg !23 + %111 = bitcast i32 %110 to <2 x bfloat>, !dbg !23 + %112 = extractvalue { i32, i32, i32, i32 } %107, 2, !dbg !23 + %113 = bitcast i32 %112 to <2 x bfloat>, !dbg !23 + %114 = extractvalue { i32, i32, i32, i32 } %107, 3, !dbg !23 + %115 = bitcast i32 %114 to <2 x bfloat>, !dbg !23 + %116 = extractelement <2 x bfloat> %109, i64 0, !dbg !23 + %117 = extractelement <2 x bfloat> %109, i64 1, !dbg !23 + %118 = extractelement <2 x bfloat> %111, i64 0, !dbg !23 + %119 = extractelement <2 x bfloat> %111, i64 1, !dbg !23 + %120 = extractelement <2 x bfloat> %113, i64 0, !dbg !23 + %121 = extractelement <2 x bfloat> %113, i64 1, !dbg !23 + %122 = extractelement <2 x bfloat> %115, i64 0, !dbg !23 + %123 = extractelement <2 x bfloat> %115, i64 1, !dbg !23 + %124 = fpext bfloat %65 to float, !dbg !24 + %125 = fpext bfloat %66 to float, !dbg !24 + %126 = fpext bfloat %67 to float, !dbg !24 + %127 = fpext bfloat %68 to float, !dbg !24 + %128 = fpext bfloat %69 to float, !dbg !24 + %129 = fpext bfloat %70 to float, !dbg !24 + %130 = fpext bfloat %71 to float, !dbg !24 + %131 = fpext bfloat %72 to float, !dbg !24 + %132 = fpext bfloat %82 to float, !dbg !24 + %133 = fpext bfloat %83 to float, !dbg !24 + %134 = fpext bfloat %84 to float, !dbg !24 + %135 = fpext bfloat %85 to float, !dbg !24 + %136 = fpext bfloat %86 to float, !dbg !24 + %137 = fpext bfloat %87 to float, !dbg !24 + %138 = fpext bfloat %88 to float, !dbg !24 + %139 = fpext bfloat %89 to float, !dbg !24 + %140 = fpext bfloat %99 to float, !dbg !24 + %141 = fpext bfloat %100 to float, !dbg !24 + %142 = fpext bfloat %101 to float, !dbg !24 + %143 = fpext bfloat %102 to float, !dbg !24 + %144 = fpext bfloat %103 to float, !dbg !24 + %145 = fpext bfloat %104 to float, !dbg !24 + %146 = fpext bfloat %105 to float, !dbg !24 + %147 = fpext bfloat %106 to float, !dbg !24 + %148 = fpext bfloat %116 to float, !dbg !24 + %149 = fpext bfloat %117 to float, !dbg !24 + %150 = fpext bfloat %118 to float, !dbg !24 + %151 = fpext bfloat %119 to float, !dbg !24 + %152 = fpext bfloat %120 to float, !dbg !24 + %153 = fpext bfloat %121 to float, !dbg !24 + %154 = fpext bfloat %122 to float, !dbg !24 + %155 = fpext bfloat %123 to float, !dbg !24 + %156 = select i1 %44, float %124, float 0xFFF0000000000000, !dbg !25 + %157 = select i1 %44, float %125, float 0xFFF0000000000000, !dbg !25 + %158 = select i1 %44, float %126, float 0xFFF0000000000000, !dbg !25 + %159 = select i1 %44, float %127, float 0xFFF0000000000000, !dbg !25 + %160 = select i1 %44, float %128, float 0xFFF0000000000000, !dbg !25 + %161 = select i1 %44, float %129, float 0xFFF0000000000000, !dbg !25 + %162 = select i1 %44, float %130, float 0xFFF0000000000000, !dbg !25 + %163 = select i1 %44, float %131, float 0xFFF0000000000000, !dbg !25 + %164 = select i1 %45, float %132, float 0xFFF0000000000000, !dbg !25 + %165 = select i1 %45, float %133, float 0xFFF0000000000000, !dbg !25 + %166 = select i1 %45, float %134, float 0xFFF0000000000000, !dbg !25 + %167 = select i1 %45, float %135, float 0xFFF0000000000000, !dbg !25 + %168 = select i1 %45, float %136, float 0xFFF0000000000000, !dbg !25 + %169 = select i1 %45, float %137, float 0xFFF0000000000000, !dbg !25 + %170 = select i1 %45, float %138, float 0xFFF0000000000000, !dbg !25 + %171 = select i1 %45, float %139, float 0xFFF0000000000000, !dbg !25 + %172 = select i1 %46, float %140, float 0xFFF0000000000000, !dbg !25 + %173 = select i1 %46, float %141, float 0xFFF0000000000000, !dbg !25 + %174 = select i1 %46, float %142, float 0xFFF0000000000000, !dbg !25 + %175 = select i1 %46, float %143, float 0xFFF0000000000000, !dbg !25 + %176 = select i1 %46, float %144, float 0xFFF0000000000000, !dbg !25 + %177 = select i1 %46, float %145, float 0xFFF0000000000000, !dbg !25 + %178 = select i1 %46, float %146, float 0xFFF0000000000000, !dbg !25 + %179 = select i1 %46, float %147, float 0xFFF0000000000000, !dbg !25 + %180 = select i1 %47, float %148, float 0xFFF0000000000000, !dbg !25 + %181 = select i1 %47, float %149, float 0xFFF0000000000000, !dbg !25 + %182 = select i1 %47, float %150, float 0xFFF0000000000000, !dbg !25 + %183 = select i1 %47, float %151, float 0xFFF0000000000000, !dbg !25 + %184 = select i1 %47, float %152, float 0xFFF0000000000000, !dbg !25 + %185 = select i1 %47, float %153, float 0xFFF0000000000000, !dbg !25 + %186 = select i1 %47, float %154, float 0xFFF0000000000000, !dbg !25 + %187 = select i1 %47, float %155, float 0xFFF0000000000000, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %188 = tail call float @llvm.maxnum.f32(float %156, float %157), !dbg !30 + %189 = tail call float @llvm.maxnum.f32(float %188, float %158), !dbg !30 + %190 = tail call float @llvm.maxnum.f32(float %189, float %159), !dbg !30 + %191 = tail call float @llvm.maxnum.f32(float %190, float %160), !dbg !30 + %192 = tail call float @llvm.maxnum.f32(float %191, float %161), !dbg !30 + %193 = tail call float @llvm.maxnum.f32(float %192, float %162), !dbg !30 + %194 = tail call float @llvm.maxnum.f32(float %193, float %163), !dbg !30 + %195 = tail call float @llvm.maxnum.f32(float %194, float %164), !dbg !30 + %196 = tail call float @llvm.maxnum.f32(float %195, float %165), !dbg !30 + %197 = tail call float @llvm.maxnum.f32(float %196, float %166), !dbg !30 + %198 = tail call float @llvm.maxnum.f32(float %197, float %167), !dbg !30 + %199 = tail call float @llvm.maxnum.f32(float %198, float %168), !dbg !30 + %200 = tail call float @llvm.maxnum.f32(float %199, float %169), !dbg !30 + %201 = tail call float @llvm.maxnum.f32(float %200, float %170), !dbg !30 + %202 = tail call float @llvm.maxnum.f32(float %201, float %171), !dbg !30 + %203 = tail call float @llvm.maxnum.f32(float %202, float %172), !dbg !30 + %204 = tail call float @llvm.maxnum.f32(float %203, float %173), !dbg !30 + %205 = tail call float @llvm.maxnum.f32(float %204, float %174), !dbg !30 + %206 = tail call float @llvm.maxnum.f32(float %205, float %175), !dbg !30 + %207 = tail call float @llvm.maxnum.f32(float %206, float %176), !dbg !30 + %208 = tail call float @llvm.maxnum.f32(float %207, float %177), !dbg !30 + %209 = tail call float @llvm.maxnum.f32(float %208, float %178), !dbg !30 + %210 = tail call float @llvm.maxnum.f32(float %209, float %179), !dbg !30 + %211 = tail call float @llvm.maxnum.f32(float %210, float %180), !dbg !30 + %212 = tail call float @llvm.maxnum.f32(float %211, float %181), !dbg !30 + %213 = tail call float @llvm.maxnum.f32(float %212, float %182), !dbg !30 + %214 = tail call float @llvm.maxnum.f32(float %213, float %183), !dbg !30 + %215 = tail call float @llvm.maxnum.f32(float %214, float %184), !dbg !30 + %216 = tail call float @llvm.maxnum.f32(float %215, float %185), !dbg !30 + %217 = tail call float @llvm.maxnum.f32(float %216, float %186), !dbg !30 + %218 = tail call float @llvm.maxnum.f32(float %217, float %187), !dbg !30 + %219 = bitcast float %218 to i32, !dbg !26 + %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 16, i32 31), !dbg !26 + %221 = bitcast i32 %220 to float, !dbg !26 + %222 = tail call float @llvm.maxnum.f32(float %218, float %221), !dbg !30 + %223 = bitcast float %222 to i32, !dbg !26 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 8, i32 31), !dbg !26 + %225 = bitcast i32 %224 to float, !dbg !26 + %226 = tail call float @llvm.maxnum.f32(float %222, float %225), !dbg !30 + %227 = bitcast float %226 to i32, !dbg !26 + %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !26 + %229 = bitcast i32 %228 to float, !dbg !26 + %230 = tail call float @llvm.maxnum.f32(float %226, float %229), !dbg !30 + %231 = bitcast float %230 to i32, !dbg !26 + %232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 2, i32 31), !dbg !26 + %233 = bitcast i32 %232 to float, !dbg !26 + %234 = tail call float @llvm.maxnum.f32(float %230, float %233), !dbg !30 + %235 = bitcast float %234 to i32, !dbg !26 + %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !26 + %237 = bitcast i32 %236 to float, !dbg !26 + %238 = tail call float @llvm.maxnum.f32(float %234, float %237), !dbg !30 + %239 = bitcast float %238 to <1 x i32>, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %32, <1 x i32> %239, i1 %31) #6, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %240 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %34, i1 %33) #6, !dbg !26 + %241 = bitcast i32 %240 to float, !dbg !26 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !26 + %243 = bitcast i32 %242 to float, !dbg !26 + %244 = tail call float @llvm.maxnum.f32(float %241, float %243), !dbg !30 + %245 = bitcast float %244 to i32, !dbg !26 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 8, i32 31), !dbg !26 + %247 = bitcast i32 %246 to float, !dbg !26 + %248 = tail call float @llvm.maxnum.f32(float %244, float %247), !dbg !30 + %249 = bitcast float %248 to i32, !dbg !26 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 4, i32 31), !dbg !26 + %251 = bitcast i32 %250 to float, !dbg !26 + %252 = tail call float @llvm.maxnum.f32(float %248, float %251), !dbg !30 + %253 = bitcast float %252 to i32, !dbg !26 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 2, i32 31), !dbg !26 + %255 = bitcast i32 %254 to float, !dbg !26 + %256 = tail call float @llvm.maxnum.f32(float %252, float %255), !dbg !30 + %257 = bitcast float %256 to i32, !dbg !26 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !26 + %259 = bitcast i32 %258 to float, !dbg !26 + %260 = tail call float @llvm.maxnum.f32(float %256, float %259), !dbg !30 + %261 = bitcast float %260 to <1 x i32>, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %34, <1 x i32> %261, i1 %35) #6, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %262 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !26 + %263 = tail call float @llvm.maxnum.f32(float %38, float %262), !dbg !31 + %264 = fsub float %38, %263, !dbg !32 + %265 = fmul float %264, 0x3FF7154760000000, !dbg !33 + %266 = tail call float @llvm.nvvm.ex2.approx.f(float %265), !dbg !33 + %267 = fmul float %37, %266, !dbg !34 + %268 = fsub float %124, %263, !dbg !35 + %269 = fsub float %125, %263, !dbg !35 + %270 = fsub float %126, %263, !dbg !35 + %271 = fsub float %127, %263, !dbg !35 + %272 = fsub float %128, %263, !dbg !35 + %273 = fsub float %129, %263, !dbg !35 + %274 = fsub float %130, %263, !dbg !35 + %275 = fsub float %131, %263, !dbg !35 + %276 = fsub float %132, %263, !dbg !35 + %277 = fsub float %133, %263, !dbg !35 + %278 = fsub float %134, %263, !dbg !35 + %279 = fsub float %135, %263, !dbg !35 + %280 = fsub float %136, %263, !dbg !35 + %281 = fsub float %137, %263, !dbg !35 + %282 = fsub float %138, %263, !dbg !35 + %283 = fsub float %139, %263, !dbg !35 + %284 = fsub float %140, %263, !dbg !35 + %285 = fsub float %141, %263, !dbg !35 + %286 = fsub float %142, %263, !dbg !35 + %287 = fsub float %143, %263, !dbg !35 + %288 = fsub float %144, %263, !dbg !35 + %289 = fsub float %145, %263, !dbg !35 + %290 = fsub float %146, %263, !dbg !35 + %291 = fsub float %147, %263, !dbg !35 + %292 = fsub float %148, %263, !dbg !35 + %293 = fsub float %149, %263, !dbg !35 + %294 = fsub float %150, %263, !dbg !35 + %295 = fsub float %151, %263, !dbg !35 + %296 = fsub float %152, %263, !dbg !35 + %297 = fsub float %153, %263, !dbg !35 + %298 = fsub float %154, %263, !dbg !35 + %299 = fsub float %155, %263, !dbg !35 + %300 = fmul float %268, 0x3FF7154760000000, !dbg !36 + %301 = tail call float @llvm.nvvm.ex2.approx.f(float %300), !dbg !36 + %302 = fmul float %269, 0x3FF7154760000000, !dbg !36 + %303 = tail call float @llvm.nvvm.ex2.approx.f(float %302), !dbg !36 + %304 = fmul float %270, 0x3FF7154760000000, !dbg !36 + %305 = tail call float @llvm.nvvm.ex2.approx.f(float %304), !dbg !36 + %306 = fmul float %271, 0x3FF7154760000000, !dbg !36 + %307 = tail call float @llvm.nvvm.ex2.approx.f(float %306), !dbg !36 + %308 = fmul float %272, 0x3FF7154760000000, !dbg !36 + %309 = tail call float @llvm.nvvm.ex2.approx.f(float %308), !dbg !36 + %310 = fmul float %273, 0x3FF7154760000000, !dbg !36 + %311 = tail call float @llvm.nvvm.ex2.approx.f(float %310), !dbg !36 + %312 = fmul float %274, 0x3FF7154760000000, !dbg !36 + %313 = tail call float @llvm.nvvm.ex2.approx.f(float %312), !dbg !36 + %314 = fmul float %275, 0x3FF7154760000000, !dbg !36 + %315 = tail call float @llvm.nvvm.ex2.approx.f(float %314), !dbg !36 + %316 = fmul float %276, 0x3FF7154760000000, !dbg !36 + %317 = tail call float @llvm.nvvm.ex2.approx.f(float %316), !dbg !36 + %318 = fmul float %277, 0x3FF7154760000000, !dbg !36 + %319 = tail call float @llvm.nvvm.ex2.approx.f(float %318), !dbg !36 + %320 = fmul float %278, 0x3FF7154760000000, !dbg !36 + %321 = tail call float @llvm.nvvm.ex2.approx.f(float %320), !dbg !36 + %322 = fmul float %279, 0x3FF7154760000000, !dbg !36 + %323 = tail call float @llvm.nvvm.ex2.approx.f(float %322), !dbg !36 + %324 = fmul float %280, 0x3FF7154760000000, !dbg !36 + %325 = tail call float @llvm.nvvm.ex2.approx.f(float %324), !dbg !36 + %326 = fmul float %281, 0x3FF7154760000000, !dbg !36 + %327 = tail call float @llvm.nvvm.ex2.approx.f(float %326), !dbg !36 + %328 = fmul float %282, 0x3FF7154760000000, !dbg !36 + %329 = tail call float @llvm.nvvm.ex2.approx.f(float %328), !dbg !36 + %330 = fmul float %283, 0x3FF7154760000000, !dbg !36 + %331 = tail call float @llvm.nvvm.ex2.approx.f(float %330), !dbg !36 + %332 = fmul float %284, 0x3FF7154760000000, !dbg !36 + %333 = tail call float @llvm.nvvm.ex2.approx.f(float %332), !dbg !36 + %334 = fmul float %285, 0x3FF7154760000000, !dbg !36 + %335 = tail call float @llvm.nvvm.ex2.approx.f(float %334), !dbg !36 + %336 = fmul float %286, 0x3FF7154760000000, !dbg !36 + %337 = tail call float @llvm.nvvm.ex2.approx.f(float %336), !dbg !36 + %338 = fmul float %287, 0x3FF7154760000000, !dbg !36 + %339 = tail call float @llvm.nvvm.ex2.approx.f(float %338), !dbg !36 + %340 = fmul float %288, 0x3FF7154760000000, !dbg !36 + %341 = tail call float @llvm.nvvm.ex2.approx.f(float %340), !dbg !36 + %342 = fmul float %289, 0x3FF7154760000000, !dbg !36 + %343 = tail call float @llvm.nvvm.ex2.approx.f(float %342), !dbg !36 + %344 = fmul float %290, 0x3FF7154760000000, !dbg !36 + %345 = tail call float @llvm.nvvm.ex2.approx.f(float %344), !dbg !36 + %346 = fmul float %291, 0x3FF7154760000000, !dbg !36 + %347 = tail call float @llvm.nvvm.ex2.approx.f(float %346), !dbg !36 + %348 = fmul float %292, 0x3FF7154760000000, !dbg !36 + %349 = tail call float @llvm.nvvm.ex2.approx.f(float %348), !dbg !36 + %350 = fmul float %293, 0x3FF7154760000000, !dbg !36 + %351 = tail call float @llvm.nvvm.ex2.approx.f(float %350), !dbg !36 + %352 = fmul float %294, 0x3FF7154760000000, !dbg !36 + %353 = tail call float @llvm.nvvm.ex2.approx.f(float %352), !dbg !36 + %354 = fmul float %295, 0x3FF7154760000000, !dbg !36 + %355 = tail call float @llvm.nvvm.ex2.approx.f(float %354), !dbg !36 + %356 = fmul float %296, 0x3FF7154760000000, !dbg !36 + %357 = tail call float @llvm.nvvm.ex2.approx.f(float %356), !dbg !36 + %358 = fmul float %297, 0x3FF7154760000000, !dbg !36 + %359 = tail call float @llvm.nvvm.ex2.approx.f(float %358), !dbg !36 + %360 = fmul float %298, 0x3FF7154760000000, !dbg !36 + %361 = tail call float @llvm.nvvm.ex2.approx.f(float %360), !dbg !36 + %362 = fmul float %299, 0x3FF7154760000000, !dbg !36 + %363 = tail call float @llvm.nvvm.ex2.approx.f(float %362), !dbg !36 + %364 = select i1 %45, float %317, float 0.000000e+00, !dbg !37 + %365 = select i1 %45, float %319, float 0.000000e+00, !dbg !37 + %366 = select i1 %45, float %321, float 0.000000e+00, !dbg !37 + %367 = select i1 %45, float %323, float 0.000000e+00, !dbg !37 + %368 = select i1 %45, float %325, float 0.000000e+00, !dbg !37 + %369 = select i1 %45, float %327, float 0.000000e+00, !dbg !37 + %370 = select i1 %45, float %329, float 0.000000e+00, !dbg !37 + %371 = select i1 %45, float %331, float 0.000000e+00, !dbg !37 + %372 = select i1 %46, float %333, float 0.000000e+00, !dbg !37 + %373 = select i1 %46, float %335, float 0.000000e+00, !dbg !37 + %374 = select i1 %46, float %337, float 0.000000e+00, !dbg !37 + %375 = select i1 %46, float %339, float 0.000000e+00, !dbg !37 + %376 = select i1 %46, float %341, float 0.000000e+00, !dbg !37 + %377 = select i1 %46, float %343, float 0.000000e+00, !dbg !37 + %378 = select i1 %46, float %345, float 0.000000e+00, !dbg !37 + %379 = select i1 %46, float %347, float 0.000000e+00, !dbg !37 + %380 = select i1 %47, float %349, float 0.000000e+00, !dbg !37 + %381 = select i1 %47, float %351, float 0.000000e+00, !dbg !37 + %382 = select i1 %47, float %353, float 0.000000e+00, !dbg !37 + %383 = select i1 %47, float %355, float 0.000000e+00, !dbg !37 + %384 = select i1 %47, float %357, float 0.000000e+00, !dbg !37 + %385 = select i1 %47, float %359, float 0.000000e+00, !dbg !37 + %386 = select i1 %47, float %361, float 0.000000e+00, !dbg !37 + %387 = select i1 %47, float %363, float 0.000000e+00, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %388 = fadd float %301, %303, !dbg !40 + %389 = fadd float %388, %305, !dbg !40 + %390 = fadd float %389, %307, !dbg !40 + %391 = fadd float %390, %309, !dbg !40 + %392 = fadd float %391, %311, !dbg !40 + %393 = fadd float %392, %313, !dbg !40 + %394 = fadd float %393, %315, !dbg !40 + %395 = select i1 %44, float %394, float 0.000000e+00, !dbg !40 + %396 = fadd float %395, %364, !dbg !40 + %397 = fadd float %396, %365, !dbg !40 + %398 = fadd float %397, %366, !dbg !40 + %399 = fadd float %398, %367, !dbg !40 + %400 = fadd float %399, %368, !dbg !40 + %401 = fadd float %400, %369, !dbg !40 + %402 = fadd float %401, %370, !dbg !40 + %403 = fadd float %402, %371, !dbg !40 + %404 = fadd float %403, %372, !dbg !40 + %405 = fadd float %404, %373, !dbg !40 + %406 = fadd float %405, %374, !dbg !40 + %407 = fadd float %406, %375, !dbg !40 + %408 = fadd float %407, %376, !dbg !40 + %409 = fadd float %408, %377, !dbg !40 + %410 = fadd float %409, %378, !dbg !40 + %411 = fadd float %410, %379, !dbg !40 + %412 = fadd float %411, %380, !dbg !40 + %413 = fadd float %412, %381, !dbg !40 + %414 = fadd float %413, %382, !dbg !40 + %415 = fadd float %414, %383, !dbg !40 + %416 = fadd float %415, %384, !dbg !40 + %417 = fadd float %416, %385, !dbg !40 + %418 = fadd float %417, %386, !dbg !40 + %419 = fadd float %418, %387, !dbg !40 + %420 = bitcast float %419 to i32, !dbg !38 + %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 16, i32 31), !dbg !38 + %422 = bitcast i32 %421 to float, !dbg !38 + %423 = fadd float %419, %422, !dbg !40 + %424 = bitcast float %423 to i32, !dbg !38 + %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 8, i32 31), !dbg !38 + %426 = bitcast i32 %425 to float, !dbg !38 + %427 = fadd float %423, %426, !dbg !40 + %428 = bitcast float %427 to i32, !dbg !38 + %429 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %428, i32 4, i32 31), !dbg !38 + %430 = bitcast i32 %429 to float, !dbg !38 + %431 = fadd float %427, %430, !dbg !40 + %432 = bitcast float %431 to i32, !dbg !38 + %433 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %432, i32 2, i32 31), !dbg !38 + %434 = bitcast i32 %433 to float, !dbg !38 + %435 = fadd float %431, %434, !dbg !40 + %436 = bitcast float %435 to i32, !dbg !38 + %437 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %436, i32 1, i32 31), !dbg !38 + %438 = bitcast i32 %437 to float, !dbg !38 + %439 = fadd float %435, %438, !dbg !40 + %440 = bitcast float %439 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %32, <1 x i32> %440, i1 %31) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %441 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %34, i1 %33) #6, !dbg !38 + %442 = bitcast i32 %441 to float, !dbg !38 + %443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 16, i32 31), !dbg !38 + %444 = bitcast i32 %443 to float, !dbg !38 + %445 = fadd float %442, %444, !dbg !40 + %446 = bitcast float %445 to i32, !dbg !38 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 8, i32 31), !dbg !38 + %448 = bitcast i32 %447 to float, !dbg !38 + %449 = fadd float %445, %448, !dbg !40 + %450 = bitcast float %449 to i32, !dbg !38 + %451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %450, i32 4, i32 31), !dbg !38 + %452 = bitcast i32 %451 to float, !dbg !38 + %453 = fadd float %449, %452, !dbg !40 + %454 = bitcast float %453 to i32, !dbg !38 + %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 2, i32 31), !dbg !38 + %456 = bitcast i32 %455 to float, !dbg !38 + %457 = fadd float %453, %456, !dbg !40 + %458 = bitcast float %457 to i32, !dbg !38 + %459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 1, i32 31), !dbg !38 + %460 = bitcast i32 %459 to float, !dbg !38 + %461 = fadd float %457, %460, !dbg !40 + %462 = bitcast float %461 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %34, <1 x i32> %462, i1 %35) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %463 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !38 + %464 = fadd float %267, %463, !dbg !41 + %465 = add i32 %39, 32768, !dbg !19 + %466 = icmp slt i32 %465, %8, !dbg !19 + br i1 %466, label %36, label %._crit_edge, !dbg !19 + +._crit_edge: ; preds = %36, %22 + %.lcssa4 = phi float [ 0xFFF0000000000000, %22 ], [ %263, %36 ], !dbg !42 + %.lcssa3 = phi float [ 0.000000e+00, %22 ], [ %464, %36 ], !dbg !43 + %467 = fcmp olt float %.lcssa3, 0x3810000000000000, !dbg !44 + %468 = fmul float %.lcssa3, 0x4160000000000000, !dbg !44 + %.02.i = select i1 %467, float %468, float %.lcssa3, !dbg !44 + %i.i.0.i = select i1 %467, float -2.300000e+01, float 0.000000e+00, !dbg !44 + %469 = bitcast float %.02.i to i32, !dbg !44 + %470 = add i32 %469, -1059760811, !dbg !44 + %471 = and i32 %470, -8388608, !dbg !44 + %472 = sub i32 %469, %471, !dbg !44 + %473 = bitcast i32 %472 to float, !dbg !44 + %474 = sitofp i32 %471 to float, !dbg !44 + %475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %475, 0, !dbg !44 + %476 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %474, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44 + %477 = tail call float @llvm.nvvm.fma.rn.f(float %474, float 0x3E80000000000000, float %i.i.0.i) #6, !dbg !44 + %.08.i = select i1 %.not.i, float %477, float %476, !dbg !44 + %478 = fadd float %473, -1.000000e+00, !dbg !44 + %479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %479, 0, !dbg !44 + %480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %478, float 0x3FC2073EC0000000) #6, !dbg !44 + %481 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %478, float 0x3FC2073EC0000000) #6, !dbg !44 + %.010.i = select i1 %.not1.i, float %481, float %480, !dbg !44 + %482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i = icmp eq i32 %482, 0, !dbg !44 + %483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %478, float 0xBFBF19B980000000) #6, !dbg !44 + %484 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %478, float 0xBFBF19B980000000) #6, !dbg !44 + %.011.i = select i1 %.not2.i, float %484, float %483, !dbg !44 + %485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %485, 0, !dbg !44 + %486 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %478, float 0x3FC1E52AA0000000) #6, !dbg !44 + %487 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %478, float 0x3FC1E52AA0000000) #6, !dbg !44 + %.012.i = select i1 %.not3.i, float %487, float %486, !dbg !44 + %488 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %488, 0, !dbg !44 + %489 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %478, float 0xBFC55B1720000000) #6, !dbg !44 + %490 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %478, float 0xBFC55B1720000000) #6, !dbg !44 + %.09.i = select i1 %.not4.i, float %490, float %489, !dbg !44 + %491 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not5.i = icmp eq i32 %491, 0, !dbg !44 + %492 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %478, float 0x3FC99DA160000000) #6, !dbg !44 + %493 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %478, float 0x3FC99DA160000000) #6, !dbg !44 + %.05.i = select i1 %.not5.i, float %493, float %492, !dbg !44 + %494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not6.i = icmp eq i32 %494, 0, !dbg !44 + %495 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %478, float 0xBFCFFFE440000000) #6, !dbg !44 + %496 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %478, float 0xBFCFFFE440000000) #6, !dbg !44 + %.01.i = select i1 %.not6.i, float %496, float %495, !dbg !44 + %497 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not7.i = icmp eq i32 %497, 0, !dbg !44 + %498 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %478, float 0x3FD5554F00000000) #6, !dbg !44 + %499 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %478, float 0x3FD5554F00000000) #6, !dbg !44 + %.0.i = select i1 %.not7.i, float %499, float %498, !dbg !44 + %500 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not8.i = icmp eq i32 %500, 0, !dbg !44 + %501 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i, float %478, float -5.000000e-01) #6, !dbg !44 + %502 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i, float %478, float -5.000000e-01) #6, !dbg !44 + %.07.i = select i1 %.not8.i, float %502, float %501, !dbg !44 + %503 = fmul float %478, %.07.i, !dbg !44 + %504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not9.i = icmp eq i32 %504, 0, !dbg !44 + %505 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float %478, float %478) #6, !dbg !44 + %506 = tail call float @llvm.nvvm.fma.rn.f(float %503, float %478, float %478) #6, !dbg !44 + %.06.i = select i1 %.not9.i, float %506, float %505, !dbg !44 + %507 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not10.i = icmp eq i32 %507, 0, !dbg !44 + %508 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #6, !dbg !44 + %509 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #6, !dbg !44 + %.04.i = select i1 %.not10.i, float %509, float %508, !dbg !44 + %510 = icmp ugt i32 %469, 2139095039, !dbg !44 + br i1 %510, label %__nv_fmaf_rn.exit.i.i, label %__nv_logf.exit, !dbg !44 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge + %511 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not11.i = icmp eq i32 %511, 0, !dbg !44 + %512 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44 + %513 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #6, !dbg !44 + %.03.i = select i1 %.not11.i, float %513, float %512, !dbg !44 + br label %__nv_logf.exit, !dbg !44 + +__nv_logf.exit: ; preds = %._crit_edge, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %.04.i, %._crit_edge ], !dbg !44 + %514 = fcmp oeq float %.02.i, 0.000000e+00, !dbg !44 + %r.i.1.i = select i1 %514, float 0xFFF0000000000000, float %r.i.0.i, !dbg !44 + br i1 %30, label %.lr.ph6, label %._crit_edge7, !dbg !45 + +.lr.ph6: ; preds = %__nv_logf.exit + %515 = icmp eq i32 %24, 0 + %516 = getelementptr float, ptr addrspace(3) @global_smem, i32 %25 + %517 = icmp samesign ult i32 %23, 32 + %518 = getelementptr float, ptr addrspace(3) @global_smem, i32 %23 + %519 = icmp eq i32 %23, 0 + br label %520, !dbg !45 + +520: ; preds = %.lr.ph6, %520 + %521 = phi float [ 0.000000e+00, %.lr.ph6 ], [ %915, %520 ] + %522 = phi i32 [ 0, %.lr.ph6 ], [ %916, %520 ] + %523 = or disjoint i32 %522, %26, !dbg !46 + %524 = or disjoint i32 %522, %27, !dbg !46 + %525 = or disjoint i32 %522, %28, !dbg !46 + %526 = or disjoint i32 %522, %29, !dbg !46 + %527 = icmp slt i32 %523, %8, !dbg !47 + %528 = icmp slt i32 %524, %8, !dbg !47 + %529 = icmp slt i32 %525, %8, !dbg !47 + %530 = icmp slt i32 %526, %8, !dbg !47 + %531 = sext i32 %523 to i64, !dbg !48 + %532 = getelementptr bfloat, ptr addrspace(1) %16, i64 %531, !dbg !48 + %533 = sext i32 %524 to i64, !dbg !48 + %534 = getelementptr bfloat, ptr addrspace(1) %16, i64 %533, !dbg !48 + %535 = sext i32 %525 to i64, !dbg !48 + %536 = getelementptr bfloat, ptr addrspace(1) %16, i64 %535, !dbg !48 + %537 = sext i32 %526 to i64, !dbg !48 + %538 = getelementptr bfloat, ptr addrspace(1) %16, i64 %537, !dbg !48 + %539 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %532, i1 %527) #6, !dbg !49 + %540 = extractvalue { i32, i32, i32, i32 } %539, 0, !dbg !49 + %541 = bitcast i32 %540 to <2 x bfloat>, !dbg !49 + %542 = extractvalue { i32, i32, i32, i32 } %539, 1, !dbg !49 + %543 = bitcast i32 %542 to <2 x bfloat>, !dbg !49 + %544 = extractvalue { i32, i32, i32, i32 } %539, 2, !dbg !49 + %545 = bitcast i32 %544 to <2 x bfloat>, !dbg !49 + %546 = extractvalue { i32, i32, i32, i32 } %539, 3, !dbg !49 + %547 = bitcast i32 %546 to <2 x bfloat>, !dbg !49 + %548 = extractelement <2 x bfloat> %541, i64 0, !dbg !49 + %549 = extractelement <2 x bfloat> %541, i64 1, !dbg !49 + %550 = extractelement <2 x bfloat> %543, i64 0, !dbg !49 + %551 = extractelement <2 x bfloat> %543, i64 1, !dbg !49 + %552 = extractelement <2 x bfloat> %545, i64 0, !dbg !49 + %553 = extractelement <2 x bfloat> %545, i64 1, !dbg !49 + %554 = extractelement <2 x bfloat> %547, i64 0, !dbg !49 + %555 = extractelement <2 x bfloat> %547, i64 1, !dbg !49 + %556 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %534, i1 %528) #6, !dbg !49 + %557 = extractvalue { i32, i32, i32, i32 } %556, 0, !dbg !49 + %558 = bitcast i32 %557 to <2 x bfloat>, !dbg !49 + %559 = extractvalue { i32, i32, i32, i32 } %556, 1, !dbg !49 + %560 = bitcast i32 %559 to <2 x bfloat>, !dbg !49 + %561 = extractvalue { i32, i32, i32, i32 } %556, 2, !dbg !49 + %562 = bitcast i32 %561 to <2 x bfloat>, !dbg !49 + %563 = extractvalue { i32, i32, i32, i32 } %556, 3, !dbg !49 + %564 = bitcast i32 %563 to <2 x bfloat>, !dbg !49 + %565 = extractelement <2 x bfloat> %558, i64 0, !dbg !49 + %566 = extractelement <2 x bfloat> %558, i64 1, !dbg !49 + %567 = extractelement <2 x bfloat> %560, i64 0, !dbg !49 + %568 = extractelement <2 x bfloat> %560, i64 1, !dbg !49 + %569 = extractelement <2 x bfloat> %562, i64 0, !dbg !49 + %570 = extractelement <2 x bfloat> %562, i64 1, !dbg !49 + %571 = extractelement <2 x bfloat> %564, i64 0, !dbg !49 + %572 = extractelement <2 x bfloat> %564, i64 1, !dbg !49 + %573 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %536, i1 %529) #6, !dbg !49 + %574 = extractvalue { i32, i32, i32, i32 } %573, 0, !dbg !49 + %575 = bitcast i32 %574 to <2 x bfloat>, !dbg !49 + %576 = extractvalue { i32, i32, i32, i32 } %573, 1, !dbg !49 + %577 = bitcast i32 %576 to <2 x bfloat>, !dbg !49 + %578 = extractvalue { i32, i32, i32, i32 } %573, 2, !dbg !49 + %579 = bitcast i32 %578 to <2 x bfloat>, !dbg !49 + %580 = extractvalue { i32, i32, i32, i32 } %573, 3, !dbg !49 + %581 = bitcast i32 %580 to <2 x bfloat>, !dbg !49 + %582 = extractelement <2 x bfloat> %575, i64 0, !dbg !49 + %583 = extractelement <2 x bfloat> %575, i64 1, !dbg !49 + %584 = extractelement <2 x bfloat> %577, i64 0, !dbg !49 + %585 = extractelement <2 x bfloat> %577, i64 1, !dbg !49 + %586 = extractelement <2 x bfloat> %579, i64 0, !dbg !49 + %587 = extractelement <2 x bfloat> %579, i64 1, !dbg !49 + %588 = extractelement <2 x bfloat> %581, i64 0, !dbg !49 + %589 = extractelement <2 x bfloat> %581, i64 1, !dbg !49 + %590 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %538, i1 %530) #6, !dbg !49 + %591 = extractvalue { i32, i32, i32, i32 } %590, 0, !dbg !49 + %592 = bitcast i32 %591 to <2 x bfloat>, !dbg !49 + %593 = extractvalue { i32, i32, i32, i32 } %590, 1, !dbg !49 + %594 = bitcast i32 %593 to <2 x bfloat>, !dbg !49 + %595 = extractvalue { i32, i32, i32, i32 } %590, 2, !dbg !49 + %596 = bitcast i32 %595 to <2 x bfloat>, !dbg !49 + %597 = extractvalue { i32, i32, i32, i32 } %590, 3, !dbg !49 + %598 = bitcast i32 %597 to <2 x bfloat>, !dbg !49 + %599 = extractelement <2 x bfloat> %592, i64 0, !dbg !49 + %600 = extractelement <2 x bfloat> %592, i64 1, !dbg !49 + %601 = extractelement <2 x bfloat> %594, i64 0, !dbg !49 + %602 = extractelement <2 x bfloat> %594, i64 1, !dbg !49 + %603 = extractelement <2 x bfloat> %596, i64 0, !dbg !49 + %604 = extractelement <2 x bfloat> %596, i64 1, !dbg !49 + %605 = extractelement <2 x bfloat> %598, i64 0, !dbg !49 + %606 = extractelement <2 x bfloat> %598, i64 1, !dbg !49 + %607 = fpext bfloat %548 to float, !dbg !50 + %608 = fpext bfloat %549 to float, !dbg !50 + %609 = fpext bfloat %550 to float, !dbg !50 + %610 = fpext bfloat %551 to float, !dbg !50 + %611 = fpext bfloat %552 to float, !dbg !50 + %612 = fpext bfloat %553 to float, !dbg !50 + %613 = fpext bfloat %554 to float, !dbg !50 + %614 = fpext bfloat %555 to float, !dbg !50 + %615 = fpext bfloat %565 to float, !dbg !50 + %616 = fpext bfloat %566 to float, !dbg !50 + %617 = fpext bfloat %567 to float, !dbg !50 + %618 = fpext bfloat %568 to float, !dbg !50 + %619 = fpext bfloat %569 to float, !dbg !50 + %620 = fpext bfloat %570 to float, !dbg !50 + %621 = fpext bfloat %571 to float, !dbg !50 + %622 = fpext bfloat %572 to float, !dbg !50 + %623 = fpext bfloat %582 to float, !dbg !50 + %624 = fpext bfloat %583 to float, !dbg !50 + %625 = fpext bfloat %584 to float, !dbg !50 + %626 = fpext bfloat %585 to float, !dbg !50 + %627 = fpext bfloat %586 to float, !dbg !50 + %628 = fpext bfloat %587 to float, !dbg !50 + %629 = fpext bfloat %588 to float, !dbg !50 + %630 = fpext bfloat %589 to float, !dbg !50 + %631 = fpext bfloat %599 to float, !dbg !50 + %632 = fpext bfloat %600 to float, !dbg !50 + %633 = fpext bfloat %601 to float, !dbg !50 + %634 = fpext bfloat %602 to float, !dbg !50 + %635 = fpext bfloat %603 to float, !dbg !50 + %636 = fpext bfloat %604 to float, !dbg !50 + %637 = fpext bfloat %605 to float, !dbg !50 + %638 = fpext bfloat %606 to float, !dbg !50 + %639 = getelementptr float, ptr addrspace(1) %19, i64 %531, !dbg !51 + %640 = getelementptr i8, ptr addrspace(1) %639, i64 16, !dbg !51 + %641 = getelementptr float, ptr addrspace(1) %19, i64 %533, !dbg !51 + %642 = getelementptr i8, ptr addrspace(1) %639, i64 32784, !dbg !51 + %643 = getelementptr float, ptr addrspace(1) %19, i64 %535, !dbg !51 + %644 = getelementptr i8, ptr addrspace(1) %639, i64 65552, !dbg !51 + %645 = getelementptr float, ptr addrspace(1) %19, i64 %537, !dbg !51 + %646 = getelementptr i8, ptr addrspace(1) %639, i64 98320, !dbg !51 + %647 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %639, i1 %527) #6, !dbg !52 + %648 = extractvalue { i32, i32, i32, i32 } %647, 0, !dbg !52 + %649 = extractvalue { i32, i32, i32, i32 } %647, 1, !dbg !52 + %650 = extractvalue { i32, i32, i32, i32 } %647, 2, !dbg !52 + %651 = extractvalue { i32, i32, i32, i32 } %647, 3, !dbg !52 + %652 = bitcast i32 %648 to float, !dbg !52 + %653 = bitcast i32 %649 to float, !dbg !52 + %654 = bitcast i32 %650 to float, !dbg !52 + %655 = bitcast i32 %651 to float, !dbg !52 + %656 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %640, i1 %527) #6, !dbg !52 + %657 = extractvalue { i32, i32, i32, i32 } %656, 0, !dbg !52 + %658 = extractvalue { i32, i32, i32, i32 } %656, 1, !dbg !52 + %659 = extractvalue { i32, i32, i32, i32 } %656, 2, !dbg !52 + %660 = extractvalue { i32, i32, i32, i32 } %656, 3, !dbg !52 + %661 = bitcast i32 %657 to float, !dbg !52 + %662 = bitcast i32 %658 to float, !dbg !52 + %663 = bitcast i32 %659 to float, !dbg !52 + %664 = bitcast i32 %660 to float, !dbg !52 + %665 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %641, i1 %528) #6, !dbg !52 + %666 = extractvalue { i32, i32, i32, i32 } %665, 0, !dbg !52 + %667 = extractvalue { i32, i32, i32, i32 } %665, 1, !dbg !52 + %668 = extractvalue { i32, i32, i32, i32 } %665, 2, !dbg !52 + %669 = extractvalue { i32, i32, i32, i32 } %665, 3, !dbg !52 + %670 = bitcast i32 %666 to float, !dbg !52 + %671 = bitcast i32 %667 to float, !dbg !52 + %672 = bitcast i32 %668 to float, !dbg !52 + %673 = bitcast i32 %669 to float, !dbg !52 + %674 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %642, i1 %528) #6, !dbg !52 + %675 = extractvalue { i32, i32, i32, i32 } %674, 0, !dbg !52 + %676 = extractvalue { i32, i32, i32, i32 } %674, 1, !dbg !52 + %677 = extractvalue { i32, i32, i32, i32 } %674, 2, !dbg !52 + %678 = extractvalue { i32, i32, i32, i32 } %674, 3, !dbg !52 + %679 = bitcast i32 %675 to float, !dbg !52 + %680 = bitcast i32 %676 to float, !dbg !52 + %681 = bitcast i32 %677 to float, !dbg !52 + %682 = bitcast i32 %678 to float, !dbg !52 + %683 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %643, i1 %529) #6, !dbg !52 + %684 = extractvalue { i32, i32, i32, i32 } %683, 0, !dbg !52 + %685 = extractvalue { i32, i32, i32, i32 } %683, 1, !dbg !52 + %686 = extractvalue { i32, i32, i32, i32 } %683, 2, !dbg !52 + %687 = extractvalue { i32, i32, i32, i32 } %683, 3, !dbg !52 + %688 = bitcast i32 %684 to float, !dbg !52 + %689 = bitcast i32 %685 to float, !dbg !52 + %690 = bitcast i32 %686 to float, !dbg !52 + %691 = bitcast i32 %687 to float, !dbg !52 + %692 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %644, i1 %529) #6, !dbg !52 + %693 = extractvalue { i32, i32, i32, i32 } %692, 0, !dbg !52 + %694 = extractvalue { i32, i32, i32, i32 } %692, 1, !dbg !52 + %695 = extractvalue { i32, i32, i32, i32 } %692, 2, !dbg !52 + %696 = extractvalue { i32, i32, i32, i32 } %692, 3, !dbg !52 + %697 = bitcast i32 %693 to float, !dbg !52 + %698 = bitcast i32 %694 to float, !dbg !52 + %699 = bitcast i32 %695 to float, !dbg !52 + %700 = bitcast i32 %696 to float, !dbg !52 + %701 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %645, i1 %530) #6, !dbg !52 + %702 = extractvalue { i32, i32, i32, i32 } %701, 0, !dbg !52 + %703 = extractvalue { i32, i32, i32, i32 } %701, 1, !dbg !52 + %704 = extractvalue { i32, i32, i32, i32 } %701, 2, !dbg !52 + %705 = extractvalue { i32, i32, i32, i32 } %701, 3, !dbg !52 + %706 = bitcast i32 %702 to float, !dbg !52 + %707 = bitcast i32 %703 to float, !dbg !52 + %708 = bitcast i32 %704 to float, !dbg !52 + %709 = bitcast i32 %705 to float, !dbg !52 + %710 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %646, i1 %530) #6, !dbg !52 + %711 = extractvalue { i32, i32, i32, i32 } %710, 0, !dbg !52 + %712 = extractvalue { i32, i32, i32, i32 } %710, 1, !dbg !52 + %713 = extractvalue { i32, i32, i32, i32 } %710, 2, !dbg !52 + %714 = extractvalue { i32, i32, i32, i32 } %710, 3, !dbg !52 + %715 = bitcast i32 %711 to float, !dbg !52 + %716 = bitcast i32 %712 to float, !dbg !52 + %717 = bitcast i32 %713 to float, !dbg !52 + %718 = bitcast i32 %714 to float, !dbg !52 + %719 = fsub float %607, %.lcssa4, !dbg !53 + %720 = fsub float %608, %.lcssa4, !dbg !53 + %721 = fsub float %609, %.lcssa4, !dbg !53 + %722 = fsub float %610, %.lcssa4, !dbg !53 + %723 = fsub float %611, %.lcssa4, !dbg !53 + %724 = fsub float %612, %.lcssa4, !dbg !53 + %725 = fsub float %613, %.lcssa4, !dbg !53 + %726 = fsub float %614, %.lcssa4, !dbg !53 + %727 = fsub float %615, %.lcssa4, !dbg !53 + %728 = fsub float %616, %.lcssa4, !dbg !53 + %729 = fsub float %617, %.lcssa4, !dbg !53 + %730 = fsub float %618, %.lcssa4, !dbg !53 + %731 = fsub float %619, %.lcssa4, !dbg !53 + %732 = fsub float %620, %.lcssa4, !dbg !53 + %733 = fsub float %621, %.lcssa4, !dbg !53 + %734 = fsub float %622, %.lcssa4, !dbg !53 + %735 = fsub float %623, %.lcssa4, !dbg !53 + %736 = fsub float %624, %.lcssa4, !dbg !53 + %737 = fsub float %625, %.lcssa4, !dbg !53 + %738 = fsub float %626, %.lcssa4, !dbg !53 + %739 = fsub float %627, %.lcssa4, !dbg !53 + %740 = fsub float %628, %.lcssa4, !dbg !53 + %741 = fsub float %629, %.lcssa4, !dbg !53 + %742 = fsub float %630, %.lcssa4, !dbg !53 + %743 = fsub float %631, %.lcssa4, !dbg !53 + %744 = fsub float %632, %.lcssa4, !dbg !53 + %745 = fsub float %633, %.lcssa4, !dbg !53 + %746 = fsub float %634, %.lcssa4, !dbg !53 + %747 = fsub float %635, %.lcssa4, !dbg !53 + %748 = fsub float %636, %.lcssa4, !dbg !53 + %749 = fsub float %637, %.lcssa4, !dbg !53 + %750 = fsub float %638, %.lcssa4, !dbg !53 + %751 = fsub float %719, %r.i.1.i, !dbg !54 + %752 = fsub float %720, %r.i.1.i, !dbg !54 + %753 = fsub float %721, %r.i.1.i, !dbg !54 + %754 = fsub float %722, %r.i.1.i, !dbg !54 + %755 = fsub float %723, %r.i.1.i, !dbg !54 + %756 = fsub float %724, %r.i.1.i, !dbg !54 + %757 = fsub float %725, %r.i.1.i, !dbg !54 + %758 = fsub float %726, %r.i.1.i, !dbg !54 + %759 = fsub float %727, %r.i.1.i, !dbg !54 + %760 = fsub float %728, %r.i.1.i, !dbg !54 + %761 = fsub float %729, %r.i.1.i, !dbg !54 + %762 = fsub float %730, %r.i.1.i, !dbg !54 + %763 = fsub float %731, %r.i.1.i, !dbg !54 + %764 = fsub float %732, %r.i.1.i, !dbg !54 + %765 = fsub float %733, %r.i.1.i, !dbg !54 + %766 = fsub float %734, %r.i.1.i, !dbg !54 + %767 = fsub float %735, %r.i.1.i, !dbg !54 + %768 = fsub float %736, %r.i.1.i, !dbg !54 + %769 = fsub float %737, %r.i.1.i, !dbg !54 + %770 = fsub float %738, %r.i.1.i, !dbg !54 + %771 = fsub float %739, %r.i.1.i, !dbg !54 + %772 = fsub float %740, %r.i.1.i, !dbg !54 + %773 = fsub float %741, %r.i.1.i, !dbg !54 + %774 = fsub float %742, %r.i.1.i, !dbg !54 + %775 = fsub float %743, %r.i.1.i, !dbg !54 + %776 = fsub float %744, %r.i.1.i, !dbg !54 + %777 = fsub float %745, %r.i.1.i, !dbg !54 + %778 = fsub float %746, %r.i.1.i, !dbg !54 + %779 = fsub float %747, %r.i.1.i, !dbg !54 + %780 = fsub float %748, %r.i.1.i, !dbg !54 + %781 = fsub float %749, %r.i.1.i, !dbg !54 + %782 = fsub float %750, %r.i.1.i, !dbg !54 + %783 = fmul float %751, %652, !dbg !55 + %784 = fmul float %752, %653, !dbg !55 + %785 = fmul float %753, %654, !dbg !55 + %786 = fmul float %754, %655, !dbg !55 + %787 = fmul float %755, %661, !dbg !55 + %788 = fmul float %756, %662, !dbg !55 + %789 = fmul float %757, %663, !dbg !55 + %790 = fmul float %758, %664, !dbg !55 + %791 = fmul float %759, %670, !dbg !55 + %792 = fmul float %760, %671, !dbg !55 + %793 = fmul float %761, %672, !dbg !55 + %794 = fmul float %762, %673, !dbg !55 + %795 = fmul float %763, %679, !dbg !55 + %796 = fmul float %764, %680, !dbg !55 + %797 = fmul float %765, %681, !dbg !55 + %798 = fmul float %766, %682, !dbg !55 + %799 = fmul float %767, %688, !dbg !55 + %800 = fmul float %768, %689, !dbg !55 + %801 = fmul float %769, %690, !dbg !55 + %802 = fmul float %770, %691, !dbg !55 + %803 = fmul float %771, %697, !dbg !55 + %804 = fmul float %772, %698, !dbg !55 + %805 = fmul float %773, %699, !dbg !55 + %806 = fmul float %774, %700, !dbg !55 + %807 = fmul float %775, %706, !dbg !55 + %808 = fmul float %776, %707, !dbg !55 + %809 = fmul float %777, %708, !dbg !55 + %810 = fmul float %778, %709, !dbg !55 + %811 = fmul float %779, %715, !dbg !55 + %812 = fmul float %780, %716, !dbg !55 + %813 = fmul float %781, %717, !dbg !55 + %814 = fmul float %782, %718, !dbg !55 + %815 = select i1 %528, float %791, float 0.000000e+00, !dbg !56 + %816 = select i1 %528, float %792, float 0.000000e+00, !dbg !56 + %817 = select i1 %528, float %793, float 0.000000e+00, !dbg !56 + %818 = select i1 %528, float %794, float 0.000000e+00, !dbg !56 + %819 = select i1 %528, float %795, float 0.000000e+00, !dbg !56 + %820 = select i1 %528, float %796, float 0.000000e+00, !dbg !56 + %821 = select i1 %528, float %797, float 0.000000e+00, !dbg !56 + %822 = select i1 %528, float %798, float 0.000000e+00, !dbg !56 + %823 = select i1 %529, float %799, float 0.000000e+00, !dbg !56 + %824 = select i1 %529, float %800, float 0.000000e+00, !dbg !56 + %825 = select i1 %529, float %801, float 0.000000e+00, !dbg !56 + %826 = select i1 %529, float %802, float 0.000000e+00, !dbg !56 + %827 = select i1 %529, float %803, float 0.000000e+00, !dbg !56 + %828 = select i1 %529, float %804, float 0.000000e+00, !dbg !56 + %829 = select i1 %529, float %805, float 0.000000e+00, !dbg !56 + %830 = select i1 %529, float %806, float 0.000000e+00, !dbg !56 + %831 = select i1 %530, float %807, float 0.000000e+00, !dbg !56 + %832 = select i1 %530, float %808, float 0.000000e+00, !dbg !56 + %833 = select i1 %530, float %809, float 0.000000e+00, !dbg !56 + %834 = select i1 %530, float %810, float 0.000000e+00, !dbg !56 + %835 = select i1 %530, float %811, float 0.000000e+00, !dbg !56 + %836 = select i1 %530, float %812, float 0.000000e+00, !dbg !56 + %837 = select i1 %530, float %813, float 0.000000e+00, !dbg !56 + %838 = select i1 %530, float %814, float 0.000000e+00, !dbg !56 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !57 + %839 = fadd float %783, %784, !dbg !59 + %840 = fadd float %785, %839, !dbg !59 + %841 = fadd float %786, %840, !dbg !59 + %842 = fadd float %787, %841, !dbg !59 + %843 = fadd float %788, %842, !dbg !59 + %844 = fadd float %789, %843, !dbg !59 + %845 = fadd float %790, %844, !dbg !59 + %846 = select i1 %527, float %845, float 0.000000e+00, !dbg !59 + %847 = fadd float %815, %846, !dbg !59 + %848 = fadd float %816, %847, !dbg !59 + %849 = fadd float %817, %848, !dbg !59 + %850 = fadd float %818, %849, !dbg !59 + %851 = fadd float %819, %850, !dbg !59 + %852 = fadd float %820, %851, !dbg !59 + %853 = fadd float %821, %852, !dbg !59 + %854 = fadd float %822, %853, !dbg !59 + %855 = fadd float %823, %854, !dbg !59 + %856 = fadd float %824, %855, !dbg !59 + %857 = fadd float %825, %856, !dbg !59 + %858 = fadd float %826, %857, !dbg !59 + %859 = fadd float %827, %858, !dbg !59 + %860 = fadd float %828, %859, !dbg !59 + %861 = fadd float %829, %860, !dbg !59 + %862 = fadd float %830, %861, !dbg !59 + %863 = fadd float %831, %862, !dbg !59 + %864 = fadd float %832, %863, !dbg !59 + %865 = fadd float %833, %864, !dbg !59 + %866 = fadd float %834, %865, !dbg !59 + %867 = fadd float %835, %866, !dbg !59 + %868 = fadd float %836, %867, !dbg !59 + %869 = fadd float %837, %868, !dbg !59 + %870 = fadd float %838, %869, !dbg !59 + %871 = bitcast float %870 to i32, !dbg !57 + %872 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %871, i32 16, i32 31), !dbg !57 + %873 = bitcast i32 %872 to float, !dbg !57 + %874 = fadd float %870, %873, !dbg !59 + %875 = bitcast float %874 to i32, !dbg !57 + %876 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %875, i32 8, i32 31), !dbg !57 + %877 = bitcast i32 %876 to float, !dbg !57 + %878 = fadd float %874, %877, !dbg !59 + %879 = bitcast float %878 to i32, !dbg !57 + %880 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %879, i32 4, i32 31), !dbg !57 + %881 = bitcast i32 %880 to float, !dbg !57 + %882 = fadd float %878, %881, !dbg !59 + %883 = bitcast float %882 to i32, !dbg !57 + %884 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %883, i32 2, i32 31), !dbg !57 + %885 = bitcast i32 %884 to float, !dbg !57 + %886 = fadd float %882, %885, !dbg !59 + %887 = bitcast float %886 to i32, !dbg !57 + %888 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %887, i32 1, i32 31), !dbg !57 + %889 = bitcast i32 %888 to float, !dbg !57 + %890 = fadd float %886, %889, !dbg !59 + %891 = bitcast float %890 to <1 x i32>, !dbg !57 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %516, <1 x i32> %891, i1 %515) #6, !dbg !57 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !57 + %892 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %518, i1 %517) #6, !dbg !57 + %893 = bitcast i32 %892 to float, !dbg !57 + %894 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %892, i32 16, i32 31), !dbg !57 + %895 = bitcast i32 %894 to float, !dbg !57 + %896 = fadd float %893, %895, !dbg !59 + %897 = bitcast float %896 to i32, !dbg !57 + %898 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %897, i32 8, i32 31), !dbg !57 + %899 = bitcast i32 %898 to float, !dbg !57 + %900 = fadd float %896, %899, !dbg !59 + %901 = bitcast float %900 to i32, !dbg !57 + %902 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %901, i32 4, i32 31), !dbg !57 + %903 = bitcast i32 %902 to float, !dbg !57 + %904 = fadd float %900, %903, !dbg !59 + %905 = bitcast float %904 to i32, !dbg !57 + %906 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %905, i32 2, i32 31), !dbg !57 + %907 = bitcast i32 %906 to float, !dbg !57 + %908 = fadd float %904, %907, !dbg !59 + %909 = bitcast float %908 to i32, !dbg !57 + %910 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %909, i32 1, i32 31), !dbg !57 + %911 = bitcast i32 %910 to float, !dbg !57 + %912 = fadd float %908, %911, !dbg !59 + %913 = bitcast float %912 to <1 x i32>, !dbg !57 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %518, <1 x i32> %913, i1 %519) #6, !dbg !57 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !57 + %914 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !57 + %915 = fadd float %521, %914, !dbg !60 + %916 = add i32 %522, 32768, !dbg !45 + %917 = icmp slt i32 %916, %8, !dbg !45 + br i1 %917, label %520, label %._crit_edge7, !dbg !45 + +._crit_edge7: ; preds = %520, %__nv_logf.exit + %.lcssa = phi float [ 0.000000e+00, %__nv_logf.exit ], [ %915, %520 ], !dbg !61 + %918 = getelementptr float, ptr addrspace(1) %5, i64 %13, !dbg !62 + %919 = getelementptr float, ptr addrspace(1) %6, i64 %13, !dbg !63 + %920 = getelementptr float, ptr addrspace(1) %7, i64 %13, !dbg !64 + %921 = fsub float 0.000000e+00, %.lcssa, !dbg !65 + %922 = icmp eq i32 %23, 0, !dbg !66 + %923 = bitcast float %921 to i32, !dbg !66 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %923, ptr addrspace(1) %918, i1 %922) #6, !dbg !66 + %924 = bitcast float %.lcssa4 to i32, !dbg !67 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %924, ptr addrspace(1) %919, i1 %922) #6, !dbg !67 + %925 = bitcast float %.lcssa3 to i32, !dbg !68 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %925, ptr addrspace(1) %920, i1 %922) #6, !dbg !68 + br label %common.ret, !dbg !69 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +attributes #0 = { nounwind "nvvm.reqntid"="1024" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "loss.py", directory: "/workspace/hanrui/SpecForge-ext/specforge/core") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "log_softmax_forward_kernel", linkageName: "log_softmax_forward_kernel", scope: !1, file: !1, line: 50, type: !6, scopeLine: 50, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 64, column: 31, scope: !5) +!9 = !DILocation(line: 64, column: 37, scope: !5) +!10 = !DILocation(line: 65, column: 31, scope: !5) +!11 = !DILocation(line: 65, column: 18, scope: !5) +!12 = !DILocation(line: 66, column: 31, scope: !5) +!13 = !DILocation(line: 66, column: 18, scope: !5) +!14 = !DILocation(line: 67, column: 25, scope: !5) +!15 = !DILocation(line: 68, column: 28, scope: !5) +!16 = !DILocation(line: 69, column: 24, scope: !5) +!17 = !DILocation(line: 0, scope: !5) +!18 = !DILocation(line: 76, column: 35, scope: !5) +!19 = !DILocation(line: 75, column: 30, scope: !5) +!20 = !DILocation(line: 76, column: 22, scope: !5) +!21 = !DILocation(line: 77, column: 25, scope: !5) +!22 = !DILocation(line: 79, column: 25, scope: !5) +!23 = !DILocation(line: 79, column: 12, scope: !5) +!24 = !DILocation(line: 80, column: 15, scope: !5) +!25 = !DILocation(line: 81, column: 56, scope: !5) +!26 = !DILocation(line: 189, column: 40, scope: !27, inlinedAt: !29) +!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!29 = !DILocation(line: 81, column: 27, scope: !5) +!30 = !DILocation(line: 168, column: 27, scope: !27, inlinedAt: !29) +!31 = !DILocation(line: 82, column: 30, scope: !5) +!32 = !DILocation(line: 83, column: 27, scope: !5) +!33 = !DILocation(line: 83, column: 23, scope: !5) +!34 = !DILocation(line: 83, column: 16, scope: !5) +!35 = !DILocation(line: 84, column: 49, scope: !5) +!36 = !DILocation(line: 84, column: 34, scope: !5) +!37 = !DILocation(line: 84, column: 57, scope: !5) +!38 = !DILocation(line: 291, column: 36, scope: !27, inlinedAt: !39) +!39 = !DILocation(line: 84, column: 12, scope: !5) +!40 = !DILocation(line: 261, column: 15, scope: !27, inlinedAt: !39) +!41 = !DILocation(line: 83, column: 36, scope: !5) +!42 = !DILocation(line: 72, column: 8, scope: !5) +!43 = !DILocation(line: 73, column: 8, scope: !5) +!44 = !DILocation(line: 100, column: 32, scope: !5) +!45 = !DILocation(line: 89, column: 30, scope: !5) +!46 = !DILocation(line: 90, column: 22, scope: !5) +!47 = !DILocation(line: 91, column: 25, scope: !5) +!48 = !DILocation(line: 92, column: 44, scope: !5) +!49 = !DILocation(line: 92, column: 31, scope: !5) +!50 = !DILocation(line: 93, column: 12, scope: !5) +!51 = !DILocation(line: 95, column: 44, scope: !5) +!52 = !DILocation(line: 95, column: 31, scope: !5) +!53 = !DILocation(line: 99, column: 43, scope: !5) +!54 = !DILocation(line: 101, column: 49, scope: !5) +!55 = !DILocation(line: 102, column: 43, scope: !5) +!56 = !DILocation(line: 103, column: 57, scope: !5) +!57 = !DILocation(line: 291, column: 36, scope: !27, inlinedAt: !58) +!58 = !DILocation(line: 103, column: 23, scope: !5) +!59 = !DILocation(line: 261, column: 15, scope: !27, inlinedAt: !58) +!60 = !DILocation(line: 103, column: 16, scope: !5) +!61 = !DILocation(line: 88, column: 11, scope: !5) +!62 = !DILocation(line: 105, column: 16, scope: !5) +!63 = !DILocation(line: 106, column: 13, scope: !5) +!64 = !DILocation(line: 107, column: 13, scope: !5) +!65 = !DILocation(line: 108, column: 24, scope: !5) +!66 = !DILocation(line: 108, column: 23, scope: !5) +!67 = !DILocation(line: 109, column: 20, scope: !5) +!68 = !DILocation(line: 110, column: 20, scope: !5) +!69 = !DILocation(line: 110, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f79bc459a9d3f5107fda23231c5ad4f0be469391 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ptx @@ -0,0 +1,1190 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl log_softmax_forward_kernel // -- Begin function log_softmax_forward_kernel +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @log_softmax_forward_kernel +.visible .entry log_softmax_forward_kernel( + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_0, + .param .u32 log_softmax_forward_kernel_param_1, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_2, + .param .u32 log_softmax_forward_kernel_param_3, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_4, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_5, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_6, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_7, + .param .u32 log_softmax_forward_kernel_param_8, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_9, + .param .u64 .ptr .global .align 1 log_softmax_forward_kernel_param_10 +) +.reqntid 1024 +{ + .reg .pred %p<41>; + .reg .b16 %rs<67>; + .reg .b32 %r<727>; + .reg .b64 %rd<50>; +$L__func_begin0: + +// %bb.0: + .loc 1 64 31 // loss.py:64:31 + mov.u32 %r28, %ctaid.x; + .loc 1 64 37 // loss.py:64:37 + cvt.u64.u32 %rd1, %r28; + ld.param.b64 %rd13, [log_softmax_forward_kernel_param_4]; + .loc 1 67 25 // loss.py:67:25 + add.s64 %rd7, %rd13, %rd1; + .loc 1 68 28 // loss.py:68:28 + // begin inline asm + mov.u16 %rs1, 0x0; + ld.global.b8 { %rs1 }, [ %rd7 + 0 ]; + // end inline asm + and.b16 %rs2, %rs1, 255; + setp.ne.b16 %p1, %rs2, 0; + .loc 1 69 24 // loss.py:69:24 + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 24 // loss.py:0:24 + ld.param.b32 %r27, [log_softmax_forward_kernel_param_8]; + ld.param.b64 %rd8, [log_softmax_forward_kernel_param_0]; + ld.param.s32 %rd9, [log_softmax_forward_kernel_param_1]; + mul.lo.s64 %rd12, %rd9, %rd1; + shl.b64 %rd14, %rd12, 1; + add.s64 %rd2, %rd8, %rd14; + .loc 1 76 35 // loss.py:76:35 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shr.u32 %r3, %r1, 5; + shl.b32 %r4, %r1, 3; + .loc 1 75 30 // loss.py:75:30 + setp.lt.s32 %p2, %r27, 1; + mov.b32 %r722, 0f00000000; + mov.b32 %r721, 0fFF800000; + shl.b32 %r715, %r3, 2; + mov.b32 %r716, global_smem; + shl.b32 %r717, %r1, 2; + setp.eq.b32 %p34, %r1, 0; + setp.lt.u32 %p8, %r1, 32; + setp.eq.b32 %p7, %r2, 0; + cvt.u64.u32 %rd49, %r4; + @%p2 bra $L__BB0_5; +// %bb.3: // %.lr.ph + .loc 1 0 30 // loss.py:0:30 + add.s32 %r69, %r716, %r715; + add.s32 %r78, %r716, %r717; + mov.b32 %r720, 0; + mov.b32 %r719, 0fFF800000; + mov.b32 %r722, 0f00000000; +$L__BB0_4: // =>This Inner Loop Header: Depth=1 + .loc 1 76 22 // loss.py:76:22 + add.s32 %r81, %r4, %r720; + add.s32 %r82, %r81, 8192; + add.s32 %r83, %r81, 16384; + .loc 1 77 25 // loss.py:77:25 + add.s32 %r84, %r81, 24576; + setp.lt.s32 %p3, %r81, %r27; + setp.lt.s32 %p4, %r82, %r27; + setp.lt.s32 %p5, %r83, %r27; + setp.lt.s32 %p6, %r84, %r27; + .loc 1 79 25 // loss.py:79:25 + mad.wide.s32 %rd17, %r81, 2, %rd2; + cvt.s64.s32 %rd21, %r720; + add.s64 %rd23, %rd21, %rd49; + shl.b64 %rd24, %rd23, 1; + add.s64 %rd25, %rd2, %rd24; + add.s64 %rd18, %rd25, 16384; + add.s64 %rd19, %rd25, 32768; + add.s64 %rd20, %rd25, 49152; + mov.b32 %r41, -8323200; + .loc 1 79 12 // loss.py:79:12 + // begin inline asm + mov.u32 %r37, %r41; + mov.u32 %r38, %r41; + mov.u32 %r39, %r41; + mov.u32 %r40, %r41; + @%p3 ld.global.v4.b32 { %r37, %r38, %r39, %r40 }, [ %rd17 + 0 ]; + // end inline asm + mov.b32 {%rs3, %rs4}, %r37; + mov.b32 {%rs5, %rs6}, %r38; + mov.b32 {%rs7, %rs8}, %r39; + mov.b32 {%rs9, %rs10}, %r40; + // begin inline asm + mov.u32 %r45, %r41; + mov.u32 %r46, %r41; + mov.u32 %r47, %r41; + mov.u32 %r48, %r41; + @%p4 ld.global.v4.b32 { %r45, %r46, %r47, %r48 }, [ %rd18 + 0 ]; + // end inline asm + mov.b32 {%rs11, %rs12}, %r45; + mov.b32 {%rs13, %rs14}, %r46; + mov.b32 {%rs15, %rs16}, %r47; + mov.b32 {%rs17, %rs18}, %r48; + // begin inline asm + mov.u32 %r53, %r41; + mov.u32 %r54, %r41; + mov.u32 %r55, %r41; + mov.u32 %r56, %r41; + @%p5 ld.global.v4.b32 { %r53, %r54, %r55, %r56 }, [ %rd19 + 0 ]; + // end inline asm + mov.b32 {%rs19, %rs20}, %r53; + mov.b32 {%rs21, %rs22}, %r54; + mov.b32 {%rs23, %rs24}, %r55; + mov.b32 {%rs25, %rs26}, %r56; + // begin inline asm + mov.u32 %r61, %r41; + mov.u32 %r62, %r41; + mov.u32 %r63, %r41; + mov.u32 %r64, %r41; + @%p6 ld.global.v4.b32 { %r61, %r62, %r63, %r64 }, [ %rd20 + 0 ]; + // end inline asm + mov.b32 {%rs27, %rs28}, %r61; + mov.b32 {%rs29, %rs30}, %r62; + mov.b32 {%rs31, %rs32}, %r63; + mov.b32 {%rs33, %rs34}, %r64; + .loc 1 80 15 // loss.py:80:15 + cvt.f32.bf16 %r85, %rs3; + cvt.f32.bf16 %r86, %rs4; + cvt.f32.bf16 %r87, %rs5; + cvt.f32.bf16 %r88, %rs6; + cvt.f32.bf16 %r89, %rs7; + cvt.f32.bf16 %r90, %rs8; + cvt.f32.bf16 %r91, %rs9; + cvt.f32.bf16 %r92, %rs10; + cvt.f32.bf16 %r93, %rs11; + cvt.f32.bf16 %r94, %rs12; + cvt.f32.bf16 %r95, %rs13; + cvt.f32.bf16 %r96, %rs14; + cvt.f32.bf16 %r97, %rs15; + cvt.f32.bf16 %r98, %rs16; + cvt.f32.bf16 %r99, %rs17; + cvt.f32.bf16 %r100, %rs18; + cvt.f32.bf16 %r101, %rs19; + cvt.f32.bf16 %r102, %rs20; + cvt.f32.bf16 %r103, %rs21; + cvt.f32.bf16 %r104, %rs22; + cvt.f32.bf16 %r105, %rs23; + cvt.f32.bf16 %r106, %rs24; + cvt.f32.bf16 %r107, %rs25; + cvt.f32.bf16 %r108, %rs26; + cvt.f32.bf16 %r109, %rs27; + cvt.f32.bf16 %r110, %rs28; + cvt.f32.bf16 %r111, %rs29; + cvt.f32.bf16 %r112, %rs30; + cvt.f32.bf16 %r113, %rs31; + cvt.f32.bf16 %r114, %rs32; + cvt.f32.bf16 %r115, %rs33; + cvt.f32.bf16 %r116, %rs34; + .loc 1 81 56 // loss.py:81:56 + selp.f32 %r117, %r85, 0fFF800000, %p3; + selp.f32 %r118, %r86, 0fFF800000, %p3; + selp.f32 %r119, %r87, 0fFF800000, %p3; + selp.f32 %r120, %r88, 0fFF800000, %p3; + selp.f32 %r121, %r89, 0fFF800000, %p3; + selp.f32 %r122, %r90, 0fFF800000, %p3; + selp.f32 %r123, %r91, 0fFF800000, %p3; + selp.f32 %r124, %r92, 0fFF800000, %p3; + selp.f32 %r125, %r93, 0fFF800000, %p4; + selp.f32 %r126, %r94, 0fFF800000, %p4; + selp.f32 %r127, %r95, 0fFF800000, %p4; + selp.f32 %r128, %r96, 0fFF800000, %p4; + selp.f32 %r129, %r97, 0fFF800000, %p4; + selp.f32 %r130, %r98, 0fFF800000, %p4; + selp.f32 %r131, %r99, 0fFF800000, %p4; + selp.f32 %r132, %r100, 0fFF800000, %p4; + selp.f32 %r133, %r101, 0fFF800000, %p5; + selp.f32 %r134, %r102, 0fFF800000, %p5; + selp.f32 %r135, %r103, 0fFF800000, %p5; + selp.f32 %r136, %r104, 0fFF800000, %p5; + selp.f32 %r137, %r105, 0fFF800000, %p5; + selp.f32 %r138, %r106, 0fFF800000, %p5; + selp.f32 %r139, %r107, 0fFF800000, %p5; + selp.f32 %r140, %r108, 0fFF800000, %p5; + selp.f32 %r141, %r109, 0fFF800000, %p6; + selp.f32 %r142, %r110, 0fFF800000, %p6; + selp.f32 %r143, %r111, 0fFF800000, %p6; + selp.f32 %r144, %r112, 0fFF800000, %p6; + selp.f32 %r145, %r113, 0fFF800000, %p6; + selp.f32 %r146, %r114, 0fFF800000, %p6; + selp.f32 %r147, %r115, 0fFF800000, %p6; + selp.f32 %r148, %r116, 0fFF800000, %p6; +$L__tmp0: + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + bar.sync 0; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r149, %r117, %r118; + max.f32 %r150, %r149, %r119; + max.f32 %r151, %r150, %r120; + max.f32 %r152, %r151, %r121; + max.f32 %r153, %r152, %r122; + max.f32 %r154, %r153, %r123; + max.f32 %r155, %r154, %r124; + max.f32 %r156, %r155, %r125; + max.f32 %r157, %r156, %r126; + max.f32 %r158, %r157, %r127; + max.f32 %r159, %r158, %r128; + max.f32 %r160, %r159, %r129; + max.f32 %r161, %r160, %r130; + max.f32 %r162, %r161, %r131; + max.f32 %r163, %r162, %r132; + max.f32 %r164, %r163, %r133; + max.f32 %r165, %r164, %r134; + max.f32 %r166, %r165, %r135; + max.f32 %r167, %r166, %r136; + max.f32 %r168, %r167, %r137; + max.f32 %r169, %r168, %r138; + max.f32 %r170, %r169, %r139; + max.f32 %r171, %r170, %r140; + max.f32 %r172, %r171, %r141; + max.f32 %r173, %r172, %r142; + max.f32 %r174, %r173, %r143; + max.f32 %r175, %r174, %r144; + max.f32 %r176, %r175, %r145; + max.f32 %r177, %r176, %r146; + max.f32 %r178, %r177, %r147; + max.f32 %r179, %r178, %r148; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r180, %r179, 16, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r181, %r179, %r180; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r182, %r181, 8, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r183, %r181, %r182; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r184, %r183, 4, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r185, %r183, %r184; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r186, %r185, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r187, %r185, %r186; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r188, %r187, 1, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r70, %r187, %r188; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + // begin inline asm + @%p7 st.shared.b32 [ %r69 + 0 ], %r70; + // end inline asm + bar.sync 0; + // begin inline asm + @%p8 ld.shared.b32 %r71, [ %r78 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r189, %r71, 16, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r190, %r71, %r189; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r191, %r190, 8, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r192, %r190, %r191; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r193, %r192, 4, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r194, %r192, %r193; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r195, %r194, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r196, %r194, %r195; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + shfl.sync.bfly.b32 %r197, %r196, 1, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ loss.py:81:27 ] + max.f32 %r74, %r196, %r197; + .loc 2 189 40 // standard.py:189:40 @[ loss.py:81:27 ] + // begin inline asm + @%p34 st.shared.b32 [ %r78 + 0 ], %r74; + // end inline asm + bar.sync 0; + ld.shared.b32 %r198, [global_smem]; +$L__tmp1: + .loc 1 82 30 // loss.py:82:30 + max.f32 %r721, %r719, %r198; + .loc 1 83 27 // loss.py:83:27 + sub.f32 %r199, %r719, %r721; + .loc 1 83 23 // loss.py:83:23 + mul.f32 %r200, %r199, 0f3FB8AA3B; + ex2.approx.f32 %r201, %r200; + .loc 1 84 49 // loss.py:84:49 + sub.f32 %r202, %r85, %r721; + sub.f32 %r203, %r86, %r721; + sub.f32 %r204, %r87, %r721; + sub.f32 %r205, %r88, %r721; + sub.f32 %r206, %r89, %r721; + sub.f32 %r207, %r90, %r721; + sub.f32 %r208, %r91, %r721; + sub.f32 %r209, %r92, %r721; + sub.f32 %r210, %r93, %r721; + sub.f32 %r211, %r94, %r721; + sub.f32 %r212, %r95, %r721; + sub.f32 %r213, %r96, %r721; + sub.f32 %r214, %r97, %r721; + sub.f32 %r215, %r98, %r721; + sub.f32 %r216, %r99, %r721; + sub.f32 %r217, %r100, %r721; + sub.f32 %r218, %r101, %r721; + sub.f32 %r219, %r102, %r721; + sub.f32 %r220, %r103, %r721; + sub.f32 %r221, %r104, %r721; + sub.f32 %r222, %r105, %r721; + sub.f32 %r223, %r106, %r721; + sub.f32 %r224, %r107, %r721; + sub.f32 %r225, %r108, %r721; + sub.f32 %r226, %r109, %r721; + sub.f32 %r227, %r110, %r721; + sub.f32 %r228, %r111, %r721; + sub.f32 %r229, %r112, %r721; + sub.f32 %r230, %r113, %r721; + sub.f32 %r231, %r114, %r721; + sub.f32 %r232, %r115, %r721; + sub.f32 %r233, %r116, %r721; + .loc 1 84 34 // loss.py:84:34 + mul.f32 %r234, %r202, 0f3FB8AA3B; + ex2.approx.f32 %r235, %r234; + mul.f32 %r236, %r203, 0f3FB8AA3B; + ex2.approx.f32 %r237, %r236; + mul.f32 %r238, %r204, 0f3FB8AA3B; + ex2.approx.f32 %r239, %r238; + mul.f32 %r240, %r205, 0f3FB8AA3B; + ex2.approx.f32 %r241, %r240; + mul.f32 %r242, %r206, 0f3FB8AA3B; + ex2.approx.f32 %r243, %r242; + mul.f32 %r244, %r207, 0f3FB8AA3B; + ex2.approx.f32 %r245, %r244; + mul.f32 %r246, %r208, 0f3FB8AA3B; + ex2.approx.f32 %r247, %r246; + mul.f32 %r248, %r209, 0f3FB8AA3B; + ex2.approx.f32 %r249, %r248; + mul.f32 %r250, %r210, 0f3FB8AA3B; + ex2.approx.f32 %r251, %r250; + mul.f32 %r252, %r211, 0f3FB8AA3B; + ex2.approx.f32 %r253, %r252; + mul.f32 %r254, %r212, 0f3FB8AA3B; + ex2.approx.f32 %r255, %r254; + mul.f32 %r256, %r213, 0f3FB8AA3B; + ex2.approx.f32 %r257, %r256; + mul.f32 %r258, %r214, 0f3FB8AA3B; + ex2.approx.f32 %r259, %r258; + mul.f32 %r260, %r215, 0f3FB8AA3B; + ex2.approx.f32 %r261, %r260; + mul.f32 %r262, %r216, 0f3FB8AA3B; + ex2.approx.f32 %r263, %r262; + mul.f32 %r264, %r217, 0f3FB8AA3B; + ex2.approx.f32 %r265, %r264; + mul.f32 %r266, %r218, 0f3FB8AA3B; + ex2.approx.f32 %r267, %r266; + mul.f32 %r268, %r219, 0f3FB8AA3B; + ex2.approx.f32 %r269, %r268; + mul.f32 %r270, %r220, 0f3FB8AA3B; + ex2.approx.f32 %r271, %r270; + mul.f32 %r272, %r221, 0f3FB8AA3B; + ex2.approx.f32 %r273, %r272; + mul.f32 %r274, %r222, 0f3FB8AA3B; + ex2.approx.f32 %r275, %r274; + mul.f32 %r276, %r223, 0f3FB8AA3B; + ex2.approx.f32 %r277, %r276; + mul.f32 %r278, %r224, 0f3FB8AA3B; + ex2.approx.f32 %r279, %r278; + mul.f32 %r280, %r225, 0f3FB8AA3B; + ex2.approx.f32 %r281, %r280; + mul.f32 %r282, %r226, 0f3FB8AA3B; + ex2.approx.f32 %r283, %r282; + mul.f32 %r284, %r227, 0f3FB8AA3B; + ex2.approx.f32 %r285, %r284; + mul.f32 %r286, %r228, 0f3FB8AA3B; + ex2.approx.f32 %r287, %r286; + mul.f32 %r288, %r229, 0f3FB8AA3B; + ex2.approx.f32 %r289, %r288; + mul.f32 %r290, %r230, 0f3FB8AA3B; + ex2.approx.f32 %r291, %r290; + mul.f32 %r292, %r231, 0f3FB8AA3B; + ex2.approx.f32 %r293, %r292; + mul.f32 %r294, %r232, 0f3FB8AA3B; + ex2.approx.f32 %r295, %r294; + mul.f32 %r296, %r233, 0f3FB8AA3B; + ex2.approx.f32 %r297, %r296; + .loc 1 84 57 // loss.py:84:57 + selp.f32 %r298, %r251, 0f00000000, %p4; + selp.f32 %r299, %r253, 0f00000000, %p4; + selp.f32 %r300, %r255, 0f00000000, %p4; + selp.f32 %r301, %r257, 0f00000000, %p4; + selp.f32 %r302, %r259, 0f00000000, %p4; + selp.f32 %r303, %r261, 0f00000000, %p4; + selp.f32 %r304, %r263, 0f00000000, %p4; + selp.f32 %r305, %r265, 0f00000000, %p4; + selp.f32 %r306, %r267, 0f00000000, %p5; + selp.f32 %r307, %r269, 0f00000000, %p5; + selp.f32 %r308, %r271, 0f00000000, %p5; + selp.f32 %r309, %r273, 0f00000000, %p5; + selp.f32 %r310, %r275, 0f00000000, %p5; + selp.f32 %r311, %r277, 0f00000000, %p5; + selp.f32 %r312, %r279, 0f00000000, %p5; + selp.f32 %r313, %r281, 0f00000000, %p5; + selp.f32 %r314, %r283, 0f00000000, %p6; + selp.f32 %r315, %r285, 0f00000000, %p6; + selp.f32 %r316, %r287, 0f00000000, %p6; + selp.f32 %r317, %r289, 0f00000000, %p6; + selp.f32 %r318, %r291, 0f00000000, %p6; + selp.f32 %r319, %r293, 0f00000000, %p6; + selp.f32 %r320, %r295, 0f00000000, %p6; + selp.f32 %r321, %r297, 0f00000000, %p6; +$L__tmp2: + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r322, %r235, %r237; + add.f32 %r323, %r322, %r239; + add.f32 %r324, %r323, %r241; + add.f32 %r325, %r324, %r243; + add.f32 %r326, %r325, %r245; + add.f32 %r327, %r326, %r247; + add.f32 %r328, %r327, %r249; + selp.f32 %r329, %r328, 0f00000000, %p3; + add.f32 %r330, %r329, %r298; + add.f32 %r331, %r330, %r299; + add.f32 %r332, %r331, %r300; + add.f32 %r333, %r332, %r301; + add.f32 %r334, %r333, %r302; + add.f32 %r335, %r334, %r303; + add.f32 %r336, %r335, %r304; + add.f32 %r337, %r336, %r305; + add.f32 %r338, %r337, %r306; + add.f32 %r339, %r338, %r307; + add.f32 %r340, %r339, %r308; + add.f32 %r341, %r340, %r309; + add.f32 %r342, %r341, %r310; + add.f32 %r343, %r342, %r311; + add.f32 %r344, %r343, %r312; + add.f32 %r345, %r344, %r313; + add.f32 %r346, %r345, %r314; + add.f32 %r347, %r346, %r315; + add.f32 %r348, %r347, %r316; + add.f32 %r349, %r348, %r317; + add.f32 %r350, %r349, %r318; + add.f32 %r351, %r350, %r319; + add.f32 %r352, %r351, %r320; + add.f32 %r353, %r352, %r321; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r354, %r353, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r355, %r353, %r354; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r356, %r355, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r357, %r355, %r356; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r358, %r357, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r359, %r357, %r358; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r360, %r359, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r361, %r359, %r360; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r362, %r361, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r76, %r361, %r362; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + // begin inline asm + @%p7 st.shared.b32 [ %r69 + 0 ], %r76; + // end inline asm + bar.sync 0; + // begin inline asm + @%p8 ld.shared.b32 %r77, [ %r78 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r363, %r77, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r364, %r77, %r363; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r365, %r364, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r366, %r364, %r365; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r367, %r366, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r368, %r366, %r367; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r369, %r368, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r370, %r368, %r369; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + shfl.sync.bfly.b32 %r371, %r370, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:84:12 ] + add.f32 %r80, %r370, %r371; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:84:12 ] + // begin inline asm + @%p34 st.shared.b32 [ %r78 + 0 ], %r80; + // end inline asm + bar.sync 0; + ld.shared.b32 %r372, [global_smem]; +$L__tmp3: + .loc 1 83 36 // loss.py:83:36 + fma.rn.f32 %r722, %r722, %r201, %r372; + .loc 1 75 30 // loss.py:75:30 + add.s32 %r720, %r720, 32768; + setp.lt.s32 %p13, %r720, %r27; + mov.b32 %r719, %r721; + @%p13 bra $L__BB0_4; +$L__BB0_5: // %._crit_edge + .loc 1 100 32 // loss.py:100:32 + setp.lt.f32 %p14, %r722, 0f00800000; + mul.f32 %r373, %r722, 0f4B000000; + selp.f32 %r15, %r373, %r722, %p14; + selp.f32 %r374, 0fC1B80000, 0f00000000, %p14; + add.s32 %r375, %r15, -1059760811; + and.b32 %r376, %r375, -8388608; + sub.s32 %r377, %r15, %r376; + cvt.rn.f32.s32 %r378, %r376; + mov.b32 %r379, 0f34000000; + fma.rn.ftz.f32 %r380, %r378, %r379, %r374; + add.f32 %r381, %r377, 0fBF800000; + mov.b32 %r382, 0f3E1039F6; + mov.b32 %r383, 0fBE055027; + fma.rn.ftz.f32 %r384, %r383, %r381, %r382; + mov.b32 %r385, 0fBDF8CDCC; + fma.rn.ftz.f32 %r386, %r384, %r381, %r385; + mov.b32 %r387, 0f3E0F2955; + fma.rn.ftz.f32 %r388, %r386, %r381, %r387; + mov.b32 %r389, 0fBE2AD8B9; + fma.rn.ftz.f32 %r390, %r388, %r381, %r389; + mov.b32 %r391, 0f3E4CED0B; + fma.rn.ftz.f32 %r392, %r390, %r381, %r391; + mov.b32 %r393, 0fBE7FFF22; + fma.rn.ftz.f32 %r394, %r392, %r381, %r393; + mov.b32 %r395, 0f3EAAAA78; + fma.rn.ftz.f32 %r396, %r394, %r381, %r395; + mov.b32 %r397, 0fBF000000; + fma.rn.ftz.f32 %r398, %r396, %r381, %r397; + mul.f32 %r399, %r381, %r398; + fma.rn.ftz.f32 %r400, %r399, %r381, %r381; + mov.b32 %r401, 0f3F317218; + fma.rn.ftz.f32 %r723, %r380, %r401, %r400; + setp.lt.u32 %p15, %r15, 2139095040; + @%p15 bra $L__BB0_7; +// %bb.6: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 32 // loss.py:0:32 + mov.b32 %r402, 0f7F800000; + fma.rn.ftz.f32 %r723, %r15, %r402, %r402; +$L__BB0_7: // %__nv_logf.exit + ld.param.b64 %rd6, [log_softmax_forward_kernel_param_7]; + ld.param.b64 %rd5, [log_softmax_forward_kernel_param_6]; + ld.param.b64 %rd4, [log_softmax_forward_kernel_param_5]; + mov.b32 %r403, 0f00000000; + mov.b32 %r726, %r403; + .loc 1 89 30 // loss.py:89:30 + @%p2 bra $L__BB0_10; +// %bb.8: // %.lr.ph6 + .loc 1 0 30 // loss.py:0:30 + ld.param.s32 %rd11, [log_softmax_forward_kernel_param_3]; + mul.lo.s64 %rd15, %rd11, %rd1; + ld.param.b64 %rd10, [log_softmax_forward_kernel_param_2]; + shl.b64 %rd16, %rd15, 2; + add.s64 %rd3, %rd10, %rd16; + setp.eq.f32 %p17, %r15, 0f00000000; + selp.f32 %r19, 0fFF800000, %r723, %p17; + add.s32 %r505, %r716, %r715; + add.s32 %r508, %r716, %r717; + mov.b32 %r413, 0; + mov.b32 %r726, 0f00000000; + mov.b32 %r725, %r413; +$L__BB0_9: // =>This Inner Loop Header: Depth=1 + .loc 1 90 22 // loss.py:90:22 + add.s32 %r511, %r4, %r725; + add.s32 %r512, %r511, 8192; + add.s32 %r513, %r511, 16384; + .loc 1 91 25 // loss.py:91:25 + add.s32 %r514, %r511, 24576; + setp.lt.s32 %p18, %r511, %r27; + setp.lt.s32 %p19, %r512, %r27; + setp.lt.s32 %p20, %r513, %r27; + setp.lt.s32 %p21, %r514, %r27; + .loc 1 92 44 // loss.py:92:44 + mad.wide.s32 %rd26, %r511, 2, %rd2; + cvt.s64.s32 %rd38, %r725; + add.s64 %rd40, %rd38, %rd49; + shl.b64 %rd41, %rd40, 1; + add.s64 %rd42, %rd2, %rd41; + add.s64 %rd27, %rd42, 16384; + add.s64 %rd28, %rd42, 32768; + add.s64 %rd29, %rd42, 49152; + .loc 1 92 31 // loss.py:92:31 + // begin inline asm + mov.u32 %r409, %r413; + mov.u32 %r410, %r413; + mov.u32 %r411, %r413; + mov.u32 %r412, %r413; + @%p18 ld.global.v4.b32 { %r409, %r410, %r411, %r412 }, [ %rd26 + 0 ]; + // end inline asm + mov.b32 {%rs35, %rs36}, %r409; + mov.b32 {%rs37, %rs38}, %r410; + mov.b32 {%rs39, %rs40}, %r411; + mov.b32 {%rs41, %rs42}, %r412; + // begin inline asm + mov.u32 %r417, %r413; + mov.u32 %r418, %r413; + mov.u32 %r419, %r413; + mov.u32 %r420, %r413; + @%p19 ld.global.v4.b32 { %r417, %r418, %r419, %r420 }, [ %rd27 + 0 ]; + // end inline asm + mov.b32 {%rs43, %rs44}, %r417; + mov.b32 {%rs45, %rs46}, %r418; + mov.b32 {%rs47, %rs48}, %r419; + mov.b32 {%rs49, %rs50}, %r420; + // begin inline asm + mov.u32 %r425, %r413; + mov.u32 %r426, %r413; + mov.u32 %r427, %r413; + mov.u32 %r428, %r413; + @%p20 ld.global.v4.b32 { %r425, %r426, %r427, %r428 }, [ %rd28 + 0 ]; + // end inline asm + mov.b32 {%rs51, %rs52}, %r425; + mov.b32 {%rs53, %rs54}, %r426; + mov.b32 {%rs55, %rs56}, %r427; + mov.b32 {%rs57, %rs58}, %r428; + // begin inline asm + mov.u32 %r433, %r413; + mov.u32 %r434, %r413; + mov.u32 %r435, %r413; + mov.u32 %r436, %r413; + @%p21 ld.global.v4.b32 { %r433, %r434, %r435, %r436 }, [ %rd29 + 0 ]; + // end inline asm + mov.b32 {%rs59, %rs60}, %r433; + mov.b32 {%rs61, %rs62}, %r434; + mov.b32 {%rs63, %rs64}, %r435; + mov.b32 {%rs65, %rs66}, %r436; + .loc 1 93 12 // loss.py:93:12 + cvt.f32.bf16 %r515, %rs35; + cvt.f32.bf16 %r516, %rs36; + cvt.f32.bf16 %r517, %rs37; + cvt.f32.bf16 %r518, %rs38; + cvt.f32.bf16 %r519, %rs39; + cvt.f32.bf16 %r520, %rs40; + cvt.f32.bf16 %r521, %rs41; + cvt.f32.bf16 %r522, %rs42; + cvt.f32.bf16 %r523, %rs43; + cvt.f32.bf16 %r524, %rs44; + cvt.f32.bf16 %r525, %rs45; + cvt.f32.bf16 %r526, %rs46; + cvt.f32.bf16 %r527, %rs47; + cvt.f32.bf16 %r528, %rs48; + cvt.f32.bf16 %r529, %rs49; + cvt.f32.bf16 %r530, %rs50; + cvt.f32.bf16 %r531, %rs51; + cvt.f32.bf16 %r532, %rs52; + cvt.f32.bf16 %r533, %rs53; + cvt.f32.bf16 %r534, %rs54; + cvt.f32.bf16 %r535, %rs55; + cvt.f32.bf16 %r536, %rs56; + cvt.f32.bf16 %r537, %rs57; + cvt.f32.bf16 %r538, %rs58; + cvt.f32.bf16 %r539, %rs59; + cvt.f32.bf16 %r540, %rs60; + cvt.f32.bf16 %r541, %rs61; + cvt.f32.bf16 %r542, %rs62; + cvt.f32.bf16 %r543, %rs63; + cvt.f32.bf16 %r544, %rs64; + cvt.f32.bf16 %r545, %rs65; + cvt.f32.bf16 %r546, %rs66; + .loc 1 95 44 // loss.py:95:44 + mad.wide.s32 %rd30, %r511, 4, %rd3; + add.s64 %rd31, %rd30, 16; + shl.b64 %rd43, %rd40, 2; + add.s64 %rd44, %rd3, %rd43; + add.s64 %rd32, %rd44, 32768; + add.s64 %rd33, %rd30, 32784; + add.s64 %rd34, %rd44, 65536; + add.s64 %rd35, %rd30, 65552; + add.s64 %rd36, %rd44, 98304; + add.s64 %rd37, %rd30, 98320; + .loc 1 95 31 // loss.py:95:31 + // begin inline asm + mov.u32 %r441, %r413; + mov.u32 %r442, %r413; + mov.u32 %r443, %r413; + mov.u32 %r444, %r413; + @%p18 ld.global.v4.b32 { %r441, %r442, %r443, %r444 }, [ %rd30 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r449, %r413; + mov.u32 %r450, %r413; + mov.u32 %r451, %r413; + mov.u32 %r452, %r413; + @%p18 ld.global.v4.b32 { %r449, %r450, %r451, %r452 }, [ %rd31 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r457, %r413; + mov.u32 %r458, %r413; + mov.u32 %r459, %r413; + mov.u32 %r460, %r413; + @%p19 ld.global.v4.b32 { %r457, %r458, %r459, %r460 }, [ %rd32 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r465, %r413; + mov.u32 %r466, %r413; + mov.u32 %r467, %r413; + mov.u32 %r468, %r413; + @%p19 ld.global.v4.b32 { %r465, %r466, %r467, %r468 }, [ %rd33 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r473, %r413; + mov.u32 %r474, %r413; + mov.u32 %r475, %r413; + mov.u32 %r476, %r413; + @%p20 ld.global.v4.b32 { %r473, %r474, %r475, %r476 }, [ %rd34 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r481, %r413; + mov.u32 %r482, %r413; + mov.u32 %r483, %r413; + mov.u32 %r484, %r413; + @%p20 ld.global.v4.b32 { %r481, %r482, %r483, %r484 }, [ %rd35 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r489, %r413; + mov.u32 %r490, %r413; + mov.u32 %r491, %r413; + mov.u32 %r492, %r413; + @%p21 ld.global.v4.b32 { %r489, %r490, %r491, %r492 }, [ %rd36 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r497, %r413; + mov.u32 %r498, %r413; + mov.u32 %r499, %r413; + mov.u32 %r500, %r413; + @%p21 ld.global.v4.b32 { %r497, %r498, %r499, %r500 }, [ %rd37 + 0 ]; + // end inline asm + .loc 1 99 43 // loss.py:99:43 + sub.f32 %r547, %r515, %r721; + sub.f32 %r548, %r516, %r721; + sub.f32 %r549, %r517, %r721; + sub.f32 %r550, %r518, %r721; + sub.f32 %r551, %r519, %r721; + sub.f32 %r552, %r520, %r721; + sub.f32 %r553, %r521, %r721; + sub.f32 %r554, %r522, %r721; + sub.f32 %r555, %r523, %r721; + sub.f32 %r556, %r524, %r721; + sub.f32 %r557, %r525, %r721; + sub.f32 %r558, %r526, %r721; + sub.f32 %r559, %r527, %r721; + sub.f32 %r560, %r528, %r721; + sub.f32 %r561, %r529, %r721; + sub.f32 %r562, %r530, %r721; + sub.f32 %r563, %r531, %r721; + sub.f32 %r564, %r532, %r721; + sub.f32 %r565, %r533, %r721; + sub.f32 %r566, %r534, %r721; + sub.f32 %r567, %r535, %r721; + sub.f32 %r568, %r536, %r721; + sub.f32 %r569, %r537, %r721; + sub.f32 %r570, %r538, %r721; + sub.f32 %r571, %r539, %r721; + sub.f32 %r572, %r540, %r721; + sub.f32 %r573, %r541, %r721; + sub.f32 %r574, %r542, %r721; + sub.f32 %r575, %r543, %r721; + sub.f32 %r576, %r544, %r721; + sub.f32 %r577, %r545, %r721; + sub.f32 %r578, %r546, %r721; + .loc 1 101 49 // loss.py:101:49 + sub.f32 %r579, %r547, %r19; + sub.f32 %r580, %r548, %r19; + sub.f32 %r581, %r549, %r19; + sub.f32 %r582, %r550, %r19; + sub.f32 %r583, %r551, %r19; + sub.f32 %r584, %r552, %r19; + sub.f32 %r585, %r553, %r19; + sub.f32 %r586, %r554, %r19; + sub.f32 %r587, %r555, %r19; + sub.f32 %r588, %r556, %r19; + sub.f32 %r589, %r557, %r19; + sub.f32 %r590, %r558, %r19; + sub.f32 %r591, %r559, %r19; + sub.f32 %r592, %r560, %r19; + sub.f32 %r593, %r561, %r19; + sub.f32 %r594, %r562, %r19; + sub.f32 %r595, %r563, %r19; + sub.f32 %r596, %r564, %r19; + sub.f32 %r597, %r565, %r19; + sub.f32 %r598, %r566, %r19; + sub.f32 %r599, %r567, %r19; + sub.f32 %r600, %r568, %r19; + sub.f32 %r601, %r569, %r19; + sub.f32 %r602, %r570, %r19; + sub.f32 %r603, %r571, %r19; + sub.f32 %r604, %r572, %r19; + sub.f32 %r605, %r573, %r19; + sub.f32 %r606, %r574, %r19; + sub.f32 %r607, %r575, %r19; + sub.f32 %r608, %r576, %r19; + sub.f32 %r609, %r577, %r19; + sub.f32 %r610, %r578, %r19; + .loc 1 102 43 // loss.py:102:43 + mul.f32 %r611, %r580, %r442; + mul.f32 %r612, %r587, %r457; + mul.f32 %r613, %r588, %r458; + mul.f32 %r614, %r589, %r459; + mul.f32 %r615, %r590, %r460; + mul.f32 %r616, %r591, %r465; + mul.f32 %r617, %r592, %r466; + mul.f32 %r618, %r593, %r467; + mul.f32 %r619, %r594, %r468; + mul.f32 %r620, %r595, %r473; + mul.f32 %r621, %r596, %r474; + mul.f32 %r622, %r597, %r475; + mul.f32 %r623, %r598, %r476; + mul.f32 %r624, %r599, %r481; + mul.f32 %r625, %r600, %r482; + mul.f32 %r626, %r601, %r483; + mul.f32 %r627, %r602, %r484; + mul.f32 %r628, %r603, %r489; + mul.f32 %r629, %r604, %r490; + mul.f32 %r630, %r605, %r491; + mul.f32 %r631, %r606, %r492; + mul.f32 %r632, %r607, %r497; + mul.f32 %r633, %r608, %r498; + mul.f32 %r634, %r609, %r499; + mul.f32 %r635, %r610, %r500; + .loc 1 103 57 // loss.py:103:57 + selp.f32 %r636, %r612, 0f00000000, %p19; + selp.f32 %r637, %r613, 0f00000000, %p19; + selp.f32 %r638, %r614, 0f00000000, %p19; + selp.f32 %r639, %r615, 0f00000000, %p19; + selp.f32 %r640, %r616, 0f00000000, %p19; + selp.f32 %r641, %r617, 0f00000000, %p19; + selp.f32 %r642, %r618, 0f00000000, %p19; + selp.f32 %r643, %r619, 0f00000000, %p19; + selp.f32 %r644, %r620, 0f00000000, %p20; + selp.f32 %r645, %r621, 0f00000000, %p20; + selp.f32 %r646, %r622, 0f00000000, %p20; + selp.f32 %r647, %r623, 0f00000000, %p20; + selp.f32 %r648, %r624, 0f00000000, %p20; + selp.f32 %r649, %r625, 0f00000000, %p20; + selp.f32 %r650, %r626, 0f00000000, %p20; + selp.f32 %r651, %r627, 0f00000000, %p20; + selp.f32 %r652, %r628, 0f00000000, %p21; + selp.f32 %r653, %r629, 0f00000000, %p21; + selp.f32 %r654, %r630, 0f00000000, %p21; + selp.f32 %r655, %r631, 0f00000000, %p21; + selp.f32 %r656, %r632, 0f00000000, %p21; + selp.f32 %r657, %r633, 0f00000000, %p21; + selp.f32 %r658, %r634, 0f00000000, %p21; + selp.f32 %r659, %r635, 0f00000000, %p21; +$L__tmp4: + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + fma.rn.f32 %r660, %r579, %r441, %r611; + fma.rn.f32 %r661, %r581, %r443, %r660; + fma.rn.f32 %r662, %r582, %r444, %r661; + fma.rn.f32 %r663, %r583, %r449, %r662; + fma.rn.f32 %r664, %r584, %r450, %r663; + fma.rn.f32 %r665, %r585, %r451, %r664; + fma.rn.f32 %r666, %r586, %r452, %r665; + selp.f32 %r667, %r666, 0f00000000, %p18; + add.f32 %r668, %r636, %r667; + add.f32 %r669, %r637, %r668; + add.f32 %r670, %r638, %r669; + add.f32 %r671, %r639, %r670; + add.f32 %r672, %r640, %r671; + add.f32 %r673, %r641, %r672; + add.f32 %r674, %r642, %r673; + add.f32 %r675, %r643, %r674; + add.f32 %r676, %r644, %r675; + add.f32 %r677, %r645, %r676; + add.f32 %r678, %r646, %r677; + add.f32 %r679, %r647, %r678; + add.f32 %r680, %r648, %r679; + add.f32 %r681, %r649, %r680; + add.f32 %r682, %r650, %r681; + add.f32 %r683, %r651, %r682; + add.f32 %r684, %r652, %r683; + add.f32 %r685, %r653, %r684; + add.f32 %r686, %r654, %r685; + add.f32 %r687, %r655, %r686; + add.f32 %r688, %r656, %r687; + add.f32 %r689, %r657, %r688; + add.f32 %r690, %r658, %r689; + add.f32 %r691, %r659, %r690; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r692, %r691, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r693, %r691, %r692; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r694, %r693, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r695, %r693, %r694; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r696, %r695, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r697, %r695, %r696; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r698, %r697, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r699, %r697, %r698; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r700, %r699, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r506, %r699, %r700; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + // begin inline asm + @%p7 st.shared.b32 [ %r505 + 0 ], %r506; + // end inline asm + bar.sync 0; + // begin inline asm + @%p8 ld.shared.b32 %r507, [ %r508 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r701, %r507, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r702, %r507, %r701; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r703, %r702, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r704, %r702, %r703; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r705, %r704, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r706, %r704, %r705; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r707, %r706, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r708, %r706, %r707; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + shfl.sync.bfly.b32 %r709, %r708, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ loss.py:103:23 ] + add.f32 %r510, %r708, %r709; + .loc 2 291 36 // standard.py:291:36 @[ loss.py:103:23 ] + // begin inline asm + @%p34 st.shared.b32 [ %r508 + 0 ], %r510; + // end inline asm + bar.sync 0; + ld.shared.b32 %r710, [global_smem]; +$L__tmp5: + .loc 1 103 16 // loss.py:103:16 + add.f32 %r726, %r726, %r710; + .loc 1 89 30 // loss.py:89:30 + add.s32 %r725, %r725, 32768; + setp.lt.s32 %p33, %r725, %r27; + @%p33 bra $L__BB0_9; +$L__BB0_10: // %._crit_edge7 + .loc 1 105 16 // loss.py:105:16 + shl.b64 %rd48, %rd1, 2; + add.s64 %rd45, %rd4, %rd48; + .loc 1 106 13 // loss.py:106:13 + add.s64 %rd46, %rd5, %rd48; + .loc 1 107 13 // loss.py:107:13 + add.s64 %rd47, %rd6, %rd48; + .loc 1 108 24 // loss.py:108:24 + sub.f32 %r711, %r403, %r726; + .loc 1 108 23 // loss.py:108:23 + // begin inline asm + @%p34 st.global.b32 [ %rd45 + 0 ], { %r711 }; + // end inline asm + .loc 1 109 20 // loss.py:109:20 + // begin inline asm + @%p34 st.global.b32 [ %rd46 + 0 ], { %r721 }; + // end inline asm + .loc 1 110 20 // loss.py:110:20 + // begin inline asm + @%p34 st.global.b32 [ %rd47 + 0 ], { %r722 }; + // end inline asm +$L__BB0_1: // %common.ret + .loc 1 0 0 // loss.py:0 + ret; +$L__tmp6: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/specforge/core/loss.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 200 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xc1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 108 // DW_AT_name +.b8 111 +.b8 115 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 115 +.b8 112 +.b8 101 +.b8 99 +.b8 102 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 114 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x50:0x1d DW_TAG_subprogram +.b8 108 // DW_AT_name +.b8 111 +.b8 103 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 102 +.b8 111 +.b8 114 +.b8 119 +.b8 97 +.b8 114 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x6d:0x5e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 80 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x82:0x18 DW_TAG_inlined_subroutine +.b32 80 // DW_AT_abstract_origin +.b64 $L__tmp0 // DW_AT_low_pc +.b64 $L__tmp1 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 81 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x9a:0x18 DW_TAG_inlined_subroutine +.b32 80 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 84 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xb2:0x18 DW_TAG_inlined_subroutine +.b32 80 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 103 // DW_AT_call_line +.b8 23 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.source new file mode 100644 index 0000000000000000000000000000000000000000..76834f53f4a32bdde65b5a309b38088b87e68e37 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.source @@ -0,0 +1,324 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":50:0) +#loc12 = loc(unknown) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("logits_ptr"(#loc)) +#loc78 = loc("logits_stride"(#loc)) +#loc79 = loc("target_ptr"(#loc)) +#loc80 = loc("target_stride"(#loc)) +#loc81 = loc("position_mask_ptr"(#loc)) +#loc82 = loc("loss_ptr"(#loc)) +#loc83 = loc("m_ptr"(#loc)) +#loc84 = loc("d_ptr"(#loc)) +#loc85 = loc("n_cols"(#loc)) +#loc136 = loc("input"(#loc61)) +#loc137 = loc("a"(#loc65)) +#loc138 = loc("b"(#loc65)) +#loc139 = loc("input"(#loc69)) +#loc140 = loc("a"(#loc73)) +#loc141 = loc("b"(#loc73)) +module { + tt.func public @log_softmax_forward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %loss_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("loss_ptr"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %program_id = tt.get_program_id x : i32 loc(#loc86) + %program_id_0 = arith.extsi %program_id : i32 to i64 loc(#loc87) + %logits_ptr_1 = arith.extsi %logits_stride : i32 to i64 loc(#loc88) + %logits_ptr_2 = arith.muli %program_id_0, %logits_ptr_1 : i64 loc(#loc88) + %logits_ptr_3 = tt.addptr %logits_ptr, %logits_ptr_2 : !tt.ptr, i64 loc(#loc89) + %target_ptr_4 = arith.extsi %target_stride : i32 to i64 loc(#loc90) + %target_ptr_5 = arith.muli %program_id_0, %target_ptr_4 : i64 loc(#loc90) + %target_ptr_6 = tt.addptr %target_ptr, %target_ptr_5 : !tt.ptr, i64 loc(#loc91) + %position_mask_ptr_7 = arith.constant 1 : i32 loc(#loc92) + %position_mask_ptr_8 = arith.constant 1 : i64 loc(#loc92) + %position_mask_ptr_9 = arith.muli %program_id_0, %position_mask_ptr_8 : i64 loc(#loc92) + %position_mask_ptr_10 = tt.addptr %position_mask_ptr, %position_mask_ptr_9 : !tt.ptr, i64 loc(#loc93) + %position_mask = tt.bitcast %position_mask_ptr_10 : !tt.ptr -> !tt.ptr loc(#loc94) + %position_mask_11 = tt.load %position_mask : !tt.ptr loc(#loc94) + %position_mask_12 = arith.constant 0 : i8 loc(#loc94) + %position_mask_13 = arith.cmpi ne, %position_mask_11, %position_mask_12 : i8 loc(#loc94) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %0 = arith.extui %position_mask_13 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + tt.return loc(#loc11) + ^bb2: // pred: ^bb0 + cf.br ^bb4 loc(#loc12) + ^bb3: // no predecessors + cf.br ^bb4 loc(#loc12) + ^bb4: // 2 preds: ^bb2, ^bb3 + %m = arith.constant 0xFF800000 : f32 loc(#loc95) + %d = arith.constant 0.000000e+00 : f32 loc(#loc96) + %c0_i32_14 = arith.constant 0 : i32 loc(#loc15) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc15) + %2 = arith.bitcast %c0_i32_14 : i32 to i32 loc(#loc15) + %3 = arith.bitcast %n_cols : i32 to i32 loc(#loc15) + %4 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc15) + %5 = ub.poison : i32 loc(#loc15) + %d_15:2 = scf.for %i = %2 to %3 step %4 iter_args(%m_25 = %m, %d_26 = %d) -> (f32, f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc98) + %offsets_27 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc99) + %offsets_28 = arith.extsi %offsets_27 : tensor<32768xi32> to tensor<32768xi64> loc(#loc99) + %offsets_29 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc99) + %offsets_30 = arith.addi %offsets_28, %offsets_29 : tensor<32768xi64> loc(#loc99) + %offsets_31 = arith.constant 2147483647 : i64 loc(#loc99) + %offsets_32 = arith.constant -2147483648 : i64 loc(#loc99) + %offsets_33 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc99) + %offsets_34 = arith.cmpi sle, %offsets_30, %offsets_33 : tensor<32768xi64> loc(#loc99) + %offsets_35 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc99) + %offsets_36 = arith.cmpi sge, %offsets_30, %offsets_35 : tensor<32768xi64> loc(#loc99) + %offsets_37 = arith.andi %offsets_34, %offsets_36 : tensor<32768xi1> loc(#loc99) + %offsets_38 = arith.addi %offsets_27, %offsets : tensor<32768xi32> loc(#loc99) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc100) + %mask_39 = arith.cmpi slt, %offsets_38, %mask : tensor<32768xi32> loc(#loc100) + %logits_block = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc101) + %logits_block_40 = tt.addptr %logits_block, %offsets_38 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc101) + %logits_block_41 = arith.constant 0xFF800000 : f32 loc(#loc102) + %logits_block_42 = arith.constant dense<0xFF800000> : tensor<32768xf32> loc(#loc102) + %logits_block_43 = arith.truncf %logits_block_42 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc102) + %logits_block_44 = tt.load %logits_block_40, %mask_39, %logits_block_43 : tensor<32768x!tt.ptr> loc(#loc102) + %logits_block_45 = arith.extf %logits_block_44 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc103) + %block_max = arith.constant 0xFF800000 : f32 loc(#loc104) + %block_max_46 = arith.constant 0xFF800000 : f32 loc(#loc104) + %block_max_47 = arith.constant dense<0xFF800000> : tensor<32768xf32> loc(#loc104) + %block_max_48 = arith.select %mask_39, %logits_block_45, %block_max_47 : tensor<32768xi1>, tensor<32768xf32> loc(#loc104) + %block_max_49 = tt.call @"triton.language.standard.max__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%block_max_48) : (tensor<32768xf32>) -> f32 loc(#loc105) + %m_new = arith.maxnumf %m_25, %block_max_49 : f32 loc(#loc106) + %d_50 = arith.subf %m_25, %m_new : f32 loc(#loc107) + %d_51 = math.exp %d_50 : f32 loc(#loc108) + %d_52 = arith.mulf %d_26, %d_51 : f32 loc(#loc109) + %d_53 = tt.splat %m_new : f32 -> tensor<32768xf32> loc(#loc110) + %d_54 = arith.subf %logits_block_45, %d_53 : tensor<32768xf32> loc(#loc110) + %d_55 = math.exp %d_54 : tensor<32768xf32> loc(#loc111) + %d_56 = arith.constant 0.000000e+00 : f32 loc(#loc112) + %d_57 = arith.constant 0.000000e+00 : f32 loc(#loc112) + %d_58 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc112) + %d_59 = arith.select %mask_39, %d_55, %d_58 : tensor<32768xi1>, tensor<32768xf32> loc(#loc112) + %d_60 = tt.call @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%d_59) : (tensor<32768xf32>) -> f32 loc(#loc113) + %d_61 = arith.addf %d_52, %d_60 : f32 loc(#loc114) + scf.yield %m_new, %d_61 : f32, f32 loc(#loc33) + } loc(#loc142) + %loss = arith.constant 0.000000e+00 : f32 loc(#loc115) + %c0_i32_16 = arith.constant 0 : i32 loc(#loc35) + %c32768_i32_17 = arith.constant 32768 : i32 loc(#loc35) + %6 = arith.bitcast %c0_i32_16 : i32 to i32 loc(#loc35) + %7 = arith.bitcast %n_cols : i32 to i32 loc(#loc35) + %8 = arith.bitcast %c32768_i32_17 : i32 to i32 loc(#loc35) + %9 = ub.poison : i32 loc(#loc35) + %loss_18 = scf.for %i = %6 to %7 step %8 iter_args(%loss_25 = %loss) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc117) + %offsets_26 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc118) + %offsets_27 = arith.extsi %offsets_26 : tensor<32768xi32> to tensor<32768xi64> loc(#loc118) + %offsets_28 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc118) + %offsets_29 = arith.addi %offsets_27, %offsets_28 : tensor<32768xi64> loc(#loc118) + %offsets_30 = arith.constant 2147483647 : i64 loc(#loc118) + %offsets_31 = arith.constant -2147483648 : i64 loc(#loc118) + %offsets_32 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc118) + %offsets_33 = arith.cmpi sle, %offsets_29, %offsets_32 : tensor<32768xi64> loc(#loc118) + %offsets_34 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc118) + %offsets_35 = arith.cmpi sge, %offsets_29, %offsets_34 : tensor<32768xi64> loc(#loc118) + %offsets_36 = arith.andi %offsets_33, %offsets_35 : tensor<32768xi1> loc(#loc118) + %offsets_37 = arith.addi %offsets_26, %offsets : tensor<32768xi32> loc(#loc118) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc119) + %mask_38 = arith.cmpi slt, %offsets_37, %mask : tensor<32768xi32> loc(#loc119) + %logits_block = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc120) + %logits_block_39 = tt.addptr %logits_block, %offsets_37 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc120) + %logits_block_40 = arith.constant 0.000000e+00 : f32 loc(#loc121) + %logits_block_41 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc121) + %logits_block_42 = arith.truncf %logits_block_41 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc121) + %logits_block_43 = tt.load %logits_block_39, %mask_38, %logits_block_42 : tensor<32768x!tt.ptr> loc(#loc121) + %logits_block_44 = arith.extf %logits_block_43 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc122) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc123) + %target_block_45 = tt.addptr %target_block, %offsets_37 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc123) + %target_block_46 = arith.constant 0.000000e+00 : f32 loc(#loc124) + %target_block_47 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc124) + %target_block_48 = tt.load %target_block_45, %mask_38, %target_block_47 : tensor<32768x!tt.ptr> loc(#loc124) + %normalized_logits = tt.splat %d_15#0 : f32 -> tensor<32768xf32> loc(#loc125) + %normalized_logits_49 = arith.subf %logits_block_44, %normalized_logits : tensor<32768xf32> loc(#loc125) + %log_normalizer = math.log %d_15#1 : f32 loc(#loc126) + %log_softmax_logits = tt.splat %log_normalizer : f32 -> tensor<32768xf32> loc(#loc127) + %log_softmax_logits_50 = arith.subf %normalized_logits_49, %log_softmax_logits : tensor<32768xf32> loc(#loc127) + %weighted_log_prob = arith.mulf %target_block_48, %log_softmax_logits_50 : tensor<32768xf32> loc(#loc128) + %loss_51 = arith.constant 0.000000e+00 : f32 loc(#loc129) + %loss_52 = arith.constant 0.000000e+00 : f32 loc(#loc129) + %loss_53 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc129) + %loss_54 = arith.select %mask_38, %weighted_log_prob, %loss_53 : tensor<32768xi1>, tensor<32768xf32> loc(#loc129) + %loss_55 = tt.call @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%loss_54) : (tensor<32768xf32>) -> f32 loc(#loc130) + %loss_56 = arith.addf %loss_25, %loss_55 : f32 loc(#loc131) + scf.yield %loss_56 : f32 loc(#loc51) + } loc(#loc116) + %loss_ptr_19 = arith.constant 1 : i32 loc(#loc132) + %loss_ptr_20 = arith.constant 1 : i64 loc(#loc132) + %loss_ptr_21 = arith.muli %program_id_0, %loss_ptr_20 : i64 loc(#loc132) + %loss_ptr_22 = tt.addptr %loss_ptr, %loss_ptr_21 : !tt.ptr, i64 loc(#loc133) + %m_ptr_23 = tt.addptr %m_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc134) + %d_ptr_24 = tt.addptr %d_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc135) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc56) + %10 = arith.subf %cst, %loss_18 : f32 loc(#loc56) + tt.store %loss_ptr_22, %10 : !tt.ptr loc(#loc57) + tt.store %m_ptr_23, %d_15#0 : !tt.ptr loc(#loc58) + tt.store %d_ptr_24, %d_15#1 : !tt.ptr loc(#loc59) + tt.return loc(#loc60) + } loc(#loc) + tt.func private @"triton.language.standard.max__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<32768xf32> loc("input"(#loc61))) -> f32 attributes {noinline = false} { + %0 = tt.reshape %input allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc62) + %1 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %3 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc62) + tt.reduce.return %3 : f32 loc(#loc62) + }) : (tensor<32768xf32>) -> f32 loc(#loc62) + tt.return %1 : f32 loc(#loc63) + ^bb1: // no predecessors + %2 = ub.poison : f32 loc(#loc64) + tt.return %2 : f32 loc(#loc64) + } loc(#loc61) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc65)), %b: f32 loc("b"(#loc65))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc66) + tt.return %0 : f32 loc(#loc67) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc68) + tt.return %1 : f32 loc(#loc68) + } loc(#loc65) + tt.func private @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32768xf32> loc("input"(#loc69))) -> f32 attributes {noinline = false} { + %0 = tt.reshape %input allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc70) + %1 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %3 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc70) + tt.reduce.return %3 : f32 loc(#loc70) + }) : (tensor<32768xf32>) -> f32 loc(#loc70) + tt.return %1 : f32 loc(#loc71) + ^bb1: // no predecessors + %2 = ub.poison : f32 loc(#loc72) + tt.return %2 : f32 loc(#loc72) + } loc(#loc69) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:31) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:18) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":67:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":67:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":68:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":69:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":70:8) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":72:8) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":73:8) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":75:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":77:25) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:25) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":80:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:56) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:27) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":82:30) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:27) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:23) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:57) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:12) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":86:8) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":88:11) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":89:30) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":90:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":90:22) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":91:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:44) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":93:12) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:44) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:31) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":99:43) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":100:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":101:49) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":102:43) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:57) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:23) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:16) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:8) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":105:29) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":105:16) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":106:13) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":107:13) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:23) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":109:20) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:20) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc86 = loc("program_id"(#loc1)) +#loc87 = loc("program_id"(#loc2)) +#loc88 = loc("logits_ptr"(#loc3)) +#loc89 = loc("logits_ptr"(#loc4)) +#loc90 = loc("target_ptr"(#loc5)) +#loc91 = loc("target_ptr"(#loc6)) +#loc92 = loc("position_mask_ptr"(#loc7)) +#loc93 = loc("position_mask_ptr"(#loc8)) +#loc94 = loc("position_mask"(#loc9)) +#loc95 = loc("m"(#loc13)) +#loc96 = loc("d"(#loc14)) +#loc97 = loc("m"(#loc15)) +#loc98 = loc("offsets"(#loc16)) +#loc99 = loc("offsets"(#loc17)) +#loc100 = loc("mask"(#loc18)) +#loc101 = loc("logits_block"(#loc19)) +#loc102 = loc("logits_block"(#loc20)) +#loc103 = loc("logits_block"(#loc21)) +#loc104 = loc("block_max"(#loc22)) +#loc105 = loc("block_max"(#loc23)) +#loc106 = loc("m_new"(#loc24)) +#loc107 = loc("d"(#loc25)) +#loc108 = loc("d"(#loc26)) +#loc109 = loc("d"(#loc27)) +#loc110 = loc("d"(#loc28)) +#loc111 = loc("d"(#loc29)) +#loc112 = loc("d"(#loc30)) +#loc113 = loc("d"(#loc31)) +#loc114 = loc("d"(#loc32)) +#loc115 = loc("loss"(#loc34)) +#loc116 = loc("loss"(#loc35)) +#loc117 = loc("offsets"(#loc36)) +#loc118 = loc("offsets"(#loc37)) +#loc119 = loc("mask"(#loc38)) +#loc120 = loc("logits_block"(#loc39)) +#loc121 = loc("logits_block"(#loc40)) +#loc122 = loc("logits_block"(#loc41)) +#loc123 = loc("target_block"(#loc42)) +#loc124 = loc("target_block"(#loc43)) +#loc125 = loc("normalized_logits"(#loc44)) +#loc126 = loc("log_normalizer"(#loc45)) +#loc127 = loc("log_softmax_logits"(#loc46)) +#loc128 = loc("weighted_log_prob"(#loc47)) +#loc129 = loc("loss"(#loc48)) +#loc130 = loc("loss"(#loc49)) +#loc131 = loc("loss"(#loc50)) +#loc132 = loc("loss_ptr"(#loc52)) +#loc133 = loc("loss_ptr"(#loc53)) +#loc134 = loc("m_ptr"(#loc54)) +#loc135 = loc("d_ptr"(#loc55)) +#loc142 = loc("d"(#loc97)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ba9752bff5fbe7a63e3ea3df66e6b14adfd542a9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttgir @@ -0,0 +1,223 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":50:0) +#loc1 = loc(unknown) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:27) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:12) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:23) +#loc59 = loc("logits_ptr"(#loc)) +#loc60 = loc("logits_stride"(#loc)) +#loc61 = loc("target_ptr"(#loc)) +#loc62 = loc("target_stride"(#loc)) +#loc63 = loc("position_mask_ptr"(#loc)) +#loc64 = loc("loss_ptr"(#loc)) +#loc65 = loc("m_ptr"(#loc)) +#loc66 = loc("d_ptr"(#loc)) +#loc67 = loc("n_cols"(#loc)) +#loc84 = loc("block_max"(#loc21)) +#loc92 = loc("d"(#loc31)) +#loc107 = loc("loss"(#loc48)) +#loc114 = loc(callsite(#loc1 at #loc84)) +#loc116 = loc(callsite(#loc1 at #loc92)) +#loc118 = loc(callsite(#loc1 at #loc107)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @log_softmax_forward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %loss_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("loss_ptr"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0xFF800000> : tensor<32768xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32768xf32, #blocked> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_2 = arith.constant 0xFF800000 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c0_i8 = arith.constant 0 : i8 loc(#loc1) + %cst_3 = arith.constant dense<0xFF80> : tensor<32768xbf16, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<32768xbf16, #blocked> loc(#loc1) + %program_id = tt.get_program_id x : i32 loc(#loc68) + %program_id_5 = arith.extsi %program_id : i32 to i64 loc(#loc69) + %logits_ptr_6 = arith.extsi %logits_stride : i32 to i64 loc(#loc70) + %logits_ptr_7 = arith.muli %program_id_5, %logits_ptr_6 : i64 loc(#loc70) + %logits_ptr_8 = tt.addptr %logits_ptr, %logits_ptr_7 : !tt.ptr, i64 loc(#loc71) + %target_ptr_9 = arith.extsi %target_stride : i32 to i64 loc(#loc72) + %target_ptr_10 = arith.muli %program_id_5, %target_ptr_9 : i64 loc(#loc72) + %target_ptr_11 = tt.addptr %target_ptr, %target_ptr_10 : !tt.ptr, i64 loc(#loc73) + %position_mask_ptr_12 = tt.addptr %position_mask_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc74) + %position_mask = tt.bitcast %position_mask_ptr_12 : !tt.ptr -> !tt.ptr loc(#loc75) + %position_mask_13 = tt.load %position_mask : !tt.ptr loc(#loc75) + %position_mask_14 = arith.cmpi ne, %position_mask_13, %c0_i8 : i8 loc(#loc75) + %0 = arith.extui %position_mask_14 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + tt.return loc(#loc11) + ^bb2: // pred: ^bb0 + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc76) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc77) + %logits_block = tt.splat %logits_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc78) + %d:2 = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%m = %cst_2, %d_18 = %cst_1) -> (f32, f32) : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc80) + %offsets_20 = arith.addi %offsets_19, %offsets : tensor<32768xi32, #blocked> loc(#loc80) + %mask_21 = arith.cmpi slt, %offsets_20, %mask : tensor<32768xi32, #blocked> loc(#loc77) + %logits_block_22 = tt.addptr %logits_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc78) + %logits_block_23 = tt.load %logits_block_22, %mask_21, %cst_3 : tensor<32768x!tt.ptr, #blocked> loc(#loc81) + %logits_block_24 = arith.extf %logits_block_23 : tensor<32768xbf16, #blocked> to tensor<32768xf32, #blocked> loc(#loc82) + %block_max = arith.select %mask_21, %logits_block_24, %cst : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc83) + %block_max_25 = tt.reshape %block_max allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc113) + %block_max_26 = "tt.reduce"(%block_max_25) <{axis = 0 : i32}> ({ + ^bb0(%block_max_37: f32 loc(callsite(#loc1 at #loc84)), %block_max_38: f32 loc(callsite(#loc1 at #loc84))): + %block_max_39 = arith.maxnumf %block_max_37, %block_max_38 : f32 loc(#loc119) + tt.reduce.return %block_max_39 : f32 loc(#loc113) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc113) + %m_new = arith.maxnumf %m, %block_max_26 : f32 loc(#loc85) + %d_27 = arith.subf %m, %m_new : f32 loc(#loc86) + %d_28 = math.exp %d_27 : f32 loc(#loc87) + %d_29 = arith.mulf %d_18, %d_28 : f32 loc(#loc88) + %d_30 = tt.splat %m_new : f32 -> tensor<32768xf32, #blocked> loc(#loc89) + %d_31 = arith.subf %logits_block_24, %d_30 : tensor<32768xf32, #blocked> loc(#loc89) + %d_32 = math.exp %d_31 : tensor<32768xf32, #blocked> loc(#loc90) + %d_33 = arith.select %mask_21, %d_32, %cst_0 : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc91) + %d_34 = tt.reshape %d_33 allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc115) + %d_35 = "tt.reduce"(%d_34) <{axis = 0 : i32}> ({ + ^bb0(%d_37: f32 loc(callsite(#loc1 at #loc92)), %d_38: f32 loc(callsite(#loc1 at #loc92))): + %d_39 = arith.addf %d_37, %d_38 : f32 loc(#loc120) + tt.reduce.return %d_39 : f32 loc(#loc115) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc115) + %d_36 = arith.addf %d_29, %d_35 : f32 loc(#loc93) + scf.yield %m_new, %d_36 : f32, f32 loc(#loc34) + } loc(#loc112) + %target_block = tt.splat %target_ptr_11 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc94) + %normalized_logits = tt.splat %d#0 : f32 -> tensor<32768xf32, #blocked> loc(#loc95) + %log_normalizer = math.log %d#1 : f32 loc(#loc96) + %log_softmax_logits = tt.splat %log_normalizer : f32 -> tensor<32768xf32, #blocked> loc(#loc97) + %loss = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%loss_18 = %cst_1) -> (f32) : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc99) + %offsets_20 = arith.addi %offsets_19, %offsets : tensor<32768xi32, #blocked> loc(#loc99) + %mask_21 = arith.cmpi slt, %offsets_20, %mask : tensor<32768xi32, #blocked> loc(#loc100) + %logits_block_22 = tt.addptr %logits_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc101) + %logits_block_23 = tt.load %logits_block_22, %mask_21, %cst_4 : tensor<32768x!tt.ptr, #blocked> loc(#loc102) + %logits_block_24 = arith.extf %logits_block_23 : tensor<32768xbf16, #blocked> to tensor<32768xf32, #blocked> loc(#loc103) + %target_block_25 = tt.addptr %target_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc94) + %target_block_26 = tt.load %target_block_25, %mask_21, %cst_0 : tensor<32768x!tt.ptr, #blocked> loc(#loc104) + %normalized_logits_27 = arith.subf %logits_block_24, %normalized_logits : tensor<32768xf32, #blocked> loc(#loc95) + %log_softmax_logits_28 = arith.subf %normalized_logits_27, %log_softmax_logits : tensor<32768xf32, #blocked> loc(#loc97) + %weighted_log_prob = arith.mulf %target_block_26, %log_softmax_logits_28 : tensor<32768xf32, #blocked> loc(#loc105) + %loss_29 = arith.select %mask_21, %weighted_log_prob, %cst_0 : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc106) + %loss_30 = tt.reshape %loss_29 allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc117) + %loss_31 = "tt.reduce"(%loss_30) <{axis = 0 : i32}> ({ + ^bb0(%loss_33: f32 loc(callsite(#loc1 at #loc107)), %loss_34: f32 loc(callsite(#loc1 at #loc107))): + %loss_35 = arith.addf %loss_33, %loss_34 : f32 loc(#loc121) + tt.reduce.return %loss_35 : f32 loc(#loc117) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc117) + %loss_32 = arith.addf %loss_18, %loss_31 : f32 loc(#loc108) + scf.yield %loss_32 : f32 loc(#loc50) + } loc(#loc98) + %loss_ptr_15 = tt.addptr %loss_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc109) + %m_ptr_16 = tt.addptr %m_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc110) + %d_ptr_17 = tt.addptr %d_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc111) + %2 = arith.subf %cst_1, %loss : f32 loc(#loc54) + tt.store %loss_ptr_15, %2 : !tt.ptr loc(#loc55) + tt.store %m_ptr_16, %d#0 : !tt.ptr loc(#loc56) + tt.store %d_ptr_17, %d#1 : !tt.ptr loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:31) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:31) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:18) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:31) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:18) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":67:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":68:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":69:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":70:8) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":77:25) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:25) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":75:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:22) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":80:15) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:56) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":82:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:27) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:49) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:57) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":86:8) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:44) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":99:43) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":100:32) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":101:49) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":89:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":90:22) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":91:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:44) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:31) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":93:12) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:31) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":102:43) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:57) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:16) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:8) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":105:16) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":106:13) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":107:13) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":109:20) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:4) +#loc68 = loc("program_id"(#loc2)) +#loc69 = loc("program_id"(#loc3)) +#loc70 = loc("logits_ptr"(#loc4)) +#loc71 = loc("logits_ptr"(#loc5)) +#loc72 = loc("target_ptr"(#loc6)) +#loc73 = loc("target_ptr"(#loc7)) +#loc74 = loc("position_mask_ptr"(#loc8)) +#loc75 = loc("position_mask"(#loc9)) +#loc76 = loc("offsets"(#loc12)) +#loc77 = loc("mask"(#loc13)) +#loc78 = loc("logits_block"(#loc14)) +#loc79 = loc("m"(#loc15)) +#loc80 = loc("offsets"(#loc16)) +#loc81 = loc("logits_block"(#loc17)) +#loc82 = loc("logits_block"(#loc18)) +#loc83 = loc("block_max"(#loc19)) +#loc85 = loc("m_new"(#loc23)) +#loc86 = loc("d"(#loc24)) +#loc87 = loc("d"(#loc25)) +#loc88 = loc("d"(#loc26)) +#loc89 = loc("d"(#loc27)) +#loc90 = loc("d"(#loc28)) +#loc91 = loc("d"(#loc29)) +#loc93 = loc("d"(#loc33)) +#loc94 = loc("target_block"(#loc35)) +#loc95 = loc("normalized_logits"(#loc36)) +#loc96 = loc("log_normalizer"(#loc37)) +#loc97 = loc("log_softmax_logits"(#loc38)) +#loc98 = loc("loss"(#loc39)) +#loc99 = loc("offsets"(#loc40)) +#loc100 = loc("mask"(#loc41)) +#loc101 = loc("logits_block"(#loc42)) +#loc102 = loc("logits_block"(#loc43)) +#loc103 = loc("logits_block"(#loc44)) +#loc104 = loc("target_block"(#loc45)) +#loc105 = loc("weighted_log_prob"(#loc46)) +#loc106 = loc("loss"(#loc47)) +#loc108 = loc("loss"(#loc49)) +#loc109 = loc("loss_ptr"(#loc51)) +#loc110 = loc("m_ptr"(#loc52)) +#loc111 = loc("d_ptr"(#loc53)) +#loc112 = loc("d"(#loc79)) +#loc113 = loc(callsite(#loc20 at #loc84)) +#loc115 = loc(callsite(#loc30 at #loc92)) +#loc117 = loc(callsite(#loc30 at #loc107)) +#loc119 = loc(callsite(#loc22 at #loc113)) +#loc120 = loc(callsite(#loc32 at #loc115)) +#loc121 = loc(callsite(#loc32 at #loc117)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttir new file mode 100644 index 0000000000000000000000000000000000000000..78d69b368d43f3243cfcd244aa9d9620d99d39b3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ELN27VNVSP7HVH5MWV43FCRKY4AHNEVBQN7ZXY5OFW2QPUQ5S65Q/log_softmax_forward_kernel.ttir @@ -0,0 +1,226 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":50:0) +#loc1 = loc(unknown) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:27) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:12) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:23) +#loc60 = loc("logits_ptr"(#loc)) +#loc61 = loc("logits_stride"(#loc)) +#loc62 = loc("target_ptr"(#loc)) +#loc63 = loc("target_stride"(#loc)) +#loc64 = loc("position_mask_ptr"(#loc)) +#loc65 = loc("loss_ptr"(#loc)) +#loc66 = loc("m_ptr"(#loc)) +#loc67 = loc("d_ptr"(#loc)) +#loc68 = loc("n_cols"(#loc)) +#loc85 = loc("block_max"(#loc21)) +#loc93 = loc("d"(#loc31)) +#loc109 = loc("loss"(#loc49)) +#loc116 = loc(callsite(#loc1 at #loc85)) +#loc118 = loc(callsite(#loc1 at #loc93)) +#loc120 = loc(callsite(#loc1 at #loc109)) +module { + tt.func public @log_softmax_forward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %loss_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("loss_ptr"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0xFF80> : tensor<32768xbf16> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc1) + %cst_2 = arith.constant dense<0xFF800000> : tensor<32768xf32> loc(#loc1) + %cst_3 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %cst_4 = arith.constant 0xFF800000 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %position_mask = arith.constant 0 : i8 loc(#loc69) + %program_id = tt.get_program_id x : i32 loc(#loc70) + %program_id_5 = arith.extsi %program_id : i32 to i64 loc(#loc71) + %logits_ptr_6 = arith.extsi %logits_stride : i32 to i64 loc(#loc72) + %logits_ptr_7 = arith.muli %program_id_5, %logits_ptr_6 : i64 loc(#loc72) + %logits_ptr_8 = tt.addptr %logits_ptr, %logits_ptr_7 : !tt.ptr, i64 loc(#loc73) + %target_ptr_9 = arith.extsi %target_stride : i32 to i64 loc(#loc74) + %target_ptr_10 = arith.muli %program_id_5, %target_ptr_9 : i64 loc(#loc74) + %target_ptr_11 = tt.addptr %target_ptr, %target_ptr_10 : !tt.ptr, i64 loc(#loc75) + %position_mask_ptr_12 = tt.addptr %position_mask_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc76) + %position_mask_13 = tt.bitcast %position_mask_ptr_12 : !tt.ptr -> !tt.ptr loc(#loc69) + %position_mask_14 = tt.load %position_mask_13 : !tt.ptr loc(#loc69) + %position_mask_15 = arith.cmpi ne, %position_mask_14, %position_mask : i8 loc(#loc69) + %0 = arith.extui %position_mask_15 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + tt.return loc(#loc11) + ^bb2: // pred: ^bb0 + %d:2 = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%m = %cst_4, %d_19 = %cst_3) -> (f32, f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc78) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc79) + %offsets_21 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc79) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc80) + %mask_22 = arith.cmpi slt, %offsets_21, %mask : tensor<32768xi32> loc(#loc80) + %logits_block = tt.splat %logits_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc81) + %logits_block_23 = tt.addptr %logits_block, %offsets_21 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc81) + %logits_block_24 = tt.load %logits_block_23, %mask_22, %cst_0 : tensor<32768x!tt.ptr> loc(#loc82) + %logits_block_25 = arith.extf %logits_block_24 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc83) + %block_max = arith.select %mask_22, %logits_block_25, %cst_2 : tensor<32768xi1>, tensor<32768xf32> loc(#loc84) + %block_max_26 = tt.reshape %block_max allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc115) + %block_max_27 = "tt.reduce"(%block_max_26) <{axis = 0 : i32}> ({ + ^bb0(%block_max_38: f32 loc(callsite(#loc1 at #loc85)), %block_max_39: f32 loc(callsite(#loc1 at #loc85))): + %block_max_40 = arith.maxnumf %block_max_38, %block_max_39 : f32 loc(#loc121) + tt.reduce.return %block_max_40 : f32 loc(#loc115) + }) : (tensor<32768xf32>) -> f32 loc(#loc115) + %m_new = arith.maxnumf %m, %block_max_27 : f32 loc(#loc86) + %d_28 = arith.subf %m, %m_new : f32 loc(#loc87) + %d_29 = math.exp %d_28 : f32 loc(#loc88) + %d_30 = arith.mulf %d_19, %d_29 : f32 loc(#loc89) + %d_31 = tt.splat %m_new : f32 -> tensor<32768xf32> loc(#loc90) + %d_32 = arith.subf %logits_block_25, %d_31 : tensor<32768xf32> loc(#loc90) + %d_33 = math.exp %d_32 : tensor<32768xf32> loc(#loc91) + %d_34 = arith.select %mask_22, %d_33, %cst_1 : tensor<32768xi1>, tensor<32768xf32> loc(#loc92) + %d_35 = tt.reshape %d_34 allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc117) + %d_36 = "tt.reduce"(%d_35) <{axis = 0 : i32}> ({ + ^bb0(%d_38: f32 loc(callsite(#loc1 at #loc93)), %d_39: f32 loc(callsite(#loc1 at #loc93))): + %d_40 = arith.addf %d_38, %d_39 : f32 loc(#loc122) + tt.reduce.return %d_40 : f32 loc(#loc117) + }) : (tensor<32768xf32>) -> f32 loc(#loc117) + %d_37 = arith.addf %d_30, %d_36 : f32 loc(#loc94) + scf.yield %m_new, %d_37 : f32, f32 loc(#loc34) + } loc(#loc114) + %loss = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%loss_19 = %cst_3) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc96) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc97) + %offsets_21 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc97) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc98) + %mask_22 = arith.cmpi slt, %offsets_21, %mask : tensor<32768xi32> loc(#loc98) + %logits_block = tt.splat %logits_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc99) + %logits_block_23 = tt.addptr %logits_block, %offsets_21 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc99) + %logits_block_24 = tt.load %logits_block_23, %mask_22, %cst : tensor<32768x!tt.ptr> loc(#loc100) + %logits_block_25 = arith.extf %logits_block_24 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc101) + %target_block = tt.splat %target_ptr_11 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc102) + %target_block_26 = tt.addptr %target_block, %offsets_21 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc102) + %target_block_27 = tt.load %target_block_26, %mask_22, %cst_1 : tensor<32768x!tt.ptr> loc(#loc103) + %normalized_logits = tt.splat %d#0 : f32 -> tensor<32768xf32> loc(#loc104) + %normalized_logits_28 = arith.subf %logits_block_25, %normalized_logits : tensor<32768xf32> loc(#loc104) + %log_normalizer = math.log %d#1 : f32 loc(#loc105) + %log_softmax_logits = tt.splat %log_normalizer : f32 -> tensor<32768xf32> loc(#loc106) + %log_softmax_logits_29 = arith.subf %normalized_logits_28, %log_softmax_logits : tensor<32768xf32> loc(#loc106) + %weighted_log_prob = arith.mulf %target_block_27, %log_softmax_logits_29 : tensor<32768xf32> loc(#loc107) + %loss_30 = arith.select %mask_22, %weighted_log_prob, %cst_1 : tensor<32768xi1>, tensor<32768xf32> loc(#loc108) + %loss_31 = tt.reshape %loss_30 allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc119) + %loss_32 = "tt.reduce"(%loss_31) <{axis = 0 : i32}> ({ + ^bb0(%loss_34: f32 loc(callsite(#loc1 at #loc109)), %loss_35: f32 loc(callsite(#loc1 at #loc109))): + %loss_36 = arith.addf %loss_34, %loss_35 : f32 loc(#loc123) + tt.reduce.return %loss_36 : f32 loc(#loc119) + }) : (tensor<32768xf32>) -> f32 loc(#loc119) + %loss_33 = arith.addf %loss_19, %loss_32 : f32 loc(#loc110) + scf.yield %loss_33 : f32 loc(#loc51) + } loc(#loc95) + %loss_ptr_16 = tt.addptr %loss_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc111) + %m_ptr_17 = tt.addptr %m_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc112) + %d_ptr_18 = tt.addptr %d_ptr, %program_id_5 : !tt.ptr, i64 loc(#loc113) + %2 = arith.subf %cst_3, %loss : f32 loc(#loc55) + tt.store %loss_ptr_16, %2 : !tt.ptr loc(#loc56) + tt.store %m_ptr_17, %d#0 : !tt.ptr loc(#loc57) + tt.store %d_ptr_18, %d#1 : !tt.ptr loc(#loc58) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":68:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":64:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":65:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":66:18) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":67:25) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":69:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":70:8) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":75:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:35) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":76:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":77:25) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:25) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":79:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":80:15) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":81:56) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":82:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:27) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:49) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":84:57) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":83:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":86:8) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":89:30) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":90:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":90:22) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":91:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:44) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":92:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":93:12) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:44) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":95:31) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":99:43) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":100:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":101:49) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":102:43) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:57) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:16) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":103:8) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":105:16) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":106:13) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":107:13) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":108:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":109:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:20) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":110:4) +#loc69 = loc("position_mask"(#loc2)) +#loc70 = loc("program_id"(#loc3)) +#loc71 = loc("program_id"(#loc4)) +#loc72 = loc("logits_ptr"(#loc5)) +#loc73 = loc("logits_ptr"(#loc6)) +#loc74 = loc("target_ptr"(#loc7)) +#loc75 = loc("target_ptr"(#loc8)) +#loc76 = loc("position_mask_ptr"(#loc9)) +#loc77 = loc("m"(#loc12)) +#loc78 = loc("offsets"(#loc13)) +#loc79 = loc("offsets"(#loc14)) +#loc80 = loc("mask"(#loc15)) +#loc81 = loc("logits_block"(#loc16)) +#loc82 = loc("logits_block"(#loc17)) +#loc83 = loc("logits_block"(#loc18)) +#loc84 = loc("block_max"(#loc19)) +#loc86 = loc("m_new"(#loc23)) +#loc87 = loc("d"(#loc24)) +#loc88 = loc("d"(#loc25)) +#loc89 = loc("d"(#loc26)) +#loc90 = loc("d"(#loc27)) +#loc91 = loc("d"(#loc28)) +#loc92 = loc("d"(#loc29)) +#loc94 = loc("d"(#loc33)) +#loc95 = loc("loss"(#loc35)) +#loc96 = loc("offsets"(#loc36)) +#loc97 = loc("offsets"(#loc37)) +#loc98 = loc("mask"(#loc38)) +#loc99 = loc("logits_block"(#loc39)) +#loc100 = loc("logits_block"(#loc40)) +#loc101 = loc("logits_block"(#loc41)) +#loc102 = loc("target_block"(#loc42)) +#loc103 = loc("target_block"(#loc43)) +#loc104 = loc("normalized_logits"(#loc44)) +#loc105 = loc("log_normalizer"(#loc45)) +#loc106 = loc("log_softmax_logits"(#loc46)) +#loc107 = loc("weighted_log_prob"(#loc47)) +#loc108 = loc("loss"(#loc48)) +#loc110 = loc("loss"(#loc50)) +#loc111 = loc("loss_ptr"(#loc52)) +#loc112 = loc("m_ptr"(#loc53)) +#loc113 = loc("d_ptr"(#loc54)) +#loc114 = loc("d"(#loc77)) +#loc115 = loc(callsite(#loc20 at #loc85)) +#loc117 = loc(callsite(#loc30 at #loc93)) +#loc119 = loc(callsite(#loc30 at #loc109)) +#loc121 = loc(callsite(#loc22 at #loc115)) +#loc122 = loc(callsite(#loc32 at #loc117)) +#loc123 = loc(callsite(#loc32 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..137a4c71b389dd1ccfac8490bacf9954e1869e69 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aa0035920e7601eef839dd538bebf9d5e4431884 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "239ded5b75de8be8d5a6cc1077ce3cbdc0d26f69988d60123785c978c61178f7", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c04b46ec92296dc4f628c333dcbe69d09981900a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,10206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = and i32 %8, 31, !dbg !9 + %10 = lshr i32 %8, 5, !dbg !9 + %11 = and i32 %8, 511, !dbg !9 + %12 = shl nuw nsw i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, 28672, !dbg !9 + %14 = shl nuw nsw i32 %11, 2, !dbg !9 + %15 = or disjoint i32 %14, 30720, !dbg !9 + %16 = icmp samesign ult i32 %13, 32000, !dbg !10 + %17 = icmp samesign ult i32 %15, 32000, !dbg !10 + %18 = mul i32 %7, 32000, !dbg !11 + %19 = add i32 %12, %18, !dbg !12 + %20 = add i32 %18, 4096, !dbg !9 + %21 = add i32 %20, %12, !dbg !12 + %22 = add i32 %18, 8192, !dbg !9 + %23 = add i32 %22, %12, !dbg !12 + %24 = add i32 %18, 12288, !dbg !9 + %25 = add i32 %24, %12, !dbg !12 + %26 = add i32 %18, 16384, !dbg !9 + %27 = add i32 %26, %12, !dbg !12 + %28 = add i32 %18, 20480, !dbg !9 + %29 = add i32 %28, %12, !dbg !12 + %30 = add i32 %18, 24576, !dbg !9 + %31 = add i32 %30, %12, !dbg !12 + %32 = add i32 %13, %18, !dbg !12 + %33 = add i32 %14, %18, !dbg !12 + %34 = add i32 %18, 2048, !dbg !9 + %35 = add i32 %34, %14, !dbg !12 + %36 = add i32 %20, %14, !dbg !12 + %37 = add i32 %18, 6144, !dbg !9 + %38 = add i32 %37, %14, !dbg !12 + %39 = add i32 %22, %14, !dbg !12 + %40 = add i32 %18, 10240, !dbg !9 + %41 = add i32 %40, %14, !dbg !12 + %42 = add i32 %24, %14, !dbg !12 + %43 = add i32 %18, 14336, !dbg !9 + %44 = add i32 %43, %14, !dbg !12 + %45 = add i32 %26, %14, !dbg !12 + %46 = add i32 %18, 18432, !dbg !9 + %47 = add i32 %46, %14, !dbg !12 + %48 = add i32 %28, %14, !dbg !12 + %49 = add i32 %18, 22528, !dbg !9 + %50 = add i32 %49, %14, !dbg !12 + %51 = add i32 %30, %14, !dbg !12 + %52 = add i32 %18, 26624, !dbg !9 + %53 = add i32 %52, %14, !dbg !12 + %54 = add i32 %18, 28672, !dbg !9 + %55 = add i32 %54, %14, !dbg !12 + %56 = add i32 %15, %18, !dbg !12 + %57 = sext i32 %19 to i64, !dbg !13 + %58 = getelementptr bfloat, ptr addrspace(1) %0, i64 %57, !dbg !13 + %59 = sext i32 %21 to i64, !dbg !13 + %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %59, !dbg !13 + %61 = sext i32 %23 to i64, !dbg !13 + %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %61, !dbg !13 + %63 = sext i32 %25 to i64, !dbg !13 + %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %63, !dbg !13 + %65 = sext i32 %27 to i64, !dbg !13 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !13 + %67 = sext i32 %29 to i64, !dbg !13 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !13 + %69 = sext i32 %31 to i64, !dbg !13 + %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %69, !dbg !13 + %71 = sext i32 %32 to i64, !dbg !13 + %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %71, !dbg !13 + %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %74 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %58, i64 %73, i1 true) #6, !dbg !14 + %75 = extractvalue { i32, i32, i32, i32 } %74, 0, !dbg !14 + %76 = bitcast i32 %75 to <2 x bfloat>, !dbg !14 + %77 = extractvalue { i32, i32, i32, i32 } %74, 1, !dbg !14 + %78 = bitcast i32 %77 to <2 x bfloat>, !dbg !14 + %79 = extractvalue { i32, i32, i32, i32 } %74, 2, !dbg !14 + %80 = bitcast i32 %79 to <2 x bfloat>, !dbg !14 + %81 = extractvalue { i32, i32, i32, i32 } %74, 3, !dbg !14 + %82 = bitcast i32 %81 to <2 x bfloat>, !dbg !14 + %83 = extractelement <2 x bfloat> %76, i64 0, !dbg !14 + %84 = extractelement <2 x bfloat> %76, i64 1, !dbg !14 + %85 = extractelement <2 x bfloat> %78, i64 0, !dbg !14 + %86 = extractelement <2 x bfloat> %78, i64 1, !dbg !14 + %87 = extractelement <2 x bfloat> %80, i64 0, !dbg !14 + %88 = extractelement <2 x bfloat> %80, i64 1, !dbg !14 + %89 = extractelement <2 x bfloat> %82, i64 0, !dbg !14 + %90 = extractelement <2 x bfloat> %82, i64 1, !dbg !14 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %92 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %60, i64 %91, i1 true) #6, !dbg !14 + %93 = extractvalue { i32, i32, i32, i32 } %92, 0, !dbg !14 + %94 = bitcast i32 %93 to <2 x bfloat>, !dbg !14 + %95 = extractvalue { i32, i32, i32, i32 } %92, 1, !dbg !14 + %96 = bitcast i32 %95 to <2 x bfloat>, !dbg !14 + %97 = extractvalue { i32, i32, i32, i32 } %92, 2, !dbg !14 + %98 = bitcast i32 %97 to <2 x bfloat>, !dbg !14 + %99 = extractvalue { i32, i32, i32, i32 } %92, 3, !dbg !14 + %100 = bitcast i32 %99 to <2 x bfloat>, !dbg !14 + %101 = extractelement <2 x bfloat> %94, i64 0, !dbg !14 + %102 = extractelement <2 x bfloat> %94, i64 1, !dbg !14 + %103 = extractelement <2 x bfloat> %96, i64 0, !dbg !14 + %104 = extractelement <2 x bfloat> %96, i64 1, !dbg !14 + %105 = extractelement <2 x bfloat> %98, i64 0, !dbg !14 + %106 = extractelement <2 x bfloat> %98, i64 1, !dbg !14 + %107 = extractelement <2 x bfloat> %100, i64 0, !dbg !14 + %108 = extractelement <2 x bfloat> %100, i64 1, !dbg !14 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %110 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %62, i64 %109, i1 true) #6, !dbg !14 + %111 = extractvalue { i32, i32, i32, i32 } %110, 0, !dbg !14 + %112 = bitcast i32 %111 to <2 x bfloat>, !dbg !14 + %113 = extractvalue { i32, i32, i32, i32 } %110, 1, !dbg !14 + %114 = bitcast i32 %113 to <2 x bfloat>, !dbg !14 + %115 = extractvalue { i32, i32, i32, i32 } %110, 2, !dbg !14 + %116 = bitcast i32 %115 to <2 x bfloat>, !dbg !14 + %117 = extractvalue { i32, i32, i32, i32 } %110, 3, !dbg !14 + %118 = bitcast i32 %117 to <2 x bfloat>, !dbg !14 + %119 = extractelement <2 x bfloat> %112, i64 0, !dbg !14 + %120 = extractelement <2 x bfloat> %112, i64 1, !dbg !14 + %121 = extractelement <2 x bfloat> %114, i64 0, !dbg !14 + %122 = extractelement <2 x bfloat> %114, i64 1, !dbg !14 + %123 = extractelement <2 x bfloat> %116, i64 0, !dbg !14 + %124 = extractelement <2 x bfloat> %116, i64 1, !dbg !14 + %125 = extractelement <2 x bfloat> %118, i64 0, !dbg !14 + %126 = extractelement <2 x bfloat> %118, i64 1, !dbg !14 + %127 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %64, i64 %127, i1 true) #6, !dbg !14 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !14 + %130 = bitcast i32 %129 to <2 x bfloat>, !dbg !14 + %131 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !14 + %132 = bitcast i32 %131 to <2 x bfloat>, !dbg !14 + %133 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !14 + %134 = bitcast i32 %133 to <2 x bfloat>, !dbg !14 + %135 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !14 + %136 = bitcast i32 %135 to <2 x bfloat>, !dbg !14 + %137 = extractelement <2 x bfloat> %130, i64 0, !dbg !14 + %138 = extractelement <2 x bfloat> %130, i64 1, !dbg !14 + %139 = extractelement <2 x bfloat> %132, i64 0, !dbg !14 + %140 = extractelement <2 x bfloat> %132, i64 1, !dbg !14 + %141 = extractelement <2 x bfloat> %134, i64 0, !dbg !14 + %142 = extractelement <2 x bfloat> %134, i64 1, !dbg !14 + %143 = extractelement <2 x bfloat> %136, i64 0, !dbg !14 + %144 = extractelement <2 x bfloat> %136, i64 1, !dbg !14 + %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %146 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %66, i64 %145, i1 true) #6, !dbg !14 + %147 = extractvalue { i32, i32, i32, i32 } %146, 0, !dbg !14 + %148 = bitcast i32 %147 to <2 x bfloat>, !dbg !14 + %149 = extractvalue { i32, i32, i32, i32 } %146, 1, !dbg !14 + %150 = bitcast i32 %149 to <2 x bfloat>, !dbg !14 + %151 = extractvalue { i32, i32, i32, i32 } %146, 2, !dbg !14 + %152 = bitcast i32 %151 to <2 x bfloat>, !dbg !14 + %153 = extractvalue { i32, i32, i32, i32 } %146, 3, !dbg !14 + %154 = bitcast i32 %153 to <2 x bfloat>, !dbg !14 + %155 = extractelement <2 x bfloat> %148, i64 0, !dbg !14 + %156 = extractelement <2 x bfloat> %148, i64 1, !dbg !14 + %157 = extractelement <2 x bfloat> %150, i64 0, !dbg !14 + %158 = extractelement <2 x bfloat> %150, i64 1, !dbg !14 + %159 = extractelement <2 x bfloat> %152, i64 0, !dbg !14 + %160 = extractelement <2 x bfloat> %152, i64 1, !dbg !14 + %161 = extractelement <2 x bfloat> %154, i64 0, !dbg !14 + %162 = extractelement <2 x bfloat> %154, i64 1, !dbg !14 + %163 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %164 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %68, i64 %163, i1 true) #6, !dbg !14 + %165 = extractvalue { i32, i32, i32, i32 } %164, 0, !dbg !14 + %166 = bitcast i32 %165 to <2 x bfloat>, !dbg !14 + %167 = extractvalue { i32, i32, i32, i32 } %164, 1, !dbg !14 + %168 = bitcast i32 %167 to <2 x bfloat>, !dbg !14 + %169 = extractvalue { i32, i32, i32, i32 } %164, 2, !dbg !14 + %170 = bitcast i32 %169 to <2 x bfloat>, !dbg !14 + %171 = extractvalue { i32, i32, i32, i32 } %164, 3, !dbg !14 + %172 = bitcast i32 %171 to <2 x bfloat>, !dbg !14 + %173 = extractelement <2 x bfloat> %166, i64 0, !dbg !14 + %174 = extractelement <2 x bfloat> %166, i64 1, !dbg !14 + %175 = extractelement <2 x bfloat> %168, i64 0, !dbg !14 + %176 = extractelement <2 x bfloat> %168, i64 1, !dbg !14 + %177 = extractelement <2 x bfloat> %170, i64 0, !dbg !14 + %178 = extractelement <2 x bfloat> %170, i64 1, !dbg !14 + %179 = extractelement <2 x bfloat> %172, i64 0, !dbg !14 + %180 = extractelement <2 x bfloat> %172, i64 1, !dbg !14 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %182 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %70, i64 %181, i1 true) #6, !dbg !14 + %183 = extractvalue { i32, i32, i32, i32 } %182, 0, !dbg !14 + %184 = bitcast i32 %183 to <2 x bfloat>, !dbg !14 + %185 = extractvalue { i32, i32, i32, i32 } %182, 1, !dbg !14 + %186 = bitcast i32 %185 to <2 x bfloat>, !dbg !14 + %187 = extractvalue { i32, i32, i32, i32 } %182, 2, !dbg !14 + %188 = bitcast i32 %187 to <2 x bfloat>, !dbg !14 + %189 = extractvalue { i32, i32, i32, i32 } %182, 3, !dbg !14 + %190 = bitcast i32 %189 to <2 x bfloat>, !dbg !14 + %191 = extractelement <2 x bfloat> %184, i64 0, !dbg !14 + %192 = extractelement <2 x bfloat> %184, i64 1, !dbg !14 + %193 = extractelement <2 x bfloat> %186, i64 0, !dbg !14 + %194 = extractelement <2 x bfloat> %186, i64 1, !dbg !14 + %195 = extractelement <2 x bfloat> %188, i64 0, !dbg !14 + %196 = extractelement <2 x bfloat> %188, i64 1, !dbg !14 + %197 = extractelement <2 x bfloat> %190, i64 0, !dbg !14 + %198 = extractelement <2 x bfloat> %190, i64 1, !dbg !14 + %199 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %72, i64 %199, i1 %16) #6, !dbg !14 + %201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !14 + %202 = bitcast i32 %201 to <2 x bfloat>, !dbg !14 + %203 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !14 + %204 = bitcast i32 %203 to <2 x bfloat>, !dbg !14 + %205 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !14 + %206 = bitcast i32 %205 to <2 x bfloat>, !dbg !14 + %207 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !14 + %208 = bitcast i32 %207 to <2 x bfloat>, !dbg !14 + %209 = extractelement <2 x bfloat> %202, i64 0, !dbg !14 + %210 = extractelement <2 x bfloat> %202, i64 1, !dbg !14 + %211 = fpext bfloat %83 to float, !dbg !15 + %212 = fpext bfloat %84 to float, !dbg !15 + %213 = fpext bfloat %85 to float, !dbg !15 + %214 = fpext bfloat %86 to float, !dbg !15 + %215 = fpext bfloat %87 to float, !dbg !15 + %216 = fpext bfloat %88 to float, !dbg !15 + %217 = fpext bfloat %89 to float, !dbg !15 + %218 = fpext bfloat %90 to float, !dbg !15 + %219 = fpext bfloat %101 to float, !dbg !15 + %220 = fpext bfloat %102 to float, !dbg !15 + %221 = fpext bfloat %103 to float, !dbg !15 + %222 = fpext bfloat %104 to float, !dbg !15 + %223 = fpext bfloat %105 to float, !dbg !15 + %224 = fpext bfloat %106 to float, !dbg !15 + %225 = fpext bfloat %107 to float, !dbg !15 + %226 = fpext bfloat %108 to float, !dbg !15 + %227 = fpext bfloat %119 to float, !dbg !15 + %228 = fpext bfloat %120 to float, !dbg !15 + %229 = fpext bfloat %121 to float, !dbg !15 + %230 = fpext bfloat %122 to float, !dbg !15 + %231 = fpext bfloat %123 to float, !dbg !15 + %232 = fpext bfloat %124 to float, !dbg !15 + %233 = fpext bfloat %125 to float, !dbg !15 + %234 = fpext bfloat %126 to float, !dbg !15 + %235 = fpext bfloat %137 to float, !dbg !15 + %236 = fpext bfloat %138 to float, !dbg !15 + %237 = fpext bfloat %139 to float, !dbg !15 + %238 = fpext bfloat %140 to float, !dbg !15 + %239 = fpext bfloat %141 to float, !dbg !15 + %240 = fpext bfloat %142 to float, !dbg !15 + %241 = fpext bfloat %143 to float, !dbg !15 + %242 = fpext bfloat %144 to float, !dbg !15 + %243 = fpext bfloat %155 to float, !dbg !15 + %244 = fpext bfloat %156 to float, !dbg !15 + %245 = fpext bfloat %157 to float, !dbg !15 + %246 = fpext bfloat %158 to float, !dbg !15 + %247 = fpext bfloat %159 to float, !dbg !15 + %248 = fpext bfloat %160 to float, !dbg !15 + %249 = fpext bfloat %161 to float, !dbg !15 + %250 = fpext bfloat %162 to float, !dbg !15 + %251 = fpext bfloat %173 to float, !dbg !15 + %252 = fpext bfloat %174 to float, !dbg !15 + %253 = fpext bfloat %175 to float, !dbg !15 + %254 = fpext bfloat %176 to float, !dbg !15 + %255 = fpext bfloat %177 to float, !dbg !15 + %256 = fpext bfloat %178 to float, !dbg !15 + %257 = fpext bfloat %179 to float, !dbg !15 + %258 = fpext bfloat %180 to float, !dbg !15 + %259 = fpext bfloat %191 to float, !dbg !15 + %260 = fpext bfloat %192 to float, !dbg !15 + %261 = fpext bfloat %193 to float, !dbg !15 + %262 = fpext bfloat %194 to float, !dbg !15 + %263 = fpext bfloat %195 to float, !dbg !15 + %264 = fpext bfloat %196 to float, !dbg !15 + %265 = fpext bfloat %197 to float, !dbg !15 + %266 = fpext bfloat %198 to float, !dbg !15 + %267 = fpext bfloat %209 to float, !dbg !15 + %268 = fpext bfloat %210 to float, !dbg !15 + %269 = extractelement <2 x bfloat> %204, i64 0, !dbg !15 + %270 = fpext bfloat %269 to float, !dbg !15 + %271 = extractelement <2 x bfloat> %204, i64 1, !dbg !15 + %272 = fpext bfloat %271 to float, !dbg !15 + %273 = extractelement <2 x bfloat> %206, i64 0, !dbg !15 + %274 = fpext bfloat %273 to float, !dbg !15 + %275 = extractelement <2 x bfloat> %206, i64 1, !dbg !15 + %276 = fpext bfloat %275 to float, !dbg !15 + %277 = extractelement <2 x bfloat> %208, i64 0, !dbg !15 + %278 = fpext bfloat %277 to float, !dbg !15 + %279 = extractelement <2 x bfloat> %208, i64 1, !dbg !15 + %280 = fpext bfloat %279 to float, !dbg !15 + %281 = fcmp oeq bfloat %83, 0xRFF80, !dbg !16 + %282 = fcmp oeq bfloat %84, 0xRFF80, !dbg !16 + %283 = fcmp oeq bfloat %85, 0xRFF80, !dbg !16 + %284 = fcmp oeq bfloat %86, 0xRFF80, !dbg !16 + %285 = fcmp oeq bfloat %87, 0xRFF80, !dbg !16 + %286 = fcmp oeq bfloat %88, 0xRFF80, !dbg !16 + %287 = fcmp oeq bfloat %89, 0xRFF80, !dbg !16 + %288 = fcmp oeq bfloat %90, 0xRFF80, !dbg !16 + %289 = fcmp oeq bfloat %101, 0xRFF80, !dbg !16 + %290 = fcmp oeq bfloat %102, 0xRFF80, !dbg !16 + %291 = fcmp oeq bfloat %103, 0xRFF80, !dbg !16 + %292 = fcmp oeq bfloat %104, 0xRFF80, !dbg !16 + %293 = fcmp oeq bfloat %105, 0xRFF80, !dbg !16 + %294 = fcmp oeq bfloat %106, 0xRFF80, !dbg !16 + %295 = fcmp oeq bfloat %107, 0xRFF80, !dbg !16 + %296 = fcmp oeq bfloat %108, 0xRFF80, !dbg !16 + %297 = fcmp oeq bfloat %119, 0xRFF80, !dbg !16 + %298 = fcmp oeq bfloat %120, 0xRFF80, !dbg !16 + %299 = fcmp oeq bfloat %121, 0xRFF80, !dbg !16 + %300 = fcmp oeq bfloat %122, 0xRFF80, !dbg !16 + %301 = fcmp oeq bfloat %123, 0xRFF80, !dbg !16 + %302 = fcmp oeq bfloat %124, 0xRFF80, !dbg !16 + %303 = fcmp oeq bfloat %125, 0xRFF80, !dbg !16 + %304 = fcmp oeq bfloat %126, 0xRFF80, !dbg !16 + %305 = fcmp oeq bfloat %137, 0xRFF80, !dbg !16 + %306 = fcmp oeq bfloat %138, 0xRFF80, !dbg !16 + %307 = fcmp oeq bfloat %139, 0xRFF80, !dbg !16 + %308 = fcmp oeq bfloat %140, 0xRFF80, !dbg !16 + %309 = fcmp oeq bfloat %141, 0xRFF80, !dbg !16 + %310 = fcmp oeq bfloat %142, 0xRFF80, !dbg !16 + %311 = fcmp oeq bfloat %143, 0xRFF80, !dbg !16 + %312 = fcmp oeq bfloat %144, 0xRFF80, !dbg !16 + %313 = fcmp oeq bfloat %155, 0xRFF80, !dbg !16 + %314 = fcmp oeq bfloat %156, 0xRFF80, !dbg !16 + %315 = fcmp oeq bfloat %157, 0xRFF80, !dbg !16 + %316 = fcmp oeq bfloat %158, 0xRFF80, !dbg !16 + %317 = fcmp oeq bfloat %159, 0xRFF80, !dbg !16 + %318 = fcmp oeq bfloat %160, 0xRFF80, !dbg !16 + %319 = fcmp oeq bfloat %161, 0xRFF80, !dbg !16 + %320 = fcmp oeq bfloat %162, 0xRFF80, !dbg !16 + %321 = fcmp oeq bfloat %173, 0xRFF80, !dbg !16 + %322 = fcmp oeq bfloat %174, 0xRFF80, !dbg !16 + %323 = fcmp oeq bfloat %175, 0xRFF80, !dbg !16 + %324 = fcmp oeq bfloat %176, 0xRFF80, !dbg !16 + %325 = fcmp oeq bfloat %177, 0xRFF80, !dbg !16 + %326 = fcmp oeq bfloat %178, 0xRFF80, !dbg !16 + %327 = fcmp oeq bfloat %179, 0xRFF80, !dbg !16 + %328 = fcmp oeq bfloat %180, 0xRFF80, !dbg !16 + %329 = fcmp oeq bfloat %191, 0xRFF80, !dbg !16 + %330 = fcmp oeq bfloat %192, 0xRFF80, !dbg !16 + %331 = fcmp oeq bfloat %193, 0xRFF80, !dbg !16 + %332 = fcmp oeq bfloat %194, 0xRFF80, !dbg !16 + %333 = fcmp oeq bfloat %195, 0xRFF80, !dbg !16 + %334 = fcmp oeq bfloat %196, 0xRFF80, !dbg !16 + %335 = fcmp oeq bfloat %197, 0xRFF80, !dbg !16 + %336 = fcmp oeq bfloat %198, 0xRFF80, !dbg !16 + %337 = fcmp oeq bfloat %209, 0xRFF80, !dbg !16 + %338 = fcmp oeq bfloat %210, 0xRFF80, !dbg !16 + %339 = fsub float 0xFFF0000000000000, %211, !dbg !20 + %340 = fsub float 0xFFF0000000000000, %212, !dbg !20 + %341 = fsub float 0xFFF0000000000000, %213, !dbg !20 + %342 = fsub float 0xFFF0000000000000, %214, !dbg !20 + %343 = fsub float 0xFFF0000000000000, %215, !dbg !20 + %344 = fsub float 0xFFF0000000000000, %216, !dbg !20 + %345 = fsub float 0xFFF0000000000000, %217, !dbg !20 + %346 = fsub float 0xFFF0000000000000, %218, !dbg !20 + %347 = fsub float 0xFFF0000000000000, %219, !dbg !20 + %348 = fsub float 0xFFF0000000000000, %220, !dbg !20 + %349 = fsub float 0xFFF0000000000000, %221, !dbg !20 + %350 = fsub float 0xFFF0000000000000, %222, !dbg !20 + %351 = fsub float 0xFFF0000000000000, %223, !dbg !20 + %352 = fsub float 0xFFF0000000000000, %224, !dbg !20 + %353 = fsub float 0xFFF0000000000000, %225, !dbg !20 + %354 = fsub float 0xFFF0000000000000, %226, !dbg !20 + %355 = fsub float 0xFFF0000000000000, %227, !dbg !20 + %356 = fsub float 0xFFF0000000000000, %228, !dbg !20 + %357 = fsub float 0xFFF0000000000000, %229, !dbg !20 + %358 = fsub float 0xFFF0000000000000, %230, !dbg !20 + %359 = fsub float 0xFFF0000000000000, %231, !dbg !20 + %360 = fsub float 0xFFF0000000000000, %232, !dbg !20 + %361 = fsub float 0xFFF0000000000000, %233, !dbg !20 + %362 = fsub float 0xFFF0000000000000, %234, !dbg !20 + %363 = fsub float 0xFFF0000000000000, %235, !dbg !20 + %364 = fsub float 0xFFF0000000000000, %236, !dbg !20 + %365 = fsub float 0xFFF0000000000000, %237, !dbg !20 + %366 = fsub float 0xFFF0000000000000, %238, !dbg !20 + %367 = fsub float 0xFFF0000000000000, %239, !dbg !20 + %368 = fsub float 0xFFF0000000000000, %240, !dbg !20 + %369 = fsub float 0xFFF0000000000000, %241, !dbg !20 + %370 = fsub float 0xFFF0000000000000, %242, !dbg !20 + %371 = fsub float 0xFFF0000000000000, %243, !dbg !20 + %372 = fsub float 0xFFF0000000000000, %244, !dbg !20 + %373 = fsub float 0xFFF0000000000000, %245, !dbg !20 + %374 = fsub float 0xFFF0000000000000, %246, !dbg !20 + %375 = fsub float 0xFFF0000000000000, %247, !dbg !20 + %376 = fsub float 0xFFF0000000000000, %248, !dbg !20 + %377 = fsub float 0xFFF0000000000000, %249, !dbg !20 + %378 = fsub float 0xFFF0000000000000, %250, !dbg !20 + %379 = fsub float 0xFFF0000000000000, %251, !dbg !20 + %380 = fsub float 0xFFF0000000000000, %252, !dbg !20 + %381 = fsub float 0xFFF0000000000000, %253, !dbg !20 + %382 = fsub float 0xFFF0000000000000, %254, !dbg !20 + %383 = fsub float 0xFFF0000000000000, %255, !dbg !20 + %384 = fsub float 0xFFF0000000000000, %256, !dbg !20 + %385 = fsub float 0xFFF0000000000000, %257, !dbg !20 + %386 = fsub float 0xFFF0000000000000, %258, !dbg !20 + %387 = fsub float 0xFFF0000000000000, %259, !dbg !20 + %388 = fsub float 0xFFF0000000000000, %260, !dbg !20 + %389 = fsub float 0xFFF0000000000000, %261, !dbg !20 + %390 = fsub float 0xFFF0000000000000, %262, !dbg !20 + %391 = fsub float 0xFFF0000000000000, %263, !dbg !20 + %392 = fsub float 0xFFF0000000000000, %264, !dbg !20 + %393 = fsub float 0xFFF0000000000000, %265, !dbg !20 + %394 = fsub float 0xFFF0000000000000, %266, !dbg !20 + %395 = fsub float 0xFFF0000000000000, %267, !dbg !20 + %396 = fsub float 0xFFF0000000000000, %268, !dbg !20 + %397 = fsub float 0xFFF0000000000000, %270, !dbg !20 + %398 = fsub float 0xFFF0000000000000, %272, !dbg !20 + %399 = fsub float 0xFFF0000000000000, %274, !dbg !20 + %400 = fsub float 0xFFF0000000000000, %276, !dbg !20 + %401 = fsub float 0xFFF0000000000000, %278, !dbg !20 + %402 = fsub float 0xFFF0000000000000, %280, !dbg !20 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i = icmp eq i32 %403, 0, !dbg !21 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i = select i1 %.not.i, float %405, float %404, !dbg !21 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i = icmp eq i32 %406, 0, !dbg !21 + %407 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !21 + %408 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !21 + %.03.i = select i1 %.not1.i, float %408, float %407, !dbg !21 + %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i = icmp eq i32 %409, 0, !dbg !21 + %410 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %411 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i = select i1 %.not2.i, float %411, float %410, !dbg !21 + %412 = fadd float %.04.i, 0xC168000FE0000000, !dbg !21 + %413 = fneg float %412, !dbg !21 + %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i = icmp eq i32 %414, 0, !dbg !21 + %415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3FF7154760000000, float %413) #6, !dbg !21 + %416 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3FF7154760000000, float %413) #6, !dbg !21 + %.0.i = select i1 %.not3.i, float %416, float %415, !dbg !21 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i = icmp eq i32 %417, 0, !dbg !21 + %418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !21 + %419 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !21 + %.01.i = select i1 %.not4.i, float %419, float %418, !dbg !21 + %420 = bitcast float %.04.i to i32, !dbg !21 + %421 = shl i32 %420, 23, !dbg !21 + %422 = bitcast i32 %421 to float, !dbg !21 + %423 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !21 + %424 = fmul float %423, %422, !dbg !21 + %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i127 = icmp eq i32 %425, 0, !dbg !21 + %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %427 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i128 = select i1 %.not.i127, float %427, float %426, !dbg !21 + %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i129 = icmp eq i32 %428, 0, !dbg !21 + %429 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i128) #6, !dbg !21 + %430 = tail call float @llvm.nvvm.saturate.f(float %.02.i128) #6, !dbg !21 + %.03.i130 = select i1 %.not1.i129, float %430, float %429, !dbg !21 + %431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i131 = icmp eq i32 %431, 0, !dbg !21 + %432 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %433 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i132 = select i1 %.not2.i131, float %433, float %432, !dbg !21 + %434 = fadd float %.04.i132, 0xC168000FE0000000, !dbg !21 + %435 = fneg float %434, !dbg !21 + %436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i133 = icmp eq i32 %436, 0, !dbg !21 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3FF7154760000000, float %435) #6, !dbg !21 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3FF7154760000000, float %435) #6, !dbg !21 + %.0.i134 = select i1 %.not3.i133, float %438, float %437, !dbg !21 + %439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i135 = icmp eq i32 %439, 0, !dbg !21 + %440 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3E54AE0C00000000, float %.0.i134) #6, !dbg !21 + %441 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3E54AE0C00000000, float %.0.i134) #6, !dbg !21 + %.01.i136 = select i1 %.not4.i135, float %441, float %440, !dbg !21 + %442 = bitcast float %.04.i132 to i32, !dbg !21 + %443 = shl i32 %442, 23, !dbg !21 + %444 = bitcast i32 %443 to float, !dbg !21 + %445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i136) #6, !dbg !21 + %446 = fmul float %445, %444, !dbg !21 + %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i137 = icmp eq i32 %447, 0, !dbg !21 + %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %449 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i138 = select i1 %.not.i137, float %449, float %448, !dbg !21 + %450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i139 = icmp eq i32 %450, 0, !dbg !21 + %451 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i138) #6, !dbg !21 + %452 = tail call float @llvm.nvvm.saturate.f(float %.02.i138) #6, !dbg !21 + %.03.i140 = select i1 %.not1.i139, float %452, float %451, !dbg !21 + %453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i141 = icmp eq i32 %453, 0, !dbg !21 + %454 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %455 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i142 = select i1 %.not2.i141, float %455, float %454, !dbg !21 + %456 = fadd float %.04.i142, 0xC168000FE0000000, !dbg !21 + %457 = fneg float %456, !dbg !21 + %458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i143 = icmp eq i32 %458, 0, !dbg !21 + %459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3FF7154760000000, float %457) #6, !dbg !21 + %460 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3FF7154760000000, float %457) #6, !dbg !21 + %.0.i144 = select i1 %.not3.i143, float %460, float %459, !dbg !21 + %461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i145 = icmp eq i32 %461, 0, !dbg !21 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3E54AE0C00000000, float %.0.i144) #6, !dbg !21 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3E54AE0C00000000, float %.0.i144) #6, !dbg !21 + %.01.i146 = select i1 %.not4.i145, float %463, float %462, !dbg !21 + %464 = bitcast float %.04.i142 to i32, !dbg !21 + %465 = shl i32 %464, 23, !dbg !21 + %466 = bitcast i32 %465 to float, !dbg !21 + %467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i146) #6, !dbg !21 + %468 = fmul float %467, %466, !dbg !21 + %469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i147 = icmp eq i32 %469, 0, !dbg !21 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i148 = select i1 %.not.i147, float %471, float %470, !dbg !21 + %472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i149 = icmp eq i32 %472, 0, !dbg !21 + %473 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i148) #6, !dbg !21 + %474 = tail call float @llvm.nvvm.saturate.f(float %.02.i148) #6, !dbg !21 + %.03.i150 = select i1 %.not1.i149, float %474, float %473, !dbg !21 + %475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i151 = icmp eq i32 %475, 0, !dbg !21 + %476 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %477 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i152 = select i1 %.not2.i151, float %477, float %476, !dbg !21 + %478 = fadd float %.04.i152, 0xC168000FE0000000, !dbg !21 + %479 = fneg float %478, !dbg !21 + %480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i153 = icmp eq i32 %480, 0, !dbg !21 + %481 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3FF7154760000000, float %479) #6, !dbg !21 + %482 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3FF7154760000000, float %479) #6, !dbg !21 + %.0.i154 = select i1 %.not3.i153, float %482, float %481, !dbg !21 + %483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i155 = icmp eq i32 %483, 0, !dbg !21 + %484 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3E54AE0C00000000, float %.0.i154) #6, !dbg !21 + %485 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3E54AE0C00000000, float %.0.i154) #6, !dbg !21 + %.01.i156 = select i1 %.not4.i155, float %485, float %484, !dbg !21 + %486 = bitcast float %.04.i152 to i32, !dbg !21 + %487 = shl i32 %486, 23, !dbg !21 + %488 = bitcast i32 %487 to float, !dbg !21 + %489 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i156) #6, !dbg !21 + %490 = fmul float %489, %488, !dbg !21 + %491 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i157 = icmp eq i32 %491, 0, !dbg !21 + %492 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %493 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i158 = select i1 %.not.i157, float %493, float %492, !dbg !21 + %494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i159 = icmp eq i32 %494, 0, !dbg !21 + %495 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i158) #6, !dbg !21 + %496 = tail call float @llvm.nvvm.saturate.f(float %.02.i158) #6, !dbg !21 + %.03.i160 = select i1 %.not1.i159, float %496, float %495, !dbg !21 + %497 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i161 = icmp eq i32 %497, 0, !dbg !21 + %498 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %499 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i162 = select i1 %.not2.i161, float %499, float %498, !dbg !21 + %500 = fadd float %.04.i162, 0xC168000FE0000000, !dbg !21 + %501 = fneg float %500, !dbg !21 + %502 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i163 = icmp eq i32 %502, 0, !dbg !21 + %503 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3FF7154760000000, float %501) #6, !dbg !21 + %504 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3FF7154760000000, float %501) #6, !dbg !21 + %.0.i164 = select i1 %.not3.i163, float %504, float %503, !dbg !21 + %505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i165 = icmp eq i32 %505, 0, !dbg !21 + %506 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3E54AE0C00000000, float %.0.i164) #6, !dbg !21 + %507 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3E54AE0C00000000, float %.0.i164) #6, !dbg !21 + %.01.i166 = select i1 %.not4.i165, float %507, float %506, !dbg !21 + %508 = bitcast float %.04.i162 to i32, !dbg !21 + %509 = shl i32 %508, 23, !dbg !21 + %510 = bitcast i32 %509 to float, !dbg !21 + %511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i166) #6, !dbg !21 + %512 = fmul float %511, %510, !dbg !21 + %513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i167 = icmp eq i32 %513, 0, !dbg !21 + %514 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %515 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i168 = select i1 %.not.i167, float %515, float %514, !dbg !21 + %516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i169 = icmp eq i32 %516, 0, !dbg !21 + %517 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i168) #6, !dbg !21 + %518 = tail call float @llvm.nvvm.saturate.f(float %.02.i168) #6, !dbg !21 + %.03.i170 = select i1 %.not1.i169, float %518, float %517, !dbg !21 + %519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i171 = icmp eq i32 %519, 0, !dbg !21 + %520 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %521 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i172 = select i1 %.not2.i171, float %521, float %520, !dbg !21 + %522 = fadd float %.04.i172, 0xC168000FE0000000, !dbg !21 + %523 = fneg float %522, !dbg !21 + %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i173 = icmp eq i32 %524, 0, !dbg !21 + %525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3FF7154760000000, float %523) #6, !dbg !21 + %526 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3FF7154760000000, float %523) #6, !dbg !21 + %.0.i174 = select i1 %.not3.i173, float %526, float %525, !dbg !21 + %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i175 = icmp eq i32 %527, 0, !dbg !21 + %528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3E54AE0C00000000, float %.0.i174) #6, !dbg !21 + %529 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3E54AE0C00000000, float %.0.i174) #6, !dbg !21 + %.01.i176 = select i1 %.not4.i175, float %529, float %528, !dbg !21 + %530 = bitcast float %.04.i172 to i32, !dbg !21 + %531 = shl i32 %530, 23, !dbg !21 + %532 = bitcast i32 %531 to float, !dbg !21 + %533 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i176) #6, !dbg !21 + %534 = fmul float %533, %532, !dbg !21 + %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i177 = icmp eq i32 %535, 0, !dbg !21 + %536 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %537 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i178 = select i1 %.not.i177, float %537, float %536, !dbg !21 + %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i179 = icmp eq i32 %538, 0, !dbg !21 + %539 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i178) #6, !dbg !21 + %540 = tail call float @llvm.nvvm.saturate.f(float %.02.i178) #6, !dbg !21 + %.03.i180 = select i1 %.not1.i179, float %540, float %539, !dbg !21 + %541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i181 = icmp eq i32 %541, 0, !dbg !21 + %542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i182 = select i1 %.not2.i181, float %543, float %542, !dbg !21 + %544 = fadd float %.04.i182, 0xC168000FE0000000, !dbg !21 + %545 = fneg float %544, !dbg !21 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i183 = icmp eq i32 %546, 0, !dbg !21 + %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3FF7154760000000, float %545) #6, !dbg !21 + %548 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3FF7154760000000, float %545) #6, !dbg !21 + %.0.i184 = select i1 %.not3.i183, float %548, float %547, !dbg !21 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i185 = icmp eq i32 %549, 0, !dbg !21 + %550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3E54AE0C00000000, float %.0.i184) #6, !dbg !21 + %551 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3E54AE0C00000000, float %.0.i184) #6, !dbg !21 + %.01.i186 = select i1 %.not4.i185, float %551, float %550, !dbg !21 + %552 = bitcast float %.04.i182 to i32, !dbg !21 + %553 = shl i32 %552, 23, !dbg !21 + %554 = bitcast i32 %553 to float, !dbg !21 + %555 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i186) #6, !dbg !21 + %556 = fmul float %555, %554, !dbg !21 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i187 = icmp eq i32 %557, 0, !dbg !21 + %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %559 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i188 = select i1 %.not.i187, float %559, float %558, !dbg !21 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i189 = icmp eq i32 %560, 0, !dbg !21 + %561 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i188) #6, !dbg !21 + %562 = tail call float @llvm.nvvm.saturate.f(float %.02.i188) #6, !dbg !21 + %.03.i190 = select i1 %.not1.i189, float %562, float %561, !dbg !21 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i191 = icmp eq i32 %563, 0, !dbg !21 + %564 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %565 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i192 = select i1 %.not2.i191, float %565, float %564, !dbg !21 + %566 = fadd float %.04.i192, 0xC168000FE0000000, !dbg !21 + %567 = fneg float %566, !dbg !21 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i193 = icmp eq i32 %568, 0, !dbg !21 + %569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3FF7154760000000, float %567) #6, !dbg !21 + %570 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3FF7154760000000, float %567) #6, !dbg !21 + %.0.i194 = select i1 %.not3.i193, float %570, float %569, !dbg !21 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i195 = icmp eq i32 %571, 0, !dbg !21 + %572 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3E54AE0C00000000, float %.0.i194) #6, !dbg !21 + %573 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3E54AE0C00000000, float %.0.i194) #6, !dbg !21 + %.01.i196 = select i1 %.not4.i195, float %573, float %572, !dbg !21 + %574 = bitcast float %.04.i192 to i32, !dbg !21 + %575 = shl i32 %574, 23, !dbg !21 + %576 = bitcast i32 %575 to float, !dbg !21 + %577 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i196) #6, !dbg !21 + %578 = fmul float %577, %576, !dbg !21 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i197 = icmp eq i32 %579, 0, !dbg !21 + %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %581 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i198 = select i1 %.not.i197, float %581, float %580, !dbg !21 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i199 = icmp eq i32 %582, 0, !dbg !21 + %583 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i198) #6, !dbg !21 + %584 = tail call float @llvm.nvvm.saturate.f(float %.02.i198) #6, !dbg !21 + %.03.i200 = select i1 %.not1.i199, float %584, float %583, !dbg !21 + %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i201 = icmp eq i32 %585, 0, !dbg !21 + %586 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %587 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i202 = select i1 %.not2.i201, float %587, float %586, !dbg !21 + %588 = fadd float %.04.i202, 0xC168000FE0000000, !dbg !21 + %589 = fneg float %588, !dbg !21 + %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i203 = icmp eq i32 %590, 0, !dbg !21 + %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3FF7154760000000, float %589) #6, !dbg !21 + %592 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3FF7154760000000, float %589) #6, !dbg !21 + %.0.i204 = select i1 %.not3.i203, float %592, float %591, !dbg !21 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i205 = icmp eq i32 %593, 0, !dbg !21 + %594 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3E54AE0C00000000, float %.0.i204) #6, !dbg !21 + %595 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3E54AE0C00000000, float %.0.i204) #6, !dbg !21 + %.01.i206 = select i1 %.not4.i205, float %595, float %594, !dbg !21 + %596 = bitcast float %.04.i202 to i32, !dbg !21 + %597 = shl i32 %596, 23, !dbg !21 + %598 = bitcast i32 %597 to float, !dbg !21 + %599 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i206) #6, !dbg !21 + %600 = fmul float %599, %598, !dbg !21 + %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i207 = icmp eq i32 %601, 0, !dbg !21 + %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %603 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i208 = select i1 %.not.i207, float %603, float %602, !dbg !21 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i209 = icmp eq i32 %604, 0, !dbg !21 + %605 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i208) #6, !dbg !21 + %606 = tail call float @llvm.nvvm.saturate.f(float %.02.i208) #6, !dbg !21 + %.03.i210 = select i1 %.not1.i209, float %606, float %605, !dbg !21 + %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i211 = icmp eq i32 %607, 0, !dbg !21 + %608 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %609 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i212 = select i1 %.not2.i211, float %609, float %608, !dbg !21 + %610 = fadd float %.04.i212, 0xC168000FE0000000, !dbg !21 + %611 = fneg float %610, !dbg !21 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i213 = icmp eq i32 %612, 0, !dbg !21 + %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3FF7154760000000, float %611) #6, !dbg !21 + %614 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3FF7154760000000, float %611) #6, !dbg !21 + %.0.i214 = select i1 %.not3.i213, float %614, float %613, !dbg !21 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i215 = icmp eq i32 %615, 0, !dbg !21 + %616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3E54AE0C00000000, float %.0.i214) #6, !dbg !21 + %617 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3E54AE0C00000000, float %.0.i214) #6, !dbg !21 + %.01.i216 = select i1 %.not4.i215, float %617, float %616, !dbg !21 + %618 = bitcast float %.04.i212 to i32, !dbg !21 + %619 = shl i32 %618, 23, !dbg !21 + %620 = bitcast i32 %619 to float, !dbg !21 + %621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i216) #6, !dbg !21 + %622 = fmul float %621, %620, !dbg !21 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i217 = icmp eq i32 %623, 0, !dbg !21 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i218 = select i1 %.not.i217, float %625, float %624, !dbg !21 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i219 = icmp eq i32 %626, 0, !dbg !21 + %627 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i218) #6, !dbg !21 + %628 = tail call float @llvm.nvvm.saturate.f(float %.02.i218) #6, !dbg !21 + %.03.i220 = select i1 %.not1.i219, float %628, float %627, !dbg !21 + %629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i221 = icmp eq i32 %629, 0, !dbg !21 + %630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i222 = select i1 %.not2.i221, float %631, float %630, !dbg !21 + %632 = fadd float %.04.i222, 0xC168000FE0000000, !dbg !21 + %633 = fneg float %632, !dbg !21 + %634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i223 = icmp eq i32 %634, 0, !dbg !21 + %635 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3FF7154760000000, float %633) #6, !dbg !21 + %636 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3FF7154760000000, float %633) #6, !dbg !21 + %.0.i224 = select i1 %.not3.i223, float %636, float %635, !dbg !21 + %637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i225 = icmp eq i32 %637, 0, !dbg !21 + %638 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3E54AE0C00000000, float %.0.i224) #6, !dbg !21 + %639 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3E54AE0C00000000, float %.0.i224) #6, !dbg !21 + %.01.i226 = select i1 %.not4.i225, float %639, float %638, !dbg !21 + %640 = bitcast float %.04.i222 to i32, !dbg !21 + %641 = shl i32 %640, 23, !dbg !21 + %642 = bitcast i32 %641 to float, !dbg !21 + %643 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i226) #6, !dbg !21 + %644 = fmul float %643, %642, !dbg !21 + %645 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i227 = icmp eq i32 %645, 0, !dbg !21 + %646 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %647 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i228 = select i1 %.not.i227, float %647, float %646, !dbg !21 + %648 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i229 = icmp eq i32 %648, 0, !dbg !21 + %649 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i228) #6, !dbg !21 + %650 = tail call float @llvm.nvvm.saturate.f(float %.02.i228) #6, !dbg !21 + %.03.i230 = select i1 %.not1.i229, float %650, float %649, !dbg !21 + %651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i231 = icmp eq i32 %651, 0, !dbg !21 + %652 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %653 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i232 = select i1 %.not2.i231, float %653, float %652, !dbg !21 + %654 = fadd float %.04.i232, 0xC168000FE0000000, !dbg !21 + %655 = fneg float %654, !dbg !21 + %656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i233 = icmp eq i32 %656, 0, !dbg !21 + %657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3FF7154760000000, float %655) #6, !dbg !21 + %658 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3FF7154760000000, float %655) #6, !dbg !21 + %.0.i234 = select i1 %.not3.i233, float %658, float %657, !dbg !21 + %659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i235 = icmp eq i32 %659, 0, !dbg !21 + %660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3E54AE0C00000000, float %.0.i234) #6, !dbg !21 + %661 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3E54AE0C00000000, float %.0.i234) #6, !dbg !21 + %.01.i236 = select i1 %.not4.i235, float %661, float %660, !dbg !21 + %662 = bitcast float %.04.i232 to i32, !dbg !21 + %663 = shl i32 %662, 23, !dbg !21 + %664 = bitcast i32 %663 to float, !dbg !21 + %665 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i236) #6, !dbg !21 + %666 = fmul float %665, %664, !dbg !21 + %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i237 = icmp eq i32 %667, 0, !dbg !21 + %668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %669 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i238 = select i1 %.not.i237, float %669, float %668, !dbg !21 + %670 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i239 = icmp eq i32 %670, 0, !dbg !21 + %671 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i238) #6, !dbg !21 + %672 = tail call float @llvm.nvvm.saturate.f(float %.02.i238) #6, !dbg !21 + %.03.i240 = select i1 %.not1.i239, float %672, float %671, !dbg !21 + %673 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i241 = icmp eq i32 %673, 0, !dbg !21 + %674 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %675 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i242 = select i1 %.not2.i241, float %675, float %674, !dbg !21 + %676 = fadd float %.04.i242, 0xC168000FE0000000, !dbg !21 + %677 = fneg float %676, !dbg !21 + %678 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i243 = icmp eq i32 %678, 0, !dbg !21 + %679 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3FF7154760000000, float %677) #6, !dbg !21 + %680 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3FF7154760000000, float %677) #6, !dbg !21 + %.0.i244 = select i1 %.not3.i243, float %680, float %679, !dbg !21 + %681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i245 = icmp eq i32 %681, 0, !dbg !21 + %682 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3E54AE0C00000000, float %.0.i244) #6, !dbg !21 + %683 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3E54AE0C00000000, float %.0.i244) #6, !dbg !21 + %.01.i246 = select i1 %.not4.i245, float %683, float %682, !dbg !21 + %684 = bitcast float %.04.i242 to i32, !dbg !21 + %685 = shl i32 %684, 23, !dbg !21 + %686 = bitcast i32 %685 to float, !dbg !21 + %687 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i246) #6, !dbg !21 + %688 = fmul float %687, %686, !dbg !21 + %689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i247 = icmp eq i32 %689, 0, !dbg !21 + %690 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %691 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i248 = select i1 %.not.i247, float %691, float %690, !dbg !21 + %692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i249 = icmp eq i32 %692, 0, !dbg !21 + %693 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i248) #6, !dbg !21 + %694 = tail call float @llvm.nvvm.saturate.f(float %.02.i248) #6, !dbg !21 + %.03.i250 = select i1 %.not1.i249, float %694, float %693, !dbg !21 + %695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i251 = icmp eq i32 %695, 0, !dbg !21 + %696 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %697 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i252 = select i1 %.not2.i251, float %697, float %696, !dbg !21 + %698 = fadd float %.04.i252, 0xC168000FE0000000, !dbg !21 + %699 = fneg float %698, !dbg !21 + %700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i253 = icmp eq i32 %700, 0, !dbg !21 + %701 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3FF7154760000000, float %699) #6, !dbg !21 + %702 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3FF7154760000000, float %699) #6, !dbg !21 + %.0.i254 = select i1 %.not3.i253, float %702, float %701, !dbg !21 + %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i255 = icmp eq i32 %703, 0, !dbg !21 + %704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3E54AE0C00000000, float %.0.i254) #6, !dbg !21 + %705 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3E54AE0C00000000, float %.0.i254) #6, !dbg !21 + %.01.i256 = select i1 %.not4.i255, float %705, float %704, !dbg !21 + %706 = bitcast float %.04.i252 to i32, !dbg !21 + %707 = shl i32 %706, 23, !dbg !21 + %708 = bitcast i32 %707 to float, !dbg !21 + %709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i256) #6, !dbg !21 + %710 = fmul float %709, %708, !dbg !21 + %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i257 = icmp eq i32 %711, 0, !dbg !21 + %712 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %713 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i258 = select i1 %.not.i257, float %713, float %712, !dbg !21 + %714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i259 = icmp eq i32 %714, 0, !dbg !21 + %715 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i258) #6, !dbg !21 + %716 = tail call float @llvm.nvvm.saturate.f(float %.02.i258) #6, !dbg !21 + %.03.i260 = select i1 %.not1.i259, float %716, float %715, !dbg !21 + %717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i261 = icmp eq i32 %717, 0, !dbg !21 + %718 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %719 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i262 = select i1 %.not2.i261, float %719, float %718, !dbg !21 + %720 = fadd float %.04.i262, 0xC168000FE0000000, !dbg !21 + %721 = fneg float %720, !dbg !21 + %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i263 = icmp eq i32 %722, 0, !dbg !21 + %723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3FF7154760000000, float %721) #6, !dbg !21 + %724 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3FF7154760000000, float %721) #6, !dbg !21 + %.0.i264 = select i1 %.not3.i263, float %724, float %723, !dbg !21 + %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i265 = icmp eq i32 %725, 0, !dbg !21 + %726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3E54AE0C00000000, float %.0.i264) #6, !dbg !21 + %727 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3E54AE0C00000000, float %.0.i264) #6, !dbg !21 + %.01.i266 = select i1 %.not4.i265, float %727, float %726, !dbg !21 + %728 = bitcast float %.04.i262 to i32, !dbg !21 + %729 = shl i32 %728, 23, !dbg !21 + %730 = bitcast i32 %729 to float, !dbg !21 + %731 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i266) #6, !dbg !21 + %732 = fmul float %731, %730, !dbg !21 + %733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i267 = icmp eq i32 %733, 0, !dbg !21 + %734 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %735 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i268 = select i1 %.not.i267, float %735, float %734, !dbg !21 + %736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i269 = icmp eq i32 %736, 0, !dbg !21 + %737 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i268) #6, !dbg !21 + %738 = tail call float @llvm.nvvm.saturate.f(float %.02.i268) #6, !dbg !21 + %.03.i270 = select i1 %.not1.i269, float %738, float %737, !dbg !21 + %739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i271 = icmp eq i32 %739, 0, !dbg !21 + %740 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %741 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i272 = select i1 %.not2.i271, float %741, float %740, !dbg !21 + %742 = fadd float %.04.i272, 0xC168000FE0000000, !dbg !21 + %743 = fneg float %742, !dbg !21 + %744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i273 = icmp eq i32 %744, 0, !dbg !21 + %745 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3FF7154760000000, float %743) #6, !dbg !21 + %746 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3FF7154760000000, float %743) #6, !dbg !21 + %.0.i274 = select i1 %.not3.i273, float %746, float %745, !dbg !21 + %747 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i275 = icmp eq i32 %747, 0, !dbg !21 + %748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3E54AE0C00000000, float %.0.i274) #6, !dbg !21 + %749 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3E54AE0C00000000, float %.0.i274) #6, !dbg !21 + %.01.i276 = select i1 %.not4.i275, float %749, float %748, !dbg !21 + %750 = bitcast float %.04.i272 to i32, !dbg !21 + %751 = shl i32 %750, 23, !dbg !21 + %752 = bitcast i32 %751 to float, !dbg !21 + %753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i276) #6, !dbg !21 + %754 = fmul float %753, %752, !dbg !21 + %755 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i277 = icmp eq i32 %755, 0, !dbg !21 + %756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %757 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i278 = select i1 %.not.i277, float %757, float %756, !dbg !21 + %758 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i279 = icmp eq i32 %758, 0, !dbg !21 + %759 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i278) #6, !dbg !21 + %760 = tail call float @llvm.nvvm.saturate.f(float %.02.i278) #6, !dbg !21 + %.03.i280 = select i1 %.not1.i279, float %760, float %759, !dbg !21 + %761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i281 = icmp eq i32 %761, 0, !dbg !21 + %762 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %763 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i282 = select i1 %.not2.i281, float %763, float %762, !dbg !21 + %764 = fadd float %.04.i282, 0xC168000FE0000000, !dbg !21 + %765 = fneg float %764, !dbg !21 + %766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i283 = icmp eq i32 %766, 0, !dbg !21 + %767 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3FF7154760000000, float %765) #6, !dbg !21 + %768 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3FF7154760000000, float %765) #6, !dbg !21 + %.0.i284 = select i1 %.not3.i283, float %768, float %767, !dbg !21 + %769 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i285 = icmp eq i32 %769, 0, !dbg !21 + %770 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3E54AE0C00000000, float %.0.i284) #6, !dbg !21 + %771 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3E54AE0C00000000, float %.0.i284) #6, !dbg !21 + %.01.i286 = select i1 %.not4.i285, float %771, float %770, !dbg !21 + %772 = bitcast float %.04.i282 to i32, !dbg !21 + %773 = shl i32 %772, 23, !dbg !21 + %774 = bitcast i32 %773 to float, !dbg !21 + %775 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i286) #6, !dbg !21 + %776 = fmul float %775, %774, !dbg !21 + %777 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i287 = icmp eq i32 %777, 0, !dbg !21 + %778 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %779 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i288 = select i1 %.not.i287, float %779, float %778, !dbg !21 + %780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i289 = icmp eq i32 %780, 0, !dbg !21 + %781 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i288) #6, !dbg !21 + %782 = tail call float @llvm.nvvm.saturate.f(float %.02.i288) #6, !dbg !21 + %.03.i290 = select i1 %.not1.i289, float %782, float %781, !dbg !21 + %783 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i291 = icmp eq i32 %783, 0, !dbg !21 + %784 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %785 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i292 = select i1 %.not2.i291, float %785, float %784, !dbg !21 + %786 = fadd float %.04.i292, 0xC168000FE0000000, !dbg !21 + %787 = fneg float %786, !dbg !21 + %788 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i293 = icmp eq i32 %788, 0, !dbg !21 + %789 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3FF7154760000000, float %787) #6, !dbg !21 + %790 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3FF7154760000000, float %787) #6, !dbg !21 + %.0.i294 = select i1 %.not3.i293, float %790, float %789, !dbg !21 + %791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i295 = icmp eq i32 %791, 0, !dbg !21 + %792 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3E54AE0C00000000, float %.0.i294) #6, !dbg !21 + %793 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3E54AE0C00000000, float %.0.i294) #6, !dbg !21 + %.01.i296 = select i1 %.not4.i295, float %793, float %792, !dbg !21 + %794 = bitcast float %.04.i292 to i32, !dbg !21 + %795 = shl i32 %794, 23, !dbg !21 + %796 = bitcast i32 %795 to float, !dbg !21 + %797 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i296) #6, !dbg !21 + %798 = fmul float %797, %796, !dbg !21 + %799 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i297 = icmp eq i32 %799, 0, !dbg !21 + %800 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %801 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i298 = select i1 %.not.i297, float %801, float %800, !dbg !21 + %802 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i299 = icmp eq i32 %802, 0, !dbg !21 + %803 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i298) #6, !dbg !21 + %804 = tail call float @llvm.nvvm.saturate.f(float %.02.i298) #6, !dbg !21 + %.03.i300 = select i1 %.not1.i299, float %804, float %803, !dbg !21 + %805 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i301 = icmp eq i32 %805, 0, !dbg !21 + %806 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %807 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i302 = select i1 %.not2.i301, float %807, float %806, !dbg !21 + %808 = fadd float %.04.i302, 0xC168000FE0000000, !dbg !21 + %809 = fneg float %808, !dbg !21 + %810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i303 = icmp eq i32 %810, 0, !dbg !21 + %811 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3FF7154760000000, float %809) #6, !dbg !21 + %812 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3FF7154760000000, float %809) #6, !dbg !21 + %.0.i304 = select i1 %.not3.i303, float %812, float %811, !dbg !21 + %813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i305 = icmp eq i32 %813, 0, !dbg !21 + %814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3E54AE0C00000000, float %.0.i304) #6, !dbg !21 + %815 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3E54AE0C00000000, float %.0.i304) #6, !dbg !21 + %.01.i306 = select i1 %.not4.i305, float %815, float %814, !dbg !21 + %816 = bitcast float %.04.i302 to i32, !dbg !21 + %817 = shl i32 %816, 23, !dbg !21 + %818 = bitcast i32 %817 to float, !dbg !21 + %819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i306) #6, !dbg !21 + %820 = fmul float %819, %818, !dbg !21 + %821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i307 = icmp eq i32 %821, 0, !dbg !21 + %822 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %823 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i308 = select i1 %.not.i307, float %823, float %822, !dbg !21 + %824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i309 = icmp eq i32 %824, 0, !dbg !21 + %825 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i308) #6, !dbg !21 + %826 = tail call float @llvm.nvvm.saturate.f(float %.02.i308) #6, !dbg !21 + %.03.i310 = select i1 %.not1.i309, float %826, float %825, !dbg !21 + %827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i311 = icmp eq i32 %827, 0, !dbg !21 + %828 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %829 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i312 = select i1 %.not2.i311, float %829, float %828, !dbg !21 + %830 = fadd float %.04.i312, 0xC168000FE0000000, !dbg !21 + %831 = fneg float %830, !dbg !21 + %832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i313 = icmp eq i32 %832, 0, !dbg !21 + %833 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3FF7154760000000, float %831) #6, !dbg !21 + %834 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3FF7154760000000, float %831) #6, !dbg !21 + %.0.i314 = select i1 %.not3.i313, float %834, float %833, !dbg !21 + %835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i315 = icmp eq i32 %835, 0, !dbg !21 + %836 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3E54AE0C00000000, float %.0.i314) #6, !dbg !21 + %837 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3E54AE0C00000000, float %.0.i314) #6, !dbg !21 + %.01.i316 = select i1 %.not4.i315, float %837, float %836, !dbg !21 + %838 = bitcast float %.04.i312 to i32, !dbg !21 + %839 = shl i32 %838, 23, !dbg !21 + %840 = bitcast i32 %839 to float, !dbg !21 + %841 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i316) #6, !dbg !21 + %842 = fmul float %841, %840, !dbg !21 + %843 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i317 = icmp eq i32 %843, 0, !dbg !21 + %844 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %845 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i318 = select i1 %.not.i317, float %845, float %844, !dbg !21 + %846 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i319 = icmp eq i32 %846, 0, !dbg !21 + %847 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i318) #6, !dbg !21 + %848 = tail call float @llvm.nvvm.saturate.f(float %.02.i318) #6, !dbg !21 + %.03.i320 = select i1 %.not1.i319, float %848, float %847, !dbg !21 + %849 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i321 = icmp eq i32 %849, 0, !dbg !21 + %850 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %851 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i322 = select i1 %.not2.i321, float %851, float %850, !dbg !21 + %852 = fadd float %.04.i322, 0xC168000FE0000000, !dbg !21 + %853 = fneg float %852, !dbg !21 + %854 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i323 = icmp eq i32 %854, 0, !dbg !21 + %855 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3FF7154760000000, float %853) #6, !dbg !21 + %856 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3FF7154760000000, float %853) #6, !dbg !21 + %.0.i324 = select i1 %.not3.i323, float %856, float %855, !dbg !21 + %857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i325 = icmp eq i32 %857, 0, !dbg !21 + %858 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3E54AE0C00000000, float %.0.i324) #6, !dbg !21 + %859 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3E54AE0C00000000, float %.0.i324) #6, !dbg !21 + %.01.i326 = select i1 %.not4.i325, float %859, float %858, !dbg !21 + %860 = bitcast float %.04.i322 to i32, !dbg !21 + %861 = shl i32 %860, 23, !dbg !21 + %862 = bitcast i32 %861 to float, !dbg !21 + %863 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i326) #6, !dbg !21 + %864 = fmul float %863, %862, !dbg !21 + %865 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i327 = icmp eq i32 %865, 0, !dbg !21 + %866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %867 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i328 = select i1 %.not.i327, float %867, float %866, !dbg !21 + %868 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i329 = icmp eq i32 %868, 0, !dbg !21 + %869 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i328) #6, !dbg !21 + %870 = tail call float @llvm.nvvm.saturate.f(float %.02.i328) #6, !dbg !21 + %.03.i330 = select i1 %.not1.i329, float %870, float %869, !dbg !21 + %871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i331 = icmp eq i32 %871, 0, !dbg !21 + %872 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %873 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i332 = select i1 %.not2.i331, float %873, float %872, !dbg !21 + %874 = fadd float %.04.i332, 0xC168000FE0000000, !dbg !21 + %875 = fneg float %874, !dbg !21 + %876 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i333 = icmp eq i32 %876, 0, !dbg !21 + %877 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3FF7154760000000, float %875) #6, !dbg !21 + %878 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3FF7154760000000, float %875) #6, !dbg !21 + %.0.i334 = select i1 %.not3.i333, float %878, float %877, !dbg !21 + %879 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i335 = icmp eq i32 %879, 0, !dbg !21 + %880 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3E54AE0C00000000, float %.0.i334) #6, !dbg !21 + %881 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3E54AE0C00000000, float %.0.i334) #6, !dbg !21 + %.01.i336 = select i1 %.not4.i335, float %881, float %880, !dbg !21 + %882 = bitcast float %.04.i332 to i32, !dbg !21 + %883 = shl i32 %882, 23, !dbg !21 + %884 = bitcast i32 %883 to float, !dbg !21 + %885 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i336) #6, !dbg !21 + %886 = fmul float %885, %884, !dbg !21 + %887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i337 = icmp eq i32 %887, 0, !dbg !21 + %888 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %889 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i338 = select i1 %.not.i337, float %889, float %888, !dbg !21 + %890 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i339 = icmp eq i32 %890, 0, !dbg !21 + %891 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i338) #6, !dbg !21 + %892 = tail call float @llvm.nvvm.saturate.f(float %.02.i338) #6, !dbg !21 + %.03.i340 = select i1 %.not1.i339, float %892, float %891, !dbg !21 + %893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i341 = icmp eq i32 %893, 0, !dbg !21 + %894 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %895 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i342 = select i1 %.not2.i341, float %895, float %894, !dbg !21 + %896 = fadd float %.04.i342, 0xC168000FE0000000, !dbg !21 + %897 = fneg float %896, !dbg !21 + %898 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i343 = icmp eq i32 %898, 0, !dbg !21 + %899 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3FF7154760000000, float %897) #6, !dbg !21 + %900 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3FF7154760000000, float %897) #6, !dbg !21 + %.0.i344 = select i1 %.not3.i343, float %900, float %899, !dbg !21 + %901 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i345 = icmp eq i32 %901, 0, !dbg !21 + %902 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3E54AE0C00000000, float %.0.i344) #6, !dbg !21 + %903 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3E54AE0C00000000, float %.0.i344) #6, !dbg !21 + %.01.i346 = select i1 %.not4.i345, float %903, float %902, !dbg !21 + %904 = bitcast float %.04.i342 to i32, !dbg !21 + %905 = shl i32 %904, 23, !dbg !21 + %906 = bitcast i32 %905 to float, !dbg !21 + %907 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i346) #6, !dbg !21 + %908 = fmul float %907, %906, !dbg !21 + %909 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i347 = icmp eq i32 %909, 0, !dbg !21 + %910 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %911 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i348 = select i1 %.not.i347, float %911, float %910, !dbg !21 + %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i349 = icmp eq i32 %912, 0, !dbg !21 + %913 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i348) #6, !dbg !21 + %914 = tail call float @llvm.nvvm.saturate.f(float %.02.i348) #6, !dbg !21 + %.03.i350 = select i1 %.not1.i349, float %914, float %913, !dbg !21 + %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i351 = icmp eq i32 %915, 0, !dbg !21 + %916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i352 = select i1 %.not2.i351, float %917, float %916, !dbg !21 + %918 = fadd float %.04.i352, 0xC168000FE0000000, !dbg !21 + %919 = fneg float %918, !dbg !21 + %920 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i353 = icmp eq i32 %920, 0, !dbg !21 + %921 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3FF7154760000000, float %919) #6, !dbg !21 + %922 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3FF7154760000000, float %919) #6, !dbg !21 + %.0.i354 = select i1 %.not3.i353, float %922, float %921, !dbg !21 + %923 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i355 = icmp eq i32 %923, 0, !dbg !21 + %924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3E54AE0C00000000, float %.0.i354) #6, !dbg !21 + %925 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3E54AE0C00000000, float %.0.i354) #6, !dbg !21 + %.01.i356 = select i1 %.not4.i355, float %925, float %924, !dbg !21 + %926 = bitcast float %.04.i352 to i32, !dbg !21 + %927 = shl i32 %926, 23, !dbg !21 + %928 = bitcast i32 %927 to float, !dbg !21 + %929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i356) #6, !dbg !21 + %930 = fmul float %929, %928, !dbg !21 + %931 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i357 = icmp eq i32 %931, 0, !dbg !21 + %932 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %933 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i358 = select i1 %.not.i357, float %933, float %932, !dbg !21 + %934 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i359 = icmp eq i32 %934, 0, !dbg !21 + %935 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i358) #6, !dbg !21 + %936 = tail call float @llvm.nvvm.saturate.f(float %.02.i358) #6, !dbg !21 + %.03.i360 = select i1 %.not1.i359, float %936, float %935, !dbg !21 + %937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i361 = icmp eq i32 %937, 0, !dbg !21 + %938 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %939 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i362 = select i1 %.not2.i361, float %939, float %938, !dbg !21 + %940 = fadd float %.04.i362, 0xC168000FE0000000, !dbg !21 + %941 = fneg float %940, !dbg !21 + %942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i363 = icmp eq i32 %942, 0, !dbg !21 + %943 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3FF7154760000000, float %941) #6, !dbg !21 + %944 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3FF7154760000000, float %941) #6, !dbg !21 + %.0.i364 = select i1 %.not3.i363, float %944, float %943, !dbg !21 + %945 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i365 = icmp eq i32 %945, 0, !dbg !21 + %946 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3E54AE0C00000000, float %.0.i364) #6, !dbg !21 + %947 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3E54AE0C00000000, float %.0.i364) #6, !dbg !21 + %.01.i366 = select i1 %.not4.i365, float %947, float %946, !dbg !21 + %948 = bitcast float %.04.i362 to i32, !dbg !21 + %949 = shl i32 %948, 23, !dbg !21 + %950 = bitcast i32 %949 to float, !dbg !21 + %951 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i366) #6, !dbg !21 + %952 = fmul float %951, %950, !dbg !21 + %953 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i367 = icmp eq i32 %953, 0, !dbg !21 + %954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %955 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i368 = select i1 %.not.i367, float %955, float %954, !dbg !21 + %956 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i369 = icmp eq i32 %956, 0, !dbg !21 + %957 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i368) #6, !dbg !21 + %958 = tail call float @llvm.nvvm.saturate.f(float %.02.i368) #6, !dbg !21 + %.03.i370 = select i1 %.not1.i369, float %958, float %957, !dbg !21 + %959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i371 = icmp eq i32 %959, 0, !dbg !21 + %960 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %961 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i372 = select i1 %.not2.i371, float %961, float %960, !dbg !21 + %962 = fadd float %.04.i372, 0xC168000FE0000000, !dbg !21 + %963 = fneg float %962, !dbg !21 + %964 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i373 = icmp eq i32 %964, 0, !dbg !21 + %965 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3FF7154760000000, float %963) #6, !dbg !21 + %966 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3FF7154760000000, float %963) #6, !dbg !21 + %.0.i374 = select i1 %.not3.i373, float %966, float %965, !dbg !21 + %967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i375 = icmp eq i32 %967, 0, !dbg !21 + %968 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3E54AE0C00000000, float %.0.i374) #6, !dbg !21 + %969 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3E54AE0C00000000, float %.0.i374) #6, !dbg !21 + %.01.i376 = select i1 %.not4.i375, float %969, float %968, !dbg !21 + %970 = bitcast float %.04.i372 to i32, !dbg !21 + %971 = shl i32 %970, 23, !dbg !21 + %972 = bitcast i32 %971 to float, !dbg !21 + %973 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i376) #6, !dbg !21 + %974 = fmul float %973, %972, !dbg !21 + %975 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i377 = icmp eq i32 %975, 0, !dbg !21 + %976 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %977 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i378 = select i1 %.not.i377, float %977, float %976, !dbg !21 + %978 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i379 = icmp eq i32 %978, 0, !dbg !21 + %979 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i378) #6, !dbg !21 + %980 = tail call float @llvm.nvvm.saturate.f(float %.02.i378) #6, !dbg !21 + %.03.i380 = select i1 %.not1.i379, float %980, float %979, !dbg !21 + %981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i381 = icmp eq i32 %981, 0, !dbg !21 + %982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i382 = select i1 %.not2.i381, float %983, float %982, !dbg !21 + %984 = fadd float %.04.i382, 0xC168000FE0000000, !dbg !21 + %985 = fneg float %984, !dbg !21 + %986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i383 = icmp eq i32 %986, 0, !dbg !21 + %987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3FF7154760000000, float %985) #6, !dbg !21 + %988 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3FF7154760000000, float %985) #6, !dbg !21 + %.0.i384 = select i1 %.not3.i383, float %988, float %987, !dbg !21 + %989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i385 = icmp eq i32 %989, 0, !dbg !21 + %990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3E54AE0C00000000, float %.0.i384) #6, !dbg !21 + %991 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3E54AE0C00000000, float %.0.i384) #6, !dbg !21 + %.01.i386 = select i1 %.not4.i385, float %991, float %990, !dbg !21 + %992 = bitcast float %.04.i382 to i32, !dbg !21 + %993 = shl i32 %992, 23, !dbg !21 + %994 = bitcast i32 %993 to float, !dbg !21 + %995 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i386) #6, !dbg !21 + %996 = fmul float %995, %994, !dbg !21 + %997 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i387 = icmp eq i32 %997, 0, !dbg !21 + %998 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %999 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i388 = select i1 %.not.i387, float %999, float %998, !dbg !21 + %1000 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i389 = icmp eq i32 %1000, 0, !dbg !21 + %1001 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i388) #6, !dbg !21 + %1002 = tail call float @llvm.nvvm.saturate.f(float %.02.i388) #6, !dbg !21 + %.03.i390 = select i1 %.not1.i389, float %1002, float %1001, !dbg !21 + %1003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i391 = icmp eq i32 %1003, 0, !dbg !21 + %1004 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1005 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i392 = select i1 %.not2.i391, float %1005, float %1004, !dbg !21 + %1006 = fadd float %.04.i392, 0xC168000FE0000000, !dbg !21 + %1007 = fneg float %1006, !dbg !21 + %1008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i393 = icmp eq i32 %1008, 0, !dbg !21 + %1009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3FF7154760000000, float %1007) #6, !dbg !21 + %1010 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3FF7154760000000, float %1007) #6, !dbg !21 + %.0.i394 = select i1 %.not3.i393, float %1010, float %1009, !dbg !21 + %1011 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i395 = icmp eq i32 %1011, 0, !dbg !21 + %1012 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3E54AE0C00000000, float %.0.i394) #6, !dbg !21 + %1013 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3E54AE0C00000000, float %.0.i394) #6, !dbg !21 + %.01.i396 = select i1 %.not4.i395, float %1013, float %1012, !dbg !21 + %1014 = bitcast float %.04.i392 to i32, !dbg !21 + %1015 = shl i32 %1014, 23, !dbg !21 + %1016 = bitcast i32 %1015 to float, !dbg !21 + %1017 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i396) #6, !dbg !21 + %1018 = fmul float %1017, %1016, !dbg !21 + %1019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i397 = icmp eq i32 %1019, 0, !dbg !21 + %1020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1021 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i398 = select i1 %.not.i397, float %1021, float %1020, !dbg !21 + %1022 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i399 = icmp eq i32 %1022, 0, !dbg !21 + %1023 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i398) #6, !dbg !21 + %1024 = tail call float @llvm.nvvm.saturate.f(float %.02.i398) #6, !dbg !21 + %.03.i400 = select i1 %.not1.i399, float %1024, float %1023, !dbg !21 + %1025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i401 = icmp eq i32 %1025, 0, !dbg !21 + %1026 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1027 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i402 = select i1 %.not2.i401, float %1027, float %1026, !dbg !21 + %1028 = fadd float %.04.i402, 0xC168000FE0000000, !dbg !21 + %1029 = fneg float %1028, !dbg !21 + %1030 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i403 = icmp eq i32 %1030, 0, !dbg !21 + %1031 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3FF7154760000000, float %1029) #6, !dbg !21 + %1032 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3FF7154760000000, float %1029) #6, !dbg !21 + %.0.i404 = select i1 %.not3.i403, float %1032, float %1031, !dbg !21 + %1033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i405 = icmp eq i32 %1033, 0, !dbg !21 + %1034 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3E54AE0C00000000, float %.0.i404) #6, !dbg !21 + %1035 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3E54AE0C00000000, float %.0.i404) #6, !dbg !21 + %.01.i406 = select i1 %.not4.i405, float %1035, float %1034, !dbg !21 + %1036 = bitcast float %.04.i402 to i32, !dbg !21 + %1037 = shl i32 %1036, 23, !dbg !21 + %1038 = bitcast i32 %1037 to float, !dbg !21 + %1039 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i406) #6, !dbg !21 + %1040 = fmul float %1039, %1038, !dbg !21 + %1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i407 = icmp eq i32 %1041, 0, !dbg !21 + %1042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1043 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i408 = select i1 %.not.i407, float %1043, float %1042, !dbg !21 + %1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i409 = icmp eq i32 %1044, 0, !dbg !21 + %1045 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i408) #6, !dbg !21 + %1046 = tail call float @llvm.nvvm.saturate.f(float %.02.i408) #6, !dbg !21 + %.03.i410 = select i1 %.not1.i409, float %1046, float %1045, !dbg !21 + %1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i411 = icmp eq i32 %1047, 0, !dbg !21 + %1048 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1049 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i412 = select i1 %.not2.i411, float %1049, float %1048, !dbg !21 + %1050 = fadd float %.04.i412, 0xC168000FE0000000, !dbg !21 + %1051 = fneg float %1050, !dbg !21 + %1052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i413 = icmp eq i32 %1052, 0, !dbg !21 + %1053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3FF7154760000000, float %1051) #6, !dbg !21 + %1054 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3FF7154760000000, float %1051) #6, !dbg !21 + %.0.i414 = select i1 %.not3.i413, float %1054, float %1053, !dbg !21 + %1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i415 = icmp eq i32 %1055, 0, !dbg !21 + %1056 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3E54AE0C00000000, float %.0.i414) #6, !dbg !21 + %1057 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3E54AE0C00000000, float %.0.i414) #6, !dbg !21 + %.01.i416 = select i1 %.not4.i415, float %1057, float %1056, !dbg !21 + %1058 = bitcast float %.04.i412 to i32, !dbg !21 + %1059 = shl i32 %1058, 23, !dbg !21 + %1060 = bitcast i32 %1059 to float, !dbg !21 + %1061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i416) #6, !dbg !21 + %1062 = fmul float %1061, %1060, !dbg !21 + %1063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i417 = icmp eq i32 %1063, 0, !dbg !21 + %1064 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1065 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i418 = select i1 %.not.i417, float %1065, float %1064, !dbg !21 + %1066 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i419 = icmp eq i32 %1066, 0, !dbg !21 + %1067 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i418) #6, !dbg !21 + %1068 = tail call float @llvm.nvvm.saturate.f(float %.02.i418) #6, !dbg !21 + %.03.i420 = select i1 %.not1.i419, float %1068, float %1067, !dbg !21 + %1069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i421 = icmp eq i32 %1069, 0, !dbg !21 + %1070 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1071 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i422 = select i1 %.not2.i421, float %1071, float %1070, !dbg !21 + %1072 = fadd float %.04.i422, 0xC168000FE0000000, !dbg !21 + %1073 = fneg float %1072, !dbg !21 + %1074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i423 = icmp eq i32 %1074, 0, !dbg !21 + %1075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3FF7154760000000, float %1073) #6, !dbg !21 + %1076 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3FF7154760000000, float %1073) #6, !dbg !21 + %.0.i424 = select i1 %.not3.i423, float %1076, float %1075, !dbg !21 + %1077 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i425 = icmp eq i32 %1077, 0, !dbg !21 + %1078 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3E54AE0C00000000, float %.0.i424) #6, !dbg !21 + %1079 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3E54AE0C00000000, float %.0.i424) #6, !dbg !21 + %.01.i426 = select i1 %.not4.i425, float %1079, float %1078, !dbg !21 + %1080 = bitcast float %.04.i422 to i32, !dbg !21 + %1081 = shl i32 %1080, 23, !dbg !21 + %1082 = bitcast i32 %1081 to float, !dbg !21 + %1083 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i426) #6, !dbg !21 + %1084 = fmul float %1083, %1082, !dbg !21 + %1085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i427 = icmp eq i32 %1085, 0, !dbg !21 + %1086 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1087 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i428 = select i1 %.not.i427, float %1087, float %1086, !dbg !21 + %1088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i429 = icmp eq i32 %1088, 0, !dbg !21 + %1089 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i428) #6, !dbg !21 + %1090 = tail call float @llvm.nvvm.saturate.f(float %.02.i428) #6, !dbg !21 + %.03.i430 = select i1 %.not1.i429, float %1090, float %1089, !dbg !21 + %1091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i431 = icmp eq i32 %1091, 0, !dbg !21 + %1092 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1093 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i432 = select i1 %.not2.i431, float %1093, float %1092, !dbg !21 + %1094 = fadd float %.04.i432, 0xC168000FE0000000, !dbg !21 + %1095 = fneg float %1094, !dbg !21 + %1096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i433 = icmp eq i32 %1096, 0, !dbg !21 + %1097 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3FF7154760000000, float %1095) #6, !dbg !21 + %1098 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3FF7154760000000, float %1095) #6, !dbg !21 + %.0.i434 = select i1 %.not3.i433, float %1098, float %1097, !dbg !21 + %1099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i435 = icmp eq i32 %1099, 0, !dbg !21 + %1100 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3E54AE0C00000000, float %.0.i434) #6, !dbg !21 + %1101 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3E54AE0C00000000, float %.0.i434) #6, !dbg !21 + %.01.i436 = select i1 %.not4.i435, float %1101, float %1100, !dbg !21 + %1102 = bitcast float %.04.i432 to i32, !dbg !21 + %1103 = shl i32 %1102, 23, !dbg !21 + %1104 = bitcast i32 %1103 to float, !dbg !21 + %1105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i436) #6, !dbg !21 + %1106 = fmul float %1105, %1104, !dbg !21 + %1107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i437 = icmp eq i32 %1107, 0, !dbg !21 + %1108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1109 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i438 = select i1 %.not.i437, float %1109, float %1108, !dbg !21 + %1110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i439 = icmp eq i32 %1110, 0, !dbg !21 + %1111 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i438) #6, !dbg !21 + %1112 = tail call float @llvm.nvvm.saturate.f(float %.02.i438) #6, !dbg !21 + %.03.i440 = select i1 %.not1.i439, float %1112, float %1111, !dbg !21 + %1113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i441 = icmp eq i32 %1113, 0, !dbg !21 + %1114 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1115 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i442 = select i1 %.not2.i441, float %1115, float %1114, !dbg !21 + %1116 = fadd float %.04.i442, 0xC168000FE0000000, !dbg !21 + %1117 = fneg float %1116, !dbg !21 + %1118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i443 = icmp eq i32 %1118, 0, !dbg !21 + %1119 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3FF7154760000000, float %1117) #6, !dbg !21 + %1120 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3FF7154760000000, float %1117) #6, !dbg !21 + %.0.i444 = select i1 %.not3.i443, float %1120, float %1119, !dbg !21 + %1121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i445 = icmp eq i32 %1121, 0, !dbg !21 + %1122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3E54AE0C00000000, float %.0.i444) #6, !dbg !21 + %1123 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3E54AE0C00000000, float %.0.i444) #6, !dbg !21 + %.01.i446 = select i1 %.not4.i445, float %1123, float %1122, !dbg !21 + %1124 = bitcast float %.04.i442 to i32, !dbg !21 + %1125 = shl i32 %1124, 23, !dbg !21 + %1126 = bitcast i32 %1125 to float, !dbg !21 + %1127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i446) #6, !dbg !21 + %1128 = fmul float %1127, %1126, !dbg !21 + %1129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i447 = icmp eq i32 %1129, 0, !dbg !21 + %1130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1131 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i448 = select i1 %.not.i447, float %1131, float %1130, !dbg !21 + %1132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i449 = icmp eq i32 %1132, 0, !dbg !21 + %1133 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i448) #6, !dbg !21 + %1134 = tail call float @llvm.nvvm.saturate.f(float %.02.i448) #6, !dbg !21 + %.03.i450 = select i1 %.not1.i449, float %1134, float %1133, !dbg !21 + %1135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i451 = icmp eq i32 %1135, 0, !dbg !21 + %1136 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1137 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i452 = select i1 %.not2.i451, float %1137, float %1136, !dbg !21 + %1138 = fadd float %.04.i452, 0xC168000FE0000000, !dbg !21 + %1139 = fneg float %1138, !dbg !21 + %1140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i453 = icmp eq i32 %1140, 0, !dbg !21 + %1141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %1139) #6, !dbg !21 + %1142 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %1139) #6, !dbg !21 + %.0.i454 = select i1 %.not3.i453, float %1142, float %1141, !dbg !21 + %1143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i455 = icmp eq i32 %1143, 0, !dbg !21 + %1144 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i454) #6, !dbg !21 + %1145 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i454) #6, !dbg !21 + %.01.i456 = select i1 %.not4.i455, float %1145, float %1144, !dbg !21 + %1146 = bitcast float %.04.i452 to i32, !dbg !21 + %1147 = shl i32 %1146, 23, !dbg !21 + %1148 = bitcast i32 %1147 to float, !dbg !21 + %1149 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i456) #6, !dbg !21 + %1150 = fmul float %1149, %1148, !dbg !21 + %1151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i457 = icmp eq i32 %1151, 0, !dbg !21 + %1152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1153 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i458 = select i1 %.not.i457, float %1153, float %1152, !dbg !21 + %1154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i459 = icmp eq i32 %1154, 0, !dbg !21 + %1155 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i458) #6, !dbg !21 + %1156 = tail call float @llvm.nvvm.saturate.f(float %.02.i458) #6, !dbg !21 + %.03.i460 = select i1 %.not1.i459, float %1156, float %1155, !dbg !21 + %1157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i461 = icmp eq i32 %1157, 0, !dbg !21 + %1158 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1159 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i462 = select i1 %.not2.i461, float %1159, float %1158, !dbg !21 + %1160 = fadd float %.04.i462, 0xC168000FE0000000, !dbg !21 + %1161 = fneg float %1160, !dbg !21 + %1162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i463 = icmp eq i32 %1162, 0, !dbg !21 + %1163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %1161) #6, !dbg !21 + %1164 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %1161) #6, !dbg !21 + %.0.i464 = select i1 %.not3.i463, float %1164, float %1163, !dbg !21 + %1165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i465 = icmp eq i32 %1165, 0, !dbg !21 + %1166 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i464) #6, !dbg !21 + %1167 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i464) #6, !dbg !21 + %.01.i466 = select i1 %.not4.i465, float %1167, float %1166, !dbg !21 + %1168 = bitcast float %.04.i462 to i32, !dbg !21 + %1169 = shl i32 %1168, 23, !dbg !21 + %1170 = bitcast i32 %1169 to float, !dbg !21 + %1171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i466) #6, !dbg !21 + %1172 = fmul float %1171, %1170, !dbg !21 + %1173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i467 = icmp eq i32 %1173, 0, !dbg !21 + %1174 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1175 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i468 = select i1 %.not.i467, float %1175, float %1174, !dbg !21 + %1176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i469 = icmp eq i32 %1176, 0, !dbg !21 + %1177 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i468) #6, !dbg !21 + %1178 = tail call float @llvm.nvvm.saturate.f(float %.02.i468) #6, !dbg !21 + %.03.i470 = select i1 %.not1.i469, float %1178, float %1177, !dbg !21 + %1179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i471 = icmp eq i32 %1179, 0, !dbg !21 + %1180 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1181 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i472 = select i1 %.not2.i471, float %1181, float %1180, !dbg !21 + %1182 = fadd float %.04.i472, 0xC168000FE0000000, !dbg !21 + %1183 = fneg float %1182, !dbg !21 + %1184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i473 = icmp eq i32 %1184, 0, !dbg !21 + %1185 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3FF7154760000000, float %1183) #6, !dbg !21 + %1186 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3FF7154760000000, float %1183) #6, !dbg !21 + %.0.i474 = select i1 %.not3.i473, float %1186, float %1185, !dbg !21 + %1187 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i475 = icmp eq i32 %1187, 0, !dbg !21 + %1188 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3E54AE0C00000000, float %.0.i474) #6, !dbg !21 + %1189 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3E54AE0C00000000, float %.0.i474) #6, !dbg !21 + %.01.i476 = select i1 %.not4.i475, float %1189, float %1188, !dbg !21 + %1190 = bitcast float %.04.i472 to i32, !dbg !21 + %1191 = shl i32 %1190, 23, !dbg !21 + %1192 = bitcast i32 %1191 to float, !dbg !21 + %1193 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i476) #6, !dbg !21 + %1194 = fmul float %1193, %1192, !dbg !21 + %1195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i477 = icmp eq i32 %1195, 0, !dbg !21 + %1196 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1197 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i478 = select i1 %.not.i477, float %1197, float %1196, !dbg !21 + %1198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i479 = icmp eq i32 %1198, 0, !dbg !21 + %1199 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i478) #6, !dbg !21 + %1200 = tail call float @llvm.nvvm.saturate.f(float %.02.i478) #6, !dbg !21 + %.03.i480 = select i1 %.not1.i479, float %1200, float %1199, !dbg !21 + %1201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i481 = icmp eq i32 %1201, 0, !dbg !21 + %1202 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1203 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i482 = select i1 %.not2.i481, float %1203, float %1202, !dbg !21 + %1204 = fadd float %.04.i482, 0xC168000FE0000000, !dbg !21 + %1205 = fneg float %1204, !dbg !21 + %1206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i483 = icmp eq i32 %1206, 0, !dbg !21 + %1207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3FF7154760000000, float %1205) #6, !dbg !21 + %1208 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3FF7154760000000, float %1205) #6, !dbg !21 + %.0.i484 = select i1 %.not3.i483, float %1208, float %1207, !dbg !21 + %1209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i485 = icmp eq i32 %1209, 0, !dbg !21 + %1210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3E54AE0C00000000, float %.0.i484) #6, !dbg !21 + %1211 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3E54AE0C00000000, float %.0.i484) #6, !dbg !21 + %.01.i486 = select i1 %.not4.i485, float %1211, float %1210, !dbg !21 + %1212 = bitcast float %.04.i482 to i32, !dbg !21 + %1213 = shl i32 %1212, 23, !dbg !21 + %1214 = bitcast i32 %1213 to float, !dbg !21 + %1215 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i486) #6, !dbg !21 + %1216 = fmul float %1215, %1214, !dbg !21 + %1217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i487 = icmp eq i32 %1217, 0, !dbg !21 + %1218 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1219 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i488 = select i1 %.not.i487, float %1219, float %1218, !dbg !21 + %1220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i489 = icmp eq i32 %1220, 0, !dbg !21 + %1221 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i488) #6, !dbg !21 + %1222 = tail call float @llvm.nvvm.saturate.f(float %.02.i488) #6, !dbg !21 + %.03.i490 = select i1 %.not1.i489, float %1222, float %1221, !dbg !21 + %1223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i491 = icmp eq i32 %1223, 0, !dbg !21 + %1224 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1225 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i492 = select i1 %.not2.i491, float %1225, float %1224, !dbg !21 + %1226 = fadd float %.04.i492, 0xC168000FE0000000, !dbg !21 + %1227 = fneg float %1226, !dbg !21 + %1228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i493 = icmp eq i32 %1228, 0, !dbg !21 + %1229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3FF7154760000000, float %1227) #6, !dbg !21 + %1230 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3FF7154760000000, float %1227) #6, !dbg !21 + %.0.i494 = select i1 %.not3.i493, float %1230, float %1229, !dbg !21 + %1231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i495 = icmp eq i32 %1231, 0, !dbg !21 + %1232 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3E54AE0C00000000, float %.0.i494) #6, !dbg !21 + %1233 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3E54AE0C00000000, float %.0.i494) #6, !dbg !21 + %.01.i496 = select i1 %.not4.i495, float %1233, float %1232, !dbg !21 + %1234 = bitcast float %.04.i492 to i32, !dbg !21 + %1235 = shl i32 %1234, 23, !dbg !21 + %1236 = bitcast i32 %1235 to float, !dbg !21 + %1237 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i496) #6, !dbg !21 + %1238 = fmul float %1237, %1236, !dbg !21 + %1239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i497 = icmp eq i32 %1239, 0, !dbg !21 + %1240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1241 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i498 = select i1 %.not.i497, float %1241, float %1240, !dbg !21 + %1242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i499 = icmp eq i32 %1242, 0, !dbg !21 + %1243 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i498) #6, !dbg !21 + %1244 = tail call float @llvm.nvvm.saturate.f(float %.02.i498) #6, !dbg !21 + %.03.i500 = select i1 %.not1.i499, float %1244, float %1243, !dbg !21 + %1245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i501 = icmp eq i32 %1245, 0, !dbg !21 + %1246 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1247 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i502 = select i1 %.not2.i501, float %1247, float %1246, !dbg !21 + %1248 = fadd float %.04.i502, 0xC168000FE0000000, !dbg !21 + %1249 = fneg float %1248, !dbg !21 + %1250 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i503 = icmp eq i32 %1250, 0, !dbg !21 + %1251 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3FF7154760000000, float %1249) #6, !dbg !21 + %1252 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3FF7154760000000, float %1249) #6, !dbg !21 + %.0.i504 = select i1 %.not3.i503, float %1252, float %1251, !dbg !21 + %1253 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i505 = icmp eq i32 %1253, 0, !dbg !21 + %1254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3E54AE0C00000000, float %.0.i504) #6, !dbg !21 + %1255 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3E54AE0C00000000, float %.0.i504) #6, !dbg !21 + %.01.i506 = select i1 %.not4.i505, float %1255, float %1254, !dbg !21 + %1256 = bitcast float %.04.i502 to i32, !dbg !21 + %1257 = shl i32 %1256, 23, !dbg !21 + %1258 = bitcast i32 %1257 to float, !dbg !21 + %1259 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i506) #6, !dbg !21 + %1260 = fmul float %1259, %1258, !dbg !21 + %1261 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i507 = icmp eq i32 %1261, 0, !dbg !21 + %1262 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1263 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i508 = select i1 %.not.i507, float %1263, float %1262, !dbg !21 + %1264 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i509 = icmp eq i32 %1264, 0, !dbg !21 + %1265 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i508) #6, !dbg !21 + %1266 = tail call float @llvm.nvvm.saturate.f(float %.02.i508) #6, !dbg !21 + %.03.i510 = select i1 %.not1.i509, float %1266, float %1265, !dbg !21 + %1267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i511 = icmp eq i32 %1267, 0, !dbg !21 + %1268 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1269 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i512 = select i1 %.not2.i511, float %1269, float %1268, !dbg !21 + %1270 = fadd float %.04.i512, 0xC168000FE0000000, !dbg !21 + %1271 = fneg float %1270, !dbg !21 + %1272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i513 = icmp eq i32 %1272, 0, !dbg !21 + %1273 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3FF7154760000000, float %1271) #6, !dbg !21 + %1274 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3FF7154760000000, float %1271) #6, !dbg !21 + %.0.i514 = select i1 %.not3.i513, float %1274, float %1273, !dbg !21 + %1275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i515 = icmp eq i32 %1275, 0, !dbg !21 + %1276 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3E54AE0C00000000, float %.0.i514) #6, !dbg !21 + %1277 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3E54AE0C00000000, float %.0.i514) #6, !dbg !21 + %.01.i516 = select i1 %.not4.i515, float %1277, float %1276, !dbg !21 + %1278 = bitcast float %.04.i512 to i32, !dbg !21 + %1279 = shl i32 %1278, 23, !dbg !21 + %1280 = bitcast i32 %1279 to float, !dbg !21 + %1281 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i516) #6, !dbg !21 + %1282 = fmul float %1281, %1280, !dbg !21 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i517 = icmp eq i32 %1283, 0, !dbg !21 + %1284 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1285 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i518 = select i1 %.not.i517, float %1285, float %1284, !dbg !21 + %1286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i519 = icmp eq i32 %1286, 0, !dbg !21 + %1287 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i518) #6, !dbg !21 + %1288 = tail call float @llvm.nvvm.saturate.f(float %.02.i518) #6, !dbg !21 + %.03.i520 = select i1 %.not1.i519, float %1288, float %1287, !dbg !21 + %1289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i521 = icmp eq i32 %1289, 0, !dbg !21 + %1290 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1291 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i522 = select i1 %.not2.i521, float %1291, float %1290, !dbg !21 + %1292 = fadd float %.04.i522, 0xC168000FE0000000, !dbg !21 + %1293 = fneg float %1292, !dbg !21 + %1294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i523 = icmp eq i32 %1294, 0, !dbg !21 + %1295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3FF7154760000000, float %1293) #6, !dbg !21 + %1296 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3FF7154760000000, float %1293) #6, !dbg !21 + %.0.i524 = select i1 %.not3.i523, float %1296, float %1295, !dbg !21 + %1297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i525 = icmp eq i32 %1297, 0, !dbg !21 + %1298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3E54AE0C00000000, float %.0.i524) #6, !dbg !21 + %1299 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3E54AE0C00000000, float %.0.i524) #6, !dbg !21 + %.01.i526 = select i1 %.not4.i525, float %1299, float %1298, !dbg !21 + %1300 = bitcast float %.04.i522 to i32, !dbg !21 + %1301 = shl i32 %1300, 23, !dbg !21 + %1302 = bitcast i32 %1301 to float, !dbg !21 + %1303 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i526) #6, !dbg !21 + %1304 = fmul float %1303, %1302, !dbg !21 + %1305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i527 = icmp eq i32 %1305, 0, !dbg !21 + %1306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1307 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i528 = select i1 %.not.i527, float %1307, float %1306, !dbg !21 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i529 = icmp eq i32 %1308, 0, !dbg !21 + %1309 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i528) #6, !dbg !21 + %1310 = tail call float @llvm.nvvm.saturate.f(float %.02.i528) #6, !dbg !21 + %.03.i530 = select i1 %.not1.i529, float %1310, float %1309, !dbg !21 + %1311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i531 = icmp eq i32 %1311, 0, !dbg !21 + %1312 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1313 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i532 = select i1 %.not2.i531, float %1313, float %1312, !dbg !21 + %1314 = fadd float %.04.i532, 0xC168000FE0000000, !dbg !21 + %1315 = fneg float %1314, !dbg !21 + %1316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i533 = icmp eq i32 %1316, 0, !dbg !21 + %1317 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3FF7154760000000, float %1315) #6, !dbg !21 + %1318 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3FF7154760000000, float %1315) #6, !dbg !21 + %.0.i534 = select i1 %.not3.i533, float %1318, float %1317, !dbg !21 + %1319 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i535 = icmp eq i32 %1319, 0, !dbg !21 + %1320 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3E54AE0C00000000, float %.0.i534) #6, !dbg !21 + %1321 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3E54AE0C00000000, float %.0.i534) #6, !dbg !21 + %.01.i536 = select i1 %.not4.i535, float %1321, float %1320, !dbg !21 + %1322 = bitcast float %.04.i532 to i32, !dbg !21 + %1323 = shl i32 %1322, 23, !dbg !21 + %1324 = bitcast i32 %1323 to float, !dbg !21 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i536) #6, !dbg !21 + %1326 = fmul float %1325, %1324, !dbg !21 + %1327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i537 = icmp eq i32 %1327, 0, !dbg !21 + %1328 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1329 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i538 = select i1 %.not.i537, float %1329, float %1328, !dbg !21 + %1330 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i539 = icmp eq i32 %1330, 0, !dbg !21 + %1331 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i538) #6, !dbg !21 + %1332 = tail call float @llvm.nvvm.saturate.f(float %.02.i538) #6, !dbg !21 + %.03.i540 = select i1 %.not1.i539, float %1332, float %1331, !dbg !21 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i541 = icmp eq i32 %1333, 0, !dbg !21 + %1334 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1335 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i542 = select i1 %.not2.i541, float %1335, float %1334, !dbg !21 + %1336 = fadd float %.04.i542, 0xC168000FE0000000, !dbg !21 + %1337 = fneg float %1336, !dbg !21 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i543 = icmp eq i32 %1338, 0, !dbg !21 + %1339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3FF7154760000000, float %1337) #6, !dbg !21 + %1340 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3FF7154760000000, float %1337) #6, !dbg !21 + %.0.i544 = select i1 %.not3.i543, float %1340, float %1339, !dbg !21 + %1341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i545 = icmp eq i32 %1341, 0, !dbg !21 + %1342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3E54AE0C00000000, float %.0.i544) #6, !dbg !21 + %1343 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3E54AE0C00000000, float %.0.i544) #6, !dbg !21 + %.01.i546 = select i1 %.not4.i545, float %1343, float %1342, !dbg !21 + %1344 = bitcast float %.04.i542 to i32, !dbg !21 + %1345 = shl i32 %1344, 23, !dbg !21 + %1346 = bitcast i32 %1345 to float, !dbg !21 + %1347 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i546) #6, !dbg !21 + %1348 = fmul float %1347, %1346, !dbg !21 + %1349 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i547 = icmp eq i32 %1349, 0, !dbg !21 + %1350 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1351 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i548 = select i1 %.not.i547, float %1351, float %1350, !dbg !21 + %1352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i549 = icmp eq i32 %1352, 0, !dbg !21 + %1353 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i548) #6, !dbg !21 + %1354 = tail call float @llvm.nvvm.saturate.f(float %.02.i548) #6, !dbg !21 + %.03.i550 = select i1 %.not1.i549, float %1354, float %1353, !dbg !21 + %1355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i551 = icmp eq i32 %1355, 0, !dbg !21 + %1356 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1357 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i552 = select i1 %.not2.i551, float %1357, float %1356, !dbg !21 + %1358 = fadd float %.04.i552, 0xC168000FE0000000, !dbg !21 + %1359 = fneg float %1358, !dbg !21 + %1360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i553 = icmp eq i32 %1360, 0, !dbg !21 + %1361 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3FF7154760000000, float %1359) #6, !dbg !21 + %1362 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3FF7154760000000, float %1359) #6, !dbg !21 + %.0.i554 = select i1 %.not3.i553, float %1362, float %1361, !dbg !21 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i555 = icmp eq i32 %1363, 0, !dbg !21 + %1364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3E54AE0C00000000, float %.0.i554) #6, !dbg !21 + %1365 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3E54AE0C00000000, float %.0.i554) #6, !dbg !21 + %.01.i556 = select i1 %.not4.i555, float %1365, float %1364, !dbg !21 + %1366 = bitcast float %.04.i552 to i32, !dbg !21 + %1367 = shl i32 %1366, 23, !dbg !21 + %1368 = bitcast i32 %1367 to float, !dbg !21 + %1369 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i556) #6, !dbg !21 + %1370 = fmul float %1369, %1368, !dbg !21 + %1371 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i557 = icmp eq i32 %1371, 0, !dbg !21 + %1372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1373 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i558 = select i1 %.not.i557, float %1373, float %1372, !dbg !21 + %1374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i559 = icmp eq i32 %1374, 0, !dbg !21 + %1375 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i558) #6, !dbg !21 + %1376 = tail call float @llvm.nvvm.saturate.f(float %.02.i558) #6, !dbg !21 + %.03.i560 = select i1 %.not1.i559, float %1376, float %1375, !dbg !21 + %1377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i561 = icmp eq i32 %1377, 0, !dbg !21 + %1378 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1379 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i562 = select i1 %.not2.i561, float %1379, float %1378, !dbg !21 + %1380 = fadd float %.04.i562, 0xC168000FE0000000, !dbg !21 + %1381 = fneg float %1380, !dbg !21 + %1382 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i563 = icmp eq i32 %1382, 0, !dbg !21 + %1383 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3FF7154760000000, float %1381) #6, !dbg !21 + %1384 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3FF7154760000000, float %1381) #6, !dbg !21 + %.0.i564 = select i1 %.not3.i563, float %1384, float %1383, !dbg !21 + %1385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i565 = icmp eq i32 %1385, 0, !dbg !21 + %1386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3E54AE0C00000000, float %.0.i564) #6, !dbg !21 + %1387 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3E54AE0C00000000, float %.0.i564) #6, !dbg !21 + %.01.i566 = select i1 %.not4.i565, float %1387, float %1386, !dbg !21 + %1388 = bitcast float %.04.i562 to i32, !dbg !21 + %1389 = shl i32 %1388, 23, !dbg !21 + %1390 = bitcast i32 %1389 to float, !dbg !21 + %1391 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i566) #6, !dbg !21 + %1392 = fmul float %1391, %1390, !dbg !21 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i567 = icmp eq i32 %1393, 0, !dbg !21 + %1394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1395 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i568 = select i1 %.not.i567, float %1395, float %1394, !dbg !21 + %1396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i569 = icmp eq i32 %1396, 0, !dbg !21 + %1397 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i568) #6, !dbg !21 + %1398 = tail call float @llvm.nvvm.saturate.f(float %.02.i568) #6, !dbg !21 + %.03.i570 = select i1 %.not1.i569, float %1398, float %1397, !dbg !21 + %1399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i571 = icmp eq i32 %1399, 0, !dbg !21 + %1400 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1401 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i572 = select i1 %.not2.i571, float %1401, float %1400, !dbg !21 + %1402 = fadd float %.04.i572, 0xC168000FE0000000, !dbg !21 + %1403 = fneg float %1402, !dbg !21 + %1404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i573 = icmp eq i32 %1404, 0, !dbg !21 + %1405 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3FF7154760000000, float %1403) #6, !dbg !21 + %1406 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3FF7154760000000, float %1403) #6, !dbg !21 + %.0.i574 = select i1 %.not3.i573, float %1406, float %1405, !dbg !21 + %1407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i575 = icmp eq i32 %1407, 0, !dbg !21 + %1408 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3E54AE0C00000000, float %.0.i574) #6, !dbg !21 + %1409 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3E54AE0C00000000, float %.0.i574) #6, !dbg !21 + %.01.i576 = select i1 %.not4.i575, float %1409, float %1408, !dbg !21 + %1410 = bitcast float %.04.i572 to i32, !dbg !21 + %1411 = shl i32 %1410, 23, !dbg !21 + %1412 = bitcast i32 %1411 to float, !dbg !21 + %1413 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i576) #6, !dbg !21 + %1414 = fmul float %1413, %1412, !dbg !21 + %1415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i577 = icmp eq i32 %1415, 0, !dbg !21 + %1416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1417 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i578 = select i1 %.not.i577, float %1417, float %1416, !dbg !21 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i579 = icmp eq i32 %1418, 0, !dbg !21 + %1419 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i578) #6, !dbg !21 + %1420 = tail call float @llvm.nvvm.saturate.f(float %.02.i578) #6, !dbg !21 + %.03.i580 = select i1 %.not1.i579, float %1420, float %1419, !dbg !21 + %1421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i581 = icmp eq i32 %1421, 0, !dbg !21 + %1422 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1423 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i582 = select i1 %.not2.i581, float %1423, float %1422, !dbg !21 + %1424 = fadd float %.04.i582, 0xC168000FE0000000, !dbg !21 + %1425 = fneg float %1424, !dbg !21 + %1426 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i583 = icmp eq i32 %1426, 0, !dbg !21 + %1427 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3FF7154760000000, float %1425) #6, !dbg !21 + %1428 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3FF7154760000000, float %1425) #6, !dbg !21 + %.0.i584 = select i1 %.not3.i583, float %1428, float %1427, !dbg !21 + %1429 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i585 = icmp eq i32 %1429, 0, !dbg !21 + %1430 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3E54AE0C00000000, float %.0.i584) #6, !dbg !21 + %1431 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3E54AE0C00000000, float %.0.i584) #6, !dbg !21 + %.01.i586 = select i1 %.not4.i585, float %1431, float %1430, !dbg !21 + %1432 = bitcast float %.04.i582 to i32, !dbg !21 + %1433 = shl i32 %1432, 23, !dbg !21 + %1434 = bitcast i32 %1433 to float, !dbg !21 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i586) #6, !dbg !21 + %1436 = fmul float %1435, %1434, !dbg !21 + %1437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i587 = icmp eq i32 %1437, 0, !dbg !21 + %1438 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1439 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i588 = select i1 %.not.i587, float %1439, float %1438, !dbg !21 + %1440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i589 = icmp eq i32 %1440, 0, !dbg !21 + %1441 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i588) #6, !dbg !21 + %1442 = tail call float @llvm.nvvm.saturate.f(float %.02.i588) #6, !dbg !21 + %.03.i590 = select i1 %.not1.i589, float %1442, float %1441, !dbg !21 + %1443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i591 = icmp eq i32 %1443, 0, !dbg !21 + %1444 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1445 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i592 = select i1 %.not2.i591, float %1445, float %1444, !dbg !21 + %1446 = fadd float %.04.i592, 0xC168000FE0000000, !dbg !21 + %1447 = fneg float %1446, !dbg !21 + %1448 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i593 = icmp eq i32 %1448, 0, !dbg !21 + %1449 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3FF7154760000000, float %1447) #6, !dbg !21 + %1450 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3FF7154760000000, float %1447) #6, !dbg !21 + %.0.i594 = select i1 %.not3.i593, float %1450, float %1449, !dbg !21 + %1451 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i595 = icmp eq i32 %1451, 0, !dbg !21 + %1452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3E54AE0C00000000, float %.0.i594) #6, !dbg !21 + %1453 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3E54AE0C00000000, float %.0.i594) #6, !dbg !21 + %.01.i596 = select i1 %.not4.i595, float %1453, float %1452, !dbg !21 + %1454 = bitcast float %.04.i592 to i32, !dbg !21 + %1455 = shl i32 %1454, 23, !dbg !21 + %1456 = bitcast i32 %1455 to float, !dbg !21 + %1457 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i596) #6, !dbg !21 + %1458 = fmul float %1457, %1456, !dbg !21 + %1459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i597 = icmp eq i32 %1459, 0, !dbg !21 + %1460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1461 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i598 = select i1 %.not.i597, float %1461, float %1460, !dbg !21 + %1462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i599 = icmp eq i32 %1462, 0, !dbg !21 + %1463 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i598) #6, !dbg !21 + %1464 = tail call float @llvm.nvvm.saturate.f(float %.02.i598) #6, !dbg !21 + %.03.i600 = select i1 %.not1.i599, float %1464, float %1463, !dbg !21 + %1465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i601 = icmp eq i32 %1465, 0, !dbg !21 + %1466 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1467 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i602 = select i1 %.not2.i601, float %1467, float %1466, !dbg !21 + %1468 = fadd float %.04.i602, 0xC168000FE0000000, !dbg !21 + %1469 = fneg float %1468, !dbg !21 + %1470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i603 = icmp eq i32 %1470, 0, !dbg !21 + %1471 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3FF7154760000000, float %1469) #6, !dbg !21 + %1472 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3FF7154760000000, float %1469) #6, !dbg !21 + %.0.i604 = select i1 %.not3.i603, float %1472, float %1471, !dbg !21 + %1473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i605 = icmp eq i32 %1473, 0, !dbg !21 + %1474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3E54AE0C00000000, float %.0.i604) #6, !dbg !21 + %1475 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3E54AE0C00000000, float %.0.i604) #6, !dbg !21 + %.01.i606 = select i1 %.not4.i605, float %1475, float %1474, !dbg !21 + %1476 = bitcast float %.04.i602 to i32, !dbg !21 + %1477 = shl i32 %1476, 23, !dbg !21 + %1478 = bitcast i32 %1477 to float, !dbg !21 + %1479 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i606) #6, !dbg !21 + %1480 = fmul float %1479, %1478, !dbg !21 + %1481 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i607 = icmp eq i32 %1481, 0, !dbg !21 + %1482 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1483 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i608 = select i1 %.not.i607, float %1483, float %1482, !dbg !21 + %1484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i609 = icmp eq i32 %1484, 0, !dbg !21 + %1485 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i608) #6, !dbg !21 + %1486 = tail call float @llvm.nvvm.saturate.f(float %.02.i608) #6, !dbg !21 + %.03.i610 = select i1 %.not1.i609, float %1486, float %1485, !dbg !21 + %1487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i611 = icmp eq i32 %1487, 0, !dbg !21 + %1488 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1489 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i612 = select i1 %.not2.i611, float %1489, float %1488, !dbg !21 + %1490 = fadd float %.04.i612, 0xC168000FE0000000, !dbg !21 + %1491 = fneg float %1490, !dbg !21 + %1492 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i613 = icmp eq i32 %1492, 0, !dbg !21 + %1493 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3FF7154760000000, float %1491) #6, !dbg !21 + %1494 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3FF7154760000000, float %1491) #6, !dbg !21 + %.0.i614 = select i1 %.not3.i613, float %1494, float %1493, !dbg !21 + %1495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i615 = icmp eq i32 %1495, 0, !dbg !21 + %1496 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3E54AE0C00000000, float %.0.i614) #6, !dbg !21 + %1497 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3E54AE0C00000000, float %.0.i614) #6, !dbg !21 + %.01.i616 = select i1 %.not4.i615, float %1497, float %1496, !dbg !21 + %1498 = bitcast float %.04.i612 to i32, !dbg !21 + %1499 = shl i32 %1498, 23, !dbg !21 + %1500 = bitcast i32 %1499 to float, !dbg !21 + %1501 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i616) #6, !dbg !21 + %1502 = fmul float %1501, %1500, !dbg !21 + %1503 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i617 = icmp eq i32 %1503, 0, !dbg !21 + %1504 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1505 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i618 = select i1 %.not.i617, float %1505, float %1504, !dbg !21 + %1506 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i619 = icmp eq i32 %1506, 0, !dbg !21 + %1507 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i618) #6, !dbg !21 + %1508 = tail call float @llvm.nvvm.saturate.f(float %.02.i618) #6, !dbg !21 + %.03.i620 = select i1 %.not1.i619, float %1508, float %1507, !dbg !21 + %1509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i621 = icmp eq i32 %1509, 0, !dbg !21 + %1510 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1511 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i622 = select i1 %.not2.i621, float %1511, float %1510, !dbg !21 + %1512 = fadd float %.04.i622, 0xC168000FE0000000, !dbg !21 + %1513 = fneg float %1512, !dbg !21 + %1514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i623 = icmp eq i32 %1514, 0, !dbg !21 + %1515 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3FF7154760000000, float %1513) #6, !dbg !21 + %1516 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3FF7154760000000, float %1513) #6, !dbg !21 + %.0.i624 = select i1 %.not3.i623, float %1516, float %1515, !dbg !21 + %1517 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i625 = icmp eq i32 %1517, 0, !dbg !21 + %1518 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3E54AE0C00000000, float %.0.i624) #6, !dbg !21 + %1519 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3E54AE0C00000000, float %.0.i624) #6, !dbg !21 + %.01.i626 = select i1 %.not4.i625, float %1519, float %1518, !dbg !21 + %1520 = bitcast float %.04.i622 to i32, !dbg !21 + %1521 = shl i32 %1520, 23, !dbg !21 + %1522 = bitcast i32 %1521 to float, !dbg !21 + %1523 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i626) #6, !dbg !21 + %1524 = fmul float %1523, %1522, !dbg !21 + %1525 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i627 = icmp eq i32 %1525, 0, !dbg !21 + %1526 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1527 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i628 = select i1 %.not.i627, float %1527, float %1526, !dbg !21 + %1528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i629 = icmp eq i32 %1528, 0, !dbg !21 + %1529 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i628) #6, !dbg !21 + %1530 = tail call float @llvm.nvvm.saturate.f(float %.02.i628) #6, !dbg !21 + %.03.i630 = select i1 %.not1.i629, float %1530, float %1529, !dbg !21 + %1531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i631 = icmp eq i32 %1531, 0, !dbg !21 + %1532 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1533 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i632 = select i1 %.not2.i631, float %1533, float %1532, !dbg !21 + %1534 = fadd float %.04.i632, 0xC168000FE0000000, !dbg !21 + %1535 = fneg float %1534, !dbg !21 + %1536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i633 = icmp eq i32 %1536, 0, !dbg !21 + %1537 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3FF7154760000000, float %1535) #6, !dbg !21 + %1538 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3FF7154760000000, float %1535) #6, !dbg !21 + %.0.i634 = select i1 %.not3.i633, float %1538, float %1537, !dbg !21 + %1539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i635 = icmp eq i32 %1539, 0, !dbg !21 + %1540 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3E54AE0C00000000, float %.0.i634) #6, !dbg !21 + %1541 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3E54AE0C00000000, float %.0.i634) #6, !dbg !21 + %.01.i636 = select i1 %.not4.i635, float %1541, float %1540, !dbg !21 + %1542 = bitcast float %.04.i632 to i32, !dbg !21 + %1543 = shl i32 %1542, 23, !dbg !21 + %1544 = bitcast i32 %1543 to float, !dbg !21 + %1545 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i636) #6, !dbg !21 + %1546 = fmul float %1545, %1544, !dbg !21 + %1547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i637 = icmp eq i32 %1547, 0, !dbg !21 + %1548 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1549 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i638 = select i1 %.not.i637, float %1549, float %1548, !dbg !21 + %1550 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i639 = icmp eq i32 %1550, 0, !dbg !21 + %1551 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i638) #6, !dbg !21 + %1552 = tail call float @llvm.nvvm.saturate.f(float %.02.i638) #6, !dbg !21 + %.03.i640 = select i1 %.not1.i639, float %1552, float %1551, !dbg !21 + %1553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i641 = icmp eq i32 %1553, 0, !dbg !21 + %1554 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1555 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i642 = select i1 %.not2.i641, float %1555, float %1554, !dbg !21 + %1556 = fadd float %.04.i642, 0xC168000FE0000000, !dbg !21 + %1557 = fneg float %1556, !dbg !21 + %1558 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i643 = icmp eq i32 %1558, 0, !dbg !21 + %1559 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3FF7154760000000, float %1557) #6, !dbg !21 + %1560 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3FF7154760000000, float %1557) #6, !dbg !21 + %.0.i644 = select i1 %.not3.i643, float %1560, float %1559, !dbg !21 + %1561 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i645 = icmp eq i32 %1561, 0, !dbg !21 + %1562 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3E54AE0C00000000, float %.0.i644) #6, !dbg !21 + %1563 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3E54AE0C00000000, float %.0.i644) #6, !dbg !21 + %.01.i646 = select i1 %.not4.i645, float %1563, float %1562, !dbg !21 + %1564 = bitcast float %.04.i642 to i32, !dbg !21 + %1565 = shl i32 %1564, 23, !dbg !21 + %1566 = bitcast i32 %1565 to float, !dbg !21 + %1567 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i646) #6, !dbg !21 + %1568 = fmul float %1567, %1566, !dbg !21 + %1569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i647 = icmp eq i32 %1569, 0, !dbg !21 + %1570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1571 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i648 = select i1 %.not.i647, float %1571, float %1570, !dbg !21 + %1572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i649 = icmp eq i32 %1572, 0, !dbg !21 + %1573 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i648) #6, !dbg !21 + %1574 = tail call float @llvm.nvvm.saturate.f(float %.02.i648) #6, !dbg !21 + %.03.i650 = select i1 %.not1.i649, float %1574, float %1573, !dbg !21 + %1575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i651 = icmp eq i32 %1575, 0, !dbg !21 + %1576 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1577 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i652 = select i1 %.not2.i651, float %1577, float %1576, !dbg !21 + %1578 = fadd float %.04.i652, 0xC168000FE0000000, !dbg !21 + %1579 = fneg float %1578, !dbg !21 + %1580 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i653 = icmp eq i32 %1580, 0, !dbg !21 + %1581 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3FF7154760000000, float %1579) #6, !dbg !21 + %1582 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3FF7154760000000, float %1579) #6, !dbg !21 + %.0.i654 = select i1 %.not3.i653, float %1582, float %1581, !dbg !21 + %1583 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i655 = icmp eq i32 %1583, 0, !dbg !21 + %1584 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3E54AE0C00000000, float %.0.i654) #6, !dbg !21 + %1585 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3E54AE0C00000000, float %.0.i654) #6, !dbg !21 + %.01.i656 = select i1 %.not4.i655, float %1585, float %1584, !dbg !21 + %1586 = bitcast float %.04.i652 to i32, !dbg !21 + %1587 = shl i32 %1586, 23, !dbg !21 + %1588 = bitcast i32 %1587 to float, !dbg !21 + %1589 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i656) #6, !dbg !21 + %1590 = fmul float %1589, %1588, !dbg !21 + %1591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i657 = icmp eq i32 %1591, 0, !dbg !21 + %1592 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1593 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i658 = select i1 %.not.i657, float %1593, float %1592, !dbg !21 + %1594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i659 = icmp eq i32 %1594, 0, !dbg !21 + %1595 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i658) #6, !dbg !21 + %1596 = tail call float @llvm.nvvm.saturate.f(float %.02.i658) #6, !dbg !21 + %.03.i660 = select i1 %.not1.i659, float %1596, float %1595, !dbg !21 + %1597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i661 = icmp eq i32 %1597, 0, !dbg !21 + %1598 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1599 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i662 = select i1 %.not2.i661, float %1599, float %1598, !dbg !21 + %1600 = fadd float %.04.i662, 0xC168000FE0000000, !dbg !21 + %1601 = fneg float %1600, !dbg !21 + %1602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i663 = icmp eq i32 %1602, 0, !dbg !21 + %1603 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3FF7154760000000, float %1601) #6, !dbg !21 + %1604 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3FF7154760000000, float %1601) #6, !dbg !21 + %.0.i664 = select i1 %.not3.i663, float %1604, float %1603, !dbg !21 + %1605 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i665 = icmp eq i32 %1605, 0, !dbg !21 + %1606 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3E54AE0C00000000, float %.0.i664) #6, !dbg !21 + %1607 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3E54AE0C00000000, float %.0.i664) #6, !dbg !21 + %.01.i666 = select i1 %.not4.i665, float %1607, float %1606, !dbg !21 + %1608 = bitcast float %.04.i662 to i32, !dbg !21 + %1609 = shl i32 %1608, 23, !dbg !21 + %1610 = bitcast i32 %1609 to float, !dbg !21 + %1611 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i666) #6, !dbg !21 + %1612 = fmul float %1611, %1610, !dbg !21 + %1613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i667 = icmp eq i32 %1613, 0, !dbg !21 + %1614 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1615 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i668 = select i1 %.not.i667, float %1615, float %1614, !dbg !21 + %1616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i669 = icmp eq i32 %1616, 0, !dbg !21 + %1617 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i668) #6, !dbg !21 + %1618 = tail call float @llvm.nvvm.saturate.f(float %.02.i668) #6, !dbg !21 + %.03.i670 = select i1 %.not1.i669, float %1618, float %1617, !dbg !21 + %1619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i671 = icmp eq i32 %1619, 0, !dbg !21 + %1620 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1621 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i672 = select i1 %.not2.i671, float %1621, float %1620, !dbg !21 + %1622 = fadd float %.04.i672, 0xC168000FE0000000, !dbg !21 + %1623 = fneg float %1622, !dbg !21 + %1624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i673 = icmp eq i32 %1624, 0, !dbg !21 + %1625 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3FF7154760000000, float %1623) #6, !dbg !21 + %1626 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3FF7154760000000, float %1623) #6, !dbg !21 + %.0.i674 = select i1 %.not3.i673, float %1626, float %1625, !dbg !21 + %1627 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i675 = icmp eq i32 %1627, 0, !dbg !21 + %1628 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3E54AE0C00000000, float %.0.i674) #6, !dbg !21 + %1629 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3E54AE0C00000000, float %.0.i674) #6, !dbg !21 + %.01.i676 = select i1 %.not4.i675, float %1629, float %1628, !dbg !21 + %1630 = bitcast float %.04.i672 to i32, !dbg !21 + %1631 = shl i32 %1630, 23, !dbg !21 + %1632 = bitcast i32 %1631 to float, !dbg !21 + %1633 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i676) #6, !dbg !21 + %1634 = fmul float %1633, %1632, !dbg !21 + %1635 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i677 = icmp eq i32 %1635, 0, !dbg !21 + %1636 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1637 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i678 = select i1 %.not.i677, float %1637, float %1636, !dbg !21 + %1638 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i679 = icmp eq i32 %1638, 0, !dbg !21 + %1639 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i678) #6, !dbg !21 + %1640 = tail call float @llvm.nvvm.saturate.f(float %.02.i678) #6, !dbg !21 + %.03.i680 = select i1 %.not1.i679, float %1640, float %1639, !dbg !21 + %1641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i681 = icmp eq i32 %1641, 0, !dbg !21 + %1642 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1643 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i682 = select i1 %.not2.i681, float %1643, float %1642, !dbg !21 + %1644 = fadd float %.04.i682, 0xC168000FE0000000, !dbg !21 + %1645 = fneg float %1644, !dbg !21 + %1646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i683 = icmp eq i32 %1646, 0, !dbg !21 + %1647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3FF7154760000000, float %1645) #6, !dbg !21 + %1648 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3FF7154760000000, float %1645) #6, !dbg !21 + %.0.i684 = select i1 %.not3.i683, float %1648, float %1647, !dbg !21 + %1649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i685 = icmp eq i32 %1649, 0, !dbg !21 + %1650 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3E54AE0C00000000, float %.0.i684) #6, !dbg !21 + %1651 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3E54AE0C00000000, float %.0.i684) #6, !dbg !21 + %.01.i686 = select i1 %.not4.i685, float %1651, float %1650, !dbg !21 + %1652 = bitcast float %.04.i682 to i32, !dbg !21 + %1653 = shl i32 %1652, 23, !dbg !21 + %1654 = bitcast i32 %1653 to float, !dbg !21 + %1655 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i686) #6, !dbg !21 + %1656 = fmul float %1655, %1654, !dbg !21 + %1657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i687 = icmp eq i32 %1657, 0, !dbg !21 + %1658 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1659 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i688 = select i1 %.not.i687, float %1659, float %1658, !dbg !21 + %1660 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i689 = icmp eq i32 %1660, 0, !dbg !21 + %1661 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i688) #6, !dbg !21 + %1662 = tail call float @llvm.nvvm.saturate.f(float %.02.i688) #6, !dbg !21 + %.03.i690 = select i1 %.not1.i689, float %1662, float %1661, !dbg !21 + %1663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i691 = icmp eq i32 %1663, 0, !dbg !21 + %1664 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1665 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i692 = select i1 %.not2.i691, float %1665, float %1664, !dbg !21 + %1666 = fadd float %.04.i692, 0xC168000FE0000000, !dbg !21 + %1667 = fneg float %1666, !dbg !21 + %1668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i693 = icmp eq i32 %1668, 0, !dbg !21 + %1669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3FF7154760000000, float %1667) #6, !dbg !21 + %1670 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3FF7154760000000, float %1667) #6, !dbg !21 + %.0.i694 = select i1 %.not3.i693, float %1670, float %1669, !dbg !21 + %1671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i695 = icmp eq i32 %1671, 0, !dbg !21 + %1672 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3E54AE0C00000000, float %.0.i694) #6, !dbg !21 + %1673 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3E54AE0C00000000, float %.0.i694) #6, !dbg !21 + %.01.i696 = select i1 %.not4.i695, float %1673, float %1672, !dbg !21 + %1674 = bitcast float %.04.i692 to i32, !dbg !21 + %1675 = shl i32 %1674, 23, !dbg !21 + %1676 = bitcast i32 %1675 to float, !dbg !21 + %1677 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i696) #6, !dbg !21 + %1678 = fmul float %1677, %1676, !dbg !21 + %1679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i697 = icmp eq i32 %1679, 0, !dbg !21 + %1680 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1681 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i698 = select i1 %.not.i697, float %1681, float %1680, !dbg !21 + %1682 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i699 = icmp eq i32 %1682, 0, !dbg !21 + %1683 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i698) #6, !dbg !21 + %1684 = tail call float @llvm.nvvm.saturate.f(float %.02.i698) #6, !dbg !21 + %.03.i700 = select i1 %.not1.i699, float %1684, float %1683, !dbg !21 + %1685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1686 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1687 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i703 = icmp eq i32 %1688, 0, !dbg !21 + %1689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i705 = icmp eq i32 %1689, 0, !dbg !21 + %1690 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i707 = icmp eq i32 %1690, 0, !dbg !21 + %1691 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1692 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i708 = select i1 %.not.i707, float %1692, float %1691, !dbg !21 + %1693 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i709 = icmp eq i32 %1693, 0, !dbg !21 + %1694 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i708) #6, !dbg !21 + %1695 = tail call float @llvm.nvvm.saturate.f(float %.02.i708) #6, !dbg !21 + %.03.i710 = select i1 %.not1.i709, float %1695, float %1694, !dbg !21 + %1696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1697 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1698 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i713 = icmp eq i32 %1699, 0, !dbg !21 + %1700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i715 = icmp eq i32 %1700, 0, !dbg !21 + %1701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i717 = icmp eq i32 %1701, 0, !dbg !21 + %1702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1703 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i718 = select i1 %.not.i717, float %1703, float %1702, !dbg !21 + %1704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i719 = icmp eq i32 %1704, 0, !dbg !21 + %1705 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i718) #6, !dbg !21 + %1706 = tail call float @llvm.nvvm.saturate.f(float %.02.i718) #6, !dbg !21 + %.03.i720 = select i1 %.not1.i719, float %1706, float %1705, !dbg !21 + %1707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1708 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1709 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i723 = icmp eq i32 %1710, 0, !dbg !21 + %1711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i725 = icmp eq i32 %1711, 0, !dbg !21 + %1712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i727 = icmp eq i32 %1712, 0, !dbg !21 + %1713 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1714 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i728 = select i1 %.not.i727, float %1714, float %1713, !dbg !21 + %1715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i729 = icmp eq i32 %1715, 0, !dbg !21 + %1716 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i728) #6, !dbg !21 + %1717 = tail call float @llvm.nvvm.saturate.f(float %.02.i728) #6, !dbg !21 + %.03.i730 = select i1 %.not1.i729, float %1717, float %1716, !dbg !21 + %1718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1719 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1720 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i733 = icmp eq i32 %1721, 0, !dbg !21 + %1722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i735 = icmp eq i32 %1722, 0, !dbg !21 + %1723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i737 = icmp eq i32 %1723, 0, !dbg !21 + %1724 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1725 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i738 = select i1 %.not.i737, float %1725, float %1724, !dbg !21 + %1726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i739 = icmp eq i32 %1726, 0, !dbg !21 + %1727 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i738) #6, !dbg !21 + %1728 = tail call float @llvm.nvvm.saturate.f(float %.02.i738) #6, !dbg !21 + %.03.i740 = select i1 %.not1.i739, float %1728, float %1727, !dbg !21 + %1729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1730 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1731 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i743 = icmp eq i32 %1732, 0, !dbg !21 + %1733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i745 = icmp eq i32 %1733, 0, !dbg !21 + %1734 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i747 = icmp eq i32 %1734, 0, !dbg !21 + %1735 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1736 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i748 = select i1 %.not.i747, float %1736, float %1735, !dbg !21 + %1737 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i749 = icmp eq i32 %1737, 0, !dbg !21 + %1738 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i748) #6, !dbg !21 + %1739 = tail call float @llvm.nvvm.saturate.f(float %.02.i748) #6, !dbg !21 + %.03.i750 = select i1 %.not1.i749, float %1739, float %1738, !dbg !21 + %1740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1741 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1742 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i753 = icmp eq i32 %1743, 0, !dbg !21 + %1744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i755 = icmp eq i32 %1744, 0, !dbg !21 + %1745 = fsub float %211, %211, !dbg !22 + %1746 = fsub float %212, %212, !dbg !22 + %1747 = fsub float %213, %213, !dbg !22 + %1748 = fsub float %214, %214, !dbg !22 + %1749 = fsub float %215, %215, !dbg !22 + %1750 = fsub float %216, %216, !dbg !22 + %1751 = fsub float %217, %217, !dbg !22 + %1752 = fsub float %218, %218, !dbg !22 + %1753 = fsub float %219, %219, !dbg !22 + %1754 = fsub float %220, %220, !dbg !22 + %1755 = fsub float %221, %221, !dbg !22 + %1756 = fsub float %222, %222, !dbg !22 + %1757 = fsub float %223, %223, !dbg !22 + %1758 = fsub float %224, %224, !dbg !22 + %1759 = fsub float %225, %225, !dbg !22 + %1760 = fsub float %226, %226, !dbg !22 + %1761 = fsub float %227, %227, !dbg !22 + %1762 = fsub float %228, %228, !dbg !22 + %1763 = fsub float %229, %229, !dbg !22 + %1764 = fsub float %230, %230, !dbg !22 + %1765 = fsub float %231, %231, !dbg !22 + %1766 = fsub float %232, %232, !dbg !22 + %1767 = fsub float %233, %233, !dbg !22 + %1768 = fsub float %234, %234, !dbg !22 + %1769 = fsub float %235, %235, !dbg !22 + %1770 = fsub float %236, %236, !dbg !22 + %1771 = fsub float %237, %237, !dbg !22 + %1772 = fsub float %238, %238, !dbg !22 + %1773 = fsub float %239, %239, !dbg !22 + %1774 = fsub float %240, %240, !dbg !22 + %1775 = fsub float %241, %241, !dbg !22 + %1776 = fsub float %242, %242, !dbg !22 + %1777 = fsub float %243, %243, !dbg !22 + %1778 = fsub float %244, %244, !dbg !22 + %1779 = fsub float %245, %245, !dbg !22 + %1780 = fsub float %246, %246, !dbg !22 + %1781 = fsub float %247, %247, !dbg !22 + %1782 = fsub float %248, %248, !dbg !22 + %1783 = fsub float %249, %249, !dbg !22 + %1784 = fsub float %250, %250, !dbg !22 + %1785 = fsub float %251, %251, !dbg !22 + %1786 = fsub float %252, %252, !dbg !22 + %1787 = fsub float %253, %253, !dbg !22 + %1788 = fsub float %254, %254, !dbg !22 + %1789 = fsub float %255, %255, !dbg !22 + %1790 = fsub float %256, %256, !dbg !22 + %1791 = fsub float %257, %257, !dbg !22 + %1792 = fsub float %258, %258, !dbg !22 + %1793 = fsub float %259, %259, !dbg !22 + %1794 = fsub float %260, %260, !dbg !22 + %1795 = fsub float %261, %261, !dbg !22 + %1796 = fsub float %262, %262, !dbg !22 + %1797 = fsub float %263, %263, !dbg !22 + %1798 = fsub float %264, %264, !dbg !22 + %1799 = fsub float %265, %265, !dbg !22 + %1800 = fsub float %266, %266, !dbg !22 + %1801 = fsub float %267, %267, !dbg !22 + %1802 = fsub float %268, %268, !dbg !22 + %1803 = fsub float %270, %270, !dbg !22 + %1804 = fsub float %272, %272, !dbg !22 + %1805 = fsub float %274, %274, !dbg !22 + %1806 = fsub float %276, %276, !dbg !22 + %1807 = fsub float %278, %278, !dbg !22 + %1808 = fsub float %280, %280, !dbg !22 + %1809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i757 = icmp eq i32 %1809, 0, !dbg !21 + %1810 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1811 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i758 = select i1 %.not.i757, float %1811, float %1810, !dbg !21 + %1812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i759 = icmp eq i32 %1812, 0, !dbg !21 + %1813 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i758) #6, !dbg !21 + %1814 = tail call float @llvm.nvvm.saturate.f(float %.02.i758) #6, !dbg !21 + %.03.i760 = select i1 %.not1.i759, float %1814, float %1813, !dbg !21 + %1815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i761 = icmp eq i32 %1815, 0, !dbg !21 + %1816 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1817 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i762 = select i1 %.not2.i761, float %1817, float %1816, !dbg !21 + %1818 = fadd float %.04.i762, 0xC168000FE0000000, !dbg !21 + %1819 = fneg float %1818, !dbg !21 + %1820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i763 = icmp eq i32 %1820, 0, !dbg !21 + %1821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3FF7154760000000, float %1819) #6, !dbg !21 + %1822 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3FF7154760000000, float %1819) #6, !dbg !21 + %.0.i764 = select i1 %.not3.i763, float %1822, float %1821, !dbg !21 + %1823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i765 = icmp eq i32 %1823, 0, !dbg !21 + %1824 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3E54AE0C00000000, float %.0.i764) #6, !dbg !21 + %1825 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3E54AE0C00000000, float %.0.i764) #6, !dbg !21 + %.01.i766 = select i1 %.not4.i765, float %1825, float %1824, !dbg !21 + %1826 = bitcast float %.04.i762 to i32, !dbg !21 + %1827 = shl i32 %1826, 23, !dbg !21 + %1828 = bitcast i32 %1827 to float, !dbg !21 + %1829 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i766) #6, !dbg !21 + %1830 = fmul float %1829, %1828, !dbg !21 + %1831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i767 = icmp eq i32 %1831, 0, !dbg !21 + %1832 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1833 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i768 = select i1 %.not.i767, float %1833, float %1832, !dbg !21 + %1834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i769 = icmp eq i32 %1834, 0, !dbg !21 + %1835 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i768) #6, !dbg !21 + %1836 = tail call float @llvm.nvvm.saturate.f(float %.02.i768) #6, !dbg !21 + %.03.i770 = select i1 %.not1.i769, float %1836, float %1835, !dbg !21 + %1837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i771 = icmp eq i32 %1837, 0, !dbg !21 + %1838 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1839 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i772 = select i1 %.not2.i771, float %1839, float %1838, !dbg !21 + %1840 = fadd float %.04.i772, 0xC168000FE0000000, !dbg !21 + %1841 = fneg float %1840, !dbg !21 + %1842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i773 = icmp eq i32 %1842, 0, !dbg !21 + %1843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3FF7154760000000, float %1841) #6, !dbg !21 + %1844 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3FF7154760000000, float %1841) #6, !dbg !21 + %.0.i774 = select i1 %.not3.i773, float %1844, float %1843, !dbg !21 + %1845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i775 = icmp eq i32 %1845, 0, !dbg !21 + %1846 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3E54AE0C00000000, float %.0.i774) #6, !dbg !21 + %1847 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3E54AE0C00000000, float %.0.i774) #6, !dbg !21 + %.01.i776 = select i1 %.not4.i775, float %1847, float %1846, !dbg !21 + %1848 = bitcast float %.04.i772 to i32, !dbg !21 + %1849 = shl i32 %1848, 23, !dbg !21 + %1850 = bitcast i32 %1849 to float, !dbg !21 + %1851 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i776) #6, !dbg !21 + %1852 = fmul float %1851, %1850, !dbg !21 + %1853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i777 = icmp eq i32 %1853, 0, !dbg !21 + %1854 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1855 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i778 = select i1 %.not.i777, float %1855, float %1854, !dbg !21 + %1856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i779 = icmp eq i32 %1856, 0, !dbg !21 + %1857 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i778) #6, !dbg !21 + %1858 = tail call float @llvm.nvvm.saturate.f(float %.02.i778) #6, !dbg !21 + %.03.i780 = select i1 %.not1.i779, float %1858, float %1857, !dbg !21 + %1859 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i781 = icmp eq i32 %1859, 0, !dbg !21 + %1860 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1861 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i782 = select i1 %.not2.i781, float %1861, float %1860, !dbg !21 + %1862 = fadd float %.04.i782, 0xC168000FE0000000, !dbg !21 + %1863 = fneg float %1862, !dbg !21 + %1864 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i783 = icmp eq i32 %1864, 0, !dbg !21 + %1865 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3FF7154760000000, float %1863) #6, !dbg !21 + %1866 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3FF7154760000000, float %1863) #6, !dbg !21 + %.0.i784 = select i1 %.not3.i783, float %1866, float %1865, !dbg !21 + %1867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i785 = icmp eq i32 %1867, 0, !dbg !21 + %1868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3E54AE0C00000000, float %.0.i784) #6, !dbg !21 + %1869 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3E54AE0C00000000, float %.0.i784) #6, !dbg !21 + %.01.i786 = select i1 %.not4.i785, float %1869, float %1868, !dbg !21 + %1870 = bitcast float %.04.i782 to i32, !dbg !21 + %1871 = shl i32 %1870, 23, !dbg !21 + %1872 = bitcast i32 %1871 to float, !dbg !21 + %1873 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i786) #6, !dbg !21 + %1874 = fmul float %1873, %1872, !dbg !21 + %1875 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i787 = icmp eq i32 %1875, 0, !dbg !21 + %1876 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1877 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i788 = select i1 %.not.i787, float %1877, float %1876, !dbg !21 + %1878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i789 = icmp eq i32 %1878, 0, !dbg !21 + %1879 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i788) #6, !dbg !21 + %1880 = tail call float @llvm.nvvm.saturate.f(float %.02.i788) #6, !dbg !21 + %.03.i790 = select i1 %.not1.i789, float %1880, float %1879, !dbg !21 + %1881 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i791 = icmp eq i32 %1881, 0, !dbg !21 + %1882 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1883 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i792 = select i1 %.not2.i791, float %1883, float %1882, !dbg !21 + %1884 = fadd float %.04.i792, 0xC168000FE0000000, !dbg !21 + %1885 = fneg float %1884, !dbg !21 + %1886 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i793 = icmp eq i32 %1886, 0, !dbg !21 + %1887 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3FF7154760000000, float %1885) #6, !dbg !21 + %1888 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3FF7154760000000, float %1885) #6, !dbg !21 + %.0.i794 = select i1 %.not3.i793, float %1888, float %1887, !dbg !21 + %1889 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i795 = icmp eq i32 %1889, 0, !dbg !21 + %1890 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3E54AE0C00000000, float %.0.i794) #6, !dbg !21 + %1891 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3E54AE0C00000000, float %.0.i794) #6, !dbg !21 + %.01.i796 = select i1 %.not4.i795, float %1891, float %1890, !dbg !21 + %1892 = bitcast float %.04.i792 to i32, !dbg !21 + %1893 = shl i32 %1892, 23, !dbg !21 + %1894 = bitcast i32 %1893 to float, !dbg !21 + %1895 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i796) #6, !dbg !21 + %1896 = fmul float %1895, %1894, !dbg !21 + %1897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i797 = icmp eq i32 %1897, 0, !dbg !21 + %1898 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1899 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i798 = select i1 %.not.i797, float %1899, float %1898, !dbg !21 + %1900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i799 = icmp eq i32 %1900, 0, !dbg !21 + %1901 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i798) #6, !dbg !21 + %1902 = tail call float @llvm.nvvm.saturate.f(float %.02.i798) #6, !dbg !21 + %.03.i800 = select i1 %.not1.i799, float %1902, float %1901, !dbg !21 + %1903 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i801 = icmp eq i32 %1903, 0, !dbg !21 + %1904 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1905 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i802 = select i1 %.not2.i801, float %1905, float %1904, !dbg !21 + %1906 = fadd float %.04.i802, 0xC168000FE0000000, !dbg !21 + %1907 = fneg float %1906, !dbg !21 + %1908 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i803 = icmp eq i32 %1908, 0, !dbg !21 + %1909 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3FF7154760000000, float %1907) #6, !dbg !21 + %1910 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3FF7154760000000, float %1907) #6, !dbg !21 + %.0.i804 = select i1 %.not3.i803, float %1910, float %1909, !dbg !21 + %1911 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i805 = icmp eq i32 %1911, 0, !dbg !21 + %1912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3E54AE0C00000000, float %.0.i804) #6, !dbg !21 + %1913 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3E54AE0C00000000, float %.0.i804) #6, !dbg !21 + %.01.i806 = select i1 %.not4.i805, float %1913, float %1912, !dbg !21 + %1914 = bitcast float %.04.i802 to i32, !dbg !21 + %1915 = shl i32 %1914, 23, !dbg !21 + %1916 = bitcast i32 %1915 to float, !dbg !21 + %1917 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i806) #6, !dbg !21 + %1918 = fmul float %1917, %1916, !dbg !21 + %1919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i807 = icmp eq i32 %1919, 0, !dbg !21 + %1920 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1921 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i808 = select i1 %.not.i807, float %1921, float %1920, !dbg !21 + %1922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i809 = icmp eq i32 %1922, 0, !dbg !21 + %1923 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i808) #6, !dbg !21 + %1924 = tail call float @llvm.nvvm.saturate.f(float %.02.i808) #6, !dbg !21 + %.03.i810 = select i1 %.not1.i809, float %1924, float %1923, !dbg !21 + %1925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i811 = icmp eq i32 %1925, 0, !dbg !21 + %1926 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1927 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i812 = select i1 %.not2.i811, float %1927, float %1926, !dbg !21 + %1928 = fadd float %.04.i812, 0xC168000FE0000000, !dbg !21 + %1929 = fneg float %1928, !dbg !21 + %1930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i813 = icmp eq i32 %1930, 0, !dbg !21 + %1931 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3FF7154760000000, float %1929) #6, !dbg !21 + %1932 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3FF7154760000000, float %1929) #6, !dbg !21 + %.0.i814 = select i1 %.not3.i813, float %1932, float %1931, !dbg !21 + %1933 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i815 = icmp eq i32 %1933, 0, !dbg !21 + %1934 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3E54AE0C00000000, float %.0.i814) #6, !dbg !21 + %1935 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3E54AE0C00000000, float %.0.i814) #6, !dbg !21 + %.01.i816 = select i1 %.not4.i815, float %1935, float %1934, !dbg !21 + %1936 = bitcast float %.04.i812 to i32, !dbg !21 + %1937 = shl i32 %1936, 23, !dbg !21 + %1938 = bitcast i32 %1937 to float, !dbg !21 + %1939 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i816) #6, !dbg !21 + %1940 = fmul float %1939, %1938, !dbg !21 + %1941 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i817 = icmp eq i32 %1941, 0, !dbg !21 + %1942 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1943 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i818 = select i1 %.not.i817, float %1943, float %1942, !dbg !21 + %1944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i819 = icmp eq i32 %1944, 0, !dbg !21 + %1945 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i818) #6, !dbg !21 + %1946 = tail call float @llvm.nvvm.saturate.f(float %.02.i818) #6, !dbg !21 + %.03.i820 = select i1 %.not1.i819, float %1946, float %1945, !dbg !21 + %1947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i821 = icmp eq i32 %1947, 0, !dbg !21 + %1948 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1949 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i822 = select i1 %.not2.i821, float %1949, float %1948, !dbg !21 + %1950 = fadd float %.04.i822, 0xC168000FE0000000, !dbg !21 + %1951 = fneg float %1950, !dbg !21 + %1952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i823 = icmp eq i32 %1952, 0, !dbg !21 + %1953 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3FF7154760000000, float %1951) #6, !dbg !21 + %1954 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3FF7154760000000, float %1951) #6, !dbg !21 + %.0.i824 = select i1 %.not3.i823, float %1954, float %1953, !dbg !21 + %1955 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i825 = icmp eq i32 %1955, 0, !dbg !21 + %1956 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3E54AE0C00000000, float %.0.i824) #6, !dbg !21 + %1957 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3E54AE0C00000000, float %.0.i824) #6, !dbg !21 + %.01.i826 = select i1 %.not4.i825, float %1957, float %1956, !dbg !21 + %1958 = bitcast float %.04.i822 to i32, !dbg !21 + %1959 = shl i32 %1958, 23, !dbg !21 + %1960 = bitcast i32 %1959 to float, !dbg !21 + %1961 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i826) #6, !dbg !21 + %1962 = fmul float %1961, %1960, !dbg !21 + %1963 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i827 = icmp eq i32 %1963, 0, !dbg !21 + %1964 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1965 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i828 = select i1 %.not.i827, float %1965, float %1964, !dbg !21 + %1966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i829 = icmp eq i32 %1966, 0, !dbg !21 + %1967 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i828) #6, !dbg !21 + %1968 = tail call float @llvm.nvvm.saturate.f(float %.02.i828) #6, !dbg !21 + %.03.i830 = select i1 %.not1.i829, float %1968, float %1967, !dbg !21 + %1969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i831 = icmp eq i32 %1969, 0, !dbg !21 + %1970 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1971 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i832 = select i1 %.not2.i831, float %1971, float %1970, !dbg !21 + %1972 = fadd float %.04.i832, 0xC168000FE0000000, !dbg !21 + %1973 = fneg float %1972, !dbg !21 + %1974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i833 = icmp eq i32 %1974, 0, !dbg !21 + %1975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3FF7154760000000, float %1973) #6, !dbg !21 + %1976 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3FF7154760000000, float %1973) #6, !dbg !21 + %.0.i834 = select i1 %.not3.i833, float %1976, float %1975, !dbg !21 + %1977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i835 = icmp eq i32 %1977, 0, !dbg !21 + %1978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3E54AE0C00000000, float %.0.i834) #6, !dbg !21 + %1979 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3E54AE0C00000000, float %.0.i834) #6, !dbg !21 + %.01.i836 = select i1 %.not4.i835, float %1979, float %1978, !dbg !21 + %1980 = bitcast float %.04.i832 to i32, !dbg !21 + %1981 = shl i32 %1980, 23, !dbg !21 + %1982 = bitcast i32 %1981 to float, !dbg !21 + %1983 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i836) #6, !dbg !21 + %1984 = fmul float %1983, %1982, !dbg !21 + %1985 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i837 = icmp eq i32 %1985, 0, !dbg !21 + %1986 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1987 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i838 = select i1 %.not.i837, float %1987, float %1986, !dbg !21 + %1988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i839 = icmp eq i32 %1988, 0, !dbg !21 + %1989 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i838) #6, !dbg !21 + %1990 = tail call float @llvm.nvvm.saturate.f(float %.02.i838) #6, !dbg !21 + %.03.i840 = select i1 %.not1.i839, float %1990, float %1989, !dbg !21 + %1991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i841 = icmp eq i32 %1991, 0, !dbg !21 + %1992 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1993 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i842 = select i1 %.not2.i841, float %1993, float %1992, !dbg !21 + %1994 = fadd float %.04.i842, 0xC168000FE0000000, !dbg !21 + %1995 = fneg float %1994, !dbg !21 + %1996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i843 = icmp eq i32 %1996, 0, !dbg !21 + %1997 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3FF7154760000000, float %1995) #6, !dbg !21 + %1998 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3FF7154760000000, float %1995) #6, !dbg !21 + %.0.i844 = select i1 %.not3.i843, float %1998, float %1997, !dbg !21 + %1999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i845 = icmp eq i32 %1999, 0, !dbg !21 + %2000 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3E54AE0C00000000, float %.0.i844) #6, !dbg !21 + %2001 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3E54AE0C00000000, float %.0.i844) #6, !dbg !21 + %.01.i846 = select i1 %.not4.i845, float %2001, float %2000, !dbg !21 + %2002 = bitcast float %.04.i842 to i32, !dbg !21 + %2003 = shl i32 %2002, 23, !dbg !21 + %2004 = bitcast i32 %2003 to float, !dbg !21 + %2005 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i846) #6, !dbg !21 + %2006 = fmul float %2005, %2004, !dbg !21 + %2007 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i847 = icmp eq i32 %2007, 0, !dbg !21 + %2008 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2009 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i848 = select i1 %.not.i847, float %2009, float %2008, !dbg !21 + %2010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i849 = icmp eq i32 %2010, 0, !dbg !21 + %2011 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i848) #6, !dbg !21 + %2012 = tail call float @llvm.nvvm.saturate.f(float %.02.i848) #6, !dbg !21 + %.03.i850 = select i1 %.not1.i849, float %2012, float %2011, !dbg !21 + %2013 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i851 = icmp eq i32 %2013, 0, !dbg !21 + %2014 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2015 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i852 = select i1 %.not2.i851, float %2015, float %2014, !dbg !21 + %2016 = fadd float %.04.i852, 0xC168000FE0000000, !dbg !21 + %2017 = fneg float %2016, !dbg !21 + %2018 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i853 = icmp eq i32 %2018, 0, !dbg !21 + %2019 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3FF7154760000000, float %2017) #6, !dbg !21 + %2020 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3FF7154760000000, float %2017) #6, !dbg !21 + %.0.i854 = select i1 %.not3.i853, float %2020, float %2019, !dbg !21 + %2021 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i855 = icmp eq i32 %2021, 0, !dbg !21 + %2022 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3E54AE0C00000000, float %.0.i854) #6, !dbg !21 + %2023 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3E54AE0C00000000, float %.0.i854) #6, !dbg !21 + %.01.i856 = select i1 %.not4.i855, float %2023, float %2022, !dbg !21 + %2024 = bitcast float %.04.i852 to i32, !dbg !21 + %2025 = shl i32 %2024, 23, !dbg !21 + %2026 = bitcast i32 %2025 to float, !dbg !21 + %2027 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i856) #6, !dbg !21 + %2028 = fmul float %2027, %2026, !dbg !21 + %2029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i857 = icmp eq i32 %2029, 0, !dbg !21 + %2030 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2031 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i858 = select i1 %.not.i857, float %2031, float %2030, !dbg !21 + %2032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i859 = icmp eq i32 %2032, 0, !dbg !21 + %2033 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i858) #6, !dbg !21 + %2034 = tail call float @llvm.nvvm.saturate.f(float %.02.i858) #6, !dbg !21 + %.03.i860 = select i1 %.not1.i859, float %2034, float %2033, !dbg !21 + %2035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i861 = icmp eq i32 %2035, 0, !dbg !21 + %2036 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2037 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i862 = select i1 %.not2.i861, float %2037, float %2036, !dbg !21 + %2038 = fadd float %.04.i862, 0xC168000FE0000000, !dbg !21 + %2039 = fneg float %2038, !dbg !21 + %2040 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i863 = icmp eq i32 %2040, 0, !dbg !21 + %2041 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3FF7154760000000, float %2039) #6, !dbg !21 + %2042 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3FF7154760000000, float %2039) #6, !dbg !21 + %.0.i864 = select i1 %.not3.i863, float %2042, float %2041, !dbg !21 + %2043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i865 = icmp eq i32 %2043, 0, !dbg !21 + %2044 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3E54AE0C00000000, float %.0.i864) #6, !dbg !21 + %2045 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3E54AE0C00000000, float %.0.i864) #6, !dbg !21 + %.01.i866 = select i1 %.not4.i865, float %2045, float %2044, !dbg !21 + %2046 = bitcast float %.04.i862 to i32, !dbg !21 + %2047 = shl i32 %2046, 23, !dbg !21 + %2048 = bitcast i32 %2047 to float, !dbg !21 + %2049 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i866) #6, !dbg !21 + %2050 = fmul float %2049, %2048, !dbg !21 + %2051 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i867 = icmp eq i32 %2051, 0, !dbg !21 + %2052 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2053 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i868 = select i1 %.not.i867, float %2053, float %2052, !dbg !21 + %2054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i869 = icmp eq i32 %2054, 0, !dbg !21 + %2055 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i868) #6, !dbg !21 + %2056 = tail call float @llvm.nvvm.saturate.f(float %.02.i868) #6, !dbg !21 + %.03.i870 = select i1 %.not1.i869, float %2056, float %2055, !dbg !21 + %2057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i871 = icmp eq i32 %2057, 0, !dbg !21 + %2058 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2059 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i872 = select i1 %.not2.i871, float %2059, float %2058, !dbg !21 + %2060 = fadd float %.04.i872, 0xC168000FE0000000, !dbg !21 + %2061 = fneg float %2060, !dbg !21 + %2062 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i873 = icmp eq i32 %2062, 0, !dbg !21 + %2063 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3FF7154760000000, float %2061) #6, !dbg !21 + %2064 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3FF7154760000000, float %2061) #6, !dbg !21 + %.0.i874 = select i1 %.not3.i873, float %2064, float %2063, !dbg !21 + %2065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i875 = icmp eq i32 %2065, 0, !dbg !21 + %2066 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3E54AE0C00000000, float %.0.i874) #6, !dbg !21 + %2067 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3E54AE0C00000000, float %.0.i874) #6, !dbg !21 + %.01.i876 = select i1 %.not4.i875, float %2067, float %2066, !dbg !21 + %2068 = bitcast float %.04.i872 to i32, !dbg !21 + %2069 = shl i32 %2068, 23, !dbg !21 + %2070 = bitcast i32 %2069 to float, !dbg !21 + %2071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i876) #6, !dbg !21 + %2072 = fmul float %2071, %2070, !dbg !21 + %2073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i877 = icmp eq i32 %2073, 0, !dbg !21 + %2074 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2075 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i878 = select i1 %.not.i877, float %2075, float %2074, !dbg !21 + %2076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i879 = icmp eq i32 %2076, 0, !dbg !21 + %2077 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i878) #6, !dbg !21 + %2078 = tail call float @llvm.nvvm.saturate.f(float %.02.i878) #6, !dbg !21 + %.03.i880 = select i1 %.not1.i879, float %2078, float %2077, !dbg !21 + %2079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i881 = icmp eq i32 %2079, 0, !dbg !21 + %2080 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2081 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i882 = select i1 %.not2.i881, float %2081, float %2080, !dbg !21 + %2082 = fadd float %.04.i882, 0xC168000FE0000000, !dbg !21 + %2083 = fneg float %2082, !dbg !21 + %2084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i883 = icmp eq i32 %2084, 0, !dbg !21 + %2085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3FF7154760000000, float %2083) #6, !dbg !21 + %2086 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3FF7154760000000, float %2083) #6, !dbg !21 + %.0.i884 = select i1 %.not3.i883, float %2086, float %2085, !dbg !21 + %2087 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i885 = icmp eq i32 %2087, 0, !dbg !21 + %2088 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3E54AE0C00000000, float %.0.i884) #6, !dbg !21 + %2089 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3E54AE0C00000000, float %.0.i884) #6, !dbg !21 + %.01.i886 = select i1 %.not4.i885, float %2089, float %2088, !dbg !21 + %2090 = bitcast float %.04.i882 to i32, !dbg !21 + %2091 = shl i32 %2090, 23, !dbg !21 + %2092 = bitcast i32 %2091 to float, !dbg !21 + %2093 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i886) #6, !dbg !21 + %2094 = fmul float %2093, %2092, !dbg !21 + %2095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i887 = icmp eq i32 %2095, 0, !dbg !21 + %2096 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2097 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i888 = select i1 %.not.i887, float %2097, float %2096, !dbg !21 + %2098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i889 = icmp eq i32 %2098, 0, !dbg !21 + %2099 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i888) #6, !dbg !21 + %2100 = tail call float @llvm.nvvm.saturate.f(float %.02.i888) #6, !dbg !21 + %.03.i890 = select i1 %.not1.i889, float %2100, float %2099, !dbg !21 + %2101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i891 = icmp eq i32 %2101, 0, !dbg !21 + %2102 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2103 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i892 = select i1 %.not2.i891, float %2103, float %2102, !dbg !21 + %2104 = fadd float %.04.i892, 0xC168000FE0000000, !dbg !21 + %2105 = fneg float %2104, !dbg !21 + %2106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i893 = icmp eq i32 %2106, 0, !dbg !21 + %2107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3FF7154760000000, float %2105) #6, !dbg !21 + %2108 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3FF7154760000000, float %2105) #6, !dbg !21 + %.0.i894 = select i1 %.not3.i893, float %2108, float %2107, !dbg !21 + %2109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i895 = icmp eq i32 %2109, 0, !dbg !21 + %2110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3E54AE0C00000000, float %.0.i894) #6, !dbg !21 + %2111 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3E54AE0C00000000, float %.0.i894) #6, !dbg !21 + %.01.i896 = select i1 %.not4.i895, float %2111, float %2110, !dbg !21 + %2112 = bitcast float %.04.i892 to i32, !dbg !21 + %2113 = shl i32 %2112, 23, !dbg !21 + %2114 = bitcast i32 %2113 to float, !dbg !21 + %2115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i896) #6, !dbg !21 + %2116 = fmul float %2115, %2114, !dbg !21 + %2117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i897 = icmp eq i32 %2117, 0, !dbg !21 + %2118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2119 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i898 = select i1 %.not.i897, float %2119, float %2118, !dbg !21 + %2120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i899 = icmp eq i32 %2120, 0, !dbg !21 + %2121 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i898) #6, !dbg !21 + %2122 = tail call float @llvm.nvvm.saturate.f(float %.02.i898) #6, !dbg !21 + %.03.i900 = select i1 %.not1.i899, float %2122, float %2121, !dbg !21 + %2123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i901 = icmp eq i32 %2123, 0, !dbg !21 + %2124 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2125 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i902 = select i1 %.not2.i901, float %2125, float %2124, !dbg !21 + %2126 = fadd float %.04.i902, 0xC168000FE0000000, !dbg !21 + %2127 = fneg float %2126, !dbg !21 + %2128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i903 = icmp eq i32 %2128, 0, !dbg !21 + %2129 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3FF7154760000000, float %2127) #6, !dbg !21 + %2130 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3FF7154760000000, float %2127) #6, !dbg !21 + %.0.i904 = select i1 %.not3.i903, float %2130, float %2129, !dbg !21 + %2131 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i905 = icmp eq i32 %2131, 0, !dbg !21 + %2132 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3E54AE0C00000000, float %.0.i904) #6, !dbg !21 + %2133 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3E54AE0C00000000, float %.0.i904) #6, !dbg !21 + %.01.i906 = select i1 %.not4.i905, float %2133, float %2132, !dbg !21 + %2134 = bitcast float %.04.i902 to i32, !dbg !21 + %2135 = shl i32 %2134, 23, !dbg !21 + %2136 = bitcast i32 %2135 to float, !dbg !21 + %2137 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i906) #6, !dbg !21 + %2138 = fmul float %2137, %2136, !dbg !21 + %2139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i907 = icmp eq i32 %2139, 0, !dbg !21 + %2140 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2141 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i908 = select i1 %.not.i907, float %2141, float %2140, !dbg !21 + %2142 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i909 = icmp eq i32 %2142, 0, !dbg !21 + %2143 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i908) #6, !dbg !21 + %2144 = tail call float @llvm.nvvm.saturate.f(float %.02.i908) #6, !dbg !21 + %.03.i910 = select i1 %.not1.i909, float %2144, float %2143, !dbg !21 + %2145 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i911 = icmp eq i32 %2145, 0, !dbg !21 + %2146 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2147 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i912 = select i1 %.not2.i911, float %2147, float %2146, !dbg !21 + %2148 = fadd float %.04.i912, 0xC168000FE0000000, !dbg !21 + %2149 = fneg float %2148, !dbg !21 + %2150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i913 = icmp eq i32 %2150, 0, !dbg !21 + %2151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3FF7154760000000, float %2149) #6, !dbg !21 + %2152 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3FF7154760000000, float %2149) #6, !dbg !21 + %.0.i914 = select i1 %.not3.i913, float %2152, float %2151, !dbg !21 + %2153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i915 = icmp eq i32 %2153, 0, !dbg !21 + %2154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3E54AE0C00000000, float %.0.i914) #6, !dbg !21 + %2155 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3E54AE0C00000000, float %.0.i914) #6, !dbg !21 + %.01.i916 = select i1 %.not4.i915, float %2155, float %2154, !dbg !21 + %2156 = bitcast float %.04.i912 to i32, !dbg !21 + %2157 = shl i32 %2156, 23, !dbg !21 + %2158 = bitcast i32 %2157 to float, !dbg !21 + %2159 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i916) #6, !dbg !21 + %2160 = fmul float %2159, %2158, !dbg !21 + %2161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i917 = icmp eq i32 %2161, 0, !dbg !21 + %2162 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2163 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i918 = select i1 %.not.i917, float %2163, float %2162, !dbg !21 + %2164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i919 = icmp eq i32 %2164, 0, !dbg !21 + %2165 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i918) #6, !dbg !21 + %2166 = tail call float @llvm.nvvm.saturate.f(float %.02.i918) #6, !dbg !21 + %.03.i920 = select i1 %.not1.i919, float %2166, float %2165, !dbg !21 + %2167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i921 = icmp eq i32 %2167, 0, !dbg !21 + %2168 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2169 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i922 = select i1 %.not2.i921, float %2169, float %2168, !dbg !21 + %2170 = fadd float %.04.i922, 0xC168000FE0000000, !dbg !21 + %2171 = fneg float %2170, !dbg !21 + %2172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i923 = icmp eq i32 %2172, 0, !dbg !21 + %2173 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3FF7154760000000, float %2171) #6, !dbg !21 + %2174 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3FF7154760000000, float %2171) #6, !dbg !21 + %.0.i924 = select i1 %.not3.i923, float %2174, float %2173, !dbg !21 + %2175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i925 = icmp eq i32 %2175, 0, !dbg !21 + %2176 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3E54AE0C00000000, float %.0.i924) #6, !dbg !21 + %2177 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3E54AE0C00000000, float %.0.i924) #6, !dbg !21 + %.01.i926 = select i1 %.not4.i925, float %2177, float %2176, !dbg !21 + %2178 = bitcast float %.04.i922 to i32, !dbg !21 + %2179 = shl i32 %2178, 23, !dbg !21 + %2180 = bitcast i32 %2179 to float, !dbg !21 + %2181 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i926) #6, !dbg !21 + %2182 = fmul float %2181, %2180, !dbg !21 + %2183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i927 = icmp eq i32 %2183, 0, !dbg !21 + %2184 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2185 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i928 = select i1 %.not.i927, float %2185, float %2184, !dbg !21 + %2186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i929 = icmp eq i32 %2186, 0, !dbg !21 + %2187 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i928) #6, !dbg !21 + %2188 = tail call float @llvm.nvvm.saturate.f(float %.02.i928) #6, !dbg !21 + %.03.i930 = select i1 %.not1.i929, float %2188, float %2187, !dbg !21 + %2189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i931 = icmp eq i32 %2189, 0, !dbg !21 + %2190 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2191 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i932 = select i1 %.not2.i931, float %2191, float %2190, !dbg !21 + %2192 = fadd float %.04.i932, 0xC168000FE0000000, !dbg !21 + %2193 = fneg float %2192, !dbg !21 + %2194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i933 = icmp eq i32 %2194, 0, !dbg !21 + %2195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3FF7154760000000, float %2193) #6, !dbg !21 + %2196 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3FF7154760000000, float %2193) #6, !dbg !21 + %.0.i934 = select i1 %.not3.i933, float %2196, float %2195, !dbg !21 + %2197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i935 = icmp eq i32 %2197, 0, !dbg !21 + %2198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3E54AE0C00000000, float %.0.i934) #6, !dbg !21 + %2199 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3E54AE0C00000000, float %.0.i934) #6, !dbg !21 + %.01.i936 = select i1 %.not4.i935, float %2199, float %2198, !dbg !21 + %2200 = bitcast float %.04.i932 to i32, !dbg !21 + %2201 = shl i32 %2200, 23, !dbg !21 + %2202 = bitcast i32 %2201 to float, !dbg !21 + %2203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i936) #6, !dbg !21 + %2204 = fmul float %2203, %2202, !dbg !21 + %2205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i937 = icmp eq i32 %2205, 0, !dbg !21 + %2206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2207 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i938 = select i1 %.not.i937, float %2207, float %2206, !dbg !21 + %2208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i939 = icmp eq i32 %2208, 0, !dbg !21 + %2209 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i938) #6, !dbg !21 + %2210 = tail call float @llvm.nvvm.saturate.f(float %.02.i938) #6, !dbg !21 + %.03.i940 = select i1 %.not1.i939, float %2210, float %2209, !dbg !21 + %2211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i941 = icmp eq i32 %2211, 0, !dbg !21 + %2212 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2213 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i942 = select i1 %.not2.i941, float %2213, float %2212, !dbg !21 + %2214 = fadd float %.04.i942, 0xC168000FE0000000, !dbg !21 + %2215 = fneg float %2214, !dbg !21 + %2216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i943 = icmp eq i32 %2216, 0, !dbg !21 + %2217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3FF7154760000000, float %2215) #6, !dbg !21 + %2218 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3FF7154760000000, float %2215) #6, !dbg !21 + %.0.i944 = select i1 %.not3.i943, float %2218, float %2217, !dbg !21 + %2219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i945 = icmp eq i32 %2219, 0, !dbg !21 + %2220 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3E54AE0C00000000, float %.0.i944) #6, !dbg !21 + %2221 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3E54AE0C00000000, float %.0.i944) #6, !dbg !21 + %.01.i946 = select i1 %.not4.i945, float %2221, float %2220, !dbg !21 + %2222 = bitcast float %.04.i942 to i32, !dbg !21 + %2223 = shl i32 %2222, 23, !dbg !21 + %2224 = bitcast i32 %2223 to float, !dbg !21 + %2225 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i946) #6, !dbg !21 + %2226 = fmul float %2225, %2224, !dbg !21 + %2227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i947 = icmp eq i32 %2227, 0, !dbg !21 + %2228 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2229 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i948 = select i1 %.not.i947, float %2229, float %2228, !dbg !21 + %2230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i949 = icmp eq i32 %2230, 0, !dbg !21 + %2231 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i948) #6, !dbg !21 + %2232 = tail call float @llvm.nvvm.saturate.f(float %.02.i948) #6, !dbg !21 + %.03.i950 = select i1 %.not1.i949, float %2232, float %2231, !dbg !21 + %2233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i951 = icmp eq i32 %2233, 0, !dbg !21 + %2234 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2235 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i952 = select i1 %.not2.i951, float %2235, float %2234, !dbg !21 + %2236 = fadd float %.04.i952, 0xC168000FE0000000, !dbg !21 + %2237 = fneg float %2236, !dbg !21 + %2238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i953 = icmp eq i32 %2238, 0, !dbg !21 + %2239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3FF7154760000000, float %2237) #6, !dbg !21 + %2240 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3FF7154760000000, float %2237) #6, !dbg !21 + %.0.i954 = select i1 %.not3.i953, float %2240, float %2239, !dbg !21 + %2241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i955 = icmp eq i32 %2241, 0, !dbg !21 + %2242 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3E54AE0C00000000, float %.0.i954) #6, !dbg !21 + %2243 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3E54AE0C00000000, float %.0.i954) #6, !dbg !21 + %.01.i956 = select i1 %.not4.i955, float %2243, float %2242, !dbg !21 + %2244 = bitcast float %.04.i952 to i32, !dbg !21 + %2245 = shl i32 %2244, 23, !dbg !21 + %2246 = bitcast i32 %2245 to float, !dbg !21 + %2247 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i956) #6, !dbg !21 + %2248 = fmul float %2247, %2246, !dbg !21 + %2249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i957 = icmp eq i32 %2249, 0, !dbg !21 + %2250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2251 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i958 = select i1 %.not.i957, float %2251, float %2250, !dbg !21 + %2252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i959 = icmp eq i32 %2252, 0, !dbg !21 + %2253 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i958) #6, !dbg !21 + %2254 = tail call float @llvm.nvvm.saturate.f(float %.02.i958) #6, !dbg !21 + %.03.i960 = select i1 %.not1.i959, float %2254, float %2253, !dbg !21 + %2255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i961 = icmp eq i32 %2255, 0, !dbg !21 + %2256 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2257 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i962 = select i1 %.not2.i961, float %2257, float %2256, !dbg !21 + %2258 = fadd float %.04.i962, 0xC168000FE0000000, !dbg !21 + %2259 = fneg float %2258, !dbg !21 + %2260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i963 = icmp eq i32 %2260, 0, !dbg !21 + %2261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3FF7154760000000, float %2259) #6, !dbg !21 + %2262 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3FF7154760000000, float %2259) #6, !dbg !21 + %.0.i964 = select i1 %.not3.i963, float %2262, float %2261, !dbg !21 + %2263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i965 = icmp eq i32 %2263, 0, !dbg !21 + %2264 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3E54AE0C00000000, float %.0.i964) #6, !dbg !21 + %2265 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3E54AE0C00000000, float %.0.i964) #6, !dbg !21 + %.01.i966 = select i1 %.not4.i965, float %2265, float %2264, !dbg !21 + %2266 = bitcast float %.04.i962 to i32, !dbg !21 + %2267 = shl i32 %2266, 23, !dbg !21 + %2268 = bitcast i32 %2267 to float, !dbg !21 + %2269 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i966) #6, !dbg !21 + %2270 = fmul float %2269, %2268, !dbg !21 + %2271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i967 = icmp eq i32 %2271, 0, !dbg !21 + %2272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2273 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i968 = select i1 %.not.i967, float %2273, float %2272, !dbg !21 + %2274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i969 = icmp eq i32 %2274, 0, !dbg !21 + %2275 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i968) #6, !dbg !21 + %2276 = tail call float @llvm.nvvm.saturate.f(float %.02.i968) #6, !dbg !21 + %.03.i970 = select i1 %.not1.i969, float %2276, float %2275, !dbg !21 + %2277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i971 = icmp eq i32 %2277, 0, !dbg !21 + %2278 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2279 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i972 = select i1 %.not2.i971, float %2279, float %2278, !dbg !21 + %2280 = fadd float %.04.i972, 0xC168000FE0000000, !dbg !21 + %2281 = fneg float %2280, !dbg !21 + %2282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i973 = icmp eq i32 %2282, 0, !dbg !21 + %2283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3FF7154760000000, float %2281) #6, !dbg !21 + %2284 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3FF7154760000000, float %2281) #6, !dbg !21 + %.0.i974 = select i1 %.not3.i973, float %2284, float %2283, !dbg !21 + %2285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i975 = icmp eq i32 %2285, 0, !dbg !21 + %2286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3E54AE0C00000000, float %.0.i974) #6, !dbg !21 + %2287 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3E54AE0C00000000, float %.0.i974) #6, !dbg !21 + %.01.i976 = select i1 %.not4.i975, float %2287, float %2286, !dbg !21 + %2288 = bitcast float %.04.i972 to i32, !dbg !21 + %2289 = shl i32 %2288, 23, !dbg !21 + %2290 = bitcast i32 %2289 to float, !dbg !21 + %2291 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i976) #6, !dbg !21 + %2292 = fmul float %2291, %2290, !dbg !21 + %2293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i977 = icmp eq i32 %2293, 0, !dbg !21 + %2294 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2295 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i978 = select i1 %.not.i977, float %2295, float %2294, !dbg !21 + %2296 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i979 = icmp eq i32 %2296, 0, !dbg !21 + %2297 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i978) #6, !dbg !21 + %2298 = tail call float @llvm.nvvm.saturate.f(float %.02.i978) #6, !dbg !21 + %.03.i980 = select i1 %.not1.i979, float %2298, float %2297, !dbg !21 + %2299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i981 = icmp eq i32 %2299, 0, !dbg !21 + %2300 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2301 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i982 = select i1 %.not2.i981, float %2301, float %2300, !dbg !21 + %2302 = fadd float %.04.i982, 0xC168000FE0000000, !dbg !21 + %2303 = fneg float %2302, !dbg !21 + %2304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i983 = icmp eq i32 %2304, 0, !dbg !21 + %2305 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3FF7154760000000, float %2303) #6, !dbg !21 + %2306 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3FF7154760000000, float %2303) #6, !dbg !21 + %.0.i984 = select i1 %.not3.i983, float %2306, float %2305, !dbg !21 + %2307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i985 = icmp eq i32 %2307, 0, !dbg !21 + %2308 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3E54AE0C00000000, float %.0.i984) #6, !dbg !21 + %2309 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3E54AE0C00000000, float %.0.i984) #6, !dbg !21 + %.01.i986 = select i1 %.not4.i985, float %2309, float %2308, !dbg !21 + %2310 = bitcast float %.04.i982 to i32, !dbg !21 + %2311 = shl i32 %2310, 23, !dbg !21 + %2312 = bitcast i32 %2311 to float, !dbg !21 + %2313 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i986) #6, !dbg !21 + %2314 = fmul float %2313, %2312, !dbg !21 + %2315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i987 = icmp eq i32 %2315, 0, !dbg !21 + %2316 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2317 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i988 = select i1 %.not.i987, float %2317, float %2316, !dbg !21 + %2318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i989 = icmp eq i32 %2318, 0, !dbg !21 + %2319 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i988) #6, !dbg !21 + %2320 = tail call float @llvm.nvvm.saturate.f(float %.02.i988) #6, !dbg !21 + %.03.i990 = select i1 %.not1.i989, float %2320, float %2319, !dbg !21 + %2321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i991 = icmp eq i32 %2321, 0, !dbg !21 + %2322 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2323 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i992 = select i1 %.not2.i991, float %2323, float %2322, !dbg !21 + %2324 = fadd float %.04.i992, 0xC168000FE0000000, !dbg !21 + %2325 = fneg float %2324, !dbg !21 + %2326 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i993 = icmp eq i32 %2326, 0, !dbg !21 + %2327 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3FF7154760000000, float %2325) #6, !dbg !21 + %2328 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3FF7154760000000, float %2325) #6, !dbg !21 + %.0.i994 = select i1 %.not3.i993, float %2328, float %2327, !dbg !21 + %2329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i995 = icmp eq i32 %2329, 0, !dbg !21 + %2330 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3E54AE0C00000000, float %.0.i994) #6, !dbg !21 + %2331 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3E54AE0C00000000, float %.0.i994) #6, !dbg !21 + %.01.i996 = select i1 %.not4.i995, float %2331, float %2330, !dbg !21 + %2332 = bitcast float %.04.i992 to i32, !dbg !21 + %2333 = shl i32 %2332, 23, !dbg !21 + %2334 = bitcast i32 %2333 to float, !dbg !21 + %2335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i996) #6, !dbg !21 + %2336 = fmul float %2335, %2334, !dbg !21 + %2337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i997 = icmp eq i32 %2337, 0, !dbg !21 + %2338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2339 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i998 = select i1 %.not.i997, float %2339, float %2338, !dbg !21 + %2340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i999 = icmp eq i32 %2340, 0, !dbg !21 + %2341 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i998) #6, !dbg !21 + %2342 = tail call float @llvm.nvvm.saturate.f(float %.02.i998) #6, !dbg !21 + %.03.i1000 = select i1 %.not1.i999, float %2342, float %2341, !dbg !21 + %2343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1001 = icmp eq i32 %2343, 0, !dbg !21 + %2344 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2345 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1002 = select i1 %.not2.i1001, float %2345, float %2344, !dbg !21 + %2346 = fadd float %.04.i1002, 0xC168000FE0000000, !dbg !21 + %2347 = fneg float %2346, !dbg !21 + %2348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1003 = icmp eq i32 %2348, 0, !dbg !21 + %2349 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3FF7154760000000, float %2347) #6, !dbg !21 + %2350 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3FF7154760000000, float %2347) #6, !dbg !21 + %.0.i1004 = select i1 %.not3.i1003, float %2350, float %2349, !dbg !21 + %2351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1005 = icmp eq i32 %2351, 0, !dbg !21 + %2352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3E54AE0C00000000, float %.0.i1004) #6, !dbg !21 + %2353 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3E54AE0C00000000, float %.0.i1004) #6, !dbg !21 + %.01.i1006 = select i1 %.not4.i1005, float %2353, float %2352, !dbg !21 + %2354 = bitcast float %.04.i1002 to i32, !dbg !21 + %2355 = shl i32 %2354, 23, !dbg !21 + %2356 = bitcast i32 %2355 to float, !dbg !21 + %2357 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1006) #6, !dbg !21 + %2358 = fmul float %2357, %2356, !dbg !21 + %2359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1007 = icmp eq i32 %2359, 0, !dbg !21 + %2360 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2361 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1008 = select i1 %.not.i1007, float %2361, float %2360, !dbg !21 + %2362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1009 = icmp eq i32 %2362, 0, !dbg !21 + %2363 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1008) #6, !dbg !21 + %2364 = tail call float @llvm.nvvm.saturate.f(float %.02.i1008) #6, !dbg !21 + %.03.i1010 = select i1 %.not1.i1009, float %2364, float %2363, !dbg !21 + %2365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1011 = icmp eq i32 %2365, 0, !dbg !21 + %2366 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2367 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1012 = select i1 %.not2.i1011, float %2367, float %2366, !dbg !21 + %2368 = fadd float %.04.i1012, 0xC168000FE0000000, !dbg !21 + %2369 = fneg float %2368, !dbg !21 + %2370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1013 = icmp eq i32 %2370, 0, !dbg !21 + %2371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3FF7154760000000, float %2369) #6, !dbg !21 + %2372 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3FF7154760000000, float %2369) #6, !dbg !21 + %.0.i1014 = select i1 %.not3.i1013, float %2372, float %2371, !dbg !21 + %2373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1015 = icmp eq i32 %2373, 0, !dbg !21 + %2374 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3E54AE0C00000000, float %.0.i1014) #6, !dbg !21 + %2375 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3E54AE0C00000000, float %.0.i1014) #6, !dbg !21 + %.01.i1016 = select i1 %.not4.i1015, float %2375, float %2374, !dbg !21 + %2376 = bitcast float %.04.i1012 to i32, !dbg !21 + %2377 = shl i32 %2376, 23, !dbg !21 + %2378 = bitcast i32 %2377 to float, !dbg !21 + %2379 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1016) #6, !dbg !21 + %2380 = fmul float %2379, %2378, !dbg !21 + %2381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1017 = icmp eq i32 %2381, 0, !dbg !21 + %2382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2383 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1018 = select i1 %.not.i1017, float %2383, float %2382, !dbg !21 + %2384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1019 = icmp eq i32 %2384, 0, !dbg !21 + %2385 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1018) #6, !dbg !21 + %2386 = tail call float @llvm.nvvm.saturate.f(float %.02.i1018) #6, !dbg !21 + %.03.i1020 = select i1 %.not1.i1019, float %2386, float %2385, !dbg !21 + %2387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1021 = icmp eq i32 %2387, 0, !dbg !21 + %2388 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2389 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1022 = select i1 %.not2.i1021, float %2389, float %2388, !dbg !21 + %2390 = fadd float %.04.i1022, 0xC168000FE0000000, !dbg !21 + %2391 = fneg float %2390, !dbg !21 + %2392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1023 = icmp eq i32 %2392, 0, !dbg !21 + %2393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3FF7154760000000, float %2391) #6, !dbg !21 + %2394 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3FF7154760000000, float %2391) #6, !dbg !21 + %.0.i1024 = select i1 %.not3.i1023, float %2394, float %2393, !dbg !21 + %2395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1025 = icmp eq i32 %2395, 0, !dbg !21 + %2396 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3E54AE0C00000000, float %.0.i1024) #6, !dbg !21 + %2397 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3E54AE0C00000000, float %.0.i1024) #6, !dbg !21 + %.01.i1026 = select i1 %.not4.i1025, float %2397, float %2396, !dbg !21 + %2398 = bitcast float %.04.i1022 to i32, !dbg !21 + %2399 = shl i32 %2398, 23, !dbg !21 + %2400 = bitcast i32 %2399 to float, !dbg !21 + %2401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1026) #6, !dbg !21 + %2402 = fmul float %2401, %2400, !dbg !21 + %2403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1027 = icmp eq i32 %2403, 0, !dbg !21 + %2404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2405 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1028 = select i1 %.not.i1027, float %2405, float %2404, !dbg !21 + %2406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1029 = icmp eq i32 %2406, 0, !dbg !21 + %2407 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1028) #6, !dbg !21 + %2408 = tail call float @llvm.nvvm.saturate.f(float %.02.i1028) #6, !dbg !21 + %.03.i1030 = select i1 %.not1.i1029, float %2408, float %2407, !dbg !21 + %2409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1031 = icmp eq i32 %2409, 0, !dbg !21 + %2410 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2411 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1032 = select i1 %.not2.i1031, float %2411, float %2410, !dbg !21 + %2412 = fadd float %.04.i1032, 0xC168000FE0000000, !dbg !21 + %2413 = fneg float %2412, !dbg !21 + %2414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1033 = icmp eq i32 %2414, 0, !dbg !21 + %2415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3FF7154760000000, float %2413) #6, !dbg !21 + %2416 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3FF7154760000000, float %2413) #6, !dbg !21 + %.0.i1034 = select i1 %.not3.i1033, float %2416, float %2415, !dbg !21 + %2417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1035 = icmp eq i32 %2417, 0, !dbg !21 + %2418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3E54AE0C00000000, float %.0.i1034) #6, !dbg !21 + %2419 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3E54AE0C00000000, float %.0.i1034) #6, !dbg !21 + %.01.i1036 = select i1 %.not4.i1035, float %2419, float %2418, !dbg !21 + %2420 = bitcast float %.04.i1032 to i32, !dbg !21 + %2421 = shl i32 %2420, 23, !dbg !21 + %2422 = bitcast i32 %2421 to float, !dbg !21 + %2423 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1036) #6, !dbg !21 + %2424 = fmul float %2423, %2422, !dbg !21 + %2425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1037 = icmp eq i32 %2425, 0, !dbg !21 + %2426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2427 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1038 = select i1 %.not.i1037, float %2427, float %2426, !dbg !21 + %2428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1039 = icmp eq i32 %2428, 0, !dbg !21 + %2429 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1038) #6, !dbg !21 + %2430 = tail call float @llvm.nvvm.saturate.f(float %.02.i1038) #6, !dbg !21 + %.03.i1040 = select i1 %.not1.i1039, float %2430, float %2429, !dbg !21 + %2431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1041 = icmp eq i32 %2431, 0, !dbg !21 + %2432 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2433 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1042 = select i1 %.not2.i1041, float %2433, float %2432, !dbg !21 + %2434 = fadd float %.04.i1042, 0xC168000FE0000000, !dbg !21 + %2435 = fneg float %2434, !dbg !21 + %2436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1043 = icmp eq i32 %2436, 0, !dbg !21 + %2437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3FF7154760000000, float %2435) #6, !dbg !21 + %2438 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3FF7154760000000, float %2435) #6, !dbg !21 + %.0.i1044 = select i1 %.not3.i1043, float %2438, float %2437, !dbg !21 + %2439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1045 = icmp eq i32 %2439, 0, !dbg !21 + %2440 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3E54AE0C00000000, float %.0.i1044) #6, !dbg !21 + %2441 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3E54AE0C00000000, float %.0.i1044) #6, !dbg !21 + %.01.i1046 = select i1 %.not4.i1045, float %2441, float %2440, !dbg !21 + %2442 = bitcast float %.04.i1042 to i32, !dbg !21 + %2443 = shl i32 %2442, 23, !dbg !21 + %2444 = bitcast i32 %2443 to float, !dbg !21 + %2445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1046) #6, !dbg !21 + %2446 = fmul float %2445, %2444, !dbg !21 + %2447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1047 = icmp eq i32 %2447, 0, !dbg !21 + %2448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2449 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1048 = select i1 %.not.i1047, float %2449, float %2448, !dbg !21 + %2450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1049 = icmp eq i32 %2450, 0, !dbg !21 + %2451 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1048) #6, !dbg !21 + %2452 = tail call float @llvm.nvvm.saturate.f(float %.02.i1048) #6, !dbg !21 + %.03.i1050 = select i1 %.not1.i1049, float %2452, float %2451, !dbg !21 + %2453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1051 = icmp eq i32 %2453, 0, !dbg !21 + %2454 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2455 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1052 = select i1 %.not2.i1051, float %2455, float %2454, !dbg !21 + %2456 = fadd float %.04.i1052, 0xC168000FE0000000, !dbg !21 + %2457 = fneg float %2456, !dbg !21 + %2458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1053 = icmp eq i32 %2458, 0, !dbg !21 + %2459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3FF7154760000000, float %2457) #6, !dbg !21 + %2460 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3FF7154760000000, float %2457) #6, !dbg !21 + %.0.i1054 = select i1 %.not3.i1053, float %2460, float %2459, !dbg !21 + %2461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1055 = icmp eq i32 %2461, 0, !dbg !21 + %2462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3E54AE0C00000000, float %.0.i1054) #6, !dbg !21 + %2463 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3E54AE0C00000000, float %.0.i1054) #6, !dbg !21 + %.01.i1056 = select i1 %.not4.i1055, float %2463, float %2462, !dbg !21 + %2464 = bitcast float %.04.i1052 to i32, !dbg !21 + %2465 = shl i32 %2464, 23, !dbg !21 + %2466 = bitcast i32 %2465 to float, !dbg !21 + %2467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1056) #6, !dbg !21 + %2468 = fmul float %2467, %2466, !dbg !21 + %2469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1057 = icmp eq i32 %2469, 0, !dbg !21 + %2470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2471 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1058 = select i1 %.not.i1057, float %2471, float %2470, !dbg !21 + %2472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1059 = icmp eq i32 %2472, 0, !dbg !21 + %2473 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1058) #6, !dbg !21 + %2474 = tail call float @llvm.nvvm.saturate.f(float %.02.i1058) #6, !dbg !21 + %.03.i1060 = select i1 %.not1.i1059, float %2474, float %2473, !dbg !21 + %2475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1061 = icmp eq i32 %2475, 0, !dbg !21 + %2476 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2477 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1062 = select i1 %.not2.i1061, float %2477, float %2476, !dbg !21 + %2478 = fadd float %.04.i1062, 0xC168000FE0000000, !dbg !21 + %2479 = fneg float %2478, !dbg !21 + %2480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1063 = icmp eq i32 %2480, 0, !dbg !21 + %2481 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3FF7154760000000, float %2479) #6, !dbg !21 + %2482 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3FF7154760000000, float %2479) #6, !dbg !21 + %.0.i1064 = select i1 %.not3.i1063, float %2482, float %2481, !dbg !21 + %2483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1065 = icmp eq i32 %2483, 0, !dbg !21 + %2484 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3E54AE0C00000000, float %.0.i1064) #6, !dbg !21 + %2485 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3E54AE0C00000000, float %.0.i1064) #6, !dbg !21 + %.01.i1066 = select i1 %.not4.i1065, float %2485, float %2484, !dbg !21 + %2486 = bitcast float %.04.i1062 to i32, !dbg !21 + %2487 = shl i32 %2486, 23, !dbg !21 + %2488 = bitcast i32 %2487 to float, !dbg !21 + %2489 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1066) #6, !dbg !21 + %2490 = fmul float %2489, %2488, !dbg !21 + %2491 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1067 = icmp eq i32 %2491, 0, !dbg !21 + %2492 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2493 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1068 = select i1 %.not.i1067, float %2493, float %2492, !dbg !21 + %2494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1069 = icmp eq i32 %2494, 0, !dbg !21 + %2495 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1068) #6, !dbg !21 + %2496 = tail call float @llvm.nvvm.saturate.f(float %.02.i1068) #6, !dbg !21 + %.03.i1070 = select i1 %.not1.i1069, float %2496, float %2495, !dbg !21 + %2497 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1071 = icmp eq i32 %2497, 0, !dbg !21 + %2498 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2499 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1072 = select i1 %.not2.i1071, float %2499, float %2498, !dbg !21 + %2500 = fadd float %.04.i1072, 0xC168000FE0000000, !dbg !21 + %2501 = fneg float %2500, !dbg !21 + %2502 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1073 = icmp eq i32 %2502, 0, !dbg !21 + %2503 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3FF7154760000000, float %2501) #6, !dbg !21 + %2504 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3FF7154760000000, float %2501) #6, !dbg !21 + %.0.i1074 = select i1 %.not3.i1073, float %2504, float %2503, !dbg !21 + %2505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1075 = icmp eq i32 %2505, 0, !dbg !21 + %2506 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3E54AE0C00000000, float %.0.i1074) #6, !dbg !21 + %2507 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3E54AE0C00000000, float %.0.i1074) #6, !dbg !21 + %.01.i1076 = select i1 %.not4.i1075, float %2507, float %2506, !dbg !21 + %2508 = bitcast float %.04.i1072 to i32, !dbg !21 + %2509 = shl i32 %2508, 23, !dbg !21 + %2510 = bitcast i32 %2509 to float, !dbg !21 + %2511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1076) #6, !dbg !21 + %2512 = fmul float %2511, %2510, !dbg !21 + %2513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1077 = icmp eq i32 %2513, 0, !dbg !21 + %2514 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2515 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1078 = select i1 %.not.i1077, float %2515, float %2514, !dbg !21 + %2516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1079 = icmp eq i32 %2516, 0, !dbg !21 + %2517 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1078) #6, !dbg !21 + %2518 = tail call float @llvm.nvvm.saturate.f(float %.02.i1078) #6, !dbg !21 + %.03.i1080 = select i1 %.not1.i1079, float %2518, float %2517, !dbg !21 + %2519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1081 = icmp eq i32 %2519, 0, !dbg !21 + %2520 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2521 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1082 = select i1 %.not2.i1081, float %2521, float %2520, !dbg !21 + %2522 = fadd float %.04.i1082, 0xC168000FE0000000, !dbg !21 + %2523 = fneg float %2522, !dbg !21 + %2524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1083 = icmp eq i32 %2524, 0, !dbg !21 + %2525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3FF7154760000000, float %2523) #6, !dbg !21 + %2526 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3FF7154760000000, float %2523) #6, !dbg !21 + %.0.i1084 = select i1 %.not3.i1083, float %2526, float %2525, !dbg !21 + %2527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1085 = icmp eq i32 %2527, 0, !dbg !21 + %2528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3E54AE0C00000000, float %.0.i1084) #6, !dbg !21 + %2529 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3E54AE0C00000000, float %.0.i1084) #6, !dbg !21 + %.01.i1086 = select i1 %.not4.i1085, float %2529, float %2528, !dbg !21 + %2530 = bitcast float %.04.i1082 to i32, !dbg !21 + %2531 = shl i32 %2530, 23, !dbg !21 + %2532 = bitcast i32 %2531 to float, !dbg !21 + %2533 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1086) #6, !dbg !21 + %2534 = fmul float %2533, %2532, !dbg !21 + %2535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1087 = icmp eq i32 %2535, 0, !dbg !21 + %2536 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2537 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1088 = select i1 %.not.i1087, float %2537, float %2536, !dbg !21 + %2538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1089 = icmp eq i32 %2538, 0, !dbg !21 + %2539 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1088) #6, !dbg !21 + %2540 = tail call float @llvm.nvvm.saturate.f(float %.02.i1088) #6, !dbg !21 + %.03.i1090 = select i1 %.not1.i1089, float %2540, float %2539, !dbg !21 + %2541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1091 = icmp eq i32 %2541, 0, !dbg !21 + %2542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1092 = select i1 %.not2.i1091, float %2543, float %2542, !dbg !21 + %2544 = fadd float %.04.i1092, 0xC168000FE0000000, !dbg !21 + %2545 = fneg float %2544, !dbg !21 + %2546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1093 = icmp eq i32 %2546, 0, !dbg !21 + %2547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3FF7154760000000, float %2545) #6, !dbg !21 + %2548 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3FF7154760000000, float %2545) #6, !dbg !21 + %.0.i1094 = select i1 %.not3.i1093, float %2548, float %2547, !dbg !21 + %2549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1095 = icmp eq i32 %2549, 0, !dbg !21 + %2550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3E54AE0C00000000, float %.0.i1094) #6, !dbg !21 + %2551 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3E54AE0C00000000, float %.0.i1094) #6, !dbg !21 + %.01.i1096 = select i1 %.not4.i1095, float %2551, float %2550, !dbg !21 + %2552 = bitcast float %.04.i1092 to i32, !dbg !21 + %2553 = shl i32 %2552, 23, !dbg !21 + %2554 = bitcast i32 %2553 to float, !dbg !21 + %2555 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1096) #6, !dbg !21 + %2556 = fmul float %2555, %2554, !dbg !21 + %2557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1097 = icmp eq i32 %2557, 0, !dbg !21 + %2558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2559 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1098 = select i1 %.not.i1097, float %2559, float %2558, !dbg !21 + %2560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1099 = icmp eq i32 %2560, 0, !dbg !21 + %2561 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1098) #6, !dbg !21 + %2562 = tail call float @llvm.nvvm.saturate.f(float %.02.i1098) #6, !dbg !21 + %.03.i1100 = select i1 %.not1.i1099, float %2562, float %2561, !dbg !21 + %2563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1101 = icmp eq i32 %2563, 0, !dbg !21 + %2564 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2565 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1102 = select i1 %.not2.i1101, float %2565, float %2564, !dbg !21 + %2566 = fadd float %.04.i1102, 0xC168000FE0000000, !dbg !21 + %2567 = fneg float %2566, !dbg !21 + %2568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1103 = icmp eq i32 %2568, 0, !dbg !21 + %2569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3FF7154760000000, float %2567) #6, !dbg !21 + %2570 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3FF7154760000000, float %2567) #6, !dbg !21 + %.0.i1104 = select i1 %.not3.i1103, float %2570, float %2569, !dbg !21 + %2571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1105 = icmp eq i32 %2571, 0, !dbg !21 + %2572 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3E54AE0C00000000, float %.0.i1104) #6, !dbg !21 + %2573 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3E54AE0C00000000, float %.0.i1104) #6, !dbg !21 + %.01.i1106 = select i1 %.not4.i1105, float %2573, float %2572, !dbg !21 + %2574 = bitcast float %.04.i1102 to i32, !dbg !21 + %2575 = shl i32 %2574, 23, !dbg !21 + %2576 = bitcast i32 %2575 to float, !dbg !21 + %2577 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1106) #6, !dbg !21 + %2578 = fmul float %2577, %2576, !dbg !21 + %2579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1107 = icmp eq i32 %2579, 0, !dbg !21 + %2580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2581 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1108 = select i1 %.not.i1107, float %2581, float %2580, !dbg !21 + %2582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1109 = icmp eq i32 %2582, 0, !dbg !21 + %2583 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1108) #6, !dbg !21 + %2584 = tail call float @llvm.nvvm.saturate.f(float %.02.i1108) #6, !dbg !21 + %.03.i1110 = select i1 %.not1.i1109, float %2584, float %2583, !dbg !21 + %2585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1111 = icmp eq i32 %2585, 0, !dbg !21 + %2586 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2587 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1112 = select i1 %.not2.i1111, float %2587, float %2586, !dbg !21 + %2588 = fadd float %.04.i1112, 0xC168000FE0000000, !dbg !21 + %2589 = fneg float %2588, !dbg !21 + %2590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1113 = icmp eq i32 %2590, 0, !dbg !21 + %2591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3FF7154760000000, float %2589) #6, !dbg !21 + %2592 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3FF7154760000000, float %2589) #6, !dbg !21 + %.0.i1114 = select i1 %.not3.i1113, float %2592, float %2591, !dbg !21 + %2593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1115 = icmp eq i32 %2593, 0, !dbg !21 + %2594 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3E54AE0C00000000, float %.0.i1114) #6, !dbg !21 + %2595 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3E54AE0C00000000, float %.0.i1114) #6, !dbg !21 + %.01.i1116 = select i1 %.not4.i1115, float %2595, float %2594, !dbg !21 + %2596 = bitcast float %.04.i1112 to i32, !dbg !21 + %2597 = shl i32 %2596, 23, !dbg !21 + %2598 = bitcast i32 %2597 to float, !dbg !21 + %2599 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1116) #6, !dbg !21 + %2600 = fmul float %2599, %2598, !dbg !21 + %2601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1117 = icmp eq i32 %2601, 0, !dbg !21 + %2602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2603 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1118 = select i1 %.not.i1117, float %2603, float %2602, !dbg !21 + %2604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1119 = icmp eq i32 %2604, 0, !dbg !21 + %2605 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1118) #6, !dbg !21 + %2606 = tail call float @llvm.nvvm.saturate.f(float %.02.i1118) #6, !dbg !21 + %.03.i1120 = select i1 %.not1.i1119, float %2606, float %2605, !dbg !21 + %2607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1121 = icmp eq i32 %2607, 0, !dbg !21 + %2608 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2609 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1122 = select i1 %.not2.i1121, float %2609, float %2608, !dbg !21 + %2610 = fadd float %.04.i1122, 0xC168000FE0000000, !dbg !21 + %2611 = fneg float %2610, !dbg !21 + %2612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1123 = icmp eq i32 %2612, 0, !dbg !21 + %2613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3FF7154760000000, float %2611) #6, !dbg !21 + %2614 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3FF7154760000000, float %2611) #6, !dbg !21 + %.0.i1124 = select i1 %.not3.i1123, float %2614, float %2613, !dbg !21 + %2615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1125 = icmp eq i32 %2615, 0, !dbg !21 + %2616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3E54AE0C00000000, float %.0.i1124) #6, !dbg !21 + %2617 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3E54AE0C00000000, float %.0.i1124) #6, !dbg !21 + %.01.i1126 = select i1 %.not4.i1125, float %2617, float %2616, !dbg !21 + %2618 = bitcast float %.04.i1122 to i32, !dbg !21 + %2619 = shl i32 %2618, 23, !dbg !21 + %2620 = bitcast i32 %2619 to float, !dbg !21 + %2621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1126) #6, !dbg !21 + %2622 = fmul float %2621, %2620, !dbg !21 + %2623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1127 = icmp eq i32 %2623, 0, !dbg !21 + %2624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2625 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1128 = select i1 %.not.i1127, float %2625, float %2624, !dbg !21 + %2626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1129 = icmp eq i32 %2626, 0, !dbg !21 + %2627 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1128) #6, !dbg !21 + %2628 = tail call float @llvm.nvvm.saturate.f(float %.02.i1128) #6, !dbg !21 + %.03.i1130 = select i1 %.not1.i1129, float %2628, float %2627, !dbg !21 + %2629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1131 = icmp eq i32 %2629, 0, !dbg !21 + %2630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1132 = select i1 %.not2.i1131, float %2631, float %2630, !dbg !21 + %2632 = fadd float %.04.i1132, 0xC168000FE0000000, !dbg !21 + %2633 = fneg float %2632, !dbg !21 + %2634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1133 = icmp eq i32 %2634, 0, !dbg !21 + %2635 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3FF7154760000000, float %2633) #6, !dbg !21 + %2636 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3FF7154760000000, float %2633) #6, !dbg !21 + %.0.i1134 = select i1 %.not3.i1133, float %2636, float %2635, !dbg !21 + %2637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1135 = icmp eq i32 %2637, 0, !dbg !21 + %2638 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3E54AE0C00000000, float %.0.i1134) #6, !dbg !21 + %2639 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3E54AE0C00000000, float %.0.i1134) #6, !dbg !21 + %.01.i1136 = select i1 %.not4.i1135, float %2639, float %2638, !dbg !21 + %2640 = bitcast float %.04.i1132 to i32, !dbg !21 + %2641 = shl i32 %2640, 23, !dbg !21 + %2642 = bitcast i32 %2641 to float, !dbg !21 + %2643 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1136) #6, !dbg !21 + %2644 = fmul float %2643, %2642, !dbg !21 + %2645 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1137 = icmp eq i32 %2645, 0, !dbg !21 + %2646 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2647 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1138 = select i1 %.not.i1137, float %2647, float %2646, !dbg !21 + %2648 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1139 = icmp eq i32 %2648, 0, !dbg !21 + %2649 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1138) #6, !dbg !21 + %2650 = tail call float @llvm.nvvm.saturate.f(float %.02.i1138) #6, !dbg !21 + %.03.i1140 = select i1 %.not1.i1139, float %2650, float %2649, !dbg !21 + %2651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1141 = icmp eq i32 %2651, 0, !dbg !21 + %2652 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2653 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1142 = select i1 %.not2.i1141, float %2653, float %2652, !dbg !21 + %2654 = fadd float %.04.i1142, 0xC168000FE0000000, !dbg !21 + %2655 = fneg float %2654, !dbg !21 + %2656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1143 = icmp eq i32 %2656, 0, !dbg !21 + %2657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3FF7154760000000, float %2655) #6, !dbg !21 + %2658 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3FF7154760000000, float %2655) #6, !dbg !21 + %.0.i1144 = select i1 %.not3.i1143, float %2658, float %2657, !dbg !21 + %2659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1145 = icmp eq i32 %2659, 0, !dbg !21 + %2660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3E54AE0C00000000, float %.0.i1144) #6, !dbg !21 + %2661 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3E54AE0C00000000, float %.0.i1144) #6, !dbg !21 + %.01.i1146 = select i1 %.not4.i1145, float %2661, float %2660, !dbg !21 + %2662 = bitcast float %.04.i1142 to i32, !dbg !21 + %2663 = shl i32 %2662, 23, !dbg !21 + %2664 = bitcast i32 %2663 to float, !dbg !21 + %2665 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1146) #6, !dbg !21 + %2666 = fmul float %2665, %2664, !dbg !21 + %2667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1147 = icmp eq i32 %2667, 0, !dbg !21 + %2668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2669 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1148 = select i1 %.not.i1147, float %2669, float %2668, !dbg !21 + %2670 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1149 = icmp eq i32 %2670, 0, !dbg !21 + %2671 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1148) #6, !dbg !21 + %2672 = tail call float @llvm.nvvm.saturate.f(float %.02.i1148) #6, !dbg !21 + %.03.i1150 = select i1 %.not1.i1149, float %2672, float %2671, !dbg !21 + %2673 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1151 = icmp eq i32 %2673, 0, !dbg !21 + %2674 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2675 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1152 = select i1 %.not2.i1151, float %2675, float %2674, !dbg !21 + %2676 = fadd float %.04.i1152, 0xC168000FE0000000, !dbg !21 + %2677 = fneg float %2676, !dbg !21 + %2678 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1153 = icmp eq i32 %2678, 0, !dbg !21 + %2679 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3FF7154760000000, float %2677) #6, !dbg !21 + %2680 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3FF7154760000000, float %2677) #6, !dbg !21 + %.0.i1154 = select i1 %.not3.i1153, float %2680, float %2679, !dbg !21 + %2681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1155 = icmp eq i32 %2681, 0, !dbg !21 + %2682 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3E54AE0C00000000, float %.0.i1154) #6, !dbg !21 + %2683 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3E54AE0C00000000, float %.0.i1154) #6, !dbg !21 + %.01.i1156 = select i1 %.not4.i1155, float %2683, float %2682, !dbg !21 + %2684 = bitcast float %.04.i1152 to i32, !dbg !21 + %2685 = shl i32 %2684, 23, !dbg !21 + %2686 = bitcast i32 %2685 to float, !dbg !21 + %2687 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1156) #6, !dbg !21 + %2688 = fmul float %2687, %2686, !dbg !21 + %2689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1157 = icmp eq i32 %2689, 0, !dbg !21 + %2690 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2691 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1158 = select i1 %.not.i1157, float %2691, float %2690, !dbg !21 + %2692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1159 = icmp eq i32 %2692, 0, !dbg !21 + %2693 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1158) #6, !dbg !21 + %2694 = tail call float @llvm.nvvm.saturate.f(float %.02.i1158) #6, !dbg !21 + %.03.i1160 = select i1 %.not1.i1159, float %2694, float %2693, !dbg !21 + %2695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1161 = icmp eq i32 %2695, 0, !dbg !21 + %2696 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2697 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1162 = select i1 %.not2.i1161, float %2697, float %2696, !dbg !21 + %2698 = fadd float %.04.i1162, 0xC168000FE0000000, !dbg !21 + %2699 = fneg float %2698, !dbg !21 + %2700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1163 = icmp eq i32 %2700, 0, !dbg !21 + %2701 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3FF7154760000000, float %2699) #6, !dbg !21 + %2702 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3FF7154760000000, float %2699) #6, !dbg !21 + %.0.i1164 = select i1 %.not3.i1163, float %2702, float %2701, !dbg !21 + %2703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1165 = icmp eq i32 %2703, 0, !dbg !21 + %2704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3E54AE0C00000000, float %.0.i1164) #6, !dbg !21 + %2705 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3E54AE0C00000000, float %.0.i1164) #6, !dbg !21 + %.01.i1166 = select i1 %.not4.i1165, float %2705, float %2704, !dbg !21 + %2706 = bitcast float %.04.i1162 to i32, !dbg !21 + %2707 = shl i32 %2706, 23, !dbg !21 + %2708 = bitcast i32 %2707 to float, !dbg !21 + %2709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1166) #6, !dbg !21 + %2710 = fmul float %2709, %2708, !dbg !21 + %2711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1167 = icmp eq i32 %2711, 0, !dbg !21 + %2712 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2713 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1168 = select i1 %.not.i1167, float %2713, float %2712, !dbg !21 + %2714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1169 = icmp eq i32 %2714, 0, !dbg !21 + %2715 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1168) #6, !dbg !21 + %2716 = tail call float @llvm.nvvm.saturate.f(float %.02.i1168) #6, !dbg !21 + %.03.i1170 = select i1 %.not1.i1169, float %2716, float %2715, !dbg !21 + %2717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1171 = icmp eq i32 %2717, 0, !dbg !21 + %2718 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2719 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1172 = select i1 %.not2.i1171, float %2719, float %2718, !dbg !21 + %2720 = fadd float %.04.i1172, 0xC168000FE0000000, !dbg !21 + %2721 = fneg float %2720, !dbg !21 + %2722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1173 = icmp eq i32 %2722, 0, !dbg !21 + %2723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3FF7154760000000, float %2721) #6, !dbg !21 + %2724 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3FF7154760000000, float %2721) #6, !dbg !21 + %.0.i1174 = select i1 %.not3.i1173, float %2724, float %2723, !dbg !21 + %2725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1175 = icmp eq i32 %2725, 0, !dbg !21 + %2726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3E54AE0C00000000, float %.0.i1174) #6, !dbg !21 + %2727 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3E54AE0C00000000, float %.0.i1174) #6, !dbg !21 + %.01.i1176 = select i1 %.not4.i1175, float %2727, float %2726, !dbg !21 + %2728 = bitcast float %.04.i1172 to i32, !dbg !21 + %2729 = shl i32 %2728, 23, !dbg !21 + %2730 = bitcast i32 %2729 to float, !dbg !21 + %2731 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1176) #6, !dbg !21 + %2732 = fmul float %2731, %2730, !dbg !21 + %2733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1177 = icmp eq i32 %2733, 0, !dbg !21 + %2734 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2735 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1178 = select i1 %.not.i1177, float %2735, float %2734, !dbg !21 + %2736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1179 = icmp eq i32 %2736, 0, !dbg !21 + %2737 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1178) #6, !dbg !21 + %2738 = tail call float @llvm.nvvm.saturate.f(float %.02.i1178) #6, !dbg !21 + %.03.i1180 = select i1 %.not1.i1179, float %2738, float %2737, !dbg !21 + %2739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1181 = icmp eq i32 %2739, 0, !dbg !21 + %2740 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2741 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1182 = select i1 %.not2.i1181, float %2741, float %2740, !dbg !21 + %2742 = fadd float %.04.i1182, 0xC168000FE0000000, !dbg !21 + %2743 = fneg float %2742, !dbg !21 + %2744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1183 = icmp eq i32 %2744, 0, !dbg !21 + %2745 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3FF7154760000000, float %2743) #6, !dbg !21 + %2746 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3FF7154760000000, float %2743) #6, !dbg !21 + %.0.i1184 = select i1 %.not3.i1183, float %2746, float %2745, !dbg !21 + %2747 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1185 = icmp eq i32 %2747, 0, !dbg !21 + %2748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3E54AE0C00000000, float %.0.i1184) #6, !dbg !21 + %2749 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3E54AE0C00000000, float %.0.i1184) #6, !dbg !21 + %.01.i1186 = select i1 %.not4.i1185, float %2749, float %2748, !dbg !21 + %2750 = bitcast float %.04.i1182 to i32, !dbg !21 + %2751 = shl i32 %2750, 23, !dbg !21 + %2752 = bitcast i32 %2751 to float, !dbg !21 + %2753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1186) #6, !dbg !21 + %2754 = fmul float %2753, %2752, !dbg !21 + %2755 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1187 = icmp eq i32 %2755, 0, !dbg !21 + %2756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2757 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1188 = select i1 %.not.i1187, float %2757, float %2756, !dbg !21 + %2758 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1189 = icmp eq i32 %2758, 0, !dbg !21 + %2759 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1188) #6, !dbg !21 + %2760 = tail call float @llvm.nvvm.saturate.f(float %.02.i1188) #6, !dbg !21 + %.03.i1190 = select i1 %.not1.i1189, float %2760, float %2759, !dbg !21 + %2761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1191 = icmp eq i32 %2761, 0, !dbg !21 + %2762 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2763 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1192 = select i1 %.not2.i1191, float %2763, float %2762, !dbg !21 + %2764 = fadd float %.04.i1192, 0xC168000FE0000000, !dbg !21 + %2765 = fneg float %2764, !dbg !21 + %2766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1193 = icmp eq i32 %2766, 0, !dbg !21 + %2767 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3FF7154760000000, float %2765) #6, !dbg !21 + %2768 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3FF7154760000000, float %2765) #6, !dbg !21 + %.0.i1194 = select i1 %.not3.i1193, float %2768, float %2767, !dbg !21 + %2769 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1195 = icmp eq i32 %2769, 0, !dbg !21 + %2770 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3E54AE0C00000000, float %.0.i1194) #6, !dbg !21 + %2771 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3E54AE0C00000000, float %.0.i1194) #6, !dbg !21 + %.01.i1196 = select i1 %.not4.i1195, float %2771, float %2770, !dbg !21 + %2772 = bitcast float %.04.i1192 to i32, !dbg !21 + %2773 = shl i32 %2772, 23, !dbg !21 + %2774 = bitcast i32 %2773 to float, !dbg !21 + %2775 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1196) #6, !dbg !21 + %2776 = fmul float %2775, %2774, !dbg !21 + %2777 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1197 = icmp eq i32 %2777, 0, !dbg !21 + %2778 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2779 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1198 = select i1 %.not.i1197, float %2779, float %2778, !dbg !21 + %2780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1199 = icmp eq i32 %2780, 0, !dbg !21 + %2781 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1198) #6, !dbg !21 + %2782 = tail call float @llvm.nvvm.saturate.f(float %.02.i1198) #6, !dbg !21 + %.03.i1200 = select i1 %.not1.i1199, float %2782, float %2781, !dbg !21 + %2783 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1201 = icmp eq i32 %2783, 0, !dbg !21 + %2784 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2785 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1202 = select i1 %.not2.i1201, float %2785, float %2784, !dbg !21 + %2786 = fadd float %.04.i1202, 0xC168000FE0000000, !dbg !21 + %2787 = fneg float %2786, !dbg !21 + %2788 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1203 = icmp eq i32 %2788, 0, !dbg !21 + %2789 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3FF7154760000000, float %2787) #6, !dbg !21 + %2790 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3FF7154760000000, float %2787) #6, !dbg !21 + %.0.i1204 = select i1 %.not3.i1203, float %2790, float %2789, !dbg !21 + %2791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1205 = icmp eq i32 %2791, 0, !dbg !21 + %2792 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3E54AE0C00000000, float %.0.i1204) #6, !dbg !21 + %2793 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3E54AE0C00000000, float %.0.i1204) #6, !dbg !21 + %.01.i1206 = select i1 %.not4.i1205, float %2793, float %2792, !dbg !21 + %2794 = bitcast float %.04.i1202 to i32, !dbg !21 + %2795 = shl i32 %2794, 23, !dbg !21 + %2796 = bitcast i32 %2795 to float, !dbg !21 + %2797 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1206) #6, !dbg !21 + %2798 = fmul float %2797, %2796, !dbg !21 + %2799 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1207 = icmp eq i32 %2799, 0, !dbg !21 + %2800 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2801 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1208 = select i1 %.not.i1207, float %2801, float %2800, !dbg !21 + %2802 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1209 = icmp eq i32 %2802, 0, !dbg !21 + %2803 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1208) #6, !dbg !21 + %2804 = tail call float @llvm.nvvm.saturate.f(float %.02.i1208) #6, !dbg !21 + %.03.i1210 = select i1 %.not1.i1209, float %2804, float %2803, !dbg !21 + %2805 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1211 = icmp eq i32 %2805, 0, !dbg !21 + %2806 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2807 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1212 = select i1 %.not2.i1211, float %2807, float %2806, !dbg !21 + %2808 = fadd float %.04.i1212, 0xC168000FE0000000, !dbg !21 + %2809 = fneg float %2808, !dbg !21 + %2810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1213 = icmp eq i32 %2810, 0, !dbg !21 + %2811 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3FF7154760000000, float %2809) #6, !dbg !21 + %2812 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3FF7154760000000, float %2809) #6, !dbg !21 + %.0.i1214 = select i1 %.not3.i1213, float %2812, float %2811, !dbg !21 + %2813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1215 = icmp eq i32 %2813, 0, !dbg !21 + %2814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3E54AE0C00000000, float %.0.i1214) #6, !dbg !21 + %2815 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3E54AE0C00000000, float %.0.i1214) #6, !dbg !21 + %.01.i1216 = select i1 %.not4.i1215, float %2815, float %2814, !dbg !21 + %2816 = bitcast float %.04.i1212 to i32, !dbg !21 + %2817 = shl i32 %2816, 23, !dbg !21 + %2818 = bitcast i32 %2817 to float, !dbg !21 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1216) #6, !dbg !21 + %2820 = fmul float %2819, %2818, !dbg !21 + %2821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1217 = icmp eq i32 %2821, 0, !dbg !21 + %2822 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2823 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1218 = select i1 %.not.i1217, float %2823, float %2822, !dbg !21 + %2824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1219 = icmp eq i32 %2824, 0, !dbg !21 + %2825 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1218) #6, !dbg !21 + %2826 = tail call float @llvm.nvvm.saturate.f(float %.02.i1218) #6, !dbg !21 + %.03.i1220 = select i1 %.not1.i1219, float %2826, float %2825, !dbg !21 + %2827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1221 = icmp eq i32 %2827, 0, !dbg !21 + %2828 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2829 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1222 = select i1 %.not2.i1221, float %2829, float %2828, !dbg !21 + %2830 = fadd float %.04.i1222, 0xC168000FE0000000, !dbg !21 + %2831 = fneg float %2830, !dbg !21 + %2832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1223 = icmp eq i32 %2832, 0, !dbg !21 + %2833 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3FF7154760000000, float %2831) #6, !dbg !21 + %2834 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3FF7154760000000, float %2831) #6, !dbg !21 + %.0.i1224 = select i1 %.not3.i1223, float %2834, float %2833, !dbg !21 + %2835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1225 = icmp eq i32 %2835, 0, !dbg !21 + %2836 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3E54AE0C00000000, float %.0.i1224) #6, !dbg !21 + %2837 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3E54AE0C00000000, float %.0.i1224) #6, !dbg !21 + %.01.i1226 = select i1 %.not4.i1225, float %2837, float %2836, !dbg !21 + %2838 = bitcast float %.04.i1222 to i32, !dbg !21 + %2839 = shl i32 %2838, 23, !dbg !21 + %2840 = bitcast i32 %2839 to float, !dbg !21 + %2841 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1226) #6, !dbg !21 + %2842 = fmul float %2841, %2840, !dbg !21 + %2843 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1227 = icmp eq i32 %2843, 0, !dbg !21 + %2844 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2845 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1228 = select i1 %.not.i1227, float %2845, float %2844, !dbg !21 + %2846 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1229 = icmp eq i32 %2846, 0, !dbg !21 + %2847 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1228) #6, !dbg !21 + %2848 = tail call float @llvm.nvvm.saturate.f(float %.02.i1228) #6, !dbg !21 + %.03.i1230 = select i1 %.not1.i1229, float %2848, float %2847, !dbg !21 + %2849 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1231 = icmp eq i32 %2849, 0, !dbg !21 + %2850 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2851 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1232 = select i1 %.not2.i1231, float %2851, float %2850, !dbg !21 + %2852 = fadd float %.04.i1232, 0xC168000FE0000000, !dbg !21 + %2853 = fneg float %2852, !dbg !21 + %2854 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1233 = icmp eq i32 %2854, 0, !dbg !21 + %2855 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3FF7154760000000, float %2853) #6, !dbg !21 + %2856 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3FF7154760000000, float %2853) #6, !dbg !21 + %.0.i1234 = select i1 %.not3.i1233, float %2856, float %2855, !dbg !21 + %2857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1235 = icmp eq i32 %2857, 0, !dbg !21 + %2858 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3E54AE0C00000000, float %.0.i1234) #6, !dbg !21 + %2859 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3E54AE0C00000000, float %.0.i1234) #6, !dbg !21 + %.01.i1236 = select i1 %.not4.i1235, float %2859, float %2858, !dbg !21 + %2860 = bitcast float %.04.i1232 to i32, !dbg !21 + %2861 = shl i32 %2860, 23, !dbg !21 + %2862 = bitcast i32 %2861 to float, !dbg !21 + %2863 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1236) #6, !dbg !21 + %2864 = fmul float %2863, %2862, !dbg !21 + %2865 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1237 = icmp eq i32 %2865, 0, !dbg !21 + %2866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2867 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1238 = select i1 %.not.i1237, float %2867, float %2866, !dbg !21 + %2868 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1239 = icmp eq i32 %2868, 0, !dbg !21 + %2869 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1238) #6, !dbg !21 + %2870 = tail call float @llvm.nvvm.saturate.f(float %.02.i1238) #6, !dbg !21 + %.03.i1240 = select i1 %.not1.i1239, float %2870, float %2869, !dbg !21 + %2871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1241 = icmp eq i32 %2871, 0, !dbg !21 + %2872 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2873 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1242 = select i1 %.not2.i1241, float %2873, float %2872, !dbg !21 + %2874 = fadd float %.04.i1242, 0xC168000FE0000000, !dbg !21 + %2875 = fneg float %2874, !dbg !21 + %2876 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1243 = icmp eq i32 %2876, 0, !dbg !21 + %2877 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3FF7154760000000, float %2875) #6, !dbg !21 + %2878 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3FF7154760000000, float %2875) #6, !dbg !21 + %.0.i1244 = select i1 %.not3.i1243, float %2878, float %2877, !dbg !21 + %2879 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1245 = icmp eq i32 %2879, 0, !dbg !21 + %2880 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3E54AE0C00000000, float %.0.i1244) #6, !dbg !21 + %2881 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3E54AE0C00000000, float %.0.i1244) #6, !dbg !21 + %.01.i1246 = select i1 %.not4.i1245, float %2881, float %2880, !dbg !21 + %2882 = bitcast float %.04.i1242 to i32, !dbg !21 + %2883 = shl i32 %2882, 23, !dbg !21 + %2884 = bitcast i32 %2883 to float, !dbg !21 + %2885 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1246) #6, !dbg !21 + %2886 = fmul float %2885, %2884, !dbg !21 + %2887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1247 = icmp eq i32 %2887, 0, !dbg !21 + %2888 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2889 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1248 = select i1 %.not.i1247, float %2889, float %2888, !dbg !21 + %2890 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1249 = icmp eq i32 %2890, 0, !dbg !21 + %2891 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1248) #6, !dbg !21 + %2892 = tail call float @llvm.nvvm.saturate.f(float %.02.i1248) #6, !dbg !21 + %.03.i1250 = select i1 %.not1.i1249, float %2892, float %2891, !dbg !21 + %2893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1251 = icmp eq i32 %2893, 0, !dbg !21 + %2894 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2895 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1252 = select i1 %.not2.i1251, float %2895, float %2894, !dbg !21 + %2896 = fadd float %.04.i1252, 0xC168000FE0000000, !dbg !21 + %2897 = fneg float %2896, !dbg !21 + %2898 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1253 = icmp eq i32 %2898, 0, !dbg !21 + %2899 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3FF7154760000000, float %2897) #6, !dbg !21 + %2900 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3FF7154760000000, float %2897) #6, !dbg !21 + %.0.i1254 = select i1 %.not3.i1253, float %2900, float %2899, !dbg !21 + %2901 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1255 = icmp eq i32 %2901, 0, !dbg !21 + %2902 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3E54AE0C00000000, float %.0.i1254) #6, !dbg !21 + %2903 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3E54AE0C00000000, float %.0.i1254) #6, !dbg !21 + %.01.i1256 = select i1 %.not4.i1255, float %2903, float %2902, !dbg !21 + %2904 = bitcast float %.04.i1252 to i32, !dbg !21 + %2905 = shl i32 %2904, 23, !dbg !21 + %2906 = bitcast i32 %2905 to float, !dbg !21 + %2907 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1256) #6, !dbg !21 + %2908 = fmul float %2907, %2906, !dbg !21 + %2909 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1257 = icmp eq i32 %2909, 0, !dbg !21 + %2910 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2911 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1258 = select i1 %.not.i1257, float %2911, float %2910, !dbg !21 + %2912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1259 = icmp eq i32 %2912, 0, !dbg !21 + %2913 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1258) #6, !dbg !21 + %2914 = tail call float @llvm.nvvm.saturate.f(float %.02.i1258) #6, !dbg !21 + %.03.i1260 = select i1 %.not1.i1259, float %2914, float %2913, !dbg !21 + %2915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1261 = icmp eq i32 %2915, 0, !dbg !21 + %2916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1262 = select i1 %.not2.i1261, float %2917, float %2916, !dbg !21 + %2918 = fadd float %.04.i1262, 0xC168000FE0000000, !dbg !21 + %2919 = fneg float %2918, !dbg !21 + %2920 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1263 = icmp eq i32 %2920, 0, !dbg !21 + %2921 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3FF7154760000000, float %2919) #6, !dbg !21 + %2922 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3FF7154760000000, float %2919) #6, !dbg !21 + %.0.i1264 = select i1 %.not3.i1263, float %2922, float %2921, !dbg !21 + %2923 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1265 = icmp eq i32 %2923, 0, !dbg !21 + %2924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3E54AE0C00000000, float %.0.i1264) #6, !dbg !21 + %2925 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3E54AE0C00000000, float %.0.i1264) #6, !dbg !21 + %.01.i1266 = select i1 %.not4.i1265, float %2925, float %2924, !dbg !21 + %2926 = bitcast float %.04.i1262 to i32, !dbg !21 + %2927 = shl i32 %2926, 23, !dbg !21 + %2928 = bitcast i32 %2927 to float, !dbg !21 + %2929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1266) #6, !dbg !21 + %2930 = fmul float %2929, %2928, !dbg !21 + %2931 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1267 = icmp eq i32 %2931, 0, !dbg !21 + %2932 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2933 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1268 = select i1 %.not.i1267, float %2933, float %2932, !dbg !21 + %2934 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1269 = icmp eq i32 %2934, 0, !dbg !21 + %2935 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1268) #6, !dbg !21 + %2936 = tail call float @llvm.nvvm.saturate.f(float %.02.i1268) #6, !dbg !21 + %.03.i1270 = select i1 %.not1.i1269, float %2936, float %2935, !dbg !21 + %2937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1271 = icmp eq i32 %2937, 0, !dbg !21 + %2938 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2939 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1272 = select i1 %.not2.i1271, float %2939, float %2938, !dbg !21 + %2940 = fadd float %.04.i1272, 0xC168000FE0000000, !dbg !21 + %2941 = fneg float %2940, !dbg !21 + %2942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1273 = icmp eq i32 %2942, 0, !dbg !21 + %2943 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3FF7154760000000, float %2941) #6, !dbg !21 + %2944 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3FF7154760000000, float %2941) #6, !dbg !21 + %.0.i1274 = select i1 %.not3.i1273, float %2944, float %2943, !dbg !21 + %2945 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1275 = icmp eq i32 %2945, 0, !dbg !21 + %2946 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3E54AE0C00000000, float %.0.i1274) #6, !dbg !21 + %2947 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3E54AE0C00000000, float %.0.i1274) #6, !dbg !21 + %.01.i1276 = select i1 %.not4.i1275, float %2947, float %2946, !dbg !21 + %2948 = bitcast float %.04.i1272 to i32, !dbg !21 + %2949 = shl i32 %2948, 23, !dbg !21 + %2950 = bitcast i32 %2949 to float, !dbg !21 + %2951 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1276) #6, !dbg !21 + %2952 = fmul float %2951, %2950, !dbg !21 + %2953 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1277 = icmp eq i32 %2953, 0, !dbg !21 + %2954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2955 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1278 = select i1 %.not.i1277, float %2955, float %2954, !dbg !21 + %2956 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1279 = icmp eq i32 %2956, 0, !dbg !21 + %2957 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1278) #6, !dbg !21 + %2958 = tail call float @llvm.nvvm.saturate.f(float %.02.i1278) #6, !dbg !21 + %.03.i1280 = select i1 %.not1.i1279, float %2958, float %2957, !dbg !21 + %2959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1281 = icmp eq i32 %2959, 0, !dbg !21 + %2960 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2961 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1282 = select i1 %.not2.i1281, float %2961, float %2960, !dbg !21 + %2962 = fadd float %.04.i1282, 0xC168000FE0000000, !dbg !21 + %2963 = fneg float %2962, !dbg !21 + %2964 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1283 = icmp eq i32 %2964, 0, !dbg !21 + %2965 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3FF7154760000000, float %2963) #6, !dbg !21 + %2966 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3FF7154760000000, float %2963) #6, !dbg !21 + %.0.i1284 = select i1 %.not3.i1283, float %2966, float %2965, !dbg !21 + %2967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1285 = icmp eq i32 %2967, 0, !dbg !21 + %2968 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3E54AE0C00000000, float %.0.i1284) #6, !dbg !21 + %2969 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3E54AE0C00000000, float %.0.i1284) #6, !dbg !21 + %.01.i1286 = select i1 %.not4.i1285, float %2969, float %2968, !dbg !21 + %2970 = bitcast float %.04.i1282 to i32, !dbg !21 + %2971 = shl i32 %2970, 23, !dbg !21 + %2972 = bitcast i32 %2971 to float, !dbg !21 + %2973 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1286) #6, !dbg !21 + %2974 = fmul float %2973, %2972, !dbg !21 + %2975 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1287 = icmp eq i32 %2975, 0, !dbg !21 + %2976 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2977 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1288 = select i1 %.not.i1287, float %2977, float %2976, !dbg !21 + %2978 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1289 = icmp eq i32 %2978, 0, !dbg !21 + %2979 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1288) #6, !dbg !21 + %2980 = tail call float @llvm.nvvm.saturate.f(float %.02.i1288) #6, !dbg !21 + %.03.i1290 = select i1 %.not1.i1289, float %2980, float %2979, !dbg !21 + %2981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1291 = icmp eq i32 %2981, 0, !dbg !21 + %2982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1292 = select i1 %.not2.i1291, float %2983, float %2982, !dbg !21 + %2984 = fadd float %.04.i1292, 0xC168000FE0000000, !dbg !21 + %2985 = fneg float %2984, !dbg !21 + %2986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1293 = icmp eq i32 %2986, 0, !dbg !21 + %2987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3FF7154760000000, float %2985) #6, !dbg !21 + %2988 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3FF7154760000000, float %2985) #6, !dbg !21 + %.0.i1294 = select i1 %.not3.i1293, float %2988, float %2987, !dbg !21 + %2989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1295 = icmp eq i32 %2989, 0, !dbg !21 + %2990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3E54AE0C00000000, float %.0.i1294) #6, !dbg !21 + %2991 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3E54AE0C00000000, float %.0.i1294) #6, !dbg !21 + %.01.i1296 = select i1 %.not4.i1295, float %2991, float %2990, !dbg !21 + %2992 = bitcast float %.04.i1292 to i32, !dbg !21 + %2993 = shl i32 %2992, 23, !dbg !21 + %2994 = bitcast i32 %2993 to float, !dbg !21 + %2995 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1296) #6, !dbg !21 + %2996 = fmul float %2995, %2994, !dbg !21 + %2997 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1297 = icmp eq i32 %2997, 0, !dbg !21 + %2998 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2999 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1298 = select i1 %.not.i1297, float %2999, float %2998, !dbg !21 + %3000 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1299 = icmp eq i32 %3000, 0, !dbg !21 + %3001 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1298) #6, !dbg !21 + %3002 = tail call float @llvm.nvvm.saturate.f(float %.02.i1298) #6, !dbg !21 + %.03.i1300 = select i1 %.not1.i1299, float %3002, float %3001, !dbg !21 + %3003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1301 = icmp eq i32 %3003, 0, !dbg !21 + %3004 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3005 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1302 = select i1 %.not2.i1301, float %3005, float %3004, !dbg !21 + %3006 = fadd float %.04.i1302, 0xC168000FE0000000, !dbg !21 + %3007 = fneg float %3006, !dbg !21 + %3008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1303 = icmp eq i32 %3008, 0, !dbg !21 + %3009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3FF7154760000000, float %3007) #6, !dbg !21 + %3010 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3FF7154760000000, float %3007) #6, !dbg !21 + %.0.i1304 = select i1 %.not3.i1303, float %3010, float %3009, !dbg !21 + %3011 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1305 = icmp eq i32 %3011, 0, !dbg !21 + %3012 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3E54AE0C00000000, float %.0.i1304) #6, !dbg !21 + %3013 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3E54AE0C00000000, float %.0.i1304) #6, !dbg !21 + %.01.i1306 = select i1 %.not4.i1305, float %3013, float %3012, !dbg !21 + %3014 = bitcast float %.04.i1302 to i32, !dbg !21 + %3015 = shl i32 %3014, 23, !dbg !21 + %3016 = bitcast i32 %3015 to float, !dbg !21 + %3017 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1306) #6, !dbg !21 + %3018 = fmul float %3017, %3016, !dbg !21 + %3019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1307 = icmp eq i32 %3019, 0, !dbg !21 + %3020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3021 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1308 = select i1 %.not.i1307, float %3021, float %3020, !dbg !21 + %3022 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1309 = icmp eq i32 %3022, 0, !dbg !21 + %3023 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1308) #6, !dbg !21 + %3024 = tail call float @llvm.nvvm.saturate.f(float %.02.i1308) #6, !dbg !21 + %.03.i1310 = select i1 %.not1.i1309, float %3024, float %3023, !dbg !21 + %3025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1311 = icmp eq i32 %3025, 0, !dbg !21 + %3026 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3027 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1312 = select i1 %.not2.i1311, float %3027, float %3026, !dbg !21 + %3028 = fadd float %.04.i1312, 0xC168000FE0000000, !dbg !21 + %3029 = fneg float %3028, !dbg !21 + %3030 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1313 = icmp eq i32 %3030, 0, !dbg !21 + %3031 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3FF7154760000000, float %3029) #6, !dbg !21 + %3032 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3FF7154760000000, float %3029) #6, !dbg !21 + %.0.i1314 = select i1 %.not3.i1313, float %3032, float %3031, !dbg !21 + %3033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1315 = icmp eq i32 %3033, 0, !dbg !21 + %3034 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3E54AE0C00000000, float %.0.i1314) #6, !dbg !21 + %3035 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3E54AE0C00000000, float %.0.i1314) #6, !dbg !21 + %.01.i1316 = select i1 %.not4.i1315, float %3035, float %3034, !dbg !21 + %3036 = bitcast float %.04.i1312 to i32, !dbg !21 + %3037 = shl i32 %3036, 23, !dbg !21 + %3038 = bitcast i32 %3037 to float, !dbg !21 + %3039 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1316) #6, !dbg !21 + %3040 = fmul float %3039, %3038, !dbg !21 + %3041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1317 = icmp eq i32 %3041, 0, !dbg !21 + %3042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3043 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1318 = select i1 %.not.i1317, float %3043, float %3042, !dbg !21 + %3044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1319 = icmp eq i32 %3044, 0, !dbg !21 + %3045 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1318) #6, !dbg !21 + %3046 = tail call float @llvm.nvvm.saturate.f(float %.02.i1318) #6, !dbg !21 + %.03.i1320 = select i1 %.not1.i1319, float %3046, float %3045, !dbg !21 + %3047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1321 = icmp eq i32 %3047, 0, !dbg !21 + %3048 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3049 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1322 = select i1 %.not2.i1321, float %3049, float %3048, !dbg !21 + %3050 = fadd float %.04.i1322, 0xC168000FE0000000, !dbg !21 + %3051 = fneg float %3050, !dbg !21 + %3052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1323 = icmp eq i32 %3052, 0, !dbg !21 + %3053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3FF7154760000000, float %3051) #6, !dbg !21 + %3054 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3FF7154760000000, float %3051) #6, !dbg !21 + %.0.i1324 = select i1 %.not3.i1323, float %3054, float %3053, !dbg !21 + %3055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1325 = icmp eq i32 %3055, 0, !dbg !21 + %3056 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3E54AE0C00000000, float %.0.i1324) #6, !dbg !21 + %3057 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3E54AE0C00000000, float %.0.i1324) #6, !dbg !21 + %.01.i1326 = select i1 %.not4.i1325, float %3057, float %3056, !dbg !21 + %3058 = bitcast float %.04.i1322 to i32, !dbg !21 + %3059 = shl i32 %3058, 23, !dbg !21 + %3060 = bitcast i32 %3059 to float, !dbg !21 + %3061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1326) #6, !dbg !21 + %3062 = fmul float %3061, %3060, !dbg !21 + %3063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1327 = icmp eq i32 %3063, 0, !dbg !21 + %3064 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3065 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1328 = select i1 %.not.i1327, float %3065, float %3064, !dbg !21 + %3066 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1329 = icmp eq i32 %3066, 0, !dbg !21 + %3067 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1328) #6, !dbg !21 + %3068 = tail call float @llvm.nvvm.saturate.f(float %.02.i1328) #6, !dbg !21 + %.03.i1330 = select i1 %.not1.i1329, float %3068, float %3067, !dbg !21 + %3069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1331 = icmp eq i32 %3069, 0, !dbg !21 + %3070 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3071 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1332 = select i1 %.not2.i1331, float %3071, float %3070, !dbg !21 + %3072 = fadd float %.04.i1332, 0xC168000FE0000000, !dbg !21 + %3073 = fneg float %3072, !dbg !21 + %3074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1333 = icmp eq i32 %3074, 0, !dbg !21 + %3075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3FF7154760000000, float %3073) #6, !dbg !21 + %3076 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3FF7154760000000, float %3073) #6, !dbg !21 + %.0.i1334 = select i1 %.not3.i1333, float %3076, float %3075, !dbg !21 + %3077 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1335 = icmp eq i32 %3077, 0, !dbg !21 + %3078 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3E54AE0C00000000, float %.0.i1334) #6, !dbg !21 + %3079 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3E54AE0C00000000, float %.0.i1334) #6, !dbg !21 + %.01.i1336 = select i1 %.not4.i1335, float %3079, float %3078, !dbg !21 + %3080 = bitcast float %.04.i1332 to i32, !dbg !21 + %3081 = shl i32 %3080, 23, !dbg !21 + %3082 = bitcast i32 %3081 to float, !dbg !21 + %3083 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1336) #6, !dbg !21 + %3084 = fmul float %3083, %3082, !dbg !21 + %3085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1337 = icmp eq i32 %3085, 0, !dbg !21 + %3086 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3087 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1338 = select i1 %.not.i1337, float %3087, float %3086, !dbg !21 + %3088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1339 = icmp eq i32 %3088, 0, !dbg !21 + %3089 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1338) #6, !dbg !21 + %3090 = tail call float @llvm.nvvm.saturate.f(float %.02.i1338) #6, !dbg !21 + %.03.i1340 = select i1 %.not1.i1339, float %3090, float %3089, !dbg !21 + %3091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3092 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3093 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1343 = icmp eq i32 %3094, 0, !dbg !21 + %3095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1345 = icmp eq i32 %3095, 0, !dbg !21 + %3096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1347 = icmp eq i32 %3096, 0, !dbg !21 + %3097 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3098 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1348 = select i1 %.not.i1347, float %3098, float %3097, !dbg !21 + %3099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1349 = icmp eq i32 %3099, 0, !dbg !21 + %3100 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1348) #6, !dbg !21 + %3101 = tail call float @llvm.nvvm.saturate.f(float %.02.i1348) #6, !dbg !21 + %.03.i1350 = select i1 %.not1.i1349, float %3101, float %3100, !dbg !21 + %3102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3103 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3104 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1353 = icmp eq i32 %3105, 0, !dbg !21 + %3106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1355 = icmp eq i32 %3106, 0, !dbg !21 + %3107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1357 = icmp eq i32 %3107, 0, !dbg !21 + %3108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3109 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1358 = select i1 %.not.i1357, float %3109, float %3108, !dbg !21 + %3110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1359 = icmp eq i32 %3110, 0, !dbg !21 + %3111 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1358) #6, !dbg !21 + %3112 = tail call float @llvm.nvvm.saturate.f(float %.02.i1358) #6, !dbg !21 + %.03.i1360 = select i1 %.not1.i1359, float %3112, float %3111, !dbg !21 + %3113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3114 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3115 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1363 = icmp eq i32 %3116, 0, !dbg !21 + %3117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1365 = icmp eq i32 %3117, 0, !dbg !21 + %3118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1367 = icmp eq i32 %3118, 0, !dbg !21 + %3119 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3120 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1368 = select i1 %.not.i1367, float %3120, float %3119, !dbg !21 + %3121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1369 = icmp eq i32 %3121, 0, !dbg !21 + %3122 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1368) #6, !dbg !21 + %3123 = tail call float @llvm.nvvm.saturate.f(float %.02.i1368) #6, !dbg !21 + %.03.i1370 = select i1 %.not1.i1369, float %3123, float %3122, !dbg !21 + %3124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3125 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3126 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1373 = icmp eq i32 %3127, 0, !dbg !21 + %3128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1375 = icmp eq i32 %3128, 0, !dbg !21 + %3129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1377 = icmp eq i32 %3129, 0, !dbg !21 + %3130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3131 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1378 = select i1 %.not.i1377, float %3131, float %3130, !dbg !21 + %3132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1379 = icmp eq i32 %3132, 0, !dbg !21 + %3133 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1378) #6, !dbg !21 + %3134 = tail call float @llvm.nvvm.saturate.f(float %.02.i1378) #6, !dbg !21 + %.03.i1380 = select i1 %.not1.i1379, float %3134, float %3133, !dbg !21 + %3135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3136 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3137 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1383 = icmp eq i32 %3138, 0, !dbg !21 + %3139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1385 = icmp eq i32 %3139, 0, !dbg !21 + %3140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1387 = icmp eq i32 %3140, 0, !dbg !21 + %3141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3142 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1388 = select i1 %.not.i1387, float %3142, float %3141, !dbg !21 + %3143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1389 = icmp eq i32 %3143, 0, !dbg !21 + %3144 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1388) #6, !dbg !21 + %3145 = tail call float @llvm.nvvm.saturate.f(float %.02.i1388) #6, !dbg !21 + %.03.i1390 = select i1 %.not1.i1389, float %3145, float %3144, !dbg !21 + %3146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3147 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3148 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1393 = icmp eq i32 %3149, 0, !dbg !21 + %3150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1395 = icmp eq i32 %3150, 0, !dbg !21 + %3151 = fmul float %424, 0.000000e+00, !dbg !23 + %3152 = fmul float %446, 0.000000e+00, !dbg !23 + %3153 = fmul float %468, 0.000000e+00, !dbg !23 + %3154 = fmul float %490, 0.000000e+00, !dbg !23 + %3155 = fmul float %512, 0.000000e+00, !dbg !23 + %3156 = fmul float %534, 0.000000e+00, !dbg !23 + %3157 = fmul float %556, 0.000000e+00, !dbg !23 + %3158 = fmul float %578, 0.000000e+00, !dbg !23 + %3159 = fmul float %600, 0.000000e+00, !dbg !23 + %3160 = fmul float %622, 0.000000e+00, !dbg !23 + %3161 = fmul float %644, 0.000000e+00, !dbg !23 + %3162 = fmul float %666, 0.000000e+00, !dbg !23 + %3163 = fmul float %688, 0.000000e+00, !dbg !23 + %3164 = fmul float %710, 0.000000e+00, !dbg !23 + %3165 = fmul float %732, 0.000000e+00, !dbg !23 + %3166 = fmul float %754, 0.000000e+00, !dbg !23 + %3167 = fmul float %776, 0.000000e+00, !dbg !23 + %3168 = fmul float %798, 0.000000e+00, !dbg !23 + %3169 = fmul float %820, 0.000000e+00, !dbg !23 + %3170 = fmul float %842, 0.000000e+00, !dbg !23 + %3171 = fmul float %864, 0.000000e+00, !dbg !23 + %3172 = fmul float %886, 0.000000e+00, !dbg !23 + %3173 = fmul float %908, 0.000000e+00, !dbg !23 + %3174 = fmul float %930, 0.000000e+00, !dbg !23 + %3175 = fmul float %952, 0.000000e+00, !dbg !23 + %3176 = fmul float %974, 0.000000e+00, !dbg !23 + %3177 = fmul float %996, 0.000000e+00, !dbg !23 + %3178 = fmul float %1018, 0.000000e+00, !dbg !23 + %3179 = fmul float %1040, 0.000000e+00, !dbg !23 + %3180 = fmul float %1062, 0.000000e+00, !dbg !23 + %3181 = fmul float %1084, 0.000000e+00, !dbg !23 + %3182 = fmul float %1106, 0.000000e+00, !dbg !23 + %3183 = fmul float %1128, 0.000000e+00, !dbg !23 + %3184 = fmul float %1150, 0.000000e+00, !dbg !23 + %3185 = fmul float %1172, 0.000000e+00, !dbg !23 + %3186 = fmul float %1194, 0.000000e+00, !dbg !23 + %3187 = fmul float %1216, 0.000000e+00, !dbg !23 + %3188 = fmul float %1238, 0.000000e+00, !dbg !23 + %3189 = fmul float %1260, 0.000000e+00, !dbg !23 + %3190 = fmul float %1282, 0.000000e+00, !dbg !23 + %3191 = fmul float %1304, 0.000000e+00, !dbg !23 + %3192 = fmul float %1326, 0.000000e+00, !dbg !23 + %3193 = fmul float %1348, 0.000000e+00, !dbg !23 + %3194 = fmul float %1370, 0.000000e+00, !dbg !23 + %3195 = fmul float %1392, 0.000000e+00, !dbg !23 + %3196 = fmul float %1414, 0.000000e+00, !dbg !23 + %3197 = fmul float %1436, 0.000000e+00, !dbg !23 + %3198 = fmul float %1458, 0.000000e+00, !dbg !23 + %3199 = fmul float %1480, 0.000000e+00, !dbg !23 + %3200 = fmul float %1502, 0.000000e+00, !dbg !23 + %3201 = fmul float %1524, 0.000000e+00, !dbg !23 + %3202 = fmul float %1546, 0.000000e+00, !dbg !23 + %3203 = fmul float %1568, 0.000000e+00, !dbg !23 + %3204 = fmul float %1590, 0.000000e+00, !dbg !23 + %3205 = fmul float %1612, 0.000000e+00, !dbg !23 + %3206 = fmul float %1634, 0.000000e+00, !dbg !23 + %3207 = fmul float %1656, 0.000000e+00, !dbg !23 + %3208 = fmul float %1678, 0.000000e+00, !dbg !23 + %3209 = fadd float %3151, %1830, !dbg !24 + %3210 = select i1 %281, float 1.000000e+00, float %3209, !dbg !24 + %3211 = fadd float %3152, %1852, !dbg !24 + %3212 = select i1 %282, float 1.000000e+00, float %3211, !dbg !24 + %3213 = fadd float %3153, %1874, !dbg !24 + %3214 = select i1 %283, float 1.000000e+00, float %3213, !dbg !24 + %3215 = fadd float %3154, %1896, !dbg !24 + %3216 = select i1 %284, float 1.000000e+00, float %3215, !dbg !24 + %3217 = fadd float %3155, %1918, !dbg !24 + %3218 = select i1 %285, float 1.000000e+00, float %3217, !dbg !24 + %3219 = fadd float %3156, %1940, !dbg !24 + %3220 = select i1 %286, float 1.000000e+00, float %3219, !dbg !24 + %3221 = fadd float %3157, %1962, !dbg !24 + %3222 = select i1 %287, float 1.000000e+00, float %3221, !dbg !24 + %3223 = fadd float %3158, %1984, !dbg !24 + %3224 = select i1 %288, float 1.000000e+00, float %3223, !dbg !24 + %3225 = fadd float %3159, %2006, !dbg !24 + %3226 = select i1 %289, float 1.000000e+00, float %3225, !dbg !24 + %3227 = fadd float %3160, %2028, !dbg !24 + %3228 = select i1 %290, float 1.000000e+00, float %3227, !dbg !24 + %3229 = fadd float %3161, %2050, !dbg !24 + %3230 = select i1 %291, float 1.000000e+00, float %3229, !dbg !24 + %3231 = fadd float %3162, %2072, !dbg !24 + %3232 = select i1 %292, float 1.000000e+00, float %3231, !dbg !24 + %3233 = fadd float %3163, %2094, !dbg !24 + %3234 = select i1 %293, float 1.000000e+00, float %3233, !dbg !24 + %3235 = fadd float %3164, %2116, !dbg !24 + %3236 = select i1 %294, float 1.000000e+00, float %3235, !dbg !24 + %3237 = fadd float %3165, %2138, !dbg !24 + %3238 = select i1 %295, float 1.000000e+00, float %3237, !dbg !24 + %3239 = fadd float %3166, %2160, !dbg !24 + %3240 = select i1 %296, float 1.000000e+00, float %3239, !dbg !24 + %3241 = fadd float %3167, %2182, !dbg !24 + %3242 = select i1 %297, float 1.000000e+00, float %3241, !dbg !24 + %3243 = fadd float %3168, %2204, !dbg !24 + %3244 = select i1 %298, float 1.000000e+00, float %3243, !dbg !24 + %3245 = fadd float %3169, %2226, !dbg !24 + %3246 = select i1 %299, float 1.000000e+00, float %3245, !dbg !24 + %3247 = fadd float %3170, %2248, !dbg !24 + %3248 = select i1 %300, float 1.000000e+00, float %3247, !dbg !24 + %3249 = fadd float %3171, %2270, !dbg !24 + %3250 = select i1 %301, float 1.000000e+00, float %3249, !dbg !24 + %3251 = fadd float %3172, %2292, !dbg !24 + %3252 = select i1 %302, float 1.000000e+00, float %3251, !dbg !24 + %3253 = fadd float %3173, %2314, !dbg !24 + %3254 = select i1 %303, float 1.000000e+00, float %3253, !dbg !24 + %3255 = fadd float %3174, %2336, !dbg !24 + %3256 = select i1 %304, float 1.000000e+00, float %3255, !dbg !24 + %3257 = fadd float %3175, %2358, !dbg !24 + %3258 = select i1 %305, float 1.000000e+00, float %3257, !dbg !24 + %3259 = fadd float %3176, %2380, !dbg !24 + %3260 = select i1 %306, float 1.000000e+00, float %3259, !dbg !24 + %3261 = fadd float %3177, %2402, !dbg !24 + %3262 = select i1 %307, float 1.000000e+00, float %3261, !dbg !24 + %3263 = fadd float %3178, %2424, !dbg !24 + %3264 = select i1 %308, float 1.000000e+00, float %3263, !dbg !24 + %3265 = fadd float %3179, %2446, !dbg !24 + %3266 = select i1 %309, float 1.000000e+00, float %3265, !dbg !24 + %3267 = fadd float %3180, %2468, !dbg !24 + %3268 = select i1 %310, float 1.000000e+00, float %3267, !dbg !24 + %3269 = fadd float %3181, %2490, !dbg !24 + %3270 = select i1 %311, float 1.000000e+00, float %3269, !dbg !24 + %3271 = fadd float %3182, %2512, !dbg !24 + %3272 = select i1 %312, float 1.000000e+00, float %3271, !dbg !24 + %3273 = fadd float %3183, %2534, !dbg !24 + %3274 = select i1 %313, float 1.000000e+00, float %3273, !dbg !24 + %3275 = fadd float %3184, %2556, !dbg !24 + %3276 = select i1 %314, float 1.000000e+00, float %3275, !dbg !24 + %3277 = fadd float %3185, %2578, !dbg !24 + %3278 = select i1 %315, float 1.000000e+00, float %3277, !dbg !24 + %3279 = fadd float %3186, %2600, !dbg !24 + %3280 = select i1 %316, float 1.000000e+00, float %3279, !dbg !24 + %3281 = fadd float %3187, %2622, !dbg !24 + %3282 = select i1 %317, float 1.000000e+00, float %3281, !dbg !24 + %3283 = fadd float %3188, %2644, !dbg !24 + %3284 = select i1 %318, float 1.000000e+00, float %3283, !dbg !24 + %3285 = fadd float %3189, %2666, !dbg !24 + %3286 = select i1 %319, float 1.000000e+00, float %3285, !dbg !24 + %3287 = fadd float %3190, %2688, !dbg !24 + %3288 = select i1 %320, float 1.000000e+00, float %3287, !dbg !24 + %3289 = fadd float %3191, %2710, !dbg !24 + %3290 = select i1 %321, float 1.000000e+00, float %3289, !dbg !24 + %3291 = fadd float %3192, %2732, !dbg !24 + %3292 = select i1 %322, float 1.000000e+00, float %3291, !dbg !24 + %3293 = fadd float %3193, %2754, !dbg !24 + %3294 = select i1 %323, float 1.000000e+00, float %3293, !dbg !24 + %3295 = fadd float %3194, %2776, !dbg !24 + %3296 = select i1 %324, float 1.000000e+00, float %3295, !dbg !24 + %3297 = fadd float %3195, %2798, !dbg !24 + %3298 = select i1 %325, float 1.000000e+00, float %3297, !dbg !24 + %3299 = fadd float %3196, %2820, !dbg !24 + %3300 = select i1 %326, float 1.000000e+00, float %3299, !dbg !24 + %3301 = fadd float %3197, %2842, !dbg !24 + %3302 = select i1 %327, float 1.000000e+00, float %3301, !dbg !24 + %3303 = fadd float %3198, %2864, !dbg !24 + %3304 = select i1 %328, float 1.000000e+00, float %3303, !dbg !24 + %3305 = fadd float %3199, %2886, !dbg !24 + %3306 = select i1 %329, float 1.000000e+00, float %3305, !dbg !24 + %3307 = fadd float %3200, %2908, !dbg !24 + %3308 = select i1 %330, float 1.000000e+00, float %3307, !dbg !24 + %3309 = fadd float %3201, %2930, !dbg !24 + %3310 = select i1 %331, float 1.000000e+00, float %3309, !dbg !24 + %3311 = fadd float %3202, %2952, !dbg !24 + %3312 = select i1 %332, float 1.000000e+00, float %3311, !dbg !24 + %3313 = fadd float %3203, %2974, !dbg !24 + %3314 = select i1 %333, float 1.000000e+00, float %3313, !dbg !24 + %3315 = fadd float %3204, %2996, !dbg !24 + %3316 = select i1 %334, float 1.000000e+00, float %3315, !dbg !24 + %3317 = fadd float %3205, %3018, !dbg !24 + %3318 = select i1 %335, float 1.000000e+00, float %3317, !dbg !24 + %3319 = fadd float %3206, %3040, !dbg !24 + %3320 = select i1 %336, float 1.000000e+00, float %3319, !dbg !24 + %3321 = fadd float %3207, %3062, !dbg !24 + %3322 = select i1 %337, float 1.000000e+00, float %3321, !dbg !24 + %3323 = fadd float %3208, %3084, !dbg !24 + %3324 = select i1 %338, float 1.000000e+00, float %3323, !dbg !24 + %3325 = select i1 %16, float %267, float 0xFFF0000000000000, !dbg !25 + %3326 = select i1 %16, float %268, float 0xFFF0000000000000, !dbg !25 + %3327 = select i1 %16, float %270, float 0xFFF0000000000000, !dbg !25 + %3328 = select i1 %16, float %272, float 0xFFF0000000000000, !dbg !25 + %3329 = select i1 %16, float %274, float 0xFFF0000000000000, !dbg !25 + %3330 = select i1 %16, float %276, float 0xFFF0000000000000, !dbg !25 + %3331 = select i1 %16, float %278, float 0xFFF0000000000000, !dbg !25 + %3332 = select i1 %16, float %280, float 0xFFF0000000000000, !dbg !25 + %3333 = select i1 %16, float %3322, float 0.000000e+00, !dbg !26 + %3334 = select i1 %16, float %3324, float 0.000000e+00, !dbg !26 + %3335 = fcmp ogt bfloat %83, %84, !dbg !27 + %3336 = fcmp uno bfloat %83, 0xR0000, !dbg !29 + %3337 = or i1 %3335, %3336, !dbg !30 + %3338 = select i1 %3337, float %211, float %212, !dbg !31 + %3339 = fcmp ogt float %3338, %213, !dbg !27 + %3340 = fcmp uno float %3338, 0.000000e+00, !dbg !29 + %3341 = or i1 %3339, %3340, !dbg !30 + %3342 = select i1 %3341, float %3338, float %213, !dbg !31 + %3343 = fcmp ogt float %3342, %214, !dbg !27 + %3344 = fcmp uno float %3342, 0.000000e+00, !dbg !29 + %3345 = or i1 %3343, %3344, !dbg !30 + %3346 = select i1 %3345, float %3342, float %214, !dbg !31 + %3347 = fcmp ogt float %3346, %215, !dbg !27 + %3348 = fcmp uno float %3346, 0.000000e+00, !dbg !29 + %3349 = or i1 %3347, %3348, !dbg !30 + %3350 = select i1 %3349, float %3346, float %215, !dbg !31 + %3351 = fcmp ogt float %3350, %216, !dbg !27 + %3352 = fcmp uno float %3350, 0.000000e+00, !dbg !29 + %3353 = or i1 %3351, %3352, !dbg !30 + %3354 = select i1 %3353, float %3350, float %216, !dbg !31 + %3355 = fcmp ogt float %3354, %217, !dbg !27 + %3356 = fcmp uno float %3354, 0.000000e+00, !dbg !29 + %3357 = or i1 %3355, %3356, !dbg !30 + %3358 = select i1 %3357, float %3354, float %217, !dbg !31 + %3359 = fcmp ogt float %3358, %218, !dbg !27 + %3360 = fcmp uno float %3358, 0.000000e+00, !dbg !29 + %3361 = or i1 %3359, %3360, !dbg !30 + %3362 = select i1 %3361, float %3358, float %218, !dbg !31 + %3363 = fcmp ogt float %3362, %219, !dbg !27 + %3364 = fcmp uno float %3362, 0.000000e+00, !dbg !29 + %3365 = or i1 %3363, %3364, !dbg !30 + %3366 = select i1 %3365, float %3362, float %219, !dbg !31 + %3367 = fcmp ogt float %3366, %220, !dbg !27 + %3368 = fcmp uno float %3366, 0.000000e+00, !dbg !29 + %3369 = or i1 %3367, %3368, !dbg !30 + %3370 = select i1 %3369, float %3366, float %220, !dbg !31 + %3371 = fcmp ogt float %3370, %221, !dbg !27 + %3372 = fcmp uno float %3370, 0.000000e+00, !dbg !29 + %3373 = or i1 %3371, %3372, !dbg !30 + %3374 = select i1 %3373, float %3370, float %221, !dbg !31 + %3375 = fcmp ogt float %3374, %222, !dbg !27 + %3376 = fcmp uno float %3374, 0.000000e+00, !dbg !29 + %3377 = or i1 %3375, %3376, !dbg !30 + %3378 = select i1 %3377, float %3374, float %222, !dbg !31 + %3379 = fcmp ogt float %3378, %223, !dbg !27 + %3380 = fcmp uno float %3378, 0.000000e+00, !dbg !29 + %3381 = or i1 %3379, %3380, !dbg !30 + %3382 = select i1 %3381, float %3378, float %223, !dbg !31 + %3383 = fcmp ogt float %3382, %224, !dbg !27 + %3384 = fcmp uno float %3382, 0.000000e+00, !dbg !29 + %3385 = or i1 %3383, %3384, !dbg !30 + %3386 = select i1 %3385, float %3382, float %224, !dbg !31 + %3387 = fcmp ogt float %3386, %225, !dbg !27 + %3388 = fcmp uno float %3386, 0.000000e+00, !dbg !29 + %3389 = or i1 %3387, %3388, !dbg !30 + %3390 = select i1 %3389, float %3386, float %225, !dbg !31 + %3391 = fcmp ogt float %3390, %226, !dbg !27 + %3392 = fcmp uno float %3390, 0.000000e+00, !dbg !29 + %3393 = or i1 %3391, %3392, !dbg !30 + %3394 = select i1 %3393, float %3390, float %226, !dbg !31 + %3395 = fcmp ogt float %3394, %227, !dbg !27 + %3396 = fcmp uno float %3394, 0.000000e+00, !dbg !29 + %3397 = or i1 %3395, %3396, !dbg !30 + %3398 = select i1 %3397, float %3394, float %227, !dbg !31 + %3399 = fcmp ogt float %3398, %228, !dbg !27 + %3400 = fcmp uno float %3398, 0.000000e+00, !dbg !29 + %3401 = or i1 %3399, %3400, !dbg !30 + %3402 = select i1 %3401, float %3398, float %228, !dbg !31 + %3403 = fcmp ogt float %3402, %229, !dbg !27 + %3404 = fcmp uno float %3402, 0.000000e+00, !dbg !29 + %3405 = or i1 %3403, %3404, !dbg !30 + %3406 = select i1 %3405, float %3402, float %229, !dbg !31 + %3407 = fcmp ogt float %3406, %230, !dbg !27 + %3408 = fcmp uno float %3406, 0.000000e+00, !dbg !29 + %3409 = or i1 %3407, %3408, !dbg !30 + %3410 = select i1 %3409, float %3406, float %230, !dbg !31 + %3411 = fcmp ogt float %3410, %231, !dbg !27 + %3412 = fcmp uno float %3410, 0.000000e+00, !dbg !29 + %3413 = or i1 %3411, %3412, !dbg !30 + %3414 = select i1 %3413, float %3410, float %231, !dbg !31 + %3415 = fcmp ogt float %3414, %232, !dbg !27 + %3416 = fcmp uno float %3414, 0.000000e+00, !dbg !29 + %3417 = or i1 %3415, %3416, !dbg !30 + %3418 = select i1 %3417, float %3414, float %232, !dbg !31 + %3419 = fcmp ogt float %3418, %233, !dbg !27 + %3420 = fcmp uno float %3418, 0.000000e+00, !dbg !29 + %3421 = or i1 %3419, %3420, !dbg !30 + %3422 = select i1 %3421, float %3418, float %233, !dbg !31 + %3423 = fcmp ogt float %3422, %234, !dbg !27 + %3424 = fcmp uno float %3422, 0.000000e+00, !dbg !29 + %3425 = or i1 %3423, %3424, !dbg !30 + %3426 = select i1 %3425, float %3422, float %234, !dbg !31 + %3427 = fcmp ogt float %3426, %235, !dbg !27 + %3428 = fcmp uno float %3426, 0.000000e+00, !dbg !29 + %3429 = or i1 %3427, %3428, !dbg !30 + %3430 = select i1 %3429, float %3426, float %235, !dbg !31 + %3431 = fcmp ogt float %3430, %236, !dbg !27 + %3432 = fcmp uno float %3430, 0.000000e+00, !dbg !29 + %3433 = or i1 %3431, %3432, !dbg !30 + %3434 = select i1 %3433, float %3430, float %236, !dbg !31 + %3435 = fcmp ogt float %3434, %237, !dbg !27 + %3436 = fcmp uno float %3434, 0.000000e+00, !dbg !29 + %3437 = or i1 %3435, %3436, !dbg !30 + %3438 = select i1 %3437, float %3434, float %237, !dbg !31 + %3439 = fcmp ogt float %3438, %238, !dbg !27 + %3440 = fcmp uno float %3438, 0.000000e+00, !dbg !29 + %3441 = or i1 %3439, %3440, !dbg !30 + %3442 = select i1 %3441, float %3438, float %238, !dbg !31 + %3443 = fcmp ogt float %3442, %239, !dbg !27 + %3444 = fcmp uno float %3442, 0.000000e+00, !dbg !29 + %3445 = or i1 %3443, %3444, !dbg !30 + %3446 = select i1 %3445, float %3442, float %239, !dbg !31 + %3447 = fcmp ogt float %3446, %240, !dbg !27 + %3448 = fcmp uno float %3446, 0.000000e+00, !dbg !29 + %3449 = or i1 %3447, %3448, !dbg !30 + %3450 = select i1 %3449, float %3446, float %240, !dbg !31 + %3451 = fcmp ogt float %3450, %241, !dbg !27 + %3452 = fcmp uno float %3450, 0.000000e+00, !dbg !29 + %3453 = or i1 %3451, %3452, !dbg !30 + %3454 = select i1 %3453, float %3450, float %241, !dbg !31 + %3455 = fcmp ogt float %3454, %242, !dbg !27 + %3456 = fcmp uno float %3454, 0.000000e+00, !dbg !29 + %3457 = or i1 %3455, %3456, !dbg !30 + %3458 = select i1 %3457, float %3454, float %242, !dbg !31 + %3459 = fcmp ogt float %3458, %243, !dbg !27 + %3460 = fcmp uno float %3458, 0.000000e+00, !dbg !29 + %3461 = or i1 %3459, %3460, !dbg !30 + %3462 = select i1 %3461, float %3458, float %243, !dbg !31 + %3463 = fcmp ogt float %3462, %244, !dbg !27 + %3464 = fcmp uno float %3462, 0.000000e+00, !dbg !29 + %3465 = or i1 %3463, %3464, !dbg !30 + %3466 = select i1 %3465, float %3462, float %244, !dbg !31 + %3467 = fcmp ogt float %3466, %245, !dbg !27 + %3468 = fcmp uno float %3466, 0.000000e+00, !dbg !29 + %3469 = or i1 %3467, %3468, !dbg !30 + %3470 = select i1 %3469, float %3466, float %245, !dbg !31 + %3471 = fcmp ogt float %3470, %246, !dbg !27 + %3472 = fcmp uno float %3470, 0.000000e+00, !dbg !29 + %3473 = or i1 %3471, %3472, !dbg !30 + %3474 = select i1 %3473, float %3470, float %246, !dbg !31 + %3475 = fcmp ogt float %3474, %247, !dbg !27 + %3476 = fcmp uno float %3474, 0.000000e+00, !dbg !29 + %3477 = or i1 %3475, %3476, !dbg !30 + %3478 = select i1 %3477, float %3474, float %247, !dbg !31 + %3479 = fcmp ogt float %3478, %248, !dbg !27 + %3480 = fcmp uno float %3478, 0.000000e+00, !dbg !29 + %3481 = or i1 %3479, %3480, !dbg !30 + %3482 = select i1 %3481, float %3478, float %248, !dbg !31 + %3483 = fcmp ogt float %3482, %249, !dbg !27 + %3484 = fcmp uno float %3482, 0.000000e+00, !dbg !29 + %3485 = or i1 %3483, %3484, !dbg !30 + %3486 = select i1 %3485, float %3482, float %249, !dbg !31 + %3487 = fcmp ogt float %3486, %250, !dbg !27 + %3488 = fcmp uno float %3486, 0.000000e+00, !dbg !29 + %3489 = or i1 %3487, %3488, !dbg !30 + %3490 = select i1 %3489, float %3486, float %250, !dbg !31 + %3491 = fcmp ogt float %3490, %251, !dbg !27 + %3492 = fcmp uno float %3490, 0.000000e+00, !dbg !29 + %3493 = or i1 %3491, %3492, !dbg !30 + %3494 = select i1 %3493, float %3490, float %251, !dbg !31 + %3495 = fcmp ogt float %3494, %252, !dbg !27 + %3496 = fcmp uno float %3494, 0.000000e+00, !dbg !29 + %3497 = or i1 %3495, %3496, !dbg !30 + %3498 = select i1 %3497, float %3494, float %252, !dbg !31 + %3499 = fcmp ogt float %3498, %253, !dbg !27 + %3500 = fcmp uno float %3498, 0.000000e+00, !dbg !29 + %3501 = or i1 %3499, %3500, !dbg !30 + %3502 = select i1 %3501, float %3498, float %253, !dbg !31 + %3503 = fcmp ogt float %3502, %254, !dbg !27 + %3504 = fcmp uno float %3502, 0.000000e+00, !dbg !29 + %3505 = or i1 %3503, %3504, !dbg !30 + %3506 = select i1 %3505, float %3502, float %254, !dbg !31 + %3507 = fcmp ogt float %3506, %255, !dbg !27 + %3508 = fcmp uno float %3506, 0.000000e+00, !dbg !29 + %3509 = or i1 %3507, %3508, !dbg !30 + %3510 = select i1 %3509, float %3506, float %255, !dbg !31 + %3511 = fcmp ogt float %3510, %256, !dbg !27 + %3512 = fcmp uno float %3510, 0.000000e+00, !dbg !29 + %3513 = or i1 %3511, %3512, !dbg !30 + %3514 = select i1 %3513, float %3510, float %256, !dbg !31 + %3515 = fcmp ogt float %3514, %257, !dbg !27 + %3516 = fcmp uno float %3514, 0.000000e+00, !dbg !29 + %3517 = or i1 %3515, %3516, !dbg !30 + %3518 = select i1 %3517, float %3514, float %257, !dbg !31 + %3519 = fcmp ogt float %3518, %258, !dbg !27 + %3520 = fcmp uno float %3518, 0.000000e+00, !dbg !29 + %3521 = or i1 %3519, %3520, !dbg !30 + %3522 = select i1 %3521, float %3518, float %258, !dbg !31 + %3523 = fcmp ogt float %3522, %259, !dbg !27 + %3524 = fcmp uno float %3522, 0.000000e+00, !dbg !29 + %3525 = or i1 %3523, %3524, !dbg !30 + %3526 = select i1 %3525, float %3522, float %259, !dbg !31 + %3527 = fcmp ogt float %3526, %260, !dbg !27 + %3528 = fcmp uno float %3526, 0.000000e+00, !dbg !29 + %3529 = or i1 %3527, %3528, !dbg !30 + %3530 = select i1 %3529, float %3526, float %260, !dbg !31 + %3531 = fcmp ogt float %3530, %261, !dbg !27 + %3532 = fcmp uno float %3530, 0.000000e+00, !dbg !29 + %3533 = or i1 %3531, %3532, !dbg !30 + %3534 = select i1 %3533, float %3530, float %261, !dbg !31 + %3535 = fcmp ogt float %3534, %262, !dbg !27 + %3536 = fcmp uno float %3534, 0.000000e+00, !dbg !29 + %3537 = or i1 %3535, %3536, !dbg !30 + %3538 = select i1 %3537, float %3534, float %262, !dbg !31 + %3539 = fcmp ogt float %3538, %263, !dbg !27 + %3540 = fcmp uno float %3538, 0.000000e+00, !dbg !29 + %3541 = or i1 %3539, %3540, !dbg !30 + %3542 = select i1 %3541, float %3538, float %263, !dbg !31 + %3543 = fcmp ogt float %3542, %264, !dbg !27 + %3544 = fcmp uno float %3542, 0.000000e+00, !dbg !29 + %3545 = or i1 %3543, %3544, !dbg !30 + %3546 = select i1 %3545, float %3542, float %264, !dbg !31 + %3547 = fcmp ogt float %3546, %265, !dbg !27 + %3548 = fcmp uno float %3546, 0.000000e+00, !dbg !29 + %3549 = or i1 %3547, %3548, !dbg !30 + %3550 = select i1 %3549, float %3546, float %265, !dbg !31 + %3551 = fcmp ogt float %3550, %266, !dbg !27 + %3552 = fcmp uno float %3550, 0.000000e+00, !dbg !29 + %3553 = or i1 %3551, %3552, !dbg !30 + %3554 = select i1 %3553, float %3550, float %266, !dbg !31 + %3555 = fcmp ogt float %3554, %3325, !dbg !27 + %3556 = fcmp uno float %3554, 0.000000e+00, !dbg !29 + %3557 = or i1 %3555, %3556, !dbg !30 + %3558 = select i1 %3557, float %3554, float %3325, !dbg !31 + %3559 = fcmp ogt float %3558, %3326, !dbg !27 + %3560 = fcmp uno float %3558, 0.000000e+00, !dbg !29 + %3561 = or i1 %3559, %3560, !dbg !30 + %3562 = select i1 %3561, float %3558, float %3326, !dbg !31 + %3563 = fcmp ogt float %3562, %3327, !dbg !27 + %3564 = fcmp uno float %3562, 0.000000e+00, !dbg !29 + %3565 = or i1 %3563, %3564, !dbg !30 + %3566 = select i1 %3565, float %3562, float %3327, !dbg !31 + %3567 = fcmp ogt float %3566, %3328, !dbg !27 + %3568 = fcmp uno float %3566, 0.000000e+00, !dbg !29 + %3569 = or i1 %3567, %3568, !dbg !30 + %3570 = select i1 %3569, float %3566, float %3328, !dbg !31 + %3571 = fcmp ogt float %3570, %3329, !dbg !27 + %3572 = fcmp uno float %3570, 0.000000e+00, !dbg !29 + %3573 = or i1 %3571, %3572, !dbg !30 + %3574 = select i1 %3573, float %3570, float %3329, !dbg !31 + %3575 = fcmp ogt float %3574, %3330, !dbg !27 + %3576 = fcmp uno float %3574, 0.000000e+00, !dbg !29 + %3577 = or i1 %3575, %3576, !dbg !30 + %3578 = select i1 %3577, float %3574, float %3330, !dbg !31 + %3579 = fcmp ogt float %3578, %3331, !dbg !27 + %3580 = fcmp uno float %3578, 0.000000e+00, !dbg !29 + %3581 = or i1 %3579, %3580, !dbg !30 + %3582 = select i1 %3581, float %3578, float %3331, !dbg !31 + %3583 = fcmp ogt float %3582, %3332, !dbg !27 + %3584 = fcmp uno float %3582, 0.000000e+00, !dbg !29 + %3585 = or i1 %3583, %3584, !dbg !30 + %3586 = select i1 %3585, float %3582, float %3332, !dbg !31 + %3587 = bitcast float %3586 to i32, !dbg !32 + %3588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3587, i32 16, i32 31), !dbg !32 + %3589 = bitcast i32 %3588 to float, !dbg !32 + %3590 = fcmp ogt float %3586, %3589, !dbg !27 + %3591 = fcmp uno float %3586, 0.000000e+00, !dbg !29 + %3592 = or i1 %3591, %3590, !dbg !30 + %3593 = select i1 %3592, float %3586, float %3589, !dbg !31 + %3594 = bitcast float %3593 to i32, !dbg !32 + %3595 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3594, i32 8, i32 31), !dbg !32 + %3596 = bitcast i32 %3595 to float, !dbg !32 + %3597 = fcmp ogt float %3593, %3596, !dbg !27 + %3598 = fcmp uno float %3593, 0.000000e+00, !dbg !29 + %3599 = or i1 %3597, %3598, !dbg !30 + %3600 = select i1 %3599, float %3593, float %3596, !dbg !31 + %3601 = bitcast float %3600 to i32, !dbg !32 + %3602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3601, i32 4, i32 31), !dbg !32 + %3603 = bitcast i32 %3602 to float, !dbg !32 + %3604 = fcmp ogt float %3600, %3603, !dbg !27 + %3605 = fcmp uno float %3600, 0.000000e+00, !dbg !29 + %3606 = or i1 %3604, %3605, !dbg !30 + %3607 = select i1 %3606, float %3600, float %3603, !dbg !31 + %3608 = bitcast float %3607 to i32, !dbg !32 + %3609 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3608, i32 2, i32 31), !dbg !32 + %3610 = bitcast i32 %3609 to float, !dbg !32 + %3611 = fcmp ogt float %3607, %3610, !dbg !27 + %3612 = fcmp uno float %3607, 0.000000e+00, !dbg !29 + %3613 = or i1 %3611, %3612, !dbg !30 + %3614 = select i1 %3613, float %3607, float %3610, !dbg !31 + %3615 = bitcast float %3614 to i32, !dbg !32 + %3616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3615, i32 1, i32 31), !dbg !32 + %3617 = bitcast i32 %3616 to float, !dbg !32 + %3618 = fcmp ogt float %3614, %3617, !dbg !27 + %3619 = fcmp uno float %3614, 0.000000e+00, !dbg !29 + %3620 = or i1 %3618, %3619, !dbg !30 + %3621 = and i32 %10, 15, !dbg !32 + %3622 = icmp eq i32 %9, 0, !dbg !32 + %3623 = getelementptr float, ptr addrspace(3) @global_smem, i32 %3621, !dbg !32 + %3624 = select i1 %3620, i32 %3615, i32 %3616, !dbg !31 + %3625 = insertelement <1 x i32> poison, i32 %3624, i64 0, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3623, <1 x i32> %3625, i1 %3622) #6, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %3626 = icmp samesign ult i32 %8, 16, !dbg !32 + %3627 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !32 + %3628 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %3627, i1 %3626) #6, !dbg !32 + %3629 = bitcast i32 %3628 to float, !dbg !32 + %3630 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3628, i32 8, i32 31), !dbg !32 + %3631 = bitcast i32 %3630 to float, !dbg !32 + %3632 = fcmp ogt float %3629, %3631, !dbg !27 + %3633 = fcmp uno float %3629, 0.000000e+00, !dbg !29 + %3634 = or i1 %3633, %3632, !dbg !30 + %3635 = select i1 %3634, float %3629, float %3631, !dbg !31 + %3636 = bitcast float %3635 to i32, !dbg !32 + %3637 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3636, i32 4, i32 31), !dbg !32 + %3638 = bitcast i32 %3637 to float, !dbg !32 + %3639 = fcmp ogt float %3635, %3638, !dbg !27 + %3640 = fcmp uno float %3635, 0.000000e+00, !dbg !29 + %3641 = or i1 %3639, %3640, !dbg !30 + %3642 = select i1 %3641, float %3635, float %3638, !dbg !31 + %3643 = bitcast float %3642 to i32, !dbg !32 + %3644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3643, i32 2, i32 31), !dbg !32 + %3645 = bitcast i32 %3644 to float, !dbg !32 + %3646 = fcmp ogt float %3642, %3645, !dbg !27 + %3647 = fcmp uno float %3642, 0.000000e+00, !dbg !29 + %3648 = or i1 %3646, %3647, !dbg !30 + %3649 = select i1 %3648, float %3642, float %3645, !dbg !31 + %3650 = bitcast float %3649 to i32, !dbg !32 + %3651 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3650, i32 1, i32 31), !dbg !32 + %3652 = bitcast i32 %3651 to float, !dbg !32 + %3653 = fcmp ogt float %3649, %3652, !dbg !27 + %3654 = fcmp uno float %3649, 0.000000e+00, !dbg !29 + %3655 = or i1 %3653, %3654, !dbg !30 + %3656 = icmp eq i32 %8, 0, !dbg !32 + %3657 = select i1 %3655, i32 %3650, i32 %3651, !dbg !31 + %3658 = insertelement <1 x i32> poison, i32 %3657, i64 0, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3627, <1 x i32> %3658, i1 %3656) #6, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %3659 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !32 + %3660 = fcmp oeq float %3659, 0xFFF0000000000000, !dbg !33 + %3661 = fsub float %211, %3659, !dbg !34 + %3662 = fsub float %212, %3659, !dbg !34 + %3663 = fsub float %213, %3659, !dbg !34 + %3664 = fsub float %214, %3659, !dbg !34 + %3665 = fsub float %215, %3659, !dbg !34 + %3666 = fsub float %216, %3659, !dbg !34 + %3667 = fsub float %217, %3659, !dbg !34 + %3668 = fsub float %218, %3659, !dbg !34 + %3669 = fsub float %219, %3659, !dbg !34 + %3670 = fsub float %220, %3659, !dbg !34 + %3671 = fsub float %221, %3659, !dbg !34 + %3672 = fsub float %222, %3659, !dbg !34 + %3673 = fsub float %223, %3659, !dbg !34 + %3674 = fsub float %224, %3659, !dbg !34 + %3675 = fsub float %225, %3659, !dbg !34 + %3676 = fsub float %226, %3659, !dbg !34 + %3677 = fsub float %227, %3659, !dbg !34 + %3678 = fsub float %228, %3659, !dbg !34 + %3679 = fsub float %229, %3659, !dbg !34 + %3680 = fsub float %230, %3659, !dbg !34 + %3681 = fsub float %231, %3659, !dbg !34 + %3682 = fsub float %232, %3659, !dbg !34 + %3683 = fsub float %233, %3659, !dbg !34 + %3684 = fsub float %234, %3659, !dbg !34 + %3685 = fsub float %235, %3659, !dbg !34 + %3686 = fsub float %236, %3659, !dbg !34 + %3687 = fsub float %237, %3659, !dbg !34 + %3688 = fsub float %238, %3659, !dbg !34 + %3689 = fsub float %239, %3659, !dbg !34 + %3690 = fsub float %240, %3659, !dbg !34 + %3691 = fsub float %241, %3659, !dbg !34 + %3692 = fsub float %242, %3659, !dbg !34 + %3693 = fsub float %243, %3659, !dbg !34 + %3694 = fsub float %244, %3659, !dbg !34 + %3695 = fsub float %245, %3659, !dbg !34 + %3696 = fsub float %246, %3659, !dbg !34 + %3697 = fsub float %247, %3659, !dbg !34 + %3698 = fsub float %248, %3659, !dbg !34 + %3699 = fsub float %249, %3659, !dbg !34 + %3700 = fsub float %250, %3659, !dbg !34 + %3701 = fsub float %251, %3659, !dbg !34 + %3702 = fsub float %252, %3659, !dbg !34 + %3703 = fsub float %253, %3659, !dbg !34 + %3704 = fsub float %254, %3659, !dbg !34 + %3705 = fsub float %255, %3659, !dbg !34 + %3706 = fsub float %256, %3659, !dbg !34 + %3707 = fsub float %257, %3659, !dbg !34 + %3708 = fsub float %258, %3659, !dbg !34 + %3709 = fsub float %259, %3659, !dbg !34 + %3710 = fsub float %260, %3659, !dbg !34 + %3711 = fsub float %261, %3659, !dbg !34 + %3712 = fsub float %262, %3659, !dbg !34 + %3713 = fsub float %263, %3659, !dbg !34 + %3714 = fsub float %264, %3659, !dbg !34 + %3715 = fsub float %265, %3659, !dbg !34 + %3716 = fsub float %266, %3659, !dbg !34 + %3717 = fsub float %3325, %3659, !dbg !34 + %3718 = fsub float %3326, %3659, !dbg !34 + %3719 = fsub float %3327, %3659, !dbg !34 + %3720 = fsub float %3328, %3659, !dbg !34 + %3721 = fsub float %3329, %3659, !dbg !34 + %3722 = fsub float %3330, %3659, !dbg !34 + %3723 = fsub float %3331, %3659, !dbg !34 + %3724 = fsub float %3332, %3659, !dbg !34 + %3725 = select i1 %3660, float 0.000000e+00, float %3661, !dbg !35 + %3726 = select i1 %3660, float 0.000000e+00, float %3662, !dbg !35 + %3727 = select i1 %3660, float 0.000000e+00, float %3663, !dbg !35 + %3728 = select i1 %3660, float 0.000000e+00, float %3664, !dbg !35 + %3729 = select i1 %3660, float 0.000000e+00, float %3665, !dbg !35 + %3730 = select i1 %3660, float 0.000000e+00, float %3666, !dbg !35 + %3731 = select i1 %3660, float 0.000000e+00, float %3667, !dbg !35 + %3732 = select i1 %3660, float 0.000000e+00, float %3668, !dbg !35 + %3733 = select i1 %3660, float 0.000000e+00, float %3669, !dbg !35 + %3734 = select i1 %3660, float 0.000000e+00, float %3670, !dbg !35 + %3735 = select i1 %3660, float 0.000000e+00, float %3671, !dbg !35 + %3736 = select i1 %3660, float 0.000000e+00, float %3672, !dbg !35 + %3737 = select i1 %3660, float 0.000000e+00, float %3673, !dbg !35 + %3738 = select i1 %3660, float 0.000000e+00, float %3674, !dbg !35 + %3739 = select i1 %3660, float 0.000000e+00, float %3675, !dbg !35 + %3740 = select i1 %3660, float 0.000000e+00, float %3676, !dbg !35 + %3741 = select i1 %3660, float 0.000000e+00, float %3677, !dbg !35 + %3742 = select i1 %3660, float 0.000000e+00, float %3678, !dbg !35 + %3743 = select i1 %3660, float 0.000000e+00, float %3679, !dbg !35 + %3744 = select i1 %3660, float 0.000000e+00, float %3680, !dbg !35 + %3745 = select i1 %3660, float 0.000000e+00, float %3681, !dbg !35 + %3746 = select i1 %3660, float 0.000000e+00, float %3682, !dbg !35 + %3747 = select i1 %3660, float 0.000000e+00, float %3683, !dbg !35 + %3748 = select i1 %3660, float 0.000000e+00, float %3684, !dbg !35 + %3749 = select i1 %3660, float 0.000000e+00, float %3685, !dbg !35 + %3750 = select i1 %3660, float 0.000000e+00, float %3686, !dbg !35 + %3751 = select i1 %3660, float 0.000000e+00, float %3687, !dbg !35 + %3752 = select i1 %3660, float 0.000000e+00, float %3688, !dbg !35 + %3753 = select i1 %3660, float 0.000000e+00, float %3689, !dbg !35 + %3754 = select i1 %3660, float 0.000000e+00, float %3690, !dbg !35 + %3755 = select i1 %3660, float 0.000000e+00, float %3691, !dbg !35 + %3756 = select i1 %3660, float 0.000000e+00, float %3692, !dbg !35 + %3757 = select i1 %3660, float 0.000000e+00, float %3693, !dbg !35 + %3758 = select i1 %3660, float 0.000000e+00, float %3694, !dbg !35 + %3759 = select i1 %3660, float 0.000000e+00, float %3695, !dbg !35 + %3760 = select i1 %3660, float 0.000000e+00, float %3696, !dbg !35 + %3761 = select i1 %3660, float 0.000000e+00, float %3697, !dbg !35 + %3762 = select i1 %3660, float 0.000000e+00, float %3698, !dbg !35 + %3763 = select i1 %3660, float 0.000000e+00, float %3699, !dbg !35 + %3764 = select i1 %3660, float 0.000000e+00, float %3700, !dbg !35 + %3765 = select i1 %3660, float 0.000000e+00, float %3701, !dbg !35 + %3766 = select i1 %3660, float 0.000000e+00, float %3702, !dbg !35 + %3767 = select i1 %3660, float 0.000000e+00, float %3703, !dbg !35 + %3768 = select i1 %3660, float 0.000000e+00, float %3704, !dbg !35 + %3769 = select i1 %3660, float 0.000000e+00, float %3705, !dbg !35 + %3770 = select i1 %3660, float 0.000000e+00, float %3706, !dbg !35 + %3771 = select i1 %3660, float 0.000000e+00, float %3707, !dbg !35 + %3772 = select i1 %3660, float 0.000000e+00, float %3708, !dbg !35 + %3773 = select i1 %3660, float 0.000000e+00, float %3709, !dbg !35 + %3774 = select i1 %3660, float 0.000000e+00, float %3710, !dbg !35 + %3775 = select i1 %3660, float 0.000000e+00, float %3711, !dbg !35 + %3776 = select i1 %3660, float 0.000000e+00, float %3712, !dbg !35 + %3777 = select i1 %3660, float 0.000000e+00, float %3713, !dbg !35 + %3778 = select i1 %3660, float 0.000000e+00, float %3714, !dbg !35 + %3779 = select i1 %3660, float 0.000000e+00, float %3715, !dbg !35 + %3780 = select i1 %3660, float 0.000000e+00, float %3716, !dbg !35 + %3781 = select i1 %3660, float 0.000000e+00, float %3717, !dbg !35 + %3782 = select i1 %3660, float 0.000000e+00, float %3718, !dbg !35 + %3783 = select i1 %3660, float 0.000000e+00, float %3719, !dbg !35 + %3784 = select i1 %3660, float 0.000000e+00, float %3720, !dbg !35 + %3785 = select i1 %3660, float 0.000000e+00, float %3721, !dbg !35 + %3786 = select i1 %3660, float 0.000000e+00, float %3722, !dbg !35 + %3787 = select i1 %3660, float 0.000000e+00, float %3723, !dbg !35 + %3788 = select i1 %3660, float 0.000000e+00, float %3724, !dbg !35 + %3789 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1397 = icmp eq i32 %3789, 0, !dbg !36 + %3790 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3791 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1398 = select i1 %.not.i1397, float %3791, float %3790, !dbg !36 + %3792 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1399 = icmp eq i32 %3792, 0, !dbg !36 + %3793 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1398) #6, !dbg !36 + %3794 = tail call float @llvm.nvvm.saturate.f(float %.02.i1398) #6, !dbg !36 + %.03.i1400 = select i1 %.not1.i1399, float %3794, float %3793, !dbg !36 + %3795 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1401 = icmp eq i32 %3795, 0, !dbg !36 + %3796 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3797 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1402 = select i1 %.not2.i1401, float %3797, float %3796, !dbg !36 + %3798 = fadd float %.04.i1402, 0xC168000FE0000000, !dbg !36 + %3799 = fneg float %3798, !dbg !36 + %3800 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1403 = icmp eq i32 %3800, 0, !dbg !36 + %3801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3FF7154760000000, float %3799) #6, !dbg !36 + %3802 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3FF7154760000000, float %3799) #6, !dbg !36 + %.0.i1404 = select i1 %.not3.i1403, float %3802, float %3801, !dbg !36 + %3803 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1405 = icmp eq i32 %3803, 0, !dbg !36 + %3804 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3E54AE0C00000000, float %.0.i1404) #6, !dbg !36 + %3805 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3E54AE0C00000000, float %.0.i1404) #6, !dbg !36 + %.01.i1406 = select i1 %.not4.i1405, float %3805, float %3804, !dbg !36 + %3806 = bitcast float %.04.i1402 to i32, !dbg !36 + %3807 = shl i32 %3806, 23, !dbg !36 + %3808 = bitcast i32 %3807 to float, !dbg !36 + %3809 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1406) #6, !dbg !36 + %3810 = fmul float %3809, %3808, !dbg !36 + %3811 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1407 = icmp eq i32 %3811, 0, !dbg !36 + %3812 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3813 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1408 = select i1 %.not.i1407, float %3813, float %3812, !dbg !36 + %3814 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1409 = icmp eq i32 %3814, 0, !dbg !36 + %3815 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1408) #6, !dbg !36 + %3816 = tail call float @llvm.nvvm.saturate.f(float %.02.i1408) #6, !dbg !36 + %.03.i1410 = select i1 %.not1.i1409, float %3816, float %3815, !dbg !36 + %3817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1411 = icmp eq i32 %3817, 0, !dbg !36 + %3818 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3819 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1412 = select i1 %.not2.i1411, float %3819, float %3818, !dbg !36 + %3820 = fadd float %.04.i1412, 0xC168000FE0000000, !dbg !36 + %3821 = fneg float %3820, !dbg !36 + %3822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1413 = icmp eq i32 %3822, 0, !dbg !36 + %3823 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3FF7154760000000, float %3821) #6, !dbg !36 + %3824 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3FF7154760000000, float %3821) #6, !dbg !36 + %.0.i1414 = select i1 %.not3.i1413, float %3824, float %3823, !dbg !36 + %3825 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1415 = icmp eq i32 %3825, 0, !dbg !36 + %3826 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3E54AE0C00000000, float %.0.i1414) #6, !dbg !36 + %3827 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3E54AE0C00000000, float %.0.i1414) #6, !dbg !36 + %.01.i1416 = select i1 %.not4.i1415, float %3827, float %3826, !dbg !36 + %3828 = bitcast float %.04.i1412 to i32, !dbg !36 + %3829 = shl i32 %3828, 23, !dbg !36 + %3830 = bitcast i32 %3829 to float, !dbg !36 + %3831 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1416) #6, !dbg !36 + %3832 = fmul float %3831, %3830, !dbg !36 + %3833 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1417 = icmp eq i32 %3833, 0, !dbg !36 + %3834 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3835 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1418 = select i1 %.not.i1417, float %3835, float %3834, !dbg !36 + %3836 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1419 = icmp eq i32 %3836, 0, !dbg !36 + %3837 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1418) #6, !dbg !36 + %3838 = tail call float @llvm.nvvm.saturate.f(float %.02.i1418) #6, !dbg !36 + %.03.i1420 = select i1 %.not1.i1419, float %3838, float %3837, !dbg !36 + %3839 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1421 = icmp eq i32 %3839, 0, !dbg !36 + %3840 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3841 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1422 = select i1 %.not2.i1421, float %3841, float %3840, !dbg !36 + %3842 = fadd float %.04.i1422, 0xC168000FE0000000, !dbg !36 + %3843 = fneg float %3842, !dbg !36 + %3844 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1423 = icmp eq i32 %3844, 0, !dbg !36 + %3845 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3FF7154760000000, float %3843) #6, !dbg !36 + %3846 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3FF7154760000000, float %3843) #6, !dbg !36 + %.0.i1424 = select i1 %.not3.i1423, float %3846, float %3845, !dbg !36 + %3847 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1425 = icmp eq i32 %3847, 0, !dbg !36 + %3848 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3E54AE0C00000000, float %.0.i1424) #6, !dbg !36 + %3849 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3E54AE0C00000000, float %.0.i1424) #6, !dbg !36 + %.01.i1426 = select i1 %.not4.i1425, float %3849, float %3848, !dbg !36 + %3850 = bitcast float %.04.i1422 to i32, !dbg !36 + %3851 = shl i32 %3850, 23, !dbg !36 + %3852 = bitcast i32 %3851 to float, !dbg !36 + %3853 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1426) #6, !dbg !36 + %3854 = fmul float %3853, %3852, !dbg !36 + %3855 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1427 = icmp eq i32 %3855, 0, !dbg !36 + %3856 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3857 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1428 = select i1 %.not.i1427, float %3857, float %3856, !dbg !36 + %3858 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1429 = icmp eq i32 %3858, 0, !dbg !36 + %3859 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1428) #6, !dbg !36 + %3860 = tail call float @llvm.nvvm.saturate.f(float %.02.i1428) #6, !dbg !36 + %.03.i1430 = select i1 %.not1.i1429, float %3860, float %3859, !dbg !36 + %3861 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1431 = icmp eq i32 %3861, 0, !dbg !36 + %3862 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3863 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1432 = select i1 %.not2.i1431, float %3863, float %3862, !dbg !36 + %3864 = fadd float %.04.i1432, 0xC168000FE0000000, !dbg !36 + %3865 = fneg float %3864, !dbg !36 + %3866 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1433 = icmp eq i32 %3866, 0, !dbg !36 + %3867 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3FF7154760000000, float %3865) #6, !dbg !36 + %3868 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3FF7154760000000, float %3865) #6, !dbg !36 + %.0.i1434 = select i1 %.not3.i1433, float %3868, float %3867, !dbg !36 + %3869 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1435 = icmp eq i32 %3869, 0, !dbg !36 + %3870 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3E54AE0C00000000, float %.0.i1434) #6, !dbg !36 + %3871 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3E54AE0C00000000, float %.0.i1434) #6, !dbg !36 + %.01.i1436 = select i1 %.not4.i1435, float %3871, float %3870, !dbg !36 + %3872 = bitcast float %.04.i1432 to i32, !dbg !36 + %3873 = shl i32 %3872, 23, !dbg !36 + %3874 = bitcast i32 %3873 to float, !dbg !36 + %3875 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1436) #6, !dbg !36 + %3876 = fmul float %3875, %3874, !dbg !36 + %3877 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1437 = icmp eq i32 %3877, 0, !dbg !36 + %3878 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3879 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1438 = select i1 %.not.i1437, float %3879, float %3878, !dbg !36 + %3880 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1439 = icmp eq i32 %3880, 0, !dbg !36 + %3881 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1438) #6, !dbg !36 + %3882 = tail call float @llvm.nvvm.saturate.f(float %.02.i1438) #6, !dbg !36 + %.03.i1440 = select i1 %.not1.i1439, float %3882, float %3881, !dbg !36 + %3883 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1441 = icmp eq i32 %3883, 0, !dbg !36 + %3884 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3885 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1442 = select i1 %.not2.i1441, float %3885, float %3884, !dbg !36 + %3886 = fadd float %.04.i1442, 0xC168000FE0000000, !dbg !36 + %3887 = fneg float %3886, !dbg !36 + %3888 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1443 = icmp eq i32 %3888, 0, !dbg !36 + %3889 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3FF7154760000000, float %3887) #6, !dbg !36 + %3890 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3FF7154760000000, float %3887) #6, !dbg !36 + %.0.i1444 = select i1 %.not3.i1443, float %3890, float %3889, !dbg !36 + %3891 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1445 = icmp eq i32 %3891, 0, !dbg !36 + %3892 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3E54AE0C00000000, float %.0.i1444) #6, !dbg !36 + %3893 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3E54AE0C00000000, float %.0.i1444) #6, !dbg !36 + %.01.i1446 = select i1 %.not4.i1445, float %3893, float %3892, !dbg !36 + %3894 = bitcast float %.04.i1442 to i32, !dbg !36 + %3895 = shl i32 %3894, 23, !dbg !36 + %3896 = bitcast i32 %3895 to float, !dbg !36 + %3897 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1446) #6, !dbg !36 + %3898 = fmul float %3897, %3896, !dbg !36 + %3899 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1447 = icmp eq i32 %3899, 0, !dbg !36 + %3900 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3901 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1448 = select i1 %.not.i1447, float %3901, float %3900, !dbg !36 + %3902 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1449 = icmp eq i32 %3902, 0, !dbg !36 + %3903 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1448) #6, !dbg !36 + %3904 = tail call float @llvm.nvvm.saturate.f(float %.02.i1448) #6, !dbg !36 + %.03.i1450 = select i1 %.not1.i1449, float %3904, float %3903, !dbg !36 + %3905 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1451 = icmp eq i32 %3905, 0, !dbg !36 + %3906 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3907 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1452 = select i1 %.not2.i1451, float %3907, float %3906, !dbg !36 + %3908 = fadd float %.04.i1452, 0xC168000FE0000000, !dbg !36 + %3909 = fneg float %3908, !dbg !36 + %3910 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1453 = icmp eq i32 %3910, 0, !dbg !36 + %3911 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3FF7154760000000, float %3909) #6, !dbg !36 + %3912 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3FF7154760000000, float %3909) #6, !dbg !36 + %.0.i1454 = select i1 %.not3.i1453, float %3912, float %3911, !dbg !36 + %3913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1455 = icmp eq i32 %3913, 0, !dbg !36 + %3914 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3E54AE0C00000000, float %.0.i1454) #6, !dbg !36 + %3915 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3E54AE0C00000000, float %.0.i1454) #6, !dbg !36 + %.01.i1456 = select i1 %.not4.i1455, float %3915, float %3914, !dbg !36 + %3916 = bitcast float %.04.i1452 to i32, !dbg !36 + %3917 = shl i32 %3916, 23, !dbg !36 + %3918 = bitcast i32 %3917 to float, !dbg !36 + %3919 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1456) #6, !dbg !36 + %3920 = fmul float %3919, %3918, !dbg !36 + %3921 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1457 = icmp eq i32 %3921, 0, !dbg !36 + %3922 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3923 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1458 = select i1 %.not.i1457, float %3923, float %3922, !dbg !36 + %3924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1459 = icmp eq i32 %3924, 0, !dbg !36 + %3925 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1458) #6, !dbg !36 + %3926 = tail call float @llvm.nvvm.saturate.f(float %.02.i1458) #6, !dbg !36 + %.03.i1460 = select i1 %.not1.i1459, float %3926, float %3925, !dbg !36 + %3927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1461 = icmp eq i32 %3927, 0, !dbg !36 + %3928 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3929 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1462 = select i1 %.not2.i1461, float %3929, float %3928, !dbg !36 + %3930 = fadd float %.04.i1462, 0xC168000FE0000000, !dbg !36 + %3931 = fneg float %3930, !dbg !36 + %3932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1463 = icmp eq i32 %3932, 0, !dbg !36 + %3933 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3FF7154760000000, float %3931) #6, !dbg !36 + %3934 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3FF7154760000000, float %3931) #6, !dbg !36 + %.0.i1464 = select i1 %.not3.i1463, float %3934, float %3933, !dbg !36 + %3935 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1465 = icmp eq i32 %3935, 0, !dbg !36 + %3936 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3E54AE0C00000000, float %.0.i1464) #6, !dbg !36 + %3937 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3E54AE0C00000000, float %.0.i1464) #6, !dbg !36 + %.01.i1466 = select i1 %.not4.i1465, float %3937, float %3936, !dbg !36 + %3938 = bitcast float %.04.i1462 to i32, !dbg !36 + %3939 = shl i32 %3938, 23, !dbg !36 + %3940 = bitcast i32 %3939 to float, !dbg !36 + %3941 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1466) #6, !dbg !36 + %3942 = fmul float %3941, %3940, !dbg !36 + %3943 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1467 = icmp eq i32 %3943, 0, !dbg !36 + %3944 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3945 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1468 = select i1 %.not.i1467, float %3945, float %3944, !dbg !36 + %3946 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1469 = icmp eq i32 %3946, 0, !dbg !36 + %3947 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1468) #6, !dbg !36 + %3948 = tail call float @llvm.nvvm.saturate.f(float %.02.i1468) #6, !dbg !36 + %.03.i1470 = select i1 %.not1.i1469, float %3948, float %3947, !dbg !36 + %3949 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1471 = icmp eq i32 %3949, 0, !dbg !36 + %3950 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3951 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1472 = select i1 %.not2.i1471, float %3951, float %3950, !dbg !36 + %3952 = fadd float %.04.i1472, 0xC168000FE0000000, !dbg !36 + %3953 = fneg float %3952, !dbg !36 + %3954 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1473 = icmp eq i32 %3954, 0, !dbg !36 + %3955 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3FF7154760000000, float %3953) #6, !dbg !36 + %3956 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3FF7154760000000, float %3953) #6, !dbg !36 + %.0.i1474 = select i1 %.not3.i1473, float %3956, float %3955, !dbg !36 + %3957 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1475 = icmp eq i32 %3957, 0, !dbg !36 + %3958 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3E54AE0C00000000, float %.0.i1474) #6, !dbg !36 + %3959 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3E54AE0C00000000, float %.0.i1474) #6, !dbg !36 + %.01.i1476 = select i1 %.not4.i1475, float %3959, float %3958, !dbg !36 + %3960 = bitcast float %.04.i1472 to i32, !dbg !36 + %3961 = shl i32 %3960, 23, !dbg !36 + %3962 = bitcast i32 %3961 to float, !dbg !36 + %3963 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1476) #6, !dbg !36 + %3964 = fmul float %3963, %3962, !dbg !36 + %3965 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1477 = icmp eq i32 %3965, 0, !dbg !36 + %3966 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3967 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1478 = select i1 %.not.i1477, float %3967, float %3966, !dbg !36 + %3968 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1479 = icmp eq i32 %3968, 0, !dbg !36 + %3969 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1478) #6, !dbg !36 + %3970 = tail call float @llvm.nvvm.saturate.f(float %.02.i1478) #6, !dbg !36 + %.03.i1480 = select i1 %.not1.i1479, float %3970, float %3969, !dbg !36 + %3971 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1481 = icmp eq i32 %3971, 0, !dbg !36 + %3972 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3973 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1482 = select i1 %.not2.i1481, float %3973, float %3972, !dbg !36 + %3974 = fadd float %.04.i1482, 0xC168000FE0000000, !dbg !36 + %3975 = fneg float %3974, !dbg !36 + %3976 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1483 = icmp eq i32 %3976, 0, !dbg !36 + %3977 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3FF7154760000000, float %3975) #6, !dbg !36 + %3978 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3FF7154760000000, float %3975) #6, !dbg !36 + %.0.i1484 = select i1 %.not3.i1483, float %3978, float %3977, !dbg !36 + %3979 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1485 = icmp eq i32 %3979, 0, !dbg !36 + %3980 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3E54AE0C00000000, float %.0.i1484) #6, !dbg !36 + %3981 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3E54AE0C00000000, float %.0.i1484) #6, !dbg !36 + %.01.i1486 = select i1 %.not4.i1485, float %3981, float %3980, !dbg !36 + %3982 = bitcast float %.04.i1482 to i32, !dbg !36 + %3983 = shl i32 %3982, 23, !dbg !36 + %3984 = bitcast i32 %3983 to float, !dbg !36 + %3985 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1486) #6, !dbg !36 + %3986 = fmul float %3985, %3984, !dbg !36 + %3987 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1487 = icmp eq i32 %3987, 0, !dbg !36 + %3988 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3989 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1488 = select i1 %.not.i1487, float %3989, float %3988, !dbg !36 + %3990 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1489 = icmp eq i32 %3990, 0, !dbg !36 + %3991 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1488) #6, !dbg !36 + %3992 = tail call float @llvm.nvvm.saturate.f(float %.02.i1488) #6, !dbg !36 + %.03.i1490 = select i1 %.not1.i1489, float %3992, float %3991, !dbg !36 + %3993 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1491 = icmp eq i32 %3993, 0, !dbg !36 + %3994 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3995 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1492 = select i1 %.not2.i1491, float %3995, float %3994, !dbg !36 + %3996 = fadd float %.04.i1492, 0xC168000FE0000000, !dbg !36 + %3997 = fneg float %3996, !dbg !36 + %3998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1493 = icmp eq i32 %3998, 0, !dbg !36 + %3999 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3FF7154760000000, float %3997) #6, !dbg !36 + %4000 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3FF7154760000000, float %3997) #6, !dbg !36 + %.0.i1494 = select i1 %.not3.i1493, float %4000, float %3999, !dbg !36 + %4001 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1495 = icmp eq i32 %4001, 0, !dbg !36 + %4002 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3E54AE0C00000000, float %.0.i1494) #6, !dbg !36 + %4003 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3E54AE0C00000000, float %.0.i1494) #6, !dbg !36 + %.01.i1496 = select i1 %.not4.i1495, float %4003, float %4002, !dbg !36 + %4004 = bitcast float %.04.i1492 to i32, !dbg !36 + %4005 = shl i32 %4004, 23, !dbg !36 + %4006 = bitcast i32 %4005 to float, !dbg !36 + %4007 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1496) #6, !dbg !36 + %4008 = fmul float %4007, %4006, !dbg !36 + %4009 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1497 = icmp eq i32 %4009, 0, !dbg !36 + %4010 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4011 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1498 = select i1 %.not.i1497, float %4011, float %4010, !dbg !36 + %4012 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1499 = icmp eq i32 %4012, 0, !dbg !36 + %4013 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1498) #6, !dbg !36 + %4014 = tail call float @llvm.nvvm.saturate.f(float %.02.i1498) #6, !dbg !36 + %.03.i1500 = select i1 %.not1.i1499, float %4014, float %4013, !dbg !36 + %4015 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1501 = icmp eq i32 %4015, 0, !dbg !36 + %4016 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4017 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1502 = select i1 %.not2.i1501, float %4017, float %4016, !dbg !36 + %4018 = fadd float %.04.i1502, 0xC168000FE0000000, !dbg !36 + %4019 = fneg float %4018, !dbg !36 + %4020 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1503 = icmp eq i32 %4020, 0, !dbg !36 + %4021 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3FF7154760000000, float %4019) #6, !dbg !36 + %4022 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3FF7154760000000, float %4019) #6, !dbg !36 + %.0.i1504 = select i1 %.not3.i1503, float %4022, float %4021, !dbg !36 + %4023 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1505 = icmp eq i32 %4023, 0, !dbg !36 + %4024 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3E54AE0C00000000, float %.0.i1504) #6, !dbg !36 + %4025 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3E54AE0C00000000, float %.0.i1504) #6, !dbg !36 + %.01.i1506 = select i1 %.not4.i1505, float %4025, float %4024, !dbg !36 + %4026 = bitcast float %.04.i1502 to i32, !dbg !36 + %4027 = shl i32 %4026, 23, !dbg !36 + %4028 = bitcast i32 %4027 to float, !dbg !36 + %4029 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1506) #6, !dbg !36 + %4030 = fmul float %4029, %4028, !dbg !36 + %4031 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1507 = icmp eq i32 %4031, 0, !dbg !36 + %4032 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4033 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1508 = select i1 %.not.i1507, float %4033, float %4032, !dbg !36 + %4034 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1509 = icmp eq i32 %4034, 0, !dbg !36 + %4035 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1508) #6, !dbg !36 + %4036 = tail call float @llvm.nvvm.saturate.f(float %.02.i1508) #6, !dbg !36 + %.03.i1510 = select i1 %.not1.i1509, float %4036, float %4035, !dbg !36 + %4037 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1511 = icmp eq i32 %4037, 0, !dbg !36 + %4038 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4039 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1512 = select i1 %.not2.i1511, float %4039, float %4038, !dbg !36 + %4040 = fadd float %.04.i1512, 0xC168000FE0000000, !dbg !36 + %4041 = fneg float %4040, !dbg !36 + %4042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1513 = icmp eq i32 %4042, 0, !dbg !36 + %4043 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3FF7154760000000, float %4041) #6, !dbg !36 + %4044 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3FF7154760000000, float %4041) #6, !dbg !36 + %.0.i1514 = select i1 %.not3.i1513, float %4044, float %4043, !dbg !36 + %4045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1515 = icmp eq i32 %4045, 0, !dbg !36 + %4046 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3E54AE0C00000000, float %.0.i1514) #6, !dbg !36 + %4047 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3E54AE0C00000000, float %.0.i1514) #6, !dbg !36 + %.01.i1516 = select i1 %.not4.i1515, float %4047, float %4046, !dbg !36 + %4048 = bitcast float %.04.i1512 to i32, !dbg !36 + %4049 = shl i32 %4048, 23, !dbg !36 + %4050 = bitcast i32 %4049 to float, !dbg !36 + %4051 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1516) #6, !dbg !36 + %4052 = fmul float %4051, %4050, !dbg !36 + %4053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1517 = icmp eq i32 %4053, 0, !dbg !36 + %4054 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4055 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1518 = select i1 %.not.i1517, float %4055, float %4054, !dbg !36 + %4056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1519 = icmp eq i32 %4056, 0, !dbg !36 + %4057 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1518) #6, !dbg !36 + %4058 = tail call float @llvm.nvvm.saturate.f(float %.02.i1518) #6, !dbg !36 + %.03.i1520 = select i1 %.not1.i1519, float %4058, float %4057, !dbg !36 + %4059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1521 = icmp eq i32 %4059, 0, !dbg !36 + %4060 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4061 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1522 = select i1 %.not2.i1521, float %4061, float %4060, !dbg !36 + %4062 = fadd float %.04.i1522, 0xC168000FE0000000, !dbg !36 + %4063 = fneg float %4062, !dbg !36 + %4064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1523 = icmp eq i32 %4064, 0, !dbg !36 + %4065 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3FF7154760000000, float %4063) #6, !dbg !36 + %4066 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3FF7154760000000, float %4063) #6, !dbg !36 + %.0.i1524 = select i1 %.not3.i1523, float %4066, float %4065, !dbg !36 + %4067 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1525 = icmp eq i32 %4067, 0, !dbg !36 + %4068 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3E54AE0C00000000, float %.0.i1524) #6, !dbg !36 + %4069 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3E54AE0C00000000, float %.0.i1524) #6, !dbg !36 + %.01.i1526 = select i1 %.not4.i1525, float %4069, float %4068, !dbg !36 + %4070 = bitcast float %.04.i1522 to i32, !dbg !36 + %4071 = shl i32 %4070, 23, !dbg !36 + %4072 = bitcast i32 %4071 to float, !dbg !36 + %4073 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1526) #6, !dbg !36 + %4074 = fmul float %4073, %4072, !dbg !36 + %4075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1527 = icmp eq i32 %4075, 0, !dbg !36 + %4076 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4077 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1528 = select i1 %.not.i1527, float %4077, float %4076, !dbg !36 + %4078 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1529 = icmp eq i32 %4078, 0, !dbg !36 + %4079 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1528) #6, !dbg !36 + %4080 = tail call float @llvm.nvvm.saturate.f(float %.02.i1528) #6, !dbg !36 + %.03.i1530 = select i1 %.not1.i1529, float %4080, float %4079, !dbg !36 + %4081 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1531 = icmp eq i32 %4081, 0, !dbg !36 + %4082 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4083 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1532 = select i1 %.not2.i1531, float %4083, float %4082, !dbg !36 + %4084 = fadd float %.04.i1532, 0xC168000FE0000000, !dbg !36 + %4085 = fneg float %4084, !dbg !36 + %4086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1533 = icmp eq i32 %4086, 0, !dbg !36 + %4087 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3FF7154760000000, float %4085) #6, !dbg !36 + %4088 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3FF7154760000000, float %4085) #6, !dbg !36 + %.0.i1534 = select i1 %.not3.i1533, float %4088, float %4087, !dbg !36 + %4089 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1535 = icmp eq i32 %4089, 0, !dbg !36 + %4090 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3E54AE0C00000000, float %.0.i1534) #6, !dbg !36 + %4091 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3E54AE0C00000000, float %.0.i1534) #6, !dbg !36 + %.01.i1536 = select i1 %.not4.i1535, float %4091, float %4090, !dbg !36 + %4092 = bitcast float %.04.i1532 to i32, !dbg !36 + %4093 = shl i32 %4092, 23, !dbg !36 + %4094 = bitcast i32 %4093 to float, !dbg !36 + %4095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1536) #6, !dbg !36 + %4096 = fmul float %4095, %4094, !dbg !36 + %4097 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1537 = icmp eq i32 %4097, 0, !dbg !36 + %4098 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4099 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1538 = select i1 %.not.i1537, float %4099, float %4098, !dbg !36 + %4100 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1539 = icmp eq i32 %4100, 0, !dbg !36 + %4101 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1538) #6, !dbg !36 + %4102 = tail call float @llvm.nvvm.saturate.f(float %.02.i1538) #6, !dbg !36 + %.03.i1540 = select i1 %.not1.i1539, float %4102, float %4101, !dbg !36 + %4103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1541 = icmp eq i32 %4103, 0, !dbg !36 + %4104 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4105 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1542 = select i1 %.not2.i1541, float %4105, float %4104, !dbg !36 + %4106 = fadd float %.04.i1542, 0xC168000FE0000000, !dbg !36 + %4107 = fneg float %4106, !dbg !36 + %4108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1543 = icmp eq i32 %4108, 0, !dbg !36 + %4109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3FF7154760000000, float %4107) #6, !dbg !36 + %4110 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3FF7154760000000, float %4107) #6, !dbg !36 + %.0.i1544 = select i1 %.not3.i1543, float %4110, float %4109, !dbg !36 + %4111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1545 = icmp eq i32 %4111, 0, !dbg !36 + %4112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3E54AE0C00000000, float %.0.i1544) #6, !dbg !36 + %4113 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3E54AE0C00000000, float %.0.i1544) #6, !dbg !36 + %.01.i1546 = select i1 %.not4.i1545, float %4113, float %4112, !dbg !36 + %4114 = bitcast float %.04.i1542 to i32, !dbg !36 + %4115 = shl i32 %4114, 23, !dbg !36 + %4116 = bitcast i32 %4115 to float, !dbg !36 + %4117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1546) #6, !dbg !36 + %4118 = fmul float %4117, %4116, !dbg !36 + %4119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1547 = icmp eq i32 %4119, 0, !dbg !36 + %4120 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4121 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1548 = select i1 %.not.i1547, float %4121, float %4120, !dbg !36 + %4122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1549 = icmp eq i32 %4122, 0, !dbg !36 + %4123 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1548) #6, !dbg !36 + %4124 = tail call float @llvm.nvvm.saturate.f(float %.02.i1548) #6, !dbg !36 + %.03.i1550 = select i1 %.not1.i1549, float %4124, float %4123, !dbg !36 + %4125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1551 = icmp eq i32 %4125, 0, !dbg !36 + %4126 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4127 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1552 = select i1 %.not2.i1551, float %4127, float %4126, !dbg !36 + %4128 = fadd float %.04.i1552, 0xC168000FE0000000, !dbg !36 + %4129 = fneg float %4128, !dbg !36 + %4130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1553 = icmp eq i32 %4130, 0, !dbg !36 + %4131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3FF7154760000000, float %4129) #6, !dbg !36 + %4132 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3FF7154760000000, float %4129) #6, !dbg !36 + %.0.i1554 = select i1 %.not3.i1553, float %4132, float %4131, !dbg !36 + %4133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1555 = icmp eq i32 %4133, 0, !dbg !36 + %4134 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3E54AE0C00000000, float %.0.i1554) #6, !dbg !36 + %4135 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3E54AE0C00000000, float %.0.i1554) #6, !dbg !36 + %.01.i1556 = select i1 %.not4.i1555, float %4135, float %4134, !dbg !36 + %4136 = bitcast float %.04.i1552 to i32, !dbg !36 + %4137 = shl i32 %4136, 23, !dbg !36 + %4138 = bitcast i32 %4137 to float, !dbg !36 + %4139 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1556) #6, !dbg !36 + %4140 = fmul float %4139, %4138, !dbg !36 + %4141 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1557 = icmp eq i32 %4141, 0, !dbg !36 + %4142 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4143 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1558 = select i1 %.not.i1557, float %4143, float %4142, !dbg !36 + %4144 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1559 = icmp eq i32 %4144, 0, !dbg !36 + %4145 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1558) #6, !dbg !36 + %4146 = tail call float @llvm.nvvm.saturate.f(float %.02.i1558) #6, !dbg !36 + %.03.i1560 = select i1 %.not1.i1559, float %4146, float %4145, !dbg !36 + %4147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1561 = icmp eq i32 %4147, 0, !dbg !36 + %4148 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4149 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1562 = select i1 %.not2.i1561, float %4149, float %4148, !dbg !36 + %4150 = fadd float %.04.i1562, 0xC168000FE0000000, !dbg !36 + %4151 = fneg float %4150, !dbg !36 + %4152 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1563 = icmp eq i32 %4152, 0, !dbg !36 + %4153 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3FF7154760000000, float %4151) #6, !dbg !36 + %4154 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3FF7154760000000, float %4151) #6, !dbg !36 + %.0.i1564 = select i1 %.not3.i1563, float %4154, float %4153, !dbg !36 + %4155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1565 = icmp eq i32 %4155, 0, !dbg !36 + %4156 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3E54AE0C00000000, float %.0.i1564) #6, !dbg !36 + %4157 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3E54AE0C00000000, float %.0.i1564) #6, !dbg !36 + %.01.i1566 = select i1 %.not4.i1565, float %4157, float %4156, !dbg !36 + %4158 = bitcast float %.04.i1562 to i32, !dbg !36 + %4159 = shl i32 %4158, 23, !dbg !36 + %4160 = bitcast i32 %4159 to float, !dbg !36 + %4161 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1566) #6, !dbg !36 + %4162 = fmul float %4161, %4160, !dbg !36 + %4163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1567 = icmp eq i32 %4163, 0, !dbg !36 + %4164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4165 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1568 = select i1 %.not.i1567, float %4165, float %4164, !dbg !36 + %4166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1569 = icmp eq i32 %4166, 0, !dbg !36 + %4167 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1568) #6, !dbg !36 + %4168 = tail call float @llvm.nvvm.saturate.f(float %.02.i1568) #6, !dbg !36 + %.03.i1570 = select i1 %.not1.i1569, float %4168, float %4167, !dbg !36 + %4169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1571 = icmp eq i32 %4169, 0, !dbg !36 + %4170 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4171 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1572 = select i1 %.not2.i1571, float %4171, float %4170, !dbg !36 + %4172 = fadd float %.04.i1572, 0xC168000FE0000000, !dbg !36 + %4173 = fneg float %4172, !dbg !36 + %4174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1573 = icmp eq i32 %4174, 0, !dbg !36 + %4175 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3FF7154760000000, float %4173) #6, !dbg !36 + %4176 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3FF7154760000000, float %4173) #6, !dbg !36 + %.0.i1574 = select i1 %.not3.i1573, float %4176, float %4175, !dbg !36 + %4177 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1575 = icmp eq i32 %4177, 0, !dbg !36 + %4178 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3E54AE0C00000000, float %.0.i1574) #6, !dbg !36 + %4179 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3E54AE0C00000000, float %.0.i1574) #6, !dbg !36 + %.01.i1576 = select i1 %.not4.i1575, float %4179, float %4178, !dbg !36 + %4180 = bitcast float %.04.i1572 to i32, !dbg !36 + %4181 = shl i32 %4180, 23, !dbg !36 + %4182 = bitcast i32 %4181 to float, !dbg !36 + %4183 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1576) #6, !dbg !36 + %4184 = fmul float %4183, %4182, !dbg !36 + %4185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1577 = icmp eq i32 %4185, 0, !dbg !36 + %4186 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4187 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1578 = select i1 %.not.i1577, float %4187, float %4186, !dbg !36 + %4188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1579 = icmp eq i32 %4188, 0, !dbg !36 + %4189 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1578) #6, !dbg !36 + %4190 = tail call float @llvm.nvvm.saturate.f(float %.02.i1578) #6, !dbg !36 + %.03.i1580 = select i1 %.not1.i1579, float %4190, float %4189, !dbg !36 + %4191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1581 = icmp eq i32 %4191, 0, !dbg !36 + %4192 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4193 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1582 = select i1 %.not2.i1581, float %4193, float %4192, !dbg !36 + %4194 = fadd float %.04.i1582, 0xC168000FE0000000, !dbg !36 + %4195 = fneg float %4194, !dbg !36 + %4196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1583 = icmp eq i32 %4196, 0, !dbg !36 + %4197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3FF7154760000000, float %4195) #6, !dbg !36 + %4198 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3FF7154760000000, float %4195) #6, !dbg !36 + %.0.i1584 = select i1 %.not3.i1583, float %4198, float %4197, !dbg !36 + %4199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1585 = icmp eq i32 %4199, 0, !dbg !36 + %4200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3E54AE0C00000000, float %.0.i1584) #6, !dbg !36 + %4201 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3E54AE0C00000000, float %.0.i1584) #6, !dbg !36 + %.01.i1586 = select i1 %.not4.i1585, float %4201, float %4200, !dbg !36 + %4202 = bitcast float %.04.i1582 to i32, !dbg !36 + %4203 = shl i32 %4202, 23, !dbg !36 + %4204 = bitcast i32 %4203 to float, !dbg !36 + %4205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1586) #6, !dbg !36 + %4206 = fmul float %4205, %4204, !dbg !36 + %4207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1587 = icmp eq i32 %4207, 0, !dbg !36 + %4208 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4209 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1588 = select i1 %.not.i1587, float %4209, float %4208, !dbg !36 + %4210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1589 = icmp eq i32 %4210, 0, !dbg !36 + %4211 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1588) #6, !dbg !36 + %4212 = tail call float @llvm.nvvm.saturate.f(float %.02.i1588) #6, !dbg !36 + %.03.i1590 = select i1 %.not1.i1589, float %4212, float %4211, !dbg !36 + %4213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1591 = icmp eq i32 %4213, 0, !dbg !36 + %4214 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4215 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1592 = select i1 %.not2.i1591, float %4215, float %4214, !dbg !36 + %4216 = fadd float %.04.i1592, 0xC168000FE0000000, !dbg !36 + %4217 = fneg float %4216, !dbg !36 + %4218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1593 = icmp eq i32 %4218, 0, !dbg !36 + %4219 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3FF7154760000000, float %4217) #6, !dbg !36 + %4220 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3FF7154760000000, float %4217) #6, !dbg !36 + %.0.i1594 = select i1 %.not3.i1593, float %4220, float %4219, !dbg !36 + %4221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1595 = icmp eq i32 %4221, 0, !dbg !36 + %4222 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3E54AE0C00000000, float %.0.i1594) #6, !dbg !36 + %4223 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3E54AE0C00000000, float %.0.i1594) #6, !dbg !36 + %.01.i1596 = select i1 %.not4.i1595, float %4223, float %4222, !dbg !36 + %4224 = bitcast float %.04.i1592 to i32, !dbg !36 + %4225 = shl i32 %4224, 23, !dbg !36 + %4226 = bitcast i32 %4225 to float, !dbg !36 + %4227 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1596) #6, !dbg !36 + %4228 = fmul float %4227, %4226, !dbg !36 + %4229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1597 = icmp eq i32 %4229, 0, !dbg !36 + %4230 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4231 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1598 = select i1 %.not.i1597, float %4231, float %4230, !dbg !36 + %4232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1599 = icmp eq i32 %4232, 0, !dbg !36 + %4233 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1598) #6, !dbg !36 + %4234 = tail call float @llvm.nvvm.saturate.f(float %.02.i1598) #6, !dbg !36 + %.03.i1600 = select i1 %.not1.i1599, float %4234, float %4233, !dbg !36 + %4235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1601 = icmp eq i32 %4235, 0, !dbg !36 + %4236 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4237 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1602 = select i1 %.not2.i1601, float %4237, float %4236, !dbg !36 + %4238 = fadd float %.04.i1602, 0xC168000FE0000000, !dbg !36 + %4239 = fneg float %4238, !dbg !36 + %4240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1603 = icmp eq i32 %4240, 0, !dbg !36 + %4241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3FF7154760000000, float %4239) #6, !dbg !36 + %4242 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3FF7154760000000, float %4239) #6, !dbg !36 + %.0.i1604 = select i1 %.not3.i1603, float %4242, float %4241, !dbg !36 + %4243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1605 = icmp eq i32 %4243, 0, !dbg !36 + %4244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3E54AE0C00000000, float %.0.i1604) #6, !dbg !36 + %4245 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3E54AE0C00000000, float %.0.i1604) #6, !dbg !36 + %.01.i1606 = select i1 %.not4.i1605, float %4245, float %4244, !dbg !36 + %4246 = bitcast float %.04.i1602 to i32, !dbg !36 + %4247 = shl i32 %4246, 23, !dbg !36 + %4248 = bitcast i32 %4247 to float, !dbg !36 + %4249 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1606) #6, !dbg !36 + %4250 = fmul float %4249, %4248, !dbg !36 + %4251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1607 = icmp eq i32 %4251, 0, !dbg !36 + %4252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4253 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1608 = select i1 %.not.i1607, float %4253, float %4252, !dbg !36 + %4254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1609 = icmp eq i32 %4254, 0, !dbg !36 + %4255 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1608) #6, !dbg !36 + %4256 = tail call float @llvm.nvvm.saturate.f(float %.02.i1608) #6, !dbg !36 + %.03.i1610 = select i1 %.not1.i1609, float %4256, float %4255, !dbg !36 + %4257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1611 = icmp eq i32 %4257, 0, !dbg !36 + %4258 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4259 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1612 = select i1 %.not2.i1611, float %4259, float %4258, !dbg !36 + %4260 = fadd float %.04.i1612, 0xC168000FE0000000, !dbg !36 + %4261 = fneg float %4260, !dbg !36 + %4262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1613 = icmp eq i32 %4262, 0, !dbg !36 + %4263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3FF7154760000000, float %4261) #6, !dbg !36 + %4264 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3FF7154760000000, float %4261) #6, !dbg !36 + %.0.i1614 = select i1 %.not3.i1613, float %4264, float %4263, !dbg !36 + %4265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1615 = icmp eq i32 %4265, 0, !dbg !36 + %4266 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3E54AE0C00000000, float %.0.i1614) #6, !dbg !36 + %4267 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3E54AE0C00000000, float %.0.i1614) #6, !dbg !36 + %.01.i1616 = select i1 %.not4.i1615, float %4267, float %4266, !dbg !36 + %4268 = bitcast float %.04.i1612 to i32, !dbg !36 + %4269 = shl i32 %4268, 23, !dbg !36 + %4270 = bitcast i32 %4269 to float, !dbg !36 + %4271 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1616) #6, !dbg !36 + %4272 = fmul float %4271, %4270, !dbg !36 + %4273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1617 = icmp eq i32 %4273, 0, !dbg !36 + %4274 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4275 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1618 = select i1 %.not.i1617, float %4275, float %4274, !dbg !36 + %4276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1619 = icmp eq i32 %4276, 0, !dbg !36 + %4277 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1618) #6, !dbg !36 + %4278 = tail call float @llvm.nvvm.saturate.f(float %.02.i1618) #6, !dbg !36 + %.03.i1620 = select i1 %.not1.i1619, float %4278, float %4277, !dbg !36 + %4279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1621 = icmp eq i32 %4279, 0, !dbg !36 + %4280 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4281 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1622 = select i1 %.not2.i1621, float %4281, float %4280, !dbg !36 + %4282 = fadd float %.04.i1622, 0xC168000FE0000000, !dbg !36 + %4283 = fneg float %4282, !dbg !36 + %4284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1623 = icmp eq i32 %4284, 0, !dbg !36 + %4285 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3FF7154760000000, float %4283) #6, !dbg !36 + %4286 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3FF7154760000000, float %4283) #6, !dbg !36 + %.0.i1624 = select i1 %.not3.i1623, float %4286, float %4285, !dbg !36 + %4287 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1625 = icmp eq i32 %4287, 0, !dbg !36 + %4288 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3E54AE0C00000000, float %.0.i1624) #6, !dbg !36 + %4289 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3E54AE0C00000000, float %.0.i1624) #6, !dbg !36 + %.01.i1626 = select i1 %.not4.i1625, float %4289, float %4288, !dbg !36 + %4290 = bitcast float %.04.i1622 to i32, !dbg !36 + %4291 = shl i32 %4290, 23, !dbg !36 + %4292 = bitcast i32 %4291 to float, !dbg !36 + %4293 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1626) #6, !dbg !36 + %4294 = fmul float %4293, %4292, !dbg !36 + %4295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1627 = icmp eq i32 %4295, 0, !dbg !36 + %4296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4297 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1628 = select i1 %.not.i1627, float %4297, float %4296, !dbg !36 + %4298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1629 = icmp eq i32 %4298, 0, !dbg !36 + %4299 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1628) #6, !dbg !36 + %4300 = tail call float @llvm.nvvm.saturate.f(float %.02.i1628) #6, !dbg !36 + %.03.i1630 = select i1 %.not1.i1629, float %4300, float %4299, !dbg !36 + %4301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1631 = icmp eq i32 %4301, 0, !dbg !36 + %4302 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4303 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1632 = select i1 %.not2.i1631, float %4303, float %4302, !dbg !36 + %4304 = fadd float %.04.i1632, 0xC168000FE0000000, !dbg !36 + %4305 = fneg float %4304, !dbg !36 + %4306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1633 = icmp eq i32 %4306, 0, !dbg !36 + %4307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3FF7154760000000, float %4305) #6, !dbg !36 + %4308 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3FF7154760000000, float %4305) #6, !dbg !36 + %.0.i1634 = select i1 %.not3.i1633, float %4308, float %4307, !dbg !36 + %4309 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1635 = icmp eq i32 %4309, 0, !dbg !36 + %4310 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3E54AE0C00000000, float %.0.i1634) #6, !dbg !36 + %4311 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3E54AE0C00000000, float %.0.i1634) #6, !dbg !36 + %.01.i1636 = select i1 %.not4.i1635, float %4311, float %4310, !dbg !36 + %4312 = bitcast float %.04.i1632 to i32, !dbg !36 + %4313 = shl i32 %4312, 23, !dbg !36 + %4314 = bitcast i32 %4313 to float, !dbg !36 + %4315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1636) #6, !dbg !36 + %4316 = fmul float %4315, %4314, !dbg !36 + %4317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1637 = icmp eq i32 %4317, 0, !dbg !36 + %4318 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4319 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1638 = select i1 %.not.i1637, float %4319, float %4318, !dbg !36 + %4320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1639 = icmp eq i32 %4320, 0, !dbg !36 + %4321 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1638) #6, !dbg !36 + %4322 = tail call float @llvm.nvvm.saturate.f(float %.02.i1638) #6, !dbg !36 + %.03.i1640 = select i1 %.not1.i1639, float %4322, float %4321, !dbg !36 + %4323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1641 = icmp eq i32 %4323, 0, !dbg !36 + %4324 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4325 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1642 = select i1 %.not2.i1641, float %4325, float %4324, !dbg !36 + %4326 = fadd float %.04.i1642, 0xC168000FE0000000, !dbg !36 + %4327 = fneg float %4326, !dbg !36 + %4328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1643 = icmp eq i32 %4328, 0, !dbg !36 + %4329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3FF7154760000000, float %4327) #6, !dbg !36 + %4330 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3FF7154760000000, float %4327) #6, !dbg !36 + %.0.i1644 = select i1 %.not3.i1643, float %4330, float %4329, !dbg !36 + %4331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1645 = icmp eq i32 %4331, 0, !dbg !36 + %4332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3E54AE0C00000000, float %.0.i1644) #6, !dbg !36 + %4333 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3E54AE0C00000000, float %.0.i1644) #6, !dbg !36 + %.01.i1646 = select i1 %.not4.i1645, float %4333, float %4332, !dbg !36 + %4334 = bitcast float %.04.i1642 to i32, !dbg !36 + %4335 = shl i32 %4334, 23, !dbg !36 + %4336 = bitcast i32 %4335 to float, !dbg !36 + %4337 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1646) #6, !dbg !36 + %4338 = fmul float %4337, %4336, !dbg !36 + %4339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1647 = icmp eq i32 %4339, 0, !dbg !36 + %4340 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4341 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1648 = select i1 %.not.i1647, float %4341, float %4340, !dbg !36 + %4342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1649 = icmp eq i32 %4342, 0, !dbg !36 + %4343 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1648) #6, !dbg !36 + %4344 = tail call float @llvm.nvvm.saturate.f(float %.02.i1648) #6, !dbg !36 + %.03.i1650 = select i1 %.not1.i1649, float %4344, float %4343, !dbg !36 + %4345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1651 = icmp eq i32 %4345, 0, !dbg !36 + %4346 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4347 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1652 = select i1 %.not2.i1651, float %4347, float %4346, !dbg !36 + %4348 = fadd float %.04.i1652, 0xC168000FE0000000, !dbg !36 + %4349 = fneg float %4348, !dbg !36 + %4350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1653 = icmp eq i32 %4350, 0, !dbg !36 + %4351 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3FF7154760000000, float %4349) #6, !dbg !36 + %4352 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3FF7154760000000, float %4349) #6, !dbg !36 + %.0.i1654 = select i1 %.not3.i1653, float %4352, float %4351, !dbg !36 + %4353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1655 = icmp eq i32 %4353, 0, !dbg !36 + %4354 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3E54AE0C00000000, float %.0.i1654) #6, !dbg !36 + %4355 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3E54AE0C00000000, float %.0.i1654) #6, !dbg !36 + %.01.i1656 = select i1 %.not4.i1655, float %4355, float %4354, !dbg !36 + %4356 = bitcast float %.04.i1652 to i32, !dbg !36 + %4357 = shl i32 %4356, 23, !dbg !36 + %4358 = bitcast i32 %4357 to float, !dbg !36 + %4359 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1656) #6, !dbg !36 + %4360 = fmul float %4359, %4358, !dbg !36 + %4361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1657 = icmp eq i32 %4361, 0, !dbg !36 + %4362 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4363 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1658 = select i1 %.not.i1657, float %4363, float %4362, !dbg !36 + %4364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1659 = icmp eq i32 %4364, 0, !dbg !36 + %4365 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1658) #6, !dbg !36 + %4366 = tail call float @llvm.nvvm.saturate.f(float %.02.i1658) #6, !dbg !36 + %.03.i1660 = select i1 %.not1.i1659, float %4366, float %4365, !dbg !36 + %4367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1661 = icmp eq i32 %4367, 0, !dbg !36 + %4368 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4369 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1662 = select i1 %.not2.i1661, float %4369, float %4368, !dbg !36 + %4370 = fadd float %.04.i1662, 0xC168000FE0000000, !dbg !36 + %4371 = fneg float %4370, !dbg !36 + %4372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1663 = icmp eq i32 %4372, 0, !dbg !36 + %4373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3FF7154760000000, float %4371) #6, !dbg !36 + %4374 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3FF7154760000000, float %4371) #6, !dbg !36 + %.0.i1664 = select i1 %.not3.i1663, float %4374, float %4373, !dbg !36 + %4375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1665 = icmp eq i32 %4375, 0, !dbg !36 + %4376 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3E54AE0C00000000, float %.0.i1664) #6, !dbg !36 + %4377 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3E54AE0C00000000, float %.0.i1664) #6, !dbg !36 + %.01.i1666 = select i1 %.not4.i1665, float %4377, float %4376, !dbg !36 + %4378 = bitcast float %.04.i1662 to i32, !dbg !36 + %4379 = shl i32 %4378, 23, !dbg !36 + %4380 = bitcast i32 %4379 to float, !dbg !36 + %4381 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1666) #6, !dbg !36 + %4382 = fmul float %4381, %4380, !dbg !36 + %4383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1667 = icmp eq i32 %4383, 0, !dbg !36 + %4384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4385 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1668 = select i1 %.not.i1667, float %4385, float %4384, !dbg !36 + %4386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1669 = icmp eq i32 %4386, 0, !dbg !36 + %4387 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1668) #6, !dbg !36 + %4388 = tail call float @llvm.nvvm.saturate.f(float %.02.i1668) #6, !dbg !36 + %.03.i1670 = select i1 %.not1.i1669, float %4388, float %4387, !dbg !36 + %4389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1671 = icmp eq i32 %4389, 0, !dbg !36 + %4390 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4391 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1672 = select i1 %.not2.i1671, float %4391, float %4390, !dbg !36 + %4392 = fadd float %.04.i1672, 0xC168000FE0000000, !dbg !36 + %4393 = fneg float %4392, !dbg !36 + %4394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1673 = icmp eq i32 %4394, 0, !dbg !36 + %4395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3FF7154760000000, float %4393) #6, !dbg !36 + %4396 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3FF7154760000000, float %4393) #6, !dbg !36 + %.0.i1674 = select i1 %.not3.i1673, float %4396, float %4395, !dbg !36 + %4397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1675 = icmp eq i32 %4397, 0, !dbg !36 + %4398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3E54AE0C00000000, float %.0.i1674) #6, !dbg !36 + %4399 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3E54AE0C00000000, float %.0.i1674) #6, !dbg !36 + %.01.i1676 = select i1 %.not4.i1675, float %4399, float %4398, !dbg !36 + %4400 = bitcast float %.04.i1672 to i32, !dbg !36 + %4401 = shl i32 %4400, 23, !dbg !36 + %4402 = bitcast i32 %4401 to float, !dbg !36 + %4403 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1676) #6, !dbg !36 + %4404 = fmul float %4403, %4402, !dbg !36 + %4405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1677 = icmp eq i32 %4405, 0, !dbg !36 + %4406 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4407 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1678 = select i1 %.not.i1677, float %4407, float %4406, !dbg !36 + %4408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1679 = icmp eq i32 %4408, 0, !dbg !36 + %4409 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1678) #6, !dbg !36 + %4410 = tail call float @llvm.nvvm.saturate.f(float %.02.i1678) #6, !dbg !36 + %.03.i1680 = select i1 %.not1.i1679, float %4410, float %4409, !dbg !36 + %4411 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1681 = icmp eq i32 %4411, 0, !dbg !36 + %4412 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4413 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1682 = select i1 %.not2.i1681, float %4413, float %4412, !dbg !36 + %4414 = fadd float %.04.i1682, 0xC168000FE0000000, !dbg !36 + %4415 = fneg float %4414, !dbg !36 + %4416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1683 = icmp eq i32 %4416, 0, !dbg !36 + %4417 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3FF7154760000000, float %4415) #6, !dbg !36 + %4418 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3FF7154760000000, float %4415) #6, !dbg !36 + %.0.i1684 = select i1 %.not3.i1683, float %4418, float %4417, !dbg !36 + %4419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1685 = icmp eq i32 %4419, 0, !dbg !36 + %4420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3E54AE0C00000000, float %.0.i1684) #6, !dbg !36 + %4421 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3E54AE0C00000000, float %.0.i1684) #6, !dbg !36 + %.01.i1686 = select i1 %.not4.i1685, float %4421, float %4420, !dbg !36 + %4422 = bitcast float %.04.i1682 to i32, !dbg !36 + %4423 = shl i32 %4422, 23, !dbg !36 + %4424 = bitcast i32 %4423 to float, !dbg !36 + %4425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1686) #6, !dbg !36 + %4426 = fmul float %4425, %4424, !dbg !36 + %4427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1687 = icmp eq i32 %4427, 0, !dbg !36 + %4428 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4429 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1688 = select i1 %.not.i1687, float %4429, float %4428, !dbg !36 + %4430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1689 = icmp eq i32 %4430, 0, !dbg !36 + %4431 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1688) #6, !dbg !36 + %4432 = tail call float @llvm.nvvm.saturate.f(float %.02.i1688) #6, !dbg !36 + %.03.i1690 = select i1 %.not1.i1689, float %4432, float %4431, !dbg !36 + %4433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1691 = icmp eq i32 %4433, 0, !dbg !36 + %4434 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4435 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1692 = select i1 %.not2.i1691, float %4435, float %4434, !dbg !36 + %4436 = fadd float %.04.i1692, 0xC168000FE0000000, !dbg !36 + %4437 = fneg float %4436, !dbg !36 + %4438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1693 = icmp eq i32 %4438, 0, !dbg !36 + %4439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3FF7154760000000, float %4437) #6, !dbg !36 + %4440 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3FF7154760000000, float %4437) #6, !dbg !36 + %.0.i1694 = select i1 %.not3.i1693, float %4440, float %4439, !dbg !36 + %4441 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1695 = icmp eq i32 %4441, 0, !dbg !36 + %4442 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3E54AE0C00000000, float %.0.i1694) #6, !dbg !36 + %4443 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3E54AE0C00000000, float %.0.i1694) #6, !dbg !36 + %.01.i1696 = select i1 %.not4.i1695, float %4443, float %4442, !dbg !36 + %4444 = bitcast float %.04.i1692 to i32, !dbg !36 + %4445 = shl i32 %4444, 23, !dbg !36 + %4446 = bitcast i32 %4445 to float, !dbg !36 + %4447 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1696) #6, !dbg !36 + %4448 = fmul float %4447, %4446, !dbg !36 + %4449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1697 = icmp eq i32 %4449, 0, !dbg !36 + %4450 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4451 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1698 = select i1 %.not.i1697, float %4451, float %4450, !dbg !36 + %4452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1699 = icmp eq i32 %4452, 0, !dbg !36 + %4453 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1698) #6, !dbg !36 + %4454 = tail call float @llvm.nvvm.saturate.f(float %.02.i1698) #6, !dbg !36 + %.03.i1700 = select i1 %.not1.i1699, float %4454, float %4453, !dbg !36 + %4455 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1701 = icmp eq i32 %4455, 0, !dbg !36 + %4456 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4457 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1702 = select i1 %.not2.i1701, float %4457, float %4456, !dbg !36 + %4458 = fadd float %.04.i1702, 0xC168000FE0000000, !dbg !36 + %4459 = fneg float %4458, !dbg !36 + %4460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1703 = icmp eq i32 %4460, 0, !dbg !36 + %4461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3FF7154760000000, float %4459) #6, !dbg !36 + %4462 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3FF7154760000000, float %4459) #6, !dbg !36 + %.0.i1704 = select i1 %.not3.i1703, float %4462, float %4461, !dbg !36 + %4463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1705 = icmp eq i32 %4463, 0, !dbg !36 + %4464 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3E54AE0C00000000, float %.0.i1704) #6, !dbg !36 + %4465 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3E54AE0C00000000, float %.0.i1704) #6, !dbg !36 + %.01.i1706 = select i1 %.not4.i1705, float %4465, float %4464, !dbg !36 + %4466 = bitcast float %.04.i1702 to i32, !dbg !36 + %4467 = shl i32 %4466, 23, !dbg !36 + %4468 = bitcast i32 %4467 to float, !dbg !36 + %4469 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1706) #6, !dbg !36 + %4470 = fmul float %4469, %4468, !dbg !36 + %4471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1707 = icmp eq i32 %4471, 0, !dbg !36 + %4472 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4473 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1708 = select i1 %.not.i1707, float %4473, float %4472, !dbg !36 + %4474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1709 = icmp eq i32 %4474, 0, !dbg !36 + %4475 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1708) #6, !dbg !36 + %4476 = tail call float @llvm.nvvm.saturate.f(float %.02.i1708) #6, !dbg !36 + %.03.i1710 = select i1 %.not1.i1709, float %4476, float %4475, !dbg !36 + %4477 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1711 = icmp eq i32 %4477, 0, !dbg !36 + %4478 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4479 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1712 = select i1 %.not2.i1711, float %4479, float %4478, !dbg !36 + %4480 = fadd float %.04.i1712, 0xC168000FE0000000, !dbg !36 + %4481 = fneg float %4480, !dbg !36 + %4482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1713 = icmp eq i32 %4482, 0, !dbg !36 + %4483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3FF7154760000000, float %4481) #6, !dbg !36 + %4484 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3FF7154760000000, float %4481) #6, !dbg !36 + %.0.i1714 = select i1 %.not3.i1713, float %4484, float %4483, !dbg !36 + %4485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1715 = icmp eq i32 %4485, 0, !dbg !36 + %4486 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3E54AE0C00000000, float %.0.i1714) #6, !dbg !36 + %4487 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3E54AE0C00000000, float %.0.i1714) #6, !dbg !36 + %.01.i1716 = select i1 %.not4.i1715, float %4487, float %4486, !dbg !36 + %4488 = bitcast float %.04.i1712 to i32, !dbg !36 + %4489 = shl i32 %4488, 23, !dbg !36 + %4490 = bitcast i32 %4489 to float, !dbg !36 + %4491 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1716) #6, !dbg !36 + %4492 = fmul float %4491, %4490, !dbg !36 + %4493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1717 = icmp eq i32 %4493, 0, !dbg !36 + %4494 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4495 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1718 = select i1 %.not.i1717, float %4495, float %4494, !dbg !36 + %4496 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1719 = icmp eq i32 %4496, 0, !dbg !36 + %4497 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1718) #6, !dbg !36 + %4498 = tail call float @llvm.nvvm.saturate.f(float %.02.i1718) #6, !dbg !36 + %.03.i1720 = select i1 %.not1.i1719, float %4498, float %4497, !dbg !36 + %4499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1721 = icmp eq i32 %4499, 0, !dbg !36 + %4500 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4501 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1722 = select i1 %.not2.i1721, float %4501, float %4500, !dbg !36 + %4502 = fadd float %.04.i1722, 0xC168000FE0000000, !dbg !36 + %4503 = fneg float %4502, !dbg !36 + %4504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1723 = icmp eq i32 %4504, 0, !dbg !36 + %4505 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3FF7154760000000, float %4503) #6, !dbg !36 + %4506 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3FF7154760000000, float %4503) #6, !dbg !36 + %.0.i1724 = select i1 %.not3.i1723, float %4506, float %4505, !dbg !36 + %4507 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1725 = icmp eq i32 %4507, 0, !dbg !36 + %4508 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3E54AE0C00000000, float %.0.i1724) #6, !dbg !36 + %4509 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3E54AE0C00000000, float %.0.i1724) #6, !dbg !36 + %.01.i1726 = select i1 %.not4.i1725, float %4509, float %4508, !dbg !36 + %4510 = bitcast float %.04.i1722 to i32, !dbg !36 + %4511 = shl i32 %4510, 23, !dbg !36 + %4512 = bitcast i32 %4511 to float, !dbg !36 + %4513 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1726) #6, !dbg !36 + %4514 = fmul float %4513, %4512, !dbg !36 + %4515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1727 = icmp eq i32 %4515, 0, !dbg !36 + %4516 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4517 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1728 = select i1 %.not.i1727, float %4517, float %4516, !dbg !36 + %4518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1729 = icmp eq i32 %4518, 0, !dbg !36 + %4519 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1728) #6, !dbg !36 + %4520 = tail call float @llvm.nvvm.saturate.f(float %.02.i1728) #6, !dbg !36 + %.03.i1730 = select i1 %.not1.i1729, float %4520, float %4519, !dbg !36 + %4521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1731 = icmp eq i32 %4521, 0, !dbg !36 + %4522 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4523 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1732 = select i1 %.not2.i1731, float %4523, float %4522, !dbg !36 + %4524 = fadd float %.04.i1732, 0xC168000FE0000000, !dbg !36 + %4525 = fneg float %4524, !dbg !36 + %4526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1733 = icmp eq i32 %4526, 0, !dbg !36 + %4527 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3FF7154760000000, float %4525) #6, !dbg !36 + %4528 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3FF7154760000000, float %4525) #6, !dbg !36 + %.0.i1734 = select i1 %.not3.i1733, float %4528, float %4527, !dbg !36 + %4529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1735 = icmp eq i32 %4529, 0, !dbg !36 + %4530 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3E54AE0C00000000, float %.0.i1734) #6, !dbg !36 + %4531 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3E54AE0C00000000, float %.0.i1734) #6, !dbg !36 + %.01.i1736 = select i1 %.not4.i1735, float %4531, float %4530, !dbg !36 + %4532 = bitcast float %.04.i1732 to i32, !dbg !36 + %4533 = shl i32 %4532, 23, !dbg !36 + %4534 = bitcast i32 %4533 to float, !dbg !36 + %4535 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1736) #6, !dbg !36 + %4536 = fmul float %4535, %4534, !dbg !36 + %4537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1737 = icmp eq i32 %4537, 0, !dbg !36 + %4538 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4539 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1738 = select i1 %.not.i1737, float %4539, float %4538, !dbg !36 + %4540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1739 = icmp eq i32 %4540, 0, !dbg !36 + %4541 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1738) #6, !dbg !36 + %4542 = tail call float @llvm.nvvm.saturate.f(float %.02.i1738) #6, !dbg !36 + %.03.i1740 = select i1 %.not1.i1739, float %4542, float %4541, !dbg !36 + %4543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1741 = icmp eq i32 %4543, 0, !dbg !36 + %4544 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4545 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1742 = select i1 %.not2.i1741, float %4545, float %4544, !dbg !36 + %4546 = fadd float %.04.i1742, 0xC168000FE0000000, !dbg !36 + %4547 = fneg float %4546, !dbg !36 + %4548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1743 = icmp eq i32 %4548, 0, !dbg !36 + %4549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3FF7154760000000, float %4547) #6, !dbg !36 + %4550 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3FF7154760000000, float %4547) #6, !dbg !36 + %.0.i1744 = select i1 %.not3.i1743, float %4550, float %4549, !dbg !36 + %4551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1745 = icmp eq i32 %4551, 0, !dbg !36 + %4552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3E54AE0C00000000, float %.0.i1744) #6, !dbg !36 + %4553 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3E54AE0C00000000, float %.0.i1744) #6, !dbg !36 + %.01.i1746 = select i1 %.not4.i1745, float %4553, float %4552, !dbg !36 + %4554 = bitcast float %.04.i1742 to i32, !dbg !36 + %4555 = shl i32 %4554, 23, !dbg !36 + %4556 = bitcast i32 %4555 to float, !dbg !36 + %4557 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1746) #6, !dbg !36 + %4558 = fmul float %4557, %4556, !dbg !36 + %4559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1747 = icmp eq i32 %4559, 0, !dbg !36 + %4560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4561 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1748 = select i1 %.not.i1747, float %4561, float %4560, !dbg !36 + %4562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1749 = icmp eq i32 %4562, 0, !dbg !36 + %4563 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1748) #6, !dbg !36 + %4564 = tail call float @llvm.nvvm.saturate.f(float %.02.i1748) #6, !dbg !36 + %.03.i1750 = select i1 %.not1.i1749, float %4564, float %4563, !dbg !36 + %4565 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1751 = icmp eq i32 %4565, 0, !dbg !36 + %4566 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4567 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1752 = select i1 %.not2.i1751, float %4567, float %4566, !dbg !36 + %4568 = fadd float %.04.i1752, 0xC168000FE0000000, !dbg !36 + %4569 = fneg float %4568, !dbg !36 + %4570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1753 = icmp eq i32 %4570, 0, !dbg !36 + %4571 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3FF7154760000000, float %4569) #6, !dbg !36 + %4572 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3FF7154760000000, float %4569) #6, !dbg !36 + %.0.i1754 = select i1 %.not3.i1753, float %4572, float %4571, !dbg !36 + %4573 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1755 = icmp eq i32 %4573, 0, !dbg !36 + %4574 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3E54AE0C00000000, float %.0.i1754) #6, !dbg !36 + %4575 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3E54AE0C00000000, float %.0.i1754) #6, !dbg !36 + %.01.i1756 = select i1 %.not4.i1755, float %4575, float %4574, !dbg !36 + %4576 = bitcast float %.04.i1752 to i32, !dbg !36 + %4577 = shl i32 %4576, 23, !dbg !36 + %4578 = bitcast i32 %4577 to float, !dbg !36 + %4579 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1756) #6, !dbg !36 + %4580 = fmul float %4579, %4578, !dbg !36 + %4581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1757 = icmp eq i32 %4581, 0, !dbg !36 + %4582 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4583 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1758 = select i1 %.not.i1757, float %4583, float %4582, !dbg !36 + %4584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1759 = icmp eq i32 %4584, 0, !dbg !36 + %4585 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1758) #6, !dbg !36 + %4586 = tail call float @llvm.nvvm.saturate.f(float %.02.i1758) #6, !dbg !36 + %.03.i1760 = select i1 %.not1.i1759, float %4586, float %4585, !dbg !36 + %4587 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1761 = icmp eq i32 %4587, 0, !dbg !36 + %4588 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4589 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1762 = select i1 %.not2.i1761, float %4589, float %4588, !dbg !36 + %4590 = fadd float %.04.i1762, 0xC168000FE0000000, !dbg !36 + %4591 = fneg float %4590, !dbg !36 + %4592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1763 = icmp eq i32 %4592, 0, !dbg !36 + %4593 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3FF7154760000000, float %4591) #6, !dbg !36 + %4594 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3FF7154760000000, float %4591) #6, !dbg !36 + %.0.i1764 = select i1 %.not3.i1763, float %4594, float %4593, !dbg !36 + %4595 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1765 = icmp eq i32 %4595, 0, !dbg !36 + %4596 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3E54AE0C00000000, float %.0.i1764) #6, !dbg !36 + %4597 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3E54AE0C00000000, float %.0.i1764) #6, !dbg !36 + %.01.i1766 = select i1 %.not4.i1765, float %4597, float %4596, !dbg !36 + %4598 = bitcast float %.04.i1762 to i32, !dbg !36 + %4599 = shl i32 %4598, 23, !dbg !36 + %4600 = bitcast i32 %4599 to float, !dbg !36 + %4601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1766) #6, !dbg !36 + %4602 = fmul float %4601, %4600, !dbg !36 + %4603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1767 = icmp eq i32 %4603, 0, !dbg !36 + %4604 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4605 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1768 = select i1 %.not.i1767, float %4605, float %4604, !dbg !36 + %4606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1769 = icmp eq i32 %4606, 0, !dbg !36 + %4607 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1768) #6, !dbg !36 + %4608 = tail call float @llvm.nvvm.saturate.f(float %.02.i1768) #6, !dbg !36 + %.03.i1770 = select i1 %.not1.i1769, float %4608, float %4607, !dbg !36 + %4609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1771 = icmp eq i32 %4609, 0, !dbg !36 + %4610 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4611 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1772 = select i1 %.not2.i1771, float %4611, float %4610, !dbg !36 + %4612 = fadd float %.04.i1772, 0xC168000FE0000000, !dbg !36 + %4613 = fneg float %4612, !dbg !36 + %4614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1773 = icmp eq i32 %4614, 0, !dbg !36 + %4615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3FF7154760000000, float %4613) #6, !dbg !36 + %4616 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3FF7154760000000, float %4613) #6, !dbg !36 + %.0.i1774 = select i1 %.not3.i1773, float %4616, float %4615, !dbg !36 + %4617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1775 = icmp eq i32 %4617, 0, !dbg !36 + %4618 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3E54AE0C00000000, float %.0.i1774) #6, !dbg !36 + %4619 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3E54AE0C00000000, float %.0.i1774) #6, !dbg !36 + %.01.i1776 = select i1 %.not4.i1775, float %4619, float %4618, !dbg !36 + %4620 = bitcast float %.04.i1772 to i32, !dbg !36 + %4621 = shl i32 %4620, 23, !dbg !36 + %4622 = bitcast i32 %4621 to float, !dbg !36 + %4623 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1776) #6, !dbg !36 + %4624 = fmul float %4623, %4622, !dbg !36 + %4625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1777 = icmp eq i32 %4625, 0, !dbg !36 + %4626 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4627 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1778 = select i1 %.not.i1777, float %4627, float %4626, !dbg !36 + %4628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1779 = icmp eq i32 %4628, 0, !dbg !36 + %4629 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1778) #6, !dbg !36 + %4630 = tail call float @llvm.nvvm.saturate.f(float %.02.i1778) #6, !dbg !36 + %.03.i1780 = select i1 %.not1.i1779, float %4630, float %4629, !dbg !36 + %4631 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1781 = icmp eq i32 %4631, 0, !dbg !36 + %4632 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4633 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1782 = select i1 %.not2.i1781, float %4633, float %4632, !dbg !36 + %4634 = fadd float %.04.i1782, 0xC168000FE0000000, !dbg !36 + %4635 = fneg float %4634, !dbg !36 + %4636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1783 = icmp eq i32 %4636, 0, !dbg !36 + %4637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3FF7154760000000, float %4635) #6, !dbg !36 + %4638 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3FF7154760000000, float %4635) #6, !dbg !36 + %.0.i1784 = select i1 %.not3.i1783, float %4638, float %4637, !dbg !36 + %4639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1785 = icmp eq i32 %4639, 0, !dbg !36 + %4640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3E54AE0C00000000, float %.0.i1784) #6, !dbg !36 + %4641 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3E54AE0C00000000, float %.0.i1784) #6, !dbg !36 + %.01.i1786 = select i1 %.not4.i1785, float %4641, float %4640, !dbg !36 + %4642 = bitcast float %.04.i1782 to i32, !dbg !36 + %4643 = shl i32 %4642, 23, !dbg !36 + %4644 = bitcast i32 %4643 to float, !dbg !36 + %4645 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1786) #6, !dbg !36 + %4646 = fmul float %4645, %4644, !dbg !36 + %4647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1787 = icmp eq i32 %4647, 0, !dbg !36 + %4648 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4649 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1788 = select i1 %.not.i1787, float %4649, float %4648, !dbg !36 + %4650 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1789 = icmp eq i32 %4650, 0, !dbg !36 + %4651 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1788) #6, !dbg !36 + %4652 = tail call float @llvm.nvvm.saturate.f(float %.02.i1788) #6, !dbg !36 + %.03.i1790 = select i1 %.not1.i1789, float %4652, float %4651, !dbg !36 + %4653 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1791 = icmp eq i32 %4653, 0, !dbg !36 + %4654 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4655 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1792 = select i1 %.not2.i1791, float %4655, float %4654, !dbg !36 + %4656 = fadd float %.04.i1792, 0xC168000FE0000000, !dbg !36 + %4657 = fneg float %4656, !dbg !36 + %4658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1793 = icmp eq i32 %4658, 0, !dbg !36 + %4659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3FF7154760000000, float %4657) #6, !dbg !36 + %4660 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3FF7154760000000, float %4657) #6, !dbg !36 + %.0.i1794 = select i1 %.not3.i1793, float %4660, float %4659, !dbg !36 + %4661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1795 = icmp eq i32 %4661, 0, !dbg !36 + %4662 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3E54AE0C00000000, float %.0.i1794) #6, !dbg !36 + %4663 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3E54AE0C00000000, float %.0.i1794) #6, !dbg !36 + %.01.i1796 = select i1 %.not4.i1795, float %4663, float %4662, !dbg !36 + %4664 = bitcast float %.04.i1792 to i32, !dbg !36 + %4665 = shl i32 %4664, 23, !dbg !36 + %4666 = bitcast i32 %4665 to float, !dbg !36 + %4667 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1796) #6, !dbg !36 + %4668 = fmul float %4667, %4666, !dbg !36 + %4669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1797 = icmp eq i32 %4669, 0, !dbg !36 + %4670 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4671 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1798 = select i1 %.not.i1797, float %4671, float %4670, !dbg !36 + %4672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1799 = icmp eq i32 %4672, 0, !dbg !36 + %4673 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1798) #6, !dbg !36 + %4674 = tail call float @llvm.nvvm.saturate.f(float %.02.i1798) #6, !dbg !36 + %.03.i1800 = select i1 %.not1.i1799, float %4674, float %4673, !dbg !36 + %4675 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1801 = icmp eq i32 %4675, 0, !dbg !36 + %4676 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4677 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1802 = select i1 %.not2.i1801, float %4677, float %4676, !dbg !36 + %4678 = fadd float %.04.i1802, 0xC168000FE0000000, !dbg !36 + %4679 = fneg float %4678, !dbg !36 + %4680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1803 = icmp eq i32 %4680, 0, !dbg !36 + %4681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3FF7154760000000, float %4679) #6, !dbg !36 + %4682 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3FF7154760000000, float %4679) #6, !dbg !36 + %.0.i1804 = select i1 %.not3.i1803, float %4682, float %4681, !dbg !36 + %4683 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1805 = icmp eq i32 %4683, 0, !dbg !36 + %4684 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3E54AE0C00000000, float %.0.i1804) #6, !dbg !36 + %4685 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3E54AE0C00000000, float %.0.i1804) #6, !dbg !36 + %.01.i1806 = select i1 %.not4.i1805, float %4685, float %4684, !dbg !36 + %4686 = bitcast float %.04.i1802 to i32, !dbg !36 + %4687 = shl i32 %4686, 23, !dbg !36 + %4688 = bitcast i32 %4687 to float, !dbg !36 + %4689 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1806) #6, !dbg !36 + %4690 = fmul float %4689, %4688, !dbg !36 + %4691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1807 = icmp eq i32 %4691, 0, !dbg !36 + %4692 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4693 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1808 = select i1 %.not.i1807, float %4693, float %4692, !dbg !36 + %4694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1809 = icmp eq i32 %4694, 0, !dbg !36 + %4695 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1808) #6, !dbg !36 + %4696 = tail call float @llvm.nvvm.saturate.f(float %.02.i1808) #6, !dbg !36 + %.03.i1810 = select i1 %.not1.i1809, float %4696, float %4695, !dbg !36 + %4697 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1811 = icmp eq i32 %4697, 0, !dbg !36 + %4698 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4699 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1812 = select i1 %.not2.i1811, float %4699, float %4698, !dbg !36 + %4700 = fadd float %.04.i1812, 0xC168000FE0000000, !dbg !36 + %4701 = fneg float %4700, !dbg !36 + %4702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1813 = icmp eq i32 %4702, 0, !dbg !36 + %4703 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3FF7154760000000, float %4701) #6, !dbg !36 + %4704 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3FF7154760000000, float %4701) #6, !dbg !36 + %.0.i1814 = select i1 %.not3.i1813, float %4704, float %4703, !dbg !36 + %4705 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1815 = icmp eq i32 %4705, 0, !dbg !36 + %4706 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3E54AE0C00000000, float %.0.i1814) #6, !dbg !36 + %4707 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3E54AE0C00000000, float %.0.i1814) #6, !dbg !36 + %.01.i1816 = select i1 %.not4.i1815, float %4707, float %4706, !dbg !36 + %4708 = bitcast float %.04.i1812 to i32, !dbg !36 + %4709 = shl i32 %4708, 23, !dbg !36 + %4710 = bitcast i32 %4709 to float, !dbg !36 + %4711 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1816) #6, !dbg !36 + %4712 = fmul float %4711, %4710, !dbg !36 + %4713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1817 = icmp eq i32 %4713, 0, !dbg !36 + %4714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4715 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1818 = select i1 %.not.i1817, float %4715, float %4714, !dbg !36 + %4716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1819 = icmp eq i32 %4716, 0, !dbg !36 + %4717 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1818) #6, !dbg !36 + %4718 = tail call float @llvm.nvvm.saturate.f(float %.02.i1818) #6, !dbg !36 + %.03.i1820 = select i1 %.not1.i1819, float %4718, float %4717, !dbg !36 + %4719 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1821 = icmp eq i32 %4719, 0, !dbg !36 + %4720 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4721 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1822 = select i1 %.not2.i1821, float %4721, float %4720, !dbg !36 + %4722 = fadd float %.04.i1822, 0xC168000FE0000000, !dbg !36 + %4723 = fneg float %4722, !dbg !36 + %4724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1823 = icmp eq i32 %4724, 0, !dbg !36 + %4725 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3FF7154760000000, float %4723) #6, !dbg !36 + %4726 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3FF7154760000000, float %4723) #6, !dbg !36 + %.0.i1824 = select i1 %.not3.i1823, float %4726, float %4725, !dbg !36 + %4727 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1825 = icmp eq i32 %4727, 0, !dbg !36 + %4728 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3E54AE0C00000000, float %.0.i1824) #6, !dbg !36 + %4729 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3E54AE0C00000000, float %.0.i1824) #6, !dbg !36 + %.01.i1826 = select i1 %.not4.i1825, float %4729, float %4728, !dbg !36 + %4730 = bitcast float %.04.i1822 to i32, !dbg !36 + %4731 = shl i32 %4730, 23, !dbg !36 + %4732 = bitcast i32 %4731 to float, !dbg !36 + %4733 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1826) #6, !dbg !36 + %4734 = fmul float %4733, %4732, !dbg !36 + %4735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1827 = icmp eq i32 %4735, 0, !dbg !36 + %4736 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4737 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1828 = select i1 %.not.i1827, float %4737, float %4736, !dbg !36 + %4738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1829 = icmp eq i32 %4738, 0, !dbg !36 + %4739 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1828) #6, !dbg !36 + %4740 = tail call float @llvm.nvvm.saturate.f(float %.02.i1828) #6, !dbg !36 + %.03.i1830 = select i1 %.not1.i1829, float %4740, float %4739, !dbg !36 + %4741 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1831 = icmp eq i32 %4741, 0, !dbg !36 + %4742 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4743 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1832 = select i1 %.not2.i1831, float %4743, float %4742, !dbg !36 + %4744 = fadd float %.04.i1832, 0xC168000FE0000000, !dbg !36 + %4745 = fneg float %4744, !dbg !36 + %4746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1833 = icmp eq i32 %4746, 0, !dbg !36 + %4747 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3FF7154760000000, float %4745) #6, !dbg !36 + %4748 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3FF7154760000000, float %4745) #6, !dbg !36 + %.0.i1834 = select i1 %.not3.i1833, float %4748, float %4747, !dbg !36 + %4749 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1835 = icmp eq i32 %4749, 0, !dbg !36 + %4750 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3E54AE0C00000000, float %.0.i1834) #6, !dbg !36 + %4751 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3E54AE0C00000000, float %.0.i1834) #6, !dbg !36 + %.01.i1836 = select i1 %.not4.i1835, float %4751, float %4750, !dbg !36 + %4752 = bitcast float %.04.i1832 to i32, !dbg !36 + %4753 = shl i32 %4752, 23, !dbg !36 + %4754 = bitcast i32 %4753 to float, !dbg !36 + %4755 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1836) #6, !dbg !36 + %4756 = fmul float %4755, %4754, !dbg !36 + %4757 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1837 = icmp eq i32 %4757, 0, !dbg !36 + %4758 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4759 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1838 = select i1 %.not.i1837, float %4759, float %4758, !dbg !36 + %4760 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1839 = icmp eq i32 %4760, 0, !dbg !36 + %4761 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1838) #6, !dbg !36 + %4762 = tail call float @llvm.nvvm.saturate.f(float %.02.i1838) #6, !dbg !36 + %.03.i1840 = select i1 %.not1.i1839, float %4762, float %4761, !dbg !36 + %4763 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1841 = icmp eq i32 %4763, 0, !dbg !36 + %4764 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4765 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1842 = select i1 %.not2.i1841, float %4765, float %4764, !dbg !36 + %4766 = fadd float %.04.i1842, 0xC168000FE0000000, !dbg !36 + %4767 = fneg float %4766, !dbg !36 + %4768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1843 = icmp eq i32 %4768, 0, !dbg !36 + %4769 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3FF7154760000000, float %4767) #6, !dbg !36 + %4770 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3FF7154760000000, float %4767) #6, !dbg !36 + %.0.i1844 = select i1 %.not3.i1843, float %4770, float %4769, !dbg !36 + %4771 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1845 = icmp eq i32 %4771, 0, !dbg !36 + %4772 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3E54AE0C00000000, float %.0.i1844) #6, !dbg !36 + %4773 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3E54AE0C00000000, float %.0.i1844) #6, !dbg !36 + %.01.i1846 = select i1 %.not4.i1845, float %4773, float %4772, !dbg !36 + %4774 = bitcast float %.04.i1842 to i32, !dbg !36 + %4775 = shl i32 %4774, 23, !dbg !36 + %4776 = bitcast i32 %4775 to float, !dbg !36 + %4777 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1846) #6, !dbg !36 + %4778 = fmul float %4777, %4776, !dbg !36 + %4779 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1847 = icmp eq i32 %4779, 0, !dbg !36 + %4780 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4781 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1848 = select i1 %.not.i1847, float %4781, float %4780, !dbg !36 + %4782 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1849 = icmp eq i32 %4782, 0, !dbg !36 + %4783 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1848) #6, !dbg !36 + %4784 = tail call float @llvm.nvvm.saturate.f(float %.02.i1848) #6, !dbg !36 + %.03.i1850 = select i1 %.not1.i1849, float %4784, float %4783, !dbg !36 + %4785 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1851 = icmp eq i32 %4785, 0, !dbg !36 + %4786 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4787 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1852 = select i1 %.not2.i1851, float %4787, float %4786, !dbg !36 + %4788 = fadd float %.04.i1852, 0xC168000FE0000000, !dbg !36 + %4789 = fneg float %4788, !dbg !36 + %4790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1853 = icmp eq i32 %4790, 0, !dbg !36 + %4791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3FF7154760000000, float %4789) #6, !dbg !36 + %4792 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3FF7154760000000, float %4789) #6, !dbg !36 + %.0.i1854 = select i1 %.not3.i1853, float %4792, float %4791, !dbg !36 + %4793 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1855 = icmp eq i32 %4793, 0, !dbg !36 + %4794 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3E54AE0C00000000, float %.0.i1854) #6, !dbg !36 + %4795 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3E54AE0C00000000, float %.0.i1854) #6, !dbg !36 + %.01.i1856 = select i1 %.not4.i1855, float %4795, float %4794, !dbg !36 + %4796 = bitcast float %.04.i1852 to i32, !dbg !36 + %4797 = shl i32 %4796, 23, !dbg !36 + %4798 = bitcast i32 %4797 to float, !dbg !36 + %4799 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1856) #6, !dbg !36 + %4800 = fmul float %4799, %4798, !dbg !36 + %4801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1857 = icmp eq i32 %4801, 0, !dbg !36 + %4802 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4803 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1858 = select i1 %.not.i1857, float %4803, float %4802, !dbg !36 + %4804 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1859 = icmp eq i32 %4804, 0, !dbg !36 + %4805 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1858) #6, !dbg !36 + %4806 = tail call float @llvm.nvvm.saturate.f(float %.02.i1858) #6, !dbg !36 + %.03.i1860 = select i1 %.not1.i1859, float %4806, float %4805, !dbg !36 + %4807 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1861 = icmp eq i32 %4807, 0, !dbg !36 + %4808 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4809 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1862 = select i1 %.not2.i1861, float %4809, float %4808, !dbg !36 + %4810 = fadd float %.04.i1862, 0xC168000FE0000000, !dbg !36 + %4811 = fneg float %4810, !dbg !36 + %4812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1863 = icmp eq i32 %4812, 0, !dbg !36 + %4813 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3FF7154760000000, float %4811) #6, !dbg !36 + %4814 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3FF7154760000000, float %4811) #6, !dbg !36 + %.0.i1864 = select i1 %.not3.i1863, float %4814, float %4813, !dbg !36 + %4815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1865 = icmp eq i32 %4815, 0, !dbg !36 + %4816 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3E54AE0C00000000, float %.0.i1864) #6, !dbg !36 + %4817 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3E54AE0C00000000, float %.0.i1864) #6, !dbg !36 + %.01.i1866 = select i1 %.not4.i1865, float %4817, float %4816, !dbg !36 + %4818 = bitcast float %.04.i1862 to i32, !dbg !36 + %4819 = shl i32 %4818, 23, !dbg !36 + %4820 = bitcast i32 %4819 to float, !dbg !36 + %4821 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1866) #6, !dbg !36 + %4822 = fmul float %4821, %4820, !dbg !36 + %4823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1867 = icmp eq i32 %4823, 0, !dbg !36 + %4824 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4825 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1868 = select i1 %.not.i1867, float %4825, float %4824, !dbg !36 + %4826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1869 = icmp eq i32 %4826, 0, !dbg !36 + %4827 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1868) #6, !dbg !36 + %4828 = tail call float @llvm.nvvm.saturate.f(float %.02.i1868) #6, !dbg !36 + %.03.i1870 = select i1 %.not1.i1869, float %4828, float %4827, !dbg !36 + %4829 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1871 = icmp eq i32 %4829, 0, !dbg !36 + %4830 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4831 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1872 = select i1 %.not2.i1871, float %4831, float %4830, !dbg !36 + %4832 = fadd float %.04.i1872, 0xC168000FE0000000, !dbg !36 + %4833 = fneg float %4832, !dbg !36 + %4834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1873 = icmp eq i32 %4834, 0, !dbg !36 + %4835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3FF7154760000000, float %4833) #6, !dbg !36 + %4836 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3FF7154760000000, float %4833) #6, !dbg !36 + %.0.i1874 = select i1 %.not3.i1873, float %4836, float %4835, !dbg !36 + %4837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1875 = icmp eq i32 %4837, 0, !dbg !36 + %4838 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3E54AE0C00000000, float %.0.i1874) #6, !dbg !36 + %4839 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3E54AE0C00000000, float %.0.i1874) #6, !dbg !36 + %.01.i1876 = select i1 %.not4.i1875, float %4839, float %4838, !dbg !36 + %4840 = bitcast float %.04.i1872 to i32, !dbg !36 + %4841 = shl i32 %4840, 23, !dbg !36 + %4842 = bitcast i32 %4841 to float, !dbg !36 + %4843 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1876) #6, !dbg !36 + %4844 = fmul float %4843, %4842, !dbg !36 + %4845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1877 = icmp eq i32 %4845, 0, !dbg !36 + %4846 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4847 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1878 = select i1 %.not.i1877, float %4847, float %4846, !dbg !36 + %4848 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1879 = icmp eq i32 %4848, 0, !dbg !36 + %4849 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1878) #6, !dbg !36 + %4850 = tail call float @llvm.nvvm.saturate.f(float %.02.i1878) #6, !dbg !36 + %.03.i1880 = select i1 %.not1.i1879, float %4850, float %4849, !dbg !36 + %4851 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1881 = icmp eq i32 %4851, 0, !dbg !36 + %4852 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4853 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1882 = select i1 %.not2.i1881, float %4853, float %4852, !dbg !36 + %4854 = fadd float %.04.i1882, 0xC168000FE0000000, !dbg !36 + %4855 = fneg float %4854, !dbg !36 + %4856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1883 = icmp eq i32 %4856, 0, !dbg !36 + %4857 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3FF7154760000000, float %4855) #6, !dbg !36 + %4858 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3FF7154760000000, float %4855) #6, !dbg !36 + %.0.i1884 = select i1 %.not3.i1883, float %4858, float %4857, !dbg !36 + %4859 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1885 = icmp eq i32 %4859, 0, !dbg !36 + %4860 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3E54AE0C00000000, float %.0.i1884) #6, !dbg !36 + %4861 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3E54AE0C00000000, float %.0.i1884) #6, !dbg !36 + %.01.i1886 = select i1 %.not4.i1885, float %4861, float %4860, !dbg !36 + %4862 = bitcast float %.04.i1882 to i32, !dbg !36 + %4863 = shl i32 %4862, 23, !dbg !36 + %4864 = bitcast i32 %4863 to float, !dbg !36 + %4865 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1886) #6, !dbg !36 + %4866 = fmul float %4865, %4864, !dbg !36 + %4867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1887 = icmp eq i32 %4867, 0, !dbg !36 + %4868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4869 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1888 = select i1 %.not.i1887, float %4869, float %4868, !dbg !36 + %4870 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1889 = icmp eq i32 %4870, 0, !dbg !36 + %4871 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1888) #6, !dbg !36 + %4872 = tail call float @llvm.nvvm.saturate.f(float %.02.i1888) #6, !dbg !36 + %.03.i1890 = select i1 %.not1.i1889, float %4872, float %4871, !dbg !36 + %4873 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1891 = icmp eq i32 %4873, 0, !dbg !36 + %4874 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4875 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1892 = select i1 %.not2.i1891, float %4875, float %4874, !dbg !36 + %4876 = fadd float %.04.i1892, 0xC168000FE0000000, !dbg !36 + %4877 = fneg float %4876, !dbg !36 + %4878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1893 = icmp eq i32 %4878, 0, !dbg !36 + %4879 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3FF7154760000000, float %4877) #6, !dbg !36 + %4880 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3FF7154760000000, float %4877) #6, !dbg !36 + %.0.i1894 = select i1 %.not3.i1893, float %4880, float %4879, !dbg !36 + %4881 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1895 = icmp eq i32 %4881, 0, !dbg !36 + %4882 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3E54AE0C00000000, float %.0.i1894) #6, !dbg !36 + %4883 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3E54AE0C00000000, float %.0.i1894) #6, !dbg !36 + %.01.i1896 = select i1 %.not4.i1895, float %4883, float %4882, !dbg !36 + %4884 = bitcast float %.04.i1892 to i32, !dbg !36 + %4885 = shl i32 %4884, 23, !dbg !36 + %4886 = bitcast i32 %4885 to float, !dbg !36 + %4887 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1896) #6, !dbg !36 + %4888 = fmul float %4887, %4886, !dbg !36 + %4889 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1897 = icmp eq i32 %4889, 0, !dbg !36 + %4890 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4891 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1898 = select i1 %.not.i1897, float %4891, float %4890, !dbg !36 + %4892 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1899 = icmp eq i32 %4892, 0, !dbg !36 + %4893 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1898) #6, !dbg !36 + %4894 = tail call float @llvm.nvvm.saturate.f(float %.02.i1898) #6, !dbg !36 + %.03.i1900 = select i1 %.not1.i1899, float %4894, float %4893, !dbg !36 + %4895 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1901 = icmp eq i32 %4895, 0, !dbg !36 + %4896 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4897 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1902 = select i1 %.not2.i1901, float %4897, float %4896, !dbg !36 + %4898 = fadd float %.04.i1902, 0xC168000FE0000000, !dbg !36 + %4899 = fneg float %4898, !dbg !36 + %4900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1903 = icmp eq i32 %4900, 0, !dbg !36 + %4901 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3FF7154760000000, float %4899) #6, !dbg !36 + %4902 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3FF7154760000000, float %4899) #6, !dbg !36 + %.0.i1904 = select i1 %.not3.i1903, float %4902, float %4901, !dbg !36 + %4903 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1905 = icmp eq i32 %4903, 0, !dbg !36 + %4904 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3E54AE0C00000000, float %.0.i1904) #6, !dbg !36 + %4905 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3E54AE0C00000000, float %.0.i1904) #6, !dbg !36 + %.01.i1906 = select i1 %.not4.i1905, float %4905, float %4904, !dbg !36 + %4906 = bitcast float %.04.i1902 to i32, !dbg !36 + %4907 = shl i32 %4906, 23, !dbg !36 + %4908 = bitcast i32 %4907 to float, !dbg !36 + %4909 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1906) #6, !dbg !36 + %4910 = fmul float %4909, %4908, !dbg !36 + %4911 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1907 = icmp eq i32 %4911, 0, !dbg !36 + %4912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4913 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1908 = select i1 %.not.i1907, float %4913, float %4912, !dbg !36 + %4914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1909 = icmp eq i32 %4914, 0, !dbg !36 + %4915 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1908) #6, !dbg !36 + %4916 = tail call float @llvm.nvvm.saturate.f(float %.02.i1908) #6, !dbg !36 + %.03.i1910 = select i1 %.not1.i1909, float %4916, float %4915, !dbg !36 + %4917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1911 = icmp eq i32 %4917, 0, !dbg !36 + %4918 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4919 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1912 = select i1 %.not2.i1911, float %4919, float %4918, !dbg !36 + %4920 = fadd float %.04.i1912, 0xC168000FE0000000, !dbg !36 + %4921 = fneg float %4920, !dbg !36 + %4922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1913 = icmp eq i32 %4922, 0, !dbg !36 + %4923 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3FF7154760000000, float %4921) #6, !dbg !36 + %4924 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3FF7154760000000, float %4921) #6, !dbg !36 + %.0.i1914 = select i1 %.not3.i1913, float %4924, float %4923, !dbg !36 + %4925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1915 = icmp eq i32 %4925, 0, !dbg !36 + %4926 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3E54AE0C00000000, float %.0.i1914) #6, !dbg !36 + %4927 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3E54AE0C00000000, float %.0.i1914) #6, !dbg !36 + %.01.i1916 = select i1 %.not4.i1915, float %4927, float %4926, !dbg !36 + %4928 = bitcast float %.04.i1912 to i32, !dbg !36 + %4929 = shl i32 %4928, 23, !dbg !36 + %4930 = bitcast i32 %4929 to float, !dbg !36 + %4931 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1916) #6, !dbg !36 + %4932 = fmul float %4931, %4930, !dbg !36 + %4933 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1917 = icmp eq i32 %4933, 0, !dbg !36 + %4934 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4935 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1918 = select i1 %.not.i1917, float %4935, float %4934, !dbg !36 + %4936 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1919 = icmp eq i32 %4936, 0, !dbg !36 + %4937 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1918) #6, !dbg !36 + %4938 = tail call float @llvm.nvvm.saturate.f(float %.02.i1918) #6, !dbg !36 + %.03.i1920 = select i1 %.not1.i1919, float %4938, float %4937, !dbg !36 + %4939 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1921 = icmp eq i32 %4939, 0, !dbg !36 + %4940 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4941 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1922 = select i1 %.not2.i1921, float %4941, float %4940, !dbg !36 + %4942 = fadd float %.04.i1922, 0xC168000FE0000000, !dbg !36 + %4943 = fneg float %4942, !dbg !36 + %4944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1923 = icmp eq i32 %4944, 0, !dbg !36 + %4945 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3FF7154760000000, float %4943) #6, !dbg !36 + %4946 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3FF7154760000000, float %4943) #6, !dbg !36 + %.0.i1924 = select i1 %.not3.i1923, float %4946, float %4945, !dbg !36 + %4947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1925 = icmp eq i32 %4947, 0, !dbg !36 + %4948 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3E54AE0C00000000, float %.0.i1924) #6, !dbg !36 + %4949 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3E54AE0C00000000, float %.0.i1924) #6, !dbg !36 + %.01.i1926 = select i1 %.not4.i1925, float %4949, float %4948, !dbg !36 + %4950 = bitcast float %.04.i1922 to i32, !dbg !36 + %4951 = shl i32 %4950, 23, !dbg !36 + %4952 = bitcast i32 %4951 to float, !dbg !36 + %4953 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1926) #6, !dbg !36 + %4954 = fmul float %4953, %4952, !dbg !36 + %4955 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1927 = icmp eq i32 %4955, 0, !dbg !36 + %4956 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4957 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1928 = select i1 %.not.i1927, float %4957, float %4956, !dbg !36 + %4958 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1929 = icmp eq i32 %4958, 0, !dbg !36 + %4959 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1928) #6, !dbg !36 + %4960 = tail call float @llvm.nvvm.saturate.f(float %.02.i1928) #6, !dbg !36 + %.03.i1930 = select i1 %.not1.i1929, float %4960, float %4959, !dbg !36 + %4961 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1931 = icmp eq i32 %4961, 0, !dbg !36 + %4962 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4963 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1932 = select i1 %.not2.i1931, float %4963, float %4962, !dbg !36 + %4964 = fadd float %.04.i1932, 0xC168000FE0000000, !dbg !36 + %4965 = fneg float %4964, !dbg !36 + %4966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1933 = icmp eq i32 %4966, 0, !dbg !36 + %4967 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3FF7154760000000, float %4965) #6, !dbg !36 + %4968 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3FF7154760000000, float %4965) #6, !dbg !36 + %.0.i1934 = select i1 %.not3.i1933, float %4968, float %4967, !dbg !36 + %4969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1935 = icmp eq i32 %4969, 0, !dbg !36 + %4970 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3E54AE0C00000000, float %.0.i1934) #6, !dbg !36 + %4971 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3E54AE0C00000000, float %.0.i1934) #6, !dbg !36 + %.01.i1936 = select i1 %.not4.i1935, float %4971, float %4970, !dbg !36 + %4972 = bitcast float %.04.i1932 to i32, !dbg !36 + %4973 = shl i32 %4972, 23, !dbg !36 + %4974 = bitcast i32 %4973 to float, !dbg !36 + %4975 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1936) #6, !dbg !36 + %4976 = fmul float %4975, %4974, !dbg !36 + %4977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1937 = icmp eq i32 %4977, 0, !dbg !36 + %4978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4979 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1938 = select i1 %.not.i1937, float %4979, float %4978, !dbg !36 + %4980 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1939 = icmp eq i32 %4980, 0, !dbg !36 + %4981 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1938) #6, !dbg !36 + %4982 = tail call float @llvm.nvvm.saturate.f(float %.02.i1938) #6, !dbg !36 + %.03.i1940 = select i1 %.not1.i1939, float %4982, float %4981, !dbg !36 + %4983 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1941 = icmp eq i32 %4983, 0, !dbg !36 + %4984 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4985 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1942 = select i1 %.not2.i1941, float %4985, float %4984, !dbg !36 + %4986 = fadd float %.04.i1942, 0xC168000FE0000000, !dbg !36 + %4987 = fneg float %4986, !dbg !36 + %4988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1943 = icmp eq i32 %4988, 0, !dbg !36 + %4989 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3FF7154760000000, float %4987) #6, !dbg !36 + %4990 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3FF7154760000000, float %4987) #6, !dbg !36 + %.0.i1944 = select i1 %.not3.i1943, float %4990, float %4989, !dbg !36 + %4991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1945 = icmp eq i32 %4991, 0, !dbg !36 + %4992 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3E54AE0C00000000, float %.0.i1944) #6, !dbg !36 + %4993 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3E54AE0C00000000, float %.0.i1944) #6, !dbg !36 + %.01.i1946 = select i1 %.not4.i1945, float %4993, float %4992, !dbg !36 + %4994 = bitcast float %.04.i1942 to i32, !dbg !36 + %4995 = shl i32 %4994, 23, !dbg !36 + %4996 = bitcast i32 %4995 to float, !dbg !36 + %4997 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1946) #6, !dbg !36 + %4998 = fmul float %4997, %4996, !dbg !36 + %4999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1947 = icmp eq i32 %4999, 0, !dbg !36 + %5000 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5001 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1948 = select i1 %.not.i1947, float %5001, float %5000, !dbg !36 + %5002 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1949 = icmp eq i32 %5002, 0, !dbg !36 + %5003 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1948) #6, !dbg !36 + %5004 = tail call float @llvm.nvvm.saturate.f(float %.02.i1948) #6, !dbg !36 + %.03.i1950 = select i1 %.not1.i1949, float %5004, float %5003, !dbg !36 + %5005 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1951 = icmp eq i32 %5005, 0, !dbg !36 + %5006 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5007 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1952 = select i1 %.not2.i1951, float %5007, float %5006, !dbg !36 + %5008 = fadd float %.04.i1952, 0xC168000FE0000000, !dbg !36 + %5009 = fneg float %5008, !dbg !36 + %5010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1953 = icmp eq i32 %5010, 0, !dbg !36 + %5011 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3FF7154760000000, float %5009) #6, !dbg !36 + %5012 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3FF7154760000000, float %5009) #6, !dbg !36 + %.0.i1954 = select i1 %.not3.i1953, float %5012, float %5011, !dbg !36 + %5013 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1955 = icmp eq i32 %5013, 0, !dbg !36 + %5014 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3E54AE0C00000000, float %.0.i1954) #6, !dbg !36 + %5015 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3E54AE0C00000000, float %.0.i1954) #6, !dbg !36 + %.01.i1956 = select i1 %.not4.i1955, float %5015, float %5014, !dbg !36 + %5016 = bitcast float %.04.i1952 to i32, !dbg !36 + %5017 = shl i32 %5016, 23, !dbg !36 + %5018 = bitcast i32 %5017 to float, !dbg !36 + %5019 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1956) #6, !dbg !36 + %5020 = fmul float %5019, %5018, !dbg !36 + %5021 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1957 = icmp eq i32 %5021, 0, !dbg !36 + %5022 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5023 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1958 = select i1 %.not.i1957, float %5023, float %5022, !dbg !36 + %5024 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1959 = icmp eq i32 %5024, 0, !dbg !36 + %5025 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1958) #6, !dbg !36 + %5026 = tail call float @llvm.nvvm.saturate.f(float %.02.i1958) #6, !dbg !36 + %.03.i1960 = select i1 %.not1.i1959, float %5026, float %5025, !dbg !36 + %5027 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1961 = icmp eq i32 %5027, 0, !dbg !36 + %5028 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5029 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1962 = select i1 %.not2.i1961, float %5029, float %5028, !dbg !36 + %5030 = fadd float %.04.i1962, 0xC168000FE0000000, !dbg !36 + %5031 = fneg float %5030, !dbg !36 + %5032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1963 = icmp eq i32 %5032, 0, !dbg !36 + %5033 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3FF7154760000000, float %5031) #6, !dbg !36 + %5034 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3FF7154760000000, float %5031) #6, !dbg !36 + %.0.i1964 = select i1 %.not3.i1963, float %5034, float %5033, !dbg !36 + %5035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1965 = icmp eq i32 %5035, 0, !dbg !36 + %5036 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3E54AE0C00000000, float %.0.i1964) #6, !dbg !36 + %5037 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3E54AE0C00000000, float %.0.i1964) #6, !dbg !36 + %.01.i1966 = select i1 %.not4.i1965, float %5037, float %5036, !dbg !36 + %5038 = bitcast float %.04.i1962 to i32, !dbg !36 + %5039 = shl i32 %5038, 23, !dbg !36 + %5040 = bitcast i32 %5039 to float, !dbg !36 + %5041 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1966) #6, !dbg !36 + %5042 = fmul float %5041, %5040, !dbg !36 + %5043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1967 = icmp eq i32 %5043, 0, !dbg !36 + %5044 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5045 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1968 = select i1 %.not.i1967, float %5045, float %5044, !dbg !36 + %5046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1969 = icmp eq i32 %5046, 0, !dbg !36 + %5047 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1968) #6, !dbg !36 + %5048 = tail call float @llvm.nvvm.saturate.f(float %.02.i1968) #6, !dbg !36 + %.03.i1970 = select i1 %.not1.i1969, float %5048, float %5047, !dbg !36 + %5049 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1971 = icmp eq i32 %5049, 0, !dbg !36 + %5050 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5051 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1972 = select i1 %.not2.i1971, float %5051, float %5050, !dbg !36 + %5052 = fadd float %.04.i1972, 0xC168000FE0000000, !dbg !36 + %5053 = fneg float %5052, !dbg !36 + %5054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1973 = icmp eq i32 %5054, 0, !dbg !36 + %5055 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3FF7154760000000, float %5053) #6, !dbg !36 + %5056 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3FF7154760000000, float %5053) #6, !dbg !36 + %.0.i1974 = select i1 %.not3.i1973, float %5056, float %5055, !dbg !36 + %5057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1975 = icmp eq i32 %5057, 0, !dbg !36 + %5058 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3E54AE0C00000000, float %.0.i1974) #6, !dbg !36 + %5059 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3E54AE0C00000000, float %.0.i1974) #6, !dbg !36 + %.01.i1976 = select i1 %.not4.i1975, float %5059, float %5058, !dbg !36 + %5060 = bitcast float %.04.i1972 to i32, !dbg !36 + %5061 = shl i32 %5060, 23, !dbg !36 + %5062 = bitcast i32 %5061 to float, !dbg !36 + %5063 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1976) #6, !dbg !36 + %5064 = fmul float %5063, %5062, !dbg !36 + %5065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1977 = icmp eq i32 %5065, 0, !dbg !36 + %5066 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5067 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1978 = select i1 %.not.i1977, float %5067, float %5066, !dbg !36 + %5068 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1979 = icmp eq i32 %5068, 0, !dbg !36 + %5069 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1978) #6, !dbg !36 + %5070 = tail call float @llvm.nvvm.saturate.f(float %.02.i1978) #6, !dbg !36 + %.03.i1980 = select i1 %.not1.i1979, float %5070, float %5069, !dbg !36 + %5071 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5072 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5073 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1983 = icmp eq i32 %5074, 0, !dbg !36 + %5075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1985 = icmp eq i32 %5075, 0, !dbg !36 + %5076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1987 = icmp eq i32 %5076, 0, !dbg !36 + %5077 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5078 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1988 = select i1 %.not.i1987, float %5078, float %5077, !dbg !36 + %5079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1989 = icmp eq i32 %5079, 0, !dbg !36 + %5080 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1988) #6, !dbg !36 + %5081 = tail call float @llvm.nvvm.saturate.f(float %.02.i1988) #6, !dbg !36 + %.03.i1990 = select i1 %.not1.i1989, float %5081, float %5080, !dbg !36 + %5082 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5083 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5084 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1993 = icmp eq i32 %5085, 0, !dbg !36 + %5086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1995 = icmp eq i32 %5086, 0, !dbg !36 + %5087 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1997 = icmp eq i32 %5087, 0, !dbg !36 + %5088 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5089 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1998 = select i1 %.not.i1997, float %5089, float %5088, !dbg !36 + %5090 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1999 = icmp eq i32 %5090, 0, !dbg !36 + %5091 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1998) #6, !dbg !36 + %5092 = tail call float @llvm.nvvm.saturate.f(float %.02.i1998) #6, !dbg !36 + %.03.i2000 = select i1 %.not1.i1999, float %5092, float %5091, !dbg !36 + %5093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5094 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5095 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2003 = icmp eq i32 %5096, 0, !dbg !36 + %5097 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2005 = icmp eq i32 %5097, 0, !dbg !36 + %5098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2007 = icmp eq i32 %5098, 0, !dbg !36 + %5099 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5100 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2008 = select i1 %.not.i2007, float %5100, float %5099, !dbg !36 + %5101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2009 = icmp eq i32 %5101, 0, !dbg !36 + %5102 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2008) #6, !dbg !36 + %5103 = tail call float @llvm.nvvm.saturate.f(float %.02.i2008) #6, !dbg !36 + %.03.i2010 = select i1 %.not1.i2009, float %5103, float %5102, !dbg !36 + %5104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5105 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5106 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2013 = icmp eq i32 %5107, 0, !dbg !36 + %5108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2015 = icmp eq i32 %5108, 0, !dbg !36 + %5109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2017 = icmp eq i32 %5109, 0, !dbg !36 + %5110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5111 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2018 = select i1 %.not.i2017, float %5111, float %5110, !dbg !36 + %5112 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2019 = icmp eq i32 %5112, 0, !dbg !36 + %5113 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2018) #6, !dbg !36 + %5114 = tail call float @llvm.nvvm.saturate.f(float %.02.i2018) #6, !dbg !36 + %.03.i2020 = select i1 %.not1.i2019, float %5114, float %5113, !dbg !36 + %5115 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5116 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5117 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2023 = icmp eq i32 %5118, 0, !dbg !36 + %5119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2025 = icmp eq i32 %5119, 0, !dbg !36 + %5120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2027 = icmp eq i32 %5120, 0, !dbg !36 + %5121 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5122 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2028 = select i1 %.not.i2027, float %5122, float %5121, !dbg !36 + %5123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2029 = icmp eq i32 %5123, 0, !dbg !36 + %5124 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2028) #6, !dbg !36 + %5125 = tail call float @llvm.nvvm.saturate.f(float %.02.i2028) #6, !dbg !36 + %.03.i2030 = select i1 %.not1.i2029, float %5125, float %5124, !dbg !36 + %5126 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5127 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5128 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2033 = icmp eq i32 %5129, 0, !dbg !36 + %5130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2035 = icmp eq i32 %5130, 0, !dbg !36 + %5131 = fmul float %3210, %3810, !dbg !37 + %5132 = fmul float %3212, %3832, !dbg !37 + %5133 = fmul float %3214, %3854, !dbg !37 + %5134 = fmul float %3216, %3876, !dbg !37 + %5135 = fmul float %3218, %3898, !dbg !37 + %5136 = fmul float %3220, %3920, !dbg !37 + %5137 = fmul float %3222, %3942, !dbg !37 + %5138 = fmul float %3224, %3964, !dbg !37 + %5139 = fmul float %3226, %3986, !dbg !37 + %5140 = fmul float %3228, %4008, !dbg !37 + %5141 = fmul float %3230, %4030, !dbg !37 + %5142 = fmul float %3232, %4052, !dbg !37 + %5143 = fmul float %3234, %4074, !dbg !37 + %5144 = fmul float %3236, %4096, !dbg !37 + %5145 = fmul float %3238, %4118, !dbg !37 + %5146 = fmul float %3240, %4140, !dbg !37 + %5147 = fmul float %3242, %4162, !dbg !37 + %5148 = fmul float %3244, %4184, !dbg !37 + %5149 = fmul float %3246, %4206, !dbg !37 + %5150 = fmul float %3248, %4228, !dbg !37 + %5151 = fmul float %3250, %4250, !dbg !37 + %5152 = fmul float %3252, %4272, !dbg !37 + %5153 = fmul float %3254, %4294, !dbg !37 + %5154 = fmul float %3256, %4316, !dbg !37 + %5155 = fmul float %3258, %4338, !dbg !37 + %5156 = fmul float %3260, %4360, !dbg !37 + %5157 = fmul float %3262, %4382, !dbg !37 + %5158 = fmul float %3264, %4404, !dbg !37 + %5159 = fmul float %3266, %4426, !dbg !37 + %5160 = fmul float %3268, %4448, !dbg !37 + %5161 = fmul float %3270, %4470, !dbg !37 + %5162 = fmul float %3272, %4492, !dbg !37 + %5163 = fmul float %3274, %4514, !dbg !37 + %5164 = fmul float %3276, %4536, !dbg !37 + %5165 = fmul float %3278, %4558, !dbg !37 + %5166 = fmul float %3280, %4580, !dbg !37 + %5167 = fmul float %3282, %4602, !dbg !37 + %5168 = fmul float %3284, %4624, !dbg !37 + %5169 = fmul float %3286, %4646, !dbg !37 + %5170 = fmul float %3288, %4668, !dbg !37 + %5171 = fmul float %3290, %4690, !dbg !37 + %5172 = fmul float %3292, %4712, !dbg !37 + %5173 = fmul float %3294, %4734, !dbg !37 + %5174 = fmul float %3296, %4756, !dbg !37 + %5175 = fmul float %3298, %4778, !dbg !37 + %5176 = fmul float %3300, %4800, !dbg !37 + %5177 = fmul float %3302, %4822, !dbg !37 + %5178 = fmul float %3304, %4844, !dbg !37 + %5179 = fmul float %3306, %4866, !dbg !37 + %5180 = fmul float %3308, %4888, !dbg !37 + %5181 = fmul float %3310, %4910, !dbg !37 + %5182 = fmul float %3312, %4932, !dbg !37 + %5183 = fmul float %3314, %4954, !dbg !37 + %5184 = fmul float %3316, %4976, !dbg !37 + %5185 = fmul float %3318, %4998, !dbg !37 + %5186 = fmul float %3320, %5020, !dbg !37 + %5187 = fmul float %3333, %5042, !dbg !37 + %5188 = fmul float %3334, %5064, !dbg !37 + %5189 = fcmp oeq <2 x bfloat> %204, splat (bfloat 0xRFF80), !dbg !16 + %5190 = insertelement <2 x i32> poison, i32 %1685, i64 0, !dbg !21 + %5191 = insertelement <2 x i32> %5190, i32 %1696, i64 1, !dbg !21 + %5192 = icmp eq <2 x i32> %5191, zeroinitializer, !dbg !21 + %5193 = insertelement <2 x float> poison, float %1687, i64 0, !dbg !21 + %5194 = insertelement <2 x float> %5193, float %1698, i64 1, !dbg !21 + %5195 = insertelement <2 x float> poison, float %1686, i64 0, !dbg !21 + %5196 = insertelement <2 x float> %5195, float %1697, i64 1, !dbg !21 + %5197 = select <2 x i1> %5192, <2 x float> %5194, <2 x float> %5196, !dbg !21 + %5198 = extractelement <2 x float> %5197, i64 0, !dbg !21 + %5199 = fadd float %5198, 0xC168000FE0000000, !dbg !21 + %5200 = fneg float %5199, !dbg !21 + %5201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3FF7154760000000, float %5200) #6, !dbg !21 + %5202 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3FF7154760000000, float %5200) #6, !dbg !21 + %.0.i704 = select i1 %.not3.i703, float %5202, float %5201, !dbg !21 + %5203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3E54AE0C00000000, float %.0.i704) #6, !dbg !21 + %5204 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3E54AE0C00000000, float %.0.i704) #6, !dbg !21 + %.01.i706 = select i1 %.not4.i705, float %5204, float %5203, !dbg !21 + %5205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i706) #6, !dbg !21 + %5206 = extractelement <2 x float> %5197, i64 1, !dbg !21 + %5207 = fadd float %5206, 0xC168000FE0000000, !dbg !21 + %5208 = fneg float %5207, !dbg !21 + %5209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3FF7154760000000, float %5208) #6, !dbg !21 + %5210 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3FF7154760000000, float %5208) #6, !dbg !21 + %.0.i714 = select i1 %.not3.i713, float %5210, float %5209, !dbg !21 + %5211 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3E54AE0C00000000, float %.0.i714) #6, !dbg !21 + %5212 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3E54AE0C00000000, float %.0.i714) #6, !dbg !21 + %.01.i716 = select i1 %.not4.i715, float %5212, float %5211, !dbg !21 + %5213 = bitcast <2 x float> %5197 to <2 x i32>, !dbg !21 + %5214 = shl <2 x i32> %5213, splat (i32 23), !dbg !21 + %5215 = bitcast <2 x i32> %5214 to <2 x float>, !dbg !21 + %5216 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i716) #6, !dbg !21 + %5217 = insertelement <2 x float> poison, float %5205, i64 0, !dbg !21 + %5218 = insertelement <2 x float> %5217, float %5216, i64 1, !dbg !21 + %5219 = fmul <2 x float> %5218, %5215, !dbg !21 + %5220 = insertelement <2 x i32> poison, i32 %3091, i64 0, !dbg !21 + %5221 = insertelement <2 x i32> %5220, i32 %3102, i64 1, !dbg !21 + %5222 = icmp eq <2 x i32> %5221, zeroinitializer, !dbg !21 + %5223 = insertelement <2 x float> poison, float %3093, i64 0, !dbg !21 + %5224 = insertelement <2 x float> %5223, float %3104, i64 1, !dbg !21 + %5225 = insertelement <2 x float> poison, float %3092, i64 0, !dbg !21 + %5226 = insertelement <2 x float> %5225, float %3103, i64 1, !dbg !21 + %5227 = select <2 x i1> %5222, <2 x float> %5224, <2 x float> %5226, !dbg !21 + %5228 = extractelement <2 x float> %5227, i64 0, !dbg !21 + %5229 = fadd float %5228, 0xC168000FE0000000, !dbg !21 + %5230 = fneg float %5229, !dbg !21 + %5231 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3FF7154760000000, float %5230) #6, !dbg !21 + %5232 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3FF7154760000000, float %5230) #6, !dbg !21 + %.0.i1344 = select i1 %.not3.i1343, float %5232, float %5231, !dbg !21 + %5233 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3E54AE0C00000000, float %.0.i1344) #6, !dbg !21 + %5234 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3E54AE0C00000000, float %.0.i1344) #6, !dbg !21 + %.01.i1346 = select i1 %.not4.i1345, float %5234, float %5233, !dbg !21 + %5235 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1346) #6, !dbg !21 + %5236 = extractelement <2 x float> %5227, i64 1, !dbg !21 + %5237 = fadd float %5236, 0xC168000FE0000000, !dbg !21 + %5238 = fneg float %5237, !dbg !21 + %5239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3FF7154760000000, float %5238) #6, !dbg !21 + %5240 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3FF7154760000000, float %5238) #6, !dbg !21 + %.0.i1354 = select i1 %.not3.i1353, float %5240, float %5239, !dbg !21 + %5241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3E54AE0C00000000, float %.0.i1354) #6, !dbg !21 + %5242 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3E54AE0C00000000, float %.0.i1354) #6, !dbg !21 + %.01.i1356 = select i1 %.not4.i1355, float %5242, float %5241, !dbg !21 + %5243 = bitcast <2 x float> %5227 to <2 x i32>, !dbg !21 + %5244 = shl <2 x i32> %5243, splat (i32 23), !dbg !21 + %5245 = bitcast <2 x i32> %5244 to <2 x float>, !dbg !21 + %5246 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1356) #6, !dbg !21 + %5247 = insertelement <2 x float> poison, float %5235, i64 0, !dbg !21 + %5248 = insertelement <2 x float> %5247, float %5246, i64 1, !dbg !21 + %5249 = fmul <2 x float> %5248, %5245, !dbg !21 + %5250 = fmul <2 x float> %5219, zeroinitializer, !dbg !23 + %5251 = fadd <2 x float> %5250, %5249, !dbg !24 + %5252 = select <2 x i1> %5189, <2 x float> splat (float 1.000000e+00), <2 x float> %5251, !dbg !24 + %5253 = insertelement <2 x i1> poison, i1 %16, i64 0, !dbg !26 + %5254 = shufflevector <2 x i1> %5253, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !26 + %5255 = select <2 x i1> %5254, <2 x float> %5252, <2 x float> zeroinitializer, !dbg !26 + %5256 = insertelement <2 x i32> poison, i32 %5071, i64 0, !dbg !36 + %5257 = insertelement <2 x i32> %5256, i32 %5082, i64 1, !dbg !36 + %5258 = icmp eq <2 x i32> %5257, zeroinitializer, !dbg !36 + %5259 = insertelement <2 x float> poison, float %5073, i64 0, !dbg !36 + %5260 = insertelement <2 x float> %5259, float %5084, i64 1, !dbg !36 + %5261 = insertelement <2 x float> poison, float %5072, i64 0, !dbg !36 + %5262 = insertelement <2 x float> %5261, float %5083, i64 1, !dbg !36 + %5263 = select <2 x i1> %5258, <2 x float> %5260, <2 x float> %5262, !dbg !36 + %5264 = extractelement <2 x float> %5263, i64 0, !dbg !36 + %5265 = fadd float %5264, 0xC168000FE0000000, !dbg !36 + %5266 = fneg float %5265, !dbg !36 + %5267 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3FF7154760000000, float %5266) #6, !dbg !36 + %5268 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3FF7154760000000, float %5266) #6, !dbg !36 + %.0.i1984 = select i1 %.not3.i1983, float %5268, float %5267, !dbg !36 + %5269 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3E54AE0C00000000, float %.0.i1984) #6, !dbg !36 + %5270 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3E54AE0C00000000, float %.0.i1984) #6, !dbg !36 + %.01.i1986 = select i1 %.not4.i1985, float %5270, float %5269, !dbg !36 + %5271 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1986) #6, !dbg !36 + %5272 = extractelement <2 x float> %5263, i64 1, !dbg !36 + %5273 = fadd float %5272, 0xC168000FE0000000, !dbg !36 + %5274 = fneg float %5273, !dbg !36 + %5275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3FF7154760000000, float %5274) #6, !dbg !36 + %5276 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3FF7154760000000, float %5274) #6, !dbg !36 + %.0.i1994 = select i1 %.not3.i1993, float %5276, float %5275, !dbg !36 + %5277 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3E54AE0C00000000, float %.0.i1994) #6, !dbg !36 + %5278 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3E54AE0C00000000, float %.0.i1994) #6, !dbg !36 + %.01.i1996 = select i1 %.not4.i1995, float %5278, float %5277, !dbg !36 + %5279 = bitcast <2 x float> %5263 to <2 x i32>, !dbg !36 + %5280 = shl <2 x i32> %5279, splat (i32 23), !dbg !36 + %5281 = bitcast <2 x i32> %5280 to <2 x float>, !dbg !36 + %5282 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1996) #6, !dbg !36 + %5283 = insertelement <2 x float> poison, float %5271, i64 0, !dbg !36 + %5284 = insertelement <2 x float> %5283, float %5282, i64 1, !dbg !36 + %5285 = fmul <2 x float> %5284, %5281, !dbg !36 + %5286 = fmul <2 x float> %5255, %5285, !dbg !37 + %5287 = fcmp oeq <2 x bfloat> %206, splat (bfloat 0xRFF80), !dbg !16 + %5288 = insertelement <2 x i32> poison, i32 %1707, i64 0, !dbg !21 + %5289 = insertelement <2 x i32> %5288, i32 %1718, i64 1, !dbg !21 + %5290 = icmp eq <2 x i32> %5289, zeroinitializer, !dbg !21 + %5291 = insertelement <2 x float> poison, float %1709, i64 0, !dbg !21 + %5292 = insertelement <2 x float> %5291, float %1720, i64 1, !dbg !21 + %5293 = insertelement <2 x float> poison, float %1708, i64 0, !dbg !21 + %5294 = insertelement <2 x float> %5293, float %1719, i64 1, !dbg !21 + %5295 = select <2 x i1> %5290, <2 x float> %5292, <2 x float> %5294, !dbg !21 + %5296 = extractelement <2 x float> %5295, i64 0, !dbg !21 + %5297 = fadd float %5296, 0xC168000FE0000000, !dbg !21 + %5298 = fneg float %5297, !dbg !21 + %5299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3FF7154760000000, float %5298) #6, !dbg !21 + %5300 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3FF7154760000000, float %5298) #6, !dbg !21 + %.0.i724 = select i1 %.not3.i723, float %5300, float %5299, !dbg !21 + %5301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3E54AE0C00000000, float %.0.i724) #6, !dbg !21 + %5302 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3E54AE0C00000000, float %.0.i724) #6, !dbg !21 + %.01.i726 = select i1 %.not4.i725, float %5302, float %5301, !dbg !21 + %5303 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i726) #6, !dbg !21 + %5304 = extractelement <2 x float> %5295, i64 1, !dbg !21 + %5305 = fadd float %5304, 0xC168000FE0000000, !dbg !21 + %5306 = fneg float %5305, !dbg !21 + %5307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3FF7154760000000, float %5306) #6, !dbg !21 + %5308 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3FF7154760000000, float %5306) #6, !dbg !21 + %.0.i734 = select i1 %.not3.i733, float %5308, float %5307, !dbg !21 + %5309 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3E54AE0C00000000, float %.0.i734) #6, !dbg !21 + %5310 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3E54AE0C00000000, float %.0.i734) #6, !dbg !21 + %.01.i736 = select i1 %.not4.i735, float %5310, float %5309, !dbg !21 + %5311 = bitcast <2 x float> %5295 to <2 x i32>, !dbg !21 + %5312 = shl <2 x i32> %5311, splat (i32 23), !dbg !21 + %5313 = bitcast <2 x i32> %5312 to <2 x float>, !dbg !21 + %5314 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i736) #6, !dbg !21 + %5315 = insertelement <2 x float> poison, float %5303, i64 0, !dbg !21 + %5316 = insertelement <2 x float> %5315, float %5314, i64 1, !dbg !21 + %5317 = fmul <2 x float> %5316, %5313, !dbg !21 + %5318 = insertelement <2 x i32> poison, i32 %3113, i64 0, !dbg !21 + %5319 = insertelement <2 x i32> %5318, i32 %3124, i64 1, !dbg !21 + %5320 = icmp eq <2 x i32> %5319, zeroinitializer, !dbg !21 + %5321 = insertelement <2 x float> poison, float %3115, i64 0, !dbg !21 + %5322 = insertelement <2 x float> %5321, float %3126, i64 1, !dbg !21 + %5323 = insertelement <2 x float> poison, float %3114, i64 0, !dbg !21 + %5324 = insertelement <2 x float> %5323, float %3125, i64 1, !dbg !21 + %5325 = select <2 x i1> %5320, <2 x float> %5322, <2 x float> %5324, !dbg !21 + %5326 = extractelement <2 x float> %5325, i64 0, !dbg !21 + %5327 = fadd float %5326, 0xC168000FE0000000, !dbg !21 + %5328 = fneg float %5327, !dbg !21 + %5329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3FF7154760000000, float %5328) #6, !dbg !21 + %5330 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3FF7154760000000, float %5328) #6, !dbg !21 + %.0.i1364 = select i1 %.not3.i1363, float %5330, float %5329, !dbg !21 + %5331 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3E54AE0C00000000, float %.0.i1364) #6, !dbg !21 + %5332 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3E54AE0C00000000, float %.0.i1364) #6, !dbg !21 + %.01.i1366 = select i1 %.not4.i1365, float %5332, float %5331, !dbg !21 + %5333 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1366) #6, !dbg !21 + %5334 = extractelement <2 x float> %5325, i64 1, !dbg !21 + %5335 = fadd float %5334, 0xC168000FE0000000, !dbg !21 + %5336 = fneg float %5335, !dbg !21 + %5337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3FF7154760000000, float %5336) #6, !dbg !21 + %5338 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3FF7154760000000, float %5336) #6, !dbg !21 + %.0.i1374 = select i1 %.not3.i1373, float %5338, float %5337, !dbg !21 + %5339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3E54AE0C00000000, float %.0.i1374) #6, !dbg !21 + %5340 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3E54AE0C00000000, float %.0.i1374) #6, !dbg !21 + %.01.i1376 = select i1 %.not4.i1375, float %5340, float %5339, !dbg !21 + %5341 = bitcast <2 x float> %5325 to <2 x i32>, !dbg !21 + %5342 = shl <2 x i32> %5341, splat (i32 23), !dbg !21 + %5343 = bitcast <2 x i32> %5342 to <2 x float>, !dbg !21 + %5344 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1376) #6, !dbg !21 + %5345 = insertelement <2 x float> poison, float %5333, i64 0, !dbg !21 + %5346 = insertelement <2 x float> %5345, float %5344, i64 1, !dbg !21 + %5347 = fmul <2 x float> %5346, %5343, !dbg !21 + %5348 = fmul <2 x float> %5317, zeroinitializer, !dbg !23 + %5349 = fadd <2 x float> %5348, %5347, !dbg !24 + %5350 = select <2 x i1> %5287, <2 x float> splat (float 1.000000e+00), <2 x float> %5349, !dbg !24 + %5351 = select <2 x i1> %5254, <2 x float> %5350, <2 x float> zeroinitializer, !dbg !26 + %5352 = insertelement <2 x i32> poison, i32 %5093, i64 0, !dbg !36 + %5353 = insertelement <2 x i32> %5352, i32 %5104, i64 1, !dbg !36 + %5354 = icmp eq <2 x i32> %5353, zeroinitializer, !dbg !36 + %5355 = insertelement <2 x float> poison, float %5095, i64 0, !dbg !36 + %5356 = insertelement <2 x float> %5355, float %5106, i64 1, !dbg !36 + %5357 = insertelement <2 x float> poison, float %5094, i64 0, !dbg !36 + %5358 = insertelement <2 x float> %5357, float %5105, i64 1, !dbg !36 + %5359 = select <2 x i1> %5354, <2 x float> %5356, <2 x float> %5358, !dbg !36 + %5360 = extractelement <2 x float> %5359, i64 0, !dbg !36 + %5361 = fadd float %5360, 0xC168000FE0000000, !dbg !36 + %5362 = fneg float %5361, !dbg !36 + %5363 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3FF7154760000000, float %5362) #6, !dbg !36 + %5364 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3FF7154760000000, float %5362) #6, !dbg !36 + %.0.i2004 = select i1 %.not3.i2003, float %5364, float %5363, !dbg !36 + %5365 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3E54AE0C00000000, float %.0.i2004) #6, !dbg !36 + %5366 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3E54AE0C00000000, float %.0.i2004) #6, !dbg !36 + %.01.i2006 = select i1 %.not4.i2005, float %5366, float %5365, !dbg !36 + %5367 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2006) #6, !dbg !36 + %5368 = extractelement <2 x float> %5359, i64 1, !dbg !36 + %5369 = fadd float %5368, 0xC168000FE0000000, !dbg !36 + %5370 = fneg float %5369, !dbg !36 + %5371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3FF7154760000000, float %5370) #6, !dbg !36 + %5372 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3FF7154760000000, float %5370) #6, !dbg !36 + %.0.i2014 = select i1 %.not3.i2013, float %5372, float %5371, !dbg !36 + %5373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3E54AE0C00000000, float %.0.i2014) #6, !dbg !36 + %5374 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3E54AE0C00000000, float %.0.i2014) #6, !dbg !36 + %.01.i2016 = select i1 %.not4.i2015, float %5374, float %5373, !dbg !36 + %5375 = bitcast <2 x float> %5359 to <2 x i32>, !dbg !36 + %5376 = shl <2 x i32> %5375, splat (i32 23), !dbg !36 + %5377 = bitcast <2 x i32> %5376 to <2 x float>, !dbg !36 + %5378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2016) #6, !dbg !36 + %5379 = insertelement <2 x float> poison, float %5367, i64 0, !dbg !36 + %5380 = insertelement <2 x float> %5379, float %5378, i64 1, !dbg !36 + %5381 = fmul <2 x float> %5380, %5377, !dbg !36 + %5382 = fmul <2 x float> %5351, %5381, !dbg !37 + %5383 = fcmp oeq <2 x bfloat> %208, splat (bfloat 0xRFF80), !dbg !16 + %5384 = insertelement <2 x i32> poison, i32 %1729, i64 0, !dbg !21 + %5385 = insertelement <2 x i32> %5384, i32 %1740, i64 1, !dbg !21 + %5386 = icmp eq <2 x i32> %5385, zeroinitializer, !dbg !21 + %5387 = insertelement <2 x float> poison, float %1731, i64 0, !dbg !21 + %5388 = insertelement <2 x float> %5387, float %1742, i64 1, !dbg !21 + %5389 = insertelement <2 x float> poison, float %1730, i64 0, !dbg !21 + %5390 = insertelement <2 x float> %5389, float %1741, i64 1, !dbg !21 + %5391 = select <2 x i1> %5386, <2 x float> %5388, <2 x float> %5390, !dbg !21 + %5392 = extractelement <2 x float> %5391, i64 0, !dbg !21 + %5393 = fadd float %5392, 0xC168000FE0000000, !dbg !21 + %5394 = fneg float %5393, !dbg !21 + %5395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3FF7154760000000, float %5394) #6, !dbg !21 + %5396 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3FF7154760000000, float %5394) #6, !dbg !21 + %.0.i744 = select i1 %.not3.i743, float %5396, float %5395, !dbg !21 + %5397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3E54AE0C00000000, float %.0.i744) #6, !dbg !21 + %5398 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3E54AE0C00000000, float %.0.i744) #6, !dbg !21 + %.01.i746 = select i1 %.not4.i745, float %5398, float %5397, !dbg !21 + %5399 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i746) #6, !dbg !21 + %5400 = extractelement <2 x float> %5391, i64 1, !dbg !21 + %5401 = fadd float %5400, 0xC168000FE0000000, !dbg !21 + %5402 = fneg float %5401, !dbg !21 + %5403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3FF7154760000000, float %5402) #6, !dbg !21 + %5404 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3FF7154760000000, float %5402) #6, !dbg !21 + %.0.i754 = select i1 %.not3.i753, float %5404, float %5403, !dbg !21 + %5405 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3E54AE0C00000000, float %.0.i754) #6, !dbg !21 + %5406 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3E54AE0C00000000, float %.0.i754) #6, !dbg !21 + %.01.i756 = select i1 %.not4.i755, float %5406, float %5405, !dbg !21 + %5407 = bitcast <2 x float> %5391 to <2 x i32>, !dbg !21 + %5408 = shl <2 x i32> %5407, splat (i32 23), !dbg !21 + %5409 = bitcast <2 x i32> %5408 to <2 x float>, !dbg !21 + %5410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i756) #6, !dbg !21 + %5411 = insertelement <2 x float> poison, float %5399, i64 0, !dbg !21 + %5412 = insertelement <2 x float> %5411, float %5410, i64 1, !dbg !21 + %5413 = fmul <2 x float> %5412, %5409, !dbg !21 + %5414 = insertelement <2 x i32> poison, i32 %3135, i64 0, !dbg !21 + %5415 = insertelement <2 x i32> %5414, i32 %3146, i64 1, !dbg !21 + %5416 = icmp eq <2 x i32> %5415, zeroinitializer, !dbg !21 + %5417 = insertelement <2 x float> poison, float %3137, i64 0, !dbg !21 + %5418 = insertelement <2 x float> %5417, float %3148, i64 1, !dbg !21 + %5419 = insertelement <2 x float> poison, float %3136, i64 0, !dbg !21 + %5420 = insertelement <2 x float> %5419, float %3147, i64 1, !dbg !21 + %5421 = select <2 x i1> %5416, <2 x float> %5418, <2 x float> %5420, !dbg !21 + %5422 = extractelement <2 x float> %5421, i64 0, !dbg !21 + %5423 = fadd float %5422, 0xC168000FE0000000, !dbg !21 + %5424 = fneg float %5423, !dbg !21 + %5425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3FF7154760000000, float %5424) #6, !dbg !21 + %5426 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3FF7154760000000, float %5424) #6, !dbg !21 + %.0.i1384 = select i1 %.not3.i1383, float %5426, float %5425, !dbg !21 + %5427 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3E54AE0C00000000, float %.0.i1384) #6, !dbg !21 + %5428 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3E54AE0C00000000, float %.0.i1384) #6, !dbg !21 + %.01.i1386 = select i1 %.not4.i1385, float %5428, float %5427, !dbg !21 + %5429 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1386) #6, !dbg !21 + %5430 = extractelement <2 x float> %5421, i64 1, !dbg !21 + %5431 = fadd float %5430, 0xC168000FE0000000, !dbg !21 + %5432 = fneg float %5431, !dbg !21 + %5433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3FF7154760000000, float %5432) #6, !dbg !21 + %5434 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3FF7154760000000, float %5432) #6, !dbg !21 + %.0.i1394 = select i1 %.not3.i1393, float %5434, float %5433, !dbg !21 + %5435 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3E54AE0C00000000, float %.0.i1394) #6, !dbg !21 + %5436 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3E54AE0C00000000, float %.0.i1394) #6, !dbg !21 + %.01.i1396 = select i1 %.not4.i1395, float %5436, float %5435, !dbg !21 + %5437 = bitcast <2 x float> %5421 to <2 x i32>, !dbg !21 + %5438 = shl <2 x i32> %5437, splat (i32 23), !dbg !21 + %5439 = bitcast <2 x i32> %5438 to <2 x float>, !dbg !21 + %5440 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1396) #6, !dbg !21 + %5441 = insertelement <2 x float> poison, float %5429, i64 0, !dbg !21 + %5442 = insertelement <2 x float> %5441, float %5440, i64 1, !dbg !21 + %5443 = fmul <2 x float> %5442, %5439, !dbg !21 + %5444 = fmul <2 x float> %5413, zeroinitializer, !dbg !23 + %5445 = fadd <2 x float> %5444, %5443, !dbg !24 + %5446 = select <2 x i1> %5383, <2 x float> splat (float 1.000000e+00), <2 x float> %5445, !dbg !24 + %5447 = select <2 x i1> %5254, <2 x float> %5446, <2 x float> zeroinitializer, !dbg !26 + %5448 = insertelement <2 x i32> poison, i32 %5115, i64 0, !dbg !36 + %5449 = insertelement <2 x i32> %5448, i32 %5126, i64 1, !dbg !36 + %5450 = icmp eq <2 x i32> %5449, zeroinitializer, !dbg !36 + %5451 = insertelement <2 x float> poison, float %5117, i64 0, !dbg !36 + %5452 = insertelement <2 x float> %5451, float %5128, i64 1, !dbg !36 + %5453 = insertelement <2 x float> poison, float %5116, i64 0, !dbg !36 + %5454 = insertelement <2 x float> %5453, float %5127, i64 1, !dbg !36 + %5455 = select <2 x i1> %5450, <2 x float> %5452, <2 x float> %5454, !dbg !36 + %5456 = extractelement <2 x float> %5455, i64 0, !dbg !36 + %5457 = fadd float %5456, 0xC168000FE0000000, !dbg !36 + %5458 = fneg float %5457, !dbg !36 + %5459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3FF7154760000000, float %5458) #6, !dbg !36 + %5460 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3FF7154760000000, float %5458) #6, !dbg !36 + %.0.i2024 = select i1 %.not3.i2023, float %5460, float %5459, !dbg !36 + %5461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3E54AE0C00000000, float %.0.i2024) #6, !dbg !36 + %5462 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3E54AE0C00000000, float %.0.i2024) #6, !dbg !36 + %.01.i2026 = select i1 %.not4.i2025, float %5462, float %5461, !dbg !36 + %5463 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2026) #6, !dbg !36 + %5464 = extractelement <2 x float> %5455, i64 1, !dbg !36 + %5465 = fadd float %5464, 0xC168000FE0000000, !dbg !36 + %5466 = fneg float %5465, !dbg !36 + %5467 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3FF7154760000000, float %5466) #6, !dbg !36 + %5468 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3FF7154760000000, float %5466) #6, !dbg !36 + %.0.i2034 = select i1 %.not3.i2033, float %5468, float %5467, !dbg !36 + %5469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3E54AE0C00000000, float %.0.i2034) #6, !dbg !36 + %5470 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3E54AE0C00000000, float %.0.i2034) #6, !dbg !36 + %.01.i2036 = select i1 %.not4.i2035, float %5470, float %5469, !dbg !36 + %5471 = bitcast <2 x float> %5455 to <2 x i32>, !dbg !36 + %5472 = shl <2 x i32> %5471, splat (i32 23), !dbg !36 + %5473 = bitcast <2 x i32> %5472 to <2 x float>, !dbg !36 + %5474 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2036) #6, !dbg !36 + %5475 = insertelement <2 x float> poison, float %5463, i64 0, !dbg !36 + %5476 = insertelement <2 x float> %5475, float %5474, i64 1, !dbg !36 + %5477 = fmul <2 x float> %5476, %5473, !dbg !36 + %5478 = fmul <2 x float> %5447, %5477, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5479 = fadd float %5131, %5132, !dbg !41 + %5480 = fadd float %5479, %5133, !dbg !41 + %5481 = fadd float %5480, %5134, !dbg !41 + %5482 = fadd float %5481, %5135, !dbg !41 + %5483 = fadd float %5482, %5136, !dbg !41 + %5484 = fadd float %5483, %5137, !dbg !41 + %5485 = fadd float %5484, %5138, !dbg !41 + %5486 = fadd float %5485, %5139, !dbg !41 + %5487 = fadd float %5486, %5140, !dbg !41 + %5488 = fadd float %5487, %5141, !dbg !41 + %5489 = fadd float %5488, %5142, !dbg !41 + %5490 = fadd float %5489, %5143, !dbg !41 + %5491 = fadd float %5490, %5144, !dbg !41 + %5492 = fadd float %5491, %5145, !dbg !41 + %5493 = fadd float %5492, %5146, !dbg !41 + %5494 = fadd float %5493, %5147, !dbg !41 + %5495 = fadd float %5494, %5148, !dbg !41 + %5496 = fadd float %5495, %5149, !dbg !41 + %5497 = fadd float %5496, %5150, !dbg !41 + %5498 = fadd float %5497, %5151, !dbg !41 + %5499 = fadd float %5498, %5152, !dbg !41 + %5500 = fadd float %5499, %5153, !dbg !41 + %5501 = fadd float %5500, %5154, !dbg !41 + %5502 = fadd float %5501, %5155, !dbg !41 + %5503 = fadd float %5502, %5156, !dbg !41 + %5504 = fadd float %5503, %5157, !dbg !41 + %5505 = fadd float %5504, %5158, !dbg !41 + %5506 = fadd float %5505, %5159, !dbg !41 + %5507 = fadd float %5506, %5160, !dbg !41 + %5508 = fadd float %5507, %5161, !dbg !41 + %5509 = fadd float %5508, %5162, !dbg !41 + %5510 = fadd float %5509, %5163, !dbg !41 + %5511 = fadd float %5510, %5164, !dbg !41 + %5512 = fadd float %5511, %5165, !dbg !41 + %5513 = fadd float %5512, %5166, !dbg !41 + %5514 = fadd float %5513, %5167, !dbg !41 + %5515 = fadd float %5514, %5168, !dbg !41 + %5516 = fadd float %5515, %5169, !dbg !41 + %5517 = fadd float %5516, %5170, !dbg !41 + %5518 = fadd float %5517, %5171, !dbg !41 + %5519 = fadd float %5518, %5172, !dbg !41 + %5520 = fadd float %5519, %5173, !dbg !41 + %5521 = fadd float %5520, %5174, !dbg !41 + %5522 = fadd float %5521, %5175, !dbg !41 + %5523 = fadd float %5522, %5176, !dbg !41 + %5524 = fadd float %5523, %5177, !dbg !41 + %5525 = fadd float %5524, %5178, !dbg !41 + %5526 = fadd float %5525, %5179, !dbg !41 + %5527 = fadd float %5526, %5180, !dbg !41 + %5528 = fadd float %5527, %5181, !dbg !41 + %5529 = fadd float %5528, %5182, !dbg !41 + %5530 = fadd float %5529, %5183, !dbg !41 + %5531 = fadd float %5530, %5184, !dbg !41 + %5532 = fadd float %5531, %5185, !dbg !41 + %5533 = fadd float %5532, %5186, !dbg !41 + %5534 = fadd float %5533, %5187, !dbg !41 + %5535 = fadd float %5534, %5188, !dbg !41 + %5536 = extractelement <2 x float> %5286, i64 0, !dbg !41 + %5537 = fadd float %5535, %5536, !dbg !41 + %5538 = extractelement <2 x float> %5286, i64 1, !dbg !41 + %5539 = fadd float %5537, %5538, !dbg !41 + %5540 = extractelement <2 x float> %5382, i64 0, !dbg !41 + %5541 = fadd float %5539, %5540, !dbg !41 + %5542 = extractelement <2 x float> %5382, i64 1, !dbg !41 + %5543 = fadd float %5541, %5542, !dbg !41 + %5544 = extractelement <2 x float> %5478, i64 0, !dbg !41 + %5545 = fadd float %5543, %5544, !dbg !41 + %5546 = extractelement <2 x float> %5478, i64 1, !dbg !41 + %5547 = fadd float %5545, %5546, !dbg !41 + %5548 = bitcast float %5547 to i32, !dbg !38 + %5549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5548, i32 16, i32 31), !dbg !38 + %5550 = bitcast i32 %5549 to float, !dbg !38 + %5551 = fadd float %5547, %5550, !dbg !41 + %5552 = bitcast float %5551 to i32, !dbg !38 + %5553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5552, i32 8, i32 31), !dbg !38 + %5554 = bitcast i32 %5553 to float, !dbg !38 + %5555 = fadd float %5551, %5554, !dbg !41 + %5556 = bitcast float %5555 to i32, !dbg !38 + %5557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5556, i32 4, i32 31), !dbg !38 + %5558 = bitcast i32 %5557 to float, !dbg !38 + %5559 = fadd float %5555, %5558, !dbg !41 + %5560 = bitcast float %5559 to i32, !dbg !38 + %5561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5560, i32 2, i32 31), !dbg !38 + %5562 = bitcast i32 %5561 to float, !dbg !38 + %5563 = fadd float %5559, %5562, !dbg !41 + %5564 = bitcast float %5563 to i32, !dbg !38 + %5565 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5564, i32 1, i32 31), !dbg !38 + %5566 = bitcast i32 %5565 to float, !dbg !38 + %5567 = fadd float %5563, %5566, !dbg !41 + %5568 = bitcast float %5567 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3623, <1 x i32> %5568, i1 %3622) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5569 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %3627, i1 %3626) #6, !dbg !38 + %5570 = bitcast i32 %5569 to float, !dbg !38 + %5571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5569, i32 8, i32 31), !dbg !38 + %5572 = bitcast i32 %5571 to float, !dbg !38 + %5573 = fadd float %5570, %5572, !dbg !41 + %5574 = bitcast float %5573 to i32, !dbg !38 + %5575 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5574, i32 4, i32 31), !dbg !38 + %5576 = bitcast i32 %5575 to float, !dbg !38 + %5577 = fadd float %5573, %5576, !dbg !41 + %5578 = bitcast float %5577 to i32, !dbg !38 + %5579 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5578, i32 2, i32 31), !dbg !38 + %5580 = bitcast i32 %5579 to float, !dbg !38 + %5581 = fadd float %5577, %5580, !dbg !41 + %5582 = bitcast float %5581 to i32, !dbg !38 + %5583 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5582, i32 1, i32 31), !dbg !38 + %5584 = bitcast i32 %5583 to float, !dbg !38 + %5585 = fadd float %5581, %5584, !dbg !41 + %5586 = bitcast float %5585 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3627, <1 x i32> %5586, i1 %3656) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5587 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !38 + %5588 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5589 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %58, i64 %5588, i1 true) #6, !dbg !42 + %5590 = extractvalue { i32, i32, i32, i32 } %5589, 0, !dbg !42 + %5591 = bitcast i32 %5590 to <2 x bfloat>, !dbg !42 + %5592 = extractvalue { i32, i32, i32, i32 } %5589, 1, !dbg !42 + %5593 = bitcast i32 %5592 to <2 x bfloat>, !dbg !42 + %5594 = extractvalue { i32, i32, i32, i32 } %5589, 2, !dbg !42 + %5595 = bitcast i32 %5594 to <2 x bfloat>, !dbg !42 + %5596 = extractvalue { i32, i32, i32, i32 } %5589, 3, !dbg !42 + %5597 = bitcast i32 %5596 to <2 x bfloat>, !dbg !42 + %5598 = extractelement <2 x bfloat> %5591, i64 0, !dbg !42 + %5599 = extractelement <2 x bfloat> %5591, i64 1, !dbg !42 + %5600 = extractelement <2 x bfloat> %5593, i64 0, !dbg !42 + %5601 = extractelement <2 x bfloat> %5593, i64 1, !dbg !42 + %5602 = extractelement <2 x bfloat> %5595, i64 0, !dbg !42 + %5603 = extractelement <2 x bfloat> %5595, i64 1, !dbg !42 + %5604 = extractelement <2 x bfloat> %5597, i64 0, !dbg !42 + %5605 = extractelement <2 x bfloat> %5597, i64 1, !dbg !42 + %5606 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5607 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %60, i64 %5606, i1 true) #6, !dbg !42 + %5608 = extractvalue { i32, i32, i32, i32 } %5607, 0, !dbg !42 + %5609 = bitcast i32 %5608 to <2 x bfloat>, !dbg !42 + %5610 = extractvalue { i32, i32, i32, i32 } %5607, 1, !dbg !42 + %5611 = bitcast i32 %5610 to <2 x bfloat>, !dbg !42 + %5612 = extractvalue { i32, i32, i32, i32 } %5607, 2, !dbg !42 + %5613 = bitcast i32 %5612 to <2 x bfloat>, !dbg !42 + %5614 = extractvalue { i32, i32, i32, i32 } %5607, 3, !dbg !42 + %5615 = bitcast i32 %5614 to <2 x bfloat>, !dbg !42 + %5616 = extractelement <2 x bfloat> %5609, i64 0, !dbg !42 + %5617 = extractelement <2 x bfloat> %5609, i64 1, !dbg !42 + %5618 = extractelement <2 x bfloat> %5611, i64 0, !dbg !42 + %5619 = extractelement <2 x bfloat> %5611, i64 1, !dbg !42 + %5620 = extractelement <2 x bfloat> %5613, i64 0, !dbg !42 + %5621 = extractelement <2 x bfloat> %5613, i64 1, !dbg !42 + %5622 = extractelement <2 x bfloat> %5615, i64 0, !dbg !42 + %5623 = extractelement <2 x bfloat> %5615, i64 1, !dbg !42 + %5624 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5625 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %62, i64 %5624, i1 true) #6, !dbg !42 + %5626 = extractvalue { i32, i32, i32, i32 } %5625, 0, !dbg !42 + %5627 = bitcast i32 %5626 to <2 x bfloat>, !dbg !42 + %5628 = extractvalue { i32, i32, i32, i32 } %5625, 1, !dbg !42 + %5629 = bitcast i32 %5628 to <2 x bfloat>, !dbg !42 + %5630 = extractvalue { i32, i32, i32, i32 } %5625, 2, !dbg !42 + %5631 = bitcast i32 %5630 to <2 x bfloat>, !dbg !42 + %5632 = extractvalue { i32, i32, i32, i32 } %5625, 3, !dbg !42 + %5633 = bitcast i32 %5632 to <2 x bfloat>, !dbg !42 + %5634 = extractelement <2 x bfloat> %5627, i64 0, !dbg !42 + %5635 = extractelement <2 x bfloat> %5627, i64 1, !dbg !42 + %5636 = extractelement <2 x bfloat> %5629, i64 0, !dbg !42 + %5637 = extractelement <2 x bfloat> %5629, i64 1, !dbg !42 + %5638 = extractelement <2 x bfloat> %5631, i64 0, !dbg !42 + %5639 = extractelement <2 x bfloat> %5631, i64 1, !dbg !42 + %5640 = extractelement <2 x bfloat> %5633, i64 0, !dbg !42 + %5641 = extractelement <2 x bfloat> %5633, i64 1, !dbg !42 + %5642 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5643 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %64, i64 %5642, i1 true) #6, !dbg !42 + %5644 = extractvalue { i32, i32, i32, i32 } %5643, 0, !dbg !42 + %5645 = bitcast i32 %5644 to <2 x bfloat>, !dbg !42 + %5646 = extractvalue { i32, i32, i32, i32 } %5643, 1, !dbg !42 + %5647 = bitcast i32 %5646 to <2 x bfloat>, !dbg !42 + %5648 = extractvalue { i32, i32, i32, i32 } %5643, 2, !dbg !42 + %5649 = bitcast i32 %5648 to <2 x bfloat>, !dbg !42 + %5650 = extractvalue { i32, i32, i32, i32 } %5643, 3, !dbg !42 + %5651 = bitcast i32 %5650 to <2 x bfloat>, !dbg !42 + %5652 = extractelement <2 x bfloat> %5645, i64 0, !dbg !42 + %5653 = extractelement <2 x bfloat> %5645, i64 1, !dbg !42 + %5654 = extractelement <2 x bfloat> %5647, i64 0, !dbg !42 + %5655 = extractelement <2 x bfloat> %5647, i64 1, !dbg !42 + %5656 = extractelement <2 x bfloat> %5649, i64 0, !dbg !42 + %5657 = extractelement <2 x bfloat> %5649, i64 1, !dbg !42 + %5658 = extractelement <2 x bfloat> %5651, i64 0, !dbg !42 + %5659 = extractelement <2 x bfloat> %5651, i64 1, !dbg !42 + %5660 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5661 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %66, i64 %5660, i1 true) #6, !dbg !42 + %5662 = extractvalue { i32, i32, i32, i32 } %5661, 0, !dbg !42 + %5663 = bitcast i32 %5662 to <2 x bfloat>, !dbg !42 + %5664 = extractvalue { i32, i32, i32, i32 } %5661, 1, !dbg !42 + %5665 = bitcast i32 %5664 to <2 x bfloat>, !dbg !42 + %5666 = extractvalue { i32, i32, i32, i32 } %5661, 2, !dbg !42 + %5667 = bitcast i32 %5666 to <2 x bfloat>, !dbg !42 + %5668 = extractvalue { i32, i32, i32, i32 } %5661, 3, !dbg !42 + %5669 = bitcast i32 %5668 to <2 x bfloat>, !dbg !42 + %5670 = extractelement <2 x bfloat> %5663, i64 0, !dbg !42 + %5671 = extractelement <2 x bfloat> %5663, i64 1, !dbg !42 + %5672 = extractelement <2 x bfloat> %5665, i64 0, !dbg !42 + %5673 = extractelement <2 x bfloat> %5665, i64 1, !dbg !42 + %5674 = extractelement <2 x bfloat> %5667, i64 0, !dbg !42 + %5675 = extractelement <2 x bfloat> %5667, i64 1, !dbg !42 + %5676 = extractelement <2 x bfloat> %5669, i64 0, !dbg !42 + %5677 = extractelement <2 x bfloat> %5669, i64 1, !dbg !42 + %5678 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5679 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %68, i64 %5678, i1 true) #6, !dbg !42 + %5680 = extractvalue { i32, i32, i32, i32 } %5679, 0, !dbg !42 + %5681 = bitcast i32 %5680 to <2 x bfloat>, !dbg !42 + %5682 = extractvalue { i32, i32, i32, i32 } %5679, 1, !dbg !42 + %5683 = bitcast i32 %5682 to <2 x bfloat>, !dbg !42 + %5684 = extractvalue { i32, i32, i32, i32 } %5679, 2, !dbg !42 + %5685 = bitcast i32 %5684 to <2 x bfloat>, !dbg !42 + %5686 = extractvalue { i32, i32, i32, i32 } %5679, 3, !dbg !42 + %5687 = bitcast i32 %5686 to <2 x bfloat>, !dbg !42 + %5688 = extractelement <2 x bfloat> %5681, i64 0, !dbg !42 + %5689 = extractelement <2 x bfloat> %5681, i64 1, !dbg !42 + %5690 = extractelement <2 x bfloat> %5683, i64 0, !dbg !42 + %5691 = extractelement <2 x bfloat> %5683, i64 1, !dbg !42 + %5692 = extractelement <2 x bfloat> %5685, i64 0, !dbg !42 + %5693 = extractelement <2 x bfloat> %5685, i64 1, !dbg !42 + %5694 = extractelement <2 x bfloat> %5687, i64 0, !dbg !42 + %5695 = extractelement <2 x bfloat> %5687, i64 1, !dbg !42 + %5696 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5697 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %70, i64 %5696, i1 true) #6, !dbg !42 + %5698 = extractvalue { i32, i32, i32, i32 } %5697, 0, !dbg !42 + %5699 = bitcast i32 %5698 to <2 x bfloat>, !dbg !42 + %5700 = extractvalue { i32, i32, i32, i32 } %5697, 1, !dbg !42 + %5701 = bitcast i32 %5700 to <2 x bfloat>, !dbg !42 + %5702 = extractvalue { i32, i32, i32, i32 } %5697, 2, !dbg !42 + %5703 = bitcast i32 %5702 to <2 x bfloat>, !dbg !42 + %5704 = extractvalue { i32, i32, i32, i32 } %5697, 3, !dbg !42 + %5705 = bitcast i32 %5704 to <2 x bfloat>, !dbg !42 + %5706 = extractelement <2 x bfloat> %5699, i64 0, !dbg !42 + %5707 = extractelement <2 x bfloat> %5699, i64 1, !dbg !42 + %5708 = extractelement <2 x bfloat> %5701, i64 0, !dbg !42 + %5709 = extractelement <2 x bfloat> %5701, i64 1, !dbg !42 + %5710 = extractelement <2 x bfloat> %5703, i64 0, !dbg !42 + %5711 = extractelement <2 x bfloat> %5703, i64 1, !dbg !42 + %5712 = extractelement <2 x bfloat> %5705, i64 0, !dbg !42 + %5713 = extractelement <2 x bfloat> %5705, i64 1, !dbg !42 + %5714 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5715 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %72, i64 %5714, i1 %16) #6, !dbg !42 + %5716 = extractvalue { i32, i32, i32, i32 } %5715, 0, !dbg !42 + %5717 = bitcast i32 %5716 to <2 x bfloat>, !dbg !42 + %5718 = extractvalue { i32, i32, i32, i32 } %5715, 1, !dbg !42 + %5719 = bitcast i32 %5718 to <2 x bfloat>, !dbg !42 + %5720 = extractvalue { i32, i32, i32, i32 } %5715, 2, !dbg !42 + %5721 = bitcast i32 %5720 to <2 x bfloat>, !dbg !42 + %5722 = extractvalue { i32, i32, i32, i32 } %5715, 3, !dbg !42 + %5723 = bitcast i32 %5722 to <2 x bfloat>, !dbg !42 + %5724 = extractelement <2 x bfloat> %5717, i64 0, !dbg !42 + %5725 = extractelement <2 x bfloat> %5717, i64 1, !dbg !42 + %5726 = extractelement <2 x bfloat> %5719, i64 0, !dbg !42 + %5727 = extractelement <2 x bfloat> %5719, i64 1, !dbg !42 + %5728 = extractelement <2 x bfloat> %5721, i64 0, !dbg !42 + %5729 = extractelement <2 x bfloat> %5721, i64 1, !dbg !42 + %5730 = extractelement <2 x bfloat> %5723, i64 0, !dbg !42 + %5731 = extractelement <2 x bfloat> %5723, i64 1, !dbg !42 + %5732 = fpext bfloat %5598 to float, !dbg !43 + %5733 = fpext bfloat %5599 to float, !dbg !43 + %5734 = fpext bfloat %5600 to float, !dbg !43 + %5735 = fpext bfloat %5601 to float, !dbg !43 + %5736 = fpext bfloat %5602 to float, !dbg !43 + %5737 = fpext bfloat %5603 to float, !dbg !43 + %5738 = fpext bfloat %5604 to float, !dbg !43 + %5739 = fpext bfloat %5605 to float, !dbg !43 + %5740 = fpext bfloat %5616 to float, !dbg !43 + %5741 = fpext bfloat %5617 to float, !dbg !43 + %5742 = fpext bfloat %5618 to float, !dbg !43 + %5743 = fpext bfloat %5619 to float, !dbg !43 + %5744 = fpext bfloat %5620 to float, !dbg !43 + %5745 = fpext bfloat %5621 to float, !dbg !43 + %5746 = fpext bfloat %5622 to float, !dbg !43 + %5747 = fpext bfloat %5623 to float, !dbg !43 + %5748 = fpext bfloat %5634 to float, !dbg !43 + %5749 = fpext bfloat %5635 to float, !dbg !43 + %5750 = fpext bfloat %5636 to float, !dbg !43 + %5751 = fpext bfloat %5637 to float, !dbg !43 + %5752 = fpext bfloat %5638 to float, !dbg !43 + %5753 = fpext bfloat %5639 to float, !dbg !43 + %5754 = fpext bfloat %5640 to float, !dbg !43 + %5755 = fpext bfloat %5641 to float, !dbg !43 + %5756 = fpext bfloat %5652 to float, !dbg !43 + %5757 = fpext bfloat %5653 to float, !dbg !43 + %5758 = fpext bfloat %5654 to float, !dbg !43 + %5759 = fpext bfloat %5655 to float, !dbg !43 + %5760 = fpext bfloat %5656 to float, !dbg !43 + %5761 = fpext bfloat %5657 to float, !dbg !43 + %5762 = fpext bfloat %5658 to float, !dbg !43 + %5763 = fpext bfloat %5659 to float, !dbg !43 + %5764 = fpext bfloat %5670 to float, !dbg !43 + %5765 = fpext bfloat %5671 to float, !dbg !43 + %5766 = fpext bfloat %5672 to float, !dbg !43 + %5767 = fpext bfloat %5673 to float, !dbg !43 + %5768 = fpext bfloat %5674 to float, !dbg !43 + %5769 = fpext bfloat %5675 to float, !dbg !43 + %5770 = fpext bfloat %5676 to float, !dbg !43 + %5771 = fpext bfloat %5677 to float, !dbg !43 + %5772 = fpext bfloat %5688 to float, !dbg !43 + %5773 = fpext bfloat %5689 to float, !dbg !43 + %5774 = fpext bfloat %5690 to float, !dbg !43 + %5775 = fpext bfloat %5691 to float, !dbg !43 + %5776 = fpext bfloat %5692 to float, !dbg !43 + %5777 = fpext bfloat %5693 to float, !dbg !43 + %5778 = fpext bfloat %5694 to float, !dbg !43 + %5779 = fpext bfloat %5695 to float, !dbg !43 + %5780 = fpext bfloat %5706 to float, !dbg !43 + %5781 = fpext bfloat %5707 to float, !dbg !43 + %5782 = fpext bfloat %5708 to float, !dbg !43 + %5783 = fpext bfloat %5709 to float, !dbg !43 + %5784 = fpext bfloat %5710 to float, !dbg !43 + %5785 = fpext bfloat %5711 to float, !dbg !43 + %5786 = fpext bfloat %5712 to float, !dbg !43 + %5787 = fpext bfloat %5713 to float, !dbg !43 + %5788 = fpext bfloat %5724 to float, !dbg !43 + %5789 = fpext bfloat %5725 to float, !dbg !43 + %5790 = fpext bfloat %5726 to float, !dbg !43 + %5791 = fpext bfloat %5727 to float, !dbg !43 + %5792 = fpext bfloat %5728 to float, !dbg !43 + %5793 = fpext bfloat %5729 to float, !dbg !43 + %5794 = fpext bfloat %5730 to float, !dbg !43 + %5795 = fpext bfloat %5731 to float, !dbg !43 + %5796 = fsub float %5732, %3659, !dbg !44 + %5797 = fsub float %5733, %3659, !dbg !44 + %5798 = fsub float %5734, %3659, !dbg !44 + %5799 = fsub float %5735, %3659, !dbg !44 + %5800 = fsub float %5736, %3659, !dbg !44 + %5801 = fsub float %5737, %3659, !dbg !44 + %5802 = fsub float %5738, %3659, !dbg !44 + %5803 = fsub float %5739, %3659, !dbg !44 + %5804 = fsub float %5740, %3659, !dbg !44 + %5805 = fsub float %5741, %3659, !dbg !44 + %5806 = fsub float %5742, %3659, !dbg !44 + %5807 = fsub float %5743, %3659, !dbg !44 + %5808 = fsub float %5744, %3659, !dbg !44 + %5809 = fsub float %5745, %3659, !dbg !44 + %5810 = fsub float %5746, %3659, !dbg !44 + %5811 = fsub float %5747, %3659, !dbg !44 + %5812 = fsub float %5748, %3659, !dbg !44 + %5813 = fsub float %5749, %3659, !dbg !44 + %5814 = fsub float %5750, %3659, !dbg !44 + %5815 = fsub float %5751, %3659, !dbg !44 + %5816 = fsub float %5752, %3659, !dbg !44 + %5817 = fsub float %5753, %3659, !dbg !44 + %5818 = fsub float %5754, %3659, !dbg !44 + %5819 = fsub float %5755, %3659, !dbg !44 + %5820 = fsub float %5756, %3659, !dbg !44 + %5821 = fsub float %5757, %3659, !dbg !44 + %5822 = fsub float %5758, %3659, !dbg !44 + %5823 = fsub float %5759, %3659, !dbg !44 + %5824 = fsub float %5760, %3659, !dbg !44 + %5825 = fsub float %5761, %3659, !dbg !44 + %5826 = fsub float %5762, %3659, !dbg !44 + %5827 = fsub float %5763, %3659, !dbg !44 + %5828 = fsub float %5764, %3659, !dbg !44 + %5829 = fsub float %5765, %3659, !dbg !44 + %5830 = fsub float %5766, %3659, !dbg !44 + %5831 = fsub float %5767, %3659, !dbg !44 + %5832 = fsub float %5768, %3659, !dbg !44 + %5833 = fsub float %5769, %3659, !dbg !44 + %5834 = fsub float %5770, %3659, !dbg !44 + %5835 = fsub float %5771, %3659, !dbg !44 + %5836 = fsub float %5772, %3659, !dbg !44 + %5837 = fsub float %5773, %3659, !dbg !44 + %5838 = fsub float %5774, %3659, !dbg !44 + %5839 = fsub float %5775, %3659, !dbg !44 + %5840 = fsub float %5776, %3659, !dbg !44 + %5841 = fsub float %5777, %3659, !dbg !44 + %5842 = fsub float %5778, %3659, !dbg !44 + %5843 = fsub float %5779, %3659, !dbg !44 + %5844 = fsub float %5780, %3659, !dbg !44 + %5845 = fsub float %5781, %3659, !dbg !44 + %5846 = fsub float %5782, %3659, !dbg !44 + %5847 = fsub float %5783, %3659, !dbg !44 + %5848 = fsub float %5784, %3659, !dbg !44 + %5849 = fsub float %5785, %3659, !dbg !44 + %5850 = fsub float %5786, %3659, !dbg !44 + %5851 = fsub float %5787, %3659, !dbg !44 + %5852 = fsub float %5788, %3659, !dbg !44 + %5853 = fsub float %5789, %3659, !dbg !44 + %5854 = fsub float %5790, %3659, !dbg !44 + %5855 = fsub float %5791, %3659, !dbg !44 + %5856 = fsub float %5792, %3659, !dbg !44 + %5857 = fsub float %5793, %3659, !dbg !44 + %5858 = fsub float %5794, %3659, !dbg !44 + %5859 = fsub float %5795, %3659, !dbg !44 + %5860 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2037 = icmp eq i32 %5860, 0, !dbg !45 + %5861 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5862 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2038 = select i1 %.not.i2037, float %5862, float %5861, !dbg !45 + %5863 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2039 = icmp eq i32 %5863, 0, !dbg !45 + %5864 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2038) #6, !dbg !45 + %5865 = tail call float @llvm.nvvm.saturate.f(float %.02.i2038) #6, !dbg !45 + %.03.i2040 = select i1 %.not1.i2039, float %5865, float %5864, !dbg !45 + %5866 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2041 = icmp eq i32 %5866, 0, !dbg !45 + %5867 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5868 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2042 = select i1 %.not2.i2041, float %5868, float %5867, !dbg !45 + %5869 = fadd float %.04.i2042, 0xC168000FE0000000, !dbg !45 + %5870 = fneg float %5869, !dbg !45 + %5871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2043 = icmp eq i32 %5871, 0, !dbg !45 + %5872 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3FF7154760000000, float %5870) #6, !dbg !45 + %5873 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3FF7154760000000, float %5870) #6, !dbg !45 + %.0.i2044 = select i1 %.not3.i2043, float %5873, float %5872, !dbg !45 + %5874 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2045 = icmp eq i32 %5874, 0, !dbg !45 + %5875 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3E54AE0C00000000, float %.0.i2044) #6, !dbg !45 + %5876 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3E54AE0C00000000, float %.0.i2044) #6, !dbg !45 + %.01.i2046 = select i1 %.not4.i2045, float %5876, float %5875, !dbg !45 + %5877 = bitcast float %.04.i2042 to i32, !dbg !45 + %5878 = shl i32 %5877, 23, !dbg !45 + %5879 = bitcast i32 %5878 to float, !dbg !45 + %5880 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2046) #6, !dbg !45 + %5881 = fmul float %5880, %5879, !dbg !45 + %5882 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2047 = icmp eq i32 %5882, 0, !dbg !45 + %5883 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5884 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2048 = select i1 %.not.i2047, float %5884, float %5883, !dbg !45 + %5885 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2049 = icmp eq i32 %5885, 0, !dbg !45 + %5886 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2048) #6, !dbg !45 + %5887 = tail call float @llvm.nvvm.saturate.f(float %.02.i2048) #6, !dbg !45 + %.03.i2050 = select i1 %.not1.i2049, float %5887, float %5886, !dbg !45 + %5888 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2051 = icmp eq i32 %5888, 0, !dbg !45 + %5889 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5890 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2052 = select i1 %.not2.i2051, float %5890, float %5889, !dbg !45 + %5891 = fadd float %.04.i2052, 0xC168000FE0000000, !dbg !45 + %5892 = fneg float %5891, !dbg !45 + %5893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2053 = icmp eq i32 %5893, 0, !dbg !45 + %5894 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3FF7154760000000, float %5892) #6, !dbg !45 + %5895 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3FF7154760000000, float %5892) #6, !dbg !45 + %.0.i2054 = select i1 %.not3.i2053, float %5895, float %5894, !dbg !45 + %5896 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2055 = icmp eq i32 %5896, 0, !dbg !45 + %5897 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3E54AE0C00000000, float %.0.i2054) #6, !dbg !45 + %5898 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3E54AE0C00000000, float %.0.i2054) #6, !dbg !45 + %.01.i2056 = select i1 %.not4.i2055, float %5898, float %5897, !dbg !45 + %5899 = bitcast float %.04.i2052 to i32, !dbg !45 + %5900 = shl i32 %5899, 23, !dbg !45 + %5901 = bitcast i32 %5900 to float, !dbg !45 + %5902 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2056) #6, !dbg !45 + %5903 = fmul float %5902, %5901, !dbg !45 + %5904 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2057 = icmp eq i32 %5904, 0, !dbg !45 + %5905 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5906 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2058 = select i1 %.not.i2057, float %5906, float %5905, !dbg !45 + %5907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2059 = icmp eq i32 %5907, 0, !dbg !45 + %5908 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2058) #6, !dbg !45 + %5909 = tail call float @llvm.nvvm.saturate.f(float %.02.i2058) #6, !dbg !45 + %.03.i2060 = select i1 %.not1.i2059, float %5909, float %5908, !dbg !45 + %5910 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2061 = icmp eq i32 %5910, 0, !dbg !45 + %5911 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5912 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2062 = select i1 %.not2.i2061, float %5912, float %5911, !dbg !45 + %5913 = fadd float %.04.i2062, 0xC168000FE0000000, !dbg !45 + %5914 = fneg float %5913, !dbg !45 + %5915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2063 = icmp eq i32 %5915, 0, !dbg !45 + %5916 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3FF7154760000000, float %5914) #6, !dbg !45 + %5917 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3FF7154760000000, float %5914) #6, !dbg !45 + %.0.i2064 = select i1 %.not3.i2063, float %5917, float %5916, !dbg !45 + %5918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2065 = icmp eq i32 %5918, 0, !dbg !45 + %5919 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3E54AE0C00000000, float %.0.i2064) #6, !dbg !45 + %5920 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3E54AE0C00000000, float %.0.i2064) #6, !dbg !45 + %.01.i2066 = select i1 %.not4.i2065, float %5920, float %5919, !dbg !45 + %5921 = bitcast float %.04.i2062 to i32, !dbg !45 + %5922 = shl i32 %5921, 23, !dbg !45 + %5923 = bitcast i32 %5922 to float, !dbg !45 + %5924 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2066) #6, !dbg !45 + %5925 = fmul float %5924, %5923, !dbg !45 + %5926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2067 = icmp eq i32 %5926, 0, !dbg !45 + %5927 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5928 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2068 = select i1 %.not.i2067, float %5928, float %5927, !dbg !45 + %5929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2069 = icmp eq i32 %5929, 0, !dbg !45 + %5930 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2068) #6, !dbg !45 + %5931 = tail call float @llvm.nvvm.saturate.f(float %.02.i2068) #6, !dbg !45 + %.03.i2070 = select i1 %.not1.i2069, float %5931, float %5930, !dbg !45 + %5932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2071 = icmp eq i32 %5932, 0, !dbg !45 + %5933 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5934 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2072 = select i1 %.not2.i2071, float %5934, float %5933, !dbg !45 + %5935 = fadd float %.04.i2072, 0xC168000FE0000000, !dbg !45 + %5936 = fneg float %5935, !dbg !45 + %5937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2073 = icmp eq i32 %5937, 0, !dbg !45 + %5938 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3FF7154760000000, float %5936) #6, !dbg !45 + %5939 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3FF7154760000000, float %5936) #6, !dbg !45 + %.0.i2074 = select i1 %.not3.i2073, float %5939, float %5938, !dbg !45 + %5940 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2075 = icmp eq i32 %5940, 0, !dbg !45 + %5941 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3E54AE0C00000000, float %.0.i2074) #6, !dbg !45 + %5942 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3E54AE0C00000000, float %.0.i2074) #6, !dbg !45 + %.01.i2076 = select i1 %.not4.i2075, float %5942, float %5941, !dbg !45 + %5943 = bitcast float %.04.i2072 to i32, !dbg !45 + %5944 = shl i32 %5943, 23, !dbg !45 + %5945 = bitcast i32 %5944 to float, !dbg !45 + %5946 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2076) #6, !dbg !45 + %5947 = fmul float %5946, %5945, !dbg !45 + %5948 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2077 = icmp eq i32 %5948, 0, !dbg !45 + %5949 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5950 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2078 = select i1 %.not.i2077, float %5950, float %5949, !dbg !45 + %5951 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2079 = icmp eq i32 %5951, 0, !dbg !45 + %5952 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2078) #6, !dbg !45 + %5953 = tail call float @llvm.nvvm.saturate.f(float %.02.i2078) #6, !dbg !45 + %.03.i2080 = select i1 %.not1.i2079, float %5953, float %5952, !dbg !45 + %5954 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2081 = icmp eq i32 %5954, 0, !dbg !45 + %5955 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5956 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2082 = select i1 %.not2.i2081, float %5956, float %5955, !dbg !45 + %5957 = fadd float %.04.i2082, 0xC168000FE0000000, !dbg !45 + %5958 = fneg float %5957, !dbg !45 + %5959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2083 = icmp eq i32 %5959, 0, !dbg !45 + %5960 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3FF7154760000000, float %5958) #6, !dbg !45 + %5961 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3FF7154760000000, float %5958) #6, !dbg !45 + %.0.i2084 = select i1 %.not3.i2083, float %5961, float %5960, !dbg !45 + %5962 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2085 = icmp eq i32 %5962, 0, !dbg !45 + %5963 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3E54AE0C00000000, float %.0.i2084) #6, !dbg !45 + %5964 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3E54AE0C00000000, float %.0.i2084) #6, !dbg !45 + %.01.i2086 = select i1 %.not4.i2085, float %5964, float %5963, !dbg !45 + %5965 = bitcast float %.04.i2082 to i32, !dbg !45 + %5966 = shl i32 %5965, 23, !dbg !45 + %5967 = bitcast i32 %5966 to float, !dbg !45 + %5968 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2086) #6, !dbg !45 + %5969 = fmul float %5968, %5967, !dbg !45 + %5970 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2087 = icmp eq i32 %5970, 0, !dbg !45 + %5971 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5972 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2088 = select i1 %.not.i2087, float %5972, float %5971, !dbg !45 + %5973 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2089 = icmp eq i32 %5973, 0, !dbg !45 + %5974 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2088) #6, !dbg !45 + %5975 = tail call float @llvm.nvvm.saturate.f(float %.02.i2088) #6, !dbg !45 + %.03.i2090 = select i1 %.not1.i2089, float %5975, float %5974, !dbg !45 + %5976 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2091 = icmp eq i32 %5976, 0, !dbg !45 + %5977 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5978 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2092 = select i1 %.not2.i2091, float %5978, float %5977, !dbg !45 + %5979 = fadd float %.04.i2092, 0xC168000FE0000000, !dbg !45 + %5980 = fneg float %5979, !dbg !45 + %5981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2093 = icmp eq i32 %5981, 0, !dbg !45 + %5982 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3FF7154760000000, float %5980) #6, !dbg !45 + %5983 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3FF7154760000000, float %5980) #6, !dbg !45 + %.0.i2094 = select i1 %.not3.i2093, float %5983, float %5982, !dbg !45 + %5984 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2095 = icmp eq i32 %5984, 0, !dbg !45 + %5985 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3E54AE0C00000000, float %.0.i2094) #6, !dbg !45 + %5986 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3E54AE0C00000000, float %.0.i2094) #6, !dbg !45 + %.01.i2096 = select i1 %.not4.i2095, float %5986, float %5985, !dbg !45 + %5987 = bitcast float %.04.i2092 to i32, !dbg !45 + %5988 = shl i32 %5987, 23, !dbg !45 + %5989 = bitcast i32 %5988 to float, !dbg !45 + %5990 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2096) #6, !dbg !45 + %5991 = fmul float %5990, %5989, !dbg !45 + %5992 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2097 = icmp eq i32 %5992, 0, !dbg !45 + %5993 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5994 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2098 = select i1 %.not.i2097, float %5994, float %5993, !dbg !45 + %5995 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2099 = icmp eq i32 %5995, 0, !dbg !45 + %5996 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2098) #6, !dbg !45 + %5997 = tail call float @llvm.nvvm.saturate.f(float %.02.i2098) #6, !dbg !45 + %.03.i2100 = select i1 %.not1.i2099, float %5997, float %5996, !dbg !45 + %5998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2101 = icmp eq i32 %5998, 0, !dbg !45 + %5999 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6000 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2102 = select i1 %.not2.i2101, float %6000, float %5999, !dbg !45 + %6001 = fadd float %.04.i2102, 0xC168000FE0000000, !dbg !45 + %6002 = fneg float %6001, !dbg !45 + %6003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2103 = icmp eq i32 %6003, 0, !dbg !45 + %6004 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3FF7154760000000, float %6002) #6, !dbg !45 + %6005 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3FF7154760000000, float %6002) #6, !dbg !45 + %.0.i2104 = select i1 %.not3.i2103, float %6005, float %6004, !dbg !45 + %6006 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2105 = icmp eq i32 %6006, 0, !dbg !45 + %6007 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3E54AE0C00000000, float %.0.i2104) #6, !dbg !45 + %6008 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3E54AE0C00000000, float %.0.i2104) #6, !dbg !45 + %.01.i2106 = select i1 %.not4.i2105, float %6008, float %6007, !dbg !45 + %6009 = bitcast float %.04.i2102 to i32, !dbg !45 + %6010 = shl i32 %6009, 23, !dbg !45 + %6011 = bitcast i32 %6010 to float, !dbg !45 + %6012 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2106) #6, !dbg !45 + %6013 = fmul float %6012, %6011, !dbg !45 + %6014 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2107 = icmp eq i32 %6014, 0, !dbg !45 + %6015 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6016 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2108 = select i1 %.not.i2107, float %6016, float %6015, !dbg !45 + %6017 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2109 = icmp eq i32 %6017, 0, !dbg !45 + %6018 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2108) #6, !dbg !45 + %6019 = tail call float @llvm.nvvm.saturate.f(float %.02.i2108) #6, !dbg !45 + %.03.i2110 = select i1 %.not1.i2109, float %6019, float %6018, !dbg !45 + %6020 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2111 = icmp eq i32 %6020, 0, !dbg !45 + %6021 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6022 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2112 = select i1 %.not2.i2111, float %6022, float %6021, !dbg !45 + %6023 = fadd float %.04.i2112, 0xC168000FE0000000, !dbg !45 + %6024 = fneg float %6023, !dbg !45 + %6025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2113 = icmp eq i32 %6025, 0, !dbg !45 + %6026 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3FF7154760000000, float %6024) #6, !dbg !45 + %6027 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3FF7154760000000, float %6024) #6, !dbg !45 + %.0.i2114 = select i1 %.not3.i2113, float %6027, float %6026, !dbg !45 + %6028 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2115 = icmp eq i32 %6028, 0, !dbg !45 + %6029 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3E54AE0C00000000, float %.0.i2114) #6, !dbg !45 + %6030 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3E54AE0C00000000, float %.0.i2114) #6, !dbg !45 + %.01.i2116 = select i1 %.not4.i2115, float %6030, float %6029, !dbg !45 + %6031 = bitcast float %.04.i2112 to i32, !dbg !45 + %6032 = shl i32 %6031, 23, !dbg !45 + %6033 = bitcast i32 %6032 to float, !dbg !45 + %6034 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2116) #6, !dbg !45 + %6035 = fmul float %6034, %6033, !dbg !45 + %6036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2117 = icmp eq i32 %6036, 0, !dbg !45 + %6037 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6038 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2118 = select i1 %.not.i2117, float %6038, float %6037, !dbg !45 + %6039 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2119 = icmp eq i32 %6039, 0, !dbg !45 + %6040 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2118) #6, !dbg !45 + %6041 = tail call float @llvm.nvvm.saturate.f(float %.02.i2118) #6, !dbg !45 + %.03.i2120 = select i1 %.not1.i2119, float %6041, float %6040, !dbg !45 + %6042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2121 = icmp eq i32 %6042, 0, !dbg !45 + %6043 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6044 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2122 = select i1 %.not2.i2121, float %6044, float %6043, !dbg !45 + %6045 = fadd float %.04.i2122, 0xC168000FE0000000, !dbg !45 + %6046 = fneg float %6045, !dbg !45 + %6047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2123 = icmp eq i32 %6047, 0, !dbg !45 + %6048 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3FF7154760000000, float %6046) #6, !dbg !45 + %6049 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3FF7154760000000, float %6046) #6, !dbg !45 + %.0.i2124 = select i1 %.not3.i2123, float %6049, float %6048, !dbg !45 + %6050 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2125 = icmp eq i32 %6050, 0, !dbg !45 + %6051 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3E54AE0C00000000, float %.0.i2124) #6, !dbg !45 + %6052 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3E54AE0C00000000, float %.0.i2124) #6, !dbg !45 + %.01.i2126 = select i1 %.not4.i2125, float %6052, float %6051, !dbg !45 + %6053 = bitcast float %.04.i2122 to i32, !dbg !45 + %6054 = shl i32 %6053, 23, !dbg !45 + %6055 = bitcast i32 %6054 to float, !dbg !45 + %6056 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2126) #6, !dbg !45 + %6057 = fmul float %6056, %6055, !dbg !45 + %6058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2127 = icmp eq i32 %6058, 0, !dbg !45 + %6059 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6060 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2128 = select i1 %.not.i2127, float %6060, float %6059, !dbg !45 + %6061 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2129 = icmp eq i32 %6061, 0, !dbg !45 + %6062 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2128) #6, !dbg !45 + %6063 = tail call float @llvm.nvvm.saturate.f(float %.02.i2128) #6, !dbg !45 + %.03.i2130 = select i1 %.not1.i2129, float %6063, float %6062, !dbg !45 + %6064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2131 = icmp eq i32 %6064, 0, !dbg !45 + %6065 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6066 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2132 = select i1 %.not2.i2131, float %6066, float %6065, !dbg !45 + %6067 = fadd float %.04.i2132, 0xC168000FE0000000, !dbg !45 + %6068 = fneg float %6067, !dbg !45 + %6069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2133 = icmp eq i32 %6069, 0, !dbg !45 + %6070 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3FF7154760000000, float %6068) #6, !dbg !45 + %6071 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3FF7154760000000, float %6068) #6, !dbg !45 + %.0.i2134 = select i1 %.not3.i2133, float %6071, float %6070, !dbg !45 + %6072 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2135 = icmp eq i32 %6072, 0, !dbg !45 + %6073 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3E54AE0C00000000, float %.0.i2134) #6, !dbg !45 + %6074 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3E54AE0C00000000, float %.0.i2134) #6, !dbg !45 + %.01.i2136 = select i1 %.not4.i2135, float %6074, float %6073, !dbg !45 + %6075 = bitcast float %.04.i2132 to i32, !dbg !45 + %6076 = shl i32 %6075, 23, !dbg !45 + %6077 = bitcast i32 %6076 to float, !dbg !45 + %6078 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2136) #6, !dbg !45 + %6079 = fmul float %6078, %6077, !dbg !45 + %6080 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2137 = icmp eq i32 %6080, 0, !dbg !45 + %6081 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6082 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2138 = select i1 %.not.i2137, float %6082, float %6081, !dbg !45 + %6083 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2139 = icmp eq i32 %6083, 0, !dbg !45 + %6084 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2138) #6, !dbg !45 + %6085 = tail call float @llvm.nvvm.saturate.f(float %.02.i2138) #6, !dbg !45 + %.03.i2140 = select i1 %.not1.i2139, float %6085, float %6084, !dbg !45 + %6086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2141 = icmp eq i32 %6086, 0, !dbg !45 + %6087 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6088 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2142 = select i1 %.not2.i2141, float %6088, float %6087, !dbg !45 + %6089 = fadd float %.04.i2142, 0xC168000FE0000000, !dbg !45 + %6090 = fneg float %6089, !dbg !45 + %6091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2143 = icmp eq i32 %6091, 0, !dbg !45 + %6092 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3FF7154760000000, float %6090) #6, !dbg !45 + %6093 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3FF7154760000000, float %6090) #6, !dbg !45 + %.0.i2144 = select i1 %.not3.i2143, float %6093, float %6092, !dbg !45 + %6094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2145 = icmp eq i32 %6094, 0, !dbg !45 + %6095 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3E54AE0C00000000, float %.0.i2144) #6, !dbg !45 + %6096 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3E54AE0C00000000, float %.0.i2144) #6, !dbg !45 + %.01.i2146 = select i1 %.not4.i2145, float %6096, float %6095, !dbg !45 + %6097 = bitcast float %.04.i2142 to i32, !dbg !45 + %6098 = shl i32 %6097, 23, !dbg !45 + %6099 = bitcast i32 %6098 to float, !dbg !45 + %6100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2146) #6, !dbg !45 + %6101 = fmul float %6100, %6099, !dbg !45 + %6102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2147 = icmp eq i32 %6102, 0, !dbg !45 + %6103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6104 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2148 = select i1 %.not.i2147, float %6104, float %6103, !dbg !45 + %6105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2149 = icmp eq i32 %6105, 0, !dbg !45 + %6106 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2148) #6, !dbg !45 + %6107 = tail call float @llvm.nvvm.saturate.f(float %.02.i2148) #6, !dbg !45 + %.03.i2150 = select i1 %.not1.i2149, float %6107, float %6106, !dbg !45 + %6108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2151 = icmp eq i32 %6108, 0, !dbg !45 + %6109 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6110 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2152 = select i1 %.not2.i2151, float %6110, float %6109, !dbg !45 + %6111 = fadd float %.04.i2152, 0xC168000FE0000000, !dbg !45 + %6112 = fneg float %6111, !dbg !45 + %6113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2153 = icmp eq i32 %6113, 0, !dbg !45 + %6114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3FF7154760000000, float %6112) #6, !dbg !45 + %6115 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3FF7154760000000, float %6112) #6, !dbg !45 + %.0.i2154 = select i1 %.not3.i2153, float %6115, float %6114, !dbg !45 + %6116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2155 = icmp eq i32 %6116, 0, !dbg !45 + %6117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3E54AE0C00000000, float %.0.i2154) #6, !dbg !45 + %6118 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3E54AE0C00000000, float %.0.i2154) #6, !dbg !45 + %.01.i2156 = select i1 %.not4.i2155, float %6118, float %6117, !dbg !45 + %6119 = bitcast float %.04.i2152 to i32, !dbg !45 + %6120 = shl i32 %6119, 23, !dbg !45 + %6121 = bitcast i32 %6120 to float, !dbg !45 + %6122 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2156) #6, !dbg !45 + %6123 = fmul float %6122, %6121, !dbg !45 + %6124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2157 = icmp eq i32 %6124, 0, !dbg !45 + %6125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6126 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2158 = select i1 %.not.i2157, float %6126, float %6125, !dbg !45 + %6127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2159 = icmp eq i32 %6127, 0, !dbg !45 + %6128 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2158) #6, !dbg !45 + %6129 = tail call float @llvm.nvvm.saturate.f(float %.02.i2158) #6, !dbg !45 + %.03.i2160 = select i1 %.not1.i2159, float %6129, float %6128, !dbg !45 + %6130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2161 = icmp eq i32 %6130, 0, !dbg !45 + %6131 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6132 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2162 = select i1 %.not2.i2161, float %6132, float %6131, !dbg !45 + %6133 = fadd float %.04.i2162, 0xC168000FE0000000, !dbg !45 + %6134 = fneg float %6133, !dbg !45 + %6135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2163 = icmp eq i32 %6135, 0, !dbg !45 + %6136 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3FF7154760000000, float %6134) #6, !dbg !45 + %6137 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3FF7154760000000, float %6134) #6, !dbg !45 + %.0.i2164 = select i1 %.not3.i2163, float %6137, float %6136, !dbg !45 + %6138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2165 = icmp eq i32 %6138, 0, !dbg !45 + %6139 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3E54AE0C00000000, float %.0.i2164) #6, !dbg !45 + %6140 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3E54AE0C00000000, float %.0.i2164) #6, !dbg !45 + %.01.i2166 = select i1 %.not4.i2165, float %6140, float %6139, !dbg !45 + %6141 = bitcast float %.04.i2162 to i32, !dbg !45 + %6142 = shl i32 %6141, 23, !dbg !45 + %6143 = bitcast i32 %6142 to float, !dbg !45 + %6144 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2166) #6, !dbg !45 + %6145 = fmul float %6144, %6143, !dbg !45 + %6146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2167 = icmp eq i32 %6146, 0, !dbg !45 + %6147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6148 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2168 = select i1 %.not.i2167, float %6148, float %6147, !dbg !45 + %6149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2169 = icmp eq i32 %6149, 0, !dbg !45 + %6150 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2168) #6, !dbg !45 + %6151 = tail call float @llvm.nvvm.saturate.f(float %.02.i2168) #6, !dbg !45 + %.03.i2170 = select i1 %.not1.i2169, float %6151, float %6150, !dbg !45 + %6152 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2171 = icmp eq i32 %6152, 0, !dbg !45 + %6153 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6154 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2172 = select i1 %.not2.i2171, float %6154, float %6153, !dbg !45 + %6155 = fadd float %.04.i2172, 0xC168000FE0000000, !dbg !45 + %6156 = fneg float %6155, !dbg !45 + %6157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2173 = icmp eq i32 %6157, 0, !dbg !45 + %6158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3FF7154760000000, float %6156) #6, !dbg !45 + %6159 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3FF7154760000000, float %6156) #6, !dbg !45 + %.0.i2174 = select i1 %.not3.i2173, float %6159, float %6158, !dbg !45 + %6160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2175 = icmp eq i32 %6160, 0, !dbg !45 + %6161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3E54AE0C00000000, float %.0.i2174) #6, !dbg !45 + %6162 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3E54AE0C00000000, float %.0.i2174) #6, !dbg !45 + %.01.i2176 = select i1 %.not4.i2175, float %6162, float %6161, !dbg !45 + %6163 = bitcast float %.04.i2172 to i32, !dbg !45 + %6164 = shl i32 %6163, 23, !dbg !45 + %6165 = bitcast i32 %6164 to float, !dbg !45 + %6166 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2176) #6, !dbg !45 + %6167 = fmul float %6166, %6165, !dbg !45 + %6168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2177 = icmp eq i32 %6168, 0, !dbg !45 + %6169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6170 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2178 = select i1 %.not.i2177, float %6170, float %6169, !dbg !45 + %6171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2179 = icmp eq i32 %6171, 0, !dbg !45 + %6172 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2178) #6, !dbg !45 + %6173 = tail call float @llvm.nvvm.saturate.f(float %.02.i2178) #6, !dbg !45 + %.03.i2180 = select i1 %.not1.i2179, float %6173, float %6172, !dbg !45 + %6174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2181 = icmp eq i32 %6174, 0, !dbg !45 + %6175 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6176 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2182 = select i1 %.not2.i2181, float %6176, float %6175, !dbg !45 + %6177 = fadd float %.04.i2182, 0xC168000FE0000000, !dbg !45 + %6178 = fneg float %6177, !dbg !45 + %6179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2183 = icmp eq i32 %6179, 0, !dbg !45 + %6180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3FF7154760000000, float %6178) #6, !dbg !45 + %6181 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3FF7154760000000, float %6178) #6, !dbg !45 + %.0.i2184 = select i1 %.not3.i2183, float %6181, float %6180, !dbg !45 + %6182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2185 = icmp eq i32 %6182, 0, !dbg !45 + %6183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3E54AE0C00000000, float %.0.i2184) #6, !dbg !45 + %6184 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3E54AE0C00000000, float %.0.i2184) #6, !dbg !45 + %.01.i2186 = select i1 %.not4.i2185, float %6184, float %6183, !dbg !45 + %6185 = bitcast float %.04.i2182 to i32, !dbg !45 + %6186 = shl i32 %6185, 23, !dbg !45 + %6187 = bitcast i32 %6186 to float, !dbg !45 + %6188 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2186) #6, !dbg !45 + %6189 = fmul float %6188, %6187, !dbg !45 + %6190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2187 = icmp eq i32 %6190, 0, !dbg !45 + %6191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6192 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2188 = select i1 %.not.i2187, float %6192, float %6191, !dbg !45 + %6193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2189 = icmp eq i32 %6193, 0, !dbg !45 + %6194 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2188) #6, !dbg !45 + %6195 = tail call float @llvm.nvvm.saturate.f(float %.02.i2188) #6, !dbg !45 + %.03.i2190 = select i1 %.not1.i2189, float %6195, float %6194, !dbg !45 + %6196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2191 = icmp eq i32 %6196, 0, !dbg !45 + %6197 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6198 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2192 = select i1 %.not2.i2191, float %6198, float %6197, !dbg !45 + %6199 = fadd float %.04.i2192, 0xC168000FE0000000, !dbg !45 + %6200 = fneg float %6199, !dbg !45 + %6201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2193 = icmp eq i32 %6201, 0, !dbg !45 + %6202 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3FF7154760000000, float %6200) #6, !dbg !45 + %6203 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3FF7154760000000, float %6200) #6, !dbg !45 + %.0.i2194 = select i1 %.not3.i2193, float %6203, float %6202, !dbg !45 + %6204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2195 = icmp eq i32 %6204, 0, !dbg !45 + %6205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3E54AE0C00000000, float %.0.i2194) #6, !dbg !45 + %6206 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3E54AE0C00000000, float %.0.i2194) #6, !dbg !45 + %.01.i2196 = select i1 %.not4.i2195, float %6206, float %6205, !dbg !45 + %6207 = bitcast float %.04.i2192 to i32, !dbg !45 + %6208 = shl i32 %6207, 23, !dbg !45 + %6209 = bitcast i32 %6208 to float, !dbg !45 + %6210 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2196) #6, !dbg !45 + %6211 = fmul float %6210, %6209, !dbg !45 + %6212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2197 = icmp eq i32 %6212, 0, !dbg !45 + %6213 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6214 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2198 = select i1 %.not.i2197, float %6214, float %6213, !dbg !45 + %6215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2199 = icmp eq i32 %6215, 0, !dbg !45 + %6216 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2198) #6, !dbg !45 + %6217 = tail call float @llvm.nvvm.saturate.f(float %.02.i2198) #6, !dbg !45 + %.03.i2200 = select i1 %.not1.i2199, float %6217, float %6216, !dbg !45 + %6218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2201 = icmp eq i32 %6218, 0, !dbg !45 + %6219 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6220 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2202 = select i1 %.not2.i2201, float %6220, float %6219, !dbg !45 + %6221 = fadd float %.04.i2202, 0xC168000FE0000000, !dbg !45 + %6222 = fneg float %6221, !dbg !45 + %6223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2203 = icmp eq i32 %6223, 0, !dbg !45 + %6224 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3FF7154760000000, float %6222) #6, !dbg !45 + %6225 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3FF7154760000000, float %6222) #6, !dbg !45 + %.0.i2204 = select i1 %.not3.i2203, float %6225, float %6224, !dbg !45 + %6226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2205 = icmp eq i32 %6226, 0, !dbg !45 + %6227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3E54AE0C00000000, float %.0.i2204) #6, !dbg !45 + %6228 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3E54AE0C00000000, float %.0.i2204) #6, !dbg !45 + %.01.i2206 = select i1 %.not4.i2205, float %6228, float %6227, !dbg !45 + %6229 = bitcast float %.04.i2202 to i32, !dbg !45 + %6230 = shl i32 %6229, 23, !dbg !45 + %6231 = bitcast i32 %6230 to float, !dbg !45 + %6232 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2206) #6, !dbg !45 + %6233 = fmul float %6232, %6231, !dbg !45 + %6234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2207 = icmp eq i32 %6234, 0, !dbg !45 + %6235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6236 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2208 = select i1 %.not.i2207, float %6236, float %6235, !dbg !45 + %6237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2209 = icmp eq i32 %6237, 0, !dbg !45 + %6238 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2208) #6, !dbg !45 + %6239 = tail call float @llvm.nvvm.saturate.f(float %.02.i2208) #6, !dbg !45 + %.03.i2210 = select i1 %.not1.i2209, float %6239, float %6238, !dbg !45 + %6240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2211 = icmp eq i32 %6240, 0, !dbg !45 + %6241 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6242 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2212 = select i1 %.not2.i2211, float %6242, float %6241, !dbg !45 + %6243 = fadd float %.04.i2212, 0xC168000FE0000000, !dbg !45 + %6244 = fneg float %6243, !dbg !45 + %6245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2213 = icmp eq i32 %6245, 0, !dbg !45 + %6246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3FF7154760000000, float %6244) #6, !dbg !45 + %6247 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3FF7154760000000, float %6244) #6, !dbg !45 + %.0.i2214 = select i1 %.not3.i2213, float %6247, float %6246, !dbg !45 + %6248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2215 = icmp eq i32 %6248, 0, !dbg !45 + %6249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3E54AE0C00000000, float %.0.i2214) #6, !dbg !45 + %6250 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3E54AE0C00000000, float %.0.i2214) #6, !dbg !45 + %.01.i2216 = select i1 %.not4.i2215, float %6250, float %6249, !dbg !45 + %6251 = bitcast float %.04.i2212 to i32, !dbg !45 + %6252 = shl i32 %6251, 23, !dbg !45 + %6253 = bitcast i32 %6252 to float, !dbg !45 + %6254 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2216) #6, !dbg !45 + %6255 = fmul float %6254, %6253, !dbg !45 + %6256 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2217 = icmp eq i32 %6256, 0, !dbg !45 + %6257 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6258 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2218 = select i1 %.not.i2217, float %6258, float %6257, !dbg !45 + %6259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2219 = icmp eq i32 %6259, 0, !dbg !45 + %6260 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2218) #6, !dbg !45 + %6261 = tail call float @llvm.nvvm.saturate.f(float %.02.i2218) #6, !dbg !45 + %.03.i2220 = select i1 %.not1.i2219, float %6261, float %6260, !dbg !45 + %6262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2221 = icmp eq i32 %6262, 0, !dbg !45 + %6263 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6264 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2222 = select i1 %.not2.i2221, float %6264, float %6263, !dbg !45 + %6265 = fadd float %.04.i2222, 0xC168000FE0000000, !dbg !45 + %6266 = fneg float %6265, !dbg !45 + %6267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2223 = icmp eq i32 %6267, 0, !dbg !45 + %6268 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3FF7154760000000, float %6266) #6, !dbg !45 + %6269 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3FF7154760000000, float %6266) #6, !dbg !45 + %.0.i2224 = select i1 %.not3.i2223, float %6269, float %6268, !dbg !45 + %6270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2225 = icmp eq i32 %6270, 0, !dbg !45 + %6271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3E54AE0C00000000, float %.0.i2224) #6, !dbg !45 + %6272 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3E54AE0C00000000, float %.0.i2224) #6, !dbg !45 + %.01.i2226 = select i1 %.not4.i2225, float %6272, float %6271, !dbg !45 + %6273 = bitcast float %.04.i2222 to i32, !dbg !45 + %6274 = shl i32 %6273, 23, !dbg !45 + %6275 = bitcast i32 %6274 to float, !dbg !45 + %6276 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2226) #6, !dbg !45 + %6277 = fmul float %6276, %6275, !dbg !45 + %6278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2227 = icmp eq i32 %6278, 0, !dbg !45 + %6279 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6280 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2228 = select i1 %.not.i2227, float %6280, float %6279, !dbg !45 + %6281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2229 = icmp eq i32 %6281, 0, !dbg !45 + %6282 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2228) #6, !dbg !45 + %6283 = tail call float @llvm.nvvm.saturate.f(float %.02.i2228) #6, !dbg !45 + %.03.i2230 = select i1 %.not1.i2229, float %6283, float %6282, !dbg !45 + %6284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2231 = icmp eq i32 %6284, 0, !dbg !45 + %6285 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6286 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2232 = select i1 %.not2.i2231, float %6286, float %6285, !dbg !45 + %6287 = fadd float %.04.i2232, 0xC168000FE0000000, !dbg !45 + %6288 = fneg float %6287, !dbg !45 + %6289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2233 = icmp eq i32 %6289, 0, !dbg !45 + %6290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3FF7154760000000, float %6288) #6, !dbg !45 + %6291 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3FF7154760000000, float %6288) #6, !dbg !45 + %.0.i2234 = select i1 %.not3.i2233, float %6291, float %6290, !dbg !45 + %6292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2235 = icmp eq i32 %6292, 0, !dbg !45 + %6293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3E54AE0C00000000, float %.0.i2234) #6, !dbg !45 + %6294 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3E54AE0C00000000, float %.0.i2234) #6, !dbg !45 + %.01.i2236 = select i1 %.not4.i2235, float %6294, float %6293, !dbg !45 + %6295 = bitcast float %.04.i2232 to i32, !dbg !45 + %6296 = shl i32 %6295, 23, !dbg !45 + %6297 = bitcast i32 %6296 to float, !dbg !45 + %6298 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2236) #6, !dbg !45 + %6299 = fmul float %6298, %6297, !dbg !45 + %6300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2237 = icmp eq i32 %6300, 0, !dbg !45 + %6301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6302 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2238 = select i1 %.not.i2237, float %6302, float %6301, !dbg !45 + %6303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2239 = icmp eq i32 %6303, 0, !dbg !45 + %6304 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2238) #6, !dbg !45 + %6305 = tail call float @llvm.nvvm.saturate.f(float %.02.i2238) #6, !dbg !45 + %.03.i2240 = select i1 %.not1.i2239, float %6305, float %6304, !dbg !45 + %6306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2241 = icmp eq i32 %6306, 0, !dbg !45 + %6307 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6308 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2242 = select i1 %.not2.i2241, float %6308, float %6307, !dbg !45 + %6309 = fadd float %.04.i2242, 0xC168000FE0000000, !dbg !45 + %6310 = fneg float %6309, !dbg !45 + %6311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2243 = icmp eq i32 %6311, 0, !dbg !45 + %6312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3FF7154760000000, float %6310) #6, !dbg !45 + %6313 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3FF7154760000000, float %6310) #6, !dbg !45 + %.0.i2244 = select i1 %.not3.i2243, float %6313, float %6312, !dbg !45 + %6314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2245 = icmp eq i32 %6314, 0, !dbg !45 + %6315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3E54AE0C00000000, float %.0.i2244) #6, !dbg !45 + %6316 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3E54AE0C00000000, float %.0.i2244) #6, !dbg !45 + %.01.i2246 = select i1 %.not4.i2245, float %6316, float %6315, !dbg !45 + %6317 = bitcast float %.04.i2242 to i32, !dbg !45 + %6318 = shl i32 %6317, 23, !dbg !45 + %6319 = bitcast i32 %6318 to float, !dbg !45 + %6320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2246) #6, !dbg !45 + %6321 = fmul float %6320, %6319, !dbg !45 + %6322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2247 = icmp eq i32 %6322, 0, !dbg !45 + %6323 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6324 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2248 = select i1 %.not.i2247, float %6324, float %6323, !dbg !45 + %6325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2249 = icmp eq i32 %6325, 0, !dbg !45 + %6326 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2248) #6, !dbg !45 + %6327 = tail call float @llvm.nvvm.saturate.f(float %.02.i2248) #6, !dbg !45 + %.03.i2250 = select i1 %.not1.i2249, float %6327, float %6326, !dbg !45 + %6328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2251 = icmp eq i32 %6328, 0, !dbg !45 + %6329 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6330 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2252 = select i1 %.not2.i2251, float %6330, float %6329, !dbg !45 + %6331 = fadd float %.04.i2252, 0xC168000FE0000000, !dbg !45 + %6332 = fneg float %6331, !dbg !45 + %6333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2253 = icmp eq i32 %6333, 0, !dbg !45 + %6334 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3FF7154760000000, float %6332) #6, !dbg !45 + %6335 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3FF7154760000000, float %6332) #6, !dbg !45 + %.0.i2254 = select i1 %.not3.i2253, float %6335, float %6334, !dbg !45 + %6336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2255 = icmp eq i32 %6336, 0, !dbg !45 + %6337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3E54AE0C00000000, float %.0.i2254) #6, !dbg !45 + %6338 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3E54AE0C00000000, float %.0.i2254) #6, !dbg !45 + %.01.i2256 = select i1 %.not4.i2255, float %6338, float %6337, !dbg !45 + %6339 = bitcast float %.04.i2252 to i32, !dbg !45 + %6340 = shl i32 %6339, 23, !dbg !45 + %6341 = bitcast i32 %6340 to float, !dbg !45 + %6342 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2256) #6, !dbg !45 + %6343 = fmul float %6342, %6341, !dbg !45 + %6344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2257 = icmp eq i32 %6344, 0, !dbg !45 + %6345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6346 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2258 = select i1 %.not.i2257, float %6346, float %6345, !dbg !45 + %6347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2259 = icmp eq i32 %6347, 0, !dbg !45 + %6348 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2258) #6, !dbg !45 + %6349 = tail call float @llvm.nvvm.saturate.f(float %.02.i2258) #6, !dbg !45 + %.03.i2260 = select i1 %.not1.i2259, float %6349, float %6348, !dbg !45 + %6350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2261 = icmp eq i32 %6350, 0, !dbg !45 + %6351 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6352 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2262 = select i1 %.not2.i2261, float %6352, float %6351, !dbg !45 + %6353 = fadd float %.04.i2262, 0xC168000FE0000000, !dbg !45 + %6354 = fneg float %6353, !dbg !45 + %6355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2263 = icmp eq i32 %6355, 0, !dbg !45 + %6356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3FF7154760000000, float %6354) #6, !dbg !45 + %6357 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3FF7154760000000, float %6354) #6, !dbg !45 + %.0.i2264 = select i1 %.not3.i2263, float %6357, float %6356, !dbg !45 + %6358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2265 = icmp eq i32 %6358, 0, !dbg !45 + %6359 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3E54AE0C00000000, float %.0.i2264) #6, !dbg !45 + %6360 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3E54AE0C00000000, float %.0.i2264) #6, !dbg !45 + %.01.i2266 = select i1 %.not4.i2265, float %6360, float %6359, !dbg !45 + %6361 = bitcast float %.04.i2262 to i32, !dbg !45 + %6362 = shl i32 %6361, 23, !dbg !45 + %6363 = bitcast i32 %6362 to float, !dbg !45 + %6364 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2266) #6, !dbg !45 + %6365 = fmul float %6364, %6363, !dbg !45 + %6366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2267 = icmp eq i32 %6366, 0, !dbg !45 + %6367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6368 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2268 = select i1 %.not.i2267, float %6368, float %6367, !dbg !45 + %6369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2269 = icmp eq i32 %6369, 0, !dbg !45 + %6370 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2268) #6, !dbg !45 + %6371 = tail call float @llvm.nvvm.saturate.f(float %.02.i2268) #6, !dbg !45 + %.03.i2270 = select i1 %.not1.i2269, float %6371, float %6370, !dbg !45 + %6372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2271 = icmp eq i32 %6372, 0, !dbg !45 + %6373 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6374 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2272 = select i1 %.not2.i2271, float %6374, float %6373, !dbg !45 + %6375 = fadd float %.04.i2272, 0xC168000FE0000000, !dbg !45 + %6376 = fneg float %6375, !dbg !45 + %6377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2273 = icmp eq i32 %6377, 0, !dbg !45 + %6378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3FF7154760000000, float %6376) #6, !dbg !45 + %6379 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3FF7154760000000, float %6376) #6, !dbg !45 + %.0.i2274 = select i1 %.not3.i2273, float %6379, float %6378, !dbg !45 + %6380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2275 = icmp eq i32 %6380, 0, !dbg !45 + %6381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3E54AE0C00000000, float %.0.i2274) #6, !dbg !45 + %6382 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3E54AE0C00000000, float %.0.i2274) #6, !dbg !45 + %.01.i2276 = select i1 %.not4.i2275, float %6382, float %6381, !dbg !45 + %6383 = bitcast float %.04.i2272 to i32, !dbg !45 + %6384 = shl i32 %6383, 23, !dbg !45 + %6385 = bitcast i32 %6384 to float, !dbg !45 + %6386 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2276) #6, !dbg !45 + %6387 = fmul float %6386, %6385, !dbg !45 + %6388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2277 = icmp eq i32 %6388, 0, !dbg !45 + %6389 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6390 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2278 = select i1 %.not.i2277, float %6390, float %6389, !dbg !45 + %6391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2279 = icmp eq i32 %6391, 0, !dbg !45 + %6392 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2278) #6, !dbg !45 + %6393 = tail call float @llvm.nvvm.saturate.f(float %.02.i2278) #6, !dbg !45 + %.03.i2280 = select i1 %.not1.i2279, float %6393, float %6392, !dbg !45 + %6394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2281 = icmp eq i32 %6394, 0, !dbg !45 + %6395 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6396 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2282 = select i1 %.not2.i2281, float %6396, float %6395, !dbg !45 + %6397 = fadd float %.04.i2282, 0xC168000FE0000000, !dbg !45 + %6398 = fneg float %6397, !dbg !45 + %6399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2283 = icmp eq i32 %6399, 0, !dbg !45 + %6400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3FF7154760000000, float %6398) #6, !dbg !45 + %6401 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3FF7154760000000, float %6398) #6, !dbg !45 + %.0.i2284 = select i1 %.not3.i2283, float %6401, float %6400, !dbg !45 + %6402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2285 = icmp eq i32 %6402, 0, !dbg !45 + %6403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3E54AE0C00000000, float %.0.i2284) #6, !dbg !45 + %6404 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3E54AE0C00000000, float %.0.i2284) #6, !dbg !45 + %.01.i2286 = select i1 %.not4.i2285, float %6404, float %6403, !dbg !45 + %6405 = bitcast float %.04.i2282 to i32, !dbg !45 + %6406 = shl i32 %6405, 23, !dbg !45 + %6407 = bitcast i32 %6406 to float, !dbg !45 + %6408 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2286) #6, !dbg !45 + %6409 = fmul float %6408, %6407, !dbg !45 + %6410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2287 = icmp eq i32 %6410, 0, !dbg !45 + %6411 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6412 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2288 = select i1 %.not.i2287, float %6412, float %6411, !dbg !45 + %6413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2289 = icmp eq i32 %6413, 0, !dbg !45 + %6414 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2288) #6, !dbg !45 + %6415 = tail call float @llvm.nvvm.saturate.f(float %.02.i2288) #6, !dbg !45 + %.03.i2290 = select i1 %.not1.i2289, float %6415, float %6414, !dbg !45 + %6416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2291 = icmp eq i32 %6416, 0, !dbg !45 + %6417 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6418 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2292 = select i1 %.not2.i2291, float %6418, float %6417, !dbg !45 + %6419 = fadd float %.04.i2292, 0xC168000FE0000000, !dbg !45 + %6420 = fneg float %6419, !dbg !45 + %6421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2293 = icmp eq i32 %6421, 0, !dbg !45 + %6422 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3FF7154760000000, float %6420) #6, !dbg !45 + %6423 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3FF7154760000000, float %6420) #6, !dbg !45 + %.0.i2294 = select i1 %.not3.i2293, float %6423, float %6422, !dbg !45 + %6424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2295 = icmp eq i32 %6424, 0, !dbg !45 + %6425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3E54AE0C00000000, float %.0.i2294) #6, !dbg !45 + %6426 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3E54AE0C00000000, float %.0.i2294) #6, !dbg !45 + %.01.i2296 = select i1 %.not4.i2295, float %6426, float %6425, !dbg !45 + %6427 = bitcast float %.04.i2292 to i32, !dbg !45 + %6428 = shl i32 %6427, 23, !dbg !45 + %6429 = bitcast i32 %6428 to float, !dbg !45 + %6430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2296) #6, !dbg !45 + %6431 = fmul float %6430, %6429, !dbg !45 + %6432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2297 = icmp eq i32 %6432, 0, !dbg !45 + %6433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6434 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2298 = select i1 %.not.i2297, float %6434, float %6433, !dbg !45 + %6435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2299 = icmp eq i32 %6435, 0, !dbg !45 + %6436 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2298) #6, !dbg !45 + %6437 = tail call float @llvm.nvvm.saturate.f(float %.02.i2298) #6, !dbg !45 + %.03.i2300 = select i1 %.not1.i2299, float %6437, float %6436, !dbg !45 + %6438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2301 = icmp eq i32 %6438, 0, !dbg !45 + %6439 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6440 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2302 = select i1 %.not2.i2301, float %6440, float %6439, !dbg !45 + %6441 = fadd float %.04.i2302, 0xC168000FE0000000, !dbg !45 + %6442 = fneg float %6441, !dbg !45 + %6443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2303 = icmp eq i32 %6443, 0, !dbg !45 + %6444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3FF7154760000000, float %6442) #6, !dbg !45 + %6445 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3FF7154760000000, float %6442) #6, !dbg !45 + %.0.i2304 = select i1 %.not3.i2303, float %6445, float %6444, !dbg !45 + %6446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2305 = icmp eq i32 %6446, 0, !dbg !45 + %6447 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3E54AE0C00000000, float %.0.i2304) #6, !dbg !45 + %6448 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3E54AE0C00000000, float %.0.i2304) #6, !dbg !45 + %.01.i2306 = select i1 %.not4.i2305, float %6448, float %6447, !dbg !45 + %6449 = bitcast float %.04.i2302 to i32, !dbg !45 + %6450 = shl i32 %6449, 23, !dbg !45 + %6451 = bitcast i32 %6450 to float, !dbg !45 + %6452 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2306) #6, !dbg !45 + %6453 = fmul float %6452, %6451, !dbg !45 + %6454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2307 = icmp eq i32 %6454, 0, !dbg !45 + %6455 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6456 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2308 = select i1 %.not.i2307, float %6456, float %6455, !dbg !45 + %6457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2309 = icmp eq i32 %6457, 0, !dbg !45 + %6458 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2308) #6, !dbg !45 + %6459 = tail call float @llvm.nvvm.saturate.f(float %.02.i2308) #6, !dbg !45 + %.03.i2310 = select i1 %.not1.i2309, float %6459, float %6458, !dbg !45 + %6460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2311 = icmp eq i32 %6460, 0, !dbg !45 + %6461 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6462 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2312 = select i1 %.not2.i2311, float %6462, float %6461, !dbg !45 + %6463 = fadd float %.04.i2312, 0xC168000FE0000000, !dbg !45 + %6464 = fneg float %6463, !dbg !45 + %6465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2313 = icmp eq i32 %6465, 0, !dbg !45 + %6466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3FF7154760000000, float %6464) #6, !dbg !45 + %6467 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3FF7154760000000, float %6464) #6, !dbg !45 + %.0.i2314 = select i1 %.not3.i2313, float %6467, float %6466, !dbg !45 + %6468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2315 = icmp eq i32 %6468, 0, !dbg !45 + %6469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3E54AE0C00000000, float %.0.i2314) #6, !dbg !45 + %6470 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3E54AE0C00000000, float %.0.i2314) #6, !dbg !45 + %.01.i2316 = select i1 %.not4.i2315, float %6470, float %6469, !dbg !45 + %6471 = bitcast float %.04.i2312 to i32, !dbg !45 + %6472 = shl i32 %6471, 23, !dbg !45 + %6473 = bitcast i32 %6472 to float, !dbg !45 + %6474 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2316) #6, !dbg !45 + %6475 = fmul float %6474, %6473, !dbg !45 + %6476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2317 = icmp eq i32 %6476, 0, !dbg !45 + %6477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6478 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2318 = select i1 %.not.i2317, float %6478, float %6477, !dbg !45 + %6479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2319 = icmp eq i32 %6479, 0, !dbg !45 + %6480 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2318) #6, !dbg !45 + %6481 = tail call float @llvm.nvvm.saturate.f(float %.02.i2318) #6, !dbg !45 + %.03.i2320 = select i1 %.not1.i2319, float %6481, float %6480, !dbg !45 + %6482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2321 = icmp eq i32 %6482, 0, !dbg !45 + %6483 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6484 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2322 = select i1 %.not2.i2321, float %6484, float %6483, !dbg !45 + %6485 = fadd float %.04.i2322, 0xC168000FE0000000, !dbg !45 + %6486 = fneg float %6485, !dbg !45 + %6487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2323 = icmp eq i32 %6487, 0, !dbg !45 + %6488 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3FF7154760000000, float %6486) #6, !dbg !45 + %6489 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3FF7154760000000, float %6486) #6, !dbg !45 + %.0.i2324 = select i1 %.not3.i2323, float %6489, float %6488, !dbg !45 + %6490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2325 = icmp eq i32 %6490, 0, !dbg !45 + %6491 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3E54AE0C00000000, float %.0.i2324) #6, !dbg !45 + %6492 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3E54AE0C00000000, float %.0.i2324) #6, !dbg !45 + %.01.i2326 = select i1 %.not4.i2325, float %6492, float %6491, !dbg !45 + %6493 = bitcast float %.04.i2322 to i32, !dbg !45 + %6494 = shl i32 %6493, 23, !dbg !45 + %6495 = bitcast i32 %6494 to float, !dbg !45 + %6496 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2326) #6, !dbg !45 + %6497 = fmul float %6496, %6495, !dbg !45 + %6498 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2327 = icmp eq i32 %6498, 0, !dbg !45 + %6499 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6500 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2328 = select i1 %.not.i2327, float %6500, float %6499, !dbg !45 + %6501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2329 = icmp eq i32 %6501, 0, !dbg !45 + %6502 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2328) #6, !dbg !45 + %6503 = tail call float @llvm.nvvm.saturate.f(float %.02.i2328) #6, !dbg !45 + %.03.i2330 = select i1 %.not1.i2329, float %6503, float %6502, !dbg !45 + %6504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2331 = icmp eq i32 %6504, 0, !dbg !45 + %6505 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6506 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2332 = select i1 %.not2.i2331, float %6506, float %6505, !dbg !45 + %6507 = fadd float %.04.i2332, 0xC168000FE0000000, !dbg !45 + %6508 = fneg float %6507, !dbg !45 + %6509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2333 = icmp eq i32 %6509, 0, !dbg !45 + %6510 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3FF7154760000000, float %6508) #6, !dbg !45 + %6511 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3FF7154760000000, float %6508) #6, !dbg !45 + %.0.i2334 = select i1 %.not3.i2333, float %6511, float %6510, !dbg !45 + %6512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2335 = icmp eq i32 %6512, 0, !dbg !45 + %6513 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3E54AE0C00000000, float %.0.i2334) #6, !dbg !45 + %6514 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3E54AE0C00000000, float %.0.i2334) #6, !dbg !45 + %.01.i2336 = select i1 %.not4.i2335, float %6514, float %6513, !dbg !45 + %6515 = bitcast float %.04.i2332 to i32, !dbg !45 + %6516 = shl i32 %6515, 23, !dbg !45 + %6517 = bitcast i32 %6516 to float, !dbg !45 + %6518 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2336) #6, !dbg !45 + %6519 = fmul float %6518, %6517, !dbg !45 + %6520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2337 = icmp eq i32 %6520, 0, !dbg !45 + %6521 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6522 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2338 = select i1 %.not.i2337, float %6522, float %6521, !dbg !45 + %6523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2339 = icmp eq i32 %6523, 0, !dbg !45 + %6524 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2338) #6, !dbg !45 + %6525 = tail call float @llvm.nvvm.saturate.f(float %.02.i2338) #6, !dbg !45 + %.03.i2340 = select i1 %.not1.i2339, float %6525, float %6524, !dbg !45 + %6526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2341 = icmp eq i32 %6526, 0, !dbg !45 + %6527 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6528 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2342 = select i1 %.not2.i2341, float %6528, float %6527, !dbg !45 + %6529 = fadd float %.04.i2342, 0xC168000FE0000000, !dbg !45 + %6530 = fneg float %6529, !dbg !45 + %6531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2343 = icmp eq i32 %6531, 0, !dbg !45 + %6532 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3FF7154760000000, float %6530) #6, !dbg !45 + %6533 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3FF7154760000000, float %6530) #6, !dbg !45 + %.0.i2344 = select i1 %.not3.i2343, float %6533, float %6532, !dbg !45 + %6534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2345 = icmp eq i32 %6534, 0, !dbg !45 + %6535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3E54AE0C00000000, float %.0.i2344) #6, !dbg !45 + %6536 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3E54AE0C00000000, float %.0.i2344) #6, !dbg !45 + %.01.i2346 = select i1 %.not4.i2345, float %6536, float %6535, !dbg !45 + %6537 = bitcast float %.04.i2342 to i32, !dbg !45 + %6538 = shl i32 %6537, 23, !dbg !45 + %6539 = bitcast i32 %6538 to float, !dbg !45 + %6540 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2346) #6, !dbg !45 + %6541 = fmul float %6540, %6539, !dbg !45 + %6542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2347 = icmp eq i32 %6542, 0, !dbg !45 + %6543 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6544 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2348 = select i1 %.not.i2347, float %6544, float %6543, !dbg !45 + %6545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2349 = icmp eq i32 %6545, 0, !dbg !45 + %6546 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2348) #6, !dbg !45 + %6547 = tail call float @llvm.nvvm.saturate.f(float %.02.i2348) #6, !dbg !45 + %.03.i2350 = select i1 %.not1.i2349, float %6547, float %6546, !dbg !45 + %6548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2351 = icmp eq i32 %6548, 0, !dbg !45 + %6549 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6550 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2352 = select i1 %.not2.i2351, float %6550, float %6549, !dbg !45 + %6551 = fadd float %.04.i2352, 0xC168000FE0000000, !dbg !45 + %6552 = fneg float %6551, !dbg !45 + %6553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2353 = icmp eq i32 %6553, 0, !dbg !45 + %6554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3FF7154760000000, float %6552) #6, !dbg !45 + %6555 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3FF7154760000000, float %6552) #6, !dbg !45 + %.0.i2354 = select i1 %.not3.i2353, float %6555, float %6554, !dbg !45 + %6556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2355 = icmp eq i32 %6556, 0, !dbg !45 + %6557 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3E54AE0C00000000, float %.0.i2354) #6, !dbg !45 + %6558 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3E54AE0C00000000, float %.0.i2354) #6, !dbg !45 + %.01.i2356 = select i1 %.not4.i2355, float %6558, float %6557, !dbg !45 + %6559 = bitcast float %.04.i2352 to i32, !dbg !45 + %6560 = shl i32 %6559, 23, !dbg !45 + %6561 = bitcast i32 %6560 to float, !dbg !45 + %6562 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2356) #6, !dbg !45 + %6563 = fmul float %6562, %6561, !dbg !45 + %6564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2357 = icmp eq i32 %6564, 0, !dbg !45 + %6565 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6566 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2358 = select i1 %.not.i2357, float %6566, float %6565, !dbg !45 + %6567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2359 = icmp eq i32 %6567, 0, !dbg !45 + %6568 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2358) #6, !dbg !45 + %6569 = tail call float @llvm.nvvm.saturate.f(float %.02.i2358) #6, !dbg !45 + %.03.i2360 = select i1 %.not1.i2359, float %6569, float %6568, !dbg !45 + %6570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2361 = icmp eq i32 %6570, 0, !dbg !45 + %6571 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6572 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2362 = select i1 %.not2.i2361, float %6572, float %6571, !dbg !45 + %6573 = fadd float %.04.i2362, 0xC168000FE0000000, !dbg !45 + %6574 = fneg float %6573, !dbg !45 + %6575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2363 = icmp eq i32 %6575, 0, !dbg !45 + %6576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3FF7154760000000, float %6574) #6, !dbg !45 + %6577 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3FF7154760000000, float %6574) #6, !dbg !45 + %.0.i2364 = select i1 %.not3.i2363, float %6577, float %6576, !dbg !45 + %6578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2365 = icmp eq i32 %6578, 0, !dbg !45 + %6579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3E54AE0C00000000, float %.0.i2364) #6, !dbg !45 + %6580 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3E54AE0C00000000, float %.0.i2364) #6, !dbg !45 + %.01.i2366 = select i1 %.not4.i2365, float %6580, float %6579, !dbg !45 + %6581 = bitcast float %.04.i2362 to i32, !dbg !45 + %6582 = shl i32 %6581, 23, !dbg !45 + %6583 = bitcast i32 %6582 to float, !dbg !45 + %6584 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2366) #6, !dbg !45 + %6585 = fmul float %6584, %6583, !dbg !45 + %6586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2367 = icmp eq i32 %6586, 0, !dbg !45 + %6587 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6588 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2368 = select i1 %.not.i2367, float %6588, float %6587, !dbg !45 + %6589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2369 = icmp eq i32 %6589, 0, !dbg !45 + %6590 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2368) #6, !dbg !45 + %6591 = tail call float @llvm.nvvm.saturate.f(float %.02.i2368) #6, !dbg !45 + %.03.i2370 = select i1 %.not1.i2369, float %6591, float %6590, !dbg !45 + %6592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2371 = icmp eq i32 %6592, 0, !dbg !45 + %6593 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6594 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2372 = select i1 %.not2.i2371, float %6594, float %6593, !dbg !45 + %6595 = fadd float %.04.i2372, 0xC168000FE0000000, !dbg !45 + %6596 = fneg float %6595, !dbg !45 + %6597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2373 = icmp eq i32 %6597, 0, !dbg !45 + %6598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3FF7154760000000, float %6596) #6, !dbg !45 + %6599 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3FF7154760000000, float %6596) #6, !dbg !45 + %.0.i2374 = select i1 %.not3.i2373, float %6599, float %6598, !dbg !45 + %6600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2375 = icmp eq i32 %6600, 0, !dbg !45 + %6601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3E54AE0C00000000, float %.0.i2374) #6, !dbg !45 + %6602 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3E54AE0C00000000, float %.0.i2374) #6, !dbg !45 + %.01.i2376 = select i1 %.not4.i2375, float %6602, float %6601, !dbg !45 + %6603 = bitcast float %.04.i2372 to i32, !dbg !45 + %6604 = shl i32 %6603, 23, !dbg !45 + %6605 = bitcast i32 %6604 to float, !dbg !45 + %6606 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2376) #6, !dbg !45 + %6607 = fmul float %6606, %6605, !dbg !45 + %6608 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2377 = icmp eq i32 %6608, 0, !dbg !45 + %6609 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6610 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2378 = select i1 %.not.i2377, float %6610, float %6609, !dbg !45 + %6611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2379 = icmp eq i32 %6611, 0, !dbg !45 + %6612 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2378) #6, !dbg !45 + %6613 = tail call float @llvm.nvvm.saturate.f(float %.02.i2378) #6, !dbg !45 + %.03.i2380 = select i1 %.not1.i2379, float %6613, float %6612, !dbg !45 + %6614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2381 = icmp eq i32 %6614, 0, !dbg !45 + %6615 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6616 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2382 = select i1 %.not2.i2381, float %6616, float %6615, !dbg !45 + %6617 = fadd float %.04.i2382, 0xC168000FE0000000, !dbg !45 + %6618 = fneg float %6617, !dbg !45 + %6619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2383 = icmp eq i32 %6619, 0, !dbg !45 + %6620 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3FF7154760000000, float %6618) #6, !dbg !45 + %6621 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3FF7154760000000, float %6618) #6, !dbg !45 + %.0.i2384 = select i1 %.not3.i2383, float %6621, float %6620, !dbg !45 + %6622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2385 = icmp eq i32 %6622, 0, !dbg !45 + %6623 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3E54AE0C00000000, float %.0.i2384) #6, !dbg !45 + %6624 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3E54AE0C00000000, float %.0.i2384) #6, !dbg !45 + %.01.i2386 = select i1 %.not4.i2385, float %6624, float %6623, !dbg !45 + %6625 = bitcast float %.04.i2382 to i32, !dbg !45 + %6626 = shl i32 %6625, 23, !dbg !45 + %6627 = bitcast i32 %6626 to float, !dbg !45 + %6628 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2386) #6, !dbg !45 + %6629 = fmul float %6628, %6627, !dbg !45 + %6630 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2387 = icmp eq i32 %6630, 0, !dbg !45 + %6631 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6632 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2388 = select i1 %.not.i2387, float %6632, float %6631, !dbg !45 + %6633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2389 = icmp eq i32 %6633, 0, !dbg !45 + %6634 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2388) #6, !dbg !45 + %6635 = tail call float @llvm.nvvm.saturate.f(float %.02.i2388) #6, !dbg !45 + %.03.i2390 = select i1 %.not1.i2389, float %6635, float %6634, !dbg !45 + %6636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2391 = icmp eq i32 %6636, 0, !dbg !45 + %6637 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6638 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2392 = select i1 %.not2.i2391, float %6638, float %6637, !dbg !45 + %6639 = fadd float %.04.i2392, 0xC168000FE0000000, !dbg !45 + %6640 = fneg float %6639, !dbg !45 + %6641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2393 = icmp eq i32 %6641, 0, !dbg !45 + %6642 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3FF7154760000000, float %6640) #6, !dbg !45 + %6643 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3FF7154760000000, float %6640) #6, !dbg !45 + %.0.i2394 = select i1 %.not3.i2393, float %6643, float %6642, !dbg !45 + %6644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2395 = icmp eq i32 %6644, 0, !dbg !45 + %6645 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3E54AE0C00000000, float %.0.i2394) #6, !dbg !45 + %6646 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3E54AE0C00000000, float %.0.i2394) #6, !dbg !45 + %.01.i2396 = select i1 %.not4.i2395, float %6646, float %6645, !dbg !45 + %6647 = bitcast float %.04.i2392 to i32, !dbg !45 + %6648 = shl i32 %6647, 23, !dbg !45 + %6649 = bitcast i32 %6648 to float, !dbg !45 + %6650 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2396) #6, !dbg !45 + %6651 = fmul float %6650, %6649, !dbg !45 + %6652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2397 = icmp eq i32 %6652, 0, !dbg !45 + %6653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6654 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2398 = select i1 %.not.i2397, float %6654, float %6653, !dbg !45 + %6655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2399 = icmp eq i32 %6655, 0, !dbg !45 + %6656 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2398) #6, !dbg !45 + %6657 = tail call float @llvm.nvvm.saturate.f(float %.02.i2398) #6, !dbg !45 + %.03.i2400 = select i1 %.not1.i2399, float %6657, float %6656, !dbg !45 + %6658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2401 = icmp eq i32 %6658, 0, !dbg !45 + %6659 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6660 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2402 = select i1 %.not2.i2401, float %6660, float %6659, !dbg !45 + %6661 = fadd float %.04.i2402, 0xC168000FE0000000, !dbg !45 + %6662 = fneg float %6661, !dbg !45 + %6663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2403 = icmp eq i32 %6663, 0, !dbg !45 + %6664 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3FF7154760000000, float %6662) #6, !dbg !45 + %6665 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3FF7154760000000, float %6662) #6, !dbg !45 + %.0.i2404 = select i1 %.not3.i2403, float %6665, float %6664, !dbg !45 + %6666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2405 = icmp eq i32 %6666, 0, !dbg !45 + %6667 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3E54AE0C00000000, float %.0.i2404) #6, !dbg !45 + %6668 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3E54AE0C00000000, float %.0.i2404) #6, !dbg !45 + %.01.i2406 = select i1 %.not4.i2405, float %6668, float %6667, !dbg !45 + %6669 = bitcast float %.04.i2402 to i32, !dbg !45 + %6670 = shl i32 %6669, 23, !dbg !45 + %6671 = bitcast i32 %6670 to float, !dbg !45 + %6672 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2406) #6, !dbg !45 + %6673 = fmul float %6672, %6671, !dbg !45 + %6674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2407 = icmp eq i32 %6674, 0, !dbg !45 + %6675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6676 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2408 = select i1 %.not.i2407, float %6676, float %6675, !dbg !45 + %6677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2409 = icmp eq i32 %6677, 0, !dbg !45 + %6678 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2408) #6, !dbg !45 + %6679 = tail call float @llvm.nvvm.saturate.f(float %.02.i2408) #6, !dbg !45 + %.03.i2410 = select i1 %.not1.i2409, float %6679, float %6678, !dbg !45 + %6680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2411 = icmp eq i32 %6680, 0, !dbg !45 + %6681 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6682 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2412 = select i1 %.not2.i2411, float %6682, float %6681, !dbg !45 + %6683 = fadd float %.04.i2412, 0xC168000FE0000000, !dbg !45 + %6684 = fneg float %6683, !dbg !45 + %6685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2413 = icmp eq i32 %6685, 0, !dbg !45 + %6686 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3FF7154760000000, float %6684) #6, !dbg !45 + %6687 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3FF7154760000000, float %6684) #6, !dbg !45 + %.0.i2414 = select i1 %.not3.i2413, float %6687, float %6686, !dbg !45 + %6688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2415 = icmp eq i32 %6688, 0, !dbg !45 + %6689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3E54AE0C00000000, float %.0.i2414) #6, !dbg !45 + %6690 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3E54AE0C00000000, float %.0.i2414) #6, !dbg !45 + %.01.i2416 = select i1 %.not4.i2415, float %6690, float %6689, !dbg !45 + %6691 = bitcast float %.04.i2412 to i32, !dbg !45 + %6692 = shl i32 %6691, 23, !dbg !45 + %6693 = bitcast i32 %6692 to float, !dbg !45 + %6694 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2416) #6, !dbg !45 + %6695 = fmul float %6694, %6693, !dbg !45 + %6696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2417 = icmp eq i32 %6696, 0, !dbg !45 + %6697 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6698 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2418 = select i1 %.not.i2417, float %6698, float %6697, !dbg !45 + %6699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2419 = icmp eq i32 %6699, 0, !dbg !45 + %6700 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2418) #6, !dbg !45 + %6701 = tail call float @llvm.nvvm.saturate.f(float %.02.i2418) #6, !dbg !45 + %.03.i2420 = select i1 %.not1.i2419, float %6701, float %6700, !dbg !45 + %6702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2421 = icmp eq i32 %6702, 0, !dbg !45 + %6703 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6704 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2422 = select i1 %.not2.i2421, float %6704, float %6703, !dbg !45 + %6705 = fadd float %.04.i2422, 0xC168000FE0000000, !dbg !45 + %6706 = fneg float %6705, !dbg !45 + %6707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2423 = icmp eq i32 %6707, 0, !dbg !45 + %6708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3FF7154760000000, float %6706) #6, !dbg !45 + %6709 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3FF7154760000000, float %6706) #6, !dbg !45 + %.0.i2424 = select i1 %.not3.i2423, float %6709, float %6708, !dbg !45 + %6710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2425 = icmp eq i32 %6710, 0, !dbg !45 + %6711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3E54AE0C00000000, float %.0.i2424) #6, !dbg !45 + %6712 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3E54AE0C00000000, float %.0.i2424) #6, !dbg !45 + %.01.i2426 = select i1 %.not4.i2425, float %6712, float %6711, !dbg !45 + %6713 = bitcast float %.04.i2422 to i32, !dbg !45 + %6714 = shl i32 %6713, 23, !dbg !45 + %6715 = bitcast i32 %6714 to float, !dbg !45 + %6716 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2426) #6, !dbg !45 + %6717 = fmul float %6716, %6715, !dbg !45 + %6718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2427 = icmp eq i32 %6718, 0, !dbg !45 + %6719 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6720 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2428 = select i1 %.not.i2427, float %6720, float %6719, !dbg !45 + %6721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2429 = icmp eq i32 %6721, 0, !dbg !45 + %6722 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2428) #6, !dbg !45 + %6723 = tail call float @llvm.nvvm.saturate.f(float %.02.i2428) #6, !dbg !45 + %.03.i2430 = select i1 %.not1.i2429, float %6723, float %6722, !dbg !45 + %6724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2431 = icmp eq i32 %6724, 0, !dbg !45 + %6725 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6726 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2432 = select i1 %.not2.i2431, float %6726, float %6725, !dbg !45 + %6727 = fadd float %.04.i2432, 0xC168000FE0000000, !dbg !45 + %6728 = fneg float %6727, !dbg !45 + %6729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2433 = icmp eq i32 %6729, 0, !dbg !45 + %6730 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3FF7154760000000, float %6728) #6, !dbg !45 + %6731 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3FF7154760000000, float %6728) #6, !dbg !45 + %.0.i2434 = select i1 %.not3.i2433, float %6731, float %6730, !dbg !45 + %6732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2435 = icmp eq i32 %6732, 0, !dbg !45 + %6733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3E54AE0C00000000, float %.0.i2434) #6, !dbg !45 + %6734 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3E54AE0C00000000, float %.0.i2434) #6, !dbg !45 + %.01.i2436 = select i1 %.not4.i2435, float %6734, float %6733, !dbg !45 + %6735 = bitcast float %.04.i2432 to i32, !dbg !45 + %6736 = shl i32 %6735, 23, !dbg !45 + %6737 = bitcast i32 %6736 to float, !dbg !45 + %6738 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2436) #6, !dbg !45 + %6739 = fmul float %6738, %6737, !dbg !45 + %6740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2437 = icmp eq i32 %6740, 0, !dbg !45 + %6741 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6742 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2438 = select i1 %.not.i2437, float %6742, float %6741, !dbg !45 + %6743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2439 = icmp eq i32 %6743, 0, !dbg !45 + %6744 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2438) #6, !dbg !45 + %6745 = tail call float @llvm.nvvm.saturate.f(float %.02.i2438) #6, !dbg !45 + %.03.i2440 = select i1 %.not1.i2439, float %6745, float %6744, !dbg !45 + %6746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2441 = icmp eq i32 %6746, 0, !dbg !45 + %6747 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6748 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2442 = select i1 %.not2.i2441, float %6748, float %6747, !dbg !45 + %6749 = fadd float %.04.i2442, 0xC168000FE0000000, !dbg !45 + %6750 = fneg float %6749, !dbg !45 + %6751 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2443 = icmp eq i32 %6751, 0, !dbg !45 + %6752 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3FF7154760000000, float %6750) #6, !dbg !45 + %6753 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3FF7154760000000, float %6750) #6, !dbg !45 + %.0.i2444 = select i1 %.not3.i2443, float %6753, float %6752, !dbg !45 + %6754 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2445 = icmp eq i32 %6754, 0, !dbg !45 + %6755 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3E54AE0C00000000, float %.0.i2444) #6, !dbg !45 + %6756 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3E54AE0C00000000, float %.0.i2444) #6, !dbg !45 + %.01.i2446 = select i1 %.not4.i2445, float %6756, float %6755, !dbg !45 + %6757 = bitcast float %.04.i2442 to i32, !dbg !45 + %6758 = shl i32 %6757, 23, !dbg !45 + %6759 = bitcast i32 %6758 to float, !dbg !45 + %6760 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2446) #6, !dbg !45 + %6761 = fmul float %6760, %6759, !dbg !45 + %6762 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2447 = icmp eq i32 %6762, 0, !dbg !45 + %6763 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6764 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2448 = select i1 %.not.i2447, float %6764, float %6763, !dbg !45 + %6765 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2449 = icmp eq i32 %6765, 0, !dbg !45 + %6766 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2448) #6, !dbg !45 + %6767 = tail call float @llvm.nvvm.saturate.f(float %.02.i2448) #6, !dbg !45 + %.03.i2450 = select i1 %.not1.i2449, float %6767, float %6766, !dbg !45 + %6768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2451 = icmp eq i32 %6768, 0, !dbg !45 + %6769 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6770 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2452 = select i1 %.not2.i2451, float %6770, float %6769, !dbg !45 + %6771 = fadd float %.04.i2452, 0xC168000FE0000000, !dbg !45 + %6772 = fneg float %6771, !dbg !45 + %6773 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2453 = icmp eq i32 %6773, 0, !dbg !45 + %6774 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3FF7154760000000, float %6772) #6, !dbg !45 + %6775 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3FF7154760000000, float %6772) #6, !dbg !45 + %.0.i2454 = select i1 %.not3.i2453, float %6775, float %6774, !dbg !45 + %6776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2455 = icmp eq i32 %6776, 0, !dbg !45 + %6777 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3E54AE0C00000000, float %.0.i2454) #6, !dbg !45 + %6778 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3E54AE0C00000000, float %.0.i2454) #6, !dbg !45 + %.01.i2456 = select i1 %.not4.i2455, float %6778, float %6777, !dbg !45 + %6779 = bitcast float %.04.i2452 to i32, !dbg !45 + %6780 = shl i32 %6779, 23, !dbg !45 + %6781 = bitcast i32 %6780 to float, !dbg !45 + %6782 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2456) #6, !dbg !45 + %6783 = fmul float %6782, %6781, !dbg !45 + %6784 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2457 = icmp eq i32 %6784, 0, !dbg !45 + %6785 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6786 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2458 = select i1 %.not.i2457, float %6786, float %6785, !dbg !45 + %6787 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2459 = icmp eq i32 %6787, 0, !dbg !45 + %6788 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2458) #6, !dbg !45 + %6789 = tail call float @llvm.nvvm.saturate.f(float %.02.i2458) #6, !dbg !45 + %.03.i2460 = select i1 %.not1.i2459, float %6789, float %6788, !dbg !45 + %6790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2461 = icmp eq i32 %6790, 0, !dbg !45 + %6791 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6792 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2462 = select i1 %.not2.i2461, float %6792, float %6791, !dbg !45 + %6793 = fadd float %.04.i2462, 0xC168000FE0000000, !dbg !45 + %6794 = fneg float %6793, !dbg !45 + %6795 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2463 = icmp eq i32 %6795, 0, !dbg !45 + %6796 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3FF7154760000000, float %6794) #6, !dbg !45 + %6797 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3FF7154760000000, float %6794) #6, !dbg !45 + %.0.i2464 = select i1 %.not3.i2463, float %6797, float %6796, !dbg !45 + %6798 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2465 = icmp eq i32 %6798, 0, !dbg !45 + %6799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3E54AE0C00000000, float %.0.i2464) #6, !dbg !45 + %6800 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3E54AE0C00000000, float %.0.i2464) #6, !dbg !45 + %.01.i2466 = select i1 %.not4.i2465, float %6800, float %6799, !dbg !45 + %6801 = bitcast float %.04.i2462 to i32, !dbg !45 + %6802 = shl i32 %6801, 23, !dbg !45 + %6803 = bitcast i32 %6802 to float, !dbg !45 + %6804 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2466) #6, !dbg !45 + %6805 = fmul float %6804, %6803, !dbg !45 + %6806 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2467 = icmp eq i32 %6806, 0, !dbg !45 + %6807 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6808 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2468 = select i1 %.not.i2467, float %6808, float %6807, !dbg !45 + %6809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2469 = icmp eq i32 %6809, 0, !dbg !45 + %6810 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2468) #6, !dbg !45 + %6811 = tail call float @llvm.nvvm.saturate.f(float %.02.i2468) #6, !dbg !45 + %.03.i2470 = select i1 %.not1.i2469, float %6811, float %6810, !dbg !45 + %6812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2471 = icmp eq i32 %6812, 0, !dbg !45 + %6813 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6814 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2472 = select i1 %.not2.i2471, float %6814, float %6813, !dbg !45 + %6815 = fadd float %.04.i2472, 0xC168000FE0000000, !dbg !45 + %6816 = fneg float %6815, !dbg !45 + %6817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2473 = icmp eq i32 %6817, 0, !dbg !45 + %6818 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3FF7154760000000, float %6816) #6, !dbg !45 + %6819 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3FF7154760000000, float %6816) #6, !dbg !45 + %.0.i2474 = select i1 %.not3.i2473, float %6819, float %6818, !dbg !45 + %6820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2475 = icmp eq i32 %6820, 0, !dbg !45 + %6821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3E54AE0C00000000, float %.0.i2474) #6, !dbg !45 + %6822 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3E54AE0C00000000, float %.0.i2474) #6, !dbg !45 + %.01.i2476 = select i1 %.not4.i2475, float %6822, float %6821, !dbg !45 + %6823 = bitcast float %.04.i2472 to i32, !dbg !45 + %6824 = shl i32 %6823, 23, !dbg !45 + %6825 = bitcast i32 %6824 to float, !dbg !45 + %6826 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2476) #6, !dbg !45 + %6827 = fmul float %6826, %6825, !dbg !45 + %6828 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2477 = icmp eq i32 %6828, 0, !dbg !45 + %6829 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6830 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2478 = select i1 %.not.i2477, float %6830, float %6829, !dbg !45 + %6831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2479 = icmp eq i32 %6831, 0, !dbg !45 + %6832 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2478) #6, !dbg !45 + %6833 = tail call float @llvm.nvvm.saturate.f(float %.02.i2478) #6, !dbg !45 + %.03.i2480 = select i1 %.not1.i2479, float %6833, float %6832, !dbg !45 + %6834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2481 = icmp eq i32 %6834, 0, !dbg !45 + %6835 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6836 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2482 = select i1 %.not2.i2481, float %6836, float %6835, !dbg !45 + %6837 = fadd float %.04.i2482, 0xC168000FE0000000, !dbg !45 + %6838 = fneg float %6837, !dbg !45 + %6839 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2483 = icmp eq i32 %6839, 0, !dbg !45 + %6840 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3FF7154760000000, float %6838) #6, !dbg !45 + %6841 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3FF7154760000000, float %6838) #6, !dbg !45 + %.0.i2484 = select i1 %.not3.i2483, float %6841, float %6840, !dbg !45 + %6842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2485 = icmp eq i32 %6842, 0, !dbg !45 + %6843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3E54AE0C00000000, float %.0.i2484) #6, !dbg !45 + %6844 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3E54AE0C00000000, float %.0.i2484) #6, !dbg !45 + %.01.i2486 = select i1 %.not4.i2485, float %6844, float %6843, !dbg !45 + %6845 = bitcast float %.04.i2482 to i32, !dbg !45 + %6846 = shl i32 %6845, 23, !dbg !45 + %6847 = bitcast i32 %6846 to float, !dbg !45 + %6848 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2486) #6, !dbg !45 + %6849 = fmul float %6848, %6847, !dbg !45 + %6850 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2487 = icmp eq i32 %6850, 0, !dbg !45 + %6851 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6852 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2488 = select i1 %.not.i2487, float %6852, float %6851, !dbg !45 + %6853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2489 = icmp eq i32 %6853, 0, !dbg !45 + %6854 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2488) #6, !dbg !45 + %6855 = tail call float @llvm.nvvm.saturate.f(float %.02.i2488) #6, !dbg !45 + %.03.i2490 = select i1 %.not1.i2489, float %6855, float %6854, !dbg !45 + %6856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2491 = icmp eq i32 %6856, 0, !dbg !45 + %6857 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6858 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2492 = select i1 %.not2.i2491, float %6858, float %6857, !dbg !45 + %6859 = fadd float %.04.i2492, 0xC168000FE0000000, !dbg !45 + %6860 = fneg float %6859, !dbg !45 + %6861 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2493 = icmp eq i32 %6861, 0, !dbg !45 + %6862 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3FF7154760000000, float %6860) #6, !dbg !45 + %6863 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3FF7154760000000, float %6860) #6, !dbg !45 + %.0.i2494 = select i1 %.not3.i2493, float %6863, float %6862, !dbg !45 + %6864 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2495 = icmp eq i32 %6864, 0, !dbg !45 + %6865 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3E54AE0C00000000, float %.0.i2494) #6, !dbg !45 + %6866 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3E54AE0C00000000, float %.0.i2494) #6, !dbg !45 + %.01.i2496 = select i1 %.not4.i2495, float %6866, float %6865, !dbg !45 + %6867 = bitcast float %.04.i2492 to i32, !dbg !45 + %6868 = shl i32 %6867, 23, !dbg !45 + %6869 = bitcast i32 %6868 to float, !dbg !45 + %6870 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2496) #6, !dbg !45 + %6871 = fmul float %6870, %6869, !dbg !45 + %6872 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2497 = icmp eq i32 %6872, 0, !dbg !45 + %6873 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6874 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2498 = select i1 %.not.i2497, float %6874, float %6873, !dbg !45 + %6875 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2499 = icmp eq i32 %6875, 0, !dbg !45 + %6876 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2498) #6, !dbg !45 + %6877 = tail call float @llvm.nvvm.saturate.f(float %.02.i2498) #6, !dbg !45 + %.03.i2500 = select i1 %.not1.i2499, float %6877, float %6876, !dbg !45 + %6878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2501 = icmp eq i32 %6878, 0, !dbg !45 + %6879 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6880 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2502 = select i1 %.not2.i2501, float %6880, float %6879, !dbg !45 + %6881 = fadd float %.04.i2502, 0xC168000FE0000000, !dbg !45 + %6882 = fneg float %6881, !dbg !45 + %6883 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2503 = icmp eq i32 %6883, 0, !dbg !45 + %6884 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3FF7154760000000, float %6882) #6, !dbg !45 + %6885 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3FF7154760000000, float %6882) #6, !dbg !45 + %.0.i2504 = select i1 %.not3.i2503, float %6885, float %6884, !dbg !45 + %6886 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2505 = icmp eq i32 %6886, 0, !dbg !45 + %6887 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3E54AE0C00000000, float %.0.i2504) #6, !dbg !45 + %6888 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3E54AE0C00000000, float %.0.i2504) #6, !dbg !45 + %.01.i2506 = select i1 %.not4.i2505, float %6888, float %6887, !dbg !45 + %6889 = bitcast float %.04.i2502 to i32, !dbg !45 + %6890 = shl i32 %6889, 23, !dbg !45 + %6891 = bitcast i32 %6890 to float, !dbg !45 + %6892 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2506) #6, !dbg !45 + %6893 = fmul float %6892, %6891, !dbg !45 + %6894 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2507 = icmp eq i32 %6894, 0, !dbg !45 + %6895 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6896 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2508 = select i1 %.not.i2507, float %6896, float %6895, !dbg !45 + %6897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2509 = icmp eq i32 %6897, 0, !dbg !45 + %6898 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2508) #6, !dbg !45 + %6899 = tail call float @llvm.nvvm.saturate.f(float %.02.i2508) #6, !dbg !45 + %.03.i2510 = select i1 %.not1.i2509, float %6899, float %6898, !dbg !45 + %6900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2511 = icmp eq i32 %6900, 0, !dbg !45 + %6901 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6902 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2512 = select i1 %.not2.i2511, float %6902, float %6901, !dbg !45 + %6903 = fadd float %.04.i2512, 0xC168000FE0000000, !dbg !45 + %6904 = fneg float %6903, !dbg !45 + %6905 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2513 = icmp eq i32 %6905, 0, !dbg !45 + %6906 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3FF7154760000000, float %6904) #6, !dbg !45 + %6907 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3FF7154760000000, float %6904) #6, !dbg !45 + %.0.i2514 = select i1 %.not3.i2513, float %6907, float %6906, !dbg !45 + %6908 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2515 = icmp eq i32 %6908, 0, !dbg !45 + %6909 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3E54AE0C00000000, float %.0.i2514) #6, !dbg !45 + %6910 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3E54AE0C00000000, float %.0.i2514) #6, !dbg !45 + %.01.i2516 = select i1 %.not4.i2515, float %6910, float %6909, !dbg !45 + %6911 = bitcast float %.04.i2512 to i32, !dbg !45 + %6912 = shl i32 %6911, 23, !dbg !45 + %6913 = bitcast i32 %6912 to float, !dbg !45 + %6914 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2516) #6, !dbg !45 + %6915 = fmul float %6914, %6913, !dbg !45 + %6916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2517 = icmp eq i32 %6916, 0, !dbg !45 + %6917 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6918 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2518 = select i1 %.not.i2517, float %6918, float %6917, !dbg !45 + %6919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2519 = icmp eq i32 %6919, 0, !dbg !45 + %6920 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2518) #6, !dbg !45 + %6921 = tail call float @llvm.nvvm.saturate.f(float %.02.i2518) #6, !dbg !45 + %.03.i2520 = select i1 %.not1.i2519, float %6921, float %6920, !dbg !45 + %6922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2521 = icmp eq i32 %6922, 0, !dbg !45 + %6923 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6924 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2522 = select i1 %.not2.i2521, float %6924, float %6923, !dbg !45 + %6925 = fadd float %.04.i2522, 0xC168000FE0000000, !dbg !45 + %6926 = fneg float %6925, !dbg !45 + %6927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2523 = icmp eq i32 %6927, 0, !dbg !45 + %6928 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3FF7154760000000, float %6926) #6, !dbg !45 + %6929 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3FF7154760000000, float %6926) #6, !dbg !45 + %.0.i2524 = select i1 %.not3.i2523, float %6929, float %6928, !dbg !45 + %6930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2525 = icmp eq i32 %6930, 0, !dbg !45 + %6931 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3E54AE0C00000000, float %.0.i2524) #6, !dbg !45 + %6932 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3E54AE0C00000000, float %.0.i2524) #6, !dbg !45 + %.01.i2526 = select i1 %.not4.i2525, float %6932, float %6931, !dbg !45 + %6933 = bitcast float %.04.i2522 to i32, !dbg !45 + %6934 = shl i32 %6933, 23, !dbg !45 + %6935 = bitcast i32 %6934 to float, !dbg !45 + %6936 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2526) #6, !dbg !45 + %6937 = fmul float %6936, %6935, !dbg !45 + %6938 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2527 = icmp eq i32 %6938, 0, !dbg !45 + %6939 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6940 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2528 = select i1 %.not.i2527, float %6940, float %6939, !dbg !45 + %6941 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2529 = icmp eq i32 %6941, 0, !dbg !45 + %6942 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2528) #6, !dbg !45 + %6943 = tail call float @llvm.nvvm.saturate.f(float %.02.i2528) #6, !dbg !45 + %.03.i2530 = select i1 %.not1.i2529, float %6943, float %6942, !dbg !45 + %6944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2531 = icmp eq i32 %6944, 0, !dbg !45 + %6945 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6946 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2532 = select i1 %.not2.i2531, float %6946, float %6945, !dbg !45 + %6947 = fadd float %.04.i2532, 0xC168000FE0000000, !dbg !45 + %6948 = fneg float %6947, !dbg !45 + %6949 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2533 = icmp eq i32 %6949, 0, !dbg !45 + %6950 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3FF7154760000000, float %6948) #6, !dbg !45 + %6951 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3FF7154760000000, float %6948) #6, !dbg !45 + %.0.i2534 = select i1 %.not3.i2533, float %6951, float %6950, !dbg !45 + %6952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2535 = icmp eq i32 %6952, 0, !dbg !45 + %6953 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3E54AE0C00000000, float %.0.i2534) #6, !dbg !45 + %6954 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3E54AE0C00000000, float %.0.i2534) #6, !dbg !45 + %.01.i2536 = select i1 %.not4.i2535, float %6954, float %6953, !dbg !45 + %6955 = bitcast float %.04.i2532 to i32, !dbg !45 + %6956 = shl i32 %6955, 23, !dbg !45 + %6957 = bitcast i32 %6956 to float, !dbg !45 + %6958 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2536) #6, !dbg !45 + %6959 = fmul float %6958, %6957, !dbg !45 + %6960 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2537 = icmp eq i32 %6960, 0, !dbg !45 + %6961 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6962 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2538 = select i1 %.not.i2537, float %6962, float %6961, !dbg !45 + %6963 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2539 = icmp eq i32 %6963, 0, !dbg !45 + %6964 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2538) #6, !dbg !45 + %6965 = tail call float @llvm.nvvm.saturate.f(float %.02.i2538) #6, !dbg !45 + %.03.i2540 = select i1 %.not1.i2539, float %6965, float %6964, !dbg !45 + %6966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2541 = icmp eq i32 %6966, 0, !dbg !45 + %6967 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6968 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2542 = select i1 %.not2.i2541, float %6968, float %6967, !dbg !45 + %6969 = fadd float %.04.i2542, 0xC168000FE0000000, !dbg !45 + %6970 = fneg float %6969, !dbg !45 + %6971 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2543 = icmp eq i32 %6971, 0, !dbg !45 + %6972 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3FF7154760000000, float %6970) #6, !dbg !45 + %6973 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3FF7154760000000, float %6970) #6, !dbg !45 + %.0.i2544 = select i1 %.not3.i2543, float %6973, float %6972, !dbg !45 + %6974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2545 = icmp eq i32 %6974, 0, !dbg !45 + %6975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3E54AE0C00000000, float %.0.i2544) #6, !dbg !45 + %6976 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3E54AE0C00000000, float %.0.i2544) #6, !dbg !45 + %.01.i2546 = select i1 %.not4.i2545, float %6976, float %6975, !dbg !45 + %6977 = bitcast float %.04.i2542 to i32, !dbg !45 + %6978 = shl i32 %6977, 23, !dbg !45 + %6979 = bitcast i32 %6978 to float, !dbg !45 + %6980 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2546) #6, !dbg !45 + %6981 = fmul float %6980, %6979, !dbg !45 + %6982 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2547 = icmp eq i32 %6982, 0, !dbg !45 + %6983 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6984 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2548 = select i1 %.not.i2547, float %6984, float %6983, !dbg !45 + %6985 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2549 = icmp eq i32 %6985, 0, !dbg !45 + %6986 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2548) #6, !dbg !45 + %6987 = tail call float @llvm.nvvm.saturate.f(float %.02.i2548) #6, !dbg !45 + %.03.i2550 = select i1 %.not1.i2549, float %6987, float %6986, !dbg !45 + %6988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2551 = icmp eq i32 %6988, 0, !dbg !45 + %6989 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6990 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2552 = select i1 %.not2.i2551, float %6990, float %6989, !dbg !45 + %6991 = fadd float %.04.i2552, 0xC168000FE0000000, !dbg !45 + %6992 = fneg float %6991, !dbg !45 + %6993 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2553 = icmp eq i32 %6993, 0, !dbg !45 + %6994 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3FF7154760000000, float %6992) #6, !dbg !45 + %6995 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3FF7154760000000, float %6992) #6, !dbg !45 + %.0.i2554 = select i1 %.not3.i2553, float %6995, float %6994, !dbg !45 + %6996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2555 = icmp eq i32 %6996, 0, !dbg !45 + %6997 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3E54AE0C00000000, float %.0.i2554) #6, !dbg !45 + %6998 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3E54AE0C00000000, float %.0.i2554) #6, !dbg !45 + %.01.i2556 = select i1 %.not4.i2555, float %6998, float %6997, !dbg !45 + %6999 = bitcast float %.04.i2552 to i32, !dbg !45 + %7000 = shl i32 %6999, 23, !dbg !45 + %7001 = bitcast i32 %7000 to float, !dbg !45 + %7002 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2556) #6, !dbg !45 + %7003 = fmul float %7002, %7001, !dbg !45 + %7004 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2557 = icmp eq i32 %7004, 0, !dbg !45 + %7005 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7006 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2558 = select i1 %.not.i2557, float %7006, float %7005, !dbg !45 + %7007 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2559 = icmp eq i32 %7007, 0, !dbg !45 + %7008 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2558) #6, !dbg !45 + %7009 = tail call float @llvm.nvvm.saturate.f(float %.02.i2558) #6, !dbg !45 + %.03.i2560 = select i1 %.not1.i2559, float %7009, float %7008, !dbg !45 + %7010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2561 = icmp eq i32 %7010, 0, !dbg !45 + %7011 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7012 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2562 = select i1 %.not2.i2561, float %7012, float %7011, !dbg !45 + %7013 = fadd float %.04.i2562, 0xC168000FE0000000, !dbg !45 + %7014 = fneg float %7013, !dbg !45 + %7015 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2563 = icmp eq i32 %7015, 0, !dbg !45 + %7016 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3FF7154760000000, float %7014) #6, !dbg !45 + %7017 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3FF7154760000000, float %7014) #6, !dbg !45 + %.0.i2564 = select i1 %.not3.i2563, float %7017, float %7016, !dbg !45 + %7018 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2565 = icmp eq i32 %7018, 0, !dbg !45 + %7019 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3E54AE0C00000000, float %.0.i2564) #6, !dbg !45 + %7020 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3E54AE0C00000000, float %.0.i2564) #6, !dbg !45 + %.01.i2566 = select i1 %.not4.i2565, float %7020, float %7019, !dbg !45 + %7021 = bitcast float %.04.i2562 to i32, !dbg !45 + %7022 = shl i32 %7021, 23, !dbg !45 + %7023 = bitcast i32 %7022 to float, !dbg !45 + %7024 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2566) #6, !dbg !45 + %7025 = fmul float %7024, %7023, !dbg !45 + %7026 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2567 = icmp eq i32 %7026, 0, !dbg !45 + %7027 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7028 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2568 = select i1 %.not.i2567, float %7028, float %7027, !dbg !45 + %7029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2569 = icmp eq i32 %7029, 0, !dbg !45 + %7030 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2568) #6, !dbg !45 + %7031 = tail call float @llvm.nvvm.saturate.f(float %.02.i2568) #6, !dbg !45 + %.03.i2570 = select i1 %.not1.i2569, float %7031, float %7030, !dbg !45 + %7032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2571 = icmp eq i32 %7032, 0, !dbg !45 + %7033 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7034 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2572 = select i1 %.not2.i2571, float %7034, float %7033, !dbg !45 + %7035 = fadd float %.04.i2572, 0xC168000FE0000000, !dbg !45 + %7036 = fneg float %7035, !dbg !45 + %7037 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2573 = icmp eq i32 %7037, 0, !dbg !45 + %7038 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3FF7154760000000, float %7036) #6, !dbg !45 + %7039 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3FF7154760000000, float %7036) #6, !dbg !45 + %.0.i2574 = select i1 %.not3.i2573, float %7039, float %7038, !dbg !45 + %7040 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2575 = icmp eq i32 %7040, 0, !dbg !45 + %7041 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3E54AE0C00000000, float %.0.i2574) #6, !dbg !45 + %7042 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3E54AE0C00000000, float %.0.i2574) #6, !dbg !45 + %.01.i2576 = select i1 %.not4.i2575, float %7042, float %7041, !dbg !45 + %7043 = bitcast float %.04.i2572 to i32, !dbg !45 + %7044 = shl i32 %7043, 23, !dbg !45 + %7045 = bitcast i32 %7044 to float, !dbg !45 + %7046 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2576) #6, !dbg !45 + %7047 = fmul float %7046, %7045, !dbg !45 + %7048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2577 = icmp eq i32 %7048, 0, !dbg !45 + %7049 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7050 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2578 = select i1 %.not.i2577, float %7050, float %7049, !dbg !45 + %7051 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2579 = icmp eq i32 %7051, 0, !dbg !45 + %7052 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2578) #6, !dbg !45 + %7053 = tail call float @llvm.nvvm.saturate.f(float %.02.i2578) #6, !dbg !45 + %.03.i2580 = select i1 %.not1.i2579, float %7053, float %7052, !dbg !45 + %7054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2581 = icmp eq i32 %7054, 0, !dbg !45 + %7055 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7056 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2582 = select i1 %.not2.i2581, float %7056, float %7055, !dbg !45 + %7057 = fadd float %.04.i2582, 0xC168000FE0000000, !dbg !45 + %7058 = fneg float %7057, !dbg !45 + %7059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2583 = icmp eq i32 %7059, 0, !dbg !45 + %7060 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3FF7154760000000, float %7058) #6, !dbg !45 + %7061 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3FF7154760000000, float %7058) #6, !dbg !45 + %.0.i2584 = select i1 %.not3.i2583, float %7061, float %7060, !dbg !45 + %7062 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2585 = icmp eq i32 %7062, 0, !dbg !45 + %7063 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3E54AE0C00000000, float %.0.i2584) #6, !dbg !45 + %7064 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3E54AE0C00000000, float %.0.i2584) #6, !dbg !45 + %.01.i2586 = select i1 %.not4.i2585, float %7064, float %7063, !dbg !45 + %7065 = bitcast float %.04.i2582 to i32, !dbg !45 + %7066 = shl i32 %7065, 23, !dbg !45 + %7067 = bitcast i32 %7066 to float, !dbg !45 + %7068 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2586) #6, !dbg !45 + %7069 = fmul float %7068, %7067, !dbg !45 + %7070 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2587 = icmp eq i32 %7070, 0, !dbg !45 + %7071 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7072 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2588 = select i1 %.not.i2587, float %7072, float %7071, !dbg !45 + %7073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2589 = icmp eq i32 %7073, 0, !dbg !45 + %7074 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2588) #6, !dbg !45 + %7075 = tail call float @llvm.nvvm.saturate.f(float %.02.i2588) #6, !dbg !45 + %.03.i2590 = select i1 %.not1.i2589, float %7075, float %7074, !dbg !45 + %7076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2591 = icmp eq i32 %7076, 0, !dbg !45 + %7077 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7078 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2592 = select i1 %.not2.i2591, float %7078, float %7077, !dbg !45 + %7079 = fadd float %.04.i2592, 0xC168000FE0000000, !dbg !45 + %7080 = fneg float %7079, !dbg !45 + %7081 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2593 = icmp eq i32 %7081, 0, !dbg !45 + %7082 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3FF7154760000000, float %7080) #6, !dbg !45 + %7083 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3FF7154760000000, float %7080) #6, !dbg !45 + %.0.i2594 = select i1 %.not3.i2593, float %7083, float %7082, !dbg !45 + %7084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2595 = icmp eq i32 %7084, 0, !dbg !45 + %7085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3E54AE0C00000000, float %.0.i2594) #6, !dbg !45 + %7086 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3E54AE0C00000000, float %.0.i2594) #6, !dbg !45 + %.01.i2596 = select i1 %.not4.i2595, float %7086, float %7085, !dbg !45 + %7087 = bitcast float %.04.i2592 to i32, !dbg !45 + %7088 = shl i32 %7087, 23, !dbg !45 + %7089 = bitcast i32 %7088 to float, !dbg !45 + %7090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2596) #6, !dbg !45 + %7091 = fmul float %7090, %7089, !dbg !45 + %7092 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2597 = icmp eq i32 %7092, 0, !dbg !45 + %7093 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7094 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2598 = select i1 %.not.i2597, float %7094, float %7093, !dbg !45 + %7095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2599 = icmp eq i32 %7095, 0, !dbg !45 + %7096 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2598) #6, !dbg !45 + %7097 = tail call float @llvm.nvvm.saturate.f(float %.02.i2598) #6, !dbg !45 + %.03.i2600 = select i1 %.not1.i2599, float %7097, float %7096, !dbg !45 + %7098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2601 = icmp eq i32 %7098, 0, !dbg !45 + %7099 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7100 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2602 = select i1 %.not2.i2601, float %7100, float %7099, !dbg !45 + %7101 = fadd float %.04.i2602, 0xC168000FE0000000, !dbg !45 + %7102 = fneg float %7101, !dbg !45 + %7103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2603 = icmp eq i32 %7103, 0, !dbg !45 + %7104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3FF7154760000000, float %7102) #6, !dbg !45 + %7105 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3FF7154760000000, float %7102) #6, !dbg !45 + %.0.i2604 = select i1 %.not3.i2603, float %7105, float %7104, !dbg !45 + %7106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2605 = icmp eq i32 %7106, 0, !dbg !45 + %7107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3E54AE0C00000000, float %.0.i2604) #6, !dbg !45 + %7108 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3E54AE0C00000000, float %.0.i2604) #6, !dbg !45 + %.01.i2606 = select i1 %.not4.i2605, float %7108, float %7107, !dbg !45 + %7109 = bitcast float %.04.i2602 to i32, !dbg !45 + %7110 = shl i32 %7109, 23, !dbg !45 + %7111 = bitcast i32 %7110 to float, !dbg !45 + %7112 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2606) #6, !dbg !45 + %7113 = fmul float %7112, %7111, !dbg !45 + %7114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2607 = icmp eq i32 %7114, 0, !dbg !45 + %7115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7116 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2608 = select i1 %.not.i2607, float %7116, float %7115, !dbg !45 + %7117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2609 = icmp eq i32 %7117, 0, !dbg !45 + %7118 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2608) #6, !dbg !45 + %7119 = tail call float @llvm.nvvm.saturate.f(float %.02.i2608) #6, !dbg !45 + %.03.i2610 = select i1 %.not1.i2609, float %7119, float %7118, !dbg !45 + %7120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2611 = icmp eq i32 %7120, 0, !dbg !45 + %7121 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7122 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2612 = select i1 %.not2.i2611, float %7122, float %7121, !dbg !45 + %7123 = fadd float %.04.i2612, 0xC168000FE0000000, !dbg !45 + %7124 = fneg float %7123, !dbg !45 + %7125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2613 = icmp eq i32 %7125, 0, !dbg !45 + %7126 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3FF7154760000000, float %7124) #6, !dbg !45 + %7127 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3FF7154760000000, float %7124) #6, !dbg !45 + %.0.i2614 = select i1 %.not3.i2613, float %7127, float %7126, !dbg !45 + %7128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2615 = icmp eq i32 %7128, 0, !dbg !45 + %7129 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3E54AE0C00000000, float %.0.i2614) #6, !dbg !45 + %7130 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3E54AE0C00000000, float %.0.i2614) #6, !dbg !45 + %.01.i2616 = select i1 %.not4.i2615, float %7130, float %7129, !dbg !45 + %7131 = bitcast float %.04.i2612 to i32, !dbg !45 + %7132 = shl i32 %7131, 23, !dbg !45 + %7133 = bitcast i32 %7132 to float, !dbg !45 + %7134 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2616) #6, !dbg !45 + %7135 = fmul float %7134, %7133, !dbg !45 + %7136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2617 = icmp eq i32 %7136, 0, !dbg !45 + %7137 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7138 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2618 = select i1 %.not.i2617, float %7138, float %7137, !dbg !45 + %7139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2619 = icmp eq i32 %7139, 0, !dbg !45 + %7140 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2618) #6, !dbg !45 + %7141 = tail call float @llvm.nvvm.saturate.f(float %.02.i2618) #6, !dbg !45 + %.03.i2620 = select i1 %.not1.i2619, float %7141, float %7140, !dbg !45 + %7142 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2621 = icmp eq i32 %7142, 0, !dbg !45 + %7143 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7144 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2622 = select i1 %.not2.i2621, float %7144, float %7143, !dbg !45 + %7145 = fadd float %.04.i2622, 0xC168000FE0000000, !dbg !45 + %7146 = fneg float %7145, !dbg !45 + %7147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2623 = icmp eq i32 %7147, 0, !dbg !45 + %7148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3FF7154760000000, float %7146) #6, !dbg !45 + %7149 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3FF7154760000000, float %7146) #6, !dbg !45 + %.0.i2624 = select i1 %.not3.i2623, float %7149, float %7148, !dbg !45 + %7150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2625 = icmp eq i32 %7150, 0, !dbg !45 + %7151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3E54AE0C00000000, float %.0.i2624) #6, !dbg !45 + %7152 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3E54AE0C00000000, float %.0.i2624) #6, !dbg !45 + %.01.i2626 = select i1 %.not4.i2625, float %7152, float %7151, !dbg !45 + %7153 = bitcast float %.04.i2622 to i32, !dbg !45 + %7154 = shl i32 %7153, 23, !dbg !45 + %7155 = bitcast i32 %7154 to float, !dbg !45 + %7156 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2626) #6, !dbg !45 + %7157 = fmul float %7156, %7155, !dbg !45 + %7158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2627 = icmp eq i32 %7158, 0, !dbg !45 + %7159 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7160 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2628 = select i1 %.not.i2627, float %7160, float %7159, !dbg !45 + %7161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2629 = icmp eq i32 %7161, 0, !dbg !45 + %7162 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2628) #6, !dbg !45 + %7163 = tail call float @llvm.nvvm.saturate.f(float %.02.i2628) #6, !dbg !45 + %.03.i2630 = select i1 %.not1.i2629, float %7163, float %7162, !dbg !45 + %7164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2631 = icmp eq i32 %7164, 0, !dbg !45 + %7165 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7166 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2632 = select i1 %.not2.i2631, float %7166, float %7165, !dbg !45 + %7167 = fadd float %.04.i2632, 0xC168000FE0000000, !dbg !45 + %7168 = fneg float %7167, !dbg !45 + %7169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2633 = icmp eq i32 %7169, 0, !dbg !45 + %7170 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3FF7154760000000, float %7168) #6, !dbg !45 + %7171 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3FF7154760000000, float %7168) #6, !dbg !45 + %.0.i2634 = select i1 %.not3.i2633, float %7171, float %7170, !dbg !45 + %7172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2635 = icmp eq i32 %7172, 0, !dbg !45 + %7173 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3E54AE0C00000000, float %.0.i2634) #6, !dbg !45 + %7174 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3E54AE0C00000000, float %.0.i2634) #6, !dbg !45 + %.01.i2636 = select i1 %.not4.i2635, float %7174, float %7173, !dbg !45 + %7175 = bitcast float %.04.i2632 to i32, !dbg !45 + %7176 = shl i32 %7175, 23, !dbg !45 + %7177 = bitcast i32 %7176 to float, !dbg !45 + %7178 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2636) #6, !dbg !45 + %7179 = fmul float %7178, %7177, !dbg !45 + %7180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2637 = icmp eq i32 %7180, 0, !dbg !45 + %7181 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7182 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2638 = select i1 %.not.i2637, float %7182, float %7181, !dbg !45 + %7183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2639 = icmp eq i32 %7183, 0, !dbg !45 + %7184 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2638) #6, !dbg !45 + %7185 = tail call float @llvm.nvvm.saturate.f(float %.02.i2638) #6, !dbg !45 + %.03.i2640 = select i1 %.not1.i2639, float %7185, float %7184, !dbg !45 + %7186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2641 = icmp eq i32 %7186, 0, !dbg !45 + %7187 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7188 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2642 = select i1 %.not2.i2641, float %7188, float %7187, !dbg !45 + %7189 = fadd float %.04.i2642, 0xC168000FE0000000, !dbg !45 + %7190 = fneg float %7189, !dbg !45 + %7191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2643 = icmp eq i32 %7191, 0, !dbg !45 + %7192 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3FF7154760000000, float %7190) #6, !dbg !45 + %7193 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3FF7154760000000, float %7190) #6, !dbg !45 + %.0.i2644 = select i1 %.not3.i2643, float %7193, float %7192, !dbg !45 + %7194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2645 = icmp eq i32 %7194, 0, !dbg !45 + %7195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3E54AE0C00000000, float %.0.i2644) #6, !dbg !45 + %7196 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3E54AE0C00000000, float %.0.i2644) #6, !dbg !45 + %.01.i2646 = select i1 %.not4.i2645, float %7196, float %7195, !dbg !45 + %7197 = bitcast float %.04.i2642 to i32, !dbg !45 + %7198 = shl i32 %7197, 23, !dbg !45 + %7199 = bitcast i32 %7198 to float, !dbg !45 + %7200 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2646) #6, !dbg !45 + %7201 = fmul float %7200, %7199, !dbg !45 + %7202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2647 = icmp eq i32 %7202, 0, !dbg !45 + %7203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7204 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2648 = select i1 %.not.i2647, float %7204, float %7203, !dbg !45 + %7205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2649 = icmp eq i32 %7205, 0, !dbg !45 + %7206 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2648) #6, !dbg !45 + %7207 = tail call float @llvm.nvvm.saturate.f(float %.02.i2648) #6, !dbg !45 + %.03.i2650 = select i1 %.not1.i2649, float %7207, float %7206, !dbg !45 + %7208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2651 = icmp eq i32 %7208, 0, !dbg !45 + %7209 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7210 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2652 = select i1 %.not2.i2651, float %7210, float %7209, !dbg !45 + %7211 = fadd float %.04.i2652, 0xC168000FE0000000, !dbg !45 + %7212 = fneg float %7211, !dbg !45 + %7213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2653 = icmp eq i32 %7213, 0, !dbg !45 + %7214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3FF7154760000000, float %7212) #6, !dbg !45 + %7215 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3FF7154760000000, float %7212) #6, !dbg !45 + %.0.i2654 = select i1 %.not3.i2653, float %7215, float %7214, !dbg !45 + %7216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2655 = icmp eq i32 %7216, 0, !dbg !45 + %7217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3E54AE0C00000000, float %.0.i2654) #6, !dbg !45 + %7218 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3E54AE0C00000000, float %.0.i2654) #6, !dbg !45 + %.01.i2656 = select i1 %.not4.i2655, float %7218, float %7217, !dbg !45 + %7219 = bitcast float %.04.i2652 to i32, !dbg !45 + %7220 = shl i32 %7219, 23, !dbg !45 + %7221 = bitcast i32 %7220 to float, !dbg !45 + %7222 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2656) #6, !dbg !45 + %7223 = fmul float %7222, %7221, !dbg !45 + %7224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2657 = icmp eq i32 %7224, 0, !dbg !45 + %7225 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7226 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2658 = select i1 %.not.i2657, float %7226, float %7225, !dbg !45 + %7227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2659 = icmp eq i32 %7227, 0, !dbg !45 + %7228 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2658) #6, !dbg !45 + %7229 = tail call float @llvm.nvvm.saturate.f(float %.02.i2658) #6, !dbg !45 + %.03.i2660 = select i1 %.not1.i2659, float %7229, float %7228, !dbg !45 + %7230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2661 = icmp eq i32 %7230, 0, !dbg !45 + %7231 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7232 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2662 = select i1 %.not2.i2661, float %7232, float %7231, !dbg !45 + %7233 = fadd float %.04.i2662, 0xC168000FE0000000, !dbg !45 + %7234 = fneg float %7233, !dbg !45 + %7235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2663 = icmp eq i32 %7235, 0, !dbg !45 + %7236 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3FF7154760000000, float %7234) #6, !dbg !45 + %7237 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3FF7154760000000, float %7234) #6, !dbg !45 + %.0.i2664 = select i1 %.not3.i2663, float %7237, float %7236, !dbg !45 + %7238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2665 = icmp eq i32 %7238, 0, !dbg !45 + %7239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3E54AE0C00000000, float %.0.i2664) #6, !dbg !45 + %7240 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3E54AE0C00000000, float %.0.i2664) #6, !dbg !45 + %.01.i2666 = select i1 %.not4.i2665, float %7240, float %7239, !dbg !45 + %7241 = bitcast float %.04.i2662 to i32, !dbg !45 + %7242 = shl i32 %7241, 23, !dbg !45 + %7243 = bitcast i32 %7242 to float, !dbg !45 + %7244 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2666) #6, !dbg !45 + %7245 = fmul float %7244, %7243, !dbg !45 + %7246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2667 = icmp eq i32 %7246, 0, !dbg !45 + %7247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7248 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2668 = select i1 %.not.i2667, float %7248, float %7247, !dbg !45 + %7249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2669 = icmp eq i32 %7249, 0, !dbg !45 + %7250 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2668) #6, !dbg !45 + %7251 = tail call float @llvm.nvvm.saturate.f(float %.02.i2668) #6, !dbg !45 + %.03.i2670 = select i1 %.not1.i2669, float %7251, float %7250, !dbg !45 + %7252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2671 = icmp eq i32 %7252, 0, !dbg !45 + %7253 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7254 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2672 = select i1 %.not2.i2671, float %7254, float %7253, !dbg !45 + %7255 = fadd float %.04.i2672, 0xC168000FE0000000, !dbg !45 + %7256 = fneg float %7255, !dbg !45 + %7257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2673 = icmp eq i32 %7257, 0, !dbg !45 + %7258 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3FF7154760000000, float %7256) #6, !dbg !45 + %7259 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3FF7154760000000, float %7256) #6, !dbg !45 + %.0.i2674 = select i1 %.not3.i2673, float %7259, float %7258, !dbg !45 + %7260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2675 = icmp eq i32 %7260, 0, !dbg !45 + %7261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3E54AE0C00000000, float %.0.i2674) #6, !dbg !45 + %7262 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3E54AE0C00000000, float %.0.i2674) #6, !dbg !45 + %.01.i2676 = select i1 %.not4.i2675, float %7262, float %7261, !dbg !45 + %7263 = bitcast float %.04.i2672 to i32, !dbg !45 + %7264 = shl i32 %7263, 23, !dbg !45 + %7265 = bitcast i32 %7264 to float, !dbg !45 + %7266 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2676) #6, !dbg !45 + %7267 = fmul float %7266, %7265, !dbg !45 + %7268 = tail call float @llvm.nvvm.div.full(float %5881, float %5587), !dbg !46 + %7269 = tail call float @llvm.nvvm.div.full(float %5903, float %5587), !dbg !46 + %7270 = tail call float @llvm.nvvm.div.full(float %5925, float %5587), !dbg !46 + %7271 = tail call float @llvm.nvvm.div.full(float %5947, float %5587), !dbg !46 + %7272 = tail call float @llvm.nvvm.div.full(float %5969, float %5587), !dbg !46 + %7273 = tail call float @llvm.nvvm.div.full(float %5991, float %5587), !dbg !46 + %7274 = tail call float @llvm.nvvm.div.full(float %6013, float %5587), !dbg !46 + %7275 = tail call float @llvm.nvvm.div.full(float %6035, float %5587), !dbg !46 + %7276 = tail call float @llvm.nvvm.div.full(float %6057, float %5587), !dbg !46 + %7277 = tail call float @llvm.nvvm.div.full(float %6079, float %5587), !dbg !46 + %7278 = tail call float @llvm.nvvm.div.full(float %6101, float %5587), !dbg !46 + %7279 = tail call float @llvm.nvvm.div.full(float %6123, float %5587), !dbg !46 + %7280 = tail call float @llvm.nvvm.div.full(float %6145, float %5587), !dbg !46 + %7281 = tail call float @llvm.nvvm.div.full(float %6167, float %5587), !dbg !46 + %7282 = tail call float @llvm.nvvm.div.full(float %6189, float %5587), !dbg !46 + %7283 = tail call float @llvm.nvvm.div.full(float %6211, float %5587), !dbg !46 + %7284 = tail call float @llvm.nvvm.div.full(float %6233, float %5587), !dbg !46 + %7285 = tail call float @llvm.nvvm.div.full(float %6255, float %5587), !dbg !46 + %7286 = tail call float @llvm.nvvm.div.full(float %6277, float %5587), !dbg !46 + %7287 = tail call float @llvm.nvvm.div.full(float %6299, float %5587), !dbg !46 + %7288 = tail call float @llvm.nvvm.div.full(float %6321, float %5587), !dbg !46 + %7289 = tail call float @llvm.nvvm.div.full(float %6343, float %5587), !dbg !46 + %7290 = tail call float @llvm.nvvm.div.full(float %6365, float %5587), !dbg !46 + %7291 = tail call float @llvm.nvvm.div.full(float %6387, float %5587), !dbg !46 + %7292 = tail call float @llvm.nvvm.div.full(float %6409, float %5587), !dbg !46 + %7293 = tail call float @llvm.nvvm.div.full(float %6431, float %5587), !dbg !46 + %7294 = tail call float @llvm.nvvm.div.full(float %6453, float %5587), !dbg !46 + %7295 = tail call float @llvm.nvvm.div.full(float %6475, float %5587), !dbg !46 + %7296 = tail call float @llvm.nvvm.div.full(float %6497, float %5587), !dbg !46 + %7297 = tail call float @llvm.nvvm.div.full(float %6519, float %5587), !dbg !46 + %7298 = tail call float @llvm.nvvm.div.full(float %6541, float %5587), !dbg !46 + %7299 = tail call float @llvm.nvvm.div.full(float %6563, float %5587), !dbg !46 + %7300 = tail call float @llvm.nvvm.div.full(float %6585, float %5587), !dbg !46 + %7301 = tail call float @llvm.nvvm.div.full(float %6607, float %5587), !dbg !46 + %7302 = tail call float @llvm.nvvm.div.full(float %6629, float %5587), !dbg !46 + %7303 = tail call float @llvm.nvvm.div.full(float %6651, float %5587), !dbg !46 + %7304 = tail call float @llvm.nvvm.div.full(float %6673, float %5587), !dbg !46 + %7305 = tail call float @llvm.nvvm.div.full(float %6695, float %5587), !dbg !46 + %7306 = tail call float @llvm.nvvm.div.full(float %6717, float %5587), !dbg !46 + %7307 = tail call float @llvm.nvvm.div.full(float %6739, float %5587), !dbg !46 + %7308 = tail call float @llvm.nvvm.div.full(float %6761, float %5587), !dbg !46 + %7309 = tail call float @llvm.nvvm.div.full(float %6783, float %5587), !dbg !46 + %7310 = tail call float @llvm.nvvm.div.full(float %6805, float %5587), !dbg !46 + %7311 = tail call float @llvm.nvvm.div.full(float %6827, float %5587), !dbg !46 + %7312 = tail call float @llvm.nvvm.div.full(float %6849, float %5587), !dbg !46 + %7313 = tail call float @llvm.nvvm.div.full(float %6871, float %5587), !dbg !46 + %7314 = tail call float @llvm.nvvm.div.full(float %6893, float %5587), !dbg !46 + %7315 = tail call float @llvm.nvvm.div.full(float %6915, float %5587), !dbg !46 + %7316 = tail call float @llvm.nvvm.div.full(float %6937, float %5587), !dbg !46 + %7317 = tail call float @llvm.nvvm.div.full(float %6959, float %5587), !dbg !46 + %7318 = tail call float @llvm.nvvm.div.full(float %6981, float %5587), !dbg !46 + %7319 = tail call float @llvm.nvvm.div.full(float %7003, float %5587), !dbg !46 + %7320 = tail call float @llvm.nvvm.div.full(float %7025, float %5587), !dbg !46 + %7321 = tail call float @llvm.nvvm.div.full(float %7047, float %5587), !dbg !46 + %7322 = tail call float @llvm.nvvm.div.full(float %7069, float %5587), !dbg !46 + %7323 = tail call float @llvm.nvvm.div.full(float %7091, float %5587), !dbg !46 + %7324 = tail call float @llvm.nvvm.div.full(float %7113, float %5587), !dbg !46 + %7325 = tail call float @llvm.nvvm.div.full(float %7135, float %5587), !dbg !46 + %7326 = tail call float @llvm.nvvm.div.full(float %7157, float %5587), !dbg !46 + %7327 = tail call float @llvm.nvvm.div.full(float %7179, float %5587), !dbg !46 + %7328 = tail call float @llvm.nvvm.div.full(float %7201, float %5587), !dbg !46 + %7329 = tail call float @llvm.nvvm.div.full(float %7223, float %5587), !dbg !46 + %7330 = tail call float @llvm.nvvm.div.full(float %7245, float %5587), !dbg !46 + %7331 = tail call float @llvm.nvvm.div.full(float %7267, float %5587), !dbg !46 + %7332 = sext i32 %33 to i64, !dbg !47 + %7333 = getelementptr float, ptr addrspace(1) %1, i64 %7332, !dbg !47 + %7334 = sext i32 %35 to i64, !dbg !47 + %7335 = getelementptr float, ptr addrspace(1) %1, i64 %7334, !dbg !47 + %7336 = sext i32 %36 to i64, !dbg !47 + %7337 = getelementptr float, ptr addrspace(1) %1, i64 %7336, !dbg !47 + %7338 = sext i32 %38 to i64, !dbg !47 + %7339 = getelementptr float, ptr addrspace(1) %1, i64 %7338, !dbg !47 + %7340 = sext i32 %39 to i64, !dbg !47 + %7341 = getelementptr float, ptr addrspace(1) %1, i64 %7340, !dbg !47 + %7342 = sext i32 %41 to i64, !dbg !47 + %7343 = getelementptr float, ptr addrspace(1) %1, i64 %7342, !dbg !47 + %7344 = sext i32 %42 to i64, !dbg !47 + %7345 = getelementptr float, ptr addrspace(1) %1, i64 %7344, !dbg !47 + %7346 = sext i32 %44 to i64, !dbg !47 + %7347 = getelementptr float, ptr addrspace(1) %1, i64 %7346, !dbg !47 + %7348 = sext i32 %45 to i64, !dbg !47 + %7349 = getelementptr float, ptr addrspace(1) %1, i64 %7348, !dbg !47 + %7350 = sext i32 %47 to i64, !dbg !47 + %7351 = getelementptr float, ptr addrspace(1) %1, i64 %7350, !dbg !47 + %7352 = sext i32 %48 to i64, !dbg !47 + %7353 = getelementptr float, ptr addrspace(1) %1, i64 %7352, !dbg !47 + %7354 = sext i32 %50 to i64, !dbg !47 + %7355 = getelementptr float, ptr addrspace(1) %1, i64 %7354, !dbg !47 + %7356 = sext i32 %51 to i64, !dbg !47 + %7357 = getelementptr float, ptr addrspace(1) %1, i64 %7356, !dbg !47 + %7358 = sext i32 %53 to i64, !dbg !47 + %7359 = getelementptr float, ptr addrspace(1) %1, i64 %7358, !dbg !47 + %7360 = sext i32 %55 to i64, !dbg !47 + %7361 = getelementptr float, ptr addrspace(1) %1, i64 %7360, !dbg !47 + %7362 = sext i32 %56 to i64, !dbg !47 + %7363 = getelementptr float, ptr addrspace(1) %1, i64 %7362, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7364 = shl nuw nsw i32 %11, 4, !dbg !48 + %7365 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7364, !dbg !48 + %7366 = insertelement <4 x float> poison, float %7268, i64 0, !dbg !48 + %7367 = insertelement <4 x float> %7366, float %7269, i64 1, !dbg !48 + %7368 = insertelement <4 x float> %7367, float %7270, i64 2, !dbg !48 + %7369 = insertelement <4 x float> %7368, float %7271, i64 3, !dbg !48 + store <4 x float> %7369, ptr addrspace(3) %7365, align 16, !dbg !48 + %7370 = xor i32 %7364, 8256, !dbg !48 + %7371 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7370, !dbg !48 + %7372 = insertelement <4 x float> poison, float %7272, i64 0, !dbg !48 + %7373 = insertelement <4 x float> %7372, float %7273, i64 1, !dbg !48 + %7374 = insertelement <4 x float> %7373, float %7274, i64 2, !dbg !48 + %7375 = insertelement <4 x float> %7374, float %7275, i64 3, !dbg !48 + store <4 x float> %7375, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7376 = shl nuw nsw i32 %8, 3, !dbg !48 + %7377 = and i32 %7376, 4080, !dbg !48 + %7378 = and i32 %8, 1, !dbg !48 + %7379 = icmp eq i32 %7378, 0, !dbg !48 + %7380 = select i1 %7379, i32 0, i32 8256, !dbg !48 + %7381 = xor i32 %7380, %7377, !dbg !48 + %7382 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7381, !dbg !48 + %7383 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7384 = getelementptr inbounds nuw i8, ptr addrspace(3) %7382, i32 4096, !dbg !48 + %7385 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7386 = insertelement <4 x float> poison, float %7276, i64 0, !dbg !48 + %7387 = insertelement <4 x float> %7386, float %7277, i64 1, !dbg !48 + %7388 = insertelement <4 x float> %7387, float %7278, i64 2, !dbg !48 + %7389 = insertelement <4 x float> %7388, float %7279, i64 3, !dbg !48 + store <4 x float> %7389, ptr addrspace(3) %7365, align 16, !dbg !48 + %7390 = insertelement <4 x float> poison, float %7280, i64 0, !dbg !48 + %7391 = insertelement <4 x float> %7390, float %7281, i64 1, !dbg !48 + %7392 = insertelement <4 x float> %7391, float %7282, i64 2, !dbg !48 + %7393 = insertelement <4 x float> %7392, float %7283, i64 3, !dbg !48 + store <4 x float> %7393, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7394 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7395 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7396 = insertelement <4 x float> poison, float %7284, i64 0, !dbg !48 + %7397 = insertelement <4 x float> %7396, float %7285, i64 1, !dbg !48 + %7398 = insertelement <4 x float> %7397, float %7286, i64 2, !dbg !48 + %7399 = insertelement <4 x float> %7398, float %7287, i64 3, !dbg !48 + store <4 x float> %7399, ptr addrspace(3) %7365, align 16, !dbg !48 + %7400 = insertelement <4 x float> poison, float %7288, i64 0, !dbg !48 + %7401 = insertelement <4 x float> %7400, float %7289, i64 1, !dbg !48 + %7402 = insertelement <4 x float> %7401, float %7290, i64 2, !dbg !48 + %7403 = insertelement <4 x float> %7402, float %7291, i64 3, !dbg !48 + store <4 x float> %7403, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7404 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7405 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7406 = insertelement <4 x float> poison, float %7292, i64 0, !dbg !48 + %7407 = insertelement <4 x float> %7406, float %7293, i64 1, !dbg !48 + %7408 = insertelement <4 x float> %7407, float %7294, i64 2, !dbg !48 + %7409 = insertelement <4 x float> %7408, float %7295, i64 3, !dbg !48 + store <4 x float> %7409, ptr addrspace(3) %7365, align 16, !dbg !48 + %7410 = insertelement <4 x float> poison, float %7296, i64 0, !dbg !48 + %7411 = insertelement <4 x float> %7410, float %7297, i64 1, !dbg !48 + %7412 = insertelement <4 x float> %7411, float %7298, i64 2, !dbg !48 + %7413 = insertelement <4 x float> %7412, float %7299, i64 3, !dbg !48 + store <4 x float> %7413, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7414 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7415 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7416 = insertelement <4 x float> poison, float %7300, i64 0, !dbg !48 + %7417 = insertelement <4 x float> %7416, float %7301, i64 1, !dbg !48 + %7418 = insertelement <4 x float> %7417, float %7302, i64 2, !dbg !48 + %7419 = insertelement <4 x float> %7418, float %7303, i64 3, !dbg !48 + store <4 x float> %7419, ptr addrspace(3) %7365, align 16, !dbg !48 + %7420 = insertelement <4 x float> poison, float %7304, i64 0, !dbg !48 + %7421 = insertelement <4 x float> %7420, float %7305, i64 1, !dbg !48 + %7422 = insertelement <4 x float> %7421, float %7306, i64 2, !dbg !48 + %7423 = insertelement <4 x float> %7422, float %7307, i64 3, !dbg !48 + store <4 x float> %7423, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7424 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7425 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7426 = insertelement <4 x float> poison, float %7308, i64 0, !dbg !48 + %7427 = insertelement <4 x float> %7426, float %7309, i64 1, !dbg !48 + %7428 = insertelement <4 x float> %7427, float %7310, i64 2, !dbg !48 + %7429 = insertelement <4 x float> %7428, float %7311, i64 3, !dbg !48 + store <4 x float> %7429, ptr addrspace(3) %7365, align 16, !dbg !48 + %7430 = insertelement <4 x float> poison, float %7312, i64 0, !dbg !48 + %7431 = insertelement <4 x float> %7430, float %7313, i64 1, !dbg !48 + %7432 = insertelement <4 x float> %7431, float %7314, i64 2, !dbg !48 + %7433 = insertelement <4 x float> %7432, float %7315, i64 3, !dbg !48 + store <4 x float> %7433, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7434 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7435 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7436 = insertelement <4 x float> poison, float %7316, i64 0, !dbg !48 + %7437 = insertelement <4 x float> %7436, float %7317, i64 1, !dbg !48 + %7438 = insertelement <4 x float> %7437, float %7318, i64 2, !dbg !48 + %7439 = insertelement <4 x float> %7438, float %7319, i64 3, !dbg !48 + store <4 x float> %7439, ptr addrspace(3) %7365, align 16, !dbg !48 + %7440 = insertelement <4 x float> poison, float %7320, i64 0, !dbg !48 + %7441 = insertelement <4 x float> %7440, float %7321, i64 1, !dbg !48 + %7442 = insertelement <4 x float> %7441, float %7322, i64 2, !dbg !48 + %7443 = insertelement <4 x float> %7442, float %7323, i64 3, !dbg !48 + store <4 x float> %7443, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7444 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7445 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7446 = insertelement <4 x float> poison, float %7324, i64 0, !dbg !48 + %7447 = insertelement <4 x float> %7446, float %7325, i64 1, !dbg !48 + %7448 = insertelement <4 x float> %7447, float %7326, i64 2, !dbg !48 + %7449 = insertelement <4 x float> %7448, float %7327, i64 3, !dbg !48 + store <4 x float> %7449, ptr addrspace(3) %7365, align 16, !dbg !48 + %7450 = insertelement <4 x float> poison, float %7328, i64 0, !dbg !48 + %7451 = insertelement <4 x float> %7450, float %7329, i64 1, !dbg !48 + %7452 = insertelement <4 x float> %7451, float %7330, i64 2, !dbg !48 + %7453 = insertelement <4 x float> %7452, float %7331, i64 3, !dbg !48 + store <4 x float> %7453, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7454 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7455 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + %.extract = extractelement <4 x i32> %7383, i64 0, !dbg !48 + %.extract64 = extractelement <4 x i32> %7383, i64 1, !dbg !48 + %.extract65 = extractelement <4 x i32> %7383, i64 2, !dbg !48 + %.extract66 = extractelement <4 x i32> %7383, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract, i32 %.extract64, i32 %.extract65, i32 %.extract66, ptr addrspace(1) %7333, i1 true) #6, !dbg !48 + %.extract67 = extractelement <4 x i32> %7385, i64 0, !dbg !48 + %.extract68 = extractelement <4 x i32> %7385, i64 1, !dbg !48 + %.extract69 = extractelement <4 x i32> %7385, i64 2, !dbg !48 + %.extract70 = extractelement <4 x i32> %7385, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract67, i32 %.extract68, i32 %.extract69, i32 %.extract70, ptr addrspace(1) %7335, i1 true) #6, !dbg !48 + %.extract71 = extractelement <4 x i32> %7394, i64 0, !dbg !48 + %.extract72 = extractelement <4 x i32> %7394, i64 1, !dbg !48 + %.extract73 = extractelement <4 x i32> %7394, i64 2, !dbg !48 + %.extract74 = extractelement <4 x i32> %7394, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract71, i32 %.extract72, i32 %.extract73, i32 %.extract74, ptr addrspace(1) %7337, i1 true) #6, !dbg !48 + %.extract75 = extractelement <4 x i32> %7395, i64 0, !dbg !48 + %.extract76 = extractelement <4 x i32> %7395, i64 1, !dbg !48 + %.extract77 = extractelement <4 x i32> %7395, i64 2, !dbg !48 + %.extract78 = extractelement <4 x i32> %7395, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract75, i32 %.extract76, i32 %.extract77, i32 %.extract78, ptr addrspace(1) %7339, i1 true) #6, !dbg !48 + %.extract79 = extractelement <4 x i32> %7404, i64 0, !dbg !48 + %.extract80 = extractelement <4 x i32> %7404, i64 1, !dbg !48 + %.extract81 = extractelement <4 x i32> %7404, i64 2, !dbg !48 + %.extract82 = extractelement <4 x i32> %7404, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract79, i32 %.extract80, i32 %.extract81, i32 %.extract82, ptr addrspace(1) %7341, i1 true) #6, !dbg !48 + %.extract83 = extractelement <4 x i32> %7405, i64 0, !dbg !48 + %.extract84 = extractelement <4 x i32> %7405, i64 1, !dbg !48 + %.extract85 = extractelement <4 x i32> %7405, i64 2, !dbg !48 + %.extract86 = extractelement <4 x i32> %7405, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract83, i32 %.extract84, i32 %.extract85, i32 %.extract86, ptr addrspace(1) %7343, i1 true) #6, !dbg !48 + %.extract87 = extractelement <4 x i32> %7414, i64 0, !dbg !48 + %.extract88 = extractelement <4 x i32> %7414, i64 1, !dbg !48 + %.extract89 = extractelement <4 x i32> %7414, i64 2, !dbg !48 + %.extract90 = extractelement <4 x i32> %7414, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract87, i32 %.extract88, i32 %.extract89, i32 %.extract90, ptr addrspace(1) %7345, i1 true) #6, !dbg !48 + %.extract91 = extractelement <4 x i32> %7415, i64 0, !dbg !48 + %.extract92 = extractelement <4 x i32> %7415, i64 1, !dbg !48 + %.extract93 = extractelement <4 x i32> %7415, i64 2, !dbg !48 + %.extract94 = extractelement <4 x i32> %7415, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract91, i32 %.extract92, i32 %.extract93, i32 %.extract94, ptr addrspace(1) %7347, i1 true) #6, !dbg !48 + %.extract95 = extractelement <4 x i32> %7424, i64 0, !dbg !48 + %.extract96 = extractelement <4 x i32> %7424, i64 1, !dbg !48 + %.extract97 = extractelement <4 x i32> %7424, i64 2, !dbg !48 + %.extract98 = extractelement <4 x i32> %7424, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract95, i32 %.extract96, i32 %.extract97, i32 %.extract98, ptr addrspace(1) %7349, i1 true) #6, !dbg !48 + %.extract99 = extractelement <4 x i32> %7425, i64 0, !dbg !48 + %.extract100 = extractelement <4 x i32> %7425, i64 1, !dbg !48 + %.extract101 = extractelement <4 x i32> %7425, i64 2, !dbg !48 + %.extract102 = extractelement <4 x i32> %7425, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract99, i32 %.extract100, i32 %.extract101, i32 %.extract102, ptr addrspace(1) %7351, i1 true) #6, !dbg !48 + %.extract103 = extractelement <4 x i32> %7434, i64 0, !dbg !48 + %.extract104 = extractelement <4 x i32> %7434, i64 1, !dbg !48 + %.extract105 = extractelement <4 x i32> %7434, i64 2, !dbg !48 + %.extract106 = extractelement <4 x i32> %7434, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract103, i32 %.extract104, i32 %.extract105, i32 %.extract106, ptr addrspace(1) %7353, i1 true) #6, !dbg !48 + %.extract107 = extractelement <4 x i32> %7435, i64 0, !dbg !48 + %.extract108 = extractelement <4 x i32> %7435, i64 1, !dbg !48 + %.extract109 = extractelement <4 x i32> %7435, i64 2, !dbg !48 + %.extract110 = extractelement <4 x i32> %7435, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract107, i32 %.extract108, i32 %.extract109, i32 %.extract110, ptr addrspace(1) %7355, i1 true) #6, !dbg !48 + %.extract111 = extractelement <4 x i32> %7444, i64 0, !dbg !48 + %.extract112 = extractelement <4 x i32> %7444, i64 1, !dbg !48 + %.extract113 = extractelement <4 x i32> %7444, i64 2, !dbg !48 + %.extract114 = extractelement <4 x i32> %7444, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract111, i32 %.extract112, i32 %.extract113, i32 %.extract114, ptr addrspace(1) %7357, i1 true) #6, !dbg !48 + %.extract115 = extractelement <4 x i32> %7445, i64 0, !dbg !48 + %.extract116 = extractelement <4 x i32> %7445, i64 1, !dbg !48 + %.extract117 = extractelement <4 x i32> %7445, i64 2, !dbg !48 + %.extract118 = extractelement <4 x i32> %7445, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract115, i32 %.extract116, i32 %.extract117, i32 %.extract118, ptr addrspace(1) %7359, i1 true) #6, !dbg !48 + %.extract119 = extractelement <4 x i32> %7454, i64 0, !dbg !48 + %.extract120 = extractelement <4 x i32> %7454, i64 1, !dbg !48 + %.extract121 = extractelement <4 x i32> %7454, i64 2, !dbg !48 + %.extract122 = extractelement <4 x i32> %7454, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract119, i32 %.extract120, i32 %.extract121, i32 %.extract122, ptr addrspace(1) %7361, i1 true) #6, !dbg !48 + %.extract123 = extractelement <4 x i32> %7455, i64 0, !dbg !48 + %.extract124 = extractelement <4 x i32> %7455, i64 1, !dbg !48 + %.extract125 = extractelement <4 x i32> %7455, i64 2, !dbg !48 + %.extract126 = extractelement <4 x i32> %7455, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract123, i32 %.extract124, i32 %.extract125, i32 %.extract126, ptr addrspace(1) %7363, i1 %17) #6, !dbg !48 + ret void, !dbg !49 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 33, column: 29, scope: !5) +!11 = !DILocation(line: 37, column: 47, scope: !5) +!12 = !DILocation(line: 37, column: 41, scope: !5) +!13 = !DILocation(line: 37, column: 34, scope: !5) +!14 = !DILocation(line: 37, column: 52, scope: !5) +!15 = !DILocation(line: 37, column: 105, scope: !5) +!16 = !DILocation(line: 196, column: 19, scope: !17, inlinedAt: !19) +!17 = distinct !DILexicalBlockFile(scope: !5, file: !18, discriminator: 0) +!18 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!19 = !DILocation(line: 42, column: 40, scope: !5) +!20 = !DILocation(line: 196, column: 53, scope: !17, inlinedAt: !19) +!21 = !DILocation(line: 173, column: 29, scope: !17, inlinedAt: !19) +!22 = !DILocation(line: 199, column: 53, scope: !17, inlinedAt: !19) +!23 = !DILocation(line: 205, column: 24, scope: !17, inlinedAt: !19) +!24 = !DILocation(line: 205, column: 36, scope: !17, inlinedAt: !19) +!25 = !DILocation(line: 45, column: 54, scope: !5) +!26 = !DILocation(line: 46, column: 54, scope: !5) +!27 = !DILocation(line: 110, column: 15, scope: !17, inlinedAt: !28) +!28 = !DILocation(line: 49, column: 33, scope: !5) +!29 = !DILocation(line: 112, column: 21, scope: !17, inlinedAt: !28) +!30 = !DILocation(line: 112, column: 16, scope: !17, inlinedAt: !28) +!31 = !DILocation(line: 113, column: 29, scope: !17, inlinedAt: !28) +!32 = !DILocation(line: 123, column: 29, scope: !17, inlinedAt: !28) +!33 = !DILocation(line: 180, column: 40, scope: !17, inlinedAt: !28) +!34 = !DILocation(line: 180, column: 68, scope: !17, inlinedAt: !28) +!35 = !DILocation(line: 180, column: 58, scope: !17, inlinedAt: !28) +!36 = !DILocation(line: 173, column: 29, scope: !17, inlinedAt: !28) +!37 = !DILocation(line: 181, column: 31, scope: !17, inlinedAt: !28) +!38 = !DILocation(line: 291, column: 36, scope: !39, inlinedAt: !28) +!39 = distinct !DILexicalBlockFile(scope: !5, file: !40, discriminator: 0) +!40 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!41 = !DILocation(line: 261, column: 15, scope: !39, inlinedAt: !28) +!42 = !DILocation(line: 58, column: 52, scope: !5) +!43 = !DILocation(line: 58, column: 106, scope: !5) +!44 = !DILocation(line: 60, column: 22, scope: !5) +!45 = !DILocation(line: 61, column: 29, scope: !5) +!46 = !DILocation(line: 62, column: 23, scope: !5) +!47 = !DILocation(line: 63, column: 29, scope: !5) +!48 = !DILocation(line: 63, column: 53, scope: !5) +!49 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b88f677fdb894fab37a14d69a6adb95fdf8a163e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,4793 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<248>; + .reg .b16 %rs<130>; + .reg .b32 %r<3729>; + .reg .b64 %rd<112>; + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 + +// %bb.0: + ld.param.b64 %rd65, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; + ld.param.b64 %rd66, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; +$L__tmp0: + .loc 1 23 28 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:23:28 + mov.u32 %r205, %ctaid.x; + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + mov.u32 %r206, %tid.x; + and.b32 %r207, %r206, 31; + and.b32 %r208, %r206, 511; + shl.b32 %r209, %r208, 3; + or.b32 %r210, %r209, 28672; + shl.b32 %r211, %r208, 2; + or.b32 %r212, %r211, 30720; + .loc 1 33 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:33:29 + setp.lt.u32 %p8, %r210, 32000; + setp.lt.u32 %p38, %r212, 32000; + .loc 1 37 47 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:47 + mul.lo.s32 %r213, %r205, 32000; + .loc 1 37 41 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:41 + add.s32 %r214, %r209, %r213; + add.s32 %r215, %r214, 4096; + add.s32 %r216, %r214, 8192; + add.s32 %r217, %r214, 12288; + add.s32 %r218, %r214, 16384; + add.s32 %r219, %r214, 20480; + add.s32 %r220, %r214, 24576; + add.s32 %r221, %r210, %r213; + sub.s32 %r222, %r214, %r211; + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + add.s32 %r223, %r213, %r211; + .loc 1 37 41 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:41 + add.s32 %r224, %r223, 2048; + sub.s32 %r225, %r215, %r211; + add.s32 %r226, %r223, 6144; + sub.s32 %r227, %r216, %r211; + add.s32 %r228, %r223, 10240; + sub.s32 %r229, %r217, %r211; + add.s32 %r230, %r223, 14336; + sub.s32 %r231, %r218, %r211; + add.s32 %r232, %r223, 18432; + sub.s32 %r233, %r219, %r211; + add.s32 %r234, %r223, 22528; + sub.s32 %r235, %r220, %r211; + add.s32 %r236, %r223, 26624; + add.s32 %r237, %r223, 28672; + add.s32 %r238, %r212, %r213; + .loc 1 37 34 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:34 + mad.wide.s32 %rd2, %r214, 2, %rd65; + mad.wide.s32 %rd5, %r215, 2, %rd65; + mad.wide.s32 %rd8, %r216, 2, %rd65; + mad.wide.s32 %rd11, %r217, 2, %rd65; + mad.wide.s32 %rd14, %r218, 2, %rd65; + mad.wide.s32 %rd17, %r219, 2, %rd65; + mad.wide.s32 %rd20, %r220, 2, %rd65; + mad.wide.s32 %rd23, %r221, 2, %rd65; + .loc 1 37 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:52 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + mov.b32 {%rs9, %rs10}, %r9; + mov.b32 {%rs11, %rs12}, %r10; + mov.b32 {%rs13, %rs14}, %r11; + mov.b32 {%rs15, %rs16}, %r12; + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + mov.b32 {%rs17, %rs18}, %r17; + mov.b32 {%rs19, %rs20}, %r18; + mov.b32 {%rs21, %rs22}, %r19; + mov.b32 {%rs23, %rs24}, %r20; + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + mov.b32 {%rs25, %rs26}, %r25; + mov.b32 {%rs27, %rs28}, %r26; + mov.b32 {%rs29, %rs30}, %r27; + mov.b32 {%rs31, %rs32}, %r28; + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r5; + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd14 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs33, %rs34}, %r33; + mov.b32 {%rs35, %rs36}, %r34; + mov.b32 {%rs37, %rs38}, %r35; + mov.b32 {%rs39, %rs40}, %r36; + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r5; + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd17 + 0 ], %rd18; + // end inline asm + mov.b32 {%rs41, %rs42}, %r41; + mov.b32 {%rs43, %rs44}, %r42; + mov.b32 {%rs45, %rs46}, %r43; + mov.b32 {%rs47, %rs48}, %r44; + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r49, %r5; + mov.u32 %r50, %r5; + mov.u32 %r51, %r5; + mov.u32 %r52, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd20 + 0 ], %rd21; + // end inline asm + mov.b32 {%rs49, %rs50}, %r49; + mov.b32 {%rs51, %rs52}, %r50; + mov.b32 {%rs53, %rs54}, %r51; + mov.b32 {%rs55, %rs56}, %r52; + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r57, %r5; + mov.u32 %r58, %r5; + mov.u32 %r59, %r5; + mov.u32 %r60, %r5; + @%p8 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd23 + 0 ], %rd24; + // end inline asm + mov.b32 {%rs57, %rs58}, %r57; + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + cvt.f32.bf16 %r239, %rs1; + cvt.f32.bf16 %r240, %rs2; + cvt.f32.bf16 %r241, %rs3; + cvt.f32.bf16 %r242, %rs4; + cvt.f32.bf16 %r243, %rs5; + cvt.f32.bf16 %r244, %rs6; + cvt.f32.bf16 %r245, %rs7; + cvt.f32.bf16 %r246, %rs8; + cvt.f32.bf16 %r247, %rs9; + cvt.f32.bf16 %r248, %rs10; + cvt.f32.bf16 %r249, %rs11; + cvt.f32.bf16 %r250, %rs12; + cvt.f32.bf16 %r251, %rs13; + cvt.f32.bf16 %r252, %rs14; + cvt.f32.bf16 %r253, %rs15; + cvt.f32.bf16 %r254, %rs16; + cvt.f32.bf16 %r255, %rs17; + cvt.f32.bf16 %r256, %rs18; + cvt.f32.bf16 %r257, %rs19; + cvt.f32.bf16 %r258, %rs20; + cvt.f32.bf16 %r259, %rs21; + cvt.f32.bf16 %r260, %rs22; + cvt.f32.bf16 %r261, %rs23; + cvt.f32.bf16 %r262, %rs24; + cvt.f32.bf16 %r263, %rs25; + cvt.f32.bf16 %r264, %rs26; + cvt.f32.bf16 %r265, %rs27; + cvt.f32.bf16 %r266, %rs28; + cvt.f32.bf16 %r267, %rs29; + cvt.f32.bf16 %r268, %rs30; + cvt.f32.bf16 %r269, %rs31; + cvt.f32.bf16 %r270, %rs32; + cvt.f32.bf16 %r271, %rs33; + cvt.f32.bf16 %r272, %rs34; + cvt.f32.bf16 %r273, %rs35; + cvt.f32.bf16 %r274, %rs36; + cvt.f32.bf16 %r275, %rs37; + cvt.f32.bf16 %r276, %rs38; + cvt.f32.bf16 %r277, %rs39; + cvt.f32.bf16 %r278, %rs40; + cvt.f32.bf16 %r279, %rs41; + cvt.f32.bf16 %r280, %rs42; + cvt.f32.bf16 %r281, %rs43; + cvt.f32.bf16 %r282, %rs44; + cvt.f32.bf16 %r283, %rs45; + cvt.f32.bf16 %r284, %rs46; + cvt.f32.bf16 %r285, %rs47; + cvt.f32.bf16 %r286, %rs48; + cvt.f32.bf16 %r287, %rs49; + cvt.f32.bf16 %r288, %rs50; + cvt.f32.bf16 %r289, %rs51; + cvt.f32.bf16 %r290, %rs52; + cvt.f32.bf16 %r291, %rs53; + cvt.f32.bf16 %r292, %rs54; + cvt.f32.bf16 %r293, %rs55; + cvt.f32.bf16 %r294, %rs56; + cvt.f32.bf16 %r295, %rs57; + cvt.f32.bf16 %r296, %rs58; + mov.b32 {%rs59, %rs60}, %r58; + cvt.f32.bf16 %r297, %rs59; + cvt.f32.bf16 %r298, %rs60; + mov.b32 {%rs61, %rs62}, %r59; + cvt.f32.bf16 %r299, %rs61; + cvt.f32.bf16 %r300, %rs62; + mov.b32 {%rs63, %rs64}, %r60; + cvt.f32.bf16 %r301, %rs63; + cvt.f32.bf16 %r302, %rs64; + mov.b16 %rs65, 0xFF80; +$L__tmp1: + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.bf16 %p39, %rs1, %rs65; + setp.eq.bf16 %p40, %rs2, %rs65; + setp.eq.bf16 %p41, %rs3, %rs65; + setp.eq.bf16 %p42, %rs4, %rs65; + setp.eq.bf16 %p43, %rs5, %rs65; + setp.eq.bf16 %p44, %rs6, %rs65; + setp.eq.bf16 %p45, %rs7, %rs65; + setp.eq.bf16 %p46, %rs8, %rs65; + setp.eq.bf16 %p47, %rs9, %rs65; + setp.eq.bf16 %p48, %rs10, %rs65; + setp.eq.bf16 %p49, %rs11, %rs65; + setp.eq.bf16 %p50, %rs12, %rs65; + setp.eq.bf16 %p51, %rs13, %rs65; + setp.eq.bf16 %p52, %rs14, %rs65; + setp.eq.bf16 %p53, %rs15, %rs65; + setp.eq.bf16 %p54, %rs16, %rs65; + setp.eq.bf16 %p55, %rs17, %rs65; + setp.eq.bf16 %p56, %rs18, %rs65; + setp.eq.bf16 %p57, %rs19, %rs65; + setp.eq.bf16 %p58, %rs20, %rs65; + setp.eq.bf16 %p59, %rs21, %rs65; + setp.eq.bf16 %p60, %rs22, %rs65; + setp.eq.bf16 %p61, %rs23, %rs65; + setp.eq.bf16 %p62, %rs24, %rs65; + setp.eq.bf16 %p63, %rs25, %rs65; + setp.eq.bf16 %p64, %rs26, %rs65; + setp.eq.bf16 %p65, %rs27, %rs65; + setp.eq.bf16 %p66, %rs28, %rs65; + setp.eq.bf16 %p67, %rs29, %rs65; + setp.eq.bf16 %p68, %rs30, %rs65; + setp.eq.bf16 %p69, %rs31, %rs65; + setp.eq.bf16 %p70, %rs32, %rs65; + setp.eq.bf16 %p71, %rs33, %rs65; + setp.eq.bf16 %p72, %rs34, %rs65; + setp.eq.bf16 %p73, %rs35, %rs65; + setp.eq.bf16 %p74, %rs36, %rs65; + setp.eq.bf16 %p75, %rs37, %rs65; + setp.eq.bf16 %p76, %rs38, %rs65; + setp.eq.bf16 %p77, %rs39, %rs65; + setp.eq.bf16 %p78, %rs40, %rs65; + setp.eq.bf16 %p79, %rs41, %rs65; + setp.eq.bf16 %p80, %rs42, %rs65; + setp.eq.bf16 %p81, %rs43, %rs65; + setp.eq.bf16 %p82, %rs44, %rs65; + setp.eq.bf16 %p83, %rs45, %rs65; + setp.eq.bf16 %p84, %rs46, %rs65; + setp.eq.bf16 %p85, %rs47, %rs65; + setp.eq.bf16 %p86, %rs48, %rs65; + setp.eq.bf16 %p87, %rs49, %rs65; + setp.eq.bf16 %p88, %rs50, %rs65; + setp.eq.bf16 %p89, %rs51, %rs65; + setp.eq.bf16 %p90, %rs52, %rs65; + setp.eq.bf16 %p91, %rs53, %rs65; + setp.eq.bf16 %p92, %rs54, %rs65; + setp.eq.bf16 %p93, %rs55, %rs65; + setp.eq.bf16 %p94, %rs56, %rs65; + setp.eq.bf16 %p95, %rs57, %rs65; + setp.eq.bf16 %p96, %rs58, %rs65; + mov.b32 %r303, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r304, %r303, %r239; + sub.f32 %r305, %r303, %r240; + sub.f32 %r306, %r303, %r241; + sub.f32 %r307, %r303, %r242; + sub.f32 %r308, %r303, %r243; + sub.f32 %r309, %r303, %r244; + sub.f32 %r310, %r303, %r245; + sub.f32 %r311, %r303, %r246; + sub.f32 %r312, %r303, %r247; + sub.f32 %r313, %r303, %r248; + sub.f32 %r314, %r303, %r249; + sub.f32 %r315, %r303, %r250; + sub.f32 %r316, %r303, %r251; + sub.f32 %r317, %r303, %r252; + sub.f32 %r318, %r303, %r253; + sub.f32 %r319, %r303, %r254; + sub.f32 %r320, %r303, %r255; + sub.f32 %r321, %r303, %r256; + sub.f32 %r322, %r303, %r257; + sub.f32 %r323, %r303, %r258; + sub.f32 %r324, %r303, %r259; + sub.f32 %r325, %r303, %r260; + sub.f32 %r326, %r303, %r261; + sub.f32 %r327, %r303, %r262; + sub.f32 %r328, %r303, %r263; + sub.f32 %r329, %r303, %r264; + sub.f32 %r330, %r303, %r265; + sub.f32 %r331, %r303, %r266; + sub.f32 %r332, %r303, %r267; + sub.f32 %r333, %r303, %r268; + sub.f32 %r334, %r303, %r269; + sub.f32 %r335, %r303, %r270; + sub.f32 %r336, %r303, %r271; + sub.f32 %r337, %r303, %r272; + sub.f32 %r338, %r303, %r273; + sub.f32 %r339, %r303, %r274; + sub.f32 %r340, %r303, %r275; + sub.f32 %r341, %r303, %r276; + sub.f32 %r342, %r303, %r277; + sub.f32 %r343, %r303, %r278; + sub.f32 %r344, %r303, %r279; + sub.f32 %r345, %r303, %r280; + sub.f32 %r346, %r303, %r281; + sub.f32 %r347, %r303, %r282; + sub.f32 %r348, %r303, %r283; + sub.f32 %r349, %r303, %r284; + sub.f32 %r350, %r303, %r285; + sub.f32 %r351, %r303, %r286; + sub.f32 %r352, %r303, %r287; + sub.f32 %r353, %r303, %r288; + sub.f32 %r354, %r303, %r289; + sub.f32 %r355, %r303, %r290; + sub.f32 %r356, %r303, %r291; + sub.f32 %r357, %r303, %r292; + sub.f32 %r358, %r303, %r293; + sub.f32 %r359, %r303, %r294; + sub.f32 %r360, %r303, %r295; + sub.f32 %r361, %r303, %r296; + sub.f32 %r362, %r303, %r297; + sub.f32 %r363, %r303, %r298; + sub.f32 %r364, %r303, %r299; + sub.f32 %r365, %r303, %r300; + sub.f32 %r366, %r303, %r301; + sub.f32 %r367, %r303, %r302; + mov.b32 %r368, 0f3F000000; + mov.b32 %r369, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r370, %r304, %r369, %r368; + cvt.ftz.sat.f32.f32 %r371, %r370; + mov.b32 %r372, 0f4B400001; + mov.b32 %r373, 0f437C0000; + fma.rm.ftz.f32 %r374, %r371, %r373, %r372; + add.f32 %r375, %r374, 0fCB40007F; + neg.f32 %r376, %r375; + mov.b32 %r377, 0f3FB8AA3B; + fma.rn.ftz.f32 %r378, %r304, %r377, %r376; + mov.b32 %r379, 0f32A57060; + fma.rn.ftz.f32 %r380, %r304, %r379, %r378; + shl.b32 %r381, %r374, 23; + ex2.approx.ftz.f32 %r382, %r380; + mul.f32 %r383, %r382, %r381; + fma.rn.ftz.f32 %r384, %r305, %r369, %r368; + cvt.ftz.sat.f32.f32 %r385, %r384; + fma.rm.ftz.f32 %r386, %r385, %r373, %r372; + add.f32 %r387, %r386, 0fCB40007F; + neg.f32 %r388, %r387; + fma.rn.ftz.f32 %r389, %r305, %r377, %r388; + fma.rn.ftz.f32 %r390, %r305, %r379, %r389; + shl.b32 %r391, %r386, 23; + ex2.approx.ftz.f32 %r392, %r390; + mul.f32 %r393, %r392, %r391; + fma.rn.ftz.f32 %r394, %r306, %r369, %r368; + cvt.ftz.sat.f32.f32 %r395, %r394; + fma.rm.ftz.f32 %r396, %r395, %r373, %r372; + add.f32 %r397, %r396, 0fCB40007F; + neg.f32 %r398, %r397; + fma.rn.ftz.f32 %r399, %r306, %r377, %r398; + fma.rn.ftz.f32 %r400, %r306, %r379, %r399; + shl.b32 %r401, %r396, 23; + ex2.approx.ftz.f32 %r402, %r400; + mul.f32 %r403, %r402, %r401; + fma.rn.ftz.f32 %r404, %r307, %r369, %r368; + cvt.ftz.sat.f32.f32 %r405, %r404; + fma.rm.ftz.f32 %r406, %r405, %r373, %r372; + add.f32 %r407, %r406, 0fCB40007F; + neg.f32 %r408, %r407; + fma.rn.ftz.f32 %r409, %r307, %r377, %r408; + fma.rn.ftz.f32 %r410, %r307, %r379, %r409; + shl.b32 %r411, %r406, 23; + ex2.approx.ftz.f32 %r412, %r410; + mul.f32 %r413, %r412, %r411; + fma.rn.ftz.f32 %r414, %r308, %r369, %r368; + cvt.ftz.sat.f32.f32 %r415, %r414; + fma.rm.ftz.f32 %r416, %r415, %r373, %r372; + add.f32 %r417, %r416, 0fCB40007F; + neg.f32 %r418, %r417; + fma.rn.ftz.f32 %r419, %r308, %r377, %r418; + fma.rn.ftz.f32 %r420, %r308, %r379, %r419; + shl.b32 %r421, %r416, 23; + ex2.approx.ftz.f32 %r422, %r420; + mul.f32 %r423, %r422, %r421; + fma.rn.ftz.f32 %r424, %r309, %r369, %r368; + cvt.ftz.sat.f32.f32 %r425, %r424; + fma.rm.ftz.f32 %r426, %r425, %r373, %r372; + add.f32 %r427, %r426, 0fCB40007F; + neg.f32 %r428, %r427; + fma.rn.ftz.f32 %r429, %r309, %r377, %r428; + fma.rn.ftz.f32 %r430, %r309, %r379, %r429; + shl.b32 %r431, %r426, 23; + ex2.approx.ftz.f32 %r432, %r430; + mul.f32 %r433, %r432, %r431; + fma.rn.ftz.f32 %r434, %r310, %r369, %r368; + cvt.ftz.sat.f32.f32 %r435, %r434; + fma.rm.ftz.f32 %r436, %r435, %r373, %r372; + add.f32 %r437, %r436, 0fCB40007F; + neg.f32 %r438, %r437; + fma.rn.ftz.f32 %r439, %r310, %r377, %r438; + fma.rn.ftz.f32 %r440, %r310, %r379, %r439; + shl.b32 %r441, %r436, 23; + ex2.approx.ftz.f32 %r442, %r440; + mul.f32 %r443, %r442, %r441; + fma.rn.ftz.f32 %r444, %r311, %r369, %r368; + cvt.ftz.sat.f32.f32 %r445, %r444; + fma.rm.ftz.f32 %r446, %r445, %r373, %r372; + add.f32 %r447, %r446, 0fCB40007F; + neg.f32 %r448, %r447; + fma.rn.ftz.f32 %r449, %r311, %r377, %r448; + fma.rn.ftz.f32 %r450, %r311, %r379, %r449; + shl.b32 %r451, %r446, 23; + ex2.approx.ftz.f32 %r452, %r450; + mul.f32 %r453, %r452, %r451; + fma.rn.ftz.f32 %r454, %r312, %r369, %r368; + cvt.ftz.sat.f32.f32 %r455, %r454; + fma.rm.ftz.f32 %r456, %r455, %r373, %r372; + add.f32 %r457, %r456, 0fCB40007F; + neg.f32 %r458, %r457; + fma.rn.ftz.f32 %r459, %r312, %r377, %r458; + fma.rn.ftz.f32 %r460, %r312, %r379, %r459; + shl.b32 %r461, %r456, 23; + ex2.approx.ftz.f32 %r462, %r460; + mul.f32 %r463, %r462, %r461; + fma.rn.ftz.f32 %r464, %r313, %r369, %r368; + cvt.ftz.sat.f32.f32 %r465, %r464; + fma.rm.ftz.f32 %r466, %r465, %r373, %r372; + add.f32 %r467, %r466, 0fCB40007F; + neg.f32 %r468, %r467; + fma.rn.ftz.f32 %r469, %r313, %r377, %r468; + fma.rn.ftz.f32 %r470, %r313, %r379, %r469; + shl.b32 %r471, %r466, 23; + ex2.approx.ftz.f32 %r472, %r470; + mul.f32 %r473, %r472, %r471; + fma.rn.ftz.f32 %r474, %r314, %r369, %r368; + cvt.ftz.sat.f32.f32 %r475, %r474; + fma.rm.ftz.f32 %r476, %r475, %r373, %r372; + add.f32 %r477, %r476, 0fCB40007F; + neg.f32 %r478, %r477; + fma.rn.ftz.f32 %r479, %r314, %r377, %r478; + fma.rn.ftz.f32 %r480, %r314, %r379, %r479; + shl.b32 %r481, %r476, 23; + ex2.approx.ftz.f32 %r482, %r480; + mul.f32 %r483, %r482, %r481; + fma.rn.ftz.f32 %r484, %r315, %r369, %r368; + cvt.ftz.sat.f32.f32 %r485, %r484; + fma.rm.ftz.f32 %r486, %r485, %r373, %r372; + add.f32 %r487, %r486, 0fCB40007F; + neg.f32 %r488, %r487; + fma.rn.ftz.f32 %r489, %r315, %r377, %r488; + fma.rn.ftz.f32 %r490, %r315, %r379, %r489; + shl.b32 %r491, %r486, 23; + ex2.approx.ftz.f32 %r492, %r490; + mul.f32 %r493, %r492, %r491; + fma.rn.ftz.f32 %r494, %r316, %r369, %r368; + cvt.ftz.sat.f32.f32 %r495, %r494; + fma.rm.ftz.f32 %r496, %r495, %r373, %r372; + add.f32 %r497, %r496, 0fCB40007F; + neg.f32 %r498, %r497; + fma.rn.ftz.f32 %r499, %r316, %r377, %r498; + fma.rn.ftz.f32 %r500, %r316, %r379, %r499; + shl.b32 %r501, %r496, 23; + ex2.approx.ftz.f32 %r502, %r500; + mul.f32 %r503, %r502, %r501; + fma.rn.ftz.f32 %r504, %r317, %r369, %r368; + cvt.ftz.sat.f32.f32 %r505, %r504; + fma.rm.ftz.f32 %r506, %r505, %r373, %r372; + add.f32 %r507, %r506, 0fCB40007F; + neg.f32 %r508, %r507; + fma.rn.ftz.f32 %r509, %r317, %r377, %r508; + fma.rn.ftz.f32 %r510, %r317, %r379, %r509; + shl.b32 %r511, %r506, 23; + ex2.approx.ftz.f32 %r512, %r510; + mul.f32 %r513, %r512, %r511; + fma.rn.ftz.f32 %r514, %r318, %r369, %r368; + cvt.ftz.sat.f32.f32 %r515, %r514; + fma.rm.ftz.f32 %r516, %r515, %r373, %r372; + add.f32 %r517, %r516, 0fCB40007F; + neg.f32 %r518, %r517; + fma.rn.ftz.f32 %r519, %r318, %r377, %r518; + fma.rn.ftz.f32 %r520, %r318, %r379, %r519; + shl.b32 %r521, %r516, 23; + ex2.approx.ftz.f32 %r522, %r520; + mul.f32 %r523, %r522, %r521; + fma.rn.ftz.f32 %r524, %r319, %r369, %r368; + cvt.ftz.sat.f32.f32 %r525, %r524; + fma.rm.ftz.f32 %r526, %r525, %r373, %r372; + add.f32 %r527, %r526, 0fCB40007F; + neg.f32 %r528, %r527; + fma.rn.ftz.f32 %r529, %r319, %r377, %r528; + fma.rn.ftz.f32 %r530, %r319, %r379, %r529; + shl.b32 %r531, %r526, 23; + ex2.approx.ftz.f32 %r532, %r530; + mul.f32 %r533, %r532, %r531; + fma.rn.ftz.f32 %r534, %r320, %r369, %r368; + cvt.ftz.sat.f32.f32 %r535, %r534; + fma.rm.ftz.f32 %r536, %r535, %r373, %r372; + add.f32 %r537, %r536, 0fCB40007F; + neg.f32 %r538, %r537; + fma.rn.ftz.f32 %r539, %r320, %r377, %r538; + fma.rn.ftz.f32 %r540, %r320, %r379, %r539; + shl.b32 %r541, %r536, 23; + ex2.approx.ftz.f32 %r542, %r540; + mul.f32 %r543, %r542, %r541; + fma.rn.ftz.f32 %r544, %r321, %r369, %r368; + cvt.ftz.sat.f32.f32 %r545, %r544; + fma.rm.ftz.f32 %r546, %r545, %r373, %r372; + add.f32 %r547, %r546, 0fCB40007F; + neg.f32 %r548, %r547; + fma.rn.ftz.f32 %r549, %r321, %r377, %r548; + fma.rn.ftz.f32 %r550, %r321, %r379, %r549; + shl.b32 %r551, %r546, 23; + ex2.approx.ftz.f32 %r552, %r550; + mul.f32 %r553, %r552, %r551; + fma.rn.ftz.f32 %r554, %r322, %r369, %r368; + cvt.ftz.sat.f32.f32 %r555, %r554; + fma.rm.ftz.f32 %r556, %r555, %r373, %r372; + add.f32 %r557, %r556, 0fCB40007F; + neg.f32 %r558, %r557; + fma.rn.ftz.f32 %r559, %r322, %r377, %r558; + fma.rn.ftz.f32 %r560, %r322, %r379, %r559; + shl.b32 %r561, %r556, 23; + ex2.approx.ftz.f32 %r562, %r560; + mul.f32 %r563, %r562, %r561; + fma.rn.ftz.f32 %r564, %r323, %r369, %r368; + cvt.ftz.sat.f32.f32 %r565, %r564; + fma.rm.ftz.f32 %r566, %r565, %r373, %r372; + add.f32 %r567, %r566, 0fCB40007F; + neg.f32 %r568, %r567; + fma.rn.ftz.f32 %r569, %r323, %r377, %r568; + fma.rn.ftz.f32 %r570, %r323, %r379, %r569; + shl.b32 %r571, %r566, 23; + ex2.approx.ftz.f32 %r572, %r570; + mul.f32 %r573, %r572, %r571; + fma.rn.ftz.f32 %r574, %r324, %r369, %r368; + cvt.ftz.sat.f32.f32 %r575, %r574; + fma.rm.ftz.f32 %r576, %r575, %r373, %r372; + add.f32 %r577, %r576, 0fCB40007F; + neg.f32 %r578, %r577; + fma.rn.ftz.f32 %r579, %r324, %r377, %r578; + fma.rn.ftz.f32 %r580, %r324, %r379, %r579; + shl.b32 %r581, %r576, 23; + ex2.approx.ftz.f32 %r582, %r580; + mul.f32 %r583, %r582, %r581; + fma.rn.ftz.f32 %r584, %r325, %r369, %r368; + cvt.ftz.sat.f32.f32 %r585, %r584; + fma.rm.ftz.f32 %r586, %r585, %r373, %r372; + add.f32 %r587, %r586, 0fCB40007F; + neg.f32 %r588, %r587; + fma.rn.ftz.f32 %r589, %r325, %r377, %r588; + fma.rn.ftz.f32 %r590, %r325, %r379, %r589; + shl.b32 %r591, %r586, 23; + ex2.approx.ftz.f32 %r592, %r590; + mul.f32 %r593, %r592, %r591; + fma.rn.ftz.f32 %r594, %r326, %r369, %r368; + cvt.ftz.sat.f32.f32 %r595, %r594; + fma.rm.ftz.f32 %r596, %r595, %r373, %r372; + add.f32 %r597, %r596, 0fCB40007F; + neg.f32 %r598, %r597; + fma.rn.ftz.f32 %r599, %r326, %r377, %r598; + fma.rn.ftz.f32 %r600, %r326, %r379, %r599; + shl.b32 %r601, %r596, 23; + ex2.approx.ftz.f32 %r602, %r600; + mul.f32 %r603, %r602, %r601; + fma.rn.ftz.f32 %r604, %r327, %r369, %r368; + cvt.ftz.sat.f32.f32 %r605, %r604; + fma.rm.ftz.f32 %r606, %r605, %r373, %r372; + add.f32 %r607, %r606, 0fCB40007F; + neg.f32 %r608, %r607; + fma.rn.ftz.f32 %r609, %r327, %r377, %r608; + fma.rn.ftz.f32 %r610, %r327, %r379, %r609; + shl.b32 %r611, %r606, 23; + ex2.approx.ftz.f32 %r612, %r610; + mul.f32 %r613, %r612, %r611; + fma.rn.ftz.f32 %r614, %r328, %r369, %r368; + cvt.ftz.sat.f32.f32 %r615, %r614; + fma.rm.ftz.f32 %r616, %r615, %r373, %r372; + add.f32 %r617, %r616, 0fCB40007F; + neg.f32 %r618, %r617; + fma.rn.ftz.f32 %r619, %r328, %r377, %r618; + fma.rn.ftz.f32 %r620, %r328, %r379, %r619; + shl.b32 %r621, %r616, 23; + ex2.approx.ftz.f32 %r622, %r620; + mul.f32 %r623, %r622, %r621; + fma.rn.ftz.f32 %r624, %r329, %r369, %r368; + cvt.ftz.sat.f32.f32 %r625, %r624; + fma.rm.ftz.f32 %r626, %r625, %r373, %r372; + add.f32 %r627, %r626, 0fCB40007F; + neg.f32 %r628, %r627; + fma.rn.ftz.f32 %r629, %r329, %r377, %r628; + fma.rn.ftz.f32 %r630, %r329, %r379, %r629; + shl.b32 %r631, %r626, 23; + ex2.approx.ftz.f32 %r632, %r630; + mul.f32 %r633, %r632, %r631; + fma.rn.ftz.f32 %r634, %r330, %r369, %r368; + cvt.ftz.sat.f32.f32 %r635, %r634; + fma.rm.ftz.f32 %r636, %r635, %r373, %r372; + add.f32 %r637, %r636, 0fCB40007F; + neg.f32 %r638, %r637; + fma.rn.ftz.f32 %r639, %r330, %r377, %r638; + fma.rn.ftz.f32 %r640, %r330, %r379, %r639; + shl.b32 %r641, %r636, 23; + ex2.approx.ftz.f32 %r642, %r640; + mul.f32 %r643, %r642, %r641; + fma.rn.ftz.f32 %r644, %r331, %r369, %r368; + cvt.ftz.sat.f32.f32 %r645, %r644; + fma.rm.ftz.f32 %r646, %r645, %r373, %r372; + add.f32 %r647, %r646, 0fCB40007F; + neg.f32 %r648, %r647; + fma.rn.ftz.f32 %r649, %r331, %r377, %r648; + fma.rn.ftz.f32 %r650, %r331, %r379, %r649; + shl.b32 %r651, %r646, 23; + ex2.approx.ftz.f32 %r652, %r650; + mul.f32 %r653, %r652, %r651; + fma.rn.ftz.f32 %r654, %r332, %r369, %r368; + cvt.ftz.sat.f32.f32 %r655, %r654; + fma.rm.ftz.f32 %r656, %r655, %r373, %r372; + add.f32 %r657, %r656, 0fCB40007F; + neg.f32 %r658, %r657; + fma.rn.ftz.f32 %r659, %r332, %r377, %r658; + fma.rn.ftz.f32 %r660, %r332, %r379, %r659; + shl.b32 %r661, %r656, 23; + ex2.approx.ftz.f32 %r662, %r660; + mul.f32 %r663, %r662, %r661; + fma.rn.ftz.f32 %r664, %r333, %r369, %r368; + cvt.ftz.sat.f32.f32 %r665, %r664; + fma.rm.ftz.f32 %r666, %r665, %r373, %r372; + add.f32 %r667, %r666, 0fCB40007F; + neg.f32 %r668, %r667; + fma.rn.ftz.f32 %r669, %r333, %r377, %r668; + fma.rn.ftz.f32 %r670, %r333, %r379, %r669; + shl.b32 %r671, %r666, 23; + ex2.approx.ftz.f32 %r672, %r670; + mul.f32 %r673, %r672, %r671; + fma.rn.ftz.f32 %r674, %r334, %r369, %r368; + cvt.ftz.sat.f32.f32 %r675, %r674; + fma.rm.ftz.f32 %r676, %r675, %r373, %r372; + add.f32 %r677, %r676, 0fCB40007F; + neg.f32 %r678, %r677; + fma.rn.ftz.f32 %r679, %r334, %r377, %r678; + fma.rn.ftz.f32 %r680, %r334, %r379, %r679; + shl.b32 %r681, %r676, 23; + ex2.approx.ftz.f32 %r682, %r680; + mul.f32 %r683, %r682, %r681; + fma.rn.ftz.f32 %r684, %r335, %r369, %r368; + cvt.ftz.sat.f32.f32 %r685, %r684; + fma.rm.ftz.f32 %r686, %r685, %r373, %r372; + add.f32 %r687, %r686, 0fCB40007F; + neg.f32 %r688, %r687; + fma.rn.ftz.f32 %r689, %r335, %r377, %r688; + fma.rn.ftz.f32 %r690, %r335, %r379, %r689; + shl.b32 %r691, %r686, 23; + ex2.approx.ftz.f32 %r692, %r690; + mul.f32 %r693, %r692, %r691; + fma.rn.ftz.f32 %r694, %r336, %r369, %r368; + cvt.ftz.sat.f32.f32 %r695, %r694; + fma.rm.ftz.f32 %r696, %r695, %r373, %r372; + add.f32 %r697, %r696, 0fCB40007F; + neg.f32 %r698, %r697; + fma.rn.ftz.f32 %r699, %r336, %r377, %r698; + fma.rn.ftz.f32 %r700, %r336, %r379, %r699; + shl.b32 %r701, %r696, 23; + ex2.approx.ftz.f32 %r702, %r700; + mul.f32 %r703, %r702, %r701; + fma.rn.ftz.f32 %r704, %r337, %r369, %r368; + cvt.ftz.sat.f32.f32 %r705, %r704; + fma.rm.ftz.f32 %r706, %r705, %r373, %r372; + add.f32 %r707, %r706, 0fCB40007F; + neg.f32 %r708, %r707; + fma.rn.ftz.f32 %r709, %r337, %r377, %r708; + fma.rn.ftz.f32 %r710, %r337, %r379, %r709; + shl.b32 %r711, %r706, 23; + ex2.approx.ftz.f32 %r712, %r710; + mul.f32 %r713, %r712, %r711; + fma.rn.ftz.f32 %r714, %r338, %r369, %r368; + cvt.ftz.sat.f32.f32 %r715, %r714; + fma.rm.ftz.f32 %r716, %r715, %r373, %r372; + add.f32 %r717, %r716, 0fCB40007F; + neg.f32 %r718, %r717; + fma.rn.ftz.f32 %r719, %r338, %r377, %r718; + fma.rn.ftz.f32 %r720, %r338, %r379, %r719; + shl.b32 %r721, %r716, 23; + ex2.approx.ftz.f32 %r722, %r720; + mul.f32 %r723, %r722, %r721; + fma.rn.ftz.f32 %r724, %r339, %r369, %r368; + cvt.ftz.sat.f32.f32 %r725, %r724; + fma.rm.ftz.f32 %r726, %r725, %r373, %r372; + add.f32 %r727, %r726, 0fCB40007F; + neg.f32 %r728, %r727; + fma.rn.ftz.f32 %r729, %r339, %r377, %r728; + fma.rn.ftz.f32 %r730, %r339, %r379, %r729; + shl.b32 %r731, %r726, 23; + ex2.approx.ftz.f32 %r732, %r730; + mul.f32 %r733, %r732, %r731; + fma.rn.ftz.f32 %r734, %r340, %r369, %r368; + cvt.ftz.sat.f32.f32 %r735, %r734; + fma.rm.ftz.f32 %r736, %r735, %r373, %r372; + add.f32 %r737, %r736, 0fCB40007F; + neg.f32 %r738, %r737; + fma.rn.ftz.f32 %r739, %r340, %r377, %r738; + fma.rn.ftz.f32 %r740, %r340, %r379, %r739; + shl.b32 %r741, %r736, 23; + ex2.approx.ftz.f32 %r742, %r740; + mul.f32 %r743, %r742, %r741; + fma.rn.ftz.f32 %r744, %r341, %r369, %r368; + cvt.ftz.sat.f32.f32 %r745, %r744; + fma.rm.ftz.f32 %r746, %r745, %r373, %r372; + add.f32 %r747, %r746, 0fCB40007F; + neg.f32 %r748, %r747; + fma.rn.ftz.f32 %r749, %r341, %r377, %r748; + fma.rn.ftz.f32 %r750, %r341, %r379, %r749; + shl.b32 %r751, %r746, 23; + ex2.approx.ftz.f32 %r752, %r750; + mul.f32 %r753, %r752, %r751; + fma.rn.ftz.f32 %r754, %r342, %r369, %r368; + cvt.ftz.sat.f32.f32 %r755, %r754; + fma.rm.ftz.f32 %r756, %r755, %r373, %r372; + add.f32 %r757, %r756, 0fCB40007F; + neg.f32 %r758, %r757; + fma.rn.ftz.f32 %r759, %r342, %r377, %r758; + fma.rn.ftz.f32 %r760, %r342, %r379, %r759; + shl.b32 %r761, %r756, 23; + ex2.approx.ftz.f32 %r762, %r760; + mul.f32 %r763, %r762, %r761; + fma.rn.ftz.f32 %r764, %r343, %r369, %r368; + cvt.ftz.sat.f32.f32 %r765, %r764; + fma.rm.ftz.f32 %r766, %r765, %r373, %r372; + add.f32 %r767, %r766, 0fCB40007F; + neg.f32 %r768, %r767; + fma.rn.ftz.f32 %r769, %r343, %r377, %r768; + fma.rn.ftz.f32 %r770, %r343, %r379, %r769; + shl.b32 %r771, %r766, 23; + ex2.approx.ftz.f32 %r772, %r770; + mul.f32 %r773, %r772, %r771; + fma.rn.ftz.f32 %r774, %r344, %r369, %r368; + cvt.ftz.sat.f32.f32 %r775, %r774; + fma.rm.ftz.f32 %r776, %r775, %r373, %r372; + add.f32 %r777, %r776, 0fCB40007F; + neg.f32 %r778, %r777; + fma.rn.ftz.f32 %r779, %r344, %r377, %r778; + fma.rn.ftz.f32 %r780, %r344, %r379, %r779; + shl.b32 %r781, %r776, 23; + ex2.approx.ftz.f32 %r782, %r780; + mul.f32 %r783, %r782, %r781; + fma.rn.ftz.f32 %r784, %r345, %r369, %r368; + cvt.ftz.sat.f32.f32 %r785, %r784; + fma.rm.ftz.f32 %r786, %r785, %r373, %r372; + add.f32 %r787, %r786, 0fCB40007F; + neg.f32 %r788, %r787; + fma.rn.ftz.f32 %r789, %r345, %r377, %r788; + fma.rn.ftz.f32 %r790, %r345, %r379, %r789; + shl.b32 %r791, %r786, 23; + ex2.approx.ftz.f32 %r792, %r790; + mul.f32 %r793, %r792, %r791; + fma.rn.ftz.f32 %r794, %r346, %r369, %r368; + cvt.ftz.sat.f32.f32 %r795, %r794; + fma.rm.ftz.f32 %r796, %r795, %r373, %r372; + add.f32 %r797, %r796, 0fCB40007F; + neg.f32 %r798, %r797; + fma.rn.ftz.f32 %r799, %r346, %r377, %r798; + fma.rn.ftz.f32 %r800, %r346, %r379, %r799; + shl.b32 %r801, %r796, 23; + ex2.approx.ftz.f32 %r802, %r800; + mul.f32 %r803, %r802, %r801; + fma.rn.ftz.f32 %r804, %r347, %r369, %r368; + cvt.ftz.sat.f32.f32 %r805, %r804; + fma.rm.ftz.f32 %r806, %r805, %r373, %r372; + add.f32 %r807, %r806, 0fCB40007F; + neg.f32 %r808, %r807; + fma.rn.ftz.f32 %r809, %r347, %r377, %r808; + fma.rn.ftz.f32 %r810, %r347, %r379, %r809; + shl.b32 %r811, %r806, 23; + ex2.approx.ftz.f32 %r812, %r810; + mul.f32 %r813, %r812, %r811; + fma.rn.ftz.f32 %r814, %r348, %r369, %r368; + cvt.ftz.sat.f32.f32 %r815, %r814; + fma.rm.ftz.f32 %r816, %r815, %r373, %r372; + add.f32 %r817, %r816, 0fCB40007F; + neg.f32 %r818, %r817; + fma.rn.ftz.f32 %r819, %r348, %r377, %r818; + fma.rn.ftz.f32 %r820, %r348, %r379, %r819; + shl.b32 %r821, %r816, 23; + ex2.approx.ftz.f32 %r822, %r820; + mul.f32 %r823, %r822, %r821; + fma.rn.ftz.f32 %r824, %r349, %r369, %r368; + cvt.ftz.sat.f32.f32 %r825, %r824; + fma.rm.ftz.f32 %r826, %r825, %r373, %r372; + add.f32 %r827, %r826, 0fCB40007F; + neg.f32 %r828, %r827; + fma.rn.ftz.f32 %r829, %r349, %r377, %r828; + fma.rn.ftz.f32 %r830, %r349, %r379, %r829; + shl.b32 %r831, %r826, 23; + ex2.approx.ftz.f32 %r832, %r830; + mul.f32 %r833, %r832, %r831; + fma.rn.ftz.f32 %r834, %r350, %r369, %r368; + cvt.ftz.sat.f32.f32 %r835, %r834; + fma.rm.ftz.f32 %r836, %r835, %r373, %r372; + add.f32 %r837, %r836, 0fCB40007F; + neg.f32 %r838, %r837; + fma.rn.ftz.f32 %r839, %r350, %r377, %r838; + fma.rn.ftz.f32 %r840, %r350, %r379, %r839; + shl.b32 %r841, %r836, 23; + ex2.approx.ftz.f32 %r842, %r840; + mul.f32 %r843, %r842, %r841; + fma.rn.ftz.f32 %r844, %r351, %r369, %r368; + cvt.ftz.sat.f32.f32 %r845, %r844; + fma.rm.ftz.f32 %r846, %r845, %r373, %r372; + add.f32 %r847, %r846, 0fCB40007F; + neg.f32 %r848, %r847; + fma.rn.ftz.f32 %r849, %r351, %r377, %r848; + fma.rn.ftz.f32 %r850, %r351, %r379, %r849; + shl.b32 %r851, %r846, 23; + ex2.approx.ftz.f32 %r852, %r850; + mul.f32 %r853, %r852, %r851; + fma.rn.ftz.f32 %r854, %r352, %r369, %r368; + cvt.ftz.sat.f32.f32 %r855, %r854; + fma.rm.ftz.f32 %r856, %r855, %r373, %r372; + add.f32 %r857, %r856, 0fCB40007F; + neg.f32 %r858, %r857; + fma.rn.ftz.f32 %r859, %r352, %r377, %r858; + fma.rn.ftz.f32 %r860, %r352, %r379, %r859; + shl.b32 %r861, %r856, 23; + ex2.approx.ftz.f32 %r862, %r860; + mul.f32 %r863, %r862, %r861; + fma.rn.ftz.f32 %r864, %r353, %r369, %r368; + cvt.ftz.sat.f32.f32 %r865, %r864; + fma.rm.ftz.f32 %r866, %r865, %r373, %r372; + add.f32 %r867, %r866, 0fCB40007F; + neg.f32 %r868, %r867; + fma.rn.ftz.f32 %r869, %r353, %r377, %r868; + fma.rn.ftz.f32 %r870, %r353, %r379, %r869; + shl.b32 %r871, %r866, 23; + ex2.approx.ftz.f32 %r872, %r870; + mul.f32 %r873, %r872, %r871; + fma.rn.ftz.f32 %r874, %r354, %r369, %r368; + cvt.ftz.sat.f32.f32 %r875, %r874; + fma.rm.ftz.f32 %r876, %r875, %r373, %r372; + add.f32 %r877, %r876, 0fCB40007F; + neg.f32 %r878, %r877; + fma.rn.ftz.f32 %r879, %r354, %r377, %r878; + fma.rn.ftz.f32 %r880, %r354, %r379, %r879; + shl.b32 %r881, %r876, 23; + ex2.approx.ftz.f32 %r882, %r880; + mul.f32 %r883, %r882, %r881; + fma.rn.ftz.f32 %r884, %r355, %r369, %r368; + cvt.ftz.sat.f32.f32 %r885, %r884; + fma.rm.ftz.f32 %r886, %r885, %r373, %r372; + add.f32 %r887, %r886, 0fCB40007F; + neg.f32 %r888, %r887; + fma.rn.ftz.f32 %r889, %r355, %r377, %r888; + fma.rn.ftz.f32 %r890, %r355, %r379, %r889; + shl.b32 %r891, %r886, 23; + ex2.approx.ftz.f32 %r892, %r890; + mul.f32 %r893, %r892, %r891; + fma.rn.ftz.f32 %r894, %r356, %r369, %r368; + cvt.ftz.sat.f32.f32 %r895, %r894; + fma.rm.ftz.f32 %r896, %r895, %r373, %r372; + add.f32 %r897, %r896, 0fCB40007F; + neg.f32 %r898, %r897; + fma.rn.ftz.f32 %r899, %r356, %r377, %r898; + fma.rn.ftz.f32 %r900, %r356, %r379, %r899; + shl.b32 %r901, %r896, 23; + ex2.approx.ftz.f32 %r902, %r900; + mul.f32 %r903, %r902, %r901; + fma.rn.ftz.f32 %r904, %r357, %r369, %r368; + cvt.ftz.sat.f32.f32 %r905, %r904; + fma.rm.ftz.f32 %r906, %r905, %r373, %r372; + add.f32 %r907, %r906, 0fCB40007F; + neg.f32 %r908, %r907; + fma.rn.ftz.f32 %r909, %r357, %r377, %r908; + fma.rn.ftz.f32 %r910, %r357, %r379, %r909; + shl.b32 %r911, %r906, 23; + ex2.approx.ftz.f32 %r912, %r910; + mul.f32 %r913, %r912, %r911; + fma.rn.ftz.f32 %r914, %r358, %r369, %r368; + cvt.ftz.sat.f32.f32 %r915, %r914; + fma.rm.ftz.f32 %r916, %r915, %r373, %r372; + add.f32 %r917, %r916, 0fCB40007F; + neg.f32 %r918, %r917; + fma.rn.ftz.f32 %r919, %r358, %r377, %r918; + fma.rn.ftz.f32 %r920, %r358, %r379, %r919; + shl.b32 %r921, %r916, 23; + ex2.approx.ftz.f32 %r922, %r920; + mul.f32 %r923, %r922, %r921; + fma.rn.ftz.f32 %r924, %r359, %r369, %r368; + cvt.ftz.sat.f32.f32 %r925, %r924; + fma.rm.ftz.f32 %r926, %r925, %r373, %r372; + add.f32 %r927, %r926, 0fCB40007F; + neg.f32 %r928, %r927; + fma.rn.ftz.f32 %r929, %r359, %r377, %r928; + fma.rn.ftz.f32 %r930, %r359, %r379, %r929; + shl.b32 %r931, %r926, 23; + ex2.approx.ftz.f32 %r932, %r930; + mul.f32 %r933, %r932, %r931; + fma.rn.ftz.f32 %r934, %r360, %r369, %r368; + cvt.ftz.sat.f32.f32 %r935, %r934; + fma.rm.ftz.f32 %r936, %r935, %r373, %r372; + add.f32 %r937, %r936, 0fCB40007F; + neg.f32 %r938, %r937; + fma.rn.ftz.f32 %r939, %r360, %r377, %r938; + fma.rn.ftz.f32 %r940, %r360, %r379, %r939; + shl.b32 %r941, %r936, 23; + ex2.approx.ftz.f32 %r942, %r940; + mul.f32 %r943, %r942, %r941; + fma.rn.ftz.f32 %r944, %r361, %r369, %r368; + cvt.ftz.sat.f32.f32 %r945, %r944; + fma.rm.ftz.f32 %r946, %r945, %r373, %r372; + add.f32 %r947, %r946, 0fCB40007F; + neg.f32 %r948, %r947; + fma.rn.ftz.f32 %r949, %r361, %r377, %r948; + fma.rn.ftz.f32 %r950, %r361, %r379, %r949; + shl.b32 %r951, %r946, 23; + ex2.approx.ftz.f32 %r952, %r950; + mul.f32 %r953, %r952, %r951; + fma.rn.ftz.f32 %r954, %r362, %r369, %r368; + cvt.ftz.sat.f32.f32 %r955, %r954; + fma.rm.ftz.f32 %r956, %r955, %r373, %r372; + fma.rn.ftz.f32 %r957, %r363, %r369, %r368; + cvt.ftz.sat.f32.f32 %r958, %r957; + fma.rm.ftz.f32 %r959, %r958, %r373, %r372; + fma.rn.ftz.f32 %r960, %r364, %r369, %r368; + cvt.ftz.sat.f32.f32 %r961, %r960; + fma.rm.ftz.f32 %r962, %r961, %r373, %r372; + fma.rn.ftz.f32 %r963, %r365, %r369, %r368; + cvt.ftz.sat.f32.f32 %r964, %r963; + fma.rm.ftz.f32 %r965, %r964, %r373, %r372; + fma.rn.ftz.f32 %r966, %r366, %r369, %r368; + cvt.ftz.sat.f32.f32 %r967, %r966; + fma.rm.ftz.f32 %r968, %r967, %r373, %r372; + fma.rn.ftz.f32 %r969, %r367, %r369, %r368; + cvt.ftz.sat.f32.f32 %r970, %r969; + fma.rm.ftz.f32 %r971, %r970, %r373, %r372; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r972, %r239, %r239; + sub.f32 %r973, %r240, %r240; + sub.f32 %r974, %r241, %r241; + sub.f32 %r975, %r242, %r242; + sub.f32 %r976, %r243, %r243; + sub.f32 %r977, %r244, %r244; + sub.f32 %r978, %r245, %r245; + sub.f32 %r979, %r246, %r246; + sub.f32 %r980, %r247, %r247; + sub.f32 %r981, %r248, %r248; + sub.f32 %r982, %r249, %r249; + sub.f32 %r983, %r250, %r250; + sub.f32 %r984, %r251, %r251; + sub.f32 %r985, %r252, %r252; + sub.f32 %r986, %r253, %r253; + sub.f32 %r987, %r254, %r254; + sub.f32 %r988, %r255, %r255; + sub.f32 %r989, %r256, %r256; + sub.f32 %r990, %r257, %r257; + sub.f32 %r991, %r258, %r258; + sub.f32 %r992, %r259, %r259; + sub.f32 %r993, %r260, %r260; + sub.f32 %r994, %r261, %r261; + sub.f32 %r995, %r262, %r262; + sub.f32 %r996, %r263, %r263; + sub.f32 %r997, %r264, %r264; + sub.f32 %r998, %r265, %r265; + sub.f32 %r999, %r266, %r266; + sub.f32 %r1000, %r267, %r267; + sub.f32 %r1001, %r268, %r268; + sub.f32 %r1002, %r269, %r269; + sub.f32 %r1003, %r270, %r270; + sub.f32 %r1004, %r271, %r271; + sub.f32 %r1005, %r272, %r272; + sub.f32 %r1006, %r273, %r273; + sub.f32 %r1007, %r274, %r274; + sub.f32 %r1008, %r275, %r275; + sub.f32 %r1009, %r276, %r276; + sub.f32 %r1010, %r277, %r277; + sub.f32 %r1011, %r278, %r278; + sub.f32 %r1012, %r279, %r279; + sub.f32 %r1013, %r280, %r280; + sub.f32 %r1014, %r281, %r281; + sub.f32 %r1015, %r282, %r282; + sub.f32 %r1016, %r283, %r283; + sub.f32 %r1017, %r284, %r284; + sub.f32 %r1018, %r285, %r285; + sub.f32 %r1019, %r286, %r286; + sub.f32 %r1020, %r287, %r287; + sub.f32 %r1021, %r288, %r288; + sub.f32 %r1022, %r289, %r289; + sub.f32 %r1023, %r290, %r290; + sub.f32 %r1024, %r291, %r291; + sub.f32 %r1025, %r292, %r292; + sub.f32 %r1026, %r293, %r293; + sub.f32 %r1027, %r294, %r294; + sub.f32 %r1028, %r295, %r295; + sub.f32 %r1029, %r296, %r296; + sub.f32 %r1030, %r297, %r297; + sub.f32 %r1031, %r298, %r298; + sub.f32 %r1032, %r299, %r299; + sub.f32 %r1033, %r300, %r300; + sub.f32 %r1034, %r301, %r301; + sub.f32 %r1035, %r302, %r302; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r1036, %r972, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1037, %r1036; + fma.rm.ftz.f32 %r1038, %r1037, %r373, %r372; + add.f32 %r1039, %r1038, 0fCB40007F; + neg.f32 %r1040, %r1039; + fma.rn.ftz.f32 %r1041, %r972, %r377, %r1040; + fma.rn.ftz.f32 %r1042, %r972, %r379, %r1041; + shl.b32 %r1043, %r1038, 23; + ex2.approx.ftz.f32 %r1044, %r1042; + mul.f32 %r1045, %r1044, %r1043; + fma.rn.ftz.f32 %r1046, %r973, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1047, %r1046; + fma.rm.ftz.f32 %r1048, %r1047, %r373, %r372; + add.f32 %r1049, %r1048, 0fCB40007F; + neg.f32 %r1050, %r1049; + fma.rn.ftz.f32 %r1051, %r973, %r377, %r1050; + fma.rn.ftz.f32 %r1052, %r973, %r379, %r1051; + shl.b32 %r1053, %r1048, 23; + ex2.approx.ftz.f32 %r1054, %r1052; + mul.f32 %r1055, %r1054, %r1053; + fma.rn.ftz.f32 %r1056, %r974, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1057, %r1056; + fma.rm.ftz.f32 %r1058, %r1057, %r373, %r372; + add.f32 %r1059, %r1058, 0fCB40007F; + neg.f32 %r1060, %r1059; + fma.rn.ftz.f32 %r1061, %r974, %r377, %r1060; + fma.rn.ftz.f32 %r1062, %r974, %r379, %r1061; + shl.b32 %r1063, %r1058, 23; + ex2.approx.ftz.f32 %r1064, %r1062; + mul.f32 %r1065, %r1064, %r1063; + fma.rn.ftz.f32 %r1066, %r975, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1067, %r1066; + fma.rm.ftz.f32 %r1068, %r1067, %r373, %r372; + add.f32 %r1069, %r1068, 0fCB40007F; + neg.f32 %r1070, %r1069; + fma.rn.ftz.f32 %r1071, %r975, %r377, %r1070; + fma.rn.ftz.f32 %r1072, %r975, %r379, %r1071; + shl.b32 %r1073, %r1068, 23; + ex2.approx.ftz.f32 %r1074, %r1072; + mul.f32 %r1075, %r1074, %r1073; + fma.rn.ftz.f32 %r1076, %r976, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1077, %r1076; + fma.rm.ftz.f32 %r1078, %r1077, %r373, %r372; + add.f32 %r1079, %r1078, 0fCB40007F; + neg.f32 %r1080, %r1079; + fma.rn.ftz.f32 %r1081, %r976, %r377, %r1080; + fma.rn.ftz.f32 %r1082, %r976, %r379, %r1081; + shl.b32 %r1083, %r1078, 23; + ex2.approx.ftz.f32 %r1084, %r1082; + mul.f32 %r1085, %r1084, %r1083; + fma.rn.ftz.f32 %r1086, %r977, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1087, %r1086; + fma.rm.ftz.f32 %r1088, %r1087, %r373, %r372; + add.f32 %r1089, %r1088, 0fCB40007F; + neg.f32 %r1090, %r1089; + fma.rn.ftz.f32 %r1091, %r977, %r377, %r1090; + fma.rn.ftz.f32 %r1092, %r977, %r379, %r1091; + shl.b32 %r1093, %r1088, 23; + ex2.approx.ftz.f32 %r1094, %r1092; + mul.f32 %r1095, %r1094, %r1093; + fma.rn.ftz.f32 %r1096, %r978, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1097, %r1096; + fma.rm.ftz.f32 %r1098, %r1097, %r373, %r372; + add.f32 %r1099, %r1098, 0fCB40007F; + neg.f32 %r1100, %r1099; + fma.rn.ftz.f32 %r1101, %r978, %r377, %r1100; + fma.rn.ftz.f32 %r1102, %r978, %r379, %r1101; + shl.b32 %r1103, %r1098, 23; + ex2.approx.ftz.f32 %r1104, %r1102; + mul.f32 %r1105, %r1104, %r1103; + fma.rn.ftz.f32 %r1106, %r979, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1107, %r1106; + fma.rm.ftz.f32 %r1108, %r1107, %r373, %r372; + add.f32 %r1109, %r1108, 0fCB40007F; + neg.f32 %r1110, %r1109; + fma.rn.ftz.f32 %r1111, %r979, %r377, %r1110; + fma.rn.ftz.f32 %r1112, %r979, %r379, %r1111; + shl.b32 %r1113, %r1108, 23; + ex2.approx.ftz.f32 %r1114, %r1112; + mul.f32 %r1115, %r1114, %r1113; + fma.rn.ftz.f32 %r1116, %r980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1117, %r1116; + fma.rm.ftz.f32 %r1118, %r1117, %r373, %r372; + add.f32 %r1119, %r1118, 0fCB40007F; + neg.f32 %r1120, %r1119; + fma.rn.ftz.f32 %r1121, %r980, %r377, %r1120; + fma.rn.ftz.f32 %r1122, %r980, %r379, %r1121; + shl.b32 %r1123, %r1118, 23; + ex2.approx.ftz.f32 %r1124, %r1122; + mul.f32 %r1125, %r1124, %r1123; + fma.rn.ftz.f32 %r1126, %r981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1127, %r1126; + fma.rm.ftz.f32 %r1128, %r1127, %r373, %r372; + add.f32 %r1129, %r1128, 0fCB40007F; + neg.f32 %r1130, %r1129; + fma.rn.ftz.f32 %r1131, %r981, %r377, %r1130; + fma.rn.ftz.f32 %r1132, %r981, %r379, %r1131; + shl.b32 %r1133, %r1128, 23; + ex2.approx.ftz.f32 %r1134, %r1132; + mul.f32 %r1135, %r1134, %r1133; + fma.rn.ftz.f32 %r1136, %r982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1137, %r1136; + fma.rm.ftz.f32 %r1138, %r1137, %r373, %r372; + add.f32 %r1139, %r1138, 0fCB40007F; + neg.f32 %r1140, %r1139; + fma.rn.ftz.f32 %r1141, %r982, %r377, %r1140; + fma.rn.ftz.f32 %r1142, %r982, %r379, %r1141; + shl.b32 %r1143, %r1138, 23; + ex2.approx.ftz.f32 %r1144, %r1142; + mul.f32 %r1145, %r1144, %r1143; + fma.rn.ftz.f32 %r1146, %r983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1147, %r1146; + fma.rm.ftz.f32 %r1148, %r1147, %r373, %r372; + add.f32 %r1149, %r1148, 0fCB40007F; + neg.f32 %r1150, %r1149; + fma.rn.ftz.f32 %r1151, %r983, %r377, %r1150; + fma.rn.ftz.f32 %r1152, %r983, %r379, %r1151; + shl.b32 %r1153, %r1148, 23; + ex2.approx.ftz.f32 %r1154, %r1152; + mul.f32 %r1155, %r1154, %r1153; + fma.rn.ftz.f32 %r1156, %r984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1157, %r1156; + fma.rm.ftz.f32 %r1158, %r1157, %r373, %r372; + add.f32 %r1159, %r1158, 0fCB40007F; + neg.f32 %r1160, %r1159; + fma.rn.ftz.f32 %r1161, %r984, %r377, %r1160; + fma.rn.ftz.f32 %r1162, %r984, %r379, %r1161; + shl.b32 %r1163, %r1158, 23; + ex2.approx.ftz.f32 %r1164, %r1162; + mul.f32 %r1165, %r1164, %r1163; + fma.rn.ftz.f32 %r1166, %r985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1167, %r1166; + fma.rm.ftz.f32 %r1168, %r1167, %r373, %r372; + add.f32 %r1169, %r1168, 0fCB40007F; + neg.f32 %r1170, %r1169; + fma.rn.ftz.f32 %r1171, %r985, %r377, %r1170; + fma.rn.ftz.f32 %r1172, %r985, %r379, %r1171; + shl.b32 %r1173, %r1168, 23; + ex2.approx.ftz.f32 %r1174, %r1172; + mul.f32 %r1175, %r1174, %r1173; + fma.rn.ftz.f32 %r1176, %r986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1177, %r1176; + fma.rm.ftz.f32 %r1178, %r1177, %r373, %r372; + add.f32 %r1179, %r1178, 0fCB40007F; + neg.f32 %r1180, %r1179; + fma.rn.ftz.f32 %r1181, %r986, %r377, %r1180; + fma.rn.ftz.f32 %r1182, %r986, %r379, %r1181; + shl.b32 %r1183, %r1178, 23; + ex2.approx.ftz.f32 %r1184, %r1182; + mul.f32 %r1185, %r1184, %r1183; + fma.rn.ftz.f32 %r1186, %r987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1187, %r1186; + fma.rm.ftz.f32 %r1188, %r1187, %r373, %r372; + add.f32 %r1189, %r1188, 0fCB40007F; + neg.f32 %r1190, %r1189; + fma.rn.ftz.f32 %r1191, %r987, %r377, %r1190; + fma.rn.ftz.f32 %r1192, %r987, %r379, %r1191; + shl.b32 %r1193, %r1188, 23; + ex2.approx.ftz.f32 %r1194, %r1192; + mul.f32 %r1195, %r1194, %r1193; + fma.rn.ftz.f32 %r1196, %r988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1197, %r1196; + fma.rm.ftz.f32 %r1198, %r1197, %r373, %r372; + add.f32 %r1199, %r1198, 0fCB40007F; + neg.f32 %r1200, %r1199; + fma.rn.ftz.f32 %r1201, %r988, %r377, %r1200; + fma.rn.ftz.f32 %r1202, %r988, %r379, %r1201; + shl.b32 %r1203, %r1198, 23; + ex2.approx.ftz.f32 %r1204, %r1202; + mul.f32 %r1205, %r1204, %r1203; + fma.rn.ftz.f32 %r1206, %r989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1207, %r1206; + fma.rm.ftz.f32 %r1208, %r1207, %r373, %r372; + add.f32 %r1209, %r1208, 0fCB40007F; + neg.f32 %r1210, %r1209; + fma.rn.ftz.f32 %r1211, %r989, %r377, %r1210; + fma.rn.ftz.f32 %r1212, %r989, %r379, %r1211; + shl.b32 %r1213, %r1208, 23; + ex2.approx.ftz.f32 %r1214, %r1212; + mul.f32 %r1215, %r1214, %r1213; + fma.rn.ftz.f32 %r1216, %r990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1217, %r1216; + fma.rm.ftz.f32 %r1218, %r1217, %r373, %r372; + add.f32 %r1219, %r1218, 0fCB40007F; + neg.f32 %r1220, %r1219; + fma.rn.ftz.f32 %r1221, %r990, %r377, %r1220; + fma.rn.ftz.f32 %r1222, %r990, %r379, %r1221; + shl.b32 %r1223, %r1218, 23; + ex2.approx.ftz.f32 %r1224, %r1222; + mul.f32 %r1225, %r1224, %r1223; + fma.rn.ftz.f32 %r1226, %r991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1227, %r1226; + fma.rm.ftz.f32 %r1228, %r1227, %r373, %r372; + add.f32 %r1229, %r1228, 0fCB40007F; + neg.f32 %r1230, %r1229; + fma.rn.ftz.f32 %r1231, %r991, %r377, %r1230; + fma.rn.ftz.f32 %r1232, %r991, %r379, %r1231; + shl.b32 %r1233, %r1228, 23; + ex2.approx.ftz.f32 %r1234, %r1232; + mul.f32 %r1235, %r1234, %r1233; + fma.rn.ftz.f32 %r1236, %r992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1237, %r1236; + fma.rm.ftz.f32 %r1238, %r1237, %r373, %r372; + add.f32 %r1239, %r1238, 0fCB40007F; + neg.f32 %r1240, %r1239; + fma.rn.ftz.f32 %r1241, %r992, %r377, %r1240; + fma.rn.ftz.f32 %r1242, %r992, %r379, %r1241; + shl.b32 %r1243, %r1238, 23; + ex2.approx.ftz.f32 %r1244, %r1242; + mul.f32 %r1245, %r1244, %r1243; + fma.rn.ftz.f32 %r1246, %r993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1247, %r1246; + fma.rm.ftz.f32 %r1248, %r1247, %r373, %r372; + add.f32 %r1249, %r1248, 0fCB40007F; + neg.f32 %r1250, %r1249; + fma.rn.ftz.f32 %r1251, %r993, %r377, %r1250; + fma.rn.ftz.f32 %r1252, %r993, %r379, %r1251; + shl.b32 %r1253, %r1248, 23; + ex2.approx.ftz.f32 %r1254, %r1252; + mul.f32 %r1255, %r1254, %r1253; + fma.rn.ftz.f32 %r1256, %r994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1257, %r1256; + fma.rm.ftz.f32 %r1258, %r1257, %r373, %r372; + add.f32 %r1259, %r1258, 0fCB40007F; + neg.f32 %r1260, %r1259; + fma.rn.ftz.f32 %r1261, %r994, %r377, %r1260; + fma.rn.ftz.f32 %r1262, %r994, %r379, %r1261; + shl.b32 %r1263, %r1258, 23; + ex2.approx.ftz.f32 %r1264, %r1262; + mul.f32 %r1265, %r1264, %r1263; + fma.rn.ftz.f32 %r1266, %r995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1267, %r1266; + fma.rm.ftz.f32 %r1268, %r1267, %r373, %r372; + add.f32 %r1269, %r1268, 0fCB40007F; + neg.f32 %r1270, %r1269; + fma.rn.ftz.f32 %r1271, %r995, %r377, %r1270; + fma.rn.ftz.f32 %r1272, %r995, %r379, %r1271; + shl.b32 %r1273, %r1268, 23; + ex2.approx.ftz.f32 %r1274, %r1272; + mul.f32 %r1275, %r1274, %r1273; + fma.rn.ftz.f32 %r1276, %r996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1277, %r1276; + fma.rm.ftz.f32 %r1278, %r1277, %r373, %r372; + add.f32 %r1279, %r1278, 0fCB40007F; + neg.f32 %r1280, %r1279; + fma.rn.ftz.f32 %r1281, %r996, %r377, %r1280; + fma.rn.ftz.f32 %r1282, %r996, %r379, %r1281; + shl.b32 %r1283, %r1278, 23; + ex2.approx.ftz.f32 %r1284, %r1282; + mul.f32 %r1285, %r1284, %r1283; + fma.rn.ftz.f32 %r1286, %r997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1287, %r1286; + fma.rm.ftz.f32 %r1288, %r1287, %r373, %r372; + add.f32 %r1289, %r1288, 0fCB40007F; + neg.f32 %r1290, %r1289; + fma.rn.ftz.f32 %r1291, %r997, %r377, %r1290; + fma.rn.ftz.f32 %r1292, %r997, %r379, %r1291; + shl.b32 %r1293, %r1288, 23; + ex2.approx.ftz.f32 %r1294, %r1292; + mul.f32 %r1295, %r1294, %r1293; + fma.rn.ftz.f32 %r1296, %r998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1297, %r1296; + fma.rm.ftz.f32 %r1298, %r1297, %r373, %r372; + add.f32 %r1299, %r1298, 0fCB40007F; + neg.f32 %r1300, %r1299; + fma.rn.ftz.f32 %r1301, %r998, %r377, %r1300; + fma.rn.ftz.f32 %r1302, %r998, %r379, %r1301; + shl.b32 %r1303, %r1298, 23; + ex2.approx.ftz.f32 %r1304, %r1302; + mul.f32 %r1305, %r1304, %r1303; + fma.rn.ftz.f32 %r1306, %r999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1307, %r1306; + fma.rm.ftz.f32 %r1308, %r1307, %r373, %r372; + add.f32 %r1309, %r1308, 0fCB40007F; + neg.f32 %r1310, %r1309; + fma.rn.ftz.f32 %r1311, %r999, %r377, %r1310; + fma.rn.ftz.f32 %r1312, %r999, %r379, %r1311; + shl.b32 %r1313, %r1308, 23; + ex2.approx.ftz.f32 %r1314, %r1312; + mul.f32 %r1315, %r1314, %r1313; + fma.rn.ftz.f32 %r1316, %r1000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1317, %r1316; + fma.rm.ftz.f32 %r1318, %r1317, %r373, %r372; + add.f32 %r1319, %r1318, 0fCB40007F; + neg.f32 %r1320, %r1319; + fma.rn.ftz.f32 %r1321, %r1000, %r377, %r1320; + fma.rn.ftz.f32 %r1322, %r1000, %r379, %r1321; + shl.b32 %r1323, %r1318, 23; + ex2.approx.ftz.f32 %r1324, %r1322; + mul.f32 %r1325, %r1324, %r1323; + fma.rn.ftz.f32 %r1326, %r1001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1327, %r1326; + fma.rm.ftz.f32 %r1328, %r1327, %r373, %r372; + add.f32 %r1329, %r1328, 0fCB40007F; + neg.f32 %r1330, %r1329; + fma.rn.ftz.f32 %r1331, %r1001, %r377, %r1330; + fma.rn.ftz.f32 %r1332, %r1001, %r379, %r1331; + shl.b32 %r1333, %r1328, 23; + ex2.approx.ftz.f32 %r1334, %r1332; + mul.f32 %r1335, %r1334, %r1333; + fma.rn.ftz.f32 %r1336, %r1002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1337, %r1336; + fma.rm.ftz.f32 %r1338, %r1337, %r373, %r372; + add.f32 %r1339, %r1338, 0fCB40007F; + neg.f32 %r1340, %r1339; + fma.rn.ftz.f32 %r1341, %r1002, %r377, %r1340; + fma.rn.ftz.f32 %r1342, %r1002, %r379, %r1341; + shl.b32 %r1343, %r1338, 23; + ex2.approx.ftz.f32 %r1344, %r1342; + mul.f32 %r1345, %r1344, %r1343; + fma.rn.ftz.f32 %r1346, %r1003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1347, %r1346; + fma.rm.ftz.f32 %r1348, %r1347, %r373, %r372; + add.f32 %r1349, %r1348, 0fCB40007F; + neg.f32 %r1350, %r1349; + fma.rn.ftz.f32 %r1351, %r1003, %r377, %r1350; + fma.rn.ftz.f32 %r1352, %r1003, %r379, %r1351; + shl.b32 %r1353, %r1348, 23; + ex2.approx.ftz.f32 %r1354, %r1352; + mul.f32 %r1355, %r1354, %r1353; + fma.rn.ftz.f32 %r1356, %r1004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1357, %r1356; + fma.rm.ftz.f32 %r1358, %r1357, %r373, %r372; + add.f32 %r1359, %r1358, 0fCB40007F; + neg.f32 %r1360, %r1359; + fma.rn.ftz.f32 %r1361, %r1004, %r377, %r1360; + fma.rn.ftz.f32 %r1362, %r1004, %r379, %r1361; + shl.b32 %r1363, %r1358, 23; + ex2.approx.ftz.f32 %r1364, %r1362; + mul.f32 %r1365, %r1364, %r1363; + fma.rn.ftz.f32 %r1366, %r1005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1367, %r1366; + fma.rm.ftz.f32 %r1368, %r1367, %r373, %r372; + add.f32 %r1369, %r1368, 0fCB40007F; + neg.f32 %r1370, %r1369; + fma.rn.ftz.f32 %r1371, %r1005, %r377, %r1370; + fma.rn.ftz.f32 %r1372, %r1005, %r379, %r1371; + shl.b32 %r1373, %r1368, 23; + ex2.approx.ftz.f32 %r1374, %r1372; + mul.f32 %r1375, %r1374, %r1373; + fma.rn.ftz.f32 %r1376, %r1006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1377, %r1376; + fma.rm.ftz.f32 %r1378, %r1377, %r373, %r372; + add.f32 %r1379, %r1378, 0fCB40007F; + neg.f32 %r1380, %r1379; + fma.rn.ftz.f32 %r1381, %r1006, %r377, %r1380; + fma.rn.ftz.f32 %r1382, %r1006, %r379, %r1381; + shl.b32 %r1383, %r1378, 23; + ex2.approx.ftz.f32 %r1384, %r1382; + mul.f32 %r1385, %r1384, %r1383; + fma.rn.ftz.f32 %r1386, %r1007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1387, %r1386; + fma.rm.ftz.f32 %r1388, %r1387, %r373, %r372; + add.f32 %r1389, %r1388, 0fCB40007F; + neg.f32 %r1390, %r1389; + fma.rn.ftz.f32 %r1391, %r1007, %r377, %r1390; + fma.rn.ftz.f32 %r1392, %r1007, %r379, %r1391; + shl.b32 %r1393, %r1388, 23; + ex2.approx.ftz.f32 %r1394, %r1392; + mul.f32 %r1395, %r1394, %r1393; + fma.rn.ftz.f32 %r1396, %r1008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1397, %r1396; + fma.rm.ftz.f32 %r1398, %r1397, %r373, %r372; + add.f32 %r1399, %r1398, 0fCB40007F; + neg.f32 %r1400, %r1399; + fma.rn.ftz.f32 %r1401, %r1008, %r377, %r1400; + fma.rn.ftz.f32 %r1402, %r1008, %r379, %r1401; + shl.b32 %r1403, %r1398, 23; + ex2.approx.ftz.f32 %r1404, %r1402; + mul.f32 %r1405, %r1404, %r1403; + fma.rn.ftz.f32 %r1406, %r1009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1407, %r1406; + fma.rm.ftz.f32 %r1408, %r1407, %r373, %r372; + add.f32 %r1409, %r1408, 0fCB40007F; + neg.f32 %r1410, %r1409; + fma.rn.ftz.f32 %r1411, %r1009, %r377, %r1410; + fma.rn.ftz.f32 %r1412, %r1009, %r379, %r1411; + shl.b32 %r1413, %r1408, 23; + ex2.approx.ftz.f32 %r1414, %r1412; + mul.f32 %r1415, %r1414, %r1413; + fma.rn.ftz.f32 %r1416, %r1010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1417, %r1416; + fma.rm.ftz.f32 %r1418, %r1417, %r373, %r372; + add.f32 %r1419, %r1418, 0fCB40007F; + neg.f32 %r1420, %r1419; + fma.rn.ftz.f32 %r1421, %r1010, %r377, %r1420; + fma.rn.ftz.f32 %r1422, %r1010, %r379, %r1421; + shl.b32 %r1423, %r1418, 23; + ex2.approx.ftz.f32 %r1424, %r1422; + mul.f32 %r1425, %r1424, %r1423; + fma.rn.ftz.f32 %r1426, %r1011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1427, %r1426; + fma.rm.ftz.f32 %r1428, %r1427, %r373, %r372; + add.f32 %r1429, %r1428, 0fCB40007F; + neg.f32 %r1430, %r1429; + fma.rn.ftz.f32 %r1431, %r1011, %r377, %r1430; + fma.rn.ftz.f32 %r1432, %r1011, %r379, %r1431; + shl.b32 %r1433, %r1428, 23; + ex2.approx.ftz.f32 %r1434, %r1432; + mul.f32 %r1435, %r1434, %r1433; + fma.rn.ftz.f32 %r1436, %r1012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1437, %r1436; + fma.rm.ftz.f32 %r1438, %r1437, %r373, %r372; + add.f32 %r1439, %r1438, 0fCB40007F; + neg.f32 %r1440, %r1439; + fma.rn.ftz.f32 %r1441, %r1012, %r377, %r1440; + fma.rn.ftz.f32 %r1442, %r1012, %r379, %r1441; + shl.b32 %r1443, %r1438, 23; + ex2.approx.ftz.f32 %r1444, %r1442; + mul.f32 %r1445, %r1444, %r1443; + fma.rn.ftz.f32 %r1446, %r1013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1447, %r1446; + fma.rm.ftz.f32 %r1448, %r1447, %r373, %r372; + add.f32 %r1449, %r1448, 0fCB40007F; + neg.f32 %r1450, %r1449; + fma.rn.ftz.f32 %r1451, %r1013, %r377, %r1450; + fma.rn.ftz.f32 %r1452, %r1013, %r379, %r1451; + shl.b32 %r1453, %r1448, 23; + ex2.approx.ftz.f32 %r1454, %r1452; + mul.f32 %r1455, %r1454, %r1453; + fma.rn.ftz.f32 %r1456, %r1014, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1457, %r1456; + fma.rm.ftz.f32 %r1458, %r1457, %r373, %r372; + add.f32 %r1459, %r1458, 0fCB40007F; + neg.f32 %r1460, %r1459; + fma.rn.ftz.f32 %r1461, %r1014, %r377, %r1460; + fma.rn.ftz.f32 %r1462, %r1014, %r379, %r1461; + shl.b32 %r1463, %r1458, 23; + ex2.approx.ftz.f32 %r1464, %r1462; + mul.f32 %r1465, %r1464, %r1463; + fma.rn.ftz.f32 %r1466, %r1015, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1467, %r1466; + fma.rm.ftz.f32 %r1468, %r1467, %r373, %r372; + add.f32 %r1469, %r1468, 0fCB40007F; + neg.f32 %r1470, %r1469; + fma.rn.ftz.f32 %r1471, %r1015, %r377, %r1470; + fma.rn.ftz.f32 %r1472, %r1015, %r379, %r1471; + shl.b32 %r1473, %r1468, 23; + ex2.approx.ftz.f32 %r1474, %r1472; + mul.f32 %r1475, %r1474, %r1473; + fma.rn.ftz.f32 %r1476, %r1016, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1477, %r1476; + fma.rm.ftz.f32 %r1478, %r1477, %r373, %r372; + add.f32 %r1479, %r1478, 0fCB40007F; + neg.f32 %r1480, %r1479; + fma.rn.ftz.f32 %r1481, %r1016, %r377, %r1480; + fma.rn.ftz.f32 %r1482, %r1016, %r379, %r1481; + shl.b32 %r1483, %r1478, 23; + ex2.approx.ftz.f32 %r1484, %r1482; + mul.f32 %r1485, %r1484, %r1483; + fma.rn.ftz.f32 %r1486, %r1017, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1487, %r1486; + fma.rm.ftz.f32 %r1488, %r1487, %r373, %r372; + add.f32 %r1489, %r1488, 0fCB40007F; + neg.f32 %r1490, %r1489; + fma.rn.ftz.f32 %r1491, %r1017, %r377, %r1490; + fma.rn.ftz.f32 %r1492, %r1017, %r379, %r1491; + shl.b32 %r1493, %r1488, 23; + ex2.approx.ftz.f32 %r1494, %r1492; + mul.f32 %r1495, %r1494, %r1493; + fma.rn.ftz.f32 %r1496, %r1018, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1497, %r1496; + fma.rm.ftz.f32 %r1498, %r1497, %r373, %r372; + add.f32 %r1499, %r1498, 0fCB40007F; + neg.f32 %r1500, %r1499; + fma.rn.ftz.f32 %r1501, %r1018, %r377, %r1500; + fma.rn.ftz.f32 %r1502, %r1018, %r379, %r1501; + shl.b32 %r1503, %r1498, 23; + ex2.approx.ftz.f32 %r1504, %r1502; + mul.f32 %r1505, %r1504, %r1503; + fma.rn.ftz.f32 %r1506, %r1019, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1507, %r1506; + fma.rm.ftz.f32 %r1508, %r1507, %r373, %r372; + add.f32 %r1509, %r1508, 0fCB40007F; + neg.f32 %r1510, %r1509; + fma.rn.ftz.f32 %r1511, %r1019, %r377, %r1510; + fma.rn.ftz.f32 %r1512, %r1019, %r379, %r1511; + shl.b32 %r1513, %r1508, 23; + ex2.approx.ftz.f32 %r1514, %r1512; + mul.f32 %r1515, %r1514, %r1513; + fma.rn.ftz.f32 %r1516, %r1020, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1517, %r1516; + fma.rm.ftz.f32 %r1518, %r1517, %r373, %r372; + add.f32 %r1519, %r1518, 0fCB40007F; + neg.f32 %r1520, %r1519; + fma.rn.ftz.f32 %r1521, %r1020, %r377, %r1520; + fma.rn.ftz.f32 %r1522, %r1020, %r379, %r1521; + shl.b32 %r1523, %r1518, 23; + ex2.approx.ftz.f32 %r1524, %r1522; + mul.f32 %r1525, %r1524, %r1523; + fma.rn.ftz.f32 %r1526, %r1021, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1527, %r1526; + fma.rm.ftz.f32 %r1528, %r1527, %r373, %r372; + add.f32 %r1529, %r1528, 0fCB40007F; + neg.f32 %r1530, %r1529; + fma.rn.ftz.f32 %r1531, %r1021, %r377, %r1530; + fma.rn.ftz.f32 %r1532, %r1021, %r379, %r1531; + shl.b32 %r1533, %r1528, 23; + ex2.approx.ftz.f32 %r1534, %r1532; + mul.f32 %r1535, %r1534, %r1533; + fma.rn.ftz.f32 %r1536, %r1022, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1537, %r1536; + fma.rm.ftz.f32 %r1538, %r1537, %r373, %r372; + add.f32 %r1539, %r1538, 0fCB40007F; + neg.f32 %r1540, %r1539; + fma.rn.ftz.f32 %r1541, %r1022, %r377, %r1540; + fma.rn.ftz.f32 %r1542, %r1022, %r379, %r1541; + shl.b32 %r1543, %r1538, 23; + ex2.approx.ftz.f32 %r1544, %r1542; + mul.f32 %r1545, %r1544, %r1543; + fma.rn.ftz.f32 %r1546, %r1023, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1547, %r1546; + fma.rm.ftz.f32 %r1548, %r1547, %r373, %r372; + add.f32 %r1549, %r1548, 0fCB40007F; + neg.f32 %r1550, %r1549; + fma.rn.ftz.f32 %r1551, %r1023, %r377, %r1550; + fma.rn.ftz.f32 %r1552, %r1023, %r379, %r1551; + shl.b32 %r1553, %r1548, 23; + ex2.approx.ftz.f32 %r1554, %r1552; + mul.f32 %r1555, %r1554, %r1553; + fma.rn.ftz.f32 %r1556, %r1024, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1557, %r1556; + fma.rm.ftz.f32 %r1558, %r1557, %r373, %r372; + add.f32 %r1559, %r1558, 0fCB40007F; + neg.f32 %r1560, %r1559; + fma.rn.ftz.f32 %r1561, %r1024, %r377, %r1560; + fma.rn.ftz.f32 %r1562, %r1024, %r379, %r1561; + shl.b32 %r1563, %r1558, 23; + ex2.approx.ftz.f32 %r1564, %r1562; + mul.f32 %r1565, %r1564, %r1563; + fma.rn.ftz.f32 %r1566, %r1025, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1567, %r1566; + fma.rm.ftz.f32 %r1568, %r1567, %r373, %r372; + add.f32 %r1569, %r1568, 0fCB40007F; + neg.f32 %r1570, %r1569; + fma.rn.ftz.f32 %r1571, %r1025, %r377, %r1570; + fma.rn.ftz.f32 %r1572, %r1025, %r379, %r1571; + shl.b32 %r1573, %r1568, 23; + ex2.approx.ftz.f32 %r1574, %r1572; + mul.f32 %r1575, %r1574, %r1573; + fma.rn.ftz.f32 %r1576, %r1026, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1577, %r1576; + fma.rm.ftz.f32 %r1578, %r1577, %r373, %r372; + add.f32 %r1579, %r1578, 0fCB40007F; + neg.f32 %r1580, %r1579; + fma.rn.ftz.f32 %r1581, %r1026, %r377, %r1580; + fma.rn.ftz.f32 %r1582, %r1026, %r379, %r1581; + shl.b32 %r1583, %r1578, 23; + ex2.approx.ftz.f32 %r1584, %r1582; + mul.f32 %r1585, %r1584, %r1583; + fma.rn.ftz.f32 %r1586, %r1027, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1587, %r1586; + fma.rm.ftz.f32 %r1588, %r1587, %r373, %r372; + add.f32 %r1589, %r1588, 0fCB40007F; + neg.f32 %r1590, %r1589; + fma.rn.ftz.f32 %r1591, %r1027, %r377, %r1590; + fma.rn.ftz.f32 %r1592, %r1027, %r379, %r1591; + shl.b32 %r1593, %r1588, 23; + ex2.approx.ftz.f32 %r1594, %r1592; + mul.f32 %r1595, %r1594, %r1593; + fma.rn.ftz.f32 %r1596, %r1028, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1597, %r1596; + fma.rm.ftz.f32 %r1598, %r1597, %r373, %r372; + add.f32 %r1599, %r1598, 0fCB40007F; + neg.f32 %r1600, %r1599; + fma.rn.ftz.f32 %r1601, %r1028, %r377, %r1600; + fma.rn.ftz.f32 %r1602, %r1028, %r379, %r1601; + shl.b32 %r1603, %r1598, 23; + ex2.approx.ftz.f32 %r1604, %r1602; + mul.f32 %r1605, %r1604, %r1603; + fma.rn.ftz.f32 %r1606, %r1029, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1607, %r1606; + fma.rm.ftz.f32 %r1608, %r1607, %r373, %r372; + add.f32 %r1609, %r1608, 0fCB40007F; + neg.f32 %r1610, %r1609; + fma.rn.ftz.f32 %r1611, %r1029, %r377, %r1610; + fma.rn.ftz.f32 %r1612, %r1029, %r379, %r1611; + shl.b32 %r1613, %r1608, 23; + ex2.approx.ftz.f32 %r1614, %r1612; + mul.f32 %r1615, %r1614, %r1613; + fma.rn.ftz.f32 %r1616, %r1030, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1617, %r1616; + fma.rm.ftz.f32 %r1618, %r1617, %r373, %r372; + fma.rn.ftz.f32 %r1619, %r1031, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1620, %r1619; + fma.rm.ftz.f32 %r1621, %r1620, %r373, %r372; + fma.rn.ftz.f32 %r1622, %r1032, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1623, %r1622; + fma.rm.ftz.f32 %r1624, %r1623, %r373, %r372; + fma.rn.ftz.f32 %r1625, %r1033, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1626, %r1625; + fma.rm.ftz.f32 %r1627, %r1626, %r373, %r372; + fma.rn.ftz.f32 %r1628, %r1034, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1629, %r1628; + fma.rm.ftz.f32 %r1630, %r1629, %r373, %r372; + fma.rn.ftz.f32 %r1631, %r1035, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1632, %r1631; + fma.rm.ftz.f32 %r1633, %r1632, %r373, %r372; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.f32 %r1634, %r383, 0f00000000, %r1045; + selp.f32 %r1635, 0f3F800000, %r1634, %p39; + fma.rn.f32 %r1636, %r393, 0f00000000, %r1055; + selp.f32 %r1637, 0f3F800000, %r1636, %p40; + fma.rn.f32 %r1638, %r403, 0f00000000, %r1065; + selp.f32 %r1639, 0f3F800000, %r1638, %p41; + fma.rn.f32 %r1640, %r413, 0f00000000, %r1075; + selp.f32 %r1641, 0f3F800000, %r1640, %p42; + fma.rn.f32 %r1642, %r423, 0f00000000, %r1085; + selp.f32 %r1643, 0f3F800000, %r1642, %p43; + fma.rn.f32 %r1644, %r433, 0f00000000, %r1095; + selp.f32 %r1645, 0f3F800000, %r1644, %p44; + fma.rn.f32 %r1646, %r443, 0f00000000, %r1105; + selp.f32 %r1647, 0f3F800000, %r1646, %p45; + fma.rn.f32 %r1648, %r453, 0f00000000, %r1115; + selp.f32 %r1649, 0f3F800000, %r1648, %p46; + fma.rn.f32 %r1650, %r463, 0f00000000, %r1125; + selp.f32 %r1651, 0f3F800000, %r1650, %p47; + fma.rn.f32 %r1652, %r473, 0f00000000, %r1135; + selp.f32 %r1653, 0f3F800000, %r1652, %p48; + fma.rn.f32 %r1654, %r483, 0f00000000, %r1145; + selp.f32 %r1655, 0f3F800000, %r1654, %p49; + fma.rn.f32 %r1656, %r493, 0f00000000, %r1155; + selp.f32 %r1657, 0f3F800000, %r1656, %p50; + fma.rn.f32 %r1658, %r503, 0f00000000, %r1165; + selp.f32 %r1659, 0f3F800000, %r1658, %p51; + fma.rn.f32 %r1660, %r513, 0f00000000, %r1175; + selp.f32 %r1661, 0f3F800000, %r1660, %p52; + fma.rn.f32 %r1662, %r523, 0f00000000, %r1185; + selp.f32 %r1663, 0f3F800000, %r1662, %p53; + fma.rn.f32 %r1664, %r533, 0f00000000, %r1195; + selp.f32 %r1665, 0f3F800000, %r1664, %p54; + fma.rn.f32 %r1666, %r543, 0f00000000, %r1205; + selp.f32 %r1667, 0f3F800000, %r1666, %p55; + fma.rn.f32 %r1668, %r553, 0f00000000, %r1215; + selp.f32 %r1669, 0f3F800000, %r1668, %p56; + fma.rn.f32 %r1670, %r563, 0f00000000, %r1225; + selp.f32 %r1671, 0f3F800000, %r1670, %p57; + fma.rn.f32 %r1672, %r573, 0f00000000, %r1235; + selp.f32 %r1673, 0f3F800000, %r1672, %p58; + fma.rn.f32 %r1674, %r583, 0f00000000, %r1245; + selp.f32 %r1675, 0f3F800000, %r1674, %p59; + fma.rn.f32 %r1676, %r593, 0f00000000, %r1255; + selp.f32 %r1677, 0f3F800000, %r1676, %p60; + fma.rn.f32 %r1678, %r603, 0f00000000, %r1265; + selp.f32 %r1679, 0f3F800000, %r1678, %p61; + fma.rn.f32 %r1680, %r613, 0f00000000, %r1275; + selp.f32 %r1681, 0f3F800000, %r1680, %p62; + fma.rn.f32 %r1682, %r623, 0f00000000, %r1285; + selp.f32 %r1683, 0f3F800000, %r1682, %p63; + fma.rn.f32 %r1684, %r633, 0f00000000, %r1295; + selp.f32 %r1685, 0f3F800000, %r1684, %p64; + fma.rn.f32 %r1686, %r643, 0f00000000, %r1305; + selp.f32 %r1687, 0f3F800000, %r1686, %p65; + fma.rn.f32 %r1688, %r653, 0f00000000, %r1315; + selp.f32 %r1689, 0f3F800000, %r1688, %p66; + fma.rn.f32 %r1690, %r663, 0f00000000, %r1325; + selp.f32 %r1691, 0f3F800000, %r1690, %p67; + fma.rn.f32 %r1692, %r673, 0f00000000, %r1335; + selp.f32 %r1693, 0f3F800000, %r1692, %p68; + fma.rn.f32 %r1694, %r683, 0f00000000, %r1345; + selp.f32 %r1695, 0f3F800000, %r1694, %p69; + fma.rn.f32 %r1696, %r693, 0f00000000, %r1355; + selp.f32 %r1697, 0f3F800000, %r1696, %p70; + fma.rn.f32 %r1698, %r703, 0f00000000, %r1365; + selp.f32 %r1699, 0f3F800000, %r1698, %p71; + fma.rn.f32 %r1700, %r713, 0f00000000, %r1375; + selp.f32 %r1701, 0f3F800000, %r1700, %p72; + fma.rn.f32 %r1702, %r723, 0f00000000, %r1385; + selp.f32 %r1703, 0f3F800000, %r1702, %p73; + fma.rn.f32 %r1704, %r733, 0f00000000, %r1395; + selp.f32 %r1705, 0f3F800000, %r1704, %p74; + fma.rn.f32 %r1706, %r743, 0f00000000, %r1405; + selp.f32 %r1707, 0f3F800000, %r1706, %p75; + fma.rn.f32 %r1708, %r753, 0f00000000, %r1415; + selp.f32 %r1709, 0f3F800000, %r1708, %p76; + fma.rn.f32 %r1710, %r763, 0f00000000, %r1425; + selp.f32 %r1711, 0f3F800000, %r1710, %p77; + fma.rn.f32 %r1712, %r773, 0f00000000, %r1435; + selp.f32 %r1713, 0f3F800000, %r1712, %p78; + fma.rn.f32 %r1714, %r783, 0f00000000, %r1445; + selp.f32 %r1715, 0f3F800000, %r1714, %p79; + fma.rn.f32 %r1716, %r793, 0f00000000, %r1455; + selp.f32 %r1717, 0f3F800000, %r1716, %p80; + fma.rn.f32 %r1718, %r803, 0f00000000, %r1465; + selp.f32 %r1719, 0f3F800000, %r1718, %p81; + fma.rn.f32 %r1720, %r813, 0f00000000, %r1475; + selp.f32 %r1721, 0f3F800000, %r1720, %p82; + fma.rn.f32 %r1722, %r823, 0f00000000, %r1485; + selp.f32 %r1723, 0f3F800000, %r1722, %p83; + fma.rn.f32 %r1724, %r833, 0f00000000, %r1495; + selp.f32 %r1725, 0f3F800000, %r1724, %p84; + fma.rn.f32 %r1726, %r843, 0f00000000, %r1505; + selp.f32 %r1727, 0f3F800000, %r1726, %p85; + fma.rn.f32 %r1728, %r853, 0f00000000, %r1515; + selp.f32 %r1729, 0f3F800000, %r1728, %p86; + fma.rn.f32 %r1730, %r863, 0f00000000, %r1525; + selp.f32 %r1731, 0f3F800000, %r1730, %p87; + fma.rn.f32 %r1732, %r873, 0f00000000, %r1535; + selp.f32 %r1733, 0f3F800000, %r1732, %p88; + fma.rn.f32 %r1734, %r883, 0f00000000, %r1545; + selp.f32 %r1735, 0f3F800000, %r1734, %p89; + fma.rn.f32 %r1736, %r893, 0f00000000, %r1555; + selp.f32 %r1737, 0f3F800000, %r1736, %p90; + fma.rn.f32 %r1738, %r903, 0f00000000, %r1565; + selp.f32 %r1739, 0f3F800000, %r1738, %p91; + fma.rn.f32 %r1740, %r913, 0f00000000, %r1575; + selp.f32 %r1741, 0f3F800000, %r1740, %p92; + fma.rn.f32 %r1742, %r923, 0f00000000, %r1585; + selp.f32 %r1743, 0f3F800000, %r1742, %p93; + fma.rn.f32 %r1744, %r933, 0f00000000, %r1595; + selp.f32 %r1745, 0f3F800000, %r1744, %p94; + fma.rn.f32 %r1746, %r943, 0f00000000, %r1605; + selp.f32 %r1747, 0f3F800000, %r1746, %p95; + fma.rn.f32 %r1748, %r953, 0f00000000, %r1615; + selp.f32 %r1749, 0f3F800000, %r1748, %p96; +$L__tmp2: + .loc 1 45 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:45:54 + selp.f32 %r1750, %r295, 0fFF800000, %p8; + selp.f32 %r1751, %r296, 0fFF800000, %p8; + selp.f32 %r1752, %r297, 0fFF800000, %p8; + selp.f32 %r1753, %r298, 0fFF800000, %p8; + selp.f32 %r1754, %r299, 0fFF800000, %p8; + selp.f32 %r1755, %r300, 0fFF800000, %p8; + selp.f32 %r1756, %r301, 0fFF800000, %p8; + selp.f32 %r1757, %r302, 0fFF800000, %p8; + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r1758, %r1747, 0f00000000, %p8; + selp.f32 %r1759, %r1749, 0f00000000, %p8; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.bf16 %p97, %rs1, %rs2; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.bf16 %p98, %rs1, %rs1; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1760, %r239, %r240, %p98; + selp.f32 %r1761, %r239, %r1760, %p97; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p99, %r1761, %r241; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p100, %r1761, %r1761; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1762, %r1761, %r241, %p100; + selp.f32 %r1763, %r1761, %r1762, %p99; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p101, %r1763, %r242; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p102, %r1763, %r1763; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1764, %r1763, %r242, %p102; + selp.f32 %r1765, %r1763, %r1764, %p101; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p103, %r1765, %r243; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p104, %r1765, %r1765; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1766, %r1765, %r243, %p104; + selp.f32 %r1767, %r1765, %r1766, %p103; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p105, %r1767, %r244; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p106, %r1767, %r1767; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1768, %r1767, %r244, %p106; + selp.f32 %r1769, %r1767, %r1768, %p105; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p107, %r1769, %r245; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p108, %r1769, %r1769; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1770, %r1769, %r245, %p108; + selp.f32 %r1771, %r1769, %r1770, %p107; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p109, %r1771, %r246; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p110, %r1771, %r1771; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1772, %r1771, %r246, %p110; + selp.f32 %r1773, %r1771, %r1772, %p109; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p111, %r1773, %r247; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p112, %r1773, %r1773; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1774, %r1773, %r247, %p112; + selp.f32 %r1775, %r1773, %r1774, %p111; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p113, %r1775, %r248; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p114, %r1775, %r1775; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1776, %r1775, %r248, %p114; + selp.f32 %r1777, %r1775, %r1776, %p113; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p115, %r1777, %r249; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p116, %r1777, %r1777; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1778, %r1777, %r249, %p116; + selp.f32 %r1779, %r1777, %r1778, %p115; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p117, %r1779, %r250; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p118, %r1779, %r1779; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1780, %r1779, %r250, %p118; + selp.f32 %r1781, %r1779, %r1780, %p117; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p119, %r1781, %r251; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p120, %r1781, %r1781; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1782, %r1781, %r251, %p120; + selp.f32 %r1783, %r1781, %r1782, %p119; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p121, %r1783, %r252; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p122, %r1783, %r1783; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1784, %r1783, %r252, %p122; + selp.f32 %r1785, %r1783, %r1784, %p121; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p123, %r1785, %r253; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p124, %r1785, %r1785; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1786, %r1785, %r253, %p124; + selp.f32 %r1787, %r1785, %r1786, %p123; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p125, %r1787, %r254; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p126, %r1787, %r1787; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1788, %r1787, %r254, %p126; + selp.f32 %r1789, %r1787, %r1788, %p125; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p127, %r1789, %r255; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p128, %r1789, %r1789; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1790, %r1789, %r255, %p128; + selp.f32 %r1791, %r1789, %r1790, %p127; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p129, %r1791, %r256; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p130, %r1791, %r1791; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1792, %r1791, %r256, %p130; + selp.f32 %r1793, %r1791, %r1792, %p129; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p131, %r1793, %r257; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p132, %r1793, %r1793; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1794, %r1793, %r257, %p132; + selp.f32 %r1795, %r1793, %r1794, %p131; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p133, %r1795, %r258; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p134, %r1795, %r1795; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1796, %r1795, %r258, %p134; + selp.f32 %r1797, %r1795, %r1796, %p133; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p135, %r1797, %r259; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p136, %r1797, %r1797; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1798, %r1797, %r259, %p136; + selp.f32 %r1799, %r1797, %r1798, %p135; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p137, %r1799, %r260; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p138, %r1799, %r1799; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1800, %r1799, %r260, %p138; + selp.f32 %r1801, %r1799, %r1800, %p137; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p139, %r1801, %r261; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p140, %r1801, %r1801; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1802, %r1801, %r261, %p140; + selp.f32 %r1803, %r1801, %r1802, %p139; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p141, %r1803, %r262; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p142, %r1803, %r1803; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1804, %r1803, %r262, %p142; + selp.f32 %r1805, %r1803, %r1804, %p141; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p143, %r1805, %r263; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p144, %r1805, %r1805; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1806, %r1805, %r263, %p144; + selp.f32 %r1807, %r1805, %r1806, %p143; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p145, %r1807, %r264; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p146, %r1807, %r1807; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1808, %r1807, %r264, %p146; + selp.f32 %r1809, %r1807, %r1808, %p145; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p147, %r1809, %r265; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p148, %r1809, %r1809; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1810, %r1809, %r265, %p148; + selp.f32 %r1811, %r1809, %r1810, %p147; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p149, %r1811, %r266; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p150, %r1811, %r1811; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1812, %r1811, %r266, %p150; + selp.f32 %r1813, %r1811, %r1812, %p149; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p151, %r1813, %r267; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p152, %r1813, %r1813; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1814, %r1813, %r267, %p152; + selp.f32 %r1815, %r1813, %r1814, %p151; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p153, %r1815, %r268; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p154, %r1815, %r1815; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1816, %r1815, %r268, %p154; + selp.f32 %r1817, %r1815, %r1816, %p153; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p155, %r1817, %r269; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p156, %r1817, %r1817; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1818, %r1817, %r269, %p156; + selp.f32 %r1819, %r1817, %r1818, %p155; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p157, %r1819, %r270; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p158, %r1819, %r1819; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1820, %r1819, %r270, %p158; + selp.f32 %r1821, %r1819, %r1820, %p157; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p159, %r1821, %r271; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p160, %r1821, %r1821; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1822, %r1821, %r271, %p160; + selp.f32 %r1823, %r1821, %r1822, %p159; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p161, %r1823, %r272; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p162, %r1823, %r1823; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1824, %r1823, %r272, %p162; + selp.f32 %r1825, %r1823, %r1824, %p161; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p163, %r1825, %r273; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p164, %r1825, %r1825; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1826, %r1825, %r273, %p164; + selp.f32 %r1827, %r1825, %r1826, %p163; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p165, %r1827, %r274; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p166, %r1827, %r1827; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1828, %r1827, %r274, %p166; + selp.f32 %r1829, %r1827, %r1828, %p165; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p167, %r1829, %r275; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p168, %r1829, %r1829; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1830, %r1829, %r275, %p168; + selp.f32 %r1831, %r1829, %r1830, %p167; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p169, %r1831, %r276; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p170, %r1831, %r1831; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1832, %r1831, %r276, %p170; + selp.f32 %r1833, %r1831, %r1832, %p169; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p171, %r1833, %r277; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p172, %r1833, %r1833; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1834, %r1833, %r277, %p172; + selp.f32 %r1835, %r1833, %r1834, %p171; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p173, %r1835, %r278; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p174, %r1835, %r1835; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1836, %r1835, %r278, %p174; + selp.f32 %r1837, %r1835, %r1836, %p173; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p175, %r1837, %r279; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p176, %r1837, %r1837; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1838, %r1837, %r279, %p176; + selp.f32 %r1839, %r1837, %r1838, %p175; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p177, %r1839, %r280; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p178, %r1839, %r1839; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1840, %r1839, %r280, %p178; + selp.f32 %r1841, %r1839, %r1840, %p177; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p179, %r1841, %r281; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p180, %r1841, %r1841; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1842, %r1841, %r281, %p180; + selp.f32 %r1843, %r1841, %r1842, %p179; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p181, %r1843, %r282; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p182, %r1843, %r1843; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1844, %r1843, %r282, %p182; + selp.f32 %r1845, %r1843, %r1844, %p181; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p183, %r1845, %r283; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p184, %r1845, %r1845; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1846, %r1845, %r283, %p184; + selp.f32 %r1847, %r1845, %r1846, %p183; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p185, %r1847, %r284; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p186, %r1847, %r1847; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1848, %r1847, %r284, %p186; + selp.f32 %r1849, %r1847, %r1848, %p185; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p187, %r1849, %r285; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p188, %r1849, %r1849; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1850, %r1849, %r285, %p188; + selp.f32 %r1851, %r1849, %r1850, %p187; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p189, %r1851, %r286; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p190, %r1851, %r1851; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1852, %r1851, %r286, %p190; + selp.f32 %r1853, %r1851, %r1852, %p189; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p191, %r1853, %r287; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p192, %r1853, %r1853; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1854, %r1853, %r287, %p192; + selp.f32 %r1855, %r1853, %r1854, %p191; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p193, %r1855, %r288; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p194, %r1855, %r1855; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1856, %r1855, %r288, %p194; + selp.f32 %r1857, %r1855, %r1856, %p193; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p195, %r1857, %r289; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p196, %r1857, %r1857; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1858, %r1857, %r289, %p196; + selp.f32 %r1859, %r1857, %r1858, %p195; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p197, %r1859, %r290; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p198, %r1859, %r1859; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1860, %r1859, %r290, %p198; + selp.f32 %r1861, %r1859, %r1860, %p197; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p199, %r1861, %r291; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p200, %r1861, %r1861; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1862, %r1861, %r291, %p200; + selp.f32 %r1863, %r1861, %r1862, %p199; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p201, %r1863, %r292; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p202, %r1863, %r1863; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1864, %r1863, %r292, %p202; + selp.f32 %r1865, %r1863, %r1864, %p201; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p203, %r1865, %r293; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p204, %r1865, %r1865; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1866, %r1865, %r293, %p204; + selp.f32 %r1867, %r1865, %r1866, %p203; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p205, %r1867, %r294; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p206, %r1867, %r1867; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1868, %r1867, %r294, %p206; + selp.f32 %r1869, %r1867, %r1868, %p205; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p207, %r1869, %r1750; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p208, %r1869, %r1869; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1870, %r1869, %r1750, %p208; + selp.f32 %r1871, %r1869, %r1870, %p207; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p209, %r1871, %r1751; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p210, %r1871, %r1871; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1872, %r1871, %r1751, %p210; + selp.f32 %r1873, %r1871, %r1872, %p209; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p211, %r1873, %r1752; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p212, %r1873, %r1873; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1874, %r1873, %r1752, %p212; + selp.f32 %r1875, %r1873, %r1874, %p211; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p213, %r1875, %r1753; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p214, %r1875, %r1875; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1876, %r1875, %r1753, %p214; + selp.f32 %r1877, %r1875, %r1876, %p213; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p215, %r1877, %r1754; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p216, %r1877, %r1877; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1878, %r1877, %r1754, %p216; + selp.f32 %r1879, %r1877, %r1878, %p215; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p217, %r1879, %r1755; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p218, %r1879, %r1879; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1880, %r1879, %r1755, %p218; + selp.f32 %r1881, %r1879, %r1880, %p217; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p219, %r1881, %r1756; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p220, %r1881, %r1881; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1882, %r1881, %r1756, %p220; + selp.f32 %r1883, %r1881, %r1882, %p219; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p221, %r1883, %r1757; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p222, %r1883, %r1883; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1884, %r1883, %r1757, %p222; + selp.f32 %r1885, %r1883, %r1884, %p221; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1886, %r1885, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p223, %r1885, %r1886; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p224, %r1885, %r1885; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1887, %r1885, %r1886, %p223; + selp.f32 %r1888, %r1885, %r1887, %p224; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1889, %r1888, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p225, %r1888, %r1889; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p226, %r1888, %r1888; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1890, %r1888, %r1889, %p226; + selp.f32 %r1891, %r1888, %r1890, %p225; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1892, %r1891, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p227, %r1891, %r1892; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p228, %r1891, %r1891; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1893, %r1891, %r1892, %p228; + selp.f32 %r1894, %r1891, %r1893, %p227; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1895, %r1894, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p229, %r1894, %r1895; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p230, %r1894, %r1894; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1896, %r1894, %r1895, %p230; + selp.f32 %r1897, %r1894, %r1896, %p229; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1898, %r1897, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p231, %r1897, %r1898; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p232, %r1897, %r1897; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p9, %r207, 0; + shr.u32 %r1899, %r206, 3; + and.b32 %r1900, %r1899, 60; + mov.b32 %r1901, global_smem; + add.s32 %r65, %r1901, %r1900; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r1902, %r1897, %r1898, %p232; + selp.b32 %r66, %r1897, %r1902, %p231; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p9 st.shared.b32 [ %r65 + 0 ], %r66; + // end inline asm + bar.sync 0; + setp.lt.u32 %p10, %r206, 16; + shl.b32 %r1903, %r206, 2; + add.s32 %r68, %r1901, %r1903; + // begin inline asm + @%p10 ld.shared.b32 %r67, [ %r68 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1904, %r67, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p233, %r67, %r1904; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p234, %r67, %r67; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1905, %r67, %r1904, %p233; + selp.f32 %r1906, %r67, %r1905, %p234; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1907, %r1906, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p235, %r1906, %r1907; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p236, %r1906, %r1906; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1908, %r1906, %r1907, %p236; + selp.f32 %r1909, %r1906, %r1908, %p235; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1910, %r1909, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p237, %r1909, %r1910; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p238, %r1909, %r1909; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1911, %r1909, %r1910, %p238; + selp.f32 %r1912, %r1909, %r1911, %p237; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1913, %r1912, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p239, %r1912, %r1913; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p240, %r1912, %r1912; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p11, %r206, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r1914, %r1912, %r1913, %p240; + selp.b32 %r70, %r1912, %r1914, %p239; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p11 st.shared.b32 [ %r68 + 0 ], %r70; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1915, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.f32 %p241, %r1915, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + sub.f32 %r1916, %r239, %r1915; + sub.f32 %r1917, %r240, %r1915; + sub.f32 %r1918, %r241, %r1915; + sub.f32 %r1919, %r242, %r1915; + sub.f32 %r1920, %r243, %r1915; + sub.f32 %r1921, %r244, %r1915; + sub.f32 %r1922, %r245, %r1915; + sub.f32 %r1923, %r246, %r1915; + sub.f32 %r1924, %r247, %r1915; + sub.f32 %r1925, %r248, %r1915; + sub.f32 %r1926, %r249, %r1915; + sub.f32 %r1927, %r250, %r1915; + sub.f32 %r1928, %r251, %r1915; + sub.f32 %r1929, %r252, %r1915; + sub.f32 %r1930, %r253, %r1915; + sub.f32 %r1931, %r254, %r1915; + sub.f32 %r1932, %r255, %r1915; + sub.f32 %r1933, %r256, %r1915; + sub.f32 %r1934, %r257, %r1915; + sub.f32 %r1935, %r258, %r1915; + sub.f32 %r1936, %r259, %r1915; + sub.f32 %r1937, %r260, %r1915; + sub.f32 %r1938, %r261, %r1915; + sub.f32 %r1939, %r262, %r1915; + sub.f32 %r1940, %r263, %r1915; + sub.f32 %r1941, %r264, %r1915; + sub.f32 %r1942, %r265, %r1915; + sub.f32 %r1943, %r266, %r1915; + sub.f32 %r1944, %r267, %r1915; + sub.f32 %r1945, %r268, %r1915; + sub.f32 %r1946, %r269, %r1915; + sub.f32 %r1947, %r270, %r1915; + sub.f32 %r1948, %r271, %r1915; + sub.f32 %r1949, %r272, %r1915; + sub.f32 %r1950, %r273, %r1915; + sub.f32 %r1951, %r274, %r1915; + sub.f32 %r1952, %r275, %r1915; + sub.f32 %r1953, %r276, %r1915; + sub.f32 %r1954, %r277, %r1915; + sub.f32 %r1955, %r278, %r1915; + sub.f32 %r1956, %r279, %r1915; + sub.f32 %r1957, %r280, %r1915; + sub.f32 %r1958, %r281, %r1915; + sub.f32 %r1959, %r282, %r1915; + sub.f32 %r1960, %r283, %r1915; + sub.f32 %r1961, %r284, %r1915; + sub.f32 %r1962, %r285, %r1915; + sub.f32 %r1963, %r286, %r1915; + sub.f32 %r1964, %r287, %r1915; + sub.f32 %r1965, %r288, %r1915; + sub.f32 %r1966, %r289, %r1915; + sub.f32 %r1967, %r290, %r1915; + sub.f32 %r1968, %r291, %r1915; + sub.f32 %r1969, %r292, %r1915; + sub.f32 %r1970, %r293, %r1915; + sub.f32 %r1971, %r294, %r1915; + sub.f32 %r1972, %r1750, %r1915; + sub.f32 %r1973, %r1751, %r1915; + sub.f32 %r1974, %r1752, %r1915; + sub.f32 %r1975, %r1753, %r1915; + sub.f32 %r1976, %r1754, %r1915; + sub.f32 %r1977, %r1755, %r1915; + sub.f32 %r1978, %r1756, %r1915; + sub.f32 %r1979, %r1757, %r1915; + .loc 2 180 58 // triton_helpers.py:180:58 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1980, 0f00000000, %r1916, %p241; + selp.f32 %r1981, 0f00000000, %r1917, %p241; + selp.f32 %r1982, 0f00000000, %r1918, %p241; + selp.f32 %r1983, 0f00000000, %r1919, %p241; + selp.f32 %r1984, 0f00000000, %r1920, %p241; + selp.f32 %r1985, 0f00000000, %r1921, %p241; + selp.f32 %r1986, 0f00000000, %r1922, %p241; + selp.f32 %r1987, 0f00000000, %r1923, %p241; + selp.f32 %r1988, 0f00000000, %r1924, %p241; + selp.f32 %r1989, 0f00000000, %r1925, %p241; + selp.f32 %r1990, 0f00000000, %r1926, %p241; + selp.f32 %r1991, 0f00000000, %r1927, %p241; + selp.f32 %r1992, 0f00000000, %r1928, %p241; + selp.f32 %r1993, 0f00000000, %r1929, %p241; + selp.f32 %r1994, 0f00000000, %r1930, %p241; + selp.f32 %r1995, 0f00000000, %r1931, %p241; + selp.f32 %r1996, 0f00000000, %r1932, %p241; + selp.f32 %r1997, 0f00000000, %r1933, %p241; + selp.f32 %r1998, 0f00000000, %r1934, %p241; + selp.f32 %r1999, 0f00000000, %r1935, %p241; + selp.f32 %r2000, 0f00000000, %r1936, %p241; + selp.f32 %r2001, 0f00000000, %r1937, %p241; + selp.f32 %r2002, 0f00000000, %r1938, %p241; + selp.f32 %r2003, 0f00000000, %r1939, %p241; + selp.f32 %r2004, 0f00000000, %r1940, %p241; + selp.f32 %r2005, 0f00000000, %r1941, %p241; + selp.f32 %r2006, 0f00000000, %r1942, %p241; + selp.f32 %r2007, 0f00000000, %r1943, %p241; + selp.f32 %r2008, 0f00000000, %r1944, %p241; + selp.f32 %r2009, 0f00000000, %r1945, %p241; + selp.f32 %r2010, 0f00000000, %r1946, %p241; + selp.f32 %r2011, 0f00000000, %r1947, %p241; + selp.f32 %r2012, 0f00000000, %r1948, %p241; + selp.f32 %r2013, 0f00000000, %r1949, %p241; + selp.f32 %r2014, 0f00000000, %r1950, %p241; + selp.f32 %r2015, 0f00000000, %r1951, %p241; + selp.f32 %r2016, 0f00000000, %r1952, %p241; + selp.f32 %r2017, 0f00000000, %r1953, %p241; + selp.f32 %r2018, 0f00000000, %r1954, %p241; + selp.f32 %r2019, 0f00000000, %r1955, %p241; + selp.f32 %r2020, 0f00000000, %r1956, %p241; + selp.f32 %r2021, 0f00000000, %r1957, %p241; + selp.f32 %r2022, 0f00000000, %r1958, %p241; + selp.f32 %r2023, 0f00000000, %r1959, %p241; + selp.f32 %r2024, 0f00000000, %r1960, %p241; + selp.f32 %r2025, 0f00000000, %r1961, %p241; + selp.f32 %r2026, 0f00000000, %r1962, %p241; + selp.f32 %r2027, 0f00000000, %r1963, %p241; + selp.f32 %r2028, 0f00000000, %r1964, %p241; + selp.f32 %r2029, 0f00000000, %r1965, %p241; + selp.f32 %r2030, 0f00000000, %r1966, %p241; + selp.f32 %r2031, 0f00000000, %r1967, %p241; + selp.f32 %r2032, 0f00000000, %r1968, %p241; + selp.f32 %r2033, 0f00000000, %r1969, %p241; + selp.f32 %r2034, 0f00000000, %r1970, %p241; + selp.f32 %r2035, 0f00000000, %r1971, %p241; + selp.f32 %r2036, 0f00000000, %r1972, %p241; + selp.f32 %r2037, 0f00000000, %r1973, %p241; + selp.f32 %r2038, 0f00000000, %r1974, %p241; + selp.f32 %r2039, 0f00000000, %r1975, %p241; + selp.f32 %r2040, 0f00000000, %r1976, %p241; + selp.f32 %r2041, 0f00000000, %r1977, %p241; + selp.f32 %r2042, 0f00000000, %r1978, %p241; + selp.f32 %r2043, 0f00000000, %r1979, %p241; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.ftz.f32 %r2044, %r1980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2045, %r2044; + fma.rm.ftz.f32 %r2046, %r2045, %r373, %r372; + add.f32 %r2047, %r2046, 0fCB40007F; + neg.f32 %r2048, %r2047; + fma.rn.ftz.f32 %r2049, %r1980, %r377, %r2048; + fma.rn.ftz.f32 %r2050, %r1980, %r379, %r2049; + shl.b32 %r2051, %r2046, 23; + ex2.approx.ftz.f32 %r2052, %r2050; + mul.f32 %r2053, %r2052, %r2051; + fma.rn.ftz.f32 %r2054, %r1981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2055, %r2054; + fma.rm.ftz.f32 %r2056, %r2055, %r373, %r372; + add.f32 %r2057, %r2056, 0fCB40007F; + neg.f32 %r2058, %r2057; + fma.rn.ftz.f32 %r2059, %r1981, %r377, %r2058; + fma.rn.ftz.f32 %r2060, %r1981, %r379, %r2059; + shl.b32 %r2061, %r2056, 23; + ex2.approx.ftz.f32 %r2062, %r2060; + mul.f32 %r2063, %r2062, %r2061; + fma.rn.ftz.f32 %r2064, %r1982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2065, %r2064; + fma.rm.ftz.f32 %r2066, %r2065, %r373, %r372; + add.f32 %r2067, %r2066, 0fCB40007F; + neg.f32 %r2068, %r2067; + fma.rn.ftz.f32 %r2069, %r1982, %r377, %r2068; + fma.rn.ftz.f32 %r2070, %r1982, %r379, %r2069; + shl.b32 %r2071, %r2066, 23; + ex2.approx.ftz.f32 %r2072, %r2070; + mul.f32 %r2073, %r2072, %r2071; + fma.rn.ftz.f32 %r2074, %r1983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2075, %r2074; + fma.rm.ftz.f32 %r2076, %r2075, %r373, %r372; + add.f32 %r2077, %r2076, 0fCB40007F; + neg.f32 %r2078, %r2077; + fma.rn.ftz.f32 %r2079, %r1983, %r377, %r2078; + fma.rn.ftz.f32 %r2080, %r1983, %r379, %r2079; + shl.b32 %r2081, %r2076, 23; + ex2.approx.ftz.f32 %r2082, %r2080; + mul.f32 %r2083, %r2082, %r2081; + fma.rn.ftz.f32 %r2084, %r1984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2085, %r2084; + fma.rm.ftz.f32 %r2086, %r2085, %r373, %r372; + add.f32 %r2087, %r2086, 0fCB40007F; + neg.f32 %r2088, %r2087; + fma.rn.ftz.f32 %r2089, %r1984, %r377, %r2088; + fma.rn.ftz.f32 %r2090, %r1984, %r379, %r2089; + shl.b32 %r2091, %r2086, 23; + ex2.approx.ftz.f32 %r2092, %r2090; + mul.f32 %r2093, %r2092, %r2091; + fma.rn.ftz.f32 %r2094, %r1985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2095, %r2094; + fma.rm.ftz.f32 %r2096, %r2095, %r373, %r372; + add.f32 %r2097, %r2096, 0fCB40007F; + neg.f32 %r2098, %r2097; + fma.rn.ftz.f32 %r2099, %r1985, %r377, %r2098; + fma.rn.ftz.f32 %r2100, %r1985, %r379, %r2099; + shl.b32 %r2101, %r2096, 23; + ex2.approx.ftz.f32 %r2102, %r2100; + mul.f32 %r2103, %r2102, %r2101; + fma.rn.ftz.f32 %r2104, %r1986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2105, %r2104; + fma.rm.ftz.f32 %r2106, %r2105, %r373, %r372; + add.f32 %r2107, %r2106, 0fCB40007F; + neg.f32 %r2108, %r2107; + fma.rn.ftz.f32 %r2109, %r1986, %r377, %r2108; + fma.rn.ftz.f32 %r2110, %r1986, %r379, %r2109; + shl.b32 %r2111, %r2106, 23; + ex2.approx.ftz.f32 %r2112, %r2110; + mul.f32 %r2113, %r2112, %r2111; + fma.rn.ftz.f32 %r2114, %r1987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2115, %r2114; + fma.rm.ftz.f32 %r2116, %r2115, %r373, %r372; + add.f32 %r2117, %r2116, 0fCB40007F; + neg.f32 %r2118, %r2117; + fma.rn.ftz.f32 %r2119, %r1987, %r377, %r2118; + fma.rn.ftz.f32 %r2120, %r1987, %r379, %r2119; + shl.b32 %r2121, %r2116, 23; + ex2.approx.ftz.f32 %r2122, %r2120; + mul.f32 %r2123, %r2122, %r2121; + fma.rn.ftz.f32 %r2124, %r1988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2125, %r2124; + fma.rm.ftz.f32 %r2126, %r2125, %r373, %r372; + add.f32 %r2127, %r2126, 0fCB40007F; + neg.f32 %r2128, %r2127; + fma.rn.ftz.f32 %r2129, %r1988, %r377, %r2128; + fma.rn.ftz.f32 %r2130, %r1988, %r379, %r2129; + shl.b32 %r2131, %r2126, 23; + ex2.approx.ftz.f32 %r2132, %r2130; + mul.f32 %r2133, %r2132, %r2131; + fma.rn.ftz.f32 %r2134, %r1989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2135, %r2134; + fma.rm.ftz.f32 %r2136, %r2135, %r373, %r372; + add.f32 %r2137, %r2136, 0fCB40007F; + neg.f32 %r2138, %r2137; + fma.rn.ftz.f32 %r2139, %r1989, %r377, %r2138; + fma.rn.ftz.f32 %r2140, %r1989, %r379, %r2139; + shl.b32 %r2141, %r2136, 23; + ex2.approx.ftz.f32 %r2142, %r2140; + mul.f32 %r2143, %r2142, %r2141; + fma.rn.ftz.f32 %r2144, %r1990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2145, %r2144; + fma.rm.ftz.f32 %r2146, %r2145, %r373, %r372; + add.f32 %r2147, %r2146, 0fCB40007F; + neg.f32 %r2148, %r2147; + fma.rn.ftz.f32 %r2149, %r1990, %r377, %r2148; + fma.rn.ftz.f32 %r2150, %r1990, %r379, %r2149; + shl.b32 %r2151, %r2146, 23; + ex2.approx.ftz.f32 %r2152, %r2150; + mul.f32 %r2153, %r2152, %r2151; + fma.rn.ftz.f32 %r2154, %r1991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2155, %r2154; + fma.rm.ftz.f32 %r2156, %r2155, %r373, %r372; + add.f32 %r2157, %r2156, 0fCB40007F; + neg.f32 %r2158, %r2157; + fma.rn.ftz.f32 %r2159, %r1991, %r377, %r2158; + fma.rn.ftz.f32 %r2160, %r1991, %r379, %r2159; + shl.b32 %r2161, %r2156, 23; + ex2.approx.ftz.f32 %r2162, %r2160; + mul.f32 %r2163, %r2162, %r2161; + fma.rn.ftz.f32 %r2164, %r1992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2165, %r2164; + fma.rm.ftz.f32 %r2166, %r2165, %r373, %r372; + add.f32 %r2167, %r2166, 0fCB40007F; + neg.f32 %r2168, %r2167; + fma.rn.ftz.f32 %r2169, %r1992, %r377, %r2168; + fma.rn.ftz.f32 %r2170, %r1992, %r379, %r2169; + shl.b32 %r2171, %r2166, 23; + ex2.approx.ftz.f32 %r2172, %r2170; + mul.f32 %r2173, %r2172, %r2171; + fma.rn.ftz.f32 %r2174, %r1993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2175, %r2174; + fma.rm.ftz.f32 %r2176, %r2175, %r373, %r372; + add.f32 %r2177, %r2176, 0fCB40007F; + neg.f32 %r2178, %r2177; + fma.rn.ftz.f32 %r2179, %r1993, %r377, %r2178; + fma.rn.ftz.f32 %r2180, %r1993, %r379, %r2179; + shl.b32 %r2181, %r2176, 23; + ex2.approx.ftz.f32 %r2182, %r2180; + mul.f32 %r2183, %r2182, %r2181; + fma.rn.ftz.f32 %r2184, %r1994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2185, %r2184; + fma.rm.ftz.f32 %r2186, %r2185, %r373, %r372; + add.f32 %r2187, %r2186, 0fCB40007F; + neg.f32 %r2188, %r2187; + fma.rn.ftz.f32 %r2189, %r1994, %r377, %r2188; + fma.rn.ftz.f32 %r2190, %r1994, %r379, %r2189; + shl.b32 %r2191, %r2186, 23; + ex2.approx.ftz.f32 %r2192, %r2190; + mul.f32 %r2193, %r2192, %r2191; + fma.rn.ftz.f32 %r2194, %r1995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2195, %r2194; + fma.rm.ftz.f32 %r2196, %r2195, %r373, %r372; + add.f32 %r2197, %r2196, 0fCB40007F; + neg.f32 %r2198, %r2197; + fma.rn.ftz.f32 %r2199, %r1995, %r377, %r2198; + fma.rn.ftz.f32 %r2200, %r1995, %r379, %r2199; + shl.b32 %r2201, %r2196, 23; + ex2.approx.ftz.f32 %r2202, %r2200; + mul.f32 %r2203, %r2202, %r2201; + fma.rn.ftz.f32 %r2204, %r1996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2205, %r2204; + fma.rm.ftz.f32 %r2206, %r2205, %r373, %r372; + add.f32 %r2207, %r2206, 0fCB40007F; + neg.f32 %r2208, %r2207; + fma.rn.ftz.f32 %r2209, %r1996, %r377, %r2208; + fma.rn.ftz.f32 %r2210, %r1996, %r379, %r2209; + shl.b32 %r2211, %r2206, 23; + ex2.approx.ftz.f32 %r2212, %r2210; + mul.f32 %r2213, %r2212, %r2211; + fma.rn.ftz.f32 %r2214, %r1997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2215, %r2214; + fma.rm.ftz.f32 %r2216, %r2215, %r373, %r372; + add.f32 %r2217, %r2216, 0fCB40007F; + neg.f32 %r2218, %r2217; + fma.rn.ftz.f32 %r2219, %r1997, %r377, %r2218; + fma.rn.ftz.f32 %r2220, %r1997, %r379, %r2219; + shl.b32 %r2221, %r2216, 23; + ex2.approx.ftz.f32 %r2222, %r2220; + mul.f32 %r2223, %r2222, %r2221; + fma.rn.ftz.f32 %r2224, %r1998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2225, %r2224; + fma.rm.ftz.f32 %r2226, %r2225, %r373, %r372; + add.f32 %r2227, %r2226, 0fCB40007F; + neg.f32 %r2228, %r2227; + fma.rn.ftz.f32 %r2229, %r1998, %r377, %r2228; + fma.rn.ftz.f32 %r2230, %r1998, %r379, %r2229; + shl.b32 %r2231, %r2226, 23; + ex2.approx.ftz.f32 %r2232, %r2230; + mul.f32 %r2233, %r2232, %r2231; + fma.rn.ftz.f32 %r2234, %r1999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2235, %r2234; + fma.rm.ftz.f32 %r2236, %r2235, %r373, %r372; + add.f32 %r2237, %r2236, 0fCB40007F; + neg.f32 %r2238, %r2237; + fma.rn.ftz.f32 %r2239, %r1999, %r377, %r2238; + fma.rn.ftz.f32 %r2240, %r1999, %r379, %r2239; + shl.b32 %r2241, %r2236, 23; + ex2.approx.ftz.f32 %r2242, %r2240; + mul.f32 %r2243, %r2242, %r2241; + fma.rn.ftz.f32 %r2244, %r2000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2245, %r2244; + fma.rm.ftz.f32 %r2246, %r2245, %r373, %r372; + add.f32 %r2247, %r2246, 0fCB40007F; + neg.f32 %r2248, %r2247; + fma.rn.ftz.f32 %r2249, %r2000, %r377, %r2248; + fma.rn.ftz.f32 %r2250, %r2000, %r379, %r2249; + shl.b32 %r2251, %r2246, 23; + ex2.approx.ftz.f32 %r2252, %r2250; + mul.f32 %r2253, %r2252, %r2251; + fma.rn.ftz.f32 %r2254, %r2001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2255, %r2254; + fma.rm.ftz.f32 %r2256, %r2255, %r373, %r372; + add.f32 %r2257, %r2256, 0fCB40007F; + neg.f32 %r2258, %r2257; + fma.rn.ftz.f32 %r2259, %r2001, %r377, %r2258; + fma.rn.ftz.f32 %r2260, %r2001, %r379, %r2259; + shl.b32 %r2261, %r2256, 23; + ex2.approx.ftz.f32 %r2262, %r2260; + mul.f32 %r2263, %r2262, %r2261; + fma.rn.ftz.f32 %r2264, %r2002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2265, %r2264; + fma.rm.ftz.f32 %r2266, %r2265, %r373, %r372; + add.f32 %r2267, %r2266, 0fCB40007F; + neg.f32 %r2268, %r2267; + fma.rn.ftz.f32 %r2269, %r2002, %r377, %r2268; + fma.rn.ftz.f32 %r2270, %r2002, %r379, %r2269; + shl.b32 %r2271, %r2266, 23; + ex2.approx.ftz.f32 %r2272, %r2270; + mul.f32 %r2273, %r2272, %r2271; + fma.rn.ftz.f32 %r2274, %r2003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2275, %r2274; + fma.rm.ftz.f32 %r2276, %r2275, %r373, %r372; + add.f32 %r2277, %r2276, 0fCB40007F; + neg.f32 %r2278, %r2277; + fma.rn.ftz.f32 %r2279, %r2003, %r377, %r2278; + fma.rn.ftz.f32 %r2280, %r2003, %r379, %r2279; + shl.b32 %r2281, %r2276, 23; + ex2.approx.ftz.f32 %r2282, %r2280; + mul.f32 %r2283, %r2282, %r2281; + fma.rn.ftz.f32 %r2284, %r2004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2285, %r2284; + fma.rm.ftz.f32 %r2286, %r2285, %r373, %r372; + add.f32 %r2287, %r2286, 0fCB40007F; + neg.f32 %r2288, %r2287; + fma.rn.ftz.f32 %r2289, %r2004, %r377, %r2288; + fma.rn.ftz.f32 %r2290, %r2004, %r379, %r2289; + shl.b32 %r2291, %r2286, 23; + ex2.approx.ftz.f32 %r2292, %r2290; + mul.f32 %r2293, %r2292, %r2291; + fma.rn.ftz.f32 %r2294, %r2005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2295, %r2294; + fma.rm.ftz.f32 %r2296, %r2295, %r373, %r372; + add.f32 %r2297, %r2296, 0fCB40007F; + neg.f32 %r2298, %r2297; + fma.rn.ftz.f32 %r2299, %r2005, %r377, %r2298; + fma.rn.ftz.f32 %r2300, %r2005, %r379, %r2299; + shl.b32 %r2301, %r2296, 23; + ex2.approx.ftz.f32 %r2302, %r2300; + mul.f32 %r2303, %r2302, %r2301; + fma.rn.ftz.f32 %r2304, %r2006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2305, %r2304; + fma.rm.ftz.f32 %r2306, %r2305, %r373, %r372; + add.f32 %r2307, %r2306, 0fCB40007F; + neg.f32 %r2308, %r2307; + fma.rn.ftz.f32 %r2309, %r2006, %r377, %r2308; + fma.rn.ftz.f32 %r2310, %r2006, %r379, %r2309; + shl.b32 %r2311, %r2306, 23; + ex2.approx.ftz.f32 %r2312, %r2310; + mul.f32 %r2313, %r2312, %r2311; + fma.rn.ftz.f32 %r2314, %r2007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2315, %r2314; + fma.rm.ftz.f32 %r2316, %r2315, %r373, %r372; + add.f32 %r2317, %r2316, 0fCB40007F; + neg.f32 %r2318, %r2317; + fma.rn.ftz.f32 %r2319, %r2007, %r377, %r2318; + fma.rn.ftz.f32 %r2320, %r2007, %r379, %r2319; + shl.b32 %r2321, %r2316, 23; + ex2.approx.ftz.f32 %r2322, %r2320; + mul.f32 %r2323, %r2322, %r2321; + fma.rn.ftz.f32 %r2324, %r2008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2325, %r2324; + fma.rm.ftz.f32 %r2326, %r2325, %r373, %r372; + add.f32 %r2327, %r2326, 0fCB40007F; + neg.f32 %r2328, %r2327; + fma.rn.ftz.f32 %r2329, %r2008, %r377, %r2328; + fma.rn.ftz.f32 %r2330, %r2008, %r379, %r2329; + shl.b32 %r2331, %r2326, 23; + ex2.approx.ftz.f32 %r2332, %r2330; + mul.f32 %r2333, %r2332, %r2331; + fma.rn.ftz.f32 %r2334, %r2009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2335, %r2334; + fma.rm.ftz.f32 %r2336, %r2335, %r373, %r372; + add.f32 %r2337, %r2336, 0fCB40007F; + neg.f32 %r2338, %r2337; + fma.rn.ftz.f32 %r2339, %r2009, %r377, %r2338; + fma.rn.ftz.f32 %r2340, %r2009, %r379, %r2339; + shl.b32 %r2341, %r2336, 23; + ex2.approx.ftz.f32 %r2342, %r2340; + mul.f32 %r2343, %r2342, %r2341; + fma.rn.ftz.f32 %r2344, %r2010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2345, %r2344; + fma.rm.ftz.f32 %r2346, %r2345, %r373, %r372; + add.f32 %r2347, %r2346, 0fCB40007F; + neg.f32 %r2348, %r2347; + fma.rn.ftz.f32 %r2349, %r2010, %r377, %r2348; + fma.rn.ftz.f32 %r2350, %r2010, %r379, %r2349; + shl.b32 %r2351, %r2346, 23; + ex2.approx.ftz.f32 %r2352, %r2350; + mul.f32 %r2353, %r2352, %r2351; + fma.rn.ftz.f32 %r2354, %r2011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2355, %r2354; + fma.rm.ftz.f32 %r2356, %r2355, %r373, %r372; + add.f32 %r2357, %r2356, 0fCB40007F; + neg.f32 %r2358, %r2357; + fma.rn.ftz.f32 %r2359, %r2011, %r377, %r2358; + fma.rn.ftz.f32 %r2360, %r2011, %r379, %r2359; + shl.b32 %r2361, %r2356, 23; + ex2.approx.ftz.f32 %r2362, %r2360; + mul.f32 %r2363, %r2362, %r2361; + fma.rn.ftz.f32 %r2364, %r2012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2365, %r2364; + fma.rm.ftz.f32 %r2366, %r2365, %r373, %r372; + add.f32 %r2367, %r2366, 0fCB40007F; + neg.f32 %r2368, %r2367; + fma.rn.ftz.f32 %r2369, %r2012, %r377, %r2368; + fma.rn.ftz.f32 %r2370, %r2012, %r379, %r2369; + shl.b32 %r2371, %r2366, 23; + ex2.approx.ftz.f32 %r2372, %r2370; + mul.f32 %r2373, %r2372, %r2371; + fma.rn.ftz.f32 %r2374, %r2013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2375, %r2374; + fma.rm.ftz.f32 %r2376, %r2375, %r373, %r372; + add.f32 %r2377, %r2376, 0fCB40007F; + neg.f32 %r2378, %r2377; + fma.rn.ftz.f32 %r2379, %r2013, %r377, %r2378; + fma.rn.ftz.f32 %r2380, %r2013, %r379, %r2379; + shl.b32 %r2381, %r2376, 23; + ex2.approx.ftz.f32 %r2382, %r2380; + mul.f32 %r2383, %r2382, %r2381; + fma.rn.ftz.f32 %r2384, %r2014, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2385, %r2384; + fma.rm.ftz.f32 %r2386, %r2385, %r373, %r372; + add.f32 %r2387, %r2386, 0fCB40007F; + neg.f32 %r2388, %r2387; + fma.rn.ftz.f32 %r2389, %r2014, %r377, %r2388; + fma.rn.ftz.f32 %r2390, %r2014, %r379, %r2389; + shl.b32 %r2391, %r2386, 23; + ex2.approx.ftz.f32 %r2392, %r2390; + mul.f32 %r2393, %r2392, %r2391; + fma.rn.ftz.f32 %r2394, %r2015, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2395, %r2394; + fma.rm.ftz.f32 %r2396, %r2395, %r373, %r372; + add.f32 %r2397, %r2396, 0fCB40007F; + neg.f32 %r2398, %r2397; + fma.rn.ftz.f32 %r2399, %r2015, %r377, %r2398; + fma.rn.ftz.f32 %r2400, %r2015, %r379, %r2399; + shl.b32 %r2401, %r2396, 23; + ex2.approx.ftz.f32 %r2402, %r2400; + mul.f32 %r2403, %r2402, %r2401; + fma.rn.ftz.f32 %r2404, %r2016, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2405, %r2404; + fma.rm.ftz.f32 %r2406, %r2405, %r373, %r372; + add.f32 %r2407, %r2406, 0fCB40007F; + neg.f32 %r2408, %r2407; + fma.rn.ftz.f32 %r2409, %r2016, %r377, %r2408; + fma.rn.ftz.f32 %r2410, %r2016, %r379, %r2409; + shl.b32 %r2411, %r2406, 23; + ex2.approx.ftz.f32 %r2412, %r2410; + mul.f32 %r2413, %r2412, %r2411; + fma.rn.ftz.f32 %r2414, %r2017, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2415, %r2414; + fma.rm.ftz.f32 %r2416, %r2415, %r373, %r372; + add.f32 %r2417, %r2416, 0fCB40007F; + neg.f32 %r2418, %r2417; + fma.rn.ftz.f32 %r2419, %r2017, %r377, %r2418; + fma.rn.ftz.f32 %r2420, %r2017, %r379, %r2419; + shl.b32 %r2421, %r2416, 23; + ex2.approx.ftz.f32 %r2422, %r2420; + mul.f32 %r2423, %r2422, %r2421; + fma.rn.ftz.f32 %r2424, %r2018, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2425, %r2424; + fma.rm.ftz.f32 %r2426, %r2425, %r373, %r372; + add.f32 %r2427, %r2426, 0fCB40007F; + neg.f32 %r2428, %r2427; + fma.rn.ftz.f32 %r2429, %r2018, %r377, %r2428; + fma.rn.ftz.f32 %r2430, %r2018, %r379, %r2429; + shl.b32 %r2431, %r2426, 23; + ex2.approx.ftz.f32 %r2432, %r2430; + mul.f32 %r2433, %r2432, %r2431; + fma.rn.ftz.f32 %r2434, %r2019, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2435, %r2434; + fma.rm.ftz.f32 %r2436, %r2435, %r373, %r372; + add.f32 %r2437, %r2436, 0fCB40007F; + neg.f32 %r2438, %r2437; + fma.rn.ftz.f32 %r2439, %r2019, %r377, %r2438; + fma.rn.ftz.f32 %r2440, %r2019, %r379, %r2439; + shl.b32 %r2441, %r2436, 23; + ex2.approx.ftz.f32 %r2442, %r2440; + mul.f32 %r2443, %r2442, %r2441; + fma.rn.ftz.f32 %r2444, %r2020, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2445, %r2444; + fma.rm.ftz.f32 %r2446, %r2445, %r373, %r372; + add.f32 %r2447, %r2446, 0fCB40007F; + neg.f32 %r2448, %r2447; + fma.rn.ftz.f32 %r2449, %r2020, %r377, %r2448; + fma.rn.ftz.f32 %r2450, %r2020, %r379, %r2449; + shl.b32 %r2451, %r2446, 23; + ex2.approx.ftz.f32 %r2452, %r2450; + mul.f32 %r2453, %r2452, %r2451; + fma.rn.ftz.f32 %r2454, %r2021, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2455, %r2454; + fma.rm.ftz.f32 %r2456, %r2455, %r373, %r372; + add.f32 %r2457, %r2456, 0fCB40007F; + neg.f32 %r2458, %r2457; + fma.rn.ftz.f32 %r2459, %r2021, %r377, %r2458; + fma.rn.ftz.f32 %r2460, %r2021, %r379, %r2459; + shl.b32 %r2461, %r2456, 23; + ex2.approx.ftz.f32 %r2462, %r2460; + mul.f32 %r2463, %r2462, %r2461; + fma.rn.ftz.f32 %r2464, %r2022, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2465, %r2464; + fma.rm.ftz.f32 %r2466, %r2465, %r373, %r372; + add.f32 %r2467, %r2466, 0fCB40007F; + neg.f32 %r2468, %r2467; + fma.rn.ftz.f32 %r2469, %r2022, %r377, %r2468; + fma.rn.ftz.f32 %r2470, %r2022, %r379, %r2469; + shl.b32 %r2471, %r2466, 23; + ex2.approx.ftz.f32 %r2472, %r2470; + mul.f32 %r2473, %r2472, %r2471; + fma.rn.ftz.f32 %r2474, %r2023, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2475, %r2474; + fma.rm.ftz.f32 %r2476, %r2475, %r373, %r372; + add.f32 %r2477, %r2476, 0fCB40007F; + neg.f32 %r2478, %r2477; + fma.rn.ftz.f32 %r2479, %r2023, %r377, %r2478; + fma.rn.ftz.f32 %r2480, %r2023, %r379, %r2479; + shl.b32 %r2481, %r2476, 23; + ex2.approx.ftz.f32 %r2482, %r2480; + mul.f32 %r2483, %r2482, %r2481; + fma.rn.ftz.f32 %r2484, %r2024, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2485, %r2484; + fma.rm.ftz.f32 %r2486, %r2485, %r373, %r372; + add.f32 %r2487, %r2486, 0fCB40007F; + neg.f32 %r2488, %r2487; + fma.rn.ftz.f32 %r2489, %r2024, %r377, %r2488; + fma.rn.ftz.f32 %r2490, %r2024, %r379, %r2489; + shl.b32 %r2491, %r2486, 23; + ex2.approx.ftz.f32 %r2492, %r2490; + mul.f32 %r2493, %r2492, %r2491; + fma.rn.ftz.f32 %r2494, %r2025, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2495, %r2494; + fma.rm.ftz.f32 %r2496, %r2495, %r373, %r372; + add.f32 %r2497, %r2496, 0fCB40007F; + neg.f32 %r2498, %r2497; + fma.rn.ftz.f32 %r2499, %r2025, %r377, %r2498; + fma.rn.ftz.f32 %r2500, %r2025, %r379, %r2499; + shl.b32 %r2501, %r2496, 23; + ex2.approx.ftz.f32 %r2502, %r2500; + mul.f32 %r2503, %r2502, %r2501; + fma.rn.ftz.f32 %r2504, %r2026, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2505, %r2504; + fma.rm.ftz.f32 %r2506, %r2505, %r373, %r372; + add.f32 %r2507, %r2506, 0fCB40007F; + neg.f32 %r2508, %r2507; + fma.rn.ftz.f32 %r2509, %r2026, %r377, %r2508; + fma.rn.ftz.f32 %r2510, %r2026, %r379, %r2509; + shl.b32 %r2511, %r2506, 23; + ex2.approx.ftz.f32 %r2512, %r2510; + mul.f32 %r2513, %r2512, %r2511; + fma.rn.ftz.f32 %r2514, %r2027, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2515, %r2514; + fma.rm.ftz.f32 %r2516, %r2515, %r373, %r372; + add.f32 %r2517, %r2516, 0fCB40007F; + neg.f32 %r2518, %r2517; + fma.rn.ftz.f32 %r2519, %r2027, %r377, %r2518; + fma.rn.ftz.f32 %r2520, %r2027, %r379, %r2519; + shl.b32 %r2521, %r2516, 23; + ex2.approx.ftz.f32 %r2522, %r2520; + mul.f32 %r2523, %r2522, %r2521; + fma.rn.ftz.f32 %r2524, %r2028, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2525, %r2524; + fma.rm.ftz.f32 %r2526, %r2525, %r373, %r372; + add.f32 %r2527, %r2526, 0fCB40007F; + neg.f32 %r2528, %r2527; + fma.rn.ftz.f32 %r2529, %r2028, %r377, %r2528; + fma.rn.ftz.f32 %r2530, %r2028, %r379, %r2529; + shl.b32 %r2531, %r2526, 23; + ex2.approx.ftz.f32 %r2532, %r2530; + mul.f32 %r2533, %r2532, %r2531; + fma.rn.ftz.f32 %r2534, %r2029, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2535, %r2534; + fma.rm.ftz.f32 %r2536, %r2535, %r373, %r372; + add.f32 %r2537, %r2536, 0fCB40007F; + neg.f32 %r2538, %r2537; + fma.rn.ftz.f32 %r2539, %r2029, %r377, %r2538; + fma.rn.ftz.f32 %r2540, %r2029, %r379, %r2539; + shl.b32 %r2541, %r2536, 23; + ex2.approx.ftz.f32 %r2542, %r2540; + mul.f32 %r2543, %r2542, %r2541; + fma.rn.ftz.f32 %r2544, %r2030, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2545, %r2544; + fma.rm.ftz.f32 %r2546, %r2545, %r373, %r372; + add.f32 %r2547, %r2546, 0fCB40007F; + neg.f32 %r2548, %r2547; + fma.rn.ftz.f32 %r2549, %r2030, %r377, %r2548; + fma.rn.ftz.f32 %r2550, %r2030, %r379, %r2549; + shl.b32 %r2551, %r2546, 23; + ex2.approx.ftz.f32 %r2552, %r2550; + mul.f32 %r2553, %r2552, %r2551; + fma.rn.ftz.f32 %r2554, %r2031, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2555, %r2554; + fma.rm.ftz.f32 %r2556, %r2555, %r373, %r372; + add.f32 %r2557, %r2556, 0fCB40007F; + neg.f32 %r2558, %r2557; + fma.rn.ftz.f32 %r2559, %r2031, %r377, %r2558; + fma.rn.ftz.f32 %r2560, %r2031, %r379, %r2559; + shl.b32 %r2561, %r2556, 23; + ex2.approx.ftz.f32 %r2562, %r2560; + mul.f32 %r2563, %r2562, %r2561; + fma.rn.ftz.f32 %r2564, %r2032, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2565, %r2564; + fma.rm.ftz.f32 %r2566, %r2565, %r373, %r372; + add.f32 %r2567, %r2566, 0fCB40007F; + neg.f32 %r2568, %r2567; + fma.rn.ftz.f32 %r2569, %r2032, %r377, %r2568; + fma.rn.ftz.f32 %r2570, %r2032, %r379, %r2569; + shl.b32 %r2571, %r2566, 23; + ex2.approx.ftz.f32 %r2572, %r2570; + mul.f32 %r2573, %r2572, %r2571; + fma.rn.ftz.f32 %r2574, %r2033, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2575, %r2574; + fma.rm.ftz.f32 %r2576, %r2575, %r373, %r372; + add.f32 %r2577, %r2576, 0fCB40007F; + neg.f32 %r2578, %r2577; + fma.rn.ftz.f32 %r2579, %r2033, %r377, %r2578; + fma.rn.ftz.f32 %r2580, %r2033, %r379, %r2579; + shl.b32 %r2581, %r2576, 23; + ex2.approx.ftz.f32 %r2582, %r2580; + mul.f32 %r2583, %r2582, %r2581; + fma.rn.ftz.f32 %r2584, %r2034, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2585, %r2584; + fma.rm.ftz.f32 %r2586, %r2585, %r373, %r372; + add.f32 %r2587, %r2586, 0fCB40007F; + neg.f32 %r2588, %r2587; + fma.rn.ftz.f32 %r2589, %r2034, %r377, %r2588; + fma.rn.ftz.f32 %r2590, %r2034, %r379, %r2589; + shl.b32 %r2591, %r2586, 23; + ex2.approx.ftz.f32 %r2592, %r2590; + mul.f32 %r2593, %r2592, %r2591; + fma.rn.ftz.f32 %r2594, %r2035, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2595, %r2594; + fma.rm.ftz.f32 %r2596, %r2595, %r373, %r372; + add.f32 %r2597, %r2596, 0fCB40007F; + neg.f32 %r2598, %r2597; + fma.rn.ftz.f32 %r2599, %r2035, %r377, %r2598; + fma.rn.ftz.f32 %r2600, %r2035, %r379, %r2599; + shl.b32 %r2601, %r2596, 23; + ex2.approx.ftz.f32 %r2602, %r2600; + mul.f32 %r2603, %r2602, %r2601; + fma.rn.ftz.f32 %r2604, %r2036, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2605, %r2604; + fma.rm.ftz.f32 %r2606, %r2605, %r373, %r372; + add.f32 %r2607, %r2606, 0fCB40007F; + neg.f32 %r2608, %r2607; + fma.rn.ftz.f32 %r2609, %r2036, %r377, %r2608; + fma.rn.ftz.f32 %r2610, %r2036, %r379, %r2609; + shl.b32 %r2611, %r2606, 23; + ex2.approx.ftz.f32 %r2612, %r2610; + mul.f32 %r2613, %r2612, %r2611; + fma.rn.ftz.f32 %r2614, %r2037, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2615, %r2614; + fma.rm.ftz.f32 %r2616, %r2615, %r373, %r372; + add.f32 %r2617, %r2616, 0fCB40007F; + neg.f32 %r2618, %r2617; + fma.rn.ftz.f32 %r2619, %r2037, %r377, %r2618; + fma.rn.ftz.f32 %r2620, %r2037, %r379, %r2619; + shl.b32 %r2621, %r2616, 23; + ex2.approx.ftz.f32 %r2622, %r2620; + mul.f32 %r2623, %r2622, %r2621; + fma.rn.ftz.f32 %r2624, %r2038, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2625, %r2624; + fma.rm.ftz.f32 %r2626, %r2625, %r373, %r372; + fma.rn.ftz.f32 %r2627, %r2039, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2628, %r2627; + fma.rm.ftz.f32 %r2629, %r2628, %r373, %r372; + fma.rn.ftz.f32 %r2630, %r2040, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2631, %r2630; + fma.rm.ftz.f32 %r2632, %r2631, %r373, %r372; + fma.rn.ftz.f32 %r2633, %r2041, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2634, %r2633; + fma.rm.ftz.f32 %r2635, %r2634, %r373, %r372; + fma.rn.ftz.f32 %r2636, %r2042, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2637, %r2636; + fma.rm.ftz.f32 %r2638, %r2637, %r373, %r372; + fma.rn.ftz.f32 %r2639, %r2043, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2640, %r2639; + fma.rm.ftz.f32 %r2641, %r2640, %r373, %r372; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mul.f32 %r2642, %r1637, %r2063; + mov.b32 %r2643, -8323200; +$L__tmp4: + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.bf16x2 %p242|%p243, %r58, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 %rd67, {%r956, %r959}; + cvt.u32.u64 %r2644, %rd67; + add.f32 %r2645, %r956, 0fCB40007F; + neg.f32 %r2646, %r2645; + fma.rn.ftz.f32 %r2647, %r362, %r377, %r2646; + fma.rn.ftz.f32 %r2648, %r362, %r379, %r2647; + ex2.approx.ftz.f32 %r2649, %r2648; + add.f32 %r2650, %r959, 0fCB40007F; + neg.f32 %r2651, %r2650; + fma.rn.ftz.f32 %r2652, %r363, %r377, %r2651; + fma.rn.ftz.f32 %r2653, %r363, %r379, %r2652; + shl.b64 %rd68, %rd67, 23; + and.b64 %rd69, %rd68, -36028797018963968; + shl.b32 %r2654, %r2644, 23; + cvt.u64.u32 %rd70, %r2654; + or.b64 %rd71, %rd70, %rd69; + ex2.approx.ftz.f32 %r2655, %r2653; + mov.b64 {%r2656, %r2657}, %rd71; + mul.f32 %r2658, %r2649, %r2656; + mul.f32 %r2659, %r2655, %r2657; + mov.b64 %rd72, {%r1618, %r1621}; + cvt.u32.u64 %r2660, %rd72; + add.f32 %r2661, %r1618, 0fCB40007F; + neg.f32 %r2662, %r2661; + fma.rn.ftz.f32 %r2663, %r1030, %r377, %r2662; + fma.rn.ftz.f32 %r2664, %r1030, %r379, %r2663; + ex2.approx.ftz.f32 %r2665, %r2664; + add.f32 %r2666, %r1621, 0fCB40007F; + neg.f32 %r2667, %r2666; + fma.rn.ftz.f32 %r2668, %r1031, %r377, %r2667; + fma.rn.ftz.f32 %r2669, %r1031, %r379, %r2668; + shl.b64 %rd73, %rd72, 23; + and.b64 %rd74, %rd73, -36028797018963968; + shl.b32 %r2670, %r2660, 23; + cvt.u64.u32 %rd75, %r2670; + or.b64 %rd76, %rd75, %rd74; + ex2.approx.ftz.f32 %r2671, %r2669; + mov.b64 {%r2672, %r2673}, %rd76; + mul.f32 %r2674, %r2665, %r2672; + mul.f32 %r2675, %r2671, %r2673; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.f32 %r2676, %r2659, 0f00000000, %r2675; + fma.rn.f32 %r2677, %r2658, 0f00000000, %r2674; + selp.f32 %r2678, 0f3F800000, %r2677, %p242; + selp.f32 %r2679, 0f3F800000, %r2676, %p243; +$L__tmp5: + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r2680, %r2679, 0f00000000, %p8; + selp.f32 %r2681, %r2678, 0f00000000, %p8; +$L__tmp6: + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd77, {%r2626, %r2629}; + cvt.u32.u64 %r2682, %rd77; + add.f32 %r2683, %r2626, 0fCB40007F; + neg.f32 %r2684, %r2683; + fma.rn.ftz.f32 %r2685, %r2038, %r377, %r2684; + fma.rn.ftz.f32 %r2686, %r2038, %r379, %r2685; + ex2.approx.ftz.f32 %r2687, %r2686; + add.f32 %r2688, %r2629, 0fCB40007F; + neg.f32 %r2689, %r2688; + fma.rn.ftz.f32 %r2690, %r2039, %r377, %r2689; + fma.rn.ftz.f32 %r2691, %r2039, %r379, %r2690; + shl.b64 %rd78, %rd77, 23; + and.b64 %rd79, %rd78, -36028797018963968; + shl.b32 %r2692, %r2682, 23; + cvt.u64.u32 %rd80, %r2692; + or.b64 %rd81, %rd80, %rd79; + ex2.approx.ftz.f32 %r2693, %r2691; + mov.b64 {%r2694, %r2695}, %rd81; + mul.f32 %r2696, %r2693, %r2695; + mul.f32 %r2697, %r2687, %r2694; +$L__tmp7: + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.bf16x2 %p244|%p245, %r59, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 %rd82, {%r962, %r965}; + cvt.u32.u64 %r2698, %rd82; + add.f32 %r2699, %r962, 0fCB40007F; + neg.f32 %r2700, %r2699; + fma.rn.ftz.f32 %r2701, %r364, %r377, %r2700; + fma.rn.ftz.f32 %r2702, %r364, %r379, %r2701; + ex2.approx.ftz.f32 %r2703, %r2702; + add.f32 %r2704, %r965, 0fCB40007F; + neg.f32 %r2705, %r2704; + fma.rn.ftz.f32 %r2706, %r365, %r377, %r2705; + fma.rn.ftz.f32 %r2707, %r365, %r379, %r2706; + shl.b64 %rd83, %rd82, 23; + and.b64 %rd84, %rd83, -36028797018963968; + shl.b32 %r2708, %r2698, 23; + cvt.u64.u32 %rd85, %r2708; + or.b64 %rd86, %rd85, %rd84; + ex2.approx.ftz.f32 %r2709, %r2707; + mov.b64 {%r2710, %r2711}, %rd86; + mul.f32 %r2712, %r2703, %r2710; + mul.f32 %r2713, %r2709, %r2711; + mov.b64 %rd87, {%r1624, %r1627}; + cvt.u32.u64 %r2714, %rd87; + add.f32 %r2715, %r1624, 0fCB40007F; + neg.f32 %r2716, %r2715; + fma.rn.ftz.f32 %r2717, %r1032, %r377, %r2716; + fma.rn.ftz.f32 %r2718, %r1032, %r379, %r2717; + ex2.approx.ftz.f32 %r2719, %r2718; + add.f32 %r2720, %r1627, 0fCB40007F; + neg.f32 %r2721, %r2720; + fma.rn.ftz.f32 %r2722, %r1033, %r377, %r2721; + fma.rn.ftz.f32 %r2723, %r1033, %r379, %r2722; + shl.b64 %rd88, %rd87, 23; + and.b64 %rd89, %rd88, -36028797018963968; + shl.b32 %r2724, %r2714, 23; + cvt.u64.u32 %rd90, %r2724; + or.b64 %rd91, %rd90, %rd89; + ex2.approx.ftz.f32 %r2725, %r2723; + mov.b64 {%r2726, %r2727}, %rd91; + mul.f32 %r2728, %r2719, %r2726; + mul.f32 %r2729, %r2725, %r2727; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.f32 %r2730, %r2713, 0f00000000, %r2729; + fma.rn.f32 %r2731, %r2712, 0f00000000, %r2728; + selp.f32 %r2732, 0f3F800000, %r2731, %p244; + selp.f32 %r2733, 0f3F800000, %r2730, %p245; +$L__tmp8: + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r2734, %r2733, 0f00000000, %p8; + selp.f32 %r2735, %r2732, 0f00000000, %p8; +$L__tmp9: + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd92, {%r2632, %r2635}; + cvt.u32.u64 %r2736, %rd92; + add.f32 %r2737, %r2632, 0fCB40007F; + neg.f32 %r2738, %r2737; + fma.rn.ftz.f32 %r2739, %r2040, %r377, %r2738; + fma.rn.ftz.f32 %r2740, %r2040, %r379, %r2739; + ex2.approx.ftz.f32 %r2741, %r2740; + add.f32 %r2742, %r2635, 0fCB40007F; + neg.f32 %r2743, %r2742; + fma.rn.ftz.f32 %r2744, %r2041, %r377, %r2743; + fma.rn.ftz.f32 %r2745, %r2041, %r379, %r2744; + shl.b64 %rd93, %rd92, 23; + and.b64 %rd94, %rd93, -36028797018963968; + shl.b32 %r2746, %r2736, 23; + cvt.u64.u32 %rd95, %r2746; + or.b64 %rd96, %rd95, %rd94; + ex2.approx.ftz.f32 %r2747, %r2745; + mov.b64 {%r2748, %r2749}, %rd96; + mul.f32 %r2750, %r2747, %r2749; + mul.f32 %r2751, %r2741, %r2748; +$L__tmp10: + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.bf16x2 %p246|%p247, %r60, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 %rd97, {%r968, %r971}; + cvt.u32.u64 %r2752, %rd97; + add.f32 %r2753, %r968, 0fCB40007F; + neg.f32 %r2754, %r2753; + fma.rn.ftz.f32 %r2755, %r366, %r377, %r2754; + fma.rn.ftz.f32 %r2756, %r366, %r379, %r2755; + ex2.approx.ftz.f32 %r2757, %r2756; + add.f32 %r2758, %r971, 0fCB40007F; + neg.f32 %r2759, %r2758; + fma.rn.ftz.f32 %r2760, %r367, %r377, %r2759; + fma.rn.ftz.f32 %r2761, %r367, %r379, %r2760; + shl.b64 %rd98, %rd97, 23; + and.b64 %rd99, %rd98, -36028797018963968; + shl.b32 %r2762, %r2752, 23; + cvt.u64.u32 %rd100, %r2762; + or.b64 %rd101, %rd100, %rd99; + ex2.approx.ftz.f32 %r2763, %r2761; + mov.b64 {%r2764, %r2765}, %rd101; + mul.f32 %r2766, %r2757, %r2764; + mul.f32 %r2767, %r2763, %r2765; + mov.b64 %rd102, {%r1630, %r1633}; + cvt.u32.u64 %r2768, %rd102; + add.f32 %r2769, %r1630, 0fCB40007F; + neg.f32 %r2770, %r2769; + fma.rn.ftz.f32 %r2771, %r1034, %r377, %r2770; + fma.rn.ftz.f32 %r2772, %r1034, %r379, %r2771; + ex2.approx.ftz.f32 %r2773, %r2772; + add.f32 %r2774, %r1633, 0fCB40007F; + neg.f32 %r2775, %r2774; + fma.rn.ftz.f32 %r2776, %r1035, %r377, %r2775; + fma.rn.ftz.f32 %r2777, %r1035, %r379, %r2776; + shl.b64 %rd103, %rd102, 23; + and.b64 %rd104, %rd103, -36028797018963968; + shl.b32 %r2778, %r2768, 23; + cvt.u64.u32 %rd105, %r2778; + or.b64 %rd106, %rd105, %rd104; + ex2.approx.ftz.f32 %r2779, %r2777; + mov.b64 {%r2780, %r2781}, %rd106; + mul.f32 %r2782, %r2773, %r2780; + mul.f32 %r2783, %r2779, %r2781; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.f32 %r2784, %r2767, 0f00000000, %r2783; + fma.rn.f32 %r2785, %r2766, 0f00000000, %r2782; + selp.f32 %r2786, 0f3F800000, %r2785, %p246; + selp.f32 %r2787, 0f3F800000, %r2784, %p247; +$L__tmp11: + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r2788, %r2787, 0f00000000, %p8; + selp.f32 %r2789, %r2786, 0f00000000, %p8; +$L__tmp12: + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd107, {%r2638, %r2641}; + cvt.u32.u64 %r2790, %rd107; + add.f32 %r2791, %r2638, 0fCB40007F; + neg.f32 %r2792, %r2791; + fma.rn.ftz.f32 %r2793, %r2042, %r377, %r2792; + fma.rn.ftz.f32 %r2794, %r2042, %r379, %r2793; + ex2.approx.ftz.f32 %r2795, %r2794; + add.f32 %r2796, %r2641, 0fCB40007F; + neg.f32 %r2797, %r2796; + fma.rn.ftz.f32 %r2798, %r2043, %r377, %r2797; + fma.rn.ftz.f32 %r2799, %r2043, %r379, %r2798; + shl.b64 %rd108, %rd107, 23; + and.b64 %rd109, %rd108, -36028797018963968; + shl.b32 %r2800, %r2790, 23; + cvt.u64.u32 %rd110, %r2800; + or.b64 %rd111, %rd110, %rd109; + ex2.approx.ftz.f32 %r2801, %r2799; + mov.b64 {%r2802, %r2803}, %rd111; + mul.f32 %r2804, %r2801, %r2803; + mul.f32 %r2805, %r2795, %r2802; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.f32 %r2806, %r1635, %r2053, %r2642; + fma.rn.f32 %r2807, %r1639, %r2073, %r2806; + fma.rn.f32 %r2808, %r1641, %r2083, %r2807; + fma.rn.f32 %r2809, %r1643, %r2093, %r2808; + fma.rn.f32 %r2810, %r1645, %r2103, %r2809; + fma.rn.f32 %r2811, %r1647, %r2113, %r2810; + fma.rn.f32 %r2812, %r1649, %r2123, %r2811; + fma.rn.f32 %r2813, %r1651, %r2133, %r2812; + fma.rn.f32 %r2814, %r1653, %r2143, %r2813; + fma.rn.f32 %r2815, %r1655, %r2153, %r2814; + fma.rn.f32 %r2816, %r1657, %r2163, %r2815; + fma.rn.f32 %r2817, %r1659, %r2173, %r2816; + fma.rn.f32 %r2818, %r1661, %r2183, %r2817; + fma.rn.f32 %r2819, %r1663, %r2193, %r2818; + fma.rn.f32 %r2820, %r1665, %r2203, %r2819; + fma.rn.f32 %r2821, %r1667, %r2213, %r2820; + fma.rn.f32 %r2822, %r1669, %r2223, %r2821; + fma.rn.f32 %r2823, %r1671, %r2233, %r2822; + fma.rn.f32 %r2824, %r1673, %r2243, %r2823; + fma.rn.f32 %r2825, %r1675, %r2253, %r2824; + fma.rn.f32 %r2826, %r1677, %r2263, %r2825; + fma.rn.f32 %r2827, %r1679, %r2273, %r2826; + fma.rn.f32 %r2828, %r1681, %r2283, %r2827; + fma.rn.f32 %r2829, %r1683, %r2293, %r2828; + fma.rn.f32 %r2830, %r1685, %r2303, %r2829; + fma.rn.f32 %r2831, %r1687, %r2313, %r2830; + fma.rn.f32 %r2832, %r1689, %r2323, %r2831; + fma.rn.f32 %r2833, %r1691, %r2333, %r2832; + fma.rn.f32 %r2834, %r1693, %r2343, %r2833; + fma.rn.f32 %r2835, %r1695, %r2353, %r2834; + fma.rn.f32 %r2836, %r1697, %r2363, %r2835; + fma.rn.f32 %r2837, %r1699, %r2373, %r2836; + fma.rn.f32 %r2838, %r1701, %r2383, %r2837; + fma.rn.f32 %r2839, %r1703, %r2393, %r2838; + fma.rn.f32 %r2840, %r1705, %r2403, %r2839; + fma.rn.f32 %r2841, %r1707, %r2413, %r2840; + fma.rn.f32 %r2842, %r1709, %r2423, %r2841; + fma.rn.f32 %r2843, %r1711, %r2433, %r2842; + fma.rn.f32 %r2844, %r1713, %r2443, %r2843; + fma.rn.f32 %r2845, %r1715, %r2453, %r2844; + fma.rn.f32 %r2846, %r1717, %r2463, %r2845; + fma.rn.f32 %r2847, %r1719, %r2473, %r2846; + fma.rn.f32 %r2848, %r1721, %r2483, %r2847; + fma.rn.f32 %r2849, %r1723, %r2493, %r2848; + fma.rn.f32 %r2850, %r1725, %r2503, %r2849; + fma.rn.f32 %r2851, %r1727, %r2513, %r2850; + fma.rn.f32 %r2852, %r1729, %r2523, %r2851; + fma.rn.f32 %r2853, %r1731, %r2533, %r2852; + fma.rn.f32 %r2854, %r1733, %r2543, %r2853; + fma.rn.f32 %r2855, %r1735, %r2553, %r2854; + fma.rn.f32 %r2856, %r1737, %r2563, %r2855; + fma.rn.f32 %r2857, %r1739, %r2573, %r2856; + fma.rn.f32 %r2858, %r1741, %r2583, %r2857; + fma.rn.f32 %r2859, %r1743, %r2593, %r2858; + fma.rn.f32 %r2860, %r1745, %r2603, %r2859; + fma.rn.f32 %r2861, %r1758, %r2613, %r2860; + fma.rn.f32 %r2862, %r1759, %r2623, %r2861; + fma.rn.f32 %r2863, %r2681, %r2697, %r2862; + fma.rn.f32 %r2864, %r2680, %r2696, %r2863; + fma.rn.f32 %r2865, %r2735, %r2751, %r2864; + fma.rn.f32 %r2866, %r2734, %r2750, %r2865; + fma.rn.f32 %r2867, %r2789, %r2805, %r2866; + fma.rn.f32 %r2868, %r2788, %r2804, %r2867; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2869, %r2868, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2870, %r2868, %r2869; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2871, %r2870, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2872, %r2870, %r2871; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2873, %r2872, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2874, %r2872, %r2873; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2875, %r2874, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2876, %r2874, %r2875; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2877, %r2876, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r72, %r2876, %r2877; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p9 st.shared.b32 [ %r65 + 0 ], %r72; + // end inline asm + bar.sync 0; + // begin inline asm + @%p10 ld.shared.b32 %r73, [ %r68 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r2878, %r73, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2879, %r73, %r2878; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2880, %r2879, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2881, %r2879, %r2880; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2882, %r2881, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r2883, %r2881, %r2882; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r2884, %r2883, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r76, %r2883, %r2884; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p11 st.shared.b32 [ %r68 + 0 ], %r76; + // end inline asm + bar.sync 0; + ld.shared.b32 %r2885, [global_smem]; +$L__tmp13: + .loc 1 58 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:52 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r77, %r5; + mov.u32 %r78, %r5; + mov.u32 %r79, %r5; + mov.u32 %r80, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r77, %r78, %r79, %r80 }, [ %rd2 + 0 ], %rd27; + // end inline asm + mov.b32 {%rs66, %rs67}, %r77; + mov.b32 {%rs68, %rs69}, %r78; + mov.b32 {%rs70, %rs71}, %r79; + mov.b32 {%rs72, %rs73}, %r80; + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r85, %r5; + mov.u32 %r86, %r5; + mov.u32 %r87, %r5; + mov.u32 %r88, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r85, %r86, %r87, %r88 }, [ %rd5 + 0 ], %rd30; + // end inline asm + mov.b32 {%rs74, %rs75}, %r85; + mov.b32 {%rs76, %rs77}, %r86; + mov.b32 {%rs78, %rs79}, %r87; + mov.b32 {%rs80, %rs81}, %r88; + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r93, %r5; + mov.u32 %r94, %r5; + mov.u32 %r95, %r5; + mov.u32 %r96, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r93, %r94, %r95, %r96 }, [ %rd8 + 0 ], %rd33; + // end inline asm + mov.b32 {%rs82, %rs83}, %r93; + mov.b32 {%rs84, %rs85}, %r94; + mov.b32 {%rs86, %rs87}, %r95; + mov.b32 {%rs88, %rs89}, %r96; + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r101, %r5; + mov.u32 %r102, %r5; + mov.u32 %r103, %r5; + mov.u32 %r104, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r101, %r102, %r103, %r104 }, [ %rd11 + 0 ], %rd36; + // end inline asm + mov.b32 {%rs90, %rs91}, %r101; + mov.b32 {%rs92, %rs93}, %r102; + mov.b32 {%rs94, %rs95}, %r103; + mov.b32 {%rs96, %rs97}, %r104; + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r109, %r5; + mov.u32 %r110, %r5; + mov.u32 %r111, %r5; + mov.u32 %r112, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r109, %r110, %r111, %r112 }, [ %rd14 + 0 ], %rd39; + // end inline asm + mov.b32 {%rs98, %rs99}, %r109; + mov.b32 {%rs100, %rs101}, %r110; + mov.b32 {%rs102, %rs103}, %r111; + mov.b32 {%rs104, %rs105}, %r112; + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r117, %r5; + mov.u32 %r118, %r5; + mov.u32 %r119, %r5; + mov.u32 %r120, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r117, %r118, %r119, %r120 }, [ %rd17 + 0 ], %rd42; + // end inline asm + mov.b32 {%rs106, %rs107}, %r117; + mov.b32 {%rs108, %rs109}, %r118; + mov.b32 {%rs110, %rs111}, %r119; + mov.b32 {%rs112, %rs113}, %r120; + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r125, %r5; + mov.u32 %r126, %r5; + mov.u32 %r127, %r5; + mov.u32 %r128, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r125, %r126, %r127, %r128 }, [ %rd20 + 0 ], %rd45; + // end inline asm + mov.b32 {%rs114, %rs115}, %r125; + mov.b32 {%rs116, %rs117}, %r126; + mov.b32 {%rs118, %rs119}, %r127; + mov.b32 {%rs120, %rs121}, %r128; + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r133, %r5; + mov.u32 %r134, %r5; + mov.u32 %r135, %r5; + mov.u32 %r136, %r5; + @%p8 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd23 + 0 ], %rd48; + // end inline asm + mov.b32 {%rs122, %rs123}, %r133; + mov.b32 {%rs124, %rs125}, %r134; + mov.b32 {%rs126, %rs127}, %r135; + mov.b32 {%rs128, %rs129}, %r136; + .loc 1 58 106 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:106 + cvt.f32.bf16 %r2886, %rs66; + cvt.f32.bf16 %r2887, %rs67; + cvt.f32.bf16 %r2888, %rs68; + cvt.f32.bf16 %r2889, %rs69; + cvt.f32.bf16 %r2890, %rs70; + cvt.f32.bf16 %r2891, %rs71; + cvt.f32.bf16 %r2892, %rs72; + cvt.f32.bf16 %r2893, %rs73; + cvt.f32.bf16 %r2894, %rs74; + cvt.f32.bf16 %r2895, %rs75; + cvt.f32.bf16 %r2896, %rs76; + cvt.f32.bf16 %r2897, %rs77; + cvt.f32.bf16 %r2898, %rs78; + cvt.f32.bf16 %r2899, %rs79; + cvt.f32.bf16 %r2900, %rs80; + cvt.f32.bf16 %r2901, %rs81; + cvt.f32.bf16 %r2902, %rs82; + cvt.f32.bf16 %r2903, %rs83; + cvt.f32.bf16 %r2904, %rs84; + cvt.f32.bf16 %r2905, %rs85; + cvt.f32.bf16 %r2906, %rs86; + cvt.f32.bf16 %r2907, %rs87; + cvt.f32.bf16 %r2908, %rs88; + cvt.f32.bf16 %r2909, %rs89; + cvt.f32.bf16 %r2910, %rs90; + cvt.f32.bf16 %r2911, %rs91; + cvt.f32.bf16 %r2912, %rs92; + cvt.f32.bf16 %r2913, %rs93; + cvt.f32.bf16 %r2914, %rs94; + cvt.f32.bf16 %r2915, %rs95; + cvt.f32.bf16 %r2916, %rs96; + cvt.f32.bf16 %r2917, %rs97; + cvt.f32.bf16 %r2918, %rs98; + cvt.f32.bf16 %r2919, %rs99; + cvt.f32.bf16 %r2920, %rs100; + cvt.f32.bf16 %r2921, %rs101; + cvt.f32.bf16 %r2922, %rs102; + cvt.f32.bf16 %r2923, %rs103; + cvt.f32.bf16 %r2924, %rs104; + cvt.f32.bf16 %r2925, %rs105; + cvt.f32.bf16 %r2926, %rs106; + cvt.f32.bf16 %r2927, %rs107; + cvt.f32.bf16 %r2928, %rs108; + cvt.f32.bf16 %r2929, %rs109; + cvt.f32.bf16 %r2930, %rs110; + cvt.f32.bf16 %r2931, %rs111; + cvt.f32.bf16 %r2932, %rs112; + cvt.f32.bf16 %r2933, %rs113; + cvt.f32.bf16 %r2934, %rs114; + cvt.f32.bf16 %r2935, %rs115; + cvt.f32.bf16 %r2936, %rs116; + cvt.f32.bf16 %r2937, %rs117; + cvt.f32.bf16 %r2938, %rs118; + cvt.f32.bf16 %r2939, %rs119; + cvt.f32.bf16 %r2940, %rs120; + cvt.f32.bf16 %r2941, %rs121; + cvt.f32.bf16 %r2942, %rs122; + cvt.f32.bf16 %r2943, %rs123; + cvt.f32.bf16 %r2944, %rs124; + cvt.f32.bf16 %r2945, %rs125; + cvt.f32.bf16 %r2946, %rs126; + cvt.f32.bf16 %r2947, %rs127; + cvt.f32.bf16 %r2948, %rs128; + cvt.f32.bf16 %r2949, %rs129; + .loc 1 60 22 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:60:22 + sub.f32 %r2950, %r2886, %r1915; + sub.f32 %r2951, %r2887, %r1915; + sub.f32 %r2952, %r2888, %r1915; + sub.f32 %r2953, %r2889, %r1915; + sub.f32 %r2954, %r2890, %r1915; + sub.f32 %r2955, %r2891, %r1915; + sub.f32 %r2956, %r2892, %r1915; + sub.f32 %r2957, %r2893, %r1915; + sub.f32 %r2958, %r2894, %r1915; + sub.f32 %r2959, %r2895, %r1915; + sub.f32 %r2960, %r2896, %r1915; + sub.f32 %r2961, %r2897, %r1915; + sub.f32 %r2962, %r2898, %r1915; + sub.f32 %r2963, %r2899, %r1915; + sub.f32 %r2964, %r2900, %r1915; + sub.f32 %r2965, %r2901, %r1915; + sub.f32 %r2966, %r2902, %r1915; + sub.f32 %r2967, %r2903, %r1915; + sub.f32 %r2968, %r2904, %r1915; + sub.f32 %r2969, %r2905, %r1915; + sub.f32 %r2970, %r2906, %r1915; + sub.f32 %r2971, %r2907, %r1915; + sub.f32 %r2972, %r2908, %r1915; + sub.f32 %r2973, %r2909, %r1915; + sub.f32 %r2974, %r2910, %r1915; + sub.f32 %r2975, %r2911, %r1915; + sub.f32 %r2976, %r2912, %r1915; + sub.f32 %r2977, %r2913, %r1915; + sub.f32 %r2978, %r2914, %r1915; + sub.f32 %r2979, %r2915, %r1915; + sub.f32 %r2980, %r2916, %r1915; + sub.f32 %r2981, %r2917, %r1915; + sub.f32 %r2982, %r2918, %r1915; + sub.f32 %r2983, %r2919, %r1915; + sub.f32 %r2984, %r2920, %r1915; + sub.f32 %r2985, %r2921, %r1915; + sub.f32 %r2986, %r2922, %r1915; + sub.f32 %r2987, %r2923, %r1915; + sub.f32 %r2988, %r2924, %r1915; + sub.f32 %r2989, %r2925, %r1915; + sub.f32 %r2990, %r2926, %r1915; + sub.f32 %r2991, %r2927, %r1915; + sub.f32 %r2992, %r2928, %r1915; + sub.f32 %r2993, %r2929, %r1915; + sub.f32 %r2994, %r2930, %r1915; + sub.f32 %r2995, %r2931, %r1915; + sub.f32 %r2996, %r2932, %r1915; + sub.f32 %r2997, %r2933, %r1915; + sub.f32 %r2998, %r2934, %r1915; + sub.f32 %r2999, %r2935, %r1915; + sub.f32 %r3000, %r2936, %r1915; + sub.f32 %r3001, %r2937, %r1915; + sub.f32 %r3002, %r2938, %r1915; + sub.f32 %r3003, %r2939, %r1915; + sub.f32 %r3004, %r2940, %r1915; + sub.f32 %r3005, %r2941, %r1915; + sub.f32 %r3006, %r2942, %r1915; + sub.f32 %r3007, %r2943, %r1915; + sub.f32 %r3008, %r2944, %r1915; + sub.f32 %r3009, %r2945, %r1915; + sub.f32 %r3010, %r2946, %r1915; + sub.f32 %r3011, %r2947, %r1915; + sub.f32 %r3012, %r2948, %r1915; + sub.f32 %r3013, %r2949, %r1915; + .loc 1 61 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:61:29 + fma.rn.ftz.f32 %r3014, %r2950, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3015, %r3014; + fma.rm.ftz.f32 %r3016, %r3015, %r373, %r372; + add.f32 %r3017, %r3016, 0fCB40007F; + neg.f32 %r3018, %r3017; + fma.rn.ftz.f32 %r3019, %r2950, %r377, %r3018; + fma.rn.ftz.f32 %r3020, %r2950, %r379, %r3019; + shl.b32 %r3021, %r3016, 23; + ex2.approx.ftz.f32 %r3022, %r3020; + mul.f32 %r3023, %r3022, %r3021; + fma.rn.ftz.f32 %r3024, %r2951, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3025, %r3024; + fma.rm.ftz.f32 %r3026, %r3025, %r373, %r372; + add.f32 %r3027, %r3026, 0fCB40007F; + neg.f32 %r3028, %r3027; + fma.rn.ftz.f32 %r3029, %r2951, %r377, %r3028; + fma.rn.ftz.f32 %r3030, %r2951, %r379, %r3029; + shl.b32 %r3031, %r3026, 23; + ex2.approx.ftz.f32 %r3032, %r3030; + mul.f32 %r3033, %r3032, %r3031; + fma.rn.ftz.f32 %r3034, %r2952, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3035, %r3034; + fma.rm.ftz.f32 %r3036, %r3035, %r373, %r372; + add.f32 %r3037, %r3036, 0fCB40007F; + neg.f32 %r3038, %r3037; + fma.rn.ftz.f32 %r3039, %r2952, %r377, %r3038; + fma.rn.ftz.f32 %r3040, %r2952, %r379, %r3039; + shl.b32 %r3041, %r3036, 23; + ex2.approx.ftz.f32 %r3042, %r3040; + mul.f32 %r3043, %r3042, %r3041; + fma.rn.ftz.f32 %r3044, %r2953, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3045, %r3044; + fma.rm.ftz.f32 %r3046, %r3045, %r373, %r372; + add.f32 %r3047, %r3046, 0fCB40007F; + neg.f32 %r3048, %r3047; + fma.rn.ftz.f32 %r3049, %r2953, %r377, %r3048; + fma.rn.ftz.f32 %r3050, %r2953, %r379, %r3049; + shl.b32 %r3051, %r3046, 23; + ex2.approx.ftz.f32 %r3052, %r3050; + mul.f32 %r3053, %r3052, %r3051; + fma.rn.ftz.f32 %r3054, %r2954, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3055, %r3054; + fma.rm.ftz.f32 %r3056, %r3055, %r373, %r372; + add.f32 %r3057, %r3056, 0fCB40007F; + neg.f32 %r3058, %r3057; + fma.rn.ftz.f32 %r3059, %r2954, %r377, %r3058; + fma.rn.ftz.f32 %r3060, %r2954, %r379, %r3059; + shl.b32 %r3061, %r3056, 23; + ex2.approx.ftz.f32 %r3062, %r3060; + mul.f32 %r3063, %r3062, %r3061; + fma.rn.ftz.f32 %r3064, %r2955, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3065, %r3064; + fma.rm.ftz.f32 %r3066, %r3065, %r373, %r372; + add.f32 %r3067, %r3066, 0fCB40007F; + neg.f32 %r3068, %r3067; + fma.rn.ftz.f32 %r3069, %r2955, %r377, %r3068; + fma.rn.ftz.f32 %r3070, %r2955, %r379, %r3069; + shl.b32 %r3071, %r3066, 23; + ex2.approx.ftz.f32 %r3072, %r3070; + mul.f32 %r3073, %r3072, %r3071; + fma.rn.ftz.f32 %r3074, %r2956, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3075, %r3074; + fma.rm.ftz.f32 %r3076, %r3075, %r373, %r372; + add.f32 %r3077, %r3076, 0fCB40007F; + neg.f32 %r3078, %r3077; + fma.rn.ftz.f32 %r3079, %r2956, %r377, %r3078; + fma.rn.ftz.f32 %r3080, %r2956, %r379, %r3079; + shl.b32 %r3081, %r3076, 23; + ex2.approx.ftz.f32 %r3082, %r3080; + mul.f32 %r3083, %r3082, %r3081; + fma.rn.ftz.f32 %r3084, %r2957, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3085, %r3084; + fma.rm.ftz.f32 %r3086, %r3085, %r373, %r372; + add.f32 %r3087, %r3086, 0fCB40007F; + neg.f32 %r3088, %r3087; + fma.rn.ftz.f32 %r3089, %r2957, %r377, %r3088; + fma.rn.ftz.f32 %r3090, %r2957, %r379, %r3089; + shl.b32 %r3091, %r3086, 23; + ex2.approx.ftz.f32 %r3092, %r3090; + mul.f32 %r3093, %r3092, %r3091; + fma.rn.ftz.f32 %r3094, %r2958, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3095, %r3094; + fma.rm.ftz.f32 %r3096, %r3095, %r373, %r372; + add.f32 %r3097, %r3096, 0fCB40007F; + neg.f32 %r3098, %r3097; + fma.rn.ftz.f32 %r3099, %r2958, %r377, %r3098; + fma.rn.ftz.f32 %r3100, %r2958, %r379, %r3099; + shl.b32 %r3101, %r3096, 23; + ex2.approx.ftz.f32 %r3102, %r3100; + mul.f32 %r3103, %r3102, %r3101; + fma.rn.ftz.f32 %r3104, %r2959, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3105, %r3104; + fma.rm.ftz.f32 %r3106, %r3105, %r373, %r372; + add.f32 %r3107, %r3106, 0fCB40007F; + neg.f32 %r3108, %r3107; + fma.rn.ftz.f32 %r3109, %r2959, %r377, %r3108; + fma.rn.ftz.f32 %r3110, %r2959, %r379, %r3109; + shl.b32 %r3111, %r3106, 23; + ex2.approx.ftz.f32 %r3112, %r3110; + mul.f32 %r3113, %r3112, %r3111; + fma.rn.ftz.f32 %r3114, %r2960, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3115, %r3114; + fma.rm.ftz.f32 %r3116, %r3115, %r373, %r372; + add.f32 %r3117, %r3116, 0fCB40007F; + neg.f32 %r3118, %r3117; + fma.rn.ftz.f32 %r3119, %r2960, %r377, %r3118; + fma.rn.ftz.f32 %r3120, %r2960, %r379, %r3119; + shl.b32 %r3121, %r3116, 23; + ex2.approx.ftz.f32 %r3122, %r3120; + mul.f32 %r3123, %r3122, %r3121; + fma.rn.ftz.f32 %r3124, %r2961, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3125, %r3124; + fma.rm.ftz.f32 %r3126, %r3125, %r373, %r372; + add.f32 %r3127, %r3126, 0fCB40007F; + neg.f32 %r3128, %r3127; + fma.rn.ftz.f32 %r3129, %r2961, %r377, %r3128; + fma.rn.ftz.f32 %r3130, %r2961, %r379, %r3129; + shl.b32 %r3131, %r3126, 23; + ex2.approx.ftz.f32 %r3132, %r3130; + mul.f32 %r3133, %r3132, %r3131; + fma.rn.ftz.f32 %r3134, %r2962, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3135, %r3134; + fma.rm.ftz.f32 %r3136, %r3135, %r373, %r372; + add.f32 %r3137, %r3136, 0fCB40007F; + neg.f32 %r3138, %r3137; + fma.rn.ftz.f32 %r3139, %r2962, %r377, %r3138; + fma.rn.ftz.f32 %r3140, %r2962, %r379, %r3139; + shl.b32 %r3141, %r3136, 23; + ex2.approx.ftz.f32 %r3142, %r3140; + mul.f32 %r3143, %r3142, %r3141; + fma.rn.ftz.f32 %r3144, %r2963, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3145, %r3144; + fma.rm.ftz.f32 %r3146, %r3145, %r373, %r372; + add.f32 %r3147, %r3146, 0fCB40007F; + neg.f32 %r3148, %r3147; + fma.rn.ftz.f32 %r3149, %r2963, %r377, %r3148; + fma.rn.ftz.f32 %r3150, %r2963, %r379, %r3149; + shl.b32 %r3151, %r3146, 23; + ex2.approx.ftz.f32 %r3152, %r3150; + mul.f32 %r3153, %r3152, %r3151; + fma.rn.ftz.f32 %r3154, %r2964, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3155, %r3154; + fma.rm.ftz.f32 %r3156, %r3155, %r373, %r372; + add.f32 %r3157, %r3156, 0fCB40007F; + neg.f32 %r3158, %r3157; + fma.rn.ftz.f32 %r3159, %r2964, %r377, %r3158; + fma.rn.ftz.f32 %r3160, %r2964, %r379, %r3159; + shl.b32 %r3161, %r3156, 23; + ex2.approx.ftz.f32 %r3162, %r3160; + mul.f32 %r3163, %r3162, %r3161; + fma.rn.ftz.f32 %r3164, %r2965, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3165, %r3164; + fma.rm.ftz.f32 %r3166, %r3165, %r373, %r372; + add.f32 %r3167, %r3166, 0fCB40007F; + neg.f32 %r3168, %r3167; + fma.rn.ftz.f32 %r3169, %r2965, %r377, %r3168; + fma.rn.ftz.f32 %r3170, %r2965, %r379, %r3169; + shl.b32 %r3171, %r3166, 23; + ex2.approx.ftz.f32 %r3172, %r3170; + mul.f32 %r3173, %r3172, %r3171; + fma.rn.ftz.f32 %r3174, %r2966, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3175, %r3174; + fma.rm.ftz.f32 %r3176, %r3175, %r373, %r372; + add.f32 %r3177, %r3176, 0fCB40007F; + neg.f32 %r3178, %r3177; + fma.rn.ftz.f32 %r3179, %r2966, %r377, %r3178; + fma.rn.ftz.f32 %r3180, %r2966, %r379, %r3179; + shl.b32 %r3181, %r3176, 23; + ex2.approx.ftz.f32 %r3182, %r3180; + mul.f32 %r3183, %r3182, %r3181; + fma.rn.ftz.f32 %r3184, %r2967, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3185, %r3184; + fma.rm.ftz.f32 %r3186, %r3185, %r373, %r372; + add.f32 %r3187, %r3186, 0fCB40007F; + neg.f32 %r3188, %r3187; + fma.rn.ftz.f32 %r3189, %r2967, %r377, %r3188; + fma.rn.ftz.f32 %r3190, %r2967, %r379, %r3189; + shl.b32 %r3191, %r3186, 23; + ex2.approx.ftz.f32 %r3192, %r3190; + mul.f32 %r3193, %r3192, %r3191; + fma.rn.ftz.f32 %r3194, %r2968, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3195, %r3194; + fma.rm.ftz.f32 %r3196, %r3195, %r373, %r372; + add.f32 %r3197, %r3196, 0fCB40007F; + neg.f32 %r3198, %r3197; + fma.rn.ftz.f32 %r3199, %r2968, %r377, %r3198; + fma.rn.ftz.f32 %r3200, %r2968, %r379, %r3199; + shl.b32 %r3201, %r3196, 23; + ex2.approx.ftz.f32 %r3202, %r3200; + mul.f32 %r3203, %r3202, %r3201; + fma.rn.ftz.f32 %r3204, %r2969, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3205, %r3204; + fma.rm.ftz.f32 %r3206, %r3205, %r373, %r372; + add.f32 %r3207, %r3206, 0fCB40007F; + neg.f32 %r3208, %r3207; + fma.rn.ftz.f32 %r3209, %r2969, %r377, %r3208; + fma.rn.ftz.f32 %r3210, %r2969, %r379, %r3209; + shl.b32 %r3211, %r3206, 23; + ex2.approx.ftz.f32 %r3212, %r3210; + mul.f32 %r3213, %r3212, %r3211; + fma.rn.ftz.f32 %r3214, %r2970, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3215, %r3214; + fma.rm.ftz.f32 %r3216, %r3215, %r373, %r372; + add.f32 %r3217, %r3216, 0fCB40007F; + neg.f32 %r3218, %r3217; + fma.rn.ftz.f32 %r3219, %r2970, %r377, %r3218; + fma.rn.ftz.f32 %r3220, %r2970, %r379, %r3219; + shl.b32 %r3221, %r3216, 23; + ex2.approx.ftz.f32 %r3222, %r3220; + mul.f32 %r3223, %r3222, %r3221; + fma.rn.ftz.f32 %r3224, %r2971, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3225, %r3224; + fma.rm.ftz.f32 %r3226, %r3225, %r373, %r372; + add.f32 %r3227, %r3226, 0fCB40007F; + neg.f32 %r3228, %r3227; + fma.rn.ftz.f32 %r3229, %r2971, %r377, %r3228; + fma.rn.ftz.f32 %r3230, %r2971, %r379, %r3229; + shl.b32 %r3231, %r3226, 23; + ex2.approx.ftz.f32 %r3232, %r3230; + mul.f32 %r3233, %r3232, %r3231; + fma.rn.ftz.f32 %r3234, %r2972, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3235, %r3234; + fma.rm.ftz.f32 %r3236, %r3235, %r373, %r372; + add.f32 %r3237, %r3236, 0fCB40007F; + neg.f32 %r3238, %r3237; + fma.rn.ftz.f32 %r3239, %r2972, %r377, %r3238; + fma.rn.ftz.f32 %r3240, %r2972, %r379, %r3239; + shl.b32 %r3241, %r3236, 23; + ex2.approx.ftz.f32 %r3242, %r3240; + mul.f32 %r3243, %r3242, %r3241; + fma.rn.ftz.f32 %r3244, %r2973, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3245, %r3244; + fma.rm.ftz.f32 %r3246, %r3245, %r373, %r372; + add.f32 %r3247, %r3246, 0fCB40007F; + neg.f32 %r3248, %r3247; + fma.rn.ftz.f32 %r3249, %r2973, %r377, %r3248; + fma.rn.ftz.f32 %r3250, %r2973, %r379, %r3249; + shl.b32 %r3251, %r3246, 23; + ex2.approx.ftz.f32 %r3252, %r3250; + mul.f32 %r3253, %r3252, %r3251; + fma.rn.ftz.f32 %r3254, %r2974, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3255, %r3254; + fma.rm.ftz.f32 %r3256, %r3255, %r373, %r372; + add.f32 %r3257, %r3256, 0fCB40007F; + neg.f32 %r3258, %r3257; + fma.rn.ftz.f32 %r3259, %r2974, %r377, %r3258; + fma.rn.ftz.f32 %r3260, %r2974, %r379, %r3259; + shl.b32 %r3261, %r3256, 23; + ex2.approx.ftz.f32 %r3262, %r3260; + mul.f32 %r3263, %r3262, %r3261; + fma.rn.ftz.f32 %r3264, %r2975, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3265, %r3264; + fma.rm.ftz.f32 %r3266, %r3265, %r373, %r372; + add.f32 %r3267, %r3266, 0fCB40007F; + neg.f32 %r3268, %r3267; + fma.rn.ftz.f32 %r3269, %r2975, %r377, %r3268; + fma.rn.ftz.f32 %r3270, %r2975, %r379, %r3269; + shl.b32 %r3271, %r3266, 23; + ex2.approx.ftz.f32 %r3272, %r3270; + mul.f32 %r3273, %r3272, %r3271; + fma.rn.ftz.f32 %r3274, %r2976, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3275, %r3274; + fma.rm.ftz.f32 %r3276, %r3275, %r373, %r372; + add.f32 %r3277, %r3276, 0fCB40007F; + neg.f32 %r3278, %r3277; + fma.rn.ftz.f32 %r3279, %r2976, %r377, %r3278; + fma.rn.ftz.f32 %r3280, %r2976, %r379, %r3279; + shl.b32 %r3281, %r3276, 23; + ex2.approx.ftz.f32 %r3282, %r3280; + mul.f32 %r3283, %r3282, %r3281; + fma.rn.ftz.f32 %r3284, %r2977, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3285, %r3284; + fma.rm.ftz.f32 %r3286, %r3285, %r373, %r372; + add.f32 %r3287, %r3286, 0fCB40007F; + neg.f32 %r3288, %r3287; + fma.rn.ftz.f32 %r3289, %r2977, %r377, %r3288; + fma.rn.ftz.f32 %r3290, %r2977, %r379, %r3289; + shl.b32 %r3291, %r3286, 23; + ex2.approx.ftz.f32 %r3292, %r3290; + mul.f32 %r3293, %r3292, %r3291; + fma.rn.ftz.f32 %r3294, %r2978, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3295, %r3294; + fma.rm.ftz.f32 %r3296, %r3295, %r373, %r372; + add.f32 %r3297, %r3296, 0fCB40007F; + neg.f32 %r3298, %r3297; + fma.rn.ftz.f32 %r3299, %r2978, %r377, %r3298; + fma.rn.ftz.f32 %r3300, %r2978, %r379, %r3299; + shl.b32 %r3301, %r3296, 23; + ex2.approx.ftz.f32 %r3302, %r3300; + mul.f32 %r3303, %r3302, %r3301; + fma.rn.ftz.f32 %r3304, %r2979, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3305, %r3304; + fma.rm.ftz.f32 %r3306, %r3305, %r373, %r372; + add.f32 %r3307, %r3306, 0fCB40007F; + neg.f32 %r3308, %r3307; + fma.rn.ftz.f32 %r3309, %r2979, %r377, %r3308; + fma.rn.ftz.f32 %r3310, %r2979, %r379, %r3309; + shl.b32 %r3311, %r3306, 23; + ex2.approx.ftz.f32 %r3312, %r3310; + mul.f32 %r3313, %r3312, %r3311; + fma.rn.ftz.f32 %r3314, %r2980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3315, %r3314; + fma.rm.ftz.f32 %r3316, %r3315, %r373, %r372; + add.f32 %r3317, %r3316, 0fCB40007F; + neg.f32 %r3318, %r3317; + fma.rn.ftz.f32 %r3319, %r2980, %r377, %r3318; + fma.rn.ftz.f32 %r3320, %r2980, %r379, %r3319; + shl.b32 %r3321, %r3316, 23; + ex2.approx.ftz.f32 %r3322, %r3320; + mul.f32 %r3323, %r3322, %r3321; + fma.rn.ftz.f32 %r3324, %r2981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3325, %r3324; + fma.rm.ftz.f32 %r3326, %r3325, %r373, %r372; + add.f32 %r3327, %r3326, 0fCB40007F; + neg.f32 %r3328, %r3327; + fma.rn.ftz.f32 %r3329, %r2981, %r377, %r3328; + fma.rn.ftz.f32 %r3330, %r2981, %r379, %r3329; + shl.b32 %r3331, %r3326, 23; + ex2.approx.ftz.f32 %r3332, %r3330; + mul.f32 %r3333, %r3332, %r3331; + fma.rn.ftz.f32 %r3334, %r2982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3335, %r3334; + fma.rm.ftz.f32 %r3336, %r3335, %r373, %r372; + add.f32 %r3337, %r3336, 0fCB40007F; + neg.f32 %r3338, %r3337; + fma.rn.ftz.f32 %r3339, %r2982, %r377, %r3338; + fma.rn.ftz.f32 %r3340, %r2982, %r379, %r3339; + shl.b32 %r3341, %r3336, 23; + ex2.approx.ftz.f32 %r3342, %r3340; + mul.f32 %r3343, %r3342, %r3341; + fma.rn.ftz.f32 %r3344, %r2983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3345, %r3344; + fma.rm.ftz.f32 %r3346, %r3345, %r373, %r372; + add.f32 %r3347, %r3346, 0fCB40007F; + neg.f32 %r3348, %r3347; + fma.rn.ftz.f32 %r3349, %r2983, %r377, %r3348; + fma.rn.ftz.f32 %r3350, %r2983, %r379, %r3349; + shl.b32 %r3351, %r3346, 23; + ex2.approx.ftz.f32 %r3352, %r3350; + mul.f32 %r3353, %r3352, %r3351; + fma.rn.ftz.f32 %r3354, %r2984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3355, %r3354; + fma.rm.ftz.f32 %r3356, %r3355, %r373, %r372; + add.f32 %r3357, %r3356, 0fCB40007F; + neg.f32 %r3358, %r3357; + fma.rn.ftz.f32 %r3359, %r2984, %r377, %r3358; + fma.rn.ftz.f32 %r3360, %r2984, %r379, %r3359; + shl.b32 %r3361, %r3356, 23; + ex2.approx.ftz.f32 %r3362, %r3360; + mul.f32 %r3363, %r3362, %r3361; + fma.rn.ftz.f32 %r3364, %r2985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3365, %r3364; + fma.rm.ftz.f32 %r3366, %r3365, %r373, %r372; + add.f32 %r3367, %r3366, 0fCB40007F; + neg.f32 %r3368, %r3367; + fma.rn.ftz.f32 %r3369, %r2985, %r377, %r3368; + fma.rn.ftz.f32 %r3370, %r2985, %r379, %r3369; + shl.b32 %r3371, %r3366, 23; + ex2.approx.ftz.f32 %r3372, %r3370; + mul.f32 %r3373, %r3372, %r3371; + fma.rn.ftz.f32 %r3374, %r2986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3375, %r3374; + fma.rm.ftz.f32 %r3376, %r3375, %r373, %r372; + add.f32 %r3377, %r3376, 0fCB40007F; + neg.f32 %r3378, %r3377; + fma.rn.ftz.f32 %r3379, %r2986, %r377, %r3378; + fma.rn.ftz.f32 %r3380, %r2986, %r379, %r3379; + shl.b32 %r3381, %r3376, 23; + ex2.approx.ftz.f32 %r3382, %r3380; + mul.f32 %r3383, %r3382, %r3381; + fma.rn.ftz.f32 %r3384, %r2987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3385, %r3384; + fma.rm.ftz.f32 %r3386, %r3385, %r373, %r372; + add.f32 %r3387, %r3386, 0fCB40007F; + neg.f32 %r3388, %r3387; + fma.rn.ftz.f32 %r3389, %r2987, %r377, %r3388; + fma.rn.ftz.f32 %r3390, %r2987, %r379, %r3389; + shl.b32 %r3391, %r3386, 23; + ex2.approx.ftz.f32 %r3392, %r3390; + mul.f32 %r3393, %r3392, %r3391; + fma.rn.ftz.f32 %r3394, %r2988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3395, %r3394; + fma.rm.ftz.f32 %r3396, %r3395, %r373, %r372; + add.f32 %r3397, %r3396, 0fCB40007F; + neg.f32 %r3398, %r3397; + fma.rn.ftz.f32 %r3399, %r2988, %r377, %r3398; + fma.rn.ftz.f32 %r3400, %r2988, %r379, %r3399; + shl.b32 %r3401, %r3396, 23; + ex2.approx.ftz.f32 %r3402, %r3400; + mul.f32 %r3403, %r3402, %r3401; + fma.rn.ftz.f32 %r3404, %r2989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3405, %r3404; + fma.rm.ftz.f32 %r3406, %r3405, %r373, %r372; + add.f32 %r3407, %r3406, 0fCB40007F; + neg.f32 %r3408, %r3407; + fma.rn.ftz.f32 %r3409, %r2989, %r377, %r3408; + fma.rn.ftz.f32 %r3410, %r2989, %r379, %r3409; + shl.b32 %r3411, %r3406, 23; + ex2.approx.ftz.f32 %r3412, %r3410; + mul.f32 %r3413, %r3412, %r3411; + fma.rn.ftz.f32 %r3414, %r2990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3415, %r3414; + fma.rm.ftz.f32 %r3416, %r3415, %r373, %r372; + add.f32 %r3417, %r3416, 0fCB40007F; + neg.f32 %r3418, %r3417; + fma.rn.ftz.f32 %r3419, %r2990, %r377, %r3418; + fma.rn.ftz.f32 %r3420, %r2990, %r379, %r3419; + shl.b32 %r3421, %r3416, 23; + ex2.approx.ftz.f32 %r3422, %r3420; + mul.f32 %r3423, %r3422, %r3421; + fma.rn.ftz.f32 %r3424, %r2991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3425, %r3424; + fma.rm.ftz.f32 %r3426, %r3425, %r373, %r372; + add.f32 %r3427, %r3426, 0fCB40007F; + neg.f32 %r3428, %r3427; + fma.rn.ftz.f32 %r3429, %r2991, %r377, %r3428; + fma.rn.ftz.f32 %r3430, %r2991, %r379, %r3429; + shl.b32 %r3431, %r3426, 23; + ex2.approx.ftz.f32 %r3432, %r3430; + mul.f32 %r3433, %r3432, %r3431; + fma.rn.ftz.f32 %r3434, %r2992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3435, %r3434; + fma.rm.ftz.f32 %r3436, %r3435, %r373, %r372; + add.f32 %r3437, %r3436, 0fCB40007F; + neg.f32 %r3438, %r3437; + fma.rn.ftz.f32 %r3439, %r2992, %r377, %r3438; + fma.rn.ftz.f32 %r3440, %r2992, %r379, %r3439; + shl.b32 %r3441, %r3436, 23; + ex2.approx.ftz.f32 %r3442, %r3440; + mul.f32 %r3443, %r3442, %r3441; + fma.rn.ftz.f32 %r3444, %r2993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3445, %r3444; + fma.rm.ftz.f32 %r3446, %r3445, %r373, %r372; + add.f32 %r3447, %r3446, 0fCB40007F; + neg.f32 %r3448, %r3447; + fma.rn.ftz.f32 %r3449, %r2993, %r377, %r3448; + fma.rn.ftz.f32 %r3450, %r2993, %r379, %r3449; + shl.b32 %r3451, %r3446, 23; + ex2.approx.ftz.f32 %r3452, %r3450; + mul.f32 %r3453, %r3452, %r3451; + fma.rn.ftz.f32 %r3454, %r2994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3455, %r3454; + fma.rm.ftz.f32 %r3456, %r3455, %r373, %r372; + add.f32 %r3457, %r3456, 0fCB40007F; + neg.f32 %r3458, %r3457; + fma.rn.ftz.f32 %r3459, %r2994, %r377, %r3458; + fma.rn.ftz.f32 %r3460, %r2994, %r379, %r3459; + shl.b32 %r3461, %r3456, 23; + ex2.approx.ftz.f32 %r3462, %r3460; + mul.f32 %r3463, %r3462, %r3461; + fma.rn.ftz.f32 %r3464, %r2995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3465, %r3464; + fma.rm.ftz.f32 %r3466, %r3465, %r373, %r372; + add.f32 %r3467, %r3466, 0fCB40007F; + neg.f32 %r3468, %r3467; + fma.rn.ftz.f32 %r3469, %r2995, %r377, %r3468; + fma.rn.ftz.f32 %r3470, %r2995, %r379, %r3469; + shl.b32 %r3471, %r3466, 23; + ex2.approx.ftz.f32 %r3472, %r3470; + mul.f32 %r3473, %r3472, %r3471; + fma.rn.ftz.f32 %r3474, %r2996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3475, %r3474; + fma.rm.ftz.f32 %r3476, %r3475, %r373, %r372; + add.f32 %r3477, %r3476, 0fCB40007F; + neg.f32 %r3478, %r3477; + fma.rn.ftz.f32 %r3479, %r2996, %r377, %r3478; + fma.rn.ftz.f32 %r3480, %r2996, %r379, %r3479; + shl.b32 %r3481, %r3476, 23; + ex2.approx.ftz.f32 %r3482, %r3480; + mul.f32 %r3483, %r3482, %r3481; + fma.rn.ftz.f32 %r3484, %r2997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3485, %r3484; + fma.rm.ftz.f32 %r3486, %r3485, %r373, %r372; + add.f32 %r3487, %r3486, 0fCB40007F; + neg.f32 %r3488, %r3487; + fma.rn.ftz.f32 %r3489, %r2997, %r377, %r3488; + fma.rn.ftz.f32 %r3490, %r2997, %r379, %r3489; + shl.b32 %r3491, %r3486, 23; + ex2.approx.ftz.f32 %r3492, %r3490; + mul.f32 %r3493, %r3492, %r3491; + fma.rn.ftz.f32 %r3494, %r2998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3495, %r3494; + fma.rm.ftz.f32 %r3496, %r3495, %r373, %r372; + add.f32 %r3497, %r3496, 0fCB40007F; + neg.f32 %r3498, %r3497; + fma.rn.ftz.f32 %r3499, %r2998, %r377, %r3498; + fma.rn.ftz.f32 %r3500, %r2998, %r379, %r3499; + shl.b32 %r3501, %r3496, 23; + ex2.approx.ftz.f32 %r3502, %r3500; + mul.f32 %r3503, %r3502, %r3501; + fma.rn.ftz.f32 %r3504, %r2999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3505, %r3504; + fma.rm.ftz.f32 %r3506, %r3505, %r373, %r372; + add.f32 %r3507, %r3506, 0fCB40007F; + neg.f32 %r3508, %r3507; + fma.rn.ftz.f32 %r3509, %r2999, %r377, %r3508; + fma.rn.ftz.f32 %r3510, %r2999, %r379, %r3509; + shl.b32 %r3511, %r3506, 23; + ex2.approx.ftz.f32 %r3512, %r3510; + mul.f32 %r3513, %r3512, %r3511; + fma.rn.ftz.f32 %r3514, %r3000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3515, %r3514; + fma.rm.ftz.f32 %r3516, %r3515, %r373, %r372; + add.f32 %r3517, %r3516, 0fCB40007F; + neg.f32 %r3518, %r3517; + fma.rn.ftz.f32 %r3519, %r3000, %r377, %r3518; + fma.rn.ftz.f32 %r3520, %r3000, %r379, %r3519; + shl.b32 %r3521, %r3516, 23; + ex2.approx.ftz.f32 %r3522, %r3520; + mul.f32 %r3523, %r3522, %r3521; + fma.rn.ftz.f32 %r3524, %r3001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3525, %r3524; + fma.rm.ftz.f32 %r3526, %r3525, %r373, %r372; + add.f32 %r3527, %r3526, 0fCB40007F; + neg.f32 %r3528, %r3527; + fma.rn.ftz.f32 %r3529, %r3001, %r377, %r3528; + fma.rn.ftz.f32 %r3530, %r3001, %r379, %r3529; + shl.b32 %r3531, %r3526, 23; + ex2.approx.ftz.f32 %r3532, %r3530; + mul.f32 %r3533, %r3532, %r3531; + fma.rn.ftz.f32 %r3534, %r3002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3535, %r3534; + fma.rm.ftz.f32 %r3536, %r3535, %r373, %r372; + add.f32 %r3537, %r3536, 0fCB40007F; + neg.f32 %r3538, %r3537; + fma.rn.ftz.f32 %r3539, %r3002, %r377, %r3538; + fma.rn.ftz.f32 %r3540, %r3002, %r379, %r3539; + shl.b32 %r3541, %r3536, 23; + ex2.approx.ftz.f32 %r3542, %r3540; + mul.f32 %r3543, %r3542, %r3541; + fma.rn.ftz.f32 %r3544, %r3003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3545, %r3544; + fma.rm.ftz.f32 %r3546, %r3545, %r373, %r372; + add.f32 %r3547, %r3546, 0fCB40007F; + neg.f32 %r3548, %r3547; + fma.rn.ftz.f32 %r3549, %r3003, %r377, %r3548; + fma.rn.ftz.f32 %r3550, %r3003, %r379, %r3549; + shl.b32 %r3551, %r3546, 23; + ex2.approx.ftz.f32 %r3552, %r3550; + mul.f32 %r3553, %r3552, %r3551; + fma.rn.ftz.f32 %r3554, %r3004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3555, %r3554; + fma.rm.ftz.f32 %r3556, %r3555, %r373, %r372; + add.f32 %r3557, %r3556, 0fCB40007F; + neg.f32 %r3558, %r3557; + fma.rn.ftz.f32 %r3559, %r3004, %r377, %r3558; + fma.rn.ftz.f32 %r3560, %r3004, %r379, %r3559; + shl.b32 %r3561, %r3556, 23; + ex2.approx.ftz.f32 %r3562, %r3560; + mul.f32 %r3563, %r3562, %r3561; + fma.rn.ftz.f32 %r3564, %r3005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3565, %r3564; + fma.rm.ftz.f32 %r3566, %r3565, %r373, %r372; + add.f32 %r3567, %r3566, 0fCB40007F; + neg.f32 %r3568, %r3567; + fma.rn.ftz.f32 %r3569, %r3005, %r377, %r3568; + fma.rn.ftz.f32 %r3570, %r3005, %r379, %r3569; + shl.b32 %r3571, %r3566, 23; + ex2.approx.ftz.f32 %r3572, %r3570; + mul.f32 %r3573, %r3572, %r3571; + fma.rn.ftz.f32 %r3574, %r3006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3575, %r3574; + fma.rm.ftz.f32 %r3576, %r3575, %r373, %r372; + add.f32 %r3577, %r3576, 0fCB40007F; + neg.f32 %r3578, %r3577; + fma.rn.ftz.f32 %r3579, %r3006, %r377, %r3578; + fma.rn.ftz.f32 %r3580, %r3006, %r379, %r3579; + shl.b32 %r3581, %r3576, 23; + ex2.approx.ftz.f32 %r3582, %r3580; + mul.f32 %r3583, %r3582, %r3581; + fma.rn.ftz.f32 %r3584, %r3007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3585, %r3584; + fma.rm.ftz.f32 %r3586, %r3585, %r373, %r372; + add.f32 %r3587, %r3586, 0fCB40007F; + neg.f32 %r3588, %r3587; + fma.rn.ftz.f32 %r3589, %r3007, %r377, %r3588; + fma.rn.ftz.f32 %r3590, %r3007, %r379, %r3589; + shl.b32 %r3591, %r3586, 23; + ex2.approx.ftz.f32 %r3592, %r3590; + mul.f32 %r3593, %r3592, %r3591; + fma.rn.ftz.f32 %r3594, %r3008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3595, %r3594; + fma.rm.ftz.f32 %r3596, %r3595, %r373, %r372; + add.f32 %r3597, %r3596, 0fCB40007F; + neg.f32 %r3598, %r3597; + fma.rn.ftz.f32 %r3599, %r3008, %r377, %r3598; + fma.rn.ftz.f32 %r3600, %r3008, %r379, %r3599; + shl.b32 %r3601, %r3596, 23; + ex2.approx.ftz.f32 %r3602, %r3600; + mul.f32 %r3603, %r3602, %r3601; + fma.rn.ftz.f32 %r3604, %r3009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3605, %r3604; + fma.rm.ftz.f32 %r3606, %r3605, %r373, %r372; + add.f32 %r3607, %r3606, 0fCB40007F; + neg.f32 %r3608, %r3607; + fma.rn.ftz.f32 %r3609, %r3009, %r377, %r3608; + fma.rn.ftz.f32 %r3610, %r3009, %r379, %r3609; + shl.b32 %r3611, %r3606, 23; + ex2.approx.ftz.f32 %r3612, %r3610; + mul.f32 %r3613, %r3612, %r3611; + fma.rn.ftz.f32 %r3614, %r3010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3615, %r3614; + fma.rm.ftz.f32 %r3616, %r3615, %r373, %r372; + add.f32 %r3617, %r3616, 0fCB40007F; + neg.f32 %r3618, %r3617; + fma.rn.ftz.f32 %r3619, %r3010, %r377, %r3618; + fma.rn.ftz.f32 %r3620, %r3010, %r379, %r3619; + shl.b32 %r3621, %r3616, 23; + ex2.approx.ftz.f32 %r3622, %r3620; + mul.f32 %r3623, %r3622, %r3621; + fma.rn.ftz.f32 %r3624, %r3011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3625, %r3624; + fma.rm.ftz.f32 %r3626, %r3625, %r373, %r372; + add.f32 %r3627, %r3626, 0fCB40007F; + neg.f32 %r3628, %r3627; + fma.rn.ftz.f32 %r3629, %r3011, %r377, %r3628; + fma.rn.ftz.f32 %r3630, %r3011, %r379, %r3629; + shl.b32 %r3631, %r3626, 23; + ex2.approx.ftz.f32 %r3632, %r3630; + mul.f32 %r3633, %r3632, %r3631; + fma.rn.ftz.f32 %r3634, %r3012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3635, %r3634; + fma.rm.ftz.f32 %r3636, %r3635, %r373, %r372; + add.f32 %r3637, %r3636, 0fCB40007F; + neg.f32 %r3638, %r3637; + fma.rn.ftz.f32 %r3639, %r3012, %r377, %r3638; + fma.rn.ftz.f32 %r3640, %r3012, %r379, %r3639; + shl.b32 %r3641, %r3636, 23; + ex2.approx.ftz.f32 %r3642, %r3640; + mul.f32 %r3643, %r3642, %r3641; + fma.rn.ftz.f32 %r3644, %r3013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3645, %r3644; + fma.rm.ftz.f32 %r3646, %r3645, %r373, %r372; + add.f32 %r3647, %r3646, 0fCB40007F; + neg.f32 %r3648, %r3647; + fma.rn.ftz.f32 %r3649, %r3013, %r377, %r3648; + fma.rn.ftz.f32 %r3650, %r3013, %r379, %r3649; + shl.b32 %r3651, %r3646, 23; + ex2.approx.ftz.f32 %r3652, %r3650; + mul.f32 %r3653, %r3652, %r3651; + .loc 1 62 23 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:62:23 + div.full.f32 %r3654, %r3023, %r2885; + div.full.f32 %r3655, %r3033, %r2885; + div.full.f32 %r3656, %r3043, %r2885; + div.full.f32 %r3657, %r3053, %r2885; + div.full.f32 %r3658, %r3063, %r2885; + div.full.f32 %r3659, %r3073, %r2885; + div.full.f32 %r3660, %r3083, %r2885; + div.full.f32 %r3661, %r3093, %r2885; + div.full.f32 %r3662, %r3103, %r2885; + div.full.f32 %r3663, %r3113, %r2885; + div.full.f32 %r3664, %r3123, %r2885; + div.full.f32 %r3665, %r3133, %r2885; + div.full.f32 %r3666, %r3143, %r2885; + div.full.f32 %r3667, %r3153, %r2885; + div.full.f32 %r3668, %r3163, %r2885; + div.full.f32 %r3669, %r3173, %r2885; + div.full.f32 %r3670, %r3183, %r2885; + div.full.f32 %r3671, %r3193, %r2885; + div.full.f32 %r3672, %r3203, %r2885; + div.full.f32 %r3673, %r3213, %r2885; + div.full.f32 %r3674, %r3223, %r2885; + div.full.f32 %r3675, %r3233, %r2885; + div.full.f32 %r3676, %r3243, %r2885; + div.full.f32 %r3677, %r3253, %r2885; + div.full.f32 %r3678, %r3263, %r2885; + div.full.f32 %r3679, %r3273, %r2885; + div.full.f32 %r3680, %r3283, %r2885; + div.full.f32 %r3681, %r3293, %r2885; + div.full.f32 %r3682, %r3303, %r2885; + div.full.f32 %r3683, %r3313, %r2885; + div.full.f32 %r3684, %r3323, %r2885; + div.full.f32 %r3685, %r3333, %r2885; + div.full.f32 %r3686, %r3343, %r2885; + div.full.f32 %r3687, %r3353, %r2885; + div.full.f32 %r3688, %r3363, %r2885; + div.full.f32 %r3689, %r3373, %r2885; + div.full.f32 %r3690, %r3383, %r2885; + div.full.f32 %r3691, %r3393, %r2885; + div.full.f32 %r3692, %r3403, %r2885; + div.full.f32 %r3693, %r3413, %r2885; + div.full.f32 %r3694, %r3423, %r2885; + div.full.f32 %r3695, %r3433, %r2885; + div.full.f32 %r3696, %r3443, %r2885; + div.full.f32 %r3697, %r3453, %r2885; + div.full.f32 %r3698, %r3463, %r2885; + div.full.f32 %r3699, %r3473, %r2885; + div.full.f32 %r3700, %r3483, %r2885; + div.full.f32 %r3701, %r3493, %r2885; + div.full.f32 %r3702, %r3503, %r2885; + div.full.f32 %r3703, %r3513, %r2885; + div.full.f32 %r3704, %r3523, %r2885; + div.full.f32 %r3705, %r3533, %r2885; + div.full.f32 %r3706, %r3543, %r2885; + div.full.f32 %r3707, %r3553, %r2885; + div.full.f32 %r3708, %r3563, %r2885; + div.full.f32 %r3709, %r3573, %r2885; + div.full.f32 %r3710, %r3583, %r2885; + div.full.f32 %r3711, %r3593, %r2885; + div.full.f32 %r3712, %r3603, %r2885; + div.full.f32 %r3713, %r3613, %r2885; + div.full.f32 %r3714, %r3623, %r2885; + div.full.f32 %r3715, %r3633, %r2885; + div.full.f32 %r3716, %r3643, %r2885; + div.full.f32 %r3717, %r3653, %r2885; + .loc 1 63 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:29 + mad.wide.s32 %rd49, %r222, 4, %rd66; + mad.wide.s32 %rd50, %r224, 4, %rd66; + mad.wide.s32 %rd51, %r225, 4, %rd66; + mad.wide.s32 %rd52, %r226, 4, %rd66; + mad.wide.s32 %rd53, %r227, 4, %rd66; + mad.wide.s32 %rd54, %r228, 4, %rd66; + mad.wide.s32 %rd55, %r229, 4, %rd66; + mad.wide.s32 %rd56, %r230, 4, %rd66; + mad.wide.s32 %rd57, %r231, 4, %rd66; + mad.wide.s32 %rd58, %r232, 4, %rd66; + mad.wide.s32 %rd59, %r233, 4, %rd66; + mad.wide.s32 %rd60, %r234, 4, %rd66; + mad.wide.s32 %rd61, %r235, 4, %rd66; + mad.wide.s32 %rd62, %r236, 4, %rd66; + mad.wide.s32 %rd63, %r237, 4, %rd66; + mad.wide.s32 %rd64, %r238, 4, %rd66; + .loc 1 63 53 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:53 + bar.sync 0; + shl.b32 %r3718, %r208, 4; + add.s32 %r3719, %r1901, %r3718; + st.shared.v4.b32 [%r3719], {%r3654, %r3655, %r3656, %r3657}; + xor.b32 %r3720, %r3718, 64; + add.s32 %r3721, %r1901, %r3720; + st.shared.v4.b32 [%r3721+8192], {%r3658, %r3659, %r3660, %r3661}; + bar.sync 0; + shl.b32 %r3722, %r206, 3; + and.b32 %r3723, %r3722, 4080; + and.b32 %r3724, %r206, 1; + neg.s32 %r3725, %r3724; + and.b32 %r3726, %r3725, 8256; + xor.b32 %r3727, %r3726, %r3723; + add.s32 %r3728, %r1901, %r3727; + ld.shared.v4.b32 {%r141, %r142, %r143, %r144}, [%r3728]; + ld.shared.v4.b32 {%r145, %r146, %r147, %r148}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3662, %r3663, %r3664, %r3665}; + st.shared.v4.b32 [%r3721+8192], {%r3666, %r3667, %r3668, %r3669}; + bar.sync 0; + ld.shared.v4.b32 {%r149, %r150, %r151, %r152}, [%r3728]; + ld.shared.v4.b32 {%r153, %r154, %r155, %r156}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3670, %r3671, %r3672, %r3673}; + st.shared.v4.b32 [%r3721+8192], {%r3674, %r3675, %r3676, %r3677}; + bar.sync 0; + ld.shared.v4.b32 {%r157, %r158, %r159, %r160}, [%r3728]; + ld.shared.v4.b32 {%r161, %r162, %r163, %r164}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3678, %r3679, %r3680, %r3681}; + st.shared.v4.b32 [%r3721+8192], {%r3682, %r3683, %r3684, %r3685}; + bar.sync 0; + ld.shared.v4.b32 {%r165, %r166, %r167, %r168}, [%r3728]; + ld.shared.v4.b32 {%r169, %r170, %r171, %r172}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3686, %r3687, %r3688, %r3689}; + st.shared.v4.b32 [%r3721+8192], {%r3690, %r3691, %r3692, %r3693}; + bar.sync 0; + ld.shared.v4.b32 {%r173, %r174, %r175, %r176}, [%r3728]; + ld.shared.v4.b32 {%r177, %r178, %r179, %r180}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3694, %r3695, %r3696, %r3697}; + st.shared.v4.b32 [%r3721+8192], {%r3698, %r3699, %r3700, %r3701}; + bar.sync 0; + ld.shared.v4.b32 {%r181, %r182, %r183, %r184}, [%r3728]; + ld.shared.v4.b32 {%r185, %r186, %r187, %r188}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3702, %r3703, %r3704, %r3705}; + st.shared.v4.b32 [%r3721+8192], {%r3706, %r3707, %r3708, %r3709}; + bar.sync 0; + ld.shared.v4.b32 {%r189, %r190, %r191, %r192}, [%r3728]; + ld.shared.v4.b32 {%r193, %r194, %r195, %r196}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3710, %r3711, %r3712, %r3713}; + st.shared.v4.b32 [%r3721+8192], {%r3714, %r3715, %r3716, %r3717}; + bar.sync 0; + ld.shared.v4.b32 {%r197, %r198, %r199, %r200}, [%r3728]; + ld.shared.v4.b32 {%r201, %r202, %r203, %r204}, [%r3728+4096]; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd49 + 0 ], { %r141, %r142, %r143, %r144 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd50 + 0 ], { %r145, %r146, %r147, %r148 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd51 + 0 ], { %r149, %r150, %r151, %r152 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd52 + 0 ], { %r153, %r154, %r155, %r156 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd53 + 0 ], { %r157, %r158, %r159, %r160 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd54 + 0 ], { %r161, %r162, %r163, %r164 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd55 + 0 ], { %r165, %r166, %r167, %r168 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd56 + 0 ], { %r169, %r170, %r171, %r172 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd57 + 0 ], { %r173, %r174, %r175, %r176 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd58 + 0 ], { %r177, %r178, %r179, %r180 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd59 + 0 ], { %r181, %r182, %r183, %r184 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd60 + 0 ], { %r185, %r186, %r187, %r188 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd61 + 0 ], { %r189, %r190, %r191, %r192 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd62 + 0 ], { %r193, %r194, %r195, %r196 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd63 + 0 ], { %r197, %r198, %r199, %r200 }; + // end inline asm + // begin inline asm + @%p38 st.global.v4.b32 [ %rd64 + 0 ], { %r201, %r202, %r203, %r204 }; + // end inline asm + .loc 1 52 4 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:4 + ret; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 105 +.b8 119 +.b8 110 +.b8 104 +.b8 115 +.b8 102 +.b8 53 +.b8 107 +.b8 102 +.b8 109 +.b8 109 +.b8 55 +.b8 106 +.b8 110 +.b8 122 +.b8 114 +.b8 107 +.b8 121 +.b8 105 +.b8 118 +.b8 52 +.b8 120 +.b8 51 +.b8 121 +.b8 97 +.b8 104 +.b8 106 +.b8 111 +.b8 103 +.b8 54 +.b8 100 +.b8 121 +.b8 104 +.b8 102 +.b8 52 +.b8 112 +.b8 114 +.b8 109 +.b8 50 +.b8 99 +.b8 106 +.b8 100 +.b8 105 +.b8 53 +.b8 120 +.b8 104 +.b8 108 +.b8 108 +.b8 120 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..e2df6bfde70760a7723fbf6e03a25f91335f0e70 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x32768xi1> loc(#loc119) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32> -> tensor<1x32768xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_32768__(1,)cconstexpr_fp32_"() : () -> tensor<1x32768xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32768xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x32768xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x32768xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x32768xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x32768xf32> to tensor<1x32768xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_32768S_fp32S1_32768S_fp32S1_32768S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x32768xf32>, tensor<1x32768xf32>, tensor<1x32768xf32>) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_32768S_fp32S1_32768S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x32768xf32>, tensor<1x32768xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c32768_i32_13 = arith.constant 32768 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c32768_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32768xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x32768xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32768xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x32768xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x32768xf32> to tensor<1x32768xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x32768xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x32768xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x32768x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_32768__(1,)cconstexpr_fp32_"() -> tensor<1x32768xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x32768xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x32768xf32> loc(#loc47) + tt.return %0 : tensor<1x32768xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_32768S_fp32S1_32768S_fp32S1_32768S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x32768xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x32768xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x32768xf32> loc("rhs_max"(#loc48))) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_32768S_fp32S1_32768S__(%lhs_max, %rhs_max) : (tensor<1x32768xf32>, tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x32768xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x32768xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x32768xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x32768xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x32768xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x32768xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x32768xf32> loc(#loc61) + %1 = ub.poison : tensor<1x32768xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_32768S_fp32S1_32768S__(%a: tensor<1x32768xf32> loc("a"(#loc62)), %b: tensor<1x32768xf32> loc("b"(#loc62))) -> tensor<1x32768xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x32768xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_32768S__(%a) : (tensor<1x32768xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x32768xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x32768xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x32768xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x32768xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x32768xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc69) + tt.return %2 : tensor<1x32768xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x32768xf32> loc(#loc71) + tt.return %3 : tensor<1x32768xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_32768S__(%x: tensor<1x32768xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_32768S__(%x) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_32768S__(%x: tensor<1x32768xf32> loc("x"(#loc76))) -> tensor<1x32768xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x32768xf32> loc(#loc78) + tt.return %4 : tensor<1x32768xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x32768xf32> loc(#loc80) + tt.return %5 : tensor<1x32768xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%x: tensor<1x32768xf32> loc("x"(#loc81))) -> tensor<1x32768xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc82) + tt.return %0 : tensor<1x32768xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x32768xf32> loc(#loc84) + tt.return %1 : tensor<1x32768xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_32768S_fp32S1_32768S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x32768xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x32768xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_32768S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x32768xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x32768xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x32768xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_32768S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_32768S__(1,)cconstexpr_1_"(%a: tensor<1x32768xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_32768S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32768xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fe7b1166529957e43cec02f595b2b9af66a7c510 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,201 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc1 = loc(unknown) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("out_ptr2"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc74 = loc("out_max"(#loc27)) +#loc83 = loc("out_sum"(#loc38)) +#loc102 = loc(callsite(#loc74 at #loc28)) +#loc110 = loc(callsite(#loc83 at #loc28)) +#loc116 = loc(callsite(#loc1 at #loc102)) +#loc119 = loc(callsite(#loc1 at #loc110)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x32768xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<1x32768xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x32768xbf16, #blocked1> loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc54) + %r0_base_6 = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc54) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32768xi32, #blocked1> loc(#loc54) + %r0_base_8 = tt.expand_dims %r0_base_6 {axis = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32768xi32, #blocked> loc(#loc54) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_0 : tensor<1x32768xi32, #blocked1> loc(#loc55) + %r0_mask_9 = arith.cmpi slt, %r0_base_8, %cst : tensor<1x32768xi32, #blocked> loc(#loc55) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc56) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32, #blocked1> loc(#loc90) + %tmp0_11 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32, #blocked> loc(#loc90) + %tmp0_12 = arith.addi %r0_base_7, %tmp0_10 : tensor<1x32768xi32, #blocked1> loc(#loc57) + %tmp0_13 = arith.addi %r0_base_8, %tmp0_11 : tensor<1x32768xi32, #blocked> loc(#loc57) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr, #blocked1> loc(#loc58) + %tmp0_15 = tt.addptr %tmp0_14, %tmp0_12 : tensor<1x32768x!tt.ptr, #blocked1>, tensor<1x32768xi32, #blocked1> loc(#loc58) + %tmp0_16 = tt.load %tmp0_15, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr, #blocked1> loc(#loc59) + %tmp0_17 = arith.extf %tmp0_16 : tensor<1x32768xbf16, #blocked1> to tensor<1x32768xf32, #blocked1> loc(#loc60) + %mask = arith.cmpf ogt, %cst_5, %tmp0_17 : tensor<1x32768xf32, #blocked1> loc(#loc111) + %out_max = arith.select %mask, %cst_5, %tmp0_17 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc112) + %lhs_scale = arith.cmpf oeq, %out_max, %cst_5 : tensor<1x32768xf32, #blocked1> loc(#loc93) + %lhs_scale_18 = arith.subf %cst_5, %out_max : tensor<1x32768xf32, #blocked1> loc(#loc94) + %lhs_scale_19 = tt.extern_elementwise %lhs_scale_18 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc113) + %lhs_scale_20 = arith.select %lhs_scale, %cst_3, %lhs_scale_19 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc96) + %rhs_scale = arith.subf %tmp0_17, %out_max : tensor<1x32768xf32, #blocked1> loc(#loc97) + %rhs_scale_21 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc114) + %rhs_scale_22 = arith.select %lhs_scale, %cst_3, %rhs_scale_21 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc99) + %out_sum = arith.mulf %lhs_scale_20, %cst_4 : tensor<1x32768xf32, #blocked1> loc(#loc100) + %out_sum_23 = arith.addf %out_sum, %rhs_scale_22 : tensor<1x32768xf32, #blocked1> loc(#loc101) + %_tmp3_max = arith.select %r0_mask, %out_max, %cst_5 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc72) + %_tmp3_sum = arith.select %r0_mask, %out_sum_23, %cst_4 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc73) + %out_max_24 = "tt.reduce"(%_tmp3_max) <{axis = 1 : i32}> ({ + ^bb0(%out_max_34: f32 loc(callsite(#loc1 at #loc102)), %out_max_35: f32 loc(callsite(#loc1 at #loc102))): + %mask_36 = arith.cmpf ogt, %out_max_34, %out_max_35 : f32 loc(#loc120) + %mask_37 = arith.cmpf une, %out_max_34, %out_max_34 : f32 loc(#loc121) + %mask_38 = arith.ori %mask_36, %mask_37 : i1 loc(#loc122) + %out_max_39 = arith.select %mask_38, %out_max_34, %out_max_35 : f32 loc(#loc123) + tt.reduce.return %out_max_39 : f32 loc(#loc115) + }) : (tensor<1x32768xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc115) + %out_max_keepdim = tt.expand_dims %out_max_24 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc104) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_2 : tensor<1x1xf32, #blocked1> loc(#loc105) + %delta_25 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked1> -> tensor<1x32768xf32, #blocked1> loc(#loc106) + %delta_26 = arith.subf %_tmp3_max, %delta_25 : tensor<1x32768xf32, #blocked1> loc(#loc106) + %delta_27 = tt.broadcast %delta : tensor<1x1xi1, #blocked1> -> tensor<1x32768xi1, #blocked1> loc(#loc107) + %delta_28 = arith.select %delta_27, %cst_4, %delta_26 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc107) + %out_sum_29 = tt.extern_elementwise %delta_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc117) + %out_sum_30 = arith.mulf %_tmp3_sum, %out_sum_29 : tensor<1x32768xf32, #blocked1> loc(#loc109) + %out_sum_31 = "tt.reduce"(%out_sum_30) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_34: f32 loc(callsite(#loc1 at #loc110)), %out_sum_35: f32 loc(callsite(#loc1 at #loc110))): + %out_sum_36 = arith.addf %out_sum_34, %out_sum_35 : f32 loc(#loc124) + tt.reduce.return %out_sum_36 : f32 loc(#loc118) + }) : (tensor<1x32768xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc118) + %tmp4 = tt.expand_dims %out_sum_31 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc84) + %tmp5 = tt.load %tmp0_15, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr, #blocked1> loc(#loc85) + %tmp5_32 = arith.extf %tmp5 : tensor<1x32768xbf16, #blocked1> to tensor<1x32768xf32, #blocked1> loc(#loc86) + %tmp7 = arith.subf %tmp5_32, %delta_25 : tensor<1x32768xf32, #blocked1> loc(#loc87) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc88) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked1> -> tensor<1x32768xf32, #blocked1> loc(#loc89) + %tmp9_33 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32, #blocked1> loc(#loc89) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr, #blocked> loc(#loc46) + %1 = tt.addptr %0, %tmp0_13 : tensor<1x32768x!tt.ptr, #blocked>, tensor<1x32768xi32, #blocked> loc(#loc46) + %2 = ttg.convert_layout %tmp9_33 : tensor<1x32768xf32, #blocked1> -> tensor<1x32768xf32, #blocked> loc(#loc47) + tt.store %1, %2, %r0_mask_9 : tensor<1x32768x!tt.ptr, #blocked> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc10 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc11 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc53 = loc("xoffset"(#loc2)) +#loc54 = loc("r0_base"(#loc3)) +#loc55 = loc("r0_mask"(#loc4)) +#loc56 = loc("tmp0"(#loc5)) +#loc57 = loc("tmp0"(#loc6)) +#loc58 = loc("tmp0"(#loc7)) +#loc59 = loc("tmp0"(#loc8)) +#loc60 = loc("tmp0"(#loc9)) +#loc61 = loc("mask"(#loc10)) +#loc62 = loc("out_max"(#loc11)) +#loc63 = loc("lhs_scale"(#loc14)) +#loc64 = loc("lhs_scale"(#loc15)) +#loc65 = loc("lhs_scale"(#loc17)) +#loc66 = loc("lhs_scale"(#loc18)) +#loc67 = loc("rhs_scale"(#loc19)) +#loc68 = loc("rhs_scale"(#loc20)) +#loc69 = loc("rhs_scale"(#loc21)) +#loc70 = loc("out_sum"(#loc22)) +#loc71 = loc("out_sum"(#loc23)) +#loc72 = loc("_tmp3_max"(#loc24)) +#loc73 = loc("_tmp3_sum"(#loc25)) +#loc75 = loc("mask"(#loc29)) +#loc76 = loc("mask"(#loc30)) +#loc77 = loc("out_max_keepdim"(#loc31)) +#loc78 = loc("delta"(#loc32)) +#loc79 = loc("delta"(#loc33)) +#loc80 = loc("delta"(#loc34)) +#loc81 = loc("out_sum"(#loc35)) +#loc82 = loc("out_sum"(#loc36)) +#loc84 = loc("tmp4"(#loc40)) +#loc85 = loc("tmp5"(#loc41)) +#loc86 = loc("tmp5"(#loc42)) +#loc87 = loc("tmp7"(#loc43)) +#loc88 = loc("tmp8"(#loc44)) +#loc89 = loc("tmp9"(#loc45)) +#loc90 = loc(fused[#loc57, #loc56]) +#loc91 = loc("mask"(#loc61)) +#loc92 = loc(callsite(#loc62 at #loc12)) +#loc93 = loc(callsite(#loc63 at #loc12)) +#loc94 = loc(callsite(#loc64 at #loc12)) +#loc95 = loc(callsite(#loc65 at #loc12)) +#loc96 = loc(callsite(#loc66 at #loc12)) +#loc97 = loc(callsite(#loc67 at #loc12)) +#loc98 = loc(callsite(#loc68 at #loc12)) +#loc99 = loc(callsite(#loc69 at #loc12)) +#loc100 = loc(callsite(#loc70 at #loc12)) +#loc101 = loc(callsite(#loc71 at #loc12)) +#loc103 = loc("mask"(#loc76)) +#loc104 = loc(callsite(#loc77 at #loc28)) +#loc105 = loc(callsite(#loc78 at #loc28)) +#loc106 = loc(callsite(#loc79 at #loc28)) +#loc107 = loc(callsite(#loc80 at #loc28)) +#loc108 = loc(callsite(#loc81 at #loc28)) +#loc109 = loc(callsite(#loc82 at #loc28)) +#loc111 = loc(callsite(#loc91 at #loc92)) +#loc112 = loc(callsite(#loc13 at #loc92)) +#loc113 = loc(callsite(#loc16 at #loc95)) +#loc114 = loc(callsite(#loc16 at #loc98)) +#loc115 = loc(callsite(#loc26 at #loc102)) +#loc117 = loc(callsite(#loc16 at #loc108)) +#loc118 = loc(callsite(#loc37 at #loc110)) +#loc120 = loc(callsite(#loc91 at #loc115)) +#loc121 = loc(callsite(#loc75 at #loc115)) +#loc122 = loc(callsite(#loc103 at #loc115)) +#loc123 = loc(callsite(#loc13 at #loc115)) +#loc124 = loc(callsite(#loc39 at #loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f6c8ac826745d6229d330ad95ff6f941687a214d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/EOO62W3V32F6RVNGZQIHPTR4XXANE33JTCGWAERXQXEXRRQRPD3Q/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,195 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr2"(#loc)) +#loc52 = loc("xnumel"(#loc)) +#loc53 = loc("r0_numel"(#loc)) +#loc78 = loc("out_max"(#loc30)) +#loc86 = loc("out_sum"(#loc39)) +#loc106 = loc(callsite(#loc78 at #loc3)) +#loc113 = loc(callsite(#loc86 at #loc3)) +#loc119 = loc(callsite(#loc1 at #loc106)) +#loc122 = loc(callsite(#loc1 at #loc113)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc93) + %cst = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc55) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x32768xbf16> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc57) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32> -> tensor<1x32768xi32> loc(#loc58) + %r0_mask = arith.cmpi slt, %r0_base_4, %cst_2 : tensor<1x32768xi32> loc(#loc59) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc60) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32> loc(#loc94) + %tmp0_6 = arith.addi %r0_base_4, %tmp0_5 : tensor<1x32768xi32> loc(#loc61) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc62) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc62) + %tmp0_9 = tt.load %tmp0_8, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr> loc(#loc63) + %tmp0_10 = arith.extf %tmp0_9 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc64) + %mask = arith.cmpf ogt, %cst_3, %tmp0_10 : tensor<1x32768xf32> loc(#loc114) + %out_max = arith.select %mask, %cst_3, %tmp0_10 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc115) + %lhs_scale = arith.cmpf oeq, %out_max, %cst_3 : tensor<1x32768xf32> loc(#loc97) + %lhs_scale_11 = arith.subf %cst_3, %out_max : tensor<1x32768xf32> loc(#loc98) + %lhs_scale_12 = tt.extern_elementwise %lhs_scale_11 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc116) + %lhs_scale_13 = arith.select %lhs_scale, %cst, %lhs_scale_12 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc100) + %rhs_scale = arith.subf %tmp0_10, %out_max : tensor<1x32768xf32> loc(#loc101) + %rhs_scale_14 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc117) + %rhs_scale_15 = arith.select %lhs_scale, %cst, %rhs_scale_14 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc103) + %out_sum = arith.mulf %lhs_scale_13, %cst_0 : tensor<1x32768xf32> loc(#loc104) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x32768xf32> loc(#loc105) + %_tmp3_max = arith.select %r0_mask, %out_max, %cst_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc76) + %_tmp3_sum = arith.select %r0_mask, %out_sum_16, %cst_0 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc77) + %out_max_17 = "tt.reduce"(%_tmp3_max) <{axis = 1 : i32}> ({ + ^bb0(%out_max_28: f32 loc(callsite(#loc1 at #loc106)), %out_max_29: f32 loc(callsite(#loc1 at #loc106))): + %mask_30 = arith.cmpf ogt, %out_max_28, %out_max_29 : f32 loc(#loc123) + %mask_31 = arith.cmpf une, %out_max_28, %out_max_28 : f32 loc(#loc124) + %mask_32 = arith.ori %mask_30, %mask_31 : i1 loc(#loc125) + %out_max_33 = arith.select %mask_32, %out_max_28, %out_max_29 : f32 loc(#loc126) + tt.reduce.return %out_max_33 : f32 loc(#loc118) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc118) + %out_max_keepdim = tt.expand_dims %out_max_17 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc108) + %delta_18 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc93) + %delta_19 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc109) + %delta_20 = arith.subf %_tmp3_max, %delta_19 : tensor<1x32768xf32> loc(#loc109) + %delta_21 = tt.broadcast %delta_18 : tensor<1x1xi1> -> tensor<1x32768xi1> loc(#loc110) + %delta_22 = arith.select %delta_21, %cst_0, %delta_20 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc110) + %out_sum_23 = tt.extern_elementwise %delta_22 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc120) + %out_sum_24 = arith.mulf %_tmp3_sum, %out_sum_23 : tensor<1x32768xf32> loc(#loc112) + %out_sum_25 = "tt.reduce"(%out_sum_24) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_28: f32 loc(callsite(#loc1 at #loc113)), %out_sum_29: f32 loc(callsite(#loc1 at #loc113))): + %out_sum_30 = arith.addf %out_sum_28, %out_sum_29 : f32 loc(#loc127) + tt.reduce.return %out_sum_30 : f32 loc(#loc121) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc121) + %tmp4 = tt.expand_dims %out_sum_25 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc87) + %tmp5 = tt.load %tmp0_8, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr> loc(#loc88) + %tmp5_26 = arith.extf %tmp5 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc89) + %tmp7 = arith.subf %tmp5_26, %delta_19 : tensor<1x32768xf32> loc(#loc90) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc91) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc92) + %tmp9_27 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32> loc(#loc92) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc47) + %1 = tt.addptr %0, %tmp0_6 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc47) + tt.store %1, %tmp9_27, %r0_mask : tensor<1x32768x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc54 = loc("delta"(#loc2)) +#loc55 = loc(callsite(#loc1 at #loc4)) +#loc56 = loc("xoffset"(#loc5)) +#loc57 = loc("r0_base"(#loc6)) +#loc58 = loc("r0_base"(#loc7)) +#loc59 = loc("r0_mask"(#loc8)) +#loc60 = loc("tmp0"(#loc9)) +#loc61 = loc("tmp0"(#loc10)) +#loc62 = loc("tmp0"(#loc11)) +#loc63 = loc("tmp0"(#loc12)) +#loc64 = loc("tmp0"(#loc13)) +#loc65 = loc("mask"(#loc14)) +#loc66 = loc("out_max"(#loc15)) +#loc67 = loc("lhs_scale"(#loc17)) +#loc68 = loc("lhs_scale"(#loc18)) +#loc69 = loc("lhs_scale"(#loc20)) +#loc70 = loc("lhs_scale"(#loc21)) +#loc71 = loc("rhs_scale"(#loc22)) +#loc72 = loc("rhs_scale"(#loc23)) +#loc73 = loc("rhs_scale"(#loc24)) +#loc74 = loc("out_sum"(#loc25)) +#loc75 = loc("out_sum"(#loc26)) +#loc76 = loc("_tmp3_max"(#loc27)) +#loc77 = loc("_tmp3_sum"(#loc28)) +#loc79 = loc("mask"(#loc31)) +#loc80 = loc("mask"(#loc32)) +#loc81 = loc("out_max_keepdim"(#loc33)) +#loc82 = loc("delta"(#loc34)) +#loc83 = loc("delta"(#loc35)) +#loc84 = loc("out_sum"(#loc36)) +#loc85 = loc("out_sum"(#loc37)) +#loc87 = loc("tmp4"(#loc41)) +#loc88 = loc("tmp5"(#loc42)) +#loc89 = loc("tmp5"(#loc43)) +#loc90 = loc("tmp7"(#loc44)) +#loc91 = loc("tmp8"(#loc45)) +#loc92 = loc("tmp9"(#loc46)) +#loc93 = loc(callsite(#loc54 at #loc3)) +#loc94 = loc(fused[#loc61, #loc60]) +#loc95 = loc("mask"(#loc65)) +#loc96 = loc(callsite(#loc66 at #loc4)) +#loc97 = loc(callsite(#loc67 at #loc4)) +#loc98 = loc(callsite(#loc68 at #loc4)) +#loc99 = loc(callsite(#loc69 at #loc4)) +#loc100 = loc(callsite(#loc70 at #loc4)) +#loc101 = loc(callsite(#loc71 at #loc4)) +#loc102 = loc(callsite(#loc72 at #loc4)) +#loc103 = loc(callsite(#loc73 at #loc4)) +#loc104 = loc(callsite(#loc74 at #loc4)) +#loc105 = loc(callsite(#loc75 at #loc4)) +#loc107 = loc("mask"(#loc80)) +#loc108 = loc(callsite(#loc81 at #loc3)) +#loc109 = loc(callsite(#loc82 at #loc3)) +#loc110 = loc(callsite(#loc83 at #loc3)) +#loc111 = loc(callsite(#loc84 at #loc3)) +#loc112 = loc(callsite(#loc85 at #loc3)) +#loc114 = loc(callsite(#loc95 at #loc96)) +#loc115 = loc(callsite(#loc16 at #loc96)) +#loc116 = loc(callsite(#loc19 at #loc99)) +#loc117 = loc(callsite(#loc19 at #loc102)) +#loc118 = loc(callsite(#loc29 at #loc106)) +#loc120 = loc(callsite(#loc19 at #loc111)) +#loc121 = loc(callsite(#loc38 at #loc113)) +#loc123 = loc(callsite(#loc95 at #loc118)) +#loc124 = loc(callsite(#loc79 at #loc118)) +#loc125 = loc(callsite(#loc107 at #loc118)) +#loc126 = loc(callsite(#loc16 at #loc118)) +#loc127 = loc(callsite(#loc40 at #loc121)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9dae29753f5b7a85556aae04887dd554065d0481 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..55e84e6ecda84473f8c4289f6fa90a87f6975a6e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e090e69169bebe56ef7149974c10d057f011f555 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "2b3c2b5fad1b69293a163ec86269abe219ddb58dbb38d468b4aa35ef6f1e2325", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..84bf14c5d2c48cbf7c57a889b1c948dd0e07cd72 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,186 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 6, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 252, !dbg !9 + %14 = lshr exact i32 %13, 2, !dbg !9 + %15 = or disjoint i32 %14, %11, !dbg !10 + %16 = icmp slt i32 %15, %5, !dbg !11 + %17 = and i32 %12, 3, !dbg !12 + %18 = sext i32 %15 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %19 = sdiv i64 %18, %.frozen, !dbg !14 + %20 = mul i64 %19, %.frozen, !dbg !13 + %.decomposed = sub i64 %18, %20, !dbg !13 + %21 = srem i64 %19, 32, !dbg !15 + %22 = sdiv i64 %18, %4, !dbg !16 + %.not = icmp ne i64 %.decomposed, 0, !dbg !17 + %23 = icmp slt i32 %11, 0, !dbg !21 + %24 = icmp slt i64 %3, 0, !dbg !22 + %25 = xor i1 %23, %24, !dbg !23 + %narrow = select i1 %25, i1 %.not, i1 false, !dbg !24 + %26 = sext i1 %narrow to i64, !dbg !24 + %27 = add nsw i64 %19, %26, !dbg !24 + %28 = shl i64 %3, 12, !dbg !25 + %29 = mul i64 %28, %22, !dbg !26 + %30 = icmp slt i64 %3, 2, !dbg !27 + %31 = icmp sgt i64 %3, 1, !dbg !28 + %32 = select i1 %31, i64 %3, i64 0, !dbg !29 + %33 = zext i1 %30 to i64, !dbg !30 + %34 = add i64 %32, %33, !dbg !31 + %35 = shl i64 %34, 7, !dbg !32 + %36 = mul i64 %35, %27, !dbg !33 + %.idx = shl nsw i64 %21, 8 + %37 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %.idx1 = shl nsw i64 %.decomposed, 13 + %invariant.gep = getelementptr i8, ptr addrspace(1) %37, i64 %.idx1, !dbg !34 + %invariant.gep3 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %29, !dbg !34 + %.idx2 = shl nsw i64 %.decomposed, 8 + %38 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2 + %invariant.gep5 = getelementptr bfloat, ptr addrspace(1) %38, i64 %36, !dbg !34 + %.fr = freeze i1 %16 + %39 = zext nneg i32 %17 to i64, !dbg !34 + br i1 %.fr, label %.split.us, label %.split + +.split.us: ; preds = %9, %.split.us + %indvars.iv9 = phi i64 [ %indvars.iv.next10, %.split.us ], [ 0, %9 ] + %40 = phi float [ %51, %.split.us ], [ 0.000000e+00, %9 ] + %41 = or disjoint i64 %indvars.iv9, %39, !dbg !35 + %gep4.us = getelementptr bfloat, ptr addrspace(1) %invariant.gep3, i64 %41, !dbg !36 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !37 + %43 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep4.us, i64 %42, i1 true) #4, !dbg !37 + %44 = bitcast i16 %43 to bfloat, !dbg !37 + %45 = fpext bfloat %44 to float, !dbg !38 + %gep.us = getelementptr bfloat, ptr addrspace(1) %invariant.gep5, i64 %41, !dbg !39 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %47 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep.us, i64 %46, i1 true) #4, !dbg !40 + %48 = bitcast i16 %47 to bfloat, !dbg !40 + %49 = fpext bfloat %48 to float, !dbg !41 + %50 = fmul float %45, %49, !dbg !42 + %51 = fadd float %40, %50, !dbg !43 + %indvars.iv.next10 = add nuw nsw i64 %indvars.iv9, 4, !dbg !34 + %52 = icmp samesign ult i64 %indvars.iv9, 124, !dbg !34 + br i1 %52, label %.split.us, label %.split7.us, !dbg !34 + +.split: ; preds = %9, %.split + %indvars.iv = phi i64 [ %indvars.iv.next, %.split ], [ 0, %9 ] + %53 = or disjoint i64 %indvars.iv, %39, !dbg !35 + %gep4 = getelementptr bfloat, ptr addrspace(1) %invariant.gep3, i64 %53, !dbg !36 + %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !37 + %55 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep4, i64 %54, i1 false) #4, !dbg !37 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep5, i64 %53, !dbg !39 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %57 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %56, i1 false) #4, !dbg !40 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !34 + %58 = icmp samesign ult i64 %indvars.iv, 124, !dbg !34 + br i1 %58, label %.split, label %.split7.us, !dbg !34 + +.split7.us: ; preds = %.split, %.split.us + %.us-phi = phi float [ %51, %.split.us ], [ 0.000000e+00, %.split ], !dbg !9 + %59 = and i32 %12, 63, !dbg !9 + %60 = or disjoint i32 %11, %59, !dbg !10 + %61 = icmp slt i32 %60, %5, !dbg !11 + %62 = bitcast float %.us-phi to i32, !dbg !44 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 2, i32 31), !dbg !44 + %64 = bitcast i32 %63 to float, !dbg !44 + %65 = fadd float %.us-phi, %64, !dbg !48 + %66 = bitcast float %65 to i32, !dbg !44 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 1, i32 31), !dbg !44 + %68 = bitcast i32 %67 to float, !dbg !44 + %69 = fadd float %65, %68, !dbg !48 + %70 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %13, !dbg !49 + store float %69, ptr addrspace(3) %70, align 4, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %71 = shl nuw nsw i32 %59, 2, !dbg !49 + %72 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %71, !dbg !49 + %73 = load i32, ptr addrspace(3) %72, align 4, !dbg !49 + %74 = sext i32 %60 to i64, !dbg !50 + %75 = getelementptr float, ptr addrspace(1) %2, i64 %74, !dbg !50 + %76 = and i32 %12, 192, !dbg !51 + %77 = icmp eq i32 %76, 0, !dbg !51 + %78 = and i1 %77, %61, !dbg !51 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %73, ptr addrspace(1) %75, i1 %78) #4, !dbg !51 + ret void, !dbg !52 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 21, scope: !4) +!15 = !DILocation(line: 28, column: 28, scope: !4) +!16 = !DILocation(line: 29, column: 19, scope: !4) +!17 = !DILocation(line: 74, column: 34, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 30, column: 51, scope: !4) +!21 = !DILocation(line: 75, column: 25, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 75, column: 36, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 75, column: 32, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 75, column: 47, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 39, column: 65, scope: !4) +!26 = !DILocation(line: 39, column: 69, scope: !4) +!27 = !DILocation(line: 40, column: 73, scope: !4) +!28 = !DILocation(line: 40, column: 99, scope: !4) +!29 = !DILocation(line: 40, column: 90, scope: !4) +!30 = !DILocation(line: 40, scope: !4) +!31 = !DILocation(line: 40, column: 81, scope: !4) +!32 = !DILocation(line: 40, column: 54, scope: !4) +!33 = !DILocation(line: 40, column: 58, scope: !4) +!34 = !DILocation(line: 33, column: 40, scope: !4) +!35 = !DILocation(line: 34, column: 31, scope: !4) +!36 = !DILocation(line: 39, column: 34, scope: !4) +!37 = !DILocation(line: 39, column: 74, scope: !4) +!38 = !DILocation(line: 39, column: 136, scope: !4) +!39 = !DILocation(line: 40, column: 34, scope: !4) +!40 = !DILocation(line: 40, column: 106, scope: !4) +!41 = !DILocation(line: 40, column: 168, scope: !4) +!42 = !DILocation(line: 41, column: 22, scope: !4) +!43 = !DILocation(line: 43, column: 23, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !45, inlinedAt: !47) +!45 = distinct !DILexicalBlockFile(scope: !4, file: !46, discriminator: 0) +!46 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!47 = !DILocation(line: 45, column: 25, scope: !4) +!48 = !DILocation(line: 261, column: 15, scope: !45, inlinedAt: !47) +!49 = !DILocation(line: 45, column: 28, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 36, scope: !4) +!52 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e4524c06043ccac974e3dc033385563cd115b27b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,517 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 256 +{ + .reg .pred %p<22>; + .reg .b16 %rs<9>; + .reg .b32 %r<41>; + .reg .b64 %rd<102>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd35, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd36, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r10, %ctaid.x; + .loc 1 22 33 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:33 + shl.b32 %r1, %r10, 6; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r4, %r2, 2, 6; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r11, %r4, %r1; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.s64.s32 %rd1, %r11; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + or.b64 %rd37, %rd1, %rd36; + and.b64 %rd38, %rd37, -4294967296; + setp.ne.b64 %p2, %rd38, 0; + cvt.u32.u64 %r38, %rd1; + @%p2 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd94, %rd1, %rd36; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r12, %rd36; + div.u32 %r14, %r38, %r12; + cvt.u64.u32 %rd94, %r14; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r9, [triton_red_fused_zeros_0_param_5]; + and.b32 %r5, %r2, 3; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd6, %rd94, %rd36; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + shr.s64 %rd39, %rd94, 63; + shr.u64 %rd40, %rd39, 59; + add.s64 %rd41, %rd94, %rd40; + and.b64 %rd42, %rd41, -32; + sub.s64 %rd8, %rd94, %rd42; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + or.b64 %rd43, %rd1, %rd35; + and.b64 %rd44, %rd43, -4294967296; + setp.ne.b64 %p3, %rd44, 0; + @%p3 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd95, %rd1, %rd35; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r15, %rd35; + div.u32 %r17, %r38, %r15; + cvt.u64.u32 %rd95, %r17; +$L__BB0_6: + .loc 1 0 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:19 + ld.param.b64 %rd33, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd32, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd31, [triton_red_fused_zeros_0_param_0]; + and.b32 %r3, %r2, 252; + sub.s64 %rd7, %rd1, %rd6; + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p4, %r38, %r9; +$L__tmp1: + .loc 2 75 25 // triton_helpers.py:75:25 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s32 %p5, %r1, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p6, %rd36, 0; + .loc 2 75 32 // triton_helpers.py:75:32 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + xor.pred %p1, %p5, %p6; +$L__tmp2: + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p7, %rd36, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p8, %rd36, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd45, %rd36, 0, %p8; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd46, 1, 0, %p7; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd12, %rd45, %rd46; + shl.b64 %rd13, %rd8, 8; + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + cvt.u64.u32 %rd14, %r5; + @%p4 bra $L__BB0_9; + bra.uni $L__BB0_7; +$L__BB0_9: // %.split.us.preheader +$L__tmp3: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p14, %rd7, 0; +$L__tmp4: + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + and.pred %p15, %p14, %p1; + selp.b64 %rd71, -1, 0, %p15; + add.s64 %rd72, %rd94, %rd71; + mul.lo.s64 %rd73, %rd72, %rd12; + shl.b64 %rd74, %rd73, 8; + add.s32 %r22, %r1, %r4; + mad.wide.s32 %rd75, %r22, 256, %rd74; + shl.b64 %rd76, %rd14, 1; + or.b64 %rd77, %rd75, %rd76; + shl.b64 %rd78, %rd6, 8; + sub.s64 %rd79, %rd77, %rd78; + add.s64 %rd97, %rd32, %rd79; + mul.lo.s64 %rd80, %rd95, %rd36; + shl.b64 %rd81, %rd80, 13; + mad.wide.s32 %rd82, %r22, 8192, %rd81; + add.s64 %rd83, %rd82, %rd13; + add.s64 %rd84, %rd83, %rd76; + shl.b64 %rd85, %rd6, 13; + sub.s64 %rd86, %rd84, %rd85; + add.s64 %rd96, %rd31, %rd86; + mov.b32 %r40, 0f00000000; + mov.b64 %rd98, -4; +$L__BB0_10: // %.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd87, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd87, 1.0; + // end inline asm + mov.b16 %rs6, 0; + mov.pred %p16, -1; + // begin inline asm + mov.u16 %rs5, %rs6; + @%p16 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs5 }, [ %rd96 + 0 ], %rd87; + // end inline asm + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + cvt.f32.bf16 %r23, %rs5; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd90, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd90, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs6; + @%p16 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs7 }, [ %rd97 + 0 ], %rd90; + // end inline asm + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + cvt.f32.bf16 %r24, %rs7; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r40, %r23, %r24, %r40; + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + add.s64 %rd98, %rd98, 4; + add.s64 %rd97, %rd97, 8; + add.s64 %rd96, %rd96, 8; + setp.lt.u64 %p18, %rd98, 124; + @%p18 bra $L__BB0_10; + bra.uni $L__BB0_11; +$L__BB0_7: // %.split.preheader +$L__tmp5: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p9, %rd7, 0; +$L__tmp6: + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + and.pred %p10, %p9, %p1; + selp.b64 %rd48, -1, 0, %p10; + add.s64 %rd49, %rd94, %rd48; + mul.lo.s64 %rd50, %rd49, %rd12; + shl.b64 %rd51, %rd50, 8; + add.s32 %r19, %r1, %r4; + mad.wide.s32 %rd52, %r19, 256, %rd51; + shl.b64 %rd53, %rd14, 1; + or.b64 %rd54, %rd52, %rd53; + shl.b64 %rd55, %rd6, 8; + sub.s64 %rd56, %rd54, %rd55; + add.s64 %rd100, %rd32, %rd56; + mul.lo.s64 %rd57, %rd95, %rd36; + shl.b64 %rd58, %rd57, 13; + mad.wide.s32 %rd59, %r19, 8192, %rd58; + add.s64 %rd60, %rd59, %rd13; + add.s64 %rd61, %rd60, %rd53; + shl.b64 %rd62, %rd6, 13; + sub.s64 %rd63, %rd61, %rd62; + add.s64 %rd99, %rd31, %rd63; + mov.b64 %rd101, -4; +$L__BB0_8: // %.split + // =>This Inner Loop Header: Depth=1 + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd64, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p11, 0; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p11 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd99 + 0 ], %rd64; + // end inline asm + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p11 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs3 }, [ %rd100 + 0 ], %rd67; + // end inline asm + .loc 1 33 40 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:33:40 + add.s64 %rd101, %rd101, 4; + add.s64 %rd100, %rd100, 8; + add.s64 %rd99, %rd99, 8; + setp.lt.u64 %p13, %rd101, 124; + mov.b32 %r40, 0f00000000; + @%p13 bra $L__BB0_8; +$L__BB0_11: // %.split7.us + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + and.b32 %r26, %r2, 63; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r27, %r1, %r26; + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p20, %r27, %r9; +$L__tmp7: + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r28, %r40, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r29, %r40, %r28; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r31, %r29, %r30; +$L__tmp8: + .loc 1 45 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:28 + mov.b32 %r32, global_smem; + add.s32 %r33, %r32, %r3; + st.shared.b32 [%r33], %r31; + bar.sync 0; + shl.b32 %r34, %r26, 2; + add.s32 %r35, %r32, %r34; + ld.shared.b32 %r25, [%r35]; + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + mad.wide.s32 %rd93, %r27, 4, %rd33; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r36, %r2, 192; + setp.eq.b32 %p21, %r36, 0; + and.pred %p19, %p21, %p20; + // begin inline asm + @%p19 st.global.b32 [ %rd93 + 0 ], { %r25 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..c3ef8728bb6d6a5f3ad5ebd320d14b217b80e939 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.source @@ -0,0 +1,325 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 64 : i32 loc(#loc86) + %xoffset_2 = arith.constant 64 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<64x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<64x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<64x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<64x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<64x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<64x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<64x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%xindex_6, %ks0) : (tensor<64x1xi32>, i64) -> tensor<64x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c4_i32 = arith.constant 4 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x4xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x4xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<64x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc103) + %tmp0_31 = tt.broadcast %tmp0_28 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc103) + %tmp0_32 = arith.addi %tmp0_30, %tmp0_31 : tensor<64x4xi64> loc(#loc103) + %tmp0_33 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_34 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_35 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc104) + %tmp0_36 = arith.muli %tmp0_35, %x0_10 : tensor<64x1xi64> loc(#loc104) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc105) + %tmp0_38 = arith.addi %tmp0_32, %tmp0_37 : tensor<64x4xi64> loc(#loc105) + %tmp0_39 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_40 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_41 = arith.muli %tmp0_40, %ks0 : i64 loc(#loc106) + %tmp0_42 = tt.splat %tmp0_41 : i64 -> tensor<64x1xi64> loc(#loc107) + %tmp0_43 = arith.muli %tmp0_42, %x2_18 : tensor<64x1xi64> loc(#loc107) + %tmp0_44 = tt.broadcast %tmp0_43 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc108) + %tmp0_45 = arith.addi %tmp0_38, %tmp0_44 : tensor<64x4xi64> loc(#loc108) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc109) + %tmp0_47 = tt.addptr %tmp0_46, %tmp0_45 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc109) + %tmp0_48 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc110) + %tmp0_49 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc110) + %tmp0_50 = arith.andi %tmp0_48, %tmp0_49 : tensor<64x4xi1> loc(#loc110) + %tmp0_51 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc111) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc111) + %tmp0_54 = tt.load %tmp0_47, %tmp0_50, %tmp0_53 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc111) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_56 = arith.constant 128 : i64 loc(#loc113) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc113) + %tmp1_58 = arith.muli %tmp1_57, %x0_10 : tensor<64x1xi64> loc(#loc113) + %tmp1_59 = arith.extsi %r0_index_24 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc114) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc114) + %tmp1_61 = tt.broadcast %tmp1_58 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc114) + %tmp1_62 = arith.addi %tmp1_60, %tmp1_61 : tensor<64x4xi64> loc(#loc114) + %tmp1_63 = arith.constant 128 : i32 loc(#loc115) + %tmp1_64 = arith.constant 128 : i64 loc(#loc115) + %tmp1_65 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc115) + %tmp1_66 = arith.muli %tmp1_65, %x5 : tensor<64x1xi64> loc(#loc115) + %tmp1_67 = arith.constant 1 : i32 loc(#loc116) + %tmp1_68 = arith.extsi %tmp1_67 : i32 to i64 loc(#loc116) + %tmp1_69 = arith.cmpi sge, %tmp1_68, %ks0 : i64 loc(#loc116) + %tmp1_70 = arith.constant 1 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc117) + %tmp1_72 = arith.extui %tmp1_69 : i1 to i32 loc(#loc117) + %tmp1_73 = arith.muli %tmp1_71, %tmp1_72 : i32 loc(#loc117) + %tmp1_74 = arith.constant 1 : i32 loc(#loc118) + %tmp1_75 = arith.extsi %tmp1_74 : i32 to i64 loc(#loc118) + %tmp1_76 = arith.cmpi sgt, %ks0, %tmp1_75 : i64 loc(#loc118) + %tmp1_77 = arith.extui %tmp1_76 : i1 to i64 loc(#loc119) + %tmp1_78 = arith.muli %ks0, %tmp1_77 : i64 loc(#loc119) + %tmp1_79 = arith.extsi %tmp1_73 : i32 to i64 loc(#loc120) + %tmp1_80 = arith.addi %tmp1_79, %tmp1_78 : i64 loc(#loc120) + %tmp1_81 = tt.splat %tmp1_80 : i64 -> tensor<64x1xi64> loc(#loc121) + %tmp1_82 = arith.muli %tmp1_66, %tmp1_81 : tensor<64x1xi64> loc(#loc121) + %tmp1_83 = tt.broadcast %tmp1_82 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc122) + %tmp1_84 = arith.addi %tmp1_62, %tmp1_83 : tensor<64x4xi64> loc(#loc122) + %tmp1_85 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc123) + %tmp1_86 = tt.addptr %tmp1_85, %tmp1_84 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc123) + %tmp1_87 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc124) + %tmp1_88 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc124) + %tmp1_89 = arith.andi %tmp1_87, %tmp1_88 : tensor<64x4xi1> loc(#loc124) + %tmp1_90 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_91 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc125) + %tmp1_92 = arith.truncf %tmp1_91 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc125) + %tmp1_93 = tt.load %tmp1_86, %tmp1_89, %tmp1_92 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc125) + %tmp1_94 = arith.extf %tmp1_93 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_55, %tmp1_94 : tensor<64x4xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<64x4xf32> loc(#loc128) + %_tmp4_95 = tt.broadcast %r0_mask_25 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc129) + %_tmp4_96 = tt.broadcast %xmask_7 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc129) + %_tmp4_97 = arith.andi %_tmp4_95, %_tmp4_96 : tensor<64x4xi1> loc(#loc129) + %_tmp4_98 = arith.select %_tmp4_97, %tmp5, %_tmp4_23 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc130) + scf.yield %_tmp4_98 : tensor<64x4xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<64x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<64x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S64_1S_i64__(%a: tensor<64x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<64x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<64x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<64x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<64x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<64x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<64x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<64x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<64x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<64x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<64x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<64x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<64x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc65) + tt.return %5 : tensor<64x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x1xi64> loc(#loc67) + tt.return %6 : tensor<64x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc68))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc69) + tt.return %0 : tensor<64xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc72) + tt.return %1 : tensor<64xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4015e86ae9e342258a3f79609002d914ec946cfa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,233 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("in_ptr1"(#loc)) +#loc60 = loc("out_ptr1"(#loc)) +#loc61 = loc("ks0"(#loc)) +#loc62 = loc("ks1"(#loc)) +#loc63 = loc("xnumel"(#loc)) +#loc64 = loc("r0_numel"(#loc)) +#loc109 = loc("tmp4"(#loc52)) +#loc120 = loc(callsite(#loc1 at #loc109)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<64x1xi64, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc65) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc66) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc67) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc67) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc67) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc67) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc68) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc68) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc69) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<64x1xi32, #blocked1> loc(#loc69) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc70) + %x0 = arith.extsi %xindex_14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc71) + %x0_20 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc71) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc71) + %x1 = arith.divsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc72) + %x1_22 = arith.remsi %x1, %cst : tensor<64x1xi64, #blocked> loc(#loc73) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc74) + %x2_23 = arith.divsi %x0, %x2 : tensor<64x1xi64, #blocked> loc(#loc74) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<64x1xi64, #blocked> loc(#loc111) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<64x1xi64, #blocked> loc(#loc112) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc113) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc114) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc115) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<64x1xi1, #blocked> loc(#loc116) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<64x1xi1, #blocked> loc(#loc116) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc117) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc79) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_31 = arith.muli %x0_21, %cst_1 : tensor<64x1xi64, #blocked> loc(#loc81) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc82) + %tmp0_33 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc83) + %tmp0_34 = tt.splat %tmp0_33 : i64 -> tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_35 = arith.muli %tmp0_34, %x2_23 : tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc85) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc86) + %tmp0_38 = tt.broadcast %xmask_17 : tensor<64x1xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc88) + %tmp1_39 = tt.broadcast %tmp1 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc89) + %tmp1_40 = arith.muli %x5_29, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc90) + %tmp1_41 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_42 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_43 = arith.extui %tmp1_42 : i1 to i64 loc(#loc93) + %tmp1_44 = arith.muli %ks0, %tmp1_43 : i64 loc(#loc93) + %tmp1_45 = arith.extui %tmp1_41 : i1 to i64 loc(#loc118) + %tmp1_46 = arith.addi %tmp1_45, %tmp1_44 : i64 loc(#loc94) + %tmp1_47 = tt.splat %tmp1_46 : i64 -> tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_48 = arith.muli %tmp1_40, %tmp1_47 : tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_49 = tt.broadcast %tmp1_48 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc97) + %tmp1_50 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc98) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_53 = %cst_7) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc100) + %r0_index_54 = arith.addi %r0_index, %r0_base_19 : tensor<1x4xi32, #blocked> loc(#loc100) + %r0_mask = arith.cmpi slt, %r0_index_54, %cst_6 : tensor<1x4xi32, #blocked> loc(#loc101) + %tmp0_55 = arith.extsi %r0_index_54 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc80) + %tmp0_56 = tt.broadcast %tmp0_55 : tensor<1x4xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_57 = arith.addi %tmp0_56, %tmp0_30 : tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_58 = arith.addi %tmp0_57, %tmp0_32 : tensor<64x4xi64, #blocked> loc(#loc82) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_36 : tensor<64x4xi64, #blocked> loc(#loc85) + %tmp0_60 = tt.addptr %tmp0_37, %tmp0_59 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> loc(#loc86) + %tmp0_61 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc87) + %tmp0_62 = arith.andi %tmp0_61, %tmp0_38 : tensor<64x4xi1, #blocked> loc(#loc87) + %tmp0_63 = tt.load %tmp0_60, %tmp0_62, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc102) + %tmp0_64 = arith.extf %tmp0_63 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc103) + %tmp1_65 = arith.addi %tmp0_56, %tmp1_39 : tensor<64x4xi64, #blocked> loc(#loc89) + %tmp1_66 = arith.addi %tmp1_65, %tmp1_49 : tensor<64x4xi64, #blocked> loc(#loc97) + %tmp1_67 = tt.addptr %tmp1_50, %tmp1_66 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> loc(#loc98) + %tmp1_68 = tt.load %tmp1_67, %tmp0_62, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc104) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc105) + %tmp2 = arith.mulf %tmp0_64, %tmp1_69 : tensor<64x4xf32, #blocked> loc(#loc106) + %tmp5 = arith.addf %_tmp4_53, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc107) + %_tmp4_70 = arith.select %tmp0_62, %tmp5, %_tmp4_53 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc108) + scf.yield %_tmp4_70 : tensor<64x4xf32, #blocked> loc(#loc50) + } loc(#loc99) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_53: f32 loc(callsite(#loc1 at #loc109)), %tmp4_54: f32 loc(callsite(#loc1 at #loc109))): + %tmp4_55 = arith.addf %tmp4_53, %tmp4_54 : f32 loc(#loc121) + tt.reduce.return %tmp4_55 : f32 loc(#loc119) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc119) + %tmp4_51 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc110) + %tmp4_52 = tt.expand_dims %tmp4_51 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc110) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc55) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc55) + tt.store %1, %tmp4_52, %xmask_18 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc65 = loc("xoffset"(#loc2)) +#loc66 = loc("xoffset"(#loc3)) +#loc67 = loc("xindex"(#loc4)) +#loc68 = loc("xindex"(#loc5)) +#loc69 = loc("xmask"(#loc6)) +#loc70 = loc("r0_base"(#loc7)) +#loc71 = loc("x0"(#loc8)) +#loc72 = loc("x1"(#loc9)) +#loc73 = loc("x1"(#loc10)) +#loc74 = loc("x2"(#loc11)) +#loc75 = loc("fixed"(#loc12)) +#loc76 = loc("x5"(#loc13)) +#loc77 = loc("fixed"(#loc14)) +#loc78 = loc("fixed"(#loc15)) +#loc79 = loc("tmp0"(#loc20)) +#loc80 = loc("tmp0"(#loc21)) +#loc81 = loc("tmp0"(#loc22)) +#loc82 = loc("tmp0"(#loc23)) +#loc83 = loc("tmp0"(#loc24)) +#loc84 = loc("tmp0"(#loc25)) +#loc85 = loc("tmp0"(#loc26)) +#loc86 = loc("tmp0"(#loc27)) +#loc87 = loc("tmp0"(#loc28)) +#loc88 = loc("tmp1"(#loc29)) +#loc89 = loc("tmp1"(#loc30)) +#loc90 = loc("tmp1"(#loc31)) +#loc91 = loc("tmp1"(#loc32)) +#loc92 = loc("tmp1"(#loc33)) +#loc93 = loc("tmp1"(#loc34)) +#loc94 = loc("tmp1"(#loc35)) +#loc95 = loc("tmp1"(#loc36)) +#loc96 = loc("tmp1"(#loc37)) +#loc97 = loc("tmp1"(#loc38)) +#loc98 = loc("tmp1"(#loc39)) +#loc99 = loc("_tmp4"(#loc40)) +#loc100 = loc("r0_index"(#loc41)) +#loc101 = loc("r0_mask"(#loc42)) +#loc102 = loc("tmp0"(#loc43)) +#loc103 = loc("tmp0"(#loc44)) +#loc104 = loc("tmp1"(#loc45)) +#loc105 = loc("tmp1"(#loc46)) +#loc106 = loc("tmp2"(#loc47)) +#loc107 = loc("tmp5"(#loc48)) +#loc108 = loc("_tmp4"(#loc49)) +#loc110 = loc("tmp4"(#loc54)) +#loc111 = loc(callsite(#loc75 at #loc76)) +#loc112 = loc(callsite(#loc77 at #loc76)) +#loc113 = loc(callsite(#loc78 at #loc76)) +#loc114 = loc(callsite(#loc16 at #loc76)) +#loc115 = loc(callsite(#loc17 at #loc76)) +#loc116 = loc(callsite(#loc18 at #loc76)) +#loc117 = loc(callsite(#loc19 at #loc76)) +#loc118 = loc(fused[#loc94, #loc95]) +#loc119 = loc(callsite(#loc51 at #loc109)) +#loc121 = loc(callsite(#loc53 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e6bdc9cc53fb29fb46efe4d7f0f83072a32f8b58 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,228 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc6 = loc(unknown) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc60 = loc("in_ptr0"(#loc)) +#loc61 = loc("in_ptr1"(#loc)) +#loc62 = loc("out_ptr1"(#loc)) +#loc63 = loc("ks0"(#loc)) +#loc64 = loc("ks1"(#loc)) +#loc65 = loc("xnumel"(#loc)) +#loc66 = loc("r0_numel"(#loc)) +#loc113 = loc("tmp4"(#loc54)) +#loc124 = loc(callsite(#loc6 at #loc113)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %fixed = arith.constant dense<1> : tensor<64x1xi64> loc(#loc115) + %x5 = arith.constant dense<0> : tensor<64x1xi32> loc(#loc116) + %fixed_0 = arith.constant dense<0> : tensor<64x1xi64> loc(#loc117) + %x5_1 = arith.constant 0 : i64 loc(#loc118) + %c1_i64 = arith.constant 1 : i64 loc(#loc6) + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc6) + %c4_i32 = arith.constant 4 : i32 loc(#loc7) + %c128_i32 = arith.constant 128 : i32 loc(#loc7) + %c0_i32 = arith.constant 0 : i32 loc(#loc7) + %cst_2 = arith.constant dense<4096> : tensor<64x1xi64> loc(#loc6) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc6) + %cst_3 = arith.constant dense<128> : tensor<64x1xi64> loc(#loc6) + %cst_4 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc6) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc6) + %x1 = arith.constant dense<32> : tensor<64x1xi64> loc(#loc70) + %c64_i32 = arith.constant 64 : i32 loc(#loc6) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc72) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc73) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc74) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc75) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc75) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32> loc(#loc76) + %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<64x1xi32> loc(#loc76) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc77) + %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc78) + %x0 = arith.extsi %xindex_9 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc79) + %x0_12 = tt.splat %ks0 : i64 -> tensor<64x1xi64> loc(#loc79) + %x0_13 = arith.remsi %x0, %x0_12 : tensor<64x1xi64> loc(#loc79) + %x1_14 = arith.divsi %x0, %x0_12 : tensor<64x1xi64> loc(#loc80) + %x1_15 = arith.remsi %x1_14, %x1 : tensor<64x1xi64> loc(#loc70) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64> loc(#loc81) + %x2_16 = arith.divsi %x0, %x2 : tensor<64x1xi64> loc(#loc81) + %fixed_17 = arith.cmpi ne, %x0_13, %fixed_0 : tensor<64x1xi64> loc(#loc117) + %fixed_18 = arith.subi %x1_14, %fixed : tensor<64x1xi64> loc(#loc115) + %fixed_19 = arith.select %fixed_17, %fixed_18, %x1_14 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc119) + %x5_20 = arith.cmpi slt, %xindex_9, %x5 : tensor<64x1xi32> loc(#loc116) + %x5_21 = arith.cmpi slt, %ks0, %x5_1 : i64 loc(#loc118) + %x5_22 = tt.splat %x5_21 : i1 -> tensor<64x1xi1> loc(#loc120) + %x5_23 = arith.cmpi ne, %x5_20, %x5_22 : tensor<64x1xi1> loc(#loc120) + %x5_24 = arith.select %x5_23, %fixed_19, %x1_14 : tensor<64x1xi1>, tensor<64x1xi64> loc(#loc121) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_26 = %cst_5) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc84) + %r0_index_27 = arith.addi %r0_index, %r0_base_11 : tensor<1x4xi32> loc(#loc84) + %r0_mask = arith.cmpi slt, %r0_index_27, %cst_4 : tensor<1x4xi32> loc(#loc85) + %tmp0 = arith.muli %x1_15, %cst_3 : tensor<64x1xi64> loc(#loc86) + %tmp0_28 = arith.extsi %r0_index_27 : tensor<1x4xi32> to tensor<1x4xi64> loc(#loc87) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<1x4xi64> -> tensor<64x4xi64> loc(#loc87) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc87) + %tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<64x4xi64> loc(#loc87) + %tmp0_32 = arith.muli %x0_13, %cst_2 : tensor<64x1xi64> loc(#loc88) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc89) + %tmp0_34 = arith.addi %tmp0_31, %tmp0_33 : tensor<64x4xi64> loc(#loc89) + %tmp0_35 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc90) + %tmp0_36 = tt.splat %tmp0_35 : i64 -> tensor<64x1xi64> loc(#loc91) + %tmp0_37 = arith.muli %tmp0_36, %x2_16 : tensor<64x1xi64> loc(#loc91) + %tmp0_38 = tt.broadcast %tmp0_37 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc92) + %tmp0_39 = arith.addi %tmp0_34, %tmp0_38 : tensor<64x4xi64> loc(#loc92) + %tmp0_40 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc93) + %tmp0_41 = tt.addptr %tmp0_40, %tmp0_39 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc93) + %tmp0_42 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc94) + %tmp0_43 = tt.broadcast %xmask_10 : tensor<64x1xi1> -> tensor<64x4xi1> loc(#loc94) + %tmp0_44 = arith.andi %tmp0_42, %tmp0_43 : tensor<64x4xi1> loc(#loc94) + %tmp0_45 = tt.load %tmp0_41, %tmp0_44, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc95) + %tmp0_46 = arith.extf %tmp0_45 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc96) + %tmp1 = arith.muli %x0_13, %cst_3 : tensor<64x1xi64> loc(#loc97) + %tmp1_47 = tt.broadcast %tmp1 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc98) + %tmp1_48 = arith.addi %tmp0_29, %tmp1_47 : tensor<64x4xi64> loc(#loc98) + %tmp1_49 = arith.muli %x5_24, %cst_3 : tensor<64x1xi64> loc(#loc99) + %tmp1_50 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc100) + %tmp1_51 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc101) + %tmp1_52 = arith.extui %tmp1_51 : i1 to i64 loc(#loc102) + %tmp1_53 = arith.muli %ks0, %tmp1_52 : i64 loc(#loc102) + %tmp1_54 = arith.extui %tmp1_50 : i1 to i64 loc(#loc122) + %tmp1_55 = arith.addi %tmp1_54, %tmp1_53 : i64 loc(#loc103) + %tmp1_56 = tt.splat %tmp1_55 : i64 -> tensor<64x1xi64> loc(#loc105) + %tmp1_57 = arith.muli %tmp1_49, %tmp1_56 : tensor<64x1xi64> loc(#loc105) + %tmp1_58 = tt.broadcast %tmp1_57 : tensor<64x1xi64> -> tensor<64x4xi64> loc(#loc106) + %tmp1_59 = arith.addi %tmp1_48, %tmp1_58 : tensor<64x4xi64> loc(#loc106) + %tmp1_60 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc107) + %tmp1_61 = tt.addptr %tmp1_60, %tmp1_59 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> loc(#loc107) + %tmp1_62 = tt.load %tmp1_61, %tmp0_44, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc108) + %tmp1_63 = arith.extf %tmp1_62 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc109) + %tmp2 = arith.mulf %tmp0_46, %tmp1_63 : tensor<64x4xf32> loc(#loc110) + %tmp5 = arith.addf %_tmp4_26, %tmp2 : tensor<64x4xf32> loc(#loc111) + %_tmp4_64 = arith.select %tmp0_44, %tmp5, %_tmp4_26 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc112) + scf.yield %_tmp4_64 : tensor<64x4xf32> loc(#loc52) + } loc(#loc83) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_26: f32 loc(callsite(#loc6 at #loc113)), %tmp4_27: f32 loc(callsite(#loc6 at #loc113))): + %tmp4_28 = arith.addf %tmp4_26, %tmp4_27 : f32 loc(#loc125) + tt.reduce.return %tmp4_28 : f32 loc(#loc123) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc123) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc114) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc57) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc57) + tt.store %1, %tmp4_25, %xmask_10 : tensor<64x1x!tt.ptr> loc(#loc58) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc4 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc67 = loc("fixed"(#loc1)) +#loc68 = loc("x5"(#loc2)) +#loc69 = loc("fixed"(#loc4)) +#loc70 = loc("x1"(#loc8)) +#loc71 = loc("xoffset"(#loc9)) +#loc72 = loc("xoffset"(#loc10)) +#loc73 = loc("xindex"(#loc11)) +#loc74 = loc("xindex"(#loc12)) +#loc75 = loc("xindex"(#loc13)) +#loc76 = loc("xmask"(#loc14)) +#loc77 = loc("r0_base"(#loc15)) +#loc78 = loc("r0_base"(#loc16)) +#loc79 = loc("x0"(#loc17)) +#loc80 = loc("x1"(#loc18)) +#loc81 = loc("x2"(#loc19)) +#loc82 = loc("fixed"(#loc20)) +#loc83 = loc("_tmp4"(#loc7)) +#loc84 = loc("r0_index"(#loc23)) +#loc85 = loc("r0_mask"(#loc24)) +#loc86 = loc("tmp0"(#loc25)) +#loc87 = loc("tmp0"(#loc26)) +#loc88 = loc("tmp0"(#loc27)) +#loc89 = loc("tmp0"(#loc28)) +#loc90 = loc("tmp0"(#loc29)) +#loc91 = loc("tmp0"(#loc30)) +#loc92 = loc("tmp0"(#loc31)) +#loc93 = loc("tmp0"(#loc32)) +#loc94 = loc("tmp0"(#loc33)) +#loc95 = loc("tmp0"(#loc34)) +#loc96 = loc("tmp0"(#loc35)) +#loc97 = loc("tmp1"(#loc36)) +#loc98 = loc("tmp1"(#loc37)) +#loc99 = loc("tmp1"(#loc38)) +#loc100 = loc("tmp1"(#loc39)) +#loc101 = loc("tmp1"(#loc40)) +#loc102 = loc("tmp1"(#loc41)) +#loc103 = loc("tmp1"(#loc42)) +#loc104 = loc("tmp1"(#loc43)) +#loc105 = loc("tmp1"(#loc44)) +#loc106 = loc("tmp1"(#loc45)) +#loc107 = loc("tmp1"(#loc46)) +#loc108 = loc("tmp1"(#loc47)) +#loc109 = loc("tmp1"(#loc48)) +#loc110 = loc("tmp2"(#loc49)) +#loc111 = loc("tmp5"(#loc50)) +#loc112 = loc("_tmp4"(#loc51)) +#loc114 = loc("tmp4"(#loc56)) +#loc115 = loc(callsite(#loc67 at #loc68)) +#loc116 = loc(callsite(#loc3 at #loc68)) +#loc117 = loc(callsite(#loc69 at #loc68)) +#loc118 = loc(callsite(#loc5 at #loc68)) +#loc119 = loc(callsite(#loc82 at #loc68)) +#loc120 = loc(callsite(#loc21 at #loc68)) +#loc121 = loc(callsite(#loc22 at #loc68)) +#loc122 = loc(fused[#loc103, #loc104]) +#loc123 = loc(callsite(#loc53 at #loc113)) +#loc125 = loc(callsite(#loc55 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d955b288593dc15816e9d77b028e31fb4bf33aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1374575d7c31ae3f875800bf8e505a10458f7e2f Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7d6018b6b5b63b3152f27dea962c5c6cad7728bb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "2b1ea6d8b524e2448139788c9e5148033f200e7c9d7f9ebbf8119f15ed4e1d49", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..14b3474138953aff91de45c222dd69361ed192e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,176 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = icmp slt i32 %10, %5, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 31, !dbg !9 + %14 = lshr i32 %12, 5, !dbg !9 + %15 = shl nuw nsw i32 %12, 1, !dbg !9 + %16 = and i32 %15, 126, !dbg !9 + %17 = zext nneg i32 %10 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !11 + %18 = sdiv i64 %17, %.frozen, !dbg !11 + %19 = mul i64 %18, %.frozen, !dbg !10 + %.decomposed = sub i64 %17, %19, !dbg !10 + %.lhs.trunc = trunc nsw i64 %18 to i32, !dbg !12 + %20 = srem i32 %.lhs.trunc, 32, !dbg !12 + %.sext = sext i32 %20 to i64, !dbg !12 + %21 = sdiv i64 %17, %4, !dbg !13 + %.not = icmp ne i64 %.decomposed, 0, !dbg !14 + %22 = icmp slt i64 %3, 0, !dbg !18 + %narrow = select i1 %22, i1 %.not, i1 false, !dbg !19 + %23 = sext i1 %narrow to i64, !dbg !19 + %24 = add nsw i64 %18, %23, !dbg !19 + %25 = zext nneg i32 %16 to i64, !dbg !20 + %26 = shl i64 %3, 12, !dbg !21 + %27 = mul i64 %26, %21, !dbg !22 + %.idx = shl nsw i64 %.sext, 8, !dbg !23 + %28 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx, !dbg !23 + %29 = getelementptr bfloat, ptr addrspace(1) %28, i64 %25, !dbg !23 + %.idx1 = shl nuw nsw i64 %.decomposed, 13, !dbg !23 + %30 = getelementptr i8, ptr addrspace(1) %29, i64 %.idx1, !dbg !23 + %31 = getelementptr bfloat, ptr addrspace(1) %30, i64 %27, !dbg !23 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !24 + %33 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %31, i64 %32, i1 %11) #4, !dbg !24 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !24 + %35 = icmp slt i64 %3, 2, !dbg !25 + %36 = icmp sgt i64 %3, 1, !dbg !26 + %37 = select i1 %36, i64 %3, i64 0, !dbg !27 + %38 = zext i1 %35 to i64, !dbg !28 + %39 = add i64 %37, %38, !dbg !29 + %40 = shl i64 %39, 7, !dbg !30 + %41 = mul i64 %40, %24, !dbg !31 + %.idx2 = shl nuw nsw i64 %.decomposed, 8, !dbg !32 + %42 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2, !dbg !32 + %43 = getelementptr bfloat, ptr addrspace(1) %42, i64 %25, !dbg !32 + %44 = getelementptr bfloat, ptr addrspace(1) %43, i64 %41, !dbg !32 + %45 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %46 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %44, i64 %45, i1 %11) #4, !dbg !33 + %47 = bitcast i32 %46 to <2 x bfloat>, !dbg !33 + %48 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !34 + %49 = fpext <2 x bfloat> %47 to <2 x float>, !dbg !35 + %50 = fmul <2 x float> %48, %49, !dbg !36 + %51 = fadd <2 x float> %50, zeroinitializer, !dbg !37 + %shift = shufflevector <2 x float> %51, <2 x float> poison, <2 x i32> , !dbg !38 + %foldExtExtBinop = fadd <2 x float> %51, %shift, !dbg !38 + %52 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !38 + %53 = select i1 %11, float %52, float 0.000000e+00, !dbg !38 + %54 = bitcast float %53 to i32, !dbg !42 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 16, i32 31), !dbg !42 + %56 = bitcast i32 %55 to float, !dbg !42 + %57 = fadd float %53, %56, !dbg !38 + %58 = bitcast float %57 to i32, !dbg !42 + %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 8, i32 31), !dbg !42 + %60 = bitcast i32 %59 to float, !dbg !42 + %61 = fadd float %57, %60, !dbg !38 + %62 = bitcast float %61 to i32, !dbg !42 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 4, i32 31), !dbg !42 + %64 = bitcast i32 %63 to float, !dbg !42 + %65 = fadd float %61, %64, !dbg !38 + %66 = bitcast float %65 to i32, !dbg !42 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 2, i32 31), !dbg !42 + %68 = bitcast i32 %67 to float, !dbg !42 + %69 = fadd float %65, %68, !dbg !38 + %70 = bitcast float %69 to i32, !dbg !42 + %71 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %70, i32 1, i32 31), !dbg !42 + %72 = bitcast i32 %71 to float, !dbg !42 + %73 = fadd float %69, %72, !dbg !38 + %74 = and i32 %14, 1, !dbg !42 + %75 = icmp eq i32 %13, 0, !dbg !42 + %76 = getelementptr float, ptr addrspace(3) @global_smem, i32 %74, !dbg !42 + %77 = bitcast float %73 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %77, i1 %75) #4, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %78 = icmp samesign ult i32 %12, 2, !dbg !42 + %79 = getelementptr float, ptr addrspace(3) @global_smem, i32 %12, !dbg !42 + %80 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %79, i1 %78) #4, !dbg !42 + %81 = bitcast i32 %80 to float, !dbg !42 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 1, i32 31), !dbg !42 + %83 = bitcast i32 %82 to float, !dbg !42 + %84 = fadd float %81, %83, !dbg !38 + %85 = icmp eq i32 %12, 0, !dbg !42 + %86 = bitcast float %84 to <1 x i32>, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %86, i1 %85) #4, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %87 = load i32, ptr addrspace(3) @global_smem, align 16, !dbg !42 + %88 = getelementptr float, ptr addrspace(1) %2, i64 %17, !dbg !43 + %89 = and i32 %12, 63, !dbg !44 + %90 = icmp eq i32 %89, 0, !dbg !44 + %91 = and i1 %90, %11, !dbg !44 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %87, ptr addrspace(1) %88, i1 %91) #4, !dbg !44 + ret void, !dbg !45 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 21, scope: !4) +!12 = !DILocation(line: 28, column: 28, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 74, column: 34, scope: !15, inlinedAt: !17) +!15 = distinct !DILexicalBlockFile(scope: !4, file: !16, discriminator: 0) +!16 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!17 = !DILocation(line: 30, column: 51, scope: !4) +!18 = !DILocation(line: 75, column: 36, scope: !15, inlinedAt: !17) +!19 = !DILocation(line: 75, column: 47, scope: !15, inlinedAt: !17) +!20 = !DILocation(line: 39, column: 41, scope: !4) +!21 = !DILocation(line: 39, column: 65, scope: !4) +!22 = !DILocation(line: 39, column: 69, scope: !4) +!23 = !DILocation(line: 39, column: 34, scope: !4) +!24 = !DILocation(line: 39, column: 74, scope: !4) +!25 = !DILocation(line: 40, column: 73, scope: !4) +!26 = !DILocation(line: 40, column: 99, scope: !4) +!27 = !DILocation(line: 40, column: 90, scope: !4) +!28 = !DILocation(line: 40, scope: !4) +!29 = !DILocation(line: 40, column: 81, scope: !4) +!30 = !DILocation(line: 40, column: 54, scope: !4) +!31 = !DILocation(line: 40, column: 58, scope: !4) +!32 = !DILocation(line: 40, column: 34, scope: !4) +!33 = !DILocation(line: 40, column: 106, scope: !4) +!34 = !DILocation(line: 39, column: 136, scope: !4) +!35 = !DILocation(line: 40, column: 168, scope: !4) +!36 = !DILocation(line: 41, column: 22, scope: !4) +!37 = !DILocation(line: 43, column: 23, scope: !4) +!38 = !DILocation(line: 261, column: 15, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !4, file: !40, discriminator: 0) +!40 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!41 = !DILocation(line: 45, column: 25, scope: !4) +!42 = !DILocation(line: 291, column: 36, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 49, column: 25, scope: !4) +!44 = !DILocation(line: 49, column: 36, scope: !4) +!45 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..566bb7533a77c67bcd40ec7ebc17eaa8dddf89a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,467 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 64 +{ + .reg .pred %p<15>; + .reg .b16 %rs<5>; + .reg .b32 %r<57>; + .reg .b64 %rd<48>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r6, %ctaid.x; + .loc 1 25 37 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:25:37 + mov.u32 %r1, %tid.x; + shl.b32 %r7, %r1, 1; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.u64.u32 %rd1, %r6; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + and.b64 %rd17, %rd14, -4294967296; + setp.ne.b64 %p1, %rd17, 0; + cvt.u32.u64 %r56, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd46, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r8, %rd14; + div.u32 %r10, %r56, %r8; + cvt.u64.u32 %rd46, %r10; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r5, [triton_red_fused_zeros_0_param_5]; + ld.param.b64 %rd13, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd11, [triton_red_fused_zeros_0_param_0]; + and.b32 %r2, %r1, 31; + shr.u32 %r3, %r1, 5; + and.b32 %r4, %r7, 126; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd18, %rd46, %rd14; + sub.s64 %rd6, %rd1, %rd18; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + cvt.u32.u64 %r11, %rd46; + shr.s32 %r12, %r11, 31; + shr.u32 %r13, %r12, 27; + add.s32 %r14, %r11, %r13; + and.b32 %r15, %r14, -32; + sub.s32 %r16, %r11, %r15; + cvt.s64.s32 %rd7, %r16; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + and.b64 %rd19, %rd15, -4294967296; + setp.ne.b64 %p2, %rd19, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd47, %rd1, %rd15; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r17, %rd15; + div.u32 %r19, %r56, %r17; + cvt.u64.u32 %rd47, %r19; +$L__BB0_6: + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p3, %r56, %r5; +$L__tmp1: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p9, %rd6, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p10, %rd14, 0; + .loc 2 75 47 // triton_helpers.py:75:47 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + and.pred %p11, %p10, %p9; + selp.b64 %rd27, -1, 0, %p11; + add.s64 %rd28, %rd46, %rd27; +$L__tmp2: + .loc 1 39 69 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:69 + mul.lo.s64 %rd29, %rd14, %rd47; + .loc 1 39 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:34 + shl.b64 %rd30, %rd7, 8; + add.s64 %rd31, %rd11, %rd30; + mul.wide.u32 %rd32, %r4, 2; + add.s64 %rd33, %rd31, %rd32; + shl.b64 %rd34, %rd6, 13; + add.s64 %rd35, %rd33, %rd34; + shl.b64 %rd36, %rd29, 13; + add.s64 %rd21, %rd35, %rd36; + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.b32 %r21, 0; + // begin inline asm + mov.u32 %r20, %r21; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r20 }, [ %rd21 + 0 ], %rd22; + // end inline asm + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p12, %rd14, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p13, %rd14, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd37, %rd14, 0, %p13; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd38, 1, 0, %p12; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd39, %rd37, %rd38; + .loc 1 40 58 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:58 + mul.lo.s64 %rd40, %rd39, %rd28; + .loc 1 40 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:34 + shl.b64 %rd41, %rd6, 8; + add.s64 %rd42, %rd12, %rd41; + add.s64 %rd43, %rd42, %rd32; + shl.b64 %rd44, %rd40, 8; + add.s64 %rd24, %rd43, %rd44; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r21; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r22 }, [ %rd24 + 0 ], %rd25; + // end inline asm + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs1, %rs2}, %r20; + cvt.f32.bf16 %r32, %rs1; + cvt.f32.bf16 %r33, %rs2; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs3, %rs4}, %r22; + cvt.f32.bf16 %r34, %rs3; + cvt.f32.bf16 %r35, %rs4; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r36, %r33, %r35, 0f00000000; + fma.rn.f32 %r37, %r32, %r34, 0f00000000; +$L__tmp3: + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r38, %r37, %r36; + selp.f32 %r39, %r38, 0f00000000, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r40, %r39, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r41, %r39, %r40; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r41, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r43, %r41, %r42; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r45, %r43, %r44; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r47, %r45, %r46; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r25, %r47, %r48; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + and.b32 %r49, %r3, 1; + setp.eq.b32 %p5, %r2, 0; + shl.b32 %r50, %r49, 2; + mov.b32 %r51, global_smem; + add.s32 %r24, %r51, %r50; + // begin inline asm + @%p5 st.shared.b32 [ %r24 + 0 ], %r25; + // end inline asm + bar.sync 0; + setp.lt.u32 %p6, %r1, 2; + shl.b32 %r52, %r1, 2; + add.s32 %r27, %r51, %r52; + // begin inline asm + @%p6 ld.shared.b32 %r26, [ %r27 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r53, %r26, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r29, %r26, %r53; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + setp.eq.b32 %p7, %r1, 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r29; + // end inline asm + bar.sync 0; + ld.shared.b32 %r30, [global_smem]; +$L__tmp4: + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + shl.b64 %rd45, %rd1, 2; + add.s64 %rd26, %rd13, %rd45; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r54, %r1, 63; + setp.eq.b32 %p14, %r54, 0; + and.pred %p8, %p14, %p3; + // begin inline asm + @%p8 st.global.b32 [ %rd26 + 0 ], { %r30 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..d2910199dd374d7169a38f640358013257149612 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.source @@ -0,0 +1,320 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 1 : i32 loc(#loc86) + %xoffset_2 = arith.constant 1 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<1x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<1x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S1_1S_i64__(%xindex_6, %ks0) : (tensor<1x1xi32>, i64) -> tensor<1x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<1x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x128xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x128xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<1x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_28 : tensor<1x1xi64> -> tensor<1x128xi64> loc(#loc103) + %tmp0_31 = arith.addi %tmp0_29, %tmp0_30 : tensor<1x128xi64> loc(#loc103) + %tmp0_32 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_33 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_34 = arith.constant dense<4096> : tensor<1x1xi64> loc(#loc104) + %tmp0_35 = arith.muli %tmp0_34, %x0_10 : tensor<1x1xi64> loc(#loc104) + %tmp0_36 = tt.broadcast %tmp0_35 : tensor<1x1xi64> -> tensor<1x128xi64> loc(#loc105) + %tmp0_37 = arith.addi %tmp0_31, %tmp0_36 : tensor<1x128xi64> loc(#loc105) + %tmp0_38 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_39 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_40 = arith.muli %tmp0_39, %ks0 : i64 loc(#loc106) + %tmp0_41 = tt.splat %tmp0_40 : i64 -> tensor<1x1xi64> loc(#loc107) + %tmp0_42 = arith.muli %tmp0_41, %x2_18 : tensor<1x1xi64> loc(#loc107) + %tmp0_43 = tt.broadcast %tmp0_42 : tensor<1x1xi64> -> tensor<1x128xi64> loc(#loc108) + %tmp0_44 = arith.addi %tmp0_37, %tmp0_43 : tensor<1x128xi64> loc(#loc108) + %tmp0_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc109) + %tmp0_46 = tt.addptr %tmp0_45, %tmp0_44 : tensor<1x128x!tt.ptr>, tensor<1x128xi64> loc(#loc109) + %tmp0_47 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x128xi1> loc(#loc110) + %tmp0_48 = arith.andi %r0_mask_25, %tmp0_47 : tensor<1x128xi1> loc(#loc110) + %tmp0_49 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_50 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc111) + %tmp0_51 = arith.truncf %tmp0_50 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc111) + %tmp0_52 = tt.load %tmp0_46, %tmp0_48, %tmp0_51 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc111) + %tmp0_53 = arith.extf %tmp0_52 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_54 = arith.constant 128 : i64 loc(#loc113) + %tmp1_55 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc113) + %tmp1_56 = arith.muli %tmp1_55, %x0_10 : tensor<1x1xi64> loc(#loc113) + %tmp1_57 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc114) + %tmp1_58 = tt.broadcast %tmp1_56 : tensor<1x1xi64> -> tensor<1x128xi64> loc(#loc114) + %tmp1_59 = arith.addi %tmp1_57, %tmp1_58 : tensor<1x128xi64> loc(#loc114) + %tmp1_60 = arith.constant 128 : i32 loc(#loc115) + %tmp1_61 = arith.constant 128 : i64 loc(#loc115) + %tmp1_62 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc115) + %tmp1_63 = arith.muli %tmp1_62, %x5 : tensor<1x1xi64> loc(#loc115) + %tmp1_64 = arith.constant 1 : i32 loc(#loc116) + %tmp1_65 = arith.extsi %tmp1_64 : i32 to i64 loc(#loc116) + %tmp1_66 = arith.cmpi sge, %tmp1_65, %ks0 : i64 loc(#loc116) + %tmp1_67 = arith.constant 1 : i32 loc(#loc117) + %tmp1_68 = arith.constant 1 : i32 loc(#loc117) + %tmp1_69 = arith.extui %tmp1_66 : i1 to i32 loc(#loc117) + %tmp1_70 = arith.muli %tmp1_68, %tmp1_69 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc118) + %tmp1_72 = arith.extsi %tmp1_71 : i32 to i64 loc(#loc118) + %tmp1_73 = arith.cmpi sgt, %ks0, %tmp1_72 : i64 loc(#loc118) + %tmp1_74 = arith.extui %tmp1_73 : i1 to i64 loc(#loc119) + %tmp1_75 = arith.muli %ks0, %tmp1_74 : i64 loc(#loc119) + %tmp1_76 = arith.extsi %tmp1_70 : i32 to i64 loc(#loc120) + %tmp1_77 = arith.addi %tmp1_76, %tmp1_75 : i64 loc(#loc120) + %tmp1_78 = tt.splat %tmp1_77 : i64 -> tensor<1x1xi64> loc(#loc121) + %tmp1_79 = arith.muli %tmp1_63, %tmp1_78 : tensor<1x1xi64> loc(#loc121) + %tmp1_80 = tt.broadcast %tmp1_79 : tensor<1x1xi64> -> tensor<1x128xi64> loc(#loc122) + %tmp1_81 = arith.addi %tmp1_59, %tmp1_80 : tensor<1x128xi64> loc(#loc122) + %tmp1_82 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc123) + %tmp1_83 = tt.addptr %tmp1_82, %tmp1_81 : tensor<1x128x!tt.ptr>, tensor<1x128xi64> loc(#loc123) + %tmp1_84 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x128xi1> loc(#loc124) + %tmp1_85 = arith.andi %r0_mask_25, %tmp1_84 : tensor<1x128xi1> loc(#loc124) + %tmp1_86 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_87 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc125) + %tmp1_88 = arith.truncf %tmp1_87 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc125) + %tmp1_89 = tt.load %tmp1_83, %tmp1_85, %tmp1_88 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc125) + %tmp1_90 = arith.extf %tmp1_89 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_53, %tmp1_90 : tensor<1x128xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<1x128xf32> loc(#loc128) + %_tmp4_91 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x128xi1> loc(#loc129) + %_tmp4_92 = arith.andi %r0_mask_25, %_tmp4_91 : tensor<1x128xi1> loc(#loc129) + %_tmp4_93 = arith.select %_tmp4_92, %tmp5, %_tmp4_23 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc130) + scf.yield %_tmp4_93 : tensor<1x128xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<1x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<1x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S1_1S_i64__(%a: tensor<1x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<1x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<1x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<1x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<1x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<1x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<1x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<1x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<1x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<1x1xi1>, tensor<1x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<1x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<1x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<1x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<1x1xi1>, tensor<1x1xi64> loc(#loc65) + tt.return %5 : tensor<1x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<1x1xi64> loc(#loc67) + tt.return %6 : tensor<1x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x128xf32> loc("input"(#loc68))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc69) + tt.return %0 : tensor<1xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc72) + tt.return %1 : tensor<1xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8de619e522b8703e317dcf4622b6db0936e30135 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc52 = loc("in_ptr0"(#loc)) +#loc53 = loc("in_ptr1"(#loc)) +#loc54 = loc("out_ptr1"(#loc)) +#loc55 = loc("ks0"(#loc)) +#loc56 = loc("ks1"(#loc)) +#loc57 = loc("xnumel"(#loc)) +#loc58 = loc("r0_numel"(#loc)) +#loc98 = loc("tmp4"(#loc46)) +#loc115 = loc(callsite(#loc1 at #loc98)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c32_i64 = arith.constant 32 : i64 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc59) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc60) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc61) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc61) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc62) + %x0_3 = arith.remsi %x0, %ks0 : i64 loc(#loc62) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc63) + %x1_4 = arith.remsi %x1, %c32_i64 : i64 loc(#loc64) + %x2 = arith.divsi %x0, %ks1 : i64 loc(#loc65) + %fixed = arith.cmpi ne, %x0_3, %c0_i64 : i64 loc(#loc100) + %fixed_5 = arith.subi %x1, %c1_i64 : i64 loc(#loc101) + %fixed_6 = arith.select %fixed, %fixed_5, %x1 : i64 loc(#loc102) + %x5 = arith.cmpi slt, %xoffset, %c0_i32 : i32 loc(#loc103) + %x5_7 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc104) + %x5_8 = arith.cmpi ne, %x5, %x5_7 : i1 loc(#loc105) + %x5_9 = arith.select %x5_8, %fixed_6, %x1 : i64 loc(#loc106) + %r0_mask = arith.cmpi slt, %r0_base_2, %cst : tensor<1x128xi32, #blocked> loc(#loc70) + %tmp0 = arith.muli %x1_4, %c128_i64 : i64 loc(#loc71) + %tmp0_10 = arith.extsi %r0_base_2 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc72) + %tmp0_11 = tt.splat %tmp0 : i64 -> tensor<1x128xi64, #blocked> loc(#loc107) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x128xi64, #blocked> loc(#loc72) + %tmp0_13 = arith.muli %x0_3, %c4096_i64 : i64 loc(#loc73) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<1x128xi64, #blocked> loc(#loc108) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<1x128xi64, #blocked> loc(#loc74) + %tmp0_16 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc75) + %tmp0_17 = arith.muli %tmp0_16, %x2 : i64 loc(#loc76) + %tmp0_18 = tt.splat %tmp0_17 : i64 -> tensor<1x128xi64, #blocked> loc(#loc109) + %tmp0_19 = arith.addi %tmp0_15, %tmp0_18 : tensor<1x128xi64, #blocked> loc(#loc77) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc78) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi64, #blocked> loc(#loc78) + %tmp0_22 = tt.splat %xmask : i1 -> tensor<1x128xi1, #blocked> loc(#loc110) + %tmp0_23 = arith.andi %r0_mask, %tmp0_22 : tensor<1x128xi1, #blocked> loc(#loc79) + %tmp0_24 = tt.load %tmp0_21, %tmp0_23, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc80) + %tmp0_25 = arith.extf %tmp0_24 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc81) + %tmp1 = arith.muli %x0_3, %c128_i64 : i64 loc(#loc82) + %tmp1_26 = tt.splat %tmp1 : i64 -> tensor<1x128xi64, #blocked> loc(#loc111) + %tmp1_27 = arith.addi %tmp0_10, %tmp1_26 : tensor<1x128xi64, #blocked> loc(#loc83) + %tmp1_28 = arith.muli %x5_9, %c128_i64 : i64 loc(#loc84) + %tmp1_29 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc85) + %tmp1_30 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc86) + %tmp1_31 = arith.extui %tmp1_30 : i1 to i64 loc(#loc87) + %tmp1_32 = arith.muli %ks0, %tmp1_31 : i64 loc(#loc87) + %tmp1_33 = arith.extui %tmp1_29 : i1 to i64 loc(#loc112) + %tmp1_34 = arith.addi %tmp1_33, %tmp1_32 : i64 loc(#loc88) + %tmp1_35 = arith.muli %tmp1_28, %tmp1_34 : i64 loc(#loc90) + %tmp1_36 = tt.splat %tmp1_35 : i64 -> tensor<1x128xi64, #blocked> loc(#loc113) + %tmp1_37 = arith.addi %tmp1_27, %tmp1_36 : tensor<1x128xi64, #blocked> loc(#loc91) + %tmp1_38 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc92) + %tmp1_39 = tt.addptr %tmp1_38, %tmp1_37 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi64, #blocked> loc(#loc92) + %tmp1_40 = tt.load %tmp1_39, %tmp0_23, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc93) + %tmp1_41 = arith.extf %tmp1_40 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc94) + %tmp2 = arith.mulf %tmp0_25, %tmp1_41 : tensor<1x128xf32, #blocked> loc(#loc95) + %tmp5 = arith.addf %tmp2, %cst_1 : tensor<1x128xf32, #blocked> loc(#loc96) + %_tmp4 = arith.select %tmp0_23, %tmp5, %cst_1 : tensor<1x128xi1, #blocked>, tensor<1x128xf32, #blocked> loc(#loc97) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_44: f32 loc(callsite(#loc1 at #loc98)), %tmp4_45: f32 loc(callsite(#loc1 at #loc98))): + %tmp4_46 = arith.addf %tmp4_44, %tmp4_45 : f32 loc(#loc116) + tt.reduce.return %tmp4_46 : f32 loc(#loc114) + }) : (tensor<1x128xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc114) + %tmp4_42 = ttg.convert_layout %tmp4 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc99) + %tmp4_43 = tt.expand_dims %tmp4_42 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc99) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc49) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc50) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc50) + tt.store %1, %tmp4_43, %2 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc50) + tt.return loc(#loc51) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc9 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc11 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc59 = loc("xoffset"(#loc2)) +#loc60 = loc("xmask"(#loc3)) +#loc61 = loc("r0_base"(#loc4)) +#loc62 = loc("x0"(#loc5)) +#loc63 = loc("x1"(#loc6)) +#loc64 = loc("x1"(#loc7)) +#loc65 = loc("x2"(#loc8)) +#loc66 = loc("fixed"(#loc9)) +#loc67 = loc("x5"(#loc10)) +#loc68 = loc("fixed"(#loc11)) +#loc69 = loc("fixed"(#loc12)) +#loc70 = loc("r0_mask"(#loc17)) +#loc71 = loc("tmp0"(#loc18)) +#loc72 = loc("tmp0"(#loc19)) +#loc73 = loc("tmp0"(#loc20)) +#loc74 = loc("tmp0"(#loc21)) +#loc75 = loc("tmp0"(#loc22)) +#loc76 = loc("tmp0"(#loc23)) +#loc77 = loc("tmp0"(#loc24)) +#loc78 = loc("tmp0"(#loc25)) +#loc79 = loc("tmp0"(#loc26)) +#loc80 = loc("tmp0"(#loc27)) +#loc81 = loc("tmp0"(#loc28)) +#loc82 = loc("tmp1"(#loc29)) +#loc83 = loc("tmp1"(#loc30)) +#loc84 = loc("tmp1"(#loc31)) +#loc85 = loc("tmp1"(#loc32)) +#loc86 = loc("tmp1"(#loc33)) +#loc87 = loc("tmp1"(#loc34)) +#loc88 = loc("tmp1"(#loc35)) +#loc89 = loc("tmp1"(#loc36)) +#loc90 = loc("tmp1"(#loc37)) +#loc91 = loc("tmp1"(#loc38)) +#loc92 = loc("tmp1"(#loc39)) +#loc93 = loc("tmp1"(#loc40)) +#loc94 = loc("tmp1"(#loc41)) +#loc95 = loc("tmp2"(#loc42)) +#loc96 = loc("tmp5"(#loc43)) +#loc97 = loc("_tmp4"(#loc44)) +#loc99 = loc("tmp4"(#loc48)) +#loc100 = loc(callsite(#loc66 at #loc67)) +#loc101 = loc(callsite(#loc68 at #loc67)) +#loc102 = loc(callsite(#loc69 at #loc67)) +#loc103 = loc(callsite(#loc13 at #loc67)) +#loc104 = loc(callsite(#loc14 at #loc67)) +#loc105 = loc(callsite(#loc15 at #loc67)) +#loc106 = loc(callsite(#loc16 at #loc67)) +#loc107 = loc(fused[#loc72, #loc71]) +#loc108 = loc(fused[#loc74, #loc73]) +#loc109 = loc(fused[#loc77, #loc76]) +#loc110 = loc(fused[#loc79, #loc60]) +#loc111 = loc(fused[#loc83, #loc82]) +#loc112 = loc(fused[#loc88, #loc89]) +#loc113 = loc(fused[#loc91, #loc90]) +#loc114 = loc(callsite(#loc45 at #loc98)) +#loc116 = loc(callsite(#loc47 at #loc114)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0b66e6a8a2b52bc0a5c7f2fe12bc073fb20a932d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FMPKNWFVETREJAJZPCGJ4UKIAM7SADT4TV7Z5O7YCGPRL3KODVEQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,197 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("ks0"(#loc)) +#loc57 = loc("ks1"(#loc)) +#loc58 = loc("xnumel"(#loc)) +#loc59 = loc("r0_numel"(#loc)) +#loc100 = loc("tmp4"(#loc47)) +#loc117 = loc(callsite(#loc1 at #loc100)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %x5 = arith.constant 0 : i32 loc(#loc102) + %x1 = arith.constant 32 : i64 loc(#loc61) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc63) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc63) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc64) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc65) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc66) + %x0_4 = arith.remsi %x0, %ks0 : i64 loc(#loc66) + %x1_5 = arith.divsi %x0, %ks0 : i64 loc(#loc67) + %x1_6 = arith.remsi %x1_5, %x1 : i64 loc(#loc61) + %x2 = arith.divsi %x0, %ks1 : i64 loc(#loc68) + %fixed = arith.cmpi ne, %x0_4, %c0_i64 : i64 loc(#loc103) + %fixed_7 = arith.subi %x1_5, %c1_i64 : i64 loc(#loc104) + %fixed_8 = arith.select %fixed, %fixed_7, %x1_5 : i64 loc(#loc105) + %x5_9 = arith.cmpi slt, %xoffset, %x5 : i32 loc(#loc102) + %x5_10 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc106) + %x5_11 = arith.cmpi ne, %x5_9, %x5_10 : i1 loc(#loc107) + %x5_12 = arith.select %x5_11, %fixed_8, %x1_5 : i64 loc(#loc108) + %r0_mask = arith.cmpi slt, %r0_base_3, %cst_0 : tensor<1x128xi32> loc(#loc72) + %tmp0 = arith.muli %x1_6, %c128_i64 : i64 loc(#loc73) + %tmp0_13 = arith.extsi %r0_base_3 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc74) + %tmp0_14 = tt.splat %tmp0 : i64 -> tensor<1x128xi64> loc(#loc109) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<1x128xi64> loc(#loc74) + %tmp0_16 = arith.muli %x0_4, %c4096_i64 : i64 loc(#loc75) + %tmp0_17 = tt.splat %tmp0_16 : i64 -> tensor<1x128xi64> loc(#loc110) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<1x128xi64> loc(#loc76) + %tmp0_19 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc77) + %tmp0_20 = arith.muli %tmp0_19, %x2 : i64 loc(#loc78) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<1x128xi64> loc(#loc111) + %tmp0_22 = arith.addi %tmp0_18, %tmp0_21 : tensor<1x128xi64> loc(#loc79) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc80) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x128x!tt.ptr>, tensor<1x128xi64> loc(#loc80) + %tmp0_25 = tt.splat %xmask : i1 -> tensor<1x128xi1> loc(#loc112) + %tmp0_26 = arith.andi %r0_mask, %tmp0_25 : tensor<1x128xi1> loc(#loc81) + %tmp0_27 = tt.load %tmp0_24, %tmp0_26, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc82) + %tmp0_28 = arith.extf %tmp0_27 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc83) + %tmp1 = arith.muli %x0_4, %c128_i64 : i64 loc(#loc84) + %tmp1_29 = tt.splat %tmp1 : i64 -> tensor<1x128xi64> loc(#loc113) + %tmp1_30 = arith.addi %tmp0_13, %tmp1_29 : tensor<1x128xi64> loc(#loc85) + %tmp1_31 = arith.muli %x5_12, %c128_i64 : i64 loc(#loc86) + %tmp1_32 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc87) + %tmp1_33 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc88) + %tmp1_34 = arith.extui %tmp1_33 : i1 to i64 loc(#loc89) + %tmp1_35 = arith.muli %ks0, %tmp1_34 : i64 loc(#loc89) + %tmp1_36 = arith.extui %tmp1_32 : i1 to i64 loc(#loc114) + %tmp1_37 = arith.addi %tmp1_36, %tmp1_35 : i64 loc(#loc90) + %tmp1_38 = arith.muli %tmp1_31, %tmp1_37 : i64 loc(#loc92) + %tmp1_39 = tt.splat %tmp1_38 : i64 -> tensor<1x128xi64> loc(#loc115) + %tmp1_40 = arith.addi %tmp1_30, %tmp1_39 : tensor<1x128xi64> loc(#loc93) + %tmp1_41 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc94) + %tmp1_42 = tt.addptr %tmp1_41, %tmp1_40 : tensor<1x128x!tt.ptr>, tensor<1x128xi64> loc(#loc94) + %tmp1_43 = tt.load %tmp1_42, %tmp0_26, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc95) + %tmp1_44 = arith.extf %tmp1_43 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc96) + %tmp2 = arith.mulf %tmp0_28, %tmp1_44 : tensor<1x128xf32> loc(#loc97) + %tmp5 = arith.addf %tmp2, %cst_1 : tensor<1x128xf32> loc(#loc98) + %_tmp4 = arith.select %tmp0_26, %tmp5, %cst_1 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc99) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_46: f32 loc(callsite(#loc1 at #loc100)), %tmp4_47: f32 loc(callsite(#loc1 at #loc100))): + %tmp4_48 = arith.addf %tmp4_46, %tmp4_47 : f32 loc(#loc118) + tt.reduce.return %tmp4_48 : f32 loc(#loc116) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc116) + %tmp4_45 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc101) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc50) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc50) + tt.store %1, %tmp4_45, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc51) + tt.return loc(#loc52) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc60 = loc("x5"(#loc3)) +#loc61 = loc("x1"(#loc4)) +#loc62 = loc("xoffset"(#loc5)) +#loc63 = loc("xmask"(#loc6)) +#loc64 = loc("r0_base"(#loc7)) +#loc65 = loc("r0_base"(#loc8)) +#loc66 = loc("x0"(#loc9)) +#loc67 = loc("x1"(#loc10)) +#loc68 = loc("x2"(#loc11)) +#loc69 = loc("fixed"(#loc12)) +#loc70 = loc("fixed"(#loc13)) +#loc71 = loc("fixed"(#loc14)) +#loc72 = loc("r0_mask"(#loc18)) +#loc73 = loc("tmp0"(#loc19)) +#loc74 = loc("tmp0"(#loc20)) +#loc75 = loc("tmp0"(#loc21)) +#loc76 = loc("tmp0"(#loc22)) +#loc77 = loc("tmp0"(#loc23)) +#loc78 = loc("tmp0"(#loc24)) +#loc79 = loc("tmp0"(#loc25)) +#loc80 = loc("tmp0"(#loc26)) +#loc81 = loc("tmp0"(#loc27)) +#loc82 = loc("tmp0"(#loc28)) +#loc83 = loc("tmp0"(#loc29)) +#loc84 = loc("tmp1"(#loc30)) +#loc85 = loc("tmp1"(#loc31)) +#loc86 = loc("tmp1"(#loc32)) +#loc87 = loc("tmp1"(#loc33)) +#loc88 = loc("tmp1"(#loc34)) +#loc89 = loc("tmp1"(#loc35)) +#loc90 = loc("tmp1"(#loc36)) +#loc91 = loc("tmp1"(#loc37)) +#loc92 = loc("tmp1"(#loc38)) +#loc93 = loc("tmp1"(#loc39)) +#loc94 = loc("tmp1"(#loc40)) +#loc95 = loc("tmp1"(#loc41)) +#loc96 = loc("tmp1"(#loc42)) +#loc97 = loc("tmp2"(#loc43)) +#loc98 = loc("tmp5"(#loc44)) +#loc99 = loc("_tmp4"(#loc45)) +#loc101 = loc("tmp4"(#loc49)) +#loc102 = loc(callsite(#loc2 at #loc60)) +#loc103 = loc(callsite(#loc69 at #loc60)) +#loc104 = loc(callsite(#loc70 at #loc60)) +#loc105 = loc(callsite(#loc71 at #loc60)) +#loc106 = loc(callsite(#loc15 at #loc60)) +#loc107 = loc(callsite(#loc16 at #loc60)) +#loc108 = loc(callsite(#loc17 at #loc60)) +#loc109 = loc(fused[#loc74, #loc73]) +#loc110 = loc(fused[#loc76, #loc75]) +#loc111 = loc(fused[#loc79, #loc78]) +#loc112 = loc(fused[#loc81, #loc63]) +#loc113 = loc(fused[#loc85, #loc84]) +#loc114 = loc(fused[#loc90, #loc91]) +#loc115 = loc(fused[#loc93, #loc92]) +#loc116 = loc(callsite(#loc46 at #loc100)) +#loc118 = loc(callsite(#loc48 at #loc116)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/__grp__triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/__grp__triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..78dd6183c8fb677e8c9b01ff6249093397feed89 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/__grp__triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.source", "triton_red_fused_argmax_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttir", "triton_red_fused_argmax_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttgir", "triton_red_fused_argmax_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.llir", "triton_red_fused_argmax_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ptx", "triton_red_fused_argmax_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.cubin", "triton_red_fused_argmax_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..876d471a893821b3d515b8e299986a58f5b1840e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc048ec47efc49746068d9ea43c2368af61b192 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"hash": "2be3e1562efcc7b138aa62c6a222491756694a225f8326f7d00be4f4da0323a6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..1c6d86e5ed7b2c2509599d1d0dbd68be54a71704 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.llir @@ -0,0 +1,433 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %9 = shl nuw nsw i32 %8, 2, !dbg !8 + %10 = and i32 %9, 2044, !dbg !8 + %11 = mul i32 %7, 32000, !dbg !9 + %12 = zext nneg i32 %10 to i64, !dbg !10 + br label %13, !dbg !10 + +13: ; preds = %6, %13 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %13 ] + %14 = phi i32 [ 2147483647, %6 ], [ %100, %13 ] + %15 = phi i32 [ 2147483647, %6 ], [ %101, %13 ] + %16 = phi float [ 0xFFF0000000000000, %6 ], [ %97, %13 ] + %17 = phi float [ 0xFFF0000000000000, %6 ], [ %98, %13 ] + %18 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %96, %13 ] + %19 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %99, %13 ] + %20 = or disjoint i64 %indvars.iv, %12, !dbg !11 + %21 = or disjoint i64 %20, 2, !dbg !11 + %22 = or disjoint i64 %20, 3, !dbg !11 + %23 = icmp samesign ult i64 %20, 32000, !dbg !12 + %24 = trunc nuw nsw i64 %20 to i32, !dbg !13 + %25 = add i32 %11, %24, !dbg !13 + %26 = sext i32 %25 to i64, !dbg !14 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !14 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %29 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %27, i64 %28, i1 %23) #4, !dbg !15 + %30 = extractvalue { i32, i32 } %29, 0, !dbg !15 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !15 + %32 = extractvalue { i32, i32 } %29, 1, !dbg !15 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !15 + %34 = extractelement <2 x bfloat> %33, i64 0, !dbg !15 + %35 = extractelement <2 x bfloat> %33, i64 1, !dbg !15 + %36 = fpext bfloat %34 to float, !dbg !16 + %37 = fpext bfloat %35 to float, !dbg !16 + %38 = fcmp ogt float %16, %36, !dbg !17 + %39 = fcmp ogt float %17, %37, !dbg !17 + %40 = fcmp oeq float %16, %36, !dbg !21 + %41 = fcmp oeq float %17, %37, !dbg !21 + %42 = fcmp uno <2 x float> %18, zeroinitializer, !dbg !22 + %43 = fcmp uno float %16, 0.000000e+00, !dbg !22 + %44 = fcmp uno float %17, 0.000000e+00, !dbg !22 + %45 = fcmp uno bfloat %34, 0xR0000, !dbg !23 + %46 = fcmp uno bfloat %35, 0xR0000, !dbg !23 + %47 = xor i1 %45, true, !dbg !24 + %48 = xor i1 %46, true, !dbg !24 + %49 = and i1 %43, %47, !dbg !25 + %50 = and i1 %44, %48, !dbg !25 + %51 = or i1 %38, %49, !dbg !26 + %52 = or i1 %39, %50, !dbg !26 + %53 = and i1 %43, %45, !dbg !27 + %54 = and i1 %44, %46, !dbg !27 + %55 = or i1 %40, %53, !dbg !28 + %56 = or i1 %41, %54, !dbg !28 + %57 = sext <2 x i32> %19 to <2 x i64>, !dbg !29 + %58 = sext i32 %14 to i64, !dbg !29 + %59 = icmp sgt i64 %21, %58, !dbg !29 + %60 = sext i32 %15 to i64, !dbg !29 + %61 = icmp sgt i64 %22, %60, !dbg !29 + %62 = and i1 %59, %55, !dbg !30 + %63 = and i1 %61, %56, !dbg !30 + %64 = or i1 %51, %62, !dbg !31 + %65 = or i1 %52, %63, !dbg !31 + %66 = select i1 %64, float %16, float %36, !dbg !32 + %67 = select i1 %65, float %17, float %37, !dbg !32 + %68 = trunc nuw nsw i64 %20 to i32, !dbg !33 + %69 = or disjoint i32 %68, 1, !dbg !33 + %70 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !16 + %71 = fcmp ogt <2 x float> %18, %70, !dbg !17 + %72 = fcmp oeq <2 x float> %18, %70, !dbg !21 + %73 = fcmp uno <2 x bfloat> %31, zeroinitializer, !dbg !23 + %74 = xor <2 x i1> %73, splat (i1 true), !dbg !24 + %75 = and <2 x i1> %42, %74, !dbg !25 + %76 = or <2 x i1> %71, %75, !dbg !26 + %77 = and <2 x i1> %42, %73, !dbg !27 + %78 = or <2 x i1> %72, %77, !dbg !28 + %79 = insertelement <2 x i64> poison, i64 %20, i64 0, !dbg !29 + %80 = shufflevector <2 x i64> %79, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !29 + %81 = icmp sgt <2 x i64> %80, %57, !dbg !29 + %82 = icmp sge <2 x i64> %80, %57, !dbg !29 + %83 = shufflevector <2 x i1> %81, <2 x i1> %82, <2 x i32> , !dbg !29 + %84 = and <2 x i1> %83, %78, !dbg !30 + %85 = or <2 x i1> %76, %84, !dbg !31 + %86 = select <2 x i1> %85, <2 x float> %18, <2 x float> %70, !dbg !32 + %87 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !33 + %88 = insertelement <2 x i32> %87, i32 %69, i64 1, !dbg !33 + %89 = select <2 x i1> %85, <2 x i32> %19, <2 x i32> %88, !dbg !33 + %90 = trunc nuw nsw i64 %21 to i32, !dbg !33 + %91 = select i1 %64, i32 %14, i32 %90, !dbg !33 + %92 = trunc nuw nsw i64 %22 to i32, !dbg !33 + %93 = select i1 %65, i32 %15, i32 %92, !dbg !33 + %94 = insertelement <2 x i1> poison, i1 %23, i64 0, !dbg !34 + %95 = shufflevector <2 x i1> %94, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !34 + %96 = select <2 x i1> %95, <2 x float> %86, <2 x float> %18, !dbg !34 + %97 = select i1 %23, float %66, float %16, !dbg !34 + %98 = select i1 %23, float %67, float %17, !dbg !34 + %99 = select <2 x i1> %95, <2 x i32> %89, <2 x i32> %19, !dbg !35 + %100 = select i1 %23, i32 %91, i32 %14, !dbg !35 + %101 = select i1 %23, i32 %93, i32 %15, !dbg !35 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !10 + %102 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !10 + br i1 %102, label %13, label %103, !dbg !10 + +103: ; preds = %13 + %104 = and i32 %8, 31, !dbg !8 + %105 = lshr i32 %8, 5, !dbg !8 + %106 = shufflevector <2 x float> %96, <2 x float> poison, <2 x i32> , !dbg !36 + %107 = fcmp ogt <2 x float> %96, %106, !dbg !36 + %108 = fcmp oeq <2 x float> %96, %106, !dbg !36 + %109 = shufflevector <2 x i1> %107, <2 x i1> %108, <2 x i32> , !dbg !36 + %110 = extractelement <2 x float> %96, i64 0, !dbg !38 + %111 = fcmp uno float %110, 0.000000e+00, !dbg !38 + %112 = extractelement <2 x float> %96, i64 1, !dbg !39 + %113 = fcmp uno float %112, 0.000000e+00, !dbg !39 + %114 = xor i1 %113, true, !dbg !40 + %115 = insertelement <2 x i1> poison, i1 %111, i64 0, !dbg !41 + %116 = shufflevector <2 x i1> %115, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !41 + %117 = insertelement <2 x i1> poison, i1 %114, i64 0, !dbg !41 + %118 = insertelement <2 x i1> %117, i1 %113, i64 1, !dbg !41 + %119 = and <2 x i1> %116, %118, !dbg !41 + %120 = or <2 x i1> %109, %119, !dbg !42 + %121 = extractelement <2 x i32> %99, i64 0, !dbg !43 + %122 = extractelement <2 x i32> %99, i64 1, !dbg !43 + %123 = icmp slt i32 %121, %122, !dbg !43 + %124 = extractelement <2 x i1> %120, i64 1, !dbg !44 + %125 = and i1 %123, %124, !dbg !44 + %126 = extractelement <2 x i1> %120, i64 0, !dbg !45 + %127 = or i1 %126, %125, !dbg !45 + %128 = select i1 %127, float %110, float %112, !dbg !46 + %129 = select i1 %127, i32 %121, i32 %122, !dbg !47 + %130 = fcmp ogt float %128, %97, !dbg !36 + %131 = fcmp oeq float %128, %97, !dbg !48 + %132 = fcmp uno float %128, 0.000000e+00, !dbg !38 + %133 = fcmp uno float %97, 0.000000e+00, !dbg !39 + %134 = xor i1 %133, true, !dbg !40 + %135 = and i1 %132, %134, !dbg !41 + %136 = or i1 %130, %135, !dbg !42 + %137 = and i1 %133, %132, !dbg !49 + %138 = or i1 %131, %137, !dbg !50 + %139 = icmp slt i32 %129, %100, !dbg !43 + %140 = and i1 %139, %138, !dbg !44 + %141 = or i1 %136, %140, !dbg !45 + %142 = select i1 %141, float %128, float %97, !dbg !46 + %143 = select i1 %141, i32 %129, i32 %100, !dbg !47 + %144 = fcmp ogt float %142, %98, !dbg !36 + %145 = fcmp oeq float %142, %98, !dbg !48 + %146 = fcmp uno float %142, 0.000000e+00, !dbg !38 + %147 = fcmp uno float %98, 0.000000e+00, !dbg !39 + %148 = xor i1 %147, true, !dbg !40 + %149 = and i1 %146, %148, !dbg !41 + %150 = or i1 %144, %149, !dbg !42 + %151 = and i1 %147, %146, !dbg !49 + %152 = or i1 %145, %151, !dbg !50 + %153 = icmp slt i32 %143, %101, !dbg !43 + %154 = and i1 %153, %152, !dbg !44 + %155 = or i1 %150, %154, !dbg !45 + %156 = select i1 %155, float %142, float %98, !dbg !46 + %157 = select i1 %155, i32 %143, i32 %101, !dbg !47 + %158 = bitcast float %156 to i32, !dbg !51 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !51 + %160 = bitcast i32 %159 to float, !dbg !51 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !51 + %162 = fcmp ogt float %156, %160, !dbg !36 + %163 = fcmp oeq float %156, %160, !dbg !48 + %164 = fcmp uno float %156, 0.000000e+00, !dbg !38 + %165 = fcmp uno float %160, 0.000000e+00, !dbg !39 + %166 = xor i1 %165, true, !dbg !40 + %167 = and i1 %164, %166, !dbg !41 + %168 = or i1 %162, %167, !dbg !42 + %169 = and i1 %164, %165, !dbg !49 + %170 = or i1 %163, %169, !dbg !50 + %171 = icmp slt i32 %157, %161, !dbg !43 + %172 = and i1 %171, %170, !dbg !44 + %173 = or i1 %168, %172, !dbg !45 + %174 = select i1 %173, float %156, float %160, !dbg !46 + %175 = select i1 %173, i32 %157, i32 %161, !dbg !47 + %176 = bitcast float %174 to i32, !dbg !51 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !51 + %178 = bitcast i32 %177 to float, !dbg !51 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 8, i32 31), !dbg !51 + %180 = fcmp ogt float %174, %178, !dbg !36 + %181 = fcmp oeq float %174, %178, !dbg !48 + %182 = fcmp uno float %174, 0.000000e+00, !dbg !38 + %183 = fcmp uno float %178, 0.000000e+00, !dbg !39 + %184 = xor i1 %183, true, !dbg !40 + %185 = and i1 %182, %184, !dbg !41 + %186 = or i1 %180, %185, !dbg !42 + %187 = and i1 %183, %182, !dbg !49 + %188 = or i1 %181, %187, !dbg !50 + %189 = icmp slt i32 %175, %179, !dbg !43 + %190 = and i1 %189, %188, !dbg !44 + %191 = or i1 %186, %190, !dbg !45 + %192 = select i1 %191, float %174, float %178, !dbg !46 + %193 = select i1 %191, i32 %175, i32 %179, !dbg !47 + %194 = bitcast float %192 to i32, !dbg !51 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !51 + %196 = bitcast i32 %195 to float, !dbg !51 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 4, i32 31), !dbg !51 + %198 = fcmp ogt float %192, %196, !dbg !36 + %199 = fcmp oeq float %192, %196, !dbg !48 + %200 = fcmp uno float %192, 0.000000e+00, !dbg !38 + %201 = fcmp uno float %196, 0.000000e+00, !dbg !39 + %202 = xor i1 %201, true, !dbg !40 + %203 = and i1 %200, %202, !dbg !41 + %204 = or i1 %198, %203, !dbg !42 + %205 = and i1 %201, %200, !dbg !49 + %206 = or i1 %199, %205, !dbg !50 + %207 = icmp slt i32 %193, %197, !dbg !43 + %208 = and i1 %207, %206, !dbg !44 + %209 = or i1 %204, %208, !dbg !45 + %210 = select i1 %209, float %192, float %196, !dbg !46 + %211 = select i1 %209, i32 %193, i32 %197, !dbg !47 + %212 = bitcast float %210 to i32, !dbg !51 + %213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 2, i32 31), !dbg !51 + %214 = bitcast i32 %213 to float, !dbg !51 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 2, i32 31), !dbg !51 + %216 = fcmp ogt float %210, %214, !dbg !36 + %217 = fcmp oeq float %210, %214, !dbg !48 + %218 = fcmp uno float %210, 0.000000e+00, !dbg !38 + %219 = fcmp uno float %214, 0.000000e+00, !dbg !39 + %220 = xor i1 %219, true, !dbg !40 + %221 = and i1 %218, %220, !dbg !41 + %222 = or i1 %216, %221, !dbg !42 + %223 = and i1 %219, %218, !dbg !49 + %224 = or i1 %217, %223, !dbg !50 + %225 = icmp slt i32 %211, %215, !dbg !43 + %226 = and i1 %225, %224, !dbg !44 + %227 = or i1 %222, %226, !dbg !45 + %228 = select i1 %227, float %210, float %214, !dbg !46 + %229 = select i1 %227, i32 %211, i32 %215, !dbg !47 + %230 = bitcast float %228 to i32, !dbg !51 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !51 + %232 = bitcast i32 %231 to float, !dbg !51 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !51 + %234 = fcmp ogt float %228, %232, !dbg !36 + %235 = fcmp oeq float %228, %232, !dbg !48 + %236 = fcmp uno float %228, 0.000000e+00, !dbg !38 + %237 = fcmp uno float %232, 0.000000e+00, !dbg !39 + %238 = xor i1 %237, true, !dbg !40 + %239 = and i1 %236, %238, !dbg !41 + %240 = or i1 %234, %239, !dbg !42 + %241 = and i1 %237, %236, !dbg !49 + %242 = or i1 %235, %241, !dbg !50 + %243 = icmp slt i32 %229, %233, !dbg !43 + %244 = and i1 %243, %242, !dbg !44 + %245 = or i1 %240, %244, !dbg !45 + %246 = select i1 %245, i32 %229, i32 %233, !dbg !47 + %247 = and i32 %105, 15, !dbg !51 + %248 = icmp eq i32 %104, 0, !dbg !51 + %249 = getelementptr float, ptr addrspace(3) @global_smem, i32 %247, !dbg !51 + %250 = select i1 %245, i32 %230, i32 %231, !dbg !46 + %251 = insertelement <1 x i32> poison, i32 %250, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %249, <1 x i32> %251, i1 %248) #4, !dbg !51 + %252 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %247, !dbg !51 + %253 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %252, <1 x i32> %253, i1 %248) #4, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %254 = icmp samesign ult i32 %8, 16, !dbg !51 + %255 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !51 + %256 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %255, i1 %254) #4, !dbg !51 + %257 = bitcast i32 %256 to float, !dbg !51 + %258 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %8, !dbg !51 + %259 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %258, i1 %254) #4, !dbg !51 + %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 8, i32 31), !dbg !51 + %261 = bitcast i32 %260 to float, !dbg !51 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %259, i32 8, i32 31), !dbg !51 + %263 = fcmp ogt float %257, %261, !dbg !36 + %264 = fcmp oeq float %257, %261, !dbg !48 + %265 = fcmp uno float %257, 0.000000e+00, !dbg !38 + %266 = fcmp uno float %261, 0.000000e+00, !dbg !39 + %267 = xor i1 %266, true, !dbg !40 + %268 = and i1 %265, %267, !dbg !41 + %269 = or i1 %263, %268, !dbg !42 + %270 = and i1 %265, %266, !dbg !49 + %271 = or i1 %264, %270, !dbg !50 + %272 = icmp slt i32 %259, %262, !dbg !43 + %273 = and i1 %272, %271, !dbg !44 + %274 = or i1 %269, %273, !dbg !45 + %275 = select i1 %274, float %257, float %261, !dbg !46 + %276 = select i1 %274, i32 %259, i32 %262, !dbg !47 + %277 = bitcast float %275 to i32, !dbg !51 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 4, i32 31), !dbg !51 + %279 = bitcast i32 %278 to float, !dbg !51 + %280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 4, i32 31), !dbg !51 + %281 = fcmp ogt float %275, %279, !dbg !36 + %282 = fcmp oeq float %275, %279, !dbg !48 + %283 = fcmp uno float %275, 0.000000e+00, !dbg !38 + %284 = fcmp uno float %279, 0.000000e+00, !dbg !39 + %285 = xor i1 %284, true, !dbg !40 + %286 = and i1 %283, %285, !dbg !41 + %287 = or i1 %281, %286, !dbg !42 + %288 = and i1 %284, %283, !dbg !49 + %289 = or i1 %282, %288, !dbg !50 + %290 = icmp slt i32 %276, %280, !dbg !43 + %291 = and i1 %290, %289, !dbg !44 + %292 = or i1 %287, %291, !dbg !45 + %293 = select i1 %292, float %275, float %279, !dbg !46 + %294 = select i1 %292, i32 %276, i32 %280, !dbg !47 + %295 = bitcast float %293 to i32, !dbg !51 + %296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 2, i32 31), !dbg !51 + %297 = bitcast i32 %296 to float, !dbg !51 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !51 + %299 = fcmp ogt float %293, %297, !dbg !36 + %300 = fcmp oeq float %293, %297, !dbg !48 + %301 = fcmp uno float %293, 0.000000e+00, !dbg !38 + %302 = fcmp uno float %297, 0.000000e+00, !dbg !39 + %303 = xor i1 %302, true, !dbg !40 + %304 = and i1 %301, %303, !dbg !41 + %305 = or i1 %299, %304, !dbg !42 + %306 = and i1 %302, %301, !dbg !49 + %307 = or i1 %300, %306, !dbg !50 + %308 = icmp slt i32 %294, %298, !dbg !43 + %309 = and i1 %308, %307, !dbg !44 + %310 = or i1 %305, %309, !dbg !45 + %311 = select i1 %310, float %293, float %297, !dbg !46 + %312 = select i1 %310, i32 %294, i32 %298, !dbg !47 + %313 = bitcast float %311 to i32, !dbg !51 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !51 + %315 = bitcast i32 %314 to float, !dbg !51 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !51 + %317 = fcmp ogt float %311, %315, !dbg !36 + %318 = fcmp oeq float %311, %315, !dbg !48 + %319 = fcmp uno float %311, 0.000000e+00, !dbg !38 + %320 = fcmp uno float %315, 0.000000e+00, !dbg !39 + %321 = xor i1 %320, true, !dbg !40 + %322 = and i1 %319, %321, !dbg !41 + %323 = or i1 %317, %322, !dbg !42 + %324 = and i1 %320, %319, !dbg !49 + %325 = or i1 %318, %324, !dbg !50 + %326 = icmp slt i32 %312, %316, !dbg !43 + %327 = and i1 %326, %325, !dbg !44 + %328 = or i1 %323, %327, !dbg !45 + %329 = select i1 %328, i32 %312, i32 %316, !dbg !47 + %330 = icmp eq i32 %8, 0, !dbg !51 + %331 = select i1 %328, i32 %313, i32 %314, !dbg !46 + %332 = insertelement <1 x i32> poison, i32 %331, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %255, <1 x i32> %332, i1 %330) #4, !dbg !51 + %333 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %333, i1 %330) #4, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %334 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !51 + %335 = zext nneg i32 %7 to i64, !dbg !52 + %336 = getelementptr i64, ptr addrspace(1) %1, i64 %335, !dbg !52 + %337 = sext i32 %334 to i64, !dbg !53 + %338 = and i32 %8, 511, !dbg !53 + %339 = icmp eq i32 %338, 0, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %337, ptr addrspace(1) %336, i1 %339) #4, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_0", linkageName: "triton_red_fused_argmax_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 37, column: 47, scope: !4) +!10 = !DILocation(line: 31, column: 40, scope: !4) +!11 = !DILocation(line: 32, column: 31, scope: !4) +!12 = !DILocation(line: 33, column: 29, scope: !4) +!13 = !DILocation(line: 37, column: 41, scope: !4) +!14 = !DILocation(line: 37, column: 34, scope: !4) +!15 = !DILocation(line: 37, column: 52, scope: !4) +!16 = !DILocation(line: 37, column: 106, scope: !4) +!17 = !DILocation(line: 144, column: 21, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 40, column: 38, scope: !4) +!21 = !DILocation(line: 145, column: 23, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 147, column: 29, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 148, column: 29, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 149, column: 31, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 149, column: 27, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 149, column: 16, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 151, column: 27, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 151, column: 17, scope: !18, inlinedAt: !20) +!29 = !DILocation(line: 154, column: 31, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 154, column: 21, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 154, column: 12, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 155, column: 35, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 155, column: 69, scope: !18, inlinedAt: !20) +!34 = !DILocation(line: 42, column: 46, scope: !4) +!35 = !DILocation(line: 43, column: 58, scope: !4) +!36 = !DILocation(line: 144, column: 21, scope: !18, inlinedAt: !37) +!37 = !DILocation(line: 44, column: 75, scope: !4) +!38 = !DILocation(line: 147, column: 29, scope: !18, inlinedAt: !37) +!39 = !DILocation(line: 148, column: 29, scope: !18, inlinedAt: !37) +!40 = !DILocation(line: 149, column: 31, scope: !18, inlinedAt: !37) +!41 = !DILocation(line: 149, column: 27, scope: !18, inlinedAt: !37) +!42 = !DILocation(line: 149, column: 16, scope: !18, inlinedAt: !37) +!43 = !DILocation(line: 154, column: 31, scope: !18, inlinedAt: !37) +!44 = !DILocation(line: 154, column: 21, scope: !18, inlinedAt: !37) +!45 = !DILocation(line: 154, column: 12, scope: !18, inlinedAt: !37) +!46 = !DILocation(line: 155, column: 35, scope: !18, inlinedAt: !37) +!47 = !DILocation(line: 155, column: 69, scope: !18, inlinedAt: !37) +!48 = !DILocation(line: 145, column: 23, scope: !18, inlinedAt: !37) +!49 = !DILocation(line: 151, column: 27, scope: !18, inlinedAt: !37) +!50 = !DILocation(line: 151, column: 17, scope: !18, inlinedAt: !37) +!51 = !DILocation(line: 165, column: 42, scope: !18, inlinedAt: !37) +!52 = !DILocation(line: 46, column: 25, scope: !4) +!53 = !DILocation(line: 46, column: 36, scope: !4) +!54 = !DILocation(line: 46, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..4fd5294a77d487d33cc53e0168d84cbfd8666c3c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ptx @@ -0,0 +1,833 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_0 // -- Begin function triton_red_fused_argmax_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_0 +.visible .entry triton_red_fused_argmax_0( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_1, + .param .u32 triton_red_fused_argmax_0_param_2, + .param .u32 triton_red_fused_argmax_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<202>; + .reg .b16 %rs<5>; + .reg .b32 %r<114>; + .reg .b64 %rd<27>; + .loc 1 18 0 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_0_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 37 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r19, %r2, 2; + and.b32 %r20, %r19, 2044; + .loc 1 31 40 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:31:40 + cvt.u64.u32 %rd1, %r20; + mad.lo.s32 %r21, %r1, 32000, %r20; + cvt.u64.u32 %rd2, %r21; + mov.b32 %r110, 0fFF800000; + mov.b64 %rd26, {%r110, %r110}; + mov.b32 %r108, 2147483647; + mov.b64 %rd25, -2048; + mov.b32 %r109, %r108; + mov.b32 %r111, %r110; + mov.b32 %r112, %r108; + mov.b32 %r113, %r108; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 32 31 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:32:31 + add.s64 %rd14, %rd1, %rd25; + add.s64 %rd15, %rd14, 2048; + add.s64 %rd16, %rd14, 2050; + .loc 1 33 29 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:33:29 + add.s64 %rd17, %rd14, 2051; + setp.lt.u64 %p1, %rd15, 32000; + .loc 1 37 34 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:37:34 + add.s64 %rd18, %rd2, %rd25; + cvt.u32.u64 %r26, %rd18; + add.s32 %r27, %r26, 2048; + mad.wide.s32 %rd12, %r27, 2, %rd7; + .loc 1 37 52 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:37:52 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r24, 0; + // begin inline asm + mov.u32 %r22, %r24; + mov.u32 %r23, %r24; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r22, %r23 }, [ %rd12 + 0 ], %rd11; + // end inline asm + mov.b32 {%rs1, %rs2}, %r23; + .loc 1 37 106 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:37:106 + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.gt.f32 %p2, %r110, %r28; + setp.gt.f32 %p3, %r111, %r29; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.eq.f32 %p4, %r110, %r28; + setp.eq.f32 %p5, %r111, %r29; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + mov.b64 {%r30, %r31}, %rd26; + setp.nan.f32 %p6, %r30, %r30; + setp.nan.f32 %p7, %r31, %r31; + setp.nan.f32 %p8, %r110, %r110; + setp.nan.f32 %p9, %r111, %r111; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.nan.bf16 %p10, %rs1, %rs1; + setp.num.bf16 %p11, %rs1, %rs1; + setp.nan.bf16 %p12, %rs2, %rs2; + setp.num.bf16 %p13, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + cvt.s64.s32 %rd19, %r113; + cvt.s64.s32 %rd20, %r112; + cvt.s64.s32 %rd21, %r108; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r109; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p26, %p16, %p24; + or.pred %p27, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + selp.f32 %r32, %r110, %r28, %p26; + selp.f32 %r33, %r111, %r29, %p27; +$L__tmp2: + .loc 1 37 106 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:37:106 + mov.b32 {%rs3, %rs4}, %r22; + cvt.f32.bf16 %r34, %rs3; + cvt.f32.bf16 %r35, %rs4; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.gt.f32 %p28, %r31, %r35; + setp.gt.f32 %p29, %r30, %r34; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.eq.f32 %p30, %r30, %r34; + setp.eq.f32 %p31, %r31, %r35; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.nan.bf16x2 %p32|%p33, %r22, %r24; + setp.num.bf16x2 %p34|%p35, %r22, %r24; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p36, %p7, %p35; + and.pred %p37, %p6, %p34; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p38, %p29, %p37; + or.pred %p39, %p28, %p36; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p40, %p6, %p32; + and.pred %p41, %p7, %p33; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p42, %p31, %p41; + or.pred %p43, %p30, %p40; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + setp.gt.s64 %p44, %rd15, %rd20; + setp.ge.s64 %p45, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + and.pred %p46, %p44, %p43; + and.pred %p47, %p45, %p42; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + or.pred %p48, %p39, %p47; + or.pred %p49, %p38, %p46; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + selp.f32 %r36, %r30, %r34, %p49; + selp.f32 %r37, %r31, %r35, %p48; + cvt.u32.u64 %r38, %rd15; + cvt.u32.u64 %r39, %rd14; + add.s32 %r40, %r39, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:40:38 ] + selp.b32 %r41, %r112, %r38, %p49; + selp.b32 %r42, %r113, %r40, %p48; + cvt.u32.u64 %r43, %rd16; + selp.b32 %r44, %r108, %r43, %p26; + cvt.u32.u64 %r45, %rd17; + selp.b32 %r46, %r109, %r45, %p27; +$L__tmp4: + .loc 1 42 46 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:42:46 + selp.f32 %r47, %r37, %r31, %p1; + selp.f32 %r48, %r36, %r30, %p1; + mov.b64 %rd26, {%r48, %r47}; + selp.f32 %r110, %r32, %r110, %p1; + selp.f32 %r111, %r33, %r111, %p1; + .loc 1 43 58 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:43:58 + selp.b32 %r113, %r42, %r113, %p1; + selp.b32 %r112, %r41, %r112, %p1; + selp.b32 %r108, %r44, %r108, %p1; + selp.b32 %r109, %r46, %r109, %p1; + .loc 1 31 40 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:31:40 + add.s64 %rd25, %rd25, 2048; + setp.lt.u64 %p50, %rd25, 29952; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:26:37 + and.b32 %r61, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + mov.b64 {%r62, %r63}, %rd26; + setp.gt.f32 %p58, %r62, %r63; + setp.eq.f32 %p59, %r63, %r62; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p60, %r62, %r62; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.num.f32 %p61, %r63, %r63; + setp.nan.f32 %p62, %r63, %r63; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p63, %p60, %p62; + and.pred %p64, %p60, %p61; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p65, %p58, %p64; + or.pred %p66, %p59, %p63; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p67, %r112, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p68, %p67, %p66; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p69, %p65, %p68; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r64, %r62, %r63, %p69; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r65, %r112, %r113, %p69; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p70, %r64, %r110; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p71, %r64, %r110; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p72, %r64, %r64; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p73, %r110, %r110; + setp.num.f32 %p74, %r110, %r110; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p75, %p72, %p74; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p76, %p70, %p75; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p77, %p73, %p72; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p78, %p71, %p77; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p79, %r65, %r108; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p81, %p76, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r66, %r64, %r110, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r67, %r65, %r108, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p82, %r66, %r111; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p83, %r66, %r111; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p84, %r66, %r66; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p85, %r111, %r111; + setp.num.f32 %p86, %r111, %r111; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p91, %r67, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r68, %r66, %r111, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r69, %r67, %r109, %p93; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r70, %r68, 16, 31, -1; + shfl.sync.bfly.b32 %r71, %r69, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p94, %r68, %r70; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p95, %r68, %r70; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p96, %r68, %r68; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p97, %r70, %r70; + setp.num.f32 %p98, %r70, %r70; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p101, %p96, %p97; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p103, %r69, %r71; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r72, %r68, %r70, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r73, %r69, %r71, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r74, %r72, 8, 31, -1; + shfl.sync.bfly.b32 %r75, %r73, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p106, %r72, %r74; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p107, %r72, %r74; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p108, %r72, %r72; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p109, %r74, %r74; + setp.num.f32 %p110, %r74, %r74; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p113, %p109, %p108; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p115, %r73, %r75; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r76, %r72, %r74, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r77, %r73, %r75, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r78, %r76, 4, 31, -1; + shfl.sync.bfly.b32 %r79, %r77, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p118, %r76, %r78; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p119, %r76, %r78; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p120, %r76, %r76; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p121, %r78, %r78; + setp.num.f32 %p122, %r78, %r78; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p127, %r77, %r79; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r80, %r76, %r78, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r81, %r77, %r79, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r82, %r80, 2, 31, -1; + shfl.sync.bfly.b32 %r83, %r81, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p130, %r80, %r82; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p131, %r80, %r82; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p132, %r80, %r80; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p133, %r82, %r82; + setp.num.f32 %p134, %r82, %r82; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p139, %r81, %r83; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r84, %r80, %r82, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r85, %r81, %r83, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r86, %r84, 1, 31, -1; + shfl.sync.bfly.b32 %r87, %r85, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p142, %r84, %r86; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p143, %r84, %r86; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p144, %r84, %r84; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p145, %r86, %r86; + setp.num.f32 %p146, %r86, %r86; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p151, %r85, %r87; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r52, %r85, %r87, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.b32 %p51, %r61, 0; + shr.u32 %r88, %r2, 3; + and.b32 %r89, %r88, 60; + mov.b32 %r90, global_smem; + add.s32 %r49, %r90, %r89; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r50, %r84, %r86, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r49 + 0 ], %r50; + // end inline asm + add.s32 %r91, %r90, 64; + add.s32 %r51, %r91, %r89; + // begin inline asm + @%p51 st.shared.b32 [ %r51 + 0 ], %r52; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r54, %r90, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r53, [ %r54 + 0 ]; + // end inline asm + add.s32 %r56, %r91, %r19; + // begin inline asm + @%p53 ld.shared.b32 %r55, [ %r56 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r93, %r53, 8, 31, -1; + shfl.sync.bfly.b32 %r94, %r55, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p154, %r53, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p155, %r53, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p156, %r53, %r53; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p157, %r93, %r93; + setp.num.f32 %p158, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p161, %p156, %p157; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p163, %r55, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r95, %r53, %r93, %p165; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r96, %r55, %r94, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r97, %r95, 4, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p166, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p167, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p168, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p169, %r97, %r97; + setp.num.f32 %p170, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p173, %p169, %p168; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p175, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r99, %r95, %r97, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r100, %r96, %r98, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r101, %r99, 2, 31, -1; + shfl.sync.bfly.b32 %r102, %r100, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p178, %r99, %r101; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p179, %r99, %r101; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p180, %r99, %r99; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p181, %r101, %r101; + setp.num.f32 %p182, %r101, %r101; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p187, %r100, %r102; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.f32 %r103, %r99, %r101, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r104, %r100, %r102, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + shfl.sync.bfly.b32 %r105, %r103, 1, 31, -1; + shfl.sync.bfly.b32 %r106, %r104, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.gt.f32 %p190, %r103, %r105; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.f32 %p191, %r103, %r105; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p192, %r103, %r103; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.nan.f32 %p193, %r105, %r105; + setp.num.f32 %p194, %r105, %r105; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.lt.s32 %p199, %r104, %r106; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r60, %r104, %r106, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + setp.eq.b32 %p55, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + selp.b32 %r58, %r103, %r105, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:44:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r54 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r56 + 0 ], %r60; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd23, [global_smem+64]; +$L__tmp6: + .loc 1 46 25 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:46:25 + mad.wide.u32 %rd24, %r1, 8, %rd8; + .loc 1 46 36 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:46:36 + and.b32 %r107, %r2, 511; + setp.eq.b32 %p57, %r107, 0; + // begin inline asm + @%p57 st.global.b64 [ %rd24 + 0 ], { %rd23 }; + // end inline asm + .loc 1 46 4 // cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py:46:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 117 +.b8 53 +.b8 53 +.b8 111 +.b8 52 +.b8 102 +.b8 52 +.b8 107 +.b8 104 +.b8 103 +.b8 50 +.b8 119 +.b8 117 +.b8 111 +.b8 110 +.b8 104 +.b8 97 +.b8 111 +.b8 111 +.b8 103 +.b8 109 +.b8 55 +.b8 99 +.b8 119 +.b8 101 +.b8 55 +.b8 98 +.b8 101 +.b8 105 +.b8 118 +.b8 103 +.b8 53 +.b8 111 +.b8 116 +.b8 103 +.b8 117 +.b8 116 +.b8 103 +.b8 110 +.b8 114 +.b8 118 +.b8 51 +.b8 120 +.b8 107 +.b8 101 +.b8 108 +.b8 97 +.b8 107 +.b8 118 +.b8 99 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 117 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.source new file mode 100644 index 0000000000000000000000000000000000000000..d98552efdee80d07da997b7c5b3e9056a35192ed --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.source @@ -0,0 +1,291 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":18:0) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc42 = loc(unknown) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc67 = loc("in_ptr0"(#loc)) +#loc68 = loc("out_ptr0"(#loc)) +#loc69 = loc("xnumel"(#loc)) +#loc70 = loc("r0_numel"(#loc)) +#loc94 = loc("a_value"(#loc30)) +#loc95 = loc("a_index"(#loc30)) +#loc96 = loc("b_value"(#loc30)) +#loc97 = loc("b_index"(#loc30)) +#loc110 = loc("x"(#loc50)) +#loc111 = loc("x"(#loc54)) +#loc112 = loc("value"(#loc63)) +#loc113 = loc("index"(#loc63)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc71) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc72) + %xoffset = tt.get_program_id x : i32 loc(#loc73) + %xoffset_2 = arith.constant 1 : i32 loc(#loc74) + %xoffset_3 = arith.constant 1 : i32 loc(#loc74) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc74) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc75) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc76) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc77) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc77) + %xmask = arith.constant true loc(#loc78) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc78) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc79) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc80) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc81) + %_tmp2_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc81) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc82) + %_tmp2_index_11 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc82) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp2_index_12:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_13 = %_tmp2_10, %_tmp2_index_14 = %_tmp2_index_11) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc84) + %r0_index_15 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc84) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc85) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x2048xi32> loc(#loc85) + %tmp0 = arith.constant 32000 : i32 loc(#loc86) + %tmp0_17 = arith.constant 32000 : i32 loc(#loc86) + %tmp0_18 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc86) + %tmp0_19 = arith.muli %tmp0_18, %xindex_7 : tensor<1x1xi32> loc(#loc86) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc87) + %tmp0_21 = arith.addi %r0_index_15, %tmp0_20 : tensor<1x2048xi32> loc(#loc87) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc88) + %tmp0_24 = arith.constant 0.000000e+00 : f32 loc(#loc89) + %tmp0_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc89) + %tmp0_26 = arith.truncf %tmp0_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc89) + %tmp0_27 = tt.load %tmp0_23, %r0_mask_16, %tmp0_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc89) + %tmp0_28 = arith.extf %tmp0_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc90) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_13, %_tmp2_index_14, %tmp0_28, %r0_index_15) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc21) + %_tmp2_29 = arith.select %r0_mask_16, %8#0, %_tmp2_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc91) + %_tmp2_index_30 = arith.select %r0_mask_16, %8#1, %_tmp2_index_14 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc92) + scf.yield %_tmp2_29, %_tmp2_index_30 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc24) + } loc(#loc114) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_12#0, %_tmp2_index_12#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc25) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc93) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %6 = tt.addptr %5, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc28) + tt.store %6, %7 : tensor<1x1x!tt.ptr> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc30)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc30)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc30)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc30))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc115) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc116) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc33) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc100) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc101) + %mask_3 = arith.constant true loc(#loc102) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc102) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc102) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc103) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc117) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc105) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc118) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc118) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc42) + } loc(#loc34) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc107) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc108) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc109) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc46) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc47) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc49) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc49) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc49) + } loc(#loc30) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc50))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc51) + %true = arith.constant true loc(#loc52) + tt.return %true : i1 loc(#loc52) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc53) + tt.return %1 : i1 loc(#loc53) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc54))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc55) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc56) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc56) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc56) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc56) + tt.return %4 : tensor<1x2048xf32> loc(#loc57) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc58) + tt.return %5 : tensor<1x2048xf32> loc(#loc58) + } loc(#loc54) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc60) + %cst = arith.constant dense : tensor<1xi1> loc(#loc60) + tt.return %cst : tensor<1xi1> loc(#loc61) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc62) + tt.return %0 : tensor<1xi1> loc(#loc62) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc63)), %index: tensor<1x2048xi32> loc("index"(#loc63))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc64) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc64) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc64) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc65) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc66) + %2 = ub.poison : tensor<1xi32> loc(#loc66) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc66) + } loc(#loc63) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc30)), %a_index: i32 loc("a_index"(#loc30)), %b_value: f32 loc("b_value"(#loc30)), %b_index: i32 loc("b_index"(#loc30))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc115) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc116) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc33) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc101) + %mask_3 = arith.constant true loc(#loc102) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc102) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc103) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc117) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc105) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc118) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc118) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc42) + } loc(#loc34) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc107) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc108) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc109) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc46) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc47) + tt.return %2, %3 : f32, i32 loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc49) + %5 = ub.poison : i32 loc(#loc49) + tt.return %4, %5 : f32, i32 loc(#loc49) + } loc(#loc30) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc50))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc51) + %true = arith.constant true loc(#loc52) + tt.return %true : i1 loc(#loc52) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc53) + tt.return %1 : i1 loc(#loc53) + } loc(#loc50) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc54))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc55) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc56) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc56) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc56) + tt.return %3 : tensor<1xf32> loc(#loc57) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc58) + tt.return %4 : tensor<1xf32> loc(#loc58) + } loc(#loc54) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":29:55) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":30:58) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:106) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":40:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":42:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:58) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":44:75) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":45:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:4) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc71 = loc("xnumel"(#loc1)) +#loc72 = loc("r0_numel"(#loc2)) +#loc73 = loc("xoffset"(#loc3)) +#loc74 = loc("xoffset"(#loc4)) +#loc75 = loc("xindex"(#loc5)) +#loc76 = loc("xindex"(#loc6)) +#loc77 = loc("xindex"(#loc7)) +#loc78 = loc("xmask"(#loc8)) +#loc79 = loc("r0_base"(#loc9)) +#loc80 = loc("r0_base"(#loc10)) +#loc81 = loc("_tmp2"(#loc11)) +#loc82 = loc("_tmp2_index"(#loc12)) +#loc83 = loc("_tmp2"(#loc13)) +#loc84 = loc("r0_index"(#loc14)) +#loc85 = loc("r0_mask"(#loc15)) +#loc86 = loc("tmp0"(#loc16)) +#loc87 = loc("tmp0"(#loc17)) +#loc88 = loc("tmp0"(#loc18)) +#loc89 = loc("tmp0"(#loc19)) +#loc90 = loc("tmp0"(#loc20)) +#loc91 = loc("_tmp2"(#loc22)) +#loc92 = loc("_tmp2_index"(#loc23)) +#loc93 = loc("tmp2"(#loc26)) +#loc98 = loc("mask"(#loc31)) +#loc99 = loc("equal"(#loc32)) +#loc100 = loc("a_isnan"(#loc35)) +#loc101 = loc("b_isnan"(#loc36)) +#loc102 = loc("mask"(#loc37)) +#loc103 = loc("mask"(#loc38)) +#loc104 = loc("mask"(#loc39)) +#loc105 = loc("equal"(#loc40)) +#loc106 = loc("equal"(#loc41)) +#loc107 = loc("mask"(#loc43)) +#loc108 = loc("mask"(#loc44)) +#loc109 = loc("mask"(#loc45)) +#loc114 = loc("_tmp2_index"(#loc83)) +#loc115 = loc("mask"(#loc98)) +#loc116 = loc("equal"(#loc99)) +#loc117 = loc("mask"(#loc104)) +#loc118 = loc("equal"(#loc106)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5c5738d3d044db0356cf7134335060f21ab340c7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttgir @@ -0,0 +1,173 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":18:0) +#loc1 = loc(unknown) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":44:75) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("out_ptr0"(#loc)) +#loc38 = loc("xnumel"(#loc)) +#loc39 = loc("r0_numel"(#loc)) +#loc67 = loc(callsite(#loc1 at #loc31)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc40) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc41) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc41) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc42) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc69) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc44) + %_tmp2_index:2 = scf.for %_tmp2_index_7 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_8 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_index_9 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_9, %cst : tensor<1x2048xi32, #blocked> loc(#loc47) + %tmp0_10 = arith.addi %r0_index_9, %tmp0_5 : tensor<1x2048xi32, #blocked> loc(#loc43) + %tmp0_11 = tt.addptr %tmp0_6, %tmp0_10 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc44) + %tmp0_12 = tt.load %tmp0_11, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc48) + %tmp0_13 = arith.extf %tmp0_12 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc49) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc93) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc94) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc73) + %b_isnan = arith.cmpf une, %tmp0_13, %tmp0_13 : tensor<1x2048xf32, #blocked> loc(#loc74) + %mask_14 = arith.xori %b_isnan, %cst_1 : tensor<1x2048xi1, #blocked> loc(#loc75) + %mask_15 = arith.andi %a_isnan, %mask_14 : tensor<1x2048xi1, #blocked> loc(#loc76) + %mask_16 = arith.ori %mask, %mask_15 : tensor<1x2048xi1, #blocked> loc(#loc95) + %equal_17 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc78) + %equal_18 = arith.ori %equal, %equal_17 : tensor<1x2048xi1, #blocked> loc(#loc96) + %mask_19 = arith.cmpi slt, %_tmp2_index_8, %r0_index_9 : tensor<1x2048xi32, #blocked> loc(#loc80) + %mask_20 = arith.andi %equal_18, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc81) + %mask_21 = arith.ori %mask_16, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc82) + %5 = arith.select %mask_21, %_tmp2, %tmp0_13 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc62) + %6 = arith.select %mask_21, %_tmp2_index_8, %r0_index_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc63) + %_tmp2_22 = arith.select %r0_mask, %5, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc64) + %_tmp2_index_23 = arith.select %r0_mask, %6, %_tmp2_index_8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + scf.yield %_tmp2_22, %_tmp2_index_23 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc29) + } loc(#loc70) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc31)), %arg5: i32 loc(callsite(#loc1 at #loc31)), %arg6: f32 loc(callsite(#loc1 at #loc31)), %arg7: i32 loc(callsite(#loc1 at #loc31))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc97) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc98) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc83) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc84) + %mask_7 = arith.xori %b_isnan, %true : i1 loc(#loc85) + %mask_8 = arith.andi %a_isnan, %mask_7 : i1 loc(#loc86) + %mask_9 = arith.ori %mask, %mask_8 : i1 loc(#loc99) + %equal_10 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc87) + %equal_11 = arith.ori %equal, %equal_10 : i1 loc(#loc100) + %mask_12 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc88) + %mask_13 = arith.andi %equal_11, %mask_12 : i1 loc(#loc89) + %mask_14 = arith.ori %mask_9, %mask_13 : i1 loc(#loc90) + %5 = arith.select %mask_14, %arg4, %arg6 : f32 loc(#loc91) + %6 = arith.select %mask_14, %arg5, %arg7 : i32 loc(#loc92) + tt.reduce.return %5, %6 : f32, i32 loc(#loc66) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc66) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc68) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc33) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc34) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc34) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc34) + tt.store %4, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:106) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":40:38) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":42:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:58) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:8) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":45:20) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:4) +#loc40 = loc("xoffset"(#loc2)) +#loc41 = loc("r0_base"(#loc3)) +#loc42 = loc("tmp0"(#loc4)) +#loc43 = loc("tmp0"(#loc5)) +#loc44 = loc("tmp0"(#loc6)) +#loc45 = loc("_tmp2"(#loc7)) +#loc46 = loc("r0_index"(#loc8)) +#loc47 = loc("r0_mask"(#loc9)) +#loc48 = loc("tmp0"(#loc10)) +#loc49 = loc("tmp0"(#loc11)) +#loc50 = loc("mask"(#loc12)) +#loc51 = loc("equal"(#loc14)) +#loc52 = loc("a_isnan"(#loc15)) +#loc53 = loc("b_isnan"(#loc16)) +#loc54 = loc("mask"(#loc17)) +#loc55 = loc("mask"(#loc18)) +#loc56 = loc("mask"(#loc19)) +#loc57 = loc("equal"(#loc20)) +#loc58 = loc("equal"(#loc21)) +#loc59 = loc("mask"(#loc22)) +#loc60 = loc("mask"(#loc23)) +#loc61 = loc("mask"(#loc24)) +#loc62 = loc(callsite(#loc25 at #loc13)) +#loc63 = loc(callsite(#loc26 at #loc13)) +#loc64 = loc("_tmp2"(#loc27)) +#loc65 = loc("_tmp2_index"(#loc28)) +#loc66 = loc(callsite(#loc30 at #loc31)) +#loc68 = loc("tmp2"(#loc32)) +#loc69 = loc(fused[#loc43, #loc42]) +#loc70 = loc("_tmp2_index"(#loc45)) +#loc71 = loc("mask"(#loc50)) +#loc72 = loc("equal"(#loc51)) +#loc73 = loc(callsite(#loc52 at #loc13)) +#loc74 = loc(callsite(#loc53 at #loc13)) +#loc75 = loc(callsite(#loc54 at #loc13)) +#loc76 = loc(callsite(#loc55 at #loc13)) +#loc77 = loc("mask"(#loc56)) +#loc78 = loc(callsite(#loc57 at #loc13)) +#loc79 = loc("equal"(#loc58)) +#loc80 = loc(callsite(#loc59 at #loc13)) +#loc81 = loc(callsite(#loc60 at #loc13)) +#loc82 = loc(callsite(#loc61 at #loc13)) +#loc83 = loc(callsite(#loc52 at #loc66)) +#loc84 = loc(callsite(#loc53 at #loc66)) +#loc85 = loc(callsite(#loc54 at #loc66)) +#loc86 = loc(callsite(#loc55 at #loc66)) +#loc87 = loc(callsite(#loc57 at #loc66)) +#loc88 = loc(callsite(#loc59 at #loc66)) +#loc89 = loc(callsite(#loc60 at #loc66)) +#loc90 = loc(callsite(#loc61 at #loc66)) +#loc91 = loc(callsite(#loc25 at #loc66)) +#loc92 = loc(callsite(#loc26 at #loc66)) +#loc93 = loc(callsite(#loc71 at #loc13)) +#loc94 = loc(callsite(#loc72 at #loc13)) +#loc95 = loc(callsite(#loc77 at #loc13)) +#loc96 = loc(callsite(#loc79 at #loc13)) +#loc97 = loc(callsite(#loc71 at #loc66)) +#loc98 = loc(callsite(#loc72 at #loc66)) +#loc99 = loc(callsite(#loc77 at #loc66)) +#loc100 = loc(callsite(#loc79 at #loc66)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..9fbf84799146d6c620df5f57af086dcb4ce305ef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/FPR6CVRO7TD3COFKMLDKEISJC5LGSSRCL6BSN56QBPSPJWQDEOTA/triton_red_fused_argmax_0.ttir @@ -0,0 +1,176 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":44:75) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc43 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc43) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc44) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc45) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc47) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc48) + %_tmp2_index_3:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_4 = %_tmp2, %_tmp2_index_5 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc50) + %r0_index_6 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc50) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst_1 : tensor<1x2048xi32> loc(#loc51) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc52) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc76) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc53) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc54) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc54) + %tmp0_11 = tt.load %tmp0_10, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc55) + %tmp0_12 = arith.extf %tmp0_11 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc56) + %mask = arith.cmpf ogt, %_tmp2_4, %tmp0_12 : tensor<1x2048xf32> loc(#loc99) + %equal = arith.cmpf oeq, %_tmp2_4, %tmp0_12 : tensor<1x2048xf32> loc(#loc100) + %a_isnan = arith.cmpf une, %_tmp2_4, %_tmp2_4 : tensor<1x2048xf32> loc(#loc79) + %b_isnan = arith.cmpf une, %tmp0_12, %tmp0_12 : tensor<1x2048xf32> loc(#loc80) + %mask_13 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc81) + %mask_14 = arith.andi %a_isnan, %mask_13 : tensor<1x2048xi1> loc(#loc82) + %mask_15 = arith.ori %mask, %mask_14 : tensor<1x2048xi1> loc(#loc101) + %equal_16 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc84) + %equal_17 = arith.ori %equal, %equal_16 : tensor<1x2048xi1> loc(#loc102) + %mask_18 = arith.cmpi slt, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi32> loc(#loc86) + %mask_19 = arith.andi %equal_17, %mask_18 : tensor<1x2048xi1> loc(#loc87) + %mask_20 = arith.ori %mask_15, %mask_19 : tensor<1x2048xi1> loc(#loc88) + %4 = arith.select %mask_20, %_tmp2_4, %tmp0_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc69) + %5 = arith.select %mask_20, %_tmp2_index_5, %r0_index_6 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc70) + %_tmp2_21 = arith.select %r0_mask, %4, %_tmp2_4 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc71) + %_tmp2_index_22 = arith.select %r0_mask, %5, %_tmp2_index_5 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc72) + scf.yield %_tmp2_21, %_tmp2_index_22 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc33) + } loc(#loc75) + %0:2 = "tt.reduce"(%_tmp2_index_3#0, %_tmp2_index_3#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc103) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc104) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc89) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc90) + %mask_4 = arith.xori %b_isnan, %true : i1 loc(#loc91) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc92) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc105) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc93) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc106) + %mask_9 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc94) + %mask_10 = arith.andi %equal_8, %mask_9 : i1 loc(#loc95) + %mask_11 = arith.ori %mask_6, %mask_10 : i1 loc(#loc96) + %4 = arith.select %mask_11, %arg4, %arg6 : f32 loc(#loc97) + %5 = arith.select %mask_11, %arg5, %arg7 : i32 loc(#loc98) + tt.reduce.return %4, %5 : f32, i32 loc(#loc73) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc73) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc74) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc36) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc36) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc37) + tt.store %2, %3 : tensor<1x1x!tt.ptr> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":31:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":32:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":33:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:47) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:52) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":37:106) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":40:38) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":42:46) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:58) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":43:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":45:20) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/u5/cu55o4f4khg2wuonhaoogm7cwe7beivg5otgutgnrv3xkelakvcz.py":46:4) +#loc44 = loc("_tmp2_index"(#loc4)) +#loc45 = loc("_tmp2"(#loc5)) +#loc46 = loc("xoffset"(#loc6)) +#loc47 = loc("r0_base"(#loc7)) +#loc48 = loc("r0_base"(#loc8)) +#loc49 = loc("_tmp2"(#loc3)) +#loc50 = loc("r0_index"(#loc9)) +#loc51 = loc("r0_mask"(#loc10)) +#loc52 = loc("tmp0"(#loc11)) +#loc53 = loc("tmp0"(#loc12)) +#loc54 = loc("tmp0"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("mask"(#loc16)) +#loc58 = loc("equal"(#loc18)) +#loc59 = loc("a_isnan"(#loc19)) +#loc60 = loc("b_isnan"(#loc20)) +#loc61 = loc("mask"(#loc21)) +#loc62 = loc("mask"(#loc22)) +#loc63 = loc("mask"(#loc23)) +#loc64 = loc("equal"(#loc24)) +#loc65 = loc("equal"(#loc25)) +#loc66 = loc("mask"(#loc26)) +#loc67 = loc("mask"(#loc27)) +#loc68 = loc("mask"(#loc28)) +#loc69 = loc(callsite(#loc29 at #loc17)) +#loc70 = loc(callsite(#loc30 at #loc17)) +#loc71 = loc("_tmp2"(#loc31)) +#loc72 = loc("_tmp2_index"(#loc32)) +#loc73 = loc(callsite(#loc34 at #loc2)) +#loc74 = loc("tmp2"(#loc35)) +#loc75 = loc("_tmp2_index"(#loc49)) +#loc76 = loc(fused[#loc53, #loc52]) +#loc77 = loc("mask"(#loc57)) +#loc78 = loc("equal"(#loc58)) +#loc79 = loc(callsite(#loc59 at #loc17)) +#loc80 = loc(callsite(#loc60 at #loc17)) +#loc81 = loc(callsite(#loc61 at #loc17)) +#loc82 = loc(callsite(#loc62 at #loc17)) +#loc83 = loc("mask"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc17)) +#loc85 = loc("equal"(#loc65)) +#loc86 = loc(callsite(#loc66 at #loc17)) +#loc87 = loc(callsite(#loc67 at #loc17)) +#loc88 = loc(callsite(#loc68 at #loc17)) +#loc89 = loc(callsite(#loc59 at #loc73)) +#loc90 = loc(callsite(#loc60 at #loc73)) +#loc91 = loc(callsite(#loc61 at #loc73)) +#loc92 = loc(callsite(#loc62 at #loc73)) +#loc93 = loc(callsite(#loc64 at #loc73)) +#loc94 = loc(callsite(#loc66 at #loc73)) +#loc95 = loc(callsite(#loc67 at #loc73)) +#loc96 = loc(callsite(#loc68 at #loc73)) +#loc97 = loc(callsite(#loc29 at #loc73)) +#loc98 = loc(callsite(#loc30 at #loc73)) +#loc99 = loc(callsite(#loc77 at #loc17)) +#loc100 = loc(callsite(#loc78 at #loc17)) +#loc101 = loc(callsite(#loc83 at #loc17)) +#loc102 = loc(callsite(#loc85 at #loc17)) +#loc103 = loc(callsite(#loc77 at #loc73)) +#loc104 = loc(callsite(#loc78 at #loc73)) +#loc105 = loc(callsite(#loc83 at #loc73)) +#loc106 = loc(callsite(#loc85 at #loc73)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c721cc69364836c15685e384b224183d419b582 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..667d39be791bce78496f50bb3842d6965868de07 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..47baac3b757e4e87df6f4ac8a4f009aa867f6bb9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "36974fad074388784ad55942904c024ffd83b3dfd0a81a0cc930134242b6ab8b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..12515aa23e2137461aafe113de05d5b781635490 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,415 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp slt i32 %9, %4, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 15, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = zext nneg i32 %12 to i64, !dbg !12 + %17 = shl nsw i64 %14, 4, !dbg !13 + %18 = mul i64 %3, %16, !dbg !14 + %19 = shl i64 %3, 4, !dbg !15 + %20 = mul i64 %19, %14, !dbg !16 + %21 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !17 + %22 = getelementptr i32, ptr addrspace(1) %21, i64 %16, !dbg !17 + %23 = getelementptr i32, ptr addrspace(1) %22, i64 %17, !dbg !17 + %24 = getelementptr i32, ptr addrspace(1) %23, i64 %18, !dbg !17 + %25 = getelementptr i32, ptr addrspace(1) %24, i64 %20, !dbg !17 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !18 + %27 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %25, i64 %26, i1 %10) #3, !dbg !18 + %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !18 + %29 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %25, i64 %28, i1 %10) #3, !dbg !18 + %30 = lshr i32 %11, 1, !dbg !19 + %.lobit = and i32 %30, 1, !dbg !19 + %31 = and i32 %11, 1, !dbg !19 + %32 = lshr i32 %11, 2, !dbg !19 + %.lobit1 = and i32 %32, 1, !dbg !19 + %33 = lshr i32 %11, 3, !dbg !19 + %.lobit2 = and i32 %33, 1, !dbg !19 + %34 = xor i32 %31, 1, !dbg !23 + %35 = xor i32 %.lobit, 1, !dbg !23 + %36 = xor i32 %.lobit1, 1, !dbg !23 + %37 = xor i32 %.lobit2, 1, !dbg !23 + %38 = mul nuw nsw i32 %29, %34, !dbg !24 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %38, i32 1, i32 31), !dbg !25 + %40 = add i32 %38, %39, !dbg !28 + %41 = mul nuw nsw i32 %29, %31, !dbg !29 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !25 + %43 = add i32 %42, %41, !dbg !28 + %44 = mul nuw nsw i32 %34, %12, !dbg !30 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 1, i32 31), !dbg !25 + %46 = add i32 %45, %44, !dbg !28 + %47 = mul nuw nsw i32 %12, %31, !dbg !31 + %48 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 1, i32 31), !dbg !25 + %49 = add i32 %48, %47, !dbg !28 + %50 = trunc i32 %30 to i1, !dbg !32 + %51 = icmp sge i32 %40, %43, !dbg !32 + %52 = icmp ne i32 %40, %43, !dbg !32 + %53 = icmp sle i32 %46, %49, !dbg !32 + %54 = or i1 %52, %53, !dbg !32 + %55 = and i1 %51, %54, !dbg !32 + %.not = xor i1 %55, %50, !dbg !32 + %56 = xor i32 %43, %40, !dbg !33 + %57 = select i1 %.not, i32 0, i32 %56, !dbg !34 + %58 = xor i32 %57, %29, !dbg !35 + %59 = xor i32 %49, %46, !dbg !36 + %60 = select i1 %.not, i32 0, i32 %59, !dbg !37 + %61 = xor i32 %60, %12, !dbg !38 + %62 = mul nuw nsw i32 %58, %35, !dbg !24 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 2, i32 31), !dbg !25 + %64 = add i32 %62, %63, !dbg !28 + %65 = mul nuw nsw i32 %58, %.lobit, !dbg !29 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 2, i32 31), !dbg !25 + %67 = add i32 %65, %66, !dbg !28 + %68 = mul nuw nsw i32 %61, %35, !dbg !30 + %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !25 + %70 = add i32 %68, %69, !dbg !28 + %71 = mul nuw nsw i32 %61, %.lobit, !dbg !31 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 2, i32 31), !dbg !25 + %73 = add i32 %71, %72, !dbg !28 + %74 = trunc i32 %32 to i1, !dbg !32 + %75 = icmp sge i32 %64, %67, !dbg !32 + %76 = icmp ne i32 %64, %67, !dbg !32 + %77 = icmp sle i32 %70, %73, !dbg !32 + %78 = or i1 %76, %77, !dbg !32 + %79 = and i1 %75, %78, !dbg !32 + %.not3 = xor i1 %79, %74, !dbg !32 + %80 = xor i32 %64, %67, !dbg !33 + %81 = select i1 %.not3, i32 0, i32 %80, !dbg !34 + %82 = xor i32 %81, %58, !dbg !35 + %83 = xor i32 %70, %73, !dbg !36 + %84 = select i1 %.not3, i32 0, i32 %83, !dbg !37 + %85 = xor i32 %84, %61, !dbg !38 + %86 = mul nuw nsw i32 %82, %34, !dbg !24 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 1, i32 31), !dbg !25 + %88 = add i32 %86, %87, !dbg !28 + %89 = mul nuw nsw i32 %82, %31, !dbg !29 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 1, i32 31), !dbg !25 + %91 = add i32 %89, %90, !dbg !28 + %92 = mul nuw nsw i32 %85, %34, !dbg !30 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !25 + %94 = add i32 %92, %93, !dbg !28 + %95 = mul nuw nsw i32 %85, %31, !dbg !31 + %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 1, i32 31), !dbg !25 + %97 = add i32 %95, %96, !dbg !28 + %98 = icmp sge i32 %88, %91, !dbg !32 + %99 = icmp ne i32 %88, %91, !dbg !32 + %100 = icmp sle i32 %94, %97, !dbg !32 + %101 = or i1 %99, %100, !dbg !32 + %102 = and i1 %98, %101, !dbg !32 + %.not4 = xor i1 %102, %74, !dbg !32 + %103 = xor i32 %88, %91, !dbg !33 + %104 = select i1 %.not4, i32 0, i32 %103, !dbg !34 + %105 = xor i32 %104, %82, !dbg !35 + %106 = xor i32 %94, %97, !dbg !36 + %107 = select i1 %.not4, i32 0, i32 %106, !dbg !37 + %108 = xor i32 %107, %85, !dbg !38 + %109 = mul nuw nsw i32 %105, %36, !dbg !24 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !25 + %111 = add i32 %109, %110, !dbg !28 + %112 = mul nuw nsw i32 %105, %.lobit1, !dbg !29 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 4, i32 31), !dbg !25 + %114 = add i32 %112, %113, !dbg !28 + %115 = mul nuw nsw i32 %108, %36, !dbg !30 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 4, i32 31), !dbg !25 + %117 = add i32 %115, %116, !dbg !28 + %118 = mul nuw nsw i32 %108, %.lobit1, !dbg !31 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 4, i32 31), !dbg !25 + %120 = add i32 %118, %119, !dbg !28 + %121 = trunc i32 %33 to i1, !dbg !32 + %122 = icmp sge i32 %111, %114, !dbg !32 + %123 = icmp ne i32 %111, %114, !dbg !32 + %124 = icmp sle i32 %117, %120, !dbg !32 + %125 = or i1 %123, %124, !dbg !32 + %126 = and i1 %122, %125, !dbg !32 + %.not5 = xor i1 %126, %121, !dbg !32 + %127 = xor i32 %111, %114, !dbg !33 + %128 = select i1 %.not5, i32 0, i32 %127, !dbg !34 + %129 = xor i32 %128, %105, !dbg !35 + %130 = xor i32 %117, %120, !dbg !36 + %131 = select i1 %.not5, i32 0, i32 %130, !dbg !37 + %132 = xor i32 %131, %108, !dbg !38 + %133 = mul nuw nsw i32 %129, %35, !dbg !24 + %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !25 + %135 = add i32 %133, %134, !dbg !28 + %136 = mul nuw nsw i32 %129, %.lobit, !dbg !29 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 2, i32 31), !dbg !25 + %138 = add i32 %136, %137, !dbg !28 + %139 = mul nuw nsw i32 %132, %35, !dbg !30 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 2, i32 31), !dbg !25 + %141 = add i32 %139, %140, !dbg !28 + %142 = mul nuw nsw i32 %132, %.lobit, !dbg !31 + %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 2, i32 31), !dbg !25 + %144 = add i32 %142, %143, !dbg !28 + %145 = icmp sge i32 %135, %138, !dbg !32 + %146 = icmp ne i32 %135, %138, !dbg !32 + %147 = icmp sle i32 %141, %144, !dbg !32 + %148 = or i1 %146, %147, !dbg !32 + %149 = and i1 %145, %148, !dbg !32 + %.not6 = xor i1 %149, %121, !dbg !32 + %150 = xor i32 %135, %138, !dbg !33 + %151 = select i1 %.not6, i32 0, i32 %150, !dbg !34 + %152 = xor i32 %151, %129, !dbg !35 + %153 = xor i32 %141, %144, !dbg !36 + %154 = select i1 %.not6, i32 0, i32 %153, !dbg !37 + %155 = xor i32 %154, %132, !dbg !38 + %156 = mul nuw nsw i32 %152, %34, !dbg !24 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 1, i32 31), !dbg !25 + %158 = add i32 %156, %157, !dbg !28 + %159 = mul nuw nsw i32 %152, %31, !dbg !29 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 1, i32 31), !dbg !25 + %161 = add i32 %159, %160, !dbg !28 + %162 = mul nuw nsw i32 %155, %34, !dbg !30 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 1, i32 31), !dbg !25 + %164 = add i32 %162, %163, !dbg !28 + %165 = mul nuw nsw i32 %155, %31, !dbg !31 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !25 + %167 = add i32 %165, %166, !dbg !28 + %168 = icmp sge i32 %158, %161, !dbg !32 + %169 = icmp ne i32 %158, %161, !dbg !32 + %170 = icmp sle i32 %164, %167, !dbg !32 + %171 = or i1 %169, %170, !dbg !32 + %172 = and i1 %168, %171, !dbg !32 + %.not7 = xor i1 %172, %121, !dbg !32 + %173 = xor i32 %158, %161, !dbg !33 + %174 = select i1 %.not7, i32 0, i32 %173, !dbg !34 + %175 = xor i32 %174, %152, !dbg !35 + %176 = xor i32 %164, %167, !dbg !36 + %177 = select i1 %.not7, i32 0, i32 %176, !dbg !37 + %178 = xor i32 %177, %155, !dbg !38 + %179 = mul nuw nsw i32 %175, %37, !dbg !24 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !25 + %181 = add i32 %179, %180, !dbg !28 + %182 = mul nuw nsw i32 %175, %.lobit2, !dbg !29 + %183 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 8, i32 31), !dbg !25 + %184 = add i32 %182, %183, !dbg !28 + %185 = mul nuw nsw i32 %178, %37, !dbg !30 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !25 + %187 = add i32 %185, %186, !dbg !28 + %188 = mul nuw nsw i32 %178, %.lobit2, !dbg !31 + %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 8, i32 31), !dbg !25 + %190 = add i32 %188, %189, !dbg !28 + %191 = icmp slt i32 %181, %184, !dbg !39 + %192 = icmp eq i32 %181, %184, !dbg !40 + %193 = icmp sgt i32 %187, %190, !dbg !41 + %194 = and i1 %192, %193, !dbg !42 + %195 = or i1 %191, %194, !dbg !43 + %196 = xor i32 %181, %184, !dbg !33 + %197 = select i1 %195, i32 %196, i32 0, !dbg !34 + %198 = xor i32 %197, %175, !dbg !35 + %199 = xor i32 %187, %190, !dbg !36 + %200 = select i1 %195, i32 %199, i32 0, !dbg !37 + %201 = xor i32 %200, %178, !dbg !38 + %202 = mul nuw nsw i32 %198, %36, !dbg !24 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !25 + %204 = add i32 %202, %203, !dbg !28 + %205 = mul nuw nsw i32 %198, %.lobit1, !dbg !29 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 4, i32 31), !dbg !25 + %207 = add i32 %205, %206, !dbg !28 + %208 = mul nuw nsw i32 %201, %36, !dbg !30 + %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 4, i32 31), !dbg !25 + %210 = add i32 %208, %209, !dbg !28 + %211 = mul nuw nsw i32 %201, %.lobit1, !dbg !31 + %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 4, i32 31), !dbg !25 + %213 = add i32 %211, %212, !dbg !28 + %214 = icmp slt i32 %204, %207, !dbg !39 + %215 = icmp eq i32 %204, %207, !dbg !40 + %216 = icmp sgt i32 %210, %213, !dbg !41 + %217 = and i1 %215, %216, !dbg !42 + %218 = or i1 %214, %217, !dbg !43 + %219 = xor i32 %204, %207, !dbg !33 + %220 = select i1 %218, i32 %219, i32 0, !dbg !34 + %221 = xor i32 %220, %198, !dbg !35 + %222 = xor i32 %210, %213, !dbg !36 + %223 = select i1 %218, i32 %222, i32 0, !dbg !37 + %224 = xor i32 %223, %201, !dbg !38 + %225 = mul nuw nsw i32 %221, %35, !dbg !24 + %226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %225, i32 2, i32 31), !dbg !25 + %227 = add i32 %225, %226, !dbg !28 + %228 = mul nuw nsw i32 %221, %.lobit, !dbg !29 + %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !25 + %230 = add i32 %228, %229, !dbg !28 + %231 = mul nuw nsw i32 %224, %35, !dbg !30 + %232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 2, i32 31), !dbg !25 + %233 = add i32 %231, %232, !dbg !28 + %234 = mul nuw nsw i32 %224, %.lobit, !dbg !31 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 2, i32 31), !dbg !25 + %236 = add i32 %234, %235, !dbg !28 + %237 = icmp slt i32 %227, %230, !dbg !39 + %238 = icmp eq i32 %227, %230, !dbg !40 + %239 = icmp sgt i32 %233, %236, !dbg !41 + %240 = and i1 %238, %239, !dbg !42 + %241 = or i1 %237, %240, !dbg !43 + %242 = xor i32 %227, %230, !dbg !33 + %243 = select i1 %241, i32 %242, i32 0, !dbg !34 + %244 = xor i32 %243, %221, !dbg !35 + %245 = xor i32 %233, %236, !dbg !36 + %246 = select i1 %241, i32 %245, i32 0, !dbg !37 + %247 = xor i32 %246, %224, !dbg !38 + %248 = mul nuw nsw i32 %244, %34, !dbg !24 + %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !25 + %250 = add i32 %248, %249, !dbg !28 + %251 = mul nuw nsw i32 %244, %31, !dbg !29 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 1, i32 31), !dbg !25 + %253 = add i32 %251, %252, !dbg !28 + %254 = mul nuw nsw i32 %247, %34, !dbg !30 + %255 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %254, i32 1, i32 31), !dbg !25 + %256 = add i32 %254, %255, !dbg !28 + %257 = mul nuw nsw i32 %247, %31, !dbg !31 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !25 + %259 = add i32 %257, %258, !dbg !28 + %260 = icmp slt i32 %250, %253, !dbg !39 + %261 = icmp eq i32 %250, %253, !dbg !40 + %262 = icmp sgt i32 %256, %259, !dbg !41 + %263 = and i1 %261, %262, !dbg !42 + %264 = or i1 %260, %263, !dbg !43 + %265 = xor i32 %256, %259, !dbg !36 + %266 = select i1 %264, i32 %265, i32 0, !dbg !37 + %267 = xor i32 %266, %247, !dbg !38 + %narrow = select i1 %10, i32 %27, i32 0, !dbg !44 + %268 = sext i32 %narrow to i64, !dbg !44 + %269 = ashr i32 %narrow, 31, !dbg !45 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %narrow, i32 8, i32 31), !dbg !45 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 8, i32 31), !dbg !45 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !45 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !45 + %274 = bitcast <2 x i32> %273 to i64, !dbg !45 + %275 = add i64 %274, %268, !dbg !47 + %extelt.offset = lshr i64 %275, 32, !dbg !45 + %276 = trunc nuw i64 %extelt.offset to i32, !dbg !45 + %277 = trunc i64 %275 to i32, !dbg !45 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 4, i32 31), !dbg !45 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 4, i32 31), !dbg !45 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !45 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !45 + %282 = bitcast <2 x i32> %281 to i64, !dbg !45 + %283 = add i64 %275, %282, !dbg !47 + %extelt.offset8 = lshr i64 %283, 32, !dbg !45 + %284 = trunc nuw i64 %extelt.offset8 to i32, !dbg !45 + %285 = trunc i64 %283 to i32, !dbg !45 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 2, i32 31), !dbg !45 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 2, i32 31), !dbg !45 + %288 = insertelement <2 x i32> poison, i32 %286, i64 0, !dbg !45 + %289 = insertelement <2 x i32> %288, i32 %287, i64 1, !dbg !45 + %290 = bitcast <2 x i32> %289 to i64, !dbg !45 + %291 = add i64 %283, %290, !dbg !47 + %extelt.offset9 = lshr i64 %291, 32, !dbg !45 + %292 = trunc nuw i64 %extelt.offset9 to i32, !dbg !45 + %293 = trunc i64 %291 to i32, !dbg !45 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !45 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !45 + %296 = insertelement <2 x i32> poison, i32 %294, i64 0, !dbg !45 + %297 = insertelement <2 x i32> %296, i32 %295, i64 1, !dbg !45 + %298 = bitcast <2 x i32> %297 to i64, !dbg !45 + %299 = add i64 %291, %298, !dbg !47 + %300 = trunc i64 %299 to i32, !dbg !48 + %301 = icmp slt i64 %3, 2, !dbg !49 + %302 = icmp sgt i64 %3, 1, !dbg !50 + %303 = select i1 %302, i64 %3, i64 0, !dbg !51 + %304 = zext i1 %301 to i64, !dbg !52 + %305 = add i64 %303, %304, !dbg !53 + %306 = mul i64 %17, %305, !dbg !54 + %.idx = shl nuw nsw i64 %.decomposed, 6, !dbg !55 + %307 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx, !dbg !55 + %308 = getelementptr i32, ptr addrspace(1) %307, i64 %16, !dbg !55 + %309 = getelementptr i32, ptr addrspace(1) %308, i64 %306, !dbg !55 + %310 = and i32 %11, 48, !dbg !56 + %311 = icmp eq i32 %310, 0, !dbg !56 + %312 = and i1 %311, %10, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %267, ptr addrspace(1) %309, i1 %312) #3, !dbg !56 + %313 = mul i64 %14, %305, !dbg !57 + %314 = getelementptr i32, ptr addrspace(1) %2, i64 %.decomposed, !dbg !58 + %315 = getelementptr i32, ptr addrspace(1) %314, i64 %313, !dbg !58 + %316 = and i32 %11, 63, !dbg !59 + %317 = icmp eq i32 %316, 0, !dbg !59 + %318 = and i1 %317, %10, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %300, ptr addrspace(1) %315, i1 %318) #3, !dbg !59 + ret void, !dbg !60 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 25, column: 21, scope: !4) +!9 = !DILocation(line: 26, column: 38, scope: !4) +!10 = !DILocation(line: 32, column: 19, scope: !4) +!11 = !DILocation(line: 33, column: 19, scope: !4) +!12 = !DILocation(line: 35, column: 37, scope: !4) +!13 = !DILocation(line: 35, column: 45, scope: !4) +!14 = !DILocation(line: 35, column: 54, scope: !4) +!15 = !DILocation(line: 35, column: 64, scope: !4) +!16 = !DILocation(line: 35, column: 68, scope: !4) +!17 = !DILocation(line: 35, column: 30, scope: !4) +!18 = !DILocation(line: 35, column: 73, scope: !4) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 40, column: 67, scope: !4) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 43, column: 34, scope: !4) +!45 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !46) +!46 = !DILocation(line: 44, column: 26, scope: !4) +!47 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !46) +!48 = !DILocation(line: 47, column: 21, scope: !4) +!49 = !DILocation(line: 48, column: 62, scope: !4) +!50 = !DILocation(line: 48, column: 88, scope: !4) +!51 = !DILocation(line: 48, column: 79, scope: !4) +!52 = !DILocation(line: 48, scope: !4) +!53 = !DILocation(line: 48, column: 70, scope: !4) +!54 = !DILocation(line: 48, column: 47, scope: !4) +!55 = !DILocation(line: 48, column: 25, scope: !4) +!56 = !DILocation(line: 48, column: 102, scope: !4) +!57 = !DILocation(line: 49, column: 34, scope: !4) +!58 = !DILocation(line: 49, column: 25, scope: !4) +!59 = !DILocation(line: 49, column: 89, scope: !4) +!60 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..65221c321bd83b7592b487b0f8cca6db4ffd33df --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,918 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u64 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_7 +) +.reqntid 64 +{ + .reg .pred %p<67>; + .reg .b32 %r<224>; + .reg .b64 %rd<65>; + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 + +// %bb.0: + ld.param.b32 %r3, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3]; +$L__tmp0: + .loc 1 23 28 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 26 38 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:26:38 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 15; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + cvt.u64.u32 %rd1, %r4; + .loc 1 33 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:33:19 + and.b64 %rd11, %rd9, -4294967296; + setp.ne.b64 %p1, %rd11, 0; + cvt.u32.u64 %r223, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd64, %rd1, %rd9; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r5, %rd9; + div.u32 %r7, %r223, %r5; + cvt.u64.u32 %rd64, %r7; +$L__BB0_3: + .loc 1 25 21 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:25:21 + setp.lt.s32 %p2, %r223, %r3; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + mul.lo.s64 %rd20, %rd64, %rd9; + sub.s64 %rd21, %rd1, %rd20; + .loc 1 35 37 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:37 + cvt.u64.u32 %rd22, %r2; + .loc 1 35 45 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:45 + shl.b64 %rd23, %rd64, 4; + .loc 1 35 54 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:54 + mul.lo.s64 %rd24, %rd9, %rd22; + .loc 1 35 68 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:68 + mul.lo.s64 %rd25, %rd9, %rd64; + .loc 1 35 30 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:30 + shl.b64 %rd26, %rd21, 2; + add.s64 %rd27, %rd6, %rd26; + mul.wide.u32 %rd28, %r2, 4; + add.s64 %rd29, %rd27, %rd28; + shl.b64 %rd30, %rd64, 6; + add.s64 %rd31, %rd29, %rd30; + shl.b64 %rd32, %rd24, 2; + add.s64 %rd33, %rd31, %rd32; + shl.b64 %rd34, %rd25, 6; + add.s64 %rd13, %rd33, %rd34; + .loc 1 35 73 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:73 + // begin inline asm + mov.u64 %rd14, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd14, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r8, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r8 }, [ %rd13 + 0 ], %rd14; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r9 }, [ %rd13 + 0 ], %rd17; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shr.u32 %r13, %r1, 1; + bfe.u32 %r14, %r1, 1, 1; + and.b32 %r15, %r1, 1; + shr.u32 %r16, %r1, 2; + bfe.u32 %r17, %r1, 2, 1; + shr.u32 %r18, %r1, 3; + bfe.u32 %r19, %r1, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r20, %r15, 1; + xor.b32 %r21, %r14, 1; + xor.b32 %r22, %r17, 1; + xor.b32 %r23, %r19, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r24, %r9, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r25, %r24, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r26, %r24, %r25; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r27, %r9, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r28, %r27, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r29, %r28, %r27; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r30, %r20, %r2; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r31, %r30, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r32, %r31, %r30; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r33, %r2, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r34, %r33, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r35, %r34, %r33; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r36, %r13, 1; + setp.ne.b32 %p6, %r36, 0; + setp.ge.s32 %p7, %r26, %r29; + setp.ne.b32 %p8, %r26, %r29; + setp.le.s32 %p9, %r32, %r35; + or.pred %p10, %p8, %p9; + and.pred %p11, %p7, %p10; + xor.pred %p12, %p11, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r37, %r29, %r26; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r38, 0, %r37, %p12; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r39, %r38, %r9; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r40, %r35, %r32; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r41, 0, %r40, %p12; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r42, %r41, %r2; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r43, %r39, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r44, %r43, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r45, %r43, %r44; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r46, %r39, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r47, %r46, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r48, %r46, %r47; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r49, %r42, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r51, %r49, %r50; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r52, %r42, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r53, %r52, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r54, %r52, %r53; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r55, %r16, 1; + setp.ne.b32 %p13, %r55, 0; + setp.ge.s32 %p14, %r45, %r48; + setp.ne.b32 %p15, %r45, %r48; + setp.le.s32 %p16, %r51, %r54; + or.pred %p17, %p15, %p16; + and.pred %p18, %p14, %p17; + xor.pred %p19, %p18, %p13; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r56, %r45, %r48; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r57, 0, %r56, %p19; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r58, %r57, %r39; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r59, %r51, %r54; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r60, 0, %r59, %p19; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r61, %r60, %r42; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r62, %r58, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r63, %r62, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r64, %r62, %r63; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r65, %r58, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r66, %r65, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r67, %r65, %r66; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r68, %r61, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r69, %r68, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r70, %r68, %r69; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r71, %r61, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r73, %r71, %r72; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.ge.s32 %p20, %r64, %r67; + setp.ne.b32 %p21, %r64, %r67; + setp.le.s32 %p22, %r70, %r73; + or.pred %p23, %p21, %p22; + and.pred %p24, %p20, %p23; + xor.pred %p25, %p24, %p13; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r74, %r64, %r67; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r75, 0, %r74, %p25; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r76, %r75, %r58; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r77, %r70, %r73; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r78, 0, %r77, %p25; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r79, %r78, %r61; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r80, %r76, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r81, %r80, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r82, %r80, %r81; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r83, %r76, %r17; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r85, %r83, %r84; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r86, %r79, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r88, %r86, %r87; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r89, %r79, %r17; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r90, %r89, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r91, %r89, %r90; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r92, %r18, 1; + setp.ne.b32 %p26, %r92, 0; + setp.ge.s32 %p27, %r82, %r85; + setp.ne.b32 %p28, %r82, %r85; + setp.le.s32 %p29, %r88, %r91; + or.pred %p30, %p28, %p29; + and.pred %p31, %p27, %p30; + xor.pred %p32, %p31, %p26; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r93, %r82, %r85; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r94, 0, %r93, %p32; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r95, %r94, %r76; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r96, %r88, %r91; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r97, 0, %r96, %p32; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r98, %r97, %r79; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r99, %r95, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r101, %r99, %r100; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r102, %r95, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r103, %r102, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r104, %r102, %r103; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r105, %r98, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r106, %r105, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r107, %r105, %r106; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r108, %r98, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r109, %r108, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r110, %r108, %r109; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.ge.s32 %p33, %r101, %r104; + setp.ne.b32 %p34, %r101, %r104; + setp.le.s32 %p35, %r107, %r110; + or.pred %p36, %p34, %p35; + and.pred %p37, %p33, %p36; + xor.pred %p38, %p37, %p26; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r111, %r101, %r104; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r112, 0, %r111, %p38; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r113, %r112, %r95; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r114, %r107, %r110; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r115, 0, %r114, %p38; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r116, %r115, %r98; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r117, %r113, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r118, %r117, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r119, %r117, %r118; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r120, %r113, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r121, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r122, %r120, %r121; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r123, %r116, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r125, %r123, %r124; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r126, %r116, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r127, %r126, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r128, %r126, %r127; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.ge.s32 %p39, %r119, %r122; + setp.ne.b32 %p40, %r119, %r122; + setp.le.s32 %p41, %r125, %r128; + or.pred %p42, %p40, %p41; + and.pred %p43, %p39, %p42; + xor.pred %p44, %p43, %p26; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r129, %r119, %r122; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r130, 0, %r129, %p44; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r131, %r130, %r113; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r132, %r125, %r128; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r133, 0, %r132, %p44; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r134, %r133, %r116; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r135, %r131, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r136, %r135, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r137, %r135, %r136; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r138, %r131, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r139, %r138, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r140, %r138, %r139; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r141, %r134, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r142, %r141, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r143, %r141, %r142; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r144, %r134, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r145, %r144, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r146, %r144, %r145; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p45, %r137, %r140; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p46, %r137, %r140; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p47, %r143, %r146; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p48, %p46, %p47; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p49, %p45, %p48; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r147, %r137, %r140; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r148, %r147, 0, %p49; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r149, %r148, %r131; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r150, %r143, %r146; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r151, %r150, 0, %p49; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r152, %r151, %r134; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r153, %r149, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r154, %r153, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r155, %r153, %r154; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r156, %r149, %r17; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r157, %r156, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r158, %r156, %r157; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r159, %r152, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r160, %r159, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r161, %r159, %r160; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r162, %r152, %r17; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r163, %r162, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r164, %r162, %r163; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p50, %r155, %r158; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p51, %r155, %r158; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p52, %r161, %r164; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p53, %p51, %p52; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p54, %p50, %p53; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r165, %r155, %r158; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r166, %r165, 0, %p54; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r167, %r166, %r149; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r168, %r161, %r164; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r169, %r168, 0, %p54; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r170, %r169, %r152; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r171, %r167, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r172, %r171, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r173, %r171, %r172; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r174, %r167, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r175, %r174, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r176, %r174, %r175; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r177, %r170, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r178, %r177, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r179, %r177, %r178; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r180, %r170, %r14; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r181, %r180, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r182, %r180, %r181; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p55, %r173, %r176; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p56, %r173, %r176; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p57, %r179, %r182; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p58, %p56, %p57; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p59, %p55, %p58; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r183, %r173, %r176; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r184, %r183, 0, %p59; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r185, %r184, %r167; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r186, %r179, %r182; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r187, %r186, 0, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r188, %r187, %r170; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r189, %r185, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r190, %r189, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r191, %r189, %r190; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r192, %r185, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r193, %r192, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r194, %r192, %r193; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r195, %r188, %r20; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r196, %r195, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r197, %r195, %r196; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r198, %r188, %r15; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r199, %r198, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r200, %r198, %r199; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p60, %r191, %r194; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p61, %r191, %r194; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p62, %r197, %r200; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r201, %r197, %r200; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r202, %r201, 0, %p62; + selp.b32 %r203, %r202, 0, %p61; + selp.b32 %r204, %r201, %r203, %p60; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r10, %r204, %r188; +$L__tmp2: + .loc 1 43 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:43:34 + selp.b32 %r205, %r8, 0, %p2; + cvt.s64.s32 %rd35, %r205; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + shr.s32 %r206, %r205, 31; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + shfl.sync.bfly.b32 %r208, %r206, 8, 31, -1; + cvt.u64.u32 %rd36, %r207; + cvt.u64.u32 %rd37, %r208; + shl.b64 %rd38, %rd37, 32; + or.b64 %rd39, %rd36, %rd38; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd40, %rd39, %rd35; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r209}, %rd40; + cvt.u32.u64 %r210, %rd40; + shfl.sync.bfly.b32 %r211, %r210, 4, 31, -1; + shfl.sync.bfly.b32 %r212, %r209, 4, 31, -1; + cvt.u64.u32 %rd41, %r211; + cvt.u64.u32 %rd42, %r212; + shl.b64 %rd43, %rd42, 32; + or.b64 %rd44, %rd41, %rd43; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd45, %rd40, %rd44; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r213}, %rd45; + cvt.u32.u64 %r214, %rd45; + shfl.sync.bfly.b32 %r215, %r214, 2, 31, -1; + shfl.sync.bfly.b32 %r216, %r213, 2, 31, -1; + cvt.u64.u32 %rd46, %r215; + cvt.u64.u32 %rd47, %r216; + shl.b64 %rd48, %rd47, 32; + or.b64 %rd49, %rd46, %rd48; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd50, %rd45, %rd49; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r217}, %rd50; + cvt.u32.u64 %r218, %rd50; + shfl.sync.bfly.b32 %r219, %r218, 1, 31, -1; + shfl.sync.bfly.b32 %r220, %r217, 1, 31, -1; + cvt.u64.u32 %rd51, %r219; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd52, %rd50, %rd51; +$L__tmp4: + .loc 1 47 21 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:47:21 + cvt.u32.u64 %r11, %rd52; + .loc 1 48 62 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:62 + setp.lt.s64 %p63, %rd9, 2; + .loc 1 48 88 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:88 + setp.gt.s64 %p64, %rd9, 1; + .loc 1 48 79 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:79 + selp.b64 %rd53, %rd9, 0, %p64; + .loc 1 48 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48 + selp.b64 %rd54, 1, 0, %p63; + .loc 1 48 70 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:70 + add.s64 %rd55, %rd53, %rd54; + .loc 1 48 47 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:47 + mul.lo.s64 %rd56, %rd23, %rd55; + .loc 1 48 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:25 + shl.b64 %rd57, %rd21, 6; + add.s64 %rd58, %rd7, %rd57; + add.s64 %rd59, %rd58, %rd28; + shl.b64 %rd60, %rd56, 2; + add.s64 %rd18, %rd59, %rd60; + .loc 1 48 102 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:102 + and.b32 %r221, %r1, 48; + setp.eq.b32 %p65, %r221, 0; + and.pred %p4, %p65, %p2; + // begin inline asm + @%p4 st.global.b32 [ %rd18 + 0 ], { %r10 }; + // end inline asm + .loc 1 49 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:34 + mul.lo.s64 %rd61, %rd64, %rd55; + .loc 1 49 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:25 + add.s64 %rd62, %rd8, %rd26; + shl.b64 %rd63, %rd61, 2; + add.s64 %rd19, %rd62, %rd63; + .loc 1 49 89 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:89 + and.b32 %r222, %r1, 63; + setp.eq.b32 %p66, %r222, 0; + and.pred %p5, %p66, %p2; + // begin inline asm + @%p5 st.global.b32 [ %rd19 + 0 ], { %r11 }; + // end inline asm + .loc 1 49 4 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 101 +.b8 53 +.b8 52 +.b8 113 +.b8 100 +.b8 122 +.b8 117 +.b8 100 +.b8 54 +.b8 120 +.b8 115 +.b8 107 +.b8 104 +.b8 118 +.b8 106 +.b8 115 +.b8 117 +.b8 98 +.b8 122 +.b8 113 +.b8 98 +.b8 118 +.b8 105 +.b8 111 +.b8 98 +.b8 116 +.b8 111 +.b8 102 +.b8 105 +.b8 55 +.b8 114 +.b8 116 +.b8 108 +.b8 97 +.b8 51 +.b8 99 +.b8 122 +.b8 51 +.b8 102 +.b8 118 +.b8 117 +.b8 102 +.b8 109 +.b8 101 +.b8 107 +.b8 102 +.b8 121 +.b8 52 +.b8 113 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..26f6de8a885ce893d0c2016e47b5ddebe70b4fc3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc105 = loc(unknown) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc157 = loc("in_ptr0"(#loc)) +#loc158 = loc("out_ptr2"(#loc)) +#loc159 = loc("out_ptr3"(#loc)) +#loc160 = loc("ks0"(#loc)) +#loc161 = loc("xnumel"(#loc)) +#loc162 = loc("r0_numel"(#loc)) +#loc194 = loc("x"(#loc55)) +#loc195 = loc("idxs"(#loc55)) +#loc196 = loc("x"(#loc59)) +#loc197 = loc("idxs"(#loc59)) +#loc202 = loc("x"(#loc67)) +#loc203 = loc("idxs"(#loc67)) +#loc204 = loc("flip"(#loc67)) +#loc260 = loc("input"(#loc130)) +#loc261 = loc("a"(#loc134)) +#loc262 = loc("b"(#loc134)) +#loc264 = loc("x"(#loc139)) +#loc265 = loc("x"(#loc143)) +#loc266 = loc("input"(#loc152)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16 : i32 loc(#loc163) + %xoffset = tt.get_program_id x : i32 loc(#loc164) + %xoffset_1 = arith.constant 1 : i32 loc(#loc165) + %xoffset_2 = arith.constant 1 : i32 loc(#loc165) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc165) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc166) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc167) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc168) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc168) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc169) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc169) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc170) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc171) + %r0_offset = arith.constant 0 : i32 loc(#loc172) + %r0_mask = arith.constant true loc(#loc173) + %r0_mask_9 = arith.constant dense : tensor<1x16xi1> loc(#loc173) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc174) + %x0_10 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc174) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<1x1xi64> loc(#loc174) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc175) + %x1_12 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc175) + %x1_13 = arith.divsi %x1, %x1_12 : tensor<1x1xi64> loc(#loc175) + %tmp0 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc176) + %tmp0_14 = tt.broadcast %x0_11 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc176) + %tmp0_15 = arith.addi %tmp0, %tmp0_14 : tensor<1x16xi64> loc(#loc176) + %tmp0_16 = arith.constant 16 : i32 loc(#loc177) + %tmp0_17 = arith.constant 16 : i64 loc(#loc177) + %tmp0_18 = arith.constant dense<16> : tensor<1x1xi64> loc(#loc177) + %tmp0_19 = arith.muli %tmp0_18, %x1_13 : tensor<1x1xi64> loc(#loc177) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc178) + %tmp0_21 = arith.addi %tmp0_15, %tmp0_20 : tensor<1x16xi64> loc(#loc178) + %tmp0_22 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc179) + %tmp0_23 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc179) + %tmp0_24 = arith.muli %tmp0_23, %tmp0_22 : tensor<1x16xi64> loc(#loc179) + %tmp0_25 = arith.addi %tmp0_21, %tmp0_24 : tensor<1x16xi64> loc(#loc180) + %tmp0_26 = arith.constant 16 : i32 loc(#loc181) + %tmp0_27 = arith.constant 16 : i64 loc(#loc181) + %tmp0_28 = arith.muli %tmp0_27, %ks0 : i64 loc(#loc181) + %tmp0_29 = tt.splat %tmp0_28 : i64 -> tensor<1x1xi64> loc(#loc182) + %tmp0_30 = arith.muli %tmp0_29, %x1_13 : tensor<1x1xi64> loc(#loc182) + %tmp0_31 = tt.broadcast %tmp0_30 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc183) + %tmp0_32 = arith.addi %tmp0_25, %tmp0_31 : tensor<1x16xi64> loc(#loc183) + %tmp0_33 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc184) + %tmp0_34 = tt.addptr %tmp0_33, %tmp0_32 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc184) + %tmp0_35 = arith.constant 0.000000e+00 : f32 loc(#loc185) + %tmp0_36 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc185) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc185) + %tmp0_38 = arith.fptosi %tmp0_37 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc185) + %tmp0_39 = tt.load %tmp0_34, %tmp0_36, %tmp0_38 evictionPolicy = evict_last : tensor<1x16x!tt.ptr> loc(#loc185) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc186) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_39, %tmp2) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc25) + %tmp7 = arith.extsi %tmp0_39 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc187) + %tmp10 = arith.constant 0 : i32 loc(#loc188) + %tmp10_40 = arith.constant 0 : i64 loc(#loc188) + %tmp10_41 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc188) + %tmp10_42 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc188) + %tmp10_43 = arith.select %tmp10_42, %tmp7, %tmp10_41 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc188) + %tmp11 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_43) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc189) + %tmp11_44 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc190) + %tmp12 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc191) + %tmp13 = arith.trunci %tmp12 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc192) + %tmp14 = arith.trunci %tmp11_44 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc193) + %c16_i32 = arith.constant 16 : i32 loc(#loc33) + %c16_i64 = arith.constant 16 : i64 loc(#loc33) + %cst = arith.constant dense<16> : tensor<1x1xi64> loc(#loc33) + %1 = arith.muli %cst, %x0_11 : tensor<1x1xi64> loc(#loc33) + %2 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc34) + %3 = tt.broadcast %1 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc34) + %4 = arith.addi %2, %3 : tensor<1x16xi64> loc(#loc34) + %c16_i32_45 = arith.constant 16 : i32 loc(#loc35) + %c16_i64_46 = arith.constant 16 : i64 loc(#loc35) + %cst_47 = arith.constant dense<16> : tensor<1x1xi64> loc(#loc35) + %5 = arith.muli %cst_47, %x1_13 : tensor<1x1xi64> loc(#loc35) + %c1_i32 = arith.constant 1 : i32 loc(#loc36) + %6 = arith.extsi %c1_i32 : i32 to i64 loc(#loc36) + %7 = arith.cmpi sge, %6, %ks0 : i64 loc(#loc36) + %c1_i32_48 = arith.constant 1 : i32 loc(#loc37) + %c1_i32_49 = arith.constant 1 : i32 loc(#loc37) + %8 = arith.extui %7 : i1 to i32 loc(#loc37) + %9 = arith.muli %c1_i32_49, %8 : i32 loc(#loc37) + %c1_i32_50 = arith.constant 1 : i32 loc(#loc38) + %10 = arith.extsi %c1_i32_50 : i32 to i64 loc(#loc38) + %11 = arith.cmpi sgt, %ks0, %10 : i64 loc(#loc38) + %12 = arith.extui %11 : i1 to i64 loc(#loc39) + %13 = arith.muli %ks0, %12 : i64 loc(#loc39) + %14 = arith.extsi %9 : i32 to i64 loc(#loc40) + %15 = arith.addi %14, %13 : i64 loc(#loc40) + %16 = tt.splat %15 : i64 -> tensor<1x1xi64> loc(#loc41) + %17 = arith.muli %5, %16 : tensor<1x1xi64> loc(#loc41) + %18 = tt.broadcast %17 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc42) + %19 = arith.addi %4, %18 : tensor<1x16xi64> loc(#loc42) + %20 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc43) + %21 = tt.addptr %20, %19 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc43) + %22 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc44) + tt.store %21, %tmp13, %22 : tensor<1x16x!tt.ptr> loc(#loc44) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc45) + %23 = arith.extsi %c1_i32_51 : i32 to i64 loc(#loc45) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc45) + %c1_i32_52 = arith.constant 1 : i32 loc(#loc46) + %c1_i32_53 = arith.constant 1 : i32 loc(#loc46) + %25 = arith.extui %24 : i1 to i32 loc(#loc46) + %26 = arith.muli %c1_i32_53, %25 : i32 loc(#loc46) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc47) + %27 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc47) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc47) + %29 = arith.extui %28 : i1 to i64 loc(#loc48) + %30 = arith.muli %ks0, %29 : i64 loc(#loc48) + %31 = arith.extsi %26 : i32 to i64 loc(#loc49) + %32 = arith.addi %31, %30 : i64 loc(#loc49) + %33 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc50) + %34 = arith.muli %x1_13, %33 : tensor<1x1xi64> loc(#loc50) + %35 = arith.addi %x0_11, %34 : tensor<1x1xi64> loc(#loc51) + %36 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc52) + %37 = tt.addptr %36, %35 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc52) + tt.store %37, %tmp14, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc53) + tt.return loc(#loc54) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc55)), %idxs: tensor<1x16xi16> loc("idxs"(#loc55))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc56) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc56) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc56) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc56) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc57) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc58) + %5 = ub.poison : tensor<1x16xi32> loc(#loc58) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc58) + } loc(#loc55) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc59)), %idxs: tensor<1x16xi16> loc("idxs"(#loc59))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc198) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc199) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc199) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc200) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc201) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc65) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc66) + %2 = ub.poison : tensor<1x16xi32> loc(#loc66) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc66) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi16> loc("idxs"(#loc67)), %flip: tensor<1x16xi32> loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc219) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc220) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc221) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc221) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc222) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc223) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc224) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc225) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc226) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc226) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc227) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc228) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc229) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc237) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc237) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc238) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc243) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc244) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc245) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc246) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc247) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc248) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc249) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc250) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc250) + %cond_38 = arith.constant 0 : i32 loc(#loc251) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc251) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc251) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc256) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc257) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc258) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc259) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc130))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc131) + tt.reduce.return %2 : i32 loc(#loc131) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc131) + tt.return %0 : tensor<8x1xi32> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc133) + tt.return %1 : tensor<8x1xi32> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc134)), %b: i32 loc("b"(#loc134))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc135) + tt.return %0 : i32 loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc137) + tt.return %1 : i32 loc(#loc137) + } loc(#loc134) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc130))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc263) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc131) + tt.reduce.return %2 : i32 loc(#loc131) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc131) + tt.return %0 : tensor<8x1xi32> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc133) + tt.return %1 : tensor<8x1xi32> loc(#loc133) + } loc(#loc130) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc139))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc140) + %false = arith.constant false loc(#loc141) + tt.return %false : i1 loc(#loc141) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc142) + tt.return %1 : i1 loc(#loc142) + } loc(#loc139) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc143))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc144) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc145) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc145) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc145) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc145) + tt.return %4 : tensor<1x16xi32> loc(#loc146) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc147) + tt.return %5 : tensor<1x16xi32> loc(#loc147) + } loc(#loc143) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc149) + %cst = arith.constant dense : tensor<1xi1> loc(#loc149) + tt.return %cst : tensor<1xi1> loc(#loc150) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc151) + tt.return %0 : tensor<1xi1> loc(#loc151) + } loc(#loc148) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc152))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc153) + tt.return %0 : tensor<1x16xi32> loc(#loc154) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc155) + tt.return %1 : tensor<1x16xi32> loc(#loc155) + } loc(#loc152) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc149) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc149) + tt.return %cst : tensor<1x16xi32> loc(#loc150) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc151) + tt.return %0 : tensor<1x16xi32> loc(#loc151) + } loc(#loc148) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc152))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc153) + tt.return %0 : tensor<1x16xi16> loc(#loc154) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc155) + tt.return %1 : tensor<1x16xi16> loc(#loc155) + } loc(#loc152) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc149) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc149) + tt.return %cst : tensor<1x16xi16> loc(#loc150) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc151) + tt.return %0 : tensor<1x16xi16> loc(#loc151) + } loc(#loc148) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc59)), %idxs: tensor<1x16xi32> loc("idxs"(#loc59))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc198) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc199) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc199) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc200) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc201) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc65) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc66) + %3 = ub.poison : tensor<1x16xi32> loc(#loc66) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc66) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: tensor<1x16xi32> loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc237) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc237) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc238) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc250) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc250) + %cond_36 = arith.constant 0 : i32 loc(#loc251) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc251) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc251) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc130))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc131) + tt.reduce.return %2 : i32 loc(#loc131) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc131) + tt.return %0 : tensor<4x2xi32> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc133) + tt.return %1 : tensor<4x2xi32> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: tensor<1x16xi32> loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc237) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc237) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc238) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc250) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc250) + %cond_36 = arith.constant 0 : i32 loc(#loc251) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc251) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc251) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc59)), %idxs: tensor<1x16xi32> loc("idxs"(#loc59))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc198) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc199) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc199) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc200) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc201) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc65) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc66) + %4 = ub.poison : tensor<1x16xi32> loc(#loc66) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc66) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: tensor<1x16xi32> loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc237) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc237) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc238) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc250) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc250) + %cond_36 = arith.constant 0 : i32 loc(#loc251) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc251) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc251) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc130))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc131) + tt.reduce.return %2 : i32 loc(#loc131) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc131) + tt.return %0 : tensor<2x4xi32> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc133) + tt.return %1 : tensor<2x4xi32> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc59)), %idxs: tensor<1x16xi32> loc("idxs"(#loc59))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc267) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc64) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc65) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc66) + %5 = ub.poison : tensor<1x16xi32> loc(#loc66) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc66) + } loc(#loc59) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: i1 loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc237) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc237) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc238) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc250) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc250) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc130))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc131) + tt.reduce.return %2 : i32 loc(#loc131) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc131) + tt.return %0 : tensor<1x8xi32> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc133) + tt.return %1 : tensor<1x8xi32> loc(#loc133) + } loc(#loc130) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: i1 loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc237) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc237) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc238) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc250) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc250) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: i1 loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc237) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc237) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc238) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc250) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc250) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc67)), %idxs: tensor<1x16xi32> loc("idxs"(#loc67)), %flip: i1 loc("flip"(#loc67))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc205) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc206) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc207) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc207) + %left_mask = arith.constant 1 : i32 loc(#loc208) + %left_mask_2 = arith.constant 1 : i32 loc(#loc208) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc208) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc208) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc209) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc209) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc210) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc211) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc212) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc213) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc213) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc214) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc215) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc216) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc217) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc218) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc219) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc221) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc221) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc222) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc223) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc224) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc226) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc226) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc227) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc228) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc229) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc230) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc231) + %left_valid_mask = arith.constant true loc(#loc232) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc232) + %right_valid_mask = arith.constant true loc(#loc233) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc233) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc234) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc235) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc268) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc100) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc237) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc237) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc237) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc238) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc269) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc269) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc105) + } loc(#loc101) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc270) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc107) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc241) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc271) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc271) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc105) + } loc(#loc108) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc243) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc244) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc245) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc246) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc247) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc248) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc249) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc250) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc250) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc252) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc253) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc255) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc256) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc257) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc258) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc259) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc128) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc129) + %5 = ub.poison : tensor<1x16xi32> loc(#loc129) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc129) + } loc(#loc67) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc130))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc131) + tt.reduce.return %2 : i64 loc(#loc131) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc131) + tt.return %0 : tensor<1xi64> loc(#loc132) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc133) + tt.return %1 : tensor<1xi64> loc(#loc133) + } loc(#loc130) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc134)), %b: i64 loc("b"(#loc134))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc135) + tt.return %0 : i64 loc(#loc136) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc137) + tt.return %1 : i64 loc(#loc137) + } loc(#loc134) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":27:16) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":28:48) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":45:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":46:21) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:43) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:49) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:75) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:66) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:57) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc163 = loc("r0_numel"(#loc1)) +#loc164 = loc("xoffset"(#loc2)) +#loc165 = loc("xoffset"(#loc3)) +#loc166 = loc("xindex"(#loc4)) +#loc167 = loc("xindex"(#loc5)) +#loc168 = loc("xindex"(#loc6)) +#loc169 = loc("xmask"(#loc7)) +#loc170 = loc("r0_index"(#loc8)) +#loc171 = loc("r0_index"(#loc9)) +#loc172 = loc("r0_offset"(#loc10)) +#loc173 = loc("r0_mask"(#loc11)) +#loc174 = loc("x0"(#loc12)) +#loc175 = loc("x1"(#loc13)) +#loc176 = loc("tmp0"(#loc14)) +#loc177 = loc("tmp0"(#loc15)) +#loc178 = loc("tmp0"(#loc16)) +#loc179 = loc("tmp0"(#loc17)) +#loc180 = loc("tmp0"(#loc18)) +#loc181 = loc("tmp0"(#loc19)) +#loc182 = loc("tmp0"(#loc20)) +#loc183 = loc("tmp0"(#loc21)) +#loc184 = loc("tmp0"(#loc22)) +#loc185 = loc("tmp0"(#loc23)) +#loc186 = loc("tmp2"(#loc24)) +#loc187 = loc("tmp7"(#loc26)) +#loc188 = loc("tmp10"(#loc27)) +#loc189 = loc("tmp11"(#loc28)) +#loc190 = loc("tmp11"(#loc29)) +#loc191 = loc("tmp12"(#loc30)) +#loc192 = loc("tmp13"(#loc31)) +#loc193 = loc("tmp14"(#loc32)) +#loc198 = loc("flip"(#loc60)) +#loc199 = loc("flip"(#loc61)) +#loc200 = loc("flip"(#loc62)) +#loc201 = loc("flip"(#loc63)) +#loc205 = loc("y"(#loc68)) +#loc206 = loc("right_mask"(#loc69)) +#loc207 = loc("right_mask"(#loc70)) +#loc208 = loc("left_mask"(#loc71)) +#loc209 = loc("ileft"(#loc72)) +#loc210 = loc("ileft"(#loc73)) +#loc211 = loc("ileft"(#loc74)) +#loc212 = loc("ileft"(#loc75)) +#loc213 = loc("iright"(#loc76)) +#loc214 = loc("iright"(#loc77)) +#loc215 = loc("iright"(#loc78)) +#loc216 = loc("iright"(#loc79)) +#loc217 = loc("ileft"(#loc80)) +#loc218 = loc("iright"(#loc81)) +#loc219 = loc("y_idx"(#loc82)) +#loc220 = loc("left_idx"(#loc83)) +#loc221 = loc("left_idx"(#loc84)) +#loc222 = loc("left_idx"(#loc85)) +#loc223 = loc("left_idx"(#loc86)) +#loc224 = loc("left_idx"(#loc87)) +#loc225 = loc("right_idx"(#loc88)) +#loc226 = loc("right_idx"(#loc89)) +#loc227 = loc("right_idx"(#loc90)) +#loc228 = loc("right_idx"(#loc91)) +#loc229 = loc("right_idx"(#loc92)) +#loc230 = loc("left_idx"(#loc93)) +#loc231 = loc("right_idx"(#loc94)) +#loc232 = loc("left_valid_mask"(#loc95)) +#loc233 = loc("right_valid_mask"(#loc96)) +#loc234 = loc("left_isnan"(#loc97)) +#loc235 = loc("right_isnan"(#loc98)) +#loc236 = loc("cond"(#loc99)) +#loc237 = loc("cond"(#loc102)) +#loc238 = loc("cond"(#loc103)) +#loc239 = loc("cond"(#loc104)) +#loc240 = loc("eq"(#loc106)) +#loc241 = loc("eq"(#loc109)) +#loc242 = loc("eq"(#loc110)) +#loc243 = loc("cond"(#loc111)) +#loc244 = loc("cond"(#loc112)) +#loc245 = loc("cond"(#loc113)) +#loc246 = loc("cond"(#loc114)) +#loc247 = loc("cond"(#loc115)) +#loc248 = loc("cond"(#loc116)) +#loc249 = loc("cond"(#loc117)) +#loc250 = loc("cond"(#loc118)) +#loc251 = loc("cond"(#loc119)) +#loc252 = loc("ret"(#loc120)) +#loc253 = loc("ret"(#loc121)) +#loc254 = loc("ret"(#loc122)) +#loc255 = loc("ret"(#loc123)) +#loc256 = loc("new_idxs"(#loc124)) +#loc257 = loc("new_idxs"(#loc125)) +#loc258 = loc("new_idxs"(#loc126)) +#loc259 = loc("new_idxs"(#loc127)) +#loc263 = loc("input"(#loc138)) +#loc267 = loc("flip"(#loc156)) +#loc268 = loc("cond"(#loc236)) +#loc269 = loc("cond"(#loc239)) +#loc270 = loc("eq"(#loc240)) +#loc271 = loc("eq"(#loc242)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9cd14ce470a051b0d1f9d3714d29a60e528912e3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,854 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc86 = loc("in_ptr0"(#loc)) +#loc87 = loc("out_ptr2"(#loc)) +#loc88 = loc("out_ptr3"(#loc)) +#loc89 = loc("ks0"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc109 = loc(callsite(#loc19 at #loc20)) +#loc115 = loc("ileft"(#loc28)) +#loc119 = loc("iright"(#loc33)) +#loc128 = loc("left_idx"(#loc42)) +#loc133 = loc("right_idx"(#loc47)) +#loc153 = loc("tmp11"(#loc67)) +#loc166 = loc(callsite(#loc24 at #loc109)) +#loc170 = loc(callsite(#loc1 at #loc153)) +#loc174 = loc(callsite(#loc115 at #loc166)) +#loc178 = loc(callsite(#loc119 at #loc166)) +#loc186 = loc(callsite(#loc128 at #loc166)) +#loc191 = loc(callsite(#loc133 at #loc166)) +#loc211 = loc(callsite(#loc1 at #loc174)) +#loc213 = loc(callsite(#loc1 at #loc178)) +#loc216 = loc(callsite(#loc1 at #loc186)) +#loc219 = loc(callsite(#loc1 at #loc191)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc93) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc94) + %r0_index_6 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc94) + %r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc94) + %r0_index_8 = tt.expand_dims %r0_index_6 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc94) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc95) + %x0_9 = arith.remsi %x0, %ks0 : i64 loc(#loc95) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc96) + %tmp0 = arith.extsi %r0_index_7 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc97) + %tmp0_10 = arith.extsi %r0_index_8 : tensor<1x16xi32, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc97) + %tmp0_11 = tt.splat %x0_9 : i64 -> tensor<1x16xi64, #blocked> loc(#loc159) + %tmp0_12 = tt.splat %x0_9 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc159) + %tmp0_13 = arith.addi %tmp0, %tmp0_11 : tensor<1x16xi64, #blocked> loc(#loc97) + %tmp0_14 = arith.addi %tmp0_10, %tmp0_12 : tensor<1x16xi64, #blocked5> loc(#loc97) + %tmp0_15 = arith.muli %x1, %c16_i64 : i64 loc(#loc98) + %tmp0_16 = tt.splat %tmp0_15 : i64 -> tensor<1x16xi64, #blocked> loc(#loc160) + %tmp0_17 = tt.splat %tmp0_15 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc160) + %tmp0_18 = arith.addi %tmp0_13, %tmp0_16 : tensor<1x16xi64, #blocked> loc(#loc99) + %tmp0_19 = arith.addi %tmp0_14, %tmp0_17 : tensor<1x16xi64, #blocked5> loc(#loc99) + %tmp0_20 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc100) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc100) + %tmp0_22 = arith.muli %tmp0_20, %tmp0 : tensor<1x16xi64, #blocked> loc(#loc100) + %tmp0_23 = arith.muli %tmp0_21, %tmp0_10 : tensor<1x16xi64, #blocked5> loc(#loc100) + %tmp0_24 = arith.addi %tmp0_18, %tmp0_22 : tensor<1x16xi64, #blocked> loc(#loc101) + %tmp0_25 = arith.addi %tmp0_19, %tmp0_23 : tensor<1x16xi64, #blocked5> loc(#loc101) + %tmp0_26 = arith.muli %ks0, %c16_i64 : i64 loc(#loc102) + %tmp0_27 = arith.muli %tmp0_26, %x1 : i64 loc(#loc103) + %tmp0_28 = tt.splat %tmp0_27 : i64 -> tensor<1x16xi64, #blocked> loc(#loc161) + %tmp0_29 = tt.splat %tmp0_27 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc161) + %tmp0_30 = arith.addi %tmp0_24, %tmp0_28 : tensor<1x16xi64, #blocked> loc(#loc104) + %tmp0_31 = arith.addi %tmp0_25, %tmp0_29 : tensor<1x16xi64, #blocked5> loc(#loc104) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc105) + %tmp0_33 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc105) + %tmp0_34 = tt.addptr %tmp0_32, %tmp0_30 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi64, #blocked> loc(#loc105) + %tmp0_35 = tt.addptr %tmp0_33, %tmp0_31 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc105) + %tmp0_36 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc162) + %tmp0_37 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc162) + %tmp0_38 = tt.load %tmp0_34, %tmp0_36, %cst_3 evictionPolicy = evict_last : tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + %tmp0_39 = tt.load %tmp0_35, %tmp0_37, %cst_5 evictionPolicy = evict_last : tensor<1x16x!tt.ptr, #blocked5> loc(#loc106) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc107) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc163) + %flip_40 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc163) + %flip_41 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc163) + %flip_42 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc163) + %flip_43 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc163) + %flip_44 = tt.expand_dims %flip_40 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc163) + %flip_45 = tt.expand_dims %flip_41 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc163) + %flip_46 = tt.expand_dims %flip_42 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc163) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc163) + %flip_48 = tt.expand_dims %flip_44 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc163) + %flip_49 = tt.expand_dims %flip_45 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc163) + %flip_50 = tt.expand_dims %flip_46 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc163) + %flip_51 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc164) + %flip_52 = tt.reshape %flip_51 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc165) + %y = tt.reshape %tmp0_39 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc171) + %left_mask = arith.subi %cst_4, %flip_48 : tensor<1x2x1xi32, #blocked4> loc(#loc172) + %left_mask_53 = arith.subi %cst_2, %flip_47 : tensor<1x2x1xi32, #blocked3> loc(#loc172) + %left_mask_54 = arith.subi %cst_1, %flip_49 : tensor<1x2x1xi32, #blocked2> loc(#loc172) + %left_mask_55 = arith.subi %cst_0, %flip_50 : tensor<1x2x1xi32, #blocked1> loc(#loc172) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc173) + %ileft_56 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc173) + %ileft_57 = "tt.reduce"(%ileft_56) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc210) + %ileft_58 = tt.expand_dims %ileft_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc175) + %ileft_59 = tt.broadcast %ileft_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc176) + %iright = tt.broadcast %flip_48 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc177) + %iright_60 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc177) + %iright_61 = "tt.reduce"(%iright_60) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc212) + %iright_62 = tt.expand_dims %iright_61 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc179) + %iright_63 = tt.broadcast %iright_62 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc180) + %ileft_64 = tt.reshape %ileft_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_65 = tt.reshape %iright_63 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc183) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc184) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc185) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<8x2x1xi16, #blocked4> loc(#loc185) + %input = arith.extsi %left_idx_67 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc214) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc215) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc187) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc188) + %right_idx = arith.trunci %flip_48 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc189) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc190) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<8x2x1xi16, #blocked4> loc(#loc190) + %input_73 = arith.extsi %right_idx_72 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc217) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc218) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc192) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_78 = tt.reshape %right_idx_76 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond = arith.cmpi slt, %ileft_64, %iright_65 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq = arith.cmpi eq, %ileft_64, %iright_65 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_79 = arith.cmpi sgt, %left_idx_77, %right_idx_78 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_80 = arith.andi %eq, %cond_79 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_81 = arith.ori %cond, %cond_80 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_82 = arith.extui %cond_81 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_83 = arith.xori %cond_82, %flip_52 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_84 = arith.cmpi ne, %cond_83, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret = arith.xori %ileft_64, %iright_65 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_85 = arith.select %cond_84, %ret, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_86 = arith.xori %tmp0_39, %ret_85 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs = arith.xori %left_idx_77, %right_idx_78 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_87 = arith.select %cond_84, %new_idxs, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_88 = arith.extsi %tmp2 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc208) + %new_idxs_89 = arith.xori %new_idxs_88, %new_idxs_87 : tensor<1x16xi32, #blocked5> loc(#loc208) + %flip_90 = tt.broadcast %flip_49 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc164) + %flip_91 = tt.reshape %flip_90 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc165) + %y_92 = tt.reshape %ret_86 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc171) + %ileft_93 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc173) + %ileft_94 = arith.muli %y_92, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc173) + %ileft_95 = "tt.reduce"(%ileft_94) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc210) + %ileft_96 = tt.expand_dims %ileft_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc175) + %ileft_97 = tt.broadcast %ileft_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc176) + %iright_98 = arith.muli %y_92, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc177) + %iright_99 = "tt.reduce"(%iright_98) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc212) + %iright_100 = tt.expand_dims %iright_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc179) + %iright_101 = tt.broadcast %iright_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc180) + %ileft_102 = tt.reshape %ileft_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_103 = tt.reshape %iright_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_104 = tt.reshape %new_idxs_89 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc183) + %left_idx_105 = arith.muli %y_idx_104, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc185) + %left_idx_106 = "tt.reduce"(%left_idx_105) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc215) + %left_idx_107 = tt.expand_dims %left_idx_106 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc187) + %left_idx_108 = tt.broadcast %left_idx_107 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc188) + %right_idx_109 = arith.muli %y_idx_104, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc190) + %right_idx_110 = "tt.reduce"(%right_idx_109) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc218) + %right_idx_111 = tt.expand_dims %right_idx_110 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc192) + %right_idx_112 = tt.broadcast %right_idx_111 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc193) + %left_idx_113 = tt.reshape %left_idx_108 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_114 = tt.reshape %right_idx_112 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_115 = arith.cmpi slt, %ileft_102, %iright_103 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_116 = arith.cmpi eq, %ileft_102, %iright_103 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_117 = arith.cmpi sgt, %left_idx_113, %right_idx_114 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_118 = arith.andi %eq_116, %cond_117 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_119 = arith.ori %cond_115, %cond_118 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_120 = arith.extui %cond_119 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_121 = arith.xori %cond_120, %flip_91 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_122 = arith.cmpi ne, %cond_121, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret_123 = arith.xori %ileft_102, %iright_103 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_124 = arith.select %cond_122, %ret_123, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_125 = arith.xori %ret_86, %ret_124 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_126 = arith.xori %left_idx_113, %right_idx_114 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_127 = arith.select %cond_122, %new_idxs_126, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_128 = arith.xori %new_idxs_89, %new_idxs_127 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_129 = tt.reshape %ret_125 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc171) + %ileft_130 = arith.muli %y_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc173) + %ileft_131 = "tt.reduce"(%ileft_130) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc210) + %ileft_132 = tt.expand_dims %ileft_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc175) + %ileft_133 = tt.broadcast %ileft_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc176) + %iright_134 = arith.muli %y_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc177) + %iright_135 = "tt.reduce"(%iright_134) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc212) + %iright_136 = tt.expand_dims %iright_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc179) + %iright_137 = tt.broadcast %iright_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc180) + %ileft_138 = tt.reshape %ileft_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_139 = tt.reshape %iright_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_140 = tt.reshape %new_idxs_128 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc183) + %left_idx_141 = arith.muli %y_idx_140, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc185) + %left_idx_142 = "tt.reduce"(%left_idx_141) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc215) + %left_idx_143 = tt.expand_dims %left_idx_142 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc187) + %left_idx_144 = tt.broadcast %left_idx_143 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc188) + %right_idx_145 = arith.muli %y_idx_140, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc190) + %right_idx_146 = "tt.reduce"(%right_idx_145) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc218) + %right_idx_147 = tt.expand_dims %right_idx_146 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc192) + %right_idx_148 = tt.broadcast %right_idx_147 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_149 = tt.reshape %left_idx_144 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_150 = tt.reshape %right_idx_148 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_151 = arith.cmpi slt, %ileft_138, %iright_139 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_152 = arith.cmpi eq, %ileft_138, %iright_139 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_153 = arith.cmpi sgt, %left_idx_149, %right_idx_150 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_154 = arith.andi %eq_152, %cond_153 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_155 = arith.ori %cond_151, %cond_154 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_156 = arith.extui %cond_155 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_157 = arith.xori %cond_156, %flip_91 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_158 = arith.cmpi ne, %cond_157, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret_159 = arith.xori %ileft_138, %iright_139 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_160 = arith.select %cond_158, %ret_159, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_161 = arith.xori %ret_125, %ret_160 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_162 = arith.xori %left_idx_149, %right_idx_150 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_163 = arith.select %cond_158, %new_idxs_162, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_164 = arith.xori %new_idxs_128, %new_idxs_163 : tensor<1x16xi32, #blocked5> loc(#loc208) + %flip_165 = tt.broadcast %flip_50 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc164) + %flip_166 = tt.reshape %flip_165 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc165) + %y_167 = tt.reshape %ret_161 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc171) + %ileft_168 = tt.broadcast %left_mask_54 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc173) + %ileft_169 = arith.muli %y_167, %ileft_168 : tensor<2x2x4xi32, #blocked2> loc(#loc173) + %ileft_170 = "tt.reduce"(%ileft_169) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc210) + %ileft_171 = tt.expand_dims %ileft_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc175) + %ileft_172 = tt.broadcast %ileft_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc176) + %iright_173 = arith.muli %y_167, %flip_90 : tensor<2x2x4xi32, #blocked2> loc(#loc177) + %iright_174 = "tt.reduce"(%iright_173) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc212) + %iright_175 = tt.expand_dims %iright_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc179) + %iright_176 = tt.broadcast %iright_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc180) + %ileft_177 = tt.reshape %ileft_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_178 = tt.reshape %iright_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_179 = tt.reshape %new_idxs_164 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc183) + %left_idx_180 = arith.muli %y_idx_179, %ileft_168 : tensor<2x2x4xi32, #blocked2> loc(#loc185) + %left_idx_181 = "tt.reduce"(%left_idx_180) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc215) + %left_idx_182 = tt.expand_dims %left_idx_181 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc187) + %left_idx_183 = tt.broadcast %left_idx_182 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc188) + %right_idx_184 = arith.muli %y_idx_179, %flip_90 : tensor<2x2x4xi32, #blocked2> loc(#loc190) + %right_idx_185 = "tt.reduce"(%right_idx_184) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc218) + %right_idx_186 = tt.expand_dims %right_idx_185 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc192) + %right_idx_187 = tt.broadcast %right_idx_186 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc193) + %left_idx_188 = tt.reshape %left_idx_183 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_189 = tt.reshape %right_idx_187 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_190 = arith.cmpi slt, %ileft_177, %iright_178 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_191 = arith.cmpi eq, %ileft_177, %iright_178 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_192 = arith.cmpi sgt, %left_idx_188, %right_idx_189 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_193 = arith.andi %eq_191, %cond_192 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_194 = arith.ori %cond_190, %cond_193 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_195 = arith.extui %cond_194 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_196 = arith.xori %cond_195, %flip_166 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_197 = arith.cmpi ne, %cond_196, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret_198 = arith.xori %ileft_177, %iright_178 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_199 = arith.select %cond_197, %ret_198, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_200 = arith.xori %ret_161, %ret_199 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_201 = arith.xori %left_idx_188, %right_idx_189 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_202 = arith.select %cond_197, %new_idxs_201, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_203 = arith.xori %new_idxs_164, %new_idxs_202 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_204 = tt.reshape %ret_200 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc171) + %ileft_205 = arith.muli %y_204, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc173) + %ileft_206 = "tt.reduce"(%ileft_205) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc210) + %ileft_207 = tt.expand_dims %ileft_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc175) + %ileft_208 = tt.broadcast %ileft_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc176) + %iright_209 = arith.muli %y_204, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc177) + %iright_210 = "tt.reduce"(%iright_209) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc212) + %iright_211 = tt.expand_dims %iright_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc179) + %iright_212 = tt.broadcast %iright_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc180) + %ileft_213 = tt.reshape %ileft_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_214 = tt.reshape %iright_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_215 = tt.reshape %new_idxs_203 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc183) + %left_idx_216 = arith.muli %y_idx_215, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc185) + %left_idx_217 = "tt.reduce"(%left_idx_216) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc215) + %left_idx_218 = tt.expand_dims %left_idx_217 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc187) + %left_idx_219 = tt.broadcast %left_idx_218 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc188) + %right_idx_220 = arith.muli %y_idx_215, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc190) + %right_idx_221 = "tt.reduce"(%right_idx_220) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc218) + %right_idx_222 = tt.expand_dims %right_idx_221 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc192) + %right_idx_223 = tt.broadcast %right_idx_222 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc193) + %left_idx_224 = tt.reshape %left_idx_219 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_225 = tt.reshape %right_idx_223 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_226 = arith.cmpi slt, %ileft_213, %iright_214 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_227 = arith.cmpi eq, %ileft_213, %iright_214 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_228 = arith.cmpi sgt, %left_idx_224, %right_idx_225 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_229 = arith.andi %eq_227, %cond_228 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_230 = arith.ori %cond_226, %cond_229 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_231 = arith.extui %cond_230 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_232 = arith.xori %cond_231, %flip_166 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_233 = arith.cmpi ne, %cond_232, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret_234 = arith.xori %ileft_213, %iright_214 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_235 = arith.select %cond_233, %ret_234, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_236 = arith.xori %ret_200, %ret_235 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_237 = arith.xori %left_idx_224, %right_idx_225 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_238 = arith.select %cond_233, %new_idxs_237, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_239 = arith.xori %new_idxs_203, %new_idxs_238 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_240 = tt.reshape %ret_236 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc171) + %ileft_241 = arith.muli %y_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc173) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc210) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc175) + %ileft_244 = tt.broadcast %ileft_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc176) + %iright_245 = arith.muli %y_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc177) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc212) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc179) + %iright_248 = tt.broadcast %iright_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc180) + %ileft_249 = tt.reshape %ileft_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_250 = tt.reshape %iright_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_251 = tt.reshape %new_idxs_239 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc183) + %left_idx_252 = arith.muli %y_idx_251, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc185) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc215) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc187) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc188) + %right_idx_256 = arith.muli %y_idx_251, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc190) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc218) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc192) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1, #blocked5> loc(#loc200) + %cond_267 = arith.extui %cond_266 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_268 = arith.xori %cond_267, %flip_166 : tensor<1x16xi32, #blocked5> loc(#loc201) + %cond_269 = arith.cmpi ne, %cond_268, %cst_5 : tensor<1x16xi32, #blocked5> loc(#loc202) + %ret_270 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_271 = arith.select %cond_269, %ret_270, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_272 = arith.xori %ret_236, %ret_271 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_273 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_274 = arith.select %cond_269, %new_idxs_273, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_275 = arith.xori %new_idxs_239, %new_idxs_274 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_276 = tt.reshape %ret_272 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc171) + %ileft_277 = tt.broadcast %left_mask_55 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc173) + %ileft_278 = arith.muli %y_276, %ileft_277 : tensor<1x2x8xi32, #blocked1> loc(#loc173) + %ileft_279 = "tt.reduce"(%ileft_278) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc210) + %ileft_280 = tt.expand_dims %ileft_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc175) + %ileft_281 = tt.broadcast %ileft_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc176) + %iright_282 = arith.muli %y_276, %flip_165 : tensor<1x2x8xi32, #blocked1> loc(#loc177) + %iright_283 = "tt.reduce"(%iright_282) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc212) + %iright_284 = tt.expand_dims %iright_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc179) + %iright_285 = tt.broadcast %iright_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc180) + %ileft_286 = tt.reshape %ileft_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_287 = tt.reshape %iright_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_288 = tt.reshape %new_idxs_275 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc183) + %left_idx_289 = arith.muli %y_idx_288, %ileft_277 : tensor<1x2x8xi32, #blocked1> loc(#loc185) + %left_idx_290 = "tt.reduce"(%left_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc215) + %left_idx_291 = tt.expand_dims %left_idx_290 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc187) + %left_idx_292 = tt.broadcast %left_idx_291 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc188) + %right_idx_293 = arith.muli %y_idx_288, %flip_165 : tensor<1x2x8xi32, #blocked1> loc(#loc190) + %right_idx_294 = "tt.reduce"(%right_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc218) + %right_idx_295 = tt.expand_dims %right_idx_294 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc192) + %right_idx_296 = tt.broadcast %right_idx_295 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc193) + %left_idx_297 = tt.reshape %left_idx_292 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_298 = tt.reshape %right_idx_296 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_299 = arith.cmpi slt, %ileft_286, %iright_287 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_300 = arith.cmpi eq, %ileft_286, %iright_287 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_301 = arith.cmpi sgt, %left_idx_297, %right_idx_298 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_302 = arith.andi %eq_300, %cond_301 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_303 = arith.ori %cond_299, %cond_302 : tensor<1x16xi1, #blocked5> loc(#loc200) + %ret_304 = arith.xori %ileft_286, %iright_287 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_305 = arith.select %cond_303, %ret_304, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_306 = arith.xori %ret_272, %ret_305 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_307 = arith.xori %left_idx_297, %right_idx_298 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_308 = arith.select %cond_303, %new_idxs_307, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_309 = arith.xori %new_idxs_275, %new_idxs_308 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_310 = tt.reshape %ret_306 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc171) + %ileft_311 = arith.muli %y_310, %ileft_168 : tensor<2x2x4xi32, #blocked2> loc(#loc173) + %ileft_312 = "tt.reduce"(%ileft_311) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc210) + %ileft_313 = tt.expand_dims %ileft_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc175) + %ileft_314 = tt.broadcast %ileft_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc176) + %iright_315 = arith.muli %y_310, %flip_90 : tensor<2x2x4xi32, #blocked2> loc(#loc177) + %iright_316 = "tt.reduce"(%iright_315) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc212) + %iright_317 = tt.expand_dims %iright_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc179) + %iright_318 = tt.broadcast %iright_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc180) + %ileft_319 = tt.reshape %ileft_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_320 = tt.reshape %iright_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_321 = tt.reshape %new_idxs_309 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc183) + %left_idx_322 = arith.muli %y_idx_321, %ileft_168 : tensor<2x2x4xi32, #blocked2> loc(#loc185) + %left_idx_323 = "tt.reduce"(%left_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc215) + %left_idx_324 = tt.expand_dims %left_idx_323 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc187) + %left_idx_325 = tt.broadcast %left_idx_324 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc188) + %right_idx_326 = arith.muli %y_idx_321, %flip_90 : tensor<2x2x4xi32, #blocked2> loc(#loc190) + %right_idx_327 = "tt.reduce"(%right_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc218) + %right_idx_328 = tt.expand_dims %right_idx_327 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc192) + %right_idx_329 = tt.broadcast %right_idx_328 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc193) + %left_idx_330 = tt.reshape %left_idx_325 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_331 = tt.reshape %right_idx_329 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_332 = arith.cmpi slt, %ileft_319, %iright_320 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_333 = arith.cmpi eq, %ileft_319, %iright_320 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_334 = arith.cmpi sgt, %left_idx_330, %right_idx_331 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_335 = arith.andi %eq_333, %cond_334 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_336 = arith.ori %cond_332, %cond_335 : tensor<1x16xi1, #blocked5> loc(#loc200) + %ret_337 = arith.xori %ileft_319, %iright_320 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_338 = arith.select %cond_336, %ret_337, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_339 = arith.xori %ret_306, %ret_338 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_340 = arith.xori %left_idx_330, %right_idx_331 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_341 = arith.select %cond_336, %new_idxs_340, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_342 = arith.xori %new_idxs_309, %new_idxs_341 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_343 = tt.reshape %ret_339 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc171) + %ileft_344 = arith.muli %y_343, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc173) + %ileft_345 = "tt.reduce"(%ileft_344) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc210) + %ileft_346 = tt.expand_dims %ileft_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc175) + %ileft_347 = tt.broadcast %ileft_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc176) + %iright_348 = arith.muli %y_343, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc177) + %iright_349 = "tt.reduce"(%iright_348) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc212) + %iright_350 = tt.expand_dims %iright_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc179) + %iright_351 = tt.broadcast %iright_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc180) + %ileft_352 = tt.reshape %ileft_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_353 = tt.reshape %iright_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_354 = tt.reshape %new_idxs_342 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc183) + %left_idx_355 = arith.muli %y_idx_354, %ileft_93 : tensor<4x2x2xi32, #blocked3> loc(#loc185) + %left_idx_356 = "tt.reduce"(%left_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc215) + %left_idx_357 = tt.expand_dims %left_idx_356 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc187) + %left_idx_358 = tt.broadcast %left_idx_357 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc188) + %right_idx_359 = arith.muli %y_idx_354, %flip_51 : tensor<4x2x2xi32, #blocked3> loc(#loc190) + %right_idx_360 = "tt.reduce"(%right_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc218) + %right_idx_361 = tt.expand_dims %right_idx_360 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc192) + %right_idx_362 = tt.broadcast %right_idx_361 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc193) + %left_idx_363 = tt.reshape %left_idx_358 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_364 = tt.reshape %right_idx_362 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_365 = arith.cmpi slt, %ileft_352, %iright_353 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_366 = arith.cmpi eq, %ileft_352, %iright_353 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_367 = arith.cmpi sgt, %left_idx_363, %right_idx_364 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_368 = arith.andi %eq_366, %cond_367 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_369 = arith.ori %cond_365, %cond_368 : tensor<1x16xi1, #blocked5> loc(#loc200) + %ret_370 = arith.xori %ileft_352, %iright_353 : tensor<1x16xi32, #blocked5> loc(#loc203) + %ret_371 = arith.select %cond_369, %ret_370, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc204) + %ret_372 = arith.xori %ret_339, %ret_371 : tensor<1x16xi32, #blocked5> loc(#loc205) + %new_idxs_373 = arith.xori %left_idx_363, %right_idx_364 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_374 = arith.select %cond_369, %new_idxs_373, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_375 = arith.xori %new_idxs_342, %new_idxs_374 : tensor<1x16xi32, #blocked5> loc(#loc208) + %y_376 = tt.reshape %ret_372 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc171) + %ileft_377 = arith.muli %y_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc173) + %ileft_378 = "tt.reduce"(%ileft_377) <{axis = 1 : i32}> ({ + ^bb0(%ileft_407: i32 loc(callsite(#loc1 at #loc174)), %ileft_408: i32 loc(callsite(#loc1 at #loc174))): + %ileft_409 = arith.addi %ileft_407, %ileft_408 : i32 loc(#loc220) + tt.reduce.return %ileft_409 : i32 loc(#loc210) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc210) + %ileft_379 = tt.expand_dims %ileft_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc175) + %ileft_380 = tt.broadcast %ileft_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc176) + %iright_381 = arith.muli %y_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc177) + %iright_382 = "tt.reduce"(%iright_381) <{axis = 1 : i32}> ({ + ^bb0(%iright_407: i32 loc(callsite(#loc1 at #loc178)), %iright_408: i32 loc(callsite(#loc1 at #loc178))): + %iright_409 = arith.addi %iright_407, %iright_408 : i32 loc(#loc221) + tt.reduce.return %iright_409 : i32 loc(#loc212) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc212) + %iright_383 = tt.expand_dims %iright_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc179) + %iright_384 = tt.broadcast %iright_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc180) + %ileft_385 = tt.reshape %ileft_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc181) + %iright_386 = tt.reshape %iright_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc182) + %y_idx_387 = tt.reshape %new_idxs_375 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc183) + %left_idx_388 = arith.muli %y_idx_387, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc185) + %left_idx_389 = "tt.reduce"(%left_idx_388) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_407: i32 loc(callsite(#loc1 at #loc186)), %left_idx_408: i32 loc(callsite(#loc1 at #loc186))): + %left_idx_409 = arith.addi %left_idx_407, %left_idx_408 : i32 loc(#loc222) + tt.reduce.return %left_idx_409 : i32 loc(#loc215) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc215) + %left_idx_390 = tt.expand_dims %left_idx_389 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc187) + %left_idx_391 = tt.broadcast %left_idx_390 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc188) + %right_idx_392 = arith.muli %y_idx_387, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc190) + %right_idx_393 = "tt.reduce"(%right_idx_392) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_407: i32 loc(callsite(#loc1 at #loc191)), %right_idx_408: i32 loc(callsite(#loc1 at #loc191))): + %right_idx_409 = arith.addi %right_idx_407, %right_idx_408 : i32 loc(#loc223) + tt.reduce.return %right_idx_409 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc218) + %right_idx_394 = tt.expand_dims %right_idx_393 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc192) + %right_idx_395 = tt.broadcast %right_idx_394 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc193) + %left_idx_396 = tt.reshape %left_idx_391 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc194) + %right_idx_397 = tt.reshape %right_idx_395 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc195) + %cond_398 = arith.cmpi slt, %ileft_385, %iright_386 : tensor<1x16xi32, #blocked5> loc(#loc196) + %eq_399 = arith.cmpi eq, %ileft_385, %iright_386 : tensor<1x16xi32, #blocked5> loc(#loc197) + %cond_400 = arith.cmpi sgt, %left_idx_396, %right_idx_397 : tensor<1x16xi32, #blocked5> loc(#loc198) + %cond_401 = arith.andi %eq_399, %cond_400 : tensor<1x16xi1, #blocked5> loc(#loc199) + %cond_402 = arith.ori %cond_398, %cond_401 : tensor<1x16xi1, #blocked5> loc(#loc200) + %new_idxs_403 = arith.xori %left_idx_396, %right_idx_397 : tensor<1x16xi32, #blocked5> loc(#loc206) + %new_idxs_404 = arith.select %cond_402, %new_idxs_403, %cst_5 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc207) + %new_idxs_405 = arith.xori %new_idxs_375, %new_idxs_404 : tensor<1x16xi32, #blocked5> loc(#loc208) + %tmp7 = arith.extsi %tmp0_38 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc151) + %tmp10 = arith.select %tmp0_36, %tmp7, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc152) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_407: i64 loc(callsite(#loc1 at #loc153)), %tmp11_408: i64 loc(callsite(#loc1 at #loc153))): + %tmp11_409 = arith.addi %tmp11_407, %tmp11_408 : i64 loc(#loc209) + tt.reduce.return %tmp11_409 : i64 loc(#loc169) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc169) + %tmp11_406 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc154) + %tmp14 = arith.trunci %tmp11_406 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc155) + %0 = arith.muli %x0_9, %c16_i64 : i64 loc(#loc70) + %1 = tt.splat %0 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc156) + %2 = arith.addi %tmp0_10, %1 : tensor<1x16xi64, #blocked5> loc(#loc71) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc72) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc73) + %5 = arith.extui %4 : i1 to i64 loc(#loc74) + %6 = arith.muli %ks0, %5 : i64 loc(#loc74) + %7 = arith.extui %3 : i1 to i64 loc(#loc157) + %8 = arith.addi %7, %6 : i64 loc(#loc75) + %9 = arith.muli %tmp0_15, %8 : i64 loc(#loc77) + %10 = tt.splat %9 : i64 -> tensor<1x16xi64, #blocked5> loc(#loc158) + %11 = arith.addi %2, %10 : tensor<1x16xi64, #blocked5> loc(#loc78) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc79) + %13 = tt.addptr %12, %11 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc79) + tt.store %13, %new_idxs_405, %tmp0_37 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc80) + %14 = arith.muli %x1, %8 : i64 loc(#loc81) + %15 = arith.addi %x0_9, %14 : i64 loc(#loc82) + %16 = tt.addptr %out_ptr3, %15 : !tt.ptr, i64 loc(#loc83) + %17 = tt.splat %16 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc84) + %18 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc84) + tt.store %17, %tmp14, %18 : tensor<1x1x!tt.ptr, #blocked> loc(#loc84) + tt.return loc(#loc85) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc92 = loc("xoffset"(#loc2)) +#loc93 = loc("xmask"(#loc3)) +#loc94 = loc("r0_index"(#loc4)) +#loc95 = loc("x0"(#loc5)) +#loc96 = loc("x1"(#loc6)) +#loc97 = loc("tmp0"(#loc7)) +#loc98 = loc("tmp0"(#loc8)) +#loc99 = loc("tmp0"(#loc9)) +#loc100 = loc("tmp0"(#loc10)) +#loc101 = loc("tmp0"(#loc11)) +#loc102 = loc("tmp0"(#loc12)) +#loc103 = loc("tmp0"(#loc13)) +#loc104 = loc("tmp0"(#loc14)) +#loc105 = loc("tmp0"(#loc15)) +#loc106 = loc("tmp0"(#loc16)) +#loc107 = loc("tmp2"(#loc17)) +#loc108 = loc("flip"(#loc18)) +#loc110 = loc("flip"(#loc21)) +#loc111 = loc("flip"(#loc22)) +#loc112 = loc("y"(#loc23)) +#loc113 = loc("left_mask"(#loc25)) +#loc114 = loc("ileft"(#loc26)) +#loc116 = loc("ileft"(#loc30)) +#loc117 = loc("ileft"(#loc31)) +#loc118 = loc("iright"(#loc32)) +#loc120 = loc("iright"(#loc34)) +#loc121 = loc("iright"(#loc35)) +#loc122 = loc("ileft"(#loc36)) +#loc123 = loc("iright"(#loc37)) +#loc124 = loc("y_idx"(#loc38)) +#loc125 = loc("left_idx"(#loc39)) +#loc126 = loc("left_idx"(#loc40)) +#loc127 = loc("input"(#loc41)) +#loc129 = loc("left_idx"(#loc43)) +#loc130 = loc("left_idx"(#loc44)) +#loc131 = loc("right_idx"(#loc45)) +#loc132 = loc("right_idx"(#loc46)) +#loc134 = loc("right_idx"(#loc48)) +#loc135 = loc("right_idx"(#loc49)) +#loc136 = loc("left_idx"(#loc50)) +#loc137 = loc("right_idx"(#loc51)) +#loc138 = loc("cond"(#loc52)) +#loc139 = loc("eq"(#loc53)) +#loc140 = loc("cond"(#loc54)) +#loc141 = loc("cond"(#loc55)) +#loc142 = loc("cond"(#loc56)) +#loc143 = loc("cond"(#loc57)) +#loc144 = loc("cond"(#loc58)) +#loc145 = loc("ret"(#loc59)) +#loc146 = loc("ret"(#loc60)) +#loc147 = loc("ret"(#loc61)) +#loc148 = loc("new_idxs"(#loc62)) +#loc149 = loc("new_idxs"(#loc63)) +#loc150 = loc("new_idxs"(#loc64)) +#loc151 = loc("tmp7"(#loc65)) +#loc152 = loc("tmp10"(#loc66)) +#loc154 = loc("tmp11"(#loc68)) +#loc155 = loc("tmp14"(#loc69)) +#loc156 = loc(fused[#loc71, #loc70]) +#loc157 = loc(fused[#loc75, #loc76]) +#loc158 = loc(fused[#loc78, #loc77]) +#loc159 = loc(fused[#loc97, #loc95]) +#loc160 = loc(fused[#loc99, #loc98]) +#loc161 = loc(fused[#loc104, #loc103]) +#loc162 = loc(fused[#loc106, #loc93]) +#loc163 = loc(callsite(#loc108 at #loc109)) +#loc164 = loc(callsite(#loc110 at #loc109)) +#loc165 = loc(callsite(#loc111 at #loc109)) +#loc167 = loc("cond"(#loc138)) +#loc168 = loc("eq"(#loc139)) +#loc169 = loc(callsite(#loc27 at #loc153)) +#loc171 = loc(callsite(#loc112 at #loc166)) +#loc172 = loc(callsite(#loc113 at #loc166)) +#loc173 = loc(callsite(#loc114 at #loc166)) +#loc175 = loc(callsite(#loc116 at #loc166)) +#loc176 = loc(callsite(#loc117 at #loc166)) +#loc177 = loc(callsite(#loc118 at #loc166)) +#loc179 = loc(callsite(#loc120 at #loc166)) +#loc180 = loc(callsite(#loc121 at #loc166)) +#loc181 = loc(callsite(#loc122 at #loc166)) +#loc182 = loc(callsite(#loc123 at #loc166)) +#loc183 = loc(callsite(#loc124 at #loc166)) +#loc184 = loc(callsite(#loc125 at #loc166)) +#loc185 = loc(callsite(#loc126 at #loc166)) +#loc187 = loc(callsite(#loc129 at #loc166)) +#loc188 = loc(callsite(#loc130 at #loc166)) +#loc189 = loc(callsite(#loc131 at #loc166)) +#loc190 = loc(callsite(#loc132 at #loc166)) +#loc192 = loc(callsite(#loc134 at #loc166)) +#loc193 = loc(callsite(#loc135 at #loc166)) +#loc194 = loc(callsite(#loc136 at #loc166)) +#loc195 = loc(callsite(#loc137 at #loc166)) +#loc196 = loc(callsite(#loc167 at #loc166)) +#loc197 = loc(callsite(#loc168 at #loc166)) +#loc198 = loc(callsite(#loc140 at #loc166)) +#loc199 = loc(callsite(#loc141 at #loc166)) +#loc200 = loc(callsite(#loc142 at #loc166)) +#loc201 = loc(callsite(#loc143 at #loc166)) +#loc202 = loc(callsite(#loc144 at #loc166)) +#loc203 = loc(callsite(#loc145 at #loc166)) +#loc204 = loc(callsite(#loc146 at #loc166)) +#loc205 = loc(callsite(#loc147 at #loc166)) +#loc206 = loc(callsite(#loc148 at #loc166)) +#loc207 = loc(callsite(#loc149 at #loc166)) +#loc208 = loc(callsite(#loc150 at #loc166)) +#loc209 = loc(callsite(#loc29 at #loc169)) +#loc210 = loc(callsite(#loc27 at #loc174)) +#loc212 = loc(callsite(#loc27 at #loc178)) +#loc214 = loc(callsite(#loc127 at #loc186)) +#loc215 = loc(callsite(#loc27 at #loc186)) +#loc217 = loc(callsite(#loc127 at #loc191)) +#loc218 = loc(callsite(#loc27 at #loc191)) +#loc220 = loc(callsite(#loc29 at #loc210)) +#loc221 = loc(callsite(#loc29 at #loc212)) +#loc222 = loc(callsite(#loc29 at #loc215)) +#loc223 = loc(callsite(#loc29 at #loc218)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..eaaeb558a584cac0f7b8833a7e2da0ada0498852 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/G2LU7LIHIOEHQSWVLFBJATACJ76YHM672CUBUDGJGAJUEQVWVOFQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,822 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc88 = loc("in_ptr0"(#loc)) +#loc89 = loc("out_ptr2"(#loc)) +#loc90 = loc("out_ptr3"(#loc)) +#loc91 = loc("ks0"(#loc)) +#loc92 = loc("xnumel"(#loc)) +#loc93 = loc("r0_numel"(#loc)) +#loc114 = loc(callsite(#loc22 at #loc2)) +#loc121 = loc("ileft"(#loc31)) +#loc125 = loc("iright"(#loc36)) +#loc134 = loc("left_idx"(#loc45)) +#loc139 = loc("right_idx"(#loc50)) +#loc158 = loc("tmp11"(#loc69)) +#loc172 = loc(callsite(#loc27 at #loc114)) +#loc176 = loc(callsite(#loc1 at #loc158)) +#loc180 = loc(callsite(#loc121 at #loc172)) +#loc184 = loc(callsite(#loc125 at #loc172)) +#loc192 = loc(callsite(#loc134 at #loc172)) +#loc197 = loc(callsite(#loc139 at #loc172)) +#loc217 = loc(callsite(#loc1 at #loc180)) +#loc219 = loc(callsite(#loc1 at #loc184)) +#loc222 = loc(callsite(#loc1 at #loc192)) +#loc225 = loc(callsite(#loc1 at #loc197)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc94) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc95) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc96) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc97) + %xmask_1 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc97) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc98) + %r0_index_2 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc99) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc100) + %x0_3 = arith.remsi %x0, %ks0 : i64 loc(#loc100) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc101) + %tmp0 = arith.extsi %r0_index_2 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc102) + %tmp0_4 = tt.splat %x0_3 : i64 -> tensor<1x16xi64> loc(#loc164) + %tmp0_5 = arith.addi %tmp0, %tmp0_4 : tensor<1x16xi64> loc(#loc102) + %tmp0_6 = arith.muli %x1, %c16_i64 : i64 loc(#loc103) + %tmp0_7 = tt.splat %tmp0_6 : i64 -> tensor<1x16xi64> loc(#loc165) + %tmp0_8 = arith.addi %tmp0_5, %tmp0_7 : tensor<1x16xi64> loc(#loc104) + %tmp0_9 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc105) + %tmp0_10 = arith.muli %tmp0_9, %tmp0 : tensor<1x16xi64> loc(#loc105) + %tmp0_11 = arith.addi %tmp0_8, %tmp0_10 : tensor<1x16xi64> loc(#loc106) + %tmp0_12 = arith.muli %ks0, %c16_i64 : i64 loc(#loc107) + %tmp0_13 = arith.muli %tmp0_12, %x1 : i64 loc(#loc108) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<1x16xi64> loc(#loc166) + %tmp0_15 = arith.addi %tmp0_11, %tmp0_14 : tensor<1x16xi64> loc(#loc109) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc110) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc110) + %tmp0_18 = tt.splat %xmask : i1 -> tensor<1x16xi1> loc(#loc167) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 evictionPolicy = evict_last : tensor<1x16x!tt.ptr> loc(#loc111) + %tmp2 = arith.trunci %r0_index_2 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc112) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc168) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc169) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc169) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc170) + %flip_23 = tt.reshape %flip_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc171) + %y = tt.reshape %tmp0_19 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc177) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc178) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc179) + %ileft_24 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc179) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc216) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc181) + %ileft_27 = tt.broadcast %ileft_26 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc182) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc183) + %iright_28 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc183) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc218) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc185) + %iright_31 = tt.broadcast %iright_30 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc186) + %ileft_32 = tt.reshape %ileft_27 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_33 = tt.reshape %iright_31 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx = tt.reshape %tmp2 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc189) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc190) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc191) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<8x2x1xi16> loc(#loc191) + %input = arith.extsi %left_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc220) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc221) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc195) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc196) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<8x2x1xi16> loc(#loc196) + %input_41 = arith.extsi %right_idx_40 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc223) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc224) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc198) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc201) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<1x16xi32> loc(#loc202) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<1x16xi32> loc(#loc203) + %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<1x16xi32> loc(#loc204) + %cond_48 = arith.andi %eq, %cond_47 : tensor<1x16xi1> loc(#loc205) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc206) + %cond_50 = arith.extui %cond_49 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<1x16xi32> loc(#loc207) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret = arith.xori %ileft_32, %iright_33 : tensor<1x16xi32> loc(#loc209) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<1x16xi32> loc(#loc211) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<1x16xi32> loc(#loc212) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc214) + %new_idxs_57 = arith.xori %new_idxs_56, %new_idxs_55 : tensor<1x16xi32> loc(#loc214) + %flip_58 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc170) + %flip_59 = tt.reshape %flip_58 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc171) + %y_60 = tt.reshape %ret_54 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc177) + %ileft_61 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc179) + %ileft_62 = arith.muli %y_60, %ileft_61 : tensor<4x2x2xi32> loc(#loc179) + %ileft_63 = "tt.reduce"(%ileft_62) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc216) + %ileft_64 = tt.expand_dims %ileft_63 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc181) + %ileft_65 = tt.broadcast %ileft_64 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc182) + %iright_66 = arith.muli %y_60, %flip_22 : tensor<4x2x2xi32> loc(#loc183) + %iright_67 = "tt.reduce"(%iright_66) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc218) + %iright_68 = tt.expand_dims %iright_67 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc185) + %iright_69 = tt.broadcast %iright_68 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc186) + %ileft_70 = tt.reshape %ileft_65 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_71 = tt.reshape %iright_69 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_72 = tt.reshape %new_idxs_57 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc189) + %left_idx_73 = arith.muli %y_idx_72, %ileft_61 : tensor<4x2x2xi32> loc(#loc191) + %left_idx_74 = "tt.reduce"(%left_idx_73) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc221) + %left_idx_75 = tt.expand_dims %left_idx_74 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %left_idx_76 = tt.broadcast %left_idx_75 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %right_idx_77 = arith.muli %y_idx_72, %flip_22 : tensor<4x2x2xi32> loc(#loc196) + %right_idx_78 = "tt.reduce"(%right_idx_77) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc224) + %right_idx_79 = tt.expand_dims %right_idx_78 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc198) + %right_idx_80 = tt.broadcast %right_idx_79 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_81 = tt.reshape %left_idx_76 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_82 = tt.reshape %right_idx_80 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_83 = arith.cmpi slt, %ileft_70, %iright_71 : tensor<1x16xi32> loc(#loc202) + %eq_84 = arith.cmpi eq, %ileft_70, %iright_71 : tensor<1x16xi32> loc(#loc203) + %cond_85 = arith.cmpi sgt, %left_idx_81, %right_idx_82 : tensor<1x16xi32> loc(#loc204) + %cond_86 = arith.andi %eq_84, %cond_85 : tensor<1x16xi1> loc(#loc205) + %cond_87 = arith.ori %cond_83, %cond_86 : tensor<1x16xi1> loc(#loc206) + %cond_88 = arith.extui %cond_87 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_89 = arith.xori %cond_88, %flip_59 : tensor<1x16xi32> loc(#loc207) + %cond_90 = arith.cmpi ne, %cond_89, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret_91 = arith.xori %ileft_70, %iright_71 : tensor<1x16xi32> loc(#loc209) + %ret_92 = arith.select %cond_90, %ret_91, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_93 = arith.xori %ret_54, %ret_92 : tensor<1x16xi32> loc(#loc211) + %new_idxs_94 = arith.xori %left_idx_81, %right_idx_82 : tensor<1x16xi32> loc(#loc212) + %new_idxs_95 = arith.select %cond_90, %new_idxs_94, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_96 = arith.xori %new_idxs_57, %new_idxs_95 : tensor<1x16xi32> loc(#loc214) + %y_97 = tt.reshape %ret_93 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc177) + %ileft_98 = arith.muli %y_97, %ileft : tensor<8x2x1xi32> loc(#loc179) + %ileft_99 = "tt.reduce"(%ileft_98) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc216) + %ileft_100 = tt.expand_dims %ileft_99 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc181) + %ileft_101 = tt.broadcast %ileft_100 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc182) + %iright_102 = arith.muli %y_97, %iright : tensor<8x2x1xi32> loc(#loc183) + %iright_103 = "tt.reduce"(%iright_102) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc218) + %iright_104 = tt.expand_dims %iright_103 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc185) + %iright_105 = tt.broadcast %iright_104 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc186) + %ileft_106 = tt.reshape %ileft_101 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_107 = tt.reshape %iright_105 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_108 = tt.reshape %new_idxs_96 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc189) + %left_idx_109 = arith.muli %y_idx_108, %ileft : tensor<8x2x1xi32> loc(#loc191) + %left_idx_110 = "tt.reduce"(%left_idx_109) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc221) + %left_idx_111 = tt.expand_dims %left_idx_110 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %left_idx_112 = tt.broadcast %left_idx_111 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %right_idx_113 = arith.muli %y_idx_108, %iright : tensor<8x2x1xi32> loc(#loc196) + %right_idx_114 = "tt.reduce"(%right_idx_113) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc224) + %right_idx_115 = tt.expand_dims %right_idx_114 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc198) + %right_idx_116 = tt.broadcast %right_idx_115 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_117 = tt.reshape %left_idx_112 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_118 = tt.reshape %right_idx_116 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_119 = arith.cmpi slt, %ileft_106, %iright_107 : tensor<1x16xi32> loc(#loc202) + %eq_120 = arith.cmpi eq, %ileft_106, %iright_107 : tensor<1x16xi32> loc(#loc203) + %cond_121 = arith.cmpi sgt, %left_idx_117, %right_idx_118 : tensor<1x16xi32> loc(#loc204) + %cond_122 = arith.andi %eq_120, %cond_121 : tensor<1x16xi1> loc(#loc205) + %cond_123 = arith.ori %cond_119, %cond_122 : tensor<1x16xi1> loc(#loc206) + %cond_124 = arith.extui %cond_123 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_125 = arith.xori %cond_124, %flip_59 : tensor<1x16xi32> loc(#loc207) + %cond_126 = arith.cmpi ne, %cond_125, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret_127 = arith.xori %ileft_106, %iright_107 : tensor<1x16xi32> loc(#loc209) + %ret_128 = arith.select %cond_126, %ret_127, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_129 = arith.xori %ret_93, %ret_128 : tensor<1x16xi32> loc(#loc211) + %new_idxs_130 = arith.xori %left_idx_117, %right_idx_118 : tensor<1x16xi32> loc(#loc212) + %new_idxs_131 = arith.select %cond_126, %new_idxs_130, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_132 = arith.xori %new_idxs_96, %new_idxs_131 : tensor<1x16xi32> loc(#loc214) + %flip_133 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc170) + %flip_134 = tt.reshape %flip_133 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc171) + %y_135 = tt.reshape %ret_129 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc177) + %ileft_136 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc179) + %ileft_137 = arith.muli %y_135, %ileft_136 : tensor<2x2x4xi32> loc(#loc179) + %ileft_138 = "tt.reduce"(%ileft_137) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc216) + %ileft_139 = tt.expand_dims %ileft_138 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc181) + %ileft_140 = tt.broadcast %ileft_139 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc182) + %iright_141 = arith.muli %y_135, %flip_58 : tensor<2x2x4xi32> loc(#loc183) + %iright_142 = "tt.reduce"(%iright_141) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc218) + %iright_143 = tt.expand_dims %iright_142 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc185) + %iright_144 = tt.broadcast %iright_143 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc186) + %ileft_145 = tt.reshape %ileft_140 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_146 = tt.reshape %iright_144 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_147 = tt.reshape %new_idxs_132 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc189) + %left_idx_148 = arith.muli %y_idx_147, %ileft_136 : tensor<2x2x4xi32> loc(#loc191) + %left_idx_149 = "tt.reduce"(%left_idx_148) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc221) + %left_idx_150 = tt.expand_dims %left_idx_149 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %left_idx_151 = tt.broadcast %left_idx_150 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %right_idx_152 = arith.muli %y_idx_147, %flip_58 : tensor<2x2x4xi32> loc(#loc196) + %right_idx_153 = "tt.reduce"(%right_idx_152) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc224) + %right_idx_154 = tt.expand_dims %right_idx_153 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc198) + %right_idx_155 = tt.broadcast %right_idx_154 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_156 = tt.reshape %left_idx_151 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_157 = tt.reshape %right_idx_155 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_158 = arith.cmpi slt, %ileft_145, %iright_146 : tensor<1x16xi32> loc(#loc202) + %eq_159 = arith.cmpi eq, %ileft_145, %iright_146 : tensor<1x16xi32> loc(#loc203) + %cond_160 = arith.cmpi sgt, %left_idx_156, %right_idx_157 : tensor<1x16xi32> loc(#loc204) + %cond_161 = arith.andi %eq_159, %cond_160 : tensor<1x16xi1> loc(#loc205) + %cond_162 = arith.ori %cond_158, %cond_161 : tensor<1x16xi1> loc(#loc206) + %cond_163 = arith.extui %cond_162 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_164 = arith.xori %cond_163, %flip_134 : tensor<1x16xi32> loc(#loc207) + %cond_165 = arith.cmpi ne, %cond_164, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret_166 = arith.xori %ileft_145, %iright_146 : tensor<1x16xi32> loc(#loc209) + %ret_167 = arith.select %cond_165, %ret_166, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_168 = arith.xori %ret_129, %ret_167 : tensor<1x16xi32> loc(#loc211) + %new_idxs_169 = arith.xori %left_idx_156, %right_idx_157 : tensor<1x16xi32> loc(#loc212) + %new_idxs_170 = arith.select %cond_165, %new_idxs_169, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_171 = arith.xori %new_idxs_132, %new_idxs_170 : tensor<1x16xi32> loc(#loc214) + %y_172 = tt.reshape %ret_168 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc177) + %ileft_173 = arith.muli %y_172, %ileft_61 : tensor<4x2x2xi32> loc(#loc179) + %ileft_174 = "tt.reduce"(%ileft_173) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc216) + %ileft_175 = tt.expand_dims %ileft_174 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc181) + %ileft_176 = tt.broadcast %ileft_175 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc182) + %iright_177 = arith.muli %y_172, %flip_22 : tensor<4x2x2xi32> loc(#loc183) + %iright_178 = "tt.reduce"(%iright_177) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc218) + %iright_179 = tt.expand_dims %iright_178 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc185) + %iright_180 = tt.broadcast %iright_179 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc186) + %ileft_181 = tt.reshape %ileft_176 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_182 = tt.reshape %iright_180 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_183 = tt.reshape %new_idxs_171 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc189) + %left_idx_184 = arith.muli %y_idx_183, %ileft_61 : tensor<4x2x2xi32> loc(#loc191) + %left_idx_185 = "tt.reduce"(%left_idx_184) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc221) + %left_idx_186 = tt.expand_dims %left_idx_185 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %left_idx_187 = tt.broadcast %left_idx_186 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %right_idx_188 = arith.muli %y_idx_183, %flip_22 : tensor<4x2x2xi32> loc(#loc196) + %right_idx_189 = "tt.reduce"(%right_idx_188) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc224) + %right_idx_190 = tt.expand_dims %right_idx_189 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc198) + %right_idx_191 = tt.broadcast %right_idx_190 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_192 = tt.reshape %left_idx_187 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_193 = tt.reshape %right_idx_191 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_194 = arith.cmpi slt, %ileft_181, %iright_182 : tensor<1x16xi32> loc(#loc202) + %eq_195 = arith.cmpi eq, %ileft_181, %iright_182 : tensor<1x16xi32> loc(#loc203) + %cond_196 = arith.cmpi sgt, %left_idx_192, %right_idx_193 : tensor<1x16xi32> loc(#loc204) + %cond_197 = arith.andi %eq_195, %cond_196 : tensor<1x16xi1> loc(#loc205) + %cond_198 = arith.ori %cond_194, %cond_197 : tensor<1x16xi1> loc(#loc206) + %cond_199 = arith.extui %cond_198 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_200 = arith.xori %cond_199, %flip_134 : tensor<1x16xi32> loc(#loc207) + %cond_201 = arith.cmpi ne, %cond_200, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret_202 = arith.xori %ileft_181, %iright_182 : tensor<1x16xi32> loc(#loc209) + %ret_203 = arith.select %cond_201, %ret_202, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_204 = arith.xori %ret_168, %ret_203 : tensor<1x16xi32> loc(#loc211) + %new_idxs_205 = arith.xori %left_idx_192, %right_idx_193 : tensor<1x16xi32> loc(#loc212) + %new_idxs_206 = arith.select %cond_201, %new_idxs_205, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_207 = arith.xori %new_idxs_171, %new_idxs_206 : tensor<1x16xi32> loc(#loc214) + %y_208 = tt.reshape %ret_204 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc177) + %ileft_209 = arith.muli %y_208, %ileft : tensor<8x2x1xi32> loc(#loc179) + %ileft_210 = "tt.reduce"(%ileft_209) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc216) + %ileft_211 = tt.expand_dims %ileft_210 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc181) + %ileft_212 = tt.broadcast %ileft_211 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc182) + %iright_213 = arith.muli %y_208, %iright : tensor<8x2x1xi32> loc(#loc183) + %iright_214 = "tt.reduce"(%iright_213) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc218) + %iright_215 = tt.expand_dims %iright_214 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc185) + %iright_216 = tt.broadcast %iright_215 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc186) + %ileft_217 = tt.reshape %ileft_212 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_218 = tt.reshape %iright_216 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_219 = tt.reshape %new_idxs_207 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc189) + %left_idx_220 = arith.muli %y_idx_219, %ileft : tensor<8x2x1xi32> loc(#loc191) + %left_idx_221 = "tt.reduce"(%left_idx_220) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc221) + %left_idx_222 = tt.expand_dims %left_idx_221 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %left_idx_223 = tt.broadcast %left_idx_222 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %right_idx_224 = arith.muli %y_idx_219, %iright : tensor<8x2x1xi32> loc(#loc196) + %right_idx_225 = "tt.reduce"(%right_idx_224) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc224) + %right_idx_226 = tt.expand_dims %right_idx_225 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc198) + %right_idx_227 = tt.broadcast %right_idx_226 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_228 = tt.reshape %left_idx_223 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_229 = tt.reshape %right_idx_227 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_230 = arith.cmpi slt, %ileft_217, %iright_218 : tensor<1x16xi32> loc(#loc202) + %eq_231 = arith.cmpi eq, %ileft_217, %iright_218 : tensor<1x16xi32> loc(#loc203) + %cond_232 = arith.cmpi sgt, %left_idx_228, %right_idx_229 : tensor<1x16xi32> loc(#loc204) + %cond_233 = arith.andi %eq_231, %cond_232 : tensor<1x16xi1> loc(#loc205) + %cond_234 = arith.ori %cond_230, %cond_233 : tensor<1x16xi1> loc(#loc206) + %cond_235 = arith.extui %cond_234 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc207) + %cond_236 = arith.xori %cond_235, %flip_134 : tensor<1x16xi32> loc(#loc207) + %cond_237 = arith.cmpi ne, %cond_236, %cst_0 : tensor<1x16xi32> loc(#loc208) + %ret_238 = arith.xori %ileft_217, %iright_218 : tensor<1x16xi32> loc(#loc209) + %ret_239 = arith.select %cond_237, %ret_238, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_240 = arith.xori %ret_204, %ret_239 : tensor<1x16xi32> loc(#loc211) + %new_idxs_241 = arith.xori %left_idx_228, %right_idx_229 : tensor<1x16xi32> loc(#loc212) + %new_idxs_242 = arith.select %cond_237, %new_idxs_241, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_243 = arith.xori %new_idxs_207, %new_idxs_242 : tensor<1x16xi32> loc(#loc214) + %y_244 = tt.reshape %ret_240 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc177) + %ileft_245 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc179) + %ileft_246 = arith.muli %y_244, %ileft_245 : tensor<1x2x8xi32> loc(#loc179) + %ileft_247 = "tt.reduce"(%ileft_246) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc216) + %ileft_248 = tt.expand_dims %ileft_247 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc181) + %ileft_249 = tt.broadcast %ileft_248 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc182) + %iright_250 = arith.muli %y_244, %flip_133 : tensor<1x2x8xi32> loc(#loc183) + %iright_251 = "tt.reduce"(%iright_250) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc218) + %iright_252 = tt.expand_dims %iright_251 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc185) + %iright_253 = tt.broadcast %iright_252 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc186) + %ileft_254 = tt.reshape %ileft_249 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_255 = tt.reshape %iright_253 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_256 = tt.reshape %new_idxs_243 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc189) + %left_idx_257 = arith.muli %y_idx_256, %ileft_245 : tensor<1x2x8xi32> loc(#loc191) + %left_idx_258 = "tt.reduce"(%left_idx_257) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc221) + %left_idx_259 = tt.expand_dims %left_idx_258 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc193) + %left_idx_260 = tt.broadcast %left_idx_259 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc194) + %right_idx_261 = arith.muli %y_idx_256, %flip_133 : tensor<1x2x8xi32> loc(#loc196) + %right_idx_262 = "tt.reduce"(%right_idx_261) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc224) + %right_idx_263 = tt.expand_dims %right_idx_262 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc198) + %right_idx_264 = tt.broadcast %right_idx_263 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc199) + %left_idx_265 = tt.reshape %left_idx_260 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_266 = tt.reshape %right_idx_264 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_267 = arith.cmpi slt, %ileft_254, %iright_255 : tensor<1x16xi32> loc(#loc202) + %eq_268 = arith.cmpi eq, %ileft_254, %iright_255 : tensor<1x16xi32> loc(#loc203) + %cond_269 = arith.cmpi sgt, %left_idx_265, %right_idx_266 : tensor<1x16xi32> loc(#loc204) + %cond_270 = arith.andi %eq_268, %cond_269 : tensor<1x16xi1> loc(#loc205) + %cond_271 = arith.ori %cond_267, %cond_270 : tensor<1x16xi1> loc(#loc206) + %ret_272 = arith.xori %ileft_254, %iright_255 : tensor<1x16xi32> loc(#loc209) + %ret_273 = arith.select %cond_271, %ret_272, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_274 = arith.xori %ret_240, %ret_273 : tensor<1x16xi32> loc(#loc211) + %new_idxs_275 = arith.xori %left_idx_265, %right_idx_266 : tensor<1x16xi32> loc(#loc212) + %new_idxs_276 = arith.select %cond_271, %new_idxs_275, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_277 = arith.xori %new_idxs_243, %new_idxs_276 : tensor<1x16xi32> loc(#loc214) + %y_278 = tt.reshape %ret_274 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc177) + %ileft_279 = arith.muli %y_278, %ileft_136 : tensor<2x2x4xi32> loc(#loc179) + %ileft_280 = "tt.reduce"(%ileft_279) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc216) + %ileft_281 = tt.expand_dims %ileft_280 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc181) + %ileft_282 = tt.broadcast %ileft_281 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc182) + %iright_283 = arith.muli %y_278, %flip_58 : tensor<2x2x4xi32> loc(#loc183) + %iright_284 = "tt.reduce"(%iright_283) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc218) + %iright_285 = tt.expand_dims %iright_284 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc185) + %iright_286 = tt.broadcast %iright_285 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc186) + %ileft_287 = tt.reshape %ileft_282 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_288 = tt.reshape %iright_286 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_289 = tt.reshape %new_idxs_277 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc189) + %left_idx_290 = arith.muli %y_idx_289, %ileft_136 : tensor<2x2x4xi32> loc(#loc191) + %left_idx_291 = "tt.reduce"(%left_idx_290) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc221) + %left_idx_292 = tt.expand_dims %left_idx_291 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc193) + %left_idx_293 = tt.broadcast %left_idx_292 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc194) + %right_idx_294 = arith.muli %y_idx_289, %flip_58 : tensor<2x2x4xi32> loc(#loc196) + %right_idx_295 = "tt.reduce"(%right_idx_294) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc224) + %right_idx_296 = tt.expand_dims %right_idx_295 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc198) + %right_idx_297 = tt.broadcast %right_idx_296 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc199) + %left_idx_298 = tt.reshape %left_idx_293 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_299 = tt.reshape %right_idx_297 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_300 = arith.cmpi slt, %ileft_287, %iright_288 : tensor<1x16xi32> loc(#loc202) + %eq_301 = arith.cmpi eq, %ileft_287, %iright_288 : tensor<1x16xi32> loc(#loc203) + %cond_302 = arith.cmpi sgt, %left_idx_298, %right_idx_299 : tensor<1x16xi32> loc(#loc204) + %cond_303 = arith.andi %eq_301, %cond_302 : tensor<1x16xi1> loc(#loc205) + %cond_304 = arith.ori %cond_300, %cond_303 : tensor<1x16xi1> loc(#loc206) + %ret_305 = arith.xori %ileft_287, %iright_288 : tensor<1x16xi32> loc(#loc209) + %ret_306 = arith.select %cond_304, %ret_305, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_307 = arith.xori %ret_274, %ret_306 : tensor<1x16xi32> loc(#loc211) + %new_idxs_308 = arith.xori %left_idx_298, %right_idx_299 : tensor<1x16xi32> loc(#loc212) + %new_idxs_309 = arith.select %cond_304, %new_idxs_308, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_310 = arith.xori %new_idxs_277, %new_idxs_309 : tensor<1x16xi32> loc(#loc214) + %y_311 = tt.reshape %ret_307 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc177) + %ileft_312 = arith.muli %y_311, %ileft_61 : tensor<4x2x2xi32> loc(#loc179) + %ileft_313 = "tt.reduce"(%ileft_312) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc216) + %ileft_314 = tt.expand_dims %ileft_313 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc181) + %ileft_315 = tt.broadcast %ileft_314 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc182) + %iright_316 = arith.muli %y_311, %flip_22 : tensor<4x2x2xi32> loc(#loc183) + %iright_317 = "tt.reduce"(%iright_316) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc218) + %iright_318 = tt.expand_dims %iright_317 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc185) + %iright_319 = tt.broadcast %iright_318 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc186) + %ileft_320 = tt.reshape %ileft_315 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_321 = tt.reshape %iright_319 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_322 = tt.reshape %new_idxs_310 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc189) + %left_idx_323 = arith.muli %y_idx_322, %ileft_61 : tensor<4x2x2xi32> loc(#loc191) + %left_idx_324 = "tt.reduce"(%left_idx_323) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc221) + %left_idx_325 = tt.expand_dims %left_idx_324 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc193) + %left_idx_326 = tt.broadcast %left_idx_325 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc194) + %right_idx_327 = arith.muli %y_idx_322, %flip_22 : tensor<4x2x2xi32> loc(#loc196) + %right_idx_328 = "tt.reduce"(%right_idx_327) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc224) + %right_idx_329 = tt.expand_dims %right_idx_328 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc198) + %right_idx_330 = tt.broadcast %right_idx_329 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc199) + %left_idx_331 = tt.reshape %left_idx_326 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_332 = tt.reshape %right_idx_330 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_333 = arith.cmpi slt, %ileft_320, %iright_321 : tensor<1x16xi32> loc(#loc202) + %eq_334 = arith.cmpi eq, %ileft_320, %iright_321 : tensor<1x16xi32> loc(#loc203) + %cond_335 = arith.cmpi sgt, %left_idx_331, %right_idx_332 : tensor<1x16xi32> loc(#loc204) + %cond_336 = arith.andi %eq_334, %cond_335 : tensor<1x16xi1> loc(#loc205) + %cond_337 = arith.ori %cond_333, %cond_336 : tensor<1x16xi1> loc(#loc206) + %ret_338 = arith.xori %ileft_320, %iright_321 : tensor<1x16xi32> loc(#loc209) + %ret_339 = arith.select %cond_337, %ret_338, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc210) + %ret_340 = arith.xori %ret_307, %ret_339 : tensor<1x16xi32> loc(#loc211) + %new_idxs_341 = arith.xori %left_idx_331, %right_idx_332 : tensor<1x16xi32> loc(#loc212) + %new_idxs_342 = arith.select %cond_337, %new_idxs_341, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_343 = arith.xori %new_idxs_310, %new_idxs_342 : tensor<1x16xi32> loc(#loc214) + %y_344 = tt.reshape %ret_340 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc177) + %ileft_345 = arith.muli %y_344, %ileft : tensor<8x2x1xi32> loc(#loc179) + %ileft_346 = "tt.reduce"(%ileft_345) <{axis = 1 : i32}> ({ + ^bb0(%ileft_376: i32 loc(callsite(#loc1 at #loc180)), %ileft_377: i32 loc(callsite(#loc1 at #loc180))): + %ileft_378 = arith.addi %ileft_376, %ileft_377 : i32 loc(#loc226) + tt.reduce.return %ileft_378 : i32 loc(#loc216) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc216) + %ileft_347 = tt.expand_dims %ileft_346 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc181) + %ileft_348 = tt.broadcast %ileft_347 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc182) + %iright_349 = arith.muli %y_344, %iright : tensor<8x2x1xi32> loc(#loc183) + %iright_350 = "tt.reduce"(%iright_349) <{axis = 1 : i32}> ({ + ^bb0(%iright_376: i32 loc(callsite(#loc1 at #loc184)), %iright_377: i32 loc(callsite(#loc1 at #loc184))): + %iright_378 = arith.addi %iright_376, %iright_377 : i32 loc(#loc227) + tt.reduce.return %iright_378 : i32 loc(#loc218) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc218) + %iright_351 = tt.expand_dims %iright_350 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc185) + %iright_352 = tt.broadcast %iright_351 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc186) + %ileft_353 = tt.reshape %ileft_348 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc187) + %iright_354 = tt.reshape %iright_352 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc188) + %y_idx_355 = tt.reshape %new_idxs_343 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc189) + %left_idx_356 = arith.muli %y_idx_355, %ileft : tensor<8x2x1xi32> loc(#loc191) + %left_idx_357 = "tt.reduce"(%left_idx_356) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_376: i32 loc(callsite(#loc1 at #loc192)), %left_idx_377: i32 loc(callsite(#loc1 at #loc192))): + %left_idx_378 = arith.addi %left_idx_376, %left_idx_377 : i32 loc(#loc228) + tt.reduce.return %left_idx_378 : i32 loc(#loc221) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc221) + %left_idx_358 = tt.expand_dims %left_idx_357 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc193) + %left_idx_359 = tt.broadcast %left_idx_358 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc194) + %right_idx_360 = arith.muli %y_idx_355, %iright : tensor<8x2x1xi32> loc(#loc196) + %right_idx_361 = "tt.reduce"(%right_idx_360) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_376: i32 loc(callsite(#loc1 at #loc197)), %right_idx_377: i32 loc(callsite(#loc1 at #loc197))): + %right_idx_378 = arith.addi %right_idx_376, %right_idx_377 : i32 loc(#loc229) + tt.reduce.return %right_idx_378 : i32 loc(#loc224) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc224) + %right_idx_362 = tt.expand_dims %right_idx_361 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc198) + %right_idx_363 = tt.broadcast %right_idx_362 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc199) + %left_idx_364 = tt.reshape %left_idx_359 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc200) + %right_idx_365 = tt.reshape %right_idx_363 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc201) + %cond_366 = arith.cmpi slt, %ileft_353, %iright_354 : tensor<1x16xi32> loc(#loc202) + %eq_367 = arith.cmpi eq, %ileft_353, %iright_354 : tensor<1x16xi32> loc(#loc203) + %cond_368 = arith.cmpi sgt, %left_idx_364, %right_idx_365 : tensor<1x16xi32> loc(#loc204) + %cond_369 = arith.andi %eq_367, %cond_368 : tensor<1x16xi1> loc(#loc205) + %cond_370 = arith.ori %cond_366, %cond_369 : tensor<1x16xi1> loc(#loc206) + %new_idxs_371 = arith.xori %left_idx_364, %right_idx_365 : tensor<1x16xi32> loc(#loc212) + %new_idxs_372 = arith.select %cond_370, %new_idxs_371, %cst_0 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc213) + %new_idxs_373 = arith.xori %new_idxs_343, %new_idxs_372 : tensor<1x16xi32> loc(#loc214) + %tmp7 = arith.extsi %tmp0_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc157) + %tmp10_374 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc95) + %tmp11 = "tt.reduce"(%tmp10_374) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_376: i64 loc(callsite(#loc1 at #loc158)), %tmp11_377: i64 loc(callsite(#loc1 at #loc158))): + %tmp11_378 = arith.addi %tmp11_376, %tmp11_377 : i64 loc(#loc215) + tt.reduce.return %tmp11_378 : i64 loc(#loc175) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc175) + %tmp11_375 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc159) + %tmp14 = arith.trunci %tmp11_375 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc160) + %0 = arith.muli %x0_3, %c16_i64 : i64 loc(#loc72) + %1 = tt.splat %0 : i64 -> tensor<1x16xi64> loc(#loc161) + %2 = arith.addi %tmp0, %1 : tensor<1x16xi64> loc(#loc73) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc74) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc75) + %5 = arith.extui %4 : i1 to i64 loc(#loc76) + %6 = arith.muli %ks0, %5 : i64 loc(#loc76) + %7 = arith.extui %3 : i1 to i64 loc(#loc162) + %8 = arith.addi %7, %6 : i64 loc(#loc77) + %9 = arith.muli %tmp0_6, %8 : i64 loc(#loc79) + %10 = tt.splat %9 : i64 -> tensor<1x16xi64> loc(#loc163) + %11 = arith.addi %2, %10 : tensor<1x16xi64> loc(#loc80) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc81) + %13 = tt.addptr %12, %11 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc81) + tt.store %13, %new_idxs_373, %tmp0_18 : tensor<1x16x!tt.ptr> loc(#loc82) + %14 = arith.muli %x1, %8 : i64 loc(#loc83) + %15 = arith.addi %x0_3, %14 : i64 loc(#loc84) + %16 = tt.addptr %out_ptr3, %15 : !tt.ptr, i64 loc(#loc85) + %17 = tt.splat %16 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc85) + tt.store %17, %tmp14, %xmask_1 : tensor<1x1x!tt.ptr> loc(#loc86) + tt.return loc(#loc87) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc94 = loc(callsite(#loc1 at #loc2)) +#loc95 = loc("tmp10"(#loc3)) +#loc96 = loc("xoffset"(#loc4)) +#loc97 = loc("xmask"(#loc5)) +#loc98 = loc("r0_index"(#loc6)) +#loc99 = loc("r0_index"(#loc7)) +#loc100 = loc("x0"(#loc8)) +#loc101 = loc("x1"(#loc9)) +#loc102 = loc("tmp0"(#loc10)) +#loc103 = loc("tmp0"(#loc11)) +#loc104 = loc("tmp0"(#loc12)) +#loc105 = loc("tmp0"(#loc13)) +#loc106 = loc("tmp0"(#loc14)) +#loc107 = loc("tmp0"(#loc15)) +#loc108 = loc("tmp0"(#loc16)) +#loc109 = loc("tmp0"(#loc17)) +#loc110 = loc("tmp0"(#loc18)) +#loc111 = loc("tmp0"(#loc19)) +#loc112 = loc("tmp2"(#loc20)) +#loc113 = loc("flip"(#loc21)) +#loc115 = loc("flip"(#loc23)) +#loc116 = loc("flip"(#loc24)) +#loc117 = loc("flip"(#loc25)) +#loc118 = loc("y"(#loc26)) +#loc119 = loc("left_mask"(#loc28)) +#loc120 = loc("ileft"(#loc29)) +#loc122 = loc("ileft"(#loc33)) +#loc123 = loc("ileft"(#loc34)) +#loc124 = loc("iright"(#loc35)) +#loc126 = loc("iright"(#loc37)) +#loc127 = loc("iright"(#loc38)) +#loc128 = loc("ileft"(#loc39)) +#loc129 = loc("iright"(#loc40)) +#loc130 = loc("y_idx"(#loc41)) +#loc131 = loc("left_idx"(#loc42)) +#loc132 = loc("left_idx"(#loc43)) +#loc133 = loc("input"(#loc44)) +#loc135 = loc("left_idx"(#loc46)) +#loc136 = loc("left_idx"(#loc47)) +#loc137 = loc("right_idx"(#loc48)) +#loc138 = loc("right_idx"(#loc49)) +#loc140 = loc("right_idx"(#loc51)) +#loc141 = loc("right_idx"(#loc52)) +#loc142 = loc("left_idx"(#loc53)) +#loc143 = loc("right_idx"(#loc54)) +#loc144 = loc("cond"(#loc55)) +#loc145 = loc("eq"(#loc56)) +#loc146 = loc("cond"(#loc57)) +#loc147 = loc("cond"(#loc58)) +#loc148 = loc("cond"(#loc59)) +#loc149 = loc("cond"(#loc60)) +#loc150 = loc("cond"(#loc61)) +#loc151 = loc("ret"(#loc62)) +#loc152 = loc("ret"(#loc63)) +#loc153 = loc("ret"(#loc64)) +#loc154 = loc("new_idxs"(#loc65)) +#loc155 = loc("new_idxs"(#loc66)) +#loc156 = loc("new_idxs"(#loc67)) +#loc157 = loc("tmp7"(#loc68)) +#loc159 = loc("tmp11"(#loc70)) +#loc160 = loc("tmp14"(#loc71)) +#loc161 = loc(fused[#loc73, #loc72]) +#loc162 = loc(fused[#loc77, #loc78]) +#loc163 = loc(fused[#loc80, #loc79]) +#loc164 = loc(fused[#loc102, #loc100]) +#loc165 = loc(fused[#loc104, #loc103]) +#loc166 = loc(fused[#loc109, #loc108]) +#loc167 = loc(fused[#loc111, #loc97]) +#loc168 = loc(callsite(#loc113 at #loc114)) +#loc169 = loc(callsite(#loc115 at #loc114)) +#loc170 = loc(callsite(#loc116 at #loc114)) +#loc171 = loc(callsite(#loc117 at #loc114)) +#loc173 = loc("cond"(#loc144)) +#loc174 = loc("eq"(#loc145)) +#loc175 = loc(callsite(#loc30 at #loc158)) +#loc177 = loc(callsite(#loc118 at #loc172)) +#loc178 = loc(callsite(#loc119 at #loc172)) +#loc179 = loc(callsite(#loc120 at #loc172)) +#loc181 = loc(callsite(#loc122 at #loc172)) +#loc182 = loc(callsite(#loc123 at #loc172)) +#loc183 = loc(callsite(#loc124 at #loc172)) +#loc185 = loc(callsite(#loc126 at #loc172)) +#loc186 = loc(callsite(#loc127 at #loc172)) +#loc187 = loc(callsite(#loc128 at #loc172)) +#loc188 = loc(callsite(#loc129 at #loc172)) +#loc189 = loc(callsite(#loc130 at #loc172)) +#loc190 = loc(callsite(#loc131 at #loc172)) +#loc191 = loc(callsite(#loc132 at #loc172)) +#loc193 = loc(callsite(#loc135 at #loc172)) +#loc194 = loc(callsite(#loc136 at #loc172)) +#loc195 = loc(callsite(#loc137 at #loc172)) +#loc196 = loc(callsite(#loc138 at #loc172)) +#loc198 = loc(callsite(#loc140 at #loc172)) +#loc199 = loc(callsite(#loc141 at #loc172)) +#loc200 = loc(callsite(#loc142 at #loc172)) +#loc201 = loc(callsite(#loc143 at #loc172)) +#loc202 = loc(callsite(#loc173 at #loc172)) +#loc203 = loc(callsite(#loc174 at #loc172)) +#loc204 = loc(callsite(#loc146 at #loc172)) +#loc205 = loc(callsite(#loc147 at #loc172)) +#loc206 = loc(callsite(#loc148 at #loc172)) +#loc207 = loc(callsite(#loc149 at #loc172)) +#loc208 = loc(callsite(#loc150 at #loc172)) +#loc209 = loc(callsite(#loc151 at #loc172)) +#loc210 = loc(callsite(#loc152 at #loc172)) +#loc211 = loc(callsite(#loc153 at #loc172)) +#loc212 = loc(callsite(#loc154 at #loc172)) +#loc213 = loc(callsite(#loc155 at #loc172)) +#loc214 = loc(callsite(#loc156 at #loc172)) +#loc215 = loc(callsite(#loc32 at #loc175)) +#loc216 = loc(callsite(#loc30 at #loc180)) +#loc218 = loc(callsite(#loc30 at #loc184)) +#loc220 = loc(callsite(#loc133 at #loc192)) +#loc221 = loc(callsite(#loc30 at #loc192)) +#loc223 = loc(callsite(#loc133 at #loc197)) +#loc224 = loc(callsite(#loc30 at #loc197)) +#loc226 = loc(callsite(#loc32 at #loc216)) +#loc227 = loc(callsite(#loc32 at #loc218)) +#loc228 = loc(callsite(#loc32 at #loc221)) +#loc229 = loc(callsite(#loc32 at #loc224)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc00ea61b25c32cd0ebf68ba2eaeaa477ea3299 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..5b5ca7e2a750d2ff97660adb9e6ea7be5c9e89fb Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d2273e7d11ec303a27dfd693d32ccdd0c664d207 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "31322f85a2b960030b287234182c00a1a5c45689e40c21b0b426d8ccb0c16251", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..753b6c2d77f9a710c335f18ba7e9088918012f3c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,934 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = shl nuw nsw i32 %8, 2, !dbg !9 + %10 = and i32 %9, 2044, !dbg !9 + %11 = mul i32 %7, 32000, !dbg !10 + %12 = zext nneg i32 %10 to i64, !dbg !11 + br label %13, !dbg !11 + +13: ; preds = %6, %13 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %13 ] + %14 = phi <2 x float> [ zeroinitializer, %6 ], [ %271, %13 ] + %15 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %269, %13 ] + %16 = phi <2 x float> [ zeroinitializer, %6 ], [ %270, %13 ] + %17 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %268, %13 ] + %18 = or disjoint i64 %indvars.iv, %12, !dbg !12 + %19 = icmp samesign ult i64 %18, 32000, !dbg !13 + %20 = trunc nuw nsw i64 %18 to i32, !dbg !14 + %21 = add i32 %11, %20, !dbg !14 + %22 = sext i32 %21 to i64, !dbg !15 + %23 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !15 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %25 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %23, i64 %24, i1 %19) #6, !dbg !16 + %26 = extractvalue { i32, i32 } %25, 0, !dbg !16 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !16 + %28 = extractvalue { i32, i32 } %25, 1, !dbg !16 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !16 + %30 = fcmp uno <2 x float> %17, zeroinitializer, !dbg !17 + %31 = fcmp uno <2 x float> %15, zeroinitializer, !dbg !17 + %32 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i72 = icmp eq i32 %32, 0, !dbg !21 + %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i74 = icmp eq i32 %33, 0, !dbg !21 + %34 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i78 = icmp eq i32 %35, 0, !dbg !21 + %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i80 = icmp eq i32 %36, 0, !dbg !21 + %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i82 = icmp eq i32 %37, 0, !dbg !21 + %38 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i84 = icmp eq i32 %38, 0, !dbg !21 + %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i88 = icmp eq i32 %40, 0, !dbg !21 + %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i90 = icmp eq i32 %41, 0, !dbg !21 + %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i92 = icmp eq i32 %42, 0, !dbg !21 + %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i94 = icmp eq i32 %43, 0, !dbg !21 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i98 = icmp eq i32 %45, 0, !dbg !21 + %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i100 = icmp eq i32 %46, 0, !dbg !21 + %47 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i102 = icmp eq i32 %47, 0, !dbg !21 + %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i104 = icmp eq i32 %48, 0, !dbg !21 + %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %50 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i108 = icmp eq i32 %50, 0, !dbg !21 + %51 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i110 = icmp eq i32 %51, 0, !dbg !21 + %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i112 = icmp eq i32 %52, 0, !dbg !21 + %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i114 = icmp eq i32 %53, 0, !dbg !21 + %54 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i118 = icmp eq i32 %55, 0, !dbg !21 + %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i120 = icmp eq i32 %56, 0, !dbg !21 + %57 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i122 = icmp eq i32 %57, 0, !dbg !21 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i124 = icmp eq i32 %58, 0, !dbg !21 + %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i128 = icmp eq i32 %60, 0, !dbg !21 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i130 = icmp eq i32 %61, 0, !dbg !21 + %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i132 = icmp eq i32 %62, 0, !dbg !21 + %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i134 = icmp eq i32 %63, 0, !dbg !21 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i138 = icmp eq i32 %65, 0, !dbg !21 + %66 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i140 = icmp eq i32 %66, 0, !dbg !21 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i142 = icmp eq i32 %67, 0, !dbg !21 + %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i144 = icmp eq i32 %68, 0, !dbg !21 + %69 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i148 = icmp eq i32 %70, 0, !dbg !21 + %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i150 = icmp eq i32 %71, 0, !dbg !21 + %72 = fpext <2 x bfloat> %27 to <2 x float>, !dbg !22 + %73 = fcmp ogt <2 x float> %17, %72, !dbg !23 + %74 = or <2 x i1> %30, %73, !dbg !24 + %75 = select <2 x i1> %74, <2 x float> %17, <2 x float> %72, !dbg !25 + %76 = fcmp oeq <2 x float> %75, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop = fsub <2 x float> %17, %75, !dbg !27 + %77 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !27 + %foldExtExtBinop177 = fsub <2 x float> %17, %75, !dbg !27 + %78 = extractelement <2 x float> %foldExtExtBinop177, i64 1, !dbg !27 + %79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %80 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i73 = select i1 %.not.i72, float %80, float %79, !dbg !21 + %81 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i73) #6, !dbg !21 + %82 = tail call float @llvm.nvvm.saturate.f(float %.02.i73) #6, !dbg !21 + %.03.i75 = select i1 %.not1.i74, float %82, float %81, !dbg !21 + %83 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %84 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %86 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i83 = select i1 %.not.i82, float %86, float %85, !dbg !21 + %87 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i83) #6, !dbg !21 + %88 = tail call float @llvm.nvvm.saturate.f(float %.02.i83) #6, !dbg !21 + %.03.i85 = select i1 %.not1.i84, float %88, float %87, !dbg !21 + %89 = insertelement <2 x i32> poison, i32 %34, i64 0, !dbg !21 + %90 = insertelement <2 x i32> %89, i32 %39, i64 1, !dbg !21 + %91 = icmp eq <2 x i32> %90, zeroinitializer, !dbg !21 + %92 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %93 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %94 = insertelement <2 x float> poison, float %84, i64 0, !dbg !21 + %95 = insertelement <2 x float> %94, float %93, i64 1, !dbg !21 + %96 = insertelement <2 x float> poison, float %83, i64 0, !dbg !21 + %97 = insertelement <2 x float> %96, float %92, i64 1, !dbg !21 + %98 = select <2 x i1> %91, <2 x float> %95, <2 x float> %97, !dbg !21 + %99 = extractelement <2 x float> %98, i64 0, !dbg !21 + %100 = fadd float %99, 0xC168000FE0000000, !dbg !21 + %101 = fneg float %100, !dbg !21 + %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %103 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3FF7154760000000, float %101) #6, !dbg !21 + %.0.i79 = select i1 %.not3.i78, float %103, float %102, !dbg !21 + %104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %105 = tail call float @llvm.nvvm.fma.rn.f(float %77, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !21 + %.01.i81 = select i1 %.not4.i80, float %105, float %104, !dbg !21 + %106 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i81) #6, !dbg !21 + %107 = extractelement <2 x float> %98, i64 1, !dbg !21 + %108 = fadd float %107, 0xC168000FE0000000, !dbg !21 + %109 = fneg float %108, !dbg !21 + %110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %111 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3FF7154760000000, float %109) #6, !dbg !21 + %.0.i89 = select i1 %.not3.i88, float %111, float %110, !dbg !21 + %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %113 = tail call float @llvm.nvvm.fma.rn.f(float %78, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !21 + %.01.i91 = select i1 %.not4.i90, float %113, float %112, !dbg !21 + %114 = bitcast <2 x float> %98 to <2 x i32>, !dbg !21 + %115 = shl <2 x i32> %114, splat (i32 23), !dbg !21 + %116 = bitcast <2 x i32> %115 to <2 x float>, !dbg !21 + %117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i91) #6, !dbg !21 + %118 = insertelement <2 x float> poison, float %106, i64 0, !dbg !21 + %119 = insertelement <2 x float> %118, float %117, i64 1, !dbg !21 + %120 = fmul <2 x float> %119, %116, !dbg !21 + %121 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %120, !dbg !28 + %foldExtExtBinop179 = fsub <2 x float> %72, %75, !dbg !29 + %122 = extractelement <2 x float> %foldExtExtBinop179, i64 0, !dbg !29 + %foldExtExtBinop181 = fsub <2 x float> %72, %75, !dbg !29 + %123 = extractelement <2 x float> %foldExtExtBinop181, i64 1, !dbg !29 + %124 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %125 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i113 = select i1 %.not.i112, float %125, float %124, !dbg !21 + %126 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i113) #6, !dbg !21 + %127 = tail call float @llvm.nvvm.saturate.f(float %.02.i113) #6, !dbg !21 + %.03.i115 = select i1 %.not1.i114, float %127, float %126, !dbg !21 + %128 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %129 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %131 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i123 = select i1 %.not.i122, float %131, float %130, !dbg !21 + %132 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i123) #6, !dbg !21 + %133 = tail call float @llvm.nvvm.saturate.f(float %.02.i123) #6, !dbg !21 + %.03.i125 = select i1 %.not1.i124, float %133, float %132, !dbg !21 + %134 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !21 + %135 = insertelement <2 x i32> %134, i32 %59, i64 1, !dbg !21 + %136 = icmp eq <2 x i32> %135, zeroinitializer, !dbg !21 + %137 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %138 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %139 = insertelement <2 x float> poison, float %129, i64 0, !dbg !21 + %140 = insertelement <2 x float> %139, float %138, i64 1, !dbg !21 + %141 = insertelement <2 x float> poison, float %128, i64 0, !dbg !21 + %142 = insertelement <2 x float> %141, float %137, i64 1, !dbg !21 + %143 = select <2 x i1> %136, <2 x float> %140, <2 x float> %142, !dbg !21 + %144 = extractelement <2 x float> %143, i64 0, !dbg !21 + %145 = fadd float %144, 0xC168000FE0000000, !dbg !21 + %146 = fneg float %145, !dbg !21 + %147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %148 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3FF7154760000000, float %146) #6, !dbg !21 + %.0.i119 = select i1 %.not3.i118, float %148, float %147, !dbg !21 + %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %150 = tail call float @llvm.nvvm.fma.rn.f(float %122, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !21 + %.01.i121 = select i1 %.not4.i120, float %150, float %149, !dbg !21 + %151 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i121) #6, !dbg !21 + %152 = extractelement <2 x float> %143, i64 1, !dbg !21 + %153 = fadd float %152, 0xC168000FE0000000, !dbg !21 + %154 = fneg float %153, !dbg !21 + %155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %156 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3FF7154760000000, float %154) #6, !dbg !21 + %.0.i129 = select i1 %.not3.i128, float %156, float %155, !dbg !21 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %123, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !21 + %.01.i131 = select i1 %.not4.i130, float %158, float %157, !dbg !21 + %159 = bitcast <2 x float> %143 to <2 x i32>, !dbg !21 + %160 = shl <2 x i32> %159, splat (i32 23), !dbg !21 + %161 = bitcast <2 x i32> %160 to <2 x float>, !dbg !21 + %162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i131) #6, !dbg !21 + %163 = insertelement <2 x float> poison, float %151, i64 0, !dbg !21 + %164 = insertelement <2 x float> %163, float %162, i64 1, !dbg !21 + %165 = fmul <2 x float> %164, %161, !dbg !21 + %166 = select <2 x i1> %76, <2 x float> splat (float 1.000000e+00), <2 x float> %165, !dbg !30 + %167 = fmul <2 x float> %16, %121, !dbg !31 + %168 = fadd <2 x float> %167, %166, !dbg !32 + %169 = fpext <2 x bfloat> %29 to <2 x float>, !dbg !22 + %170 = fcmp ogt <2 x float> %15, %169, !dbg !23 + %171 = or <2 x i1> %31, %170, !dbg !24 + %172 = select <2 x i1> %171, <2 x float> %15, <2 x float> %169, !dbg !25 + %173 = fcmp oeq <2 x float> %172, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop183 = fsub <2 x float> %15, %172, !dbg !27 + %174 = extractelement <2 x float> %foldExtExtBinop183, i64 0, !dbg !27 + %foldExtExtBinop185 = fsub <2 x float> %15, %172, !dbg !27 + %175 = extractelement <2 x float> %foldExtExtBinop185, i64 1, !dbg !27 + %176 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %177 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i93 = select i1 %.not.i92, float %177, float %176, !dbg !21 + %178 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i93) #6, !dbg !21 + %179 = tail call float @llvm.nvvm.saturate.f(float %.02.i93) #6, !dbg !21 + %.03.i95 = select i1 %.not1.i94, float %179, float %178, !dbg !21 + %180 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %181 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %182 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %183 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i103 = select i1 %.not.i102, float %183, float %182, !dbg !21 + %184 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i103) #6, !dbg !21 + %185 = tail call float @llvm.nvvm.saturate.f(float %.02.i103) #6, !dbg !21 + %.03.i105 = select i1 %.not1.i104, float %185, float %184, !dbg !21 + %186 = insertelement <2 x i32> poison, i32 %44, i64 0, !dbg !21 + %187 = insertelement <2 x i32> %186, i32 %49, i64 1, !dbg !21 + %188 = icmp eq <2 x i32> %187, zeroinitializer, !dbg !21 + %189 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %190 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %191 = insertelement <2 x float> poison, float %181, i64 0, !dbg !21 + %192 = insertelement <2 x float> %191, float %190, i64 1, !dbg !21 + %193 = insertelement <2 x float> poison, float %180, i64 0, !dbg !21 + %194 = insertelement <2 x float> %193, float %189, i64 1, !dbg !21 + %195 = select <2 x i1> %188, <2 x float> %192, <2 x float> %194, !dbg !21 + %196 = extractelement <2 x float> %195, i64 0, !dbg !21 + %197 = fadd float %196, 0xC168000FE0000000, !dbg !21 + %198 = fneg float %197, !dbg !21 + %199 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %200 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3FF7154760000000, float %198) #6, !dbg !21 + %.0.i99 = select i1 %.not3.i98, float %200, float %199, !dbg !21 + %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %202 = tail call float @llvm.nvvm.fma.rn.f(float %174, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !21 + %.01.i101 = select i1 %.not4.i100, float %202, float %201, !dbg !21 + %203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i101) #6, !dbg !21 + %204 = extractelement <2 x float> %195, i64 1, !dbg !21 + %205 = fadd float %204, 0xC168000FE0000000, !dbg !21 + %206 = fneg float %205, !dbg !21 + %207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %208 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3FF7154760000000, float %206) #6, !dbg !21 + %.0.i109 = select i1 %.not3.i108, float %208, float %207, !dbg !21 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %175, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !21 + %.01.i111 = select i1 %.not4.i110, float %210, float %209, !dbg !21 + %211 = bitcast <2 x float> %195 to <2 x i32>, !dbg !21 + %212 = shl <2 x i32> %211, splat (i32 23), !dbg !21 + %213 = bitcast <2 x i32> %212 to <2 x float>, !dbg !21 + %214 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i111) #6, !dbg !21 + %215 = insertelement <2 x float> poison, float %203, i64 0, !dbg !21 + %216 = insertelement <2 x float> %215, float %214, i64 1, !dbg !21 + %217 = fmul <2 x float> %216, %213, !dbg !21 + %218 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %217, !dbg !28 + %foldExtExtBinop187 = fsub <2 x float> %169, %172, !dbg !29 + %219 = extractelement <2 x float> %foldExtExtBinop187, i64 0, !dbg !29 + %foldExtExtBinop189 = fsub <2 x float> %169, %172, !dbg !29 + %220 = extractelement <2 x float> %foldExtExtBinop189, i64 1, !dbg !29 + %221 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %222 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i133 = select i1 %.not.i132, float %222, float %221, !dbg !21 + %223 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i133) #6, !dbg !21 + %224 = tail call float @llvm.nvvm.saturate.f(float %.02.i133) #6, !dbg !21 + %.03.i135 = select i1 %.not1.i134, float %224, float %223, !dbg !21 + %225 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %226 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %228 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i143 = select i1 %.not.i142, float %228, float %227, !dbg !21 + %229 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i143) #6, !dbg !21 + %230 = tail call float @llvm.nvvm.saturate.f(float %.02.i143) #6, !dbg !21 + %.03.i145 = select i1 %.not1.i144, float %230, float %229, !dbg !21 + %231 = insertelement <2 x i32> poison, i32 %64, i64 0, !dbg !21 + %232 = insertelement <2 x i32> %231, i32 %69, i64 1, !dbg !21 + %233 = icmp eq <2 x i32> %232, zeroinitializer, !dbg !21 + %234 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %235 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %236 = insertelement <2 x float> poison, float %226, i64 0, !dbg !21 + %237 = insertelement <2 x float> %236, float %235, i64 1, !dbg !21 + %238 = insertelement <2 x float> poison, float %225, i64 0, !dbg !21 + %239 = insertelement <2 x float> %238, float %234, i64 1, !dbg !21 + %240 = select <2 x i1> %233, <2 x float> %237, <2 x float> %239, !dbg !21 + %241 = extractelement <2 x float> %240, i64 0, !dbg !21 + %242 = fadd float %241, 0xC168000FE0000000, !dbg !21 + %243 = fneg float %242, !dbg !21 + %244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %245 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3FF7154760000000, float %243) #6, !dbg !21 + %.0.i139 = select i1 %.not3.i138, float %245, float %244, !dbg !21 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %219, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !21 + %.01.i141 = select i1 %.not4.i140, float %247, float %246, !dbg !21 + %248 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i141) #6, !dbg !21 + %249 = extractelement <2 x float> %240, i64 1, !dbg !21 + %250 = fadd float %249, 0xC168000FE0000000, !dbg !21 + %251 = fneg float %250, !dbg !21 + %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %253 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3FF7154760000000, float %251) #6, !dbg !21 + %.0.i149 = select i1 %.not3.i148, float %253, float %252, !dbg !21 + %254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %255 = tail call float @llvm.nvvm.fma.rn.f(float %220, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !21 + %.01.i151 = select i1 %.not4.i150, float %255, float %254, !dbg !21 + %256 = bitcast <2 x float> %240 to <2 x i32>, !dbg !21 + %257 = shl <2 x i32> %256, splat (i32 23), !dbg !21 + %258 = bitcast <2 x i32> %257 to <2 x float>, !dbg !21 + %259 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i151) #6, !dbg !21 + %260 = insertelement <2 x float> poison, float %248, i64 0, !dbg !21 + %261 = insertelement <2 x float> %260, float %259, i64 1, !dbg !21 + %262 = fmul <2 x float> %261, %258, !dbg !21 + %263 = select <2 x i1> %173, <2 x float> splat (float 1.000000e+00), <2 x float> %262, !dbg !30 + %264 = fmul <2 x float> %14, %218, !dbg !31 + %265 = fadd <2 x float> %264, %263, !dbg !32 + %266 = insertelement <2 x i1> poison, i1 %19, i64 0, !dbg !33 + %267 = shufflevector <2 x i1> %266, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %268 = select <2 x i1> %267, <2 x float> %75, <2 x float> %17, !dbg !33 + %269 = select <2 x i1> %267, <2 x float> %172, <2 x float> %15, !dbg !33 + %270 = select <2 x i1> %267, <2 x float> %168, <2 x float> %16, !dbg !34 + %271 = select <2 x i1> %267, <2 x float> %265, <2 x float> %14, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !11 + %272 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !11 + br i1 %272, label %13, label %273, !dbg !11 + +273: ; preds = %13 + %274 = and i32 %8, 31, !dbg !9 + %275 = lshr i32 %8, 5, !dbg !9 + %276 = extractelement <2 x float> %268, i64 0, !dbg !35 + %277 = extractelement <2 x float> %268, i64 1, !dbg !35 + %278 = fcmp ogt float %276, %277, !dbg !35 + %279 = fcmp uno float %276, 0.000000e+00, !dbg !37 + %280 = or i1 %278, %279, !dbg !38 + %281 = select i1 %280, float %276, float %277, !dbg !39 + %282 = extractelement <2 x float> %269, i64 0, !dbg !35 + %283 = fcmp ogt float %281, %282, !dbg !35 + %284 = fcmp uno float %281, 0.000000e+00, !dbg !37 + %285 = or i1 %283, %284, !dbg !38 + %286 = select i1 %285, float %281, float %282, !dbg !39 + %287 = extractelement <2 x float> %269, i64 1, !dbg !35 + %288 = fcmp ogt float %286, %287, !dbg !35 + %289 = fcmp uno float %286, 0.000000e+00, !dbg !37 + %290 = or i1 %288, %289, !dbg !38 + %291 = select i1 %290, float %286, float %287, !dbg !39 + %292 = bitcast float %291 to i32, !dbg !40 + %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 16, i32 31), !dbg !40 + %294 = bitcast i32 %293 to float, !dbg !40 + %295 = fcmp ogt float %291, %294, !dbg !35 + %296 = fcmp uno float %291, 0.000000e+00, !dbg !37 + %297 = or i1 %296, %295, !dbg !38 + %298 = select i1 %297, float %291, float %294, !dbg !39 + %299 = bitcast float %298 to i32, !dbg !40 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %299, i32 8, i32 31), !dbg !40 + %301 = bitcast i32 %300 to float, !dbg !40 + %302 = fcmp ogt float %298, %301, !dbg !35 + %303 = fcmp uno float %298, 0.000000e+00, !dbg !37 + %304 = or i1 %302, %303, !dbg !38 + %305 = select i1 %304, float %298, float %301, !dbg !39 + %306 = bitcast float %305 to i32, !dbg !40 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 4, i32 31), !dbg !40 + %308 = bitcast i32 %307 to float, !dbg !40 + %309 = fcmp ogt float %305, %308, !dbg !35 + %310 = fcmp uno float %305, 0.000000e+00, !dbg !37 + %311 = or i1 %309, %310, !dbg !38 + %312 = select i1 %311, float %305, float %308, !dbg !39 + %313 = bitcast float %312 to i32, !dbg !40 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 2, i32 31), !dbg !40 + %315 = bitcast i32 %314 to float, !dbg !40 + %316 = fcmp ogt float %312, %315, !dbg !35 + %317 = fcmp uno float %312, 0.000000e+00, !dbg !37 + %318 = or i1 %316, %317, !dbg !38 + %319 = select i1 %318, float %312, float %315, !dbg !39 + %320 = bitcast float %319 to i32, !dbg !40 + %321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 1, i32 31), !dbg !40 + %322 = bitcast i32 %321 to float, !dbg !40 + %323 = fcmp ogt float %319, %322, !dbg !35 + %324 = fcmp uno float %319, 0.000000e+00, !dbg !37 + %325 = or i1 %323, %324, !dbg !38 + %326 = and i32 %275, 15, !dbg !40 + %327 = icmp eq i32 %274, 0, !dbg !40 + %328 = getelementptr float, ptr addrspace(3) @global_smem, i32 %326, !dbg !40 + %329 = select i1 %325, i32 %320, i32 %321, !dbg !39 + %330 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %330, i1 %327) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %331 = icmp samesign ult i32 %8, 16, !dbg !40 + %332 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !40 + %333 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !40 + %334 = bitcast i32 %333 to float, !dbg !40 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 8, i32 31), !dbg !40 + %336 = bitcast i32 %335 to float, !dbg !40 + %337 = fcmp ogt float %334, %336, !dbg !35 + %338 = fcmp uno float %334, 0.000000e+00, !dbg !37 + %339 = or i1 %338, %337, !dbg !38 + %340 = select i1 %339, float %334, float %336, !dbg !39 + %341 = bitcast float %340 to i32, !dbg !40 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %341, i32 4, i32 31), !dbg !40 + %343 = bitcast i32 %342 to float, !dbg !40 + %344 = fcmp ogt float %340, %343, !dbg !35 + %345 = fcmp uno float %340, 0.000000e+00, !dbg !37 + %346 = or i1 %344, %345, !dbg !38 + %347 = select i1 %346, float %340, float %343, !dbg !39 + %348 = bitcast float %347 to i32, !dbg !40 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !40 + %350 = bitcast i32 %349 to float, !dbg !40 + %351 = fcmp ogt float %347, %350, !dbg !35 + %352 = fcmp uno float %347, 0.000000e+00, !dbg !37 + %353 = or i1 %351, %352, !dbg !38 + %354 = select i1 %353, float %347, float %350, !dbg !39 + %355 = bitcast float %354 to i32, !dbg !40 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %355, i32 1, i32 31), !dbg !40 + %357 = bitcast i32 %356 to float, !dbg !40 + %358 = fcmp ogt float %354, %357, !dbg !35 + %359 = fcmp uno float %354, 0.000000e+00, !dbg !37 + %360 = or i1 %358, %359, !dbg !38 + %361 = icmp eq i32 %8, 0, !dbg !40 + %362 = select i1 %360, i32 %355, i32 %356, !dbg !39 + %363 = insertelement <1 x i32> poison, i32 %362, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %363, i1 %361) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %364 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !40 + %365 = fcmp oeq float %364, 0xFFF0000000000000, !dbg !41 + %366 = fsub float %276, %364, !dbg !42 + %367 = fsub float %277, %364, !dbg !42 + %368 = fsub float %282, %364, !dbg !42 + %369 = fsub float %287, %364, !dbg !42 + %370 = select i1 %365, float 0.000000e+00, float %366, !dbg !43 + %371 = select i1 %365, float 0.000000e+00, float %367, !dbg !43 + %372 = select i1 %365, float 0.000000e+00, float %368, !dbg !43 + %373 = select i1 %365, float 0.000000e+00, float %369, !dbg !43 + %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %374, 0, !dbg !44 + %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %376 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i = select i1 %.not.i, float %376, float %375, !dbg !44 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %377, 0, !dbg !44 + %378 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !44 + %379 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !44 + %.03.i = select i1 %.not1.i, float %379, float %378, !dbg !44 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %381 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %382 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %383, 0, !dbg !44 + %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %384, 0, !dbg !44 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i2 = icmp eq i32 %385, 0, !dbg !44 + %386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %387 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i3 = select i1 %.not.i2, float %387, float %386, !dbg !44 + %388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i4 = icmp eq i32 %388, 0, !dbg !44 + %389 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i3) #6, !dbg !44 + %390 = tail call float @llvm.nvvm.saturate.f(float %.02.i3) #6, !dbg !44 + %.03.i5 = select i1 %.not1.i4, float %390, float %389, !dbg !44 + %391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %392 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %393 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i8 = icmp eq i32 %394, 0, !dbg !44 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i10 = icmp eq i32 %395, 0, !dbg !44 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i12 = icmp eq i32 %396, 0, !dbg !44 + %397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %398 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i13 = select i1 %.not.i12, float %398, float %397, !dbg !44 + %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i14 = icmp eq i32 %399, 0, !dbg !44 + %400 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i13) #6, !dbg !44 + %401 = tail call float @llvm.nvvm.saturate.f(float %.02.i13) #6, !dbg !44 + %.03.i15 = select i1 %.not1.i14, float %401, float %400, !dbg !44 + %402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %403 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %404 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i18 = icmp eq i32 %405, 0, !dbg !44 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i20 = icmp eq i32 %406, 0, !dbg !44 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i22 = icmp eq i32 %407, 0, !dbg !44 + %408 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %409 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i23 = select i1 %.not.i22, float %409, float %408, !dbg !44 + %410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i24 = icmp eq i32 %410, 0, !dbg !44 + %411 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i23) #6, !dbg !44 + %412 = tail call float @llvm.nvvm.saturate.f(float %.02.i23) #6, !dbg !44 + %.03.i25 = select i1 %.not1.i24, float %412, float %411, !dbg !44 + %413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %414 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %415 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i28 = icmp eq i32 %416, 0, !dbg !44 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i30 = icmp eq i32 %417, 0, !dbg !44 + %418 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !44 + %419 = insertelement <2 x i32> %418, i32 %391, i64 1, !dbg !44 + %420 = icmp eq <2 x i32> %419, zeroinitializer, !dbg !44 + %421 = insertelement <2 x float> poison, float %382, i64 0, !dbg !44 + %422 = insertelement <2 x float> %421, float %393, i64 1, !dbg !44 + %423 = insertelement <2 x float> poison, float %381, i64 0, !dbg !44 + %424 = insertelement <2 x float> %423, float %392, i64 1, !dbg !44 + %425 = select <2 x i1> %420, <2 x float> %422, <2 x float> %424, !dbg !44 + %426 = extractelement <2 x float> %425, i64 0, !dbg !44 + %427 = fadd float %426, 0xC168000FE0000000, !dbg !44 + %428 = fneg float %427, !dbg !44 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3FF7154760000000, float %428) #6, !dbg !44 + %.0.i = select i1 %.not3.i, float %430, float %429, !dbg !44 + %431 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %432 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %.01.i = select i1 %.not4.i, float %432, float %431, !dbg !44 + %433 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !44 + %434 = extractelement <2 x float> %425, i64 1, !dbg !44 + %435 = fadd float %434, 0xC168000FE0000000, !dbg !44 + %436 = fneg float %435, !dbg !44 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3FF7154760000000, float %436) #6, !dbg !44 + %.0.i9 = select i1 %.not3.i8, float %438, float %437, !dbg !44 + %439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %440 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !44 + %.01.i11 = select i1 %.not4.i10, float %440, float %439, !dbg !44 + %441 = bitcast <2 x float> %425 to <2 x i32>, !dbg !44 + %442 = shl <2 x i32> %441, splat (i32 23), !dbg !44 + %443 = bitcast <2 x i32> %442 to <2 x float>, !dbg !44 + %444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i11) #6, !dbg !44 + %445 = insertelement <2 x float> poison, float %433, i64 0, !dbg !44 + %446 = insertelement <2 x float> %445, float %444, i64 1, !dbg !44 + %447 = fmul <2 x float> %446, %443, !dbg !44 + %448 = fmul <2 x float> %270, %447, !dbg !45 + %449 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !44 + %450 = insertelement <2 x i32> %449, i32 %413, i64 1, !dbg !44 + %451 = icmp eq <2 x i32> %450, zeroinitializer, !dbg !44 + %452 = insertelement <2 x float> poison, float %404, i64 0, !dbg !44 + %453 = insertelement <2 x float> %452, float %415, i64 1, !dbg !44 + %454 = insertelement <2 x float> poison, float %403, i64 0, !dbg !44 + %455 = insertelement <2 x float> %454, float %414, i64 1, !dbg !44 + %456 = select <2 x i1> %451, <2 x float> %453, <2 x float> %455, !dbg !44 + %457 = extractelement <2 x float> %456, i64 0, !dbg !44 + %458 = fadd float %457, 0xC168000FE0000000, !dbg !44 + %459 = fneg float %458, !dbg !44 + %460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %461 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %459) #6, !dbg !44 + %.0.i19 = select i1 %.not3.i18, float %461, float %460, !dbg !44 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !44 + %.01.i21 = select i1 %.not4.i20, float %463, float %462, !dbg !44 + %464 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i21) #6, !dbg !44 + %465 = extractelement <2 x float> %456, i64 1, !dbg !44 + %466 = fadd float %465, 0xC168000FE0000000, !dbg !44 + %467 = fneg float %466, !dbg !44 + %468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %469 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %467) #6, !dbg !44 + %.0.i29 = select i1 %.not3.i28, float %469, float %468, !dbg !44 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !44 + %.01.i31 = select i1 %.not4.i30, float %471, float %470, !dbg !44 + %472 = bitcast <2 x float> %456 to <2 x i32>, !dbg !44 + %473 = shl <2 x i32> %472, splat (i32 23), !dbg !44 + %474 = bitcast <2 x i32> %473 to <2 x float>, !dbg !44 + %475 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i31) #6, !dbg !44 + %476 = insertelement <2 x float> poison, float %464, i64 0, !dbg !44 + %477 = insertelement <2 x float> %476, float %475, i64 1, !dbg !44 + %478 = fmul <2 x float> %477, %474, !dbg !44 + %479 = fmul <2 x float> %271, %478, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %shift = shufflevector <2 x float> %448, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop191 = fadd <2 x float> %448, %shift, !dbg !49 + %foldExtExtBinop193 = fadd <2 x float> %foldExtExtBinop191, %479, !dbg !49 + %shift195 = shufflevector <2 x float> %479, <2 x float> poison, <2 x i32> , !dbg !49 + %foldExtExtBinop196 = fadd <2 x float> %foldExtExtBinop193, %shift195, !dbg !49 + %480 = extractelement <2 x float> %foldExtExtBinop196, i64 0, !dbg !49 + %481 = bitcast float %480 to i32, !dbg !46 + %482 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %481, i32 16, i32 31), !dbg !46 + %483 = bitcast i32 %482 to float, !dbg !46 + %484 = fadd float %480, %483, !dbg !49 + %485 = bitcast float %484 to i32, !dbg !46 + %486 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %485, i32 8, i32 31), !dbg !46 + %487 = bitcast i32 %486 to float, !dbg !46 + %488 = fadd float %484, %487, !dbg !49 + %489 = bitcast float %488 to i32, !dbg !46 + %490 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 4, i32 31), !dbg !46 + %491 = bitcast i32 %490 to float, !dbg !46 + %492 = fadd float %488, %491, !dbg !49 + %493 = bitcast float %492 to i32, !dbg !46 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 2, i32 31), !dbg !46 + %495 = bitcast i32 %494 to float, !dbg !46 + %496 = fadd float %492, %495, !dbg !49 + %497 = bitcast float %496 to i32, !dbg !46 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 1, i32 31), !dbg !46 + %499 = bitcast i32 %498 to float, !dbg !46 + %500 = fadd float %496, %499, !dbg !49 + %501 = bitcast float %500 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %328, <1 x i32> %501, i1 %327) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %502 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %332, i1 %331) #6, !dbg !46 + %503 = bitcast i32 %502 to float, !dbg !46 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %502, i32 8, i32 31), !dbg !46 + %505 = bitcast i32 %504 to float, !dbg !46 + %506 = fadd float %503, %505, !dbg !49 + %507 = bitcast float %506 to i32, !dbg !46 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 4, i32 31), !dbg !46 + %509 = bitcast i32 %508 to float, !dbg !46 + %510 = fadd float %506, %509, !dbg !49 + %511 = bitcast float %510 to i32, !dbg !46 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %511, i32 2, i32 31), !dbg !46 + %513 = bitcast i32 %512 to float, !dbg !46 + %514 = fadd float %510, %513, !dbg !49 + %515 = bitcast float %514 to i32, !dbg !46 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !46 + %517 = bitcast i32 %516 to float, !dbg !46 + %518 = fadd float %514, %517, !dbg !49 + %519 = bitcast float %518 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %332, <1 x i32> %519, i1 %361) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %520 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !46 + br label %521, !dbg !50 + +521: ; preds = %273, %521 + %indvars.iv160 = phi i64 [ 0, %273 ], [ %indvars.iv.next161, %521 ] + %522 = or disjoint i64 %indvars.iv160, %12, !dbg !51 + %523 = icmp samesign ult i64 %522, 32000, !dbg !52 + %524 = trunc nuw nsw i64 %522 to i32, !dbg !53 + %525 = add i32 %11, %524, !dbg !53 + %526 = sext i32 %525 to i64, !dbg !54 + %527 = getelementptr bfloat, ptr addrspace(1) %0, i64 %526, !dbg !54 + %528 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %529 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %527, i64 %528, i1 %523) #6, !dbg !55 + %530 = extractvalue { i32, i32 } %529, 0, !dbg !55 + %531 = bitcast i32 %530 to <2 x bfloat>, !dbg !55 + %532 = extractvalue { i32, i32 } %529, 1, !dbg !55 + %533 = bitcast i32 %532 to <2 x bfloat>, !dbg !55 + %534 = extractelement <2 x bfloat> %531, i64 0, !dbg !55 + %535 = extractelement <2 x bfloat> %531, i64 1, !dbg !55 + %536 = extractelement <2 x bfloat> %533, i64 0, !dbg !55 + %537 = extractelement <2 x bfloat> %533, i64 1, !dbg !55 + %538 = fpext bfloat %534 to float, !dbg !56 + %539 = fpext bfloat %535 to float, !dbg !56 + %540 = fpext bfloat %536 to float, !dbg !56 + %541 = fpext bfloat %537 to float, !dbg !56 + %542 = fsub float %538, %364, !dbg !57 + %543 = fsub float %539, %364, !dbg !57 + %544 = fsub float %540, %364, !dbg !57 + %545 = fsub float %541, %364, !dbg !57 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i32 = icmp eq i32 %546, 0, !dbg !58 + %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %548 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i33 = select i1 %.not.i32, float %548, float %547, !dbg !58 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i34 = icmp eq i32 %549, 0, !dbg !58 + %550 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i33) #6, !dbg !58 + %551 = tail call float @llvm.nvvm.saturate.f(float %.02.i33) #6, !dbg !58 + %.03.i35 = select i1 %.not1.i34, float %551, float %550, !dbg !58 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i36 = icmp eq i32 %552, 0, !dbg !58 + %553 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %554 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i37 = select i1 %.not2.i36, float %554, float %553, !dbg !58 + %555 = fadd float %.04.i37, 0xC168000FE0000000, !dbg !58 + %556 = fneg float %555, !dbg !58 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i38 = icmp eq i32 %557, 0, !dbg !58 + %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %559 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3FF7154760000000, float %556) #6, !dbg !58 + %.0.i39 = select i1 %.not3.i38, float %559, float %558, !dbg !58 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i40 = icmp eq i32 %560, 0, !dbg !58 + %561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %562 = tail call float @llvm.nvvm.fma.rn.f(float %542, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !58 + %.01.i41 = select i1 %.not4.i40, float %562, float %561, !dbg !58 + %563 = bitcast float %.04.i37 to i32, !dbg !58 + %564 = shl i32 %563, 23, !dbg !58 + %565 = bitcast i32 %564 to float, !dbg !58 + %566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i41) #6, !dbg !58 + %567 = fmul float %566, %565, !dbg !58 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i42 = icmp eq i32 %568, 0, !dbg !58 + %569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %570 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i43 = select i1 %.not.i42, float %570, float %569, !dbg !58 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i44 = icmp eq i32 %571, 0, !dbg !58 + %572 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i43) #6, !dbg !58 + %573 = tail call float @llvm.nvvm.saturate.f(float %.02.i43) #6, !dbg !58 + %.03.i45 = select i1 %.not1.i44, float %573, float %572, !dbg !58 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i46 = icmp eq i32 %574, 0, !dbg !58 + %575 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %576 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i47 = select i1 %.not2.i46, float %576, float %575, !dbg !58 + %577 = fadd float %.04.i47, 0xC168000FE0000000, !dbg !58 + %578 = fneg float %577, !dbg !58 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i48 = icmp eq i32 %579, 0, !dbg !58 + %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %581 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3FF7154760000000, float %578) #6, !dbg !58 + %.0.i49 = select i1 %.not3.i48, float %581, float %580, !dbg !58 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i50 = icmp eq i32 %582, 0, !dbg !58 + %583 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %584 = tail call float @llvm.nvvm.fma.rn.f(float %543, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !58 + %.01.i51 = select i1 %.not4.i50, float %584, float %583, !dbg !58 + %585 = bitcast float %.04.i47 to i32, !dbg !58 + %586 = shl i32 %585, 23, !dbg !58 + %587 = bitcast i32 %586 to float, !dbg !58 + %588 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i51) #6, !dbg !58 + %589 = fmul float %588, %587, !dbg !58 + %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i52 = icmp eq i32 %590, 0, !dbg !58 + %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %592 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i53 = select i1 %.not.i52, float %592, float %591, !dbg !58 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i54 = icmp eq i32 %593, 0, !dbg !58 + %594 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i53) #6, !dbg !58 + %595 = tail call float @llvm.nvvm.saturate.f(float %.02.i53) #6, !dbg !58 + %.03.i55 = select i1 %.not1.i54, float %595, float %594, !dbg !58 + %596 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i56 = icmp eq i32 %596, 0, !dbg !58 + %597 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %598 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i57 = select i1 %.not2.i56, float %598, float %597, !dbg !58 + %599 = fadd float %.04.i57, 0xC168000FE0000000, !dbg !58 + %600 = fneg float %599, !dbg !58 + %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i58 = icmp eq i32 %601, 0, !dbg !58 + %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %603 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3FF7154760000000, float %600) #6, !dbg !58 + %.0.i59 = select i1 %.not3.i58, float %603, float %602, !dbg !58 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i60 = icmp eq i32 %604, 0, !dbg !58 + %605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %606 = tail call float @llvm.nvvm.fma.rn.f(float %544, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !58 + %.01.i61 = select i1 %.not4.i60, float %606, float %605, !dbg !58 + %607 = bitcast float %.04.i57 to i32, !dbg !58 + %608 = shl i32 %607, 23, !dbg !58 + %609 = bitcast i32 %608 to float, !dbg !58 + %610 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i61) #6, !dbg !58 + %611 = fmul float %610, %609, !dbg !58 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i62 = icmp eq i32 %612, 0, !dbg !58 + %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %614 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i63 = select i1 %.not.i62, float %614, float %613, !dbg !58 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i64 = icmp eq i32 %615, 0, !dbg !58 + %616 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i63) #6, !dbg !58 + %617 = tail call float @llvm.nvvm.saturate.f(float %.02.i63) #6, !dbg !58 + %.03.i65 = select i1 %.not1.i64, float %617, float %616, !dbg !58 + %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i66 = icmp eq i32 %618, 0, !dbg !58 + %619 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %620 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i67 = select i1 %.not2.i66, float %620, float %619, !dbg !58 + %621 = fadd float %.04.i67, 0xC168000FE0000000, !dbg !58 + %622 = fneg float %621, !dbg !58 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i68 = icmp eq i32 %623, 0, !dbg !58 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3FF7154760000000, float %622) #6, !dbg !58 + %.0.i69 = select i1 %.not3.i68, float %625, float %624, !dbg !58 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i70 = icmp eq i32 %626, 0, !dbg !58 + %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %628 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !58 + %.01.i71 = select i1 %.not4.i70, float %628, float %627, !dbg !58 + %629 = bitcast float %.04.i67 to i32, !dbg !58 + %630 = shl i32 %629, 23, !dbg !58 + %631 = bitcast i32 %630 to float, !dbg !58 + %632 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i71) #6, !dbg !58 + %633 = fmul float %632, %631, !dbg !58 + %634 = tail call float @llvm.nvvm.div.full(float %567, float %520), !dbg !59 + %635 = tail call float @llvm.nvvm.div.full(float %589, float %520), !dbg !59 + %636 = tail call float @llvm.nvvm.div.full(float %611, float %520), !dbg !59 + %637 = tail call float @llvm.nvvm.div.full(float %633, float %520), !dbg !59 + %638 = getelementptr float, ptr addrspace(1) %1, i64 %526, !dbg !60 + %639 = bitcast float %634 to i32, !dbg !61 + %640 = bitcast float %635 to i32, !dbg !61 + %641 = bitcast float %636 to i32, !dbg !61 + %642 = bitcast float %637 to i32, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %639, i32 %640, i32 %641, i32 %642, ptr addrspace(1) %638, i1 %523) #6, !dbg !61 + %indvars.iv.next161 = add nuw nsw i64 %indvars.iv160, 2048, !dbg !50 + %643 = icmp samesign ult i64 %indvars.iv160, 29952, !dbg !50 + br i1 %643, label %521, label %644, !dbg !50 + +644: ; preds = %521 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 37, column: 47, scope: !5) +!11 = !DILocation(line: 31, column: 40, scope: !5) +!12 = !DILocation(line: 32, column: 31, scope: !5) +!13 = !DILocation(line: 33, column: 29, scope: !5) +!14 = !DILocation(line: 37, column: 41, scope: !5) +!15 = !DILocation(line: 37, column: 34, scope: !5) +!16 = !DILocation(line: 37, column: 52, scope: !5) +!17 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !5, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 42, column: 40, scope: !5) +!21 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 37, column: 105, scope: !5) +!23 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !20) +!26 = !DILocation(line: 196, column: 19, scope: !18, inlinedAt: !20) +!27 = !DILocation(line: 196, column: 53, scope: !18, inlinedAt: !20) +!28 = !DILocation(line: 196, column: 39, scope: !18, inlinedAt: !20) +!29 = !DILocation(line: 199, column: 53, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 199, column: 39, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 205, column: 24, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 205, column: 36, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 45, column: 54, scope: !5) +!34 = !DILocation(line: 46, column: 54, scope: !5) +!35 = !DILocation(line: 110, column: 15, scope: !18, inlinedAt: !36) +!36 = !DILocation(line: 49, column: 33, scope: !5) +!37 = !DILocation(line: 112, column: 21, scope: !18, inlinedAt: !36) +!38 = !DILocation(line: 112, column: 16, scope: !18, inlinedAt: !36) +!39 = !DILocation(line: 113, column: 29, scope: !18, inlinedAt: !36) +!40 = !DILocation(line: 123, column: 29, scope: !18, inlinedAt: !36) +!41 = !DILocation(line: 180, column: 40, scope: !18, inlinedAt: !36) +!42 = !DILocation(line: 180, column: 68, scope: !18, inlinedAt: !36) +!43 = !DILocation(line: 180, column: 58, scope: !18, inlinedAt: !36) +!44 = !DILocation(line: 173, column: 29, scope: !18, inlinedAt: !36) +!45 = !DILocation(line: 181, column: 31, scope: !18, inlinedAt: !36) +!46 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !36) +!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !36) +!50 = !DILocation(line: 52, column: 40, scope: !5) +!51 = !DILocation(line: 53, column: 31, scope: !5) +!52 = !DILocation(line: 54, column: 29, scope: !5) +!53 = !DILocation(line: 58, column: 41, scope: !5) +!54 = !DILocation(line: 58, column: 34, scope: !5) +!55 = !DILocation(line: 58, column: 52, scope: !5) +!56 = !DILocation(line: 58, column: 106, scope: !5) +!57 = !DILocation(line: 60, column: 22, scope: !5) +!58 = !DILocation(line: 61, column: 29, scope: !5) +!59 = !DILocation(line: 62, column: 23, scope: !5) +!60 = !DILocation(line: 63, column: 29, scope: !5) +!61 = !DILocation(line: 63, column: 53, scope: !5) +!62 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c51acf5f589c96a1add3fdb23e2284736ca534e4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,921 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<49>; + .reg .b16 %rs<9>; + .reg .b32 %r<354>; + .reg .b64 %rd<68>; + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 + +// %bb.0: + ld.param.b64 %rd16, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd15, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 23 28 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + mov.u32 %r1, %tid.x; + shl.b32 %r5, %r1, 2; + and.b32 %r6, %r5, 2044; + .loc 1 31 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:31:40 + cvt.u64.u32 %rd1, %r6; + mad.lo.s32 %r7, %r4, 32000, %r6; + cvt.u64.u32 %rd2, %r7; + mov.b32 %r8, 0fFF800000; + mov.b64 %rd64, {%r8, %r8}; + mov.b32 %r9, 0f00000000; + mov.b64 %rd63, {%r9, %r9}; + mov.b64 %rd62, 0; + mov.b64 %rd65, %rd63; + mov.b64 %rd66, %rd64; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 33 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:33:29 + add.s64 %rd23, %rd1, %rd62; + setp.lt.u64 %p1, %rd23, 32000; + .loc 1 37 34 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:34 + add.s64 %rd24, %rd2, %rd62; + cvt.u32.u64 %r14, %rd24; + mad.wide.s32 %rd21, %r14, 2, %rd15; + .loc 1 37 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:52 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r12, 0; + // begin inline asm + mov.u32 %r10, %r12; + mov.u32 %r11, %r12; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r10, %r11 }, [ %rd21 + 0 ], %rd20; + // end inline asm +$L__tmp1: + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r15, %r16}, %rd66; + setp.nan.f32 %p2, %r15, %r15; + setp.nan.f32 %p3, %r16, %r16; + mov.b64 {%r17, %r18}, %rd64; + setp.nan.f32 %p4, %r17, %r17; + setp.nan.f32 %p5, %r18, %r18; +$L__tmp2: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs1, %rs2}, %r10; + cvt.f32.bf16 %r19, %rs2; + cvt.f32.bf16 %r20, %rs1; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p6, %r15, %r20; + setp.gt.f32 %p7, %r16, %r19; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r21, %r16, %r19, %p7; + selp.f32 %r22, %r16, %r21, %p3; + selp.f32 %r23, %r15, %r20, %p6; + selp.f32 %r24, %r15, %r23, %p2; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p8, %r24, 0fFF800000; + setp.eq.f32 %p9, %r22, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r25, %r16, %r22; + sub.f32 %r26, %r15, %r24; + mov.b32 %r27, 0f3F000000; + mov.b32 %r28, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r29, %r26, %r28, %r27; + cvt.ftz.sat.f32.f32 %r30, %r29; + mov.b32 %r31, 0f4B400001; + mov.b32 %r32, 0f437C0000; + fma.rm.ftz.f32 %r33, %r30, %r32, %r31; + fma.rn.ftz.f32 %r34, %r25, %r28, %r27; + cvt.ftz.sat.f32.f32 %r35, %r34; + fma.rm.ftz.f32 %r36, %r35, %r32, %r31; + mov.b64 %rd25, {%r33, %r36}; + cvt.u32.u64 %r37, %rd25; + add.f32 %r38, %r33, 0fCB40007F; + neg.f32 %r39, %r38; + mov.b32 %r40, 0f3FB8AA3B; + fma.rn.ftz.f32 %r41, %r26, %r40, %r39; + mov.b32 %r42, 0f32A57060; + fma.rn.ftz.f32 %r43, %r26, %r42, %r41; + ex2.approx.ftz.f32 %r44, %r43; + add.f32 %r45, %r36, 0fCB40007F; + neg.f32 %r46, %r45; + fma.rn.ftz.f32 %r47, %r25, %r40, %r46; + fma.rn.ftz.f32 %r48, %r25, %r42, %r47; + shl.b64 %rd26, %rd25, 23; + and.b64 %rd27, %rd26, -36028797018963968; + shl.b32 %r49, %r37, 23; + cvt.u64.u32 %rd28, %r49; + or.b64 %rd29, %rd28, %rd27; + ex2.approx.ftz.f32 %r50, %r48; + mov.b64 {%r51, %r52}, %rd29; + mul.f32 %r53, %r44, %r51; + mul.f32 %r54, %r50, %r52; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r55, 0f3F800000, %r54, %p9; + selp.f32 %r56, 0f3F800000, %r53, %p8; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r57, %r19, %r22; + sub.f32 %r58, %r20, %r24; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r59, %r58, %r28, %r27; + cvt.ftz.sat.f32.f32 %r60, %r59; + fma.rm.ftz.f32 %r61, %r60, %r32, %r31; + fma.rn.ftz.f32 %r62, %r57, %r28, %r27; + cvt.ftz.sat.f32.f32 %r63, %r62; + fma.rm.ftz.f32 %r64, %r63, %r32, %r31; + mov.b64 %rd30, {%r61, %r64}; + cvt.u32.u64 %r65, %rd30; + add.f32 %r66, %r61, 0fCB40007F; + neg.f32 %r67, %r66; + fma.rn.ftz.f32 %r68, %r58, %r40, %r67; + fma.rn.ftz.f32 %r69, %r58, %r42, %r68; + ex2.approx.ftz.f32 %r70, %r69; + add.f32 %r71, %r64, 0fCB40007F; + neg.f32 %r72, %r71; + fma.rn.ftz.f32 %r73, %r57, %r40, %r72; + fma.rn.ftz.f32 %r74, %r57, %r42, %r73; + shl.b64 %rd31, %rd30, 23; + and.b64 %rd32, %rd31, -36028797018963968; + shl.b32 %r75, %r65, 23; + cvt.u64.u32 %rd33, %r75; + or.b64 %rd34, %rd33, %rd32; + ex2.approx.ftz.f32 %r76, %r74; + mov.b64 {%r77, %r78}, %rd34; + mul.f32 %r79, %r70, %r77; + mul.f32 %r80, %r76, %r78; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r81, 0f3F800000, %r80, %p9; + selp.f32 %r82, 0f3F800000, %r79, %p8; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r83, %r84}, %rd65; + fma.rn.f32 %r85, %r83, %r56, %r82; + fma.rn.f32 %r86, %r84, %r55, %r81; +$L__tmp4: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs3, %rs4}, %r11; + cvt.f32.bf16 %r87, %rs4; + cvt.f32.bf16 %r88, %rs3; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p10, %r17, %r88; + setp.gt.f32 %p11, %r18, %r87; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r89, %r18, %r87, %p11; + selp.f32 %r90, %r18, %r89, %p5; + selp.f32 %r91, %r17, %r88, %p10; + selp.f32 %r92, %r17, %r91, %p4; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p12, %r92, 0fFF800000; + setp.eq.f32 %p13, %r90, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r93, %r18, %r90; + sub.f32 %r94, %r17, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r95, %r94, %r28, %r27; + cvt.ftz.sat.f32.f32 %r96, %r95; + fma.rm.ftz.f32 %r97, %r96, %r32, %r31; + fma.rn.ftz.f32 %r98, %r93, %r28, %r27; + cvt.ftz.sat.f32.f32 %r99, %r98; + fma.rm.ftz.f32 %r100, %r99, %r32, %r31; + mov.b64 %rd35, {%r97, %r100}; + cvt.u32.u64 %r101, %rd35; + add.f32 %r102, %r97, 0fCB40007F; + neg.f32 %r103, %r102; + fma.rn.ftz.f32 %r104, %r94, %r40, %r103; + fma.rn.ftz.f32 %r105, %r94, %r42, %r104; + ex2.approx.ftz.f32 %r106, %r105; + add.f32 %r107, %r100, 0fCB40007F; + neg.f32 %r108, %r107; + fma.rn.ftz.f32 %r109, %r93, %r40, %r108; + fma.rn.ftz.f32 %r110, %r93, %r42, %r109; + shl.b64 %rd36, %rd35, 23; + and.b64 %rd37, %rd36, -36028797018963968; + shl.b32 %r111, %r101, 23; + cvt.u64.u32 %rd38, %r111; + or.b64 %rd39, %rd38, %rd37; + ex2.approx.ftz.f32 %r112, %r110; + mov.b64 {%r113, %r114}, %rd39; + mul.f32 %r115, %r106, %r113; + mul.f32 %r116, %r112, %r114; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r117, 0f3F800000, %r116, %p13; + selp.f32 %r118, 0f3F800000, %r115, %p12; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r119, %r87, %r90; + sub.f32 %r120, %r88, %r92; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r121, %r120, %r28, %r27; + cvt.ftz.sat.f32.f32 %r122, %r121; + fma.rm.ftz.f32 %r123, %r122, %r32, %r31; + fma.rn.ftz.f32 %r124, %r119, %r28, %r27; + cvt.ftz.sat.f32.f32 %r125, %r124; + fma.rm.ftz.f32 %r126, %r125, %r32, %r31; + mov.b64 %rd40, {%r123, %r126}; + cvt.u32.u64 %r127, %rd40; + add.f32 %r128, %r123, 0fCB40007F; + neg.f32 %r129, %r128; + fma.rn.ftz.f32 %r130, %r120, %r40, %r129; + fma.rn.ftz.f32 %r131, %r120, %r42, %r130; + ex2.approx.ftz.f32 %r132, %r131; + add.f32 %r133, %r126, 0fCB40007F; + neg.f32 %r134, %r133; + fma.rn.ftz.f32 %r135, %r119, %r40, %r134; + fma.rn.ftz.f32 %r136, %r119, %r42, %r135; + shl.b64 %rd41, %rd40, 23; + and.b64 %rd42, %rd41, -36028797018963968; + shl.b32 %r137, %r127, 23; + cvt.u64.u32 %rd43, %r137; + or.b64 %rd44, %rd43, %rd42; + ex2.approx.ftz.f32 %r138, %r136; + mov.b64 {%r139, %r140}, %rd44; + mul.f32 %r141, %r132, %r139; + mul.f32 %r142, %r138, %r140; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r143, 0f3F800000, %r142, %p13; + selp.f32 %r144, 0f3F800000, %r141, %p12; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r145, %r146}, %rd63; + fma.rn.f32 %r147, %r145, %r118, %r144; + fma.rn.f32 %r148, %r146, %r117, %r143; +$L__tmp6: + .loc 1 45 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:45:54 + selp.f32 %r149, %r22, %r16, %p1; + selp.f32 %r150, %r24, %r15, %p1; + mov.b64 %rd66, {%r150, %r149}; + selp.f32 %r151, %r90, %r18, %p1; + selp.f32 %r152, %r92, %r17, %p1; + mov.b64 %rd64, {%r152, %r151}; + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r153, %r86, %r84, %p1; + selp.f32 %r154, %r85, %r83, %p1; + mov.b64 %rd65, {%r154, %r153}; + selp.f32 %r155, %r148, %r146, %p1; + selp.f32 %r156, %r147, %r145, %p1; + mov.b64 %rd63, {%r156, %r155}; + .loc 1 31 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:31:40 + add.s64 %rd12, %rd62, 2048; + setp.lt.u64 %p14, %rd62, 29952; + mov.b64 %rd62, %rd12; + @%p14 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + and.b32 %r169, %r1, 31; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r170, %r171}, %rd66; + setp.gt.f32 %p21, %r170, %r171; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p22, %r170, %r170; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r172, %r170, %r171, %p22; + selp.f32 %r173, %r170, %r172, %p21; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r174, %r175}, %rd64; + setp.gt.f32 %p23, %r173, %r174; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p24, %r173, %r173; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r176, %r173, %r174, %p24; + selp.f32 %r177, %r173, %r176, %p23; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p25, %r177, %r175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p26, %r177, %r177; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r178, %r177, %r175, %p26; + selp.f32 %r179, %r177, %r178, %p25; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r180, %r179, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p27, %r179, %r180; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p28, %r179, %r179; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r181, %r179, %r180, %p27; + selp.f32 %r182, %r179, %r181, %p28; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r183, %r182, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p29, %r182, %r183; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p30, %r182, %r182; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r184, %r182, %r183, %p30; + selp.f32 %r185, %r182, %r184, %p29; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r186, %r185, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p31, %r185, %r186; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p32, %r185, %r185; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r187, %r185, %r186, %p32; + selp.f32 %r188, %r185, %r187, %p31; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p33, %r188, %r189; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p34, %r188, %r188; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r190, %r188, %r189, %p34; + selp.f32 %r191, %r188, %r190, %p33; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r192, %r191, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p35, %r191, %r192; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p36, %r191, %r191; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p15, %r169, 0; + shr.u32 %r193, %r1, 3; + and.b32 %r194, %r193, 60; + mov.b32 %r195, global_smem; + add.s32 %r157, %r195, %r194; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r196, %r191, %r192, %p36; + selp.b32 %r158, %r191, %r196, %p35; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r158; + // end inline asm + bar.sync 0; + setp.lt.u32 %p16, %r1, 16; + add.s32 %r160, %r195, %r5; + // begin inline asm + @%p16 ld.shared.b32 %r159, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r198, %r159, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p37, %r159, %r198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p38, %r159, %r159; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r199, %r159, %r198, %p37; + selp.f32 %r200, %r159, %r199, %p38; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r201, %r200, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p39, %r200, %r201; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p40, %r200, %r200; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r202, %r200, %r201, %p40; + selp.f32 %r203, %r200, %r202, %p39; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r204, %r203, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p41, %r203, %r204; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p42, %r203, %r203; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r205, %r203, %r204, %p42; + selp.f32 %r206, %r203, %r205, %p41; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r207, %r206, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p43, %r206, %r207; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p44, %r206, %r206; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p17, %r1, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r208, %r206, %r207, %p44; + selp.b32 %r162, %r206, %r208, %p43; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r162; + // end inline asm + bar.sync 0; + ld.shared.b32 %r2, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.f32 %p45, %r2, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + sub.f32 %r209, %r170, %r2; + sub.f32 %r210, %r171, %r2; + sub.f32 %r211, %r174, %r2; + sub.f32 %r212, %r175, %r2; + .loc 2 180 58 // triton_helpers.py:180:58 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r213, 0f00000000, %r209, %p45; + selp.f32 %r214, 0f00000000, %r210, %p45; + selp.f32 %r215, 0f00000000, %r211, %p45; + selp.f32 %r216, 0f00000000, %r212, %p45; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.ftz.f32 %r219, %r213, %r28, %r27; + cvt.ftz.sat.f32.f32 %r220, %r219; + fma.rm.ftz.f32 %r223, %r220, %r32, %r31; + fma.rn.ftz.f32 %r224, %r214, %r28, %r27; + cvt.ftz.sat.f32.f32 %r225, %r224; + fma.rm.ftz.f32 %r226, %r225, %r32, %r31; + fma.rn.ftz.f32 %r227, %r215, %r28, %r27; + cvt.ftz.sat.f32.f32 %r228, %r227; + fma.rm.ftz.f32 %r229, %r228, %r32, %r31; + fma.rn.ftz.f32 %r230, %r216, %r28, %r27; + cvt.ftz.sat.f32.f32 %r231, %r230; + fma.rm.ftz.f32 %r232, %r231, %r32, %r31; + mov.b64 %rd46, {%r223, %r226}; + cvt.u32.u64 %r233, %rd46; + add.f32 %r234, %r223, 0fCB40007F; + neg.f32 %r235, %r234; + fma.rn.ftz.f32 %r237, %r213, %r40, %r235; + fma.rn.ftz.f32 %r239, %r213, %r42, %r237; + ex2.approx.ftz.f32 %r240, %r239; + add.f32 %r241, %r226, 0fCB40007F; + neg.f32 %r242, %r241; + fma.rn.ftz.f32 %r243, %r214, %r40, %r242; + fma.rn.ftz.f32 %r244, %r214, %r42, %r243; + shl.b64 %rd47, %rd46, 23; + and.b64 %rd48, %rd47, -36028797018963968; + shl.b32 %r245, %r233, 23; + cvt.u64.u32 %rd49, %r245; + or.b64 %rd50, %rd49, %rd48; + ex2.approx.ftz.f32 %r246, %r244; + mov.b64 {%r247, %r248}, %rd50; + mul.f32 %r249, %r240, %r247; + mul.f32 %r250, %r246, %r248; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r251, %r252}, %rd65; + mul.f32 %r253, %r252, %r250; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd51, {%r229, %r232}; + cvt.u32.u64 %r254, %rd51; + add.f32 %r255, %r229, 0fCB40007F; + neg.f32 %r256, %r255; + fma.rn.ftz.f32 %r257, %r215, %r40, %r256; + fma.rn.ftz.f32 %r258, %r215, %r42, %r257; + ex2.approx.ftz.f32 %r259, %r258; + add.f32 %r260, %r232, 0fCB40007F; + neg.f32 %r261, %r260; + fma.rn.ftz.f32 %r262, %r216, %r40, %r261; + fma.rn.ftz.f32 %r263, %r216, %r42, %r262; + shl.b64 %rd52, %rd51, 23; + and.b64 %rd53, %rd52, -36028797018963968; + shl.b32 %r264, %r254, 23; + cvt.u64.u32 %rd54, %r264; + or.b64 %rd55, %rd54, %rd53; + ex2.approx.ftz.f32 %r265, %r263; + mov.b64 {%r266, %r267}, %rd55; + mul.f32 %r268, %r265, %r267; + mul.f32 %r269, %r259, %r266; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r270, %r271}, %rd63; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.f32 %r272, %r251, %r249, %r253; + fma.rn.f32 %r273, %r270, %r269, %r272; + fma.rn.f32 %r274, %r271, %r268, %r273; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r275, %r274, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r276, %r274, %r275; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r277, %r276, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r278, %r276, %r277; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r279, %r278, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r280, %r278, %r279; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r282, %r280, %r281; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r164, %r282, %r283; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p15 st.shared.b32 [ %r157 + 0 ], %r164; + // end inline asm + bar.sync 0; + // begin inline asm + @%p16 ld.shared.b32 %r165, [ %r160 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r284, %r165, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r285, %r165, %r284; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r286, %r285, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r287, %r285, %r286; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r288, %r287, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r289, %r287, %r288; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r290, %r289, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r168, %r289, %r290; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r160 + 0 ], %r168; + // end inline asm + bar.sync 0; + mov.b64 %rd67, 0; + ld.shared.b32 %r3, [global_smem]; +$L__tmp8: +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 54 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:54:29 + add.s64 %rd60, %rd1, %rd67; + setp.lt.u64 %p46, %rd60, 32000; + .loc 1 58 34 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:34 + add.s64 %rd61, %rd2, %rd67; + cvt.u32.u64 %r299, %rd61; + mad.wide.s32 %rd57, %r299, 2, %rd15; + .loc 1 58 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:52 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd56, 1.0; + // end inline asm + mov.b32 %r293, 0; + // begin inline asm + mov.u32 %r291, %r293; + mov.u32 %r292, %r293; + @%p46 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r291, %r292 }, [ %rd57 + 0 ], %rd56; + // end inline asm + mov.b32 {%rs5, %rs6}, %r291; + mov.b32 {%rs7, %rs8}, %r292; + .loc 1 58 106 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:106 + cvt.f32.bf16 %r300, %rs5; + cvt.f32.bf16 %r301, %rs6; + cvt.f32.bf16 %r302, %rs7; + cvt.f32.bf16 %r303, %rs8; + .loc 1 60 22 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:60:22 + sub.f32 %r304, %r300, %r2; + sub.f32 %r305, %r301, %r2; + sub.f32 %r306, %r302, %r2; + sub.f32 %r307, %r303, %r2; + mov.b32 %r308, 0f3F000000; + mov.b32 %r309, 0f3BBB989D; + .loc 1 61 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:61:29 + fma.rn.ftz.f32 %r310, %r304, %r309, %r308; + cvt.ftz.sat.f32.f32 %r311, %r310; + mov.b32 %r312, 0f4B400001; + mov.b32 %r313, 0f437C0000; + fma.rm.ftz.f32 %r314, %r311, %r313, %r312; + add.f32 %r315, %r314, 0fCB40007F; + neg.f32 %r316, %r315; + mov.b32 %r317, 0f3FB8AA3B; + fma.rn.ftz.f32 %r318, %r304, %r317, %r316; + mov.b32 %r319, 0f32A57060; + fma.rn.ftz.f32 %r320, %r304, %r319, %r318; + shl.b32 %r321, %r314, 23; + ex2.approx.ftz.f32 %r322, %r320; + mul.f32 %r323, %r322, %r321; + fma.rn.ftz.f32 %r324, %r305, %r309, %r308; + cvt.ftz.sat.f32.f32 %r325, %r324; + fma.rm.ftz.f32 %r326, %r325, %r313, %r312; + add.f32 %r327, %r326, 0fCB40007F; + neg.f32 %r328, %r327; + fma.rn.ftz.f32 %r329, %r305, %r317, %r328; + fma.rn.ftz.f32 %r330, %r305, %r319, %r329; + shl.b32 %r331, %r326, 23; + ex2.approx.ftz.f32 %r332, %r330; + mul.f32 %r333, %r332, %r331; + fma.rn.ftz.f32 %r334, %r306, %r309, %r308; + cvt.ftz.sat.f32.f32 %r335, %r334; + fma.rm.ftz.f32 %r336, %r335, %r313, %r312; + add.f32 %r337, %r336, 0fCB40007F; + neg.f32 %r338, %r337; + fma.rn.ftz.f32 %r339, %r306, %r317, %r338; + fma.rn.ftz.f32 %r340, %r306, %r319, %r339; + shl.b32 %r341, %r336, 23; + ex2.approx.ftz.f32 %r342, %r340; + mul.f32 %r343, %r342, %r341; + fma.rn.ftz.f32 %r344, %r307, %r309, %r308; + cvt.ftz.sat.f32.f32 %r345, %r344; + fma.rm.ftz.f32 %r346, %r345, %r313, %r312; + add.f32 %r347, %r346, 0fCB40007F; + neg.f32 %r348, %r347; + fma.rn.ftz.f32 %r349, %r307, %r317, %r348; + fma.rn.ftz.f32 %r350, %r307, %r319, %r349; + shl.b32 %r351, %r346, 23; + ex2.approx.ftz.f32 %r352, %r350; + mul.f32 %r353, %r352, %r351; + .loc 1 62 23 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:62:23 + div.full.f32 %r295, %r323, %r3; + div.full.f32 %r296, %r333, %r3; + div.full.f32 %r297, %r343, %r3; + div.full.f32 %r298, %r353, %r3; + .loc 1 63 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:29 + mad.wide.s32 %rd59, %r299, 4, %rd16; + .loc 1 63 53 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:53 + // begin inline asm + @%p46 st.global.v4.b32 [ %rd59 + 0 ], { %r295, %r296, %r297, %r298 }; + // end inline asm + .loc 1 52 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:40 + add.s64 %rd14, %rd67, 2048; + setp.lt.u64 %p48, %rd67, 29952; + mov.b64 %rd67, %rd14; + @%p48 bra $L__BB0_3; +// %bb.4: + .loc 1 52 4 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 105 +.b8 119 +.b8 110 +.b8 104 +.b8 115 +.b8 102 +.b8 53 +.b8 107 +.b8 102 +.b8 109 +.b8 109 +.b8 55 +.b8 106 +.b8 110 +.b8 122 +.b8 114 +.b8 107 +.b8 121 +.b8 105 +.b8 118 +.b8 52 +.b8 120 +.b8 51 +.b8 121 +.b8 97 +.b8 104 +.b8 106 +.b8 111 +.b8 103 +.b8 54 +.b8 100 +.b8 121 +.b8 104 +.b8 102 +.b8 52 +.b8 112 +.b8 114 +.b8 109 +.b8 50 +.b8 99 +.b8 106 +.b8 100 +.b8 105 +.b8 53 +.b8 120 +.b8 104 +.b8 108 +.b8 108 +.b8 120 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..32c1c33f92dcb0976034546e9949449e4abc81a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc119) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x2048xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c2048_i32_13 = arith.constant 2048 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c2048_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x2048xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x2048xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x2048xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x2048x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc47) + tt.return %0 : tensor<1x2048xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x2048xf32> loc("rhs_max"(#loc48))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%lhs_max, %rhs_max) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x2048xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x2048xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x2048xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x2048xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x2048xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x2048xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc61) + %1 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%a: tensor<1x2048xf32> loc("a"(#loc62)), %b: tensor<1x2048xf32> loc("b"(#loc62))) -> tensor<1x2048xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x2048xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a) : (tensor<1x2048xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x2048xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x2048xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x2048xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x2048xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x2048xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc69) + tt.return %2 : tensor<1x2048xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x2048xf32> loc(#loc71) + tt.return %3 : tensor<1x2048xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc76))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc78) + tt.return %4 : tensor<1x2048xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc80) + tt.return %5 : tensor<1x2048xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%x: tensor<1x2048xf32> loc("x"(#loc81))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc82) + tt.return %0 : tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %1 : tensor<1x2048xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x2048xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x2048xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%a: tensor<1x2048xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1ed2ce668d8ca196ad85d9198bea5b22d3c29751 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,226 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("out_ptr2"(#loc)) +#loc59 = loc("xnumel"(#loc)) +#loc60 = loc("r0_numel"(#loc)) +#loc86 = loc("out_max"(#loc32)) +#loc93 = loc("out_sum"(#loc41)) +#loc118 = loc(callsite(#loc86 at #loc33)) +#loc125 = loc(callsite(#loc93 at #loc33)) +#loc133 = loc(callsite(#loc1 at #loc118)) +#loc136 = loc(callsite(#loc1 at #loc125)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc61) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc62) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc62) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc63) + %tmp0_6 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc104) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc65) + %_tmp3_sum:2 = scf.for %_tmp3_sum_14 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%arg5 = %cst_4, %arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_sum_14 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_index_15 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc67) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp0_16 = arith.addi %r0_index_15, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc64) + %tmp0_17 = tt.addptr %tmp0_7, %tmp0_16 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc65) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc69) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc70) + %mask = arith.cmpf ogt, %arg5, %tmp0_19 : tensor<1x2048xf32, #blocked> loc(#loc126) + %mask_20 = arith.cmpf une, %arg5, %arg5 : tensor<1x2048xf32, #blocked> loc(#loc127) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc128) + %out_max_22 = arith.select %mask_21, %arg5, %tmp0_19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc129) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_4 : tensor<1x2048xf32, #blocked> loc(#loc109) + %lhs_scale_23 = arith.subf %arg5, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc110) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc130) + %lhs_scale_25 = arith.select %lhs_scale, %cst_2, %lhs_scale_24 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc112) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32, #blocked> loc(#loc113) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc131) + %rhs_scale_27 = arith.select %lhs_scale, %cst_2, %rhs_scale_26 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc115) + %out_sum_28 = arith.mulf %arg6, %lhs_scale_25 : tensor<1x2048xf32, #blocked> loc(#loc116) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32, #blocked> loc(#loc117) + %_tmp3_max = arith.select %r0_mask, %out_max_22, %arg5 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc84) + %_tmp3_sum_30 = arith.select %r0_mask, %out_sum_29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc85) + scf.yield %_tmp3_max, %_tmp3_sum_30 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc30) + } loc(#loc105) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_14: f32 loc(callsite(#loc1 at #loc118)), %out_max_15: f32 loc(callsite(#loc1 at #loc118))): + %mask = arith.cmpf ogt, %out_max_14, %out_max_15 : f32 loc(#loc137) + %mask_16 = arith.cmpf une, %out_max_14, %out_max_14 : f32 loc(#loc138) + %mask_17 = arith.ori %mask, %mask_16 : i1 loc(#loc139) + %out_max_18 = arith.select %mask_17, %out_max_14, %out_max_15 : f32 loc(#loc140) + tt.reduce.return %out_max_18 : f32 loc(#loc132) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc132) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc119) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_1 : tensor<1x1xf32, #blocked> loc(#loc120) + %delta_8 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_9 = arith.subf %_tmp3_sum#0, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc121) + %delta_10 = tt.broadcast %delta : tensor<1x1xi1, #blocked> -> tensor<1x2048xi1, #blocked> loc(#loc122) + %delta_11 = arith.select %delta_10, %cst_3, %delta_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc122) + %out_sum = tt.extern_elementwise %delta_11 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc134) + %out_sum_12 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32, #blocked> loc(#loc124) + %out_sum_13 = "tt.reduce"(%out_sum_12) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_14: f32 loc(callsite(#loc1 at #loc125)), %out_sum_15: f32 loc(callsite(#loc1 at #loc125))): + %out_sum_16 = arith.addf %out_sum_14, %out_sum_15 : f32 loc(#loc141) + tt.reduce.return %out_sum_16 : f32 loc(#loc135) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc135) + %tmp4 = tt.expand_dims %out_sum_13 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc94) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc95) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc45) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_index_14 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst : tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp5 = arith.addi %r0_index_14, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc98) + %tmp5_15 = tt.addptr %tmp0_7, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc99) + %tmp5_16 = tt.load %tmp5_15, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc100) + %tmp5_17 = arith.extf %tmp5_16 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc101) + %tmp7 = arith.subf %tmp5_17, %delta_8 : tensor<1x2048xf32, #blocked> loc(#loc102) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc103) + %tmp9_18 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32, #blocked> loc(#loc95) + %1 = tt.addptr %0, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc45) + tt.store %1, %tmp9_18, %r0_mask : tensor<1x2048x!tt.ptr, #blocked> loc(#loc55) + } loc(#loc46) + tt.return loc(#loc56) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc61 = loc("xoffset"(#loc2)) +#loc62 = loc("r0_base"(#loc3)) +#loc63 = loc("tmp0"(#loc4)) +#loc64 = loc("tmp0"(#loc5)) +#loc65 = loc("tmp0"(#loc6)) +#loc66 = loc("_tmp3_max"(#loc7)) +#loc67 = loc("r0_index"(#loc8)) +#loc68 = loc("r0_mask"(#loc9)) +#loc69 = loc("tmp0"(#loc10)) +#loc70 = loc("tmp0"(#loc11)) +#loc71 = loc("mask"(#loc12)) +#loc72 = loc("out_max"(#loc13)) +#loc73 = loc("mask"(#loc15)) +#loc74 = loc("mask"(#loc16)) +#loc75 = loc("lhs_scale"(#loc18)) +#loc76 = loc("lhs_scale"(#loc19)) +#loc77 = loc("lhs_scale"(#loc21)) +#loc78 = loc("lhs_scale"(#loc22)) +#loc79 = loc("rhs_scale"(#loc23)) +#loc80 = loc("rhs_scale"(#loc24)) +#loc81 = loc("rhs_scale"(#loc25)) +#loc82 = loc("out_sum"(#loc26)) +#loc83 = loc("out_sum"(#loc27)) +#loc84 = loc("_tmp3_max"(#loc28)) +#loc85 = loc("_tmp3_sum"(#loc29)) +#loc87 = loc("out_max_keepdim"(#loc34)) +#loc88 = loc("delta"(#loc35)) +#loc89 = loc("delta"(#loc36)) +#loc90 = loc("delta"(#loc37)) +#loc91 = loc("out_sum"(#loc38)) +#loc92 = loc("out_sum"(#loc39)) +#loc94 = loc("tmp4"(#loc43)) +#loc95 = loc("tmp9"(#loc44)) +#loc96 = loc("r0_index"(#loc47)) +#loc97 = loc("r0_mask"(#loc48)) +#loc98 = loc("tmp5"(#loc49)) +#loc99 = loc("tmp5"(#loc50)) +#loc100 = loc("tmp5"(#loc51)) +#loc101 = loc("tmp5"(#loc52)) +#loc102 = loc("tmp7"(#loc53)) +#loc103 = loc("tmp8"(#loc54)) +#loc104 = loc(fused[#loc64, #loc63]) +#loc105 = loc("_tmp3_sum"(#loc66)) +#loc106 = loc("mask"(#loc71)) +#loc107 = loc(callsite(#loc72 at #loc14)) +#loc108 = loc("mask"(#loc74)) +#loc109 = loc(callsite(#loc75 at #loc14)) +#loc110 = loc(callsite(#loc76 at #loc14)) +#loc111 = loc(callsite(#loc77 at #loc14)) +#loc112 = loc(callsite(#loc78 at #loc14)) +#loc113 = loc(callsite(#loc79 at #loc14)) +#loc114 = loc(callsite(#loc80 at #loc14)) +#loc115 = loc(callsite(#loc81 at #loc14)) +#loc116 = loc(callsite(#loc82 at #loc14)) +#loc117 = loc(callsite(#loc83 at #loc14)) +#loc119 = loc(callsite(#loc87 at #loc33)) +#loc120 = loc(callsite(#loc88 at #loc33)) +#loc121 = loc(callsite(#loc89 at #loc33)) +#loc122 = loc(callsite(#loc90 at #loc33)) +#loc123 = loc(callsite(#loc91 at #loc33)) +#loc124 = loc(callsite(#loc92 at #loc33)) +#loc126 = loc(callsite(#loc106 at #loc107)) +#loc127 = loc(callsite(#loc73 at #loc107)) +#loc128 = loc(callsite(#loc108 at #loc107)) +#loc129 = loc(callsite(#loc17 at #loc107)) +#loc130 = loc(callsite(#loc20 at #loc111)) +#loc131 = loc(callsite(#loc20 at #loc114)) +#loc132 = loc(callsite(#loc31 at #loc118)) +#loc134 = loc(callsite(#loc20 at #loc123)) +#loc135 = loc(callsite(#loc40 at #loc125)) +#loc137 = loc(callsite(#loc106 at #loc132)) +#loc138 = loc(callsite(#loc73 at #loc132)) +#loc139 = loc(callsite(#loc108 at #loc132)) +#loc140 = loc(callsite(#loc17 at #loc132)) +#loc141 = loc(callsite(#loc42 at #loc135)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3f8a905451b9fee4b55838929686ac4968ed4b8b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/GEZC7BNCXFQAGCZIOI2BQLAAUGS4IVUJ4QGCDMFUE3MMZMGBMJIQ/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,233 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc3 = loc(unknown) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("out_ptr2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +#loc62 = loc("r0_numel"(#loc)) +#loc90 = loc("out_max"(#loc35)) +#loc96 = loc("out_sum"(#loc42)) +#loc123 = loc(callsite(#loc90 at #loc2)) +#loc129 = loc(callsite(#loc96 at #loc2)) +#loc138 = loc(callsite(#loc3 at #loc123)) +#loc141 = loc(callsite(#loc3 at #loc129)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc108) + %cst = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc3) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_2 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc3) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc65) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc66) + %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68) + %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x2048xi32> loc(#loc69) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc70) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc110) + %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x2048xi32> loc(#loc71) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc72) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc72) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc73) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_19 : tensor<1x2048xf32> loc(#loc131) + %mask_20 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x2048xf32> loc(#loc132) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1> loc(#loc133) + %out_max_22 = arith.select %mask_21, %_tmp3_max, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc134) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_3 : tensor<1x2048xf32> loc(#loc114) + %lhs_scale_23 = arith.subf %_tmp3_max, %out_max_22 : tensor<1x2048xf32> loc(#loc115) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc135) + %lhs_scale_25 = arith.select %lhs_scale, %cst, %lhs_scale_24 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc117) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x2048xf32> loc(#loc118) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc136) + %rhs_scale_27 = arith.select %lhs_scale, %cst, %rhs_scale_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc120) + %out_sum_28 = arith.mulf %_tmp3_sum_12, %lhs_scale_25 : tensor<1x2048xf32> loc(#loc121) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x2048xf32> loc(#loc122) + %_tmp3_max_30 = arith.select %r0_mask, %out_max_22, %_tmp3_max : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc88) + %_tmp3_sum_31 = arith.select %r0_mask, %out_sum_29, %_tmp3_sum_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc89) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc33) + } loc(#loc109) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc123)), %out_max_13: f32 loc(callsite(#loc3 at #loc123))): + %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc142) + %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc143) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc144) + %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc145) + tt.reduce.return %out_max_16 : f32 loc(#loc137) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc137) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc124) + %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc108) + %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc125) + %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x2048xf32> loc(#loc125) + %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc126) + %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc126) + %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc139) + %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32> loc(#loc128) + %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc129)), %out_sum_13: f32 loc(callsite(#loc3 at #loc129))): + %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc146) + tt.reduce.return %out_sum_14 : f32 loc(#loc140) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc140) + %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc98) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc98) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc99) + %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc100) + %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x2048xi32> loc(#loc130) + %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x2048xi32> loc(#loc101) + %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc102) + %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc102) + %tmp5_17 = tt.load %tmp5_16, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp5_18 = arith.extf %tmp5_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104) + %tmp7 = arith.subf %tmp5_18, %delta_6 : tensor<1x2048xf32> loc(#loc105) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc106) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc107) + %tmp9_19 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc107) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc56) + %1 = tt.addptr %0, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc56) + tt.store %1, %tmp9_19, %r0_mask : tensor<1x2048x!tt.ptr> loc(#loc57) + } loc(#loc45) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc63 = loc("delta"(#loc1)) +#loc64 = loc("xoffset"(#loc4)) +#loc65 = loc("r0_base"(#loc5)) +#loc66 = loc("r0_base"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("tmp0"(#loc12)) +#loc73 = loc("tmp0"(#loc13)) +#loc74 = loc("tmp0"(#loc14)) +#loc75 = loc("mask"(#loc15)) +#loc76 = loc("out_max"(#loc16)) +#loc77 = loc("mask"(#loc18)) +#loc78 = loc("mask"(#loc19)) +#loc79 = loc("lhs_scale"(#loc21)) +#loc80 = loc("lhs_scale"(#loc22)) +#loc81 = loc("lhs_scale"(#loc24)) +#loc82 = loc("lhs_scale"(#loc25)) +#loc83 = loc("rhs_scale"(#loc26)) +#loc84 = loc("rhs_scale"(#loc27)) +#loc85 = loc("rhs_scale"(#loc28)) +#loc86 = loc("out_sum"(#loc29)) +#loc87 = loc("out_sum"(#loc30)) +#loc88 = loc("_tmp3_max"(#loc31)) +#loc89 = loc("_tmp3_sum"(#loc32)) +#loc91 = loc("out_max_keepdim"(#loc36)) +#loc92 = loc("delta"(#loc37)) +#loc93 = loc("delta"(#loc38)) +#loc94 = loc("out_sum"(#loc39)) +#loc95 = loc("out_sum"(#loc40)) +#loc97 = loc("tmp4"(#loc44)) +#loc98 = loc("r0_index"(#loc46)) +#loc99 = loc("r0_mask"(#loc47)) +#loc100 = loc("tmp5"(#loc48)) +#loc101 = loc("tmp5"(#loc49)) +#loc102 = loc("tmp5"(#loc50)) +#loc103 = loc("tmp5"(#loc51)) +#loc104 = loc("tmp5"(#loc52)) +#loc105 = loc("tmp7"(#loc53)) +#loc106 = loc("tmp8"(#loc54)) +#loc107 = loc("tmp9"(#loc55)) +#loc108 = loc(callsite(#loc63 at #loc2)) +#loc109 = loc("_tmp3_sum"(#loc67)) +#loc110 = loc(fused[#loc71, #loc70]) +#loc111 = loc("mask"(#loc75)) +#loc112 = loc(callsite(#loc76 at #loc17)) +#loc113 = loc("mask"(#loc78)) +#loc114 = loc(callsite(#loc79 at #loc17)) +#loc115 = loc(callsite(#loc80 at #loc17)) +#loc116 = loc(callsite(#loc81 at #loc17)) +#loc117 = loc(callsite(#loc82 at #loc17)) +#loc118 = loc(callsite(#loc83 at #loc17)) +#loc119 = loc(callsite(#loc84 at #loc17)) +#loc120 = loc(callsite(#loc85 at #loc17)) +#loc121 = loc(callsite(#loc86 at #loc17)) +#loc122 = loc(callsite(#loc87 at #loc17)) +#loc124 = loc(callsite(#loc91 at #loc2)) +#loc125 = loc(callsite(#loc92 at #loc2)) +#loc126 = loc(callsite(#loc93 at #loc2)) +#loc127 = loc(callsite(#loc94 at #loc2)) +#loc128 = loc(callsite(#loc95 at #loc2)) +#loc130 = loc(fused[#loc101, #loc100]) +#loc131 = loc(callsite(#loc111 at #loc112)) +#loc132 = loc(callsite(#loc77 at #loc112)) +#loc133 = loc(callsite(#loc113 at #loc112)) +#loc134 = loc(callsite(#loc20 at #loc112)) +#loc135 = loc(callsite(#loc23 at #loc116)) +#loc136 = loc(callsite(#loc23 at #loc119)) +#loc137 = loc(callsite(#loc34 at #loc123)) +#loc139 = loc(callsite(#loc23 at #loc127)) +#loc140 = loc(callsite(#loc41 at #loc129)) +#loc142 = loc(callsite(#loc111 at #loc137)) +#loc143 = loc(callsite(#loc77 at #loc137)) +#loc144 = loc(callsite(#loc113 at #loc137)) +#loc145 = loc(callsite(#loc20 at #loc137)) +#loc146 = loc(callsite(#loc43 at #loc140)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/__grp__triton_red_fused_sum_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/__grp__triton_red_fused_sum_3.json new file mode 100644 index 0000000000000000000000000000000000000000..66b16d19017f618e5b97d7f8cf4e0cbd8424ace6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/__grp__triton_red_fused_sum_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_sum_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.source", "triton_red_fused_sum_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttir", "triton_red_fused_sum_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttgir", "triton_red_fused_sum_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.llir", "triton_red_fused_sum_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ptx", "triton_red_fused_sum_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.cubin", "triton_red_fused_sum_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7413ad1a36c2b4a71ca27c3cd32925164434ddd3 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4eefe3021f597dfb89ab21ddf97567ed3a103b07 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.json @@ -0,0 +1 @@ +{"hash": "3ec60d04fe3d62cd72e0e59b19d14017b970f8524fe3b56d2275d56d5dc21ded", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_sum_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..b5cf399286e55914162c5382fc93238ead2b5e38 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.llir @@ -0,0 +1,291 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_sum_3(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = icmp samesign ult i32 %7, 2, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 1, !dbg !9 + %11 = and i32 %10, 1022, !dbg !9 + %12 = shl i32 %7, 13, !dbg !10 + %13 = or disjoint i32 %11, %12 + %14 = or disjoint i32 %10, %12 + %15 = or i32 %14, 1024 + %16 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + br i1 %8, label %.split.us.preheader, label %.split.preheader + +.split.preheader: ; preds = %6 + %17 = sext i32 %13 to i64, !dbg !12 + %18 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !13 + %19 = sext i32 %15 to i64, !dbg !13 + %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !13 + %21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %18, i64 %16, i1 false) #4, !dbg !11 + %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %23 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %20, i64 %22, i1 false) #4, !dbg !11 + %24 = or i32 %14, 3072, !dbg !14 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !13 + %26 = getelementptr i8, ptr addrspace(1) %25, i64 16384, !dbg !13 + %27 = sext i32 %24 to i64, !dbg !13 + %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !13 + %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %30 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %26, i64 %29, i1 false) #4, !dbg !11 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %32 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %28, i64 %31, i1 false) #4, !dbg !11 + %33 = or i32 %14, 5120, !dbg !14 + %34 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !13 + %35 = getelementptr i8, ptr addrspace(1) %34, i64 32768, !dbg !13 + %36 = sext i32 %33 to i64, !dbg !13 + %37 = getelementptr i64, ptr addrspace(1) %0, i64 %36, !dbg !13 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %39 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %35, i64 %38, i1 false) #4, !dbg !11 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %41 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %37, i64 %40, i1 false) #4, !dbg !11 + %42 = or i32 %14, 7168, !dbg !14 + %43 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !13 + %44 = getelementptr i8, ptr addrspace(1) %43, i64 49152, !dbg !13 + %45 = sext i32 %42 to i64, !dbg !13 + %46 = getelementptr i64, ptr addrspace(1) %0, i64 %45, !dbg !13 + %47 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %48 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %44, i64 %47, i1 false) #4, !dbg !11 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %50 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %46, i64 %49, i1 false) #4, !dbg !11 + br label %.split10.us, !dbg !9 + +.split.us.preheader: ; preds = %6 + %51 = zext nneg i32 %13 to i64, !dbg !12 + %52 = getelementptr i64, ptr addrspace(1) %0, i64 %51, !dbg !13 + %53 = zext nneg i32 %15 to i64, !dbg !13 + %54 = getelementptr i64, ptr addrspace(1) %0, i64 %53, !dbg !13 + %55 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %52, i64 %16, i1 true) #4, !dbg !11 + %56 = extractvalue { i64, i64 } %55, 0, !dbg !11 + %57 = extractvalue { i64, i64 } %55, 1, !dbg !11 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %59 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %54, i64 %58, i1 true) #4, !dbg !11 + %60 = extractvalue { i64, i64 } %59, 0, !dbg !11 + %61 = extractvalue { i64, i64 } %59, 1, !dbg !11 + %62 = or i32 %14, 3072, !dbg !14 + %63 = getelementptr i64, ptr addrspace(1) %0, i64 %51, !dbg !13 + %64 = getelementptr i8, ptr addrspace(1) %63, i64 16384, !dbg !13 + %65 = zext nneg i32 %62 to i64, !dbg !13 + %66 = getelementptr i64, ptr addrspace(1) %0, i64 %65, !dbg !13 + %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %68 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %64, i64 %67, i1 true) #4, !dbg !11 + %69 = extractvalue { i64, i64 } %68, 0, !dbg !11 + %70 = extractvalue { i64, i64 } %68, 1, !dbg !11 + %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %72 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %66, i64 %71, i1 true) #4, !dbg !11 + %73 = extractvalue { i64, i64 } %72, 0, !dbg !11 + %74 = extractvalue { i64, i64 } %72, 1, !dbg !11 + %75 = or i32 %14, 5120, !dbg !14 + %76 = getelementptr i64, ptr addrspace(1) %0, i64 %51, !dbg !13 + %77 = getelementptr i8, ptr addrspace(1) %76, i64 32768, !dbg !13 + %78 = zext nneg i32 %75 to i64, !dbg !13 + %79 = getelementptr i64, ptr addrspace(1) %0, i64 %78, !dbg !13 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %81 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %77, i64 %80, i1 true) #4, !dbg !11 + %82 = extractvalue { i64, i64 } %81, 0, !dbg !11 + %83 = extractvalue { i64, i64 } %81, 1, !dbg !11 + %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %85 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %79, i64 %84, i1 true) #4, !dbg !11 + %86 = extractvalue { i64, i64 } %85, 0, !dbg !11 + %87 = extractvalue { i64, i64 } %85, 1, !dbg !11 + %88 = or i32 %14, 7168, !dbg !14 + %89 = getelementptr i64, ptr addrspace(1) %0, i64 %51, !dbg !13 + %90 = getelementptr i8, ptr addrspace(1) %89, i64 49152, !dbg !13 + %91 = zext nneg i32 %88 to i64, !dbg !13 + %92 = getelementptr i64, ptr addrspace(1) %0, i64 %91, !dbg !13 + %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %94 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %90, i64 %93, i1 true) #4, !dbg !11 + %95 = extractvalue { i64, i64 } %94, 0, !dbg !11 + %96 = extractvalue { i64, i64 } %94, 1, !dbg !11 + %97 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !11 + %98 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %92, i64 %97, i1 true) #4, !dbg !11 + %99 = extractvalue { i64, i64 } %98, 0, !dbg !11 + %100 = extractvalue { i64, i64 } %98, 1, !dbg !11 + %101 = insertelement <2 x i64> poison, i64 %69, i64 0, !dbg !15 + %102 = insertelement <2 x i64> %101, i64 %70, i64 1, !dbg !15 + %103 = insertelement <2 x i64> poison, i64 %56, i64 0, !dbg !15 + %104 = insertelement <2 x i64> %103, i64 %57, i64 1, !dbg !15 + %105 = add <2 x i64> %102, %104, !dbg !15 + %106 = insertelement <2 x i64> poison, i64 %82, i64 0, !dbg !15 + %107 = insertelement <2 x i64> %106, i64 %83, i64 1, !dbg !15 + %108 = add <2 x i64> %107, %105, !dbg !15 + %109 = insertelement <2 x i64> poison, i64 %95, i64 0, !dbg !15 + %110 = insertelement <2 x i64> %109, i64 %96, i64 1, !dbg !15 + %111 = add <2 x i64> %110, %108, !dbg !15 + %112 = insertelement <2 x i64> poison, i64 %73, i64 0, !dbg !15 + %113 = insertelement <2 x i64> %112, i64 %74, i64 1, !dbg !15 + %114 = insertelement <2 x i64> poison, i64 %60, i64 0, !dbg !15 + %115 = insertelement <2 x i64> %114, i64 %61, i64 1, !dbg !15 + %116 = add <2 x i64> %113, %115, !dbg !15 + %117 = insertelement <2 x i64> poison, i64 %86, i64 0, !dbg !15 + %118 = insertelement <2 x i64> %117, i64 %87, i64 1, !dbg !15 + %119 = add <2 x i64> %118, %116, !dbg !15 + %120 = insertelement <2 x i64> poison, i64 %99, i64 0, !dbg !15 + %121 = insertelement <2 x i64> %120, i64 %100, i64 1, !dbg !15 + %122 = add <2 x i64> %121, %119, !dbg !15 + %shift = shufflevector <2 x i64> %111, <2 x i64> poison, <2 x i32> , !dbg !9 + %foldExtExtBinop = add <2 x i64> %111, %shift, !dbg !9 + %foldExtExtBinop14 = add <2 x i64> %foldExtExtBinop, %122, !dbg !9 + %shift16 = shufflevector <2 x i64> %122, <2 x i64> poison, <2 x i32> , !dbg !9 + %foldExtExtBinop17 = add <2 x i64> %foldExtExtBinop14, %shift16, !dbg !9 + %123 = extractelement <2 x i64> %foldExtExtBinop17, i64 0, !dbg !9 + br label %.split10.us, !dbg !9 + +.split10.us: ; preds = %.split.preheader, %.split.us.preheader + %124 = phi i64 [ %123, %.split.us.preheader ], [ 0, %.split.preheader ], !dbg !16 + %125 = and i32 %9, 31, !dbg !9 + %126 = lshr i32 %9, 5, !dbg !9 + %extelt.offset = lshr i64 %124, 32, !dbg !20 + %127 = trunc nuw i64 %extelt.offset to i32, !dbg !20 + %128 = trunc i64 %124 to i32, !dbg !20 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 16, i32 31), !dbg !20 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %127, i32 16, i32 31), !dbg !20 + %131 = insertelement <2 x i32> poison, i32 %129, i64 0, !dbg !20 + %132 = insertelement <2 x i32> %131, i32 %130, i64 1, !dbg !20 + %133 = bitcast <2 x i32> %132 to i64, !dbg !20 + %134 = add i64 %124, %133, !dbg !16 + %extelt.offset1 = lshr i64 %134, 32, !dbg !20 + %135 = trunc nuw i64 %extelt.offset1 to i32, !dbg !20 + %136 = trunc i64 %134 to i32, !dbg !20 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !20 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 8, i32 31), !dbg !20 + %139 = insertelement <2 x i32> poison, i32 %137, i64 0, !dbg !20 + %140 = insertelement <2 x i32> %139, i32 %138, i64 1, !dbg !20 + %141 = bitcast <2 x i32> %140 to i64, !dbg !20 + %142 = add i64 %134, %141, !dbg !16 + %extelt.offset2 = lshr i64 %142, 32, !dbg !20 + %143 = trunc nuw i64 %extelt.offset2 to i32, !dbg !20 + %144 = trunc i64 %142 to i32, !dbg !20 + %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !20 + %146 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %143, i32 4, i32 31), !dbg !20 + %147 = insertelement <2 x i32> poison, i32 %145, i64 0, !dbg !20 + %148 = insertelement <2 x i32> %147, i32 %146, i64 1, !dbg !20 + %149 = bitcast <2 x i32> %148 to i64, !dbg !20 + %150 = add i64 %142, %149, !dbg !16 + %extelt.offset3 = lshr i64 %150, 32, !dbg !20 + %151 = trunc nuw i64 %extelt.offset3 to i32, !dbg !20 + %152 = trunc i64 %150 to i32, !dbg !20 + %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 2, i32 31), !dbg !20 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 2, i32 31), !dbg !20 + %155 = insertelement <2 x i32> poison, i32 %153, i64 0, !dbg !20 + %156 = insertelement <2 x i32> %155, i32 %154, i64 1, !dbg !20 + %157 = bitcast <2 x i32> %156 to i64, !dbg !20 + %158 = add i64 %150, %157, !dbg !16 + %extelt.offset4 = lshr i64 %158, 32, !dbg !20 + %159 = trunc nuw i64 %extelt.offset4 to i32, !dbg !20 + %160 = trunc i64 %158 to i32, !dbg !20 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !20 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 1, i32 31), !dbg !20 + %163 = insertelement <2 x i32> poison, i32 %161, i64 0, !dbg !20 + %164 = insertelement <2 x i32> %163, i32 %162, i64 1, !dbg !20 + %165 = bitcast <2 x i32> %164 to i64, !dbg !20 + %166 = add i64 %158, %165, !dbg !16 + %167 = and i32 %126, 15, !dbg !20 + %168 = icmp eq i32 %125, 0, !dbg !20 + %169 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %167, !dbg !20 + %170 = insertelement <1 x i64> poison, i64 %166, i64 0, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %169, <1 x i64> %170, i1 %168) #4, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !20 + %171 = icmp samesign ult i32 %9, 16, !dbg !20 + %172 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %9, !dbg !20 + %173 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %172, i1 %171) #4, !dbg !20 + %extelt.offset5 = lshr i64 %173, 32, !dbg !20 + %174 = trunc nuw i64 %extelt.offset5 to i32, !dbg !20 + %175 = trunc i64 %173 to i32, !dbg !20 + %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 8, i32 31), !dbg !20 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 8, i32 31), !dbg !20 + %178 = insertelement <2 x i32> poison, i32 %176, i64 0, !dbg !20 + %179 = insertelement <2 x i32> %178, i32 %177, i64 1, !dbg !20 + %180 = bitcast <2 x i32> %179 to i64, !dbg !20 + %181 = add i64 %173, %180, !dbg !16 + %extelt.offset6 = lshr i64 %181, 32, !dbg !20 + %182 = trunc nuw i64 %extelt.offset6 to i32, !dbg !20 + %183 = trunc i64 %181 to i32, !dbg !20 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 4, i32 31), !dbg !20 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 4, i32 31), !dbg !20 + %186 = insertelement <2 x i32> poison, i32 %184, i64 0, !dbg !20 + %187 = insertelement <2 x i32> %186, i32 %185, i64 1, !dbg !20 + %188 = bitcast <2 x i32> %187 to i64, !dbg !20 + %189 = add i64 %181, %188, !dbg !16 + %extelt.offset7 = lshr i64 %189, 32, !dbg !20 + %190 = trunc nuw i64 %extelt.offset7 to i32, !dbg !20 + %191 = trunc i64 %189 to i32, !dbg !20 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 2, i32 31), !dbg !20 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 2, i32 31), !dbg !20 + %194 = insertelement <2 x i32> poison, i32 %192, i64 0, !dbg !20 + %195 = insertelement <2 x i32> %194, i32 %193, i64 1, !dbg !20 + %196 = bitcast <2 x i32> %195 to i64, !dbg !20 + %197 = add i64 %189, %196, !dbg !16 + %extelt.offset8 = lshr i64 %197, 32, !dbg !20 + %198 = trunc nuw i64 %extelt.offset8 to i32, !dbg !20 + %199 = trunc i64 %197 to i32, !dbg !20 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !20 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !20 + %202 = insertelement <2 x i32> poison, i32 %200, i64 0, !dbg !20 + %203 = insertelement <2 x i32> %202, i32 %201, i64 1, !dbg !20 + %204 = bitcast <2 x i32> %203 to i64, !dbg !20 + %205 = add i64 %197, %204, !dbg !16 + %206 = icmp eq i32 %9, 0, !dbg !20 + %207 = insertelement <1 x i64> poison, i64 %205, i64 0, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %172, <1 x i64> %207, i1 %206) #4, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !20 + %208 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !20 + %209 = zext nneg i32 %7 to i64, !dbg !21 + %210 = getelementptr i64, ptr addrspace(1) %1, i64 %209, !dbg !21 + %211 = and i32 %9, 511, !dbg !22 + %212 = icmp eq i32 %211, 0, !dbg !22 + %213 = and i1 %8, %212, !dbg !22 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %208, ptr addrspace(1) %210, i1 %213) #4, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_sum_3", linkageName: "triton_red_fused_sum_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 25, column: 21, scope: !4) +!9 = !DILocation(line: 26, column: 37, scope: !4) +!10 = !DILocation(line: 36, column: 46, scope: !4) +!11 = !DILocation(line: 36, column: 51, scope: !4) +!12 = !DILocation(line: 30, column: 40, scope: !4) +!13 = !DILocation(line: 36, column: 34, scope: !4) +!14 = !DILocation(line: 36, column: 41, scope: !4) +!15 = !DILocation(line: 39, column: 48, scope: !4) +!16 = !DILocation(line: 261, column: 15, scope: !17, inlinedAt: !19) +!17 = distinct !DILexicalBlockFile(scope: !4, file: !18, discriminator: 0) +!18 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!19 = !DILocation(line: 40, column: 25, scope: !4) +!20 = !DILocation(line: 291, column: 36, scope: !17, inlinedAt: !19) +!21 = !DILocation(line: 41, column: 25, scope: !4) +!22 = !DILocation(line: 41, column: 36, scope: !4) +!23 = !DILocation(line: 41, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..8866f1a3b35667463e370954cad77d667c9a5895 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ptx @@ -0,0 +1,626 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_sum_3 // -- Begin function triton_red_fused_sum_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_sum_3 +.visible .entry triton_red_fused_sum_3( + .param .u64 .ptr .global .align 1 triton_red_fused_sum_3_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_sum_3_param_1, + .param .u32 triton_red_fused_sum_3_param_2, + .param .u32 triton_red_fused_sum_3_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_sum_3_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_sum_3_param_5 +) +.reqntid 512 +{ + .reg .pred %p<24>; + .reg .b32 %r<60>; + .reg .b64 %rd<149>; + .loc 1 18 0 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:18:0 + +// %bb.0: + ld.param.b64 %rd5, [triton_red_fused_sum_3_param_1]; + ld.param.b64 %rd4, [triton_red_fused_sum_3_param_0]; +$L__tmp0: + .loc 1 23 28 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 25 21 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:25:21 + setp.lt.u32 %p1, %r1, 2; + .loc 1 26 37 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r6, %r2, 1; + and.b32 %r7, %r6, 1022; + .loc 1 36 46 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:46 + shl.b32 %r8, %r1, 13; + or.b32 %r3, %r7, %r8; + or.b32 %r4, %r6, %r8; + or.b32 %r5, %r4, 1024; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: // %.split.us.preheader + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + mad.wide.u32 %rd49, %r3, 8, %rd4; + mad.wide.u32 %rd54, %r5, 8, %rd4; + mov.pred %p10, -1; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd47, 0x0; + mov.u64 %rd48, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd47, %rd48 }, [ %rd49 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + mov.u64 %rd53, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd52, %rd53 }, [ %rd54 + 0 ], %rd55; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r12, %r4, 3072; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd59, %rd49, 16384; + mad.wide.u32 %rd64, %r12, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + mov.u64 %rd58, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd57, %rd58 }, [ %rd59 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + mov.u64 %rd63, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd62, %rd63 }, [ %rd64 + 0 ], %rd65; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r13, %r4, 5120; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd69, %rd49, 32768; + mad.wide.u32 %rd74, %r13, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + mov.u64 %rd68, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd67, %rd68 }, [ %rd69 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + mov.u64 %rd73, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd72, %rd73 }, [ %rd74 + 0 ], %rd75; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r14, %r4, 7168; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd79, %rd49, 49152; + mad.wide.u32 %rd84, %r14, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + mov.u64 %rd78, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd77, %rd78 }, [ %rd79 + 0 ], %rd80; + // end inline asm + // begin inline asm + mov.u64 %rd85, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd85, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd82, 0x0; + mov.u64 %rd83, 0x0; + @%p10 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd82, %rd83 }, [ %rd84 + 0 ], %rd85; + // end inline asm + .loc 1 39 48 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:39:48 + add.s64 %rd86, %rd58, %rd48; + add.s64 %rd87, %rd57, %rd47; + add.s64 %rd88, %rd67, %rd87; + add.s64 %rd89, %rd68, %rd86; + add.s64 %rd90, %rd78, %rd89; + add.s64 %rd91, %rd77, %rd88; + add.s64 %rd92, %rd63, %rd53; + add.s64 %rd93, %rd62, %rd52; + add.s64 %rd94, %rd72, %rd93; + add.s64 %rd95, %rd73, %rd92; + add.s64 %rd96, %rd83, %rd95; + add.s64 %rd97, %rd82, %rd94; + .loc 1 26 37 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:26:37 + add.s64 %rd98, %rd91, %rd90; + add.s64 %rd99, %rd98, %rd97; + add.s64 %rd148, %rd99, %rd96; + bra.uni $L__BB0_3; +$L__BB0_1: // %.split.preheader + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + mad.wide.s32 %rd9, %r3, 8, %rd4; + mad.wide.s32 %rd14, %r5, 8, %rd4; + mov.pred %p2, 0; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd7, 0x0; + mov.u64 %rd8, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd7, %rd8 }, [ %rd9 + 0 ], %rd6; + // end inline asm + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + mov.u64 %rd13, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd12, %rd13 }, [ %rd14 + 0 ], %rd15; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r9, %r4, 3072; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd19, %rd9, 16384; + mad.wide.s32 %rd24, %r9, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + mov.u64 %rd18, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd17, %rd18 }, [ %rd19 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + mov.u64 %rd23, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd22, %rd23 }, [ %rd24 + 0 ], %rd25; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r10, %r4, 5120; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd29, %rd9, 32768; + mad.wide.s32 %rd34, %r10, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + mov.u64 %rd28, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd27, %rd28 }, [ %rd29 + 0 ], %rd30; + // end inline asm + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd32, 0x0; + mov.u64 %rd33, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd32, %rd33 }, [ %rd34 + 0 ], %rd35; + // end inline asm + .loc 1 36 41 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:41 + or.b32 %r11, %r4, 7168; + .loc 1 36 34 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:34 + add.s64 %rd39, %rd9, 49152; + mad.wide.s32 %rd44, %r11, 8, %rd4; + .loc 1 36 51 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:36:51 + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + mov.u64 %rd38, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd37, %rd38 }, [ %rd39 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + mov.u64 %rd43, 0x0; + @%p2 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd42, %rd43 }, [ %rd44 + 0 ], %rd45; + // end inline asm + mov.b64 %rd148, 0; +$L__BB0_3: // %.split10.us + .loc 1 26 37 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:26:37 + and.b32 %r18, %r2, 31; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r19}, %rd148; + cvt.u32.u64 %r20, %rd148; + shfl.sync.bfly.b32 %r21, %r20, 16, 31, -1; + shfl.sync.bfly.b32 %r22, %r19, 16, 31, -1; + cvt.u64.u32 %rd105, %r21; + cvt.u64.u32 %rd106, %r22; + shl.b64 %rd107, %rd106, 32; + or.b64 %rd108, %rd105, %rd107; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd109, %rd148, %rd108; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r23}, %rd109; + cvt.u32.u64 %r24, %rd109; + shfl.sync.bfly.b32 %r25, %r24, 8, 31, -1; + shfl.sync.bfly.b32 %r26, %r23, 8, 31, -1; + cvt.u64.u32 %rd110, %r25; + cvt.u64.u32 %rd111, %r26; + shl.b64 %rd112, %rd111, 32; + or.b64 %rd113, %rd110, %rd112; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd114, %rd109, %rd113; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r27}, %rd114; + cvt.u32.u64 %r28, %rd114; + shfl.sync.bfly.b32 %r29, %r28, 4, 31, -1; + shfl.sync.bfly.b32 %r30, %r27, 4, 31, -1; + cvt.u64.u32 %rd115, %r29; + cvt.u64.u32 %rd116, %r30; + shl.b64 %rd117, %rd116, 32; + or.b64 %rd118, %rd115, %rd117; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd119, %rd114, %rd118; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r31}, %rd119; + cvt.u32.u64 %r32, %rd119; + shfl.sync.bfly.b32 %r33, %r32, 2, 31, -1; + shfl.sync.bfly.b32 %r34, %r31, 2, 31, -1; + cvt.u64.u32 %rd120, %r33; + cvt.u64.u32 %rd121, %r34; + shl.b64 %rd122, %rd121, 32; + or.b64 %rd123, %rd120, %rd122; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd124, %rd119, %rd123; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r35}, %rd124; + cvt.u32.u64 %r36, %rd124; + shfl.sync.bfly.b32 %r37, %r36, 1, 31, -1; + shfl.sync.bfly.b32 %r38, %r35, 1, 31, -1; + cvt.u64.u32 %rd125, %r37; + cvt.u64.u32 %rd126, %r38; + shl.b64 %rd127, %rd126, 32; + or.b64 %rd128, %rd125, %rd127; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd100, %rd124, %rd128; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + setp.eq.b32 %p18, %r18, 0; + shr.u32 %r39, %r2, 2; + and.b32 %r40, %r39, 120; + mov.b32 %r41, global_smem; + add.s32 %r15, %r41, %r40; + // begin inline asm + @%p18 st.shared.b64 [ %r15 + 0 ], %rd100; + // end inline asm + bar.sync 0; + setp.lt.u32 %p19, %r2, 16; + shl.b32 %r42, %r2, 3; + add.s32 %r16, %r41, %r42; + // begin inline asm + @%p19 ld.shared.b64 %rd101, [ %r16 + 0 ]; + // end inline asm + mov.b64 {_, %r43}, %rd101; + cvt.u32.u64 %r44, %rd101; + shfl.sync.bfly.b32 %r45, %r44, 8, 31, -1; + shfl.sync.bfly.b32 %r46, %r43, 8, 31, -1; + cvt.u64.u32 %rd129, %r45; + cvt.u64.u32 %rd130, %r46; + shl.b64 %rd131, %rd130, 32; + or.b64 %rd132, %rd129, %rd131; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd133, %rd101, %rd132; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r47}, %rd133; + cvt.u32.u64 %r48, %rd133; + shfl.sync.bfly.b32 %r49, %r48, 4, 31, -1; + shfl.sync.bfly.b32 %r50, %r47, 4, 31, -1; + cvt.u64.u32 %rd134, %r49; + cvt.u64.u32 %rd135, %r50; + shl.b64 %rd136, %rd135, 32; + or.b64 %rd137, %rd134, %rd136; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd138, %rd133, %rd137; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r51}, %rd138; + cvt.u32.u64 %r52, %rd138; + shfl.sync.bfly.b32 %r53, %r52, 2, 31, -1; + shfl.sync.bfly.b32 %r54, %r51, 2, 31, -1; + cvt.u64.u32 %rd139, %r53; + cvt.u64.u32 %rd140, %r54; + shl.b64 %rd141, %rd140, 32; + or.b64 %rd142, %rd139, %rd141; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd143, %rd138, %rd142; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + mov.b64 {_, %r55}, %rd143; + cvt.u32.u64 %r56, %rd143; + shfl.sync.bfly.b32 %r57, %r56, 1, 31, -1; + shfl.sync.bfly.b32 %r58, %r55, 1, 31, -1; + cvt.u64.u32 %rd144, %r57; + cvt.u64.u32 %rd145, %r58; + shl.b64 %rd146, %rd145, 32; + or.b64 %rd147, %rd144, %rd146; + .loc 2 261 15 // standard.py:261:15 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + add.s64 %rd102, %rd143, %rd147; + .loc 2 291 36 // standard.py:291:36 @[ cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:40:25 ] + setp.eq.b32 %p20, %r2, 0; + // begin inline asm + @%p20 st.shared.b64 [ %r16 + 0 ], %rd102; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd103, [global_smem]; +$L__tmp2: + .loc 1 41 25 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:41:25 + mad.wide.u32 %rd104, %r1, 8, %rd5; + .loc 1 41 36 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:41:36 + and.b32 %r59, %r2, 511; + setp.eq.b32 %p23, %r59, 0; + and.pred %p21, %p1, %p23; + // begin inline asm + @%p21 st.global.b64 [ %rd104 + 0 ], { %rd103 }; + // end inline asm + .loc 1 41 4 // cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py:41:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 207 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xc8 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 117 +.b8 111 +.b8 102 +.b8 108 +.b8 118 +.b8 50 +.b8 100 +.b8 110 +.b8 108 +.b8 114 +.b8 114 +.b8 110 +.b8 113 +.b8 116 +.b8 102 +.b8 51 +.b8 50 +.b8 116 +.b8 113 +.b8 122 +.b8 52 +.b8 51 +.b8 53 +.b8 112 +.b8 122 +.b8 112 +.b8 119 +.b8 51 +.b8 104 +.b8 118 +.b8 114 +.b8 109 +.b8 97 +.b8 109 +.b8 122 +.b8 109 +.b8 115 +.b8 52 +.b8 115 +.b8 105 +.b8 119 +.b8 115 +.b8 118 +.b8 120 +.b8 101 +.b8 108 +.b8 106 +.b8 113 +.b8 99 +.b8 55 +.b8 107 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 117 +.b8 111 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x19 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa4:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb9:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.source new file mode 100644 index 0000000000000000000000000000000000000000..7caba5745ce5e3589f321b66a9f5cc82885ec5a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.source @@ -0,0 +1,144 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":18:0) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc31 = loc(unknown) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("out_ptr0"(#loc)) +#loc40 = loc("xnumel"(#loc)) +#loc41 = loc("r0_numel"(#loc)) +#loc66 = loc("input"(#loc29)) +#loc67 = loc("a"(#loc34)) +#loc68 = loc("b"(#loc34)) +module { + tt.func public @triton_red_fused_sum_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2 : i32 loc(#loc42) + %r0_numel_1 = arith.constant 8192 : i32 loc(#loc43) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_2 = arith.constant 1 : i32 loc(#loc45) + %xoffset_3 = arith.constant 1 : i32 loc(#loc45) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc45) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc46) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc47) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc48) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc48) + %xmask = arith.constant dense<2> : tensor<1x1xi32> loc(#loc49) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc50) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc51) + %_tmp2 = arith.constant 0 : i64 loc(#loc52) + %_tmp2_10 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc52) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp2_11 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_13 = %_tmp2_10) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc54) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc54) + %r0_mask = arith.constant dense<8192> : tensor<1x2048xi32> loc(#loc55) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc55) + %tmp0 = arith.constant 8192 : i32 loc(#loc56) + %tmp0_16 = arith.constant 8192 : i32 loc(#loc56) + %tmp0_17 = arith.constant dense<8192> : tensor<1x1xi32> loc(#loc56) + %tmp0_18 = arith.muli %tmp0_17, %xindex_7 : tensor<1x1xi32> loc(#loc56) + %tmp0_19 = tt.broadcast %tmp0_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc57) + %tmp0_20 = arith.addi %r0_index_14, %tmp0_19 : tensor<1x2048xi32> loc(#loc57) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc58) + %tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc58) + %tmp0_23 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc59) + %tmp0_24 = arith.andi %r0_mask_15, %tmp0_23 : tensor<1x2048xi1> loc(#loc59) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc60) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc60) + %tmp0_27 = arith.fptosi %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc60) + %tmp0_28 = tt.load %tmp0_22, %tmp0_24, %tmp0_27 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc60) + %tmp3 = arith.addi %_tmp2_13, %tmp0_28 : tensor<1x2048xi64> loc(#loc61) + %_tmp2_29 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc62) + %_tmp2_30 = arith.andi %r0_mask_15, %_tmp2_29 : tensor<1x2048xi1> loc(#loc62) + %_tmp2_31 = arith.select %_tmp2_30, %tmp3, %_tmp2_13 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc63) + scf.yield %_tmp2_31 : tensor<1x2048xi64> loc(#loc23) + } loc(#loc53) + %tmp2 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp2_11) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc64) + %tmp2_12 = tt.expand_dims %tmp2 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc65) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc26) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc26) + tt.store %5, %tmp2_12, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc27) + tt.return loc(#loc28) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc29))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc30) + tt.reduce.return %2 : i64 loc(#loc30) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc30) + tt.return %0 : tensor<1xi64> loc(#loc32) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc33) + tt.return %1 : tensor<1xi64> loc(#loc33) + } loc(#loc29) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc34)), %b: i64 loc("b"(#loc34))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc35) + tt.return %0 : i64 loc(#loc36) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc37) + tt.return %1 : i64 loc(#loc37) + } loc(#loc34) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":25:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":29:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:61) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:51) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:25) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:4) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc42 = loc("xnumel"(#loc1)) +#loc43 = loc("r0_numel"(#loc2)) +#loc44 = loc("xoffset"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xindex"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xmask"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("_tmp2"(#loc11)) +#loc53 = loc("_tmp2"(#loc12)) +#loc54 = loc("r0_index"(#loc13)) +#loc55 = loc("r0_mask"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp3"(#loc20)) +#loc62 = loc("_tmp2"(#loc21)) +#loc63 = loc("_tmp2"(#loc22)) +#loc64 = loc("tmp2"(#loc24)) +#loc65 = loc("tmp2"(#loc25)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..653af042411c41156d660dd4d6e0b4d16985066f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttgir @@ -0,0 +1,91 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":18:0) +#loc1 = loc(unknown) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:25) +#loc23 = loc("in_ptr0"(#loc)) +#loc24 = loc("out_ptr0"(#loc)) +#loc25 = loc("xnumel"(#loc)) +#loc26 = loc("r0_numel"(#loc)) +#loc40 = loc("tmp2"(#loc17)) +#loc45 = loc(callsite(#loc1 at #loc40)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_sum_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<8192> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c8192_i32 = arith.constant 8192 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc27) + %xmask = arith.cmpi slt, %xoffset, %c2_i32 : i32 loc(#loc28) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc29) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc29) + %tmp0 = arith.muli %xoffset, %c8192_i32 : i32 loc(#loc30) + %tmp0_2 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc42) + %tmp0_3 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc32) + %tmp0_4 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc43) + %_tmp2 = scf.for %_tmp2_7 = %c0_i32 to %c8192_i32 step %c2048_i32 iter_args(%arg5 = %cst_0) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc35) + %r0_index_8 = arith.addi %r0_index, %r0_base_1 : tensor<1x2048xi32, #blocked> loc(#loc35) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst : tensor<1x2048xi32, #blocked> loc(#loc36) + %tmp0_9 = arith.addi %r0_index_8, %tmp0_2 : tensor<1x2048xi32, #blocked> loc(#loc31) + %tmp0_10 = tt.addptr %tmp0_3, %tmp0_9 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc32) + %tmp0_11 = arith.andi %r0_mask, %tmp0_4 : tensor<1x2048xi1, #blocked> loc(#loc33) + %tmp0_12 = tt.load %tmp0_10, %tmp0_11, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc37) + %tmp3 = arith.addi %arg5, %tmp0_12 : tensor<1x2048xi64, #blocked> loc(#loc38) + %_tmp2_13 = arith.select %tmp0_11, %tmp3, %arg5 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc39) + scf.yield %_tmp2_13 : tensor<1x2048xi64, #blocked> loc(#loc15) + } loc(#loc34) + %tmp2 = "tt.reduce"(%_tmp2) <{axis = 1 : i32}> ({ + ^bb0(%tmp2_7: i64 loc(callsite(#loc1 at #loc40)), %tmp2_8: i64 loc(callsite(#loc1 at #loc40))): + %tmp2_9 = arith.addi %tmp2_7, %tmp2_8 : i64 loc(#loc46) + tt.reduce.return %tmp2_9 : i64 loc(#loc44) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc44) + %tmp2_5 = ttg.convert_layout %tmp2 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc41) + %tmp2_6 = tt.expand_dims %tmp2_5 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc41) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc20) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc21) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc21) + tt.store %1, %tmp2_6, %2 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc21) + tt.return loc(#loc22) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":26:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:46) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:61) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":30:40) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:51) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":38:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:48) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:8) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:25) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:36) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:4) +#loc27 = loc("xoffset"(#loc2)) +#loc28 = loc("xmask"(#loc3)) +#loc29 = loc("r0_base"(#loc4)) +#loc30 = loc("tmp0"(#loc5)) +#loc31 = loc("tmp0"(#loc6)) +#loc32 = loc("tmp0"(#loc7)) +#loc33 = loc("tmp0"(#loc8)) +#loc34 = loc("_tmp2"(#loc9)) +#loc35 = loc("r0_index"(#loc10)) +#loc36 = loc("r0_mask"(#loc11)) +#loc37 = loc("tmp0"(#loc12)) +#loc38 = loc("tmp3"(#loc13)) +#loc39 = loc("_tmp2"(#loc14)) +#loc41 = loc("tmp2"(#loc19)) +#loc42 = loc(fused[#loc31, #loc30]) +#loc43 = loc(fused[#loc33, #loc28]) +#loc44 = loc(callsite(#loc16 at #loc40)) +#loc46 = loc(callsite(#loc18 at #loc44)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..6a6b13aec80d58dcc557c7efad1f940c5639f92c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/H3DA2BH6HVRM24XA4WNRTUKAC64XB6CSJ7R3K3JCOXKW2XOCDXWQ/triton_red_fused_sum_3.ttir @@ -0,0 +1,90 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":18:0) +#loc3 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:25) +#loc24 = loc("in_ptr0"(#loc)) +#loc25 = loc("out_ptr0"(#loc)) +#loc26 = loc("xnumel"(#loc)) +#loc27 = loc("r0_numel"(#loc)) +#loc42 = loc("tmp2"(#loc18)) +#loc47 = loc(callsite(#loc3 at #loc42)) +module { + tt.func public @triton_red_fused_sum_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 2 : i32 loc(#loc28) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c8192_i32 = arith.constant 8192 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst = arith.constant dense<8192> : tensor<1x2048xi32> loc(#loc3) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc29) + %xmask_1 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc28) + %xmask_2 = tt.splat %xmask_1 : i1 -> tensor<1x1xi1> loc(#loc28) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc30) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc31) + %_tmp2 = scf.for %r0_offset = %c0_i32 to %c8192_i32 step %c2048_i32 iter_args(%_tmp2_5 = %cst_0) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc33) + %r0_index_6 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc33) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst : tensor<1x2048xi32> loc(#loc34) + %tmp0 = arith.muli %xoffset, %c8192_i32 : i32 loc(#loc35) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc44) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc36) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc37) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc37) + %tmp0_11 = tt.splat %xmask_1 : i1 -> tensor<1x2048xi1> loc(#loc45) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x2048xi1> loc(#loc38) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc39) + %tmp3 = arith.addi %_tmp2_5, %tmp0_13 : tensor<1x2048xi64> loc(#loc40) + %_tmp2_14 = arith.select %tmp0_12, %tmp3, %_tmp2_5 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc41) + scf.yield %_tmp2_14 : tensor<1x2048xi64> loc(#loc16) + } loc(#loc32) + %tmp2 = "tt.reduce"(%_tmp2) <{axis = 1 : i32}> ({ + ^bb0(%tmp2_5: i64 loc(callsite(#loc3 at #loc42)), %tmp2_6: i64 loc(callsite(#loc3 at #loc42))): + %tmp2_7 = arith.addi %tmp2_5, %tmp2_6 : i64 loc(#loc48) + tt.reduce.return %tmp2_7 : i64 loc(#loc46) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc46) + %tmp2_4 = tt.expand_dims %tmp2 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc43) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc21) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc21) + tt.store %1, %tmp2_4, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc22) + tt.return loc(#loc23) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":25:21) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":30:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":31:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":32:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:46) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":36:51) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":40:28) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:36) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/uo/cuoflv2dnlrrnqtf32tqz435pzpw3hvrmamzms4siwsvxeljqc7k.py":41:4) +#loc28 = loc("xmask"(#loc1)) +#loc29 = loc("xoffset"(#loc4)) +#loc30 = loc("r0_base"(#loc5)) +#loc31 = loc("r0_base"(#loc6)) +#loc32 = loc("_tmp2"(#loc2)) +#loc33 = loc("r0_index"(#loc7)) +#loc34 = loc("r0_mask"(#loc8)) +#loc35 = loc("tmp0"(#loc9)) +#loc36 = loc("tmp0"(#loc10)) +#loc37 = loc("tmp0"(#loc11)) +#loc38 = loc("tmp0"(#loc12)) +#loc39 = loc("tmp0"(#loc13)) +#loc40 = loc("tmp3"(#loc14)) +#loc41 = loc("_tmp2"(#loc15)) +#loc43 = loc("tmp2"(#loc20)) +#loc44 = loc(fused[#loc36, #loc35]) +#loc45 = loc(fused[#loc38, #loc28]) +#loc46 = loc(callsite(#loc17 at #loc42)) +#loc48 = loc(callsite(#loc19 at #loc46)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c75dc80429951c2b0652af337c253effb342c88c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6a43329304806a6b3a58a2bb99069594944c1704 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..20b6273d5884981f847dcb5662770efa2490596d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "3810bbf73eb8dc4dbcc7866fc0040909088e275481d88f556d9c0d3f05994c21", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..90ed55d8b19831d9df9a7572dd6443c1fb0e11f2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.llir @@ -0,0 +1,450 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp slt i32 %9, %4, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 511, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %2, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = mul i64 %14, %3, !dbg !12 + %.idx = mul nuw nsw i64 %.decomposed, 128000 + %17 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %17, i64 %16, !dbg !13 + %18 = zext nneg i32 %12 to i64, !dbg !13 + %19 = insertelement <2 x i1> poison, i1 %10, i64 0, !dbg !14 + %20 = shufflevector <2 x i1> %19, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !15 + br label %21, !dbg !13 + +21: ; preds = %8, %21 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %21 ] + %22 = phi i32 [ 2147483647, %8 ], [ %106, %21 ] + %23 = phi i32 [ 2147483647, %8 ], [ %107, %21 ] + %24 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %104, %21 ] + %25 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %108, %21 ] + %26 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %105, %21 ] + %27 = or disjoint i64 %indvars.iv, %18, !dbg !16 + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !16 + %29 = or disjoint i32 %11, %28, !dbg !16 + %30 = or i32 %29, 512, !dbg !16 + %31 = or disjoint i64 %27, 1024, !dbg !16 + %32 = or i32 %29, 1536, !dbg !16 + %33 = icmp samesign ult i32 %30, 32000, !dbg !17 + %34 = icmp samesign ult i64 %31, 32000, !dbg !17 + %35 = icmp samesign ult i32 %32, 32000, !dbg !17 + %36 = zext nneg i32 %30 to i64, !dbg !18 + %37 = zext nneg i32 %32 to i64, !dbg !18 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %27, !dbg !19 + %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19 + %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !19 + %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19 + %38 = and i1 %10, %33, !dbg !15 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %39, i1 %10) #4, !dbg !20 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %42 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep3, i64 %41, i1 %38) #4, !dbg !20 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %44 = fcmp uno <2 x float> %24, zeroinitializer, !dbg !21 + %45 = fcmp uno <2 x float> %26, zeroinitializer, !dbg !21 + %46 = sext i32 %22 to i64, !dbg !25 + %47 = icmp sgt i64 %27, %46, !dbg !25 + %48 = icmp slt i32 %23, %30, !dbg !25 + %49 = extractelement <2 x i32> %25, i64 0, !dbg !25 + %50 = sext i32 %49 to i64, !dbg !25 + %51 = icmp sgt i64 %31, %50, !dbg !25 + %52 = extractelement <2 x i32> %25, i64 1, !dbg !25 + %53 = icmp slt i32 %52, %32, !dbg !25 + %54 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !20 + %55 = insertelement <2 x i32> %54, i32 %42, i64 1, !dbg !20 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !20 + %57 = fcmp ogt <2 x float> %24, %56, !dbg !26 + %58 = fcmp oeq <2 x float> %24, %56, !dbg !27 + %59 = fcmp uno <2 x float> %56, zeroinitializer, !dbg !28 + %60 = xor <2 x i1> %59, splat (i1 true), !dbg !29 + %61 = and <2 x i1> %44, %60, !dbg !30 + %62 = or <2 x i1> %57, %61, !dbg !31 + %63 = and <2 x i1> %44, %59, !dbg !32 + %64 = or <2 x i1> %58, %63, !dbg !33 + %65 = insertelement <2 x i1> poison, i1 %47, i64 0, !dbg !34 + %66 = insertelement <2 x i1> %65, i1 %48, i64 1, !dbg !34 + %67 = and <2 x i1> %66, %64, !dbg !34 + %68 = or <2 x i1> %62, %67, !dbg !35 + %69 = select <2 x i1> %68, <2 x float> %24, <2 x float> %56, !dbg !36 + %70 = trunc nuw nsw i64 %27 to i32, !dbg !37 + %71 = extractelement <2 x i1> %68, i64 0, !dbg !37 + %72 = select i1 %71, i32 %22, i32 %70, !dbg !37 + %73 = extractelement <2 x i1> %68, i64 1, !dbg !37 + %74 = select i1 %73, i32 %23, i32 %30, !dbg !37 + %75 = trunc nuw nsw i64 %31 to i32, !dbg !37 + %76 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !15 + %77 = insertelement <2 x i1> %76, i1 %35, i64 1, !dbg !15 + %78 = and <2 x i1> %20, %77, !dbg !15 + %79 = extractelement <2 x i1> %78, i64 0, !dbg !20 + %80 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep5, i64 %43, i1 %79) #4, !dbg !20 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %82 = extractelement <2 x i1> %78, i64 1, !dbg !20 + %83 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep7, i64 %81, i1 %82) #4, !dbg !20 + %84 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !20 + %85 = insertelement <2 x i32> %84, i32 %83, i64 1, !dbg !20 + %86 = bitcast <2 x i32> %85 to <2 x float>, !dbg !20 + %87 = fcmp ogt <2 x float> %26, %86, !dbg !26 + %88 = fcmp oeq <2 x float> %26, %86, !dbg !27 + %89 = fcmp uno <2 x float> %86, zeroinitializer, !dbg !28 + %90 = xor <2 x i1> %89, splat (i1 true), !dbg !29 + %91 = and <2 x i1> %45, %90, !dbg !30 + %92 = or <2 x i1> %87, %91, !dbg !31 + %93 = and <2 x i1> %45, %89, !dbg !32 + %94 = or <2 x i1> %88, %93, !dbg !33 + %95 = insertelement <2 x i1> poison, i1 %51, i64 0, !dbg !34 + %96 = insertelement <2 x i1> %95, i1 %53, i64 1, !dbg !34 + %97 = and <2 x i1> %96, %94, !dbg !34 + %98 = or <2 x i1> %92, %97, !dbg !35 + %99 = select <2 x i1> %98, <2 x float> %26, <2 x float> %86, !dbg !36 + %100 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !37 + %101 = insertelement <2 x i32> %100, i32 %32, i64 1, !dbg !37 + %102 = select <2 x i1> %98, <2 x i32> %25, <2 x i32> %101, !dbg !37 + %103 = insertelement <2 x i1> %19, i1 %38, i64 1, !dbg !14 + %104 = select <2 x i1> %103, <2 x float> %69, <2 x float> %24, !dbg !14 + %105 = select <2 x i1> %78, <2 x float> %99, <2 x float> %26, !dbg !14 + %106 = select i1 %10, i32 %72, i32 %22, !dbg !38 + %107 = select i1 %38, i32 %74, i32 %23, !dbg !38 + %108 = select <2 x i1> %78, <2 x i32> %102, <2 x i32> %25, !dbg !38 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %109 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !13 + br i1 %109, label %21, label %110, !dbg !13 + +110: ; preds = %21 + %111 = and i32 %11, 31, !dbg !9 + %112 = lshr i32 %11, 5, !dbg !9 + %113 = shufflevector <2 x float> %104, <2 x float> poison, <2 x i32> , !dbg !39 + %114 = fcmp ogt <2 x float> %104, %113, !dbg !39 + %115 = fcmp oeq <2 x float> %104, %113, !dbg !39 + %116 = shufflevector <2 x i1> %114, <2 x i1> %115, <2 x i32> , !dbg !39 + %117 = extractelement <2 x float> %104, i64 0, !dbg !41 + %118 = fcmp uno float %117, 0.000000e+00, !dbg !41 + %119 = extractelement <2 x float> %104, i64 1, !dbg !42 + %120 = fcmp uno float %119, 0.000000e+00, !dbg !42 + %121 = xor i1 %120, true, !dbg !43 + %122 = insertelement <2 x i1> poison, i1 %118, i64 0, !dbg !44 + %123 = shufflevector <2 x i1> %122, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %124 = insertelement <2 x i1> poison, i1 %121, i64 0, !dbg !44 + %125 = insertelement <2 x i1> %124, i1 %120, i64 1, !dbg !44 + %126 = and <2 x i1> %123, %125, !dbg !44 + %127 = or <2 x i1> %116, %126, !dbg !45 + %128 = icmp slt i32 %106, %107, !dbg !46 + %129 = extractelement <2 x i1> %127, i64 1, !dbg !47 + %130 = and i1 %128, %129, !dbg !47 + %131 = extractelement <2 x i1> %127, i64 0, !dbg !48 + %132 = or i1 %131, %130, !dbg !48 + %133 = select i1 %132, float %117, float %119, !dbg !49 + %134 = select i1 %132, i32 %106, i32 %107, !dbg !50 + %135 = extractelement <2 x float> %105, i64 0, !dbg !39 + %136 = fcmp ogt float %133, %135, !dbg !39 + %137 = fcmp oeq float %133, %135, !dbg !51 + %138 = fcmp uno float %133, 0.000000e+00, !dbg !41 + %139 = fcmp uno float %135, 0.000000e+00, !dbg !42 + %140 = xor i1 %139, true, !dbg !43 + %141 = and i1 %138, %140, !dbg !44 + %142 = or i1 %136, %141, !dbg !45 + %143 = and i1 %139, %138, !dbg !52 + %144 = or i1 %137, %143, !dbg !53 + %145 = extractelement <2 x i32> %108, i64 0, !dbg !46 + %146 = icmp slt i32 %134, %145, !dbg !46 + %147 = and i1 %146, %144, !dbg !47 + %148 = or i1 %142, %147, !dbg !48 + %149 = select i1 %148, float %133, float %135, !dbg !49 + %150 = select i1 %148, i32 %134, i32 %145, !dbg !50 + %151 = extractelement <2 x float> %105, i64 1, !dbg !39 + %152 = fcmp ogt float %149, %151, !dbg !39 + %153 = fcmp oeq float %149, %151, !dbg !51 + %154 = fcmp uno float %149, 0.000000e+00, !dbg !41 + %155 = fcmp uno float %151, 0.000000e+00, !dbg !42 + %156 = xor i1 %155, true, !dbg !43 + %157 = and i1 %154, %156, !dbg !44 + %158 = or i1 %152, %157, !dbg !45 + %159 = and i1 %155, %154, !dbg !52 + %160 = or i1 %153, %159, !dbg !53 + %161 = extractelement <2 x i32> %108, i64 1, !dbg !46 + %162 = icmp slt i32 %150, %161, !dbg !46 + %163 = and i1 %162, %160, !dbg !47 + %164 = or i1 %158, %163, !dbg !48 + %165 = select i1 %164, float %149, float %151, !dbg !49 + %166 = select i1 %164, i32 %150, i32 %161, !dbg !50 + %167 = bitcast float %165 to i32, !dbg !54 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 16, i32 31), !dbg !54 + %169 = bitcast i32 %168 to float, !dbg !54 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 16, i32 31), !dbg !54 + %171 = fcmp ogt float %165, %169, !dbg !39 + %172 = fcmp oeq float %165, %169, !dbg !51 + %173 = fcmp uno float %165, 0.000000e+00, !dbg !41 + %174 = fcmp uno float %169, 0.000000e+00, !dbg !42 + %175 = xor i1 %174, true, !dbg !43 + %176 = and i1 %173, %175, !dbg !44 + %177 = or i1 %171, %176, !dbg !45 + %178 = and i1 %173, %174, !dbg !52 + %179 = or i1 %172, %178, !dbg !53 + %180 = icmp slt i32 %166, %170, !dbg !46 + %181 = and i1 %180, %179, !dbg !47 + %182 = or i1 %177, %181, !dbg !48 + %183 = select i1 %182, float %165, float %169, !dbg !49 + %184 = select i1 %182, i32 %166, i32 %170, !dbg !50 + %185 = bitcast float %183 to i32, !dbg !54 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !54 + %187 = bitcast i32 %186 to float, !dbg !54 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 8, i32 31), !dbg !54 + %189 = fcmp ogt float %183, %187, !dbg !39 + %190 = fcmp oeq float %183, %187, !dbg !51 + %191 = fcmp uno float %183, 0.000000e+00, !dbg !41 + %192 = fcmp uno float %187, 0.000000e+00, !dbg !42 + %193 = xor i1 %192, true, !dbg !43 + %194 = and i1 %191, %193, !dbg !44 + %195 = or i1 %189, %194, !dbg !45 + %196 = and i1 %192, %191, !dbg !52 + %197 = or i1 %190, %196, !dbg !53 + %198 = icmp slt i32 %184, %188, !dbg !46 + %199 = and i1 %198, %197, !dbg !47 + %200 = or i1 %195, %199, !dbg !48 + %201 = select i1 %200, float %183, float %187, !dbg !49 + %202 = select i1 %200, i32 %184, i32 %188, !dbg !50 + %203 = bitcast float %201 to i32, !dbg !54 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !54 + %205 = bitcast i32 %204 to float, !dbg !54 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !54 + %207 = fcmp ogt float %201, %205, !dbg !39 + %208 = fcmp oeq float %201, %205, !dbg !51 + %209 = fcmp uno float %201, 0.000000e+00, !dbg !41 + %210 = fcmp uno float %205, 0.000000e+00, !dbg !42 + %211 = xor i1 %210, true, !dbg !43 + %212 = and i1 %209, %211, !dbg !44 + %213 = or i1 %207, %212, !dbg !45 + %214 = and i1 %210, %209, !dbg !52 + %215 = or i1 %208, %214, !dbg !53 + %216 = icmp slt i32 %202, %206, !dbg !46 + %217 = and i1 %216, %215, !dbg !47 + %218 = or i1 %213, %217, !dbg !48 + %219 = select i1 %218, float %201, float %205, !dbg !49 + %220 = select i1 %218, i32 %202, i32 %206, !dbg !50 + %221 = bitcast float %219 to i32, !dbg !54 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !54 + %223 = bitcast i32 %222 to float, !dbg !54 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !54 + %225 = fcmp ogt float %219, %223, !dbg !39 + %226 = fcmp oeq float %219, %223, !dbg !51 + %227 = fcmp uno float %219, 0.000000e+00, !dbg !41 + %228 = fcmp uno float %223, 0.000000e+00, !dbg !42 + %229 = xor i1 %228, true, !dbg !43 + %230 = and i1 %227, %229, !dbg !44 + %231 = or i1 %225, %230, !dbg !45 + %232 = and i1 %228, %227, !dbg !52 + %233 = or i1 %226, %232, !dbg !53 + %234 = icmp slt i32 %220, %224, !dbg !46 + %235 = and i1 %234, %233, !dbg !47 + %236 = or i1 %231, %235, !dbg !48 + %237 = select i1 %236, float %219, float %223, !dbg !49 + %238 = select i1 %236, i32 %220, i32 %224, !dbg !50 + %239 = bitcast float %237 to i32, !dbg !54 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !54 + %241 = bitcast i32 %240 to float, !dbg !54 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !54 + %243 = fcmp ogt float %237, %241, !dbg !39 + %244 = fcmp oeq float %237, %241, !dbg !51 + %245 = fcmp uno float %237, 0.000000e+00, !dbg !41 + %246 = fcmp uno float %241, 0.000000e+00, !dbg !42 + %247 = xor i1 %246, true, !dbg !43 + %248 = and i1 %245, %247, !dbg !44 + %249 = or i1 %243, %248, !dbg !45 + %250 = and i1 %246, %245, !dbg !52 + %251 = or i1 %244, %250, !dbg !53 + %252 = icmp slt i32 %238, %242, !dbg !46 + %253 = and i1 %252, %251, !dbg !47 + %254 = or i1 %249, %253, !dbg !48 + %255 = select i1 %254, i32 %238, i32 %242, !dbg !50 + %256 = and i32 %112, 15, !dbg !54 + %257 = icmp eq i32 %111, 0, !dbg !54 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %256, !dbg !54 + %259 = select i1 %254, i32 %239, i32 %240, !dbg !49 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %260, i1 %257) #4, !dbg !54 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %256, !dbg !54 + %262 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %262, i1 %257) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %263 = icmp samesign ult i32 %11, 16, !dbg !54 + %264 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !54 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %264, i1 %263) #4, !dbg !54 + %266 = bitcast i32 %265 to float, !dbg !54 + %267 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !54 + %268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 %263) #4, !dbg !54 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !54 + %270 = bitcast i32 %269 to float, !dbg !54 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !54 + %272 = fcmp ogt float %266, %270, !dbg !39 + %273 = fcmp oeq float %266, %270, !dbg !51 + %274 = fcmp uno float %266, 0.000000e+00, !dbg !41 + %275 = fcmp uno float %270, 0.000000e+00, !dbg !42 + %276 = xor i1 %275, true, !dbg !43 + %277 = and i1 %274, %276, !dbg !44 + %278 = or i1 %272, %277, !dbg !45 + %279 = and i1 %274, %275, !dbg !52 + %280 = or i1 %273, %279, !dbg !53 + %281 = icmp slt i32 %268, %271, !dbg !46 + %282 = and i1 %281, %280, !dbg !47 + %283 = or i1 %278, %282, !dbg !48 + %284 = select i1 %283, float %266, float %270, !dbg !49 + %285 = select i1 %283, i32 %268, i32 %271, !dbg !50 + %286 = bitcast float %284 to i32, !dbg !54 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 4, i32 31), !dbg !54 + %288 = bitcast i32 %287 to float, !dbg !54 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !54 + %290 = fcmp ogt float %284, %288, !dbg !39 + %291 = fcmp oeq float %284, %288, !dbg !51 + %292 = fcmp uno float %284, 0.000000e+00, !dbg !41 + %293 = fcmp uno float %288, 0.000000e+00, !dbg !42 + %294 = xor i1 %293, true, !dbg !43 + %295 = and i1 %292, %294, !dbg !44 + %296 = or i1 %290, %295, !dbg !45 + %297 = and i1 %293, %292, !dbg !52 + %298 = or i1 %291, %297, !dbg !53 + %299 = icmp slt i32 %285, %289, !dbg !46 + %300 = and i1 %299, %298, !dbg !47 + %301 = or i1 %296, %300, !dbg !48 + %302 = select i1 %301, float %284, float %288, !dbg !49 + %303 = select i1 %301, i32 %285, i32 %289, !dbg !50 + %304 = bitcast float %302 to i32, !dbg !54 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !54 + %306 = bitcast i32 %305 to float, !dbg !54 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !54 + %308 = fcmp ogt float %302, %306, !dbg !39 + %309 = fcmp oeq float %302, %306, !dbg !51 + %310 = fcmp uno float %302, 0.000000e+00, !dbg !41 + %311 = fcmp uno float %306, 0.000000e+00, !dbg !42 + %312 = xor i1 %311, true, !dbg !43 + %313 = and i1 %310, %312, !dbg !44 + %314 = or i1 %308, %313, !dbg !45 + %315 = and i1 %311, %310, !dbg !52 + %316 = or i1 %309, %315, !dbg !53 + %317 = icmp slt i32 %303, %307, !dbg !46 + %318 = and i1 %317, %316, !dbg !47 + %319 = or i1 %314, %318, !dbg !48 + %320 = select i1 %319, float %302, float %306, !dbg !49 + %321 = select i1 %319, i32 %303, i32 %307, !dbg !50 + %322 = bitcast float %320 to i32, !dbg !54 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !54 + %324 = bitcast i32 %323 to float, !dbg !54 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 1, i32 31), !dbg !54 + %326 = fcmp ogt float %320, %324, !dbg !39 + %327 = fcmp oeq float %320, %324, !dbg !51 + %328 = fcmp uno float %320, 0.000000e+00, !dbg !41 + %329 = fcmp uno float %324, 0.000000e+00, !dbg !42 + %330 = xor i1 %329, true, !dbg !43 + %331 = and i1 %328, %330, !dbg !44 + %332 = or i1 %326, %331, !dbg !45 + %333 = and i1 %329, %328, !dbg !52 + %334 = or i1 %327, %333, !dbg !53 + %335 = icmp slt i32 %321, %325, !dbg !46 + %336 = and i1 %335, %334, !dbg !47 + %337 = or i1 %332, %336, !dbg !48 + %338 = select i1 %337, i32 %321, i32 %325, !dbg !50 + %339 = icmp eq i32 %11, 0, !dbg !54 + %340 = select i1 %337, i32 %322, i32 %323, !dbg !49 + %341 = insertelement <1 x i32> poison, i32 %340, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %264, <1 x i32> %341, i1 %339) #4, !dbg !54 + %342 = insertelement <1 x i32> poison, i32 %338, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %342, i1 %339) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %343 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !54 + %344 = getelementptr i64, ptr addrspace(1) %1, i64 %13, !dbg !55 + %345 = sext i32 %343 to i64, !dbg !56 + %346 = icmp eq i32 %12, 0, !dbg !56 + %347 = and i1 %346, %10, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %345, ptr addrspace(1) %344, i1 %347) #4, !dbg !56 + ret void, !dbg !57 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 19, scope: !4) +!12 = !DILocation(line: 38, column: 56, scope: !4) +!13 = !DILocation(line: 32, column: 40, scope: !4) +!14 = !DILocation(line: 43, column: 54, scope: !4) +!15 = !DILocation(line: 38, column: 71, scope: !4) +!16 = !DILocation(line: 33, column: 31, scope: !4) +!17 = !DILocation(line: 34, column: 29, scope: !4) +!18 = !DILocation(line: 38, column: 41, scope: !4) +!19 = !DILocation(line: 38, column: 34, scope: !4) +!20 = !DILocation(line: 38, column: 61, scope: !4) +!21 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!24 = !DILocation(line: 41, column: 38, scope: !4) +!25 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !24) +!27 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !24) +!28 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !24) +!29 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !24) +!30 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !24) +!31 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !24) +!32 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !24) +!33 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !24) +!34 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !24) +!35 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !24) +!37 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !24) +!38 = !DILocation(line: 44, column: 66, scope: !4) +!39 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !40) +!40 = !DILocation(line: 45, column: 75, scope: !4) +!41 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !40) +!42 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !40) +!43 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !40) +!44 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !40) +!45 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !40) +!46 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !40) +!47 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !40) +!48 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !40) +!49 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !40) +!50 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !40) +!51 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !40) +!52 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !40) +!53 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !40) +!54 = !DILocation(line: 165, column: 42, scope: !22, inlinedAt: !40) +!55 = !DILocation(line: 47, column: 25, scope: !4) +!56 = !DILocation(line: 47, column: 36, scope: !4) +!57 = !DILocation(line: 47, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6cb1330de9222d50d6df812c340652f3db505cd1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ptx @@ -0,0 +1,915 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<215>; + .reg .b32 %r<123>; + .reg .b64 %rd<70>; + .loc 1 18 0 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 22 28 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:22:28 + mov.u32 %r11, %ctaid.x; + ld.param.b64 %rd21, [triton_red_fused_argmax_1_param_2]; + .loc 1 25 37 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:25:37 + mov.u32 %r12, %tid.x; + and.b32 %r1, %r12, 511; + .loc 1 27 19 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:27:19 + cvt.u64.u32 %rd2, %r11; + .loc 1 28 19 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:28:19 + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p5, %rd22, 0; + cvt.u32.u64 %r118, %rd2; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd65, %rd2, %rd21; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd21; + div.u32 %r15, %r118, %r13; + cvt.u64.u32 %rd65, %r15; +$L__BB0_3: + .loc 1 0 19 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:0:19 + ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_1]; + cvt.u64.u32 %rd1, %r12; + .loc 1 24 21 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:24:21 + setp.lt.s32 %p3, %r118, %r10; + .loc 1 27 19 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:27:19 + mul.lo.s64 %rd25, %rd65, %rd21; + sub.s64 %rd26, %rd2, %rd25; + .loc 1 38 56 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:56 + mul.lo.s64 %rd27, %rd65, %rd20; + mad.lo.s64 %rd28, %rd26, 128000, %rd18; + .loc 1 32 40 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:32:40 + shl.b64 %rd29, %rd27, 2; + add.s64 %rd7, %rd28, %rd29; + cvt.u64.u32 %rd8, %r1; + shl.b64 %rd30, %rd20, 2; + mul.lo.s64 %rd31, %rd21, 128000; + sub.s64 %rd32, %rd30, %rd31; + mul.lo.s64 %rd33, %rd65, %rd32; + mad.lo.s64 %rd34, %rd2, 128000, %rd33; + mad.wide.u32 %rd35, %r1, 4, %rd34; + add.s64 %rd66, %rd18, %rd35; + mov.b32 %r20, 0fFF800000; + mov.b64 %rd68, {%r20, %r20}; + mov.b32 %r119, 2147483647; + mov.b64 %rd67, -2048; + mov.b32 %r120, %r119; + mov.b32 %r121, %r119; + mov.b32 %r122, %r119; + mov.b64 %rd69, %rd68; +$L__BB0_4: // =>This Inner Loop Header: Depth=1 + .loc 1 33 31 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:33:31 + add.s64 %rd48, %rd8, %rd67; + add.s64 %rd49, %rd48, 2048; + add.s64 %rd50, %rd1, %rd67; + cvt.u32.u64 %r30, %rd50; + add.s32 %r31, %r30, 2048; + or.b32 %r32, %r31, 512; + add.s64 %rd51, %rd48, 3072; + or.b32 %r33, %r31, 1536; + .loc 1 34 29 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:34:29 + setp.lt.u32 %p10, %r32, 32000; + setp.lt.u64 %p11, %rd51, 32000; + setp.lt.u32 %p12, %r33, 32000; + .loc 1 38 34 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:34 + mad.wide.u32 %rd40, %r32, 4, %rd7; + add.s64 %rd43, %rd66, 4096; + mad.wide.u32 %rd46, %r33, 4, %rd7; + .loc 1 38 71 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:71 + and.pred %p7, %p3, %p10; + .loc 1 38 61 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:61 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + mov.b32 %r22, 0; + // begin inline asm + mov.u32 %r21, %r22; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r21 }, [ %rd66 + 0 ], %rd36; + // end inline asm + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r22; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b32 { %r23 }, [ %rd40 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + mov.b64 {%r34, %r35}, %rd68; + setp.nan.f32 %p13, %r34, %r34; + setp.nan.f32 %p14, %r35, %r35; + mov.b64 {%r36, %r37}, %rd69; + setp.nan.f32 %p15, %r36, %r36; + setp.nan.f32 %p16, %r37, %r37; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + cvt.s64.s32 %rd52, %r119; + setp.gt.s64 %p17, %rd49, %rd52; + setp.lt.s32 %p18, %r120, %r32; + cvt.s64.s32 %rd53, %r121; + setp.gt.s64 %p19, %rd51, %rd53; + setp.lt.s32 %p20, %r122, %r33; +$L__tmp2: + .loc 1 38 61 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:61 + cvt.u64.u32 %rd54, %r23; + shl.b64 %rd55, %rd54, 32; + cvt.u64.u32 %rd56, %r21; + or.b64 %rd57, %rd56, %rd55; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + mov.b64 {%r38, %r39}, %rd57; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.gt.f32 %p21, %r35, %r39; + setp.gt.f32 %p22, %r34, %r38; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.eq.f32 %p23, %r34, %r38; + setp.eq.f32 %p24, %r35, %r39; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.nan.f32 %p25, %r39, %r39; + setp.nan.f32 %p26, %r38, %r38; + setp.num.f32 %p27, %r38, %r38; + setp.num.f32 %p28, %r39, %r39; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p29, %p14, %p28; + and.pred %p30, %p13, %p27; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p33, %p13, %p26; + and.pred %p34, %p14, %p25; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p35, %p24, %p34; + or.pred %p36, %p23, %p33; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p37, %p17, %p36; + and.pred %p38, %p18, %p35; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p39, %p32, %p38; + or.pred %p40, %p31, %p37; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + selp.f32 %r40, %r34, %r38, %p40; + selp.f32 %r41, %r35, %r39, %p39; + cvt.u32.u64 %r42, %rd49; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + selp.b32 %r43, %r119, %r42, %p40; + selp.b32 %r44, %r120, %r32, %p39; +$L__tmp4: + .loc 1 38 71 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:71 + and.pred %p9, %p3, %p12; + and.pred %p8, %p3, %p11; + .loc 1 38 61 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:38:61 + // begin inline asm + mov.u32 %r25, %r22; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b32 { %r25 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, %r22; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd46 + 0 ], %rd45; + // end inline asm + cvt.u64.u32 %rd58, %r27; + shl.b64 %rd59, %rd58, 32; + cvt.u64.u32 %rd60, %r25; + or.b64 %rd61, %rd60, %rd59; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + mov.b64 {%r45, %r46}, %rd61; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.gt.f32 %p41, %r37, %r46; + setp.gt.f32 %p42, %r36, %r45; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.eq.f32 %p43, %r36, %r45; + setp.eq.f32 %p44, %r37, %r46; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + setp.nan.f32 %p45, %r46, %r46; + setp.nan.f32 %p46, %r45, %r45; + setp.num.f32 %p47, %r45, %r45; + setp.num.f32 %p48, %r46, %r46; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p49, %p16, %p48; + and.pred %p50, %p15, %p47; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p51, %p42, %p50; + or.pred %p52, %p41, %p49; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p53, %p15, %p46; + and.pred %p54, %p16, %p45; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p55, %p44, %p54; + or.pred %p56, %p43, %p53; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + and.pred %p57, %p19, %p56; + and.pred %p58, %p20, %p55; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + or.pred %p59, %p52, %p58; + or.pred %p60, %p51, %p57; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + selp.f32 %r47, %r36, %r45, %p60; + selp.f32 %r48, %r37, %r46, %p59; + cvt.u32.u64 %r49, %rd51; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:41:38 ] + selp.b32 %r50, %r121, %r49, %p60; + selp.b32 %r51, %r122, %r33, %p59; +$L__tmp6: + .loc 1 43 54 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:43:54 + selp.f32 %r52, %r41, %r35, %p7; + selp.f32 %r53, %r40, %r34, %p3; + mov.b64 %rd68, {%r53, %r52}; + selp.f32 %r54, %r48, %r37, %p9; + selp.f32 %r55, %r47, %r36, %p8; + mov.b64 %rd69, {%r55, %r54}; + .loc 1 44 66 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:44:66 + selp.b32 %r119, %r43, %r119, %p3; + selp.b32 %r120, %r44, %r120, %p7; + selp.b32 %r122, %r51, %r122, %p9; + selp.b32 %r121, %r50, %r121, %p8; + .loc 1 32 40 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:32:40 + add.s64 %rd67, %rd67, 2048; + add.s64 %rd66, %rd66, 8192; + setp.lt.u64 %p61, %rd67, 29952; + @%p61 bra $L__BB0_4; +// %bb.5: + .loc 1 0 40 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:0:40 + cvt.u32.u64 %r68, %rd1; + .loc 1 25 37 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:25:37 + and.b32 %r70, %r68, 31; +$L__tmp7: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + mov.b64 {%r71, %r72}, %rd68; + setp.gt.f32 %p70, %r71, %r72; + setp.eq.f32 %p71, %r72, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p72, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.num.f32 %p73, %r72, %r72; + setp.nan.f32 %p74, %r72, %r72; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p72, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p71, %p75; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p79, %r119, %r120; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p81, %p77, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r73, %r71, %r72, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r74, %r119, %r120, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + mov.b64 {%r75, %r76}, %rd69; + setp.gt.f32 %p82, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p83, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p84, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p85, %r75, %r75; + setp.num.f32 %p86, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p91, %r74, %r121; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r77, %r73, %r75, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r78, %r74, %r121, %p93; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p94, %r77, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p95, %r77, %r76; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p96, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p97, %r76, %r76; + setp.num.f32 %p98, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p101, %p97, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p103, %r78, %r122; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r79, %r77, %r76, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r80, %r78, %r122, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r81, %r79, 16, 31, -1; + shfl.sync.bfly.b32 %r82, %r80, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p106, %r79, %r81; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p107, %r79, %r81; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p108, %r79, %r79; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p109, %r81, %r81; + setp.num.f32 %p110, %r81, %r81; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p113, %p108, %p109; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p115, %r80, %r82; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r83, %r79, %r81, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r84, %r80, %r82, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r85, %r83, 8, 31, -1; + shfl.sync.bfly.b32 %r86, %r84, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p118, %r83, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p119, %r83, %r85; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p120, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p121, %r85, %r85; + setp.num.f32 %p122, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p127, %r84, %r86; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r87, %r83, %r85, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r88, %r84, %r86, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r89, %r87, 4, 31, -1; + shfl.sync.bfly.b32 %r90, %r88, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p130, %r87, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p131, %r87, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p132, %r87, %r87; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p133, %r89, %r89; + setp.num.f32 %p134, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p139, %r88, %r90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r91, %r87, %r89, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r92, %r88, %r90, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r93, %r91, 2, 31, -1; + shfl.sync.bfly.b32 %r94, %r92, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p142, %r91, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p143, %r91, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p144, %r91, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p145, %r93, %r93; + setp.num.f32 %p146, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p151, %r92, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r95, %r91, %r93, %p153; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r96, %r92, %r94, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r97, %r95, 1, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p154, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p155, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p156, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p157, %r97, %r97; + setp.num.f32 %p158, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p161, %p157, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p163, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r59, %r96, %r98, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.b32 %p62, %r70, 0; + shr.u32 %r99, %r68, 3; + and.b32 %r100, %r99, 60; + mov.b32 %r101, global_smem; + add.s32 %r56, %r101, %r100; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r57, %r95, %r97, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + // begin inline asm + @%p62 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + add.s32 %r102, %r101, 64; + add.s32 %r58, %r102, %r100; + // begin inline asm + @%p62 st.shared.b32 [ %r58 + 0 ], %r59; + // end inline asm + bar.sync 0; + setp.lt.u32 %p64, %r68, 16; + shl.b32 %r103, %r68, 2; + add.s32 %r61, %r101, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + add.s32 %r63, %r102, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r62, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r104, %r60, 8, 31, -1; + shfl.sync.bfly.b32 %r105, %r62, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p166, %r60, %r104; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p167, %r60, %r104; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p168, %r60, %r60; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p169, %r104, %r104; + setp.num.f32 %p170, %r104, %r104; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p173, %p168, %p169; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p175, %r62, %r105; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r106, %r60, %r104, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r107, %r62, %r105, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r108, %r106, 4, 31, -1; + shfl.sync.bfly.b32 %r109, %r107, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p178, %r106, %r108; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p179, %r106, %r108; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p180, %r106, %r106; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p181, %r108, %r108; + setp.num.f32 %p182, %r108, %r108; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p187, %r107, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r110, %r106, %r108, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r111, %r107, %r109, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r112, %r110, 2, 31, -1; + shfl.sync.bfly.b32 %r113, %r111, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p190, %r110, %r112; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p191, %r110, %r112; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p192, %r110, %r110; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p193, %r112, %r112; + setp.num.f32 %p194, %r112, %r112; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p199, %r111, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.f32 %r114, %r110, %r112, %p201; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r115, %r111, %r113, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + shfl.sync.bfly.b32 %r116, %r114, 1, 31, -1; + shfl.sync.bfly.b32 %r117, %r115, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.gt.f32 %p202, %r114, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.f32 %p203, %r114, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p204, %r114, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.nan.f32 %p205, %r116, %r116; + setp.num.f32 %p206, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p207, %p204, %p206; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p208, %p202, %p207; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p209, %p205, %p204; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p210, %p203, %p209; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.lt.s32 %p211, %r115, %r117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + and.pred %p212, %p211, %p210; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + or.pred %p213, %p208, %p212; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r67, %r115, %r117, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + setp.eq.b32 %p66, %r68, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + selp.b32 %r65, %r114, %r116, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:45:75 ] + // begin inline asm + @%p66 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + // begin inline asm + @%p66 st.shared.b32 [ %r63 + 0 ], %r67; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd62, [global_smem+64]; +$L__tmp8: + .loc 1 47 25 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:47:25 + shl.b64 %rd64, %rd2, 3; + add.s64 %rd63, %rd19, %rd64; + .loc 1 47 36 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:47:36 + setp.eq.b32 %p214, %r1, 0; + and.pred %p68, %p214, %p3; + // begin inline asm + @%p68 st.global.b64 [ %rd63 + 0 ], { %rd62 }; + // end inline asm + .loc 1 47 4 // cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py:47:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 103 +.b8 112 +.b8 115 +.b8 102 +.b8 122 +.b8 115 +.b8 119 +.b8 122 +.b8 100 +.b8 100 +.b8 97 +.b8 51 +.b8 120 +.b8 113 +.b8 116 +.b8 117 +.b8 122 +.b8 115 +.b8 118 +.b8 110 +.b8 54 +.b8 54 +.b8 121 +.b8 99 +.b8 52 +.b8 104 +.b8 119 +.b8 112 +.b8 114 +.b8 54 +.b8 50 +.b8 109 +.b8 106 +.b8 105 +.b8 116 +.b8 50 +.b8 51 +.b8 117 +.b8 108 +.b8 115 +.b8 100 +.b8 122 +.b8 110 +.b8 97 +.b8 105 +.b8 101 +.b8 120 +.b8 100 +.b8 105 +.b8 109 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 103 +.b8 112 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..4f644534ada46d87975fff1a19060f1753afc7b1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.source @@ -0,0 +1,317 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 1 : i32 loc(#loc80) + %xoffset_2 = arith.constant 1 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x2048xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<1x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<1x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc95) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x2048xi64> loc(#loc95) + %tmp0_26 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc96) + %tmp0_27 = arith.muli %tmp0_26, %x1_12 : tensor<1x1xi64> loc(#loc96) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc97) + %tmp0_29 = arith.addi %tmp0_25, %tmp0_28 : tensor<1x2048xi64> loc(#loc97) + %tmp0_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp0_31 = tt.addptr %tmp0_30, %tmp0_29 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc98) + %tmp0_32 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc99) + %tmp0_33 = arith.andi %r0_mask_19, %tmp0_32 : tensor<1x2048xi1> loc(#loc99) + %tmp0_34 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_35 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc100) + %tmp0_36 = tt.load %tmp0_31, %tmp0_33, %tmp0_35 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_16, %_tmp2_index_17, %tmp0_36, %r0_index_18) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc101) + %_tmp2_38 = arith.andi %r0_mask_19, %_tmp2_37 : tensor<1x2048xi1> loc(#loc101) + %_tmp2_39 = arith.select %_tmp2_38, %8#0, %_tmp2_16 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %_tmp2_index_40 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_41 = arith.andi %r0_mask_19, %_tmp2_index_40 : tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_42 = arith.select %_tmp2_index_41, %8#1, %_tmp2_index_17 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc104) + scf.yield %_tmp2_39, %_tmp2_index_42 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc35)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc35)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc35)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc35))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc114) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc114) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc115) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc129) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc117) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc130) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc52) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc54) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc54) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc59))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc61) + tt.return %4 : tensor<1x2048xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc63) + tt.return %5 : tensor<1x2048xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc68)), %index: tensor<1x2048xi32> loc("index"(#loc68))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc71) + %2 = ub.poison : tensor<1xi32> loc(#loc71) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":32:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4e06babe67f8ba60102db5e186f5b84f2b8f5e79 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":18:0) +#loc1 = loc(unknown) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":45:75) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("out_ptr0"(#loc)) +#loc43 = loc("ks0"(#loc)) +#loc44 = loc("ks1"(#loc)) +#loc45 = loc("xnumel"(#loc)) +#loc46 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc36)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc48) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc49) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc49) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %x0_5 = arith.remsi %x0, %ks0 : i64 loc(#loc50) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc51) + %tmp0 = arith.muli %x0_5, %c32000_i64 : i64 loc(#loc52) + %tmp0_6 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc81) + %tmp0_7 = arith.muli %ks1, %x1 : i64 loc(#loc54) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc56) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc83) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst, %_tmp2_index_11 = %cst_0) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc60) + %tmp0_13 = arith.extsi %r0_index_12 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_14 = arith.addi %tmp0_13, %tmp0_6 : tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc55) + %tmp0_16 = tt.addptr %tmp0_9, %tmp0_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.andi %r0_mask, %tmp0_10 : tensor<1x2048xi1, #blocked> loc(#loc57) + %tmp0_18 = tt.load %tmp0_16, %tmp0_17, %cst_2 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc107) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc108) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc87) + %b_isnan = arith.cmpf une, %tmp0_18, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc88) + %mask_19 = arith.xori %b_isnan, %cst_3 : tensor<1x2048xi1, #blocked> loc(#loc89) + %mask_20 = arith.andi %a_isnan, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc90) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc109) + %equal_22 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc92) + %equal_23 = arith.ori %equal, %equal_22 : tensor<1x2048xi1, #blocked> loc(#loc110) + %mask_24 = arith.cmpi slt, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi32, #blocked> loc(#loc94) + %mask_25 = arith.andi %equal_23, %mask_24 : tensor<1x2048xi1, #blocked> loc(#loc95) + %mask_26 = arith.ori %mask_21, %mask_25 : tensor<1x2048xi1, #blocked> loc(#loc96) + %6 = arith.select %mask_26, %_tmp2, %tmp0_18 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc74) + %7 = arith.select %mask_26, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc75) + %_tmp2_27 = arith.select %tmp0_17, %6, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc76) + %_tmp2_index_28 = arith.select %tmp0_17, %7, %_tmp2_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_27, %_tmp2_index_28 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc34) + } loc(#loc84) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc36)), %arg7: i32 loc(callsite(#loc1 at #loc36)), %arg8: f32 loc(callsite(#loc1 at #loc36)), %arg9: i32 loc(callsite(#loc1 at #loc36))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc111) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc112) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc98) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc99) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc100) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc113) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc101) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc114) + %mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc102) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc103) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc104) + %6 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc105) + %7 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc106) + tt.reduce.return %6, %7 : f32, i32 loc(#loc78) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc38) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc39) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc39) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + %5 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc39) + tt.store %4, %3, %5 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":28:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:56) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:71) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":32:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":33:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":34:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:61) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":41:38) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":43:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:66) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:8) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":46:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:4) +#loc47 = loc("xoffset"(#loc2)) +#loc48 = loc("xmask"(#loc3)) +#loc49 = loc("r0_base"(#loc4)) +#loc50 = loc("x0"(#loc5)) +#loc51 = loc("x1"(#loc6)) +#loc52 = loc("tmp0"(#loc7)) +#loc53 = loc("tmp0"(#loc8)) +#loc54 = loc("tmp0"(#loc9)) +#loc55 = loc("tmp0"(#loc10)) +#loc56 = loc("tmp0"(#loc11)) +#loc57 = loc("tmp0"(#loc12)) +#loc58 = loc("_tmp2"(#loc13)) +#loc59 = loc("r0_index"(#loc14)) +#loc60 = loc("r0_mask"(#loc15)) +#loc61 = loc("tmp0"(#loc16)) +#loc62 = loc("mask"(#loc17)) +#loc63 = loc("equal"(#loc19)) +#loc64 = loc("a_isnan"(#loc20)) +#loc65 = loc("b_isnan"(#loc21)) +#loc66 = loc("mask"(#loc22)) +#loc67 = loc("mask"(#loc23)) +#loc68 = loc("mask"(#loc24)) +#loc69 = loc("equal"(#loc25)) +#loc70 = loc("equal"(#loc26)) +#loc71 = loc("mask"(#loc27)) +#loc72 = loc("mask"(#loc28)) +#loc73 = loc("mask"(#loc29)) +#loc74 = loc(callsite(#loc30 at #loc18)) +#loc75 = loc(callsite(#loc31 at #loc18)) +#loc76 = loc("_tmp2"(#loc32)) +#loc77 = loc("_tmp2_index"(#loc33)) +#loc78 = loc(callsite(#loc35 at #loc36)) +#loc80 = loc("tmp2"(#loc37)) +#loc81 = loc(fused[#loc53, #loc52]) +#loc82 = loc(fused[#loc55, #loc54]) +#loc83 = loc(fused[#loc57, #loc48]) +#loc84 = loc("_tmp2_index"(#loc58)) +#loc85 = loc("mask"(#loc62)) +#loc86 = loc("equal"(#loc63)) +#loc87 = loc(callsite(#loc64 at #loc18)) +#loc88 = loc(callsite(#loc65 at #loc18)) +#loc89 = loc(callsite(#loc66 at #loc18)) +#loc90 = loc(callsite(#loc67 at #loc18)) +#loc91 = loc("mask"(#loc68)) +#loc92 = loc(callsite(#loc69 at #loc18)) +#loc93 = loc("equal"(#loc70)) +#loc94 = loc(callsite(#loc71 at #loc18)) +#loc95 = loc(callsite(#loc72 at #loc18)) +#loc96 = loc(callsite(#loc73 at #loc18)) +#loc97 = loc(callsite(#loc64 at #loc78)) +#loc98 = loc(callsite(#loc65 at #loc78)) +#loc99 = loc(callsite(#loc66 at #loc78)) +#loc100 = loc(callsite(#loc67 at #loc78)) +#loc101 = loc(callsite(#loc69 at #loc78)) +#loc102 = loc(callsite(#loc71 at #loc78)) +#loc103 = loc(callsite(#loc72 at #loc78)) +#loc104 = loc(callsite(#loc73 at #loc78)) +#loc105 = loc(callsite(#loc30 at #loc78)) +#loc106 = loc(callsite(#loc31 at #loc78)) +#loc107 = loc(callsite(#loc85 at #loc18)) +#loc108 = loc(callsite(#loc86 at #loc18)) +#loc109 = loc(callsite(#loc91 at #loc18)) +#loc110 = loc(callsite(#loc93 at #loc18)) +#loc111 = loc(callsite(#loc85 at #loc78)) +#loc112 = loc(callsite(#loc86 at #loc78)) +#loc113 = loc(callsite(#loc91 at #loc78)) +#loc114 = loc(callsite(#loc93 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1bd32404304cc7eb063c720600774984deeb9f20 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HAILX5Z6XDOE3PGHQZX4ABAJBEEI4J2UQHMI6VLNTQGT6BMZJQQQ/triton_red_fused_argmax_1.ttir @@ -0,0 +1,201 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc52) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc54) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc54) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc55) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc56) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc57) + %x0_4 = arith.remsi %x0, %ks0 : i64 loc(#loc57) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc58) + %_tmp2_index_5:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_6 = %_tmp2, %_tmp2_index_7 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc60) + %r0_index_8 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_1 : tensor<1x2048xi32> loc(#loc61) + %tmp0 = arith.muli %x0_4, %c32000_i64 : i64 loc(#loc62) + %tmp0_9 = arith.extsi %r0_index_8 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc63) + %tmp0_10 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc88) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x2048xi64> loc(#loc63) + %tmp0_12 = arith.muli %ks1, %x1 : i64 loc(#loc64) + %tmp0_13 = tt.splat %tmp0_12 : i64 -> tensor<1x2048xi64> loc(#loc89) + %tmp0_14 = arith.addi %tmp0_11, %tmp0_13 : tensor<1x2048xi64> loc(#loc65) + %tmp0_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc66) + %tmp0_16 = tt.addptr %tmp0_15, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc66) + %tmp0_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc90) + %tmp0_18 = arith.andi %r0_mask, %tmp0_17 : tensor<1x2048xi1> loc(#loc67) + %tmp0_19 = tt.load %tmp0_16, %tmp0_18, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc68) + %mask = arith.cmpf ogt, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc113) + %equal = arith.cmpf oeq, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc114) + %a_isnan = arith.cmpf une, %_tmp2_6, %_tmp2_6 : tensor<1x2048xf32> loc(#loc93) + %b_isnan = arith.cmpf une, %tmp0_19, %tmp0_19 : tensor<1x2048xf32> loc(#loc94) + %mask_20 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc95) + %mask_21 = arith.andi %a_isnan, %mask_20 : tensor<1x2048xi1> loc(#loc96) + %mask_22 = arith.ori %mask, %mask_21 : tensor<1x2048xi1> loc(#loc115) + %equal_23 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : tensor<1x2048xi1> loc(#loc116) + %mask_25 = arith.cmpi slt, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi32> loc(#loc100) + %mask_26 = arith.andi %equal_24, %mask_25 : tensor<1x2048xi1> loc(#loc101) + %mask_27 = arith.ori %mask_22, %mask_26 : tensor<1x2048xi1> loc(#loc102) + %4 = arith.select %mask_27, %_tmp2_6, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc81) + %5 = arith.select %mask_27, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc82) + %_tmp2_28 = arith.select %tmp0_18, %4, %_tmp2_6 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc83) + %_tmp2_index_29 = arith.select %tmp0_18, %5, %_tmp2_index_7 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc84) + scf.yield %_tmp2_28, %_tmp2_index_29 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc38) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index_5#0, %_tmp2_index_5#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc117) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc118) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc104) + %mask_6 = arith.xori %b_isnan, %true : i1 loc(#loc105) + %mask_7 = arith.andi %a_isnan, %mask_6 : i1 loc(#loc106) + %mask_8 = arith.ori %mask, %mask_7 : i1 loc(#loc119) + %equal_9 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc107) + %equal_10 = arith.ori %equal, %equal_9 : i1 loc(#loc120) + %mask_11 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc108) + %mask_12 = arith.andi %equal_10, %mask_11 : i1 loc(#loc109) + %mask_13 = arith.ori %mask_8, %mask_12 : i1 loc(#loc110) + %4 = arith.select %mask_13, %arg6, %arg8 : f32 loc(#loc111) + %5 = arith.select %mask_13, %arg7, %arg9 : i32 loc(#loc112) + tt.reduce.return %4, %5 : f32, i32 loc(#loc85) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc85) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc86) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc41) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc42) + tt.store %2, %3, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:56) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:52) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:71) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":38:61) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":41:38) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":43:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:66) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":44:8) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gp/cgpsfzswzdda3xqtuzsvn66yc4hwpr62mjit23ulsdznaiexdimr.py":47:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xmask"(#loc7)) +#loc55 = loc("r0_base"(#loc8)) +#loc56 = loc("r0_base"(#loc9)) +#loc57 = loc("x0"(#loc10)) +#loc58 = loc("x1"(#loc11)) +#loc59 = loc("_tmp2"(#loc3)) +#loc60 = loc("r0_index"(#loc12)) +#loc61 = loc("r0_mask"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("tmp0"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("mask"(#loc21)) +#loc70 = loc("equal"(#loc23)) +#loc71 = loc("a_isnan"(#loc24)) +#loc72 = loc("b_isnan"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("equal"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc(callsite(#loc34 at #loc22)) +#loc82 = loc(callsite(#loc35 at #loc22)) +#loc83 = loc("_tmp2"(#loc36)) +#loc84 = loc("_tmp2_index"(#loc37)) +#loc85 = loc(callsite(#loc39 at #loc2)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc59)) +#loc88 = loc(fused[#loc63, #loc62]) +#loc89 = loc(fused[#loc65, #loc64]) +#loc90 = loc(fused[#loc67, #loc54]) +#loc91 = loc("mask"(#loc69)) +#loc92 = loc("equal"(#loc70)) +#loc93 = loc(callsite(#loc71 at #loc22)) +#loc94 = loc(callsite(#loc72 at #loc22)) +#loc95 = loc(callsite(#loc73 at #loc22)) +#loc96 = loc(callsite(#loc74 at #loc22)) +#loc97 = loc("mask"(#loc75)) +#loc98 = loc(callsite(#loc76 at #loc22)) +#loc99 = loc("equal"(#loc77)) +#loc100 = loc(callsite(#loc78 at #loc22)) +#loc101 = loc(callsite(#loc79 at #loc22)) +#loc102 = loc(callsite(#loc80 at #loc22)) +#loc103 = loc(callsite(#loc71 at #loc85)) +#loc104 = loc(callsite(#loc72 at #loc85)) +#loc105 = loc(callsite(#loc73 at #loc85)) +#loc106 = loc(callsite(#loc74 at #loc85)) +#loc107 = loc(callsite(#loc76 at #loc85)) +#loc108 = loc(callsite(#loc78 at #loc85)) +#loc109 = loc(callsite(#loc79 at #loc85)) +#loc110 = loc(callsite(#loc80 at #loc85)) +#loc111 = loc(callsite(#loc34 at #loc85)) +#loc112 = loc(callsite(#loc35 at #loc85)) +#loc113 = loc(callsite(#loc91 at #loc22)) +#loc114 = loc(callsite(#loc92 at #loc22)) +#loc115 = loc(callsite(#loc97 at #loc22)) +#loc116 = loc(callsite(#loc99 at #loc22)) +#loc117 = loc(callsite(#loc91 at #loc85)) +#loc118 = loc(callsite(#loc92 at #loc85)) +#loc119 = loc(callsite(#loc97 at #loc85)) +#loc120 = loc(callsite(#loc99 at #loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3dbac0d0aed41dbe1112fb25bb6879562f614ae3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json new file mode 100644 index 0000000000000000000000000000000000000000..82166f95f87b7a0217b8be4f9495538163132476 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json @@ -0,0 +1 @@ +{"hash": "3b4049401792f1b8b25c740c50a5b8608b75486a3dbe773aed4756f37879e14b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..fdc2bff620ee07a73b7f113e3d26999037bc7f0e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir @@ -0,0 +1,208 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i64 %5, i64 %6, i64 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 8, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 127, !dbg !12 + %16 = icmp slt i64 %4, 2, !dbg !13 + %17 = icmp sgt i64 %4, 1, !dbg !14 + %18 = select i1 %17, i64 %4, i64 0, !dbg !15 + %19 = zext i1 %16 to i64, !dbg !16 + %20 = add i64 %18, %19, !dbg !17 + %21 = icmp slt i64 %5, 2, !dbg !18 + %22 = icmp sgt i64 %5, 1, !dbg !19 + %23 = select i1 %22, i64 %5, i64 0, !dbg !20 + %24 = zext i1 %21 to i64, !dbg !16 + %25 = add i64 %23, %24, !dbg !21 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !22 + %27 = add i64 %4, 1, !dbg !23 + %28 = add i64 %7, 127, !dbg !24 + %29 = sdiv i64 %28, 128, !dbg !25 + %30 = and i64 %28, 127, !dbg !29 + %.not = icmp ne i64 %30, 0, !dbg !29 + %31 = icmp slt i64 %28, 0, !dbg !30 + %narrow = and i1 %31, %.not, !dbg !31 + %32 = sext i1 %narrow to i64, !dbg !31 + %33 = add nsw i64 %29, %32, !dbg !31 + %34 = or disjoint i32 %13, %15, !dbg !32 + %35 = or disjoint i32 %34, 128, !dbg !32 + %36 = insertelement <2 x i32> poison, i32 %34, i64 0, !dbg !33 + %37 = insertelement <2 x i32> %36, i32 %35, i64 1, !dbg !33 + %38 = insertelement <2 x i32> poison, i32 %8, i64 0, !dbg !33 + %39 = shufflevector <2 x i32> %38, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !33 + %40 = icmp slt <2 x i32> %37, %39, !dbg !33 + %41 = sext <2 x i32> %37 to <2 x i64>, !dbg !34 + %42 = insertelement <2 x i64> poison, i64 %4, i64 0, !dbg !34 + %43 = shufflevector <2 x i64> %42, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !34 + %44 = srem <2 x i64> %41, %43, !dbg !34 + %45 = extractelement <2 x i64> %41, i64 0, !dbg !35 + %46 = sdiv i64 %45, %4, !dbg !36 + %47 = extractelement <2 x i64> %41, i64 1, !dbg !35 + %48 = sdiv i64 %47, %4, !dbg !36 + %49 = srem i64 %46, %5, !dbg !37 + %50 = srem i64 %48, %5, !dbg !37 + %51 = sdiv i64 %45, %6, !dbg !35 + %52 = sdiv i64 %47, %6, !dbg !35 + %53 = mul i64 %51, %25, !dbg !38 + %54 = mul i64 %52, %25, !dbg !38 + %reass.add = add i64 %53, %49, !dbg !39 + %reass.mul = mul i64 %reass.add, %20, !dbg !39 + %55 = extractelement <2 x i64> %44, i64 0, !dbg !39 + %56 = add i64 %reass.mul, %55, !dbg !39 + %reass.add12 = add i64 %54, %50, !dbg !39 + %reass.mul13 = mul i64 %reass.add12, %20, !dbg !39 + %57 = extractelement <2 x i64> %44, i64 1, !dbg !39 + %58 = add i64 %reass.mul13, %57, !dbg !39 + %59 = getelementptr i64, ptr addrspace(1) %0, i64 %56, !dbg !40 + %60 = getelementptr i64, ptr addrspace(1) %0, i64 %58, !dbg !40 + %61 = extractelement <2 x i1> %40, i64 0, !dbg !41 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %59, i64 %26, i1 %61) #4, !dbg !22 + %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !22 + %64 = extractelement <2 x i1> %40, i64 1, !dbg !41 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %60, i64 %63, i1 %64) #4, !dbg !22 + %66 = getelementptr i32, ptr addrspace(1) %1, i64 %46, !dbg !42 + %67 = getelementptr i32, ptr addrspace(1) %1, i64 %48, !dbg !42 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !41 + %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %66, i64 %68, i1 %61) #4, !dbg !41 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !41 + %71 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %67, i64 %70, i1 %64) #4, !dbg !41 + %72 = insertelement <2 x i32> poison, i32 %69, i64 0, !dbg !43 + %73 = insertelement <2 x i32> %72, i32 %71, i64 1, !dbg !43 + %74 = sext <2 x i32> %73 to <2 x i64>, !dbg !43 + %75 = icmp slt <2 x i64> %44, %74, !dbg !43 + %76 = insertelement <2 x i64> poison, i64 %62, i64 0, !dbg !44 + %77 = insertelement <2 x i64> %76, i64 %65, i64 1, !dbg !44 + %78 = shl <2 x i64> %77, splat (i64 32), !dbg !44 + %79 = ashr exact <2 x i64> %78, splat (i64 32), !dbg !44 + %80 = select <2 x i1> %75, <2 x i64> %79, <2 x i64> %43, !dbg !44 + %81 = icmp slt <2 x i64> %80, zeroinitializer, !dbg !45 + %82 = insertelement <2 x i64> poison, i64 %27, i64 0, !dbg !46 + %83 = shufflevector <2 x i64> %82, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !46 + %84 = select <2 x i1> %81, <2 x i64> %83, <2 x i64> zeroinitializer, !dbg !46 + %85 = add <2 x i64> %84, %80, !dbg !46 + %86 = icmp slt <2 x i64> %85, zeroinitializer, !dbg !47 + %87 = insertelement <2 x i64> poison, i64 %33, i64 0, !dbg !48 + %88 = shufflevector <2 x i64> %87, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !48 + %89 = icmp sgt <2 x i64> %85, %88, !dbg !48 + %90 = or <2 x i1> %86, %89, !dbg !49 + %91 = and <2 x i1> %40, %90, !dbg !50 + %shift = shufflevector <2 x i1> %91, <2 x i1> poison, <2 x i32> , !dbg !51 + %foldExtExtBinop = or <2 x i1> %91, %shift, !dbg !51 + %92 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !51 + br i1 %92, label %93, label %94, !dbg !51 + +93: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 37, ptr nonnull @assertFunc_0, i64 1), !dbg !51 + unreachable, !dbg !51 + +94: ; preds = %11 + %95 = trunc i64 %65 to i32, !dbg !52 + %96 = trunc i64 %62 to i32, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %97 = getelementptr i32, ptr addrspace(1) %2, i64 %56, !dbg !53 + %98 = getelementptr i32, ptr addrspace(1) %2, i64 %58, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %96, ptr addrspace(1) %97, i1 %61) #4, !dbg !54 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %95, ptr addrspace(1) %98, i1 %64) #4, !dbg !54 + %99 = mul i64 %46, %4, !dbg !55 + %100 = mul i64 %48, %4, !dbg !55 + %101 = extractelement <2 x i64> %85, i64 0, !dbg !56 + %102 = getelementptr i32, ptr addrspace(1) %3, i64 %101, !dbg !56 + %103 = getelementptr i32, ptr addrspace(1) %102, i64 %46, !dbg !56 + %104 = getelementptr i32, ptr addrspace(1) %103, i64 %99, !dbg !56 + %105 = extractelement <2 x i64> %85, i64 1, !dbg !56 + %106 = getelementptr i32, ptr addrspace(1) %3, i64 %105, !dbg !56 + %107 = getelementptr i32, ptr addrspace(1) %106, i64 %48, !dbg !56 + %108 = getelementptr i32, ptr addrspace(1) %107, i64 %100, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %104, i1 %61) #4, !dbg !57 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %108, i1 %64) #4, !dbg !57 + ret void, !dbg !58 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3", linkageName: "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 26, column: 54, scope: !9) +!14 = !DILocation(line: 26, column: 80, scope: !9) +!15 = !DILocation(line: 26, column: 71, scope: !9) +!16 = !DILocation(line: 26, scope: !9) +!17 = !DILocation(line: 26, column: 62, scope: !9) +!18 = !DILocation(line: 26, column: 153, scope: !9) +!19 = !DILocation(line: 26, column: 179, scope: !9) +!20 = !DILocation(line: 26, column: 170, scope: !9) +!21 = !DILocation(line: 26, column: 161, scope: !9) +!22 = !DILocation(line: 26, column: 186, scope: !9) +!23 = !DILocation(line: 33, column: 15, scope: !9) +!24 = !DILocation(line: 37, column: 90, scope: !9) +!25 = !DILocation(line: 72, column: 16, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !9, file: !27, discriminator: 0) +!27 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!28 = !DILocation(line: 37, column: 96, scope: !9) +!29 = !DILocation(line: 74, column: 34, scope: !26, inlinedAt: !28) +!30 = !DILocation(line: 75, column: 25, scope: !26, inlinedAt: !28) +!31 = !DILocation(line: 75, column: 47, scope: !26, inlinedAt: !28) +!32 = !DILocation(line: 20, column: 23, scope: !9) +!33 = !DILocation(line: 21, column: 21, scope: !9) +!34 = !DILocation(line: 22, column: 19, scope: !9) +!35 = !DILocation(line: 24, column: 19, scope: !9) +!36 = !DILocation(line: 23, column: 21, scope: !9) +!37 = !DILocation(line: 23, column: 28, scope: !9) +!38 = !DILocation(line: 26, column: 138, scope: !9) +!39 = !DILocation(line: 26, column: 87, scope: !9) +!40 = !DILocation(line: 26, column: 30, scope: !9) +!41 = !DILocation(line: 27, column: 35, scope: !9) +!42 = !DILocation(line: 27, column: 30, scope: !9) +!43 = !DILocation(line: 30, column: 18, scope: !9) +!44 = !DILocation(line: 32, column: 32, scope: !9) +!45 = !DILocation(line: 35, column: 18, scope: !9) +!46 = !DILocation(line: 36, column: 33, scope: !9) +!47 = !DILocation(line: 37, column: 28, scope: !9) +!48 = !DILocation(line: 37, column: 46, scope: !9) +!49 = !DILocation(line: 37, column: 38, scope: !9) +!50 = !DILocation(line: 37, column: 106, scope: !9) +!51 = !DILocation(line: 37, column: 116, scope: !9) +!52 = !DILocation(line: 28, column: 19, scope: !9) +!53 = !DILocation(line: 39, column: 25, scope: !9) +!54 = !DILocation(line: 39, column: 187, scope: !9) +!55 = !DILocation(line: 40, column: 42, scope: !9) +!56 = !DILocation(line: 40, column: 25, scope: !9) +!57 = !DILocation(line: 40, column: 54, scope: !9) +!58 = !DILocation(line: 40, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..15f4e9f22e7c4fc4f66c9d19df08ea345f4d57fb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx @@ -0,0 +1,498 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 // -- Begin function triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 108, 105, 47, 99, 108, 105, 109, 122, 103, 122, 105, 112, 116, 115, 106, 108, 122, 52, 116, 115, 117, 120, 97, 99, 118, 116, 54, 109, 117, 121, 111, 101, 101, 109, 111, 55, 108, 50, 98, 120, 99, 121, 111, 116, 102, 106, 121, 51, 118, 105, 106, 54, 104, 98, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 48, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 51, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 +.visible .entry triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_3, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_4, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_5, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_6, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_7, + .param .u32 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_10 +) +.reqntid 128 +{ + .reg .pred %p<36>; + .reg .b32 %r<33>; + .reg .b64 %rd<129>; + .loc 1 18 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:18:0 + +// %bb.0: + ld.param.b64 %rd42, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_4]; +$L__tmp0: + .loc 1 19 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:19:28 + mov.u32 %r5, %ctaid.x; + .loc 1 19 33 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:19:33 + shl.b32 %r6, %r5, 8; + .loc 1 20 36 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:20:36 + mov.u32 %r7, %tid.x; + and.b32 %r8, %r7, 127; + ld.param.b64 %rd47, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_7]; + .loc 1 26 186 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:186 + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + .loc 1 37 90 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:90 + add.s64 %rd51, %rd47, 127; + .loc 1 20 23 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:20:23 + or.b32 %r1, %r6, %r8; + or.b32 %r2, %r1, 128; + .loc 1 22 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:22:19 + cvt.s64.s32 %rd7, %r2; + cvt.s64.s32 %rd6, %r1; + .loc 1 23 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:21 + or.b64 %rd58, %rd6, %rd42; + and.b64 %rd59, %rd58, -4294967296; + setp.ne.b64 %p10, %rd59, 0; + @%p10 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd123, %rd6, %rd42; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd42; + cvt.u32.u64 %r10, %rd6; + div.u32 %r11, %r10, %r9; + cvt.u64.u32 %rd123, %r11; +$L__BB0_3: + .loc 1 0 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:21 + shr.s64 %rd52, %rd51, 63; + and.b64 %rd56, %rd51, 127; + ld.param.b64 %rd43, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_5]; + .loc 1 23 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:21 + or.b64 %rd60, %rd7, %rd42; + and.b64 %rd61, %rd60, -4294967296; + setp.ne.b64 %p11, %rd61, 0; + @%p11 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd124, %rd7, %rd42; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd42; + cvt.u32.u64 %r13, %rd7; + div.u32 %r14, %r13, %r12; + cvt.u64.u32 %rd124, %r14; +$L__BB0_6: + .loc 1 0 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0 + shr.u64 %rd53, %rd52, 57; + setp.ne.b64 %p7, %rd56, 0; + setp.lt.s64 %p8, %rd51, 0; + .loc 1 23 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:28 + or.b64 %rd62, %rd123, %rd43; + and.b64 %rd63, %rd62, -4294967296; + setp.ne.b64 %p12, %rd63, 0; + @%p12 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd125, %rd123, %rd43; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd43; + cvt.u32.u64 %r16, %rd123; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd125, %r17; +$L__BB0_9: + .loc 1 0 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:28 + setp.lt.s64 %p3, %rd42, 2; + setp.gt.s64 %p4, %rd42, 1; + setp.lt.s64 %p5, %rd43, 2; + setp.gt.s64 %p6, %rd43, 1; + add.s64 %rd54, %rd51, %rd53; + and.pred %p9, %p8, %p7; + ld.param.b64 %rd44, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_6]; + .loc 1 23 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:28 + or.b64 %rd64, %rd124, %rd43; + and.b64 %rd65, %rd64, -4294967296; + setp.ne.b64 %p13, %rd65, 0; + @%p13 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd126, %rd124, %rd43; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd43; + cvt.u32.u64 %r19, %rd124; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd126, %r20; +$L__BB0_12: + .loc 1 0 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:28 + selp.b64 %rd46, %rd42, 0, %p4; + selp.b64 %rd48, 1, 0, %p3; + selp.b64 %rd49, %rd43, 0, %p6; + selp.b64 %rd50, 1, 0, %p5; + shr.s64 %rd55, %rd54, 7; + selp.b64 %rd57, -1, 0, %p9; + .loc 1 24 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:24:19 + or.b64 %rd66, %rd6, %rd44; + and.b64 %rd67, %rd66, -4294967296; + setp.ne.b64 %p14, %rd67, 0; + @%p14 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd127, %rd6, %rd44; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r21, %rd44; + cvt.u32.u64 %r22, %rd6; + div.u32 %r23, %r22, %r21; + cvt.u64.u32 %rd127, %r23; +$L__BB0_15: + .loc 1 0 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:19 + ld.param.b64 %rd39, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_1]; + ld.param.b64 %rd38, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_0]; + ld.param.b32 %r4, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_8]; + add.s64 %rd1, %rd46, %rd48; + add.s64 %rd2, %rd49, %rd50; + add.s64 %rd4, %rd42, 1; + add.s64 %rd5, %rd55, %rd57; + rem.s64 %rd10, %rd6, %rd42; + rem.s64 %rd11, %rd7, %rd42; + .loc 1 24 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:24:19 + or.b64 %rd68, %rd7, %rd44; + and.b64 %rd69, %rd68, -4294967296; + setp.ne.b64 %p15, %rd69, 0; + @%p15 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd128, %rd7, %rd44; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r24, %rd44; + cvt.u32.u64 %r25, %rd7; + div.u32 %r26, %r25, %r24; + cvt.u64.u32 %rd128, %r26; +$L__BB0_18: + .loc 1 21 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:21:21 + setp.lt.s32 %p2, %r2, %r4; + setp.lt.s32 %p1, %r1, %r4; + .loc 1 26 87 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:87 + mad.lo.s64 %rd83, %rd127, %rd2, %rd125; + mad.lo.s64 %rd32, %rd83, %rd1, %rd10; + mad.lo.s64 %rd84, %rd128, %rd2, %rd126; + mad.lo.s64 %rd33, %rd84, %rd1, %rd11; + .loc 1 26 30 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:30 + shl.b64 %rd85, %rd32, 3; + add.s64 %rd71, %rd38, %rd85; + shl.b64 %rd86, %rd33, 3; + add.s64 %rd75, %rd38, %rd86; + .loc 1 26 186 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:186 + // begin inline asm + mov.u64 %rd70, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd70 }, [ %rd71 + 0 ], %rd45; + // end inline asm + // begin inline asm + mov.u64 %rd73, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd73, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd74 }, [ %rd75 + 0 ], %rd73; + // end inline asm + .loc 1 27 30 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:27:30 + shl.b64 %rd87, %rd123, 2; + add.s64 %rd78, %rd39, %rd87; + shl.b64 %rd88, %rd124, 2; + add.s64 %rd81, %rd39, %rd88; + .loc 1 27 35 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:27:35 + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b32 { %r27 }, [ %rd78 + 0 ], %rd77; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r28, 0x0; + @%p2 ld.global.L1::evict_last.L2::cache_hint.b32 { %r28 }, [ %rd81 + 0 ], %rd80; + // end inline asm + .loc 1 30 18 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:30:18 + cvt.s64.s32 %rd89, %r28; + cvt.s64.s32 %rd90, %r27; + setp.lt.s64 %p20, %rd10, %rd90; + setp.lt.s64 %p21, %rd11, %rd89; + .loc 1 32 32 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:32:32 + cvt.s64.s32 %rd91, %rd70; + cvt.s64.s32 %rd92, %rd74; + selp.b64 %rd93, %rd92, %rd42, %p21; + selp.b64 %rd94, %rd91, %rd42, %p20; + .loc 1 36 33 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:36:33 + shr.s64 %rd95, %rd94, 63; + and.b64 %rd96, %rd95, %rd4; + shr.s64 %rd97, %rd93, 63; + and.b64 %rd98, %rd97, %rd4; + add.s64 %rd37, %rd98, %rd93; + add.s64 %rd36, %rd96, %rd94; + .loc 1 37 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:28 + setp.lt.s64 %p22, %rd36, 0; + setp.lt.s64 %p23, %rd37, 0; + .loc 1 37 46 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:46 + setp.gt.s64 %p24, %rd36, %rd5; + setp.gt.s64 %p25, %rd37, %rd5; + .loc 1 37 38 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:38 + or.pred %p26, %p23, %p25; + or.pred %p27, %p22, %p24; + .loc 1 37 106 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:106 + and.pred %p28, %p1, %p27; + and.pred %p29, %p2, %p26; + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + or.pred %p30, %p28, %p29; + not.pred %p31, %p30; + @%p31 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + .loc 1 0 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:116 + ld.param.b64 %rd41, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_3]; + ld.param.b64 %rd40, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_2]; + .loc 1 28 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:28:19 + cvt.u32.u64 %r30, %rd74; + cvt.u32.u64 %r29, %rd70; + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + bar.sync 0; + .loc 1 39 25 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:39:25 + shl.b64 %rd103, %rd32, 2; + add.s64 %rd99, %rd40, %rd103; + shl.b64 %rd104, %rd33, 2; + add.s64 %rd100, %rd40, %rd104; + .loc 1 39 187 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:39:187 + // begin inline asm + @%p1 st.global.b32 [ %rd99 + 0 ], { %r29 }; + // end inline asm + // begin inline asm + @%p2 st.global.b32 [ %rd100 + 0 ], { %r30 }; + // end inline asm + .loc 1 40 42 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:42 + mul.lo.s64 %rd105, %rd123, %rd42; + mul.lo.s64 %rd106, %rd124, %rd42; + .loc 1 40 25 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:25 + shl.b64 %rd107, %rd36, 2; + add.s64 %rd108, %rd41, %rd107; + add.s64 %rd110, %rd108, %rd87; + shl.b64 %rd111, %rd105, 2; + add.s64 %rd101, %rd110, %rd111; + shl.b64 %rd112, %rd37, 2; + add.s64 %rd113, %rd41, %rd112; + add.s64 %rd115, %rd113, %rd88; + shl.b64 %rd116, %rd106, 2; + add.s64 %rd102, %rd115, %rd116; + mov.b32 %r31, 1; + .loc 1 40 54 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:54 + // begin inline asm + @%p1 st.global.b32 [ %rd101 + 0 ], { %r31 }; + // end inline asm + // begin inline asm + @%p2 st.global.b32 [ %rd102 + 0 ], { %r31 }; + // end inline asm + .loc 1 40 4 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:4 + ret; +$L__BB0_19: + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd117, assertFunc_0; + cvta.global.u64 %rd118, %rd117; + st.param.b64 [param3], %rd118; + mov.b64 %rd119, assertFile_0; + cvta.global.u64 %rd120, %rd119; + st.param.b64 [param1], %rd120; + mov.b64 %rd121, assertMessage_0; + cvta.global.u64 %rd122, %rd121; + st.param.b64 [param0], %rd122; + st.param.b64 [param4], 1; + st.param.b32 [param2], 37; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 108 +.b8 105 +.b8 109 +.b8 122 +.b8 103 +.b8 122 +.b8 105 +.b8 112 +.b8 116 +.b8 115 +.b8 106 +.b8 108 +.b8 122 +.b8 52 +.b8 116 +.b8 115 +.b8 117 +.b8 120 +.b8 97 +.b8 99 +.b8 118 +.b8 116 +.b8 54 +.b8 109 +.b8 117 +.b8 121 +.b8 111 +.b8 101 +.b8 101 +.b8 109 +.b8 111 +.b8 55 +.b8 108 +.b8 50 +.b8 98 +.b8 120 +.b8 99 +.b8 121 +.b8 111 +.b8 116 +.b8 102 +.b8 106 +.b8 121 +.b8 51 +.b8 118 +.b8 105 +.b8 106 +.b8 54 +.b8 104 +.b8 98 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 108 +.b8 105 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source new file mode 100644 index 0000000000000000000000000000000000000000..b124eb8e2b4c8881fc111ac09e05266eba695273 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source @@ -0,0 +1,355 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("out_ptr0"(#loc)) +#loc94 = loc("out_ptr1"(#loc)) +#loc95 = loc("ks0"(#loc)) +#loc96 = loc("ks1"(#loc)) +#loc97 = loc("ks2"(#loc)) +#loc98 = loc("ks3"(#loc)) +#loc99 = loc("xnumel"(#loc)) +#loc142 = loc("a"(#loc80)) +module { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc100) + %xoffset_0 = arith.constant 256 : i32 loc(#loc101) + %xoffset_1 = arith.constant 256 : i32 loc(#loc101) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc101) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc102) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc103) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc103) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc104) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc104) + %x0 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc105) + %x0_6 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc105) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<256xi64> loc(#loc105) + %x1 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc106) + %x1_8 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc106) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<256xi64> loc(#loc106) + %x1_10 = tt.splat %ks1 : i64 -> tensor<256xi64> loc(#loc107) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<256xi64> loc(#loc107) + %x2 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc108) + %x2_12 = tt.splat %ks2 : i64 -> tensor<256xi64> loc(#loc108) + %x2_13 = arith.divsi %x2, %x2_12 : tensor<256xi64> loc(#loc108) + %x3 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc109) + %x3_14 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc109) + %x3_15 = arith.divsi %x3, %x3_14 : tensor<256xi64> loc(#loc109) + %tmp0 = arith.constant 1 : i32 loc(#loc110) + %tmp0_16 = arith.extsi %tmp0 : i32 to i64 loc(#loc110) + %tmp0_17 = arith.cmpi sge, %tmp0_16, %ks0 : i64 loc(#loc110) + %tmp0_18 = arith.constant 1 : i32 loc(#loc111) + %tmp0_19 = arith.constant 1 : i32 loc(#loc111) + %tmp0_20 = arith.extui %tmp0_17 : i1 to i32 loc(#loc111) + %tmp0_21 = arith.muli %tmp0_19, %tmp0_20 : i32 loc(#loc111) + %tmp0_22 = arith.constant 1 : i32 loc(#loc112) + %tmp0_23 = arith.extsi %tmp0_22 : i32 to i64 loc(#loc112) + %tmp0_24 = arith.cmpi sgt, %ks0, %tmp0_23 : i64 loc(#loc112) + %tmp0_25 = arith.extui %tmp0_24 : i1 to i64 loc(#loc113) + %tmp0_26 = arith.muli %ks0, %tmp0_25 : i64 loc(#loc113) + %tmp0_27 = arith.extsi %tmp0_21 : i32 to i64 loc(#loc114) + %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : i64 loc(#loc114) + %tmp0_29 = tt.splat %tmp0_28 : i64 -> tensor<256xi64> loc(#loc115) + %tmp0_30 = arith.muli %x1_11, %tmp0_29 : tensor<256xi64> loc(#loc115) + %tmp0_31 = arith.addi %x0_7, %tmp0_30 : tensor<256xi64> loc(#loc116) + %tmp0_32 = arith.constant 1 : i32 loc(#loc117) + %tmp0_33 = arith.extsi %tmp0_32 : i32 to i64 loc(#loc117) + %tmp0_34 = arith.cmpi sge, %tmp0_33, %ks0 : i64 loc(#loc117) + %tmp0_35 = arith.constant 1 : i32 loc(#loc118) + %tmp0_36 = arith.constant 1 : i32 loc(#loc118) + %tmp0_37 = arith.extui %tmp0_34 : i1 to i32 loc(#loc118) + %tmp0_38 = arith.muli %tmp0_36, %tmp0_37 : i32 loc(#loc118) + %tmp0_39 = arith.constant 1 : i32 loc(#loc119) + %tmp0_40 = arith.extsi %tmp0_39 : i32 to i64 loc(#loc119) + %tmp0_41 = arith.cmpi sgt, %ks0, %tmp0_40 : i64 loc(#loc119) + %tmp0_42 = arith.extui %tmp0_41 : i1 to i64 loc(#loc120) + %tmp0_43 = arith.muli %ks0, %tmp0_42 : i64 loc(#loc120) + %tmp0_44 = arith.extsi %tmp0_38 : i32 to i64 loc(#loc121) + %tmp0_45 = arith.addi %tmp0_44, %tmp0_43 : i64 loc(#loc121) + %tmp0_46 = tt.splat %tmp0_45 : i64 -> tensor<256xi64> loc(#loc122) + %tmp0_47 = arith.muli %x2_13, %tmp0_46 : tensor<256xi64> loc(#loc122) + %tmp0_48 = arith.constant 1 : i32 loc(#loc123) + %tmp0_49 = arith.extsi %tmp0_48 : i32 to i64 loc(#loc123) + %tmp0_50 = arith.cmpi sge, %tmp0_49, %ks1 : i64 loc(#loc123) + %tmp0_51 = arith.constant 1 : i32 loc(#loc124) + %tmp0_52 = arith.constant 1 : i32 loc(#loc124) + %tmp0_53 = arith.extui %tmp0_50 : i1 to i32 loc(#loc124) + %tmp0_54 = arith.muli %tmp0_52, %tmp0_53 : i32 loc(#loc124) + %tmp0_55 = arith.constant 1 : i32 loc(#loc125) + %tmp0_56 = arith.extsi %tmp0_55 : i32 to i64 loc(#loc125) + %tmp0_57 = arith.cmpi sgt, %ks1, %tmp0_56 : i64 loc(#loc125) + %tmp0_58 = arith.extui %tmp0_57 : i1 to i64 loc(#loc126) + %tmp0_59 = arith.muli %ks1, %tmp0_58 : i64 loc(#loc126) + %tmp0_60 = arith.extsi %tmp0_54 : i32 to i64 loc(#loc127) + %tmp0_61 = arith.addi %tmp0_60, %tmp0_59 : i64 loc(#loc127) + %tmp0_62 = tt.splat %tmp0_61 : i64 -> tensor<256xi64> loc(#loc128) + %tmp0_63 = arith.muli %tmp0_47, %tmp0_62 : tensor<256xi64> loc(#loc128) + %tmp0_64 = arith.addi %tmp0_31, %tmp0_63 : tensor<256xi64> loc(#loc129) + %tmp0_65 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc130) + %tmp0_66 = tt.addptr %tmp0_65, %tmp0_64 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc130) + %tmp0_67 = tt.load %tmp0_66, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc131) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc132) + %tmp2_68 = tt.addptr %tmp2, %x3_15 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc132) + %tmp2_69 = tt.load %tmp2_68, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc133) + %tmp1 = arith.trunci %tmp0_67 : tensor<256xi64> to tensor<256xi32> loc(#loc134) + %tmp4 = arith.extsi %tmp2_69 : tensor<256xi32> to tensor<256xi64> loc(#loc135) + %tmp4_70 = arith.cmpi slt, %x0_7, %tmp4 : tensor<256xi64> loc(#loc135) + %tmp6 = arith.extsi %tmp1 : tensor<256xi32> to tensor<256xi64> loc(#loc136) + %tmp6_71 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc136) + %tmp6_72 = arith.select %tmp4_70, %tmp6, %tmp6_71 : tensor<256xi1>, tensor<256xi64> loc(#loc136) + %tmp7 = arith.constant 1 : i32 loc(#loc137) + %tmp7_73 = arith.constant 1 : i64 loc(#loc137) + %tmp7_74 = arith.addi %tmp7_73, %ks0 : i64 loc(#loc137) + %tmp8 = tt.splat %tmp7_74 : i64 -> tensor<256xi64> loc(#loc138) + %tmp8_75 = arith.addi %tmp6_72, %tmp8 : tensor<256xi64> loc(#loc138) + %tmp9 = arith.constant 0 : i32 loc(#loc139) + %tmp9_76 = arith.extsi %tmp9 : i32 to i64 loc(#loc139) + %tmp9_77 = tt.splat %tmp9_76 : i64 -> tensor<256xi64> loc(#loc139) + %tmp9_78 = arith.cmpi slt, %tmp6_72, %tmp9_77 : tensor<256xi64> loc(#loc139) + %tmp10 = arith.select %tmp9_78, %tmp8_75, %tmp6_72 : tensor<256xi1>, tensor<256xi64> loc(#loc140) + %c0_i32 = arith.constant 0 : i32 loc(#loc42) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc42) + %1 = tt.splat %0 : i64 -> tensor<256xi64> loc(#loc42) + %2 = arith.cmpi sle, %1, %tmp10 : tensor<256xi64> loc(#loc42) + %c127_i32 = arith.constant 127 : i32 loc(#loc43) + %c127_i64 = arith.constant 127 : i64 loc(#loc43) + %3 = arith.addi %c127_i64, %ks3 : i64 loc(#loc43) + %4 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%3) : (i64) -> i64 loc(#loc44) + %c1_i32 = arith.constant 1 : i32 loc(#loc45) + %c1_i64 = arith.constant 1 : i64 loc(#loc45) + %5 = arith.addi %c1_i64, %4 : i64 loc(#loc45) + %6 = tt.splat %5 : i64 -> tensor<256xi64> loc(#loc46) + %7 = arith.cmpi slt, %tmp10, %6 : tensor<256xi64> loc(#loc46) + %8 = arith.andi %2, %7 : tensor<256xi1> loc(#loc47) + %true = arith.constant true loc(#loc48) + %cst = arith.constant dense : tensor<256xi1> loc(#loc48) + %9 = arith.xori %xmask_5, %cst : tensor<256xi1> loc(#loc48) + %10 = arith.ori %8, %9 : tensor<256xi1> loc(#loc49) + tt.assert %10, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<256xi1> loc(#loc50) + %tmp12 = arith.constant 1 : i32 loc(#loc141) + %tmp12_79 = arith.constant dense<1> : tensor<1xi32> loc(#loc141) + %c1_i32_80 = arith.constant 1 : i32 loc(#loc52) + %11 = arith.extsi %c1_i32_80 : i32 to i64 loc(#loc52) + %12 = arith.cmpi sge, %11, %ks0 : i64 loc(#loc52) + %c1_i32_81 = arith.constant 1 : i32 loc(#loc53) + %c1_i32_82 = arith.constant 1 : i32 loc(#loc53) + %13 = arith.extui %12 : i1 to i32 loc(#loc53) + %14 = arith.muli %c1_i32_82, %13 : i32 loc(#loc53) + %c1_i32_83 = arith.constant 1 : i32 loc(#loc54) + %15 = arith.extsi %c1_i32_83 : i32 to i64 loc(#loc54) + %16 = arith.cmpi sgt, %ks0, %15 : i64 loc(#loc54) + %17 = arith.extui %16 : i1 to i64 loc(#loc55) + %18 = arith.muli %ks0, %17 : i64 loc(#loc55) + %19 = arith.extsi %14 : i32 to i64 loc(#loc56) + %20 = arith.addi %19, %18 : i64 loc(#loc56) + %21 = tt.splat %20 : i64 -> tensor<256xi64> loc(#loc57) + %22 = arith.muli %x1_11, %21 : tensor<256xi64> loc(#loc57) + %23 = arith.addi %x0_7, %22 : tensor<256xi64> loc(#loc58) + %c1_i32_84 = arith.constant 1 : i32 loc(#loc59) + %24 = arith.extsi %c1_i32_84 : i32 to i64 loc(#loc59) + %25 = arith.cmpi sge, %24, %ks0 : i64 loc(#loc59) + %c1_i32_85 = arith.constant 1 : i32 loc(#loc60) + %c1_i32_86 = arith.constant 1 : i32 loc(#loc60) + %26 = arith.extui %25 : i1 to i32 loc(#loc60) + %27 = arith.muli %c1_i32_86, %26 : i32 loc(#loc60) + %c1_i32_87 = arith.constant 1 : i32 loc(#loc61) + %28 = arith.extsi %c1_i32_87 : i32 to i64 loc(#loc61) + %29 = arith.cmpi sgt, %ks0, %28 : i64 loc(#loc61) + %30 = arith.extui %29 : i1 to i64 loc(#loc62) + %31 = arith.muli %ks0, %30 : i64 loc(#loc62) + %32 = arith.extsi %27 : i32 to i64 loc(#loc63) + %33 = arith.addi %32, %31 : i64 loc(#loc63) + %34 = tt.splat %33 : i64 -> tensor<256xi64> loc(#loc64) + %35 = arith.muli %x2_13, %34 : tensor<256xi64> loc(#loc64) + %c1_i32_88 = arith.constant 1 : i32 loc(#loc65) + %36 = arith.extsi %c1_i32_88 : i32 to i64 loc(#loc65) + %37 = arith.cmpi sge, %36, %ks1 : i64 loc(#loc65) + %c1_i32_89 = arith.constant 1 : i32 loc(#loc66) + %c1_i32_90 = arith.constant 1 : i32 loc(#loc66) + %38 = arith.extui %37 : i1 to i32 loc(#loc66) + %39 = arith.muli %c1_i32_90, %38 : i32 loc(#loc66) + %c1_i32_91 = arith.constant 1 : i32 loc(#loc67) + %40 = arith.extsi %c1_i32_91 : i32 to i64 loc(#loc67) + %41 = arith.cmpi sgt, %ks1, %40 : i64 loc(#loc67) + %42 = arith.extui %41 : i1 to i64 loc(#loc68) + %43 = arith.muli %ks1, %42 : i64 loc(#loc68) + %44 = arith.extsi %39 : i32 to i64 loc(#loc69) + %45 = arith.addi %44, %43 : i64 loc(#loc69) + %46 = tt.splat %45 : i64 -> tensor<256xi64> loc(#loc70) + %47 = arith.muli %35, %46 : tensor<256xi64> loc(#loc70) + %48 = arith.addi %23, %47 : tensor<256xi64> loc(#loc71) + %49 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc72) + %50 = tt.addptr %49, %48 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc72) + tt.store %50, %tmp1, %xmask_5 : tensor<256x!tt.ptr> loc(#loc73) + %51 = arith.addi %tmp10, %x3_15 : tensor<256xi64> loc(#loc74) + %52 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc75) + %53 = arith.muli %52, %x3_15 : tensor<256xi64> loc(#loc75) + %54 = arith.addi %51, %53 : tensor<256xi64> loc(#loc76) + %55 = tt.splat %out_ptr1 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc77) + %56 = tt.addptr %55, %54 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc77) + %cst_92 = arith.constant dense<1> : tensor<256xi32> loc(#loc78) + tt.store %56, %cst_92, %xmask_5 : tensor<256x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc80))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc143) + %quot_0 = arith.constant 128 : i64 loc(#loc143) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc143) + %remainder = arith.constant 128 : i32 loc(#loc144) + %remainder_2 = arith.constant 128 : i64 loc(#loc144) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc144) + %fixed = arith.constant 0 : i32 loc(#loc145) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc145) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc145) + %fixed_6 = arith.constant 1 : i32 loc(#loc146) + %fixed_7 = arith.constant 1 : i64 loc(#loc146) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc146) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc86) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc86) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc86) + %false = arith.constant false loc(#loc87) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc87) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc88) + tt.return %3 : i64 loc(#loc89) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc90) + tt.return %4 : i64 loc(#loc90) + } loc(#loc80) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:98) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:132) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:123) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:114) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":38:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:41) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:75) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:66) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:57) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:34) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:30) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:101) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:93) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:127) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:118) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:109) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:86) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:148) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:140) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:174) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:165) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:156) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:133) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:82) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc100 = loc("xoffset"(#loc1)) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xindex"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xmask"(#loc5)) +#loc105 = loc("x0"(#loc6)) +#loc106 = loc("x1"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x2"(#loc9)) +#loc109 = loc("x3"(#loc10)) +#loc110 = loc("tmp0"(#loc11)) +#loc111 = loc("tmp0"(#loc12)) +#loc112 = loc("tmp0"(#loc13)) +#loc113 = loc("tmp0"(#loc14)) +#loc114 = loc("tmp0"(#loc15)) +#loc115 = loc("tmp0"(#loc16)) +#loc116 = loc("tmp0"(#loc17)) +#loc117 = loc("tmp0"(#loc18)) +#loc118 = loc("tmp0"(#loc19)) +#loc119 = loc("tmp0"(#loc20)) +#loc120 = loc("tmp0"(#loc21)) +#loc121 = loc("tmp0"(#loc22)) +#loc122 = loc("tmp0"(#loc23)) +#loc123 = loc("tmp0"(#loc24)) +#loc124 = loc("tmp0"(#loc25)) +#loc125 = loc("tmp0"(#loc26)) +#loc126 = loc("tmp0"(#loc27)) +#loc127 = loc("tmp0"(#loc28)) +#loc128 = loc("tmp0"(#loc29)) +#loc129 = loc("tmp0"(#loc30)) +#loc130 = loc("tmp0"(#loc31)) +#loc131 = loc("tmp0"(#loc32)) +#loc132 = loc("tmp2"(#loc33)) +#loc133 = loc("tmp2"(#loc34)) +#loc134 = loc("tmp1"(#loc35)) +#loc135 = loc("tmp4"(#loc36)) +#loc136 = loc("tmp6"(#loc37)) +#loc137 = loc("tmp7"(#loc38)) +#loc138 = loc("tmp8"(#loc39)) +#loc139 = loc("tmp9"(#loc40)) +#loc140 = loc("tmp10"(#loc41)) +#loc141 = loc("tmp12"(#loc51)) +#loc143 = loc("quot"(#loc81)) +#loc144 = loc("remainder"(#loc82)) +#loc145 = loc("fixed"(#loc83)) +#loc146 = loc("fixed"(#loc84)) +#loc147 = loc("fixed"(#loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4e2b457982b88a412d2a3909096064b287a6cc03 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir @@ -0,0 +1,208 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("ks0"(#loc)) +#loc66 = loc("ks1"(#loc)) +#loc67 = loc("ks2"(#loc)) +#loc68 = loc("ks3"(#loc)) +#loc69 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst = arith.constant dense : tensor<256xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<256xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<256xi64, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc70) + %xoffset_2 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc71) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc72) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32, #blocked> loc(#loc73) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32, #blocked> loc(#loc73) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc74) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32, #blocked> loc(#loc74) + %x0 = arith.extsi %xindex_4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> loc(#loc75) + %x0_6 = tt.splat %ks0 : i64 -> tensor<256xi64, #blocked> loc(#loc75) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<256xi64, #blocked> loc(#loc75) + %x1 = arith.divsi %x0, %x0_6 : tensor<256xi64, #blocked> loc(#loc76) + %x1_8 = tt.splat %ks1 : i64 -> tensor<256xi64, #blocked> loc(#loc77) + %x1_9 = arith.remsi %x1, %x1_8 : tensor<256xi64, #blocked> loc(#loc77) + %x2 = tt.splat %ks2 : i64 -> tensor<256xi64, #blocked> loc(#loc78) + %x2_10 = arith.divsi %x0, %x2 : tensor<256xi64, #blocked> loc(#loc78) + %tmp0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc79) + %tmp0_11 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc80) + %tmp0_12 = arith.extui %tmp0_11 : i1 to i64 loc(#loc81) + %tmp0_13 = arith.muli %ks0, %tmp0_12 : i64 loc(#loc81) + %tmp0_14 = arith.extui %tmp0 : i1 to i64 loc(#loc112) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_13 : i64 loc(#loc82) + %tmp0_16 = tt.splat %tmp0_15 : i64 -> tensor<256xi64, #blocked> loc(#loc84) + %tmp0_17 = arith.muli %x1_9, %tmp0_16 : tensor<256xi64, #blocked> loc(#loc84) + %tmp0_18 = arith.addi %x0_7, %tmp0_17 : tensor<256xi64, #blocked> loc(#loc85) + %tmp0_19 = arith.muli %x2_10, %tmp0_16 : tensor<256xi64, #blocked> loc(#loc86) + %tmp0_20 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc87) + %tmp0_21 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc88) + %tmp0_22 = arith.extui %tmp0_21 : i1 to i64 loc(#loc89) + %tmp0_23 = arith.muli %ks1, %tmp0_22 : i64 loc(#loc89) + %tmp0_24 = arith.extui %tmp0_20 : i1 to i64 loc(#loc113) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_23 : i64 loc(#loc90) + %tmp0_26 = tt.splat %tmp0_25 : i64 -> tensor<256xi64, #blocked> loc(#loc92) + %tmp0_27 = arith.muli %tmp0_19, %tmp0_26 : tensor<256xi64, #blocked> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_18, %tmp0_27 : tensor<256xi64, #blocked> loc(#loc93) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc94) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc94) + %tmp0_31 = tt.load %tmp0_30, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc96) + %tmp2_32 = tt.addptr %tmp2, %x1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc96) + %tmp2_33 = tt.load %tmp2_32, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr, #blocked> loc(#loc97) + %tmp1 = arith.trunci %tmp0_31 : tensor<256xi64, #blocked> to tensor<256xi32, #blocked> loc(#loc98) + %tmp4 = arith.extsi %tmp2_33 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> loc(#loc99) + %tmp4_34 = arith.cmpi slt, %x0_7, %tmp4 : tensor<256xi64, #blocked> loc(#loc99) + %tmp6 = arith.extsi %tmp1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> loc(#loc100) + %tmp6_35 = arith.select %tmp4_34, %tmp6, %x0_6 : tensor<256xi1, #blocked>, tensor<256xi64, #blocked> loc(#loc100) + %tmp7 = arith.addi %ks0, %c1_i64 : i64 loc(#loc101) + %tmp8 = tt.splat %tmp7 : i64 -> tensor<256xi64, #blocked> loc(#loc102) + %tmp8_36 = arith.addi %tmp6_35, %tmp8 : tensor<256xi64, #blocked> loc(#loc102) + %tmp9 = arith.cmpi slt, %tmp6_35, %cst_1 : tensor<256xi64, #blocked> loc(#loc103) + %tmp10 = arith.select %tmp9, %tmp8_36, %tmp6_35 : tensor<256xi1, #blocked>, tensor<256xi64, #blocked> loc(#loc104) + %0 = arith.cmpi sge, %tmp10, %cst_1 : tensor<256xi64, #blocked> loc(#loc37) + %1 = arith.addi %ks3, %c127_i64 : i64 loc(#loc38) + %quot = arith.divsi %1, %c128_i64 : i64 loc(#loc114) + %remainder = arith.remsi %1, %c128_i64 : i64 loc(#loc115) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc116) + %fixed_37 = arith.subi %quot, %c1_i64 : i64 loc(#loc117) + %fixed_38 = arith.select %fixed, %fixed_37, %quot : i64 loc(#loc118) + %2 = arith.cmpi slt, %1, %c0_i64 : i64 loc(#loc110) + %3 = arith.select %2, %fixed_38, %quot : i64 loc(#loc111) + %4 = arith.addi %3, %c1_i64 : i64 loc(#loc47) + %5 = tt.splat %4 : i64 -> tensor<256xi64, #blocked> loc(#loc48) + %6 = arith.cmpi slt, %tmp10, %5 : tensor<256xi64, #blocked> loc(#loc48) + %7 = arith.andi %0, %6 : tensor<256xi1, #blocked> loc(#loc49) + %8 = arith.xori %xmask_5, %cst : tensor<256xi1, #blocked> loc(#loc50) + %9 = arith.ori %7, %8 : tensor<256xi1, #blocked> loc(#loc51) + tt.assert %9, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<256xi1, #blocked> loc(#loc52) + %10 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc53) + %11 = tt.addptr %10, %tmp0_28 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc53) + tt.store %11, %tmp1, %xmask_5 : tensor<256x!tt.ptr, #blocked> loc(#loc54) + %12 = arith.addi %tmp10, %x1 : tensor<256xi64, #blocked> loc(#loc55) + %13 = arith.muli %x0_6, %x1 : tensor<256xi64, #blocked> loc(#loc56) + %14 = arith.addi %12, %13 : tensor<256xi64, #blocked> loc(#loc57) + %15 = tt.splat %out_ptr1 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc58) + %16 = tt.addptr %15, %14 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc58) + tt.store %16, %cst_0, %xmask_5 : tensor<256x!tt.ptr, #blocked> loc(#loc59) + tt.return loc(#loc60) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc70 = loc("xoffset"(#loc2)) +#loc71 = loc("xoffset"(#loc3)) +#loc72 = loc("xindex"(#loc4)) +#loc73 = loc("xindex"(#loc5)) +#loc74 = loc("xmask"(#loc6)) +#loc75 = loc("x0"(#loc7)) +#loc76 = loc("x1"(#loc8)) +#loc77 = loc("x1"(#loc9)) +#loc78 = loc("x2"(#loc10)) +#loc79 = loc("tmp0"(#loc11)) +#loc80 = loc("tmp0"(#loc12)) +#loc81 = loc("tmp0"(#loc13)) +#loc82 = loc("tmp0"(#loc14)) +#loc83 = loc("tmp0"(#loc15)) +#loc84 = loc("tmp0"(#loc16)) +#loc85 = loc("tmp0"(#loc17)) +#loc86 = loc("tmp0"(#loc18)) +#loc87 = loc("tmp0"(#loc19)) +#loc88 = loc("tmp0"(#loc20)) +#loc89 = loc("tmp0"(#loc21)) +#loc90 = loc("tmp0"(#loc22)) +#loc91 = loc("tmp0"(#loc23)) +#loc92 = loc("tmp0"(#loc24)) +#loc93 = loc("tmp0"(#loc25)) +#loc94 = loc("tmp0"(#loc26)) +#loc95 = loc("tmp0"(#loc27)) +#loc96 = loc("tmp2"(#loc28)) +#loc97 = loc("tmp2"(#loc29)) +#loc98 = loc("tmp1"(#loc30)) +#loc99 = loc("tmp4"(#loc31)) +#loc100 = loc("tmp6"(#loc32)) +#loc101 = loc("tmp7"(#loc33)) +#loc102 = loc("tmp8"(#loc34)) +#loc103 = loc("tmp9"(#loc35)) +#loc104 = loc("tmp10"(#loc36)) +#loc105 = loc("quot"(#loc39)) +#loc106 = loc("remainder"(#loc41)) +#loc107 = loc("fixed"(#loc42)) +#loc108 = loc("fixed"(#loc43)) +#loc109 = loc("fixed"(#loc44)) +#loc110 = loc(callsite(#loc45 at #loc40)) +#loc111 = loc(callsite(#loc46 at #loc40)) +#loc112 = loc(fused[#loc82, #loc83]) +#loc113 = loc(fused[#loc90, #loc91]) +#loc114 = loc(callsite(#loc105 at #loc40)) +#loc115 = loc(callsite(#loc106 at #loc40)) +#loc116 = loc(callsite(#loc107 at #loc40)) +#loc117 = loc(callsite(#loc108 at #loc40)) +#loc118 = loc(callsite(#loc109 at #loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..66d0f5d4babdf27d9e691be809f64ba5c02f59c6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HNAESQAXSLY3RMS4OQGFBJNYMCFXKSDKHW7HOOXNI5LPG6DZ4FFQ/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir @@ -0,0 +1,208 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("ks0"(#loc)) +#loc66 = loc("ks1"(#loc)) +#loc67 = loc("ks2"(#loc)) +#loc68 = loc("ks3"(#loc)) +#loc69 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc70) + %c0_i64 = arith.constant 0 : i64 loc(#loc70) + %cst = arith.constant dense<0> : tensor<256xi64> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<256xi32> loc(#loc3) + %cst_1 = arith.constant dense : tensor<256xi1> loc(#loc4) + %c127_i64 = arith.constant 127 : i64 loc(#loc5) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xoffset_2 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc72) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc73) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc74) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc74) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc75) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc75) + %x0 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc76) + %x0_6 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc76) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<256xi64> loc(#loc76) + %x1 = arith.divsi %x0, %x0_6 : tensor<256xi64> loc(#loc77) + %x1_8 = tt.splat %ks1 : i64 -> tensor<256xi64> loc(#loc78) + %x1_9 = arith.remsi %x1, %x1_8 : tensor<256xi64> loc(#loc78) + %x2 = tt.splat %ks2 : i64 -> tensor<256xi64> loc(#loc79) + %x2_10 = arith.divsi %x0, %x2 : tensor<256xi64> loc(#loc79) + %tmp0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc80) + %tmp0_11 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc81) + %tmp0_12 = arith.extui %tmp0_11 : i1 to i64 loc(#loc82) + %tmp0_13 = arith.muli %ks0, %tmp0_12 : i64 loc(#loc82) + %tmp0_14 = arith.extui %tmp0 : i1 to i64 loc(#loc113) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_13 : i64 loc(#loc83) + %tmp0_16 = tt.splat %tmp0_15 : i64 -> tensor<256xi64> loc(#loc85) + %tmp0_17 = arith.muli %x1_9, %tmp0_16 : tensor<256xi64> loc(#loc85) + %tmp0_18 = arith.addi %x0_7, %tmp0_17 : tensor<256xi64> loc(#loc86) + %tmp0_19 = arith.muli %x2_10, %tmp0_16 : tensor<256xi64> loc(#loc87) + %tmp0_20 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc88) + %tmp0_21 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc89) + %tmp0_22 = arith.extui %tmp0_21 : i1 to i64 loc(#loc90) + %tmp0_23 = arith.muli %ks1, %tmp0_22 : i64 loc(#loc90) + %tmp0_24 = arith.extui %tmp0_20 : i1 to i64 loc(#loc114) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_23 : i64 loc(#loc91) + %tmp0_26 = tt.splat %tmp0_25 : i64 -> tensor<256xi64> loc(#loc93) + %tmp0_27 = arith.muli %tmp0_19, %tmp0_26 : tensor<256xi64> loc(#loc93) + %tmp0_28 = arith.addi %tmp0_18, %tmp0_27 : tensor<256xi64> loc(#loc94) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc95) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc95) + %tmp0_31 = tt.load %tmp0_30, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc96) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc97) + %tmp2_32 = tt.addptr %tmp2, %x1 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc97) + %tmp2_33 = tt.load %tmp2_32, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc98) + %tmp1 = arith.trunci %tmp0_31 : tensor<256xi64> to tensor<256xi32> loc(#loc99) + %tmp4 = arith.extsi %tmp2_33 : tensor<256xi32> to tensor<256xi64> loc(#loc100) + %tmp4_34 = arith.cmpi slt, %x0_7, %tmp4 : tensor<256xi64> loc(#loc100) + %tmp6 = arith.extsi %tmp1 : tensor<256xi32> to tensor<256xi64> loc(#loc101) + %tmp6_35 = arith.select %tmp4_34, %tmp6, %x0_6 : tensor<256xi1>, tensor<256xi64> loc(#loc101) + %tmp7 = arith.addi %ks0, %c1_i64 : i64 loc(#loc102) + %tmp8 = tt.splat %tmp7 : i64 -> tensor<256xi64> loc(#loc103) + %tmp8_36 = arith.addi %tmp6_35, %tmp8 : tensor<256xi64> loc(#loc103) + %tmp9 = arith.cmpi slt, %tmp6_35, %cst : tensor<256xi64> loc(#loc104) + %tmp10 = arith.select %tmp9, %tmp8_36, %tmp6_35 : tensor<256xi1>, tensor<256xi64> loc(#loc105) + %0 = arith.cmpi sge, %tmp10, %cst : tensor<256xi64> loc(#loc41) + %1 = arith.addi %ks3, %c127_i64 : i64 loc(#loc5) + %quot = arith.divsi %1, %c128_i64 : i64 loc(#loc115) + %remainder = arith.remsi %1, %c128_i64 : i64 loc(#loc116) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc117) + %fixed_37 = arith.subi %quot, %c1_i64 : i64 loc(#loc118) + %fixed_38 = arith.select %fixed, %fixed_37, %quot : i64 loc(#loc119) + %2 = arith.cmpi slt, %1, %c0_i64 : i64 loc(#loc111) + %3 = arith.select %2, %fixed_38, %quot : i64 loc(#loc112) + %4 = arith.addi %3, %c1_i64 : i64 loc(#loc49) + %5 = tt.splat %4 : i64 -> tensor<256xi64> loc(#loc50) + %6 = arith.cmpi slt, %tmp10, %5 : tensor<256xi64> loc(#loc50) + %7 = arith.andi %0, %6 : tensor<256xi1> loc(#loc51) + %8 = arith.xori %xmask_5, %cst_1 : tensor<256xi1> loc(#loc4) + %9 = arith.ori %7, %8 : tensor<256xi1> loc(#loc52) + tt.assert %9, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<256xi1> loc(#loc53) + %10 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc54) + %11 = tt.addptr %10, %tmp0_28 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc54) + tt.store %11, %tmp1, %xmask_5 : tensor<256x!tt.ptr> loc(#loc55) + %12 = arith.addi %tmp10, %x1 : tensor<256xi64> loc(#loc56) + %13 = arith.muli %x0_6, %x1 : tensor<256xi64> loc(#loc57) + %14 = arith.addi %12, %13 : tensor<256xi64> loc(#loc58) + %15 = tt.splat %out_ptr1 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc59) + %16 = tt.addptr %15, %14 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc59) + tt.store %16, %cst_0, %xmask_5 : tensor<256x!tt.ptr> loc(#loc3) + tt.return loc(#loc60) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc70 = loc(callsite(#loc1 at #loc2)) +#loc71 = loc("xoffset"(#loc6)) +#loc72 = loc("xoffset"(#loc7)) +#loc73 = loc("xindex"(#loc8)) +#loc74 = loc("xindex"(#loc9)) +#loc75 = loc("xmask"(#loc10)) +#loc76 = loc("x0"(#loc11)) +#loc77 = loc("x1"(#loc12)) +#loc78 = loc("x1"(#loc13)) +#loc79 = loc("x2"(#loc14)) +#loc80 = loc("tmp0"(#loc15)) +#loc81 = loc("tmp0"(#loc16)) +#loc82 = loc("tmp0"(#loc17)) +#loc83 = loc("tmp0"(#loc18)) +#loc84 = loc("tmp0"(#loc19)) +#loc85 = loc("tmp0"(#loc20)) +#loc86 = loc("tmp0"(#loc21)) +#loc87 = loc("tmp0"(#loc22)) +#loc88 = loc("tmp0"(#loc23)) +#loc89 = loc("tmp0"(#loc24)) +#loc90 = loc("tmp0"(#loc25)) +#loc91 = loc("tmp0"(#loc26)) +#loc92 = loc("tmp0"(#loc27)) +#loc93 = loc("tmp0"(#loc28)) +#loc94 = loc("tmp0"(#loc29)) +#loc95 = loc("tmp0"(#loc30)) +#loc96 = loc("tmp0"(#loc31)) +#loc97 = loc("tmp2"(#loc32)) +#loc98 = loc("tmp2"(#loc33)) +#loc99 = loc("tmp1"(#loc34)) +#loc100 = loc("tmp4"(#loc35)) +#loc101 = loc("tmp6"(#loc36)) +#loc102 = loc("tmp7"(#loc37)) +#loc103 = loc("tmp8"(#loc38)) +#loc104 = loc("tmp9"(#loc39)) +#loc105 = loc("tmp10"(#loc40)) +#loc106 = loc("quot"(#loc42)) +#loc107 = loc("remainder"(#loc43)) +#loc108 = loc("fixed"(#loc44)) +#loc109 = loc("fixed"(#loc45)) +#loc110 = loc("fixed"(#loc46)) +#loc111 = loc(callsite(#loc47 at #loc2)) +#loc112 = loc(callsite(#loc48 at #loc2)) +#loc113 = loc(fused[#loc83, #loc84]) +#loc114 = loc(fused[#loc91, #loc92]) +#loc115 = loc(callsite(#loc106 at #loc2)) +#loc116 = loc(callsite(#loc107 at #loc2)) +#loc117 = loc(callsite(#loc108 at #loc2)) +#loc118 = loc(callsite(#loc109 at #loc2)) +#loc119 = loc(callsite(#loc110 at #loc2)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/__grp__triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/__grp__triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d5de827b9f21c786fa8d971b847e94c33c895b6d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/__grp__triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..aecbfdddc7aeb7694ec9b7825ad3cef755c794b9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"hash": "3d168f557bd091f87a867ffa145187b919205aec7c76b1a52b338456b28533eb", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 164864, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..ec9328bed84bbeaf31309080fb0002019030f949 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.llir @@ -0,0 +1,12657 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_zeros_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, ptr addrspace(1) readnone captures(none) %18, ptr addrspace(1) readnone captures(none) %19) local_unnamed_addr #0 !dbg !5 { + %21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %22 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %23 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %24 = and i32 %22, 1, !dbg !11 + %25 = shl i32 %23, 18, !dbg !12 + %26 = shl nuw nsw i32 %24, 21, !dbg !13 + %27 = add i32 %26, %25, !dbg !14 + %28 = sext i32 %27 to i64, !dbg !15 + %29 = shl i32 %22, 21, !dbg !16 + %30 = add i32 %25, %29, !dbg !17 + %31 = sext i32 %30 to i64, !dbg !18 + %32 = getelementptr bfloat, ptr addrspace(1) %1, i64 %28, !dbg !19 + %33 = getelementptr bfloat, ptr addrspace(1) %2, i64 %28, !dbg !20 + %34 = getelementptr bfloat, ptr addrspace(1) %7, i64 %31, !dbg !21 + %35 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !22 + %36 = lshr i32 %35, 5, !dbg !22 + %37 = and i32 %35, 240, !dbg !22 + %38 = lshr exact i32 %37, 4, !dbg !22 + %39 = or disjoint i32 %38, 16, !dbg !22 + %40 = or disjoint i32 %38, 32, !dbg !22 + %41 = or disjoint i32 %38, 48, !dbg !22 + %42 = or disjoint i32 %38, 64, !dbg !22 + %43 = or disjoint i32 %38, 80, !dbg !22 + %44 = or disjoint i32 %38, 96, !dbg !22 + %45 = or disjoint i32 %38, 112, !dbg !22 + %46 = lshr i32 %35, 1, !dbg !22 + %47 = and i32 %46, 112, !dbg !22 + %48 = lshr i32 %35, 2, !dbg !22 + %49 = and i32 %48, 7, !dbg !22 + %50 = or disjoint i32 %47, %49, !dbg !22 + %51 = or disjoint i32 %50, 8, !dbg !22 + %52 = icmp samesign ugt i32 %21, 15, !dbg !23 + br i1 %52, label %53, label %4460, !dbg !24 + +53: ; preds = %20 + %54 = add nsw i32 %21, -16, !dbg !25 + %55 = lshr i32 %54, 4, !dbg !26 + %56 = shl nuw nsw i32 %23, 2, !dbg !27 + %57 = add nuw nsw i32 %55, %56, !dbg !28 + %58 = and i32 %21, 15, !dbg !29 + %59 = shl nuw nsw i32 %24, 4, !dbg !30 + %60 = or disjoint i32 %59, %58, !dbg !31 + %61 = shl nuw nsw i32 %24, 8, !dbg !32 + %62 = shl nuw nsw i32 %58, 4, !dbg !33 + %63 = or disjoint i32 %61, %62, !dbg !34 + %64 = shl i32 %57, 7, !dbg !35 + %65 = shl i32 %22, 23, !dbg !36 + %66 = add i32 %64, %65, !dbg !37 + %67 = sext i32 %66 to i64, !dbg !38 + %68 = shl i32 %57, 18, !dbg !39 + %69 = add i32 %68, %65, !dbg !40 + %70 = sext i32 %69 to i64, !dbg !41 + %71 = shl nuw i32 %22, 16, !dbg !42 + %72 = shl i32 %57, 11, !dbg !42 + %73 = add i32 %72, %71, !dbg !42 + %74 = sext i32 %73 to i64, !dbg !43 + %75 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !44 + %76 = getelementptr bfloat, ptr addrspace(1) %5, i64 %70, !dbg !45 + %77 = getelementptr bfloat, ptr addrspace(1) %6, i64 %67, !dbg !46 + %78 = getelementptr float, ptr addrspace(1) %3, i64 %74, !dbg !47 + %79 = getelementptr float, ptr addrspace(1) %4, i64 %74, !dbg !48 + %80 = shl nuw nsw i32 %58, 7, !dbg !49 + %81 = or disjoint i32 %80, %38, !dbg !50 + %82 = or disjoint i32 %39, %80, !dbg !50 + %83 = or disjoint i32 %40, %80, !dbg !50 + %84 = or disjoint i32 %41, %80, !dbg !50 + %85 = or disjoint i32 %42, %80, !dbg !50 + %86 = or disjoint i32 %43, %80, !dbg !50 + %87 = or disjoint i32 %44, %80, !dbg !50 + %88 = or disjoint i32 %45, %80, !dbg !50 + %89 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !50 + %90 = insertelement <2 x i32> %89, i32 %50, i64 1, !dbg !50 + %91 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !50 + %92 = shufflevector <2 x i32> %90, <2 x i32> poison, <32 x i32> , !dbg !50 + %93 = shufflevector <2 x i32> %91, <2 x i32> poison, <32 x i32> zeroinitializer, !dbg !50 + %94 = or disjoint <32 x i32> %92, %93, !dbg !50 + %95 = shl nuw nsw i32 %81, 12, !dbg !51 + %96 = shl nuw nsw i32 %82, 12, !dbg !51 + %97 = shl nuw nsw i32 %83, 12, !dbg !51 + %98 = shl nuw nsw i32 %84, 12, !dbg !51 + %99 = shl nuw nsw i32 %85, 12, !dbg !51 + %100 = shl nuw nsw i32 %86, 12, !dbg !51 + %101 = shl nuw nsw i32 %87, 12, !dbg !51 + %102 = shl nuw nsw i32 %88, 12, !dbg !51 + %103 = zext nneg i32 %95 to i64, !dbg !54 + %104 = getelementptr bfloat, ptr addrspace(1) %75, i64 %103, !dbg !54 + %105 = zext nneg i32 %96 to i64, !dbg !54 + %106 = getelementptr bfloat, ptr addrspace(1) %75, i64 %105, !dbg !54 + %107 = zext nneg i32 %97 to i64, !dbg !54 + %108 = getelementptr bfloat, ptr addrspace(1) %75, i64 %107, !dbg !54 + %109 = zext nneg i32 %98 to i64, !dbg !54 + %110 = getelementptr bfloat, ptr addrspace(1) %75, i64 %109, !dbg !54 + %111 = zext nneg i32 %99 to i64, !dbg !54 + %112 = getelementptr bfloat, ptr addrspace(1) %75, i64 %111, !dbg !54 + %113 = zext nneg i32 %100 to i64, !dbg !54 + %114 = getelementptr bfloat, ptr addrspace(1) %75, i64 %113, !dbg !54 + %115 = zext nneg i32 %101 to i64, !dbg !54 + %116 = getelementptr bfloat, ptr addrspace(1) %75, i64 %115, !dbg !54 + %117 = zext nneg i32 %102 to i64, !dbg !54 + %118 = getelementptr bfloat, ptr addrspace(1) %75, i64 %117, !dbg !54 + %119 = shl nuw nsw i32 %35, 3, !dbg !55 + %120 = and i32 %119, 120, !dbg !55 + %121 = zext nneg i32 %120 to i64, !dbg !56 + %122 = getelementptr bfloat, ptr addrspace(1) %104, i64 %121, !dbg !56 + %123 = getelementptr bfloat, ptr addrspace(1) %106, i64 %121, !dbg !56 + %124 = getelementptr bfloat, ptr addrspace(1) %108, i64 %121, !dbg !56 + %125 = getelementptr bfloat, ptr addrspace(1) %110, i64 %121, !dbg !56 + %126 = getelementptr bfloat, ptr addrspace(1) %112, i64 %121, !dbg !56 + %127 = getelementptr bfloat, ptr addrspace(1) %114, i64 %121, !dbg !56 + %128 = getelementptr bfloat, ptr addrspace(1) %116, i64 %121, !dbg !56 + %129 = getelementptr bfloat, ptr addrspace(1) %118, i64 %121, !dbg !56 + %130 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %122) #3, !dbg !57 + %131 = extractvalue { i32, i32, i32, i32 } %130, 0, !dbg !57 + %132 = extractvalue { i32, i32, i32, i32 } %130, 1, !dbg !57 + %133 = extractvalue { i32, i32, i32, i32 } %130, 2, !dbg !57 + %134 = extractvalue { i32, i32, i32, i32 } %130, 3, !dbg !57 + %135 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %123) #3, !dbg !57 + %136 = extractvalue { i32, i32, i32, i32 } %135, 0, !dbg !57 + %137 = extractvalue { i32, i32, i32, i32 } %135, 1, !dbg !57 + %138 = extractvalue { i32, i32, i32, i32 } %135, 2, !dbg !57 + %139 = extractvalue { i32, i32, i32, i32 } %135, 3, !dbg !57 + %140 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %124) #3, !dbg !57 + %141 = extractvalue { i32, i32, i32, i32 } %140, 0, !dbg !57 + %142 = extractvalue { i32, i32, i32, i32 } %140, 1, !dbg !57 + %143 = extractvalue { i32, i32, i32, i32 } %140, 2, !dbg !57 + %144 = extractvalue { i32, i32, i32, i32 } %140, 3, !dbg !57 + %145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %125) #3, !dbg !57 + %146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !57 + %147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !57 + %148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !57 + %149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !57 + %150 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %126) #3, !dbg !57 + %151 = extractvalue { i32, i32, i32, i32 } %150, 0, !dbg !57 + %152 = extractvalue { i32, i32, i32, i32 } %150, 1, !dbg !57 + %153 = extractvalue { i32, i32, i32, i32 } %150, 2, !dbg !57 + %154 = extractvalue { i32, i32, i32, i32 } %150, 3, !dbg !57 + %155 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %127) #3, !dbg !57 + %156 = extractvalue { i32, i32, i32, i32 } %155, 0, !dbg !57 + %157 = extractvalue { i32, i32, i32, i32 } %155, 1, !dbg !57 + %158 = extractvalue { i32, i32, i32, i32 } %155, 2, !dbg !57 + %159 = extractvalue { i32, i32, i32, i32 } %155, 3, !dbg !57 + %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %128) #3, !dbg !57 + %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !57 + %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !57 + %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !57 + %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !57 + %165 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %129) #3, !dbg !57 + %166 = extractvalue { i32, i32, i32, i32 } %165, 0, !dbg !57 + %167 = extractvalue { i32, i32, i32, i32 } %165, 1, !dbg !57 + %168 = extractvalue { i32, i32, i32, i32 } %165, 2, !dbg !57 + %169 = extractvalue { i32, i32, i32, i32 } %165, 3, !dbg !57 + %170 = shl nuw nsw i32 %35, 4, !dbg !57 + %171 = and i32 %170, 112, !dbg !57 + %172 = shl nuw nsw i32 %37, 3, !dbg !57 + %173 = and i32 %35, 112, !dbg !57 + %174 = and i32 %35, 8, !dbg !57 + %175 = shl nuw nsw i32 %174, 11, !dbg !57 + %176 = or disjoint i32 %171, %172, !dbg !57 + %177 = xor i32 %176, %173, !dbg !57 + %178 = or disjoint i32 %177, %175, !dbg !57 + %179 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %178, !dbg !57 + %180 = insertelement <4 x i32> poison, i32 %131, i64 0, !dbg !57 + %181 = insertelement <4 x i32> %180, i32 %132, i64 1, !dbg !57 + %182 = insertelement <4 x i32> %181, i32 %133, i64 2, !dbg !57 + %183 = insertelement <4 x i32> %182, i32 %134, i64 3, !dbg !57 + store <4 x i32> %183, ptr addrspace(3) %179, align 16, !dbg !57 + %184 = or disjoint i32 %178, 2048, !dbg !57 + %185 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %184, !dbg !57 + %186 = insertelement <4 x i32> poison, i32 %136, i64 0, !dbg !57 + %187 = insertelement <4 x i32> %186, i32 %137, i64 1, !dbg !57 + %188 = insertelement <4 x i32> %187, i32 %138, i64 2, !dbg !57 + %189 = insertelement <4 x i32> %188, i32 %139, i64 3, !dbg !57 + store <4 x i32> %189, ptr addrspace(3) %185, align 16, !dbg !57 + %190 = or disjoint i32 %178, 4096, !dbg !57 + %191 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %190, !dbg !57 + %192 = insertelement <4 x i32> poison, i32 %141, i64 0, !dbg !57 + %193 = insertelement <4 x i32> %192, i32 %142, i64 1, !dbg !57 + %194 = insertelement <4 x i32> %193, i32 %143, i64 2, !dbg !57 + %195 = insertelement <4 x i32> %194, i32 %144, i64 3, !dbg !57 + store <4 x i32> %195, ptr addrspace(3) %191, align 16, !dbg !57 + %196 = or disjoint i32 %178, 6144, !dbg !57 + %197 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %196, !dbg !57 + %198 = insertelement <4 x i32> poison, i32 %146, i64 0, !dbg !57 + %199 = insertelement <4 x i32> %198, i32 %147, i64 1, !dbg !57 + %200 = insertelement <4 x i32> %199, i32 %148, i64 2, !dbg !57 + %201 = insertelement <4 x i32> %200, i32 %149, i64 3, !dbg !57 + store <4 x i32> %201, ptr addrspace(3) %197, align 16, !dbg !57 + %202 = or disjoint i32 %178, 8192, !dbg !57 + %203 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %202, !dbg !57 + %204 = insertelement <4 x i32> poison, i32 %151, i64 0, !dbg !57 + %205 = insertelement <4 x i32> %204, i32 %152, i64 1, !dbg !57 + %206 = insertelement <4 x i32> %205, i32 %153, i64 2, !dbg !57 + %207 = insertelement <4 x i32> %206, i32 %154, i64 3, !dbg !57 + store <4 x i32> %207, ptr addrspace(3) %203, align 16, !dbg !57 + %208 = or disjoint i32 %178, 10240, !dbg !57 + %209 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %208, !dbg !57 + %210 = insertelement <4 x i32> poison, i32 %156, i64 0, !dbg !57 + %211 = insertelement <4 x i32> %210, i32 %157, i64 1, !dbg !57 + %212 = insertelement <4 x i32> %211, i32 %158, i64 2, !dbg !57 + %213 = insertelement <4 x i32> %212, i32 %159, i64 3, !dbg !57 + store <4 x i32> %213, ptr addrspace(3) %209, align 16, !dbg !57 + %214 = or disjoint i32 %178, 12288, !dbg !57 + %215 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %214, !dbg !57 + %216 = insertelement <4 x i32> poison, i32 %161, i64 0, !dbg !57 + %217 = insertelement <4 x i32> %216, i32 %162, i64 1, !dbg !57 + %218 = insertelement <4 x i32> %217, i32 %163, i64 2, !dbg !57 + %219 = insertelement <4 x i32> %218, i32 %164, i64 3, !dbg !57 + store <4 x i32> %219, ptr addrspace(3) %215, align 16, !dbg !57 + %220 = or disjoint i32 %178, 14336, !dbg !57 + %221 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %220, !dbg !57 + %222 = insertelement <4 x i32> poison, i32 %166, i64 0, !dbg !57 + %223 = insertelement <4 x i32> %222, i32 %167, i64 1, !dbg !57 + %224 = insertelement <4 x i32> %223, i32 %168, i64 2, !dbg !57 + %225 = insertelement <4 x i32> %224, i32 %169, i64 3, !dbg !57 + store <4 x i32> %225, ptr addrspace(3) %221, align 16, !dbg !57 + %226 = shl nuw nsw i32 %81, 7, !dbg !58 + %227 = shl nuw nsw i32 %82, 7, !dbg !58 + %228 = shl nuw nsw i32 %83, 7, !dbg !58 + %229 = shl nuw nsw i32 %84, 7, !dbg !58 + %230 = shl nuw nsw i32 %85, 7, !dbg !58 + %231 = shl nuw nsw i32 %86, 7, !dbg !58 + %232 = shl nuw nsw i32 %87, 7, !dbg !58 + %233 = shl nuw nsw i32 %88, 7, !dbg !58 + %234 = zext nneg i32 %226 to i64, !dbg !60 + %235 = getelementptr bfloat, ptr addrspace(1) %76, i64 %234, !dbg !60 + %236 = zext nneg i32 %227 to i64, !dbg !60 + %237 = getelementptr bfloat, ptr addrspace(1) %76, i64 %236, !dbg !60 + %238 = zext nneg i32 %228 to i64, !dbg !60 + %239 = getelementptr bfloat, ptr addrspace(1) %76, i64 %238, !dbg !60 + %240 = zext nneg i32 %229 to i64, !dbg !60 + %241 = getelementptr bfloat, ptr addrspace(1) %76, i64 %240, !dbg !60 + %242 = zext nneg i32 %230 to i64, !dbg !60 + %243 = getelementptr bfloat, ptr addrspace(1) %76, i64 %242, !dbg !60 + %244 = zext nneg i32 %231 to i64, !dbg !60 + %245 = getelementptr bfloat, ptr addrspace(1) %76, i64 %244, !dbg !60 + %246 = zext nneg i32 %232 to i64, !dbg !60 + %247 = getelementptr bfloat, ptr addrspace(1) %76, i64 %246, !dbg !60 + %248 = zext nneg i32 %233 to i64, !dbg !60 + %249 = getelementptr bfloat, ptr addrspace(1) %76, i64 %248, !dbg !60 + %250 = getelementptr bfloat, ptr addrspace(1) %235, i64 %121, !dbg !61 + %251 = getelementptr bfloat, ptr addrspace(1) %237, i64 %121, !dbg !61 + %252 = getelementptr bfloat, ptr addrspace(1) %239, i64 %121, !dbg !61 + %253 = getelementptr bfloat, ptr addrspace(1) %241, i64 %121, !dbg !61 + %254 = getelementptr bfloat, ptr addrspace(1) %243, i64 %121, !dbg !61 + %255 = getelementptr bfloat, ptr addrspace(1) %245, i64 %121, !dbg !61 + %256 = getelementptr bfloat, ptr addrspace(1) %247, i64 %121, !dbg !61 + %257 = getelementptr bfloat, ptr addrspace(1) %249, i64 %121, !dbg !61 + %258 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %250) #3, !dbg !62 + %259 = extractvalue { i32, i32, i32, i32 } %258, 0, !dbg !62 + %260 = extractvalue { i32, i32, i32, i32 } %258, 1, !dbg !62 + %261 = extractvalue { i32, i32, i32, i32 } %258, 2, !dbg !62 + %262 = extractvalue { i32, i32, i32, i32 } %258, 3, !dbg !62 + %263 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %251) #3, !dbg !62 + %264 = extractvalue { i32, i32, i32, i32 } %263, 0, !dbg !62 + %265 = extractvalue { i32, i32, i32, i32 } %263, 1, !dbg !62 + %266 = extractvalue { i32, i32, i32, i32 } %263, 2, !dbg !62 + %267 = extractvalue { i32, i32, i32, i32 } %263, 3, !dbg !62 + %268 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %252) #3, !dbg !62 + %269 = extractvalue { i32, i32, i32, i32 } %268, 0, !dbg !62 + %270 = extractvalue { i32, i32, i32, i32 } %268, 1, !dbg !62 + %271 = extractvalue { i32, i32, i32, i32 } %268, 2, !dbg !62 + %272 = extractvalue { i32, i32, i32, i32 } %268, 3, !dbg !62 + %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %253) #3, !dbg !62 + %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !62 + %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !62 + %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !62 + %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !62 + %278 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %254) #3, !dbg !62 + %279 = extractvalue { i32, i32, i32, i32 } %278, 0, !dbg !62 + %280 = extractvalue { i32, i32, i32, i32 } %278, 1, !dbg !62 + %281 = extractvalue { i32, i32, i32, i32 } %278, 2, !dbg !62 + %282 = extractvalue { i32, i32, i32, i32 } %278, 3, !dbg !62 + %283 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %255) #3, !dbg !62 + %284 = extractvalue { i32, i32, i32, i32 } %283, 0, !dbg !62 + %285 = extractvalue { i32, i32, i32, i32 } %283, 1, !dbg !62 + %286 = extractvalue { i32, i32, i32, i32 } %283, 2, !dbg !62 + %287 = extractvalue { i32, i32, i32, i32 } %283, 3, !dbg !62 + %288 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %256) #3, !dbg !62 + %289 = extractvalue { i32, i32, i32, i32 } %288, 0, !dbg !62 + %290 = extractvalue { i32, i32, i32, i32 } %288, 1, !dbg !62 + %291 = extractvalue { i32, i32, i32, i32 } %288, 2, !dbg !62 + %292 = extractvalue { i32, i32, i32, i32 } %288, 3, !dbg !62 + %293 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %257) #3, !dbg !62 + %294 = extractvalue { i32, i32, i32, i32 } %293, 0, !dbg !62 + %295 = extractvalue { i32, i32, i32, i32 } %293, 1, !dbg !62 + %296 = extractvalue { i32, i32, i32, i32 } %293, 2, !dbg !62 + %297 = extractvalue { i32, i32, i32, i32 } %293, 3, !dbg !62 + %298 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %178, !dbg !62 + %299 = insertelement <4 x i32> poison, i32 %259, i64 0, !dbg !62 + %300 = insertelement <4 x i32> %299, i32 %260, i64 1, !dbg !62 + %301 = insertelement <4 x i32> %300, i32 %261, i64 2, !dbg !62 + %302 = insertelement <4 x i32> %301, i32 %262, i64 3, !dbg !62 + store <4 x i32> %302, ptr addrspace(3) %298, align 16, !dbg !62 + %303 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %184, !dbg !62 + %304 = insertelement <4 x i32> poison, i32 %264, i64 0, !dbg !62 + %305 = insertelement <4 x i32> %304, i32 %265, i64 1, !dbg !62 + %306 = insertelement <4 x i32> %305, i32 %266, i64 2, !dbg !62 + %307 = insertelement <4 x i32> %306, i32 %267, i64 3, !dbg !62 + store <4 x i32> %307, ptr addrspace(3) %303, align 16, !dbg !62 + %308 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %190, !dbg !62 + %309 = insertelement <4 x i32> poison, i32 %269, i64 0, !dbg !62 + %310 = insertelement <4 x i32> %309, i32 %270, i64 1, !dbg !62 + %311 = insertelement <4 x i32> %310, i32 %271, i64 2, !dbg !62 + %312 = insertelement <4 x i32> %311, i32 %272, i64 3, !dbg !62 + store <4 x i32> %312, ptr addrspace(3) %308, align 16, !dbg !62 + %313 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %196, !dbg !62 + %314 = insertelement <4 x i32> poison, i32 %274, i64 0, !dbg !62 + %315 = insertelement <4 x i32> %314, i32 %275, i64 1, !dbg !62 + %316 = insertelement <4 x i32> %315, i32 %276, i64 2, !dbg !62 + %317 = insertelement <4 x i32> %316, i32 %277, i64 3, !dbg !62 + store <4 x i32> %317, ptr addrspace(3) %313, align 16, !dbg !62 + %318 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %202, !dbg !62 + %319 = insertelement <4 x i32> poison, i32 %279, i64 0, !dbg !62 + %320 = insertelement <4 x i32> %319, i32 %280, i64 1, !dbg !62 + %321 = insertelement <4 x i32> %320, i32 %281, i64 2, !dbg !62 + %322 = insertelement <4 x i32> %321, i32 %282, i64 3, !dbg !62 + store <4 x i32> %322, ptr addrspace(3) %318, align 16, !dbg !62 + %323 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %208, !dbg !62 + %324 = insertelement <4 x i32> poison, i32 %284, i64 0, !dbg !62 + %325 = insertelement <4 x i32> %324, i32 %285, i64 1, !dbg !62 + %326 = insertelement <4 x i32> %325, i32 %286, i64 2, !dbg !62 + %327 = insertelement <4 x i32> %326, i32 %287, i64 3, !dbg !62 + store <4 x i32> %327, ptr addrspace(3) %323, align 16, !dbg !62 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %214, !dbg !62 + %329 = insertelement <4 x i32> poison, i32 %289, i64 0, !dbg !62 + %330 = insertelement <4 x i32> %329, i32 %290, i64 1, !dbg !62 + %331 = insertelement <4 x i32> %330, i32 %291, i64 2, !dbg !62 + %332 = insertelement <4 x i32> %331, i32 %292, i64 3, !dbg !62 + store <4 x i32> %332, ptr addrspace(3) %328, align 16, !dbg !62 + %333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %220, !dbg !62 + %334 = insertelement <4 x i32> poison, i32 %294, i64 0, !dbg !62 + %335 = insertelement <4 x i32> %334, i32 %295, i64 1, !dbg !62 + %336 = insertelement <4 x i32> %335, i32 %296, i64 2, !dbg !62 + %337 = insertelement <4 x i32> %336, i32 %297, i64 3, !dbg !62 + store <4 x i32> %337, ptr addrspace(3) %333, align 16, !dbg !62 + %338 = extractelement <32 x i32> %94, i64 2, !dbg !63 + %339 = zext nneg i32 %338 to i64, !dbg !63 + %340 = getelementptr float, ptr addrspace(1) %79, i64 %339, !dbg !63 + %341 = extractelement <32 x i32> %94, i64 0, !dbg !63 + %342 = zext nneg i32 %341 to i64, !dbg !63 + %343 = getelementptr float, ptr addrspace(1) %79, i64 %342, !dbg !63 + %344 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %340) #3, !dbg !64 + %345 = bitcast i32 %344 to float, !dbg !64 + %346 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %343) #3, !dbg !64 + %347 = bitcast i32 %346 to float, !dbg !64 + %348 = getelementptr float, ptr addrspace(1) %78, i64 %339, !dbg !65 + %349 = getelementptr float, ptr addrspace(1) %78, i64 %342, !dbg !65 + %350 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %348) #3, !dbg !66 + %351 = bitcast i32 %350 to float, !dbg !66 + %352 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %349) #3, !dbg !66 + %353 = bitcast i32 %352 to float, !dbg !66 + %354 = fcmp oeq float %351, 0xFFF0000000000000, !dbg !67 + %355 = fcmp oeq float %353, 0xFFF0000000000000, !dbg !67 + %356 = select i1 %354, float 0.000000e+00, float %351, !dbg !68 + %357 = select i1 %355, float 0.000000e+00, float %353, !dbg !68 + %358 = zext nneg i32 %63 to i64, !dbg !69 + %359 = getelementptr i32, ptr addrspace(1) %9, i64 %358, !dbg !69 + %360 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %359) #3, !dbg !70 + %361 = shl i32 %360, 7, !dbg !71 + %362 = zext nneg i32 %60 to i64, !dbg !72 + %363 = getelementptr i32, ptr addrspace(1) %8, i64 %362, !dbg !72 + %364 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %363) #3, !dbg !73 + %365 = and i32 %35, 3, !dbg !74 + %366 = or disjoint i32 %361, %38, !dbg !75 + %367 = or disjoint i32 %361, %39, !dbg !75 + %368 = or disjoint i32 %361, %40, !dbg !75 + %369 = or disjoint i32 %361, %41, !dbg !75 + %370 = shl i32 %366, 7, !dbg !76 + %371 = shl i32 %367, 7, !dbg !76 + %372 = shl i32 %368, 7, !dbg !76 + %373 = shl i32 %369, 7, !dbg !76 + %374 = sext i32 %370 to i64, !dbg !78 + %375 = getelementptr bfloat, ptr addrspace(1) %32, i64 %374, !dbg !78 + %376 = sext i32 %371 to i64, !dbg !78 + %377 = getelementptr bfloat, ptr addrspace(1) %32, i64 %376, !dbg !78 + %378 = sext i32 %372 to i64, !dbg !78 + %379 = getelementptr bfloat, ptr addrspace(1) %32, i64 %378, !dbg !78 + %380 = sext i32 %373 to i64, !dbg !78 + %381 = getelementptr bfloat, ptr addrspace(1) %32, i64 %380, !dbg !78 + %382 = getelementptr bfloat, ptr addrspace(1) %375, i64 %121, !dbg !79 + %383 = getelementptr bfloat, ptr addrspace(1) %377, i64 %121, !dbg !79 + %384 = getelementptr bfloat, ptr addrspace(1) %379, i64 %121, !dbg !79 + %385 = getelementptr bfloat, ptr addrspace(1) %381, i64 %121, !dbg !79 + %386 = getelementptr bfloat, ptr addrspace(1) %33, i64 %374, !dbg !80 + %387 = getelementptr bfloat, ptr addrspace(1) %33, i64 %376, !dbg !80 + %388 = getelementptr bfloat, ptr addrspace(1) %33, i64 %378, !dbg !80 + %389 = getelementptr bfloat, ptr addrspace(1) %33, i64 %380, !dbg !80 + %390 = getelementptr bfloat, ptr addrspace(1) %386, i64 %121, !dbg !81 + %391 = getelementptr bfloat, ptr addrspace(1) %387, i64 %121, !dbg !81 + %392 = getelementptr bfloat, ptr addrspace(1) %388, i64 %121, !dbg !81 + %393 = getelementptr bfloat, ptr addrspace(1) %389, i64 %121, !dbg !81 + %394 = shl i32 %364, 1, !dbg !82 + %395 = zext nneg i32 %22 to i64, !dbg !83 + %396 = getelementptr i64, ptr addrspace(1) %16, i64 %395, !dbg !83 + %397 = icmp sgt i32 %394, 0, !dbg !84 + %398 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %396, i1 %397) #3, !dbg !85 + %399 = icmp sgt i64 %398, %339, !dbg !86 + %400 = icmp sgt i64 %398, %342, !dbg !86 + %401 = shl nuw nsw i32 %174, 10, !dbg !87 + %402 = or disjoint i32 %177, %401, !dbg !87 + %403 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %402, !dbg !87 + %404 = select i1 %397, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %403, ptr addrspace(1) %382, i32 %404) #3, !dbg !87 + %405 = or disjoint i32 %402, 2048, !dbg !87 + %406 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %406, ptr addrspace(1) %383, i32 %404) #3, !dbg !87 + %407 = or disjoint i32 %402, 4096, !dbg !87 + %408 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %408, ptr addrspace(1) %384, i32 %404) #3, !dbg !87 + %409 = or disjoint i32 %402, 6144, !dbg !87 + %410 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %410, ptr addrspace(1) %385, i32 %404) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %411 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %411, ptr addrspace(1) %390, i32 %404) #3, !dbg !87 + %412 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %412, ptr addrspace(1) %391, i32 %404) #3, !dbg !87 + %413 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %413, ptr addrspace(1) %392, i32 %404) #3, !dbg !87 + %414 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %414, ptr addrspace(1) %393, i32 %404) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %415 = icmp sgt i32 %394, 1, !dbg !84 + %416 = getelementptr i8, ptr addrspace(1) %382, i64 16384, !dbg !88 + %417 = getelementptr i8, ptr addrspace(1) %383, i64 16384, !dbg !88 + %418 = getelementptr i8, ptr addrspace(1) %384, i64 16384, !dbg !88 + %419 = getelementptr i8, ptr addrspace(1) %385, i64 16384, !dbg !88 + %420 = getelementptr i8, ptr addrspace(1) %390, i64 16384, !dbg !89 + %421 = getelementptr i8, ptr addrspace(1) %391, i64 16384, !dbg !89 + %422 = getelementptr i8, ptr addrspace(1) %392, i64 16384, !dbg !89 + %423 = getelementptr i8, ptr addrspace(1) %393, i64 16384, !dbg !89 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %424 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %402, !dbg !87 + %425 = select i1 %415, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %424, ptr addrspace(1) %416, i32 %425) #3, !dbg !87 + %426 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %417, i32 %425) #3, !dbg !87 + %427 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %427, ptr addrspace(1) %418, i32 %425) #3, !dbg !87 + %428 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %428, ptr addrspace(1) %419, i32 %425) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %429 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %429, ptr addrspace(1) %420, i32 %425) #3, !dbg !87 + %430 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %430, ptr addrspace(1) %421, i32 %425) #3, !dbg !87 + %431 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %431, ptr addrspace(1) %422, i32 %425) #3, !dbg !87 + %432 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %423, i32 %425) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !90 + br i1 %397, label %.lr.ph1846, label %._crit_edge1847, !dbg !84 + +.lr.ph1846: ; preds = %53 + %433 = tail call i32 @llvm.umin.i32(i32 %394, i32 32), !dbg !91 + %434 = shl nuw nsw i32 %365, 1, !dbg !74 + %435 = or disjoint i32 %361, %434, !dbg !74 + %436 = insertelement <8 x i32> poison, i32 %435, i64 0, !dbg !75 + %437 = add nsw i32 %433, -2 + %438 = add nsw i32 %433, -1 + %439 = shufflevector <8 x i32> %436, <8 x i32> poison, <16 x i32> + %440 = insertelement <16 x i32> %439, i32 %435, i64 8 + %441 = insertelement <16 x i32> %440, i32 %435, i64 9 + %442 = insertelement <16 x i32> %441, i32 %435, i64 10 + %443 = insertelement <16 x i32> %442, i32 %435, i64 11 + %444 = insertelement <16 x i32> %443, i32 %435, i64 12 + %445 = insertelement <16 x i32> %444, i32 %435, i64 13 + %446 = insertelement <16 x i32> %445, i32 %435, i64 14 + %447 = or disjoint <16 x i32> %446, + %448 = insertelement <16 x i32> %447, i32 %435, i64 15 + %449 = insertelement <8 x i64> poison, i64 %398, i64 0, !dbg !92 + %450 = shufflevector <8 x i64> %449, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !92 + br label %451, !dbg !84 + +451: ; preds = %.lr.ph1846, %__nv_exp2f.exit1507 + %452 = phi i32 [ 64, %.lr.ph1846 ], [ %2289, %__nv_exp2f.exit1507 ] + %453 = phi i32 [ -1, %.lr.ph1846 ], [ %526, %__nv_exp2f.exit1507 ] + %454 = phi i32 [ 1, %.lr.ph1846 ], [ %2302, %__nv_exp2f.exit1507 ] + %.pn9181828 = phi ptr addrspace(1) [ %423, %.lr.ph1846 ], [ %2299, %__nv_exp2f.exit1507 ] + %.pn9341827 = phi ptr addrspace(1) [ %422, %.lr.ph1846 ], [ %2298, %__nv_exp2f.exit1507 ] + %.pn9501826 = phi ptr addrspace(1) [ %421, %.lr.ph1846 ], [ %2297, %__nv_exp2f.exit1507 ] + %.pn9661825 = phi ptr addrspace(1) [ %420, %.lr.ph1846 ], [ %2296, %__nv_exp2f.exit1507 ] + %.pn8541824 = phi ptr addrspace(1) [ %419, %.lr.ph1846 ], [ %2295, %__nv_exp2f.exit1507 ] + %.pn8701823 = phi ptr addrspace(1) [ %418, %.lr.ph1846 ], [ %2294, %__nv_exp2f.exit1507 ] + %.pn8861822 = phi ptr addrspace(1) [ %417, %.lr.ph1846 ], [ %2293, %__nv_exp2f.exit1507 ] + %.pn9021821 = phi ptr addrspace(1) [ %416, %.lr.ph1846 ], [ %2292, %__nv_exp2f.exit1507 ] + %455 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2203, %__nv_exp2f.exit1507 ] + %456 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2204, %__nv_exp2f.exit1507 ] + %457 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2205, %__nv_exp2f.exit1507 ] + %458 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2206, %__nv_exp2f.exit1507 ] + %459 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2207, %__nv_exp2f.exit1507 ] + %460 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2208, %__nv_exp2f.exit1507 ] + %461 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2209, %__nv_exp2f.exit1507 ] + %462 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2210, %__nv_exp2f.exit1507 ] + %463 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2211, %__nv_exp2f.exit1507 ] + %464 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2212, %__nv_exp2f.exit1507 ] + %465 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2213, %__nv_exp2f.exit1507 ] + %466 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2214, %__nv_exp2f.exit1507 ] + %467 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2215, %__nv_exp2f.exit1507 ] + %468 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2216, %__nv_exp2f.exit1507 ] + %469 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2217, %__nv_exp2f.exit1507 ] + %470 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2218, %__nv_exp2f.exit1507 ] + %471 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2219, %__nv_exp2f.exit1507 ] + %472 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2220, %__nv_exp2f.exit1507 ] + %473 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2221, %__nv_exp2f.exit1507 ] + %474 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2222, %__nv_exp2f.exit1507 ] + %475 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2223, %__nv_exp2f.exit1507 ] + %476 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2224, %__nv_exp2f.exit1507 ] + %477 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2225, %__nv_exp2f.exit1507 ] + %478 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2226, %__nv_exp2f.exit1507 ] + %479 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2227, %__nv_exp2f.exit1507 ] + %480 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2228, %__nv_exp2f.exit1507 ] + %481 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2229, %__nv_exp2f.exit1507 ] + %482 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2230, %__nv_exp2f.exit1507 ] + %483 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2231, %__nv_exp2f.exit1507 ] + %484 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2232, %__nv_exp2f.exit1507 ] + %485 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2233, %__nv_exp2f.exit1507 ] + %486 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2234, %__nv_exp2f.exit1507 ] + %487 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2235, %__nv_exp2f.exit1507 ] + %488 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2236, %__nv_exp2f.exit1507 ] + %489 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2237, %__nv_exp2f.exit1507 ] + %490 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2238, %__nv_exp2f.exit1507 ] + %491 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2239, %__nv_exp2f.exit1507 ] + %492 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2240, %__nv_exp2f.exit1507 ] + %493 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2241, %__nv_exp2f.exit1507 ] + %494 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2242, %__nv_exp2f.exit1507 ] + %495 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2243, %__nv_exp2f.exit1507 ] + %496 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2244, %__nv_exp2f.exit1507 ] + %497 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2245, %__nv_exp2f.exit1507 ] + %498 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2246, %__nv_exp2f.exit1507 ] + %499 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2247, %__nv_exp2f.exit1507 ] + %500 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2248, %__nv_exp2f.exit1507 ] + %501 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2249, %__nv_exp2f.exit1507 ] + %502 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2250, %__nv_exp2f.exit1507 ] + %503 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2251, %__nv_exp2f.exit1507 ] + %504 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2252, %__nv_exp2f.exit1507 ] + %505 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2253, %__nv_exp2f.exit1507 ] + %506 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2254, %__nv_exp2f.exit1507 ] + %507 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2255, %__nv_exp2f.exit1507 ] + %508 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2256, %__nv_exp2f.exit1507 ] + %509 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2257, %__nv_exp2f.exit1507 ] + %510 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2258, %__nv_exp2f.exit1507 ] + %511 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2259, %__nv_exp2f.exit1507 ] + %512 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2260, %__nv_exp2f.exit1507 ] + %513 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2261, %__nv_exp2f.exit1507 ] + %514 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2262, %__nv_exp2f.exit1507 ] + %515 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2263, %__nv_exp2f.exit1507 ] + %516 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2264, %__nv_exp2f.exit1507 ] + %517 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2265, %__nv_exp2f.exit1507 ] + %518 = phi float [ 0.000000e+00, %.lr.ph1846 ], [ %2266, %__nv_exp2f.exit1507 ] + %519 = phi i32 [ 0, %.lr.ph1846 ], [ %2270, %__nv_exp2f.exit1507 ] + %520 = phi <16 x i32> [ %448, %.lr.ph1846 ], [ %2269, %__nv_exp2f.exit1507 ] + %521 = shufflevector <16 x i32> %520, <16 x i32> poison, <32 x i32> + %522 = icmp slt i32 %519, %437, !dbg !84 + %523 = icmp slt i32 %519, %438, !dbg !84 + %524 = add i32 %453, 1, !dbg !84 + %525 = icmp sgt i32 %524, 2, !dbg !84 + %526 = select i1 %525, i32 0, i32 %524, !dbg !84 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !87 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %527 = shl i32 %526, 13, !dbg !87 + %528 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %527, !dbg !87 + %529 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !90 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !90 + %530 = shl i32 %529, 11, !dbg !90 + %531 = and i32 %530, 8192, !dbg !90 + %532 = add i32 %531, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %533 = lshr exact i32 %532, 4, !dbg !90 + %534 = and i32 %533, 16383, !dbg !90 + %535 = zext nneg i32 %534 to i64, !dbg !90 + %536 = or disjoint i64 %535, 4611686293372403712, !dbg !90 + %537 = ptrtoint ptr addrspace(3) %528 to i32, !dbg !90 + %538 = lshr exact i32 %537, 4, !dbg !90 + %539 = and i32 %538, 16383, !dbg !90 + %540 = zext nneg i32 %539 to i64, !dbg !90 + %541 = or disjoint i64 %540, 4611686293338849280, !dbg !90 + %542 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %536, i64 %541) #3, !dbg !90 + %543 = or disjoint i32 %531, 32, !dbg !90 + %544 = add i32 %543, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %545 = lshr exact i32 %544, 4, !dbg !90 + %546 = and i32 %545, 16383, !dbg !90 + %547 = zext nneg i32 %546 to i64, !dbg !90 + %548 = or disjoint i64 %547, 4611686293372403712, !dbg !90 + %549 = add i32 %537, 32, !dbg !90 + %550 = lshr exact i32 %549, 4, !dbg !90 + %551 = and i32 %550, 16383, !dbg !90 + %552 = zext nneg i32 %551 to i64, !dbg !90 + %553 = or disjoint i64 %552, 4611686293338849280, !dbg !90 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 0, !dbg !90 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 1, !dbg !90 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 2, !dbg !90 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 3, !dbg !90 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 4, !dbg !90 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 5, !dbg !90 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 6, !dbg !90 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 7, !dbg !90 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 8, !dbg !90 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 9, !dbg !90 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 10, !dbg !90 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 11, !dbg !90 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 12, !dbg !90 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 13, !dbg !90 + %568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 14, !dbg !90 + %569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 15, !dbg !90 + %570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 16, !dbg !90 + %571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 17, !dbg !90 + %572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 18, !dbg !90 + %573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 19, !dbg !90 + %574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 20, !dbg !90 + %575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 21, !dbg !90 + %576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 22, !dbg !90 + %577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 23, !dbg !90 + %578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 24, !dbg !90 + %579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 25, !dbg !90 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 26, !dbg !90 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 27, !dbg !90 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 28, !dbg !90 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 29, !dbg !90 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 30, !dbg !90 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %542, 31, !dbg !90 + %586 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, float %573, float %574, float %575, float %576, float %577, float %578, float %579, float %580, float %581, float %582, float %583, float %584, float %585, i64 %548, i64 %553, i1 true) #3, !dbg !90 + %587 = or disjoint i32 %531, 64, !dbg !90 + %588 = add i32 %587, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %589 = lshr exact i32 %588, 4, !dbg !90 + %590 = and i32 %589, 16383, !dbg !90 + %591 = zext nneg i32 %590 to i64, !dbg !90 + %592 = or disjoint i64 %591, 4611686293372403712, !dbg !90 + %593 = add i32 %537, 64, !dbg !90 + %594 = lshr exact i32 %593, 4, !dbg !90 + %595 = and i32 %594, 16383, !dbg !90 + %596 = zext nneg i32 %595 to i64, !dbg !90 + %597 = or disjoint i64 %596, 4611686293338849280, !dbg !90 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 0, !dbg !90 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 1, !dbg !90 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 2, !dbg !90 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 3, !dbg !90 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 4, !dbg !90 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 5, !dbg !90 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 6, !dbg !90 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 7, !dbg !90 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 8, !dbg !90 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 9, !dbg !90 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 10, !dbg !90 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 11, !dbg !90 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 12, !dbg !90 + %611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 13, !dbg !90 + %612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 14, !dbg !90 + %613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 15, !dbg !90 + %614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 16, !dbg !90 + %615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 17, !dbg !90 + %616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 18, !dbg !90 + %617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 19, !dbg !90 + %618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 20, !dbg !90 + %619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 21, !dbg !90 + %620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 22, !dbg !90 + %621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 23, !dbg !90 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 24, !dbg !90 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 25, !dbg !90 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 26, !dbg !90 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 27, !dbg !90 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 28, !dbg !90 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 29, !dbg !90 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 30, !dbg !90 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %586, 31, !dbg !90 + %630 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, float %611, float %612, float %613, float %614, float %615, float %616, float %617, float %618, float %619, float %620, float %621, float %622, float %623, float %624, float %625, float %626, float %627, float %628, float %629, i64 %592, i64 %597, i1 true) #3, !dbg !90 + %631 = or disjoint i32 %531, 96, !dbg !90 + %632 = add i32 %631, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %633 = lshr exact i32 %632, 4, !dbg !90 + %634 = and i32 %633, 16383, !dbg !90 + %635 = zext nneg i32 %634 to i64, !dbg !90 + %636 = or disjoint i64 %635, 4611686293372403712, !dbg !90 + %637 = add i32 %537, 96, !dbg !90 + %638 = lshr exact i32 %637, 4, !dbg !90 + %639 = and i32 %638, 16383, !dbg !90 + %640 = zext nneg i32 %639 to i64, !dbg !90 + %641 = or disjoint i64 %640, 4611686293338849280, !dbg !90 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 0, !dbg !90 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 1, !dbg !90 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 2, !dbg !90 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 3, !dbg !90 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 4, !dbg !90 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 5, !dbg !90 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 6, !dbg !90 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 7, !dbg !90 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 8, !dbg !90 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 9, !dbg !90 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 10, !dbg !90 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 11, !dbg !90 + %654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 12, !dbg !90 + %655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 13, !dbg !90 + %656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 14, !dbg !90 + %657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 15, !dbg !90 + %658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 16, !dbg !90 + %659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 17, !dbg !90 + %660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 18, !dbg !90 + %661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 19, !dbg !90 + %662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 20, !dbg !90 + %663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 21, !dbg !90 + %664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 22, !dbg !90 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 23, !dbg !90 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 24, !dbg !90 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 25, !dbg !90 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 26, !dbg !90 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 27, !dbg !90 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 28, !dbg !90 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 29, !dbg !90 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 30, !dbg !90 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %630, 31, !dbg !90 + %674 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, float %654, float %655, float %656, float %657, float %658, float %659, float %660, float %661, float %662, float %663, float %664, float %665, float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, i64 %636, i64 %641, i1 true) #3, !dbg !90 + %675 = or disjoint i32 %531, 16384, !dbg !90 + %676 = add i32 %675, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %677 = lshr exact i32 %676, 4, !dbg !90 + %678 = and i32 %677, 16383, !dbg !90 + %679 = zext nneg i32 %678 to i64, !dbg !90 + %680 = or disjoint i64 %679, 4611686293372403712, !dbg !90 + %681 = add i32 %537, 8192, !dbg !90 + %682 = lshr exact i32 %681, 4, !dbg !90 + %683 = and i32 %682, 16383, !dbg !90 + %684 = zext nneg i32 %683 to i64, !dbg !90 + %685 = or disjoint i64 %684, 4611686293338849280, !dbg !90 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 0, !dbg !90 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 1, !dbg !90 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 2, !dbg !90 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 3, !dbg !90 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 4, !dbg !90 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 5, !dbg !90 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 6, !dbg !90 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 7, !dbg !90 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 8, !dbg !90 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 9, !dbg !90 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 10, !dbg !90 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 11, !dbg !90 + %698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 12, !dbg !90 + %699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 13, !dbg !90 + %700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 14, !dbg !90 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 15, !dbg !90 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 16, !dbg !90 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 17, !dbg !90 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 18, !dbg !90 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 19, !dbg !90 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 20, !dbg !90 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 21, !dbg !90 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 22, !dbg !90 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 23, !dbg !90 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 24, !dbg !90 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 25, !dbg !90 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 26, !dbg !90 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 27, !dbg !90 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 28, !dbg !90 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 29, !dbg !90 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 30, !dbg !90 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %674, 31, !dbg !90 + %718 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, float %698, float %699, float %700, float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, i64 %680, i64 %685, i1 true) #3, !dbg !90 + %719 = or disjoint i32 %531, 16416, !dbg !90 + %720 = add i32 %719, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %721 = lshr exact i32 %720, 4, !dbg !90 + %722 = and i32 %721, 16383, !dbg !90 + %723 = zext nneg i32 %722 to i64, !dbg !90 + %724 = or disjoint i64 %723, 4611686293372403712, !dbg !90 + %725 = add i32 %537, 8224, !dbg !90 + %726 = lshr exact i32 %725, 4, !dbg !90 + %727 = and i32 %726, 16383, !dbg !90 + %728 = zext nneg i32 %727 to i64, !dbg !90 + %729 = or disjoint i64 %728, 4611686293338849280, !dbg !90 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 0, !dbg !90 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 1, !dbg !90 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 2, !dbg !90 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 3, !dbg !90 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 4, !dbg !90 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 5, !dbg !90 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 6, !dbg !90 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 7, !dbg !90 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 8, !dbg !90 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 9, !dbg !90 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 10, !dbg !90 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 11, !dbg !90 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 12, !dbg !90 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 13, !dbg !90 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 14, !dbg !90 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 15, !dbg !90 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 16, !dbg !90 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 17, !dbg !90 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 18, !dbg !90 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 19, !dbg !90 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 20, !dbg !90 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 21, !dbg !90 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 22, !dbg !90 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 23, !dbg !90 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 24, !dbg !90 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 25, !dbg !90 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 26, !dbg !90 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 27, !dbg !90 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 28, !dbg !90 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 29, !dbg !90 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 30, !dbg !90 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %718, 31, !dbg !90 + %762 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, float %741, float %742, float %743, float %744, float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, i64 %724, i64 %729, i1 true) #3, !dbg !90 + %763 = or disjoint i32 %531, 16448, !dbg !90 + %764 = add i32 %763, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %765 = lshr exact i32 %764, 4, !dbg !90 + %766 = and i32 %765, 16383, !dbg !90 + %767 = zext nneg i32 %766 to i64, !dbg !90 + %768 = or disjoint i64 %767, 4611686293372403712, !dbg !90 + %769 = add i32 %537, 8256, !dbg !90 + %770 = lshr exact i32 %769, 4, !dbg !90 + %771 = and i32 %770, 16383, !dbg !90 + %772 = zext nneg i32 %771 to i64, !dbg !90 + %773 = or disjoint i64 %772, 4611686293338849280, !dbg !90 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 0, !dbg !90 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 1, !dbg !90 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 2, !dbg !90 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 3, !dbg !90 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 4, !dbg !90 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 5, !dbg !90 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 6, !dbg !90 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 7, !dbg !90 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 8, !dbg !90 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 9, !dbg !90 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 10, !dbg !90 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 11, !dbg !90 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 12, !dbg !90 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 13, !dbg !90 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 14, !dbg !90 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 15, !dbg !90 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 16, !dbg !90 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 17, !dbg !90 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 18, !dbg !90 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 19, !dbg !90 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 20, !dbg !90 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 21, !dbg !90 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 22, !dbg !90 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 23, !dbg !90 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 24, !dbg !90 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 25, !dbg !90 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 26, !dbg !90 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 27, !dbg !90 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 28, !dbg !90 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 29, !dbg !90 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 30, !dbg !90 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %762, 31, !dbg !90 + %806 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %774, float %775, float %776, float %777, float %778, float %779, float %780, float %781, float %782, float %783, float %784, float %785, float %786, float %787, float %788, float %789, float %790, float %791, float %792, float %793, float %794, float %795, float %796, float %797, float %798, float %799, float %800, float %801, float %802, float %803, float %804, float %805, i64 %768, i64 %773, i1 true) #3, !dbg !90 + %807 = or disjoint i32 %531, 16480, !dbg !90 + %808 = add i32 %807, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !90 + %809 = lshr exact i32 %808, 4, !dbg !90 + %810 = and i32 %809, 16383, !dbg !90 + %811 = zext nneg i32 %810 to i64, !dbg !90 + %812 = or disjoint i64 %811, 4611686293372403712, !dbg !90 + %813 = add i32 %537, 8288, !dbg !90 + %814 = lshr exact i32 %813, 4, !dbg !90 + %815 = and i32 %814, 16383, !dbg !90 + %816 = zext nneg i32 %815 to i64, !dbg !90 + %817 = or disjoint i64 %816, 4611686293338849280, !dbg !90 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 0, !dbg !90 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 1, !dbg !90 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 2, !dbg !90 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 3, !dbg !90 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 4, !dbg !90 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 5, !dbg !90 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 6, !dbg !90 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 7, !dbg !90 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 8, !dbg !90 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 9, !dbg !90 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 10, !dbg !90 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 11, !dbg !90 + %830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 12, !dbg !90 + %831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 13, !dbg !90 + %832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 14, !dbg !90 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 15, !dbg !90 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 16, !dbg !90 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 17, !dbg !90 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 18, !dbg !90 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 19, !dbg !90 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 20, !dbg !90 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 21, !dbg !90 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 22, !dbg !90 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 23, !dbg !90 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 24, !dbg !90 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 25, !dbg !90 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 26, !dbg !90 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 27, !dbg !90 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 28, !dbg !90 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 29, !dbg !90 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 30, !dbg !90 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %806, 31, !dbg !90 + %850 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %818, float %819, float %820, float %821, float %822, float %823, float %824, float %825, float %826, float %827, float %828, float %829, float %830, float %831, float %832, float %833, float %834, float %835, float %836, float %837, float %838, float %839, float %840, float %841, float %842, float %843, float %844, float %845, float %846, float %847, float %848, float %849, i64 %812, i64 %817, i1 true) #3, !dbg !90 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 0, !dbg !90 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 1, !dbg !90 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 2, !dbg !90 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 3, !dbg !90 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 4, !dbg !90 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 5, !dbg !90 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 6, !dbg !90 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 7, !dbg !90 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 8, !dbg !90 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 9, !dbg !90 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 10, !dbg !90 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 11, !dbg !90 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 12, !dbg !90 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 13, !dbg !90 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 14, !dbg !90 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 15, !dbg !90 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 16, !dbg !90 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 17, !dbg !90 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 18, !dbg !90 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 19, !dbg !90 + %871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 20, !dbg !90 + %872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 21, !dbg !90 + %873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 22, !dbg !90 + %874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 23, !dbg !90 + %875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 24, !dbg !90 + %876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 25, !dbg !90 + %877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 26, !dbg !90 + %878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 27, !dbg !90 + %879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 28, !dbg !90 + %880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 29, !dbg !90 + %881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 30, !dbg !90 + %882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %850, 31, !dbg !90 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !90 + %883 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %851, float %852, float %853, float %854, float %855, float %856, float %857, float %858, float %859, float %860, float %861, float %862, float %863, float %864, float %865, float %866, float %867, float %868, float %869, float %870, float %871, float %872, float %873, float %874, float %875, float %876, float %877, float %878, float %879, float %880, float %881, float %882, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %528, i32 0, i32 0) #3, !dbg !90 + %884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 0, !dbg !90 + %885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 1, !dbg !90 + %886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 2, !dbg !90 + %887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 3, !dbg !90 + %888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 4, !dbg !90 + %889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 5, !dbg !90 + %890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 6, !dbg !90 + %891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 7, !dbg !90 + %892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 8, !dbg !90 + %893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 9, !dbg !90 + %894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 10, !dbg !90 + %895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 11, !dbg !90 + %896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 12, !dbg !90 + %897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 13, !dbg !90 + %898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 14, !dbg !90 + %899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 15, !dbg !90 + %900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 16, !dbg !90 + %901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 17, !dbg !90 + %902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 18, !dbg !90 + %903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 19, !dbg !90 + %904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 20, !dbg !90 + %905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 21, !dbg !90 + %906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 22, !dbg !90 + %907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 23, !dbg !90 + %908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 24, !dbg !90 + %909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 25, !dbg !90 + %910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 26, !dbg !90 + %911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 27, !dbg !90 + %912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 28, !dbg !90 + %913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 29, !dbg !90 + %914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 30, !dbg !90 + %915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %883, 31, !dbg !90 + %916 = fmul float %884, 0x3FB6A09E60000000, !dbg !93 + %917 = fmul float %885, 0x3FB6A09E60000000, !dbg !93 + %918 = fmul float %886, 0x3FB6A09E60000000, !dbg !93 + %919 = fmul float %887, 0x3FB6A09E60000000, !dbg !93 + %920 = fmul float %888, 0x3FB6A09E60000000, !dbg !93 + %921 = fmul float %889, 0x3FB6A09E60000000, !dbg !93 + %922 = fmul float %890, 0x3FB6A09E60000000, !dbg !93 + %923 = fmul float %891, 0x3FB6A09E60000000, !dbg !93 + %924 = fmul float %892, 0x3FB6A09E60000000, !dbg !93 + %925 = fmul float %893, 0x3FB6A09E60000000, !dbg !93 + %926 = fmul float %894, 0x3FB6A09E60000000, !dbg !93 + %927 = fmul float %895, 0x3FB6A09E60000000, !dbg !93 + %928 = fmul float %896, 0x3FB6A09E60000000, !dbg !93 + %929 = fmul float %897, 0x3FB6A09E60000000, !dbg !93 + %930 = fmul float %898, 0x3FB6A09E60000000, !dbg !93 + %931 = fmul float %899, 0x3FB6A09E60000000, !dbg !93 + %932 = fmul float %900, 0x3FB6A09E60000000, !dbg !93 + %933 = fmul float %901, 0x3FB6A09E60000000, !dbg !93 + %934 = fmul float %902, 0x3FB6A09E60000000, !dbg !93 + %935 = fmul float %903, 0x3FB6A09E60000000, !dbg !93 + %936 = fmul float %904, 0x3FB6A09E60000000, !dbg !93 + %937 = fmul float %905, 0x3FB6A09E60000000, !dbg !93 + %938 = fmul float %906, 0x3FB6A09E60000000, !dbg !93 + %939 = fmul float %907, 0x3FB6A09E60000000, !dbg !93 + %940 = fmul float %908, 0x3FB6A09E60000000, !dbg !93 + %941 = fmul float %909, 0x3FB6A09E60000000, !dbg !93 + %942 = fmul float %910, 0x3FB6A09E60000000, !dbg !93 + %943 = fmul float %911, 0x3FB6A09E60000000, !dbg !93 + %944 = fmul float %912, 0x3FB6A09E60000000, !dbg !93 + %945 = fmul float %913, 0x3FB6A09E60000000, !dbg !93 + %946 = fmul float %914, 0x3FB6A09E60000000, !dbg !93 + %947 = fmul float %915, 0x3FB6A09E60000000, !dbg !93 + %948 = extractelement <16 x i32> %520, i64 15, !dbg !94 + %949 = icmp sge i32 %338, %948, !dbg !94 + %950 = extractelement <16 x i32> %520, i64 14, !dbg !95 + %951 = icmp sge i32 %338, %950, !dbg !94 + %952 = icmp sge i32 %341, %948, !dbg !94 + %953 = icmp sge i32 %341, %950, !dbg !94 + %954 = extractelement <16 x i32> %520, i64 13, !dbg !94 + %955 = icmp sge i32 %338, %954, !dbg !94 + %956 = extractelement <16 x i32> %520, i64 12, !dbg !95 + %957 = icmp sge i32 %338, %956, !dbg !94 + %958 = icmp sge i32 %341, %954, !dbg !94 + %959 = icmp sge i32 %341, %956, !dbg !94 + %960 = extractelement <16 x i32> %520, i64 11, !dbg !94 + %961 = icmp sge i32 %338, %960, !dbg !94 + %962 = extractelement <16 x i32> %520, i64 10, !dbg !95 + %963 = icmp sge i32 %338, %962, !dbg !94 + %964 = icmp sge i32 %341, %960, !dbg !94 + %965 = icmp sge i32 %341, %962, !dbg !94 + %966 = extractelement <16 x i32> %520, i64 9, !dbg !94 + %967 = icmp sge i32 %338, %966, !dbg !94 + %968 = extractelement <16 x i32> %520, i64 8, !dbg !95 + %969 = icmp sge i32 %338, %968, !dbg !94 + %970 = icmp sge i32 %341, %966, !dbg !94 + %971 = icmp sge i32 %341, %968, !dbg !94 + %972 = extractelement <16 x i32> %520, i64 7, !dbg !94 + %973 = icmp sge i32 %338, %972, !dbg !94 + %974 = extractelement <16 x i32> %520, i64 6, !dbg !95 + %975 = icmp sge i32 %338, %974, !dbg !94 + %976 = icmp sge i32 %341, %972, !dbg !94 + %977 = icmp sge i32 %341, %974, !dbg !94 + %978 = extractelement <16 x i32> %520, i64 5, !dbg !94 + %979 = icmp sge i32 %338, %978, !dbg !94 + %980 = extractelement <16 x i32> %520, i64 4, !dbg !95 + %981 = icmp sge i32 %338, %980, !dbg !94 + %982 = icmp sge i32 %341, %978, !dbg !94 + %983 = icmp sge i32 %341, %980, !dbg !94 + %984 = extractelement <16 x i32> %520, i64 3, !dbg !94 + %985 = icmp sge i32 %338, %984, !dbg !94 + %986 = extractelement <16 x i32> %520, i64 2, !dbg !95 + %987 = icmp sge i32 %338, %986, !dbg !94 + %988 = icmp sge i32 %341, %984, !dbg !94 + %989 = icmp sge i32 %341, %986, !dbg !94 + %990 = extractelement <16 x i32> %520, i64 1, !dbg !94 + %991 = icmp sge i32 %338, %990, !dbg !94 + %992 = extractelement <16 x i32> %520, i64 0, !dbg !95 + %993 = icmp sge i32 %338, %992, !dbg !94 + %994 = icmp sge i32 %341, %990, !dbg !94 + %995 = icmp sge i32 %341, %992, !dbg !94 + %996 = and i1 %399, %949, !dbg !96 + %997 = and i1 %399, %951, !dbg !96 + %998 = and i1 %400, %952, !dbg !96 + %999 = and i1 %400, %953, !dbg !96 + %1000 = and i1 %399, %955, !dbg !96 + %1001 = and i1 %399, %957, !dbg !96 + %1002 = and i1 %400, %958, !dbg !96 + %1003 = and i1 %400, %959, !dbg !96 + %1004 = and i1 %399, %961, !dbg !96 + %1005 = and i1 %399, %963, !dbg !96 + %1006 = and i1 %400, %964, !dbg !96 + %1007 = and i1 %400, %965, !dbg !96 + %1008 = and i1 %399, %967, !dbg !96 + %1009 = and i1 %399, %969, !dbg !96 + %1010 = and i1 %400, %970, !dbg !96 + %1011 = and i1 %400, %971, !dbg !96 + %1012 = and i1 %399, %973, !dbg !96 + %1013 = and i1 %399, %975, !dbg !96 + %1014 = and i1 %400, %976, !dbg !96 + %1015 = and i1 %400, %977, !dbg !96 + %1016 = and i1 %399, %979, !dbg !96 + %1017 = and i1 %399, %981, !dbg !96 + %1018 = and i1 %400, %982, !dbg !96 + %1019 = and i1 %400, %983, !dbg !96 + %1020 = and i1 %399, %985, !dbg !96 + %1021 = and i1 %399, %987, !dbg !96 + %1022 = and i1 %400, %988, !dbg !96 + %1023 = and i1 %400, %989, !dbg !96 + %1024 = and i1 %399, %991, !dbg !96 + %1025 = and i1 %399, %993, !dbg !96 + %1026 = and i1 %400, %994, !dbg !96 + %1027 = and i1 %400, %995, !dbg !96 + %1028 = icmp sgt i32 %948, 2047, !dbg !97 + %1029 = icmp sgt i32 %954, 2047, !dbg !97 + %1030 = icmp sgt i32 %960, 2047, !dbg !97 + %1031 = icmp sgt i32 %966, 2047, !dbg !97 + %1032 = icmp sgt i32 %972, 2047, !dbg !97 + %1033 = icmp sgt i32 %978, 2047, !dbg !97 + %1034 = icmp sgt i32 %984, 2047, !dbg !97 + %1035 = icmp sgt i32 %990, 2047, !dbg !97 + %1036 = shufflevector <16 x i32> %520, <16 x i32> poison, <8 x i32> , !dbg !95 + %1037 = srem <8 x i32> %1036, splat (i32 2048), !dbg !95 + %1038 = icmp ne <8 x i32> %1037, zeroinitializer, !dbg !98 + %1039 = shufflevector <16 x i32> %520, <16 x i32> poison, <8 x i32> , !dbg !99 + %1040 = and <8 x i32> %1039, splat (i32 -2147481601), !dbg !99 + %1041 = icmp ugt <8 x i32> %1040, splat (i32 -2147483648), !dbg !99 + %1042 = trunc <8 x i32> %1039 to <8 x i16>, !dbg !100 + %1043 = and <8 x i16> %1042, splat (i16 2047), !dbg !100 + %1044 = zext nneg <8 x i16> %1043 to <8 x i64>, !dbg !92 + %1045 = icmp sgt <8 x i64> %450, %1044, !dbg !92 + %1046 = and <8 x i1> %1041, %1038, !dbg !101 + %1047 = add nsw <8 x i32> %1037, splat (i32 2048), !dbg !102 + %1048 = select <8 x i1> %1046, <8 x i32> %1047, <8 x i32> %1037, !dbg !103 + %1049 = sext <8 x i32> %1048 to <8 x i64>, !dbg !92 + %1050 = icmp sgt <8 x i64> %450, %1049, !dbg !92 + %1051 = extractelement <8 x i1> %1045, i64 7, !dbg !104 + %1052 = and i1 %1028, %1051, !dbg !104 + %1053 = extractelement <8 x i1> %1050, i64 7, !dbg !104 + %1054 = and i1 %1028, %1053, !dbg !104 + %1055 = extractelement <8 x i1> %1045, i64 6, !dbg !104 + %1056 = and i1 %1029, %1055, !dbg !104 + %1057 = extractelement <8 x i1> %1050, i64 6, !dbg !104 + %1058 = and i1 %1029, %1057, !dbg !104 + %1059 = extractelement <8 x i1> %1045, i64 5, !dbg !104 + %1060 = and i1 %1030, %1059, !dbg !104 + %1061 = extractelement <8 x i1> %1050, i64 5, !dbg !104 + %1062 = and i1 %1030, %1061, !dbg !104 + %1063 = extractelement <8 x i1> %1045, i64 4, !dbg !104 + %1064 = and i1 %1031, %1063, !dbg !104 + %1065 = extractelement <8 x i1> %1050, i64 4, !dbg !104 + %1066 = and i1 %1031, %1065, !dbg !104 + %1067 = extractelement <8 x i1> %1045, i64 3, !dbg !104 + %1068 = and i1 %1032, %1067, !dbg !104 + %1069 = extractelement <8 x i1> %1050, i64 3, !dbg !104 + %1070 = and i1 %1032, %1069, !dbg !104 + %1071 = extractelement <8 x i1> %1045, i64 2, !dbg !104 + %1072 = and i1 %1033, %1071, !dbg !104 + %1073 = extractelement <8 x i1> %1050, i64 2, !dbg !104 + %1074 = and i1 %1033, %1073, !dbg !104 + %1075 = extractelement <8 x i1> %1045, i64 1, !dbg !104 + %1076 = and i1 %1034, %1075, !dbg !104 + %1077 = extractelement <8 x i1> %1050, i64 1, !dbg !104 + %1078 = and i1 %1034, %1077, !dbg !104 + %1079 = extractelement <8 x i1> %1045, i64 0, !dbg !104 + %1080 = and i1 %1035, %1079, !dbg !104 + %1081 = extractelement <8 x i1> %1050, i64 0, !dbg !104 + %1082 = and i1 %1035, %1081, !dbg !104 + %1083 = sub <32 x i32> %521, %94, !dbg !105 + %1084 = and <32 x i32> %1083, splat (i32 2047), !dbg !106 + %1085 = icmp eq <32 x i32> %1084, zeroinitializer, !dbg !107 + %1086 = extractelement <32 x i1> %1085, i64 31, !dbg !108 + %1087 = and i1 %1086, %1052, !dbg !108 + %1088 = extractelement <32 x i1> %1085, i64 30, !dbg !108 + %1089 = and i1 %1088, %1054, !dbg !108 + %1090 = extractelement <32 x i1> %1085, i64 29, !dbg !108 + %1091 = and i1 %1090, %1052, !dbg !108 + %1092 = extractelement <32 x i1> %1085, i64 28, !dbg !108 + %1093 = and i1 %1092, %1054, !dbg !108 + %1094 = extractelement <32 x i1> %1085, i64 27, !dbg !108 + %1095 = and i1 %1094, %1056, !dbg !108 + %1096 = extractelement <32 x i1> %1085, i64 26, !dbg !108 + %1097 = and i1 %1096, %1058, !dbg !108 + %1098 = extractelement <32 x i1> %1085, i64 25, !dbg !108 + %1099 = and i1 %1098, %1056, !dbg !108 + %1100 = extractelement <32 x i1> %1085, i64 24, !dbg !108 + %1101 = and i1 %1100, %1058, !dbg !108 + %1102 = extractelement <32 x i1> %1085, i64 23, !dbg !108 + %1103 = and i1 %1102, %1060, !dbg !108 + %1104 = extractelement <32 x i1> %1085, i64 22, !dbg !108 + %1105 = and i1 %1104, %1062, !dbg !108 + %1106 = extractelement <32 x i1> %1085, i64 21, !dbg !108 + %1107 = and i1 %1106, %1060, !dbg !108 + %1108 = extractelement <32 x i1> %1085, i64 20, !dbg !108 + %1109 = and i1 %1108, %1062, !dbg !108 + %1110 = extractelement <32 x i1> %1085, i64 19, !dbg !108 + %1111 = and i1 %1110, %1064, !dbg !108 + %1112 = extractelement <32 x i1> %1085, i64 18, !dbg !108 + %1113 = and i1 %1112, %1066, !dbg !108 + %1114 = extractelement <32 x i1> %1085, i64 17, !dbg !108 + %1115 = and i1 %1114, %1064, !dbg !108 + %1116 = extractelement <32 x i1> %1085, i64 16, !dbg !108 + %1117 = and i1 %1116, %1066, !dbg !108 + %1118 = extractelement <32 x i1> %1085, i64 15, !dbg !108 + %1119 = and i1 %1118, %1068, !dbg !108 + %1120 = extractelement <32 x i1> %1085, i64 14, !dbg !108 + %1121 = and i1 %1120, %1070, !dbg !108 + %1122 = extractelement <32 x i1> %1085, i64 13, !dbg !108 + %1123 = and i1 %1122, %1068, !dbg !108 + %1124 = extractelement <32 x i1> %1085, i64 12, !dbg !108 + %1125 = and i1 %1124, %1070, !dbg !108 + %1126 = extractelement <32 x i1> %1085, i64 11, !dbg !108 + %1127 = and i1 %1126, %1072, !dbg !108 + %1128 = extractelement <32 x i1> %1085, i64 10, !dbg !108 + %1129 = and i1 %1128, %1074, !dbg !108 + %1130 = extractelement <32 x i1> %1085, i64 9, !dbg !108 + %1131 = and i1 %1130, %1072, !dbg !108 + %1132 = extractelement <32 x i1> %1085, i64 8, !dbg !108 + %1133 = and i1 %1132, %1074, !dbg !108 + %1134 = extractelement <32 x i1> %1085, i64 7, !dbg !108 + %1135 = and i1 %1134, %1076, !dbg !108 + %1136 = extractelement <32 x i1> %1085, i64 6, !dbg !108 + %1137 = and i1 %1136, %1078, !dbg !108 + %1138 = extractelement <32 x i1> %1085, i64 5, !dbg !108 + %1139 = and i1 %1138, %1076, !dbg !108 + %1140 = extractelement <32 x i1> %1085, i64 4, !dbg !108 + %1141 = and i1 %1140, %1078, !dbg !108 + %1142 = extractelement <32 x i1> %1085, i64 3, !dbg !108 + %1143 = and i1 %1142, %1080, !dbg !108 + %1144 = extractelement <32 x i1> %1085, i64 2, !dbg !108 + %1145 = and i1 %1144, %1082, !dbg !108 + %1146 = extractelement <32 x i1> %1085, i64 1, !dbg !108 + %1147 = and i1 %1146, %1080, !dbg !108 + %1148 = extractelement <32 x i1> %1085, i64 0, !dbg !108 + %1149 = and i1 %1148, %1082, !dbg !108 + %1150 = or i1 %996, %1087, !dbg !109 + %1151 = or i1 %997, %1089, !dbg !109 + %1152 = or i1 %998, %1091, !dbg !109 + %1153 = or i1 %999, %1093, !dbg !109 + %1154 = or i1 %1000, %1095, !dbg !109 + %1155 = or i1 %1001, %1097, !dbg !109 + %1156 = or i1 %1002, %1099, !dbg !109 + %1157 = or i1 %1003, %1101, !dbg !109 + %1158 = or i1 %1004, %1103, !dbg !109 + %1159 = or i1 %1005, %1105, !dbg !109 + %1160 = or i1 %1006, %1107, !dbg !109 + %1161 = or i1 %1007, %1109, !dbg !109 + %1162 = or i1 %1008, %1111, !dbg !109 + %1163 = or i1 %1009, %1113, !dbg !109 + %1164 = or i1 %1010, %1115, !dbg !109 + %1165 = or i1 %1011, %1117, !dbg !109 + %1166 = or i1 %1012, %1119, !dbg !109 + %1167 = or i1 %1013, %1121, !dbg !109 + %1168 = or i1 %1014, %1123, !dbg !109 + %1169 = or i1 %1015, %1125, !dbg !109 + %1170 = or i1 %1016, %1127, !dbg !109 + %1171 = or i1 %1017, %1129, !dbg !109 + %1172 = or i1 %1018, %1131, !dbg !109 + %1173 = or i1 %1019, %1133, !dbg !109 + %1174 = or i1 %1020, %1135, !dbg !109 + %1175 = or i1 %1021, %1137, !dbg !109 + %1176 = or i1 %1022, %1139, !dbg !109 + %1177 = or i1 %1023, %1141, !dbg !109 + %1178 = or i1 %1024, %1143, !dbg !109 + %1179 = or i1 %1025, %1145, !dbg !109 + %1180 = or i1 %1026, %1147, !dbg !109 + %1181 = or i1 %1027, %1149, !dbg !109 + %1182 = fmul float %916, 0x3FF7154760000000, !dbg !110 + %1183 = select i1 %1150, float %1182, float 0xFFF0000000000000, !dbg !111 + %1184 = fmul float %917, 0x3FF7154760000000, !dbg !110 + %1185 = select i1 %1151, float %1184, float 0xFFF0000000000000, !dbg !111 + %1186 = fmul float %918, 0x3FF7154760000000, !dbg !110 + %1187 = select i1 %1152, float %1186, float 0xFFF0000000000000, !dbg !111 + %1188 = fmul float %919, 0x3FF7154760000000, !dbg !110 + %1189 = select i1 %1153, float %1188, float 0xFFF0000000000000, !dbg !111 + %1190 = fmul float %920, 0x3FF7154760000000, !dbg !110 + %1191 = select i1 %1154, float %1190, float 0xFFF0000000000000, !dbg !111 + %1192 = fmul float %921, 0x3FF7154760000000, !dbg !110 + %1193 = select i1 %1155, float %1192, float 0xFFF0000000000000, !dbg !111 + %1194 = fmul float %922, 0x3FF7154760000000, !dbg !110 + %1195 = select i1 %1156, float %1194, float 0xFFF0000000000000, !dbg !111 + %1196 = fmul float %923, 0x3FF7154760000000, !dbg !110 + %1197 = select i1 %1157, float %1196, float 0xFFF0000000000000, !dbg !111 + %1198 = fmul float %924, 0x3FF7154760000000, !dbg !110 + %1199 = select i1 %1158, float %1198, float 0xFFF0000000000000, !dbg !111 + %1200 = fmul float %925, 0x3FF7154760000000, !dbg !110 + %1201 = select i1 %1159, float %1200, float 0xFFF0000000000000, !dbg !111 + %1202 = fmul float %926, 0x3FF7154760000000, !dbg !110 + %1203 = select i1 %1160, float %1202, float 0xFFF0000000000000, !dbg !111 + %1204 = fmul float %927, 0x3FF7154760000000, !dbg !110 + %1205 = select i1 %1161, float %1204, float 0xFFF0000000000000, !dbg !111 + %1206 = fmul float %928, 0x3FF7154760000000, !dbg !110 + %1207 = select i1 %1162, float %1206, float 0xFFF0000000000000, !dbg !111 + %1208 = fmul float %929, 0x3FF7154760000000, !dbg !110 + %1209 = select i1 %1163, float %1208, float 0xFFF0000000000000, !dbg !111 + %1210 = fmul float %930, 0x3FF7154760000000, !dbg !110 + %1211 = select i1 %1164, float %1210, float 0xFFF0000000000000, !dbg !111 + %1212 = fmul float %931, 0x3FF7154760000000, !dbg !110 + %1213 = select i1 %1165, float %1212, float 0xFFF0000000000000, !dbg !111 + %1214 = fmul float %932, 0x3FF7154760000000, !dbg !110 + %1215 = select i1 %1166, float %1214, float 0xFFF0000000000000, !dbg !111 + %1216 = fmul float %933, 0x3FF7154760000000, !dbg !110 + %1217 = select i1 %1167, float %1216, float 0xFFF0000000000000, !dbg !111 + %1218 = fmul float %934, 0x3FF7154760000000, !dbg !110 + %1219 = select i1 %1168, float %1218, float 0xFFF0000000000000, !dbg !111 + %1220 = fmul float %935, 0x3FF7154760000000, !dbg !110 + %1221 = select i1 %1169, float %1220, float 0xFFF0000000000000, !dbg !111 + %1222 = fmul float %936, 0x3FF7154760000000, !dbg !110 + %1223 = select i1 %1170, float %1222, float 0xFFF0000000000000, !dbg !111 + %1224 = fmul float %937, 0x3FF7154760000000, !dbg !110 + %1225 = select i1 %1171, float %1224, float 0xFFF0000000000000, !dbg !111 + %1226 = fmul float %938, 0x3FF7154760000000, !dbg !110 + %1227 = select i1 %1172, float %1226, float 0xFFF0000000000000, !dbg !111 + %1228 = fmul float %939, 0x3FF7154760000000, !dbg !110 + %1229 = select i1 %1173, float %1228, float 0xFFF0000000000000, !dbg !111 + %1230 = fmul float %940, 0x3FF7154760000000, !dbg !110 + %1231 = select i1 %1174, float %1230, float 0xFFF0000000000000, !dbg !111 + %1232 = fmul float %941, 0x3FF7154760000000, !dbg !110 + %1233 = select i1 %1175, float %1232, float 0xFFF0000000000000, !dbg !111 + %1234 = fmul float %942, 0x3FF7154760000000, !dbg !110 + %1235 = select i1 %1176, float %1234, float 0xFFF0000000000000, !dbg !111 + %1236 = fmul float %943, 0x3FF7154760000000, !dbg !110 + %1237 = select i1 %1177, float %1236, float 0xFFF0000000000000, !dbg !111 + %1238 = fmul float %944, 0x3FF7154760000000, !dbg !110 + %1239 = select i1 %1178, float %1238, float 0xFFF0000000000000, !dbg !111 + %1240 = fmul float %945, 0x3FF7154760000000, !dbg !110 + %1241 = select i1 %1179, float %1240, float 0xFFF0000000000000, !dbg !111 + %1242 = fmul float %946, 0x3FF7154760000000, !dbg !110 + %1243 = select i1 %1180, float %1242, float 0xFFF0000000000000, !dbg !111 + %1244 = fmul float %947, 0x3FF7154760000000, !dbg !110 + %1245 = select i1 %1181, float %1244, float 0xFFF0000000000000, !dbg !111 + %1246 = fsub float %1183, %356, !dbg !112 + %1247 = fsub float %1185, %356, !dbg !112 + %1248 = fsub float %1187, %357, !dbg !112 + %1249 = fsub float %1189, %357, !dbg !112 + %1250 = fsub float %1191, %356, !dbg !112 + %1251 = fsub float %1193, %356, !dbg !112 + %1252 = fsub float %1195, %357, !dbg !112 + %1253 = fsub float %1197, %357, !dbg !112 + %1254 = fsub float %1199, %356, !dbg !112 + %1255 = fsub float %1201, %356, !dbg !112 + %1256 = fsub float %1203, %357, !dbg !112 + %1257 = fsub float %1205, %357, !dbg !112 + %1258 = fsub float %1207, %356, !dbg !112 + %1259 = fsub float %1209, %356, !dbg !112 + %1260 = fsub float %1211, %357, !dbg !112 + %1261 = fsub float %1213, %357, !dbg !112 + %1262 = fsub float %1215, %356, !dbg !112 + %1263 = fsub float %1217, %356, !dbg !112 + %1264 = fsub float %1219, %357, !dbg !112 + %1265 = fsub float %1221, %357, !dbg !112 + %1266 = fsub float %1223, %356, !dbg !112 + %1267 = fsub float %1225, %356, !dbg !112 + %1268 = fsub float %1227, %357, !dbg !112 + %1269 = fsub float %1229, %357, !dbg !112 + %1270 = fsub float %1231, %356, !dbg !112 + %1271 = fsub float %1233, %356, !dbg !112 + %1272 = fsub float %1235, %357, !dbg !112 + %1273 = fsub float %1237, %357, !dbg !112 + %1274 = fsub float %1239, %356, !dbg !112 + %1275 = fsub float %1241, %356, !dbg !112 + %1276 = fsub float %1243, %357, !dbg !112 + %1277 = fsub float %1245, %357, !dbg !112 + %1278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1412 = icmp eq i32 %1278, 0, !dbg !113 + br i1 %.not.i1412, label %1281, label %1279, !dbg !113 + +1279: ; preds = %451 + %1280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1246) #3, !dbg !113 + br label %__nv_exp2f.exit1414, !dbg !113 + +1281: ; preds = %451 + %1282 = tail call float @llvm.nvvm.ex2.approx.f(float %1246) #3, !dbg !113 + br label %__nv_exp2f.exit1414, !dbg !113 + +__nv_exp2f.exit1414: ; preds = %1279, %1281 + %.0.i1413 = phi float [ %1280, %1279 ], [ %1282, %1281 ], !dbg !113 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1415 = icmp eq i32 %1283, 0, !dbg !113 + br i1 %.not.i1415, label %1286, label %1284, !dbg !113 + +1284: ; preds = %__nv_exp2f.exit1414 + %1285 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1247) #3, !dbg !113 + br label %__nv_exp2f.exit1417, !dbg !113 + +1286: ; preds = %__nv_exp2f.exit1414 + %1287 = tail call float @llvm.nvvm.ex2.approx.f(float %1247) #3, !dbg !113 + br label %__nv_exp2f.exit1417, !dbg !113 + +__nv_exp2f.exit1417: ; preds = %1284, %1286 + %.0.i1416 = phi float [ %1285, %1284 ], [ %1287, %1286 ], !dbg !113 + %1288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1418 = icmp eq i32 %1288, 0, !dbg !113 + br i1 %.not.i1418, label %1291, label %1289, !dbg !113 + +1289: ; preds = %__nv_exp2f.exit1417 + %1290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1248) #3, !dbg !113 + br label %__nv_exp2f.exit1420, !dbg !113 + +1291: ; preds = %__nv_exp2f.exit1417 + %1292 = tail call float @llvm.nvvm.ex2.approx.f(float %1248) #3, !dbg !113 + br label %__nv_exp2f.exit1420, !dbg !113 + +__nv_exp2f.exit1420: ; preds = %1289, %1291 + %.0.i1419 = phi float [ %1290, %1289 ], [ %1292, %1291 ], !dbg !113 + %1293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1421 = icmp eq i32 %1293, 0, !dbg !113 + br i1 %.not.i1421, label %1296, label %1294, !dbg !113 + +1294: ; preds = %__nv_exp2f.exit1420 + %1295 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1249) #3, !dbg !113 + br label %__nv_exp2f.exit1423, !dbg !113 + +1296: ; preds = %__nv_exp2f.exit1420 + %1297 = tail call float @llvm.nvvm.ex2.approx.f(float %1249) #3, !dbg !113 + br label %__nv_exp2f.exit1423, !dbg !113 + +__nv_exp2f.exit1423: ; preds = %1294, %1296 + %.0.i1422 = phi float [ %1295, %1294 ], [ %1297, %1296 ], !dbg !113 + %1298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1424 = icmp eq i32 %1298, 0, !dbg !113 + br i1 %.not.i1424, label %1301, label %1299, !dbg !113 + +1299: ; preds = %__nv_exp2f.exit1423 + %1300 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1250) #3, !dbg !113 + br label %__nv_exp2f.exit1426, !dbg !113 + +1301: ; preds = %__nv_exp2f.exit1423 + %1302 = tail call float @llvm.nvvm.ex2.approx.f(float %1250) #3, !dbg !113 + br label %__nv_exp2f.exit1426, !dbg !113 + +__nv_exp2f.exit1426: ; preds = %1299, %1301 + %.0.i1425 = phi float [ %1300, %1299 ], [ %1302, %1301 ], !dbg !113 + %1303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1427 = icmp eq i32 %1303, 0, !dbg !113 + br i1 %.not.i1427, label %1306, label %1304, !dbg !113 + +1304: ; preds = %__nv_exp2f.exit1426 + %1305 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1251) #3, !dbg !113 + br label %__nv_exp2f.exit1429, !dbg !113 + +1306: ; preds = %__nv_exp2f.exit1426 + %1307 = tail call float @llvm.nvvm.ex2.approx.f(float %1251) #3, !dbg !113 + br label %__nv_exp2f.exit1429, !dbg !113 + +__nv_exp2f.exit1429: ; preds = %1304, %1306 + %.0.i1428 = phi float [ %1305, %1304 ], [ %1307, %1306 ], !dbg !113 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1430 = icmp eq i32 %1308, 0, !dbg !113 + br i1 %.not.i1430, label %1311, label %1309, !dbg !113 + +1309: ; preds = %__nv_exp2f.exit1429 + %1310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1252) #3, !dbg !113 + br label %__nv_exp2f.exit1432, !dbg !113 + +1311: ; preds = %__nv_exp2f.exit1429 + %1312 = tail call float @llvm.nvvm.ex2.approx.f(float %1252) #3, !dbg !113 + br label %__nv_exp2f.exit1432, !dbg !113 + +__nv_exp2f.exit1432: ; preds = %1309, %1311 + %.0.i1431 = phi float [ %1310, %1309 ], [ %1312, %1311 ], !dbg !113 + %1313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1433 = icmp eq i32 %1313, 0, !dbg !113 + br i1 %.not.i1433, label %1316, label %1314, !dbg !113 + +1314: ; preds = %__nv_exp2f.exit1432 + %1315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1253) #3, !dbg !113 + br label %__nv_exp2f.exit1435, !dbg !113 + +1316: ; preds = %__nv_exp2f.exit1432 + %1317 = tail call float @llvm.nvvm.ex2.approx.f(float %1253) #3, !dbg !113 + br label %__nv_exp2f.exit1435, !dbg !113 + +__nv_exp2f.exit1435: ; preds = %1314, %1316 + %.0.i1434 = phi float [ %1315, %1314 ], [ %1317, %1316 ], !dbg !113 + %1318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1436 = icmp eq i32 %1318, 0, !dbg !113 + br i1 %.not.i1436, label %1321, label %1319, !dbg !113 + +1319: ; preds = %__nv_exp2f.exit1435 + %1320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1254) #3, !dbg !113 + br label %__nv_exp2f.exit1438, !dbg !113 + +1321: ; preds = %__nv_exp2f.exit1435 + %1322 = tail call float @llvm.nvvm.ex2.approx.f(float %1254) #3, !dbg !113 + br label %__nv_exp2f.exit1438, !dbg !113 + +__nv_exp2f.exit1438: ; preds = %1319, %1321 + %.0.i1437 = phi float [ %1320, %1319 ], [ %1322, %1321 ], !dbg !113 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1439 = icmp eq i32 %1323, 0, !dbg !113 + br i1 %.not.i1439, label %1326, label %1324, !dbg !113 + +1324: ; preds = %__nv_exp2f.exit1438 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1255) #3, !dbg !113 + br label %__nv_exp2f.exit1441, !dbg !113 + +1326: ; preds = %__nv_exp2f.exit1438 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1255) #3, !dbg !113 + br label %__nv_exp2f.exit1441, !dbg !113 + +__nv_exp2f.exit1441: ; preds = %1324, %1326 + %.0.i1440 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !113 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1442 = icmp eq i32 %1328, 0, !dbg !113 + br i1 %.not.i1442, label %1331, label %1329, !dbg !113 + +1329: ; preds = %__nv_exp2f.exit1441 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1256) #3, !dbg !113 + br label %__nv_exp2f.exit1444, !dbg !113 + +1331: ; preds = %__nv_exp2f.exit1441 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1256) #3, !dbg !113 + br label %__nv_exp2f.exit1444, !dbg !113 + +__nv_exp2f.exit1444: ; preds = %1329, %1331 + %.0.i1443 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !113 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1445 = icmp eq i32 %1333, 0, !dbg !113 + br i1 %.not.i1445, label %1336, label %1334, !dbg !113 + +1334: ; preds = %__nv_exp2f.exit1444 + %1335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1257) #3, !dbg !113 + br label %__nv_exp2f.exit1447, !dbg !113 + +1336: ; preds = %__nv_exp2f.exit1444 + %1337 = tail call float @llvm.nvvm.ex2.approx.f(float %1257) #3, !dbg !113 + br label %__nv_exp2f.exit1447, !dbg !113 + +__nv_exp2f.exit1447: ; preds = %1334, %1336 + %.0.i1446 = phi float [ %1335, %1334 ], [ %1337, %1336 ], !dbg !113 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1448 = icmp eq i32 %1338, 0, !dbg !113 + br i1 %.not.i1448, label %1341, label %1339, !dbg !113 + +1339: ; preds = %__nv_exp2f.exit1447 + %1340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1258) #3, !dbg !113 + br label %__nv_exp2f.exit1450, !dbg !113 + +1341: ; preds = %__nv_exp2f.exit1447 + %1342 = tail call float @llvm.nvvm.ex2.approx.f(float %1258) #3, !dbg !113 + br label %__nv_exp2f.exit1450, !dbg !113 + +__nv_exp2f.exit1450: ; preds = %1339, %1341 + %.0.i1449 = phi float [ %1340, %1339 ], [ %1342, %1341 ], !dbg !113 + %1343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1451 = icmp eq i32 %1343, 0, !dbg !113 + br i1 %.not.i1451, label %1346, label %1344, !dbg !113 + +1344: ; preds = %__nv_exp2f.exit1450 + %1345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1259) #3, !dbg !113 + br label %__nv_exp2f.exit1453, !dbg !113 + +1346: ; preds = %__nv_exp2f.exit1450 + %1347 = tail call float @llvm.nvvm.ex2.approx.f(float %1259) #3, !dbg !113 + br label %__nv_exp2f.exit1453, !dbg !113 + +__nv_exp2f.exit1453: ; preds = %1344, %1346 + %.0.i1452 = phi float [ %1345, %1344 ], [ %1347, %1346 ], !dbg !113 + %1348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1454 = icmp eq i32 %1348, 0, !dbg !113 + br i1 %.not.i1454, label %1351, label %1349, !dbg !113 + +1349: ; preds = %__nv_exp2f.exit1453 + %1350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1260) #3, !dbg !113 + br label %__nv_exp2f.exit1456, !dbg !113 + +1351: ; preds = %__nv_exp2f.exit1453 + %1352 = tail call float @llvm.nvvm.ex2.approx.f(float %1260) #3, !dbg !113 + br label %__nv_exp2f.exit1456, !dbg !113 + +__nv_exp2f.exit1456: ; preds = %1349, %1351 + %.0.i1455 = phi float [ %1350, %1349 ], [ %1352, %1351 ], !dbg !113 + %1353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1457 = icmp eq i32 %1353, 0, !dbg !113 + br i1 %.not.i1457, label %1356, label %1354, !dbg !113 + +1354: ; preds = %__nv_exp2f.exit1456 + %1355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1261) #3, !dbg !113 + br label %__nv_exp2f.exit1459, !dbg !113 + +1356: ; preds = %__nv_exp2f.exit1456 + %1357 = tail call float @llvm.nvvm.ex2.approx.f(float %1261) #3, !dbg !113 + br label %__nv_exp2f.exit1459, !dbg !113 + +__nv_exp2f.exit1459: ; preds = %1354, %1356 + %.0.i1458 = phi float [ %1355, %1354 ], [ %1357, %1356 ], !dbg !113 + %1358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1460 = icmp eq i32 %1358, 0, !dbg !113 + br i1 %.not.i1460, label %1361, label %1359, !dbg !113 + +1359: ; preds = %__nv_exp2f.exit1459 + %1360 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1262) #3, !dbg !113 + br label %__nv_exp2f.exit1462, !dbg !113 + +1361: ; preds = %__nv_exp2f.exit1459 + %1362 = tail call float @llvm.nvvm.ex2.approx.f(float %1262) #3, !dbg !113 + br label %__nv_exp2f.exit1462, !dbg !113 + +__nv_exp2f.exit1462: ; preds = %1359, %1361 + %.0.i1461 = phi float [ %1360, %1359 ], [ %1362, %1361 ], !dbg !113 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1463 = icmp eq i32 %1363, 0, !dbg !113 + br i1 %.not.i1463, label %1366, label %1364, !dbg !113 + +1364: ; preds = %__nv_exp2f.exit1462 + %1365 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1263) #3, !dbg !113 + br label %__nv_exp2f.exit1465, !dbg !113 + +1366: ; preds = %__nv_exp2f.exit1462 + %1367 = tail call float @llvm.nvvm.ex2.approx.f(float %1263) #3, !dbg !113 + br label %__nv_exp2f.exit1465, !dbg !113 + +__nv_exp2f.exit1465: ; preds = %1364, %1366 + %.0.i1464 = phi float [ %1365, %1364 ], [ %1367, %1366 ], !dbg !113 + %1368 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1466 = icmp eq i32 %1368, 0, !dbg !113 + br i1 %.not.i1466, label %1371, label %1369, !dbg !113 + +1369: ; preds = %__nv_exp2f.exit1465 + %1370 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1264) #3, !dbg !113 + br label %__nv_exp2f.exit1468, !dbg !113 + +1371: ; preds = %__nv_exp2f.exit1465 + %1372 = tail call float @llvm.nvvm.ex2.approx.f(float %1264) #3, !dbg !113 + br label %__nv_exp2f.exit1468, !dbg !113 + +__nv_exp2f.exit1468: ; preds = %1369, %1371 + %.0.i1467 = phi float [ %1370, %1369 ], [ %1372, %1371 ], !dbg !113 + %1373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1469 = icmp eq i32 %1373, 0, !dbg !113 + br i1 %.not.i1469, label %1376, label %1374, !dbg !113 + +1374: ; preds = %__nv_exp2f.exit1468 + %1375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1265) #3, !dbg !113 + br label %__nv_exp2f.exit1471, !dbg !113 + +1376: ; preds = %__nv_exp2f.exit1468 + %1377 = tail call float @llvm.nvvm.ex2.approx.f(float %1265) #3, !dbg !113 + br label %__nv_exp2f.exit1471, !dbg !113 + +__nv_exp2f.exit1471: ; preds = %1374, %1376 + %.0.i1470 = phi float [ %1375, %1374 ], [ %1377, %1376 ], !dbg !113 + %1378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1472 = icmp eq i32 %1378, 0, !dbg !113 + br i1 %.not.i1472, label %1381, label %1379, !dbg !113 + +1379: ; preds = %__nv_exp2f.exit1471 + %1380 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1266) #3, !dbg !113 + br label %__nv_exp2f.exit1474, !dbg !113 + +1381: ; preds = %__nv_exp2f.exit1471 + %1382 = tail call float @llvm.nvvm.ex2.approx.f(float %1266) #3, !dbg !113 + br label %__nv_exp2f.exit1474, !dbg !113 + +__nv_exp2f.exit1474: ; preds = %1379, %1381 + %.0.i1473 = phi float [ %1380, %1379 ], [ %1382, %1381 ], !dbg !113 + %1383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1475 = icmp eq i32 %1383, 0, !dbg !113 + br i1 %.not.i1475, label %1386, label %1384, !dbg !113 + +1384: ; preds = %__nv_exp2f.exit1474 + %1385 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1267) #3, !dbg !113 + br label %__nv_exp2f.exit1477, !dbg !113 + +1386: ; preds = %__nv_exp2f.exit1474 + %1387 = tail call float @llvm.nvvm.ex2.approx.f(float %1267) #3, !dbg !113 + br label %__nv_exp2f.exit1477, !dbg !113 + +__nv_exp2f.exit1477: ; preds = %1384, %1386 + %.0.i1476 = phi float [ %1385, %1384 ], [ %1387, %1386 ], !dbg !113 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1478 = icmp eq i32 %1388, 0, !dbg !113 + br i1 %.not.i1478, label %1391, label %1389, !dbg !113 + +1389: ; preds = %__nv_exp2f.exit1477 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1268) #3, !dbg !113 + br label %__nv_exp2f.exit1480, !dbg !113 + +1391: ; preds = %__nv_exp2f.exit1477 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1268) #3, !dbg !113 + br label %__nv_exp2f.exit1480, !dbg !113 + +__nv_exp2f.exit1480: ; preds = %1389, %1391 + %.0.i1479 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !113 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1481 = icmp eq i32 %1393, 0, !dbg !113 + br i1 %.not.i1481, label %1396, label %1394, !dbg !113 + +1394: ; preds = %__nv_exp2f.exit1480 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1269) #3, !dbg !113 + br label %__nv_exp2f.exit1483, !dbg !113 + +1396: ; preds = %__nv_exp2f.exit1480 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1269) #3, !dbg !113 + br label %__nv_exp2f.exit1483, !dbg !113 + +__nv_exp2f.exit1483: ; preds = %1394, %1396 + %.0.i1482 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !113 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1484 = icmp eq i32 %1398, 0, !dbg !113 + br i1 %.not.i1484, label %1401, label %1399, !dbg !113 + +1399: ; preds = %__nv_exp2f.exit1483 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1270) #3, !dbg !113 + br label %__nv_exp2f.exit1486, !dbg !113 + +1401: ; preds = %__nv_exp2f.exit1483 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1270) #3, !dbg !113 + br label %__nv_exp2f.exit1486, !dbg !113 + +__nv_exp2f.exit1486: ; preds = %1399, %1401 + %.0.i1485 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !113 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1487 = icmp eq i32 %1403, 0, !dbg !113 + br i1 %.not.i1487, label %1406, label %1404, !dbg !113 + +1404: ; preds = %__nv_exp2f.exit1486 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1271) #3, !dbg !113 + br label %__nv_exp2f.exit1489, !dbg !113 + +1406: ; preds = %__nv_exp2f.exit1486 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1271) #3, !dbg !113 + br label %__nv_exp2f.exit1489, !dbg !113 + +__nv_exp2f.exit1489: ; preds = %1404, %1406 + %.0.i1488 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !113 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1490 = icmp eq i32 %1408, 0, !dbg !113 + br i1 %.not.i1490, label %1411, label %1409, !dbg !113 + +1409: ; preds = %__nv_exp2f.exit1489 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1272) #3, !dbg !113 + br label %__nv_exp2f.exit1492, !dbg !113 + +1411: ; preds = %__nv_exp2f.exit1489 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1272) #3, !dbg !113 + br label %__nv_exp2f.exit1492, !dbg !113 + +__nv_exp2f.exit1492: ; preds = %1409, %1411 + %.0.i1491 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !113 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1493 = icmp eq i32 %1413, 0, !dbg !113 + br i1 %.not.i1493, label %1416, label %1414, !dbg !113 + +1414: ; preds = %__nv_exp2f.exit1492 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1273) #3, !dbg !113 + br label %__nv_exp2f.exit1495, !dbg !113 + +1416: ; preds = %__nv_exp2f.exit1492 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1273) #3, !dbg !113 + br label %__nv_exp2f.exit1495, !dbg !113 + +__nv_exp2f.exit1495: ; preds = %1414, %1416 + %.0.i1494 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !113 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1496 = icmp eq i32 %1418, 0, !dbg !113 + br i1 %.not.i1496, label %1421, label %1419, !dbg !113 + +1419: ; preds = %__nv_exp2f.exit1495 + %1420 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1274) #3, !dbg !113 + br label %__nv_exp2f.exit1498, !dbg !113 + +1421: ; preds = %__nv_exp2f.exit1495 + %1422 = tail call float @llvm.nvvm.ex2.approx.f(float %1274) #3, !dbg !113 + br label %__nv_exp2f.exit1498, !dbg !113 + +__nv_exp2f.exit1498: ; preds = %1419, %1421 + %.0.i1497 = phi float [ %1420, %1419 ], [ %1422, %1421 ], !dbg !113 + %1423 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1499 = icmp eq i32 %1423, 0, !dbg !113 + br i1 %.not.i1499, label %1426, label %1424, !dbg !113 + +1424: ; preds = %__nv_exp2f.exit1498 + %1425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1275) #3, !dbg !113 + br label %__nv_exp2f.exit1501, !dbg !113 + +1426: ; preds = %__nv_exp2f.exit1498 + %1427 = tail call float @llvm.nvvm.ex2.approx.f(float %1275) #3, !dbg !113 + br label %__nv_exp2f.exit1501, !dbg !113 + +__nv_exp2f.exit1501: ; preds = %1424, %1426 + %.0.i1500 = phi float [ %1425, %1424 ], [ %1427, %1426 ], !dbg !113 + %1428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1502 = icmp eq i32 %1428, 0, !dbg !113 + br i1 %.not.i1502, label %1431, label %1429, !dbg !113 + +1429: ; preds = %__nv_exp2f.exit1501 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1276) #3, !dbg !113 + br label %__nv_exp2f.exit1504, !dbg !113 + +1431: ; preds = %__nv_exp2f.exit1501 + %1432 = tail call float @llvm.nvvm.ex2.approx.f(float %1276) #3, !dbg !113 + br label %__nv_exp2f.exit1504, !dbg !113 + +__nv_exp2f.exit1504: ; preds = %1429, %1431 + %.0.i1503 = phi float [ %1430, %1429 ], [ %1432, %1431 ], !dbg !113 + %1433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !113 + %.not.i1505 = icmp eq i32 %1433, 0, !dbg !113 + br i1 %.not.i1505, label %1436, label %1434, !dbg !113 + +1434: ; preds = %__nv_exp2f.exit1504 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1277) #3, !dbg !113 + br label %__nv_exp2f.exit1507, !dbg !113 + +1436: ; preds = %__nv_exp2f.exit1504 + %1437 = tail call float @llvm.nvvm.ex2.approx.f(float %1277) #3, !dbg !113 + br label %__nv_exp2f.exit1507, !dbg !113 + +__nv_exp2f.exit1507: ; preds = %1434, %1436 + %.0.i1506 = phi float [ %1435, %1434 ], [ %1437, %1436 ], !dbg !113 + %1438 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %527, !dbg !87 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !114 + %1439 = add i32 %531, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1440 = lshr exact i32 %1439, 4, !dbg !114 + %1441 = and i32 %1440, 16383, !dbg !114 + %1442 = zext nneg i32 %1441 to i64, !dbg !114 + %1443 = or disjoint i64 %1442, 4611686293372403712, !dbg !114 + %1444 = ptrtoint ptr addrspace(3) %1438 to i32, !dbg !114 + %1445 = lshr exact i32 %1444, 4, !dbg !114 + %1446 = and i32 %1445, 16383, !dbg !114 + %1447 = zext nneg i32 %1446 to i64, !dbg !114 + %1448 = or disjoint i64 %1447, 4611686293338849280, !dbg !114 + %1449 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %1443, i64 %1448) #3, !dbg !114 + %1450 = add i32 %543, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1451 = lshr exact i32 %1450, 4, !dbg !114 + %1452 = and i32 %1451, 16383, !dbg !114 + %1453 = zext nneg i32 %1452 to i64, !dbg !114 + %1454 = or disjoint i64 %1453, 4611686293372403712, !dbg !114 + %1455 = add i32 %1444, 32, !dbg !114 + %1456 = lshr exact i32 %1455, 4, !dbg !114 + %1457 = and i32 %1456, 16383, !dbg !114 + %1458 = zext nneg i32 %1457 to i64, !dbg !114 + %1459 = or disjoint i64 %1458, 4611686293338849280, !dbg !114 + %1460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 0, !dbg !114 + %1461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 1, !dbg !114 + %1462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 2, !dbg !114 + %1463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 3, !dbg !114 + %1464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 4, !dbg !114 + %1465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 5, !dbg !114 + %1466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 6, !dbg !114 + %1467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 7, !dbg !114 + %1468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 8, !dbg !114 + %1469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 9, !dbg !114 + %1470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 10, !dbg !114 + %1471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 11, !dbg !114 + %1472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 12, !dbg !114 + %1473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 13, !dbg !114 + %1474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 14, !dbg !114 + %1475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 15, !dbg !114 + %1476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 16, !dbg !114 + %1477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 17, !dbg !114 + %1478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 18, !dbg !114 + %1479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 19, !dbg !114 + %1480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 20, !dbg !114 + %1481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 21, !dbg !114 + %1482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 22, !dbg !114 + %1483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 23, !dbg !114 + %1484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 24, !dbg !114 + %1485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 25, !dbg !114 + %1486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 26, !dbg !114 + %1487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 27, !dbg !114 + %1488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 28, !dbg !114 + %1489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 29, !dbg !114 + %1490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 30, !dbg !114 + %1491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1449, 31, !dbg !114 + %1492 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1460, float %1461, float %1462, float %1463, float %1464, float %1465, float %1466, float %1467, float %1468, float %1469, float %1470, float %1471, float %1472, float %1473, float %1474, float %1475, float %1476, float %1477, float %1478, float %1479, float %1480, float %1481, float %1482, float %1483, float %1484, float %1485, float %1486, float %1487, float %1488, float %1489, float %1490, float %1491, i64 %1454, i64 %1459, i1 true) #3, !dbg !114 + %1493 = add i32 %587, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1494 = lshr exact i32 %1493, 4, !dbg !114 + %1495 = and i32 %1494, 16383, !dbg !114 + %1496 = zext nneg i32 %1495 to i64, !dbg !114 + %1497 = or disjoint i64 %1496, 4611686293372403712, !dbg !114 + %1498 = add i32 %1444, 64, !dbg !114 + %1499 = lshr exact i32 %1498, 4, !dbg !114 + %1500 = and i32 %1499, 16383, !dbg !114 + %1501 = zext nneg i32 %1500 to i64, !dbg !114 + %1502 = or disjoint i64 %1501, 4611686293338849280, !dbg !114 + %1503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 0, !dbg !114 + %1504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 1, !dbg !114 + %1505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 2, !dbg !114 + %1506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 3, !dbg !114 + %1507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 4, !dbg !114 + %1508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 5, !dbg !114 + %1509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 6, !dbg !114 + %1510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 7, !dbg !114 + %1511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 8, !dbg !114 + %1512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 9, !dbg !114 + %1513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 10, !dbg !114 + %1514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 11, !dbg !114 + %1515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 12, !dbg !114 + %1516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 13, !dbg !114 + %1517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 14, !dbg !114 + %1518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 15, !dbg !114 + %1519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 16, !dbg !114 + %1520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 17, !dbg !114 + %1521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 18, !dbg !114 + %1522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 19, !dbg !114 + %1523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 20, !dbg !114 + %1524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 21, !dbg !114 + %1525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 22, !dbg !114 + %1526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 23, !dbg !114 + %1527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 24, !dbg !114 + %1528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 25, !dbg !114 + %1529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 26, !dbg !114 + %1530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 27, !dbg !114 + %1531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 28, !dbg !114 + %1532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 29, !dbg !114 + %1533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 30, !dbg !114 + %1534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1492, 31, !dbg !114 + %1535 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1503, float %1504, float %1505, float %1506, float %1507, float %1508, float %1509, float %1510, float %1511, float %1512, float %1513, float %1514, float %1515, float %1516, float %1517, float %1518, float %1519, float %1520, float %1521, float %1522, float %1523, float %1524, float %1525, float %1526, float %1527, float %1528, float %1529, float %1530, float %1531, float %1532, float %1533, float %1534, i64 %1497, i64 %1502, i1 true) #3, !dbg !114 + %1536 = add i32 %631, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1537 = lshr exact i32 %1536, 4, !dbg !114 + %1538 = and i32 %1537, 16383, !dbg !114 + %1539 = zext nneg i32 %1538 to i64, !dbg !114 + %1540 = or disjoint i64 %1539, 4611686293372403712, !dbg !114 + %1541 = add i32 %1444, 96, !dbg !114 + %1542 = lshr exact i32 %1541, 4, !dbg !114 + %1543 = and i32 %1542, 16383, !dbg !114 + %1544 = zext nneg i32 %1543 to i64, !dbg !114 + %1545 = or disjoint i64 %1544, 4611686293338849280, !dbg !114 + %1546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 0, !dbg !114 + %1547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 1, !dbg !114 + %1548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 2, !dbg !114 + %1549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 3, !dbg !114 + %1550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 4, !dbg !114 + %1551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 5, !dbg !114 + %1552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 6, !dbg !114 + %1553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 7, !dbg !114 + %1554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 8, !dbg !114 + %1555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 9, !dbg !114 + %1556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 10, !dbg !114 + %1557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 11, !dbg !114 + %1558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 12, !dbg !114 + %1559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 13, !dbg !114 + %1560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 14, !dbg !114 + %1561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 15, !dbg !114 + %1562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 16, !dbg !114 + %1563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 17, !dbg !114 + %1564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 18, !dbg !114 + %1565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 19, !dbg !114 + %1566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 20, !dbg !114 + %1567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 21, !dbg !114 + %1568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 22, !dbg !114 + %1569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 23, !dbg !114 + %1570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 24, !dbg !114 + %1571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 25, !dbg !114 + %1572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 26, !dbg !114 + %1573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 27, !dbg !114 + %1574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 28, !dbg !114 + %1575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 29, !dbg !114 + %1576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 30, !dbg !114 + %1577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1535, 31, !dbg !114 + %1578 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1546, float %1547, float %1548, float %1549, float %1550, float %1551, float %1552, float %1553, float %1554, float %1555, float %1556, float %1557, float %1558, float %1559, float %1560, float %1561, float %1562, float %1563, float %1564, float %1565, float %1566, float %1567, float %1568, float %1569, float %1570, float %1571, float %1572, float %1573, float %1574, float %1575, float %1576, float %1577, i64 %1540, i64 %1545, i1 true) #3, !dbg !114 + %1579 = add i32 %675, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1580 = lshr exact i32 %1579, 4, !dbg !114 + %1581 = and i32 %1580, 16383, !dbg !114 + %1582 = zext nneg i32 %1581 to i64, !dbg !114 + %1583 = or disjoint i64 %1582, 4611686293372403712, !dbg !114 + %1584 = add i32 %1444, 8192, !dbg !114 + %1585 = lshr exact i32 %1584, 4, !dbg !114 + %1586 = and i32 %1585, 16383, !dbg !114 + %1587 = zext nneg i32 %1586 to i64, !dbg !114 + %1588 = or disjoint i64 %1587, 4611686293338849280, !dbg !114 + %1589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 0, !dbg !114 + %1590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 1, !dbg !114 + %1591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 2, !dbg !114 + %1592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 3, !dbg !114 + %1593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 4, !dbg !114 + %1594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 5, !dbg !114 + %1595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 6, !dbg !114 + %1596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 7, !dbg !114 + %1597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 8, !dbg !114 + %1598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 9, !dbg !114 + %1599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 10, !dbg !114 + %1600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 11, !dbg !114 + %1601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 12, !dbg !114 + %1602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 13, !dbg !114 + %1603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 14, !dbg !114 + %1604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 15, !dbg !114 + %1605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 16, !dbg !114 + %1606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 17, !dbg !114 + %1607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 18, !dbg !114 + %1608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 19, !dbg !114 + %1609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 20, !dbg !114 + %1610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 21, !dbg !114 + %1611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 22, !dbg !114 + %1612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 23, !dbg !114 + %1613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 24, !dbg !114 + %1614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 25, !dbg !114 + %1615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 26, !dbg !114 + %1616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 27, !dbg !114 + %1617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 28, !dbg !114 + %1618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 29, !dbg !114 + %1619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 30, !dbg !114 + %1620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1578, 31, !dbg !114 + %1621 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1589, float %1590, float %1591, float %1592, float %1593, float %1594, float %1595, float %1596, float %1597, float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, i64 %1583, i64 %1588, i1 true) #3, !dbg !114 + %1622 = add i32 %719, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1623 = lshr exact i32 %1622, 4, !dbg !114 + %1624 = and i32 %1623, 16383, !dbg !114 + %1625 = zext nneg i32 %1624 to i64, !dbg !114 + %1626 = or disjoint i64 %1625, 4611686293372403712, !dbg !114 + %1627 = add i32 %1444, 8224, !dbg !114 + %1628 = lshr exact i32 %1627, 4, !dbg !114 + %1629 = and i32 %1628, 16383, !dbg !114 + %1630 = zext nneg i32 %1629 to i64, !dbg !114 + %1631 = or disjoint i64 %1630, 4611686293338849280, !dbg !114 + %1632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 0, !dbg !114 + %1633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 1, !dbg !114 + %1634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 2, !dbg !114 + %1635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 3, !dbg !114 + %1636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 4, !dbg !114 + %1637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 5, !dbg !114 + %1638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 6, !dbg !114 + %1639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 7, !dbg !114 + %1640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 8, !dbg !114 + %1641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 9, !dbg !114 + %1642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 10, !dbg !114 + %1643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 11, !dbg !114 + %1644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 12, !dbg !114 + %1645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 13, !dbg !114 + %1646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 14, !dbg !114 + %1647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 15, !dbg !114 + %1648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 16, !dbg !114 + %1649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 17, !dbg !114 + %1650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 18, !dbg !114 + %1651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 19, !dbg !114 + %1652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 20, !dbg !114 + %1653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 21, !dbg !114 + %1654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 22, !dbg !114 + %1655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 23, !dbg !114 + %1656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 24, !dbg !114 + %1657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 25, !dbg !114 + %1658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 26, !dbg !114 + %1659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 27, !dbg !114 + %1660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 28, !dbg !114 + %1661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 29, !dbg !114 + %1662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 30, !dbg !114 + %1663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1621, 31, !dbg !114 + %1664 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, float %1662, float %1663, i64 %1626, i64 %1631, i1 true) #3, !dbg !114 + %1665 = add i32 %763, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1666 = lshr exact i32 %1665, 4, !dbg !114 + %1667 = and i32 %1666, 16383, !dbg !114 + %1668 = zext nneg i32 %1667 to i64, !dbg !114 + %1669 = or disjoint i64 %1668, 4611686293372403712, !dbg !114 + %1670 = add i32 %1444, 8256, !dbg !114 + %1671 = lshr exact i32 %1670, 4, !dbg !114 + %1672 = and i32 %1671, 16383, !dbg !114 + %1673 = zext nneg i32 %1672 to i64, !dbg !114 + %1674 = or disjoint i64 %1673, 4611686293338849280, !dbg !114 + %1675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 0, !dbg !114 + %1676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 1, !dbg !114 + %1677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 2, !dbg !114 + %1678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 3, !dbg !114 + %1679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 4, !dbg !114 + %1680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 5, !dbg !114 + %1681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 6, !dbg !114 + %1682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 7, !dbg !114 + %1683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 8, !dbg !114 + %1684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 9, !dbg !114 + %1685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 10, !dbg !114 + %1686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 11, !dbg !114 + %1687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 12, !dbg !114 + %1688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 13, !dbg !114 + %1689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 14, !dbg !114 + %1690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 15, !dbg !114 + %1691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 16, !dbg !114 + %1692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 17, !dbg !114 + %1693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 18, !dbg !114 + %1694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 19, !dbg !114 + %1695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 20, !dbg !114 + %1696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 21, !dbg !114 + %1697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 22, !dbg !114 + %1698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 23, !dbg !114 + %1699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 24, !dbg !114 + %1700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 25, !dbg !114 + %1701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 26, !dbg !114 + %1702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 27, !dbg !114 + %1703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 28, !dbg !114 + %1704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 29, !dbg !114 + %1705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 30, !dbg !114 + %1706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1664, 31, !dbg !114 + %1707 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1675, float %1676, float %1677, float %1678, float %1679, float %1680, float %1681, float %1682, float %1683, float %1684, float %1685, float %1686, float %1687, float %1688, float %1689, float %1690, float %1691, float %1692, float %1693, float %1694, float %1695, float %1696, float %1697, float %1698, float %1699, float %1700, float %1701, float %1702, float %1703, float %1704, float %1705, float %1706, i64 %1669, i64 %1674, i1 true) #3, !dbg !114 + %1708 = add i32 %807, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !114 + %1709 = lshr exact i32 %1708, 4, !dbg !114 + %1710 = and i32 %1709, 16383, !dbg !114 + %1711 = zext nneg i32 %1710 to i64, !dbg !114 + %1712 = or disjoint i64 %1711, 4611686293372403712, !dbg !114 + %1713 = add i32 %1444, 8288, !dbg !114 + %1714 = lshr exact i32 %1713, 4, !dbg !114 + %1715 = and i32 %1714, 16383, !dbg !114 + %1716 = zext nneg i32 %1715 to i64, !dbg !114 + %1717 = or disjoint i64 %1716, 4611686293338849280, !dbg !114 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 0, !dbg !114 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 1, !dbg !114 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 2, !dbg !114 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 3, !dbg !114 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 4, !dbg !114 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 5, !dbg !114 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 6, !dbg !114 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 7, !dbg !114 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 8, !dbg !114 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 9, !dbg !114 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 10, !dbg !114 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 11, !dbg !114 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 12, !dbg !114 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 13, !dbg !114 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 14, !dbg !114 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 15, !dbg !114 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 16, !dbg !114 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 17, !dbg !114 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 18, !dbg !114 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 19, !dbg !114 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 20, !dbg !114 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 21, !dbg !114 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 22, !dbg !114 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 23, !dbg !114 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 24, !dbg !114 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 25, !dbg !114 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 26, !dbg !114 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 27, !dbg !114 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 28, !dbg !114 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 29, !dbg !114 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 30, !dbg !114 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1707, 31, !dbg !114 + %1750 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, i64 %1712, i64 %1717, i1 true) #3, !dbg !114 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 0, !dbg !114 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 1, !dbg !114 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 2, !dbg !114 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 3, !dbg !114 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 4, !dbg !114 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 5, !dbg !114 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 6, !dbg !114 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 7, !dbg !114 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 8, !dbg !114 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 9, !dbg !114 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 10, !dbg !114 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 11, !dbg !114 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 12, !dbg !114 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 13, !dbg !114 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 14, !dbg !114 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 15, !dbg !114 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 16, !dbg !114 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 17, !dbg !114 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 18, !dbg !114 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 19, !dbg !114 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 20, !dbg !114 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 21, !dbg !114 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 22, !dbg !114 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 23, !dbg !114 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 24, !dbg !114 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 25, !dbg !114 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 26, !dbg !114 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 27, !dbg !114 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 28, !dbg !114 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 29, !dbg !114 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 30, !dbg !114 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1750, 31, !dbg !114 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !114 + %1783 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %1438, i32 0, i32 0) #3, !dbg !114 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 0, !dbg !114 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 1, !dbg !114 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 2, !dbg !114 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 3, !dbg !114 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 4, !dbg !114 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 5, !dbg !114 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 6, !dbg !114 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 7, !dbg !114 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 8, !dbg !114 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 9, !dbg !114 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 10, !dbg !114 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 11, !dbg !114 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 12, !dbg !114 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 13, !dbg !114 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 14, !dbg !114 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 15, !dbg !114 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 16, !dbg !114 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 17, !dbg !114 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 18, !dbg !114 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 19, !dbg !114 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 20, !dbg !114 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 21, !dbg !114 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 22, !dbg !114 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 23, !dbg !114 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 24, !dbg !114 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 25, !dbg !114 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 26, !dbg !114 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 27, !dbg !114 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 28, !dbg !114 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 29, !dbg !114 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 30, !dbg !114 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1783, 31, !dbg !114 + %1816 = fsub float %1784, %345, !dbg !115 + %1817 = fsub float %1785, %345, !dbg !115 + %1818 = fsub float %1786, %347, !dbg !115 + %1819 = fsub float %1787, %347, !dbg !115 + %1820 = fsub float %1788, %345, !dbg !115 + %1821 = fsub float %1789, %345, !dbg !115 + %1822 = fsub float %1790, %347, !dbg !115 + %1823 = fsub float %1791, %347, !dbg !115 + %1824 = fsub float %1792, %345, !dbg !115 + %1825 = fsub float %1793, %345, !dbg !115 + %1826 = fsub float %1794, %347, !dbg !115 + %1827 = fsub float %1795, %347, !dbg !115 + %1828 = fsub float %1796, %345, !dbg !115 + %1829 = fsub float %1797, %345, !dbg !115 + %1830 = fsub float %1798, %347, !dbg !115 + %1831 = fsub float %1799, %347, !dbg !115 + %1832 = fsub float %1800, %345, !dbg !115 + %1833 = fsub float %1801, %345, !dbg !115 + %1834 = fsub float %1802, %347, !dbg !115 + %1835 = fsub float %1803, %347, !dbg !115 + %1836 = fsub float %1804, %345, !dbg !115 + %1837 = fsub float %1805, %345, !dbg !115 + %1838 = fsub float %1806, %347, !dbg !115 + %1839 = fsub float %1807, %347, !dbg !115 + %1840 = fsub float %1808, %345, !dbg !115 + %1841 = fsub float %1809, %345, !dbg !115 + %1842 = fsub float %1810, %347, !dbg !115 + %1843 = fsub float %1811, %347, !dbg !115 + %1844 = fsub float %1812, %345, !dbg !115 + %1845 = fsub float %1813, %345, !dbg !115 + %1846 = fsub float %1814, %347, !dbg !115 + %1847 = fsub float %1815, %347, !dbg !115 + %1848 = fmul float %.0.i1413, %1816, !dbg !116 + %1849 = fmul float %.0.i1416, %1817, !dbg !116 + %1850 = fmul float %.0.i1419, %1818, !dbg !116 + %1851 = fmul float %.0.i1422, %1819, !dbg !116 + %1852 = fmul float %.0.i1425, %1820, !dbg !116 + %1853 = fmul float %.0.i1428, %1821, !dbg !116 + %1854 = fmul float %.0.i1431, %1822, !dbg !116 + %1855 = fmul float %.0.i1434, %1823, !dbg !116 + %1856 = fmul float %.0.i1437, %1824, !dbg !116 + %1857 = fmul float %.0.i1440, %1825, !dbg !116 + %1858 = fmul float %.0.i1443, %1826, !dbg !116 + %1859 = fmul float %.0.i1446, %1827, !dbg !116 + %1860 = fmul float %.0.i1449, %1828, !dbg !116 + %1861 = fmul float %.0.i1452, %1829, !dbg !116 + %1862 = fmul float %.0.i1455, %1830, !dbg !116 + %1863 = fmul float %.0.i1458, %1831, !dbg !116 + %1864 = fmul float %.0.i1461, %1832, !dbg !116 + %1865 = fmul float %.0.i1464, %1833, !dbg !116 + %1866 = fmul float %.0.i1467, %1834, !dbg !116 + %1867 = fmul float %.0.i1470, %1835, !dbg !116 + %1868 = fmul float %.0.i1473, %1836, !dbg !116 + %1869 = fmul float %.0.i1476, %1837, !dbg !116 + %1870 = fmul float %.0.i1479, %1838, !dbg !116 + %1871 = fmul float %.0.i1482, %1839, !dbg !116 + %1872 = fmul float %.0.i1485, %1840, !dbg !116 + %1873 = fmul float %.0.i1488, %1841, !dbg !116 + %1874 = fmul float %.0.i1491, %1842, !dbg !116 + %1875 = fmul float %.0.i1494, %1843, !dbg !116 + %1876 = fmul float %.0.i1497, %1844, !dbg !116 + %1877 = fmul float %.0.i1500, %1845, !dbg !116 + %1878 = fmul float %.0.i1503, %1846, !dbg !116 + %1879 = fmul float %.0.i1506, %1847, !dbg !116 + %1880 = fptrunc float %1848 to bfloat, !dbg !117 + %1881 = select i1 %1150, bfloat %1880, bfloat 0xR0000, !dbg !118 + %1882 = fptrunc float %1849 to bfloat, !dbg !117 + %1883 = select i1 %1151, bfloat %1882, bfloat 0xR0000, !dbg !118 + %1884 = fptrunc float %1850 to bfloat, !dbg !117 + %1885 = select i1 %1152, bfloat %1884, bfloat 0xR0000, !dbg !118 + %1886 = fptrunc float %1851 to bfloat, !dbg !117 + %1887 = select i1 %1153, bfloat %1886, bfloat 0xR0000, !dbg !118 + %1888 = fptrunc float %1852 to bfloat, !dbg !117 + %1889 = select i1 %1154, bfloat %1888, bfloat 0xR0000, !dbg !118 + %1890 = fptrunc float %1853 to bfloat, !dbg !117 + %1891 = select i1 %1155, bfloat %1890, bfloat 0xR0000, !dbg !118 + %1892 = fptrunc float %1854 to bfloat, !dbg !117 + %1893 = select i1 %1156, bfloat %1892, bfloat 0xR0000, !dbg !118 + %1894 = fptrunc float %1855 to bfloat, !dbg !117 + %1895 = select i1 %1157, bfloat %1894, bfloat 0xR0000, !dbg !118 + %1896 = fptrunc float %1856 to bfloat, !dbg !117 + %1897 = select i1 %1158, bfloat %1896, bfloat 0xR0000, !dbg !118 + %1898 = fptrunc float %1857 to bfloat, !dbg !117 + %1899 = select i1 %1159, bfloat %1898, bfloat 0xR0000, !dbg !118 + %1900 = fptrunc float %1858 to bfloat, !dbg !117 + %1901 = select i1 %1160, bfloat %1900, bfloat 0xR0000, !dbg !118 + %1902 = fptrunc float %1859 to bfloat, !dbg !117 + %1903 = select i1 %1161, bfloat %1902, bfloat 0xR0000, !dbg !118 + %1904 = fptrunc float %1860 to bfloat, !dbg !117 + %1905 = select i1 %1162, bfloat %1904, bfloat 0xR0000, !dbg !118 + %1906 = fptrunc float %1861 to bfloat, !dbg !117 + %1907 = select i1 %1163, bfloat %1906, bfloat 0xR0000, !dbg !118 + %1908 = fptrunc float %1862 to bfloat, !dbg !117 + %1909 = select i1 %1164, bfloat %1908, bfloat 0xR0000, !dbg !118 + %1910 = fptrunc float %1863 to bfloat, !dbg !117 + %1911 = select i1 %1165, bfloat %1910, bfloat 0xR0000, !dbg !118 + %1912 = fptrunc float %1864 to bfloat, !dbg !117 + %1913 = select i1 %1166, bfloat %1912, bfloat 0xR0000, !dbg !118 + %1914 = fptrunc float %1865 to bfloat, !dbg !117 + %1915 = select i1 %1167, bfloat %1914, bfloat 0xR0000, !dbg !118 + %1916 = fptrunc float %1866 to bfloat, !dbg !117 + %1917 = select i1 %1168, bfloat %1916, bfloat 0xR0000, !dbg !118 + %1918 = fptrunc float %1867 to bfloat, !dbg !117 + %1919 = select i1 %1169, bfloat %1918, bfloat 0xR0000, !dbg !118 + %1920 = fptrunc float %1868 to bfloat, !dbg !117 + %1921 = select i1 %1170, bfloat %1920, bfloat 0xR0000, !dbg !118 + %1922 = fptrunc float %1869 to bfloat, !dbg !117 + %1923 = select i1 %1171, bfloat %1922, bfloat 0xR0000, !dbg !118 + %1924 = fptrunc float %1870 to bfloat, !dbg !117 + %1925 = select i1 %1172, bfloat %1924, bfloat 0xR0000, !dbg !118 + %1926 = fptrunc float %1871 to bfloat, !dbg !117 + %1927 = select i1 %1173, bfloat %1926, bfloat 0xR0000, !dbg !118 + %1928 = fptrunc float %1872 to bfloat, !dbg !117 + %1929 = select i1 %1174, bfloat %1928, bfloat 0xR0000, !dbg !118 + %1930 = fptrunc float %1873 to bfloat, !dbg !117 + %1931 = select i1 %1175, bfloat %1930, bfloat 0xR0000, !dbg !118 + %1932 = fptrunc float %1874 to bfloat, !dbg !117 + %1933 = select i1 %1176, bfloat %1932, bfloat 0xR0000, !dbg !118 + %1934 = fptrunc float %1875 to bfloat, !dbg !117 + %1935 = select i1 %1177, bfloat %1934, bfloat 0xR0000, !dbg !118 + %1936 = fptrunc float %1876 to bfloat, !dbg !117 + %1937 = select i1 %1178, bfloat %1936, bfloat 0xR0000, !dbg !118 + %1938 = fptrunc float %1877 to bfloat, !dbg !117 + %1939 = select i1 %1179, bfloat %1938, bfloat 0xR0000, !dbg !118 + %1940 = fptrunc float %1878 to bfloat, !dbg !117 + %1941 = select i1 %1180, bfloat %1940, bfloat 0xR0000, !dbg !118 + %1942 = fptrunc float %1879 to bfloat, !dbg !117 + %1943 = select i1 %1181, bfloat %1942, bfloat 0xR0000, !dbg !118 + %1944 = insertelement <2 x bfloat> poison, bfloat %1881, i64 0, !dbg !119 + %1945 = insertelement <2 x bfloat> %1944, bfloat %1883, i64 1, !dbg !119 + %1946 = bitcast <2 x bfloat> %1945 to i32, !dbg !119 + %1947 = insertelement <2 x bfloat> poison, bfloat %1885, i64 0, !dbg !119 + %1948 = insertelement <2 x bfloat> %1947, bfloat %1887, i64 1, !dbg !119 + %1949 = bitcast <2 x bfloat> %1948 to i32, !dbg !119 + %1950 = insertelement <2 x bfloat> poison, bfloat %1889, i64 0, !dbg !119 + %1951 = insertelement <2 x bfloat> %1950, bfloat %1891, i64 1, !dbg !119 + %1952 = bitcast <2 x bfloat> %1951 to i32, !dbg !119 + %1953 = insertelement <2 x bfloat> poison, bfloat %1893, i64 0, !dbg !119 + %1954 = insertelement <2 x bfloat> %1953, bfloat %1895, i64 1, !dbg !119 + %1955 = bitcast <2 x bfloat> %1954 to i32, !dbg !119 + %1956 = insertelement <2 x bfloat> poison, bfloat %1897, i64 0, !dbg !119 + %1957 = insertelement <2 x bfloat> %1956, bfloat %1899, i64 1, !dbg !119 + %1958 = bitcast <2 x bfloat> %1957 to i32, !dbg !119 + %1959 = insertelement <2 x bfloat> poison, bfloat %1901, i64 0, !dbg !119 + %1960 = insertelement <2 x bfloat> %1959, bfloat %1903, i64 1, !dbg !119 + %1961 = bitcast <2 x bfloat> %1960 to i32, !dbg !119 + %1962 = insertelement <2 x bfloat> poison, bfloat %1905, i64 0, !dbg !119 + %1963 = insertelement <2 x bfloat> %1962, bfloat %1907, i64 1, !dbg !119 + %1964 = bitcast <2 x bfloat> %1963 to i32, !dbg !119 + %1965 = insertelement <2 x bfloat> poison, bfloat %1909, i64 0, !dbg !119 + %1966 = insertelement <2 x bfloat> %1965, bfloat %1911, i64 1, !dbg !119 + %1967 = bitcast <2 x bfloat> %1966 to i32, !dbg !119 + %1968 = insertelement <2 x bfloat> poison, bfloat %1913, i64 0, !dbg !119 + %1969 = insertelement <2 x bfloat> %1968, bfloat %1915, i64 1, !dbg !119 + %1970 = bitcast <2 x bfloat> %1969 to i32, !dbg !119 + %1971 = insertelement <2 x bfloat> poison, bfloat %1917, i64 0, !dbg !119 + %1972 = insertelement <2 x bfloat> %1971, bfloat %1919, i64 1, !dbg !119 + %1973 = bitcast <2 x bfloat> %1972 to i32, !dbg !119 + %1974 = insertelement <2 x bfloat> poison, bfloat %1921, i64 0, !dbg !119 + %1975 = insertelement <2 x bfloat> %1974, bfloat %1923, i64 1, !dbg !119 + %1976 = bitcast <2 x bfloat> %1975 to i32, !dbg !119 + %1977 = insertelement <2 x bfloat> poison, bfloat %1925, i64 0, !dbg !119 + %1978 = insertelement <2 x bfloat> %1977, bfloat %1927, i64 1, !dbg !119 + %1979 = bitcast <2 x bfloat> %1978 to i32, !dbg !119 + %1980 = insertelement <2 x bfloat> poison, bfloat %1929, i64 0, !dbg !119 + %1981 = insertelement <2 x bfloat> %1980, bfloat %1931, i64 1, !dbg !119 + %1982 = bitcast <2 x bfloat> %1981 to i32, !dbg !119 + %1983 = insertelement <2 x bfloat> poison, bfloat %1933, i64 0, !dbg !119 + %1984 = insertelement <2 x bfloat> %1983, bfloat %1935, i64 1, !dbg !119 + %1985 = bitcast <2 x bfloat> %1984 to i32, !dbg !119 + %1986 = insertelement <2 x bfloat> poison, bfloat %1937, i64 0, !dbg !119 + %1987 = insertelement <2 x bfloat> %1986, bfloat %1939, i64 1, !dbg !119 + %1988 = bitcast <2 x bfloat> %1987 to i32, !dbg !119 + %1989 = insertelement <2 x bfloat> poison, bfloat %1941, i64 0, !dbg !119 + %1990 = insertelement <2 x bfloat> %1989, bfloat %1943, i64 1, !dbg !119 + %1991 = bitcast <2 x bfloat> %1990 to i32, !dbg !119 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !119 + %1992 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, float %482, float %483, float %484, float %485, float %486, float %487, float %488, float %489, float %490, float %491, float %492, float %493, float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, i32 %1946, i32 %1949, i32 %1952, i32 %1955, i64 %541, i1 true) #3, !dbg !119 + %1993 = add i32 %537, 2048, !dbg !119 + %1994 = lshr exact i32 %1993, 4, !dbg !119 + %1995 = and i32 %1994, 16383, !dbg !119 + %1996 = zext nneg i32 %1995 to i64, !dbg !119 + %1997 = or disjoint i64 %1996, 4611686293338849280, !dbg !119 + %1998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 0, !dbg !119 + %1999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 1, !dbg !119 + %2000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 2, !dbg !119 + %2001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 3, !dbg !119 + %2002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 4, !dbg !119 + %2003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 5, !dbg !119 + %2004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 6, !dbg !119 + %2005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 7, !dbg !119 + %2006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 8, !dbg !119 + %2007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 9, !dbg !119 + %2008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 10, !dbg !119 + %2009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 11, !dbg !119 + %2010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 12, !dbg !119 + %2011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 13, !dbg !119 + %2012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 14, !dbg !119 + %2013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 15, !dbg !119 + %2014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 16, !dbg !119 + %2015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 17, !dbg !119 + %2016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 18, !dbg !119 + %2017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 19, !dbg !119 + %2018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 20, !dbg !119 + %2019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 21, !dbg !119 + %2020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 22, !dbg !119 + %2021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 23, !dbg !119 + %2022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 24, !dbg !119 + %2023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 25, !dbg !119 + %2024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 26, !dbg !119 + %2025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 27, !dbg !119 + %2026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 28, !dbg !119 + %2027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 29, !dbg !119 + %2028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 30, !dbg !119 + %2029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 31, !dbg !119 + %2030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 32, !dbg !119 + %2031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 33, !dbg !119 + %2032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 34, !dbg !119 + %2033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 35, !dbg !119 + %2034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 36, !dbg !119 + %2035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 37, !dbg !119 + %2036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 38, !dbg !119 + %2037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 39, !dbg !119 + %2038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 40, !dbg !119 + %2039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 41, !dbg !119 + %2040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 42, !dbg !119 + %2041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 43, !dbg !119 + %2042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 44, !dbg !119 + %2043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 45, !dbg !119 + %2044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 46, !dbg !119 + %2045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 47, !dbg !119 + %2046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 48, !dbg !119 + %2047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 49, !dbg !119 + %2048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 50, !dbg !119 + %2049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 51, !dbg !119 + %2050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 52, !dbg !119 + %2051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 53, !dbg !119 + %2052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 54, !dbg !119 + %2053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 55, !dbg !119 + %2054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 56, !dbg !119 + %2055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 57, !dbg !119 + %2056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 58, !dbg !119 + %2057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 59, !dbg !119 + %2058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 60, !dbg !119 + %2059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 61, !dbg !119 + %2060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 62, !dbg !119 + %2061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1992, 63, !dbg !119 + %2062 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1998, float %1999, float %2000, float %2001, float %2002, float %2003, float %2004, float %2005, float %2006, float %2007, float %2008, float %2009, float %2010, float %2011, float %2012, float %2013, float %2014, float %2015, float %2016, float %2017, float %2018, float %2019, float %2020, float %2021, float %2022, float %2023, float %2024, float %2025, float %2026, float %2027, float %2028, float %2029, float %2030, float %2031, float %2032, float %2033, float %2034, float %2035, float %2036, float %2037, float %2038, float %2039, float %2040, float %2041, float %2042, float %2043, float %2044, float %2045, float %2046, float %2047, float %2048, float %2049, float %2050, float %2051, float %2052, float %2053, float %2054, float %2055, float %2056, float %2057, float %2058, float %2059, float %2060, float %2061, i32 %1958, i32 %1961, i32 %1964, i32 %1967, i64 %1997, i1 true) #3, !dbg !119 + %2063 = add i32 %537, 4096, !dbg !119 + %2064 = lshr exact i32 %2063, 4, !dbg !119 + %2065 = and i32 %2064, 16383, !dbg !119 + %2066 = zext nneg i32 %2065 to i64, !dbg !119 + %2067 = or disjoint i64 %2066, 4611686293338849280, !dbg !119 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 0, !dbg !119 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 1, !dbg !119 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 2, !dbg !119 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 3, !dbg !119 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 4, !dbg !119 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 5, !dbg !119 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 6, !dbg !119 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 7, !dbg !119 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 8, !dbg !119 + %2077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 9, !dbg !119 + %2078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 10, !dbg !119 + %2079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 11, !dbg !119 + %2080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 12, !dbg !119 + %2081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 13, !dbg !119 + %2082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 14, !dbg !119 + %2083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 15, !dbg !119 + %2084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 16, !dbg !119 + %2085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 17, !dbg !119 + %2086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 18, !dbg !119 + %2087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 19, !dbg !119 + %2088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 20, !dbg !119 + %2089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 21, !dbg !119 + %2090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 22, !dbg !119 + %2091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 23, !dbg !119 + %2092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 24, !dbg !119 + %2093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 25, !dbg !119 + %2094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 26, !dbg !119 + %2095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 27, !dbg !119 + %2096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 28, !dbg !119 + %2097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 29, !dbg !119 + %2098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 30, !dbg !119 + %2099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 31, !dbg !119 + %2100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 32, !dbg !119 + %2101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 33, !dbg !119 + %2102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 34, !dbg !119 + %2103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 35, !dbg !119 + %2104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 36, !dbg !119 + %2105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 37, !dbg !119 + %2106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 38, !dbg !119 + %2107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 39, !dbg !119 + %2108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 40, !dbg !119 + %2109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 41, !dbg !119 + %2110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 42, !dbg !119 + %2111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 43, !dbg !119 + %2112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 44, !dbg !119 + %2113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 45, !dbg !119 + %2114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 46, !dbg !119 + %2115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 47, !dbg !119 + %2116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 48, !dbg !119 + %2117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 49, !dbg !119 + %2118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 50, !dbg !119 + %2119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 51, !dbg !119 + %2120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 52, !dbg !119 + %2121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 53, !dbg !119 + %2122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 54, !dbg !119 + %2123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 55, !dbg !119 + %2124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 56, !dbg !119 + %2125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 57, !dbg !119 + %2126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 58, !dbg !119 + %2127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 59, !dbg !119 + %2128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 60, !dbg !119 + %2129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 61, !dbg !119 + %2130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 62, !dbg !119 + %2131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2062, 63, !dbg !119 + %2132 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2068, float %2069, float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, float %2080, float %2081, float %2082, float %2083, float %2084, float %2085, float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110, float %2111, float %2112, float %2113, float %2114, float %2115, float %2116, float %2117, float %2118, float %2119, float %2120, float %2121, float %2122, float %2123, float %2124, float %2125, float %2126, float %2127, float %2128, float %2129, float %2130, float %2131, i32 %1970, i32 %1973, i32 %1976, i32 %1979, i64 %2067, i1 true) #3, !dbg !119 + %2133 = add i32 %537, 6144, !dbg !119 + %2134 = lshr exact i32 %2133, 4, !dbg !119 + %2135 = and i32 %2134, 16383, !dbg !119 + %2136 = zext nneg i32 %2135 to i64, !dbg !119 + %2137 = or disjoint i64 %2136, 4611686293338849280, !dbg !119 + %2138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 0, !dbg !119 + %2139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 1, !dbg !119 + %2140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 2, !dbg !119 + %2141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 3, !dbg !119 + %2142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 4, !dbg !119 + %2143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 5, !dbg !119 + %2144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 6, !dbg !119 + %2145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 7, !dbg !119 + %2146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 8, !dbg !119 + %2147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 9, !dbg !119 + %2148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 10, !dbg !119 + %2149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 11, !dbg !119 + %2150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 12, !dbg !119 + %2151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 13, !dbg !119 + %2152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 14, !dbg !119 + %2153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 15, !dbg !119 + %2154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 16, !dbg !119 + %2155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 17, !dbg !119 + %2156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 18, !dbg !119 + %2157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 19, !dbg !119 + %2158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 20, !dbg !119 + %2159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 21, !dbg !119 + %2160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 22, !dbg !119 + %2161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 23, !dbg !119 + %2162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 24, !dbg !119 + %2163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 25, !dbg !119 + %2164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 26, !dbg !119 + %2165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 27, !dbg !119 + %2166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 28, !dbg !119 + %2167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 29, !dbg !119 + %2168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 30, !dbg !119 + %2169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 31, !dbg !119 + %2170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 32, !dbg !119 + %2171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 33, !dbg !119 + %2172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 34, !dbg !119 + %2173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 35, !dbg !119 + %2174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 36, !dbg !119 + %2175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 37, !dbg !119 + %2176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 38, !dbg !119 + %2177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 39, !dbg !119 + %2178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 40, !dbg !119 + %2179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 41, !dbg !119 + %2180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 42, !dbg !119 + %2181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 43, !dbg !119 + %2182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 44, !dbg !119 + %2183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 45, !dbg !119 + %2184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 46, !dbg !119 + %2185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 47, !dbg !119 + %2186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 48, !dbg !119 + %2187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 49, !dbg !119 + %2188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 50, !dbg !119 + %2189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 51, !dbg !119 + %2190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 52, !dbg !119 + %2191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 53, !dbg !119 + %2192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 54, !dbg !119 + %2193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 55, !dbg !119 + %2194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 56, !dbg !119 + %2195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 57, !dbg !119 + %2196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 58, !dbg !119 + %2197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 59, !dbg !119 + %2198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 60, !dbg !119 + %2199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 61, !dbg !119 + %2200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 62, !dbg !119 + %2201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2132, 63, !dbg !119 + %2202 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2138, float %2139, float %2140, float %2141, float %2142, float %2143, float %2144, float %2145, float %2146, float %2147, float %2148, float %2149, float %2150, float %2151, float %2152, float %2153, float %2154, float %2155, float %2156, float %2157, float %2158, float %2159, float %2160, float %2161, float %2162, float %2163, float %2164, float %2165, float %2166, float %2167, float %2168, float %2169, float %2170, float %2171, float %2172, float %2173, float %2174, float %2175, float %2176, float %2177, float %2178, float %2179, float %2180, float %2181, float %2182, float %2183, float %2184, float %2185, float %2186, float %2187, float %2188, float %2189, float %2190, float %2191, float %2192, float %2193, float %2194, float %2195, float %2196, float %2197, float %2198, float %2199, float %2200, float %2201, i32 %1982, i32 %1985, i32 %1988, i32 %1991, i64 %2137, i1 true) #3, !dbg !119 + %2203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 0, !dbg !119 + %2204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 1, !dbg !119 + %2205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 2, !dbg !119 + %2206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 3, !dbg !119 + %2207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 4, !dbg !119 + %2208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 5, !dbg !119 + %2209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 6, !dbg !119 + %2210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 7, !dbg !119 + %2211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 8, !dbg !119 + %2212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 9, !dbg !119 + %2213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 10, !dbg !119 + %2214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 11, !dbg !119 + %2215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 12, !dbg !119 + %2216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 13, !dbg !119 + %2217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 14, !dbg !119 + %2218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 15, !dbg !119 + %2219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 16, !dbg !119 + %2220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 17, !dbg !119 + %2221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 18, !dbg !119 + %2222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 19, !dbg !119 + %2223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 20, !dbg !119 + %2224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 21, !dbg !119 + %2225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 22, !dbg !119 + %2226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 23, !dbg !119 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 24, !dbg !119 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 25, !dbg !119 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 26, !dbg !119 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 27, !dbg !119 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 28, !dbg !119 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 29, !dbg !119 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 30, !dbg !119 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 31, !dbg !119 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 32, !dbg !119 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 33, !dbg !119 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 34, !dbg !119 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 35, !dbg !119 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 36, !dbg !119 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 37, !dbg !119 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 38, !dbg !119 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 39, !dbg !119 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 40, !dbg !119 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 41, !dbg !119 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 42, !dbg !119 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 43, !dbg !119 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 44, !dbg !119 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 45, !dbg !119 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 46, !dbg !119 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 47, !dbg !119 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 48, !dbg !119 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 49, !dbg !119 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 50, !dbg !119 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 51, !dbg !119 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 52, !dbg !119 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 53, !dbg !119 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 54, !dbg !119 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 55, !dbg !119 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 56, !dbg !119 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 57, !dbg !119 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 58, !dbg !119 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 59, !dbg !119 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 60, !dbg !119 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 61, !dbg !119 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 62, !dbg !119 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2202, 63, !dbg !119 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !119 + %2267 = insertelement <16 x i32> poison, i32 %452, i64 0, !dbg !120 + %2268 = shufflevector <16 x i32> %2267, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !120 + %2269 = add <16 x i32> %2268, %520, !dbg !120 + %2270 = add nuw nsw i32 %519, 1, !dbg !84 + %2271 = lshr i32 %2270, 1, !dbg !121 + %2272 = zext nneg i32 %2271 to i64, !dbg !122 + %2273 = getelementptr i32, ptr addrspace(1) %359, i64 %2272, !dbg !122 + %2274 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !123 + %2275 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2273, i64 %2274, i1 %523) #3, !dbg !123 + %2276 = add nuw nsw i32 %2271, 1, !dbg !124 + %2277 = icmp slt i32 %2276, %364, !dbg !125 + %2278 = getelementptr i8, ptr addrspace(1) %2273, i64 4, !dbg !126 + %2279 = and i1 %523, %2277, !dbg !84 + %2280 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !127 + %2281 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2278, i64 %2280, i1 %2279) #3, !dbg !127 + %2282 = and i32 %519, 1, !dbg !128 + %2283 = sub i32 %2281, %2275, !dbg !129 + %2284 = shl i32 %2283, 7, !dbg !130 + %2285 = add i32 %2284, -64, !dbg !131 + %2286 = xor i32 %2282, 1, !dbg !132 + %2287 = mul nuw nsw i32 %2285, %2286, !dbg !132 + %2288 = shl nuw nsw i32 %2282, 6, !dbg !133 + %2289 = add i32 %2287, %2288, !dbg !134 + %2290 = shl i32 %2289, 7, !dbg !135 + %2291 = sext i32 %2290 to i64, !dbg !88 + %2292 = getelementptr bfloat, ptr addrspace(1) %.pn9021821, i64 %2291, !dbg !88 + %2293 = getelementptr bfloat, ptr addrspace(1) %.pn8861822, i64 %2291, !dbg !88 + %2294 = getelementptr bfloat, ptr addrspace(1) %.pn8701823, i64 %2291, !dbg !88 + %2295 = getelementptr bfloat, ptr addrspace(1) %.pn8541824, i64 %2291, !dbg !88 + %2296 = getelementptr bfloat, ptr addrspace(1) %.pn9661825, i64 %2291, !dbg !89 + %2297 = getelementptr bfloat, ptr addrspace(1) %.pn9501826, i64 %2291, !dbg !89 + %2298 = getelementptr bfloat, ptr addrspace(1) %.pn9341827, i64 %2291, !dbg !89 + %2299 = getelementptr bfloat, ptr addrspace(1) %.pn9181828, i64 %2291, !dbg !89 + %2300 = add i32 %454, 1, !dbg !84 + %2301 = icmp sgt i32 %2300, 2, !dbg !84 + %2302 = select i1 %2301, i32 0, i32 %2300, !dbg !84 + %2303 = shl i32 %2302, 13, !dbg !87 + %2304 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2303, !dbg !87 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !87 + %2305 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %402, !dbg !87 + %2306 = select i1 %522, i32 16, i32 0, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2305, ptr addrspace(1) %2292, i32 %2306) #3, !dbg !87 + %2307 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2307, ptr addrspace(1) %2293, i32 %2306) #3, !dbg !87 + %2308 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2308, ptr addrspace(1) %2294, i32 %2306) #3, !dbg !87 + %2309 = getelementptr inbounds nuw i8, ptr addrspace(3) %2304, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2309, ptr addrspace(1) %2295, i32 %2306) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %2310 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2303, !dbg !87 + %2311 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %402, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2311, ptr addrspace(1) %2296, i32 %2306) #3, !dbg !87 + %2312 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %405, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2312, ptr addrspace(1) %2297, i32 %2306) #3, !dbg !87 + %2313 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %407, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2313, ptr addrspace(1) %2298, i32 %2306) #3, !dbg !87 + %2314 = getelementptr inbounds nuw i8, ptr addrspace(3) %2310, i32 %409, !dbg !87 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2314, ptr addrspace(1) %2299, i32 %2306) #3, !dbg !87 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !87 + %exitcond2125.not = icmp eq i32 %2270, %433, !dbg !84 + br i1 %exitcond2125.not, label %._crit_edge1847, label %451, !dbg !84 + +._crit_edge1847: ; preds = %__nv_exp2f.exit1507, %53 + %2315 = phi float [ 0.000000e+00, %53 ], [ %2203, %__nv_exp2f.exit1507 ] + %2316 = phi float [ 0.000000e+00, %53 ], [ %2204, %__nv_exp2f.exit1507 ] + %2317 = phi float [ 0.000000e+00, %53 ], [ %2205, %__nv_exp2f.exit1507 ] + %2318 = phi float [ 0.000000e+00, %53 ], [ %2206, %__nv_exp2f.exit1507 ] + %2319 = phi float [ 0.000000e+00, %53 ], [ %2207, %__nv_exp2f.exit1507 ] + %2320 = phi float [ 0.000000e+00, %53 ], [ %2208, %__nv_exp2f.exit1507 ] + %2321 = phi float [ 0.000000e+00, %53 ], [ %2209, %__nv_exp2f.exit1507 ] + %2322 = phi float [ 0.000000e+00, %53 ], [ %2210, %__nv_exp2f.exit1507 ] + %2323 = phi float [ 0.000000e+00, %53 ], [ %2211, %__nv_exp2f.exit1507 ] + %2324 = phi float [ 0.000000e+00, %53 ], [ %2212, %__nv_exp2f.exit1507 ] + %2325 = phi float [ 0.000000e+00, %53 ], [ %2213, %__nv_exp2f.exit1507 ] + %2326 = phi float [ 0.000000e+00, %53 ], [ %2214, %__nv_exp2f.exit1507 ] + %2327 = phi float [ 0.000000e+00, %53 ], [ %2215, %__nv_exp2f.exit1507 ] + %2328 = phi float [ 0.000000e+00, %53 ], [ %2216, %__nv_exp2f.exit1507 ] + %2329 = phi float [ 0.000000e+00, %53 ], [ %2217, %__nv_exp2f.exit1507 ] + %2330 = phi float [ 0.000000e+00, %53 ], [ %2218, %__nv_exp2f.exit1507 ] + %2331 = phi float [ 0.000000e+00, %53 ], [ %2219, %__nv_exp2f.exit1507 ] + %2332 = phi float [ 0.000000e+00, %53 ], [ %2220, %__nv_exp2f.exit1507 ] + %2333 = phi float [ 0.000000e+00, %53 ], [ %2221, %__nv_exp2f.exit1507 ] + %2334 = phi float [ 0.000000e+00, %53 ], [ %2222, %__nv_exp2f.exit1507 ] + %2335 = phi float [ 0.000000e+00, %53 ], [ %2223, %__nv_exp2f.exit1507 ] + %2336 = phi float [ 0.000000e+00, %53 ], [ %2224, %__nv_exp2f.exit1507 ] + %2337 = phi float [ 0.000000e+00, %53 ], [ %2225, %__nv_exp2f.exit1507 ] + %2338 = phi float [ 0.000000e+00, %53 ], [ %2226, %__nv_exp2f.exit1507 ] + %2339 = phi float [ 0.000000e+00, %53 ], [ %2227, %__nv_exp2f.exit1507 ] + %2340 = phi float [ 0.000000e+00, %53 ], [ %2228, %__nv_exp2f.exit1507 ] + %2341 = phi float [ 0.000000e+00, %53 ], [ %2229, %__nv_exp2f.exit1507 ] + %2342 = phi float [ 0.000000e+00, %53 ], [ %2230, %__nv_exp2f.exit1507 ] + %2343 = phi float [ 0.000000e+00, %53 ], [ %2231, %__nv_exp2f.exit1507 ] + %2344 = phi float [ 0.000000e+00, %53 ], [ %2232, %__nv_exp2f.exit1507 ] + %2345 = phi float [ 0.000000e+00, %53 ], [ %2233, %__nv_exp2f.exit1507 ] + %2346 = phi float [ 0.000000e+00, %53 ], [ %2234, %__nv_exp2f.exit1507 ] + %2347 = phi float [ 0.000000e+00, %53 ], [ %2235, %__nv_exp2f.exit1507 ] + %2348 = phi float [ 0.000000e+00, %53 ], [ %2236, %__nv_exp2f.exit1507 ] + %2349 = phi float [ 0.000000e+00, %53 ], [ %2237, %__nv_exp2f.exit1507 ] + %2350 = phi float [ 0.000000e+00, %53 ], [ %2238, %__nv_exp2f.exit1507 ] + %2351 = phi float [ 0.000000e+00, %53 ], [ %2239, %__nv_exp2f.exit1507 ] + %2352 = phi float [ 0.000000e+00, %53 ], [ %2240, %__nv_exp2f.exit1507 ] + %2353 = phi float [ 0.000000e+00, %53 ], [ %2241, %__nv_exp2f.exit1507 ] + %2354 = phi float [ 0.000000e+00, %53 ], [ %2242, %__nv_exp2f.exit1507 ] + %2355 = phi float [ 0.000000e+00, %53 ], [ %2243, %__nv_exp2f.exit1507 ] + %2356 = phi float [ 0.000000e+00, %53 ], [ %2244, %__nv_exp2f.exit1507 ] + %2357 = phi float [ 0.000000e+00, %53 ], [ %2245, %__nv_exp2f.exit1507 ] + %2358 = phi float [ 0.000000e+00, %53 ], [ %2246, %__nv_exp2f.exit1507 ] + %2359 = phi float [ 0.000000e+00, %53 ], [ %2247, %__nv_exp2f.exit1507 ] + %2360 = phi float [ 0.000000e+00, %53 ], [ %2248, %__nv_exp2f.exit1507 ] + %2361 = phi float [ 0.000000e+00, %53 ], [ %2249, %__nv_exp2f.exit1507 ] + %2362 = phi float [ 0.000000e+00, %53 ], [ %2250, %__nv_exp2f.exit1507 ] + %2363 = phi float [ 0.000000e+00, %53 ], [ %2251, %__nv_exp2f.exit1507 ] + %2364 = phi float [ 0.000000e+00, %53 ], [ %2252, %__nv_exp2f.exit1507 ] + %2365 = phi float [ 0.000000e+00, %53 ], [ %2253, %__nv_exp2f.exit1507 ] + %2366 = phi float [ 0.000000e+00, %53 ], [ %2254, %__nv_exp2f.exit1507 ] + %2367 = phi float [ 0.000000e+00, %53 ], [ %2255, %__nv_exp2f.exit1507 ] + %2368 = phi float [ 0.000000e+00, %53 ], [ %2256, %__nv_exp2f.exit1507 ] + %2369 = phi float [ 0.000000e+00, %53 ], [ %2257, %__nv_exp2f.exit1507 ] + %2370 = phi float [ 0.000000e+00, %53 ], [ %2258, %__nv_exp2f.exit1507 ] + %2371 = phi float [ 0.000000e+00, %53 ], [ %2259, %__nv_exp2f.exit1507 ] + %2372 = phi float [ 0.000000e+00, %53 ], [ %2260, %__nv_exp2f.exit1507 ] + %2373 = phi float [ 0.000000e+00, %53 ], [ %2261, %__nv_exp2f.exit1507 ] + %2374 = phi float [ 0.000000e+00, %53 ], [ %2262, %__nv_exp2f.exit1507 ] + %2375 = phi float [ 0.000000e+00, %53 ], [ %2263, %__nv_exp2f.exit1507 ] + %2376 = phi float [ 0.000000e+00, %53 ], [ %2264, %__nv_exp2f.exit1507 ] + %2377 = phi float [ 0.000000e+00, %53 ], [ %2265, %__nv_exp2f.exit1507 ] + %2378 = phi float [ 0.000000e+00, %53 ], [ %2266, %__nv_exp2f.exit1507 ] + %2379 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2315, float %2316, float %2317, float %2318, float %2319, float %2320, float %2321, float %2322, float %2323, float %2324, float %2325, float %2326, float %2327, float %2328, float %2329, float %2330, float %2331, float %2332, float %2333, float %2334, float %2335, float %2336, float %2337, float %2338, float %2339, float %2340, float %2341, float %2342, float %2343, float %2344, float %2345, float %2346, float %2347, float %2348, float %2349, float %2350, float %2351, float %2352, float %2353, float %2354, float %2355, float %2356, float %2357, float %2358, float %2359, float %2360, float %2361, float %2362, float %2363, float %2364, float %2365, float %2366, float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378) #3, !dbg !84 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !84 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !84 + %2380 = getelementptr i32, ptr addrspace(1) %13, i64 %358, !dbg !136 + %2381 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2380) #3, !dbg !137 + %2382 = shl i32 %2381, 7, !dbg !138 + %2383 = getelementptr i32, ptr addrspace(1) %12, i64 %362, !dbg !139 + %2384 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2383) #3, !dbg !140 + %2385 = or disjoint i32 %2382, %38, !dbg !141 + %2386 = or disjoint i32 %2382, %39, !dbg !141 + %2387 = or disjoint i32 %2382, %40, !dbg !141 + %2388 = or disjoint i32 %2382, %41, !dbg !141 + %2389 = shl i32 %2385, 7, !dbg !142 + %2390 = shl i32 %2386, 7, !dbg !142 + %2391 = shl i32 %2387, 7, !dbg !142 + %2392 = shl i32 %2388, 7, !dbg !142 + %2393 = sext i32 %2389 to i64, !dbg !144 + %2394 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2393, !dbg !144 + %2395 = sext i32 %2390 to i64, !dbg !144 + %2396 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2395, !dbg !144 + %2397 = sext i32 %2391 to i64, !dbg !144 + %2398 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2397, !dbg !144 + %2399 = sext i32 %2392 to i64, !dbg !144 + %2400 = getelementptr bfloat, ptr addrspace(1) %32, i64 %2399, !dbg !144 + %2401 = getelementptr bfloat, ptr addrspace(1) %2394, i64 %121, !dbg !145 + %2402 = getelementptr bfloat, ptr addrspace(1) %2396, i64 %121, !dbg !145 + %2403 = getelementptr bfloat, ptr addrspace(1) %2398, i64 %121, !dbg !145 + %2404 = getelementptr bfloat, ptr addrspace(1) %2400, i64 %121, !dbg !145 + %2405 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2393, !dbg !146 + %2406 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2395, !dbg !146 + %2407 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2397, !dbg !146 + %2408 = getelementptr bfloat, ptr addrspace(1) %33, i64 %2399, !dbg !146 + %2409 = getelementptr bfloat, ptr addrspace(1) %2405, i64 %121, !dbg !147 + %2410 = getelementptr bfloat, ptr addrspace(1) %2406, i64 %121, !dbg !147 + %2411 = getelementptr bfloat, ptr addrspace(1) %2407, i64 %121, !dbg !147 + %2412 = getelementptr bfloat, ptr addrspace(1) %2408, i64 %121, !dbg !147 + %2413 = shl i32 %2384, 1, !dbg !148 + %2414 = icmp sgt i32 %2413, 0, !dbg !149 + %2415 = select i1 %2414, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %403, ptr addrspace(1) %2401, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %406, ptr addrspace(1) %2402, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %408, ptr addrspace(1) %2403, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %410, ptr addrspace(1) %2404, i32 %2415) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %411, ptr addrspace(1) %2409, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %412, ptr addrspace(1) %2410, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %413, ptr addrspace(1) %2411, i32 %2415) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %414, ptr addrspace(1) %2412, i32 %2415) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %2416 = icmp sgt i32 %2413, 1, !dbg !149 + %2417 = getelementptr i8, ptr addrspace(1) %2401, i64 16384, !dbg !151 + %2418 = getelementptr i8, ptr addrspace(1) %2402, i64 16384, !dbg !151 + %2419 = getelementptr i8, ptr addrspace(1) %2403, i64 16384, !dbg !151 + %2420 = getelementptr i8, ptr addrspace(1) %2404, i64 16384, !dbg !151 + %2421 = getelementptr i8, ptr addrspace(1) %2409, i64 16384, !dbg !152 + %2422 = getelementptr i8, ptr addrspace(1) %2410, i64 16384, !dbg !152 + %2423 = getelementptr i8, ptr addrspace(1) %2411, i64 16384, !dbg !152 + %2424 = getelementptr i8, ptr addrspace(1) %2412, i64 16384, !dbg !152 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %2425 = select i1 %2416, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %424, ptr addrspace(1) %2417, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %2418, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %427, ptr addrspace(1) %2419, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %428, ptr addrspace(1) %2420, i32 %2425) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %429, ptr addrspace(1) %2421, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %430, ptr addrspace(1) %2422, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %431, ptr addrspace(1) %2423, i32 %2425) #3, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %2424, i32 %2425) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !153 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 0, !dbg !149 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 1, !dbg !149 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 2, !dbg !149 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 3, !dbg !149 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 4, !dbg !149 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 5, !dbg !149 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 6, !dbg !149 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 7, !dbg !149 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 8, !dbg !149 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 9, !dbg !149 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 10, !dbg !149 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 11, !dbg !149 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 12, !dbg !149 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 13, !dbg !149 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 14, !dbg !149 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 15, !dbg !149 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 16, !dbg !149 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 17, !dbg !149 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 18, !dbg !149 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 19, !dbg !149 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 20, !dbg !149 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 21, !dbg !149 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 22, !dbg !149 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 23, !dbg !149 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 24, !dbg !149 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 25, !dbg !149 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 26, !dbg !149 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 27, !dbg !149 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 28, !dbg !149 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 29, !dbg !149 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 30, !dbg !149 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 31, !dbg !149 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 32, !dbg !149 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 33, !dbg !149 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 34, !dbg !149 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 35, !dbg !149 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 36, !dbg !149 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 37, !dbg !149 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 38, !dbg !149 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 39, !dbg !149 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 40, !dbg !149 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 41, !dbg !149 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 42, !dbg !149 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 43, !dbg !149 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 44, !dbg !149 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 45, !dbg !149 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 46, !dbg !149 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 47, !dbg !149 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 48, !dbg !149 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 49, !dbg !149 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 50, !dbg !149 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 51, !dbg !149 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 52, !dbg !149 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 53, !dbg !149 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 54, !dbg !149 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 55, !dbg !149 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 56, !dbg !149 + %2483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 57, !dbg !149 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 58, !dbg !149 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 59, !dbg !149 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 60, !dbg !149 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 61, !dbg !149 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 62, !dbg !149 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 63, !dbg !149 + br i1 %2414, label %.lr.ph1858, label %._crit_edge1859, !dbg !149 + +.lr.ph1858: ; preds = %._crit_edge1847 + %2490 = tail call i32 @llvm.umin.i32(i32 %2413, i32 32), !dbg !154 + %2491 = add nsw i32 %2490, -2 + %2492 = add nsw i32 %2490, -1 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 0, !dbg !149 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 1, !dbg !149 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 2, !dbg !149 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 3, !dbg !149 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 4, !dbg !149 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 5, !dbg !149 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 6, !dbg !149 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 7, !dbg !149 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 8, !dbg !149 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 9, !dbg !149 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 10, !dbg !149 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 11, !dbg !149 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 12, !dbg !149 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 13, !dbg !149 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 14, !dbg !149 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 15, !dbg !149 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 16, !dbg !149 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 17, !dbg !149 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 18, !dbg !149 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 19, !dbg !149 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 20, !dbg !149 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 21, !dbg !149 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 22, !dbg !149 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 23, !dbg !149 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 24, !dbg !149 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 25, !dbg !149 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 26, !dbg !149 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 27, !dbg !149 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 28, !dbg !149 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 29, !dbg !149 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 30, !dbg !149 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 31, !dbg !149 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 32, !dbg !149 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 33, !dbg !149 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 34, !dbg !149 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 35, !dbg !149 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 36, !dbg !149 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 37, !dbg !149 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 38, !dbg !149 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 39, !dbg !149 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 40, !dbg !149 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 41, !dbg !149 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 42, !dbg !149 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 43, !dbg !149 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 44, !dbg !149 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 45, !dbg !149 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 46, !dbg !149 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 47, !dbg !149 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 48, !dbg !149 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 49, !dbg !149 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 50, !dbg !149 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 51, !dbg !149 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 52, !dbg !149 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 53, !dbg !149 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 54, !dbg !149 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 55, !dbg !149 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 56, !dbg !149 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 57, !dbg !149 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 58, !dbg !149 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 59, !dbg !149 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 60, !dbg !149 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 61, !dbg !149 + %2555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 62, !dbg !149 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2379, 63, !dbg !149 + %2557 = insertelement <2 x float> poison, float %347, i64 0, !dbg !155 + %2558 = shufflevector <2 x float> %2557, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !155 + %2559 = insertelement <2 x float> poison, float %345, i64 0, !dbg !155 + %2560 = shufflevector <2 x float> %2559, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !155 + br label %2561, !dbg !149 + +2561: ; preds = %.lr.ph1858, %__nv_exp2f.exit1411 + %2562 = phi i32 [ -1, %.lr.ph1858 ], [ %2569, %__nv_exp2f.exit1411 ] + %2563 = phi i32 [ 1, %.lr.ph1858 ], [ %4028, %__nv_exp2f.exit1411 ] + %.pn10781856 = phi ptr addrspace(1) [ %2424, %.lr.ph1858 ], [ %4025, %__nv_exp2f.exit1411 ] + %.pn10941855 = phi ptr addrspace(1) [ %2423, %.lr.ph1858 ], [ %4024, %__nv_exp2f.exit1411 ] + %.pn11101854 = phi ptr addrspace(1) [ %2422, %.lr.ph1858 ], [ %4023, %__nv_exp2f.exit1411 ] + %.pn11261853 = phi ptr addrspace(1) [ %2421, %.lr.ph1858 ], [ %4022, %__nv_exp2f.exit1411 ] + %.pn10141852 = phi ptr addrspace(1) [ %2420, %.lr.ph1858 ], [ %4021, %__nv_exp2f.exit1411 ] + %.pn10301851 = phi ptr addrspace(1) [ %2419, %.lr.ph1858 ], [ %4020, %__nv_exp2f.exit1411 ] + %.pn10461850 = phi ptr addrspace(1) [ %2418, %.lr.ph1858 ], [ %4019, %__nv_exp2f.exit1411 ] + %.pn10621849 = phi ptr addrspace(1) [ %2417, %.lr.ph1858 ], [ %4018, %__nv_exp2f.exit1411 ] + %.pn = phi float [ %2493, %.lr.ph1858 ], [ %3932, %__nv_exp2f.exit1411 ] + %.pn2389 = phi float [ %2494, %.lr.ph1858 ], [ %3933, %__nv_exp2f.exit1411 ] + %.pn2390 = phi float [ %2495, %.lr.ph1858 ], [ %3934, %__nv_exp2f.exit1411 ] + %.pn2391 = phi float [ %2496, %.lr.ph1858 ], [ %3935, %__nv_exp2f.exit1411 ] + %.pn2392 = phi float [ %2497, %.lr.ph1858 ], [ %3936, %__nv_exp2f.exit1411 ] + %.pn2393 = phi float [ %2498, %.lr.ph1858 ], [ %3937, %__nv_exp2f.exit1411 ] + %.pn2394 = phi float [ %2499, %.lr.ph1858 ], [ %3938, %__nv_exp2f.exit1411 ] + %.pn2395 = phi float [ %2500, %.lr.ph1858 ], [ %3939, %__nv_exp2f.exit1411 ] + %.pn2396 = phi float [ %2501, %.lr.ph1858 ], [ %3940, %__nv_exp2f.exit1411 ] + %.pn2397 = phi float [ %2502, %.lr.ph1858 ], [ %3941, %__nv_exp2f.exit1411 ] + %.pn2398 = phi float [ %2503, %.lr.ph1858 ], [ %3942, %__nv_exp2f.exit1411 ] + %.pn2399 = phi float [ %2504, %.lr.ph1858 ], [ %3943, %__nv_exp2f.exit1411 ] + %.pn2400 = phi float [ %2505, %.lr.ph1858 ], [ %3944, %__nv_exp2f.exit1411 ] + %.pn2401 = phi float [ %2506, %.lr.ph1858 ], [ %3945, %__nv_exp2f.exit1411 ] + %.pn2402 = phi float [ %2507, %.lr.ph1858 ], [ %3946, %__nv_exp2f.exit1411 ] + %.pn2403 = phi float [ %2508, %.lr.ph1858 ], [ %3947, %__nv_exp2f.exit1411 ] + %.pn2404 = phi float [ %2509, %.lr.ph1858 ], [ %3948, %__nv_exp2f.exit1411 ] + %.pn2405 = phi float [ %2510, %.lr.ph1858 ], [ %3949, %__nv_exp2f.exit1411 ] + %.pn2406 = phi float [ %2511, %.lr.ph1858 ], [ %3950, %__nv_exp2f.exit1411 ] + %.pn2407 = phi float [ %2512, %.lr.ph1858 ], [ %3951, %__nv_exp2f.exit1411 ] + %.pn2408 = phi float [ %2513, %.lr.ph1858 ], [ %3952, %__nv_exp2f.exit1411 ] + %.pn2409 = phi float [ %2514, %.lr.ph1858 ], [ %3953, %__nv_exp2f.exit1411 ] + %.pn2410 = phi float [ %2515, %.lr.ph1858 ], [ %3954, %__nv_exp2f.exit1411 ] + %.pn2411 = phi float [ %2516, %.lr.ph1858 ], [ %3955, %__nv_exp2f.exit1411 ] + %.pn2412 = phi float [ %2517, %.lr.ph1858 ], [ %3956, %__nv_exp2f.exit1411 ] + %.pn2413 = phi float [ %2518, %.lr.ph1858 ], [ %3957, %__nv_exp2f.exit1411 ] + %.pn2414 = phi float [ %2519, %.lr.ph1858 ], [ %3958, %__nv_exp2f.exit1411 ] + %.pn2415 = phi float [ %2520, %.lr.ph1858 ], [ %3959, %__nv_exp2f.exit1411 ] + %.pn2416 = phi float [ %2521, %.lr.ph1858 ], [ %3960, %__nv_exp2f.exit1411 ] + %.pn2417 = phi float [ %2522, %.lr.ph1858 ], [ %3961, %__nv_exp2f.exit1411 ] + %.pn2418 = phi float [ %2523, %.lr.ph1858 ], [ %3962, %__nv_exp2f.exit1411 ] + %.pn2419 = phi float [ %2524, %.lr.ph1858 ], [ %3963, %__nv_exp2f.exit1411 ] + %.pn2420 = phi float [ %2525, %.lr.ph1858 ], [ %3964, %__nv_exp2f.exit1411 ] + %.pn2421 = phi float [ %2526, %.lr.ph1858 ], [ %3965, %__nv_exp2f.exit1411 ] + %.pn2422 = phi float [ %2527, %.lr.ph1858 ], [ %3966, %__nv_exp2f.exit1411 ] + %.pn2423 = phi float [ %2528, %.lr.ph1858 ], [ %3967, %__nv_exp2f.exit1411 ] + %.pn2424 = phi float [ %2529, %.lr.ph1858 ], [ %3968, %__nv_exp2f.exit1411 ] + %.pn2425 = phi float [ %2530, %.lr.ph1858 ], [ %3969, %__nv_exp2f.exit1411 ] + %.pn2426 = phi float [ %2531, %.lr.ph1858 ], [ %3970, %__nv_exp2f.exit1411 ] + %.pn2427 = phi float [ %2532, %.lr.ph1858 ], [ %3971, %__nv_exp2f.exit1411 ] + %.pn2428 = phi float [ %2533, %.lr.ph1858 ], [ %3972, %__nv_exp2f.exit1411 ] + %.pn2429 = phi float [ %2534, %.lr.ph1858 ], [ %3973, %__nv_exp2f.exit1411 ] + %.pn2430 = phi float [ %2535, %.lr.ph1858 ], [ %3974, %__nv_exp2f.exit1411 ] + %.pn2431 = phi float [ %2536, %.lr.ph1858 ], [ %3975, %__nv_exp2f.exit1411 ] + %.pn2432 = phi float [ %2537, %.lr.ph1858 ], [ %3976, %__nv_exp2f.exit1411 ] + %.pn2433 = phi float [ %2538, %.lr.ph1858 ], [ %3977, %__nv_exp2f.exit1411 ] + %.pn2434 = phi float [ %2539, %.lr.ph1858 ], [ %3978, %__nv_exp2f.exit1411 ] + %.pn2435 = phi float [ %2540, %.lr.ph1858 ], [ %3979, %__nv_exp2f.exit1411 ] + %.pn2436 = phi float [ %2541, %.lr.ph1858 ], [ %3980, %__nv_exp2f.exit1411 ] + %.pn2437 = phi float [ %2542, %.lr.ph1858 ], [ %3981, %__nv_exp2f.exit1411 ] + %.pn2438 = phi float [ %2543, %.lr.ph1858 ], [ %3982, %__nv_exp2f.exit1411 ] + %.pn2439 = phi float [ %2544, %.lr.ph1858 ], [ %3983, %__nv_exp2f.exit1411 ] + %.pn2440 = phi float [ %2545, %.lr.ph1858 ], [ %3984, %__nv_exp2f.exit1411 ] + %.pn2441 = phi float [ %2546, %.lr.ph1858 ], [ %3985, %__nv_exp2f.exit1411 ] + %.pn2442 = phi float [ %2547, %.lr.ph1858 ], [ %3986, %__nv_exp2f.exit1411 ] + %.pn2443 = phi float [ %2548, %.lr.ph1858 ], [ %3987, %__nv_exp2f.exit1411 ] + %.pn2444 = phi float [ %2549, %.lr.ph1858 ], [ %3988, %__nv_exp2f.exit1411 ] + %.pn2445 = phi float [ %2550, %.lr.ph1858 ], [ %3989, %__nv_exp2f.exit1411 ] + %.pn2446 = phi float [ %2551, %.lr.ph1858 ], [ %3990, %__nv_exp2f.exit1411 ] + %.pn2447 = phi float [ %2552, %.lr.ph1858 ], [ %3991, %__nv_exp2f.exit1411 ] + %.pn2448 = phi float [ %2553, %.lr.ph1858 ], [ %3992, %__nv_exp2f.exit1411 ] + %.pn2449 = phi float [ %2554, %.lr.ph1858 ], [ %3993, %__nv_exp2f.exit1411 ] + %.pn2450 = phi float [ %2555, %.lr.ph1858 ], [ %3994, %__nv_exp2f.exit1411 ] + %.pn2451 = phi float [ %2556, %.lr.ph1858 ], [ %3995, %__nv_exp2f.exit1411 ] + %2564 = phi i32 [ 0, %.lr.ph1858 ], [ %3996, %__nv_exp2f.exit1411 ] + %2565 = icmp slt i32 %2564, %2491, !dbg !149 + %2566 = icmp slt i32 %2564, %2492, !dbg !149 + %2567 = add i32 %2562, 1, !dbg !149 + %2568 = icmp sgt i32 %2567, 2, !dbg !149 + %2569 = select i1 %2568, i32 0, i32 %2567, !dbg !149 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !150 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %2570 = shl i32 %2569, 13, !dbg !150 + %2571 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2570, !dbg !150 + %2572 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !153 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !153 + %2573 = shl i32 %2572, 11, !dbg !153 + %2574 = and i32 %2573, 8192, !dbg !153 + %2575 = add i32 %2574, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2576 = lshr exact i32 %2575, 4, !dbg !153 + %2577 = and i32 %2576, 16383, !dbg !153 + %2578 = zext nneg i32 %2577 to i64, !dbg !153 + %2579 = or disjoint i64 %2578, 4611686293372403712, !dbg !153 + %2580 = ptrtoint ptr addrspace(3) %2571 to i32, !dbg !153 + %2581 = lshr exact i32 %2580, 4, !dbg !153 + %2582 = and i32 %2581, 16383, !dbg !153 + %2583 = zext nneg i32 %2582 to i64, !dbg !153 + %2584 = or disjoint i64 %2583, 4611686293338849280, !dbg !153 + %2585 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2579, i64 %2584) #3, !dbg !153 + %2586 = or disjoint i32 %2574, 32, !dbg !153 + %2587 = add i32 %2586, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2588 = lshr exact i32 %2587, 4, !dbg !153 + %2589 = and i32 %2588, 16383, !dbg !153 + %2590 = zext nneg i32 %2589 to i64, !dbg !153 + %2591 = or disjoint i64 %2590, 4611686293372403712, !dbg !153 + %2592 = add i32 %2580, 32, !dbg !153 + %2593 = lshr exact i32 %2592, 4, !dbg !153 + %2594 = and i32 %2593, 16383, !dbg !153 + %2595 = zext nneg i32 %2594 to i64, !dbg !153 + %2596 = or disjoint i64 %2595, 4611686293338849280, !dbg !153 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 0, !dbg !153 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 1, !dbg !153 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 2, !dbg !153 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 3, !dbg !153 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 4, !dbg !153 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 5, !dbg !153 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 6, !dbg !153 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 7, !dbg !153 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 8, !dbg !153 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 9, !dbg !153 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 10, !dbg !153 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 11, !dbg !153 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 12, !dbg !153 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 13, !dbg !153 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 14, !dbg !153 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 15, !dbg !153 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 16, !dbg !153 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 17, !dbg !153 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 18, !dbg !153 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 19, !dbg !153 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 20, !dbg !153 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 21, !dbg !153 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 22, !dbg !153 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 23, !dbg !153 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 24, !dbg !153 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 25, !dbg !153 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 26, !dbg !153 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 27, !dbg !153 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 28, !dbg !153 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 29, !dbg !153 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 30, !dbg !153 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2585, 31, !dbg !153 + %2629 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2597, float %2598, float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, i64 %2591, i64 %2596, i1 true) #3, !dbg !153 + %2630 = or disjoint i32 %2574, 64, !dbg !153 + %2631 = add i32 %2630, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2632 = lshr exact i32 %2631, 4, !dbg !153 + %2633 = and i32 %2632, 16383, !dbg !153 + %2634 = zext nneg i32 %2633 to i64, !dbg !153 + %2635 = or disjoint i64 %2634, 4611686293372403712, !dbg !153 + %2636 = add i32 %2580, 64, !dbg !153 + %2637 = lshr exact i32 %2636, 4, !dbg !153 + %2638 = and i32 %2637, 16383, !dbg !153 + %2639 = zext nneg i32 %2638 to i64, !dbg !153 + %2640 = or disjoint i64 %2639, 4611686293338849280, !dbg !153 + %2641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 0, !dbg !153 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 1, !dbg !153 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 2, !dbg !153 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 3, !dbg !153 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 4, !dbg !153 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 5, !dbg !153 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 6, !dbg !153 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 7, !dbg !153 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 8, !dbg !153 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 9, !dbg !153 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 10, !dbg !153 + %2652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 11, !dbg !153 + %2653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 12, !dbg !153 + %2654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 13, !dbg !153 + %2655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 14, !dbg !153 + %2656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 15, !dbg !153 + %2657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 16, !dbg !153 + %2658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 17, !dbg !153 + %2659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 18, !dbg !153 + %2660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 19, !dbg !153 + %2661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 20, !dbg !153 + %2662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 21, !dbg !153 + %2663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 22, !dbg !153 + %2664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 23, !dbg !153 + %2665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 24, !dbg !153 + %2666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 25, !dbg !153 + %2667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 26, !dbg !153 + %2668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 27, !dbg !153 + %2669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 28, !dbg !153 + %2670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 29, !dbg !153 + %2671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 30, !dbg !153 + %2672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2629, 31, !dbg !153 + %2673 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2641, float %2642, float %2643, float %2644, float %2645, float %2646, float %2647, float %2648, float %2649, float %2650, float %2651, float %2652, float %2653, float %2654, float %2655, float %2656, float %2657, float %2658, float %2659, float %2660, float %2661, float %2662, float %2663, float %2664, float %2665, float %2666, float %2667, float %2668, float %2669, float %2670, float %2671, float %2672, i64 %2635, i64 %2640, i1 true) #3, !dbg !153 + %2674 = or disjoint i32 %2574, 96, !dbg !153 + %2675 = add i32 %2674, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2676 = lshr exact i32 %2675, 4, !dbg !153 + %2677 = and i32 %2676, 16383, !dbg !153 + %2678 = zext nneg i32 %2677 to i64, !dbg !153 + %2679 = or disjoint i64 %2678, 4611686293372403712, !dbg !153 + %2680 = add i32 %2580, 96, !dbg !153 + %2681 = lshr exact i32 %2680, 4, !dbg !153 + %2682 = and i32 %2681, 16383, !dbg !153 + %2683 = zext nneg i32 %2682 to i64, !dbg !153 + %2684 = or disjoint i64 %2683, 4611686293338849280, !dbg !153 + %2685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 0, !dbg !153 + %2686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 1, !dbg !153 + %2687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 2, !dbg !153 + %2688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 3, !dbg !153 + %2689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 4, !dbg !153 + %2690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 5, !dbg !153 + %2691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 6, !dbg !153 + %2692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 7, !dbg !153 + %2693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 8, !dbg !153 + %2694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 9, !dbg !153 + %2695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 10, !dbg !153 + %2696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 11, !dbg !153 + %2697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 12, !dbg !153 + %2698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 13, !dbg !153 + %2699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 14, !dbg !153 + %2700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 15, !dbg !153 + %2701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 16, !dbg !153 + %2702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 17, !dbg !153 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 18, !dbg !153 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 19, !dbg !153 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 20, !dbg !153 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 21, !dbg !153 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 22, !dbg !153 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 23, !dbg !153 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 24, !dbg !153 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 25, !dbg !153 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 26, !dbg !153 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 27, !dbg !153 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 28, !dbg !153 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 29, !dbg !153 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 30, !dbg !153 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2673, 31, !dbg !153 + %2717 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2685, float %2686, float %2687, float %2688, float %2689, float %2690, float %2691, float %2692, float %2693, float %2694, float %2695, float %2696, float %2697, float %2698, float %2699, float %2700, float %2701, float %2702, float %2703, float %2704, float %2705, float %2706, float %2707, float %2708, float %2709, float %2710, float %2711, float %2712, float %2713, float %2714, float %2715, float %2716, i64 %2679, i64 %2684, i1 true) #3, !dbg !153 + %2718 = or disjoint i32 %2574, 16384, !dbg !153 + %2719 = add i32 %2718, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2720 = lshr exact i32 %2719, 4, !dbg !153 + %2721 = and i32 %2720, 16383, !dbg !153 + %2722 = zext nneg i32 %2721 to i64, !dbg !153 + %2723 = or disjoint i64 %2722, 4611686293372403712, !dbg !153 + %2724 = add i32 %2580, 8192, !dbg !153 + %2725 = lshr exact i32 %2724, 4, !dbg !153 + %2726 = and i32 %2725, 16383, !dbg !153 + %2727 = zext nneg i32 %2726 to i64, !dbg !153 + %2728 = or disjoint i64 %2727, 4611686293338849280, !dbg !153 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 0, !dbg !153 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 1, !dbg !153 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 2, !dbg !153 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 3, !dbg !153 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 4, !dbg !153 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 5, !dbg !153 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 6, !dbg !153 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 7, !dbg !153 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 8, !dbg !153 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 9, !dbg !153 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 10, !dbg !153 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 11, !dbg !153 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 12, !dbg !153 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 13, !dbg !153 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 14, !dbg !153 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 15, !dbg !153 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 16, !dbg !153 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 17, !dbg !153 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 18, !dbg !153 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 19, !dbg !153 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 20, !dbg !153 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 21, !dbg !153 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 22, !dbg !153 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 23, !dbg !153 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 24, !dbg !153 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 25, !dbg !153 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 26, !dbg !153 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 27, !dbg !153 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 28, !dbg !153 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 29, !dbg !153 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 30, !dbg !153 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2717, 31, !dbg !153 + %2761 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2729, float %2730, float %2731, float %2732, float %2733, float %2734, float %2735, float %2736, float %2737, float %2738, float %2739, float %2740, float %2741, float %2742, float %2743, float %2744, float %2745, float %2746, float %2747, float %2748, float %2749, float %2750, float %2751, float %2752, float %2753, float %2754, float %2755, float %2756, float %2757, float %2758, float %2759, float %2760, i64 %2723, i64 %2728, i1 true) #3, !dbg !153 + %2762 = or disjoint i32 %2574, 16416, !dbg !153 + %2763 = add i32 %2762, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2764 = lshr exact i32 %2763, 4, !dbg !153 + %2765 = and i32 %2764, 16383, !dbg !153 + %2766 = zext nneg i32 %2765 to i64, !dbg !153 + %2767 = or disjoint i64 %2766, 4611686293372403712, !dbg !153 + %2768 = add i32 %2580, 8224, !dbg !153 + %2769 = lshr exact i32 %2768, 4, !dbg !153 + %2770 = and i32 %2769, 16383, !dbg !153 + %2771 = zext nneg i32 %2770 to i64, !dbg !153 + %2772 = or disjoint i64 %2771, 4611686293338849280, !dbg !153 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 0, !dbg !153 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 1, !dbg !153 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 2, !dbg !153 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 3, !dbg !153 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 4, !dbg !153 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 5, !dbg !153 + %2779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 6, !dbg !153 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 7, !dbg !153 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 8, !dbg !153 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 9, !dbg !153 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 10, !dbg !153 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 11, !dbg !153 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 12, !dbg !153 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 13, !dbg !153 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 14, !dbg !153 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 15, !dbg !153 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 16, !dbg !153 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 17, !dbg !153 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 18, !dbg !153 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 19, !dbg !153 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 20, !dbg !153 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 21, !dbg !153 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 22, !dbg !153 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 23, !dbg !153 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 24, !dbg !153 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 25, !dbg !153 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 26, !dbg !153 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 27, !dbg !153 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 28, !dbg !153 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 29, !dbg !153 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 30, !dbg !153 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2761, 31, !dbg !153 + %2805 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2773, float %2774, float %2775, float %2776, float %2777, float %2778, float %2779, float %2780, float %2781, float %2782, float %2783, float %2784, float %2785, float %2786, float %2787, float %2788, float %2789, float %2790, float %2791, float %2792, float %2793, float %2794, float %2795, float %2796, float %2797, float %2798, float %2799, float %2800, float %2801, float %2802, float %2803, float %2804, i64 %2767, i64 %2772, i1 true) #3, !dbg !153 + %2806 = or disjoint i32 %2574, 16448, !dbg !153 + %2807 = add i32 %2806, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2808 = lshr exact i32 %2807, 4, !dbg !153 + %2809 = and i32 %2808, 16383, !dbg !153 + %2810 = zext nneg i32 %2809 to i64, !dbg !153 + %2811 = or disjoint i64 %2810, 4611686293372403712, !dbg !153 + %2812 = add i32 %2580, 8256, !dbg !153 + %2813 = lshr exact i32 %2812, 4, !dbg !153 + %2814 = and i32 %2813, 16383, !dbg !153 + %2815 = zext nneg i32 %2814 to i64, !dbg !153 + %2816 = or disjoint i64 %2815, 4611686293338849280, !dbg !153 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 0, !dbg !153 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 1, !dbg !153 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 2, !dbg !153 + %2820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 3, !dbg !153 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 4, !dbg !153 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 5, !dbg !153 + %2823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 6, !dbg !153 + %2824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 7, !dbg !153 + %2825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 8, !dbg !153 + %2826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 9, !dbg !153 + %2827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 10, !dbg !153 + %2828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 11, !dbg !153 + %2829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 12, !dbg !153 + %2830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 13, !dbg !153 + %2831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 14, !dbg !153 + %2832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 15, !dbg !153 + %2833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 16, !dbg !153 + %2834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 17, !dbg !153 + %2835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 18, !dbg !153 + %2836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 19, !dbg !153 + %2837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 20, !dbg !153 + %2838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 21, !dbg !153 + %2839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 22, !dbg !153 + %2840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 23, !dbg !153 + %2841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 24, !dbg !153 + %2842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 25, !dbg !153 + %2843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 26, !dbg !153 + %2844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 27, !dbg !153 + %2845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 28, !dbg !153 + %2846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 29, !dbg !153 + %2847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 30, !dbg !153 + %2848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2805, 31, !dbg !153 + %2849 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2817, float %2818, float %2819, float %2820, float %2821, float %2822, float %2823, float %2824, float %2825, float %2826, float %2827, float %2828, float %2829, float %2830, float %2831, float %2832, float %2833, float %2834, float %2835, float %2836, float %2837, float %2838, float %2839, float %2840, float %2841, float %2842, float %2843, float %2844, float %2845, float %2846, float %2847, float %2848, i64 %2811, i64 %2816, i1 true) #3, !dbg !153 + %2850 = or disjoint i32 %2574, 16480, !dbg !153 + %2851 = add i32 %2850, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !153 + %2852 = lshr exact i32 %2851, 4, !dbg !153 + %2853 = and i32 %2852, 16383, !dbg !153 + %2854 = zext nneg i32 %2853 to i64, !dbg !153 + %2855 = or disjoint i64 %2854, 4611686293372403712, !dbg !153 + %2856 = add i32 %2580, 8288, !dbg !153 + %2857 = lshr exact i32 %2856, 4, !dbg !153 + %2858 = and i32 %2857, 16383, !dbg !153 + %2859 = zext nneg i32 %2858 to i64, !dbg !153 + %2860 = or disjoint i64 %2859, 4611686293338849280, !dbg !153 + %2861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 0, !dbg !153 + %2862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 1, !dbg !153 + %2863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 2, !dbg !153 + %2864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 3, !dbg !153 + %2865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 4, !dbg !153 + %2866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 5, !dbg !153 + %2867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 6, !dbg !153 + %2868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 7, !dbg !153 + %2869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 8, !dbg !153 + %2870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 9, !dbg !153 + %2871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 10, !dbg !153 + %2872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 11, !dbg !153 + %2873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 12, !dbg !153 + %2874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 13, !dbg !153 + %2875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 14, !dbg !153 + %2876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 15, !dbg !153 + %2877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 16, !dbg !153 + %2878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 17, !dbg !153 + %2879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 18, !dbg !153 + %2880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 19, !dbg !153 + %2881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 20, !dbg !153 + %2882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 21, !dbg !153 + %2883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 22, !dbg !153 + %2884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 23, !dbg !153 + %2885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 24, !dbg !153 + %2886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 25, !dbg !153 + %2887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 26, !dbg !153 + %2888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 27, !dbg !153 + %2889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 28, !dbg !153 + %2890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 29, !dbg !153 + %2891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 30, !dbg !153 + %2892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2849, 31, !dbg !153 + %2893 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2861, float %2862, float %2863, float %2864, float %2865, float %2866, float %2867, float %2868, float %2869, float %2870, float %2871, float %2872, float %2873, float %2874, float %2875, float %2876, float %2877, float %2878, float %2879, float %2880, float %2881, float %2882, float %2883, float %2884, float %2885, float %2886, float %2887, float %2888, float %2889, float %2890, float %2891, float %2892, i64 %2855, i64 %2860, i1 true) #3, !dbg !153 + %2894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 0, !dbg !153 + %2895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 1, !dbg !153 + %2896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 2, !dbg !153 + %2897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 3, !dbg !153 + %2898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 4, !dbg !153 + %2899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 5, !dbg !153 + %2900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 6, !dbg !153 + %2901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 7, !dbg !153 + %2902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 8, !dbg !153 + %2903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 9, !dbg !153 + %2904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 10, !dbg !153 + %2905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 11, !dbg !153 + %2906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 12, !dbg !153 + %2907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 13, !dbg !153 + %2908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 14, !dbg !153 + %2909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 15, !dbg !153 + %2910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 16, !dbg !153 + %2911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 17, !dbg !153 + %2912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 18, !dbg !153 + %2913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 19, !dbg !153 + %2914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 20, !dbg !153 + %2915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 21, !dbg !153 + %2916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 22, !dbg !153 + %2917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 23, !dbg !153 + %2918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 24, !dbg !153 + %2919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 25, !dbg !153 + %2920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 26, !dbg !153 + %2921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 27, !dbg !153 + %2922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 28, !dbg !153 + %2923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 29, !dbg !153 + %2924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 30, !dbg !153 + %2925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2893, 31, !dbg !153 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !153 + %2926 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %2894, float %2895, float %2896, float %2897, float %2898, float %2899, float %2900, float %2901, float %2902, float %2903, float %2904, float %2905, float %2906, float %2907, float %2908, float %2909, float %2910, float %2911, float %2912, float %2913, float %2914, float %2915, float %2916, float %2917, float %2918, float %2919, float %2920, float %2921, float %2922, float %2923, float %2924, float %2925, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2571, i32 0, i32 0) #3, !dbg !153 + %2927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 0, !dbg !153 + %2928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 1, !dbg !153 + %2929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 2, !dbg !153 + %2930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 3, !dbg !153 + %2931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 4, !dbg !153 + %2932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 5, !dbg !153 + %2933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 6, !dbg !153 + %2934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 7, !dbg !153 + %2935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 8, !dbg !153 + %2936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 9, !dbg !153 + %2937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 10, !dbg !153 + %2938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 11, !dbg !153 + %2939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 12, !dbg !153 + %2940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 13, !dbg !153 + %2941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 14, !dbg !153 + %2942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 15, !dbg !153 + %2943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 16, !dbg !153 + %2944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 17, !dbg !153 + %2945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 18, !dbg !153 + %2946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 19, !dbg !153 + %2947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 20, !dbg !153 + %2948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 21, !dbg !153 + %2949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 22, !dbg !153 + %2950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 23, !dbg !153 + %2951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 24, !dbg !153 + %2952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 25, !dbg !153 + %2953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 26, !dbg !153 + %2954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 27, !dbg !153 + %2955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 28, !dbg !153 + %2956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 29, !dbg !153 + %2957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 30, !dbg !153 + %2958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %2926, 31, !dbg !153 + %2959 = fmul float %2927, 0x3FB6A09E60000000, !dbg !156 + %2960 = fmul float %2928, 0x3FB6A09E60000000, !dbg !156 + %2961 = fmul float %2929, 0x3FB6A09E60000000, !dbg !156 + %2962 = fmul float %2930, 0x3FB6A09E60000000, !dbg !156 + %2963 = fmul float %2931, 0x3FB6A09E60000000, !dbg !156 + %2964 = fmul float %2932, 0x3FB6A09E60000000, !dbg !156 + %2965 = fmul float %2933, 0x3FB6A09E60000000, !dbg !156 + %2966 = fmul float %2934, 0x3FB6A09E60000000, !dbg !156 + %2967 = fmul float %2935, 0x3FB6A09E60000000, !dbg !156 + %2968 = fmul float %2936, 0x3FB6A09E60000000, !dbg !156 + %2969 = fmul float %2937, 0x3FB6A09E60000000, !dbg !156 + %2970 = fmul float %2938, 0x3FB6A09E60000000, !dbg !156 + %2971 = fmul float %2939, 0x3FB6A09E60000000, !dbg !156 + %2972 = fmul float %2940, 0x3FB6A09E60000000, !dbg !156 + %2973 = fmul float %2941, 0x3FB6A09E60000000, !dbg !156 + %2974 = fmul float %2942, 0x3FB6A09E60000000, !dbg !156 + %2975 = fmul float %2943, 0x3FB6A09E60000000, !dbg !156 + %2976 = fmul float %2944, 0x3FB6A09E60000000, !dbg !156 + %2977 = fmul float %2945, 0x3FB6A09E60000000, !dbg !156 + %2978 = fmul float %2946, 0x3FB6A09E60000000, !dbg !156 + %2979 = fmul float %2947, 0x3FB6A09E60000000, !dbg !156 + %2980 = fmul float %2948, 0x3FB6A09E60000000, !dbg !156 + %2981 = fmul float %2949, 0x3FB6A09E60000000, !dbg !156 + %2982 = fmul float %2950, 0x3FB6A09E60000000, !dbg !156 + %2983 = fmul float %2951, 0x3FB6A09E60000000, !dbg !156 + %2984 = fmul float %2952, 0x3FB6A09E60000000, !dbg !156 + %2985 = fmul float %2953, 0x3FB6A09E60000000, !dbg !156 + %2986 = fmul float %2954, 0x3FB6A09E60000000, !dbg !156 + %2987 = fmul float %2955, 0x3FB6A09E60000000, !dbg !156 + %2988 = fmul float %2956, 0x3FB6A09E60000000, !dbg !156 + %2989 = fmul float %2957, 0x3FB6A09E60000000, !dbg !156 + %2990 = fmul float %2958, 0x3FB6A09E60000000, !dbg !156 + %2991 = fmul float %2959, 0x3FF7154760000000, !dbg !157 + %2992 = fmul float %2960, 0x3FF7154760000000, !dbg !157 + %2993 = fmul float %2961, 0x3FF7154760000000, !dbg !157 + %2994 = fmul float %2962, 0x3FF7154760000000, !dbg !157 + %2995 = fmul float %2963, 0x3FF7154760000000, !dbg !157 + %2996 = fmul float %2964, 0x3FF7154760000000, !dbg !157 + %2997 = fmul float %2965, 0x3FF7154760000000, !dbg !157 + %2998 = fmul float %2966, 0x3FF7154760000000, !dbg !157 + %2999 = fmul float %2967, 0x3FF7154760000000, !dbg !157 + %3000 = fmul float %2968, 0x3FF7154760000000, !dbg !157 + %3001 = fmul float %2969, 0x3FF7154760000000, !dbg !157 + %3002 = fmul float %2970, 0x3FF7154760000000, !dbg !157 + %3003 = fmul float %2971, 0x3FF7154760000000, !dbg !157 + %3004 = fmul float %2972, 0x3FF7154760000000, !dbg !157 + %3005 = fmul float %2973, 0x3FF7154760000000, !dbg !157 + %3006 = fmul float %2974, 0x3FF7154760000000, !dbg !157 + %3007 = fmul float %2975, 0x3FF7154760000000, !dbg !157 + %3008 = fmul float %2976, 0x3FF7154760000000, !dbg !157 + %3009 = fmul float %2977, 0x3FF7154760000000, !dbg !157 + %3010 = fmul float %2978, 0x3FF7154760000000, !dbg !157 + %3011 = fmul float %2979, 0x3FF7154760000000, !dbg !157 + %3012 = fmul float %2980, 0x3FF7154760000000, !dbg !157 + %3013 = fmul float %2981, 0x3FF7154760000000, !dbg !157 + %3014 = fmul float %2982, 0x3FF7154760000000, !dbg !157 + %3015 = fmul float %2983, 0x3FF7154760000000, !dbg !157 + %3016 = fmul float %2984, 0x3FF7154760000000, !dbg !157 + %3017 = fmul float %2985, 0x3FF7154760000000, !dbg !157 + %3018 = fmul float %2986, 0x3FF7154760000000, !dbg !157 + %3019 = fmul float %2987, 0x3FF7154760000000, !dbg !157 + %3020 = fmul float %2988, 0x3FF7154760000000, !dbg !157 + %3021 = fmul float %2989, 0x3FF7154760000000, !dbg !157 + %3022 = fmul float %2990, 0x3FF7154760000000, !dbg !157 + %3023 = fsub float %2991, %356, !dbg !158 + %3024 = fsub float %2992, %356, !dbg !158 + %3025 = fsub float %2993, %357, !dbg !158 + %3026 = fsub float %2994, %357, !dbg !158 + %3027 = fsub float %2995, %356, !dbg !158 + %3028 = fsub float %2996, %356, !dbg !158 + %3029 = fsub float %2997, %357, !dbg !158 + %3030 = fsub float %2998, %357, !dbg !158 + %3031 = fsub float %2999, %356, !dbg !158 + %3032 = fsub float %3000, %356, !dbg !158 + %3033 = fsub float %3001, %357, !dbg !158 + %3034 = fsub float %3002, %357, !dbg !158 + %3035 = fsub float %3003, %356, !dbg !158 + %3036 = fsub float %3004, %356, !dbg !158 + %3037 = fsub float %3005, %357, !dbg !158 + %3038 = fsub float %3006, %357, !dbg !158 + %3039 = fsub float %3007, %356, !dbg !158 + %3040 = fsub float %3008, %356, !dbg !158 + %3041 = fsub float %3009, %357, !dbg !158 + %3042 = fsub float %3010, %357, !dbg !158 + %3043 = fsub float %3011, %356, !dbg !158 + %3044 = fsub float %3012, %356, !dbg !158 + %3045 = fsub float %3013, %357, !dbg !158 + %3046 = fsub float %3014, %357, !dbg !158 + %3047 = fsub float %3015, %356, !dbg !158 + %3048 = fsub float %3016, %356, !dbg !158 + %3049 = fsub float %3017, %357, !dbg !158 + %3050 = fsub float %3018, %357, !dbg !158 + %3051 = fsub float %3019, %356, !dbg !158 + %3052 = fsub float %3020, %356, !dbg !158 + %3053 = fsub float %3021, %357, !dbg !158 + %3054 = fsub float %3022, %357, !dbg !158 + %3055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1316 = icmp eq i32 %3055, 0, !dbg !159 + br i1 %.not.i1316, label %3058, label %3056, !dbg !159 + +3056: ; preds = %2561 + %3057 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3023) #3, !dbg !159 + br label %__nv_exp2f.exit1318, !dbg !159 + +3058: ; preds = %2561 + %3059 = tail call float @llvm.nvvm.ex2.approx.f(float %3023) #3, !dbg !159 + br label %__nv_exp2f.exit1318, !dbg !159 + +__nv_exp2f.exit1318: ; preds = %3056, %3058 + %.0.i1317 = phi float [ %3057, %3056 ], [ %3059, %3058 ], !dbg !159 + %3060 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1319 = icmp eq i32 %3060, 0, !dbg !159 + br i1 %.not.i1319, label %3063, label %3061, !dbg !159 + +3061: ; preds = %__nv_exp2f.exit1318 + %3062 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3024) #3, !dbg !159 + br label %__nv_exp2f.exit1321, !dbg !159 + +3063: ; preds = %__nv_exp2f.exit1318 + %3064 = tail call float @llvm.nvvm.ex2.approx.f(float %3024) #3, !dbg !159 + br label %__nv_exp2f.exit1321, !dbg !159 + +__nv_exp2f.exit1321: ; preds = %3061, %3063 + %.0.i1320 = phi float [ %3062, %3061 ], [ %3064, %3063 ], !dbg !159 + %3065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1322 = icmp eq i32 %3065, 0, !dbg !159 + br i1 %.not.i1322, label %3068, label %3066, !dbg !159 + +3066: ; preds = %__nv_exp2f.exit1321 + %3067 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3025) #3, !dbg !159 + br label %__nv_exp2f.exit1324, !dbg !159 + +3068: ; preds = %__nv_exp2f.exit1321 + %3069 = tail call float @llvm.nvvm.ex2.approx.f(float %3025) #3, !dbg !159 + br label %__nv_exp2f.exit1324, !dbg !159 + +__nv_exp2f.exit1324: ; preds = %3066, %3068 + %.0.i1323 = phi float [ %3067, %3066 ], [ %3069, %3068 ], !dbg !159 + %3070 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1325 = icmp eq i32 %3070, 0, !dbg !159 + br i1 %.not.i1325, label %3073, label %3071, !dbg !159 + +3071: ; preds = %__nv_exp2f.exit1324 + %3072 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3026) #3, !dbg !159 + br label %__nv_exp2f.exit1327, !dbg !159 + +3073: ; preds = %__nv_exp2f.exit1324 + %3074 = tail call float @llvm.nvvm.ex2.approx.f(float %3026) #3, !dbg !159 + br label %__nv_exp2f.exit1327, !dbg !159 + +__nv_exp2f.exit1327: ; preds = %3071, %3073 + %.0.i1326 = phi float [ %3072, %3071 ], [ %3074, %3073 ], !dbg !159 + %3075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1328 = icmp eq i32 %3075, 0, !dbg !159 + br i1 %.not.i1328, label %3078, label %3076, !dbg !159 + +3076: ; preds = %__nv_exp2f.exit1327 + %3077 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3027) #3, !dbg !159 + br label %__nv_exp2f.exit1330, !dbg !159 + +3078: ; preds = %__nv_exp2f.exit1327 + %3079 = tail call float @llvm.nvvm.ex2.approx.f(float %3027) #3, !dbg !159 + br label %__nv_exp2f.exit1330, !dbg !159 + +__nv_exp2f.exit1330: ; preds = %3076, %3078 + %.0.i1329 = phi float [ %3077, %3076 ], [ %3079, %3078 ], !dbg !159 + %3080 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1331 = icmp eq i32 %3080, 0, !dbg !159 + br i1 %.not.i1331, label %3083, label %3081, !dbg !159 + +3081: ; preds = %__nv_exp2f.exit1330 + %3082 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3028) #3, !dbg !159 + br label %__nv_exp2f.exit1333, !dbg !159 + +3083: ; preds = %__nv_exp2f.exit1330 + %3084 = tail call float @llvm.nvvm.ex2.approx.f(float %3028) #3, !dbg !159 + br label %__nv_exp2f.exit1333, !dbg !159 + +__nv_exp2f.exit1333: ; preds = %3081, %3083 + %.0.i1332 = phi float [ %3082, %3081 ], [ %3084, %3083 ], !dbg !159 + %3085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1334 = icmp eq i32 %3085, 0, !dbg !159 + br i1 %.not.i1334, label %3088, label %3086, !dbg !159 + +3086: ; preds = %__nv_exp2f.exit1333 + %3087 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3029) #3, !dbg !159 + br label %__nv_exp2f.exit1336, !dbg !159 + +3088: ; preds = %__nv_exp2f.exit1333 + %3089 = tail call float @llvm.nvvm.ex2.approx.f(float %3029) #3, !dbg !159 + br label %__nv_exp2f.exit1336, !dbg !159 + +__nv_exp2f.exit1336: ; preds = %3086, %3088 + %.0.i1335 = phi float [ %3087, %3086 ], [ %3089, %3088 ], !dbg !159 + %3090 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1337 = icmp eq i32 %3090, 0, !dbg !159 + br i1 %.not.i1337, label %3093, label %3091, !dbg !159 + +3091: ; preds = %__nv_exp2f.exit1336 + %3092 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3030) #3, !dbg !159 + br label %__nv_exp2f.exit1339, !dbg !159 + +3093: ; preds = %__nv_exp2f.exit1336 + %3094 = tail call float @llvm.nvvm.ex2.approx.f(float %3030) #3, !dbg !159 + br label %__nv_exp2f.exit1339, !dbg !159 + +__nv_exp2f.exit1339: ; preds = %3091, %3093 + %.0.i1338 = phi float [ %3092, %3091 ], [ %3094, %3093 ], !dbg !159 + %3095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1340 = icmp eq i32 %3095, 0, !dbg !159 + br i1 %.not.i1340, label %3098, label %3096, !dbg !159 + +3096: ; preds = %__nv_exp2f.exit1339 + %3097 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3031) #3, !dbg !159 + br label %__nv_exp2f.exit1342, !dbg !159 + +3098: ; preds = %__nv_exp2f.exit1339 + %3099 = tail call float @llvm.nvvm.ex2.approx.f(float %3031) #3, !dbg !159 + br label %__nv_exp2f.exit1342, !dbg !159 + +__nv_exp2f.exit1342: ; preds = %3096, %3098 + %.0.i1341 = phi float [ %3097, %3096 ], [ %3099, %3098 ], !dbg !159 + %3100 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1343 = icmp eq i32 %3100, 0, !dbg !159 + br i1 %.not.i1343, label %3103, label %3101, !dbg !159 + +3101: ; preds = %__nv_exp2f.exit1342 + %3102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3032) #3, !dbg !159 + br label %__nv_exp2f.exit1345, !dbg !159 + +3103: ; preds = %__nv_exp2f.exit1342 + %3104 = tail call float @llvm.nvvm.ex2.approx.f(float %3032) #3, !dbg !159 + br label %__nv_exp2f.exit1345, !dbg !159 + +__nv_exp2f.exit1345: ; preds = %3101, %3103 + %.0.i1344 = phi float [ %3102, %3101 ], [ %3104, %3103 ], !dbg !159 + %3105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1346 = icmp eq i32 %3105, 0, !dbg !159 + br i1 %.not.i1346, label %3108, label %3106, !dbg !159 + +3106: ; preds = %__nv_exp2f.exit1345 + %3107 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3033) #3, !dbg !159 + br label %__nv_exp2f.exit1348, !dbg !159 + +3108: ; preds = %__nv_exp2f.exit1345 + %3109 = tail call float @llvm.nvvm.ex2.approx.f(float %3033) #3, !dbg !159 + br label %__nv_exp2f.exit1348, !dbg !159 + +__nv_exp2f.exit1348: ; preds = %3106, %3108 + %.0.i1347 = phi float [ %3107, %3106 ], [ %3109, %3108 ], !dbg !159 + %3110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1349 = icmp eq i32 %3110, 0, !dbg !159 + br i1 %.not.i1349, label %3113, label %3111, !dbg !159 + +3111: ; preds = %__nv_exp2f.exit1348 + %3112 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3034) #3, !dbg !159 + br label %__nv_exp2f.exit1351, !dbg !159 + +3113: ; preds = %__nv_exp2f.exit1348 + %3114 = tail call float @llvm.nvvm.ex2.approx.f(float %3034) #3, !dbg !159 + br label %__nv_exp2f.exit1351, !dbg !159 + +__nv_exp2f.exit1351: ; preds = %3111, %3113 + %.0.i1350 = phi float [ %3112, %3111 ], [ %3114, %3113 ], !dbg !159 + %3115 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1352 = icmp eq i32 %3115, 0, !dbg !159 + br i1 %.not.i1352, label %3118, label %3116, !dbg !159 + +3116: ; preds = %__nv_exp2f.exit1351 + %3117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3035) #3, !dbg !159 + br label %__nv_exp2f.exit1354, !dbg !159 + +3118: ; preds = %__nv_exp2f.exit1351 + %3119 = tail call float @llvm.nvvm.ex2.approx.f(float %3035) #3, !dbg !159 + br label %__nv_exp2f.exit1354, !dbg !159 + +__nv_exp2f.exit1354: ; preds = %3116, %3118 + %.0.i1353 = phi float [ %3117, %3116 ], [ %3119, %3118 ], !dbg !159 + %3120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1355 = icmp eq i32 %3120, 0, !dbg !159 + br i1 %.not.i1355, label %3123, label %3121, !dbg !159 + +3121: ; preds = %__nv_exp2f.exit1354 + %3122 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3036) #3, !dbg !159 + br label %__nv_exp2f.exit1357, !dbg !159 + +3123: ; preds = %__nv_exp2f.exit1354 + %3124 = tail call float @llvm.nvvm.ex2.approx.f(float %3036) #3, !dbg !159 + br label %__nv_exp2f.exit1357, !dbg !159 + +__nv_exp2f.exit1357: ; preds = %3121, %3123 + %.0.i1356 = phi float [ %3122, %3121 ], [ %3124, %3123 ], !dbg !159 + %3125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1358 = icmp eq i32 %3125, 0, !dbg !159 + br i1 %.not.i1358, label %3128, label %3126, !dbg !159 + +3126: ; preds = %__nv_exp2f.exit1357 + %3127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3037) #3, !dbg !159 + br label %__nv_exp2f.exit1360, !dbg !159 + +3128: ; preds = %__nv_exp2f.exit1357 + %3129 = tail call float @llvm.nvvm.ex2.approx.f(float %3037) #3, !dbg !159 + br label %__nv_exp2f.exit1360, !dbg !159 + +__nv_exp2f.exit1360: ; preds = %3126, %3128 + %.0.i1359 = phi float [ %3127, %3126 ], [ %3129, %3128 ], !dbg !159 + %3130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1361 = icmp eq i32 %3130, 0, !dbg !159 + br i1 %.not.i1361, label %3133, label %3131, !dbg !159 + +3131: ; preds = %__nv_exp2f.exit1360 + %3132 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3038) #3, !dbg !159 + br label %__nv_exp2f.exit1363, !dbg !159 + +3133: ; preds = %__nv_exp2f.exit1360 + %3134 = tail call float @llvm.nvvm.ex2.approx.f(float %3038) #3, !dbg !159 + br label %__nv_exp2f.exit1363, !dbg !159 + +__nv_exp2f.exit1363: ; preds = %3131, %3133 + %.0.i1362 = phi float [ %3132, %3131 ], [ %3134, %3133 ], !dbg !159 + %3135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1364 = icmp eq i32 %3135, 0, !dbg !159 + br i1 %.not.i1364, label %3138, label %3136, !dbg !159 + +3136: ; preds = %__nv_exp2f.exit1363 + %3137 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3039) #3, !dbg !159 + br label %__nv_exp2f.exit1366, !dbg !159 + +3138: ; preds = %__nv_exp2f.exit1363 + %3139 = tail call float @llvm.nvvm.ex2.approx.f(float %3039) #3, !dbg !159 + br label %__nv_exp2f.exit1366, !dbg !159 + +__nv_exp2f.exit1366: ; preds = %3136, %3138 + %.0.i1365 = phi float [ %3137, %3136 ], [ %3139, %3138 ], !dbg !159 + %3140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1367 = icmp eq i32 %3140, 0, !dbg !159 + br i1 %.not.i1367, label %3143, label %3141, !dbg !159 + +3141: ; preds = %__nv_exp2f.exit1366 + %3142 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3040) #3, !dbg !159 + br label %__nv_exp2f.exit1369, !dbg !159 + +3143: ; preds = %__nv_exp2f.exit1366 + %3144 = tail call float @llvm.nvvm.ex2.approx.f(float %3040) #3, !dbg !159 + br label %__nv_exp2f.exit1369, !dbg !159 + +__nv_exp2f.exit1369: ; preds = %3141, %3143 + %.0.i1368 = phi float [ %3142, %3141 ], [ %3144, %3143 ], !dbg !159 + %3145 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1370 = icmp eq i32 %3145, 0, !dbg !159 + br i1 %.not.i1370, label %3148, label %3146, !dbg !159 + +3146: ; preds = %__nv_exp2f.exit1369 + %3147 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3041) #3, !dbg !159 + br label %__nv_exp2f.exit1372, !dbg !159 + +3148: ; preds = %__nv_exp2f.exit1369 + %3149 = tail call float @llvm.nvvm.ex2.approx.f(float %3041) #3, !dbg !159 + br label %__nv_exp2f.exit1372, !dbg !159 + +__nv_exp2f.exit1372: ; preds = %3146, %3148 + %.0.i1371 = phi float [ %3147, %3146 ], [ %3149, %3148 ], !dbg !159 + %3150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1373 = icmp eq i32 %3150, 0, !dbg !159 + br i1 %.not.i1373, label %3153, label %3151, !dbg !159 + +3151: ; preds = %__nv_exp2f.exit1372 + %3152 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3042) #3, !dbg !159 + br label %__nv_exp2f.exit1375, !dbg !159 + +3153: ; preds = %__nv_exp2f.exit1372 + %3154 = tail call float @llvm.nvvm.ex2.approx.f(float %3042) #3, !dbg !159 + br label %__nv_exp2f.exit1375, !dbg !159 + +__nv_exp2f.exit1375: ; preds = %3151, %3153 + %.0.i1374 = phi float [ %3152, %3151 ], [ %3154, %3153 ], !dbg !159 + %3155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1376 = icmp eq i32 %3155, 0, !dbg !159 + br i1 %.not.i1376, label %3158, label %3156, !dbg !159 + +3156: ; preds = %__nv_exp2f.exit1375 + %3157 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3043) #3, !dbg !159 + br label %__nv_exp2f.exit1378, !dbg !159 + +3158: ; preds = %__nv_exp2f.exit1375 + %3159 = tail call float @llvm.nvvm.ex2.approx.f(float %3043) #3, !dbg !159 + br label %__nv_exp2f.exit1378, !dbg !159 + +__nv_exp2f.exit1378: ; preds = %3156, %3158 + %.0.i1377 = phi float [ %3157, %3156 ], [ %3159, %3158 ], !dbg !159 + %3160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1379 = icmp eq i32 %3160, 0, !dbg !159 + br i1 %.not.i1379, label %3163, label %3161, !dbg !159 + +3161: ; preds = %__nv_exp2f.exit1378 + %3162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3044) #3, !dbg !159 + br label %__nv_exp2f.exit1381, !dbg !159 + +3163: ; preds = %__nv_exp2f.exit1378 + %3164 = tail call float @llvm.nvvm.ex2.approx.f(float %3044) #3, !dbg !159 + br label %__nv_exp2f.exit1381, !dbg !159 + +__nv_exp2f.exit1381: ; preds = %3161, %3163 + %.0.i1380 = phi float [ %3162, %3161 ], [ %3164, %3163 ], !dbg !159 + %3165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1382 = icmp eq i32 %3165, 0, !dbg !159 + br i1 %.not.i1382, label %3168, label %3166, !dbg !159 + +3166: ; preds = %__nv_exp2f.exit1381 + %3167 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3045) #3, !dbg !159 + br label %__nv_exp2f.exit1384, !dbg !159 + +3168: ; preds = %__nv_exp2f.exit1381 + %3169 = tail call float @llvm.nvvm.ex2.approx.f(float %3045) #3, !dbg !159 + br label %__nv_exp2f.exit1384, !dbg !159 + +__nv_exp2f.exit1384: ; preds = %3166, %3168 + %.0.i1383 = phi float [ %3167, %3166 ], [ %3169, %3168 ], !dbg !159 + %3170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1385 = icmp eq i32 %3170, 0, !dbg !159 + br i1 %.not.i1385, label %3173, label %3171, !dbg !159 + +3171: ; preds = %__nv_exp2f.exit1384 + %3172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3046) #3, !dbg !159 + br label %__nv_exp2f.exit1387, !dbg !159 + +3173: ; preds = %__nv_exp2f.exit1384 + %3174 = tail call float @llvm.nvvm.ex2.approx.f(float %3046) #3, !dbg !159 + br label %__nv_exp2f.exit1387, !dbg !159 + +__nv_exp2f.exit1387: ; preds = %3171, %3173 + %.0.i1386 = phi float [ %3172, %3171 ], [ %3174, %3173 ], !dbg !159 + %3175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1388 = icmp eq i32 %3175, 0, !dbg !159 + br i1 %.not.i1388, label %3178, label %3176, !dbg !159 + +3176: ; preds = %__nv_exp2f.exit1387 + %3177 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3047) #3, !dbg !159 + br label %__nv_exp2f.exit1390, !dbg !159 + +3178: ; preds = %__nv_exp2f.exit1387 + %3179 = tail call float @llvm.nvvm.ex2.approx.f(float %3047) #3, !dbg !159 + br label %__nv_exp2f.exit1390, !dbg !159 + +__nv_exp2f.exit1390: ; preds = %3176, %3178 + %.0.i1389 = phi float [ %3177, %3176 ], [ %3179, %3178 ], !dbg !159 + %3180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1391 = icmp eq i32 %3180, 0, !dbg !159 + br i1 %.not.i1391, label %3183, label %3181, !dbg !159 + +3181: ; preds = %__nv_exp2f.exit1390 + %3182 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3048) #3, !dbg !159 + br label %__nv_exp2f.exit1393, !dbg !159 + +3183: ; preds = %__nv_exp2f.exit1390 + %3184 = tail call float @llvm.nvvm.ex2.approx.f(float %3048) #3, !dbg !159 + br label %__nv_exp2f.exit1393, !dbg !159 + +__nv_exp2f.exit1393: ; preds = %3181, %3183 + %.0.i1392 = phi float [ %3182, %3181 ], [ %3184, %3183 ], !dbg !159 + %3185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1394 = icmp eq i32 %3185, 0, !dbg !159 + br i1 %.not.i1394, label %3188, label %3186, !dbg !159 + +3186: ; preds = %__nv_exp2f.exit1393 + %3187 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3049) #3, !dbg !159 + br label %__nv_exp2f.exit1396, !dbg !159 + +3188: ; preds = %__nv_exp2f.exit1393 + %3189 = tail call float @llvm.nvvm.ex2.approx.f(float %3049) #3, !dbg !159 + br label %__nv_exp2f.exit1396, !dbg !159 + +__nv_exp2f.exit1396: ; preds = %3186, %3188 + %.0.i1395 = phi float [ %3187, %3186 ], [ %3189, %3188 ], !dbg !159 + %3190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1397 = icmp eq i32 %3190, 0, !dbg !159 + br i1 %.not.i1397, label %3193, label %3191, !dbg !159 + +3191: ; preds = %__nv_exp2f.exit1396 + %3192 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3050) #3, !dbg !159 + br label %__nv_exp2f.exit1399, !dbg !159 + +3193: ; preds = %__nv_exp2f.exit1396 + %3194 = tail call float @llvm.nvvm.ex2.approx.f(float %3050) #3, !dbg !159 + br label %__nv_exp2f.exit1399, !dbg !159 + +__nv_exp2f.exit1399: ; preds = %3191, %3193 + %.0.i1398 = phi float [ %3192, %3191 ], [ %3194, %3193 ], !dbg !159 + %3195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1400 = icmp eq i32 %3195, 0, !dbg !159 + br i1 %.not.i1400, label %3198, label %3196, !dbg !159 + +3196: ; preds = %__nv_exp2f.exit1399 + %3197 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3051) #3, !dbg !159 + br label %__nv_exp2f.exit1402, !dbg !159 + +3198: ; preds = %__nv_exp2f.exit1399 + %3199 = tail call float @llvm.nvvm.ex2.approx.f(float %3051) #3, !dbg !159 + br label %__nv_exp2f.exit1402, !dbg !159 + +__nv_exp2f.exit1402: ; preds = %3196, %3198 + %.0.i1401 = phi float [ %3197, %3196 ], [ %3199, %3198 ], !dbg !159 + %3200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1403 = icmp eq i32 %3200, 0, !dbg !159 + br i1 %.not.i1403, label %3203, label %3201, !dbg !159 + +3201: ; preds = %__nv_exp2f.exit1402 + %3202 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3052) #3, !dbg !159 + br label %__nv_exp2f.exit1405, !dbg !159 + +3203: ; preds = %__nv_exp2f.exit1402 + %3204 = tail call float @llvm.nvvm.ex2.approx.f(float %3052) #3, !dbg !159 + br label %__nv_exp2f.exit1405, !dbg !159 + +__nv_exp2f.exit1405: ; preds = %3201, %3203 + %.0.i1404 = phi float [ %3202, %3201 ], [ %3204, %3203 ], !dbg !159 + %3205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1406 = icmp eq i32 %3205, 0, !dbg !159 + br i1 %.not.i1406, label %3208, label %3206, !dbg !159 + +3206: ; preds = %__nv_exp2f.exit1405 + %3207 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3053) #3, !dbg !159 + br label %__nv_exp2f.exit1408, !dbg !159 + +3208: ; preds = %__nv_exp2f.exit1405 + %3209 = tail call float @llvm.nvvm.ex2.approx.f(float %3053) #3, !dbg !159 + br label %__nv_exp2f.exit1408, !dbg !159 + +__nv_exp2f.exit1408: ; preds = %3206, %3208 + %.0.i1407 = phi float [ %3207, %3206 ], [ %3209, %3208 ], !dbg !159 + %3210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !159 + %.not.i1409 = icmp eq i32 %3210, 0, !dbg !159 + br i1 %.not.i1409, label %3213, label %3211, !dbg !159 + +3211: ; preds = %__nv_exp2f.exit1408 + %3212 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3054) #3, !dbg !159 + br label %__nv_exp2f.exit1411, !dbg !159 + +3213: ; preds = %__nv_exp2f.exit1408 + %3214 = tail call float @llvm.nvvm.ex2.approx.f(float %3054) #3, !dbg !159 + br label %__nv_exp2f.exit1411, !dbg !159 + +__nv_exp2f.exit1411: ; preds = %3211, %3213 + %.0.i1410 = phi float [ %3212, %3211 ], [ %3214, %3213 ], !dbg !159 + %3215 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2570, !dbg !150 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !160 + %3216 = add i32 %2574, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3217 = lshr exact i32 %3216, 4, !dbg !160 + %3218 = and i32 %3217, 16383, !dbg !160 + %3219 = zext nneg i32 %3218 to i64, !dbg !160 + %3220 = or disjoint i64 %3219, 4611686293372403712, !dbg !160 + %3221 = ptrtoint ptr addrspace(3) %3215 to i32, !dbg !160 + %3222 = lshr exact i32 %3221, 4, !dbg !160 + %3223 = and i32 %3222, 16383, !dbg !160 + %3224 = zext nneg i32 %3223 to i64, !dbg !160 + %3225 = or disjoint i64 %3224, 4611686293338849280, !dbg !160 + %3226 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %3220, i64 %3225) #3, !dbg !160 + %3227 = add i32 %2586, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3228 = lshr exact i32 %3227, 4, !dbg !160 + %3229 = and i32 %3228, 16383, !dbg !160 + %3230 = zext nneg i32 %3229 to i64, !dbg !160 + %3231 = or disjoint i64 %3230, 4611686293372403712, !dbg !160 + %3232 = add i32 %3221, 32, !dbg !160 + %3233 = lshr exact i32 %3232, 4, !dbg !160 + %3234 = and i32 %3233, 16383, !dbg !160 + %3235 = zext nneg i32 %3234 to i64, !dbg !160 + %3236 = or disjoint i64 %3235, 4611686293338849280, !dbg !160 + %3237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 0, !dbg !160 + %3238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 1, !dbg !160 + %3239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 2, !dbg !160 + %3240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 3, !dbg !160 + %3241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 4, !dbg !160 + %3242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 5, !dbg !160 + %3243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 6, !dbg !160 + %3244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 7, !dbg !160 + %3245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 8, !dbg !160 + %3246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 9, !dbg !160 + %3247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 10, !dbg !160 + %3248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 11, !dbg !160 + %3249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 12, !dbg !160 + %3250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 13, !dbg !160 + %3251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 14, !dbg !160 + %3252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 15, !dbg !160 + %3253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 16, !dbg !160 + %3254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 17, !dbg !160 + %3255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 18, !dbg !160 + %3256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 19, !dbg !160 + %3257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 20, !dbg !160 + %3258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 21, !dbg !160 + %3259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 22, !dbg !160 + %3260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 23, !dbg !160 + %3261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 24, !dbg !160 + %3262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 25, !dbg !160 + %3263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 26, !dbg !160 + %3264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 27, !dbg !160 + %3265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 28, !dbg !160 + %3266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 29, !dbg !160 + %3267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 30, !dbg !160 + %3268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3226, 31, !dbg !160 + %3269 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3237, float %3238, float %3239, float %3240, float %3241, float %3242, float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, i64 %3231, i64 %3236, i1 true) #3, !dbg !160 + %3270 = add i32 %2630, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3271 = lshr exact i32 %3270, 4, !dbg !160 + %3272 = and i32 %3271, 16383, !dbg !160 + %3273 = zext nneg i32 %3272 to i64, !dbg !160 + %3274 = or disjoint i64 %3273, 4611686293372403712, !dbg !160 + %3275 = add i32 %3221, 64, !dbg !160 + %3276 = lshr exact i32 %3275, 4, !dbg !160 + %3277 = and i32 %3276, 16383, !dbg !160 + %3278 = zext nneg i32 %3277 to i64, !dbg !160 + %3279 = or disjoint i64 %3278, 4611686293338849280, !dbg !160 + %3280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 0, !dbg !160 + %3281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 1, !dbg !160 + %3282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 2, !dbg !160 + %3283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 3, !dbg !160 + %3284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 4, !dbg !160 + %3285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 5, !dbg !160 + %3286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 6, !dbg !160 + %3287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 7, !dbg !160 + %3288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 8, !dbg !160 + %3289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 9, !dbg !160 + %3290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 10, !dbg !160 + %3291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 11, !dbg !160 + %3292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 12, !dbg !160 + %3293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 13, !dbg !160 + %3294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 14, !dbg !160 + %3295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 15, !dbg !160 + %3296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 16, !dbg !160 + %3297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 17, !dbg !160 + %3298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 18, !dbg !160 + %3299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 19, !dbg !160 + %3300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 20, !dbg !160 + %3301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 21, !dbg !160 + %3302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 22, !dbg !160 + %3303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 23, !dbg !160 + %3304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 24, !dbg !160 + %3305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 25, !dbg !160 + %3306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 26, !dbg !160 + %3307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 27, !dbg !160 + %3308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 28, !dbg !160 + %3309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 29, !dbg !160 + %3310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 30, !dbg !160 + %3311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3269, 31, !dbg !160 + %3312 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, float %3301, float %3302, float %3303, float %3304, float %3305, float %3306, float %3307, float %3308, float %3309, float %3310, float %3311, i64 %3274, i64 %3279, i1 true) #3, !dbg !160 + %3313 = add i32 %2674, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3314 = lshr exact i32 %3313, 4, !dbg !160 + %3315 = and i32 %3314, 16383, !dbg !160 + %3316 = zext nneg i32 %3315 to i64, !dbg !160 + %3317 = or disjoint i64 %3316, 4611686293372403712, !dbg !160 + %3318 = add i32 %3221, 96, !dbg !160 + %3319 = lshr exact i32 %3318, 4, !dbg !160 + %3320 = and i32 %3319, 16383, !dbg !160 + %3321 = zext nneg i32 %3320 to i64, !dbg !160 + %3322 = or disjoint i64 %3321, 4611686293338849280, !dbg !160 + %3323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 0, !dbg !160 + %3324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 1, !dbg !160 + %3325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 2, !dbg !160 + %3326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 3, !dbg !160 + %3327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 4, !dbg !160 + %3328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 5, !dbg !160 + %3329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 6, !dbg !160 + %3330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 7, !dbg !160 + %3331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 8, !dbg !160 + %3332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 9, !dbg !160 + %3333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 10, !dbg !160 + %3334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 11, !dbg !160 + %3335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 12, !dbg !160 + %3336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 13, !dbg !160 + %3337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 14, !dbg !160 + %3338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 15, !dbg !160 + %3339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 16, !dbg !160 + %3340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 17, !dbg !160 + %3341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 18, !dbg !160 + %3342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 19, !dbg !160 + %3343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 20, !dbg !160 + %3344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 21, !dbg !160 + %3345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 22, !dbg !160 + %3346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 23, !dbg !160 + %3347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 24, !dbg !160 + %3348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 25, !dbg !160 + %3349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 26, !dbg !160 + %3350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 27, !dbg !160 + %3351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 28, !dbg !160 + %3352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 29, !dbg !160 + %3353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 30, !dbg !160 + %3354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3312, 31, !dbg !160 + %3355 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3323, float %3324, float %3325, float %3326, float %3327, float %3328, float %3329, float %3330, float %3331, float %3332, float %3333, float %3334, float %3335, float %3336, float %3337, float %3338, float %3339, float %3340, float %3341, float %3342, float %3343, float %3344, float %3345, float %3346, float %3347, float %3348, float %3349, float %3350, float %3351, float %3352, float %3353, float %3354, i64 %3317, i64 %3322, i1 true) #3, !dbg !160 + %3356 = add i32 %2718, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3357 = lshr exact i32 %3356, 4, !dbg !160 + %3358 = and i32 %3357, 16383, !dbg !160 + %3359 = zext nneg i32 %3358 to i64, !dbg !160 + %3360 = or disjoint i64 %3359, 4611686293372403712, !dbg !160 + %3361 = add i32 %3221, 8192, !dbg !160 + %3362 = lshr exact i32 %3361, 4, !dbg !160 + %3363 = and i32 %3362, 16383, !dbg !160 + %3364 = zext nneg i32 %3363 to i64, !dbg !160 + %3365 = or disjoint i64 %3364, 4611686293338849280, !dbg !160 + %3366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 0, !dbg !160 + %3367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 1, !dbg !160 + %3368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 2, !dbg !160 + %3369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 3, !dbg !160 + %3370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 4, !dbg !160 + %3371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 5, !dbg !160 + %3372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 6, !dbg !160 + %3373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 7, !dbg !160 + %3374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 8, !dbg !160 + %3375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 9, !dbg !160 + %3376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 10, !dbg !160 + %3377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 11, !dbg !160 + %3378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 12, !dbg !160 + %3379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 13, !dbg !160 + %3380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 14, !dbg !160 + %3381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 15, !dbg !160 + %3382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 16, !dbg !160 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 17, !dbg !160 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 18, !dbg !160 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 19, !dbg !160 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 20, !dbg !160 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 21, !dbg !160 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 22, !dbg !160 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 23, !dbg !160 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 24, !dbg !160 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 25, !dbg !160 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 26, !dbg !160 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 27, !dbg !160 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 28, !dbg !160 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 29, !dbg !160 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 30, !dbg !160 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3355, 31, !dbg !160 + %3398 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3366, float %3367, float %3368, float %3369, float %3370, float %3371, float %3372, float %3373, float %3374, float %3375, float %3376, float %3377, float %3378, float %3379, float %3380, float %3381, float %3382, float %3383, float %3384, float %3385, float %3386, float %3387, float %3388, float %3389, float %3390, float %3391, float %3392, float %3393, float %3394, float %3395, float %3396, float %3397, i64 %3360, i64 %3365, i1 true) #3, !dbg !160 + %3399 = add i32 %2762, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3400 = lshr exact i32 %3399, 4, !dbg !160 + %3401 = and i32 %3400, 16383, !dbg !160 + %3402 = zext nneg i32 %3401 to i64, !dbg !160 + %3403 = or disjoint i64 %3402, 4611686293372403712, !dbg !160 + %3404 = add i32 %3221, 8224, !dbg !160 + %3405 = lshr exact i32 %3404, 4, !dbg !160 + %3406 = and i32 %3405, 16383, !dbg !160 + %3407 = zext nneg i32 %3406 to i64, !dbg !160 + %3408 = or disjoint i64 %3407, 4611686293338849280, !dbg !160 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 0, !dbg !160 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 1, !dbg !160 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 2, !dbg !160 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 3, !dbg !160 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 4, !dbg !160 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 5, !dbg !160 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 6, !dbg !160 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 7, !dbg !160 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 8, !dbg !160 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 9, !dbg !160 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 10, !dbg !160 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 11, !dbg !160 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 12, !dbg !160 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 13, !dbg !160 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 14, !dbg !160 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 15, !dbg !160 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 16, !dbg !160 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 17, !dbg !160 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 18, !dbg !160 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 19, !dbg !160 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 20, !dbg !160 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 21, !dbg !160 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 22, !dbg !160 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 23, !dbg !160 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 24, !dbg !160 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 25, !dbg !160 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 26, !dbg !160 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 27, !dbg !160 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 28, !dbg !160 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 29, !dbg !160 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 30, !dbg !160 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3398, 31, !dbg !160 + %3441 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3409, float %3410, float %3411, float %3412, float %3413, float %3414, float %3415, float %3416, float %3417, float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, i64 %3403, i64 %3408, i1 true) #3, !dbg !160 + %3442 = add i32 %2806, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3443 = lshr exact i32 %3442, 4, !dbg !160 + %3444 = and i32 %3443, 16383, !dbg !160 + %3445 = zext nneg i32 %3444 to i64, !dbg !160 + %3446 = or disjoint i64 %3445, 4611686293372403712, !dbg !160 + %3447 = add i32 %3221, 8256, !dbg !160 + %3448 = lshr exact i32 %3447, 4, !dbg !160 + %3449 = and i32 %3448, 16383, !dbg !160 + %3450 = zext nneg i32 %3449 to i64, !dbg !160 + %3451 = or disjoint i64 %3450, 4611686293338849280, !dbg !160 + %3452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 0, !dbg !160 + %3453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 1, !dbg !160 + %3454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 2, !dbg !160 + %3455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 3, !dbg !160 + %3456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 4, !dbg !160 + %3457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 5, !dbg !160 + %3458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 6, !dbg !160 + %3459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 7, !dbg !160 + %3460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 8, !dbg !160 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 9, !dbg !160 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 10, !dbg !160 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 11, !dbg !160 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 12, !dbg !160 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 13, !dbg !160 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 14, !dbg !160 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 15, !dbg !160 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 16, !dbg !160 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 17, !dbg !160 + %3470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 18, !dbg !160 + %3471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 19, !dbg !160 + %3472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 20, !dbg !160 + %3473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 21, !dbg !160 + %3474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 22, !dbg !160 + %3475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 23, !dbg !160 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 24, !dbg !160 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 25, !dbg !160 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 26, !dbg !160 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 27, !dbg !160 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 28, !dbg !160 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 29, !dbg !160 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 30, !dbg !160 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3441, 31, !dbg !160 + %3484 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3452, float %3453, float %3454, float %3455, float %3456, float %3457, float %3458, float %3459, float %3460, float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, float %3470, float %3471, float %3472, float %3473, float %3474, float %3475, float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, i64 %3446, i64 %3451, i1 true) #3, !dbg !160 + %3485 = add i32 %2850, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !160 + %3486 = lshr exact i32 %3485, 4, !dbg !160 + %3487 = and i32 %3486, 16383, !dbg !160 + %3488 = zext nneg i32 %3487 to i64, !dbg !160 + %3489 = or disjoint i64 %3488, 4611686293372403712, !dbg !160 + %3490 = add i32 %3221, 8288, !dbg !160 + %3491 = lshr exact i32 %3490, 4, !dbg !160 + %3492 = and i32 %3491, 16383, !dbg !160 + %3493 = zext nneg i32 %3492 to i64, !dbg !160 + %3494 = or disjoint i64 %3493, 4611686293338849280, !dbg !160 + %3495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 0, !dbg !160 + %3496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 1, !dbg !160 + %3497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 2, !dbg !160 + %3498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 3, !dbg !160 + %3499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 4, !dbg !160 + %3500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 5, !dbg !160 + %3501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 6, !dbg !160 + %3502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 7, !dbg !160 + %3503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 8, !dbg !160 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 9, !dbg !160 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 10, !dbg !160 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 11, !dbg !160 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 12, !dbg !160 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 13, !dbg !160 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 14, !dbg !160 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 15, !dbg !160 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 16, !dbg !160 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 17, !dbg !160 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 18, !dbg !160 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 19, !dbg !160 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 20, !dbg !160 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 21, !dbg !160 + %3517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 22, !dbg !160 + %3518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 23, !dbg !160 + %3519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 24, !dbg !160 + %3520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 25, !dbg !160 + %3521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 26, !dbg !160 + %3522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 27, !dbg !160 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 28, !dbg !160 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 29, !dbg !160 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 30, !dbg !160 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3484, 31, !dbg !160 + %3527 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, i64 %3489, i64 %3494, i1 true) #3, !dbg !160 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 0, !dbg !160 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 1, !dbg !160 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 2, !dbg !160 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 3, !dbg !160 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 4, !dbg !160 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 5, !dbg !160 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 6, !dbg !160 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 7, !dbg !160 + %3536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 8, !dbg !160 + %3537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 9, !dbg !160 + %3538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 10, !dbg !160 + %3539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 11, !dbg !160 + %3540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 12, !dbg !160 + %3541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 13, !dbg !160 + %3542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 14, !dbg !160 + %3543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 15, !dbg !160 + %3544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 16, !dbg !160 + %3545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 17, !dbg !160 + %3546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 18, !dbg !160 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 19, !dbg !160 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 20, !dbg !160 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 21, !dbg !160 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 22, !dbg !160 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 23, !dbg !160 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 24, !dbg !160 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 25, !dbg !160 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 26, !dbg !160 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 27, !dbg !160 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 28, !dbg !160 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 29, !dbg !160 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 30, !dbg !160 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3527, 31, !dbg !160 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !160 + %3560 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %3215, i32 0, i32 0) #3, !dbg !160 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 0, !dbg !160 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 1, !dbg !160 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 2, !dbg !160 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 3, !dbg !160 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 4, !dbg !160 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 5, !dbg !160 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 6, !dbg !160 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 7, !dbg !160 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 8, !dbg !160 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 9, !dbg !160 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 10, !dbg !160 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 11, !dbg !160 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 12, !dbg !160 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 13, !dbg !160 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 14, !dbg !160 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 15, !dbg !160 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 16, !dbg !160 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 17, !dbg !160 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 18, !dbg !160 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 19, !dbg !160 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 20, !dbg !160 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 21, !dbg !160 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 22, !dbg !160 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 23, !dbg !160 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 24, !dbg !160 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 25, !dbg !160 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 26, !dbg !160 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 27, !dbg !160 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 28, !dbg !160 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 29, !dbg !160 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 30, !dbg !160 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3560, 31, !dbg !160 + %3593 = insertelement <2 x float> poison, float %3561, i64 0, !dbg !155 + %3594 = insertelement <2 x float> %3593, float %3562, i64 1, !dbg !155 + %3595 = fsub <2 x float> %3594, %2560, !dbg !155 + %3596 = insertelement <2 x float> poison, float %.0.i1317, i64 0, !dbg !161 + %3597 = insertelement <2 x float> %3596, float %.0.i1320, i64 1, !dbg !161 + %3598 = fmul <2 x float> %3597, %3595, !dbg !161 + %3599 = fptrunc <2 x float> %3598 to <2 x bfloat>, !dbg !162 + %3600 = insertelement <2 x float> poison, float %3563, i64 0, !dbg !155 + %3601 = insertelement <2 x float> %3600, float %3564, i64 1, !dbg !155 + %3602 = fsub <2 x float> %3601, %2558, !dbg !155 + %3603 = insertelement <2 x float> poison, float %.0.i1323, i64 0, !dbg !161 + %3604 = insertelement <2 x float> %3603, float %.0.i1326, i64 1, !dbg !161 + %3605 = fmul <2 x float> %3604, %3602, !dbg !161 + %3606 = fptrunc <2 x float> %3605 to <2 x bfloat>, !dbg !162 + %3607 = insertelement <2 x float> poison, float %3565, i64 0, !dbg !155 + %3608 = insertelement <2 x float> %3607, float %3566, i64 1, !dbg !155 + %3609 = fsub <2 x float> %3608, %2560, !dbg !155 + %3610 = insertelement <2 x float> poison, float %.0.i1329, i64 0, !dbg !161 + %3611 = insertelement <2 x float> %3610, float %.0.i1332, i64 1, !dbg !161 + %3612 = fmul <2 x float> %3611, %3609, !dbg !161 + %3613 = fptrunc <2 x float> %3612 to <2 x bfloat>, !dbg !162 + %3614 = insertelement <2 x float> poison, float %3567, i64 0, !dbg !155 + %3615 = insertelement <2 x float> %3614, float %3568, i64 1, !dbg !155 + %3616 = fsub <2 x float> %3615, %2558, !dbg !155 + %3617 = insertelement <2 x float> poison, float %.0.i1335, i64 0, !dbg !161 + %3618 = insertelement <2 x float> %3617, float %.0.i1338, i64 1, !dbg !161 + %3619 = fmul <2 x float> %3618, %3616, !dbg !161 + %3620 = fptrunc <2 x float> %3619 to <2 x bfloat>, !dbg !162 + %3621 = insertelement <2 x float> poison, float %3569, i64 0, !dbg !155 + %3622 = insertelement <2 x float> %3621, float %3570, i64 1, !dbg !155 + %3623 = fsub <2 x float> %3622, %2560, !dbg !155 + %3624 = insertelement <2 x float> poison, float %.0.i1341, i64 0, !dbg !161 + %3625 = insertelement <2 x float> %3624, float %.0.i1344, i64 1, !dbg !161 + %3626 = fmul <2 x float> %3625, %3623, !dbg !161 + %3627 = fptrunc <2 x float> %3626 to <2 x bfloat>, !dbg !162 + %3628 = insertelement <2 x float> poison, float %3571, i64 0, !dbg !155 + %3629 = insertelement <2 x float> %3628, float %3572, i64 1, !dbg !155 + %3630 = fsub <2 x float> %3629, %2558, !dbg !155 + %3631 = insertelement <2 x float> poison, float %.0.i1347, i64 0, !dbg !161 + %3632 = insertelement <2 x float> %3631, float %.0.i1350, i64 1, !dbg !161 + %3633 = fmul <2 x float> %3632, %3630, !dbg !161 + %3634 = fptrunc <2 x float> %3633 to <2 x bfloat>, !dbg !162 + %3635 = insertelement <2 x float> poison, float %3573, i64 0, !dbg !155 + %3636 = insertelement <2 x float> %3635, float %3574, i64 1, !dbg !155 + %3637 = fsub <2 x float> %3636, %2560, !dbg !155 + %3638 = insertelement <2 x float> poison, float %.0.i1353, i64 0, !dbg !161 + %3639 = insertelement <2 x float> %3638, float %.0.i1356, i64 1, !dbg !161 + %3640 = fmul <2 x float> %3639, %3637, !dbg !161 + %3641 = fptrunc <2 x float> %3640 to <2 x bfloat>, !dbg !162 + %3642 = insertelement <2 x float> poison, float %3575, i64 0, !dbg !155 + %3643 = insertelement <2 x float> %3642, float %3576, i64 1, !dbg !155 + %3644 = fsub <2 x float> %3643, %2558, !dbg !155 + %3645 = insertelement <2 x float> poison, float %.0.i1359, i64 0, !dbg !161 + %3646 = insertelement <2 x float> %3645, float %.0.i1362, i64 1, !dbg !161 + %3647 = fmul <2 x float> %3646, %3644, !dbg !161 + %3648 = fptrunc <2 x float> %3647 to <2 x bfloat>, !dbg !162 + %3649 = insertelement <2 x float> poison, float %3577, i64 0, !dbg !155 + %3650 = insertelement <2 x float> %3649, float %3578, i64 1, !dbg !155 + %3651 = fsub <2 x float> %3650, %2560, !dbg !155 + %3652 = insertelement <2 x float> poison, float %.0.i1365, i64 0, !dbg !161 + %3653 = insertelement <2 x float> %3652, float %.0.i1368, i64 1, !dbg !161 + %3654 = fmul <2 x float> %3653, %3651, !dbg !161 + %3655 = fptrunc <2 x float> %3654 to <2 x bfloat>, !dbg !162 + %3656 = insertelement <2 x float> poison, float %3579, i64 0, !dbg !155 + %3657 = insertelement <2 x float> %3656, float %3580, i64 1, !dbg !155 + %3658 = fsub <2 x float> %3657, %2558, !dbg !155 + %3659 = insertelement <2 x float> poison, float %.0.i1371, i64 0, !dbg !161 + %3660 = insertelement <2 x float> %3659, float %.0.i1374, i64 1, !dbg !161 + %3661 = fmul <2 x float> %3660, %3658, !dbg !161 + %3662 = fptrunc <2 x float> %3661 to <2 x bfloat>, !dbg !162 + %3663 = insertelement <2 x float> poison, float %3581, i64 0, !dbg !155 + %3664 = insertelement <2 x float> %3663, float %3582, i64 1, !dbg !155 + %3665 = fsub <2 x float> %3664, %2560, !dbg !155 + %3666 = insertelement <2 x float> poison, float %.0.i1377, i64 0, !dbg !161 + %3667 = insertelement <2 x float> %3666, float %.0.i1380, i64 1, !dbg !161 + %3668 = fmul <2 x float> %3667, %3665, !dbg !161 + %3669 = fptrunc <2 x float> %3668 to <2 x bfloat>, !dbg !162 + %3670 = insertelement <2 x float> poison, float %3583, i64 0, !dbg !155 + %3671 = insertelement <2 x float> %3670, float %3584, i64 1, !dbg !155 + %3672 = fsub <2 x float> %3671, %2558, !dbg !155 + %3673 = insertelement <2 x float> poison, float %.0.i1383, i64 0, !dbg !161 + %3674 = insertelement <2 x float> %3673, float %.0.i1386, i64 1, !dbg !161 + %3675 = fmul <2 x float> %3674, %3672, !dbg !161 + %3676 = fptrunc <2 x float> %3675 to <2 x bfloat>, !dbg !162 + %3677 = insertelement <2 x float> poison, float %3585, i64 0, !dbg !155 + %3678 = insertelement <2 x float> %3677, float %3586, i64 1, !dbg !155 + %3679 = fsub <2 x float> %3678, %2560, !dbg !155 + %3680 = insertelement <2 x float> poison, float %.0.i1389, i64 0, !dbg !161 + %3681 = insertelement <2 x float> %3680, float %.0.i1392, i64 1, !dbg !161 + %3682 = fmul <2 x float> %3681, %3679, !dbg !161 + %3683 = fptrunc <2 x float> %3682 to <2 x bfloat>, !dbg !162 + %3684 = insertelement <2 x float> poison, float %3587, i64 0, !dbg !155 + %3685 = insertelement <2 x float> %3684, float %3588, i64 1, !dbg !155 + %3686 = fsub <2 x float> %3685, %2558, !dbg !155 + %3687 = insertelement <2 x float> poison, float %.0.i1395, i64 0, !dbg !161 + %3688 = insertelement <2 x float> %3687, float %.0.i1398, i64 1, !dbg !161 + %3689 = fmul <2 x float> %3688, %3686, !dbg !161 + %3690 = fptrunc <2 x float> %3689 to <2 x bfloat>, !dbg !162 + %3691 = insertelement <2 x float> poison, float %3589, i64 0, !dbg !155 + %3692 = insertelement <2 x float> %3691, float %3590, i64 1, !dbg !155 + %3693 = fsub <2 x float> %3692, %2560, !dbg !155 + %3694 = insertelement <2 x float> poison, float %.0.i1401, i64 0, !dbg !161 + %3695 = insertelement <2 x float> %3694, float %.0.i1404, i64 1, !dbg !161 + %3696 = fmul <2 x float> %3695, %3693, !dbg !161 + %3697 = fptrunc <2 x float> %3696 to <2 x bfloat>, !dbg !162 + %3698 = insertelement <2 x float> poison, float %3591, i64 0, !dbg !155 + %3699 = insertelement <2 x float> %3698, float %3592, i64 1, !dbg !155 + %3700 = fsub <2 x float> %3699, %2558, !dbg !155 + %3701 = insertelement <2 x float> poison, float %.0.i1407, i64 0, !dbg !161 + %3702 = insertelement <2 x float> %3701, float %.0.i1410, i64 1, !dbg !161 + %3703 = fmul <2 x float> %3702, %3700, !dbg !161 + %3704 = fptrunc <2 x float> %3703 to <2 x bfloat>, !dbg !162 + %3705 = bitcast <2 x bfloat> %3599 to i32, !dbg !163 + %3706 = bitcast <2 x bfloat> %3606 to i32, !dbg !163 + %3707 = bitcast <2 x bfloat> %3613 to i32, !dbg !163 + %3708 = bitcast <2 x bfloat> %3620 to i32, !dbg !163 + %3709 = bitcast <2 x bfloat> %3627 to i32, !dbg !163 + %3710 = bitcast <2 x bfloat> %3634 to i32, !dbg !163 + %3711 = bitcast <2 x bfloat> %3641 to i32, !dbg !163 + %3712 = bitcast <2 x bfloat> %3648 to i32, !dbg !163 + %3713 = bitcast <2 x bfloat> %3655 to i32, !dbg !163 + %3714 = bitcast <2 x bfloat> %3662 to i32, !dbg !163 + %3715 = bitcast <2 x bfloat> %3669 to i32, !dbg !163 + %3716 = bitcast <2 x bfloat> %3676 to i32, !dbg !163 + %3717 = bitcast <2 x bfloat> %3683 to i32, !dbg !163 + %3718 = bitcast <2 x bfloat> %3690 to i32, !dbg !163 + %3719 = bitcast <2 x bfloat> %3697 to i32, !dbg !163 + %3720 = bitcast <2 x bfloat> %3704 to i32, !dbg !163 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !163 + %3721 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn, float %.pn2389, float %.pn2390, float %.pn2391, float %.pn2392, float %.pn2393, float %.pn2394, float %.pn2395, float %.pn2396, float %.pn2397, float %.pn2398, float %.pn2399, float %.pn2400, float %.pn2401, float %.pn2402, float %.pn2403, float %.pn2404, float %.pn2405, float %.pn2406, float %.pn2407, float %.pn2408, float %.pn2409, float %.pn2410, float %.pn2411, float %.pn2412, float %.pn2413, float %.pn2414, float %.pn2415, float %.pn2416, float %.pn2417, float %.pn2418, float %.pn2419, float %.pn2420, float %.pn2421, float %.pn2422, float %.pn2423, float %.pn2424, float %.pn2425, float %.pn2426, float %.pn2427, float %.pn2428, float %.pn2429, float %.pn2430, float %.pn2431, float %.pn2432, float %.pn2433, float %.pn2434, float %.pn2435, float %.pn2436, float %.pn2437, float %.pn2438, float %.pn2439, float %.pn2440, float %.pn2441, float %.pn2442, float %.pn2443, float %.pn2444, float %.pn2445, float %.pn2446, float %.pn2447, float %.pn2448, float %.pn2449, float %.pn2450, float %.pn2451, i32 %3705, i32 %3706, i32 %3707, i32 %3708, i64 %2584, i1 true) #3, !dbg !163 + %3722 = add i32 %2580, 2048, !dbg !163 + %3723 = lshr exact i32 %3722, 4, !dbg !163 + %3724 = and i32 %3723, 16383, !dbg !163 + %3725 = zext nneg i32 %3724 to i64, !dbg !163 + %3726 = or disjoint i64 %3725, 4611686293338849280, !dbg !163 + %3727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 0, !dbg !163 + %3728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 1, !dbg !163 + %3729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 2, !dbg !163 + %3730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 3, !dbg !163 + %3731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 4, !dbg !163 + %3732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 5, !dbg !163 + %3733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 6, !dbg !163 + %3734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 7, !dbg !163 + %3735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 8, !dbg !163 + %3736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 9, !dbg !163 + %3737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 10, !dbg !163 + %3738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 11, !dbg !163 + %3739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 12, !dbg !163 + %3740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 13, !dbg !163 + %3741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 14, !dbg !163 + %3742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 15, !dbg !163 + %3743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 16, !dbg !163 + %3744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 17, !dbg !163 + %3745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 18, !dbg !163 + %3746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 19, !dbg !163 + %3747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 20, !dbg !163 + %3748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 21, !dbg !163 + %3749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 22, !dbg !163 + %3750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 23, !dbg !163 + %3751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 24, !dbg !163 + %3752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 25, !dbg !163 + %3753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 26, !dbg !163 + %3754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 27, !dbg !163 + %3755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 28, !dbg !163 + %3756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 29, !dbg !163 + %3757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 30, !dbg !163 + %3758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 31, !dbg !163 + %3759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 32, !dbg !163 + %3760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 33, !dbg !163 + %3761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 34, !dbg !163 + %3762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 35, !dbg !163 + %3763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 36, !dbg !163 + %3764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 37, !dbg !163 + %3765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 38, !dbg !163 + %3766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 39, !dbg !163 + %3767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 40, !dbg !163 + %3768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 41, !dbg !163 + %3769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 42, !dbg !163 + %3770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 43, !dbg !163 + %3771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 44, !dbg !163 + %3772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 45, !dbg !163 + %3773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 46, !dbg !163 + %3774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 47, !dbg !163 + %3775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 48, !dbg !163 + %3776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 49, !dbg !163 + %3777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 50, !dbg !163 + %3778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 51, !dbg !163 + %3779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 52, !dbg !163 + %3780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 53, !dbg !163 + %3781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 54, !dbg !163 + %3782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 55, !dbg !163 + %3783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 56, !dbg !163 + %3784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 57, !dbg !163 + %3785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 58, !dbg !163 + %3786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 59, !dbg !163 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 60, !dbg !163 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 61, !dbg !163 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 62, !dbg !163 + %3790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3721, 63, !dbg !163 + %3791 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, float %3747, float %3748, float %3749, float %3750, float %3751, float %3752, float %3753, float %3754, float %3755, float %3756, float %3757, float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778, float %3779, float %3780, float %3781, float %3782, float %3783, float %3784, float %3785, float %3786, float %3787, float %3788, float %3789, float %3790, i32 %3709, i32 %3710, i32 %3711, i32 %3712, i64 %3726, i1 true) #3, !dbg !163 + %3792 = add i32 %2580, 4096, !dbg !163 + %3793 = lshr exact i32 %3792, 4, !dbg !163 + %3794 = and i32 %3793, 16383, !dbg !163 + %3795 = zext nneg i32 %3794 to i64, !dbg !163 + %3796 = or disjoint i64 %3795, 4611686293338849280, !dbg !163 + %3797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 0, !dbg !163 + %3798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 1, !dbg !163 + %3799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 2, !dbg !163 + %3800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 3, !dbg !163 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 4, !dbg !163 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 5, !dbg !163 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 6, !dbg !163 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 7, !dbg !163 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 8, !dbg !163 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 9, !dbg !163 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 10, !dbg !163 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 11, !dbg !163 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 12, !dbg !163 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 13, !dbg !163 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 14, !dbg !163 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 15, !dbg !163 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 16, !dbg !163 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 17, !dbg !163 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 18, !dbg !163 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 19, !dbg !163 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 20, !dbg !163 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 21, !dbg !163 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 22, !dbg !163 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 23, !dbg !163 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 24, !dbg !163 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 25, !dbg !163 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 26, !dbg !163 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 27, !dbg !163 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 28, !dbg !163 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 29, !dbg !163 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 30, !dbg !163 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 31, !dbg !163 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 32, !dbg !163 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 33, !dbg !163 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 34, !dbg !163 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 35, !dbg !163 + %3833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 36, !dbg !163 + %3834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 37, !dbg !163 + %3835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 38, !dbg !163 + %3836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 39, !dbg !163 + %3837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 40, !dbg !163 + %3838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 41, !dbg !163 + %3839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 42, !dbg !163 + %3840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 43, !dbg !163 + %3841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 44, !dbg !163 + %3842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 45, !dbg !163 + %3843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 46, !dbg !163 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 47, !dbg !163 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 48, !dbg !163 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 49, !dbg !163 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 50, !dbg !163 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 51, !dbg !163 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 52, !dbg !163 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 53, !dbg !163 + %3851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 54, !dbg !163 + %3852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 55, !dbg !163 + %3853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 56, !dbg !163 + %3854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 57, !dbg !163 + %3855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 58, !dbg !163 + %3856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 59, !dbg !163 + %3857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 60, !dbg !163 + %3858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 61, !dbg !163 + %3859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 62, !dbg !163 + %3860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3791, 63, !dbg !163 + %3861 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3797, float %3798, float %3799, float %3800, float %3801, float %3802, float %3803, float %3804, float %3805, float %3806, float %3807, float %3808, float %3809, float %3810, float %3811, float %3812, float %3813, float %3814, float %3815, float %3816, float %3817, float %3818, float %3819, float %3820, float %3821, float %3822, float %3823, float %3824, float %3825, float %3826, float %3827, float %3828, float %3829, float %3830, float %3831, float %3832, float %3833, float %3834, float %3835, float %3836, float %3837, float %3838, float %3839, float %3840, float %3841, float %3842, float %3843, float %3844, float %3845, float %3846, float %3847, float %3848, float %3849, float %3850, float %3851, float %3852, float %3853, float %3854, float %3855, float %3856, float %3857, float %3858, float %3859, float %3860, i32 %3713, i32 %3714, i32 %3715, i32 %3716, i64 %3796, i1 true) #3, !dbg !163 + %3862 = add i32 %2580, 6144, !dbg !163 + %3863 = lshr exact i32 %3862, 4, !dbg !163 + %3864 = and i32 %3863, 16383, !dbg !163 + %3865 = zext nneg i32 %3864 to i64, !dbg !163 + %3866 = or disjoint i64 %3865, 4611686293338849280, !dbg !163 + %3867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 0, !dbg !163 + %3868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 1, !dbg !163 + %3869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 2, !dbg !163 + %3870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 3, !dbg !163 + %3871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 4, !dbg !163 + %3872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 5, !dbg !163 + %3873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 6, !dbg !163 + %3874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 7, !dbg !163 + %3875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 8, !dbg !163 + %3876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 9, !dbg !163 + %3877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 10, !dbg !163 + %3878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 11, !dbg !163 + %3879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 12, !dbg !163 + %3880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 13, !dbg !163 + %3881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 14, !dbg !163 + %3882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 15, !dbg !163 + %3883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 16, !dbg !163 + %3884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 17, !dbg !163 + %3885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 18, !dbg !163 + %3886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 19, !dbg !163 + %3887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 20, !dbg !163 + %3888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 21, !dbg !163 + %3889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 22, !dbg !163 + %3890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 23, !dbg !163 + %3891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 24, !dbg !163 + %3892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 25, !dbg !163 + %3893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 26, !dbg !163 + %3894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 27, !dbg !163 + %3895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 28, !dbg !163 + %3896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 29, !dbg !163 + %3897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 30, !dbg !163 + %3898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 31, !dbg !163 + %3899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 32, !dbg !163 + %3900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 33, !dbg !163 + %3901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 34, !dbg !163 + %3902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 35, !dbg !163 + %3903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 36, !dbg !163 + %3904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 37, !dbg !163 + %3905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 38, !dbg !163 + %3906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 39, !dbg !163 + %3907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 40, !dbg !163 + %3908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 41, !dbg !163 + %3909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 42, !dbg !163 + %3910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 43, !dbg !163 + %3911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 44, !dbg !163 + %3912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 45, !dbg !163 + %3913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 46, !dbg !163 + %3914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 47, !dbg !163 + %3915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 48, !dbg !163 + %3916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 49, !dbg !163 + %3917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 50, !dbg !163 + %3918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 51, !dbg !163 + %3919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 52, !dbg !163 + %3920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 53, !dbg !163 + %3921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 54, !dbg !163 + %3922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 55, !dbg !163 + %3923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 56, !dbg !163 + %3924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 57, !dbg !163 + %3925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 58, !dbg !163 + %3926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 59, !dbg !163 + %3927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 60, !dbg !163 + %3928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 61, !dbg !163 + %3929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 62, !dbg !163 + %3930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3861, 63, !dbg !163 + %3931 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3867, float %3868, float %3869, float %3870, float %3871, float %3872, float %3873, float %3874, float %3875, float %3876, float %3877, float %3878, float %3879, float %3880, float %3881, float %3882, float %3883, float %3884, float %3885, float %3886, float %3887, float %3888, float %3889, float %3890, float %3891, float %3892, float %3893, float %3894, float %3895, float %3896, float %3897, float %3898, float %3899, float %3900, float %3901, float %3902, float %3903, float %3904, float %3905, float %3906, float %3907, float %3908, float %3909, float %3910, float %3911, float %3912, float %3913, float %3914, float %3915, float %3916, float %3917, float %3918, float %3919, float %3920, float %3921, float %3922, float %3923, float %3924, float %3925, float %3926, float %3927, float %3928, float %3929, float %3930, i32 %3717, i32 %3718, i32 %3719, i32 %3720, i64 %3866, i1 true) #3, !dbg !163 + %3932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 0, !dbg !163 + %3933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 1, !dbg !163 + %3934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 2, !dbg !163 + %3935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 3, !dbg !163 + %3936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 4, !dbg !163 + %3937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 5, !dbg !163 + %3938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 6, !dbg !163 + %3939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 7, !dbg !163 + %3940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 8, !dbg !163 + %3941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 9, !dbg !163 + %3942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 10, !dbg !163 + %3943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 11, !dbg !163 + %3944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 12, !dbg !163 + %3945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 13, !dbg !163 + %3946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 14, !dbg !163 + %3947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 15, !dbg !163 + %3948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 16, !dbg !163 + %3949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 17, !dbg !163 + %3950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 18, !dbg !163 + %3951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 19, !dbg !163 + %3952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 20, !dbg !163 + %3953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 21, !dbg !163 + %3954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 22, !dbg !163 + %3955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 23, !dbg !163 + %3956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 24, !dbg !163 + %3957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 25, !dbg !163 + %3958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 26, !dbg !163 + %3959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 27, !dbg !163 + %3960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 28, !dbg !163 + %3961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 29, !dbg !163 + %3962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 30, !dbg !163 + %3963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 31, !dbg !163 + %3964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 32, !dbg !163 + %3965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 33, !dbg !163 + %3966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 34, !dbg !163 + %3967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 35, !dbg !163 + %3968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 36, !dbg !163 + %3969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 37, !dbg !163 + %3970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 38, !dbg !163 + %3971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 39, !dbg !163 + %3972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 40, !dbg !163 + %3973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 41, !dbg !163 + %3974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 42, !dbg !163 + %3975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 43, !dbg !163 + %3976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 44, !dbg !163 + %3977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 45, !dbg !163 + %3978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 46, !dbg !163 + %3979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 47, !dbg !163 + %3980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 48, !dbg !163 + %3981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 49, !dbg !163 + %3982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 50, !dbg !163 + %3983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 51, !dbg !163 + %3984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 52, !dbg !163 + %3985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 53, !dbg !163 + %3986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 54, !dbg !163 + %3987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 55, !dbg !163 + %3988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 56, !dbg !163 + %3989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 57, !dbg !163 + %3990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 58, !dbg !163 + %3991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 59, !dbg !163 + %3992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 60, !dbg !163 + %3993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 61, !dbg !163 + %3994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 62, !dbg !163 + %3995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3931, 63, !dbg !163 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !163 + %3996 = add nuw nsw i32 %2564, 1, !dbg !149 + %3997 = lshr i32 %3996, 1, !dbg !164 + %3998 = zext nneg i32 %3997 to i64, !dbg !165 + %3999 = getelementptr i32, ptr addrspace(1) %2380, i64 %3998, !dbg !165 + %4000 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !166 + %4001 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3999, i64 %4000, i1 %2566) #3, !dbg !166 + %4002 = add nuw nsw i32 %3997, 1, !dbg !167 + %4003 = icmp slt i32 %4002, %2384, !dbg !168 + %4004 = getelementptr i8, ptr addrspace(1) %3999, i64 4, !dbg !169 + %4005 = and i1 %2566, %4003, !dbg !149 + %4006 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !170 + %4007 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4004, i64 %4006, i1 %4005) #3, !dbg !170 + %4008 = and i32 %2564, 1, !dbg !171 + %4009 = sub i32 %4007, %4001, !dbg !172 + %4010 = shl i32 %4009, 7, !dbg !173 + %4011 = add i32 %4010, 33554368, !dbg !174 + %4012 = shl nuw nsw i32 %4008, 13, !dbg !175 + %4013 = shl nuw nsw i32 %4008, 7, !dbg !176 + %4014 = xor i32 %4013, 128, !dbg !176 + %4015 = mul i32 %4014, %4011, !dbg !175 + %4016 = add i32 %4015, %4012, !dbg !175 + %4017 = sext i32 %4016 to i64, !dbg !151 + %4018 = getelementptr bfloat, ptr addrspace(1) %.pn10621849, i64 %4017, !dbg !151 + %4019 = getelementptr bfloat, ptr addrspace(1) %.pn10461850, i64 %4017, !dbg !151 + %4020 = getelementptr bfloat, ptr addrspace(1) %.pn10301851, i64 %4017, !dbg !151 + %4021 = getelementptr bfloat, ptr addrspace(1) %.pn10141852, i64 %4017, !dbg !151 + %4022 = getelementptr bfloat, ptr addrspace(1) %.pn11261853, i64 %4017, !dbg !152 + %4023 = getelementptr bfloat, ptr addrspace(1) %.pn11101854, i64 %4017, !dbg !152 + %4024 = getelementptr bfloat, ptr addrspace(1) %.pn10941855, i64 %4017, !dbg !152 + %4025 = getelementptr bfloat, ptr addrspace(1) %.pn10781856, i64 %4017, !dbg !152 + %4026 = add i32 %2563, 1, !dbg !149 + %4027 = icmp sgt i32 %4026, 2, !dbg !149 + %4028 = select i1 %4027, i32 0, i32 %4026, !dbg !149 + %4029 = shl i32 %4028, 13, !dbg !150 + %4030 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %4029, !dbg !150 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !150 + %4031 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %402, !dbg !150 + %4032 = select i1 %2565, i32 16, i32 0, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4031, ptr addrspace(1) %4018, i32 %4032) #3, !dbg !150 + %4033 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %405, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4033, ptr addrspace(1) %4019, i32 %4032) #3, !dbg !150 + %4034 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %407, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4034, ptr addrspace(1) %4020, i32 %4032) #3, !dbg !150 + %4035 = getelementptr inbounds nuw i8, ptr addrspace(3) %4030, i32 %409, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4035, ptr addrspace(1) %4021, i32 %4032) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %4036 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4029, !dbg !150 + %4037 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %402, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4037, ptr addrspace(1) %4022, i32 %4032) #3, !dbg !150 + %4038 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %405, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4038, ptr addrspace(1) %4023, i32 %4032) #3, !dbg !150 + %4039 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %407, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4039, ptr addrspace(1) %4024, i32 %4032) #3, !dbg !150 + %4040 = getelementptr inbounds nuw i8, ptr addrspace(3) %4036, i32 %409, !dbg !150 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4040, ptr addrspace(1) %4025, i32 %4032) #3, !dbg !150 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !150 + %exitcond2127.not = icmp eq i32 %3996, %2490, !dbg !149 + br i1 %exitcond2127.not, label %._crit_edge1859, label %2561, !dbg !149 + +._crit_edge1859: ; preds = %__nv_exp2f.exit1411, %._crit_edge1847 + %4041 = phi float [ %2426, %._crit_edge1847 ], [ %3932, %__nv_exp2f.exit1411 ], !dbg !77 + %4042 = phi float [ %2427, %._crit_edge1847 ], [ %3933, %__nv_exp2f.exit1411 ], !dbg !77 + %4043 = phi float [ %2428, %._crit_edge1847 ], [ %3934, %__nv_exp2f.exit1411 ], !dbg !77 + %4044 = phi float [ %2429, %._crit_edge1847 ], [ %3935, %__nv_exp2f.exit1411 ], !dbg !77 + %4045 = phi float [ %2430, %._crit_edge1847 ], [ %3936, %__nv_exp2f.exit1411 ], !dbg !77 + %4046 = phi float [ %2431, %._crit_edge1847 ], [ %3937, %__nv_exp2f.exit1411 ], !dbg !77 + %4047 = phi float [ %2432, %._crit_edge1847 ], [ %3938, %__nv_exp2f.exit1411 ], !dbg !77 + %4048 = phi float [ %2433, %._crit_edge1847 ], [ %3939, %__nv_exp2f.exit1411 ], !dbg !77 + %4049 = phi float [ %2434, %._crit_edge1847 ], [ %3940, %__nv_exp2f.exit1411 ], !dbg !77 + %4050 = phi float [ %2435, %._crit_edge1847 ], [ %3941, %__nv_exp2f.exit1411 ], !dbg !77 + %4051 = phi float [ %2436, %._crit_edge1847 ], [ %3942, %__nv_exp2f.exit1411 ], !dbg !77 + %4052 = phi float [ %2437, %._crit_edge1847 ], [ %3943, %__nv_exp2f.exit1411 ], !dbg !77 + %4053 = phi float [ %2438, %._crit_edge1847 ], [ %3944, %__nv_exp2f.exit1411 ], !dbg !77 + %4054 = phi float [ %2439, %._crit_edge1847 ], [ %3945, %__nv_exp2f.exit1411 ], !dbg !77 + %4055 = phi float [ %2440, %._crit_edge1847 ], [ %3946, %__nv_exp2f.exit1411 ], !dbg !77 + %4056 = phi float [ %2441, %._crit_edge1847 ], [ %3947, %__nv_exp2f.exit1411 ], !dbg !77 + %4057 = phi float [ %2442, %._crit_edge1847 ], [ %3948, %__nv_exp2f.exit1411 ], !dbg !77 + %4058 = phi float [ %2443, %._crit_edge1847 ], [ %3949, %__nv_exp2f.exit1411 ], !dbg !77 + %4059 = phi float [ %2444, %._crit_edge1847 ], [ %3950, %__nv_exp2f.exit1411 ], !dbg !77 + %4060 = phi float [ %2445, %._crit_edge1847 ], [ %3951, %__nv_exp2f.exit1411 ], !dbg !77 + %4061 = phi float [ %2446, %._crit_edge1847 ], [ %3952, %__nv_exp2f.exit1411 ], !dbg !77 + %4062 = phi float [ %2447, %._crit_edge1847 ], [ %3953, %__nv_exp2f.exit1411 ], !dbg !77 + %4063 = phi float [ %2448, %._crit_edge1847 ], [ %3954, %__nv_exp2f.exit1411 ], !dbg !77 + %4064 = phi float [ %2449, %._crit_edge1847 ], [ %3955, %__nv_exp2f.exit1411 ], !dbg !77 + %4065 = phi float [ %2450, %._crit_edge1847 ], [ %3956, %__nv_exp2f.exit1411 ], !dbg !77 + %4066 = phi float [ %2451, %._crit_edge1847 ], [ %3957, %__nv_exp2f.exit1411 ], !dbg !77 + %4067 = phi float [ %2452, %._crit_edge1847 ], [ %3958, %__nv_exp2f.exit1411 ], !dbg !77 + %4068 = phi float [ %2453, %._crit_edge1847 ], [ %3959, %__nv_exp2f.exit1411 ], !dbg !77 + %4069 = phi float [ %2454, %._crit_edge1847 ], [ %3960, %__nv_exp2f.exit1411 ], !dbg !77 + %4070 = phi float [ %2455, %._crit_edge1847 ], [ %3961, %__nv_exp2f.exit1411 ], !dbg !77 + %4071 = phi float [ %2456, %._crit_edge1847 ], [ %3962, %__nv_exp2f.exit1411 ], !dbg !77 + %4072 = phi float [ %2457, %._crit_edge1847 ], [ %3963, %__nv_exp2f.exit1411 ], !dbg !77 + %4073 = phi float [ %2458, %._crit_edge1847 ], [ %3964, %__nv_exp2f.exit1411 ], !dbg !77 + %4074 = phi float [ %2459, %._crit_edge1847 ], [ %3965, %__nv_exp2f.exit1411 ], !dbg !77 + %4075 = phi float [ %2460, %._crit_edge1847 ], [ %3966, %__nv_exp2f.exit1411 ], !dbg !77 + %4076 = phi float [ %2461, %._crit_edge1847 ], [ %3967, %__nv_exp2f.exit1411 ], !dbg !77 + %4077 = phi float [ %2462, %._crit_edge1847 ], [ %3968, %__nv_exp2f.exit1411 ], !dbg !77 + %4078 = phi float [ %2463, %._crit_edge1847 ], [ %3969, %__nv_exp2f.exit1411 ], !dbg !77 + %4079 = phi float [ %2464, %._crit_edge1847 ], [ %3970, %__nv_exp2f.exit1411 ], !dbg !77 + %4080 = phi float [ %2465, %._crit_edge1847 ], [ %3971, %__nv_exp2f.exit1411 ], !dbg !77 + %4081 = phi float [ %2466, %._crit_edge1847 ], [ %3972, %__nv_exp2f.exit1411 ], !dbg !77 + %4082 = phi float [ %2467, %._crit_edge1847 ], [ %3973, %__nv_exp2f.exit1411 ], !dbg !77 + %4083 = phi float [ %2468, %._crit_edge1847 ], [ %3974, %__nv_exp2f.exit1411 ], !dbg !77 + %4084 = phi float [ %2469, %._crit_edge1847 ], [ %3975, %__nv_exp2f.exit1411 ], !dbg !77 + %4085 = phi float [ %2470, %._crit_edge1847 ], [ %3976, %__nv_exp2f.exit1411 ], !dbg !77 + %4086 = phi float [ %2471, %._crit_edge1847 ], [ %3977, %__nv_exp2f.exit1411 ], !dbg !77 + %4087 = phi float [ %2472, %._crit_edge1847 ], [ %3978, %__nv_exp2f.exit1411 ], !dbg !77 + %4088 = phi float [ %2473, %._crit_edge1847 ], [ %3979, %__nv_exp2f.exit1411 ], !dbg !77 + %4089 = phi float [ %2474, %._crit_edge1847 ], [ %3980, %__nv_exp2f.exit1411 ], !dbg !77 + %4090 = phi float [ %2475, %._crit_edge1847 ], [ %3981, %__nv_exp2f.exit1411 ], !dbg !77 + %4091 = phi float [ %2476, %._crit_edge1847 ], [ %3982, %__nv_exp2f.exit1411 ], !dbg !77 + %4092 = phi float [ %2477, %._crit_edge1847 ], [ %3983, %__nv_exp2f.exit1411 ], !dbg !77 + %4093 = phi float [ %2478, %._crit_edge1847 ], [ %3984, %__nv_exp2f.exit1411 ], !dbg !77 + %4094 = phi float [ %2479, %._crit_edge1847 ], [ %3985, %__nv_exp2f.exit1411 ], !dbg !77 + %4095 = phi float [ %2480, %._crit_edge1847 ], [ %3986, %__nv_exp2f.exit1411 ], !dbg !77 + %4096 = phi float [ %2481, %._crit_edge1847 ], [ %3987, %__nv_exp2f.exit1411 ], !dbg !77 + %4097 = phi float [ %2482, %._crit_edge1847 ], [ %3988, %__nv_exp2f.exit1411 ], !dbg !77 + %4098 = phi float [ %2483, %._crit_edge1847 ], [ %3989, %__nv_exp2f.exit1411 ], !dbg !77 + %4099 = phi float [ %2484, %._crit_edge1847 ], [ %3990, %__nv_exp2f.exit1411 ], !dbg !77 + %4100 = phi float [ %2485, %._crit_edge1847 ], [ %3991, %__nv_exp2f.exit1411 ], !dbg !77 + %4101 = phi float [ %2486, %._crit_edge1847 ], [ %3992, %__nv_exp2f.exit1411 ], !dbg !77 + %4102 = phi float [ %2487, %._crit_edge1847 ], [ %3993, %__nv_exp2f.exit1411 ], !dbg !77 + %4103 = phi float [ %2488, %._crit_edge1847 ], [ %3994, %__nv_exp2f.exit1411 ], !dbg !77 + %4104 = phi float [ %2489, %._crit_edge1847 ], [ %3995, %__nv_exp2f.exit1411 ], !dbg !77 + %4105 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %4041, float %4042, float %4043, float %4044, float %4045, float %4046, float %4047, float %4048, float %4049, float %4050, float %4051, float %4052, float %4053, float %4054, float %4055, float %4056, float %4057, float %4058, float %4059, float %4060, float %4061, float %4062, float %4063, float %4064, float %4065, float %4066, float %4067, float %4068, float %4069, float %4070, float %4071, float %4072, float %4073, float %4074, float %4075, float %4076, float %4077, float %4078, float %4079, float %4080, float %4081, float %4082, float %4083, float %4084, float %4085, float %4086, float %4087, float %4088, float %4089, float %4090, float %4091, float %4092, float %4093, float %4094, float %4095, float %4096, float %4097, float %4098, float %4099, float %4100, float %4101, float %4102, float %4103, float %4104) #3, !dbg !149 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !149 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !149 + %4106 = getelementptr bfloat, ptr addrspace(1) %77, i64 %103, !dbg !177 + %4107 = getelementptr bfloat, ptr addrspace(1) %77, i64 %105, !dbg !177 + %4108 = getelementptr bfloat, ptr addrspace(1) %77, i64 %107, !dbg !177 + %4109 = getelementptr bfloat, ptr addrspace(1) %77, i64 %109, !dbg !177 + %4110 = getelementptr bfloat, ptr addrspace(1) %77, i64 %111, !dbg !177 + %4111 = getelementptr bfloat, ptr addrspace(1) %77, i64 %113, !dbg !177 + %4112 = getelementptr bfloat, ptr addrspace(1) %77, i64 %115, !dbg !177 + %4113 = getelementptr bfloat, ptr addrspace(1) %77, i64 %117, !dbg !177 + %4114 = getelementptr bfloat, ptr addrspace(1) %4106, i64 %121, !dbg !178 + %4115 = getelementptr bfloat, ptr addrspace(1) %4107, i64 %121, !dbg !178 + %4116 = getelementptr bfloat, ptr addrspace(1) %4108, i64 %121, !dbg !178 + %4117 = getelementptr bfloat, ptr addrspace(1) %4109, i64 %121, !dbg !178 + %4118 = getelementptr bfloat, ptr addrspace(1) %4110, i64 %121, !dbg !178 + %4119 = getelementptr bfloat, ptr addrspace(1) %4111, i64 %121, !dbg !178 + %4120 = getelementptr bfloat, ptr addrspace(1) %4112, i64 %121, !dbg !178 + %4121 = getelementptr bfloat, ptr addrspace(1) %4113, i64 %121, !dbg !178 + %4122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 0, !dbg !179 + %4123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 1, !dbg !179 + %4124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 2, !dbg !179 + %4125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 3, !dbg !179 + %4126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 4, !dbg !179 + %4127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 5, !dbg !179 + %4128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 6, !dbg !179 + %4129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 7, !dbg !179 + %4130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 8, !dbg !179 + %4131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 9, !dbg !179 + %4132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 10, !dbg !179 + %4133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 11, !dbg !179 + %4134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 12, !dbg !179 + %4135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 13, !dbg !179 + %4136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 14, !dbg !179 + %4137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 15, !dbg !179 + %4138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 16, !dbg !179 + %4139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 17, !dbg !179 + %4140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 18, !dbg !179 + %4141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 19, !dbg !179 + %4142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 20, !dbg !179 + %4143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 21, !dbg !179 + %4144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 22, !dbg !179 + %4145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 23, !dbg !179 + %4146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 24, !dbg !179 + %4147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 25, !dbg !179 + %4148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 26, !dbg !179 + %4149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 27, !dbg !179 + %4150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 28, !dbg !179 + %4151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 29, !dbg !179 + %4152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 30, !dbg !179 + %4153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 31, !dbg !179 + %4154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 32, !dbg !179 + %4155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 33, !dbg !179 + %4156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 34, !dbg !179 + %4157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 35, !dbg !179 + %4158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 36, !dbg !179 + %4159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 37, !dbg !179 + %4160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 38, !dbg !179 + %4161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 39, !dbg !179 + %4162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 40, !dbg !179 + %4163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 41, !dbg !179 + %4164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 42, !dbg !179 + %4165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 43, !dbg !179 + %4166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 44, !dbg !179 + %4167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 45, !dbg !179 + %4168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 46, !dbg !179 + %4169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 47, !dbg !179 + %4170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 48, !dbg !179 + %4171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 49, !dbg !179 + %4172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 50, !dbg !179 + %4173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 51, !dbg !179 + %4174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 52, !dbg !179 + %4175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 53, !dbg !179 + %4176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 54, !dbg !179 + %4177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 55, !dbg !179 + %4178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 56, !dbg !179 + %4179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 57, !dbg !179 + %4180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 58, !dbg !179 + %4181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 59, !dbg !179 + %4182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 60, !dbg !179 + %4183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 61, !dbg !179 + %4184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 62, !dbg !179 + %4185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4105, 63, !dbg !179 + %4186 = insertelement <2 x float> poison, float %4122, i64 0, !dbg !179 + %4187 = insertelement <2 x float> %4186, float %4123, i64 1, !dbg !179 + %4188 = fmul <2 x float> %4187, splat (float 0x3FB6A09E60000000), !dbg !179 + %4189 = fptrunc <2 x float> %4188 to <2 x bfloat>, !dbg !180 + %4190 = insertelement <2 x float> poison, float %4124, i64 0, !dbg !179 + %4191 = insertelement <2 x float> %4190, float %4125, i64 1, !dbg !179 + %4192 = fmul <2 x float> %4191, splat (float 0x3FB6A09E60000000), !dbg !179 + %4193 = fptrunc <2 x float> %4192 to <2 x bfloat>, !dbg !180 + %4194 = insertelement <2 x float> poison, float %4126, i64 0, !dbg !179 + %4195 = insertelement <2 x float> %4194, float %4127, i64 1, !dbg !179 + %4196 = fmul <2 x float> %4195, splat (float 0x3FB6A09E60000000), !dbg !179 + %4197 = fptrunc <2 x float> %4196 to <2 x bfloat>, !dbg !180 + %4198 = insertelement <2 x float> poison, float %4128, i64 0, !dbg !179 + %4199 = insertelement <2 x float> %4198, float %4129, i64 1, !dbg !179 + %4200 = fmul <2 x float> %4199, splat (float 0x3FB6A09E60000000), !dbg !179 + %4201 = fptrunc <2 x float> %4200 to <2 x bfloat>, !dbg !180 + %4202 = insertelement <2 x float> poison, float %4130, i64 0, !dbg !179 + %4203 = insertelement <2 x float> %4202, float %4131, i64 1, !dbg !179 + %4204 = fmul <2 x float> %4203, splat (float 0x3FB6A09E60000000), !dbg !179 + %4205 = fptrunc <2 x float> %4204 to <2 x bfloat>, !dbg !180 + %4206 = insertelement <2 x float> poison, float %4132, i64 0, !dbg !179 + %4207 = insertelement <2 x float> %4206, float %4133, i64 1, !dbg !179 + %4208 = fmul <2 x float> %4207, splat (float 0x3FB6A09E60000000), !dbg !179 + %4209 = fptrunc <2 x float> %4208 to <2 x bfloat>, !dbg !180 + %4210 = insertelement <2 x float> poison, float %4134, i64 0, !dbg !179 + %4211 = insertelement <2 x float> %4210, float %4135, i64 1, !dbg !179 + %4212 = fmul <2 x float> %4211, splat (float 0x3FB6A09E60000000), !dbg !179 + %4213 = fptrunc <2 x float> %4212 to <2 x bfloat>, !dbg !180 + %4214 = insertelement <2 x float> poison, float %4136, i64 0, !dbg !179 + %4215 = insertelement <2 x float> %4214, float %4137, i64 1, !dbg !179 + %4216 = fmul <2 x float> %4215, splat (float 0x3FB6A09E60000000), !dbg !179 + %4217 = fptrunc <2 x float> %4216 to <2 x bfloat>, !dbg !180 + %4218 = insertelement <2 x float> poison, float %4138, i64 0, !dbg !179 + %4219 = insertelement <2 x float> %4218, float %4139, i64 1, !dbg !179 + %4220 = fmul <2 x float> %4219, splat (float 0x3FB6A09E60000000), !dbg !179 + %4221 = fptrunc <2 x float> %4220 to <2 x bfloat>, !dbg !180 + %4222 = insertelement <2 x float> poison, float %4140, i64 0, !dbg !179 + %4223 = insertelement <2 x float> %4222, float %4141, i64 1, !dbg !179 + %4224 = fmul <2 x float> %4223, splat (float 0x3FB6A09E60000000), !dbg !179 + %4225 = fptrunc <2 x float> %4224 to <2 x bfloat>, !dbg !180 + %4226 = insertelement <2 x float> poison, float %4142, i64 0, !dbg !179 + %4227 = insertelement <2 x float> %4226, float %4143, i64 1, !dbg !179 + %4228 = fmul <2 x float> %4227, splat (float 0x3FB6A09E60000000), !dbg !179 + %4229 = fptrunc <2 x float> %4228 to <2 x bfloat>, !dbg !180 + %4230 = insertelement <2 x float> poison, float %4144, i64 0, !dbg !179 + %4231 = insertelement <2 x float> %4230, float %4145, i64 1, !dbg !179 + %4232 = fmul <2 x float> %4231, splat (float 0x3FB6A09E60000000), !dbg !179 + %4233 = fptrunc <2 x float> %4232 to <2 x bfloat>, !dbg !180 + %4234 = insertelement <2 x float> poison, float %4146, i64 0, !dbg !179 + %4235 = insertelement <2 x float> %4234, float %4147, i64 1, !dbg !179 + %4236 = fmul <2 x float> %4235, splat (float 0x3FB6A09E60000000), !dbg !179 + %4237 = fptrunc <2 x float> %4236 to <2 x bfloat>, !dbg !180 + %4238 = insertelement <2 x float> poison, float %4148, i64 0, !dbg !179 + %4239 = insertelement <2 x float> %4238, float %4149, i64 1, !dbg !179 + %4240 = fmul <2 x float> %4239, splat (float 0x3FB6A09E60000000), !dbg !179 + %4241 = fptrunc <2 x float> %4240 to <2 x bfloat>, !dbg !180 + %4242 = insertelement <2 x float> poison, float %4150, i64 0, !dbg !179 + %4243 = insertelement <2 x float> %4242, float %4151, i64 1, !dbg !179 + %4244 = fmul <2 x float> %4243, splat (float 0x3FB6A09E60000000), !dbg !179 + %4245 = fptrunc <2 x float> %4244 to <2 x bfloat>, !dbg !180 + %4246 = insertelement <2 x float> poison, float %4152, i64 0, !dbg !179 + %4247 = insertelement <2 x float> %4246, float %4153, i64 1, !dbg !179 + %4248 = fmul <2 x float> %4247, splat (float 0x3FB6A09E60000000), !dbg !179 + %4249 = fptrunc <2 x float> %4248 to <2 x bfloat>, !dbg !180 + %4250 = insertelement <2 x float> poison, float %4154, i64 0, !dbg !179 + %4251 = insertelement <2 x float> %4250, float %4155, i64 1, !dbg !179 + %4252 = fmul <2 x float> %4251, splat (float 0x3FB6A09E60000000), !dbg !179 + %4253 = fptrunc <2 x float> %4252 to <2 x bfloat>, !dbg !180 + %4254 = insertelement <2 x float> poison, float %4156, i64 0, !dbg !179 + %4255 = insertelement <2 x float> %4254, float %4157, i64 1, !dbg !179 + %4256 = fmul <2 x float> %4255, splat (float 0x3FB6A09E60000000), !dbg !179 + %4257 = fptrunc <2 x float> %4256 to <2 x bfloat>, !dbg !180 + %4258 = insertelement <2 x float> poison, float %4158, i64 0, !dbg !179 + %4259 = insertelement <2 x float> %4258, float %4159, i64 1, !dbg !179 + %4260 = fmul <2 x float> %4259, splat (float 0x3FB6A09E60000000), !dbg !179 + %4261 = fptrunc <2 x float> %4260 to <2 x bfloat>, !dbg !180 + %4262 = insertelement <2 x float> poison, float %4160, i64 0, !dbg !179 + %4263 = insertelement <2 x float> %4262, float %4161, i64 1, !dbg !179 + %4264 = fmul <2 x float> %4263, splat (float 0x3FB6A09E60000000), !dbg !179 + %4265 = fptrunc <2 x float> %4264 to <2 x bfloat>, !dbg !180 + %4266 = insertelement <2 x float> poison, float %4162, i64 0, !dbg !179 + %4267 = insertelement <2 x float> %4266, float %4163, i64 1, !dbg !179 + %4268 = fmul <2 x float> %4267, splat (float 0x3FB6A09E60000000), !dbg !179 + %4269 = fptrunc <2 x float> %4268 to <2 x bfloat>, !dbg !180 + %4270 = insertelement <2 x float> poison, float %4164, i64 0, !dbg !179 + %4271 = insertelement <2 x float> %4270, float %4165, i64 1, !dbg !179 + %4272 = fmul <2 x float> %4271, splat (float 0x3FB6A09E60000000), !dbg !179 + %4273 = fptrunc <2 x float> %4272 to <2 x bfloat>, !dbg !180 + %4274 = insertelement <2 x float> poison, float %4166, i64 0, !dbg !179 + %4275 = insertelement <2 x float> %4274, float %4167, i64 1, !dbg !179 + %4276 = fmul <2 x float> %4275, splat (float 0x3FB6A09E60000000), !dbg !179 + %4277 = fptrunc <2 x float> %4276 to <2 x bfloat>, !dbg !180 + %4278 = insertelement <2 x float> poison, float %4168, i64 0, !dbg !179 + %4279 = insertelement <2 x float> %4278, float %4169, i64 1, !dbg !179 + %4280 = fmul <2 x float> %4279, splat (float 0x3FB6A09E60000000), !dbg !179 + %4281 = fptrunc <2 x float> %4280 to <2 x bfloat>, !dbg !180 + %4282 = insertelement <2 x float> poison, float %4170, i64 0, !dbg !179 + %4283 = insertelement <2 x float> %4282, float %4171, i64 1, !dbg !179 + %4284 = fmul <2 x float> %4283, splat (float 0x3FB6A09E60000000), !dbg !179 + %4285 = fptrunc <2 x float> %4284 to <2 x bfloat>, !dbg !180 + %4286 = insertelement <2 x float> poison, float %4172, i64 0, !dbg !179 + %4287 = insertelement <2 x float> %4286, float %4173, i64 1, !dbg !179 + %4288 = fmul <2 x float> %4287, splat (float 0x3FB6A09E60000000), !dbg !179 + %4289 = fptrunc <2 x float> %4288 to <2 x bfloat>, !dbg !180 + %4290 = insertelement <2 x float> poison, float %4174, i64 0, !dbg !179 + %4291 = insertelement <2 x float> %4290, float %4175, i64 1, !dbg !179 + %4292 = fmul <2 x float> %4291, splat (float 0x3FB6A09E60000000), !dbg !179 + %4293 = fptrunc <2 x float> %4292 to <2 x bfloat>, !dbg !180 + %4294 = insertelement <2 x float> poison, float %4176, i64 0, !dbg !179 + %4295 = insertelement <2 x float> %4294, float %4177, i64 1, !dbg !179 + %4296 = fmul <2 x float> %4295, splat (float 0x3FB6A09E60000000), !dbg !179 + %4297 = fptrunc <2 x float> %4296 to <2 x bfloat>, !dbg !180 + %4298 = insertelement <2 x float> poison, float %4178, i64 0, !dbg !179 + %4299 = insertelement <2 x float> %4298, float %4179, i64 1, !dbg !179 + %4300 = fmul <2 x float> %4299, splat (float 0x3FB6A09E60000000), !dbg !179 + %4301 = fptrunc <2 x float> %4300 to <2 x bfloat>, !dbg !180 + %4302 = insertelement <2 x float> poison, float %4180, i64 0, !dbg !179 + %4303 = insertelement <2 x float> %4302, float %4181, i64 1, !dbg !179 + %4304 = fmul <2 x float> %4303, splat (float 0x3FB6A09E60000000), !dbg !179 + %4305 = fptrunc <2 x float> %4304 to <2 x bfloat>, !dbg !180 + %4306 = insertelement <2 x float> poison, float %4182, i64 0, !dbg !179 + %4307 = insertelement <2 x float> %4306, float %4183, i64 1, !dbg !179 + %4308 = fmul <2 x float> %4307, splat (float 0x3FB6A09E60000000), !dbg !179 + %4309 = fptrunc <2 x float> %4308 to <2 x bfloat>, !dbg !180 + %4310 = insertelement <2 x float> poison, float %4184, i64 0, !dbg !179 + %4311 = insertelement <2 x float> %4310, float %4185, i64 1, !dbg !179 + %4312 = fmul <2 x float> %4311, splat (float 0x3FB6A09E60000000), !dbg !179 + %4313 = fptrunc <2 x float> %4312 to <2 x bfloat>, !dbg !180 + %4314 = shl nuw nsw i32 %365, 13, !dbg !180 + %4315 = shl nuw nsw i32 %35, 5, !dbg !180 + %4316 = and i32 %4315, 7264, !dbg !180 + %4317 = and i32 %35, 24, !dbg !180 + %4318 = shl nuw nsw i32 %4317, 4, !dbg !180 + %4319 = shl nuw nsw i32 %35, 2, !dbg !180 + %4320 = and i32 %4319, 16, !dbg !180 + %4321 = or disjoint i32 %4314, %4320, !dbg !180 + %4322 = or disjoint i32 %4316, %4318, !dbg !180 + %4323 = or disjoint i32 %4321, %4322, !dbg !180 + %4324 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4323, !dbg !180 + %4325 = bitcast <2 x bfloat> %4189 to i32, !dbg !180 + %4326 = bitcast <2 x bfloat> %4197 to i32, !dbg !180 + %4327 = bitcast <2 x bfloat> %4205 to i32, !dbg !180 + %4328 = bitcast <2 x bfloat> %4213 to i32, !dbg !180 + %4329 = insertelement <4 x i32> poison, i32 %4325, i64 0, !dbg !180 + %4330 = insertelement <4 x i32> %4329, i32 %4326, i64 1, !dbg !180 + %4331 = insertelement <4 x i32> %4330, i32 %4327, i64 2, !dbg !180 + %4332 = insertelement <4 x i32> %4331, i32 %4328, i64 3, !dbg !180 + store <4 x i32> %4332, ptr addrspace(3) %4324, align 16, !dbg !180 + %4333 = getelementptr inbounds nuw i8, ptr addrspace(3) %4324, i32 512, !dbg !180 + %4334 = bitcast <2 x bfloat> %4193 to i32, !dbg !180 + %4335 = bitcast <2 x bfloat> %4201 to i32, !dbg !180 + %4336 = bitcast <2 x bfloat> %4209 to i32, !dbg !180 + %4337 = bitcast <2 x bfloat> %4217 to i32, !dbg !180 + %4338 = insertelement <4 x i32> poison, i32 %4334, i64 0, !dbg !180 + %4339 = insertelement <4 x i32> %4338, i32 %4335, i64 1, !dbg !180 + %4340 = insertelement <4 x i32> %4339, i32 %4336, i64 2, !dbg !180 + %4341 = insertelement <4 x i32> %4340, i32 %4337, i64 3, !dbg !180 + store <4 x i32> %4341, ptr addrspace(3) %4333, align 16, !dbg !180 + %4342 = xor i32 %4323, 32, !dbg !180 + %4343 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4342, !dbg !180 + %4344 = bitcast <2 x bfloat> %4221 to i32, !dbg !180 + %4345 = bitcast <2 x bfloat> %4229 to i32, !dbg !180 + %4346 = bitcast <2 x bfloat> %4237 to i32, !dbg !180 + %4347 = bitcast <2 x bfloat> %4245 to i32, !dbg !180 + %4348 = insertelement <4 x i32> poison, i32 %4344, i64 0, !dbg !180 + %4349 = insertelement <4 x i32> %4348, i32 %4345, i64 1, !dbg !180 + %4350 = insertelement <4 x i32> %4349, i32 %4346, i64 2, !dbg !180 + %4351 = insertelement <4 x i32> %4350, i32 %4347, i64 3, !dbg !180 + store <4 x i32> %4351, ptr addrspace(3) %4343, align 16, !dbg !180 + %4352 = getelementptr inbounds nuw i8, ptr addrspace(3) %4343, i32 512, !dbg !180 + %4353 = bitcast <2 x bfloat> %4225 to i32, !dbg !180 + %4354 = bitcast <2 x bfloat> %4233 to i32, !dbg !180 + %4355 = bitcast <2 x bfloat> %4241 to i32, !dbg !180 + %4356 = bitcast <2 x bfloat> %4249 to i32, !dbg !180 + %4357 = insertelement <4 x i32> poison, i32 %4353, i64 0, !dbg !180 + %4358 = insertelement <4 x i32> %4357, i32 %4354, i64 1, !dbg !180 + %4359 = insertelement <4 x i32> %4358, i32 %4355, i64 2, !dbg !180 + %4360 = insertelement <4 x i32> %4359, i32 %4356, i64 3, !dbg !180 + store <4 x i32> %4360, ptr addrspace(3) %4352, align 16, !dbg !180 + %4361 = xor i32 %4323, 64, !dbg !180 + %4362 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4361, !dbg !180 + %4363 = bitcast <2 x bfloat> %4253 to i32, !dbg !180 + %4364 = bitcast <2 x bfloat> %4261 to i32, !dbg !180 + %4365 = bitcast <2 x bfloat> %4269 to i32, !dbg !180 + %4366 = bitcast <2 x bfloat> %4277 to i32, !dbg !180 + %4367 = insertelement <4 x i32> poison, i32 %4363, i64 0, !dbg !180 + %4368 = insertelement <4 x i32> %4367, i32 %4364, i64 1, !dbg !180 + %4369 = insertelement <4 x i32> %4368, i32 %4365, i64 2, !dbg !180 + %4370 = insertelement <4 x i32> %4369, i32 %4366, i64 3, !dbg !180 + store <4 x i32> %4370, ptr addrspace(3) %4362, align 16, !dbg !180 + %4371 = getelementptr inbounds nuw i8, ptr addrspace(3) %4362, i32 512, !dbg !180 + %4372 = bitcast <2 x bfloat> %4257 to i32, !dbg !180 + %4373 = bitcast <2 x bfloat> %4265 to i32, !dbg !180 + %4374 = bitcast <2 x bfloat> %4273 to i32, !dbg !180 + %4375 = bitcast <2 x bfloat> %4281 to i32, !dbg !180 + %4376 = insertelement <4 x i32> poison, i32 %4372, i64 0, !dbg !180 + %4377 = insertelement <4 x i32> %4376, i32 %4373, i64 1, !dbg !180 + %4378 = insertelement <4 x i32> %4377, i32 %4374, i64 2, !dbg !180 + %4379 = insertelement <4 x i32> %4378, i32 %4375, i64 3, !dbg !180 + store <4 x i32> %4379, ptr addrspace(3) %4371, align 16, !dbg !180 + %4380 = xor i32 %4323, 96, !dbg !180 + %4381 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4380, !dbg !180 + %4382 = bitcast <2 x bfloat> %4285 to i32, !dbg !180 + %4383 = bitcast <2 x bfloat> %4293 to i32, !dbg !180 + %4384 = bitcast <2 x bfloat> %4301 to i32, !dbg !180 + %4385 = bitcast <2 x bfloat> %4309 to i32, !dbg !180 + %4386 = insertelement <4 x i32> poison, i32 %4382, i64 0, !dbg !180 + %4387 = insertelement <4 x i32> %4386, i32 %4383, i64 1, !dbg !180 + %4388 = insertelement <4 x i32> %4387, i32 %4384, i64 2, !dbg !180 + %4389 = insertelement <4 x i32> %4388, i32 %4385, i64 3, !dbg !180 + store <4 x i32> %4389, ptr addrspace(3) %4381, align 16, !dbg !180 + %4390 = getelementptr inbounds nuw i8, ptr addrspace(3) %4381, i32 512, !dbg !180 + %4391 = bitcast <2 x bfloat> %4289 to i32, !dbg !180 + %4392 = bitcast <2 x bfloat> %4297 to i32, !dbg !180 + %4393 = bitcast <2 x bfloat> %4305 to i32, !dbg !180 + %4394 = bitcast <2 x bfloat> %4313 to i32, !dbg !180 + %4395 = insertelement <4 x i32> poison, i32 %4391, i64 0, !dbg !180 + %4396 = insertelement <4 x i32> %4395, i32 %4392, i64 1, !dbg !180 + %4397 = insertelement <4 x i32> %4396, i32 %4393, i64 2, !dbg !180 + %4398 = insertelement <4 x i32> %4397, i32 %4394, i64 3, !dbg !180 + store <4 x i32> %4398, ptr addrspace(3) %4390, align 16, !dbg !180 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !180 + %4399 = shl nuw nsw i32 %4317, 10, !dbg !180 + %4400 = shl nuw nsw i32 %365, 5, !dbg !180 + %4401 = and i32 %4319, 1008, !dbg !180 + %4402 = or disjoint i32 %4399, %4400, !dbg !180 + %4403 = xor i32 %4402, %4401, !dbg !180 + %4404 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4403, !dbg !180 + %4405 = ptrtoint ptr addrspace(3) %4404 to i32, !dbg !180 + %4406 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4405) #3, !dbg !180 + %4407 = extractvalue { i32, i32, i32, i32 } %4406, 0, !dbg !180 + %4408 = extractvalue { i32, i32, i32, i32 } %4406, 1, !dbg !180 + %4409 = extractvalue { i32, i32, i32, i32 } %4406, 2, !dbg !180 + %4410 = extractvalue { i32, i32, i32, i32 } %4406, 3, !dbg !180 + %4411 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 1024, !dbg !180 + %4412 = ptrtoint ptr addrspace(3) %4411 to i32, !dbg !180 + %4413 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4412) #3, !dbg !180 + %4414 = extractvalue { i32, i32, i32, i32 } %4413, 0, !dbg !180 + %4415 = extractvalue { i32, i32, i32, i32 } %4413, 1, !dbg !180 + %4416 = extractvalue { i32, i32, i32, i32 } %4413, 2, !dbg !180 + %4417 = extractvalue { i32, i32, i32, i32 } %4413, 3, !dbg !180 + %4418 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 2048, !dbg !180 + %4419 = ptrtoint ptr addrspace(3) %4418 to i32, !dbg !180 + %4420 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4419) #3, !dbg !180 + %4421 = extractvalue { i32, i32, i32, i32 } %4420, 0, !dbg !180 + %4422 = extractvalue { i32, i32, i32, i32 } %4420, 1, !dbg !180 + %4423 = extractvalue { i32, i32, i32, i32 } %4420, 2, !dbg !180 + %4424 = extractvalue { i32, i32, i32, i32 } %4420, 3, !dbg !180 + %4425 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 3072, !dbg !180 + %4426 = ptrtoint ptr addrspace(3) %4425 to i32, !dbg !180 + %4427 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4426) #3, !dbg !180 + %4428 = extractvalue { i32, i32, i32, i32 } %4427, 0, !dbg !180 + %4429 = extractvalue { i32, i32, i32, i32 } %4427, 1, !dbg !180 + %4430 = extractvalue { i32, i32, i32, i32 } %4427, 2, !dbg !180 + %4431 = extractvalue { i32, i32, i32, i32 } %4427, 3, !dbg !180 + %4432 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 4096, !dbg !180 + %4433 = ptrtoint ptr addrspace(3) %4432 to i32, !dbg !180 + %4434 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4433) #3, !dbg !180 + %4435 = extractvalue { i32, i32, i32, i32 } %4434, 0, !dbg !180 + %4436 = extractvalue { i32, i32, i32, i32 } %4434, 1, !dbg !180 + %4437 = extractvalue { i32, i32, i32, i32 } %4434, 2, !dbg !180 + %4438 = extractvalue { i32, i32, i32, i32 } %4434, 3, !dbg !180 + %4439 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 5120, !dbg !180 + %4440 = ptrtoint ptr addrspace(3) %4439 to i32, !dbg !180 + %4441 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4440) #3, !dbg !180 + %4442 = extractvalue { i32, i32, i32, i32 } %4441, 0, !dbg !180 + %4443 = extractvalue { i32, i32, i32, i32 } %4441, 1, !dbg !180 + %4444 = extractvalue { i32, i32, i32, i32 } %4441, 2, !dbg !180 + %4445 = extractvalue { i32, i32, i32, i32 } %4441, 3, !dbg !180 + %4446 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 6144, !dbg !180 + %4447 = ptrtoint ptr addrspace(3) %4446 to i32, !dbg !180 + %4448 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4447) #3, !dbg !180 + %4449 = extractvalue { i32, i32, i32, i32 } %4448, 0, !dbg !180 + %4450 = extractvalue { i32, i32, i32, i32 } %4448, 1, !dbg !180 + %4451 = extractvalue { i32, i32, i32, i32 } %4448, 2, !dbg !180 + %4452 = extractvalue { i32, i32, i32, i32 } %4448, 3, !dbg !180 + %4453 = getelementptr inbounds nuw i8, ptr addrspace(3) %4404, i32 7168, !dbg !180 + %4454 = ptrtoint ptr addrspace(3) %4453 to i32, !dbg !180 + %4455 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4454) #3, !dbg !180 + %4456 = extractvalue { i32, i32, i32, i32 } %4455, 0, !dbg !180 + %4457 = extractvalue { i32, i32, i32, i32 } %4455, 1, !dbg !180 + %4458 = extractvalue { i32, i32, i32, i32 } %4455, 2, !dbg !180 + %4459 = extractvalue { i32, i32, i32, i32 } %4455, 3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4407, i32 %4408, i32 %4409, i32 %4410, ptr addrspace(1) %4114) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4414, i32 %4415, i32 %4416, i32 %4417, ptr addrspace(1) %4115) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4421, i32 %4422, i32 %4423, i32 %4424, ptr addrspace(1) %4116) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4428, i32 %4429, i32 %4430, i32 %4431, ptr addrspace(1) %4117) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4435, i32 %4436, i32 %4437, i32 %4438, ptr addrspace(1) %4118) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4442, i32 %4443, i32 %4444, i32 %4445, ptr addrspace(1) %4119) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4449, i32 %4450, i32 %4451, i32 %4452, ptr addrspace(1) %4120) #3, !dbg !180 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %4456, i32 %4457, i32 %4458, i32 %4459, ptr addrspace(1) %4121) #3, !dbg !180 + br label %10331, !dbg !24 + +4460: ; preds = %20 + %4461 = shl nuw nsw i32 %21, 7, !dbg !181 + %4462 = or disjoint i32 %38, %4461, !dbg !182 + %4463 = or disjoint i32 %39, %4461, !dbg !182 + %4464 = or disjoint i32 %40, %4461, !dbg !182 + %4465 = or disjoint i32 %41, %4461, !dbg !182 + %4466 = or disjoint i32 %42, %4461, !dbg !182 + %4467 = or disjoint i32 %43, %4461, !dbg !182 + %4468 = or disjoint i32 %44, %4461, !dbg !182 + %4469 = or disjoint i32 %45, %4461, !dbg !182 + %4470 = or disjoint i32 %50, %4461, !dbg !182 + %4471 = or disjoint i32 %51, %4461, !dbg !182 + %4472 = shl nuw nsw i32 %4462, 7, !dbg !183 + %4473 = shl nuw nsw i32 %4463, 7, !dbg !183 + %4474 = shl nuw nsw i32 %4464, 7, !dbg !183 + %4475 = shl nuw nsw i32 %4465, 7, !dbg !183 + %4476 = shl nuw nsw i32 %4466, 7, !dbg !183 + %4477 = shl nuw nsw i32 %4467, 7, !dbg !183 + %4478 = shl nuw nsw i32 %4468, 7, !dbg !183 + %4479 = shl nuw nsw i32 %4469, 7, !dbg !183 + %4480 = zext nneg i32 %4472 to i64, !dbg !185 + %4481 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4480, !dbg !185 + %4482 = zext nneg i32 %4473 to i64, !dbg !185 + %4483 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4482, !dbg !185 + %4484 = zext nneg i32 %4474 to i64, !dbg !185 + %4485 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4484, !dbg !185 + %4486 = zext nneg i32 %4475 to i64, !dbg !185 + %4487 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4486, !dbg !185 + %4488 = zext nneg i32 %4476 to i64, !dbg !185 + %4489 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4488, !dbg !185 + %4490 = zext nneg i32 %4477 to i64, !dbg !185 + %4491 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4490, !dbg !185 + %4492 = zext nneg i32 %4478 to i64, !dbg !185 + %4493 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4492, !dbg !185 + %4494 = zext nneg i32 %4479 to i64, !dbg !185 + %4495 = getelementptr bfloat, ptr addrspace(1) %32, i64 %4494, !dbg !185 + %4496 = shl nuw nsw i32 %35, 3, !dbg !186 + %4497 = and i32 %4496, 120, !dbg !186 + %4498 = zext nneg i32 %4497 to i64, !dbg !187 + %4499 = getelementptr bfloat, ptr addrspace(1) %4481, i64 %4498, !dbg !187 + %4500 = getelementptr bfloat, ptr addrspace(1) %4483, i64 %4498, !dbg !187 + %4501 = getelementptr bfloat, ptr addrspace(1) %4485, i64 %4498, !dbg !187 + %4502 = getelementptr bfloat, ptr addrspace(1) %4487, i64 %4498, !dbg !187 + %4503 = getelementptr bfloat, ptr addrspace(1) %4489, i64 %4498, !dbg !187 + %4504 = getelementptr bfloat, ptr addrspace(1) %4491, i64 %4498, !dbg !187 + %4505 = getelementptr bfloat, ptr addrspace(1) %4493, i64 %4498, !dbg !187 + %4506 = getelementptr bfloat, ptr addrspace(1) %4495, i64 %4498, !dbg !187 + %4507 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4499) #3, !dbg !188 + %4508 = extractvalue { i32, i32, i32, i32 } %4507, 0, !dbg !188 + %4509 = extractvalue { i32, i32, i32, i32 } %4507, 1, !dbg !188 + %4510 = extractvalue { i32, i32, i32, i32 } %4507, 2, !dbg !188 + %4511 = extractvalue { i32, i32, i32, i32 } %4507, 3, !dbg !188 + %4512 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4500) #3, !dbg !188 + %4513 = extractvalue { i32, i32, i32, i32 } %4512, 0, !dbg !188 + %4514 = extractvalue { i32, i32, i32, i32 } %4512, 1, !dbg !188 + %4515 = extractvalue { i32, i32, i32, i32 } %4512, 2, !dbg !188 + %4516 = extractvalue { i32, i32, i32, i32 } %4512, 3, !dbg !188 + %4517 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4501) #3, !dbg !188 + %4518 = extractvalue { i32, i32, i32, i32 } %4517, 0, !dbg !188 + %4519 = extractvalue { i32, i32, i32, i32 } %4517, 1, !dbg !188 + %4520 = extractvalue { i32, i32, i32, i32 } %4517, 2, !dbg !188 + %4521 = extractvalue { i32, i32, i32, i32 } %4517, 3, !dbg !188 + %4522 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4502) #3, !dbg !188 + %4523 = extractvalue { i32, i32, i32, i32 } %4522, 0, !dbg !188 + %4524 = extractvalue { i32, i32, i32, i32 } %4522, 1, !dbg !188 + %4525 = extractvalue { i32, i32, i32, i32 } %4522, 2, !dbg !188 + %4526 = extractvalue { i32, i32, i32, i32 } %4522, 3, !dbg !188 + %4527 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4503) #3, !dbg !188 + %4528 = extractvalue { i32, i32, i32, i32 } %4527, 0, !dbg !188 + %4529 = extractvalue { i32, i32, i32, i32 } %4527, 1, !dbg !188 + %4530 = extractvalue { i32, i32, i32, i32 } %4527, 2, !dbg !188 + %4531 = extractvalue { i32, i32, i32, i32 } %4527, 3, !dbg !188 + %4532 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4504) #3, !dbg !188 + %4533 = extractvalue { i32, i32, i32, i32 } %4532, 0, !dbg !188 + %4534 = extractvalue { i32, i32, i32, i32 } %4532, 1, !dbg !188 + %4535 = extractvalue { i32, i32, i32, i32 } %4532, 2, !dbg !188 + %4536 = extractvalue { i32, i32, i32, i32 } %4532, 3, !dbg !188 + %4537 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4505) #3, !dbg !188 + %4538 = extractvalue { i32, i32, i32, i32 } %4537, 0, !dbg !188 + %4539 = extractvalue { i32, i32, i32, i32 } %4537, 1, !dbg !188 + %4540 = extractvalue { i32, i32, i32, i32 } %4537, 2, !dbg !188 + %4541 = extractvalue { i32, i32, i32, i32 } %4537, 3, !dbg !188 + %4542 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4506) #3, !dbg !188 + %4543 = extractvalue { i32, i32, i32, i32 } %4542, 0, !dbg !188 + %4544 = extractvalue { i32, i32, i32, i32 } %4542, 1, !dbg !188 + %4545 = extractvalue { i32, i32, i32, i32 } %4542, 2, !dbg !188 + %4546 = extractvalue { i32, i32, i32, i32 } %4542, 3, !dbg !188 + %4547 = shl nuw nsw i32 %35, 4, !dbg !188 + %4548 = and i32 %4547, 112, !dbg !188 + %4549 = shl nuw nsw i32 %37, 3, !dbg !188 + %4550 = and i32 %35, 112, !dbg !188 + %4551 = and i32 %35, 8, !dbg !188 + %4552 = shl nuw nsw i32 %4551, 11, !dbg !188 + %4553 = or disjoint i32 %4548, %4549, !dbg !188 + %4554 = xor i32 %4553, %4550, !dbg !188 + %4555 = or disjoint i32 %4554, %4552, !dbg !188 + %4556 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4555, !dbg !188 + %4557 = insertelement <4 x i32> poison, i32 %4508, i64 0, !dbg !188 + %4558 = insertelement <4 x i32> %4557, i32 %4509, i64 1, !dbg !188 + %4559 = insertelement <4 x i32> %4558, i32 %4510, i64 2, !dbg !188 + %4560 = insertelement <4 x i32> %4559, i32 %4511, i64 3, !dbg !188 + store <4 x i32> %4560, ptr addrspace(3) %4556, align 16, !dbg !188 + %4561 = or disjoint i32 %4555, 2048, !dbg !188 + %4562 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4561, !dbg !188 + %4563 = insertelement <4 x i32> poison, i32 %4513, i64 0, !dbg !188 + %4564 = insertelement <4 x i32> %4563, i32 %4514, i64 1, !dbg !188 + %4565 = insertelement <4 x i32> %4564, i32 %4515, i64 2, !dbg !188 + %4566 = insertelement <4 x i32> %4565, i32 %4516, i64 3, !dbg !188 + store <4 x i32> %4566, ptr addrspace(3) %4562, align 16, !dbg !188 + %4567 = or disjoint i32 %4555, 4096, !dbg !188 + %4568 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4567, !dbg !188 + %4569 = insertelement <4 x i32> poison, i32 %4518, i64 0, !dbg !188 + %4570 = insertelement <4 x i32> %4569, i32 %4519, i64 1, !dbg !188 + %4571 = insertelement <4 x i32> %4570, i32 %4520, i64 2, !dbg !188 + %4572 = insertelement <4 x i32> %4571, i32 %4521, i64 3, !dbg !188 + store <4 x i32> %4572, ptr addrspace(3) %4568, align 16, !dbg !188 + %4573 = or disjoint i32 %4555, 6144, !dbg !188 + %4574 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4573, !dbg !188 + %4575 = insertelement <4 x i32> poison, i32 %4523, i64 0, !dbg !188 + %4576 = insertelement <4 x i32> %4575, i32 %4524, i64 1, !dbg !188 + %4577 = insertelement <4 x i32> %4576, i32 %4525, i64 2, !dbg !188 + %4578 = insertelement <4 x i32> %4577, i32 %4526, i64 3, !dbg !188 + store <4 x i32> %4578, ptr addrspace(3) %4574, align 16, !dbg !188 + %4579 = or disjoint i32 %4555, 8192, !dbg !188 + %4580 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4579, !dbg !188 + %4581 = insertelement <4 x i32> poison, i32 %4528, i64 0, !dbg !188 + %4582 = insertelement <4 x i32> %4581, i32 %4529, i64 1, !dbg !188 + %4583 = insertelement <4 x i32> %4582, i32 %4530, i64 2, !dbg !188 + %4584 = insertelement <4 x i32> %4583, i32 %4531, i64 3, !dbg !188 + store <4 x i32> %4584, ptr addrspace(3) %4580, align 16, !dbg !188 + %4585 = or disjoint i32 %4555, 10240, !dbg !188 + %4586 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4585, !dbg !188 + %4587 = insertelement <4 x i32> poison, i32 %4533, i64 0, !dbg !188 + %4588 = insertelement <4 x i32> %4587, i32 %4534, i64 1, !dbg !188 + %4589 = insertelement <4 x i32> %4588, i32 %4535, i64 2, !dbg !188 + %4590 = insertelement <4 x i32> %4589, i32 %4536, i64 3, !dbg !188 + store <4 x i32> %4590, ptr addrspace(3) %4586, align 16, !dbg !188 + %4591 = or disjoint i32 %4555, 12288, !dbg !188 + %4592 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4591, !dbg !188 + %4593 = insertelement <4 x i32> poison, i32 %4538, i64 0, !dbg !188 + %4594 = insertelement <4 x i32> %4593, i32 %4539, i64 1, !dbg !188 + %4595 = insertelement <4 x i32> %4594, i32 %4540, i64 2, !dbg !188 + %4596 = insertelement <4 x i32> %4595, i32 %4541, i64 3, !dbg !188 + store <4 x i32> %4596, ptr addrspace(3) %4592, align 16, !dbg !188 + %4597 = or disjoint i32 %4555, 14336, !dbg !188 + %4598 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 %4597, !dbg !188 + %4599 = insertelement <4 x i32> poison, i32 %4543, i64 0, !dbg !188 + %4600 = insertelement <4 x i32> %4599, i32 %4544, i64 1, !dbg !188 + %4601 = insertelement <4 x i32> %4600, i32 %4545, i64 2, !dbg !188 + %4602 = insertelement <4 x i32> %4601, i32 %4546, i64 3, !dbg !188 + store <4 x i32> %4602, ptr addrspace(3) %4598, align 16, !dbg !188 + %4603 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4480, !dbg !189 + %4604 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4482, !dbg !189 + %4605 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4484, !dbg !189 + %4606 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4486, !dbg !189 + %4607 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4488, !dbg !189 + %4608 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4490, !dbg !189 + %4609 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4492, !dbg !189 + %4610 = getelementptr bfloat, ptr addrspace(1) %33, i64 %4494, !dbg !189 + %4611 = getelementptr bfloat, ptr addrspace(1) %4603, i64 %4498, !dbg !191 + %4612 = getelementptr bfloat, ptr addrspace(1) %4604, i64 %4498, !dbg !191 + %4613 = getelementptr bfloat, ptr addrspace(1) %4605, i64 %4498, !dbg !191 + %4614 = getelementptr bfloat, ptr addrspace(1) %4606, i64 %4498, !dbg !191 + %4615 = getelementptr bfloat, ptr addrspace(1) %4607, i64 %4498, !dbg !191 + %4616 = getelementptr bfloat, ptr addrspace(1) %4608, i64 %4498, !dbg !191 + %4617 = getelementptr bfloat, ptr addrspace(1) %4609, i64 %4498, !dbg !191 + %4618 = getelementptr bfloat, ptr addrspace(1) %4610, i64 %4498, !dbg !191 + %4619 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4611) #3, !dbg !192 + %4620 = extractvalue { i32, i32, i32, i32 } %4619, 0, !dbg !192 + %4621 = extractvalue { i32, i32, i32, i32 } %4619, 1, !dbg !192 + %4622 = extractvalue { i32, i32, i32, i32 } %4619, 2, !dbg !192 + %4623 = extractvalue { i32, i32, i32, i32 } %4619, 3, !dbg !192 + %4624 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4612) #3, !dbg !192 + %4625 = extractvalue { i32, i32, i32, i32 } %4624, 0, !dbg !192 + %4626 = extractvalue { i32, i32, i32, i32 } %4624, 1, !dbg !192 + %4627 = extractvalue { i32, i32, i32, i32 } %4624, 2, !dbg !192 + %4628 = extractvalue { i32, i32, i32, i32 } %4624, 3, !dbg !192 + %4629 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4613) #3, !dbg !192 + %4630 = extractvalue { i32, i32, i32, i32 } %4629, 0, !dbg !192 + %4631 = extractvalue { i32, i32, i32, i32 } %4629, 1, !dbg !192 + %4632 = extractvalue { i32, i32, i32, i32 } %4629, 2, !dbg !192 + %4633 = extractvalue { i32, i32, i32, i32 } %4629, 3, !dbg !192 + %4634 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4614) #3, !dbg !192 + %4635 = extractvalue { i32, i32, i32, i32 } %4634, 0, !dbg !192 + %4636 = extractvalue { i32, i32, i32, i32 } %4634, 1, !dbg !192 + %4637 = extractvalue { i32, i32, i32, i32 } %4634, 2, !dbg !192 + %4638 = extractvalue { i32, i32, i32, i32 } %4634, 3, !dbg !192 + %4639 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4615) #3, !dbg !192 + %4640 = extractvalue { i32, i32, i32, i32 } %4639, 0, !dbg !192 + %4641 = extractvalue { i32, i32, i32, i32 } %4639, 1, !dbg !192 + %4642 = extractvalue { i32, i32, i32, i32 } %4639, 2, !dbg !192 + %4643 = extractvalue { i32, i32, i32, i32 } %4639, 3, !dbg !192 + %4644 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4616) #3, !dbg !192 + %4645 = extractvalue { i32, i32, i32, i32 } %4644, 0, !dbg !192 + %4646 = extractvalue { i32, i32, i32, i32 } %4644, 1, !dbg !192 + %4647 = extractvalue { i32, i32, i32, i32 } %4644, 2, !dbg !192 + %4648 = extractvalue { i32, i32, i32, i32 } %4644, 3, !dbg !192 + %4649 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4617) #3, !dbg !192 + %4650 = extractvalue { i32, i32, i32, i32 } %4649, 0, !dbg !192 + %4651 = extractvalue { i32, i32, i32, i32 } %4649, 1, !dbg !192 + %4652 = extractvalue { i32, i32, i32, i32 } %4649, 2, !dbg !192 + %4653 = extractvalue { i32, i32, i32, i32 } %4649, 3, !dbg !192 + %4654 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l"(ptr addrspace(1) %4618) #3, !dbg !192 + %4655 = extractvalue { i32, i32, i32, i32 } %4654, 0, !dbg !192 + %4656 = extractvalue { i32, i32, i32, i32 } %4654, 1, !dbg !192 + %4657 = extractvalue { i32, i32, i32, i32 } %4654, 2, !dbg !192 + %4658 = extractvalue { i32, i32, i32, i32 } %4654, 3, !dbg !192 + %4659 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4555, !dbg !192 + %4660 = insertelement <4 x i32> poison, i32 %4620, i64 0, !dbg !192 + %4661 = insertelement <4 x i32> %4660, i32 %4621, i64 1, !dbg !192 + %4662 = insertelement <4 x i32> %4661, i32 %4622, i64 2, !dbg !192 + %4663 = insertelement <4 x i32> %4662, i32 %4623, i64 3, !dbg !192 + store <4 x i32> %4663, ptr addrspace(3) %4659, align 16, !dbg !192 + %4664 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4561, !dbg !192 + %4665 = insertelement <4 x i32> poison, i32 %4625, i64 0, !dbg !192 + %4666 = insertelement <4 x i32> %4665, i32 %4626, i64 1, !dbg !192 + %4667 = insertelement <4 x i32> %4666, i32 %4627, i64 2, !dbg !192 + %4668 = insertelement <4 x i32> %4667, i32 %4628, i64 3, !dbg !192 + store <4 x i32> %4668, ptr addrspace(3) %4664, align 16, !dbg !192 + %4669 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4567, !dbg !192 + %4670 = insertelement <4 x i32> poison, i32 %4630, i64 0, !dbg !192 + %4671 = insertelement <4 x i32> %4670, i32 %4631, i64 1, !dbg !192 + %4672 = insertelement <4 x i32> %4671, i32 %4632, i64 2, !dbg !192 + %4673 = insertelement <4 x i32> %4672, i32 %4633, i64 3, !dbg !192 + store <4 x i32> %4673, ptr addrspace(3) %4669, align 16, !dbg !192 + %4674 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4573, !dbg !192 + %4675 = insertelement <4 x i32> poison, i32 %4635, i64 0, !dbg !192 + %4676 = insertelement <4 x i32> %4675, i32 %4636, i64 1, !dbg !192 + %4677 = insertelement <4 x i32> %4676, i32 %4637, i64 2, !dbg !192 + %4678 = insertelement <4 x i32> %4677, i32 %4638, i64 3, !dbg !192 + store <4 x i32> %4678, ptr addrspace(3) %4674, align 16, !dbg !192 + %4679 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4579, !dbg !192 + %4680 = insertelement <4 x i32> poison, i32 %4640, i64 0, !dbg !192 + %4681 = insertelement <4 x i32> %4680, i32 %4641, i64 1, !dbg !192 + %4682 = insertelement <4 x i32> %4681, i32 %4642, i64 2, !dbg !192 + %4683 = insertelement <4 x i32> %4682, i32 %4643, i64 3, !dbg !192 + store <4 x i32> %4683, ptr addrspace(3) %4679, align 16, !dbg !192 + %4684 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4585, !dbg !192 + %4685 = insertelement <4 x i32> poison, i32 %4645, i64 0, !dbg !192 + %4686 = insertelement <4 x i32> %4685, i32 %4646, i64 1, !dbg !192 + %4687 = insertelement <4 x i32> %4686, i32 %4647, i64 2, !dbg !192 + %4688 = insertelement <4 x i32> %4687, i32 %4648, i64 3, !dbg !192 + store <4 x i32> %4688, ptr addrspace(3) %4684, align 16, !dbg !192 + %4689 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4591, !dbg !192 + %4690 = insertelement <4 x i32> poison, i32 %4650, i64 0, !dbg !192 + %4691 = insertelement <4 x i32> %4690, i32 %4651, i64 1, !dbg !192 + %4692 = insertelement <4 x i32> %4691, i32 %4652, i64 2, !dbg !192 + %4693 = insertelement <4 x i32> %4692, i32 %4653, i64 3, !dbg !192 + store <4 x i32> %4693, ptr addrspace(3) %4689, align 16, !dbg !192 + %4694 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 %4597, !dbg !192 + %4695 = insertelement <4 x i32> poison, i32 %4655, i64 0, !dbg !192 + %4696 = insertelement <4 x i32> %4695, i32 %4656, i64 1, !dbg !192 + %4697 = insertelement <4 x i32> %4696, i32 %4657, i64 2, !dbg !192 + %4698 = insertelement <4 x i32> %4697, i32 %4658, i64 3, !dbg !192 + store <4 x i32> %4698, ptr addrspace(3) %4694, align 16, !dbg !192 + %4699 = shl nuw nsw i32 %23, 2, !dbg !193 + %4700 = shl i32 %22, 23, !dbg !194 + %4701 = shl nuw nsw i32 %24, 4, !dbg !195 + %4702 = or disjoint i32 %4701, %21, !dbg !196 + %4703 = shl nuw nsw i32 %24, 8, !dbg !197 + %4704 = shl nuw nsw i32 %21, 4, !dbg !198 + %4705 = or disjoint i32 %4703, %4704, !dbg !199 + %4706 = zext nneg i32 %4705 to i64, !dbg !200 + %4707 = getelementptr i32, ptr addrspace(1) %11, i64 %4706, !dbg !200 + %4708 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4707, i1 true) #3, !dbg !201 + %4709 = shl i32 %4708, 7, !dbg !202 + %4710 = zext nneg i32 %4702 to i64, !dbg !203 + %4711 = getelementptr i32, ptr addrspace(1) %10, i64 %4710, !dbg !203 + %4712 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4711, i1 true) #3, !dbg !204 + %4713 = and i32 %35, 3, !dbg !205 + %4714 = shl nuw nsw i32 %4713, 1, !dbg !205 + %4715 = or disjoint i32 %4714, 8, !dbg !205 + %4716 = or disjoint i32 %4714, 16, !dbg !205 + %4717 = or disjoint i32 %4714, 24, !dbg !205 + %4718 = or disjoint i32 %4714, 32, !dbg !205 + %4719 = or disjoint i32 %4714, 40, !dbg !205 + %4720 = or disjoint i32 %4714, 48, !dbg !205 + %4721 = or disjoint i32 %4714, 56, !dbg !205 + %4722 = or disjoint i32 %4709, %38, !dbg !206 + %4723 = or disjoint i32 %4709, %39, !dbg !206 + %4724 = or disjoint i32 %4709, %40, !dbg !206 + %4725 = or disjoint i32 %4709, %41, !dbg !206 + %4726 = or disjoint i32 %4709, %4714, !dbg !206 + %4727 = or disjoint i32 %4726, 1, !dbg !206 + %4728 = or disjoint i32 %4709, %4715, !dbg !206 + %4729 = or disjoint i32 %4726, 9, !dbg !206 + %4730 = or disjoint i32 %4709, %4716, !dbg !206 + %4731 = or disjoint i32 %4726, 17, !dbg !206 + %4732 = or disjoint i32 %4709, %4717, !dbg !206 + %4733 = or disjoint i32 %4726, 25, !dbg !206 + %4734 = or disjoint i32 %4709, %4718, !dbg !206 + %4735 = or disjoint i32 %4726, 33, !dbg !206 + %4736 = or disjoint i32 %4709, %4719, !dbg !206 + %4737 = or disjoint i32 %4726, 41, !dbg !206 + %4738 = or disjoint i32 %4709, %4720, !dbg !206 + %4739 = or disjoint i32 %4726, 49, !dbg !206 + %4740 = or disjoint i32 %4709, %4721, !dbg !206 + %4741 = or disjoint i32 %4726, 57, !dbg !206 + %4742 = shl i32 %4722, 12, !dbg !207 + %4743 = shl i32 %4723, 12, !dbg !207 + %4744 = shl i32 %4724, 12, !dbg !207 + %4745 = shl i32 %4725, 12, !dbg !207 + %4746 = shl i32 %4722, 7, !dbg !209 + %4747 = shl i32 %4723, 7, !dbg !209 + %4748 = shl i32 %4724, 7, !dbg !209 + %4749 = shl i32 %4725, 7, !dbg !209 + %4750 = shl i32 %4712, 1, !dbg !210 + %4751 = tail call i32 @llvm.smin.i32(i32 %4750, i32 32), !dbg !211 + %4752 = zext nneg i32 %22 to i64, !dbg !212 + %4753 = getelementptr i64, ptr addrspace(1) %16, i64 %4752, !dbg !212 + %4754 = icmp sgt i32 %4750, 0, !dbg !213 + %4755 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %4753, i1 %4754) #3, !dbg !214 + %4756 = getelementptr i32, ptr addrspace(1) %15, i64 %4706, !dbg !215 + %4757 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4756, i1 true) #3, !dbg !216 + %4758 = shl i32 %4757, 7, !dbg !217 + %4759 = getelementptr i32, ptr addrspace(1) %14, i64 %4710, !dbg !218 + %4760 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4759, i1 true) #3, !dbg !219 + %4761 = or disjoint i32 %4758, %38, !dbg !220 + %4762 = or disjoint i32 %4758, %39, !dbg !220 + %4763 = or disjoint i32 %4758, %40, !dbg !220 + %4764 = or disjoint i32 %4758, %41, !dbg !220 + %4765 = or disjoint i32 %4758, %4714, !dbg !220 + %4766 = or disjoint i32 %4758, %4715, !dbg !220 + %4767 = or disjoint i32 %4758, %4716, !dbg !220 + %4768 = or disjoint i32 %4758, %4717, !dbg !220 + %4769 = or disjoint i32 %4758, %4718, !dbg !220 + %4770 = or disjoint i32 %4758, %4719, !dbg !220 + %4771 = or disjoint i32 %4758, %4720, !dbg !220 + %4772 = or disjoint i32 %4758, %4721, !dbg !220 + %4773 = shl i32 %4761, 12, !dbg !221 + %4774 = shl i32 %4762, 12, !dbg !221 + %4775 = shl i32 %4763, 12, !dbg !221 + %4776 = shl i32 %4764, 12, !dbg !221 + %4777 = shl i32 %4761, 7, !dbg !223 + %4778 = shl i32 %4762, 7, !dbg !223 + %4779 = shl i32 %4763, 7, !dbg !223 + %4780 = shl i32 %4764, 7, !dbg !223 + %4781 = shl i32 %4760, 1, !dbg !224 + %4782 = tail call i32 @llvm.smin.i32(i32 %4781, i32 32), !dbg !225 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !226 + %4783 = shl nuw i32 %22, 16 + %4784 = sext i32 %4742 to i64 + %4785 = sext i32 %4743 to i64 + %4786 = sext i32 %4744 to i64 + %4787 = sext i32 %4745 to i64 + %4788 = sext i32 %4746 to i64 + %4789 = sext i32 %4747 to i64 + %4790 = sext i32 %4748 to i64 + %4791 = sext i32 %4749 to i64 + %4792 = shl nuw nsw i32 %4551, 10 + %4793 = or disjoint i32 %4554, %4792 + %4794 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4793 + %4795 = select i1 %4754, i32 16, i32 0 + %4796 = or disjoint i32 %4793, 2048 + %4797 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4796 + %4798 = or disjoint i32 %4793, 4096 + %4799 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4798 + %4800 = or disjoint i32 %4793, 6144 + %4801 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4800 + %4802 = sext i32 %4726 to i64 + %4803 = sext i32 %4728 to i64 + %4804 = sext i32 %4730 to i64 + %4805 = sext i32 %4732 to i64 + %4806 = sext i32 %4734 to i64 + %4807 = sext i32 %4736 to i64 + %4808 = sext i32 %4738 to i64 + %4809 = sext i32 %4740 to i64 + %4810 = and i32 %35, 252 + %4811 = icmp eq i32 %4810, 0 + %4812 = shl nuw nsw i32 %4713, 3 + %4813 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4812 + %4814 = select i1 %4754, i32 8, i32 0 + %4815 = or disjoint i32 %4812, 32 + %4816 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4815 + %4817 = or disjoint i32 %4812, 64 + %4818 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4817 + %4819 = or disjoint i32 %4812, 96 + %4820 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4819 + %4821 = or disjoint i32 %4812, 128 + %4822 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4821 + %4823 = or disjoint i32 %4812, 160 + %4824 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4823 + %4825 = or disjoint i32 %4812, 192 + %4826 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4825 + %4827 = or disjoint i32 %4812, 224 + %4828 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %4827 + %4829 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4793 + %4830 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4796 + %4831 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4798 + %4832 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4800 + %4833 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4812 + %4834 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4815 + %4835 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4817 + %4836 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4819 + %4837 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4821 + %4838 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4823 + %4839 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4825 + %4840 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %4827 + %4841 = icmp sgt i32 %4750, 1 + %4842 = or disjoint i32 %4726, 64 + %4843 = or disjoint i32 %4728, 64 + %4844 = or disjoint i32 %4730, 64 + %4845 = or disjoint i32 %4732, 64 + %4846 = or disjoint i32 %4734, 64 + %4847 = or disjoint i32 %4736, 64 + %4848 = or disjoint i32 %4738, 64 + %4849 = or disjoint i32 %4740, 64 + %4850 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4793 + %4851 = select i1 %4841, i32 16, i32 0 + %4852 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4796 + %4853 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4798 + %4854 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %4800 + %4855 = sext i32 %4842 to i64 + %4856 = sext i32 %4843 to i64 + %4857 = sext i32 %4844 to i64 + %4858 = sext i32 %4845 to i64 + %4859 = sext i32 %4846 to i64 + %4860 = sext i32 %4847 to i64 + %4861 = sext i32 %4848 to i64 + %4862 = sext i32 %4849 to i64 + %4863 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4812 + %4864 = select i1 %4841, i32 8, i32 0 + %4865 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4815 + %4866 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4817 + %4867 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4819 + %4868 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4821 + %4869 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4823 + %4870 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4825 + %4871 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98560), i32 %4827 + %4872 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4793 + %4873 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4796 + %4874 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4798 + %4875 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %4800 + %4876 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4812 + %4877 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4815 + %4878 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4817 + %4879 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4819 + %4880 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4821 + %4881 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4823 + %4882 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4825 + %4883 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99072), i32 %4827 + %4884 = add i32 %4751, -2 + %4885 = add nsw i32 %4751, -1 + %4886 = sext i32 %4773 to i64 + %4887 = sext i32 %4774 to i64 + %4888 = sext i32 %4775 to i64 + %4889 = sext i32 %4776 to i64 + %4890 = sext i32 %4777 to i64 + %4891 = sext i32 %4778 to i64 + %4892 = sext i32 %4779 to i64 + %4893 = sext i32 %4780 to i64 + %4894 = icmp sgt i32 %4781, 0 + %4895 = select i1 %4894, i32 16, i32 0 + %4896 = sext i32 %4765 to i64 + %4897 = sext i32 %4766 to i64 + %4898 = sext i32 %4767 to i64 + %4899 = sext i32 %4768 to i64 + %4900 = sext i32 %4769 to i64 + %4901 = sext i32 %4770 to i64 + %4902 = sext i32 %4771 to i64 + %4903 = sext i32 %4772 to i64 + %4904 = select i1 %4894, i32 8, i32 0 + %4905 = icmp sgt i32 %4781, 1 + %4906 = or disjoint i32 %4765, 64 + %4907 = or disjoint i32 %4766, 64 + %4908 = or disjoint i32 %4767, 64 + %4909 = or disjoint i32 %4768, 64 + %4910 = or disjoint i32 %4769, 64 + %4911 = or disjoint i32 %4770, 64 + %4912 = or disjoint i32 %4771, 64 + %4913 = or disjoint i32 %4772, 64 + %4914 = select i1 %4905, i32 16, i32 0 + %4915 = sext i32 %4906 to i64 + %4916 = sext i32 %4907 to i64 + %4917 = sext i32 %4908 to i64 + %4918 = sext i32 %4909 to i64 + %4919 = sext i32 %4910 to i64 + %4920 = sext i32 %4911 to i64 + %4921 = sext i32 %4912 to i64 + %4922 = sext i32 %4913 to i64 + %4923 = select i1 %4905, i32 8, i32 0 + %4924 = add i32 %4782, -2 + %4925 = add nsw i32 %4782, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %4751, i32 1), !dbg !227 + %smax2121 = tail call i32 @llvm.smax.i32(i32 %4782, i32 1), !dbg !227 + %4926 = zext nneg i32 %4699 to i64, !dbg !227 + %4927 = insertelement <16 x i32> poison, i32 %4741, i64 0 + %4928 = insertelement <16 x i32> %4927, i32 %4740, i64 1 + %4929 = insertelement <16 x i32> %4928, i32 %4739, i64 2 + %4930 = insertelement <16 x i32> %4929, i32 %4738, i64 3 + %4931 = insertelement <16 x i32> %4930, i32 %4737, i64 4 + %4932 = insertelement <16 x i32> %4931, i32 %4736, i64 5 + %4933 = insertelement <16 x i32> %4932, i32 %4735, i64 6 + %4934 = insertelement <16 x i32> %4933, i32 %4734, i64 7 + %4935 = insertelement <16 x i32> %4934, i32 %4733, i64 8 + %4936 = insertelement <16 x i32> %4935, i32 %4732, i64 9 + %4937 = insertelement <16 x i32> %4936, i32 %4731, i64 10 + %4938 = insertelement <16 x i32> %4937, i32 %4730, i64 11 + %4939 = insertelement <16 x i32> %4938, i32 %4729, i64 12 + %4940 = insertelement <16 x i32> %4939, i32 %4728, i64 13 + %4941 = insertelement <16 x i32> %4940, i32 %4727, i64 14 + %4942 = insertelement <16 x i32> %4941, i32 %4726, i64 15 + %4943 = insertelement <16 x i64> poison, i64 %4755, i64 0 + %4944 = shufflevector <16 x i64> %4943, <16 x i64> poison, <16 x i32> zeroinitializer + br label %4945, !dbg !227 + +4945: ; preds = %4460, %._crit_edge1692 + %indvars.iv = phi i64 [ 0, %4460 ], [ %indvars.iv.next, %._crit_edge1692 ] + %4946 = phi float [ 0.000000e+00, %4460 ], [ %9744, %._crit_edge1692 ] + %4947 = phi float [ 0.000000e+00, %4460 ], [ %9745, %._crit_edge1692 ] + %4948 = phi float [ 0.000000e+00, %4460 ], [ %9746, %._crit_edge1692 ] + %4949 = phi float [ 0.000000e+00, %4460 ], [ %9747, %._crit_edge1692 ] + %4950 = phi float [ 0.000000e+00, %4460 ], [ %9748, %._crit_edge1692 ] + %4951 = phi float [ 0.000000e+00, %4460 ], [ %9749, %._crit_edge1692 ] + %4952 = phi float [ 0.000000e+00, %4460 ], [ %9750, %._crit_edge1692 ] + %4953 = phi float [ 0.000000e+00, %4460 ], [ %9751, %._crit_edge1692 ] + %4954 = phi float [ 0.000000e+00, %4460 ], [ %9752, %._crit_edge1692 ] + %4955 = phi float [ 0.000000e+00, %4460 ], [ %9753, %._crit_edge1692 ] + %4956 = phi float [ 0.000000e+00, %4460 ], [ %9754, %._crit_edge1692 ] + %4957 = phi float [ 0.000000e+00, %4460 ], [ %9755, %._crit_edge1692 ] + %4958 = phi float [ 0.000000e+00, %4460 ], [ %9756, %._crit_edge1692 ] + %4959 = phi float [ 0.000000e+00, %4460 ], [ %9757, %._crit_edge1692 ] + %4960 = phi float [ 0.000000e+00, %4460 ], [ %9758, %._crit_edge1692 ] + %4961 = phi float [ 0.000000e+00, %4460 ], [ %9759, %._crit_edge1692 ] + %4962 = phi float [ 0.000000e+00, %4460 ], [ %9760, %._crit_edge1692 ] + %4963 = phi float [ 0.000000e+00, %4460 ], [ %9761, %._crit_edge1692 ] + %4964 = phi float [ 0.000000e+00, %4460 ], [ %9762, %._crit_edge1692 ] + %4965 = phi float [ 0.000000e+00, %4460 ], [ %9763, %._crit_edge1692 ] + %4966 = phi float [ 0.000000e+00, %4460 ], [ %9764, %._crit_edge1692 ] + %4967 = phi float [ 0.000000e+00, %4460 ], [ %9765, %._crit_edge1692 ] + %4968 = phi float [ 0.000000e+00, %4460 ], [ %9766, %._crit_edge1692 ] + %4969 = phi float [ 0.000000e+00, %4460 ], [ %9767, %._crit_edge1692 ] + %4970 = phi float [ 0.000000e+00, %4460 ], [ %9768, %._crit_edge1692 ] + %4971 = phi float [ 0.000000e+00, %4460 ], [ %9769, %._crit_edge1692 ] + %4972 = phi float [ 0.000000e+00, %4460 ], [ %9770, %._crit_edge1692 ] + %4973 = phi float [ 0.000000e+00, %4460 ], [ %9771, %._crit_edge1692 ] + %4974 = phi float [ 0.000000e+00, %4460 ], [ %9772, %._crit_edge1692 ] + %4975 = phi float [ 0.000000e+00, %4460 ], [ %9773, %._crit_edge1692 ] + %4976 = phi float [ 0.000000e+00, %4460 ], [ %9774, %._crit_edge1692 ] + %4977 = phi float [ 0.000000e+00, %4460 ], [ %9775, %._crit_edge1692 ] + %4978 = phi float [ 0.000000e+00, %4460 ], [ %9776, %._crit_edge1692 ] + %4979 = phi float [ 0.000000e+00, %4460 ], [ %9777, %._crit_edge1692 ] + %4980 = phi float [ 0.000000e+00, %4460 ], [ %9778, %._crit_edge1692 ] + %4981 = phi float [ 0.000000e+00, %4460 ], [ %9779, %._crit_edge1692 ] + %4982 = phi float [ 0.000000e+00, %4460 ], [ %9780, %._crit_edge1692 ] + %4983 = phi float [ 0.000000e+00, %4460 ], [ %9781, %._crit_edge1692 ] + %4984 = phi float [ 0.000000e+00, %4460 ], [ %9782, %._crit_edge1692 ] + %4985 = phi float [ 0.000000e+00, %4460 ], [ %9783, %._crit_edge1692 ] + %4986 = phi float [ 0.000000e+00, %4460 ], [ %9784, %._crit_edge1692 ] + %4987 = phi float [ 0.000000e+00, %4460 ], [ %9785, %._crit_edge1692 ] + %4988 = phi float [ 0.000000e+00, %4460 ], [ %9786, %._crit_edge1692 ] + %4989 = phi float [ 0.000000e+00, %4460 ], [ %9787, %._crit_edge1692 ] + %4990 = phi float [ 0.000000e+00, %4460 ], [ %9788, %._crit_edge1692 ] + %4991 = phi float [ 0.000000e+00, %4460 ], [ %9789, %._crit_edge1692 ] + %4992 = phi float [ 0.000000e+00, %4460 ], [ %9790, %._crit_edge1692 ] + %4993 = phi float [ 0.000000e+00, %4460 ], [ %9791, %._crit_edge1692 ] + %4994 = phi float [ 0.000000e+00, %4460 ], [ %9792, %._crit_edge1692 ] + %4995 = phi float [ 0.000000e+00, %4460 ], [ %9793, %._crit_edge1692 ] + %4996 = phi float [ 0.000000e+00, %4460 ], [ %9794, %._crit_edge1692 ] + %4997 = phi float [ 0.000000e+00, %4460 ], [ %9795, %._crit_edge1692 ] + %4998 = phi float [ 0.000000e+00, %4460 ], [ %9796, %._crit_edge1692 ] + %4999 = phi float [ 0.000000e+00, %4460 ], [ %9797, %._crit_edge1692 ] + %5000 = phi float [ 0.000000e+00, %4460 ], [ %9798, %._crit_edge1692 ] + %5001 = phi float [ 0.000000e+00, %4460 ], [ %9799, %._crit_edge1692 ] + %5002 = phi float [ 0.000000e+00, %4460 ], [ %9800, %._crit_edge1692 ] + %5003 = phi float [ 0.000000e+00, %4460 ], [ %9801, %._crit_edge1692 ] + %5004 = phi float [ 0.000000e+00, %4460 ], [ %9802, %._crit_edge1692 ] + %5005 = phi float [ 0.000000e+00, %4460 ], [ %9803, %._crit_edge1692 ] + %5006 = phi float [ 0.000000e+00, %4460 ], [ %9804, %._crit_edge1692 ] + %5007 = phi float [ 0.000000e+00, %4460 ], [ %9805, %._crit_edge1692 ] + %5008 = phi float [ 0.000000e+00, %4460 ], [ %9806, %._crit_edge1692 ] + %5009 = phi float [ 0.000000e+00, %4460 ], [ %9807, %._crit_edge1692 ] + %5010 = phi float [ 0.000000e+00, %4460 ], [ %9680, %._crit_edge1692 ] + %5011 = phi float [ 0.000000e+00, %4460 ], [ %9681, %._crit_edge1692 ] + %5012 = phi float [ 0.000000e+00, %4460 ], [ %9682, %._crit_edge1692 ] + %5013 = phi float [ 0.000000e+00, %4460 ], [ %9683, %._crit_edge1692 ] + %5014 = phi float [ 0.000000e+00, %4460 ], [ %9684, %._crit_edge1692 ] + %5015 = phi float [ 0.000000e+00, %4460 ], [ %9685, %._crit_edge1692 ] + %5016 = phi float [ 0.000000e+00, %4460 ], [ %9686, %._crit_edge1692 ] + %5017 = phi float [ 0.000000e+00, %4460 ], [ %9687, %._crit_edge1692 ] + %5018 = phi float [ 0.000000e+00, %4460 ], [ %9688, %._crit_edge1692 ] + %5019 = phi float [ 0.000000e+00, %4460 ], [ %9689, %._crit_edge1692 ] + %5020 = phi float [ 0.000000e+00, %4460 ], [ %9690, %._crit_edge1692 ] + %5021 = phi float [ 0.000000e+00, %4460 ], [ %9691, %._crit_edge1692 ] + %5022 = phi float [ 0.000000e+00, %4460 ], [ %9692, %._crit_edge1692 ] + %5023 = phi float [ 0.000000e+00, %4460 ], [ %9693, %._crit_edge1692 ] + %5024 = phi float [ 0.000000e+00, %4460 ], [ %9694, %._crit_edge1692 ] + %5025 = phi float [ 0.000000e+00, %4460 ], [ %9695, %._crit_edge1692 ] + %5026 = phi float [ 0.000000e+00, %4460 ], [ %9696, %._crit_edge1692 ] + %5027 = phi float [ 0.000000e+00, %4460 ], [ %9697, %._crit_edge1692 ] + %5028 = phi float [ 0.000000e+00, %4460 ], [ %9698, %._crit_edge1692 ] + %5029 = phi float [ 0.000000e+00, %4460 ], [ %9699, %._crit_edge1692 ] + %5030 = phi float [ 0.000000e+00, %4460 ], [ %9700, %._crit_edge1692 ] + %5031 = phi float [ 0.000000e+00, %4460 ], [ %9701, %._crit_edge1692 ] + %5032 = phi float [ 0.000000e+00, %4460 ], [ %9702, %._crit_edge1692 ] + %5033 = phi float [ 0.000000e+00, %4460 ], [ %9703, %._crit_edge1692 ] + %5034 = phi float [ 0.000000e+00, %4460 ], [ %9704, %._crit_edge1692 ] + %5035 = phi float [ 0.000000e+00, %4460 ], [ %9705, %._crit_edge1692 ] + %5036 = phi float [ 0.000000e+00, %4460 ], [ %9706, %._crit_edge1692 ] + %5037 = phi float [ 0.000000e+00, %4460 ], [ %9707, %._crit_edge1692 ] + %5038 = phi float [ 0.000000e+00, %4460 ], [ %9708, %._crit_edge1692 ] + %5039 = phi float [ 0.000000e+00, %4460 ], [ %9709, %._crit_edge1692 ] + %5040 = phi float [ 0.000000e+00, %4460 ], [ %9710, %._crit_edge1692 ] + %5041 = phi float [ 0.000000e+00, %4460 ], [ %9711, %._crit_edge1692 ] + %5042 = phi float [ 0.000000e+00, %4460 ], [ %9712, %._crit_edge1692 ] + %5043 = phi float [ 0.000000e+00, %4460 ], [ %9713, %._crit_edge1692 ] + %5044 = phi float [ 0.000000e+00, %4460 ], [ %9714, %._crit_edge1692 ] + %5045 = phi float [ 0.000000e+00, %4460 ], [ %9715, %._crit_edge1692 ] + %5046 = phi float [ 0.000000e+00, %4460 ], [ %9716, %._crit_edge1692 ] + %5047 = phi float [ 0.000000e+00, %4460 ], [ %9717, %._crit_edge1692 ] + %5048 = phi float [ 0.000000e+00, %4460 ], [ %9718, %._crit_edge1692 ] + %5049 = phi float [ 0.000000e+00, %4460 ], [ %9719, %._crit_edge1692 ] + %5050 = phi float [ 0.000000e+00, %4460 ], [ %9720, %._crit_edge1692 ] + %5051 = phi float [ 0.000000e+00, %4460 ], [ %9721, %._crit_edge1692 ] + %5052 = phi float [ 0.000000e+00, %4460 ], [ %9722, %._crit_edge1692 ] + %5053 = phi float [ 0.000000e+00, %4460 ], [ %9723, %._crit_edge1692 ] + %5054 = phi float [ 0.000000e+00, %4460 ], [ %9724, %._crit_edge1692 ] + %5055 = phi float [ 0.000000e+00, %4460 ], [ %9725, %._crit_edge1692 ] + %5056 = phi float [ 0.000000e+00, %4460 ], [ %9726, %._crit_edge1692 ] + %5057 = phi float [ 0.000000e+00, %4460 ], [ %9727, %._crit_edge1692 ] + %5058 = phi float [ 0.000000e+00, %4460 ], [ %9728, %._crit_edge1692 ] + %5059 = phi float [ 0.000000e+00, %4460 ], [ %9729, %._crit_edge1692 ] + %5060 = phi float [ 0.000000e+00, %4460 ], [ %9730, %._crit_edge1692 ] + %5061 = phi float [ 0.000000e+00, %4460 ], [ %9731, %._crit_edge1692 ] + %5062 = phi float [ 0.000000e+00, %4460 ], [ %9732, %._crit_edge1692 ] + %5063 = phi float [ 0.000000e+00, %4460 ], [ %9733, %._crit_edge1692 ] + %5064 = phi float [ 0.000000e+00, %4460 ], [ %9734, %._crit_edge1692 ] + %5065 = phi float [ 0.000000e+00, %4460 ], [ %9735, %._crit_edge1692 ] + %5066 = phi float [ 0.000000e+00, %4460 ], [ %9736, %._crit_edge1692 ] + %5067 = phi float [ 0.000000e+00, %4460 ], [ %9737, %._crit_edge1692 ] + %5068 = phi float [ 0.000000e+00, %4460 ], [ %9738, %._crit_edge1692 ] + %5069 = phi float [ 0.000000e+00, %4460 ], [ %9739, %._crit_edge1692 ] + %5070 = phi float [ 0.000000e+00, %4460 ], [ %9740, %._crit_edge1692 ] + %5071 = phi float [ 0.000000e+00, %4460 ], [ %9741, %._crit_edge1692 ] + %5072 = phi float [ 0.000000e+00, %4460 ], [ %9742, %._crit_edge1692 ] + %5073 = phi float [ 0.000000e+00, %4460 ], [ %9743, %._crit_edge1692 ] + %5074 = add nuw nsw i64 %indvars.iv, %4926, !dbg !228 + %.tr = trunc i64 %5074 to i32, !dbg !229 + %5075 = shl i32 %.tr, 7, !dbg !229 + %5076 = add i32 %5075, %4700, !dbg !229 + %5077 = sext i32 %5076 to i64, !dbg !230 + %5078 = trunc nuw nsw i64 %5074 to i32, !dbg !231 + %5079 = shl i32 %5078, 18, !dbg !231 + %5080 = add i32 %5079, %4700, !dbg !232 + %5081 = sext i32 %5080 to i64, !dbg !233 + %.tr2128 = trunc i64 %5074 to i32, !dbg !234 + %5082 = shl i32 %.tr2128, 11, !dbg !234 + %5083 = add i32 %5082, %4783, !dbg !234 + %5084 = sext i32 %5083 to i64, !dbg !235 + %5085 = getelementptr bfloat, ptr addrspace(1) %0, i64 %5077, !dbg !236 + %5086 = getelementptr bfloat, ptr addrspace(1) %5, i64 %5081, !dbg !237 + %5087 = getelementptr float, ptr addrspace(1) %3, i64 %5084, !dbg !238 + %5088 = getelementptr float, ptr addrspace(1) %4, i64 %5084, !dbg !239 + %5089 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4784, !dbg !240 + %5090 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4785, !dbg !240 + %5091 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4786, !dbg !240 + %5092 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4787, !dbg !240 + %5093 = getelementptr bfloat, ptr addrspace(1) %5089, i64 %4498, !dbg !241 + %5094 = getelementptr bfloat, ptr addrspace(1) %5090, i64 %4498, !dbg !241 + %5095 = getelementptr bfloat, ptr addrspace(1) %5091, i64 %4498, !dbg !241 + %5096 = getelementptr bfloat, ptr addrspace(1) %5092, i64 %4498, !dbg !241 + %5097 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4788, !dbg !242 + %5098 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4789, !dbg !242 + %5099 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4790, !dbg !242 + %5100 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4791, !dbg !242 + %5101 = getelementptr bfloat, ptr addrspace(1) %5097, i64 %4498, !dbg !243 + %5102 = getelementptr bfloat, ptr addrspace(1) %5098, i64 %4498, !dbg !243 + %5103 = getelementptr bfloat, ptr addrspace(1) %5099, i64 %4498, !dbg !243 + %5104 = getelementptr bfloat, ptr addrspace(1) %5100, i64 %4498, !dbg !243 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4794, ptr addrspace(1) %5093, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4797, ptr addrspace(1) %5094, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4799, ptr addrspace(1) %5095, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4801, ptr addrspace(1) %5096, i32 %4795) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5105 = getelementptr float, ptr addrspace(1) %5087, i64 %4802, !dbg !245 + %5106 = getelementptr float, ptr addrspace(1) %5087, i64 %4803, !dbg !245 + %5107 = getelementptr float, ptr addrspace(1) %5087, i64 %4804, !dbg !245 + %5108 = getelementptr float, ptr addrspace(1) %5087, i64 %4805, !dbg !245 + %5109 = getelementptr float, ptr addrspace(1) %5087, i64 %4806, !dbg !245 + %5110 = getelementptr float, ptr addrspace(1) %5087, i64 %4807, !dbg !245 + %5111 = getelementptr float, ptr addrspace(1) %5087, i64 %4808, !dbg !245 + %5112 = getelementptr float, ptr addrspace(1) %5087, i64 %4809, !dbg !245 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4813, ptr addrspace(1) %5105, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4816, ptr addrspace(1) %5106, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4818, ptr addrspace(1) %5107, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4820, ptr addrspace(1) %5108, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4822, ptr addrspace(1) %5109, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4824, ptr addrspace(1) %5110, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4826, ptr addrspace(1) %5111, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4828, ptr addrspace(1) %5112, i32 %4814, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4829, ptr addrspace(1) %5101, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4830, ptr addrspace(1) %5102, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4831, ptr addrspace(1) %5103, i32 %4795) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4832, ptr addrspace(1) %5104, i32 %4795) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5113 = getelementptr float, ptr addrspace(1) %5088, i64 %4802, !dbg !247 + %5114 = getelementptr float, ptr addrspace(1) %5088, i64 %4803, !dbg !247 + %5115 = getelementptr float, ptr addrspace(1) %5088, i64 %4804, !dbg !247 + %5116 = getelementptr float, ptr addrspace(1) %5088, i64 %4805, !dbg !247 + %5117 = getelementptr float, ptr addrspace(1) %5088, i64 %4806, !dbg !247 + %5118 = getelementptr float, ptr addrspace(1) %5088, i64 %4807, !dbg !247 + %5119 = getelementptr float, ptr addrspace(1) %5088, i64 %4808, !dbg !247 + %5120 = getelementptr float, ptr addrspace(1) %5088, i64 %4809, !dbg !247 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4833, ptr addrspace(1) %5113, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4834, ptr addrspace(1) %5114, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4835, ptr addrspace(1) %5115, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4836, ptr addrspace(1) %5116, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4837, ptr addrspace(1) %5117, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4838, ptr addrspace(1) %5118, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4839, ptr addrspace(1) %5119, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4840, ptr addrspace(1) %5120, i32 %4814, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + %5121 = getelementptr i8, ptr addrspace(1) %5093, i64 524288, !dbg !249 + %5122 = getelementptr i8, ptr addrspace(1) %5094, i64 524288, !dbg !249 + %5123 = getelementptr i8, ptr addrspace(1) %5095, i64 524288, !dbg !249 + %5124 = getelementptr i8, ptr addrspace(1) %5096, i64 524288, !dbg !249 + %5125 = getelementptr i8, ptr addrspace(1) %5101, i64 16384, !dbg !250 + %5126 = getelementptr i8, ptr addrspace(1) %5102, i64 16384, !dbg !250 + %5127 = getelementptr i8, ptr addrspace(1) %5103, i64 16384, !dbg !250 + %5128 = getelementptr i8, ptr addrspace(1) %5104, i64 16384, !dbg !250 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4850, ptr addrspace(1) %5121, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4852, ptr addrspace(1) %5122, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4853, ptr addrspace(1) %5123, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4854, ptr addrspace(1) %5124, i32 %4851) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5129 = getelementptr float, ptr addrspace(1) %5087, i64 %4855, !dbg !245 + %5130 = getelementptr float, ptr addrspace(1) %5087, i64 %4856, !dbg !245 + %5131 = getelementptr float, ptr addrspace(1) %5087, i64 %4857, !dbg !245 + %5132 = getelementptr float, ptr addrspace(1) %5087, i64 %4858, !dbg !245 + %5133 = getelementptr float, ptr addrspace(1) %5087, i64 %4859, !dbg !245 + %5134 = getelementptr float, ptr addrspace(1) %5087, i64 %4860, !dbg !245 + %5135 = getelementptr float, ptr addrspace(1) %5087, i64 %4861, !dbg !245 + %5136 = getelementptr float, ptr addrspace(1) %5087, i64 %4862, !dbg !245 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4863, ptr addrspace(1) %5129, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4865, ptr addrspace(1) %5130, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4866, ptr addrspace(1) %5131, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4867, ptr addrspace(1) %5132, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4868, ptr addrspace(1) %5133, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4869, ptr addrspace(1) %5134, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4870, ptr addrspace(1) %5135, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4871, ptr addrspace(1) %5136, i32 %4864, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4872, ptr addrspace(1) %5125, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4873, ptr addrspace(1) %5126, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4874, ptr addrspace(1) %5127, i32 %4851) #3, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4875, ptr addrspace(1) %5128, i32 %4851) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %5137 = getelementptr float, ptr addrspace(1) %5088, i64 %4855, !dbg !247 + %5138 = getelementptr float, ptr addrspace(1) %5088, i64 %4856, !dbg !247 + %5139 = getelementptr float, ptr addrspace(1) %5088, i64 %4857, !dbg !247 + %5140 = getelementptr float, ptr addrspace(1) %5088, i64 %4858, !dbg !247 + %5141 = getelementptr float, ptr addrspace(1) %5088, i64 %4859, !dbg !247 + %5142 = getelementptr float, ptr addrspace(1) %5088, i64 %4860, !dbg !247 + %5143 = getelementptr float, ptr addrspace(1) %5088, i64 %4861, !dbg !247 + %5144 = getelementptr float, ptr addrspace(1) %5088, i64 %4862, !dbg !247 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4876, ptr addrspace(1) %5137, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4877, ptr addrspace(1) %5138, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4878, ptr addrspace(1) %5139, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4879, ptr addrspace(1) %5140, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4880, ptr addrspace(1) %5141, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4881, ptr addrspace(1) %5142, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4882, ptr addrspace(1) %5143, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4883, ptr addrspace(1) %5144, i32 %4864, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + br i1 %4754, label %.lr.ph, label %._crit_edge, !dbg !213 + +.lr.ph: ; preds = %4945, %__nv_exp2f.exit1315 + %5145 = phi i32 [ %7352, %__nv_exp2f.exit1315 ], [ 64, %4945 ] + %5146 = phi i32 [ %5284, %__nv_exp2f.exit1315 ], [ -1, %4945 ] + %5147 = phi i32 [ %7375, %__nv_exp2f.exit1315 ], [ 1, %4945 ] + %5148 = phi i32 [ %5287, %__nv_exp2f.exit1315 ], [ -1, %4945 ] + %5149 = phi i32 [ %7378, %__nv_exp2f.exit1315 ], [ 1, %4945 ] + %.pn1881528 = phi i32 [ %7372, %__nv_exp2f.exit1315 ], [ %4849, %4945 ] + %.pn1921527 = phi i32 [ %7371, %__nv_exp2f.exit1315 ], [ %4848, %4945 ] + %.pn1961526 = phi i32 [ %7370, %__nv_exp2f.exit1315 ], [ %4847, %4945 ] + %.pn2001525 = phi i32 [ %7369, %__nv_exp2f.exit1315 ], [ %4846, %4945 ] + %.pn2041524 = phi i32 [ %7368, %__nv_exp2f.exit1315 ], [ %4845, %4945 ] + %.pn2081523 = phi i32 [ %7367, %__nv_exp2f.exit1315 ], [ %4844, %4945 ] + %.pn2121522 = phi i32 [ %7366, %__nv_exp2f.exit1315 ], [ %4843, %4945 ] + %.pn2161521 = phi i32 [ %7365, %__nv_exp2f.exit1315 ], [ %4842, %4945 ] + %.pn1361520 = phi ptr addrspace(1) [ %7364, %__nv_exp2f.exit1315 ], [ %5128, %4945 ] + %.pn1521519 = phi ptr addrspace(1) [ %7363, %__nv_exp2f.exit1315 ], [ %5127, %4945 ] + %.pn1681518 = phi ptr addrspace(1) [ %7362, %__nv_exp2f.exit1315 ], [ %5126, %4945 ] + %.pn1841517 = phi ptr addrspace(1) [ %7361, %__nv_exp2f.exit1315 ], [ %5125, %4945 ] + %.pn721516 = phi ptr addrspace(1) [ %7358, %__nv_exp2f.exit1315 ], [ %5124, %4945 ] + %.pn881515 = phi ptr addrspace(1) [ %7357, %__nv_exp2f.exit1315 ], [ %5123, %4945 ] + %.pn1041514 = phi ptr addrspace(1) [ %7356, %__nv_exp2f.exit1315 ], [ %5122, %4945 ] + %.pn1201513 = phi ptr addrspace(1) [ %7355, %__nv_exp2f.exit1315 ], [ %5121, %4945 ] + %5150 = phi float [ %6410, %__nv_exp2f.exit1315 ], [ %5010, %4945 ] + %5151 = phi float [ %6411, %__nv_exp2f.exit1315 ], [ %5011, %4945 ] + %5152 = phi float [ %6412, %__nv_exp2f.exit1315 ], [ %5012, %4945 ] + %5153 = phi float [ %6413, %__nv_exp2f.exit1315 ], [ %5013, %4945 ] + %5154 = phi float [ %6414, %__nv_exp2f.exit1315 ], [ %5014, %4945 ] + %5155 = phi float [ %6415, %__nv_exp2f.exit1315 ], [ %5015, %4945 ] + %5156 = phi float [ %6416, %__nv_exp2f.exit1315 ], [ %5016, %4945 ] + %5157 = phi float [ %6417, %__nv_exp2f.exit1315 ], [ %5017, %4945 ] + %5158 = phi float [ %6418, %__nv_exp2f.exit1315 ], [ %5018, %4945 ] + %5159 = phi float [ %6419, %__nv_exp2f.exit1315 ], [ %5019, %4945 ] + %5160 = phi float [ %6420, %__nv_exp2f.exit1315 ], [ %5020, %4945 ] + %5161 = phi float [ %6421, %__nv_exp2f.exit1315 ], [ %5021, %4945 ] + %5162 = phi float [ %6422, %__nv_exp2f.exit1315 ], [ %5022, %4945 ] + %5163 = phi float [ %6423, %__nv_exp2f.exit1315 ], [ %5023, %4945 ] + %5164 = phi float [ %6424, %__nv_exp2f.exit1315 ], [ %5024, %4945 ] + %5165 = phi float [ %6425, %__nv_exp2f.exit1315 ], [ %5025, %4945 ] + %5166 = phi float [ %6426, %__nv_exp2f.exit1315 ], [ %5026, %4945 ] + %5167 = phi float [ %6427, %__nv_exp2f.exit1315 ], [ %5027, %4945 ] + %5168 = phi float [ %6428, %__nv_exp2f.exit1315 ], [ %5028, %4945 ] + %5169 = phi float [ %6429, %__nv_exp2f.exit1315 ], [ %5029, %4945 ] + %5170 = phi float [ %6430, %__nv_exp2f.exit1315 ], [ %5030, %4945 ] + %5171 = phi float [ %6431, %__nv_exp2f.exit1315 ], [ %5031, %4945 ] + %5172 = phi float [ %6432, %__nv_exp2f.exit1315 ], [ %5032, %4945 ] + %5173 = phi float [ %6433, %__nv_exp2f.exit1315 ], [ %5033, %4945 ] + %5174 = phi float [ %6434, %__nv_exp2f.exit1315 ], [ %5034, %4945 ] + %5175 = phi float [ %6435, %__nv_exp2f.exit1315 ], [ %5035, %4945 ] + %5176 = phi float [ %6436, %__nv_exp2f.exit1315 ], [ %5036, %4945 ] + %5177 = phi float [ %6437, %__nv_exp2f.exit1315 ], [ %5037, %4945 ] + %5178 = phi float [ %6438, %__nv_exp2f.exit1315 ], [ %5038, %4945 ] + %5179 = phi float [ %6439, %__nv_exp2f.exit1315 ], [ %5039, %4945 ] + %5180 = phi float [ %6440, %__nv_exp2f.exit1315 ], [ %5040, %4945 ] + %5181 = phi float [ %6441, %__nv_exp2f.exit1315 ], [ %5041, %4945 ] + %5182 = phi float [ %6442, %__nv_exp2f.exit1315 ], [ %5042, %4945 ] + %5183 = phi float [ %6443, %__nv_exp2f.exit1315 ], [ %5043, %4945 ] + %5184 = phi float [ %6444, %__nv_exp2f.exit1315 ], [ %5044, %4945 ] + %5185 = phi float [ %6445, %__nv_exp2f.exit1315 ], [ %5045, %4945 ] + %5186 = phi float [ %6446, %__nv_exp2f.exit1315 ], [ %5046, %4945 ] + %5187 = phi float [ %6447, %__nv_exp2f.exit1315 ], [ %5047, %4945 ] + %5188 = phi float [ %6448, %__nv_exp2f.exit1315 ], [ %5048, %4945 ] + %5189 = phi float [ %6449, %__nv_exp2f.exit1315 ], [ %5049, %4945 ] + %5190 = phi float [ %6450, %__nv_exp2f.exit1315 ], [ %5050, %4945 ] + %5191 = phi float [ %6451, %__nv_exp2f.exit1315 ], [ %5051, %4945 ] + %5192 = phi float [ %6452, %__nv_exp2f.exit1315 ], [ %5052, %4945 ] + %5193 = phi float [ %6453, %__nv_exp2f.exit1315 ], [ %5053, %4945 ] + %5194 = phi float [ %6454, %__nv_exp2f.exit1315 ], [ %5054, %4945 ] + %5195 = phi float [ %6455, %__nv_exp2f.exit1315 ], [ %5055, %4945 ] + %5196 = phi float [ %6456, %__nv_exp2f.exit1315 ], [ %5056, %4945 ] + %5197 = phi float [ %6457, %__nv_exp2f.exit1315 ], [ %5057, %4945 ] + %5198 = phi float [ %6458, %__nv_exp2f.exit1315 ], [ %5058, %4945 ] + %5199 = phi float [ %6459, %__nv_exp2f.exit1315 ], [ %5059, %4945 ] + %5200 = phi float [ %6460, %__nv_exp2f.exit1315 ], [ %5060, %4945 ] + %5201 = phi float [ %6461, %__nv_exp2f.exit1315 ], [ %5061, %4945 ] + %5202 = phi float [ %6462, %__nv_exp2f.exit1315 ], [ %5062, %4945 ] + %5203 = phi float [ %6463, %__nv_exp2f.exit1315 ], [ %5063, %4945 ] + %5204 = phi float [ %6464, %__nv_exp2f.exit1315 ], [ %5064, %4945 ] + %5205 = phi float [ %6465, %__nv_exp2f.exit1315 ], [ %5065, %4945 ] + %5206 = phi float [ %6466, %__nv_exp2f.exit1315 ], [ %5066, %4945 ] + %5207 = phi float [ %6467, %__nv_exp2f.exit1315 ], [ %5067, %4945 ] + %5208 = phi float [ %6468, %__nv_exp2f.exit1315 ], [ %5068, %4945 ] + %5209 = phi float [ %6469, %__nv_exp2f.exit1315 ], [ %5069, %4945 ] + %5210 = phi float [ %6470, %__nv_exp2f.exit1315 ], [ %5070, %4945 ] + %5211 = phi float [ %6471, %__nv_exp2f.exit1315 ], [ %5071, %4945 ] + %5212 = phi float [ %6472, %__nv_exp2f.exit1315 ], [ %5072, %4945 ] + %5213 = phi float [ %6473, %__nv_exp2f.exit1315 ], [ %5073, %4945 ] + %5214 = phi float [ %7266, %__nv_exp2f.exit1315 ], [ %4946, %4945 ] + %5215 = phi float [ %7267, %__nv_exp2f.exit1315 ], [ %4947, %4945 ] + %5216 = phi float [ %7268, %__nv_exp2f.exit1315 ], [ %4948, %4945 ] + %5217 = phi float [ %7269, %__nv_exp2f.exit1315 ], [ %4949, %4945 ] + %5218 = phi float [ %7270, %__nv_exp2f.exit1315 ], [ %4950, %4945 ] + %5219 = phi float [ %7271, %__nv_exp2f.exit1315 ], [ %4951, %4945 ] + %5220 = phi float [ %7272, %__nv_exp2f.exit1315 ], [ %4952, %4945 ] + %5221 = phi float [ %7273, %__nv_exp2f.exit1315 ], [ %4953, %4945 ] + %5222 = phi float [ %7274, %__nv_exp2f.exit1315 ], [ %4954, %4945 ] + %5223 = phi float [ %7275, %__nv_exp2f.exit1315 ], [ %4955, %4945 ] + %5224 = phi float [ %7276, %__nv_exp2f.exit1315 ], [ %4956, %4945 ] + %5225 = phi float [ %7277, %__nv_exp2f.exit1315 ], [ %4957, %4945 ] + %5226 = phi float [ %7278, %__nv_exp2f.exit1315 ], [ %4958, %4945 ] + %5227 = phi float [ %7279, %__nv_exp2f.exit1315 ], [ %4959, %4945 ] + %5228 = phi float [ %7280, %__nv_exp2f.exit1315 ], [ %4960, %4945 ] + %5229 = phi float [ %7281, %__nv_exp2f.exit1315 ], [ %4961, %4945 ] + %5230 = phi float [ %7282, %__nv_exp2f.exit1315 ], [ %4962, %4945 ] + %5231 = phi float [ %7283, %__nv_exp2f.exit1315 ], [ %4963, %4945 ] + %5232 = phi float [ %7284, %__nv_exp2f.exit1315 ], [ %4964, %4945 ] + %5233 = phi float [ %7285, %__nv_exp2f.exit1315 ], [ %4965, %4945 ] + %5234 = phi float [ %7286, %__nv_exp2f.exit1315 ], [ %4966, %4945 ] + %5235 = phi float [ %7287, %__nv_exp2f.exit1315 ], [ %4967, %4945 ] + %5236 = phi float [ %7288, %__nv_exp2f.exit1315 ], [ %4968, %4945 ] + %5237 = phi float [ %7289, %__nv_exp2f.exit1315 ], [ %4969, %4945 ] + %5238 = phi float [ %7290, %__nv_exp2f.exit1315 ], [ %4970, %4945 ] + %5239 = phi float [ %7291, %__nv_exp2f.exit1315 ], [ %4971, %4945 ] + %5240 = phi float [ %7292, %__nv_exp2f.exit1315 ], [ %4972, %4945 ] + %5241 = phi float [ %7293, %__nv_exp2f.exit1315 ], [ %4973, %4945 ] + %5242 = phi float [ %7294, %__nv_exp2f.exit1315 ], [ %4974, %4945 ] + %5243 = phi float [ %7295, %__nv_exp2f.exit1315 ], [ %4975, %4945 ] + %5244 = phi float [ %7296, %__nv_exp2f.exit1315 ], [ %4976, %4945 ] + %5245 = phi float [ %7297, %__nv_exp2f.exit1315 ], [ %4977, %4945 ] + %5246 = phi float [ %7298, %__nv_exp2f.exit1315 ], [ %4978, %4945 ] + %5247 = phi float [ %7299, %__nv_exp2f.exit1315 ], [ %4979, %4945 ] + %5248 = phi float [ %7300, %__nv_exp2f.exit1315 ], [ %4980, %4945 ] + %5249 = phi float [ %7301, %__nv_exp2f.exit1315 ], [ %4981, %4945 ] + %5250 = phi float [ %7302, %__nv_exp2f.exit1315 ], [ %4982, %4945 ] + %5251 = phi float [ %7303, %__nv_exp2f.exit1315 ], [ %4983, %4945 ] + %5252 = phi float [ %7304, %__nv_exp2f.exit1315 ], [ %4984, %4945 ] + %5253 = phi float [ %7305, %__nv_exp2f.exit1315 ], [ %4985, %4945 ] + %5254 = phi float [ %7306, %__nv_exp2f.exit1315 ], [ %4986, %4945 ] + %5255 = phi float [ %7307, %__nv_exp2f.exit1315 ], [ %4987, %4945 ] + %5256 = phi float [ %7308, %__nv_exp2f.exit1315 ], [ %4988, %4945 ] + %5257 = phi float [ %7309, %__nv_exp2f.exit1315 ], [ %4989, %4945 ] + %5258 = phi float [ %7310, %__nv_exp2f.exit1315 ], [ %4990, %4945 ] + %5259 = phi float [ %7311, %__nv_exp2f.exit1315 ], [ %4991, %4945 ] + %5260 = phi float [ %7312, %__nv_exp2f.exit1315 ], [ %4992, %4945 ] + %5261 = phi float [ %7313, %__nv_exp2f.exit1315 ], [ %4993, %4945 ] + %5262 = phi float [ %7314, %__nv_exp2f.exit1315 ], [ %4994, %4945 ] + %5263 = phi float [ %7315, %__nv_exp2f.exit1315 ], [ %4995, %4945 ] + %5264 = phi float [ %7316, %__nv_exp2f.exit1315 ], [ %4996, %4945 ] + %5265 = phi float [ %7317, %__nv_exp2f.exit1315 ], [ %4997, %4945 ] + %5266 = phi float [ %7318, %__nv_exp2f.exit1315 ], [ %4998, %4945 ] + %5267 = phi float [ %7319, %__nv_exp2f.exit1315 ], [ %4999, %4945 ] + %5268 = phi float [ %7320, %__nv_exp2f.exit1315 ], [ %5000, %4945 ] + %5269 = phi float [ %7321, %__nv_exp2f.exit1315 ], [ %5001, %4945 ] + %5270 = phi float [ %7322, %__nv_exp2f.exit1315 ], [ %5002, %4945 ] + %5271 = phi float [ %7323, %__nv_exp2f.exit1315 ], [ %5003, %4945 ] + %5272 = phi float [ %7324, %__nv_exp2f.exit1315 ], [ %5004, %4945 ] + %5273 = phi float [ %7325, %__nv_exp2f.exit1315 ], [ %5005, %4945 ] + %5274 = phi float [ %7326, %__nv_exp2f.exit1315 ], [ %5006, %4945 ] + %5275 = phi float [ %7327, %__nv_exp2f.exit1315 ], [ %5007, %4945 ] + %5276 = phi float [ %7328, %__nv_exp2f.exit1315 ], [ %5008, %4945 ] + %5277 = phi float [ %7329, %__nv_exp2f.exit1315 ], [ %5009, %4945 ] + %5278 = phi i32 [ %7333, %__nv_exp2f.exit1315 ], [ 0, %4945 ] + %5279 = phi <16 x i32> [ %7332, %__nv_exp2f.exit1315 ], [ %4942, %4945 ] + %5280 = icmp slt i32 %5278, %4884, !dbg !213 + %5281 = icmp slt i32 %5278, %4885, !dbg !213 + %5282 = add i32 %5146, 1, !dbg !213 + %5283 = icmp sgt i32 %5282, 1, !dbg !213 + %5284 = select i1 %5283, i32 0, i32 %5282, !dbg !213 + %5285 = add i32 %5148, 1, !dbg !213 + %5286 = icmp sgt i32 %5285, 2, !dbg !213 + %5287 = select i1 %5286, i32 0, i32 %5285, !dbg !213 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !244 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + %5288 = shl i32 %5287, 13, !dbg !244 + %5289 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %5288, !dbg !244 + %5290 = shl i32 %5284, 6, !dbg !246 + %5291 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5290, !dbg !246 + %5292 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4812, !dbg !246 + %5293 = load float, ptr addrspace(3) %5292, align 8, !dbg !246 + %5294 = getelementptr inbounds nuw i8, ptr addrspace(3) %5292, i32 4, !dbg !246 + %5295 = load float, ptr addrspace(3) %5294, align 4, !dbg !246 + %5296 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4815, !dbg !246 + %5297 = load float, ptr addrspace(3) %5296, align 8, !dbg !246 + %5298 = getelementptr inbounds nuw i8, ptr addrspace(3) %5296, i32 4, !dbg !246 + %5299 = load float, ptr addrspace(3) %5298, align 4, !dbg !246 + %5300 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4817, !dbg !246 + %5301 = load float, ptr addrspace(3) %5300, align 8, !dbg !246 + %5302 = getelementptr inbounds nuw i8, ptr addrspace(3) %5300, i32 4, !dbg !246 + %5303 = load float, ptr addrspace(3) %5302, align 4, !dbg !246 + %5304 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4819, !dbg !246 + %5305 = load float, ptr addrspace(3) %5304, align 8, !dbg !246 + %5306 = getelementptr inbounds nuw i8, ptr addrspace(3) %5304, i32 4, !dbg !246 + %5307 = load float, ptr addrspace(3) %5306, align 4, !dbg !246 + %5308 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4821, !dbg !246 + %5309 = load float, ptr addrspace(3) %5308, align 8, !dbg !246 + %5310 = getelementptr inbounds nuw i8, ptr addrspace(3) %5308, i32 4, !dbg !246 + %5311 = load float, ptr addrspace(3) %5310, align 4, !dbg !246 + %5312 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4823, !dbg !246 + %5313 = load float, ptr addrspace(3) %5312, align 8, !dbg !246 + %5314 = getelementptr inbounds nuw i8, ptr addrspace(3) %5312, i32 4, !dbg !246 + %5315 = load float, ptr addrspace(3) %5314, align 4, !dbg !246 + %5316 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4825, !dbg !246 + %5317 = load float, ptr addrspace(3) %5316, align 8, !dbg !246 + %5318 = getelementptr inbounds nuw i8, ptr addrspace(3) %5316, i32 4, !dbg !246 + %5319 = load float, ptr addrspace(3) %5318, align 4, !dbg !246 + %5320 = getelementptr inbounds nuw i8, ptr addrspace(3) %5291, i32 %4827, !dbg !246 + %5321 = load float, ptr addrspace(3) %5320, align 8, !dbg !246 + %5322 = getelementptr inbounds nuw i8, ptr addrspace(3) %5320, i32 4, !dbg !246 + %5323 = load float, ptr addrspace(3) %5322, align 4, !dbg !246 + %5324 = fcmp oeq float %5293, 0xFFF0000000000000, !dbg !251 + %5325 = fcmp oeq float %5295, 0xFFF0000000000000, !dbg !251 + %5326 = fcmp oeq float %5297, 0xFFF0000000000000, !dbg !251 + %5327 = fcmp oeq float %5299, 0xFFF0000000000000, !dbg !251 + %5328 = fcmp oeq float %5301, 0xFFF0000000000000, !dbg !251 + %5329 = fcmp oeq float %5303, 0xFFF0000000000000, !dbg !251 + %5330 = fcmp oeq float %5305, 0xFFF0000000000000, !dbg !251 + %5331 = fcmp oeq float %5307, 0xFFF0000000000000, !dbg !251 + %5332 = fcmp oeq float %5309, 0xFFF0000000000000, !dbg !251 + %5333 = fcmp oeq float %5311, 0xFFF0000000000000, !dbg !251 + %5334 = fcmp oeq float %5313, 0xFFF0000000000000, !dbg !251 + %5335 = fcmp oeq float %5315, 0xFFF0000000000000, !dbg !251 + %5336 = fcmp oeq float %5317, 0xFFF0000000000000, !dbg !251 + %5337 = fcmp oeq float %5319, 0xFFF0000000000000, !dbg !251 + %5338 = fcmp oeq float %5321, 0xFFF0000000000000, !dbg !251 + %5339 = fcmp oeq float %5323, 0xFFF0000000000000, !dbg !251 + %5340 = select i1 %5324, float 0.000000e+00, float %5293, !dbg !252 + %5341 = select i1 %5325, float 0.000000e+00, float %5295, !dbg !252 + %5342 = select i1 %5326, float 0.000000e+00, float %5297, !dbg !252 + %5343 = select i1 %5327, float 0.000000e+00, float %5299, !dbg !252 + %5344 = select i1 %5328, float 0.000000e+00, float %5301, !dbg !252 + %5345 = select i1 %5329, float 0.000000e+00, float %5303, !dbg !252 + %5346 = select i1 %5330, float 0.000000e+00, float %5305, !dbg !252 + %5347 = select i1 %5331, float 0.000000e+00, float %5307, !dbg !252 + %5348 = select i1 %5332, float 0.000000e+00, float %5309, !dbg !252 + %5349 = select i1 %5333, float 0.000000e+00, float %5311, !dbg !252 + %5350 = select i1 %5334, float 0.000000e+00, float %5313, !dbg !252 + %5351 = select i1 %5335, float 0.000000e+00, float %5315, !dbg !252 + %5352 = select i1 %5336, float 0.000000e+00, float %5317, !dbg !252 + %5353 = select i1 %5337, float 0.000000e+00, float %5319, !dbg !252 + %5354 = select i1 %5338, float 0.000000e+00, float %5321, !dbg !252 + %5355 = select i1 %5339, float 0.000000e+00, float %5323, !dbg !252 + %5356 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !226 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !226 + %5357 = shl i32 %5356, 11, !dbg !226 + %5358 = and i32 %5357, 8192, !dbg !226 + %5359 = add i32 %5358, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5360 = lshr exact i32 %5359, 4, !dbg !226 + %5361 = and i32 %5360, 16383, !dbg !226 + %5362 = zext nneg i32 %5361 to i64, !dbg !226 + %5363 = or disjoint i64 %5362, 4611686293372403712, !dbg !226 + %5364 = ptrtoint ptr addrspace(3) %5289 to i32, !dbg !226 + %5365 = lshr exact i32 %5364, 4, !dbg !226 + %5366 = and i32 %5365, 16383, !dbg !226 + %5367 = zext nneg i32 %5366 to i64, !dbg !226 + %5368 = or disjoint i64 %5367, 4611686293338849280, !dbg !226 + %5369 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %5363, i64 %5368) #3, !dbg !226 + %5370 = or disjoint i32 %5358, 32, !dbg !226 + %5371 = add i32 %5370, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5372 = lshr exact i32 %5371, 4, !dbg !226 + %5373 = and i32 %5372, 16383, !dbg !226 + %5374 = zext nneg i32 %5373 to i64, !dbg !226 + %5375 = or disjoint i64 %5374, 4611686293372403712, !dbg !226 + %5376 = add i32 %5364, 32, !dbg !226 + %5377 = lshr exact i32 %5376, 4, !dbg !226 + %5378 = and i32 %5377, 16383, !dbg !226 + %5379 = zext nneg i32 %5378 to i64, !dbg !226 + %5380 = or disjoint i64 %5379, 4611686293338849280, !dbg !226 + %5381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 0, !dbg !226 + %5382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 1, !dbg !226 + %5383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 2, !dbg !226 + %5384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 3, !dbg !226 + %5385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 4, !dbg !226 + %5386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 5, !dbg !226 + %5387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 6, !dbg !226 + %5388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 7, !dbg !226 + %5389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 8, !dbg !226 + %5390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 9, !dbg !226 + %5391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 10, !dbg !226 + %5392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 11, !dbg !226 + %5393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 12, !dbg !226 + %5394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 13, !dbg !226 + %5395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 14, !dbg !226 + %5396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 15, !dbg !226 + %5397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 16, !dbg !226 + %5398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 17, !dbg !226 + %5399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 18, !dbg !226 + %5400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 19, !dbg !226 + %5401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 20, !dbg !226 + %5402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 21, !dbg !226 + %5403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 22, !dbg !226 + %5404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 23, !dbg !226 + %5405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 24, !dbg !226 + %5406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 25, !dbg !226 + %5407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 26, !dbg !226 + %5408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 27, !dbg !226 + %5409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 28, !dbg !226 + %5410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 29, !dbg !226 + %5411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 30, !dbg !226 + %5412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5369, 31, !dbg !226 + %5413 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5381, float %5382, float %5383, float %5384, float %5385, float %5386, float %5387, float %5388, float %5389, float %5390, float %5391, float %5392, float %5393, float %5394, float %5395, float %5396, float %5397, float %5398, float %5399, float %5400, float %5401, float %5402, float %5403, float %5404, float %5405, float %5406, float %5407, float %5408, float %5409, float %5410, float %5411, float %5412, i64 %5375, i64 %5380, i1 true) #3, !dbg !226 + %5414 = or disjoint i32 %5358, 64, !dbg !226 + %5415 = add i32 %5414, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5416 = lshr exact i32 %5415, 4, !dbg !226 + %5417 = and i32 %5416, 16383, !dbg !226 + %5418 = zext nneg i32 %5417 to i64, !dbg !226 + %5419 = or disjoint i64 %5418, 4611686293372403712, !dbg !226 + %5420 = add i32 %5364, 64, !dbg !226 + %5421 = lshr exact i32 %5420, 4, !dbg !226 + %5422 = and i32 %5421, 16383, !dbg !226 + %5423 = zext nneg i32 %5422 to i64, !dbg !226 + %5424 = or disjoint i64 %5423, 4611686293338849280, !dbg !226 + %5425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 0, !dbg !226 + %5426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 1, !dbg !226 + %5427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 2, !dbg !226 + %5428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 3, !dbg !226 + %5429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 4, !dbg !226 + %5430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 5, !dbg !226 + %5431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 6, !dbg !226 + %5432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 7, !dbg !226 + %5433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 8, !dbg !226 + %5434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 9, !dbg !226 + %5435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 10, !dbg !226 + %5436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 11, !dbg !226 + %5437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 12, !dbg !226 + %5438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 13, !dbg !226 + %5439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 14, !dbg !226 + %5440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 15, !dbg !226 + %5441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 16, !dbg !226 + %5442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 17, !dbg !226 + %5443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 18, !dbg !226 + %5444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 19, !dbg !226 + %5445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 20, !dbg !226 + %5446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 21, !dbg !226 + %5447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 22, !dbg !226 + %5448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 23, !dbg !226 + %5449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 24, !dbg !226 + %5450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 25, !dbg !226 + %5451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 26, !dbg !226 + %5452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 27, !dbg !226 + %5453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 28, !dbg !226 + %5454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 29, !dbg !226 + %5455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 30, !dbg !226 + %5456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5413, 31, !dbg !226 + %5457 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5425, float %5426, float %5427, float %5428, float %5429, float %5430, float %5431, float %5432, float %5433, float %5434, float %5435, float %5436, float %5437, float %5438, float %5439, float %5440, float %5441, float %5442, float %5443, float %5444, float %5445, float %5446, float %5447, float %5448, float %5449, float %5450, float %5451, float %5452, float %5453, float %5454, float %5455, float %5456, i64 %5419, i64 %5424, i1 true) #3, !dbg !226 + %5458 = or disjoint i32 %5358, 96, !dbg !226 + %5459 = add i32 %5458, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5460 = lshr exact i32 %5459, 4, !dbg !226 + %5461 = and i32 %5460, 16383, !dbg !226 + %5462 = zext nneg i32 %5461 to i64, !dbg !226 + %5463 = or disjoint i64 %5462, 4611686293372403712, !dbg !226 + %5464 = add i32 %5364, 96, !dbg !226 + %5465 = lshr exact i32 %5464, 4, !dbg !226 + %5466 = and i32 %5465, 16383, !dbg !226 + %5467 = zext nneg i32 %5466 to i64, !dbg !226 + %5468 = or disjoint i64 %5467, 4611686293338849280, !dbg !226 + %5469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 0, !dbg !226 + %5470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 1, !dbg !226 + %5471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 2, !dbg !226 + %5472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 3, !dbg !226 + %5473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 4, !dbg !226 + %5474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 5, !dbg !226 + %5475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 6, !dbg !226 + %5476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 7, !dbg !226 + %5477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 8, !dbg !226 + %5478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 9, !dbg !226 + %5479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 10, !dbg !226 + %5480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 11, !dbg !226 + %5481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 12, !dbg !226 + %5482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 13, !dbg !226 + %5483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 14, !dbg !226 + %5484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 15, !dbg !226 + %5485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 16, !dbg !226 + %5486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 17, !dbg !226 + %5487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 18, !dbg !226 + %5488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 19, !dbg !226 + %5489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 20, !dbg !226 + %5490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 21, !dbg !226 + %5491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 22, !dbg !226 + %5492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 23, !dbg !226 + %5493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 24, !dbg !226 + %5494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 25, !dbg !226 + %5495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 26, !dbg !226 + %5496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 27, !dbg !226 + %5497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 28, !dbg !226 + %5498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 29, !dbg !226 + %5499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 30, !dbg !226 + %5500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5457, 31, !dbg !226 + %5501 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5469, float %5470, float %5471, float %5472, float %5473, float %5474, float %5475, float %5476, float %5477, float %5478, float %5479, float %5480, float %5481, float %5482, float %5483, float %5484, float %5485, float %5486, float %5487, float %5488, float %5489, float %5490, float %5491, float %5492, float %5493, float %5494, float %5495, float %5496, float %5497, float %5498, float %5499, float %5500, i64 %5463, i64 %5468, i1 true) #3, !dbg !226 + %5502 = or disjoint i32 %5358, 16384, !dbg !226 + %5503 = add i32 %5502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5504 = lshr exact i32 %5503, 4, !dbg !226 + %5505 = and i32 %5504, 16383, !dbg !226 + %5506 = zext nneg i32 %5505 to i64, !dbg !226 + %5507 = or disjoint i64 %5506, 4611686293372403712, !dbg !226 + %5508 = add i32 %5364, 8192, !dbg !226 + %5509 = lshr exact i32 %5508, 4, !dbg !226 + %5510 = and i32 %5509, 16383, !dbg !226 + %5511 = zext nneg i32 %5510 to i64, !dbg !226 + %5512 = or disjoint i64 %5511, 4611686293338849280, !dbg !226 + %5513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 0, !dbg !226 + %5514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 1, !dbg !226 + %5515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 2, !dbg !226 + %5516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 3, !dbg !226 + %5517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 4, !dbg !226 + %5518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 5, !dbg !226 + %5519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 6, !dbg !226 + %5520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 7, !dbg !226 + %5521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 8, !dbg !226 + %5522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 9, !dbg !226 + %5523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 10, !dbg !226 + %5524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 11, !dbg !226 + %5525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 12, !dbg !226 + %5526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 13, !dbg !226 + %5527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 14, !dbg !226 + %5528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 15, !dbg !226 + %5529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 16, !dbg !226 + %5530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 17, !dbg !226 + %5531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 18, !dbg !226 + %5532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 19, !dbg !226 + %5533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 20, !dbg !226 + %5534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 21, !dbg !226 + %5535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 22, !dbg !226 + %5536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 23, !dbg !226 + %5537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 24, !dbg !226 + %5538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 25, !dbg !226 + %5539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 26, !dbg !226 + %5540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 27, !dbg !226 + %5541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 28, !dbg !226 + %5542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 29, !dbg !226 + %5543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 30, !dbg !226 + %5544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5501, 31, !dbg !226 + %5545 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5513, float %5514, float %5515, float %5516, float %5517, float %5518, float %5519, float %5520, float %5521, float %5522, float %5523, float %5524, float %5525, float %5526, float %5527, float %5528, float %5529, float %5530, float %5531, float %5532, float %5533, float %5534, float %5535, float %5536, float %5537, float %5538, float %5539, float %5540, float %5541, float %5542, float %5543, float %5544, i64 %5507, i64 %5512, i1 true) #3, !dbg !226 + %5546 = or disjoint i32 %5358, 16416, !dbg !226 + %5547 = add i32 %5546, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5548 = lshr exact i32 %5547, 4, !dbg !226 + %5549 = and i32 %5548, 16383, !dbg !226 + %5550 = zext nneg i32 %5549 to i64, !dbg !226 + %5551 = or disjoint i64 %5550, 4611686293372403712, !dbg !226 + %5552 = add i32 %5364, 8224, !dbg !226 + %5553 = lshr exact i32 %5552, 4, !dbg !226 + %5554 = and i32 %5553, 16383, !dbg !226 + %5555 = zext nneg i32 %5554 to i64, !dbg !226 + %5556 = or disjoint i64 %5555, 4611686293338849280, !dbg !226 + %5557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 0, !dbg !226 + %5558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 1, !dbg !226 + %5559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 2, !dbg !226 + %5560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 3, !dbg !226 + %5561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 4, !dbg !226 + %5562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 5, !dbg !226 + %5563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 6, !dbg !226 + %5564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 7, !dbg !226 + %5565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 8, !dbg !226 + %5566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 9, !dbg !226 + %5567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 10, !dbg !226 + %5568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 11, !dbg !226 + %5569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 12, !dbg !226 + %5570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 13, !dbg !226 + %5571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 14, !dbg !226 + %5572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 15, !dbg !226 + %5573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 16, !dbg !226 + %5574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 17, !dbg !226 + %5575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 18, !dbg !226 + %5576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 19, !dbg !226 + %5577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 20, !dbg !226 + %5578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 21, !dbg !226 + %5579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 22, !dbg !226 + %5580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 23, !dbg !226 + %5581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 24, !dbg !226 + %5582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 25, !dbg !226 + %5583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 26, !dbg !226 + %5584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 27, !dbg !226 + %5585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 28, !dbg !226 + %5586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 29, !dbg !226 + %5587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 30, !dbg !226 + %5588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5545, 31, !dbg !226 + %5589 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5557, float %5558, float %5559, float %5560, float %5561, float %5562, float %5563, float %5564, float %5565, float %5566, float %5567, float %5568, float %5569, float %5570, float %5571, float %5572, float %5573, float %5574, float %5575, float %5576, float %5577, float %5578, float %5579, float %5580, float %5581, float %5582, float %5583, float %5584, float %5585, float %5586, float %5587, float %5588, i64 %5551, i64 %5556, i1 true) #3, !dbg !226 + %5590 = or disjoint i32 %5358, 16448, !dbg !226 + %5591 = add i32 %5590, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5592 = lshr exact i32 %5591, 4, !dbg !226 + %5593 = and i32 %5592, 16383, !dbg !226 + %5594 = zext nneg i32 %5593 to i64, !dbg !226 + %5595 = or disjoint i64 %5594, 4611686293372403712, !dbg !226 + %5596 = add i32 %5364, 8256, !dbg !226 + %5597 = lshr exact i32 %5596, 4, !dbg !226 + %5598 = and i32 %5597, 16383, !dbg !226 + %5599 = zext nneg i32 %5598 to i64, !dbg !226 + %5600 = or disjoint i64 %5599, 4611686293338849280, !dbg !226 + %5601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 0, !dbg !226 + %5602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 1, !dbg !226 + %5603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 2, !dbg !226 + %5604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 3, !dbg !226 + %5605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 4, !dbg !226 + %5606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 5, !dbg !226 + %5607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 6, !dbg !226 + %5608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 7, !dbg !226 + %5609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 8, !dbg !226 + %5610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 9, !dbg !226 + %5611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 10, !dbg !226 + %5612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 11, !dbg !226 + %5613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 12, !dbg !226 + %5614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 13, !dbg !226 + %5615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 14, !dbg !226 + %5616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 15, !dbg !226 + %5617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 16, !dbg !226 + %5618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 17, !dbg !226 + %5619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 18, !dbg !226 + %5620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 19, !dbg !226 + %5621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 20, !dbg !226 + %5622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 21, !dbg !226 + %5623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 22, !dbg !226 + %5624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 23, !dbg !226 + %5625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 24, !dbg !226 + %5626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 25, !dbg !226 + %5627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 26, !dbg !226 + %5628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 27, !dbg !226 + %5629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 28, !dbg !226 + %5630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 29, !dbg !226 + %5631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 30, !dbg !226 + %5632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5589, 31, !dbg !226 + %5633 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5601, float %5602, float %5603, float %5604, float %5605, float %5606, float %5607, float %5608, float %5609, float %5610, float %5611, float %5612, float %5613, float %5614, float %5615, float %5616, float %5617, float %5618, float %5619, float %5620, float %5621, float %5622, float %5623, float %5624, float %5625, float %5626, float %5627, float %5628, float %5629, float %5630, float %5631, float %5632, i64 %5595, i64 %5600, i1 true) #3, !dbg !226 + %5634 = or disjoint i32 %5358, 16480, !dbg !226 + %5635 = add i32 %5634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !226 + %5636 = lshr exact i32 %5635, 4, !dbg !226 + %5637 = and i32 %5636, 16383, !dbg !226 + %5638 = zext nneg i32 %5637 to i64, !dbg !226 + %5639 = or disjoint i64 %5638, 4611686293372403712, !dbg !226 + %5640 = add i32 %5364, 8288, !dbg !226 + %5641 = lshr exact i32 %5640, 4, !dbg !226 + %5642 = and i32 %5641, 16383, !dbg !226 + %5643 = zext nneg i32 %5642 to i64, !dbg !226 + %5644 = or disjoint i64 %5643, 4611686293338849280, !dbg !226 + %5645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 0, !dbg !226 + %5646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 1, !dbg !226 + %5647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 2, !dbg !226 + %5648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 3, !dbg !226 + %5649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 4, !dbg !226 + %5650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 5, !dbg !226 + %5651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 6, !dbg !226 + %5652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 7, !dbg !226 + %5653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 8, !dbg !226 + %5654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 9, !dbg !226 + %5655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 10, !dbg !226 + %5656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 11, !dbg !226 + %5657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 12, !dbg !226 + %5658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 13, !dbg !226 + %5659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 14, !dbg !226 + %5660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 15, !dbg !226 + %5661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 16, !dbg !226 + %5662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 17, !dbg !226 + %5663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 18, !dbg !226 + %5664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 19, !dbg !226 + %5665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 20, !dbg !226 + %5666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 21, !dbg !226 + %5667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 22, !dbg !226 + %5668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 23, !dbg !226 + %5669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 24, !dbg !226 + %5670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 25, !dbg !226 + %5671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 26, !dbg !226 + %5672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 27, !dbg !226 + %5673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 28, !dbg !226 + %5674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 29, !dbg !226 + %5675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 30, !dbg !226 + %5676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5633, 31, !dbg !226 + %5677 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5645, float %5646, float %5647, float %5648, float %5649, float %5650, float %5651, float %5652, float %5653, float %5654, float %5655, float %5656, float %5657, float %5658, float %5659, float %5660, float %5661, float %5662, float %5663, float %5664, float %5665, float %5666, float %5667, float %5668, float %5669, float %5670, float %5671, float %5672, float %5673, float %5674, float %5675, float %5676, i64 %5639, i64 %5644, i1 true) #3, !dbg !226 + %5678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 0, !dbg !226 + %5679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 1, !dbg !226 + %5680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 2, !dbg !226 + %5681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 3, !dbg !226 + %5682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 4, !dbg !226 + %5683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 5, !dbg !226 + %5684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 6, !dbg !226 + %5685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 7, !dbg !226 + %5686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 8, !dbg !226 + %5687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 9, !dbg !226 + %5688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 10, !dbg !226 + %5689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 11, !dbg !226 + %5690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 12, !dbg !226 + %5691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 13, !dbg !226 + %5692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 14, !dbg !226 + %5693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 15, !dbg !226 + %5694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 16, !dbg !226 + %5695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 17, !dbg !226 + %5696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 18, !dbg !226 + %5697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 19, !dbg !226 + %5698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 20, !dbg !226 + %5699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 21, !dbg !226 + %5700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 22, !dbg !226 + %5701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 23, !dbg !226 + %5702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 24, !dbg !226 + %5703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 25, !dbg !226 + %5704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 26, !dbg !226 + %5705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 27, !dbg !226 + %5706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 28, !dbg !226 + %5707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 29, !dbg !226 + %5708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 30, !dbg !226 + %5709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5677, 31, !dbg !226 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !226 + %5710 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %5678, float %5679, float %5680, float %5681, float %5682, float %5683, float %5684, float %5685, float %5686, float %5687, float %5688, float %5689, float %5690, float %5691, float %5692, float %5693, float %5694, float %5695, float %5696, float %5697, float %5698, float %5699, float %5700, float %5701, float %5702, float %5703, float %5704, float %5705, float %5706, float %5707, float %5708, float %5709, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %5289, i32 0, i32 0) #3, !dbg !226 + %5711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 0, !dbg !226 + %5712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 1, !dbg !226 + %5713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 2, !dbg !226 + %5714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 3, !dbg !226 + %5715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 4, !dbg !226 + %5716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 5, !dbg !226 + %5717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 6, !dbg !226 + %5718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 7, !dbg !226 + %5719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 8, !dbg !226 + %5720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 9, !dbg !226 + %5721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 10, !dbg !226 + %5722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 11, !dbg !226 + %5723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 12, !dbg !226 + %5724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 13, !dbg !226 + %5725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 14, !dbg !226 + %5726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 15, !dbg !226 + %5727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 16, !dbg !226 + %5728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 17, !dbg !226 + %5729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 18, !dbg !226 + %5730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 19, !dbg !226 + %5731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 20, !dbg !226 + %5732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 21, !dbg !226 + %5733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 22, !dbg !226 + %5734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 23, !dbg !226 + %5735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 24, !dbg !226 + %5736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 25, !dbg !226 + %5737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 26, !dbg !226 + %5738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 27, !dbg !226 + %5739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 28, !dbg !226 + %5740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 29, !dbg !226 + %5741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 30, !dbg !226 + %5742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %5710, 31, !dbg !226 + %5743 = fmul float %5711, 0x3FB6A09E60000000, !dbg !253 + %5744 = fmul float %5712, 0x3FB6A09E60000000, !dbg !253 + %5745 = fmul float %5713, 0x3FB6A09E60000000, !dbg !253 + %5746 = fmul float %5714, 0x3FB6A09E60000000, !dbg !253 + %5747 = fmul float %5715, 0x3FB6A09E60000000, !dbg !253 + %5748 = fmul float %5716, 0x3FB6A09E60000000, !dbg !253 + %5749 = fmul float %5717, 0x3FB6A09E60000000, !dbg !253 + %5750 = fmul float %5718, 0x3FB6A09E60000000, !dbg !253 + %5751 = fmul float %5719, 0x3FB6A09E60000000, !dbg !253 + %5752 = fmul float %5720, 0x3FB6A09E60000000, !dbg !253 + %5753 = fmul float %5721, 0x3FB6A09E60000000, !dbg !253 + %5754 = fmul float %5722, 0x3FB6A09E60000000, !dbg !253 + %5755 = fmul float %5723, 0x3FB6A09E60000000, !dbg !253 + %5756 = fmul float %5724, 0x3FB6A09E60000000, !dbg !253 + %5757 = fmul float %5725, 0x3FB6A09E60000000, !dbg !253 + %5758 = fmul float %5726, 0x3FB6A09E60000000, !dbg !253 + %5759 = fmul float %5727, 0x3FB6A09E60000000, !dbg !253 + %5760 = fmul float %5728, 0x3FB6A09E60000000, !dbg !253 + %5761 = fmul float %5729, 0x3FB6A09E60000000, !dbg !253 + %5762 = fmul float %5730, 0x3FB6A09E60000000, !dbg !253 + %5763 = fmul float %5731, 0x3FB6A09E60000000, !dbg !253 + %5764 = fmul float %5732, 0x3FB6A09E60000000, !dbg !253 + %5765 = fmul float %5733, 0x3FB6A09E60000000, !dbg !253 + %5766 = fmul float %5734, 0x3FB6A09E60000000, !dbg !253 + %5767 = fmul float %5735, 0x3FB6A09E60000000, !dbg !253 + %5768 = fmul float %5736, 0x3FB6A09E60000000, !dbg !253 + %5769 = fmul float %5737, 0x3FB6A09E60000000, !dbg !253 + %5770 = fmul float %5738, 0x3FB6A09E60000000, !dbg !253 + %5771 = fmul float %5739, 0x3FB6A09E60000000, !dbg !253 + %5772 = fmul float %5740, 0x3FB6A09E60000000, !dbg !253 + %5773 = fmul float %5741, 0x3FB6A09E60000000, !dbg !253 + %5774 = fmul float %5742, 0x3FB6A09E60000000, !dbg !253 + %5775 = extractelement <16 x i32> %5279, i64 15, !dbg !254 + %5776 = icmp sge i32 %5775, %4470, !dbg !254 + %5777 = extractelement <16 x i32> %5279, i64 14, !dbg !254 + %5778 = icmp sge i32 %5777, %4470, !dbg !254 + %5779 = icmp sge i32 %5775, %4471, !dbg !254 + %5780 = icmp sge i32 %5777, %4471, !dbg !254 + %5781 = extractelement <16 x i32> %5279, i64 13, !dbg !254 + %5782 = icmp sge i32 %5781, %4470, !dbg !254 + %5783 = extractelement <16 x i32> %5279, i64 12, !dbg !254 + %5784 = icmp sge i32 %5783, %4470, !dbg !254 + %5785 = icmp sge i32 %5781, %4471, !dbg !254 + %5786 = icmp sge i32 %5783, %4471, !dbg !254 + %5787 = extractelement <16 x i32> %5279, i64 11, !dbg !254 + %5788 = icmp sge i32 %5787, %4470, !dbg !254 + %5789 = extractelement <16 x i32> %5279, i64 10, !dbg !254 + %5790 = icmp sge i32 %5789, %4470, !dbg !254 + %5791 = icmp sge i32 %5787, %4471, !dbg !254 + %5792 = icmp sge i32 %5789, %4471, !dbg !254 + %5793 = extractelement <16 x i32> %5279, i64 9, !dbg !254 + %5794 = icmp sge i32 %5793, %4470, !dbg !254 + %5795 = extractelement <16 x i32> %5279, i64 8, !dbg !254 + %5796 = icmp sge i32 %5795, %4470, !dbg !254 + %5797 = icmp sge i32 %5793, %4471, !dbg !254 + %5798 = icmp sge i32 %5795, %4471, !dbg !254 + %5799 = extractelement <16 x i32> %5279, i64 7, !dbg !254 + %5800 = icmp sge i32 %5799, %4470, !dbg !254 + %5801 = extractelement <16 x i32> %5279, i64 6, !dbg !254 + %5802 = icmp sge i32 %5801, %4470, !dbg !254 + %5803 = icmp sge i32 %5799, %4471, !dbg !254 + %5804 = icmp sge i32 %5801, %4471, !dbg !254 + %5805 = extractelement <16 x i32> %5279, i64 5, !dbg !254 + %5806 = icmp sge i32 %5805, %4470, !dbg !254 + %5807 = extractelement <16 x i32> %5279, i64 4, !dbg !254 + %5808 = icmp sge i32 %5807, %4470, !dbg !254 + %5809 = icmp sge i32 %5805, %4471, !dbg !254 + %5810 = icmp sge i32 %5807, %4471, !dbg !254 + %5811 = extractelement <16 x i32> %5279, i64 3, !dbg !254 + %5812 = icmp sge i32 %5811, %4470, !dbg !254 + %5813 = extractelement <16 x i32> %5279, i64 2, !dbg !254 + %5814 = icmp sge i32 %5813, %4470, !dbg !254 + %5815 = icmp sge i32 %5811, %4471, !dbg !254 + %5816 = icmp sge i32 %5813, %4471, !dbg !254 + %5817 = extractelement <16 x i32> %5279, i64 1, !dbg !254 + %5818 = icmp sge i32 %5817, %4470, !dbg !254 + %5819 = extractelement <16 x i32> %5279, i64 0, !dbg !254 + %5820 = icmp sge i32 %5819, %4470, !dbg !254 + %5821 = icmp sge i32 %5817, %4471, !dbg !254 + %5822 = icmp sge i32 %5819, %4471, !dbg !254 + %5823 = sext <16 x i32> %5279 to <16 x i64>, !dbg !255 + %5824 = icmp sgt <16 x i64> %4944, %5823, !dbg !255 + %5825 = extractelement <16 x i1> %5824, i64 15, !dbg !256 + %5826 = and i1 %5776, %5825, !dbg !256 + %5827 = extractelement <16 x i1> %5824, i64 14, !dbg !256 + %5828 = and i1 %5778, %5827, !dbg !256 + %5829 = and i1 %5779, %5825, !dbg !256 + %5830 = and i1 %5780, %5827, !dbg !256 + %5831 = extractelement <16 x i1> %5824, i64 13, !dbg !256 + %5832 = and i1 %5782, %5831, !dbg !256 + %5833 = extractelement <16 x i1> %5824, i64 12, !dbg !256 + %5834 = and i1 %5784, %5833, !dbg !256 + %5835 = and i1 %5785, %5831, !dbg !256 + %5836 = and i1 %5786, %5833, !dbg !256 + %5837 = extractelement <16 x i1> %5824, i64 11, !dbg !256 + %5838 = and i1 %5788, %5837, !dbg !256 + %5839 = extractelement <16 x i1> %5824, i64 10, !dbg !256 + %5840 = and i1 %5790, %5839, !dbg !256 + %5841 = and i1 %5791, %5837, !dbg !256 + %5842 = and i1 %5792, %5839, !dbg !256 + %5843 = extractelement <16 x i1> %5824, i64 9, !dbg !256 + %5844 = and i1 %5794, %5843, !dbg !256 + %5845 = extractelement <16 x i1> %5824, i64 8, !dbg !256 + %5846 = and i1 %5796, %5845, !dbg !256 + %5847 = and i1 %5797, %5843, !dbg !256 + %5848 = and i1 %5798, %5845, !dbg !256 + %5849 = extractelement <16 x i1> %5824, i64 7, !dbg !256 + %5850 = and i1 %5800, %5849, !dbg !256 + %5851 = extractelement <16 x i1> %5824, i64 6, !dbg !256 + %5852 = and i1 %5802, %5851, !dbg !256 + %5853 = and i1 %5803, %5849, !dbg !256 + %5854 = and i1 %5804, %5851, !dbg !256 + %5855 = extractelement <16 x i1> %5824, i64 5, !dbg !256 + %5856 = and i1 %5806, %5855, !dbg !256 + %5857 = extractelement <16 x i1> %5824, i64 4, !dbg !256 + %5858 = and i1 %5808, %5857, !dbg !256 + %5859 = and i1 %5809, %5855, !dbg !256 + %5860 = and i1 %5810, %5857, !dbg !256 + %5861 = extractelement <16 x i1> %5824, i64 3, !dbg !256 + %5862 = and i1 %5812, %5861, !dbg !256 + %5863 = extractelement <16 x i1> %5824, i64 2, !dbg !256 + %5864 = and i1 %5814, %5863, !dbg !256 + %5865 = and i1 %5815, %5861, !dbg !256 + %5866 = and i1 %5816, %5863, !dbg !256 + %5867 = extractelement <16 x i1> %5824, i64 1, !dbg !256 + %5868 = and i1 %5818, %5867, !dbg !256 + %5869 = extractelement <16 x i1> %5824, i64 0, !dbg !256 + %5870 = and i1 %5820, %5869, !dbg !256 + %5871 = and i1 %5821, %5867, !dbg !256 + %5872 = and i1 %5822, %5869, !dbg !256 + %5873 = fmul float %5743, 0x3FF7154760000000, !dbg !257 + %5874 = select i1 %5826, float %5873, float 0xFFF0000000000000, !dbg !258 + %5875 = fmul float %5744, 0x3FF7154760000000, !dbg !257 + %5876 = select i1 %5828, float %5875, float 0xFFF0000000000000, !dbg !258 + %5877 = fmul float %5745, 0x3FF7154760000000, !dbg !257 + %5878 = select i1 %5829, float %5877, float 0xFFF0000000000000, !dbg !258 + %5879 = fmul float %5746, 0x3FF7154760000000, !dbg !257 + %5880 = select i1 %5830, float %5879, float 0xFFF0000000000000, !dbg !258 + %5881 = fmul float %5747, 0x3FF7154760000000, !dbg !257 + %5882 = select i1 %5832, float %5881, float 0xFFF0000000000000, !dbg !258 + %5883 = fmul float %5748, 0x3FF7154760000000, !dbg !257 + %5884 = select i1 %5834, float %5883, float 0xFFF0000000000000, !dbg !258 + %5885 = fmul float %5749, 0x3FF7154760000000, !dbg !257 + %5886 = select i1 %5835, float %5885, float 0xFFF0000000000000, !dbg !258 + %5887 = fmul float %5750, 0x3FF7154760000000, !dbg !257 + %5888 = select i1 %5836, float %5887, float 0xFFF0000000000000, !dbg !258 + %5889 = fmul float %5751, 0x3FF7154760000000, !dbg !257 + %5890 = select i1 %5838, float %5889, float 0xFFF0000000000000, !dbg !258 + %5891 = fmul float %5752, 0x3FF7154760000000, !dbg !257 + %5892 = select i1 %5840, float %5891, float 0xFFF0000000000000, !dbg !258 + %5893 = fmul float %5753, 0x3FF7154760000000, !dbg !257 + %5894 = select i1 %5841, float %5893, float 0xFFF0000000000000, !dbg !258 + %5895 = fmul float %5754, 0x3FF7154760000000, !dbg !257 + %5896 = select i1 %5842, float %5895, float 0xFFF0000000000000, !dbg !258 + %5897 = fmul float %5755, 0x3FF7154760000000, !dbg !257 + %5898 = select i1 %5844, float %5897, float 0xFFF0000000000000, !dbg !258 + %5899 = fmul float %5756, 0x3FF7154760000000, !dbg !257 + %5900 = select i1 %5846, float %5899, float 0xFFF0000000000000, !dbg !258 + %5901 = fmul float %5757, 0x3FF7154760000000, !dbg !257 + %5902 = select i1 %5847, float %5901, float 0xFFF0000000000000, !dbg !258 + %5903 = fmul float %5758, 0x3FF7154760000000, !dbg !257 + %5904 = select i1 %5848, float %5903, float 0xFFF0000000000000, !dbg !258 + %5905 = fmul float %5759, 0x3FF7154760000000, !dbg !257 + %5906 = select i1 %5850, float %5905, float 0xFFF0000000000000, !dbg !258 + %5907 = fmul float %5760, 0x3FF7154760000000, !dbg !257 + %5908 = select i1 %5852, float %5907, float 0xFFF0000000000000, !dbg !258 + %5909 = fmul float %5761, 0x3FF7154760000000, !dbg !257 + %5910 = select i1 %5853, float %5909, float 0xFFF0000000000000, !dbg !258 + %5911 = fmul float %5762, 0x3FF7154760000000, !dbg !257 + %5912 = select i1 %5854, float %5911, float 0xFFF0000000000000, !dbg !258 + %5913 = fmul float %5763, 0x3FF7154760000000, !dbg !257 + %5914 = select i1 %5856, float %5913, float 0xFFF0000000000000, !dbg !258 + %5915 = fmul float %5764, 0x3FF7154760000000, !dbg !257 + %5916 = select i1 %5858, float %5915, float 0xFFF0000000000000, !dbg !258 + %5917 = fmul float %5765, 0x3FF7154760000000, !dbg !257 + %5918 = select i1 %5859, float %5917, float 0xFFF0000000000000, !dbg !258 + %5919 = fmul float %5766, 0x3FF7154760000000, !dbg !257 + %5920 = select i1 %5860, float %5919, float 0xFFF0000000000000, !dbg !258 + %5921 = fmul float %5767, 0x3FF7154760000000, !dbg !257 + %5922 = select i1 %5862, float %5921, float 0xFFF0000000000000, !dbg !258 + %5923 = fmul float %5768, 0x3FF7154760000000, !dbg !257 + %5924 = select i1 %5864, float %5923, float 0xFFF0000000000000, !dbg !258 + %5925 = fmul float %5769, 0x3FF7154760000000, !dbg !257 + %5926 = select i1 %5865, float %5925, float 0xFFF0000000000000, !dbg !258 + %5927 = fmul float %5770, 0x3FF7154760000000, !dbg !257 + %5928 = select i1 %5866, float %5927, float 0xFFF0000000000000, !dbg !258 + %5929 = fmul float %5771, 0x3FF7154760000000, !dbg !257 + %5930 = select i1 %5868, float %5929, float 0xFFF0000000000000, !dbg !258 + %5931 = fmul float %5772, 0x3FF7154760000000, !dbg !257 + %5932 = select i1 %5870, float %5931, float 0xFFF0000000000000, !dbg !258 + %5933 = fmul float %5773, 0x3FF7154760000000, !dbg !257 + %5934 = select i1 %5871, float %5933, float 0xFFF0000000000000, !dbg !258 + %5935 = fmul float %5774, 0x3FF7154760000000, !dbg !257 + %5936 = select i1 %5872, float %5935, float 0xFFF0000000000000, !dbg !258 + %5937 = fsub float %5874, %5340, !dbg !259 + %5938 = fsub float %5876, %5341, !dbg !259 + %5939 = fsub float %5878, %5340, !dbg !259 + %5940 = fsub float %5880, %5341, !dbg !259 + %5941 = fsub float %5882, %5342, !dbg !259 + %5942 = fsub float %5884, %5343, !dbg !259 + %5943 = fsub float %5886, %5342, !dbg !259 + %5944 = fsub float %5888, %5343, !dbg !259 + %5945 = fsub float %5890, %5344, !dbg !259 + %5946 = fsub float %5892, %5345, !dbg !259 + %5947 = fsub float %5894, %5344, !dbg !259 + %5948 = fsub float %5896, %5345, !dbg !259 + %5949 = fsub float %5898, %5346, !dbg !259 + %5950 = fsub float %5900, %5347, !dbg !259 + %5951 = fsub float %5902, %5346, !dbg !259 + %5952 = fsub float %5904, %5347, !dbg !259 + %5953 = fsub float %5906, %5348, !dbg !259 + %5954 = fsub float %5908, %5349, !dbg !259 + %5955 = fsub float %5910, %5348, !dbg !259 + %5956 = fsub float %5912, %5349, !dbg !259 + %5957 = fsub float %5914, %5350, !dbg !259 + %5958 = fsub float %5916, %5351, !dbg !259 + %5959 = fsub float %5918, %5350, !dbg !259 + %5960 = fsub float %5920, %5351, !dbg !259 + %5961 = fsub float %5922, %5352, !dbg !259 + %5962 = fsub float %5924, %5353, !dbg !259 + %5963 = fsub float %5926, %5352, !dbg !259 + %5964 = fsub float %5928, %5353, !dbg !259 + %5965 = fsub float %5930, %5354, !dbg !259 + %5966 = fsub float %5932, %5355, !dbg !259 + %5967 = fsub float %5934, %5354, !dbg !259 + %5968 = fsub float %5936, %5355, !dbg !259 + %5969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1220 = icmp eq i32 %5969, 0, !dbg !260 + br i1 %.not.i1220, label %5972, label %5970, !dbg !260 + +5970: ; preds = %.lr.ph + %5971 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5937) #3, !dbg !260 + br label %__nv_exp2f.exit1222, !dbg !260 + +5972: ; preds = %.lr.ph + %5973 = tail call float @llvm.nvvm.ex2.approx.f(float %5937) #3, !dbg !260 + br label %__nv_exp2f.exit1222, !dbg !260 + +__nv_exp2f.exit1222: ; preds = %5970, %5972 + %.0.i1221 = phi float [ %5971, %5970 ], [ %5973, %5972 ], !dbg !260 + %5974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1223 = icmp eq i32 %5974, 0, !dbg !260 + br i1 %.not.i1223, label %5977, label %5975, !dbg !260 + +5975: ; preds = %__nv_exp2f.exit1222 + %5976 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5938) #3, !dbg !260 + br label %__nv_exp2f.exit1225, !dbg !260 + +5977: ; preds = %__nv_exp2f.exit1222 + %5978 = tail call float @llvm.nvvm.ex2.approx.f(float %5938) #3, !dbg !260 + br label %__nv_exp2f.exit1225, !dbg !260 + +__nv_exp2f.exit1225: ; preds = %5975, %5977 + %.0.i1224 = phi float [ %5976, %5975 ], [ %5978, %5977 ], !dbg !260 + %5979 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1226 = icmp eq i32 %5979, 0, !dbg !260 + br i1 %.not.i1226, label %5982, label %5980, !dbg !260 + +5980: ; preds = %__nv_exp2f.exit1225 + %5981 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5939) #3, !dbg !260 + br label %__nv_exp2f.exit1228, !dbg !260 + +5982: ; preds = %__nv_exp2f.exit1225 + %5983 = tail call float @llvm.nvvm.ex2.approx.f(float %5939) #3, !dbg !260 + br label %__nv_exp2f.exit1228, !dbg !260 + +__nv_exp2f.exit1228: ; preds = %5980, %5982 + %.0.i1227 = phi float [ %5981, %5980 ], [ %5983, %5982 ], !dbg !260 + %5984 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1229 = icmp eq i32 %5984, 0, !dbg !260 + br i1 %.not.i1229, label %5987, label %5985, !dbg !260 + +5985: ; preds = %__nv_exp2f.exit1228 + %5986 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5940) #3, !dbg !260 + br label %__nv_exp2f.exit1231, !dbg !260 + +5987: ; preds = %__nv_exp2f.exit1228 + %5988 = tail call float @llvm.nvvm.ex2.approx.f(float %5940) #3, !dbg !260 + br label %__nv_exp2f.exit1231, !dbg !260 + +__nv_exp2f.exit1231: ; preds = %5985, %5987 + %.0.i1230 = phi float [ %5986, %5985 ], [ %5988, %5987 ], !dbg !260 + %5989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1232 = icmp eq i32 %5989, 0, !dbg !260 + br i1 %.not.i1232, label %5992, label %5990, !dbg !260 + +5990: ; preds = %__nv_exp2f.exit1231 + %5991 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5941) #3, !dbg !260 + br label %__nv_exp2f.exit1234, !dbg !260 + +5992: ; preds = %__nv_exp2f.exit1231 + %5993 = tail call float @llvm.nvvm.ex2.approx.f(float %5941) #3, !dbg !260 + br label %__nv_exp2f.exit1234, !dbg !260 + +__nv_exp2f.exit1234: ; preds = %5990, %5992 + %.0.i1233 = phi float [ %5991, %5990 ], [ %5993, %5992 ], !dbg !260 + %5994 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1235 = icmp eq i32 %5994, 0, !dbg !260 + br i1 %.not.i1235, label %5997, label %5995, !dbg !260 + +5995: ; preds = %__nv_exp2f.exit1234 + %5996 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5942) #3, !dbg !260 + br label %__nv_exp2f.exit1237, !dbg !260 + +5997: ; preds = %__nv_exp2f.exit1234 + %5998 = tail call float @llvm.nvvm.ex2.approx.f(float %5942) #3, !dbg !260 + br label %__nv_exp2f.exit1237, !dbg !260 + +__nv_exp2f.exit1237: ; preds = %5995, %5997 + %.0.i1236 = phi float [ %5996, %5995 ], [ %5998, %5997 ], !dbg !260 + %5999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1238 = icmp eq i32 %5999, 0, !dbg !260 + br i1 %.not.i1238, label %6002, label %6000, !dbg !260 + +6000: ; preds = %__nv_exp2f.exit1237 + %6001 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5943) #3, !dbg !260 + br label %__nv_exp2f.exit1240, !dbg !260 + +6002: ; preds = %__nv_exp2f.exit1237 + %6003 = tail call float @llvm.nvvm.ex2.approx.f(float %5943) #3, !dbg !260 + br label %__nv_exp2f.exit1240, !dbg !260 + +__nv_exp2f.exit1240: ; preds = %6000, %6002 + %.0.i1239 = phi float [ %6001, %6000 ], [ %6003, %6002 ], !dbg !260 + %6004 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1241 = icmp eq i32 %6004, 0, !dbg !260 + br i1 %.not.i1241, label %6007, label %6005, !dbg !260 + +6005: ; preds = %__nv_exp2f.exit1240 + %6006 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5944) #3, !dbg !260 + br label %__nv_exp2f.exit1243, !dbg !260 + +6007: ; preds = %__nv_exp2f.exit1240 + %6008 = tail call float @llvm.nvvm.ex2.approx.f(float %5944) #3, !dbg !260 + br label %__nv_exp2f.exit1243, !dbg !260 + +__nv_exp2f.exit1243: ; preds = %6005, %6007 + %.0.i1242 = phi float [ %6006, %6005 ], [ %6008, %6007 ], !dbg !260 + %6009 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1244 = icmp eq i32 %6009, 0, !dbg !260 + br i1 %.not.i1244, label %6012, label %6010, !dbg !260 + +6010: ; preds = %__nv_exp2f.exit1243 + %6011 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5945) #3, !dbg !260 + br label %__nv_exp2f.exit1246, !dbg !260 + +6012: ; preds = %__nv_exp2f.exit1243 + %6013 = tail call float @llvm.nvvm.ex2.approx.f(float %5945) #3, !dbg !260 + br label %__nv_exp2f.exit1246, !dbg !260 + +__nv_exp2f.exit1246: ; preds = %6010, %6012 + %.0.i1245 = phi float [ %6011, %6010 ], [ %6013, %6012 ], !dbg !260 + %6014 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1247 = icmp eq i32 %6014, 0, !dbg !260 + br i1 %.not.i1247, label %6017, label %6015, !dbg !260 + +6015: ; preds = %__nv_exp2f.exit1246 + %6016 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5946) #3, !dbg !260 + br label %__nv_exp2f.exit1249, !dbg !260 + +6017: ; preds = %__nv_exp2f.exit1246 + %6018 = tail call float @llvm.nvvm.ex2.approx.f(float %5946) #3, !dbg !260 + br label %__nv_exp2f.exit1249, !dbg !260 + +__nv_exp2f.exit1249: ; preds = %6015, %6017 + %.0.i1248 = phi float [ %6016, %6015 ], [ %6018, %6017 ], !dbg !260 + %6019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1250 = icmp eq i32 %6019, 0, !dbg !260 + br i1 %.not.i1250, label %6022, label %6020, !dbg !260 + +6020: ; preds = %__nv_exp2f.exit1249 + %6021 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5947) #3, !dbg !260 + br label %__nv_exp2f.exit1252, !dbg !260 + +6022: ; preds = %__nv_exp2f.exit1249 + %6023 = tail call float @llvm.nvvm.ex2.approx.f(float %5947) #3, !dbg !260 + br label %__nv_exp2f.exit1252, !dbg !260 + +__nv_exp2f.exit1252: ; preds = %6020, %6022 + %.0.i1251 = phi float [ %6021, %6020 ], [ %6023, %6022 ], !dbg !260 + %6024 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1253 = icmp eq i32 %6024, 0, !dbg !260 + br i1 %.not.i1253, label %6027, label %6025, !dbg !260 + +6025: ; preds = %__nv_exp2f.exit1252 + %6026 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5948) #3, !dbg !260 + br label %__nv_exp2f.exit1255, !dbg !260 + +6027: ; preds = %__nv_exp2f.exit1252 + %6028 = tail call float @llvm.nvvm.ex2.approx.f(float %5948) #3, !dbg !260 + br label %__nv_exp2f.exit1255, !dbg !260 + +__nv_exp2f.exit1255: ; preds = %6025, %6027 + %.0.i1254 = phi float [ %6026, %6025 ], [ %6028, %6027 ], !dbg !260 + %6029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1256 = icmp eq i32 %6029, 0, !dbg !260 + br i1 %.not.i1256, label %6032, label %6030, !dbg !260 + +6030: ; preds = %__nv_exp2f.exit1255 + %6031 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5949) #3, !dbg !260 + br label %__nv_exp2f.exit1258, !dbg !260 + +6032: ; preds = %__nv_exp2f.exit1255 + %6033 = tail call float @llvm.nvvm.ex2.approx.f(float %5949) #3, !dbg !260 + br label %__nv_exp2f.exit1258, !dbg !260 + +__nv_exp2f.exit1258: ; preds = %6030, %6032 + %.0.i1257 = phi float [ %6031, %6030 ], [ %6033, %6032 ], !dbg !260 + %6034 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1259 = icmp eq i32 %6034, 0, !dbg !260 + br i1 %.not.i1259, label %6037, label %6035, !dbg !260 + +6035: ; preds = %__nv_exp2f.exit1258 + %6036 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5950) #3, !dbg !260 + br label %__nv_exp2f.exit1261, !dbg !260 + +6037: ; preds = %__nv_exp2f.exit1258 + %6038 = tail call float @llvm.nvvm.ex2.approx.f(float %5950) #3, !dbg !260 + br label %__nv_exp2f.exit1261, !dbg !260 + +__nv_exp2f.exit1261: ; preds = %6035, %6037 + %.0.i1260 = phi float [ %6036, %6035 ], [ %6038, %6037 ], !dbg !260 + %6039 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1262 = icmp eq i32 %6039, 0, !dbg !260 + br i1 %.not.i1262, label %6042, label %6040, !dbg !260 + +6040: ; preds = %__nv_exp2f.exit1261 + %6041 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5951) #3, !dbg !260 + br label %__nv_exp2f.exit1264, !dbg !260 + +6042: ; preds = %__nv_exp2f.exit1261 + %6043 = tail call float @llvm.nvvm.ex2.approx.f(float %5951) #3, !dbg !260 + br label %__nv_exp2f.exit1264, !dbg !260 + +__nv_exp2f.exit1264: ; preds = %6040, %6042 + %.0.i1263 = phi float [ %6041, %6040 ], [ %6043, %6042 ], !dbg !260 + %6044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1265 = icmp eq i32 %6044, 0, !dbg !260 + br i1 %.not.i1265, label %6047, label %6045, !dbg !260 + +6045: ; preds = %__nv_exp2f.exit1264 + %6046 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5952) #3, !dbg !260 + br label %__nv_exp2f.exit1267, !dbg !260 + +6047: ; preds = %__nv_exp2f.exit1264 + %6048 = tail call float @llvm.nvvm.ex2.approx.f(float %5952) #3, !dbg !260 + br label %__nv_exp2f.exit1267, !dbg !260 + +__nv_exp2f.exit1267: ; preds = %6045, %6047 + %.0.i1266 = phi float [ %6046, %6045 ], [ %6048, %6047 ], !dbg !260 + %6049 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1268 = icmp eq i32 %6049, 0, !dbg !260 + br i1 %.not.i1268, label %6052, label %6050, !dbg !260 + +6050: ; preds = %__nv_exp2f.exit1267 + %6051 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5953) #3, !dbg !260 + br label %__nv_exp2f.exit1270, !dbg !260 + +6052: ; preds = %__nv_exp2f.exit1267 + %6053 = tail call float @llvm.nvvm.ex2.approx.f(float %5953) #3, !dbg !260 + br label %__nv_exp2f.exit1270, !dbg !260 + +__nv_exp2f.exit1270: ; preds = %6050, %6052 + %.0.i1269 = phi float [ %6051, %6050 ], [ %6053, %6052 ], !dbg !260 + %6054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1271 = icmp eq i32 %6054, 0, !dbg !260 + br i1 %.not.i1271, label %6057, label %6055, !dbg !260 + +6055: ; preds = %__nv_exp2f.exit1270 + %6056 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5954) #3, !dbg !260 + br label %__nv_exp2f.exit1273, !dbg !260 + +6057: ; preds = %__nv_exp2f.exit1270 + %6058 = tail call float @llvm.nvvm.ex2.approx.f(float %5954) #3, !dbg !260 + br label %__nv_exp2f.exit1273, !dbg !260 + +__nv_exp2f.exit1273: ; preds = %6055, %6057 + %.0.i1272 = phi float [ %6056, %6055 ], [ %6058, %6057 ], !dbg !260 + %6059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1274 = icmp eq i32 %6059, 0, !dbg !260 + br i1 %.not.i1274, label %6062, label %6060, !dbg !260 + +6060: ; preds = %__nv_exp2f.exit1273 + %6061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5955) #3, !dbg !260 + br label %__nv_exp2f.exit1276, !dbg !260 + +6062: ; preds = %__nv_exp2f.exit1273 + %6063 = tail call float @llvm.nvvm.ex2.approx.f(float %5955) #3, !dbg !260 + br label %__nv_exp2f.exit1276, !dbg !260 + +__nv_exp2f.exit1276: ; preds = %6060, %6062 + %.0.i1275 = phi float [ %6061, %6060 ], [ %6063, %6062 ], !dbg !260 + %6064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1277 = icmp eq i32 %6064, 0, !dbg !260 + br i1 %.not.i1277, label %6067, label %6065, !dbg !260 + +6065: ; preds = %__nv_exp2f.exit1276 + %6066 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5956) #3, !dbg !260 + br label %__nv_exp2f.exit1279, !dbg !260 + +6067: ; preds = %__nv_exp2f.exit1276 + %6068 = tail call float @llvm.nvvm.ex2.approx.f(float %5956) #3, !dbg !260 + br label %__nv_exp2f.exit1279, !dbg !260 + +__nv_exp2f.exit1279: ; preds = %6065, %6067 + %.0.i1278 = phi float [ %6066, %6065 ], [ %6068, %6067 ], !dbg !260 + %6069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1280 = icmp eq i32 %6069, 0, !dbg !260 + br i1 %.not.i1280, label %6072, label %6070, !dbg !260 + +6070: ; preds = %__nv_exp2f.exit1279 + %6071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5957) #3, !dbg !260 + br label %__nv_exp2f.exit1282, !dbg !260 + +6072: ; preds = %__nv_exp2f.exit1279 + %6073 = tail call float @llvm.nvvm.ex2.approx.f(float %5957) #3, !dbg !260 + br label %__nv_exp2f.exit1282, !dbg !260 + +__nv_exp2f.exit1282: ; preds = %6070, %6072 + %.0.i1281 = phi float [ %6071, %6070 ], [ %6073, %6072 ], !dbg !260 + %6074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1283 = icmp eq i32 %6074, 0, !dbg !260 + br i1 %.not.i1283, label %6077, label %6075, !dbg !260 + +6075: ; preds = %__nv_exp2f.exit1282 + %6076 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5958) #3, !dbg !260 + br label %__nv_exp2f.exit1285, !dbg !260 + +6077: ; preds = %__nv_exp2f.exit1282 + %6078 = tail call float @llvm.nvvm.ex2.approx.f(float %5958) #3, !dbg !260 + br label %__nv_exp2f.exit1285, !dbg !260 + +__nv_exp2f.exit1285: ; preds = %6075, %6077 + %.0.i1284 = phi float [ %6076, %6075 ], [ %6078, %6077 ], !dbg !260 + %6079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1286 = icmp eq i32 %6079, 0, !dbg !260 + br i1 %.not.i1286, label %6082, label %6080, !dbg !260 + +6080: ; preds = %__nv_exp2f.exit1285 + %6081 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5959) #3, !dbg !260 + br label %__nv_exp2f.exit1288, !dbg !260 + +6082: ; preds = %__nv_exp2f.exit1285 + %6083 = tail call float @llvm.nvvm.ex2.approx.f(float %5959) #3, !dbg !260 + br label %__nv_exp2f.exit1288, !dbg !260 + +__nv_exp2f.exit1288: ; preds = %6080, %6082 + %.0.i1287 = phi float [ %6081, %6080 ], [ %6083, %6082 ], !dbg !260 + %6084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1289 = icmp eq i32 %6084, 0, !dbg !260 + br i1 %.not.i1289, label %6087, label %6085, !dbg !260 + +6085: ; preds = %__nv_exp2f.exit1288 + %6086 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5960) #3, !dbg !260 + br label %__nv_exp2f.exit1291, !dbg !260 + +6087: ; preds = %__nv_exp2f.exit1288 + %6088 = tail call float @llvm.nvvm.ex2.approx.f(float %5960) #3, !dbg !260 + br label %__nv_exp2f.exit1291, !dbg !260 + +__nv_exp2f.exit1291: ; preds = %6085, %6087 + %.0.i1290 = phi float [ %6086, %6085 ], [ %6088, %6087 ], !dbg !260 + %6089 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1292 = icmp eq i32 %6089, 0, !dbg !260 + br i1 %.not.i1292, label %6092, label %6090, !dbg !260 + +6090: ; preds = %__nv_exp2f.exit1291 + %6091 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5961) #3, !dbg !260 + br label %__nv_exp2f.exit1294, !dbg !260 + +6092: ; preds = %__nv_exp2f.exit1291 + %6093 = tail call float @llvm.nvvm.ex2.approx.f(float %5961) #3, !dbg !260 + br label %__nv_exp2f.exit1294, !dbg !260 + +__nv_exp2f.exit1294: ; preds = %6090, %6092 + %.0.i1293 = phi float [ %6091, %6090 ], [ %6093, %6092 ], !dbg !260 + %6094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1295 = icmp eq i32 %6094, 0, !dbg !260 + br i1 %.not.i1295, label %6097, label %6095, !dbg !260 + +6095: ; preds = %__nv_exp2f.exit1294 + %6096 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5962) #3, !dbg !260 + br label %__nv_exp2f.exit1297, !dbg !260 + +6097: ; preds = %__nv_exp2f.exit1294 + %6098 = tail call float @llvm.nvvm.ex2.approx.f(float %5962) #3, !dbg !260 + br label %__nv_exp2f.exit1297, !dbg !260 + +__nv_exp2f.exit1297: ; preds = %6095, %6097 + %.0.i1296 = phi float [ %6096, %6095 ], [ %6098, %6097 ], !dbg !260 + %6099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1298 = icmp eq i32 %6099, 0, !dbg !260 + br i1 %.not.i1298, label %6102, label %6100, !dbg !260 + +6100: ; preds = %__nv_exp2f.exit1297 + %6101 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5963) #3, !dbg !260 + br label %__nv_exp2f.exit1300, !dbg !260 + +6102: ; preds = %__nv_exp2f.exit1297 + %6103 = tail call float @llvm.nvvm.ex2.approx.f(float %5963) #3, !dbg !260 + br label %__nv_exp2f.exit1300, !dbg !260 + +__nv_exp2f.exit1300: ; preds = %6100, %6102 + %.0.i1299 = phi float [ %6101, %6100 ], [ %6103, %6102 ], !dbg !260 + %6104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1301 = icmp eq i32 %6104, 0, !dbg !260 + br i1 %.not.i1301, label %6107, label %6105, !dbg !260 + +6105: ; preds = %__nv_exp2f.exit1300 + %6106 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5964) #3, !dbg !260 + br label %__nv_exp2f.exit1303, !dbg !260 + +6107: ; preds = %__nv_exp2f.exit1300 + %6108 = tail call float @llvm.nvvm.ex2.approx.f(float %5964) #3, !dbg !260 + br label %__nv_exp2f.exit1303, !dbg !260 + +__nv_exp2f.exit1303: ; preds = %6105, %6107 + %.0.i1302 = phi float [ %6106, %6105 ], [ %6108, %6107 ], !dbg !260 + %6109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1304 = icmp eq i32 %6109, 0, !dbg !260 + br i1 %.not.i1304, label %6112, label %6110, !dbg !260 + +6110: ; preds = %__nv_exp2f.exit1303 + %6111 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5965) #3, !dbg !260 + br label %__nv_exp2f.exit1306, !dbg !260 + +6112: ; preds = %__nv_exp2f.exit1303 + %6113 = tail call float @llvm.nvvm.ex2.approx.f(float %5965) #3, !dbg !260 + br label %__nv_exp2f.exit1306, !dbg !260 + +__nv_exp2f.exit1306: ; preds = %6110, %6112 + %.0.i1305 = phi float [ %6111, %6110 ], [ %6113, %6112 ], !dbg !260 + %6114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1307 = icmp eq i32 %6114, 0, !dbg !260 + br i1 %.not.i1307, label %6117, label %6115, !dbg !260 + +6115: ; preds = %__nv_exp2f.exit1306 + %6116 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5966) #3, !dbg !260 + br label %__nv_exp2f.exit1309, !dbg !260 + +6117: ; preds = %__nv_exp2f.exit1306 + %6118 = tail call float @llvm.nvvm.ex2.approx.f(float %5966) #3, !dbg !260 + br label %__nv_exp2f.exit1309, !dbg !260 + +__nv_exp2f.exit1309: ; preds = %6115, %6117 + %.0.i1308 = phi float [ %6116, %6115 ], [ %6118, %6117 ], !dbg !260 + %6119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1310 = icmp eq i32 %6119, 0, !dbg !260 + br i1 %.not.i1310, label %6122, label %6120, !dbg !260 + +6120: ; preds = %__nv_exp2f.exit1309 + %6121 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5967) #3, !dbg !260 + br label %__nv_exp2f.exit1312, !dbg !260 + +6122: ; preds = %__nv_exp2f.exit1309 + %6123 = tail call float @llvm.nvvm.ex2.approx.f(float %5967) #3, !dbg !260 + br label %__nv_exp2f.exit1312, !dbg !260 + +__nv_exp2f.exit1312: ; preds = %6120, %6122 + %.0.i1311 = phi float [ %6121, %6120 ], [ %6123, %6122 ], !dbg !260 + %6124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !260 + %.not.i1313 = icmp eq i32 %6124, 0, !dbg !260 + br i1 %.not.i1313, label %6127, label %6125, !dbg !260 + +6125: ; preds = %__nv_exp2f.exit1312 + %6126 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %5968) #3, !dbg !260 + br label %__nv_exp2f.exit1315, !dbg !260 + +6127: ; preds = %__nv_exp2f.exit1312 + %6128 = tail call float @llvm.nvvm.ex2.approx.f(float %5968) #3, !dbg !260 + br label %__nv_exp2f.exit1315, !dbg !260 + +__nv_exp2f.exit1315: ; preds = %6125, %6127 + %.0.i1314 = phi float [ %6126, %6125 ], [ %6128, %6127 ], !dbg !260 + %6129 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5288, !dbg !244 + %6130 = insertelement <2 x float> poison, float %.0.i1221, i64 0, !dbg !261 + %6131 = insertelement <2 x float> %6130, float %.0.i1224, i64 1, !dbg !261 + %6132 = fptrunc <2 x float> %6131 to <2 x bfloat>, !dbg !261 + %6133 = insertelement <2 x float> poison, float %.0.i1227, i64 0, !dbg !261 + %6134 = insertelement <2 x float> %6133, float %.0.i1230, i64 1, !dbg !261 + %6135 = fptrunc <2 x float> %6134 to <2 x bfloat>, !dbg !261 + %6136 = insertelement <2 x float> poison, float %.0.i1233, i64 0, !dbg !261 + %6137 = insertelement <2 x float> %6136, float %.0.i1236, i64 1, !dbg !261 + %6138 = fptrunc <2 x float> %6137 to <2 x bfloat>, !dbg !261 + %6139 = insertelement <2 x float> poison, float %.0.i1239, i64 0, !dbg !261 + %6140 = insertelement <2 x float> %6139, float %.0.i1242, i64 1, !dbg !261 + %6141 = fptrunc <2 x float> %6140 to <2 x bfloat>, !dbg !261 + %6142 = insertelement <2 x float> poison, float %.0.i1245, i64 0, !dbg !261 + %6143 = insertelement <2 x float> %6142, float %.0.i1248, i64 1, !dbg !261 + %6144 = fptrunc <2 x float> %6143 to <2 x bfloat>, !dbg !261 + %6145 = insertelement <2 x float> poison, float %.0.i1251, i64 0, !dbg !261 + %6146 = insertelement <2 x float> %6145, float %.0.i1254, i64 1, !dbg !261 + %6147 = fptrunc <2 x float> %6146 to <2 x bfloat>, !dbg !261 + %6148 = insertelement <2 x float> poison, float %.0.i1257, i64 0, !dbg !261 + %6149 = insertelement <2 x float> %6148, float %.0.i1260, i64 1, !dbg !261 + %6150 = fptrunc <2 x float> %6149 to <2 x bfloat>, !dbg !261 + %6151 = insertelement <2 x float> poison, float %.0.i1263, i64 0, !dbg !261 + %6152 = insertelement <2 x float> %6151, float %.0.i1266, i64 1, !dbg !261 + %6153 = fptrunc <2 x float> %6152 to <2 x bfloat>, !dbg !261 + %6154 = insertelement <2 x float> poison, float %.0.i1269, i64 0, !dbg !261 + %6155 = insertelement <2 x float> %6154, float %.0.i1272, i64 1, !dbg !261 + %6156 = fptrunc <2 x float> %6155 to <2 x bfloat>, !dbg !261 + %6157 = insertelement <2 x float> poison, float %.0.i1275, i64 0, !dbg !261 + %6158 = insertelement <2 x float> %6157, float %.0.i1278, i64 1, !dbg !261 + %6159 = fptrunc <2 x float> %6158 to <2 x bfloat>, !dbg !261 + %6160 = insertelement <2 x float> poison, float %.0.i1281, i64 0, !dbg !261 + %6161 = insertelement <2 x float> %6160, float %.0.i1284, i64 1, !dbg !261 + %6162 = fptrunc <2 x float> %6161 to <2 x bfloat>, !dbg !261 + %6163 = insertelement <2 x float> poison, float %.0.i1287, i64 0, !dbg !261 + %6164 = insertelement <2 x float> %6163, float %.0.i1290, i64 1, !dbg !261 + %6165 = fptrunc <2 x float> %6164 to <2 x bfloat>, !dbg !261 + %6166 = insertelement <2 x float> poison, float %.0.i1293, i64 0, !dbg !261 + %6167 = insertelement <2 x float> %6166, float %.0.i1296, i64 1, !dbg !261 + %6168 = fptrunc <2 x float> %6167 to <2 x bfloat>, !dbg !261 + %6169 = insertelement <2 x float> poison, float %.0.i1299, i64 0, !dbg !261 + %6170 = insertelement <2 x float> %6169, float %.0.i1302, i64 1, !dbg !261 + %6171 = fptrunc <2 x float> %6170 to <2 x bfloat>, !dbg !261 + %6172 = insertelement <2 x float> poison, float %.0.i1305, i64 0, !dbg !261 + %6173 = insertelement <2 x float> %6172, float %.0.i1308, i64 1, !dbg !261 + %6174 = fptrunc <2 x float> %6173 to <2 x bfloat>, !dbg !261 + %6175 = insertelement <2 x float> poison, float %.0.i1311, i64 0, !dbg !261 + %6176 = insertelement <2 x float> %6175, float %.0.i1314, i64 1, !dbg !261 + %6177 = fptrunc <2 x float> %6176 to <2 x bfloat>, !dbg !261 + %6178 = bitcast <2 x bfloat> %6132 to i32, !dbg !262 + %6179 = bitcast <2 x bfloat> %6135 to i32, !dbg !262 + %6180 = bitcast <2 x bfloat> %6138 to i32, !dbg !262 + %6181 = bitcast <2 x bfloat> %6141 to i32, !dbg !262 + %6182 = bitcast <2 x bfloat> %6144 to i32, !dbg !262 + %6183 = bitcast <2 x bfloat> %6147 to i32, !dbg !262 + %6184 = bitcast <2 x bfloat> %6150 to i32, !dbg !262 + %6185 = bitcast <2 x bfloat> %6153 to i32, !dbg !262 + %6186 = bitcast <2 x bfloat> %6156 to i32, !dbg !262 + %6187 = bitcast <2 x bfloat> %6159 to i32, !dbg !262 + %6188 = bitcast <2 x bfloat> %6162 to i32, !dbg !262 + %6189 = bitcast <2 x bfloat> %6165 to i32, !dbg !262 + %6190 = bitcast <2 x bfloat> %6168 to i32, !dbg !262 + %6191 = bitcast <2 x bfloat> %6171 to i32, !dbg !262 + %6192 = bitcast <2 x bfloat> %6174 to i32, !dbg !262 + %6193 = bitcast <2 x bfloat> %6177 to i32, !dbg !262 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !262 + %6194 = ptrtoint ptr addrspace(3) %6129 to i32, !dbg !262 + %6195 = lshr exact i32 %6194, 4, !dbg !262 + %6196 = and i32 %6195, 16383, !dbg !262 + %6197 = zext nneg i32 %6196 to i64, !dbg !262 + %6198 = or disjoint i64 %6197, 4611686293338849280, !dbg !262 + %6199 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5150, float %5151, float %5152, float %5153, float %5154, float %5155, float %5156, float %5157, float %5158, float %5159, float %5160, float %5161, float %5162, float %5163, float %5164, float %5165, float %5166, float %5167, float %5168, float %5169, float %5170, float %5171, float %5172, float %5173, float %5174, float %5175, float %5176, float %5177, float %5178, float %5179, float %5180, float %5181, float %5182, float %5183, float %5184, float %5185, float %5186, float %5187, float %5188, float %5189, float %5190, float %5191, float %5192, float %5193, float %5194, float %5195, float %5196, float %5197, float %5198, float %5199, float %5200, float %5201, float %5202, float %5203, float %5204, float %5205, float %5206, float %5207, float %5208, float %5209, float %5210, float %5211, float %5212, float %5213, i32 %6178, i32 %6179, i32 %6180, i32 %6181, i64 %6198, i1 true) #3, !dbg !262 + %6200 = add i32 %6194, 2048, !dbg !262 + %6201 = lshr exact i32 %6200, 4, !dbg !262 + %6202 = and i32 %6201, 16383, !dbg !262 + %6203 = zext nneg i32 %6202 to i64, !dbg !262 + %6204 = or disjoint i64 %6203, 4611686293338849280, !dbg !262 + %6205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 0, !dbg !262 + %6206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 1, !dbg !262 + %6207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 2, !dbg !262 + %6208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 3, !dbg !262 + %6209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 4, !dbg !262 + %6210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 5, !dbg !262 + %6211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 6, !dbg !262 + %6212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 7, !dbg !262 + %6213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 8, !dbg !262 + %6214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 9, !dbg !262 + %6215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 10, !dbg !262 + %6216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 11, !dbg !262 + %6217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 12, !dbg !262 + %6218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 13, !dbg !262 + %6219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 14, !dbg !262 + %6220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 15, !dbg !262 + %6221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 16, !dbg !262 + %6222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 17, !dbg !262 + %6223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 18, !dbg !262 + %6224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 19, !dbg !262 + %6225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 20, !dbg !262 + %6226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 21, !dbg !262 + %6227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 22, !dbg !262 + %6228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 23, !dbg !262 + %6229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 24, !dbg !262 + %6230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 25, !dbg !262 + %6231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 26, !dbg !262 + %6232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 27, !dbg !262 + %6233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 28, !dbg !262 + %6234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 29, !dbg !262 + %6235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 30, !dbg !262 + %6236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 31, !dbg !262 + %6237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 32, !dbg !262 + %6238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 33, !dbg !262 + %6239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 34, !dbg !262 + %6240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 35, !dbg !262 + %6241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 36, !dbg !262 + %6242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 37, !dbg !262 + %6243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 38, !dbg !262 + %6244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 39, !dbg !262 + %6245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 40, !dbg !262 + %6246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 41, !dbg !262 + %6247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 42, !dbg !262 + %6248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 43, !dbg !262 + %6249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 44, !dbg !262 + %6250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 45, !dbg !262 + %6251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 46, !dbg !262 + %6252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 47, !dbg !262 + %6253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 48, !dbg !262 + %6254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 49, !dbg !262 + %6255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 50, !dbg !262 + %6256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 51, !dbg !262 + %6257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 52, !dbg !262 + %6258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 53, !dbg !262 + %6259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 54, !dbg !262 + %6260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 55, !dbg !262 + %6261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 56, !dbg !262 + %6262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 57, !dbg !262 + %6263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 58, !dbg !262 + %6264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 59, !dbg !262 + %6265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 60, !dbg !262 + %6266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 61, !dbg !262 + %6267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 62, !dbg !262 + %6268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6199, 63, !dbg !262 + %6269 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6205, float %6206, float %6207, float %6208, float %6209, float %6210, float %6211, float %6212, float %6213, float %6214, float %6215, float %6216, float %6217, float %6218, float %6219, float %6220, float %6221, float %6222, float %6223, float %6224, float %6225, float %6226, float %6227, float %6228, float %6229, float %6230, float %6231, float %6232, float %6233, float %6234, float %6235, float %6236, float %6237, float %6238, float %6239, float %6240, float %6241, float %6242, float %6243, float %6244, float %6245, float %6246, float %6247, float %6248, float %6249, float %6250, float %6251, float %6252, float %6253, float %6254, float %6255, float %6256, float %6257, float %6258, float %6259, float %6260, float %6261, float %6262, float %6263, float %6264, float %6265, float %6266, float %6267, float %6268, i32 %6182, i32 %6183, i32 %6184, i32 %6185, i64 %6204, i1 true) #3, !dbg !262 + %6270 = add i32 %6194, 4096, !dbg !262 + %6271 = lshr exact i32 %6270, 4, !dbg !262 + %6272 = and i32 %6271, 16383, !dbg !262 + %6273 = zext nneg i32 %6272 to i64, !dbg !262 + %6274 = or disjoint i64 %6273, 4611686293338849280, !dbg !262 + %6275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 0, !dbg !262 + %6276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 1, !dbg !262 + %6277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 2, !dbg !262 + %6278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 3, !dbg !262 + %6279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 4, !dbg !262 + %6280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 5, !dbg !262 + %6281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 6, !dbg !262 + %6282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 7, !dbg !262 + %6283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 8, !dbg !262 + %6284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 9, !dbg !262 + %6285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 10, !dbg !262 + %6286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 11, !dbg !262 + %6287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 12, !dbg !262 + %6288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 13, !dbg !262 + %6289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 14, !dbg !262 + %6290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 15, !dbg !262 + %6291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 16, !dbg !262 + %6292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 17, !dbg !262 + %6293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 18, !dbg !262 + %6294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 19, !dbg !262 + %6295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 20, !dbg !262 + %6296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 21, !dbg !262 + %6297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 22, !dbg !262 + %6298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 23, !dbg !262 + %6299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 24, !dbg !262 + %6300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 25, !dbg !262 + %6301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 26, !dbg !262 + %6302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 27, !dbg !262 + %6303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 28, !dbg !262 + %6304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 29, !dbg !262 + %6305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 30, !dbg !262 + %6306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 31, !dbg !262 + %6307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 32, !dbg !262 + %6308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 33, !dbg !262 + %6309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 34, !dbg !262 + %6310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 35, !dbg !262 + %6311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 36, !dbg !262 + %6312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 37, !dbg !262 + %6313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 38, !dbg !262 + %6314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 39, !dbg !262 + %6315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 40, !dbg !262 + %6316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 41, !dbg !262 + %6317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 42, !dbg !262 + %6318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 43, !dbg !262 + %6319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 44, !dbg !262 + %6320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 45, !dbg !262 + %6321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 46, !dbg !262 + %6322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 47, !dbg !262 + %6323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 48, !dbg !262 + %6324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 49, !dbg !262 + %6325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 50, !dbg !262 + %6326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 51, !dbg !262 + %6327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 52, !dbg !262 + %6328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 53, !dbg !262 + %6329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 54, !dbg !262 + %6330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 55, !dbg !262 + %6331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 56, !dbg !262 + %6332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 57, !dbg !262 + %6333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 58, !dbg !262 + %6334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 59, !dbg !262 + %6335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 60, !dbg !262 + %6336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 61, !dbg !262 + %6337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 62, !dbg !262 + %6338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6269, 63, !dbg !262 + %6339 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6275, float %6276, float %6277, float %6278, float %6279, float %6280, float %6281, float %6282, float %6283, float %6284, float %6285, float %6286, float %6287, float %6288, float %6289, float %6290, float %6291, float %6292, float %6293, float %6294, float %6295, float %6296, float %6297, float %6298, float %6299, float %6300, float %6301, float %6302, float %6303, float %6304, float %6305, float %6306, float %6307, float %6308, float %6309, float %6310, float %6311, float %6312, float %6313, float %6314, float %6315, float %6316, float %6317, float %6318, float %6319, float %6320, float %6321, float %6322, float %6323, float %6324, float %6325, float %6326, float %6327, float %6328, float %6329, float %6330, float %6331, float %6332, float %6333, float %6334, float %6335, float %6336, float %6337, float %6338, i32 %6186, i32 %6187, i32 %6188, i32 %6189, i64 %6274, i1 true) #3, !dbg !262 + %6340 = add i32 %6194, 6144, !dbg !262 + %6341 = lshr exact i32 %6340, 4, !dbg !262 + %6342 = and i32 %6341, 16383, !dbg !262 + %6343 = zext nneg i32 %6342 to i64, !dbg !262 + %6344 = or disjoint i64 %6343, 4611686293338849280, !dbg !262 + %6345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 0, !dbg !262 + %6346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 1, !dbg !262 + %6347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 2, !dbg !262 + %6348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 3, !dbg !262 + %6349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 4, !dbg !262 + %6350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 5, !dbg !262 + %6351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 6, !dbg !262 + %6352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 7, !dbg !262 + %6353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 8, !dbg !262 + %6354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 9, !dbg !262 + %6355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 10, !dbg !262 + %6356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 11, !dbg !262 + %6357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 12, !dbg !262 + %6358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 13, !dbg !262 + %6359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 14, !dbg !262 + %6360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 15, !dbg !262 + %6361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 16, !dbg !262 + %6362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 17, !dbg !262 + %6363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 18, !dbg !262 + %6364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 19, !dbg !262 + %6365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 20, !dbg !262 + %6366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 21, !dbg !262 + %6367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 22, !dbg !262 + %6368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 23, !dbg !262 + %6369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 24, !dbg !262 + %6370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 25, !dbg !262 + %6371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 26, !dbg !262 + %6372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 27, !dbg !262 + %6373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 28, !dbg !262 + %6374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 29, !dbg !262 + %6375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 30, !dbg !262 + %6376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 31, !dbg !262 + %6377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 32, !dbg !262 + %6378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 33, !dbg !262 + %6379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 34, !dbg !262 + %6380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 35, !dbg !262 + %6381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 36, !dbg !262 + %6382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 37, !dbg !262 + %6383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 38, !dbg !262 + %6384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 39, !dbg !262 + %6385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 40, !dbg !262 + %6386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 41, !dbg !262 + %6387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 42, !dbg !262 + %6388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 43, !dbg !262 + %6389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 44, !dbg !262 + %6390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 45, !dbg !262 + %6391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 46, !dbg !262 + %6392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 47, !dbg !262 + %6393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 48, !dbg !262 + %6394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 49, !dbg !262 + %6395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 50, !dbg !262 + %6396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 51, !dbg !262 + %6397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 52, !dbg !262 + %6398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 53, !dbg !262 + %6399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 54, !dbg !262 + %6400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 55, !dbg !262 + %6401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 56, !dbg !262 + %6402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 57, !dbg !262 + %6403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 58, !dbg !262 + %6404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 59, !dbg !262 + %6405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 60, !dbg !262 + %6406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 61, !dbg !262 + %6407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 62, !dbg !262 + %6408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6339, 63, !dbg !262 + %6409 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6345, float %6346, float %6347, float %6348, float %6349, float %6350, float %6351, float %6352, float %6353, float %6354, float %6355, float %6356, float %6357, float %6358, float %6359, float %6360, float %6361, float %6362, float %6363, float %6364, float %6365, float %6366, float %6367, float %6368, float %6369, float %6370, float %6371, float %6372, float %6373, float %6374, float %6375, float %6376, float %6377, float %6378, float %6379, float %6380, float %6381, float %6382, float %6383, float %6384, float %6385, float %6386, float %6387, float %6388, float %6389, float %6390, float %6391, float %6392, float %6393, float %6394, float %6395, float %6396, float %6397, float %6398, float %6399, float %6400, float %6401, float %6402, float %6403, float %6404, float %6405, float %6406, float %6407, float %6408, i32 %6190, i32 %6191, i32 %6192, i32 %6193, i64 %6344, i1 true) #3, !dbg !262 + %6410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 0, !dbg !262 + %6411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 1, !dbg !262 + %6412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 2, !dbg !262 + %6413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 3, !dbg !262 + %6414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 4, !dbg !262 + %6415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 5, !dbg !262 + %6416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 6, !dbg !262 + %6417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 7, !dbg !262 + %6418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 8, !dbg !262 + %6419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 9, !dbg !262 + %6420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 10, !dbg !262 + %6421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 11, !dbg !262 + %6422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 12, !dbg !262 + %6423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 13, !dbg !262 + %6424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 14, !dbg !262 + %6425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 15, !dbg !262 + %6426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 16, !dbg !262 + %6427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 17, !dbg !262 + %6428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 18, !dbg !262 + %6429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 19, !dbg !262 + %6430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 20, !dbg !262 + %6431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 21, !dbg !262 + %6432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 22, !dbg !262 + %6433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 23, !dbg !262 + %6434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 24, !dbg !262 + %6435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 25, !dbg !262 + %6436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 26, !dbg !262 + %6437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 27, !dbg !262 + %6438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 28, !dbg !262 + %6439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 29, !dbg !262 + %6440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 30, !dbg !262 + %6441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 31, !dbg !262 + %6442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 32, !dbg !262 + %6443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 33, !dbg !262 + %6444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 34, !dbg !262 + %6445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 35, !dbg !262 + %6446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 36, !dbg !262 + %6447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 37, !dbg !262 + %6448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 38, !dbg !262 + %6449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 39, !dbg !262 + %6450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 40, !dbg !262 + %6451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 41, !dbg !262 + %6452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 42, !dbg !262 + %6453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 43, !dbg !262 + %6454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 44, !dbg !262 + %6455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 45, !dbg !262 + %6456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 46, !dbg !262 + %6457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 47, !dbg !262 + %6458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 48, !dbg !262 + %6459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 49, !dbg !262 + %6460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 50, !dbg !262 + %6461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 51, !dbg !262 + %6462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 52, !dbg !262 + %6463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 53, !dbg !262 + %6464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 54, !dbg !262 + %6465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 55, !dbg !262 + %6466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 56, !dbg !262 + %6467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 57, !dbg !262 + %6468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 58, !dbg !262 + %6469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 59, !dbg !262 + %6470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 60, !dbg !262 + %6471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 61, !dbg !262 + %6472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 62, !dbg !262 + %6473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6409, 63, !dbg !262 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !262 + %6474 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %5290, !dbg !248 + %6475 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4812, !dbg !248 + %6476 = load float, ptr addrspace(3) %6475, align 8, !dbg !248 + %6477 = getelementptr inbounds nuw i8, ptr addrspace(3) %6475, i32 4, !dbg !248 + %6478 = load float, ptr addrspace(3) %6477, align 4, !dbg !248 + %6479 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4815, !dbg !248 + %6480 = load float, ptr addrspace(3) %6479, align 8, !dbg !248 + %6481 = getelementptr inbounds nuw i8, ptr addrspace(3) %6479, i32 4, !dbg !248 + %6482 = load float, ptr addrspace(3) %6481, align 4, !dbg !248 + %6483 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4817, !dbg !248 + %6484 = load float, ptr addrspace(3) %6483, align 8, !dbg !248 + %6485 = getelementptr inbounds nuw i8, ptr addrspace(3) %6483, i32 4, !dbg !248 + %6486 = load float, ptr addrspace(3) %6485, align 4, !dbg !248 + %6487 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4819, !dbg !248 + %6488 = load float, ptr addrspace(3) %6487, align 8, !dbg !248 + %6489 = getelementptr inbounds nuw i8, ptr addrspace(3) %6487, i32 4, !dbg !248 + %6490 = load float, ptr addrspace(3) %6489, align 4, !dbg !248 + %6491 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4821, !dbg !248 + %6492 = load float, ptr addrspace(3) %6491, align 8, !dbg !248 + %6493 = getelementptr inbounds nuw i8, ptr addrspace(3) %6491, i32 4, !dbg !248 + %6494 = load float, ptr addrspace(3) %6493, align 4, !dbg !248 + %6495 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4823, !dbg !248 + %6496 = load float, ptr addrspace(3) %6495, align 8, !dbg !248 + %6497 = getelementptr inbounds nuw i8, ptr addrspace(3) %6495, i32 4, !dbg !248 + %6498 = load float, ptr addrspace(3) %6497, align 4, !dbg !248 + %6499 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4825, !dbg !248 + %6500 = load float, ptr addrspace(3) %6499, align 8, !dbg !248 + %6501 = getelementptr inbounds nuw i8, ptr addrspace(3) %6499, i32 4, !dbg !248 + %6502 = load float, ptr addrspace(3) %6501, align 4, !dbg !248 + %6503 = getelementptr inbounds nuw i8, ptr addrspace(3) %6474, i32 %4827, !dbg !248 + %6504 = load float, ptr addrspace(3) %6503, align 8, !dbg !248 + %6505 = getelementptr inbounds nuw i8, ptr addrspace(3) %6503, i32 4, !dbg !248 + %6506 = load float, ptr addrspace(3) %6505, align 4, !dbg !248 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !263 + %6507 = add i32 %5358, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6508 = lshr exact i32 %6507, 4, !dbg !263 + %6509 = and i32 %6508, 16383, !dbg !263 + %6510 = zext nneg i32 %6509 to i64, !dbg !263 + %6511 = or disjoint i64 %6510, 4611686293372403712, !dbg !263 + %6512 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %6511, i64 %6198) #3, !dbg !263 + %6513 = add i32 %5370, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6514 = lshr exact i32 %6513, 4, !dbg !263 + %6515 = and i32 %6514, 16383, !dbg !263 + %6516 = zext nneg i32 %6515 to i64, !dbg !263 + %6517 = or disjoint i64 %6516, 4611686293372403712, !dbg !263 + %6518 = add i32 %6194, 32, !dbg !263 + %6519 = lshr exact i32 %6518, 4, !dbg !263 + %6520 = and i32 %6519, 16383, !dbg !263 + %6521 = zext nneg i32 %6520 to i64, !dbg !263 + %6522 = or disjoint i64 %6521, 4611686293338849280, !dbg !263 + %6523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 0, !dbg !263 + %6524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 1, !dbg !263 + %6525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 2, !dbg !263 + %6526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 3, !dbg !263 + %6527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 4, !dbg !263 + %6528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 5, !dbg !263 + %6529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 6, !dbg !263 + %6530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 7, !dbg !263 + %6531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 8, !dbg !263 + %6532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 9, !dbg !263 + %6533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 10, !dbg !263 + %6534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 11, !dbg !263 + %6535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 12, !dbg !263 + %6536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 13, !dbg !263 + %6537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 14, !dbg !263 + %6538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 15, !dbg !263 + %6539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 16, !dbg !263 + %6540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 17, !dbg !263 + %6541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 18, !dbg !263 + %6542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 19, !dbg !263 + %6543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 20, !dbg !263 + %6544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 21, !dbg !263 + %6545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 22, !dbg !263 + %6546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 23, !dbg !263 + %6547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 24, !dbg !263 + %6548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 25, !dbg !263 + %6549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 26, !dbg !263 + %6550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 27, !dbg !263 + %6551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 28, !dbg !263 + %6552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 29, !dbg !263 + %6553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 30, !dbg !263 + %6554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6512, 31, !dbg !263 + %6555 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6523, float %6524, float %6525, float %6526, float %6527, float %6528, float %6529, float %6530, float %6531, float %6532, float %6533, float %6534, float %6535, float %6536, float %6537, float %6538, float %6539, float %6540, float %6541, float %6542, float %6543, float %6544, float %6545, float %6546, float %6547, float %6548, float %6549, float %6550, float %6551, float %6552, float %6553, float %6554, i64 %6517, i64 %6522, i1 true) #3, !dbg !263 + %6556 = add i32 %5414, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6557 = lshr exact i32 %6556, 4, !dbg !263 + %6558 = and i32 %6557, 16383, !dbg !263 + %6559 = zext nneg i32 %6558 to i64, !dbg !263 + %6560 = or disjoint i64 %6559, 4611686293372403712, !dbg !263 + %6561 = add i32 %6194, 64, !dbg !263 + %6562 = lshr exact i32 %6561, 4, !dbg !263 + %6563 = and i32 %6562, 16383, !dbg !263 + %6564 = zext nneg i32 %6563 to i64, !dbg !263 + %6565 = or disjoint i64 %6564, 4611686293338849280, !dbg !263 + %6566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 0, !dbg !263 + %6567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 1, !dbg !263 + %6568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 2, !dbg !263 + %6569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 3, !dbg !263 + %6570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 4, !dbg !263 + %6571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 5, !dbg !263 + %6572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 6, !dbg !263 + %6573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 7, !dbg !263 + %6574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 8, !dbg !263 + %6575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 9, !dbg !263 + %6576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 10, !dbg !263 + %6577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 11, !dbg !263 + %6578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 12, !dbg !263 + %6579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 13, !dbg !263 + %6580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 14, !dbg !263 + %6581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 15, !dbg !263 + %6582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 16, !dbg !263 + %6583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 17, !dbg !263 + %6584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 18, !dbg !263 + %6585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 19, !dbg !263 + %6586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 20, !dbg !263 + %6587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 21, !dbg !263 + %6588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 22, !dbg !263 + %6589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 23, !dbg !263 + %6590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 24, !dbg !263 + %6591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 25, !dbg !263 + %6592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 26, !dbg !263 + %6593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 27, !dbg !263 + %6594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 28, !dbg !263 + %6595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 29, !dbg !263 + %6596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 30, !dbg !263 + %6597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6555, 31, !dbg !263 + %6598 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6566, float %6567, float %6568, float %6569, float %6570, float %6571, float %6572, float %6573, float %6574, float %6575, float %6576, float %6577, float %6578, float %6579, float %6580, float %6581, float %6582, float %6583, float %6584, float %6585, float %6586, float %6587, float %6588, float %6589, float %6590, float %6591, float %6592, float %6593, float %6594, float %6595, float %6596, float %6597, i64 %6560, i64 %6565, i1 true) #3, !dbg !263 + %6599 = add i32 %5458, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6600 = lshr exact i32 %6599, 4, !dbg !263 + %6601 = and i32 %6600, 16383, !dbg !263 + %6602 = zext nneg i32 %6601 to i64, !dbg !263 + %6603 = or disjoint i64 %6602, 4611686293372403712, !dbg !263 + %6604 = add i32 %6194, 96, !dbg !263 + %6605 = lshr exact i32 %6604, 4, !dbg !263 + %6606 = and i32 %6605, 16383, !dbg !263 + %6607 = zext nneg i32 %6606 to i64, !dbg !263 + %6608 = or disjoint i64 %6607, 4611686293338849280, !dbg !263 + %6609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 0, !dbg !263 + %6610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 1, !dbg !263 + %6611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 2, !dbg !263 + %6612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 3, !dbg !263 + %6613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 4, !dbg !263 + %6614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 5, !dbg !263 + %6615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 6, !dbg !263 + %6616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 7, !dbg !263 + %6617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 8, !dbg !263 + %6618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 9, !dbg !263 + %6619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 10, !dbg !263 + %6620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 11, !dbg !263 + %6621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 12, !dbg !263 + %6622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 13, !dbg !263 + %6623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 14, !dbg !263 + %6624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 15, !dbg !263 + %6625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 16, !dbg !263 + %6626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 17, !dbg !263 + %6627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 18, !dbg !263 + %6628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 19, !dbg !263 + %6629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 20, !dbg !263 + %6630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 21, !dbg !263 + %6631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 22, !dbg !263 + %6632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 23, !dbg !263 + %6633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 24, !dbg !263 + %6634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 25, !dbg !263 + %6635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 26, !dbg !263 + %6636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 27, !dbg !263 + %6637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 28, !dbg !263 + %6638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 29, !dbg !263 + %6639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 30, !dbg !263 + %6640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6598, 31, !dbg !263 + %6641 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6609, float %6610, float %6611, float %6612, float %6613, float %6614, float %6615, float %6616, float %6617, float %6618, float %6619, float %6620, float %6621, float %6622, float %6623, float %6624, float %6625, float %6626, float %6627, float %6628, float %6629, float %6630, float %6631, float %6632, float %6633, float %6634, float %6635, float %6636, float %6637, float %6638, float %6639, float %6640, i64 %6603, i64 %6608, i1 true) #3, !dbg !263 + %6642 = add i32 %5502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6643 = lshr exact i32 %6642, 4, !dbg !263 + %6644 = and i32 %6643, 16383, !dbg !263 + %6645 = zext nneg i32 %6644 to i64, !dbg !263 + %6646 = or disjoint i64 %6645, 4611686293372403712, !dbg !263 + %6647 = add i32 %6194, 8192, !dbg !263 + %6648 = lshr exact i32 %6647, 4, !dbg !263 + %6649 = and i32 %6648, 16383, !dbg !263 + %6650 = zext nneg i32 %6649 to i64, !dbg !263 + %6651 = or disjoint i64 %6650, 4611686293338849280, !dbg !263 + %6652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 0, !dbg !263 + %6653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 1, !dbg !263 + %6654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 2, !dbg !263 + %6655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 3, !dbg !263 + %6656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 4, !dbg !263 + %6657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 5, !dbg !263 + %6658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 6, !dbg !263 + %6659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 7, !dbg !263 + %6660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 8, !dbg !263 + %6661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 9, !dbg !263 + %6662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 10, !dbg !263 + %6663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 11, !dbg !263 + %6664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 12, !dbg !263 + %6665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 13, !dbg !263 + %6666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 14, !dbg !263 + %6667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 15, !dbg !263 + %6668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 16, !dbg !263 + %6669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 17, !dbg !263 + %6670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 18, !dbg !263 + %6671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 19, !dbg !263 + %6672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 20, !dbg !263 + %6673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 21, !dbg !263 + %6674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 22, !dbg !263 + %6675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 23, !dbg !263 + %6676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 24, !dbg !263 + %6677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 25, !dbg !263 + %6678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 26, !dbg !263 + %6679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 27, !dbg !263 + %6680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 28, !dbg !263 + %6681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 29, !dbg !263 + %6682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 30, !dbg !263 + %6683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6641, 31, !dbg !263 + %6684 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6652, float %6653, float %6654, float %6655, float %6656, float %6657, float %6658, float %6659, float %6660, float %6661, float %6662, float %6663, float %6664, float %6665, float %6666, float %6667, float %6668, float %6669, float %6670, float %6671, float %6672, float %6673, float %6674, float %6675, float %6676, float %6677, float %6678, float %6679, float %6680, float %6681, float %6682, float %6683, i64 %6646, i64 %6651, i1 true) #3, !dbg !263 + %6685 = add i32 %5546, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6686 = lshr exact i32 %6685, 4, !dbg !263 + %6687 = and i32 %6686, 16383, !dbg !263 + %6688 = zext nneg i32 %6687 to i64, !dbg !263 + %6689 = or disjoint i64 %6688, 4611686293372403712, !dbg !263 + %6690 = add i32 %6194, 8224, !dbg !263 + %6691 = lshr exact i32 %6690, 4, !dbg !263 + %6692 = and i32 %6691, 16383, !dbg !263 + %6693 = zext nneg i32 %6692 to i64, !dbg !263 + %6694 = or disjoint i64 %6693, 4611686293338849280, !dbg !263 + %6695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 0, !dbg !263 + %6696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 1, !dbg !263 + %6697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 2, !dbg !263 + %6698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 3, !dbg !263 + %6699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 4, !dbg !263 + %6700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 5, !dbg !263 + %6701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 6, !dbg !263 + %6702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 7, !dbg !263 + %6703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 8, !dbg !263 + %6704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 9, !dbg !263 + %6705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 10, !dbg !263 + %6706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 11, !dbg !263 + %6707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 12, !dbg !263 + %6708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 13, !dbg !263 + %6709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 14, !dbg !263 + %6710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 15, !dbg !263 + %6711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 16, !dbg !263 + %6712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 17, !dbg !263 + %6713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 18, !dbg !263 + %6714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 19, !dbg !263 + %6715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 20, !dbg !263 + %6716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 21, !dbg !263 + %6717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 22, !dbg !263 + %6718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 23, !dbg !263 + %6719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 24, !dbg !263 + %6720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 25, !dbg !263 + %6721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 26, !dbg !263 + %6722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 27, !dbg !263 + %6723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 28, !dbg !263 + %6724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 29, !dbg !263 + %6725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 30, !dbg !263 + %6726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6684, 31, !dbg !263 + %6727 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6695, float %6696, float %6697, float %6698, float %6699, float %6700, float %6701, float %6702, float %6703, float %6704, float %6705, float %6706, float %6707, float %6708, float %6709, float %6710, float %6711, float %6712, float %6713, float %6714, float %6715, float %6716, float %6717, float %6718, float %6719, float %6720, float %6721, float %6722, float %6723, float %6724, float %6725, float %6726, i64 %6689, i64 %6694, i1 true) #3, !dbg !263 + %6728 = add i32 %5590, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6729 = lshr exact i32 %6728, 4, !dbg !263 + %6730 = and i32 %6729, 16383, !dbg !263 + %6731 = zext nneg i32 %6730 to i64, !dbg !263 + %6732 = or disjoint i64 %6731, 4611686293372403712, !dbg !263 + %6733 = add i32 %6194, 8256, !dbg !263 + %6734 = lshr exact i32 %6733, 4, !dbg !263 + %6735 = and i32 %6734, 16383, !dbg !263 + %6736 = zext nneg i32 %6735 to i64, !dbg !263 + %6737 = or disjoint i64 %6736, 4611686293338849280, !dbg !263 + %6738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 0, !dbg !263 + %6739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 1, !dbg !263 + %6740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 2, !dbg !263 + %6741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 3, !dbg !263 + %6742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 4, !dbg !263 + %6743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 5, !dbg !263 + %6744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 6, !dbg !263 + %6745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 7, !dbg !263 + %6746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 8, !dbg !263 + %6747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 9, !dbg !263 + %6748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 10, !dbg !263 + %6749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 11, !dbg !263 + %6750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 12, !dbg !263 + %6751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 13, !dbg !263 + %6752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 14, !dbg !263 + %6753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 15, !dbg !263 + %6754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 16, !dbg !263 + %6755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 17, !dbg !263 + %6756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 18, !dbg !263 + %6757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 19, !dbg !263 + %6758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 20, !dbg !263 + %6759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 21, !dbg !263 + %6760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 22, !dbg !263 + %6761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 23, !dbg !263 + %6762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 24, !dbg !263 + %6763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 25, !dbg !263 + %6764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 26, !dbg !263 + %6765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 27, !dbg !263 + %6766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 28, !dbg !263 + %6767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 29, !dbg !263 + %6768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 30, !dbg !263 + %6769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6727, 31, !dbg !263 + %6770 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6738, float %6739, float %6740, float %6741, float %6742, float %6743, float %6744, float %6745, float %6746, float %6747, float %6748, float %6749, float %6750, float %6751, float %6752, float %6753, float %6754, float %6755, float %6756, float %6757, float %6758, float %6759, float %6760, float %6761, float %6762, float %6763, float %6764, float %6765, float %6766, float %6767, float %6768, float %6769, i64 %6732, i64 %6737, i1 true) #3, !dbg !263 + %6771 = add i32 %5634, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !263 + %6772 = lshr exact i32 %6771, 4, !dbg !263 + %6773 = and i32 %6772, 16383, !dbg !263 + %6774 = zext nneg i32 %6773 to i64, !dbg !263 + %6775 = or disjoint i64 %6774, 4611686293372403712, !dbg !263 + %6776 = add i32 %6194, 8288, !dbg !263 + %6777 = lshr exact i32 %6776, 4, !dbg !263 + %6778 = and i32 %6777, 16383, !dbg !263 + %6779 = zext nneg i32 %6778 to i64, !dbg !263 + %6780 = or disjoint i64 %6779, 4611686293338849280, !dbg !263 + %6781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 0, !dbg !263 + %6782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 1, !dbg !263 + %6783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 2, !dbg !263 + %6784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 3, !dbg !263 + %6785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 4, !dbg !263 + %6786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 5, !dbg !263 + %6787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 6, !dbg !263 + %6788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 7, !dbg !263 + %6789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 8, !dbg !263 + %6790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 9, !dbg !263 + %6791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 10, !dbg !263 + %6792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 11, !dbg !263 + %6793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 12, !dbg !263 + %6794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 13, !dbg !263 + %6795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 14, !dbg !263 + %6796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 15, !dbg !263 + %6797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 16, !dbg !263 + %6798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 17, !dbg !263 + %6799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 18, !dbg !263 + %6800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 19, !dbg !263 + %6801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 20, !dbg !263 + %6802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 21, !dbg !263 + %6803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 22, !dbg !263 + %6804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 23, !dbg !263 + %6805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 24, !dbg !263 + %6806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 25, !dbg !263 + %6807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 26, !dbg !263 + %6808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 27, !dbg !263 + %6809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 28, !dbg !263 + %6810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 29, !dbg !263 + %6811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 30, !dbg !263 + %6812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6770, 31, !dbg !263 + %6813 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6781, float %6782, float %6783, float %6784, float %6785, float %6786, float %6787, float %6788, float %6789, float %6790, float %6791, float %6792, float %6793, float %6794, float %6795, float %6796, float %6797, float %6798, float %6799, float %6800, float %6801, float %6802, float %6803, float %6804, float %6805, float %6806, float %6807, float %6808, float %6809, float %6810, float %6811, float %6812, i64 %6775, i64 %6780, i1 true) #3, !dbg !263 + %6814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 0, !dbg !263 + %6815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 1, !dbg !263 + %6816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 2, !dbg !263 + %6817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 3, !dbg !263 + %6818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 4, !dbg !263 + %6819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 5, !dbg !263 + %6820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 6, !dbg !263 + %6821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 7, !dbg !263 + %6822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 8, !dbg !263 + %6823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 9, !dbg !263 + %6824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 10, !dbg !263 + %6825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 11, !dbg !263 + %6826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 12, !dbg !263 + %6827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 13, !dbg !263 + %6828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 14, !dbg !263 + %6829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 15, !dbg !263 + %6830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 16, !dbg !263 + %6831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 17, !dbg !263 + %6832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 18, !dbg !263 + %6833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 19, !dbg !263 + %6834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 20, !dbg !263 + %6835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 21, !dbg !263 + %6836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 22, !dbg !263 + %6837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 23, !dbg !263 + %6838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 24, !dbg !263 + %6839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 25, !dbg !263 + %6840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 26, !dbg !263 + %6841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 27, !dbg !263 + %6842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 28, !dbg !263 + %6843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 29, !dbg !263 + %6844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 30, !dbg !263 + %6845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6813, 31, !dbg !263 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !263 + %6846 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %6814, float %6815, float %6816, float %6817, float %6818, float %6819, float %6820, float %6821, float %6822, float %6823, float %6824, float %6825, float %6826, float %6827, float %6828, float %6829, float %6830, float %6831, float %6832, float %6833, float %6834, float %6835, float %6836, float %6837, float %6838, float %6839, float %6840, float %6841, float %6842, float %6843, float %6844, float %6845, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %6129, i32 0, i32 0) #3, !dbg !263 + %6847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 0, !dbg !263 + %6848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 1, !dbg !263 + %6849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 2, !dbg !263 + %6850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 3, !dbg !263 + %6851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 4, !dbg !263 + %6852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 5, !dbg !263 + %6853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 6, !dbg !263 + %6854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 7, !dbg !263 + %6855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 8, !dbg !263 + %6856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 9, !dbg !263 + %6857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 10, !dbg !263 + %6858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 11, !dbg !263 + %6859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 12, !dbg !263 + %6860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 13, !dbg !263 + %6861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 14, !dbg !263 + %6862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 15, !dbg !263 + %6863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 16, !dbg !263 + %6864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 17, !dbg !263 + %6865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 18, !dbg !263 + %6866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 19, !dbg !263 + %6867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 20, !dbg !263 + %6868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 21, !dbg !263 + %6869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 22, !dbg !263 + %6870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 23, !dbg !263 + %6871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 24, !dbg !263 + %6872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 25, !dbg !263 + %6873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 26, !dbg !263 + %6874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 27, !dbg !263 + %6875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 28, !dbg !263 + %6876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 29, !dbg !263 + %6877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 30, !dbg !263 + %6878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6846, 31, !dbg !263 + %6879 = fsub float %6847, %6476, !dbg !264 + %6880 = fsub float %6848, %6478, !dbg !264 + %6881 = fsub float %6849, %6476, !dbg !264 + %6882 = fsub float %6850, %6478, !dbg !264 + %6883 = fsub float %6851, %6480, !dbg !264 + %6884 = fsub float %6852, %6482, !dbg !264 + %6885 = fsub float %6853, %6480, !dbg !264 + %6886 = fsub float %6854, %6482, !dbg !264 + %6887 = fsub float %6855, %6484, !dbg !264 + %6888 = fsub float %6856, %6486, !dbg !264 + %6889 = fsub float %6857, %6484, !dbg !264 + %6890 = fsub float %6858, %6486, !dbg !264 + %6891 = fsub float %6859, %6488, !dbg !264 + %6892 = fsub float %6860, %6490, !dbg !264 + %6893 = fsub float %6861, %6488, !dbg !264 + %6894 = fsub float %6862, %6490, !dbg !264 + %6895 = fsub float %6863, %6492, !dbg !264 + %6896 = fsub float %6864, %6494, !dbg !264 + %6897 = fsub float %6865, %6492, !dbg !264 + %6898 = fsub float %6866, %6494, !dbg !264 + %6899 = fsub float %6867, %6496, !dbg !264 + %6900 = fsub float %6868, %6498, !dbg !264 + %6901 = fsub float %6869, %6496, !dbg !264 + %6902 = fsub float %6870, %6498, !dbg !264 + %6903 = fsub float %6871, %6500, !dbg !264 + %6904 = fsub float %6872, %6502, !dbg !264 + %6905 = fsub float %6873, %6500, !dbg !264 + %6906 = fsub float %6874, %6502, !dbg !264 + %6907 = fsub float %6875, %6504, !dbg !264 + %6908 = fsub float %6876, %6506, !dbg !264 + %6909 = fsub float %6877, %6504, !dbg !264 + %6910 = fsub float %6878, %6506, !dbg !264 + %6911 = fmul float %.0.i1221, %6879, !dbg !265 + %6912 = fmul float %.0.i1224, %6880, !dbg !265 + %6913 = fmul float %.0.i1227, %6881, !dbg !265 + %6914 = fmul float %.0.i1230, %6882, !dbg !265 + %6915 = fmul float %.0.i1233, %6883, !dbg !265 + %6916 = fmul float %.0.i1236, %6884, !dbg !265 + %6917 = fmul float %.0.i1239, %6885, !dbg !265 + %6918 = fmul float %.0.i1242, %6886, !dbg !265 + %6919 = fmul float %.0.i1245, %6887, !dbg !265 + %6920 = fmul float %.0.i1248, %6888, !dbg !265 + %6921 = fmul float %.0.i1251, %6889, !dbg !265 + %6922 = fmul float %.0.i1254, %6890, !dbg !265 + %6923 = fmul float %.0.i1257, %6891, !dbg !265 + %6924 = fmul float %.0.i1260, %6892, !dbg !265 + %6925 = fmul float %.0.i1263, %6893, !dbg !265 + %6926 = fmul float %.0.i1266, %6894, !dbg !265 + %6927 = fmul float %.0.i1269, %6895, !dbg !265 + %6928 = fmul float %.0.i1272, %6896, !dbg !265 + %6929 = fmul float %.0.i1275, %6897, !dbg !265 + %6930 = fmul float %.0.i1278, %6898, !dbg !265 + %6931 = fmul float %.0.i1281, %6899, !dbg !265 + %6932 = fmul float %.0.i1284, %6900, !dbg !265 + %6933 = fmul float %.0.i1287, %6901, !dbg !265 + %6934 = fmul float %.0.i1290, %6902, !dbg !265 + %6935 = fmul float %.0.i1293, %6903, !dbg !265 + %6936 = fmul float %.0.i1296, %6904, !dbg !265 + %6937 = fmul float %.0.i1299, %6905, !dbg !265 + %6938 = fmul float %.0.i1302, %6906, !dbg !265 + %6939 = fmul float %.0.i1305, %6907, !dbg !265 + %6940 = fmul float %.0.i1308, %6908, !dbg !265 + %6941 = fmul float %.0.i1311, %6909, !dbg !265 + %6942 = fmul float %.0.i1314, %6910, !dbg !265 + %6943 = fptrunc float %6911 to bfloat, !dbg !266 + %6944 = select i1 %5826, bfloat %6943, bfloat 0xR0000, !dbg !267 + %6945 = fptrunc float %6912 to bfloat, !dbg !266 + %6946 = select i1 %5828, bfloat %6945, bfloat 0xR0000, !dbg !267 + %6947 = fptrunc float %6913 to bfloat, !dbg !266 + %6948 = select i1 %5829, bfloat %6947, bfloat 0xR0000, !dbg !267 + %6949 = fptrunc float %6914 to bfloat, !dbg !266 + %6950 = select i1 %5830, bfloat %6949, bfloat 0xR0000, !dbg !267 + %6951 = fptrunc float %6915 to bfloat, !dbg !266 + %6952 = select i1 %5832, bfloat %6951, bfloat 0xR0000, !dbg !267 + %6953 = fptrunc float %6916 to bfloat, !dbg !266 + %6954 = select i1 %5834, bfloat %6953, bfloat 0xR0000, !dbg !267 + %6955 = fptrunc float %6917 to bfloat, !dbg !266 + %6956 = select i1 %5835, bfloat %6955, bfloat 0xR0000, !dbg !267 + %6957 = fptrunc float %6918 to bfloat, !dbg !266 + %6958 = select i1 %5836, bfloat %6957, bfloat 0xR0000, !dbg !267 + %6959 = fptrunc float %6919 to bfloat, !dbg !266 + %6960 = select i1 %5838, bfloat %6959, bfloat 0xR0000, !dbg !267 + %6961 = fptrunc float %6920 to bfloat, !dbg !266 + %6962 = select i1 %5840, bfloat %6961, bfloat 0xR0000, !dbg !267 + %6963 = fptrunc float %6921 to bfloat, !dbg !266 + %6964 = select i1 %5841, bfloat %6963, bfloat 0xR0000, !dbg !267 + %6965 = fptrunc float %6922 to bfloat, !dbg !266 + %6966 = select i1 %5842, bfloat %6965, bfloat 0xR0000, !dbg !267 + %6967 = fptrunc float %6923 to bfloat, !dbg !266 + %6968 = select i1 %5844, bfloat %6967, bfloat 0xR0000, !dbg !267 + %6969 = fptrunc float %6924 to bfloat, !dbg !266 + %6970 = select i1 %5846, bfloat %6969, bfloat 0xR0000, !dbg !267 + %6971 = fptrunc float %6925 to bfloat, !dbg !266 + %6972 = select i1 %5847, bfloat %6971, bfloat 0xR0000, !dbg !267 + %6973 = fptrunc float %6926 to bfloat, !dbg !266 + %6974 = select i1 %5848, bfloat %6973, bfloat 0xR0000, !dbg !267 + %6975 = fptrunc float %6927 to bfloat, !dbg !266 + %6976 = select i1 %5850, bfloat %6975, bfloat 0xR0000, !dbg !267 + %6977 = fptrunc float %6928 to bfloat, !dbg !266 + %6978 = select i1 %5852, bfloat %6977, bfloat 0xR0000, !dbg !267 + %6979 = fptrunc float %6929 to bfloat, !dbg !266 + %6980 = select i1 %5853, bfloat %6979, bfloat 0xR0000, !dbg !267 + %6981 = fptrunc float %6930 to bfloat, !dbg !266 + %6982 = select i1 %5854, bfloat %6981, bfloat 0xR0000, !dbg !267 + %6983 = fptrunc float %6931 to bfloat, !dbg !266 + %6984 = select i1 %5856, bfloat %6983, bfloat 0xR0000, !dbg !267 + %6985 = fptrunc float %6932 to bfloat, !dbg !266 + %6986 = select i1 %5858, bfloat %6985, bfloat 0xR0000, !dbg !267 + %6987 = fptrunc float %6933 to bfloat, !dbg !266 + %6988 = select i1 %5859, bfloat %6987, bfloat 0xR0000, !dbg !267 + %6989 = fptrunc float %6934 to bfloat, !dbg !266 + %6990 = select i1 %5860, bfloat %6989, bfloat 0xR0000, !dbg !267 + %6991 = fptrunc float %6935 to bfloat, !dbg !266 + %6992 = select i1 %5862, bfloat %6991, bfloat 0xR0000, !dbg !267 + %6993 = fptrunc float %6936 to bfloat, !dbg !266 + %6994 = select i1 %5864, bfloat %6993, bfloat 0xR0000, !dbg !267 + %6995 = fptrunc float %6937 to bfloat, !dbg !266 + %6996 = select i1 %5865, bfloat %6995, bfloat 0xR0000, !dbg !267 + %6997 = fptrunc float %6938 to bfloat, !dbg !266 + %6998 = select i1 %5866, bfloat %6997, bfloat 0xR0000, !dbg !267 + %6999 = fptrunc float %6939 to bfloat, !dbg !266 + %7000 = select i1 %5868, bfloat %6999, bfloat 0xR0000, !dbg !267 + %7001 = fptrunc float %6940 to bfloat, !dbg !266 + %7002 = select i1 %5870, bfloat %7001, bfloat 0xR0000, !dbg !267 + %7003 = fptrunc float %6941 to bfloat, !dbg !266 + %7004 = select i1 %5871, bfloat %7003, bfloat 0xR0000, !dbg !267 + %7005 = fptrunc float %6942 to bfloat, !dbg !266 + %7006 = select i1 %5872, bfloat %7005, bfloat 0xR0000, !dbg !267 + %7007 = insertelement <2 x bfloat> poison, bfloat %6944, i64 0, !dbg !268 + %7008 = insertelement <2 x bfloat> %7007, bfloat %6946, i64 1, !dbg !268 + %7009 = bitcast <2 x bfloat> %7008 to i32, !dbg !268 + %7010 = insertelement <2 x bfloat> poison, bfloat %6948, i64 0, !dbg !268 + %7011 = insertelement <2 x bfloat> %7010, bfloat %6950, i64 1, !dbg !268 + %7012 = bitcast <2 x bfloat> %7011 to i32, !dbg !268 + %7013 = insertelement <2 x bfloat> poison, bfloat %6952, i64 0, !dbg !268 + %7014 = insertelement <2 x bfloat> %7013, bfloat %6954, i64 1, !dbg !268 + %7015 = bitcast <2 x bfloat> %7014 to i32, !dbg !268 + %7016 = insertelement <2 x bfloat> poison, bfloat %6956, i64 0, !dbg !268 + %7017 = insertelement <2 x bfloat> %7016, bfloat %6958, i64 1, !dbg !268 + %7018 = bitcast <2 x bfloat> %7017 to i32, !dbg !268 + %7019 = insertelement <2 x bfloat> poison, bfloat %6960, i64 0, !dbg !268 + %7020 = insertelement <2 x bfloat> %7019, bfloat %6962, i64 1, !dbg !268 + %7021 = bitcast <2 x bfloat> %7020 to i32, !dbg !268 + %7022 = insertelement <2 x bfloat> poison, bfloat %6964, i64 0, !dbg !268 + %7023 = insertelement <2 x bfloat> %7022, bfloat %6966, i64 1, !dbg !268 + %7024 = bitcast <2 x bfloat> %7023 to i32, !dbg !268 + %7025 = insertelement <2 x bfloat> poison, bfloat %6968, i64 0, !dbg !268 + %7026 = insertelement <2 x bfloat> %7025, bfloat %6970, i64 1, !dbg !268 + %7027 = bitcast <2 x bfloat> %7026 to i32, !dbg !268 + %7028 = insertelement <2 x bfloat> poison, bfloat %6972, i64 0, !dbg !268 + %7029 = insertelement <2 x bfloat> %7028, bfloat %6974, i64 1, !dbg !268 + %7030 = bitcast <2 x bfloat> %7029 to i32, !dbg !268 + %7031 = insertelement <2 x bfloat> poison, bfloat %6976, i64 0, !dbg !268 + %7032 = insertelement <2 x bfloat> %7031, bfloat %6978, i64 1, !dbg !268 + %7033 = bitcast <2 x bfloat> %7032 to i32, !dbg !268 + %7034 = insertelement <2 x bfloat> poison, bfloat %6980, i64 0, !dbg !268 + %7035 = insertelement <2 x bfloat> %7034, bfloat %6982, i64 1, !dbg !268 + %7036 = bitcast <2 x bfloat> %7035 to i32, !dbg !268 + %7037 = insertelement <2 x bfloat> poison, bfloat %6984, i64 0, !dbg !268 + %7038 = insertelement <2 x bfloat> %7037, bfloat %6986, i64 1, !dbg !268 + %7039 = bitcast <2 x bfloat> %7038 to i32, !dbg !268 + %7040 = insertelement <2 x bfloat> poison, bfloat %6988, i64 0, !dbg !268 + %7041 = insertelement <2 x bfloat> %7040, bfloat %6990, i64 1, !dbg !268 + %7042 = bitcast <2 x bfloat> %7041 to i32, !dbg !268 + %7043 = insertelement <2 x bfloat> poison, bfloat %6992, i64 0, !dbg !268 + %7044 = insertelement <2 x bfloat> %7043, bfloat %6994, i64 1, !dbg !268 + %7045 = bitcast <2 x bfloat> %7044 to i32, !dbg !268 + %7046 = insertelement <2 x bfloat> poison, bfloat %6996, i64 0, !dbg !268 + %7047 = insertelement <2 x bfloat> %7046, bfloat %6998, i64 1, !dbg !268 + %7048 = bitcast <2 x bfloat> %7047 to i32, !dbg !268 + %7049 = insertelement <2 x bfloat> poison, bfloat %7000, i64 0, !dbg !268 + %7050 = insertelement <2 x bfloat> %7049, bfloat %7002, i64 1, !dbg !268 + %7051 = bitcast <2 x bfloat> %7050 to i32, !dbg !268 + %7052 = insertelement <2 x bfloat> poison, bfloat %7004, i64 0, !dbg !268 + %7053 = insertelement <2 x bfloat> %7052, bfloat %7006, i64 1, !dbg !268 + %7054 = bitcast <2 x bfloat> %7053 to i32, !dbg !268 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !268 + %7055 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5214, float %5215, float %5216, float %5217, float %5218, float %5219, float %5220, float %5221, float %5222, float %5223, float %5224, float %5225, float %5226, float %5227, float %5228, float %5229, float %5230, float %5231, float %5232, float %5233, float %5234, float %5235, float %5236, float %5237, float %5238, float %5239, float %5240, float %5241, float %5242, float %5243, float %5244, float %5245, float %5246, float %5247, float %5248, float %5249, float %5250, float %5251, float %5252, float %5253, float %5254, float %5255, float %5256, float %5257, float %5258, float %5259, float %5260, float %5261, float %5262, float %5263, float %5264, float %5265, float %5266, float %5267, float %5268, float %5269, float %5270, float %5271, float %5272, float %5273, float %5274, float %5275, float %5276, float %5277, i32 %7009, i32 %7012, i32 %7015, i32 %7018, i64 %5368, i1 true) #3, !dbg !268 + %7056 = add i32 %5364, 2048, !dbg !268 + %7057 = lshr exact i32 %7056, 4, !dbg !268 + %7058 = and i32 %7057, 16383, !dbg !268 + %7059 = zext nneg i32 %7058 to i64, !dbg !268 + %7060 = or disjoint i64 %7059, 4611686293338849280, !dbg !268 + %7061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 0, !dbg !268 + %7062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 1, !dbg !268 + %7063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 2, !dbg !268 + %7064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 3, !dbg !268 + %7065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 4, !dbg !268 + %7066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 5, !dbg !268 + %7067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 6, !dbg !268 + %7068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 7, !dbg !268 + %7069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 8, !dbg !268 + %7070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 9, !dbg !268 + %7071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 10, !dbg !268 + %7072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 11, !dbg !268 + %7073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 12, !dbg !268 + %7074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 13, !dbg !268 + %7075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 14, !dbg !268 + %7076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 15, !dbg !268 + %7077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 16, !dbg !268 + %7078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 17, !dbg !268 + %7079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 18, !dbg !268 + %7080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 19, !dbg !268 + %7081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 20, !dbg !268 + %7082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 21, !dbg !268 + %7083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 22, !dbg !268 + %7084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 23, !dbg !268 + %7085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 24, !dbg !268 + %7086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 25, !dbg !268 + %7087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 26, !dbg !268 + %7088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 27, !dbg !268 + %7089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 28, !dbg !268 + %7090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 29, !dbg !268 + %7091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 30, !dbg !268 + %7092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 31, !dbg !268 + %7093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 32, !dbg !268 + %7094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 33, !dbg !268 + %7095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 34, !dbg !268 + %7096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 35, !dbg !268 + %7097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 36, !dbg !268 + %7098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 37, !dbg !268 + %7099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 38, !dbg !268 + %7100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 39, !dbg !268 + %7101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 40, !dbg !268 + %7102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 41, !dbg !268 + %7103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 42, !dbg !268 + %7104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 43, !dbg !268 + %7105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 44, !dbg !268 + %7106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 45, !dbg !268 + %7107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 46, !dbg !268 + %7108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 47, !dbg !268 + %7109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 48, !dbg !268 + %7110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 49, !dbg !268 + %7111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 50, !dbg !268 + %7112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 51, !dbg !268 + %7113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 52, !dbg !268 + %7114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 53, !dbg !268 + %7115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 54, !dbg !268 + %7116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 55, !dbg !268 + %7117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 56, !dbg !268 + %7118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 57, !dbg !268 + %7119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 58, !dbg !268 + %7120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 59, !dbg !268 + %7121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 60, !dbg !268 + %7122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 61, !dbg !268 + %7123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 62, !dbg !268 + %7124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7055, 63, !dbg !268 + %7125 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7061, float %7062, float %7063, float %7064, float %7065, float %7066, float %7067, float %7068, float %7069, float %7070, float %7071, float %7072, float %7073, float %7074, float %7075, float %7076, float %7077, float %7078, float %7079, float %7080, float %7081, float %7082, float %7083, float %7084, float %7085, float %7086, float %7087, float %7088, float %7089, float %7090, float %7091, float %7092, float %7093, float %7094, float %7095, float %7096, float %7097, float %7098, float %7099, float %7100, float %7101, float %7102, float %7103, float %7104, float %7105, float %7106, float %7107, float %7108, float %7109, float %7110, float %7111, float %7112, float %7113, float %7114, float %7115, float %7116, float %7117, float %7118, float %7119, float %7120, float %7121, float %7122, float %7123, float %7124, i32 %7021, i32 %7024, i32 %7027, i32 %7030, i64 %7060, i1 true) #3, !dbg !268 + %7126 = add i32 %5364, 4096, !dbg !268 + %7127 = lshr exact i32 %7126, 4, !dbg !268 + %7128 = and i32 %7127, 16383, !dbg !268 + %7129 = zext nneg i32 %7128 to i64, !dbg !268 + %7130 = or disjoint i64 %7129, 4611686293338849280, !dbg !268 + %7131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 0, !dbg !268 + %7132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 1, !dbg !268 + %7133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 2, !dbg !268 + %7134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 3, !dbg !268 + %7135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 4, !dbg !268 + %7136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 5, !dbg !268 + %7137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 6, !dbg !268 + %7138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 7, !dbg !268 + %7139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 8, !dbg !268 + %7140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 9, !dbg !268 + %7141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 10, !dbg !268 + %7142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 11, !dbg !268 + %7143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 12, !dbg !268 + %7144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 13, !dbg !268 + %7145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 14, !dbg !268 + %7146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 15, !dbg !268 + %7147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 16, !dbg !268 + %7148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 17, !dbg !268 + %7149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 18, !dbg !268 + %7150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 19, !dbg !268 + %7151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 20, !dbg !268 + %7152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 21, !dbg !268 + %7153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 22, !dbg !268 + %7154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 23, !dbg !268 + %7155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 24, !dbg !268 + %7156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 25, !dbg !268 + %7157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 26, !dbg !268 + %7158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 27, !dbg !268 + %7159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 28, !dbg !268 + %7160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 29, !dbg !268 + %7161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 30, !dbg !268 + %7162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 31, !dbg !268 + %7163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 32, !dbg !268 + %7164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 33, !dbg !268 + %7165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 34, !dbg !268 + %7166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 35, !dbg !268 + %7167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 36, !dbg !268 + %7168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 37, !dbg !268 + %7169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 38, !dbg !268 + %7170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 39, !dbg !268 + %7171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 40, !dbg !268 + %7172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 41, !dbg !268 + %7173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 42, !dbg !268 + %7174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 43, !dbg !268 + %7175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 44, !dbg !268 + %7176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 45, !dbg !268 + %7177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 46, !dbg !268 + %7178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 47, !dbg !268 + %7179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 48, !dbg !268 + %7180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 49, !dbg !268 + %7181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 50, !dbg !268 + %7182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 51, !dbg !268 + %7183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 52, !dbg !268 + %7184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 53, !dbg !268 + %7185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 54, !dbg !268 + %7186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 55, !dbg !268 + %7187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 56, !dbg !268 + %7188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 57, !dbg !268 + %7189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 58, !dbg !268 + %7190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 59, !dbg !268 + %7191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 60, !dbg !268 + %7192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 61, !dbg !268 + %7193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 62, !dbg !268 + %7194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7125, 63, !dbg !268 + %7195 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7131, float %7132, float %7133, float %7134, float %7135, float %7136, float %7137, float %7138, float %7139, float %7140, float %7141, float %7142, float %7143, float %7144, float %7145, float %7146, float %7147, float %7148, float %7149, float %7150, float %7151, float %7152, float %7153, float %7154, float %7155, float %7156, float %7157, float %7158, float %7159, float %7160, float %7161, float %7162, float %7163, float %7164, float %7165, float %7166, float %7167, float %7168, float %7169, float %7170, float %7171, float %7172, float %7173, float %7174, float %7175, float %7176, float %7177, float %7178, float %7179, float %7180, float %7181, float %7182, float %7183, float %7184, float %7185, float %7186, float %7187, float %7188, float %7189, float %7190, float %7191, float %7192, float %7193, float %7194, i32 %7033, i32 %7036, i32 %7039, i32 %7042, i64 %7130, i1 true) #3, !dbg !268 + %7196 = add i32 %5364, 6144, !dbg !268 + %7197 = lshr exact i32 %7196, 4, !dbg !268 + %7198 = and i32 %7197, 16383, !dbg !268 + %7199 = zext nneg i32 %7198 to i64, !dbg !268 + %7200 = or disjoint i64 %7199, 4611686293338849280, !dbg !268 + %7201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 0, !dbg !268 + %7202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 1, !dbg !268 + %7203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 2, !dbg !268 + %7204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 3, !dbg !268 + %7205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 4, !dbg !268 + %7206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 5, !dbg !268 + %7207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 6, !dbg !268 + %7208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 7, !dbg !268 + %7209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 8, !dbg !268 + %7210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 9, !dbg !268 + %7211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 10, !dbg !268 + %7212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 11, !dbg !268 + %7213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 12, !dbg !268 + %7214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 13, !dbg !268 + %7215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 14, !dbg !268 + %7216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 15, !dbg !268 + %7217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 16, !dbg !268 + %7218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 17, !dbg !268 + %7219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 18, !dbg !268 + %7220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 19, !dbg !268 + %7221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 20, !dbg !268 + %7222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 21, !dbg !268 + %7223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 22, !dbg !268 + %7224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 23, !dbg !268 + %7225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 24, !dbg !268 + %7226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 25, !dbg !268 + %7227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 26, !dbg !268 + %7228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 27, !dbg !268 + %7229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 28, !dbg !268 + %7230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 29, !dbg !268 + %7231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 30, !dbg !268 + %7232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 31, !dbg !268 + %7233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 32, !dbg !268 + %7234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 33, !dbg !268 + %7235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 34, !dbg !268 + %7236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 35, !dbg !268 + %7237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 36, !dbg !268 + %7238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 37, !dbg !268 + %7239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 38, !dbg !268 + %7240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 39, !dbg !268 + %7241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 40, !dbg !268 + %7242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 41, !dbg !268 + %7243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 42, !dbg !268 + %7244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 43, !dbg !268 + %7245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 44, !dbg !268 + %7246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 45, !dbg !268 + %7247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 46, !dbg !268 + %7248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 47, !dbg !268 + %7249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 48, !dbg !268 + %7250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 49, !dbg !268 + %7251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 50, !dbg !268 + %7252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 51, !dbg !268 + %7253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 52, !dbg !268 + %7254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 53, !dbg !268 + %7255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 54, !dbg !268 + %7256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 55, !dbg !268 + %7257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 56, !dbg !268 + %7258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 57, !dbg !268 + %7259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 58, !dbg !268 + %7260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 59, !dbg !268 + %7261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 60, !dbg !268 + %7262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 61, !dbg !268 + %7263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 62, !dbg !268 + %7264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7195, 63, !dbg !268 + %7265 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7201, float %7202, float %7203, float %7204, float %7205, float %7206, float %7207, float %7208, float %7209, float %7210, float %7211, float %7212, float %7213, float %7214, float %7215, float %7216, float %7217, float %7218, float %7219, float %7220, float %7221, float %7222, float %7223, float %7224, float %7225, float %7226, float %7227, float %7228, float %7229, float %7230, float %7231, float %7232, float %7233, float %7234, float %7235, float %7236, float %7237, float %7238, float %7239, float %7240, float %7241, float %7242, float %7243, float %7244, float %7245, float %7246, float %7247, float %7248, float %7249, float %7250, float %7251, float %7252, float %7253, float %7254, float %7255, float %7256, float %7257, float %7258, float %7259, float %7260, float %7261, float %7262, float %7263, float %7264, i32 %7045, i32 %7048, i32 %7051, i32 %7054, i64 %7200, i1 true) #3, !dbg !268 + %7266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 0, !dbg !268 + %7267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 1, !dbg !268 + %7268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 2, !dbg !268 + %7269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 3, !dbg !268 + %7270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 4, !dbg !268 + %7271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 5, !dbg !268 + %7272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 6, !dbg !268 + %7273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 7, !dbg !268 + %7274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 8, !dbg !268 + %7275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 9, !dbg !268 + %7276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 10, !dbg !268 + %7277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 11, !dbg !268 + %7278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 12, !dbg !268 + %7279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 13, !dbg !268 + %7280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 14, !dbg !268 + %7281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 15, !dbg !268 + %7282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 16, !dbg !268 + %7283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 17, !dbg !268 + %7284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 18, !dbg !268 + %7285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 19, !dbg !268 + %7286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 20, !dbg !268 + %7287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 21, !dbg !268 + %7288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 22, !dbg !268 + %7289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 23, !dbg !268 + %7290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 24, !dbg !268 + %7291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 25, !dbg !268 + %7292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 26, !dbg !268 + %7293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 27, !dbg !268 + %7294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 28, !dbg !268 + %7295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 29, !dbg !268 + %7296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 30, !dbg !268 + %7297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 31, !dbg !268 + %7298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 32, !dbg !268 + %7299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 33, !dbg !268 + %7300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 34, !dbg !268 + %7301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 35, !dbg !268 + %7302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 36, !dbg !268 + %7303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 37, !dbg !268 + %7304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 38, !dbg !268 + %7305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 39, !dbg !268 + %7306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 40, !dbg !268 + %7307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 41, !dbg !268 + %7308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 42, !dbg !268 + %7309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 43, !dbg !268 + %7310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 44, !dbg !268 + %7311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 45, !dbg !268 + %7312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 46, !dbg !268 + %7313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 47, !dbg !268 + %7314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 48, !dbg !268 + %7315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 49, !dbg !268 + %7316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 50, !dbg !268 + %7317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 51, !dbg !268 + %7318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 52, !dbg !268 + %7319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 53, !dbg !268 + %7320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 54, !dbg !268 + %7321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 55, !dbg !268 + %7322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 56, !dbg !268 + %7323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 57, !dbg !268 + %7324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 58, !dbg !268 + %7325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 59, !dbg !268 + %7326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 60, !dbg !268 + %7327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 61, !dbg !268 + %7328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 62, !dbg !268 + %7329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7265, 63, !dbg !268 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !268 + %7330 = insertelement <16 x i32> poison, i32 %5145, i64 0, !dbg !269 + %7331 = shufflevector <16 x i32> %7330, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !269 + %7332 = add <16 x i32> %5279, %7331, !dbg !269 + %7333 = add nuw nsw i32 %5278, 1, !dbg !213 + %7334 = lshr i32 %7333, 1, !dbg !270 + %7335 = zext nneg i32 %7334 to i64, !dbg !271 + %7336 = getelementptr i32, ptr addrspace(1) %4707, i64 %7335, !dbg !271 + %7337 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !272 + %7338 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7336, i64 %7337, i1 %5281) #3, !dbg !272 + %7339 = add nuw nsw i32 %7334, 1, !dbg !273 + %7340 = icmp slt i32 %7339, %4712, !dbg !274 + %7341 = getelementptr i8, ptr addrspace(1) %7336, i64 4, !dbg !275 + %7342 = and i1 %5281, %7340, !dbg !213 + %7343 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !276 + %7344 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7341, i64 %7343, i1 %7342) #3, !dbg !276 + %7345 = and i32 %5278, 1, !dbg !277 + %7346 = sub i32 %7344, %7338, !dbg !278 + %7347 = shl i32 %7346, 7, !dbg !279 + %7348 = add i32 %7347, -64, !dbg !280 + %7349 = xor i32 %7345, 1, !dbg !281 + %7350 = mul nuw nsw i32 %7348, %7349, !dbg !281 + %7351 = shl nuw nsw i32 %7345, 6, !dbg !282 + %7352 = add i32 %7350, %7351, !dbg !283 + %7353 = shl i32 %7352, 12, !dbg !284 + %7354 = sext i32 %7353 to i64, !dbg !249 + %7355 = getelementptr bfloat, ptr addrspace(1) %.pn1201513, i64 %7354, !dbg !249 + %7356 = getelementptr bfloat, ptr addrspace(1) %.pn1041514, i64 %7354, !dbg !249 + %7357 = getelementptr bfloat, ptr addrspace(1) %.pn881515, i64 %7354, !dbg !249 + %7358 = getelementptr bfloat, ptr addrspace(1) %.pn721516, i64 %7354, !dbg !249 + %7359 = shl i32 %7352, 7, !dbg !285 + %7360 = sext i32 %7359 to i64, !dbg !250 + %7361 = getelementptr bfloat, ptr addrspace(1) %.pn1841517, i64 %7360, !dbg !250 + %7362 = getelementptr bfloat, ptr addrspace(1) %.pn1681518, i64 %7360, !dbg !250 + %7363 = getelementptr bfloat, ptr addrspace(1) %.pn1521519, i64 %7360, !dbg !250 + %7364 = getelementptr bfloat, ptr addrspace(1) %.pn1361520, i64 %7360, !dbg !250 + %7365 = add i32 %7352, %.pn2161521, !dbg !269 + %7366 = add i32 %7352, %.pn2121522, !dbg !269 + %7367 = add i32 %7352, %.pn2081523, !dbg !269 + %7368 = add i32 %7352, %.pn2041524, !dbg !269 + %7369 = add i32 %7352, %.pn2001525, !dbg !269 + %7370 = add i32 %7352, %.pn1961526, !dbg !269 + %7371 = add i32 %7352, %.pn1921527, !dbg !269 + %7372 = add i32 %7352, %.pn1881528, !dbg !269 + %7373 = add i32 %5147, 1, !dbg !213 + %7374 = icmp sgt i32 %7373, 1, !dbg !213 + %7375 = select i1 %7374, i32 0, i32 %7373, !dbg !213 + %7376 = add i32 %5149, 1, !dbg !213 + %7377 = icmp sgt i32 %7376, 2, !dbg !213 + %7378 = select i1 %7377, i32 0, i32 %7376, !dbg !213 + %7379 = shl i32 %7378, 13, !dbg !244 + %7380 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %7379, !dbg !244 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !244 + %7381 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4793, !dbg !244 + %7382 = select i1 %5280, i32 16, i32 0, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %7381, ptr addrspace(1) %7355, i32 %7382) #3, !dbg !244 + %7383 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4796, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7383, ptr addrspace(1) %7356, i32 %7382) #3, !dbg !244 + %7384 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4798, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7384, ptr addrspace(1) %7357, i32 %7382) #3, !dbg !244 + %7385 = getelementptr inbounds nuw i8, ptr addrspace(3) %7380, i32 %4800, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7385, ptr addrspace(1) %7358, i32 %7382) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %7386 = sext i32 %7365 to i64, !dbg !245 + %7387 = getelementptr float, ptr addrspace(1) %5087, i64 %7386, !dbg !245 + %7388 = sext i32 %7366 to i64, !dbg !245 + %7389 = getelementptr float, ptr addrspace(1) %5087, i64 %7388, !dbg !245 + %7390 = sext i32 %7367 to i64, !dbg !245 + %7391 = getelementptr float, ptr addrspace(1) %5087, i64 %7390, !dbg !245 + %7392 = sext i32 %7368 to i64, !dbg !245 + %7393 = getelementptr float, ptr addrspace(1) %5087, i64 %7392, !dbg !245 + %7394 = sext i32 %7369 to i64, !dbg !245 + %7395 = getelementptr float, ptr addrspace(1) %5087, i64 %7394, !dbg !245 + %7396 = sext i32 %7370 to i64, !dbg !245 + %7397 = getelementptr float, ptr addrspace(1) %5087, i64 %7396, !dbg !245 + %7398 = sext i32 %7371 to i64, !dbg !245 + %7399 = getelementptr float, ptr addrspace(1) %5087, i64 %7398, !dbg !245 + %7400 = sext i32 %7372 to i64, !dbg !245 + %7401 = getelementptr float, ptr addrspace(1) %5087, i64 %7400, !dbg !245 + %7402 = shl i32 %7375, 6, !dbg !246 + %7403 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %7402, !dbg !246 + %7404 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4812, !dbg !246 + %7405 = select i1 %5280, i32 8, i32 0, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %7404, ptr addrspace(1) %7387, i32 %7405, i1 %4811) #3, !dbg !246 + %7406 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4815, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7406, ptr addrspace(1) %7389, i32 %7405, i1 %4811) #3, !dbg !246 + %7407 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4817, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7407, ptr addrspace(1) %7391, i32 %7405, i1 %4811) #3, !dbg !246 + %7408 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4819, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7408, ptr addrspace(1) %7393, i32 %7405, i1 %4811) #3, !dbg !246 + %7409 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4821, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7409, ptr addrspace(1) %7395, i32 %7405, i1 %4811) #3, !dbg !246 + %7410 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4823, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7410, ptr addrspace(1) %7397, i32 %7405, i1 %4811) #3, !dbg !246 + %7411 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4825, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7411, ptr addrspace(1) %7399, i32 %7405, i1 %4811) #3, !dbg !246 + %7412 = getelementptr inbounds nuw i8, ptr addrspace(3) %7403, i32 %4827, !dbg !246 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7412, ptr addrspace(1) %7401, i32 %7405, i1 %4811) #3, !dbg !246 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !246 + %7413 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %7379, !dbg !244 + %7414 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4793, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %7414, ptr addrspace(1) %7361, i32 %7382) #3, !dbg !244 + %7415 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4796, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7415, ptr addrspace(1) %7362, i32 %7382) #3, !dbg !244 + %7416 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4798, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7416, ptr addrspace(1) %7363, i32 %7382) #3, !dbg !244 + %7417 = getelementptr inbounds nuw i8, ptr addrspace(3) %7413, i32 %4800, !dbg !244 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7417, ptr addrspace(1) %7364, i32 %7382) #3, !dbg !244 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !244 + %7418 = getelementptr float, ptr addrspace(1) %5088, i64 %7386, !dbg !247 + %7419 = getelementptr float, ptr addrspace(1) %5088, i64 %7388, !dbg !247 + %7420 = getelementptr float, ptr addrspace(1) %5088, i64 %7390, !dbg !247 + %7421 = getelementptr float, ptr addrspace(1) %5088, i64 %7392, !dbg !247 + %7422 = getelementptr float, ptr addrspace(1) %5088, i64 %7394, !dbg !247 + %7423 = getelementptr float, ptr addrspace(1) %5088, i64 %7396, !dbg !247 + %7424 = getelementptr float, ptr addrspace(1) %5088, i64 %7398, !dbg !247 + %7425 = getelementptr float, ptr addrspace(1) %5088, i64 %7400, !dbg !247 + %7426 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %7402, !dbg !248 + %7427 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4812, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %7427, ptr addrspace(1) %7418, i32 %7405, i1 %4811) #3, !dbg !248 + %7428 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4815, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7428, ptr addrspace(1) %7419, i32 %7405, i1 %4811) #3, !dbg !248 + %7429 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4817, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7429, ptr addrspace(1) %7420, i32 %7405, i1 %4811) #3, !dbg !248 + %7430 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4819, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7430, ptr addrspace(1) %7421, i32 %7405, i1 %4811) #3, !dbg !248 + %7431 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4821, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7431, ptr addrspace(1) %7422, i32 %7405, i1 %4811) #3, !dbg !248 + %7432 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4823, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7432, ptr addrspace(1) %7423, i32 %7405, i1 %4811) #3, !dbg !248 + %7433 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4825, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7433, ptr addrspace(1) %7424, i32 %7405, i1 %4811) #3, !dbg !248 + %7434 = getelementptr inbounds nuw i8, ptr addrspace(3) %7426, i32 %4827, !dbg !248 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %7434, ptr addrspace(1) %7425, i32 %7405, i1 %4811) #3, !dbg !248 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !248 + %exitcond.not = icmp eq i32 %7333, %smax, !dbg !213 + br i1 %exitcond.not, label %._crit_edge, label %.lr.ph, !dbg !213 + +._crit_edge: ; preds = %__nv_exp2f.exit1315, %4945 + %7435 = phi float [ %4946, %4945 ], [ %7266, %__nv_exp2f.exit1315 ] + %7436 = phi float [ %4947, %4945 ], [ %7267, %__nv_exp2f.exit1315 ] + %7437 = phi float [ %4948, %4945 ], [ %7268, %__nv_exp2f.exit1315 ] + %7438 = phi float [ %4949, %4945 ], [ %7269, %__nv_exp2f.exit1315 ] + %7439 = phi float [ %4950, %4945 ], [ %7270, %__nv_exp2f.exit1315 ] + %7440 = phi float [ %4951, %4945 ], [ %7271, %__nv_exp2f.exit1315 ] + %7441 = phi float [ %4952, %4945 ], [ %7272, %__nv_exp2f.exit1315 ] + %7442 = phi float [ %4953, %4945 ], [ %7273, %__nv_exp2f.exit1315 ] + %7443 = phi float [ %4954, %4945 ], [ %7274, %__nv_exp2f.exit1315 ] + %7444 = phi float [ %4955, %4945 ], [ %7275, %__nv_exp2f.exit1315 ] + %7445 = phi float [ %4956, %4945 ], [ %7276, %__nv_exp2f.exit1315 ] + %7446 = phi float [ %4957, %4945 ], [ %7277, %__nv_exp2f.exit1315 ] + %7447 = phi float [ %4958, %4945 ], [ %7278, %__nv_exp2f.exit1315 ] + %7448 = phi float [ %4959, %4945 ], [ %7279, %__nv_exp2f.exit1315 ] + %7449 = phi float [ %4960, %4945 ], [ %7280, %__nv_exp2f.exit1315 ] + %7450 = phi float [ %4961, %4945 ], [ %7281, %__nv_exp2f.exit1315 ] + %7451 = phi float [ %4962, %4945 ], [ %7282, %__nv_exp2f.exit1315 ] + %7452 = phi float [ %4963, %4945 ], [ %7283, %__nv_exp2f.exit1315 ] + %7453 = phi float [ %4964, %4945 ], [ %7284, %__nv_exp2f.exit1315 ] + %7454 = phi float [ %4965, %4945 ], [ %7285, %__nv_exp2f.exit1315 ] + %7455 = phi float [ %4966, %4945 ], [ %7286, %__nv_exp2f.exit1315 ] + %7456 = phi float [ %4967, %4945 ], [ %7287, %__nv_exp2f.exit1315 ] + %7457 = phi float [ %4968, %4945 ], [ %7288, %__nv_exp2f.exit1315 ] + %7458 = phi float [ %4969, %4945 ], [ %7289, %__nv_exp2f.exit1315 ] + %7459 = phi float [ %4970, %4945 ], [ %7290, %__nv_exp2f.exit1315 ] + %7460 = phi float [ %4971, %4945 ], [ %7291, %__nv_exp2f.exit1315 ] + %7461 = phi float [ %4972, %4945 ], [ %7292, %__nv_exp2f.exit1315 ] + %7462 = phi float [ %4973, %4945 ], [ %7293, %__nv_exp2f.exit1315 ] + %7463 = phi float [ %4974, %4945 ], [ %7294, %__nv_exp2f.exit1315 ] + %7464 = phi float [ %4975, %4945 ], [ %7295, %__nv_exp2f.exit1315 ] + %7465 = phi float [ %4976, %4945 ], [ %7296, %__nv_exp2f.exit1315 ] + %7466 = phi float [ %4977, %4945 ], [ %7297, %__nv_exp2f.exit1315 ] + %7467 = phi float [ %4978, %4945 ], [ %7298, %__nv_exp2f.exit1315 ] + %7468 = phi float [ %4979, %4945 ], [ %7299, %__nv_exp2f.exit1315 ] + %7469 = phi float [ %4980, %4945 ], [ %7300, %__nv_exp2f.exit1315 ] + %7470 = phi float [ %4981, %4945 ], [ %7301, %__nv_exp2f.exit1315 ] + %7471 = phi float [ %4982, %4945 ], [ %7302, %__nv_exp2f.exit1315 ] + %7472 = phi float [ %4983, %4945 ], [ %7303, %__nv_exp2f.exit1315 ] + %7473 = phi float [ %4984, %4945 ], [ %7304, %__nv_exp2f.exit1315 ] + %7474 = phi float [ %4985, %4945 ], [ %7305, %__nv_exp2f.exit1315 ] + %7475 = phi float [ %4986, %4945 ], [ %7306, %__nv_exp2f.exit1315 ] + %7476 = phi float [ %4987, %4945 ], [ %7307, %__nv_exp2f.exit1315 ] + %7477 = phi float [ %4988, %4945 ], [ %7308, %__nv_exp2f.exit1315 ] + %7478 = phi float [ %4989, %4945 ], [ %7309, %__nv_exp2f.exit1315 ] + %7479 = phi float [ %4990, %4945 ], [ %7310, %__nv_exp2f.exit1315 ] + %7480 = phi float [ %4991, %4945 ], [ %7311, %__nv_exp2f.exit1315 ] + %7481 = phi float [ %4992, %4945 ], [ %7312, %__nv_exp2f.exit1315 ] + %7482 = phi float [ %4993, %4945 ], [ %7313, %__nv_exp2f.exit1315 ] + %7483 = phi float [ %4994, %4945 ], [ %7314, %__nv_exp2f.exit1315 ] + %7484 = phi float [ %4995, %4945 ], [ %7315, %__nv_exp2f.exit1315 ] + %7485 = phi float [ %4996, %4945 ], [ %7316, %__nv_exp2f.exit1315 ] + %7486 = phi float [ %4997, %4945 ], [ %7317, %__nv_exp2f.exit1315 ] + %7487 = phi float [ %4998, %4945 ], [ %7318, %__nv_exp2f.exit1315 ] + %7488 = phi float [ %4999, %4945 ], [ %7319, %__nv_exp2f.exit1315 ] + %7489 = phi float [ %5000, %4945 ], [ %7320, %__nv_exp2f.exit1315 ] + %7490 = phi float [ %5001, %4945 ], [ %7321, %__nv_exp2f.exit1315 ] + %7491 = phi float [ %5002, %4945 ], [ %7322, %__nv_exp2f.exit1315 ] + %7492 = phi float [ %5003, %4945 ], [ %7323, %__nv_exp2f.exit1315 ] + %7493 = phi float [ %5004, %4945 ], [ %7324, %__nv_exp2f.exit1315 ] + %7494 = phi float [ %5005, %4945 ], [ %7325, %__nv_exp2f.exit1315 ] + %7495 = phi float [ %5006, %4945 ], [ %7326, %__nv_exp2f.exit1315 ] + %7496 = phi float [ %5007, %4945 ], [ %7327, %__nv_exp2f.exit1315 ] + %7497 = phi float [ %5008, %4945 ], [ %7328, %__nv_exp2f.exit1315 ] + %7498 = phi float [ %5009, %4945 ], [ %7329, %__nv_exp2f.exit1315 ] + %7499 = phi float [ %5010, %4945 ], [ %6410, %__nv_exp2f.exit1315 ] + %7500 = phi float [ %5011, %4945 ], [ %6411, %__nv_exp2f.exit1315 ] + %7501 = phi float [ %5012, %4945 ], [ %6412, %__nv_exp2f.exit1315 ] + %7502 = phi float [ %5013, %4945 ], [ %6413, %__nv_exp2f.exit1315 ] + %7503 = phi float [ %5014, %4945 ], [ %6414, %__nv_exp2f.exit1315 ] + %7504 = phi float [ %5015, %4945 ], [ %6415, %__nv_exp2f.exit1315 ] + %7505 = phi float [ %5016, %4945 ], [ %6416, %__nv_exp2f.exit1315 ] + %7506 = phi float [ %5017, %4945 ], [ %6417, %__nv_exp2f.exit1315 ] + %7507 = phi float [ %5018, %4945 ], [ %6418, %__nv_exp2f.exit1315 ] + %7508 = phi float [ %5019, %4945 ], [ %6419, %__nv_exp2f.exit1315 ] + %7509 = phi float [ %5020, %4945 ], [ %6420, %__nv_exp2f.exit1315 ] + %7510 = phi float [ %5021, %4945 ], [ %6421, %__nv_exp2f.exit1315 ] + %7511 = phi float [ %5022, %4945 ], [ %6422, %__nv_exp2f.exit1315 ] + %7512 = phi float [ %5023, %4945 ], [ %6423, %__nv_exp2f.exit1315 ] + %7513 = phi float [ %5024, %4945 ], [ %6424, %__nv_exp2f.exit1315 ] + %7514 = phi float [ %5025, %4945 ], [ %6425, %__nv_exp2f.exit1315 ] + %7515 = phi float [ %5026, %4945 ], [ %6426, %__nv_exp2f.exit1315 ] + %7516 = phi float [ %5027, %4945 ], [ %6427, %__nv_exp2f.exit1315 ] + %7517 = phi float [ %5028, %4945 ], [ %6428, %__nv_exp2f.exit1315 ] + %7518 = phi float [ %5029, %4945 ], [ %6429, %__nv_exp2f.exit1315 ] + %7519 = phi float [ %5030, %4945 ], [ %6430, %__nv_exp2f.exit1315 ] + %7520 = phi float [ %5031, %4945 ], [ %6431, %__nv_exp2f.exit1315 ] + %7521 = phi float [ %5032, %4945 ], [ %6432, %__nv_exp2f.exit1315 ] + %7522 = phi float [ %5033, %4945 ], [ %6433, %__nv_exp2f.exit1315 ] + %7523 = phi float [ %5034, %4945 ], [ %6434, %__nv_exp2f.exit1315 ] + %7524 = phi float [ %5035, %4945 ], [ %6435, %__nv_exp2f.exit1315 ] + %7525 = phi float [ %5036, %4945 ], [ %6436, %__nv_exp2f.exit1315 ] + %7526 = phi float [ %5037, %4945 ], [ %6437, %__nv_exp2f.exit1315 ] + %7527 = phi float [ %5038, %4945 ], [ %6438, %__nv_exp2f.exit1315 ] + %7528 = phi float [ %5039, %4945 ], [ %6439, %__nv_exp2f.exit1315 ] + %7529 = phi float [ %5040, %4945 ], [ %6440, %__nv_exp2f.exit1315 ] + %7530 = phi float [ %5041, %4945 ], [ %6441, %__nv_exp2f.exit1315 ] + %7531 = phi float [ %5042, %4945 ], [ %6442, %__nv_exp2f.exit1315 ] + %7532 = phi float [ %5043, %4945 ], [ %6443, %__nv_exp2f.exit1315 ] + %7533 = phi float [ %5044, %4945 ], [ %6444, %__nv_exp2f.exit1315 ] + %7534 = phi float [ %5045, %4945 ], [ %6445, %__nv_exp2f.exit1315 ] + %7535 = phi float [ %5046, %4945 ], [ %6446, %__nv_exp2f.exit1315 ] + %7536 = phi float [ %5047, %4945 ], [ %6447, %__nv_exp2f.exit1315 ] + %7537 = phi float [ %5048, %4945 ], [ %6448, %__nv_exp2f.exit1315 ] + %7538 = phi float [ %5049, %4945 ], [ %6449, %__nv_exp2f.exit1315 ] + %7539 = phi float [ %5050, %4945 ], [ %6450, %__nv_exp2f.exit1315 ] + %7540 = phi float [ %5051, %4945 ], [ %6451, %__nv_exp2f.exit1315 ] + %7541 = phi float [ %5052, %4945 ], [ %6452, %__nv_exp2f.exit1315 ] + %7542 = phi float [ %5053, %4945 ], [ %6453, %__nv_exp2f.exit1315 ] + %7543 = phi float [ %5054, %4945 ], [ %6454, %__nv_exp2f.exit1315 ] + %7544 = phi float [ %5055, %4945 ], [ %6455, %__nv_exp2f.exit1315 ] + %7545 = phi float [ %5056, %4945 ], [ %6456, %__nv_exp2f.exit1315 ] + %7546 = phi float [ %5057, %4945 ], [ %6457, %__nv_exp2f.exit1315 ] + %7547 = phi float [ %5058, %4945 ], [ %6458, %__nv_exp2f.exit1315 ] + %7548 = phi float [ %5059, %4945 ], [ %6459, %__nv_exp2f.exit1315 ] + %7549 = phi float [ %5060, %4945 ], [ %6460, %__nv_exp2f.exit1315 ] + %7550 = phi float [ %5061, %4945 ], [ %6461, %__nv_exp2f.exit1315 ] + %7551 = phi float [ %5062, %4945 ], [ %6462, %__nv_exp2f.exit1315 ] + %7552 = phi float [ %5063, %4945 ], [ %6463, %__nv_exp2f.exit1315 ] + %7553 = phi float [ %5064, %4945 ], [ %6464, %__nv_exp2f.exit1315 ] + %7554 = phi float [ %5065, %4945 ], [ %6465, %__nv_exp2f.exit1315 ] + %7555 = phi float [ %5066, %4945 ], [ %6466, %__nv_exp2f.exit1315 ] + %7556 = phi float [ %5067, %4945 ], [ %6467, %__nv_exp2f.exit1315 ] + %7557 = phi float [ %5068, %4945 ], [ %6468, %__nv_exp2f.exit1315 ] + %7558 = phi float [ %5069, %4945 ], [ %6469, %__nv_exp2f.exit1315 ] + %7559 = phi float [ %5070, %4945 ], [ %6470, %__nv_exp2f.exit1315 ] + %7560 = phi float [ %5071, %4945 ], [ %6471, %__nv_exp2f.exit1315 ] + %7561 = phi float [ %5072, %4945 ], [ %6472, %__nv_exp2f.exit1315 ] + %7562 = phi float [ %5073, %4945 ], [ %6473, %__nv_exp2f.exit1315 ] + %7563 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %7499, float %7500, float %7501, float %7502, float %7503, float %7504, float %7505, float %7506, float %7507, float %7508, float %7509, float %7510, float %7511, float %7512, float %7513, float %7514, float %7515, float %7516, float %7517, float %7518, float %7519, float %7520, float %7521, float %7522, float %7523, float %7524, float %7525, float %7526, float %7527, float %7528, float %7529, float %7530, float %7531, float %7532, float %7533, float %7534, float %7535, float %7536, float %7537, float %7538, float %7539, float %7540, float %7541, float %7542, float %7543, float %7544, float %7545, float %7546, float %7547, float %7548, float %7549, float %7550, float %7551, float %7552, float %7553, float %7554, float %7555, float %7556, float %7557, float %7558, float %7559, float %7560, float %7561, float %7562, float %7435, float %7436, float %7437, float %7438, float %7439, float %7440, float %7441, float %7442, float %7443, float %7444, float %7445, float %7446, float %7447, float %7448, float %7449, float %7450, float %7451, float %7452, float %7453, float %7454, float %7455, float %7456, float %7457, float %7458, float %7459, float %7460, float %7461, float %7462, float %7463, float %7464, float %7465, float %7466, float %7467, float %7468, float %7469, float %7470, float %7471, float %7472, float %7473, float %7474, float %7475, float %7476, float %7477, float %7478, float %7479, float %7480, float %7481, float %7482, float %7483, float %7484, float %7485, float %7486, float %7487, float %7488, float %7489, float %7490, float %7491, float %7492, float %7493, float %7494, float %7495, float %7496, float %7497, float %7498) #3, !dbg !213 + %7564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 0, !dbg !213 + %7565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 1, !dbg !213 + %7566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 2, !dbg !213 + %7567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 3, !dbg !213 + %7568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 4, !dbg !213 + %7569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 5, !dbg !213 + %7570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 6, !dbg !213 + %7571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 7, !dbg !213 + %7572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 8, !dbg !213 + %7573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 9, !dbg !213 + %7574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 10, !dbg !213 + %7575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 11, !dbg !213 + %7576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 12, !dbg !213 + %7577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 13, !dbg !213 + %7578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 14, !dbg !213 + %7579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 15, !dbg !213 + %7580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 16, !dbg !213 + %7581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 17, !dbg !213 + %7582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 18, !dbg !213 + %7583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 19, !dbg !213 + %7584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 20, !dbg !213 + %7585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 21, !dbg !213 + %7586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 22, !dbg !213 + %7587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 23, !dbg !213 + %7588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 24, !dbg !213 + %7589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 25, !dbg !213 + %7590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 26, !dbg !213 + %7591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 27, !dbg !213 + %7592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 28, !dbg !213 + %7593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 29, !dbg !213 + %7594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 30, !dbg !213 + %7595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 31, !dbg !213 + %7596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 32, !dbg !213 + %7597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 33, !dbg !213 + %7598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 34, !dbg !213 + %7599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 35, !dbg !213 + %7600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 36, !dbg !213 + %7601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 37, !dbg !213 + %7602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 38, !dbg !213 + %7603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 39, !dbg !213 + %7604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 40, !dbg !213 + %7605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 41, !dbg !213 + %7606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 42, !dbg !213 + %7607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 43, !dbg !213 + %7608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 44, !dbg !213 + %7609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 45, !dbg !213 + %7610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 46, !dbg !213 + %7611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 47, !dbg !213 + %7612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 48, !dbg !213 + %7613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 49, !dbg !213 + %7614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 50, !dbg !213 + %7615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 51, !dbg !213 + %7616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 52, !dbg !213 + %7617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 53, !dbg !213 + %7618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 54, !dbg !213 + %7619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 55, !dbg !213 + %7620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 56, !dbg !213 + %7621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 57, !dbg !213 + %7622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 58, !dbg !213 + %7623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 59, !dbg !213 + %7624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 60, !dbg !213 + %7625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 61, !dbg !213 + %7626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 62, !dbg !213 + %7627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 63, !dbg !213 + %7628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 64, !dbg !213 + %7629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 65, !dbg !213 + %7630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 66, !dbg !213 + %7631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 67, !dbg !213 + %7632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 68, !dbg !213 + %7633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 69, !dbg !213 + %7634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 70, !dbg !213 + %7635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 71, !dbg !213 + %7636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 72, !dbg !213 + %7637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 73, !dbg !213 + %7638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 74, !dbg !213 + %7639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 75, !dbg !213 + %7640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 76, !dbg !213 + %7641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 77, !dbg !213 + %7642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 78, !dbg !213 + %7643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 79, !dbg !213 + %7644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 80, !dbg !213 + %7645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 81, !dbg !213 + %7646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 82, !dbg !213 + %7647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 83, !dbg !213 + %7648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 84, !dbg !213 + %7649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 85, !dbg !213 + %7650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 86, !dbg !213 + %7651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 87, !dbg !213 + %7652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 88, !dbg !213 + %7653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 89, !dbg !213 + %7654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 90, !dbg !213 + %7655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 91, !dbg !213 + %7656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 92, !dbg !213 + %7657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 93, !dbg !213 + %7658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 94, !dbg !213 + %7659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 95, !dbg !213 + %7660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 96, !dbg !213 + %7661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 97, !dbg !213 + %7662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 98, !dbg !213 + %7663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 99, !dbg !213 + %7664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 100, !dbg !213 + %7665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 101, !dbg !213 + %7666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 102, !dbg !213 + %7667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 103, !dbg !213 + %7668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 104, !dbg !213 + %7669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 105, !dbg !213 + %7670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 106, !dbg !213 + %7671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 107, !dbg !213 + %7672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 108, !dbg !213 + %7673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 109, !dbg !213 + %7674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 110, !dbg !213 + %7675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 111, !dbg !213 + %7676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 112, !dbg !213 + %7677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 113, !dbg !213 + %7678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 114, !dbg !213 + %7679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 115, !dbg !213 + %7680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 116, !dbg !213 + %7681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 117, !dbg !213 + %7682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 118, !dbg !213 + %7683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 119, !dbg !213 + %7684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 120, !dbg !213 + %7685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 121, !dbg !213 + %7686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 122, !dbg !213 + %7687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 123, !dbg !213 + %7688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 124, !dbg !213 + %7689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 125, !dbg !213 + %7690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 126, !dbg !213 + %7691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7563, 127, !dbg !213 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !213 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !213 + %7692 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4886, !dbg !286 + %7693 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4887, !dbg !286 + %7694 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4888, !dbg !286 + %7695 = getelementptr bfloat, ptr addrspace(1) %5085, i64 %4889, !dbg !286 + %7696 = getelementptr bfloat, ptr addrspace(1) %7692, i64 %4498, !dbg !287 + %7697 = getelementptr bfloat, ptr addrspace(1) %7693, i64 %4498, !dbg !287 + %7698 = getelementptr bfloat, ptr addrspace(1) %7694, i64 %4498, !dbg !287 + %7699 = getelementptr bfloat, ptr addrspace(1) %7695, i64 %4498, !dbg !287 + %7700 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4890, !dbg !288 + %7701 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4891, !dbg !288 + %7702 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4892, !dbg !288 + %7703 = getelementptr bfloat, ptr addrspace(1) %5086, i64 %4893, !dbg !288 + %7704 = getelementptr bfloat, ptr addrspace(1) %7700, i64 %4498, !dbg !289 + %7705 = getelementptr bfloat, ptr addrspace(1) %7701, i64 %4498, !dbg !289 + %7706 = getelementptr bfloat, ptr addrspace(1) %7702, i64 %4498, !dbg !289 + %7707 = getelementptr bfloat, ptr addrspace(1) %7703, i64 %4498, !dbg !289 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4794, ptr addrspace(1) %7696, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4797, ptr addrspace(1) %7697, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4799, ptr addrspace(1) %7698, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4801, ptr addrspace(1) %7699, i32 %4895) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7708 = getelementptr float, ptr addrspace(1) %5087, i64 %4896, !dbg !291 + %7709 = getelementptr float, ptr addrspace(1) %5087, i64 %4897, !dbg !291 + %7710 = getelementptr float, ptr addrspace(1) %5087, i64 %4898, !dbg !291 + %7711 = getelementptr float, ptr addrspace(1) %5087, i64 %4899, !dbg !291 + %7712 = getelementptr float, ptr addrspace(1) %5087, i64 %4900, !dbg !291 + %7713 = getelementptr float, ptr addrspace(1) %5087, i64 %4901, !dbg !291 + %7714 = getelementptr float, ptr addrspace(1) %5087, i64 %4902, !dbg !291 + %7715 = getelementptr float, ptr addrspace(1) %5087, i64 %4903, !dbg !291 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4813, ptr addrspace(1) %7708, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4816, ptr addrspace(1) %7709, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4818, ptr addrspace(1) %7710, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4820, ptr addrspace(1) %7711, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4822, ptr addrspace(1) %7712, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4824, ptr addrspace(1) %7713, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4826, ptr addrspace(1) %7714, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4828, ptr addrspace(1) %7715, i32 %4904, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4829, ptr addrspace(1) %7704, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4830, ptr addrspace(1) %7705, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4831, ptr addrspace(1) %7706, i32 %4895) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4832, ptr addrspace(1) %7707, i32 %4895) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7716 = getelementptr float, ptr addrspace(1) %5088, i64 %4896, !dbg !293 + %7717 = getelementptr float, ptr addrspace(1) %5088, i64 %4897, !dbg !293 + %7718 = getelementptr float, ptr addrspace(1) %5088, i64 %4898, !dbg !293 + %7719 = getelementptr float, ptr addrspace(1) %5088, i64 %4899, !dbg !293 + %7720 = getelementptr float, ptr addrspace(1) %5088, i64 %4900, !dbg !293 + %7721 = getelementptr float, ptr addrspace(1) %5088, i64 %4901, !dbg !293 + %7722 = getelementptr float, ptr addrspace(1) %5088, i64 %4902, !dbg !293 + %7723 = getelementptr float, ptr addrspace(1) %5088, i64 %4903, !dbg !293 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4833, ptr addrspace(1) %7716, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4834, ptr addrspace(1) %7717, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4835, ptr addrspace(1) %7718, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4836, ptr addrspace(1) %7719, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4837, ptr addrspace(1) %7720, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4838, ptr addrspace(1) %7721, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4839, ptr addrspace(1) %7722, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4840, ptr addrspace(1) %7723, i32 %4904, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + %7724 = getelementptr i8, ptr addrspace(1) %7696, i64 524288, !dbg !295 + %7725 = getelementptr i8, ptr addrspace(1) %7697, i64 524288, !dbg !295 + %7726 = getelementptr i8, ptr addrspace(1) %7698, i64 524288, !dbg !295 + %7727 = getelementptr i8, ptr addrspace(1) %7699, i64 524288, !dbg !295 + %7728 = getelementptr i8, ptr addrspace(1) %7704, i64 16384, !dbg !296 + %7729 = getelementptr i8, ptr addrspace(1) %7705, i64 16384, !dbg !296 + %7730 = getelementptr i8, ptr addrspace(1) %7706, i64 16384, !dbg !296 + %7731 = getelementptr i8, ptr addrspace(1) %7707, i64 16384, !dbg !296 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4850, ptr addrspace(1) %7724, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4852, ptr addrspace(1) %7725, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4853, ptr addrspace(1) %7726, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4854, ptr addrspace(1) %7727, i32 %4914) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7732 = getelementptr float, ptr addrspace(1) %5087, i64 %4915, !dbg !291 + %7733 = getelementptr float, ptr addrspace(1) %5087, i64 %4916, !dbg !291 + %7734 = getelementptr float, ptr addrspace(1) %5087, i64 %4917, !dbg !291 + %7735 = getelementptr float, ptr addrspace(1) %5087, i64 %4918, !dbg !291 + %7736 = getelementptr float, ptr addrspace(1) %5087, i64 %4919, !dbg !291 + %7737 = getelementptr float, ptr addrspace(1) %5087, i64 %4920, !dbg !291 + %7738 = getelementptr float, ptr addrspace(1) %5087, i64 %4921, !dbg !291 + %7739 = getelementptr float, ptr addrspace(1) %5087, i64 %4922, !dbg !291 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4863, ptr addrspace(1) %7732, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4865, ptr addrspace(1) %7733, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4866, ptr addrspace(1) %7734, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4867, ptr addrspace(1) %7735, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4868, ptr addrspace(1) %7736, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4869, ptr addrspace(1) %7737, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4870, ptr addrspace(1) %7738, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4871, ptr addrspace(1) %7739, i32 %4923, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4872, ptr addrspace(1) %7728, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4873, ptr addrspace(1) %7729, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4874, ptr addrspace(1) %7730, i32 %4914) #3, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4875, ptr addrspace(1) %7731, i32 %4914) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %7740 = getelementptr float, ptr addrspace(1) %5088, i64 %4915, !dbg !293 + %7741 = getelementptr float, ptr addrspace(1) %5088, i64 %4916, !dbg !293 + %7742 = getelementptr float, ptr addrspace(1) %5088, i64 %4917, !dbg !293 + %7743 = getelementptr float, ptr addrspace(1) %5088, i64 %4918, !dbg !293 + %7744 = getelementptr float, ptr addrspace(1) %5088, i64 %4919, !dbg !293 + %7745 = getelementptr float, ptr addrspace(1) %5088, i64 %4920, !dbg !293 + %7746 = getelementptr float, ptr addrspace(1) %5088, i64 %4921, !dbg !293 + %7747 = getelementptr float, ptr addrspace(1) %5088, i64 %4922, !dbg !293 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %4876, ptr addrspace(1) %7740, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4877, ptr addrspace(1) %7741, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4878, ptr addrspace(1) %7742, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4879, ptr addrspace(1) %7743, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4880, ptr addrspace(1) %7744, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4881, ptr addrspace(1) %7745, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4882, ptr addrspace(1) %7746, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %4883, ptr addrspace(1) %7747, i32 %4923, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + br i1 %4894, label %.lr.ph1691, label %._crit_edge1692, !dbg !297 + +.lr.ph1691: ; preds = %._crit_edge, %__nv_exp2f.exit1219 + %7748 = phi i32 [ %7757, %__nv_exp2f.exit1219 ], [ -1, %._crit_edge ] + %7749 = phi i32 [ %9619, %__nv_exp2f.exit1219 ], [ 1, %._crit_edge ] + %7750 = phi i32 [ %7760, %__nv_exp2f.exit1219 ], [ -1, %._crit_edge ] + %7751 = phi i32 [ %9622, %__nv_exp2f.exit1219 ], [ 1, %._crit_edge ] + %.pn6361689 = phi i32 [ %9616, %__nv_exp2f.exit1219 ], [ %4913, %._crit_edge ] + %.pn6401688 = phi i32 [ %9615, %__nv_exp2f.exit1219 ], [ %4912, %._crit_edge ] + %.pn6441687 = phi i32 [ %9614, %__nv_exp2f.exit1219 ], [ %4911, %._crit_edge ] + %.pn6481686 = phi i32 [ %9613, %__nv_exp2f.exit1219 ], [ %4910, %._crit_edge ] + %.pn6521685 = phi i32 [ %9612, %__nv_exp2f.exit1219 ], [ %4909, %._crit_edge ] + %.pn6561684 = phi i32 [ %9611, %__nv_exp2f.exit1219 ], [ %4908, %._crit_edge ] + %.pn6601683 = phi i32 [ %9610, %__nv_exp2f.exit1219 ], [ %4907, %._crit_edge ] + %.pn6641682 = phi i32 [ %9609, %__nv_exp2f.exit1219 ], [ %4906, %._crit_edge ] + %.pn5841681 = phi ptr addrspace(1) [ %9608, %__nv_exp2f.exit1219 ], [ %7731, %._crit_edge ] + %.pn6001680 = phi ptr addrspace(1) [ %9607, %__nv_exp2f.exit1219 ], [ %7730, %._crit_edge ] + %.pn6161679 = phi ptr addrspace(1) [ %9606, %__nv_exp2f.exit1219 ], [ %7729, %._crit_edge ] + %.pn6321678 = phi ptr addrspace(1) [ %9605, %__nv_exp2f.exit1219 ], [ %7728, %._crit_edge ] + %.pn5201677 = phi ptr addrspace(1) [ %9602, %__nv_exp2f.exit1219 ], [ %7727, %._crit_edge ] + %.pn5361676 = phi ptr addrspace(1) [ %9601, %__nv_exp2f.exit1219 ], [ %7726, %._crit_edge ] + %.pn5521675 = phi ptr addrspace(1) [ %9600, %__nv_exp2f.exit1219 ], [ %7725, %._crit_edge ] + %.pn5681674 = phi ptr addrspace(1) [ %9599, %__nv_exp2f.exit1219 ], [ %7724, %._crit_edge ] + %.pn3781673 = phi float [ %8816, %__nv_exp2f.exit1219 ], [ %7627, %._crit_edge ] + %.pn3801672 = phi float [ %8815, %__nv_exp2f.exit1219 ], [ %7626, %._crit_edge ] + %.pn3821671 = phi float [ %8814, %__nv_exp2f.exit1219 ], [ %7625, %._crit_edge ] + %.pn3841670 = phi float [ %8813, %__nv_exp2f.exit1219 ], [ %7624, %._crit_edge ] + %.pn3861669 = phi float [ %8812, %__nv_exp2f.exit1219 ], [ %7623, %._crit_edge ] + %.pn3881668 = phi float [ %8811, %__nv_exp2f.exit1219 ], [ %7622, %._crit_edge ] + %.pn3901667 = phi float [ %8810, %__nv_exp2f.exit1219 ], [ %7621, %._crit_edge ] + %.pn3921666 = phi float [ %8809, %__nv_exp2f.exit1219 ], [ %7620, %._crit_edge ] + %.pn3941665 = phi float [ %8808, %__nv_exp2f.exit1219 ], [ %7619, %._crit_edge ] + %.pn3961664 = phi float [ %8807, %__nv_exp2f.exit1219 ], [ %7618, %._crit_edge ] + %.pn3981663 = phi float [ %8806, %__nv_exp2f.exit1219 ], [ %7617, %._crit_edge ] + %.pn4001662 = phi float [ %8805, %__nv_exp2f.exit1219 ], [ %7616, %._crit_edge ] + %.pn4021661 = phi float [ %8804, %__nv_exp2f.exit1219 ], [ %7615, %._crit_edge ] + %.pn4041660 = phi float [ %8803, %__nv_exp2f.exit1219 ], [ %7614, %._crit_edge ] + %.pn4061659 = phi float [ %8802, %__nv_exp2f.exit1219 ], [ %7613, %._crit_edge ] + %.pn4081658 = phi float [ %8801, %__nv_exp2f.exit1219 ], [ %7612, %._crit_edge ] + %.pn4101657 = phi float [ %8800, %__nv_exp2f.exit1219 ], [ %7611, %._crit_edge ] + %.pn4121656 = phi float [ %8799, %__nv_exp2f.exit1219 ], [ %7610, %._crit_edge ] + %.pn4141655 = phi float [ %8798, %__nv_exp2f.exit1219 ], [ %7609, %._crit_edge ] + %.pn4161654 = phi float [ %8797, %__nv_exp2f.exit1219 ], [ %7608, %._crit_edge ] + %.pn4181653 = phi float [ %8796, %__nv_exp2f.exit1219 ], [ %7607, %._crit_edge ] + %.pn4201652 = phi float [ %8795, %__nv_exp2f.exit1219 ], [ %7606, %._crit_edge ] + %.pn4221651 = phi float [ %8794, %__nv_exp2f.exit1219 ], [ %7605, %._crit_edge ] + %.pn4241650 = phi float [ %8793, %__nv_exp2f.exit1219 ], [ %7604, %._crit_edge ] + %.pn4261649 = phi float [ %8792, %__nv_exp2f.exit1219 ], [ %7603, %._crit_edge ] + %.pn4281648 = phi float [ %8791, %__nv_exp2f.exit1219 ], [ %7602, %._crit_edge ] + %.pn4301647 = phi float [ %8790, %__nv_exp2f.exit1219 ], [ %7601, %._crit_edge ] + %.pn4321646 = phi float [ %8789, %__nv_exp2f.exit1219 ], [ %7600, %._crit_edge ] + %.pn4341645 = phi float [ %8788, %__nv_exp2f.exit1219 ], [ %7599, %._crit_edge ] + %.pn4361644 = phi float [ %8787, %__nv_exp2f.exit1219 ], [ %7598, %._crit_edge ] + %.pn4381643 = phi float [ %8786, %__nv_exp2f.exit1219 ], [ %7597, %._crit_edge ] + %.pn4401642 = phi float [ %8785, %__nv_exp2f.exit1219 ], [ %7596, %._crit_edge ] + %.pn4421641 = phi float [ %8784, %__nv_exp2f.exit1219 ], [ %7595, %._crit_edge ] + %.pn4441640 = phi float [ %8783, %__nv_exp2f.exit1219 ], [ %7594, %._crit_edge ] + %.pn4461639 = phi float [ %8782, %__nv_exp2f.exit1219 ], [ %7593, %._crit_edge ] + %.pn4481638 = phi float [ %8781, %__nv_exp2f.exit1219 ], [ %7592, %._crit_edge ] + %.pn4501637 = phi float [ %8780, %__nv_exp2f.exit1219 ], [ %7591, %._crit_edge ] + %.pn4521636 = phi float [ %8779, %__nv_exp2f.exit1219 ], [ %7590, %._crit_edge ] + %.pn4541635 = phi float [ %8778, %__nv_exp2f.exit1219 ], [ %7589, %._crit_edge ] + %.pn4561634 = phi float [ %8777, %__nv_exp2f.exit1219 ], [ %7588, %._crit_edge ] + %.pn4581633 = phi float [ %8776, %__nv_exp2f.exit1219 ], [ %7587, %._crit_edge ] + %.pn4601632 = phi float [ %8775, %__nv_exp2f.exit1219 ], [ %7586, %._crit_edge ] + %.pn4621631 = phi float [ %8774, %__nv_exp2f.exit1219 ], [ %7585, %._crit_edge ] + %.pn4641630 = phi float [ %8773, %__nv_exp2f.exit1219 ], [ %7584, %._crit_edge ] + %.pn4661629 = phi float [ %8772, %__nv_exp2f.exit1219 ], [ %7583, %._crit_edge ] + %.pn4681628 = phi float [ %8771, %__nv_exp2f.exit1219 ], [ %7582, %._crit_edge ] + %.pn4701627 = phi float [ %8770, %__nv_exp2f.exit1219 ], [ %7581, %._crit_edge ] + %.pn4721626 = phi float [ %8769, %__nv_exp2f.exit1219 ], [ %7580, %._crit_edge ] + %.pn4741625 = phi float [ %8768, %__nv_exp2f.exit1219 ], [ %7579, %._crit_edge ] + %.pn4761624 = phi float [ %8767, %__nv_exp2f.exit1219 ], [ %7578, %._crit_edge ] + %.pn4781623 = phi float [ %8766, %__nv_exp2f.exit1219 ], [ %7577, %._crit_edge ] + %.pn4801622 = phi float [ %8765, %__nv_exp2f.exit1219 ], [ %7576, %._crit_edge ] + %.pn4821621 = phi float [ %8764, %__nv_exp2f.exit1219 ], [ %7575, %._crit_edge ] + %.pn4841620 = phi float [ %8763, %__nv_exp2f.exit1219 ], [ %7574, %._crit_edge ] + %.pn4861619 = phi float [ %8762, %__nv_exp2f.exit1219 ], [ %7573, %._crit_edge ] + %.pn4881618 = phi float [ %8761, %__nv_exp2f.exit1219 ], [ %7572, %._crit_edge ] + %.pn4901617 = phi float [ %8760, %__nv_exp2f.exit1219 ], [ %7571, %._crit_edge ] + %.pn4921616 = phi float [ %8759, %__nv_exp2f.exit1219 ], [ %7570, %._crit_edge ] + %.pn4941615 = phi float [ %8758, %__nv_exp2f.exit1219 ], [ %7569, %._crit_edge ] + %.pn4961614 = phi float [ %8757, %__nv_exp2f.exit1219 ], [ %7568, %._crit_edge ] + %.pn4981613 = phi float [ %8756, %__nv_exp2f.exit1219 ], [ %7567, %._crit_edge ] + %.pn5001612 = phi float [ %8755, %__nv_exp2f.exit1219 ], [ %7566, %._crit_edge ] + %.pn5021611 = phi float [ %8754, %__nv_exp2f.exit1219 ], [ %7565, %._crit_edge ] + %.pn5041610 = phi float [ %8753, %__nv_exp2f.exit1219 ], [ %7564, %._crit_edge ] + %.pn2501609 = phi float [ %9576, %__nv_exp2f.exit1219 ], [ %7691, %._crit_edge ] + %.pn2521608 = phi float [ %9575, %__nv_exp2f.exit1219 ], [ %7690, %._crit_edge ] + %.pn2541607 = phi float [ %9574, %__nv_exp2f.exit1219 ], [ %7689, %._crit_edge ] + %.pn2561606 = phi float [ %9573, %__nv_exp2f.exit1219 ], [ %7688, %._crit_edge ] + %.pn2581605 = phi float [ %9572, %__nv_exp2f.exit1219 ], [ %7687, %._crit_edge ] + %.pn2601604 = phi float [ %9571, %__nv_exp2f.exit1219 ], [ %7686, %._crit_edge ] + %.pn2621603 = phi float [ %9570, %__nv_exp2f.exit1219 ], [ %7685, %._crit_edge ] + %.pn2641602 = phi float [ %9569, %__nv_exp2f.exit1219 ], [ %7684, %._crit_edge ] + %.pn2661601 = phi float [ %9568, %__nv_exp2f.exit1219 ], [ %7683, %._crit_edge ] + %.pn2681600 = phi float [ %9567, %__nv_exp2f.exit1219 ], [ %7682, %._crit_edge ] + %.pn2701599 = phi float [ %9566, %__nv_exp2f.exit1219 ], [ %7681, %._crit_edge ] + %.pn2721598 = phi float [ %9565, %__nv_exp2f.exit1219 ], [ %7680, %._crit_edge ] + %.pn2741597 = phi float [ %9564, %__nv_exp2f.exit1219 ], [ %7679, %._crit_edge ] + %.pn2761596 = phi float [ %9563, %__nv_exp2f.exit1219 ], [ %7678, %._crit_edge ] + %.pn2781595 = phi float [ %9562, %__nv_exp2f.exit1219 ], [ %7677, %._crit_edge ] + %.pn2801594 = phi float [ %9561, %__nv_exp2f.exit1219 ], [ %7676, %._crit_edge ] + %.pn2821593 = phi float [ %9560, %__nv_exp2f.exit1219 ], [ %7675, %._crit_edge ] + %.pn2841592 = phi float [ %9559, %__nv_exp2f.exit1219 ], [ %7674, %._crit_edge ] + %.pn2861591 = phi float [ %9558, %__nv_exp2f.exit1219 ], [ %7673, %._crit_edge ] + %.pn2881590 = phi float [ %9557, %__nv_exp2f.exit1219 ], [ %7672, %._crit_edge ] + %.pn2901589 = phi float [ %9556, %__nv_exp2f.exit1219 ], [ %7671, %._crit_edge ] + %.pn2921588 = phi float [ %9555, %__nv_exp2f.exit1219 ], [ %7670, %._crit_edge ] + %.pn2941587 = phi float [ %9554, %__nv_exp2f.exit1219 ], [ %7669, %._crit_edge ] + %.pn2961586 = phi float [ %9553, %__nv_exp2f.exit1219 ], [ %7668, %._crit_edge ] + %.pn2981585 = phi float [ %9552, %__nv_exp2f.exit1219 ], [ %7667, %._crit_edge ] + %.pn3001584 = phi float [ %9551, %__nv_exp2f.exit1219 ], [ %7666, %._crit_edge ] + %.pn3021583 = phi float [ %9550, %__nv_exp2f.exit1219 ], [ %7665, %._crit_edge ] + %.pn3041582 = phi float [ %9549, %__nv_exp2f.exit1219 ], [ %7664, %._crit_edge ] + %.pn3061581 = phi float [ %9548, %__nv_exp2f.exit1219 ], [ %7663, %._crit_edge ] + %.pn3081580 = phi float [ %9547, %__nv_exp2f.exit1219 ], [ %7662, %._crit_edge ] + %.pn3101579 = phi float [ %9546, %__nv_exp2f.exit1219 ], [ %7661, %._crit_edge ] + %.pn3121578 = phi float [ %9545, %__nv_exp2f.exit1219 ], [ %7660, %._crit_edge ] + %.pn3141577 = phi float [ %9544, %__nv_exp2f.exit1219 ], [ %7659, %._crit_edge ] + %.pn3161576 = phi float [ %9543, %__nv_exp2f.exit1219 ], [ %7658, %._crit_edge ] + %.pn3181575 = phi float [ %9542, %__nv_exp2f.exit1219 ], [ %7657, %._crit_edge ] + %.pn3201574 = phi float [ %9541, %__nv_exp2f.exit1219 ], [ %7656, %._crit_edge ] + %.pn3221573 = phi float [ %9540, %__nv_exp2f.exit1219 ], [ %7655, %._crit_edge ] + %.pn3241572 = phi float [ %9539, %__nv_exp2f.exit1219 ], [ %7654, %._crit_edge ] + %.pn3261571 = phi float [ %9538, %__nv_exp2f.exit1219 ], [ %7653, %._crit_edge ] + %.pn3281570 = phi float [ %9537, %__nv_exp2f.exit1219 ], [ %7652, %._crit_edge ] + %.pn3301569 = phi float [ %9536, %__nv_exp2f.exit1219 ], [ %7651, %._crit_edge ] + %.pn3321568 = phi float [ %9535, %__nv_exp2f.exit1219 ], [ %7650, %._crit_edge ] + %.pn3341567 = phi float [ %9534, %__nv_exp2f.exit1219 ], [ %7649, %._crit_edge ] + %.pn3361566 = phi float [ %9533, %__nv_exp2f.exit1219 ], [ %7648, %._crit_edge ] + %.pn3381565 = phi float [ %9532, %__nv_exp2f.exit1219 ], [ %7647, %._crit_edge ] + %.pn3401564 = phi float [ %9531, %__nv_exp2f.exit1219 ], [ %7646, %._crit_edge ] + %.pn3421563 = phi float [ %9530, %__nv_exp2f.exit1219 ], [ %7645, %._crit_edge ] + %.pn3441562 = phi float [ %9529, %__nv_exp2f.exit1219 ], [ %7644, %._crit_edge ] + %.pn3461561 = phi float [ %9528, %__nv_exp2f.exit1219 ], [ %7643, %._crit_edge ] + %.pn3481560 = phi float [ %9527, %__nv_exp2f.exit1219 ], [ %7642, %._crit_edge ] + %.pn3501559 = phi float [ %9526, %__nv_exp2f.exit1219 ], [ %7641, %._crit_edge ] + %.pn3521558 = phi float [ %9525, %__nv_exp2f.exit1219 ], [ %7640, %._crit_edge ] + %.pn3541557 = phi float [ %9524, %__nv_exp2f.exit1219 ], [ %7639, %._crit_edge ] + %.pn3561556 = phi float [ %9523, %__nv_exp2f.exit1219 ], [ %7638, %._crit_edge ] + %.pn3581555 = phi float [ %9522, %__nv_exp2f.exit1219 ], [ %7637, %._crit_edge ] + %.pn3601554 = phi float [ %9521, %__nv_exp2f.exit1219 ], [ %7636, %._crit_edge ] + %.pn3621553 = phi float [ %9520, %__nv_exp2f.exit1219 ], [ %7635, %._crit_edge ] + %.pn3641552 = phi float [ %9519, %__nv_exp2f.exit1219 ], [ %7634, %._crit_edge ] + %.pn3661551 = phi float [ %9518, %__nv_exp2f.exit1219 ], [ %7633, %._crit_edge ] + %.pn3681550 = phi float [ %9517, %__nv_exp2f.exit1219 ], [ %7632, %._crit_edge ] + %.pn3701549 = phi float [ %9516, %__nv_exp2f.exit1219 ], [ %7631, %._crit_edge ] + %.pn3721548 = phi float [ %9515, %__nv_exp2f.exit1219 ], [ %7630, %._crit_edge ] + %.pn3741547 = phi float [ %9514, %__nv_exp2f.exit1219 ], [ %7629, %._crit_edge ] + %.pn3761546 = phi float [ %9513, %__nv_exp2f.exit1219 ], [ %7628, %._crit_edge ] + %7752 = phi i32 [ %9577, %__nv_exp2f.exit1219 ], [ 0, %._crit_edge ] + %7753 = icmp slt i32 %7752, %4924, !dbg !297 + %7754 = icmp slt i32 %7752, %4925, !dbg !297 + %7755 = add i32 %7748, 1, !dbg !297 + %7756 = icmp sgt i32 %7755, 1, !dbg !297 + %7757 = select i1 %7756, i32 0, i32 %7755, !dbg !297 + %7758 = add i32 %7750, 1, !dbg !297 + %7759 = icmp sgt i32 %7758, 2, !dbg !297 + %7760 = select i1 %7759, i32 0, i32 %7758, !dbg !297 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !290 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + %7761 = shl i32 %7760, 13, !dbg !290 + %7762 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %7761, !dbg !290 + %7763 = shl i32 %7757, 6, !dbg !292 + %7764 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %7763, !dbg !292 + %7765 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4812, !dbg !292 + %7766 = load float, ptr addrspace(3) %7765, align 8, !dbg !292 + %7767 = getelementptr inbounds nuw i8, ptr addrspace(3) %7765, i32 4, !dbg !292 + %7768 = load float, ptr addrspace(3) %7767, align 4, !dbg !292 + %7769 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4815, !dbg !292 + %7770 = load float, ptr addrspace(3) %7769, align 8, !dbg !292 + %7771 = getelementptr inbounds nuw i8, ptr addrspace(3) %7769, i32 4, !dbg !292 + %7772 = load float, ptr addrspace(3) %7771, align 4, !dbg !292 + %7773 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4817, !dbg !292 + %7774 = load float, ptr addrspace(3) %7773, align 8, !dbg !292 + %7775 = getelementptr inbounds nuw i8, ptr addrspace(3) %7773, i32 4, !dbg !292 + %7776 = load float, ptr addrspace(3) %7775, align 4, !dbg !292 + %7777 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4819, !dbg !292 + %7778 = load float, ptr addrspace(3) %7777, align 8, !dbg !292 + %7779 = getelementptr inbounds nuw i8, ptr addrspace(3) %7777, i32 4, !dbg !292 + %7780 = load float, ptr addrspace(3) %7779, align 4, !dbg !292 + %7781 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4821, !dbg !292 + %7782 = load float, ptr addrspace(3) %7781, align 8, !dbg !292 + %7783 = getelementptr inbounds nuw i8, ptr addrspace(3) %7781, i32 4, !dbg !292 + %7784 = load float, ptr addrspace(3) %7783, align 4, !dbg !292 + %7785 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4823, !dbg !292 + %7786 = load float, ptr addrspace(3) %7785, align 8, !dbg !292 + %7787 = getelementptr inbounds nuw i8, ptr addrspace(3) %7785, i32 4, !dbg !292 + %7788 = load float, ptr addrspace(3) %7787, align 4, !dbg !292 + %7789 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4825, !dbg !292 + %7790 = load float, ptr addrspace(3) %7789, align 8, !dbg !292 + %7791 = getelementptr inbounds nuw i8, ptr addrspace(3) %7789, i32 4, !dbg !292 + %7792 = load float, ptr addrspace(3) %7791, align 4, !dbg !292 + %7793 = getelementptr inbounds nuw i8, ptr addrspace(3) %7764, i32 %4827, !dbg !292 + %7794 = load float, ptr addrspace(3) %7793, align 8, !dbg !292 + %7795 = getelementptr inbounds nuw i8, ptr addrspace(3) %7793, i32 4, !dbg !292 + %7796 = load float, ptr addrspace(3) %7795, align 4, !dbg !292 + %7797 = fcmp oeq float %7766, 0xFFF0000000000000, !dbg !298 + %7798 = fcmp oeq float %7768, 0xFFF0000000000000, !dbg !298 + %7799 = fcmp oeq float %7770, 0xFFF0000000000000, !dbg !298 + %7800 = fcmp oeq float %7772, 0xFFF0000000000000, !dbg !298 + %7801 = fcmp oeq float %7774, 0xFFF0000000000000, !dbg !298 + %7802 = fcmp oeq float %7776, 0xFFF0000000000000, !dbg !298 + %7803 = fcmp oeq float %7778, 0xFFF0000000000000, !dbg !298 + %7804 = fcmp oeq float %7780, 0xFFF0000000000000, !dbg !298 + %7805 = fcmp oeq float %7782, 0xFFF0000000000000, !dbg !298 + %7806 = fcmp oeq float %7784, 0xFFF0000000000000, !dbg !298 + %7807 = fcmp oeq float %7786, 0xFFF0000000000000, !dbg !298 + %7808 = fcmp oeq float %7788, 0xFFF0000000000000, !dbg !298 + %7809 = fcmp oeq float %7790, 0xFFF0000000000000, !dbg !298 + %7810 = fcmp oeq float %7792, 0xFFF0000000000000, !dbg !298 + %7811 = fcmp oeq float %7794, 0xFFF0000000000000, !dbg !298 + %7812 = fcmp oeq float %7796, 0xFFF0000000000000, !dbg !298 + %7813 = select i1 %7797, float 0.000000e+00, float %7766, !dbg !299 + %7814 = select i1 %7798, float 0.000000e+00, float %7768, !dbg !299 + %7815 = select i1 %7799, float 0.000000e+00, float %7770, !dbg !299 + %7816 = select i1 %7800, float 0.000000e+00, float %7772, !dbg !299 + %7817 = select i1 %7801, float 0.000000e+00, float %7774, !dbg !299 + %7818 = select i1 %7802, float 0.000000e+00, float %7776, !dbg !299 + %7819 = select i1 %7803, float 0.000000e+00, float %7778, !dbg !299 + %7820 = select i1 %7804, float 0.000000e+00, float %7780, !dbg !299 + %7821 = select i1 %7805, float 0.000000e+00, float %7782, !dbg !299 + %7822 = select i1 %7806, float 0.000000e+00, float %7784, !dbg !299 + %7823 = select i1 %7807, float 0.000000e+00, float %7786, !dbg !299 + %7824 = select i1 %7808, float 0.000000e+00, float %7788, !dbg !299 + %7825 = select i1 %7809, float 0.000000e+00, float %7790, !dbg !299 + %7826 = select i1 %7810, float 0.000000e+00, float %7792, !dbg !299 + %7827 = select i1 %7811, float 0.000000e+00, float %7794, !dbg !299 + %7828 = select i1 %7812, float 0.000000e+00, float %7796, !dbg !299 + %7829 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %36, i32 0, i32 31), !dbg !300 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !300 + %7830 = shl i32 %7829, 11, !dbg !300 + %7831 = and i32 %7830, 8192, !dbg !300 + %7832 = add i32 %7831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7833 = lshr exact i32 %7832, 4, !dbg !300 + %7834 = and i32 %7833, 16383, !dbg !300 + %7835 = zext nneg i32 %7834 to i64, !dbg !300 + %7836 = or disjoint i64 %7835, 4611686293372403712, !dbg !300 + %7837 = ptrtoint ptr addrspace(3) %7762 to i32, !dbg !300 + %7838 = lshr exact i32 %7837, 4, !dbg !300 + %7839 = and i32 %7838, 16383, !dbg !300 + %7840 = zext nneg i32 %7839 to i64, !dbg !300 + %7841 = or disjoint i64 %7840, 4611686293338849280, !dbg !300 + %7842 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %7836, i64 %7841) #3, !dbg !300 + %7843 = or disjoint i32 %7831, 32, !dbg !300 + %7844 = add i32 %7843, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7845 = lshr exact i32 %7844, 4, !dbg !300 + %7846 = and i32 %7845, 16383, !dbg !300 + %7847 = zext nneg i32 %7846 to i64, !dbg !300 + %7848 = or disjoint i64 %7847, 4611686293372403712, !dbg !300 + %7849 = add i32 %7837, 32, !dbg !300 + %7850 = lshr exact i32 %7849, 4, !dbg !300 + %7851 = and i32 %7850, 16383, !dbg !300 + %7852 = zext nneg i32 %7851 to i64, !dbg !300 + %7853 = or disjoint i64 %7852, 4611686293338849280, !dbg !300 + %7854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 0, !dbg !300 + %7855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 1, !dbg !300 + %7856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 2, !dbg !300 + %7857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 3, !dbg !300 + %7858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 4, !dbg !300 + %7859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 5, !dbg !300 + %7860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 6, !dbg !300 + %7861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 7, !dbg !300 + %7862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 8, !dbg !300 + %7863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 9, !dbg !300 + %7864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 10, !dbg !300 + %7865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 11, !dbg !300 + %7866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 12, !dbg !300 + %7867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 13, !dbg !300 + %7868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 14, !dbg !300 + %7869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 15, !dbg !300 + %7870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 16, !dbg !300 + %7871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 17, !dbg !300 + %7872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 18, !dbg !300 + %7873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 19, !dbg !300 + %7874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 20, !dbg !300 + %7875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 21, !dbg !300 + %7876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 22, !dbg !300 + %7877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 23, !dbg !300 + %7878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 24, !dbg !300 + %7879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 25, !dbg !300 + %7880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 26, !dbg !300 + %7881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 27, !dbg !300 + %7882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 28, !dbg !300 + %7883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 29, !dbg !300 + %7884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 30, !dbg !300 + %7885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7842, 31, !dbg !300 + %7886 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7854, float %7855, float %7856, float %7857, float %7858, float %7859, float %7860, float %7861, float %7862, float %7863, float %7864, float %7865, float %7866, float %7867, float %7868, float %7869, float %7870, float %7871, float %7872, float %7873, float %7874, float %7875, float %7876, float %7877, float %7878, float %7879, float %7880, float %7881, float %7882, float %7883, float %7884, float %7885, i64 %7848, i64 %7853, i1 true) #3, !dbg !300 + %7887 = or disjoint i32 %7831, 64, !dbg !300 + %7888 = add i32 %7887, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7889 = lshr exact i32 %7888, 4, !dbg !300 + %7890 = and i32 %7889, 16383, !dbg !300 + %7891 = zext nneg i32 %7890 to i64, !dbg !300 + %7892 = or disjoint i64 %7891, 4611686293372403712, !dbg !300 + %7893 = add i32 %7837, 64, !dbg !300 + %7894 = lshr exact i32 %7893, 4, !dbg !300 + %7895 = and i32 %7894, 16383, !dbg !300 + %7896 = zext nneg i32 %7895 to i64, !dbg !300 + %7897 = or disjoint i64 %7896, 4611686293338849280, !dbg !300 + %7898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 0, !dbg !300 + %7899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 1, !dbg !300 + %7900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 2, !dbg !300 + %7901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 3, !dbg !300 + %7902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 4, !dbg !300 + %7903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 5, !dbg !300 + %7904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 6, !dbg !300 + %7905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 7, !dbg !300 + %7906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 8, !dbg !300 + %7907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 9, !dbg !300 + %7908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 10, !dbg !300 + %7909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 11, !dbg !300 + %7910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 12, !dbg !300 + %7911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 13, !dbg !300 + %7912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 14, !dbg !300 + %7913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 15, !dbg !300 + %7914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 16, !dbg !300 + %7915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 17, !dbg !300 + %7916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 18, !dbg !300 + %7917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 19, !dbg !300 + %7918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 20, !dbg !300 + %7919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 21, !dbg !300 + %7920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 22, !dbg !300 + %7921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 23, !dbg !300 + %7922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 24, !dbg !300 + %7923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 25, !dbg !300 + %7924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 26, !dbg !300 + %7925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 27, !dbg !300 + %7926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 28, !dbg !300 + %7927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 29, !dbg !300 + %7928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 30, !dbg !300 + %7929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7886, 31, !dbg !300 + %7930 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7898, float %7899, float %7900, float %7901, float %7902, float %7903, float %7904, float %7905, float %7906, float %7907, float %7908, float %7909, float %7910, float %7911, float %7912, float %7913, float %7914, float %7915, float %7916, float %7917, float %7918, float %7919, float %7920, float %7921, float %7922, float %7923, float %7924, float %7925, float %7926, float %7927, float %7928, float %7929, i64 %7892, i64 %7897, i1 true) #3, !dbg !300 + %7931 = or disjoint i32 %7831, 96, !dbg !300 + %7932 = add i32 %7931, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7933 = lshr exact i32 %7932, 4, !dbg !300 + %7934 = and i32 %7933, 16383, !dbg !300 + %7935 = zext nneg i32 %7934 to i64, !dbg !300 + %7936 = or disjoint i64 %7935, 4611686293372403712, !dbg !300 + %7937 = add i32 %7837, 96, !dbg !300 + %7938 = lshr exact i32 %7937, 4, !dbg !300 + %7939 = and i32 %7938, 16383, !dbg !300 + %7940 = zext nneg i32 %7939 to i64, !dbg !300 + %7941 = or disjoint i64 %7940, 4611686293338849280, !dbg !300 + %7942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 0, !dbg !300 + %7943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 1, !dbg !300 + %7944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 2, !dbg !300 + %7945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 3, !dbg !300 + %7946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 4, !dbg !300 + %7947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 5, !dbg !300 + %7948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 6, !dbg !300 + %7949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 7, !dbg !300 + %7950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 8, !dbg !300 + %7951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 9, !dbg !300 + %7952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 10, !dbg !300 + %7953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 11, !dbg !300 + %7954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 12, !dbg !300 + %7955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 13, !dbg !300 + %7956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 14, !dbg !300 + %7957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 15, !dbg !300 + %7958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 16, !dbg !300 + %7959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 17, !dbg !300 + %7960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 18, !dbg !300 + %7961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 19, !dbg !300 + %7962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 20, !dbg !300 + %7963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 21, !dbg !300 + %7964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 22, !dbg !300 + %7965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 23, !dbg !300 + %7966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 24, !dbg !300 + %7967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 25, !dbg !300 + %7968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 26, !dbg !300 + %7969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 27, !dbg !300 + %7970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 28, !dbg !300 + %7971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 29, !dbg !300 + %7972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 30, !dbg !300 + %7973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7930, 31, !dbg !300 + %7974 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7942, float %7943, float %7944, float %7945, float %7946, float %7947, float %7948, float %7949, float %7950, float %7951, float %7952, float %7953, float %7954, float %7955, float %7956, float %7957, float %7958, float %7959, float %7960, float %7961, float %7962, float %7963, float %7964, float %7965, float %7966, float %7967, float %7968, float %7969, float %7970, float %7971, float %7972, float %7973, i64 %7936, i64 %7941, i1 true) #3, !dbg !300 + %7975 = or disjoint i32 %7831, 16384, !dbg !300 + %7976 = add i32 %7975, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %7977 = lshr exact i32 %7976, 4, !dbg !300 + %7978 = and i32 %7977, 16383, !dbg !300 + %7979 = zext nneg i32 %7978 to i64, !dbg !300 + %7980 = or disjoint i64 %7979, 4611686293372403712, !dbg !300 + %7981 = add i32 %7837, 8192, !dbg !300 + %7982 = lshr exact i32 %7981, 4, !dbg !300 + %7983 = and i32 %7982, 16383, !dbg !300 + %7984 = zext nneg i32 %7983 to i64, !dbg !300 + %7985 = or disjoint i64 %7984, 4611686293338849280, !dbg !300 + %7986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 0, !dbg !300 + %7987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 1, !dbg !300 + %7988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 2, !dbg !300 + %7989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 3, !dbg !300 + %7990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 4, !dbg !300 + %7991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 5, !dbg !300 + %7992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 6, !dbg !300 + %7993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 7, !dbg !300 + %7994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 8, !dbg !300 + %7995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 9, !dbg !300 + %7996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 10, !dbg !300 + %7997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 11, !dbg !300 + %7998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 12, !dbg !300 + %7999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 13, !dbg !300 + %8000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 14, !dbg !300 + %8001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 15, !dbg !300 + %8002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 16, !dbg !300 + %8003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 17, !dbg !300 + %8004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 18, !dbg !300 + %8005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 19, !dbg !300 + %8006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 20, !dbg !300 + %8007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 21, !dbg !300 + %8008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 22, !dbg !300 + %8009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 23, !dbg !300 + %8010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 24, !dbg !300 + %8011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 25, !dbg !300 + %8012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 26, !dbg !300 + %8013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 27, !dbg !300 + %8014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 28, !dbg !300 + %8015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 29, !dbg !300 + %8016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 30, !dbg !300 + %8017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7974, 31, !dbg !300 + %8018 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7986, float %7987, float %7988, float %7989, float %7990, float %7991, float %7992, float %7993, float %7994, float %7995, float %7996, float %7997, float %7998, float %7999, float %8000, float %8001, float %8002, float %8003, float %8004, float %8005, float %8006, float %8007, float %8008, float %8009, float %8010, float %8011, float %8012, float %8013, float %8014, float %8015, float %8016, float %8017, i64 %7980, i64 %7985, i1 true) #3, !dbg !300 + %8019 = or disjoint i32 %7831, 16416, !dbg !300 + %8020 = add i32 %8019, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8021 = lshr exact i32 %8020, 4, !dbg !300 + %8022 = and i32 %8021, 16383, !dbg !300 + %8023 = zext nneg i32 %8022 to i64, !dbg !300 + %8024 = or disjoint i64 %8023, 4611686293372403712, !dbg !300 + %8025 = add i32 %7837, 8224, !dbg !300 + %8026 = lshr exact i32 %8025, 4, !dbg !300 + %8027 = and i32 %8026, 16383, !dbg !300 + %8028 = zext nneg i32 %8027 to i64, !dbg !300 + %8029 = or disjoint i64 %8028, 4611686293338849280, !dbg !300 + %8030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 0, !dbg !300 + %8031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 1, !dbg !300 + %8032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 2, !dbg !300 + %8033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 3, !dbg !300 + %8034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 4, !dbg !300 + %8035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 5, !dbg !300 + %8036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 6, !dbg !300 + %8037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 7, !dbg !300 + %8038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 8, !dbg !300 + %8039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 9, !dbg !300 + %8040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 10, !dbg !300 + %8041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 11, !dbg !300 + %8042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 12, !dbg !300 + %8043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 13, !dbg !300 + %8044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 14, !dbg !300 + %8045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 15, !dbg !300 + %8046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 16, !dbg !300 + %8047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 17, !dbg !300 + %8048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 18, !dbg !300 + %8049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 19, !dbg !300 + %8050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 20, !dbg !300 + %8051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 21, !dbg !300 + %8052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 22, !dbg !300 + %8053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 23, !dbg !300 + %8054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 24, !dbg !300 + %8055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 25, !dbg !300 + %8056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 26, !dbg !300 + %8057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 27, !dbg !300 + %8058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 28, !dbg !300 + %8059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 29, !dbg !300 + %8060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 30, !dbg !300 + %8061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8018, 31, !dbg !300 + %8062 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8030, float %8031, float %8032, float %8033, float %8034, float %8035, float %8036, float %8037, float %8038, float %8039, float %8040, float %8041, float %8042, float %8043, float %8044, float %8045, float %8046, float %8047, float %8048, float %8049, float %8050, float %8051, float %8052, float %8053, float %8054, float %8055, float %8056, float %8057, float %8058, float %8059, float %8060, float %8061, i64 %8024, i64 %8029, i1 true) #3, !dbg !300 + %8063 = or disjoint i32 %7831, 16448, !dbg !300 + %8064 = add i32 %8063, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8065 = lshr exact i32 %8064, 4, !dbg !300 + %8066 = and i32 %8065, 16383, !dbg !300 + %8067 = zext nneg i32 %8066 to i64, !dbg !300 + %8068 = or disjoint i64 %8067, 4611686293372403712, !dbg !300 + %8069 = add i32 %7837, 8256, !dbg !300 + %8070 = lshr exact i32 %8069, 4, !dbg !300 + %8071 = and i32 %8070, 16383, !dbg !300 + %8072 = zext nneg i32 %8071 to i64, !dbg !300 + %8073 = or disjoint i64 %8072, 4611686293338849280, !dbg !300 + %8074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 0, !dbg !300 + %8075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 1, !dbg !300 + %8076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 2, !dbg !300 + %8077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 3, !dbg !300 + %8078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 4, !dbg !300 + %8079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 5, !dbg !300 + %8080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 6, !dbg !300 + %8081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 7, !dbg !300 + %8082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 8, !dbg !300 + %8083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 9, !dbg !300 + %8084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 10, !dbg !300 + %8085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 11, !dbg !300 + %8086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 12, !dbg !300 + %8087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 13, !dbg !300 + %8088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 14, !dbg !300 + %8089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 15, !dbg !300 + %8090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 16, !dbg !300 + %8091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 17, !dbg !300 + %8092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 18, !dbg !300 + %8093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 19, !dbg !300 + %8094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 20, !dbg !300 + %8095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 21, !dbg !300 + %8096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 22, !dbg !300 + %8097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 23, !dbg !300 + %8098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 24, !dbg !300 + %8099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 25, !dbg !300 + %8100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 26, !dbg !300 + %8101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 27, !dbg !300 + %8102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 28, !dbg !300 + %8103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 29, !dbg !300 + %8104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 30, !dbg !300 + %8105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8062, 31, !dbg !300 + %8106 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8074, float %8075, float %8076, float %8077, float %8078, float %8079, float %8080, float %8081, float %8082, float %8083, float %8084, float %8085, float %8086, float %8087, float %8088, float %8089, float %8090, float %8091, float %8092, float %8093, float %8094, float %8095, float %8096, float %8097, float %8098, float %8099, float %8100, float %8101, float %8102, float %8103, float %8104, float %8105, i64 %8068, i64 %8073, i1 true) #3, !dbg !300 + %8107 = or disjoint i32 %7831, 16480, !dbg !300 + %8108 = add i32 %8107, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328) to i32), !dbg !300 + %8109 = lshr exact i32 %8108, 4, !dbg !300 + %8110 = and i32 %8109, 16383, !dbg !300 + %8111 = zext nneg i32 %8110 to i64, !dbg !300 + %8112 = or disjoint i64 %8111, 4611686293372403712, !dbg !300 + %8113 = add i32 %7837, 8288, !dbg !300 + %8114 = lshr exact i32 %8113, 4, !dbg !300 + %8115 = and i32 %8114, 16383, !dbg !300 + %8116 = zext nneg i32 %8115 to i64, !dbg !300 + %8117 = or disjoint i64 %8116, 4611686293338849280, !dbg !300 + %8118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 0, !dbg !300 + %8119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 1, !dbg !300 + %8120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 2, !dbg !300 + %8121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 3, !dbg !300 + %8122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 4, !dbg !300 + %8123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 5, !dbg !300 + %8124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 6, !dbg !300 + %8125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 7, !dbg !300 + %8126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 8, !dbg !300 + %8127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 9, !dbg !300 + %8128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 10, !dbg !300 + %8129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 11, !dbg !300 + %8130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 12, !dbg !300 + %8131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 13, !dbg !300 + %8132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 14, !dbg !300 + %8133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 15, !dbg !300 + %8134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 16, !dbg !300 + %8135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 17, !dbg !300 + %8136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 18, !dbg !300 + %8137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 19, !dbg !300 + %8138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 20, !dbg !300 + %8139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 21, !dbg !300 + %8140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 22, !dbg !300 + %8141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 23, !dbg !300 + %8142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 24, !dbg !300 + %8143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 25, !dbg !300 + %8144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 26, !dbg !300 + %8145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 27, !dbg !300 + %8146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 28, !dbg !300 + %8147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 29, !dbg !300 + %8148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 30, !dbg !300 + %8149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8106, 31, !dbg !300 + %8150 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8118, float %8119, float %8120, float %8121, float %8122, float %8123, float %8124, float %8125, float %8126, float %8127, float %8128, float %8129, float %8130, float %8131, float %8132, float %8133, float %8134, float %8135, float %8136, float %8137, float %8138, float %8139, float %8140, float %8141, float %8142, float %8143, float %8144, float %8145, float %8146, float %8147, float %8148, float %8149, i64 %8112, i64 %8117, i1 true) #3, !dbg !300 + %8151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 0, !dbg !300 + %8152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 1, !dbg !300 + %8153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 2, !dbg !300 + %8154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 3, !dbg !300 + %8155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 4, !dbg !300 + %8156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 5, !dbg !300 + %8157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 6, !dbg !300 + %8158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 7, !dbg !300 + %8159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 8, !dbg !300 + %8160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 9, !dbg !300 + %8161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 10, !dbg !300 + %8162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 11, !dbg !300 + %8163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 12, !dbg !300 + %8164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 13, !dbg !300 + %8165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 14, !dbg !300 + %8166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 15, !dbg !300 + %8167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 16, !dbg !300 + %8168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 17, !dbg !300 + %8169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 18, !dbg !300 + %8170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 19, !dbg !300 + %8171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 20, !dbg !300 + %8172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 21, !dbg !300 + %8173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 22, !dbg !300 + %8174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 23, !dbg !300 + %8175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 24, !dbg !300 + %8176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 25, !dbg !300 + %8177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 26, !dbg !300 + %8178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 27, !dbg !300 + %8179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 28, !dbg !300 + %8180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 29, !dbg !300 + %8181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 30, !dbg !300 + %8182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8150, 31, !dbg !300 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !300 + %8183 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %8151, float %8152, float %8153, float %8154, float %8155, float %8156, float %8157, float %8158, float %8159, float %8160, float %8161, float %8162, float %8163, float %8164, float %8165, float %8166, float %8167, float %8168, float %8169, float %8170, float %8171, float %8172, float %8173, float %8174, float %8175, float %8176, float %8177, float %8178, float %8179, float %8180, float %8181, float %8182, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 99328), i32 0, i32 0, ptr addrspace(3) %7762, i32 0, i32 0) #3, !dbg !300 + %8184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 0, !dbg !300 + %8185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 1, !dbg !300 + %8186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 2, !dbg !300 + %8187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 3, !dbg !300 + %8188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 4, !dbg !300 + %8189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 5, !dbg !300 + %8190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 6, !dbg !300 + %8191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 7, !dbg !300 + %8192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 8, !dbg !300 + %8193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 9, !dbg !300 + %8194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 10, !dbg !300 + %8195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 11, !dbg !300 + %8196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 12, !dbg !300 + %8197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 13, !dbg !300 + %8198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 14, !dbg !300 + %8199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 15, !dbg !300 + %8200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 16, !dbg !300 + %8201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 17, !dbg !300 + %8202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 18, !dbg !300 + %8203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 19, !dbg !300 + %8204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 20, !dbg !300 + %8205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 21, !dbg !300 + %8206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 22, !dbg !300 + %8207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 23, !dbg !300 + %8208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 24, !dbg !300 + %8209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 25, !dbg !300 + %8210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 26, !dbg !300 + %8211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 27, !dbg !300 + %8212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 28, !dbg !300 + %8213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 29, !dbg !300 + %8214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 30, !dbg !300 + %8215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8183, 31, !dbg !300 + %8216 = fmul float %8184, 0x3FB6A09E60000000, !dbg !301 + %8217 = fmul float %8185, 0x3FB6A09E60000000, !dbg !301 + %8218 = fmul float %8186, 0x3FB6A09E60000000, !dbg !301 + %8219 = fmul float %8187, 0x3FB6A09E60000000, !dbg !301 + %8220 = fmul float %8188, 0x3FB6A09E60000000, !dbg !301 + %8221 = fmul float %8189, 0x3FB6A09E60000000, !dbg !301 + %8222 = fmul float %8190, 0x3FB6A09E60000000, !dbg !301 + %8223 = fmul float %8191, 0x3FB6A09E60000000, !dbg !301 + %8224 = fmul float %8192, 0x3FB6A09E60000000, !dbg !301 + %8225 = fmul float %8193, 0x3FB6A09E60000000, !dbg !301 + %8226 = fmul float %8194, 0x3FB6A09E60000000, !dbg !301 + %8227 = fmul float %8195, 0x3FB6A09E60000000, !dbg !301 + %8228 = fmul float %8196, 0x3FB6A09E60000000, !dbg !301 + %8229 = fmul float %8197, 0x3FB6A09E60000000, !dbg !301 + %8230 = fmul float %8198, 0x3FB6A09E60000000, !dbg !301 + %8231 = fmul float %8199, 0x3FB6A09E60000000, !dbg !301 + %8232 = fmul float %8200, 0x3FB6A09E60000000, !dbg !301 + %8233 = fmul float %8201, 0x3FB6A09E60000000, !dbg !301 + %8234 = fmul float %8202, 0x3FB6A09E60000000, !dbg !301 + %8235 = fmul float %8203, 0x3FB6A09E60000000, !dbg !301 + %8236 = fmul float %8204, 0x3FB6A09E60000000, !dbg !301 + %8237 = fmul float %8205, 0x3FB6A09E60000000, !dbg !301 + %8238 = fmul float %8206, 0x3FB6A09E60000000, !dbg !301 + %8239 = fmul float %8207, 0x3FB6A09E60000000, !dbg !301 + %8240 = fmul float %8208, 0x3FB6A09E60000000, !dbg !301 + %8241 = fmul float %8209, 0x3FB6A09E60000000, !dbg !301 + %8242 = fmul float %8210, 0x3FB6A09E60000000, !dbg !301 + %8243 = fmul float %8211, 0x3FB6A09E60000000, !dbg !301 + %8244 = fmul float %8212, 0x3FB6A09E60000000, !dbg !301 + %8245 = fmul float %8213, 0x3FB6A09E60000000, !dbg !301 + %8246 = fmul float %8214, 0x3FB6A09E60000000, !dbg !301 + %8247 = fmul float %8215, 0x3FB6A09E60000000, !dbg !301 + %8248 = fmul float %8216, 0x3FF7154760000000, !dbg !302 + %8249 = fmul float %8217, 0x3FF7154760000000, !dbg !302 + %8250 = fmul float %8218, 0x3FF7154760000000, !dbg !302 + %8251 = fmul float %8219, 0x3FF7154760000000, !dbg !302 + %8252 = fmul float %8220, 0x3FF7154760000000, !dbg !302 + %8253 = fmul float %8221, 0x3FF7154760000000, !dbg !302 + %8254 = fmul float %8222, 0x3FF7154760000000, !dbg !302 + %8255 = fmul float %8223, 0x3FF7154760000000, !dbg !302 + %8256 = fmul float %8224, 0x3FF7154760000000, !dbg !302 + %8257 = fmul float %8225, 0x3FF7154760000000, !dbg !302 + %8258 = fmul float %8226, 0x3FF7154760000000, !dbg !302 + %8259 = fmul float %8227, 0x3FF7154760000000, !dbg !302 + %8260 = fmul float %8228, 0x3FF7154760000000, !dbg !302 + %8261 = fmul float %8229, 0x3FF7154760000000, !dbg !302 + %8262 = fmul float %8230, 0x3FF7154760000000, !dbg !302 + %8263 = fmul float %8231, 0x3FF7154760000000, !dbg !302 + %8264 = fmul float %8232, 0x3FF7154760000000, !dbg !302 + %8265 = fmul float %8233, 0x3FF7154760000000, !dbg !302 + %8266 = fmul float %8234, 0x3FF7154760000000, !dbg !302 + %8267 = fmul float %8235, 0x3FF7154760000000, !dbg !302 + %8268 = fmul float %8236, 0x3FF7154760000000, !dbg !302 + %8269 = fmul float %8237, 0x3FF7154760000000, !dbg !302 + %8270 = fmul float %8238, 0x3FF7154760000000, !dbg !302 + %8271 = fmul float %8239, 0x3FF7154760000000, !dbg !302 + %8272 = fmul float %8240, 0x3FF7154760000000, !dbg !302 + %8273 = fmul float %8241, 0x3FF7154760000000, !dbg !302 + %8274 = fmul float %8242, 0x3FF7154760000000, !dbg !302 + %8275 = fmul float %8243, 0x3FF7154760000000, !dbg !302 + %8276 = fmul float %8244, 0x3FF7154760000000, !dbg !302 + %8277 = fmul float %8245, 0x3FF7154760000000, !dbg !302 + %8278 = fmul float %8246, 0x3FF7154760000000, !dbg !302 + %8279 = fmul float %8247, 0x3FF7154760000000, !dbg !302 + %8280 = fsub float %8248, %7813, !dbg !303 + %8281 = fsub float %8249, %7814, !dbg !303 + %8282 = fsub float %8250, %7813, !dbg !303 + %8283 = fsub float %8251, %7814, !dbg !303 + %8284 = fsub float %8252, %7815, !dbg !303 + %8285 = fsub float %8253, %7816, !dbg !303 + %8286 = fsub float %8254, %7815, !dbg !303 + %8287 = fsub float %8255, %7816, !dbg !303 + %8288 = fsub float %8256, %7817, !dbg !303 + %8289 = fsub float %8257, %7818, !dbg !303 + %8290 = fsub float %8258, %7817, !dbg !303 + %8291 = fsub float %8259, %7818, !dbg !303 + %8292 = fsub float %8260, %7819, !dbg !303 + %8293 = fsub float %8261, %7820, !dbg !303 + %8294 = fsub float %8262, %7819, !dbg !303 + %8295 = fsub float %8263, %7820, !dbg !303 + %8296 = fsub float %8264, %7821, !dbg !303 + %8297 = fsub float %8265, %7822, !dbg !303 + %8298 = fsub float %8266, %7821, !dbg !303 + %8299 = fsub float %8267, %7822, !dbg !303 + %8300 = fsub float %8268, %7823, !dbg !303 + %8301 = fsub float %8269, %7824, !dbg !303 + %8302 = fsub float %8270, %7823, !dbg !303 + %8303 = fsub float %8271, %7824, !dbg !303 + %8304 = fsub float %8272, %7825, !dbg !303 + %8305 = fsub float %8273, %7826, !dbg !303 + %8306 = fsub float %8274, %7825, !dbg !303 + %8307 = fsub float %8275, %7826, !dbg !303 + %8308 = fsub float %8276, %7827, !dbg !303 + %8309 = fsub float %8277, %7828, !dbg !303 + %8310 = fsub float %8278, %7827, !dbg !303 + %8311 = fsub float %8279, %7828, !dbg !303 + %8312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i = icmp eq i32 %8312, 0, !dbg !304 + br i1 %.not.i, label %8315, label %8313, !dbg !304 + +8313: ; preds = %.lr.ph1691 + %8314 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8280) #3, !dbg !304 + br label %__nv_exp2f.exit, !dbg !304 + +8315: ; preds = %.lr.ph1691 + %8316 = tail call float @llvm.nvvm.ex2.approx.f(float %8280) #3, !dbg !304 + br label %__nv_exp2f.exit, !dbg !304 + +__nv_exp2f.exit: ; preds = %8313, %8315 + %.0.i = phi float [ %8314, %8313 ], [ %8316, %8315 ], !dbg !304 + %8317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1127 = icmp eq i32 %8317, 0, !dbg !304 + br i1 %.not.i1127, label %8320, label %8318, !dbg !304 + +8318: ; preds = %__nv_exp2f.exit + %8319 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8281) #3, !dbg !304 + br label %__nv_exp2f.exit1129, !dbg !304 + +8320: ; preds = %__nv_exp2f.exit + %8321 = tail call float @llvm.nvvm.ex2.approx.f(float %8281) #3, !dbg !304 + br label %__nv_exp2f.exit1129, !dbg !304 + +__nv_exp2f.exit1129: ; preds = %8318, %8320 + %.0.i1128 = phi float [ %8319, %8318 ], [ %8321, %8320 ], !dbg !304 + %8322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1130 = icmp eq i32 %8322, 0, !dbg !304 + br i1 %.not.i1130, label %8325, label %8323, !dbg !304 + +8323: ; preds = %__nv_exp2f.exit1129 + %8324 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8282) #3, !dbg !304 + br label %__nv_exp2f.exit1132, !dbg !304 + +8325: ; preds = %__nv_exp2f.exit1129 + %8326 = tail call float @llvm.nvvm.ex2.approx.f(float %8282) #3, !dbg !304 + br label %__nv_exp2f.exit1132, !dbg !304 + +__nv_exp2f.exit1132: ; preds = %8323, %8325 + %.0.i1131 = phi float [ %8324, %8323 ], [ %8326, %8325 ], !dbg !304 + %8327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1133 = icmp eq i32 %8327, 0, !dbg !304 + br i1 %.not.i1133, label %8330, label %8328, !dbg !304 + +8328: ; preds = %__nv_exp2f.exit1132 + %8329 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8283) #3, !dbg !304 + br label %__nv_exp2f.exit1135, !dbg !304 + +8330: ; preds = %__nv_exp2f.exit1132 + %8331 = tail call float @llvm.nvvm.ex2.approx.f(float %8283) #3, !dbg !304 + br label %__nv_exp2f.exit1135, !dbg !304 + +__nv_exp2f.exit1135: ; preds = %8328, %8330 + %.0.i1134 = phi float [ %8329, %8328 ], [ %8331, %8330 ], !dbg !304 + %8332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1136 = icmp eq i32 %8332, 0, !dbg !304 + br i1 %.not.i1136, label %8335, label %8333, !dbg !304 + +8333: ; preds = %__nv_exp2f.exit1135 + %8334 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8284) #3, !dbg !304 + br label %__nv_exp2f.exit1138, !dbg !304 + +8335: ; preds = %__nv_exp2f.exit1135 + %8336 = tail call float @llvm.nvvm.ex2.approx.f(float %8284) #3, !dbg !304 + br label %__nv_exp2f.exit1138, !dbg !304 + +__nv_exp2f.exit1138: ; preds = %8333, %8335 + %.0.i1137 = phi float [ %8334, %8333 ], [ %8336, %8335 ], !dbg !304 + %8337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1139 = icmp eq i32 %8337, 0, !dbg !304 + br i1 %.not.i1139, label %8340, label %8338, !dbg !304 + +8338: ; preds = %__nv_exp2f.exit1138 + %8339 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8285) #3, !dbg !304 + br label %__nv_exp2f.exit1141, !dbg !304 + +8340: ; preds = %__nv_exp2f.exit1138 + %8341 = tail call float @llvm.nvvm.ex2.approx.f(float %8285) #3, !dbg !304 + br label %__nv_exp2f.exit1141, !dbg !304 + +__nv_exp2f.exit1141: ; preds = %8338, %8340 + %.0.i1140 = phi float [ %8339, %8338 ], [ %8341, %8340 ], !dbg !304 + %8342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1142 = icmp eq i32 %8342, 0, !dbg !304 + br i1 %.not.i1142, label %8345, label %8343, !dbg !304 + +8343: ; preds = %__nv_exp2f.exit1141 + %8344 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8286) #3, !dbg !304 + br label %__nv_exp2f.exit1144, !dbg !304 + +8345: ; preds = %__nv_exp2f.exit1141 + %8346 = tail call float @llvm.nvvm.ex2.approx.f(float %8286) #3, !dbg !304 + br label %__nv_exp2f.exit1144, !dbg !304 + +__nv_exp2f.exit1144: ; preds = %8343, %8345 + %.0.i1143 = phi float [ %8344, %8343 ], [ %8346, %8345 ], !dbg !304 + %8347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1145 = icmp eq i32 %8347, 0, !dbg !304 + br i1 %.not.i1145, label %8350, label %8348, !dbg !304 + +8348: ; preds = %__nv_exp2f.exit1144 + %8349 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8287) #3, !dbg !304 + br label %__nv_exp2f.exit1147, !dbg !304 + +8350: ; preds = %__nv_exp2f.exit1144 + %8351 = tail call float @llvm.nvvm.ex2.approx.f(float %8287) #3, !dbg !304 + br label %__nv_exp2f.exit1147, !dbg !304 + +__nv_exp2f.exit1147: ; preds = %8348, %8350 + %.0.i1146 = phi float [ %8349, %8348 ], [ %8351, %8350 ], !dbg !304 + %8352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1148 = icmp eq i32 %8352, 0, !dbg !304 + br i1 %.not.i1148, label %8355, label %8353, !dbg !304 + +8353: ; preds = %__nv_exp2f.exit1147 + %8354 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8288) #3, !dbg !304 + br label %__nv_exp2f.exit1150, !dbg !304 + +8355: ; preds = %__nv_exp2f.exit1147 + %8356 = tail call float @llvm.nvvm.ex2.approx.f(float %8288) #3, !dbg !304 + br label %__nv_exp2f.exit1150, !dbg !304 + +__nv_exp2f.exit1150: ; preds = %8353, %8355 + %.0.i1149 = phi float [ %8354, %8353 ], [ %8356, %8355 ], !dbg !304 + %8357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1151 = icmp eq i32 %8357, 0, !dbg !304 + br i1 %.not.i1151, label %8360, label %8358, !dbg !304 + +8358: ; preds = %__nv_exp2f.exit1150 + %8359 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8289) #3, !dbg !304 + br label %__nv_exp2f.exit1153, !dbg !304 + +8360: ; preds = %__nv_exp2f.exit1150 + %8361 = tail call float @llvm.nvvm.ex2.approx.f(float %8289) #3, !dbg !304 + br label %__nv_exp2f.exit1153, !dbg !304 + +__nv_exp2f.exit1153: ; preds = %8358, %8360 + %.0.i1152 = phi float [ %8359, %8358 ], [ %8361, %8360 ], !dbg !304 + %8362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1154 = icmp eq i32 %8362, 0, !dbg !304 + br i1 %.not.i1154, label %8365, label %8363, !dbg !304 + +8363: ; preds = %__nv_exp2f.exit1153 + %8364 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8290) #3, !dbg !304 + br label %__nv_exp2f.exit1156, !dbg !304 + +8365: ; preds = %__nv_exp2f.exit1153 + %8366 = tail call float @llvm.nvvm.ex2.approx.f(float %8290) #3, !dbg !304 + br label %__nv_exp2f.exit1156, !dbg !304 + +__nv_exp2f.exit1156: ; preds = %8363, %8365 + %.0.i1155 = phi float [ %8364, %8363 ], [ %8366, %8365 ], !dbg !304 + %8367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1157 = icmp eq i32 %8367, 0, !dbg !304 + br i1 %.not.i1157, label %8370, label %8368, !dbg !304 + +8368: ; preds = %__nv_exp2f.exit1156 + %8369 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8291) #3, !dbg !304 + br label %__nv_exp2f.exit1159, !dbg !304 + +8370: ; preds = %__nv_exp2f.exit1156 + %8371 = tail call float @llvm.nvvm.ex2.approx.f(float %8291) #3, !dbg !304 + br label %__nv_exp2f.exit1159, !dbg !304 + +__nv_exp2f.exit1159: ; preds = %8368, %8370 + %.0.i1158 = phi float [ %8369, %8368 ], [ %8371, %8370 ], !dbg !304 + %8372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1160 = icmp eq i32 %8372, 0, !dbg !304 + br i1 %.not.i1160, label %8375, label %8373, !dbg !304 + +8373: ; preds = %__nv_exp2f.exit1159 + %8374 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8292) #3, !dbg !304 + br label %__nv_exp2f.exit1162, !dbg !304 + +8375: ; preds = %__nv_exp2f.exit1159 + %8376 = tail call float @llvm.nvvm.ex2.approx.f(float %8292) #3, !dbg !304 + br label %__nv_exp2f.exit1162, !dbg !304 + +__nv_exp2f.exit1162: ; preds = %8373, %8375 + %.0.i1161 = phi float [ %8374, %8373 ], [ %8376, %8375 ], !dbg !304 + %8377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1163 = icmp eq i32 %8377, 0, !dbg !304 + br i1 %.not.i1163, label %8380, label %8378, !dbg !304 + +8378: ; preds = %__nv_exp2f.exit1162 + %8379 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8293) #3, !dbg !304 + br label %__nv_exp2f.exit1165, !dbg !304 + +8380: ; preds = %__nv_exp2f.exit1162 + %8381 = tail call float @llvm.nvvm.ex2.approx.f(float %8293) #3, !dbg !304 + br label %__nv_exp2f.exit1165, !dbg !304 + +__nv_exp2f.exit1165: ; preds = %8378, %8380 + %.0.i1164 = phi float [ %8379, %8378 ], [ %8381, %8380 ], !dbg !304 + %8382 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1166 = icmp eq i32 %8382, 0, !dbg !304 + br i1 %.not.i1166, label %8385, label %8383, !dbg !304 + +8383: ; preds = %__nv_exp2f.exit1165 + %8384 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8294) #3, !dbg !304 + br label %__nv_exp2f.exit1168, !dbg !304 + +8385: ; preds = %__nv_exp2f.exit1165 + %8386 = tail call float @llvm.nvvm.ex2.approx.f(float %8294) #3, !dbg !304 + br label %__nv_exp2f.exit1168, !dbg !304 + +__nv_exp2f.exit1168: ; preds = %8383, %8385 + %.0.i1167 = phi float [ %8384, %8383 ], [ %8386, %8385 ], !dbg !304 + %8387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1169 = icmp eq i32 %8387, 0, !dbg !304 + br i1 %.not.i1169, label %8390, label %8388, !dbg !304 + +8388: ; preds = %__nv_exp2f.exit1168 + %8389 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8295) #3, !dbg !304 + br label %__nv_exp2f.exit1171, !dbg !304 + +8390: ; preds = %__nv_exp2f.exit1168 + %8391 = tail call float @llvm.nvvm.ex2.approx.f(float %8295) #3, !dbg !304 + br label %__nv_exp2f.exit1171, !dbg !304 + +__nv_exp2f.exit1171: ; preds = %8388, %8390 + %.0.i1170 = phi float [ %8389, %8388 ], [ %8391, %8390 ], !dbg !304 + %8392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1172 = icmp eq i32 %8392, 0, !dbg !304 + br i1 %.not.i1172, label %8395, label %8393, !dbg !304 + +8393: ; preds = %__nv_exp2f.exit1171 + %8394 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8296) #3, !dbg !304 + br label %__nv_exp2f.exit1174, !dbg !304 + +8395: ; preds = %__nv_exp2f.exit1171 + %8396 = tail call float @llvm.nvvm.ex2.approx.f(float %8296) #3, !dbg !304 + br label %__nv_exp2f.exit1174, !dbg !304 + +__nv_exp2f.exit1174: ; preds = %8393, %8395 + %.0.i1173 = phi float [ %8394, %8393 ], [ %8396, %8395 ], !dbg !304 + %8397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1175 = icmp eq i32 %8397, 0, !dbg !304 + br i1 %.not.i1175, label %8400, label %8398, !dbg !304 + +8398: ; preds = %__nv_exp2f.exit1174 + %8399 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8297) #3, !dbg !304 + br label %__nv_exp2f.exit1177, !dbg !304 + +8400: ; preds = %__nv_exp2f.exit1174 + %8401 = tail call float @llvm.nvvm.ex2.approx.f(float %8297) #3, !dbg !304 + br label %__nv_exp2f.exit1177, !dbg !304 + +__nv_exp2f.exit1177: ; preds = %8398, %8400 + %.0.i1176 = phi float [ %8399, %8398 ], [ %8401, %8400 ], !dbg !304 + %8402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1178 = icmp eq i32 %8402, 0, !dbg !304 + br i1 %.not.i1178, label %8405, label %8403, !dbg !304 + +8403: ; preds = %__nv_exp2f.exit1177 + %8404 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8298) #3, !dbg !304 + br label %__nv_exp2f.exit1180, !dbg !304 + +8405: ; preds = %__nv_exp2f.exit1177 + %8406 = tail call float @llvm.nvvm.ex2.approx.f(float %8298) #3, !dbg !304 + br label %__nv_exp2f.exit1180, !dbg !304 + +__nv_exp2f.exit1180: ; preds = %8403, %8405 + %.0.i1179 = phi float [ %8404, %8403 ], [ %8406, %8405 ], !dbg !304 + %8407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1181 = icmp eq i32 %8407, 0, !dbg !304 + br i1 %.not.i1181, label %8410, label %8408, !dbg !304 + +8408: ; preds = %__nv_exp2f.exit1180 + %8409 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8299) #3, !dbg !304 + br label %__nv_exp2f.exit1183, !dbg !304 + +8410: ; preds = %__nv_exp2f.exit1180 + %8411 = tail call float @llvm.nvvm.ex2.approx.f(float %8299) #3, !dbg !304 + br label %__nv_exp2f.exit1183, !dbg !304 + +__nv_exp2f.exit1183: ; preds = %8408, %8410 + %.0.i1182 = phi float [ %8409, %8408 ], [ %8411, %8410 ], !dbg !304 + %8412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1184 = icmp eq i32 %8412, 0, !dbg !304 + br i1 %.not.i1184, label %8415, label %8413, !dbg !304 + +8413: ; preds = %__nv_exp2f.exit1183 + %8414 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8300) #3, !dbg !304 + br label %__nv_exp2f.exit1186, !dbg !304 + +8415: ; preds = %__nv_exp2f.exit1183 + %8416 = tail call float @llvm.nvvm.ex2.approx.f(float %8300) #3, !dbg !304 + br label %__nv_exp2f.exit1186, !dbg !304 + +__nv_exp2f.exit1186: ; preds = %8413, %8415 + %.0.i1185 = phi float [ %8414, %8413 ], [ %8416, %8415 ], !dbg !304 + %8417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1187 = icmp eq i32 %8417, 0, !dbg !304 + br i1 %.not.i1187, label %8420, label %8418, !dbg !304 + +8418: ; preds = %__nv_exp2f.exit1186 + %8419 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8301) #3, !dbg !304 + br label %__nv_exp2f.exit1189, !dbg !304 + +8420: ; preds = %__nv_exp2f.exit1186 + %8421 = tail call float @llvm.nvvm.ex2.approx.f(float %8301) #3, !dbg !304 + br label %__nv_exp2f.exit1189, !dbg !304 + +__nv_exp2f.exit1189: ; preds = %8418, %8420 + %.0.i1188 = phi float [ %8419, %8418 ], [ %8421, %8420 ], !dbg !304 + %8422 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1190 = icmp eq i32 %8422, 0, !dbg !304 + br i1 %.not.i1190, label %8425, label %8423, !dbg !304 + +8423: ; preds = %__nv_exp2f.exit1189 + %8424 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8302) #3, !dbg !304 + br label %__nv_exp2f.exit1192, !dbg !304 + +8425: ; preds = %__nv_exp2f.exit1189 + %8426 = tail call float @llvm.nvvm.ex2.approx.f(float %8302) #3, !dbg !304 + br label %__nv_exp2f.exit1192, !dbg !304 + +__nv_exp2f.exit1192: ; preds = %8423, %8425 + %.0.i1191 = phi float [ %8424, %8423 ], [ %8426, %8425 ], !dbg !304 + %8427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1193 = icmp eq i32 %8427, 0, !dbg !304 + br i1 %.not.i1193, label %8430, label %8428, !dbg !304 + +8428: ; preds = %__nv_exp2f.exit1192 + %8429 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8303) #3, !dbg !304 + br label %__nv_exp2f.exit1195, !dbg !304 + +8430: ; preds = %__nv_exp2f.exit1192 + %8431 = tail call float @llvm.nvvm.ex2.approx.f(float %8303) #3, !dbg !304 + br label %__nv_exp2f.exit1195, !dbg !304 + +__nv_exp2f.exit1195: ; preds = %8428, %8430 + %.0.i1194 = phi float [ %8429, %8428 ], [ %8431, %8430 ], !dbg !304 + %8432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1196 = icmp eq i32 %8432, 0, !dbg !304 + br i1 %.not.i1196, label %8435, label %8433, !dbg !304 + +8433: ; preds = %__nv_exp2f.exit1195 + %8434 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8304) #3, !dbg !304 + br label %__nv_exp2f.exit1198, !dbg !304 + +8435: ; preds = %__nv_exp2f.exit1195 + %8436 = tail call float @llvm.nvvm.ex2.approx.f(float %8304) #3, !dbg !304 + br label %__nv_exp2f.exit1198, !dbg !304 + +__nv_exp2f.exit1198: ; preds = %8433, %8435 + %.0.i1197 = phi float [ %8434, %8433 ], [ %8436, %8435 ], !dbg !304 + %8437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1199 = icmp eq i32 %8437, 0, !dbg !304 + br i1 %.not.i1199, label %8440, label %8438, !dbg !304 + +8438: ; preds = %__nv_exp2f.exit1198 + %8439 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8305) #3, !dbg !304 + br label %__nv_exp2f.exit1201, !dbg !304 + +8440: ; preds = %__nv_exp2f.exit1198 + %8441 = tail call float @llvm.nvvm.ex2.approx.f(float %8305) #3, !dbg !304 + br label %__nv_exp2f.exit1201, !dbg !304 + +__nv_exp2f.exit1201: ; preds = %8438, %8440 + %.0.i1200 = phi float [ %8439, %8438 ], [ %8441, %8440 ], !dbg !304 + %8442 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1202 = icmp eq i32 %8442, 0, !dbg !304 + br i1 %.not.i1202, label %8445, label %8443, !dbg !304 + +8443: ; preds = %__nv_exp2f.exit1201 + %8444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8306) #3, !dbg !304 + br label %__nv_exp2f.exit1204, !dbg !304 + +8445: ; preds = %__nv_exp2f.exit1201 + %8446 = tail call float @llvm.nvvm.ex2.approx.f(float %8306) #3, !dbg !304 + br label %__nv_exp2f.exit1204, !dbg !304 + +__nv_exp2f.exit1204: ; preds = %8443, %8445 + %.0.i1203 = phi float [ %8444, %8443 ], [ %8446, %8445 ], !dbg !304 + %8447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1205 = icmp eq i32 %8447, 0, !dbg !304 + br i1 %.not.i1205, label %8450, label %8448, !dbg !304 + +8448: ; preds = %__nv_exp2f.exit1204 + %8449 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8307) #3, !dbg !304 + br label %__nv_exp2f.exit1207, !dbg !304 + +8450: ; preds = %__nv_exp2f.exit1204 + %8451 = tail call float @llvm.nvvm.ex2.approx.f(float %8307) #3, !dbg !304 + br label %__nv_exp2f.exit1207, !dbg !304 + +__nv_exp2f.exit1207: ; preds = %8448, %8450 + %.0.i1206 = phi float [ %8449, %8448 ], [ %8451, %8450 ], !dbg !304 + %8452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1208 = icmp eq i32 %8452, 0, !dbg !304 + br i1 %.not.i1208, label %8455, label %8453, !dbg !304 + +8453: ; preds = %__nv_exp2f.exit1207 + %8454 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8308) #3, !dbg !304 + br label %__nv_exp2f.exit1210, !dbg !304 + +8455: ; preds = %__nv_exp2f.exit1207 + %8456 = tail call float @llvm.nvvm.ex2.approx.f(float %8308) #3, !dbg !304 + br label %__nv_exp2f.exit1210, !dbg !304 + +__nv_exp2f.exit1210: ; preds = %8453, %8455 + %.0.i1209 = phi float [ %8454, %8453 ], [ %8456, %8455 ], !dbg !304 + %8457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1211 = icmp eq i32 %8457, 0, !dbg !304 + br i1 %.not.i1211, label %8460, label %8458, !dbg !304 + +8458: ; preds = %__nv_exp2f.exit1210 + %8459 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8309) #3, !dbg !304 + br label %__nv_exp2f.exit1213, !dbg !304 + +8460: ; preds = %__nv_exp2f.exit1210 + %8461 = tail call float @llvm.nvvm.ex2.approx.f(float %8309) #3, !dbg !304 + br label %__nv_exp2f.exit1213, !dbg !304 + +__nv_exp2f.exit1213: ; preds = %8458, %8460 + %.0.i1212 = phi float [ %8459, %8458 ], [ %8461, %8460 ], !dbg !304 + %8462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1214 = icmp eq i32 %8462, 0, !dbg !304 + br i1 %.not.i1214, label %8465, label %8463, !dbg !304 + +8463: ; preds = %__nv_exp2f.exit1213 + %8464 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8310) #3, !dbg !304 + br label %__nv_exp2f.exit1216, !dbg !304 + +8465: ; preds = %__nv_exp2f.exit1213 + %8466 = tail call float @llvm.nvvm.ex2.approx.f(float %8310) #3, !dbg !304 + br label %__nv_exp2f.exit1216, !dbg !304 + +__nv_exp2f.exit1216: ; preds = %8463, %8465 + %.0.i1215 = phi float [ %8464, %8463 ], [ %8466, %8465 ], !dbg !304 + %8467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !304 + %.not.i1217 = icmp eq i32 %8467, 0, !dbg !304 + br i1 %.not.i1217, label %8470, label %8468, !dbg !304 + +8468: ; preds = %__nv_exp2f.exit1216 + %8469 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8311) #3, !dbg !304 + br label %__nv_exp2f.exit1219, !dbg !304 + +8470: ; preds = %__nv_exp2f.exit1216 + %8471 = tail call float @llvm.nvvm.ex2.approx.f(float %8311) #3, !dbg !304 + br label %__nv_exp2f.exit1219, !dbg !304 + +__nv_exp2f.exit1219: ; preds = %8468, %8470 + %.0.i1218 = phi float [ %8469, %8468 ], [ %8471, %8470 ], !dbg !304 + %8472 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %7761, !dbg !290 + %8473 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !305 + %8474 = insertelement <2 x float> %8473, float %.0.i1128, i64 1, !dbg !305 + %8475 = fptrunc <2 x float> %8474 to <2 x bfloat>, !dbg !305 + %8476 = insertelement <2 x float> poison, float %.0.i1131, i64 0, !dbg !305 + %8477 = insertelement <2 x float> %8476, float %.0.i1134, i64 1, !dbg !305 + %8478 = fptrunc <2 x float> %8477 to <2 x bfloat>, !dbg !305 + %8479 = insertelement <2 x float> poison, float %.0.i1137, i64 0, !dbg !305 + %8480 = insertelement <2 x float> %8479, float %.0.i1140, i64 1, !dbg !305 + %8481 = fptrunc <2 x float> %8480 to <2 x bfloat>, !dbg !305 + %8482 = insertelement <2 x float> poison, float %.0.i1143, i64 0, !dbg !305 + %8483 = insertelement <2 x float> %8482, float %.0.i1146, i64 1, !dbg !305 + %8484 = fptrunc <2 x float> %8483 to <2 x bfloat>, !dbg !305 + %8485 = insertelement <2 x float> poison, float %.0.i1149, i64 0, !dbg !305 + %8486 = insertelement <2 x float> %8485, float %.0.i1152, i64 1, !dbg !305 + %8487 = fptrunc <2 x float> %8486 to <2 x bfloat>, !dbg !305 + %8488 = insertelement <2 x float> poison, float %.0.i1155, i64 0, !dbg !305 + %8489 = insertelement <2 x float> %8488, float %.0.i1158, i64 1, !dbg !305 + %8490 = fptrunc <2 x float> %8489 to <2 x bfloat>, !dbg !305 + %8491 = insertelement <2 x float> poison, float %.0.i1161, i64 0, !dbg !305 + %8492 = insertelement <2 x float> %8491, float %.0.i1164, i64 1, !dbg !305 + %8493 = fptrunc <2 x float> %8492 to <2 x bfloat>, !dbg !305 + %8494 = insertelement <2 x float> poison, float %.0.i1167, i64 0, !dbg !305 + %8495 = insertelement <2 x float> %8494, float %.0.i1170, i64 1, !dbg !305 + %8496 = fptrunc <2 x float> %8495 to <2 x bfloat>, !dbg !305 + %8497 = insertelement <2 x float> poison, float %.0.i1173, i64 0, !dbg !305 + %8498 = insertelement <2 x float> %8497, float %.0.i1176, i64 1, !dbg !305 + %8499 = fptrunc <2 x float> %8498 to <2 x bfloat>, !dbg !305 + %8500 = insertelement <2 x float> poison, float %.0.i1179, i64 0, !dbg !305 + %8501 = insertelement <2 x float> %8500, float %.0.i1182, i64 1, !dbg !305 + %8502 = fptrunc <2 x float> %8501 to <2 x bfloat>, !dbg !305 + %8503 = insertelement <2 x float> poison, float %.0.i1185, i64 0, !dbg !305 + %8504 = insertelement <2 x float> %8503, float %.0.i1188, i64 1, !dbg !305 + %8505 = fptrunc <2 x float> %8504 to <2 x bfloat>, !dbg !305 + %8506 = insertelement <2 x float> poison, float %.0.i1191, i64 0, !dbg !305 + %8507 = insertelement <2 x float> %8506, float %.0.i1194, i64 1, !dbg !305 + %8508 = fptrunc <2 x float> %8507 to <2 x bfloat>, !dbg !305 + %8509 = insertelement <2 x float> poison, float %.0.i1197, i64 0, !dbg !305 + %8510 = insertelement <2 x float> %8509, float %.0.i1200, i64 1, !dbg !305 + %8511 = fptrunc <2 x float> %8510 to <2 x bfloat>, !dbg !305 + %8512 = insertelement <2 x float> poison, float %.0.i1203, i64 0, !dbg !305 + %8513 = insertelement <2 x float> %8512, float %.0.i1206, i64 1, !dbg !305 + %8514 = fptrunc <2 x float> %8513 to <2 x bfloat>, !dbg !305 + %8515 = insertelement <2 x float> poison, float %.0.i1209, i64 0, !dbg !305 + %8516 = insertelement <2 x float> %8515, float %.0.i1212, i64 1, !dbg !305 + %8517 = fptrunc <2 x float> %8516 to <2 x bfloat>, !dbg !305 + %8518 = insertelement <2 x float> poison, float %.0.i1215, i64 0, !dbg !305 + %8519 = insertelement <2 x float> %8518, float %.0.i1218, i64 1, !dbg !305 + %8520 = fptrunc <2 x float> %8519 to <2 x bfloat>, !dbg !305 + %8521 = bitcast <2 x bfloat> %8475 to i32, !dbg !306 + %8522 = bitcast <2 x bfloat> %8478 to i32, !dbg !306 + %8523 = bitcast <2 x bfloat> %8481 to i32, !dbg !306 + %8524 = bitcast <2 x bfloat> %8484 to i32, !dbg !306 + %8525 = bitcast <2 x bfloat> %8487 to i32, !dbg !306 + %8526 = bitcast <2 x bfloat> %8490 to i32, !dbg !306 + %8527 = bitcast <2 x bfloat> %8493 to i32, !dbg !306 + %8528 = bitcast <2 x bfloat> %8496 to i32, !dbg !306 + %8529 = bitcast <2 x bfloat> %8499 to i32, !dbg !306 + %8530 = bitcast <2 x bfloat> %8502 to i32, !dbg !306 + %8531 = bitcast <2 x bfloat> %8505 to i32, !dbg !306 + %8532 = bitcast <2 x bfloat> %8508 to i32, !dbg !306 + %8533 = bitcast <2 x bfloat> %8511 to i32, !dbg !306 + %8534 = bitcast <2 x bfloat> %8514 to i32, !dbg !306 + %8535 = bitcast <2 x bfloat> %8517 to i32, !dbg !306 + %8536 = bitcast <2 x bfloat> %8520 to i32, !dbg !306 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !306 + %8537 = ptrtoint ptr addrspace(3) %8472 to i32, !dbg !306 + %8538 = lshr exact i32 %8537, 4, !dbg !306 + %8539 = and i32 %8538, 16383, !dbg !306 + %8540 = zext nneg i32 %8539 to i64, !dbg !306 + %8541 = or disjoint i64 %8540, 4611686293338849280, !dbg !306 + %8542 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn5041610, float %.pn5021611, float %.pn5001612, float %.pn4981613, float %.pn4961614, float %.pn4941615, float %.pn4921616, float %.pn4901617, float %.pn4881618, float %.pn4861619, float %.pn4841620, float %.pn4821621, float %.pn4801622, float %.pn4781623, float %.pn4761624, float %.pn4741625, float %.pn4721626, float %.pn4701627, float %.pn4681628, float %.pn4661629, float %.pn4641630, float %.pn4621631, float %.pn4601632, float %.pn4581633, float %.pn4561634, float %.pn4541635, float %.pn4521636, float %.pn4501637, float %.pn4481638, float %.pn4461639, float %.pn4441640, float %.pn4421641, float %.pn4401642, float %.pn4381643, float %.pn4361644, float %.pn4341645, float %.pn4321646, float %.pn4301647, float %.pn4281648, float %.pn4261649, float %.pn4241650, float %.pn4221651, float %.pn4201652, float %.pn4181653, float %.pn4161654, float %.pn4141655, float %.pn4121656, float %.pn4101657, float %.pn4081658, float %.pn4061659, float %.pn4041660, float %.pn4021661, float %.pn4001662, float %.pn3981663, float %.pn3961664, float %.pn3941665, float %.pn3921666, float %.pn3901667, float %.pn3881668, float %.pn3861669, float %.pn3841670, float %.pn3821671, float %.pn3801672, float %.pn3781673, i32 %8521, i32 %8522, i32 %8523, i32 %8524, i64 %8541, i1 true) #3, !dbg !306 + %8543 = add i32 %8537, 2048, !dbg !306 + %8544 = lshr exact i32 %8543, 4, !dbg !306 + %8545 = and i32 %8544, 16383, !dbg !306 + %8546 = zext nneg i32 %8545 to i64, !dbg !306 + %8547 = or disjoint i64 %8546, 4611686293338849280, !dbg !306 + %8548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 0, !dbg !306 + %8549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 1, !dbg !306 + %8550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 2, !dbg !306 + %8551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 3, !dbg !306 + %8552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 4, !dbg !306 + %8553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 5, !dbg !306 + %8554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 6, !dbg !306 + %8555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 7, !dbg !306 + %8556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 8, !dbg !306 + %8557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 9, !dbg !306 + %8558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 10, !dbg !306 + %8559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 11, !dbg !306 + %8560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 12, !dbg !306 + %8561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 13, !dbg !306 + %8562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 14, !dbg !306 + %8563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 15, !dbg !306 + %8564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 16, !dbg !306 + %8565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 17, !dbg !306 + %8566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 18, !dbg !306 + %8567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 19, !dbg !306 + %8568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 20, !dbg !306 + %8569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 21, !dbg !306 + %8570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 22, !dbg !306 + %8571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 23, !dbg !306 + %8572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 24, !dbg !306 + %8573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 25, !dbg !306 + %8574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 26, !dbg !306 + %8575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 27, !dbg !306 + %8576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 28, !dbg !306 + %8577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 29, !dbg !306 + %8578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 30, !dbg !306 + %8579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 31, !dbg !306 + %8580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 32, !dbg !306 + %8581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 33, !dbg !306 + %8582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 34, !dbg !306 + %8583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 35, !dbg !306 + %8584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 36, !dbg !306 + %8585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 37, !dbg !306 + %8586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 38, !dbg !306 + %8587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 39, !dbg !306 + %8588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 40, !dbg !306 + %8589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 41, !dbg !306 + %8590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 42, !dbg !306 + %8591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 43, !dbg !306 + %8592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 44, !dbg !306 + %8593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 45, !dbg !306 + %8594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 46, !dbg !306 + %8595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 47, !dbg !306 + %8596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 48, !dbg !306 + %8597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 49, !dbg !306 + %8598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 50, !dbg !306 + %8599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 51, !dbg !306 + %8600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 52, !dbg !306 + %8601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 53, !dbg !306 + %8602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 54, !dbg !306 + %8603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 55, !dbg !306 + %8604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 56, !dbg !306 + %8605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 57, !dbg !306 + %8606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 58, !dbg !306 + %8607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 59, !dbg !306 + %8608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 60, !dbg !306 + %8609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 61, !dbg !306 + %8610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 62, !dbg !306 + %8611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8542, 63, !dbg !306 + %8612 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8548, float %8549, float %8550, float %8551, float %8552, float %8553, float %8554, float %8555, float %8556, float %8557, float %8558, float %8559, float %8560, float %8561, float %8562, float %8563, float %8564, float %8565, float %8566, float %8567, float %8568, float %8569, float %8570, float %8571, float %8572, float %8573, float %8574, float %8575, float %8576, float %8577, float %8578, float %8579, float %8580, float %8581, float %8582, float %8583, float %8584, float %8585, float %8586, float %8587, float %8588, float %8589, float %8590, float %8591, float %8592, float %8593, float %8594, float %8595, float %8596, float %8597, float %8598, float %8599, float %8600, float %8601, float %8602, float %8603, float %8604, float %8605, float %8606, float %8607, float %8608, float %8609, float %8610, float %8611, i32 %8525, i32 %8526, i32 %8527, i32 %8528, i64 %8547, i1 true) #3, !dbg !306 + %8613 = add i32 %8537, 4096, !dbg !306 + %8614 = lshr exact i32 %8613, 4, !dbg !306 + %8615 = and i32 %8614, 16383, !dbg !306 + %8616 = zext nneg i32 %8615 to i64, !dbg !306 + %8617 = or disjoint i64 %8616, 4611686293338849280, !dbg !306 + %8618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 0, !dbg !306 + %8619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 1, !dbg !306 + %8620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 2, !dbg !306 + %8621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 3, !dbg !306 + %8622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 4, !dbg !306 + %8623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 5, !dbg !306 + %8624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 6, !dbg !306 + %8625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 7, !dbg !306 + %8626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 8, !dbg !306 + %8627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 9, !dbg !306 + %8628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 10, !dbg !306 + %8629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 11, !dbg !306 + %8630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 12, !dbg !306 + %8631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 13, !dbg !306 + %8632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 14, !dbg !306 + %8633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 15, !dbg !306 + %8634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 16, !dbg !306 + %8635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 17, !dbg !306 + %8636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 18, !dbg !306 + %8637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 19, !dbg !306 + %8638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 20, !dbg !306 + %8639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 21, !dbg !306 + %8640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 22, !dbg !306 + %8641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 23, !dbg !306 + %8642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 24, !dbg !306 + %8643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 25, !dbg !306 + %8644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 26, !dbg !306 + %8645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 27, !dbg !306 + %8646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 28, !dbg !306 + %8647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 29, !dbg !306 + %8648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 30, !dbg !306 + %8649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 31, !dbg !306 + %8650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 32, !dbg !306 + %8651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 33, !dbg !306 + %8652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 34, !dbg !306 + %8653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 35, !dbg !306 + %8654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 36, !dbg !306 + %8655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 37, !dbg !306 + %8656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 38, !dbg !306 + %8657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 39, !dbg !306 + %8658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 40, !dbg !306 + %8659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 41, !dbg !306 + %8660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 42, !dbg !306 + %8661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 43, !dbg !306 + %8662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 44, !dbg !306 + %8663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 45, !dbg !306 + %8664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 46, !dbg !306 + %8665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 47, !dbg !306 + %8666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 48, !dbg !306 + %8667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 49, !dbg !306 + %8668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 50, !dbg !306 + %8669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 51, !dbg !306 + %8670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 52, !dbg !306 + %8671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 53, !dbg !306 + %8672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 54, !dbg !306 + %8673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 55, !dbg !306 + %8674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 56, !dbg !306 + %8675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 57, !dbg !306 + %8676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 58, !dbg !306 + %8677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 59, !dbg !306 + %8678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 60, !dbg !306 + %8679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 61, !dbg !306 + %8680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 62, !dbg !306 + %8681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8612, 63, !dbg !306 + %8682 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8618, float %8619, float %8620, float %8621, float %8622, float %8623, float %8624, float %8625, float %8626, float %8627, float %8628, float %8629, float %8630, float %8631, float %8632, float %8633, float %8634, float %8635, float %8636, float %8637, float %8638, float %8639, float %8640, float %8641, float %8642, float %8643, float %8644, float %8645, float %8646, float %8647, float %8648, float %8649, float %8650, float %8651, float %8652, float %8653, float %8654, float %8655, float %8656, float %8657, float %8658, float %8659, float %8660, float %8661, float %8662, float %8663, float %8664, float %8665, float %8666, float %8667, float %8668, float %8669, float %8670, float %8671, float %8672, float %8673, float %8674, float %8675, float %8676, float %8677, float %8678, float %8679, float %8680, float %8681, i32 %8529, i32 %8530, i32 %8531, i32 %8532, i64 %8617, i1 true) #3, !dbg !306 + %8683 = add i32 %8537, 6144, !dbg !306 + %8684 = lshr exact i32 %8683, 4, !dbg !306 + %8685 = and i32 %8684, 16383, !dbg !306 + %8686 = zext nneg i32 %8685 to i64, !dbg !306 + %8687 = or disjoint i64 %8686, 4611686293338849280, !dbg !306 + %8688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 0, !dbg !306 + %8689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 1, !dbg !306 + %8690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 2, !dbg !306 + %8691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 3, !dbg !306 + %8692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 4, !dbg !306 + %8693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 5, !dbg !306 + %8694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 6, !dbg !306 + %8695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 7, !dbg !306 + %8696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 8, !dbg !306 + %8697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 9, !dbg !306 + %8698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 10, !dbg !306 + %8699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 11, !dbg !306 + %8700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 12, !dbg !306 + %8701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 13, !dbg !306 + %8702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 14, !dbg !306 + %8703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 15, !dbg !306 + %8704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 16, !dbg !306 + %8705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 17, !dbg !306 + %8706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 18, !dbg !306 + %8707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 19, !dbg !306 + %8708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 20, !dbg !306 + %8709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 21, !dbg !306 + %8710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 22, !dbg !306 + %8711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 23, !dbg !306 + %8712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 24, !dbg !306 + %8713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 25, !dbg !306 + %8714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 26, !dbg !306 + %8715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 27, !dbg !306 + %8716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 28, !dbg !306 + %8717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 29, !dbg !306 + %8718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 30, !dbg !306 + %8719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 31, !dbg !306 + %8720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 32, !dbg !306 + %8721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 33, !dbg !306 + %8722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 34, !dbg !306 + %8723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 35, !dbg !306 + %8724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 36, !dbg !306 + %8725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 37, !dbg !306 + %8726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 38, !dbg !306 + %8727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 39, !dbg !306 + %8728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 40, !dbg !306 + %8729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 41, !dbg !306 + %8730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 42, !dbg !306 + %8731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 43, !dbg !306 + %8732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 44, !dbg !306 + %8733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 45, !dbg !306 + %8734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 46, !dbg !306 + %8735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 47, !dbg !306 + %8736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 48, !dbg !306 + %8737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 49, !dbg !306 + %8738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 50, !dbg !306 + %8739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 51, !dbg !306 + %8740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 52, !dbg !306 + %8741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 53, !dbg !306 + %8742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 54, !dbg !306 + %8743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 55, !dbg !306 + %8744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 56, !dbg !306 + %8745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 57, !dbg !306 + %8746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 58, !dbg !306 + %8747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 59, !dbg !306 + %8748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 60, !dbg !306 + %8749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 61, !dbg !306 + %8750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 62, !dbg !306 + %8751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8682, 63, !dbg !306 + %8752 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %8688, float %8689, float %8690, float %8691, float %8692, float %8693, float %8694, float %8695, float %8696, float %8697, float %8698, float %8699, float %8700, float %8701, float %8702, float %8703, float %8704, float %8705, float %8706, float %8707, float %8708, float %8709, float %8710, float %8711, float %8712, float %8713, float %8714, float %8715, float %8716, float %8717, float %8718, float %8719, float %8720, float %8721, float %8722, float %8723, float %8724, float %8725, float %8726, float %8727, float %8728, float %8729, float %8730, float %8731, float %8732, float %8733, float %8734, float %8735, float %8736, float %8737, float %8738, float %8739, float %8740, float %8741, float %8742, float %8743, float %8744, float %8745, float %8746, float %8747, float %8748, float %8749, float %8750, float %8751, i32 %8533, i32 %8534, i32 %8535, i32 %8536, i64 %8687, i1 true) #3, !dbg !306 + %8753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 0, !dbg !306 + %8754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 1, !dbg !306 + %8755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 2, !dbg !306 + %8756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 3, !dbg !306 + %8757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 4, !dbg !306 + %8758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 5, !dbg !306 + %8759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 6, !dbg !306 + %8760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 7, !dbg !306 + %8761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 8, !dbg !306 + %8762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 9, !dbg !306 + %8763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 10, !dbg !306 + %8764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 11, !dbg !306 + %8765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 12, !dbg !306 + %8766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 13, !dbg !306 + %8767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 14, !dbg !306 + %8768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 15, !dbg !306 + %8769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 16, !dbg !306 + %8770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 17, !dbg !306 + %8771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 18, !dbg !306 + %8772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 19, !dbg !306 + %8773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 20, !dbg !306 + %8774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 21, !dbg !306 + %8775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 22, !dbg !306 + %8776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 23, !dbg !306 + %8777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 24, !dbg !306 + %8778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 25, !dbg !306 + %8779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 26, !dbg !306 + %8780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 27, !dbg !306 + %8781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 28, !dbg !306 + %8782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 29, !dbg !306 + %8783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 30, !dbg !306 + %8784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 31, !dbg !306 + %8785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 32, !dbg !306 + %8786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 33, !dbg !306 + %8787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 34, !dbg !306 + %8788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 35, !dbg !306 + %8789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 36, !dbg !306 + %8790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 37, !dbg !306 + %8791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 38, !dbg !306 + %8792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 39, !dbg !306 + %8793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 40, !dbg !306 + %8794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 41, !dbg !306 + %8795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 42, !dbg !306 + %8796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 43, !dbg !306 + %8797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 44, !dbg !306 + %8798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 45, !dbg !306 + %8799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 46, !dbg !306 + %8800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 47, !dbg !306 + %8801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 48, !dbg !306 + %8802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 49, !dbg !306 + %8803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 50, !dbg !306 + %8804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 51, !dbg !306 + %8805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 52, !dbg !306 + %8806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 53, !dbg !306 + %8807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 54, !dbg !306 + %8808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 55, !dbg !306 + %8809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 56, !dbg !306 + %8810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 57, !dbg !306 + %8811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 58, !dbg !306 + %8812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 59, !dbg !306 + %8813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 60, !dbg !306 + %8814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 61, !dbg !306 + %8815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 62, !dbg !306 + %8816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8752, 63, !dbg !306 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !306 + %8817 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %7763, !dbg !294 + %8818 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4812, !dbg !294 + %8819 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4815, !dbg !294 + %8820 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4817, !dbg !294 + %8821 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4819, !dbg !294 + %8822 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4821, !dbg !294 + %8823 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4823, !dbg !294 + %8824 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4825, !dbg !294 + %8825 = getelementptr inbounds nuw i8, ptr addrspace(3) %8817, i32 %4827, !dbg !294 + %8826 = add i32 %7831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8827 = lshr exact i32 %8826, 4, !dbg !307 + %8828 = and i32 %8827, 16383, !dbg !307 + %8829 = zext nneg i32 %8828 to i64, !dbg !307 + %8830 = or disjoint i64 %8829, 4611686293372403712, !dbg !307 + %8831 = add i32 %7843, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8832 = lshr exact i32 %8831, 4, !dbg !307 + %8833 = and i32 %8832, 16383, !dbg !307 + %8834 = zext nneg i32 %8833 to i64, !dbg !307 + %8835 = or disjoint i64 %8834, 4611686293372403712, !dbg !307 + %8836 = add i32 %8537, 32, !dbg !307 + %8837 = lshr exact i32 %8836, 4, !dbg !307 + %8838 = and i32 %8837, 16383, !dbg !307 + %8839 = zext nneg i32 %8838 to i64, !dbg !307 + %8840 = or disjoint i64 %8839, 4611686293338849280, !dbg !307 + %8841 = add i32 %7887, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8842 = lshr exact i32 %8841, 4, !dbg !307 + %8843 = and i32 %8842, 16383, !dbg !307 + %8844 = zext nneg i32 %8843 to i64, !dbg !307 + %8845 = or disjoint i64 %8844, 4611686293372403712, !dbg !307 + %8846 = add i32 %8537, 64, !dbg !307 + %8847 = lshr exact i32 %8846, 4, !dbg !307 + %8848 = and i32 %8847, 16383, !dbg !307 + %8849 = zext nneg i32 %8848 to i64, !dbg !307 + %8850 = or disjoint i64 %8849, 4611686293338849280, !dbg !307 + %8851 = add i32 %7931, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8852 = lshr exact i32 %8851, 4, !dbg !307 + %8853 = and i32 %8852, 16383, !dbg !307 + %8854 = zext nneg i32 %8853 to i64, !dbg !307 + %8855 = or disjoint i64 %8854, 4611686293372403712, !dbg !307 + %8856 = add i32 %8537, 96, !dbg !307 + %8857 = lshr exact i32 %8856, 4, !dbg !307 + %8858 = and i32 %8857, 16383, !dbg !307 + %8859 = zext nneg i32 %8858 to i64, !dbg !307 + %8860 = or disjoint i64 %8859, 4611686293338849280, !dbg !307 + %8861 = add i32 %7975, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8862 = lshr exact i32 %8861, 4, !dbg !307 + %8863 = and i32 %8862, 16383, !dbg !307 + %8864 = zext nneg i32 %8863 to i64, !dbg !307 + %8865 = or disjoint i64 %8864, 4611686293372403712, !dbg !307 + %8866 = add i32 %8537, 8192, !dbg !307 + %8867 = lshr exact i32 %8866, 4, !dbg !307 + %8868 = and i32 %8867, 16383, !dbg !307 + %8869 = zext nneg i32 %8868 to i64, !dbg !307 + %8870 = or disjoint i64 %8869, 4611686293338849280, !dbg !307 + %8871 = add i32 %8019, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8872 = lshr exact i32 %8871, 4, !dbg !307 + %8873 = and i32 %8872, 16383, !dbg !307 + %8874 = zext nneg i32 %8873 to i64, !dbg !307 + %8875 = or disjoint i64 %8874, 4611686293372403712, !dbg !307 + %8876 = add i32 %8537, 8224, !dbg !307 + %8877 = lshr exact i32 %8876, 4, !dbg !307 + %8878 = and i32 %8877, 16383, !dbg !307 + %8879 = zext nneg i32 %8878 to i64, !dbg !307 + %8880 = or disjoint i64 %8879, 4611686293338849280, !dbg !307 + %8881 = add i32 %8063, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8882 = lshr exact i32 %8881, 4, !dbg !307 + %8883 = and i32 %8882, 16383, !dbg !307 + %8884 = zext nneg i32 %8883 to i64, !dbg !307 + %8885 = or disjoint i64 %8884, 4611686293372403712, !dbg !307 + %8886 = add i32 %8537, 8256, !dbg !307 + %8887 = lshr exact i32 %8886, 4, !dbg !307 + %8888 = and i32 %8887, 16383, !dbg !307 + %8889 = zext nneg i32 %8888 to i64, !dbg !307 + %8890 = or disjoint i64 %8889, 4611686293338849280, !dbg !307 + %8891 = add i32 %8107, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096) to i32), !dbg !307 + %8892 = lshr exact i32 %8891, 4, !dbg !307 + %8893 = and i32 %8892, 16383, !dbg !307 + %8894 = zext nneg i32 %8893 to i64, !dbg !307 + %8895 = or disjoint i64 %8894, 4611686293372403712, !dbg !307 + %8896 = add i32 %8537, 8288, !dbg !307 + %8897 = lshr exact i32 %8896, 4, !dbg !307 + %8898 = and i32 %8897, 16383, !dbg !307 + %8899 = zext nneg i32 %8898 to i64, !dbg !307 + %8900 = or disjoint i64 %8899, 4611686293338849280, !dbg !307 + %8901 = load <2 x float>, ptr addrspace(3) %8825, align 8, !dbg !294 + %8902 = load <2 x float>, ptr addrspace(3) %8824, align 8, !dbg !294 + %8903 = load <2 x float>, ptr addrspace(3) %8823, align 8, !dbg !294 + %8904 = load <2 x float>, ptr addrspace(3) %8822, align 8, !dbg !294 + %8905 = load <2 x float>, ptr addrspace(3) %8821, align 8, !dbg !294 + %8906 = load <2 x float>, ptr addrspace(3) %8820, align 8, !dbg !294 + %8907 = load <2 x float>, ptr addrspace(3) %8819, align 8, !dbg !294 + %8908 = load <2 x float>, ptr addrspace(3) %8818, align 8, !dbg !294 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !307 + %8909 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %8830, i64 %8541) #3, !dbg !307 + %8910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 0, !dbg !307 + %8911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 1, !dbg !307 + %8912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 2, !dbg !307 + %8913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 3, !dbg !307 + %8914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 4, !dbg !307 + %8915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 5, !dbg !307 + %8916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 6, !dbg !307 + %8917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 7, !dbg !307 + %8918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 8, !dbg !307 + %8919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 9, !dbg !307 + %8920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 10, !dbg !307 + %8921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 11, !dbg !307 + %8922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 12, !dbg !307 + %8923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 13, !dbg !307 + %8924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 14, !dbg !307 + %8925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 15, !dbg !307 + %8926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 16, !dbg !307 + %8927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 17, !dbg !307 + %8928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 18, !dbg !307 + %8929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 19, !dbg !307 + %8930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 20, !dbg !307 + %8931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 21, !dbg !307 + %8932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 22, !dbg !307 + %8933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 23, !dbg !307 + %8934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 24, !dbg !307 + %8935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 25, !dbg !307 + %8936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 26, !dbg !307 + %8937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 27, !dbg !307 + %8938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 28, !dbg !307 + %8939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 29, !dbg !307 + %8940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 30, !dbg !307 + %8941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8909, 31, !dbg !307 + %8942 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8910, float %8911, float %8912, float %8913, float %8914, float %8915, float %8916, float %8917, float %8918, float %8919, float %8920, float %8921, float %8922, float %8923, float %8924, float %8925, float %8926, float %8927, float %8928, float %8929, float %8930, float %8931, float %8932, float %8933, float %8934, float %8935, float %8936, float %8937, float %8938, float %8939, float %8940, float %8941, i64 %8835, i64 %8840, i1 true) #3, !dbg !307 + %8943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 0, !dbg !307 + %8944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 1, !dbg !307 + %8945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 2, !dbg !307 + %8946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 3, !dbg !307 + %8947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 4, !dbg !307 + %8948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 5, !dbg !307 + %8949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 6, !dbg !307 + %8950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 7, !dbg !307 + %8951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 8, !dbg !307 + %8952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 9, !dbg !307 + %8953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 10, !dbg !307 + %8954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 11, !dbg !307 + %8955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 12, !dbg !307 + %8956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 13, !dbg !307 + %8957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 14, !dbg !307 + %8958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 15, !dbg !307 + %8959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 16, !dbg !307 + %8960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 17, !dbg !307 + %8961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 18, !dbg !307 + %8962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 19, !dbg !307 + %8963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 20, !dbg !307 + %8964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 21, !dbg !307 + %8965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 22, !dbg !307 + %8966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 23, !dbg !307 + %8967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 24, !dbg !307 + %8968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 25, !dbg !307 + %8969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 26, !dbg !307 + %8970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 27, !dbg !307 + %8971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 28, !dbg !307 + %8972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 29, !dbg !307 + %8973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 30, !dbg !307 + %8974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8942, 31, !dbg !307 + %8975 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8943, float %8944, float %8945, float %8946, float %8947, float %8948, float %8949, float %8950, float %8951, float %8952, float %8953, float %8954, float %8955, float %8956, float %8957, float %8958, float %8959, float %8960, float %8961, float %8962, float %8963, float %8964, float %8965, float %8966, float %8967, float %8968, float %8969, float %8970, float %8971, float %8972, float %8973, float %8974, i64 %8845, i64 %8850, i1 true) #3, !dbg !307 + %8976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 0, !dbg !307 + %8977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 1, !dbg !307 + %8978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 2, !dbg !307 + %8979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 3, !dbg !307 + %8980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 4, !dbg !307 + %8981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 5, !dbg !307 + %8982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 6, !dbg !307 + %8983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 7, !dbg !307 + %8984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 8, !dbg !307 + %8985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 9, !dbg !307 + %8986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 10, !dbg !307 + %8987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 11, !dbg !307 + %8988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 12, !dbg !307 + %8989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 13, !dbg !307 + %8990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 14, !dbg !307 + %8991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 15, !dbg !307 + %8992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 16, !dbg !307 + %8993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 17, !dbg !307 + %8994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 18, !dbg !307 + %8995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 19, !dbg !307 + %8996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 20, !dbg !307 + %8997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 21, !dbg !307 + %8998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 22, !dbg !307 + %8999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 23, !dbg !307 + %9000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 24, !dbg !307 + %9001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 25, !dbg !307 + %9002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 26, !dbg !307 + %9003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 27, !dbg !307 + %9004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 28, !dbg !307 + %9005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 29, !dbg !307 + %9006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 30, !dbg !307 + %9007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8975, 31, !dbg !307 + %9008 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8976, float %8977, float %8978, float %8979, float %8980, float %8981, float %8982, float %8983, float %8984, float %8985, float %8986, float %8987, float %8988, float %8989, float %8990, float %8991, float %8992, float %8993, float %8994, float %8995, float %8996, float %8997, float %8998, float %8999, float %9000, float %9001, float %9002, float %9003, float %9004, float %9005, float %9006, float %9007, i64 %8855, i64 %8860, i1 true) #3, !dbg !307 + %9009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 0, !dbg !307 + %9010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 1, !dbg !307 + %9011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 2, !dbg !307 + %9012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 3, !dbg !307 + %9013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 4, !dbg !307 + %9014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 5, !dbg !307 + %9015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 6, !dbg !307 + %9016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 7, !dbg !307 + %9017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 8, !dbg !307 + %9018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 9, !dbg !307 + %9019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 10, !dbg !307 + %9020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 11, !dbg !307 + %9021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 12, !dbg !307 + %9022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 13, !dbg !307 + %9023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 14, !dbg !307 + %9024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 15, !dbg !307 + %9025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 16, !dbg !307 + %9026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 17, !dbg !307 + %9027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 18, !dbg !307 + %9028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 19, !dbg !307 + %9029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 20, !dbg !307 + %9030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 21, !dbg !307 + %9031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 22, !dbg !307 + %9032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 23, !dbg !307 + %9033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 24, !dbg !307 + %9034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 25, !dbg !307 + %9035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 26, !dbg !307 + %9036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 27, !dbg !307 + %9037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 28, !dbg !307 + %9038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 29, !dbg !307 + %9039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 30, !dbg !307 + %9040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9008, 31, !dbg !307 + %9041 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9009, float %9010, float %9011, float %9012, float %9013, float %9014, float %9015, float %9016, float %9017, float %9018, float %9019, float %9020, float %9021, float %9022, float %9023, float %9024, float %9025, float %9026, float %9027, float %9028, float %9029, float %9030, float %9031, float %9032, float %9033, float %9034, float %9035, float %9036, float %9037, float %9038, float %9039, float %9040, i64 %8865, i64 %8870, i1 true) #3, !dbg !307 + %9042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 0, !dbg !307 + %9043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 1, !dbg !307 + %9044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 2, !dbg !307 + %9045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 3, !dbg !307 + %9046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 4, !dbg !307 + %9047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 5, !dbg !307 + %9048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 6, !dbg !307 + %9049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 7, !dbg !307 + %9050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 8, !dbg !307 + %9051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 9, !dbg !307 + %9052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 10, !dbg !307 + %9053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 11, !dbg !307 + %9054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 12, !dbg !307 + %9055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 13, !dbg !307 + %9056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 14, !dbg !307 + %9057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 15, !dbg !307 + %9058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 16, !dbg !307 + %9059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 17, !dbg !307 + %9060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 18, !dbg !307 + %9061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 19, !dbg !307 + %9062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 20, !dbg !307 + %9063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 21, !dbg !307 + %9064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 22, !dbg !307 + %9065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 23, !dbg !307 + %9066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 24, !dbg !307 + %9067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 25, !dbg !307 + %9068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 26, !dbg !307 + %9069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 27, !dbg !307 + %9070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 28, !dbg !307 + %9071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 29, !dbg !307 + %9072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 30, !dbg !307 + %9073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9041, 31, !dbg !307 + %9074 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9042, float %9043, float %9044, float %9045, float %9046, float %9047, float %9048, float %9049, float %9050, float %9051, float %9052, float %9053, float %9054, float %9055, float %9056, float %9057, float %9058, float %9059, float %9060, float %9061, float %9062, float %9063, float %9064, float %9065, float %9066, float %9067, float %9068, float %9069, float %9070, float %9071, float %9072, float %9073, i64 %8875, i64 %8880, i1 true) #3, !dbg !307 + %9075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 0, !dbg !307 + %9076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 1, !dbg !307 + %9077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 2, !dbg !307 + %9078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 3, !dbg !307 + %9079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 4, !dbg !307 + %9080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 5, !dbg !307 + %9081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 6, !dbg !307 + %9082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 7, !dbg !307 + %9083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 8, !dbg !307 + %9084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 9, !dbg !307 + %9085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 10, !dbg !307 + %9086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 11, !dbg !307 + %9087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 12, !dbg !307 + %9088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 13, !dbg !307 + %9089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 14, !dbg !307 + %9090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 15, !dbg !307 + %9091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 16, !dbg !307 + %9092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 17, !dbg !307 + %9093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 18, !dbg !307 + %9094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 19, !dbg !307 + %9095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 20, !dbg !307 + %9096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 21, !dbg !307 + %9097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 22, !dbg !307 + %9098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 23, !dbg !307 + %9099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 24, !dbg !307 + %9100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 25, !dbg !307 + %9101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 26, !dbg !307 + %9102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 27, !dbg !307 + %9103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 28, !dbg !307 + %9104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 29, !dbg !307 + %9105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 30, !dbg !307 + %9106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9074, 31, !dbg !307 + %9107 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9075, float %9076, float %9077, float %9078, float %9079, float %9080, float %9081, float %9082, float %9083, float %9084, float %9085, float %9086, float %9087, float %9088, float %9089, float %9090, float %9091, float %9092, float %9093, float %9094, float %9095, float %9096, float %9097, float %9098, float %9099, float %9100, float %9101, float %9102, float %9103, float %9104, float %9105, float %9106, i64 %8885, i64 %8890, i1 true) #3, !dbg !307 + %9108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 0, !dbg !307 + %9109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 1, !dbg !307 + %9110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 2, !dbg !307 + %9111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 3, !dbg !307 + %9112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 4, !dbg !307 + %9113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 5, !dbg !307 + %9114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 6, !dbg !307 + %9115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 7, !dbg !307 + %9116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 8, !dbg !307 + %9117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 9, !dbg !307 + %9118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 10, !dbg !307 + %9119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 11, !dbg !307 + %9120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 12, !dbg !307 + %9121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 13, !dbg !307 + %9122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 14, !dbg !307 + %9123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 15, !dbg !307 + %9124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 16, !dbg !307 + %9125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 17, !dbg !307 + %9126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 18, !dbg !307 + %9127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 19, !dbg !307 + %9128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 20, !dbg !307 + %9129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 21, !dbg !307 + %9130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 22, !dbg !307 + %9131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 23, !dbg !307 + %9132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 24, !dbg !307 + %9133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 25, !dbg !307 + %9134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 26, !dbg !307 + %9135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 27, !dbg !307 + %9136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 28, !dbg !307 + %9137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 29, !dbg !307 + %9138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 30, !dbg !307 + %9139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9107, 31, !dbg !307 + %9140 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9108, float %9109, float %9110, float %9111, float %9112, float %9113, float %9114, float %9115, float %9116, float %9117, float %9118, float %9119, float %9120, float %9121, float %9122, float %9123, float %9124, float %9125, float %9126, float %9127, float %9128, float %9129, float %9130, float %9131, float %9132, float %9133, float %9134, float %9135, float %9136, float %9137, float %9138, float %9139, i64 %8895, i64 %8900, i1 true) #3, !dbg !307 + %9141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 0, !dbg !307 + %9142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 1, !dbg !307 + %9143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 2, !dbg !307 + %9144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 3, !dbg !307 + %9145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 4, !dbg !307 + %9146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 5, !dbg !307 + %9147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 6, !dbg !307 + %9148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 7, !dbg !307 + %9149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 8, !dbg !307 + %9150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 9, !dbg !307 + %9151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 10, !dbg !307 + %9152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 11, !dbg !307 + %9153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 12, !dbg !307 + %9154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 13, !dbg !307 + %9155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 14, !dbg !307 + %9156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 15, !dbg !307 + %9157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 16, !dbg !307 + %9158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 17, !dbg !307 + %9159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 18, !dbg !307 + %9160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 19, !dbg !307 + %9161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 20, !dbg !307 + %9162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 21, !dbg !307 + %9163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 22, !dbg !307 + %9164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 23, !dbg !307 + %9165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 24, !dbg !307 + %9166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 25, !dbg !307 + %9167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 26, !dbg !307 + %9168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 27, !dbg !307 + %9169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 28, !dbg !307 + %9170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 29, !dbg !307 + %9171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 30, !dbg !307 + %9172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9140, 31, !dbg !307 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !307 + %9173 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %9141, float %9142, float %9143, float %9144, float %9145, float %9146, float %9147, float %9148, float %9149, float %9150, float %9151, float %9152, float %9153, float %9154, float %9155, float %9156, float %9157, float %9158, float %9159, float %9160, float %9161, float %9162, float %9163, float %9164, float %9165, float %9166, float %9167, float %9168, float %9169, float %9170, float %9171, float %9172, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 132096), i32 0, i32 0, ptr addrspace(3) %8472, i32 0, i32 0) #3, !dbg !307 + %9174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 0, !dbg !307 + %9175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 1, !dbg !307 + %9176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 2, !dbg !307 + %9177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 3, !dbg !307 + %9178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 4, !dbg !307 + %9179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 5, !dbg !307 + %9180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 6, !dbg !307 + %9181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 7, !dbg !307 + %9182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 8, !dbg !307 + %9183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 9, !dbg !307 + %9184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 10, !dbg !307 + %9185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 11, !dbg !307 + %9186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 12, !dbg !307 + %9187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 13, !dbg !307 + %9188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 14, !dbg !307 + %9189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 15, !dbg !307 + %9190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 16, !dbg !307 + %9191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 17, !dbg !307 + %9192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 18, !dbg !307 + %9193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 19, !dbg !307 + %9194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 20, !dbg !307 + %9195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 21, !dbg !307 + %9196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 22, !dbg !307 + %9197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 23, !dbg !307 + %9198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 24, !dbg !307 + %9199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 25, !dbg !307 + %9200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 26, !dbg !307 + %9201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 27, !dbg !307 + %9202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 28, !dbg !307 + %9203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 29, !dbg !307 + %9204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 30, !dbg !307 + %9205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9173, 31, !dbg !307 + %9206 = insertelement <2 x float> poison, float %9176, i64 0, !dbg !308 + %9207 = insertelement <2 x float> %9206, float %9177, i64 1, !dbg !308 + %9208 = fsub <2 x float> %9207, %8908, !dbg !308 + %9209 = insertelement <2 x float> poison, float %9180, i64 0, !dbg !308 + %9210 = insertelement <2 x float> %9209, float %9181, i64 1, !dbg !308 + %9211 = fsub <2 x float> %9210, %8907, !dbg !308 + %9212 = insertelement <2 x float> poison, float %9184, i64 0, !dbg !308 + %9213 = insertelement <2 x float> %9212, float %9185, i64 1, !dbg !308 + %9214 = fsub <2 x float> %9213, %8906, !dbg !308 + %9215 = insertelement <2 x float> poison, float %9188, i64 0, !dbg !308 + %9216 = insertelement <2 x float> %9215, float %9189, i64 1, !dbg !308 + %9217 = fsub <2 x float> %9216, %8905, !dbg !308 + %9218 = insertelement <2 x float> poison, float %9192, i64 0, !dbg !308 + %9219 = insertelement <2 x float> %9218, float %9193, i64 1, !dbg !308 + %9220 = fsub <2 x float> %9219, %8904, !dbg !308 + %9221 = insertelement <2 x float> poison, float %9196, i64 0, !dbg !308 + %9222 = insertelement <2 x float> %9221, float %9197, i64 1, !dbg !308 + %9223 = fsub <2 x float> %9222, %8903, !dbg !308 + %9224 = insertelement <2 x float> poison, float %9200, i64 0, !dbg !308 + %9225 = insertelement <2 x float> %9224, float %9201, i64 1, !dbg !308 + %9226 = fsub <2 x float> %9225, %8902, !dbg !308 + %9227 = insertelement <2 x float> poison, float %9204, i64 0, !dbg !308 + %9228 = insertelement <2 x float> %9227, float %9205, i64 1, !dbg !308 + %9229 = fsub <2 x float> %9228, %8901, !dbg !308 + %9230 = fmul <2 x float> %8477, %9208, !dbg !309 + %9231 = fmul <2 x float> %8483, %9211, !dbg !309 + %9232 = fmul <2 x float> %8489, %9214, !dbg !309 + %9233 = fmul <2 x float> %8495, %9217, !dbg !309 + %9234 = fmul <2 x float> %8501, %9220, !dbg !309 + %9235 = fmul <2 x float> %8507, %9223, !dbg !309 + %9236 = fmul <2 x float> %8513, %9226, !dbg !309 + %9237 = fmul <2 x float> %8519, %9229, !dbg !309 + %9238 = insertelement <2 x float> poison, float %9174, i64 0, !dbg !308 + %9239 = insertelement <2 x float> %9238, float %9175, i64 1, !dbg !308 + %9240 = fsub <2 x float> %9239, %8908, !dbg !308 + %9241 = fmul <2 x float> %8474, %9240, !dbg !309 + %9242 = fptrunc <2 x float> %9241 to <2 x bfloat>, !dbg !310 + %9243 = fptrunc <2 x float> %9230 to <2 x bfloat>, !dbg !310 + %9244 = insertelement <2 x float> poison, float %9178, i64 0, !dbg !308 + %9245 = insertelement <2 x float> %9244, float %9179, i64 1, !dbg !308 + %9246 = fsub <2 x float> %9245, %8907, !dbg !308 + %9247 = fmul <2 x float> %8480, %9246, !dbg !309 + %9248 = fptrunc <2 x float> %9247 to <2 x bfloat>, !dbg !310 + %9249 = fptrunc <2 x float> %9231 to <2 x bfloat>, !dbg !310 + %9250 = insertelement <2 x float> poison, float %9182, i64 0, !dbg !308 + %9251 = insertelement <2 x float> %9250, float %9183, i64 1, !dbg !308 + %9252 = fsub <2 x float> %9251, %8906, !dbg !308 + %9253 = fmul <2 x float> %8486, %9252, !dbg !309 + %9254 = fptrunc <2 x float> %9253 to <2 x bfloat>, !dbg !310 + %9255 = fptrunc <2 x float> %9232 to <2 x bfloat>, !dbg !310 + %9256 = insertelement <2 x float> poison, float %9186, i64 0, !dbg !308 + %9257 = insertelement <2 x float> %9256, float %9187, i64 1, !dbg !308 + %9258 = fsub <2 x float> %9257, %8905, !dbg !308 + %9259 = fmul <2 x float> %8492, %9258, !dbg !309 + %9260 = fptrunc <2 x float> %9259 to <2 x bfloat>, !dbg !310 + %9261 = fptrunc <2 x float> %9233 to <2 x bfloat>, !dbg !310 + %9262 = insertelement <2 x float> poison, float %9190, i64 0, !dbg !308 + %9263 = insertelement <2 x float> %9262, float %9191, i64 1, !dbg !308 + %9264 = fsub <2 x float> %9263, %8904, !dbg !308 + %9265 = fmul <2 x float> %8498, %9264, !dbg !309 + %9266 = fptrunc <2 x float> %9265 to <2 x bfloat>, !dbg !310 + %9267 = fptrunc <2 x float> %9234 to <2 x bfloat>, !dbg !310 + %9268 = insertelement <2 x float> poison, float %9194, i64 0, !dbg !308 + %9269 = insertelement <2 x float> %9268, float %9195, i64 1, !dbg !308 + %9270 = fsub <2 x float> %9269, %8903, !dbg !308 + %9271 = fmul <2 x float> %8504, %9270, !dbg !309 + %9272 = fptrunc <2 x float> %9271 to <2 x bfloat>, !dbg !310 + %9273 = fptrunc <2 x float> %9235 to <2 x bfloat>, !dbg !310 + %9274 = insertelement <2 x float> poison, float %9198, i64 0, !dbg !308 + %9275 = insertelement <2 x float> %9274, float %9199, i64 1, !dbg !308 + %9276 = fsub <2 x float> %9275, %8902, !dbg !308 + %9277 = fmul <2 x float> %8510, %9276, !dbg !309 + %9278 = fptrunc <2 x float> %9277 to <2 x bfloat>, !dbg !310 + %9279 = fptrunc <2 x float> %9236 to <2 x bfloat>, !dbg !310 + %9280 = insertelement <2 x float> poison, float %9202, i64 0, !dbg !308 + %9281 = insertelement <2 x float> %9280, float %9203, i64 1, !dbg !308 + %9282 = fsub <2 x float> %9281, %8901, !dbg !308 + %9283 = fmul <2 x float> %8516, %9282, !dbg !309 + %9284 = fptrunc <2 x float> %9283 to <2 x bfloat>, !dbg !310 + %9285 = fptrunc <2 x float> %9237 to <2 x bfloat>, !dbg !310 + %9286 = bitcast <2 x bfloat> %9242 to i32, !dbg !311 + %9287 = bitcast <2 x bfloat> %9243 to i32, !dbg !311 + %9288 = bitcast <2 x bfloat> %9248 to i32, !dbg !311 + %9289 = bitcast <2 x bfloat> %9249 to i32, !dbg !311 + %9290 = bitcast <2 x bfloat> %9254 to i32, !dbg !311 + %9291 = bitcast <2 x bfloat> %9255 to i32, !dbg !311 + %9292 = bitcast <2 x bfloat> %9260 to i32, !dbg !311 + %9293 = bitcast <2 x bfloat> %9261 to i32, !dbg !311 + %9294 = bitcast <2 x bfloat> %9266 to i32, !dbg !311 + %9295 = bitcast <2 x bfloat> %9267 to i32, !dbg !311 + %9296 = bitcast <2 x bfloat> %9272 to i32, !dbg !311 + %9297 = bitcast <2 x bfloat> %9273 to i32, !dbg !311 + %9298 = bitcast <2 x bfloat> %9278 to i32, !dbg !311 + %9299 = bitcast <2 x bfloat> %9279 to i32, !dbg !311 + %9300 = bitcast <2 x bfloat> %9284 to i32, !dbg !311 + %9301 = bitcast <2 x bfloat> %9285 to i32, !dbg !311 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !311 + %9302 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn3761546, float %.pn3741547, float %.pn3721548, float %.pn3701549, float %.pn3681550, float %.pn3661551, float %.pn3641552, float %.pn3621553, float %.pn3601554, float %.pn3581555, float %.pn3561556, float %.pn3541557, float %.pn3521558, float %.pn3501559, float %.pn3481560, float %.pn3461561, float %.pn3441562, float %.pn3421563, float %.pn3401564, float %.pn3381565, float %.pn3361566, float %.pn3341567, float %.pn3321568, float %.pn3301569, float %.pn3281570, float %.pn3261571, float %.pn3241572, float %.pn3221573, float %.pn3201574, float %.pn3181575, float %.pn3161576, float %.pn3141577, float %.pn3121578, float %.pn3101579, float %.pn3081580, float %.pn3061581, float %.pn3041582, float %.pn3021583, float %.pn3001584, float %.pn2981585, float %.pn2961586, float %.pn2941587, float %.pn2921588, float %.pn2901589, float %.pn2881590, float %.pn2861591, float %.pn2841592, float %.pn2821593, float %.pn2801594, float %.pn2781595, float %.pn2761596, float %.pn2741597, float %.pn2721598, float %.pn2701599, float %.pn2681600, float %.pn2661601, float %.pn2641602, float %.pn2621603, float %.pn2601604, float %.pn2581605, float %.pn2561606, float %.pn2541607, float %.pn2521608, float %.pn2501609, i32 %9286, i32 %9287, i32 %9288, i32 %9289, i64 %7841, i1 true) #3, !dbg !311 + %9303 = add i32 %7837, 2048, !dbg !311 + %9304 = lshr exact i32 %9303, 4, !dbg !311 + %9305 = and i32 %9304, 16383, !dbg !311 + %9306 = zext nneg i32 %9305 to i64, !dbg !311 + %9307 = or disjoint i64 %9306, 4611686293338849280, !dbg !311 + %9308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 0, !dbg !311 + %9309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 1, !dbg !311 + %9310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 2, !dbg !311 + %9311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 3, !dbg !311 + %9312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 4, !dbg !311 + %9313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 5, !dbg !311 + %9314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 6, !dbg !311 + %9315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 7, !dbg !311 + %9316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 8, !dbg !311 + %9317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 9, !dbg !311 + %9318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 10, !dbg !311 + %9319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 11, !dbg !311 + %9320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 12, !dbg !311 + %9321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 13, !dbg !311 + %9322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 14, !dbg !311 + %9323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 15, !dbg !311 + %9324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 16, !dbg !311 + %9325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 17, !dbg !311 + %9326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 18, !dbg !311 + %9327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 19, !dbg !311 + %9328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 20, !dbg !311 + %9329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 21, !dbg !311 + %9330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 22, !dbg !311 + %9331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 23, !dbg !311 + %9332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 24, !dbg !311 + %9333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 25, !dbg !311 + %9334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 26, !dbg !311 + %9335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 27, !dbg !311 + %9336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 28, !dbg !311 + %9337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 29, !dbg !311 + %9338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 30, !dbg !311 + %9339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 31, !dbg !311 + %9340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 32, !dbg !311 + %9341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 33, !dbg !311 + %9342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 34, !dbg !311 + %9343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 35, !dbg !311 + %9344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 36, !dbg !311 + %9345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 37, !dbg !311 + %9346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 38, !dbg !311 + %9347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 39, !dbg !311 + %9348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 40, !dbg !311 + %9349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 41, !dbg !311 + %9350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 42, !dbg !311 + %9351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 43, !dbg !311 + %9352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 44, !dbg !311 + %9353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 45, !dbg !311 + %9354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 46, !dbg !311 + %9355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 47, !dbg !311 + %9356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 48, !dbg !311 + %9357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 49, !dbg !311 + %9358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 50, !dbg !311 + %9359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 51, !dbg !311 + %9360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 52, !dbg !311 + %9361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 53, !dbg !311 + %9362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 54, !dbg !311 + %9363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 55, !dbg !311 + %9364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 56, !dbg !311 + %9365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 57, !dbg !311 + %9366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 58, !dbg !311 + %9367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 59, !dbg !311 + %9368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 60, !dbg !311 + %9369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 61, !dbg !311 + %9370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 62, !dbg !311 + %9371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9302, 63, !dbg !311 + %9372 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9308, float %9309, float %9310, float %9311, float %9312, float %9313, float %9314, float %9315, float %9316, float %9317, float %9318, float %9319, float %9320, float %9321, float %9322, float %9323, float %9324, float %9325, float %9326, float %9327, float %9328, float %9329, float %9330, float %9331, float %9332, float %9333, float %9334, float %9335, float %9336, float %9337, float %9338, float %9339, float %9340, float %9341, float %9342, float %9343, float %9344, float %9345, float %9346, float %9347, float %9348, float %9349, float %9350, float %9351, float %9352, float %9353, float %9354, float %9355, float %9356, float %9357, float %9358, float %9359, float %9360, float %9361, float %9362, float %9363, float %9364, float %9365, float %9366, float %9367, float %9368, float %9369, float %9370, float %9371, i32 %9290, i32 %9291, i32 %9292, i32 %9293, i64 %9307, i1 true) #3, !dbg !311 + %9373 = add i32 %7837, 4096, !dbg !311 + %9374 = lshr exact i32 %9373, 4, !dbg !311 + %9375 = and i32 %9374, 16383, !dbg !311 + %9376 = zext nneg i32 %9375 to i64, !dbg !311 + %9377 = or disjoint i64 %9376, 4611686293338849280, !dbg !311 + %9378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 0, !dbg !311 + %9379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 1, !dbg !311 + %9380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 2, !dbg !311 + %9381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 3, !dbg !311 + %9382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 4, !dbg !311 + %9383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 5, !dbg !311 + %9384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 6, !dbg !311 + %9385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 7, !dbg !311 + %9386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 8, !dbg !311 + %9387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 9, !dbg !311 + %9388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 10, !dbg !311 + %9389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 11, !dbg !311 + %9390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 12, !dbg !311 + %9391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 13, !dbg !311 + %9392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 14, !dbg !311 + %9393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 15, !dbg !311 + %9394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 16, !dbg !311 + %9395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 17, !dbg !311 + %9396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 18, !dbg !311 + %9397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 19, !dbg !311 + %9398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 20, !dbg !311 + %9399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 21, !dbg !311 + %9400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 22, !dbg !311 + %9401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 23, !dbg !311 + %9402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 24, !dbg !311 + %9403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 25, !dbg !311 + %9404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 26, !dbg !311 + %9405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 27, !dbg !311 + %9406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 28, !dbg !311 + %9407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 29, !dbg !311 + %9408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 30, !dbg !311 + %9409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 31, !dbg !311 + %9410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 32, !dbg !311 + %9411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 33, !dbg !311 + %9412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 34, !dbg !311 + %9413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 35, !dbg !311 + %9414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 36, !dbg !311 + %9415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 37, !dbg !311 + %9416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 38, !dbg !311 + %9417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 39, !dbg !311 + %9418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 40, !dbg !311 + %9419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 41, !dbg !311 + %9420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 42, !dbg !311 + %9421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 43, !dbg !311 + %9422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 44, !dbg !311 + %9423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 45, !dbg !311 + %9424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 46, !dbg !311 + %9425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 47, !dbg !311 + %9426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 48, !dbg !311 + %9427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 49, !dbg !311 + %9428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 50, !dbg !311 + %9429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 51, !dbg !311 + %9430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 52, !dbg !311 + %9431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 53, !dbg !311 + %9432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 54, !dbg !311 + %9433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 55, !dbg !311 + %9434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 56, !dbg !311 + %9435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 57, !dbg !311 + %9436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 58, !dbg !311 + %9437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 59, !dbg !311 + %9438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 60, !dbg !311 + %9439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 61, !dbg !311 + %9440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 62, !dbg !311 + %9441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9372, 63, !dbg !311 + %9442 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9378, float %9379, float %9380, float %9381, float %9382, float %9383, float %9384, float %9385, float %9386, float %9387, float %9388, float %9389, float %9390, float %9391, float %9392, float %9393, float %9394, float %9395, float %9396, float %9397, float %9398, float %9399, float %9400, float %9401, float %9402, float %9403, float %9404, float %9405, float %9406, float %9407, float %9408, float %9409, float %9410, float %9411, float %9412, float %9413, float %9414, float %9415, float %9416, float %9417, float %9418, float %9419, float %9420, float %9421, float %9422, float %9423, float %9424, float %9425, float %9426, float %9427, float %9428, float %9429, float %9430, float %9431, float %9432, float %9433, float %9434, float %9435, float %9436, float %9437, float %9438, float %9439, float %9440, float %9441, i32 %9294, i32 %9295, i32 %9296, i32 %9297, i64 %9377, i1 true) #3, !dbg !311 + %9443 = add i32 %7837, 6144, !dbg !311 + %9444 = lshr exact i32 %9443, 4, !dbg !311 + %9445 = and i32 %9444, 16383, !dbg !311 + %9446 = zext nneg i32 %9445 to i64, !dbg !311 + %9447 = or disjoint i64 %9446, 4611686293338849280, !dbg !311 + %9448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 0, !dbg !311 + %9449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 1, !dbg !311 + %9450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 2, !dbg !311 + %9451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 3, !dbg !311 + %9452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 4, !dbg !311 + %9453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 5, !dbg !311 + %9454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 6, !dbg !311 + %9455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 7, !dbg !311 + %9456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 8, !dbg !311 + %9457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 9, !dbg !311 + %9458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 10, !dbg !311 + %9459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 11, !dbg !311 + %9460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 12, !dbg !311 + %9461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 13, !dbg !311 + %9462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 14, !dbg !311 + %9463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 15, !dbg !311 + %9464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 16, !dbg !311 + %9465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 17, !dbg !311 + %9466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 18, !dbg !311 + %9467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 19, !dbg !311 + %9468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 20, !dbg !311 + %9469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 21, !dbg !311 + %9470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 22, !dbg !311 + %9471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 23, !dbg !311 + %9472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 24, !dbg !311 + %9473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 25, !dbg !311 + %9474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 26, !dbg !311 + %9475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 27, !dbg !311 + %9476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 28, !dbg !311 + %9477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 29, !dbg !311 + %9478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 30, !dbg !311 + %9479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 31, !dbg !311 + %9480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 32, !dbg !311 + %9481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 33, !dbg !311 + %9482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 34, !dbg !311 + %9483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 35, !dbg !311 + %9484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 36, !dbg !311 + %9485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 37, !dbg !311 + %9486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 38, !dbg !311 + %9487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 39, !dbg !311 + %9488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 40, !dbg !311 + %9489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 41, !dbg !311 + %9490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 42, !dbg !311 + %9491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 43, !dbg !311 + %9492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 44, !dbg !311 + %9493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 45, !dbg !311 + %9494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 46, !dbg !311 + %9495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 47, !dbg !311 + %9496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 48, !dbg !311 + %9497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 49, !dbg !311 + %9498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 50, !dbg !311 + %9499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 51, !dbg !311 + %9500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 52, !dbg !311 + %9501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 53, !dbg !311 + %9502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 54, !dbg !311 + %9503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 55, !dbg !311 + %9504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 56, !dbg !311 + %9505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 57, !dbg !311 + %9506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 58, !dbg !311 + %9507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 59, !dbg !311 + %9508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 60, !dbg !311 + %9509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 61, !dbg !311 + %9510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 62, !dbg !311 + %9511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9442, 63, !dbg !311 + %9512 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9448, float %9449, float %9450, float %9451, float %9452, float %9453, float %9454, float %9455, float %9456, float %9457, float %9458, float %9459, float %9460, float %9461, float %9462, float %9463, float %9464, float %9465, float %9466, float %9467, float %9468, float %9469, float %9470, float %9471, float %9472, float %9473, float %9474, float %9475, float %9476, float %9477, float %9478, float %9479, float %9480, float %9481, float %9482, float %9483, float %9484, float %9485, float %9486, float %9487, float %9488, float %9489, float %9490, float %9491, float %9492, float %9493, float %9494, float %9495, float %9496, float %9497, float %9498, float %9499, float %9500, float %9501, float %9502, float %9503, float %9504, float %9505, float %9506, float %9507, float %9508, float %9509, float %9510, float %9511, i32 %9298, i32 %9299, i32 %9300, i32 %9301, i64 %9447, i1 true) #3, !dbg !311 + %9513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 0, !dbg !311 + %9514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 1, !dbg !311 + %9515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 2, !dbg !311 + %9516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 3, !dbg !311 + %9517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 4, !dbg !311 + %9518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 5, !dbg !311 + %9519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 6, !dbg !311 + %9520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 7, !dbg !311 + %9521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 8, !dbg !311 + %9522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 9, !dbg !311 + %9523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 10, !dbg !311 + %9524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 11, !dbg !311 + %9525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 12, !dbg !311 + %9526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 13, !dbg !311 + %9527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 14, !dbg !311 + %9528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 15, !dbg !311 + %9529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 16, !dbg !311 + %9530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 17, !dbg !311 + %9531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 18, !dbg !311 + %9532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 19, !dbg !311 + %9533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 20, !dbg !311 + %9534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 21, !dbg !311 + %9535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 22, !dbg !311 + %9536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 23, !dbg !311 + %9537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 24, !dbg !311 + %9538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 25, !dbg !311 + %9539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 26, !dbg !311 + %9540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 27, !dbg !311 + %9541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 28, !dbg !311 + %9542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 29, !dbg !311 + %9543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 30, !dbg !311 + %9544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 31, !dbg !311 + %9545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 32, !dbg !311 + %9546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 33, !dbg !311 + %9547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 34, !dbg !311 + %9548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 35, !dbg !311 + %9549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 36, !dbg !311 + %9550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 37, !dbg !311 + %9551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 38, !dbg !311 + %9552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 39, !dbg !311 + %9553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 40, !dbg !311 + %9554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 41, !dbg !311 + %9555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 42, !dbg !311 + %9556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 43, !dbg !311 + %9557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 44, !dbg !311 + %9558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 45, !dbg !311 + %9559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 46, !dbg !311 + %9560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 47, !dbg !311 + %9561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 48, !dbg !311 + %9562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 49, !dbg !311 + %9563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 50, !dbg !311 + %9564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 51, !dbg !311 + %9565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 52, !dbg !311 + %9566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 53, !dbg !311 + %9567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 54, !dbg !311 + %9568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 55, !dbg !311 + %9569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 56, !dbg !311 + %9570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 57, !dbg !311 + %9571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 58, !dbg !311 + %9572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 59, !dbg !311 + %9573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 60, !dbg !311 + %9574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 61, !dbg !311 + %9575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 62, !dbg !311 + %9576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9512, 63, !dbg !311 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !311 + %9577 = add nuw nsw i32 %7752, 1, !dbg !297 + %9578 = lshr i32 %9577, 1, !dbg !312 + %9579 = zext nneg i32 %9578 to i64, !dbg !313 + %9580 = getelementptr i32, ptr addrspace(1) %4756, i64 %9579, !dbg !313 + %9581 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !314 + %9582 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %9580, i64 %9581, i1 %7754) #3, !dbg !314 + %9583 = add nuw nsw i32 %9578, 1, !dbg !315 + %9584 = icmp slt i32 %9583, %4760, !dbg !316 + %9585 = getelementptr i8, ptr addrspace(1) %9580, i64 4, !dbg !317 + %9586 = and i1 %7754, %9584, !dbg !297 + %9587 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !318 + %9588 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %9585, i64 %9587, i1 %9586) #3, !dbg !318 + %9589 = and i32 %7752, 1, !dbg !319 + %9590 = sub i32 %9588, %9582, !dbg !320 + %9591 = shl i32 %9590, 7, !dbg !321 + %9592 = add i32 %9591, -64, !dbg !322 + %9593 = xor i32 %9589, 1, !dbg !323 + %9594 = mul nuw nsw i32 %9592, %9593, !dbg !323 + %9595 = shl nuw nsw i32 %9589, 6, !dbg !324 + %9596 = add i32 %9594, %9595, !dbg !325 + %9597 = shl i32 %9596, 12, !dbg !326 + %9598 = sext i32 %9597 to i64, !dbg !295 + %9599 = getelementptr bfloat, ptr addrspace(1) %.pn5681674, i64 %9598, !dbg !295 + %9600 = getelementptr bfloat, ptr addrspace(1) %.pn5521675, i64 %9598, !dbg !295 + %9601 = getelementptr bfloat, ptr addrspace(1) %.pn5361676, i64 %9598, !dbg !295 + %9602 = getelementptr bfloat, ptr addrspace(1) %.pn5201677, i64 %9598, !dbg !295 + %9603 = shl i32 %9596, 7, !dbg !327 + %9604 = sext i32 %9603 to i64, !dbg !296 + %9605 = getelementptr bfloat, ptr addrspace(1) %.pn6321678, i64 %9604, !dbg !296 + %9606 = getelementptr bfloat, ptr addrspace(1) %.pn6161679, i64 %9604, !dbg !296 + %9607 = getelementptr bfloat, ptr addrspace(1) %.pn6001680, i64 %9604, !dbg !296 + %9608 = getelementptr bfloat, ptr addrspace(1) %.pn5841681, i64 %9604, !dbg !296 + %9609 = add i32 %9596, %.pn6641682, !dbg !328 + %9610 = add i32 %9596, %.pn6601683, !dbg !328 + %9611 = add i32 %9596, %.pn6561684, !dbg !328 + %9612 = add i32 %9596, %.pn6521685, !dbg !328 + %9613 = add i32 %9596, %.pn6481686, !dbg !328 + %9614 = add i32 %9596, %.pn6441687, !dbg !328 + %9615 = add i32 %9596, %.pn6401688, !dbg !328 + %9616 = add i32 %9596, %.pn6361689, !dbg !328 + %9617 = add i32 %7749, 1, !dbg !297 + %9618 = icmp sgt i32 %9617, 1, !dbg !297 + %9619 = select i1 %9618, i32 0, i32 %9617, !dbg !297 + %9620 = add i32 %7751, 1, !dbg !297 + %9621 = icmp sgt i32 %9620, 2, !dbg !297 + %9622 = select i1 %9621, i32 0, i32 %9620, !dbg !297 + %9623 = shl i32 %9622, 13, !dbg !290 + %9624 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %9623, !dbg !290 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !290 + %9625 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4793, !dbg !290 + %9626 = select i1 %7753, i32 16, i32 0, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %9625, ptr addrspace(1) %9599, i32 %9626) #3, !dbg !290 + %9627 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4796, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9627, ptr addrspace(1) %9600, i32 %9626) #3, !dbg !290 + %9628 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4798, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9628, ptr addrspace(1) %9601, i32 %9626) #3, !dbg !290 + %9629 = getelementptr inbounds nuw i8, ptr addrspace(3) %9624, i32 %4800, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9629, ptr addrspace(1) %9602, i32 %9626) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %9630 = sext i32 %9609 to i64, !dbg !291 + %9631 = getelementptr float, ptr addrspace(1) %5087, i64 %9630, !dbg !291 + %9632 = sext i32 %9610 to i64, !dbg !291 + %9633 = getelementptr float, ptr addrspace(1) %5087, i64 %9632, !dbg !291 + %9634 = sext i32 %9611 to i64, !dbg !291 + %9635 = getelementptr float, ptr addrspace(1) %5087, i64 %9634, !dbg !291 + %9636 = sext i32 %9612 to i64, !dbg !291 + %9637 = getelementptr float, ptr addrspace(1) %5087, i64 %9636, !dbg !291 + %9638 = sext i32 %9613 to i64, !dbg !291 + %9639 = getelementptr float, ptr addrspace(1) %5087, i64 %9638, !dbg !291 + %9640 = sext i32 %9614 to i64, !dbg !291 + %9641 = getelementptr float, ptr addrspace(1) %5087, i64 %9640, !dbg !291 + %9642 = sext i32 %9615 to i64, !dbg !291 + %9643 = getelementptr float, ptr addrspace(1) %5087, i64 %9642, !dbg !291 + %9644 = sext i32 %9616 to i64, !dbg !291 + %9645 = getelementptr float, ptr addrspace(1) %5087, i64 %9644, !dbg !291 + %9646 = shl i32 %9619, 6, !dbg !292 + %9647 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %9646, !dbg !292 + %9648 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4812, !dbg !292 + %9649 = select i1 %7753, i32 8, i32 0, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %9648, ptr addrspace(1) %9631, i32 %9649, i1 %4811) #3, !dbg !292 + %9650 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4815, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9650, ptr addrspace(1) %9633, i32 %9649, i1 %4811) #3, !dbg !292 + %9651 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4817, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9651, ptr addrspace(1) %9635, i32 %9649, i1 %4811) #3, !dbg !292 + %9652 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4819, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9652, ptr addrspace(1) %9637, i32 %9649, i1 %4811) #3, !dbg !292 + %9653 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4821, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9653, ptr addrspace(1) %9639, i32 %9649, i1 %4811) #3, !dbg !292 + %9654 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4823, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9654, ptr addrspace(1) %9641, i32 %9649, i1 %4811) #3, !dbg !292 + %9655 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4825, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9655, ptr addrspace(1) %9643, i32 %9649, i1 %4811) #3, !dbg !292 + %9656 = getelementptr inbounds nuw i8, ptr addrspace(3) %9647, i32 %4827, !dbg !292 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9656, ptr addrspace(1) %9645, i32 %9649, i1 %4811) #3, !dbg !292 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !292 + %9657 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %9623, !dbg !290 + %9658 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4793, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %9658, ptr addrspace(1) %9605, i32 %9626) #3, !dbg !290 + %9659 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4796, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9659, ptr addrspace(1) %9606, i32 %9626) #3, !dbg !290 + %9660 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4798, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9660, ptr addrspace(1) %9607, i32 %9626) #3, !dbg !290 + %9661 = getelementptr inbounds nuw i8, ptr addrspace(3) %9657, i32 %4800, !dbg !290 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %9661, ptr addrspace(1) %9608, i32 %9626) #3, !dbg !290 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !290 + %9662 = getelementptr float, ptr addrspace(1) %5088, i64 %9630, !dbg !293 + %9663 = getelementptr float, ptr addrspace(1) %5088, i64 %9632, !dbg !293 + %9664 = getelementptr float, ptr addrspace(1) %5088, i64 %9634, !dbg !293 + %9665 = getelementptr float, ptr addrspace(1) %5088, i64 %9636, !dbg !293 + %9666 = getelementptr float, ptr addrspace(1) %5088, i64 %9638, !dbg !293 + %9667 = getelementptr float, ptr addrspace(1) %5088, i64 %9640, !dbg !293 + %9668 = getelementptr float, ptr addrspace(1) %5088, i64 %9642, !dbg !293 + %9669 = getelementptr float, ptr addrspace(1) %5088, i64 %9644, !dbg !293 + %9670 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98816), i32 %9646, !dbg !294 + %9671 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4812, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %9671, ptr addrspace(1) %9662, i32 %9649, i1 %4811) #3, !dbg !294 + %9672 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4815, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9672, ptr addrspace(1) %9663, i32 %9649, i1 %4811) #3, !dbg !294 + %9673 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4817, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9673, ptr addrspace(1) %9664, i32 %9649, i1 %4811) #3, !dbg !294 + %9674 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4819, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9674, ptr addrspace(1) %9665, i32 %9649, i1 %4811) #3, !dbg !294 + %9675 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4821, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9675, ptr addrspace(1) %9666, i32 %9649, i1 %4811) #3, !dbg !294 + %9676 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4823, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9676, ptr addrspace(1) %9667, i32 %9649, i1 %4811) #3, !dbg !294 + %9677 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4825, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9677, ptr addrspace(1) %9668, i32 %9649, i1 %4811) #3, !dbg !294 + %9678 = getelementptr inbounds nuw i8, ptr addrspace(3) %9670, i32 %4827, !dbg !294 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %9678, ptr addrspace(1) %9669, i32 %9649, i1 %4811) #3, !dbg !294 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !294 + %exitcond2122.not = icmp eq i32 %9577, %smax2121, !dbg !297 + br i1 %exitcond2122.not, label %._crit_edge1692, label %.lr.ph1691, !dbg !297 + +._crit_edge1692: ; preds = %__nv_exp2f.exit1219, %._crit_edge + %.pn376.lcssa = phi float [ %7628, %._crit_edge ], [ %9513, %__nv_exp2f.exit1219 ] + %.pn374.lcssa = phi float [ %7629, %._crit_edge ], [ %9514, %__nv_exp2f.exit1219 ] + %.pn372.lcssa = phi float [ %7630, %._crit_edge ], [ %9515, %__nv_exp2f.exit1219 ] + %.pn370.lcssa = phi float [ %7631, %._crit_edge ], [ %9516, %__nv_exp2f.exit1219 ] + %.pn368.lcssa = phi float [ %7632, %._crit_edge ], [ %9517, %__nv_exp2f.exit1219 ] + %.pn366.lcssa = phi float [ %7633, %._crit_edge ], [ %9518, %__nv_exp2f.exit1219 ] + %.pn364.lcssa = phi float [ %7634, %._crit_edge ], [ %9519, %__nv_exp2f.exit1219 ] + %.pn362.lcssa = phi float [ %7635, %._crit_edge ], [ %9520, %__nv_exp2f.exit1219 ] + %.pn360.lcssa = phi float [ %7636, %._crit_edge ], [ %9521, %__nv_exp2f.exit1219 ] + %.pn358.lcssa = phi float [ %7637, %._crit_edge ], [ %9522, %__nv_exp2f.exit1219 ] + %.pn356.lcssa = phi float [ %7638, %._crit_edge ], [ %9523, %__nv_exp2f.exit1219 ] + %.pn354.lcssa = phi float [ %7639, %._crit_edge ], [ %9524, %__nv_exp2f.exit1219 ] + %.pn352.lcssa = phi float [ %7640, %._crit_edge ], [ %9525, %__nv_exp2f.exit1219 ] + %.pn350.lcssa = phi float [ %7641, %._crit_edge ], [ %9526, %__nv_exp2f.exit1219 ] + %.pn348.lcssa = phi float [ %7642, %._crit_edge ], [ %9527, %__nv_exp2f.exit1219 ] + %.pn346.lcssa = phi float [ %7643, %._crit_edge ], [ %9528, %__nv_exp2f.exit1219 ] + %.pn344.lcssa = phi float [ %7644, %._crit_edge ], [ %9529, %__nv_exp2f.exit1219 ] + %.pn342.lcssa = phi float [ %7645, %._crit_edge ], [ %9530, %__nv_exp2f.exit1219 ] + %.pn340.lcssa = phi float [ %7646, %._crit_edge ], [ %9531, %__nv_exp2f.exit1219 ] + %.pn338.lcssa = phi float [ %7647, %._crit_edge ], [ %9532, %__nv_exp2f.exit1219 ] + %.pn336.lcssa = phi float [ %7648, %._crit_edge ], [ %9533, %__nv_exp2f.exit1219 ] + %.pn334.lcssa = phi float [ %7649, %._crit_edge ], [ %9534, %__nv_exp2f.exit1219 ] + %.pn332.lcssa = phi float [ %7650, %._crit_edge ], [ %9535, %__nv_exp2f.exit1219 ] + %.pn330.lcssa = phi float [ %7651, %._crit_edge ], [ %9536, %__nv_exp2f.exit1219 ] + %.pn328.lcssa = phi float [ %7652, %._crit_edge ], [ %9537, %__nv_exp2f.exit1219 ] + %.pn326.lcssa = phi float [ %7653, %._crit_edge ], [ %9538, %__nv_exp2f.exit1219 ] + %.pn324.lcssa = phi float [ %7654, %._crit_edge ], [ %9539, %__nv_exp2f.exit1219 ] + %.pn322.lcssa = phi float [ %7655, %._crit_edge ], [ %9540, %__nv_exp2f.exit1219 ] + %.pn320.lcssa = phi float [ %7656, %._crit_edge ], [ %9541, %__nv_exp2f.exit1219 ] + %.pn318.lcssa = phi float [ %7657, %._crit_edge ], [ %9542, %__nv_exp2f.exit1219 ] + %.pn316.lcssa = phi float [ %7658, %._crit_edge ], [ %9543, %__nv_exp2f.exit1219 ] + %.pn314.lcssa = phi float [ %7659, %._crit_edge ], [ %9544, %__nv_exp2f.exit1219 ] + %.pn312.lcssa = phi float [ %7660, %._crit_edge ], [ %9545, %__nv_exp2f.exit1219 ] + %.pn310.lcssa = phi float [ %7661, %._crit_edge ], [ %9546, %__nv_exp2f.exit1219 ] + %.pn308.lcssa = phi float [ %7662, %._crit_edge ], [ %9547, %__nv_exp2f.exit1219 ] + %.pn306.lcssa = phi float [ %7663, %._crit_edge ], [ %9548, %__nv_exp2f.exit1219 ] + %.pn304.lcssa = phi float [ %7664, %._crit_edge ], [ %9549, %__nv_exp2f.exit1219 ] + %.pn302.lcssa = phi float [ %7665, %._crit_edge ], [ %9550, %__nv_exp2f.exit1219 ] + %.pn300.lcssa = phi float [ %7666, %._crit_edge ], [ %9551, %__nv_exp2f.exit1219 ] + %.pn298.lcssa = phi float [ %7667, %._crit_edge ], [ %9552, %__nv_exp2f.exit1219 ] + %.pn296.lcssa = phi float [ %7668, %._crit_edge ], [ %9553, %__nv_exp2f.exit1219 ] + %.pn294.lcssa = phi float [ %7669, %._crit_edge ], [ %9554, %__nv_exp2f.exit1219 ] + %.pn292.lcssa = phi float [ %7670, %._crit_edge ], [ %9555, %__nv_exp2f.exit1219 ] + %.pn290.lcssa = phi float [ %7671, %._crit_edge ], [ %9556, %__nv_exp2f.exit1219 ] + %.pn288.lcssa = phi float [ %7672, %._crit_edge ], [ %9557, %__nv_exp2f.exit1219 ] + %.pn286.lcssa = phi float [ %7673, %._crit_edge ], [ %9558, %__nv_exp2f.exit1219 ] + %.pn284.lcssa = phi float [ %7674, %._crit_edge ], [ %9559, %__nv_exp2f.exit1219 ] + %.pn282.lcssa = phi float [ %7675, %._crit_edge ], [ %9560, %__nv_exp2f.exit1219 ] + %.pn280.lcssa = phi float [ %7676, %._crit_edge ], [ %9561, %__nv_exp2f.exit1219 ] + %.pn278.lcssa = phi float [ %7677, %._crit_edge ], [ %9562, %__nv_exp2f.exit1219 ] + %.pn276.lcssa = phi float [ %7678, %._crit_edge ], [ %9563, %__nv_exp2f.exit1219 ] + %.pn274.lcssa = phi float [ %7679, %._crit_edge ], [ %9564, %__nv_exp2f.exit1219 ] + %.pn272.lcssa = phi float [ %7680, %._crit_edge ], [ %9565, %__nv_exp2f.exit1219 ] + %.pn270.lcssa = phi float [ %7681, %._crit_edge ], [ %9566, %__nv_exp2f.exit1219 ] + %.pn268.lcssa = phi float [ %7682, %._crit_edge ], [ %9567, %__nv_exp2f.exit1219 ] + %.pn266.lcssa = phi float [ %7683, %._crit_edge ], [ %9568, %__nv_exp2f.exit1219 ] + %.pn264.lcssa = phi float [ %7684, %._crit_edge ], [ %9569, %__nv_exp2f.exit1219 ] + %.pn262.lcssa = phi float [ %7685, %._crit_edge ], [ %9570, %__nv_exp2f.exit1219 ] + %.pn260.lcssa = phi float [ %7686, %._crit_edge ], [ %9571, %__nv_exp2f.exit1219 ] + %.pn258.lcssa = phi float [ %7687, %._crit_edge ], [ %9572, %__nv_exp2f.exit1219 ] + %.pn256.lcssa = phi float [ %7688, %._crit_edge ], [ %9573, %__nv_exp2f.exit1219 ] + %.pn254.lcssa = phi float [ %7689, %._crit_edge ], [ %9574, %__nv_exp2f.exit1219 ] + %.pn252.lcssa = phi float [ %7690, %._crit_edge ], [ %9575, %__nv_exp2f.exit1219 ] + %.pn250.lcssa = phi float [ %7691, %._crit_edge ], [ %9576, %__nv_exp2f.exit1219 ] + %.pn504.lcssa = phi float [ %7564, %._crit_edge ], [ %8753, %__nv_exp2f.exit1219 ] + %.pn502.lcssa = phi float [ %7565, %._crit_edge ], [ %8754, %__nv_exp2f.exit1219 ] + %.pn500.lcssa = phi float [ %7566, %._crit_edge ], [ %8755, %__nv_exp2f.exit1219 ] + %.pn498.lcssa = phi float [ %7567, %._crit_edge ], [ %8756, %__nv_exp2f.exit1219 ] + %.pn496.lcssa = phi float [ %7568, %._crit_edge ], [ %8757, %__nv_exp2f.exit1219 ] + %.pn494.lcssa = phi float [ %7569, %._crit_edge ], [ %8758, %__nv_exp2f.exit1219 ] + %.pn492.lcssa = phi float [ %7570, %._crit_edge ], [ %8759, %__nv_exp2f.exit1219 ] + %.pn490.lcssa = phi float [ %7571, %._crit_edge ], [ %8760, %__nv_exp2f.exit1219 ] + %.pn488.lcssa = phi float [ %7572, %._crit_edge ], [ %8761, %__nv_exp2f.exit1219 ] + %.pn486.lcssa = phi float [ %7573, %._crit_edge ], [ %8762, %__nv_exp2f.exit1219 ] + %.pn484.lcssa = phi float [ %7574, %._crit_edge ], [ %8763, %__nv_exp2f.exit1219 ] + %.pn482.lcssa = phi float [ %7575, %._crit_edge ], [ %8764, %__nv_exp2f.exit1219 ] + %.pn480.lcssa = phi float [ %7576, %._crit_edge ], [ %8765, %__nv_exp2f.exit1219 ] + %.pn478.lcssa = phi float [ %7577, %._crit_edge ], [ %8766, %__nv_exp2f.exit1219 ] + %.pn476.lcssa = phi float [ %7578, %._crit_edge ], [ %8767, %__nv_exp2f.exit1219 ] + %.pn474.lcssa = phi float [ %7579, %._crit_edge ], [ %8768, %__nv_exp2f.exit1219 ] + %.pn472.lcssa = phi float [ %7580, %._crit_edge ], [ %8769, %__nv_exp2f.exit1219 ] + %.pn470.lcssa = phi float [ %7581, %._crit_edge ], [ %8770, %__nv_exp2f.exit1219 ] + %.pn468.lcssa = phi float [ %7582, %._crit_edge ], [ %8771, %__nv_exp2f.exit1219 ] + %.pn466.lcssa = phi float [ %7583, %._crit_edge ], [ %8772, %__nv_exp2f.exit1219 ] + %.pn464.lcssa = phi float [ %7584, %._crit_edge ], [ %8773, %__nv_exp2f.exit1219 ] + %.pn462.lcssa = phi float [ %7585, %._crit_edge ], [ %8774, %__nv_exp2f.exit1219 ] + %.pn460.lcssa = phi float [ %7586, %._crit_edge ], [ %8775, %__nv_exp2f.exit1219 ] + %.pn458.lcssa = phi float [ %7587, %._crit_edge ], [ %8776, %__nv_exp2f.exit1219 ] + %.pn456.lcssa = phi float [ %7588, %._crit_edge ], [ %8777, %__nv_exp2f.exit1219 ] + %.pn454.lcssa = phi float [ %7589, %._crit_edge ], [ %8778, %__nv_exp2f.exit1219 ] + %.pn452.lcssa = phi float [ %7590, %._crit_edge ], [ %8779, %__nv_exp2f.exit1219 ] + %.pn450.lcssa = phi float [ %7591, %._crit_edge ], [ %8780, %__nv_exp2f.exit1219 ] + %.pn448.lcssa = phi float [ %7592, %._crit_edge ], [ %8781, %__nv_exp2f.exit1219 ] + %.pn446.lcssa = phi float [ %7593, %._crit_edge ], [ %8782, %__nv_exp2f.exit1219 ] + %.pn444.lcssa = phi float [ %7594, %._crit_edge ], [ %8783, %__nv_exp2f.exit1219 ] + %.pn442.lcssa = phi float [ %7595, %._crit_edge ], [ %8784, %__nv_exp2f.exit1219 ] + %.pn440.lcssa = phi float [ %7596, %._crit_edge ], [ %8785, %__nv_exp2f.exit1219 ] + %.pn438.lcssa = phi float [ %7597, %._crit_edge ], [ %8786, %__nv_exp2f.exit1219 ] + %.pn436.lcssa = phi float [ %7598, %._crit_edge ], [ %8787, %__nv_exp2f.exit1219 ] + %.pn434.lcssa = phi float [ %7599, %._crit_edge ], [ %8788, %__nv_exp2f.exit1219 ] + %.pn432.lcssa = phi float [ %7600, %._crit_edge ], [ %8789, %__nv_exp2f.exit1219 ] + %.pn430.lcssa = phi float [ %7601, %._crit_edge ], [ %8790, %__nv_exp2f.exit1219 ] + %.pn428.lcssa = phi float [ %7602, %._crit_edge ], [ %8791, %__nv_exp2f.exit1219 ] + %.pn426.lcssa = phi float [ %7603, %._crit_edge ], [ %8792, %__nv_exp2f.exit1219 ] + %.pn424.lcssa = phi float [ %7604, %._crit_edge ], [ %8793, %__nv_exp2f.exit1219 ] + %.pn422.lcssa = phi float [ %7605, %._crit_edge ], [ %8794, %__nv_exp2f.exit1219 ] + %.pn420.lcssa = phi float [ %7606, %._crit_edge ], [ %8795, %__nv_exp2f.exit1219 ] + %.pn418.lcssa = phi float [ %7607, %._crit_edge ], [ %8796, %__nv_exp2f.exit1219 ] + %.pn416.lcssa = phi float [ %7608, %._crit_edge ], [ %8797, %__nv_exp2f.exit1219 ] + %.pn414.lcssa = phi float [ %7609, %._crit_edge ], [ %8798, %__nv_exp2f.exit1219 ] + %.pn412.lcssa = phi float [ %7610, %._crit_edge ], [ %8799, %__nv_exp2f.exit1219 ] + %.pn410.lcssa = phi float [ %7611, %._crit_edge ], [ %8800, %__nv_exp2f.exit1219 ] + %.pn408.lcssa = phi float [ %7612, %._crit_edge ], [ %8801, %__nv_exp2f.exit1219 ] + %.pn406.lcssa = phi float [ %7613, %._crit_edge ], [ %8802, %__nv_exp2f.exit1219 ] + %.pn404.lcssa = phi float [ %7614, %._crit_edge ], [ %8803, %__nv_exp2f.exit1219 ] + %.pn402.lcssa = phi float [ %7615, %._crit_edge ], [ %8804, %__nv_exp2f.exit1219 ] + %.pn400.lcssa = phi float [ %7616, %._crit_edge ], [ %8805, %__nv_exp2f.exit1219 ] + %.pn398.lcssa = phi float [ %7617, %._crit_edge ], [ %8806, %__nv_exp2f.exit1219 ] + %.pn396.lcssa = phi float [ %7618, %._crit_edge ], [ %8807, %__nv_exp2f.exit1219 ] + %.pn394.lcssa = phi float [ %7619, %._crit_edge ], [ %8808, %__nv_exp2f.exit1219 ] + %.pn392.lcssa = phi float [ %7620, %._crit_edge ], [ %8809, %__nv_exp2f.exit1219 ] + %.pn390.lcssa = phi float [ %7621, %._crit_edge ], [ %8810, %__nv_exp2f.exit1219 ] + %.pn388.lcssa = phi float [ %7622, %._crit_edge ], [ %8811, %__nv_exp2f.exit1219 ] + %.pn386.lcssa = phi float [ %7623, %._crit_edge ], [ %8812, %__nv_exp2f.exit1219 ] + %.pn384.lcssa = phi float [ %7624, %._crit_edge ], [ %8813, %__nv_exp2f.exit1219 ] + %.pn382.lcssa = phi float [ %7625, %._crit_edge ], [ %8814, %__nv_exp2f.exit1219 ] + %.pn380.lcssa = phi float [ %7626, %._crit_edge ], [ %8815, %__nv_exp2f.exit1219 ] + %.pn378.lcssa = phi float [ %7627, %._crit_edge ], [ %8816, %__nv_exp2f.exit1219 ] + %9679 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %.pn504.lcssa, float %.pn502.lcssa, float %.pn500.lcssa, float %.pn498.lcssa, float %.pn496.lcssa, float %.pn494.lcssa, float %.pn492.lcssa, float %.pn490.lcssa, float %.pn488.lcssa, float %.pn486.lcssa, float %.pn484.lcssa, float %.pn482.lcssa, float %.pn480.lcssa, float %.pn478.lcssa, float %.pn476.lcssa, float %.pn474.lcssa, float %.pn472.lcssa, float %.pn470.lcssa, float %.pn468.lcssa, float %.pn466.lcssa, float %.pn464.lcssa, float %.pn462.lcssa, float %.pn460.lcssa, float %.pn458.lcssa, float %.pn456.lcssa, float %.pn454.lcssa, float %.pn452.lcssa, float %.pn450.lcssa, float %.pn448.lcssa, float %.pn446.lcssa, float %.pn444.lcssa, float %.pn442.lcssa, float %.pn440.lcssa, float %.pn438.lcssa, float %.pn436.lcssa, float %.pn434.lcssa, float %.pn432.lcssa, float %.pn430.lcssa, float %.pn428.lcssa, float %.pn426.lcssa, float %.pn424.lcssa, float %.pn422.lcssa, float %.pn420.lcssa, float %.pn418.lcssa, float %.pn416.lcssa, float %.pn414.lcssa, float %.pn412.lcssa, float %.pn410.lcssa, float %.pn408.lcssa, float %.pn406.lcssa, float %.pn404.lcssa, float %.pn402.lcssa, float %.pn400.lcssa, float %.pn398.lcssa, float %.pn396.lcssa, float %.pn394.lcssa, float %.pn392.lcssa, float %.pn390.lcssa, float %.pn388.lcssa, float %.pn386.lcssa, float %.pn384.lcssa, float %.pn382.lcssa, float %.pn380.lcssa, float %.pn378.lcssa, float %.pn376.lcssa, float %.pn374.lcssa, float %.pn372.lcssa, float %.pn370.lcssa, float %.pn368.lcssa, float %.pn366.lcssa, float %.pn364.lcssa, float %.pn362.lcssa, float %.pn360.lcssa, float %.pn358.lcssa, float %.pn356.lcssa, float %.pn354.lcssa, float %.pn352.lcssa, float %.pn350.lcssa, float %.pn348.lcssa, float %.pn346.lcssa, float %.pn344.lcssa, float %.pn342.lcssa, float %.pn340.lcssa, float %.pn338.lcssa, float %.pn336.lcssa, float %.pn334.lcssa, float %.pn332.lcssa, float %.pn330.lcssa, float %.pn328.lcssa, float %.pn326.lcssa, float %.pn324.lcssa, float %.pn322.lcssa, float %.pn320.lcssa, float %.pn318.lcssa, float %.pn316.lcssa, float %.pn314.lcssa, float %.pn312.lcssa, float %.pn310.lcssa, float %.pn308.lcssa, float %.pn306.lcssa, float %.pn304.lcssa, float %.pn302.lcssa, float %.pn300.lcssa, float %.pn298.lcssa, float %.pn296.lcssa, float %.pn294.lcssa, float %.pn292.lcssa, float %.pn290.lcssa, float %.pn288.lcssa, float %.pn286.lcssa, float %.pn284.lcssa, float %.pn282.lcssa, float %.pn280.lcssa, float %.pn278.lcssa, float %.pn276.lcssa, float %.pn274.lcssa, float %.pn272.lcssa, float %.pn270.lcssa, float %.pn268.lcssa, float %.pn266.lcssa, float %.pn264.lcssa, float %.pn262.lcssa, float %.pn260.lcssa, float %.pn258.lcssa, float %.pn256.lcssa, float %.pn254.lcssa, float %.pn252.lcssa, float %.pn250.lcssa) #3, !dbg !297 + %9680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 0, !dbg !297 + %9681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 1, !dbg !297 + %9682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 2, !dbg !297 + %9683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 3, !dbg !297 + %9684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 4, !dbg !297 + %9685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 5, !dbg !297 + %9686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 6, !dbg !297 + %9687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 7, !dbg !297 + %9688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 8, !dbg !297 + %9689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 9, !dbg !297 + %9690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 10, !dbg !297 + %9691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 11, !dbg !297 + %9692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 12, !dbg !297 + %9693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 13, !dbg !297 + %9694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 14, !dbg !297 + %9695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 15, !dbg !297 + %9696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 16, !dbg !297 + %9697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 17, !dbg !297 + %9698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 18, !dbg !297 + %9699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 19, !dbg !297 + %9700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 20, !dbg !297 + %9701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 21, !dbg !297 + %9702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 22, !dbg !297 + %9703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 23, !dbg !297 + %9704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 24, !dbg !297 + %9705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 25, !dbg !297 + %9706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 26, !dbg !297 + %9707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 27, !dbg !297 + %9708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 28, !dbg !297 + %9709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 29, !dbg !297 + %9710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 30, !dbg !297 + %9711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 31, !dbg !297 + %9712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 32, !dbg !297 + %9713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 33, !dbg !297 + %9714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 34, !dbg !297 + %9715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 35, !dbg !297 + %9716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 36, !dbg !297 + %9717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 37, !dbg !297 + %9718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 38, !dbg !297 + %9719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 39, !dbg !297 + %9720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 40, !dbg !297 + %9721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 41, !dbg !297 + %9722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 42, !dbg !297 + %9723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 43, !dbg !297 + %9724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 44, !dbg !297 + %9725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 45, !dbg !297 + %9726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 46, !dbg !297 + %9727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 47, !dbg !297 + %9728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 48, !dbg !297 + %9729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 49, !dbg !297 + %9730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 50, !dbg !297 + %9731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 51, !dbg !297 + %9732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 52, !dbg !297 + %9733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 53, !dbg !297 + %9734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 54, !dbg !297 + %9735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 55, !dbg !297 + %9736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 56, !dbg !297 + %9737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 57, !dbg !297 + %9738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 58, !dbg !297 + %9739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 59, !dbg !297 + %9740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 60, !dbg !297 + %9741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 61, !dbg !297 + %9742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 62, !dbg !297 + %9743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 63, !dbg !297 + %9744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 64, !dbg !297 + %9745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 65, !dbg !297 + %9746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 66, !dbg !297 + %9747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 67, !dbg !297 + %9748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 68, !dbg !297 + %9749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 69, !dbg !297 + %9750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 70, !dbg !297 + %9751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 71, !dbg !297 + %9752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 72, !dbg !297 + %9753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 73, !dbg !297 + %9754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 74, !dbg !297 + %9755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 75, !dbg !297 + %9756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 76, !dbg !297 + %9757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 77, !dbg !297 + %9758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 78, !dbg !297 + %9759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 79, !dbg !297 + %9760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 80, !dbg !297 + %9761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 81, !dbg !297 + %9762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 82, !dbg !297 + %9763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 83, !dbg !297 + %9764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 84, !dbg !297 + %9765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 85, !dbg !297 + %9766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 86, !dbg !297 + %9767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 87, !dbg !297 + %9768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 88, !dbg !297 + %9769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 89, !dbg !297 + %9770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 90, !dbg !297 + %9771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 91, !dbg !297 + %9772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 92, !dbg !297 + %9773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 93, !dbg !297 + %9774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 94, !dbg !297 + %9775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 95, !dbg !297 + %9776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 96, !dbg !297 + %9777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 97, !dbg !297 + %9778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 98, !dbg !297 + %9779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 99, !dbg !297 + %9780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 100, !dbg !297 + %9781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 101, !dbg !297 + %9782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 102, !dbg !297 + %9783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 103, !dbg !297 + %9784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 104, !dbg !297 + %9785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 105, !dbg !297 + %9786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 106, !dbg !297 + %9787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 107, !dbg !297 + %9788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 108, !dbg !297 + %9789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 109, !dbg !297 + %9790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 110, !dbg !297 + %9791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 111, !dbg !297 + %9792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 112, !dbg !297 + %9793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 113, !dbg !297 + %9794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 114, !dbg !297 + %9795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 115, !dbg !297 + %9796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 116, !dbg !297 + %9797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 117, !dbg !297 + %9798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 118, !dbg !297 + %9799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 119, !dbg !297 + %9800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 120, !dbg !297 + %9801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 121, !dbg !297 + %9802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 122, !dbg !297 + %9803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 123, !dbg !297 + %9804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 124, !dbg !297 + %9805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 125, !dbg !297 + %9806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 126, !dbg !297 + %9807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9679, 127, !dbg !297 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !297 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !297 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !227 + %exitcond2123.not = icmp eq i64 %indvars.iv.next, 4, !dbg !227 + br i1 %exitcond2123.not, label %9808, label %4945, !dbg !227 + +9808: ; preds = %._crit_edge1692 + %9809 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4480, !dbg !329 + %9810 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4482, !dbg !329 + %9811 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4484, !dbg !329 + %9812 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4486, !dbg !329 + %9813 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4488, !dbg !329 + %9814 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4490, !dbg !329 + %9815 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4492, !dbg !329 + %9816 = getelementptr bfloat, ptr addrspace(1) %34, i64 %4494, !dbg !329 + %9817 = getelementptr bfloat, ptr addrspace(1) %9809, i64 %4498, !dbg !330 + %9818 = getelementptr bfloat, ptr addrspace(1) %9810, i64 %4498, !dbg !330 + %9819 = getelementptr bfloat, ptr addrspace(1) %9811, i64 %4498, !dbg !330 + %9820 = getelementptr bfloat, ptr addrspace(1) %9812, i64 %4498, !dbg !330 + %9821 = getelementptr bfloat, ptr addrspace(1) %9813, i64 %4498, !dbg !330 + %9822 = getelementptr bfloat, ptr addrspace(1) %9814, i64 %4498, !dbg !330 + %9823 = getelementptr bfloat, ptr addrspace(1) %9815, i64 %4498, !dbg !330 + %9824 = getelementptr bfloat, ptr addrspace(1) %9816, i64 %4498, !dbg !330 + %9825 = insertelement <2 x float> poison, float %9680, i64 0, !dbg !331 + %9826 = insertelement <2 x float> %9825, float %9681, i64 1, !dbg !331 + %9827 = fptrunc <2 x float> %9826 to <2 x bfloat>, !dbg !331 + %9828 = insertelement <2 x float> poison, float %9682, i64 0, !dbg !331 + %9829 = insertelement <2 x float> %9828, float %9683, i64 1, !dbg !331 + %9830 = fptrunc <2 x float> %9829 to <2 x bfloat>, !dbg !331 + %9831 = insertelement <2 x float> poison, float %9684, i64 0, !dbg !331 + %9832 = insertelement <2 x float> %9831, float %9685, i64 1, !dbg !331 + %9833 = fptrunc <2 x float> %9832 to <2 x bfloat>, !dbg !331 + %9834 = insertelement <2 x float> poison, float %9686, i64 0, !dbg !331 + %9835 = insertelement <2 x float> %9834, float %9687, i64 1, !dbg !331 + %9836 = fptrunc <2 x float> %9835 to <2 x bfloat>, !dbg !331 + %9837 = insertelement <2 x float> poison, float %9688, i64 0, !dbg !331 + %9838 = insertelement <2 x float> %9837, float %9689, i64 1, !dbg !331 + %9839 = fptrunc <2 x float> %9838 to <2 x bfloat>, !dbg !331 + %9840 = insertelement <2 x float> poison, float %9690, i64 0, !dbg !331 + %9841 = insertelement <2 x float> %9840, float %9691, i64 1, !dbg !331 + %9842 = fptrunc <2 x float> %9841 to <2 x bfloat>, !dbg !331 + %9843 = insertelement <2 x float> poison, float %9692, i64 0, !dbg !331 + %9844 = insertelement <2 x float> %9843, float %9693, i64 1, !dbg !331 + %9845 = fptrunc <2 x float> %9844 to <2 x bfloat>, !dbg !331 + %9846 = insertelement <2 x float> poison, float %9694, i64 0, !dbg !331 + %9847 = insertelement <2 x float> %9846, float %9695, i64 1, !dbg !331 + %9848 = fptrunc <2 x float> %9847 to <2 x bfloat>, !dbg !331 + %9849 = insertelement <2 x float> poison, float %9696, i64 0, !dbg !331 + %9850 = insertelement <2 x float> %9849, float %9697, i64 1, !dbg !331 + %9851 = fptrunc <2 x float> %9850 to <2 x bfloat>, !dbg !331 + %9852 = insertelement <2 x float> poison, float %9698, i64 0, !dbg !331 + %9853 = insertelement <2 x float> %9852, float %9699, i64 1, !dbg !331 + %9854 = fptrunc <2 x float> %9853 to <2 x bfloat>, !dbg !331 + %9855 = insertelement <2 x float> poison, float %9700, i64 0, !dbg !331 + %9856 = insertelement <2 x float> %9855, float %9701, i64 1, !dbg !331 + %9857 = fptrunc <2 x float> %9856 to <2 x bfloat>, !dbg !331 + %9858 = insertelement <2 x float> poison, float %9702, i64 0, !dbg !331 + %9859 = insertelement <2 x float> %9858, float %9703, i64 1, !dbg !331 + %9860 = fptrunc <2 x float> %9859 to <2 x bfloat>, !dbg !331 + %9861 = insertelement <2 x float> poison, float %9704, i64 0, !dbg !331 + %9862 = insertelement <2 x float> %9861, float %9705, i64 1, !dbg !331 + %9863 = fptrunc <2 x float> %9862 to <2 x bfloat>, !dbg !331 + %9864 = insertelement <2 x float> poison, float %9706, i64 0, !dbg !331 + %9865 = insertelement <2 x float> %9864, float %9707, i64 1, !dbg !331 + %9866 = fptrunc <2 x float> %9865 to <2 x bfloat>, !dbg !331 + %9867 = insertelement <2 x float> poison, float %9708, i64 0, !dbg !331 + %9868 = insertelement <2 x float> %9867, float %9709, i64 1, !dbg !331 + %9869 = fptrunc <2 x float> %9868 to <2 x bfloat>, !dbg !331 + %9870 = insertelement <2 x float> poison, float %9710, i64 0, !dbg !331 + %9871 = insertelement <2 x float> %9870, float %9711, i64 1, !dbg !331 + %9872 = fptrunc <2 x float> %9871 to <2 x bfloat>, !dbg !331 + %9873 = insertelement <2 x float> poison, float %9712, i64 0, !dbg !331 + %9874 = insertelement <2 x float> %9873, float %9713, i64 1, !dbg !331 + %9875 = fptrunc <2 x float> %9874 to <2 x bfloat>, !dbg !331 + %9876 = insertelement <2 x float> poison, float %9714, i64 0, !dbg !331 + %9877 = insertelement <2 x float> %9876, float %9715, i64 1, !dbg !331 + %9878 = fptrunc <2 x float> %9877 to <2 x bfloat>, !dbg !331 + %9879 = insertelement <2 x float> poison, float %9716, i64 0, !dbg !331 + %9880 = insertelement <2 x float> %9879, float %9717, i64 1, !dbg !331 + %9881 = fptrunc <2 x float> %9880 to <2 x bfloat>, !dbg !331 + %9882 = insertelement <2 x float> poison, float %9718, i64 0, !dbg !331 + %9883 = insertelement <2 x float> %9882, float %9719, i64 1, !dbg !331 + %9884 = fptrunc <2 x float> %9883 to <2 x bfloat>, !dbg !331 + %9885 = insertelement <2 x float> poison, float %9720, i64 0, !dbg !331 + %9886 = insertelement <2 x float> %9885, float %9721, i64 1, !dbg !331 + %9887 = fptrunc <2 x float> %9886 to <2 x bfloat>, !dbg !331 + %9888 = insertelement <2 x float> poison, float %9722, i64 0, !dbg !331 + %9889 = insertelement <2 x float> %9888, float %9723, i64 1, !dbg !331 + %9890 = fptrunc <2 x float> %9889 to <2 x bfloat>, !dbg !331 + %9891 = insertelement <2 x float> poison, float %9724, i64 0, !dbg !331 + %9892 = insertelement <2 x float> %9891, float %9725, i64 1, !dbg !331 + %9893 = fptrunc <2 x float> %9892 to <2 x bfloat>, !dbg !331 + %9894 = insertelement <2 x float> poison, float %9726, i64 0, !dbg !331 + %9895 = insertelement <2 x float> %9894, float %9727, i64 1, !dbg !331 + %9896 = fptrunc <2 x float> %9895 to <2 x bfloat>, !dbg !331 + %9897 = insertelement <2 x float> poison, float %9728, i64 0, !dbg !331 + %9898 = insertelement <2 x float> %9897, float %9729, i64 1, !dbg !331 + %9899 = fptrunc <2 x float> %9898 to <2 x bfloat>, !dbg !331 + %9900 = insertelement <2 x float> poison, float %9730, i64 0, !dbg !331 + %9901 = insertelement <2 x float> %9900, float %9731, i64 1, !dbg !331 + %9902 = fptrunc <2 x float> %9901 to <2 x bfloat>, !dbg !331 + %9903 = insertelement <2 x float> poison, float %9732, i64 0, !dbg !331 + %9904 = insertelement <2 x float> %9903, float %9733, i64 1, !dbg !331 + %9905 = fptrunc <2 x float> %9904 to <2 x bfloat>, !dbg !331 + %9906 = insertelement <2 x float> poison, float %9734, i64 0, !dbg !331 + %9907 = insertelement <2 x float> %9906, float %9735, i64 1, !dbg !331 + %9908 = fptrunc <2 x float> %9907 to <2 x bfloat>, !dbg !331 + %9909 = insertelement <2 x float> poison, float %9736, i64 0, !dbg !331 + %9910 = insertelement <2 x float> %9909, float %9737, i64 1, !dbg !331 + %9911 = fptrunc <2 x float> %9910 to <2 x bfloat>, !dbg !331 + %9912 = insertelement <2 x float> poison, float %9738, i64 0, !dbg !331 + %9913 = insertelement <2 x float> %9912, float %9739, i64 1, !dbg !331 + %9914 = fptrunc <2 x float> %9913 to <2 x bfloat>, !dbg !331 + %9915 = insertelement <2 x float> poison, float %9740, i64 0, !dbg !331 + %9916 = insertelement <2 x float> %9915, float %9741, i64 1, !dbg !331 + %9917 = fptrunc <2 x float> %9916 to <2 x bfloat>, !dbg !331 + %9918 = insertelement <2 x float> poison, float %9742, i64 0, !dbg !331 + %9919 = insertelement <2 x float> %9918, float %9743, i64 1, !dbg !331 + %9920 = fptrunc <2 x float> %9919 to <2 x bfloat>, !dbg !331 + %9921 = shl nuw nsw i32 %4713, 13, !dbg !331 + %9922 = shl nuw nsw i32 %35, 5, !dbg !331 + %9923 = and i32 %9922, 7264, !dbg !331 + %9924 = and i32 %35, 24, !dbg !331 + %9925 = shl nuw nsw i32 %9924, 4, !dbg !331 + %9926 = shl nuw nsw i32 %35, 2, !dbg !331 + %9927 = and i32 %9926, 16, !dbg !331 + %9928 = or disjoint i32 %9921, %9927, !dbg !331 + %9929 = or disjoint i32 %9923, %9925, !dbg !331 + %9930 = or disjoint i32 %9928, %9929, !dbg !331 + %9931 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9930, !dbg !331 + %9932 = bitcast <2 x bfloat> %9827 to i32, !dbg !331 + %9933 = bitcast <2 x bfloat> %9833 to i32, !dbg !331 + %9934 = bitcast <2 x bfloat> %9839 to i32, !dbg !331 + %9935 = bitcast <2 x bfloat> %9845 to i32, !dbg !331 + %9936 = insertelement <4 x i32> poison, i32 %9932, i64 0, !dbg !331 + %9937 = insertelement <4 x i32> %9936, i32 %9933, i64 1, !dbg !331 + %9938 = insertelement <4 x i32> %9937, i32 %9934, i64 2, !dbg !331 + %9939 = insertelement <4 x i32> %9938, i32 %9935, i64 3, !dbg !331 + store <4 x i32> %9939, ptr addrspace(3) %9931, align 16, !dbg !331 + %9940 = getelementptr inbounds nuw i8, ptr addrspace(3) %9931, i32 512, !dbg !331 + %9941 = bitcast <2 x bfloat> %9830 to i32, !dbg !331 + %9942 = bitcast <2 x bfloat> %9836 to i32, !dbg !331 + %9943 = bitcast <2 x bfloat> %9842 to i32, !dbg !331 + %9944 = bitcast <2 x bfloat> %9848 to i32, !dbg !331 + %9945 = insertelement <4 x i32> poison, i32 %9941, i64 0, !dbg !331 + %9946 = insertelement <4 x i32> %9945, i32 %9942, i64 1, !dbg !331 + %9947 = insertelement <4 x i32> %9946, i32 %9943, i64 2, !dbg !331 + %9948 = insertelement <4 x i32> %9947, i32 %9944, i64 3, !dbg !331 + store <4 x i32> %9948, ptr addrspace(3) %9940, align 16, !dbg !331 + %9949 = xor i32 %9930, 32, !dbg !331 + %9950 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9949, !dbg !331 + %9951 = bitcast <2 x bfloat> %9851 to i32, !dbg !331 + %9952 = bitcast <2 x bfloat> %9857 to i32, !dbg !331 + %9953 = bitcast <2 x bfloat> %9863 to i32, !dbg !331 + %9954 = bitcast <2 x bfloat> %9869 to i32, !dbg !331 + %9955 = insertelement <4 x i32> poison, i32 %9951, i64 0, !dbg !331 + %9956 = insertelement <4 x i32> %9955, i32 %9952, i64 1, !dbg !331 + %9957 = insertelement <4 x i32> %9956, i32 %9953, i64 2, !dbg !331 + %9958 = insertelement <4 x i32> %9957, i32 %9954, i64 3, !dbg !331 + store <4 x i32> %9958, ptr addrspace(3) %9950, align 16, !dbg !331 + %9959 = getelementptr inbounds nuw i8, ptr addrspace(3) %9950, i32 512, !dbg !331 + %9960 = bitcast <2 x bfloat> %9854 to i32, !dbg !331 + %9961 = bitcast <2 x bfloat> %9860 to i32, !dbg !331 + %9962 = bitcast <2 x bfloat> %9866 to i32, !dbg !331 + %9963 = bitcast <2 x bfloat> %9872 to i32, !dbg !331 + %9964 = insertelement <4 x i32> poison, i32 %9960, i64 0, !dbg !331 + %9965 = insertelement <4 x i32> %9964, i32 %9961, i64 1, !dbg !331 + %9966 = insertelement <4 x i32> %9965, i32 %9962, i64 2, !dbg !331 + %9967 = insertelement <4 x i32> %9966, i32 %9963, i64 3, !dbg !331 + store <4 x i32> %9967, ptr addrspace(3) %9959, align 16, !dbg !331 + %9968 = xor i32 %9930, 64, !dbg !331 + %9969 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9968, !dbg !331 + %9970 = bitcast <2 x bfloat> %9875 to i32, !dbg !331 + %9971 = bitcast <2 x bfloat> %9881 to i32, !dbg !331 + %9972 = bitcast <2 x bfloat> %9887 to i32, !dbg !331 + %9973 = bitcast <2 x bfloat> %9893 to i32, !dbg !331 + %9974 = insertelement <4 x i32> poison, i32 %9970, i64 0, !dbg !331 + %9975 = insertelement <4 x i32> %9974, i32 %9971, i64 1, !dbg !331 + %9976 = insertelement <4 x i32> %9975, i32 %9972, i64 2, !dbg !331 + %9977 = insertelement <4 x i32> %9976, i32 %9973, i64 3, !dbg !331 + store <4 x i32> %9977, ptr addrspace(3) %9969, align 16, !dbg !331 + %9978 = getelementptr inbounds nuw i8, ptr addrspace(3) %9969, i32 512, !dbg !331 + %9979 = bitcast <2 x bfloat> %9878 to i32, !dbg !331 + %9980 = bitcast <2 x bfloat> %9884 to i32, !dbg !331 + %9981 = bitcast <2 x bfloat> %9890 to i32, !dbg !331 + %9982 = bitcast <2 x bfloat> %9896 to i32, !dbg !331 + %9983 = insertelement <4 x i32> poison, i32 %9979, i64 0, !dbg !331 + %9984 = insertelement <4 x i32> %9983, i32 %9980, i64 1, !dbg !331 + %9985 = insertelement <4 x i32> %9984, i32 %9981, i64 2, !dbg !331 + %9986 = insertelement <4 x i32> %9985, i32 %9982, i64 3, !dbg !331 + store <4 x i32> %9986, ptr addrspace(3) %9978, align 16, !dbg !331 + %9987 = xor i32 %9930, 96, !dbg !331 + %9988 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9987, !dbg !331 + %9989 = bitcast <2 x bfloat> %9899 to i32, !dbg !331 + %9990 = bitcast <2 x bfloat> %9905 to i32, !dbg !331 + %9991 = bitcast <2 x bfloat> %9911 to i32, !dbg !331 + %9992 = bitcast <2 x bfloat> %9917 to i32, !dbg !331 + %9993 = insertelement <4 x i32> poison, i32 %9989, i64 0, !dbg !331 + %9994 = insertelement <4 x i32> %9993, i32 %9990, i64 1, !dbg !331 + %9995 = insertelement <4 x i32> %9994, i32 %9991, i64 2, !dbg !331 + %9996 = insertelement <4 x i32> %9995, i32 %9992, i64 3, !dbg !331 + store <4 x i32> %9996, ptr addrspace(3) %9988, align 16, !dbg !331 + %9997 = getelementptr inbounds nuw i8, ptr addrspace(3) %9988, i32 512, !dbg !331 + %9998 = bitcast <2 x bfloat> %9902 to i32, !dbg !331 + %9999 = bitcast <2 x bfloat> %9908 to i32, !dbg !331 + %10000 = bitcast <2 x bfloat> %9914 to i32, !dbg !331 + %10001 = bitcast <2 x bfloat> %9920 to i32, !dbg !331 + %10002 = insertelement <4 x i32> poison, i32 %9998, i64 0, !dbg !331 + %10003 = insertelement <4 x i32> %10002, i32 %9999, i64 1, !dbg !331 + %10004 = insertelement <4 x i32> %10003, i32 %10000, i64 2, !dbg !331 + %10005 = insertelement <4 x i32> %10004, i32 %10001, i64 3, !dbg !331 + store <4 x i32> %10005, ptr addrspace(3) %9997, align 16, !dbg !331 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !331 + %10006 = shl nuw nsw i32 %9924, 10, !dbg !331 + %10007 = shl nuw nsw i32 %4713, 5, !dbg !331 + %10008 = and i32 %9926, 1008, !dbg !331 + %10009 = or disjoint i32 %10006, %10007, !dbg !331 + %10010 = xor i32 %10009, %10008, !dbg !331 + %10011 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10010, !dbg !331 + %10012 = ptrtoint ptr addrspace(3) %10011 to i32, !dbg !331 + %10013 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10012) #3, !dbg !331 + %10014 = extractvalue { i32, i32, i32, i32 } %10013, 0, !dbg !331 + %10015 = extractvalue { i32, i32, i32, i32 } %10013, 1, !dbg !331 + %10016 = extractvalue { i32, i32, i32, i32 } %10013, 2, !dbg !331 + %10017 = extractvalue { i32, i32, i32, i32 } %10013, 3, !dbg !331 + %10018 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 1024, !dbg !331 + %10019 = ptrtoint ptr addrspace(3) %10018 to i32, !dbg !331 + %10020 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10019) #3, !dbg !331 + %10021 = extractvalue { i32, i32, i32, i32 } %10020, 0, !dbg !331 + %10022 = extractvalue { i32, i32, i32, i32 } %10020, 1, !dbg !331 + %10023 = extractvalue { i32, i32, i32, i32 } %10020, 2, !dbg !331 + %10024 = extractvalue { i32, i32, i32, i32 } %10020, 3, !dbg !331 + %10025 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 2048, !dbg !331 + %10026 = ptrtoint ptr addrspace(3) %10025 to i32, !dbg !331 + %10027 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10026) #3, !dbg !331 + %10028 = extractvalue { i32, i32, i32, i32 } %10027, 0, !dbg !331 + %10029 = extractvalue { i32, i32, i32, i32 } %10027, 1, !dbg !331 + %10030 = extractvalue { i32, i32, i32, i32 } %10027, 2, !dbg !331 + %10031 = extractvalue { i32, i32, i32, i32 } %10027, 3, !dbg !331 + %10032 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 3072, !dbg !331 + %10033 = ptrtoint ptr addrspace(3) %10032 to i32, !dbg !331 + %10034 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10033) #3, !dbg !331 + %10035 = extractvalue { i32, i32, i32, i32 } %10034, 0, !dbg !331 + %10036 = extractvalue { i32, i32, i32, i32 } %10034, 1, !dbg !331 + %10037 = extractvalue { i32, i32, i32, i32 } %10034, 2, !dbg !331 + %10038 = extractvalue { i32, i32, i32, i32 } %10034, 3, !dbg !331 + %10039 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 4096, !dbg !331 + %10040 = ptrtoint ptr addrspace(3) %10039 to i32, !dbg !331 + %10041 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10040) #3, !dbg !331 + %10042 = extractvalue { i32, i32, i32, i32 } %10041, 0, !dbg !331 + %10043 = extractvalue { i32, i32, i32, i32 } %10041, 1, !dbg !331 + %10044 = extractvalue { i32, i32, i32, i32 } %10041, 2, !dbg !331 + %10045 = extractvalue { i32, i32, i32, i32 } %10041, 3, !dbg !331 + %10046 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 5120, !dbg !331 + %10047 = ptrtoint ptr addrspace(3) %10046 to i32, !dbg !331 + %10048 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10047) #3, !dbg !331 + %10049 = extractvalue { i32, i32, i32, i32 } %10048, 0, !dbg !331 + %10050 = extractvalue { i32, i32, i32, i32 } %10048, 1, !dbg !331 + %10051 = extractvalue { i32, i32, i32, i32 } %10048, 2, !dbg !331 + %10052 = extractvalue { i32, i32, i32, i32 } %10048, 3, !dbg !331 + %10053 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 6144, !dbg !331 + %10054 = ptrtoint ptr addrspace(3) %10053 to i32, !dbg !331 + %10055 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10054) #3, !dbg !331 + %10056 = extractvalue { i32, i32, i32, i32 } %10055, 0, !dbg !331 + %10057 = extractvalue { i32, i32, i32, i32 } %10055, 1, !dbg !331 + %10058 = extractvalue { i32, i32, i32, i32 } %10055, 2, !dbg !331 + %10059 = extractvalue { i32, i32, i32, i32 } %10055, 3, !dbg !331 + %10060 = getelementptr inbounds nuw i8, ptr addrspace(3) %10011, i32 7168, !dbg !331 + %10061 = ptrtoint ptr addrspace(3) %10060 to i32, !dbg !331 + %10062 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10061) #3, !dbg !331 + %10063 = extractvalue { i32, i32, i32, i32 } %10062, 0, !dbg !331 + %10064 = extractvalue { i32, i32, i32, i32 } %10062, 1, !dbg !331 + %10065 = extractvalue { i32, i32, i32, i32 } %10062, 2, !dbg !331 + %10066 = extractvalue { i32, i32, i32, i32 } %10062, 3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10014, i32 %10015, i32 %10016, i32 %10017, ptr addrspace(1) %9817) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10021, i32 %10022, i32 %10023, i32 %10024, ptr addrspace(1) %9818) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10028, i32 %10029, i32 %10030, i32 %10031, ptr addrspace(1) %9819) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10035, i32 %10036, i32 %10037, i32 %10038, ptr addrspace(1) %9820) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10042, i32 %10043, i32 %10044, i32 %10045, ptr addrspace(1) %9821) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10049, i32 %10050, i32 %10051, i32 %10052, ptr addrspace(1) %9822) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10056, i32 %10057, i32 %10058, i32 %10059, ptr addrspace(1) %9823) #3, !dbg !331 + tail call void asm sideeffect "st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l"(i32 %10063, i32 %10064, i32 %10065, i32 %10066, ptr addrspace(1) %9824) #3, !dbg !331 + %10067 = insertelement <2 x float> poison, float %9744, i64 0, !dbg !332 + %10068 = insertelement <2 x float> %10067, float %9745, i64 1, !dbg !332 + %10069 = fmul <2 x float> %10068, splat (float 0x3FB6A09E60000000), !dbg !332 + %10070 = insertelement <2 x float> poison, float %9746, i64 0, !dbg !332 + %10071 = insertelement <2 x float> %10070, float %9747, i64 1, !dbg !332 + %10072 = fmul <2 x float> %10071, splat (float 0x3FB6A09E60000000), !dbg !332 + %10073 = insertelement <2 x float> poison, float %9748, i64 0, !dbg !332 + %10074 = insertelement <2 x float> %10073, float %9749, i64 1, !dbg !332 + %10075 = fmul <2 x float> %10074, splat (float 0x3FB6A09E60000000), !dbg !332 + %10076 = insertelement <2 x float> poison, float %9750, i64 0, !dbg !332 + %10077 = insertelement <2 x float> %10076, float %9751, i64 1, !dbg !332 + %10078 = fmul <2 x float> %10077, splat (float 0x3FB6A09E60000000), !dbg !332 + %10079 = insertelement <2 x float> poison, float %9752, i64 0, !dbg !332 + %10080 = insertelement <2 x float> %10079, float %9753, i64 1, !dbg !332 + %10081 = fmul <2 x float> %10080, splat (float 0x3FB6A09E60000000), !dbg !332 + %10082 = insertelement <2 x float> poison, float %9754, i64 0, !dbg !332 + %10083 = insertelement <2 x float> %10082, float %9755, i64 1, !dbg !332 + %10084 = fmul <2 x float> %10083, splat (float 0x3FB6A09E60000000), !dbg !332 + %10085 = insertelement <2 x float> poison, float %9756, i64 0, !dbg !332 + %10086 = insertelement <2 x float> %10085, float %9757, i64 1, !dbg !332 + %10087 = fmul <2 x float> %10086, splat (float 0x3FB6A09E60000000), !dbg !332 + %10088 = insertelement <2 x float> poison, float %9758, i64 0, !dbg !332 + %10089 = insertelement <2 x float> %10088, float %9759, i64 1, !dbg !332 + %10090 = fmul <2 x float> %10089, splat (float 0x3FB6A09E60000000), !dbg !332 + %10091 = insertelement <2 x float> poison, float %9760, i64 0, !dbg !332 + %10092 = insertelement <2 x float> %10091, float %9761, i64 1, !dbg !332 + %10093 = fmul <2 x float> %10092, splat (float 0x3FB6A09E60000000), !dbg !332 + %10094 = insertelement <2 x float> poison, float %9762, i64 0, !dbg !332 + %10095 = insertelement <2 x float> %10094, float %9763, i64 1, !dbg !332 + %10096 = fmul <2 x float> %10095, splat (float 0x3FB6A09E60000000), !dbg !332 + %10097 = insertelement <2 x float> poison, float %9764, i64 0, !dbg !332 + %10098 = insertelement <2 x float> %10097, float %9765, i64 1, !dbg !332 + %10099 = fmul <2 x float> %10098, splat (float 0x3FB6A09E60000000), !dbg !332 + %10100 = insertelement <2 x float> poison, float %9766, i64 0, !dbg !332 + %10101 = insertelement <2 x float> %10100, float %9767, i64 1, !dbg !332 + %10102 = fmul <2 x float> %10101, splat (float 0x3FB6A09E60000000), !dbg !332 + %10103 = insertelement <2 x float> poison, float %9768, i64 0, !dbg !332 + %10104 = insertelement <2 x float> %10103, float %9769, i64 1, !dbg !332 + %10105 = fmul <2 x float> %10104, splat (float 0x3FB6A09E60000000), !dbg !332 + %10106 = insertelement <2 x float> poison, float %9770, i64 0, !dbg !332 + %10107 = insertelement <2 x float> %10106, float %9771, i64 1, !dbg !332 + %10108 = fmul <2 x float> %10107, splat (float 0x3FB6A09E60000000), !dbg !332 + %10109 = insertelement <2 x float> poison, float %9772, i64 0, !dbg !332 + %10110 = insertelement <2 x float> %10109, float %9773, i64 1, !dbg !332 + %10111 = fmul <2 x float> %10110, splat (float 0x3FB6A09E60000000), !dbg !332 + %10112 = insertelement <2 x float> poison, float %9774, i64 0, !dbg !332 + %10113 = insertelement <2 x float> %10112, float %9775, i64 1, !dbg !332 + %10114 = fmul <2 x float> %10113, splat (float 0x3FB6A09E60000000), !dbg !332 + %10115 = insertelement <2 x float> poison, float %9776, i64 0, !dbg !332 + %10116 = insertelement <2 x float> %10115, float %9777, i64 1, !dbg !332 + %10117 = fmul <2 x float> %10116, splat (float 0x3FB6A09E60000000), !dbg !332 + %10118 = insertelement <2 x float> poison, float %9778, i64 0, !dbg !332 + %10119 = insertelement <2 x float> %10118, float %9779, i64 1, !dbg !332 + %10120 = fmul <2 x float> %10119, splat (float 0x3FB6A09E60000000), !dbg !332 + %10121 = insertelement <2 x float> poison, float %9780, i64 0, !dbg !332 + %10122 = insertelement <2 x float> %10121, float %9781, i64 1, !dbg !332 + %10123 = fmul <2 x float> %10122, splat (float 0x3FB6A09E60000000), !dbg !332 + %10124 = insertelement <2 x float> poison, float %9782, i64 0, !dbg !332 + %10125 = insertelement <2 x float> %10124, float %9783, i64 1, !dbg !332 + %10126 = fmul <2 x float> %10125, splat (float 0x3FB6A09E60000000), !dbg !332 + %10127 = insertelement <2 x float> poison, float %9784, i64 0, !dbg !332 + %10128 = insertelement <2 x float> %10127, float %9785, i64 1, !dbg !332 + %10129 = fmul <2 x float> %10128, splat (float 0x3FB6A09E60000000), !dbg !332 + %10130 = insertelement <2 x float> poison, float %9786, i64 0, !dbg !332 + %10131 = insertelement <2 x float> %10130, float %9787, i64 1, !dbg !332 + %10132 = fmul <2 x float> %10131, splat (float 0x3FB6A09E60000000), !dbg !332 + %10133 = insertelement <2 x float> poison, float %9788, i64 0, !dbg !332 + %10134 = insertelement <2 x float> %10133, float %9789, i64 1, !dbg !332 + %10135 = fmul <2 x float> %10134, splat (float 0x3FB6A09E60000000), !dbg !332 + %10136 = insertelement <2 x float> poison, float %9790, i64 0, !dbg !332 + %10137 = insertelement <2 x float> %10136, float %9791, i64 1, !dbg !332 + %10138 = fmul <2 x float> %10137, splat (float 0x3FB6A09E60000000), !dbg !332 + %10139 = insertelement <2 x float> poison, float %9792, i64 0, !dbg !332 + %10140 = insertelement <2 x float> %10139, float %9793, i64 1, !dbg !332 + %10141 = fmul <2 x float> %10140, splat (float 0x3FB6A09E60000000), !dbg !332 + %10142 = insertelement <2 x float> poison, float %9794, i64 0, !dbg !332 + %10143 = insertelement <2 x float> %10142, float %9795, i64 1, !dbg !332 + %10144 = fmul <2 x float> %10143, splat (float 0x3FB6A09E60000000), !dbg !332 + %10145 = insertelement <2 x float> poison, float %9796, i64 0, !dbg !332 + %10146 = insertelement <2 x float> %10145, float %9797, i64 1, !dbg !332 + %10147 = fmul <2 x float> %10146, splat (float 0x3FB6A09E60000000), !dbg !332 + %10148 = insertelement <2 x float> poison, float %9798, i64 0, !dbg !332 + %10149 = insertelement <2 x float> %10148, float %9799, i64 1, !dbg !332 + %10150 = fmul <2 x float> %10149, splat (float 0x3FB6A09E60000000), !dbg !332 + %10151 = insertelement <2 x float> poison, float %9800, i64 0, !dbg !332 + %10152 = insertelement <2 x float> %10151, float %9801, i64 1, !dbg !332 + %10153 = fmul <2 x float> %10152, splat (float 0x3FB6A09E60000000), !dbg !332 + %10154 = insertelement <2 x float> poison, float %9802, i64 0, !dbg !332 + %10155 = insertelement <2 x float> %10154, float %9803, i64 1, !dbg !332 + %10156 = fmul <2 x float> %10155, splat (float 0x3FB6A09E60000000), !dbg !332 + %10157 = insertelement <2 x float> poison, float %9804, i64 0, !dbg !332 + %10158 = insertelement <2 x float> %10157, float %9805, i64 1, !dbg !332 + %10159 = fmul <2 x float> %10158, splat (float 0x3FB6A09E60000000), !dbg !332 + %10160 = insertelement <2 x float> poison, float %9806, i64 0, !dbg !332 + %10161 = insertelement <2 x float> %10160, float %9807, i64 1, !dbg !332 + %10162 = fmul <2 x float> %10161, splat (float 0x3FB6A09E60000000), !dbg !332 + %10163 = or disjoint i32 %4472, %30, !dbg !333 + %10164 = or disjoint i32 %10163, %4497, !dbg !333 + %10165 = or disjoint i32 %4473, %30, !dbg !333 + %10166 = or disjoint i32 %10165, %4497, !dbg !333 + %10167 = or disjoint i32 %4474, %30, !dbg !333 + %10168 = or disjoint i32 %10167, %4497, !dbg !333 + %10169 = or disjoint i32 %4475, %30, !dbg !333 + %10170 = or disjoint i32 %10169, %4497, !dbg !333 + %10171 = or disjoint i32 %4476, %30, !dbg !333 + %10172 = or disjoint i32 %10171, %4497, !dbg !333 + %10173 = or disjoint i32 %4477, %30, !dbg !333 + %10174 = or disjoint i32 %10173, %4497, !dbg !333 + %10175 = or disjoint i32 %4478, %30, !dbg !333 + %10176 = or disjoint i32 %10175, %4497, !dbg !333 + %10177 = or disjoint i32 %4479, %30, !dbg !333 + %10178 = or disjoint i32 %10177, %4497, !dbg !333 + %10179 = sext i32 %10164 to i64, !dbg !334 + %10180 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10179, !dbg !334 + %10181 = sext i32 %10166 to i64, !dbg !334 + %10182 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10181, !dbg !334 + %10183 = sext i32 %10168 to i64, !dbg !334 + %10184 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10183, !dbg !334 + %10185 = sext i32 %10170 to i64, !dbg !334 + %10186 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10185, !dbg !334 + %10187 = sext i32 %10172 to i64, !dbg !334 + %10188 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10187, !dbg !334 + %10189 = sext i32 %10174 to i64, !dbg !334 + %10190 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10189, !dbg !334 + %10191 = sext i32 %10176 to i64, !dbg !334 + %10192 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10191, !dbg !334 + %10193 = sext i32 %10178 to i64, !dbg !334 + %10194 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10193, !dbg !334 + %10195 = fptrunc <2 x float> %10069 to <2 x bfloat>, !dbg !335 + %10196 = fptrunc <2 x float> %10072 to <2 x bfloat>, !dbg !335 + %10197 = fptrunc <2 x float> %10075 to <2 x bfloat>, !dbg !335 + %10198 = fptrunc <2 x float> %10078 to <2 x bfloat>, !dbg !335 + %10199 = fptrunc <2 x float> %10081 to <2 x bfloat>, !dbg !335 + %10200 = fptrunc <2 x float> %10084 to <2 x bfloat>, !dbg !335 + %10201 = fptrunc <2 x float> %10087 to <2 x bfloat>, !dbg !335 + %10202 = fptrunc <2 x float> %10090 to <2 x bfloat>, !dbg !335 + %10203 = fptrunc <2 x float> %10093 to <2 x bfloat>, !dbg !335 + %10204 = fptrunc <2 x float> %10096 to <2 x bfloat>, !dbg !335 + %10205 = fptrunc <2 x float> %10099 to <2 x bfloat>, !dbg !335 + %10206 = fptrunc <2 x float> %10102 to <2 x bfloat>, !dbg !335 + %10207 = fptrunc <2 x float> %10105 to <2 x bfloat>, !dbg !335 + %10208 = fptrunc <2 x float> %10108 to <2 x bfloat>, !dbg !335 + %10209 = fptrunc <2 x float> %10111 to <2 x bfloat>, !dbg !335 + %10210 = fptrunc <2 x float> %10114 to <2 x bfloat>, !dbg !335 + %10211 = fptrunc <2 x float> %10117 to <2 x bfloat>, !dbg !335 + %10212 = fptrunc <2 x float> %10120 to <2 x bfloat>, !dbg !335 + %10213 = fptrunc <2 x float> %10123 to <2 x bfloat>, !dbg !335 + %10214 = fptrunc <2 x float> %10126 to <2 x bfloat>, !dbg !335 + %10215 = fptrunc <2 x float> %10129 to <2 x bfloat>, !dbg !335 + %10216 = fptrunc <2 x float> %10132 to <2 x bfloat>, !dbg !335 + %10217 = fptrunc <2 x float> %10135 to <2 x bfloat>, !dbg !335 + %10218 = fptrunc <2 x float> %10138 to <2 x bfloat>, !dbg !335 + %10219 = fptrunc <2 x float> %10141 to <2 x bfloat>, !dbg !335 + %10220 = fptrunc <2 x float> %10144 to <2 x bfloat>, !dbg !335 + %10221 = fptrunc <2 x float> %10147 to <2 x bfloat>, !dbg !335 + %10222 = fptrunc <2 x float> %10150 to <2 x bfloat>, !dbg !335 + %10223 = fptrunc <2 x float> %10153 to <2 x bfloat>, !dbg !335 + %10224 = fptrunc <2 x float> %10156 to <2 x bfloat>, !dbg !335 + %10225 = fptrunc <2 x float> %10159 to <2 x bfloat>, !dbg !335 + %10226 = fptrunc <2 x float> %10162 to <2 x bfloat>, !dbg !335 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !335 + %10227 = bitcast <2 x bfloat> %10195 to i32, !dbg !335 + %10228 = bitcast <2 x bfloat> %10197 to i32, !dbg !335 + %10229 = bitcast <2 x bfloat> %10199 to i32, !dbg !335 + %10230 = bitcast <2 x bfloat> %10201 to i32, !dbg !335 + %10231 = insertelement <4 x i32> poison, i32 %10227, i64 0, !dbg !335 + %10232 = insertelement <4 x i32> %10231, i32 %10228, i64 1, !dbg !335 + %10233 = insertelement <4 x i32> %10232, i32 %10229, i64 2, !dbg !335 + %10234 = insertelement <4 x i32> %10233, i32 %10230, i64 3, !dbg !335 + store <4 x i32> %10234, ptr addrspace(3) %9931, align 16, !dbg !335 + %10235 = bitcast <2 x bfloat> %10196 to i32, !dbg !335 + %10236 = bitcast <2 x bfloat> %10198 to i32, !dbg !335 + %10237 = bitcast <2 x bfloat> %10200 to i32, !dbg !335 + %10238 = bitcast <2 x bfloat> %10202 to i32, !dbg !335 + %10239 = insertelement <4 x i32> poison, i32 %10235, i64 0, !dbg !335 + %10240 = insertelement <4 x i32> %10239, i32 %10236, i64 1, !dbg !335 + %10241 = insertelement <4 x i32> %10240, i32 %10237, i64 2, !dbg !335 + %10242 = insertelement <4 x i32> %10241, i32 %10238, i64 3, !dbg !335 + store <4 x i32> %10242, ptr addrspace(3) %9940, align 16, !dbg !335 + %10243 = bitcast <2 x bfloat> %10203 to i32, !dbg !335 + %10244 = bitcast <2 x bfloat> %10205 to i32, !dbg !335 + %10245 = bitcast <2 x bfloat> %10207 to i32, !dbg !335 + %10246 = bitcast <2 x bfloat> %10209 to i32, !dbg !335 + %10247 = insertelement <4 x i32> poison, i32 %10243, i64 0, !dbg !335 + %10248 = insertelement <4 x i32> %10247, i32 %10244, i64 1, !dbg !335 + %10249 = insertelement <4 x i32> %10248, i32 %10245, i64 2, !dbg !335 + %10250 = insertelement <4 x i32> %10249, i32 %10246, i64 3, !dbg !335 + store <4 x i32> %10250, ptr addrspace(3) %9950, align 16, !dbg !335 + %10251 = bitcast <2 x bfloat> %10204 to i32, !dbg !335 + %10252 = bitcast <2 x bfloat> %10206 to i32, !dbg !335 + %10253 = bitcast <2 x bfloat> %10208 to i32, !dbg !335 + %10254 = bitcast <2 x bfloat> %10210 to i32, !dbg !335 + %10255 = insertelement <4 x i32> poison, i32 %10251, i64 0, !dbg !335 + %10256 = insertelement <4 x i32> %10255, i32 %10252, i64 1, !dbg !335 + %10257 = insertelement <4 x i32> %10256, i32 %10253, i64 2, !dbg !335 + %10258 = insertelement <4 x i32> %10257, i32 %10254, i64 3, !dbg !335 + store <4 x i32> %10258, ptr addrspace(3) %9959, align 16, !dbg !335 + %10259 = bitcast <2 x bfloat> %10211 to i32, !dbg !335 + %10260 = bitcast <2 x bfloat> %10213 to i32, !dbg !335 + %10261 = bitcast <2 x bfloat> %10215 to i32, !dbg !335 + %10262 = bitcast <2 x bfloat> %10217 to i32, !dbg !335 + %10263 = insertelement <4 x i32> poison, i32 %10259, i64 0, !dbg !335 + %10264 = insertelement <4 x i32> %10263, i32 %10260, i64 1, !dbg !335 + %10265 = insertelement <4 x i32> %10264, i32 %10261, i64 2, !dbg !335 + %10266 = insertelement <4 x i32> %10265, i32 %10262, i64 3, !dbg !335 + store <4 x i32> %10266, ptr addrspace(3) %9969, align 16, !dbg !335 + %10267 = bitcast <2 x bfloat> %10212 to i32, !dbg !335 + %10268 = bitcast <2 x bfloat> %10214 to i32, !dbg !335 + %10269 = bitcast <2 x bfloat> %10216 to i32, !dbg !335 + %10270 = bitcast <2 x bfloat> %10218 to i32, !dbg !335 + %10271 = insertelement <4 x i32> poison, i32 %10267, i64 0, !dbg !335 + %10272 = insertelement <4 x i32> %10271, i32 %10268, i64 1, !dbg !335 + %10273 = insertelement <4 x i32> %10272, i32 %10269, i64 2, !dbg !335 + %10274 = insertelement <4 x i32> %10273, i32 %10270, i64 3, !dbg !335 + store <4 x i32> %10274, ptr addrspace(3) %9978, align 16, !dbg !335 + %10275 = bitcast <2 x bfloat> %10219 to i32, !dbg !335 + %10276 = bitcast <2 x bfloat> %10221 to i32, !dbg !335 + %10277 = bitcast <2 x bfloat> %10223 to i32, !dbg !335 + %10278 = bitcast <2 x bfloat> %10225 to i32, !dbg !335 + %10279 = insertelement <4 x i32> poison, i32 %10275, i64 0, !dbg !335 + %10280 = insertelement <4 x i32> %10279, i32 %10276, i64 1, !dbg !335 + %10281 = insertelement <4 x i32> %10280, i32 %10277, i64 2, !dbg !335 + %10282 = insertelement <4 x i32> %10281, i32 %10278, i64 3, !dbg !335 + store <4 x i32> %10282, ptr addrspace(3) %9988, align 16, !dbg !335 + %10283 = bitcast <2 x bfloat> %10220 to i32, !dbg !335 + %10284 = bitcast <2 x bfloat> %10222 to i32, !dbg !335 + %10285 = bitcast <2 x bfloat> %10224 to i32, !dbg !335 + %10286 = bitcast <2 x bfloat> %10226 to i32, !dbg !335 + %10287 = insertelement <4 x i32> poison, i32 %10283, i64 0, !dbg !335 + %10288 = insertelement <4 x i32> %10287, i32 %10284, i64 1, !dbg !335 + %10289 = insertelement <4 x i32> %10288, i32 %10285, i64 2, !dbg !335 + %10290 = insertelement <4 x i32> %10289, i32 %10286, i64 3, !dbg !335 + store <4 x i32> %10290, ptr addrspace(3) %9997, align 16, !dbg !335 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !335 + %10291 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10012) #3, !dbg !335 + %10292 = extractvalue { i32, i32, i32, i32 } %10291, 0, !dbg !335 + %10293 = extractvalue { i32, i32, i32, i32 } %10291, 1, !dbg !335 + %10294 = extractvalue { i32, i32, i32, i32 } %10291, 2, !dbg !335 + %10295 = extractvalue { i32, i32, i32, i32 } %10291, 3, !dbg !335 + %10296 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10019) #3, !dbg !335 + %10297 = extractvalue { i32, i32, i32, i32 } %10296, 0, !dbg !335 + %10298 = extractvalue { i32, i32, i32, i32 } %10296, 1, !dbg !335 + %10299 = extractvalue { i32, i32, i32, i32 } %10296, 2, !dbg !335 + %10300 = extractvalue { i32, i32, i32, i32 } %10296, 3, !dbg !335 + %10301 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10026) #3, !dbg !335 + %10302 = extractvalue { i32, i32, i32, i32 } %10301, 0, !dbg !335 + %10303 = extractvalue { i32, i32, i32, i32 } %10301, 1, !dbg !335 + %10304 = extractvalue { i32, i32, i32, i32 } %10301, 2, !dbg !335 + %10305 = extractvalue { i32, i32, i32, i32 } %10301, 3, !dbg !335 + %10306 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10033) #3, !dbg !335 + %10307 = extractvalue { i32, i32, i32, i32 } %10306, 0, !dbg !335 + %10308 = extractvalue { i32, i32, i32, i32 } %10306, 1, !dbg !335 + %10309 = extractvalue { i32, i32, i32, i32 } %10306, 2, !dbg !335 + %10310 = extractvalue { i32, i32, i32, i32 } %10306, 3, !dbg !335 + %10311 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10040) #3, !dbg !335 + %10312 = extractvalue { i32, i32, i32, i32 } %10311, 0, !dbg !335 + %10313 = extractvalue { i32, i32, i32, i32 } %10311, 1, !dbg !335 + %10314 = extractvalue { i32, i32, i32, i32 } %10311, 2, !dbg !335 + %10315 = extractvalue { i32, i32, i32, i32 } %10311, 3, !dbg !335 + %10316 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10047) #3, !dbg !335 + %10317 = extractvalue { i32, i32, i32, i32 } %10316, 0, !dbg !335 + %10318 = extractvalue { i32, i32, i32, i32 } %10316, 1, !dbg !335 + %10319 = extractvalue { i32, i32, i32, i32 } %10316, 2, !dbg !335 + %10320 = extractvalue { i32, i32, i32, i32 } %10316, 3, !dbg !335 + %10321 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10054) #3, !dbg !335 + %10322 = extractvalue { i32, i32, i32, i32 } %10321, 0, !dbg !335 + %10323 = extractvalue { i32, i32, i32, i32 } %10321, 1, !dbg !335 + %10324 = extractvalue { i32, i32, i32, i32 } %10321, 2, !dbg !335 + %10325 = extractvalue { i32, i32, i32, i32 } %10321, 3, !dbg !335 + %10326 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10061) #3, !dbg !335 + %10327 = extractvalue { i32, i32, i32, i32 } %10326, 0, !dbg !335 + %10328 = extractvalue { i32, i32, i32, i32 } %10326, 1, !dbg !335 + %10329 = extractvalue { i32, i32, i32, i32 } %10326, 2, !dbg !335 + %10330 = extractvalue { i32, i32, i32, i32 } %10326, 3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10292, i32 %10293, i32 %10294, i32 %10295, ptr addrspace(1) %10180, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10297, i32 %10298, i32 %10299, i32 %10300, ptr addrspace(1) %10182, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10302, i32 %10303, i32 %10304, i32 %10305, ptr addrspace(1) %10184, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10307, i32 %10308, i32 %10309, i32 %10310, ptr addrspace(1) %10186, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10312, i32 %10313, i32 %10314, i32 %10315, ptr addrspace(1) %10188, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10317, i32 %10318, i32 %10319, i32 %10320, ptr addrspace(1) %10190, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10322, i32 %10323, i32 %10324, i32 %10325, ptr addrspace(1) %10192, i1 true) #3, !dbg !335 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10327, i32 %10328, i32 %10329, i32 %10330, ptr addrspace(1) %10194, i1 true) #3, !dbg !335 + br label %10331, !dbg !24 + +10331: ; preds = %._crit_edge1859, %9808 + ret void, !dbg !336 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #7 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #8 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.umin.i32(i32, i32) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { convergent nounwind } +attributes #6 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_zeros_1", linkageName: "triton_tem_fused_zeros_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 111, column: 24, scope: !5) +!9 = !DILocation(line: 115, column: 27, scope: !5) +!10 = !DILocation(line: 116, column: 28, scope: !5) +!11 = !DILocation(line: 117, column: 23, scope: !5) +!12 = !DILocation(line: 124, column: 25, scope: !5) +!13 = !DILocation(line: 124, column: 47, scope: !5) +!14 = !DILocation(line: 124, column: 35, scope: !5) +!15 = !DILocation(line: 124, column: 59, scope: !5) +!16 = !DILocation(line: 128, column: 50, scope: !5) +!17 = !DILocation(line: 128, column: 37, scope: !5) +!18 = !DILocation(line: 128, column: 61, scope: !5) +!19 = !DILocation(line: 131, column: 9, scope: !5) +!20 = !DILocation(line: 132, column: 9, scope: !5) +!21 = !DILocation(line: 133, column: 10, scope: !5) +!22 = !DILocation(line: 136, column: 26, scope: !5) +!23 = !DILocation(line: 139, column: 14, scope: !5) +!24 = !DILocation(line: 139, column: 7, scope: !5) +!25 = !DILocation(line: 140, column: 24, scope: !5) +!26 = !DILocation(line: 144, column: 29, scope: !5) +!27 = !DILocation(line: 144, column: 54, scope: !5) +!28 = !DILocation(line: 144, column: 44, scope: !5) +!29 = !DILocation(line: 145, column: 35, scope: !5) +!30 = !DILocation(line: 154, column: 55, scope: !5) +!31 = !DILocation(line: 154, column: 78, scope: !5) +!32 = !DILocation(line: 155, column: 50, scope: !5) +!33 = !DILocation(line: 155, column: 83, scope: !5) +!34 = !DILocation(line: 155, column: 68, scope: !5) +!35 = !DILocation(line: 158, column: 30, scope: !5) +!36 = !DILocation(line: 158, column: 52, scope: !5) +!37 = !DILocation(line: 158, column: 40, scope: !5) +!38 = !DILocation(line: 158, column: 63, scope: !5) +!39 = !DILocation(line: 159, column: 32, scope: !5) +!40 = !DILocation(line: 159, column: 42, scope: !5) +!41 = !DILocation(line: 159, column: 66, scope: !5) +!42 = !DILocation(line: 161, column: 46, scope: !5) +!43 = !DILocation(line: 161, column: 56, scope: !5) +!44 = !DILocation(line: 163, column: 17, scope: !5) +!45 = !DILocation(line: 164, column: 19, scope: !5) +!46 = !DILocation(line: 167, column: 19, scope: !5) +!47 = !DILocation(line: 168, column: 21, scope: !5) +!48 = !DILocation(line: 169, column: 25, scope: !5) +!49 = !DILocation(line: 174, column: 36, scope: !5) +!50 = !DILocation(line: 175, column: 29, scope: !5) +!51 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !53) +!52 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!53 = !DILocation(line: 178, column: 107, scope: !5) +!54 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !53) +!55 = !DILocation(line: 825, column: 56, scope: !52, inlinedAt: !53) +!56 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !53) +!57 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !53) +!58 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !59) +!59 = !DILocation(line: 179, column: 111, scope: !5) +!60 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !59) +!61 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !59) +!62 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !59) +!63 = !DILocation(line: 185, column: 34, scope: !5) +!64 = !DILocation(line: 185, column: 25, scope: !5) +!65 = !DILocation(line: 186, column: 33, scope: !5) +!66 = !DILocation(line: 186, column: 26, scope: !5) +!67 = !DILocation(line: 190, column: 30, scope: !5) +!68 = !DILocation(line: 190, column: 50, scope: !5) +!69 = !DILocation(line: 195, column: 30, scope: !5) +!70 = !DILocation(line: 196, column: 27, scope: !5) +!71 = !DILocation(line: 196, column: 41, scope: !5) +!72 = !DILocation(line: 197, column: 53, scope: !5) +!73 = !DILocation(line: 197, column: 39, scope: !5) +!74 = !DILocation(line: 199, column: 42, scope: !5) +!75 = !DILocation(line: 199, column: 29, scope: !5) +!76 = !DILocation(line: 390, column: 37, scope: !52, inlinedAt: !77) +!77 = !DILocation(line: 207, column: 12, scope: !5) +!78 = !DILocation(line: 390, column: 18, scope: !52, inlinedAt: !77) +!79 = !DILocation(line: 390, column: 49, scope: !52, inlinedAt: !77) +!80 = !DILocation(line: 391, column: 18, scope: !52, inlinedAt: !77) +!81 = !DILocation(line: 391, column: 49, scope: !52, inlinedAt: !77) +!82 = !DILocation(line: 395, column: 43, scope: !52, inlinedAt: !77) +!83 = !DILocation(line: 485, column: 34, scope: !52, inlinedAt: !77) +!84 = !DILocation(line: 397, column: 28, scope: !52, inlinedAt: !77) +!85 = !DILocation(line: 485, column: 23, scope: !52, inlinedAt: !77) +!86 = !DILocation(line: 488, column: 23, scope: !52, inlinedAt: !77) +!87 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !77) +!88 = !DILocation(line: 414, column: 19, scope: !52, inlinedAt: !77) +!89 = !DILocation(line: 415, column: 19, scope: !52, inlinedAt: !77) +!90 = !DILocation(line: 459, column: 19, scope: !52, inlinedAt: !77) +!91 = !DILocation(line: 395, column: 63, scope: !52, inlinedAt: !77) +!92 = !DILocation(line: 504, column: 24, scope: !52, inlinedAt: !77) +!93 = !DILocation(line: 461, column: 14, scope: !52, inlinedAt: !77) +!94 = !DILocation(line: 482, column: 23, scope: !52, inlinedAt: !77) +!95 = !DILocation(line: 494, column: 24, scope: !52, inlinedAt: !77) +!96 = !DILocation(line: 490, column: 23, scope: !52, inlinedAt: !77) +!97 = !DILocation(line: 493, column: 24, scope: !52, inlinedAt: !77) +!98 = !DILocation(line: 496, column: 25, scope: !52, inlinedAt: !77) +!99 = !DILocation(line: 497, column: 92, scope: !52, inlinedAt: !77) +!100 = !DILocation(line: 503, column: 25, scope: !52, inlinedAt: !77) +!101 = !DILocation(line: 500, column: 24, scope: !52, inlinedAt: !77) +!102 = !DILocation(line: 501, column: 24, scope: !52, inlinedAt: !77) +!103 = !DILocation(line: 502, column: 39, scope: !52, inlinedAt: !77) +!104 = !DILocation(line: 505, column: 24, scope: !52, inlinedAt: !77) +!105 = !DILocation(line: 506, column: 23, scope: !52, inlinedAt: !77) +!106 = !DILocation(line: 513, column: 39, scope: !52, inlinedAt: !77) +!107 = !DILocation(line: 514, column: 25, scope: !52, inlinedAt: !77) +!108 = !DILocation(line: 515, column: 24, scope: !52, inlinedAt: !77) +!109 = !DILocation(line: 516, column: 24, scope: !52, inlinedAt: !77) +!110 = !DILocation(line: 524, column: 27, scope: !52, inlinedAt: !77) +!111 = !DILocation(line: 521, column: 69, scope: !52, inlinedAt: !77) +!112 = !DILocation(line: 525, column: 39, scope: !52, inlinedAt: !77) +!113 = !DILocation(line: 525, column: 21, scope: !52, inlinedAt: !77) +!114 = !DILocation(line: 530, column: 20, scope: !52, inlinedAt: !77) +!115 = !DILocation(line: 531, column: 19, scope: !52, inlinedAt: !77) +!116 = !DILocation(line: 531, column: 14, scope: !52, inlinedAt: !77) +!117 = !DILocation(line: 551, column: 15, scope: !52, inlinedAt: !77) +!118 = !DILocation(line: 549, column: 43, scope: !52, inlinedAt: !77) +!119 = !DILocation(line: 553, column: 21, scope: !52, inlinedAt: !77) +!120 = !DILocation(line: 417, column: 19, scope: !52, inlinedAt: !77) +!121 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !77) +!122 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !77) +!123 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !77) +!124 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !77) +!125 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !77) +!126 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !77) +!127 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !77) +!128 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !77) +!129 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !77) +!130 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !77) +!131 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !77) +!132 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !77) +!133 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !77) +!134 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !77) +!135 = !DILocation(line: 414, column: 28, scope: !52, inlinedAt: !77) +!136 = !DILocation(line: 214, column: 39, scope: !5) +!137 = !DILocation(line: 215, column: 31, scope: !5) +!138 = !DILocation(line: 215, column: 45, scope: !5) +!139 = !DILocation(line: 216, column: 62, scope: !5) +!140 = !DILocation(line: 216, column: 43, scope: !5) +!141 = !DILocation(line: 218, column: 33, scope: !5) +!142 = !DILocation(line: 390, column: 37, scope: !52, inlinedAt: !143) +!143 = !DILocation(line: 226, column: 16, scope: !5) +!144 = !DILocation(line: 390, column: 18, scope: !52, inlinedAt: !143) +!145 = !DILocation(line: 390, column: 49, scope: !52, inlinedAt: !143) +!146 = !DILocation(line: 391, column: 18, scope: !52, inlinedAt: !143) +!147 = !DILocation(line: 391, column: 49, scope: !52, inlinedAt: !143) +!148 = !DILocation(line: 395, column: 43, scope: !52, inlinedAt: !143) +!149 = !DILocation(line: 397, column: 28, scope: !52, inlinedAt: !143) +!150 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !143) +!151 = !DILocation(line: 414, column: 19, scope: !52, inlinedAt: !143) +!152 = !DILocation(line: 415, column: 19, scope: !52, inlinedAt: !143) +!153 = !DILocation(line: 459, column: 19, scope: !52, inlinedAt: !143) +!154 = !DILocation(line: 395, column: 63, scope: !52, inlinedAt: !143) +!155 = !DILocation(line: 531, column: 19, scope: !52, inlinedAt: !143) +!156 = !DILocation(line: 461, column: 14, scope: !52, inlinedAt: !143) +!157 = !DILocation(line: 524, column: 27, scope: !52, inlinedAt: !143) +!158 = !DILocation(line: 525, column: 39, scope: !52, inlinedAt: !143) +!159 = !DILocation(line: 525, column: 21, scope: !52, inlinedAt: !143) +!160 = !DILocation(line: 530, column: 20, scope: !52, inlinedAt: !143) +!161 = !DILocation(line: 531, column: 14, scope: !52, inlinedAt: !143) +!162 = !DILocation(line: 551, column: 15, scope: !52, inlinedAt: !143) +!163 = !DILocation(line: 553, column: 21, scope: !52, inlinedAt: !143) +!164 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !143) +!165 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !143) +!166 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !143) +!167 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !143) +!168 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !143) +!169 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !143) +!170 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !143) +!171 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !143) +!172 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !143) +!173 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !143) +!174 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !143) +!175 = !DILocation(line: 414, column: 28, scope: !52, inlinedAt: !143) +!176 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !143) +!177 = !DILocation(line: 231, column: 24, scope: !5) +!178 = !DILocation(line: 231, column: 56, scope: !5) +!179 = !DILocation(line: 232, column: 14, scope: !5) +!180 = !DILocation(line: 234, column: 30, scope: !5) +!181 = !DILocation(line: 252, column: 25, scope: !5) +!182 = !DILocation(line: 253, column: 29, scope: !5) +!183 = !DILocation(line: 825, column: 38, scope: !52, inlinedAt: !184) +!184 = !DILocation(line: 256, column: 107, scope: !5) +!185 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !184) +!186 = !DILocation(line: 825, column: 56, scope: !52, inlinedAt: !184) +!187 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !184) +!188 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !184) +!189 = !DILocation(line: 825, column: 20, scope: !52, inlinedAt: !190) +!190 = !DILocation(line: 257, column: 107, scope: !5) +!191 = !DILocation(line: 825, column: 49, scope: !52, inlinedAt: !190) +!192 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !190) +!193 = !DILocation(line: 263, column: 32, scope: !5) +!194 = !DILocation(line: 266, column: 56, scope: !5) +!195 = !DILocation(line: 281, column: 58, scope: !5) +!196 = !DILocation(line: 281, column: 80, scope: !5) +!197 = !DILocation(line: 282, column: 53, scope: !5) +!198 = !DILocation(line: 282, column: 81, scope: !5) +!199 = !DILocation(line: 282, column: 70, scope: !5) +!200 = !DILocation(line: 286, column: 32, scope: !5) +!201 = !DILocation(line: 287, column: 30, scope: !5) +!202 = !DILocation(line: 287, column: 43, scope: !5) +!203 = !DILocation(line: 288, column: 55, scope: !5) +!204 = !DILocation(line: 288, column: 42, scope: !5) +!205 = !DILocation(line: 290, column: 45, scope: !5) +!206 = !DILocation(line: 290, column: 32, scope: !5) +!207 = !DILocation(line: 601, column: 37, scope: !52, inlinedAt: !208) +!208 = !DILocation(line: 298, column: 16, scope: !5) +!209 = !DILocation(line: 602, column: 38, scope: !52, inlinedAt: !208) +!210 = !DILocation(line: 608, column: 42, scope: !52, inlinedAt: !208) +!211 = !DILocation(line: 608, column: 61, scope: !52, inlinedAt: !208) +!212 = !DILocation(line: 701, column: 35, scope: !52, inlinedAt: !208) +!213 = !DILocation(line: 610, column: 28, scope: !52, inlinedAt: !208) +!214 = !DILocation(line: 701, column: 24, scope: !52, inlinedAt: !208) +!215 = !DILocation(line: 306, column: 41, scope: !5) +!216 = !DILocation(line: 307, column: 34, scope: !5) +!217 = !DILocation(line: 307, column: 47, scope: !5) +!218 = !DILocation(line: 308, column: 64, scope: !5) +!219 = !DILocation(line: 308, column: 46, scope: !5) +!220 = !DILocation(line: 310, column: 36, scope: !5) +!221 = !DILocation(line: 601, column: 37, scope: !52, inlinedAt: !222) +!222 = !DILocation(line: 318, column: 20, scope: !5) +!223 = !DILocation(line: 602, column: 38, scope: !52, inlinedAt: !222) +!224 = !DILocation(line: 608, column: 42, scope: !52, inlinedAt: !222) +!225 = !DILocation(line: 608, column: 61, scope: !52, inlinedAt: !222) +!226 = !DILocation(line: 676, column: 20, scope: !52, inlinedAt: !208) +!227 = !DILocation(line: 262, column: 30, scope: !5) +!228 = !DILocation(line: 263, column: 51, scope: !5) +!229 = !DILocation(line: 266, column: 44, scope: !5) +!230 = !DILocation(line: 266, column: 67, scope: !5) +!231 = !DILocation(line: 267, column: 36, scope: !5) +!232 = !DILocation(line: 267, column: 46, scope: !5) +!233 = !DILocation(line: 267, column: 70, scope: !5) +!234 = !DILocation(line: 269, column: 50, scope: !5) +!235 = !DILocation(line: 269, column: 60, scope: !5) +!236 = !DILocation(line: 271, column: 21, scope: !5) +!237 = !DILocation(line: 272, column: 23, scope: !5) +!238 = !DILocation(line: 275, column: 25, scope: !5) +!239 = !DILocation(line: 276, column: 29, scope: !5) +!240 = !DILocation(line: 601, column: 18, scope: !52, inlinedAt: !208) +!241 = !DILocation(line: 601, column: 49, scope: !52, inlinedAt: !208) +!242 = !DILocation(line: 602, column: 19, scope: !52, inlinedAt: !208) +!243 = !DILocation(line: 602, column: 51, scope: !52, inlinedAt: !208) +!244 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !208) +!245 = !DILocation(line: 672, column: 28, scope: !52, inlinedAt: !208) +!246 = !DILocation(line: 672, column: 22, scope: !52, inlinedAt: !208) +!247 = !DILocation(line: 746, column: 29, scope: !52, inlinedAt: !208) +!248 = !DILocation(line: 746, column: 21, scope: !52, inlinedAt: !208) +!249 = !DILocation(line: 626, column: 19, scope: !52, inlinedAt: !208) +!250 = !DILocation(line: 627, column: 19, scope: !52, inlinedAt: !208) +!251 = !DILocation(line: 675, column: 26, scope: !52, inlinedAt: !208) +!252 = !DILocation(line: 675, column: 46, scope: !52, inlinedAt: !208) +!253 = !DILocation(line: 678, column: 15, scope: !52, inlinedAt: !208) +!254 = !DILocation(line: 698, column: 25, scope: !52, inlinedAt: !208) +!255 = !DILocation(line: 704, column: 24, scope: !52, inlinedAt: !208) +!256 = !DILocation(line: 706, column: 24, scope: !52, inlinedAt: !208) +!257 = !DILocation(line: 739, column: 27, scope: !52, inlinedAt: !208) +!258 = !DILocation(line: 736, column: 69, scope: !52, inlinedAt: !208) +!259 = !DILocation(line: 740, column: 40, scope: !52, inlinedAt: !208) +!260 = !DILocation(line: 740, column: 22, scope: !52, inlinedAt: !208) +!261 = !DILocation(line: 744, column: 24, scope: !52, inlinedAt: !208) +!262 = !DILocation(line: 744, column: 43, scope: !52, inlinedAt: !208) +!263 = !DILocation(line: 750, column: 20, scope: !52, inlinedAt: !208) +!264 = !DILocation(line: 751, column: 22, scope: !52, inlinedAt: !208) +!265 = !DILocation(line: 751, column: 16, scope: !52, inlinedAt: !208) +!266 = !DILocation(line: 775, column: 24, scope: !52, inlinedAt: !208) +!267 = !DILocation(line: 773, column: 45, scope: !52, inlinedAt: !208) +!268 = !DILocation(line: 775, column: 43, scope: !52, inlinedAt: !208) +!269 = !DILocation(line: 628, column: 19, scope: !52, inlinedAt: !208) +!270 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !208) +!271 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !208) +!272 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !208) +!273 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !208) +!274 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !208) +!275 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !208) +!276 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !208) +!277 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !208) +!278 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !208) +!279 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !208) +!280 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !208) +!281 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !208) +!282 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !208) +!283 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !208) +!284 = !DILocation(line: 626, column: 28, scope: !52, inlinedAt: !208) +!285 = !DILocation(line: 627, column: 28, scope: !52, inlinedAt: !208) +!286 = !DILocation(line: 601, column: 18, scope: !52, inlinedAt: !222) +!287 = !DILocation(line: 601, column: 49, scope: !52, inlinedAt: !222) +!288 = !DILocation(line: 602, column: 19, scope: !52, inlinedAt: !222) +!289 = !DILocation(line: 602, column: 51, scope: !52, inlinedAt: !222) +!290 = !DILocation(line: 835, column: 23, scope: !52, inlinedAt: !222) +!291 = !DILocation(line: 672, column: 28, scope: !52, inlinedAt: !222) +!292 = !DILocation(line: 672, column: 22, scope: !52, inlinedAt: !222) +!293 = !DILocation(line: 746, column: 29, scope: !52, inlinedAt: !222) +!294 = !DILocation(line: 746, column: 21, scope: !52, inlinedAt: !222) +!295 = !DILocation(line: 626, column: 19, scope: !52, inlinedAt: !222) +!296 = !DILocation(line: 627, column: 19, scope: !52, inlinedAt: !222) +!297 = !DILocation(line: 610, column: 28, scope: !52, inlinedAt: !222) +!298 = !DILocation(line: 675, column: 26, scope: !52, inlinedAt: !222) +!299 = !DILocation(line: 675, column: 46, scope: !52, inlinedAt: !222) +!300 = !DILocation(line: 676, column: 20, scope: !52, inlinedAt: !222) +!301 = !DILocation(line: 678, column: 15, scope: !52, inlinedAt: !222) +!302 = !DILocation(line: 739, column: 27, scope: !52, inlinedAt: !222) +!303 = !DILocation(line: 740, column: 40, scope: !52, inlinedAt: !222) +!304 = !DILocation(line: 740, column: 22, scope: !52, inlinedAt: !222) +!305 = !DILocation(line: 744, column: 24, scope: !52, inlinedAt: !222) +!306 = !DILocation(line: 744, column: 43, scope: !52, inlinedAt: !222) +!307 = !DILocation(line: 750, column: 20, scope: !52, inlinedAt: !222) +!308 = !DILocation(line: 751, column: 22, scope: !52, inlinedAt: !222) +!309 = !DILocation(line: 751, column: 16, scope: !52, inlinedAt: !222) +!310 = !DILocation(line: 775, column: 24, scope: !52, inlinedAt: !222) +!311 = !DILocation(line: 775, column: 43, scope: !52, inlinedAt: !222) +!312 = !DILocation(line: 788, column: 33, scope: !52, inlinedAt: !222) +!313 = !DILocation(line: 789, column: 38, scope: !52, inlinedAt: !222) +!314 = !DILocation(line: 789, column: 24, scope: !52, inlinedAt: !222) +!315 = !DILocation(line: 790, column: 109, scope: !52, inlinedAt: !222) +!316 = !DILocation(line: 790, column: 113, scope: !52, inlinedAt: !222) +!317 = !DILocation(line: 790, column: 55, scope: !52, inlinedAt: !222) +!318 = !DILocation(line: 790, column: 25, scope: !52, inlinedAt: !222) +!319 = !DILocation(line: 791, column: 35, scope: !52, inlinedAt: !222) +!320 = !DILocation(line: 792, column: 34, scope: !52, inlinedAt: !222) +!321 = !DILocation(line: 792, column: 48, scope: !52, inlinedAt: !222) +!322 = !DILocation(line: 792, column: 63, scope: !52, inlinedAt: !222) +!323 = !DILocation(line: 793, column: 29, scope: !52, inlinedAt: !222) +!324 = !DILocation(line: 793, column: 61, scope: !52, inlinedAt: !222) +!325 = !DILocation(line: 793, column: 42, scope: !52, inlinedAt: !222) +!326 = !DILocation(line: 626, column: 28, scope: !52, inlinedAt: !222) +!327 = !DILocation(line: 627, column: 28, scope: !52, inlinedAt: !222) +!328 = !DILocation(line: 628, column: 19, scope: !52, inlinedAt: !222) +!329 = !DILocation(line: 323, column: 23, scope: !5) +!330 = !DILocation(line: 323, column: 55, scope: !5) +!331 = !DILocation(line: 330, column: 30, scope: !5) +!332 = !DILocation(line: 334, column: 14, scope: !5) +!333 = !DILocation(line: 344, column: 58, scope: !5) +!334 = !DILocation(line: 345, column: 29, scope: !5) +!335 = !DILocation(line: 345, column: 69, scope: !5) +!336 = !DILocation(line: 139, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0ed18187b2236e889fbb0fb937ec6b578d88a62d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ptx @@ -0,0 +1,6891 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_zeros_1 // -- Begin function triton_tem_fused_zeros_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_zeros_1 +.visible .entry triton_tem_fused_zeros_1( + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_14, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_15, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_17, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_18, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_19 +) +.reqntid 256 +{ + .reg .pred %p<582>; + .reg .b16 %rs<190>; + .reg .b32 %r<13996>; + .reg .b64 %rd<1040>; + .loc 1 18 0 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:18:0 + +// %bb.0: + ld.param.b64 %rd193, [triton_tem_fused_zeros_1_param_16]; + ld.param.b64 %rd183, [triton_tem_fused_zeros_1_param_5]; + ld.param.b64 %rd182, [triton_tem_fused_zeros_1_param_4]; + ld.param.b64 %rd181, [triton_tem_fused_zeros_1_param_3]; + ld.param.b64 %rd180, [triton_tem_fused_zeros_1_param_0]; +$L__tmp0: + .loc 1 111 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:111:24 + mov.u32 %r1, %ctaid.x; + ld.param.b64 %rd195, [triton_tem_fused_zeros_1_param_1]; + .loc 1 115 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:115:27 + mov.u32 %r2, %ctaid.y; + ld.param.b64 %rd196, [triton_tem_fused_zeros_1_param_2]; + .loc 1 116 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:116:28 + mov.u32 %r3, %ctaid.z; + .loc 1 117 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:117:23 + and.b32 %r4, %r2, 1; + .loc 1 124 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:124:25 + shl.b32 %r1941, %r3, 18; + .loc 1 124 47 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:124:47 + shl.b32 %r1942, %r4, 21; + .loc 1 124 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:124:35 + add.s32 %r1943, %r1942, %r1941; + .loc 1 131 9 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:131:9 + mul.wide.s32 %rd198, %r1943, 2; + add.s64 %rd1, %rd195, %rd198; + .loc 1 132 9 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:132:9 + add.s64 %rd2, %rd196, %rd198; + .loc 1 136 26 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:136:26 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r8, %r6, 240; + bfe.u32 %r9, %r6, 4, 4; + or.b32 %r10, %r9, 16; + or.b32 %r11, %r9, 32; + or.b32 %r12, %r9, 48; + or.b32 %r13, %r9, 64; + or.b32 %r14, %r9, 80; + or.b32 %r15, %r9, 96; + or.b32 %r16, %r9, 112; + shr.u32 %r1945, %r6, 1; + and.b32 %r1946, %r1945, 112; + bfe.u32 %r1947, %r6, 2, 3; + or.b32 %r17, %r1946, %r1947; + or.b32 %r18, %r17, 8; + .loc 1 139 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:139:14 + setp.lt.u32 %p1, %r1, 16; + .loc 1 139 7 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:139:7 + @%p1 bra $L__BB0_8; +// %bb.1: + .loc 1 0 7 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:7 + ld.param.b64 %rd190, [triton_tem_fused_zeros_1_param_13]; + ld.param.b64 %rd189, [triton_tem_fused_zeros_1_param_12]; + ld.param.b64 %rd186, [triton_tem_fused_zeros_1_param_9]; + ld.param.b64 %rd185, [triton_tem_fused_zeros_1_param_8]; + ld.param.b64 %rd184, [triton_tem_fused_zeros_1_param_6]; + .loc 1 140 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:140:24 + add.s32 %r8397, %r1, -16; + .loc 1 144 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:144:29 + shr.u32 %r8398, %r8397, 4; + .loc 1 144 54 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:144:54 + shl.b32 %r8399, %r3, 2; + .loc 1 144 44 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:144:44 + add.s32 %r8400, %r8398, %r8399; + .loc 1 145 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:145:35 + and.b32 %r8401, %r1, 15; + .loc 1 154 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:154:55 + shl.b32 %r8402, %r4, 4; + .loc 1 154 78 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:154:78 + or.b32 %r8403, %r8402, %r8401; + .loc 1 155 50 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:155:50 + shl.b32 %r8404, %r4, 8; + .loc 1 155 83 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:155:83 + shl.b32 %r8405, %r8401, 4; + .loc 1 155 68 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:155:68 + or.b32 %r8406, %r8404, %r8405; + .loc 1 158 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:158:30 + shl.b32 %r8407, %r8400, 7; + .loc 1 158 52 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:158:52 + shl.b32 %r8408, %r2, 23; + .loc 1 158 40 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:158:40 + add.s32 %r8409, %r8407, %r8408; + .loc 1 159 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:159:42 + mad.lo.s32 %r8410, %r8400, 262016, %r8409; + .loc 1 161 46 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:161:46 + shl.b32 %r8411, %r2, 16; + shl.b32 %r8412, %r8400, 11; + add.s32 %r8413, %r8412, %r8411; + .loc 1 163 17 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:163:17 + mul.wide.s32 %rd724, %r8409, 2; + add.s64 %rd725, %rd180, %rd724; + .loc 1 164 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:164:19 + mad.wide.s32 %rd726, %r8410, 2, %rd183; + .loc 1 168 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:168:21 + mul.wide.s32 %rd727, %r8413, 4; + add.s64 %rd728, %rd181, %rd727; + .loc 1 169 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:169:25 + add.s64 %rd729, %rd182, %rd727; + .loc 1 174 36 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:174:36 + shl.b32 %r8414, %r8401, 7; + .loc 1 175 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:175:29 + or.b32 %r8415, %r8414, %r9; + or.b32 %r8416, %r10, %r8414; + or.b32 %r8417, %r11, %r8414; + or.b32 %r8418, %r12, %r8414; + or.b32 %r8419, %r13, %r8414; + or.b32 %r8420, %r14, %r8414; + or.b32 %r8421, %r15, %r8414; + or.b32 %r8422, %r16, %r8414; + or.b32 %r19, %r18, %r8414; + or.b32 %r21, %r17, %r8414; +$L__tmp1: + .loc 1 825 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:178:107 ] + shl.b32 %r8423, %r8415, 12; + shl.b32 %r8424, %r8416, 12; + shl.b32 %r8425, %r8417, 12; + shl.b32 %r8426, %r8418, 12; + shl.b32 %r8427, %r8419, 12; + shl.b32 %r8428, %r8420, 12; + shl.b32 %r8429, %r8421, 12; + shl.b32 %r8430, %r8422, 12; + .loc 1 825 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:178:107 ] + mad.wide.u32 %rd730, %r8423, 2, %rd725; + mad.wide.u32 %rd731, %r8424, 2, %rd725; + mad.wide.u32 %rd732, %r8425, 2, %rd725; + mad.wide.u32 %rd733, %r8426, 2, %rd725; + mad.wide.u32 %rd734, %r8427, 2, %rd725; + mad.wide.u32 %rd735, %r8428, 2, %rd725; + mad.wide.u32 %rd736, %r8429, 2, %rd725; + mad.wide.u32 %rd737, %r8430, 2, %rd725; + .loc 1 825 56 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:56 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:178:107 ] + shl.b32 %r8431, %r6, 3; + and.b32 %r8432, %r8431, 120; + .loc 1 825 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:178:107 ] + cvt.u64.u32 %rd13, %r8432; + mul.wide.u32 %rd738, %r8432, 2; + add.s64 %rd684, %rd730, %rd738; + add.s64 %rd685, %rd731, %rd738; + add.s64 %rd686, %rd732, %rd738; + add.s64 %rd687, %rd733, %rd738; + add.s64 %rd688, %rd734, %rd738; + add.s64 %rd689, %rd735, %rd738; + add.s64 %rd690, %rd736, %rd738; + add.s64 %rd691, %rd737, %rd738; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:178:107 ] + // begin inline asm + mov.u32 %r8294, 0x0; + mov.u32 %r8295, 0x0; + mov.u32 %r8296, 0x0; + mov.u32 %r8297, 0x0; + ld.global.v4.b32 { %r8294, %r8295, %r8296, %r8297 }, [ %rd684 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8298, 0x0; + mov.u32 %r8299, 0x0; + mov.u32 %r8300, 0x0; + mov.u32 %r8301, 0x0; + ld.global.v4.b32 { %r8298, %r8299, %r8300, %r8301 }, [ %rd685 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8302, 0x0; + mov.u32 %r8303, 0x0; + mov.u32 %r8304, 0x0; + mov.u32 %r8305, 0x0; + ld.global.v4.b32 { %r8302, %r8303, %r8304, %r8305 }, [ %rd686 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8306, 0x0; + mov.u32 %r8307, 0x0; + mov.u32 %r8308, 0x0; + mov.u32 %r8309, 0x0; + ld.global.v4.b32 { %r8306, %r8307, %r8308, %r8309 }, [ %rd687 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8310, 0x0; + mov.u32 %r8311, 0x0; + mov.u32 %r8312, 0x0; + mov.u32 %r8313, 0x0; + ld.global.v4.b32 { %r8310, %r8311, %r8312, %r8313 }, [ %rd688 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8314, 0x0; + mov.u32 %r8315, 0x0; + mov.u32 %r8316, 0x0; + mov.u32 %r8317, 0x0; + ld.global.v4.b32 { %r8314, %r8315, %r8316, %r8317 }, [ %rd689 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8318, 0x0; + mov.u32 %r8319, 0x0; + mov.u32 %r8320, 0x0; + mov.u32 %r8321, 0x0; + ld.global.v4.b32 { %r8318, %r8319, %r8320, %r8321 }, [ %rd690 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8322, 0x0; + mov.u32 %r8323, 0x0; + mov.u32 %r8324, 0x0; + mov.u32 %r8325, 0x0; + ld.global.v4.b32 { %r8322, %r8323, %r8324, %r8325 }, [ %rd691 + 0 ]; + // end inline asm + shl.b32 %r8433, %r6, 4; + and.b32 %r8434, %r8433, 112; + shl.b32 %r8435, %r8, 3; + and.b32 %r8436, %r6, 112; + and.b32 %r8437, %r6, 8; + shl.b32 %r8438, %r8437, 11; + or.b32 %r8439, %r8434, %r8435; + xor.b32 %r8440, %r8439, %r8436; + or.b32 %r8441, %r8440, %r8438; + mov.b32 %r8442, global_smem; + add.s32 %r8443, %r8442, %r8441; + st.shared.v4.b32 [%r8443+98304], {%r8294, %r8295, %r8296, %r8297}; + st.shared.v4.b32 [%r8443+100352], {%r8298, %r8299, %r8300, %r8301}; + st.shared.v4.b32 [%r8443+102400], {%r8302, %r8303, %r8304, %r8305}; + st.shared.v4.b32 [%r8443+104448], {%r8306, %r8307, %r8308, %r8309}; + st.shared.v4.b32 [%r8443+106496], {%r8310, %r8311, %r8312, %r8313}; + st.shared.v4.b32 [%r8443+108544], {%r8314, %r8315, %r8316, %r8317}; + st.shared.v4.b32 [%r8443+110592], {%r8318, %r8319, %r8320, %r8321}; + st.shared.v4.b32 [%r8443+112640], {%r8322, %r8323, %r8324, %r8325}; +$L__tmp2: + .loc 1 825 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:179:111 ] + shl.b32 %r8444, %r8415, 7; + shl.b32 %r8445, %r8416, 7; + shl.b32 %r8446, %r8417, 7; + shl.b32 %r8447, %r8418, 7; + shl.b32 %r8448, %r8419, 7; + shl.b32 %r8449, %r8420, 7; + shl.b32 %r8450, %r8421, 7; + shl.b32 %r8451, %r8422, 7; + .loc 1 825 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:179:111 ] + mad.wide.u32 %rd739, %r8444, 2, %rd726; + mad.wide.u32 %rd740, %r8445, 2, %rd726; + mad.wide.u32 %rd741, %r8446, 2, %rd726; + mad.wide.u32 %rd742, %r8447, 2, %rd726; + mad.wide.u32 %rd743, %r8448, 2, %rd726; + mad.wide.u32 %rd744, %r8449, 2, %rd726; + mad.wide.u32 %rd745, %r8450, 2, %rd726; + mad.wide.u32 %rd746, %r8451, 2, %rd726; + .loc 1 825 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:179:111 ] + add.s64 %rd692, %rd739, %rd738; + add.s64 %rd693, %rd740, %rd738; + add.s64 %rd694, %rd741, %rd738; + add.s64 %rd695, %rd742, %rd738; + add.s64 %rd696, %rd743, %rd738; + add.s64 %rd697, %rd744, %rd738; + add.s64 %rd698, %rd745, %rd738; + add.s64 %rd699, %rd746, %rd738; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:179:111 ] + // begin inline asm + mov.u32 %r8326, 0x0; + mov.u32 %r8327, 0x0; + mov.u32 %r8328, 0x0; + mov.u32 %r8329, 0x0; + ld.global.v4.b32 { %r8326, %r8327, %r8328, %r8329 }, [ %rd692 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8330, 0x0; + mov.u32 %r8331, 0x0; + mov.u32 %r8332, 0x0; + mov.u32 %r8333, 0x0; + ld.global.v4.b32 { %r8330, %r8331, %r8332, %r8333 }, [ %rd693 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8334, 0x0; + mov.u32 %r8335, 0x0; + mov.u32 %r8336, 0x0; + mov.u32 %r8337, 0x0; + ld.global.v4.b32 { %r8334, %r8335, %r8336, %r8337 }, [ %rd694 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8338, 0x0; + mov.u32 %r8339, 0x0; + mov.u32 %r8340, 0x0; + mov.u32 %r8341, 0x0; + ld.global.v4.b32 { %r8338, %r8339, %r8340, %r8341 }, [ %rd695 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8342, 0x0; + mov.u32 %r8343, 0x0; + mov.u32 %r8344, 0x0; + mov.u32 %r8345, 0x0; + ld.global.v4.b32 { %r8342, %r8343, %r8344, %r8345 }, [ %rd696 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8346, 0x0; + mov.u32 %r8347, 0x0; + mov.u32 %r8348, 0x0; + mov.u32 %r8349, 0x0; + ld.global.v4.b32 { %r8346, %r8347, %r8348, %r8349 }, [ %rd697 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8350, 0x0; + mov.u32 %r8351, 0x0; + mov.u32 %r8352, 0x0; + mov.u32 %r8353, 0x0; + ld.global.v4.b32 { %r8350, %r8351, %r8352, %r8353 }, [ %rd698 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8354, 0x0; + mov.u32 %r8355, 0x0; + mov.u32 %r8356, 0x0; + mov.u32 %r8357, 0x0; + ld.global.v4.b32 { %r8354, %r8355, %r8356, %r8357 }, [ %rd699 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r8443+131072], {%r8326, %r8327, %r8328, %r8329}; + st.shared.v4.b32 [%r8443+133120], {%r8330, %r8331, %r8332, %r8333}; + st.shared.v4.b32 [%r8443+135168], {%r8334, %r8335, %r8336, %r8337}; + st.shared.v4.b32 [%r8443+137216], {%r8338, %r8339, %r8340, %r8341}; + st.shared.v4.b32 [%r8443+139264], {%r8342, %r8343, %r8344, %r8345}; + st.shared.v4.b32 [%r8443+141312], {%r8346, %r8347, %r8348, %r8349}; + st.shared.v4.b32 [%r8443+143360], {%r8350, %r8351, %r8352, %r8353}; + st.shared.v4.b32 [%r8443+145408], {%r8354, %r8355, %r8356, %r8357}; +$L__tmp3: + .loc 1 185 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:185:34 + mul.wide.u32 %rd747, %r21, 4; + add.s64 %rd700, %rd729, %rd747; + mul.wide.u32 %rd748, %r19, 4; + add.s64 %rd701, %rd729, %rd748; + .loc 1 185 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:185:25 + // begin inline asm + mov.u32 %r8358, 0x0; + ld.global.b32 { %r8358 }, [ %rd700 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8359, 0x0; + ld.global.b32 { %r8359 }, [ %rd701 + 0 ]; + // end inline asm + .loc 1 186 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:186:33 + add.s64 %rd702, %rd728, %rd747; + add.s64 %rd703, %rd728, %rd748; + .loc 1 186 26 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:186:26 + // begin inline asm + mov.u32 %r8360, 0x0; + ld.global.b32 { %r8360 }, [ %rd702 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8361, 0x0; + ld.global.b32 { %r8361 }, [ %rd703 + 0 ]; + // end inline asm + .loc 1 190 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:190:30 + setp.eq.f32 %p292, %r8360, 0fFF800000; + setp.eq.f32 %p293, %r8361, 0fFF800000; + .loc 1 190 50 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:190:50 + selp.f32 %r53, 0f00000000, %r8360, %p292; + selp.f32 %r54, 0f00000000, %r8361, %p293; + .loc 1 195 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:195:30 + cvt.u64.u32 %rd16, %r8406; + mad.wide.u32 %rd704, %r8406, 4, %rd186; + .loc 1 196 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:196:27 + // begin inline asm + mov.u32 %r8362, 0x0; + ld.global.b32 { %r8362 }, [ %rd704 + 0 ]; + // end inline asm + .loc 1 196 41 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:196:41 + shl.b32 %r55, %r8362, 7; + .loc 1 197 53 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:197:53 + cvt.u64.u32 %rd18, %r8403; + mad.wide.u32 %rd705, %r8403, 4, %rd185; + .loc 1 197 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:197:39 + // begin inline asm + mov.u32 %r8363, 0x0; + ld.global.b32 { %r8363 }, [ %rd705 + 0 ]; + // end inline asm + .loc 1 199 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:199:42 + and.b32 %r57, %r6, 3; + .loc 1 199 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:199:29 + or.b32 %r8452, %r55, %r9; + or.b32 %r8453, %r55, %r10; + or.b32 %r8454, %r55, %r11; + or.b32 %r8455, %r55, %r12; +$L__tmp4: + .loc 1 390 37 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:37 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r8456, %r8452, 7; + shl.b32 %r8457, %r8453, 7; + shl.b32 %r8458, %r8454, 7; + shl.b32 %r8459, %r8455, 7; + .loc 1 390 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.wide.s32 %rd749, %r8456, 2; + add.s64 %rd750, %rd1, %rd749; + mul.wide.s32 %rd751, %r8457, 2; + add.s64 %rd752, %rd1, %rd751; + mul.wide.s32 %rd753, %r8458, 2; + add.s64 %rd754, %rd1, %rd753; + mul.wide.s32 %rd755, %r8459, 2; + add.s64 %rd756, %rd1, %rd755; + .loc 1 390 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd708, %rd750, %rd738; + add.s64 %rd709, %rd752, %rd738; + add.s64 %rd710, %rd754, %rd738; + add.s64 %rd711, %rd756, %rd738; + .loc 1 391 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:391:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd757, %rd2, %rd749; + add.s64 %rd758, %rd2, %rd751; + add.s64 %rd759, %rd2, %rd753; + add.s64 %rd760, %rd2, %rd755; + .loc 1 391 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:391:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd712, %rd757, %rd738; + add.s64 %rd713, %rd758, %rd738; + add.s64 %rd714, %rd759, %rd738; + add.s64 %rd715, %rd760, %rd738; + .loc 1 395 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:395:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r58, %r8363, 1; + .loc 1 485 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:485:34 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mad.wide.u32 %rd707, %r2, 8, %rd193; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.lt.s32 %p294, %r58, 1; + setp.gt.s32 %p291, %r58, 0; + .loc 1 485 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:485:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + // begin inline asm + mov.u64 %rd19, 0x0; + @%p291 ld.global.b64 { %rd19 }, [ %rd707 + 0 ]; + // end inline asm + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r8460, %r8437, 10; + or.b32 %r59, %r8440, %r8460; + add.s32 %r10751, %r8442, %r59; + selp.b32 %r8365, 16, 0, %p291; + // begin inline asm + cp.async.cg.shared.global [ %r10751 + 0 ], [ %rd708 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10753, %r10751, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10753 + 0 ], [ %rd709 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10755, %r10751, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10755 + 0 ], [ %rd710 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r10757, %r10751, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10757 + 0 ], [ %rd711 + 0 ], 0x10, %r8365; + // end inline asm + cp.async.commit_group; + add.s32 %r8372, %r10751, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r8372 + 0 ], [ %rd712 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8374, %r10751, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r8374 + 0 ], [ %rd713 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8376, %r10751, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r8376 + 0 ], [ %rd714 + 0 ], 0x10, %r8365; + // end inline asm + add.s32 %r8378, %r10751, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r8378 + 0 ], [ %rd715 + 0 ], 0x10, %r8365; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.gt.s32 %p295, %r58, 1; + .loc 1 414 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd1014, %rd708, 16384; + add.s64 %rd1013, %rd709, 16384; + add.s64 %rd1012, %rd710, 16384; + add.s64 %rd1011, %rd711, 16384; + .loc 1 415 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:415:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd1010, %rd712, 16384; + add.s64 %rd1009, %rd713, 16384; + add.s64 %rd1008, %rd714, 16384; + add.s64 %rd1007, %rd715, 16384; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + bar.sync 0; + add.s32 %r8380, %r10751, 16384; + selp.b32 %r8381, 16, 0, %p295; + // begin inline asm + cp.async.cg.shared.global [ %r8380 + 0 ], [ %rd1014 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8382, %r10751, 18432; + // begin inline asm + cp.async.cg.shared.global [ %r8382 + 0 ], [ %rd1013 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8384, %r10751, 20480; + // begin inline asm + cp.async.cg.shared.global [ %r8384 + 0 ], [ %rd1012 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8386, %r10751, 22528; + // begin inline asm + cp.async.cg.shared.global [ %r8386 + 0 ], [ %rd1011 + 0 ], 0x10, %r8381; + // end inline asm + cp.async.commit_group; + add.s32 %r8388, %r10751, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r8388 + 0 ], [ %rd1010 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8390, %r10751, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r8390 + 0 ], [ %rd1009 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8392, %r10751, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r8392 + 0 ], [ %rd1008 + 0 ], 0x10, %r8381; + // end inline asm + add.s32 %r8394, %r10751, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r8394 + 0 ], [ %rd1007 + 0 ], 0x10, %r8381; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:459:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r13037, 0f00000000; + mov.b32 %r13038, %r13037; + mov.b32 %r13039, %r13037; + mov.b32 %r13040, %r13037; + mov.b32 %r13041, %r13037; + mov.b32 %r13042, %r13037; + mov.b32 %r13043, %r13037; + mov.b32 %r13044, %r13037; + mov.b32 %r13045, %r13037; + mov.b32 %r13046, %r13037; + mov.b32 %r13047, %r13037; + mov.b32 %r13048, %r13037; + mov.b32 %r13049, %r13037; + mov.b32 %r13050, %r13037; + mov.b32 %r13051, %r13037; + mov.b32 %r13052, %r13037; + mov.b32 %r13053, %r13037; + mov.b32 %r13054, %r13037; + mov.b32 %r13055, %r13037; + mov.b32 %r13056, %r13037; + mov.b32 %r13057, %r13037; + mov.b32 %r13058, %r13037; + mov.b32 %r13059, %r13037; + mov.b32 %r13060, %r13037; + mov.b32 %r13061, %r13037; + mov.b32 %r13062, %r13037; + mov.b32 %r13063, %r13037; + mov.b32 %r13064, %r13037; + mov.b32 %r13065, %r13037; + mov.b32 %r13066, %r13037; + mov.b32 %r13067, %r13037; + mov.b32 %r13068, %r13037; + mov.b32 %r13069, %r13037; + mov.b32 %r13070, %r13037; + mov.b32 %r13071, %r13037; + mov.b32 %r13072, %r13037; + mov.b32 %r13073, %r13037; + mov.b32 %r13074, %r13037; + mov.b32 %r13075, %r13037; + mov.b32 %r13076, %r13037; + mov.b32 %r13077, %r13037; + mov.b32 %r13078, %r13037; + mov.b32 %r13079, %r13037; + mov.b32 %r13080, %r13037; + mov.b32 %r13081, %r13037; + mov.b32 %r13082, %r13037; + mov.b32 %r13083, %r13037; + mov.b32 %r13084, %r13037; + mov.b32 %r13085, %r13037; + mov.b32 %r13086, %r13037; + mov.b32 %r13087, %r13037; + mov.b32 %r13088, %r13037; + mov.b32 %r13089, %r13037; + mov.b32 %r13090, %r13037; + mov.b32 %r13091, %r13037; + mov.b32 %r13092, %r13037; + mov.b32 %r13093, %r13037; + mov.b32 %r13094, %r13037; + mov.b32 %r13095, %r13037; + mov.b32 %r13096, %r13037; + mov.b32 %r13097, %r13037; + mov.b32 %r13098, %r13037; + mov.b32 %r13099, %r13037; + mov.b32 %r13100, %r13037; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + @%p294 bra $L__BB0_4; +$L__tmp5: +// %bb.2: // %.lr.ph1846 + .loc 1 0 0 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0 + cvt.u64.u32 %rd14, %r21; + cvt.u64.u32 %rd15, %r19; +$L__tmp6: + .loc 1 395 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:395:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + min.u32 %r76, %r58, 32; +$L__tmp7: + .loc 1 199 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:199:42 + shl.b32 %r8466, %r57, 1; + or.b32 %r13117, %r55, %r8466; + add.s32 %r77, %r76, -2; + add.s32 %r78, %r76, -1; + or.b32 %r13116, %r13117, 1; + or.b32 %r13115, %r13117, 8; + or.b32 %r13114, %r13117, 9; + or.b32 %r13113, %r13117, 16; + or.b32 %r13112, %r13117, 17; + or.b32 %r13111, %r13117, 24; + or.b32 %r13110, %r13117, 25; + or.b32 %r13109, %r13117, 32; + or.b32 %r13108, %r13117, 33; + or.b32 %r13107, %r13117, 40; + or.b32 %r13106, %r13117, 41; + or.b32 %r13105, %r13117, 48; + or.b32 %r13104, %r13117, 49; + or.b32 %r13103, %r13117, 56; + or.b32 %r13102, %r13117, 57; + mov.b32 %r9018, 0; + mov.b32 %r13037, 0f00000000; + mov.b32 %r13036, 1; + mov.b32 %r13035, -1; + mov.b32 %r13034, 64; + setp.gt.s64 %p316, %rd19, %rd15; + setp.gt.s64 %p317, %rd19, %rd14; + mov.b32 %r13038, %r13037; + mov.b32 %r13039, %r13037; + mov.b32 %r13040, %r13037; + mov.b32 %r13041, %r13037; + mov.b32 %r13042, %r13037; + mov.b32 %r13043, %r13037; + mov.b32 %r13044, %r13037; + mov.b32 %r13045, %r13037; + mov.b32 %r13046, %r13037; + mov.b32 %r13047, %r13037; + mov.b32 %r13048, %r13037; + mov.b32 %r13049, %r13037; + mov.b32 %r13050, %r13037; + mov.b32 %r13051, %r13037; + mov.b32 %r13052, %r13037; + mov.b32 %r13053, %r13037; + mov.b32 %r13054, %r13037; + mov.b32 %r13055, %r13037; + mov.b32 %r13056, %r13037; + mov.b32 %r13057, %r13037; + mov.b32 %r13058, %r13037; + mov.b32 %r13059, %r13037; + mov.b32 %r13060, %r13037; + mov.b32 %r13061, %r13037; + mov.b32 %r13062, %r13037; + mov.b32 %r13063, %r13037; + mov.b32 %r13064, %r13037; + mov.b32 %r13065, %r13037; + mov.b32 %r13066, %r13037; + mov.b32 %r13067, %r13037; + mov.b32 %r13068, %r13037; + mov.b32 %r13069, %r13037; + mov.b32 %r13070, %r13037; + mov.b32 %r13071, %r13037; + mov.b32 %r13072, %r13037; + mov.b32 %r13073, %r13037; + mov.b32 %r13074, %r13037; + mov.b32 %r13075, %r13037; + mov.b32 %r13076, %r13037; + mov.b32 %r13077, %r13037; + mov.b32 %r13078, %r13037; + mov.b32 %r13079, %r13037; + mov.b32 %r13080, %r13037; + mov.b32 %r13081, %r13037; + mov.b32 %r13082, %r13037; + mov.b32 %r13083, %r13037; + mov.b32 %r13084, %r13037; + mov.b32 %r13085, %r13037; + mov.b32 %r13086, %r13037; + mov.b32 %r13087, %r13037; + mov.b32 %r13088, %r13037; + mov.b32 %r13089, %r13037; + mov.b32 %r13090, %r13037; + mov.b32 %r13091, %r13037; + mov.b32 %r13092, %r13037; + mov.b32 %r13093, %r13037; + mov.b32 %r13094, %r13037; + mov.b32 %r13095, %r13037; + mov.b32 %r13096, %r13037; + mov.b32 %r13097, %r13037; + mov.b32 %r13098, %r13037; + mov.b32 %r13099, %r13037; + mov.b32 %r13100, %r13037; + mov.b32 %r13101, %r9018; +$L__BB0_3: // %__nv_exp2f.exit1414 + // =>This Inner Loop Header: Depth=1 + .loc 1 0 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:42 + cvt.u32.u64 %r10125, %rd15; + cvt.u32.u64 %r10126, %rd14; +$L__tmp8: + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.lt.s32 %p318, %r13101, %r77; + setp.lt.s32 %p314, %r13101, %r78; + add.s32 %r10127, %r13035, 1; + setp.gt.s32 %p319, %r10127, 2; + selp.b32 %r13035, 0, %r10127, %p319; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r10128, %r13035, 14; + add.s32 %r9020, %r8442, %r10128; + .loc 1 459 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:459:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shfl.sync.idx.b32 %r10130, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r10131, %r10130, 11; + and.b32 %r10132, %r10131, 8192; + add.s32 %r8979, %r8442, 98304; + add.s32 %r10133, %r10132, %r8979; + bfe.u32 %r10134, %r10133, 4, 14; + cvt.u64.u32 %rd811, %r10134; + or.b64 %rd761, %rd811, 4611686293372403712; + bfe.u32 %r10135, %r9020, 4, 14; + cvt.u64.u32 %rd812, %r10135; + or.b64 %rd762, %rd812, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd761, %rd762, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r10136, %r10132, 32; + add.s32 %r10137, %r10136, %r8979; + bfe.u32 %r10138, %r10137, 4, 14; + cvt.u64.u32 %rd813, %r10138; + or.b64 %rd763, %rd813, 4611686293372403712; + add.s32 %r10139, %r9020, 32; + bfe.u32 %r10140, %r10139, 4, 14; + cvt.u64.u32 %rd814, %r10140; + or.b64 %rd764, %rd814, 4611686293338849280; + mov.pred %p296, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd763, %rd764, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10141, %r10132, 64; + add.s32 %r10142, %r10141, %r8979; + bfe.u32 %r10143, %r10142, 4, 14; + cvt.u64.u32 %rd815, %r10143; + or.b64 %rd765, %rd815, 4611686293372403712; + add.s32 %r10144, %r9020, 64; + bfe.u32 %r10145, %r10144, 4, 14; + cvt.u64.u32 %rd816, %r10145; + or.b64 %rd766, %rd816, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd765, %rd766, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10146, %r10132, 96; + add.s32 %r10147, %r10146, %r8979; + bfe.u32 %r10148, %r10147, 4, 14; + cvt.u64.u32 %rd817, %r10148; + or.b64 %rd767, %rd817, 4611686293372403712; + add.s32 %r10149, %r9020, 96; + bfe.u32 %r10150, %r10149, 4, 14; + cvt.u64.u32 %rd818, %r10150; + or.b64 %rd768, %rd818, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd767, %rd768, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10151, %r10132, 16384; + add.s32 %r10152, %r10151, %r8979; + bfe.u32 %r10153, %r10152, 4, 14; + cvt.u64.u32 %rd819, %r10153; + or.b64 %rd769, %rd819, 4611686293372403712; + add.s32 %r10154, %r9020, 8192; + bfe.u32 %r10155, %r10154, 4, 14; + cvt.u64.u32 %rd820, %r10155; + or.b64 %rd770, %rd820, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd769, %rd770, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10156, %r10132, 16416; + add.s32 %r10157, %r10156, %r8979; + bfe.u32 %r10158, %r10157, 4, 14; + cvt.u64.u32 %rd821, %r10158; + or.b64 %rd771, %rd821, 4611686293372403712; + add.s32 %r10159, %r9020, 8224; + bfe.u32 %r10160, %r10159, 4, 14; + cvt.u64.u32 %rd822, %r10160; + or.b64 %rd772, %rd822, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd771, %rd772, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10161, %r10132, 16448; + add.s32 %r10162, %r10161, %r8979; + bfe.u32 %r10163, %r10162, 4, 14; + cvt.u64.u32 %rd823, %r10163; + or.b64 %rd773, %rd823, 4611686293372403712; + add.s32 %r10164, %r9020, 8256; + bfe.u32 %r10165, %r10164, 4, 14; + cvt.u64.u32 %rd824, %r10165; + or.b64 %rd774, %rd824, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd773, %rd774, %p296, 1, 1, 0, 0; + // end inline asm + or.b32 %r10166, %r10132, 16480; + add.s32 %r10167, %r10166, %r8979; + bfe.u32 %r10168, %r10167, 4, 14; + cvt.u64.u32 %rd825, %r10168; + or.b64 %rd775, %rd825, 4611686293372403712; + add.s32 %r10169, %r9020, 8288; + bfe.u32 %r10170, %r10169, 4, 14; + cvt.u64.u32 %rd826, %r10170; + or.b64 %rd776, %rd826, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594}, %rd775, %rd776, %p296, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r8980, %r9018; + mov.b32 %r8981, %r9018; + mov.b32 %r8983, %r9018; + mov.b32 %r8984, %r9018; + mov.b32 %r8982, %r9020; + // begin inline asm + // wait for regs: %r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8584,%r8585,%r8586,%r8587,%r8588,%r8589,%r8590,%r8591,%r8592,%r8593,%r8594,%r8979,%r8980,%r8981,%r8982,%r8983,%r8984 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:461:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10171, %r8563, 0f3DB504F3; + mul.f32 %r10172, %r8564, 0f3DB504F3; + mul.f32 %r10173, %r8565, 0f3DB504F3; + mul.f32 %r10174, %r8566, 0f3DB504F3; + mul.f32 %r10175, %r8567, 0f3DB504F3; + mul.f32 %r10176, %r8568, 0f3DB504F3; + mul.f32 %r10177, %r8569, 0f3DB504F3; + mul.f32 %r10178, %r8570, 0f3DB504F3; + mul.f32 %r10179, %r8571, 0f3DB504F3; + mul.f32 %r10180, %r8572, 0f3DB504F3; + mul.f32 %r10181, %r8573, 0f3DB504F3; + mul.f32 %r10182, %r8574, 0f3DB504F3; + mul.f32 %r10183, %r8575, 0f3DB504F3; + mul.f32 %r10184, %r8576, 0f3DB504F3; + mul.f32 %r10185, %r8577, 0f3DB504F3; + mul.f32 %r10186, %r8578, 0f3DB504F3; + mul.f32 %r10187, %r8579, 0f3DB504F3; + mul.f32 %r10188, %r8580, 0f3DB504F3; + mul.f32 %r10189, %r8581, 0f3DB504F3; + mul.f32 %r10190, %r8582, 0f3DB504F3; + mul.f32 %r10191, %r8583, 0f3DB504F3; + mul.f32 %r10192, %r8584, 0f3DB504F3; + mul.f32 %r10193, %r8585, 0f3DB504F3; + mul.f32 %r10194, %r8586, 0f3DB504F3; + mul.f32 %r10195, %r8587, 0f3DB504F3; + mul.f32 %r10196, %r8588, 0f3DB504F3; + mul.f32 %r10197, %r8589, 0f3DB504F3; + mul.f32 %r10198, %r8590, 0f3DB504F3; + mul.f32 %r10199, %r8591, 0f3DB504F3; + mul.f32 %r10200, %r8592, 0f3DB504F3; + mul.f32 %r10201, %r8593, 0f3DB504F3; + mul.f32 %r10202, %r8594, 0f3DB504F3; + .loc 1 482 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:482:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.ge.s32 %p320, %r10126, %r13117; + setp.ge.s32 %p321, %r10126, %r13116; + setp.ge.s32 %p322, %r10125, %r13117; + setp.ge.s32 %p323, %r10125, %r13116; + setp.ge.s32 %p324, %r10126, %r13115; + setp.ge.s32 %p325, %r10126, %r13114; + setp.ge.s32 %p326, %r10125, %r13115; + setp.ge.s32 %p327, %r10125, %r13114; + setp.ge.s32 %p328, %r10126, %r13113; + setp.ge.s32 %p329, %r10126, %r13112; + setp.ge.s32 %p330, %r10125, %r13113; + setp.ge.s32 %p331, %r10125, %r13112; + setp.ge.s32 %p332, %r10126, %r13111; + setp.ge.s32 %p333, %r10126, %r13110; + setp.ge.s32 %p334, %r10125, %r13111; + setp.ge.s32 %p335, %r10125, %r13110; + setp.ge.s32 %p336, %r10126, %r13109; + setp.ge.s32 %p337, %r10126, %r13108; + setp.ge.s32 %p338, %r10125, %r13109; + setp.ge.s32 %p339, %r10125, %r13108; + setp.ge.s32 %p340, %r10126, %r13107; + setp.ge.s32 %p341, %r10126, %r13106; + setp.ge.s32 %p342, %r10125, %r13107; + setp.ge.s32 %p343, %r10125, %r13106; + setp.ge.s32 %p344, %r10126, %r13105; + setp.ge.s32 %p345, %r10126, %r13104; + setp.ge.s32 %p346, %r10125, %r13105; + setp.ge.s32 %p347, %r10125, %r13104; + setp.ge.s32 %p348, %r10126, %r13103; + setp.ge.s32 %p349, %r10126, %r13102; + setp.ge.s32 %p350, %r10125, %r13103; + setp.ge.s32 %p351, %r10125, %r13102; + .loc 1 490 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:490:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.pred %p352, %p317, %p320; + and.pred %p353, %p317, %p321; + and.pred %p354, %p316, %p322; + and.pred %p355, %p316, %p323; + and.pred %p356, %p317, %p324; + and.pred %p357, %p317, %p325; + and.pred %p358, %p316, %p326; + and.pred %p359, %p316, %p327; + and.pred %p360, %p317, %p328; + and.pred %p361, %p317, %p329; + and.pred %p362, %p316, %p330; + and.pred %p363, %p316, %p331; + and.pred %p364, %p317, %p332; + and.pred %p365, %p317, %p333; + and.pred %p366, %p316, %p334; + and.pred %p367, %p316, %p335; + and.pred %p368, %p317, %p336; + and.pred %p369, %p317, %p337; + and.pred %p370, %p316, %p338; + and.pred %p371, %p316, %p339; + and.pred %p372, %p317, %p340; + and.pred %p373, %p317, %p341; + and.pred %p374, %p316, %p342; + and.pred %p375, %p316, %p343; + and.pred %p376, %p317, %p344; + and.pred %p377, %p317, %p345; + and.pred %p378, %p316, %p346; + and.pred %p379, %p316, %p347; + and.pred %p380, %p317, %p348; + and.pred %p381, %p317, %p349; + and.pred %p382, %p316, %p350; + and.pred %p383, %p316, %p351; + .loc 1 493 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:493:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.gt.s32 %p384, %r13117, 2047; + setp.gt.s32 %p385, %r13115, 2047; + setp.gt.s32 %p386, %r13113, 2047; + setp.gt.s32 %p387, %r13111, 2047; + setp.gt.s32 %p388, %r13109, 2047; + setp.gt.s32 %p389, %r13107, 2047; + setp.gt.s32 %p390, %r13105, 2047; + setp.gt.s32 %p391, %r13103, 2047; + .loc 1 494 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:494:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shr.s32 %r10203, %r13102, 31; + shr.u32 %r10204, %r10203, 21; + add.s32 %r10205, %r13102, %r10204; + and.b32 %r10206, %r10205, -2048; + sub.s32 %r10207, %r13102, %r10206; + shr.s32 %r10208, %r13104, 31; + shr.u32 %r10209, %r10208, 21; + add.s32 %r10210, %r13104, %r10209; + and.b32 %r10211, %r10210, -2048; + sub.s32 %r10212, %r13104, %r10211; + shr.s32 %r10213, %r13106, 31; + shr.u32 %r10214, %r10213, 21; + add.s32 %r10215, %r13106, %r10214; + and.b32 %r10216, %r10215, -2048; + sub.s32 %r10217, %r13106, %r10216; + shr.s32 %r10218, %r13108, 31; + shr.u32 %r10219, %r10218, 21; + add.s32 %r10220, %r13108, %r10219; + and.b32 %r10221, %r10220, -2048; + sub.s32 %r10222, %r13108, %r10221; + shr.s32 %r10223, %r13110, 31; + shr.u32 %r10224, %r10223, 21; + add.s32 %r10225, %r13110, %r10224; + and.b32 %r10226, %r10225, -2048; + sub.s32 %r10227, %r13110, %r10226; + shr.s32 %r10228, %r13112, 31; + shr.u32 %r10229, %r10228, 21; + add.s32 %r10230, %r13112, %r10229; + and.b32 %r10231, %r10230, -2048; + sub.s32 %r10232, %r13112, %r10231; + shr.s32 %r10233, %r13114, 31; + shr.u32 %r10234, %r10233, 21; + add.s32 %r10235, %r13114, %r10234; + and.b32 %r10236, %r10235, -2048; + sub.s32 %r10237, %r13114, %r10236; + shr.s32 %r10238, %r13116, 31; + shr.u32 %r10239, %r10238, 21; + add.s32 %r10240, %r13116, %r10239; + and.b32 %r10241, %r10240, -2048; + sub.s32 %r10242, %r13116, %r10241; + .loc 1 496 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:496:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.ne.b32 %p392, %r10242, 0; + setp.ne.b32 %p393, %r10237, 0; + setp.ne.b32 %p394, %r10232, 0; + setp.ne.b32 %p395, %r10227, 0; + setp.ne.b32 %p396, %r10222, 0; + setp.ne.b32 %p397, %r10217, 0; + setp.ne.b32 %p398, %r10212, 0; + setp.ne.b32 %p399, %r10207, 0; + .loc 1 497 92 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:497:92 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.b32 %r10243, %r13103, -2147481601; + and.b32 %r10244, %r13105, -2147481601; + and.b32 %r10245, %r13107, -2147481601; + and.b32 %r10246, %r13109, -2147481601; + and.b32 %r10247, %r13111, -2147481601; + and.b32 %r10248, %r13113, -2147481601; + and.b32 %r10249, %r13115, -2147481601; + and.b32 %r10250, %r13117, -2147481601; + setp.gt.u32 %p400, %r10250, -2147483648; + setp.gt.u32 %p401, %r10249, -2147483648; + setp.gt.u32 %p402, %r10248, -2147483648; + setp.gt.u32 %p403, %r10247, -2147483648; + setp.gt.u32 %p404, %r10246, -2147483648; + setp.gt.u32 %p405, %r10245, -2147483648; + setp.gt.u32 %p406, %r10244, -2147483648; + setp.gt.u32 %p407, %r10243, -2147483648; + .loc 1 503 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:503:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + prmt.b32 %r10251, %r13115, %r13117, 0x5410U; + prmt.b32 %r10252, %r13111, %r13113, 0x5410U; + prmt.b32 %r10253, %r13107, %r13109, 0x5410U; + prmt.b32 %r10254, %r13103, %r13105, 0x5410U; + and.b32 %r10255, %r10254, 134154239; + and.b32 %r10256, %r10253, 134154239; + and.b32 %r10257, %r10252, 134154239; + and.b32 %r10258, %r10251, 134154239; + .loc 1 504 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:504:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mov.b32 {%rs65, %rs66}, %r10258; + cvt.u64.u16 %rd827, %rs66; + cvt.u64.u16 %rd828, %rs65; + mov.b32 {%rs67, %rs68}, %r10257; + cvt.u64.u16 %rd829, %rs68; + cvt.u64.u16 %rd830, %rs67; + mov.b32 {%rs69, %rs70}, %r10256; + cvt.u64.u16 %rd831, %rs70; + cvt.u64.u16 %rd832, %rs69; + mov.b32 {%rs71, %rs72}, %r10255; + cvt.u64.u16 %rd833, %rs72; + cvt.u64.u16 %rd834, %rs71; + setp.gt.s64 %p408, %rd19, %rd834; + setp.gt.s64 %p409, %rd19, %rd833; + setp.gt.s64 %p410, %rd19, %rd832; + setp.gt.s64 %p411, %rd19, %rd831; + setp.gt.s64 %p412, %rd19, %rd830; + setp.gt.s64 %p413, %rd19, %rd829; + setp.gt.s64 %p414, %rd19, %rd828; + setp.gt.s64 %p415, %rd19, %rd827; + .loc 1 501 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:501:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r10259, %r10242, 2048; + add.s32 %r10260, %r10237, 2048; + add.s32 %r10261, %r10232, 2048; + add.s32 %r10262, %r10227, 2048; + add.s32 %r10263, %r10222, 2048; + add.s32 %r10264, %r10217, 2048; + add.s32 %r10265, %r10212, 2048; + add.s32 %r10266, %r10207, 2048; + .loc 1 502 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:502:39 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b32 %r10267, %r10266, %r10207, %p399; + selp.b32 %r10268, %r10267, %r10207, %p407; + selp.b32 %r10269, %r10265, %r10212, %p398; + selp.b32 %r10270, %r10269, %r10212, %p406; + selp.b32 %r10271, %r10264, %r10217, %p397; + selp.b32 %r10272, %r10271, %r10217, %p405; + selp.b32 %r10273, %r10263, %r10222, %p396; + selp.b32 %r10274, %r10273, %r10222, %p404; + selp.b32 %r10275, %r10262, %r10227, %p395; + selp.b32 %r10276, %r10275, %r10227, %p403; + selp.b32 %r10277, %r10261, %r10232, %p394; + selp.b32 %r10278, %r10277, %r10232, %p402; + selp.b32 %r10279, %r10260, %r10237, %p393; + selp.b32 %r10280, %r10279, %r10237, %p401; + selp.b32 %r10281, %r10259, %r10242, %p392; + selp.b32 %r10282, %r10281, %r10242, %p400; + .loc 1 504 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:504:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.s64.s32 %rd835, %r10282; + cvt.s64.s32 %rd836, %r10280; + cvt.s64.s32 %rd837, %r10278; + cvt.s64.s32 %rd838, %r10276; + cvt.s64.s32 %rd839, %r10274; + cvt.s64.s32 %rd840, %r10272; + cvt.s64.s32 %rd841, %r10270; + cvt.s64.s32 %rd842, %r10268; + setp.gt.s64 %p416, %rd19, %rd842; + setp.gt.s64 %p417, %rd19, %rd841; + setp.gt.s64 %p418, %rd19, %rd840; + setp.gt.s64 %p419, %rd19, %rd839; + setp.gt.s64 %p420, %rd19, %rd838; + setp.gt.s64 %p421, %rd19, %rd837; + setp.gt.s64 %p422, %rd19, %rd836; + setp.gt.s64 %p423, %rd19, %rd835; + .loc 1 505 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:505:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.pred %p424, %p384, %p415; + and.pred %p425, %p384, %p423; + and.pred %p426, %p385, %p414; + and.pred %p427, %p385, %p422; + and.pred %p428, %p386, %p413; + and.pred %p429, %p386, %p421; + and.pred %p430, %p387, %p412; + and.pred %p431, %p387, %p420; + and.pred %p432, %p388, %p411; + and.pred %p433, %p388, %p419; + and.pred %p434, %p389, %p410; + and.pred %p435, %p389, %p418; + and.pred %p436, %p390, %p409; + and.pred %p437, %p390, %p417; + and.pred %p438, %p391, %p408; + and.pred %p439, %p391, %p416; + .loc 1 506 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:506:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + sub.s32 %r10283, %r13106, %r21; + sub.s32 %r10284, %r13107, %r21; + sub.s32 %r10285, %r13106, %r19; + sub.s32 %r10286, %r13107, %r19; + sub.s32 %r10287, %r13108, %r21; + sub.s32 %r10288, %r13109, %r21; + sub.s32 %r10289, %r13108, %r19; + sub.s32 %r10290, %r13109, %r19; + sub.s32 %r10291, %r13102, %r21; + sub.s32 %r10292, %r13103, %r21; + sub.s32 %r10293, %r13102, %r19; + sub.s32 %r10294, %r13103, %r19; + sub.s32 %r10295, %r13104, %r21; + sub.s32 %r10296, %r13105, %r21; + sub.s32 %r10297, %r13104, %r19; + sub.s32 %r10298, %r13105, %r19; + sub.s32 %r10299, %r13110, %r19; + sub.s32 %r10300, %r13111, %r19; + sub.s32 %r10301, %r13110, %r21; + sub.s32 %r10302, %r13111, %r21; + sub.s32 %r10303, %r13112, %r19; + sub.s32 %r10304, %r13113, %r19; + sub.s32 %r10305, %r13112, %r21; + sub.s32 %r10306, %r13113, %r21; + sub.s32 %r10307, %r13114, %r19; + sub.s32 %r10308, %r13115, %r19; + sub.s32 %r10309, %r13114, %r21; + sub.s32 %r10310, %r13115, %r21; + sub.s32 %r10311, %r13116, %r19; + sub.s32 %r10312, %r13117, %r19; + sub.s32 %r10313, %r13116, %r21; + sub.s32 %r10314, %r13117, %r21; + .loc 1 513 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:513:39 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.b32 %r10315, %r10314, 2047; + and.b32 %r10316, %r10313, 2047; + and.b32 %r10317, %r10312, 2047; + and.b32 %r10318, %r10311, 2047; + and.b32 %r10319, %r10310, 2047; + and.b32 %r10320, %r10309, 2047; + and.b32 %r10321, %r10308, 2047; + and.b32 %r10322, %r10307, 2047; + and.b32 %r10323, %r10306, 2047; + and.b32 %r10324, %r10305, 2047; + and.b32 %r10325, %r10304, 2047; + and.b32 %r10326, %r10303, 2047; + and.b32 %r10327, %r10302, 2047; + and.b32 %r10328, %r10301, 2047; + and.b32 %r10329, %r10300, 2047; + and.b32 %r10330, %r10299, 2047; + and.b32 %r10331, %r10298, 2047; + and.b32 %r10332, %r10297, 2047; + and.b32 %r10333, %r10296, 2047; + and.b32 %r10334, %r10295, 2047; + and.b32 %r10335, %r10294, 2047; + and.b32 %r10336, %r10293, 2047; + and.b32 %r10337, %r10292, 2047; + and.b32 %r10338, %r10291, 2047; + and.b32 %r10339, %r10290, 2047; + and.b32 %r10340, %r10289, 2047; + and.b32 %r10341, %r10288, 2047; + and.b32 %r10342, %r10287, 2047; + and.b32 %r10343, %r10286, 2047; + and.b32 %r10344, %r10285, 2047; + and.b32 %r10345, %r10284, 2047; + and.b32 %r10346, %r10283, 2047; + .loc 1 514 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:514:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.eq.b32 %p440, %r10346, 0; + selp.b16 %rs73, 1, 0, %p440; + shl.b16 %rs74, %rs73, 2; + setp.eq.b32 %p441, %r10345, 0; + selp.b16 %rs75, -1, 0, %p441; + shl.b16 %rs76, %rs75, 3; + or.b16 %rs77, %rs76, %rs74; + setp.eq.b32 %p442, %r10344, 0; + selp.b16 %rs78, 1, 0, %p442; + setp.eq.b32 %p443, %r10343, 0; + selp.b16 %rs79, -1, 0, %p443; + shl.b16 %rs80, %rs79, 1; + or.b16 %rs81, %rs78, %rs80; + and.b16 %rs82, %rs81, 3; + or.b16 %rs83, %rs82, %rs77; + and.b16 %rs84, %rs83, 15; + shl.b16 %rs85, %rs84, 8; + setp.eq.b32 %p444, %r10342, 0; + selp.b16 %rs86, 1, 0, %p444; + shl.b16 %rs87, %rs86, 2; + setp.eq.b32 %p445, %r10341, 0; + selp.b16 %rs88, -1, 0, %p445; + shl.b16 %rs89, %rs88, 3; + or.b16 %rs90, %rs89, %rs87; + setp.eq.b32 %p446, %r10340, 0; + selp.b16 %rs91, 1, 0, %p446; + setp.eq.b32 %p447, %r10339, 0; + selp.b16 %rs92, -1, 0, %p447; + shl.b16 %rs93, %rs92, 1; + or.b16 %rs94, %rs91, %rs93; + and.b16 %rs95, %rs94, 3; + or.b16 %rs96, %rs95, %rs90; + shl.b16 %rs97, %rs96, 12; + or.b16 %rs98, %rs97, %rs85; + setp.eq.b32 %p448, %r10338, 0; + selp.b16 %rs99, 1, 0, %p448; + shl.b16 %rs100, %rs99, 2; + setp.eq.b32 %p449, %r10337, 0; + selp.b16 %rs101, -1, 0, %p449; + shl.b16 %rs102, %rs101, 3; + or.b16 %rs103, %rs102, %rs100; + setp.eq.b32 %p450, %r10336, 0; + selp.b16 %rs104, 1, 0, %p450; + setp.eq.b32 %p451, %r10335, 0; + selp.b16 %rs105, -1, 0, %p451; + shl.b16 %rs106, %rs105, 1; + or.b16 %rs107, %rs104, %rs106; + and.b16 %rs108, %rs107, 3; + or.b16 %rs109, %rs108, %rs103; + and.b16 %rs110, %rs109, 15; + setp.eq.b32 %p452, %r10334, 0; + selp.b16 %rs111, 1, 0, %p452; + shl.b16 %rs112, %rs111, 2; + setp.eq.b32 %p453, %r10333, 0; + selp.b16 %rs113, -1, 0, %p453; + shl.b16 %rs114, %rs113, 3; + or.b16 %rs115, %rs114, %rs112; + setp.eq.b32 %p454, %r10332, 0; + selp.b16 %rs116, 1, 0, %p454; + setp.eq.b32 %p455, %r10331, 0; + selp.b16 %rs117, -1, 0, %p455; + shl.b16 %rs118, %rs117, 1; + or.b16 %rs119, %rs116, %rs118; + and.b16 %rs120, %rs119, 3; + or.b16 %rs121, %rs120, %rs115; + shl.b16 %rs122, %rs121, 4; + or.b16 %rs123, %rs110, %rs122; + and.b16 %rs124, %rs123, 255; + or.b16 %rs125, %rs124, %rs98; + cvt.u32.u16 %r10347, %rs125; + setp.eq.b32 %p456, %r10330, 0; + setp.eq.b32 %p457, %r10329, 0; + setp.eq.b32 %p458, %r10328, 0; + setp.eq.b32 %p459, %r10327, 0; + setp.eq.b32 %p460, %r10326, 0; + setp.eq.b32 %p461, %r10325, 0; + setp.eq.b32 %p462, %r10324, 0; + setp.eq.b32 %p463, %r10323, 0; + setp.eq.b32 %p464, %r10322, 0; + setp.eq.b32 %p465, %r10321, 0; + setp.eq.b32 %p466, %r10320, 0; + setp.eq.b32 %p467, %r10319, 0; + setp.eq.b32 %p468, %r10318, 0; + setp.eq.b32 %p469, %r10317, 0; + setp.eq.b32 %p470, %r10316, 0; + setp.eq.b32 %p471, %r10315, 0; + .loc 1 515 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:515:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.pred %p472, %p471, %p424; + and.pred %p473, %p470, %p425; + and.pred %p474, %p469, %p424; + and.pred %p475, %p468, %p425; + and.pred %p476, %p467, %p426; + and.pred %p477, %p466, %p427; + and.pred %p478, %p465, %p426; + and.pred %p479, %p464, %p427; + and.pred %p480, %p463, %p428; + and.pred %p481, %p462, %p429; + and.pred %p482, %p461, %p428; + and.pred %p483, %p460, %p429; + and.pred %p484, %p459, %p430; + and.pred %p485, %p458, %p431; + and.pred %p486, %p457, %p430; + and.pred %p487, %p456, %p431; + shr.u32 %r10348, %r10347, 15; + and.b32 %r10349, %r10348, 1; + setp.ne.b32 %p488, %r10349, 0; + and.pred %p489, %p488, %p432; + shr.u32 %r10350, %r10347, 14; + and.b32 %r10351, %r10350, 1; + setp.ne.b32 %p490, %r10351, 0; + and.pred %p491, %p490, %p433; + shr.u32 %r10352, %r10347, 13; + and.b32 %r10353, %r10352, 1; + setp.ne.b32 %p492, %r10353, 0; + and.pred %p493, %p492, %p432; + shr.u32 %r10354, %r10347, 12; + and.b32 %r10355, %r10354, 1; + setp.ne.b32 %p494, %r10355, 0; + and.pred %p495, %p494, %p433; + shr.u32 %r10356, %r10347, 11; + and.b32 %r10357, %r10356, 1; + setp.ne.b32 %p496, %r10357, 0; + and.pred %p497, %p496, %p434; + shr.u32 %r10358, %r10347, 10; + and.b32 %r10359, %r10358, 1; + setp.ne.b32 %p498, %r10359, 0; + and.pred %p499, %p498, %p435; + shr.u32 %r10360, %r10347, 9; + and.b32 %r10361, %r10360, 1; + setp.ne.b32 %p500, %r10361, 0; + and.pred %p501, %p500, %p434; + shr.u32 %r10362, %r10347, 8; + and.b32 %r10363, %r10362, 1; + setp.ne.b32 %p502, %r10363, 0; + and.pred %p503, %p502, %p435; + shr.u32 %r10364, %r10347, 7; + and.b32 %r10365, %r10364, 1; + setp.ne.b32 %p504, %r10365, 0; + and.pred %p505, %p504, %p436; + shr.u32 %r10366, %r10347, 6; + and.b32 %r10367, %r10366, 1; + setp.ne.b32 %p506, %r10367, 0; + and.pred %p507, %p506, %p437; + shr.u32 %r10368, %r10347, 5; + and.b32 %r10369, %r10368, 1; + setp.ne.b32 %p508, %r10369, 0; + and.pred %p509, %p508, %p436; + shr.u32 %r10370, %r10347, 4; + and.b32 %r10371, %r10370, 1; + setp.ne.b32 %p510, %r10371, 0; + and.pred %p511, %p510, %p437; + shr.u32 %r10372, %r10347, 3; + and.b32 %r10373, %r10372, 1; + setp.ne.b32 %p512, %r10373, 0; + and.pred %p513, %p512, %p438; + shr.u32 %r10374, %r10347, 2; + and.b32 %r10375, %r10374, 1; + setp.ne.b32 %p514, %r10375, 0; + and.pred %p515, %p514, %p439; + shr.u32 %r10376, %r10347, 1; + and.b32 %r10377, %r10376, 1; + setp.ne.b32 %p516, %r10377, 0; + and.pred %p517, %p516, %p438; + and.pred %p518, %p450, %p439; + .loc 1 516 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:516:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + or.pred %p519, %p352, %p472; + or.pred %p520, %p353, %p473; + or.pred %p521, %p354, %p474; + or.pred %p522, %p355, %p475; + or.pred %p523, %p356, %p476; + or.pred %p524, %p357, %p477; + or.pred %p525, %p358, %p478; + or.pred %p526, %p359, %p479; + or.pred %p527, %p360, %p480; + or.pred %p528, %p361, %p481; + or.pred %p529, %p362, %p482; + or.pred %p530, %p363, %p483; + or.pred %p531, %p364, %p484; + or.pred %p532, %p365, %p485; + or.pred %p533, %p366, %p486; + or.pred %p534, %p367, %p487; + or.pred %p535, %p368, %p489; + or.pred %p536, %p369, %p491; + or.pred %p537, %p370, %p493; + or.pred %p538, %p371, %p495; + or.pred %p539, %p372, %p497; + or.pred %p540, %p373, %p499; + or.pred %p541, %p374, %p501; + or.pred %p542, %p375, %p503; + or.pred %p543, %p376, %p505; + or.pred %p544, %p377, %p507; + or.pred %p545, %p378, %p509; + or.pred %p546, %p379, %p511; + or.pred %p547, %p380, %p513; + or.pred %p548, %p381, %p515; + or.pred %p549, %p382, %p517; + or.pred %p550, %p383, %p518; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10378, %r10171, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10379, %r10378, 0fFF800000, %p519; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10380, %r10172, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10381, %r10380, 0fFF800000, %p520; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10382, %r10173, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10383, %r10382, 0fFF800000, %p521; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10384, %r10174, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10385, %r10384, 0fFF800000, %p522; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10386, %r10175, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10387, %r10386, 0fFF800000, %p523; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10388, %r10176, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10389, %r10388, 0fFF800000, %p524; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10390, %r10177, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10391, %r10390, 0fFF800000, %p525; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10392, %r10178, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10393, %r10392, 0fFF800000, %p526; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10394, %r10179, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10395, %r10394, 0fFF800000, %p527; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10396, %r10180, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10397, %r10396, 0fFF800000, %p528; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10398, %r10181, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10399, %r10398, 0fFF800000, %p529; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10400, %r10182, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10401, %r10400, 0fFF800000, %p530; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10402, %r10183, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10403, %r10402, 0fFF800000, %p531; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10404, %r10184, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10405, %r10404, 0fFF800000, %p532; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10406, %r10185, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10407, %r10406, 0fFF800000, %p533; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10408, %r10186, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10409, %r10408, 0fFF800000, %p534; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10410, %r10187, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10411, %r10410, 0fFF800000, %p535; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10412, %r10188, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10413, %r10412, 0fFF800000, %p536; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10414, %r10189, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10415, %r10414, 0fFF800000, %p537; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10416, %r10190, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10417, %r10416, 0fFF800000, %p538; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10418, %r10191, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10419, %r10418, 0fFF800000, %p539; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10420, %r10192, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10421, %r10420, 0fFF800000, %p540; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10422, %r10193, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10423, %r10422, 0fFF800000, %p541; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10424, %r10194, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10425, %r10424, 0fFF800000, %p542; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10426, %r10195, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10427, %r10426, 0fFF800000, %p543; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10428, %r10196, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10429, %r10428, 0fFF800000, %p544; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10430, %r10197, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10431, %r10430, 0fFF800000, %p545; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10432, %r10198, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10433, %r10432, 0fFF800000, %p546; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10434, %r10199, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10435, %r10434, 0fFF800000, %p547; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10436, %r10200, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10437, %r10436, 0fFF800000, %p548; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10438, %r10201, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10439, %r10438, 0fFF800000, %p549; + .loc 1 524 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:524:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10440, %r10202, 0f3FB8AA3B; + .loc 1 521 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:521:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.f32 %r10441, %r10440, 0fFF800000, %p550; + .loc 1 525 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:525:39 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + sub.f32 %r10442, %r10379, %r53; + sub.f32 %r10443, %r10381, %r53; + sub.f32 %r10444, %r10383, %r54; + sub.f32 %r10445, %r10385, %r54; + sub.f32 %r10446, %r10387, %r53; + sub.f32 %r10447, %r10389, %r53; + sub.f32 %r10448, %r10391, %r54; + sub.f32 %r10449, %r10393, %r54; + sub.f32 %r10450, %r10395, %r53; + sub.f32 %r10451, %r10397, %r53; + sub.f32 %r10452, %r10399, %r54; + sub.f32 %r10453, %r10401, %r54; + sub.f32 %r10454, %r10403, %r53; + sub.f32 %r10455, %r10405, %r53; + sub.f32 %r10456, %r10407, %r54; + sub.f32 %r10457, %r10409, %r54; + sub.f32 %r10458, %r10411, %r53; + sub.f32 %r10459, %r10413, %r53; + sub.f32 %r10460, %r10415, %r54; + sub.f32 %r10461, %r10417, %r54; + sub.f32 %r10462, %r10419, %r53; + sub.f32 %r10463, %r10421, %r53; + sub.f32 %r10464, %r10423, %r54; + sub.f32 %r10465, %r10425, %r54; + sub.f32 %r10466, %r10427, %r53; + sub.f32 %r10467, %r10429, %r53; + sub.f32 %r10468, %r10431, %r54; + sub.f32 %r10469, %r10433, %r54; + sub.f32 %r10470, %r10435, %r53; + sub.f32 %r10471, %r10437, %r53; + sub.f32 %r10472, %r10439, %r54; + sub.f32 %r10473, %r10441, %r54; + .loc 1 525 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:525:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + ex2.approx.ftz.f32 %r10474, %r10442; + ex2.approx.ftz.f32 %r10475, %r10443; + ex2.approx.ftz.f32 %r10476, %r10444; + ex2.approx.ftz.f32 %r10477, %r10445; + ex2.approx.ftz.f32 %r10478, %r10446; + ex2.approx.ftz.f32 %r10479, %r10447; + ex2.approx.ftz.f32 %r10480, %r10448; + ex2.approx.ftz.f32 %r10481, %r10449; + ex2.approx.ftz.f32 %r10482, %r10450; + ex2.approx.ftz.f32 %r10483, %r10451; + ex2.approx.ftz.f32 %r10484, %r10452; + ex2.approx.ftz.f32 %r10485, %r10453; + ex2.approx.ftz.f32 %r10486, %r10454; + ex2.approx.ftz.f32 %r10487, %r10455; + ex2.approx.ftz.f32 %r10488, %r10456; + ex2.approx.ftz.f32 %r10489, %r10457; + ex2.approx.ftz.f32 %r10490, %r10458; + ex2.approx.ftz.f32 %r10491, %r10459; + ex2.approx.ftz.f32 %r10492, %r10460; + ex2.approx.ftz.f32 %r10493, %r10461; + ex2.approx.ftz.f32 %r10494, %r10462; + ex2.approx.ftz.f32 %r10495, %r10463; + ex2.approx.ftz.f32 %r10496, %r10464; + ex2.approx.ftz.f32 %r10497, %r10465; + ex2.approx.ftz.f32 %r10498, %r10466; + ex2.approx.ftz.f32 %r10499, %r10467; + ex2.approx.ftz.f32 %r10500, %r10468; + ex2.approx.ftz.f32 %r10501, %r10469; + ex2.approx.ftz.f32 %r10502, %r10470; + ex2.approx.ftz.f32 %r10503, %r10471; + ex2.approx.ftz.f32 %r10504, %r10472; + ex2.approx.ftz.f32 %r10505, %r10473; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r10506, %r8442, 49152; + add.s32 %r9538, %r10506, %r10128; + .loc 1 530 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:530:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + wgmma.fence.sync.aligned; + add.s32 %r9535, %r8442, 131072; + add.s32 %r10507, %r10132, %r9535; + bfe.u32 %r10508, %r10507, 4, 14; + cvt.u64.u32 %rd843, %r10508; + or.b64 %rd777, %rd843, 4611686293372403712; + bfe.u32 %r10509, %r9538, 4, 14; + cvt.u64.u32 %rd844, %r10509; + or.b64 %rd778, %rd844, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd777, %rd778, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r10510, %r10136, %r9535; + bfe.u32 %r10511, %r10510, 4, 14; + cvt.u64.u32 %rd845, %r10511; + or.b64 %rd779, %rd845, 4611686293372403712; + add.s32 %r10512, %r9538, 32; + bfe.u32 %r10513, %r10512, 4, 14; + cvt.u64.u32 %rd846, %r10513; + or.b64 %rd780, %rd846, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd779, %rd780, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10514, %r10141, %r9535; + bfe.u32 %r10515, %r10514, 4, 14; + cvt.u64.u32 %rd847, %r10515; + or.b64 %rd781, %rd847, 4611686293372403712; + add.s32 %r10516, %r9538, 64; + bfe.u32 %r10517, %r10516, 4, 14; + cvt.u64.u32 %rd848, %r10517; + or.b64 %rd782, %rd848, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd781, %rd782, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10518, %r10146, %r9535; + bfe.u32 %r10519, %r10518, 4, 14; + cvt.u64.u32 %rd849, %r10519; + or.b64 %rd783, %rd849, 4611686293372403712; + add.s32 %r10520, %r9538, 96; + bfe.u32 %r10521, %r10520, 4, 14; + cvt.u64.u32 %rd850, %r10521; + or.b64 %rd784, %rd850, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd783, %rd784, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10522, %r10151, %r9535; + bfe.u32 %r10523, %r10522, 4, 14; + cvt.u64.u32 %rd851, %r10523; + or.b64 %rd785, %rd851, 4611686293372403712; + add.s32 %r10524, %r9538, 8192; + bfe.u32 %r10525, %r10524, 4, 14; + cvt.u64.u32 %rd852, %r10525; + or.b64 %rd786, %rd852, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd785, %rd786, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10526, %r10156, %r9535; + bfe.u32 %r10527, %r10526, 4, 14; + cvt.u64.u32 %rd853, %r10527; + or.b64 %rd787, %rd853, 4611686293372403712; + add.s32 %r10528, %r9538, 8224; + bfe.u32 %r10529, %r10528, 4, 14; + cvt.u64.u32 %rd854, %r10529; + or.b64 %rd788, %rd854, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd787, %rd788, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10530, %r10161, %r9535; + bfe.u32 %r10531, %r10530, 4, 14; + cvt.u64.u32 %rd855, %r10531; + or.b64 %rd789, %rd855, 4611686293372403712; + add.s32 %r10532, %r9538, 8256; + bfe.u32 %r10533, %r10532, 4, 14; + cvt.u64.u32 %rd856, %r10533; + or.b64 %rd790, %rd856, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd789, %rd790, %p296, 1, 1, 0, 0; + // end inline asm + add.s32 %r10534, %r10166, %r9535; + bfe.u32 %r10535, %r10534, 4, 14; + cvt.u64.u32 %rd857, %r10535; + or.b64 %rd791, %rd857, 4611686293372403712; + add.s32 %r10536, %r9538, 8288; + bfe.u32 %r10537, %r10536, 4, 14; + cvt.u64.u32 %rd858, %r10537; + or.b64 %rd792, %rd858, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150}, %rd791, %rd792, %p296, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r9536, %r9018; + mov.b32 %r9537, %r9018; + mov.b32 %r9539, %r9018; + mov.b32 %r9540, %r9018; + // begin inline asm + // wait for regs: %r9119,%r9120,%r9121,%r9122,%r9123,%r9124,%r9125,%r9126,%r9127,%r9128,%r9129,%r9130,%r9131,%r9132,%r9133,%r9134,%r9135,%r9136,%r9137,%r9138,%r9139,%r9140,%r9141,%r9142,%r9143,%r9144,%r9145,%r9146,%r9147,%r9148,%r9149,%r9150,%r9535,%r9536,%r9537,%r9538,%r9539,%r9540 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + sub.f32 %r10538, %r9119, %r8358; + sub.f32 %r10539, %r9120, %r8358; + sub.f32 %r10540, %r9121, %r8359; + sub.f32 %r10541, %r9122, %r8359; + sub.f32 %r10542, %r9123, %r8358; + sub.f32 %r10543, %r9124, %r8358; + sub.f32 %r10544, %r9125, %r8359; + sub.f32 %r10545, %r9126, %r8359; + sub.f32 %r10546, %r9127, %r8358; + sub.f32 %r10547, %r9128, %r8358; + sub.f32 %r10548, %r9129, %r8359; + sub.f32 %r10549, %r9130, %r8359; + sub.f32 %r10550, %r9131, %r8358; + sub.f32 %r10551, %r9132, %r8358; + sub.f32 %r10552, %r9133, %r8359; + sub.f32 %r10553, %r9134, %r8359; + sub.f32 %r10554, %r9135, %r8358; + sub.f32 %r10555, %r9136, %r8358; + sub.f32 %r10556, %r9137, %r8359; + sub.f32 %r10557, %r9138, %r8359; + sub.f32 %r10558, %r9139, %r8358; + sub.f32 %r10559, %r9140, %r8358; + sub.f32 %r10560, %r9141, %r8359; + sub.f32 %r10561, %r9142, %r8359; + sub.f32 %r10562, %r9143, %r8358; + sub.f32 %r10563, %r9144, %r8358; + sub.f32 %r10564, %r9145, %r8359; + sub.f32 %r10565, %r9146, %r8359; + sub.f32 %r10566, %r9147, %r8358; + sub.f32 %r10567, %r9148, %r8358; + sub.f32 %r10568, %r9149, %r8359; + sub.f32 %r10569, %r9150, %r8359; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.f32 %r10570, %r10474, %r10538; + mul.f32 %r10571, %r10475, %r10539; + mul.f32 %r10572, %r10476, %r10540; + mul.f32 %r10573, %r10477, %r10541; + mul.f32 %r10574, %r10478, %r10542; + mul.f32 %r10575, %r10479, %r10543; + mul.f32 %r10576, %r10480, %r10544; + mul.f32 %r10577, %r10481, %r10545; + mul.f32 %r10578, %r10482, %r10546; + mul.f32 %r10579, %r10483, %r10547; + mul.f32 %r10580, %r10484, %r10548; + mul.f32 %r10581, %r10485, %r10549; + mul.f32 %r10582, %r10486, %r10550; + mul.f32 %r10583, %r10487, %r10551; + mul.f32 %r10584, %r10488, %r10552; + mul.f32 %r10585, %r10489, %r10553; + mul.f32 %r10586, %r10490, %r10554; + mul.f32 %r10587, %r10491, %r10555; + mul.f32 %r10588, %r10492, %r10556; + mul.f32 %r10589, %r10493, %r10557; + mul.f32 %r10590, %r10494, %r10558; + mul.f32 %r10591, %r10495, %r10559; + mul.f32 %r10592, %r10496, %r10560; + mul.f32 %r10593, %r10497, %r10561; + mul.f32 %r10594, %r10498, %r10562; + mul.f32 %r10595, %r10499, %r10563; + mul.f32 %r10596, %r10500, %r10564; + mul.f32 %r10597, %r10501, %r10565; + mul.f32 %r10598, %r10502, %r10566; + mul.f32 %r10599, %r10503, %r10567; + mul.f32 %r10600, %r10504, %r10568; + mul.f32 %r10601, %r10505, %r10569; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs126, %r10570; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs127, %rs126, 0x0000, %p519; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs128, %r10571; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs129, %rs128, 0x0000, %p520; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs130, %r10572; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs131, %rs130, 0x0000, %p521; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs132, %r10573; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs133, %rs132, 0x0000, %p522; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs134, %r10574; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs135, %rs134, 0x0000, %p523; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs136, %r10575; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs137, %rs136, 0x0000, %p524; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs138, %r10576; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs139, %rs138, 0x0000, %p525; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs140, %r10577; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs141, %rs140, 0x0000, %p526; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs142, %r10578; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs143, %rs142, 0x0000, %p527; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs144, %r10579; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs145, %rs144, 0x0000, %p528; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs146, %r10580; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs147, %rs146, 0x0000, %p529; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs148, %r10581; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs149, %rs148, 0x0000, %p530; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs150, %r10582; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs151, %rs150, 0x0000, %p531; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs152, %r10583; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs153, %rs152, 0x0000, %p532; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs154, %r10584; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs155, %rs154, 0x0000, %p533; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs156, %r10585; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs157, %rs156, 0x0000, %p534; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs158, %r10586; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs159, %rs158, 0x0000, %p535; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs160, %r10587; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs161, %rs160, 0x0000, %p536; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs162, %r10588; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs163, %rs162, 0x0000, %p537; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs164, %r10589; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs165, %rs164, 0x0000, %p538; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs166, %r10590; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs167, %rs166, 0x0000, %p539; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs168, %r10591; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs169, %rs168, 0x0000, %p540; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs170, %r10592; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs171, %rs170, 0x0000, %p541; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs172, %r10593; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs173, %rs172, 0x0000, %p542; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs174, %r10594; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs175, %rs174, 0x0000, %p543; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs176, %r10595; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs177, %rs176, 0x0000, %p544; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs178, %r10596; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs179, %rs178, 0x0000, %p545; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs180, %r10597; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs181, %rs180, 0x0000, %p546; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs182, %r10598; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs183, %rs182, 0x0000, %p547; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs184, %r10599; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs185, %rs184, 0x0000, %p548; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs186, %r10600; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs187, %rs186, 0x0000, %p549; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + cvt.rn.bf16.f32 %rs188, %r10601; + .loc 1 549 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:549:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + selp.b16 %rs189, %rs188, 0x0000, %p550; + .loc 1 553 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:553:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mov.b32 %r9707, {%rs127, %rs129}; + mov.b32 %r9708, {%rs131, %rs133}; + mov.b32 %r9709, {%rs135, %rs137}; + mov.b32 %r9710, {%rs139, %rs141}; + mov.b32 %r9839, {%rs143, %rs145}; + mov.b32 %r9840, {%rs147, %rs149}; + mov.b32 %r9841, {%rs151, %rs153}; + mov.b32 %r9842, {%rs155, %rs157}; + mov.b32 %r9971, {%rs159, %rs161}; + mov.b32 %r9972, {%rs163, %rs165}; + mov.b32 %r9973, {%rs167, %rs169}; + mov.b32 %r9974, {%rs171, %rs173}; + mov.b32 %r10103, {%rs175, %rs177}; + mov.b32 %r10104, {%rs179, %rs181}; + mov.b32 %r10105, {%rs183, %rs185}; + mov.b32 %r10106, {%rs187, %rs189}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9707,%r9708,%r9709,%r9710}, %rd762, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10602, %r9020, 2048; + bfe.u32 %r10603, %r10602, 4, 14; + cvt.u64.u32 %rd859, %r10603; + or.b64 %rd794, %rd859, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9839,%r9840,%r9841,%r9842}, %rd794, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10604, %r9020, 4096; + bfe.u32 %r10605, %r10604, 4, 14; + cvt.u64.u32 %rd860, %r10605; + or.b64 %rd795, %rd860, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r9971,%r9972,%r9973,%r9974}, %rd795, %p296, 1, 1, 1; + // end inline asm + add.s32 %r10606, %r9020, 6144; + bfe.u32 %r10607, %r10606, 4, 14; + cvt.u64.u32 %rd861, %r10607; + or.b64 %rd796, %rd861, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r10103,%r10104,%r10105,%r10106}, %rd796, %p296, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:417:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r13102, %r13034, %r13102; + add.s32 %r13103, %r13034, %r13103; + add.s32 %r13104, %r13034, %r13104; + add.s32 %r13105, %r13034, %r13105; + add.s32 %r13106, %r13034, %r13106; + add.s32 %r13107, %r13034, %r13107; + add.s32 %r13108, %r13034, %r13108; + add.s32 %r13109, %r13034, %r13109; + add.s32 %r13110, %r13034, %r13110; + add.s32 %r13111, %r13034, %r13111; + add.s32 %r13112, %r13034, %r13112; + add.s32 %r13113, %r13034, %r13113; + add.s32 %r13114, %r13034, %r13114; + add.s32 %r13115, %r13034, %r13115; + add.s32 %r13116, %r13034, %r13116; + add.s32 %r13117, %r13034, %r13117; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r260, %r13101, 1; + .loc 1 788 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:788:33 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shr.u32 %r10608, %r260, 1; + .loc 1 789 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mad.wide.u32 %rd798, %r10608, 4, %rd704; + .loc 1 789 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + // begin inline asm + mov.u64 %rd797, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd797, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10107, 0x0; + @%p314 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10107 }, [ %rd798 + 0 ], %rd797; + // end inline asm + .loc 1 790 109 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:109 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r10609, %r10608, 1; + .loc 1 790 113 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:113 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.lt.s32 %p551, %r10609, %r8363; + .loc 1 790 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:55 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd801, %rd798, 4; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.pred %p315, %p314, %p551; + .loc 1 790 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + // begin inline asm + mov.u64 %rd800, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd800, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r10108, 0x0; + @%p315 ld.global.L1::evict_last.L2::cache_hint.b32 { %r10108 }, [ %rd801 + 0 ], %rd800; + // end inline asm + .loc 1 791 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:791:35 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + and.b32 %r10610, %r13101, 1; + .loc 1 792 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:34 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + sub.s32 %r10611, %r10108, %r10107; + .loc 1 792 48 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:48 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r10612, %r10611, 7; + .loc 1 792 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r10613, %r10612, -64; + .loc 1 793 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + xor.b32 %r10614, %r10610, 1; + .loc 1 793 61 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:61 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r10615, %r10610, 6; + .loc 1 793 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:42 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mad.lo.s32 %r13034, %r10613, %r10614, %r10615; + .loc 1 414 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r10616, %r13034, 7; + .loc 1 414 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + mul.wide.s32 %rd862, %r10616, 2; + add.s64 %rd1014, %rd1014, %rd862; + add.s64 %rd1013, %rd1013, %rd862; + add.s64 %rd1012, %rd1012, %rd862; + add.s64 %rd1011, %rd1011, %rd862; + .loc 1 415 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:415:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s64 %rd1010, %rd1010, %rd862; + add.s64 %rd1009, %rd1009, %rd862; + add.s64 %rd1008, %rd1008, %rd862; + add.s64 %rd1007, %rd1007, %rd862; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + add.s32 %r10617, %r13036, 1; + setp.gt.s32 %p552, %r10617, 2; + selp.b32 %r13036, 0, %r10617, %p552; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + shl.b32 %r10618, %r13036, 14; + add.s32 %r10619, %r8442, %r10618; + bar.sync 0; + add.s32 %r10109, %r10619, %r59; + selp.b32 %r10110, 16, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r10109 + 0 ], [ %rd1014 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10111, %r10109, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10111 + 0 ], [ %rd1013 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10113, %r10109, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10113 + 0 ], [ %rd1012 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10115, %r10109, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10115 + 0 ], [ %rd1011 + 0 ], 0x10, %r10110; + // end inline asm + cp.async.commit_group; + add.s32 %r10620, %r10506, %r10618; + add.s32 %r10117, %r10620, %r59; + // begin inline asm + cp.async.cg.shared.global [ %r10117 + 0 ], [ %rd1010 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10119, %r10117, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r10119 + 0 ], [ %rd1009 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10121, %r10117, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r10121 + 0 ], [ %rd1008 + 0 ], 0x10, %r10110; + // end inline asm + add.s32 %r10123, %r10117, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r10123 + 0 ], [ %rd1007 + 0 ], 0x10, %r10110; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + setp.ne.b32 %p553, %r76, %r260; + mov.b32 %r13101, %r260; + @%p553 bra $L__BB0_3; +$L__tmp9: +$L__BB0_4: // %._crit_edge1847 + .loc 1 0 0 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0 + add.s64 %rd4, %rd184, %rd724; +$L__tmp10: + cvt.u64.u32 %rd5, %r8423; + cvt.u64.u32 %rd6, %r8424; + cvt.u64.u32 %rd7, %r8425; + cvt.u64.u32 %rd8, %r8426; + cvt.u64.u32 %rd9, %r8427; + cvt.u64.u32 %rd10, %r8428; + cvt.u64.u32 %rd11, %r8429; + cvt.u64.u32 %rd12, %r8430; +$L__tmp11: + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:207:12 ] + // begin inline asm + // wait for regs: %r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp12: + .loc 1 214 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:214:39 + shl.b64 %rd881, %rd16, 2; + add.s64 %rd863, %rd190, %rd881; + .loc 1 215 31 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:215:31 + // begin inline asm + mov.u32 %r10749, 0x0; + ld.global.b32 { %r10749 }, [ %rd863 + 0 ]; + // end inline asm + .loc 1 215 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:215:45 + shl.b32 %r10783, %r10749, 7; + .loc 1 216 62 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:216:62 + shl.b64 %rd882, %rd18, 2; + add.s64 %rd864, %rd189, %rd882; + .loc 1 216 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:216:43 + // begin inline asm + mov.u32 %r10750, 0x0; + ld.global.b32 { %r10750 }, [ %rd864 + 0 ]; + // end inline asm + .loc 1 218 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:218:33 + or.b32 %r10784, %r10783, %r9; + or.b32 %r10785, %r10783, %r10; + or.b32 %r10786, %r10783, %r11; + or.b32 %r10787, %r10783, %r12; +$L__tmp13: + .loc 1 390 37 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:37 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r10788, %r10784, 7; + shl.b32 %r10789, %r10785, 7; + shl.b32 %r10790, %r10786, 7; + shl.b32 %r10791, %r10787, 7; + .loc 1 390 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.wide.s32 %rd883, %r10788, 2; + add.s64 %rd884, %rd1, %rd883; + mul.wide.s32 %rd885, %r10789, 2; + add.s64 %rd886, %rd1, %rd885; + mul.wide.s32 %rd887, %r10790, 2; + add.s64 %rd888, %rd1, %rd887; + mul.wide.s32 %rd889, %r10791, 2; + add.s64 %rd890, %rd1, %rd889; + .loc 1 390 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:390:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b64 %rd891, %rd13, 1; + add.s64 %rd865, %rd884, %rd891; + add.s64 %rd866, %rd886, %rd891; + add.s64 %rd867, %rd888, %rd891; + add.s64 %rd868, %rd890, %rd891; + .loc 1 391 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:391:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd892, %rd2, %rd883; + add.s64 %rd893, %rd2, %rd885; + add.s64 %rd894, %rd2, %rd887; + add.s64 %rd895, %rd2, %rd889; + .loc 1 391 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:391:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd869, %rd892, %rd891; + add.s64 %rd870, %rd893, %rd891; + add.s64 %rd871, %rd894, %rd891; + add.s64 %rd872, %rd895, %rd891; + .loc 1 395 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:395:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r328, %r10750, 1; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + setp.lt.s32 %p554, %r328, 1; + setp.gt.s32 %p555, %r328, 0; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + selp.b32 %r10752, 16, 0, %p555; + // begin inline asm + cp.async.cg.shared.global [ %r10751 + 0 ], [ %rd865 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10753 + 0 ], [ %rd866 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10755 + 0 ], [ %rd867 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10757 + 0 ], [ %rd868 + 0 ], 0x10, %r10752; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r8372 + 0 ], [ %rd869 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8374 + 0 ], [ %rd870 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8376 + 0 ], [ %rd871 + 0 ], 0x10, %r10752; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8378 + 0 ], [ %rd872 + 0 ], 0x10, %r10752; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + setp.gt.s32 %p556, %r328, 1; + .loc 1 414 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd1022, %rd865, 16384; + add.s64 %rd1021, %rd866, 16384; + add.s64 %rd1020, %rd867, 16384; + add.s64 %rd1019, %rd868, 16384; + .loc 1 415 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:415:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd1018, %rd869, 16384; + add.s64 %rd1017, %rd870, 16384; + add.s64 %rd1016, %rd871, 16384; + add.s64 %rd1015, %rd872, 16384; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + bar.sync 0; + selp.b32 %r10768, 16, 0, %p556; + // begin inline asm + cp.async.cg.shared.global [ %r8380 + 0 ], [ %rd1022 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8382 + 0 ], [ %rd1021 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8384 + 0 ], [ %rd1020 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8386 + 0 ], [ %rd1019 + 0 ], 0x10, %r10768; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r8388 + 0 ], [ %rd1018 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8390 + 0 ], [ %rd1017 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8392 + 0 ], [ %rd1016 + 0 ], 0x10, %r10768; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r8394 + 0 ], [ %rd1015 + 0 ], 0x10, %r10768; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:459:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + @%p554 bra $L__BB0_7; +// %bb.5: // %.lr.ph1858 + .loc 1 395 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:395:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + min.u32 %r393, %r328, 32; + add.s32 %r394, %r393, -2; + add.s32 %r395, %r393, -1; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mov.b64 %rd61, {%r8359, %r8359}; + mov.b64 %rd62, {%r8358, %r8358}; + mov.b32 %r11346, 0; + mov.b32 %r13183, 1; + mov.b32 %r13182, -1; + mov.b32 %r13248, %r11346; +$L__BB0_6: // %__nv_exp2f.exit1318 + // =>This Inner Loop Header: Depth=1 + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + setp.lt.s32 %p577, %r13248, %r394; + setp.lt.s32 %p575, %r13248, %r395; + add.s32 %r12453, %r13182, 1; + setp.gt.s32 %p578, %r12453, 2; + selp.b32 %r13182, 0, %r12453, %p578; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r12454, %r13182, 14; + add.s32 %r11348, %r8442, %r12454; + .loc 1 459 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:459:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shfl.sync.idx.b32 %r12456, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r12457, %r12456, 11; + and.b32 %r12458, %r12457, 8192; + add.s32 %r11307, %r8442, 98304; + add.s32 %r12459, %r12458, %r11307; + bfe.u32 %r12460, %r12459, 4, 14; + cvt.u64.u32 %rd946, %r12460; + or.b64 %rd896, %rd946, 4611686293372403712; + bfe.u32 %r12461, %r11348, 4, 14; + cvt.u64.u32 %rd947, %r12461; + or.b64 %rd897, %rd947, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd896, %rd897, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r12462, %r12458, 32; + add.s32 %r12463, %r12462, %r11307; + bfe.u32 %r12464, %r12463, 4, 14; + cvt.u64.u32 %rd948, %r12464; + or.b64 %rd898, %rd948, 4611686293372403712; + add.s32 %r12465, %r11348, 32; + bfe.u32 %r12466, %r12465, 4, 14; + cvt.u64.u32 %rd949, %r12466; + or.b64 %rd899, %rd949, 4611686293338849280; + mov.pred %p557, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd898, %rd899, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12467, %r12458, 64; + add.s32 %r12468, %r12467, %r11307; + bfe.u32 %r12469, %r12468, 4, 14; + cvt.u64.u32 %rd950, %r12469; + or.b64 %rd900, %rd950, 4611686293372403712; + add.s32 %r12470, %r11348, 64; + bfe.u32 %r12471, %r12470, 4, 14; + cvt.u64.u32 %rd951, %r12471; + or.b64 %rd901, %rd951, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd900, %rd901, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12472, %r12458, 96; + add.s32 %r12473, %r12472, %r11307; + bfe.u32 %r12474, %r12473, 4, 14; + cvt.u64.u32 %rd952, %r12474; + or.b64 %rd902, %rd952, 4611686293372403712; + add.s32 %r12475, %r11348, 96; + bfe.u32 %r12476, %r12475, 4, 14; + cvt.u64.u32 %rd953, %r12476; + or.b64 %rd903, %rd953, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd902, %rd903, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12477, %r12458, 16384; + add.s32 %r12478, %r12477, %r11307; + bfe.u32 %r12479, %r12478, 4, 14; + cvt.u64.u32 %rd954, %r12479; + or.b64 %rd904, %rd954, 4611686293372403712; + add.s32 %r12480, %r11348, 8192; + bfe.u32 %r12481, %r12480, 4, 14; + cvt.u64.u32 %rd955, %r12481; + or.b64 %rd905, %rd955, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd904, %rd905, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12482, %r12458, 16416; + add.s32 %r12483, %r12482, %r11307; + bfe.u32 %r12484, %r12483, 4, 14; + cvt.u64.u32 %rd956, %r12484; + or.b64 %rd906, %rd956, 4611686293372403712; + add.s32 %r12485, %r11348, 8224; + bfe.u32 %r12486, %r12485, 4, 14; + cvt.u64.u32 %rd957, %r12486; + or.b64 %rd907, %rd957, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd906, %rd907, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12487, %r12458, 16448; + add.s32 %r12488, %r12487, %r11307; + bfe.u32 %r12489, %r12488, 4, 14; + cvt.u64.u32 %rd958, %r12489; + or.b64 %rd908, %rd958, 4611686293372403712; + add.s32 %r12490, %r11348, 8256; + bfe.u32 %r12491, %r12490, 4, 14; + cvt.u64.u32 %rd959, %r12491; + or.b64 %rd909, %rd959, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd908, %rd909, %p557, 1, 1, 0, 0; + // end inline asm + or.b32 %r12492, %r12458, 16480; + add.s32 %r12493, %r12492, %r11307; + bfe.u32 %r12494, %r12493, 4, 14; + cvt.u64.u32 %rd960, %r12494; + or.b64 %rd910, %rd960, 4611686293372403712; + add.s32 %r12495, %r11348, 8288; + bfe.u32 %r12496, %r12495, 4, 14; + cvt.u64.u32 %rd961, %r12496; + or.b64 %rd911, %rd961, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922}, %rd910, %rd911, %p557, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11308, %r11346; + mov.b32 %r11309, %r11346; + mov.b32 %r11311, %r11346; + mov.b32 %r11312, %r11346; + mov.b32 %r11310, %r11348; + // begin inline asm + // wait for regs: %r10891,%r10892,%r10893,%r10894,%r10895,%r10896,%r10897,%r10898,%r10899,%r10900,%r10901,%r10902,%r10903,%r10904,%r10905,%r10906,%r10907,%r10908,%r10909,%r10910,%r10911,%r10912,%r10913,%r10914,%r10915,%r10916,%r10917,%r10918,%r10919,%r10920,%r10921,%r10922,%r11307,%r11308,%r11309,%r11310,%r11311,%r11312 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:461:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12497, %r10891, 0f3DB504F3; + mul.f32 %r12498, %r10892, 0f3DB504F3; + mul.f32 %r12499, %r10893, 0f3DB504F3; + mul.f32 %r12500, %r10894, 0f3DB504F3; + mul.f32 %r12501, %r10895, 0f3DB504F3; + mul.f32 %r12502, %r10896, 0f3DB504F3; + mul.f32 %r12503, %r10897, 0f3DB504F3; + mul.f32 %r12504, %r10898, 0f3DB504F3; + mul.f32 %r12505, %r10899, 0f3DB504F3; + mul.f32 %r12506, %r10900, 0f3DB504F3; + mul.f32 %r12507, %r10901, 0f3DB504F3; + mul.f32 %r12508, %r10902, 0f3DB504F3; + mul.f32 %r12509, %r10903, 0f3DB504F3; + mul.f32 %r12510, %r10904, 0f3DB504F3; + mul.f32 %r12511, %r10905, 0f3DB504F3; + mul.f32 %r12512, %r10906, 0f3DB504F3; + mul.f32 %r12513, %r10907, 0f3DB504F3; + mul.f32 %r12514, %r10908, 0f3DB504F3; + mul.f32 %r12515, %r10909, 0f3DB504F3; + mul.f32 %r12516, %r10910, 0f3DB504F3; + mul.f32 %r12517, %r10911, 0f3DB504F3; + mul.f32 %r12518, %r10912, 0f3DB504F3; + mul.f32 %r12519, %r10913, 0f3DB504F3; + mul.f32 %r12520, %r10914, 0f3DB504F3; + mul.f32 %r12521, %r10915, 0f3DB504F3; + mul.f32 %r12522, %r10916, 0f3DB504F3; + mul.f32 %r12523, %r10917, 0f3DB504F3; + mul.f32 %r12524, %r10918, 0f3DB504F3; + mul.f32 %r12525, %r10919, 0f3DB504F3; + mul.f32 %r12526, %r10920, 0f3DB504F3; + mul.f32 %r12527, %r10921, 0f3DB504F3; + mul.f32 %r12528, %r10922, 0f3DB504F3; + .loc 1 525 39 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:525:39 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + neg.f32 %r12529, %r53; + fma.rn.f32 %r12530, %r12497, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12531, %r12498, 0f3FB8AA3B, %r12529; + neg.f32 %r12532, %r54; + fma.rn.f32 %r12533, %r12499, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12534, %r12500, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12535, %r12501, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12536, %r12502, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12537, %r12503, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12538, %r12504, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12539, %r12505, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12540, %r12506, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12541, %r12507, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12542, %r12508, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12543, %r12509, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12544, %r12510, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12545, %r12511, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12546, %r12512, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12547, %r12513, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12548, %r12514, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12549, %r12515, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12550, %r12516, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12551, %r12517, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12552, %r12518, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12553, %r12519, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12554, %r12520, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12555, %r12521, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12556, %r12522, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12557, %r12523, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12558, %r12524, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12559, %r12525, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12560, %r12526, 0f3FB8AA3B, %r12529; + fma.rn.f32 %r12561, %r12527, 0f3FB8AA3B, %r12532; + fma.rn.f32 %r12562, %r12528, 0f3FB8AA3B, %r12532; + .loc 1 525 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:525:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + ex2.approx.ftz.f32 %r12563, %r12530; + ex2.approx.ftz.f32 %r12564, %r12531; + ex2.approx.ftz.f32 %r12565, %r12533; + ex2.approx.ftz.f32 %r12566, %r12534; + ex2.approx.ftz.f32 %r12567, %r12535; + ex2.approx.ftz.f32 %r12568, %r12536; + ex2.approx.ftz.f32 %r12569, %r12537; + ex2.approx.ftz.f32 %r12570, %r12538; + ex2.approx.ftz.f32 %r12571, %r12539; + ex2.approx.ftz.f32 %r12572, %r12540; + ex2.approx.ftz.f32 %r12573, %r12541; + ex2.approx.ftz.f32 %r12574, %r12542; + ex2.approx.ftz.f32 %r12575, %r12543; + ex2.approx.ftz.f32 %r12576, %r12544; + ex2.approx.ftz.f32 %r12577, %r12545; + ex2.approx.ftz.f32 %r12578, %r12546; + ex2.approx.ftz.f32 %r12579, %r12547; + ex2.approx.ftz.f32 %r12580, %r12548; + ex2.approx.ftz.f32 %r12581, %r12549; + ex2.approx.ftz.f32 %r12582, %r12550; + ex2.approx.ftz.f32 %r12583, %r12551; + ex2.approx.ftz.f32 %r12584, %r12552; + ex2.approx.ftz.f32 %r12585, %r12553; + ex2.approx.ftz.f32 %r12586, %r12554; + ex2.approx.ftz.f32 %r12587, %r12555; + ex2.approx.ftz.f32 %r12588, %r12556; + ex2.approx.ftz.f32 %r12589, %r12557; + ex2.approx.ftz.f32 %r12590, %r12558; + ex2.approx.ftz.f32 %r12591, %r12559; + ex2.approx.ftz.f32 %r12592, %r12560; + ex2.approx.ftz.f32 %r12593, %r12561; + ex2.approx.ftz.f32 %r12594, %r12562; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s32 %r12595, %r8442, 49152; + add.s32 %r11866, %r12595, %r12454; + .loc 1 530 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:530:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + wgmma.fence.sync.aligned; + add.s32 %r11863, %r8442, 131072; + add.s32 %r12596, %r12458, %r11863; + bfe.u32 %r12597, %r12596, 4, 14; + cvt.u64.u32 %rd962, %r12597; + or.b64 %rd912, %rd962, 4611686293372403712; + bfe.u32 %r12598, %r11866, 4, 14; + cvt.u64.u32 %rd963, %r12598; + or.b64 %rd913, %rd963, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd912, %rd913, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r12599, %r12462, %r11863; + bfe.u32 %r12600, %r12599, 4, 14; + cvt.u64.u32 %rd964, %r12600; + or.b64 %rd914, %rd964, 4611686293372403712; + add.s32 %r12601, %r11866, 32; + bfe.u32 %r12602, %r12601, 4, 14; + cvt.u64.u32 %rd965, %r12602; + or.b64 %rd915, %rd965, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd914, %rd915, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12603, %r12467, %r11863; + bfe.u32 %r12604, %r12603, 4, 14; + cvt.u64.u32 %rd966, %r12604; + or.b64 %rd916, %rd966, 4611686293372403712; + add.s32 %r12605, %r11866, 64; + bfe.u32 %r12606, %r12605, 4, 14; + cvt.u64.u32 %rd967, %r12606; + or.b64 %rd917, %rd967, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd916, %rd917, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12607, %r12472, %r11863; + bfe.u32 %r12608, %r12607, 4, 14; + cvt.u64.u32 %rd968, %r12608; + or.b64 %rd918, %rd968, 4611686293372403712; + add.s32 %r12609, %r11866, 96; + bfe.u32 %r12610, %r12609, 4, 14; + cvt.u64.u32 %rd969, %r12610; + or.b64 %rd919, %rd969, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd918, %rd919, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12611, %r12477, %r11863; + bfe.u32 %r12612, %r12611, 4, 14; + cvt.u64.u32 %rd970, %r12612; + or.b64 %rd920, %rd970, 4611686293372403712; + add.s32 %r12613, %r11866, 8192; + bfe.u32 %r12614, %r12613, 4, 14; + cvt.u64.u32 %rd971, %r12614; + or.b64 %rd921, %rd971, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd920, %rd921, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12615, %r12482, %r11863; + bfe.u32 %r12616, %r12615, 4, 14; + cvt.u64.u32 %rd972, %r12616; + or.b64 %rd922, %rd972, 4611686293372403712; + add.s32 %r12617, %r11866, 8224; + bfe.u32 %r12618, %r12617, 4, 14; + cvt.u64.u32 %rd973, %r12618; + or.b64 %rd923, %rd973, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd922, %rd923, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12619, %r12487, %r11863; + bfe.u32 %r12620, %r12619, 4, 14; + cvt.u64.u32 %rd974, %r12620; + or.b64 %rd924, %rd974, 4611686293372403712; + add.s32 %r12621, %r11866, 8256; + bfe.u32 %r12622, %r12621, 4, 14; + cvt.u64.u32 %rd975, %r12622; + or.b64 %rd925, %rd975, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd924, %rd925, %p557, 1, 1, 0, 0; + // end inline asm + add.s32 %r12623, %r12492, %r11863; + bfe.u32 %r12624, %r12623, 4, 14; + cvt.u64.u32 %rd976, %r12624; + or.b64 %rd926, %rd976, 4611686293372403712; + add.s32 %r12625, %r11866, 8288; + bfe.u32 %r12626, %r12625, 4, 14; + cvt.u64.u32 %rd977, %r12626; + or.b64 %rd927, %rd977, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478}, %rd926, %rd927, %p557, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r11864, %r11346; + mov.b32 %r11865, %r11346; + mov.b32 %r11867, %r11346; + mov.b32 %r11868, %r11346; + // begin inline asm + // wait for regs: %r11447,%r11448,%r11449,%r11450,%r11451,%r11452,%r11453,%r11454,%r11455,%r11456,%r11457,%r11458,%r11459,%r11460,%r11461,%r11462,%r11463,%r11464,%r11465,%r11466,%r11467,%r11468,%r11469,%r11470,%r11471,%r11472,%r11473,%r11474,%r11475,%r11476,%r11477,%r11478,%r11863,%r11864,%r11865,%r11866,%r11867,%r11868 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mov.b64 {%r12627, %r12628}, %rd62; + sub.f32 %r12629, %r11448, %r12628; + sub.f32 %r12630, %r11447, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12631, %r12563, %r12630; + mul.f32 %r12632, %r12564, %r12629; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12035, %r12632, %r12631; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mov.b64 {%r12633, %r12634}, %rd61; + sub.f32 %r12635, %r11450, %r12634; + sub.f32 %r12636, %r11449, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12637, %r12565, %r12636; + mul.f32 %r12638, %r12566, %r12635; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12036, %r12638, %r12637; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12639, %r11452, %r12628; + sub.f32 %r12640, %r11451, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12641, %r12567, %r12640; + mul.f32 %r12642, %r12568, %r12639; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12037, %r12642, %r12641; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12643, %r11454, %r12634; + sub.f32 %r12644, %r11453, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12645, %r12569, %r12644; + mul.f32 %r12646, %r12570, %r12643; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12038, %r12646, %r12645; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12647, %r11456, %r12628; + sub.f32 %r12648, %r11455, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12649, %r12571, %r12648; + mul.f32 %r12650, %r12572, %r12647; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12167, %r12650, %r12649; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12651, %r11458, %r12634; + sub.f32 %r12652, %r11457, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12653, %r12573, %r12652; + mul.f32 %r12654, %r12574, %r12651; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12168, %r12654, %r12653; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12655, %r11460, %r12628; + sub.f32 %r12656, %r11459, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12657, %r12575, %r12656; + mul.f32 %r12658, %r12576, %r12655; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12169, %r12658, %r12657; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12659, %r11462, %r12634; + sub.f32 %r12660, %r11461, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12661, %r12577, %r12660; + mul.f32 %r12662, %r12578, %r12659; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12170, %r12662, %r12661; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12663, %r11464, %r12628; + sub.f32 %r12664, %r11463, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12665, %r12579, %r12664; + mul.f32 %r12666, %r12580, %r12663; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12299, %r12666, %r12665; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12667, %r11466, %r12634; + sub.f32 %r12668, %r11465, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12669, %r12581, %r12668; + mul.f32 %r12670, %r12582, %r12667; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12300, %r12670, %r12669; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12671, %r11468, %r12628; + sub.f32 %r12672, %r11467, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12673, %r12583, %r12672; + mul.f32 %r12674, %r12584, %r12671; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12301, %r12674, %r12673; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12675, %r11470, %r12634; + sub.f32 %r12676, %r11469, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12677, %r12585, %r12676; + mul.f32 %r12678, %r12586, %r12675; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12302, %r12678, %r12677; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12679, %r11472, %r12628; + sub.f32 %r12680, %r11471, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12681, %r12587, %r12680; + mul.f32 %r12682, %r12588, %r12679; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12431, %r12682, %r12681; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12683, %r11474, %r12634; + sub.f32 %r12684, %r11473, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12685, %r12589, %r12684; + mul.f32 %r12686, %r12590, %r12683; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12432, %r12686, %r12685; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12687, %r11476, %r12628; + sub.f32 %r12688, %r11475, %r12627; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12689, %r12591, %r12688; + mul.f32 %r12690, %r12592, %r12687; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12433, %r12690, %r12689; + .loc 1 531 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.f32 %r12691, %r11478, %r12634; + sub.f32 %r12692, %r11477, %r12633; + .loc 1 531 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:531:14 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.f32 %r12693, %r12593, %r12692; + mul.f32 %r12694, %r12594, %r12691; + .loc 1 551 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:551:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + cvt.rn.bf16x2.f32 %r12434, %r12694, %r12693; + .loc 1 553 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:553:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12035,%r12036,%r12037,%r12038}, %rd897, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12695, %r11348, 2048; + bfe.u32 %r12696, %r12695, 4, 14; + cvt.u64.u32 %rd978, %r12696; + or.b64 %rd929, %rd978, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12167,%r12168,%r12169,%r12170}, %rd929, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12697, %r11348, 4096; + bfe.u32 %r12698, %r12697, 4, 14; + cvt.u64.u32 %rd979, %r12698; + or.b64 %rd930, %rd979, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12299,%r12300,%r12301,%r12302}, %rd930, %p557, 1, 1, 1; + // end inline asm + add.s32 %r12699, %r11348, 6144; + bfe.u32 %r12700, %r12699, 4, 14; + cvt.u64.u32 %rd980, %r12700; + or.b64 %rd931, %rd980, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100}, {%r12431,%r12432,%r12433,%r12434}, %rd931, %p557, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s32 %r528, %r13248, 1; + .loc 1 788 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:788:33 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shr.u32 %r12701, %r528, 1; + .loc 1 789 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mad.wide.u32 %rd933, %r12701, 4, %rd863; + .loc 1 789 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + // begin inline asm + mov.u64 %rd932, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd932, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12435, 0x0; + @%p575 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12435 }, [ %rd933 + 0 ], %rd932; + // end inline asm + .loc 1 790 109 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:109 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s32 %r12702, %r12701, 1; + .loc 1 790 113 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:113 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + setp.lt.s32 %p579, %r12702, %r10750; + .loc 1 790 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:55 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd936, %rd933, 4; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + and.pred %p576, %p575, %p579; + .loc 1 790 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + // begin inline asm + mov.u64 %rd935, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd935, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12436, 0x0; + @%p576 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12436 }, [ %rd936 + 0 ], %rd935; + // end inline asm + .loc 1 791 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:791:35 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + and.b32 %r12703, %r13248, 1; + .loc 1 792 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:34 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + sub.s32 %r12704, %r12436, %r12435; + .loc 1 792 48 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:48 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r12705, %r12704, 7; + .loc 1 792 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s32 %r12706, %r12705, 33554368; + .loc 1 414 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r12707, %r12703, 13; + .loc 1 793 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r12708, %r12703, 7; + xor.b32 %r12709, %r12708, 128; + .loc 1 414 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mad.lo.s32 %r12710, %r12709, %r12706, %r12707; + .loc 1 414 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:414:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + mul.wide.s32 %rd981, %r12710, 2; + add.s64 %rd1022, %rd1022, %rd981; + add.s64 %rd1021, %rd1021, %rd981; + add.s64 %rd1020, %rd1020, %rd981; + add.s64 %rd1019, %rd1019, %rd981; + .loc 1 415 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:415:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s64 %rd1018, %rd1018, %rd981; + add.s64 %rd1017, %rd1017, %rd981; + add.s64 %rd1016, %rd1016, %rd981; + add.s64 %rd1015, %rd1015, %rd981; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + add.s32 %r12711, %r13183, 1; + setp.gt.s32 %p580, %r12711, 2; + selp.b32 %r13183, 0, %r12711, %p580; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + shl.b32 %r12712, %r13183, 14; + add.s32 %r12713, %r8442, %r12712; + bar.sync 0; + add.s32 %r12437, %r12713, %r59; + selp.b32 %r12438, 16, 0, %p577; + // begin inline asm + cp.async.cg.shared.global [ %r12437 + 0 ], [ %rd1022 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12439, %r12437, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r12439 + 0 ], [ %rd1021 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12441, %r12437, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r12441 + 0 ], [ %rd1020 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12443, %r12437, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r12443 + 0 ], [ %rd1019 + 0 ], 0x10, %r12438; + // end inline asm + cp.async.commit_group; + add.s32 %r12714, %r12595, %r12712; + add.s32 %r12445, %r12714, %r59; + // begin inline asm + cp.async.cg.shared.global [ %r12445 + 0 ], [ %rd1018 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12447, %r12445, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r12447 + 0 ], [ %rd1017 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12449, %r12445, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r12449 + 0 ], [ %rd1016 + 0 ], 0x10, %r12438; + // end inline asm + add.s32 %r12451, %r12445, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r12451 + 0 ], [ %rd1015 + 0 ], 0x10, %r12438; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:397:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:226:16 ] + setp.ne.b32 %p581, %r393, %r528; + mov.b32 %r13248, %r528; + @%p581 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge1859 + // begin inline asm + // wait for regs: %r13037,%r13038,%r13039,%r13040,%r13041,%r13042,%r13043,%r13044,%r13045,%r13046,%r13047,%r13048,%r13049,%r13050,%r13051,%r13052,%r13053,%r13054,%r13055,%r13056,%r13057,%r13058,%r13059,%r13060,%r13061,%r13062,%r13063,%r13064,%r13065,%r13066,%r13067,%r13068,%r13069,%r13070,%r13071,%r13072,%r13073,%r13074,%r13075,%r13076,%r13077,%r13078,%r13079,%r13080,%r13081,%r13082,%r13083,%r13084,%r13085,%r13086,%r13087,%r13088,%r13089,%r13090,%r13091,%r13092,%r13093,%r13094,%r13095,%r13096,%r13097,%r13098,%r13099,%r13100 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp14: + .loc 1 231 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:231:24 + shl.b64 %rd990, %rd5, 1; + add.s64 %rd991, %rd4, %rd990; + shl.b64 %rd992, %rd6, 1; + add.s64 %rd993, %rd4, %rd992; + shl.b64 %rd994, %rd7, 1; + add.s64 %rd995, %rd4, %rd994; + shl.b64 %rd996, %rd8, 1; + add.s64 %rd997, %rd4, %rd996; + shl.b64 %rd998, %rd9, 1; + add.s64 %rd999, %rd4, %rd998; + shl.b64 %rd1000, %rd10, 1; + add.s64 %rd1001, %rd4, %rd1000; + shl.b64 %rd1002, %rd11, 1; + add.s64 %rd1003, %rd4, %rd1002; + shl.b64 %rd1004, %rd12, 1; + add.s64 %rd1005, %rd4, %rd1004; + .loc 1 231 56 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:231:56 + add.s64 %rd982, %rd991, %rd891; + add.s64 %rd983, %rd993, %rd891; + add.s64 %rd984, %rd995, %rd891; + add.s64 %rd985, %rd997, %rd891; + add.s64 %rd986, %rd999, %rd891; + add.s64 %rd987, %rd1001, %rd891; + add.s64 %rd988, %rd1003, %rd891; + add.s64 %rd989, %rd1005, %rd891; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12915, %r13037, 0f3DB504F3; + mul.f32 %r12916, %r13038, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12917, %r12916, %r12915; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12918, %r13039, 0f3DB504F3; + mul.f32 %r12919, %r13040, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12920, %r12919, %r12918; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12921, %r13041, 0f3DB504F3; + mul.f32 %r12922, %r13042, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12923, %r12922, %r12921; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12924, %r13043, 0f3DB504F3; + mul.f32 %r12925, %r13044, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12926, %r12925, %r12924; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12927, %r13045, 0f3DB504F3; + mul.f32 %r12928, %r13046, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12929, %r12928, %r12927; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12930, %r13047, 0f3DB504F3; + mul.f32 %r12931, %r13048, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12932, %r12931, %r12930; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12933, %r13049, 0f3DB504F3; + mul.f32 %r12934, %r13050, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12935, %r12934, %r12933; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12936, %r13051, 0f3DB504F3; + mul.f32 %r12937, %r13052, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12938, %r12937, %r12936; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12939, %r13053, 0f3DB504F3; + mul.f32 %r12940, %r13054, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12941, %r12940, %r12939; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12942, %r13055, 0f3DB504F3; + mul.f32 %r12943, %r13056, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12944, %r12943, %r12942; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12945, %r13057, 0f3DB504F3; + mul.f32 %r12946, %r13058, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12947, %r12946, %r12945; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12948, %r13059, 0f3DB504F3; + mul.f32 %r12949, %r13060, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12950, %r12949, %r12948; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12951, %r13061, 0f3DB504F3; + mul.f32 %r12952, %r13062, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12953, %r12952, %r12951; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12954, %r13063, 0f3DB504F3; + mul.f32 %r12955, %r13064, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12956, %r12955, %r12954; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12957, %r13065, 0f3DB504F3; + mul.f32 %r12958, %r13066, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12959, %r12958, %r12957; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12960, %r13067, 0f3DB504F3; + mul.f32 %r12961, %r13068, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12962, %r12961, %r12960; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12963, %r13069, 0f3DB504F3; + mul.f32 %r12964, %r13070, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12965, %r12964, %r12963; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12966, %r13071, 0f3DB504F3; + mul.f32 %r12967, %r13072, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12968, %r12967, %r12966; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12969, %r13073, 0f3DB504F3; + mul.f32 %r12970, %r13074, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12971, %r12970, %r12969; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12972, %r13075, 0f3DB504F3; + mul.f32 %r12973, %r13076, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12974, %r12973, %r12972; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12975, %r13077, 0f3DB504F3; + mul.f32 %r12976, %r13078, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12977, %r12976, %r12975; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12978, %r13079, 0f3DB504F3; + mul.f32 %r12979, %r13080, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12980, %r12979, %r12978; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12981, %r13081, 0f3DB504F3; + mul.f32 %r12982, %r13082, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12983, %r12982, %r12981; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12984, %r13083, 0f3DB504F3; + mul.f32 %r12985, %r13084, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12986, %r12985, %r12984; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12987, %r13085, 0f3DB504F3; + mul.f32 %r12988, %r13086, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12989, %r12988, %r12987; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12990, %r13087, 0f3DB504F3; + mul.f32 %r12991, %r13088, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12992, %r12991, %r12990; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12993, %r13089, 0f3DB504F3; + mul.f32 %r12994, %r13090, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12995, %r12994, %r12993; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12996, %r13091, 0f3DB504F3; + mul.f32 %r12997, %r13092, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r12998, %r12997, %r12996; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r12999, %r13093, 0f3DB504F3; + mul.f32 %r13000, %r13094, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r13001, %r13000, %r12999; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r13002, %r13095, 0f3DB504F3; + mul.f32 %r13003, %r13096, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r13004, %r13003, %r13002; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r13005, %r13097, 0f3DB504F3; + mul.f32 %r13006, %r13098, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r13007, %r13006, %r13005; + .loc 1 232 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:232:14 + mul.f32 %r13008, %r13099, 0f3DB504F3; + mul.f32 %r13009, %r13100, 0f3DB504F3; + .loc 1 234 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:234:30 + cvt.rn.bf16x2.f32 %r13010, %r13009, %r13008; + shl.b32 %r13011, %r57, 13; + shl.b32 %r13012, %r6, 5; + and.b32 %r13013, %r13012, 7264; + and.b32 %r13014, %r6, 24; + shl.b32 %r13015, %r13014, 4; + shl.b32 %r13016, %r6, 2; + and.b32 %r13017, %r13016, 16; + or.b32 %r13018, %r13011, %r13017; + or.b32 %r13019, %r13013, %r13015; + or.b32 %r13020, %r13018, %r13019; + add.s32 %r13022, %r8442, %r13020; + st.shared.v4.b32 [%r13022], {%r12917, %r12923, %r12929, %r12935}; + st.shared.v4.b32 [%r13022+512], {%r12920, %r12926, %r12932, %r12938}; + xor.b32 %r13023, %r13020, 32; + add.s32 %r13024, %r8442, %r13023; + st.shared.v4.b32 [%r13024], {%r12941, %r12947, %r12953, %r12959}; + st.shared.v4.b32 [%r13024+512], {%r12944, %r12950, %r12956, %r12962}; + xor.b32 %r13025, %r13020, 64; + add.s32 %r13026, %r8442, %r13025; + st.shared.v4.b32 [%r13026], {%r12965, %r12971, %r12977, %r12983}; + st.shared.v4.b32 [%r13026+512], {%r12968, %r12974, %r12980, %r12986}; + xor.b32 %r13027, %r13020, 96; + add.s32 %r13028, %r8442, %r13027; + st.shared.v4.b32 [%r13028], {%r12989, %r12995, %r13001, %r13007}; + st.shared.v4.b32 [%r13028+512], {%r12992, %r12998, %r13004, %r13010}; + bar.sync 0; + shl.b32 %r13029, %r13014, 10; + shl.b32 %r13030, %r57, 5; + and.b32 %r13031, %r13016, 1008; + or.b32 %r13032, %r13029, %r13030; + xor.b32 %r13033, %r13032, %r13031; + add.s32 %r12847, %r8442, %r13033; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12883, %r12884, %r12885, %r12886}, [%r12847]; + // end inline asm + add.s32 %r12852, %r12847, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12887, %r12888, %r12889, %r12890}, [%r12852]; + // end inline asm + add.s32 %r12857, %r12847, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12891, %r12892, %r12893, %r12894}, [%r12857]; + // end inline asm + add.s32 %r12862, %r12847, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12895, %r12896, %r12897, %r12898}, [%r12862]; + // end inline asm + add.s32 %r12867, %r12847, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12899, %r12900, %r12901, %r12902}, [%r12867]; + // end inline asm + add.s32 %r12872, %r12847, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12903, %r12904, %r12905, %r12906}, [%r12872]; + // end inline asm + add.s32 %r12877, %r12847, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12907, %r12908, %r12909, %r12910}, [%r12877]; + // end inline asm + add.s32 %r12882, %r12847, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r12911, %r12912, %r12913, %r12914}, [%r12882]; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd982 + 0 ], { %r12883, %r12884, %r12885, %r12886 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd983 + 0 ], { %r12887, %r12888, %r12889, %r12890 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd984 + 0 ], { %r12891, %r12892, %r12893, %r12894 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd985 + 0 ], { %r12895, %r12896, %r12897, %r12898 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd986 + 0 ], { %r12899, %r12900, %r12901, %r12902 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd987 + 0 ], { %r12903, %r12904, %r12905, %r12906 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd988 + 0 ], { %r12907, %r12908, %r12909, %r12910 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd989 + 0 ], { %r12911, %r12912, %r12913, %r12914 }; + // end inline asm + .loc 1 139 7 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:139:7 + bra.uni $L__BB0_17; +$L__BB0_8: + .loc 1 0 7 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:7 + ld.param.b64 %rd194, [triton_tem_fused_zeros_1_param_17]; + ld.param.b64 %rd192, [triton_tem_fused_zeros_1_param_15]; + ld.param.b64 %rd191, [triton_tem_fused_zeros_1_param_14]; + ld.param.b64 %rd188, [triton_tem_fused_zeros_1_param_11]; + ld.param.b64 %rd187, [triton_tem_fused_zeros_1_param_10]; + ld.param.b64 %rd197, [triton_tem_fused_zeros_1_param_7]; + shl.b32 %r1944, %r2, 21; + add.s32 %r5, %r1941, %r1944; + mad.wide.s32 %rd3, %r5, 2, %rd197; + .loc 1 252 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:252:25 + shl.b32 %r2017, %r1, 7; + .loc 1 253 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:253:29 + or.b32 %r2018, %r9, %r2017; + or.b32 %r2019, %r10, %r2017; + or.b32 %r2020, %r11, %r2017; + or.b32 %r2021, %r12, %r2017; + or.b32 %r2022, %r13, %r2017; + or.b32 %r2023, %r14, %r2017; + or.b32 %r2024, %r15, %r2017; + or.b32 %r2025, %r16, %r2017; + or.b32 %r594, %r17, %r2017; + or.b32 %r595, %r18, %r2017; +$L__tmp15: + .loc 1 825 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:256:107 ] + shl.b32 %r2026, %r2018, 7; + shl.b32 %r2027, %r2019, 7; + shl.b32 %r2028, %r2020, 7; + shl.b32 %r2029, %r2021, 7; + shl.b32 %r2030, %r2022, 7; + shl.b32 %r2031, %r2023, 7; + shl.b32 %r2032, %r2024, 7; + shl.b32 %r2033, %r2025, 7; + .loc 1 825 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:256:107 ] + cvt.u64.u32 %rd79, %r2026; + mul.wide.u32 %rd222, %r2026, 2; + add.s64 %rd223, %rd1, %rd222; + cvt.u64.u32 %rd80, %r2027; + mul.wide.u32 %rd224, %r2027, 2; + add.s64 %rd225, %rd1, %rd224; + cvt.u64.u32 %rd81, %r2028; + mul.wide.u32 %rd226, %r2028, 2; + add.s64 %rd227, %rd1, %rd226; + cvt.u64.u32 %rd82, %r2029; + mul.wide.u32 %rd228, %r2029, 2; + add.s64 %rd229, %rd1, %rd228; + cvt.u64.u32 %rd83, %r2030; + mul.wide.u32 %rd230, %r2030, 2; + add.s64 %rd231, %rd1, %rd230; + cvt.u64.u32 %rd84, %r2031; + mul.wide.u32 %rd232, %r2031, 2; + add.s64 %rd233, %rd1, %rd232; + cvt.u64.u32 %rd85, %r2032; + mul.wide.u32 %rd234, %r2032, 2; + add.s64 %rd235, %rd1, %rd234; + cvt.u64.u32 %rd86, %r2033; + mul.wide.u32 %rd236, %r2033, 2; + add.s64 %rd237, %rd1, %rd236; + .loc 1 825 56 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:56 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:256:107 ] + shl.b32 %r2034, %r6, 3; + and.b32 %r2035, %r2034, 120; + .loc 1 825 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:256:107 ] + cvt.u64.u32 %rd87, %r2035; + mul.wide.u32 %rd238, %r2035, 2; + add.s64 %rd199, %rd223, %rd238; + add.s64 %rd200, %rd225, %rd238; + add.s64 %rd201, %rd227, %rd238; + add.s64 %rd202, %rd229, %rd238; + add.s64 %rd203, %rd231, %rd238; + add.s64 %rd204, %rd233, %rd238; + add.s64 %rd205, %rd235, %rd238; + add.s64 %rd206, %rd237, %rd238; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:256:107 ] + // begin inline asm + mov.u32 %r1948, 0x0; + mov.u32 %r1949, 0x0; + mov.u32 %r1950, 0x0; + mov.u32 %r1951, 0x0; + ld.global.v4.b32 { %r1948, %r1949, %r1950, %r1951 }, [ %rd199 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1952, 0x0; + mov.u32 %r1953, 0x0; + mov.u32 %r1954, 0x0; + mov.u32 %r1955, 0x0; + ld.global.v4.b32 { %r1952, %r1953, %r1954, %r1955 }, [ %rd200 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1956, 0x0; + mov.u32 %r1957, 0x0; + mov.u32 %r1958, 0x0; + mov.u32 %r1959, 0x0; + ld.global.v4.b32 { %r1956, %r1957, %r1958, %r1959 }, [ %rd201 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1960, 0x0; + mov.u32 %r1961, 0x0; + mov.u32 %r1962, 0x0; + mov.u32 %r1963, 0x0; + ld.global.v4.b32 { %r1960, %r1961, %r1962, %r1963 }, [ %rd202 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1964, 0x0; + mov.u32 %r1965, 0x0; + mov.u32 %r1966, 0x0; + mov.u32 %r1967, 0x0; + ld.global.v4.b32 { %r1964, %r1965, %r1966, %r1967 }, [ %rd203 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1968, 0x0; + mov.u32 %r1969, 0x0; + mov.u32 %r1970, 0x0; + mov.u32 %r1971, 0x0; + ld.global.v4.b32 { %r1968, %r1969, %r1970, %r1971 }, [ %rd204 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1972, 0x0; + mov.u32 %r1973, 0x0; + mov.u32 %r1974, 0x0; + mov.u32 %r1975, 0x0; + ld.global.v4.b32 { %r1972, %r1973, %r1974, %r1975 }, [ %rd205 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1976, 0x0; + mov.u32 %r1977, 0x0; + mov.u32 %r1978, 0x0; + mov.u32 %r1979, 0x0; + ld.global.v4.b32 { %r1976, %r1977, %r1978, %r1979 }, [ %rd206 + 0 ]; + // end inline asm + shl.b32 %r2036, %r6, 4; + and.b32 %r2037, %r2036, 112; + shl.b32 %r2038, %r8, 3; + and.b32 %r2039, %r6, 112; + and.b32 %r2040, %r6, 8; + shl.b32 %r2041, %r2040, 11; + or.b32 %r2042, %r2037, %r2038; + xor.b32 %r2043, %r2042, %r2039; + or.b32 %r2044, %r2043, %r2041; + mov.b32 %r2045, global_smem; + add.s32 %r2046, %r2045, %r2044; + st.shared.v4.b32 [%r2046+99328], {%r1948, %r1949, %r1950, %r1951}; + st.shared.v4.b32 [%r2046+101376], {%r1952, %r1953, %r1954, %r1955}; + st.shared.v4.b32 [%r2046+103424], {%r1956, %r1957, %r1958, %r1959}; + st.shared.v4.b32 [%r2046+105472], {%r1960, %r1961, %r1962, %r1963}; + st.shared.v4.b32 [%r2046+107520], {%r1964, %r1965, %r1966, %r1967}; + st.shared.v4.b32 [%r2046+109568], {%r1968, %r1969, %r1970, %r1971}; + st.shared.v4.b32 [%r2046+111616], {%r1972, %r1973, %r1974, %r1975}; + st.shared.v4.b32 [%r2046+113664], {%r1976, %r1977, %r1978, %r1979}; +$L__tmp16: + .loc 1 825 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:257:107 ] + add.s64 %rd239, %rd2, %rd222; + add.s64 %rd240, %rd2, %rd224; + add.s64 %rd241, %rd2, %rd226; + add.s64 %rd242, %rd2, %rd228; + add.s64 %rd243, %rd2, %rd230; + add.s64 %rd244, %rd2, %rd232; + add.s64 %rd245, %rd2, %rd234; + add.s64 %rd246, %rd2, %rd236; + .loc 1 825 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:825:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:257:107 ] + add.s64 %rd207, %rd239, %rd238; + add.s64 %rd208, %rd240, %rd238; + add.s64 %rd209, %rd241, %rd238; + add.s64 %rd210, %rd242, %rd238; + add.s64 %rd211, %rd243, %rd238; + add.s64 %rd212, %rd244, %rd238; + add.s64 %rd213, %rd245, %rd238; + add.s64 %rd214, %rd246, %rd238; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:257:107 ] + // begin inline asm + mov.u32 %r1980, 0x0; + mov.u32 %r1981, 0x0; + mov.u32 %r1982, 0x0; + mov.u32 %r1983, 0x0; + ld.global.v4.b32 { %r1980, %r1981, %r1982, %r1983 }, [ %rd207 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1984, 0x0; + mov.u32 %r1985, 0x0; + mov.u32 %r1986, 0x0; + mov.u32 %r1987, 0x0; + ld.global.v4.b32 { %r1984, %r1985, %r1986, %r1987 }, [ %rd208 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1988, 0x0; + mov.u32 %r1989, 0x0; + mov.u32 %r1990, 0x0; + mov.u32 %r1991, 0x0; + ld.global.v4.b32 { %r1988, %r1989, %r1990, %r1991 }, [ %rd209 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1992, 0x0; + mov.u32 %r1993, 0x0; + mov.u32 %r1994, 0x0; + mov.u32 %r1995, 0x0; + ld.global.v4.b32 { %r1992, %r1993, %r1994, %r1995 }, [ %rd210 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r1996, 0x0; + mov.u32 %r1997, 0x0; + mov.u32 %r1998, 0x0; + mov.u32 %r1999, 0x0; + ld.global.v4.b32 { %r1996, %r1997, %r1998, %r1999 }, [ %rd211 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2000, 0x0; + mov.u32 %r2001, 0x0; + mov.u32 %r2002, 0x0; + mov.u32 %r2003, 0x0; + ld.global.v4.b32 { %r2000, %r2001, %r2002, %r2003 }, [ %rd212 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2004, 0x0; + mov.u32 %r2005, 0x0; + mov.u32 %r2006, 0x0; + mov.u32 %r2007, 0x0; + ld.global.v4.b32 { %r2004, %r2005, %r2006, %r2007 }, [ %rd213 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2008, 0x0; + mov.u32 %r2009, 0x0; + mov.u32 %r2010, 0x0; + mov.u32 %r2011, 0x0; + ld.global.v4.b32 { %r2008, %r2009, %r2010, %r2011 }, [ %rd214 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r2046+132096], {%r1980, %r1981, %r1982, %r1983}; + st.shared.v4.b32 [%r2046+134144], {%r1984, %r1985, %r1986, %r1987}; + st.shared.v4.b32 [%r2046+136192], {%r1988, %r1989, %r1990, %r1991}; + st.shared.v4.b32 [%r2046+138240], {%r1992, %r1993, %r1994, %r1995}; + st.shared.v4.b32 [%r2046+140288], {%r1996, %r1997, %r1998, %r1999}; + st.shared.v4.b32 [%r2046+142336], {%r2000, %r2001, %r2002, %r2003}; + st.shared.v4.b32 [%r2046+144384], {%r2004, %r2005, %r2006, %r2007}; + st.shared.v4.b32 [%r2046+146432], {%r2008, %r2009, %r2010, %r2011}; +$L__tmp17: + .loc 1 266 56 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:266:56 + shl.b32 %r596, %r2, 23; + .loc 1 281 58 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:281:58 + shl.b32 %r2047, %r4, 4; + .loc 1 281 80 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:281:80 + or.b32 %r2048, %r2047, %r1; + .loc 1 282 53 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:282:53 + shl.b32 %r2049, %r4, 8; + .loc 1 282 81 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:282:81 + shl.b32 %r2050, %r1, 4; + .loc 1 282 70 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:282:70 + or.b32 %r2051, %r2049, %r2050; + .loc 1 286 32 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:286:32 + mul.wide.u32 %rd247, %r2051, 4; + add.s64 %rd215, %rd188, %rd247; + mov.pred %p2, -1; + .loc 1 287 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:287:30 + // begin inline asm + mov.u32 %r2012, 0x0; + @%p2 ld.global.b32 { %r2012 }, [ %rd215 + 0 ]; + // end inline asm + .loc 1 287 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:287:43 + shl.b32 %r597, %r2012, 7; + .loc 1 288 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:288:55 + mul.wide.u32 %rd248, %r2048, 4; + add.s64 %rd216, %rd187, %rd248; + .loc 1 288 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:288:42 + // begin inline asm + mov.u32 %r2013, 0x0; + @%p2 ld.global.b32 { %r2013 }, [ %rd216 + 0 ]; + // end inline asm + .loc 1 290 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:290:45 + and.b32 %r599, %r6, 3; + shl.b32 %r600, %r599, 1; + or.b32 %r2052, %r600, 8; + or.b32 %r2053, %r600, 16; + or.b32 %r2054, %r600, 24; + or.b32 %r2055, %r600, 32; + or.b32 %r2056, %r600, 40; + or.b32 %r2057, %r600, 48; + or.b32 %r2058, %r600, 56; + .loc 1 290 32 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:290:32 + or.b32 %r2059, %r597, %r9; + or.b32 %r2060, %r597, %r10; + or.b32 %r2061, %r597, %r11; + or.b32 %r2062, %r597, %r12; + or.b32 %r702, %r597, %r600; + or.b32 %r701, %r702, 1; + or.b32 %r700, %r597, %r2052; + or.b32 %r699, %r702, 9; + or.b32 %r698, %r597, %r2053; + or.b32 %r697, %r702, 17; + or.b32 %r696, %r597, %r2054; + or.b32 %r695, %r702, 25; + or.b32 %r694, %r597, %r2055; + or.b32 %r693, %r702, 33; + or.b32 %r692, %r597, %r2056; + or.b32 %r691, %r702, 41; + or.b32 %r690, %r597, %r2057; + or.b32 %r689, %r702, 49; + or.b32 %r688, %r597, %r2058; + or.b32 %r687, %r702, 57; +$L__tmp18: + .loc 1 601 37 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:37 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r2063, %r2059, 12; + shl.b32 %r2064, %r2060, 12; + shl.b32 %r2065, %r2061, 12; + shl.b32 %r2066, %r2062, 12; + .loc 1 602 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r2067, %r2059, 7; + shl.b32 %r2068, %r2060, 7; + shl.b32 %r2069, %r2061, 7; + shl.b32 %r2070, %r2062, 7; + .loc 1 608 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:608:42 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r601, %r2013, 1; + .loc 1 608 61 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:608:61 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + min.s32 %r2071, %r601, 32; + .loc 1 701 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:701:35 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mad.wide.u32 %rd218, %r2, 8, %rd193; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.gt.s32 %p4, %r601, 0; + .loc 1 701 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:701:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + mov.u64 %rd217, 0x0; + @%p4 ld.global.b64 { %rd217 }, [ %rd218 + 0 ]; + // end inline asm +$L__tmp19: + .loc 1 306 41 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:306:41 + add.s64 %rd219, %rd192, %rd247; + .loc 1 307 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:307:34 + // begin inline asm + mov.u32 %r2014, 0x0; + @%p2 ld.global.b32 { %r2014 }, [ %rd219 + 0 ]; + // end inline asm + .loc 1 307 47 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:307:47 + shl.b32 %r602, %r2014, 7; + .loc 1 308 64 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:308:64 + add.s64 %rd220, %rd191, %rd248; + .loc 1 308 46 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:308:46 + // begin inline asm + mov.u32 %r2015, 0x0; + @%p2 ld.global.b32 { %r2015 }, [ %rd220 + 0 ]; + // end inline asm + .loc 1 310 36 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:310:36 + or.b32 %r2072, %r602, %r9; + or.b32 %r2073, %r602, %r10; + or.b32 %r2074, %r602, %r11; + or.b32 %r2075, %r602, %r12; + or.b32 %r2076, %r602, %r600; + or.b32 %r2077, %r602, %r2052; + or.b32 %r2078, %r602, %r2053; + or.b32 %r2079, %r602, %r2054; + or.b32 %r2080, %r602, %r2055; + or.b32 %r2081, %r602, %r2056; + or.b32 %r2082, %r602, %r2057; + or.b32 %r2083, %r602, %r2058; +$L__tmp20: + .loc 1 601 37 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:37 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r2084, %r2072, 12; + shl.b32 %r2085, %r2073, 12; + shl.b32 %r2086, %r2074, 12; + shl.b32 %r2087, %r2075, 12; + .loc 1 602 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r2088, %r2072, 7; + shl.b32 %r2089, %r2073, 7; + shl.b32 %r2090, %r2074, 7; + shl.b32 %r2091, %r2075, 7; + .loc 1 608 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:608:42 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r604, %r2015, 1; + .loc 1 608 61 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:608:61 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + min.s32 %r2092, %r604, 32; +$L__tmp21: + .loc 1 676 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:676:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + shl.b32 %r605, %r2, 16; + cvt.s64.s32 %rd90, %r2063; + cvt.s64.s32 %rd91, %r2064; + cvt.s64.s32 %rd92, %r2065; + cvt.s64.s32 %rd93, %r2066; + cvt.s64.s32 %rd94, %r2067; + cvt.s64.s32 %rd95, %r2068; + cvt.s64.s32 %rd96, %r2069; + cvt.s64.s32 %rd97, %r2070; + shl.b32 %r2093, %r2040, 10; + or.b32 %r606, %r2043, %r2093; + add.s32 %r5062, %r2045, %r606; + selp.b32 %r2096, 16, 0, %p4; + add.s32 %r5064, %r5062, 2048; + add.s32 %r5066, %r5062, 4096; + add.s32 %r5068, %r5062, 6144; + cvt.s64.s32 %rd98, %r702; + and.b32 %r612, %r6, 252; + shl.b32 %r613, %r599, 3; + add.s32 %r2094, %r2045, %r613; + add.s32 %r5070, %r2094, 98304; + selp.b32 %r2104, 8, 0, %p4; + add.s32 %r5072, %r2094, 98336; + add.s32 %r5074, %r2094, 98368; + add.s32 %r5076, %r2094, 98400; + add.s32 %r5078, %r2094, 98432; + add.s32 %r5080, %r2094, 98464; + add.s32 %r5082, %r2094, 98496; + add.s32 %r5084, %r2094, 98528; + add.s32 %r5086, %r5062, 49152; + add.s32 %r5088, %r5062, 51200; + add.s32 %r5090, %r5062, 53248; + add.s32 %r5092, %r5062, 55296; + add.s32 %r5094, %r2094, 98816; + add.s32 %r5096, %r2094, 98848; + add.s32 %r5098, %r2094, 98880; + add.s32 %r5100, %r2094, 98912; + add.s32 %r5102, %r2094, 98944; + add.s32 %r5104, %r2094, 98976; + add.s32 %r5106, %r2094, 99008; + add.s32 %r5108, %r2094, 99040; + setp.gt.s32 %p7, %r601, 1; + or.b32 %r635, %r702, 64; + or.b32 %r636, %r700, 64; + or.b32 %r637, %r698, 64; + or.b32 %r638, %r696, 64; + or.b32 %r639, %r694, 64; + or.b32 %r640, %r692, 64; + or.b32 %r641, %r690, 64; + or.b32 %r642, %r688, 64; + add.s32 %r5110, %r5062, 16384; + selp.b32 %r2144, 16, 0, %p7; + add.s32 %r5112, %r5062, 18432; + add.s32 %r5114, %r5062, 20480; + add.s32 %r5116, %r5062, 22528; + add.s32 %r5118, %r2094, 98560; + selp.b32 %r2152, 8, 0, %p7; + add.s32 %r5120, %r2094, 98592; + add.s32 %r5122, %r2094, 98624; + add.s32 %r5124, %r2094, 98656; + add.s32 %r5126, %r2094, 98688; + add.s32 %r5128, %r2094, 98720; + add.s32 %r5130, %r2094, 98752; + add.s32 %r5132, %r2094, 98784; + add.s32 %r5134, %r5062, 65536; + add.s32 %r5136, %r5062, 67584; + add.s32 %r5138, %r5062, 69632; + add.s32 %r5140, %r5062, 71680; + add.s32 %r5142, %r2094, 99072; + add.s32 %r5144, %r2094, 99104; + add.s32 %r5146, %r2094, 99136; + add.s32 %r5148, %r2094, 99168; + add.s32 %r5150, %r2094, 99200; + add.s32 %r5152, %r2094, 99232; + add.s32 %r5154, %r2094, 99264; + add.s32 %r5156, %r2094, 99296; + add.s32 %r669, %r2071, -2; + add.s32 %r670, %r2071, -1; + cvt.s64.s32 %rd99, %r2084; + cvt.s64.s32 %rd100, %r2085; + cvt.s64.s32 %rd101, %r2086; + cvt.s64.s32 %rd102, %r2087; + cvt.s64.s32 %rd103, %r2088; + cvt.s64.s32 %rd104, %r2089; + cvt.s64.s32 %rd105, %r2090; + cvt.s64.s32 %rd106, %r2091; + setp.gt.s32 %p8, %r604, 0; + selp.b32 %r5063, 16, 0, %p8; + cvt.s64.s32 %rd107, %r2076; + selp.b32 %r5071, 8, 0, %p8; + setp.gt.s32 %p9, %r604, 1; + or.b32 %r673, %r2076, 64; + or.b32 %r674, %r2077, 64; + or.b32 %r675, %r2078, 64; + or.b32 %r676, %r2079, 64; + or.b32 %r677, %r2080, 64; + or.b32 %r678, %r2081, 64; + or.b32 %r679, %r2082, 64; + or.b32 %r680, %r2083, 64; + selp.b32 %r5111, 16, 0, %p9; + selp.b32 %r5119, 8, 0, %p9; + add.s32 %r683, %r2092, -2; + add.s32 %r684, %r2092, -1; +$L__tmp22: + .loc 1 262 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:262:30 + max.s32 %r685, %r2071, 1; + max.s32 %r686, %r2092, 1; + mul.wide.u32 %rd108, %r3, 4; + mov.b32 %r13518, 0f00000000; + mov.b64 %rd1023, 0; + shl.b64 %rd299, %rd90, 1; + shl.b64 %rd301, %rd91, 1; + shl.b64 %rd303, %rd92, 1; + shl.b64 %rd305, %rd93, 1; + shl.b64 %rd308, %rd94, 1; + shl.b64 %rd310, %rd95, 1; + shl.b64 %rd312, %rd96, 1; + shl.b64 %rd314, %rd97, 1; + shl.b64 %rd316, %rd98, 2; + shl.b64 %rd507, %rd99, 1; + shl.b64 %rd509, %rd100, 1; + shl.b64 %rd511, %rd101, 1; + shl.b64 %rd513, %rd102, 1; + shl.b64 %rd516, %rd103, 1; + shl.b64 %rd518, %rd104, 1; + shl.b64 %rd520, %rd105, 1; + shl.b64 %rd522, %rd106, 1; + shl.b64 %rd524, %rd107, 2; + mov.b32 %r13519, %r13518; + mov.b32 %r13520, %r13518; + mov.b32 %r13521, %r13518; + mov.b32 %r13522, %r13518; + mov.b32 %r13523, %r13518; + mov.b32 %r13524, %r13518; + mov.b32 %r13525, %r13518; + mov.b32 %r13526, %r13518; + mov.b32 %r13527, %r13518; + mov.b32 %r13528, %r13518; + mov.b32 %r13529, %r13518; + mov.b32 %r13530, %r13518; + mov.b32 %r13531, %r13518; + mov.b32 %r13532, %r13518; + mov.b32 %r13533, %r13518; + mov.b32 %r13534, %r13518; + mov.b32 %r13535, %r13518; + mov.b32 %r13536, %r13518; + mov.b32 %r13537, %r13518; + mov.b32 %r13538, %r13518; + mov.b32 %r13539, %r13518; + mov.b32 %r13540, %r13518; + mov.b32 %r13541, %r13518; + mov.b32 %r13542, %r13518; + mov.b32 %r13543, %r13518; + mov.b32 %r13544, %r13518; + mov.b32 %r13545, %r13518; + mov.b32 %r13546, %r13518; + mov.b32 %r13547, %r13518; + mov.b32 %r13548, %r13518; + mov.b32 %r13549, %r13518; + mov.b32 %r13550, %r13518; + mov.b32 %r13551, %r13518; + mov.b32 %r13552, %r13518; + mov.b32 %r13553, %r13518; + mov.b32 %r13554, %r13518; + mov.b32 %r13555, %r13518; + mov.b32 %r13556, %r13518; + mov.b32 %r13557, %r13518; + mov.b32 %r13558, %r13518; + mov.b32 %r13559, %r13518; + mov.b32 %r13560, %r13518; + mov.b32 %r13561, %r13518; + mov.b32 %r13562, %r13518; + mov.b32 %r13563, %r13518; + mov.b32 %r13564, %r13518; + mov.b32 %r13565, %r13518; + mov.b32 %r13566, %r13518; + mov.b32 %r13567, %r13518; + mov.b32 %r13568, %r13518; + mov.b32 %r13569, %r13518; + mov.b32 %r13570, %r13518; + mov.b32 %r13571, %r13518; + mov.b32 %r13572, %r13518; + mov.b32 %r13573, %r13518; + mov.b32 %r13574, %r13518; + mov.b32 %r13575, %r13518; + mov.b32 %r13576, %r13518; + mov.b32 %r13577, %r13518; + mov.b32 %r13578, %r13518; + mov.b32 %r13579, %r13518; + mov.b32 %r13580, %r13518; + mov.b32 %r13581, %r13518; + mov.b32 %r13454, %r13518; + mov.b32 %r13455, %r13518; + mov.b32 %r13456, %r13518; + mov.b32 %r13457, %r13518; + mov.b32 %r13458, %r13518; + mov.b32 %r13459, %r13518; + mov.b32 %r13460, %r13518; + mov.b32 %r13461, %r13518; + mov.b32 %r13462, %r13518; + mov.b32 %r13463, %r13518; + mov.b32 %r13464, %r13518; + mov.b32 %r13465, %r13518; + mov.b32 %r13466, %r13518; + mov.b32 %r13467, %r13518; + mov.b32 %r13468, %r13518; + mov.b32 %r13469, %r13518; + mov.b32 %r13470, %r13518; + mov.b32 %r13471, %r13518; + mov.b32 %r13472, %r13518; + mov.b32 %r13473, %r13518; + mov.b32 %r13474, %r13518; + mov.b32 %r13475, %r13518; + mov.b32 %r13476, %r13518; + mov.b32 %r13477, %r13518; + mov.b32 %r13478, %r13518; + mov.b32 %r13479, %r13518; + mov.b32 %r13480, %r13518; + mov.b32 %r13481, %r13518; + mov.b32 %r13482, %r13518; + mov.b32 %r13483, %r13518; + mov.b32 %r13484, %r13518; + mov.b32 %r13485, %r13518; + mov.b32 %r13486, %r13518; + mov.b32 %r13487, %r13518; + mov.b32 %r13488, %r13518; + mov.b32 %r13489, %r13518; + mov.b32 %r13490, %r13518; + mov.b32 %r13491, %r13518; + mov.b32 %r13492, %r13518; + mov.b32 %r13493, %r13518; + mov.b32 %r13494, %r13518; + mov.b32 %r13495, %r13518; + mov.b32 %r13496, %r13518; + mov.b32 %r13497, %r13518; + mov.b32 %r13498, %r13518; + mov.b32 %r13499, %r13518; + mov.b32 %r13500, %r13518; + mov.b32 %r13501, %r13518; + mov.b32 %r13502, %r13518; + mov.b32 %r13503, %r13518; + mov.b32 %r13504, %r13518; + mov.b32 %r13505, %r13518; + mov.b32 %r13506, %r13518; + mov.b32 %r13507, %r13518; + mov.b32 %r13508, %r13518; + mov.b32 %r13509, %r13518; + mov.b32 %r13510, %r13518; + mov.b32 %r13511, %r13518; + mov.b32 %r13512, %r13518; + mov.b32 %r13513, %r13518; + mov.b32 %r13514, %r13518; + mov.b32 %r13515, %r13518; + mov.b32 %r13516, %r13518; + mov.b32 %r13517, %r13518; + bra.uni $L__BB0_9; +$L__BB0_15: // %._crit_edge1692 + // in Loop: Header=BB0_9 Depth=1 +$L__tmp23: + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + // wait for regs: %r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517,%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp24: + .loc 1 262 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:262:30 + add.s64 %rd1023, %rd1023, 1; + setp.ne.b64 %p282, %rd1023, 4; + @%p282 bra $L__BB0_9; + bra.uni $L__BB0_16; +$L__BB0_9: // =>This Loop Header: Depth=1 + // Child Loop BB0_11 Depth 2 + // Child Loop BB0_14 Depth 2 + .loc 1 0 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:30 + setp.eq.b32 %p186, %r612, 0; +$L__tmp25: + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.lt.s32 %p42, %r601, 1; +$L__tmp26: + .loc 1 263 51 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:263:51 + add.s64 %rd297, %rd1023, %rd108; + .loc 1 266 44 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:266:44 + cvt.u32.u64 %r2191, %rd297; + shl.b32 %r2192, %r2191, 7; + add.s32 %r2193, %r2192, %r596; + .loc 1 267 36 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:267:36 + shl.b32 %r2194, %r2191, 18; + .loc 1 267 46 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:267:46 + add.s32 %r2195, %r2194, %r596; + .loc 1 269 50 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:269:50 + shl.b32 %r2196, %r2191, 11; + add.s32 %r2197, %r2196, %r605; + .loc 1 271 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:271:21 + mad.wide.s32 %rd126, %r2193, 2, %rd180; + .loc 1 272 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:272:23 + mad.wide.s32 %rd127, %r2195, 2, %rd183; + .loc 1 275 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:275:25 + mul.wide.s32 %rd298, %r2197, 4; + add.s64 %rd128, %rd181, %rd298; + .loc 1 276 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:276:29 + add.s64 %rd129, %rd182, %rd298; +$L__tmp27: + .loc 1 601 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd300, %rd126, %rd299; + add.s64 %rd302, %rd126, %rd301; + add.s64 %rd304, %rd126, %rd303; + add.s64 %rd306, %rd126, %rd305; + .loc 1 601 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b64 %rd307, %rd87, 1; + add.s64 %rd249, %rd300, %rd307; + add.s64 %rd250, %rd302, %rd307; + add.s64 %rd251, %rd304, %rd307; + add.s64 %rd252, %rd306, %rd307; + .loc 1 602 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd309, %rd127, %rd308; + add.s64 %rd311, %rd127, %rd310; + add.s64 %rd313, %rd127, %rd312; + add.s64 %rd315, %rd127, %rd314; + .loc 1 602 51 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:51 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd261, %rd309, %rd307; + add.s64 %rd262, %rd311, %rd307; + add.s64 %rd263, %rd313, %rd307; + add.s64 %rd264, %rd315, %rd307; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5062 + 0 ], [ %rd249 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5064 + 0 ], [ %rd250 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5066 + 0 ], [ %rd251 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5068 + 0 ], [ %rd252 + 0 ], 0x10, %r2096; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd253, %rd128, %rd316; + cvt.s64.s32 %rd317, %r597; + cvt.u64.u32 %rd130, %r600; + add.s64 %rd318, %rd317, %rd130; + shl.b64 %rd319, %rd318, 2; + add.s64 %rd320, %rd128, %rd319; + add.s64 %rd254, %rd320, 32; + add.s64 %rd255, %rd320, 64; + add.s64 %rd256, %rd320, 96; + add.s64 %rd257, %rd320, 128; + add.s64 %rd258, %rd320, 160; + add.s64 %rd259, %rd320, 192; + add.s64 %rd260, %rd320, 224; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5070 + 0 ], [ %rd253 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5072 + 0 ], [ %rd254 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5074 + 0 ], [ %rd255 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5076 + 0 ], [ %rd256 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5078 + 0 ], [ %rd257 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5080 + 0 ], [ %rd258 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5082 + 0 ], [ %rd259 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5084 + 0 ], [ %rd260 + 0 ], 0x8, %r2104; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5086 + 0 ], [ %rd261 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5088 + 0 ], [ %rd262 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5090 + 0 ], [ %rd263 + 0 ], 0x10, %r2096; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5092 + 0 ], [ %rd264 + 0 ], 0x10, %r2096; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd265, %rd129, %rd316; + add.s64 %rd321, %rd129, %rd319; + add.s64 %rd266, %rd321, 32; + add.s64 %rd267, %rd321, 64; + add.s64 %rd268, %rd321, 96; + add.s64 %rd269, %rd321, 128; + add.s64 %rd270, %rd321, 160; + add.s64 %rd271, %rd321, 192; + add.s64 %rd272, %rd321, 224; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5094 + 0 ], [ %rd265 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5096 + 0 ], [ %rd266 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5098 + 0 ], [ %rd267 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5100 + 0 ], [ %rd268 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5102 + 0 ], [ %rd269 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5104 + 0 ], [ %rd270 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5106 + 0 ], [ %rd271 + 0 ], 0x8, %r2104; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5108 + 0 ], [ %rd272 + 0 ], 0x8, %r2104; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd1031, %rd249, 524288; + add.s64 %rd1030, %rd250, 524288; + add.s64 %rd1029, %rd251, 524288; + add.s64 %rd1028, %rd252, 524288; + .loc 1 627 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd1027, %rd261, 16384; + add.s64 %rd1026, %rd262, 16384; + add.s64 %rd1025, %rd263, 16384; + add.s64 %rd1024, %rd264, 16384; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r5110 + 0 ], [ %rd1031 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5112 + 0 ], [ %rd1030 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5114 + 0 ], [ %rd1029 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5116 + 0 ], [ %rd1028 + 0 ], 0x10, %r2144; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd277, %rd253, 256; + add.s64 %rd322, %rd319, 256; + add.s64 %rd323, %rd128, %rd322; + add.s64 %rd278, %rd323, 32; + add.s64 %rd279, %rd323, 64; + add.s64 %rd280, %rd323, 96; + add.s64 %rd281, %rd323, 128; + add.s64 %rd282, %rd323, 160; + add.s64 %rd283, %rd323, 192; + add.s64 %rd284, %rd323, 224; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5118 + 0 ], [ %rd277 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5120 + 0 ], [ %rd278 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5122 + 0 ], [ %rd279 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5124 + 0 ], [ %rd280 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5126 + 0 ], [ %rd281 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5128 + 0 ], [ %rd282 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5130 + 0 ], [ %rd283 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5132 + 0 ], [ %rd284 + 0 ], 0x8, %r2152; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r5134 + 0 ], [ %rd1027 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5136 + 0 ], [ %rd1026 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5138 + 0 ], [ %rd1025 + 0 ], 0x10, %r2144; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5140 + 0 ], [ %rd1024 + 0 ], 0x10, %r2144; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd289, %rd265, 256; + add.s64 %rd324, %rd129, %rd322; + add.s64 %rd290, %rd324, 32; + add.s64 %rd291, %rd324, 64; + add.s64 %rd292, %rd324, 96; + add.s64 %rd293, %rd324, 128; + add.s64 %rd294, %rd324, 160; + add.s64 %rd295, %rd324, 192; + add.s64 %rd296, %rd324, 224; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5142 + 0 ], [ %rd289 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5144 + 0 ], [ %rd290 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5146 + 0 ], [ %rd291 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5148 + 0 ], [ %rd292 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5150 + 0 ], [ %rd293 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5152 + 0 ], [ %rd294 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5154 + 0 ], [ %rd295 + 0 ], 0x8, %r2152; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5156 + 0 ], [ %rd296 + 0 ], 0x8, %r2152; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + @%p42 bra $L__BB0_12; +// %bb.10: // %.lr.ph.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:28 + mov.b32 %r2753, 0; + mov.b32 %r13443, 1; + mov.b32 %r13442, -1; + mov.b32 %r13441, 64; + mov.b32 %r13444, %r13442; + mov.b32 %r13445, %r13443; + mov.b32 %r13446, %r642; + mov.b32 %r13447, %r641; + mov.b32 %r13448, %r640; + mov.b32 %r13449, %r639; + mov.b32 %r13450, %r638; + mov.b32 %r13451, %r637; + mov.b32 %r13452, %r636; + mov.b32 %r13453, %r635; + mov.b32 %r13582, %r2753; + mov.b32 %r13583, %r687; + mov.b32 %r13584, %r688; + mov.b32 %r13585, %r689; + mov.b32 %r13586, %r690; + mov.b32 %r13587, %r691; + mov.b32 %r13588, %r692; + mov.b32 %r13589, %r693; + mov.b32 %r13590, %r694; + mov.b32 %r13591, %r695; + mov.b32 %r13592, %r696; + mov.b32 %r13593, %r697; + mov.b32 %r13594, %r698; + mov.b32 %r13595, %r699; + mov.b32 %r13596, %r700; + mov.b32 %r13597, %r701; + mov.b32 %r13598, %r702; +$L__BB0_11: // %.lr.ph + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.lt.s32 %p83, %r13582, %r669; + setp.lt.s32 %p65, %r13582, %r670; + add.s32 %r4420, %r13442, 1; + setp.gt.s32 %p84, %r4420, 1; + selp.b32 %r13442, 0, %r4420, %p84; + add.s32 %r4421, %r13444, 1; + setp.gt.s32 %p85, %r4421, 2; + selp.b32 %r13444, 0, %r4421, %p85; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r4422, %r13444, 14; + add.s32 %r2755, %r2045, %r4422; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4424, %r13442, 8; + add.s32 %r4425, %r2045, 98304; + add.s32 %r4426, %r4425, %r4424; + add.s32 %r4427, %r4426, %r613; + ld.shared.v2.b32 {%r4428, %r4429}, [%r4427]; + ld.shared.v2.b32 {%r4430, %r4431}, [%r4427+32]; + ld.shared.v2.b32 {%r4432, %r4433}, [%r4427+64]; + ld.shared.v2.b32 {%r4434, %r4435}, [%r4427+96]; + ld.shared.v2.b32 {%r4436, %r4437}, [%r4427+128]; + ld.shared.v2.b32 {%r4438, %r4439}, [%r4427+160]; + ld.shared.v2.b32 {%r4440, %r4441}, [%r4427+192]; + ld.shared.v2.b32 {%r4442, %r4443}, [%r4427+224]; + .loc 1 675 26 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:675:26 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.eq.f32 %p86, %r4428, 0fFF800000; + setp.eq.f32 %p87, %r4429, 0fFF800000; + setp.eq.f32 %p88, %r4430, 0fFF800000; + setp.eq.f32 %p89, %r4431, 0fFF800000; + setp.eq.f32 %p90, %r4432, 0fFF800000; + setp.eq.f32 %p91, %r4433, 0fFF800000; + setp.eq.f32 %p92, %r4434, 0fFF800000; + setp.eq.f32 %p93, %r4435, 0fFF800000; + setp.eq.f32 %p94, %r4436, 0fFF800000; + setp.eq.f32 %p95, %r4437, 0fFF800000; + setp.eq.f32 %p96, %r4438, 0fFF800000; + setp.eq.f32 %p97, %r4439, 0fFF800000; + setp.eq.f32 %p98, %r4440, 0fFF800000; + setp.eq.f32 %p99, %r4441, 0fFF800000; + setp.eq.f32 %p100, %r4442, 0fFF800000; + setp.eq.f32 %p101, %r4443, 0fFF800000; + .loc 1 675 46 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:675:46 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4444, 0f00000000, %r4428, %p86; + selp.f32 %r4445, 0f00000000, %r4429, %p87; + selp.f32 %r4446, 0f00000000, %r4430, %p88; + selp.f32 %r4447, 0f00000000, %r4431, %p89; + selp.f32 %r4448, 0f00000000, %r4432, %p90; + selp.f32 %r4449, 0f00000000, %r4433, %p91; + selp.f32 %r4450, 0f00000000, %r4434, %p92; + selp.f32 %r4451, 0f00000000, %r4435, %p93; + selp.f32 %r4452, 0f00000000, %r4436, %p94; + selp.f32 %r4453, 0f00000000, %r4437, %p95; + selp.f32 %r4454, 0f00000000, %r4438, %p96; + selp.f32 %r4455, 0f00000000, %r4439, %p97; + selp.f32 %r4456, 0f00000000, %r4440, %p98; + selp.f32 %r4457, 0f00000000, %r4441, %p99; + selp.f32 %r4458, 0f00000000, %r4442, %p100; + selp.f32 %r4459, 0f00000000, %r4443, %p101; + .loc 1 676 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:676:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shfl.sync.idx.b32 %r4460, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4461, %r4460, 11; + and.b32 %r4462, %r4461, 8192; + add.s32 %r2714, %r2045, 99328; + add.s32 %r4463, %r4462, %r2714; + bfe.u32 %r4464, %r4463, 4, 14; + cvt.u64.u32 %rd395, %r4464; + or.b64 %rd325, %rd395, 4611686293372403712; + bfe.u32 %r4465, %r2755, 4, 14; + cvt.u64.u32 %rd396, %r4465; + or.b64 %rd326, %rd396, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd325, %rd326, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r4466, %r4462, 32; + add.s32 %r4467, %r4466, %r2714; + bfe.u32 %r4468, %r4467, 4, 14; + cvt.u64.u32 %rd397, %r4468; + or.b64 %rd327, %rd397, 4611686293372403712; + add.s32 %r4469, %r2755, 32; + bfe.u32 %r4470, %r4469, 4, 14; + cvt.u64.u32 %rd398, %r4470; + or.b64 %rd328, %rd398, 4611686293338849280; + mov.pred %p43, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd327, %rd328, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4471, %r4462, 64; + add.s32 %r4472, %r4471, %r2714; + bfe.u32 %r4473, %r4472, 4, 14; + cvt.u64.u32 %rd399, %r4473; + or.b64 %rd329, %rd399, 4611686293372403712; + add.s32 %r4474, %r2755, 64; + bfe.u32 %r4475, %r4474, 4, 14; + cvt.u64.u32 %rd400, %r4475; + or.b64 %rd330, %rd400, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd329, %rd330, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4476, %r4462, 96; + add.s32 %r4477, %r4476, %r2714; + bfe.u32 %r4478, %r4477, 4, 14; + cvt.u64.u32 %rd401, %r4478; + or.b64 %rd331, %rd401, 4611686293372403712; + add.s32 %r4479, %r2755, 96; + bfe.u32 %r4480, %r4479, 4, 14; + cvt.u64.u32 %rd402, %r4480; + or.b64 %rd332, %rd402, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd331, %rd332, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4481, %r4462, 16384; + add.s32 %r4482, %r4481, %r2714; + bfe.u32 %r4483, %r4482, 4, 14; + cvt.u64.u32 %rd403, %r4483; + or.b64 %rd333, %rd403, 4611686293372403712; + add.s32 %r4484, %r2755, 8192; + bfe.u32 %r4485, %r4484, 4, 14; + cvt.u64.u32 %rd404, %r4485; + or.b64 %rd334, %rd404, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd333, %rd334, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4486, %r4462, 16416; + add.s32 %r4487, %r4486, %r2714; + bfe.u32 %r4488, %r4487, 4, 14; + cvt.u64.u32 %rd405, %r4488; + or.b64 %rd335, %rd405, 4611686293372403712; + add.s32 %r4489, %r2755, 8224; + bfe.u32 %r4490, %r4489, 4, 14; + cvt.u64.u32 %rd406, %r4490; + or.b64 %rd336, %rd406, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd335, %rd336, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4491, %r4462, 16448; + add.s32 %r4492, %r4491, %r2714; + bfe.u32 %r4493, %r4492, 4, 14; + cvt.u64.u32 %rd407, %r4493; + or.b64 %rd337, %rd407, 4611686293372403712; + add.s32 %r4494, %r2755, 8256; + bfe.u32 %r4495, %r4494, 4, 14; + cvt.u64.u32 %rd408, %r4495; + or.b64 %rd338, %rd408, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd337, %rd338, %p43, 1, 1, 0, 0; + // end inline asm + or.b32 %r4496, %r4462, 16480; + add.s32 %r4497, %r4496, %r2714; + bfe.u32 %r4498, %r4497, 4, 14; + cvt.u64.u32 %rd409, %r4498; + or.b64 %rd339, %rd409, 4611686293372403712; + add.s32 %r4499, %r2755, 8288; + bfe.u32 %r4500, %r4499, 4, 14; + cvt.u64.u32 %rd410, %r4500; + or.b64 %rd340, %rd410, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329}, %rd339, %rd340, %p43, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r2717, %r2755; + mov.b32 %r2715, %r2753; + mov.b32 %r2716, %r2753; + mov.b32 %r2718, %r2753; + mov.b32 %r2719, %r2753; + // begin inline asm + // wait for regs: %r2298,%r2299,%r2300,%r2301,%r2302,%r2303,%r2304,%r2305,%r2306,%r2307,%r2308,%r2309,%r2310,%r2311,%r2312,%r2313,%r2314,%r2315,%r2316,%r2317,%r2318,%r2319,%r2320,%r2321,%r2322,%r2323,%r2324,%r2325,%r2326,%r2327,%r2328,%r2329,%r2714,%r2715,%r2716,%r2717,%r2718,%r2719 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:678:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4501, %r2298, 0f3DB504F3; + mul.f32 %r4502, %r2299, 0f3DB504F3; + mul.f32 %r4503, %r2300, 0f3DB504F3; + mul.f32 %r4504, %r2301, 0f3DB504F3; + mul.f32 %r4505, %r2302, 0f3DB504F3; + mul.f32 %r4506, %r2303, 0f3DB504F3; + mul.f32 %r4507, %r2304, 0f3DB504F3; + mul.f32 %r4508, %r2305, 0f3DB504F3; + mul.f32 %r4509, %r2306, 0f3DB504F3; + mul.f32 %r4510, %r2307, 0f3DB504F3; + mul.f32 %r4511, %r2308, 0f3DB504F3; + mul.f32 %r4512, %r2309, 0f3DB504F3; + mul.f32 %r4513, %r2310, 0f3DB504F3; + mul.f32 %r4514, %r2311, 0f3DB504F3; + mul.f32 %r4515, %r2312, 0f3DB504F3; + mul.f32 %r4516, %r2313, 0f3DB504F3; + mul.f32 %r4517, %r2314, 0f3DB504F3; + mul.f32 %r4518, %r2315, 0f3DB504F3; + mul.f32 %r4519, %r2316, 0f3DB504F3; + mul.f32 %r4520, %r2317, 0f3DB504F3; + mul.f32 %r4521, %r2318, 0f3DB504F3; + mul.f32 %r4522, %r2319, 0f3DB504F3; + mul.f32 %r4523, %r2320, 0f3DB504F3; + mul.f32 %r4524, %r2321, 0f3DB504F3; + mul.f32 %r4525, %r2322, 0f3DB504F3; + mul.f32 %r4526, %r2323, 0f3DB504F3; + mul.f32 %r4527, %r2324, 0f3DB504F3; + mul.f32 %r4528, %r2325, 0f3DB504F3; + mul.f32 %r4529, %r2326, 0f3DB504F3; + mul.f32 %r4530, %r2327, 0f3DB504F3; + mul.f32 %r4531, %r2328, 0f3DB504F3; + mul.f32 %r4532, %r2329, 0f3DB504F3; + .loc 1 698 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:698:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.ge.s32 %p102, %r13598, %r594; + setp.ge.s32 %p103, %r13597, %r594; + setp.ge.s32 %p104, %r13598, %r595; + setp.ge.s32 %p105, %r13597, %r595; + setp.ge.s32 %p106, %r13596, %r594; + setp.ge.s32 %p107, %r13595, %r594; + setp.ge.s32 %p108, %r13596, %r595; + setp.ge.s32 %p109, %r13595, %r595; + setp.ge.s32 %p110, %r13594, %r594; + setp.ge.s32 %p111, %r13593, %r594; + setp.ge.s32 %p112, %r13594, %r595; + setp.ge.s32 %p113, %r13593, %r595; + setp.ge.s32 %p114, %r13592, %r594; + setp.ge.s32 %p115, %r13591, %r594; + setp.ge.s32 %p116, %r13592, %r595; + setp.ge.s32 %p117, %r13591, %r595; + setp.ge.s32 %p118, %r13590, %r594; + setp.ge.s32 %p119, %r13589, %r594; + setp.ge.s32 %p120, %r13590, %r595; + setp.ge.s32 %p121, %r13589, %r595; + setp.ge.s32 %p122, %r13588, %r594; + setp.ge.s32 %p123, %r13587, %r594; + setp.ge.s32 %p124, %r13588, %r595; + setp.ge.s32 %p125, %r13587, %r595; + setp.ge.s32 %p126, %r13586, %r594; + setp.ge.s32 %p127, %r13585, %r594; + setp.ge.s32 %p128, %r13586, %r595; + setp.ge.s32 %p129, %r13585, %r595; + setp.ge.s32 %p130, %r13584, %r594; + setp.ge.s32 %p131, %r13583, %r594; + setp.ge.s32 %p132, %r13584, %r595; + setp.ge.s32 %p133, %r13583, %r595; + .loc 1 704 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:704:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.s64.s32 %rd411, %r13598; + cvt.s64.s32 %rd412, %r13597; + cvt.s64.s32 %rd413, %r13596; + cvt.s64.s32 %rd414, %r13595; + cvt.s64.s32 %rd415, %r13594; + cvt.s64.s32 %rd416, %r13593; + cvt.s64.s32 %rd417, %r13592; + cvt.s64.s32 %rd418, %r13591; + cvt.s64.s32 %rd419, %r13590; + cvt.s64.s32 %rd420, %r13589; + cvt.s64.s32 %rd421, %r13588; + cvt.s64.s32 %rd422, %r13587; + cvt.s64.s32 %rd423, %r13586; + cvt.s64.s32 %rd424, %r13585; + cvt.s64.s32 %rd425, %r13584; + cvt.s64.s32 %rd426, %r13583; + setp.gt.s64 %p134, %rd217, %rd426; + setp.gt.s64 %p135, %rd217, %rd425; + setp.gt.s64 %p136, %rd217, %rd424; + setp.gt.s64 %p137, %rd217, %rd423; + setp.gt.s64 %p138, %rd217, %rd422; + setp.gt.s64 %p139, %rd217, %rd421; + setp.gt.s64 %p140, %rd217, %rd420; + setp.gt.s64 %p141, %rd217, %rd419; + setp.gt.s64 %p142, %rd217, %rd418; + setp.gt.s64 %p143, %rd217, %rd417; + setp.gt.s64 %p144, %rd217, %rd416; + setp.gt.s64 %p145, %rd217, %rd415; + setp.gt.s64 %p146, %rd217, %rd414; + setp.gt.s64 %p147, %rd217, %rd413; + setp.gt.s64 %p148, %rd217, %rd412; + setp.gt.s64 %p149, %rd217, %rd411; + .loc 1 706 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:706:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + and.pred %p150, %p102, %p149; + and.pred %p151, %p103, %p148; + and.pred %p152, %p104, %p149; + and.pred %p153, %p105, %p148; + and.pred %p154, %p106, %p147; + and.pred %p155, %p107, %p146; + and.pred %p156, %p108, %p147; + and.pred %p157, %p109, %p146; + and.pred %p158, %p110, %p145; + and.pred %p159, %p111, %p144; + and.pred %p160, %p112, %p145; + and.pred %p161, %p113, %p144; + and.pred %p162, %p114, %p143; + and.pred %p163, %p115, %p142; + and.pred %p164, %p116, %p143; + and.pred %p165, %p117, %p142; + and.pred %p166, %p118, %p141; + and.pred %p167, %p119, %p140; + and.pred %p168, %p120, %p141; + and.pred %p169, %p121, %p140; + and.pred %p170, %p122, %p139; + and.pred %p171, %p123, %p138; + and.pred %p172, %p124, %p139; + and.pred %p173, %p125, %p138; + and.pred %p174, %p126, %p137; + and.pred %p175, %p127, %p136; + and.pred %p176, %p128, %p137; + and.pred %p177, %p129, %p136; + and.pred %p178, %p130, %p135; + and.pred %p179, %p131, %p134; + and.pred %p180, %p132, %p135; + and.pred %p181, %p133, %p134; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4533, %r4501, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4534, %r4533, 0fFF800000, %p150; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4535, %r4502, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4536, %r4535, 0fFF800000, %p151; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4537, %r4503, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4538, %r4537, 0fFF800000, %p152; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4539, %r4504, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4540, %r4539, 0fFF800000, %p153; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4541, %r4505, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4542, %r4541, 0fFF800000, %p154; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4543, %r4506, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4544, %r4543, 0fFF800000, %p155; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4545, %r4507, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4546, %r4545, 0fFF800000, %p156; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4547, %r4508, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4548, %r4547, 0fFF800000, %p157; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4549, %r4509, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4550, %r4549, 0fFF800000, %p158; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4551, %r4510, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4552, %r4551, 0fFF800000, %p159; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4553, %r4511, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4554, %r4553, 0fFF800000, %p160; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4555, %r4512, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4556, %r4555, 0fFF800000, %p161; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4557, %r4513, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4558, %r4557, 0fFF800000, %p162; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4559, %r4514, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4560, %r4559, 0fFF800000, %p163; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4561, %r4515, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4562, %r4561, 0fFF800000, %p164; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4563, %r4516, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4564, %r4563, 0fFF800000, %p165; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4565, %r4517, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4566, %r4565, 0fFF800000, %p166; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4567, %r4518, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4568, %r4567, 0fFF800000, %p167; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4569, %r4519, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4570, %r4569, 0fFF800000, %p168; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4571, %r4520, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4572, %r4571, 0fFF800000, %p169; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4573, %r4521, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4574, %r4573, 0fFF800000, %p170; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4575, %r4522, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4576, %r4575, 0fFF800000, %p171; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4577, %r4523, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4578, %r4577, 0fFF800000, %p172; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4579, %r4524, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4580, %r4579, 0fFF800000, %p173; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4581, %r4525, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4582, %r4581, 0fFF800000, %p174; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4583, %r4526, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4584, %r4583, 0fFF800000, %p175; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4585, %r4527, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4586, %r4585, 0fFF800000, %p176; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4587, %r4528, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4588, %r4587, 0fFF800000, %p177; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4589, %r4529, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4590, %r4589, 0fFF800000, %p178; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4591, %r4530, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4592, %r4591, 0fFF800000, %p179; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4593, %r4531, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4594, %r4593, 0fFF800000, %p180; + .loc 1 739 27 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:739:27 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4595, %r4532, 0f3FB8AA3B; + .loc 1 736 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:736:69 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.f32 %r4596, %r4595, 0fFF800000, %p181; + .loc 1 740 40 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:740:40 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + sub.f32 %r4597, %r4534, %r4444; + sub.f32 %r4598, %r4536, %r4445; + sub.f32 %r4599, %r4538, %r4444; + sub.f32 %r4600, %r4540, %r4445; + sub.f32 %r4601, %r4542, %r4446; + sub.f32 %r4602, %r4544, %r4447; + sub.f32 %r4603, %r4546, %r4446; + sub.f32 %r4604, %r4548, %r4447; + sub.f32 %r4605, %r4550, %r4448; + sub.f32 %r4606, %r4552, %r4449; + sub.f32 %r4607, %r4554, %r4448; + sub.f32 %r4608, %r4556, %r4449; + sub.f32 %r4609, %r4558, %r4450; + sub.f32 %r4610, %r4560, %r4451; + sub.f32 %r4611, %r4562, %r4450; + sub.f32 %r4612, %r4564, %r4451; + sub.f32 %r4613, %r4566, %r4452; + sub.f32 %r4614, %r4568, %r4453; + sub.f32 %r4615, %r4570, %r4452; + sub.f32 %r4616, %r4572, %r4453; + sub.f32 %r4617, %r4574, %r4454; + sub.f32 %r4618, %r4576, %r4455; + sub.f32 %r4619, %r4578, %r4454; + sub.f32 %r4620, %r4580, %r4455; + sub.f32 %r4621, %r4582, %r4456; + sub.f32 %r4622, %r4584, %r4457; + sub.f32 %r4623, %r4586, %r4456; + sub.f32 %r4624, %r4588, %r4457; + sub.f32 %r4625, %r4590, %r4458; + sub.f32 %r4626, %r4592, %r4459; + sub.f32 %r4627, %r4594, %r4458; + sub.f32 %r4628, %r4596, %r4459; + .loc 1 740 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:740:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + ex2.approx.ftz.f32 %r4629, %r4597; + ex2.approx.ftz.f32 %r4630, %r4598; + ex2.approx.ftz.f32 %r4631, %r4599; + ex2.approx.ftz.f32 %r4632, %r4600; + ex2.approx.ftz.f32 %r4633, %r4601; + ex2.approx.ftz.f32 %r4634, %r4602; + ex2.approx.ftz.f32 %r4635, %r4603; + ex2.approx.ftz.f32 %r4636, %r4604; + ex2.approx.ftz.f32 %r4637, %r4605; + ex2.approx.ftz.f32 %r4638, %r4606; + ex2.approx.ftz.f32 %r4639, %r4607; + ex2.approx.ftz.f32 %r4640, %r4608; + ex2.approx.ftz.f32 %r4641, %r4609; + ex2.approx.ftz.f32 %r4642, %r4610; + ex2.approx.ftz.f32 %r4643, %r4611; + ex2.approx.ftz.f32 %r4644, %r4612; + ex2.approx.ftz.f32 %r4645, %r4613; + ex2.approx.ftz.f32 %r4646, %r4614; + ex2.approx.ftz.f32 %r4647, %r4615; + ex2.approx.ftz.f32 %r4648, %r4616; + ex2.approx.ftz.f32 %r4649, %r4617; + ex2.approx.ftz.f32 %r4650, %r4618; + ex2.approx.ftz.f32 %r4651, %r4619; + ex2.approx.ftz.f32 %r4652, %r4620; + ex2.approx.ftz.f32 %r4653, %r4621; + ex2.approx.ftz.f32 %r4654, %r4622; + ex2.approx.ftz.f32 %r4655, %r4623; + ex2.approx.ftz.f32 %r4656, %r4624; + ex2.approx.ftz.f32 %r4657, %r4625; + ex2.approx.ftz.f32 %r4658, %r4626; + ex2.approx.ftz.f32 %r4659, %r4627; + ex2.approx.ftz.f32 %r4660, %r4628; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4661, %r2045, 49152; + add.s32 %r3801, %r4661, %r4422; + .loc 1 744 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:744:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16x2.f32 %r2886, %r4630, %r4629; + cvt.rn.bf16x2.f32 %r2887, %r4632, %r4631; + cvt.rn.bf16x2.f32 %r2888, %r4634, %r4633; + cvt.rn.bf16x2.f32 %r2889, %r4636, %r4635; + cvt.rn.bf16x2.f32 %r3018, %r4638, %r4637; + cvt.rn.bf16x2.f32 %r3019, %r4640, %r4639; + cvt.rn.bf16x2.f32 %r3020, %r4642, %r4641; + cvt.rn.bf16x2.f32 %r3021, %r4644, %r4643; + cvt.rn.bf16x2.f32 %r3150, %r4646, %r4645; + cvt.rn.bf16x2.f32 %r3151, %r4648, %r4647; + cvt.rn.bf16x2.f32 %r3152, %r4650, %r4649; + cvt.rn.bf16x2.f32 %r3153, %r4652, %r4651; + cvt.rn.bf16x2.f32 %r3282, %r4654, %r4653; + cvt.rn.bf16x2.f32 %r3283, %r4656, %r4655; + cvt.rn.bf16x2.f32 %r3284, %r4658, %r4657; + cvt.rn.bf16x2.f32 %r3285, %r4660, %r4659; + .loc 1 744 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:744:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4662, %r3801, 4, 14; + cvt.u64.u32 %rd427, %r4662; + or.b64 %rd341, %rd427, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r2886,%r2887,%r2888,%r2889}, %rd341, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4663, %r3801, 2048; + bfe.u32 %r4664, %r4663, 4, 14; + cvt.u64.u32 %rd428, %r4664; + or.b64 %rd342, %rd428, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3018,%r3019,%r3020,%r3021}, %rd342, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4665, %r3801, 4096; + bfe.u32 %r4666, %r4665, 4, 14; + cvt.u64.u32 %rd429, %r4666; + or.b64 %rd343, %rd429, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3150,%r3151,%r3152,%r3153}, %rd343, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4667, %r3801, 6144; + bfe.u32 %r4668, %r4667, 4, 14; + cvt.u64.u32 %rd430, %r4668; + or.b64 %rd344, %rd430, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r3282,%r3283,%r3284,%r3285}, %rd344, %p43, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4669, %r2045, 98816; + add.s32 %r4670, %r4669, %r4424; + add.s32 %r4671, %r4670, %r613; + ld.shared.v2.b32 {%r4672, %r4673}, [%r4671]; + ld.shared.v2.b32 {%r4674, %r4675}, [%r4671+32]; + ld.shared.v2.b32 {%r4676, %r4677}, [%r4671+64]; + ld.shared.v2.b32 {%r4678, %r4679}, [%r4671+96]; + ld.shared.v2.b32 {%r4680, %r4681}, [%r4671+128]; + ld.shared.v2.b32 {%r4682, %r4683}, [%r4671+160]; + ld.shared.v2.b32 {%r4684, %r4685}, [%r4671+192]; + ld.shared.v2.b32 {%r4686, %r4687}, [%r4671+224]; + .loc 1 750 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:750:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + wgmma.fence.sync.aligned; + add.s32 %r3798, %r2045, 132096; + add.s32 %r4688, %r4462, %r3798; + bfe.u32 %r4689, %r4688, 4, 14; + cvt.u64.u32 %rd431, %r4689; + or.b64 %rd345, %rd431, 4611686293372403712; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd345, %rd341, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4690, %r4466, %r3798; + bfe.u32 %r4691, %r4690, 4, 14; + cvt.u64.u32 %rd432, %r4691; + or.b64 %rd347, %rd432, 4611686293372403712; + add.s32 %r4692, %r3801, 32; + bfe.u32 %r4693, %r4692, 4, 14; + cvt.u64.u32 %rd433, %r4693; + or.b64 %rd348, %rd433, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd347, %rd348, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4694, %r4471, %r3798; + bfe.u32 %r4695, %r4694, 4, 14; + cvt.u64.u32 %rd434, %r4695; + or.b64 %rd349, %rd434, 4611686293372403712; + add.s32 %r4696, %r3801, 64; + bfe.u32 %r4697, %r4696, 4, 14; + cvt.u64.u32 %rd435, %r4697; + or.b64 %rd350, %rd435, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd349, %rd350, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4698, %r4476, %r3798; + bfe.u32 %r4699, %r4698, 4, 14; + cvt.u64.u32 %rd436, %r4699; + or.b64 %rd351, %rd436, 4611686293372403712; + add.s32 %r4700, %r3801, 96; + bfe.u32 %r4701, %r4700, 4, 14; + cvt.u64.u32 %rd437, %r4701; + or.b64 %rd352, %rd437, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd351, %rd352, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4702, %r4481, %r3798; + bfe.u32 %r4703, %r4702, 4, 14; + cvt.u64.u32 %rd438, %r4703; + or.b64 %rd353, %rd438, 4611686293372403712; + add.s32 %r4704, %r3801, 8192; + bfe.u32 %r4705, %r4704, 4, 14; + cvt.u64.u32 %rd439, %r4705; + or.b64 %rd354, %rd439, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd353, %rd354, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4706, %r4486, %r3798; + bfe.u32 %r4707, %r4706, 4, 14; + cvt.u64.u32 %rd440, %r4707; + or.b64 %rd355, %rd440, 4611686293372403712; + add.s32 %r4708, %r3801, 8224; + bfe.u32 %r4709, %r4708, 4, 14; + cvt.u64.u32 %rd441, %r4709; + or.b64 %rd356, %rd441, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd355, %rd356, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4710, %r4491, %r3798; + bfe.u32 %r4711, %r4710, 4, 14; + cvt.u64.u32 %rd442, %r4711; + or.b64 %rd357, %rd442, 4611686293372403712; + add.s32 %r4712, %r3801, 8256; + bfe.u32 %r4713, %r4712, 4, 14; + cvt.u64.u32 %rd443, %r4713; + or.b64 %rd358, %rd443, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd357, %rd358, %p43, 1, 1, 0, 0; + // end inline asm + add.s32 %r4714, %r4496, %r3798; + bfe.u32 %r4715, %r4714, 4, 14; + cvt.u64.u32 %rd444, %r4715; + or.b64 %rd359, %rd444, 4611686293372403712; + add.s32 %r4716, %r3801, 8288; + bfe.u32 %r4717, %r4716, 4, 14; + cvt.u64.u32 %rd445, %r4717; + or.b64 %rd360, %rd445, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413}, %rd359, %rd360, %p43, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3803, %r2753; + mov.b32 %r3799, %r2753; + mov.b32 %r3800, %r2753; + mov.b32 %r3802, %r2753; + // begin inline asm + // wait for regs: %r3382,%r3383,%r3384,%r3385,%r3386,%r3387,%r3388,%r3389,%r3390,%r3391,%r3392,%r3393,%r3394,%r3395,%r3396,%r3397,%r3398,%r3399,%r3400,%r3401,%r3402,%r3403,%r3404,%r3405,%r3406,%r3407,%r3408,%r3409,%r3410,%r3411,%r3412,%r3413,%r3798,%r3799,%r3800,%r3801,%r3802,%r3803 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + sub.f32 %r4718, %r3382, %r4672; + sub.f32 %r4719, %r3383, %r4673; + sub.f32 %r4720, %r3384, %r4672; + sub.f32 %r4721, %r3385, %r4673; + sub.f32 %r4722, %r3386, %r4674; + sub.f32 %r4723, %r3387, %r4675; + sub.f32 %r4724, %r3388, %r4674; + sub.f32 %r4725, %r3389, %r4675; + sub.f32 %r4726, %r3390, %r4676; + sub.f32 %r4727, %r3391, %r4677; + sub.f32 %r4728, %r3392, %r4676; + sub.f32 %r4729, %r3393, %r4677; + sub.f32 %r4730, %r3394, %r4678; + sub.f32 %r4731, %r3395, %r4679; + sub.f32 %r4732, %r3396, %r4678; + sub.f32 %r4733, %r3397, %r4679; + sub.f32 %r4734, %r3398, %r4680; + sub.f32 %r4735, %r3399, %r4681; + sub.f32 %r4736, %r3400, %r4680; + sub.f32 %r4737, %r3401, %r4681; + sub.f32 %r4738, %r3402, %r4682; + sub.f32 %r4739, %r3403, %r4683; + sub.f32 %r4740, %r3404, %r4682; + sub.f32 %r4741, %r3405, %r4683; + sub.f32 %r4742, %r3406, %r4684; + sub.f32 %r4743, %r3407, %r4685; + sub.f32 %r4744, %r3408, %r4684; + sub.f32 %r4745, %r3409, %r4685; + sub.f32 %r4746, %r3410, %r4686; + sub.f32 %r4747, %r3411, %r4687; + sub.f32 %r4748, %r3412, %r4686; + sub.f32 %r4749, %r3413, %r4687; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.f32 %r4750, %r4629, %r4718; + mul.f32 %r4751, %r4630, %r4719; + mul.f32 %r4752, %r4631, %r4720; + mul.f32 %r4753, %r4632, %r4721; + mul.f32 %r4754, %r4633, %r4722; + mul.f32 %r4755, %r4634, %r4723; + mul.f32 %r4756, %r4635, %r4724; + mul.f32 %r4757, %r4636, %r4725; + mul.f32 %r4758, %r4637, %r4726; + mul.f32 %r4759, %r4638, %r4727; + mul.f32 %r4760, %r4639, %r4728; + mul.f32 %r4761, %r4640, %r4729; + mul.f32 %r4762, %r4641, %r4730; + mul.f32 %r4763, %r4642, %r4731; + mul.f32 %r4764, %r4643, %r4732; + mul.f32 %r4765, %r4644, %r4733; + mul.f32 %r4766, %r4645, %r4734; + mul.f32 %r4767, %r4646, %r4735; + mul.f32 %r4768, %r4647, %r4736; + mul.f32 %r4769, %r4648, %r4737; + mul.f32 %r4770, %r4649, %r4738; + mul.f32 %r4771, %r4650, %r4739; + mul.f32 %r4772, %r4651, %r4740; + mul.f32 %r4773, %r4652, %r4741; + mul.f32 %r4774, %r4653, %r4742; + mul.f32 %r4775, %r4654, %r4743; + mul.f32 %r4776, %r4655, %r4744; + mul.f32 %r4777, %r4656, %r4745; + mul.f32 %r4778, %r4657, %r4746; + mul.f32 %r4779, %r4658, %r4747; + mul.f32 %r4780, %r4659, %r4748; + mul.f32 %r4781, %r4660, %r4749; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs1, %r4750; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs2, %rs1, 0x0000, %p150; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs3, %r4751; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs4, %rs3, 0x0000, %p151; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs5, %r4752; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs6, %rs5, 0x0000, %p152; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs7, %r4753; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs8, %rs7, 0x0000, %p153; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs9, %r4754; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs10, %rs9, 0x0000, %p154; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs11, %r4755; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs12, %rs11, 0x0000, %p155; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs13, %r4756; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs14, %rs13, 0x0000, %p156; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs15, %r4757; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs16, %rs15, 0x0000, %p157; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs17, %r4758; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs18, %rs17, 0x0000, %p158; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs19, %r4759; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs20, %rs19, 0x0000, %p159; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs21, %r4760; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs22, %rs21, 0x0000, %p160; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs23, %r4761; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs24, %rs23, 0x0000, %p161; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs25, %r4762; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs26, %rs25, 0x0000, %p162; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs27, %r4763; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs28, %rs27, 0x0000, %p163; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs29, %r4764; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs30, %rs29, 0x0000, %p164; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs31, %r4765; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs32, %rs31, 0x0000, %p165; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs33, %r4766; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs34, %rs33, 0x0000, %p166; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs35, %r4767; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs36, %rs35, 0x0000, %p167; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs37, %r4768; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs38, %rs37, 0x0000, %p168; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs39, %r4769; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs40, %rs39, 0x0000, %p169; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs41, %r4770; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs42, %rs41, 0x0000, %p170; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs43, %r4771; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs44, %rs43, 0x0000, %p171; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs45, %r4772; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs46, %rs45, 0x0000, %p172; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs47, %r4773; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs48, %rs47, 0x0000, %p173; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs49, %r4774; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs50, %rs49, 0x0000, %p174; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs51, %r4775; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs52, %rs51, 0x0000, %p175; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs53, %r4776; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs54, %rs53, 0x0000, %p176; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs55, %r4777; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs56, %rs55, 0x0000, %p177; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs57, %r4778; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs58, %rs57, 0x0000, %p178; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs59, %r4779; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs60, %rs59, 0x0000, %p179; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs61, %r4780; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs62, %rs61, 0x0000, %p180; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + cvt.rn.bf16.f32 %rs63, %r4781; + .loc 1 773 45 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:773:45 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + selp.b16 %rs64, %rs63, 0x0000, %p181; + .loc 1 775 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mov.b32 %r3970, {%rs2, %rs4}; + mov.b32 %r3971, {%rs6, %rs8}; + mov.b32 %r3972, {%rs10, %rs12}; + mov.b32 %r3973, {%rs14, %rs16}; + mov.b32 %r4102, {%rs18, %rs20}; + mov.b32 %r4103, {%rs22, %rs24}; + mov.b32 %r4104, {%rs26, %rs28}; + mov.b32 %r4105, {%rs30, %rs32}; + mov.b32 %r4234, {%rs34, %rs36}; + mov.b32 %r4235, {%rs38, %rs40}; + mov.b32 %r4236, {%rs42, %rs44}; + mov.b32 %r4237, {%rs46, %rs48}; + mov.b32 %r4366, {%rs50, %rs52}; + mov.b32 %r4367, {%rs54, %rs56}; + mov.b32 %r4368, {%rs58, %rs60}; + mov.b32 %r4369, {%rs62, %rs64}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r3970,%r3971,%r3972,%r3973}, %rd326, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4782, %r2755, 2048; + bfe.u32 %r4783, %r4782, 4, 14; + cvt.u64.u32 %rd446, %r4783; + or.b64 %rd362, %rd446, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4102,%r4103,%r4104,%r4105}, %rd362, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4784, %r2755, 4096; + bfe.u32 %r4785, %r4784, 4, 14; + cvt.u64.u32 %rd447, %r4785; + or.b64 %rd363, %rd447, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4234,%r4235,%r4236,%r4237}, %rd363, %p43, 1, 1, 1; + // end inline asm + add.s32 %r4786, %r2755, 6144; + bfe.u32 %r4787, %r4786, 4, 14; + cvt.u64.u32 %rd448, %r4787; + or.b64 %rd364, %rd448, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r4366,%r4367,%r4368,%r4369}, %rd364, %p43, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 628 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:628:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r13583, %r13583, %r13441; + add.s32 %r13584, %r13584, %r13441; + add.s32 %r13585, %r13585, %r13441; + add.s32 %r13586, %r13586, %r13441; + add.s32 %r13587, %r13587, %r13441; + add.s32 %r13588, %r13588, %r13441; + add.s32 %r13589, %r13589, %r13441; + add.s32 %r13590, %r13590, %r13441; + add.s32 %r13591, %r13591, %r13441; + add.s32 %r13592, %r13592, %r13441; + add.s32 %r13593, %r13593, %r13441; + add.s32 %r13594, %r13594, %r13441; + add.s32 %r13595, %r13595, %r13441; + add.s32 %r13596, %r13596, %r13441; + add.s32 %r13597, %r13597, %r13441; + add.s32 %r13598, %r13598, %r13441; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r1135, %r13582, 1; + .loc 1 788 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:788:33 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shr.u32 %r4788, %r1135, 1; + .loc 1 789 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mad.wide.u32 %rd366, %r4788, 4, %rd215; + .loc 1 789 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + mov.u64 %rd365, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd365, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4370, 0x0; + @%p65 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4370 }, [ %rd366 + 0 ], %rd365; + // end inline asm + .loc 1 790 109 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:109 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4789, %r4788, 1; + .loc 1 790 113 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:113 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.lt.s32 %p182, %r4789, %r2013; + .loc 1 790 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:55 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd369, %rd366, 4; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + and.pred %p66, %p65, %p182; + .loc 1 790 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + mov.u64 %rd368, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd368, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4371, 0x0; + @%p66 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4371 }, [ %rd369 + 0 ], %rd368; + // end inline asm + .loc 1 791 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:791:35 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + and.b32 %r4790, %r13582, 1; + .loc 1 792 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:34 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + sub.s32 %r4791, %r4371, %r4370; + .loc 1 792 48 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:48 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4792, %r4791, 7; + .loc 1 792 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4793, %r4792, -64; + .loc 1 793 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + xor.b32 %r4794, %r4790, 1; + .loc 1 793 61 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:61 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4795, %r4790, 6; + .loc 1 793 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:42 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mad.lo.s32 %r13441, %r4793, %r4794, %r4795; + .loc 1 626 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4796, %r13441, 12; + .loc 1 626 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.wide.s32 %rd449, %r4796, 2; + add.s64 %rd1031, %rd1031, %rd449; + add.s64 %rd1030, %rd1030, %rd449; + add.s64 %rd1029, %rd1029, %rd449; + add.s64 %rd1028, %rd1028, %rd449; + .loc 1 627 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4797, %r13441, 7; + .loc 1 627 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.wide.s32 %rd450, %r4797, 2; + add.s64 %rd1027, %rd1027, %rd450; + add.s64 %rd1026, %rd1026, %rd450; + add.s64 %rd1025, %rd1025, %rd450; + add.s64 %rd1024, %rd1024, %rd450; + .loc 1 628 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:628:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r13453, %r13441, %r13453; + add.s32 %r13452, %r13441, %r13452; + add.s32 %r13451, %r13441, %r13451; + add.s32 %r13450, %r13441, %r13450; + add.s32 %r13449, %r13441, %r13449; + add.s32 %r13448, %r13441, %r13448; + add.s32 %r13447, %r13441, %r13447; + add.s32 %r13446, %r13441, %r13446; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4798, %r13443, 1; + setp.gt.s32 %p183, %r4798, 1; + selp.b32 %r13443, 0, %r4798, %p183; + add.s32 %r4799, %r13445, 1; + setp.gt.s32 %p184, %r4799, 2; + selp.b32 %r13445, 0, %r4799, %p184; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4800, %r13445, 14; + add.s32 %r4801, %r2045, %r4800; + bar.sync 0; + add.s32 %r4372, %r4801, %r606; + selp.b32 %r4373, 16, 0, %p83; + // begin inline asm + cp.async.cg.shared.global [ %r4372 + 0 ], [ %rd1031 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4374, %r4372, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4374 + 0 ], [ %rd1030 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4376, %r4372, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4376 + 0 ], [ %rd1029 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4378, %r4372, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4378 + 0 ], [ %rd1028 + 0 ], 0x10, %r4373; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + mul.wide.s32 %rd451, %r13453, 4; + add.s64 %rd375, %rd128, %rd451; + mul.wide.s32 %rd452, %r13452, 4; + add.s64 %rd376, %rd128, %rd452; + mul.wide.s32 %rd453, %r13451, 4; + add.s64 %rd377, %rd128, %rd453; + mul.wide.s32 %rd454, %r13450, 4; + add.s64 %rd378, %rd128, %rd454; + mul.wide.s32 %rd455, %r13449, 4; + add.s64 %rd379, %rd128, %rd455; + mul.wide.s32 %rd456, %r13448, 4; + add.s64 %rd380, %rd128, %rd456; + mul.wide.s32 %rd457, %r13447, 4; + add.s64 %rd381, %rd128, %rd457; + mul.wide.s32 %rd458, %r13446, 4; + add.s64 %rd382, %rd128, %rd458; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + shl.b32 %r4802, %r13443, 8; + add.s32 %r4803, %r4425, %r4802; + add.s32 %r4380, %r4803, %r613; + selp.b32 %r4381, 8, 0, %p83; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4380 + 0 ], [ %rd375 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4382, %r4380, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4382 + 0 ], [ %rd376 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4384, %r4380, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4384 + 0 ], [ %rd377 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4386, %r4380, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4386 + 0 ], [ %rd378 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4388, %r4380, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4388 + 0 ], [ %rd379 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4390, %r4380, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4390 + 0 ], [ %rd380 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4392, %r4380, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4392 + 0 ], [ %rd381 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4394, %r4380, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4394 + 0 ], [ %rd382 + 0 ], 0x8, %r4381; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4804, %r4661, %r4800; + add.s32 %r4396, %r4804, %r606; + // begin inline asm + cp.async.cg.shared.global [ %r4396 + 0 ], [ %rd1027 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4398, %r4396, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4398 + 0 ], [ %rd1026 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4400, %r4396, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4400 + 0 ], [ %rd1025 + 0 ], 0x10, %r4373; + // end inline asm + add.s32 %r4402, %r4396, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4402 + 0 ], [ %rd1024 + 0 ], 0x10, %r4373; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s64 %rd387, %rd129, %rd451; + add.s64 %rd388, %rd129, %rd452; + add.s64 %rd389, %rd129, %rd453; + add.s64 %rd390, %rd129, %rd454; + add.s64 %rd391, %rd129, %rd455; + add.s64 %rd392, %rd129, %rd456; + add.s64 %rd393, %rd129, %rd457; + add.s64 %rd394, %rd129, %rd458; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + add.s32 %r4805, %r4669, %r4802; + add.s32 %r4404, %r4805, %r613; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4404 + 0 ], [ %rd387 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4406, %r4404, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4406 + 0 ], [ %rd388 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4408, %r4404, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4408 + 0 ], [ %rd389 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4410, %r4404, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4410 + 0 ], [ %rd390 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4412, %r4404, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4412 + 0 ], [ %rd391 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4414, %r4404, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4414 + 0 ], [ %rd392 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4416, %r4404, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4416 + 0 ], [ %rd393 + 0 ], 0x8, %r4381; + // end inline asm + add.s32 %r4418, %r4404, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r4418 + 0 ], [ %rd394 + 0 ], 0x8, %r4381; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + setp.ne.b32 %p185, %r685, %r1135; + mov.b32 %r13582, %r1135; + @%p185 bra $L__BB0_11; +$L__BB0_12: // %._crit_edge + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:28 + setp.lt.s32 %p218, %r604, 1; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:298:16 ] + // begin inline asm + // wait for regs: %r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517,%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp28: + .loc 1 601 18 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:18 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd508, %rd126, %rd507; + add.s64 %rd510, %rd126, %rd509; + add.s64 %rd512, %rd126, %rd511; + add.s64 %rd514, %rd126, %rd513; + .loc 1 601 49 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:601:49 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd459, %rd508, %rd307; + add.s64 %rd460, %rd510, %rd307; + add.s64 %rd461, %rd512, %rd307; + add.s64 %rd462, %rd514, %rd307; + .loc 1 602 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd517, %rd127, %rd516; + add.s64 %rd519, %rd127, %rd518; + add.s64 %rd521, %rd127, %rd520; + add.s64 %rd523, %rd127, %rd522; + .loc 1 602 51 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:602:51 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd471, %rd517, %rd307; + add.s64 %rd472, %rd519, %rd307; + add.s64 %rd473, %rd521, %rd307; + add.s64 %rd474, %rd523, %rd307; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5062 + 0 ], [ %rd459 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5064 + 0 ], [ %rd460 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5066 + 0 ], [ %rd461 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5068 + 0 ], [ %rd462 + 0 ], 0x10, %r5063; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd463, %rd128, %rd524; + cvt.s64.s32 %rd525, %r602; + add.s64 %rd526, %rd525, %rd130; + shl.b64 %rd527, %rd526, 2; + add.s64 %rd528, %rd128, %rd527; + add.s64 %rd464, %rd528, 32; + add.s64 %rd465, %rd528, 64; + add.s64 %rd466, %rd528, 96; + add.s64 %rd467, %rd528, 128; + add.s64 %rd468, %rd528, 160; + add.s64 %rd469, %rd528, 192; + add.s64 %rd470, %rd528, 224; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5070 + 0 ], [ %rd463 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5072 + 0 ], [ %rd464 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5074 + 0 ], [ %rd465 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5076 + 0 ], [ %rd466 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5078 + 0 ], [ %rd467 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5080 + 0 ], [ %rd468 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5082 + 0 ], [ %rd469 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5084 + 0 ], [ %rd470 + 0 ], 0x8, %r5071; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5086 + 0 ], [ %rd471 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5088 + 0 ], [ %rd472 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5090 + 0 ], [ %rd473 + 0 ], 0x10, %r5063; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5092 + 0 ], [ %rd474 + 0 ], 0x10, %r5063; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd475, %rd129, %rd524; + add.s64 %rd529, %rd129, %rd527; + add.s64 %rd476, %rd529, 32; + add.s64 %rd477, %rd529, 64; + add.s64 %rd478, %rd529, 96; + add.s64 %rd479, %rd529, 128; + add.s64 %rd480, %rd529, 160; + add.s64 %rd481, %rd529, 192; + add.s64 %rd482, %rd529, 224; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5094 + 0 ], [ %rd475 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5096 + 0 ], [ %rd476 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5098 + 0 ], [ %rd477 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5100 + 0 ], [ %rd478 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5102 + 0 ], [ %rd479 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5104 + 0 ], [ %rd480 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5106 + 0 ], [ %rd481 + 0 ], 0x8, %r5071; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5108 + 0 ], [ %rd482 + 0 ], 0x8, %r5071; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd1039, %rd459, 524288; + add.s64 %rd1038, %rd460, 524288; + add.s64 %rd1037, %rd461, 524288; + add.s64 %rd1036, %rd462, 524288; + .loc 1 627 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd1035, %rd471, 16384; + add.s64 %rd1034, %rd472, 16384; + add.s64 %rd1033, %rd473, 16384; + add.s64 %rd1032, %rd474, 16384; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r5110 + 0 ], [ %rd1039 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5112 + 0 ], [ %rd1038 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5114 + 0 ], [ %rd1037 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5116 + 0 ], [ %rd1036 + 0 ], 0x10, %r5111; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd487, %rd463, 256; + add.s64 %rd530, %rd527, 256; + add.s64 %rd531, %rd128, %rd530; + add.s64 %rd488, %rd531, 32; + add.s64 %rd489, %rd531, 64; + add.s64 %rd490, %rd531, 96; + add.s64 %rd491, %rd531, 128; + add.s64 %rd492, %rd531, 160; + add.s64 %rd493, %rd531, 192; + add.s64 %rd494, %rd531, 224; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5118 + 0 ], [ %rd487 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5120 + 0 ], [ %rd488 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5122 + 0 ], [ %rd489 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5124 + 0 ], [ %rd490 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5126 + 0 ], [ %rd491 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5128 + 0 ], [ %rd492 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5130 + 0 ], [ %rd493 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5132 + 0 ], [ %rd494 + 0 ], 0x8, %r5119; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r5134 + 0 ], [ %rd1035 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5136 + 0 ], [ %rd1034 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5138 + 0 ], [ %rd1033 + 0 ], 0x10, %r5111; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r5140 + 0 ], [ %rd1032 + 0 ], 0x10, %r5111; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd499, %rd475, 256; + add.s64 %rd532, %rd129, %rd530; + add.s64 %rd500, %rd532, 32; + add.s64 %rd501, %rd532, 64; + add.s64 %rd502, %rd532, 96; + add.s64 %rd503, %rd532, 128; + add.s64 %rd504, %rd532, 160; + add.s64 %rd505, %rd532, 192; + add.s64 %rd506, %rd532, 224; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5142 + 0 ], [ %rd499 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5144 + 0 ], [ %rd500 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5146 + 0 ], [ %rd501 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5148 + 0 ], [ %rd502 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5150 + 0 ], [ %rd503 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5152 + 0 ], [ %rd504 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5154 + 0 ], [ %rd505 + 0 ], 0x8, %r5119; + // end inline asm + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r5156 + 0 ], [ %rd506 + 0 ], 0x8, %r5119; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + @%p218 bra $L__BB0_15; +// %bb.13: // %.lr.ph1691.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:28 + mov.b32 %r5712, 0; + mov.b32 %r13728, 1; + mov.b32 %r13727, -1; + mov.b32 %r13729, %r13727; + mov.b32 %r13730, %r13728; + mov.b32 %r13731, %r680; + mov.b32 %r13732, %r679; + mov.b32 %r13733, %r678; + mov.b32 %r13734, %r677; + mov.b32 %r13735, %r676; + mov.b32 %r13736, %r675; + mov.b32 %r13737, %r674; + mov.b32 %r13738, %r673; + mov.b32 %r13867, %r5712; +$L__BB0_14: // %.lr.ph1691 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + setp.lt.s32 %p259, %r13867, %r683; + setp.lt.s32 %p241, %r13867, %r684; + add.s32 %r7379, %r13727, 1; + setp.gt.s32 %p260, %r7379, 1; + selp.b32 %r13727, 0, %r7379, %p260; + add.s32 %r7380, %r13729, 1; + setp.gt.s32 %p261, %r7380, 2; + selp.b32 %r13729, 0, %r7380, %p261; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r7381, %r13729, 14; + add.s32 %r5714, %r2045, %r7381; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7383, %r13727, 8; + add.s32 %r7384, %r2045, 98304; + add.s32 %r7385, %r7384, %r7383; + add.s32 %r7386, %r7385, %r613; + ld.shared.v2.b32 {%r7387, %r7388}, [%r7386]; + ld.shared.v2.b32 {%r7389, %r7390}, [%r7386+32]; + ld.shared.v2.b32 {%r7391, %r7392}, [%r7386+64]; + ld.shared.v2.b32 {%r7393, %r7394}, [%r7386+96]; + ld.shared.v2.b32 {%r7395, %r7396}, [%r7386+128]; + ld.shared.v2.b32 {%r7397, %r7398}, [%r7386+160]; + ld.shared.v2.b32 {%r7399, %r7400}, [%r7386+192]; + ld.shared.v2.b32 {%r7401, %r7402}, [%r7386+224]; + .loc 1 675 26 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:675:26 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + setp.eq.f32 %p262, %r7387, 0fFF800000; + setp.eq.f32 %p263, %r7388, 0fFF800000; + setp.eq.f32 %p264, %r7389, 0fFF800000; + setp.eq.f32 %p265, %r7390, 0fFF800000; + setp.eq.f32 %p266, %r7391, 0fFF800000; + setp.eq.f32 %p267, %r7392, 0fFF800000; + setp.eq.f32 %p268, %r7393, 0fFF800000; + setp.eq.f32 %p269, %r7394, 0fFF800000; + setp.eq.f32 %p270, %r7395, 0fFF800000; + setp.eq.f32 %p271, %r7396, 0fFF800000; + setp.eq.f32 %p272, %r7397, 0fFF800000; + setp.eq.f32 %p273, %r7398, 0fFF800000; + setp.eq.f32 %p274, %r7399, 0fFF800000; + setp.eq.f32 %p275, %r7400, 0fFF800000; + setp.eq.f32 %p276, %r7401, 0fFF800000; + setp.eq.f32 %p277, %r7402, 0fFF800000; + .loc 1 675 46 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:675:46 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + selp.f32 %r7403, 0f00000000, %r7387, %p262; + selp.f32 %r7404, 0f00000000, %r7388, %p263; + selp.f32 %r7405, 0f00000000, %r7389, %p264; + selp.f32 %r7406, 0f00000000, %r7390, %p265; + selp.f32 %r7407, 0f00000000, %r7391, %p266; + selp.f32 %r7408, 0f00000000, %r7392, %p267; + selp.f32 %r7409, 0f00000000, %r7393, %p268; + selp.f32 %r7410, 0f00000000, %r7394, %p269; + selp.f32 %r7411, 0f00000000, %r7395, %p270; + selp.f32 %r7412, 0f00000000, %r7396, %p271; + selp.f32 %r7413, 0f00000000, %r7397, %p272; + selp.f32 %r7414, 0f00000000, %r7398, %p273; + selp.f32 %r7415, 0f00000000, %r7399, %p274; + selp.f32 %r7416, 0f00000000, %r7400, %p275; + selp.f32 %r7417, 0f00000000, %r7401, %p276; + selp.f32 %r7418, 0f00000000, %r7402, %p277; + .loc 1 676 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:676:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shfl.sync.idx.b32 %r7419, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r7420, %r7419, 11; + and.b32 %r7421, %r7420, 8192; + add.s32 %r5673, %r2045, 99328; + add.s32 %r7422, %r7421, %r5673; + bfe.u32 %r7423, %r7422, 4, 14; + cvt.u64.u32 %rd603, %r7423; + or.b64 %rd533, %rd603, 4611686293372403712; + bfe.u32 %r7424, %r5714, 4, 14; + cvt.u64.u32 %rd604, %r7424; + or.b64 %rd534, %rd604, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd533, %rd534, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r7425, %r7421, 32; + add.s32 %r7426, %r7425, %r5673; + bfe.u32 %r7427, %r7426, 4, 14; + cvt.u64.u32 %rd605, %r7427; + or.b64 %rd535, %rd605, 4611686293372403712; + add.s32 %r7428, %r5714, 32; + bfe.u32 %r7429, %r7428, 4, 14; + cvt.u64.u32 %rd606, %r7429; + or.b64 %rd536, %rd606, 4611686293338849280; + mov.pred %p219, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd535, %rd536, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7430, %r7421, 64; + add.s32 %r7431, %r7430, %r5673; + bfe.u32 %r7432, %r7431, 4, 14; + cvt.u64.u32 %rd607, %r7432; + or.b64 %rd537, %rd607, 4611686293372403712; + add.s32 %r7433, %r5714, 64; + bfe.u32 %r7434, %r7433, 4, 14; + cvt.u64.u32 %rd608, %r7434; + or.b64 %rd538, %rd608, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd537, %rd538, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7435, %r7421, 96; + add.s32 %r7436, %r7435, %r5673; + bfe.u32 %r7437, %r7436, 4, 14; + cvt.u64.u32 %rd609, %r7437; + or.b64 %rd539, %rd609, 4611686293372403712; + add.s32 %r7438, %r5714, 96; + bfe.u32 %r7439, %r7438, 4, 14; + cvt.u64.u32 %rd610, %r7439; + or.b64 %rd540, %rd610, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd539, %rd540, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7440, %r7421, 16384; + add.s32 %r7441, %r7440, %r5673; + bfe.u32 %r7442, %r7441, 4, 14; + cvt.u64.u32 %rd611, %r7442; + or.b64 %rd541, %rd611, 4611686293372403712; + add.s32 %r7443, %r5714, 8192; + bfe.u32 %r7444, %r7443, 4, 14; + cvt.u64.u32 %rd612, %r7444; + or.b64 %rd542, %rd612, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd541, %rd542, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7445, %r7421, 16416; + add.s32 %r7446, %r7445, %r5673; + bfe.u32 %r7447, %r7446, 4, 14; + cvt.u64.u32 %rd613, %r7447; + or.b64 %rd543, %rd613, 4611686293372403712; + add.s32 %r7448, %r5714, 8224; + bfe.u32 %r7449, %r7448, 4, 14; + cvt.u64.u32 %rd614, %r7449; + or.b64 %rd544, %rd614, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd543, %rd544, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7450, %r7421, 16448; + add.s32 %r7451, %r7450, %r5673; + bfe.u32 %r7452, %r7451, 4, 14; + cvt.u64.u32 %rd615, %r7452; + or.b64 %rd545, %rd615, 4611686293372403712; + add.s32 %r7453, %r5714, 8256; + bfe.u32 %r7454, %r7453, 4, 14; + cvt.u64.u32 %rd616, %r7454; + or.b64 %rd546, %rd616, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd545, %rd546, %p219, 1, 1, 0, 0; + // end inline asm + or.b32 %r7455, %r7421, 16480; + add.s32 %r7456, %r7455, %r5673; + bfe.u32 %r7457, %r7456, 4, 14; + cvt.u64.u32 %rd617, %r7457; + or.b64 %rd547, %rd617, 4611686293372403712; + add.s32 %r7458, %r5714, 8288; + bfe.u32 %r7459, %r7458, 4, 14; + cvt.u64.u32 %rd618, %r7459; + or.b64 %rd548, %rd618, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288}, %rd547, %rd548, %p219, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5674, %r5712; + mov.b32 %r5675, %r5712; + mov.b32 %r5677, %r5712; + mov.b32 %r5678, %r5712; + mov.b32 %r5676, %r5714; + // begin inline asm + // wait for regs: %r5257,%r5258,%r5259,%r5260,%r5261,%r5262,%r5263,%r5264,%r5265,%r5266,%r5267,%r5268,%r5269,%r5270,%r5271,%r5272,%r5273,%r5274,%r5275,%r5276,%r5277,%r5278,%r5279,%r5280,%r5281,%r5282,%r5283,%r5284,%r5285,%r5286,%r5287,%r5288,%r5673,%r5674,%r5675,%r5676,%r5677,%r5678 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:678:15 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7460, %r5257, 0f3DB504F3; + mul.f32 %r7461, %r5258, 0f3DB504F3; + mul.f32 %r7462, %r5259, 0f3DB504F3; + mul.f32 %r7463, %r5260, 0f3DB504F3; + mul.f32 %r7464, %r5261, 0f3DB504F3; + mul.f32 %r7465, %r5262, 0f3DB504F3; + mul.f32 %r7466, %r5263, 0f3DB504F3; + mul.f32 %r7467, %r5264, 0f3DB504F3; + mul.f32 %r7468, %r5265, 0f3DB504F3; + mul.f32 %r7469, %r5266, 0f3DB504F3; + mul.f32 %r7470, %r5267, 0f3DB504F3; + mul.f32 %r7471, %r5268, 0f3DB504F3; + mul.f32 %r7472, %r5269, 0f3DB504F3; + mul.f32 %r7473, %r5270, 0f3DB504F3; + mul.f32 %r7474, %r5271, 0f3DB504F3; + mul.f32 %r7475, %r5272, 0f3DB504F3; + mul.f32 %r7476, %r5273, 0f3DB504F3; + mul.f32 %r7477, %r5274, 0f3DB504F3; + mul.f32 %r7478, %r5275, 0f3DB504F3; + mul.f32 %r7479, %r5276, 0f3DB504F3; + mul.f32 %r7480, %r5277, 0f3DB504F3; + mul.f32 %r7481, %r5278, 0f3DB504F3; + mul.f32 %r7482, %r5279, 0f3DB504F3; + mul.f32 %r7483, %r5280, 0f3DB504F3; + mul.f32 %r7484, %r5281, 0f3DB504F3; + mul.f32 %r7485, %r5282, 0f3DB504F3; + mul.f32 %r7486, %r5283, 0f3DB504F3; + mul.f32 %r7487, %r5284, 0f3DB504F3; + mul.f32 %r7488, %r5285, 0f3DB504F3; + mul.f32 %r7489, %r5286, 0f3DB504F3; + mul.f32 %r7490, %r5287, 0f3DB504F3; + mul.f32 %r7491, %r5288, 0f3DB504F3; + .loc 1 740 40 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:740:40 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + neg.f32 %r7492, %r7403; + fma.rn.f32 %r7493, %r7460, 0f3FB8AA3B, %r7492; + neg.f32 %r7494, %r7404; + fma.rn.f32 %r7495, %r7461, 0f3FB8AA3B, %r7494; + fma.rn.f32 %r7496, %r7462, 0f3FB8AA3B, %r7492; + fma.rn.f32 %r7497, %r7463, 0f3FB8AA3B, %r7494; + neg.f32 %r7498, %r7405; + fma.rn.f32 %r7499, %r7464, 0f3FB8AA3B, %r7498; + neg.f32 %r7500, %r7406; + fma.rn.f32 %r7501, %r7465, 0f3FB8AA3B, %r7500; + fma.rn.f32 %r7502, %r7466, 0f3FB8AA3B, %r7498; + fma.rn.f32 %r7503, %r7467, 0f3FB8AA3B, %r7500; + neg.f32 %r7504, %r7407; + fma.rn.f32 %r7505, %r7468, 0f3FB8AA3B, %r7504; + neg.f32 %r7506, %r7408; + fma.rn.f32 %r7507, %r7469, 0f3FB8AA3B, %r7506; + fma.rn.f32 %r7508, %r7470, 0f3FB8AA3B, %r7504; + fma.rn.f32 %r7509, %r7471, 0f3FB8AA3B, %r7506; + neg.f32 %r7510, %r7409; + fma.rn.f32 %r7511, %r7472, 0f3FB8AA3B, %r7510; + neg.f32 %r7512, %r7410; + fma.rn.f32 %r7513, %r7473, 0f3FB8AA3B, %r7512; + fma.rn.f32 %r7514, %r7474, 0f3FB8AA3B, %r7510; + fma.rn.f32 %r7515, %r7475, 0f3FB8AA3B, %r7512; + neg.f32 %r7516, %r7411; + fma.rn.f32 %r7517, %r7476, 0f3FB8AA3B, %r7516; + neg.f32 %r7518, %r7412; + fma.rn.f32 %r7519, %r7477, 0f3FB8AA3B, %r7518; + fma.rn.f32 %r7520, %r7478, 0f3FB8AA3B, %r7516; + fma.rn.f32 %r7521, %r7479, 0f3FB8AA3B, %r7518; + neg.f32 %r7522, %r7413; + fma.rn.f32 %r7523, %r7480, 0f3FB8AA3B, %r7522; + neg.f32 %r7524, %r7414; + fma.rn.f32 %r7525, %r7481, 0f3FB8AA3B, %r7524; + fma.rn.f32 %r7526, %r7482, 0f3FB8AA3B, %r7522; + fma.rn.f32 %r7527, %r7483, 0f3FB8AA3B, %r7524; + neg.f32 %r7528, %r7415; + fma.rn.f32 %r7529, %r7484, 0f3FB8AA3B, %r7528; + neg.f32 %r7530, %r7416; + fma.rn.f32 %r7531, %r7485, 0f3FB8AA3B, %r7530; + fma.rn.f32 %r7532, %r7486, 0f3FB8AA3B, %r7528; + fma.rn.f32 %r7533, %r7487, 0f3FB8AA3B, %r7530; + neg.f32 %r7534, %r7417; + fma.rn.f32 %r7535, %r7488, 0f3FB8AA3B, %r7534; + neg.f32 %r7536, %r7418; + fma.rn.f32 %r7537, %r7489, 0f3FB8AA3B, %r7536; + fma.rn.f32 %r7538, %r7490, 0f3FB8AA3B, %r7534; + fma.rn.f32 %r7539, %r7491, 0f3FB8AA3B, %r7536; + .loc 1 740 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:740:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + ex2.approx.ftz.f32 %r7540, %r7493; + ex2.approx.ftz.f32 %r7541, %r7495; + ex2.approx.ftz.f32 %r7542, %r7496; + ex2.approx.ftz.f32 %r7543, %r7497; + ex2.approx.ftz.f32 %r7544, %r7499; + ex2.approx.ftz.f32 %r7545, %r7501; + ex2.approx.ftz.f32 %r7546, %r7502; + ex2.approx.ftz.f32 %r7547, %r7503; + ex2.approx.ftz.f32 %r7548, %r7505; + ex2.approx.ftz.f32 %r7549, %r7507; + ex2.approx.ftz.f32 %r7550, %r7508; + ex2.approx.ftz.f32 %r7551, %r7509; + ex2.approx.ftz.f32 %r7552, %r7511; + ex2.approx.ftz.f32 %r7553, %r7513; + ex2.approx.ftz.f32 %r7554, %r7514; + ex2.approx.ftz.f32 %r7555, %r7515; + ex2.approx.ftz.f32 %r7556, %r7517; + ex2.approx.ftz.f32 %r7557, %r7519; + ex2.approx.ftz.f32 %r7558, %r7520; + ex2.approx.ftz.f32 %r7559, %r7521; + ex2.approx.ftz.f32 %r7560, %r7523; + ex2.approx.ftz.f32 %r7561, %r7525; + ex2.approx.ftz.f32 %r7562, %r7526; + ex2.approx.ftz.f32 %r7563, %r7527; + ex2.approx.ftz.f32 %r7564, %r7529; + ex2.approx.ftz.f32 %r7565, %r7531; + ex2.approx.ftz.f32 %r7566, %r7532; + ex2.approx.ftz.f32 %r7567, %r7533; + ex2.approx.ftz.f32 %r7568, %r7535; + ex2.approx.ftz.f32 %r7569, %r7537; + ex2.approx.ftz.f32 %r7570, %r7538; + ex2.approx.ftz.f32 %r7571, %r7539; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7572, %r2045, 49152; + add.s32 %r6760, %r7572, %r7381; + .loc 1 744 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:744:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r5845, %r7541, %r7540; + cvt.rn.bf16x2.f32 %r5846, %r7543, %r7542; + cvt.rn.bf16x2.f32 %r5847, %r7545, %r7544; + cvt.rn.bf16x2.f32 %r5848, %r7547, %r7546; + cvt.rn.bf16x2.f32 %r5977, %r7549, %r7548; + cvt.rn.bf16x2.f32 %r5978, %r7551, %r7550; + cvt.rn.bf16x2.f32 %r5979, %r7553, %r7552; + cvt.rn.bf16x2.f32 %r5980, %r7555, %r7554; + cvt.rn.bf16x2.f32 %r6109, %r7557, %r7556; + cvt.rn.bf16x2.f32 %r6110, %r7559, %r7558; + cvt.rn.bf16x2.f32 %r6111, %r7561, %r7560; + cvt.rn.bf16x2.f32 %r6112, %r7563, %r7562; + cvt.rn.bf16x2.f32 %r6241, %r7565, %r7564; + cvt.rn.bf16x2.f32 %r6242, %r7567, %r7566; + cvt.rn.bf16x2.f32 %r6243, %r7569, %r7568; + cvt.rn.bf16x2.f32 %r6244, %r7571, %r7570; + .loc 1 744 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:744:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + wgmma.fence.sync.aligned; + bfe.u32 %r7573, %r6760, 4, 14; + cvt.u64.u32 %rd619, %r7573; + or.b64 %rd549, %rd619, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r5845,%r5846,%r5847,%r5848}, %rd549, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7574, %r6760, 2048; + bfe.u32 %r7575, %r7574, 4, 14; + cvt.u64.u32 %rd620, %r7575; + or.b64 %rd550, %rd620, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r5977,%r5978,%r5979,%r5980}, %rd550, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7576, %r6760, 4096; + bfe.u32 %r7577, %r7576, 4, 14; + cvt.u64.u32 %rd621, %r7577; + or.b64 %rd551, %rd621, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r6109,%r6110,%r6111,%r6112}, %rd551, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7578, %r6760, 6144; + bfe.u32 %r7579, %r7578, 4, 14; + cvt.u64.u32 %rd622, %r7579; + or.b64 %rd552, %rd622, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13454,%r13455,%r13456,%r13457,%r13458,%r13459,%r13460,%r13461,%r13462,%r13463,%r13464,%r13465,%r13466,%r13467,%r13468,%r13469,%r13470,%r13471,%r13472,%r13473,%r13474,%r13475,%r13476,%r13477,%r13478,%r13479,%r13480,%r13481,%r13482,%r13483,%r13484,%r13485,%r13486,%r13487,%r13488,%r13489,%r13490,%r13491,%r13492,%r13493,%r13494,%r13495,%r13496,%r13497,%r13498,%r13499,%r13500,%r13501,%r13502,%r13503,%r13504,%r13505,%r13506,%r13507,%r13508,%r13509,%r13510,%r13511,%r13512,%r13513,%r13514,%r13515,%r13516,%r13517}, {%r6241,%r6242,%r6243,%r6244}, %rd552, %p219, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7580, %r2045, 98816; + add.s32 %r7581, %r7580, %r7383; + add.s32 %r7582, %r7581, %r613; + .loc 1 750 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:750:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r6757, %r2045, 132096; + add.s32 %r7583, %r7421, %r6757; + bfe.u32 %r7584, %r7583, 4, 14; + cvt.u64.u32 %rd623, %r7584; + or.b64 %rd553, %rd623, 4611686293372403712; + add.s32 %r7585, %r7425, %r6757; + bfe.u32 %r7586, %r7585, 4, 14; + cvt.u64.u32 %rd624, %r7586; + or.b64 %rd555, %rd624, 4611686293372403712; + add.s32 %r7587, %r6760, 32; + bfe.u32 %r7588, %r7587, 4, 14; + cvt.u64.u32 %rd625, %r7588; + or.b64 %rd556, %rd625, 4611686293338849280; + add.s32 %r7589, %r7430, %r6757; + bfe.u32 %r7590, %r7589, 4, 14; + cvt.u64.u32 %rd626, %r7590; + or.b64 %rd557, %rd626, 4611686293372403712; + add.s32 %r7591, %r6760, 64; + bfe.u32 %r7592, %r7591, 4, 14; + cvt.u64.u32 %rd627, %r7592; + or.b64 %rd558, %rd627, 4611686293338849280; + add.s32 %r7593, %r7435, %r6757; + bfe.u32 %r7594, %r7593, 4, 14; + cvt.u64.u32 %rd628, %r7594; + or.b64 %rd559, %rd628, 4611686293372403712; + add.s32 %r7595, %r6760, 96; + bfe.u32 %r7596, %r7595, 4, 14; + cvt.u64.u32 %rd629, %r7596; + or.b64 %rd560, %rd629, 4611686293338849280; + add.s32 %r7597, %r7440, %r6757; + bfe.u32 %r7598, %r7597, 4, 14; + cvt.u64.u32 %rd630, %r7598; + or.b64 %rd561, %rd630, 4611686293372403712; + add.s32 %r7599, %r6760, 8192; + bfe.u32 %r7600, %r7599, 4, 14; + cvt.u64.u32 %rd631, %r7600; + or.b64 %rd562, %rd631, 4611686293338849280; + add.s32 %r7601, %r7445, %r6757; + bfe.u32 %r7602, %r7601, 4, 14; + cvt.u64.u32 %rd632, %r7602; + or.b64 %rd563, %rd632, 4611686293372403712; + add.s32 %r7603, %r6760, 8224; + bfe.u32 %r7604, %r7603, 4, 14; + cvt.u64.u32 %rd633, %r7604; + or.b64 %rd564, %rd633, 4611686293338849280; + add.s32 %r7605, %r7450, %r6757; + bfe.u32 %r7606, %r7605, 4, 14; + cvt.u64.u32 %rd634, %r7606; + or.b64 %rd565, %rd634, 4611686293372403712; + add.s32 %r7607, %r6760, 8256; + bfe.u32 %r7608, %r7607, 4, 14; + cvt.u64.u32 %rd635, %r7608; + or.b64 %rd566, %rd635, 4611686293338849280; + add.s32 %r7609, %r7455, %r6757; + bfe.u32 %r7610, %r7609, 4, 14; + cvt.u64.u32 %rd636, %r7610; + or.b64 %rd567, %rd636, 4611686293372403712; + add.s32 %r7611, %r6760, 8288; + bfe.u32 %r7612, %r7611, 4, 14; + cvt.u64.u32 %rd637, %r7612; + or.b64 %rd568, %rd637, 4611686293338849280; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + ld.shared.v2.b32 {%r7613, %r7614}, [%r7582+224]; + ld.shared.v2.b32 {%r7615, %r7616}, [%r7582+192]; + ld.shared.v2.b32 {%r7617, %r7618}, [%r7582+160]; + ld.shared.v2.b32 {%r7619, %r7620}, [%r7582+128]; + ld.shared.v2.b32 {%r7621, %r7622}, [%r7582+96]; + ld.shared.v2.b32 {%r7623, %r7624}, [%r7582+64]; + ld.shared.v2.b32 {%r7625, %r7626}, [%r7582+32]; + ld.shared.v2.b32 {%r7627, %r7628}, [%r7582]; + .loc 1 750 20 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:750:20 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd553, %rd549, 0, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd555, %rd556, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd557, %rd558, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd559, %rd560, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd561, %rd562, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd563, %rd564, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd565, %rd566, %p219, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372}, %rd567, %rd568, %p219, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r6762, %r5712; + mov.b32 %r6758, %r5712; + mov.b32 %r6759, %r5712; + mov.b32 %r6761, %r5712; + // begin inline asm + // wait for regs: %r6341,%r6342,%r6343,%r6344,%r6345,%r6346,%r6347,%r6348,%r6349,%r6350,%r6351,%r6352,%r6353,%r6354,%r6355,%r6356,%r6357,%r6358,%r6359,%r6360,%r6361,%r6362,%r6363,%r6364,%r6365,%r6366,%r6367,%r6368,%r6369,%r6370,%r6371,%r6372,%r6757,%r6758,%r6759,%r6760,%r6761,%r6762 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7629, %r6344, %r7628; + sub.f32 %r7630, %r6343, %r7627; + sub.f32 %r7631, %r6348, %r7626; + sub.f32 %r7632, %r6347, %r7625; + sub.f32 %r7633, %r6352, %r7624; + sub.f32 %r7634, %r6351, %r7623; + sub.f32 %r7635, %r6356, %r7622; + sub.f32 %r7636, %r6355, %r7621; + sub.f32 %r7637, %r6360, %r7620; + sub.f32 %r7638, %r6359, %r7619; + sub.f32 %r7639, %r6364, %r7618; + sub.f32 %r7640, %r6363, %r7617; + sub.f32 %r7641, %r6368, %r7616; + sub.f32 %r7642, %r6367, %r7615; + sub.f32 %r7643, %r6372, %r7614; + sub.f32 %r7644, %r6371, %r7613; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7645, %r7542, %r7630; + mul.f32 %r7646, %r7543, %r7629; + mul.f32 %r7647, %r7546, %r7632; + mul.f32 %r7648, %r7547, %r7631; + mul.f32 %r7649, %r7550, %r7634; + mul.f32 %r7650, %r7551, %r7633; + mul.f32 %r7651, %r7554, %r7636; + mul.f32 %r7652, %r7555, %r7635; + mul.f32 %r7653, %r7558, %r7638; + mul.f32 %r7654, %r7559, %r7637; + mul.f32 %r7655, %r7562, %r7640; + mul.f32 %r7656, %r7563, %r7639; + mul.f32 %r7657, %r7566, %r7642; + mul.f32 %r7658, %r7567, %r7641; + mul.f32 %r7659, %r7570, %r7644; + mul.f32 %r7660, %r7571, %r7643; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7661, %r6342, %r7628; + sub.f32 %r7662, %r6341, %r7627; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7663, %r7540, %r7662; + mul.f32 %r7664, %r7541, %r7661; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r6929, %r7664, %r7663; + cvt.rn.bf16x2.f32 %r6930, %r7646, %r7645; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7665, %r6346, %r7626; + sub.f32 %r7666, %r6345, %r7625; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7667, %r7544, %r7666; + mul.f32 %r7668, %r7545, %r7665; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r6931, %r7668, %r7667; + cvt.rn.bf16x2.f32 %r6932, %r7648, %r7647; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7669, %r6350, %r7624; + sub.f32 %r7670, %r6349, %r7623; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7671, %r7548, %r7670; + mul.f32 %r7672, %r7549, %r7669; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7061, %r7672, %r7671; + cvt.rn.bf16x2.f32 %r7062, %r7650, %r7649; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7673, %r6354, %r7622; + sub.f32 %r7674, %r6353, %r7621; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7675, %r7552, %r7674; + mul.f32 %r7676, %r7553, %r7673; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7063, %r7676, %r7675; + cvt.rn.bf16x2.f32 %r7064, %r7652, %r7651; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7677, %r6358, %r7620; + sub.f32 %r7678, %r6357, %r7619; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7679, %r7556, %r7678; + mul.f32 %r7680, %r7557, %r7677; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7193, %r7680, %r7679; + cvt.rn.bf16x2.f32 %r7194, %r7654, %r7653; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7681, %r6362, %r7618; + sub.f32 %r7682, %r6361, %r7617; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7683, %r7560, %r7682; + mul.f32 %r7684, %r7561, %r7681; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7195, %r7684, %r7683; + cvt.rn.bf16x2.f32 %r7196, %r7656, %r7655; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7685, %r6366, %r7616; + sub.f32 %r7686, %r6365, %r7615; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7687, %r7564, %r7686; + mul.f32 %r7688, %r7565, %r7685; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7325, %r7688, %r7687; + cvt.rn.bf16x2.f32 %r7326, %r7658, %r7657; + .loc 1 751 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.f32 %r7689, %r6370, %r7614; + sub.f32 %r7690, %r6369, %r7613; + .loc 1 751 16 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:751:16 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.f32 %r7691, %r7568, %r7690; + mul.f32 %r7692, %r7569, %r7689; + .loc 1 775 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + cvt.rn.bf16x2.f32 %r7327, %r7692, %r7691; + cvt.rn.bf16x2.f32 %r7328, %r7660, %r7659; + .loc 1 775 43 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:775:43 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r6929,%r6930,%r6931,%r6932}, %rd534, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7693, %r5714, 2048; + bfe.u32 %r7694, %r7693, 4, 14; + cvt.u64.u32 %rd638, %r7694; + or.b64 %rd570, %rd638, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7061,%r7062,%r7063,%r7064}, %rd570, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7695, %r5714, 4096; + bfe.u32 %r7696, %r7695, 4, 14; + cvt.u64.u32 %rd639, %r7696; + or.b64 %rd571, %rd639, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7193,%r7194,%r7195,%r7196}, %rd571, %p219, 1, 1, 1; + // end inline asm + add.s32 %r7697, %r5714, 6144; + bfe.u32 %r7698, %r7697, 4, 14; + cvt.u64.u32 %rd640, %r7698; + or.b64 %rd572, %rd640, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13518,%r13519,%r13520,%r13521,%r13522,%r13523,%r13524,%r13525,%r13526,%r13527,%r13528,%r13529,%r13530,%r13531,%r13532,%r13533,%r13534,%r13535,%r13536,%r13537,%r13538,%r13539,%r13540,%r13541,%r13542,%r13543,%r13544,%r13545,%r13546,%r13547,%r13548,%r13549,%r13550,%r13551,%r13552,%r13553,%r13554,%r13555,%r13556,%r13557,%r13558,%r13559,%r13560,%r13561,%r13562,%r13563,%r13564,%r13565,%r13566,%r13567,%r13568,%r13569,%r13570,%r13571,%r13572,%r13573,%r13574,%r13575,%r13576,%r13577,%r13578,%r13579,%r13580,%r13581}, {%r7325,%r7326,%r7327,%r7328}, %rd572, %p219, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r1674, %r13867, 1; + .loc 1 788 33 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:788:33 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shr.u32 %r7699, %r1674, 1; + .loc 1 789 38 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:38 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mad.wide.u32 %rd574, %r7699, 4, %rd219; + .loc 1 789 24 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:789:24 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + mov.u64 %rd573, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd573, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r7329, 0x0; + @%p241 ld.global.L1::evict_last.L2::cache_hint.b32 { %r7329 }, [ %rd574 + 0 ], %rd573; + // end inline asm + .loc 1 790 109 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:109 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7700, %r7699, 1; + .loc 1 790 113 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:113 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + setp.lt.s32 %p278, %r7700, %r2015; + .loc 1 790 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:55 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd577, %rd574, 4; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + and.pred %p242, %p241, %p278; + .loc 1 790 25 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:790:25 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + // begin inline asm + mov.u64 %rd576, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd576, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r7330, 0x0; + @%p242 ld.global.L1::evict_last.L2::cache_hint.b32 { %r7330 }, [ %rd577 + 0 ], %rd576; + // end inline asm + .loc 1 791 35 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:791:35 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + and.b32 %r7701, %r13867, 1; + .loc 1 792 34 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:34 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + sub.s32 %r7702, %r7330, %r7329; + .loc 1 792 48 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:48 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7703, %r7702, 7; + .loc 1 792 63 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:792:63 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7704, %r7703, -64; + .loc 1 793 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + xor.b32 %r7705, %r7701, 1; + .loc 1 793 61 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:61 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7706, %r7701, 6; + .loc 1 793 42 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:793:42 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mad.lo.s32 %r7707, %r7704, %r7705, %r7706; + .loc 1 626 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7708, %r7707, 12; + .loc 1 626 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:626:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.wide.s32 %rd641, %r7708, 2; + add.s64 %rd1039, %rd1039, %rd641; + add.s64 %rd1038, %rd1038, %rd641; + add.s64 %rd1037, %rd1037, %rd641; + add.s64 %rd1036, %rd1036, %rd641; + .loc 1 627 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7709, %r7707, 7; + .loc 1 627 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:627:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.wide.s32 %rd642, %r7709, 2; + add.s64 %rd1035, %rd1035, %rd642; + add.s64 %rd1034, %rd1034, %rd642; + add.s64 %rd1033, %rd1033, %rd642; + add.s64 %rd1032, %rd1032, %rd642; + .loc 1 628 19 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:628:19 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r13738, %r7707, %r13738; + add.s32 %r13737, %r7707, %r13737; + add.s32 %r13736, %r7707, %r13736; + add.s32 %r13735, %r7707, %r13735; + add.s32 %r13734, %r7707, %r13734; + add.s32 %r13733, %r7707, %r13733; + add.s32 %r13732, %r7707, %r13732; + add.s32 %r13731, %r7707, %r13731; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7710, %r13728, 1; + setp.gt.s32 %p279, %r7710, 1; + selp.b32 %r13728, 0, %r7710, %p279; + add.s32 %r7711, %r13730, 1; + setp.gt.s32 %p280, %r7711, 2; + selp.b32 %r13730, 0, %r7711, %p280; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7712, %r13730, 14; + add.s32 %r7713, %r2045, %r7712; + bar.sync 0; + add.s32 %r7331, %r7713, %r606; + selp.b32 %r7332, 16, 0, %p259; + // begin inline asm + cp.async.cg.shared.global [ %r7331 + 0 ], [ %rd1039 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7333, %r7331, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r7333 + 0 ], [ %rd1038 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7335, %r7331, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r7335 + 0 ], [ %rd1037 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7337, %r7331, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r7337 + 0 ], [ %rd1036 + 0 ], 0x10, %r7332; + // end inline asm + cp.async.commit_group; + .loc 1 672 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + mul.wide.s32 %rd643, %r13738, 4; + add.s64 %rd583, %rd128, %rd643; + mul.wide.s32 %rd644, %r13737, 4; + add.s64 %rd584, %rd128, %rd644; + mul.wide.s32 %rd645, %r13736, 4; + add.s64 %rd585, %rd128, %rd645; + mul.wide.s32 %rd646, %r13735, 4; + add.s64 %rd586, %rd128, %rd646; + mul.wide.s32 %rd647, %r13734, 4; + add.s64 %rd587, %rd128, %rd647; + mul.wide.s32 %rd648, %r13733, 4; + add.s64 %rd588, %rd128, %rd648; + mul.wide.s32 %rd649, %r13732, 4; + add.s64 %rd589, %rd128, %rd649; + mul.wide.s32 %rd650, %r13731, 4; + add.s64 %rd590, %rd128, %rd650; + .loc 1 672 22 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:672:22 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + shl.b32 %r7714, %r13728, 8; + add.s32 %r7715, %r7384, %r7714; + add.s32 %r7339, %r7715, %r613; + selp.b32 %r7340, 8, 0, %p259; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7339 + 0 ], [ %rd583 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7341, %r7339, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7341 + 0 ], [ %rd584 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7343, %r7339, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7343 + 0 ], [ %rd585 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7345, %r7339, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7345 + 0 ], [ %rd586 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7347, %r7339, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7347 + 0 ], [ %rd587 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7349, %r7339, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7349 + 0 ], [ %rd588 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7351, %r7339, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7351 + 0 ], [ %rd589 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7353, %r7339, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7353 + 0 ], [ %rd590 + 0 ], 0x8, %r7340; + // end inline asm + cp.async.commit_group; + .loc 1 835 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:835:23 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7716, %r7572, %r7712; + add.s32 %r7355, %r7716, %r606; + // begin inline asm + cp.async.cg.shared.global [ %r7355 + 0 ], [ %rd1035 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7357, %r7355, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r7357 + 0 ], [ %rd1034 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7359, %r7355, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r7359 + 0 ], [ %rd1033 + 0 ], 0x10, %r7332; + // end inline asm + add.s32 %r7361, %r7355, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r7361 + 0 ], [ %rd1032 + 0 ], 0x10, %r7332; + // end inline asm + cp.async.commit_group; + .loc 1 746 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:29 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s64 %rd595, %rd129, %rd643; + add.s64 %rd596, %rd129, %rd644; + add.s64 %rd597, %rd129, %rd645; + add.s64 %rd598, %rd129, %rd646; + add.s64 %rd599, %rd129, %rd647; + add.s64 %rd600, %rd129, %rd648; + add.s64 %rd601, %rd129, %rd649; + add.s64 %rd602, %rd129, %rd650; + .loc 1 746 21 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:746:21 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + add.s32 %r7717, %r7580, %r7714; + add.s32 %r7363, %r7717, %r613; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7363 + 0 ], [ %rd595 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7365, %r7363, 32; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7365 + 0 ], [ %rd596 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7367, %r7363, 64; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7367 + 0 ], [ %rd597 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7369, %r7363, 96; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7369 + 0 ], [ %rd598 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7371, %r7363, 128; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7371 + 0 ], [ %rd599 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7373, %r7363, 160; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7373 + 0 ], [ %rd600 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7375, %r7363, 192; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7375 + 0 ], [ %rd601 + 0 ], 0x8, %r7340; + // end inline asm + add.s32 %r7377, %r7363, 224; + // begin inline asm + @%p186 cp.async.ca.shared.global [ %r7377 + 0 ], [ %rd602 + 0 ], 0x8, %r7340; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:610:28 @[ c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:318:20 ] + setp.ne.b32 %p281, %r686, %r1674; + mov.b32 %r13867, %r1674; + @%p281 bra $L__BB0_14; + bra.uni $L__BB0_15; +$L__tmp29: +$L__BB0_16: + .loc 1 0 28 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:0:28 + cvt.u32.u64 %r8118, %rd87; + cvt.u32.u64 %r8119, %rd86; + cvt.u32.u64 %r8120, %rd85; + cvt.u32.u64 %r8121, %rd84; + cvt.u32.u64 %r8122, %rd83; + cvt.u32.u64 %r8123, %rd82; + cvt.u32.u64 %r8124, %rd81; + cvt.u32.u64 %r8125, %rd80; + cvt.u32.u64 %r8126, %rd79; + .loc 1 323 23 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:323:23 + shl.b64 %rd667, %rd79, 1; + add.s64 %rd668, %rd3, %rd667; + shl.b64 %rd669, %rd80, 1; + add.s64 %rd670, %rd3, %rd669; + shl.b64 %rd671, %rd81, 1; + add.s64 %rd672, %rd3, %rd671; + shl.b64 %rd673, %rd82, 1; + add.s64 %rd674, %rd3, %rd673; + shl.b64 %rd675, %rd83, 1; + add.s64 %rd676, %rd3, %rd675; + shl.b64 %rd677, %rd84, 1; + add.s64 %rd678, %rd3, %rd677; + shl.b64 %rd679, %rd85, 1; + add.s64 %rd680, %rd3, %rd679; + shl.b64 %rd681, %rd86, 1; + add.s64 %rd682, %rd3, %rd681; + .loc 1 323 55 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:323:55 + add.s64 %rd651, %rd668, %rd307; + add.s64 %rd652, %rd670, %rd307; + add.s64 %rd653, %rd672, %rd307; + add.s64 %rd654, %rd674, %rd307; + add.s64 %rd655, %rd676, %rd307; + add.s64 %rd656, %rd678, %rd307; + add.s64 %rd657, %rd680, %rd307; + add.s64 %rd658, %rd682, %rd307; + .loc 1 330 30 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:330:30 + cvt.rn.bf16x2.f32 %r8127, %r13455, %r13454; + cvt.rn.bf16x2.f32 %r8128, %r13457, %r13456; + cvt.rn.bf16x2.f32 %r8129, %r13459, %r13458; + cvt.rn.bf16x2.f32 %r8130, %r13461, %r13460; + cvt.rn.bf16x2.f32 %r8131, %r13463, %r13462; + cvt.rn.bf16x2.f32 %r8132, %r13465, %r13464; + cvt.rn.bf16x2.f32 %r8133, %r13467, %r13466; + cvt.rn.bf16x2.f32 %r8134, %r13469, %r13468; + cvt.rn.bf16x2.f32 %r8135, %r13471, %r13470; + cvt.rn.bf16x2.f32 %r8136, %r13473, %r13472; + cvt.rn.bf16x2.f32 %r8137, %r13475, %r13474; + cvt.rn.bf16x2.f32 %r8138, %r13477, %r13476; + cvt.rn.bf16x2.f32 %r8139, %r13479, %r13478; + cvt.rn.bf16x2.f32 %r8140, %r13481, %r13480; + cvt.rn.bf16x2.f32 %r8141, %r13483, %r13482; + cvt.rn.bf16x2.f32 %r8142, %r13485, %r13484; + cvt.rn.bf16x2.f32 %r8143, %r13487, %r13486; + cvt.rn.bf16x2.f32 %r8144, %r13489, %r13488; + cvt.rn.bf16x2.f32 %r8145, %r13491, %r13490; + cvt.rn.bf16x2.f32 %r8146, %r13493, %r13492; + cvt.rn.bf16x2.f32 %r8147, %r13495, %r13494; + cvt.rn.bf16x2.f32 %r8148, %r13497, %r13496; + cvt.rn.bf16x2.f32 %r8149, %r13499, %r13498; + cvt.rn.bf16x2.f32 %r8150, %r13501, %r13500; + cvt.rn.bf16x2.f32 %r8151, %r13503, %r13502; + cvt.rn.bf16x2.f32 %r8152, %r13505, %r13504; + cvt.rn.bf16x2.f32 %r8153, %r13507, %r13506; + cvt.rn.bf16x2.f32 %r8154, %r13509, %r13508; + cvt.rn.bf16x2.f32 %r8155, %r13511, %r13510; + cvt.rn.bf16x2.f32 %r8156, %r13513, %r13512; + cvt.rn.bf16x2.f32 %r8157, %r13515, %r13514; + cvt.rn.bf16x2.f32 %r8158, %r13517, %r13516; + shl.b32 %r8159, %r599, 13; + shl.b32 %r8160, %r6, 5; + and.b32 %r8161, %r8160, 7264; + and.b32 %r8162, %r6, 24; + shl.b32 %r8163, %r8162, 4; + shl.b32 %r8164, %r6, 2; + and.b32 %r8165, %r8164, 16; + or.b32 %r8166, %r8159, %r8165; + or.b32 %r8167, %r8161, %r8163; + or.b32 %r8168, %r8166, %r8167; + add.s32 %r8170, %r2045, %r8168; + st.shared.v4.b32 [%r8170], {%r8127, %r8129, %r8131, %r8133}; + st.shared.v4.b32 [%r8170+512], {%r8128, %r8130, %r8132, %r8134}; + xor.b32 %r8171, %r8168, 32; + add.s32 %r8172, %r2045, %r8171; + st.shared.v4.b32 [%r8172], {%r8135, %r8137, %r8139, %r8141}; + st.shared.v4.b32 [%r8172+512], {%r8136, %r8138, %r8140, %r8142}; + xor.b32 %r8173, %r8168, 64; + add.s32 %r8174, %r2045, %r8173; + st.shared.v4.b32 [%r8174], {%r8143, %r8145, %r8147, %r8149}; + st.shared.v4.b32 [%r8174+512], {%r8144, %r8146, %r8148, %r8150}; + xor.b32 %r8175, %r8168, 96; + add.s32 %r8176, %r2045, %r8175; + st.shared.v4.b32 [%r8176], {%r8151, %r8153, %r8155, %r8157}; + st.shared.v4.b32 [%r8176+512], {%r8152, %r8154, %r8156, %r8158}; + bar.sync 0; + shl.b32 %r8177, %r8162, 10; + shl.b32 %r8178, %r599, 5; + and.b32 %r8179, %r8164, 1008; + or.b32 %r8180, %r8177, %r8178; + xor.b32 %r8181, %r8180, %r8179; + add.s32 %r7978, %r2045, %r8181; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8014, %r8015, %r8016, %r8017}, [%r7978]; + // end inline asm + add.s32 %r7983, %r7978, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8018, %r8019, %r8020, %r8021}, [%r7983]; + // end inline asm + add.s32 %r7988, %r7978, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8022, %r8023, %r8024, %r8025}, [%r7988]; + // end inline asm + add.s32 %r7993, %r7978, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8026, %r8027, %r8028, %r8029}, [%r7993]; + // end inline asm + add.s32 %r7998, %r7978, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8030, %r8031, %r8032, %r8033}, [%r7998]; + // end inline asm + add.s32 %r8003, %r7978, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8034, %r8035, %r8036, %r8037}, [%r8003]; + // end inline asm + add.s32 %r8008, %r7978, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8038, %r8039, %r8040, %r8041}, [%r8008]; + // end inline asm + add.s32 %r8013, %r7978, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8042, %r8043, %r8044, %r8045}, [%r8013]; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd651 + 0 ], { %r8014, %r8015, %r8016, %r8017 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd652 + 0 ], { %r8018, %r8019, %r8020, %r8021 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd653 + 0 ], { %r8022, %r8023, %r8024, %r8025 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd654 + 0 ], { %r8026, %r8027, %r8028, %r8029 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd655 + 0 ], { %r8030, %r8031, %r8032, %r8033 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd656 + 0 ], { %r8034, %r8035, %r8036, %r8037 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd657 + 0 ], { %r8038, %r8039, %r8040, %r8041 }; + // end inline asm + // begin inline asm + st.global.v4.b32 [ %rd658 + 0 ], { %r8042, %r8043, %r8044, %r8045 }; + // end inline asm + .loc 1 334 14 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:334:14 + mul.f32 %r8182, %r13518, 0f3DB504F3; + mul.f32 %r8183, %r13519, 0f3DB504F3; + mul.f32 %r8184, %r13520, 0f3DB504F3; + mul.f32 %r8185, %r13521, 0f3DB504F3; + mul.f32 %r8186, %r13522, 0f3DB504F3; + mul.f32 %r8187, %r13523, 0f3DB504F3; + mul.f32 %r8188, %r13524, 0f3DB504F3; + mul.f32 %r8189, %r13525, 0f3DB504F3; + mul.f32 %r8190, %r13526, 0f3DB504F3; + mul.f32 %r8191, %r13527, 0f3DB504F3; + mul.f32 %r8192, %r13528, 0f3DB504F3; + mul.f32 %r8193, %r13529, 0f3DB504F3; + mul.f32 %r8194, %r13530, 0f3DB504F3; + mul.f32 %r8195, %r13531, 0f3DB504F3; + mul.f32 %r8196, %r13532, 0f3DB504F3; + mul.f32 %r8197, %r13533, 0f3DB504F3; + mul.f32 %r8198, %r13534, 0f3DB504F3; + mul.f32 %r8199, %r13535, 0f3DB504F3; + mul.f32 %r8200, %r13536, 0f3DB504F3; + mul.f32 %r8201, %r13537, 0f3DB504F3; + mul.f32 %r8202, %r13538, 0f3DB504F3; + mul.f32 %r8203, %r13539, 0f3DB504F3; + mul.f32 %r8204, %r13540, 0f3DB504F3; + mul.f32 %r8205, %r13541, 0f3DB504F3; + mul.f32 %r8206, %r13542, 0f3DB504F3; + mul.f32 %r8207, %r13543, 0f3DB504F3; + mul.f32 %r8208, %r13544, 0f3DB504F3; + mul.f32 %r8209, %r13545, 0f3DB504F3; + mul.f32 %r8210, %r13546, 0f3DB504F3; + mul.f32 %r8211, %r13547, 0f3DB504F3; + mul.f32 %r8212, %r13548, 0f3DB504F3; + mul.f32 %r8213, %r13549, 0f3DB504F3; + mul.f32 %r8214, %r13550, 0f3DB504F3; + mul.f32 %r8215, %r13551, 0f3DB504F3; + mul.f32 %r8216, %r13552, 0f3DB504F3; + mul.f32 %r8217, %r13553, 0f3DB504F3; + mul.f32 %r8218, %r13554, 0f3DB504F3; + mul.f32 %r8219, %r13555, 0f3DB504F3; + mul.f32 %r8220, %r13556, 0f3DB504F3; + mul.f32 %r8221, %r13557, 0f3DB504F3; + mul.f32 %r8222, %r13558, 0f3DB504F3; + mul.f32 %r8223, %r13559, 0f3DB504F3; + mul.f32 %r8224, %r13560, 0f3DB504F3; + mul.f32 %r8225, %r13561, 0f3DB504F3; + mul.f32 %r8226, %r13562, 0f3DB504F3; + mul.f32 %r8227, %r13563, 0f3DB504F3; + mul.f32 %r8228, %r13564, 0f3DB504F3; + mul.f32 %r8229, %r13565, 0f3DB504F3; + mul.f32 %r8230, %r13566, 0f3DB504F3; + mul.f32 %r8231, %r13567, 0f3DB504F3; + mul.f32 %r8232, %r13568, 0f3DB504F3; + mul.f32 %r8233, %r13569, 0f3DB504F3; + mul.f32 %r8234, %r13570, 0f3DB504F3; + mul.f32 %r8235, %r13571, 0f3DB504F3; + mul.f32 %r8236, %r13572, 0f3DB504F3; + mul.f32 %r8237, %r13573, 0f3DB504F3; + mul.f32 %r8238, %r13574, 0f3DB504F3; + mul.f32 %r8239, %r13575, 0f3DB504F3; + mul.f32 %r8240, %r13576, 0f3DB504F3; + mul.f32 %r8241, %r13577, 0f3DB504F3; + mul.f32 %r8242, %r13578, 0f3DB504F3; + mul.f32 %r8243, %r13579, 0f3DB504F3; + mul.f32 %r8244, %r13580, 0f3DB504F3; + mul.f32 %r8245, %r13581, 0f3DB504F3; + .loc 1 344 58 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:344:58 + or.b32 %r8246, %r8126, %r5; + or.b32 %r8247, %r8246, %r8118; + or.b32 %r8248, %r8125, %r5; + or.b32 %r8249, %r8248, %r8118; + or.b32 %r8250, %r8124, %r5; + or.b32 %r8251, %r8250, %r8118; + or.b32 %r8252, %r8123, %r5; + or.b32 %r8253, %r8252, %r8118; + or.b32 %r8254, %r8122, %r5; + or.b32 %r8255, %r8254, %r8118; + or.b32 %r8256, %r8121, %r5; + or.b32 %r8257, %r8256, %r8118; + or.b32 %r8258, %r8120, %r5; + or.b32 %r8259, %r8258, %r8118; + or.b32 %r8260, %r8119, %r5; + or.b32 %r8261, %r8260, %r8118; + .loc 1 345 29 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:345:29 + mad.wide.s32 %rd659, %r8247, 2, %rd194; + mad.wide.s32 %rd660, %r8249, 2, %rd194; + mad.wide.s32 %rd661, %r8251, 2, %rd194; + mad.wide.s32 %rd662, %r8253, 2, %rd194; + mad.wide.s32 %rd663, %r8255, 2, %rd194; + mad.wide.s32 %rd664, %r8257, 2, %rd194; + mad.wide.s32 %rd665, %r8259, 2, %rd194; + mad.wide.s32 %rd666, %r8261, 2, %rd194; + .loc 1 345 69 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:345:69 + cvt.rn.bf16x2.f32 %r8262, %r8183, %r8182; + cvt.rn.bf16x2.f32 %r8263, %r8185, %r8184; + cvt.rn.bf16x2.f32 %r8264, %r8187, %r8186; + cvt.rn.bf16x2.f32 %r8265, %r8189, %r8188; + cvt.rn.bf16x2.f32 %r8266, %r8191, %r8190; + cvt.rn.bf16x2.f32 %r8267, %r8193, %r8192; + cvt.rn.bf16x2.f32 %r8268, %r8195, %r8194; + cvt.rn.bf16x2.f32 %r8269, %r8197, %r8196; + cvt.rn.bf16x2.f32 %r8270, %r8199, %r8198; + cvt.rn.bf16x2.f32 %r8271, %r8201, %r8200; + cvt.rn.bf16x2.f32 %r8272, %r8203, %r8202; + cvt.rn.bf16x2.f32 %r8273, %r8205, %r8204; + cvt.rn.bf16x2.f32 %r8274, %r8207, %r8206; + cvt.rn.bf16x2.f32 %r8275, %r8209, %r8208; + cvt.rn.bf16x2.f32 %r8276, %r8211, %r8210; + cvt.rn.bf16x2.f32 %r8277, %r8213, %r8212; + cvt.rn.bf16x2.f32 %r8278, %r8215, %r8214; + cvt.rn.bf16x2.f32 %r8279, %r8217, %r8216; + cvt.rn.bf16x2.f32 %r8280, %r8219, %r8218; + cvt.rn.bf16x2.f32 %r8281, %r8221, %r8220; + cvt.rn.bf16x2.f32 %r8282, %r8223, %r8222; + cvt.rn.bf16x2.f32 %r8283, %r8225, %r8224; + cvt.rn.bf16x2.f32 %r8284, %r8227, %r8226; + cvt.rn.bf16x2.f32 %r8285, %r8229, %r8228; + cvt.rn.bf16x2.f32 %r8286, %r8231, %r8230; + cvt.rn.bf16x2.f32 %r8287, %r8233, %r8232; + cvt.rn.bf16x2.f32 %r8288, %r8235, %r8234; + cvt.rn.bf16x2.f32 %r8289, %r8237, %r8236; + cvt.rn.bf16x2.f32 %r8290, %r8239, %r8238; + cvt.rn.bf16x2.f32 %r8291, %r8241, %r8240; + cvt.rn.bf16x2.f32 %r8292, %r8243, %r8242; + cvt.rn.bf16x2.f32 %r8293, %r8245, %r8244; + bar.sync 0; + st.shared.v4.b32 [%r8170], {%r8262, %r8264, %r8266, %r8268}; + st.shared.v4.b32 [%r8170+512], {%r8263, %r8265, %r8267, %r8269}; + st.shared.v4.b32 [%r8172], {%r8270, %r8272, %r8274, %r8276}; + st.shared.v4.b32 [%r8172+512], {%r8271, %r8273, %r8275, %r8277}; + st.shared.v4.b32 [%r8174], {%r8278, %r8280, %r8282, %r8284}; + st.shared.v4.b32 [%r8174+512], {%r8279, %r8281, %r8283, %r8285}; + st.shared.v4.b32 [%r8176], {%r8286, %r8288, %r8290, %r8292}; + st.shared.v4.b32 [%r8176+512], {%r8287, %r8289, %r8291, %r8293}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8086, %r8087, %r8088, %r8089}, [%r7978]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8090, %r8091, %r8092, %r8093}, [%r7983]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8094, %r8095, %r8096, %r8097}, [%r7988]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8098, %r8099, %r8100, %r8101}, [%r7993]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8102, %r8103, %r8104, %r8105}, [%r7998]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8106, %r8107, %r8108, %r8109}, [%r8003]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8110, %r8111, %r8112, %r8113}, [%r8008]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r8114, %r8115, %r8116, %r8117}, [%r8013]; + // end inline asm + mov.pred %p283, -1; + // begin inline asm + @%p283 st.global.v4.b32 [ %rd659 + 0 ], { %r8086, %r8087, %r8088, %r8089 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd660 + 0 ], { %r8090, %r8091, %r8092, %r8093 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd661 + 0 ], { %r8094, %r8095, %r8096, %r8097 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd662 + 0 ], { %r8098, %r8099, %r8100, %r8101 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd663 + 0 ], { %r8102, %r8103, %r8104, %r8105 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd664 + 0 ], { %r8106, %r8107, %r8108, %r8109 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd665 + 0 ], { %r8110, %r8111, %r8112, %r8113 }; + // end inline asm + // begin inline asm + @%p283 st.global.v4.b32 [ %rd666 + 0 ], { %r8114, %r8115, %r8116, %r8117 }; + // end inline asm +$L__BB0_17: + .loc 1 139 4 // c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py:139:4 + ret; +$L__tmp30: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 6 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 404 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x18d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 119 +.b8 119 +.b8 53 +.b8 112 +.b8 109 +.b8 108 +.b8 114 +.b8 54 +.b8 97 +.b8 109 +.b8 101 +.b8 114 +.b8 112 +.b8 114 +.b8 104 +.b8 55 +.b8 118 +.b8 51 +.b8 105 +.b8 98 +.b8 105 +.b8 111 +.b8 104 +.b8 51 +.b8 121 +.b8 118 +.b8 98 +.b8 104 +.b8 101 +.b8 109 +.b8 100 +.b8 113 +.b8 115 +.b8 104 +.b8 55 +.b8 103 +.b8 99 +.b8 114 +.b8 108 +.b8 120 +.b8 106 +.b8 110 +.b8 104 +.b8 110 +.b8 112 +.b8 107 +.b8 107 +.b8 116 +.b8 114 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 119 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0xf1 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 178 // DW_AT_call_line +.b8 107 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp2 // DW_AT_low_pc +.b64 $L__tmp3 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 179 // DW_AT_call_line +.b8 111 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xeb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 207 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x103:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp13 // DW_AT_low_pc +.b64 $L__tmp14 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 226 // DW_AT_call_line +.b8 16 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x132:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp15 // DW_AT_low_pc +.b64 $L__tmp16 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x14b:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp16 // DW_AT_low_pc +.b64 $L__tmp17 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 1 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x164:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp18 // DW_AT_low_pc +.b64 $L__tmp28 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 1 +.b8 16 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x17d:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp20 // DW_AT_low_pc +.b64 $L__tmp29 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 62 // DW_AT_call_line +.b8 1 +.b8 20 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..8a81d50a991a931095d85d902ce5d86e9e19813c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.source @@ -0,0 +1,2072 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":18:0) +#loc201 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":812:0) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":348:0) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":423:0) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":797:0) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":781:0) +#loc342 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":559:0) +#loc374 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":634:0) +#loc445 = loc("arg_Q"(#loc)) +#loc446 = loc("arg_K"(#loc)) +#loc447 = loc("arg_V"(#loc)) +#loc448 = loc("arg_LSE"(#loc)) +#loc449 = loc("arg_DELTA"(#loc)) +#loc450 = loc("arg_DO"(#loc)) +#loc451 = loc("arg_DQ"(#loc)) +#loc452 = loc("arg_DV"(#loc)) +#loc453 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc454 = loc("arg_KV_IDX"(#loc)) +#loc455 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc456 = loc("arg_Q_IDX"(#loc)) +#loc457 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc458 = loc("arg_FULL_KV_IDX"(#loc)) +#loc459 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc460 = loc("arg_FULL_Q_IDX"(#loc)) +#loc461 = loc("in_ptr16"(#loc)) +#loc462 = loc("out_ptr0"(#loc)) +#loc647 = loc("x"(#loc201)) +#loc648 = loc("ptr"(#loc211)) +#loc649 = loc("offs_m"(#loc211)) +#loc650 = loc("offs_n"(#loc211)) +#loc651 = loc("stride_m"(#loc211)) +#loc652 = loc("stride_n"(#loc211)) +#loc653 = loc("M_LEN"(#loc211)) +#loc660 = loc("arg_Q"(#loc221)) +#loc661 = loc("arg_K"(#loc221)) +#loc662 = loc("arg_V"(#loc221)) +#loc663 = loc("arg_LSE"(#loc221)) +#loc664 = loc("arg_DELTA"(#loc221)) +#loc665 = loc("arg_DO"(#loc221)) +#loc666 = loc("arg_DQ"(#loc221)) +#loc667 = loc("arg_DV"(#loc221)) +#loc668 = loc("arg_KV_NUM_BLKS"(#loc221)) +#loc669 = loc("arg_KV_IDX"(#loc221)) +#loc670 = loc("arg_Q_NUM_BLKS"(#loc221)) +#loc671 = loc("arg_Q_IDX"(#loc221)) +#loc672 = loc("arg_FULL_KV_NUM_BLKS"(#loc221)) +#loc673 = loc("arg_FULL_KV_IDX"(#loc221)) +#loc674 = loc("arg_FULL_Q_NUM_BLKS"(#loc221)) +#loc675 = loc("arg_FULL_Q_IDX"(#loc221)) +#loc676 = loc("in_ptr16"(#loc221)) +#loc677 = loc("out_ptr0"(#loc221)) +#loc678 = loc("K"(#loc221)) +#loc679 = loc("V"(#loc221)) +#loc680 = loc("dq"(#loc221)) +#loc681 = loc("q"(#loc221)) +#loc682 = loc("do"(#loc221)) +#loc683 = loc("Di"(#loc221)) +#loc684 = loc("lse"(#loc221)) +#loc685 = loc("off_z"(#loc221)) +#loc686 = loc("off_hq"(#loc221)) +#loc687 = loc("offs_m2"(#loc221)) +#loc688 = loc("offs_n2"(#loc221)) +#loc689 = loc("stride_kn"(#loc221)) +#loc690 = loc("stride_kd"(#loc221)) +#loc691 = loc("stride_vn"(#loc221)) +#loc692 = loc("stride_vd"(#loc221)) +#loc693 = loc("kv_indices"(#loc221)) +#loc694 = loc("sparse_kv_num_blocks"(#loc221)) +#loc723 = loc("arg_Q"(#loc253)) +#loc724 = loc("arg_K"(#loc253)) +#loc725 = loc("arg_V"(#loc253)) +#loc726 = loc("arg_LSE"(#loc253)) +#loc727 = loc("arg_DELTA"(#loc253)) +#loc728 = loc("arg_DO"(#loc253)) +#loc729 = loc("arg_DQ"(#loc253)) +#loc730 = loc("arg_DV"(#loc253)) +#loc731 = loc("arg_KV_NUM_BLKS"(#loc253)) +#loc732 = loc("arg_KV_IDX"(#loc253)) +#loc733 = loc("arg_Q_NUM_BLKS"(#loc253)) +#loc734 = loc("arg_Q_IDX"(#loc253)) +#loc735 = loc("arg_FULL_KV_NUM_BLKS"(#loc253)) +#loc736 = loc("arg_FULL_KV_IDX"(#loc253)) +#loc737 = loc("arg_FULL_Q_NUM_BLKS"(#loc253)) +#loc738 = loc("arg_FULL_Q_IDX"(#loc253)) +#loc739 = loc("in_ptr16"(#loc253)) +#loc740 = loc("out_ptr0"(#loc253)) +#loc741 = loc("dq"(#loc253)) +#loc742 = loc("q"(#loc253)) +#loc743 = loc("kT_ptrs"(#loc253)) +#loc744 = loc("vT_ptrs"(#loc253)) +#loc745 = loc("do"(#loc253)) +#loc746 = loc("Di"(#loc253)) +#loc747 = loc("lse"(#loc253)) +#loc748 = loc("Q_LEN"(#loc253)) +#loc749 = loc("KV_LEN"(#loc253)) +#loc750 = loc("off_z"(#loc253)) +#loc751 = loc("off_hq"(#loc253)) +#loc752 = loc("offs_m2"(#loc253)) +#loc753 = loc("offs_n2"(#loc253)) +#loc754 = loc("offs_k"(#loc253)) +#loc755 = loc("offs_v"(#loc253)) +#loc756 = loc("stride_kn"(#loc253)) +#loc757 = loc("stride_kd"(#loc253)) +#loc758 = loc("stride_vn"(#loc253)) +#loc759 = loc("stride_vd"(#loc253)) +#loc760 = loc("kv_indices"(#loc253)) +#loc761 = loc("sparse_kv_num_blocks"(#loc253)) +#loc824 = loc("N_LEN"(#loc211)) +#loc825 = loc("indices"(#loc318)) +#loc826 = loc("loop_iter"(#loc321)) +#loc827 = loc("col_indices"(#loc321)) +#loc828 = loc("total_blocks"(#loc321)) +#loc847 = loc("arg_Q"(#loc342)) +#loc848 = loc("arg_K"(#loc342)) +#loc849 = loc("arg_V"(#loc342)) +#loc850 = loc("arg_LSE"(#loc342)) +#loc851 = loc("arg_DELTA"(#loc342)) +#loc852 = loc("arg_DO"(#loc342)) +#loc853 = loc("arg_DQ"(#loc342)) +#loc854 = loc("arg_DV"(#loc342)) +#loc855 = loc("arg_KV_NUM_BLKS"(#loc342)) +#loc856 = loc("arg_KV_IDX"(#loc342)) +#loc857 = loc("arg_Q_NUM_BLKS"(#loc342)) +#loc858 = loc("arg_Q_IDX"(#loc342)) +#loc859 = loc("arg_FULL_KV_NUM_BLKS"(#loc342)) +#loc860 = loc("arg_FULL_KV_IDX"(#loc342)) +#loc861 = loc("arg_FULL_Q_NUM_BLKS"(#loc342)) +#loc862 = loc("arg_FULL_Q_IDX"(#loc342)) +#loc863 = loc("in_ptr16"(#loc342)) +#loc864 = loc("out_ptr0"(#loc342)) +#loc865 = loc("Q"(#loc342)) +#loc866 = loc("DO"(#loc342)) +#loc867 = loc("DELTA"(#loc342)) +#loc868 = loc("LSE"(#loc342)) +#loc869 = loc("dk"(#loc342)) +#loc870 = loc("dv"(#loc342)) +#loc871 = loc("k"(#loc342)) +#loc872 = loc("v"(#loc342)) +#loc873 = loc("off_z"(#loc342)) +#loc874 = loc("off_hq"(#loc342)) +#loc875 = loc("offs_n1"(#loc342)) +#loc876 = loc("offs_m1"(#loc342)) +#loc877 = loc("stride_qm"(#loc342)) +#loc878 = loc("stride_qd"(#loc342)) +#loc879 = loc("stride_dom"(#loc342)) +#loc880 = loc("stride_dod"(#loc342)) +#loc881 = loc("q_indices"(#loc342)) +#loc882 = loc("sparse_q_num_blocks"(#loc342)) +#loc910 = loc("arg_Q"(#loc374)) +#loc911 = loc("arg_K"(#loc374)) +#loc912 = loc("arg_V"(#loc374)) +#loc913 = loc("arg_LSE"(#loc374)) +#loc914 = loc("arg_DELTA"(#loc374)) +#loc915 = loc("arg_DO"(#loc374)) +#loc916 = loc("arg_DQ"(#loc374)) +#loc917 = loc("arg_DV"(#loc374)) +#loc918 = loc("arg_KV_NUM_BLKS"(#loc374)) +#loc919 = loc("arg_KV_IDX"(#loc374)) +#loc920 = loc("arg_Q_NUM_BLKS"(#loc374)) +#loc921 = loc("arg_Q_IDX"(#loc374)) +#loc922 = loc("arg_FULL_KV_NUM_BLKS"(#loc374)) +#loc923 = loc("arg_FULL_KV_IDX"(#loc374)) +#loc924 = loc("arg_FULL_Q_NUM_BLKS"(#loc374)) +#loc925 = loc("arg_FULL_Q_IDX"(#loc374)) +#loc926 = loc("in_ptr16"(#loc374)) +#loc927 = loc("out_ptr0"(#loc374)) +#loc928 = loc("dk"(#loc374)) +#loc929 = loc("dv"(#loc374)) +#loc930 = loc("qT_ptrs"(#loc374)) +#loc931 = loc("k"(#loc374)) +#loc932 = loc("v"(#loc374)) +#loc933 = loc("do_ptrs"(#loc374)) +#loc934 = loc("DELTA"(#loc374)) +#loc935 = loc("LSE"(#loc374)) +#loc936 = loc("Q_LEN"(#loc374)) +#loc937 = loc("KV_LEN"(#loc374)) +#loc938 = loc("off_z"(#loc374)) +#loc939 = loc("off_hq"(#loc374)) +#loc940 = loc("offs_n1"(#loc374)) +#loc941 = loc("offs_m1"(#loc374)) +#loc942 = loc("offs_k"(#loc374)) +#loc943 = loc("offs_v"(#loc374)) +#loc944 = loc("stride_qm"(#loc374)) +#loc945 = loc("stride_qd"(#loc374)) +#loc946 = loc("stride_dom"(#loc374)) +#loc947 = loc("stride_dod"(#loc374)) +#loc948 = loc("q_indices"(#loc374)) +#loc949 = loc("sparse_q_num_blocks"(#loc374)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc2) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc2) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc2) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc2) + %c2097152_i32_2 = arith.constant 2097152 : i32 loc(#loc3) + %c262144_i32_3 = arith.constant 262144 : i32 loc(#loc3) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc3) + %c1_i32_5 = arith.constant 1 : i32 loc(#loc3) + %c8388608_i32_6 = arith.constant 8388608 : i32 loc(#loc4) + %c262144_i32_7 = arith.constant 262144 : i32 loc(#loc4) + %c128_i32_8 = arith.constant 128 : i32 loc(#loc4) + %c1_i32_9 = arith.constant 1 : i32 loc(#loc4) + %c8388608_i32_10 = arith.constant 8388608 : i32 loc(#loc5) + %c128_i32_11 = arith.constant 128 : i32 loc(#loc5) + %c4096_i32_12 = arith.constant 4096 : i32 loc(#loc5) + %c1_i32_13 = arith.constant 1 : i32 loc(#loc5) + %c2097152_i32_14 = arith.constant 2097152 : i32 loc(#loc6) + %c262144_i32_15 = arith.constant 262144 : i32 loc(#loc6) + %c128_i32_16 = arith.constant 128 : i32 loc(#loc6) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc6) + %ZQ = arith.constant 2 : i32 loc(#loc463) + %HQ = arith.constant 32 : i32 loc(#loc464) + %HKV = arith.constant 8 : i32 loc(#loc465) + %Q_LEN = arith.constant 2048 : i32 loc(#loc466) + %ZKV = arith.constant 2 : i32 loc(#loc467) + %KV_LEN = arith.constant 2048 : i32 loc(#loc468) + %pid = tt.get_program_id x : i32 loc(#loc469) + %NUM_KV_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%KV_LEN) : (i32) -> i32 loc(#loc470) + %NUM_Q_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%Q_LEN) : (i32) -> i32 loc(#loc471) + %off_zq = tt.get_program_id y : i32 loc(#loc472) + %off_hkv = tt.get_program_id z : i32 loc(#loc473) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc474) + %SPARSE_Z = arith.constant 2 : i32 loc(#loc475) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc476) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc477) + %k_adj = arith.muli %c262144_i32, %off_hkv : i32 loc(#loc478) + %k_adj_18 = arith.muli %c2097152_i32, %off_zkv : i32 loc(#loc479) + %k_adj_19 = arith.addi %k_adj, %k_adj_18 : i32 loc(#loc480) + %k_adj_20 = arith.extsi %k_adj_19 : i32 to i64 loc(#loc481) + %v_adj = arith.muli %c262144_i32_3, %off_hkv : i32 loc(#loc482) + %v_adj_21 = arith.muli %c2097152_i32_2, %off_zkv : i32 loc(#loc483) + %v_adj_22 = arith.addi %v_adj, %v_adj_21 : i32 loc(#loc484) + %v_adj_23 = arith.extsi %v_adj_22 : i32 to i64 loc(#loc485) + %dv_adj = arith.muli %c262144_i32_15, %off_hkv : i32 loc(#loc486) + %dv_adj_24 = arith.muli %c2097152_i32_14, %off_zq : i32 loc(#loc487) + %dv_adj_25 = arith.addi %dv_adj, %dv_adj_24 : i32 loc(#loc488) + %dv_adj_26 = arith.extsi %dv_adj_25 : i32 to i64 loc(#loc489) + %K = tt.addptr %arg_K, %k_adj_20 : !tt.ptr, i64 loc(#loc490) + %V = tt.addptr %arg_V, %v_adj_23 : !tt.ptr, i64 loc(#loc491) + %DV = tt.addptr %arg_DV, %dv_adj_26 : !tt.ptr, i64 loc(#loc492) + %RCP_LN2 = arith.constant 1.44269502 : f32 loc(#loc493) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc494) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc495) + %0 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS : i32 loc(#loc40) + %1:2 = scf.if %0 -> (i32, i32) { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS : i32 loc(#loc496) + %SPARSE_Q_MULTIPLE = arith.constant 1 : i32 loc(#loc1018) + %SPARSE_KV_MULTIPLE = arith.constant 2 : i32 loc(#loc1019) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc499) + %off_hq2_27 = arith.constant 4 : i32 loc(#loc500) + %off_hq2_28 = arith.constant 4 : i32 loc(#loc500) + %off_hq2_29 = arith.muli %off_hkv, %off_hq2_28 : i32 loc(#loc500) + %off_hq2_30 = arith.addi %off_hq2, %off_hq2_29 : i32 loc(#loc501) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc502) + %off_pid_mask = arith.divsi %start_m2_block, %SPARSE_Q_MULTIPLE : i32 loc(#loc503) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc504) + %stride_kv_idx_h = arith.constant 256 : i32 loc(#loc505) + %stride_kv_idx_m = arith.constant 16 : i32 loc(#loc506) + %sparse_idx_hq2 = arith.remsi %off_hq2_30, %SPARSE_HQ : i32 loc(#loc507) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc508) + %sparse_hz_offset_31 = arith.addi %sparse_hz_offset, %sparse_idx_hq2 : i32 loc(#loc509) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_31, %stride_kv_num_blks_h : i32 loc(#loc510) + %sparse_kv_num_blks_offset_32 = arith.addi %sparse_kv_num_blks_offset, %off_pid_mask : i32 loc(#loc511) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_31, %stride_kv_idx_h : i32 loc(#loc512) + %sparse_kv_idx_offset_33 = arith.muli %off_pid_mask, %stride_kv_idx_m : i32 loc(#loc513) + %sparse_kv_idx_offset_34 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_33 : i32 loc(#loc514) + %q_adj2 = arith.muli %c128_i32, %off_hq2_30 : i32 loc(#loc515) + %q_adj2_35 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc516) + %q_adj2_36 = arith.addi %q_adj2, %q_adj2_35 : i32 loc(#loc517) + %q_adj2_37 = arith.extsi %q_adj2_36 : i32 to i64 loc(#loc518) + %do_adj2 = arith.muli %c262144_i32_7, %off_hq2_30 : i32 loc(#loc519) + %do_adj2_38 = arith.muli %c8388608_i32_6, %off_zq : i32 loc(#loc520) + %do_adj2_39 = arith.addi %do_adj2, %do_adj2_38 : i32 loc(#loc521) + %do_adj2_40 = arith.extsi %do_adj2_39 : i32 to i64 loc(#loc522) + %dq_adj2 = arith.muli %c128_i32_11, %off_hq2_30 : i32 loc(#loc523) + %dq_adj2_41 = arith.muli %c8388608_i32_10, %off_zq : i32 loc(#loc524) + %dq_adj2_42 = arith.addi %dq_adj2, %dq_adj2_41 : i32 loc(#loc525) + %dq_adj2_43 = arith.extsi %dq_adj2_42 : i32 to i64 loc(#loc526) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc527) + %off_chz2_44 = arith.addi %off_chz2, %off_hq2_30 : i32 loc(#loc528) + %off_chz2_45 = arith.muli %off_chz2_44, %Q_LEN : i32 loc(#loc529) + %off_chz2_46 = arith.extsi %off_chz2_45 : i32 to i64 loc(#loc530) + %Q2 = tt.addptr %arg_Q, %q_adj2_37 : !tt.ptr, i64 loc(#loc531) + %DO2 = tt.addptr %arg_DO, %do_adj2_40 : !tt.ptr, i64 loc(#loc532) + %DQ2 = tt.addptr %arg_DQ, %dq_adj2_43 : !tt.ptr, i64 loc(#loc533) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_46 : !tt.ptr, i64 loc(#loc534) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_46 : !tt.ptr, i64 loc(#loc535) + %dq = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc536) + %start_m2 = arith.constant 128 : i32 loc(#loc537) + %start_m2_47 = arith.constant 128 : i32 loc(#loc537) + %start_m2_48 = arith.muli %start_m2_block, %start_m2_47 : i32 loc(#loc537) + %offs_m2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc538) + %offs_m2_49 = tt.splat %start_m2_48 : i32 -> tensor<128xi32> loc(#loc539) + %offs_m2_50 = arith.addi %offs_m2_49, %offs_m2 : tensor<128xi32> loc(#loc539) + %q = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q2, %offs_m2_50, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc540) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%DO2, %offs_m2_50, %offs_v, %c128_i32_8, %c1_i32_9, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc541) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc542) + %Di_51 = tt.addptr %Di, %offs_m2_50 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc542) + %Di_52 = tt.load %Di_51 : tensor<128x!tt.ptr> loc(#loc543) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc544) + %lse_53 = tt.addptr %lse, %offs_m2_50 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc544) + %lse_54 = tt.load %lse_53 : tensor<128x!tt.ptr> loc(#loc545) + %lse_55 = arith.constant 0xFF800000 : f32 loc(#loc546) + %lse_56 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc546) + %lse_57 = arith.cmpf oeq, %lse_54, %lse_56 : tensor<128xf32> loc(#loc546) + %lse_58 = arith.constant 0.000000e+00 : f32 loc(#loc547) + %lse_59 = arith.constant 0.000000e+00 : f32 loc(#loc547) + %lse_60 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc547) + %lse_61 = arith.select %lse_57, %lse_60, %lse_54 : tensor<128xi1>, tensor<128xf32> loc(#loc547) + %lse_62 = tt.expand_dims %lse_61 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc548) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc549) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc550) + %kv_start_63 = arith.constant 128 : i32 loc(#loc551) + %kv_start_64 = arith.constant 128 : i32 loc(#loc551) + %kv_start_65 = arith.muli %kv_start, %kv_start_64 : i32 loc(#loc551) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_32 : !tt.ptr, i32 loc(#loc552) + %sparse_kv_num_blocks_66 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc553) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc554) + %offs_n2_67 = tt.splat %kv_start_65 : i32 -> tensor<64xi32> loc(#loc555) + %offs_n2_68 = arith.addi %offs_n2_67, %offs_n2 : tensor<64xi32> loc(#loc555) + %dq_69 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %K, %V, %dq, %q, %do, %Di_52, %lse_62, %off_zq, %off_hq2_30, %offs_m2_50, %offs_n2_68, %c128_i32_0, %c1_i32_1, %c128_i32_4, %c1_i32_5, %kv_indices, %sparse_kv_num_blocks_66) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc556) + %kv_indices_70 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_34 : !tt.ptr, i32 loc(#loc557) + %kv_start_71 = tt.load %kv_indices_70 : !tt.ptr loc(#loc558) + %kv_start_72 = arith.constant 128 : i32 loc(#loc559) + %kv_start_73 = arith.constant 128 : i32 loc(#loc559) + %kv_start_74 = arith.muli %kv_start_71, %kv_start_73 : i32 loc(#loc559) + %sparse_kv_num_blocks_75 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_32 : !tt.ptr, i32 loc(#loc560) + %sparse_kv_num_blocks_76 = tt.load %sparse_kv_num_blocks_75 : !tt.ptr loc(#loc561) + %offs_n2_77 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc562) + %offs_n2_78 = tt.splat %kv_start_74 : i32 -> tensor<64xi32> loc(#loc563) + %offs_n2_79 = arith.addi %offs_n2_78, %offs_n2_77 : tensor<64xi32> loc(#loc563) + %dq_80 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %K, %V, %dq_69, %q, %do, %Di_52, %lse_62, %off_zq, %off_hq2_30, %offs_m2_50, %offs_n2_79, %c128_i32_0, %c1_i32_1, %c128_i32_4, %c1_i32_5, %kv_indices_70, %sparse_kv_num_blocks_76) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc564) + %dq_ptrs = tt.expand_dims %offs_m2_50 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc565) + %dq_ptrs_81 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc566) + %dq_ptrs_82 = arith.muli %dq_ptrs, %dq_ptrs_81 : tensor<128x1xi32> loc(#loc566) + %dq_ptrs_83 = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc567) + %dq_ptrs_84 = tt.addptr %dq_ptrs_83, %dq_ptrs_82 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc567) + %dq_ptrs_85 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc568) + %dq_ptrs_86 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc569) + %dq_ptrs_87 = arith.muli %dq_ptrs_85, %dq_ptrs_86 : tensor<1x128xi32> loc(#loc569) + %dq_ptrs_88 = tt.broadcast %dq_ptrs_84 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc570) + %dq_ptrs_89 = tt.broadcast %dq_ptrs_87 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc570) + %dq_ptrs_90 = tt.addptr %dq_ptrs_88, %dq_ptrs_89 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc570) + %dq_91 = arith.constant 0.0883883461 : f32 loc(#loc571) + %dq_92 = arith.constant 0.0883883461 : f32 loc(#loc571) + %dq_93 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc571) + %dq_94 = arith.mulf %dq_80, %dq_93 : tensor<128x128xf32> loc(#loc571) + %2 = arith.truncf %dq_94 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc118) + tt.store %dq_ptrs_90, %2 : tensor<128x128x!tt.ptr> loc(#loc118) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc118) + } else { + %SPARSE_Q_MULTIPLE = arith.constant 2 : i32 loc(#loc1020) + %SPARSE_KV_MULTIPLE = arith.constant 1 : i32 loc(#loc1021) + %pid_mask = arith.divsi %pid, %SPARSE_KV_MULTIPLE : i32 loc(#loc574) + %stride_q_num_blks_h = arith.constant 16 : i32 loc(#loc575) + %stride_q_idx_h = arith.constant 256 : i32 loc(#loc576) + %stride_q_idx_n = arith.constant 16 : i32 loc(#loc577) + %dv = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc578) + %dk = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc579) + %start_n1 = arith.constant 128 : i32 loc(#loc580) + %start_n1_27 = arith.constant 128 : i32 loc(#loc580) + %start_n1_28 = arith.muli %pid, %start_n1_27 : i32 loc(#loc580) + %offs_n1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc581) + %offs_n1_29 = tt.splat %start_n1_28 : i32 -> tensor<128xi32> loc(#loc582) + %offs_n1_30 = arith.addi %offs_n1_29, %offs_n1 : tensor<128xi32> loc(#loc582) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n1_30, %offs_k, %c128_i32_0, %c1_i32_1, %KV_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc583) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n1_30, %offs_v, %c128_i32_4, %c1_i32_5, %KV_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc584) + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %c4_i32 = arith.constant 4 : i32 loc(#loc132) + %c1_i32_31 = arith.constant 1 : i32 loc(#loc132) + %2 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc132) + %3 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc132) + %4 = arith.bitcast %c1_i32_31 : i32 to i32 loc(#loc132) + %5 = ub.poison : i32 loc(#loc132) + %dk_32:2 = scf.for %off_g = %2 to %3 step %4 iter_args(%dv_64 = %dv, %dk_65 = %dk) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.constant 4 : i32 loc(#loc586) + %off_hq1_66 = arith.constant 4 : i32 loc(#loc586) + %off_hq1_67 = arith.muli %off_hkv, %off_hq1_66 : i32 loc(#loc586) + %off_hq1_68 = arith.addi %off_hq1_67, %off_g : i32 loc(#loc587) + %q_adj1 = arith.muli %c128_i32, %off_hq1_68 : i32 loc(#loc588) + %q_adj1_69 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc589) + %q_adj1_70 = arith.addi %q_adj1, %q_adj1_69 : i32 loc(#loc590) + %q_adj1_71 = arith.extsi %q_adj1_70 : i32 to i64 loc(#loc591) + %do_adj1 = arith.muli %c262144_i32_7, %off_hq1_68 : i32 loc(#loc592) + %do_adj1_72 = arith.muli %c8388608_i32_6, %off_zq : i32 loc(#loc593) + %do_adj1_73 = arith.addi %do_adj1, %do_adj1_72 : i32 loc(#loc594) + %do_adj1_74 = arith.extsi %do_adj1_73 : i32 to i64 loc(#loc595) + %dq_adj1 = arith.muli %c128_i32_11, %off_hq1_68 : i32 loc(#loc596) + %dq_adj1_75 = arith.muli %c8388608_i32_10, %off_zq : i32 loc(#loc597) + %dq_adj1_76 = arith.addi %dq_adj1, %dq_adj1_75 : i32 loc(#loc598) + %dq_adj1_77 = arith.extsi %dq_adj1_76 : i32 to i64 loc(#loc599) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc600) + %off_chz1_78 = arith.addi %off_chz1, %off_hq1_68 : i32 loc(#loc601) + %off_chz1_79 = arith.muli %off_chz1_78, %Q_LEN : i32 loc(#loc602) + %off_chz1_80 = arith.extsi %off_chz1_79 : i32 to i64 loc(#loc603) + %Q1 = tt.addptr %arg_Q, %q_adj1_71 : !tt.ptr, i64 loc(#loc604) + %DO1 = tt.addptr %arg_DO, %do_adj1_74 : !tt.ptr, i64 loc(#loc605) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_80 : !tt.ptr, i64 loc(#loc606) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_80 : !tt.ptr, i64 loc(#loc607) + %sparse_idx_hq1 = arith.remsi %off_hq1_68, %SPARSE_HQ : i32 loc(#loc608) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc609) + %sparse_hz_offset_81 = arith.addi %sparse_hz_offset, %sparse_idx_hq1 : i32 loc(#loc610) + %sparse_q_num_blks_offset = arith.muli %sparse_hz_offset_81, %stride_q_num_blks_h : i32 loc(#loc611) + %sparse_q_num_blks_offset_82 = arith.addi %sparse_q_num_blks_offset, %pid_mask : i32 loc(#loc612) + %sparse_q_idx_offset = arith.muli %sparse_hz_offset_81, %stride_q_idx_h : i32 loc(#loc613) + %sparse_q_idx_offset_83 = arith.muli %pid_mask, %stride_q_idx_n : i32 loc(#loc614) + %sparse_q_idx_offset_84 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_83 : i32 loc(#loc615) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_84 : !tt.ptr, i32 loc(#loc616) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc617) + %q_start_85 = arith.constant 128 : i32 loc(#loc618) + %q_start_86 = arith.constant 128 : i32 loc(#loc618) + %q_start_87 = arith.muli %q_start, %q_start_86 : i32 loc(#loc618) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_82 : !tt.ptr, i32 loc(#loc619) + %sparse_q_num_blocks_88 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc620) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc621) + %offs_m1_89 = tt.splat %q_start_87 : i32 -> tensor<64xi32> loc(#loc622) + %offs_m1_90 = arith.addi %offs_m1_89, %offs_m1 : tensor<64xi32> loc(#loc622) + %11:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %Q1, %DO1, %DELTA1, %LSE1, %dk_65, %dv_64, %k, %v, %off_zq, %off_hq1_68, %offs_n1_30, %offs_m1_90, %c4096_i32, %c1_i32, %c128_i32_8, %c1_i32_9, %q_indices, %sparse_q_num_blocks_88) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc170) + %q_indices_91 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_84 : !tt.ptr, i32 loc(#loc623) + %q_start_92 = tt.load %q_indices_91 : !tt.ptr loc(#loc624) + %q_start_93 = arith.constant 128 : i32 loc(#loc625) + %q_start_94 = arith.constant 128 : i32 loc(#loc625) + %q_start_95 = arith.muli %q_start_92, %q_start_94 : i32 loc(#loc625) + %sparse_q_num_blocks_96 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_82 : !tt.ptr, i32 loc(#loc626) + %sparse_q_num_blocks_97 = tt.load %sparse_q_num_blocks_96 : !tt.ptr loc(#loc627) + %offs_m1_98 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc628) + %offs_m1_99 = tt.splat %q_start_95 : i32 -> tensor<64xi32> loc(#loc629) + %offs_m1_100 = arith.addi %offs_m1_99, %offs_m1_98 : tensor<64xi32> loc(#loc629) + %12:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %Q1, %DO1, %DELTA1, %LSE1, %11#0, %11#1, %k, %v, %off_zq, %off_hq1_68, %offs_n1_30, %offs_m1_100, %c4096_i32, %c1_i32, %c128_i32_8, %c1_i32_9, %q_indices_91, %sparse_q_num_blocks_97) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc178) + scf.yield %12#1, %12#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc179) + } loc(#loc1022) + %dv_ptrs = tt.expand_dims %offs_n1_30 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc630) + %dv_ptrs_33 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc631) + %dv_ptrs_34 = arith.muli %dv_ptrs, %dv_ptrs_33 : tensor<128x1xi32> loc(#loc631) + %dv_ptrs_35 = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc632) + %dv_ptrs_36 = tt.addptr %dv_ptrs_35, %dv_ptrs_34 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc632) + %dv_ptrs_37 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc633) + %dv_ptrs_38 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc634) + %dv_ptrs_39 = arith.muli %dv_ptrs_37, %dv_ptrs_38 : tensor<1x128xi32> loc(#loc634) + %dv_ptrs_40 = tt.broadcast %dv_ptrs_36 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc635) + %dv_ptrs_41 = tt.broadcast %dv_ptrs_39 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc635) + %dv_ptrs_42 = tt.addptr %dv_ptrs_40, %dv_ptrs_41 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc635) + %index_n = tt.expand_dims %offs_n1_30 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc636) + %index_k = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc637) + %index_v = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc638) + %6 = arith.truncf %dk_32#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc189) + tt.store %dv_ptrs_42, %6 : tensor<128x128x!tt.ptr> loc(#loc189) + %dk_43 = arith.constant 0.0883883461 : f32 loc(#loc639) + %dk_44 = arith.constant 0.0883883461 : f32 loc(#loc639) + %dk_45 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc639) + %dk_46 = arith.mulf %dk_32#1, %dk_45 : tensor<128x128xf32> loc(#loc639) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc640) + %mask_47 = arith.cmpi slt, %index_n, %mask : tensor<128x1xi32> loc(#loc640) + %xindex = arith.constant 128 : i32 loc(#loc641) + %xindex_48 = arith.constant 128 : i32 loc(#loc641) + %xindex_49 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc641) + %xindex_50 = arith.muli %xindex_49, %index_n : tensor<128x1xi32> loc(#loc641) + %xindex_51 = tt.broadcast %index_k : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc642) + %xindex_52 = tt.broadcast %xindex_50 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc642) + %xindex_53 = arith.addi %xindex_51, %xindex_52 : tensor<128x128xi32> loc(#loc642) + %xindex_54 = arith.constant 262144 : i32 loc(#loc643) + %xindex_55 = arith.constant 262144 : i32 loc(#loc643) + %xindex_56 = arith.muli %xindex_55, %off_hkv : i32 loc(#loc643) + %xindex_57 = tt.splat %xindex_56 : i32 -> tensor<128x128xi32> loc(#loc644) + %xindex_58 = arith.addi %xindex_53, %xindex_57 : tensor<128x128xi32> loc(#loc644) + %xindex_59 = arith.constant 2097152 : i32 loc(#loc645) + %xindex_60 = arith.constant 2097152 : i32 loc(#loc645) + %xindex_61 = arith.muli %xindex_60, %off_zq : i32 loc(#loc645) + %xindex_62 = tt.splat %xindex_61 : i32 -> tensor<128x128xi32> loc(#loc646) + %xindex_63 = arith.addi %xindex_58, %xindex_62 : tensor<128x128xi32> loc(#loc646) + %7 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc198) + %8 = tt.addptr %7, %xindex_63 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc198) + %9 = tt.broadcast %mask_47 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc199) + %10 = arith.truncf %dk_46 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc199) + tt.store %8, %10, %9 : tensor<128x128x!tt.ptr> loc(#loc199) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc199) + } loc(#loc41) + tt.return loc(#loc200) + } loc(#loc) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%x: i32 loc("x"(#loc201))) -> i32 attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc202) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc202) + %0 = arith.addi %x, %c128_i32_0 : i32 loc(#loc202) + %c1_i32 = arith.constant 1 : i32 loc(#loc203) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc203) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc203) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc204) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc204) + %2 = arith.divsi %1, %c128_i32_3 : i32 loc(#loc204) + tt.return %2 : i32 loc(#loc205) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc206) + tt.return %3 : i32 loc(#loc206) + } loc(#loc201) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc208) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc208) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc209) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc210) + tt.return %0 : tensor<128x128xf32> loc(#loc210) + } loc(#loc207) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc211)), %offs_m: tensor<128xi32> loc("offs_m"(#loc211)), %offs_n: tensor<128xi32> loc("offs_n"(#loc211)), %stride_m: i32 loc("stride_m"(#loc211)), %stride_n: i32 loc("stride_n"(#loc211)), %M_LEN: i32 loc("M_LEN"(#loc211))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc654) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc655) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc655) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc656) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc656) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc657) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc658) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc658) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc659) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc659) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc659) + %0 = tt.load %ptr_10 : tensor<128x128x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<128x128xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x128xbf16> loc(#loc220) + tt.return %1 : tensor<128x128xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc221)), %arg_K: !tt.ptr loc("arg_K"(#loc221)), %arg_V: !tt.ptr loc("arg_V"(#loc221)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc221)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc221)), %arg_DO: !tt.ptr loc("arg_DO"(#loc221)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc221)), %arg_DV: !tt.ptr loc("arg_DV"(#loc221)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc221)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc221)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc221)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc221)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc221)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc221)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc221)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc221)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc221)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc221)), %K: !tt.ptr loc("K"(#loc221)), %V: !tt.ptr loc("V"(#loc221)), %dq: tensor<128x128xf32> loc("dq"(#loc221)), %q: tensor<128x128xbf16> loc("q"(#loc221)), %do: tensor<128x128xbf16> loc("do"(#loc221)), %Di: tensor<128xf32> loc("Di"(#loc221)), %lse: tensor<128x1xf32> loc("lse"(#loc221)), %off_z: i32 loc("off_z"(#loc221)), %off_hq: i32 loc("off_hq"(#loc221)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc221)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc221)), %stride_kn: i32 loc("stride_kn"(#loc221)), %stride_kd: i32 loc("stride_kd"(#loc221)), %stride_vn: i32 loc("stride_vn"(#loc221)), %stride_vd: i32 loc("stride_vd"(#loc221)), %kv_indices: !tt.ptr loc("kv_indices"(#loc221)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc221))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc695) + %KV_LEN = arith.constant 2048 : i32 loc(#loc696) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc697) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc698) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc699) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc700) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc700) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc701) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc701) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc703) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc703) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc704) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc704) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc704) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc705) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc706) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc706) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc707) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc707) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc708) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc709) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc709) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc710) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc710) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %hi = arith.constant 2 : i32 loc(#loc711) + %hi_20 = arith.constant 2 : i32 loc(#loc711) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc711) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc712) + %hi_23 = arith.constant 1 : i32 loc(#loc713) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc713) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc714) + %c0_i32 = arith.constant 0 : i32 loc(#loc242) + %c1_i32 = arith.constant 1 : i32 loc(#loc242) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc242) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc242) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc242) + %3 = ub.poison : i32 loc(#loc242) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc716) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc717) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc718) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc719) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc719) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc720) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc721) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc722) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc250) + } loc(#loc1027) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc251) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc252) + tt.return %4 : tensor<128x128xf32> loc(#loc252) + } loc(#loc221) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc201))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc202) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc202) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc202) + %c1_i32 = arith.constant 1 : i32 loc(#loc203) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc203) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc203) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc204) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc204) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc204) + tt.return %2 : i32 loc(#loc205) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc206) + tt.return %3 : i32 loc(#loc206) + } loc(#loc201) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc253)), %arg_K: !tt.ptr loc("arg_K"(#loc253)), %arg_V: !tt.ptr loc("arg_V"(#loc253)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc253)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc253)), %arg_DO: !tt.ptr loc("arg_DO"(#loc253)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc253)), %arg_DV: !tt.ptr loc("arg_DV"(#loc253)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc253)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc253)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc253)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc253)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc253)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc253)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc253)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc253)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc253)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc253)), %dq: tensor<128x128xf32> loc("dq"(#loc253)), %q: tensor<128x128xbf16> loc("q"(#loc253)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc253)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc253)), %do: tensor<128x128xbf16> loc("do"(#loc253)), %Di: tensor<128xf32> loc("Di"(#loc253)), %lse: tensor<128x1xf32> loc("lse"(#loc253)), %Q_LEN: i32 loc("Q_LEN"(#loc253)), %KV_LEN: i32 loc("KV_LEN"(#loc253)), %off_z: i32 loc("off_z"(#loc253)), %off_hq: i32 loc("off_hq"(#loc253)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc253)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc253)), %offs_k: tensor<128xi32> loc("offs_k"(#loc253)), %offs_v: tensor<128xi32> loc("offs_v"(#loc253)), %stride_kn: i32 loc("stride_kn"(#loc253)), %stride_kd: i32 loc("stride_kd"(#loc253)), %stride_vn: i32 loc("stride_vn"(#loc253)), %stride_vd: i32 loc("stride_vd"(#loc253)), %kv_indices: !tt.ptr loc("kv_indices"(#loc253)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc253))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc762) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc763) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc763) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc763) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc764) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc764) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc765) + %n_6 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc766) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc767) + %m_7 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc768) + %tmp1 = arith.constant false loc(#loc769) + %tmp1_8 = arith.constant dense : tensor<1xi1> loc(#loc769) + %tmp4 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc770) + %tmp4_9 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc770) + %tmp4_10 = arith.cmpi sge, %tmp4, %tmp4_9 : tensor<128x64xi32> loc(#loc770) + %tmp5 = arith.extsi %n_6 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc771) + %tmp7 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc772) + %tmp7_11 = tt.load %tmp7 : !tt.ptr loc(#loc773) + %tmp8 = tt.splat %tmp7_11 : i64 -> tensor<1x64xi64> loc(#loc774) + %tmp8_12 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc774) + %tmp9 = arith.extsi %m_7 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc775) + %tmp10 = tt.splat %tmp7_11 : i64 -> tensor<128x1xi64> loc(#loc776) + %tmp10_13 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc776) + %tmp11 = tt.broadcast %tmp8_12 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc777) + %tmp11_14 = tt.broadcast %tmp10_13 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc777) + %tmp11_15 = arith.andi %tmp11, %tmp11_14 : tensor<128x64xi1> loc(#loc777) + %tmp12 = arith.andi %tmp4_10, %tmp11_15 : tensor<128x64xi1> loc(#loc778) + %tmp13 = tt.expand_dims %tmp1_8 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc779) + %tmp13_16 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc779) + %tmp13_17 = arith.ori %tmp13_16, %tmp12 : tensor<128x64xi1> loc(#loc779) + %tmp14 = arith.constant 2048 : i32 loc(#loc780) + %tmp14_18 = arith.constant dense<2048> : tensor<1xi32> loc(#loc780) + %tmp15 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc781) + %tmp15_19 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc781) + %tmp15_20 = arith.cmpi sge, %n_6, %tmp15_19 : tensor<1x64xi32> loc(#loc781) + %tmp16 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc782) + %tmp16_21 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc782) + %tmp16_22 = arith.remsi %n_6, %tmp16_21 : tensor<1x64xi32> loc(#loc782) + %tmp17 = arith.constant 0 : i32 loc(#loc783) + %tmp17_23 = arith.constant dense<0> : tensor<1xi32> loc(#loc783) + %tmp18 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc784) + %tmp18_24 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc784) + %tmp18_25 = arith.cmpi ne, %tmp16_22, %tmp18_24 : tensor<1x64xi32> loc(#loc784) + %tmp19 = arith.constant 0 : i32 loc(#loc785) + %tmp19_26 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc785) + %tmp19_27 = arith.cmpi slt, %tmp16_22, %tmp19_26 : tensor<1x64xi32> loc(#loc785) + %tmp20 = arith.constant 0 : i32 loc(#loc786) + %tmp20_28 = arith.constant dense<0> : tensor<1xi32> loc(#loc786) + %tmp20_29 = arith.cmpi slt, %tmp14_18, %tmp20_28 : tensor<1xi32> loc(#loc786) + %tmp21 = tt.expand_dims %tmp20_29 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc787) + %tmp21_30 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc787) + %tmp21_31 = arith.cmpi ne, %tmp19_27, %tmp21_30 : tensor<1x64xi1> loc(#loc787) + %tmp22 = arith.andi %tmp18_25, %tmp21_31 : tensor<1x64xi1> loc(#loc788) + %tmp23 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc789) + %tmp23_32 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc789) + %tmp23_33 = arith.addi %tmp16_22, %tmp23_32 : tensor<1x64xi32> loc(#loc789) + %tmp24 = arith.select %tmp22, %tmp23_33, %tmp16_22 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc790) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc791) + %tmp26 = tt.splat %tmp7_11 : i64 -> tensor<1x64xi64> loc(#loc792) + %tmp26_34 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc792) + %tmp27 = arith.andi %tmp15_20, %tmp26_34 : tensor<1x64xi1> loc(#loc793) + %tmp28 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc794) + %tmp28_35 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc794) + %tmp28_36 = arith.subi %tmp28, %tmp28_35 : tensor<128x64xi32> loc(#loc794) + %tmp29 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc795) + %tmp29_37 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc795) + %tmp29_38 = arith.remsi %tmp28_36, %tmp29_37 : tensor<128x64xi32> loc(#loc795) + %tmp30 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc796) + %tmp30_39 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc796) + %tmp30_40 = arith.cmpi ne, %tmp29_38, %tmp30_39 : tensor<128x64xi32> loc(#loc796) + %tmp31 = arith.constant 0 : i32 loc(#loc797) + %tmp31_41 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc797) + %tmp31_42 = arith.cmpi slt, %tmp29_38, %tmp31_41 : tensor<128x64xi32> loc(#loc797) + %tmp32 = tt.expand_dims %tmp20_29 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc798) + %tmp32_43 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc798) + %tmp32_44 = arith.cmpi ne, %tmp31_42, %tmp32_43 : tensor<128x64xi1> loc(#loc798) + %tmp33 = arith.andi %tmp30_40, %tmp32_44 : tensor<128x64xi1> loc(#loc799) + %tmp34 = tt.expand_dims %tmp14_18 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc800) + %tmp34_45 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc800) + %tmp34_46 = arith.addi %tmp29_38, %tmp34_45 : tensor<128x64xi32> loc(#loc800) + %tmp35 = arith.select %tmp33, %tmp34_46, %tmp29_38 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc801) + %tmp36 = tt.expand_dims %tmp17_23 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc802) + %tmp36_47 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc802) + %tmp36_48 = arith.cmpi eq, %tmp35, %tmp36_47 : tensor<128x64xi32> loc(#loc802) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc803) + %tmp37_49 = arith.andi %tmp37, %tmp36_48 : tensor<128x64xi1> loc(#loc803) + %tmp38 = arith.ori %tmp13_17, %tmp37_49 : tensor<128x64xi1> loc(#loc804) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc805) + %post_mod_scores_50 = arith.constant 0xFF800000 : f32 loc(#loc805) + %post_mod_scores_51 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc805) + %post_mod_scores_52 = arith.select %tmp38, %qk_5, %post_mod_scores_51 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc805) + %post_mod_scores_53 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_54 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_55 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc806) + %post_mod_scores_56 = arith.mulf %post_mod_scores_52, %post_mod_scores_55 : tensor<128x64xf32> loc(#loc806) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc807) + %p_57 = arith.subf %post_mod_scores_56, %p : tensor<128x64xf32> loc(#loc807) + %p_58 = math.exp2 %p_57 : tensor<128x64xf32> loc(#loc808) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc809) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc810) + %dp_59 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc810) + %dp_60 = tt.dot %do, %vT, %dp_59, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc810) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc811) + %ds_61 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc812) + %ds_62 = arith.subf %dp_60, %ds_61 : tensor<128x64xf32> loc(#loc812) + %ds_63 = arith.mulf %p_58, %ds_62 : tensor<128x64xf32> loc(#loc813) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc814) + %scatter_mask_64 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc815) + %scatter_mask_65 = arith.cmpi slt, %scatter_mask, %scatter_mask_64 : tensor<128x1xi32> loc(#loc815) + %scatter_mask_66 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc816) + %scatter_mask_67 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc817) + %scatter_mask_68 = arith.cmpi slt, %scatter_mask_66, %scatter_mask_67 : tensor<1x64xi32> loc(#loc817) + %scatter_mask_69 = tt.broadcast %scatter_mask_65 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_70 = tt.broadcast %scatter_mask_68 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_71 = arith.andi %scatter_mask_69, %scatter_mask_70 : tensor<128x64xi1> loc(#loc818) + %ds_72 = arith.constant 0.000000e+00 : f32 loc(#loc819) + %ds_73 = arith.constant 0.000000e+00 : f32 loc(#loc819) + %ds_74 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc819) + %ds_75 = arith.select %tmp38, %ds_63, %ds_74 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc819) + %ds_76 = arith.truncf %ds_75 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc820) + %dq_77 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc821) + %dq_78 = arith.constant 0.000000e+00 : f32 loc(#loc822) + %dq_79 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc822) + %dq_80 = tt.dot %ds_76, %dq_77, %dq_79, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc822) + %dq_81 = arith.addf %dq, %dq_80 : tensor<128x128xf32> loc(#loc823) + tt.return %dq_81 : tensor<128x128xf32> loc(#loc316) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc317) + tt.return %0 : tensor<128x128xf32> loc(#loc317) + } loc(#loc253) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%ptr: tensor<128x64x!tt.ptr> loc("ptr"(#loc211)), %offs_m: tensor<128xi32> loc("offs_m"(#loc211)), %offs_n: tensor<64xi32> loc("offs_n"(#loc211)), %N_LEN: i32 loc("N_LEN"(#loc211))) -> tensor<128x64xbf16> attributes {noinline = false} { + %0 = tt.load %ptr : tensor<128x64x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<128x64xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x64xbf16> loc(#loc220) + tt.return %1 : tensor<128x64xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%indices: tensor<1x64xi32> loc("indices"(#loc318))) -> tensor<1x64xi32> attributes {noinline = false} { + tt.return %indices : tensor<1x64xi32> loc(#loc319) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x64xi32> loc(#loc320) + tt.return %0 : tensor<1x64xi32> loc(#loc320) + } loc(#loc318) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%indices: tensor<128x1xi32> loc("indices"(#loc318))) -> tensor<128x1xi32> attributes {noinline = false} { + tt.return %indices : tensor<128x1xi32> loc(#loc319) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x1xi32> loc(#loc320) + tt.return %0 : tensor<128x1xi32> loc(#loc320) + } loc(#loc318) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc321)), %col_indices: !tt.ptr loc("col_indices"(#loc321)), %total_blocks: i32 loc("total_blocks"(#loc321))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc829) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc829) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc829) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc830) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc831) + %next_block = arith.constant 1 : i32 loc(#loc832) + %next_block_3 = arith.constant 1 : i32 loc(#loc832) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc832) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc833) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc834) + %next_block_7 = arith.constant 1 : i32 loc(#loc835) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc835) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc836) + %needs_jump = arith.constant 1 : i32 loc(#loc837) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc837) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc837) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc838) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc838) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc838) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc839) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc839) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc840) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc841) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc841) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc841) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc842) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc842) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc842) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc843) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc843) + %offset_24 = arith.constant 1 : i32 loc(#loc844) + %offset_25 = arith.constant 1 : i32 loc(#loc844) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc844) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc844) + %offset_28 = arith.constant 64 : i32 loc(#loc845) + %offset_29 = arith.constant 64 : i32 loc(#loc845) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc845) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc846) + tt.return %offset_31 : i32 loc(#loc340) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc341) + tt.return %0 : i32 loc(#loc341) + } loc(#loc321) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(35,)cconstexpr_bf16__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc221)), %arg_K: !tt.ptr loc("arg_K"(#loc221)), %arg_V: !tt.ptr loc("arg_V"(#loc221)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc221)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc221)), %arg_DO: !tt.ptr loc("arg_DO"(#loc221)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc221)), %arg_DV: !tt.ptr loc("arg_DV"(#loc221)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc221)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc221)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc221)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc221)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc221)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc221)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc221)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc221)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc221)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc221)), %K: !tt.ptr loc("K"(#loc221)), %V: !tt.ptr loc("V"(#loc221)), %dq: tensor<128x128xf32> loc("dq"(#loc221)), %q: tensor<128x128xbf16> loc("q"(#loc221)), %do: tensor<128x128xbf16> loc("do"(#loc221)), %Di: tensor<128xf32> loc("Di"(#loc221)), %lse: tensor<128x1xf32> loc("lse"(#loc221)), %off_z: i32 loc("off_z"(#loc221)), %off_hq: i32 loc("off_hq"(#loc221)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc221)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc221)), %stride_kn: i32 loc("stride_kn"(#loc221)), %stride_kd: i32 loc("stride_kd"(#loc221)), %stride_vn: i32 loc("stride_vn"(#loc221)), %stride_vd: i32 loc("stride_vd"(#loc221)), %kv_indices: !tt.ptr loc("kv_indices"(#loc221)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc221))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc695) + %KV_LEN = arith.constant 2048 : i32 loc(#loc696) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc697) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc698) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc699) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc700) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc700) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc701) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc701) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc703) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc703) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc704) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc704) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc704) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc705) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc706) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc706) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc707) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc707) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc708) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc709) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc709) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc710) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc710) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %hi = arith.constant 2 : i32 loc(#loc711) + %hi_20 = arith.constant 2 : i32 loc(#loc711) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc711) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%KV_LEN) : (i32) -> i32 loc(#loc712) + %hi_23 = arith.constant 1 : i32 loc(#loc713) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc713) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc714) + %c0_i32 = arith.constant 0 : i32 loc(#loc242) + %c1_i32 = arith.constant 1 : i32 loc(#loc242) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc242) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc242) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc242) + %3 = ub.poison : i32 loc(#loc242) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc716) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc717) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc718) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc719) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc719) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc720) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc721) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc721) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc722) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc722) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc250) + } loc(#loc1027) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc251) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc252) + tt.return %4 : tensor<128x128xf32> loc(#loc252) + } loc(#loc221) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_1_d_44269504__(41,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc253)), %arg_K: !tt.ptr loc("arg_K"(#loc253)), %arg_V: !tt.ptr loc("arg_V"(#loc253)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc253)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc253)), %arg_DO: !tt.ptr loc("arg_DO"(#loc253)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc253)), %arg_DV: !tt.ptr loc("arg_DV"(#loc253)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc253)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc253)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc253)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc253)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc253)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc253)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc253)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc253)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc253)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc253)), %dq: tensor<128x128xf32> loc("dq"(#loc253)), %q: tensor<128x128xbf16> loc("q"(#loc253)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc253)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc253)), %do: tensor<128x128xbf16> loc("do"(#loc253)), %Di: tensor<128xf32> loc("Di"(#loc253)), %lse: tensor<128x1xf32> loc("lse"(#loc253)), %Q_LEN: i32 loc("Q_LEN"(#loc253)), %KV_LEN: i32 loc("KV_LEN"(#loc253)), %off_z: i32 loc("off_z"(#loc253)), %off_hq: i32 loc("off_hq"(#loc253)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc253)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc253)), %offs_k: tensor<128xi32> loc("offs_k"(#loc253)), %offs_v: tensor<128xi32> loc("offs_v"(#loc253)), %stride_kn: i32 loc("stride_kn"(#loc253)), %stride_kd: i32 loc("stride_kd"(#loc253)), %stride_vn: i32 loc("stride_vn"(#loc253)), %stride_vd: i32 loc("stride_vd"(#loc253)), %kv_indices: !tt.ptr loc("kv_indices"(#loc253)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc253))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc762) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc763) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc763) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc763) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc764) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc764) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc764) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc765) + %n_6 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%n) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc766) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc767) + %m_7 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%m) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc768) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_8 = arith.constant 1.44269502 : f32 loc(#loc806) + %post_mod_scores_9 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc806) + %post_mod_scores_10 = arith.mulf %qk_5, %post_mod_scores_9 : tensor<128x64xf32> loc(#loc806) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc807) + %p_11 = arith.subf %post_mod_scores_10, %p : tensor<128x64xf32> loc(#loc807) + %p_12 = math.exp2 %p_11 : tensor<128x64xf32> loc(#loc808) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc809) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc810) + %dp_13 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc810) + %dp_14 = tt.dot %do, %vT, %dp_13, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc810) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc811) + %ds_15 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc812) + %ds_16 = arith.subf %dp_14, %ds_15 : tensor<128x64xf32> loc(#loc812) + %ds_17 = arith.mulf %p_12, %ds_16 : tensor<128x64xf32> loc(#loc813) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc814) + %scatter_mask_18 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc815) + %scatter_mask_19 = arith.cmpi slt, %scatter_mask, %scatter_mask_18 : tensor<128x1xi32> loc(#loc815) + %scatter_mask_20 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc816) + %scatter_mask_21 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc817) + %scatter_mask_22 = arith.cmpi slt, %scatter_mask_20, %scatter_mask_21 : tensor<1x64xi32> loc(#loc817) + %scatter_mask_23 = tt.broadcast %scatter_mask_19 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_24 = tt.broadcast %scatter_mask_22 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc818) + %scatter_mask_25 = arith.andi %scatter_mask_23, %scatter_mask_24 : tensor<128x64xi1> loc(#loc818) + %ds_26 = arith.truncf %ds_17 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc820) + %dq_27 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc821) + %dq_28 = arith.constant 0.000000e+00 : f32 loc(#loc822) + %dq_29 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc822) + %dq_30 = tt.dot %ds_26, %dq_27, %dq_29, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc822) + %dq_31 = arith.addf %dq, %dq_30 : tensor<128x128xf32> loc(#loc823) + tt.return %dq_31 : tensor<128x128xf32> loc(#loc316) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc317) + tt.return %0 : tensor<128x128xf32> loc(#loc317) + } loc(#loc253) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc342)), %arg_K: !tt.ptr loc("arg_K"(#loc342)), %arg_V: !tt.ptr loc("arg_V"(#loc342)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc342)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc342)), %arg_DO: !tt.ptr loc("arg_DO"(#loc342)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc342)), %arg_DV: !tt.ptr loc("arg_DV"(#loc342)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc342)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc342)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc342)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc342)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc342)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc342)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc342)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc342)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc342)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc342)), %Q: !tt.ptr loc("Q"(#loc342)), %DO: !tt.ptr loc("DO"(#loc342)), %DELTA: !tt.ptr loc("DELTA"(#loc342)), %LSE: !tt.ptr loc("LSE"(#loc342)), %dk: tensor<128x128xf32> loc("dk"(#loc342)), %dv: tensor<128x128xf32> loc("dv"(#loc342)), %k: tensor<128x128xbf16> loc("k"(#loc342)), %v: tensor<128x128xbf16> loc("v"(#loc342)), %off_z: i32 loc("off_z"(#loc342)), %off_hq: i32 loc("off_hq"(#loc342)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc342)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc342)), %stride_qm: i32 loc("stride_qm"(#loc342)), %stride_qd: i32 loc("stride_qd"(#loc342)), %stride_dom: i32 loc("stride_dom"(#loc342)), %stride_dod: i32 loc("stride_dod"(#loc342)), %q_indices: !tt.ptr loc("q_indices"(#loc342)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc342))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc883) + %KV_LEN = arith.constant 2048 : i32 loc(#loc884) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc885) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc886) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc887) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc888) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc888) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc889) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc889) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc890) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc891) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc891) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc892) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc892) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc892) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc893) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc894) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc894) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc895) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc895) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc896) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc897) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc897) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc898) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc898) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc898) + %hi = arith.constant 2 : i32 loc(#loc899) + %hi_20 = arith.constant 2 : i32 loc(#loc899) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc899) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc900) + %hi_23 = arith.constant 1 : i32 loc(#loc901) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc901) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc902) + %c0_i32 = arith.constant 0 : i32 loc(#loc363) + %c1_i32 = arith.constant 1 : i32 loc(#loc363) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc363) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc363) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc363) + %3 = ub.poison : i32 loc(#loc363) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc364) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc904) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc905) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc906) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc906) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc907) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc908) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc908) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc909) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc909) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc371) + } loc(#loc1029) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc372) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc373) + %5 = ub.poison : tensor<128x128xf32> loc(#loc373) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc373) + } loc(#loc342) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc374)), %arg_K: !tt.ptr loc("arg_K"(#loc374)), %arg_V: !tt.ptr loc("arg_V"(#loc374)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc374)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc374)), %arg_DO: !tt.ptr loc("arg_DO"(#loc374)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc374)), %arg_DV: !tt.ptr loc("arg_DV"(#loc374)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc374)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc374)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc374)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc374)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc374)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc374)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc374)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc374)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc374)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc374)), %dk: tensor<128x128xf32> loc("dk"(#loc374)), %dv: tensor<128x128xf32> loc("dv"(#loc374)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc374)), %k: tensor<128x128xbf16> loc("k"(#loc374)), %v: tensor<128x128xbf16> loc("v"(#loc374)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc374)), %DELTA: !tt.ptr loc("DELTA"(#loc374)), %LSE: !tt.ptr loc("LSE"(#loc374)), %Q_LEN: i32 loc("Q_LEN"(#loc374)), %KV_LEN: i32 loc("KV_LEN"(#loc374)), %off_z: i32 loc("off_z"(#loc374)), %off_hq: i32 loc("off_hq"(#loc374)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc374)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc374)), %offs_k: tensor<128xi32> loc("offs_k"(#loc374)), %offs_v: tensor<128xi32> loc("offs_v"(#loc374)), %stride_qm: i32 loc("stride_qm"(#loc374)), %stride_qd: i32 loc("stride_qd"(#loc374)), %stride_dom: i32 loc("stride_dom"(#loc374)), %stride_dod: i32 loc("stride_dod"(#loc374)), %q_indices: !tt.ptr loc("q_indices"(#loc374)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc374))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc950) + %lse = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc951) + %lse_0 = tt.addptr %lse, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc951) + %lse_1 = tt.load %lse_0 : tensor<64x!tt.ptr> loc(#loc952) + %lse_2 = arith.constant 0xFF800000 : f32 loc(#loc953) + %lse_3 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc953) + %lse_4 = arith.cmpf oeq, %lse_1, %lse_3 : tensor<64xf32> loc(#loc953) + %lse_5 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_6 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_7 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc954) + %lse_8 = arith.select %lse_4, %lse_7, %lse_1 : tensor<64xi1>, tensor<64xf32> loc(#loc954) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc955) + %qkT_9 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc955) + %qkT_10 = tt.dot %k, %qT, %qkT_9, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc955) + %qkT_11 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_12 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_13 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc956) + %qkT_14 = arith.mulf %qkT_10, %qkT_13 : tensor<128x64xf32> loc(#loc956) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc957) + %m_15 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%m) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc958) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc959) + %n_16 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%n) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc960) + %tmp41 = arith.constant false loc(#loc961) + %tmp41_17 = arith.constant dense : tensor<1xi1> loc(#loc961) + %tmp44 = tt.broadcast %m_15 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc962) + %tmp44_18 = tt.broadcast %n_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc962) + %tmp44_19 = arith.cmpi sge, %tmp44, %tmp44_18 : tensor<128x64xi32> loc(#loc962) + %tmp45 = arith.extsi %n_16 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc963) + %tmp47 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc964) + %tmp47_20 = tt.load %tmp47 : !tt.ptr loc(#loc965) + %tmp48 = tt.splat %tmp47_20 : i64 -> tensor<128x1xi64> loc(#loc966) + %tmp48_21 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc966) + %tmp49 = arith.extsi %m_15 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc967) + %tmp50 = tt.splat %tmp47_20 : i64 -> tensor<1x64xi64> loc(#loc968) + %tmp50_22 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc968) + %tmp51 = tt.broadcast %tmp48_21 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc969) + %tmp51_23 = tt.broadcast %tmp50_22 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc969) + %tmp51_24 = arith.andi %tmp51, %tmp51_23 : tensor<128x64xi1> loc(#loc969) + %tmp52 = arith.andi %tmp44_19, %tmp51_24 : tensor<128x64xi1> loc(#loc970) + %tmp53 = tt.expand_dims %tmp41_17 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc971) + %tmp53_25 = tt.broadcast %tmp53 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc971) + %tmp53_26 = arith.ori %tmp53_25, %tmp52 : tensor<128x64xi1> loc(#loc971) + %tmp54 = arith.constant 2048 : i32 loc(#loc972) + %tmp54_27 = arith.constant dense<2048> : tensor<1xi32> loc(#loc972) + %tmp55 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc973) + %tmp55_28 = tt.broadcast %tmp55 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc973) + %tmp55_29 = arith.cmpi sge, %n_16, %tmp55_28 : tensor<128x1xi32> loc(#loc973) + %tmp56 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc974) + %tmp56_30 = tt.broadcast %tmp56 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc974) + %tmp56_31 = arith.remsi %n_16, %tmp56_30 : tensor<128x1xi32> loc(#loc974) + %tmp57 = arith.constant 0 : i32 loc(#loc975) + %tmp57_32 = arith.constant dense<0> : tensor<1xi32> loc(#loc975) + %tmp58 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc976) + %tmp58_33 = tt.broadcast %tmp58 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc976) + %tmp58_34 = arith.cmpi ne, %tmp56_31, %tmp58_33 : tensor<128x1xi32> loc(#loc976) + %tmp59 = arith.constant 0 : i32 loc(#loc977) + %tmp59_35 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc977) + %tmp59_36 = arith.cmpi slt, %tmp56_31, %tmp59_35 : tensor<128x1xi32> loc(#loc977) + %tmp60 = arith.constant 0 : i32 loc(#loc978) + %tmp60_37 = arith.constant dense<0> : tensor<1xi32> loc(#loc978) + %tmp60_38 = arith.cmpi slt, %tmp54_27, %tmp60_37 : tensor<1xi32> loc(#loc978) + %tmp61 = tt.expand_dims %tmp60_38 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc979) + %tmp61_39 = tt.broadcast %tmp61 : tensor<1x1xi1> -> tensor<128x1xi1> loc(#loc979) + %tmp61_40 = arith.cmpi ne, %tmp59_36, %tmp61_39 : tensor<128x1xi1> loc(#loc979) + %tmp62 = arith.andi %tmp58_34, %tmp61_40 : tensor<128x1xi1> loc(#loc980) + %tmp63 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc981) + %tmp63_41 = tt.broadcast %tmp63 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc981) + %tmp63_42 = arith.addi %tmp56_31, %tmp63_41 : tensor<128x1xi32> loc(#loc981) + %tmp64 = arith.select %tmp62, %tmp63_42, %tmp56_31 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc982) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc983) + %tmp66 = tt.splat %tmp47_20 : i64 -> tensor<128x1xi64> loc(#loc984) + %tmp66_43 = arith.cmpi slt, %tmp65, %tmp66 : tensor<128x1xi64> loc(#loc984) + %tmp67 = arith.andi %tmp55_29, %tmp66_43 : tensor<128x1xi1> loc(#loc985) + %tmp68 = tt.broadcast %n_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc986) + %tmp68_44 = tt.broadcast %m_15 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc986) + %tmp68_45 = arith.subi %tmp68, %tmp68_44 : tensor<128x64xi32> loc(#loc986) + %tmp69 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc987) + %tmp69_46 = tt.broadcast %tmp69 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc987) + %tmp69_47 = arith.remsi %tmp68_45, %tmp69_46 : tensor<128x64xi32> loc(#loc987) + %tmp70 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc988) + %tmp70_48 = tt.broadcast %tmp70 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc988) + %tmp70_49 = arith.cmpi ne, %tmp69_47, %tmp70_48 : tensor<128x64xi32> loc(#loc988) + %tmp71 = arith.constant 0 : i32 loc(#loc989) + %tmp71_50 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc989) + %tmp71_51 = arith.cmpi slt, %tmp69_47, %tmp71_50 : tensor<128x64xi32> loc(#loc989) + %tmp72 = tt.expand_dims %tmp60_38 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc990) + %tmp72_52 = tt.broadcast %tmp72 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc990) + %tmp72_53 = arith.cmpi ne, %tmp71_51, %tmp72_52 : tensor<128x64xi1> loc(#loc990) + %tmp73 = arith.andi %tmp70_49, %tmp72_53 : tensor<128x64xi1> loc(#loc991) + %tmp74 = tt.expand_dims %tmp54_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc992) + %tmp74_54 = tt.broadcast %tmp74 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc992) + %tmp74_55 = arith.addi %tmp69_47, %tmp74_54 : tensor<128x64xi32> loc(#loc992) + %tmp75 = arith.select %tmp73, %tmp74_55, %tmp69_47 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc993) + %tmp76 = tt.expand_dims %tmp57_32 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc994) + %tmp76_56 = tt.broadcast %tmp76 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc994) + %tmp76_57 = arith.cmpi eq, %tmp75, %tmp76_56 : tensor<128x64xi32> loc(#loc994) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc995) + %tmp77_58 = arith.andi %tmp77, %tmp76_57 : tensor<128x64xi1> loc(#loc995) + %tmp78 = arith.ori %tmp53_26, %tmp77_58 : tensor<128x64xi1> loc(#loc996) + %post_mod_scores = arith.constant 0xFF800000 : f32 loc(#loc997) + %post_mod_scores_59 = arith.constant 0xFF800000 : f32 loc(#loc997) + %post_mod_scores_60 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc997) + %post_mod_scores_61 = arith.select %tmp78, %qkT_14, %post_mod_scores_60 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc997) + %post_mod_scores_62 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_63 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_64 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc998) + %post_mod_scores_65 = arith.mulf %post_mod_scores_61, %post_mod_scores_64 : tensor<128x64xf32> loc(#loc998) + %pT = tt.expand_dims %lse_8 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc999) + %pT_66 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1000) + %pT_67 = arith.subf %post_mod_scores_65, %pT_66 : tensor<128x64xf32> loc(#loc1000) + %pT_68 = math.exp2 %pT_67 : tensor<128x64xf32> loc(#loc1001) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1002) + %dv_69 = arith.truncf %pT_68 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1003) + %dv_70 = arith.constant 0.000000e+00 : f32 loc(#loc1004) + %dv_71 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1004) + %dv_72 = tt.dot %dv_69, %do, %dv_71, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1004) + %dv_73 = arith.addf %dv, %dv_72 : tensor<128x128xf32> loc(#loc1005) + %Di = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1006) + %Di_74 = tt.addptr %Di, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1006) + %Di_75 = tt.load %Di_74 : tensor<64x!tt.ptr> loc(#loc1007) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1008) + %dpT_76 = arith.constant 0.000000e+00 : f32 loc(#loc1009) + %dpT_77 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1009) + %dpT_78 = tt.dot %v, %dpT, %dpT_77, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1009) + %dsT = tt.expand_dims %Di_75 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1010) + %dsT_79 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1011) + %dsT_80 = arith.subf %dpT_78, %dsT_79 : tensor<128x64xf32> loc(#loc1011) + %dsT_81 = arith.mulf %pT_68, %dsT_80 : tensor<128x64xf32> loc(#loc1012) + %dsT_82 = arith.constant 0.000000e+00 : f32 loc(#loc1013) + %dsT_83 = arith.constant 0.000000e+00 : f32 loc(#loc1013) + %dsT_84 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1013) + %dsT_85 = arith.select %tmp78, %dsT_81, %dsT_84 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1013) + %dk_86 = arith.truncf %dsT_85 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1014) + %dk_87 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1015) + %dk_88 = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %dk_89 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1016) + %dk_90 = tt.dot %dk_86, %dk_87, %dk_89, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1016) + %dk_91 = arith.addf %dk, %dk_90 : tensor<128x128xf32> loc(#loc1017) + tt.return %dk_91, %dv_73 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc443) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc444) + %1 = ub.poison : tensor<128x128xf32> loc(#loc444) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc444) + } loc(#loc374) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: tensor<64x128x!tt.ptr> loc("ptr"(#loc211)), %offs_m: tensor<64xi32> loc("offs_m"(#loc211)), %offs_n: tensor<128xi32> loc("offs_n"(#loc211)), %M_LEN: i32 loc("M_LEN"(#loc211))) -> tensor<64x128xbf16> attributes {noinline = false} { + %0 = tt.load %ptr : tensor<64x128x!tt.ptr> loc(#loc218) + tt.return %0 : tensor<64x128xbf16> loc(#loc219) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x128xbf16> loc(#loc220) + tt.return %1 : tensor<64x128xbf16> loc(#loc220) + } loc(#loc211) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(36,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc342)), %arg_K: !tt.ptr loc("arg_K"(#loc342)), %arg_V: !tt.ptr loc("arg_V"(#loc342)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc342)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc342)), %arg_DO: !tt.ptr loc("arg_DO"(#loc342)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc342)), %arg_DV: !tt.ptr loc("arg_DV"(#loc342)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc342)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc342)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc342)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc342)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc342)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc342)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc342)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc342)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc342)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc342)), %Q: !tt.ptr loc("Q"(#loc342)), %DO: !tt.ptr loc("DO"(#loc342)), %DELTA: !tt.ptr loc("DELTA"(#loc342)), %LSE: !tt.ptr loc("LSE"(#loc342)), %dk: tensor<128x128xf32> loc("dk"(#loc342)), %dv: tensor<128x128xf32> loc("dv"(#loc342)), %k: tensor<128x128xbf16> loc("k"(#loc342)), %v: tensor<128x128xbf16> loc("v"(#loc342)), %off_z: i32 loc("off_z"(#loc342)), %off_hq: i32 loc("off_hq"(#loc342)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc342)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc342)), %stride_qm: i32 loc("stride_qm"(#loc342)), %stride_qd: i32 loc("stride_qd"(#loc342)), %stride_dom: i32 loc("stride_dom"(#loc342)), %stride_dod: i32 loc("stride_dod"(#loc342)), %q_indices: !tt.ptr loc("q_indices"(#loc342)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc342))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc883) + %KV_LEN = arith.constant 2048 : i32 loc(#loc884) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc885) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc886) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc887) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc888) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc888) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc889) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc889) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc890) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc891) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc891) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc892) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc892) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc892) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc893) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc894) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc894) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc895) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc895) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc896) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc897) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc897) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc898) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc898) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc898) + %hi = arith.constant 2 : i32 loc(#loc899) + %hi_20 = arith.constant 2 : i32 loc(#loc899) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc899) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc900) + %hi_23 = arith.constant 1 : i32 loc(#loc901) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc901) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc902) + %c0_i32 = arith.constant 0 : i32 loc(#loc363) + %c1_i32 = arith.constant 1 : i32 loc(#loc363) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc363) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc363) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc363) + %3 = ub.poison : i32 loc(#loc363) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %KV_LEN, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc364) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc904) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc905) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc906) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc906) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc907) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc908) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc908) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc909) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc909) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc371) + } loc(#loc1029) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc372) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc373) + %5 = ub.poison : tensor<128x128xf32> loc(#loc373) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc373) + } loc(#loc342) + tt.func private @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_1_d_44269504__(42,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc374)), %arg_K: !tt.ptr loc("arg_K"(#loc374)), %arg_V: !tt.ptr loc("arg_V"(#loc374)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc374)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc374)), %arg_DO: !tt.ptr loc("arg_DO"(#loc374)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc374)), %arg_DV: !tt.ptr loc("arg_DV"(#loc374)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc374)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc374)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc374)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc374)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc374)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc374)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc374)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc374)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc374)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc374)), %dk: tensor<128x128xf32> loc("dk"(#loc374)), %dv: tensor<128x128xf32> loc("dv"(#loc374)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc374)), %k: tensor<128x128xbf16> loc("k"(#loc374)), %v: tensor<128x128xbf16> loc("v"(#loc374)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc374)), %DELTA: !tt.ptr loc("DELTA"(#loc374)), %LSE: !tt.ptr loc("LSE"(#loc374)), %Q_LEN: i32 loc("Q_LEN"(#loc374)), %KV_LEN: i32 loc("KV_LEN"(#loc374)), %off_z: i32 loc("off_z"(#loc374)), %off_hq: i32 loc("off_hq"(#loc374)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc374)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc374)), %offs_k: tensor<128xi32> loc("offs_k"(#loc374)), %offs_v: tensor<128xi32> loc("offs_v"(#loc374)), %stride_qm: i32 loc("stride_qm"(#loc374)), %stride_qd: i32 loc("stride_qd"(#loc374)), %stride_dom: i32 loc("stride_dom"(#loc374)), %stride_dod: i32 loc("stride_dod"(#loc374)), %q_indices: !tt.ptr loc("q_indices"(#loc374)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc374))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc950) + %lse = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc951) + %lse_0 = tt.addptr %lse, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc951) + %lse_1 = tt.load %lse_0 : tensor<64x!tt.ptr> loc(#loc952) + %lse_2 = arith.constant 0xFF800000 : f32 loc(#loc953) + %lse_3 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc953) + %lse_4 = arith.cmpf oeq, %lse_1, %lse_3 : tensor<64xf32> loc(#loc953) + %lse_5 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_6 = arith.constant 0.000000e+00 : f32 loc(#loc954) + %lse_7 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc954) + %lse_8 = arith.select %lse_4, %lse_7, %lse_1 : tensor<64xi1>, tensor<64xf32> loc(#loc954) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc955) + %qkT_9 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc955) + %qkT_10 = tt.dot %k, %qT, %qkT_9, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc955) + %qkT_11 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_12 = arith.constant 0.0883883461 : f32 loc(#loc956) + %qkT_13 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc956) + %qkT_14 = arith.mulf %qkT_10, %qkT_13 : tensor<128x64xf32> loc(#loc956) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc957) + %m_15 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S1_64S__(1,)cconstexpr_None_"(%m) : (tensor<1x64xi32>) -> tensor<1x64xi32> loc(#loc958) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc959) + %n_16 = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.get_bounded_indices__i32S128_1S__(1,)cconstexpr_None_"(%n) : (tensor<128x1xi32>) -> tensor<128x1xi32> loc(#loc960) + %post_mod_scores = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_17 = arith.constant 1.44269502 : f32 loc(#loc998) + %post_mod_scores_18 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc998) + %post_mod_scores_19 = arith.mulf %qkT_14, %post_mod_scores_18 : tensor<128x64xf32> loc(#loc998) + %pT = tt.expand_dims %lse_8 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc999) + %pT_20 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1000) + %pT_21 = arith.subf %post_mod_scores_19, %pT_20 : tensor<128x64xf32> loc(#loc1000) + %pT_22 = math.exp2 %pT_21 : tensor<128x64xf32> loc(#loc1001) + %do = tt.call @"torch._inductor.runtime.compile_tasks.c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1002) + %dv_23 = arith.truncf %pT_22 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1003) + %dv_24 = arith.constant 0.000000e+00 : f32 loc(#loc1004) + %dv_25 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1004) + %dv_26 = tt.dot %dv_23, %do, %dv_25, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1004) + %dv_27 = arith.addf %dv, %dv_26 : tensor<128x128xf32> loc(#loc1005) + %Di = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1006) + %Di_28 = tt.addptr %Di, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1006) + %Di_29 = tt.load %Di_28 : tensor<64x!tt.ptr> loc(#loc1007) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1008) + %dpT_30 = arith.constant 0.000000e+00 : f32 loc(#loc1009) + %dpT_31 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1009) + %dpT_32 = tt.dot %v, %dpT, %dpT_31, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1009) + %dsT = tt.expand_dims %Di_29 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1010) + %dsT_33 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1011) + %dsT_34 = arith.subf %dpT_32, %dsT_33 : tensor<128x64xf32> loc(#loc1011) + %dsT_35 = arith.mulf %pT_22, %dsT_34 : tensor<128x64xf32> loc(#loc1012) + %dk_36 = arith.truncf %dsT_35 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1014) + %dk_37 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1015) + %dk_38 = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %dk_39 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1016) + %dk_40 = tt.dot %dk_36, %dk_37, %dk_39, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1016) + %dk_41 = arith.addf %dk, %dk_40 : tensor<128x128xf32> loc(#loc1017) + tt.return %dk_41, %dv_27 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc443) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc444) + %1 = ub.poison : tensor<128x128xf32> loc(#loc444) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc444) + } loc(#loc374) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":94:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":95:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":96:49) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":97:53) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":99:53) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":100:53) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":102:9) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":103:9) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":104:10) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":105:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":106:10) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":107:13) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":111:24) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":112:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":113:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":115:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":116:28) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":117:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":119:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":120:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":122:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:25) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:47) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:59) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":125:25) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":125:47) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":125:35) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":125:59) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:27) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:61) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":131:9) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":132:9) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":133:10) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":135:14) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":136:26) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":137:26) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:14) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:7) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":140:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":142:29) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":143:30) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:54) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:44) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":145:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":146:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":147:31) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":148:26) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":149:26) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":151:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":152:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":152:54) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:55) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:78) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:50) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:83) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:68) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:30) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:52) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:40) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:32) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:55) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:66) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":160:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":160:55) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":160:42) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":160:66) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:30) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:46) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:56) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":163:17) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":164:19) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":167:19) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":168:21) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":169:25) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":172:22) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":174:36) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":175:42) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":175:29) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":178:107) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":179:111) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:34) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:25) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:33) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:26) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:30) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:50) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":191:18) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":195:30) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:27) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:41) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:53) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:39) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:42) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:29) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":207:12) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":214:39) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:31) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:45) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:62) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:43) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":218:46) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":218:33) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":226:16) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:32) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:43) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:24) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:63) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:74) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:56) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":232:14) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":234:30) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":239:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":240:30) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":242:26) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":244:30) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":245:25) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":246:25) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":249:22) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":250:22) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":252:25) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":253:42) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":253:29) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":256:107) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":257:107) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":262:30) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:32) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:51) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:34) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:56) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:44) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:67) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:36) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:59) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:46) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:70) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":268:36) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":268:59) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":268:46) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":268:70) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:34) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:39) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:50) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:60) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":271:21) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":272:23) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":275:25) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":276:29) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":278:39) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":279:46) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":279:58) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:58) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:80) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:53) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:81) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:70) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":286:32) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:30) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:43) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:55) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:42) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:45) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:32) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":298:16) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":306:41) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:34) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:47) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:64) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:46) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":310:49) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":310:36) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":318:20) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":303:12) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:31) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:42) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:23) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:62) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:73) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:55) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":325:26) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":326:25) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":327:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":330:30) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":334:14) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":337:29) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:31) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:27) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:48) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:41) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:66) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:58) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:29) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:69) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:4) +#loc202 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc203 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc204 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc205 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc206 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc207 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc208 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc209 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc210 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:27) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:38) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:20) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:56) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:67) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:49) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":835:23) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":835:15) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":828:4) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":384:12) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":385:13) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":387:26) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":388:26) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:26) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:37) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:18) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:56) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:67) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:49) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:26) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:37) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:18) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:56) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:67) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:49) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:43) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:90) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:101) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:63) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":397:28) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":405:12) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":411:64) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:28) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:19) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":415:28) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":415:19) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":417:19) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":417:8) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":419:11) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":419:4) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":458:105) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":459:19) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":461:14) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":464:36) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":464:46) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":467:36) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":467:46) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":479:35) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":482:23) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":483:23) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:34) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:23) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":486:22) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":487:23) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":488:23) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":489:23) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":490:23) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":491:23) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":492:35) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":493:24) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":494:24) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":495:32) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":496:25) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":497:92) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":498:92) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":499:25) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":500:24) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":501:24) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":502:39) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":503:25) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":504:24) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":505:24) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":506:23) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":507:25) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":508:25) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":509:92) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":510:25) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":511:24) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":512:24) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":513:39) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":514:25) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":515:24) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":516:24) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":521:69) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":524:27) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:39) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:21) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":528:104) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":530:20) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:22) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:19) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:14) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":542:32) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":542:43) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":542:62) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":542:73) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":542:54) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":549:43) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":551:15) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:30) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:21) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:10) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":555:11) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":555:4) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":798:11) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":798:4) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":788:33) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:38) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:24) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:109) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:113) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:39) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:55) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:25) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:30) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:35) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:60) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:34) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:48) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:63) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:29) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:47) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:61) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:42) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":794:11) +#loc341 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":794:4) +#loc343 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":595:12) +#loc344 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":596:13) +#loc345 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":598:26) +#loc346 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":599:26) +#loc347 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:26) +#loc348 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:37) +#loc349 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:18) +#loc350 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:56) +#loc351 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:67) +#loc352 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:49) +#loc353 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:27) +#loc354 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:38) +#loc355 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:19) +#loc356 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:58) +#loc357 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:69) +#loc358 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:51) +#loc359 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:42) +#loc360 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:87) +#loc361 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:98) +#loc362 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:61) +#loc363 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":610:28) +#loc364 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":618:12) +#loc365 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":623:62) +#loc366 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:28) +#loc367 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:19) +#loc368 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:28) +#loc369 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:19) +#loc370 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":628:19) +#loc371 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":628:8) +#loc372 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":630:11) +#loc373 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":630:4) +#loc375 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":669:105) +#loc376 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:28) +#loc377 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:22) +#loc378 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:26) +#loc379 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:46) +#loc380 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":676:20) +#loc381 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":678:15) +#loc382 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":680:36) +#loc383 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":680:46) +#loc384 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":683:36) +#loc385 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":683:46) +#loc386 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":695:36) +#loc387 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":698:25) +#loc388 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":699:25) +#loc389 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:35) +#loc390 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:24) +#loc391 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":702:24) +#loc392 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":703:25) +#loc393 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":704:24) +#loc394 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":705:24) +#loc395 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":706:24) +#loc396 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":707:24) +#loc397 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":708:35) +#loc398 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":709:25) +#loc399 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":710:25) +#loc400 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":711:32) +#loc401 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":712:25) +#loc402 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":713:92) +#loc403 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":714:92) +#loc404 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":715:25) +#loc405 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":716:24) +#loc406 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":717:24) +#loc407 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":718:39) +#loc408 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":719:25) +#loc409 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":720:24) +#loc410 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":721:24) +#loc411 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":722:24) +#loc412 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":723:25) +#loc413 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":724:25) +#loc414 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":725:92) +#loc415 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":726:25) +#loc416 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":727:24) +#loc417 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":728:24) +#loc418 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":729:39) +#loc419 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":730:25) +#loc420 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":731:24) +#loc421 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":732:24) +#loc422 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":736:69) +#loc423 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":739:27) +#loc424 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:44) +#loc425 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:40) +#loc426 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:22) +#loc427 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":741:99) +#loc428 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:24) +#loc429 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:43) +#loc430 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:10) +#loc431 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:29) +#loc432 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:21) +#loc433 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:29) +#loc434 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:20) +#loc435 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:25) +#loc436 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:22) +#loc437 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:16) +#loc438 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":773:45) +#loc439 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:24) +#loc440 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:52) +#loc441 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:43) +#loc442 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:10) +#loc443 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":777:11) +#loc444 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":777:4) +#loc463 = loc("ZQ"(#loc7)) +#loc464 = loc("HQ"(#loc8)) +#loc465 = loc("HKV"(#loc9)) +#loc466 = loc("Q_LEN"(#loc10)) +#loc467 = loc("ZKV"(#loc11)) +#loc468 = loc("KV_LEN"(#loc12)) +#loc469 = loc("pid"(#loc13)) +#loc470 = loc("NUM_KV_BLOCKS"(#loc14)) +#loc471 = loc("NUM_Q_BLOCKS"(#loc15)) +#loc472 = loc("off_zq"(#loc16)) +#loc473 = loc("off_hkv"(#loc17)) +#loc474 = loc("off_zkv"(#loc18)) +#loc475 = loc("SPARSE_Z"(#loc19)) +#loc476 = loc("SPARSE_HQ"(#loc20)) +#loc477 = loc("sparse_idx_z"(#loc21)) +#loc478 = loc("k_adj"(#loc22)) +#loc479 = loc("k_adj"(#loc23)) +#loc480 = loc("k_adj"(#loc24)) +#loc481 = loc("k_adj"(#loc25)) +#loc482 = loc("v_adj"(#loc26)) +#loc483 = loc("v_adj"(#loc27)) +#loc484 = loc("v_adj"(#loc28)) +#loc485 = loc("v_adj"(#loc29)) +#loc486 = loc("dv_adj"(#loc30)) +#loc487 = loc("dv_adj"(#loc31)) +#loc488 = loc("dv_adj"(#loc32)) +#loc489 = loc("dv_adj"(#loc33)) +#loc490 = loc("K"(#loc34)) +#loc491 = loc("V"(#loc35)) +#loc492 = loc("DV"(#loc36)) +#loc493 = loc("RCP_LN2"(#loc37)) +#loc494 = loc("offs_k"(#loc38)) +#loc495 = loc("offs_v"(#loc39)) +#loc496 = loc("off_pid"(#loc42)) +#loc497 = loc("SPARSE_Q_MULTIPLE"(#loc43)) +#loc498 = loc("SPARSE_KV_MULTIPLE"(#loc44)) +#loc499 = loc("off_hq2"(#loc45)) +#loc500 = loc("off_hq2"(#loc46)) +#loc501 = loc("off_hq2"(#loc47)) +#loc502 = loc("start_m2_block"(#loc48)) +#loc503 = loc("off_pid_mask"(#loc49)) +#loc504 = loc("stride_kv_num_blks_h"(#loc50)) +#loc505 = loc("stride_kv_idx_h"(#loc51)) +#loc506 = loc("stride_kv_idx_m"(#loc52)) +#loc507 = loc("sparse_idx_hq2"(#loc53)) +#loc508 = loc("sparse_hz_offset"(#loc54)) +#loc509 = loc("sparse_hz_offset"(#loc55)) +#loc510 = loc("sparse_kv_num_blks_offset"(#loc56)) +#loc511 = loc("sparse_kv_num_blks_offset"(#loc57)) +#loc512 = loc("sparse_kv_idx_offset"(#loc58)) +#loc513 = loc("sparse_kv_idx_offset"(#loc59)) +#loc514 = loc("sparse_kv_idx_offset"(#loc60)) +#loc515 = loc("q_adj2"(#loc61)) +#loc516 = loc("q_adj2"(#loc62)) +#loc517 = loc("q_adj2"(#loc63)) +#loc518 = loc("q_adj2"(#loc64)) +#loc519 = loc("do_adj2"(#loc65)) +#loc520 = loc("do_adj2"(#loc66)) +#loc521 = loc("do_adj2"(#loc67)) +#loc522 = loc("do_adj2"(#loc68)) +#loc523 = loc("dq_adj2"(#loc69)) +#loc524 = loc("dq_adj2"(#loc70)) +#loc525 = loc("dq_adj2"(#loc71)) +#loc526 = loc("dq_adj2"(#loc72)) +#loc527 = loc("off_chz2"(#loc73)) +#loc528 = loc("off_chz2"(#loc74)) +#loc529 = loc("off_chz2"(#loc75)) +#loc530 = loc("off_chz2"(#loc76)) +#loc531 = loc("Q2"(#loc77)) +#loc532 = loc("DO2"(#loc78)) +#loc533 = loc("DQ2"(#loc79)) +#loc534 = loc("LSE2"(#loc80)) +#loc535 = loc("DELTA2"(#loc81)) +#loc536 = loc("dq"(#loc82)) +#loc537 = loc("start_m2"(#loc83)) +#loc538 = loc("offs_m2"(#loc84)) +#loc539 = loc("offs_m2"(#loc85)) +#loc540 = loc("q"(#loc86)) +#loc541 = loc("do"(#loc87)) +#loc542 = loc("Di"(#loc88)) +#loc543 = loc("Di"(#loc89)) +#loc544 = loc("lse"(#loc90)) +#loc545 = loc("lse"(#loc91)) +#loc546 = loc("lse"(#loc92)) +#loc547 = loc("lse"(#loc93)) +#loc548 = loc("lse"(#loc94)) +#loc549 = loc("kv_indices"(#loc95)) +#loc550 = loc("kv_start"(#loc96)) +#loc551 = loc("kv_start"(#loc97)) +#loc552 = loc("sparse_kv_num_blocks"(#loc98)) +#loc553 = loc("sparse_kv_num_blocks"(#loc99)) +#loc554 = loc("offs_n2"(#loc100)) +#loc555 = loc("offs_n2"(#loc101)) +#loc556 = loc("dq"(#loc102)) +#loc557 = loc("kv_indices"(#loc103)) +#loc558 = loc("kv_start"(#loc104)) +#loc559 = loc("kv_start"(#loc105)) +#loc560 = loc("sparse_kv_num_blocks"(#loc106)) +#loc561 = loc("sparse_kv_num_blocks"(#loc107)) +#loc562 = loc("offs_n2"(#loc108)) +#loc563 = loc("offs_n2"(#loc109)) +#loc564 = loc("dq"(#loc110)) +#loc565 = loc("dq_ptrs"(#loc111)) +#loc566 = loc("dq_ptrs"(#loc112)) +#loc567 = loc("dq_ptrs"(#loc113)) +#loc568 = loc("dq_ptrs"(#loc114)) +#loc569 = loc("dq_ptrs"(#loc115)) +#loc570 = loc("dq_ptrs"(#loc116)) +#loc571 = loc("dq"(#loc117)) +#loc572 = loc("SPARSE_Q_MULTIPLE"(#loc119)) +#loc573 = loc("SPARSE_KV_MULTIPLE"(#loc120)) +#loc574 = loc("pid_mask"(#loc121)) +#loc575 = loc("stride_q_num_blks_h"(#loc122)) +#loc576 = loc("stride_q_idx_h"(#loc123)) +#loc577 = loc("stride_q_idx_n"(#loc124)) +#loc578 = loc("dv"(#loc125)) +#loc579 = loc("dk"(#loc126)) +#loc580 = loc("start_n1"(#loc127)) +#loc581 = loc("offs_n1"(#loc128)) +#loc582 = loc("offs_n1"(#loc129)) +#loc583 = loc("k"(#loc130)) +#loc584 = loc("v"(#loc131)) +#loc585 = loc("dv"(#loc132)) +#loc586 = loc("off_hq1"(#loc133)) +#loc587 = loc("off_hq1"(#loc134)) +#loc588 = loc("q_adj1"(#loc135)) +#loc589 = loc("q_adj1"(#loc136)) +#loc590 = loc("q_adj1"(#loc137)) +#loc591 = loc("q_adj1"(#loc138)) +#loc592 = loc("do_adj1"(#loc139)) +#loc593 = loc("do_adj1"(#loc140)) +#loc594 = loc("do_adj1"(#loc141)) +#loc595 = loc("do_adj1"(#loc142)) +#loc596 = loc("dq_adj1"(#loc143)) +#loc597 = loc("dq_adj1"(#loc144)) +#loc598 = loc("dq_adj1"(#loc145)) +#loc599 = loc("dq_adj1"(#loc146)) +#loc600 = loc("off_chz1"(#loc147)) +#loc601 = loc("off_chz1"(#loc148)) +#loc602 = loc("off_chz1"(#loc149)) +#loc603 = loc("off_chz1"(#loc150)) +#loc604 = loc("Q1"(#loc151)) +#loc605 = loc("DO1"(#loc152)) +#loc606 = loc("LSE1"(#loc153)) +#loc607 = loc("DELTA1"(#loc154)) +#loc608 = loc("sparse_idx_hq1"(#loc155)) +#loc609 = loc("sparse_hz_offset"(#loc156)) +#loc610 = loc("sparse_hz_offset"(#loc157)) +#loc611 = loc("sparse_q_num_blks_offset"(#loc158)) +#loc612 = loc("sparse_q_num_blks_offset"(#loc159)) +#loc613 = loc("sparse_q_idx_offset"(#loc160)) +#loc614 = loc("sparse_q_idx_offset"(#loc161)) +#loc615 = loc("sparse_q_idx_offset"(#loc162)) +#loc616 = loc("q_indices"(#loc163)) +#loc617 = loc("q_start"(#loc164)) +#loc618 = loc("q_start"(#loc165)) +#loc619 = loc("sparse_q_num_blocks"(#loc166)) +#loc620 = loc("sparse_q_num_blocks"(#loc167)) +#loc621 = loc("offs_m1"(#loc168)) +#loc622 = loc("offs_m1"(#loc169)) +#loc623 = loc("q_indices"(#loc171)) +#loc624 = loc("q_start"(#loc172)) +#loc625 = loc("q_start"(#loc173)) +#loc626 = loc("sparse_q_num_blocks"(#loc174)) +#loc627 = loc("sparse_q_num_blocks"(#loc175)) +#loc628 = loc("offs_m1"(#loc176)) +#loc629 = loc("offs_m1"(#loc177)) +#loc630 = loc("dv_ptrs"(#loc180)) +#loc631 = loc("dv_ptrs"(#loc181)) +#loc632 = loc("dv_ptrs"(#loc182)) +#loc633 = loc("dv_ptrs"(#loc183)) +#loc634 = loc("dv_ptrs"(#loc184)) +#loc635 = loc("dv_ptrs"(#loc185)) +#loc636 = loc("index_n"(#loc186)) +#loc637 = loc("index_k"(#loc187)) +#loc638 = loc("index_v"(#loc188)) +#loc639 = loc("dk"(#loc190)) +#loc640 = loc("mask"(#loc191)) +#loc641 = loc("xindex"(#loc192)) +#loc642 = loc("xindex"(#loc193)) +#loc643 = loc("xindex"(#loc194)) +#loc644 = loc("xindex"(#loc195)) +#loc645 = loc("xindex"(#loc196)) +#loc646 = loc("xindex"(#loc197)) +#loc654 = loc("ptr"(#loc212)) +#loc655 = loc("ptr"(#loc213)) +#loc656 = loc("ptr"(#loc214)) +#loc657 = loc("ptr"(#loc215)) +#loc658 = loc("ptr"(#loc216)) +#loc659 = loc("ptr"(#loc217)) +#loc695 = loc("Q_LEN"(#loc222)) +#loc696 = loc("KV_LEN"(#loc223)) +#loc697 = loc("offs_k"(#loc224)) +#loc698 = loc("offs_v"(#loc225)) +#loc699 = loc("kT_ptrs"(#loc226)) +#loc700 = loc("kT_ptrs"(#loc227)) +#loc701 = loc("kT_ptrs"(#loc228)) +#loc702 = loc("kT_ptrs"(#loc229)) +#loc703 = loc("kT_ptrs"(#loc230)) +#loc704 = loc("kT_ptrs"(#loc231)) +#loc705 = loc("vT_ptrs"(#loc232)) +#loc706 = loc("vT_ptrs"(#loc233)) +#loc707 = loc("vT_ptrs"(#loc234)) +#loc708 = loc("vT_ptrs"(#loc235)) +#loc709 = loc("vT_ptrs"(#loc236)) +#loc710 = loc("vT_ptrs"(#loc237)) +#loc711 = loc("hi"(#loc238)) +#loc712 = loc("hi"(#loc239)) +#loc713 = loc("hi"(#loc240)) +#loc714 = loc("hi"(#loc241)) +#loc715 = loc("dq"(#loc242)) +#loc716 = loc("dq"(#loc243)) +#loc717 = loc("offset"(#loc244)) +#loc718 = loc("kT_ptrs"(#loc245)) +#loc719 = loc("kT_ptrs"(#loc246)) +#loc720 = loc("vT_ptrs"(#loc247)) +#loc721 = loc("vT_ptrs"(#loc248)) +#loc722 = loc("offs_n2"(#loc249)) +#loc762 = loc("kT"(#loc254)) +#loc763 = loc("qk"(#loc255)) +#loc764 = loc("qk"(#loc256)) +#loc765 = loc("n"(#loc257)) +#loc766 = loc("n"(#loc258)) +#loc767 = loc("m"(#loc259)) +#loc768 = loc("m"(#loc260)) +#loc769 = loc("tmp1"(#loc261)) +#loc770 = loc("tmp4"(#loc262)) +#loc771 = loc("tmp5"(#loc263)) +#loc772 = loc("tmp7"(#loc264)) +#loc773 = loc("tmp7"(#loc265)) +#loc774 = loc("tmp8"(#loc266)) +#loc775 = loc("tmp9"(#loc267)) +#loc776 = loc("tmp10"(#loc268)) +#loc777 = loc("tmp11"(#loc269)) +#loc778 = loc("tmp12"(#loc270)) +#loc779 = loc("tmp13"(#loc271)) +#loc780 = loc("tmp14"(#loc272)) +#loc781 = loc("tmp15"(#loc273)) +#loc782 = loc("tmp16"(#loc274)) +#loc783 = loc("tmp17"(#loc275)) +#loc784 = loc("tmp18"(#loc276)) +#loc785 = loc("tmp19"(#loc277)) +#loc786 = loc("tmp20"(#loc278)) +#loc787 = loc("tmp21"(#loc279)) +#loc788 = loc("tmp22"(#loc280)) +#loc789 = loc("tmp23"(#loc281)) +#loc790 = loc("tmp24"(#loc282)) +#loc791 = loc("tmp25"(#loc283)) +#loc792 = loc("tmp26"(#loc284)) +#loc793 = loc("tmp27"(#loc285)) +#loc794 = loc("tmp28"(#loc286)) +#loc795 = loc("tmp29"(#loc287)) +#loc796 = loc("tmp30"(#loc288)) +#loc797 = loc("tmp31"(#loc289)) +#loc798 = loc("tmp32"(#loc290)) +#loc799 = loc("tmp33"(#loc291)) +#loc800 = loc("tmp34"(#loc292)) +#loc801 = loc("tmp35"(#loc293)) +#loc802 = loc("tmp36"(#loc294)) +#loc803 = loc("tmp37"(#loc295)) +#loc804 = loc("tmp38"(#loc296)) +#loc805 = loc("post_mod_scores"(#loc297)) +#loc806 = loc("post_mod_scores"(#loc298)) +#loc807 = loc("p"(#loc299)) +#loc808 = loc("p"(#loc300)) +#loc809 = loc("vT"(#loc301)) +#loc810 = loc("dp"(#loc302)) +#loc811 = loc("ds"(#loc303)) +#loc812 = loc("ds"(#loc304)) +#loc813 = loc("ds"(#loc305)) +#loc814 = loc("scatter_mask"(#loc306)) +#loc815 = loc("scatter_mask"(#loc307)) +#loc816 = loc("scatter_mask"(#loc308)) +#loc817 = loc("scatter_mask"(#loc309)) +#loc818 = loc("scatter_mask"(#loc310)) +#loc819 = loc("ds"(#loc311)) +#loc820 = loc("ds"(#loc312)) +#loc821 = loc("dq"(#loc313)) +#loc822 = loc("dq"(#loc314)) +#loc823 = loc("dq"(#loc315)) +#loc829 = loc("cur_block_idx"(#loc322)) +#loc830 = loc("cur_block"(#loc323)) +#loc831 = loc("cur_block"(#loc324)) +#loc832 = loc("next_block"(#loc325)) +#loc833 = loc("next_block"(#loc326)) +#loc834 = loc("next_block"(#loc327)) +#loc835 = loc("next_block"(#loc328)) +#loc836 = loc("next_block"(#loc329)) +#loc837 = loc("needs_jump"(#loc330)) +#loc838 = loc("needs_jump"(#loc331)) +#loc839 = loc("needs_jump"(#loc332)) +#loc840 = loc("jump_to_block"(#loc333)) +#loc841 = loc("jump_to_block"(#loc334)) +#loc842 = loc("jump_to_block"(#loc335)) +#loc843 = loc("offset"(#loc336)) +#loc844 = loc("offset"(#loc337)) +#loc845 = loc("offset"(#loc338)) +#loc846 = loc("offset"(#loc339)) +#loc883 = loc("Q_LEN"(#loc343)) +#loc884 = loc("KV_LEN"(#loc344)) +#loc885 = loc("offs_k"(#loc345)) +#loc886 = loc("offs_v"(#loc346)) +#loc887 = loc("qT_ptrs"(#loc347)) +#loc888 = loc("qT_ptrs"(#loc348)) +#loc889 = loc("qT_ptrs"(#loc349)) +#loc890 = loc("qT_ptrs"(#loc350)) +#loc891 = loc("qT_ptrs"(#loc351)) +#loc892 = loc("qT_ptrs"(#loc352)) +#loc893 = loc("do_ptrs"(#loc353)) +#loc894 = loc("do_ptrs"(#loc354)) +#loc895 = loc("do_ptrs"(#loc355)) +#loc896 = loc("do_ptrs"(#loc356)) +#loc897 = loc("do_ptrs"(#loc357)) +#loc898 = loc("do_ptrs"(#loc358)) +#loc899 = loc("hi"(#loc359)) +#loc900 = loc("hi"(#loc360)) +#loc901 = loc("hi"(#loc361)) +#loc902 = loc("hi"(#loc362)) +#loc903 = loc("dk"(#loc363)) +#loc904 = loc("offset"(#loc365)) +#loc905 = loc("qT_ptrs"(#loc366)) +#loc906 = loc("qT_ptrs"(#loc367)) +#loc907 = loc("do_ptrs"(#loc368)) +#loc908 = loc("do_ptrs"(#loc369)) +#loc909 = loc("offs_m1"(#loc370)) +#loc950 = loc("qT"(#loc375)) +#loc951 = loc("lse"(#loc376)) +#loc952 = loc("lse"(#loc377)) +#loc953 = loc("lse"(#loc378)) +#loc954 = loc("lse"(#loc379)) +#loc955 = loc("qkT"(#loc380)) +#loc956 = loc("qkT"(#loc381)) +#loc957 = loc("m"(#loc382)) +#loc958 = loc("m"(#loc383)) +#loc959 = loc("n"(#loc384)) +#loc960 = loc("n"(#loc385)) +#loc961 = loc("tmp41"(#loc386)) +#loc962 = loc("tmp44"(#loc387)) +#loc963 = loc("tmp45"(#loc388)) +#loc964 = loc("tmp47"(#loc389)) +#loc965 = loc("tmp47"(#loc390)) +#loc966 = loc("tmp48"(#loc391)) +#loc967 = loc("tmp49"(#loc392)) +#loc968 = loc("tmp50"(#loc393)) +#loc969 = loc("tmp51"(#loc394)) +#loc970 = loc("tmp52"(#loc395)) +#loc971 = loc("tmp53"(#loc396)) +#loc972 = loc("tmp54"(#loc397)) +#loc973 = loc("tmp55"(#loc398)) +#loc974 = loc("tmp56"(#loc399)) +#loc975 = loc("tmp57"(#loc400)) +#loc976 = loc("tmp58"(#loc401)) +#loc977 = loc("tmp59"(#loc402)) +#loc978 = loc("tmp60"(#loc403)) +#loc979 = loc("tmp61"(#loc404)) +#loc980 = loc("tmp62"(#loc405)) +#loc981 = loc("tmp63"(#loc406)) +#loc982 = loc("tmp64"(#loc407)) +#loc983 = loc("tmp65"(#loc408)) +#loc984 = loc("tmp66"(#loc409)) +#loc985 = loc("tmp67"(#loc410)) +#loc986 = loc("tmp68"(#loc411)) +#loc987 = loc("tmp69"(#loc412)) +#loc988 = loc("tmp70"(#loc413)) +#loc989 = loc("tmp71"(#loc414)) +#loc990 = loc("tmp72"(#loc415)) +#loc991 = loc("tmp73"(#loc416)) +#loc992 = loc("tmp74"(#loc417)) +#loc993 = loc("tmp75"(#loc418)) +#loc994 = loc("tmp76"(#loc419)) +#loc995 = loc("tmp77"(#loc420)) +#loc996 = loc("tmp78"(#loc421)) +#loc997 = loc("post_mod_scores"(#loc422)) +#loc998 = loc("post_mod_scores"(#loc423)) +#loc999 = loc("pT"(#loc424)) +#loc1000 = loc("pT"(#loc425)) +#loc1001 = loc("pT"(#loc426)) +#loc1002 = loc("do"(#loc427)) +#loc1003 = loc("dv"(#loc428)) +#loc1004 = loc("dv"(#loc429)) +#loc1005 = loc("dv"(#loc430)) +#loc1006 = loc("Di"(#loc431)) +#loc1007 = loc("Di"(#loc432)) +#loc1008 = loc("dpT"(#loc433)) +#loc1009 = loc("dpT"(#loc434)) +#loc1010 = loc("dsT"(#loc435)) +#loc1011 = loc("dsT"(#loc436)) +#loc1012 = loc("dsT"(#loc437)) +#loc1013 = loc("dsT"(#loc438)) +#loc1014 = loc("dk"(#loc439)) +#loc1015 = loc("dk"(#loc440)) +#loc1016 = loc("dk"(#loc441)) +#loc1017 = loc("dk"(#loc442)) +#loc1018 = loc("SPARSE_Q_MULTIPLE"(#loc497)) +#loc1019 = loc("SPARSE_KV_MULTIPLE"(#loc498)) +#loc1020 = loc("SPARSE_Q_MULTIPLE"(#loc572)) +#loc1021 = loc("SPARSE_KV_MULTIPLE"(#loc573)) +#loc1022 = loc("dk"(#loc585)) +#loc1023 = loc("offs_n2"(#loc715)) +#loc1024 = loc("dv"(#loc903)) +#loc1025 = loc("kT_ptrs"(#loc1023)) +#loc1026 = loc("offs_m1"(#loc1024)) +#loc1027 = loc("vT_ptrs"(#loc1025)) +#loc1028 = loc("qT_ptrs"(#loc1026)) +#loc1029 = loc("do_ptrs"(#loc1028)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8c93922551a713084a0bfbd3fb39c4d3b602c3d1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttgir @@ -0,0 +1,1749 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":18:0) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> +#smem = #ttg.shared_memory +#loc289 = loc("arg_Q"(#loc)) +#loc290 = loc("arg_K"(#loc)) +#loc291 = loc("arg_V"(#loc)) +#loc292 = loc("arg_LSE"(#loc)) +#loc293 = loc("arg_DELTA"(#loc)) +#loc294 = loc("arg_DO"(#loc)) +#loc295 = loc("arg_DQ"(#loc)) +#loc296 = loc("arg_DV"(#loc)) +#loc297 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc298 = loc("arg_KV_IDX"(#loc)) +#loc299 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc300 = loc("arg_Q_IDX"(#loc)) +#loc301 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc302 = loc("arg_FULL_KV_IDX"(#loc)) +#loc303 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc304 = loc("arg_FULL_Q_IDX"(#loc)) +#loc305 = loc("in_ptr16"(#loc)) +#loc306 = loc("out_ptr0"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.0883883461> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_6 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_7 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_8 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %cst_9 = arith.constant dense<8192> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_10 = arith.constant dense<262144> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_11 = arith.constant dense<8192> : tensor<64x128xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_13 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_14 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_15 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_16 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_17 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_18 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_20 = arith.constant dense<0.000000e+00> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc1) + %cst_21 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_22 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_23 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_24 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %pid = tt.get_program_id x : i32 loc(#loc307) + %off_zq = tt.get_program_id y : i32 loc(#loc308) + %off_hkv = tt.get_program_id z : i32 loc(#loc309) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc310) + %k_adj = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc311) + %k_adj_25 = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc312) + %k_adj_26 = arith.addi %k_adj, %k_adj_25 : i32 loc(#loc313) + %k_adj_27 = arith.extsi %k_adj_26 : i32 to i64 loc(#loc314) + %dv_adj = arith.muli %off_zq, %c2097152_i32 : i32 loc(#loc315) + %dv_adj_28 = arith.addi %k_adj, %dv_adj : i32 loc(#loc316) + %dv_adj_29 = arith.extsi %dv_adj_28 : i32 to i64 loc(#loc317) + %K = tt.addptr %arg_K, %k_adj_27 : !tt.ptr, i64 loc(#loc318) + %V = tt.addptr %arg_V, %k_adj_27 : !tt.ptr, i64 loc(#loc319) + %DV = tt.addptr %arg_DV, %dv_adj_29 : !tt.ptr, i64 loc(#loc320) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc321) + %offs_k_30 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc321) + %0 = arith.cmpi sge, %pid, %c16_i32 : i32 loc(#loc17) + scf.if %0 { + %off_pid = arith.subi %pid, %c16_i32 : i32 loc(#loc322) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc323) + %off_hq2_31 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc324) + %off_hq2_32 = arith.addi %off_hq2, %off_hq2_31 : i32 loc(#loc325) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc326) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc327) + %sparse_kv_num_blks_offset_33 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc328) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc329) + %sparse_kv_idx_offset_34 = arith.muli %start_m2_block, %c16_i32 : i32 loc(#loc330) + %sparse_kv_idx_offset_35 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_34 : i32 loc(#loc331) + %q_adj2 = arith.muli %off_hq2_32, %c128_i32 : i32 loc(#loc332) + %q_adj2_36 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc333) + %q_adj2_37 = arith.addi %q_adj2, %q_adj2_36 : i32 loc(#loc334) + %q_adj2_38 = arith.extsi %q_adj2_37 : i32 to i64 loc(#loc335) + %do_adj2 = arith.muli %off_hq2_32, %c262144_i32 : i32 loc(#loc336) + %do_adj2_39 = arith.addi %do_adj2, %q_adj2_36 : i32 loc(#loc337) + %do_adj2_40 = arith.extsi %do_adj2_39 : i32 to i64 loc(#loc338) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc339) + %off_chz2_41 = arith.addi %off_chz2, %off_hq2_32 : i32 loc(#loc340) + %off_chz2_42 = arith.muli %off_chz2_41, %c2048_i32 : i32 loc(#loc341) + %off_chz2_43 = arith.extsi %off_chz2_42 : i32 to i64 loc(#loc342) + %Q2 = tt.addptr %arg_Q, %q_adj2_38 : !tt.ptr, i64 loc(#loc343) + %DO2 = tt.addptr %arg_DO, %do_adj2_40 : !tt.ptr, i64 loc(#loc344) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_38 : !tt.ptr, i64 loc(#loc345) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_43 : !tt.ptr, i64 loc(#loc346) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_43 : !tt.ptr, i64 loc(#loc347) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc348) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc349) + %offs_m2_44 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349) + %offs_m2_45 = arith.addi %offs_m2, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc349) + %offs_m2_46 = arith.addi %offs_m2_44, %offs_k_30 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc349) + %ptr = tt.expand_dims %offs_m2_45 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc584) + %ptr_47 = tt.expand_dims %offs_m2_46 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc584) + %ptr_48 = arith.muli %ptr, %cst_2 : tensor<128x1xi32, #blocked> loc(#loc585) + %ptr_49 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc586) + %ptr_50 = tt.addptr %ptr_49, %ptr_48 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc586) + %ptr_51 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc587) + %ptr_52 = tt.expand_dims %ptr_51 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc587) + %ptr_53 = tt.broadcast %ptr_50 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc588) + %ptr_54 = tt.broadcast %ptr_52 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc588) + %ptr_55 = tt.addptr %ptr_53, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc588) + %q = tt.load %ptr_55 : tensor<128x128x!tt.ptr, #blocked> loc(#loc589) + %q_56 = ttg.local_alloc %q : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc589) + %ptr_57 = arith.muli %ptr, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc590) + %ptr_58 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc591) + %ptr_59 = tt.addptr %ptr_58, %ptr_57 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc591) + %ptr_60 = tt.broadcast %ptr_59 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc592) + %ptr_61 = tt.addptr %ptr_60, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc592) + %do = tt.load %ptr_61 : tensor<128x128x!tt.ptr, #blocked> loc(#loc593) + %do_62 = ttg.local_alloc %do : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc593) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc357) + %Di_63 = tt.addptr %Di, %offs_m2_46 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc357) + %Di_64 = tt.load %Di_63 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc358) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc359) + %lse_65 = tt.addptr %lse, %offs_m2_46 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc359) + %lse_66 = tt.load %lse_65 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc360) + %lse_67 = arith.cmpf oeq, %lse_66, %cst_24 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc361) + %lse_68 = arith.select %lse_67, %cst_23, %lse_66 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc362) + %lse_69 = tt.expand_dims %lse_68 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc363) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_35 : !tt.ptr, i32 loc(#loc364) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc365) + %kv_start_70 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc366) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_33 : !tt.ptr, i32 loc(#loc367) + %sparse_kv_num_blocks_71 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc368) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc369) + %offs_n2_72 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc369) + %offs_n2_73 = tt.splat %kv_start_70 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc370) + %offs_n2_74 = tt.splat %kv_start_70 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc370) + %offs_n2_75 = arith.addi %offs_n2_73, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc370) + %offs_n2_76 = arith.addi %offs_n2_74, %offs_n2_72 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc370) + %kT_ptrs = tt.expand_dims %offs_n2_75 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc594) + %kT_ptrs_77 = arith.muli %kT_ptrs, %cst_16 : tensor<1x64xi32, #blocked1> loc(#loc595) + %kT_ptrs_78 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc596) + %kT_ptrs_79 = tt.addptr %kT_ptrs_78, %kT_ptrs_77 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc596) + %kT_ptrs_80 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc597) + %kT_ptrs_81 = tt.expand_dims %kT_ptrs_80 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc597) + %kT_ptrs_82 = tt.broadcast %kT_ptrs_79 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc598) + %kT_ptrs_83 = tt.broadcast %kT_ptrs_81 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc598) + %kT_ptrs_84 = tt.addptr %kT_ptrs_82, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc598) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc599) + %vT_ptrs_85 = tt.addptr %vT_ptrs, %kT_ptrs_77 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc599) + %vT_ptrs_86 = tt.broadcast %vT_ptrs_85 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc600) + %vT_ptrs_87 = tt.addptr %vT_ptrs_86, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc600) + %hi = arith.muli %sparse_kv_num_blocks_71, %c2_i32 : i32 loc(#loc601) + %hi_88 = arith.minsi %hi, %c32_i32 : i32 loc(#loc602) + %tmp4 = tt.broadcast %ptr_47 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc747) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc748) + %vT_ptrs_89 = arith.cmpi sgt, %hi_88, %c0_i32 : i32 loc(#loc886) + %tmp7_90 = tt.load %tmp7, %vT_ptrs_89 : !tt.ptr loc(#loc750) + %tmp8 = tt.splat %tmp7_90 : i64 -> tensor<1x64xi64, #mma> loc(#loc751) + %tmp9 = arith.extsi %ptr_47 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc752) + %tmp10 = tt.splat %tmp7_90 : i64 -> tensor<128x1xi64, #mma> loc(#loc753) + %tmp10_91 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc753) + %tmp11 = tt.broadcast %tmp10_91 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc754) + %p = tt.broadcast %lse_69 : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc755) + %ds = tt.expand_dims %Di_64 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc756) + %ds_92 = tt.broadcast %ds : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc757) + %kT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc881) + %vT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc882) + %kT_93 = ttg.memdesc_index %kT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_94 = tt.splat %vT_ptrs_89 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_95 = ttg.async_copy_global_to_local %kT_ptrs_84, %kT_93 mask %vT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_96 = ttg.async_commit_group tokens %kT_95 loc(#loc881) + %vT_97 = ttg.memdesc_index %vT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_98 = ttg.async_copy_global_to_local %vT_ptrs_87, %vT_97 mask %vT_ptrs_94 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_99 = ttg.async_commit_group tokens %vT_98 loc(#loc882) + %vT_ptrs_100 = arith.cmpi sgt, %hi_88, %c1_i32 : i32 loc(#loc886) + %kT_ptrs_101 = tt.addptr %kT_ptrs_84, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc605) + %vT_ptrs_102 = tt.addptr %vT_ptrs_87, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc606) + %kT_103 = ttg.memdesc_index %kT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_104 = tt.splat %vT_ptrs_100 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_105 = ttg.async_copy_global_to_local %kT_ptrs_101, %kT_103 mask %vT_ptrs_104 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_106 = ttg.async_commit_group tokens %kT_105 loc(#loc881) + %vT_107 = ttg.memdesc_index %vT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_108 = ttg.async_copy_global_to_local %vT_ptrs_102, %vT_107 mask %vT_ptrs_104 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_109 = ttg.async_commit_group tokens %vT_108 loc(#loc882) + ttng.fence_async_shared {bCluster = false} loc(#loc760) + %vT_ptrs_110:11 = scf.for %vT_ptrs_156 = %c0_i32 to %hi_88 step %c1_i32 iter_args(%arg19 = %cst_4, %kT_ptrs_157 = %kT_ptrs_101, %vT_ptrs_158 = %vT_ptrs_102, %offs_n2_159 = %offs_n2_76, %arg23 = %c1_i32, %arg24 = %c-1_i32, %kT_160 = %kT_96, %kT_161 = %kT_106, %vT_162 = %vT_99, %vT_163 = %vT_109, %arg29 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_164 = arith.subi %hi_88, %c2_i32 : i32 loc(#loc886) + %vT_ptrs_165 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_164 : i32 loc(#loc886) + %vT_ptrs_166 = arith.subi %hi_88, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_167 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_166 : i32 loc(#loc886) + %vT_ptrs_168 = arith.addi %arg24, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_169 = arith.cmpi sge, %vT_ptrs_168, %c3_i32 : i32 loc(#loc886) + %vT_ptrs_170 = arith.select %vT_ptrs_169, %c0_i32, %vT_ptrs_168 : i32 loc(#loc886) + %kT_171 = ttg.async_wait %kT_160, %vT_162 {num = 2 : i32} loc(#loc881) + %kT_172 = ttg.memdesc_index %kT[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %dq_173 = ttg.memdesc_trans %kT_172 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc761) + %qk = ttng.warp_group_dot %q_56, %kT_172, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc760) + %qk_174:3 = ttng.warp_group_dot_wait %qk, %q_56, %kT_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc760) + %qk_175 = arith.mulf %qk_174#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc762) + %n = tt.expand_dims %offs_n2_159 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc763) + %tmp4_176 = tt.broadcast %n : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc747) + %tmp4_177 = arith.cmpi sge, %tmp4, %tmp4_176 : tensor<128x64xi32, #mma> loc(#loc747) + %tmp5 = arith.extsi %n : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc764) + %tmp8_178 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc751) + %tmp11_179 = tt.broadcast %tmp8_178 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc754) + %tmp11_180 = arith.andi %tmp11_179, %tmp11 : tensor<128x64xi1, #mma> loc(#loc754) + %tmp12 = arith.andi %tmp4_177, %tmp11_180 : tensor<128x64xi1, #mma> loc(#loc765) + %tmp15 = arith.cmpi sge, %n, %cst_17 : tensor<1x64xi32, #mma> loc(#loc766) + %tmp16 = arith.remsi %n, %cst_17 : tensor<1x64xi32, #mma> loc(#loc767) + %tmp18 = arith.cmpi ne, %tmp16, %cst_22 : tensor<1x64xi32, #mma> loc(#loc768) + %tmp19 = arith.cmpi slt, %tmp16, %cst_22 : tensor<1x64xi32, #mma> loc(#loc769) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc770) + %tmp23 = arith.addi %tmp16, %cst_17 : tensor<1x64xi32, #mma> loc(#loc771) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc772) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc773) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc774) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc775) + %tmp28 = arith.subi %tmp4_176, %tmp4 : tensor<128x64xi32, #mma> loc(#loc776) + %tmp29 = arith.remsi %tmp28, %cst_18 : tensor<128x64xi32, #mma> loc(#loc777) + %tmp30 = arith.cmpi ne, %tmp29, %cst_21 : tensor<128x64xi32, #mma> loc(#loc778) + %tmp31 = arith.cmpi slt, %tmp29, %cst_21 : tensor<128x64xi32, #mma> loc(#loc779) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc780) + %tmp34 = arith.addi %tmp29, %cst_18 : tensor<128x64xi32, #mma> loc(#loc781) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc782) + %tmp36 = arith.cmpi eq, %tmp35, %cst_21 : tensor<128x64xi32, #mma> loc(#loc783) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc784) + %tmp37_181 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc784) + %tmp38 = arith.ori %tmp12, %tmp37_181 : tensor<128x64xi1, #mma> loc(#loc785) + %post_mod_scores = arith.select %tmp38, %qk_175, %cst_7 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc786) + %post_mod_scores_182 = arith.mulf %post_mod_scores, %cst_8 : tensor<128x64xf32, #mma> loc(#loc787) + %p_183 = arith.subf %post_mod_scores_182, %p : tensor<128x64xf32, #mma> loc(#loc755) + %p_184 = math.exp2 %p_183 : tensor<128x64xf32, #mma> loc(#loc788) + %vT_185 = ttg.memdesc_index %vT[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %dp = ttng.warp_group_dot %do_62, %vT_185, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc789) + %dp_186:3 = ttng.warp_group_dot_wait %dp, %do_62, %vT_185 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc789) + %ds_187 = arith.subf %dp_186#0, %ds_92 : tensor<128x64xf32, #mma> loc(#loc757) + %ds_188 = arith.mulf %p_184, %ds_187 : tensor<128x64xf32, #mma> loc(#loc790) + %ds_189 = arith.select %tmp38, %ds_188, %cst_5 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc791) + %ds_190 = arith.truncf %ds_189 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc792) + %ds_191 = ttg.convert_layout %ds_190 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc792) + %dq_192 = ttng.warp_group_dot %ds_191, %dq_173, %arg19 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc793) + %offs_n2_193 = tt.splat %arg29 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc607) + %offs_n2_194 = arith.addi %offs_n2_159, %offs_n2_193 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc607) + %vT_ptrs_195 = arith.addi %vT_ptrs_156, %c1_i32 : i32 loc(#loc886) + %cur_block_idx = arith.divsi %vT_ptrs_195, %c2_i32 : i32 loc(#loc794) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc795) + %cur_block_196 = tt.load %cur_block, %vT_ptrs_167 evictionPolicy = evict_last : !tt.ptr loc(#loc796) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc797) + %next_block_197 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_71 : i32 loc(#loc798) + %next_block_198 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc799) + %vT_ptrs_199 = arith.andi %vT_ptrs_167, %next_block_197 : i1 loc(#loc886) + %next_block_200 = tt.load %next_block_198, %vT_ptrs_199 evictionPolicy = evict_last : !tt.ptr loc(#loc800) + %needs_jump = arith.addi %vT_ptrs_156, %c2_i32 : i32 loc(#loc801) + %needs_jump_201 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc802) + %needs_jump_202 = arith.cmpi eq, %needs_jump_201, %c0_i32 : i32 loc(#loc803) + %jump_to_block = arith.subi %next_block_200, %cur_block_196 : i32 loc(#loc804) + %jump_to_block_203 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc805) + %jump_to_block_204 = arith.subi %jump_to_block_203, %c64_i32 : i32 loc(#loc806) + %offset = arith.extui %needs_jump_202 : i1 to i32 loc(#loc807) + %offset_205 = arith.muli %jump_to_block_204, %offset : i32 loc(#loc807) + %offset_206 = arith.subi %c1_i32, %offset : i32 loc(#loc808) + %offset_207 = arith.muli %offset_206, %c64_i32 : i32 loc(#loc809) + %offset_208 = arith.addi %offset_205, %offset_207 : i32 loc(#loc810) + %kT_ptrs_209 = arith.muli %offset_208, %c128_i32 : i32 loc(#loc609) + %kT_ptrs_210 = tt.splat %kT_ptrs_209 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc605) + %kT_ptrs_211 = tt.addptr %kT_ptrs_157, %kT_ptrs_210 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc605) + %vT_ptrs_212 = tt.addptr %vT_ptrs_158, %kT_ptrs_210 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc606) + %vT_ptrs_213 = arith.addi %arg23, %c1_i32 : i32 loc(#loc886) + %vT_ptrs_214 = arith.cmpi sge, %vT_ptrs_213, %c3_i32 : i32 loc(#loc886) + %vT_ptrs_215 = arith.select %vT_ptrs_214, %c0_i32, %vT_ptrs_213 : i32 loc(#loc886) + %kT_216 = ttg.memdesc_index %kT[%vT_ptrs_215] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %vT_ptrs_217 = tt.splat %vT_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc886) + %kT_218 = ttg.async_copy_global_to_local %kT_ptrs_211, %kT_216 mask %vT_ptrs_217 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc881) + %kT_219 = ttg.async_commit_group tokens %kT_218 loc(#loc881) + %vT_220 = ttg.memdesc_index %vT[%vT_ptrs_215] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_221 = ttg.async_copy_global_to_local %vT_ptrs_212, %vT_220 mask %vT_ptrs_217 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %vT_222 = ttg.async_commit_group tokens %vT_221 loc(#loc882) + scf.yield %dq_192, %kT_ptrs_211, %vT_ptrs_212, %offs_n2_194, %vT_ptrs_215, %vT_ptrs_170, %kT_161, %kT_219, %vT_163, %vT_222, %offset_208 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc886) + } loc(#loc886) + %vT_ptrs_111 = ttng.warp_group_dot_wait %vT_ptrs_110#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc886) + %vT_ptrs_112 = ttg.async_wait {num = 0 : i32} loc(#loc886) + ttg.local_dealloc %vT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc886) + ttg.local_dealloc %kT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc886) + %kv_indices_113 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_35 : !tt.ptr, i32 loc(#loc451) + %kv_start_114 = tt.load %kv_indices_113 : !tt.ptr loc(#loc452) + %kv_start_115 = arith.muli %kv_start_114, %c128_i32 : i32 loc(#loc453) + %sparse_kv_num_blocks_116 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_33 : !tt.ptr, i32 loc(#loc454) + %sparse_kv_num_blocks_117 = tt.load %sparse_kv_num_blocks_116 : !tt.ptr loc(#loc455) + %offs_n2_118 = tt.splat %kv_start_115 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc456) + %offs_n2_119 = arith.addi %offs_n2_118, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc456) + %kT_ptrs_120 = tt.expand_dims %offs_n2_119 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc610) + %kT_ptrs_121 = arith.muli %kT_ptrs_120, %cst_16 : tensor<1x64xi32, #blocked1> loc(#loc611) + %kT_ptrs_122 = tt.addptr %kT_ptrs_78, %kT_ptrs_121 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc612) + %kT_ptrs_123 = tt.broadcast %kT_ptrs_122 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc613) + %kT_ptrs_124 = tt.addptr %kT_ptrs_123, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc613) + %vT_ptrs_125 = tt.addptr %vT_ptrs, %kT_ptrs_121 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc614) + %vT_ptrs_126 = tt.broadcast %vT_ptrs_125 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc615) + %vT_ptrs_127 = tt.addptr %vT_ptrs_126, %kT_ptrs_83 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc615) + %hi_128 = arith.muli %sparse_kv_num_blocks_117, %c2_i32 : i32 loc(#loc616) + %hi_129 = arith.minsi %hi_128, %c32_i32 : i32 loc(#loc617) + %kT_130 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc883) + %vT_131 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc884) + %vT_ptrs_132 = arith.cmpi sgt, %hi_129, %c0_i32 : i32 loc(#loc887) + %kT_133 = ttg.memdesc_index %kT_130[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_134 = tt.splat %vT_ptrs_132 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_135 = ttg.async_copy_global_to_local %kT_ptrs_124, %kT_133 mask %vT_ptrs_134 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_136 = ttg.async_commit_group tokens %kT_135 loc(#loc883) + %vT_137 = ttg.memdesc_index %vT_131[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_138 = ttg.async_copy_global_to_local %vT_ptrs_127, %vT_137 mask %vT_ptrs_134 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_139 = ttg.async_commit_group tokens %vT_138 loc(#loc884) + %vT_ptrs_140 = arith.cmpi sgt, %hi_129, %c1_i32 : i32 loc(#loc887) + %kT_ptrs_141 = tt.addptr %kT_ptrs_124, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc619) + %vT_ptrs_142 = tt.addptr %vT_ptrs_127, %cst_9 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc620) + %kT_143 = ttg.memdesc_index %kT_130[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_144 = tt.splat %vT_ptrs_140 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_145 = ttg.async_copy_global_to_local %kT_ptrs_141, %kT_143 mask %vT_ptrs_144 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_146 = ttg.async_commit_group tokens %kT_145 loc(#loc883) + %vT_147 = ttg.memdesc_index %vT_131[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_148 = ttg.async_copy_global_to_local %vT_ptrs_142, %vT_147 mask %vT_ptrs_144 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_149 = ttg.async_commit_group tokens %vT_148 loc(#loc884) + ttng.fence_async_shared {bCluster = false} loc(#loc813) + %vT_ptrs_150:9 = scf.for %vT_ptrs_156 = %c0_i32 to %hi_129 step %c1_i32 iter_args(%vT_ptrs_157 = %vT_ptrs_111, %kT_ptrs_158 = %kT_ptrs_141, %vT_ptrs_159 = %vT_ptrs_142, %arg22 = %c1_i32, %arg23 = %c-1_i32, %kT_160 = %kT_136, %kT_161 = %kT_146, %vT_162 = %vT_139, %vT_163 = %vT_149) -> (tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %vT_ptrs_164 = arith.subi %hi_129, %c2_i32 : i32 loc(#loc887) + %vT_ptrs_165 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_164 : i32 loc(#loc887) + %vT_ptrs_166 = arith.subi %hi_129, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_167 = arith.cmpi slt, %vT_ptrs_156, %vT_ptrs_166 : i32 loc(#loc887) + %vT_ptrs_168 = arith.addi %arg23, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_169 = arith.cmpi sge, %vT_ptrs_168, %c3_i32 : i32 loc(#loc887) + %vT_ptrs_170 = arith.select %vT_ptrs_169, %c0_i32, %vT_ptrs_168 : i32 loc(#loc887) + %kT_171 = ttg.async_wait %kT_160, %vT_162 {num = 2 : i32} loc(#loc883) + %kT_172 = ttg.memdesc_index %kT_130[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %dq_173 = ttg.memdesc_trans %kT_172 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc814) + %qk = ttng.warp_group_dot %q_56, %kT_172, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc813) + %qk_174:3 = ttng.warp_group_dot_wait %qk, %q_56, %kT_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc813) + %qk_175 = arith.mulf %qk_174#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc815) + %post_mod_scores = arith.mulf %qk_175, %cst_8 : tensor<128x64xf32, #mma> loc(#loc816) + %p_176 = arith.subf %post_mod_scores, %p : tensor<128x64xf32, #mma> loc(#loc817) + %p_177 = math.exp2 %p_176 : tensor<128x64xf32, #mma> loc(#loc818) + %vT_178 = ttg.memdesc_index %vT_131[%vT_ptrs_170] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %dp = ttng.warp_group_dot %do_62, %vT_178, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc819) + %dp_179:3 = ttng.warp_group_dot_wait %dp, %do_62, %vT_178 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc819) + %ds_180 = arith.subf %dp_179#0, %ds_92 : tensor<128x64xf32, #mma> loc(#loc820) + %ds_181 = arith.mulf %p_177, %ds_180 : tensor<128x64xf32, #mma> loc(#loc821) + %ds_182 = arith.truncf %ds_181 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc822) + %ds_183 = ttg.convert_layout %ds_182 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc822) + %dq_184 = ttng.warp_group_dot %ds_183, %dq_173, %vT_ptrs_157 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc823) + %vT_ptrs_185 = arith.addi %vT_ptrs_156, %c1_i32 : i32 loc(#loc887) + %cur_block_idx = arith.divsi %vT_ptrs_185, %c2_i32 : i32 loc(#loc824) + %cur_block = tt.addptr %kv_indices_113, %cur_block_idx : !tt.ptr, i32 loc(#loc825) + %cur_block_186 = tt.load %cur_block, %vT_ptrs_167 evictionPolicy = evict_last : !tt.ptr loc(#loc826) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc827) + %next_block_187 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_117 : i32 loc(#loc828) + %next_block_188 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc829) + %vT_ptrs_189 = arith.andi %vT_ptrs_167, %next_block_187 : i1 loc(#loc887) + %next_block_190 = tt.load %next_block_188, %vT_ptrs_189 evictionPolicy = evict_last : !tt.ptr loc(#loc830) + %needs_jump = arith.addi %vT_ptrs_156, %c2_i32 : i32 loc(#loc831) + %needs_jump_191 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc832) + %needs_jump_192 = arith.cmpi eq, %needs_jump_191, %c0_i32 : i32 loc(#loc833) + %jump_to_block = arith.subi %next_block_190, %cur_block_186 : i32 loc(#loc834) + %jump_to_block_193 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc835) + %jump_to_block_194 = arith.subi %jump_to_block_193, %c64_i32 : i32 loc(#loc836) + %offset = arith.extui %needs_jump_192 : i1 to i32 loc(#loc837) + %offset_195 = arith.muli %jump_to_block_194, %offset : i32 loc(#loc837) + %offset_196 = arith.subi %c1_i32, %offset : i32 loc(#loc838) + %offset_197 = arith.muli %offset_196, %c64_i32 : i32 loc(#loc839) + %offset_198 = arith.addi %offset_195, %offset_197 : i32 loc(#loc840) + %kT_ptrs_199 = arith.muli %offset_198, %c128_i32 : i32 loc(#loc622) + %kT_ptrs_200 = tt.splat %kT_ptrs_199 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc619) + %kT_ptrs_201 = tt.addptr %kT_ptrs_158, %kT_ptrs_200 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc619) + %vT_ptrs_202 = tt.addptr %vT_ptrs_159, %kT_ptrs_200 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc620) + %vT_ptrs_203 = arith.addi %arg22, %c1_i32 : i32 loc(#loc887) + %vT_ptrs_204 = arith.cmpi sge, %vT_ptrs_203, %c3_i32 : i32 loc(#loc887) + %vT_ptrs_205 = arith.select %vT_ptrs_204, %c0_i32, %vT_ptrs_203 : i32 loc(#loc887) + %kT_206 = ttg.memdesc_index %kT_130[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %vT_ptrs_207 = tt.splat %vT_ptrs_165 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc887) + %kT_208 = ttg.async_copy_global_to_local %kT_ptrs_201, %kT_206 mask %vT_ptrs_207 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc883) + %kT_209 = ttg.async_commit_group tokens %kT_208 loc(#loc883) + %vT_210 = ttg.memdesc_index %vT_131[%vT_ptrs_205] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_211 = ttg.async_copy_global_to_local %vT_ptrs_202, %vT_210 mask %vT_ptrs_207 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc884) + %vT_212 = ttg.async_commit_group tokens %vT_211 loc(#loc884) + scf.yield %dq_184, %kT_ptrs_201, %vT_ptrs_202, %vT_ptrs_205, %vT_ptrs_170, %kT_161, %kT_209, %vT_163, %vT_212 : tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc887) + } loc(#loc887) + %vT_ptrs_151 = ttng.warp_group_dot_wait %vT_ptrs_150#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc887) + %vT_ptrs_152 = ttg.async_wait {num = 0 : i32} loc(#loc887) + ttg.local_dealloc %vT_131 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc887) + ttg.local_dealloc %kT_130 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc887) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc458) + %dq_ptrs_153 = tt.addptr %dq_ptrs, %ptr_48 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc458) + %dq_ptrs_154 = tt.broadcast %dq_ptrs_153 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc459) + %dq_ptrs_155 = tt.addptr %dq_ptrs_154, %ptr_54 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc459) + %dq = arith.mulf %vT_ptrs_151, %cst_3 : tensor<128x128xf32, #mma1> loc(#loc460) + %1 = arith.truncf %dq : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc159) + %2 = ttg.convert_layout %1 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc159) + tt.store %dq_ptrs_155, %2 : tensor<128x128x!tt.ptr, #blocked> loc(#loc159) + } else { + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc461) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc462) + %offs_n1_31 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc462) + %offs_n1_32 = arith.addi %offs_n1, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc462) + %offs_n1_33 = arith.addi %offs_n1_31, %offs_k_30 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc462) + %ptr = tt.expand_dims %offs_n1_32 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc623) + %ptr_34 = tt.expand_dims %offs_n1_33 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc623) + %ptr_35 = arith.muli %ptr, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc624) + %ptr_36 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc625) + %ptr_37 = tt.addptr %ptr_36, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc625) + %ptr_38 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc626) + %ptr_39 = tt.expand_dims %ptr_38 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc626) + %ptr_40 = tt.broadcast %ptr_37 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc627) + %ptr_41 = tt.broadcast %ptr_39 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc627) + %ptr_42 = tt.addptr %ptr_40, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc627) + %k = tt.load %ptr_42 : tensor<128x128x!tt.ptr, #blocked> loc(#loc628) + %k_43 = ttg.local_alloc %k : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc628) + %ptr_44 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc629) + %ptr_45 = tt.addptr %ptr_44, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc629) + %ptr_46 = tt.broadcast %ptr_45 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc630) + %ptr_47 = tt.addptr %ptr_46, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc630) + %v = tt.load %ptr_47 : tensor<128x128x!tt.ptr, #blocked> loc(#loc631) + %v_48 = ttg.local_alloc %v : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc631) + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc465) + %q_adj1 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc466) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc467) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc468) + %sparse_q_num_blks_offset_49 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc469) + %sparse_q_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc470) + %sparse_q_idx_offset_50 = arith.muli %pid, %c16_i32 : i32 loc(#loc471) + %sparse_q_idx_offset_51 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_50 : i32 loc(#loc472) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_51 : !tt.ptr, i32 loc(#loc473) + %q_start = tt.load %q_indices, %true : !tt.ptr loc(#loc474) + %q_start_52 = arith.muli %q_start, %c128_i32 : i32 loc(#loc475) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_49 : !tt.ptr, i32 loc(#loc476) + %sparse_q_num_blocks_53 = tt.load %sparse_q_num_blocks, %true : !tt.ptr loc(#loc477) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc478) + %offs_m1_54 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc478) + %offs_m1_55 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc478) + %offs_m1_56 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc479) + %offs_m1_57 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc479) + %offs_m1_58 = tt.splat %q_start_52 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc479) + %offs_m1_59 = arith.addi %offs_m1_56, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc479) + %offs_m1_60 = arith.addi %offs_m1_57, %offs_m1_54 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc479) + %offs_m1_61 = arith.addi %offs_m1_58, %offs_m1_55 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc479) + %qT_ptrs = tt.expand_dims %offs_m1_59 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc632) + %qT_ptrs_62 = arith.muli %qT_ptrs, %cst_15 : tensor<1x64xi32, #blocked1> loc(#loc633) + %qT_ptrs_63 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc634) + %qT_ptrs_64 = tt.expand_dims %qT_ptrs_63 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc634) + %qT_ptrs_65 = tt.broadcast %qT_ptrs_64 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc635) + %do_ptrs = tt.expand_dims %offs_m1_61 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc636) + %do_ptrs_66 = arith.muli %do_ptrs, %cst_14 : tensor<64x1xi32, #blocked> loc(#loc637) + %do_ptrs_67 = tt.broadcast %ptr_39 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc638) + %hi = arith.muli %sparse_q_num_blocks_53, %c2_i32 : i32 loc(#loc639) + %hi_68 = arith.minsi %hi, %c32_i32 : i32 loc(#loc640) + %tmp44 = tt.broadcast %ptr_34 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc641) + %tmp45 = arith.extsi %ptr_34 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc642) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc643) + %do_ptrs_69 = arith.cmpi sgt, %hi_68, %c0_i32 : i32 loc(#loc889) + %tmp47_70 = tt.load %tmp47, %do_ptrs_69 : !tt.ptr loc(#loc645) + %tmp48 = tt.splat %tmp47_70 : i64 -> tensor<128x1xi64, #mma> loc(#loc646) + %tmp48_71 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64, #mma> loc(#loc646) + %tmp50 = tt.splat %tmp47_70 : i64 -> tensor<1x64xi64, #mma> loc(#loc647) + %tmp51 = tt.broadcast %tmp48_71 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc648) + %tmp55 = arith.cmpi sge, %ptr_34, %cst_0 : tensor<128x1xi32, #mma> loc(#loc649) + %tmp56 = arith.remsi %ptr_34, %cst_0 : tensor<128x1xi32, #mma> loc(#loc650) + %tmp58 = arith.cmpi ne, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc651) + %tmp59 = arith.cmpi slt, %tmp56, %cst : tensor<128x1xi32, #mma> loc(#loc652) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1, #mma> loc(#loc653) + %tmp63 = arith.addi %tmp56, %cst_0 : tensor<128x1xi32, #mma> loc(#loc654) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1, #mma>, tensor<128x1xi32, #mma> loc(#loc655) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc656) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64, #mma> loc(#loc657) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1, #mma> loc(#loc658) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc659) + %q_indices_72 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_51 : !tt.ptr, i32 loc(#loc509) + %q_start_73 = tt.load %q_indices_72, %true : !tt.ptr loc(#loc510) + %q_start_74 = arith.muli %q_start_73, %c128_i32 : i32 loc(#loc511) + %sparse_q_num_blocks_75 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_49 : !tt.ptr, i32 loc(#loc512) + %sparse_q_num_blocks_76 = tt.load %sparse_q_num_blocks_75, %true : !tt.ptr loc(#loc513) + %offs_m1_77 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc514) + %offs_m1_78 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc514) + %offs_m1_79 = tt.splat %q_start_74 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc514) + %offs_m1_80 = arith.addi %offs_m1_77, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc514) + %offs_m1_81 = arith.addi %offs_m1_78, %offs_m1_54 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc514) + %offs_m1_82 = arith.addi %offs_m1_79, %offs_m1_55 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc514) + %qT_ptrs_83 = tt.expand_dims %offs_m1_80 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc660) + %qT_ptrs_84 = arith.muli %qT_ptrs_83, %cst_15 : tensor<1x64xi32, #blocked1> loc(#loc661) + %do_ptrs_85 = tt.expand_dims %offs_m1_82 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc662) + %do_ptrs_86 = arith.muli %do_ptrs_85, %cst_14 : tensor<64x1xi32, #blocked> loc(#loc663) + %hi_87 = arith.muli %sparse_q_num_blocks_76, %c2_i32 : i32 loc(#loc664) + %hi_88 = arith.minsi %hi_87, %c32_i32 : i32 loc(#loc665) + ttng.fence_async_shared {bCluster = false} loc(#loc666) + %dk:2 = scf.for %dk_98 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg19 = %cst_4, %arg20 = %cst_4) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>) : i32 { + %off_hq1_99 = arith.addi %off_hq1, %dk_98 : i32 loc(#loc517) + %q_adj1_100 = arith.muli %off_hq1_99, %c128_i32 : i32 loc(#loc518) + %q_adj1_101 = arith.addi %q_adj1_100, %q_adj1 : i32 loc(#loc519) + %q_adj1_102 = arith.extsi %q_adj1_101 : i32 to i64 loc(#loc520) + %do_adj1 = arith.muli %off_hq1_99, %c262144_i32 : i32 loc(#loc521) + %do_adj1_103 = arith.addi %do_adj1, %q_adj1 : i32 loc(#loc522) + %do_adj1_104 = arith.extsi %do_adj1_103 : i32 to i64 loc(#loc523) + %off_chz1_105 = arith.addi %off_chz1, %off_hq1_99 : i32 loc(#loc524) + %off_chz1_106 = arith.muli %off_chz1_105, %c2048_i32 : i32 loc(#loc525) + %off_chz1_107 = arith.extsi %off_chz1_106 : i32 to i64 loc(#loc526) + %Q1 = tt.addptr %arg_Q, %q_adj1_102 : !tt.ptr, i64 loc(#loc527) + %DO1 = tt.addptr %arg_DO, %do_adj1_104 : !tt.ptr, i64 loc(#loc528) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_107 : !tt.ptr, i64 loc(#loc529) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_107 : !tt.ptr, i64 loc(#loc530) + %qT_ptrs_108 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc668) + %qT_ptrs_109 = tt.addptr %qT_ptrs_108, %qT_ptrs_62 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc668) + %qT_ptrs_110 = tt.broadcast %qT_ptrs_109 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc635) + %qT_ptrs_111 = tt.addptr %qT_ptrs_110, %qT_ptrs_65 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc635) + %do_ptrs_112 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc669) + %do_ptrs_113 = tt.addptr %do_ptrs_112, %do_ptrs_66 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc669) + %do_ptrs_114 = tt.broadcast %do_ptrs_113 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc638) + %do_ptrs_115 = tt.addptr %do_ptrs_114, %do_ptrs_67 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc638) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %qT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc842) + %lse_116 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc673) + %do = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc843) + %Di_117 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc675) + %qT_118 = ttg.memdesc_index %qT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_119 = tt.splat %do_ptrs_69 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_120 = ttg.async_copy_global_to_local %qT_ptrs_111, %qT_118 mask %do_ptrs_119 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_121 = ttg.async_commit_group tokens %qT_120 loc(#loc842) + %lse_122 = tt.addptr %lse, %offs_m1_60 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_123 = ttg.memdesc_index %lse_116[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_124 = tt.splat %do_ptrs_69 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_125 = ttg.async_copy_global_to_local %lse_122, %lse_123 mask %do_ptrs_124 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_126 = ttg.async_commit_group tokens %lse_125 loc(#loc673) + %do_127 = ttg.memdesc_index %do[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_128 = tt.splat %do_ptrs_69 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_129 = ttg.async_copy_global_to_local %do_ptrs_115, %do_127 mask %do_ptrs_128 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_130 = ttg.async_commit_group tokens %do_129 loc(#loc843) + %Di_131 = tt.addptr %Di, %offs_m1_60 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_132 = ttg.memdesc_index %Di_117[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_133 = ttg.async_copy_global_to_local %Di_131, %Di_132 mask %do_ptrs_124 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_134 = ttg.async_commit_group tokens %Di_133 loc(#loc675) + %do_ptrs_135 = arith.cmpi sgt, %hi_68, %c1_i32 : i32 loc(#loc889) + %qT_ptrs_136 = tt.addptr %qT_ptrs_111, %cst_10 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc676) + %do_ptrs_137 = tt.addptr %do_ptrs_115, %cst_11 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc677) + %offs_m1_138 = arith.addi %offs_m1_60, %cst_12 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %qT_139 = ttg.memdesc_index %qT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_140 = tt.splat %do_ptrs_135 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_141 = ttg.async_copy_global_to_local %qT_ptrs_136, %qT_139 mask %do_ptrs_140 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_142 = ttg.async_commit_group tokens %qT_141 loc(#loc842) + %lse_143 = tt.addptr %lse, %offs_m1_138 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_144 = ttg.memdesc_index %lse_116[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_145 = tt.splat %do_ptrs_135 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_146 = ttg.async_copy_global_to_local %lse_143, %lse_144 mask %do_ptrs_145 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_147 = ttg.async_commit_group tokens %lse_146 loc(#loc673) + %do_148 = ttg.memdesc_index %do[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_149 = tt.splat %do_ptrs_135 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_150 = ttg.async_copy_global_to_local %do_ptrs_137, %do_148 mask %do_ptrs_149 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_151 = ttg.async_commit_group tokens %do_150 loc(#loc843) + %Di_152 = tt.addptr %Di, %offs_m1_138 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_153 = ttg.memdesc_index %Di_117[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_154 = ttg.async_copy_global_to_local %Di_152, %Di_153 mask %do_ptrs_145 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_155 = ttg.async_commit_group tokens %Di_154 loc(#loc675) + %do_ptrs_156:19 = scf.for %do_ptrs_211 = %c0_i32 to %hi_68 step %c1_i32 iter_args(%arg22 = %arg20, %arg23 = %arg19, %qT_ptrs_212 = %qT_ptrs_136, %do_ptrs_213 = %do_ptrs_137, %offs_m1_214 = %offs_m1_138, %arg27 = %c1_i32, %arg28 = %c-1_i32, %arg29 = %c1_i32, %arg30 = %c-1_i32, %qT_215 = %qT_121, %qT_216 = %qT_142, %lse_217 = %lse_126, %lse_218 = %lse_147, %do_219 = %do_130, %do_220 = %do_151, %Di_221 = %Di_134, %Di_222 = %Di_155, %arg39 = %c64_i32, %offs_m1_223 = %offs_m1_60) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>) : i32 { + %do_ptrs_224 = arith.subi %hi_68, %c2_i32 : i32 loc(#loc889) + %do_ptrs_225 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_224 : i32 loc(#loc889) + %do_ptrs_226 = arith.subi %hi_68, %c1_i32 : i32 loc(#loc889) + %do_ptrs_227 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_226 : i32 loc(#loc889) + %do_ptrs_228 = arith.addi %arg30, %c1_i32 : i32 loc(#loc889) + %do_ptrs_229 = arith.cmpi sge, %do_ptrs_228, %c2_i32 : i32 loc(#loc889) + %do_ptrs_230 = arith.select %do_ptrs_229, %c0_i32, %do_ptrs_228 : i32 loc(#loc889) + %do_ptrs_231 = arith.addi %arg28, %c1_i32 : i32 loc(#loc889) + %do_ptrs_232 = arith.cmpi sge, %do_ptrs_231, %c3_i32 : i32 loc(#loc889) + %do_ptrs_233 = arith.select %do_ptrs_232, %c0_i32, %do_ptrs_231 : i32 loc(#loc889) + %qT_234 = ttg.async_wait %qT_215, %lse_217, %do_219, %Di_221 {num = 4 : i32} loc(#loc842) + %qT_235 = ttg.memdesc_index %qT[%do_ptrs_233] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %dk_236 = ttg.memdesc_trans %qT_235 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc679) + %lse_237 = ttg.memdesc_index %lse_116[%do_ptrs_230] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_238 = ttg.local_load %lse_237 token %qT_234 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc673) + %lse_239 = arith.cmpf oeq, %lse_238, %cst_19 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc680) + %lse_240 = arith.select %lse_239, %cst_20, %lse_238 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc681) + %qkT = ttng.warp_group_dot %k_43, %qT_235, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc666) + %qkT_241:3 = ttng.warp_group_dot_wait %qkT, %k_43, %qT_235 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc666) + %qkT_242 = arith.mulf %qkT_241#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc682) + %m = tt.expand_dims %offs_m1_223 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc683) + %tmp44_243 = tt.broadcast %m : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc641) + %tmp44_244 = arith.cmpi sge, %tmp44_243, %tmp44 : tensor<128x64xi32, #mma> loc(#loc641) + %tmp49 = arith.extsi %m : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc684) + %tmp50_245 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64, #mma> loc(#loc647) + %tmp51_246 = tt.broadcast %tmp50_245 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc648) + %tmp51_247 = arith.andi %tmp51, %tmp51_246 : tensor<128x64xi1, #mma> loc(#loc648) + %tmp52 = arith.andi %tmp44_244, %tmp51_247 : tensor<128x64xi1, #mma> loc(#loc685) + %tmp68 = arith.subi %tmp44, %tmp44_243 : tensor<128x64xi32, #mma> loc(#loc686) + %tmp69 = arith.remsi %tmp68, %cst_18 : tensor<128x64xi32, #mma> loc(#loc687) + %tmp70 = arith.cmpi ne, %tmp69, %cst_21 : tensor<128x64xi32, #mma> loc(#loc688) + %tmp71 = arith.cmpi slt, %tmp69, %cst_21 : tensor<128x64xi32, #mma> loc(#loc689) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1, #mma> loc(#loc690) + %tmp74 = arith.addi %tmp69, %cst_18 : tensor<128x64xi32, #mma> loc(#loc691) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc692) + %tmp76 = arith.cmpi eq, %tmp75, %cst_21 : tensor<128x64xi32, #mma> loc(#loc693) + %tmp77_248 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1, #mma> loc(#loc659) + %tmp78 = arith.ori %tmp52, %tmp77_248 : tensor<128x64xi1, #mma> loc(#loc694) + %post_mod_scores = arith.select %tmp78, %qkT_242, %cst_7 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc695) + %post_mod_scores_249 = arith.mulf %post_mod_scores, %cst_8 : tensor<128x64xf32, #mma> loc(#loc696) + %pT = tt.expand_dims %lse_240 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc697) + %pT_250 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc698) + %pT_251 = arith.subf %post_mod_scores_249, %pT_250 : tensor<128x64xf32, #mma> loc(#loc698) + %pT_252 = math.exp2 %pT_251 : tensor<128x64xf32, #mma> loc(#loc699) + %do_253 = ttg.memdesc_index %do[%do_ptrs_233] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %dpT = ttg.memdesc_trans %do_253 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc700) + %dv = arith.truncf %pT_252 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc701) + %dv_254 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc701) + %dv_255 = ttng.warp_group_dot %dv_254, %do_253, %arg23 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc702) + %Di_256 = ttg.memdesc_index %Di_117[%do_ptrs_230] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_257 = ttg.local_load %Di_256 token %qT_234 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc675) + %dpT_258 = ttng.warp_group_dot %v_48, %dpT, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc703) + %dpT_259:3 = ttng.warp_group_dot_wait %dpT_258, %v_48, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc703) + %dsT = tt.expand_dims %Di_257 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc704) + %dsT_260 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc705) + %dsT_261 = arith.subf %dpT_259#0, %dsT_260 : tensor<128x64xf32, #mma> loc(#loc705) + %dsT_262 = arith.mulf %pT_252, %dsT_261 : tensor<128x64xf32, #mma> loc(#loc706) + %dsT_263 = arith.select %tmp78, %dsT_262, %cst_5 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc707) + %dk_264 = arith.truncf %dsT_263 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc708) + %dk_265 = ttg.convert_layout %dk_264 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc708) + %dk_266 = ttng.warp_group_dot %dk_265, %dk_236, %arg22 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc709) + %offs_m1_267 = tt.splat %arg39 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %offs_m1_268 = arith.addi %offs_m1_223, %offs_m1_267 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %do_ptrs_269 = arith.addi %do_ptrs_211, %c1_i32 : i32 loc(#loc889) + %cur_block_idx = arith.divsi %do_ptrs_269, %c2_i32 : i32 loc(#loc844) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc845) + %cur_block_270 = tt.load %cur_block, %do_ptrs_227 evictionPolicy = evict_last : !tt.ptr loc(#loc846) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc847) + %next_block_271 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_53 : i32 loc(#loc848) + %next_block_272 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc849) + %do_ptrs_273 = arith.andi %do_ptrs_227, %next_block_271 : i1 loc(#loc889) + %next_block_274 = tt.load %next_block_272, %do_ptrs_273 evictionPolicy = evict_last : !tt.ptr loc(#loc850) + %needs_jump = arith.addi %do_ptrs_211, %c2_i32 : i32 loc(#loc851) + %needs_jump_275 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc852) + %needs_jump_276 = arith.cmpi eq, %needs_jump_275, %c0_i32 : i32 loc(#loc853) + %jump_to_block = arith.subi %next_block_274, %cur_block_270 : i32 loc(#loc854) + %jump_to_block_277 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc855) + %jump_to_block_278 = arith.subi %jump_to_block_277, %c64_i32 : i32 loc(#loc856) + %offset = arith.extui %needs_jump_276 : i1 to i32 loc(#loc857) + %offset_279 = arith.muli %jump_to_block_278, %offset : i32 loc(#loc857) + %offset_280 = arith.subi %c1_i32, %offset : i32 loc(#loc858) + %offset_281 = arith.muli %offset_280, %c64_i32 : i32 loc(#loc859) + %offset_282 = arith.addi %offset_279, %offset_281 : i32 loc(#loc860) + %qT_ptrs_283 = arith.muli %offset_282, %c4096_i32 : i32 loc(#loc711) + %qT_ptrs_284 = tt.splat %qT_ptrs_283 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc676) + %qT_ptrs_285 = tt.addptr %qT_ptrs_212, %qT_ptrs_284 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc676) + %do_ptrs_286 = arith.muli %offset_282, %c128_i32 : i32 loc(#loc712) + %do_ptrs_287 = tt.splat %do_ptrs_286 : i32 -> tensor<64x128xi32, #blocked> loc(#loc677) + %do_ptrs_288 = tt.addptr %do_ptrs_213, %do_ptrs_287 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc677) + %offs_m1_289 = tt.splat %offset_282 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %offs_m1_290 = arith.addi %offs_m1_214, %offs_m1_289 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc678) + %do_ptrs_291 = arith.addi %arg29, %c1_i32 : i32 loc(#loc889) + %do_ptrs_292 = arith.cmpi sge, %do_ptrs_291, %c2_i32 : i32 loc(#loc889) + %do_ptrs_293 = arith.select %do_ptrs_292, %c0_i32, %do_ptrs_291 : i32 loc(#loc889) + %do_ptrs_294 = arith.addi %arg27, %c1_i32 : i32 loc(#loc889) + %do_ptrs_295 = arith.cmpi sge, %do_ptrs_294, %c3_i32 : i32 loc(#loc889) + %do_ptrs_296 = arith.select %do_ptrs_295, %c0_i32, %do_ptrs_294 : i32 loc(#loc889) + %qT_297 = ttg.memdesc_index %qT[%do_ptrs_296] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %do_ptrs_298 = tt.splat %do_ptrs_225 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc889) + %qT_299 = ttg.async_copy_global_to_local %qT_ptrs_285, %qT_297 mask %do_ptrs_298 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc842) + %qT_300 = ttg.async_commit_group tokens %qT_299 loc(#loc842) + %lse_301 = tt.addptr %lse, %offs_m1_290 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc670) + %lse_302 = ttg.memdesc_index %lse_116[%do_ptrs_293] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %do_ptrs_303 = tt.splat %do_ptrs_225 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + %lse_304 = ttg.async_copy_global_to_local %lse_301, %lse_302 mask %do_ptrs_303 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc673) + %lse_305 = ttg.async_commit_group tokens %lse_304 loc(#loc673) + %do_306 = ttg.memdesc_index %do[%do_ptrs_296] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_ptrs_307 = tt.splat %do_ptrs_225 : i1 -> tensor<64x128xi1, #blocked> loc(#loc889) + %do_308 = ttg.async_copy_global_to_local %do_ptrs_288, %do_306 mask %do_ptrs_307 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc843) + %do_309 = ttg.async_commit_group tokens %do_308 loc(#loc843) + %Di_310 = tt.addptr %Di, %offs_m1_290 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc671) + %Di_311 = ttg.memdesc_index %Di_117[%do_ptrs_293] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_312 = ttg.async_copy_global_to_local %Di_310, %Di_311 mask %do_ptrs_303 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc675) + %Di_313 = ttg.async_commit_group tokens %Di_312 loc(#loc675) + scf.yield %dk_266, %dv_255, %qT_ptrs_285, %do_ptrs_288, %offs_m1_290, %do_ptrs_296, %do_ptrs_233, %do_ptrs_293, %do_ptrs_230, %qT_216, %qT_300, %lse_218, %lse_305, %do_220, %do_309, %Di_222, %Di_313, %offset_282, %offs_m1_268 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc889) + } loc(#loc889) + %do_ptrs_157:2 = ttng.warp_group_dot_wait %do_ptrs_156#1, %do_ptrs_156#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc889) + %do_ptrs_158 = ttg.async_wait {num = 0 : i32} loc(#loc889) + ttg.local_dealloc %Di_117 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc889) + ttg.local_dealloc %do : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc889) + ttg.local_dealloc %lse_116 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc889) + ttg.local_dealloc %qT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc889) + %qT_ptrs_159 = tt.addptr %qT_ptrs_108, %qT_ptrs_84 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc713) + %qT_ptrs_160 = tt.broadcast %qT_ptrs_159 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc714) + %qT_ptrs_161 = tt.addptr %qT_ptrs_160, %qT_ptrs_65 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc714) + %do_ptrs_162 = tt.addptr %do_ptrs_112, %do_ptrs_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc715) + %do_ptrs_163 = tt.broadcast %do_ptrs_162 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc716) + %do_ptrs_164 = tt.addptr %do_ptrs_163, %do_ptrs_67 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc716) + %qT_165 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc861) + %lse_166 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc718) + %do_167 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc862) + %Di_168 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc720) + %do_ptrs_169 = arith.cmpi sgt, %hi_88, %c0_i32 : i32 loc(#loc890) + %qT_170 = ttg.memdesc_index %qT_165[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_171 = tt.splat %do_ptrs_169 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_172 = ttg.async_copy_global_to_local %qT_ptrs_161, %qT_170 mask %do_ptrs_171 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_173 = ttg.async_commit_group tokens %qT_172 loc(#loc861) + %lse_174 = tt.addptr %lse, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_175 = ttg.memdesc_index %lse_166[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_176 = tt.splat %do_ptrs_169 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_177 = ttg.async_copy_global_to_local %lse_174, %lse_175 mask %do_ptrs_176 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_178 = ttg.async_commit_group tokens %lse_177 loc(#loc718) + %do_179 = ttg.memdesc_index %do_167[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_180 = tt.splat %do_ptrs_169 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_181 = ttg.async_copy_global_to_local %do_ptrs_164, %do_179 mask %do_ptrs_180 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_182 = ttg.async_commit_group tokens %do_181 loc(#loc862) + %Di_183 = tt.addptr %Di, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_184 = ttg.memdesc_index %Di_168[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_185 = ttg.async_copy_global_to_local %Di_183, %Di_184 mask %do_ptrs_176 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_186 = ttg.async_commit_group tokens %Di_185 loc(#loc720) + %do_ptrs_187 = arith.cmpi sgt, %hi_88, %c1_i32 : i32 loc(#loc890) + %qT_ptrs_188 = tt.addptr %qT_ptrs_161, %cst_10 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc723) + %do_ptrs_189 = tt.addptr %do_ptrs_164, %cst_11 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc724) + %offs_m1_190 = arith.addi %offs_m1_81, %cst_12 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %qT_191 = ttg.memdesc_index %qT_165[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_192 = tt.splat %do_ptrs_187 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_193 = ttg.async_copy_global_to_local %qT_ptrs_188, %qT_191 mask %do_ptrs_192 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_194 = ttg.async_commit_group tokens %qT_193 loc(#loc861) + %lse_195 = tt.addptr %lse, %offs_m1_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_196 = ttg.memdesc_index %lse_166[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_197 = tt.splat %do_ptrs_187 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_198 = ttg.async_copy_global_to_local %lse_195, %lse_196 mask %do_ptrs_197 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_199 = ttg.async_commit_group tokens %lse_198 loc(#loc718) + %do_200 = ttg.memdesc_index %do_167[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_201 = tt.splat %do_ptrs_187 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_202 = ttg.async_copy_global_to_local %do_ptrs_189, %do_200 mask %do_ptrs_201 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_203 = ttg.async_commit_group tokens %do_202 loc(#loc862) + %Di_204 = tt.addptr %Di, %offs_m1_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_205 = ttg.memdesc_index %Di_168[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_206 = ttg.async_copy_global_to_local %Di_204, %Di_205 mask %do_ptrs_197 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_207 = ttg.async_commit_group tokens %Di_206 loc(#loc720) + %do_ptrs_208:17 = scf.for %do_ptrs_211 = %c0_i32 to %hi_88 step %c1_i32 iter_args(%do_ptrs_212 = %do_ptrs_157#1, %do_ptrs_213 = %do_ptrs_157#0, %qT_ptrs_214 = %qT_ptrs_188, %do_ptrs_215 = %do_ptrs_189, %offs_m1_216 = %offs_m1_190, %arg27 = %c1_i32, %arg28 = %c-1_i32, %arg29 = %c1_i32, %arg30 = %c-1_i32, %qT_217 = %qT_173, %qT_218 = %qT_194, %lse_219 = %lse_178, %lse_220 = %lse_199, %do_221 = %do_182, %do_222 = %do_203, %Di_223 = %Di_186, %Di_224 = %Di_207) -> (tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token) : i32 { + %do_ptrs_225 = arith.subi %hi_88, %c2_i32 : i32 loc(#loc890) + %do_ptrs_226 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_225 : i32 loc(#loc890) + %do_ptrs_227 = arith.subi %hi_88, %c1_i32 : i32 loc(#loc890) + %do_ptrs_228 = arith.cmpi slt, %do_ptrs_211, %do_ptrs_227 : i32 loc(#loc890) + %do_ptrs_229 = arith.addi %arg30, %c1_i32 : i32 loc(#loc890) + %do_ptrs_230 = arith.cmpi sge, %do_ptrs_229, %c2_i32 : i32 loc(#loc890) + %do_ptrs_231 = arith.select %do_ptrs_230, %c0_i32, %do_ptrs_229 : i32 loc(#loc890) + %do_ptrs_232 = arith.addi %arg28, %c1_i32 : i32 loc(#loc890) + %do_ptrs_233 = arith.cmpi sge, %do_ptrs_232, %c3_i32 : i32 loc(#loc890) + %do_ptrs_234 = arith.select %do_ptrs_233, %c0_i32, %do_ptrs_232 : i32 loc(#loc890) + %qT_235 = ttg.async_wait %qT_217, %lse_219, %do_221, %Di_223 {num = 4 : i32} loc(#loc861) + %qT_236 = ttg.memdesc_index %qT_165[%do_ptrs_234] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %dk_237 = ttg.memdesc_trans %qT_236 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc726) + %lse_238 = ttg.memdesc_index %lse_166[%do_ptrs_231] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_239 = ttg.local_load %lse_238 token %qT_235 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc718) + %lse_240 = arith.cmpf oeq, %lse_239, %cst_19 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc727) + %lse_241 = arith.select %lse_240, %cst_20, %lse_239 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc728) + %qkT = ttng.warp_group_dot %k_43, %qT_236, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc729) + %qkT_242:3 = ttng.warp_group_dot_wait %qkT, %k_43, %qT_236 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc729) + %qkT_243 = arith.mulf %qkT_242#0, %cst_6 : tensor<128x64xf32, #mma> loc(#loc730) + %post_mod_scores = arith.mulf %qkT_243, %cst_8 : tensor<128x64xf32, #mma> loc(#loc731) + %pT = tt.expand_dims %lse_241 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc732) + %pT_244 = tt.broadcast %pT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc733) + %pT_245 = arith.subf %post_mod_scores, %pT_244 : tensor<128x64xf32, #mma> loc(#loc733) + %pT_246 = math.exp2 %pT_245 : tensor<128x64xf32, #mma> loc(#loc734) + %do_247 = ttg.memdesc_index %do_167[%do_ptrs_234] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %dpT = ttg.memdesc_trans %do_247 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc735) + %dv = arith.truncf %pT_246 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc736) + %dv_248 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc736) + %dv_249 = ttng.warp_group_dot %dv_248, %do_247, %do_ptrs_213 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc737) + %Di_250 = ttg.memdesc_index %Di_168[%do_ptrs_231] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_251 = ttg.local_load %Di_250 token %qT_235 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc720) + %dpT_252 = ttng.warp_group_dot %v_48, %dpT, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc738) + %dpT_253:3 = ttng.warp_group_dot_wait %dpT_252, %v_48, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc738) + %dsT = tt.expand_dims %Di_251 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xf32, #mma> loc(#loc739) + %dsT_254 = tt.broadcast %dsT : tensor<1x64xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc740) + %dsT_255 = arith.subf %dpT_253#0, %dsT_254 : tensor<128x64xf32, #mma> loc(#loc740) + %dsT_256 = arith.mulf %pT_246, %dsT_255 : tensor<128x64xf32, #mma> loc(#loc741) + %dk_257 = arith.truncf %dsT_256 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc742) + %dk_258 = ttg.convert_layout %dk_257 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc742) + %dk_259 = ttng.warp_group_dot %dk_258, %dk_237, %do_ptrs_212 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc743) + %do_ptrs_260 = arith.addi %do_ptrs_211, %c1_i32 : i32 loc(#loc890) + %cur_block_idx = arith.divsi %do_ptrs_260, %c2_i32 : i32 loc(#loc863) + %cur_block = tt.addptr %q_indices_72, %cur_block_idx : !tt.ptr, i32 loc(#loc864) + %cur_block_261 = tt.load %cur_block, %do_ptrs_228 evictionPolicy = evict_last : !tt.ptr loc(#loc865) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc866) + %next_block_262 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_76 : i32 loc(#loc867) + %next_block_263 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc868) + %do_ptrs_264 = arith.andi %do_ptrs_228, %next_block_262 : i1 loc(#loc890) + %next_block_265 = tt.load %next_block_263, %do_ptrs_264 evictionPolicy = evict_last : !tt.ptr loc(#loc869) + %needs_jump = arith.addi %do_ptrs_211, %c2_i32 : i32 loc(#loc870) + %needs_jump_266 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc871) + %needs_jump_267 = arith.cmpi eq, %needs_jump_266, %c0_i32 : i32 loc(#loc872) + %jump_to_block = arith.subi %next_block_265, %cur_block_261 : i32 loc(#loc873) + %jump_to_block_268 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc874) + %jump_to_block_269 = arith.subi %jump_to_block_268, %c64_i32 : i32 loc(#loc875) + %offset = arith.extui %needs_jump_267 : i1 to i32 loc(#loc876) + %offset_270 = arith.muli %jump_to_block_269, %offset : i32 loc(#loc876) + %offset_271 = arith.subi %c1_i32, %offset : i32 loc(#loc877) + %offset_272 = arith.muli %offset_271, %c64_i32 : i32 loc(#loc878) + %offset_273 = arith.addi %offset_270, %offset_272 : i32 loc(#loc879) + %qT_ptrs_274 = arith.muli %offset_273, %c4096_i32 : i32 loc(#loc745) + %qT_ptrs_275 = tt.splat %qT_ptrs_274 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc723) + %qT_ptrs_276 = tt.addptr %qT_ptrs_214, %qT_ptrs_275 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc723) + %do_ptrs_277 = arith.muli %offset_273, %c128_i32 : i32 loc(#loc746) + %do_ptrs_278 = tt.splat %do_ptrs_277 : i32 -> tensor<64x128xi32, #blocked> loc(#loc724) + %do_ptrs_279 = tt.addptr %do_ptrs_215, %do_ptrs_278 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc724) + %offs_m1_280 = tt.splat %offset_273 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %offs_m1_281 = arith.addi %offs_m1_216, %offs_m1_280 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc725) + %do_ptrs_282 = arith.addi %arg29, %c1_i32 : i32 loc(#loc890) + %do_ptrs_283 = arith.cmpi sge, %do_ptrs_282, %c2_i32 : i32 loc(#loc890) + %do_ptrs_284 = arith.select %do_ptrs_283, %c0_i32, %do_ptrs_282 : i32 loc(#loc890) + %do_ptrs_285 = arith.addi %arg27, %c1_i32 : i32 loc(#loc890) + %do_ptrs_286 = arith.cmpi sge, %do_ptrs_285, %c3_i32 : i32 loc(#loc890) + %do_ptrs_287 = arith.select %do_ptrs_286, %c0_i32, %do_ptrs_285 : i32 loc(#loc890) + %qT_288 = ttg.memdesc_index %qT_165[%do_ptrs_287] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %do_ptrs_289 = tt.splat %do_ptrs_226 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc890) + %qT_290 = ttg.async_copy_global_to_local %qT_ptrs_276, %qT_288 mask %do_ptrs_289 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc861) + %qT_291 = ttg.async_commit_group tokens %qT_290 loc(#loc861) + %lse_292 = tt.addptr %lse, %offs_m1_281 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc721) + %lse_293 = ttg.memdesc_index %lse_166[%do_ptrs_284] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %do_ptrs_294 = tt.splat %do_ptrs_226 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc890) + %lse_295 = ttg.async_copy_global_to_local %lse_292, %lse_293 mask %do_ptrs_294 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc718) + %lse_296 = ttg.async_commit_group tokens %lse_295 loc(#loc718) + %do_297 = ttg.memdesc_index %do_167[%do_ptrs_287] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_ptrs_298 = tt.splat %do_ptrs_226 : i1 -> tensor<64x128xi1, #blocked> loc(#loc890) + %do_299 = ttg.async_copy_global_to_local %do_ptrs_279, %do_297 mask %do_ptrs_298 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc862) + %do_300 = ttg.async_commit_group tokens %do_299 loc(#loc862) + %Di_301 = tt.addptr %Di, %offs_m1_281 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc722) + %Di_302 = ttg.memdesc_index %Di_168[%do_ptrs_284] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_303 = ttg.async_copy_global_to_local %Di_301, %Di_302 mask %do_ptrs_294 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc720) + %Di_304 = ttg.async_commit_group tokens %Di_303 loc(#loc720) + scf.yield %dk_259, %dv_249, %qT_ptrs_276, %do_ptrs_279, %offs_m1_281, %do_ptrs_287, %do_ptrs_234, %do_ptrs_284, %do_ptrs_231, %qT_218, %qT_291, %lse_220, %lse_296, %do_222, %do_300, %Di_224, %Di_304 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64x128x!tt.ptr, #blocked>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token loc(#loc890) + } loc(#loc890) + %do_ptrs_209:2 = ttng.warp_group_dot_wait %do_ptrs_208#1, %do_ptrs_208#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc890) + %do_ptrs_210 = ttg.async_wait {num = 0 : i32} loc(#loc890) + ttg.local_dealloc %Di_168 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc890) + ttg.local_dealloc %do_167 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc890) + ttg.local_dealloc %lse_166 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc890) + ttg.local_dealloc %qT_165 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc890) + scf.yield %do_ptrs_209#0, %do_ptrs_209#1 : tensor<128x128xf32, #mma1>, tensor<128x128xf32, #mma1> loc(#loc277) + } loc(#loc667) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc577) + %dv_ptrs_89 = tt.addptr %dv_ptrs, %ptr_35 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc577) + %dv_ptrs_90 = tt.broadcast %dv_ptrs_89 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc578) + %dv_ptrs_91 = tt.addptr %dv_ptrs_90, %ptr_41 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc578) + %1 = arith.truncf %dk#0 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc280) + %2 = ttg.convert_layout %1 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc280) + tt.store %dv_ptrs_91, %2 : tensor<128x128x!tt.ptr, #blocked> loc(#loc280) + %dk_92 = arith.mulf %dk#1, %cst_3 : tensor<128x128xf32, #mma1> loc(#loc579) + %mask = arith.cmpi slt, %ptr, %cst_13 : tensor<128x1xi32, #blocked> loc(#loc580) + %xindex = tt.broadcast %ptr_35 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc581) + %xindex_93 = arith.addi %ptr_41, %xindex : tensor<128x128xi32, #blocked> loc(#loc581) + %xindex_94 = tt.splat %k_adj : i32 -> tensor<128x128xi32, #blocked> loc(#loc582) + %xindex_95 = arith.addi %xindex_93, %xindex_94 : tensor<128x128xi32, #blocked> loc(#loc582) + %xindex_96 = tt.splat %dv_adj : i32 -> tensor<128x128xi32, #blocked> loc(#loc583) + %xindex_97 = arith.addi %xindex_95, %xindex_96 : tensor<128x128xi32, #blocked> loc(#loc583) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc286) + %4 = tt.addptr %3, %xindex_97 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc286) + %5 = tt.broadcast %mask : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc287) + %6 = arith.truncf %dk_92 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc287) + %7 = ttg.convert_layout %6 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc287) + tt.store %4, %7, %5 : tensor<128x128x!tt.ptr, #blocked> loc(#loc287) + } loc(#loc18) + tt.return loc(#loc288) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":111:24) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":115:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":116:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":117:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:25) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:59) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:50) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":131:9) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":132:9) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":133:10) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":136:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:14) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:7) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":140:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:44) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":145:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:55) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:78) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:83) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:68) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:30) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:63) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:32) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:56) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":163:17) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":164:19) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":167:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":168:21) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":169:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":174:36) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":175:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:27) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":178:107) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:20) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:56) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":835:23) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":179:111) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:34) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:33) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:26) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:30) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":191:18) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":195:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:41) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:53) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:26) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":207:12) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:18) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:56) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:18) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:43) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:63) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":482:23) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":405:12) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:34) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":397:28) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:23) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":486:22) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":487:23) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":488:23) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":489:23) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:39) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:22) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:19) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":458:105) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":528:104) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:19) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":415:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":459:19) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:30) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":461:14) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":464:36) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":483:23) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":490:23) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":493:24) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":494:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":496:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":497:92) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":500:24) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":501:24) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":502:39) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":503:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":504:24) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":505:24) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":506:23) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":507:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":508:25) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":509:92) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":511:24) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":512:24) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":513:39) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":514:25) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":515:24) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":516:24) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":521:69) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":524:27) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:21) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":530:20) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:14) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":549:43) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":551:15) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:21) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":417:19) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":788:33) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":411:64) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:38) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:24) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:109) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:113) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:55) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:25) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:30) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:35) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:60) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:34) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:48) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:63) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:29) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:47) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:61) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:42) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:28) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":214:39) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:31) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:45) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:62) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:43) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":218:33) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":226:16) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:24) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:56) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":232:14) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":234:30) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":252:25) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":253:29) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":256:107) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":257:107) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:32) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:56) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:34) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:58) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:80) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:53) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:81) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:70) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":286:32) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:30) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:43) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:55) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:42) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:45) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:32) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:26) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":298:16) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:37) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:56) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:49) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:27) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:38) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:51) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:42) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:61) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":698:25) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":618:12) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":699:25) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:35) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":610:28) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:24) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":702:24) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":704:24) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":705:24) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":709:25) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":710:25) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":712:25) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":713:92) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":716:24) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":717:24) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":718:39) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":719:25) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":720:24) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":721:24) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":731:24) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":306:41) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:34) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:47) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:64) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:46) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":310:36) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":318:20) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":676:20) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":262:30) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:51) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:34) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:44) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:67) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:36) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:46) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:70) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:39) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:50) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:60) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":271:21) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":272:23) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":275:25) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":276:29) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:18) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:19) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:28) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:29) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":669:105) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:22) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":741:99) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:21) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:19) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:19) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":628:19) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:52) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:26) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:46) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":678:15) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":680:36) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":703:25) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":706:24) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":722:24) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":723:25) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":724:25) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":725:92) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":727:24) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":728:24) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":729:39) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":730:25) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":732:24) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":736:69) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":739:27) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:44) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:40) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:22) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:29) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:24) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:43) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:20) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:25) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:22) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:16) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":773:45) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:24) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:43) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":623:62) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:28) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:28) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":303:12) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:23) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:55) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":330:30) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":334:14) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":337:29) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:27) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:41) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:58) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:29) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:69) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:4) +#loc307 = loc("pid"(#loc2)) +#loc308 = loc("off_zq"(#loc3)) +#loc309 = loc("off_hkv"(#loc4)) +#loc310 = loc("off_zkv"(#loc5)) +#loc311 = loc("k_adj"(#loc6)) +#loc312 = loc("k_adj"(#loc7)) +#loc313 = loc("k_adj"(#loc8)) +#loc314 = loc("k_adj"(#loc9)) +#loc315 = loc("dv_adj"(#loc10)) +#loc316 = loc("dv_adj"(#loc11)) +#loc317 = loc("dv_adj"(#loc12)) +#loc318 = loc("K"(#loc13)) +#loc319 = loc("V"(#loc14)) +#loc320 = loc("DV"(#loc15)) +#loc321 = loc("offs_k"(#loc16)) +#loc322 = loc("off_pid"(#loc19)) +#loc323 = loc("off_hq2"(#loc20)) +#loc324 = loc("off_hq2"(#loc21)) +#loc325 = loc("off_hq2"(#loc22)) +#loc326 = loc("start_m2_block"(#loc23)) +#loc327 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc328 = loc("sparse_kv_num_blks_offset"(#loc25)) +#loc329 = loc("sparse_kv_idx_offset"(#loc26)) +#loc330 = loc("sparse_kv_idx_offset"(#loc27)) +#loc331 = loc("sparse_kv_idx_offset"(#loc28)) +#loc332 = loc("q_adj2"(#loc29)) +#loc333 = loc("q_adj2"(#loc30)) +#loc334 = loc("q_adj2"(#loc31)) +#loc335 = loc("q_adj2"(#loc32)) +#loc336 = loc("do_adj2"(#loc33)) +#loc337 = loc("do_adj2"(#loc34)) +#loc338 = loc("do_adj2"(#loc35)) +#loc339 = loc("off_chz2"(#loc36)) +#loc340 = loc("off_chz2"(#loc37)) +#loc341 = loc("off_chz2"(#loc38)) +#loc342 = loc("off_chz2"(#loc39)) +#loc343 = loc("Q2"(#loc40)) +#loc344 = loc("DO2"(#loc41)) +#loc345 = loc("DQ2"(#loc42)) +#loc346 = loc("LSE2"(#loc43)) +#loc347 = loc("DELTA2"(#loc44)) +#loc348 = loc("start_m2"(#loc45)) +#loc349 = loc("offs_m2"(#loc46)) +#loc350 = loc("ptr"(#loc47)) +#loc351 = loc("q"(#loc48)) +#loc352 = loc("ptr"(#loc49)) +#loc353 = loc("ptr"(#loc50)) +#loc354 = loc("ptr"(#loc51)) +#loc355 = loc("ptr"(#loc52)) +#loc356 = loc("do"(#loc54)) +#loc357 = loc("Di"(#loc55)) +#loc358 = loc("Di"(#loc56)) +#loc359 = loc("lse"(#loc57)) +#loc360 = loc("lse"(#loc58)) +#loc361 = loc("lse"(#loc59)) +#loc362 = loc("lse"(#loc60)) +#loc363 = loc("lse"(#loc61)) +#loc364 = loc("kv_indices"(#loc62)) +#loc365 = loc("kv_start"(#loc63)) +#loc366 = loc("kv_start"(#loc64)) +#loc367 = loc("sparse_kv_num_blocks"(#loc65)) +#loc368 = loc("sparse_kv_num_blocks"(#loc66)) +#loc369 = loc("offs_n2"(#loc67)) +#loc370 = loc("offs_n2"(#loc68)) +#loc371 = loc("kT_ptrs"(#loc69)) +#loc372 = loc("dq"(#loc70)) +#loc373 = loc("kT_ptrs"(#loc71)) +#loc374 = loc("kT_ptrs"(#loc72)) +#loc375 = loc("kT_ptrs"(#loc73)) +#loc376 = loc("kT_ptrs"(#loc74)) +#loc377 = loc("vT_ptrs"(#loc75)) +#loc378 = loc("vT_ptrs"(#loc76)) +#loc379 = loc("hi"(#loc77)) +#loc380 = loc("hi"(#loc78)) +#loc381 = loc("tmp4"(#loc79)) +#loc382 = loc("dq"(#loc80)) +#loc383 = loc("tmp7"(#loc81)) +#loc384 = loc("dq"(#loc82)) +#loc385 = loc("tmp7"(#loc83)) +#loc386 = loc("tmp8"(#loc84)) +#loc387 = loc("tmp9"(#loc85)) +#loc388 = loc("tmp10"(#loc86)) +#loc389 = loc("tmp11"(#loc87)) +#loc390 = loc("p"(#loc88)) +#loc391 = loc("ds"(#loc89)) +#loc392 = loc("ds"(#loc90)) +#loc393 = loc("kT"(#loc91)) +#loc394 = loc("vT"(#loc92)) +#loc395 = loc("kT_ptrs"(#loc93)) +#loc396 = loc("vT_ptrs"(#loc94)) +#loc397 = loc("qk"(#loc95)) +#loc398 = loc("dq"(#loc96)) +#loc399 = loc("qk"(#loc97)) +#loc400 = loc("n"(#loc98)) +#loc401 = loc("tmp5"(#loc99)) +#loc402 = loc("tmp12"(#loc100)) +#loc403 = loc("tmp15"(#loc101)) +#loc404 = loc("tmp16"(#loc102)) +#loc405 = loc("tmp18"(#loc103)) +#loc406 = loc("tmp19"(#loc104)) +#loc407 = loc("tmp22"(#loc105)) +#loc408 = loc("tmp23"(#loc106)) +#loc409 = loc("tmp24"(#loc107)) +#loc410 = loc("tmp25"(#loc108)) +#loc411 = loc("tmp26"(#loc109)) +#loc412 = loc("tmp27"(#loc110)) +#loc413 = loc("tmp28"(#loc111)) +#loc414 = loc("tmp29"(#loc112)) +#loc415 = loc("tmp30"(#loc113)) +#loc416 = loc("tmp31"(#loc114)) +#loc417 = loc("tmp33"(#loc115)) +#loc418 = loc("tmp34"(#loc116)) +#loc419 = loc("tmp35"(#loc117)) +#loc420 = loc("tmp36"(#loc118)) +#loc421 = loc("tmp37"(#loc119)) +#loc422 = loc("tmp38"(#loc120)) +#loc423 = loc("post_mod_scores"(#loc121)) +#loc424 = loc("post_mod_scores"(#loc122)) +#loc425 = loc("p"(#loc123)) +#loc426 = loc("dp"(#loc124)) +#loc427 = loc("ds"(#loc125)) +#loc428 = loc("ds"(#loc126)) +#loc429 = loc("ds"(#loc127)) +#loc430 = loc("dq"(#loc128)) +#loc431 = loc("offs_n2"(#loc129)) +#loc432 = loc("cur_block_idx"(#loc130)) +#loc433 = loc("offset"(#loc131)) +#loc434 = loc("cur_block"(#loc132)) +#loc435 = loc("cur_block"(#loc133)) +#loc436 = loc("next_block"(#loc134)) +#loc437 = loc("next_block"(#loc135)) +#loc438 = loc("next_block"(#loc136)) +#loc439 = loc("next_block"(#loc137)) +#loc440 = loc("needs_jump"(#loc138)) +#loc441 = loc("needs_jump"(#loc139)) +#loc442 = loc("needs_jump"(#loc140)) +#loc443 = loc("jump_to_block"(#loc141)) +#loc444 = loc("jump_to_block"(#loc142)) +#loc445 = loc("jump_to_block"(#loc143)) +#loc446 = loc("offset"(#loc144)) +#loc447 = loc("offset"(#loc145)) +#loc448 = loc("offset"(#loc146)) +#loc449 = loc("offset"(#loc147)) +#loc450 = loc("kT_ptrs"(#loc148)) +#loc451 = loc("kv_indices"(#loc149)) +#loc452 = loc("kv_start"(#loc150)) +#loc453 = loc("kv_start"(#loc151)) +#loc454 = loc("sparse_kv_num_blocks"(#loc152)) +#loc455 = loc("sparse_kv_num_blocks"(#loc153)) +#loc456 = loc("offs_n2"(#loc154)) +#loc457 = loc("dq"(#loc155)) +#loc458 = loc("dq_ptrs"(#loc156)) +#loc459 = loc("dq_ptrs"(#loc157)) +#loc460 = loc("dq"(#loc158)) +#loc461 = loc("start_n1"(#loc160)) +#loc462 = loc("offs_n1"(#loc161)) +#loc463 = loc("k"(#loc162)) +#loc464 = loc("v"(#loc163)) +#loc465 = loc("off_hq1"(#loc164)) +#loc466 = loc("q_adj1"(#loc165)) +#loc467 = loc("off_chz1"(#loc166)) +#loc468 = loc("sparse_q_num_blks_offset"(#loc167)) +#loc469 = loc("sparse_q_num_blks_offset"(#loc168)) +#loc470 = loc("sparse_q_idx_offset"(#loc169)) +#loc471 = loc("sparse_q_idx_offset"(#loc170)) +#loc472 = loc("sparse_q_idx_offset"(#loc171)) +#loc473 = loc("q_indices"(#loc172)) +#loc474 = loc("q_start"(#loc173)) +#loc475 = loc("q_start"(#loc174)) +#loc476 = loc("sparse_q_num_blocks"(#loc175)) +#loc477 = loc("sparse_q_num_blocks"(#loc176)) +#loc478 = loc("offs_m1"(#loc177)) +#loc479 = loc("offs_m1"(#loc178)) +#loc480 = loc("qT_ptrs"(#loc179)) +#loc481 = loc("qT_ptrs"(#loc181)) +#loc482 = loc("qT_ptrs"(#loc182)) +#loc483 = loc("qT_ptrs"(#loc183)) +#loc484 = loc("do_ptrs"(#loc184)) +#loc485 = loc("do_ptrs"(#loc185)) +#loc486 = loc("do_ptrs"(#loc186)) +#loc487 = loc("hi"(#loc187)) +#loc488 = loc("hi"(#loc188)) +#loc489 = loc("tmp44"(#loc189)) +#loc490 = loc(callsite(#loc190 at #loc180)) +#loc491 = loc("tmp45"(#loc191)) +#loc492 = loc("tmp47"(#loc192)) +#loc493 = loc("dk"(#loc193)) +#loc494 = loc("tmp47"(#loc194)) +#loc495 = loc("tmp48"(#loc195)) +#loc496 = loc("tmp50"(#loc196)) +#loc497 = loc("tmp51"(#loc197)) +#loc498 = loc("tmp55"(#loc198)) +#loc499 = loc("tmp56"(#loc199)) +#loc500 = loc("tmp58"(#loc200)) +#loc501 = loc("tmp59"(#loc201)) +#loc502 = loc("tmp62"(#loc202)) +#loc503 = loc("tmp63"(#loc203)) +#loc504 = loc("tmp64"(#loc204)) +#loc505 = loc("tmp65"(#loc205)) +#loc506 = loc("tmp66"(#loc206)) +#loc507 = loc("tmp67"(#loc207)) +#loc508 = loc("tmp77"(#loc208)) +#loc509 = loc("q_indices"(#loc209)) +#loc510 = loc("q_start"(#loc210)) +#loc511 = loc("q_start"(#loc211)) +#loc512 = loc("sparse_q_num_blocks"(#loc212)) +#loc513 = loc("sparse_q_num_blocks"(#loc213)) +#loc514 = loc("offs_m1"(#loc214)) +#loc515 = loc("qkT"(#loc216)) +#loc516 = loc("dv"(#loc217)) +#loc517 = loc("off_hq1"(#loc218)) +#loc518 = loc("q_adj1"(#loc219)) +#loc519 = loc("q_adj1"(#loc220)) +#loc520 = loc("q_adj1"(#loc221)) +#loc521 = loc("do_adj1"(#loc222)) +#loc522 = loc("do_adj1"(#loc223)) +#loc523 = loc("do_adj1"(#loc224)) +#loc524 = loc("off_chz1"(#loc225)) +#loc525 = loc("off_chz1"(#loc226)) +#loc526 = loc("off_chz1"(#loc227)) +#loc527 = loc("Q1"(#loc228)) +#loc528 = loc("DO1"(#loc229)) +#loc529 = loc("LSE1"(#loc230)) +#loc530 = loc("DELTA1"(#loc231)) +#loc531 = loc("qT_ptrs"(#loc232)) +#loc532 = loc("do_ptrs"(#loc233)) +#loc533 = loc("lse"(#loc234)) +#loc534 = loc("Di"(#loc235)) +#loc535 = loc("qT"(#loc236)) +#loc536 = loc("lse"(#loc237)) +#loc537 = loc("do"(#loc238)) +#loc538 = loc("Di"(#loc239)) +#loc539 = loc("qT_ptrs"(#loc240)) +#loc540 = loc("do_ptrs"(#loc241)) +#loc541 = loc("offs_m1"(#loc242)) +#loc542 = loc("dk"(#loc243)) +#loc543 = loc("lse"(#loc244)) +#loc544 = loc("lse"(#loc245)) +#loc545 = loc("qkT"(#loc246)) +#loc546 = loc("m"(#loc247)) +#loc547 = loc("tmp49"(#loc248)) +#loc548 = loc("tmp52"(#loc249)) +#loc549 = loc("tmp68"(#loc250)) +#loc550 = loc("tmp69"(#loc251)) +#loc551 = loc("tmp70"(#loc252)) +#loc552 = loc("tmp71"(#loc253)) +#loc553 = loc("tmp73"(#loc254)) +#loc554 = loc("tmp74"(#loc255)) +#loc555 = loc("tmp75"(#loc256)) +#loc556 = loc("tmp76"(#loc257)) +#loc557 = loc("tmp78"(#loc258)) +#loc558 = loc("post_mod_scores"(#loc259)) +#loc559 = loc("post_mod_scores"(#loc260)) +#loc560 = loc("pT"(#loc261)) +#loc561 = loc("pT"(#loc262)) +#loc562 = loc("pT"(#loc263)) +#loc563 = loc("dpT"(#loc264)) +#loc564 = loc("dv"(#loc265)) +#loc565 = loc("dv"(#loc266)) +#loc566 = loc("dpT"(#loc267)) +#loc567 = loc("dsT"(#loc268)) +#loc568 = loc("dsT"(#loc269)) +#loc569 = loc("dsT"(#loc270)) +#loc570 = loc("dsT"(#loc271)) +#loc571 = loc("dk"(#loc272)) +#loc572 = loc("dk"(#loc273)) +#loc573 = loc("offset"(#loc274)) +#loc574 = loc("qT_ptrs"(#loc275)) +#loc575 = loc("do_ptrs"(#loc276)) +#loc576 = loc(callsite(#loc190 at #loc215)) +#loc577 = loc("dv_ptrs"(#loc278)) +#loc578 = loc("dv_ptrs"(#loc279)) +#loc579 = loc("dk"(#loc281)) +#loc580 = loc("mask"(#loc282)) +#loc581 = loc("xindex"(#loc283)) +#loc582 = loc("xindex"(#loc284)) +#loc583 = loc("xindex"(#loc285)) +#loc584 = loc(callsite(#loc350 at #loc351)) +#loc585 = loc(callsite(#loc352 at #loc351)) +#loc586 = loc(callsite(#loc353 at #loc351)) +#loc587 = loc(callsite(#loc354 at #loc351)) +#loc588 = loc(callsite(#loc355 at #loc351)) +#loc589 = loc(callsite(#loc53 at #loc351)) +#loc590 = loc(callsite(#loc352 at #loc356)) +#loc591 = loc(callsite(#loc353 at #loc356)) +#loc592 = loc(callsite(#loc355 at #loc356)) +#loc593 = loc(callsite(#loc53 at #loc356)) +#loc594 = loc(callsite(#loc371 at #loc372)) +#loc595 = loc(callsite(#loc373 at #loc372)) +#loc596 = loc(callsite(#loc374 at #loc372)) +#loc597 = loc(callsite(#loc375 at #loc372)) +#loc598 = loc(callsite(#loc376 at #loc372)) +#loc599 = loc(callsite(#loc377 at #loc372)) +#loc600 = loc(callsite(#loc378 at #loc372)) +#loc601 = loc(callsite(#loc379 at #loc372)) +#loc602 = loc(callsite(#loc380 at #loc372)) +#loc603 = loc(callsite(#loc382 at #loc372)) +#loc604 = loc("offs_n2"(#loc384)) +#loc605 = loc(callsite(#loc395 at #loc372)) +#loc606 = loc(callsite(#loc396 at #loc372)) +#loc607 = loc(callsite(#loc431 at #loc372)) +#loc608 = loc(callsite(#loc433 at #loc372)) +#loc609 = loc(callsite(#loc450 at #loc372)) +#loc610 = loc(callsite(#loc371 at #loc457)) +#loc611 = loc(callsite(#loc373 at #loc457)) +#loc612 = loc(callsite(#loc374 at #loc457)) +#loc613 = loc(callsite(#loc376 at #loc457)) +#loc614 = loc(callsite(#loc377 at #loc457)) +#loc615 = loc(callsite(#loc378 at #loc457)) +#loc616 = loc(callsite(#loc379 at #loc457)) +#loc617 = loc(callsite(#loc380 at #loc457)) +#loc618 = loc(callsite(#loc382 at #loc457)) +#loc619 = loc(callsite(#loc395 at #loc457)) +#loc620 = loc(callsite(#loc396 at #loc457)) +#loc621 = loc(callsite(#loc433 at #loc457)) +#loc622 = loc(callsite(#loc450 at #loc457)) +#loc623 = loc(callsite(#loc350 at #loc463)) +#loc624 = loc(callsite(#loc352 at #loc463)) +#loc625 = loc(callsite(#loc353 at #loc463)) +#loc626 = loc(callsite(#loc354 at #loc463)) +#loc627 = loc(callsite(#loc355 at #loc463)) +#loc628 = loc(callsite(#loc53 at #loc463)) +#loc629 = loc(callsite(#loc353 at #loc464)) +#loc630 = loc(callsite(#loc355 at #loc464)) +#loc631 = loc(callsite(#loc53 at #loc464)) +#loc632 = loc(callsite(#loc480 at #loc180)) +#loc633 = loc(callsite(#loc481 at #loc180)) +#loc634 = loc(callsite(#loc482 at #loc180)) +#loc635 = loc(callsite(#loc483 at #loc180)) +#loc636 = loc(callsite(#loc484 at #loc180)) +#loc637 = loc(callsite(#loc485 at #loc180)) +#loc638 = loc(callsite(#loc486 at #loc180)) +#loc639 = loc(callsite(#loc487 at #loc180)) +#loc640 = loc(callsite(#loc488 at #loc180)) +#loc641 = loc(callsite(#loc489 at #loc490)) +#loc642 = loc(callsite(#loc491 at #loc490)) +#loc643 = loc(callsite(#loc492 at #loc490)) +#loc644 = loc("dv"(#loc493)) +#loc645 = loc(callsite(#loc494 at #loc490)) +#loc646 = loc(callsite(#loc495 at #loc490)) +#loc647 = loc(callsite(#loc496 at #loc490)) +#loc648 = loc(callsite(#loc497 at #loc490)) +#loc649 = loc(callsite(#loc498 at #loc490)) +#loc650 = loc(callsite(#loc499 at #loc490)) +#loc651 = loc(callsite(#loc500 at #loc490)) +#loc652 = loc(callsite(#loc501 at #loc490)) +#loc653 = loc(callsite(#loc502 at #loc490)) +#loc654 = loc(callsite(#loc503 at #loc490)) +#loc655 = loc(callsite(#loc504 at #loc490)) +#loc656 = loc(callsite(#loc505 at #loc490)) +#loc657 = loc(callsite(#loc506 at #loc490)) +#loc658 = loc(callsite(#loc507 at #loc490)) +#loc659 = loc(callsite(#loc508 at #loc490)) +#loc660 = loc(callsite(#loc480 at #loc215)) +#loc661 = loc(callsite(#loc481 at #loc215)) +#loc662 = loc(callsite(#loc484 at #loc215)) +#loc663 = loc(callsite(#loc485 at #loc215)) +#loc664 = loc(callsite(#loc487 at #loc215)) +#loc665 = loc(callsite(#loc488 at #loc215)) +#loc666 = loc(callsite(#loc515 at #loc490)) +#loc667 = loc("dk"(#loc516)) +#loc668 = loc(callsite(#loc531 at #loc180)) +#loc669 = loc(callsite(#loc532 at #loc180)) +#loc670 = loc(callsite(#loc533 at #loc490)) +#loc671 = loc(callsite(#loc534 at #loc490)) +#loc672 = loc(callsite(#loc535 at #loc490)) +#loc673 = loc(callsite(#loc536 at #loc490)) +#loc674 = loc(callsite(#loc537 at #loc490)) +#loc675 = loc(callsite(#loc538 at #loc490)) +#loc676 = loc(callsite(#loc539 at #loc180)) +#loc677 = loc(callsite(#loc540 at #loc180)) +#loc678 = loc(callsite(#loc541 at #loc180)) +#loc679 = loc(callsite(#loc542 at #loc490)) +#loc680 = loc(callsite(#loc543 at #loc490)) +#loc681 = loc(callsite(#loc544 at #loc490)) +#loc682 = loc(callsite(#loc545 at #loc490)) +#loc683 = loc(callsite(#loc546 at #loc490)) +#loc684 = loc(callsite(#loc547 at #loc490)) +#loc685 = loc(callsite(#loc548 at #loc490)) +#loc686 = loc(callsite(#loc549 at #loc490)) +#loc687 = loc(callsite(#loc550 at #loc490)) +#loc688 = loc(callsite(#loc551 at #loc490)) +#loc689 = loc(callsite(#loc552 at #loc490)) +#loc690 = loc(callsite(#loc553 at #loc490)) +#loc691 = loc(callsite(#loc554 at #loc490)) +#loc692 = loc(callsite(#loc555 at #loc490)) +#loc693 = loc(callsite(#loc556 at #loc490)) +#loc694 = loc(callsite(#loc557 at #loc490)) +#loc695 = loc(callsite(#loc558 at #loc490)) +#loc696 = loc(callsite(#loc559 at #loc490)) +#loc697 = loc(callsite(#loc560 at #loc490)) +#loc698 = loc(callsite(#loc561 at #loc490)) +#loc699 = loc(callsite(#loc562 at #loc490)) +#loc700 = loc(callsite(#loc563 at #loc490)) +#loc701 = loc(callsite(#loc564 at #loc490)) +#loc702 = loc(callsite(#loc565 at #loc490)) +#loc703 = loc(callsite(#loc566 at #loc490)) +#loc704 = loc(callsite(#loc567 at #loc490)) +#loc705 = loc(callsite(#loc568 at #loc490)) +#loc706 = loc(callsite(#loc569 at #loc490)) +#loc707 = loc(callsite(#loc570 at #loc490)) +#loc708 = loc(callsite(#loc571 at #loc490)) +#loc709 = loc(callsite(#loc572 at #loc490)) +#loc710 = loc(callsite(#loc573 at #loc180)) +#loc711 = loc(callsite(#loc574 at #loc180)) +#loc712 = loc(callsite(#loc575 at #loc180)) +#loc713 = loc(callsite(#loc531 at #loc215)) +#loc714 = loc(callsite(#loc483 at #loc215)) +#loc715 = loc(callsite(#loc532 at #loc215)) +#loc716 = loc(callsite(#loc486 at #loc215)) +#loc717 = loc(callsite(#loc535 at #loc576)) +#loc718 = loc(callsite(#loc536 at #loc576)) +#loc719 = loc(callsite(#loc537 at #loc576)) +#loc720 = loc(callsite(#loc538 at #loc576)) +#loc721 = loc(callsite(#loc533 at #loc576)) +#loc722 = loc(callsite(#loc534 at #loc576)) +#loc723 = loc(callsite(#loc539 at #loc215)) +#loc724 = loc(callsite(#loc540 at #loc215)) +#loc725 = loc(callsite(#loc541 at #loc215)) +#loc726 = loc(callsite(#loc542 at #loc576)) +#loc727 = loc(callsite(#loc543 at #loc576)) +#loc728 = loc(callsite(#loc544 at #loc576)) +#loc729 = loc(callsite(#loc515 at #loc576)) +#loc730 = loc(callsite(#loc545 at #loc576)) +#loc731 = loc(callsite(#loc559 at #loc576)) +#loc732 = loc(callsite(#loc560 at #loc576)) +#loc733 = loc(callsite(#loc561 at #loc576)) +#loc734 = loc(callsite(#loc562 at #loc576)) +#loc735 = loc(callsite(#loc563 at #loc576)) +#loc736 = loc(callsite(#loc564 at #loc576)) +#loc737 = loc(callsite(#loc565 at #loc576)) +#loc738 = loc(callsite(#loc566 at #loc576)) +#loc739 = loc(callsite(#loc567 at #loc576)) +#loc740 = loc(callsite(#loc568 at #loc576)) +#loc741 = loc(callsite(#loc569 at #loc576)) +#loc742 = loc(callsite(#loc571 at #loc576)) +#loc743 = loc(callsite(#loc572 at #loc576)) +#loc744 = loc(callsite(#loc573 at #loc215)) +#loc745 = loc(callsite(#loc574 at #loc215)) +#loc746 = loc(callsite(#loc575 at #loc215)) +#loc747 = loc(callsite(#loc381 at #loc603)) +#loc748 = loc(callsite(#loc383 at #loc603)) +#loc749 = loc("kT_ptrs"(#loc604)) +#loc750 = loc(callsite(#loc385 at #loc603)) +#loc751 = loc(callsite(#loc386 at #loc603)) +#loc752 = loc(callsite(#loc387 at #loc603)) +#loc753 = loc(callsite(#loc388 at #loc603)) +#loc754 = loc(callsite(#loc389 at #loc603)) +#loc755 = loc(callsite(#loc390 at #loc603)) +#loc756 = loc(callsite(#loc391 at #loc603)) +#loc757 = loc(callsite(#loc392 at #loc603)) +#loc758 = loc(callsite(#loc393 at #loc603)) +#loc759 = loc(callsite(#loc394 at #loc603)) +#loc760 = loc(callsite(#loc397 at #loc603)) +#loc761 = loc(callsite(#loc398 at #loc603)) +#loc762 = loc(callsite(#loc399 at #loc603)) +#loc763 = loc(callsite(#loc400 at #loc603)) +#loc764 = loc(callsite(#loc401 at #loc603)) +#loc765 = loc(callsite(#loc402 at #loc603)) +#loc766 = loc(callsite(#loc403 at #loc603)) +#loc767 = loc(callsite(#loc404 at #loc603)) +#loc768 = loc(callsite(#loc405 at #loc603)) +#loc769 = loc(callsite(#loc406 at #loc603)) +#loc770 = loc(callsite(#loc407 at #loc603)) +#loc771 = loc(callsite(#loc408 at #loc603)) +#loc772 = loc(callsite(#loc409 at #loc603)) +#loc773 = loc(callsite(#loc410 at #loc603)) +#loc774 = loc(callsite(#loc411 at #loc603)) +#loc775 = loc(callsite(#loc412 at #loc603)) +#loc776 = loc(callsite(#loc413 at #loc603)) +#loc777 = loc(callsite(#loc414 at #loc603)) +#loc778 = loc(callsite(#loc415 at #loc603)) +#loc779 = loc(callsite(#loc416 at #loc603)) +#loc780 = loc(callsite(#loc417 at #loc603)) +#loc781 = loc(callsite(#loc418 at #loc603)) +#loc782 = loc(callsite(#loc419 at #loc603)) +#loc783 = loc(callsite(#loc420 at #loc603)) +#loc784 = loc(callsite(#loc421 at #loc603)) +#loc785 = loc(callsite(#loc422 at #loc603)) +#loc786 = loc(callsite(#loc423 at #loc603)) +#loc787 = loc(callsite(#loc424 at #loc603)) +#loc788 = loc(callsite(#loc425 at #loc603)) +#loc789 = loc(callsite(#loc426 at #loc603)) +#loc790 = loc(callsite(#loc427 at #loc603)) +#loc791 = loc(callsite(#loc428 at #loc603)) +#loc792 = loc(callsite(#loc429 at #loc603)) +#loc793 = loc(callsite(#loc430 at #loc603)) +#loc794 = loc(callsite(#loc432 at #loc608)) +#loc795 = loc(callsite(#loc434 at #loc608)) +#loc796 = loc(callsite(#loc435 at #loc608)) +#loc797 = loc(callsite(#loc436 at #loc608)) +#loc798 = loc(callsite(#loc437 at #loc608)) +#loc799 = loc(callsite(#loc438 at #loc608)) +#loc800 = loc(callsite(#loc439 at #loc608)) +#loc801 = loc(callsite(#loc440 at #loc608)) +#loc802 = loc(callsite(#loc441 at #loc608)) +#loc803 = loc(callsite(#loc442 at #loc608)) +#loc804 = loc(callsite(#loc443 at #loc608)) +#loc805 = loc(callsite(#loc444 at #loc608)) +#loc806 = loc(callsite(#loc445 at #loc608)) +#loc807 = loc(callsite(#loc446 at #loc608)) +#loc808 = loc(callsite(#loc447 at #loc608)) +#loc809 = loc(callsite(#loc448 at #loc608)) +#loc810 = loc(callsite(#loc449 at #loc608)) +#loc811 = loc(callsite(#loc393 at #loc618)) +#loc812 = loc(callsite(#loc394 at #loc618)) +#loc813 = loc(callsite(#loc397 at #loc618)) +#loc814 = loc(callsite(#loc398 at #loc618)) +#loc815 = loc(callsite(#loc399 at #loc618)) +#loc816 = loc(callsite(#loc424 at #loc618)) +#loc817 = loc(callsite(#loc390 at #loc618)) +#loc818 = loc(callsite(#loc425 at #loc618)) +#loc819 = loc(callsite(#loc426 at #loc618)) +#loc820 = loc(callsite(#loc392 at #loc618)) +#loc821 = loc(callsite(#loc427 at #loc618)) +#loc822 = loc(callsite(#loc429 at #loc618)) +#loc823 = loc(callsite(#loc430 at #loc618)) +#loc824 = loc(callsite(#loc432 at #loc621)) +#loc825 = loc(callsite(#loc434 at #loc621)) +#loc826 = loc(callsite(#loc435 at #loc621)) +#loc827 = loc(callsite(#loc436 at #loc621)) +#loc828 = loc(callsite(#loc437 at #loc621)) +#loc829 = loc(callsite(#loc438 at #loc621)) +#loc830 = loc(callsite(#loc439 at #loc621)) +#loc831 = loc(callsite(#loc440 at #loc621)) +#loc832 = loc(callsite(#loc441 at #loc621)) +#loc833 = loc(callsite(#loc442 at #loc621)) +#loc834 = loc(callsite(#loc443 at #loc621)) +#loc835 = loc(callsite(#loc444 at #loc621)) +#loc836 = loc(callsite(#loc445 at #loc621)) +#loc837 = loc(callsite(#loc446 at #loc621)) +#loc838 = loc(callsite(#loc447 at #loc621)) +#loc839 = loc(callsite(#loc448 at #loc621)) +#loc840 = loc(callsite(#loc449 at #loc621)) +#loc841 = loc("offs_m1"(#loc644)) +#loc842 = loc(callsite(#loc53 at #loc672)) +#loc843 = loc(callsite(#loc53 at #loc674)) +#loc844 = loc(callsite(#loc432 at #loc710)) +#loc845 = loc(callsite(#loc434 at #loc710)) +#loc846 = loc(callsite(#loc435 at #loc710)) +#loc847 = loc(callsite(#loc436 at #loc710)) +#loc848 = loc(callsite(#loc437 at #loc710)) +#loc849 = loc(callsite(#loc438 at #loc710)) +#loc850 = loc(callsite(#loc439 at #loc710)) +#loc851 = loc(callsite(#loc440 at #loc710)) +#loc852 = loc(callsite(#loc441 at #loc710)) +#loc853 = loc(callsite(#loc442 at #loc710)) +#loc854 = loc(callsite(#loc443 at #loc710)) +#loc855 = loc(callsite(#loc444 at #loc710)) +#loc856 = loc(callsite(#loc445 at #loc710)) +#loc857 = loc(callsite(#loc446 at #loc710)) +#loc858 = loc(callsite(#loc447 at #loc710)) +#loc859 = loc(callsite(#loc448 at #loc710)) +#loc860 = loc(callsite(#loc449 at #loc710)) +#loc861 = loc(callsite(#loc53 at #loc717)) +#loc862 = loc(callsite(#loc53 at #loc719)) +#loc863 = loc(callsite(#loc432 at #loc744)) +#loc864 = loc(callsite(#loc434 at #loc744)) +#loc865 = loc(callsite(#loc435 at #loc744)) +#loc866 = loc(callsite(#loc436 at #loc744)) +#loc867 = loc(callsite(#loc437 at #loc744)) +#loc868 = loc(callsite(#loc438 at #loc744)) +#loc869 = loc(callsite(#loc439 at #loc744)) +#loc870 = loc(callsite(#loc440 at #loc744)) +#loc871 = loc(callsite(#loc441 at #loc744)) +#loc872 = loc(callsite(#loc442 at #loc744)) +#loc873 = loc(callsite(#loc443 at #loc744)) +#loc874 = loc(callsite(#loc444 at #loc744)) +#loc875 = loc(callsite(#loc445 at #loc744)) +#loc876 = loc(callsite(#loc446 at #loc744)) +#loc877 = loc(callsite(#loc447 at #loc744)) +#loc878 = loc(callsite(#loc448 at #loc744)) +#loc879 = loc(callsite(#loc449 at #loc744)) +#loc880 = loc("vT_ptrs"(#loc749)) +#loc881 = loc(callsite(#loc53 at #loc758)) +#loc882 = loc(callsite(#loc53 at #loc759)) +#loc883 = loc(callsite(#loc53 at #loc811)) +#loc884 = loc(callsite(#loc53 at #loc812)) +#loc885 = loc("qT_ptrs"(#loc841)) +#loc886 = loc(callsite(#loc880 at #loc372)) +#loc887 = loc(callsite(#loc880 at #loc457)) +#loc888 = loc("do_ptrs"(#loc885)) +#loc889 = loc(callsite(#loc888 at #loc180)) +#loc890 = loc(callsite(#loc888 at #loc215)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e7ea77b00b204906cc0c1648dea63608a314fe73 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HULI6VL32CI7Q6UGP75BIUMHXEMSAWXMPR3LDJJLGOCFNMUFGPVQ/triton_tem_fused_zeros_1.ttir @@ -0,0 +1,1440 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":18:0) +#loc291 = loc("arg_Q"(#loc)) +#loc292 = loc("arg_K"(#loc)) +#loc293 = loc("arg_V"(#loc)) +#loc294 = loc("arg_LSE"(#loc)) +#loc295 = loc("arg_DELTA"(#loc)) +#loc296 = loc("arg_DO"(#loc)) +#loc297 = loc("arg_DQ"(#loc)) +#loc298 = loc("arg_DV"(#loc)) +#loc299 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc300 = loc("arg_KV_IDX"(#loc)) +#loc301 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc302 = loc("arg_Q_IDX"(#loc)) +#loc303 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc304 = loc("arg_FULL_KV_IDX"(#loc)) +#loc305 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc306 = loc("arg_FULL_Q_IDX"(#loc)) +#loc307 = loc("in_ptr16"(#loc)) +#loc308 = loc("out_ptr0"(#loc)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc1) + %cst_6 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc1) + %cst_7 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_8 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc1) + %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_12 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_14 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %cst_15 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_16 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc1) + %cst_17 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c2097152_i32 = arith.constant 2097152 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %pid = tt.get_program_id x : i32 loc(#loc309) + %off_zq = tt.get_program_id y : i32 loc(#loc310) + %off_hkv = tt.get_program_id z : i32 loc(#loc311) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc312) + %k_adj = arith.muli %off_hkv, %c262144_i32 : i32 loc(#loc313) + %k_adj_20 = arith.muli %off_zkv, %c2097152_i32 : i32 loc(#loc314) + %k_adj_21 = arith.addi %k_adj, %k_adj_20 : i32 loc(#loc315) + %k_adj_22 = arith.extsi %k_adj_21 : i32 to i64 loc(#loc316) + %dv_adj = arith.muli %off_zq, %c2097152_i32 : i32 loc(#loc317) + %dv_adj_23 = arith.addi %k_adj, %dv_adj : i32 loc(#loc318) + %dv_adj_24 = arith.extsi %dv_adj_23 : i32 to i64 loc(#loc319) + %K = tt.addptr %arg_K, %k_adj_22 : !tt.ptr, i64 loc(#loc320) + %V = tt.addptr %arg_V, %k_adj_22 : !tt.ptr, i64 loc(#loc321) + %DV = tt.addptr %arg_DV, %dv_adj_24 : !tt.ptr, i64 loc(#loc322) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc323) + %0 = arith.cmpi sge, %pid, %c16_i32 : i32 loc(#loc17) + scf.if %0 { + %off_pid = arith.subi %pid, %c16_i32 : i32 loc(#loc324) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc325) + %off_hq2_25 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc326) + %off_hq2_26 = arith.addi %off_hq2, %off_hq2_25 : i32 loc(#loc327) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc328) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc329) + %sparse_kv_num_blks_offset_27 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc330) + %sparse_kv_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc331) + %sparse_kv_idx_offset_28 = arith.muli %start_m2_block, %c16_i32 : i32 loc(#loc332) + %sparse_kv_idx_offset_29 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_28 : i32 loc(#loc333) + %q_adj2 = arith.muli %off_hq2_26, %c128_i32 : i32 loc(#loc334) + %q_adj2_30 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc335) + %q_adj2_31 = arith.addi %q_adj2, %q_adj2_30 : i32 loc(#loc336) + %q_adj2_32 = arith.extsi %q_adj2_31 : i32 to i64 loc(#loc337) + %do_adj2 = arith.muli %off_hq2_26, %c262144_i32 : i32 loc(#loc338) + %do_adj2_33 = arith.addi %do_adj2, %q_adj2_30 : i32 loc(#loc339) + %do_adj2_34 = arith.extsi %do_adj2_33 : i32 to i64 loc(#loc340) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc341) + %off_chz2_35 = arith.addi %off_chz2, %off_hq2_26 : i32 loc(#loc342) + %off_chz2_36 = arith.muli %off_chz2_35, %c2048_i32 : i32 loc(#loc343) + %off_chz2_37 = arith.extsi %off_chz2_36 : i32 to i64 loc(#loc344) + %Q2 = tt.addptr %arg_Q, %q_adj2_32 : !tt.ptr, i64 loc(#loc345) + %DO2 = tt.addptr %arg_DO, %do_adj2_34 : !tt.ptr, i64 loc(#loc346) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_32 : !tt.ptr, i64 loc(#loc347) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_37 : !tt.ptr, i64 loc(#loc348) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_37 : !tt.ptr, i64 loc(#loc349) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc350) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32> loc(#loc351) + %offs_m2_38 = arith.addi %offs_m2, %offs_k : tensor<128xi32> loc(#loc351) + %ptr = tt.expand_dims %offs_m2_38 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc588) + %ptr_39 = arith.muli %ptr, %cst_17 : tensor<128x1xi32> loc(#loc589) + %ptr_40 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc590) + %ptr_41 = tt.addptr %ptr_40, %ptr_39 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc590) + %ptr_42 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc591) + %ptr_43 = tt.broadcast %ptr_41 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc592) + %ptr_44 = tt.broadcast %ptr_42 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc592) + %ptr_45 = tt.addptr %ptr_43, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc592) + %q = tt.load %ptr_45 : tensor<128x128x!tt.ptr> loc(#loc593) + %ptr_46 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc594) + %ptr_47 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc595) + %ptr_48 = tt.addptr %ptr_47, %ptr_46 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc595) + %ptr_49 = tt.broadcast %ptr_48 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc596) + %ptr_50 = tt.addptr %ptr_49, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc596) + %do = tt.load %ptr_50 : tensor<128x128x!tt.ptr> loc(#loc597) + %Di = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc359) + %Di_51 = tt.addptr %Di, %offs_m2_38 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc359) + %Di_52 = tt.load %Di_51 : tensor<128x!tt.ptr> loc(#loc360) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc361) + %lse_53 = tt.addptr %lse, %offs_m2_38 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc361) + %lse_54 = tt.load %lse_53 : tensor<128x!tt.ptr> loc(#loc362) + %lse_55 = arith.cmpf oeq, %lse_54, %cst_19 : tensor<128xf32> loc(#loc363) + %lse_56 = arith.select %lse_55, %cst_18, %lse_54 : tensor<128xi1>, tensor<128xf32> loc(#loc364) + %lse_57 = tt.expand_dims %lse_56 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc365) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_29 : !tt.ptr, i32 loc(#loc366) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc367) + %kv_start_58 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc368) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_27 : !tt.ptr, i32 loc(#loc369) + %sparse_kv_num_blocks_59 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc370) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc371) + %offs_n2_60 = tt.splat %kv_start_58 : i32 -> tensor<64xi32> loc(#loc372) + %offs_n2_61 = arith.addi %offs_n2_60, %offs_n2 : tensor<64xi32> loc(#loc372) + %kT_ptrs = tt.expand_dims %offs_n2_61 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc598) + %kT_ptrs_62 = arith.muli %kT_ptrs, %cst_1 : tensor<1x64xi32> loc(#loc599) + %kT_ptrs_63 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc600) + %kT_ptrs_64 = tt.addptr %kT_ptrs_63, %kT_ptrs_62 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc600) + %kT_ptrs_65 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc601) + %kT_ptrs_66 = tt.broadcast %kT_ptrs_64 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc602) + %kT_ptrs_67 = tt.broadcast %kT_ptrs_65 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc602) + %kT_ptrs_68 = tt.addptr %kT_ptrs_66, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc602) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc603) + %vT_ptrs_69 = tt.addptr %vT_ptrs, %kT_ptrs_62 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc603) + %vT_ptrs_70 = tt.broadcast %vT_ptrs_69 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc604) + %vT_ptrs_71 = tt.addptr %vT_ptrs_70, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc604) + %hi = arith.muli %sparse_kv_num_blocks_59, %c2_i32 : i32 loc(#loc605) + %hi_72 = arith.minsi %hi, %c32_i32 : i32 loc(#loc606) + %vT_ptrs_73:4 = scf.for %start_n = %c0_i32 to %hi_72 step %c1_i32 iter_args(%dq_95 = %cst_13, %offs_n2_96 = %offs_n2_61, %kT_ptrs_97 = %kT_ptrs_68, %vT_ptrs_98 = %vT_ptrs_71) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.load %kT_ptrs_97 : tensor<128x64x!tt.ptr> loc(#loc889) + %qk = tt.dot %q, %kT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc756) + %qk_99 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc757) + %n = tt.expand_dims %offs_n2_96 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc758) + %tmp4 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc759) + %tmp4_100 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc759) + %tmp4_101 = arith.cmpi sge, %tmp4, %tmp4_100 : tensor<128x64xi32> loc(#loc759) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc760) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc761) + %tmp7_102 = tt.load %tmp7 : !tt.ptr loc(#loc762) + %tmp8 = tt.splat %tmp7_102 : i64 -> tensor<1x64xi64> loc(#loc763) + %tmp8_103 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc763) + %tmp9 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc764) + %tmp10 = tt.splat %tmp7_102 : i64 -> tensor<128x1xi64> loc(#loc765) + %tmp10_104 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc765) + %tmp11 = tt.broadcast %tmp8_103 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc766) + %tmp11_105 = tt.broadcast %tmp10_104 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc766) + %tmp11_106 = arith.andi %tmp11, %tmp11_105 : tensor<128x64xi1> loc(#loc766) + %tmp12 = arith.andi %tmp4_101, %tmp11_106 : tensor<128x64xi1> loc(#loc767) + %tmp15 = arith.cmpi sge, %n, %cst_6 : tensor<1x64xi32> loc(#loc768) + %tmp16 = arith.remsi %n, %cst_6 : tensor<1x64xi32> loc(#loc769) + %tmp18 = arith.cmpi ne, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc770) + %tmp19 = arith.cmpi slt, %tmp16, %cst_10 : tensor<1x64xi32> loc(#loc771) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc772) + %tmp23 = arith.addi %tmp16, %cst_6 : tensor<1x64xi32> loc(#loc773) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc774) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc775) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc776) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc777) + %tmp28 = arith.subi %tmp4_100, %tmp4 : tensor<128x64xi32> loc(#loc778) + %tmp29 = arith.remsi %tmp28, %cst_5 : tensor<128x64xi32> loc(#loc779) + %tmp30 = arith.cmpi ne, %tmp29, %cst_9 : tensor<128x64xi32> loc(#loc780) + %tmp31 = arith.cmpi slt, %tmp29, %cst_9 : tensor<128x64xi32> loc(#loc781) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc782) + %tmp34 = arith.addi %tmp29, %cst_5 : tensor<128x64xi32> loc(#loc783) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc784) + %tmp36 = arith.cmpi eq, %tmp35, %cst_9 : tensor<128x64xi32> loc(#loc785) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc786) + %tmp37_107 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc786) + %tmp38 = arith.ori %tmp12, %tmp37_107 : tensor<128x64xi1> loc(#loc787) + %post_mod_scores = arith.select %tmp38, %qk_99, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc788) + %post_mod_scores_108 = arith.mulf %post_mod_scores, %cst_7 : tensor<128x64xf32> loc(#loc789) + %p = tt.broadcast %lse_57 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc790) + %p_109 = arith.subf %post_mod_scores_108, %p : tensor<128x64xf32> loc(#loc790) + %p_110 = math.exp2 %p_109 : tensor<128x64xf32> loc(#loc791) + %vT = tt.load %vT_ptrs_98 : tensor<128x64x!tt.ptr> loc(#loc890) + %dp = tt.dot %do, %vT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc793) + %ds = tt.expand_dims %Di_52 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc794) + %ds_111 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc795) + %ds_112 = arith.subf %dp, %ds_111 : tensor<128x64xf32> loc(#loc795) + %ds_113 = arith.mulf %p_110, %ds_112 : tensor<128x64xf32> loc(#loc796) + %ds_114 = arith.select %tmp38, %ds_113, %cst_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc797) + %ds_115 = arith.truncf %ds_114 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc798) + %dq_116 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc799) + %dq_117 = tt.dot %ds_115, %dq_116, %dq_95, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc800) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc801) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc802) + %cur_block_118 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc803) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc804) + %next_block_119 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_59 : i32 loc(#loc805) + %next_block_120 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc806) + %next_block_121 = tt.load %next_block_120, %next_block_119 evictionPolicy = evict_last : !tt.ptr loc(#loc807) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc808) + %needs_jump_122 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc809) + %needs_jump_123 = arith.cmpi eq, %needs_jump_122, %c0_i32 : i32 loc(#loc810) + %jump_to_block = arith.subi %next_block_121, %cur_block_118 : i32 loc(#loc811) + %jump_to_block_124 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc812) + %jump_to_block_125 = arith.subi %jump_to_block_124, %c64_i32 : i32 loc(#loc813) + %offset = arith.extui %needs_jump_123 : i1 to i32 loc(#loc814) + %offset_126 = arith.muli %jump_to_block_125, %offset : i32 loc(#loc814) + %offset_127 = arith.subi %c1_i32, %offset : i32 loc(#loc815) + %offset_128 = arith.muli %offset_127, %c64_i32 : i32 loc(#loc816) + %offset_129 = arith.addi %offset_126, %offset_128 : i32 loc(#loc817) + %kT_ptrs_130 = arith.muli %offset_129, %c128_i32 : i32 loc(#loc610) + %kT_ptrs_131 = tt.splat %kT_ptrs_130 : i32 -> tensor<128x64xi32> loc(#loc611) + %kT_ptrs_132 = tt.addptr %kT_ptrs_97, %kT_ptrs_131 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc611) + %vT_ptrs_133 = tt.addptr %vT_ptrs_98, %kT_ptrs_131 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc612) + %offs_n2_134 = tt.splat %offset_129 : i32 -> tensor<64xi32> loc(#loc613) + %offs_n2_135 = arith.addi %offs_n2_96, %offs_n2_134 : tensor<64xi32> loc(#loc613) + scf.yield %dq_117, %offs_n2_135, %kT_ptrs_132, %vT_ptrs_133 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc614) + } loc(#loc894) + %kv_indices_74 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_29 : !tt.ptr, i32 loc(#loc453) + %kv_start_75 = tt.load %kv_indices_74 : !tt.ptr loc(#loc454) + %kv_start_76 = arith.muli %kv_start_75, %c128_i32 : i32 loc(#loc455) + %sparse_kv_num_blocks_77 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_27 : !tt.ptr, i32 loc(#loc456) + %sparse_kv_num_blocks_78 = tt.load %sparse_kv_num_blocks_77 : !tt.ptr loc(#loc457) + %offs_n2_79 = tt.splat %kv_start_76 : i32 -> tensor<64xi32> loc(#loc458) + %offs_n2_80 = arith.addi %offs_n2_79, %offs_n2 : tensor<64xi32> loc(#loc458) + %kT_ptrs_81 = tt.expand_dims %offs_n2_80 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc615) + %kT_ptrs_82 = arith.muli %kT_ptrs_81, %cst_1 : tensor<1x64xi32> loc(#loc616) + %kT_ptrs_83 = tt.addptr %kT_ptrs_63, %kT_ptrs_82 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc617) + %kT_ptrs_84 = tt.broadcast %kT_ptrs_83 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc618) + %kT_ptrs_85 = tt.addptr %kT_ptrs_84, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc618) + %vT_ptrs_86 = tt.addptr %vT_ptrs, %kT_ptrs_82 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc619) + %vT_ptrs_87 = tt.broadcast %vT_ptrs_86 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc620) + %vT_ptrs_88 = tt.addptr %vT_ptrs_87, %kT_ptrs_67 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc620) + %hi_89 = arith.muli %sparse_kv_num_blocks_78, %c2_i32 : i32 loc(#loc621) + %hi_90 = arith.minsi %hi_89, %c32_i32 : i32 loc(#loc622) + %vT_ptrs_91:4 = scf.for %start_n = %c0_i32 to %hi_90 step %c1_i32 iter_args(%dq_95 = %vT_ptrs_73#0, %offs_n2_96 = %offs_n2_80, %kT_ptrs_97 = %kT_ptrs_85, %vT_ptrs_98 = %vT_ptrs_88) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.load %kT_ptrs_97 : tensor<128x64x!tt.ptr> loc(#loc891) + %qk = tt.dot %q, %kT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc819) + %qk_99 = arith.mulf %qk, %cst_11 : tensor<128x64xf32> loc(#loc820) + %post_mod_scores = arith.mulf %qk_99, %cst_7 : tensor<128x64xf32> loc(#loc821) + %p = tt.broadcast %lse_57 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc822) + %p_100 = arith.subf %post_mod_scores, %p : tensor<128x64xf32> loc(#loc822) + %p_101 = math.exp2 %p_100 : tensor<128x64xf32> loc(#loc823) + %vT = tt.load %vT_ptrs_98 : tensor<128x64x!tt.ptr> loc(#loc892) + %dp = tt.dot %do, %vT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc825) + %ds = tt.expand_dims %Di_52 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc826) + %ds_102 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc827) + %ds_103 = arith.subf %dp, %ds_102 : tensor<128x64xf32> loc(#loc827) + %ds_104 = arith.mulf %p_101, %ds_103 : tensor<128x64xf32> loc(#loc828) + %ds_105 = arith.truncf %ds_104 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc829) + %dq_106 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc830) + %dq_107 = tt.dot %ds_105, %dq_106, %dq_95, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc831) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc832) + %cur_block = tt.addptr %kv_indices_74, %cur_block_idx : !tt.ptr, i32 loc(#loc833) + %cur_block_108 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc834) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc835) + %next_block_109 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_78 : i32 loc(#loc836) + %next_block_110 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc837) + %next_block_111 = tt.load %next_block_110, %next_block_109 evictionPolicy = evict_last : !tt.ptr loc(#loc838) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc839) + %needs_jump_112 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc840) + %needs_jump_113 = arith.cmpi eq, %needs_jump_112, %c0_i32 : i32 loc(#loc841) + %jump_to_block = arith.subi %next_block_111, %cur_block_108 : i32 loc(#loc842) + %jump_to_block_114 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc843) + %jump_to_block_115 = arith.subi %jump_to_block_114, %c64_i32 : i32 loc(#loc844) + %offset = arith.extui %needs_jump_113 : i1 to i32 loc(#loc845) + %offset_116 = arith.muli %jump_to_block_115, %offset : i32 loc(#loc845) + %offset_117 = arith.subi %c1_i32, %offset : i32 loc(#loc846) + %offset_118 = arith.muli %offset_117, %c64_i32 : i32 loc(#loc847) + %offset_119 = arith.addi %offset_116, %offset_118 : i32 loc(#loc848) + %kT_ptrs_120 = arith.muli %offset_119, %c128_i32 : i32 loc(#loc625) + %kT_ptrs_121 = tt.splat %kT_ptrs_120 : i32 -> tensor<128x64xi32> loc(#loc626) + %kT_ptrs_122 = tt.addptr %kT_ptrs_97, %kT_ptrs_121 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc626) + %vT_ptrs_123 = tt.addptr %vT_ptrs_98, %kT_ptrs_121 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc627) + %offs_n2_124 = tt.splat %offset_119 : i32 -> tensor<64xi32> loc(#loc628) + %offs_n2_125 = arith.addi %offs_n2_96, %offs_n2_124 : tensor<64xi32> loc(#loc628) + scf.yield %dq_107, %offs_n2_125, %kT_ptrs_122, %vT_ptrs_123 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc629) + } loc(#loc895) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc460) + %dq_ptrs_92 = tt.addptr %dq_ptrs, %ptr_39 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc460) + %dq_ptrs_93 = tt.broadcast %dq_ptrs_92 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc461) + %dq_ptrs_94 = tt.addptr %dq_ptrs_93, %ptr_44 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc461) + %dq = arith.mulf %vT_ptrs_91#0, %cst_16 : tensor<128x128xf32> loc(#loc462) + %1 = arith.truncf %dq : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc160) + tt.store %dq_ptrs_94, %1 : tensor<128x128x!tt.ptr> loc(#loc160) + } else { + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc463) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32> loc(#loc464) + %offs_n1_25 = arith.addi %offs_n1, %offs_k : tensor<128xi32> loc(#loc464) + %ptr = tt.expand_dims %offs_n1_25 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc630) + %ptr_26 = arith.muli %ptr, %cst_15 : tensor<128x1xi32> loc(#loc631) + %ptr_27 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc632) + %ptr_28 = tt.addptr %ptr_27, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc632) + %ptr_29 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc633) + %ptr_30 = tt.broadcast %ptr_28 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc634) + %ptr_31 = tt.broadcast %ptr_29 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc634) + %ptr_32 = tt.addptr %ptr_30, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc634) + %k = tt.load %ptr_32 : tensor<128x128x!tt.ptr> loc(#loc635) + %ptr_33 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc636) + %ptr_34 = tt.addptr %ptr_33, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc636) + %ptr_35 = tt.broadcast %ptr_34 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc637) + %ptr_36 = tt.addptr %ptr_35, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc637) + %v = tt.load %ptr_36 : tensor<128x128x!tt.ptr> loc(#loc638) + %dk:2 = scf.for %off_g = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%dv = %cst_13, %dk_46 = %cst_13) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc468) + %off_hq1_47 = arith.addi %off_hq1, %off_g : i32 loc(#loc469) + %q_adj1 = arith.muli %off_hq1_47, %c128_i32 : i32 loc(#loc470) + %q_adj1_48 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc471) + %q_adj1_49 = arith.addi %q_adj1, %q_adj1_48 : i32 loc(#loc472) + %q_adj1_50 = arith.extsi %q_adj1_49 : i32 to i64 loc(#loc473) + %do_adj1 = arith.muli %off_hq1_47, %c262144_i32 : i32 loc(#loc474) + %do_adj1_51 = arith.addi %do_adj1, %q_adj1_48 : i32 loc(#loc475) + %do_adj1_52 = arith.extsi %do_adj1_51 : i32 to i64 loc(#loc476) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc477) + %off_chz1_53 = arith.addi %off_chz1, %off_hq1_47 : i32 loc(#loc478) + %off_chz1_54 = arith.muli %off_chz1_53, %c2048_i32 : i32 loc(#loc479) + %off_chz1_55 = arith.extsi %off_chz1_54 : i32 to i64 loc(#loc480) + %Q1 = tt.addptr %arg_Q, %q_adj1_50 : !tt.ptr, i64 loc(#loc481) + %DO1 = tt.addptr %arg_DO, %do_adj1_52 : !tt.ptr, i64 loc(#loc482) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_55 : !tt.ptr, i64 loc(#loc483) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_55 : !tt.ptr, i64 loc(#loc484) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc485) + %sparse_q_num_blks_offset_56 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc486) + %sparse_q_idx_offset = arith.muli %off_zkv, %c256_i32 : i32 loc(#loc487) + %sparse_q_idx_offset_57 = arith.muli %pid, %c16_i32 : i32 loc(#loc488) + %sparse_q_idx_offset_58 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_57 : i32 loc(#loc489) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_58 : !tt.ptr, i32 loc(#loc490) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc491) + %q_start_59 = arith.muli %q_start, %c128_i32 : i32 loc(#loc492) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_56 : !tt.ptr, i32 loc(#loc493) + %sparse_q_num_blocks_60 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc494) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc495) + %offs_m1_61 = tt.splat %q_start_59 : i32 -> tensor<64xi32> loc(#loc496) + %offs_m1_62 = arith.addi %offs_m1_61, %offs_m1 : tensor<64xi32> loc(#loc496) + %qT_ptrs = tt.expand_dims %offs_m1_62 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc640) + %qT_ptrs_63 = arith.muli %qT_ptrs, %cst_0 : tensor<1x64xi32> loc(#loc641) + %qT_ptrs_64 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc642) + %qT_ptrs_65 = tt.addptr %qT_ptrs_64, %qT_ptrs_63 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc642) + %qT_ptrs_66 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc643) + %qT_ptrs_67 = tt.broadcast %qT_ptrs_65 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc644) + %qT_ptrs_68 = tt.broadcast %qT_ptrs_66 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc644) + %qT_ptrs_69 = tt.addptr %qT_ptrs_67, %qT_ptrs_68 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc644) + %do_ptrs = tt.expand_dims %offs_m1_62 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc645) + %do_ptrs_70 = arith.muli %do_ptrs, %cst : tensor<64x1xi32> loc(#loc646) + %do_ptrs_71 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc647) + %do_ptrs_72 = tt.addptr %do_ptrs_71, %do_ptrs_70 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc647) + %do_ptrs_73 = tt.broadcast %do_ptrs_72 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc648) + %do_ptrs_74 = tt.broadcast %ptr_29 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc648) + %do_ptrs_75 = tt.addptr %do_ptrs_73, %do_ptrs_74 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc648) + %hi = arith.muli %sparse_q_num_blocks_60, %c2_i32 : i32 loc(#loc649) + %hi_76 = arith.minsi %hi, %c32_i32 : i32 loc(#loc650) + %do_ptrs_77:5 = scf.for %start_m = %c0_i32 to %hi_76 step %c1_i32 iter_args(%dk_98 = %dk_46, %dv_99 = %dv, %offs_m1_100 = %offs_m1_62, %qT_ptrs_101 = %qT_ptrs_69, %do_ptrs_102 = %do_ptrs_75) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.load %qT_ptrs_101 : tensor<128x64x!tt.ptr> loc(#loc850) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc653) + %lse_103 = tt.addptr %lse, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc653) + %lse_104 = tt.load %lse_103 : tensor<64x!tt.ptr> loc(#loc654) + %lse_105 = arith.cmpf oeq, %lse_104, %cst_4 : tensor<64xf32> loc(#loc655) + %lse_106 = arith.select %lse_105, %cst_3, %lse_104 : tensor<64xi1>, tensor<64xf32> loc(#loc656) + %qkT = tt.dot %k, %qT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc657) + %qkT_107 = arith.mulf %qkT, %cst_11 : tensor<128x64xf32> loc(#loc658) + %m = tt.expand_dims %offs_m1_100 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc659) + %tmp44 = tt.broadcast %m : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc660) + %tmp44_108 = tt.broadcast %ptr : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc660) + %tmp44_109 = arith.cmpi sge, %tmp44, %tmp44_108 : tensor<128x64xi32> loc(#loc660) + %tmp45 = arith.extsi %ptr : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc661) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc662) + %tmp47_110 = tt.load %tmp47 : !tt.ptr loc(#loc663) + %tmp48 = tt.splat %tmp47_110 : i64 -> tensor<128x1xi64> loc(#loc664) + %tmp48_111 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc664) + %tmp49 = arith.extsi %m : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc665) + %tmp50 = tt.splat %tmp47_110 : i64 -> tensor<1x64xi64> loc(#loc666) + %tmp50_112 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc666) + %tmp51 = tt.broadcast %tmp48_111 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc667) + %tmp51_113 = tt.broadcast %tmp50_112 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc667) + %tmp51_114 = arith.andi %tmp51, %tmp51_113 : tensor<128x64xi1> loc(#loc667) + %tmp52 = arith.andi %tmp44_109, %tmp51_114 : tensor<128x64xi1> loc(#loc668) + %tmp55 = arith.cmpi sge, %ptr, %cst_14 : tensor<128x1xi32> loc(#loc669) + %tmp56 = arith.remsi %ptr, %cst_14 : tensor<128x1xi32> loc(#loc670) + %tmp58 = arith.cmpi ne, %tmp56, %cst_2 : tensor<128x1xi32> loc(#loc671) + %tmp59 = arith.cmpi slt, %tmp56, %cst_2 : tensor<128x1xi32> loc(#loc672) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1> loc(#loc673) + %tmp63 = arith.addi %tmp56, %cst_14 : tensor<128x1xi32> loc(#loc674) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc675) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc676) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64> loc(#loc677) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1> loc(#loc678) + %tmp68 = arith.subi %tmp44_108, %tmp44 : tensor<128x64xi32> loc(#loc679) + %tmp69 = arith.remsi %tmp68, %cst_5 : tensor<128x64xi32> loc(#loc680) + %tmp70 = arith.cmpi ne, %tmp69, %cst_9 : tensor<128x64xi32> loc(#loc681) + %tmp71 = arith.cmpi slt, %tmp69, %cst_9 : tensor<128x64xi32> loc(#loc682) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1> loc(#loc683) + %tmp74 = arith.addi %tmp69, %cst_5 : tensor<128x64xi32> loc(#loc684) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc685) + %tmp76 = arith.cmpi eq, %tmp75, %cst_9 : tensor<128x64xi32> loc(#loc686) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc687) + %tmp77_115 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1> loc(#loc687) + %tmp78 = arith.ori %tmp52, %tmp77_115 : tensor<128x64xi1> loc(#loc688) + %post_mod_scores = arith.select %tmp78, %qkT_107, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc689) + %post_mod_scores_116 = arith.mulf %post_mod_scores, %cst_7 : tensor<128x64xf32> loc(#loc690) + %pT = tt.expand_dims %lse_106 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc691) + %pT_117 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc692) + %pT_118 = arith.subf %post_mod_scores_116, %pT_117 : tensor<128x64xf32> loc(#loc692) + %pT_119 = math.exp2 %pT_118 : tensor<128x64xf32> loc(#loc693) + %do = tt.load %do_ptrs_102 : tensor<64x128x!tt.ptr> loc(#loc851) + %dv_120 = arith.truncf %pT_119 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc695) + %dv_121 = tt.dot %dv_120, %do, %dv_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc696) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc697) + %Di_122 = tt.addptr %Di, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc697) + %Di_123 = tt.load %Di_122 : tensor<64x!tt.ptr> loc(#loc698) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc699) + %dpT_124 = tt.dot %v, %dpT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc700) + %dsT = tt.expand_dims %Di_123 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc701) + %dsT_125 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc702) + %dsT_126 = arith.subf %dpT_124, %dsT_125 : tensor<128x64xf32> loc(#loc702) + %dsT_127 = arith.mulf %pT_119, %dsT_126 : tensor<128x64xf32> loc(#loc703) + %dsT_128 = arith.select %tmp78, %dsT_127, %cst_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc704) + %dk_129 = arith.truncf %dsT_128 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc705) + %dk_130 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc706) + %dk_131 = tt.dot %dk_129, %dk_130, %dk_98, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc707) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc852) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc853) + %cur_block_132 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc854) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc855) + %next_block_133 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_60 : i32 loc(#loc856) + %next_block_134 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc857) + %next_block_135 = tt.load %next_block_134, %next_block_133 evictionPolicy = evict_last : !tt.ptr loc(#loc858) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc859) + %needs_jump_136 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc860) + %needs_jump_137 = arith.cmpi eq, %needs_jump_136, %c0_i32 : i32 loc(#loc861) + %jump_to_block = arith.subi %next_block_135, %cur_block_132 : i32 loc(#loc862) + %jump_to_block_138 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc863) + %jump_to_block_139 = arith.subi %jump_to_block_138, %c64_i32 : i32 loc(#loc864) + %offset = arith.extui %needs_jump_137 : i1 to i32 loc(#loc865) + %offset_140 = arith.muli %jump_to_block_139, %offset : i32 loc(#loc865) + %offset_141 = arith.subi %c1_i32, %offset : i32 loc(#loc866) + %offset_142 = arith.muli %offset_141, %c64_i32 : i32 loc(#loc867) + %offset_143 = arith.addi %offset_140, %offset_142 : i32 loc(#loc868) + %qT_ptrs_144 = arith.muli %offset_143, %c4096_i32 : i32 loc(#loc709) + %qT_ptrs_145 = tt.splat %qT_ptrs_144 : i32 -> tensor<128x64xi32> loc(#loc710) + %qT_ptrs_146 = tt.addptr %qT_ptrs_101, %qT_ptrs_145 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc710) + %do_ptrs_147 = arith.muli %offset_143, %c128_i32 : i32 loc(#loc711) + %do_ptrs_148 = tt.splat %do_ptrs_147 : i32 -> tensor<64x128xi32> loc(#loc712) + %do_ptrs_149 = tt.addptr %do_ptrs_102, %do_ptrs_148 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc712) + %offs_m1_150 = tt.splat %offset_143 : i32 -> tensor<64xi32> loc(#loc713) + %offs_m1_151 = arith.addi %offs_m1_100, %offs_m1_150 : tensor<64xi32> loc(#loc713) + scf.yield %dk_131, %dv_121, %offs_m1_151, %qT_ptrs_146, %do_ptrs_149 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc572) + } loc(#loc897) + %q_indices_78 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_58 : !tt.ptr, i32 loc(#loc573) + %q_start_79 = tt.load %q_indices_78 : !tt.ptr loc(#loc574) + %q_start_80 = arith.muli %q_start_79, %c128_i32 : i32 loc(#loc575) + %sparse_q_num_blocks_81 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_56 : !tt.ptr, i32 loc(#loc576) + %sparse_q_num_blocks_82 = tt.load %sparse_q_num_blocks_81 : !tt.ptr loc(#loc577) + %offs_m1_83 = tt.splat %q_start_80 : i32 -> tensor<64xi32> loc(#loc578) + %offs_m1_84 = arith.addi %offs_m1_83, %offs_m1 : tensor<64xi32> loc(#loc578) + %qT_ptrs_85 = tt.expand_dims %offs_m1_84 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc714) + %qT_ptrs_86 = arith.muli %qT_ptrs_85, %cst_0 : tensor<1x64xi32> loc(#loc715) + %qT_ptrs_87 = tt.addptr %qT_ptrs_64, %qT_ptrs_86 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc716) + %qT_ptrs_88 = tt.broadcast %qT_ptrs_87 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc717) + %qT_ptrs_89 = tt.addptr %qT_ptrs_88, %qT_ptrs_68 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc717) + %do_ptrs_90 = tt.expand_dims %offs_m1_84 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc718) + %do_ptrs_91 = arith.muli %do_ptrs_90, %cst : tensor<64x1xi32> loc(#loc719) + %do_ptrs_92 = tt.addptr %do_ptrs_71, %do_ptrs_91 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc720) + %do_ptrs_93 = tt.broadcast %do_ptrs_92 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc721) + %do_ptrs_94 = tt.addptr %do_ptrs_93, %do_ptrs_74 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc721) + %hi_95 = arith.muli %sparse_q_num_blocks_82, %c2_i32 : i32 loc(#loc722) + %hi_96 = arith.minsi %hi_95, %c32_i32 : i32 loc(#loc723) + %do_ptrs_97:5 = scf.for %start_m = %c0_i32 to %hi_96 step %c1_i32 iter_args(%dk_98 = %do_ptrs_77#0, %dv_99 = %do_ptrs_77#1, %offs_m1_100 = %offs_m1_84, %qT_ptrs_101 = %qT_ptrs_89, %do_ptrs_102 = %do_ptrs_94) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.load %qT_ptrs_101 : tensor<128x64x!tt.ptr> loc(#loc869) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc725) + %lse_103 = tt.addptr %lse, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc725) + %lse_104 = tt.load %lse_103 : tensor<64x!tt.ptr> loc(#loc726) + %lse_105 = arith.cmpf oeq, %lse_104, %cst_4 : tensor<64xf32> loc(#loc727) + %lse_106 = arith.select %lse_105, %cst_3, %lse_104 : tensor<64xi1>, tensor<64xf32> loc(#loc728) + %qkT = tt.dot %k, %qT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc729) + %qkT_107 = arith.mulf %qkT, %cst_11 : tensor<128x64xf32> loc(#loc730) + %post_mod_scores = arith.mulf %qkT_107, %cst_7 : tensor<128x64xf32> loc(#loc731) + %pT = tt.expand_dims %lse_106 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc732) + %pT_108 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc733) + %pT_109 = arith.subf %post_mod_scores, %pT_108 : tensor<128x64xf32> loc(#loc733) + %pT_110 = math.exp2 %pT_109 : tensor<128x64xf32> loc(#loc734) + %do = tt.load %do_ptrs_102 : tensor<64x128x!tt.ptr> loc(#loc870) + %dv_111 = arith.truncf %pT_110 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc736) + %dv_112 = tt.dot %dv_111, %do, %dv_99, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc737) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc738) + %Di_113 = tt.addptr %Di, %offs_m1_100 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc738) + %Di_114 = tt.load %Di_113 : tensor<64x!tt.ptr> loc(#loc739) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc740) + %dpT_115 = tt.dot %v, %dpT, %cst_12, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc741) + %dsT = tt.expand_dims %Di_114 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc742) + %dsT_116 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc743) + %dsT_117 = arith.subf %dpT_115, %dsT_116 : tensor<128x64xf32> loc(#loc743) + %dsT_118 = arith.mulf %pT_110, %dsT_117 : tensor<128x64xf32> loc(#loc744) + %dk_119 = arith.truncf %dsT_118 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc745) + %dk_120 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc746) + %dk_121 = tt.dot %dk_119, %dk_120, %dk_98, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc747) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc871) + %cur_block = tt.addptr %q_indices_78, %cur_block_idx : !tt.ptr, i32 loc(#loc872) + %cur_block_122 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc873) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc874) + %next_block_123 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_82 : i32 loc(#loc875) + %next_block_124 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc876) + %next_block_125 = tt.load %next_block_124, %next_block_123 evictionPolicy = evict_last : !tt.ptr loc(#loc877) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc878) + %needs_jump_126 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc879) + %needs_jump_127 = arith.cmpi eq, %needs_jump_126, %c0_i32 : i32 loc(#loc880) + %jump_to_block = arith.subi %next_block_125, %cur_block_122 : i32 loc(#loc881) + %jump_to_block_128 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc882) + %jump_to_block_129 = arith.subi %jump_to_block_128, %c64_i32 : i32 loc(#loc883) + %offset = arith.extui %needs_jump_127 : i1 to i32 loc(#loc884) + %offset_130 = arith.muli %jump_to_block_129, %offset : i32 loc(#loc884) + %offset_131 = arith.subi %c1_i32, %offset : i32 loc(#loc885) + %offset_132 = arith.muli %offset_131, %c64_i32 : i32 loc(#loc886) + %offset_133 = arith.addi %offset_130, %offset_132 : i32 loc(#loc887) + %qT_ptrs_134 = arith.muli %offset_133, %c4096_i32 : i32 loc(#loc749) + %qT_ptrs_135 = tt.splat %qT_ptrs_134 : i32 -> tensor<128x64xi32> loc(#loc750) + %qT_ptrs_136 = tt.addptr %qT_ptrs_101, %qT_ptrs_135 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc750) + %do_ptrs_137 = arith.muli %offset_133, %c128_i32 : i32 loc(#loc751) + %do_ptrs_138 = tt.splat %do_ptrs_137 : i32 -> tensor<64x128xi32> loc(#loc752) + %do_ptrs_139 = tt.addptr %do_ptrs_102, %do_ptrs_138 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc752) + %offs_m1_140 = tt.splat %offset_133 : i32 -> tensor<64xi32> loc(#loc753) + %offs_m1_141 = arith.addi %offs_m1_100, %offs_m1_140 : tensor<64xi32> loc(#loc753) + scf.yield %dk_121, %dv_112, %offs_m1_141, %qT_ptrs_136, %do_ptrs_139 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc580) + } loc(#loc898) + scf.yield %do_ptrs_97#1, %do_ptrs_97#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc279) + } loc(#loc639) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc581) + %dv_ptrs_37 = tt.addptr %dv_ptrs, %ptr_26 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc581) + %dv_ptrs_38 = tt.broadcast %dv_ptrs_37 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc582) + %dv_ptrs_39 = tt.addptr %dv_ptrs_38, %ptr_31 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc582) + %1 = arith.truncf %dk#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc282) + tt.store %dv_ptrs_39, %1 : tensor<128x128x!tt.ptr> loc(#loc282) + %dk_40 = arith.mulf %dk#1, %cst_16 : tensor<128x128xf32> loc(#loc583) + %mask = arith.cmpi slt, %ptr, %cst_14 : tensor<128x1xi32> loc(#loc584) + %xindex = tt.broadcast %ptr_26 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc585) + %xindex_41 = arith.addi %ptr_31, %xindex : tensor<128x128xi32> loc(#loc585) + %xindex_42 = tt.splat %k_adj : i32 -> tensor<128x128xi32> loc(#loc586) + %xindex_43 = arith.addi %xindex_41, %xindex_42 : tensor<128x128xi32> loc(#loc586) + %xindex_44 = tt.splat %dv_adj : i32 -> tensor<128x128xi32> loc(#loc587) + %xindex_45 = arith.addi %xindex_43, %xindex_44 : tensor<128x128xi32> loc(#loc587) + %2 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc288) + %3 = tt.addptr %2, %xindex_45 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc288) + %4 = tt.broadcast %mask : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc289) + %5 = arith.truncf %dk_40 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc289) + tt.store %3, %5, %4 : tensor<128x128x!tt.ptr> loc(#loc289) + } loc(#loc18) + tt.return loc(#loc290) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":111:24) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":115:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":116:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":117:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:25) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":124:59) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:50) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":128:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":131:9) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":132:9) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":133:10) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":136:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:14) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:7) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":140:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":144:44) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":145:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:55) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":154:78) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:83) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":155:68) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:30) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":158:63) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:32) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:42) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":159:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":161:56) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":163:17) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":164:19) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":167:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":168:21) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":169:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":174:36) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":175:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:27) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":178:107) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:20) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:56) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":825:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":835:23) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":179:111) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:34) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":185:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:33) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":186:26) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:30) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":190:50) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":191:18) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":195:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:27) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":196:41) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:53) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":197:39) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:42) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":199:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:26) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":207:12) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:18) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:56) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":390:49) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:18) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":391:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:43) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":395:63) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":397:28) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":458:105) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":405:12) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":459:19) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":461:14) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":464:36) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":482:23) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":483:23) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:34) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":485:23) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":486:22) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":487:23) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":488:23) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":489:23) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":490:23) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":493:24) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":494:24) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":496:25) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":497:92) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":500:24) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":501:24) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":502:39) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":503:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":504:24) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":505:24) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":506:23) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":507:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":508:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":509:92) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":511:24) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":512:24) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":513:39) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":514:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":515:24) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":516:24) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":521:69) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":524:27) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:39) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":525:21) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":528:104) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":530:20) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:22) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:19) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":531:14) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":549:43) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":551:15) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:30) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":553:21) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":788:33) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":411:64) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:38) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":789:24) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:109) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:113) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:55) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":790:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:30) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:35) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":791:60) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:34) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:48) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":792:63) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:29) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:47) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:61) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":793:42) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:28) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":414:19) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":415:19) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":417:19) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":417:8) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":214:39) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:31) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":215:45) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:62) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":216:43) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":218:33) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":226:16) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:24) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":231:56) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":232:14) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":234:30) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":252:25) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":253:29) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":256:107) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":257:107) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":262:30) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:32) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":263:51) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:34) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:56) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:44) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":266:67) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:36) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:46) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":267:70) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:34) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:39) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:50) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":269:60) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":271:21) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":272:23) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":275:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":276:29) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:58) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":281:80) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:53) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:81) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":282:70) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":286:32) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:30) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":287:43) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:55) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":288:42) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:45) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":290:32) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:26) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":298:16) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:37) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:18) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:56) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":601:49) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:27) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:38) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:19) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":602:51) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:42) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":608:61) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":610:28) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":669:105) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":618:12) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:28) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":672:22) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:26) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":675:46) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":676:20) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":678:15) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":680:36) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":698:25) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":699:25) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:35) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":701:24) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":702:24) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":703:25) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":704:24) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":705:24) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":706:24) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":709:25) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":710:25) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":712:25) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":713:92) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":716:24) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":717:24) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":718:39) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":719:25) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":720:24) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":721:24) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":722:24) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":723:25) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":724:25) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":725:92) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":727:24) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":728:24) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":729:39) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":730:25) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":731:24) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":732:24) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":736:69) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":739:27) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:44) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:40) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":740:22) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":741:99) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:24) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":744:43) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:29) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":746:21) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:29) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":750:20) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:25) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:22) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":751:16) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":773:45) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:24) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:52) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":775:43) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":623:62) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:28) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":626:19) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:28) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":627:19) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":628:19) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":628:8) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":306:41) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:34) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":307:47) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:64) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":308:46) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":310:36) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":318:20) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":303:12) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:23) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":323:55) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":330:30) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":334:14) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":337:29) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:27) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:41) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":344:58) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:29) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":345:69) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4w/c4ww5pmlr6amerprh7v3ibioh3yvbhemdqsh7gcrlxjnhnpkktrb.py":139:4) +#loc309 = loc("pid"(#loc2)) +#loc310 = loc("off_zq"(#loc3)) +#loc311 = loc("off_hkv"(#loc4)) +#loc312 = loc("off_zkv"(#loc5)) +#loc313 = loc("k_adj"(#loc6)) +#loc314 = loc("k_adj"(#loc7)) +#loc315 = loc("k_adj"(#loc8)) +#loc316 = loc("k_adj"(#loc9)) +#loc317 = loc("dv_adj"(#loc10)) +#loc318 = loc("dv_adj"(#loc11)) +#loc319 = loc("dv_adj"(#loc12)) +#loc320 = loc("K"(#loc13)) +#loc321 = loc("V"(#loc14)) +#loc322 = loc("DV"(#loc15)) +#loc323 = loc("offs_k"(#loc16)) +#loc324 = loc("off_pid"(#loc19)) +#loc325 = loc("off_hq2"(#loc20)) +#loc326 = loc("off_hq2"(#loc21)) +#loc327 = loc("off_hq2"(#loc22)) +#loc328 = loc("start_m2_block"(#loc23)) +#loc329 = loc("sparse_kv_num_blks_offset"(#loc24)) +#loc330 = loc("sparse_kv_num_blks_offset"(#loc25)) +#loc331 = loc("sparse_kv_idx_offset"(#loc26)) +#loc332 = loc("sparse_kv_idx_offset"(#loc27)) +#loc333 = loc("sparse_kv_idx_offset"(#loc28)) +#loc334 = loc("q_adj2"(#loc29)) +#loc335 = loc("q_adj2"(#loc30)) +#loc336 = loc("q_adj2"(#loc31)) +#loc337 = loc("q_adj2"(#loc32)) +#loc338 = loc("do_adj2"(#loc33)) +#loc339 = loc("do_adj2"(#loc34)) +#loc340 = loc("do_adj2"(#loc35)) +#loc341 = loc("off_chz2"(#loc36)) +#loc342 = loc("off_chz2"(#loc37)) +#loc343 = loc("off_chz2"(#loc38)) +#loc344 = loc("off_chz2"(#loc39)) +#loc345 = loc("Q2"(#loc40)) +#loc346 = loc("DO2"(#loc41)) +#loc347 = loc("DQ2"(#loc42)) +#loc348 = loc("LSE2"(#loc43)) +#loc349 = loc("DELTA2"(#loc44)) +#loc350 = loc("start_m2"(#loc45)) +#loc351 = loc("offs_m2"(#loc46)) +#loc352 = loc("ptr"(#loc47)) +#loc353 = loc("q"(#loc48)) +#loc354 = loc("ptr"(#loc49)) +#loc355 = loc("ptr"(#loc50)) +#loc356 = loc("ptr"(#loc51)) +#loc357 = loc("ptr"(#loc52)) +#loc358 = loc("do"(#loc54)) +#loc359 = loc("Di"(#loc55)) +#loc360 = loc("Di"(#loc56)) +#loc361 = loc("lse"(#loc57)) +#loc362 = loc("lse"(#loc58)) +#loc363 = loc("lse"(#loc59)) +#loc364 = loc("lse"(#loc60)) +#loc365 = loc("lse"(#loc61)) +#loc366 = loc("kv_indices"(#loc62)) +#loc367 = loc("kv_start"(#loc63)) +#loc368 = loc("kv_start"(#loc64)) +#loc369 = loc("sparse_kv_num_blocks"(#loc65)) +#loc370 = loc("sparse_kv_num_blocks"(#loc66)) +#loc371 = loc("offs_n2"(#loc67)) +#loc372 = loc("offs_n2"(#loc68)) +#loc373 = loc("kT_ptrs"(#loc69)) +#loc374 = loc("dq"(#loc70)) +#loc375 = loc("kT_ptrs"(#loc71)) +#loc376 = loc("kT_ptrs"(#loc72)) +#loc377 = loc("kT_ptrs"(#loc73)) +#loc378 = loc("kT_ptrs"(#loc74)) +#loc379 = loc("vT_ptrs"(#loc75)) +#loc380 = loc("vT_ptrs"(#loc76)) +#loc381 = loc("hi"(#loc77)) +#loc382 = loc("hi"(#loc78)) +#loc383 = loc("dq"(#loc79)) +#loc384 = loc("kT"(#loc80)) +#loc385 = loc("dq"(#loc81)) +#loc386 = loc("qk"(#loc82)) +#loc387 = loc("qk"(#loc83)) +#loc388 = loc("n"(#loc84)) +#loc389 = loc("tmp4"(#loc85)) +#loc390 = loc("tmp5"(#loc86)) +#loc391 = loc("tmp7"(#loc87)) +#loc392 = loc("tmp7"(#loc88)) +#loc393 = loc("tmp8"(#loc89)) +#loc394 = loc("tmp9"(#loc90)) +#loc395 = loc("tmp10"(#loc91)) +#loc396 = loc("tmp11"(#loc92)) +#loc397 = loc("tmp12"(#loc93)) +#loc398 = loc("tmp15"(#loc94)) +#loc399 = loc("tmp16"(#loc95)) +#loc400 = loc("tmp18"(#loc96)) +#loc401 = loc("tmp19"(#loc97)) +#loc402 = loc("tmp22"(#loc98)) +#loc403 = loc("tmp23"(#loc99)) +#loc404 = loc("tmp24"(#loc100)) +#loc405 = loc("tmp25"(#loc101)) +#loc406 = loc("tmp26"(#loc102)) +#loc407 = loc("tmp27"(#loc103)) +#loc408 = loc("tmp28"(#loc104)) +#loc409 = loc("tmp29"(#loc105)) +#loc410 = loc("tmp30"(#loc106)) +#loc411 = loc("tmp31"(#loc107)) +#loc412 = loc("tmp33"(#loc108)) +#loc413 = loc("tmp34"(#loc109)) +#loc414 = loc("tmp35"(#loc110)) +#loc415 = loc("tmp36"(#loc111)) +#loc416 = loc("tmp37"(#loc112)) +#loc417 = loc("tmp38"(#loc113)) +#loc418 = loc("post_mod_scores"(#loc114)) +#loc419 = loc("post_mod_scores"(#loc115)) +#loc420 = loc("p"(#loc116)) +#loc421 = loc("p"(#loc117)) +#loc422 = loc("vT"(#loc118)) +#loc423 = loc("dp"(#loc119)) +#loc424 = loc("ds"(#loc120)) +#loc425 = loc("ds"(#loc121)) +#loc426 = loc("ds"(#loc122)) +#loc427 = loc("ds"(#loc123)) +#loc428 = loc("ds"(#loc124)) +#loc429 = loc("dq"(#loc125)) +#loc430 = loc("dq"(#loc126)) +#loc431 = loc("cur_block_idx"(#loc127)) +#loc432 = loc("offset"(#loc128)) +#loc433 = loc("cur_block"(#loc129)) +#loc434 = loc("cur_block"(#loc130)) +#loc435 = loc("next_block"(#loc131)) +#loc436 = loc("next_block"(#loc132)) +#loc437 = loc("next_block"(#loc133)) +#loc438 = loc("next_block"(#loc134)) +#loc439 = loc("needs_jump"(#loc135)) +#loc440 = loc("needs_jump"(#loc136)) +#loc441 = loc("needs_jump"(#loc137)) +#loc442 = loc("jump_to_block"(#loc138)) +#loc443 = loc("jump_to_block"(#loc139)) +#loc444 = loc("jump_to_block"(#loc140)) +#loc445 = loc("offset"(#loc141)) +#loc446 = loc("offset"(#loc142)) +#loc447 = loc("offset"(#loc143)) +#loc448 = loc("offset"(#loc144)) +#loc449 = loc("kT_ptrs"(#loc145)) +#loc450 = loc("kT_ptrs"(#loc146)) +#loc451 = loc("vT_ptrs"(#loc147)) +#loc452 = loc("offs_n2"(#loc148)) +#loc453 = loc("kv_indices"(#loc150)) +#loc454 = loc("kv_start"(#loc151)) +#loc455 = loc("kv_start"(#loc152)) +#loc456 = loc("sparse_kv_num_blocks"(#loc153)) +#loc457 = loc("sparse_kv_num_blocks"(#loc154)) +#loc458 = loc("offs_n2"(#loc155)) +#loc459 = loc("dq"(#loc156)) +#loc460 = loc("dq_ptrs"(#loc157)) +#loc461 = loc("dq_ptrs"(#loc158)) +#loc462 = loc("dq"(#loc159)) +#loc463 = loc("start_n1"(#loc161)) +#loc464 = loc("offs_n1"(#loc162)) +#loc465 = loc("k"(#loc163)) +#loc466 = loc("v"(#loc164)) +#loc467 = loc("dv"(#loc165)) +#loc468 = loc("off_hq1"(#loc166)) +#loc469 = loc("off_hq1"(#loc167)) +#loc470 = loc("q_adj1"(#loc168)) +#loc471 = loc("q_adj1"(#loc169)) +#loc472 = loc("q_adj1"(#loc170)) +#loc473 = loc("q_adj1"(#loc171)) +#loc474 = loc("do_adj1"(#loc172)) +#loc475 = loc("do_adj1"(#loc173)) +#loc476 = loc("do_adj1"(#loc174)) +#loc477 = loc("off_chz1"(#loc175)) +#loc478 = loc("off_chz1"(#loc176)) +#loc479 = loc("off_chz1"(#loc177)) +#loc480 = loc("off_chz1"(#loc178)) +#loc481 = loc("Q1"(#loc179)) +#loc482 = loc("DO1"(#loc180)) +#loc483 = loc("LSE1"(#loc181)) +#loc484 = loc("DELTA1"(#loc182)) +#loc485 = loc("sparse_q_num_blks_offset"(#loc183)) +#loc486 = loc("sparse_q_num_blks_offset"(#loc184)) +#loc487 = loc("sparse_q_idx_offset"(#loc185)) +#loc488 = loc("sparse_q_idx_offset"(#loc186)) +#loc489 = loc("sparse_q_idx_offset"(#loc187)) +#loc490 = loc("q_indices"(#loc188)) +#loc491 = loc("q_start"(#loc189)) +#loc492 = loc("q_start"(#loc190)) +#loc493 = loc("sparse_q_num_blocks"(#loc191)) +#loc494 = loc("sparse_q_num_blocks"(#loc192)) +#loc495 = loc("offs_m1"(#loc193)) +#loc496 = loc("offs_m1"(#loc194)) +#loc497 = loc("qT_ptrs"(#loc195)) +#loc498 = loc("qT_ptrs"(#loc197)) +#loc499 = loc("qT_ptrs"(#loc198)) +#loc500 = loc("qT_ptrs"(#loc199)) +#loc501 = loc("qT_ptrs"(#loc200)) +#loc502 = loc("do_ptrs"(#loc201)) +#loc503 = loc("do_ptrs"(#loc202)) +#loc504 = loc("do_ptrs"(#loc203)) +#loc505 = loc("do_ptrs"(#loc204)) +#loc506 = loc("hi"(#loc205)) +#loc507 = loc("hi"(#loc206)) +#loc508 = loc("dk"(#loc207)) +#loc509 = loc("qT"(#loc208)) +#loc510 = loc(callsite(#loc209 at #loc196)) +#loc511 = loc("lse"(#loc210)) +#loc512 = loc("lse"(#loc211)) +#loc513 = loc("lse"(#loc212)) +#loc514 = loc("lse"(#loc213)) +#loc515 = loc("qkT"(#loc214)) +#loc516 = loc("qkT"(#loc215)) +#loc517 = loc("m"(#loc216)) +#loc518 = loc("tmp44"(#loc217)) +#loc519 = loc("tmp45"(#loc218)) +#loc520 = loc("tmp47"(#loc219)) +#loc521 = loc("tmp47"(#loc220)) +#loc522 = loc("tmp48"(#loc221)) +#loc523 = loc("tmp49"(#loc222)) +#loc524 = loc("tmp50"(#loc223)) +#loc525 = loc("tmp51"(#loc224)) +#loc526 = loc("tmp52"(#loc225)) +#loc527 = loc("tmp55"(#loc226)) +#loc528 = loc("tmp56"(#loc227)) +#loc529 = loc("tmp58"(#loc228)) +#loc530 = loc("tmp59"(#loc229)) +#loc531 = loc("tmp62"(#loc230)) +#loc532 = loc("tmp63"(#loc231)) +#loc533 = loc("tmp64"(#loc232)) +#loc534 = loc("tmp65"(#loc233)) +#loc535 = loc("tmp66"(#loc234)) +#loc536 = loc("tmp67"(#loc235)) +#loc537 = loc("tmp68"(#loc236)) +#loc538 = loc("tmp69"(#loc237)) +#loc539 = loc("tmp70"(#loc238)) +#loc540 = loc("tmp71"(#loc239)) +#loc541 = loc("tmp73"(#loc240)) +#loc542 = loc("tmp74"(#loc241)) +#loc543 = loc("tmp75"(#loc242)) +#loc544 = loc("tmp76"(#loc243)) +#loc545 = loc("tmp77"(#loc244)) +#loc546 = loc("tmp78"(#loc245)) +#loc547 = loc("post_mod_scores"(#loc246)) +#loc548 = loc("post_mod_scores"(#loc247)) +#loc549 = loc("pT"(#loc248)) +#loc550 = loc("pT"(#loc249)) +#loc551 = loc("pT"(#loc250)) +#loc552 = loc("do"(#loc251)) +#loc553 = loc("dv"(#loc252)) +#loc554 = loc("dv"(#loc253)) +#loc555 = loc("Di"(#loc254)) +#loc556 = loc("Di"(#loc255)) +#loc557 = loc("dpT"(#loc256)) +#loc558 = loc("dpT"(#loc257)) +#loc559 = loc("dsT"(#loc258)) +#loc560 = loc("dsT"(#loc259)) +#loc561 = loc("dsT"(#loc260)) +#loc562 = loc("dsT"(#loc261)) +#loc563 = loc("dk"(#loc262)) +#loc564 = loc("dk"(#loc263)) +#loc565 = loc("dk"(#loc264)) +#loc566 = loc("offset"(#loc265)) +#loc567 = loc("qT_ptrs"(#loc266)) +#loc568 = loc("qT_ptrs"(#loc267)) +#loc569 = loc("do_ptrs"(#loc268)) +#loc570 = loc("do_ptrs"(#loc269)) +#loc571 = loc("offs_m1"(#loc270)) +#loc572 = loc(callsite(#loc271 at #loc196)) +#loc573 = loc("q_indices"(#loc272)) +#loc574 = loc("q_start"(#loc273)) +#loc575 = loc("q_start"(#loc274)) +#loc576 = loc("sparse_q_num_blocks"(#loc275)) +#loc577 = loc("sparse_q_num_blocks"(#loc276)) +#loc578 = loc("offs_m1"(#loc277)) +#loc579 = loc(callsite(#loc209 at #loc278)) +#loc580 = loc(callsite(#loc271 at #loc278)) +#loc581 = loc("dv_ptrs"(#loc280)) +#loc582 = loc("dv_ptrs"(#loc281)) +#loc583 = loc("dk"(#loc283)) +#loc584 = loc("mask"(#loc284)) +#loc585 = loc("xindex"(#loc285)) +#loc586 = loc("xindex"(#loc286)) +#loc587 = loc("xindex"(#loc287)) +#loc588 = loc(callsite(#loc352 at #loc353)) +#loc589 = loc(callsite(#loc354 at #loc353)) +#loc590 = loc(callsite(#loc355 at #loc353)) +#loc591 = loc(callsite(#loc356 at #loc353)) +#loc592 = loc(callsite(#loc357 at #loc353)) +#loc593 = loc(callsite(#loc53 at #loc353)) +#loc594 = loc(callsite(#loc354 at #loc358)) +#loc595 = loc(callsite(#loc355 at #loc358)) +#loc596 = loc(callsite(#loc357 at #loc358)) +#loc597 = loc(callsite(#loc53 at #loc358)) +#loc598 = loc(callsite(#loc373 at #loc374)) +#loc599 = loc(callsite(#loc375 at #loc374)) +#loc600 = loc(callsite(#loc376 at #loc374)) +#loc601 = loc(callsite(#loc377 at #loc374)) +#loc602 = loc(callsite(#loc378 at #loc374)) +#loc603 = loc(callsite(#loc379 at #loc374)) +#loc604 = loc(callsite(#loc380 at #loc374)) +#loc605 = loc(callsite(#loc381 at #loc374)) +#loc606 = loc(callsite(#loc382 at #loc374)) +#loc607 = loc("offs_n2"(#loc383)) +#loc608 = loc(callsite(#loc385 at #loc374)) +#loc609 = loc(callsite(#loc432 at #loc374)) +#loc610 = loc(callsite(#loc449 at #loc374)) +#loc611 = loc(callsite(#loc450 at #loc374)) +#loc612 = loc(callsite(#loc451 at #loc374)) +#loc613 = loc(callsite(#loc452 at #loc374)) +#loc614 = loc(callsite(#loc149 at #loc374)) +#loc615 = loc(callsite(#loc373 at #loc459)) +#loc616 = loc(callsite(#loc375 at #loc459)) +#loc617 = loc(callsite(#loc376 at #loc459)) +#loc618 = loc(callsite(#loc378 at #loc459)) +#loc619 = loc(callsite(#loc379 at #loc459)) +#loc620 = loc(callsite(#loc380 at #loc459)) +#loc621 = loc(callsite(#loc381 at #loc459)) +#loc622 = loc(callsite(#loc382 at #loc459)) +#loc623 = loc(callsite(#loc385 at #loc459)) +#loc624 = loc(callsite(#loc432 at #loc459)) +#loc625 = loc(callsite(#loc449 at #loc459)) +#loc626 = loc(callsite(#loc450 at #loc459)) +#loc627 = loc(callsite(#loc451 at #loc459)) +#loc628 = loc(callsite(#loc452 at #loc459)) +#loc629 = loc(callsite(#loc149 at #loc459)) +#loc630 = loc(callsite(#loc352 at #loc465)) +#loc631 = loc(callsite(#loc354 at #loc465)) +#loc632 = loc(callsite(#loc355 at #loc465)) +#loc633 = loc(callsite(#loc356 at #loc465)) +#loc634 = loc(callsite(#loc357 at #loc465)) +#loc635 = loc(callsite(#loc53 at #loc465)) +#loc636 = loc(callsite(#loc355 at #loc466)) +#loc637 = loc(callsite(#loc357 at #loc466)) +#loc638 = loc(callsite(#loc53 at #loc466)) +#loc639 = loc("dk"(#loc467)) +#loc640 = loc(callsite(#loc497 at #loc196)) +#loc641 = loc(callsite(#loc498 at #loc196)) +#loc642 = loc(callsite(#loc499 at #loc196)) +#loc643 = loc(callsite(#loc500 at #loc196)) +#loc644 = loc(callsite(#loc501 at #loc196)) +#loc645 = loc(callsite(#loc502 at #loc196)) +#loc646 = loc(callsite(#loc503 at #loc196)) +#loc647 = loc(callsite(#loc504 at #loc196)) +#loc648 = loc(callsite(#loc505 at #loc196)) +#loc649 = loc(callsite(#loc506 at #loc196)) +#loc650 = loc(callsite(#loc507 at #loc196)) +#loc651 = loc("dv"(#loc508)) +#loc652 = loc(callsite(#loc509 at #loc510)) +#loc653 = loc(callsite(#loc511 at #loc510)) +#loc654 = loc(callsite(#loc512 at #loc510)) +#loc655 = loc(callsite(#loc513 at #loc510)) +#loc656 = loc(callsite(#loc514 at #loc510)) +#loc657 = loc(callsite(#loc515 at #loc510)) +#loc658 = loc(callsite(#loc516 at #loc510)) +#loc659 = loc(callsite(#loc517 at #loc510)) +#loc660 = loc(callsite(#loc518 at #loc510)) +#loc661 = loc(callsite(#loc519 at #loc510)) +#loc662 = loc(callsite(#loc520 at #loc510)) +#loc663 = loc(callsite(#loc521 at #loc510)) +#loc664 = loc(callsite(#loc522 at #loc510)) +#loc665 = loc(callsite(#loc523 at #loc510)) +#loc666 = loc(callsite(#loc524 at #loc510)) +#loc667 = loc(callsite(#loc525 at #loc510)) +#loc668 = loc(callsite(#loc526 at #loc510)) +#loc669 = loc(callsite(#loc527 at #loc510)) +#loc670 = loc(callsite(#loc528 at #loc510)) +#loc671 = loc(callsite(#loc529 at #loc510)) +#loc672 = loc(callsite(#loc530 at #loc510)) +#loc673 = loc(callsite(#loc531 at #loc510)) +#loc674 = loc(callsite(#loc532 at #loc510)) +#loc675 = loc(callsite(#loc533 at #loc510)) +#loc676 = loc(callsite(#loc534 at #loc510)) +#loc677 = loc(callsite(#loc535 at #loc510)) +#loc678 = loc(callsite(#loc536 at #loc510)) +#loc679 = loc(callsite(#loc537 at #loc510)) +#loc680 = loc(callsite(#loc538 at #loc510)) +#loc681 = loc(callsite(#loc539 at #loc510)) +#loc682 = loc(callsite(#loc540 at #loc510)) +#loc683 = loc(callsite(#loc541 at #loc510)) +#loc684 = loc(callsite(#loc542 at #loc510)) +#loc685 = loc(callsite(#loc543 at #loc510)) +#loc686 = loc(callsite(#loc544 at #loc510)) +#loc687 = loc(callsite(#loc545 at #loc510)) +#loc688 = loc(callsite(#loc546 at #loc510)) +#loc689 = loc(callsite(#loc547 at #loc510)) +#loc690 = loc(callsite(#loc548 at #loc510)) +#loc691 = loc(callsite(#loc549 at #loc510)) +#loc692 = loc(callsite(#loc550 at #loc510)) +#loc693 = loc(callsite(#loc551 at #loc510)) +#loc694 = loc(callsite(#loc552 at #loc510)) +#loc695 = loc(callsite(#loc553 at #loc510)) +#loc696 = loc(callsite(#loc554 at #loc510)) +#loc697 = loc(callsite(#loc555 at #loc510)) +#loc698 = loc(callsite(#loc556 at #loc510)) +#loc699 = loc(callsite(#loc557 at #loc510)) +#loc700 = loc(callsite(#loc558 at #loc510)) +#loc701 = loc(callsite(#loc559 at #loc510)) +#loc702 = loc(callsite(#loc560 at #loc510)) +#loc703 = loc(callsite(#loc561 at #loc510)) +#loc704 = loc(callsite(#loc562 at #loc510)) +#loc705 = loc(callsite(#loc563 at #loc510)) +#loc706 = loc(callsite(#loc564 at #loc510)) +#loc707 = loc(callsite(#loc565 at #loc510)) +#loc708 = loc(callsite(#loc566 at #loc196)) +#loc709 = loc(callsite(#loc567 at #loc196)) +#loc710 = loc(callsite(#loc568 at #loc196)) +#loc711 = loc(callsite(#loc569 at #loc196)) +#loc712 = loc(callsite(#loc570 at #loc196)) +#loc713 = loc(callsite(#loc571 at #loc196)) +#loc714 = loc(callsite(#loc497 at #loc278)) +#loc715 = loc(callsite(#loc498 at #loc278)) +#loc716 = loc(callsite(#loc499 at #loc278)) +#loc717 = loc(callsite(#loc501 at #loc278)) +#loc718 = loc(callsite(#loc502 at #loc278)) +#loc719 = loc(callsite(#loc503 at #loc278)) +#loc720 = loc(callsite(#loc504 at #loc278)) +#loc721 = loc(callsite(#loc505 at #loc278)) +#loc722 = loc(callsite(#loc506 at #loc278)) +#loc723 = loc(callsite(#loc507 at #loc278)) +#loc724 = loc(callsite(#loc509 at #loc579)) +#loc725 = loc(callsite(#loc511 at #loc579)) +#loc726 = loc(callsite(#loc512 at #loc579)) +#loc727 = loc(callsite(#loc513 at #loc579)) +#loc728 = loc(callsite(#loc514 at #loc579)) +#loc729 = loc(callsite(#loc515 at #loc579)) +#loc730 = loc(callsite(#loc516 at #loc579)) +#loc731 = loc(callsite(#loc548 at #loc579)) +#loc732 = loc(callsite(#loc549 at #loc579)) +#loc733 = loc(callsite(#loc550 at #loc579)) +#loc734 = loc(callsite(#loc551 at #loc579)) +#loc735 = loc(callsite(#loc552 at #loc579)) +#loc736 = loc(callsite(#loc553 at #loc579)) +#loc737 = loc(callsite(#loc554 at #loc579)) +#loc738 = loc(callsite(#loc555 at #loc579)) +#loc739 = loc(callsite(#loc556 at #loc579)) +#loc740 = loc(callsite(#loc557 at #loc579)) +#loc741 = loc(callsite(#loc558 at #loc579)) +#loc742 = loc(callsite(#loc559 at #loc579)) +#loc743 = loc(callsite(#loc560 at #loc579)) +#loc744 = loc(callsite(#loc561 at #loc579)) +#loc745 = loc(callsite(#loc563 at #loc579)) +#loc746 = loc(callsite(#loc564 at #loc579)) +#loc747 = loc(callsite(#loc565 at #loc579)) +#loc748 = loc(callsite(#loc566 at #loc278)) +#loc749 = loc(callsite(#loc567 at #loc278)) +#loc750 = loc(callsite(#loc568 at #loc278)) +#loc751 = loc(callsite(#loc569 at #loc278)) +#loc752 = loc(callsite(#loc570 at #loc278)) +#loc753 = loc(callsite(#loc571 at #loc278)) +#loc754 = loc("kT_ptrs"(#loc607)) +#loc755 = loc(callsite(#loc384 at #loc608)) +#loc756 = loc(callsite(#loc386 at #loc608)) +#loc757 = loc(callsite(#loc387 at #loc608)) +#loc758 = loc(callsite(#loc388 at #loc608)) +#loc759 = loc(callsite(#loc389 at #loc608)) +#loc760 = loc(callsite(#loc390 at #loc608)) +#loc761 = loc(callsite(#loc391 at #loc608)) +#loc762 = loc(callsite(#loc392 at #loc608)) +#loc763 = loc(callsite(#loc393 at #loc608)) +#loc764 = loc(callsite(#loc394 at #loc608)) +#loc765 = loc(callsite(#loc395 at #loc608)) +#loc766 = loc(callsite(#loc396 at #loc608)) +#loc767 = loc(callsite(#loc397 at #loc608)) +#loc768 = loc(callsite(#loc398 at #loc608)) +#loc769 = loc(callsite(#loc399 at #loc608)) +#loc770 = loc(callsite(#loc400 at #loc608)) +#loc771 = loc(callsite(#loc401 at #loc608)) +#loc772 = loc(callsite(#loc402 at #loc608)) +#loc773 = loc(callsite(#loc403 at #loc608)) +#loc774 = loc(callsite(#loc404 at #loc608)) +#loc775 = loc(callsite(#loc405 at #loc608)) +#loc776 = loc(callsite(#loc406 at #loc608)) +#loc777 = loc(callsite(#loc407 at #loc608)) +#loc778 = loc(callsite(#loc408 at #loc608)) +#loc779 = loc(callsite(#loc409 at #loc608)) +#loc780 = loc(callsite(#loc410 at #loc608)) +#loc781 = loc(callsite(#loc411 at #loc608)) +#loc782 = loc(callsite(#loc412 at #loc608)) +#loc783 = loc(callsite(#loc413 at #loc608)) +#loc784 = loc(callsite(#loc414 at #loc608)) +#loc785 = loc(callsite(#loc415 at #loc608)) +#loc786 = loc(callsite(#loc416 at #loc608)) +#loc787 = loc(callsite(#loc417 at #loc608)) +#loc788 = loc(callsite(#loc418 at #loc608)) +#loc789 = loc(callsite(#loc419 at #loc608)) +#loc790 = loc(callsite(#loc420 at #loc608)) +#loc791 = loc(callsite(#loc421 at #loc608)) +#loc792 = loc(callsite(#loc422 at #loc608)) +#loc793 = loc(callsite(#loc423 at #loc608)) +#loc794 = loc(callsite(#loc424 at #loc608)) +#loc795 = loc(callsite(#loc425 at #loc608)) +#loc796 = loc(callsite(#loc426 at #loc608)) +#loc797 = loc(callsite(#loc427 at #loc608)) +#loc798 = loc(callsite(#loc428 at #loc608)) +#loc799 = loc(callsite(#loc429 at #loc608)) +#loc800 = loc(callsite(#loc430 at #loc608)) +#loc801 = loc(callsite(#loc431 at #loc609)) +#loc802 = loc(callsite(#loc433 at #loc609)) +#loc803 = loc(callsite(#loc434 at #loc609)) +#loc804 = loc(callsite(#loc435 at #loc609)) +#loc805 = loc(callsite(#loc436 at #loc609)) +#loc806 = loc(callsite(#loc437 at #loc609)) +#loc807 = loc(callsite(#loc438 at #loc609)) +#loc808 = loc(callsite(#loc439 at #loc609)) +#loc809 = loc(callsite(#loc440 at #loc609)) +#loc810 = loc(callsite(#loc441 at #loc609)) +#loc811 = loc(callsite(#loc442 at #loc609)) +#loc812 = loc(callsite(#loc443 at #loc609)) +#loc813 = loc(callsite(#loc444 at #loc609)) +#loc814 = loc(callsite(#loc445 at #loc609)) +#loc815 = loc(callsite(#loc446 at #loc609)) +#loc816 = loc(callsite(#loc447 at #loc609)) +#loc817 = loc(callsite(#loc448 at #loc609)) +#loc818 = loc(callsite(#loc384 at #loc623)) +#loc819 = loc(callsite(#loc386 at #loc623)) +#loc820 = loc(callsite(#loc387 at #loc623)) +#loc821 = loc(callsite(#loc419 at #loc623)) +#loc822 = loc(callsite(#loc420 at #loc623)) +#loc823 = loc(callsite(#loc421 at #loc623)) +#loc824 = loc(callsite(#loc422 at #loc623)) +#loc825 = loc(callsite(#loc423 at #loc623)) +#loc826 = loc(callsite(#loc424 at #loc623)) +#loc827 = loc(callsite(#loc425 at #loc623)) +#loc828 = loc(callsite(#loc426 at #loc623)) +#loc829 = loc(callsite(#loc428 at #loc623)) +#loc830 = loc(callsite(#loc429 at #loc623)) +#loc831 = loc(callsite(#loc430 at #loc623)) +#loc832 = loc(callsite(#loc431 at #loc624)) +#loc833 = loc(callsite(#loc433 at #loc624)) +#loc834 = loc(callsite(#loc434 at #loc624)) +#loc835 = loc(callsite(#loc435 at #loc624)) +#loc836 = loc(callsite(#loc436 at #loc624)) +#loc837 = loc(callsite(#loc437 at #loc624)) +#loc838 = loc(callsite(#loc438 at #loc624)) +#loc839 = loc(callsite(#loc439 at #loc624)) +#loc840 = loc(callsite(#loc440 at #loc624)) +#loc841 = loc(callsite(#loc441 at #loc624)) +#loc842 = loc(callsite(#loc442 at #loc624)) +#loc843 = loc(callsite(#loc443 at #loc624)) +#loc844 = loc(callsite(#loc444 at #loc624)) +#loc845 = loc(callsite(#loc445 at #loc624)) +#loc846 = loc(callsite(#loc446 at #loc624)) +#loc847 = loc(callsite(#loc447 at #loc624)) +#loc848 = loc(callsite(#loc448 at #loc624)) +#loc849 = loc("offs_m1"(#loc651)) +#loc850 = loc(callsite(#loc53 at #loc652)) +#loc851 = loc(callsite(#loc53 at #loc694)) +#loc852 = loc(callsite(#loc431 at #loc708)) +#loc853 = loc(callsite(#loc433 at #loc708)) +#loc854 = loc(callsite(#loc434 at #loc708)) +#loc855 = loc(callsite(#loc435 at #loc708)) +#loc856 = loc(callsite(#loc436 at #loc708)) +#loc857 = loc(callsite(#loc437 at #loc708)) +#loc858 = loc(callsite(#loc438 at #loc708)) +#loc859 = loc(callsite(#loc439 at #loc708)) +#loc860 = loc(callsite(#loc440 at #loc708)) +#loc861 = loc(callsite(#loc441 at #loc708)) +#loc862 = loc(callsite(#loc442 at #loc708)) +#loc863 = loc(callsite(#loc443 at #loc708)) +#loc864 = loc(callsite(#loc444 at #loc708)) +#loc865 = loc(callsite(#loc445 at #loc708)) +#loc866 = loc(callsite(#loc446 at #loc708)) +#loc867 = loc(callsite(#loc447 at #loc708)) +#loc868 = loc(callsite(#loc448 at #loc708)) +#loc869 = loc(callsite(#loc53 at #loc724)) +#loc870 = loc(callsite(#loc53 at #loc735)) +#loc871 = loc(callsite(#loc431 at #loc748)) +#loc872 = loc(callsite(#loc433 at #loc748)) +#loc873 = loc(callsite(#loc434 at #loc748)) +#loc874 = loc(callsite(#loc435 at #loc748)) +#loc875 = loc(callsite(#loc436 at #loc748)) +#loc876 = loc(callsite(#loc437 at #loc748)) +#loc877 = loc(callsite(#loc438 at #loc748)) +#loc878 = loc(callsite(#loc439 at #loc748)) +#loc879 = loc(callsite(#loc440 at #loc748)) +#loc880 = loc(callsite(#loc441 at #loc748)) +#loc881 = loc(callsite(#loc442 at #loc748)) +#loc882 = loc(callsite(#loc443 at #loc748)) +#loc883 = loc(callsite(#loc444 at #loc748)) +#loc884 = loc(callsite(#loc445 at #loc748)) +#loc885 = loc(callsite(#loc446 at #loc748)) +#loc886 = loc(callsite(#loc447 at #loc748)) +#loc887 = loc(callsite(#loc448 at #loc748)) +#loc888 = loc("vT_ptrs"(#loc754)) +#loc889 = loc(callsite(#loc53 at #loc755)) +#loc890 = loc(callsite(#loc53 at #loc792)) +#loc891 = loc(callsite(#loc53 at #loc818)) +#loc892 = loc(callsite(#loc53 at #loc824)) +#loc893 = loc("qT_ptrs"(#loc849)) +#loc894 = loc(callsite(#loc888 at #loc374)) +#loc895 = loc(callsite(#loc888 at #loc459)) +#loc896 = loc("do_ptrs"(#loc893)) +#loc897 = loc(callsite(#loc896 at #loc196)) +#loc898 = loc(callsite(#loc896 at #loc278)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9fdb87d3a561fa325ea95155c477fb6d25a6e12 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/__grp__triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source", "triton_red_fused__to_copy_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir", "triton_red_fused__to_copy_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir", "triton_red_fused__to_copy_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir", "triton_red_fused__to_copy_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx", "triton_red_fused__to_copy_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin", "triton_red_fused__to_copy_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ec3d7a2647d53dcb0df52698f17d763381728f18 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f737489312fbe2bdc5a3a555c46a66a359a2eef3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.json @@ -0,0 +1 @@ +{"hash": "3de7644d9cd8133c07031313b905c396b634b96b77247524ade62d0919a3570b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..1ad73d30fcb137a67ad0dda164e2354a4b16889a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %.fr5 = freeze i32 %4, !dbg !8 + %10 = icmp slt i32 %9, %.fr5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 15, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %14 = icmp sgt i32 %5, 0, !dbg !11 + br i1 %14, label %.lr.ph, label %._crit_edge, !dbg !11 + +.lr.ph: ; preds = %8 + %15 = mul i64 %2, %13, !dbg !10 + %16 = getelementptr i32, ptr addrspace(1) %0, i64 %15 + br i1 %10, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %17 = phi i32 [ %23, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %18 = or disjoint i32 %17, %12, !dbg !12 + %19 = sext i32 %18 to i64, !dbg !13 + %20 = getelementptr i32, ptr addrspace(1) %16, i64 %19, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %22 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %20, i64 %21, i1 false) #3, !dbg !15 + %23 = add i32 %17, 16, !dbg !11 + %24 = icmp slt i32 %23, %5, !dbg !11 + br i1 %24, label %.lr.ph.split.us, label %._crit_edge, !dbg !11 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %25 = phi i64 [ %33, %.lr.ph.split ], [ 0, %.lr.ph ] + %26 = phi i32 [ %34, %.lr.ph.split ], [ 0, %.lr.ph ] + %27 = or disjoint i32 %26, %12, !dbg !12 + %28 = icmp slt i32 %27, %5, !dbg !16 + %29 = sext i32 %27 to i64, !dbg !13 + %30 = getelementptr i32, ptr addrspace(1) %16, i64 %29, !dbg !14 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #3, !dbg !15 + %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %30, i64 %31, i1 %28) #3, !dbg !15 + %narrow = select i1 %28, i32 %32, i32 0, !dbg !17 + %spec.select = sext i32 %narrow to i64, !dbg !17 + %33 = add i64 %25, %spec.select, !dbg !17 + %34 = add i32 %26, 16, !dbg !11 + %35 = icmp slt i32 %34, %5, !dbg !11 + br i1 %35, label %.lr.ph.split, label %._crit_edge, !dbg !11 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %.lcssa = phi i64 [ 0, %8 ], [ %33, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !18 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !19 + %36 = trunc nuw i64 %extelt.offset to i32, !dbg !19 + %37 = trunc i64 %.lcssa to i32, !dbg !19 + %38 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %37, i32 8, i32 31), !dbg !19 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 8, i32 31), !dbg !19 + %40 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !19 + %41 = insertelement <2 x i32> %40, i32 %39, i64 1, !dbg !19 + %42 = bitcast <2 x i32> %41 to i64, !dbg !19 + %43 = add i64 %.lcssa, %42, !dbg !23 + %extelt.offset2 = lshr i64 %43, 32, !dbg !19 + %44 = trunc nuw i64 %extelt.offset2 to i32, !dbg !19 + %45 = trunc i64 %43 to i32, !dbg !19 + %46 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %45, i32 4, i32 31), !dbg !19 + %47 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !19 + %48 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !19 + %49 = insertelement <2 x i32> %48, i32 %47, i64 1, !dbg !19 + %50 = bitcast <2 x i32> %49 to i64, !dbg !19 + %51 = add i64 %43, %50, !dbg !23 + %extelt.offset3 = lshr i64 %51, 32, !dbg !19 + %52 = trunc nuw i64 %extelt.offset3 to i32, !dbg !19 + %53 = trunc i64 %51 to i32, !dbg !19 + %54 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 2, i32 31), !dbg !19 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 2, i32 31), !dbg !19 + %56 = insertelement <2 x i32> poison, i32 %54, i64 0, !dbg !19 + %57 = insertelement <2 x i32> %56, i32 %55, i64 1, !dbg !19 + %58 = bitcast <2 x i32> %57 to i64, !dbg !19 + %59 = add i64 %51, %58, !dbg !23 + %extelt.offset4 = lshr i64 %59, 32, !dbg !19 + %60 = trunc nuw i64 %extelt.offset4 to i32, !dbg !19 + %61 = trunc i64 %59 to i32, !dbg !19 + %62 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %61, i32 1, i32 31), !dbg !19 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 1, i32 31), !dbg !19 + %64 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !19 + %65 = insertelement <2 x i32> %64, i32 %63, i64 1, !dbg !19 + %66 = bitcast <2 x i32> %65 to i64, !dbg !19 + %67 = add i64 %59, %66, !dbg !23 + %.frozen = freeze i64 %3, !dbg !24 + %68 = sdiv i64 %13, %.frozen, !dbg !24 + %69 = mul i64 %68, %.frozen, !dbg !25 + %.decomposed = sub i64 %13, %69, !dbg !25 + %70 = trunc i64 %67 to i32, !dbg !26 + %71 = icmp slt i64 %3, 2, !dbg !27 + %72 = icmp sgt i64 %3, 1, !dbg !28 + %73 = select i1 %72, i64 %3, i64 0, !dbg !29 + %74 = zext i1 %71 to i64, !dbg !30 + %75 = add i64 %73, %74, !dbg !31 + %76 = mul i64 %68, %75, !dbg !32 + %77 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !33 + %78 = getelementptr i32, ptr addrspace(1) %77, i64 %76, !dbg !33 + %79 = and i32 %11, 63, !dbg !34 + %80 = icmp eq i32 %79, 0, !dbg !34 + %81 = and i1 %80, %10, !dbg !34 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %70, ptr addrspace(1) %78, i1 %81) #3, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_sum_2", linkageName: "triton_red_fused__to_copy_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 21, scope: !4) +!9 = !DILocation(line: 24, column: 37, scope: !4) +!10 = !DILocation(line: 34, column: 45, scope: !4) +!11 = !DILocation(line: 28, column: 40, scope: !4) +!12 = !DILocation(line: 29, column: 31, scope: !4) +!13 = !DILocation(line: 34, column: 41, scope: !4) +!14 = !DILocation(line: 34, column: 34, scope: !4) +!15 = !DILocation(line: 34, column: 50, scope: !4) +!16 = !DILocation(line: 30, column: 29, scope: !4) +!17 = !DILocation(line: 38, column: 48, scope: !4) +!18 = !DILocation(line: 27, column: 43, scope: !4) +!19 = !DILocation(line: 291, column: 36, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 39, column: 25, scope: !4) +!23 = !DILocation(line: 261, column: 15, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 41, column: 19, scope: !4) +!25 = !DILocation(line: 40, column: 19, scope: !4) +!26 = !DILocation(line: 42, column: 19, scope: !4) +!27 = !DILocation(line: 43, column: 49, scope: !4) +!28 = !DILocation(line: 43, column: 75, scope: !4) +!29 = !DILocation(line: 43, column: 66, scope: !4) +!30 = !DILocation(line: 43, scope: !4) +!31 = !DILocation(line: 43, column: 57, scope: !4) +!32 = !DILocation(line: 43, column: 34, scope: !4) +!33 = !DILocation(line: 43, column: 25, scope: !4) +!34 = !DILocation(line: 43, column: 88, scope: !4) +!35 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9676d0f0ca260762fdd12575694be3adea5842ab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ptx @@ -0,0 +1,448 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_sum_2 // -- Begin function triton_red_fused__to_copy_sum_2 + // @triton_red_fused__to_copy_sum_2 +.visible .entry triton_red_fused__to_copy_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_1, + .param .u64 triton_red_fused__to_copy_sum_2_param_2, + .param .u64 triton_red_fused__to_copy_sum_2_param_3, + .param .u32 triton_red_fused__to_copy_sum_2_param_4, + .param .u32 triton_red_fused__to_copy_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_sum_2_param_7 +) +.reqntid 64 +{ + .reg .pred %p<13>; + .reg .b32 %r<45>; + .reg .b64 %rd<60>; + .loc 1 18 0 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_sum_2_param_5]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_sum_2_param_3]; +$L__tmp0: + .loc 1 21 28 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:21:28 + mov.u32 %r9, %ctaid.x; + ld.param.b32 %r10, [triton_red_fused__to_copy_sum_2_param_4]; + .loc 1 24 37 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:24:37 + mov.u32 %r2, %tid.x; + .loc 1 34 45 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:45 + cvt.u64.u32 %rd1, %r9; + .loc 1 28 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:28:40 + setp.lt.s32 %p1, %r8, 1; + mov.b64 %rd58, 0; + cvt.u32.u64 %r42, %rd1; + @%p1 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_sum_2_param_2]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_sum_2_param_0]; + and.b32 %r3, %r2, 15; + .loc 1 23 21 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:23:21 + setp.lt.s32 %p2, %r42, %r10; + .loc 1 34 45 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:45 + mul.lo.s64 %rd16, %rd13, %rd1; + shl.b64 %rd17, %rd16, 2; + add.s64 %rd2, %rd11, %rd17; + @%p2 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 45 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:0:45 + mov.b32 %r44, 0; + mov.b64 %rd58, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 30 29 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:30:29 + add.s32 %r17, %r3, %r44; + setp.lt.s32 %p5, %r17, %r8; + .loc 1 34 34 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:34 + mad.wide.s32 %rd24, %r17, 4, %rd2; + .loc 1 34 50 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:50 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r16, 0x0; + @%p5 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd24 + 0 ], %rd23; + // end inline asm + .loc 1 38 48 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:38:48 + selp.b32 %r18, %r16, 0, %p5; + cvt.s64.s32 %rd26, %r18; + add.s64 %rd58, %rd58, %rd26; + .loc 1 28 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:28:40 + add.s32 %r44, %r44, 16; + setp.lt.s32 %p6, %r44, %r8; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:0:40 + mov.b32 %r43, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 34 41 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:41 + add.s32 %r14, %r3, %r43; + .loc 1 34 34 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:34 + mad.wide.s32 %rd19, %r14, 4, %rd2; + .loc 1 34 50 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:34:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + mov.pred %p3, 0; + // begin inline asm + mov.u32 %r13, 0x0; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r13 }, [ %rd19 + 0 ], %rd18; + // end inline asm + .loc 1 28 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:28:40 + add.s32 %r43, %r43, 16; + setp.lt.s32 %p4, %r43, %r8; + @%p4 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 0 40 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:0:40 + ld.param.b64 %rd12, [triton_red_fused__to_copy_sum_2_param_1]; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + mov.b64 {_, %r19}, %rd58; + cvt.u32.u64 %r20, %rd58; + shfl.sync.bfly.b32 %r21, %r20, 8, 31, -1; + shfl.sync.bfly.b32 %r22, %r19, 8, 31, -1; + cvt.u64.u32 %rd27, %r21; + cvt.u64.u32 %rd28, %r22; + shl.b64 %rd29, %rd28, 32; + or.b64 %rd30, %rd27, %rd29; + .loc 2 261 15 // standard.py:261:15 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + add.s64 %rd31, %rd58, %rd30; + .loc 2 291 36 // standard.py:291:36 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + mov.b64 {_, %r23}, %rd31; + cvt.u32.u64 %r24, %rd31; + shfl.sync.bfly.b32 %r25, %r24, 4, 31, -1; + shfl.sync.bfly.b32 %r26, %r23, 4, 31, -1; + cvt.u64.u32 %rd32, %r25; + cvt.u64.u32 %rd33, %r26; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + add.s64 %rd36, %rd31, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + mov.b64 {_, %r27}, %rd36; + cvt.u32.u64 %r28, %rd36; + shfl.sync.bfly.b32 %r29, %r28, 2, 31, -1; + shfl.sync.bfly.b32 %r30, %r27, 2, 31, -1; + cvt.u64.u32 %rd37, %r29; + cvt.u64.u32 %rd38, %r30; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + mov.b64 {_, %r31}, %rd41; + cvt.u32.u64 %r32, %rd41; + shfl.sync.bfly.b32 %r33, %r32, 1, 31, -1; + shfl.sync.bfly.b32 %r34, %r31, 1, 31, -1; + cvt.u64.u32 %rd42, %r33; + cvt.u64.u32 %rd43, %r34; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:39:25 ] + add.s64 %rd6, %rd41, %rd45; +$L__tmp2: + .loc 1 41 19 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:41:19 + and.b64 %rd46, %rd14, -4294967296; + setp.ne.b64 %p7, %rd46, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd59, %rd1, %rd14; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r35, %rd14; + div.u32 %r37, %r42, %r35; + cvt.u64.u32 %rd59, %r37; +$L__BB0_9: + .loc 1 23 21 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:23:21 + setp.lt.s32 %p9, %r42, %r10; + .loc 1 40 19 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:40:19 + mul.lo.s64 %rd48, %rd59, %rd14; + sub.s64 %rd49, %rd1, %rd48; + .loc 1 42 19 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:42:19 + cvt.u32.u64 %r38, %rd6; + .loc 1 43 49 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:49 + setp.lt.s64 %p10, %rd14, 2; + .loc 1 43 75 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:75 + setp.gt.s64 %p11, %rd14, 1; + .loc 1 43 66 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:66 + selp.b64 %rd50, %rd14, 0, %p11; + .loc 1 43 0 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43 + selp.b64 %rd51, 1, 0, %p10; + .loc 1 43 57 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:57 + add.s64 %rd52, %rd50, %rd51; + .loc 1 43 34 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:34 + mul.lo.s64 %rd53, %rd59, %rd52; + .loc 1 43 25 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:25 + shl.b64 %rd54, %rd49, 2; + add.s64 %rd55, %rd12, %rd54; + shl.b64 %rd56, %rd53, 2; + add.s64 %rd47, %rd55, %rd56; + .loc 1 43 88 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:88 + and.b32 %r40, %r2, 63; + setp.eq.b32 %p12, %r40, 0; + and.pred %p8, %p12, %p9; + // begin inline asm + @%p8 st.global.b32 [ %rd47 + 0 ], { %r38 }; + // end inline asm + .loc 1 43 4 // cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 216 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd1 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 109 +.b8 115 +.b8 115 +.b8 52 +.b8 118 +.b8 107 +.b8 105 +.b8 114 +.b8 117 +.b8 53 +.b8 115 +.b8 119 +.b8 104 +.b8 120 +.b8 121 +.b8 51 +.b8 51 +.b8 104 +.b8 110 +.b8 115 +.b8 101 +.b8 50 +.b8 102 +.b8 120 +.b8 114 +.b8 99 +.b8 110 +.b8 103 +.b8 112 +.b8 105 +.b8 119 +.b8 55 +.b8 111 +.b8 122 +.b8 98 +.b8 100 +.b8 117 +.b8 52 +.b8 109 +.b8 119 +.b8 98 +.b8 99 +.b8 102 +.b8 113 +.b8 108 +.b8 99 +.b8 122 +.b8 110 +.b8 108 +.b8 121 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 109 +.b8 115 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x22 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xad:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc2:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 39 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..c75c760b907fd15c3d325c8dc8bb5eb83f331709 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.source @@ -0,0 +1,180 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc40 = loc(unknown) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc47 = loc("in_ptr0"(#loc)) +#loc48 = loc("out_ptr1"(#loc)) +#loc49 = loc("ks0"(#loc)) +#loc50 = loc("ks1"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc79 = loc("input"(#loc38)) +#loc80 = loc("a"(#loc43)) +#loc81 = loc("b"(#loc43)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_0 = arith.constant 1 : i32 loc(#loc54) + %xoffset_1 = arith.constant 1 : i32 loc(#loc54) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc54) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc55) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc56) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc57) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc57) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc58) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc58) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc59) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc60) + %_tmp3 = arith.constant 0 : i64 loc(#loc61) + %_tmp3_8 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc61) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %c16_i32 = arith.constant 16 : i32 loc(#loc10) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc10) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc10) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc10) + %3 = ub.poison : i32 loc(#loc10) + %_tmp3_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_8) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc63) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc63) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc64) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc64) + %tmp0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc65) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x1xi64> loc(#loc65) + %tmp0_23 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc66) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x16xi64> loc(#loc66) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x16xi64> loc(#loc66) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc67) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc67) + %tmp0_28 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc68) + %tmp0_29 = arith.andi %r0_mask_20, %tmp0_28 : tensor<1x16xi1> loc(#loc68) + %tmp0_30 = arith.constant 0.000000e+00 : f32 loc(#loc69) + %tmp0_31 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc69) + %tmp0_32 = arith.fptosi %tmp0_31 : tensor<1x16xf32> to tensor<1x16xi32> loc(#loc69) + %tmp0_33 = tt.load %tmp0_27, %tmp0_29, %tmp0_32 evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc69) + %tmp1 = arith.extsi %tmp0_33 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<1x16xi64> loc(#loc71) + %_tmp3_34 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc72) + %_tmp3_35 = arith.andi %r0_mask_20, %_tmp3_34 : tensor<1x16xi1> loc(#loc72) + %_tmp3_36 = arith.select %_tmp3_35, %tmp4, %_tmp3_18 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc73) + scf.yield %_tmp3_36 : tensor<1x16xi64> loc(#loc22) + } loc(#loc62) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_9) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc74) + %tmp3_10 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc75) + %x2 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc76) + %x2_11 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc76) + %x2_12 = arith.remsi %x2, %x2_11 : tensor<1x1xi64> loc(#loc76) + %x3 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc77) + %x3_13 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc77) + %x3_14 = arith.divsi %x3, %x3_13 : tensor<1x1xi64> loc(#loc77) + %tmp5 = arith.trunci %tmp3_10 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc78) + %c1_i32 = arith.constant 1 : i32 loc(#loc28) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc28) + %5 = arith.cmpi sge, %4, %ks1 : i64 loc(#loc28) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc29) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc29) + %6 = arith.extui %5 : i1 to i32 loc(#loc29) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc29) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc30) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc30) + %9 = arith.cmpi sgt, %ks1, %8 : i64 loc(#loc30) + %10 = arith.extui %9 : i1 to i64 loc(#loc31) + %11 = arith.muli %ks1, %10 : i64 loc(#loc31) + %12 = arith.extsi %7 : i32 to i64 loc(#loc32) + %13 = arith.addi %12, %11 : i64 loc(#loc32) + %14 = tt.splat %13 : i64 -> tensor<1x1xi64> loc(#loc33) + %15 = arith.muli %x3_14, %14 : tensor<1x1xi64> loc(#loc33) + %16 = arith.addi %x2_12, %15 : tensor<1x1xi64> loc(#loc34) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc35) + %18 = tt.addptr %17, %16 : tensor<1x1x!tt.ptr>, tensor<1x1xi64> loc(#loc35) + tt.store %18, %tmp5, %xmask_6 : tensor<1x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc38))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc39) + tt.reduce.return %2 : i64 loc(#loc39) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc39) + tt.return %0 : tensor<1xi64> loc(#loc41) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc42) + tt.return %1 : tensor<1xi64> loc(#loc42) + } loc(#loc38) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc43)), %b: i64 loc("b"(#loc43))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc44) + tt.return %0 : i64 loc(#loc45) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc46) + tt.return %1 : i64 loc(#loc46) + } loc(#loc43) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":27:43) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":30:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":35:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":37:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:35) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:48) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:8) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:28) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":40:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":42:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:66) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:57) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:30) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:88) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc53 = loc("xoffset"(#loc1)) +#loc54 = loc("xoffset"(#loc2)) +#loc55 = loc("xindex"(#loc3)) +#loc56 = loc("xindex"(#loc4)) +#loc57 = loc("xindex"(#loc5)) +#loc58 = loc("xmask"(#loc6)) +#loc59 = loc("r0_base"(#loc7)) +#loc60 = loc("r0_base"(#loc8)) +#loc61 = loc("_tmp3"(#loc9)) +#loc62 = loc("_tmp3"(#loc10)) +#loc63 = loc("r0_index"(#loc11)) +#loc64 = loc("r0_mask"(#loc12)) +#loc65 = loc("tmp0"(#loc13)) +#loc66 = loc("tmp0"(#loc14)) +#loc67 = loc("tmp0"(#loc15)) +#loc68 = loc("tmp0"(#loc16)) +#loc69 = loc("tmp0"(#loc17)) +#loc70 = loc("tmp1"(#loc18)) +#loc71 = loc("tmp4"(#loc19)) +#loc72 = loc("_tmp3"(#loc20)) +#loc73 = loc("_tmp3"(#loc21)) +#loc74 = loc("tmp3"(#loc23)) +#loc75 = loc("tmp3"(#loc24)) +#loc76 = loc("x2"(#loc25)) +#loc77 = loc("x3"(#loc26)) +#loc78 = loc("tmp5"(#loc27)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..562217d721374ff717e27cd093e88b3b0824b4cc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttgir @@ -0,0 +1,123 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:25) +#loc34 = loc("in_ptr0"(#loc)) +#loc35 = loc("out_ptr1"(#loc)) +#loc36 = loc("ks0"(#loc)) +#loc37 = loc("ks1"(#loc)) +#loc38 = loc("xnumel"(#loc)) +#loc39 = loc("r0_numel"(#loc)) +#loc54 = loc("tmp3"(#loc18)) +#loc63 = loc(callsite(#loc1 at #loc54)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc40) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc41) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc42) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc42) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc44) + %tmp0_2 = arith.muli %ks0, %tmp0 : i64 loc(#loc44) + %tmp0_3 = tt.splat %tmp0_2 : i64 -> tensor<1x16xi64, #blocked> loc(#loc60) + %tmp0_4 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc46) + %tmp0_5 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_7 = %cst) -> (tensor<1x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32, #blocked> loc(#loc49) + %r0_index_8 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32, #blocked> loc(#loc49) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc43) + %tmp0_10 = arith.extsi %r0_index_8 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_11 = arith.addi %tmp0_10, %tmp0_3 : tensor<1x16xi64, #blocked> loc(#loc45) + %tmp0_12 = tt.addptr %tmp0_4, %tmp0_11 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi64, #blocked> loc(#loc46) + %tmp0_13 = arith.andi %r0_mask_9, %tmp0_5 : tensor<1x16xi1, #blocked> loc(#loc47) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 evictionPolicy = evict_first : tensor<1x16x!tt.ptr, #blocked> loc(#loc50) + %tmp1 = arith.extsi %tmp0_14 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc51) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x16xi64, #blocked> loc(#loc52) + %_tmp3_15 = arith.select %tmp0_13, %tmp4, %_tmp3_7 : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc53) + scf.yield %_tmp3_15 : tensor<1x16xi64, #blocked> loc(#loc16) + } loc(#loc48) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 loc(callsite(#loc1 at #loc54)), %tmp3_8: i64 loc(callsite(#loc1 at #loc54))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc64) + tt.reduce.return %tmp3_9 : i64 loc(#loc62) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc62) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc20) + %tmp3_6 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc55) + %x2 = arith.remsi %tmp0, %ks1 : i64 loc(#loc56) + %x3 = arith.divsi %tmp0, %ks1 : i64 loc(#loc57) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc58) + %1 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc25) + %2 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc26) + %3 = arith.extui %2 : i1 to i64 loc(#loc27) + %4 = arith.muli %ks1, %3 : i64 loc(#loc27) + %5 = arith.extui %1 : i1 to i64 loc(#loc59) + %6 = arith.addi %5, %4 : i64 loc(#loc28) + %7 = arith.muli %x3, %6 : i64 loc(#loc30) + %8 = arith.addi %x2, %7 : i64 loc(#loc31) + %9 = tt.addptr %out_ptr1, %8 : !tt.ptr, i64 loc(#loc32) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc20) + tt.store %10, %tmp5, %11 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc20) + tt.return loc(#loc33) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":24:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":30:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":35:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":37:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:88) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:49) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:75) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:66) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:57) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:4) +#loc40 = loc("xoffset"(#loc2)) +#loc41 = loc("xmask"(#loc3)) +#loc42 = loc("r0_base"(#loc4)) +#loc43 = loc("r0_mask"(#loc5)) +#loc44 = loc("tmp0"(#loc6)) +#loc45 = loc("tmp0"(#loc7)) +#loc46 = loc("tmp0"(#loc8)) +#loc47 = loc("tmp0"(#loc9)) +#loc48 = loc("_tmp3"(#loc10)) +#loc49 = loc("r0_index"(#loc11)) +#loc50 = loc("tmp0"(#loc12)) +#loc51 = loc("tmp1"(#loc13)) +#loc52 = loc("tmp4"(#loc14)) +#loc53 = loc("_tmp3"(#loc15)) +#loc55 = loc("tmp3"(#loc21)) +#loc56 = loc("x2"(#loc22)) +#loc57 = loc("x3"(#loc23)) +#loc58 = loc("tmp5"(#loc24)) +#loc59 = loc(fused[#loc28, #loc29]) +#loc60 = loc(fused[#loc45, #loc44]) +#loc61 = loc(fused[#loc47, #loc41]) +#loc62 = loc(callsite(#loc17 at #loc54)) +#loc64 = loc(callsite(#loc19 at #loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fa522595a6b90e9b5dd6bf30ec0f4d17cb8c83c7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/HXTWITM43AJTYBYDCMJ3SBODS23DJOLLO4SHKJFN4YWQSGNDK4FQ/triton_red_fused__to_copy_sum_2.ttir @@ -0,0 +1,125 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("out_ptr1"(#loc)) +#loc38 = loc("ks0"(#loc)) +#loc39 = loc("ks1"(#loc)) +#loc40 = loc("xnumel"(#loc)) +#loc41 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp3"(#loc20)) +#loc67 = loc(callsite(#loc1 at #loc58)) +module { + tt.func public @triton_red_fused__to_copy_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc42) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc44) + %xmask_0 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc44) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc45) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc46) + %_tmp3_2 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_5 = %_tmp3) -> (tensor<1x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc48) + %r0_index_6 = arith.addi %r0_index, %r0_base_1 : tensor<1x16xi32> loc(#loc48) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc49) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x16xi32> loc(#loc49) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %tmp0_8 = arith.muli %ks0, %tmp0 : i64 loc(#loc50) + %tmp0_9 = arith.extsi %r0_index_6 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc51) + %tmp0_10 = tt.splat %tmp0_8 : i64 -> tensor<1x16xi64> loc(#loc64) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x16xi64> loc(#loc51) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc52) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x16x!tt.ptr>, tensor<1x16xi64> loc(#loc52) + %tmp0_14 = tt.splat %xmask : i1 -> tensor<1x16xi1> loc(#loc65) + %tmp0_15 = arith.andi %r0_mask_7, %tmp0_14 : tensor<1x16xi1> loc(#loc53) + %tmp0_16 = tt.load %tmp0_13, %tmp0_15, %cst evictionPolicy = evict_first : tensor<1x16x!tt.ptr> loc(#loc54) + %tmp1 = arith.extsi %tmp0_16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc55) + %tmp4 = arith.addi %_tmp3_5, %tmp1 : tensor<1x16xi64> loc(#loc56) + %_tmp3_17 = arith.select %tmp0_15, %tmp4, %_tmp3_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc57) + scf.yield %_tmp3_17 : tensor<1x16xi64> loc(#loc18) + } loc(#loc47) + %tmp3 = "tt.reduce"(%_tmp3_2) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_5: i64 loc(callsite(#loc1 at #loc58)), %tmp3_6: i64 loc(callsite(#loc1 at #loc58))): + %tmp3_7 = arith.addi %tmp3_5, %tmp3_6 : i64 loc(#loc68) + tt.reduce.return %tmp3_7 : i64 loc(#loc66) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc66) + %tmp3_3 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc59) + %x2 = arith.extsi %xoffset : i32 to i64 loc(#loc60) + %x2_4 = arith.remsi %x2, %ks1 : i64 loc(#loc60) + %x3 = arith.divsi %x2, %ks1 : i64 loc(#loc61) + %tmp5 = arith.trunci %tmp3_3 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc62) + %0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc26) + %1 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc27) + %2 = arith.extui %1 : i1 to i64 loc(#loc28) + %3 = arith.muli %ks1, %2 : i64 loc(#loc28) + %4 = arith.extui %0 : i1 to i64 loc(#loc63) + %5 = arith.addi %4, %3 : i64 loc(#loc29) + %6 = arith.muli %x3, %5 : i64 loc(#loc31) + %7 = arith.addi %x2_4, %6 : i64 loc(#loc32) + %8 = tt.addptr %out_ptr1, %7 : !tt.ptr, i64 loc(#loc33) + %9 = tt.splat %8 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc33) + tt.store %9, %tmp5, %xmask_0 : tensor<1x1x!tt.ptr> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":28:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":27:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":23:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":24:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":29:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":30:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:60) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":34:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":35:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":37:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:48) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":38:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":39:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":40:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":41:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":42:19) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:49) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:75) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:57) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:34) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:88) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ms/cmss4vkiru5swhxy33hnse2fxrcngpiw7ozbdu4mwbcfqlcznlyx.py":43:4) +#loc42 = loc("_tmp3"(#loc3)) +#loc43 = loc("xoffset"(#loc4)) +#loc44 = loc("xmask"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("r0_base"(#loc7)) +#loc47 = loc("_tmp3"(#loc2)) +#loc48 = loc("r0_index"(#loc8)) +#loc49 = loc("r0_mask"(#loc9)) +#loc50 = loc("tmp0"(#loc10)) +#loc51 = loc("tmp0"(#loc11)) +#loc52 = loc("tmp0"(#loc12)) +#loc53 = loc("tmp0"(#loc13)) +#loc54 = loc("tmp0"(#loc14)) +#loc55 = loc("tmp1"(#loc15)) +#loc56 = loc("tmp4"(#loc16)) +#loc57 = loc("_tmp3"(#loc17)) +#loc59 = loc("tmp3"(#loc22)) +#loc60 = loc("x2"(#loc23)) +#loc61 = loc("x3"(#loc24)) +#loc62 = loc("tmp5"(#loc25)) +#loc63 = loc(fused[#loc29, #loc30]) +#loc64 = loc(fused[#loc51, #loc50]) +#loc65 = loc(fused[#loc53, #loc44]) +#loc66 = loc(callsite(#loc19 at #loc58)) +#loc68 = loc(callsite(#loc21 at #loc66)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4f0772dc4742dcb6418aeaf7f7484947a9498701 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_eq_mul_squeeze_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.source", "triton_red_fused_eq_mul_squeeze_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttir", "triton_red_fused_eq_mul_squeeze_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttgir", "triton_red_fused_eq_mul_squeeze_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.llir", "triton_red_fused_eq_mul_squeeze_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ptx", "triton_red_fused_eq_mul_squeeze_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.cubin", "triton_red_fused_eq_mul_squeeze_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..78eafd1f051eb9e4b5355e0e54615040a0a99afe Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a03a6b274ca2b979131173ddc5c11dff7422c776 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"hash": "46cab71f17a459abde83c8e920121a423e1c0f897e41cd4bdf4405f51261c531", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_eq_mul_squeeze_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..b07b33ea7779015a3471a39edfef62718fdd17f7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.llir @@ -0,0 +1,362 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_eq_mul_squeeze_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp samesign ult i32 %9, 2, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = shl nuw nsw i32 %11, 1, !dbg !9 + %13 = and i32 %12, 1022, !dbg !9 + %14 = shl i32 %9, 13, !dbg !10 + %15 = or disjoint i32 %13, %14 + %16 = or disjoint i32 %12, %14 + %17 = sext i32 %15 to i64, !dbg !11 + %18 = or i32 %16, 1024, !dbg !12 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %17, !dbg !13 + %20 = sext i32 %18 to i64, !dbg !13 + %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !13 + %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %23 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %19, i64 %22, i1 %10) #5, !dbg !14 + %24 = extractvalue { i64, i64 } %23, 0, !dbg !14 + %25 = extractvalue { i64, i64 } %23, 1, !dbg !14 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %27 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %21, i64 %26, i1 %10) #5, !dbg !14 + %28 = extractvalue { i64, i64 } %27, 0, !dbg !14 + %29 = extractvalue { i64, i64 } %27, 1, !dbg !14 + %30 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !15 + %31 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !15 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %33 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %30, i64 %32, i1 %10) #5, !dbg !16 + %34 = extractvalue { i64, i64 } %33, 0, !dbg !16 + %35 = extractvalue { i64, i64 } %33, 1, !dbg !16 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %37 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %31, i64 %36, i1 %10) #5, !dbg !16 + %38 = extractvalue { i64, i64 } %37, 0, !dbg !16 + %39 = extractvalue { i64, i64 } %37, 1, !dbg !16 + %40 = getelementptr i64, ptr addrspace(1) %2, i64 %17, !dbg !17 + %41 = getelementptr i64, ptr addrspace(1) %2, i64 %20, !dbg !17 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %43 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %40, i64 %42, i1 %10) #5, !dbg !18 + %44 = extractvalue { i64, i64 } %43, 0, !dbg !18 + %45 = extractvalue { i64, i64 } %43, 1, !dbg !18 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %47 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %41, i64 %46, i1 %10) #5, !dbg !18 + %48 = extractvalue { i64, i64 } %47, 0, !dbg !18 + %49 = extractvalue { i64, i64 } %47, 1, !dbg !18 + %50 = or disjoint i64 %17, 2048, !dbg !12 + %51 = or i32 %16, 3072, !dbg !12 + %52 = getelementptr i64, ptr addrspace(1) %0, i64 %50, !dbg !13 + %53 = sext i32 %51 to i64, !dbg !13 + %54 = getelementptr i64, ptr addrspace(1) %0, i64 %53, !dbg !13 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %56 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %52, i64 %55, i1 %10) #5, !dbg !14 + %57 = extractvalue { i64, i64 } %56, 0, !dbg !14 + %58 = extractvalue { i64, i64 } %56, 1, !dbg !14 + %59 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %60 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %54, i64 %59, i1 %10) #5, !dbg !14 + %61 = extractvalue { i64, i64 } %60, 0, !dbg !14 + %62 = extractvalue { i64, i64 } %60, 1, !dbg !14 + %63 = getelementptr i64, ptr addrspace(1) %1, i64 %50, !dbg !15 + %64 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !15 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %66 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %63, i64 %65, i1 %10) #5, !dbg !16 + %67 = extractvalue { i64, i64 } %66, 0, !dbg !16 + %68 = extractvalue { i64, i64 } %66, 1, !dbg !16 + %69 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %70 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %64, i64 %69, i1 %10) #5, !dbg !16 + %71 = extractvalue { i64, i64 } %70, 0, !dbg !16 + %72 = extractvalue { i64, i64 } %70, 1, !dbg !16 + %73 = getelementptr i64, ptr addrspace(1) %2, i64 %50, !dbg !17 + %74 = getelementptr i64, ptr addrspace(1) %2, i64 %53, !dbg !17 + %75 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %76 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %73, i64 %75, i1 %10) #5, !dbg !18 + %77 = extractvalue { i64, i64 } %76, 0, !dbg !18 + %78 = extractvalue { i64, i64 } %76, 1, !dbg !18 + %79 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %80 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %74, i64 %79, i1 %10) #5, !dbg !18 + %81 = extractvalue { i64, i64 } %80, 0, !dbg !18 + %82 = extractvalue { i64, i64 } %80, 1, !dbg !18 + %83 = or disjoint i64 %17, 4096, !dbg !12 + %84 = or i32 %16, 5120, !dbg !12 + %85 = getelementptr i64, ptr addrspace(1) %0, i64 %83, !dbg !13 + %86 = sext i32 %84 to i64, !dbg !13 + %87 = getelementptr i64, ptr addrspace(1) %0, i64 %86, !dbg !13 + %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %89 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %85, i64 %88, i1 %10) #5, !dbg !14 + %90 = extractvalue { i64, i64 } %89, 0, !dbg !14 + %91 = extractvalue { i64, i64 } %89, 1, !dbg !14 + %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %93 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %87, i64 %92, i1 %10) #5, !dbg !14 + %94 = extractvalue { i64, i64 } %93, 0, !dbg !14 + %95 = extractvalue { i64, i64 } %93, 1, !dbg !14 + %96 = getelementptr i64, ptr addrspace(1) %1, i64 %83, !dbg !15 + %97 = getelementptr i64, ptr addrspace(1) %1, i64 %86, !dbg !15 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %99 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %96, i64 %98, i1 %10) #5, !dbg !16 + %100 = extractvalue { i64, i64 } %99, 0, !dbg !16 + %101 = extractvalue { i64, i64 } %99, 1, !dbg !16 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %103 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %97, i64 %102, i1 %10) #5, !dbg !16 + %104 = extractvalue { i64, i64 } %103, 0, !dbg !16 + %105 = extractvalue { i64, i64 } %103, 1, !dbg !16 + %106 = getelementptr i64, ptr addrspace(1) %2, i64 %83, !dbg !17 + %107 = getelementptr i64, ptr addrspace(1) %2, i64 %86, !dbg !17 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %109 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %106, i64 %108, i1 %10) #5, !dbg !18 + %110 = extractvalue { i64, i64 } %109, 0, !dbg !18 + %111 = extractvalue { i64, i64 } %109, 1, !dbg !18 + %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %113 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %107, i64 %112, i1 %10) #5, !dbg !18 + %114 = extractvalue { i64, i64 } %113, 0, !dbg !18 + %115 = extractvalue { i64, i64 } %113, 1, !dbg !18 + %116 = or disjoint i64 %17, 6144, !dbg !12 + %117 = or i32 %16, 7168, !dbg !12 + %118 = getelementptr i64, ptr addrspace(1) %0, i64 %116, !dbg !13 + %119 = sext i32 %117 to i64, !dbg !13 + %120 = getelementptr i64, ptr addrspace(1) %0, i64 %119, !dbg !13 + %121 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %122 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %118, i64 %121, i1 %10) #5, !dbg !14 + %123 = extractvalue { i64, i64 } %122, 0, !dbg !14 + %124 = extractvalue { i64, i64 } %122, 1, !dbg !14 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %126 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %120, i64 %125, i1 %10) #5, !dbg !14 + %127 = extractvalue { i64, i64 } %126, 0, !dbg !14 + %128 = extractvalue { i64, i64 } %126, 1, !dbg !14 + %129 = getelementptr i64, ptr addrspace(1) %1, i64 %116, !dbg !15 + %130 = getelementptr i64, ptr addrspace(1) %1, i64 %119, !dbg !15 + %131 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %132 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %129, i64 %131, i1 %10) #5, !dbg !16 + %133 = extractvalue { i64, i64 } %132, 0, !dbg !16 + %134 = extractvalue { i64, i64 } %132, 1, !dbg !16 + %135 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !16 + %136 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %130, i64 %135, i1 %10) #5, !dbg !16 + %137 = extractvalue { i64, i64 } %136, 0, !dbg !16 + %138 = extractvalue { i64, i64 } %136, 1, !dbg !16 + %139 = getelementptr i64, ptr addrspace(1) %2, i64 %116, !dbg !17 + %140 = getelementptr i64, ptr addrspace(1) %2, i64 %119, !dbg !17 + %141 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %142 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %139, i64 %141, i1 %10) #5, !dbg !18 + %143 = extractvalue { i64, i64 } %142, 0, !dbg !18 + %144 = extractvalue { i64, i64 } %142, 1, !dbg !18 + %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %146 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { $0, $1 }, [ $2 + 0 ], $3;", "=l,=l,l,l,b"(ptr addrspace(1) %140, i64 %145, i1 %10) #5, !dbg !18 + %147 = extractvalue { i64, i64 } %146, 0, !dbg !18 + %148 = extractvalue { i64, i64 } %146, 1, !dbg !18 + %149 = insertelement <16 x i64> poison, i64 %57, i64 0, !dbg !19 + %150 = insertelement <16 x i64> %149, i64 %24, i64 1, !dbg !19 + %151 = insertelement <16 x i64> %150, i64 %90, i64 2, !dbg !19 + %152 = insertelement <16 x i64> %151, i64 %123, i64 3, !dbg !19 + %153 = insertelement <16 x i64> %152, i64 %58, i64 4, !dbg !19 + %154 = insertelement <16 x i64> %153, i64 %25, i64 5, !dbg !19 + %155 = insertelement <16 x i64> %154, i64 %91, i64 6, !dbg !19 + %156 = insertelement <16 x i64> %155, i64 %124, i64 7, !dbg !19 + %157 = insertelement <16 x i64> %156, i64 %61, i64 8, !dbg !19 + %158 = insertelement <16 x i64> %157, i64 %28, i64 9, !dbg !19 + %159 = insertelement <16 x i64> %158, i64 %94, i64 10, !dbg !19 + %160 = insertelement <16 x i64> %159, i64 %127, i64 11, !dbg !19 + %161 = insertelement <16 x i64> %160, i64 %62, i64 12, !dbg !19 + %162 = insertelement <16 x i64> %161, i64 %29, i64 13, !dbg !19 + %163 = insertelement <16 x i64> %162, i64 %95, i64 14, !dbg !19 + %164 = insertelement <16 x i64> %163, i64 %128, i64 15, !dbg !19 + %165 = insertelement <16 x i64> poison, i64 %67, i64 0, !dbg !19 + %166 = insertelement <16 x i64> %165, i64 %34, i64 1, !dbg !19 + %167 = insertelement <16 x i64> %166, i64 %100, i64 2, !dbg !19 + %168 = insertelement <16 x i64> %167, i64 %133, i64 3, !dbg !19 + %169 = insertelement <16 x i64> %168, i64 %68, i64 4, !dbg !19 + %170 = insertelement <16 x i64> %169, i64 %35, i64 5, !dbg !19 + %171 = insertelement <16 x i64> %170, i64 %101, i64 6, !dbg !19 + %172 = insertelement <16 x i64> %171, i64 %134, i64 7, !dbg !19 + %173 = insertelement <16 x i64> %172, i64 %71, i64 8, !dbg !19 + %174 = insertelement <16 x i64> %173, i64 %38, i64 9, !dbg !19 + %175 = insertelement <16 x i64> %174, i64 %104, i64 10, !dbg !19 + %176 = insertelement <16 x i64> %175, i64 %137, i64 11, !dbg !19 + %177 = insertelement <16 x i64> %176, i64 %72, i64 12, !dbg !19 + %178 = insertelement <16 x i64> %177, i64 %39, i64 13, !dbg !19 + %179 = insertelement <16 x i64> %178, i64 %105, i64 14, !dbg !19 + %180 = insertelement <16 x i64> %179, i64 %138, i64 15, !dbg !19 + %181 = icmp eq <16 x i64> %164, %180, !dbg !19 + %182 = insertelement <16 x i1> poison, i1 %10, i64 0, !dbg !20 + %183 = shufflevector <16 x i1> %182, <16 x i1> poison, <16 x i32> zeroinitializer, !dbg !20 + %184 = select <16 x i1> %183, <16 x i1> %181, <16 x i1> zeroinitializer, !dbg !20 + %185 = insertelement <16 x i64> poison, i64 %77, i64 0, !dbg !20 + %186 = insertelement <16 x i64> %185, i64 %44, i64 1, !dbg !20 + %187 = insertelement <16 x i64> %186, i64 %110, i64 2, !dbg !20 + %188 = insertelement <16 x i64> %187, i64 %143, i64 3, !dbg !20 + %189 = insertelement <16 x i64> %188, i64 %78, i64 4, !dbg !20 + %190 = insertelement <16 x i64> %189, i64 %45, i64 5, !dbg !20 + %191 = insertelement <16 x i64> %190, i64 %111, i64 6, !dbg !20 + %192 = insertelement <16 x i64> %191, i64 %144, i64 7, !dbg !20 + %193 = insertelement <16 x i64> %192, i64 %81, i64 8, !dbg !20 + %194 = insertelement <16 x i64> %193, i64 %48, i64 9, !dbg !20 + %195 = insertelement <16 x i64> %194, i64 %114, i64 10, !dbg !20 + %196 = insertelement <16 x i64> %195, i64 %147, i64 11, !dbg !20 + %197 = insertelement <16 x i64> %196, i64 %82, i64 12, !dbg !20 + %198 = insertelement <16 x i64> %197, i64 %49, i64 13, !dbg !20 + %199 = insertelement <16 x i64> %198, i64 %115, i64 14, !dbg !20 + %200 = insertelement <16 x i64> %199, i64 %148, i64 15, !dbg !20 + %201 = select <16 x i1> %184, <16 x i64> %200, <16 x i64> zeroinitializer, !dbg !20 + %202 = and i32 %11, 31, !dbg !9 + %203 = lshr i32 %11, 5, !dbg !9 + %204 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %201), !dbg !21 + %extelt.offset = lshr i64 %204, 32, !dbg !25 + %205 = trunc nuw i64 %extelt.offset to i32, !dbg !25 + %206 = trunc i64 %204 to i32, !dbg !25 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 16, i32 31), !dbg !25 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 16, i32 31), !dbg !25 + %209 = insertelement <2 x i32> poison, i32 %207, i64 0, !dbg !25 + %210 = insertelement <2 x i32> %209, i32 %208, i64 1, !dbg !25 + %211 = bitcast <2 x i32> %210 to i64, !dbg !25 + %212 = add i64 %204, %211, !dbg !21 + %extelt.offset1 = lshr i64 %212, 32, !dbg !25 + %213 = trunc nuw i64 %extelt.offset1 to i32, !dbg !25 + %214 = trunc i64 %212 to i32, !dbg !25 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 8, i32 31), !dbg !25 + %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 8, i32 31), !dbg !25 + %217 = insertelement <2 x i32> poison, i32 %215, i64 0, !dbg !25 + %218 = insertelement <2 x i32> %217, i32 %216, i64 1, !dbg !25 + %219 = bitcast <2 x i32> %218 to i64, !dbg !25 + %220 = add i64 %212, %219, !dbg !21 + %extelt.offset2 = lshr i64 %220, 32, !dbg !25 + %221 = trunc nuw i64 %extelt.offset2 to i32, !dbg !25 + %222 = trunc i64 %220 to i32, !dbg !25 + %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 4, i32 31), !dbg !25 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 4, i32 31), !dbg !25 + %225 = insertelement <2 x i32> poison, i32 %223, i64 0, !dbg !25 + %226 = insertelement <2 x i32> %225, i32 %224, i64 1, !dbg !25 + %227 = bitcast <2 x i32> %226 to i64, !dbg !25 + %228 = add i64 %220, %227, !dbg !21 + %extelt.offset3 = lshr i64 %228, 32, !dbg !25 + %229 = trunc nuw i64 %extelt.offset3 to i32, !dbg !25 + %230 = trunc i64 %228 to i32, !dbg !25 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 2, i32 31), !dbg !25 + %232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 2, i32 31), !dbg !25 + %233 = insertelement <2 x i32> poison, i32 %231, i64 0, !dbg !25 + %234 = insertelement <2 x i32> %233, i32 %232, i64 1, !dbg !25 + %235 = bitcast <2 x i32> %234 to i64, !dbg !25 + %236 = add i64 %228, %235, !dbg !21 + %extelt.offset4 = lshr i64 %236, 32, !dbg !25 + %237 = trunc nuw i64 %extelt.offset4 to i32, !dbg !25 + %238 = trunc i64 %236 to i32, !dbg !25 + %239 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !25 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %237, i32 1, i32 31), !dbg !25 + %241 = insertelement <2 x i32> poison, i32 %239, i64 0, !dbg !25 + %242 = insertelement <2 x i32> %241, i32 %240, i64 1, !dbg !25 + %243 = bitcast <2 x i32> %242 to i64, !dbg !25 + %244 = add i64 %236, %243, !dbg !21 + %245 = and i32 %203, 15, !dbg !25 + %246 = icmp eq i32 %202, 0, !dbg !25 + %247 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %245, !dbg !25 + %248 = insertelement <1 x i64> poison, i64 %244, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %247, <1 x i64> %248, i1 %246) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %249 = icmp samesign ult i32 %11, 16, !dbg !25 + %250 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !25 + %251 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %250, i1 %249) #5, !dbg !25 + %extelt.offset5 = lshr i64 %251, 32, !dbg !25 + %252 = trunc nuw i64 %extelt.offset5 to i32, !dbg !25 + %253 = trunc i64 %251 to i32, !dbg !25 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 8, i32 31), !dbg !25 + %255 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 8, i32 31), !dbg !25 + %256 = insertelement <2 x i32> poison, i32 %254, i64 0, !dbg !25 + %257 = insertelement <2 x i32> %256, i32 %255, i64 1, !dbg !25 + %258 = bitcast <2 x i32> %257 to i64, !dbg !25 + %259 = add i64 %251, %258, !dbg !21 + %extelt.offset6 = lshr i64 %259, 32, !dbg !25 + %260 = trunc nuw i64 %extelt.offset6 to i32, !dbg !25 + %261 = trunc i64 %259 to i32, !dbg !25 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 4, i32 31), !dbg !25 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 4, i32 31), !dbg !25 + %264 = insertelement <2 x i32> poison, i32 %262, i64 0, !dbg !25 + %265 = insertelement <2 x i32> %264, i32 %263, i64 1, !dbg !25 + %266 = bitcast <2 x i32> %265 to i64, !dbg !25 + %267 = add i64 %259, %266, !dbg !21 + %extelt.offset7 = lshr i64 %267, 32, !dbg !25 + %268 = trunc nuw i64 %extelt.offset7 to i32, !dbg !25 + %269 = trunc i64 %267 to i32, !dbg !25 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 2, i32 31), !dbg !25 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 2, i32 31), !dbg !25 + %272 = insertelement <2 x i32> poison, i32 %270, i64 0, !dbg !25 + %273 = insertelement <2 x i32> %272, i32 %271, i64 1, !dbg !25 + %274 = bitcast <2 x i32> %273 to i64, !dbg !25 + %275 = add i64 %267, %274, !dbg !21 + %extelt.offset8 = lshr i64 %275, 32, !dbg !25 + %276 = trunc nuw i64 %extelt.offset8 to i32, !dbg !25 + %277 = trunc i64 %275 to i32, !dbg !25 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 1, i32 31), !dbg !25 + %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 1, i32 31), !dbg !25 + %280 = insertelement <2 x i32> poison, i32 %278, i64 0, !dbg !25 + %281 = insertelement <2 x i32> %280, i32 %279, i64 1, !dbg !25 + %282 = bitcast <2 x i32> %281 to i64, !dbg !25 + %283 = add i64 %275, %282, !dbg !21 + %284 = icmp eq i32 %11, 0, !dbg !25 + %285 = insertelement <1 x i64> poison, i64 %283, i64 0, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %250, <1 x i64> %285, i1 %284) #5, !dbg !25 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !25 + %286 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !25 + %287 = zext nneg i32 %9 to i64, !dbg !26 + %288 = getelementptr i64, ptr addrspace(1) %3, i64 %287, !dbg !26 + %289 = and i32 %11, 511, !dbg !27 + %290 = icmp eq i32 %289, 0, !dbg !27 + %291 = and i1 %10, %290, !dbg !27 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %286, ptr addrspace(1) %288, i1 %291) #5, !dbg !27 + ret void, !dbg !28 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_eq_mul_squeeze_sum_2", linkageName: "triton_red_fused_eq_mul_squeeze_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 25, column: 21, scope: !4) +!9 = !DILocation(line: 26, column: 37, scope: !4) +!10 = !DILocation(line: 36, column: 46, scope: !4) +!11 = !DILocation(line: 30, column: 40, scope: !4) +!12 = !DILocation(line: 36, column: 41, scope: !4) +!13 = !DILocation(line: 36, column: 34, scope: !4) +!14 = !DILocation(line: 36, column: 51, scope: !4) +!15 = !DILocation(line: 37, column: 34, scope: !4) +!16 = !DILocation(line: 37, column: 51, scope: !4) +!17 = !DILocation(line: 38, column: 34, scope: !4) +!18 = !DILocation(line: 38, column: 51, scope: !4) +!19 = !DILocation(line: 39, column: 23, scope: !4) +!20 = !DILocation(line: 44, column: 48, scope: !4) +!21 = !DILocation(line: 261, column: 15, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!24 = !DILocation(line: 45, column: 25, scope: !4) +!25 = !DILocation(line: 291, column: 36, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 46, column: 25, scope: !4) +!27 = !DILocation(line: 46, column: 36, scope: !4) +!28 = !DILocation(line: 46, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..daa5af773021c66da5032d6c96ade145da5ec18f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ptx @@ -0,0 +1,776 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_eq_mul_squeeze_sum_2 // -- Begin function triton_red_fused_eq_mul_squeeze_sum_2 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_eq_mul_squeeze_sum_2 +.visible .entry triton_red_fused_eq_mul_squeeze_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_3, + .param .u32 triton_red_fused_eq_mul_squeeze_sum_2_param_4, + .param .u32 triton_red_fused_eq_mul_squeeze_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_7 +) +.reqntid 512 +{ + .reg .pred %p<46>; + .reg .b32 %r<57>; + .reg .b64 %rd<225>; + .loc 1 18 0 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:18:0 +$L__func_begin0: + .loc 1 18 0 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:18:0 + +// %bb.0: + ld.param.b64 %rd126, [triton_red_fused_eq_mul_squeeze_sum_2_param_0]; + ld.param.b64 %rd127, [triton_red_fused_eq_mul_squeeze_sum_2_param_1]; +$L__tmp0: + .loc 1 23 28 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 25 21 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:25:21 + setp.lt.u32 %p1, %r4, 2; + ld.param.b64 %rd128, [triton_red_fused_eq_mul_squeeze_sum_2_param_2]; + ld.param.b64 %rd129, [triton_red_fused_eq_mul_squeeze_sum_2_param_3]; + .loc 1 26 37 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:26:37 + mov.u32 %r5, %tid.x; + shl.b32 %r6, %r5, 1; + and.b32 %r7, %r6, 1022; + .loc 1 36 46 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:46 + shl.b32 %r8, %r4, 13; + or.b32 %r9, %r7, %r8; + or.b32 %r10, %r6, %r8; + .loc 1 36 41 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:41 + or.b32 %r11, %r10, 1024; + .loc 1 36 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:34 + mul.wide.s32 %rd130, %r9, 8; + add.s64 %rd4, %rd126, %rd130; + mul.wide.s32 %rd131, %r11, 8; + add.s64 %rd9, %rd126, %rd131; + .loc 1 36 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:51 + // begin inline asm + mov.u64 %rd5, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd5, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd2, 0x0; + mov.u64 %rd3, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd2, %rd3 }, [ %rd4 + 0 ], %rd5; + // end inline asm + // begin inline asm + mov.u64 %rd10, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd10, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd7, 0x0; + mov.u64 %rd8, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd7, %rd8 }, [ %rd9 + 0 ], %rd10; + // end inline asm + .loc 1 37 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:34 + add.s64 %rd14, %rd127, %rd130; + add.s64 %rd19, %rd127, %rd131; + .loc 1 37 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:51 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd12, 0x0; + mov.u64 %rd13, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd12, %rd13 }, [ %rd14 + 0 ], %rd15; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd17, 0x0; + mov.u64 %rd18, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd17, %rd18 }, [ %rd19 + 0 ], %rd20; + // end inline asm + .loc 1 38 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:34 + add.s64 %rd24, %rd128, %rd130; + add.s64 %rd29, %rd128, %rd131; + .loc 1 38 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:51 + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + mov.u64 %rd23, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd22, %rd23 }, [ %rd24 + 0 ], %rd25; + // end inline asm + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd27, 0x0; + mov.u64 %rd28, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd27, %rd28 }, [ %rd29 + 0 ], %rd30; + // end inline asm + .loc 1 36 41 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:41 + or.b32 %r12, %r10, 3072; + .loc 1 36 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:34 + add.s64 %rd34, %rd4, 16384; + mul.wide.s32 %rd132, %r12, 8; + add.s64 %rd39, %rd126, %rd132; + .loc 1 36 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:51 + // begin inline asm + mov.u64 %rd35, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd35, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd32, 0x0; + mov.u64 %rd33, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd32, %rd33 }, [ %rd34 + 0 ], %rd35; + // end inline asm + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + mov.u64 %rd38, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd37, %rd38 }, [ %rd39 + 0 ], %rd40; + // end inline asm + .loc 1 37 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:34 + add.s64 %rd44, %rd14, 16384; + add.s64 %rd49, %rd127, %rd132; + .loc 1 37 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:51 + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + mov.u64 %rd43, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd42, %rd43 }, [ %rd44 + 0 ], %rd45; + // end inline asm + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd47, 0x0; + mov.u64 %rd48, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd47, %rd48 }, [ %rd49 + 0 ], %rd50; + // end inline asm + .loc 1 38 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:34 + add.s64 %rd54, %rd24, 16384; + add.s64 %rd59, %rd128, %rd132; + .loc 1 38 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:51 + // begin inline asm + mov.u64 %rd55, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd55, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + mov.u64 %rd53, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd52, %rd53 }, [ %rd54 + 0 ], %rd55; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + mov.u64 %rd58, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd57, %rd58 }, [ %rd59 + 0 ], %rd60; + // end inline asm + .loc 1 36 41 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:41 + or.b32 %r13, %r10, 5120; + .loc 1 36 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:34 + add.s64 %rd64, %rd4, 32768; + mul.wide.s32 %rd133, %r13, 8; + add.s64 %rd69, %rd126, %rd133; + .loc 1 36 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:51 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd62, 0x0; + mov.u64 %rd63, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd62, %rd63 }, [ %rd64 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + mov.u64 %rd68, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd67, %rd68 }, [ %rd69 + 0 ], %rd70; + // end inline asm + .loc 1 37 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:34 + add.s64 %rd74, %rd14, 32768; + add.s64 %rd79, %rd127, %rd133; + .loc 1 37 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:51 + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + mov.u64 %rd73, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd72, %rd73 }, [ %rd74 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + mov.u64 %rd78, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd77, %rd78 }, [ %rd79 + 0 ], %rd80; + // end inline asm + .loc 1 38 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:34 + add.s64 %rd84, %rd24, 32768; + add.s64 %rd89, %rd128, %rd133; + .loc 1 38 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:51 + // begin inline asm + mov.u64 %rd85, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd85, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd82, 0x0; + mov.u64 %rd83, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd82, %rd83 }, [ %rd84 + 0 ], %rd85; + // end inline asm + // begin inline asm + mov.u64 %rd90, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd90, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd87, 0x0; + mov.u64 %rd88, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd87, %rd88 }, [ %rd89 + 0 ], %rd90; + // end inline asm + .loc 1 36 41 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:41 + or.b32 %r14, %r10, 7168; + .loc 1 36 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:34 + add.s64 %rd94, %rd4, 49152; + mul.wide.s32 %rd134, %r14, 8; + add.s64 %rd99, %rd126, %rd134; + .loc 1 36 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:36:51 + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + mov.u64 %rd93, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd92, %rd93 }, [ %rd94 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd100, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd100, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd97, 0x0; + mov.u64 %rd98, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd97, %rd98 }, [ %rd99 + 0 ], %rd100; + // end inline asm + .loc 1 37 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:34 + add.s64 %rd104, %rd14, 49152; + add.s64 %rd109, %rd127, %rd134; + .loc 1 37 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:37:51 + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + mov.u64 %rd103, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd102, %rd103 }, [ %rd104 + 0 ], %rd105; + // end inline asm + // begin inline asm + mov.u64 %rd110, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd110, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd107, 0x0; + mov.u64 %rd108, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd107, %rd108 }, [ %rd109 + 0 ], %rd110; + // end inline asm + .loc 1 38 34 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:34 + add.s64 %rd114, %rd24, 49152; + add.s64 %rd119, %rd128, %rd134; + .loc 1 38 51 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:38:51 + // begin inline asm + mov.u64 %rd115, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd115, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd112, 0x0; + mov.u64 %rd113, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd112, %rd113 }, [ %rd114 + 0 ], %rd115; + // end inline asm + // begin inline asm + mov.u64 %rd120, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd120, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd117, 0x0; + mov.u64 %rd118, 0x0; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b64 { %rd117, %rd118 }, [ %rd119 + 0 ], %rd120; + // end inline asm + .loc 1 39 23 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:39:23 + setp.eq.b64 %p29, %rd38, %rd48; + setp.eq.b64 %p30, %rd33, %rd43; + setp.eq.b64 %p31, %rd37, %rd47; + setp.eq.b64 %p32, %rd32, %rd42; + setp.eq.b64 %p33, %rd68, %rd78; + setp.eq.b64 %p34, %rd63, %rd73; + setp.eq.b64 %p35, %rd67, %rd77; + setp.eq.b64 %p36, %rd62, %rd72; + setp.eq.b64 %p37, %rd8, %rd18; + setp.eq.b64 %p38, %rd3, %rd13; + setp.eq.b64 %p39, %rd7, %rd17; + setp.eq.b64 %p40, %rd2, %rd12; + setp.eq.b64 %p41, %rd98, %rd108; + setp.eq.b64 %p42, %rd93, %rd103; + setp.eq.b64 %p43, %rd97, %rd107; + setp.eq.b64 %p44, %rd92, %rd102; + .loc 1 44 48 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:44:48 + selp.b64 %rd135, %rd112, 0, %p44; + selp.b64 %rd136, %rd135, 0, %p1; + selp.b64 %rd137, %rd117, 0, %p43; + selp.b64 %rd138, %rd137, 0, %p1; + selp.b64 %rd139, %rd113, 0, %p42; + selp.b64 %rd140, %rd139, 0, %p1; + selp.b64 %rd141, %rd118, 0, %p41; + selp.b64 %rd142, %rd141, 0, %p1; + selp.b64 %rd143, %rd22, 0, %p40; + selp.b64 %rd144, %rd143, 0, %p1; + selp.b64 %rd145, %rd27, 0, %p39; + selp.b64 %rd146, %rd145, 0, %p1; + selp.b64 %rd147, %rd23, 0, %p38; + selp.b64 %rd148, %rd147, 0, %p1; + selp.b64 %rd149, %rd28, 0, %p37; + selp.b64 %rd150, %rd149, 0, %p1; + selp.b64 %rd151, %rd82, 0, %p36; + selp.b64 %rd152, %rd151, 0, %p1; + selp.b64 %rd153, %rd87, 0, %p35; + selp.b64 %rd154, %rd153, 0, %p1; + selp.b64 %rd155, %rd83, 0, %p34; + selp.b64 %rd156, %rd155, 0, %p1; + selp.b64 %rd157, %rd88, 0, %p33; + selp.b64 %rd158, %rd157, 0, %p1; + selp.b64 %rd159, %rd52, 0, %p32; + selp.b64 %rd160, %rd159, 0, %p1; + selp.b64 %rd161, %rd57, 0, %p31; + selp.b64 %rd162, %rd161, 0, %p1; + selp.b64 %rd163, %rd53, 0, %p30; + selp.b64 %rd164, %rd163, 0, %p1; + selp.b64 %rd165, %rd58, 0, %p29; + selp.b64 %rd166, %rd165, 0, %p1; + .loc 1 26 37 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:26:37 + and.b32 %r15, %r5, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd167, %rd164, %rd166; + add.s64 %rd168, %rd160, %rd162; + add.s64 %rd169, %rd168, %rd167; + add.s64 %rd170, %rd156, %rd158; + add.s64 %rd171, %rd152, %rd154; + add.s64 %rd172, %rd171, %rd170; + add.s64 %rd173, %rd169, %rd172; + add.s64 %rd174, %rd148, %rd150; + add.s64 %rd175, %rd144, %rd146; + add.s64 %rd176, %rd175, %rd174; + add.s64 %rd177, %rd140, %rd142; + add.s64 %rd178, %rd136, %rd138; + add.s64 %rd179, %rd178, %rd177; + add.s64 %rd180, %rd176, %rd179; + add.s64 %rd181, %rd173, %rd180; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r16}, %rd181; + cvt.u32.u64 %r17, %rd181; + shfl.sync.bfly.b32 %r18, %r17, 16, 31, -1; + shfl.sync.bfly.b32 %r19, %r16, 16, 31, -1; + cvt.u64.u32 %rd182, %r18; + cvt.u64.u32 %rd183, %r19; + shl.b64 %rd184, %rd183, 32; + or.b64 %rd185, %rd182, %rd184; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd186, %rd181, %rd185; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r20}, %rd186; + cvt.u32.u64 %r21, %rd186; + shfl.sync.bfly.b32 %r22, %r21, 8, 31, -1; + shfl.sync.bfly.b32 %r23, %r20, 8, 31, -1; + cvt.u64.u32 %rd187, %r22; + cvt.u64.u32 %rd188, %r23; + shl.b64 %rd189, %rd188, 32; + or.b64 %rd190, %rd187, %rd189; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd191, %rd186, %rd190; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r24}, %rd191; + cvt.u32.u64 %r25, %rd191; + shfl.sync.bfly.b32 %r26, %r25, 4, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 4, 31, -1; + cvt.u64.u32 %rd192, %r26; + cvt.u64.u32 %rd193, %r27; + shl.b64 %rd194, %rd193, 32; + or.b64 %rd195, %rd192, %rd194; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd196, %rd191, %rd195; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r28}, %rd196; + cvt.u32.u64 %r29, %rd196; + shfl.sync.bfly.b32 %r30, %r29, 2, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 2, 31, -1; + cvt.u64.u32 %rd197, %r30; + cvt.u64.u32 %rd198, %r31; + shl.b64 %rd199, %rd198, 32; + or.b64 %rd200, %rd197, %rd199; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd201, %rd196, %rd200; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r32}, %rd201; + cvt.u32.u64 %r33, %rd201; + shfl.sync.bfly.b32 %r34, %r33, 1, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 1, 31, -1; + cvt.u64.u32 %rd202, %r34; + cvt.u64.u32 %rd203, %r35; + shl.b64 %rd204, %rd203, 32; + or.b64 %rd205, %rd202, %rd204; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd121, %rd201, %rd205; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + setp.eq.b32 %p25, %r15, 0; + shr.u32 %r36, %r5, 2; + and.b32 %r37, %r36, 120; + mov.b32 %r38, global_smem; + add.s32 %r1, %r38, %r37; + // begin inline asm + @%p25 st.shared.b64 [ %r1 + 0 ], %rd121; + // end inline asm + bar.sync 0; + setp.lt.u32 %p26, %r5, 16; + shl.b32 %r39, %r5, 3; + add.s32 %r2, %r38, %r39; + // begin inline asm + @%p26 ld.shared.b64 %rd122, [ %r2 + 0 ]; + // end inline asm + mov.b64 {_, %r40}, %rd122; + cvt.u32.u64 %r41, %rd122; + shfl.sync.bfly.b32 %r42, %r41, 8, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 8, 31, -1; + cvt.u64.u32 %rd206, %r42; + cvt.u64.u32 %rd207, %r43; + shl.b64 %rd208, %rd207, 32; + or.b64 %rd209, %rd206, %rd208; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd210, %rd122, %rd209; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r44}, %rd210; + cvt.u32.u64 %r45, %rd210; + shfl.sync.bfly.b32 %r46, %r45, 4, 31, -1; + shfl.sync.bfly.b32 %r47, %r44, 4, 31, -1; + cvt.u64.u32 %rd211, %r46; + cvt.u64.u32 %rd212, %r47; + shl.b64 %rd213, %rd212, 32; + or.b64 %rd214, %rd211, %rd213; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd215, %rd210, %rd214; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r48}, %rd215; + cvt.u32.u64 %r49, %rd215; + shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1; + shfl.sync.bfly.b32 %r51, %r48, 2, 31, -1; + cvt.u64.u32 %rd216, %r50; + cvt.u64.u32 %rd217, %r51; + shl.b64 %rd218, %rd217, 32; + or.b64 %rd219, %rd216, %rd218; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd220, %rd215, %rd219; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + mov.b64 {_, %r52}, %rd220; + cvt.u32.u64 %r53, %rd220; + shfl.sync.bfly.b32 %r54, %r53, 1, 31, -1; + shfl.sync.bfly.b32 %r55, %r52, 1, 31, -1; + cvt.u64.u32 %rd221, %r54; + cvt.u64.u32 %rd222, %r55; + shl.b64 %rd223, %rd222, 32; + or.b64 %rd224, %rd221, %rd223; + .loc 2 261 15 // standard.py:261:15 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + add.s64 %rd123, %rd220, %rd224; + .loc 2 291 36 // standard.py:291:36 @[ czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:45:25 ] + setp.eq.b32 %p27, %r5, 0; + // begin inline asm + @%p27 st.shared.b64 [ %r2 + 0 ], %rd123; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd124, [global_smem]; +$L__tmp2: + .loc 1 46 25 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:46:25 + mad.wide.u32 %rd125, %r4, 8, %rd129; + .loc 1 46 36 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:46:36 + and.b32 %r56, %r5, 511; + setp.eq.b32 %p45, %r56, 0; + and.pred %p28, %p1, %p45; + // begin inline asm + @%p28 st.global.b64 [ %rd125 + 0 ], { %rd124 }; + // end inline asm + .loc 1 46 4 // czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py:46:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 222 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 122 +.b8 98 +.b8 107 +.b8 117 +.b8 115 +.b8 100 +.b8 98 +.b8 109 +.b8 105 +.b8 121 +.b8 54 +.b8 107 +.b8 105 +.b8 107 +.b8 113 +.b8 52 +.b8 113 +.b8 111 +.b8 100 +.b8 55 +.b8 109 +.b8 100 +.b8 120 +.b8 54 +.b8 119 +.b8 106 +.b8 53 +.b8 104 +.b8 115 +.b8 107 +.b8 103 +.b8 104 +.b8 99 +.b8 100 +.b8 110 +.b8 53 +.b8 103 +.b8 114 +.b8 52 +.b8 122 +.b8 54 +.b8 104 +.b8 99 +.b8 100 +.b8 121 +.b8 52 +.b8 118 +.b8 51 +.b8 110 +.b8 98 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 122 +.b8 98 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x28 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xb3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..065b1d128d9fd5cdf9415edce99c141e97072a0d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.source @@ -0,0 +1,203 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":18:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc44 = loc(unknown) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc51 = loc("in_ptr0"(#loc)) +#loc52 = loc("in_ptr1"(#loc)) +#loc53 = loc("in_ptr2"(#loc)) +#loc54 = loc("out_ptr0"(#loc)) +#loc55 = loc("xnumel"(#loc)) +#loc56 = loc("r0_numel"(#loc)) +#loc94 = loc("input"(#loc42)) +#loc95 = loc("a"(#loc47)) +#loc96 = loc("b"(#loc47)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2 : i32 loc(#loc57) + %r0_numel_1 = arith.constant 8192 : i32 loc(#loc58) + %xoffset = tt.get_program_id x : i32 loc(#loc59) + %xoffset_2 = arith.constant 1 : i32 loc(#loc60) + %xoffset_3 = arith.constant 1 : i32 loc(#loc60) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc60) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc61) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc62) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc63) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc63) + %xmask = arith.constant dense<2> : tensor<1x1xi32> loc(#loc64) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc64) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc65) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc66) + %_tmp7 = arith.constant 0 : i64 loc(#loc67) + %_tmp7_10 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc67) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp7_11 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp7_13 = %_tmp7_10) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc69) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc69) + %r0_mask = arith.constant dense<8192> : tensor<1x2048xi32> loc(#loc70) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc70) + %tmp0 = arith.constant 8192 : i32 loc(#loc71) + %tmp0_16 = arith.constant 8192 : i32 loc(#loc71) + %tmp0_17 = arith.constant dense<8192> : tensor<1x1xi32> loc(#loc71) + %tmp0_18 = arith.muli %tmp0_17, %xindex_7 : tensor<1x1xi32> loc(#loc71) + %tmp0_19 = tt.broadcast %tmp0_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc72) + %tmp0_20 = arith.addi %r0_index_14, %tmp0_19 : tensor<1x2048xi32> loc(#loc72) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc73) + %tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc73) + %tmp0_23 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc74) + %tmp0_24 = arith.andi %r0_mask_15, %tmp0_23 : tensor<1x2048xi1> loc(#loc74) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc75) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc75) + %tmp0_27 = arith.fptosi %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc75) + %tmp0_28 = tt.load %tmp0_22, %tmp0_24, %tmp0_27 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc75) + %tmp1 = arith.constant 8192 : i32 loc(#loc76) + %tmp1_29 = arith.constant 8192 : i32 loc(#loc76) + %tmp1_30 = arith.constant dense<8192> : tensor<1x1xi32> loc(#loc76) + %tmp1_31 = arith.muli %tmp1_30, %xindex_7 : tensor<1x1xi32> loc(#loc76) + %tmp1_32 = tt.broadcast %tmp1_31 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc77) + %tmp1_33 = arith.addi %r0_index_14, %tmp1_32 : tensor<1x2048xi32> loc(#loc77) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc78) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc78) + %tmp1_36 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc79) + %tmp1_37 = arith.andi %r0_mask_15, %tmp1_36 : tensor<1x2048xi1> loc(#loc79) + %tmp1_38 = arith.constant 0.000000e+00 : f32 loc(#loc80) + %tmp1_39 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc80) + %tmp1_40 = arith.fptosi %tmp1_39 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc80) + %tmp1_41 = tt.load %tmp1_35, %tmp1_37, %tmp1_40 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc80) + %tmp4 = arith.constant 8192 : i32 loc(#loc81) + %tmp4_42 = arith.constant 8192 : i32 loc(#loc81) + %tmp4_43 = arith.constant dense<8192> : tensor<1x1xi32> loc(#loc81) + %tmp4_44 = arith.muli %tmp4_43, %xindex_7 : tensor<1x1xi32> loc(#loc81) + %tmp4_45 = tt.broadcast %tmp4_44 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc82) + %tmp4_46 = arith.addi %r0_index_14, %tmp4_45 : tensor<1x2048xi32> loc(#loc82) + %tmp4_47 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc83) + %tmp4_48 = tt.addptr %tmp4_47, %tmp4_46 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc83) + %tmp4_49 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc84) + %tmp4_50 = arith.andi %r0_mask_15, %tmp4_49 : tensor<1x2048xi1> loc(#loc84) + %tmp4_51 = arith.constant 0.000000e+00 : f32 loc(#loc85) + %tmp4_52 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc85) + %tmp4_53 = arith.fptosi %tmp4_52 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc85) + %tmp4_54 = tt.load %tmp4_48, %tmp4_50, %tmp4_53 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc85) + %tmp2 = arith.cmpi eq, %tmp0_28, %tmp1_41 : tensor<1x2048xi64> loc(#loc86) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc87) + %tmp5 = arith.muli %tmp3, %tmp4_54 : tensor<1x2048xi64> loc(#loc88) + %tmp8 = arith.addi %_tmp7_13, %tmp5 : tensor<1x2048xi64> loc(#loc89) + %_tmp7_55 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc90) + %_tmp7_56 = arith.andi %r0_mask_15, %_tmp7_55 : tensor<1x2048xi1> loc(#loc90) + %_tmp7_57 = arith.select %_tmp7_56, %tmp8, %_tmp7_13 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc91) + scf.yield %_tmp7_57 : tensor<1x2048xi64> loc(#loc36) + } loc(#loc68) + %tmp7 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp7_11) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc92) + %tmp7_12 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc93) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc39) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc39) + tt.store %5, %tmp7_12, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc42))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc43) + tt.reduce.return %2 : i64 loc(#loc43) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc43) + tt.return %0 : tensor<1xi64> loc(#loc45) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc46) + tt.return %1 : tensor<1xi64> loc(#loc46) + } loc(#loc42) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc47)), %b: i64 loc("b"(#loc47))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc48) + tt.return %0 : i64 loc(#loc49) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc50) + tt.return %1 : i64 loc(#loc50) + } loc(#loc47) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":25:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":29:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:61) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:51) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:46) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:41) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:51) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:61) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:51) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":39:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":40:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":41:22) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":43:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:48) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc57 = loc("xnumel"(#loc1)) +#loc58 = loc("r0_numel"(#loc2)) +#loc59 = loc("xoffset"(#loc3)) +#loc60 = loc("xoffset"(#loc4)) +#loc61 = loc("xindex"(#loc5)) +#loc62 = loc("xindex"(#loc6)) +#loc63 = loc("xindex"(#loc7)) +#loc64 = loc("xmask"(#loc8)) +#loc65 = loc("r0_base"(#loc9)) +#loc66 = loc("r0_base"(#loc10)) +#loc67 = loc("_tmp7"(#loc11)) +#loc68 = loc("_tmp7"(#loc12)) +#loc69 = loc("r0_index"(#loc13)) +#loc70 = loc("r0_mask"(#loc14)) +#loc71 = loc("tmp0"(#loc15)) +#loc72 = loc("tmp0"(#loc16)) +#loc73 = loc("tmp0"(#loc17)) +#loc74 = loc("tmp0"(#loc18)) +#loc75 = loc("tmp0"(#loc19)) +#loc76 = loc("tmp1"(#loc20)) +#loc77 = loc("tmp1"(#loc21)) +#loc78 = loc("tmp1"(#loc22)) +#loc79 = loc("tmp1"(#loc23)) +#loc80 = loc("tmp1"(#loc24)) +#loc81 = loc("tmp4"(#loc25)) +#loc82 = loc("tmp4"(#loc26)) +#loc83 = loc("tmp4"(#loc27)) +#loc84 = loc("tmp4"(#loc28)) +#loc85 = loc("tmp4"(#loc29)) +#loc86 = loc("tmp2"(#loc30)) +#loc87 = loc("tmp3"(#loc31)) +#loc88 = loc("tmp5"(#loc32)) +#loc89 = loc("tmp8"(#loc33)) +#loc90 = loc("_tmp7"(#loc34)) +#loc91 = loc("_tmp7"(#loc35)) +#loc92 = loc("tmp7"(#loc37)) +#loc93 = loc("tmp7"(#loc38)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ade96a6a1849292db7bed3ed7b047b9ceb0ad19a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttgir @@ -0,0 +1,116 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":18:0) +#loc1 = loc(unknown) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:25) +#loc30 = loc("in_ptr0"(#loc)) +#loc31 = loc("in_ptr1"(#loc)) +#loc32 = loc("in_ptr2"(#loc)) +#loc33 = loc("out_ptr0"(#loc)) +#loc34 = loc("xnumel"(#loc)) +#loc35 = loc("r0_numel"(#loc)) +#loc56 = loc("tmp7"(#loc24)) +#loc61 = loc(callsite(#loc1 at #loc56)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<8192> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c8192_i32 = arith.constant 8192 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc36) + %xmask = arith.cmpi slt, %xoffset, %c2_i32 : i32 loc(#loc37) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc38) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc38) + %tmp0 = arith.muli %xoffset, %c8192_i32 : i32 loc(#loc39) + %tmp0_2 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc58) + %tmp0_3 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc41) + %tmp0_4 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc59) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc43) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc44) + %_tmp7 = scf.for %_tmp7_7 = %c0_i32 to %c8192_i32 step %c2048_i32 iter_args(%arg7 = %cst_0) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp7_7 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_index_8 = arith.addi %r0_index, %r0_base_1 : tensor<1x2048xi32, #blocked> loc(#loc46) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst : tensor<1x2048xi32, #blocked> loc(#loc47) + %tmp0_9 = arith.addi %r0_index_8, %tmp0_2 : tensor<1x2048xi32, #blocked> loc(#loc40) + %tmp0_10 = tt.addptr %tmp0_3, %tmp0_9 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc41) + %tmp0_11 = arith.andi %r0_mask, %tmp0_4 : tensor<1x2048xi1, #blocked> loc(#loc42) + %tmp0_12 = tt.load %tmp0_10, %tmp0_11, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc48) + %tmp1_13 = tt.addptr %tmp1, %tmp0_9 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc43) + %tmp1_14 = tt.load %tmp1_13, %tmp0_11, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc49) + %tmp4_15 = tt.addptr %tmp4, %tmp0_9 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc44) + %tmp4_16 = tt.load %tmp4_15, %tmp0_11, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc50) + %tmp2 = arith.cmpi eq, %tmp0_12, %tmp1_14 : tensor<1x2048xi64, #blocked> loc(#loc51) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc52) + %tmp5 = arith.muli %tmp3, %tmp4_16 : tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp8 = arith.addi %arg7, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc54) + %_tmp7_17 = arith.select %tmp0_11, %tmp8, %arg7 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc55) + scf.yield %_tmp7_17 : tensor<1x2048xi64, #blocked> loc(#loc22) + } loc(#loc45) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_7: i64 loc(callsite(#loc1 at #loc56)), %tmp7_8: i64 loc(callsite(#loc1 at #loc56))): + %tmp7_9 = arith.addi %tmp7_7, %tmp7_8 : i64 loc(#loc62) + tt.reduce.return %tmp7_9 : i64 loc(#loc60) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc60) + %tmp7_5 = ttg.convert_layout %tmp7 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc57) + %tmp7_6 = tt.expand_dims %tmp7_5 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc57) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc27) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc28) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc28) + tt.store %1, %tmp7_6, %2 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc28) + tt.return loc(#loc29) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":26:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:46) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:61) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":30:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":31:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":32:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:51) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:51) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":39:23) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":40:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":41:22) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":43:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:48) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:8) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:28) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:4) +#loc36 = loc("xoffset"(#loc2)) +#loc37 = loc("xmask"(#loc3)) +#loc38 = loc("r0_base"(#loc4)) +#loc39 = loc("tmp0"(#loc5)) +#loc40 = loc("tmp0"(#loc6)) +#loc41 = loc("tmp0"(#loc7)) +#loc42 = loc("tmp0"(#loc8)) +#loc43 = loc("tmp1"(#loc9)) +#loc44 = loc("tmp4"(#loc10)) +#loc45 = loc("_tmp7"(#loc11)) +#loc46 = loc("r0_index"(#loc12)) +#loc47 = loc("r0_mask"(#loc13)) +#loc48 = loc("tmp0"(#loc14)) +#loc49 = loc("tmp1"(#loc15)) +#loc50 = loc("tmp4"(#loc16)) +#loc51 = loc("tmp2"(#loc17)) +#loc52 = loc("tmp3"(#loc18)) +#loc53 = loc("tmp5"(#loc19)) +#loc54 = loc("tmp8"(#loc20)) +#loc55 = loc("_tmp7"(#loc21)) +#loc57 = loc("tmp7"(#loc26)) +#loc58 = loc(fused[#loc40, #loc39]) +#loc59 = loc(fused[#loc42, #loc37]) +#loc60 = loc(callsite(#loc23 at #loc56)) +#loc62 = loc(callsite(#loc25 at #loc60)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..002a90ea81a682d5a116fe6a779571cdfa11461a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/I3FLOHYXURM2XXUDZDUSAEQ2II7BYD4JPZA42S67IQC7KETBYUYQ/triton_red_fused_eq_mul_squeeze_sum_2.ttir @@ -0,0 +1,115 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":18:0) +#loc3 = loc(unknown) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:25) +#loc31 = loc("in_ptr0"(#loc)) +#loc32 = loc("in_ptr1"(#loc)) +#loc33 = loc("in_ptr2"(#loc)) +#loc34 = loc("out_ptr0"(#loc)) +#loc35 = loc("xnumel"(#loc)) +#loc36 = loc("r0_numel"(#loc)) +#loc58 = loc("tmp7"(#loc25)) +#loc63 = loc(callsite(#loc3 at #loc58)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 2 : i32 loc(#loc37) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c8192_i32 = arith.constant 8192 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst = arith.constant dense<8192> : tensor<1x2048xi32> loc(#loc3) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc38) + %xmask_1 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc37) + %xmask_2 = tt.splat %xmask_1 : i1 -> tensor<1x1xi1> loc(#loc37) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc39) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc40) + %_tmp7 = scf.for %r0_offset = %c0_i32 to %c8192_i32 step %c2048_i32 iter_args(%_tmp7_5 = %cst_0) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc42) + %r0_index_6 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc42) + %r0_mask = arith.cmpi slt, %r0_index_6, %cst : tensor<1x2048xi32> loc(#loc43) + %tmp0 = arith.muli %xoffset, %c8192_i32 : i32 loc(#loc44) + %tmp0_7 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc60) + %tmp0_8 = arith.addi %r0_index_6, %tmp0_7 : tensor<1x2048xi32> loc(#loc45) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc46) + %tmp0_10 = tt.addptr %tmp0_9, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc46) + %tmp0_11 = tt.splat %xmask_1 : i1 -> tensor<1x2048xi1> loc(#loc61) + %tmp0_12 = arith.andi %r0_mask, %tmp0_11 : tensor<1x2048xi1> loc(#loc47) + %tmp0_13 = tt.load %tmp0_10, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc48) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc49) + %tmp1_14 = tt.addptr %tmp1, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc49) + %tmp1_15 = tt.load %tmp1_14, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc50) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc51) + %tmp4_16 = tt.addptr %tmp4, %tmp0_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc51) + %tmp4_17 = tt.load %tmp4_16, %tmp0_12, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc52) + %tmp2 = arith.cmpi eq, %tmp0_13, %tmp1_15 : tensor<1x2048xi64> loc(#loc53) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc54) + %tmp5 = arith.muli %tmp3, %tmp4_17 : tensor<1x2048xi64> loc(#loc55) + %tmp8 = arith.addi %_tmp7_5, %tmp5 : tensor<1x2048xi64> loc(#loc56) + %_tmp7_18 = arith.select %tmp0_12, %tmp8, %_tmp7_5 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc57) + scf.yield %_tmp7_18 : tensor<1x2048xi64> loc(#loc23) + } loc(#loc41) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_5: i64 loc(callsite(#loc3 at #loc58)), %tmp7_6: i64 loc(callsite(#loc3 at #loc58))): + %tmp7_7 = arith.addi %tmp7_5, %tmp7_6 : i64 loc(#loc64) + tt.reduce.return %tmp7_7 : i64 loc(#loc62) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc62) + %tmp7_4 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc59) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc28) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc28) + tt.store %1, %tmp7_4, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc29) + tt.return loc(#loc30) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":25:21) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":30:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":31:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":32:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:46) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:61) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":36:51) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":37:51) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":38:51) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":39:23) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":40:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":41:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":43:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":44:8) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":45:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:36) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zb/czbkusdbmiy6kikq4qod7mdx6wj5hskghcdn5gr4z6hcdy4v3nbz.py":46:4) +#loc37 = loc("xmask"(#loc1)) +#loc38 = loc("xoffset"(#loc4)) +#loc39 = loc("r0_base"(#loc5)) +#loc40 = loc("r0_base"(#loc6)) +#loc41 = loc("_tmp7"(#loc2)) +#loc42 = loc("r0_index"(#loc7)) +#loc43 = loc("r0_mask"(#loc8)) +#loc44 = loc("tmp0"(#loc9)) +#loc45 = loc("tmp0"(#loc10)) +#loc46 = loc("tmp0"(#loc11)) +#loc47 = loc("tmp0"(#loc12)) +#loc48 = loc("tmp0"(#loc13)) +#loc49 = loc("tmp1"(#loc14)) +#loc50 = loc("tmp1"(#loc15)) +#loc51 = loc("tmp4"(#loc16)) +#loc52 = loc("tmp4"(#loc17)) +#loc53 = loc("tmp2"(#loc18)) +#loc54 = loc("tmp3"(#loc19)) +#loc55 = loc("tmp5"(#loc20)) +#loc56 = loc("tmp8"(#loc21)) +#loc57 = loc("_tmp7"(#loc22)) +#loc59 = loc("tmp7"(#loc27)) +#loc60 = loc(fused[#loc45, #loc44]) +#loc61 = loc(fused[#loc47, #loc37]) +#loc62 = loc(callsite(#loc24 at #loc58)) +#loc64 = loc(callsite(#loc26 at #loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..07d646bb82bd50f208209946f415ee48f3ae4e76 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..aff6ac4158c086cb685bfcb0db7fa5105cb2a4fe Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..08cd7b2db2a3ac379394dffd27f4c9ab6d5f4fed --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "40f13ce30f2963a4d8cc6f57d784a290154914937086d3d5d42b778f7bc2b17c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..8a1289cbe5aff7bf82da2353016d03be499c9a5b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.llir @@ -0,0 +1,182 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 252, !dbg !9 + %11 = lshr exact i32 %10, 2, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = and i32 %9, 3, !dbg !11 + %14 = sdiv i32 %12, 2048, !dbg !12 + %15 = mul i32 %12, 32000 + %16 = mul i32 %14, 224000 + %17 = add i32 %16, %15 + %18 = zext nneg i32 %13 to i64, !dbg !13 + br label %19, !dbg !13 + +19: ; preds = %6, %19 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %19 ] + %20 = phi i32 [ 2147483647, %6 ], [ %44, %19 ] + %21 = phi float [ 0xFFF0000000000000, %6 ], [ %43, %19 ] + %22 = or disjoint i64 %indvars.iv, %18, !dbg !14 + %23 = trunc nuw nsw i64 %22 to i32, !dbg !15 + %24 = add i32 %17, %23, !dbg !15 + %25 = sext i32 %24 to i64, !dbg !16 + %26 = getelementptr float, ptr addrspace(1) %0, i64 %25, !dbg !16 + %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %28 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %26, i64 %27, i1 true) #4, !dbg !17 + %29 = bitcast i32 %28 to float, !dbg !17 + %30 = fcmp ogt float %21, %29, !dbg !18 + %31 = fcmp oeq float %21, %29, !dbg !22 + %32 = fcmp uno float %21, 0.000000e+00, !dbg !23 + %33 = fcmp uno float %29, 0.000000e+00, !dbg !24 + %34 = xor i1 %33, true, !dbg !25 + %35 = and i1 %32, %34, !dbg !26 + %36 = or i1 %30, %35, !dbg !27 + %37 = and i1 %32, %33, !dbg !28 + %38 = or i1 %31, %37, !dbg !29 + %39 = sext i32 %20 to i64, !dbg !30 + %40 = icmp sgt i64 %22, %39, !dbg !30 + %41 = and i1 %40, %38, !dbg !31 + %42 = or i1 %36, %41, !dbg !32 + %43 = select i1 %42, float %21, float %29, !dbg !33 + %44 = select i1 %42, i32 %20, i32 %23, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !13 + %45 = icmp samesign ult i64 %indvars.iv, 31996, !dbg !13 + br i1 %45, label %19, label %46, !dbg !13 + +46: ; preds = %19 + %47 = and i32 %9, 63, !dbg !9 + %48 = or disjoint i32 %8, %47, !dbg !10 + %49 = bitcast float %43 to i32, !dbg !35 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %49, i32 2, i32 31), !dbg !35 + %51 = bitcast i32 %50 to float, !dbg !35 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !35 + %53 = fcmp ogt float %43, %51, !dbg !37 + %54 = fcmp oeq float %43, %51, !dbg !38 + %55 = fcmp uno float %43, 0.000000e+00, !dbg !39 + %56 = fcmp uno float %51, 0.000000e+00, !dbg !40 + %57 = xor i1 %56, true, !dbg !41 + %58 = and i1 %55, %57, !dbg !42 + %59 = or i1 %53, %58, !dbg !43 + %60 = and i1 %55, %56, !dbg !44 + %61 = or i1 %54, %60, !dbg !45 + %62 = icmp slt i32 %44, %52, !dbg !46 + %63 = and i1 %62, %61, !dbg !47 + %64 = or i1 %59, %63, !dbg !48 + %65 = select i1 %64, float %43, float %51, !dbg !49 + %66 = select i1 %64, i32 %44, i32 %52, !dbg !50 + %67 = bitcast float %65 to i32, !dbg !35 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 1, i32 31), !dbg !35 + %69 = bitcast i32 %68 to float, !dbg !35 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 1, i32 31), !dbg !35 + %71 = fcmp ogt float %65, %69, !dbg !37 + %72 = fcmp oeq float %65, %69, !dbg !38 + %73 = fcmp uno float %65, 0.000000e+00, !dbg !39 + %74 = fcmp uno float %69, 0.000000e+00, !dbg !40 + %75 = xor i1 %74, true, !dbg !41 + %76 = and i1 %73, %75, !dbg !42 + %77 = or i1 %71, %76, !dbg !43 + %78 = and i1 %74, %73, !dbg !44 + %79 = or i1 %72, %78, !dbg !45 + %80 = icmp slt i32 %66, %70, !dbg !46 + %81 = and i1 %80, %79, !dbg !47 + %82 = or i1 %77, %81, !dbg !48 + %83 = select i1 %82, i32 %66, i32 %70, !dbg !50 + %84 = sext i32 %48 to i64, !dbg !51 + %85 = getelementptr i64, ptr addrspace(1) %1, i64 %84, !dbg !51 + %86 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10, !dbg !52 + %87 = insertelement <1 x i32> poison, i32 %83, i64 0, !dbg !52 + store <1 x i32> %87, ptr addrspace(3) %86, align 4, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %88 = shl nuw nsw i32 %47, 2, !dbg !52 + %89 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %88, !dbg !52 + %90 = load i32, ptr addrspace(3) %89, align 4, !dbg !52 + %91 = sext i32 %90 to i64, !dbg !52 + %92 = and i32 %9, 192, !dbg !52 + %93 = icmp eq i32 %92, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %91, ptr addrspace(1) %85, i1 %93) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 33, column: 40, scope: !4) +!14 = !DILocation(line: 34, column: 31, scope: !4) +!15 = !DILocation(line: 39, column: 52, scope: !4) +!16 = !DILocation(line: 39, column: 34, scope: !4) +!17 = !DILocation(line: 39, column: 66, scope: !4) +!18 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 38, scope: !4) +!22 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 165, column: 42, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !36) +!47 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !36) +!48 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !36) +!49 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !36) +!50 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ff8668bc723c171a356b411286bffa16bc514868 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ptx @@ -0,0 +1,433 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 256 +{ + .reg .pred %p<33>; + .reg .b32 %r<48>; + .reg .b64 %rd<17>; + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 + +// %bb.0: + ld.param.b64 %rd6, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd5, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:33 + shl.b32 %r1, %r10, 6; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r11, %r2, 2, 6; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r12, %r11, %r1; + .loc 1 26 37 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:26:37 + and.b32 %r13, %r2, 3; + .loc 1 29 19 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:29:19 + bfe.s32 %r14, %r10, 25, 1; + shr.u32 %r15, %r14, 21; + add.s32 %r16, %r12, %r15; + shr.s32 %r17, %r16, 11; + mul.lo.s32 %r18, %r17, 224000; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + cvt.u64.u32 %rd1, %r13; + mad.lo.s32 %r19, %r10, 2048000, %r18; + mad.lo.s32 %r20, %r11, 32000, %r19; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + or.b32 %r21, %r13, %r20; + cvt.u64.u32 %rd2, %r21; + mov.b32 %r47, 0fFF800000; + mov.b32 %r46, 2147483647; + mov.b64 %rd16, 0; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 52 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:52 + add.s64 %rd11, %rd1, %rd16; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + add.s64 %rd12, %rd2, %rd16; + cvt.u32.u64 %r24, %rd12; + mad.wide.s32 %rd9, %r24, 4, %rd5; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + // begin inline asm + mov.u64 %rd8, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd8, 1.0; + // end inline asm + mov.b32 %r23, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r22, %r23; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b32 { %r22 }, [ %rd9 + 0 ], %rd8; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p2, %r47, %r22; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p3, %r47, %r22; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p4, %r47, %r47; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p5, %r22, %r22; + setp.num.f32 %p6, %r22, %r22; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p7, %p4, %p6; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p8, %p2, %p7; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p9, %p4, %p5; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p10, %p3, %p9; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd13, %r46; + setp.gt.s64 %p11, %rd11, %rd13; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p12, %p11, %p10; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p13, %p8, %p12; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r47, %r47, %r22, %p13; + cvt.u32.u64 %r25, %rd11; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r46, %r46, %r25, %p13; +$L__tmp2: + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + add.s64 %rd4, %rd16, 4; + setp.lt.u64 %p14, %rd16, 31996; + mov.b64 %rd16, %rd4; + @%p14 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r26, %r2, 63; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r27, %r1, %r26; +$L__tmp3: + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r28, %r47, 2, 31, -1; + shfl.sync.bfly.b32 %r29, %r46, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p16, %r47, %r28; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p17, %r47, %r28; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p18, %r47, %r47; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p19, %r28, %r28; + setp.num.f32 %p20, %r28, %r28; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p21, %p18, %p20; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p22, %p16, %p21; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p23, %p18, %p19; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p24, %p17, %p23; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p25, %r46, %r29; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p26, %p25, %p24; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p27, %p22, %p26; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r30, %r47, %r28, %p27; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r31, %r46, %r29, %p27; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r32, %r30, 1, 31, -1; + shfl.sync.bfly.b32 %r33, %r31, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p28, %r30, %r32; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p29, %r30, %r32; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p30, %r30, %r30; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p31, %r32, %r32; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p32, %r31, %r33; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r34, %r31, %r33, %p30; + selp.b32 %r35, %r34, %r33, %p31; + selp.b32 %r36, %r31, %r35, %p29; + selp.b32 %r37, %r36, %r33, %p32; + selp.b32 %r38, %r37, %r31, %p31; + selp.b32 %r39, %r38, %r37, %p30; + selp.b32 %r40, %r31, %r39, %p28; +$L__tmp4: + .loc 1 48 25 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:25 + mad.wide.s32 %rd15, %r27, 8, %rd6; + .loc 1 48 36 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:36 + mov.b32 %r41, global_smem; + add.s32 %r42, %r41, %r3; + st.shared.b32 [%r42], %r40; + bar.sync 0; + shl.b32 %r43, %r26, 2; + add.s32 %r44, %r41, %r43; + ld.shared.s32 %rd14, [%r44]; + and.b32 %r45, %r2, 192; + setp.eq.b32 %p15, %r45, 0; + // begin inline asm + @%p15 st.global.b64 [ %rd15 + 0 ], { %rd14 }; + // end inline asm + .loc 1 48 4 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 53 +.b8 114 +.b8 111 +.b8 97 +.b8 116 +.b8 115 +.b8 101 +.b8 122 +.b8 111 +.b8 100 +.b8 113 +.b8 102 +.b8 53 +.b8 113 +.b8 112 +.b8 100 +.b8 51 +.b8 50 +.b8 104 +.b8 107 +.b8 115 +.b8 112 +.b8 116 +.b8 52 +.b8 102 +.b8 99 +.b8 102 +.b8 105 +.b8 113 +.b8 111 +.b8 55 +.b8 106 +.b8 111 +.b8 98 +.b8 111 +.b8 109 +.b8 55 +.b8 112 +.b8 122 +.b8 104 +.b8 115 +.b8 102 +.b8 116 +.b8 117 +.b8 107 +.b8 50 +.b8 55 +.b8 103 +.b8 106 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..864cab0a05da55e5949e9545c4508fcf9f81304f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.source @@ -0,0 +1,315 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 64 : i32 loc(#loc77) + %xoffset_3 = arith.constant 64 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc81) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c4_i32 = arith.constant 4 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x4xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<64x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<64x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc92) + %tmp0_27 = tt.broadcast %tmp0_25 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<64x4xi32> loc(#loc92) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_31 = arith.constant dense<65760000> : tensor<64x1xi32> loc(#loc93) + %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<64x1xi32> loc(#loc93) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<64x4xi32> loc(#loc94) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc95) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc95) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc96) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc96) + %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<64x4xf32>, tensor<64x4xi32>, tensor<64x4xf32>, tensor<1x4xi32>) -> (tensor<64x4xf32>, tensor<64x4xi32>) loc(#loc24) + %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc97) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc97) + %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc98) + %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc98) + scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_4S_i32S64_4S_fp32S64_4S_i32S1_4S__(%a_value: tensor<64x4xf32> loc("a_value"(#loc33)), %a_index: tensor<64x4xi32> loc("a_index"(#loc33)), %b_value: tensor<64x4xf32> loc("b_value"(#loc33)), %b_index: tensor<1x4xi32> loc("b_index"(#loc33))) -> (tensor<64x4xf32>, tensor<64x4xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x4xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x4xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%a_value) : (tensor<64x4xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<64x4xi1>, tensor<64x4xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x4xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x4xf32> loc(#loc107) + %mask_4 = arith.constant true loc(#loc108) + %mask_5 = arith.constant dense : tensor<64x4xi1> loc(#loc108) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x4xi1> loc(#loc108) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x4xi1> loc(#loc109) + %mask_8 = arith.ori %mask, %mask_7 : tensor<64x4xi1> loc(#loc123) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc111) + %equal_10 = arith.ori %equal, %equal_9 : tensor<64x4xi1> loc(#loc124) + scf.yield %mask_8, %equal_10 : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<64x4xi1>, tensor<64x4xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc113) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x4xi32> loc(#loc113) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<64x4xi1> loc(#loc114) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<64x4xi1> loc(#loc115) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc49) + %3 = tt.broadcast %b_index : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc50) + %4 = arith.select %mask_3, %a_index, %3 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc50) + tt.return %2, %4 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc51) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x4xf32> loc(#loc52) + %6 = ub.poison : tensor<64x4xi32> loc(#loc52) + tt.return %5, %6 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x) : (tensor<64x4xf32>) -> tensor<64x4xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_4S__(%x: tensor<64x4xf32> loc("x"(#loc57))) -> tensor<64x4xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x4xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<64x4xf32> loc(#loc59) + tt.return %4 : tensor<64x4xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x4xf32> loc(#loc61) + tt.return %5 : tensor<64x4xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_4S_i32S64_4S__(2,)cconstexpr_1_"(%value: tensor<64x4xf32> loc("value"(#loc66)), %index: tensor<64x4xi32> loc("index"(#loc66))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc69) + %2 = ub.poison : tensor<64xi32> loc(#loc69) + tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c896dcb96a5f896ec1ab21b41641a079e7c17fe5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,203 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc37)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_3 = arith.constant dense : tensor<64x4xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_4 = arith.constant dense<32000> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2147483647> : tensor<64x4xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0xFF800000> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_7 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_8 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc48) + %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc48) + %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked> loc(#loc49) + %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc49) + %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<64x1xi32, #blocked> loc(#loc49) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked1> loc(#loc49) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50) + %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc50) + %x0 = arith.remsi %xindex_13, %cst : tensor<64x1xi32, #blocked> loc(#loc51) + %x1 = arith.divsi %xindex_13, %cst : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_16 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc57) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_20 = %cst_5) -> (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc59) + %r0_index_21 = arith.addi %r0_index, %r0_base_15 : tensor<1x4xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_21, %cst_4 : tensor<1x4xi32, #blocked> loc(#loc60) + %tmp0_22 = tt.broadcast %r0_index_21 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_22, %tmp0_16 : tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_18 : tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_25 = tt.addptr %tmp0_19, %tmp0_24 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc57) + %tmp0_26 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc61) + %tmp0_27 = tt.load %tmp0_25, %tmp0_26, %cst_2 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_27 : tensor<64x4xf32, #blocked> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_27 : tensor<64x4xf32, #blocked> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x4xf32, #blocked> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_27, %tmp0_27 : tensor<64x4xf32, #blocked> loc(#loc85) + %mask_28 = arith.xori %b_isnan, %cst_3 : tensor<64x4xi1, #blocked> loc(#loc86) + %mask_29 = arith.andi %a_isnan, %mask_28 : tensor<64x4xi1, #blocked> loc(#loc87) + %mask_30 = arith.ori %mask, %mask_29 : tensor<64x4xi1, #blocked> loc(#loc106) + %equal_31 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1, #blocked> loc(#loc89) + %equal_32 = arith.ori %equal, %equal_31 : tensor<64x4xi1, #blocked> loc(#loc107) + %mask_33 = arith.cmpi slt, %_tmp2_index_20, %tmp0_22 : tensor<64x4xi32, #blocked> loc(#loc91) + %mask_34 = arith.andi %equal_32, %mask_33 : tensor<64x4xi1, #blocked> loc(#loc92) + %mask_35 = arith.ori %mask_30, %mask_34 : tensor<64x4xi1, #blocked> loc(#loc93) + %5 = arith.select %mask_35, %_tmp2, %tmp0_27 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc74) + %6 = arith.select %mask_35, %_tmp2_index_20, %tmp0_22 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, #blocked> loc(#loc75) + %_tmp2_36 = arith.select %tmp0_26, %5, %_tmp2 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc76) + %_tmp2_index_37 = arith.select %tmp0_26, %6, %_tmp2_index_20 : tensor<64x4xi1, #blocked>, tensor<64x4xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_36, %_tmp2_index_37 : tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked> loc(#loc35) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97) + %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110) + %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111) + %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100) + %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101) + %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102) + %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %5, %6 : f32, i32 loc(#loc78) + }) : (tensor<64x4xf32, #blocked>, tensor<64x4xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc80) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc39) + %2 = tt.addptr %1, %xindex_14 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc39) + %3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc40) + %4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc40) + tt.store %2, %4 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("r0_base"(#loc6)) +#loc51 = loc("x0"(#loc7)) +#loc52 = loc("x1"(#loc8)) +#loc53 = loc("tmp0"(#loc9)) +#loc54 = loc("tmp0"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("_tmp2"(#loc14)) +#loc59 = loc("r0_index"(#loc15)) +#loc60 = loc("r0_mask"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("mask"(#loc18)) +#loc63 = loc("equal"(#loc20)) +#loc64 = loc("a_isnan"(#loc21)) +#loc65 = loc("b_isnan"(#loc22)) +#loc66 = loc("mask"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("equal"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("mask"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc(callsite(#loc31 at #loc19)) +#loc75 = loc(callsite(#loc32 at #loc19)) +#loc76 = loc("_tmp2"(#loc33)) +#loc77 = loc("_tmp2_index"(#loc34)) +#loc78 = loc(callsite(#loc36 at #loc37)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc58)) +#loc82 = loc("mask"(#loc62)) +#loc83 = loc("equal"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc19)) +#loc85 = loc(callsite(#loc65 at #loc19)) +#loc86 = loc(callsite(#loc66 at #loc19)) +#loc87 = loc(callsite(#loc67 at #loc19)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc(callsite(#loc69 at #loc19)) +#loc90 = loc("equal"(#loc70)) +#loc91 = loc(callsite(#loc71 at #loc19)) +#loc92 = loc(callsite(#loc72 at #loc19)) +#loc93 = loc(callsite(#loc73 at #loc19)) +#loc94 = loc(callsite(#loc64 at #loc78)) +#loc95 = loc(callsite(#loc65 at #loc78)) +#loc96 = loc(callsite(#loc66 at #loc78)) +#loc97 = loc(callsite(#loc67 at #loc78)) +#loc98 = loc(callsite(#loc69 at #loc78)) +#loc99 = loc(callsite(#loc71 at #loc78)) +#loc100 = loc(callsite(#loc72 at #loc78)) +#loc101 = loc(callsite(#loc73 at #loc78)) +#loc102 = loc(callsite(#loc31 at #loc78)) +#loc103 = loc(callsite(#loc32 at #loc78)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc78)) +#loc109 = loc(callsite(#loc83 at #loc78)) +#loc110 = loc(callsite(#loc88 at #loc78)) +#loc111 = loc(callsite(#loc90 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..07eb645e73fcd7ca56934e1bf156638e8027e416 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/IDYTZYYPFFR2JWGMN5L5PBFCSAKUSFETOCDNHVOUFN3Y666CWF6A/triton_red_fused_argmax_1.ttir @@ -0,0 +1,204 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc46 = loc("in_ptr0"(#loc)) +#loc47 = loc("out_ptr0"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<64x4xi1> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<32000> : tensor<1x4xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<64x4xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<64x4xf32> loc(#loc52) + %cst_4 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc54) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc55) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc56) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc57) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc57) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc58) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc59) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc60) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc61) + %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c4_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<64x4xf32>, tensor<64x4xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc63) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc63) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x4xi32> loc(#loc64) + %tmp0 = arith.muli %x0, %cst_2 : tensor<64x1xi32> loc(#loc65) + %tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc66) + %tmp0_15 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc66) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<64x4xi32> loc(#loc66) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<64x1xi32> loc(#loc67) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc68) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<64x4xi32> loc(#loc68) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc69) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc69) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc70) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc70) + %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<64x4xf32> loc(#loc112) + %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<64x4xf32> loc(#loc113) + %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<64x4xf32> loc(#loc92) + %b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<64x4xf32> loc(#loc93) + %mask_24 = arith.xori %b_isnan, %cst : tensor<64x4xi1> loc(#loc94) + %mask_25 = arith.andi %a_isnan, %mask_24 : tensor<64x4xi1> loc(#loc95) + %mask_26 = arith.ori %mask, %mask_25 : tensor<64x4xi1> loc(#loc114) + %equal_27 = arith.andi %a_isnan, %b_isnan : tensor<64x4xi1> loc(#loc97) + %equal_28 = arith.ori %equal, %equal_27 : tensor<64x4xi1> loc(#loc115) + %mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<64x4xi32> loc(#loc99) + %mask_30 = arith.andi %equal_28, %mask_29 : tensor<64x4xi1> loc(#loc100) + %mask_31 = arith.ori %mask_26, %mask_30 : tensor<64x4xi1> loc(#loc101) + %4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc83) + %5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc84) + %_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc85) + %_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<64x4xi1>, tensor<64x4xi32> loc(#loc86) + scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<64x4xf32>, tensor<64x4xi32> loc(#loc40) + } loc(#loc89) + %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119) + %mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109) + %4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110) + %5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111) + tt.reduce.return %4, %5 : f32, i32 loc(#loc87) + }) : (tensor<64x4xf32>, tensor<64x4xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc87) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc88) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc43) + %2 = tt.addptr %1, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc43) + %3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc44) + tt.store %2, %3 : tensor<64x1x!tt.ptr> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xoffset"(#loc7)) +#loc55 = loc("xindex"(#loc8)) +#loc56 = loc("xindex"(#loc9)) +#loc57 = loc("xindex"(#loc10)) +#loc58 = loc("r0_base"(#loc11)) +#loc59 = loc("r0_base"(#loc12)) +#loc60 = loc("x0"(#loc13)) +#loc61 = loc("x1"(#loc14)) +#loc62 = loc("_tmp2"(#loc3)) +#loc63 = loc("r0_index"(#loc15)) +#loc64 = loc("r0_mask"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("tmp0"(#loc21)) +#loc70 = loc("tmp0"(#loc22)) +#loc71 = loc("mask"(#loc23)) +#loc72 = loc("equal"(#loc25)) +#loc73 = loc("a_isnan"(#loc26)) +#loc74 = loc("b_isnan"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("mask"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("equal"(#loc31)) +#loc79 = loc("equal"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc("mask"(#loc34)) +#loc82 = loc("mask"(#loc35)) +#loc83 = loc(callsite(#loc36 at #loc24)) +#loc84 = loc(callsite(#loc37 at #loc24)) +#loc85 = loc("_tmp2"(#loc38)) +#loc86 = loc("_tmp2_index"(#loc39)) +#loc87 = loc(callsite(#loc41 at #loc2)) +#loc88 = loc("tmp2"(#loc42)) +#loc89 = loc("_tmp2_index"(#loc62)) +#loc90 = loc("mask"(#loc71)) +#loc91 = loc("equal"(#loc72)) +#loc92 = loc(callsite(#loc73 at #loc24)) +#loc93 = loc(callsite(#loc74 at #loc24)) +#loc94 = loc(callsite(#loc75 at #loc24)) +#loc95 = loc(callsite(#loc76 at #loc24)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc(callsite(#loc78 at #loc24)) +#loc98 = loc("equal"(#loc79)) +#loc99 = loc(callsite(#loc80 at #loc24)) +#loc100 = loc(callsite(#loc81 at #loc24)) +#loc101 = loc(callsite(#loc82 at #loc24)) +#loc102 = loc(callsite(#loc73 at #loc87)) +#loc103 = loc(callsite(#loc74 at #loc87)) +#loc104 = loc(callsite(#loc75 at #loc87)) +#loc105 = loc(callsite(#loc76 at #loc87)) +#loc106 = loc(callsite(#loc78 at #loc87)) +#loc107 = loc(callsite(#loc80 at #loc87)) +#loc108 = loc(callsite(#loc81 at #loc87)) +#loc109 = loc(callsite(#loc82 at #loc87)) +#loc110 = loc(callsite(#loc36 at #loc87)) +#loc111 = loc(callsite(#loc37 at #loc87)) +#loc112 = loc(callsite(#loc90 at #loc24)) +#loc113 = loc(callsite(#loc91 at #loc24)) +#loc114 = loc(callsite(#loc96 at #loc24)) +#loc115 = loc(callsite(#loc98 at #loc24)) +#loc116 = loc(callsite(#loc90 at #loc87)) +#loc117 = loc(callsite(#loc91 at #loc87)) +#loc118 = loc(callsite(#loc96 at #loc87)) +#loc119 = loc(callsite(#loc98 at #loc87)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/__grp__triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/__grp__triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..697d4a4c2d5fa2cac393343ae6c1b0ca2dd8ddd4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/__grp__triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.source", "triton_poi_fused_new_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttir", "triton_poi_fused_new_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttgir", "triton_poi_fused_new_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.llir", "triton_poi_fused_new_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ptx", "triton_poi_fused_new_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.cubin", "triton_poi_fused_new_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..eaf604f03a3c0bc1b16fe5f6c214289e9d7f3b87 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..dfda29abf9cfcb47086983a448bd5be2c17561c6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"hash": "435c513025e01c69d24d8bc2bafc9bf938fe6c063737adc957a016e6b8967268", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..c96d1e6258cef248c9dd0a2b831e592399dfc647 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.llir @@ -0,0 +1,46 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_1(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 7, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = and i32 %7, 127, !dbg !9 + %9 = or disjoint i32 %6, %8, !dbg !10 + %10 = icmp slt i32 %9, 2176, !dbg !11 + %11 = sext i32 %9 to i64, !dbg !12 + %12 = getelementptr i32, ptr addrspace(1) %0, i64 %11, !dbg !12 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %12, i1 %10) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_1", linkageName: "triton_poi_fused_new_zeros_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 22, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 25, scope: !4) +!13 = !DILocation(line: 25, column: 36, scope: !4) +!14 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0b65231c9ba5c8332658070810691a703252f4b9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ptx @@ -0,0 +1,206 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_1 // -- Begin function triton_poi_fused_new_zeros_1 + // @triton_poi_fused_new_zeros_1 +.visible .entry triton_poi_fused_new_zeros_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_0, + .param .u32 triton_poi_fused_new_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<7>; + .reg .b64 %rd<3>; + .loc 1 18 0 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_1_param_0]; +$L__tmp0: + .loc 1 20 28 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:20:33 + shl.b32 %r3, %r2, 7; + .loc 1 21 36 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:21:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 21 23 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:21:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 21 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:22:21 + setp.lt.s32 %p1, %r6, 2176; + .loc 1 25 25 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:25 + mad.wide.s32 %rd1, %r6, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 25 36 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:36 + // begin inline asm + @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 }; + // end inline asm + .loc 1 25 4 // chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 119 +.b8 109 +.b8 52 +.b8 52 +.b8 106 +.b8 100 +.b8 113 +.b8 116 +.b8 111 +.b8 118 +.b8 121 +.b8 112 +.b8 119 +.b8 113 +.b8 107 +.b8 110 +.b8 101 +.b8 118 +.b8 113 +.b8 118 +.b8 122 +.b8 50 +.b8 100 +.b8 50 +.b8 120 +.b8 114 +.b8 97 +.b8 122 +.b8 99 +.b8 101 +.b8 98 +.b8 52 +.b8 99 +.b8 105 +.b8 50 +.b8 101 +.b8 114 +.b8 111 +.b8 111 +.b8 122 +.b8 52 +.b8 116 +.b8 97 +.b8 104 +.b8 108 +.b8 111 +.b8 99 +.b8 118 +.b8 122 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 119 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..f56cc22cde323f07a1319106e98aee16e3c8efb7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.source @@ -0,0 +1,41 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc11 = loc("out_ptr0"(#loc)) +#loc12 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2176 : i32 loc(#loc13) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_1 = arith.constant 128 : i32 loc(#loc15) + %xoffset_2 = arith.constant 128 : i32 loc(#loc15) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc15) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc16) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32> loc(#loc17) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32> loc(#loc17) + %xmask = arith.constant dense<2176> : tensor<128xi32> loc(#loc18) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32> loc(#loc18) + %tmp0 = arith.constant 0 : i32 loc(#loc19) + %tmp0_7 = arith.constant dense<0> : tensor<1xi32> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_5 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc9) + tt.store %1, %cst, %xmask_6 : tensor<128x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc13 = loc("xnumel"(#loc1)) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("xmask"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..065e7e38567254281296be0dcd5efb4b81d8931b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst = arith.constant dense<2176> : tensor<128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc14) + %xindex_2 = tt.splat %xoffset_1 : i32 -> tensor<128xi32, #blocked> loc(#loc15) + %xindex_3 = arith.addi %xindex_2, %xindex : tensor<128xi32, #blocked> loc(#loc15) + %xmask = arith.cmpi slt, %xindex_3, %cst : tensor<128xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_3 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> loc(#loc7) + tt.store %1, %cst_0, %xmask : tensor<128x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f8faff1b16557668507902ed03cdab782bcd168c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/INOFCMBF4AOGTUSNRPBLV7E37E4P43AGG4323SKXUALONOEWOJUA/triton_poi_fused_new_zeros_1.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc1) + %xmask = arith.constant dense<2176> : tensor<128xi32> loc(#loc12) + %c128_i32 = arith.constant 128 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc13) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc14) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc15) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc16) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc12) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:36) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":22:21) +#loc3 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":20:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":21:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hw/chwm44jdqtovypwqknevqvz2d2xrazceb4ci2erooz4tahlocvzv.py":25:4) +#loc12 = loc("xmask"(#loc2)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xoffset"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xindex"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6a6f2a304c586ed37089f978c100d84c08a8c8e5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/__grp__triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_clone_slice_sum_transpose_5.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin", "triton_red_fused__to_copy_clone_slice_sum_transpose_5.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..579ff0e33013a1786258155ddf4091d203b8cf3a Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1015e7a98d0783fffaced7eb1f8bbfeb2d07ee63 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.json @@ -0,0 +1 @@ +{"hash": "4baac217b40eeef03b4662a4341ac284b2fd62d1dd17cf76e5585c3c1c4b0d3a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_clone_slice_sum_transpose_5"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..d4bbd7ad03e98a969a23dc03661f93aa5e45b5bf --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 3, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 7, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = icmp slt i32 %13, %4, !dbg !11 + %.fr = freeze i1 %14 + %15 = lshr i32 %11, 3, !dbg !12 + %16 = and i32 %15, 7, !dbg !12 + %17 = or disjoint i32 %16, 8, !dbg !12 + %18 = sext i32 %13 to i64, !dbg !13 + %.frozen = freeze i64 %2, !dbg !14 + %19 = sdiv i64 %18, %.frozen, !dbg !14 + %20 = mul i64 %19, %.frozen, !dbg !13 + %.decomposed = sub i64 %18, %20, !dbg !13 + %21 = icmp sgt i32 %5, 0, !dbg !15 + br i1 %21, label %.lr.ph, label %._crit_edge, !dbg !15 + +.lr.ph: ; preds = %8 + %22 = mul i64 %3, %2, !dbg !16 + %23 = mul i64 %22, %19, !dbg !17 + %24 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %24, i64 %23, !dbg !15 + br i1 %.fr, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %25 = phi i32 [ %36, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %26 = or disjoint i32 %25, %16, !dbg !18 + %27 = or disjoint i32 %17, %25, !dbg !18 + %28 = sext i32 %26 to i64, !dbg !19 + %29 = sext i32 %27 to i64, !dbg !19 + %30 = mul i64 %2, %28, !dbg !19 + %31 = mul i64 %2, %29, !dbg !19 + %gep.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !20 + %gep5.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !20 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep.us, i64 %32, i1 false) #4, !dbg !21 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep5.us, i64 %34, i1 false) #4, !dbg !21 + %36 = add i32 %25, 16, !dbg !15 + %37 = icmp slt i32 %36, %5, !dbg !15 + br i1 %37, label %.lr.ph.split.us, label %._crit_edge, !dbg !15 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %38 = phi i64 [ %53, %.lr.ph.split ], [ 0, %.lr.ph ] + %39 = phi i64 [ %55, %.lr.ph.split ], [ 0, %.lr.ph ] + %40 = phi i32 [ %56, %.lr.ph.split ], [ 0, %.lr.ph ] + %41 = or disjoint i32 %40, %16, !dbg !18 + %42 = or disjoint i32 %17, %40, !dbg !18 + %43 = icmp slt i32 %41, %5, !dbg !22 + %44 = icmp slt i32 %42, %5, !dbg !22 + %45 = sext i32 %41 to i64, !dbg !19 + %46 = sext i32 %42 to i64, !dbg !19 + %47 = mul i64 %2, %45, !dbg !19 + %48 = mul i64 %2, %46, !dbg !19 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %47, !dbg !20 + %gep5 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %48, !dbg !20 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %49, i1 %43) #4, !dbg !21 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %52 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep5, i64 %51, i1 %44) #4, !dbg !21 + %narrow = select i1 %43, i32 %50, i32 0, !dbg !23 + %spec.select = sext i32 %narrow to i64, !dbg !23 + %53 = add i64 %38, %spec.select, !dbg !23 + %narrow6 = select i1 %44, i32 %52, i32 0, !dbg !23 + %54 = sext i32 %narrow6 to i64, !dbg !23 + %55 = add i64 %39, %54, !dbg !23 + %56 = add i32 %40, 16, !dbg !15 + %57 = icmp slt i32 %56, %5, !dbg !15 + br i1 %57, label %.lr.ph.split, label %._crit_edge, !dbg !15 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %58 = phi i64 [ 0, %8 ], [ %53, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %59 = phi i64 [ 0, %8 ], [ %55, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %60 = and i32 %11, 24, !dbg !9 + %61 = lshr i32 %11, 5, !dbg !9 + %62 = add i64 %58, %59, !dbg !25 + %extelt.offset = lshr i64 %62, 32, !dbg !29 + %63 = trunc nuw i64 %extelt.offset to i32, !dbg !29 + %64 = trunc i64 %62 to i32, !dbg !29 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 16, i32 31), !dbg !29 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 16, i32 31), !dbg !29 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !29 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !29 + %69 = bitcast <2 x i32> %68 to i64, !dbg !29 + %70 = add i64 %62, %69, !dbg !25 + %extelt.offset2 = lshr i64 %70, 32, !dbg !29 + %71 = trunc nuw i64 %extelt.offset2 to i32, !dbg !29 + %72 = trunc i64 %70 to i32, !dbg !29 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 8, i32 31), !dbg !29 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !29 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !29 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !29 + %77 = bitcast <2 x i32> %76 to i64, !dbg !29 + %78 = add i64 %70, %77, !dbg !25 + %79 = and i32 %61, 1, !dbg !29 + %80 = icmp eq i32 %60, 0, !dbg !29 + %.idx = shl nuw nsw i32 %12, 4, !dbg !29 + %81 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !29 + %82 = getelementptr i64, ptr addrspace(3) %81, i32 %79, !dbg !29 + %83 = insertelement <1 x i64> poison, i64 %78, i64 0, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %82, <1 x i64> %83, i1 %80) #4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %84 = icmp samesign ult i32 %11, 16, !dbg !29 + %85 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !29 + %86 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %85, i1 %84) #4, !dbg !29 + %extelt.offset3 = lshr i64 %86, 32, !dbg !29 + %87 = trunc nuw i64 %extelt.offset3 to i32, !dbg !29 + %88 = trunc i64 %86 to i32, !dbg !29 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !29 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !29 + %91 = insertelement <2 x i32> poison, i32 %89, i64 0, !dbg !29 + %92 = insertelement <2 x i32> %91, i32 %90, i64 1, !dbg !29 + %93 = bitcast <2 x i32> %92 to i64, !dbg !29 + %94 = add i64 %86, %93, !dbg !25 + %95 = and i32 %11, 1009, !dbg !29 + %96 = icmp eq i32 %95, 0, !dbg !29 + %97 = insertelement <1 x i64> poison, i64 %94, i64 0, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %85, <1 x i64> %97, i1 %96) #4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %98 = load i64, ptr addrspace(3) %81, align 16, !dbg !29 + %99 = trunc i64 %98 to i32, !dbg !30 + %100 = icmp slt i64 %2, 2, !dbg !31 + %101 = icmp sgt i64 %2, 1, !dbg !32 + %102 = select i1 %101, i64 %2, i64 0, !dbg !33 + %103 = zext i1 %100 to i64, !dbg !34 + %104 = add i64 %102, %103, !dbg !35 + %105 = mul i64 %19, %104, !dbg !36 + %106 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !37 + %107 = getelementptr i32, ptr addrspace(1) %106, i64 %105, !dbg !37 + %108 = and i32 %11, 56, !dbg !38 + %109 = icmp eq i32 %108, 0, !dbg !38 + %110 = and i1 %109, %.fr, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %99, ptr addrspace(1) %107, i1 %110) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 22, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 37, scope: !4) +!13 = !DILocation(line: 26, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 19, scope: !4) +!15 = !DILocation(line: 30, column: 40, scope: !4) +!16 = !DILocation(line: 36, column: 54, scope: !4) +!17 = !DILocation(line: 36, column: 58, scope: !4) +!18 = !DILocation(line: 31, column: 31, scope: !4) +!19 = !DILocation(line: 36, column: 43, scope: !4) +!20 = !DILocation(line: 36, column: 34, scope: !4) +!21 = !DILocation(line: 36, column: 63, scope: !4) +!22 = !DILocation(line: 32, column: 29, scope: !4) +!23 = !DILocation(line: 40, column: 48, scope: !4) +!24 = !DILocation(line: 28, column: 43, scope: !4) +!25 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 41, column: 25, scope: !4) +!29 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !28) +!30 = !DILocation(line: 42, column: 19, scope: !4) +!31 = !DILocation(line: 43, column: 49, scope: !4) +!32 = !DILocation(line: 43, column: 75, scope: !4) +!33 = !DILocation(line: 43, column: 66, scope: !4) +!34 = !DILocation(line: 43, scope: !4) +!35 = !DILocation(line: 43, column: 57, scope: !4) +!36 = !DILocation(line: 43, column: 34, scope: !4) +!37 = !DILocation(line: 43, column: 25, scope: !4) +!38 = !DILocation(line: 43, column: 88, scope: !4) +!39 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx new file mode 100644 index 0000000000000000000000000000000000000000..09ce920defad6f50739aff175181de5d80ad7592 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx @@ -0,0 +1,536 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7 +) +.reqntid 64 +{ + .reg .pred %p<17>; + .reg .b32 %r<57>; + .reg .b64 %rd<87>; + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2]; +$L__tmp0: + .loc 1 21 28 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:28 + mov.u32 %r9, %ctaid.x; + .loc 1 21 33 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:21:33 + shl.b32 %r10, %r9, 3; + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + mov.u32 %r1, %tid.x; + ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4]; + and.b32 %r2, %r1, 7; + .loc 1 22 23 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:23 + or.b32 %r13, %r10, %r2; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + cvt.s64.s32 %rd1, %r13; + .loc 1 27 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:27:19 + or.b64 %rd19, %rd1, %rd16; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p2, %rd20, 0; + @%p2 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd82, %rd1, %rd16; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r15, %rd16; + cvt.u32.u64 %r16, %rd1; + div.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd82, %r17; +$L__BB0_3: + .loc 1 0 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:19 + ld.param.b64 %rd15, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1]; + setp.lt.s32 %p1, %r13, %r11; + .loc 1 26 19 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:26:19 + mul.lo.s64 %rd22, %rd82, %rd16; + sub.s64 %rd6, %rd1, %rd22; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + setp.lt.s32 %p3, %r8, 1; + mov.b64 %rd85, 0; + shl.b64 %rd81, %rd6, 2; + mov.b64 %rd86, %rd85; + @%p3 bra $L__BB0_9; +// %bb.4: // %.lr.ph + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + ld.param.b64 %rd17, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0]; + bfe.u32 %r3, %r1, 3, 3; + .loc 1 36 54 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:54 + mul.lo.s64 %rd23, %rd17, %rd16; + .loc 1 36 58 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:58 + mul.lo.s64 %rd24, %rd23, %rd82; + add.s64 %rd26, %rd14, %rd81; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + shl.b64 %rd27, %rd24, 2; + add.s64 %rd7, %rd26, %rd27; + @%p1 bra $L__BB0_7; + bra.uni $L__BB0_5; +$L__BB0_7: // %.lr.ph.split.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r56, 0; + mov.b64 %rd85, 0; + mov.b64 %rd86, %rd85; +$L__BB0_8: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:31:31 + add.s32 %r26, %r3, %r56; + .loc 1 32 29 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:32:29 + add.s32 %r27, %r26, 8; + setp.lt.s32 %p7, %r26, %r8; + setp.lt.s32 %p8, %r27, %r8; + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + cvt.s64.s32 %rd48, %r26; + cvt.s64.s32 %rd49, %r27; + mul.lo.s64 %rd50, %rd16, %rd48; + mul.lo.s64 %rd51, %rd16, %rd49; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd52, %rd50, 2; + add.s64 %rd43, %rd7, %rd52; + shl.b64 %rd53, %rd51, 2; + add.s64 %rd46, %rd7, %rd53; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r24, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b32 { %r24 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd46 + 0 ], %rd45; + // end inline asm + .loc 1 40 48 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:40:48 + selp.b32 %r28, %r24, 0, %p7; + cvt.s64.s32 %rd54, %r28; + add.s64 %rd85, %rd85, %rd54; + selp.b32 %r29, %r25, 0, %p8; + cvt.s64.s32 %rd55, %r29; + add.s64 %rd86, %rd86, %rd55; + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r56, %r56, 16; + setp.lt.s32 %p9, %r56, %r8; + @%p9 bra $L__BB0_8; + bra.uni $L__BB0_9; +$L__BB0_5: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:0:40 + mov.b32 %r55, 0; +$L__BB0_6: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:31:31 + add.s32 %r21, %r3, %r55; + .loc 1 36 43 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:43 + add.s32 %r22, %r21, 8; + cvt.s64.s32 %rd35, %r21; + cvt.s64.s32 %rd36, %r22; + mul.lo.s64 %rd37, %rd16, %rd35; + mul.lo.s64 %rd38, %rd16, %rd36; + .loc 1 36 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:34 + shl.b64 %rd39, %rd37, 2; + add.s64 %rd29, %rd7, %rd39; + shl.b64 %rd40, %rd38, 2; + add.s64 %rd32, %rd7, %rd40; + .loc 1 36 63 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:36:63 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r19, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r19 }, [ %rd29 + 0 ], %rd28; + // end inline asm + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r20, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd32 + 0 ], %rd31; + // end inline asm + .loc 1 30 40 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:30:40 + add.s32 %r55, %r55, 16; + setp.lt.s32 %p6, %r55, %r8; + mov.b64 %rd86, %rd85; + @%p6 bra $L__BB0_6; +$L__BB0_9: // %._crit_edge + .loc 1 22 44 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:22:44 + and.b32 %r34, %r1, 24; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd60, %rd85, %rd86; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r35}, %rd60; + cvt.u32.u64 %r36, %rd60; + shfl.sync.bfly.b32 %r37, %r36, 16, 31, -1; + shfl.sync.bfly.b32 %r38, %r35, 16, 31, -1; + cvt.u64.u32 %rd61, %r37; + cvt.u64.u32 %rd62, %r38; + shl.b64 %rd63, %rd62, 32; + or.b64 %rd64, %rd61, %rd63; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd65, %rd60, %rd64; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + mov.b64 {_, %r39}, %rd65; + cvt.u32.u64 %r40, %rd65; + shfl.sync.bfly.b32 %r41, %r40, 8, 31, -1; + shfl.sync.bfly.b32 %r42, %r39, 8, 31, -1; + cvt.u64.u32 %rd66, %r41; + cvt.u64.u32 %rd67, %r42; + shl.b64 %rd68, %rd67, 32; + or.b64 %rd69, %rd66, %rd68; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd56, %rd65, %rd69; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + setp.eq.b32 %p10, %r34, 0; + shl.b32 %r43, %r2, 4; + mov.b32 %r44, global_smem; + add.s32 %r45, %r44, %r43; + shr.u32 %r46, %r1, 2; + and.b32 %r47, %r46, 8; + add.s32 %r30, %r45, %r47; + // begin inline asm + @%p10 st.shared.b64 [ %r30 + 0 ], %rd56; + // end inline asm + bar.sync 0; + setp.lt.u32 %p11, %r1, 16; + shl.b32 %r48, %r1, 3; + add.s32 %r31, %r44, %r48; + // begin inline asm + @%p11 ld.shared.b64 %rd57, [ %r31 + 0 ]; + // end inline asm + mov.b64 {_, %r49}, %rd57; + cvt.u32.u64 %r50, %rd57; + shfl.sync.bfly.b32 %r51, %r50, 1, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 1, 31, -1; + cvt.u64.u32 %rd70, %r51; + cvt.u64.u32 %rd71, %r52; + shl.b64 %rd72, %rd71, 32; + or.b64 %rd73, %rd70, %rd72; + .loc 2 261 15 // standard.py:261:15 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + add.s64 %rd58, %rd57, %rd73; + .loc 2 291 36 // standard.py:291:36 @[ c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:41:25 ] + and.b32 %r53, %r1, 1009; + setp.eq.b32 %p12, %r53, 0; + // begin inline asm + @%p12 st.shared.b64 [ %r31 + 0 ], %rd58; + // end inline asm + bar.sync 0; + ld.shared.b32 %r33, [%r45]; +$L__tmp2: + .loc 1 43 49 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:49 + setp.lt.s64 %p14, %rd16, 2; + .loc 1 43 75 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:75 + setp.gt.s64 %p15, %rd16, 1; + .loc 1 43 66 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:66 + selp.b64 %rd74, %rd16, 0, %p15; + .loc 1 43 0 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43 + selp.b64 %rd75, 1, 0, %p14; + .loc 1 43 57 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:57 + add.s64 %rd76, %rd74, %rd75; + .loc 1 43 34 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:34 + mul.lo.s64 %rd77, %rd82, %rd76; + .loc 1 43 25 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:25 + add.s64 %rd79, %rd15, %rd81; + shl.b64 %rd80, %rd77, 2; + add.s64 %rd59, %rd79, %rd80; + .loc 1 43 88 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:88 + and.b32 %r54, %r1, 56; + setp.eq.b32 %p16, %r54, 0; + and.pred %p13, %p16, %p1; + // begin inline asm + @%p13 st.global.b32 [ %rd59 + 0 ], { %r33 }; + // end inline asm + .loc 1 43 4 // c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 238 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 117 +.b8 99 +.b8 102 +.b8 50 +.b8 105 +.b8 116 +.b8 54 +.b8 102 +.b8 119 +.b8 104 +.b8 109 +.b8 114 +.b8 117 +.b8 51 +.b8 50 +.b8 118 +.b8 112 +.b8 121 +.b8 50 +.b8 120 +.b8 109 +.b8 119 +.b8 107 +.b8 107 +.b8 52 +.b8 118 +.b8 103 +.b8 55 +.b8 112 +.b8 112 +.b8 110 +.b8 111 +.b8 120 +.b8 109 +.b8 104 +.b8 110 +.b8 107 +.b8 102 +.b8 54 +.b8 105 +.b8 102 +.b8 112 +.b8 98 +.b8 50 +.b8 104 +.b8 50 +.b8 116 +.b8 106 +.b8 115 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 117 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 53 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source new file mode 100644 index 0000000000000000000000000000000000000000..095c42d6ec4ed273b0914c61f8f3a6e8ee7bcacc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.source @@ -0,0 +1,193 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc43 = loc(unknown) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr1"(#loc)) +#loc52 = loc("ks0"(#loc)) +#loc53 = loc("ks1"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc85 = loc("input"(#loc41)) +#loc86 = loc("a"(#loc46)) +#loc87 = loc("b"(#loc46)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xoffset_0 = arith.constant 8 : i32 loc(#loc57) + %xoffset_1 = arith.constant 8 : i32 loc(#loc57) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc57) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc58) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc59) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<8x1xi32> loc(#loc60) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<8x1xi32> loc(#loc60) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc61) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<8x1xi32> loc(#loc61) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc62) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc63) + %x0 = arith.extsi %xindex_5 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc64) + %x0_8 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc64) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<8x1xi64> loc(#loc64) + %x1 = arith.extsi %xindex_5 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc65) + %x1_10 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc65) + %x1_11 = arith.divsi %x1, %x1_10 : tensor<8x1xi64> loc(#loc65) + %_tmp3 = arith.constant 0 : i64 loc(#loc66) + %_tmp3_12 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc66) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c16_i32 = arith.constant 16 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c16_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_13 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_18 = %_tmp3_12) -> (tensor<8x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc68) + %r0_index_19 = arith.addi %r0_index, %r0_base_7 : tensor<1x16xi32> loc(#loc68) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc69) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32> loc(#loc69) + %tmp0 = arith.extsi %r0_index_19 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc70) + %tmp0_21 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc70) + %tmp0_22 = arith.muli %tmp0_21, %tmp0 : tensor<1x16xi64> loc(#loc70) + %tmp0_23 = tt.broadcast %x0_9 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc71) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc71) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<8x16xi64> loc(#loc71) + %tmp0_26 = arith.muli %ks0, %ks1 : i64 loc(#loc72) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<8x1xi64> loc(#loc73) + %tmp0_28 = arith.muli %tmp0_27, %x1_11 : tensor<8x1xi64> loc(#loc73) + %tmp0_29 = tt.broadcast %tmp0_28 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc74) + %tmp0_30 = arith.addi %tmp0_25, %tmp0_29 : tensor<8x16xi64> loc(#loc74) + %tmp0_31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc75) + %tmp0_32 = tt.addptr %tmp0_31, %tmp0_30 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc75) + %tmp0_33 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<8x16xi1> loc(#loc76) + %tmp0_34 = tt.broadcast %xmask_6 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc76) + %tmp0_35 = arith.andi %tmp0_33, %tmp0_34 : tensor<8x16xi1> loc(#loc76) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc77) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc77) + %tmp0_38 = arith.fptosi %tmp0_37 : tensor<8x16xf32> to tensor<8x16xi32> loc(#loc77) + %tmp0_39 = tt.load %tmp0_32, %tmp0_35, %tmp0_38 evictionPolicy = evict_last : tensor<8x16x!tt.ptr> loc(#loc77) + %tmp1 = arith.extsi %tmp0_39 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc78) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<8x16xi64> loc(#loc79) + %_tmp3_40 = tt.broadcast %r0_mask_20 : tensor<1x16xi1> -> tensor<8x16xi1> loc(#loc80) + %_tmp3_41 = tt.broadcast %xmask_6 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc80) + %_tmp3_42 = arith.andi %_tmp3_40, %_tmp3_41 : tensor<8x16xi1> loc(#loc80) + %_tmp3_43 = arith.select %_tmp3_42, %tmp4, %_tmp3_18 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc81) + scf.yield %_tmp3_43 : tensor<8x16xi64> loc(#loc27) + } loc(#loc67) + %tmp3 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_13) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc82) + %tmp3_14 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc83) + %tmp5 = arith.trunci %tmp3_14 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc84) + %c1_i32 = arith.constant 1 : i32 loc(#loc31) + %4 = arith.extsi %c1_i32 : i32 to i64 loc(#loc31) + %5 = arith.cmpi sge, %4, %ks0 : i64 loc(#loc31) + %c1_i32_15 = arith.constant 1 : i32 loc(#loc32) + %c1_i32_16 = arith.constant 1 : i32 loc(#loc32) + %6 = arith.extui %5 : i1 to i32 loc(#loc32) + %7 = arith.muli %c1_i32_16, %6 : i32 loc(#loc32) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc33) + %8 = arith.extsi %c1_i32_17 : i32 to i64 loc(#loc33) + %9 = arith.cmpi sgt, %ks0, %8 : i64 loc(#loc33) + %10 = arith.extui %9 : i1 to i64 loc(#loc34) + %11 = arith.muli %ks0, %10 : i64 loc(#loc34) + %12 = arith.extsi %7 : i32 to i64 loc(#loc35) + %13 = arith.addi %12, %11 : i64 loc(#loc35) + %14 = tt.splat %13 : i64 -> tensor<8x1xi64> loc(#loc36) + %15 = arith.muli %x1_11, %14 : tensor<8x1xi64> loc(#loc36) + %16 = arith.addi %x0_9, %15 : tensor<8x1xi64> loc(#loc37) + %17 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc38) + %18 = tt.addptr %17, %16 : tensor<8x1x!tt.ptr>, tensor<8x1xi64> loc(#loc38) + tt.store %18, %tmp5, %xmask_6 : tensor<8x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc41))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc42) + tt.reduce.return %2 : i64 loc(#loc42) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc42) + tt.return %0 : tensor<8xi64> loc(#loc44) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc45) + tt.return %1 : tensor<8xi64> loc(#loc45) + } loc(#loc41) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc46)), %b: i64 loc("b"(#loc46))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc47) + tt.return %0 : i64 loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc49) + tt.return %1 : i64 loc(#loc49) + } loc(#loc46) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc56 = loc("xoffset"(#loc1)) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("xindex"(#loc3)) +#loc59 = loc("xindex"(#loc4)) +#loc60 = loc("xindex"(#loc5)) +#loc61 = loc("xmask"(#loc6)) +#loc62 = loc("r0_base"(#loc7)) +#loc63 = loc("r0_base"(#loc8)) +#loc64 = loc("x0"(#loc9)) +#loc65 = loc("x1"(#loc10)) +#loc66 = loc("_tmp3"(#loc11)) +#loc67 = loc("_tmp3"(#loc12)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("tmp0"(#loc15)) +#loc71 = loc("tmp0"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp0"(#loc19)) +#loc75 = loc("tmp0"(#loc20)) +#loc76 = loc("tmp0"(#loc21)) +#loc77 = loc("tmp0"(#loc22)) +#loc78 = loc("tmp1"(#loc23)) +#loc79 = loc("tmp4"(#loc24)) +#loc80 = loc("_tmp3"(#loc25)) +#loc81 = loc("_tmp3"(#loc26)) +#loc82 = loc("tmp3"(#loc28)) +#loc83 = loc("tmp3"(#loc29)) +#loc84 = loc("tmp5"(#loc30)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d665b693c374020b58e13a2547dc0259372f780b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir @@ -0,0 +1,147 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("ks0"(#loc)) +#loc43 = loc("ks1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp3"(#loc26)) +#loc73 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_1 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48) + %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<8x1xi32, #blocked> loc(#loc49) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked> loc(#loc50) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<8x1xi32, #blocked> loc(#loc50) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51) + %x0 = arith.extsi %xindex_4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc52) + %x0_7 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked> loc(#loc52) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<8x1xi64, #blocked> loc(#loc52) + %x1 = arith.divsi %x0, %x0_7 : tensor<8x1xi64, #blocked> loc(#loc53) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_9 = tt.broadcast %x0_8 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_10 = arith.muli %ks0, %ks1 : i64 loc(#loc57) + %tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<8x1xi64, #blocked> loc(#loc58) + %tmp0_12 = arith.muli %tmp0_11, %x1 : tensor<8x1xi64, #blocked> loc(#loc58) + %tmp0_13 = tt.broadcast %tmp0_12 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc59) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc60) + %tmp0_15 = tt.broadcast %xmask_5 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %_tmp3_17 = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_18 = %cst) -> (tensor<8x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_17 : i32 -> tensor<1x16xi32, #blocked> loc(#loc63) + %r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x16xi32, #blocked> loc(#loc63) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0_21 = arith.extsi %r0_index_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_22 = arith.muli %tmp0, %tmp0_21 : tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x16xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_24 = arith.addi %tmp0_9, %tmp0_23 : tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_13 : tensor<8x16xi64, #blocked> loc(#loc59) + %tmp0_26 = tt.addptr %tmp0_14, %tmp0_25 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi64, #blocked> loc(#loc60) + %tmp0_27 = tt.broadcast %r0_mask_20 : tensor<1x16xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc61) + %tmp0_28 = arith.andi %tmp0_27, %tmp0_15 : tensor<8x16xi1, #blocked> loc(#loc61) + %tmp0_29 = tt.load %tmp0_26, %tmp0_28, %cst_0 evictionPolicy = evict_last : tensor<8x16x!tt.ptr, #blocked> loc(#loc64) + %tmp1 = arith.extsi %tmp0_29 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc65) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<8x16xi64, #blocked> loc(#loc66) + %_tmp3_30 = arith.select %tmp0_28, %tmp4, %_tmp3_18 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc67) + scf.yield %_tmp3_30 : tensor<8x16xi64, #blocked> loc(#loc24) + } loc(#loc62) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_17: i64 loc(callsite(#loc1 at #loc68)), %tmp3_18: i64 loc(callsite(#loc1 at #loc68))): + %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : i64 loc(#loc74) + tt.reduce.return %tmp3_19 : i64 loc(#loc72) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72) + %tmp3_16 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc69) + %tmp5 = arith.trunci %tmp3_16 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc70) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31) + %2 = arith.extui %1 : i1 to i64 loc(#loc32) + %3 = arith.muli %ks0, %2 : i64 loc(#loc32) + %4 = arith.extui %0 : i1 to i64 loc(#loc71) + %5 = arith.addi %4, %3 : i64 loc(#loc33) + %6 = tt.splat %5 : i64 -> tensor<8x1xi64, #blocked> loc(#loc35) + %7 = arith.muli %x1, %6 : tensor<8x1xi64, #blocked> loc(#loc35) + %8 = arith.addi %x0_8, %7 : tensor<8x1xi64, #blocked> loc(#loc36) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc37) + %10 = tt.addptr %9, %8 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi64, #blocked> loc(#loc37) + tt.store %10, %tmp5, %xmask_5 : tensor<8x1x!tt.ptr, #blocked> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("xmask"(#loc6)) +#loc51 = loc("r0_base"(#loc7)) +#loc52 = loc("x0"(#loc8)) +#loc53 = loc("x1"(#loc9)) +#loc54 = loc("r0_mask"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("_tmp3"(#loc18)) +#loc63 = loc("r0_index"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp1"(#loc21)) +#loc66 = loc("tmp4"(#loc22)) +#loc67 = loc("_tmp3"(#loc23)) +#loc69 = loc("tmp3"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc(fused[#loc33, #loc34]) +#loc72 = loc(callsite(#loc25 at #loc68)) +#loc74 = loc(callsite(#loc27 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir new file mode 100644 index 0000000000000000000000000000000000000000..663a9ffba96d6ca6ef71f78d1b63f142e331b05d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttir @@ -0,0 +1,152 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":18:0) +#loc1 = loc(unknown) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:25) +#loc43 = loc("in_ptr0"(#loc)) +#loc44 = loc("out_ptr1"(#loc)) +#loc45 = loc("ks0"(#loc)) +#loc46 = loc("ks1"(#loc)) +#loc47 = loc("xnumel"(#loc)) +#loc48 = loc("r0_numel"(#loc)) +#loc74 = loc("tmp3"(#loc29)) +#loc79 = loc(callsite(#loc1 at #loc74)) +module { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %_tmp3 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc49) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc50) + %xoffset_0 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc51) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc52) + %xindex_1 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc53) + %xindex_2 = tt.splat %xoffset_0 : i32 -> tensor<8x1xi32> loc(#loc54) + %xindex_3 = arith.addi %xindex_2, %xindex_1 : tensor<8x1xi32> loc(#loc54) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc55) + %xmask_4 = arith.cmpi slt, %xindex_3, %xmask : tensor<8x1xi32> loc(#loc55) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc56) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc57) + %x0 = arith.extsi %xindex_3 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc58) + %x0_6 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc58) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<8x1xi64> loc(#loc58) + %x1 = arith.divsi %x0, %x0_6 : tensor<8x1xi64> loc(#loc59) + %_tmp3_8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_10 = %_tmp3) -> (tensor<8x16xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16xi32> loc(#loc61) + %r0_index_11 = arith.addi %r0_index, %r0_base_5 : tensor<1x16xi32> loc(#loc61) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32> loc(#loc62) + %r0_mask_12 = arith.cmpi slt, %r0_index_11, %r0_mask : tensor<1x16xi32> loc(#loc62) + %tmp0 = arith.extsi %r0_index_11 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc63) + %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc63) + %tmp0_14 = arith.muli %tmp0_13, %tmp0 : tensor<1x16xi64> loc(#loc63) + %tmp0_15 = tt.broadcast %x0_7 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc64) + %tmp0_16 = tt.broadcast %tmp0_14 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc64) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<8x16xi64> loc(#loc64) + %tmp0_18 = arith.muli %ks0, %ks1 : i64 loc(#loc65) + %tmp0_19 = tt.splat %tmp0_18 : i64 -> tensor<8x1xi64> loc(#loc66) + %tmp0_20 = arith.muli %tmp0_19, %x1 : tensor<8x1xi64> loc(#loc66) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc67) + %tmp0_22 = arith.addi %tmp0_17, %tmp0_21 : tensor<8x16xi64> loc(#loc67) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc68) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc68) + %tmp0_25 = tt.broadcast %r0_mask_12 : tensor<1x16xi1> -> tensor<8x16xi1> loc(#loc69) + %tmp0_26 = tt.broadcast %xmask_4 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc69) + %tmp0_27 = arith.andi %tmp0_25, %tmp0_26 : tensor<8x16xi1> loc(#loc69) + %tmp0_28 = tt.load %tmp0_24, %tmp0_27, %cst evictionPolicy = evict_last : tensor<8x16x!tt.ptr> loc(#loc70) + %tmp1 = arith.extsi %tmp0_28 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc71) + %tmp4 = arith.addi %_tmp3_10, %tmp1 : tensor<8x16xi64> loc(#loc72) + %_tmp3_29 = arith.select %tmp0_27, %tmp4, %_tmp3_10 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc73) + scf.yield %_tmp3_29 : tensor<8x16xi64> loc(#loc27) + } loc(#loc60) + %tmp3 = "tt.reduce"(%_tmp3_8) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_10: i64 loc(callsite(#loc1 at #loc74)), %tmp3_11: i64 loc(callsite(#loc1 at #loc74))): + %tmp3_12 = arith.addi %tmp3_10, %tmp3_11 : i64 loc(#loc80) + tt.reduce.return %tmp3_12 : i64 loc(#loc78) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc78) + %tmp3_9 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc75) + %tmp5 = arith.trunci %tmp3_9 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc76) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc33) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc34) + %2 = arith.extui %1 : i1 to i64 loc(#loc35) + %3 = arith.muli %ks0, %2 : i64 loc(#loc35) + %4 = arith.extui %0 : i1 to i64 loc(#loc77) + %5 = arith.addi %4, %3 : i64 loc(#loc36) + %6 = tt.splat %5 : i64 -> tensor<8x1xi64> loc(#loc38) + %7 = arith.muli %x1, %6 : tensor<8x1xi64> loc(#loc38) + %8 = arith.addi %x0_7, %7 : tensor<8x1xi64> loc(#loc39) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc40) + %10 = tt.addptr %9, %8 : tensor<8x1x!tt.ptr>, tensor<8x1xi64> loc(#loc40) + tt.store %10, %tmp5, %xmask_4 : tensor<8x1x!tt.ptr> loc(#loc41) + tt.return loc(#loc42) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":30:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":28:43) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":21:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":22:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":23:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":24:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":26:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":27:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":31:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":32:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:39) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:54) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:58) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:50) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":36:63) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":37:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":39:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:48) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":40:8) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":41:28) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":42:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:75) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:66) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:57) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:41) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:88) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5u/c5ucf2it6fwhmru32vpy2xmwkk4vg7ppnoxmhnkf6ifpb2h2tjsl.py":43:4) +#loc49 = loc("_tmp3"(#loc3)) +#loc50 = loc("xoffset"(#loc4)) +#loc51 = loc("xoffset"(#loc5)) +#loc52 = loc("xindex"(#loc6)) +#loc53 = loc("xindex"(#loc7)) +#loc54 = loc("xindex"(#loc8)) +#loc55 = loc("xmask"(#loc9)) +#loc56 = loc("r0_base"(#loc10)) +#loc57 = loc("r0_base"(#loc11)) +#loc58 = loc("x0"(#loc12)) +#loc59 = loc("x1"(#loc13)) +#loc60 = loc("_tmp3"(#loc2)) +#loc61 = loc("r0_index"(#loc14)) +#loc62 = loc("r0_mask"(#loc15)) +#loc63 = loc("tmp0"(#loc16)) +#loc64 = loc("tmp0"(#loc17)) +#loc65 = loc("tmp0"(#loc18)) +#loc66 = loc("tmp0"(#loc19)) +#loc67 = loc("tmp0"(#loc20)) +#loc68 = loc("tmp0"(#loc21)) +#loc69 = loc("tmp0"(#loc22)) +#loc70 = loc("tmp0"(#loc23)) +#loc71 = loc("tmp1"(#loc24)) +#loc72 = loc("tmp4"(#loc25)) +#loc73 = loc("_tmp3"(#loc26)) +#loc75 = loc("tmp3"(#loc31)) +#loc76 = loc("tmp5"(#loc32)) +#loc77 = loc(fused[#loc36, #loc37]) +#loc78 = loc(callsite(#loc28 at #loc74)) +#loc80 = loc(callsite(#loc30 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f4c043bbd48f6bf54304fc4df546f77afe65a924 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/__grp__triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin", "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a33b1ddc631240089d9918d75fe3542184ae888b Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9511149dc32a2d666aeaefa4ec5b43ed6533d3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "571e82f1c43639fab6318c86608c3ec5501b4c57931b7c7476d3dfbfbd6a706d", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..1f3996e733de245604c1e50119af1c95f3344afb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,266 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 128, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 31, !dbg !12 + %16 = zext nneg i32 %12 to i64, !dbg !13 + %17 = mul i64 %5, %16, !dbg !13 + %18 = icmp sgt i32 %8, 0, !dbg !14 + br i1 %18, label %.lr.ph, label %._crit_edge, !dbg !14 + +.lr.ph: ; preds = %11 + %19 = getelementptr i32, ptr addrspace(1) %0, i64 %17 + br i1 %13, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %20 = phi i32 [ %26, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %21 = or disjoint i32 %20, %15, !dbg !15 + %22 = sext i32 %21 to i64, !dbg !16 + %23 = getelementptr i32, ptr addrspace(1) %19, i64 %22, !dbg !17 + %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %25 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %23, i64 %24, i1 false) #5, !dbg !18 + %26 = add i32 %20, 32, !dbg !14 + %27 = icmp slt i32 %26, %8, !dbg !14 + br i1 %27, label %.lr.ph.split.us, label %._crit_edge, !dbg !14 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %28 = phi i64 [ %36, %.lr.ph.split ], [ 0, %.lr.ph ] + %29 = phi i32 [ %37, %.lr.ph.split ], [ 0, %.lr.ph ] + %30 = or disjoint i32 %29, %15, !dbg !15 + %31 = icmp slt i32 %30, %8, !dbg !19 + %32 = sext i32 %30 to i64, !dbg !16 + %33 = getelementptr i32, ptr addrspace(1) %19, i64 %32, !dbg !17 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %33, i64 %34, i1 %31) #5, !dbg !18 + %narrow16 = select i1 %31, i32 %35, i32 0, !dbg !20 + %spec.select = sext i32 %narrow16 to i64, !dbg !20 + %36 = add i64 %28, %spec.select, !dbg !20 + %37 = add i32 %29, 32, !dbg !14 + %38 = icmp slt i32 %37, %8, !dbg !14 + br i1 %38, label %.lr.ph.split, label %._crit_edge, !dbg !14 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %11 + %.lcssa = phi i64 [ 0, %11 ], [ %36, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !21 + %extelt.offset = lshr i64 %.lcssa, 32, !dbg !22 + %39 = trunc nuw i64 %extelt.offset to i32, !dbg !22 + %40 = trunc i64 %.lcssa to i32, !dbg !22 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 16, i32 31), !dbg !22 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !22 + %43 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !22 + %44 = insertelement <2 x i32> %43, i32 %42, i64 1, !dbg !22 + %45 = bitcast <2 x i32> %44 to i64, !dbg !22 + %46 = add i64 %.lcssa, %45, !dbg !26 + %extelt.offset3 = lshr i64 %46, 32, !dbg !22 + %47 = trunc nuw i64 %extelt.offset3 to i32, !dbg !22 + %48 = trunc i64 %46 to i32, !dbg !22 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 8, i32 31), !dbg !22 + %50 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 8, i32 31), !dbg !22 + %51 = insertelement <2 x i32> poison, i32 %49, i64 0, !dbg !22 + %52 = insertelement <2 x i32> %51, i32 %50, i64 1, !dbg !22 + %53 = bitcast <2 x i32> %52 to i64, !dbg !22 + %54 = add i64 %46, %53, !dbg !26 + %extelt.offset4 = lshr i64 %54, 32, !dbg !22 + %55 = trunc nuw i64 %extelt.offset4 to i32, !dbg !22 + %56 = trunc i64 %54 to i32, !dbg !22 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !22 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !22 + %59 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !22 + %60 = insertelement <2 x i32> %59, i32 %58, i64 1, !dbg !22 + %61 = bitcast <2 x i32> %60 to i64, !dbg !22 + %62 = add i64 %54, %61, !dbg !26 + %extelt.offset5 = lshr i64 %62, 32, !dbg !22 + %63 = trunc nuw i64 %extelt.offset5 to i32, !dbg !22 + %64 = trunc i64 %62 to i32, !dbg !22 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 2, i32 31), !dbg !22 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !22 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !22 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !22 + %69 = bitcast <2 x i32> %68 to i64, !dbg !22 + %70 = add i64 %62, %69, !dbg !26 + %extelt.offset6 = lshr i64 %70, 32, !dbg !22 + %71 = trunc nuw i64 %extelt.offset6 to i32, !dbg !22 + %72 = trunc i64 %70 to i32, !dbg !22 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !22 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !22 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !22 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !22 + %77 = bitcast <2 x i32> %76 to i64, !dbg !22 + %78 = add i64 %70, %77, !dbg !26 + %79 = trunc i64 %78 to i32, !dbg !27 + %80 = getelementptr i32, ptr addrspace(1) %2, i64 %16, !dbg !28 + %81 = and i32 %14, 32, !dbg !29 + %82 = icmp eq i32 %81, 0, !dbg !29 + %83 = and i32 %14, 63, !dbg !29 + %84 = icmp eq i32 %83, 0, !dbg !29 + %85 = and i1 %13, %84, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %80, i1 %85) #5, !dbg !29 + %86 = icmp slt i64 %5, 2, !dbg !30 + %87 = icmp sgt i64 %5, 1, !dbg !31 + %88 = select i1 %87, i64 %5, i64 0, !dbg !32 + %89 = zext i1 %86 to i64, !dbg !33 + %90 = add i64 %88, %89, !dbg !34 + %91 = mul i64 %90, %16, !dbg !35 + %92 = add i64 %5, 1, !dbg !36 + %93 = add i64 %6, 127, !dbg !37 + %94 = sdiv i64 %93, 128, !dbg !38 + %95 = and i64 %93, 127, !dbg !42 + %.not = icmp ne i64 %95, 0, !dbg !42 + %96 = icmp slt i64 %93, 0, !dbg !43 + %narrow = and i1 %96, %.not, !dbg !44 + %97 = sext i1 %narrow to i64, !dbg !44 + %98 = add nsw i64 %94, %97, !dbg !44 + br i1 %18, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +.lr.ph14: ; preds = %._crit_edge, %119 + %99 = phi i32 [ %131, %119 ], [ 0, %._crit_edge ] + %100 = or disjoint i32 %99, %15, !dbg !46 + %101 = icmp slt i32 %100, %8, !dbg !47 + %102 = sext i32 %100 to i64, !dbg !48 + %103 = add i64 %91, %102, !dbg !48 + %104 = getelementptr i64, ptr addrspace(1) %1, i64 %103, !dbg !49 + %105 = and i1 %13, %101, !dbg !50 + %106 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %107 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %106, i1 %105) #5, !dbg !51 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !51 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %105) #5, !dbg !51 + %110 = icmp slt i32 %100, %79, !dbg !52 + %sext7 = shl i64 %109, 32, !dbg !53 + %111 = ashr exact i64 %sext7, 32, !dbg !53 + %112 = select i1 %110, i64 %111, i64 %5, !dbg !53 + %113 = icmp slt i64 %112, 0, !dbg !54 + %114 = select i1 %113, i64 %92, i64 0, !dbg !55 + %115 = add i64 %114, %112, !dbg !55 + %116 = icmp slt i64 %115, 0, !dbg !56 + %117 = icmp sgt i64 %115, %98, !dbg !57 + %.not12 = or i1 %116, %117, !dbg !58 + %.not9 = and i1 %105, %.not12, !dbg !59 + br i1 %.not9, label %118, label %119, !dbg !59 + +118: ; preds = %.lr.ph14 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 59, ptr nonnull @assertFunc_0, i64 1), !dbg !59 + unreachable, !dbg !59 + +119: ; preds = %.lr.ph14 + %sext = shl i64 %107, 32, !dbg !53 + %120 = ashr exact i64 %sext, 32, !dbg !53 + %121 = select i1 %110, i64 %120, i64 %5, !dbg !53 + %122 = icmp slt i64 %121, 0, !dbg !54 + %123 = select i1 %122, i64 %92, i64 0, !dbg !55 + %124 = trunc i64 %109 to i32, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !59 + %125 = getelementptr i32, ptr addrspace(1) %3, i64 %103, !dbg !61 + %126 = and i1 %82, %105, !dbg !62 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %124, ptr addrspace(1) %125, i1 %126) #5, !dbg !62 + %127 = getelementptr i32, ptr addrspace(1) %4, i64 %121, !dbg !63 + %128 = getelementptr i32, ptr addrspace(1) %127, i64 %123, !dbg !63 + %129 = getelementptr i32, ptr addrspace(1) %128, i64 %16, !dbg !63 + %130 = getelementptr i32, ptr addrspace(1) %129, i64 %17, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %130, i1 %126) #5, !dbg !64 + %131 = add i32 %99, 32, !dbg !45 + %132 = icmp slt i32 %131, %8, !dbg !45 + br i1 %132, label %.lr.ph14, label %._crit_edge15, !dbg !45 + +._crit_edge15: ; preds = %119, %._crit_edge + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", linkageName: "triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 22, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 21, scope: !9) +!12 = !DILocation(line: 25, column: 37, scope: !9) +!13 = !DILocation(line: 35, column: 45, scope: !9) +!14 = !DILocation(line: 29, column: 40, scope: !9) +!15 = !DILocation(line: 30, column: 31, scope: !9) +!16 = !DILocation(line: 35, column: 41, scope: !9) +!17 = !DILocation(line: 35, column: 34, scope: !9) +!18 = !DILocation(line: 35, column: 50, scope: !9) +!19 = !DILocation(line: 31, column: 29, scope: !9) +!20 = !DILocation(line: 39, column: 48, scope: !9) +!21 = !DILocation(line: 28, column: 43, scope: !9) +!22 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 40, column: 25, scope: !9) +!26 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 41, column: 19, scope: !9) +!28 = !DILocation(line: 42, column: 25, scope: !9) +!29 = !DILocation(line: 42, column: 36, scope: !9) +!30 = !DILocation(line: 49, column: 60, scope: !9) +!31 = !DILocation(line: 49, column: 86, scope: !9) +!32 = !DILocation(line: 49, column: 77, scope: !9) +!33 = !DILocation(line: 49, scope: !9) +!34 = !DILocation(line: 49, column: 68, scope: !9) +!35 = !DILocation(line: 49, column: 45, scope: !9) +!36 = !DILocation(line: 55, column: 20, scope: !9) +!37 = !DILocation(line: 59, column: 94, scope: !9) +!38 = !DILocation(line: 72, column: 16, scope: !39, inlinedAt: !41) +!39 = distinct !DILexicalBlockFile(scope: !9, file: !40, discriminator: 0) +!40 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!41 = !DILocation(line: 59, column: 100, scope: !9) +!42 = !DILocation(line: 74, column: 34, scope: !39, inlinedAt: !41) +!43 = !DILocation(line: 75, column: 25, scope: !39, inlinedAt: !41) +!44 = !DILocation(line: 75, column: 47, scope: !39, inlinedAt: !41) +!45 = !DILocation(line: 43, column: 40, scope: !9) +!46 = !DILocation(line: 44, column: 31, scope: !9) +!47 = !DILocation(line: 45, column: 29, scope: !9) +!48 = !DILocation(line: 49, column: 41, scope: !9) +!49 = !DILocation(line: 49, column: 34, scope: !9) +!50 = !DILocation(line: 49, column: 103, scope: !9) +!51 = !DILocation(line: 49, column: 93, scope: !9) +!52 = !DILocation(line: 52, column: 22, scope: !9) +!53 = !DILocation(line: 54, column: 37, scope: !9) +!54 = !DILocation(line: 57, column: 24, scope: !9) +!55 = !DILocation(line: 58, column: 39, scope: !9) +!56 = !DILocation(line: 59, column: 32, scope: !9) +!57 = !DILocation(line: 59, column: 50, scope: !9) +!58 = !DILocation(line: 59, column: 112, scope: !9) +!59 = !DILocation(line: 59, column: 130, scope: !9) +!60 = !DILocation(line: 50, column: 23, scope: !9) +!61 = !DILocation(line: 61, column: 29, scope: !9) +!62 = !DILocation(line: 61, column: 94, scope: !9) +!63 = !DILocation(line: 62, column: 29, scope: !9) +!64 = !DILocation(line: 62, column: 95, scope: !9) +!65 = !DILocation(line: 43, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..adf2b04902e89963e99f7bcbbfd929ea67e11a4c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,640 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 // -- Begin function triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 50, 120, 47, 99, 50, 120, 117, 110, 116, 115, 52, 122, 110, 116, 100, 54, 53, 112, 97, 98, 103, 107, 107, 120, 103, 53, 121, 108, 121, 104, 55, 115, 97, 104, 102, 121, 111, 103, 122, 109, 103, 108, 106, 102, 105, 108, 106, 100, 117, 105, 52, 111, 51, 54, 53, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 53, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 49, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2 +.visible .entry triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5, + .param .u64 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<32>; + .reg .b32 %r<53>; + .reg .b64 %rd<103>; + .loc 1 18 0 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:18:0 + +// %bb.0: + ld.param.b32 %r12, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_8]; + ld.param.b64 %rd18, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd15, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_2]; +$L__tmp0: + .loc 1 22 28 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:22:28 + mov.u32 %r13, %ctaid.x; + .loc 1 25 37 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + .loc 1 35 45 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:45 + cvt.u64.u32 %rd1, %r13; + mul.lo.s64 %rd2, %rd18, %rd1; + .loc 1 29 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:29:40 + setp.lt.s32 %p2, %r12, 1; + mov.b64 %rd102, 0; + cvt.u32.u64 %r49, %rd1; + shl.b64 %rd100, %rd2, 2; + @%p2 bra $L__BB0_6; +// %bb.1: // %.lr.ph + .loc 1 0 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:0:40 + ld.param.b64 %rd13, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_0]; + .loc 1 24 21 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:24:21 + setp.lt.u32 %p3, %r49, 128; + add.s64 %rd3, %rd13, %rd100; + @%p3 bra $L__BB0_4; + bra.uni $L__BB0_2; +$L__BB0_4: // %.lr.ph.split.preheader + .loc 1 0 21 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:0:21 + mov.b32 %r51, 0; + mov.b64 %rd102, 0; +$L__BB0_5: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 29 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:31:29 + add.s32 %r20, %r2, %r51; + setp.lt.s32 %p6, %r20, %r12; + .loc 1 35 34 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:34 + mad.wide.s32 %rd28, %r20, 4, %rd3; + .loc 1 35 50 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:50 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b32 { %r19 }, [ %rd28 + 0 ], %rd27; + // end inline asm + .loc 1 39 48 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:39:48 + selp.b32 %r21, %r19, 0, %p6; + cvt.s64.s32 %rd30, %r21; + add.s64 %rd102, %rd102, %rd30; + .loc 1 29 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:29:40 + add.s32 %r51, %r51, 32; + setp.lt.s32 %p7, %r51, %r12; + @%p7 bra $L__BB0_5; + bra.uni $L__BB0_6; +$L__BB0_2: // %.lr.ph.split.us.preheader + .loc 1 0 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:0:40 + mov.b32 %r50, 0; +$L__BB0_3: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 35 41 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:41 + add.s32 %r17, %r2, %r50; + .loc 1 35 34 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:34 + mad.wide.s32 %rd23, %r17, 4, %rd3; + .loc 1 35 50 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:35:50 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd22, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r16, 0x0; + @%p4 ld.global.L1::evict_first.L2::cache_hint.b32 { %r16 }, [ %rd23 + 0 ], %rd22; + // end inline asm + .loc 1 29 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:29:40 + add.s32 %r50, %r50, 32; + setp.lt.s32 %p5, %r50, %r12; + @%p5 bra $L__BB0_3; +$L__BB0_6: // %._crit_edge + .loc 1 24 21 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:24:21 + setp.lt.u32 %p10, %r49, 128; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + mov.b64 {_, %r24}, %rd102; + cvt.u32.u64 %r25, %rd102; + shfl.sync.bfly.b32 %r26, %r25, 16, 31, -1; + shfl.sync.bfly.b32 %r27, %r24, 16, 31, -1; + cvt.u64.u32 %rd32, %r26; + cvt.u64.u32 %rd33, %r27; + shl.b64 %rd34, %rd33, 32; + or.b64 %rd35, %rd32, %rd34; + .loc 2 261 15 // standard.py:261:15 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + add.s64 %rd36, %rd102, %rd35; + .loc 2 291 36 // standard.py:291:36 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + mov.b64 {_, %r28}, %rd36; + cvt.u32.u64 %r29, %rd36; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + shfl.sync.bfly.b32 %r31, %r28, 8, 31, -1; + cvt.u64.u32 %rd37, %r30; + cvt.u64.u32 %rd38, %r31; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 2 261 15 // standard.py:261:15 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + add.s64 %rd41, %rd36, %rd40; + .loc 2 291 36 // standard.py:291:36 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + mov.b64 {_, %r32}, %rd41; + cvt.u32.u64 %r33, %rd41; + shfl.sync.bfly.b32 %r34, %r33, 4, 31, -1; + shfl.sync.bfly.b32 %r35, %r32, 4, 31, -1; + cvt.u64.u32 %rd42, %r34; + cvt.u64.u32 %rd43, %r35; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 2 261 15 // standard.py:261:15 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + add.s64 %rd46, %rd41, %rd45; + .loc 2 291 36 // standard.py:291:36 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + mov.b64 {_, %r36}, %rd46; + cvt.u32.u64 %r37, %rd46; + shfl.sync.bfly.b32 %r38, %r37, 2, 31, -1; + shfl.sync.bfly.b32 %r39, %r36, 2, 31, -1; + cvt.u64.u32 %rd47, %r38; + cvt.u64.u32 %rd48, %r39; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 2 261 15 // standard.py:261:15 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + add.s64 %rd51, %rd46, %rd50; + .loc 2 291 36 // standard.py:291:36 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + mov.b64 {_, %r40}, %rd51; + cvt.u32.u64 %r41, %rd51; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 1, 31, -1; + cvt.u64.u32 %rd52, %r42; + .loc 2 261 15 // standard.py:261:15 @[ c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:40:25 ] + add.s64 %rd53, %rd51, %rd52; +$L__tmp2: + .loc 1 41 19 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:41:19 + cvt.u32.u64 %r22, %rd53; + .loc 1 42 25 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:42:25 + shl.b64 %rd54, %rd1, 2; + add.s64 %rd31, %rd15, %rd54; + .loc 1 42 36 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:42:36 + and.b32 %r44, %r1, 63; + setp.eq.b32 %p11, %r44, 0; + and.pred %p8, %p10, %p11; + // begin inline asm + @%p8 st.global.b32 [ %rd31 + 0 ], { %r22 }; + // end inline asm + .loc 1 43 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:43:40 + @%p2 bra $L__BB0_11; +// %bb.7: // %.lr.ph14.preheader + .loc 1 0 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:0:40 + ld.param.b64 %rd19, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd17, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2_param_1]; + and.b32 %r8, %r1, 32; + setp.lt.s64 %p12, %rd18, 2; + setp.gt.s64 %p13, %rd18, 1; + selp.b64 %rd55, %rd18, 0, %p13; + selp.b64 %rd56, 1, 0, %p12; + add.s64 %rd57, %rd55, %rd56; + mul.lo.s64 %rd7, %rd57, %rd1; + add.s64 %rd8, %rd18, 1; + add.s64 %rd58, %rd19, 127; + shr.s64 %rd59, %rd58, 63; + shr.u64 %rd60, %rd59, 57; + add.s64 %rd61, %rd58, %rd60; + shr.s64 %rd62, %rd61, 7; + and.b64 %rd63, %rd58, 127; + setp.ne.b64 %p14, %rd63, 0; + setp.lt.s64 %p15, %rd58, 0; + and.pred %p16, %p15, %p14; + selp.b64 %rd64, -1, 0, %p16; + add.s64 %rd9, %rd62, %rd64; + mov.b32 %r52, 0; +$L__BB0_8: // %.lr.ph14 + // =>This Inner Loop Header: Depth=1 + .loc 1 45 29 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:45:29 + add.s32 %r10, %r2, %r52; + setp.lt.s32 %p20, %r10, %r12; + .loc 1 49 41 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:49:41 + cvt.s64.s32 %rd73, %r10; + add.s64 %rd10, %rd7, %rd73; + .loc 1 49 34 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:49:34 + shl.b64 %rd74, %rd10, 3; + add.s64 %rd67, %rd14, %rd74; + .loc 1 49 103 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:49:103 + and.pred %p18, %p10, %p20; + .loc 1 49 93 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:49:93 + // begin inline asm + mov.u64 %rd65, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd65, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd66, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd66 }, [ %rd67 + 0 ], %rd65; + // end inline asm + // begin inline asm + mov.u64 %rd69, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd69, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd70, 0x0; + @%p18 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd70 }, [ %rd67 + 0 ], %rd69; + // end inline asm + .loc 1 52 22 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:52:22 + setp.lt.s32 %p21, %r10, %r22; + .loc 1 54 37 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:54:37 + cvt.s64.s32 %rd75, %rd70; + selp.b64 %rd76, %rd75, %rd18, %p21; + .loc 1 58 39 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:58:39 + shr.s64 %rd77, %rd76, 63; + and.b64 %rd78, %rd77, %rd8; + add.s64 %rd79, %rd78, %rd76; + .loc 1 59 32 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:32 + setp.lt.s64 %p22, %rd79, 0; + .loc 1 59 50 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:50 + setp.gt.s64 %p23, %rd79, %rd9; + .loc 1 59 112 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:112 + or.pred %p24, %p22, %p23; + .loc 1 59 130 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:130 + and.pred %p25, %p18, %p24; + not.pred %p26, %p25; + @%p26 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_8 Depth=1 + .loc 1 42 36 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:42:36 + setp.eq.b32 %p30, %r8, 0; + .loc 1 54 37 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:54:37 + cvt.s64.s32 %rd82, %rd66; + selp.b64 %rd83, %rd82, %rd18, %p21; + .loc 1 58 39 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:58:39 + shr.s64 %rd84, %rd83, 63; + and.b64 %rd85, %rd84, %rd8; + .loc 1 50 23 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:50:23 + cvt.u32.u64 %r47, %rd70; + .loc 1 59 130 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:130 + bar.sync 0; + .loc 1 61 29 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:61:29 + shl.b64 %rd86, %rd10, 2; + add.s64 %rd80, %rd16, %rd86; + .loc 1 61 94 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:61:94 + and.pred %p27, %p30, %p18; + // begin inline asm + @%p27 st.global.b32 [ %rd80 + 0 ], { %r47 }; + // end inline asm + .loc 1 62 29 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:62:29 + shl.b64 %rd87, %rd83, 2; + add.s64 %rd88, %rd17, %rd87; + shl.b64 %rd89, %rd85, 2; + add.s64 %rd90, %rd88, %rd89; + add.s64 %rd92, %rd90, %rd54; + add.s64 %rd81, %rd92, %rd100; + mov.b32 %r48, 1; + .loc 1 62 95 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:62:95 + // begin inline asm + @%p27 st.global.b32 [ %rd81 + 0 ], { %r48 }; + // end inline asm + .loc 1 43 40 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:43:40 + add.s32 %r52, %r52, 32; + setp.lt.s32 %p31, %r52, %r12; + @%p31 bra $L__BB0_8; +$L__BB0_11: // %._crit_edge15 + .loc 1 43 4 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:43:4 + ret; +$L__BB0_9: + .loc 1 59 130 // c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py:59:130 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd94, assertFunc_0; + cvta.global.u64 %rd95, %rd94; + st.param.b64 [param3], %rd95; + mov.b64 %rd96, assertFile_0; + cvta.global.u64 %rd97, %rd96; + st.param.b64 [param1], %rd97; + mov.b64 %rd98, assertMessage_0; + cvta.global.u64 %rd99, %rd98; + st.param.b64 [param0], %rd99; + st.param.b64 [param4], 1; + st.param.b32 [param2], 59; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 281 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x112 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 120 +.b8 117 +.b8 110 +.b8 116 +.b8 115 +.b8 52 +.b8 122 +.b8 110 +.b8 116 +.b8 100 +.b8 54 +.b8 53 +.b8 112 +.b8 97 +.b8 98 +.b8 103 +.b8 107 +.b8 107 +.b8 120 +.b8 103 +.b8 53 +.b8 121 +.b8 108 +.b8 121 +.b8 104 +.b8 55 +.b8 115 +.b8 97 +.b8 104 +.b8 102 +.b8 121 +.b8 111 +.b8 103 +.b8 122 +.b8 109 +.b8 103 +.b8 108 +.b8 106 +.b8 102 +.b8 105 +.b8 108 +.b8 106 +.b8 100 +.b8 117 +.b8 105 +.b8 52 +.b8 111 +.b8 51 +.b8 54 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 50 +.b8 120 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x63 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xee:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x103:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..9eefae78474dab5bf8b487400243b43ee9727f63 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.source @@ -0,0 +1,379 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":18:0) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc79 = loc(unknown) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc97 = loc("in_ptr0"(#loc)) +#loc98 = loc("in_ptr1"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("out_ptr3"(#loc)) +#loc102 = loc("ks0"(#loc)) +#loc103 = loc("ks1"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc151 = loc("input"(#loc77)) +#loc152 = loc("a"(#loc82)) +#loc153 = loc("b"(#loc82)) +#loc154 = loc("a"(#loc86)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc114) + %_tmp3 = arith.constant 0 : i64 loc(#loc115) + %_tmp3_9 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc115) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c32_i32 = arith.constant 32 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp3_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_14 = %_tmp3_9) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc117) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc117) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc118) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x32xi32> loc(#loc118) + %tmp0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc119) + %tmp0_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc119) + %tmp0_18 = arith.muli %tmp0_17, %tmp0 : tensor<1x1xi64> loc(#loc119) + %tmp0_19 = arith.extsi %r0_index_15 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc120) + %tmp0_20 = tt.broadcast %tmp0_18 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc120) + %tmp0_21 = arith.addi %tmp0_19, %tmp0_20 : tensor<1x32xi64> loc(#loc120) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc121) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc121) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc122) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x32xi1> loc(#loc122) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc123) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc123) + %tmp0_28 = arith.fptosi %tmp0_27 : tensor<1x32xf32> to tensor<1x32xi32> loc(#loc123) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc123) + %tmp1 = arith.extsi %tmp0_29 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc124) + %tmp4 = arith.addi %_tmp3_14, %tmp1 : tensor<1x32xi64> loc(#loc125) + %_tmp3_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc126) + %_tmp3_31 = arith.andi %r0_mask_16, %_tmp3_30 : tensor<1x32xi1> loc(#loc126) + %_tmp3_32 = arith.select %_tmp3_31, %tmp4, %_tmp3_14 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc127) + scf.yield %_tmp3_32 : tensor<1x32xi64> loc(#loc23) + } loc(#loc116) + %tmp3 = tt.call @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp3_10) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc128) + %tmp3_11 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc130) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc27) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc27) + tt.store %5, %tmp5, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc28) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc29) + %c32_i32_13 = arith.constant 32 : i32 loc(#loc29) + %6 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc29) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc29) + %8 = arith.bitcast %c32_i32_13 : i32 to i32 loc(#loc29) + %9 = ub.poison : i32 loc(#loc29) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc131) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x32xi32> loc(#loc131) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc132) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32xi32> loc(#loc132) + %tmp6 = arith.constant 1 : i32 loc(#loc133) + %tmp6_16 = arith.extsi %tmp6 : i32 to i64 loc(#loc133) + %tmp6_17 = arith.cmpi sge, %tmp6_16, %ks0 : i64 loc(#loc133) + %tmp6_18 = arith.constant 1 : i32 loc(#loc134) + %tmp6_19 = arith.constant 1 : i32 loc(#loc134) + %tmp6_20 = arith.extui %tmp6_17 : i1 to i32 loc(#loc134) + %tmp6_21 = arith.muli %tmp6_19, %tmp6_20 : i32 loc(#loc134) + %tmp6_22 = arith.constant 1 : i32 loc(#loc135) + %tmp6_23 = arith.extsi %tmp6_22 : i32 to i64 loc(#loc135) + %tmp6_24 = arith.cmpi sgt, %ks0, %tmp6_23 : i64 loc(#loc135) + %tmp6_25 = arith.extui %tmp6_24 : i1 to i64 loc(#loc136) + %tmp6_26 = arith.muli %ks0, %tmp6_25 : i64 loc(#loc136) + %tmp6_27 = arith.extsi %tmp6_21 : i32 to i64 loc(#loc137) + %tmp6_28 = arith.addi %tmp6_27, %tmp6_26 : i64 loc(#loc137) + %tmp6_29 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc138) + %tmp6_30 = tt.splat %tmp6_28 : i64 -> tensor<1x1xi64> loc(#loc138) + %tmp6_31 = arith.muli %tmp6_29, %tmp6_30 : tensor<1x1xi64> loc(#loc138) + %tmp6_32 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc139) + %tmp6_33 = tt.broadcast %tmp6_31 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc139) + %tmp6_34 = arith.addi %tmp6_32, %tmp6_33 : tensor<1x32xi64> loc(#loc139) + %tmp6_35 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc140) + %tmp6_36 = tt.addptr %tmp6_35, %tmp6_34 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc140) + %tmp6_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc141) + %tmp6_38 = arith.andi %r0_mask_15, %tmp6_37 : tensor<1x32xi1> loc(#loc141) + %tmp6_39 = arith.constant 0.000000e+00 : f32 loc(#loc142) + %tmp6_40 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc142) + %tmp6_41 = arith.fptosi %tmp6_40 : tensor<1x32xf32> to tensor<1x32xi64> loc(#loc142) + %tmp6_42 = tt.load %tmp6_36, %tmp6_38, %tmp6_41 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc142) + %tmp7 = arith.trunci %tmp6_42 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc143) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc144) + %tmp9_43 = arith.cmpi slt, %r0_index_14, %tmp9 : tensor<1x32xi32> loc(#loc144) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc145) + %tmp11_44 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc145) + %tmp11_45 = arith.select %tmp9_43, %tmp11, %tmp11_44 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc145) + %tmp12 = arith.constant 1 : i32 loc(#loc146) + %tmp12_46 = arith.constant 1 : i64 loc(#loc146) + %tmp12_47 = arith.addi %tmp12_46, %ks0 : i64 loc(#loc146) + %tmp13 = tt.splat %tmp12_47 : i64 -> tensor<1x32xi64> loc(#loc147) + %tmp13_48 = arith.addi %tmp11_45, %tmp13 : tensor<1x32xi64> loc(#loc147) + %tmp14 = arith.constant 0 : i32 loc(#loc148) + %tmp14_49 = arith.extsi %tmp14 : i32 to i64 loc(#loc148) + %tmp14_50 = tt.splat %tmp14_49 : i64 -> tensor<1x32xi64> loc(#loc148) + %tmp14_51 = arith.cmpi slt, %tmp11_45, %tmp14_50 : tensor<1x32xi64> loc(#loc148) + %tmp15 = arith.select %tmp14_51, %tmp13_48, %tmp11_45 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc149) + %c0_i32_52 = arith.constant 0 : i32 loc(#loc49) + %10 = arith.extsi %c0_i32_52 : i32 to i64 loc(#loc49) + %11 = tt.splat %10 : i64 -> tensor<1x32xi64> loc(#loc49) + %12 = arith.cmpi sle, %11, %tmp15 : tensor<1x32xi64> loc(#loc49) + %c127_i32 = arith.constant 127 : i32 loc(#loc50) + %c127_i64 = arith.constant 127 : i64 loc(#loc50) + %13 = arith.addi %c127_i64, %ks1 : i64 loc(#loc50) + %14 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%13) : (i64) -> i64 loc(#loc51) + %c1_i32 = arith.constant 1 : i32 loc(#loc52) + %c1_i64 = arith.constant 1 : i64 loc(#loc52) + %15 = arith.addi %c1_i64, %14 : i64 loc(#loc52) + %16 = tt.splat %15 : i64 -> tensor<1x32xi64> loc(#loc53) + %17 = arith.cmpi slt, %tmp15, %16 : tensor<1x32xi64> loc(#loc53) + %18 = arith.andi %12, %17 : tensor<1x32xi1> loc(#loc54) + %19 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc55) + %20 = arith.andi %r0_mask_15, %19 : tensor<1x32xi1> loc(#loc55) + %true = arith.constant true loc(#loc56) + %cst = arith.constant dense : tensor<1x32xi1> loc(#loc56) + %21 = arith.xori %20, %cst : tensor<1x32xi1> loc(#loc56) + %22 = arith.ori %18, %21 : tensor<1x32xi1> loc(#loc57) + tt.assert %22, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc58) + %tmp17 = arith.constant 1 : i32 loc(#loc150) + %tmp17_53 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc150) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc60) + %23 = arith.extsi %c1_i32_54 : i32 to i64 loc(#loc60) + %24 = arith.cmpi sge, %23, %ks0 : i64 loc(#loc60) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc61) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc61) + %25 = arith.extui %24 : i1 to i32 loc(#loc61) + %26 = arith.muli %c1_i32_56, %25 : i32 loc(#loc61) + %c1_i32_57 = arith.constant 1 : i32 loc(#loc62) + %27 = arith.extsi %c1_i32_57 : i32 to i64 loc(#loc62) + %28 = arith.cmpi sgt, %ks0, %27 : i64 loc(#loc62) + %29 = arith.extui %28 : i1 to i64 loc(#loc63) + %30 = arith.muli %ks0, %29 : i64 loc(#loc63) + %31 = arith.extsi %26 : i32 to i64 loc(#loc64) + %32 = arith.addi %31, %30 : i64 loc(#loc64) + %33 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc65) + %34 = tt.splat %32 : i64 -> tensor<1x1xi64> loc(#loc65) + %35 = arith.muli %33, %34 : tensor<1x1xi64> loc(#loc65) + %36 = arith.extsi %r0_index_14 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc66) + %37 = tt.broadcast %35 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc66) + %38 = arith.addi %36, %37 : tensor<1x32xi64> loc(#loc66) + %39 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc67) + %40 = tt.addptr %39, %38 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc67) + %41 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc68) + %42 = arith.andi %r0_mask_15, %41 : tensor<1x32xi1> loc(#loc68) + tt.store %40, %tmp7, %42 : tensor<1x32x!tt.ptr> loc(#loc69) + %43 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc70) + %44 = tt.broadcast %43 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc70) + %45 = arith.addi %tmp15, %44 : tensor<1x32xi64> loc(#loc70) + %46 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc71) + %47 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc71) + %48 = arith.muli %47, %46 : tensor<1x1xi64> loc(#loc71) + %49 = tt.broadcast %48 : tensor<1x1xi64> -> tensor<1x32xi64> loc(#loc72) + %50 = arith.addi %45, %49 : tensor<1x32xi64> loc(#loc72) + %51 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc73) + %52 = tt.addptr %51, %50 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc73) + %53 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x32xi1> loc(#loc74) + %54 = arith.andi %r0_mask_15, %53 : tensor<1x32xi1> loc(#loc74) + %cst_58 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc75) + tt.store %52, %cst_58, %54 : tensor<1x32x!tt.ptr> loc(#loc75) + } loc(#loc29) + tt.return loc(#loc76) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_32S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32xi64> loc("input"(#loc77))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc78) + tt.reduce.return %2 : i64 loc(#loc78) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc78) + tt.return %0 : tensor<1xi64> loc(#loc80) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc81) + tt.return %1 : tensor<1xi64> loc(#loc81) + } loc(#loc77) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc82)), %b: i64 loc("b"(#loc82))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc83) + tt.return %0 : i64 loc(#loc84) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc85) + tt.return %1 : i64 loc(#loc85) + } loc(#loc82) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc86))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc155) + %quot_0 = arith.constant 128 : i64 loc(#loc155) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc155) + %remainder = arith.constant 128 : i32 loc(#loc156) + %remainder_2 = arith.constant 128 : i64 loc(#loc156) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc156) + %fixed = arith.constant 0 : i32 loc(#loc157) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc157) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc157) + %fixed_6 = arith.constant 1 : i32 loc(#loc158) + %fixed_7 = arith.constant 1 : i64 loc(#loc158) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc158) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc159) + %c0_i32 = arith.constant 0 : i32 loc(#loc92) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc92) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc92) + %false = arith.constant false loc(#loc93) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc93) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc94) + tt.return %3 : i64 loc(#loc95) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc96) + tt.return %4 : i64 loc(#loc96) + } loc(#loc86) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":36:23) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":41:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":44:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":45:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:60) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:86) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:77) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:45) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:103) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":50:23) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":52:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":54:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":55:20) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":56:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":58:39) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:32) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:94) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:100) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:50) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:42) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:122) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:112) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:110) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:130) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":60:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:47) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:81) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:72) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:63) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:40) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:29) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:104) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:94) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:53) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:62) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:58) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:105) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:95) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:4) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc106 = loc("xnumel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("_tmp3"(#loc10)) +#loc116 = loc("_tmp3"(#loc11)) +#loc117 = loc("r0_index"(#loc12)) +#loc118 = loc("r0_mask"(#loc13)) +#loc119 = loc("tmp0"(#loc14)) +#loc120 = loc("tmp0"(#loc15)) +#loc121 = loc("tmp0"(#loc16)) +#loc122 = loc("tmp0"(#loc17)) +#loc123 = loc("tmp0"(#loc18)) +#loc124 = loc("tmp1"(#loc19)) +#loc125 = loc("tmp4"(#loc20)) +#loc126 = loc("_tmp3"(#loc21)) +#loc127 = loc("_tmp3"(#loc22)) +#loc128 = loc("tmp3"(#loc24)) +#loc129 = loc("tmp3"(#loc25)) +#loc130 = loc("tmp5"(#loc26)) +#loc131 = loc("r0_index"(#loc30)) +#loc132 = loc("r0_mask"(#loc31)) +#loc133 = loc("tmp6"(#loc32)) +#loc134 = loc("tmp6"(#loc33)) +#loc135 = loc("tmp6"(#loc34)) +#loc136 = loc("tmp6"(#loc35)) +#loc137 = loc("tmp6"(#loc36)) +#loc138 = loc("tmp6"(#loc37)) +#loc139 = loc("tmp6"(#loc38)) +#loc140 = loc("tmp6"(#loc39)) +#loc141 = loc("tmp6"(#loc40)) +#loc142 = loc("tmp6"(#loc41)) +#loc143 = loc("tmp7"(#loc42)) +#loc144 = loc("tmp9"(#loc43)) +#loc145 = loc("tmp11"(#loc44)) +#loc146 = loc("tmp12"(#loc45)) +#loc147 = loc("tmp13"(#loc46)) +#loc148 = loc("tmp14"(#loc47)) +#loc149 = loc("tmp15"(#loc48)) +#loc150 = loc("tmp17"(#loc59)) +#loc155 = loc("quot"(#loc87)) +#loc156 = loc("remainder"(#loc88)) +#loc157 = loc("fixed"(#loc89)) +#loc158 = loc("fixed"(#loc90)) +#loc159 = loc("fixed"(#loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f91375c284ed244d081868a18e7dd58dbcfced8f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,271 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":18:0) +#loc1 = loc(unknown) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:25) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("out_ptr1"(#loc)) +#loc71 = loc("out_ptr2"(#loc)) +#loc72 = loc("out_ptr3"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("xnumel"(#loc)) +#loc76 = loc("r0_numel"(#loc)) +#loc91 = loc("tmp3"(#loc18)) +#loc124 = loc(callsite(#loc1 at #loc91)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x32xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x32xi64, #blocked1> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x32xi1, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x32xi32, #blocked1> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc77) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc79) + %r0_base_4 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc79) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc79) + %r0_base_6 = tt.expand_dims %r0_base_4 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc81) + %tmp0_7 = arith.muli %ks0, %tmp0 : i64 loc(#loc81) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc121) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc83) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked1> loc(#loc122) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_31 = %cst_0) -> (tensor<1x32xi64, #blocked1>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_index_32 = arith.addi %r0_index, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc86) + %r0_mask_33 = arith.cmpi slt, %r0_index_32, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc80) + %tmp0_34 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_35 = arith.addi %tmp0_34, %tmp0_8 : tensor<1x32xi64, #blocked1> loc(#loc82) + %tmp0_36 = tt.addptr %tmp0_9, %tmp0_35 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc83) + %tmp0_37 = arith.andi %r0_mask_33, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc84) + %tmp0_38 = tt.load %tmp0_36, %tmp0_37, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc87) + %tmp1 = arith.extsi %tmp0_38 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc88) + %tmp4 = arith.addi %_tmp3_31, %tmp1 : tensor<1x32xi64, #blocked1> loc(#loc89) + %_tmp3_39 = arith.select %tmp0_37, %tmp4, %_tmp3_31 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc90) + scf.yield %_tmp3_39 : tensor<1x32xi64, #blocked1> loc(#loc16) + } loc(#loc85) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_31: i64 loc(callsite(#loc1 at #loc91)), %tmp3_32: i64 loc(callsite(#loc1 at #loc91))): + %tmp3_33 = arith.addi %tmp3_31, %tmp3_32 : i64 loc(#loc133) + tt.reduce.return %tmp3_33 : i64 loc(#loc123) + }) : (tensor<1x32xi64, #blocked1>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc123) + %0 = ttg.convert_layout %tmp3 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc20) + %tmp3_11 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc92) + %tmp3_12 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc92) + %tmp5 = arith.trunci %tmp3_11 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc93) + %tmp5_13 = arith.trunci %tmp3_12 : tensor<1x1xi64, #blocked1> to tensor<1x1xi32, #blocked1> loc(#loc93) + %1 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc24) + tt.store %2, %tmp5, %3 : tensor<1x1x!tt.ptr, #blocked> loc(#loc24) + %r0_mask_14 = tt.splat %r0_numel : i32 -> tensor<1x32xi32, #blocked> loc(#loc94) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp6_15 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp6_16 = arith.extui %tmp6_15 : i1 to i64 loc(#loc97) + %tmp6_17 = arith.muli %ks0, %tmp6_16 : i64 loc(#loc97) + %tmp6_18 = arith.extui %tmp6 : i1 to i64 loc(#loc125) + %tmp6_19 = arith.addi %tmp6_18, %tmp6_17 : i64 loc(#loc98) + %tmp6_20 = arith.muli %tmp0, %tmp6_19 : i64 loc(#loc100) + %tmp6_21 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked> loc(#loc126) + %tmp6_22 = tt.splat %tmp6_20 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc126) + %tmp6_23 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc102) + %tmp6_24 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc102) + %tmp6_25 = tt.splat %xmask : i1 -> tensor<1x32xi1, #blocked> loc(#loc127) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32, #blocked> -> tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_26 = tt.broadcast %tmp5_13 : tensor<1x1xi32, #blocked1> -> tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_27 = tt.splat %ks0 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc106) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_28 = tt.splat %tmp12 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc107) + %4 = arith.addi %ks1, %c127_i64 : i64 loc(#loc39) + %quot = arith.divsi %4, %c128_i64 : i64 loc(#loc128) + %remainder = arith.remsi %4, %c128_i64 : i64 loc(#loc129) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc130) + %fixed_29 = arith.subi %quot, %c1_i64 : i64 loc(#loc131) + %fixed_30 = arith.select %fixed, %fixed_29, %quot : i64 loc(#loc132) + %5 = arith.cmpi slt, %4, %c0_i64 : i64 loc(#loc113) + %6 = arith.select %5, %fixed_30, %quot : i64 loc(#loc114) + %7 = arith.addi %6, %c1_i64 : i64 loc(#loc48) + %8 = tt.splat %7 : i64 -> tensor<1x32xi64, #blocked1> loc(#loc49) + %9 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked1> loc(#loc50) + %10 = tt.splat %tmp0 : i64 -> tensor<1x32xi64, #blocked> loc(#loc51) + %11 = tt.splat %tmp0_7 : i64 -> tensor<1x32xi64, #blocked> loc(#loc115) + %12 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr, #blocked> loc(#loc54) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_31 = tt.splat %r0_offset : i32 -> tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_index_32 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32, #blocked> loc(#loc116) + %r0_index_33 = arith.addi %r0_index_31, %r0_base_6 : tensor<1x32xi32, #blocked1> loc(#loc116) + %r0_mask_34 = arith.cmpi slt, %r0_index_32, %r0_mask_14 : tensor<1x32xi32, #blocked> loc(#loc94) + %r0_mask_35 = arith.cmpi slt, %r0_index_33, %r0_mask : tensor<1x32xi32, #blocked1> loc(#loc94) + %tmp6_36 = arith.extsi %r0_index_32 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_37 = arith.extsi %r0_index_33 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_38 = arith.addi %tmp6_36, %tmp6_21 : tensor<1x32xi64, #blocked> loc(#loc101) + %tmp6_39 = arith.addi %tmp6_37, %tmp6_22 : tensor<1x32xi64, #blocked1> loc(#loc101) + %tmp6_40 = tt.addptr %tmp6_23, %tmp6_38 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc102) + %tmp6_41 = tt.addptr %tmp6_24, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc102) + %tmp6_42 = arith.andi %r0_mask_34, %tmp6_25 : tensor<1x32xi1, #blocked> loc(#loc103) + %tmp6_43 = arith.andi %r0_mask_35, %tmp0_10 : tensor<1x32xi1, #blocked1> loc(#loc103) + %tmp6_44 = tt.load %tmp6_40, %tmp6_42, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked> loc(#loc117) + %tmp6_45 = tt.load %tmp6_41, %tmp6_43, %cst_0 evictionPolicy = evict_first : tensor<1x32x!tt.ptr, #blocked1> loc(#loc117) + %tmp7 = arith.trunci %tmp6_44 : tensor<1x32xi64, #blocked> to tensor<1x32xi32, #blocked> loc(#loc118) + %tmp7_46 = arith.trunci %tmp6_45 : tensor<1x32xi64, #blocked1> to tensor<1x32xi32, #blocked1> loc(#loc118) + %tmp9_47 = arith.cmpi slt, %r0_index_32, %tmp9 : tensor<1x32xi32, #blocked> loc(#loc104) + %tmp9_48 = arith.cmpi slt, %r0_index_33, %tmp9_26 : tensor<1x32xi32, #blocked1> loc(#loc104) + %tmp11_49 = arith.extsi %tmp7 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_50 = arith.extsi %tmp7_46 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp11_51 = arith.select %tmp9_47, %tmp11_49, %tmp11 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc105) + %tmp11_52 = arith.select %tmp9_48, %tmp11_50, %tmp11_27 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc105) + %tmp13_53 = arith.addi %tmp11_51, %tmp13 : tensor<1x32xi64, #blocked> loc(#loc107) + %tmp13_54 = arith.addi %tmp11_52, %tmp13_28 : tensor<1x32xi64, #blocked1> loc(#loc107) + %tmp14 = arith.cmpi slt, %tmp11_51, %cst : tensor<1x32xi64, #blocked> loc(#loc119) + %tmp14_55 = arith.cmpi slt, %tmp11_52, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc119) + %tmp15 = arith.select %tmp14, %tmp13_53, %tmp11_51 : tensor<1x32xi1, #blocked>, tensor<1x32xi64, #blocked> loc(#loc120) + %tmp15_56 = arith.select %tmp14_55, %tmp13_54, %tmp11_52 : tensor<1x32xi1, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc120) + %13 = arith.cmpi sge, %tmp15_56, %cst_0 : tensor<1x32xi64, #blocked1> loc(#loc61) + %14 = arith.cmpi slt, %tmp15_56, %8 : tensor<1x32xi64, #blocked1> loc(#loc49) + %15 = arith.andi %13, %14 : tensor<1x32xi1, #blocked1> loc(#loc62) + %16 = arith.xori %tmp6_43, %cst_1 : tensor<1x32xi1, #blocked1> loc(#loc63) + %17 = arith.ori %15, %16 : tensor<1x32xi1, #blocked1> loc(#loc64) + tt.assert %17, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1, #blocked1> loc(#loc65) + %18 = tt.addptr %9, %tmp6_39 : tensor<1x32x!tt.ptr, #blocked1>, tensor<1x32xi64, #blocked1> loc(#loc50) + tt.store %18, %tmp7_46, %tmp6_43 : tensor<1x32x!tt.ptr, #blocked1> loc(#loc66) + %19 = arith.addi %tmp15, %10 : tensor<1x32xi64, #blocked> loc(#loc51) + %20 = arith.addi %19, %11 : tensor<1x32xi64, #blocked> loc(#loc52) + %21 = tt.addptr %12, %20 : tensor<1x32x!tt.ptr, #blocked>, tensor<1x32xi64, #blocked> loc(#loc54) + tt.store %21, %cst_3, %tmp6_42 : tensor<1x32x!tt.ptr, #blocked> loc(#loc20) + } loc(#loc55) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":29:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":30:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":36:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:48) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:8) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:95) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":45:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:86) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:77) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:68) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:52) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:45) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:103) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":52:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":54:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":55:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":56:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:94) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:100) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:55) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:53) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:58) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:40) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":44:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:93) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":50:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":57:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":58:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:32) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:112) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:110) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:130) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:94) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:4) +#loc77 = loc("xoffset"(#loc2)) +#loc78 = loc("xmask"(#loc3)) +#loc79 = loc("r0_base"(#loc4)) +#loc80 = loc("r0_mask"(#loc5)) +#loc81 = loc("tmp0"(#loc6)) +#loc82 = loc("tmp0"(#loc7)) +#loc83 = loc("tmp0"(#loc8)) +#loc84 = loc("tmp0"(#loc9)) +#loc85 = loc("_tmp3"(#loc10)) +#loc86 = loc("r0_index"(#loc11)) +#loc87 = loc("tmp0"(#loc12)) +#loc88 = loc("tmp1"(#loc13)) +#loc89 = loc("tmp4"(#loc14)) +#loc90 = loc("_tmp3"(#loc15)) +#loc92 = loc("tmp3"(#loc21)) +#loc93 = loc("tmp5"(#loc22)) +#loc94 = loc("r0_mask"(#loc25)) +#loc95 = loc("tmp6"(#loc26)) +#loc96 = loc("tmp6"(#loc27)) +#loc97 = loc("tmp6"(#loc28)) +#loc98 = loc("tmp6"(#loc29)) +#loc99 = loc("tmp6"(#loc30)) +#loc100 = loc("tmp6"(#loc31)) +#loc101 = loc("tmp6"(#loc32)) +#loc102 = loc("tmp6"(#loc33)) +#loc103 = loc("tmp6"(#loc34)) +#loc104 = loc("tmp9"(#loc35)) +#loc105 = loc("tmp11"(#loc36)) +#loc106 = loc("tmp12"(#loc37)) +#loc107 = loc("tmp13"(#loc38)) +#loc108 = loc("quot"(#loc40)) +#loc109 = loc("remainder"(#loc42)) +#loc110 = loc("fixed"(#loc43)) +#loc111 = loc("fixed"(#loc44)) +#loc112 = loc("fixed"(#loc45)) +#loc113 = loc(callsite(#loc46 at #loc41)) +#loc114 = loc(callsite(#loc47 at #loc41)) +#loc115 = loc(fused[#loc52, #loc53]) +#loc116 = loc("r0_index"(#loc56)) +#loc117 = loc("tmp6"(#loc57)) +#loc118 = loc("tmp7"(#loc58)) +#loc119 = loc("tmp14"(#loc59)) +#loc120 = loc("tmp15"(#loc60)) +#loc121 = loc(fused[#loc82, #loc81]) +#loc122 = loc(fused[#loc84, #loc78]) +#loc123 = loc(callsite(#loc17 at #loc91)) +#loc125 = loc(fused[#loc98, #loc99]) +#loc126 = loc(fused[#loc101, #loc100]) +#loc127 = loc(fused[#loc103, #loc78]) +#loc128 = loc(callsite(#loc108 at #loc41)) +#loc129 = loc(callsite(#loc109 at #loc41)) +#loc130 = loc(callsite(#loc110 at #loc41)) +#loc131 = loc(callsite(#loc111 at #loc41)) +#loc132 = loc(callsite(#loc112 at #loc41)) +#loc133 = loc(callsite(#loc19 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..af22c6b48cf15ffe0cea68608cacea700c1206a5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/K4PIF4OEGY47VNRRRSDGBDB6YVIBWTCXSMNXY5DW2PP37PLKOBWQ/triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,247 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":18:0) +#loc2 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:25) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("in_ptr1"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("out_ptr3"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc93 = loc("tmp3"(#loc19)) +#loc126 = loc(callsite(#loc2 at #loc93)) +module { + tt.func public @triton_red_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 128 : i32 loc(#loc78) + %c128_i64 = arith.constant 128 : i64 loc(#loc2) + %c0_i64 = arith.constant 0 : i64 loc(#loc2) + %cst = arith.constant dense<0> : tensor<1x32xi32> loc(#loc2) + %c32_i32 = arith.constant 32 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc2) + %cst_1 = arith.constant dense : tensor<1x32xi1> loc(#loc2) + %c127_i64 = arith.constant 127 : i64 loc(#loc2) + %c1_i64 = arith.constant 1 : i64 loc(#loc2) + %cst_2 = arith.constant dense<0> : tensor<1x32xi64> loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xmask_3 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc78) + %xmask_4 = tt.splat %xmask_3 : i1 -> tensor<1x1xi1> loc(#loc78) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc80) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc81) + %_tmp3 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 iter_args(%_tmp3_7 = %cst_2) -> (tensor<1x32xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc83) + %r0_index_8 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc83) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc84) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x32xi32> loc(#loc84) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc85) + %tmp0_10 = arith.muli %ks0, %tmp0 : i64 loc(#loc85) + %tmp0_11 = arith.extsi %r0_index_8 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc86) + %tmp0_12 = tt.splat %tmp0_10 : i64 -> tensor<1x32xi64> loc(#loc123) + %tmp0_13 = arith.addi %tmp0_11, %tmp0_12 : tensor<1x32xi64> loc(#loc86) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc87) + %tmp0_15 = tt.addptr %tmp0_14, %tmp0_13 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc87) + %tmp0_16 = tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc124) + %tmp0_17 = arith.andi %r0_mask_9, %tmp0_16 : tensor<1x32xi1> loc(#loc88) + %tmp0_18 = tt.load %tmp0_15, %tmp0_17, %cst evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc89) + %tmp1 = arith.extsi %tmp0_18 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc90) + %tmp4 = arith.addi %_tmp3_7, %tmp1 : tensor<1x32xi64> loc(#loc91) + %_tmp3_19 = arith.select %tmp0_17, %tmp4, %_tmp3_7 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc92) + scf.yield %_tmp3_19 : tensor<1x32xi64> loc(#loc17) + } loc(#loc82) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_7: i64 loc(callsite(#loc2 at #loc93)), %tmp3_8: i64 loc(callsite(#loc2 at #loc93))): + %tmp3_9 = arith.addi %tmp3_7, %tmp3_8 : i64 loc(#loc135) + tt.reduce.return %tmp3_9 : i64 loc(#loc125) + }) : (tensor<1x32xi64>) -> tensor<1xi64> loc(#loc125) + %tmp3_6 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc94) + %tmp5 = arith.trunci %tmp3_6 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc95) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc23) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %1, %tmp5, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc24) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c32_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc96) + %r0_index_7 = arith.addi %r0_index, %r0_base_5 : tensor<1x32xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x32xi32> loc(#loc97) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x32xi32> loc(#loc97) + %tmp6 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc98) + %tmp6_9 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc99) + %tmp6_10 = arith.extui %tmp6_9 : i1 to i64 loc(#loc100) + %tmp6_11 = arith.muli %ks0, %tmp6_10 : i64 loc(#loc100) + %tmp6_12 = arith.extui %tmp6 : i1 to i64 loc(#loc127) + %tmp6_13 = arith.addi %tmp6_12, %tmp6_11 : i64 loc(#loc101) + %tmp6_14 = arith.extsi %xoffset : i32 to i64 loc(#loc103) + %tmp6_15 = arith.muli %tmp6_14, %tmp6_13 : i64 loc(#loc103) + %tmp6_16 = arith.extsi %r0_index_7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc104) + %tmp6_17 = tt.splat %tmp6_15 : i64 -> tensor<1x32xi64> loc(#loc128) + %tmp6_18 = arith.addi %tmp6_16, %tmp6_17 : tensor<1x32xi64> loc(#loc104) + %tmp6_19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc105) + %tmp6_20 = tt.addptr %tmp6_19, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc105) + %tmp6_21 = tt.splat %xmask_3 : i1 -> tensor<1x32xi1> loc(#loc129) + %tmp6_22 = arith.andi %r0_mask_8, %tmp6_21 : tensor<1x32xi1> loc(#loc106) + %tmp6_23 = tt.load %tmp6_20, %tmp6_22, %cst_2 evictionPolicy = evict_first : tensor<1x32x!tt.ptr> loc(#loc107) + %tmp7 = arith.trunci %tmp6_23 : tensor<1x32xi64> to tensor<1x32xi32> loc(#loc108) + %tmp9 = tt.broadcast %tmp5 : tensor<1x1xi32> -> tensor<1x32xi32> loc(#loc109) + %tmp9_24 = arith.cmpi slt, %r0_index_7, %tmp9 : tensor<1x32xi32> loc(#loc109) + %tmp11 = arith.extsi %tmp7 : tensor<1x32xi32> to tensor<1x32xi64> loc(#loc110) + %tmp11_25 = tt.splat %ks0 : i64 -> tensor<1x32xi64> loc(#loc110) + %tmp11_26 = arith.select %tmp9_24, %tmp11, %tmp11_25 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc110) + %tmp12 = arith.addi %ks0, %c1_i64 : i64 loc(#loc111) + %tmp13 = tt.splat %tmp12 : i64 -> tensor<1x32xi64> loc(#loc112) + %tmp13_27 = arith.addi %tmp11_26, %tmp13 : tensor<1x32xi64> loc(#loc112) + %tmp14 = arith.cmpi slt, %tmp11_26, %cst_2 : tensor<1x32xi64> loc(#loc113) + %tmp15 = arith.select %tmp14, %tmp13_27, %tmp11_26 : tensor<1x32xi1>, tensor<1x32xi64> loc(#loc114) + %2 = arith.cmpi sge, %tmp15, %cst_2 : tensor<1x32xi64> loc(#loc45) + %3 = arith.addi %ks1, %c127_i64 : i64 loc(#loc46) + %quot = arith.divsi %3, %c128_i64 : i64 loc(#loc130) + %remainder = arith.remsi %3, %c128_i64 : i64 loc(#loc131) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc132) + %fixed_28 = arith.subi %quot, %c1_i64 : i64 loc(#loc133) + %fixed_29 = arith.select %fixed, %fixed_28, %quot : i64 loc(#loc134) + %4 = arith.cmpi slt, %3, %c0_i64 : i64 loc(#loc120) + %5 = arith.select %4, %fixed_29, %quot : i64 loc(#loc121) + %6 = arith.addi %5, %c1_i64 : i64 loc(#loc55) + %7 = tt.splat %6 : i64 -> tensor<1x32xi64> loc(#loc56) + %8 = arith.cmpi slt, %tmp15, %7 : tensor<1x32xi64> loc(#loc56) + %9 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc57) + %10 = arith.xori %tmp6_22, %cst_1 : tensor<1x32xi1> loc(#loc58) + %11 = arith.ori %9, %10 : tensor<1x32xi1> loc(#loc59) + tt.assert %11, "index out of bounds: 0 <= tmp15 < 1 + (triton_helpers.div_floor_integer(127 + ks1, 128))" : tensor<1x32xi1> loc(#loc60) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc61) + %13 = tt.addptr %12, %tmp6_18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc61) + tt.store %13, %tmp7, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc62) + %14 = tt.splat %tmp6_14 : i64 -> tensor<1x32xi64> loc(#loc63) + %15 = arith.addi %tmp15, %14 : tensor<1x32xi64> loc(#loc63) + %16 = arith.muli %ks0, %tmp6_14 : i64 loc(#loc64) + %17 = tt.splat %16 : i64 -> tensor<1x32xi64> loc(#loc122) + %18 = arith.addi %15, %17 : tensor<1x32xi64> loc(#loc65) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<1x32x!tt.ptr> loc(#loc66) + %20 = tt.addptr %19, %18 : tensor<1x32x!tt.ptr>, tensor<1x32xi64> loc(#loc66) + tt.store %20, %cst_0, %tmp6_22 : tensor<1x32x!tt.ptr> loc(#loc67) + } loc(#loc25) + tt.return loc(#loc68) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":24:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":22:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":25:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":25:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":29:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":35:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":36:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":40:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":41:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":42:36) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:40) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":44:31) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:60) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:77) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:68) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:52) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:45) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:103) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":49:93) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":50:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":52:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":54:37) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":56:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":57:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":58:39) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:32) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:94) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:100) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:55) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:50) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:112) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:110) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":59:130) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:29) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":61:94) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:53) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:62) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:58) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:29) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":62:95) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2x/c2xunts4zntd65pabgkkxg5ylyh7sahfyogzmgljfiljdui4o365.py":43:4) +#loc78 = loc("xmask"(#loc1)) +#loc79 = loc("xoffset"(#loc3)) +#loc80 = loc("r0_base"(#loc4)) +#loc81 = loc("r0_base"(#loc5)) +#loc82 = loc("_tmp3"(#loc6)) +#loc83 = loc("r0_index"(#loc7)) +#loc84 = loc("r0_mask"(#loc8)) +#loc85 = loc("tmp0"(#loc9)) +#loc86 = loc("tmp0"(#loc10)) +#loc87 = loc("tmp0"(#loc11)) +#loc88 = loc("tmp0"(#loc12)) +#loc89 = loc("tmp0"(#loc13)) +#loc90 = loc("tmp1"(#loc14)) +#loc91 = loc("tmp4"(#loc15)) +#loc92 = loc("_tmp3"(#loc16)) +#loc94 = loc("tmp3"(#loc21)) +#loc95 = loc("tmp5"(#loc22)) +#loc96 = loc("r0_index"(#loc26)) +#loc97 = loc("r0_mask"(#loc27)) +#loc98 = loc("tmp6"(#loc28)) +#loc99 = loc("tmp6"(#loc29)) +#loc100 = loc("tmp6"(#loc30)) +#loc101 = loc("tmp6"(#loc31)) +#loc102 = loc("tmp6"(#loc32)) +#loc103 = loc("tmp6"(#loc33)) +#loc104 = loc("tmp6"(#loc34)) +#loc105 = loc("tmp6"(#loc35)) +#loc106 = loc("tmp6"(#loc36)) +#loc107 = loc("tmp6"(#loc37)) +#loc108 = loc("tmp7"(#loc38)) +#loc109 = loc("tmp9"(#loc39)) +#loc110 = loc("tmp11"(#loc40)) +#loc111 = loc("tmp12"(#loc41)) +#loc112 = loc("tmp13"(#loc42)) +#loc113 = loc("tmp14"(#loc43)) +#loc114 = loc("tmp15"(#loc44)) +#loc115 = loc("quot"(#loc47)) +#loc116 = loc("remainder"(#loc49)) +#loc117 = loc("fixed"(#loc50)) +#loc118 = loc("fixed"(#loc51)) +#loc119 = loc("fixed"(#loc52)) +#loc120 = loc(callsite(#loc53 at #loc48)) +#loc121 = loc(callsite(#loc54 at #loc48)) +#loc122 = loc(fused[#loc65, #loc64]) +#loc123 = loc(fused[#loc86, #loc85]) +#loc124 = loc(fused[#loc88, #loc78]) +#loc125 = loc(callsite(#loc18 at #loc93)) +#loc127 = loc(fused[#loc101, #loc102]) +#loc128 = loc(fused[#loc104, #loc103]) +#loc129 = loc(fused[#loc106, #loc78]) +#loc130 = loc(callsite(#loc115 at #loc48)) +#loc131 = loc(callsite(#loc116 at #loc48)) +#loc132 = loc(callsite(#loc117 at #loc48)) +#loc133 = loc(callsite(#loc118 at #loc48)) +#loc134 = loc(callsite(#loc119 at #loc48)) +#loc135 = loc(callsite(#loc20 at #loc125)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..deb24c799a66dc898a3d9a7a0629ab5ba618d1fd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e5cdf67e428691ba042b276a2320bc682f4b2d24 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..86bae31e130821cf72394459718eaa271f72fd9e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "519d2d6d1d5f736c729d00db5be52245ff987e34efd87be6f89f1c79297f4591", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..9a49b39ab7ab055107307432c41e2314c0842eb3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.llir @@ -0,0 +1,483 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 6, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 496, !dbg !9 + %11 = lshr exact i32 %10, 4, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = or disjoint i32 %12, 32, !dbg !10 + %14 = shl nuw nsw i32 %9, 2, !dbg !11 + %15 = and i32 %14, 60, !dbg !11 + %16 = sdiv i32 %12, 2048, !dbg !12 + %17 = sdiv i32 %13, 2048, !dbg !12 + %18 = mul i32 %12, 32000 + %19 = mul i32 %16, 224000 + %20 = add i32 %19, %18 + %21 = mul i32 %13, 32000 + %22 = mul i32 %17, 224000 + %23 = add i32 %22, %21 + %24 = zext nneg i32 %15 to i64, !dbg !13 + br label %25, !dbg !13 + +25: ; preds = %6, %25 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %25 ] + %26 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %91, %25 ] + %27 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %97, %25 ] + %28 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %92, %25 ] + %29 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %120, %25 ] + %30 = phi <4 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %117, %25 ] + %31 = phi <4 x i32> [ splat (i32 2147483647), %6 ], [ %121, %25 ] + %32 = or disjoint i64 %indvars.iv, %24, !dbg !14 + %33 = trunc nuw nsw i64 %32 to i32, !dbg !15 + %34 = add i32 %20, %33, !dbg !15 + %35 = add i32 %23, %33, !dbg !15 + %36 = sext i32 %34 to i64, !dbg !16 + %37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !16 + %38 = sext i32 %35 to i64, !dbg !16 + %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !16 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %41 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %37, i64 %40, i1 true) #4, !dbg !17 + %42 = extractvalue { i32, i32, i32, i32 } %41, 0, !dbg !17 + %43 = extractvalue { i32, i32, i32, i32 } %41, 1, !dbg !17 + %44 = extractvalue { i32, i32, i32, i32 } %41, 2, !dbg !17 + %45 = extractvalue { i32, i32, i32, i32 } %41, 3, !dbg !17 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %39, i64 %46, i1 true) #4, !dbg !17 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !17 + %49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !17 + %50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !17 + %51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !17 + %52 = fcmp uno <2 x float> %26, zeroinitializer, !dbg !18 + %53 = fcmp uno <4 x float> %30, zeroinitializer, !dbg !18 + %54 = fcmp uno <2 x float> %28, zeroinitializer, !dbg !18 + %55 = sext <2 x i32> %27 to <2 x i64>, !dbg !22 + %56 = sext <2 x i32> %29 to <2 x i64>, !dbg !22 + %57 = insertelement <2 x i32> poison, i32 %42, i64 0, !dbg !17 + %58 = insertelement <2 x i32> %57, i32 %43, i64 1, !dbg !17 + %59 = bitcast <2 x i32> %58 to <2 x float>, !dbg !17 + %60 = fcmp ogt <2 x float> %26, %59, !dbg !23 + %61 = fcmp oeq <2 x float> %26, %59, !dbg !24 + %62 = fcmp uno <2 x float> %59, zeroinitializer, !dbg !25 + %63 = xor <2 x i1> %62, splat (i1 true), !dbg !26 + %64 = and <2 x i1> %52, %63, !dbg !27 + %65 = or <2 x i1> %60, %64, !dbg !28 + %66 = and <2 x i1> %52, %62, !dbg !29 + %67 = or <2 x i1> %61, %66, !dbg !30 + %68 = insertelement <2 x i64> poison, i64 %32, i64 0, !dbg !22 + %69 = shufflevector <2 x i64> %68, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !22 + %70 = icmp sgt <2 x i64> %69, %55, !dbg !22 + %71 = icmp sge <2 x i64> %69, %55, !dbg !22 + %72 = shufflevector <2 x i1> %70, <2 x i1> %71, <2 x i32> , !dbg !22 + %73 = and <2 x i1> %72, %67, !dbg !31 + %74 = or <2 x i1> %65, %73, !dbg !32 + %75 = insertelement <2 x i32> poison, i32 %48, i64 0, !dbg !17 + %76 = insertelement <2 x i32> %75, i32 %49, i64 1, !dbg !17 + %77 = bitcast <2 x i32> %76 to <2 x float>, !dbg !17 + %78 = fcmp ogt <2 x float> %28, %77, !dbg !23 + %79 = fcmp oeq <2 x float> %28, %77, !dbg !24 + %80 = fcmp uno <2 x float> %77, zeroinitializer, !dbg !25 + %81 = xor <2 x i1> %80, splat (i1 true), !dbg !26 + %82 = and <2 x i1> %54, %81, !dbg !27 + %83 = or <2 x i1> %78, %82, !dbg !28 + %84 = and <2 x i1> %54, %80, !dbg !29 + %85 = or <2 x i1> %79, %84, !dbg !30 + %86 = icmp sgt <2 x i64> %69, %56, !dbg !22 + %87 = icmp sge <2 x i64> %69, %56, !dbg !22 + %88 = shufflevector <2 x i1> %86, <2 x i1> %87, <2 x i32> , !dbg !22 + %89 = and <2 x i1> %88, %85, !dbg !31 + %90 = or <2 x i1> %83, %89, !dbg !32 + %91 = select <2 x i1> %74, <2 x float> %26, <2 x float> %59, !dbg !33 + %92 = select <2 x i1> %90, <2 x float> %28, <2 x float> %77, !dbg !33 + %93 = trunc nuw nsw i64 %32 to i32, !dbg !34 + %94 = or disjoint i32 %93, 1, !dbg !34 + %95 = insertelement <2 x i32> poison, i32 %33, i64 0, !dbg !34 + %96 = insertelement <2 x i32> %95, i32 %94, i64 1, !dbg !34 + %97 = select <2 x i1> %74, <2 x i32> %27, <2 x i32> %96, !dbg !34 + %98 = or disjoint <2 x i64> %69, , !dbg !14 + %99 = shufflevector <2 x i64> %98, <2 x i64> poison, <4 x i32> , !dbg !14 + %100 = insertelement <4 x i32> poison, i32 %51, i64 0, !dbg !17 + %101 = insertelement <4 x i32> %100, i32 %50, i64 1, !dbg !17 + %102 = insertelement <4 x i32> %101, i32 %45, i64 2, !dbg !17 + %103 = insertelement <4 x i32> %102, i32 %44, i64 3, !dbg !17 + %104 = bitcast <4 x i32> %103 to <4 x float>, !dbg !17 + %105 = fcmp ogt <4 x float> %30, %104, !dbg !23 + %106 = fcmp oeq <4 x float> %30, %104, !dbg !24 + %107 = fcmp uno <4 x float> %104, zeroinitializer, !dbg !25 + %108 = xor <4 x i1> %107, splat (i1 true), !dbg !26 + %109 = and <4 x i1> %53, %108, !dbg !27 + %110 = or <4 x i1> %105, %109, !dbg !28 + %111 = and <4 x i1> %53, %107, !dbg !29 + %112 = or <4 x i1> %106, %111, !dbg !30 + %113 = sext <4 x i32> %31 to <4 x i64>, !dbg !22 + %114 = icmp sgt <4 x i64> %99, %113, !dbg !22 + %115 = and <4 x i1> %114, %112, !dbg !31 + %116 = or <4 x i1> %110, %115, !dbg !32 + %117 = select <4 x i1> %116, <4 x float> %30, <4 x float> %104, !dbg !33 + %118 = trunc <2 x i64> %98 to <2 x i32>, !dbg !34 + %119 = shufflevector <2 x i32> %118, <2 x i32> poison, <4 x i32> , !dbg !34 + %120 = select <2 x i1> %90, <2 x i32> %29, <2 x i32> %96, !dbg !34 + %121 = select <4 x i1> %116, <4 x i32> %31, <4 x i32> %119, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 64, !dbg !13 + %122 = icmp samesign ult i64 %indvars.iv, 31936, !dbg !13 + br i1 %122, label %25, label %123, !dbg !13 + +123: ; preds = %25 + %124 = and i32 %9, 63, !dbg !9 + %125 = or disjoint i32 %8, %124, !dbg !10 + %126 = shufflevector <2 x float> %91, <2 x float> poison, <2 x i32> , !dbg !35 + %127 = fcmp ogt <2 x float> %91, %126, !dbg !35 + %128 = fcmp oeq <2 x float> %91, %126, !dbg !35 + %129 = shufflevector <2 x i1> %127, <2 x i1> %128, <2 x i32> , !dbg !35 + %130 = extractelement <2 x float> %91, i64 0, !dbg !37 + %131 = fcmp uno float %130, 0.000000e+00, !dbg !37 + %132 = extractelement <2 x float> %91, i64 1, !dbg !38 + %133 = fcmp uno float %132, 0.000000e+00, !dbg !38 + %134 = xor i1 %133, true, !dbg !39 + %135 = insertelement <2 x i1> poison, i1 %131, i64 0, !dbg !40 + %136 = shufflevector <2 x i1> %135, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %137 = insertelement <2 x i1> poison, i1 %134, i64 0, !dbg !40 + %138 = insertelement <2 x i1> %137, i1 %133, i64 1, !dbg !40 + %139 = and <2 x i1> %136, %138, !dbg !40 + %140 = or <2 x i1> %129, %139, !dbg !41 + %141 = extractelement <2 x i32> %97, i64 0, !dbg !42 + %142 = extractelement <2 x i32> %97, i64 1, !dbg !42 + %143 = icmp slt i32 %141, %142, !dbg !42 + %144 = extractelement <2 x i1> %140, i64 1, !dbg !43 + %145 = and i1 %143, %144, !dbg !43 + %146 = extractelement <2 x i1> %140, i64 0, !dbg !44 + %147 = or i1 %146, %145, !dbg !44 + %148 = select i1 %147, float %130, float %132, !dbg !45 + %149 = select i1 %147, i32 %141, i32 %142, !dbg !46 + %150 = extractelement <4 x float> %117, i64 3, !dbg !35 + %151 = fcmp ogt float %148, %150, !dbg !35 + %152 = fcmp oeq float %148, %150, !dbg !47 + %153 = fcmp uno float %148, 0.000000e+00, !dbg !37 + %154 = fcmp uno <4 x float> %117, zeroinitializer, !dbg !38 + %155 = extractelement <4 x i1> %154, i64 3, !dbg !48 + %156 = xor i1 %155, true, !dbg !39 + %157 = and i1 %153, %156, !dbg !40 + %158 = or i1 %151, %157, !dbg !41 + %159 = and i1 %155, %153, !dbg !48 + %160 = or i1 %152, %159, !dbg !49 + %161 = extractelement <4 x i32> %121, i64 3, !dbg !42 + %162 = icmp slt i32 %149, %161, !dbg !42 + %163 = and i1 %162, %160, !dbg !43 + %164 = or i1 %158, %163, !dbg !44 + %165 = select i1 %164, float %148, float %150, !dbg !45 + %166 = select i1 %164, i32 %149, i32 %161, !dbg !46 + %167 = extractelement <4 x float> %117, i64 2, !dbg !35 + %168 = fcmp ogt float %165, %167, !dbg !35 + %169 = fcmp oeq float %165, %167, !dbg !47 + %170 = fcmp uno float %165, 0.000000e+00, !dbg !37 + %171 = extractelement <4 x i1> %154, i64 2, !dbg !48 + %172 = xor i1 %171, true, !dbg !39 + %173 = and i1 %170, %172, !dbg !40 + %174 = or i1 %168, %173, !dbg !41 + %175 = and i1 %171, %170, !dbg !48 + %176 = or i1 %169, %175, !dbg !49 + %177 = extractelement <4 x i32> %121, i64 2, !dbg !42 + %178 = icmp slt i32 %166, %177, !dbg !42 + %179 = and i1 %178, %176, !dbg !43 + %180 = or i1 %174, %179, !dbg !44 + %181 = select i1 %180, float %165, float %167, !dbg !45 + %182 = select i1 %180, i32 %166, i32 %177, !dbg !46 + %183 = shufflevector <2 x float> %92, <2 x float> poison, <2 x i32> , !dbg !35 + %184 = fcmp ogt <2 x float> %92, %183, !dbg !35 + %185 = fcmp oeq <2 x float> %92, %183, !dbg !35 + %186 = shufflevector <2 x i1> %184, <2 x i1> %185, <2 x i32> , !dbg !35 + %187 = extractelement <2 x float> %92, i64 0, !dbg !37 + %188 = fcmp uno float %187, 0.000000e+00, !dbg !37 + %189 = extractelement <2 x float> %92, i64 1, !dbg !38 + %190 = fcmp uno float %189, 0.000000e+00, !dbg !38 + %191 = xor i1 %190, true, !dbg !39 + %192 = insertelement <2 x i1> poison, i1 %188, i64 0, !dbg !40 + %193 = shufflevector <2 x i1> %192, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %194 = insertelement <2 x i1> poison, i1 %191, i64 0, !dbg !40 + %195 = insertelement <2 x i1> %194, i1 %190, i64 1, !dbg !40 + %196 = and <2 x i1> %193, %195, !dbg !40 + %197 = or <2 x i1> %186, %196, !dbg !41 + %198 = extractelement <2 x i32> %120, i64 0, !dbg !42 + %199 = extractelement <2 x i32> %120, i64 1, !dbg !42 + %200 = icmp slt i32 %198, %199, !dbg !42 + %201 = extractelement <2 x i1> %197, i64 1, !dbg !43 + %202 = and i1 %200, %201, !dbg !43 + %203 = extractelement <2 x i1> %197, i64 0, !dbg !44 + %204 = or i1 %203, %202, !dbg !44 + %205 = select i1 %204, float %187, float %189, !dbg !45 + %206 = select i1 %204, i32 %198, i32 %199, !dbg !46 + %207 = extractelement <4 x float> %117, i64 1, !dbg !35 + %208 = fcmp ogt float %205, %207, !dbg !35 + %209 = fcmp oeq float %205, %207, !dbg !47 + %210 = fcmp uno float %205, 0.000000e+00, !dbg !37 + %211 = extractelement <4 x i1> %154, i64 1, !dbg !48 + %212 = xor i1 %211, true, !dbg !39 + %213 = and i1 %210, %212, !dbg !40 + %214 = or i1 %208, %213, !dbg !41 + %215 = and i1 %211, %210, !dbg !48 + %216 = or i1 %209, %215, !dbg !49 + %217 = extractelement <4 x i32> %121, i64 1, !dbg !42 + %218 = icmp slt i32 %206, %217, !dbg !42 + %219 = and i1 %218, %216, !dbg !43 + %220 = or i1 %214, %219, !dbg !44 + %221 = select i1 %220, float %205, float %207, !dbg !45 + %222 = select i1 %220, i32 %206, i32 %217, !dbg !46 + %223 = extractelement <4 x float> %117, i64 0, !dbg !35 + %224 = fcmp ogt float %221, %223, !dbg !35 + %225 = fcmp oeq float %221, %223, !dbg !47 + %226 = fcmp uno float %221, 0.000000e+00, !dbg !37 + %227 = extractelement <4 x i1> %154, i64 0, !dbg !48 + %228 = xor i1 %227, true, !dbg !39 + %229 = and i1 %226, %228, !dbg !40 + %230 = or i1 %224, %229, !dbg !41 + %231 = and i1 %227, %226, !dbg !48 + %232 = or i1 %225, %231, !dbg !49 + %233 = extractelement <4 x i32> %121, i64 0, !dbg !42 + %234 = icmp slt i32 %222, %233, !dbg !42 + %235 = and i1 %234, %232, !dbg !43 + %236 = or i1 %230, %235, !dbg !44 + %237 = select i1 %236, float %221, float %223, !dbg !45 + %238 = select i1 %236, i32 %222, i32 %233, !dbg !46 + %239 = bitcast float %181 to i32, !dbg !50 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 8, i32 31), !dbg !50 + %241 = bitcast i32 %240 to float, !dbg !50 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %182, i32 8, i32 31), !dbg !50 + %243 = fcmp ogt float %181, %241, !dbg !35 + %244 = fcmp oeq float %181, %241, !dbg !47 + %245 = fcmp uno float %181, 0.000000e+00, !dbg !37 + %246 = fcmp uno float %241, 0.000000e+00, !dbg !38 + %247 = xor i1 %246, true, !dbg !39 + %248 = and i1 %245, %247, !dbg !40 + %249 = or i1 %243, %248, !dbg !41 + %250 = and i1 %245, %246, !dbg !48 + %251 = or i1 %244, %250, !dbg !49 + %252 = icmp slt i32 %182, %242, !dbg !42 + %253 = and i1 %252, %251, !dbg !43 + %254 = or i1 %249, %253, !dbg !44 + %255 = select i1 %254, float %181, float %241, !dbg !45 + %256 = select i1 %254, i32 %182, i32 %242, !dbg !46 + %257 = bitcast float %255 to i32, !dbg !50 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 4, i32 31), !dbg !50 + %259 = bitcast i32 %258 to float, !dbg !50 + %260 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 4, i32 31), !dbg !50 + %261 = fcmp ogt float %255, %259, !dbg !35 + %262 = fcmp oeq float %255, %259, !dbg !47 + %263 = fcmp uno float %255, 0.000000e+00, !dbg !37 + %264 = fcmp uno float %259, 0.000000e+00, !dbg !38 + %265 = xor i1 %264, true, !dbg !39 + %266 = and i1 %263, %265, !dbg !40 + %267 = or i1 %261, %266, !dbg !41 + %268 = and i1 %264, %263, !dbg !48 + %269 = or i1 %262, %268, !dbg !49 + %270 = icmp slt i32 %256, %260, !dbg !42 + %271 = and i1 %270, %269, !dbg !43 + %272 = or i1 %267, %271, !dbg !44 + %273 = select i1 %272, float %255, float %259, !dbg !45 + %274 = select i1 %272, i32 %256, i32 %260, !dbg !46 + %275 = bitcast float %273 to i32, !dbg !50 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 2, i32 31), !dbg !50 + %277 = bitcast i32 %276 to float, !dbg !50 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 2, i32 31), !dbg !50 + %279 = fcmp ogt float %273, %277, !dbg !35 + %280 = fcmp oeq float %273, %277, !dbg !47 + %281 = fcmp uno float %273, 0.000000e+00, !dbg !37 + %282 = fcmp uno float %277, 0.000000e+00, !dbg !38 + %283 = xor i1 %282, true, !dbg !39 + %284 = and i1 %281, %283, !dbg !40 + %285 = or i1 %279, %284, !dbg !41 + %286 = and i1 %282, %281, !dbg !48 + %287 = or i1 %280, %286, !dbg !49 + %288 = icmp slt i32 %274, %278, !dbg !42 + %289 = and i1 %288, %287, !dbg !43 + %290 = or i1 %285, %289, !dbg !44 + %291 = select i1 %290, float %273, float %277, !dbg !45 + %292 = select i1 %290, i32 %274, i32 %278, !dbg !46 + %293 = bitcast float %291 to i32, !dbg !50 + %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !50 + %295 = bitcast i32 %294 to float, !dbg !50 + %296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !50 + %297 = fcmp ogt float %291, %295, !dbg !35 + %298 = fcmp oeq float %291, %295, !dbg !47 + %299 = fcmp uno float %291, 0.000000e+00, !dbg !37 + %300 = fcmp uno float %295, 0.000000e+00, !dbg !38 + %301 = xor i1 %300, true, !dbg !39 + %302 = and i1 %299, %301, !dbg !40 + %303 = or i1 %297, %302, !dbg !41 + %304 = and i1 %300, %299, !dbg !48 + %305 = or i1 %298, %304, !dbg !49 + %306 = icmp slt i32 %292, %296, !dbg !42 + %307 = and i1 %306, %305, !dbg !43 + %308 = or i1 %303, %307, !dbg !44 + %309 = select i1 %308, i32 %292, i32 %296, !dbg !46 + %310 = bitcast float %237 to i32, !dbg !50 + %311 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %310, i32 8, i32 31), !dbg !50 + %312 = bitcast i32 %311 to float, !dbg !50 + %313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 8, i32 31), !dbg !50 + %314 = fcmp ogt float %237, %312, !dbg !35 + %315 = fcmp oeq float %237, %312, !dbg !47 + %316 = fcmp uno float %237, 0.000000e+00, !dbg !37 + %317 = fcmp uno float %312, 0.000000e+00, !dbg !38 + %318 = xor i1 %317, true, !dbg !39 + %319 = and i1 %316, %318, !dbg !40 + %320 = or i1 %314, %319, !dbg !41 + %321 = and i1 %316, %317, !dbg !48 + %322 = or i1 %315, %321, !dbg !49 + %323 = icmp slt i32 %238, %313, !dbg !42 + %324 = and i1 %323, %322, !dbg !43 + %325 = or i1 %320, %324, !dbg !44 + %326 = select i1 %325, float %237, float %312, !dbg !45 + %327 = select i1 %325, i32 %238, i32 %313, !dbg !46 + %328 = bitcast float %326 to i32, !dbg !50 + %329 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 4, i32 31), !dbg !50 + %330 = bitcast i32 %329 to float, !dbg !50 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 4, i32 31), !dbg !50 + %332 = fcmp ogt float %326, %330, !dbg !35 + %333 = fcmp oeq float %326, %330, !dbg !47 + %334 = fcmp uno float %326, 0.000000e+00, !dbg !37 + %335 = fcmp uno float %330, 0.000000e+00, !dbg !38 + %336 = xor i1 %335, true, !dbg !39 + %337 = and i1 %334, %336, !dbg !40 + %338 = or i1 %332, %337, !dbg !41 + %339 = and i1 %335, %334, !dbg !48 + %340 = or i1 %333, %339, !dbg !49 + %341 = icmp slt i32 %327, %331, !dbg !42 + %342 = and i1 %341, %340, !dbg !43 + %343 = or i1 %338, %342, !dbg !44 + %344 = select i1 %343, float %326, float %330, !dbg !45 + %345 = select i1 %343, i32 %327, i32 %331, !dbg !46 + %346 = bitcast float %344 to i32, !dbg !50 + %347 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %346, i32 2, i32 31), !dbg !50 + %348 = bitcast i32 %347 to float, !dbg !50 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 2, i32 31), !dbg !50 + %350 = fcmp ogt float %344, %348, !dbg !35 + %351 = fcmp oeq float %344, %348, !dbg !47 + %352 = fcmp uno float %344, 0.000000e+00, !dbg !37 + %353 = fcmp uno float %348, 0.000000e+00, !dbg !38 + %354 = xor i1 %353, true, !dbg !39 + %355 = and i1 %352, %354, !dbg !40 + %356 = or i1 %350, %355, !dbg !41 + %357 = and i1 %353, %352, !dbg !48 + %358 = or i1 %351, %357, !dbg !49 + %359 = icmp slt i32 %345, %349, !dbg !42 + %360 = and i1 %359, %358, !dbg !43 + %361 = or i1 %356, %360, !dbg !44 + %362 = select i1 %361, float %344, float %348, !dbg !45 + %363 = select i1 %361, i32 %345, i32 %349, !dbg !46 + %364 = bitcast float %362 to i32, !dbg !50 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 1, i32 31), !dbg !50 + %366 = bitcast i32 %365 to float, !dbg !50 + %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !50 + %368 = fcmp ogt float %362, %366, !dbg !35 + %369 = fcmp oeq float %362, %366, !dbg !47 + %370 = fcmp uno float %362, 0.000000e+00, !dbg !37 + %371 = fcmp uno float %366, 0.000000e+00, !dbg !38 + %372 = xor i1 %371, true, !dbg !39 + %373 = and i1 %370, %372, !dbg !40 + %374 = or i1 %368, %373, !dbg !41 + %375 = and i1 %371, %370, !dbg !48 + %376 = or i1 %369, %375, !dbg !49 + %377 = icmp slt i32 %363, %367, !dbg !42 + %378 = and i1 %377, %376, !dbg !43 + %379 = or i1 %374, %378, !dbg !44 + %380 = select i1 %379, i32 %363, i32 %367, !dbg !46 + %381 = sext i32 %125 to i64, !dbg !51 + %382 = getelementptr i64, ptr addrspace(1) %1, i64 %381, !dbg !51 + %383 = lshr exact i32 %10, 2, !dbg !52 + %384 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %383, !dbg !52 + %385 = insertelement <1 x i32> poison, i32 %309, i64 0, !dbg !52 + store <1 x i32> %385, ptr addrspace(3) %384, align 4, !dbg !52 + %386 = getelementptr inbounds nuw i8, ptr addrspace(3) %384, i32 128, !dbg !52 + %387 = insertelement <1 x i32> poison, i32 %380, i64 0, !dbg !52 + store <1 x i32> %387, ptr addrspace(3) %386, align 4, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %388 = shl nuw nsw i32 %124, 2, !dbg !52 + %389 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %388, !dbg !52 + %390 = load i32, ptr addrspace(3) %389, align 4, !dbg !52 + %391 = sext i32 %390 to i64, !dbg !52 + %392 = and i32 %9, 448, !dbg !52 + %393 = icmp eq i32 %392, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %391, ptr addrspace(1) %382, i1 %393) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 33, column: 40, scope: !4) +!14 = !DILocation(line: 34, column: 31, scope: !4) +!15 = !DILocation(line: 39, column: 52, scope: !4) +!16 = !DILocation(line: 39, column: 34, scope: !4) +!17 = !DILocation(line: 39, column: 66, scope: !4) +!18 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 38, scope: !4) +!22 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !36) +!47 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !36) +!48 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !36) +!49 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !36) +!50 = !DILocation(line: 165, column: 42, scope: !19, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..51a986d74f86e91a79a0229106707eaa0c41966b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ptx @@ -0,0 +1,977 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<255>; + .reg .b32 %r<173>; + .reg .b64 %rd<63>; + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd14, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:28 + mov.u32 %r26, %ctaid.x; + .loc 1 23 33 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:33 + shl.b32 %r1, %r26, 6; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 496; + bfe.u32 %r27, %r2, 4, 5; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r28, %r27, %r1; + .loc 1 26 37 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:26:37 + shl.b32 %r29, %r2, 2; + and.b32 %r30, %r29, 60; + .loc 1 29 19 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:29:19 + bfe.s32 %r31, %r26, 25, 1; + shr.u32 %r32, %r31, 21; + add.s32 %r33, %r32, %r28; + shr.s32 %r34, %r33, 11; + add.s32 %r35, %r33, 32; + shr.s32 %r36, %r35, 11; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + cvt.u64.u32 %rd1, %r30; + mul.lo.s32 %r37, %r26, 2048000; + mad.lo.s32 %r38, %r36, 224000, %r37; + mul.lo.s32 %r39, %r27, 32000; + add.s32 %r40, %r38, %r39; + or.b32 %r41, %r40, %r30; + cvt.u64.u32 %rd2, %r41; + mad.lo.s32 %r42, %r34, 224000, %r37; + add.s32 %r43, %r42, %r39; + or.b32 %r44, %r43, %r30; + cvt.u64.u32 %rd3, %r44; + mov.b32 %r45, 0fFF800000; + mov.b64 %rd59, {%r45, %r45}; + mov.b32 %r165, 2147483647; + mov.b64 %rd58, -64; + mov.b32 %r166, %r165; + mov.b64 %rd60, %rd59; + mov.b32 %r167, %r165; + mov.b32 %r168, %r165; + mov.b64 %rd61, %rd59; + mov.b64 %rd62, %rd59; + mov.b32 %r169, %r165; + mov.b32 %r170, %r165; + mov.b32 %r171, %r165; + mov.b32 %r172, %r165; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 52 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:52 + add.s64 %rd26, %rd1, %rd58; + add.s64 %rd27, %rd26, 64; + add.s64 %rd28, %rd3, %rd58; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + add.s64 %rd29, %rd2, %rd58; + cvt.u32.u64 %r62, %rd28; + add.s32 %r63, %r62, 64; + mad.wide.s32 %rd21, %r63, 4, %rd14; + cvt.u32.u64 %r64, %rd29; + add.s32 %r65, %r64, 1024064; + mad.wide.s32 %rd24, %r65, 4, %rd14; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r50, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r46, %r50; + mov.u32 %r47, %r50; + mov.u32 %r48, %r50; + mov.u32 %r49, %r50; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd21 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r54, %r50; + mov.u32 %r55, %r50; + mov.u32 %r56, %r50; + mov.u32 %r57, %r50; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd24 + 0 ], %rd23; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r66, %r67}, %rd59; + setp.nan.f32 %p3, %r67, %r67; + setp.nan.f32 %p4, %r66, %r66; + mov.b64 {%r68, %r69}, %rd62; + setp.nan.f32 %p5, %r69, %r69; + setp.nan.f32 %p6, %r68, %r68; + mov.b64 {%r70, %r71}, %rd61; + setp.nan.f32 %p7, %r71, %r71; + setp.nan.f32 %p8, %r70, %r70; + mov.b64 {%r72, %r73}, %rd60; + setp.nan.f32 %p9, %r73, %r73; + setp.nan.f32 %p10, %r72, %r72; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd30, %r166; + cvt.s64.s32 %rd31, %r165; + cvt.s64.s32 %rd32, %r168; + cvt.s64.s32 %rd33, %r167; +$L__tmp2: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd34, %r46; + cvt.u64.u32 %rd35, %r47; + shl.b64 %rd36, %rd35, 32; + or.b64 %rd37, %rd34, %rd36; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r74, %r75}, %rd37; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p11, %r66, %r74; + setp.gt.f32 %p12, %r67, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p13, %r67, %r75; + setp.eq.f32 %p14, %r66, %r74; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p15, %r74, %r74; + setp.nan.f32 %p16, %r75, %r75; + setp.num.f32 %p17, %r75, %r75; + setp.num.f32 %p18, %r74, %r74; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p19, %p4, %p18; + and.pred %p20, %p3, %p17; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p21, %p12, %p20; + or.pred %p22, %p11, %p19; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p23, %p3, %p16; + and.pred %p24, %p4, %p15; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p25, %p14, %p24; + or.pred %p26, %p13, %p23; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p27, %rd27, %rd31; + setp.ge.s64 %p28, %rd27, %rd30; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p29, %p28, %p26; + and.pred %p30, %p27, %p25; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; +$L__tmp4: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd38, %r54; + cvt.u64.u32 %rd39, %r55; + shl.b64 %rd40, %rd39, 32; + or.b64 %rd41, %rd38, %rd40; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r76, %r77}, %rd41; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p33, %r72, %r76; + setp.gt.f32 %p34, %r73, %r77; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p35, %r73, %r77; + setp.eq.f32 %p36, %r72, %r76; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p37, %r76, %r76; + setp.nan.f32 %p38, %r77, %r77; + setp.num.f32 %p39, %r77, %r77; + setp.num.f32 %p40, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p41, %p10, %p40; + and.pred %p42, %p9, %p39; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p43, %p34, %p42; + or.pred %p44, %p33, %p41; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p45, %p9, %p38; + and.pred %p46, %p10, %p37; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p47, %p36, %p46; + or.pred %p48, %p35, %p45; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p49, %rd27, %rd33; + setp.ge.s64 %p50, %rd27, %rd32; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p51, %p50, %p48; + and.pred %p52, %p49, %p47; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p53, %p44, %p52; + or.pred %p54, %p43, %p51; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r78, %r67, %r75, %p32; + selp.f32 %r79, %r66, %r74, %p31; + mov.b64 %rd59, {%r79, %r78}; + selp.f32 %r80, %r73, %r77, %p54; + selp.f32 %r81, %r72, %r76, %p53; + mov.b64 %rd60, {%r81, %r80}; + cvt.u32.u64 %r82, %rd27; + cvt.u32.u64 %r83, %rd26; + add.s32 %r84, %r83, 65; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r166, %r166, %r84, %p32; + selp.b32 %r165, %r165, %r82, %p31; +$L__tmp6: + .loc 1 34 31 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:34:31 + add.s64 %rd42, %rd26, 66; + add.s64 %rd43, %rd26, 67; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd44, %r49; + cvt.u64.u32 %rd45, %r48; + shl.b64 %rd46, %rd45, 32; + or.b64 %rd47, %rd44, %rd46; + cvt.u64.u32 %rd48, %r57; + cvt.u64.u32 %rd49, %r56; + shl.b64 %rd50, %rd49, 32; + or.b64 %rd51, %rd48, %rd50; +$L__tmp7: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r85, %r86}, %rd51; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p55, %r70, %r85; + setp.gt.f32 %p56, %r71, %r86; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r87, %r88}, %rd47; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p57, %r68, %r87; + setp.gt.f32 %p58, %r69, %r88; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p59, %r69, %r88; + setp.eq.f32 %p60, %r68, %r87; + setp.eq.f32 %p61, %r71, %r86; + setp.eq.f32 %p62, %r70, %r85; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p63, %r85, %r85; + setp.nan.f32 %p64, %r86, %r86; + setp.nan.f32 %p65, %r87, %r87; + setp.nan.f32 %p66, %r88, %r88; + setp.num.f32 %p67, %r88, %r88; + setp.num.f32 %p68, %r87, %r87; + setp.num.f32 %p69, %r86, %r86; + setp.num.f32 %p70, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p71, %p8, %p70; + and.pred %p72, %p7, %p69; + and.pred %p73, %p6, %p68; + and.pred %p74, %p5, %p67; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p75, %p58, %p74; + or.pred %p76, %p57, %p73; + or.pred %p77, %p56, %p72; + or.pred %p78, %p55, %p71; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p79, %p5, %p66; + and.pred %p80, %p6, %p65; + and.pred %p81, %p7, %p64; + and.pred %p82, %p8, %p63; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p83, %p62, %p82; + or.pred %p84, %p61, %p81; + or.pred %p85, %p60, %p80; + or.pred %p86, %p59, %p79; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd52, %r172; + cvt.s64.s32 %rd53, %r171; + cvt.s64.s32 %rd54, %r170; + cvt.s64.s32 %rd55, %r169; + setp.gt.s64 %p87, %rd43, %rd55; + setp.gt.s64 %p88, %rd42, %rd54; + setp.gt.s64 %p89, %rd43, %rd53; + setp.gt.s64 %p90, %rd42, %rd52; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p91, %p90, %p86; + and.pred %p92, %p89, %p85; + and.pred %p93, %p88, %p84; + and.pred %p94, %p87, %p83; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p95, %p78, %p94; + or.pred %p96, %p77, %p93; + or.pred %p97, %p76, %p92; + or.pred %p98, %p75, %p91; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r89, %r69, %r88, %p98; + selp.f32 %r90, %r68, %r87, %p97; + mov.b64 %rd62, {%r90, %r89}; + selp.f32 %r91, %r71, %r86, %p96; + selp.f32 %r92, %r70, %r85, %p95; + mov.b64 %rd61, {%r92, %r91}; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.u32.u64 %r93, %rd43; + cvt.u32.u64 %r94, %rd42; + selp.b32 %r168, %r168, %r84, %p54; + selp.b32 %r167, %r167, %r82, %p53; + selp.b32 %r172, %r172, %r94, %p98; + selp.b32 %r171, %r171, %r93, %p97; + selp.b32 %r170, %r170, %r94, %p96; + selp.b32 %r169, %r169, %r93, %p95; +$L__tmp8: + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + add.s64 %rd58, %rd58, 64; + setp.lt.u64 %p99, %rd58, 31936; + @%p99 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r95, %r2, 63; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r96, %r1, %r95; +$L__tmp9: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r97, %r98}, %rd59; + setp.gt.f32 %p101, %r97, %r98; + setp.eq.f32 %p102, %r98, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p103, %r97, %r97; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p104, %r98, %r98; + setp.nan.f32 %p105, %r98, %r98; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p106, %p103, %p105; + and.pred %p107, %p103, %p104; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p108, %p101, %p107; + or.pred %p109, %p102, %p106; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p110, %r165, %r166; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p111, %p110, %p109; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p112, %p108, %p111; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r99, %r97, %r98, %p112; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r100, %r165, %r166, %p112; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r101, %r102}, %rd62; + setp.gt.f32 %p113, %r99, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p114, %r99, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p115, %r99, %r99; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r103, %r104}, %rd61; + setp.nan.f32 %p116, %r103, %r103; + setp.num.f32 %p117, %r103, %r103; + setp.nan.f32 %p118, %r104, %r104; + setp.num.f32 %p119, %r104, %r104; + setp.nan.f32 %p120, %r101, %r101; + setp.num.f32 %p121, %r101, %r101; + setp.nan.f32 %p122, %r102, %r102; + setp.num.f32 %p123, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p124, %p115, %p123; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p125, %p113, %p124; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p126, %p122, %p115; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p127, %p114, %p126; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p128, %r100, %r172; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p129, %p128, %p127; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p130, %p125, %p129; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r105, %r99, %r102, %p130; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r106, %r100, %r172, %p130; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p131, %r105, %r101; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p132, %r105, %r101; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p133, %r105, %r105; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p134, %p133, %p121; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p135, %p131, %p134; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p136, %p120, %p133; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p137, %p132, %p136; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p138, %r106, %r171; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p139, %p138, %p137; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p140, %p135, %p139; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r107, %r105, %r101, %p140; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r108, %r106, %r171, %p140; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r109, %r110}, %rd60; + setp.gt.f32 %p141, %r109, %r110; + setp.eq.f32 %p142, %r110, %r109; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p143, %r109, %r109; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p144, %r110, %r110; + setp.nan.f32 %p145, %r110, %r110; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p146, %p143, %p145; + and.pred %p147, %p143, %p144; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p148, %p141, %p147; + or.pred %p149, %p142, %p146; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p150, %r167, %r168; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p151, %p150, %p149; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p152, %p148, %p151; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r111, %r109, %r110, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r112, %r167, %r168, %p152; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p153, %r111, %r104; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p154, %r111, %r104; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p155, %r111, %r111; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p156, %p155, %p119; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p157, %p153, %p156; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p158, %p118, %p155; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p159, %p154, %p158; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p160, %r112, %r170; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p161, %p160, %p159; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p162, %p157, %p161; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r113, %r111, %r104, %p162; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r114, %r112, %r170, %p162; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p163, %r113, %r103; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p164, %r113, %r103; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p165, %r113, %r113; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p166, %p165, %p117; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p167, %p163, %p166; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p168, %p116, %p165; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p169, %p164, %p168; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p170, %r114, %r169; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p171, %p170, %p169; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p172, %p167, %p171; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r115, %r113, %r103, %p172; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r116, %r114, %r169, %p172; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r117, %r107, 8, 31, -1; + shfl.sync.bfly.b32 %r118, %r108, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p173, %r107, %r117; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p174, %r107, %r117; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p175, %r107, %r107; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p176, %r117, %r117; + setp.num.f32 %p177, %r117, %r117; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p178, %p175, %p177; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p179, %p173, %p178; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p180, %p175, %p176; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p181, %p174, %p180; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p182, %r108, %r118; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p183, %p182, %p181; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p184, %p179, %p183; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r119, %r107, %r117, %p184; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r120, %r108, %r118, %p184; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r121, %r119, 4, 31, -1; + shfl.sync.bfly.b32 %r122, %r120, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p185, %r119, %r121; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p186, %r119, %r121; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p187, %r119, %r119; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p188, %r121, %r121; + setp.num.f32 %p189, %r121, %r121; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p190, %p187, %p189; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p191, %p185, %p190; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p192, %p188, %p187; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p193, %p186, %p192; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p194, %r120, %r122; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p195, %p194, %p193; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p196, %p191, %p195; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r123, %r119, %r121, %p196; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r124, %r120, %r122, %p196; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r125, %r123, 2, 31, -1; + shfl.sync.bfly.b32 %r126, %r124, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p197, %r123, %r125; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p198, %r123, %r125; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p199, %r123, %r123; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p200, %r125, %r125; + setp.num.f32 %p201, %r125, %r125; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p202, %p199, %p201; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p203, %p197, %p202; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p204, %p200, %p199; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p205, %p198, %p204; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p206, %r124, %r126; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p207, %p206, %p205; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p208, %p203, %p207; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r127, %r123, %r125, %p208; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r128, %r124, %r126, %p208; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r129, %r127, 1, 31, -1; + shfl.sync.bfly.b32 %r130, %r128, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p209, %r127, %r129; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p210, %r127, %r129; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p211, %r127, %r127; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p212, %r129, %r129; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p213, %r128, %r130; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r131, %r128, %r130, %p211; + selp.b32 %r132, %r131, %r130, %p212; + selp.b32 %r133, %r128, %r132, %p210; + selp.b32 %r134, %r133, %r130, %p213; + selp.b32 %r135, %r134, %r128, %p212; + selp.b32 %r136, %r135, %r134, %p211; + selp.b32 %r137, %r128, %r136, %p209; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r138, %r115, 8, 31, -1; + shfl.sync.bfly.b32 %r139, %r116, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p214, %r115, %r138; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p215, %r115, %r138; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p216, %r115, %r115; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p217, %r138, %r138; + setp.num.f32 %p218, %r138, %r138; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p219, %p216, %p218; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p220, %p214, %p219; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p221, %p216, %p217; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p222, %p215, %p221; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p223, %r116, %r139; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p224, %p223, %p222; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p225, %p220, %p224; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r140, %r115, %r138, %p225; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r141, %r116, %r139, %p225; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r142, %r140, 4, 31, -1; + shfl.sync.bfly.b32 %r143, %r141, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p226, %r140, %r142; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p227, %r140, %r142; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p228, %r140, %r140; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p229, %r142, %r142; + setp.num.f32 %p230, %r142, %r142; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p231, %p228, %p230; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p232, %p226, %p231; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p233, %p229, %p228; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p234, %p227, %p233; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p235, %r141, %r143; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p236, %p235, %p234; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p237, %p232, %p236; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r144, %r140, %r142, %p237; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r145, %r141, %r143, %p237; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r146, %r144, 2, 31, -1; + shfl.sync.bfly.b32 %r147, %r145, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p238, %r144, %r146; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p239, %r144, %r146; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p240, %r144, %r144; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p241, %r146, %r146; + setp.num.f32 %p242, %r146, %r146; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p243, %p240, %p242; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p244, %p238, %p243; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p245, %p241, %p240; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p246, %p239, %p245; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p247, %r145, %r147; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p248, %p247, %p246; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p249, %p244, %p248; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r148, %r144, %r146, %p249; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r149, %r145, %r147, %p249; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r150, %r148, 1, 31, -1; + shfl.sync.bfly.b32 %r151, %r149, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p250, %r148, %r150; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p251, %r148, %r150; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p252, %r148, %r148; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p253, %r150, %r150; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p254, %r149, %r151; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r152, %r149, %r151, %p252; + selp.b32 %r153, %r152, %r151, %p253; + selp.b32 %r154, %r149, %r153, %p251; + selp.b32 %r155, %r154, %r151, %p254; + selp.b32 %r156, %r155, %r149, %p253; + selp.b32 %r157, %r156, %r155, %p252; + selp.b32 %r158, %r149, %r157, %p250; +$L__tmp10: + .loc 1 48 25 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:25 + mad.wide.s32 %rd57, %r96, 8, %rd15; + .loc 1 48 36 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:36 + shr.u32 %r159, %r3, 2; + mov.b32 %r160, global_smem; + add.s32 %r161, %r160, %r159; + st.shared.b32 [%r161], %r137; + st.shared.b32 [%r161+128], %r158; + bar.sync 0; + shl.b32 %r162, %r95, 2; + add.s32 %r163, %r160, %r162; + ld.shared.s32 %rd56, [%r163]; + and.b32 %r164, %r2, 448; + setp.eq.b32 %p100, %r164, 0; + // begin inline asm + @%p100 st.global.b64 [ %rd57 + 0 ], { %rd56 }; + // end inline asm + .loc 1 48 4 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:4 + ret; +$L__tmp11: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 53 +.b8 114 +.b8 111 +.b8 97 +.b8 116 +.b8 115 +.b8 101 +.b8 122 +.b8 111 +.b8 100 +.b8 113 +.b8 102 +.b8 53 +.b8 113 +.b8 112 +.b8 100 +.b8 51 +.b8 50 +.b8 104 +.b8 107 +.b8 115 +.b8 112 +.b8 116 +.b8 52 +.b8 102 +.b8 99 +.b8 102 +.b8 105 +.b8 113 +.b8 111 +.b8 55 +.b8 106 +.b8 111 +.b8 98 +.b8 111 +.b8 109 +.b8 55 +.b8 112 +.b8 122 +.b8 104 +.b8 115 +.b8 102 +.b8 116 +.b8 117 +.b8 107 +.b8 50 +.b8 55 +.b8 103 +.b8 106 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..389dfd5730af5985591a8cf657cab835aea92afb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.source @@ -0,0 +1,315 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 64 : i32 loc(#loc77) + %xoffset_3 = arith.constant 64 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc81) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<64x64xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c64_i32 = arith.constant 64 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x64xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<64x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<64x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc92) + %tmp0_27 = tt.broadcast %tmp0_25 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<64x64xi32> loc(#loc92) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_31 = arith.constant dense<65760000> : tensor<64x1xi32> loc(#loc93) + %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<64x1xi32> loc(#loc93) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<64x64xi32> loc(#loc94) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc95) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc95) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc96) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc96) + %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<64x64xf32>, tensor<64x64xi32>, tensor<64x64xf32>, tensor<1x64xi32>) -> (tensor<64x64xf32>, tensor<64x64xi32>) loc(#loc24) + %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc97) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc97) + %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc98) + %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc98) + scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<64x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S64_64S_i32S64_64S_fp32S64_64S_i32S1_64S__(%a_value: tensor<64x64xf32> loc("a_value"(#loc33)), %a_index: tensor<64x64xi32> loc("a_index"(#loc33)), %b_value: tensor<64x64xf32> loc("b_value"(#loc33)), %b_index: tensor<1x64xi32> loc("b_index"(#loc33))) -> (tensor<64x64xf32>, tensor<64x64xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<64x64xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<64x64xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%a_value) : (tensor<64x64xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<64x64xi1>, tensor<64x64xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<64x64xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<64x64xf32> loc(#loc107) + %mask_4 = arith.constant true loc(#loc108) + %mask_5 = arith.constant dense : tensor<64x64xi1> loc(#loc108) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<64x64xi1> loc(#loc108) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<64x64xi1> loc(#loc109) + %mask_8 = arith.ori %mask, %mask_7 : tensor<64x64xi1> loc(#loc123) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc111) + %equal_10 = arith.ori %equal, %equal_9 : tensor<64x64xi1> loc(#loc124) + scf.yield %mask_8, %equal_10 : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<64x64xi1>, tensor<64x64xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = tt.broadcast %b_index : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc113) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<64x64xi32> loc(#loc113) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<64x64xi1> loc(#loc114) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<64x64xi1> loc(#loc115) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc49) + %3 = tt.broadcast %b_index : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc50) + %4 = arith.select %mask_3, %a_index, %3 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc50) + tt.return %2, %4 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc51) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x64xf32> loc(#loc52) + %6 = ub.poison : tensor<64x64xi32> loc(#loc52) + tt.return %5, %6 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x) : (tensor<64x64xf32>) -> tensor<64x64xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S64_64S__(%x: tensor<64x64xf32> loc("x"(#loc57))) -> tensor<64x64xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<64x64xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<64x64xf32> loc(#loc59) + tt.return %4 : tensor<64x64xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<64x64xf32> loc(#loc61) + tt.return %5 : tensor<64x64xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S64_64S_i32S64_64S__(2,)cconstexpr_1_"(%value: tensor<64x64xf32> loc("value"(#loc66)), %index: tensor<64x64xi32> loc("index"(#loc66))) -> (tensor<64xf32>, tensor<64xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<64xf32>, tensor<64xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc69) + %2 = ub.poison : tensor<64xi32> loc(#loc69) + tt.return %1, %2 : tensor<64xf32>, tensor<64xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..deef11aad6a5f88ab0027f2c5ce750613672ee46 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,202 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc37)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<65760000> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_4 = arith.constant dense : tensor<64x64xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2147483647> : tensor<64x64xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0xFF800000> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_7 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_8 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc48) + %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc48) + %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked> loc(#loc49) + %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc49) + %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<64x1xi32, #blocked> loc(#loc49) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked1> loc(#loc49) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50) + %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc50) + %x0 = arith.remsi %xindex_13, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc51) + %x1 = arith.divsi %xindex_13, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x0, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_16 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_17 = arith.muli %x1, %cst : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc57) + %_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x64xi32, #blocked> loc(#loc59) + %r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x64xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x64xi32, #blocked> loc(#loc60) + %tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc57) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc61) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<64x64xf32, #blocked> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<64x64xf32, #blocked> loc(#loc85) + %mask_29 = arith.xori %b_isnan, %cst_4 : tensor<64x64xi1, #blocked> loc(#loc86) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<64x64xi1, #blocked> loc(#loc87) + %mask_31 = arith.ori %mask, %mask_30 : tensor<64x64xi1, #blocked> loc(#loc106) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1, #blocked> loc(#loc89) + %equal_33 = arith.ori %equal, %equal_32 : tensor<64x64xi1, #blocked> loc(#loc107) + %mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<64x64xi32, #blocked> loc(#loc91) + %mask_35 = arith.andi %equal_33, %mask_34 : tensor<64x64xi1, #blocked> loc(#loc92) + %mask_36 = arith.ori %mask_31, %mask_35 : tensor<64x64xi1, #blocked> loc(#loc93) + %5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc74) + %6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc75) + %_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc76) + %_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<64x64xi1, #blocked>, tensor<64x64xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked> loc(#loc35) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97) + %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110) + %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111) + %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100) + %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101) + %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102) + %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %5, %6 : f32, i32 loc(#loc78) + }) : (tensor<64x64xf32, #blocked>, tensor<64x64xi32, #blocked>) -> (tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc80) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc39) + %2 = tt.addptr %1, %xindex_14 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc39) + %3 = ttg.convert_layout %tmp2 : tensor<64x1xi32, #blocked> -> tensor<64x1xi32, #blocked1> loc(#loc40) + %4 = arith.extsi %3 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> loc(#loc40) + tt.store %2, %4 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("r0_base"(#loc6)) +#loc51 = loc("x0"(#loc7)) +#loc52 = loc("x1"(#loc8)) +#loc53 = loc("tmp0"(#loc9)) +#loc54 = loc("tmp0"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("_tmp2"(#loc14)) +#loc59 = loc("r0_index"(#loc15)) +#loc60 = loc("r0_mask"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("mask"(#loc18)) +#loc63 = loc("equal"(#loc20)) +#loc64 = loc("a_isnan"(#loc21)) +#loc65 = loc("b_isnan"(#loc22)) +#loc66 = loc("mask"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("equal"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("mask"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc(callsite(#loc31 at #loc19)) +#loc75 = loc(callsite(#loc32 at #loc19)) +#loc76 = loc("_tmp2"(#loc33)) +#loc77 = loc("_tmp2_index"(#loc34)) +#loc78 = loc(callsite(#loc36 at #loc37)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc58)) +#loc82 = loc("mask"(#loc62)) +#loc83 = loc("equal"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc19)) +#loc85 = loc(callsite(#loc65 at #loc19)) +#loc86 = loc(callsite(#loc66 at #loc19)) +#loc87 = loc(callsite(#loc67 at #loc19)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc(callsite(#loc69 at #loc19)) +#loc90 = loc("equal"(#loc70)) +#loc91 = loc(callsite(#loc71 at #loc19)) +#loc92 = loc(callsite(#loc72 at #loc19)) +#loc93 = loc(callsite(#loc73 at #loc19)) +#loc94 = loc(callsite(#loc64 at #loc78)) +#loc95 = loc(callsite(#loc65 at #loc78)) +#loc96 = loc(callsite(#loc66 at #loc78)) +#loc97 = loc(callsite(#loc67 at #loc78)) +#loc98 = loc(callsite(#loc69 at #loc78)) +#loc99 = loc(callsite(#loc71 at #loc78)) +#loc100 = loc(callsite(#loc72 at #loc78)) +#loc101 = loc(callsite(#loc73 at #loc78)) +#loc102 = loc(callsite(#loc31 at #loc78)) +#loc103 = loc(callsite(#loc32 at #loc78)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc78)) +#loc109 = loc(callsite(#loc83 at #loc78)) +#loc110 = loc(callsite(#loc88 at #loc78)) +#loc111 = loc(callsite(#loc90 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5faa3dd312c740ede9f386dcff7b7d12a9ec35b4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KGOS23I5L5ZWY4U5ADNVXZJCIX7ZQ7RU57MHXZXYT4OHSKL7IWIQ/triton_red_fused_argmax_1.ttir @@ -0,0 +1,200 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc45 = loc("in_ptr0"(#loc)) +#loc46 = loc("out_ptr0"(#loc)) +#loc47 = loc("xnumel"(#loc)) +#loc48 = loc("r0_numel"(#loc)) +#loc49 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc49) + %cst = arith.constant dense : tensor<64x64xi1> loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<32000> : tensor<1x64xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<64x64xi32> loc(#loc50) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<64x64xf32> loc(#loc51) + %cst_4 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc52) + %xoffset_5 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc53) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc54) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc55) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<64x1xi32> loc(#loc56) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<64x1xi32> loc(#loc56) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc57) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc58) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<64x1xi32> loc(#loc59) + %_tmp2_index_9:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c64_i32 iter_args(%_tmp2_10 = %_tmp2, %_tmp2_index_11 = %_tmp2_index) -> (tensor<64x64xf32>, tensor<64x64xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc61) + %r0_index_12 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc61) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_3 : tensor<1x64xi32> loc(#loc62) + %tmp0 = arith.muli %x0, %cst_2 : tensor<64x1xi32> loc(#loc63) + %tmp0_13 = tt.broadcast %r0_index_12 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc64) + %tmp0_14 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc64) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<64x64xi32> loc(#loc64) + %tmp0_16 = arith.muli %x1, %cst_1 : tensor<64x1xi32> loc(#loc65) + %tmp0_17 = tt.broadcast %tmp0_16 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc66) + %tmp0_18 = arith.addi %tmp0_15, %tmp0_17 : tensor<64x64xi32> loc(#loc66) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc67) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc67) + %tmp0_21 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc68) + %tmp0_22 = tt.load %tmp0_20, %tmp0_21, %cst_0 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc68) + %mask = arith.cmpf ogt, %_tmp2_10, %tmp0_22 : tensor<64x64xf32> loc(#loc110) + %equal = arith.cmpf oeq, %_tmp2_10, %tmp0_22 : tensor<64x64xf32> loc(#loc111) + %a_isnan = arith.cmpf une, %_tmp2_10, %_tmp2_10 : tensor<64x64xf32> loc(#loc90) + %b_isnan = arith.cmpf une, %tmp0_22, %tmp0_22 : tensor<64x64xf32> loc(#loc91) + %mask_23 = arith.xori %b_isnan, %cst : tensor<64x64xi1> loc(#loc92) + %mask_24 = arith.andi %a_isnan, %mask_23 : tensor<64x64xi1> loc(#loc93) + %mask_25 = arith.ori %mask, %mask_24 : tensor<64x64xi1> loc(#loc112) + %equal_26 = arith.andi %a_isnan, %b_isnan : tensor<64x64xi1> loc(#loc95) + %equal_27 = arith.ori %equal, %equal_26 : tensor<64x64xi1> loc(#loc113) + %mask_28 = arith.cmpi slt, %_tmp2_index_11, %tmp0_13 : tensor<64x64xi32> loc(#loc97) + %mask_29 = arith.andi %equal_27, %mask_28 : tensor<64x64xi1> loc(#loc98) + %mask_30 = arith.ori %mask_25, %mask_29 : tensor<64x64xi1> loc(#loc99) + %4 = arith.select %mask_30, %_tmp2_10, %tmp0_22 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc81) + %5 = arith.select %mask_30, %_tmp2_index_11, %tmp0_13 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc82) + %_tmp2_31 = arith.select %tmp0_21, %4, %_tmp2_10 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc83) + %_tmp2_index_32 = arith.select %tmp0_21, %5, %_tmp2_index_11 : tensor<64x64xi1>, tensor<64x64xi32> loc(#loc84) + scf.yield %_tmp2_31, %_tmp2_index_32 : tensor<64x64xf32>, tensor<64x64xi32> loc(#loc39) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index_9#0, %_tmp2_index_9#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc114) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc115) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc100) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc101) + %mask_10 = arith.xori %b_isnan, %true : i1 loc(#loc102) + %mask_11 = arith.andi %a_isnan, %mask_10 : i1 loc(#loc103) + %mask_12 = arith.ori %mask, %mask_11 : i1 loc(#loc116) + %equal_13 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc104) + %equal_14 = arith.ori %equal, %equal_13 : i1 loc(#loc117) + %mask_15 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc105) + %mask_16 = arith.andi %equal_14, %mask_15 : i1 loc(#loc106) + %mask_17 = arith.ori %mask_12, %mask_16 : i1 loc(#loc107) + %4 = arith.select %mask_17, %arg4, %arg6 : f32 loc(#loc108) + %5 = arith.select %mask_17, %arg5, %arg7 : i32 loc(#loc109) + tt.reduce.return %4, %5 : f32, i32 loc(#loc85) + }) : (tensor<64x64xf32>, tensor<64x64xi32>) -> (tensor<64xf32>, tensor<64xi32>) loc(#loc85) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc86) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc42) + %2 = tt.addptr %1, %xindex_8 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc42) + %3 = arith.extsi %tmp2 : tensor<64x1xi32> to tensor<64x1xi64> loc(#loc43) + tt.store %2, %3 : tensor<64x1x!tt.ptr> loc(#loc43) + tt.return loc(#loc44) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc50 = loc("_tmp2_index"(#loc4)) +#loc51 = loc("_tmp2"(#loc5)) +#loc52 = loc("xoffset"(#loc6)) +#loc53 = loc("xoffset"(#loc7)) +#loc54 = loc("xindex"(#loc8)) +#loc55 = loc("xindex"(#loc9)) +#loc56 = loc("xindex"(#loc10)) +#loc57 = loc("r0_base"(#loc11)) +#loc58 = loc("x0"(#loc12)) +#loc59 = loc("x1"(#loc13)) +#loc60 = loc("_tmp2"(#loc3)) +#loc61 = loc("r0_index"(#loc14)) +#loc62 = loc("r0_mask"(#loc15)) +#loc63 = loc("tmp0"(#loc16)) +#loc64 = loc("tmp0"(#loc17)) +#loc65 = loc("tmp0"(#loc18)) +#loc66 = loc("tmp0"(#loc19)) +#loc67 = loc("tmp0"(#loc20)) +#loc68 = loc("tmp0"(#loc21)) +#loc69 = loc("mask"(#loc22)) +#loc70 = loc("equal"(#loc24)) +#loc71 = loc("a_isnan"(#loc25)) +#loc72 = loc("b_isnan"(#loc26)) +#loc73 = loc("mask"(#loc27)) +#loc74 = loc("mask"(#loc28)) +#loc75 = loc("mask"(#loc29)) +#loc76 = loc("equal"(#loc30)) +#loc77 = loc("equal"(#loc31)) +#loc78 = loc("mask"(#loc32)) +#loc79 = loc("mask"(#loc33)) +#loc80 = loc("mask"(#loc34)) +#loc81 = loc(callsite(#loc35 at #loc23)) +#loc82 = loc(callsite(#loc36 at #loc23)) +#loc83 = loc("_tmp2"(#loc37)) +#loc84 = loc("_tmp2_index"(#loc38)) +#loc85 = loc(callsite(#loc40 at #loc2)) +#loc86 = loc("tmp2"(#loc41)) +#loc87 = loc("_tmp2_index"(#loc60)) +#loc88 = loc("mask"(#loc69)) +#loc89 = loc("equal"(#loc70)) +#loc90 = loc(callsite(#loc71 at #loc23)) +#loc91 = loc(callsite(#loc72 at #loc23)) +#loc92 = loc(callsite(#loc73 at #loc23)) +#loc93 = loc(callsite(#loc74 at #loc23)) +#loc94 = loc("mask"(#loc75)) +#loc95 = loc(callsite(#loc76 at #loc23)) +#loc96 = loc("equal"(#loc77)) +#loc97 = loc(callsite(#loc78 at #loc23)) +#loc98 = loc(callsite(#loc79 at #loc23)) +#loc99 = loc(callsite(#loc80 at #loc23)) +#loc100 = loc(callsite(#loc71 at #loc85)) +#loc101 = loc(callsite(#loc72 at #loc85)) +#loc102 = loc(callsite(#loc73 at #loc85)) +#loc103 = loc(callsite(#loc74 at #loc85)) +#loc104 = loc(callsite(#loc76 at #loc85)) +#loc105 = loc(callsite(#loc78 at #loc85)) +#loc106 = loc(callsite(#loc79 at #loc85)) +#loc107 = loc(callsite(#loc80 at #loc85)) +#loc108 = loc(callsite(#loc35 at #loc85)) +#loc109 = loc(callsite(#loc36 at #loc85)) +#loc110 = loc(callsite(#loc88 at #loc23)) +#loc111 = loc(callsite(#loc89 at #loc23)) +#loc112 = loc(callsite(#loc94 at #loc23)) +#loc113 = loc(callsite(#loc96 at #loc23)) +#loc114 = loc(callsite(#loc88 at #loc85)) +#loc115 = loc(callsite(#loc89 at #loc85)) +#loc116 = loc(callsite(#loc94 at #loc85)) +#loc117 = loc(callsite(#loc96 at #loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-311-x86_64-linux-gnu.so b/SpecForge-ext/cache/compiled_kernels/triton/6/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..4f2a511bbfa973a86a39fc35a394a9fe89a34330 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/KJGBFCDPODDSTMI2D3ESJLJ6JDTKTCDGYNBY5NRKEHPZNNN6VV2Q/cuda_utils.cpython-311-x86_64-linux-gnu.so differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..13d8c96d3871eac69eae51dd24da1e9fe87814a3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..f9bd1572da0c3432af3a98e8097268485c7b41ad Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..32d2127246cab191f7838b615183d45f6cc89457 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "557116aa68f0d96e604b17a4a612d20d162114ae40f60983ba5fe5d85ca89e5f", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 8, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f210eac83dd43e13f968a431b402011b5cd1176c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.llir @@ -0,0 +1,139 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %10 = and i32 %9, 31, !dbg !8 + %11 = lshr i32 %9, 5, !dbg !8 + %12 = shl nuw nsw i32 %9, 1, !dbg !8 + %13 = and i32 %12, 126, !dbg !8 + %14 = lshr i32 %8, 4, !dbg !9 + %15 = and i32 %14, 3968, !dbg !9 + %16 = shl i32 %8, 12, !dbg !10 + %17 = and i32 %16, 8384512, !dbg !10 + %18 = or disjoint i32 %15, %17, !dbg !11 + %19 = shl i32 %8, 7, !dbg !12 + %20 = and i32 %19, -8388608, !dbg !12 + %21 = or disjoint i32 %18, %20, !dbg !13 + %22 = or disjoint i32 %21, %13, !dbg !13 + %23 = sext i32 %22 to i64, !dbg !14 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !14 + %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !15 + %26 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %24, i64 %25, i1 true) #4, !dbg !15 + %27 = bitcast i32 %26 to <2 x bfloat>, !dbg !15 + %28 = or disjoint i32 %13, %19, !dbg !16 + %29 = sext i32 %28 to i64, !dbg !17 + %30 = getelementptr bfloat, ptr addrspace(1) %1, i64 %29, !dbg !17 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !18 + %32 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !18 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !18 + %34 = fpext <2 x bfloat> %27 to <2 x float>, !dbg !19 + %35 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !20 + %36 = fmul <2 x float> %34, %35, !dbg !21 + %37 = fadd <2 x float> %36, zeroinitializer, !dbg !22 + %shift = shufflevector <2 x float> %37, <2 x float> poison, <2 x i32> , !dbg !23 + %foldExtExtBinop = fadd <2 x float> %37, %shift, !dbg !23 + %38 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !23 + %39 = bitcast float %38 to i32, !dbg !27 + %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 16, i32 31), !dbg !27 + %41 = bitcast i32 %40 to float, !dbg !27 + %42 = fadd float %38, %41, !dbg !23 + %43 = bitcast float %42 to i32, !dbg !27 + %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 8, i32 31), !dbg !27 + %45 = bitcast i32 %44 to float, !dbg !27 + %46 = fadd float %42, %45, !dbg !23 + %47 = bitcast float %46 to i32, !dbg !27 + %48 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 4, i32 31), !dbg !27 + %49 = bitcast i32 %48 to float, !dbg !27 + %50 = fadd float %46, %49, !dbg !23 + %51 = bitcast float %50 to i32, !dbg !27 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 2, i32 31), !dbg !27 + %53 = bitcast i32 %52 to float, !dbg !27 + %54 = fadd float %50, %53, !dbg !23 + %55 = bitcast float %54 to i32, !dbg !27 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 1, i32 31), !dbg !27 + %57 = bitcast i32 %56 to float, !dbg !27 + %58 = fadd float %54, %57, !dbg !23 + %59 = and i32 %11, 1, !dbg !27 + %60 = icmp eq i32 %10, 0, !dbg !27 + %61 = getelementptr float, ptr addrspace(3) @global_smem, i32 %59, !dbg !27 + %62 = bitcast float %58 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %61, <1 x i32> %62, i1 %60) #4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %63 = icmp samesign ult i32 %9, 2, !dbg !27 + %64 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !27 + %65 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %64, i1 %63) #4, !dbg !27 + %66 = bitcast i32 %65 to float, !dbg !27 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 1, i32 31), !dbg !27 + %68 = bitcast i32 %67 to float, !dbg !27 + %69 = fadd float %66, %68, !dbg !23 + %70 = icmp eq i32 %9, 0, !dbg !27 + %71 = bitcast float %69 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %64, <1 x i32> %71, i1 %70) #4, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %72 = load i32, ptr addrspace(3) @global_smem, align 16, !dbg !27 + %73 = zext nneg i32 %8 to i64, !dbg !28 + %74 = getelementptr float, ptr addrspace(1) %2, i64 %73, !dbg !28 + %75 = and i32 %9, 63, !dbg !29 + %76 = icmp eq i32 %75, 0, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %72, ptr addrspace(1) %74, i1 %76) #4, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 26, column: 37, scope: !4) +!9 = !DILocation(line: 39, column: 45, scope: !4) +!10 = !DILocation(line: 39, column: 55, scope: !4) +!11 = !DILocation(line: 39, column: 50, scope: !4) +!12 = !DILocation(line: 39, column: 68, scope: !4) +!13 = !DILocation(line: 39, column: 60, scope: !4) +!14 = !DILocation(line: 39, column: 34, scope: !4) +!15 = !DILocation(line: 39, column: 73, scope: !4) +!16 = !DILocation(line: 40, column: 41, scope: !4) +!17 = !DILocation(line: 40, column: 34, scope: !4) +!18 = !DILocation(line: 40, column: 50, scope: !4) +!19 = !DILocation(line: 39, column: 127, scope: !4) +!20 = !DILocation(line: 40, column: 104, scope: !4) +!21 = !DILocation(line: 41, column: 22, scope: !4) +!22 = !DILocation(line: 43, column: 23, scope: !4) +!23 = !DILocation(line: 261, column: 15, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !4, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!26 = !DILocation(line: 45, column: 25, scope: !4) +!27 = !DILocation(line: 291, column: 36, scope: !24, inlinedAt: !26) +!28 = !DILocation(line: 49, column: 25, scope: !4) +!29 = !DILocation(line: 49, column: 36, scope: !4) +!30 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..039c2b14ff0cc36c7844bd47f233e173313370b7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,389 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 64 +{ + .reg .pred %p<7>; + .reg .b16 %rs<5>; + .reg .b32 %r<49>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r12, %ctaid.x; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 26 37 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:26:37 + mov.u32 %r13, %tid.x; + and.b32 %r14, %r13, 31; + shl.b32 %r15, %r13, 1; + and.b32 %r16, %r15, 126; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shr.u32 %r17, %r12, 4; + and.b32 %r18, %r17, 3968; + .loc 1 39 55 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:55 + shl.b32 %r19, %r12, 12; + and.b32 %r20, %r19, 8384512; + .loc 1 39 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:50 + or.b32 %r21, %r18, %r20; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r22, %r12, 7; + and.b32 %r23, %r22, -8388608; + .loc 1 39 60 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:60 + or.b32 %r24, %r21, %r23; + or.b32 %r25, %r24, %r16; + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + mad.wide.s32 %rd2, %r25, 2, %rd8; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b32 { %r1 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 41 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:41 + or.b32 %r26, %r16, %r22; + .loc 1 40 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:34 + mad.wide.s32 %rd5, %r26, 2, %rd9; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r3, %r2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b32 { %r3 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r27, %rs1; + cvt.f32.bf16 %r28, %rs2; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs3, %rs4}, %r3; + cvt.f32.bf16 %r29, %rs3; + cvt.f32.bf16 %r30, %rs4; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r31, %r28, %r30, 0f00000000; + fma.rn.f32 %r32, %r27, %r29, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r33, %r32, %r31; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r35, %r33, %r34; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r37, %r35, %r36; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r39, %r37, %r38; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r41, %r39, %r40; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r6, %r41, %r42; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + setp.eq.b32 %p3, %r14, 0; + shr.u32 %r43, %r13, 3; + and.b32 %r44, %r43, 4; + mov.b32 %r45, global_smem; + add.s32 %r5, %r45, %r44; + // begin inline asm + @%p3 st.shared.b32 [ %r5 + 0 ], %r6; + // end inline asm + bar.sync 0; + setp.lt.u32 %p4, %r13, 2; + shl.b32 %r46, %r13, 2; + add.s32 %r8, %r45, %r46; + // begin inline asm + @%p4 ld.shared.b32 %r7, [ %r8 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r47, %r7, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r10, %r7, %r47; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + setp.eq.b32 %p5, %r13, 0; + // begin inline asm + @%p5 st.shared.b32 [ %r8 + 0 ], %r10; + // end inline asm + bar.sync 0; + ld.shared.b32 %r11, [global_smem]; +$L__tmp2: + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.u32 %rd7, %r12, 4, %rd10; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + and.b32 %r48, %r13, 63; + setp.eq.b32 %p6, %r48, 0; + // begin inline asm + @%p6 st.global.b32 [ %rd7 + 0 ], { %r11 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..dca5438f60db548cd3bc73ac05d32f8da72fb7fa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.source @@ -0,0 +1,217 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 1 : i32 loc(#loc61) + %xoffset_3 = arith.constant 1 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<1x128xi1> loc(#loc65) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<1x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<1x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<1x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<1x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<1x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<1x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<1x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x128xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x128xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<1x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc77) + %tmp0_34 = arith.addi %r0_index_28, %tmp0_33 : tensor<1x128xi32> loc(#loc77) + %tmp0_35 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant dense<4096> : tensor<1x1xi32> loc(#loc78) + %tmp0_38 = arith.muli %tmp0_37, %x0_12 : tensor<1x1xi32> loc(#loc78) + %tmp0_39 = tt.broadcast %tmp0_38 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc79) + %tmp0_40 = arith.addi %tmp0_34, %tmp0_39 : tensor<1x128xi32> loc(#loc79) + %tmp0_41 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant dense<8388608> : tensor<1x1xi32> loc(#loc80) + %tmp0_44 = arith.muli %tmp0_43, %x2_22 : tensor<1x1xi32> loc(#loc80) + %tmp0_45 = tt.broadcast %tmp0_44 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc81) + %tmp0_46 = arith.addi %tmp0_40, %tmp0_45 : tensor<1x128xi32> loc(#loc81) + %tmp0_47 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc82) + %tmp0_48 = tt.addptr %tmp0_47, %tmp0_46 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc82) + %tmp0_49 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_50 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc83) + %tmp0_51 = arith.truncf %tmp0_50 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc83) + %tmp0_52 = tt.load %tmp0_48, %r0_mask_29, %tmp0_51 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc83) + %tmp0_53 = arith.extf %tmp0_52 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_54 = arith.constant 128 : i32 loc(#loc85) + %tmp1_55 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc85) + %tmp1_56 = arith.muli %tmp1_55, %xindex_7 : tensor<1x1xi32> loc(#loc85) + %tmp1_57 = tt.broadcast %tmp1_56 : tensor<1x1xi32> -> tensor<1x128xi32> loc(#loc86) + %tmp1_58 = arith.addi %r0_index_28, %tmp1_57 : tensor<1x128xi32> loc(#loc86) + %tmp1_59 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc87) + %tmp1_60 = tt.addptr %tmp1_59, %tmp1_58 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc87) + %tmp1_61 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_62 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc88) + %tmp1_63 = arith.truncf %tmp1_62 : tensor<1x128xf32> to tensor<1x128xbf16> loc(#loc88) + %tmp1_64 = tt.load %tmp1_60, %r0_mask_29, %tmp1_63 evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc88) + %tmp1_65 = arith.extf %tmp1_64 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_53, %tmp1_65 : tensor<1x128xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<1x128xf32> loc(#loc91) + %_tmp4_66 = arith.select %r0_mask_29, %tmp5, %_tmp4_27 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc92) + scf.yield %_tmp4_66 : tensor<1x128xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<1x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<1x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S1_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x128xf32> loc("input"(#loc44))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc45) + tt.return %0 : tensor<1xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc48) + tt.return %1 : tensor<1xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d011f2acda041e351e3a8df230b7f91fc04125d0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,128 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc33 = loc("in_ptr0"(#loc)) +#loc34 = loc("in_ptr1"(#loc)) +#loc35 = loc("out_ptr1"(#loc)) +#loc36 = loc("xnumel"(#loc)) +#loc37 = loc("r0_numel"(#loc)) +#loc62 = loc("tmp4"(#loc27)) +#loc69 = loc(callsite(#loc1 at #loc62)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c65536_i32 = arith.constant 65536 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x128xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc38) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc39) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc39) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc40) + %x1 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc41) + %x1_3 = arith.remsi %x1, %c32_i32 : i32 loc(#loc42) + %x2 = arith.divsi %xoffset, %c65536_i32 : i32 loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_2, %cst : tensor<1x128xi32, #blocked> loc(#loc44) + %tmp0 = arith.muli %x1_3, %c128_i32 : i32 loc(#loc45) + %tmp0_4 = tt.splat %tmp0 : i32 -> tensor<1x128xi32, #blocked> loc(#loc64) + %tmp0_5 = arith.addi %r0_base_2, %tmp0_4 : tensor<1x128xi32, #blocked> loc(#loc46) + %tmp0_6 = arith.muli %x0, %c4096_i32 : i32 loc(#loc47) + %tmp0_7 = tt.splat %tmp0_6 : i32 -> tensor<1x128xi32, #blocked> loc(#loc65) + %tmp0_8 = arith.addi %tmp0_5, %tmp0_7 : tensor<1x128xi32, #blocked> loc(#loc48) + %tmp0_9 = arith.muli %x2, %c8388608_i32 : i32 loc(#loc49) + %tmp0_10 = tt.splat %tmp0_9 : i32 -> tensor<1x128xi32, #blocked> loc(#loc66) + %tmp0_11 = arith.addi %tmp0_8, %tmp0_10 : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0_12 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc51) + %tmp0_13 = tt.addptr %tmp0_12, %tmp0_11 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc51) + %tmp0_14 = tt.load %tmp0_13, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc52) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc53) + %tmp1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc54) + %tmp1_16 = tt.splat %tmp1 : i32 -> tensor<1x128xi32, #blocked> loc(#loc67) + %tmp1_17 = arith.addi %r0_base_2, %tmp1_16 : tensor<1x128xi32, #blocked> loc(#loc55) + %tmp1_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr, #blocked> loc(#loc56) + %tmp1_19 = tt.addptr %tmp1_18, %tmp1_17 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> loc(#loc56) + %tmp1_20 = tt.load %tmp1_19, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x128x!tt.ptr, #blocked> loc(#loc57) + %tmp1_21 = arith.extf %tmp1_20 : tensor<1x128xbf16, #blocked> to tensor<1x128xf32, #blocked> loc(#loc58) + %tmp2 = arith.mulf %tmp0_15, %tmp1_21 : tensor<1x128xf32, #blocked> loc(#loc59) + %tmp5 = arith.addf %tmp2, %cst_1 : tensor<1x128xf32, #blocked> loc(#loc60) + %_tmp4 = arith.select %r0_mask, %tmp5, %cst_1 : tensor<1x128xi1, #blocked>, tensor<1x128xf32, #blocked> loc(#loc61) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_24: f32 loc(callsite(#loc1 at #loc62)), %tmp4_25: f32 loc(callsite(#loc1 at #loc62))): + %tmp4_26 = arith.addf %tmp4_24, %tmp4_25 : f32 loc(#loc70) + tt.reduce.return %tmp4_26 : f32 loc(#loc68) + }) : (tensor<1x128xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc68) + %tmp4_22 = ttg.convert_layout %tmp4 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc63) + %tmp4_23 = tt.expand_dims %tmp4_22 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc63) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc30) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc31) + tt.store %1, %tmp4_23 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc38 = loc("xoffset"(#loc2)) +#loc39 = loc("r0_base"(#loc3)) +#loc40 = loc("x0"(#loc4)) +#loc41 = loc("x1"(#loc5)) +#loc42 = loc("x1"(#loc6)) +#loc43 = loc("x2"(#loc7)) +#loc44 = loc("r0_mask"(#loc8)) +#loc45 = loc("tmp0"(#loc9)) +#loc46 = loc("tmp0"(#loc10)) +#loc47 = loc("tmp0"(#loc11)) +#loc48 = loc("tmp0"(#loc12)) +#loc49 = loc("tmp0"(#loc13)) +#loc50 = loc("tmp0"(#loc14)) +#loc51 = loc("tmp0"(#loc15)) +#loc52 = loc("tmp0"(#loc16)) +#loc53 = loc("tmp0"(#loc17)) +#loc54 = loc("tmp1"(#loc18)) +#loc55 = loc("tmp1"(#loc19)) +#loc56 = loc("tmp1"(#loc20)) +#loc57 = loc("tmp1"(#loc21)) +#loc58 = loc("tmp1"(#loc22)) +#loc59 = loc("tmp2"(#loc23)) +#loc60 = loc("tmp5"(#loc24)) +#loc61 = loc("_tmp4"(#loc25)) +#loc63 = loc("tmp4"(#loc29)) +#loc64 = loc(fused[#loc46, #loc45]) +#loc65 = loc(fused[#loc48, #loc47]) +#loc66 = loc(fused[#loc50, #loc49]) +#loc67 = loc(fused[#loc55, #loc54]) +#loc68 = loc(callsite(#loc26 at #loc62)) +#loc70 = loc(callsite(#loc28 at #loc68)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d1f8da993d575ab9476a071c904489de838232df --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/KVYRNKTI6DMW4YCLC6SKMEWSBULCCFFOID3ATA52L7S5QXFITZPQ/triton_red_fused_zeros_0.ttir @@ -0,0 +1,127 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc3 = loc(unknown) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc34 = loc("in_ptr0"(#loc)) +#loc35 = loc("in_ptr1"(#loc)) +#loc36 = loc("out_ptr1"(#loc)) +#loc37 = loc("xnumel"(#loc)) +#loc38 = loc("r0_numel"(#loc)) +#loc64 = loc("tmp4"(#loc28)) +#loc71 = loc(callsite(#loc3 at #loc64)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %tmp0 = arith.constant 8388608 : i32 loc(#loc39) + %tmp0_0 = arith.constant 4096 : i32 loc(#loc40) + %c128_i32 = arith.constant 128 : i32 loc(#loc3) + %x2 = arith.constant 65536 : i32 loc(#loc41) + %x1 = arith.constant 32 : i32 loc(#loc42) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %cst = arith.constant dense<0.000000e+00> : tensor<1x128xbf16> loc(#loc3) + %cst_1 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc3) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc44) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc45) + %x0 = arith.remsi %xoffset, %c2048_i32 : i32 loc(#loc46) + %x1_4 = arith.divsi %xoffset, %c2048_i32 : i32 loc(#loc47) + %x1_5 = arith.remsi %x1_4, %x1 : i32 loc(#loc42) + %x2_6 = arith.divsi %xoffset, %x2 : i32 loc(#loc41) + %r0_mask = arith.cmpi slt, %r0_base_3, %cst_1 : tensor<1x128xi32> loc(#loc48) + %tmp0_7 = arith.muli %x1_5, %c128_i32 : i32 loc(#loc49) + %tmp0_8 = tt.splat %tmp0_7 : i32 -> tensor<1x128xi32> loc(#loc66) + %tmp0_9 = arith.addi %r0_base_3, %tmp0_8 : tensor<1x128xi32> loc(#loc50) + %tmp0_10 = arith.muli %x0, %tmp0_0 : i32 loc(#loc40) + %tmp0_11 = tt.splat %tmp0_10 : i32 -> tensor<1x128xi32> loc(#loc67) + %tmp0_12 = arith.addi %tmp0_9, %tmp0_11 : tensor<1x128xi32> loc(#loc51) + %tmp0_13 = arith.muli %x2_6, %tmp0 : i32 loc(#loc39) + %tmp0_14 = tt.splat %tmp0_13 : i32 -> tensor<1x128xi32> loc(#loc68) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<1x128xi32> loc(#loc52) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc53) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc53) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc54) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc55) + %tmp1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc56) + %tmp1_20 = tt.splat %tmp1 : i32 -> tensor<1x128xi32> loc(#loc69) + %tmp1_21 = arith.addi %r0_base_3, %tmp1_20 : tensor<1x128xi32> loc(#loc57) + %tmp1_22 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x128x!tt.ptr> loc(#loc58) + %tmp1_23 = tt.addptr %tmp1_22, %tmp1_21 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> loc(#loc58) + %tmp1_24 = tt.load %tmp1_23, %r0_mask, %cst evictionPolicy = evict_first : tensor<1x128x!tt.ptr> loc(#loc59) + %tmp1_25 = arith.extf %tmp1_24 : tensor<1x128xbf16> to tensor<1x128xf32> loc(#loc60) + %tmp2 = arith.mulf %tmp0_19, %tmp1_25 : tensor<1x128xf32> loc(#loc61) + %tmp5 = arith.addf %tmp2, %cst_2 : tensor<1x128xf32> loc(#loc62) + %_tmp4 = arith.select %r0_mask, %tmp5, %cst_2 : tensor<1x128xi1>, tensor<1x128xf32> loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_27: f32 loc(callsite(#loc3 at #loc64)), %tmp4_28: f32 loc(callsite(#loc3 at #loc64))): + %tmp4_29 = arith.addf %tmp4_27, %tmp4_28 : f32 loc(#loc72) + tt.reduce.return %tmp4_29 : f32 loc(#loc70) + }) : (tensor<1x128xf32>) -> tensor<1xf32> loc(#loc70) + %tmp4_26 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc65) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc31) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc31) + tt.store %1, %tmp4_26 : tensor<1x1x!tt.ptr> loc(#loc32) + tt.return loc(#loc33) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc39 = loc("tmp0"(#loc1)) +#loc40 = loc("tmp0"(#loc2)) +#loc41 = loc("x2"(#loc4)) +#loc42 = loc("x1"(#loc5)) +#loc43 = loc("xoffset"(#loc6)) +#loc44 = loc("r0_base"(#loc7)) +#loc45 = loc("r0_base"(#loc8)) +#loc46 = loc("x0"(#loc9)) +#loc47 = loc("x1"(#loc10)) +#loc48 = loc("r0_mask"(#loc11)) +#loc49 = loc("tmp0"(#loc12)) +#loc50 = loc("tmp0"(#loc13)) +#loc51 = loc("tmp0"(#loc14)) +#loc52 = loc("tmp0"(#loc15)) +#loc53 = loc("tmp0"(#loc16)) +#loc54 = loc("tmp0"(#loc17)) +#loc55 = loc("tmp0"(#loc18)) +#loc56 = loc("tmp1"(#loc19)) +#loc57 = loc("tmp1"(#loc20)) +#loc58 = loc("tmp1"(#loc21)) +#loc59 = loc("tmp1"(#loc22)) +#loc60 = loc("tmp1"(#loc23)) +#loc61 = loc("tmp2"(#loc24)) +#loc62 = loc("tmp5"(#loc25)) +#loc63 = loc("_tmp4"(#loc26)) +#loc65 = loc("tmp4"(#loc30)) +#loc66 = loc(fused[#loc50, #loc49]) +#loc67 = loc(fused[#loc51, #loc40]) +#loc68 = loc(fused[#loc52, #loc39]) +#loc69 = loc(fused[#loc57, #loc56]) +#loc70 = loc(callsite(#loc27 at #loc64)) +#loc72 = loc(callsite(#loc29 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/__grp__triton_poi_fused_clone_slice_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/__grp__triton_poi_fused_clone_slice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6cbeae7d3387761bf2d4aa1d2f5f1619a7e427 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/__grp__triton_poi_fused_clone_slice_4.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_clone_slice_4.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.source", "triton_poi_fused_clone_slice_4.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttir", "triton_poi_fused_clone_slice_4.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttgir", "triton_poi_fused_clone_slice_4.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.llir", "triton_poi_fused_clone_slice_4.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ptx", "triton_poi_fused_clone_slice_4.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.cubin", "triton_poi_fused_clone_slice_4.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b9329dbda083930eef7bf5bd0efb0b72c5469574 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.json b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.json new file mode 100644 index 0000000000000000000000000000000000000000..deaf08a82270d2b8fa62e8f69821248df3b38e92 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.json @@ -0,0 +1 @@ +{"hash": "5f0dc9fd695a7b450b134bbc5bfeffbf44007407a1fe7efd7e0338169af7002b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_clone_slice_4"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.llir new file mode 100644 index 0000000000000000000000000000000000000000..54e54fb63eefe15b3da41e657790fc3f4aeaffe1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.llir @@ -0,0 +1,77 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_clone_slice_4(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 8, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 1, !dbg !9 + %11 = and i32 %10, 254, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = or disjoint i32 %12, 1, !dbg !10 + %14 = icmp slt i32 %12, %3, !dbg !11 + %15 = icmp slt i32 %13, %3, !dbg !11 + %16 = sext i32 %12 to i64, !dbg !12 + %17 = sext i32 %13 to i64, !dbg !12 + %.frozen = freeze i64 %2, !dbg !13 + %18 = sdiv i64 %16, %.frozen, !dbg !13 + %19 = mul i64 %18, %.frozen, !dbg !12 + %.decomposed = sub i64 %16, %19, !dbg !12 + %.frozen1 = freeze i64 %2, !dbg !13 + %20 = sdiv i64 %17, %.frozen1, !dbg !13 + %21 = mul i64 %20, %.frozen1, !dbg !12 + %.decomposed2 = sub i64 %17, %21, !dbg !12 + %22 = mul i64 %18, %2, !dbg !14 + %23 = mul i64 %20, %2, !dbg !14 + %24 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !15 + %25 = getelementptr i32, ptr addrspace(1) %24, i64 %18, !dbg !15 + %26 = getelementptr i32, ptr addrspace(1) %25, i64 %22, !dbg !15 + %27 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed2, !dbg !15 + %28 = getelementptr i32, ptr addrspace(1) %27, i64 %20, !dbg !15 + %29 = getelementptr i32, ptr addrspace(1) %28, i64 %23, !dbg !15 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !16 + %31 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %26, i64 %30, i1 %14) #2, !dbg !16 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !16 + %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %29, i64 %32, i1 %15) #2, !dbg !16 + %34 = getelementptr i32, ptr addrspace(1) %1, i64 %16, !dbg !17 + %35 = getelementptr i32, ptr addrspace(1) %1, i64 %17, !dbg !17 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %31, ptr addrspace(1) %34, i1 %14) #2, !dbg !18 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %33, ptr addrspace(1) %35, i1 %15) #2, !dbg !18 + ret void, !dbg !19 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_clone_slice_4", linkageName: "triton_poi_fused_clone_slice_4", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 22, column: 19, scope: !4) +!13 = !DILocation(line: 23, column: 19, scope: !4) +!14 = !DILocation(line: 25, column: 44, scope: !4) +!15 = !DILocation(line: 25, column: 30, scope: !4) +!16 = !DILocation(line: 25, column: 49, scope: !4) +!17 = !DILocation(line: 26, column: 25, scope: !4) +!18 = !DILocation(line: 26, column: 36, scope: !4) +!19 = !DILocation(line: 26, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ptx new file mode 100644 index 0000000000000000000000000000000000000000..337728f78247299ffdef8abea43b80511a1afea6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ptx @@ -0,0 +1,289 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_clone_slice_4 // -- Begin function triton_poi_fused_clone_slice_4 + // @triton_poi_fused_clone_slice_4 +.visible .entry triton_poi_fused_clone_slice_4( + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_1, + .param .u64 triton_poi_fused_clone_slice_4_param_2, + .param .u32 triton_poi_fused_clone_slice_4_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_4, + .param .u64 .ptr .global .align 1 triton_poi_fused_clone_slice_4_param_5 +) +.reqntid 128 +{ + .reg .pred %p<7>; + .reg .b32 %r<23>; + .reg .b64 %rd<46>; + .loc 1 18 0 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:18:0 + +// %bb.0: + ld.param.b64 %rd13, [triton_poi_fused_clone_slice_4_param_2]; +$L__tmp0: + .loc 1 19 28 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:19:33 + shl.b32 %r3, %r2, 8; + .loc 1 20 36 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 254; + .loc 1 20 23 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 22 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:22:19 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + .loc 1 23 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:23:19 + or.b64 %rd15, %rd1, %rd13; + and.b64 %rd16, %rd15, -4294967296; + setp.ne.b64 %p1, %rd16, 0; + cvt.u32.u64 %r22, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd44, %rd1, %rd13; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd13; + div.u32 %r11, %r22, %r9; + cvt.u64.u32 %rd44, %r11; +$L__BB0_3: + .loc 1 0 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:0:19 + ld.param.b32 %r1, [triton_poi_fused_clone_slice_4_param_3]; + ld.param.b64 %rd12, [triton_poi_fused_clone_slice_4_param_1]; + ld.param.b64 %rd11, [triton_poi_fused_clone_slice_4_param_0]; + .loc 1 22 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:22:19 + mul.lo.s64 %rd17, %rd44, %rd13; + sub.s64 %rd7, %rd1, %rd17; + .loc 1 23 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:23:19 + or.b64 %rd18, %rd2, %rd13; + and.b64 %rd19, %rd18, -4294967296; + setp.ne.b64 %p2, %rd19, 0; + cvt.u32.u64 %r21, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd45, %rd2, %rd13; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd13; + div.u32 %r14, %r21, %r12; + cvt.u64.u32 %rd45, %r14; +$L__BB0_6: + .loc 1 21 21 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:21:21 + setp.lt.s32 %p4, %r21, %r1; + setp.lt.s32 %p3, %r22, %r1; + .loc 1 22 19 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:22:19 + mul.lo.s64 %rd28, %rd45, %rd13; + .loc 1 25 30 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:25:30 + shl.b64 %rd31, %rd7, 2; + add.s64 %rd32, %rd11, %rd31; + shl.b64 %rd33, %rd44, 2; + add.s64 %rd34, %rd32, %rd33; + shl.b64 %rd35, %rd17, 2; + add.s64 %rd21, %rd34, %rd35; + sub.s64 %rd36, %rd1, %rd28; + shl.b64 %rd37, %rd36, 2; + add.s64 %rd38, %rd11, %rd37; + shl.b64 %rd39, %rd45, 2; + add.s64 %rd40, %rd38, %rd39; + shl.b64 %rd41, %rd28, 2; + add.s64 %rd42, %rd40, %rd41; + add.s64 %rd24, %rd42, 4; + .loc 1 25 49 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:25:49 + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r17 }, [ %rd21 + 0 ], %rd22; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r18, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r18 }, [ %rd24 + 0 ], %rd25; + // end inline asm + .loc 1 26 25 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:25 + shl.b64 %rd43, %rd1, 2; + add.s64 %rd26, %rd12, %rd43; + add.s64 %rd27, %rd26, 4; + .loc 1 26 36 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:36 + // begin inline asm + @%p3 st.global.b32 [ %rd26 + 0 ], { %r17 }; + // end inline asm + // begin inline asm + @%p4 st.global.b32 [ %rd27 + 0 ], { %r18 }; + // end inline asm + .loc 1 26 4 // cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py:26:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 110 +.b8 52 +.b8 52 +.b8 122 +.b8 104 +.b8 102 +.b8 115 +.b8 113 +.b8 115 +.b8 119 +.b8 108 +.b8 102 +.b8 99 +.b8 97 +.b8 119 +.b8 101 +.b8 111 +.b8 100 +.b8 116 +.b8 111 +.b8 104 +.b8 113 +.b8 116 +.b8 55 +.b8 115 +.b8 116 +.b8 100 +.b8 110 +.b8 118 +.b8 107 +.b8 105 +.b8 99 +.b8 52 +.b8 115 +.b8 113 +.b8 120 +.b8 111 +.b8 52 +.b8 107 +.b8 97 +.b8 116 +.b8 109 +.b8 97 +.b8 119 +.b8 118 +.b8 51 +.b8 117 +.b8 114 +.b8 107 +.b8 101 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 110 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.source b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.source new file mode 100644 index 0000000000000000000000000000000000000000..2b503ae96341462635e4e969272a9a50557c176a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.source @@ -0,0 +1,62 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc16 = loc("in_ptr0"(#loc)) +#loc17 = loc("out_ptr0"(#loc)) +#loc18 = loc("ks0"(#loc)) +#loc19 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc20) + %xoffset_0 = arith.constant 256 : i32 loc(#loc21) + %xoffset_1 = arith.constant 256 : i32 loc(#loc21) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc21) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc22) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc23) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc23) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc24) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc24) + %x0 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc25) + %x0_6 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc25) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<256xi64> loc(#loc25) + %x1 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc26) + %x1_8 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc26) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<256xi64> loc(#loc26) + %tmp0 = arith.addi %x0_7, %x1_9 : tensor<256xi64> loc(#loc27) + %tmp0_10 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc28) + %tmp0_11 = arith.muli %tmp0_10, %x1_9 : tensor<256xi64> loc(#loc28) + %tmp0_12 = arith.addi %tmp0, %tmp0_11 : tensor<256xi64> loc(#loc29) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc30) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc30) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc31) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc13) + %1 = tt.addptr %0, %xindex_4 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc13) + tt.store %1, %tmp0_15, %xmask_5 : tensor<256x!tt.ptr> loc(#loc14) + tt.return loc(#loc15) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc20 = loc("xoffset"(#loc1)) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xindex"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xmask"(#loc5)) +#loc25 = loc("x0"(#loc6)) +#loc26 = loc("x1"(#loc7)) +#loc27 = loc("tmp0"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f60e2af22ee762265c2b3c9b433f4457e3eb54ff --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttgir @@ -0,0 +1,60 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc17 = loc("in_ptr0"(#loc)) +#loc18 = loc("out_ptr0"(#loc)) +#loc19 = loc("ks0"(#loc)) +#loc20 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc21) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc22) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc23) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc24) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc24) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc25) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc25) + %x0 = arith.extsi %xindex_2 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> loc(#loc26) + %x0_4 = tt.splat %ks0 : i64 -> tensor<256xi64, #blocked> loc(#loc26) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<256xi64, #blocked> loc(#loc26) + %x1 = arith.divsi %x0, %x0_4 : tensor<256xi64, #blocked> loc(#loc27) + %tmp0 = arith.addi %x0_5, %x1 : tensor<256xi64, #blocked> loc(#loc28) + %tmp0_6 = arith.muli %x0_4, %x1 : tensor<256xi64, #blocked> loc(#loc29) + %tmp0_7 = arith.addi %tmp0, %tmp0_6 : tensor<256xi64, #blocked> loc(#loc30) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc31) + %tmp0_9 = tt.addptr %tmp0_8, %tmp0_7 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc31) + %tmp0_10 = tt.load %tmp0_9, %xmask_3 evictionPolicy = evict_last : tensor<256x!tt.ptr, #blocked> loc(#loc32) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc14) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> loc(#loc14) + tt.store %1, %tmp0_10, %xmask_3 : tensor<256x!tt.ptr, #blocked> loc(#loc15) + tt.return loc(#loc16) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xoffset"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xindex"(#loc5)) +#loc25 = loc("xmask"(#loc6)) +#loc26 = loc("x0"(#loc7)) +#loc27 = loc("x1"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) +#loc32 = loc("tmp0"(#loc13)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0bf58aee7299308b1fce3b4638ebbc5851f3af9b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/L4G4T7LJLJ5UKCYTJO6FX7X7X5CAA5AHUH7H57L6AM4BNGXXAAVQ/triton_poi_fused_clone_slice_4.ttir @@ -0,0 +1,59 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":18:0) +#loc17 = loc("in_ptr0"(#loc)) +#loc18 = loc("out_ptr0"(#loc)) +#loc19 = loc("ks0"(#loc)) +#loc20 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_clone_slice_4(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc21) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc22) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc23) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc24) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc24) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc25) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc25) + %x0 = arith.extsi %xindex_2 : tensor<256xi32> to tensor<256xi64> loc(#loc26) + %x0_4 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc26) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<256xi64> loc(#loc26) + %x1 = arith.divsi %x0, %x0_4 : tensor<256xi64> loc(#loc27) + %tmp0 = arith.addi %x0_5, %x1 : tensor<256xi64> loc(#loc28) + %tmp0_6 = arith.muli %x0_4, %x1 : tensor<256xi64> loc(#loc29) + %tmp0_7 = arith.addi %tmp0, %tmp0_6 : tensor<256xi64> loc(#loc30) + %tmp0_8 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc31) + %tmp0_9 = tt.addptr %tmp0_8, %tmp0_7 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc31) + %tmp0_10 = tt.load %tmp0_9, %xmask_3 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc32) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc14) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc14) + tt.store %1, %tmp0_10, %xmask_3 : tensor<256x!tt.ptr> loc(#loc15) + tt.return loc(#loc16) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":23:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:35) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":25:49) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:25) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:36) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/n4/cn44zhfsqswlfcaweodtohqt7stdnvkic4sqxo4katmawv3urke5.py":26:4) +#loc21 = loc("xoffset"(#loc2)) +#loc22 = loc("xoffset"(#loc3)) +#loc23 = loc("xindex"(#loc4)) +#loc24 = loc("xindex"(#loc5)) +#loc25 = loc("xmask"(#loc6)) +#loc26 = loc("x0"(#loc7)) +#loc27 = loc("x1"(#loc8)) +#loc28 = loc("tmp0"(#loc9)) +#loc29 = loc("tmp0"(#loc10)) +#loc30 = loc("tmp0"(#loc11)) +#loc31 = loc("tmp0"(#loc12)) +#loc32 = loc("tmp0"(#loc13)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2c0aaad15fea0fc5370a4e8d1fda7fdda7257fe8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..49124a97622932eaeb8c53c88384f9025c0d6799 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a31d3465996b3eb2d0b5e3d1518f1f67155f21bb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "5ba9f4e64c639c4d16c0a7b21fc2e2d3b3625bf05e3033f3891fd9bc804e93f9", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..c9a4f7b0f34f0b0972aaa012a4b1204c1748ebcd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,1110 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 3, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 56, !dbg !12 + %16 = lshr exact i32 %15, 3, !dbg !12 + %17 = and i32 %14, 7, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = icmp slt i32 %18, 128, !dbg !14 + %20 = shl nuw nsw i32 %17, 1, !dbg !15 + %21 = or disjoint i32 %20, 1, !dbg !15 + %22 = shl i32 %18, 4, !dbg !16 + %23 = or disjoint i32 %22, %20, !dbg !17 + %24 = sext i32 %23 to i64, !dbg !18 + %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !18 + %26 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %25, i1 %19) #5, !dbg !19 + %27 = extractvalue { i64, i64 } %26, 0, !dbg !19 + %28 = extractvalue { i64, i64 } %26, 1, !dbg !19 + %29 = add i64 %27, -1, !dbg !20 + %30 = icmp ult i64 %29, 16383, !dbg !20 + %31 = add i64 %28, -1, !dbg !20 + %32 = icmp ult i64 %31, 16383, !dbg !20 + %33 = zext i1 %30 to i32, !dbg !21 + %34 = zext i1 %32 to i32, !dbg !21 + %35 = and i32 %14, 1, !dbg !22 + %36 = lshr i32 %14, 1, !dbg !22 + %.lobit = and i32 %36, 1, !dbg !22 + %37 = lshr i32 %14, 2, !dbg !22 + %.lobit1 = and i32 %37, 1, !dbg !22 + %38 = xor i32 %35, 1, !dbg !26 + %39 = xor i32 %.lobit, 1, !dbg !26 + %40 = xor i32 %.lobit1, 1, !dbg !26 + %41 = xor i1 %30, true, !dbg !27 + %42 = and i1 %32, %41, !dbg !27 + %43 = trunc i32 %14 to i1, !dbg !28 + %44 = xor i1 %42, %43, !dbg !28 + %45 = xor i32 %33, %34, !dbg !29 + %46 = select i1 %44, i32 %45, i32 0, !dbg !30 + %47 = xor i32 %46, %33, !dbg !31 + %48 = xor i32 %46, %34, !dbg !31 + %49 = xor i32 %21, %20, !dbg !32 + %50 = select i1 %44, i32 %49, i32 0, !dbg !33 + %51 = xor i32 %50, %20, !dbg !34 + %52 = xor i32 %50, %21, !dbg !34 + %53 = mul nuw nsw i32 %47, %38, !dbg !35 + %54 = mul nuw nsw i32 %48, %38, !dbg !35 + %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %53, i32 1, i32 31), !dbg !36 + %56 = add i32 %53, %55, !dbg !39 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 1, i32 31), !dbg !36 + %58 = add i32 %54, %57, !dbg !39 + %59 = mul nuw nsw i32 %47, %35, !dbg !40 + %60 = mul nuw nsw i32 %48, %35, !dbg !40 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 1, i32 31), !dbg !36 + %62 = add i32 %59, %61, !dbg !39 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 1, i32 31), !dbg !36 + %64 = add i32 %60, %63, !dbg !39 + %65 = mul nuw nsw i32 %51, %38, !dbg !41 + %66 = mul nuw nsw i32 %52, %38, !dbg !41 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %65, i32 1, i32 31), !dbg !36 + %68 = add i32 %65, %67, !dbg !39 + %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 1, i32 31), !dbg !36 + %70 = add i32 %66, %69, !dbg !39 + %71 = mul nuw nsw i32 %51, %35, !dbg !42 + %72 = mul nuw nsw i32 %52, %35, !dbg !42 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %74 = add i32 %71, %73, !dbg !39 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !36 + %76 = add i32 %72, %75, !dbg !39 + %77 = trunc i32 %36 to i1, !dbg !28 + %78 = insertelement <2 x i32> poison, i32 %56, i64 0, !dbg !28 + %79 = insertelement <2 x i32> %78, i32 %58, i64 1, !dbg !28 + %80 = insertelement <2 x i32> poison, i32 %62, i64 0, !dbg !28 + %81 = insertelement <2 x i32> %80, i32 %64, i64 1, !dbg !28 + %82 = icmp sge <2 x i32> %79, %81, !dbg !28 + %83 = icmp ne <2 x i32> %79, %81, !dbg !28 + %84 = insertelement <2 x i32> poison, i32 %68, i64 0, !dbg !28 + %85 = insertelement <2 x i32> %84, i32 %70, i64 1, !dbg !28 + %86 = insertelement <2 x i32> poison, i32 %74, i64 0, !dbg !28 + %87 = insertelement <2 x i32> %86, i32 %76, i64 1, !dbg !28 + %88 = icmp sle <2 x i32> %85, %87, !dbg !28 + %89 = or <2 x i1> %83, %88, !dbg !28 + %90 = and <2 x i1> %82, %89, !dbg !28 + %91 = insertelement <2 x i1> poison, i1 %77, i64 0, !dbg !28 + %92 = shufflevector <2 x i1> %91, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !28 + %93 = xor <2 x i1> %90, %92, !dbg !28 + %94 = xor <2 x i32> %79, %81, !dbg !29 + %95 = select <2 x i1> %93, <2 x i32> zeroinitializer, <2 x i32> %94, !dbg !30 + %96 = insertelement <2 x i32> poison, i32 %47, i64 0, !dbg !31 + %97 = insertelement <2 x i32> %96, i32 %48, i64 1, !dbg !31 + %98 = xor <2 x i32> %95, %97, !dbg !31 + %99 = xor <2 x i32> %85, %87, !dbg !32 + %100 = select <2 x i1> %93, <2 x i32> zeroinitializer, <2 x i32> %99, !dbg !33 + %101 = insertelement <2 x i32> poison, i32 %51, i64 0, !dbg !34 + %102 = insertelement <2 x i32> %101, i32 %52, i64 1, !dbg !34 + %103 = xor <2 x i32> %100, %102, !dbg !34 + %104 = extractelement <2 x i32> %98, i64 0, !dbg !28 + %105 = extractelement <2 x i32> %98, i64 1, !dbg !28 + %106 = icmp sge i32 %104, %105, !dbg !28 + %107 = icmp ne i32 %104, %105, !dbg !28 + %108 = extractelement <2 x i32> %103, i64 0, !dbg !28 + %109 = extractelement <2 x i32> %103, i64 1, !dbg !28 + %110 = icmp sle i32 %108, %109, !dbg !28 + %111 = or i1 %107, %110, !dbg !28 + %112 = and i1 %106, %111, !dbg !28 + %.not3 = xor i1 %112, %77, !dbg !28 + %113 = xor i32 %104, %105, !dbg !29 + %114 = select i1 %.not3, i32 0, i32 %113, !dbg !30 + %115 = xor i32 %108, %109, !dbg !32 + %116 = select i1 %.not3, i32 0, i32 %115, !dbg !33 + %117 = xor i32 %116, %108, !dbg !34 + %118 = xor i32 %116, %109, !dbg !34 + %119 = mul nuw nsw i32 %117, %39, !dbg !41 + %120 = mul nuw nsw i32 %118, %39, !dbg !41 + %121 = mul nuw nsw i32 %117, %.lobit, !dbg !42 + %122 = mul nuw nsw i32 %118, %.lobit, !dbg !42 + %123 = trunc i32 %37 to i1, !dbg !28 + %124 = insertelement <2 x i32> poison, i32 %114, i64 0, !dbg !31 + %125 = shufflevector <2 x i32> %124, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !31 + %126 = xor <2 x i32> %125, %98, !dbg !31 + %127 = extractelement <2 x i32> %126, i64 0, !dbg !40 + %128 = mul nuw nsw i32 %127, %39, !dbg !35 + %129 = extractelement <2 x i32> %126, i64 1, !dbg !40 + %130 = mul nuw nsw i32 %129, %39, !dbg !35 + %131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !36 + %132 = add i32 %128, %131, !dbg !39 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !36 + %134 = add i32 %130, %133, !dbg !39 + %135 = mul nuw nsw i32 %127, %.lobit, !dbg !40 + %136 = mul nuw nsw i32 %129, %.lobit, !dbg !40 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 2, i32 31), !dbg !36 + %138 = add i32 %135, %137, !dbg !39 + %139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 2, i32 31), !dbg !36 + %140 = add i32 %136, %139, !dbg !39 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !36 + %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !36 + %143 = insertelement <2 x i32> poison, i32 %119, i64 0, !dbg !39 + %144 = insertelement <2 x i32> %143, i32 %120, i64 1, !dbg !39 + %145 = insertelement <2 x i32> poison, i32 %141, i64 0, !dbg !39 + %146 = insertelement <2 x i32> %145, i32 %142, i64 1, !dbg !39 + %147 = add <2 x i32> %144, %146, !dbg !39 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 2, i32 31), !dbg !36 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !36 + %150 = insertelement <2 x i32> poison, i32 %121, i64 0, !dbg !39 + %151 = insertelement <2 x i32> %150, i32 %122, i64 1, !dbg !39 + %152 = insertelement <2 x i32> poison, i32 %148, i64 0, !dbg !39 + %153 = insertelement <2 x i32> %152, i32 %149, i64 1, !dbg !39 + %154 = add <2 x i32> %151, %153, !dbg !39 + %155 = insertelement <2 x i32> poison, i32 %132, i64 0, !dbg !28 + %156 = insertelement <2 x i32> %155, i32 %134, i64 1, !dbg !28 + %157 = insertelement <2 x i32> poison, i32 %138, i64 0, !dbg !28 + %158 = insertelement <2 x i32> %157, i32 %140, i64 1, !dbg !28 + %159 = icmp sge <2 x i32> %156, %158, !dbg !28 + %160 = icmp ne <2 x i32> %156, %158, !dbg !28 + %161 = xor <2 x i32> %156, %158, !dbg !29 + %162 = icmp sle <2 x i32> %147, %154, !dbg !28 + %163 = or <2 x i1> %160, %162, !dbg !28 + %164 = and <2 x i1> %159, %163, !dbg !28 + %165 = insertelement <2 x i1> poison, i1 %123, i64 0, !dbg !28 + %166 = shufflevector <2 x i1> %165, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !28 + %167 = xor <2 x i1> %164, %166, !dbg !28 + %168 = select <2 x i1> %167, <2 x i32> zeroinitializer, <2 x i32> %161, !dbg !30 + %169 = xor <2 x i32> %168, %126, !dbg !31 + %170 = xor <2 x i32> %147, %154, !dbg !32 + %171 = select <2 x i1> %167, <2 x i32> zeroinitializer, <2 x i32> %170, !dbg !33 + %172 = insertelement <2 x i32> poison, i32 %117, i64 0, !dbg !34 + %173 = insertelement <2 x i32> %172, i32 %118, i64 1, !dbg !34 + %174 = xor <2 x i32> %171, %173, !dbg !34 + %175 = extractelement <2 x i32> %169, i64 0, !dbg !40 + %176 = mul nuw nsw i32 %175, %38, !dbg !35 + %177 = extractelement <2 x i32> %169, i64 1, !dbg !40 + %178 = mul nuw nsw i32 %177, %38, !dbg !35 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 1, i32 31), !dbg !36 + %180 = add i32 %176, %179, !dbg !39 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !36 + %182 = add i32 %178, %181, !dbg !39 + %183 = mul nuw nsw i32 %175, %35, !dbg !40 + %184 = mul nuw nsw i32 %177, %35, !dbg !40 + %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 1, i32 31), !dbg !36 + %186 = add i32 %183, %185, !dbg !39 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 1, i32 31), !dbg !36 + %188 = add i32 %184, %187, !dbg !39 + %189 = extractelement <2 x i32> %174, i64 0, !dbg !42 + %190 = mul nuw nsw i32 %189, %38, !dbg !41 + %191 = extractelement <2 x i32> %174, i64 1, !dbg !42 + %192 = mul nuw nsw i32 %191, %38, !dbg !41 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %190, i32 1, i32 31), !dbg !36 + %194 = add i32 %190, %193, !dbg !39 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !36 + %196 = add i32 %192, %195, !dbg !39 + %197 = mul nuw nsw i32 %189, %35, !dbg !42 + %198 = mul nuw nsw i32 %191, %35, !dbg !42 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 1, i32 31), !dbg !36 + %200 = add i32 %197, %199, !dbg !39 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !36 + %202 = add i32 %198, %201, !dbg !39 + %203 = insertelement <2 x i32> poison, i32 %180, i64 0, !dbg !28 + %204 = insertelement <2 x i32> %203, i32 %182, i64 1, !dbg !28 + %205 = insertelement <2 x i32> poison, i32 %186, i64 0, !dbg !28 + %206 = insertelement <2 x i32> %205, i32 %188, i64 1, !dbg !28 + %207 = icmp sge <2 x i32> %204, %206, !dbg !28 + %208 = icmp ne <2 x i32> %204, %206, !dbg !28 + %209 = insertelement <2 x i32> poison, i32 %194, i64 0, !dbg !28 + %210 = insertelement <2 x i32> %209, i32 %196, i64 1, !dbg !28 + %211 = insertelement <2 x i32> poison, i32 %200, i64 0, !dbg !28 + %212 = insertelement <2 x i32> %211, i32 %202, i64 1, !dbg !28 + %213 = icmp sle <2 x i32> %210, %212, !dbg !28 + %214 = or <2 x i1> %208, %213, !dbg !28 + %215 = and <2 x i1> %207, %214, !dbg !28 + %216 = xor <2 x i1> %215, %166, !dbg !28 + %217 = xor <2 x i32> %204, %206, !dbg !29 + %218 = select <2 x i1> %216, <2 x i32> zeroinitializer, <2 x i32> %217, !dbg !30 + %219 = xor <2 x i32> %218, %169, !dbg !31 + %220 = xor <2 x i32> %210, %212, !dbg !32 + %221 = select <2 x i1> %216, <2 x i32> zeroinitializer, <2 x i32> %220, !dbg !33 + %222 = xor <2 x i32> %221, %174, !dbg !34 + %223 = extractelement <2 x i32> %219, i64 0, !dbg !28 + %224 = extractelement <2 x i32> %219, i64 1, !dbg !28 + %225 = icmp sge i32 %223, %224, !dbg !28 + %226 = icmp ne i32 %223, %224, !dbg !28 + %227 = extractelement <2 x i32> %222, i64 0, !dbg !28 + %228 = extractelement <2 x i32> %222, i64 1, !dbg !28 + %229 = icmp sle i32 %227, %228, !dbg !28 + %230 = or i1 %226, %229, !dbg !28 + %231 = and i1 %225, %230, !dbg !28 + %.not8 = xor i1 %231, %123, !dbg !28 + %232 = xor i32 %223, %224, !dbg !29 + %233 = select i1 %.not8, i32 0, i32 %232, !dbg !30 + %234 = insertelement <2 x i32> poison, i32 %233, i64 0, !dbg !31 + %235 = shufflevector <2 x i32> %234, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !31 + %236 = xor <2 x i32> %235, %219, !dbg !31 + %237 = xor i32 %227, %228, !dbg !32 + %238 = select i1 %.not8, i32 0, i32 %237, !dbg !33 + %239 = insertelement <2 x i32> poison, i32 %238, i64 0, !dbg !34 + %240 = shufflevector <2 x i32> %239, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !34 + %241 = xor <2 x i32> %240, %222, !dbg !34 + %242 = extractelement <2 x i32> %236, i64 0, !dbg !40 + %243 = mul nuw nsw i32 %242, %40, !dbg !35 + %244 = extractelement <2 x i32> %236, i64 1, !dbg !40 + %245 = mul nuw nsw i32 %244, %40, !dbg !35 + %246 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %243, i32 4, i32 31), !dbg !36 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %245, i32 4, i32 31), !dbg !36 + %248 = insertelement <2 x i32> poison, i32 %243, i64 0, !dbg !39 + %249 = insertelement <2 x i32> %248, i32 %245, i64 1, !dbg !39 + %250 = insertelement <2 x i32> poison, i32 %246, i64 0, !dbg !39 + %251 = insertelement <2 x i32> %250, i32 %247, i64 1, !dbg !39 + %252 = add <2 x i32> %249, %251, !dbg !39 + %253 = mul nuw nsw i32 %242, %.lobit1, !dbg !40 + %254 = mul nuw nsw i32 %244, %.lobit1, !dbg !40 + %255 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 4, i32 31), !dbg !36 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %254, i32 4, i32 31), !dbg !36 + %257 = insertelement <2 x i32> poison, i32 %253, i64 0, !dbg !39 + %258 = insertelement <2 x i32> %257, i32 %254, i64 1, !dbg !39 + %259 = insertelement <2 x i32> poison, i32 %255, i64 0, !dbg !39 + %260 = insertelement <2 x i32> %259, i32 %256, i64 1, !dbg !39 + %261 = add <2 x i32> %258, %260, !dbg !39 + %262 = extractelement <2 x i32> %241, i64 0, !dbg !42 + %263 = mul nuw nsw i32 %262, %40, !dbg !41 + %264 = extractelement <2 x i32> %241, i64 1, !dbg !42 + %265 = mul nuw nsw i32 %264, %40, !dbg !41 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %263, i32 4, i32 31), !dbg !36 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 4, i32 31), !dbg !36 + %268 = insertelement <2 x i32> poison, i32 %263, i64 0, !dbg !39 + %269 = insertelement <2 x i32> %268, i32 %265, i64 1, !dbg !39 + %270 = insertelement <2 x i32> poison, i32 %266, i64 0, !dbg !39 + %271 = insertelement <2 x i32> %270, i32 %267, i64 1, !dbg !39 + %272 = add <2 x i32> %269, %271, !dbg !39 + %273 = mul nuw nsw i32 %262, %.lobit1, !dbg !42 + %274 = mul nuw nsw i32 %264, %.lobit1, !dbg !42 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 4, i32 31), !dbg !36 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 4, i32 31), !dbg !36 + %277 = insertelement <2 x i32> poison, i32 %273, i64 0, !dbg !39 + %278 = insertelement <2 x i32> %277, i32 %274, i64 1, !dbg !39 + %279 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !39 + %280 = insertelement <2 x i32> %279, i32 %276, i64 1, !dbg !39 + %281 = add <2 x i32> %278, %280, !dbg !39 + %282 = icmp slt <2 x i32> %252, %261, !dbg !27 + %283 = icmp eq <2 x i32> %252, %261, !dbg !43 + %284 = icmp sgt <2 x i32> %272, %281, !dbg !44 + %285 = and <2 x i1> %283, %284, !dbg !45 + %286 = or <2 x i1> %282, %285, !dbg !46 + %287 = xor <2 x i32> %252, %261, !dbg !29 + %288 = select <2 x i1> %286, <2 x i32> %287, <2 x i32> zeroinitializer, !dbg !30 + %289 = xor <2 x i32> %288, %236, !dbg !31 + %290 = xor <2 x i32> %272, %281, !dbg !32 + %291 = select <2 x i1> %286, <2 x i32> %290, <2 x i32> zeroinitializer, !dbg !33 + %292 = xor <2 x i32> %291, %241, !dbg !34 + %293 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !35 + %294 = shufflevector <2 x i32> %293, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %295 = mul nuw nsw <2 x i32> %289, %294, !dbg !35 + %296 = extractelement <2 x i32> %295, i64 0, !dbg !36 + %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !36 + %298 = extractelement <2 x i32> %295, i64 1, !dbg !36 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 2, i32 31), !dbg !36 + %300 = insertelement <2 x i32> poison, i32 %297, i64 0, !dbg !39 + %301 = insertelement <2 x i32> %300, i32 %299, i64 1, !dbg !39 + %302 = add <2 x i32> %295, %301, !dbg !39 + %303 = insertelement <2 x i32> poison, i32 %.lobit, i64 0, !dbg !40 + %304 = shufflevector <2 x i32> %303, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !40 + %305 = mul nuw nsw <2 x i32> %289, %304, !dbg !40 + %306 = extractelement <2 x i32> %305, i64 0, !dbg !36 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !36 + %308 = extractelement <2 x i32> %305, i64 1, !dbg !36 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !36 + %310 = insertelement <2 x i32> poison, i32 %307, i64 0, !dbg !39 + %311 = insertelement <2 x i32> %310, i32 %309, i64 1, !dbg !39 + %312 = add <2 x i32> %305, %311, !dbg !39 + %313 = extractelement <2 x i32> %292, i64 0, !dbg !42 + %314 = mul nuw nsw i32 %313, %39, !dbg !41 + %315 = extractelement <2 x i32> %292, i64 1, !dbg !42 + %316 = mul nuw nsw i32 %315, %39, !dbg !41 + %317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 2, i32 31), !dbg !36 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %316, i32 2, i32 31), !dbg !36 + %319 = insertelement <2 x i32> poison, i32 %314, i64 0, !dbg !39 + %320 = insertelement <2 x i32> %319, i32 %316, i64 1, !dbg !39 + %321 = insertelement <2 x i32> poison, i32 %317, i64 0, !dbg !39 + %322 = insertelement <2 x i32> %321, i32 %318, i64 1, !dbg !39 + %323 = add <2 x i32> %320, %322, !dbg !39 + %324 = mul nuw nsw i32 %313, %.lobit, !dbg !42 + %325 = mul nuw nsw i32 %315, %.lobit, !dbg !42 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 2, i32 31), !dbg !36 + %327 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 2, i32 31), !dbg !36 + %328 = insertelement <2 x i32> poison, i32 %324, i64 0, !dbg !39 + %329 = insertelement <2 x i32> %328, i32 %325, i64 1, !dbg !39 + %330 = insertelement <2 x i32> poison, i32 %326, i64 0, !dbg !39 + %331 = insertelement <2 x i32> %330, i32 %327, i64 1, !dbg !39 + %332 = add <2 x i32> %329, %331, !dbg !39 + %333 = icmp slt <2 x i32> %302, %312, !dbg !27 + %334 = icmp eq <2 x i32> %302, %312, !dbg !43 + %335 = icmp sgt <2 x i32> %323, %332, !dbg !44 + %336 = and <2 x i1> %334, %335, !dbg !45 + %337 = or <2 x i1> %333, %336, !dbg !46 + %338 = xor <2 x i32> %302, %312, !dbg !29 + %339 = select <2 x i1> %337, <2 x i32> %338, <2 x i32> zeroinitializer, !dbg !30 + %340 = xor <2 x i32> %339, %289, !dbg !31 + %341 = xor <2 x i32> %323, %332, !dbg !32 + %342 = select <2 x i1> %337, <2 x i32> %341, <2 x i32> zeroinitializer, !dbg !33 + %343 = xor <2 x i32> %342, %292, !dbg !34 + %344 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !35 + %345 = shufflevector <2 x i32> %344, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !35 + %346 = mul nuw nsw <2 x i32> %340, %345, !dbg !35 + %347 = extractelement <2 x i32> %346, i64 0, !dbg !36 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %347, i32 1, i32 31), !dbg !36 + %349 = extractelement <2 x i32> %346, i64 1, !dbg !36 + %350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 1, i32 31), !dbg !36 + %351 = insertelement <2 x i32> poison, i32 %348, i64 0, !dbg !39 + %352 = insertelement <2 x i32> %351, i32 %350, i64 1, !dbg !39 + %353 = add <2 x i32> %346, %352, !dbg !39 + %354 = insertelement <2 x i32> poison, i32 %35, i64 0, !dbg !40 + %355 = shufflevector <2 x i32> %354, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !40 + %356 = mul nuw nsw <2 x i32> %340, %355, !dbg !40 + %357 = extractelement <2 x i32> %356, i64 0, !dbg !36 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 1, i32 31), !dbg !36 + %359 = extractelement <2 x i32> %356, i64 1, !dbg !36 + %360 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %359, i32 1, i32 31), !dbg !36 + %361 = insertelement <2 x i32> poison, i32 %358, i64 0, !dbg !39 + %362 = insertelement <2 x i32> %361, i32 %360, i64 1, !dbg !39 + %363 = add <2 x i32> %356, %362, !dbg !39 + %364 = mul nuw nsw <2 x i32> %343, %345, !dbg !41 + %365 = extractelement <2 x i32> %364, i64 0, !dbg !36 + %366 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %365, i32 1, i32 31), !dbg !36 + %367 = extractelement <2 x i32> %364, i64 1, !dbg !36 + %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !36 + %369 = insertelement <2 x i32> poison, i32 %366, i64 0, !dbg !39 + %370 = insertelement <2 x i32> %369, i32 %368, i64 1, !dbg !39 + %371 = add <2 x i32> %364, %370, !dbg !39 + %372 = mul nuw nsw <2 x i32> %343, %355, !dbg !42 + %373 = extractelement <2 x i32> %372, i64 0, !dbg !36 + %374 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %373, i32 1, i32 31), !dbg !36 + %375 = extractelement <2 x i32> %372, i64 1, !dbg !36 + %376 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %375, i32 1, i32 31), !dbg !36 + %377 = insertelement <2 x i32> poison, i32 %374, i64 0, !dbg !39 + %378 = insertelement <2 x i32> %377, i32 %376, i64 1, !dbg !39 + %379 = add <2 x i32> %372, %378, !dbg !39 + %380 = icmp slt <2 x i32> %353, %363, !dbg !27 + %381 = icmp eq <2 x i32> %353, %363, !dbg !43 + %382 = icmp sgt <2 x i32> %371, %379, !dbg !44 + %383 = and <2 x i1> %381, %382, !dbg !45 + %384 = or <2 x i1> %380, %383, !dbg !46 + %385 = xor <2 x i32> %353, %363, !dbg !29 + %386 = select <2 x i1> %384, <2 x i32> %385, <2 x i32> zeroinitializer, !dbg !30 + %387 = xor <2 x i32> %386, %340, !dbg !31 + %388 = xor <2 x i32> %371, %379, !dbg !32 + %389 = select <2 x i1> %384, <2 x i32> %388, <2 x i32> zeroinitializer, !dbg !33 + %390 = xor <2 x i32> %389, %343, !dbg !34 + %391 = extractelement <2 x i32> %387, i64 0, !dbg !27 + %392 = extractelement <2 x i32> %387, i64 1, !dbg !27 + %393 = icmp slt i32 %391, %392, !dbg !27 + %394 = icmp eq i32 %391, %392, !dbg !43 + %395 = extractelement <2 x i32> %390, i64 0, !dbg !44 + %396 = extractelement <2 x i32> %390, i64 1, !dbg !44 + %397 = icmp sgt i32 %395, %396, !dbg !44 + %398 = and i1 %394, %397, !dbg !45 + %399 = or i1 %393, %398, !dbg !46 + %400 = xor i32 %395, %396, !dbg !32 + %401 = select i1 %399, i32 %400, i32 0, !dbg !33 + %402 = xor i32 %401, %395, !dbg !34 + %403 = xor i32 %401, %396, !dbg !34 + %404 = icmp eq i64 %27, 16384, !dbg !47 + %405 = icmp eq i64 %28, 16384, !dbg !47 + %406 = zext i1 %404 to i32, !dbg !21 + %407 = zext i1 %405 to i32, !dbg !21 + %408 = xor i1 %404, true, !dbg !48 + %409 = and i1 %405, %408, !dbg !48 + %410 = xor i1 %409, %43, !dbg !50 + %411 = xor i32 %406, %407, !dbg !51 + %412 = select i1 %410, i32 %411, i32 0, !dbg !52 + %413 = xor i32 %412, %406, !dbg !53 + %414 = xor i32 %412, %407, !dbg !53 + %415 = select i1 %410, i32 %49, i32 0, !dbg !54 + %416 = xor i32 %415, %20, !dbg !55 + %417 = xor i32 %415, %21, !dbg !55 + %418 = mul nuw nsw i32 %413, %38, !dbg !56 + %419 = mul nuw nsw i32 %414, %38, !dbg !56 + %420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %418, i32 1, i32 31), !dbg !57 + %421 = add i32 %420, %418, !dbg !58 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %419, i32 1, i32 31), !dbg !57 + %423 = add i32 %422, %419, !dbg !58 + %424 = mul nuw nsw i32 %413, %35, !dbg !59 + %425 = mul nuw nsw i32 %414, %35, !dbg !59 + %426 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !57 + %427 = add i32 %426, %424, !dbg !58 + %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %425, i32 1, i32 31), !dbg !57 + %429 = add i32 %428, %425, !dbg !58 + %430 = mul nuw nsw i32 %416, %38, !dbg !60 + %431 = mul nuw nsw i32 %417, %38, !dbg !60 + %432 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %430, i32 1, i32 31), !dbg !57 + %433 = add i32 %432, %430, !dbg !58 + %434 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %431, i32 1, i32 31), !dbg !57 + %435 = add i32 %434, %431, !dbg !58 + %436 = mul nuw nsw i32 %416, %35, !dbg !61 + %437 = mul nuw nsw i32 %417, %35, !dbg !61 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %436, i32 1, i32 31), !dbg !57 + %439 = add i32 %438, %436, !dbg !58 + %440 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 1, i32 31), !dbg !57 + %441 = add i32 %440, %437, !dbg !58 + %442 = insertelement <2 x i32> poison, i32 %421, i64 0, !dbg !50 + %443 = insertelement <2 x i32> %442, i32 %423, i64 1, !dbg !50 + %444 = insertelement <2 x i32> poison, i32 %427, i64 0, !dbg !50 + %445 = insertelement <2 x i32> %444, i32 %429, i64 1, !dbg !50 + %446 = icmp sge <2 x i32> %443, %445, !dbg !50 + %447 = icmp ne <2 x i32> %443, %445, !dbg !50 + %448 = insertelement <2 x i32> poison, i32 %433, i64 0, !dbg !50 + %449 = insertelement <2 x i32> %448, i32 %435, i64 1, !dbg !50 + %450 = insertelement <2 x i32> poison, i32 %439, i64 0, !dbg !50 + %451 = insertelement <2 x i32> %450, i32 %441, i64 1, !dbg !50 + %452 = icmp sle <2 x i32> %449, %451, !dbg !50 + %453 = or <2 x i1> %447, %452, !dbg !50 + %454 = and <2 x i1> %446, %453, !dbg !50 + %455 = xor <2 x i1> %454, %92, !dbg !50 + %456 = insertelement <2 x i32> %451, i32 %429, i64 1, !dbg !51 + %457 = insertelement <2 x i32> %449, i32 %423, i64 1, !dbg !51 + %458 = xor <2 x i32> %456, %457, !dbg !51 + %459 = insertelement <2 x i32> poison, i32 %441, i64 0, !dbg !51 + %460 = insertelement <2 x i32> %459, i32 %427, i64 1, !dbg !51 + %461 = insertelement <2 x i32> poison, i32 %435, i64 0, !dbg !51 + %462 = insertelement <2 x i32> %461, i32 %421, i64 1, !dbg !51 + %463 = xor <2 x i32> %460, %462, !dbg !51 + %464 = select <2 x i1> %455, <2 x i32> zeroinitializer, <2 x i32> %458, !dbg !52 + %465 = shufflevector <2 x i1> %455, <2 x i1> poison, <2 x i32> , !dbg !52 + %466 = select <2 x i1> %465, <2 x i32> zeroinitializer, <2 x i32> %463, !dbg !52 + %467 = insertelement <2 x i32> poison, i32 %416, i64 0, !dbg !53 + %468 = insertelement <2 x i32> %467, i32 %414, i64 1, !dbg !53 + %469 = xor <2 x i32> %464, %468, !dbg !53 + %470 = insertelement <2 x i32> poison, i32 %417, i64 0, !dbg !53 + %471 = insertelement <2 x i32> %470, i32 %413, i64 1, !dbg !53 + %472 = xor <2 x i32> %466, %471, !dbg !53 + %473 = extractelement <2 x i32> %469, i64 1, !dbg !50 + %474 = extractelement <2 x i32> %472, i64 1, !dbg !50 + %475 = icmp sge i32 %474, %473, !dbg !50 + %476 = icmp ne <2 x i32> %469, %472, !dbg !50 + %477 = icmp sle <2 x i32> %469, %472, !dbg !50 + %shift = shufflevector <2 x i1> %476, <2 x i1> poison, <2 x i32> , !dbg !50 + %foldExtExtBinop = or <2 x i1> %shift, %477, !dbg !50 + %478 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !50 + %479 = and i1 %475, %478, !dbg !50 + %.not12 = xor i1 %479, %77, !dbg !50 + %480 = xor i32 %473, %474, !dbg !51 + %481 = select i1 %.not12, i32 0, i32 %480, !dbg !52 + %482 = xor i32 %481, %474, !dbg !53 + %483 = xor i32 %481, %473, !dbg !53 + %484 = extractelement <2 x i32> %469, i64 0, !dbg !62 + %485 = extractelement <2 x i32> %472, i64 0, !dbg !62 + %486 = xor i32 %485, %484, !dbg !62 + %487 = select i1 %.not12, i32 0, i32 %486, !dbg !54 + %488 = xor i32 %487, %484, !dbg !55 + %489 = xor i32 %487, %485, !dbg !55 + %490 = mul nuw nsw i32 %482, %39, !dbg !56 + %491 = mul nuw nsw i32 %483, %39, !dbg !56 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %490, i32 2, i32 31), !dbg !57 + %493 = add i32 %490, %492, !dbg !58 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 2, i32 31), !dbg !57 + %495 = add i32 %491, %494, !dbg !58 + %496 = mul nuw nsw i32 %482, %.lobit, !dbg !59 + %497 = mul nuw nsw i32 %483, %.lobit, !dbg !59 + %498 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 2, i32 31), !dbg !57 + %499 = add i32 %496, %498, !dbg !58 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %497, i32 2, i32 31), !dbg !57 + %501 = add i32 %497, %500, !dbg !58 + %502 = mul nuw nsw i32 %488, %39, !dbg !60 + %503 = mul nuw nsw i32 %489, %39, !dbg !60 + %504 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %502, i32 2, i32 31), !dbg !57 + %505 = add i32 %502, %504, !dbg !58 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %503, i32 2, i32 31), !dbg !57 + %507 = add i32 %503, %506, !dbg !58 + %508 = mul nuw nsw i32 %488, %.lobit, !dbg !61 + %509 = mul nuw nsw i32 %489, %.lobit, !dbg !61 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %508, i32 2, i32 31), !dbg !57 + %511 = add i32 %508, %510, !dbg !58 + %512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 2, i32 31), !dbg !57 + %513 = add i32 %509, %512, !dbg !58 + %514 = insertelement <2 x i32> poison, i32 %493, i64 0, !dbg !50 + %515 = insertelement <2 x i32> %514, i32 %495, i64 1, !dbg !50 + %516 = insertelement <2 x i32> poison, i32 %499, i64 0, !dbg !50 + %517 = insertelement <2 x i32> %516, i32 %501, i64 1, !dbg !50 + %518 = icmp sge <2 x i32> %515, %517, !dbg !50 + %519 = icmp ne <2 x i32> %515, %517, !dbg !50 + %520 = xor i32 %493, %499, !dbg !51 + %521 = xor i32 %495, %501, !dbg !51 + %522 = insertelement <2 x i32> poison, i32 %505, i64 0, !dbg !50 + %523 = insertelement <2 x i32> %522, i32 %507, i64 1, !dbg !50 + %524 = insertelement <2 x i32> poison, i32 %511, i64 0, !dbg !50 + %525 = insertelement <2 x i32> %524, i32 %513, i64 1, !dbg !50 + %526 = icmp sle <2 x i32> %523, %525, !dbg !50 + %527 = or <2 x i1> %519, %526, !dbg !50 + %528 = and <2 x i1> %518, %527, !dbg !50 + %529 = xor <2 x i1> %528, %166, !dbg !50 + %530 = extractelement <2 x i1> %529, i64 0, !dbg !52 + %531 = select i1 %530, i32 0, i32 %520, !dbg !52 + %532 = extractelement <2 x i1> %529, i64 1, !dbg !52 + %533 = select i1 %532, i32 0, i32 %521, !dbg !52 + %534 = xor i32 %531, %482, !dbg !53 + %535 = xor i32 %533, %483, !dbg !53 + %536 = xor <2 x i32> %523, %525, !dbg !62 + %537 = select <2 x i1> %529, <2 x i32> zeroinitializer, <2 x i32> %536, !dbg !54 + %538 = insertelement <2 x i32> poison, i32 %488, i64 0, !dbg !55 + %539 = insertelement <2 x i32> %538, i32 %489, i64 1, !dbg !55 + %540 = xor <2 x i32> %537, %539, !dbg !55 + %541 = mul nuw nsw i32 %534, %38, !dbg !56 + %542 = mul nuw nsw i32 %535, %38, !dbg !56 + %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 1, i32 31), !dbg !57 + %544 = add i32 %541, %543, !dbg !58 + %545 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 1, i32 31), !dbg !57 + %546 = add i32 %542, %545, !dbg !58 + %547 = mul nuw nsw i32 %534, %35, !dbg !59 + %548 = mul nuw nsw i32 %535, %35, !dbg !59 + %549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %547, i32 1, i32 31), !dbg !57 + %550 = add i32 %547, %549, !dbg !58 + %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %548, i32 1, i32 31), !dbg !57 + %552 = add i32 %548, %551, !dbg !58 + %553 = extractelement <2 x i32> %540, i64 0, !dbg !61 + %554 = mul nuw nsw i32 %553, %38, !dbg !60 + %555 = extractelement <2 x i32> %540, i64 1, !dbg !61 + %556 = mul nuw nsw i32 %555, %38, !dbg !60 + %557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %554, i32 1, i32 31), !dbg !57 + %558 = add i32 %554, %557, !dbg !58 + %559 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %556, i32 1, i32 31), !dbg !57 + %560 = add i32 %556, %559, !dbg !58 + %561 = mul nuw nsw i32 %553, %35, !dbg !61 + %562 = mul nuw nsw i32 %555, %35, !dbg !61 + %563 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %561, i32 1, i32 31), !dbg !57 + %564 = add i32 %561, %563, !dbg !58 + %565 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %562, i32 1, i32 31), !dbg !57 + %566 = add i32 %562, %565, !dbg !58 + %567 = insertelement <2 x i32> poison, i32 %544, i64 0, !dbg !50 + %568 = insertelement <2 x i32> %567, i32 %546, i64 1, !dbg !50 + %569 = insertelement <2 x i32> poison, i32 %550, i64 0, !dbg !50 + %570 = insertelement <2 x i32> %569, i32 %552, i64 1, !dbg !50 + %571 = icmp sge <2 x i32> %568, %570, !dbg !50 + %572 = icmp ne <2 x i32> %568, %570, !dbg !50 + %573 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !50 + %574 = insertelement <2 x i32> %573, i32 %560, i64 1, !dbg !50 + %575 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !50 + %576 = insertelement <2 x i32> %575, i32 %566, i64 1, !dbg !50 + %577 = icmp sle <2 x i32> %574, %576, !dbg !50 + %578 = or <2 x i1> %572, %577, !dbg !50 + %579 = and <2 x i1> %571, %578, !dbg !50 + %580 = xor <2 x i1> %579, %166, !dbg !50 + %581 = xor <2 x i32> %568, %570, !dbg !51 + %582 = select <2 x i1> %580, <2 x i32> zeroinitializer, <2 x i32> %581, !dbg !52 + %583 = insertelement <2 x i32> poison, i32 %534, i64 0, !dbg !53 + %584 = insertelement <2 x i32> %583, i32 %535, i64 1, !dbg !53 + %585 = xor <2 x i32> %582, %584, !dbg !53 + %586 = xor <2 x i32> %574, %576, !dbg !62 + %587 = select <2 x i1> %580, <2 x i32> zeroinitializer, <2 x i32> %586, !dbg !54 + %588 = xor <2 x i32> %587, %540, !dbg !55 + %589 = extractelement <2 x i32> %585, i64 0, !dbg !50 + %590 = extractelement <2 x i32> %585, i64 1, !dbg !50 + %591 = icmp sge i32 %589, %590, !dbg !50 + %592 = icmp ne i32 %589, %590, !dbg !50 + %593 = extractelement <2 x i32> %588, i64 0, !dbg !50 + %594 = extractelement <2 x i32> %588, i64 1, !dbg !50 + %595 = icmp sle i32 %593, %594, !dbg !50 + %596 = or i1 %592, %595, !dbg !50 + %597 = and i1 %591, %596, !dbg !50 + %.not17 = xor i1 %597, %123, !dbg !50 + %598 = xor i32 %589, %590, !dbg !51 + %599 = select i1 %.not17, i32 0, i32 %598, !dbg !52 + %600 = xor i32 %599, %589, !dbg !53 + %601 = xor i32 %599, %590, !dbg !53 + %602 = xor i32 %593, %594, !dbg !62 + %603 = select i1 %.not17, i32 0, i32 %602, !dbg !54 + %604 = xor i32 %603, %593, !dbg !55 + %605 = xor i32 %603, %594, !dbg !55 + %606 = mul nuw nsw i32 %600, %40, !dbg !56 + %607 = mul nuw nsw i32 %601, %40, !dbg !56 + %608 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %606, i32 4, i32 31), !dbg !57 + %609 = add i32 %606, %608, !dbg !58 + %610 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %607, i32 4, i32 31), !dbg !57 + %611 = add i32 %607, %610, !dbg !58 + %612 = mul nuw nsw i32 %600, %.lobit1, !dbg !59 + %613 = mul nuw nsw i32 %601, %.lobit1, !dbg !59 + %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %612, i32 4, i32 31), !dbg !57 + %615 = add i32 %612, %614, !dbg !58 + %616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !57 + %617 = add i32 %613, %616, !dbg !58 + %618 = mul nuw nsw i32 %604, %40, !dbg !60 + %619 = mul nuw nsw i32 %605, %40, !dbg !60 + %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %618, i32 4, i32 31), !dbg !57 + %621 = add i32 %618, %620, !dbg !58 + %622 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !57 + %623 = add i32 %619, %622, !dbg !58 + %624 = mul nuw nsw i32 %604, %.lobit1, !dbg !61 + %625 = mul nuw nsw i32 %605, %.lobit1, !dbg !61 + %626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %624, i32 4, i32 31), !dbg !57 + %627 = add i32 %624, %626, !dbg !58 + %628 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 4, i32 31), !dbg !57 + %629 = add i32 %625, %628, !dbg !58 + %630 = insertelement <2 x i32> poison, i32 %609, i64 0, !dbg !48 + %631 = insertelement <2 x i32> %630, i32 %611, i64 1, !dbg !48 + %632 = insertelement <2 x i32> poison, i32 %615, i64 0, !dbg !48 + %633 = insertelement <2 x i32> %632, i32 %617, i64 1, !dbg !48 + %634 = icmp slt <2 x i32> %631, %633, !dbg !48 + %635 = icmp eq <2 x i32> %631, %633, !dbg !63 + %636 = insertelement <2 x i32> poison, i32 %621, i64 0, !dbg !64 + %637 = insertelement <2 x i32> %636, i32 %623, i64 1, !dbg !64 + %638 = insertelement <2 x i32> poison, i32 %627, i64 0, !dbg !64 + %639 = insertelement <2 x i32> %638, i32 %629, i64 1, !dbg !64 + %640 = icmp sgt <2 x i32> %637, %639, !dbg !64 + %641 = and <2 x i1> %635, %640, !dbg !65 + %642 = or <2 x i1> %634, %641, !dbg !66 + %643 = xor <2 x i32> %631, %633, !dbg !51 + %644 = select <2 x i1> %642, <2 x i32> %643, <2 x i32> zeroinitializer, !dbg !52 + %645 = insertelement <2 x i32> poison, i32 %600, i64 0, !dbg !53 + %646 = insertelement <2 x i32> %645, i32 %601, i64 1, !dbg !53 + %647 = xor <2 x i32> %644, %646, !dbg !53 + %648 = xor i32 %621, %627, !dbg !62 + %649 = xor i32 %623, %629, !dbg !62 + %650 = extractelement <2 x i1> %642, i64 0, !dbg !54 + %651 = select i1 %650, i32 %648, i32 0, !dbg !54 + %652 = extractelement <2 x i1> %642, i64 1, !dbg !54 + %653 = select i1 %652, i32 %649, i32 0, !dbg !54 + %654 = xor i32 %651, %604, !dbg !55 + %655 = xor i32 %653, %605, !dbg !55 + %656 = extractelement <2 x i32> %647, i64 0, !dbg !59 + %657 = mul nuw nsw i32 %656, %39, !dbg !56 + %658 = extractelement <2 x i32> %647, i64 1, !dbg !59 + %659 = mul nuw nsw i32 %658, %39, !dbg !56 + %660 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %657, i32 2, i32 31), !dbg !57 + %661 = add i32 %657, %660, !dbg !58 + %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %659, i32 2, i32 31), !dbg !57 + %663 = add i32 %659, %662, !dbg !58 + %664 = mul nuw nsw i32 %656, %.lobit, !dbg !59 + %665 = mul nuw nsw i32 %658, %.lobit, !dbg !59 + %666 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %664, i32 2, i32 31), !dbg !57 + %667 = add i32 %664, %666, !dbg !58 + %668 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %665, i32 2, i32 31), !dbg !57 + %669 = add i32 %665, %668, !dbg !58 + %670 = mul nuw nsw i32 %654, %39, !dbg !60 + %671 = mul nuw nsw i32 %655, %39, !dbg !60 + %672 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %670, i32 2, i32 31), !dbg !57 + %673 = add i32 %670, %672, !dbg !58 + %674 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %671, i32 2, i32 31), !dbg !57 + %675 = add i32 %671, %674, !dbg !58 + %676 = mul nuw nsw i32 %654, %.lobit, !dbg !61 + %677 = mul nuw nsw i32 %655, %.lobit, !dbg !61 + %678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 2, i32 31), !dbg !57 + %679 = add i32 %676, %678, !dbg !58 + %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 2, i32 31), !dbg !57 + %681 = add i32 %677, %680, !dbg !58 + %682 = insertelement <2 x i32> poison, i32 %661, i64 0, !dbg !48 + %683 = insertelement <2 x i32> %682, i32 %663, i64 1, !dbg !48 + %684 = insertelement <2 x i32> poison, i32 %667, i64 0, !dbg !48 + %685 = insertelement <2 x i32> %684, i32 %669, i64 1, !dbg !48 + %686 = icmp slt <2 x i32> %683, %685, !dbg !48 + %687 = icmp eq <2 x i32> %683, %685, !dbg !63 + %688 = insertelement <2 x i32> poison, i32 %673, i64 0, !dbg !64 + %689 = insertelement <2 x i32> %688, i32 %675, i64 1, !dbg !64 + %690 = insertelement <2 x i32> poison, i32 %679, i64 0, !dbg !64 + %691 = insertelement <2 x i32> %690, i32 %681, i64 1, !dbg !64 + %692 = icmp sgt <2 x i32> %689, %691, !dbg !64 + %693 = and <2 x i1> %687, %692, !dbg !65 + %694 = or <2 x i1> %686, %693, !dbg !66 + %695 = xor <2 x i32> %683, %685, !dbg !51 + %696 = select <2 x i1> %694, <2 x i32> %695, <2 x i32> zeroinitializer, !dbg !52 + %697 = xor <2 x i32> %696, %647, !dbg !53 + %698 = xor i32 %673, %679, !dbg !62 + %699 = xor i32 %675, %681, !dbg !62 + %700 = extractelement <2 x i1> %694, i64 0, !dbg !54 + %701 = select i1 %700, i32 %698, i32 0, !dbg !54 + %702 = extractelement <2 x i1> %694, i64 1, !dbg !54 + %703 = select i1 %702, i32 %699, i32 0, !dbg !54 + %704 = xor i32 %701, %654, !dbg !55 + %705 = xor i32 %703, %655, !dbg !55 + %706 = extractelement <2 x i32> %697, i64 0, !dbg !59 + %707 = mul nuw nsw i32 %706, %38, !dbg !56 + %708 = extractelement <2 x i32> %697, i64 1, !dbg !59 + %709 = mul nuw nsw i32 %708, %38, !dbg !56 + %710 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 1, i32 31), !dbg !57 + %711 = add i32 %707, %710, !dbg !58 + %712 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %709, i32 1, i32 31), !dbg !57 + %713 = add i32 %709, %712, !dbg !58 + %714 = mul nuw nsw i32 %706, %35, !dbg !59 + %715 = mul nuw nsw i32 %708, %35, !dbg !59 + %716 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %714, i32 1, i32 31), !dbg !57 + %717 = add i32 %714, %716, !dbg !58 + %718 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %715, i32 1, i32 31), !dbg !57 + %719 = add i32 %715, %718, !dbg !58 + %720 = mul nuw nsw i32 %704, %38, !dbg !60 + %721 = mul nuw nsw i32 %705, %38, !dbg !60 + %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %720, i32 1, i32 31), !dbg !57 + %723 = add i32 %720, %722, !dbg !58 + %724 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !57 + %725 = add i32 %721, %724, !dbg !58 + %726 = mul nuw nsw i32 %704, %35, !dbg !61 + %727 = mul nuw nsw i32 %705, %35, !dbg !61 + %728 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %726, i32 1, i32 31), !dbg !57 + %729 = add i32 %726, %728, !dbg !58 + %730 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %727, i32 1, i32 31), !dbg !57 + %731 = add i32 %727, %730, !dbg !58 + %732 = insertelement <2 x i32> poison, i32 %711, i64 0, !dbg !48 + %733 = insertelement <2 x i32> %732, i32 %713, i64 1, !dbg !48 + %734 = insertelement <2 x i32> poison, i32 %717, i64 0, !dbg !48 + %735 = insertelement <2 x i32> %734, i32 %719, i64 1, !dbg !48 + %736 = icmp slt <2 x i32> %733, %735, !dbg !48 + %737 = icmp eq <2 x i32> %733, %735, !dbg !63 + %738 = insertelement <2 x i32> poison, i32 %723, i64 0, !dbg !64 + %739 = insertelement <2 x i32> %738, i32 %725, i64 1, !dbg !64 + %740 = insertelement <2 x i32> poison, i32 %729, i64 0, !dbg !64 + %741 = insertelement <2 x i32> %740, i32 %731, i64 1, !dbg !64 + %742 = icmp sgt <2 x i32> %739, %741, !dbg !64 + %743 = and <2 x i1> %737, %742, !dbg !65 + %744 = or <2 x i1> %736, %743, !dbg !66 + %745 = xor <2 x i32> %733, %735, !dbg !51 + %746 = select <2 x i1> %744, <2 x i32> %745, <2 x i32> zeroinitializer, !dbg !52 + %747 = xor <2 x i32> %746, %697, !dbg !53 + %748 = xor i32 %723, %729, !dbg !62 + %749 = xor i32 %725, %731, !dbg !62 + %750 = extractelement <2 x i1> %744, i64 0, !dbg !54 + %751 = select i1 %750, i32 %748, i32 0, !dbg !54 + %752 = extractelement <2 x i1> %744, i64 1, !dbg !54 + %753 = select i1 %752, i32 %749, i32 0, !dbg !54 + %754 = xor i32 %751, %704, !dbg !55 + %755 = xor i32 %753, %705, !dbg !55 + %756 = extractelement <2 x i32> %747, i64 0, !dbg !48 + %757 = extractelement <2 x i32> %747, i64 1, !dbg !48 + %758 = icmp slt i32 %756, %757, !dbg !48 + %759 = icmp eq i32 %756, %757, !dbg !63 + %760 = icmp sgt i32 %754, %755, !dbg !64 + %761 = and i1 %759, %760, !dbg !65 + %762 = or i1 %758, %761, !dbg !66 + %763 = xor i32 %754, %755, !dbg !62 + %764 = select i1 %762, i32 %763, i32 0, !dbg !54 + %765 = xor i32 %764, %754, !dbg !55 + %766 = xor i32 %764, %755, !dbg !55 + %narrow = select i1 %19, i1 %30, i1 false, !dbg !67 + %767 = zext i1 %narrow to i64, !dbg !67 + %narrow18 = select i1 %19, i1 %32, i1 false, !dbg !67 + %768 = zext i1 %narrow18 to i64, !dbg !67 + %769 = add nuw nsw i64 %767, %768, !dbg !68 + %770 = trunc nuw nsw i64 %769 to i32, !dbg !70 + %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 4, i32 31), !dbg !70 + %772 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 4, i32 31), !dbg !70 + %773 = insertelement <2 x i32> poison, i32 %771, i64 0, !dbg !70 + %774 = insertelement <2 x i32> %773, i32 %772, i64 1, !dbg !70 + %775 = bitcast <2 x i32> %774 to i64, !dbg !70 + %776 = add i64 %769, %775, !dbg !68 + %extelt.offset = lshr i64 %776, 32, !dbg !70 + %777 = trunc nuw i64 %extelt.offset to i32, !dbg !70 + %778 = trunc i64 %776 to i32, !dbg !70 + %779 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %778, i32 2, i32 31), !dbg !70 + %780 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %777, i32 2, i32 31), !dbg !70 + %781 = insertelement <2 x i32> poison, i32 %779, i64 0, !dbg !70 + %782 = insertelement <2 x i32> %781, i32 %780, i64 1, !dbg !70 + %783 = bitcast <2 x i32> %782 to i64, !dbg !70 + %784 = add i64 %776, %783, !dbg !68 + %extelt.offset19 = lshr i64 %784, 32, !dbg !70 + %785 = trunc nuw i64 %extelt.offset19 to i32, !dbg !70 + %786 = trunc i64 %784 to i32, !dbg !70 + %787 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %786, i32 1, i32 31), !dbg !70 + %788 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !70 + %789 = insertelement <2 x i32> poison, i32 %787, i64 0, !dbg !70 + %790 = insertelement <2 x i32> %789, i32 %788, i64 1, !dbg !70 + %791 = bitcast <2 x i32> %790 to i64, !dbg !70 + %792 = add i64 %784, %791, !dbg !68 + %793 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %15, !dbg !71 + %794 = insertelement <1 x i64> poison, i64 %792, i64 0, !dbg !71 + store <1 x i64> %794, ptr addrspace(3) %793, align 8, !dbg !71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !71 + %795 = shl nuw nsw i32 %17, 3, !dbg !71 + %796 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %795, !dbg !71 + %797 = load i64, ptr addrspace(3) %796, align 8, !dbg !71 + %narrow20 = select i1 %19, i1 %404, i1 false, !dbg !72 + %798 = zext i1 %narrow20 to i64, !dbg !72 + %narrow21 = select i1 %19, i1 %405, i1 false, !dbg !72 + %799 = zext i1 %narrow21 to i64, !dbg !72 + %800 = add nuw nsw i64 %798, %799, !dbg !73 + %801 = trunc nuw nsw i64 %800 to i32, !dbg !75 + %802 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %801, i32 4, i32 31), !dbg !75 + %803 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 4, i32 31), !dbg !75 + %804 = insertelement <2 x i32> poison, i32 %802, i64 0, !dbg !75 + %805 = insertelement <2 x i32> %804, i32 %803, i64 1, !dbg !75 + %806 = bitcast <2 x i32> %805 to i64, !dbg !75 + %807 = add i64 %800, %806, !dbg !73 + %extelt.offset23 = lshr i64 %807, 32, !dbg !75 + %808 = trunc nuw i64 %extelt.offset23 to i32, !dbg !75 + %809 = trunc i64 %807 to i32, !dbg !75 + %810 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %809, i32 2, i32 31), !dbg !75 + %811 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %808, i32 2, i32 31), !dbg !75 + %812 = insertelement <2 x i32> poison, i32 %810, i64 0, !dbg !75 + %813 = insertelement <2 x i32> %812, i32 %811, i64 1, !dbg !75 + %814 = bitcast <2 x i32> %813 to i64, !dbg !75 + %815 = add i64 %807, %814, !dbg !73 + %extelt.offset24 = lshr i64 %815, 32, !dbg !75 + %816 = trunc nuw i64 %extelt.offset24 to i32, !dbg !75 + %817 = trunc i64 %815 to i32, !dbg !75 + %818 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %817, i32 1, i32 31), !dbg !75 + %819 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %816, i32 1, i32 31), !dbg !75 + %820 = insertelement <2 x i32> poison, i32 %818, i64 0, !dbg !75 + %821 = insertelement <2 x i32> %820, i32 %819, i64 1, !dbg !75 + %822 = bitcast <2 x i32> %821 to i64, !dbg !75 + %823 = add i64 %815, %822, !dbg !73 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !76 + %824 = insertelement <1 x i64> poison, i64 %823, i64 0, !dbg !76 + store <1 x i64> %824, ptr addrspace(3) %793, align 8, !dbg !76 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !76 + %825 = load i64, ptr addrspace(3) %796, align 8, !dbg !76 + %826 = trunc i64 %792 to i32, !dbg !71 + %827 = icmp slt i32 %20, %826, !dbg !77 + %828 = icmp slt i32 %21, %826, !dbg !77 + %829 = select i1 %827, i32 %402, i32 16, !dbg !78 + %830 = select i1 %828, i32 %403, i32 16, !dbg !78 + %831 = add i32 %829, 17, !dbg !79 + %832 = add i32 %830, 17, !dbg !79 + %833 = icmp slt i32 %829, 0, !dbg !80 + %834 = icmp slt i32 %830, 0, !dbg !80 + %835 = select i1 %833, i32 %831, i32 %829, !dbg !81 + %836 = select i1 %834, i32 %832, i32 %830, !dbg !81 + %837 = icmp ugt i32 %835, 16, !dbg !82 + %838 = icmp ugt i32 %836, 16, !dbg !82 + %.not2629 = or i1 %837, %838, !dbg !83 + %839 = and i1 %19, %.not2629, !dbg !83 + br i1 %839, label %840, label %841, !dbg !83 + +840: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !83 + unreachable, !dbg !83 + +841: ; preds = %11 + %842 = trunc i64 %823 to i32, !dbg !76 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !83 + %843 = icmp slt i32 %20, %842, !dbg !84 + %844 = icmp slt i32 %21, %842, !dbg !84 + %845 = select i1 %843, i32 %765, i32 16, !dbg !85 + %846 = select i1 %844, i32 %766, i32 16, !dbg !85 + %847 = add i32 %845, 17, !dbg !86 + %848 = add i32 %846, 17, !dbg !86 + %849 = icmp slt i32 %845, 0, !dbg !87 + %850 = icmp slt i32 %846, 0, !dbg !87 + %851 = select i1 %849, i32 %847, i32 %845, !dbg !88 + %852 = select i1 %850, i32 %848, i32 %846, !dbg !88 + %853 = icmp ugt i32 %851, 16, !dbg !89 + %854 = icmp ugt i32 %852, 16, !dbg !89 + %.not3134 = or i1 %853, %854, !dbg !90 + %855 = and i1 %19, %.not3134, !dbg !90 + br i1 %855, label %856, label %857, !dbg !90 + +856: ; preds = %841 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !90 + unreachable, !dbg !90 + +857: ; preds = %841 + %858 = trunc i64 %825 to i32, !dbg !76 + %859 = trunc i64 %797 to i32, !dbg !71 + %860 = or disjoint i32 %13, %17, !dbg !13 + %861 = icmp slt i32 %860, 128, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !90 + %862 = sext i32 %860 to i64, !dbg !91 + %863 = getelementptr i32, ptr addrspace(1) %1, i64 %862, !dbg !91 + %864 = icmp eq i32 %15, 0, !dbg !92 + %865 = and i1 %864, %861, !dbg !92 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %859, ptr addrspace(1) %863, i1 %865) #5, !dbg !92 + %866 = getelementptr i32, ptr addrspace(1) %2, i64 %862, !dbg !93 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %858, ptr addrspace(1) %866, i1 %865) #5, !dbg !94 + %867 = getelementptr i32, ptr addrspace(1) %3, i64 %24, !dbg !95 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %402, i32 %403, ptr addrspace(1) %867, i1 %19) #5, !dbg !96 + %868 = mul i32 %18, 17, !dbg !97 + %869 = add i32 %835, %868, !dbg !98 + %870 = add i32 %836, %868, !dbg !98 + %871 = sext i32 %869 to i64, !dbg !99 + %872 = getelementptr i32, ptr addrspace(1) %4, i64 %871, !dbg !99 + %873 = sext i32 %870 to i64, !dbg !99 + %874 = getelementptr i32, ptr addrspace(1) %4, i64 %873, !dbg !99 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %875 = ptrtoint ptr addrspace(1) %872 to i64, !dbg !100 + %876 = ptrtoint ptr addrspace(1) %874 to i64, !dbg !100 + %877 = shl nuw nsw i32 %14, 5, !dbg !100 + %878 = and i32 %877, 96, !dbg !100 + %879 = and i32 %14, 48, !dbg !100 + %880 = shl nuw nsw i32 %879, 3, !dbg !100 + %881 = shl nuw nsw i32 %14, 1, !dbg !100 + %882 = and i32 %881, 120, !dbg !100 + %883 = or disjoint i32 %878, %880, !dbg !100 + %884 = xor i32 %883, %882, !dbg !100 + %885 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %884, !dbg !100 + %886 = insertelement <1 x i64> poison, i64 %875, i64 0, !dbg !100 + store <1 x i64> %886, ptr addrspace(3) %885, align 8, !dbg !100 + %887 = xor i32 %884, 520, !dbg !100 + %888 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %887, !dbg !100 + %889 = insertelement <1 x i64> poison, i64 %876, i64 0, !dbg !100 + store <1 x i64> %889, ptr addrspace(3) %888, align 8, !dbg !100 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !100 + %890 = shl nuw nsw i32 %14, 6, !dbg !100 + %891 = and i32 %890, 384, !dbg !100 + %892 = shl nuw nsw i32 %17, 4, !dbg !100 + %893 = shl nuw nsw i32 %879, 1, !dbg !100 + %894 = and i32 %14, 8, !dbg !100 + %895 = icmp eq i32 %894, 0, !dbg !100 + %896 = select i1 %895, i32 0, i32 520, !dbg !100 + %897 = or disjoint i32 %891, %892, !dbg !100 + %898 = xor i32 %897, %893, !dbg !100 + %899 = or disjoint i32 %898, %896, !dbg !100 + %900 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %899, !dbg !100 + %901 = load i64, ptr addrspace(3) %900, align 8, !dbg !100 + %902 = xor i32 %899, 8, !dbg !100 + %903 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %902, !dbg !100 + %904 = load i64, ptr addrspace(3) %903, align 8, !dbg !100 + %905 = inttoptr i64 %901 to ptr addrspace(1), !dbg !100 + %906 = inttoptr i64 %904 to ptr addrspace(1), !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %905, i1 %861) #5, !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %906, i1 %861) #5, !dbg !100 + %907 = getelementptr i32, ptr addrspace(1) %5, i64 %24, !dbg !101 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %765, i32 %766, ptr addrspace(1) %907, i1 %19) #5, !dbg !102 + %908 = add i32 %851, %868, !dbg !103 + %909 = add i32 %852, %868, !dbg !103 + %910 = sext i32 %908 to i64, !dbg !104 + %911 = getelementptr i32, ptr addrspace(1) %6, i64 %910, !dbg !104 + %912 = sext i32 %909 to i64, !dbg !104 + %913 = getelementptr i32, ptr addrspace(1) %6, i64 %912, !dbg !104 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !105 + %914 = ptrtoint ptr addrspace(1) %911 to i64, !dbg !105 + %915 = ptrtoint ptr addrspace(1) %913 to i64, !dbg !105 + %916 = insertelement <1 x i64> poison, i64 %914, i64 0, !dbg !105 + store <1 x i64> %916, ptr addrspace(3) %885, align 8, !dbg !105 + %917 = insertelement <1 x i64> poison, i64 %915, i64 0, !dbg !105 + store <1 x i64> %917, ptr addrspace(3) %888, align 8, !dbg !105 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !105 + %918 = load i64, ptr addrspace(3) %900, align 8, !dbg !105 + %919 = load i64, ptr addrspace(3) %903, align 8, !dbg !105 + %920 = inttoptr i64 %918 to ptr addrspace(1), !dbg !105 + %921 = inttoptr i64 %919 to ptr addrspace(1), !dbg !105 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %920, i1 %861) #5, !dbg !105 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %921, i1 %861) #5, !dbg !105 + ret void, !dbg !106 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 34, column: 37, scope: !9) +!18 = !DILocation(line: 34, column: 30, scope: !9) +!19 = !DILocation(line: 34, column: 45, scope: !9) +!20 = !DILocation(line: 39, column: 18, scope: !9) +!21 = !DILocation(line: 0, scope: !9) +!22 = !DILocation(line: 627, column: 44, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !9, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 46, column: 71, scope: !9) +!26 = !DILocation(line: 537, column: 21, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !25) +!28 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !25) +!29 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !25) +!30 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !25) +!31 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !25) +!32 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !25) +!33 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !25) +!34 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !25) +!35 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !25) +!36 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !25) +!37 = distinct !DILexicalBlockFile(scope: !9, file: !38, discriminator: 0) +!38 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!39 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !25) +!40 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !25) +!41 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !25) +!42 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !25) +!43 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !25) +!44 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !25) +!45 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !25) +!46 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !25) +!47 = !DILocation(line: 47, column: 20, scope: !9) +!48 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !49) +!49 = !DILocation(line: 51, column: 71, scope: !9) +!50 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !49) +!51 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !49) +!52 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !49) +!53 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !49) +!54 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !49) +!55 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !49) +!56 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !49) +!57 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !49) +!58 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !49) +!59 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !49) +!60 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !49) +!61 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !49) +!62 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !49) +!63 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !49) +!64 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !49) +!65 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !49) +!66 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !49) +!67 = !DILocation(line: 54, column: 35, scope: !9) +!68 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !69) +!69 = !DILocation(line: 55, column: 26, scope: !9) +!70 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !69) +!71 = !DILocation(line: 60, column: 21, scope: !9) +!72 = !DILocation(line: 58, column: 35, scope: !9) +!73 = !DILocation(line: 261, column: 15, scope: !37, inlinedAt: !74) +!74 = !DILocation(line: 59, column: 26, scope: !9) +!75 = !DILocation(line: 291, column: 36, scope: !37, inlinedAt: !74) +!76 = !DILocation(line: 61, column: 21, scope: !9) +!77 = !DILocation(line: 64, column: 19, scope: !9) +!78 = !DILocation(line: 66, column: 35, scope: !9) +!79 = !DILocation(line: 68, column: 20, scope: !9) +!80 = !DILocation(line: 69, column: 20, scope: !9) +!81 = !DILocation(line: 70, column: 35, scope: !9) +!82 = !DILocation(line: 71, column: 38, scope: !9) +!83 = !DILocation(line: 71, column: 63, scope: !9) +!84 = !DILocation(line: 75, column: 19, scope: !9) +!85 = !DILocation(line: 76, column: 35, scope: !9) +!86 = !DILocation(line: 77, column: 20, scope: !9) +!87 = !DILocation(line: 78, column: 20, scope: !9) +!88 = !DILocation(line: 79, column: 35, scope: !9) +!89 = !DILocation(line: 80, column: 38, scope: !9) +!90 = !DILocation(line: 80, column: 63, scope: !9) +!91 = !DILocation(line: 81, column: 25, scope: !9) +!92 = !DILocation(line: 81, column: 37, scope: !9) +!93 = !DILocation(line: 82, column: 25, scope: !9) +!94 = !DILocation(line: 82, column: 37, scope: !9) +!95 = !DILocation(line: 83, column: 25, scope: !9) +!96 = !DILocation(line: 83, column: 47, scope: !9) +!97 = !DILocation(line: 84, column: 52, scope: !9) +!98 = !DILocation(line: 84, column: 49, scope: !9) +!99 = !DILocation(line: 84, column: 25, scope: !9) +!100 = !DILocation(line: 84, column: 85, scope: !9) +!101 = !DILocation(line: 85, column: 25, scope: !9) +!102 = !DILocation(line: 85, column: 47, scope: !9) +!103 = !DILocation(line: 86, column: 49, scope: !9) +!104 = !DILocation(line: 86, column: 25, scope: !9) +!105 = !DILocation(line: 86, column: 85, scope: !9) +!106 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bba020fe68227b272b66017595391e12cea739b3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1774 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<204>; + .reg .b32 %r<600>; + .reg .b64 %rd<78>; + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:28 + mov.u32 %r16, %ctaid.x; + .loc 1 24 33 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:33 + shl.b32 %r1, %r16, 3; + .loc 1 25 44 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 56; + bfe.u32 %r17, %r2, 3, 3; + and.b32 %r4, %r2, 7; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r5, %r17, %r1; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.gt.s32 %p2, %r5, 127; + setp.lt.s32 %p1, %r5, 128; + .loc 1 27 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:27:38 + shl.b32 %r6, %r4, 1; + or.b32 %r7, %r6, 1; + .loc 1 34 40 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:40 + shl.b32 %r18, %r5, 4; + .loc 1 34 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:37 + or.b32 %r19, %r18, %r6; + .loc 1 34 30 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:30 + mad.wide.s32 %rd13, %r19, 8, %rd14; + .loc 1 34 45 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p1 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + .loc 1 39 18 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:39:18 + add.s64 %rd15, %rd11, -1; + setp.gt.u64 %p3, %rd15, 16382; + setp.lt.u64 %p4, %rd15, 16383; + add.s64 %rd16, %rd12, -1; + setp.lt.u64 %p5, %rd16, 16383; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r20, 1, 0, %p4; + selp.b32 %r21, 1, 0, %p5; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r22, %r2, 1; + shr.u32 %r23, %r2, 1; + bfe.u32 %r24, %r2, 1, 1; + shr.u32 %r25, %r2, 2; + bfe.u32 %r26, %r2, 2, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r27, %r22, 1; + xor.b32 %r28, %r24, 1; + xor.b32 %r29, %r26, 1; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p6, %p5, %p3; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ne.b32 %p7, %r22, 0; + xor.pred %p8, %p6, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r30, %r20, %r21; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r31, %r30, 0, %p8; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r32, %r31, %r20; + xor.b32 %r33, %r31, %r21; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r34, %r7, %r6; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r35, %r34, 0, %p8; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r36, %r35, %r6; + xor.b32 %r37, %r35, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r38, %r32, %r27; + mul.lo.s32 %r39, %r33, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r40, %r38, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r41, %r38, %r40; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r42, %r39, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r43, %r39, %r42; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r44, %r32, %r22; + mul.lo.s32 %r45, %r33, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r46, %r44, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r47, %r44, %r46; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r45, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r49, %r45, %r48; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r50, %r36, %r27; + mul.lo.s32 %r51, %r37, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r52, %r50, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r53, %r50, %r52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r55, %r51, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r56, %r36, %r22; + mul.lo.s32 %r57, %r37, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r58, %r56, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r59, %r56, %r58; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r60, %r57, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r61, %r57, %r60; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r62, %r23, 1; + setp.ne.b32 %p9, %r62, 0; + setp.ge.s32 %p10, %r41, %r47; + setp.ge.s32 %p11, %r43, %r49; + setp.ne.b32 %p12, %r43, %r49; + setp.ne.b32 %p13, %r41, %r47; + setp.le.s32 %p14, %r55, %r61; + setp.le.s32 %p15, %r53, %r59; + or.pred %p16, %p13, %p15; + or.pred %p17, %p12, %p14; + and.pred %p18, %p11, %p17; + and.pred %p19, %p10, %p16; + xor.pred %p20, %p19, %p9; + xor.pred %p21, %p18, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r63, %r41, %r47; + xor.b32 %r64, %r43, %r49; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r65, 0, %r64, %p21; + selp.b32 %r66, 0, %r63, %p20; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r67, %r66, %r32; + xor.b32 %r68, %r65, %r33; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r69, %r53, %r59; + xor.b32 %r70, %r55, %r61; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r71, 0, %r70, %p21; + selp.b32 %r72, 0, %r69, %p20; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r73, %r72, %r36; + xor.b32 %r74, %r71, %r37; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p22, %r67, %r68; + setp.ne.b32 %p23, %r67, %r68; + setp.le.s32 %p24, %r73, %r74; + or.pred %p25, %p23, %p24; + and.pred %p26, %p22, %p25; + xor.pred %p27, %p26, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r75, %r67, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r76, 0, %r75, %p27; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r77, %r73, %r74; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r78, 0, %r77, %p27; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r79, %r78, %r73; + xor.b32 %r80, %r78, %r74; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r81, %r79, %r28; + mul.lo.s32 %r82, %r80, %r28; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r83, %r79, %r24; + mul.lo.s32 %r84, %r80, %r24; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r85, %r25, 1; + setp.ne.b32 %p28, %r85, 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r86, %r76, %r68; + xor.b32 %r87, %r76, %r67; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r88, %r87, %r28; + mul.lo.s32 %r89, %r86, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r90, %r88, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r91, %r88, %r90; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r92, %r89, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r93, %r89, %r92; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r94, %r87, %r24; + mul.lo.s32 %r95, %r86, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r96, %r94, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r97, %r94, %r96; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r95, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r99, %r95, %r98; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r81, 2, 31, -1; + shfl.sync.bfly.b32 %r101, %r82, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r102, %r82, %r101; + add.s32 %r103, %r81, %r100; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r83, 2, 31, -1; + shfl.sync.bfly.b32 %r105, %r84, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r106, %r84, %r105; + add.s32 %r107, %r83, %r104; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p29, %r93, %r99; + setp.ge.s32 %p30, %r91, %r97; + setp.ne.b32 %p31, %r91, %r97; + setp.ne.b32 %p32, %r93, %r99; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r108, %r93, %r99; + xor.b32 %r109, %r91, %r97; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.le.s32 %p33, %r103, %r107; + setp.le.s32 %p34, %r102, %r106; + or.pred %p35, %p32, %p34; + or.pred %p36, %p31, %p33; + and.pred %p37, %p30, %p36; + and.pred %p38, %p29, %p35; + xor.pred %p39, %p38, %p28; + xor.pred %p40, %p37, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r110, 0, %r109, %p40; + selp.b32 %r111, 0, %r108, %p39; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r112, %r111, %r86; + xor.b32 %r113, %r110, %r87; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r114, %r102, %r106; + xor.b32 %r115, %r103, %r107; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r116, 0, %r115, %p40; + selp.b32 %r117, 0, %r114, %p39; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r118, %r117, %r80; + xor.b32 %r119, %r116, %r79; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r120, %r113, %r27; + mul.lo.s32 %r121, %r112, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r123, %r120, %r122; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r125, %r121, %r124; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r126, %r113, %r22; + mul.lo.s32 %r127, %r112, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r126, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r129, %r126, %r128; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r130, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r131, %r127, %r130; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r132, %r119, %r27; + mul.lo.s32 %r133, %r118, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r135, %r132, %r134; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r137, %r133, %r136; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r138, %r119, %r22; + mul.lo.s32 %r139, %r118, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r138, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r141, %r138, %r140; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r142, %r139, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r143, %r139, %r142; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p41, %r123, %r129; + setp.ge.s32 %p42, %r125, %r131; + setp.ne.b32 %p43, %r125, %r131; + setp.ne.b32 %p44, %r123, %r129; + setp.le.s32 %p45, %r137, %r143; + setp.le.s32 %p46, %r135, %r141; + or.pred %p47, %p44, %p46; + or.pred %p48, %p43, %p45; + and.pred %p49, %p42, %p48; + and.pred %p50, %p41, %p47; + xor.pred %p51, %p50, %p28; + xor.pred %p52, %p49, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r144, %r123, %r129; + xor.b32 %r145, %r125, %r131; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r146, 0, %r145, %p52; + selp.b32 %r147, 0, %r144, %p51; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r148, %r147, %r113; + xor.b32 %r149, %r146, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r150, %r135, %r141; + xor.b32 %r151, %r137, %r143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r152, 0, %r151, %p52; + selp.b32 %r153, 0, %r150, %p51; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r154, %r153, %r119; + xor.b32 %r155, %r152, %r118; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p53, %r148, %r149; + setp.ne.b32 %p54, %r148, %r149; + setp.le.s32 %p55, %r154, %r155; + or.pred %p56, %p54, %p55; + and.pred %p57, %p53, %p56; + xor.pred %p58, %p57, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r156, %r148, %r149; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r157, 0, %r156, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r158, %r157, %r149; + xor.b32 %r159, %r157, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r160, %r154, %r155; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r161, 0, %r160, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r162, %r161, %r155; + xor.b32 %r163, %r161, %r154; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r164, %r159, %r29; + mul.lo.s32 %r165, %r158, %r29; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r166, %r164, 4, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r168, %r164, %r166; + add.s32 %r169, %r165, %r167; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r170, %r159, %r26; + mul.lo.s32 %r171, %r158, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r172, %r170, 4, 31, -1; + shfl.sync.bfly.b32 %r173, %r171, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r174, %r170, %r172; + add.s32 %r175, %r171, %r173; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r176, %r163, %r29; + mul.lo.s32 %r177, %r162, %r29; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r180, %r177, %r179; + add.s32 %r181, %r176, %r178; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r182, %r163, %r26; + mul.lo.s32 %r183, %r162, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r184, %r182, 4, 31, -1; + shfl.sync.bfly.b32 %r185, %r183, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r186, %r183, %r185; + add.s32 %r187, %r182, %r184; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p59, %r169, %r175; + setp.lt.s32 %p60, %r168, %r174; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p61, %r168, %r174; + setp.eq.b32 %p62, %r169, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p63, %r181, %r187; + setp.gt.s32 %p64, %r180, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p65, %p62, %p64; + and.pred %p66, %p61, %p63; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p67, %p60, %p66; + or.pred %p68, %p59, %p65; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r188, %r168, %r174; + xor.b32 %r189, %r169, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r190, %r189, 0, %p68; + selp.b32 %r191, %r188, 0, %p67; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r192, %r191, %r159; + xor.b32 %r193, %r190, %r158; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r194, %r180, %r186; + xor.b32 %r195, %r181, %r187; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r196, %r195, 0, %p67; + selp.b32 %r197, %r194, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r198, %r197, %r162; + xor.b32 %r199, %r196, %r163; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r200, %r193, %r28; + mul.lo.s32 %r201, %r192, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r202, %r201, 2, 31, -1; + shfl.sync.bfly.b32 %r203, %r200, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r204, %r201, %r202; + add.s32 %r205, %r200, %r203; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r206, %r193, %r24; + mul.lo.s32 %r207, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1; + shfl.sync.bfly.b32 %r209, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r210, %r207, %r208; + add.s32 %r211, %r206, %r209; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r212, %r199, %r28; + mul.lo.s32 %r213, %r198, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r212, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r213, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r216, %r213, %r215; + add.s32 %r217, %r212, %r214; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r218, %r199, %r24; + mul.lo.s32 %r219, %r198, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r220, %r218, 2, 31, -1; + shfl.sync.bfly.b32 %r221, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r222, %r219, %r221; + add.s32 %r223, %r218, %r220; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p69, %r205, %r211; + setp.lt.s32 %p70, %r204, %r210; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p71, %r204, %r210; + setp.eq.b32 %p72, %r205, %r211; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p73, %r217, %r223; + setp.gt.s32 %p74, %r216, %r222; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p71, %p73; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p69, %p75; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r224, %r204, %r210; + xor.b32 %r225, %r205, %r211; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r226, %r225, 0, %p78; + selp.b32 %r227, %r224, 0, %p77; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r228, %r227, %r192; + xor.b32 %r229, %r226, %r193; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r230, %r217, %r223; + xor.b32 %r231, %r216, %r222; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r232, %r231, 0, %p78; + selp.b32 %r233, %r230, 0, %p77; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r234, %r233, %r199; + xor.b32 %r235, %r232, %r198; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r236, %r229, %r27; + mul.lo.s32 %r237, %r228, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r237, 1, 31, -1; + shfl.sync.bfly.b32 %r239, %r236, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r240, %r237, %r238; + add.s32 %r241, %r236, %r239; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r242, %r229, %r22; + mul.lo.s32 %r243, %r228, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r244, %r243, 1, 31, -1; + shfl.sync.bfly.b32 %r245, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r246, %r243, %r244; + add.s32 %r247, %r242, %r245; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r248, %r235, %r27; + mul.lo.s32 %r249, %r234, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r250, %r249, 1, 31, -1; + shfl.sync.bfly.b32 %r251, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r252, %r248, %r251; + add.s32 %r253, %r249, %r250; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r254, %r235, %r22; + mul.lo.s32 %r255, %r234, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r256, %r255, 1, 31, -1; + shfl.sync.bfly.b32 %r257, %r254, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r258, %r254, %r257; + add.s32 %r259, %r255, %r256; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p79, %r241, %r247; + setp.lt.s32 %p80, %r240, %r246; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p81, %r240, %r246; + setp.eq.b32 %p82, %r241, %r247; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p83, %r253, %r259; + setp.gt.s32 %p84, %r252, %r258; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p85, %p82, %p84; + and.pred %p86, %p81, %p83; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p87, %p80, %p86; + or.pred %p88, %p79, %p85; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r260, %r240, %r246; + xor.b32 %r261, %r241, %r247; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r262, %r261, 0, %p88; + selp.b32 %r263, %r260, 0, %p87; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r264, %r263, %r228; + xor.b32 %r265, %r262, %r229; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r266, %r253, %r259; + xor.b32 %r267, %r252, %r258; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r268, %r267, 0, %p88; + selp.b32 %r269, %r266, 0, %p87; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r270, %r269, %r234; + xor.b32 %r271, %r268, %r235; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p89, %r264, %r265; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p90, %r264, %r265; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p91, %r270, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r272, %r270, %r271; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r273, %r272, 0, %p91; + selp.b32 %r274, %r273, 0, %p90; + selp.b32 %r275, %r272, %r274, %p89; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r8, %r275, %r270; + xor.b32 %r9, %r275, %r271; +$L__tmp2: + .loc 1 47 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:47:20 + setp.ne.b64 %p92, %rd11, 16384; + setp.eq.b64 %p93, %rd11, 16384; + setp.eq.b64 %p94, %rd12, 16384; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r276, 1, 0, %p93; + selp.b32 %r277, 1, 0, %p94; +$L__tmp3: + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p95, %p94, %p92; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.pred %p96, %p95, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r278, %r276, %r277; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r279, %r278, 0, %p96; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r280, %r279, %r276; + xor.b32 %r281, %r279, %r277; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r282, %r34, 0, %p96; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r283, %r282, %r6; + xor.b32 %r284, %r282, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r285, %r280, %r27; + mul.lo.s32 %r286, %r281, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r287, %r285, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r288, %r287, %r285; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r289, %r286, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r290, %r289, %r286; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r291, %r280, %r22; + mul.lo.s32 %r292, %r281, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r293, %r291, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r294, %r293, %r291; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r295, %r292, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r296, %r295, %r292; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r297, %r283, %r27; + mul.lo.s32 %r298, %r284, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r299, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r300, %r299, %r297; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r301, %r298, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r302, %r301, %r298; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r303, %r283, %r22; + mul.lo.s32 %r304, %r284, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r305, %r303, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r306, %r305, %r303; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r307, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r308, %r307, %r304; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p97, %r288, %r294; + setp.ge.s32 %p98, %r290, %r296; + setp.ne.b32 %p99, %r290, %r296; + setp.ne.b32 %p100, %r288, %r294; + setp.le.s32 %p101, %r302, %r308; + setp.le.s32 %p102, %r300, %r306; + or.pred %p103, %p100, %p102; + or.pred %p104, %p99, %p101; + and.pred %p105, %p98, %p104; + and.pred %p106, %p97, %p103; + xor.pred %p107, %p106, %p9; + xor.pred %p108, %p105, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r309, %r296, %r290; + xor.b32 %r310, %r306, %r300; + xor.b32 %r311, %r294, %r288; + xor.b32 %r312, %r308, %r302; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r313, 0, %r309, %p108; + selp.b32 %r314, 0, %r310, %p107; + selp.b32 %r315, 0, %r311, %p107; + selp.b32 %r316, 0, %r312, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r317, %r314, %r283; + xor.b32 %r318, %r313, %r281; + xor.b32 %r319, %r316, %r284; + xor.b32 %r320, %r315, %r280; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p109, %r320, %r318; + setp.ne.b32 %p110, %r318, %r320; + setp.le.s32 %p111, %r317, %r319; + or.pred %p112, %p110, %p111; + and.pred %p113, %p109, %p112; + xor.pred %p114, %p113, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r321, %r318, %r320; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r322, 0, %r321, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r323, %r322, %r320; + xor.b32 %r324, %r322, %r318; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r325, %r319, %r317; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r326, 0, %r325, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r327, %r326, %r317; + xor.b32 %r328, %r326, %r319; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r329, %r323, %r28; + mul.lo.s32 %r330, %r324, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r331, %r329, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r332, %r329, %r331; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r330, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r334, %r330, %r333; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r335, %r323, %r24; + mul.lo.s32 %r336, %r324, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r337, %r335, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r338, %r335, %r337; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r336, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r340, %r336, %r339; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r341, %r327, %r28; + mul.lo.s32 %r342, %r328, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r343, %r341, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r344, %r341, %r343; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r345, %r342, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r346, %r342, %r345; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r347, %r327, %r24; + mul.lo.s32 %r348, %r328, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r349, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r350, %r347, %r349; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r348, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r352, %r348, %r351; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p115, %r334, %r340; + setp.ge.s32 %p116, %r332, %r338; + setp.ne.b32 %p117, %r332, %r338; + setp.ne.b32 %p118, %r334, %r340; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r353, %r332, %r338; + xor.b32 %r354, %r334, %r340; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.le.s32 %p119, %r344, %r350; + setp.le.s32 %p120, %r346, %r352; + or.pred %p121, %p118, %p120; + or.pred %p122, %p117, %p119; + and.pred %p123, %p116, %p122; + and.pred %p124, %p115, %p121; + xor.pred %p125, %p124, %p28; + xor.pred %p126, %p123, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r355, 0, %r353, %p126; + selp.b32 %r356, 0, %r354, %p125; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r357, %r355, %r323; + xor.b32 %r358, %r356, %r324; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r359, %r346, %r352; + xor.b32 %r360, %r344, %r350; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r361, 0, %r360, %p126; + selp.b32 %r362, 0, %r359, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r363, %r362, %r328; + xor.b32 %r364, %r361, %r327; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r365, %r357, %r27; + mul.lo.s32 %r366, %r358, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r367, %r365, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r368, %r365, %r367; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r366, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r370, %r366, %r369; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r371, %r357, %r22; + mul.lo.s32 %r372, %r358, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r373, %r371, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r374, %r371, %r373; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r372, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r376, %r372, %r375; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r377, %r364, %r27; + mul.lo.s32 %r378, %r363, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r379, %r377, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r380, %r377, %r379; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r381, %r378, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r382, %r378, %r381; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r383, %r364, %r22; + mul.lo.s32 %r384, %r363, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r385, %r383, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r386, %r383, %r385; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r387, %r384, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r388, %r384, %r387; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p127, %r368, %r374; + setp.ge.s32 %p128, %r370, %r376; + setp.ne.b32 %p129, %r370, %r376; + setp.ne.b32 %p130, %r368, %r374; + setp.le.s32 %p131, %r382, %r388; + setp.le.s32 %p132, %r380, %r386; + or.pred %p133, %p130, %p132; + or.pred %p134, %p129, %p131; + and.pred %p135, %p128, %p134; + and.pred %p136, %p127, %p133; + xor.pred %p137, %p136, %p28; + xor.pred %p138, %p135, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r389, %r368, %r374; + xor.b32 %r390, %r370, %r376; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r391, 0, %r390, %p138; + selp.b32 %r392, 0, %r389, %p137; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r393, %r392, %r357; + xor.b32 %r394, %r391, %r358; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r395, %r380, %r386; + xor.b32 %r396, %r382, %r388; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r397, 0, %r396, %p138; + selp.b32 %r398, 0, %r395, %p137; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r399, %r398, %r364; + xor.b32 %r400, %r397, %r363; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p139, %r393, %r394; + setp.ne.b32 %p140, %r393, %r394; + setp.le.s32 %p141, %r399, %r400; + or.pred %p142, %p140, %p141; + and.pred %p143, %p139, %p142; + xor.pred %p144, %p143, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r401, %r393, %r394; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r402, 0, %r401, %p144; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r403, %r402, %r393; + xor.b32 %r404, %r402, %r394; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r405, %r399, %r400; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r406, 0, %r405, %p144; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r407, %r406, %r399; + xor.b32 %r408, %r406, %r400; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r409, %r403, %r29; + mul.lo.s32 %r410, %r404, %r29; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r411, %r409, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r412, %r409, %r411; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r413, %r410, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r414, %r410, %r413; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r415, %r403, %r26; + mul.lo.s32 %r416, %r404, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r417, %r415, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r418, %r415, %r417; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r419, %r416, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r420, %r416, %r419; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r421, %r407, %r29; + mul.lo.s32 %r422, %r408, %r29; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r423, %r421, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r424, %r421, %r423; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r425, %r422, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r426, %r422, %r425; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r427, %r407, %r26; + mul.lo.s32 %r428, %r408, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r429, %r427, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r430, %r427, %r429; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r431, %r428, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r432, %r428, %r431; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p145, %r412, %r418; + setp.lt.s32 %p146, %r414, %r420; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p147, %r414, %r420; + setp.eq.b32 %p148, %r412, %r418; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p149, %r426, %r432; + setp.gt.s32 %p150, %r424, %r430; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p151, %p148, %p150; + and.pred %p152, %p147, %p149; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p153, %p146, %p152; + or.pred %p154, %p145, %p151; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r433, %r414, %r420; + xor.b32 %r434, %r412, %r418; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r435, %r434, 0, %p154; + selp.b32 %r436, %r433, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r437, %r436, %r404; + xor.b32 %r438, %r435, %r403; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r439, %r424, %r430; + xor.b32 %r440, %r426, %r432; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r441, %r439, 0, %p154; + selp.b32 %r442, %r440, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r443, %r441, %r407; + xor.b32 %r444, %r442, %r408; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r445, %r438, %r28; + mul.lo.s32 %r446, %r437, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r447, %r445, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r448, %r445, %r447; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r449, %r446, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r450, %r446, %r449; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r451, %r438, %r24; + mul.lo.s32 %r452, %r437, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r453, %r451, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r454, %r451, %r453; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r455, %r452, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r456, %r452, %r455; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r457, %r443, %r28; + mul.lo.s32 %r458, %r444, %r28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r459, %r457, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r460, %r457, %r459; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r461, %r458, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r462, %r458, %r461; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r463, %r443, %r24; + mul.lo.s32 %r464, %r444, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r465, %r463, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r466, %r463, %r465; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r467, %r464, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r468, %r464, %r467; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p155, %r448, %r454; + setp.lt.s32 %p156, %r450, %r456; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p157, %r450, %r456; + setp.eq.b32 %p158, %r448, %r454; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p159, %r462, %r468; + setp.gt.s32 %p160, %r460, %r466; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p161, %p158, %p160; + and.pred %p162, %p157, %p159; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p163, %p156, %p162; + or.pred %p164, %p155, %p161; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r469, %r450, %r456; + xor.b32 %r470, %r448, %r454; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r471, %r470, 0, %p164; + selp.b32 %r472, %r469, 0, %p163; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r473, %r472, %r437; + xor.b32 %r474, %r471, %r438; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r475, %r460, %r466; + xor.b32 %r476, %r462, %r468; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r477, %r475, 0, %p164; + selp.b32 %r478, %r476, 0, %p163; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r479, %r477, %r443; + xor.b32 %r480, %r478, %r444; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r481, %r474, %r27; + mul.lo.s32 %r482, %r473, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r483, %r481, 1, 31, -1; + shfl.sync.bfly.b32 %r485, %r482, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r487, %r474, %r22; + mul.lo.s32 %r488, %r473, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r489, %r487, 1, 31, -1; + shfl.sync.bfly.b32 %r491, %r488, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r493, %r479, %r27; + mul.lo.s32 %r494, %r480, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r495, %r493, 1, 31, -1; + shfl.sync.bfly.b32 %r497, %r494, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r499, %r479, %r22; + mul.lo.s32 %r500, %r480, %r22; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r501, %r499, 1, 31, -1; + shfl.sync.bfly.b32 %r503, %r500, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:54:35 + and.pred %p178, %p1, %p4; + selp.b64 %rd17, 1, 0, %p178; + and.pred %p179, %p1, %p5; + selp.b64 %rd18, 1, 0, %p179; +$L__tmp5: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd19, %rd17, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + cvt.u32.u64 %r521, %rd19; + shfl.sync.bfly.b32 %r522, %r521, 4, 31, -1; + mov.b32 %r523, 0; + shfl.sync.bfly.b32 %r524, %r523, 4, 31, -1; + cvt.u64.u32 %rd20, %r522; + cvt.u64.u32 %rd21, %r524; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd24, %rd19, %rd23; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r525}, %rd24; + cvt.u32.u64 %r526, %rd24; + shfl.sync.bfly.b32 %r527, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r528, %r525, 2, 31, -1; + cvt.u64.u32 %rd25, %r527; + cvt.u64.u32 %rd26, %r528; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r529}, %rd29; + cvt.u32.u64 %r530, %rd29; + shfl.sync.bfly.b32 %r531, %r530, 1, 31, -1; + shfl.sync.bfly.b32 %r532, %r529, 1, 31, -1; + cvt.u64.u32 %rd30, %r531; + cvt.u64.u32 %rd31, %r532; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; +$L__tmp6: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + mov.b32 %r533, global_smem; + add.s32 %r534, %r533, %r3; + st.shared.b64 [%r534], %rd34; + bar.sync 0; + shl.b32 %r535, %r4, 3; + add.s32 %r536, %r533, %r535; + ld.shared.b64 %rd2, [%r536]; + .loc 1 58 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:58:35 + and.pred %p180, %p1, %p93; + selp.b64 %rd35, 1, 0, %p180; + and.pred %p181, %p1, %p94; + selp.b64 %rd36, 1, 0, %p181; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd37, %rd35, %rd36; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + cvt.u32.u64 %r537, %rd37; + shfl.sync.bfly.b32 %r538, %r537, 4, 31, -1; + shfl.sync.bfly.b32 %r539, %r523, 4, 31, -1; + cvt.u64.u32 %rd38, %r538; + cvt.u64.u32 %rd39, %r539; + shl.b64 %rd40, %rd39, 32; + or.b64 %rd41, %rd38, %rd40; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd42, %rd37, %rd41; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r540}, %rd42; + cvt.u32.u64 %r541, %rd42; + shfl.sync.bfly.b32 %r542, %r541, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r540, 2, 31, -1; + cvt.u64.u32 %rd43, %r542; + cvt.u64.u32 %rd44, %r543; + shl.b64 %rd45, %rd44, 32; + or.b64 %rd46, %rd43, %rd45; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd47, %rd42, %rd46; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r544}, %rd47; + cvt.u32.u64 %r545, %rd47; + shfl.sync.bfly.b32 %r546, %r545, 1, 31, -1; + shfl.sync.bfly.b32 %r547, %r544, 1, 31, -1; + cvt.u64.u32 %rd48, %r546; + cvt.u64.u32 %rd49, %r547; + shl.b64 %rd50, %rd49, 32; + or.b64 %rd51, %rd48, %rd50; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd3, %rd47, %rd51; +$L__tmp8: + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + bar.sync 0; + st.shared.b64 [%r534], %rd3; + bar.sync 0; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r548, %rd34; + .loc 1 64 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:64:19 + setp.lt.s32 %p182, %r6, %r548; + setp.lt.s32 %p183, %r7, %r548; + .loc 1 66 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:66:35 + selp.b32 %r549, %r8, 16, %p182; + selp.b32 %r550, %r9, 16, %p183; + .loc 1 68 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:68:20 + add.s32 %r551, %r549, 17; + add.s32 %r552, %r550, 17; + .loc 1 69 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:69:20 + setp.lt.s32 %p184, %r549, 0; + setp.lt.s32 %p185, %r550, 0; + .loc 1 70 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:70:35 + selp.b32 %r12, %r551, %r549, %p184; + selp.b32 %r13, %r552, %r550, %p185; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + max.u32 %r553, %r12, %r13; + setp.lt.u32 %p186, %r553, 17; + or.pred %p187, %p2, %p186; + @%p187 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + add.s32 %r484, %r481, %r483; + add.s32 %r486, %r482, %r485; + add.s32 %r490, %r487, %r489; + add.s32 %r492, %r488, %r491; + add.s32 %r496, %r493, %r495; + add.s32 %r498, %r494, %r497; + add.s32 %r502, %r499, %r501; + add.s32 %r504, %r500, %r503; + setp.lt.s32 %p165, %r486, %r492; + setp.lt.s32 %p166, %r484, %r490; + setp.eq.b32 %p167, %r484, %r490; + setp.eq.b32 %p168, %r486, %r492; + setp.gt.s32 %p169, %r496, %r502; + setp.gt.s32 %p170, %r498, %r504; + and.pred %p171, %p168, %p170; + and.pred %p172, %p167, %p169; + or.pred %p173, %p166, %p172; + or.pred %p174, %p165, %p171; + xor.b32 %r505, %r484, %r490; + xor.b32 %r506, %r486, %r492; + selp.b32 %r507, %r506, 0, %p174; + selp.b32 %r508, %r505, 0, %p173; + xor.b32 %r509, %r508, %r474; + xor.b32 %r510, %r507, %r473; + xor.b32 %r511, %r496, %r502; + xor.b32 %r512, %r498, %r504; + selp.b32 %r513, %r511, 0, %p173; + selp.b32 %r514, %r512, 0, %p174; + xor.b32 %r515, %r513, %r479; + xor.b32 %r516, %r514, %r480; + setp.lt.s32 %p175, %r509, %r510; + setp.eq.b32 %p176, %r509, %r510; + setp.gt.s32 %p177, %r515, %r516; + xor.b32 %r517, %r515, %r516; + selp.b32 %r518, %r517, 0, %p177; + selp.b32 %r519, %r518, 0, %p176; + selp.b32 %r520, %r517, %r519, %p175; + xor.b32 %r10, %r520, %r515; + xor.b32 %r11, %r520, %r516; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + ld.shared.b64 %rd4, [%r536]; + cvt.u32.u64 %r554, %rd3; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + bar.sync 0; + .loc 1 75 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:75:19 + setp.lt.s32 %p189, %r6, %r554; + setp.lt.s32 %p190, %r7, %r554; + .loc 1 76 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:76:35 + selp.b32 %r555, %r10, 16, %p189; + selp.b32 %r556, %r11, 16, %p190; + .loc 1 77 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:77:20 + add.s32 %r557, %r555, 17; + add.s32 %r558, %r556, 17; + .loc 1 78 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:78:20 + setp.lt.s32 %p191, %r555, 0; + setp.lt.s32 %p192, %r556, 0; + .loc 1 79 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:79:35 + selp.b32 %r14, %r557, %r555, %p191; + selp.b32 %r15, %r558, %r556, %p192; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + max.u32 %r559, %r14, %r15; + setp.lt.u32 %p193, %r559, 17; + or.pred %p194, %p2, %p193; + @%p194 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r19; + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + cvt.u32.u64 %r561, %rd4; + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r560, %rd2; + .loc 1 25 23 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:25:23 + or.b32 %r570, %r1, %r4; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.lt.s32 %p198, %r570, 128; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + bar.sync 0; + .loc 1 81 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:25 + mul.wide.s32 %rd60, %r570, 4; + add.s64 %rd52, %rd5, %rd60; + .loc 1 81 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:37 + setp.eq.b32 %p203, %r3, 0; + and.pred %p195, %p203, %p198; + // begin inline asm + @%p195 st.global.b32 [ %rd52 + 0 ], { %r560 }; + // end inline asm + .loc 1 82 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:25 + add.s64 %rd53, %rd6, %rd60; + .loc 1 82 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:37 + // begin inline asm + @%p195 st.global.b32 [ %rd53 + 0 ], { %r561 }; + // end inline asm + .loc 1 83 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd54, %rd7, %rd61; + .loc 1 83 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd54 + 0 ], { %r8, %r9 }; + // end inline asm + .loc 1 84 52 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:52 + mul.lo.s32 %r571, %r5, 17; + .loc 1 84 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:49 + add.s32 %r572, %r12, %r571; + add.s32 %r573, %r13, %r571; + .loc 1 84 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:25 + mad.wide.s32 %rd62, %r572, 4, %rd8; + mad.wide.s32 %rd63, %r573, 4, %rd8; + .loc 1 84 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:85 + bar.sync 0; + shl.b32 %r574, %r2, 5; + and.b32 %r575, %r574, 96; + and.b32 %r576, %r2, 48; + shl.b32 %r577, %r576, 3; + shl.b32 %r578, %r2, 1; + and.b32 %r579, %r578, 120; + or.b32 %r580, %r575, %r577; + xor.b32 %r581, %r580, %r579; + add.s32 %r583, %r533, %r581; + st.shared.b64 [%r583], %rd62; + xor.b32 %r584, %r581, 8; + add.s32 %r585, %r533, %r584; + st.shared.b64 [%r585+512], %rd63; + bar.sync 0; + shl.b32 %r586, %r2, 6; + and.b32 %r587, %r586, 384; + shl.b32 %r588, %r4, 4; + shl.b32 %r589, %r576, 1; + bfe.s32 %r590, %r2, 3, 1; + and.b32 %r591, %r590, 520; + or.b32 %r592, %r587, %r588; + xor.b32 %r593, %r592, %r589; + or.b32 %r594, %r593, %r591; + add.s32 %r595, %r533, %r594; + ld.shared.b64 %rd55, [%r595]; + xor.b32 %r596, %r594, 8; + add.s32 %r597, %r533, %r596; + ld.shared.b64 %rd56, [%r597]; + mov.b32 %r564, 1; + // begin inline asm + @%p198 st.global.b32 [ %rd55 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd56 + 0 ], { %r564 }; + // end inline asm + .loc 1 85 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:25 + add.s64 %rd57, %rd9, %rd61; + .loc 1 85 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd57 + 0 ], { %r10, %r11 }; + // end inline asm + .loc 1 86 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:49 + add.s32 %r598, %r14, %r571; + add.s32 %r599, %r15, %r571; + .loc 1 86 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:25 + mad.wide.s32 %rd64, %r598, 4, %rd10; + mad.wide.s32 %rd65, %r599, 4, %rd10; + .loc 1 86 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:85 + bar.sync 0; + st.shared.b64 [%r583], %rd64; + st.shared.b64 [%r585+512], %rd65; + bar.sync 0; + ld.shared.b64 %rd58, [%r595]; + ld.shared.b64 %rd59, [%r597]; + // begin inline asm + @%p198 st.global.b32 [ %rd58 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd59 + 0 ], { %r564 }; + // end inline asm + .loc 1 86 4 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd72, assertFunc_0; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param3], %rd73; + mov.b64 %rd74, assertFile_0; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param1], %rd75; + mov.b64 %rd76, assertMessage_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param0], %rd77; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd66, assertFunc_1; + cvta.global.u64 %rd67, %rd66; + st.param.b64 [param3], %rd67; + mov.b64 %rd68, assertFile_1; + cvta.global.u64 %rd69, %rd68; + st.param.b64 [param1], %rd69; + mov.b64 %rd70, assertMessage_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param0], %rd71; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 107 +.b8 120 +.b8 112 +.b8 119 +.b8 105 +.b8 119 +.b8 104 +.b8 122 +.b8 107 +.b8 118 +.b8 117 +.b8 110 +.b8 55 +.b8 105 +.b8 53 +.b8 100 +.b8 50 +.b8 112 +.b8 101 +.b8 103 +.b8 111 +.b8 102 +.b8 116 +.b8 104 +.b8 121 +.b8 102 +.b8 105 +.b8 106 +.b8 110 +.b8 53 +.b8 119 +.b8 121 +.b8 103 +.b8 119 +.b8 110 +.b8 97 +.b8 101 +.b8 118 +.b8 51 +.b8 116 +.b8 119 +.b8 119 +.b8 108 +.b8 114 +.b8 98 +.b8 117 +.b8 111 +.b8 106 +.b8 113 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 114 +.b8 107 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..7031ba5e5213b015e46317b3bc7432d4ff039b1c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 8 : i32 loc(#loc205) + %xoffset_3 = arith.constant 8 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc208) + %xmask = arith.constant dense<128> : tensor<8x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<8x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<8x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<8x16xf32> to tensor<8x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<8x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<8x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<8x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<8x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<8x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<8x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<8x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<8x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<8x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<8x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<8x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<8x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<8x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<8x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<8x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<8x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<8x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<8x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<8x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<8x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<8x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<8x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<8x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<8x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<8x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<8x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<8x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<8x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<8x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<8x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<8x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<8x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<8x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<8x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<8x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc91)), %idxs: tensor<8x16xi16> loc("idxs"(#loc91))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc94) + %5 = ub.poison : tensor<8x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi16> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc102) + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi16> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc179))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc181) + tt.return %4 : tensor<8x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc183) + tt.return %5 : tensor<8x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc188))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc189) + tt.return %0 : tensor<8x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc191) + tt.return %1 : tensor<8x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc185) + tt.return %cst : tensor<8x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc187) + tt.return %0 : tensor<8x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc188))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc189) + tt.return %0 : tensor<8x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc191) + tt.return %1 : tensor<8x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc185) + tt.return %cst : tensor<8x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc187) + tt.return %0 : tensor<8x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc166))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc167) + tt.return %0 : tensor<32x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc169) + tt.return %1 : tensor<32x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc166))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc167) + tt.return %0 : tensor<16x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc169) + tt.return %1 : tensor<16x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + %5 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc166))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc167) + tt.return %0 : tensor<8x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc169) + tt.return %1 : tensor<8x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc166))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc167) + tt.return %0 : tensor<8xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc169) + tt.return %1 : tensor<8xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..320d991c58533c753092fc56a2fdfea64c2e013f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<8x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<8x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<8x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<8x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<8x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<8x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<8x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<8x1xi1, #blocked5> -> tensor<8x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<8x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<8x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<8x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16, #blocked> -> tensor<64x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<64x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<64x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<8x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<8x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<8x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<8x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<8x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<8x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<8x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<8x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<8x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<8x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<8x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<8x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d08bbf34762dd01a547615e7674f2c91fd9dc241 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/LOU7JZSMMOOE2FWAU6ZB7QXC2OZWEW7QLYYDH44JD7M3ZACOSP4Q/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<8x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<8x1xi32> loc(#loc129) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<8x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<8x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<8x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<8x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<8x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<8x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<8x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<64x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<64x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<8x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<8x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<8x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<8x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<8x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<8x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<8x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<8x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<8x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<8x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<8x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<8x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<8x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<8x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<8x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<8x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<8x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<8x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<8x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<8x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<8x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<8x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<8x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<8x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<8x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<8x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<8x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<8x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<8x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<8x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<8x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<8x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<8x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<8x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<8x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<8x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<8x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<8x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<8x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<8x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<8x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<8x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<8x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<8x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<8x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<8x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<8x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<8x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<8x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<8x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<8x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<8x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<8x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<8x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<8x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<8x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<8x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<8x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<8x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<8x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<8x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<8x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<8x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<8x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<8x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<8x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<8x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<8x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<8x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<8x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<8x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<8x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<8x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<8x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<8x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<8x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<8x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<8x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<8x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<8x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<8x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<8x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<8x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<8x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<8x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<8x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<8x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<8x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<8x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<8x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<8x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<8x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<8x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<8x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<8x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<8x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<8x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<8x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<8x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<8x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<8x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<8x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<8x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<8x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<8x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9114c121932ec3181a0c25140ad3e6089ea5c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..87a9338db022ff0b0893766debe01931f24cc8f8 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc5b25328db5fc9eb4dd6c35149726b6ab1f9db7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "60f2d3ab39632ea77296d3fdab14a07a1f50c8b26c01a5a2e82581727a89defe", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..97940a4d1bda65f846308a2d652286d3d5ab1ab2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,1533 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 5, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 124, !dbg !12 + %16 = lshr exact i32 %15, 2, !dbg !12 + %17 = and i32 %14, 31, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = icmp slt i32 %18, 32, !dbg !14 + %20 = and i32 %14, 3, !dbg !15 + %21 = shl i32 %18, 4, !dbg !16 + %22 = and i32 %14, 1, !dbg !17 + %23 = lshr i32 %14, 1, !dbg !17 + %.lobit = and i32 %23, 1, !dbg !17 + %24 = xor i32 %22, 1, !dbg !21 + %25 = xor i32 %.lobit, 1, !dbg !21 + %26 = trunc i32 %14 to i1, !dbg !22 + %27 = trunc i32 %23 to i1, !dbg !22 + %28 = insertelement <2 x i1> poison, i1 %27, i64 0, !dbg !23 + %29 = shufflevector <2 x i1> %28, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %30 = insertelement <4 x i32> poison, i32 %25, i64 0, !dbg !25 + %31 = shufflevector <4 x i32> %30, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !25 + %32 = insertelement <4 x i32> poison, i32 %.lobit, i64 0, !dbg !26 + %33 = shufflevector <4 x i32> %32, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !26 + %34 = insertelement <4 x i32> poison, i32 %24, i64 0, !dbg !27 + %35 = shufflevector <4 x i32> %34, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !27 + %36 = insertelement <4 x i32> poison, i32 %22, i64 0, !dbg !28 + %37 = shufflevector <4 x i32> %36, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !28 + %38 = insertelement <2 x i1> poison, i1 %26, i64 0, !dbg !22 + %39 = shufflevector <2 x i1> %38, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !22 + %40 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !29 + %41 = shufflevector <2 x i32> %40, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !29 + %42 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !30 + %43 = shufflevector <2 x i32> %42, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !30 + %44 = insertelement <2 x i32> %42, i32 %24, i64 1, !dbg !31 + %45 = insertelement <2 x i32> %40, i32 %22, i64 1, !dbg !32 + %46 = insertelement <4 x i1> poison, i1 %19, i64 0, !dbg !33 + %47 = shufflevector <4 x i1> %46, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !33 + %48 = shl nuw nsw i32 %15, 1, !dbg !34 + %49 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %48, !dbg !34 + %50 = shl nuw nsw i32 %17, 3, !dbg !34 + %51 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %50, !dbg !34 + %52 = shl nuw nsw i32 %20, 2, !dbg !15 + %53 = or disjoint i32 %52, 1, !dbg !15 + %54 = or disjoint i32 %52, 2, !dbg !15 + %55 = or disjoint i32 %52, 3, !dbg !15 + %56 = or disjoint i32 %21, %52, !dbg !35 + %57 = or disjoint i32 %21, %54, !dbg !35 + %58 = sext i32 %56 to i64, !dbg !36 + %59 = getelementptr i64, ptr addrspace(1) %0, i64 %58, !dbg !36 + %60 = sext i32 %57 to i64, !dbg !36 + %61 = getelementptr i64, ptr addrspace(1) %0, i64 %60, !dbg !36 + %62 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %59, i1 %19) #6, !dbg !37 + %63 = extractvalue { i64, i64 } %62, 0, !dbg !37 + %64 = extractvalue { i64, i64 } %62, 1, !dbg !37 + %65 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %61, i1 %19) #6, !dbg !37 + %66 = extractvalue { i64, i64 } %65, 0, !dbg !37 + %67 = extractvalue { i64, i64 } %65, 1, !dbg !37 + %68 = xor i32 %53, %52, !dbg !38 + %69 = xor i32 %54, %55, !dbg !38 + %70 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !39 + %71 = insertelement <4 x i64> %70, i64 %64, i64 1, !dbg !39 + %72 = insertelement <4 x i64> %71, i64 %66, i64 2, !dbg !39 + %73 = insertelement <4 x i64> %72, i64 %67, i64 3, !dbg !39 + %74 = add <4 x i64> %73, splat (i64 -1), !dbg !39 + %75 = icmp ult <4 x i64> %74, splat (i64 16383), !dbg !39 + %76 = extractelement <4 x i1> %75, i64 0, !dbg !40 + %77 = zext i1 %76 to i32, !dbg !41 + %78 = extractelement <4 x i1> %75, i64 1, !dbg !40 + %79 = zext i1 %78 to i32, !dbg !41 + %80 = extractelement <4 x i1> %75, i64 2, !dbg !42 + %81 = zext i1 %80 to i32, !dbg !41 + %82 = extractelement <4 x i1> %75, i64 3, !dbg !42 + %83 = zext i1 %82 to i32, !dbg !41 + %84 = xor i1 %76, true, !dbg !40 + %85 = and i1 %78, %84, !dbg !40 + %.not = xor i1 %82, true, !dbg !42 + %86 = or i1 %80, %.not, !dbg !42 + %87 = xor i32 %77, %79, !dbg !43 + %88 = xor i32 %81, %83, !dbg !43 + %89 = select i1 %85, i32 %87, i32 0, !dbg !44 + %90 = select i1 %86, i32 %88, i32 0, !dbg !44 + %91 = xor i32 %89, %77, !dbg !45 + %92 = xor i32 %89, %79, !dbg !45 + %93 = xor i32 %90, %81, !dbg !45 + %94 = xor i32 %90, %83, !dbg !45 + %95 = select i1 %85, i32 %68, i32 0, !dbg !46 + %96 = select i1 %86, i32 %69, i32 0, !dbg !46 + %97 = xor i32 %95, %52, !dbg !47 + %98 = xor i32 %95, %53, !dbg !47 + %99 = xor i32 %96, %54, !dbg !47 + %100 = xor i32 %96, %55, !dbg !47 + %101 = insertelement <2 x i32> poison, i32 %91, i64 0, !dbg !22 + %102 = insertelement <2 x i32> %101, i32 %92, i64 1, !dbg !22 + %103 = insertelement <2 x i32> poison, i32 %93, i64 0, !dbg !22 + %104 = insertelement <2 x i32> %103, i32 %94, i64 1, !dbg !22 + %105 = icmp samesign uge <2 x i32> %102, %104, !dbg !22 + %106 = insertelement <2 x i32> %104, i32 %92, i64 1, !dbg !22 + %107 = insertelement <2 x i32> %102, i32 %94, i64 1, !dbg !22 + %108 = icmp ne <2 x i32> %106, %107, !dbg !22 + %109 = insertelement <2 x i32> poison, i32 %97, i64 0, !dbg !22 + %110 = insertelement <2 x i32> %109, i32 %98, i64 1, !dbg !22 + %111 = insertelement <2 x i32> poison, i32 %99, i64 0, !dbg !22 + %112 = insertelement <2 x i32> %111, i32 %100, i64 1, !dbg !22 + %113 = icmp ule <2 x i32> %110, %112, !dbg !22 + %114 = or <2 x i1> %113, %108, !dbg !22 + %115 = and <2 x i1> %105, %114, !dbg !22 + %116 = xor <2 x i1> %115, %39, !dbg !22 + %117 = xor <2 x i32> %107, %106, !dbg !43 + %118 = select <2 x i1> %116, <2 x i32> zeroinitializer, <2 x i32> %117, !dbg !44 + %119 = extractelement <2 x i32> %118, i64 0, !dbg !45 + %120 = xor i32 %119, %91, !dbg !45 + %121 = extractelement <2 x i32> %118, i64 1, !dbg !45 + %122 = xor i32 %121, %92, !dbg !45 + %123 = xor <2 x i32> %118, %104, !dbg !45 + %124 = xor i32 %99, %97, !dbg !38 + %125 = xor i32 %100, %98, !dbg !38 + %126 = extractelement <2 x i1> %116, i64 0, !dbg !46 + %127 = select i1 %126, i32 0, i32 %124, !dbg !46 + %128 = extractelement <2 x i1> %116, i64 1, !dbg !46 + %129 = select i1 %128, i32 0, i32 %125, !dbg !46 + %130 = xor i32 %127, %97, !dbg !47 + %131 = xor i32 %129, %98, !dbg !47 + %132 = xor i32 %127, %99, !dbg !47 + %133 = xor i32 %129, %100, !dbg !47 + %134 = icmp samesign uge i32 %120, %122, !dbg !22 + %135 = icmp ne i32 %120, %122, !dbg !22 + %136 = icmp samesign ule i32 %130, %131, !dbg !22 + %137 = or i1 %135, %136, !dbg !22 + %138 = and i1 %134, %137, !dbg !22 + %.not3 = xor i1 %138, %26, !dbg !22 + %139 = extractelement <2 x i32> %123, i64 0, !dbg !22 + %140 = extractelement <2 x i32> %123, i64 1, !dbg !22 + %141 = icmp samesign uge i32 %139, %140, !dbg !22 + %142 = icmp ne i32 %139, %140, !dbg !22 + %143 = icmp samesign ule i32 %132, %133, !dbg !22 + %144 = or i1 %142, %143, !dbg !22 + %145 = and i1 %141, %144, !dbg !22 + %.not4 = xor i1 %145, %26, !dbg !22 + %146 = xor i32 %120, %122, !dbg !43 + %147 = xor i32 %139, %140, !dbg !43 + %148 = select i1 %.not3, i32 0, i32 %146, !dbg !44 + %149 = select i1 %.not4, i32 0, i32 %147, !dbg !44 + %150 = xor i32 %148, %120, !dbg !45 + %151 = xor i32 %148, %122, !dbg !45 + %152 = xor i32 %149, %139, !dbg !45 + %153 = xor i32 %149, %140, !dbg !45 + %154 = xor i32 %130, %131, !dbg !38 + %155 = xor i32 %132, %133, !dbg !38 + %156 = select i1 %.not3, i32 0, i32 %154, !dbg !46 + %157 = select i1 %.not4, i32 0, i32 %155, !dbg !46 + %158 = xor i32 %156, %130, !dbg !47 + %159 = xor i32 %156, %131, !dbg !47 + %160 = xor i32 %157, %132, !dbg !47 + %161 = xor i32 %157, %133, !dbg !47 + %162 = mul nuw nsw i32 %150, %24, !dbg !29 + %163 = mul nuw nsw i32 %151, %24, !dbg !29 + %164 = mul nuw nsw i32 %152, %24, !dbg !29 + %165 = mul nuw nsw i32 %153, %24, !dbg !29 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 1, i32 31), !dbg !48 + %167 = add i32 %162, %166, !dbg !51 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !48 + %169 = add i32 %163, %168, !dbg !51 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !48 + %171 = add i32 %164, %170, !dbg !51 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !48 + %173 = add i32 %165, %172, !dbg !51 + %174 = mul nuw nsw i32 %150, %22, !dbg !30 + %175 = mul nuw nsw i32 %151, %22, !dbg !30 + %176 = mul nuw nsw i32 %152, %22, !dbg !30 + %177 = mul nuw nsw i32 %153, %22, !dbg !30 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !48 + %179 = add i32 %174, %178, !dbg !51 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 1, i32 31), !dbg !48 + %181 = add i32 %175, %180, !dbg !51 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 1, i32 31), !dbg !48 + %183 = add i32 %176, %182, !dbg !51 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !48 + %185 = add i32 %177, %184, !dbg !51 + %186 = mul nuw nsw i32 %158, %24, !dbg !32 + %187 = mul nuw nsw i32 %159, %24, !dbg !32 + %188 = mul nuw nsw i32 %160, %24, !dbg !32 + %189 = mul nuw nsw i32 %161, %24, !dbg !32 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 1, i32 31), !dbg !48 + %191 = add i32 %186, %190, !dbg !51 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 1, i32 31), !dbg !48 + %193 = add i32 %187, %192, !dbg !51 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 1, i32 31), !dbg !48 + %195 = add i32 %188, %194, !dbg !51 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !48 + %197 = add i32 %189, %196, !dbg !51 + %198 = mul nuw nsw i32 %158, %22, !dbg !31 + %199 = mul nuw nsw i32 %159, %22, !dbg !31 + %200 = mul nuw nsw i32 %160, %22, !dbg !31 + %201 = mul nuw nsw i32 %161, %22, !dbg !31 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !48 + %203 = add i32 %198, %202, !dbg !51 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !48 + %205 = add i32 %199, %204, !dbg !51 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 1, i32 31), !dbg !48 + %207 = add i32 %200, %206, !dbg !51 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 1, i32 31), !dbg !48 + %209 = add i32 %201, %208, !dbg !51 + %210 = icmp sge i32 %167, %179, !dbg !22 + %211 = icmp ne i32 %167, %179, !dbg !22 + %212 = icmp sle i32 %191, %203, !dbg !22 + %213 = or i1 %211, %212, !dbg !22 + %214 = and i1 %210, %213, !dbg !22 + %.not5 = xor i1 %214, %27, !dbg !22 + %215 = icmp sge i32 %169, %181, !dbg !22 + %216 = icmp ne i32 %169, %181, !dbg !22 + %217 = icmp sle i32 %193, %205, !dbg !22 + %218 = or i1 %216, %217, !dbg !22 + %219 = and i1 %215, %218, !dbg !22 + %.not6 = xor i1 %219, %27, !dbg !22 + %220 = icmp sge i32 %171, %183, !dbg !22 + %221 = icmp ne i32 %171, %183, !dbg !22 + %222 = icmp sle i32 %195, %207, !dbg !22 + %223 = or i1 %221, %222, !dbg !22 + %224 = and i1 %220, %223, !dbg !22 + %.not7 = xor i1 %224, %27, !dbg !22 + %225 = icmp sge i32 %173, %185, !dbg !22 + %226 = icmp ne i32 %173, %185, !dbg !22 + %227 = icmp sle i32 %197, %209, !dbg !22 + %228 = or i1 %226, %227, !dbg !22 + %229 = and i1 %225, %228, !dbg !22 + %.not8 = xor i1 %229, %27, !dbg !22 + %230 = xor i32 %167, %179, !dbg !43 + %231 = xor i32 %169, %181, !dbg !43 + %232 = xor i32 %171, %183, !dbg !43 + %233 = xor i32 %173, %185, !dbg !43 + %234 = select i1 %.not5, i32 0, i32 %230, !dbg !44 + %235 = select i1 %.not6, i32 0, i32 %231, !dbg !44 + %236 = select i1 %.not7, i32 0, i32 %232, !dbg !44 + %237 = select i1 %.not8, i32 0, i32 %233, !dbg !44 + %238 = xor i32 %234, %150, !dbg !45 + %239 = xor i32 %235, %151, !dbg !45 + %240 = xor i32 %236, %152, !dbg !45 + %241 = xor i32 %237, %153, !dbg !45 + %242 = xor i32 %191, %203, !dbg !38 + %243 = xor i32 %193, %205, !dbg !38 + %244 = xor i32 %195, %207, !dbg !38 + %245 = xor i32 %197, %209, !dbg !38 + %246 = select i1 %.not5, i32 0, i32 %242, !dbg !46 + %247 = select i1 %.not6, i32 0, i32 %243, !dbg !46 + %248 = select i1 %.not7, i32 0, i32 %244, !dbg !46 + %249 = select i1 %.not8, i32 0, i32 %245, !dbg !46 + %250 = xor i32 %246, %158, !dbg !47 + %251 = xor i32 %247, %159, !dbg !47 + %252 = xor i32 %248, %160, !dbg !47 + %253 = xor i32 %249, %161, !dbg !47 + %254 = icmp sge i32 %238, %240, !dbg !22 + %255 = icmp ne i32 %238, %240, !dbg !22 + %256 = icmp sle i32 %250, %252, !dbg !22 + %257 = or i1 %255, %256, !dbg !22 + %258 = and i1 %254, %257, !dbg !22 + %.not9 = xor i1 %258, %27, !dbg !22 + %259 = icmp sge i32 %239, %241, !dbg !22 + %260 = icmp ne i32 %239, %241, !dbg !22 + %261 = icmp sle i32 %251, %253, !dbg !22 + %262 = or i1 %260, %261, !dbg !22 + %263 = and i1 %259, %262, !dbg !22 + %.not10 = xor i1 %263, %27, !dbg !22 + %264 = xor i32 %238, %240, !dbg !43 + %265 = xor i32 %239, %241, !dbg !43 + %266 = select i1 %.not9, i32 0, i32 %264, !dbg !44 + %267 = select i1 %.not10, i32 0, i32 %265, !dbg !44 + %268 = xor i32 %266, %238, !dbg !45 + %269 = xor i32 %267, %239, !dbg !45 + %270 = xor i32 %266, %240, !dbg !45 + %271 = xor i32 %267, %241, !dbg !45 + %272 = xor i32 %250, %252, !dbg !38 + %273 = xor i32 %251, %253, !dbg !38 + %274 = select i1 %.not9, i32 0, i32 %272, !dbg !46 + %275 = select i1 %.not10, i32 0, i32 %273, !dbg !46 + %276 = xor i32 %274, %250, !dbg !47 + %277 = xor i32 %275, %251, !dbg !47 + %278 = xor i32 %274, %252, !dbg !47 + %279 = xor i32 %275, %253, !dbg !47 + %280 = icmp sge i32 %268, %269, !dbg !22 + %281 = icmp ne i32 %268, %269, !dbg !22 + %282 = icmp sle i32 %276, %277, !dbg !22 + %283 = or i1 %281, %282, !dbg !22 + %284 = and i1 %280, %283, !dbg !22 + %.not11 = xor i1 %284, %27, !dbg !22 + %285 = icmp sge i32 %270, %271, !dbg !22 + %286 = icmp ne i32 %270, %271, !dbg !22 + %287 = icmp sle i32 %278, %279, !dbg !22 + %288 = or i1 %286, %287, !dbg !22 + %289 = and i1 %285, %288, !dbg !22 + %.not12 = xor i1 %289, %27, !dbg !22 + %290 = xor i32 %268, %269, !dbg !43 + %291 = xor i32 %270, %271, !dbg !43 + %292 = select i1 %.not11, i32 0, i32 %290, !dbg !44 + %293 = select i1 %.not12, i32 0, i32 %291, !dbg !44 + %294 = xor i32 %276, %277, !dbg !38 + %295 = xor i32 %278, %279, !dbg !38 + %296 = select i1 %.not11, i32 0, i32 %294, !dbg !46 + %297 = select i1 %.not12, i32 0, i32 %295, !dbg !46 + %298 = xor i32 %296, %276, !dbg !47 + %299 = xor i32 %296, %277, !dbg !47 + %300 = xor i32 %297, %278, !dbg !47 + %301 = xor i32 %297, %279, !dbg !47 + %302 = mul nuw nsw i32 %298, %25, !dbg !32 + %303 = mul nuw nsw i32 %299, %25, !dbg !32 + %304 = mul nuw nsw i32 %300, %25, !dbg !32 + %305 = mul nuw nsw i32 %301, %25, !dbg !32 + %306 = mul nuw nsw i32 %298, %.lobit, !dbg !31 + %307 = mul nuw nsw i32 %299, %.lobit, !dbg !31 + %308 = mul nuw nsw i32 %300, %.lobit, !dbg !31 + %309 = mul nuw nsw i32 %301, %.lobit, !dbg !31 + %310 = insertelement <2 x i32> poison, i32 %292, i64 0, !dbg !45 + %311 = shufflevector <2 x i32> %310, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %312 = insertelement <2 x i32> poison, i32 %269, i64 0, !dbg !45 + %313 = insertelement <2 x i32> %312, i32 %268, i64 1, !dbg !45 + %314 = xor <2 x i32> %311, %313, !dbg !45 + %315 = insertelement <2 x i32> poison, i32 %293, i64 0, !dbg !45 + %316 = shufflevector <2 x i32> %315, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %317 = insertelement <2 x i32> poison, i32 %271, i64 0, !dbg !45 + %318 = insertelement <2 x i32> %317, i32 %270, i64 1, !dbg !45 + %319 = xor <2 x i32> %316, %318, !dbg !45 + %320 = extractelement <2 x i32> %314, i64 1, !dbg !30 + %321 = mul nuw nsw i32 %320, %25, !dbg !29 + %322 = extractelement <2 x i32> %314, i64 0, !dbg !30 + %323 = mul nuw nsw i32 %322, %25, !dbg !29 + %324 = extractelement <2 x i32> %319, i64 1, !dbg !30 + %325 = mul nuw nsw i32 %324, %25, !dbg !29 + %326 = extractelement <2 x i32> %319, i64 0, !dbg !30 + %327 = mul nuw nsw i32 %326, %25, !dbg !29 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 2, i32 31), !dbg !48 + %329 = add i32 %321, %328, !dbg !51 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 2, i32 31), !dbg !48 + %331 = add i32 %323, %330, !dbg !51 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 2, i32 31), !dbg !48 + %333 = add i32 %325, %332, !dbg !51 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 2, i32 31), !dbg !48 + %335 = add i32 %327, %334, !dbg !51 + %336 = mul nuw nsw i32 %320, %.lobit, !dbg !30 + %337 = mul nuw nsw i32 %322, %.lobit, !dbg !30 + %338 = mul nuw nsw i32 %324, %.lobit, !dbg !30 + %339 = mul nuw nsw i32 %326, %.lobit, !dbg !30 + %340 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !48 + %341 = add i32 %336, %340, !dbg !51 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 2, i32 31), !dbg !48 + %343 = add i32 %337, %342, !dbg !51 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %338, i32 2, i32 31), !dbg !48 + %345 = add i32 %338, %344, !dbg !51 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %339, i32 2, i32 31), !dbg !48 + %347 = add i32 %339, %346, !dbg !51 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !48 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !48 + %350 = insertelement <2 x i32> poison, i32 %303, i64 0, !dbg !51 + %351 = insertelement <2 x i32> %350, i32 %302, i64 1, !dbg !51 + %352 = insertelement <2 x i32> poison, i32 %349, i64 0, !dbg !51 + %353 = insertelement <2 x i32> %352, i32 %348, i64 1, !dbg !51 + %354 = add <2 x i32> %351, %353, !dbg !51 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !48 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %305, i32 2, i32 31), !dbg !48 + %357 = insertelement <2 x i32> poison, i32 %305, i64 0, !dbg !51 + %358 = insertelement <2 x i32> %357, i32 %304, i64 1, !dbg !51 + %359 = insertelement <2 x i32> poison, i32 %356, i64 0, !dbg !51 + %360 = insertelement <2 x i32> %359, i32 %355, i64 1, !dbg !51 + %361 = add <2 x i32> %358, %360, !dbg !51 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !48 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !48 + %364 = insertelement <2 x i32> poison, i32 %307, i64 0, !dbg !51 + %365 = insertelement <2 x i32> %364, i32 %306, i64 1, !dbg !51 + %366 = insertelement <2 x i32> poison, i32 %363, i64 0, !dbg !51 + %367 = insertelement <2 x i32> %366, i32 %362, i64 1, !dbg !51 + %368 = add <2 x i32> %365, %367, !dbg !51 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !48 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 2, i32 31), !dbg !48 + %371 = insertelement <2 x i32> poison, i32 %309, i64 0, !dbg !51 + %372 = insertelement <2 x i32> %371, i32 %308, i64 1, !dbg !51 + %373 = insertelement <2 x i32> poison, i32 %370, i64 0, !dbg !51 + %374 = insertelement <2 x i32> %373, i32 %369, i64 1, !dbg !51 + %375 = add <2 x i32> %372, %374, !dbg !51 + %376 = insertelement <2 x i32> poison, i32 %331, i64 0, !dbg !40 + %377 = insertelement <2 x i32> %376, i32 %329, i64 1, !dbg !40 + %378 = insertelement <2 x i32> poison, i32 %343, i64 0, !dbg !40 + %379 = insertelement <2 x i32> %378, i32 %341, i64 1, !dbg !40 + %380 = icmp slt <2 x i32> %377, %379, !dbg !40 + %381 = icmp eq <2 x i32> %377, %379, !dbg !52 + %382 = icmp sgt <2 x i32> %354, %368, !dbg !53 + %383 = and <2 x i1> %381, %382, !dbg !54 + %384 = or <2 x i1> %380, %383, !dbg !55 + %385 = insertelement <2 x i32> poison, i32 %335, i64 0, !dbg !40 + %386 = insertelement <2 x i32> %385, i32 %333, i64 1, !dbg !40 + %387 = insertelement <2 x i32> poison, i32 %347, i64 0, !dbg !40 + %388 = insertelement <2 x i32> %387, i32 %345, i64 1, !dbg !40 + %389 = icmp slt <2 x i32> %386, %388, !dbg !40 + %390 = icmp eq <2 x i32> %386, %388, !dbg !52 + %391 = icmp sgt <2 x i32> %361, %375, !dbg !53 + %392 = and <2 x i1> %390, %391, !dbg !54 + %393 = or <2 x i1> %389, %392, !dbg !55 + %394 = xor <2 x i32> %377, %379, !dbg !43 + %395 = xor <2 x i32> %386, %388, !dbg !43 + %396 = select <2 x i1> %384, <2 x i32> %394, <2 x i32> zeroinitializer, !dbg !44 + %397 = select <2 x i1> %393, <2 x i32> %395, <2 x i32> zeroinitializer, !dbg !44 + %398 = xor <2 x i32> %396, %314, !dbg !45 + %399 = xor <2 x i32> %397, %319, !dbg !45 + %400 = xor <2 x i32> %354, %368, !dbg !38 + %401 = xor <2 x i32> %361, %375, !dbg !38 + %402 = select <2 x i1> %384, <2 x i32> %400, <2 x i32> zeroinitializer, !dbg !46 + %403 = select <2 x i1> %393, <2 x i32> %401, <2 x i32> zeroinitializer, !dbg !46 + %404 = insertelement <2 x i32> poison, i32 %299, i64 0, !dbg !47 + %405 = insertelement <2 x i32> %404, i32 %298, i64 1, !dbg !47 + %406 = xor <2 x i32> %402, %405, !dbg !47 + %407 = insertelement <2 x i32> poison, i32 %301, i64 0, !dbg !47 + %408 = insertelement <2 x i32> %407, i32 %300, i64 1, !dbg !47 + %409 = xor <2 x i32> %403, %408, !dbg !47 + %410 = mul nuw nsw <2 x i32> %398, %41, !dbg !29 + %411 = mul nuw nsw <2 x i32> %399, %41, !dbg !29 + %412 = extractelement <2 x i32> %410, i64 1, !dbg !48 + %413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 1, i32 31), !dbg !48 + %414 = extractelement <2 x i32> %410, i64 0, !dbg !48 + %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !48 + %416 = insertelement <2 x i32> poison, i32 %415, i64 0, !dbg !51 + %417 = insertelement <2 x i32> %416, i32 %413, i64 1, !dbg !51 + %418 = add <2 x i32> %410, %417, !dbg !51 + %419 = extractelement <2 x i32> %411, i64 1, !dbg !48 + %420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %419, i32 1, i32 31), !dbg !48 + %421 = extractelement <2 x i32> %411, i64 0, !dbg !48 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 1, i32 31), !dbg !48 + %423 = insertelement <2 x i32> poison, i32 %422, i64 0, !dbg !51 + %424 = insertelement <2 x i32> %423, i32 %420, i64 1, !dbg !51 + %425 = add <2 x i32> %411, %424, !dbg !51 + %426 = mul nuw nsw <2 x i32> %398, %43, !dbg !30 + %427 = mul nuw nsw <2 x i32> %399, %43, !dbg !30 + %428 = extractelement <2 x i32> %426, i64 1, !dbg !48 + %429 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %428, i32 1, i32 31), !dbg !48 + %430 = extractelement <2 x i32> %426, i64 0, !dbg !48 + %431 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %430, i32 1, i32 31), !dbg !48 + %432 = insertelement <2 x i32> poison, i32 %431, i64 0, !dbg !51 + %433 = insertelement <2 x i32> %432, i32 %429, i64 1, !dbg !51 + %434 = add <2 x i32> %426, %433, !dbg !51 + %435 = extractelement <2 x i32> %427, i64 1, !dbg !48 + %436 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %435, i32 1, i32 31), !dbg !48 + %437 = extractelement <2 x i32> %427, i64 0, !dbg !48 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 1, i32 31), !dbg !48 + %439 = insertelement <2 x i32> poison, i32 %438, i64 0, !dbg !51 + %440 = insertelement <2 x i32> %439, i32 %436, i64 1, !dbg !51 + %441 = add <2 x i32> %427, %440, !dbg !51 + %442 = mul nuw nsw <2 x i32> %406, %41, !dbg !32 + %443 = extractelement <2 x i32> %442, i64 1, !dbg !48 + %444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %443, i32 1, i32 31), !dbg !48 + %445 = mul nuw nsw <2 x i32> %409, %41, !dbg !32 + %446 = extractelement <2 x i32> %442, i64 0, !dbg !48 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !48 + %448 = insertelement <2 x i32> poison, i32 %447, i64 0, !dbg !51 + %449 = insertelement <2 x i32> %448, i32 %444, i64 1, !dbg !51 + %450 = add <2 x i32> %442, %449, !dbg !51 + %451 = mul nuw nsw <2 x i32> %409, %44, !dbg !31 + %452 = extractelement <2 x i32> %445, i64 1, !dbg !48 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 1, i32 31), !dbg !48 + %454 = mul nuw nsw <2 x i32> %409, %45, !dbg !32 + %455 = extractelement <2 x i32> %445, i64 0, !dbg !48 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 1, i32 31), !dbg !48 + %457 = insertelement <2 x i32> poison, i32 %456, i64 0, !dbg !51 + %458 = insertelement <2 x i32> %457, i32 %453, i64 1, !dbg !51 + %459 = add <2 x i32> %445, %458, !dbg !51 + %460 = mul nuw nsw <2 x i32> %406, %43, !dbg !31 + %461 = mul nuw nsw <2 x i32> %409, %43, !dbg !31 + %462 = extractelement <2 x i32> %460, i64 1, !dbg !48 + %463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 1, i32 31), !dbg !48 + %464 = extractelement <2 x i32> %460, i64 0, !dbg !48 + %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 1, i32 31), !dbg !48 + %466 = insertelement <2 x i32> poison, i32 %465, i64 0, !dbg !51 + %467 = insertelement <2 x i32> %466, i32 %463, i64 1, !dbg !51 + %468 = add <2 x i32> %460, %467, !dbg !51 + %469 = extractelement <2 x i32> %461, i64 1, !dbg !48 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !48 + %471 = insertelement <2 x i32> %458, i32 %470, i64 1, !dbg !51 + %472 = add <2 x i32> %454, %471, !dbg !51 + %473 = extractelement <2 x i32> %461, i64 0, !dbg !48 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 1, i32 31), !dbg !48 + %475 = insertelement <2 x i32> %471, i32 %474, i64 0, !dbg !51 + %476 = add <2 x i32> %475, %461, !dbg !51 + %477 = insertelement <2 x i32> %475, i32 %453, i64 1, !dbg !51 + %478 = add <2 x i32> %477, %451, !dbg !51 + %479 = icmp slt <2 x i32> %418, %434, !dbg !40 + %480 = icmp slt <2 x i32> %425, %441, !dbg !40 + %481 = icmp eq <2 x i32> %418, %434, !dbg !52 + %482 = icmp eq <2 x i32> %425, %441, !dbg !52 + %483 = icmp sgt <2 x i32> %450, %468, !dbg !53 + %484 = icmp sgt <2 x i32> %459, %476, !dbg !53 + %485 = and <2 x i1> %481, %483, !dbg !54 + %486 = and <2 x i1> %482, %484, !dbg !54 + %487 = or <2 x i1> %479, %485, !dbg !55 + %488 = or <2 x i1> %480, %486, !dbg !55 + %489 = xor <2 x i32> %418, %434, !dbg !43 + %490 = xor <2 x i32> %425, %441, !dbg !43 + %491 = select <2 x i1> %487, <2 x i32> %489, <2 x i32> zeroinitializer, !dbg !44 + %492 = select <2 x i1> %488, <2 x i32> %490, <2 x i32> zeroinitializer, !dbg !44 + %493 = xor <2 x i32> %491, %398, !dbg !45 + %494 = xor <2 x i32> %492, %399, !dbg !45 + %495 = xor <2 x i32> %450, %468, !dbg !38 + %496 = xor <2 x i32> %478, %472, !dbg !38 + %497 = select <2 x i1> %487, <2 x i32> %495, <2 x i32> zeroinitializer, !dbg !46 + %498 = select <2 x i1> %488, <2 x i32> %496, <2 x i32> zeroinitializer, !dbg !46 + %499 = xor <2 x i32> %497, %406, !dbg !47 + %500 = xor <2 x i32> %498, %409, !dbg !47 + %501 = icmp slt <2 x i32> %493, %494, !dbg !40 + %502 = icmp eq <2 x i32> %493, %494, !dbg !52 + %503 = icmp sgt <2 x i32> %499, %500, !dbg !53 + %504 = and <2 x i1> %502, %503, !dbg !54 + %505 = or <2 x i1> %501, %504, !dbg !55 + %506 = xor <2 x i32> %493, %494, !dbg !43 + %507 = select <2 x i1> %505, <2 x i32> %506, <2 x i32> zeroinitializer, !dbg !44 + %508 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %509 = shufflevector <2 x i32> %493, <2 x i32> %494, <2 x i32> , !dbg !45 + %510 = xor <2 x i32> %508, %509, !dbg !45 + %511 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %512 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %513 = xor <2 x i32> %511, %512, !dbg !45 + %514 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %515 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %516 = xor <2 x i32> %514, %515, !dbg !45 + %517 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %518 = xor <2 x i32> %507, %517, !dbg !45 + %519 = xor <2 x i32> %499, %500, !dbg !38 + %520 = select <2 x i1> %505, <2 x i32> %519, <2 x i32> zeroinitializer, !dbg !46 + %521 = xor <2 x i32> %520, %499, !dbg !47 + %522 = xor <2 x i32> %520, %500, !dbg !47 + %523 = icmp slt <2 x i32> %513, %516, !dbg !40 + %524 = icmp eq <2 x i32> %518, %510, !dbg !52 + %525 = shufflevector <2 x i32> %521, <2 x i32> %522, <2 x i32> , !dbg !53 + %526 = shufflevector <2 x i32> %522, <2 x i32> %521, <2 x i32> , !dbg !53 + %527 = icmp sgt <2 x i32> %525, %526, !dbg !53 + %528 = and <2 x i1> %524, %527, !dbg !54 + %529 = or <2 x i1> %523, %528, !dbg !55 + %530 = xor <2 x i32> %526, %525, !dbg !38 + %531 = select <2 x i1> %529, <2 x i32> %530, <2 x i32> zeroinitializer, !dbg !46 + %532 = shufflevector <2 x i32> %531, <2 x i32> poison, <4 x i32> , !dbg !46 + %533 = shufflevector <2 x i32> %521, <2 x i32> %522, <4 x i32> , !dbg !47 + %534 = xor <4 x i32> %532, %533, !dbg !47 + %535 = select <4 x i1> %47, <4 x i1> %75, <4 x i1> zeroinitializer, !dbg !33 + %536 = bitcast <4 x i1> %535 to i4, !dbg !56 + %537 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %536), !dbg !56 + %538 = zext nneg i4 %537 to i64, !dbg !56 + %539 = zext nneg i4 %537 to i32, !dbg !58 + %540 = icmp eq <4 x i64> %73, splat (i64 16384), !dbg !59 + %541 = extractelement <4 x i1> %540, i64 0, !dbg !60 + %542 = zext i1 %541 to i32, !dbg !41 + %543 = extractelement <4 x i1> %540, i64 1, !dbg !60 + %544 = zext i1 %543 to i32, !dbg !41 + %545 = extractelement <4 x i1> %540, i64 2, !dbg !61 + %546 = zext i1 %545 to i32, !dbg !41 + %547 = extractelement <4 x i1> %540, i64 3, !dbg !61 + %548 = zext i1 %547 to i32, !dbg !41 + %549 = xor i1 %541, true, !dbg !60 + %550 = and i1 %543, %549, !dbg !60 + %.not13 = xor i1 %547, true, !dbg !61 + %551 = or i1 %545, %.not13, !dbg !61 + %552 = xor i32 %542, %544, !dbg !62 + %553 = xor i32 %546, %548, !dbg !62 + %554 = select i1 %550, i32 %552, i32 0, !dbg !63 + %555 = select i1 %551, i32 %553, i32 0, !dbg !63 + %556 = xor i32 %554, %542, !dbg !64 + %557 = xor i32 %554, %544, !dbg !64 + %558 = xor i32 %555, %546, !dbg !64 + %559 = xor i32 %555, %548, !dbg !64 + %560 = select i1 %550, i32 %68, i32 0, !dbg !65 + %561 = select i1 %551, i32 %69, i32 0, !dbg !65 + %562 = xor i32 %560, %52, !dbg !66 + %563 = xor i32 %560, %53, !dbg !66 + %564 = xor i32 %561, %54, !dbg !66 + %565 = xor i32 %561, %55, !dbg !66 + %566 = insertelement <2 x i32> poison, i32 %556, i64 0, !dbg !23 + %567 = insertelement <2 x i32> %566, i32 %557, i64 1, !dbg !23 + %568 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !23 + %569 = insertelement <2 x i32> %568, i32 %559, i64 1, !dbg !23 + %570 = icmp samesign uge <2 x i32> %567, %569, !dbg !23 + %571 = insertelement <2 x i32> %569, i32 %557, i64 1, !dbg !23 + %572 = insertelement <2 x i32> %567, i32 %559, i64 1, !dbg !23 + %573 = icmp ne <2 x i32> %571, %572, !dbg !23 + %574 = insertelement <2 x i32> poison, i32 %562, i64 0, !dbg !23 + %575 = insertelement <2 x i32> %574, i32 %563, i64 1, !dbg !23 + %576 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !23 + %577 = insertelement <2 x i32> %576, i32 %565, i64 1, !dbg !23 + %578 = icmp ule <2 x i32> %575, %577, !dbg !23 + %579 = or <2 x i1> %578, %573, !dbg !23 + %580 = and <2 x i1> %570, %579, !dbg !23 + %581 = xor <2 x i1> %580, %39, !dbg !23 + %582 = xor <2 x i32> %572, %571, !dbg !62 + %583 = select <2 x i1> %581, <2 x i32> zeroinitializer, <2 x i32> %582, !dbg !63 + %584 = extractelement <2 x i32> %583, i64 0, !dbg !64 + %585 = xor i32 %584, %556, !dbg !64 + %586 = extractelement <2 x i32> %583, i64 1, !dbg !64 + %587 = xor i32 %586, %557, !dbg !64 + %588 = xor <2 x i32> %583, %569, !dbg !64 + %589 = xor i32 %564, %562, !dbg !67 + %590 = xor i32 %565, %563, !dbg !67 + %591 = extractelement <2 x i1> %581, i64 0, !dbg !65 + %592 = select i1 %591, i32 0, i32 %589, !dbg !65 + %593 = extractelement <2 x i1> %581, i64 1, !dbg !65 + %594 = select i1 %593, i32 0, i32 %590, !dbg !65 + %595 = xor i32 %592, %562, !dbg !66 + %596 = xor i32 %594, %563, !dbg !66 + %597 = xor i32 %592, %564, !dbg !66 + %598 = xor i32 %594, %565, !dbg !66 + %599 = icmp samesign uge i32 %585, %587, !dbg !23 + %600 = icmp ne i32 %585, %587, !dbg !23 + %601 = icmp samesign ule i32 %595, %596, !dbg !23 + %602 = or i1 %600, %601, !dbg !23 + %603 = and i1 %599, %602, !dbg !23 + %.not16 = xor i1 %603, %26, !dbg !23 + %604 = extractelement <2 x i32> %588, i64 0, !dbg !23 + %605 = extractelement <2 x i32> %588, i64 1, !dbg !23 + %606 = icmp samesign uge i32 %604, %605, !dbg !23 + %607 = icmp ne i32 %604, %605, !dbg !23 + %608 = icmp samesign ule i32 %597, %598, !dbg !23 + %609 = or i1 %607, %608, !dbg !23 + %610 = and i1 %606, %609, !dbg !23 + %.not17 = xor i1 %610, %26, !dbg !23 + %611 = xor i32 %585, %587, !dbg !62 + %612 = xor i32 %604, %605, !dbg !62 + %613 = select i1 %.not16, i32 0, i32 %611, !dbg !63 + %614 = select i1 %.not17, i32 0, i32 %612, !dbg !63 + %615 = xor i32 %613, %585, !dbg !64 + %616 = xor i32 %613, %587, !dbg !64 + %617 = xor i32 %614, %604, !dbg !64 + %618 = xor i32 %614, %605, !dbg !64 + %619 = xor i32 %595, %596, !dbg !67 + %620 = xor i32 %597, %598, !dbg !67 + %621 = select i1 %.not16, i32 0, i32 %619, !dbg !65 + %622 = select i1 %.not17, i32 0, i32 %620, !dbg !65 + %623 = mul nuw nsw i32 %615, %24, !dbg !27 + %624 = mul nuw nsw i32 %616, %24, !dbg !27 + %625 = mul nuw nsw i32 %617, %24, !dbg !27 + %626 = mul nuw nsw i32 %618, %24, !dbg !27 + %627 = mul nuw nsw i32 %615, %22, !dbg !28 + %628 = mul nuw nsw i32 %616, %22, !dbg !28 + %629 = mul nuw nsw i32 %617, %22, !dbg !28 + %630 = mul nuw nsw i32 %618, %22, !dbg !28 + %631 = insertelement <4 x i32> poison, i32 %622, i64 0, !dbg !66 + %632 = insertelement <4 x i32> %631, i32 %621, i64 1, !dbg !66 + %633 = shufflevector <4 x i32> %632, <4 x i32> poison, <4 x i32> , !dbg !66 + %634 = insertelement <4 x i32> poison, i32 %595, i64 0, !dbg !66 + %635 = insertelement <4 x i32> %634, i32 %596, i64 1, !dbg !66 + %636 = insertelement <4 x i32> %635, i32 %597, i64 2, !dbg !66 + %637 = insertelement <4 x i32> %636, i32 %598, i64 3, !dbg !66 + %638 = xor <4 x i32> %633, %637, !dbg !66 + %639 = extractelement <4 x i32> %638, i64 0, !dbg !26 + %640 = mul nuw nsw i32 %639, %24, !dbg !25 + %641 = extractelement <4 x i32> %638, i64 1, !dbg !26 + %642 = mul nuw nsw i32 %641, %24, !dbg !25 + %643 = extractelement <4 x i32> %638, i64 2, !dbg !26 + %644 = mul nuw nsw i32 %643, %24, !dbg !25 + %645 = extractelement <4 x i32> %638, i64 3, !dbg !26 + %646 = mul nuw nsw i32 %645, %24, !dbg !25 + %647 = mul nuw nsw i32 %639, %22, !dbg !26 + %648 = mul nuw nsw i32 %641, %22, !dbg !26 + %649 = mul nuw nsw i32 %643, %22, !dbg !26 + %650 = mul nuw nsw i32 %645, %22, !dbg !26 + %651 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !68 + %652 = add i32 %651, %623, !dbg !69 + %653 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %624, i32 1, i32 31), !dbg !68 + %654 = add i32 %653, %624, !dbg !69 + %655 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 1, i32 31), !dbg !68 + %656 = add i32 %655, %625, !dbg !69 + %657 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %626, i32 1, i32 31), !dbg !68 + %658 = add i32 %657, %626, !dbg !69 + %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %627, i32 1, i32 31), !dbg !68 + %660 = add i32 %659, %627, !dbg !69 + %661 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %628, i32 1, i32 31), !dbg !68 + %662 = add i32 %661, %628, !dbg !69 + %663 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %629, i32 1, i32 31), !dbg !68 + %664 = add i32 %663, %629, !dbg !69 + %665 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %630, i32 1, i32 31), !dbg !68 + %666 = add i32 %665, %630, !dbg !69 + %667 = icmp sge i32 %654, %662, !dbg !23 + %668 = icmp ne i32 %654, %662, !dbg !23 + %669 = insertelement <2 x i32> poison, i32 %652, i64 0, !dbg !23 + %670 = insertelement <2 x i32> %669, i32 %656, i64 1, !dbg !23 + %671 = insertelement <2 x i32> poison, i32 %660, i64 0, !dbg !23 + %672 = insertelement <2 x i32> %671, i32 %664, i64 1, !dbg !23 + %673 = icmp sge <2 x i32> %670, %672, !dbg !23 + %674 = icmp ne <2 x i32> %670, %672, !dbg !23 + %675 = icmp sge i32 %658, %666, !dbg !23 + %676 = icmp ne i32 %658, %666, !dbg !23 + %677 = xor i32 %660, %652, !dbg !62 + %678 = xor i32 %662, %654, !dbg !62 + %679 = xor i32 %664, %656, !dbg !62 + %680 = xor i32 %666, %658, !dbg !62 + %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 1, i32 31), !dbg !68 + %682 = add i32 %681, %640, !dbg !69 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %642, i32 1, i32 31), !dbg !68 + %684 = add i32 %683, %642, !dbg !69 + %685 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %644, i32 1, i32 31), !dbg !68 + %686 = add i32 %685, %644, !dbg !69 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %646, i32 1, i32 31), !dbg !68 + %688 = add i32 %687, %646, !dbg !69 + %689 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %647, i32 1, i32 31), !dbg !68 + %690 = add i32 %689, %647, !dbg !69 + %691 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %648, i32 1, i32 31), !dbg !68 + %692 = add i32 %691, %648, !dbg !69 + %693 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %649, i32 1, i32 31), !dbg !68 + %694 = add i32 %693, %649, !dbg !69 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %650, i32 1, i32 31), !dbg !68 + %696 = add i32 %695, %650, !dbg !69 + %697 = insertelement <2 x i32> poison, i32 %682, i64 0, !dbg !23 + %698 = insertelement <2 x i32> %697, i32 %686, i64 1, !dbg !23 + %699 = insertelement <2 x i32> poison, i32 %690, i64 0, !dbg !23 + %700 = insertelement <2 x i32> %699, i32 %694, i64 1, !dbg !23 + %701 = icmp sle <2 x i32> %698, %700, !dbg !23 + %702 = or <2 x i1> %674, %701, !dbg !23 + %703 = and <2 x i1> %673, %702, !dbg !23 + %704 = xor <2 x i1> %703, %29, !dbg !23 + %705 = insertelement <2 x i32> poison, i32 %677, i64 0, !dbg !63 + %706 = insertelement <2 x i32> %705, i32 %679, i64 1, !dbg !63 + %707 = select <2 x i1> %704, <2 x i32> zeroinitializer, <2 x i32> %706, !dbg !63 + %708 = insertelement <2 x i32> poison, i32 %615, i64 0, !dbg !64 + %709 = insertelement <2 x i32> %708, i32 %617, i64 1, !dbg !64 + %710 = xor <2 x i32> %707, %709, !dbg !64 + %711 = insertelement <2 x i32> poison, i32 %688, i64 0, !dbg !23 + %712 = insertelement <2 x i32> %711, i32 %684, i64 1, !dbg !23 + %713 = insertelement <2 x i32> poison, i32 %696, i64 0, !dbg !23 + %714 = insertelement <2 x i32> %713, i32 %692, i64 1, !dbg !23 + %715 = icmp sle <2 x i32> %712, %714, !dbg !23 + %716 = insertelement <2 x i1> poison, i1 %676, i64 0, !dbg !23 + %717 = insertelement <2 x i1> %716, i1 %668, i64 1, !dbg !23 + %718 = or <2 x i1> %717, %715, !dbg !23 + %719 = insertelement <2 x i1> poison, i1 %675, i64 0, !dbg !23 + %720 = insertelement <2 x i1> %719, i1 %667, i64 1, !dbg !23 + %721 = and <2 x i1> %720, %718, !dbg !23 + %722 = xor <2 x i1> %721, %29, !dbg !23 + %723 = insertelement <2 x i32> poison, i32 %680, i64 0, !dbg !63 + %724 = insertelement <2 x i32> %723, i32 %678, i64 1, !dbg !63 + %725 = select <2 x i1> %722, <2 x i32> zeroinitializer, <2 x i32> %724, !dbg !63 + %726 = insertelement <2 x i32> poison, i32 %618, i64 0, !dbg !64 + %727 = insertelement <2 x i32> %726, i32 %616, i64 1, !dbg !64 + %728 = xor <2 x i32> %725, %727, !dbg !64 + %729 = xor i32 %690, %682, !dbg !67 + %730 = xor i32 %692, %684, !dbg !67 + %731 = xor i32 %694, %686, !dbg !67 + %732 = xor i32 %696, %688, !dbg !67 + %733 = shufflevector <2 x i1> %704, <2 x i1> %722, <4 x i32> , !dbg !65 + %734 = insertelement <4 x i32> poison, i32 %729, i64 0, !dbg !65 + %735 = insertelement <4 x i32> %734, i32 %730, i64 1, !dbg !65 + %736 = insertelement <4 x i32> %735, i32 %731, i64 2, !dbg !65 + %737 = insertelement <4 x i32> %736, i32 %732, i64 3, !dbg !65 + %738 = select <4 x i1> %733, <4 x i32> zeroinitializer, <4 x i32> %737, !dbg !65 + %739 = xor <4 x i32> %738, %638, !dbg !66 + %740 = extractelement <2 x i32> %710, i64 0, !dbg !23 + %741 = extractelement <2 x i32> %710, i64 1, !dbg !23 + %742 = icmp sge i32 %740, %741, !dbg !23 + %743 = icmp ne i32 %740, %741, !dbg !23 + %shift = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %744 = icmp sle <4 x i32> %739, %shift, !dbg !23 + %745 = extractelement <4 x i1> %744, i64 0, !dbg !23 + %746 = or i1 %743, %745, !dbg !23 + %747 = and i1 %742, %746, !dbg !23 + %748 = extractelement <2 x i32> %728, i64 0, !dbg !23 + %749 = extractelement <2 x i32> %728, i64 1, !dbg !23 + %750 = icmp sge i32 %749, %748, !dbg !23 + %751 = icmp ne i32 %749, %748, !dbg !23 + %shift53 = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %752 = icmp sle <4 x i32> %739, %shift53, !dbg !23 + %753 = extractelement <4 x i1> %752, i64 1, !dbg !23 + %754 = or i1 %751, %753, !dbg !23 + %755 = and i1 %750, %754, !dbg !23 + %756 = insertelement <2 x i1> poison, i1 %755, i64 0, !dbg !23 + %757 = insertelement <2 x i1> %756, i1 %747, i64 1, !dbg !23 + %758 = xor <2 x i1> %757, %29, !dbg !23 + %759 = xor i32 %741, %740, !dbg !62 + %760 = xor i32 %748, %749, !dbg !62 + %761 = extractelement <2 x i1> %758, i64 1, !dbg !63 + %762 = select i1 %761, i32 0, i32 %759, !dbg !63 + %763 = extractelement <2 x i1> %758, i64 0, !dbg !63 + %764 = select i1 %763, i32 0, i32 %760, !dbg !63 + %765 = xor i32 %762, %740, !dbg !64 + %766 = xor i32 %764, %749, !dbg !64 + %767 = xor i32 %762, %741, !dbg !64 + %768 = xor i32 %764, %748, !dbg !64 + %769 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %770 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %771 = xor <2 x i32> %769, %770, !dbg !67 + %772 = select <2 x i1> %758, <2 x i32> zeroinitializer, <2 x i32> %771, !dbg !65 + %773 = shufflevector <2 x i32> %772, <2 x i32> poison, <4 x i32> , !dbg !65 + %774 = shufflevector <2 x i32> %772, <2 x i32> poison, <2 x i32> , !dbg !66 + %775 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %776 = xor <2 x i32> %774, %775, !dbg !66 + %777 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %778 = xor <2 x i32> %772, %777, !dbg !66 + %779 = icmp sge i32 %765, %766, !dbg !23 + %780 = icmp ne i32 %765, %766, !dbg !23 + %781 = extractelement <2 x i32> %778, i64 1, !dbg !23 + %782 = extractelement <2 x i32> %776, i64 1, !dbg !23 + %783 = icmp sle i32 %781, %782, !dbg !23 + %784 = or i1 %780, %783, !dbg !23 + %785 = and i1 %779, %784, !dbg !23 + %.not24 = xor i1 %785, %27, !dbg !23 + %786 = icmp sge i32 %767, %768, !dbg !23 + %787 = icmp ne i32 %767, %768, !dbg !23 + %788 = extractelement <2 x i32> %778, i64 0, !dbg !23 + %789 = extractelement <2 x i32> %776, i64 0, !dbg !23 + %790 = icmp sle i32 %789, %788, !dbg !23 + %791 = or i1 %787, %790, !dbg !23 + %792 = and i1 %786, %791, !dbg !23 + %.not25 = xor i1 %792, %27, !dbg !23 + %793 = xor i32 %766, %765, !dbg !62 + %794 = xor i32 %768, %767, !dbg !62 + %795 = select i1 %.not24, i32 0, i32 %793, !dbg !63 + %796 = select i1 %.not25, i32 0, i32 %794, !dbg !63 + %797 = insertelement <2 x i32> poison, i32 %796, i64 0, !dbg !64 + %798 = insertelement <2 x i32> %797, i32 %795, i64 1, !dbg !64 + %799 = insertelement <2 x i32> poison, i32 %767, i64 0, !dbg !64 + %800 = insertelement <2 x i32> %799, i32 %765, i64 1, !dbg !64 + %801 = xor <2 x i32> %798, %800, !dbg !64 + %802 = insertelement <2 x i32> poison, i32 %768, i64 0, !dbg !64 + %803 = insertelement <2 x i32> %802, i32 %766, i64 1, !dbg !64 + %804 = xor <2 x i32> %798, %803, !dbg !64 + %805 = xor i32 %782, %781, !dbg !67 + %806 = xor i32 %788, %789, !dbg !67 + %807 = insertelement <2 x i1> poison, i1 %.not25, i64 0, !dbg !65 + %808 = insertelement <2 x i1> %807, i1 %.not24, i64 1, !dbg !65 + %809 = insertelement <2 x i32> poison, i32 %806, i64 0, !dbg !65 + %810 = insertelement <2 x i32> %809, i32 %805, i64 1, !dbg !65 + %811 = select <2 x i1> %808, <2 x i32> zeroinitializer, <2 x i32> %810, !dbg !65 + %812 = shufflevector <2 x i32> %811, <2 x i32> poison, <4 x i32> , !dbg !66 + %813 = xor <4 x i32> %773, %812, !dbg !66 + %814 = xor <2 x i32> %811, %776, !dbg !66 + %815 = xor <2 x i32> %811, %778, !dbg !66 + %816 = extractelement <2 x i32> %801, i64 1, !dbg !28 + %817 = mul nuw nsw i32 %816, %25, !dbg !27 + %818 = extractelement <2 x i32> %804, i64 1, !dbg !28 + %819 = mul nuw nsw i32 %818, %25, !dbg !27 + %820 = extractelement <2 x i32> %801, i64 0, !dbg !28 + %821 = mul nuw nsw i32 %820, %25, !dbg !27 + %822 = extractelement <2 x i32> %804, i64 0, !dbg !28 + %823 = mul nuw nsw i32 %822, %25, !dbg !27 + %824 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %817, i32 2, i32 31), !dbg !68 + %825 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %819, i32 2, i32 31), !dbg !68 + %826 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %821, i32 2, i32 31), !dbg !68 + %827 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 2, i32 31), !dbg !68 + %828 = insertelement <4 x i32> poison, i32 %817, i64 0, !dbg !69 + %829 = insertelement <4 x i32> %828, i32 %819, i64 1, !dbg !69 + %830 = insertelement <4 x i32> %829, i32 %821, i64 2, !dbg !69 + %831 = insertelement <4 x i32> %830, i32 %823, i64 3, !dbg !69 + %832 = insertelement <4 x i32> poison, i32 %824, i64 0, !dbg !69 + %833 = insertelement <4 x i32> %832, i32 %825, i64 1, !dbg !69 + %834 = insertelement <4 x i32> %833, i32 %826, i64 2, !dbg !69 + %835 = insertelement <4 x i32> %834, i32 %827, i64 3, !dbg !69 + %836 = add <4 x i32> %831, %835, !dbg !69 + %837 = mul nuw nsw i32 %816, %.lobit, !dbg !28 + %838 = mul nuw nsw i32 %818, %.lobit, !dbg !28 + %839 = mul nuw nsw i32 %820, %.lobit, !dbg !28 + %840 = mul nuw nsw i32 %822, %.lobit, !dbg !28 + %841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 2, i32 31), !dbg !68 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %838, i32 2, i32 31), !dbg !68 + %843 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %839, i32 2, i32 31), !dbg !68 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 2, i32 31), !dbg !68 + %845 = insertelement <4 x i32> poison, i32 %837, i64 0, !dbg !69 + %846 = insertelement <4 x i32> %845, i32 %838, i64 1, !dbg !69 + %847 = insertelement <4 x i32> %846, i32 %839, i64 2, !dbg !69 + %848 = insertelement <4 x i32> %847, i32 %840, i64 3, !dbg !69 + %849 = insertelement <4 x i32> poison, i32 %841, i64 0, !dbg !69 + %850 = insertelement <4 x i32> %849, i32 %842, i64 1, !dbg !69 + %851 = insertelement <4 x i32> %850, i32 %843, i64 2, !dbg !69 + %852 = insertelement <4 x i32> %851, i32 %844, i64 3, !dbg !69 + %853 = add <4 x i32> %848, %852, !dbg !69 + %854 = shufflevector <2 x i32> %815, <2 x i32> %814, <4 x i32> , !dbg !25 + %855 = mul nuw nsw <4 x i32> %854, %31, !dbg !25 + %856 = extractelement <4 x i32> %855, i64 0, !dbg !68 + %857 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %856, i32 2, i32 31), !dbg !68 + %858 = extractelement <4 x i32> %855, i64 1, !dbg !68 + %859 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %858, i32 2, i32 31), !dbg !68 + %860 = extractelement <4 x i32> %855, i64 2, !dbg !68 + %861 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %860, i32 2, i32 31), !dbg !68 + %862 = extractelement <4 x i32> %855, i64 3, !dbg !68 + %863 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %862, i32 2, i32 31), !dbg !68 + %864 = insertelement <4 x i32> poison, i32 %857, i64 0, !dbg !69 + %865 = insertelement <4 x i32> %864, i32 %859, i64 1, !dbg !69 + %866 = insertelement <4 x i32> %865, i32 %861, i64 2, !dbg !69 + %867 = insertelement <4 x i32> %866, i32 %863, i64 3, !dbg !69 + %868 = add <4 x i32> %855, %867, !dbg !69 + %869 = mul nuw nsw <4 x i32> %854, %33, !dbg !26 + %870 = extractelement <4 x i32> %869, i64 0, !dbg !68 + %871 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %870, i32 2, i32 31), !dbg !68 + %872 = extractelement <4 x i32> %869, i64 1, !dbg !68 + %873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %872, i32 2, i32 31), !dbg !68 + %874 = extractelement <4 x i32> %869, i64 2, !dbg !68 + %875 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %874, i32 2, i32 31), !dbg !68 + %876 = extractelement <4 x i32> %869, i64 3, !dbg !68 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 2, i32 31), !dbg !68 + %878 = insertelement <4 x i32> poison, i32 %871, i64 0, !dbg !69 + %879 = insertelement <4 x i32> %878, i32 %873, i64 1, !dbg !69 + %880 = insertelement <4 x i32> %879, i32 %875, i64 2, !dbg !69 + %881 = insertelement <4 x i32> %880, i32 %877, i64 3, !dbg !69 + %882 = add <4 x i32> %869, %881, !dbg !69 + %883 = icmp slt <4 x i32> %836, %853, !dbg !60 + %884 = icmp eq <4 x i32> %836, %853, !dbg !70 + %885 = icmp sgt <4 x i32> %868, %882, !dbg !71 + %886 = and <4 x i1> %884, %885, !dbg !72 + %887 = or <4 x i1> %883, %886, !dbg !73 + %888 = or <4 x i1> %883, %886, !dbg !73 + %889 = shufflevector <4 x i1> %888, <4 x i1> poison, <2 x i32> , !dbg !73 + %890 = or <4 x i1> %883, %886, !dbg !73 + %891 = shufflevector <4 x i1> %890, <4 x i1> poison, <2 x i32> , !dbg !73 + %foldExtExtBinop = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop55 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop57 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop59 = xor <4 x i32> %836, %853, !dbg !62 + %892 = shufflevector <2 x i1> %889, <2 x i1> %891, <2 x i32> , !dbg !63 + %893 = shufflevector <4 x i32> %foldExtExtBinop57, <4 x i32> %foldExtExtBinop, <2 x i32> , !dbg !63 + %894 = select <2 x i1> %892, <2 x i32> %893, <2 x i32> zeroinitializer, !dbg !63 + %895 = shufflevector <2 x i1> %891, <2 x i1> %889, <2 x i32> , !dbg !63 + %896 = shufflevector <4 x i32> %foldExtExtBinop59, <4 x i32> %foldExtExtBinop55, <2 x i32> , !dbg !63 + %897 = select <2 x i1> %895, <2 x i32> %896, <2 x i32> zeroinitializer, !dbg !63 + %898 = shufflevector <2 x i32> %894, <2 x i32> %897, <2 x i32> , !dbg !64 + %899 = shufflevector <2 x i32> %801, <2 x i32> %804, <2 x i32> , !dbg !64 + %900 = xor <2 x i32> %898, %899, !dbg !64 + %901 = xor <2 x i32> %894, %801, !dbg !64 + %902 = xor <2 x i32> %897, %804, !dbg !64 + %903 = shufflevector <2 x i32> %897, <2 x i32> %894, <2 x i32> , !dbg !64 + %904 = shufflevector <2 x i32> %804, <2 x i32> %801, <2 x i32> , !dbg !64 + %905 = xor <2 x i32> %903, %904, !dbg !64 + %906 = xor <4 x i32> %868, %882, !dbg !67 + %907 = xor <4 x i32> %868, %882, !dbg !67 + %908 = shufflevector <4 x i32> %907, <4 x i32> poison, <2 x i32> , !dbg !67 + %909 = xor <4 x i32> %868, %882, !dbg !67 + %910 = shufflevector <4 x i32> %909, <4 x i32> poison, <2 x i32> , !dbg !67 + %911 = select <4 x i1> %887, <4 x i32> %906, <4 x i32> zeroinitializer, !dbg !65 + %912 = select <2 x i1> %889, <2 x i32> %908, <2 x i32> zeroinitializer, !dbg !65 + %913 = select <2 x i1> %891, <2 x i32> %910, <2 x i32> zeroinitializer, !dbg !65 + %914 = xor <4 x i32> %813, %911, !dbg !66 + %915 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %916 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %917 = xor <2 x i32> %915, %916, !dbg !66 + %918 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %919 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %920 = xor <2 x i32> %918, %919, !dbg !66 + %921 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %922 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %923 = xor <2 x i32> %921, %922, !dbg !66 + %924 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %925 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %926 = xor <2 x i32> %924, %925, !dbg !66 + %927 = shufflevector <2 x i32> %901, <2 x i32> %902, <4 x i32> , !dbg !27 + %928 = mul nuw nsw <4 x i32> %927, %35, !dbg !27 + %929 = extractelement <4 x i32> %928, i64 0, !dbg !68 + %930 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %929, i32 1, i32 31), !dbg !68 + %931 = extractelement <4 x i32> %928, i64 1, !dbg !68 + %932 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %931, i32 1, i32 31), !dbg !68 + %933 = extractelement <4 x i32> %928, i64 2, !dbg !68 + %934 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %933, i32 1, i32 31), !dbg !68 + %935 = extractelement <4 x i32> %928, i64 3, !dbg !68 + %936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !68 + %937 = insertelement <4 x i32> poison, i32 %930, i64 0, !dbg !69 + %938 = insertelement <4 x i32> %937, i32 %932, i64 1, !dbg !69 + %939 = insertelement <4 x i32> %938, i32 %934, i64 2, !dbg !69 + %940 = insertelement <4 x i32> %939, i32 %936, i64 3, !dbg !69 + %941 = add <4 x i32> %928, %940, !dbg !69 + %942 = mul nuw nsw <4 x i32> %927, %37, !dbg !28 + %943 = extractelement <4 x i32> %942, i64 0, !dbg !68 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !68 + %945 = extractelement <4 x i32> %942, i64 1, !dbg !68 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %945, i32 1, i32 31), !dbg !68 + %947 = extractelement <4 x i32> %942, i64 2, !dbg !68 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %947, i32 1, i32 31), !dbg !68 + %949 = extractelement <4 x i32> %942, i64 3, !dbg !68 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %949, i32 1, i32 31), !dbg !68 + %951 = insertelement <4 x i32> poison, i32 %944, i64 0, !dbg !69 + %952 = insertelement <4 x i32> %951, i32 %946, i64 1, !dbg !69 + %953 = insertelement <4 x i32> %952, i32 %948, i64 2, !dbg !69 + %954 = insertelement <4 x i32> %953, i32 %950, i64 3, !dbg !69 + %955 = add <4 x i32> %942, %954, !dbg !69 + %956 = extractelement <2 x i32> %926, i64 1, !dbg !26 + %957 = mul nuw nsw i32 %956, %24, !dbg !25 + %958 = extractelement <2 x i32> %923, i64 1, !dbg !26 + %959 = mul nuw nsw i32 %958, %24, !dbg !25 + %960 = extractelement <2 x i32> %926, i64 0, !dbg !26 + %961 = mul nuw nsw i32 %960, %24, !dbg !25 + %962 = extractelement <2 x i32> %923, i64 0, !dbg !26 + %963 = mul nuw nsw i32 %962, %24, !dbg !25 + %964 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %957, i32 1, i32 31), !dbg !68 + %965 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !68 + %966 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !68 + %967 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !68 + %968 = insertelement <2 x i32> poison, i32 %961, i64 0, !dbg !69 + %969 = insertelement <2 x i32> %968, i32 %959, i64 1, !dbg !69 + %970 = insertelement <2 x i32> poison, i32 %966, i64 0, !dbg !69 + %971 = insertelement <2 x i32> %970, i32 %965, i64 1, !dbg !69 + %972 = add <2 x i32> %969, %971, !dbg !69 + %973 = insertelement <4 x i32> poison, i32 %964, i64 0, !dbg !69 + %974 = insertelement <4 x i32> %973, i32 %965, i64 1, !dbg !69 + %975 = insertelement <4 x i32> %974, i32 %966, i64 2, !dbg !69 + %976 = insertelement <4 x i32> %975, i32 %967, i64 3, !dbg !69 + %977 = insertelement <4 x i32> poison, i32 %957, i64 0, !dbg !69 + %978 = insertelement <4 x i32> %977, i32 %959, i64 1, !dbg !69 + %979 = insertelement <4 x i32> %978, i32 %961, i64 2, !dbg !69 + %980 = insertelement <4 x i32> %979, i32 %963, i64 3, !dbg !69 + %981 = add <4 x i32> %976, %980, !dbg !69 + %982 = insertelement <2 x i32> poison, i32 %963, i64 0, !dbg !69 + %983 = insertelement <2 x i32> %982, i32 %957, i64 1, !dbg !69 + %984 = insertelement <2 x i32> poison, i32 %967, i64 0, !dbg !69 + %985 = insertelement <2 x i32> %984, i32 %964, i64 1, !dbg !69 + %986 = add <2 x i32> %983, %985, !dbg !69 + %987 = mul nuw nsw i32 %956, %22, !dbg !26 + %988 = mul nuw nsw i32 %958, %22, !dbg !26 + %989 = mul nuw nsw i32 %960, %22, !dbg !26 + %990 = mul nuw nsw i32 %962, %22, !dbg !26 + %991 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !68 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %988, i32 1, i32 31), !dbg !68 + %993 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %989, i32 1, i32 31), !dbg !68 + %994 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %990, i32 1, i32 31), !dbg !68 + %995 = insertelement <2 x i32> poison, i32 %993, i64 0, !dbg !69 + %996 = insertelement <2 x i32> %995, i32 %992, i64 1, !dbg !69 + %997 = insertelement <2 x i32> poison, i32 %989, i64 0, !dbg !69 + %998 = insertelement <2 x i32> %997, i32 %988, i64 1, !dbg !69 + %999 = add <2 x i32> %996, %998, !dbg !69 + %1000 = insertelement <4 x i32> poison, i32 %991, i64 0, !dbg !69 + %1001 = insertelement <4 x i32> %1000, i32 %992, i64 1, !dbg !69 + %1002 = insertelement <4 x i32> %1001, i32 %993, i64 2, !dbg !69 + %1003 = insertelement <4 x i32> %1002, i32 %994, i64 3, !dbg !69 + %1004 = insertelement <4 x i32> poison, i32 %987, i64 0, !dbg !69 + %1005 = insertelement <4 x i32> %1004, i32 %988, i64 1, !dbg !69 + %1006 = insertelement <4 x i32> %1005, i32 %989, i64 2, !dbg !69 + %1007 = insertelement <4 x i32> %1006, i32 %990, i64 3, !dbg !69 + %1008 = add <4 x i32> %1003, %1007, !dbg !69 + %1009 = insertelement <2 x i32> poison, i32 %994, i64 0, !dbg !69 + %1010 = insertelement <2 x i32> %1009, i32 %991, i64 1, !dbg !69 + %1011 = insertelement <2 x i32> poison, i32 %990, i64 0, !dbg !69 + %1012 = insertelement <2 x i32> %1011, i32 %987, i64 1, !dbg !69 + %1013 = add <2 x i32> %1010, %1012, !dbg !69 + %1014 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1015 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1016 = shufflevector <4 x i1> %1015, <4 x i1> poison, <2 x i32> , !dbg !60 + %1017 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1018 = shufflevector <4 x i1> %1017, <4 x i1> poison, <2 x i32> , !dbg !60 + %1019 = icmp eq <4 x i32> %941, %955, !dbg !70 + %1020 = icmp sgt <4 x i32> %981, %1008, !dbg !71 + %1021 = and <4 x i1> %1019, %1020, !dbg !72 + %1022 = and <4 x i1> %1019, %1020, !dbg !72 + %1023 = shufflevector <4 x i1> %1022, <4 x i1> poison, <2 x i32> , !dbg !72 + %1024 = and <4 x i1> %1019, %1020, !dbg !72 + %1025 = shufflevector <4 x i1> %1024, <4 x i1> poison, <2 x i32> , !dbg !72 + %1026 = or <4 x i1> %1014, %1021, !dbg !73 + %1027 = or <2 x i1> %1016, %1023, !dbg !73 + %1028 = or <2 x i1> %1018, %1025, !dbg !73 + %1029 = shufflevector <2 x i1> %1018, <2 x i1> %1016, <2 x i32> , !dbg !73 + %1030 = shufflevector <2 x i1> %1025, <2 x i1> %1023, <2 x i32> , !dbg !73 + %1031 = or <2 x i1> %1029, %1030, !dbg !73 + %1032 = shufflevector <2 x i1> %1016, <2 x i1> %1018, <2 x i32> , !dbg !73 + %1033 = shufflevector <2 x i1> %1023, <2 x i1> %1025, <2 x i32> , !dbg !73 + %1034 = or <2 x i1> %1032, %1033, !dbg !73 + %1035 = xor <4 x i32> %941, %955, !dbg !62 + %1036 = shufflevector <4 x i32> %1035, <4 x i32> poison, <2 x i32> , !dbg !62 + %1037 = xor <4 x i32> %941, %955, !dbg !62 + %1038 = shufflevector <4 x i32> %1037, <4 x i32> poison, <2 x i32> , !dbg !62 + %1039 = shufflevector <2 x i1> %1034, <2 x i1> %1031, <2 x i32> , !dbg !63 + %1040 = shufflevector <2 x i32> %1036, <2 x i32> %1038, <2 x i32> , !dbg !63 + %1041 = select <2 x i1> %1039, <2 x i32> %1040, <2 x i32> zeroinitializer, !dbg !63 + %1042 = select <2 x i1> %1034, <2 x i32> %1036, <2 x i32> zeroinitializer, !dbg !63 + %1043 = select <2 x i1> %1031, <2 x i32> %1038, <2 x i32> zeroinitializer, !dbg !63 + %1044 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !63 + %1045 = shufflevector <2 x i32> %1038, <2 x i32> %1036, <2 x i32> , !dbg !63 + %1046 = select <2 x i1> %1044, <2 x i32> %1045, <2 x i32> zeroinitializer, !dbg !63 + %1047 = xor <2 x i32> %1041, %900, !dbg !64 + %1048 = xor <2 x i32> %1042, %901, !dbg !64 + %1049 = xor <2 x i32> %1043, %902, !dbg !64 + %1050 = xor <2 x i32> %1046, %905, !dbg !64 + %1051 = shufflevector <2 x i32> %1013, <2 x i32> %999, <4 x i32> , !dbg !67 + %1052 = shufflevector <2 x i32> %986, <2 x i32> %972, <4 x i32> , !dbg !67 + %1053 = xor <4 x i32> %1051, %1052, !dbg !67 + %1054 = xor <2 x i32> %999, %972, !dbg !67 + %1055 = xor <2 x i32> %1013, %986, !dbg !67 + %1056 = select <4 x i1> %1026, <4 x i32> %1053, <4 x i32> zeroinitializer, !dbg !65 + %1057 = select <2 x i1> %1027, <2 x i32> %1054, <2 x i32> zeroinitializer, !dbg !65 + %1058 = select <2 x i1> %1028, <2 x i32> %1055, <2 x i32> zeroinitializer, !dbg !65 + %1059 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1060 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1061 = select <2 x i1> %1059, <2 x i32> %1060, <2 x i32> zeroinitializer, !dbg !65 + %1062 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1063 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1064 = select <2 x i1> %1062, <2 x i32> %1063, <2 x i32> zeroinitializer, !dbg !65 + %1065 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1066 = select <2 x i1> %1031, <2 x i32> %1065, <2 x i32> zeroinitializer, !dbg !65 + %1067 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1068 = select <2 x i1> %1034, <2 x i32> %1067, <2 x i32> zeroinitializer, !dbg !65 + %1069 = xor <4 x i32> %914, %1056, !dbg !66 + %1070 = xor <2 x i32> %1061, %917, !dbg !66 + %1071 = xor <2 x i32> %1064, %920, !dbg !66 + %1072 = xor <2 x i32> %1066, %923, !dbg !66 + %1073 = xor <2 x i32> %1068, %926, !dbg !66 + %1074 = extractelement <2 x i32> %1048, i64 0, !dbg !60 + %1075 = extractelement <2 x i32> %1048, i64 1, !dbg !60 + %1076 = icmp slt i32 %1075, %1074, !dbg !60 + %1077 = extractelement <2 x i32> %1049, i64 0, !dbg !60 + %1078 = extractelement <2 x i32> %1049, i64 1, !dbg !60 + %1079 = icmp slt i32 %1078, %1077, !dbg !60 + %1080 = icmp eq i32 %1075, %1074, !dbg !70 + %1081 = icmp eq i32 %1078, %1077, !dbg !70 + %shift61 = shufflevector <2 x i32> %1073, <2 x i32> poison, <2 x i32> , !dbg !71 + %1082 = icmp sgt <2 x i32> %shift61, %1073, !dbg !71 + %1083 = extractelement <2 x i1> %1082, i64 0, !dbg !71 + %shift62 = shufflevector <2 x i32> %1072, <2 x i32> poison, <2 x i32> , !dbg !71 + %1084 = icmp sgt <2 x i32> %shift62, %1072, !dbg !71 + %1085 = extractelement <2 x i1> %1084, i64 0, !dbg !71 + %1086 = and i1 %1080, %1083, !dbg !72 + %1087 = and i1 %1081, %1085, !dbg !72 + %1088 = insertelement <2 x i1> poison, i1 %1076, i64 0, !dbg !73 + %1089 = insertelement <2 x i1> %1088, i1 %1079, i64 1, !dbg !73 + %1090 = insertelement <2 x i1> poison, i1 %1086, i64 0, !dbg !73 + %1091 = insertelement <2 x i1> %1090, i1 %1087, i64 1, !dbg !73 + %1092 = or <2 x i1> %1089, %1091, !dbg !73 + %1093 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1094 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1095 = xor <2 x i32> %1093, %1094, !dbg !62 + %1096 = select <2 x i1> %1092, <2 x i32> %1095, <2 x i32> zeroinitializer, !dbg !63 + %1097 = xor <2 x i32> %1096, %1047, !dbg !64 + %1098 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !64 + %1099 = xor <2 x i32> %1098, %1048, !dbg !64 + %1100 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1101 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1102 = xor <2 x i32> %1101, %1049, !dbg !64 + %1103 = xor <2 x i32> %1100, %1050, !dbg !64 + %1104 = xor <2 x i32> %1071, %1070, !dbg !67 + %1105 = shufflevector <2 x i1> %1092, <2 x i1> poison, <2 x i32> , !dbg !65 + %1106 = select <2 x i1> %1105, <2 x i32> %1104, <2 x i32> zeroinitializer, !dbg !65 + %1107 = shufflevector <2 x i32> %1106, <2 x i32> poison, <4 x i32> , !dbg !66 + %1108 = xor <4 x i32> %1069, %1107, !dbg !66 + %1109 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1110 = xor <2 x i32> %1057, %1109, !dbg !66 + %1111 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !66 + %1112 = xor <2 x i32> %1111, %1072, !dbg !66 + %1113 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1114 = xor <2 x i32> %1113, %1073, !dbg !66 + %1115 = icmp slt <2 x i32> %1099, %1102, !dbg !60 + %1116 = icmp eq <2 x i32> %1097, %1103, !dbg !70 + %1117 = icmp sgt <2 x i32> %1114, %1112, !dbg !71 + %1118 = and <2 x i1> %1116, %1117, !dbg !72 + %1119 = or <2 x i1> %1115, %1118, !dbg !73 + %1120 = xor <2 x i32> %1110, %1058, !dbg !67 + %1121 = xor <2 x i32> %1120, %814, !dbg !67 + %1122 = xor <2 x i32> %1121, %912, !dbg !67 + %1123 = xor <2 x i32> %1122, %815, !dbg !67 + %1124 = xor <2 x i32> %1123, %913, !dbg !67 + %1125 = xor <2 x i32> %1124, %1106, !dbg !67 + %1126 = select <2 x i1> %1119, <2 x i32> %1125, <2 x i32> zeroinitializer, !dbg !65 + %1127 = shufflevector <2 x i32> %1126, <2 x i32> poison, <4 x i32> , !dbg !65 + %1128 = xor <4 x i32> %1108, %1127, !dbg !66 + %1129 = xor <4 x i32> %1128, %739, !dbg !66 + %1130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %539, i32 2, i32 31), !dbg !58 + %1131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !58 + %1132 = insertelement <2 x i32> poison, i32 %1130, i64 0, !dbg !58 + %1133 = insertelement <2 x i32> %1132, i32 %1131, i64 1, !dbg !58 + %1134 = bitcast <2 x i32> %1133 to i64, !dbg !58 + %1135 = add i64 %538, %1134, !dbg !56 + %extelt.offset = lshr i64 %1135, 32, !dbg !58 + %1136 = trunc nuw i64 %extelt.offset to i32, !dbg !58 + %1137 = trunc i64 %1135 to i32, !dbg !58 + %1138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1137, i32 1, i32 31), !dbg !58 + %1139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1136, i32 1, i32 31), !dbg !58 + %1140 = insertelement <2 x i32> poison, i32 %1138, i64 0, !dbg !58 + %1141 = insertelement <2 x i32> %1140, i32 %1139, i64 1, !dbg !58 + %1142 = bitcast <2 x i32> %1141 to i64, !dbg !58 + %1143 = add i64 %1135, %1142, !dbg !56 + %1144 = insertelement <1 x i64> poison, i64 %1143, i64 0, !dbg !34 + store <1 x i64> %1144, ptr addrspace(3) %49, align 8, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !34 + %1145 = load i64, ptr addrspace(3) %51, align 8, !dbg !34 + %1146 = select <4 x i1> %47, <4 x i1> %540, <4 x i1> zeroinitializer, !dbg !74 + %1147 = bitcast <4 x i1> %1146 to i4, !dbg !75 + %1148 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %1147), !dbg !75 + %1149 = zext nneg i4 %1148 to i64, !dbg !75 + %1150 = zext nneg i4 %1148 to i32, !dbg !77 + %1151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1150, i32 2, i32 31), !dbg !77 + %1152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %1153 = insertelement <2 x i32> poison, i32 %1151, i64 0, !dbg !77 + %1154 = insertelement <2 x i32> %1153, i32 %1152, i64 1, !dbg !77 + %1155 = bitcast <2 x i32> %1154 to i64, !dbg !77 + %1156 = add i64 %1149, %1155, !dbg !75 + %extelt.offset34 = lshr i64 %1156, 32, !dbg !77 + %1157 = trunc nuw i64 %extelt.offset34 to i32, !dbg !77 + %1158 = trunc i64 %1156 to i32, !dbg !77 + %1159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1158, i32 1, i32 31), !dbg !77 + %1160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1157, i32 1, i32 31), !dbg !77 + %1161 = insertelement <2 x i32> poison, i32 %1159, i64 0, !dbg !77 + %1162 = insertelement <2 x i32> %1161, i32 %1160, i64 1, !dbg !77 + %1163 = bitcast <2 x i32> %1162 to i64, !dbg !77 + %1164 = add i64 %1156, %1163, !dbg !75 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1165 = insertelement <1 x i64> poison, i64 %1164, i64 0, !dbg !78 + store <1 x i64> %1165, ptr addrspace(3) %49, align 8, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1166 = load i64, ptr addrspace(3) %51, align 8, !dbg !78 + %1167 = trunc i64 %1143 to i32, !dbg !34 + %1168 = insertelement <4 x i32> poison, i32 %55, i64 0, !dbg !79 + %1169 = insertelement <4 x i32> %1168, i32 %54, i64 1, !dbg !79 + %1170 = insertelement <4 x i32> %1169, i32 %53, i64 2, !dbg !79 + %1171 = insertelement <4 x i32> %1170, i32 %52, i64 3, !dbg !79 + %1172 = insertelement <4 x i32> poison, i32 %1167, i64 0, !dbg !79 + %1173 = shufflevector <4 x i32> %1172, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %1174 = icmp slt <4 x i32> %1171, %1173, !dbg !79 + %1175 = select <4 x i1> %1174, <4 x i32> %534, <4 x i32> splat (i32 16), !dbg !80 + %1176 = add <4 x i32> %1175, splat (i32 17), !dbg !81 + %1177 = icmp slt <4 x i32> %1175, zeroinitializer, !dbg !82 + %1178 = select <4 x i1> %1177, <4 x i32> %1176, <4 x i32> %1175, !dbg !83 + %1179 = icmp ugt <4 x i32> %1178, splat (i32 16), !dbg !84 + %shift63 = shufflevector <4 x i1> %1179, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop64 = or <4 x i1> %shift63, %1179, !dbg !85 + %shift66 = shufflevector <4 x i1> %foldExtExtBinop64, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop67 = or <4 x i1> %1179, %shift66, !dbg !85 + %shift69 = shufflevector <4 x i1> %foldExtExtBinop67, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop70 = or <4 x i1> %1179, %shift69, !dbg !85 + %1180 = extractelement <4 x i1> %foldExtExtBinop70, i64 0, !dbg !85 + %1181 = and i1 %19, %1180, !dbg !85 + br i1 %1181, label %1182, label %1183, !dbg !85 + +1182: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !85 + unreachable, !dbg !85 + +1183: ; preds = %11 + %1184 = trunc i64 %1164 to i32, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !85 + %1185 = insertelement <4 x i32> poison, i32 %52, i64 0, !dbg !86 + %1186 = insertelement <4 x i32> %1185, i32 %53, i64 1, !dbg !86 + %1187 = insertelement <4 x i32> %1186, i32 %54, i64 2, !dbg !86 + %1188 = insertelement <4 x i32> %1187, i32 %55, i64 3, !dbg !86 + %1189 = insertelement <4 x i32> poison, i32 %1184, i64 0, !dbg !86 + %1190 = shufflevector <4 x i32> %1189, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %1191 = icmp slt <4 x i32> %1188, %1190, !dbg !86 + %1192 = select <4 x i1> %1191, <4 x i32> %1129, <4 x i32> splat (i32 16), !dbg !87 + %1193 = add <4 x i32> %1192, splat (i32 17), !dbg !88 + %1194 = icmp slt <4 x i32> %1192, zeroinitializer, !dbg !89 + %1195 = select <4 x i1> %1194, <4 x i32> %1193, <4 x i32> %1192, !dbg !90 + %1196 = icmp ugt <4 x i32> %1195, splat (i32 16), !dbg !91 + %1197 = bitcast <4 x i1> %1196 to i4, !dbg !92 + %1198 = icmp ne i4 %1197, 0, !dbg !92 + %1199 = and i1 %19, %1198, !dbg !92 + br i1 %1199, label %1200, label %1201, !dbg !92 + +1200: ; preds = %1183 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !92 + unreachable, !dbg !92 + +1201: ; preds = %1183 + %1202 = trunc i64 %1166 to i32, !dbg !78 + %1203 = trunc i64 %1145 to i32, !dbg !34 + %1204 = or disjoint i32 %13, %17, !dbg !13 + %1205 = icmp slt i32 %1204, 32, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !92 + %1206 = sext i32 %1204 to i64, !dbg !93 + %1207 = getelementptr i32, ptr addrspace(1) %1, i64 %1206, !dbg !93 + %1208 = and i32 %14, 96, !dbg !94 + %1209 = icmp eq i32 %1208, 0, !dbg !94 + %1210 = and i1 %1209, %1205, !dbg !94 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1203, ptr addrspace(1) %1207, i1 %1210) #6, !dbg !94 + %1211 = getelementptr i32, ptr addrspace(1) %2, i64 %1206, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1202, ptr addrspace(1) %1211, i1 %1210) #6, !dbg !96 + %1212 = getelementptr i32, ptr addrspace(1) %3, i64 %58, !dbg !97 + %1213 = extractelement <4 x i32> %534, i64 0, !dbg !98 + %1214 = extractelement <4 x i32> %534, i64 1, !dbg !98 + %1215 = extractelement <4 x i32> %534, i64 2, !dbg !98 + %1216 = extractelement <4 x i32> %534, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1216, i32 %1215, i32 %1214, i32 %1213, ptr addrspace(1) %1212, i1 %19) #6, !dbg !98 + %1217 = mul i32 %18, 17, !dbg !99 + %1218 = extractelement <4 x i32> %1178, i64 3, !dbg !100 + %1219 = add i32 %1218, %1217, !dbg !100 + %1220 = extractelement <4 x i32> %1178, i64 2, !dbg !100 + %1221 = add i32 %1220, %1217, !dbg !100 + %1222 = extractelement <4 x i32> %1178, i64 1, !dbg !100 + %1223 = add i32 %1222, %1217, !dbg !100 + %1224 = extractelement <4 x i32> %1178, i64 0, !dbg !100 + %1225 = add i32 %1224, %1217, !dbg !100 + %1226 = sext i32 %1219 to i64, !dbg !101 + %1227 = getelementptr i32, ptr addrspace(1) %4, i64 %1226, !dbg !101 + %1228 = sext i32 %1221 to i64, !dbg !101 + %1229 = getelementptr i32, ptr addrspace(1) %4, i64 %1228, !dbg !101 + %1230 = sext i32 %1223 to i64, !dbg !101 + %1231 = getelementptr i32, ptr addrspace(1) %4, i64 %1230, !dbg !101 + %1232 = sext i32 %1225 to i64, !dbg !101 + %1233 = getelementptr i32, ptr addrspace(1) %4, i64 %1232, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1234 = ptrtoint ptr addrspace(1) %1227 to i64, !dbg !102 + %1235 = ptrtoint ptr addrspace(1) %1229 to i64, !dbg !102 + %1236 = ptrtoint ptr addrspace(1) %1231 to i64, !dbg !102 + %1237 = ptrtoint ptr addrspace(1) %1233 to i64, !dbg !102 + %1238 = and i32 %14, 48, !dbg !102 + %1239 = shl nuw nsw i32 %1238, 6, !dbg !102 + %1240 = shl nuw nsw i32 %14, 3, !dbg !102 + %1241 = and i32 %1240, 120, !dbg !102 + %1242 = lshr exact i32 %1238, 1, !dbg !102 + %1243 = shl nuw nsw i32 %14, 1, !dbg !102 + %1244 = and i32 %1243, 128, !dbg !102 + %1245 = or disjoint i32 %1239, %1241, !dbg !102 + %1246 = xor i32 %1245, %1242, !dbg !102 + %1247 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1244, !dbg !102 + %1248 = getelementptr inbounds nuw i8, ptr addrspace(3) %1247, i32 %1246, !dbg !102 + %1249 = insertelement <1 x i64> poison, i64 %1234, i64 0, !dbg !102 + store <1 x i64> %1249, ptr addrspace(3) %1248, align 8, !dbg !102 + %1250 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 256, !dbg !102 + %1251 = insertelement <1 x i64> poison, i64 %1235, i64 0, !dbg !102 + store <1 x i64> %1251, ptr addrspace(3) %1250, align 8, !dbg !102 + %1252 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 512, !dbg !102 + %1253 = insertelement <1 x i64> poison, i64 %1236, i64 0, !dbg !102 + store <1 x i64> %1253, ptr addrspace(3) %1252, align 8, !dbg !102 + %1254 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 768, !dbg !102 + %1255 = insertelement <1 x i64> poison, i64 %1237, i64 0, !dbg !102 + store <1 x i64> %1255, ptr addrspace(3) %1254, align 8, !dbg !102 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1256 = and i32 %14, 12, !dbg !102 + %1257 = shl nuw nsw i32 %1256, 8, !dbg !102 + %1258 = shl nuw nsw i32 %20, 5, !dbg !102 + %1259 = and i32 %1240, 896, !dbg !102 + %1260 = shl nuw nsw i32 %1256, 1, !dbg !102 + %1261 = or disjoint i32 %1257, %1258, !dbg !102 + %1262 = or disjoint i32 %1261, %1259, !dbg !102 + %1263 = or disjoint i32 %1262, %1260, !dbg !102 + %1264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1263, !dbg !102 + %1265 = load i64, ptr addrspace(3) %1264, align 8, !dbg !102 + %1266 = xor i32 %1263, 8, !dbg !102 + %1267 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1266, !dbg !102 + %1268 = load i64, ptr addrspace(3) %1267, align 8, !dbg !102 + %1269 = xor i32 %1263, 16, !dbg !102 + %1270 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1269, !dbg !102 + %1271 = load i64, ptr addrspace(3) %1270, align 8, !dbg !102 + %1272 = xor i32 %1263, 24, !dbg !102 + %1273 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1272, !dbg !102 + %1274 = load i64, ptr addrspace(3) %1273, align 8, !dbg !102 + %1275 = inttoptr i64 %1265 to ptr addrspace(1), !dbg !102 + %1276 = inttoptr i64 %1268 to ptr addrspace(1), !dbg !102 + %1277 = inttoptr i64 %1271 to ptr addrspace(1), !dbg !102 + %1278 = inttoptr i64 %1274 to ptr addrspace(1), !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1275, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1276, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1277, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1278, i1 %1205) #6, !dbg !102 + %1279 = getelementptr i32, ptr addrspace(1) %5, i64 %58, !dbg !103 + %1280 = extractelement <4 x i32> %1129, i64 0, !dbg !104 + %1281 = extractelement <4 x i32> %1129, i64 1, !dbg !104 + %1282 = extractelement <4 x i32> %1129, i64 2, !dbg !104 + %1283 = extractelement <4 x i32> %1129, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1280, i32 %1281, i32 %1282, i32 %1283, ptr addrspace(1) %1279, i1 %19) #6, !dbg !104 + %1284 = extractelement <4 x i32> %1195, i64 0, !dbg !105 + %1285 = add i32 %1284, %1217, !dbg !105 + %1286 = extractelement <4 x i32> %1195, i64 1, !dbg !105 + %1287 = add i32 %1286, %1217, !dbg !105 + %1288 = extractelement <4 x i32> %1195, i64 2, !dbg !105 + %1289 = add i32 %1288, %1217, !dbg !105 + %1290 = extractelement <4 x i32> %1195, i64 3, !dbg !105 + %1291 = add i32 %1290, %1217, !dbg !105 + %1292 = sext i32 %1285 to i64, !dbg !106 + %1293 = getelementptr i32, ptr addrspace(1) %6, i64 %1292, !dbg !106 + %1294 = sext i32 %1287 to i64, !dbg !106 + %1295 = getelementptr i32, ptr addrspace(1) %6, i64 %1294, !dbg !106 + %1296 = sext i32 %1289 to i64, !dbg !106 + %1297 = getelementptr i32, ptr addrspace(1) %6, i64 %1296, !dbg !106 + %1298 = sext i32 %1291 to i64, !dbg !106 + %1299 = getelementptr i32, ptr addrspace(1) %6, i64 %1298, !dbg !106 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1300 = ptrtoint ptr addrspace(1) %1293 to i64, !dbg !107 + %1301 = ptrtoint ptr addrspace(1) %1295 to i64, !dbg !107 + %1302 = ptrtoint ptr addrspace(1) %1297 to i64, !dbg !107 + %1303 = ptrtoint ptr addrspace(1) %1299 to i64, !dbg !107 + %1304 = insertelement <1 x i64> poison, i64 %1300, i64 0, !dbg !107 + store <1 x i64> %1304, ptr addrspace(3) %1248, align 8, !dbg !107 + %1305 = insertelement <1 x i64> poison, i64 %1301, i64 0, !dbg !107 + store <1 x i64> %1305, ptr addrspace(3) %1250, align 8, !dbg !107 + %1306 = insertelement <1 x i64> poison, i64 %1302, i64 0, !dbg !107 + store <1 x i64> %1306, ptr addrspace(3) %1252, align 8, !dbg !107 + %1307 = insertelement <1 x i64> poison, i64 %1303, i64 0, !dbg !107 + store <1 x i64> %1307, ptr addrspace(3) %1254, align 8, !dbg !107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1308 = load i64, ptr addrspace(3) %1264, align 8, !dbg !107 + %1309 = load i64, ptr addrspace(3) %1267, align 8, !dbg !107 + %1310 = load i64, ptr addrspace(3) %1270, align 8, !dbg !107 + %1311 = load i64, ptr addrspace(3) %1273, align 8, !dbg !107 + %1312 = inttoptr i64 %1308 to ptr addrspace(1), !dbg !107 + %1313 = inttoptr i64 %1309 to ptr addrspace(1), !dbg !107 + %1314 = inttoptr i64 %1310 to ptr addrspace(1), !dbg !107 + %1315 = inttoptr i64 %1311 to ptr addrspace(1), !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1312, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1313, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1314, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1315, i1 %1205) #6, !dbg !107 + ret void, !dbg !108 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i4 @llvm.ctpop.i4(i4) #5 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 627, column: 44, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !9, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 46, column: 71, scope: !9) +!21 = !DILocation(line: 537, column: 21, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !24) +!24 = !DILocation(line: 51, column: 71, scope: !9) +!25 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !24) +!26 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !24) +!27 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !24) +!28 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !24) +!29 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 54, column: 35, scope: !9) +!34 = !DILocation(line: 60, column: 21, scope: !9) +!35 = !DILocation(line: 34, column: 37, scope: !9) +!36 = !DILocation(line: 34, column: 30, scope: !9) +!37 = !DILocation(line: 34, column: 45, scope: !9) +!38 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !20) +!39 = !DILocation(line: 39, column: 18, scope: !9) +!40 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !20) +!41 = !DILocation(line: 0, scope: !9) +!42 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !20) +!43 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !20) +!44 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !20) +!45 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !20) +!46 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !20) +!47 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !20) +!48 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !20) +!49 = distinct !DILexicalBlockFile(scope: !9, file: !50, discriminator: 0) +!50 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!51 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !20) +!52 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !20) +!53 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !20) +!54 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !20) +!55 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !20) +!56 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !57) +!57 = !DILocation(line: 55, column: 26, scope: !9) +!58 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !57) +!59 = !DILocation(line: 47, column: 20, scope: !9) +!60 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !24) +!61 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !24) +!62 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !24) +!63 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !24) +!64 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !24) +!65 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !24) +!66 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !24) +!67 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !24) +!68 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !24) +!69 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !24) +!70 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !24) +!71 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !24) +!72 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !24) +!73 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !24) +!74 = !DILocation(line: 58, column: 35, scope: !9) +!75 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !76) +!76 = !DILocation(line: 59, column: 26, scope: !9) +!77 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !76) +!78 = !DILocation(line: 61, column: 21, scope: !9) +!79 = !DILocation(line: 64, column: 19, scope: !9) +!80 = !DILocation(line: 66, column: 35, scope: !9) +!81 = !DILocation(line: 68, column: 20, scope: !9) +!82 = !DILocation(line: 69, column: 20, scope: !9) +!83 = !DILocation(line: 70, column: 35, scope: !9) +!84 = !DILocation(line: 71, column: 38, scope: !9) +!85 = !DILocation(line: 71, column: 63, scope: !9) +!86 = !DILocation(line: 75, column: 19, scope: !9) +!87 = !DILocation(line: 76, column: 35, scope: !9) +!88 = !DILocation(line: 77, column: 20, scope: !9) +!89 = !DILocation(line: 78, column: 20, scope: !9) +!90 = !DILocation(line: 79, column: 35, scope: !9) +!91 = !DILocation(line: 80, column: 38, scope: !9) +!92 = !DILocation(line: 80, column: 63, scope: !9) +!93 = !DILocation(line: 81, column: 25, scope: !9) +!94 = !DILocation(line: 81, column: 37, scope: !9) +!95 = !DILocation(line: 82, column: 25, scope: !9) +!96 = !DILocation(line: 82, column: 37, scope: !9) +!97 = !DILocation(line: 83, column: 25, scope: !9) +!98 = !DILocation(line: 83, column: 47, scope: !9) +!99 = !DILocation(line: 84, column: 52, scope: !9) +!100 = !DILocation(line: 84, column: 49, scope: !9) +!101 = !DILocation(line: 84, column: 25, scope: !9) +!102 = !DILocation(line: 84, column: 85, scope: !9) +!103 = !DILocation(line: 85, column: 25, scope: !9) +!104 = !DILocation(line: 85, column: 47, scope: !9) +!105 = !DILocation(line: 86, column: 49, scope: !9) +!106 = !DILocation(line: 86, column: 25, scope: !9) +!107 = !DILocation(line: 86, column: 85, scope: !9) +!108 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..59132bc0e778ee0c3af599a936d58c2f6a587572 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,2103 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<324>; + .reg .b16 %rs<37>; + .reg .b32 %r<812>; + .reg .b64 %rd<82>; + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 + +// %bb.0: + ld.param.b64 %rd17, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:24:28 + mov.u32 %r26, %ctaid.x; + .loc 1 24 33 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:24:33 + shl.b32 %r1, %r26, 5; + .loc 1 25 44 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r27, %r2, 124; + bfe.u32 %r28, %r2, 2, 5; + and.b32 %r3, %r2, 31; + .loc 1 25 23 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:23 + or.b32 %r4, %r28, %r1; + .loc 1 26 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:26:21 + setp.gt.s32 %p3, %r4, 31; + setp.lt.s32 %p2, %r4, 32; + .loc 1 27 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:27:38 + and.b32 %r5, %r2, 3; + .loc 1 34 40 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:40 + shl.b32 %r29, %r4, 4; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r30, %r2, 1; + shr.u32 %r31, %r2, 1; + bfe.u32 %r32, %r2, 1, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r33, %r30, 1; + xor.b32 %r34, %r32, 1; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ne.b32 %p4, %r30, 0; + and.b32 %r35, %r31, 1; + setp.ne.b32 %p5, %r35, 0; +$L__tmp2: + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + shl.b32 %r36, %r27, 1; + mov.b32 %r37, global_smem; + add.s32 %r38, %r37, %r36; + shl.b32 %r39, %r3, 3; + add.s32 %r40, %r37, %r39; + .loc 1 27 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:27:38 + shl.b32 %r6, %r5, 2; + or.b32 %r7, %r6, 1; + or.b32 %r8, %r6, 2; + or.b32 %r9, %r6, 3; + .loc 1 34 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:37 + or.b32 %r41, %r29, %r6; + .loc 1 34 30 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:30 + mad.wide.s32 %rd13, %r41, 8, %rd17; + cvt.s64.s32 %rd18, %r29; + cvt.u64.u32 %rd19, %r6; + or.b64 %rd20, %rd18, %rd19; + shl.b64 %rd21, %rd20, 3; + add.s64 %rd22, %rd17, %rd21; + add.s64 %rd16, %rd22, 16; + .loc 1 34 45 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p2 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd14, 0x0; + mov.u64 %rd15, 0x0; + @%p2 ld.global.v2.b64 { %rd14, %rd15 }, [ %rd16 + 0 ]; + // end inline asm +$L__tmp3: + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r42, %r7, %r6; + xor.b32 %r43, %r8, %r9; +$L__tmp4: + .loc 1 39 18 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:39:18 + add.s64 %rd23, %rd12, -1; + add.s64 %rd24, %rd14, -1; + add.s64 %rd25, %rd11, -1; + add.s64 %rd26, %rd15, -1; + setp.gt.u64 %p6, %rd26, 16382; + setp.gt.u64 %p7, %rd25, 16382; + setp.lt.u64 %p8, %rd26, 16383; + setp.lt.u64 %p9, %rd24, 16383; + setp.lt.u64 %p10, %rd23, 16383; + setp.lt.u64 %p11, %rd25, 16383; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r44, 1, 0, %p11; + selp.b32 %r45, 1, 0, %p10; + selp.b32 %r46, 1, 0, %p9; + selp.b32 %r47, 1, 0, %p8; +$L__tmp5: + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p12, %p10, %p7; + .loc 2 599 19 // triton_helpers.py:599:19 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p13, %p9, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r48, %r44, %r45; + xor.b32 %r49, %r46, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r50, %r48, 0, %p12; + selp.b32 %r51, %r49, 0, %p13; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r52, %r50, %r44; + xor.b32 %r53, %r50, %r45; + xor.b32 %r54, %r51, %r46; + xor.b32 %r55, %r51, %r47; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r56, %r42, 0, %p12; + selp.b32 %r57, %r43, 0, %p13; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r58, %r56, %r6; + xor.b32 %r59, %r56, %r7; + xor.b32 %r60, %r57, %r8; + xor.b32 %r61, %r57, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.u32 %p14, %r52, %r54; + setp.ge.u32 %p15, %r53, %r55; + setp.ne.b32 %p16, %r53, %r55; + setp.ne.b32 %p17, %r54, %r52; + setp.le.u32 %p18, %r59, %r61; + setp.le.u32 %p19, %r58, %r60; + or.pred %p20, %p19, %p17; + or.pred %p21, %p18, %p16; + and.pred %p22, %p15, %p21; + and.pred %p23, %p14, %p20; + xor.pred %p24, %p23, %p4; + xor.pred %p25, %p22, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r62, %r52, %r54; + xor.b32 %r63, %r55, %r53; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r64, 0, %r63, %p25; + selp.b32 %r65, 0, %r62, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r66, %r65, %r52; + xor.b32 %r67, %r64, %r53; + xor.b32 %r68, %r64, %r55; + xor.b32 %r69, %r65, %r54; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r70, %r60, %r58; + xor.b32 %r71, %r61, %r59; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r72, 0, %r70, %p24; + selp.b32 %r73, 0, %r71, %p25; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r74, %r72, %r58; + xor.b32 %r75, %r73, %r59; + xor.b32 %r76, %r72, %r60; + xor.b32 %r77, %r73, %r61; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.u32 %p26, %r66, %r67; + setp.ne.b32 %p27, %r66, %r67; + setp.le.u32 %p28, %r74, %r75; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p4; + setp.ge.u32 %p32, %r69, %r68; + setp.ne.b32 %p33, %r69, %r68; + setp.le.u32 %p34, %r76, %r77; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r78, %r66, %r67; + xor.b32 %r79, %r69, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r80, 0, %r78, %p31; + selp.b32 %r81, 0, %r79, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r82, %r80, %r66; + xor.b32 %r83, %r80, %r67; + xor.b32 %r84, %r81, %r69; + xor.b32 %r85, %r81, %r68; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r86, %r74, %r75; + xor.b32 %r87, %r76, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r88, 0, %r86, %p31; + selp.b32 %r89, 0, %r87, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r90, %r88, %r74; + xor.b32 %r91, %r88, %r75; + xor.b32 %r92, %r89, %r76; + xor.b32 %r93, %r89, %r77; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r94, %r82, %r33; + mul.lo.s32 %r95, %r83, %r33; + mul.lo.s32 %r96, %r84, %r33; + mul.lo.s32 %r97, %r85, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r94, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r99, %r94, %r98; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r101, %r95, %r100; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r102, %r96, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r103, %r96, %r102; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r105, %r97, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r106, %r82, %r30; + mul.lo.s32 %r107, %r83, %r30; + mul.lo.s32 %r108, %r84, %r30; + mul.lo.s32 %r109, %r85, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r106, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r111, %r106, %r110; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r112, %r107, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r113, %r107, %r112; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r114, %r108, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r115, %r108, %r114; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r116, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r117, %r109, %r116; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r118, %r90, %r33; + mul.lo.s32 %r119, %r91, %r33; + mul.lo.s32 %r120, %r92, %r33; + mul.lo.s32 %r121, %r93, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r118, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r123, %r118, %r122; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r125, %r119, %r124; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r126, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r127, %r120, %r126; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r129, %r121, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r130, %r90, %r30; + mul.lo.s32 %r131, %r91, %r30; + mul.lo.s32 %r132, %r92, %r30; + mul.lo.s32 %r133, %r93, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r135, %r130, %r134; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r131, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r137, %r131, %r136; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r138, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r139, %r132, %r138; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r141, %r133, %r140; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p38, %r99, %r111; + setp.ne.b32 %p39, %r99, %r111; + setp.le.s32 %p40, %r123, %r135; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p5; + setp.ge.s32 %p44, %r101, %r113; + setp.ne.b32 %p45, %r101, %r113; + setp.le.s32 %p46, %r125, %r137; + or.pred %p47, %p45, %p46; + and.pred %p48, %p44, %p47; + xor.pred %p49, %p48, %p5; + setp.ge.s32 %p50, %r103, %r115; + setp.ne.b32 %p51, %r103, %r115; + setp.le.s32 %p52, %r127, %r139; + or.pred %p53, %p51, %p52; + and.pred %p54, %p50, %p53; + xor.pred %p55, %p54, %p5; + setp.ge.s32 %p56, %r105, %r117; + setp.ne.b32 %p57, %r105, %r117; + setp.le.s32 %p58, %r129, %r141; + or.pred %p59, %p57, %p58; + and.pred %p60, %p56, %p59; + xor.pred %p61, %p60, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r142, %r99, %r111; + xor.b32 %r143, %r101, %r113; + xor.b32 %r144, %r103, %r115; + xor.b32 %r145, %r105, %r117; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r146, 0, %r142, %p43; + selp.b32 %r147, 0, %r143, %p49; + selp.b32 %r148, 0, %r144, %p55; + selp.b32 %r149, 0, %r145, %p61; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r150, %r146, %r82; + xor.b32 %r151, %r147, %r83; + xor.b32 %r152, %r148, %r84; + xor.b32 %r153, %r149, %r85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r154, %r123, %r135; + xor.b32 %r155, %r125, %r137; + xor.b32 %r156, %r127, %r139; + xor.b32 %r157, %r129, %r141; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r158, 0, %r154, %p43; + selp.b32 %r159, 0, %r155, %p49; + selp.b32 %r160, 0, %r156, %p55; + selp.b32 %r161, 0, %r157, %p61; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r162, %r158, %r90; + xor.b32 %r163, %r159, %r91; + xor.b32 %r164, %r160, %r92; + xor.b32 %r165, %r161, %r93; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p62, %r150, %r152; + setp.ne.b32 %p63, %r150, %r152; + setp.le.s32 %p64, %r162, %r164; + or.pred %p65, %p63, %p64; + and.pred %p66, %p62, %p65; + xor.pred %p67, %p66, %p5; + setp.ge.s32 %p68, %r151, %r153; + setp.ne.b32 %p69, %r151, %r153; + setp.le.s32 %p70, %r163, %r165; + or.pred %p71, %p69, %p70; + and.pred %p72, %p68, %p71; + xor.pred %p73, %p72, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r166, %r150, %r152; + xor.b32 %r167, %r151, %r153; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r168, 0, %r166, %p67; + selp.b32 %r169, 0, %r167, %p73; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r170, %r168, %r150; + xor.b32 %r171, %r169, %r151; + xor.b32 %r172, %r168, %r152; + xor.b32 %r173, %r169, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r174, %r162, %r164; + xor.b32 %r175, %r163, %r165; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r176, 0, %r174, %p67; + selp.b32 %r177, 0, %r175, %p73; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r178, %r176, %r162; + xor.b32 %r179, %r177, %r163; + xor.b32 %r180, %r176, %r164; + xor.b32 %r181, %r177, %r165; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p74, %r170, %r171; + setp.ne.b32 %p75, %r170, %r171; + setp.le.s32 %p76, %r178, %r179; + or.pred %p77, %p75, %p76; + and.pred %p78, %p74, %p77; + xor.pred %p79, %p78, %p5; + setp.ge.s32 %p80, %r172, %r173; + setp.ne.b32 %p81, %r172, %r173; + setp.le.s32 %p82, %r180, %r181; + or.pred %p83, %p81, %p82; + and.pred %p84, %p80, %p83; + xor.pred %p85, %p84, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r182, %r170, %r171; + xor.b32 %r183, %r172, %r173; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r184, 0, %r182, %p79; + selp.b32 %r185, 0, %r183, %p85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r186, %r178, %r179; + xor.b32 %r187, %r180, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r188, 0, %r186, %p79; + selp.b32 %r189, 0, %r187, %p85; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r190, %r188, %r178; + xor.b32 %r191, %r188, %r179; + xor.b32 %r192, %r189, %r180; + xor.b32 %r193, %r189, %r181; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r194, %r190, %r34; + mul.lo.s32 %r195, %r191, %r34; + mul.lo.s32 %r196, %r192, %r34; + mul.lo.s32 %r197, %r193, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r198, %r190, %r32; + mul.lo.s32 %r199, %r191, %r32; + mul.lo.s32 %r200, %r192, %r32; + mul.lo.s32 %r201, %r193, %r32; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r202, %r184, %r171; + xor.b32 %r203, %r184, %r170; + xor.b32 %r204, %r185, %r173; + xor.b32 %r205, %r185, %r172; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r206, %r203, %r34; + mul.lo.s32 %r207, %r202, %r34; + mul.lo.s32 %r208, %r205, %r34; + mul.lo.s32 %r209, %r204, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r210, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r211, %r206, %r210; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r212, %r207, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r213, %r207, %r212; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r208, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r215, %r208, %r214; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r216, %r209, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r217, %r209, %r216; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r218, %r203, %r32; + mul.lo.s32 %r219, %r202, %r32; + mul.lo.s32 %r220, %r205, %r32; + mul.lo.s32 %r221, %r204, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r222, %r218, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r223, %r218, %r222; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r224, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r225, %r219, %r224; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r226, %r220, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r227, %r220, %r226; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r228, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r229, %r221, %r228; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r230, %r194, 2, 31, -1; + shfl.sync.bfly.b32 %r231, %r195, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r232, %r195, %r231; + add.s32 %r233, %r194, %r230; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r234, %r196, 2, 31, -1; + shfl.sync.bfly.b32 %r235, %r197, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r236, %r197, %r235; + add.s32 %r237, %r196, %r234; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r239, %r199, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r240, %r199, %r239; + add.s32 %r241, %r198, %r238; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r242, %r200, 2, 31, -1; + shfl.sync.bfly.b32 %r243, %r201, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r244, %r201, %r243; + add.s32 %r245, %r200, %r242; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p86, %r213, %r225; + setp.lt.s32 %p87, %r211, %r223; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p88, %r211, %r223; + setp.eq.b32 %p89, %r213, %r225; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p90, %r233, %r241; + setp.gt.s32 %p91, %r232, %r240; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p92, %p89, %p91; + and.pred %p93, %p88, %p90; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p94, %p87, %p93; + or.pred %p95, %p86, %p92; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p96, %r217, %r229; + setp.lt.s32 %p97, %r215, %r227; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p98, %r215, %r227; + setp.eq.b32 %p99, %r217, %r229; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p100, %r237, %r245; + setp.gt.s32 %p101, %r236, %r244; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p102, %p99, %p101; + and.pred %p103, %p98, %p100; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p104, %p97, %p103; + or.pred %p105, %p96, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r246, %r211, %r223; + xor.b32 %r247, %r213, %r225; + xor.b32 %r248, %r215, %r227; + xor.b32 %r249, %r217, %r229; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r250, %r247, 0, %p95; + selp.b32 %r251, %r246, 0, %p94; + selp.b32 %r252, %r249, 0, %p105; + selp.b32 %r253, %r248, 0, %p104; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r254, %r251, %r203; + xor.b32 %r255, %r250, %r202; + xor.b32 %r256, %r253, %r205; + xor.b32 %r257, %r252, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r258, %r233, %r241; + xor.b32 %r259, %r232, %r240; + xor.b32 %r260, %r237, %r245; + xor.b32 %r261, %r236, %r244; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r262, %r259, 0, %p95; + selp.b32 %r263, %r258, 0, %p94; + selp.b32 %r264, %r261, 0, %p105; + selp.b32 %r265, %r260, 0, %p104; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r266, %r263, %r190; + xor.b32 %r267, %r262, %r191; + xor.b32 %r268, %r265, %r192; + xor.b32 %r269, %r264, %r193; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r270, %r255, %r33; + mul.lo.s32 %r271, %r254, %r33; + mul.lo.s32 %r272, %r257, %r33; + mul.lo.s32 %r273, %r256, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r274, %r271, 1, 31, -1; + shfl.sync.bfly.b32 %r275, %r270, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r276, %r271, %r274; + add.s32 %r277, %r270, %r275; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r278, %r273, 1, 31, -1; + shfl.sync.bfly.b32 %r279, %r272, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r280, %r273, %r278; + add.s32 %r281, %r272, %r279; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r282, %r255, %r30; + mul.lo.s32 %r283, %r254, %r30; + mul.lo.s32 %r284, %r257, %r30; + mul.lo.s32 %r285, %r256, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r286, %r283, 1, 31, -1; + shfl.sync.bfly.b32 %r287, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r288, %r283, %r286; + add.s32 %r289, %r282, %r287; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r290, %r285, 1, 31, -1; + shfl.sync.bfly.b32 %r291, %r284, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r292, %r285, %r290; + add.s32 %r293, %r284, %r291; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r294, %r267, %r33; + mul.lo.s32 %r295, %r266, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r296, %r295, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r297, %r269, %r33; + mul.lo.s32 %r298, %r268, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r299, %r294, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r300, %r294, %r299; + add.s32 %r301, %r295, %r296; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r302, %r269, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r303, %r298, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r304, %r268, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r305, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r306, %r297, %r305; + add.s32 %r307, %r298, %r303; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r308, %r267, %r30; + mul.lo.s32 %r309, %r266, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1; + shfl.sync.bfly.b32 %r311, %r308, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r312, %r308, %r311; + add.s32 %r313, %r309, %r310; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r314, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r315, %r304, %r314; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r316, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r317, %r316, %r302; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p106, %r277, %r289; + setp.lt.s32 %p107, %r276, %r288; + setp.lt.s32 %p108, %r281, %r293; + setp.lt.s32 %p109, %r280, %r292; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p110, %r276, %r288; + setp.eq.b32 %p111, %r277, %r289; + setp.eq.b32 %p112, %r280, %r292; + setp.eq.b32 %p113, %r281, %r293; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p114, %r301, %r313; + setp.gt.s32 %p115, %r300, %r312; + setp.gt.s32 %p116, %r307, %r315; + setp.gt.s32 %p117, %r306, %r317; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p118, %p111, %p115; + and.pred %p119, %p110, %p114; + and.pred %p120, %p113, %p117; + and.pred %p121, %p112, %p116; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p122, %p107, %p119; + or.pred %p123, %p106, %p118; + or.pred %p124, %p109, %p121; + or.pred %p125, %p108, %p120; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r318, %r276, %r288; + xor.b32 %r319, %r277, %r289; + xor.b32 %r320, %r280, %r292; + xor.b32 %r321, %r281, %r293; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r322, %r319, 0, %p123; + selp.b32 %r323, %r318, 0, %p122; + selp.b32 %r324, %r321, 0, %p125; + selp.b32 %r325, %r320, 0, %p124; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r326, %r323, %r254; + xor.b32 %r327, %r322, %r255; + xor.b32 %r328, %r325, %r256; + xor.b32 %r329, %r324, %r257; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r330, %r300, %r312; + xor.b32 %r331, %r301, %r313; + xor.b32 %r332, %r317, %r306; + xor.b32 %r333, %r307, %r315; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r334, %r331, 0, %p122; + selp.b32 %r335, %r330, 0, %p123; + selp.b32 %r336, %r333, 0, %p124; + selp.b32 %r337, %r332, 0, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r338, %r335, %r267; + xor.b32 %r339, %r334, %r266; + xor.b32 %r340, %r337, %r269; + xor.b32 %r341, %r336, %r268; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p126, %r327, %r329; + setp.lt.s32 %p127, %r326, %r328; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p128, %r326, %r328; + setp.eq.b32 %p129, %r327, %r329; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p130, %r339, %r341; + setp.gt.s32 %p131, %r338, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p132, %p129, %p131; + and.pred %p133, %p128, %p130; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p134, %p127, %p133; + or.pred %p135, %p126, %p132; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r342, %r326, %r328; + xor.b32 %r343, %r327, %r329; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r344, %r343, 0, %p135; + selp.b32 %r345, %r342, 0, %p134; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r346, %r345, %r328; + xor.b32 %r347, %r344, %r327; + xor.b32 %r348, %r345, %r326; + xor.b32 %r349, %r344, %r329; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r350, %r339, %r341; + xor.b32 %r351, %r338, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r352, %r351, 0, %p135; + selp.b32 %r353, %r350, 0, %p134; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r354, %r353, %r339; + xor.b32 %r355, %r352, %r338; + xor.b32 %r356, %r353, %r341; + xor.b32 %r357, %r352, %r340; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p136, %r348, %r347; + setp.lt.s32 %p137, %r346, %r349; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p138, %r348, %r347; + setp.eq.b32 %p139, %r349, %r346; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p140, %r354, %r355; + setp.gt.s32 %p141, %r356, %r357; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r358, %r355, %r354; + xor.b32 %r359, %r357, %r356; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r360, %r359, 0, %p141; + selp.b32 %r361, %r360, 0, %p139; + selp.b32 %r362, %r359, %r361, %p137; + selp.b32 %r363, %r358, 0, %p140; + selp.b32 %r364, %r363, 0, %p138; + selp.b32 %r365, %r358, %r364, %p136; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r12, %r365, %r355; + xor.b32 %r13, %r365, %r354; + xor.b32 %r11, %r362, %r356; + xor.b32 %r10, %r362, %r357; +$L__tmp6: + .loc 1 54 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:54:35 + and.pred %p142, %p2, %p9; + selp.b16 %rs1, 1, 0, %p142; + shl.b16 %rs2, %rs1, 2; + and.pred %p143, %p2, %p8; + selp.b16 %rs3, -1, 0, %p143; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + and.pred %p144, %p2, %p11; + selp.b16 %rs6, 1, 0, %p144; + and.pred %p145, %p2, %p10; + selp.b16 %rs7, -1, 0, %p145; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + and.b16 %rs12, %rs11, 15; + cvt.u32.u16 %r366, %rs12; + popc.b32 %r367, %r366; + cvt.u64.u32 %rd27, %r367; +$L__tmp8: + .loc 1 47 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:47:20 + setp.ne.b64 %p146, %rd15, 16384; + setp.ne.b64 %p147, %rd11, 16384; + setp.eq.b64 %p148, %rd15, 16384; + setp.eq.b64 %p149, %rd14, 16384; + setp.eq.b64 %p150, %rd12, 16384; + setp.eq.b64 %p151, %rd11, 16384; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r368, 1, 0, %p151; + selp.b32 %r369, 1, 0, %p150; + selp.b32 %r370, 1, 0, %p149; + selp.b32 %r371, 1, 0, %p148; +$L__tmp9: + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p152, %p150, %p147; + .loc 2 599 19 // triton_helpers.py:599:19 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p153, %p149, %p146; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r372, %r368, %r369; + xor.b32 %r373, %r370, %r371; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r374, %r372, 0, %p152; + selp.b32 %r375, %r373, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r376, %r374, %r368; + xor.b32 %r377, %r374, %r369; + xor.b32 %r378, %r375, %r370; + xor.b32 %r379, %r375, %r371; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r380, %r42, 0, %p152; + selp.b32 %r381, %r43, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r382, %r380, %r6; + xor.b32 %r383, %r380, %r7; + xor.b32 %r384, %r381, %r8; + xor.b32 %r385, %r381, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.u32 %p154, %r376, %r378; + setp.ge.u32 %p155, %r377, %r379; + setp.ne.b32 %p156, %r377, %r379; + setp.ne.b32 %p157, %r378, %r376; + setp.le.u32 %p158, %r383, %r385; + setp.le.u32 %p159, %r382, %r384; + or.pred %p160, %p159, %p157; + or.pred %p161, %p158, %p156; + and.pred %p162, %p155, %p161; + and.pred %p163, %p154, %p160; + xor.pred %p164, %p163, %p4; + xor.pred %p165, %p162, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r386, %r376, %r378; + xor.b32 %r387, %r379, %r377; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r388, 0, %r387, %p165; + selp.b32 %r389, 0, %r386, %p164; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r390, %r389, %r376; + xor.b32 %r391, %r388, %r377; + xor.b32 %r392, %r388, %r379; + xor.b32 %r393, %r389, %r378; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r394, %r384, %r382; + xor.b32 %r395, %r385, %r383; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r396, 0, %r394, %p164; + selp.b32 %r397, 0, %r395, %p165; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r398, %r396, %r382; + xor.b32 %r399, %r397, %r383; + xor.b32 %r400, %r396, %r384; + xor.b32 %r401, %r397, %r385; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.u32 %p166, %r390, %r391; + setp.ne.b32 %p167, %r390, %r391; + setp.le.u32 %p168, %r398, %r399; + or.pred %p169, %p167, %p168; + and.pred %p170, %p166, %p169; + xor.pred %p171, %p170, %p4; + setp.ge.u32 %p172, %r393, %r392; + setp.ne.b32 %p173, %r393, %r392; + setp.le.u32 %p174, %r400, %r401; + or.pred %p175, %p173, %p174; + and.pred %p176, %p172, %p175; + xor.pred %p177, %p176, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r402, %r390, %r391; + xor.b32 %r403, %r393, %r392; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r404, 0, %r402, %p171; + selp.b32 %r405, 0, %r403, %p177; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r406, %r404, %r390; + xor.b32 %r407, %r404, %r391; + xor.b32 %r408, %r405, %r393; + xor.b32 %r409, %r405, %r392; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r410, %r398, %r399; + xor.b32 %r411, %r400, %r401; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r412, 0, %r410, %p171; + selp.b32 %r413, 0, %r411, %p177; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r414, %r406, %r33; + mul.lo.s32 %r415, %r407, %r33; + mul.lo.s32 %r416, %r408, %r33; + mul.lo.s32 %r417, %r409, %r33; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r418, %r406, %r30; + mul.lo.s32 %r419, %r407, %r30; + mul.lo.s32 %r420, %r408, %r30; + mul.lo.s32 %r421, %r409, %r30; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r422, %r413, %r401; + xor.b32 %r423, %r413, %r400; + xor.b32 %r424, %r412, %r399; + xor.b32 %r425, %r412, %r398; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r426, %r425, %r33; + mul.lo.s32 %r427, %r424, %r33; + mul.lo.s32 %r428, %r423, %r33; + mul.lo.s32 %r429, %r422, %r33; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r430, %r425, %r30; + mul.lo.s32 %r431, %r424, %r30; + mul.lo.s32 %r432, %r423, %r30; + mul.lo.s32 %r433, %r422, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r434, %r414, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r435, %r434, %r414; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r436, %r415, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r437, %r436, %r415; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r438, %r416, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r439, %r438, %r416; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r440, %r417, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r441, %r440, %r417; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r442, %r418, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r443, %r442, %r418; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r444, %r419, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r445, %r444, %r419; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r446, %r420, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r447, %r446, %r420; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r448, %r421, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r449, %r448, %r421; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p178, %r437, %r445; + setp.ne.b32 %p179, %r437, %r445; + setp.ge.s32 %p180, %r435, %r443; + setp.ge.s32 %p181, %r439, %r447; + setp.ne.b32 %p182, %r439, %r447; + setp.ne.b32 %p183, %r435, %r443; + setp.ge.s32 %p184, %r441, %r449; + setp.ne.b32 %p185, %r441, %r449; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r450, %r443, %r435; + xor.b32 %r451, %r445, %r437; + xor.b32 %r452, %r447, %r439; + xor.b32 %r453, %r449, %r441; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r454, %r426, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r455, %r454, %r426; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r456, %r427, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r457, %r456, %r427; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r458, %r428, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r459, %r458, %r428; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r460, %r429, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r461, %r460, %r429; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r462, %r430, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r463, %r462, %r430; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r464, %r431, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r465, %r464, %r431; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r466, %r432, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r467, %r466, %r432; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r468, %r433, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r469, %r468, %r433; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.le.s32 %p186, %r459, %r467; + setp.le.s32 %p187, %r455, %r463; + or.pred %p188, %p183, %p187; + or.pred %p189, %p182, %p186; + and.pred %p190, %p181, %p189; + and.pred %p191, %p180, %p188; + xor.pred %p192, %p191, %p5; + xor.pred %p193, %p190, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r470, 0, %r452, %p193; + selp.b32 %r471, 0, %r450, %p192; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r472, %r471, %r406; + xor.b32 %r473, %r470, %r408; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.le.s32 %p194, %r461, %r469; + setp.le.s32 %p195, %r457, %r465; + or.pred %p196, %p179, %p195; + or.pred %p197, %p185, %p194; + and.pred %p198, %p184, %p197; + and.pred %p199, %p178, %p196; + xor.pred %p200, %p199, %p5; + xor.pred %p201, %p198, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r474, 0, %r453, %p201; + selp.b32 %r475, 0, %r451, %p200; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r476, %r475, %r407; + xor.b32 %r477, %r474, %r409; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r478, %r463, %r455; + xor.b32 %r479, %r465, %r457; + xor.b32 %r480, %r467, %r459; + xor.b32 %r481, %r469, %r461; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r482, 0, %r481, %p201; + selp.b32 %r483, 0, %r479, %p200; + selp.b32 %r484, 0, %r480, %p193; + selp.b32 %r485, 0, %r478, %p192; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r486, %r485, %r425; + xor.b32 %r487, %r484, %r423; + xor.b32 %r488, %r483, %r424; + xor.b32 %r489, %r482, %r422; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p202, %r472, %r473; + setp.ne.b32 %p203, %r472, %r473; + setp.le.s32 %p204, %r488, %r489; + setp.le.s32 %p205, %r486, %r487; + or.pred %p206, %p203, %p205; + and.pred %p207, %p202, %p206; + setp.ge.s32 %p208, %r476, %r477; + setp.ne.b32 %p209, %r476, %r477; + or.pred %p210, %p209, %p204; + and.pred %p211, %p208, %p210; + xor.pred %p212, %p211, %p5; + xor.pred %p213, %p207, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r490, %r473, %r472; + xor.b32 %r491, %r477, %r476; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r492, 0, %r490, %p213; + selp.b32 %r493, 0, %r491, %p212; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r494, %r492, %r472; + xor.b32 %r495, %r493, %r476; + xor.b32 %r496, %r492, %r473; + xor.b32 %r497, %r493, %r477; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r498, %r487, %r486; + xor.b32 %r499, %r489, %r488; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r500, 0, %r499, %p212; + selp.b32 %r501, 0, %r498, %p213; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r502, %r501, %r487; + xor.b32 %r503, %r500, %r488; + xor.b32 %r504, %r500, %r489; + xor.b32 %r505, %r501, %r486; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p214, %r494, %r495; + setp.ne.b32 %p215, %r494, %r495; + setp.le.s32 %p216, %r505, %r503; + or.pred %p217, %p215, %p216; + and.pred %p218, %p214, %p217; + xor.pred %p219, %p218, %p5; + setp.ge.s32 %p220, %r496, %r497; + setp.ne.b32 %p221, %r496, %r497; + setp.le.s32 %p222, %r502, %r504; + or.pred %p223, %p221, %p222; + and.pred %p224, %p220, %p223; + xor.pred %p225, %p224, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r506, %r495, %r494; + xor.b32 %r507, %r497, %r496; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r508, 0, %r506, %p219; + selp.b32 %r509, 0, %r507, %p225; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r510, %r509, %r496; + xor.b32 %r511, %r508, %r494; + xor.b32 %r512, %r509, %r497; + xor.b32 %r513, %r508, %r495; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r514, %r503, %r505; + xor.b32 %r515, %r504, %r502; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r516, 0, %r515, %p225; + selp.b32 %r517, 0, %r514, %p219; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r522, %r517, %r503; + xor.b32 %r523, %r516, %r502; + xor.b32 %r524, %r517, %r505; + xor.b32 %r525, %r516, %r504; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r526, %r511, %r34; + mul.lo.s32 %r527, %r513, %r34; + mul.lo.s32 %r528, %r510, %r34; + mul.lo.s32 %r529, %r512, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r530, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r531, %r527, 2, 31, -1; + shfl.sync.bfly.b32 %r532, %r528, 2, 31, -1; + shfl.sync.bfly.b32 %r533, %r529, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r534, %r527, %r531; + add.s32 %r535, %r529, %r533; + add.s32 %r536, %r526, %r530; + add.s32 %r537, %r528, %r532; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r538, %r511, %r32; + mul.lo.s32 %r539, %r513, %r32; + mul.lo.s32 %r540, %r510, %r32; + mul.lo.s32 %r541, %r512, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r542, %r538, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r539, 2, 31, -1; + shfl.sync.bfly.b32 %r544, %r540, 2, 31, -1; + shfl.sync.bfly.b32 %r545, %r541, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r546, %r539, %r543; + add.s32 %r547, %r541, %r545; + add.s32 %r548, %r538, %r542; + add.s32 %r549, %r540, %r544; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r550, %r525, %r34; + mul.lo.s32 %r551, %r523, %r34; + mul.lo.s32 %r552, %r522, %r34; + mul.lo.s32 %r553, %r524, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r554, %r553, 2, 31, -1; + shfl.sync.bfly.b32 %r555, %r552, 2, 31, -1; + shfl.sync.bfly.b32 %r556, %r551, 2, 31, -1; + shfl.sync.bfly.b32 %r557, %r550, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r558, %r551, %r556; + add.s32 %r559, %r553, %r554; + add.s32 %r560, %r550, %r557; + add.s32 %r561, %r552, %r555; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r562, %r525, %r32; + mul.lo.s32 %r563, %r523, %r32; + mul.lo.s32 %r564, %r522, %r32; + mul.lo.s32 %r565, %r524, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r566, %r565, 2, 31, -1; + shfl.sync.bfly.b32 %r567, %r564, 2, 31, -1; + shfl.sync.bfly.b32 %r568, %r563, 2, 31, -1; + shfl.sync.bfly.b32 %r569, %r562, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r570, %r563, %r568; + add.s32 %r571, %r565, %r566; + add.s32 %r572, %r562, %r569; + add.s32 %r573, %r564, %r567; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p226, %r537, %r549; + setp.lt.s32 %p227, %r536, %r548; + setp.lt.s32 %p228, %r535, %r547; + setp.lt.s32 %p229, %r534, %r546; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p230, %r534, %r546; + setp.eq.b32 %p231, %r535, %r547; + setp.eq.b32 %p232, %r536, %r548; + setp.eq.b32 %p233, %r537, %r549; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p234, %r561, %r573; + setp.gt.s32 %p235, %r560, %r572; + setp.gt.s32 %p236, %r559, %r571; + setp.gt.s32 %p237, %r558, %r570; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p238, %p233, %p237; + and.pred %p239, %p232, %p236; + and.pred %p240, %p231, %p235; + and.pred %p241, %p230, %p234; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p242, %p229, %p241; + or.pred %p243, %p228, %p240; + or.pred %p244, %p227, %p239; + or.pred %p245, %p226, %p238; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r574, %r534, %r546; + xor.b32 %r575, %r535, %r547; + xor.b32 %r576, %r536, %r548; + xor.b32 %r577, %r537, %r549; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r578, %r577, 0, %p245; + selp.b32 %r579, %r576, 0, %p244; + selp.b32 %r580, %r575, 0, %p243; + selp.b32 %r581, %r574, 0, %p242; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r582, %r581, %r513; + xor.b32 %r583, %r579, %r511; + xor.b32 %r584, %r578, %r510; + xor.b32 %r585, %r580, %r512; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r586, %r561, %r573; + xor.b32 %r587, %r558, %r570; + xor.b32 %r588, %r560, %r572; + xor.b32 %r589, %r559, %r571; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r590, %r589, 0, %p244; + selp.b32 %r591, %r588, 0, %p243; + selp.b32 %r592, %r587, 0, %p245; + selp.b32 %r593, %r586, 0, %p242; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r598, %r593, %r522; + xor.b32 %r599, %r590, %r524; + xor.b32 %r600, %r591, %r525; + xor.b32 %r601, %r592, %r523; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r602, %r585, %r33; + mul.lo.s32 %r603, %r584, %r33; + mul.lo.s32 %r604, %r582, %r33; + mul.lo.s32 %r605, %r583, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r606, %r605, 1, 31, -1; + shfl.sync.bfly.b32 %r607, %r604, 1, 31, -1; + shfl.sync.bfly.b32 %r608, %r603, 1, 31, -1; + shfl.sync.bfly.b32 %r609, %r602, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r614, %r585, %r30; + mul.lo.s32 %r615, %r584, %r30; + mul.lo.s32 %r616, %r582, %r30; + mul.lo.s32 %r617, %r583, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r618, %r617, 1, 31, -1; + shfl.sync.bfly.b32 %r619, %r616, 1, 31, -1; + shfl.sync.bfly.b32 %r620, %r615, 1, 31, -1; + shfl.sync.bfly.b32 %r621, %r614, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r626, %r599, %r33; + mul.lo.s32 %r627, %r598, %r33; + mul.lo.s32 %r628, %r601, %r33; + mul.lo.s32 %r629, %r600, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r630, %r626, 1, 31, -1; + shfl.sync.bfly.b32 %r631, %r627, 1, 31, -1; + shfl.sync.bfly.b32 %r632, %r628, 1, 31, -1; + shfl.sync.bfly.b32 %r633, %r629, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r638, %r599, %r30; + mul.lo.s32 %r639, %r598, %r30; + mul.lo.s32 %r640, %r601, %r30; + mul.lo.s32 %r641, %r600, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r642, %r638, 1, 31, -1; + shfl.sync.bfly.b32 %r643, %r639, 1, 31, -1; + shfl.sync.bfly.b32 %r644, %r640, 1, 31, -1; + shfl.sync.bfly.b32 %r645, %r641, 1, 31, -1; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + shfl.sync.bfly.b32 %r720, %r367, 2, 31, -1; + mov.b32 %r721, 0; + shfl.sync.bfly.b32 %r722, %r721, 2, 31, -1; + cvt.u64.u32 %rd28, %r720; + cvt.u64.u32 %rd29, %r722; + shl.b64 %rd30, %rd29, 32; + or.b64 %rd31, %rd28, %rd30; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd32, %rd27, %rd31; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r723}, %rd32; + cvt.u32.u64 %r724, %rd32; + shfl.sync.bfly.b32 %r725, %r724, 1, 31, -1; + shfl.sync.bfly.b32 %r726, %r723, 1, 31, -1; + cvt.u64.u32 %rd33, %r725; + cvt.u64.u32 %rd34, %r726; + shl.b64 %rd35, %rd34, 32; + or.b64 %rd36, %rd33, %rd35; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd37, %rd32, %rd36; +$L__tmp11: + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + st.shared.b64 [%r38], %rd37; + bar.sync 0; + ld.shared.b64 %rd2, [%r40]; + .loc 1 58 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:58:35 + and.pred %p282, %p2, %p149; + selp.b16 %rs13, 1, 0, %p282; + shl.b16 %rs14, %rs13, 2; + and.pred %p283, %p2, %p148; + selp.b16 %rs15, -1, 0, %p283; + shl.b16 %rs16, %rs15, 3; + or.b16 %rs17, %rs16, %rs14; + and.pred %p284, %p2, %p151; + selp.b16 %rs18, 1, 0, %p284; + and.pred %p285, %p2, %p150; + selp.b16 %rs19, -1, 0, %p285; + shl.b16 %rs20, %rs19, 1; + or.b16 %rs21, %rs18, %rs20; + and.b16 %rs22, %rs21, 3; + or.b16 %rs23, %rs22, %rs17; +$L__tmp12: + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + and.b16 %rs24, %rs23, 15; + cvt.u32.u16 %r727, %rs24; + popc.b32 %r728, %r727; + cvt.u64.u32 %rd38, %r728; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + shfl.sync.bfly.b32 %r729, %r728, 2, 31, -1; + shfl.sync.bfly.b32 %r730, %r721, 2, 31, -1; + cvt.u64.u32 %rd39, %r729; + cvt.u64.u32 %rd40, %r730; + shl.b64 %rd41, %rd40, 32; + or.b64 %rd42, %rd39, %rd41; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd43, %rd38, %rd42; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r731}, %rd43; + cvt.u32.u64 %r732, %rd43; + shfl.sync.bfly.b32 %r733, %r732, 1, 31, -1; + shfl.sync.bfly.b32 %r734, %r731, 1, 31, -1; + cvt.u64.u32 %rd44, %r733; + cvt.u64.u32 %rd45, %r734; + shl.b64 %rd46, %rd45, 32; + or.b64 %rd47, %rd44, %rd46; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd3, %rd43, %rd47; +$L__tmp13: + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + bar.sync 0; + st.shared.b64 [%r38], %rd3; + bar.sync 0; + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r735, %rd37; + .loc 1 64 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:64:19 + setp.lt.s32 %p286, %r7, %r735; + setp.lt.s32 %p287, %r6, %r735; + setp.lt.s32 %p288, %r8, %r735; + setp.lt.s32 %p289, %r9, %r735; + .loc 1 66 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:66:35 + selp.b32 %r736, %r10, 16, %p289; + selp.b32 %r737, %r11, 16, %p288; + selp.b32 %r738, %r13, 16, %p287; + selp.b32 %r739, %r12, 16, %p286; + .loc 1 68 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:68:20 + add.s32 %r740, %r739, 17; + add.s32 %r741, %r738, 17; + add.s32 %r742, %r737, 17; + add.s32 %r743, %r736, 17; + .loc 1 69 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:69:20 + setp.lt.s32 %p290, %r739, 0; + setp.lt.s32 %p291, %r738, 0; + setp.lt.s32 %p292, %r737, 0; + setp.lt.s32 %p293, %r736, 0; + .loc 1 70 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:70:35 + selp.b32 %r18, %r743, %r736, %p293; + selp.b32 %r19, %r742, %r737, %p292; + selp.b32 %r21, %r741, %r738, %p291; + selp.b32 %r20, %r740, %r739, %p290; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + max.u32 %r744, %r21, %r20; + max.u32 %r745, %r19, %r744; + max.u32 %r746, %r18, %r745; + setp.lt.u32 %p294, %r746, 17; + or.pred %p295, %p3, %p294; + @%p295 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + xor.b32 %r518, %r501, %r517; + xor.b32 %r519, %r500, %r516; + xor.b32 %r520, %r501, %r516; + xor.b32 %r521, %r500, %r517; + xor.b32 %r594, %r521, %r593; + xor.b32 %r595, %r520, %r592; + xor.b32 %r596, %r519, %r591; + xor.b32 %r597, %r518, %r590; + add.s32 %r610, %r604, %r607; + add.s32 %r611, %r602, %r609; + add.s32 %r612, %r605, %r606; + add.s32 %r613, %r603, %r608; + add.s32 %r622, %r616, %r619; + add.s32 %r623, %r614, %r621; + add.s32 %r624, %r617, %r618; + add.s32 %r625, %r615, %r620; + add.s32 %r634, %r628, %r632; + add.s32 %r635, %r627, %r631; + add.s32 %r636, %r626, %r630; + add.s32 %r637, %r629, %r633; + add.s32 %r646, %r644, %r640; + add.s32 %r647, %r643, %r639; + add.s32 %r648, %r642, %r638; + add.s32 %r649, %r645, %r641; + setp.lt.s32 %p246, %r613, %r625; + setp.lt.s32 %p247, %r612, %r624; + setp.lt.s32 %p248, %r611, %r623; + setp.lt.s32 %p249, %r610, %r622; + setp.eq.b32 %p250, %r610, %r622; + setp.eq.b32 %p251, %r611, %r623; + setp.eq.b32 %p252, %r612, %r624; + setp.eq.b32 %p253, %r613, %r625; + setp.gt.s32 %p254, %r635, %r647; + setp.gt.s32 %p255, %r637, %r649; + setp.gt.s32 %p256, %r636, %r648; + setp.gt.s32 %p257, %r634, %r646; + and.pred %p258, %p253, %p257; + and.pred %p259, %p252, %p256; + and.pred %p260, %p251, %p255; + and.pred %p261, %p250, %p254; + or.pred %p262, %p249, %p261; + or.pred %p263, %p248, %p260; + or.pred %p264, %p247, %p259; + or.pred %p265, %p246, %p258; + xor.b32 %r650, %r610, %r622; + xor.b32 %r651, %r611, %r623; + xor.b32 %r652, %r612, %r624; + xor.b32 %r653, %r613, %r625; + selp.b32 %r654, %r653, 0, %p265; + selp.b32 %r655, %r652, 0, %p264; + selp.b32 %r656, %r651, 0, %p263; + selp.b32 %r657, %r650, 0, %p262; + xor.b32 %r658, %r654, %r584; + xor.b32 %r659, %r655, %r583; + xor.b32 %r660, %r657, %r582; + xor.b32 %r661, %r656, %r585; + xor.b32 %r662, %r646, %r634; + xor.b32 %r663, %r647, %r635; + xor.b32 %r664, %r648, %r636; + xor.b32 %r665, %r649, %r637; + selp.b32 %r666, %r663, 0, %p262; + selp.b32 %r667, %r662, 0, %p265; + selp.b32 %r668, %r665, 0, %p263; + selp.b32 %r669, %r664, 0, %p264; + xor.b32 %r670, %r597, %r669; + xor.b32 %r671, %r596, %r668; + xor.b32 %r672, %r669, %r599; + xor.b32 %r673, %r667, %r601; + xor.b32 %r674, %r666, %r598; + xor.b32 %r675, %r668, %r600; + setp.lt.s32 %p266, %r659, %r658; + setp.lt.s32 %p267, %r660, %r661; + setp.eq.b32 %p268, %r659, %r658; + setp.eq.b32 %p269, %r660, %r661; + setp.gt.s32 %p270, %r672, %r673; + setp.gt.s32 %p271, %r674, %r675; + and.pred %p272, %p268, %p270; + and.pred %p273, %p269, %p271; + or.pred %p274, %p266, %p272; + or.pred %p275, %p267, %p273; + xor.b32 %r676, %r658, %r659; + xor.b32 %r677, %r661, %r660; + selp.b32 %r678, %r677, 0, %p275; + selp.b32 %r679, %r676, 0, %p274; + xor.b32 %r680, %r679, %r658; + xor.b32 %r681, %r678, %r660; + xor.b32 %r682, %r679, %r659; + xor.b32 %r683, %r678, %r661; + xor.b32 %r684, %r675, %r674; + xor.b32 %r685, %r673, %r672; + selp.b32 %r686, %r685, 0, %p274; + selp.b32 %r687, %r684, 0, %p275; + xor.b32 %r688, %r671, %r687; + xor.b32 %r689, %r667, %r686; + xor.b32 %r690, %r689, %r595; + xor.b32 %r691, %r666, %r687; + xor.b32 %r692, %r691, %r594; + xor.b32 %r693, %r670, %r686; + xor.b32 %r694, %r687, %r675; + xor.b32 %r695, %r687, %r674; + xor.b32 %r696, %r686, %r673; + xor.b32 %r697, %r686, %r672; + setp.lt.s32 %p276, %r682, %r681; + setp.lt.s32 %p277, %r680, %r683; + setp.eq.b32 %p278, %r681, %r682; + setp.eq.b32 %p279, %r680, %r683; + setp.gt.s32 %p280, %r697, %r695; + setp.gt.s32 %p281, %r696, %r694; + xor.b32 %r698, %r689, %r668; + xor.b32 %r699, %r691, %r669; + xor.b32 %r700, %r699, %r522; + xor.b32 %r701, %r698, %r523; + xor.b32 %r702, %r701, %r592; + xor.b32 %r703, %r700, %r593; + xor.b32 %r704, %r703, %r524; + xor.b32 %r705, %r702, %r525; + xor.b32 %r706, %r705, %r591; + xor.b32 %r707, %r704, %r590; + xor.b32 %r708, %r707, %r686; + xor.b32 %r709, %r706, %r687; + selp.b32 %r710, %r709, 0, %p281; + selp.b32 %r711, %r710, 0, %p279; + selp.b32 %r712, %r709, %r711, %p277; + selp.b32 %r713, %r708, 0, %p280; + selp.b32 %r714, %r713, 0, %p278; + selp.b32 %r715, %r708, %r714, %p276; + xor.b32 %r716, %r693, %r715; + xor.b32 %r717, %r692, %r715; + xor.b32 %r718, %r690, %r712; + xor.b32 %r719, %r688, %r712; + xor.b32 %r17, %r719, %r489; + xor.b32 %r16, %r718, %r487; + xor.b32 %r15, %r717, %r488; + xor.b32 %r14, %r716, %r486; + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + ld.shared.b64 %rd4, [%r40]; + cvt.u32.u64 %r747, %rd3; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + bar.sync 0; + .loc 1 75 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:75:19 + setp.lt.s32 %p297, %r8, %r747; + setp.lt.s32 %p298, %r9, %r747; + setp.lt.s32 %p299, %r6, %r747; + setp.lt.s32 %p300, %r7, %r747; + .loc 1 76 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:76:35 + selp.b32 %r748, %r15, 16, %p300; + selp.b32 %r749, %r14, 16, %p299; + selp.b32 %r750, %r17, 16, %p298; + selp.b32 %r751, %r16, 16, %p297; + .loc 1 77 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:77:20 + add.s32 %r752, %r751, 17; + add.s32 %r753, %r750, 17; + add.s32 %r754, %r749, 17; + add.s32 %r755, %r748, 17; + .loc 1 78 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:78:20 + setp.lt.s32 %p301, %r751, 0; + setp.lt.s32 %p302, %r750, 0; + setp.lt.s32 %p303, %r749, 0; + setp.lt.s32 %p304, %r748, 0; + .loc 1 79 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:79:35 + selp.b32 %r23, %r755, %r748, %p304; + selp.b32 %r22, %r754, %r749, %p303; + selp.b32 %r25, %r753, %r750, %p302; + selp.b32 %r24, %r752, %r751, %p301; + .loc 1 80 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:38 + setp.gt.u32 %p305, %r24, 16; + selp.b16 %rs25, 1, 0, %p305; + shl.b16 %rs26, %rs25, 2; + setp.gt.u32 %p306, %r25, 16; + selp.b16 %rs27, -1, 0, %p306; + shl.b16 %rs28, %rs27, 3; + or.b16 %rs29, %rs28, %rs26; + setp.gt.u32 %p307, %r22, 16; + selp.b16 %rs30, 1, 0, %p307; + setp.gt.u32 %p308, %r23, 16; + selp.b16 %rs31, -1, 0, %p308; + shl.b16 %rs32, %rs31, 1; + or.b16 %rs33, %rs30, %rs32; + and.b16 %rs34, %rs33, 3; + or.b16 %rs35, %rs34, %rs29; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + and.b16 %rs36, %rs35, 15; + setp.eq.b16 %p309, %rs36, 0; + or.pred %p310, %p3, %p309; + @%p310 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r41; + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + cvt.u32.u64 %r757, %rd4; + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r756, %rd2; + .loc 1 25 23 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:25:23 + or.b32 %r774, %r1, %r3; + .loc 1 26 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:26:21 + setp.lt.s32 %p314, %r774, 32; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + bar.sync 0; + .loc 1 81 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:25 + mul.wide.s32 %rd60, %r774, 4; + add.s64 %rd48, %rd5, %rd60; + .loc 1 81 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:37 + and.b32 %r775, %r2, 96; + setp.eq.b32 %p323, %r775, 0; + and.pred %p311, %p323, %p314; + // begin inline asm + @%p311 st.global.b32 [ %rd48 + 0 ], { %r756 }; + // end inline asm + .loc 1 82 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:25 + add.s64 %rd49, %rd6, %rd60; + .loc 1 82 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:37 + // begin inline asm + @%p311 st.global.b32 [ %rd49 + 0 ], { %r757 }; + // end inline asm + .loc 1 83 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd50, %rd7, %rd61; + .loc 1 83 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd50 + 0 ], { %r13, %r12, %r11, %r10 }; + // end inline asm + .loc 1 84 52 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:52 + mul.lo.s32 %r776, %r4, 17; + .loc 1 84 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:49 + add.s32 %r777, %r21, %r776; + add.s32 %r778, %r20, %r776; + add.s32 %r779, %r19, %r776; + add.s32 %r780, %r18, %r776; + .loc 1 84 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:25 + mad.wide.s32 %rd62, %r777, 4, %rd8; + mad.wide.s32 %rd63, %r778, 4, %rd8; + mad.wide.s32 %rd64, %r779, 4, %rd8; + mad.wide.s32 %rd65, %r780, 4, %rd8; + .loc 1 84 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:85 + bar.sync 0; + and.b32 %r781, %r2, 48; + shl.b32 %r782, %r781, 6; + shl.b32 %r783, %r2, 3; + and.b32 %r784, %r783, 120; + shr.u32 %r785, %r781, 1; + shl.b32 %r786, %r2, 1; + and.b32 %r787, %r786, 128; + or.b32 %r788, %r782, %r784; + xor.b32 %r789, %r788, %r785; + add.s32 %r791, %r37, %r787; + add.s32 %r792, %r791, %r789; + st.shared.b64 [%r792], %rd62; + st.shared.b64 [%r792+256], %rd63; + st.shared.b64 [%r792+512], %rd64; + st.shared.b64 [%r792+768], %rd65; + bar.sync 0; + and.b32 %r793, %r2, 12; + shl.b32 %r794, %r793, 8; + shl.b32 %r795, %r5, 5; + and.b32 %r796, %r783, 896; + shl.b32 %r797, %r793, 1; + or.b32 %r798, %r794, %r795; + or.b32 %r799, %r798, %r796; + or.b32 %r800, %r799, %r797; + add.s32 %r801, %r37, %r800; + ld.shared.b64 %rd51, [%r801]; + xor.b32 %r802, %r800, 8; + add.s32 %r803, %r37, %r802; + ld.shared.b64 %rd52, [%r803]; + xor.b32 %r804, %r800, 16; + add.s32 %r805, %r37, %r804; + ld.shared.b64 %rd53, [%r805]; + xor.b32 %r806, %r800, 24; + add.s32 %r807, %r37, %r806; + ld.shared.b64 %rd54, [%r807]; + mov.b32 %r762, 1; + // begin inline asm + @%p314 st.global.b32 [ %rd51 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd52 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd53 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd54 + 0 ], { %r762 }; + // end inline asm + .loc 1 85 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:25 + add.s64 %rd55, %rd9, %rd61; + .loc 1 85 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd55 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm + .loc 1 86 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:49 + add.s32 %r808, %r22, %r776; + add.s32 %r809, %r23, %r776; + add.s32 %r810, %r24, %r776; + add.s32 %r811, %r25, %r776; + .loc 1 86 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:25 + mad.wide.s32 %rd66, %r808, 4, %rd10; + mad.wide.s32 %rd67, %r809, 4, %rd10; + mad.wide.s32 %rd68, %r810, 4, %rd10; + mad.wide.s32 %rd69, %r811, 4, %rd10; + .loc 1 86 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:85 + bar.sync 0; + st.shared.b64 [%r792], %rd66; + st.shared.b64 [%r792+256], %rd67; + st.shared.b64 [%r792+512], %rd68; + st.shared.b64 [%r792+768], %rd69; + bar.sync 0; + ld.shared.b64 %rd56, [%r801]; + ld.shared.b64 %rd57, [%r803]; + ld.shared.b64 %rd58, [%r805]; + ld.shared.b64 %rd59, [%r807]; + // begin inline asm + @%p314 st.global.b32 [ %rd56 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd57 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd58 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd59 + 0 ], { %r762 }; + // end inline asm + .loc 1 86 4 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd76, assertFunc_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param3], %rd77; + mov.b64 %rd78, assertFile_0; + cvta.global.u64 %rd79, %rd78; + st.param.b64 [param1], %rd79; + mov.b64 %rd80, assertMessage_0; + cvta.global.u64 %rd81, %rd80; + st.param.b64 [param0], %rd81; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd70, assertFunc_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param3], %rd71; + mov.b64 %rd72, assertFile_1; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param1], %rd73; + mov.b64 %rd74, assertMessage_1; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param0], %rd75; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 122 +.b8 50 +.b8 106 +.b8 98 +.b8 106 +.b8 117 +.b8 98 +.b8 51 +.b8 97 +.b8 117 +.b8 112 +.b8 103 +.b8 110 +.b8 101 +.b8 99 +.b8 104 +.b8 111 +.b8 108 +.b8 54 +.b8 53 +.b8 118 +.b8 107 +.b8 118 +.b8 105 +.b8 53 +.b8 114 +.b8 117 +.b8 119 +.b8 112 +.b8 121 +.b8 108 +.b8 122 +.b8 111 +.b8 115 +.b8 100 +.b8 98 +.b8 113 +.b8 118 +.b8 115 +.b8 99 +.b8 100 +.b8 121 +.b8 120 +.b8 109 +.b8 114 +.b8 101 +.b8 98 +.b8 51 +.b8 106 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 55 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..d8b5e386ca50dfc1b4fa1774f784632410eb06cb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 32 : i32 loc(#loc205) + %xoffset_3 = arith.constant 32 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc208) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<32x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<32x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<32x16xf32> to tensor<32x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<32x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<32x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<32x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<32x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<32x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<32x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<32x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<32x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<32x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<32x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<32x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<32x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<32x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<32x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<32x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<32x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<32x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<32x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<32x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<32x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<32x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<32x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<32x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<32x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<32x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<32x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<32x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<32x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<32x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<32x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<32x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<32x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<32x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<32x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<32x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc91)), %idxs: tensor<32x16xi16> loc("idxs"(#loc91))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc94) + %5 = ub.poison : tensor<32x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi16> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc102) + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi16> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc179))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc181) + tt.return %4 : tensor<32x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc183) + tt.return %5 : tensor<32x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc188))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc189) + tt.return %0 : tensor<32x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc191) + tt.return %1 : tensor<32x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc185) + tt.return %cst : tensor<32x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc187) + tt.return %0 : tensor<32x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc188))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc189) + tt.return %0 : tensor<32x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc191) + tt.return %1 : tensor<32x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc185) + tt.return %cst : tensor<32x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc187) + tt.return %0 : tensor<32x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc166))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc167) + tt.return %0 : tensor<128x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc169) + tt.return %1 : tensor<128x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc166))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc167) + tt.return %0 : tensor<64x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc169) + tt.return %1 : tensor<64x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + %5 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc166))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc167) + tt.return %0 : tensor<32x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc169) + tt.return %1 : tensor<32x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc166))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc167) + tt.return %0 : tensor<32xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc169) + tt.return %1 : tensor<32xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..93dd75332a43f12b7a05e6bea3e57664413d96ea --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [8, 2, 2], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [16, 2, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 2, 2], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [2, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<32x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<32x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<32x1xi32, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<32x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<32x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<32x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<32x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<32x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<32x1xi1, #blocked5> -> tensor<32x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<32x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<32x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<32x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16, #blocked> -> tensor<256x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<256x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<256x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<32x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<32x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<32x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<32x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<32x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<32x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<32x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<32x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<32x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<32x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<32x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<32x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<32x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b1db1c3bb9a40a964e8b13e060b3697a3b6707d7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MDZNHKZZMMXKO4UW2P62WFFAPIPVBSFSNQA2LIXIEWAXE6UJ337A/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<32x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<32x1xi32> loc(#loc129) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<32x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<32x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<32x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<32x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<32x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<32x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<256x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<256x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<32x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<32x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<32x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<32x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<32x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<32x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<32x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<32x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<32x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<32x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<32x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<32x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<32x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<32x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<32x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<32x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<32x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<32x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<32x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<32x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<32x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<32x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<32x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<32x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<32x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<32x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<32x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<32x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<32x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<32x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<32x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<32x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<32x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<32x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<32x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<32x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<32x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<32x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<32x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<32x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<32x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<32x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<32x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<32x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<32x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<32x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<32x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<32x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<32x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<32x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<32x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<32x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<32x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<32x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<32x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<32x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<32x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<32x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<32x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<32x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<32x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<32x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<32x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<32x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<32x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<32x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<32x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<32x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<32x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<32x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<32x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<32x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<32x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<32x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<32x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<32x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<32x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<32x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<32x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<32x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<32x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<32x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<32x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<32x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<32x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<32x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<32x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<32x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<32x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<32x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<32x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<32x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<32x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<32x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<32x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<32x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<32x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<32x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<32x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<32x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<32x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<32x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<32x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<32x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<32x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cb153e39ab77f1c17738cdb108dc177b8d777fe8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a43582b3e7ef9fd97eae5fb58d26206a00d22151 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..11c7de3c47e0fa29c86b2471e1e2744db9cb7389 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "630ccd12478ddf12822e0847854f344e935aaa9a987a3a83645f1c5d49c45bd9", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..3d86a08ebfbbe318d2a9efb7470cef6de8a23d4b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,840 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 128, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 15, !dbg !12 + %16 = shl i32 %12, 4, !dbg !13 + %17 = or disjoint i32 %15, %16, !dbg !14 + %18 = sext i32 %17 to i64, !dbg !15 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %22 = add i64 %20, -1, !dbg !17 + %23 = icmp ult i64 %22, 16383, !dbg !17 + %24 = add i64 %21, -1, !dbg !17 + %25 = icmp ult i64 %24, 16383, !dbg !17 + %26 = zext i1 %23 to i32, !dbg !18 + %27 = lshr i32 %14, 1, !dbg !19 + %.lobit = and i32 %27, 1, !dbg !19 + %28 = and i32 %14, 1, !dbg !19 + %29 = lshr i32 %14, 2, !dbg !19 + %.lobit1 = and i32 %29, 1, !dbg !19 + %30 = lshr i32 %14, 3, !dbg !19 + %.lobit2 = and i32 %30, 1, !dbg !19 + %31 = xor i32 %28, 1, !dbg !23 + %32 = xor i32 %.lobit, 1, !dbg !23 + %33 = xor i32 %.lobit1, 1, !dbg !23 + %34 = xor i32 %.lobit2, 1, !dbg !23 + %35 = select i1 %23, i32 %31, i32 0, !dbg !24 + %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 1, i32 31), !dbg !25 + %37 = add i32 %35, %36, !dbg !28 + %38 = select i1 %23, i32 %28, i32 0, !dbg !29 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %38, i32 1, i32 31), !dbg !25 + %40 = add i32 %38, %39, !dbg !28 + %41 = mul nuw nsw i32 %31, %15, !dbg !30 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !25 + %43 = add i32 %42, %41, !dbg !28 + %44 = mul nuw nsw i32 %15, %28, !dbg !31 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 1, i32 31), !dbg !25 + %46 = add i32 %45, %44, !dbg !28 + %47 = icmp slt i32 %37, %40, !dbg !32 + %48 = icmp eq i32 %37, %40, !dbg !33 + %49 = icmp sgt i32 %43, %46, !dbg !34 + %50 = and i1 %48, %49, !dbg !35 + %51 = or i1 %47, %50, !dbg !36 + %52 = trunc i32 %27 to i1, !dbg !37 + %53 = xor i1 %51, %52, !dbg !37 + %54 = xor i32 %37, %40, !dbg !38 + %55 = select i1 %53, i32 %54, i32 0, !dbg !39 + %56 = xor i32 %55, %26, !dbg !40 + %57 = xor i32 %46, %43, !dbg !41 + %58 = select i1 %53, i32 %57, i32 0, !dbg !42 + %59 = xor i32 %58, %15, !dbg !43 + %60 = mul nuw nsw i32 %56, %32, !dbg !24 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !25 + %62 = add i32 %60, %61, !dbg !28 + %63 = mul nuw nsw i32 %56, %.lobit, !dbg !29 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !25 + %65 = add i32 %63, %64, !dbg !28 + %66 = mul nuw nsw i32 %59, %32, !dbg !30 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 2, i32 31), !dbg !25 + %68 = add i32 %66, %67, !dbg !28 + %69 = mul nuw nsw i32 %59, %.lobit, !dbg !31 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 2, i32 31), !dbg !25 + %71 = add i32 %69, %70, !dbg !28 + %72 = trunc i32 %29 to i1, !dbg !37 + %73 = icmp sge i32 %62, %65, !dbg !37 + %74 = icmp ne i32 %62, %65, !dbg !37 + %75 = icmp sle i32 %68, %71, !dbg !37 + %76 = or i1 %74, %75, !dbg !37 + %77 = and i1 %73, %76, !dbg !37 + %.not = xor i1 %77, %72, !dbg !37 + %78 = xor i32 %62, %65, !dbg !38 + %79 = select i1 %.not, i32 0, i32 %78, !dbg !39 + %80 = xor i32 %79, %56, !dbg !40 + %81 = xor i32 %68, %71, !dbg !41 + %82 = select i1 %.not, i32 0, i32 %81, !dbg !42 + %83 = xor i32 %82, %59, !dbg !43 + %84 = mul nuw nsw i32 %80, %31, !dbg !24 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !25 + %86 = add i32 %84, %85, !dbg !28 + %87 = mul nuw nsw i32 %80, %28, !dbg !29 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !25 + %89 = add i32 %87, %88, !dbg !28 + %90 = mul nuw nsw i32 %83, %31, !dbg !30 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !25 + %92 = add i32 %90, %91, !dbg !28 + %93 = mul nuw nsw i32 %83, %28, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 1, i32 31), !dbg !25 + %95 = add i32 %93, %94, !dbg !28 + %96 = icmp sge i32 %86, %89, !dbg !37 + %97 = icmp ne i32 %86, %89, !dbg !37 + %98 = icmp sle i32 %92, %95, !dbg !37 + %99 = or i1 %97, %98, !dbg !37 + %100 = and i1 %96, %99, !dbg !37 + %.not3 = xor i1 %100, %72, !dbg !37 + %101 = xor i32 %86, %89, !dbg !38 + %102 = select i1 %.not3, i32 0, i32 %101, !dbg !39 + %103 = xor i32 %102, %80, !dbg !40 + %104 = xor i32 %92, %95, !dbg !41 + %105 = select i1 %.not3, i32 0, i32 %104, !dbg !42 + %106 = xor i32 %105, %83, !dbg !43 + %107 = mul nuw nsw i32 %103, %33, !dbg !24 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !25 + %109 = add i32 %107, %108, !dbg !28 + %110 = mul nuw nsw i32 %103, %.lobit1, !dbg !29 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !25 + %112 = add i32 %110, %111, !dbg !28 + %113 = mul nuw nsw i32 %106, %33, !dbg !30 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 4, i32 31), !dbg !25 + %115 = add i32 %113, %114, !dbg !28 + %116 = mul nuw nsw i32 %106, %.lobit1, !dbg !31 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !25 + %118 = add i32 %116, %117, !dbg !28 + %119 = trunc i32 %30 to i1, !dbg !37 + %120 = icmp sge i32 %109, %112, !dbg !37 + %121 = icmp ne i32 %109, %112, !dbg !37 + %122 = icmp sle i32 %115, %118, !dbg !37 + %123 = or i1 %121, %122, !dbg !37 + %124 = and i1 %120, %123, !dbg !37 + %.not4 = xor i1 %124, %119, !dbg !37 + %125 = xor i32 %109, %112, !dbg !38 + %126 = select i1 %.not4, i32 0, i32 %125, !dbg !39 + %127 = xor i32 %126, %103, !dbg !40 + %128 = xor i32 %115, %118, !dbg !41 + %129 = select i1 %.not4, i32 0, i32 %128, !dbg !42 + %130 = xor i32 %129, %106, !dbg !43 + %131 = mul nuw nsw i32 %127, %32, !dbg !24 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !25 + %133 = add i32 %131, %132, !dbg !28 + %134 = mul nuw nsw i32 %127, %.lobit, !dbg !29 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !25 + %136 = add i32 %134, %135, !dbg !28 + %137 = mul nuw nsw i32 %130, %32, !dbg !30 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 2, i32 31), !dbg !25 + %139 = add i32 %137, %138, !dbg !28 + %140 = mul nuw nsw i32 %130, %.lobit, !dbg !31 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !25 + %142 = add i32 %140, %141, !dbg !28 + %143 = icmp sge i32 %133, %136, !dbg !37 + %144 = icmp ne i32 %133, %136, !dbg !37 + %145 = icmp sle i32 %139, %142, !dbg !37 + %146 = or i1 %144, %145, !dbg !37 + %147 = and i1 %143, %146, !dbg !37 + %.not5 = xor i1 %147, %119, !dbg !37 + %148 = xor i32 %133, %136, !dbg !38 + %149 = select i1 %.not5, i32 0, i32 %148, !dbg !39 + %150 = xor i32 %149, %127, !dbg !40 + %151 = xor i32 %139, %142, !dbg !41 + %152 = select i1 %.not5, i32 0, i32 %151, !dbg !42 + %153 = xor i32 %152, %130, !dbg !43 + %154 = mul nuw nsw i32 %150, %31, !dbg !24 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !25 + %156 = add i32 %154, %155, !dbg !28 + %157 = mul nuw nsw i32 %150, %28, !dbg !29 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !25 + %159 = add i32 %157, %158, !dbg !28 + %160 = mul nuw nsw i32 %153, %31, !dbg !30 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !25 + %162 = add i32 %160, %161, !dbg !28 + %163 = mul nuw nsw i32 %153, %28, !dbg !31 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !25 + %165 = add i32 %163, %164, !dbg !28 + %166 = icmp sge i32 %156, %159, !dbg !37 + %167 = icmp ne i32 %156, %159, !dbg !37 + %168 = icmp sle i32 %162, %165, !dbg !37 + %169 = or i1 %167, %168, !dbg !37 + %170 = and i1 %166, %169, !dbg !37 + %.not6 = xor i1 %170, %119, !dbg !37 + %171 = xor i32 %156, %159, !dbg !38 + %172 = select i1 %.not6, i32 0, i32 %171, !dbg !39 + %173 = xor i32 %172, %150, !dbg !40 + %174 = xor i32 %162, %165, !dbg !41 + %175 = select i1 %.not6, i32 0, i32 %174, !dbg !42 + %176 = xor i32 %175, %153, !dbg !43 + %177 = mul nuw nsw i32 %173, %34, !dbg !24 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !25 + %179 = add i32 %177, %178, !dbg !28 + %180 = mul nuw nsw i32 %173, %.lobit2, !dbg !29 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !25 + %182 = add i32 %180, %181, !dbg !28 + %183 = mul nuw nsw i32 %176, %34, !dbg !30 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 8, i32 31), !dbg !25 + %185 = add i32 %183, %184, !dbg !28 + %186 = mul nuw nsw i32 %176, %.lobit2, !dbg !31 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 8, i32 31), !dbg !25 + %188 = add i32 %186, %187, !dbg !28 + %189 = icmp slt i32 %179, %182, !dbg !32 + %190 = icmp eq i32 %179, %182, !dbg !33 + %191 = icmp sgt i32 %185, %188, !dbg !34 + %192 = and i1 %190, %191, !dbg !35 + %193 = or i1 %189, %192, !dbg !36 + %194 = xor i32 %179, %182, !dbg !38 + %195 = select i1 %193, i32 %194, i32 0, !dbg !39 + %196 = xor i32 %195, %173, !dbg !40 + %197 = xor i32 %185, %188, !dbg !41 + %198 = select i1 %193, i32 %197, i32 0, !dbg !42 + %199 = xor i32 %198, %176, !dbg !43 + %200 = mul nuw nsw i32 %196, %33, !dbg !24 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !25 + %202 = add i32 %200, %201, !dbg !28 + %203 = mul nuw nsw i32 %196, %.lobit1, !dbg !29 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !25 + %205 = add i32 %203, %204, !dbg !28 + %206 = mul nuw nsw i32 %199, %33, !dbg !30 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 4, i32 31), !dbg !25 + %208 = add i32 %206, %207, !dbg !28 + %209 = mul nuw nsw i32 %199, %.lobit1, !dbg !31 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 4, i32 31), !dbg !25 + %211 = add i32 %209, %210, !dbg !28 + %212 = icmp slt i32 %202, %205, !dbg !32 + %213 = icmp eq i32 %202, %205, !dbg !33 + %214 = icmp sgt i32 %208, %211, !dbg !34 + %215 = and i1 %213, %214, !dbg !35 + %216 = or i1 %212, %215, !dbg !36 + %217 = xor i32 %202, %205, !dbg !38 + %218 = select i1 %216, i32 %217, i32 0, !dbg !39 + %219 = xor i32 %218, %196, !dbg !40 + %220 = xor i32 %208, %211, !dbg !41 + %221 = select i1 %216, i32 %220, i32 0, !dbg !42 + %222 = xor i32 %221, %199, !dbg !43 + %223 = mul nuw nsw i32 %219, %32, !dbg !24 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !25 + %225 = add i32 %223, %224, !dbg !28 + %226 = mul nuw nsw i32 %219, %.lobit, !dbg !29 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !25 + %228 = add i32 %226, %227, !dbg !28 + %229 = mul nuw nsw i32 %222, %32, !dbg !30 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 2, i32 31), !dbg !25 + %231 = add i32 %229, %230, !dbg !28 + %232 = mul nuw nsw i32 %222, %.lobit, !dbg !31 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 2, i32 31), !dbg !25 + %234 = add i32 %232, %233, !dbg !28 + %235 = icmp slt i32 %225, %228, !dbg !32 + %236 = icmp eq i32 %225, %228, !dbg !33 + %237 = icmp sgt i32 %231, %234, !dbg !34 + %238 = and i1 %236, %237, !dbg !35 + %239 = or i1 %235, %238, !dbg !36 + %240 = xor i32 %225, %228, !dbg !38 + %241 = select i1 %239, i32 %240, i32 0, !dbg !39 + %242 = xor i32 %241, %219, !dbg !40 + %243 = xor i32 %231, %234, !dbg !41 + %244 = select i1 %239, i32 %243, i32 0, !dbg !42 + %245 = xor i32 %244, %222, !dbg !43 + %246 = mul nuw nsw i32 %242, %31, !dbg !24 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !25 + %248 = add i32 %246, %247, !dbg !28 + %249 = mul nuw nsw i32 %242, %28, !dbg !29 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !25 + %251 = add i32 %249, %250, !dbg !28 + %252 = mul nuw nsw i32 %245, %31, !dbg !30 + %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !25 + %254 = add i32 %252, %253, !dbg !28 + %255 = mul nuw nsw i32 %245, %28, !dbg !31 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !25 + %257 = add i32 %255, %256, !dbg !28 + %258 = icmp slt i32 %248, %251, !dbg !32 + %259 = icmp eq i32 %248, %251, !dbg !33 + %260 = icmp sgt i32 %254, %257, !dbg !34 + %261 = and i1 %259, %260, !dbg !35 + %262 = or i1 %258, %261, !dbg !36 + %263 = xor i32 %254, %257, !dbg !41 + %264 = select i1 %262, i32 %263, i32 0, !dbg !42 + %265 = xor i32 %264, %245, !dbg !43 + %266 = icmp eq i64 %20, 16384, !dbg !44 + %267 = icmp eq i64 %21, 16384, !dbg !44 + %268 = zext i1 %266 to i32, !dbg !18 + %269 = select i1 %266, i32 %31, i32 0, !dbg !45 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !47 + %271 = add i32 %270, %269, !dbg !48 + %272 = select i1 %266, i32 %28, i32 0, !dbg !49 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !47 + %274 = add i32 %273, %272, !dbg !48 + %275 = icmp slt i32 %271, %274, !dbg !50 + %276 = icmp eq i32 %271, %274, !dbg !51 + %277 = and i1 %49, %276, !dbg !52 + %278 = or i1 %275, %277, !dbg !53 + %279 = xor i1 %278, %52, !dbg !54 + %280 = xor i32 %274, %271, !dbg !55 + %281 = select i1 %279, i32 %280, i32 0, !dbg !56 + %282 = xor i32 %281, %268, !dbg !57 + %283 = select i1 %279, i32 %57, i32 0, !dbg !58 + %284 = xor i32 %283, %15, !dbg !59 + %285 = mul nuw nsw i32 %282, %32, !dbg !45 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 2, i32 31), !dbg !47 + %287 = add i32 %285, %286, !dbg !48 + %288 = mul nuw nsw i32 %282, %.lobit, !dbg !49 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !47 + %290 = add i32 %288, %289, !dbg !48 + %291 = mul nuw nsw i32 %284, %32, !dbg !60 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 2, i32 31), !dbg !47 + %293 = add i32 %291, %292, !dbg !48 + %294 = mul nuw nsw i32 %284, %.lobit, !dbg !61 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !47 + %296 = add i32 %294, %295, !dbg !48 + %297 = icmp sge i32 %287, %290, !dbg !54 + %298 = icmp ne i32 %287, %290, !dbg !54 + %299 = icmp sle i32 %293, %296, !dbg !54 + %300 = or i1 %298, %299, !dbg !54 + %301 = and i1 %297, %300, !dbg !54 + %.not8 = xor i1 %301, %72, !dbg !54 + %302 = xor i32 %287, %290, !dbg !55 + %303 = select i1 %.not8, i32 0, i32 %302, !dbg !56 + %304 = xor i32 %303, %282, !dbg !57 + %305 = xor i32 %293, %296, !dbg !62 + %306 = select i1 %.not8, i32 0, i32 %305, !dbg !58 + %307 = xor i32 %306, %284, !dbg !59 + %308 = mul nuw nsw i32 %304, %31, !dbg !45 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !47 + %310 = add i32 %308, %309, !dbg !48 + %311 = mul nuw nsw i32 %304, %28, !dbg !49 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !47 + %313 = add i32 %311, %312, !dbg !48 + %314 = mul nuw nsw i32 %307, %31, !dbg !60 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !47 + %316 = add i32 %314, %315, !dbg !48 + %317 = mul nuw nsw i32 %307, %28, !dbg !61 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !47 + %319 = add i32 %317, %318, !dbg !48 + %320 = icmp sge i32 %310, %313, !dbg !54 + %321 = icmp ne i32 %310, %313, !dbg !54 + %322 = icmp sle i32 %316, %319, !dbg !54 + %323 = or i1 %321, %322, !dbg !54 + %324 = and i1 %320, %323, !dbg !54 + %.not9 = xor i1 %324, %72, !dbg !54 + %325 = xor i32 %310, %313, !dbg !55 + %326 = select i1 %.not9, i32 0, i32 %325, !dbg !56 + %327 = xor i32 %326, %304, !dbg !57 + %328 = xor i32 %316, %319, !dbg !62 + %329 = select i1 %.not9, i32 0, i32 %328, !dbg !58 + %330 = xor i32 %329, %307, !dbg !59 + %331 = mul nuw nsw i32 %327, %33, !dbg !45 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 4, i32 31), !dbg !47 + %333 = add i32 %331, %332, !dbg !48 + %334 = mul nuw nsw i32 %327, %.lobit1, !dbg !49 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 4, i32 31), !dbg !47 + %336 = add i32 %334, %335, !dbg !48 + %337 = mul nuw nsw i32 %330, %33, !dbg !60 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 4, i32 31), !dbg !47 + %339 = add i32 %337, %338, !dbg !48 + %340 = mul nuw nsw i32 %330, %.lobit1, !dbg !61 + %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 4, i32 31), !dbg !47 + %342 = add i32 %340, %341, !dbg !48 + %343 = icmp sge i32 %333, %336, !dbg !54 + %344 = icmp ne i32 %333, %336, !dbg !54 + %345 = icmp sle i32 %339, %342, !dbg !54 + %346 = or i1 %344, %345, !dbg !54 + %347 = and i1 %343, %346, !dbg !54 + %.not10 = xor i1 %347, %119, !dbg !54 + %348 = xor i32 %333, %336, !dbg !55 + %349 = select i1 %.not10, i32 0, i32 %348, !dbg !56 + %350 = xor i32 %349, %327, !dbg !57 + %351 = xor i32 %339, %342, !dbg !62 + %352 = select i1 %.not10, i32 0, i32 %351, !dbg !58 + %353 = xor i32 %352, %330, !dbg !59 + %354 = mul nuw nsw i32 %350, %32, !dbg !45 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 2, i32 31), !dbg !47 + %356 = add i32 %354, %355, !dbg !48 + %357 = mul nuw nsw i32 %350, %.lobit, !dbg !49 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !47 + %359 = add i32 %357, %358, !dbg !48 + %360 = mul nuw nsw i32 %353, %32, !dbg !60 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 2, i32 31), !dbg !47 + %362 = add i32 %360, %361, !dbg !48 + %363 = mul nuw nsw i32 %353, %.lobit, !dbg !61 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 2, i32 31), !dbg !47 + %365 = add i32 %363, %364, !dbg !48 + %366 = icmp sge i32 %356, %359, !dbg !54 + %367 = icmp ne i32 %356, %359, !dbg !54 + %368 = icmp sle i32 %362, %365, !dbg !54 + %369 = or i1 %367, %368, !dbg !54 + %370 = and i1 %366, %369, !dbg !54 + %.not11 = xor i1 %370, %119, !dbg !54 + %371 = xor i32 %356, %359, !dbg !55 + %372 = select i1 %.not11, i32 0, i32 %371, !dbg !56 + %373 = xor i32 %372, %350, !dbg !57 + %374 = xor i32 %362, %365, !dbg !62 + %375 = select i1 %.not11, i32 0, i32 %374, !dbg !58 + %376 = xor i32 %375, %353, !dbg !59 + %377 = mul nuw nsw i32 %373, %31, !dbg !45 + %378 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 1, i32 31), !dbg !47 + %379 = add i32 %377, %378, !dbg !48 + %380 = mul nuw nsw i32 %373, %28, !dbg !49 + %381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %380, i32 1, i32 31), !dbg !47 + %382 = add i32 %380, %381, !dbg !48 + %383 = mul nuw nsw i32 %376, %31, !dbg !60 + %384 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !47 + %385 = add i32 %383, %384, !dbg !48 + %386 = mul nuw nsw i32 %376, %28, !dbg !61 + %387 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 1, i32 31), !dbg !47 + %388 = add i32 %386, %387, !dbg !48 + %389 = icmp sge i32 %379, %382, !dbg !54 + %390 = icmp ne i32 %379, %382, !dbg !54 + %391 = icmp sle i32 %385, %388, !dbg !54 + %392 = or i1 %390, %391, !dbg !54 + %393 = and i1 %389, %392, !dbg !54 + %.not12 = xor i1 %393, %119, !dbg !54 + %394 = xor i32 %379, %382, !dbg !55 + %395 = select i1 %.not12, i32 0, i32 %394, !dbg !56 + %396 = xor i32 %395, %373, !dbg !57 + %397 = xor i32 %385, %388, !dbg !62 + %398 = select i1 %.not12, i32 0, i32 %397, !dbg !58 + %399 = xor i32 %398, %376, !dbg !59 + %400 = mul nuw nsw i32 %396, %34, !dbg !45 + %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 8, i32 31), !dbg !47 + %402 = add i32 %400, %401, !dbg !48 + %403 = mul nuw nsw i32 %396, %.lobit2, !dbg !49 + %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 8, i32 31), !dbg !47 + %405 = add i32 %403, %404, !dbg !48 + %406 = mul nuw nsw i32 %399, %34, !dbg !60 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 8, i32 31), !dbg !47 + %408 = add i32 %406, %407, !dbg !48 + %409 = mul nuw nsw i32 %399, %.lobit2, !dbg !61 + %410 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %409, i32 8, i32 31), !dbg !47 + %411 = add i32 %409, %410, !dbg !48 + %412 = icmp slt i32 %402, %405, !dbg !50 + %413 = icmp eq i32 %402, %405, !dbg !51 + %414 = icmp sgt i32 %408, %411, !dbg !63 + %415 = and i1 %413, %414, !dbg !52 + %416 = or i1 %412, %415, !dbg !53 + %417 = xor i32 %402, %405, !dbg !55 + %418 = select i1 %416, i32 %417, i32 0, !dbg !56 + %419 = xor i32 %418, %396, !dbg !57 + %420 = xor i32 %408, %411, !dbg !62 + %421 = select i1 %416, i32 %420, i32 0, !dbg !58 + %422 = xor i32 %421, %399, !dbg !59 + %423 = mul nuw nsw i32 %419, %33, !dbg !45 + %424 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %423, i32 4, i32 31), !dbg !47 + %425 = add i32 %423, %424, !dbg !48 + %426 = mul nuw nsw i32 %419, %.lobit1, !dbg !49 + %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 4, i32 31), !dbg !47 + %428 = add i32 %426, %427, !dbg !48 + %429 = mul nuw nsw i32 %422, %33, !dbg !60 + %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 4, i32 31), !dbg !47 + %431 = add i32 %429, %430, !dbg !48 + %432 = mul nuw nsw i32 %422, %.lobit1, !dbg !61 + %433 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %432, i32 4, i32 31), !dbg !47 + %434 = add i32 %432, %433, !dbg !48 + %435 = icmp slt i32 %425, %428, !dbg !50 + %436 = icmp eq i32 %425, %428, !dbg !51 + %437 = icmp sgt i32 %431, %434, !dbg !63 + %438 = and i1 %436, %437, !dbg !52 + %439 = or i1 %435, %438, !dbg !53 + %440 = xor i32 %425, %428, !dbg !55 + %441 = select i1 %439, i32 %440, i32 0, !dbg !56 + %442 = xor i32 %441, %419, !dbg !57 + %443 = xor i32 %431, %434, !dbg !62 + %444 = select i1 %439, i32 %443, i32 0, !dbg !58 + %445 = xor i32 %444, %422, !dbg !59 + %446 = mul nuw nsw i32 %442, %32, !dbg !45 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 2, i32 31), !dbg !47 + %448 = add i32 %446, %447, !dbg !48 + %449 = mul nuw nsw i32 %442, %.lobit, !dbg !49 + %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !47 + %451 = add i32 %449, %450, !dbg !48 + %452 = mul nuw nsw i32 %445, %32, !dbg !60 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !47 + %454 = add i32 %452, %453, !dbg !48 + %455 = mul nuw nsw i32 %445, %.lobit, !dbg !61 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 2, i32 31), !dbg !47 + %457 = add i32 %455, %456, !dbg !48 + %458 = icmp slt i32 %448, %451, !dbg !50 + %459 = icmp eq i32 %448, %451, !dbg !51 + %460 = icmp sgt i32 %454, %457, !dbg !63 + %461 = and i1 %459, %460, !dbg !52 + %462 = or i1 %458, %461, !dbg !53 + %463 = xor i32 %448, %451, !dbg !55 + %464 = select i1 %462, i32 %463, i32 0, !dbg !56 + %465 = xor i32 %464, %442, !dbg !57 + %466 = xor i32 %454, %457, !dbg !62 + %467 = select i1 %462, i32 %466, i32 0, !dbg !58 + %468 = xor i32 %467, %445, !dbg !59 + %469 = mul nuw nsw i32 %465, %31, !dbg !45 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !47 + %471 = add i32 %469, %470, !dbg !48 + %472 = mul nuw nsw i32 %465, %28, !dbg !49 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 1, i32 31), !dbg !47 + %474 = add i32 %472, %473, !dbg !48 + %475 = mul nuw nsw i32 %468, %31, !dbg !60 + %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 1, i32 31), !dbg !47 + %477 = add i32 %475, %476, !dbg !48 + %478 = mul nuw nsw i32 %468, %28, !dbg !61 + %479 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 1, i32 31), !dbg !47 + %480 = add i32 %478, %479, !dbg !48 + %481 = icmp slt i32 %471, %474, !dbg !50 + %482 = icmp eq i32 %471, %474, !dbg !51 + %483 = icmp sgt i32 %477, %480, !dbg !63 + %484 = and i1 %482, %483, !dbg !52 + %485 = or i1 %481, %484, !dbg !53 + %486 = xor i32 %477, %480, !dbg !62 + %487 = select i1 %485, i32 %486, i32 0, !dbg !58 + %488 = xor i32 %487, %468, !dbg !59 + %narrow = select i1 %13, i1 %23, i1 false, !dbg !64 + %489 = zext i1 %narrow to i64, !dbg !64 + %narrow13 = select i1 %13, i1 %25, i1 false, !dbg !64 + %490 = zext i1 %narrow13 to i64, !dbg !64 + %491 = zext i1 %narrow to i32, !dbg !65 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 8, i32 31), !dbg !65 + %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %494 = insertelement <2 x i32> poison, i32 %492, i64 0, !dbg !65 + %495 = insertelement <2 x i32> %494, i32 %493, i64 1, !dbg !65 + %496 = bitcast <2 x i32> %495 to i64, !dbg !65 + %497 = add i64 %496, %489, !dbg !67 + %extelt.offset = lshr i64 %497, 32, !dbg !65 + %498 = trunc nuw i64 %extelt.offset to i32, !dbg !65 + %499 = trunc i64 %497 to i32, !dbg !65 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 4, i32 31), !dbg !65 + %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !65 + %502 = insertelement <2 x i32> poison, i32 %500, i64 0, !dbg !65 + %503 = insertelement <2 x i32> %502, i32 %501, i64 1, !dbg !65 + %504 = bitcast <2 x i32> %503 to i64, !dbg !65 + %505 = add i64 %497, %504, !dbg !67 + %extelt.offset14 = lshr i64 %505, 32, !dbg !65 + %506 = trunc nuw i64 %extelt.offset14 to i32, !dbg !65 + %507 = trunc i64 %505 to i32, !dbg !65 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 2, i32 31), !dbg !65 + %509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 2, i32 31), !dbg !65 + %510 = insertelement <2 x i32> poison, i32 %508, i64 0, !dbg !65 + %511 = insertelement <2 x i32> %510, i32 %509, i64 1, !dbg !65 + %512 = bitcast <2 x i32> %511 to i64, !dbg !65 + %513 = add i64 %505, %512, !dbg !67 + %extelt.offset15 = lshr i64 %513, 32, !dbg !65 + %514 = trunc nuw i64 %extelt.offset15 to i32, !dbg !65 + %515 = trunc i64 %513 to i32, !dbg !65 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !65 + %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %514, i32 1, i32 31), !dbg !65 + %518 = insertelement <2 x i32> poison, i32 %516, i64 0, !dbg !65 + %519 = insertelement <2 x i32> %518, i32 %517, i64 1, !dbg !65 + %520 = bitcast <2 x i32> %519 to i64, !dbg !65 + %521 = add i64 %513, %520, !dbg !67 + %522 = zext i1 %narrow13 to i32, !dbg !65 + %523 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %522, i32 8, i32 31), !dbg !65 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %525 = insertelement <2 x i32> poison, i32 %523, i64 0, !dbg !65 + %526 = insertelement <2 x i32> %525, i32 %524, i64 1, !dbg !65 + %527 = bitcast <2 x i32> %526 to i64, !dbg !65 + %528 = add i64 %527, %490, !dbg !67 + %extelt.offset17 = lshr i64 %528, 32, !dbg !65 + %529 = trunc nuw i64 %extelt.offset17 to i32, !dbg !65 + %530 = trunc i64 %528 to i32, !dbg !65 + %531 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %530, i32 4, i32 31), !dbg !65 + %532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 4, i32 31), !dbg !65 + %533 = insertelement <2 x i32> poison, i32 %531, i64 0, !dbg !65 + %534 = insertelement <2 x i32> %533, i32 %532, i64 1, !dbg !65 + %535 = bitcast <2 x i32> %534 to i64, !dbg !65 + %536 = add i64 %528, %535, !dbg !67 + %extelt.offset18 = lshr i64 %536, 32, !dbg !65 + %537 = trunc nuw i64 %extelt.offset18 to i32, !dbg !65 + %538 = trunc i64 %536 to i32, !dbg !65 + %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 2, i32 31), !dbg !65 + %540 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %537, i32 2, i32 31), !dbg !65 + %541 = insertelement <2 x i32> poison, i32 %539, i64 0, !dbg !65 + %542 = insertelement <2 x i32> %541, i32 %540, i64 1, !dbg !65 + %543 = bitcast <2 x i32> %542 to i64, !dbg !65 + %544 = add i64 %536, %543, !dbg !67 + %extelt.offset19 = lshr i64 %544, 32, !dbg !65 + %545 = trunc nuw i64 %extelt.offset19 to i32, !dbg !65 + %546 = trunc i64 %544 to i32, !dbg !65 + %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 1, i32 31), !dbg !65 + %548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 1, i32 31), !dbg !65 + %narrow20 = select i1 %13, i1 %266, i1 false, !dbg !68 + %549 = zext i1 %narrow20 to i64, !dbg !68 + %narrow21 = select i1 %13, i1 %267, i1 false, !dbg !68 + %550 = zext i1 %narrow21 to i64, !dbg !68 + %551 = zext i1 %narrow20 to i32, !dbg !69 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %551, i32 8, i32 31), !dbg !69 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %554 = insertelement <2 x i32> poison, i32 %552, i64 0, !dbg !69 + %555 = insertelement <2 x i32> %554, i32 %553, i64 1, !dbg !69 + %556 = bitcast <2 x i32> %555 to i64, !dbg !69 + %557 = add i64 %556, %549, !dbg !71 + %extelt.offset23 = lshr i64 %557, 32, !dbg !69 + %558 = trunc nuw i64 %extelt.offset23 to i32, !dbg !69 + %559 = trunc i64 %557 to i32, !dbg !69 + %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !69 + %561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %558, i32 4, i32 31), !dbg !69 + %562 = insertelement <2 x i32> poison, i32 %560, i64 0, !dbg !69 + %563 = insertelement <2 x i32> %562, i32 %561, i64 1, !dbg !69 + %564 = bitcast <2 x i32> %563 to i64, !dbg !69 + %565 = add i64 %557, %564, !dbg !71 + %extelt.offset24 = lshr i64 %565, 32, !dbg !69 + %566 = trunc nuw i64 %extelt.offset24 to i32, !dbg !69 + %567 = trunc i64 %565 to i32, !dbg !69 + %568 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %567, i32 2, i32 31), !dbg !69 + %569 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %566, i32 2, i32 31), !dbg !69 + %570 = insertelement <2 x i32> poison, i32 %568, i64 0, !dbg !69 + %571 = insertelement <2 x i32> %570, i32 %569, i64 1, !dbg !69 + %572 = bitcast <2 x i32> %571 to i64, !dbg !69 + %573 = add i64 %565, %572, !dbg !71 + %extelt.offset25 = lshr i64 %573, 32, !dbg !69 + %574 = trunc nuw i64 %extelt.offset25 to i32, !dbg !69 + %575 = trunc i64 %573 to i32, !dbg !69 + %576 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %575, i32 1, i32 31), !dbg !69 + %577 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %574, i32 1, i32 31), !dbg !69 + %578 = zext i1 %narrow21 to i32, !dbg !69 + %579 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %578, i32 8, i32 31), !dbg !69 + %580 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %581 = insertelement <2 x i32> poison, i32 %579, i64 0, !dbg !69 + %582 = insertelement <2 x i32> %581, i32 %580, i64 1, !dbg !69 + %583 = bitcast <2 x i32> %582 to i64, !dbg !69 + %584 = add i64 %583, %550, !dbg !71 + %extelt.offset27 = lshr i64 %584, 32, !dbg !69 + %585 = trunc nuw i64 %extelt.offset27 to i32, !dbg !69 + %586 = trunc i64 %584 to i32, !dbg !69 + %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 4, i32 31), !dbg !69 + %588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 4, i32 31), !dbg !69 + %589 = insertelement <2 x i32> poison, i32 %587, i64 0, !dbg !69 + %590 = insertelement <2 x i32> %589, i32 %588, i64 1, !dbg !69 + %591 = bitcast <2 x i32> %590 to i64, !dbg !69 + %592 = add i64 %584, %591, !dbg !71 + %extelt.offset28 = lshr i64 %592, 32, !dbg !69 + %593 = trunc nuw i64 %extelt.offset28 to i32, !dbg !69 + %594 = trunc i64 %592 to i32, !dbg !69 + %595 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %594, i32 2, i32 31), !dbg !69 + %596 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %593, i32 2, i32 31), !dbg !69 + %597 = insertelement <2 x i32> poison, i32 %595, i64 0, !dbg !69 + %598 = insertelement <2 x i32> %597, i32 %596, i64 1, !dbg !69 + %599 = bitcast <2 x i32> %598 to i64, !dbg !69 + %600 = add i64 %592, %599, !dbg !71 + %extelt.offset29 = lshr i64 %600, 32, !dbg !69 + %601 = trunc nuw i64 %extelt.offset29 to i32, !dbg !69 + %602 = trunc i64 %600 to i32, !dbg !69 + %603 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %602, i32 1, i32 31), !dbg !69 + %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !69 + %605 = trunc i64 %521 to i32, !dbg !72 + %606 = icmp slt i32 %15, %605, !dbg !73 + %607 = select i1 %606, i32 %265, i32 16, !dbg !74 + %608 = add i32 %607, 17, !dbg !75 + %609 = icmp slt i32 %607, 0, !dbg !76 + %610 = select i1 %609, i32 %608, i32 %607, !dbg !77 + %611 = icmp ult i32 %610, 17, !dbg !78 + %612 = icmp samesign ugt i32 %12, 127, !dbg !18 + %613 = or i1 %612, %611, !dbg !79 + br i1 %613, label %615, label %614, !dbg !80 + +614: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !80 + unreachable, !dbg !80 + +615: ; preds = %11 + %616 = insertelement <2 x i32> poison, i32 %576, i64 0, !dbg !69 + %617 = insertelement <2 x i32> %616, i32 %577, i64 1, !dbg !69 + %618 = bitcast <2 x i32> %617 to i64, !dbg !69 + %619 = add i64 %573, %618, !dbg !71 + %620 = trunc i64 %619 to i32, !dbg !81 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !80 + %621 = icmp slt i32 %15, %620, !dbg !82 + %622 = select i1 %621, i32 %488, i32 16, !dbg !83 + %623 = add i32 %622, 17, !dbg !84 + %624 = icmp slt i32 %622, 0, !dbg !85 + %625 = select i1 %624, i32 %623, i32 %622, !dbg !86 + %626 = icmp ult i32 %625, 17, !dbg !87 + %627 = or i1 %612, %626, !dbg !88 + br i1 %627, label %629, label %628, !dbg !89 + +628: ; preds = %615 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !89 + unreachable, !dbg !89 + +629: ; preds = %615 + %630 = insertelement <2 x i32> poison, i32 %603, i64 0, !dbg !69 + %631 = insertelement <2 x i32> %630, i32 %604, i64 1, !dbg !69 + %632 = bitcast <2 x i32> %631 to i64, !dbg !69 + %633 = add i64 %600, %632, !dbg !71 + %634 = trunc i64 %633 to i32, !dbg !81 + %635 = insertelement <2 x i32> poison, i32 %547, i64 0, !dbg !65 + %636 = insertelement <2 x i32> %635, i32 %548, i64 1, !dbg !65 + %637 = bitcast <2 x i32> %636 to i64, !dbg !65 + %638 = add i64 %544, %637, !dbg !67 + %639 = trunc i64 %638 to i32, !dbg !72 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !89 + %640 = zext nneg i32 %12 to i64, !dbg !90 + %641 = getelementptr i32, ptr addrspace(1) %1, i64 %640, !dbg !90 + %642 = and i32 %14, 63, !dbg !91 + %643 = icmp eq i32 %642, 0, !dbg !91 + %644 = and i1 %13, %643, !dbg !91 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %639, ptr addrspace(1) %641, i1 %644) #5, !dbg !91 + %645 = getelementptr i32, ptr addrspace(1) %2, i64 %640, !dbg !92 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %634, ptr addrspace(1) %645, i1 %644) #5, !dbg !93 + %646 = getelementptr i32, ptr addrspace(1) %3, i64 %18, !dbg !94 + %647 = and i32 %14, 48, !dbg !95 + %648 = icmp eq i32 %647, 0, !dbg !95 + %649 = and i1 %13, %648, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %265, ptr addrspace(1) %646, i1 %649) #5, !dbg !95 + %650 = mul i32 %12, 17, !dbg !96 + %651 = add i32 %610, %650, !dbg !97 + %652 = sext i32 %651 to i64, !dbg !98 + %653 = getelementptr i32, ptr addrspace(1) %4, i64 %652, !dbg !98 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %653, i1 %649) #5, !dbg !99 + %654 = getelementptr i32, ptr addrspace(1) %5, i64 %18, !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %488, ptr addrspace(1) %654, i1 %649) #5, !dbg !101 + %655 = add i32 %625, %650, !dbg !102 + %656 = sext i32 %655 to i64, !dbg !103 + %657 = getelementptr i32, ptr addrspace(1) %6, i64 %656, !dbg !103 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %657, i1 %649) #5, !dbg !104 + ret void, !dbg !105 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 26, column: 21, scope: !9) +!12 = !DILocation(line: 27, column: 38, scope: !9) +!13 = !DILocation(line: 34, column: 40, scope: !9) +!14 = !DILocation(line: 34, column: 37, scope: !9) +!15 = !DILocation(line: 34, column: 30, scope: !9) +!16 = !DILocation(line: 34, column: 45, scope: !9) +!17 = !DILocation(line: 39, column: 18, scope: !9) +!18 = !DILocation(line: 0, scope: !9) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !9, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 46, column: 71, scope: !9) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !9, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 47, column: 20, scope: !9) +!45 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !46) +!46 = !DILocation(line: 51, column: 71, scope: !9) +!47 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !46) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !46) +!49 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !46) +!50 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !46) +!51 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !46) +!52 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !46) +!53 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !46) +!54 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !46) +!55 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !46) +!56 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !46) +!57 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !46) +!58 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !46) +!59 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !46) +!60 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !46) +!61 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !46) +!62 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !46) +!63 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !46) +!64 = !DILocation(line: 54, column: 35, scope: !9) +!65 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !66) +!66 = !DILocation(line: 55, column: 26, scope: !9) +!67 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !66) +!68 = !DILocation(line: 58, column: 35, scope: !9) +!69 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !70) +!70 = !DILocation(line: 59, column: 26, scope: !9) +!71 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !70) +!72 = !DILocation(line: 60, column: 21, scope: !9) +!73 = !DILocation(line: 64, column: 19, scope: !9) +!74 = !DILocation(line: 66, column: 35, scope: !9) +!75 = !DILocation(line: 68, column: 20, scope: !9) +!76 = !DILocation(line: 69, column: 20, scope: !9) +!77 = !DILocation(line: 70, column: 35, scope: !9) +!78 = !DILocation(line: 71, column: 38, scope: !9) +!79 = !DILocation(line: 71, column: 53, scope: !9) +!80 = !DILocation(line: 71, column: 63, scope: !9) +!81 = !DILocation(line: 61, column: 21, scope: !9) +!82 = !DILocation(line: 75, column: 19, scope: !9) +!83 = !DILocation(line: 76, column: 35, scope: !9) +!84 = !DILocation(line: 77, column: 20, scope: !9) +!85 = !DILocation(line: 78, column: 20, scope: !9) +!86 = !DILocation(line: 79, column: 35, scope: !9) +!87 = !DILocation(line: 80, column: 38, scope: !9) +!88 = !DILocation(line: 80, column: 53, scope: !9) +!89 = !DILocation(line: 80, column: 63, scope: !9) +!90 = !DILocation(line: 81, column: 25, scope: !9) +!91 = !DILocation(line: 81, column: 37, scope: !9) +!92 = !DILocation(line: 82, column: 25, scope: !9) +!93 = !DILocation(line: 82, column: 37, scope: !9) +!94 = !DILocation(line: 83, column: 25, scope: !9) +!95 = !DILocation(line: 83, column: 47, scope: !9) +!96 = !DILocation(line: 84, column: 52, scope: !9) +!97 = !DILocation(line: 84, column: 49, scope: !9) +!98 = !DILocation(line: 84, column: 25, scope: !9) +!99 = !DILocation(line: 84, column: 85, scope: !9) +!100 = !DILocation(line: 85, column: 25, scope: !9) +!101 = !DILocation(line: 85, column: 47, scope: !9) +!102 = !DILocation(line: 86, column: 49, scope: !9) +!103 = !DILocation(line: 86, column: 25, scope: !9) +!104 = !DILocation(line: 86, column: 85, scope: !9) +!105 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..60363dd33cfff03a49315ed55984571d8a030f42 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1673 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 114, 107, 47, 99, 114, 107, 120, 112, 119, 105, 119, 104, 122, 107, 118, 117, 110, 55, 105, 53, 100, 50, 112, 101, 103, 111, 102, 116, 104, 121, 102, 105, 106, 110, 53, 119, 121, 103, 119, 110, 97, 101, 118, 51, 116, 119, 119, 108, 114, 98, 117, 111, 106, 113, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<140>; + .reg .b32 %r<453>; + .reg .b64 %rd<107>; + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:24:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:26:21 + setp.lt.u32 %p2, %r1, 128; + .loc 1 27 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:27:38 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 15; + .loc 1 34 40 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:40 + shl.b32 %r14, %r1, 4; + .loc 1 34 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:37 + or.b32 %r15, %r3, %r14; + .loc 1 34 30 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:30 + mad.wide.s32 %rd12, %r15, 8, %rd15; + .loc 1 34 45 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + @%p2 ld.global.b64 { %rd11 }, [ %rd12 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd13, 0x0; + @%p2 ld.global.b64 { %rd13 }, [ %rd12 + 0 ]; + // end inline asm + .loc 1 39 18 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:39:18 + add.s64 %rd16, %rd11, -1; + setp.lt.u64 %p3, %rd16, 16383; + add.s64 %rd17, %rd13, -1; + setp.lt.u64 %p4, %rd17, 16383; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r16, 1, 0, %p3; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shr.u32 %r17, %r2, 1; + bfe.u32 %r18, %r2, 1, 1; + and.b32 %r19, %r2, 1; + shr.u32 %r20, %r2, 2; + bfe.u32 %r21, %r2, 2, 1; + shr.u32 %r22, %r2, 3; + bfe.u32 %r23, %r2, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r24, %r19, 1; + xor.b32 %r25, %r18, 1; + xor.b32 %r26, %r21, 1; + xor.b32 %r27, %r23, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r28, %r24, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r29, %r28, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r30, %r28, %r29; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r31, %r19, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r33, %r31, %r32; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r34, %r24, %r3; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r35, %r34, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r36, %r35, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r37, %r3, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r38, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r39, %r38, %r37; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p5, %r30, %r33; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p6, %r30, %r33; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p7, %r36, %r39; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p8, %p6, %p7; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p9, %p5, %p8; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r40, %r17, 1; + setp.ne.b32 %p10, %r40, 0; + xor.pred %p11, %p9, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r41, %r30, %r33; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r42, %r41, 0, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r43, %r42, %r16; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r44, %r39, %r36; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r45, %r44, 0, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r46, %r45, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r47, %r43, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r49, %r47, %r48; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r50, %r43, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r51, %r50, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r52, %r50, %r51; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r53, %r46, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r53, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r55, %r53, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r56, %r46, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r58, %r56, %r57; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r59, %r20, 1; + setp.ne.b32 %p12, %r59, 0; + setp.ge.s32 %p13, %r49, %r52; + setp.ne.b32 %p14, %r49, %r52; + setp.le.s32 %p15, %r55, %r58; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r60, %r49, %r52; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r61, 0, %r60, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r62, %r61, %r43; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r63, %r55, %r58; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r64, 0, %r63, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r65, %r64, %r46; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r66, %r62, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r68, %r66, %r67; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r69, %r62, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r71, %r69, %r70; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r72, %r65, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r74, %r72, %r73; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r75, %r65, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r77, %r75, %r76; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p19, %r68, %r71; + setp.ne.b32 %p20, %r68, %r71; + setp.le.s32 %p21, %r74, %r77; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r78, %r68, %r71; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r79, 0, %r78, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r80, %r79, %r62; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r81, %r74, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r82, 0, %r81, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r83, %r82, %r65; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r84, %r80, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r86, %r84, %r85; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r87, %r80, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r88, %r87, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r89, %r87, %r88; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r90, %r83, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r91, %r90, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r92, %r90, %r91; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r93, %r83, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r94, %r93, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r95, %r93, %r94; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.b32 %r96, %r22, 1; + setp.ne.b32 %p25, %r96, 0; + setp.ge.s32 %p26, %r86, %r89; + setp.ne.b32 %p27, %r86, %r89; + setp.le.s32 %p28, %r92, %r95; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r97, %r86, %r89; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r98, 0, %r97, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r99, %r98, %r80; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r100, %r92, %r95; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r101, 0, %r100, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r102, %r101, %r83; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r103, %r99, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r105, %r103, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r106, %r99, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r107, %r106, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r108, %r106, %r107; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r109, %r102, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r109, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r111, %r109, %r110; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r112, %r102, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r113, %r112, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r114, %r112, %r113; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p32, %r105, %r108; + setp.ne.b32 %p33, %r105, %r108; + setp.le.s32 %p34, %r111, %r114; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r115, %r105, %r108; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r116, 0, %r115, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r117, %r116, %r99; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r118, %r111, %r114; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r119, 0, %r118, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r120, %r119, %r102; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r121, %r117, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r123, %r121, %r122; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r124, %r117, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r125, %r124, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r126, %r124, %r125; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r127, %r120, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r129, %r127, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r130, %r120, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r132, %r130, %r131; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.ge.s32 %p38, %r123, %r126; + setp.ne.b32 %p39, %r123, %r126; + setp.le.s32 %p40, %r129, %r132; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r133, %r123, %r126; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r134, 0, %r133, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r135, %r134, %r117; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r136, %r129, %r132; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r137, 0, %r136, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r138, %r137, %r120; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r139, %r135, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r141, %r139, %r140; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r142, %r135, %r23; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r143, %r142, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r144, %r142, %r143; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r145, %r138, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r146, %r145, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r147, %r145, %r146; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r148, %r138, %r23; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r149, %r148, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r150, %r148, %r149; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p44, %r141, %r144; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p45, %r141, %r144; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p46, %r147, %r150; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r151, %r141, %r144; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r152, %r151, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r153, %r152, %r135; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r154, %r147, %r150; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r155, %r154, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r156, %r155, %r138; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r157, %r153, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r158, %r157, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r159, %r157, %r158; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r160, %r153, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r161, %r160, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r162, %r160, %r161; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r163, %r156, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r164, %r163, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r165, %r163, %r164; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r166, %r156, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r168, %r166, %r167; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p49, %r159, %r162; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p50, %r159, %r162; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p51, %r165, %r168; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r169, %r159, %r162; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r170, %r169, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r171, %r170, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r172, %r165, %r168; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r173, %r172, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r174, %r173, %r156; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r175, %r171, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r176, %r175, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r177, %r175, %r176; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r178, %r171, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r180, %r178, %r179; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r181, %r174, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r182, %r181, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r183, %r181, %r182; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r184, %r174, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r185, %r184, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r186, %r184, %r185; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p54, %r177, %r180; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p55, %r177, %r180; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p56, %r183, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r187, %r177, %r180; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r188, %r187, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r189, %r188, %r171; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r190, %r183, %r186; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r191, %r190, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r192, %r191, %r174; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r193, %r189, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r194, %r193, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r195, %r193, %r194; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r196, %r189, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r197, %r196, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r198, %r196, %r197; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r199, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r200, %r199, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r201, %r199, %r200; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + mul.lo.s32 %r202, %r192, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + shfl.sync.bfly.b32 %r203, %r202, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + add.s32 %r204, %r202, %r203; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.lt.s32 %p59, %r195, %r198; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.eq.b32 %p60, %r195, %r198; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + setp.gt.s32 %p61, %r201, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r205, %r201, %r204; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + selp.b32 %r206, %r205, 0, %p61; + selp.b32 %r207, %r206, 0, %p60; + selp.b32 %r208, %r205, %r207, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:46:71 ] + xor.b32 %r4, %r208, %r192; +$L__tmp2: + .loc 1 47 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:47:20 + setp.eq.b64 %p62, %rd11, 16384; + setp.eq.b64 %p63, %rd13, 16384; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + selp.b32 %r209, 1, 0, %p62; +$L__tmp3: + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r210, %r24, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r212, %r211, %r210; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r213, %r19, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r214, %r213, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r215, %r214, %r213; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p64, %r212, %r215; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p65, %r212, %r215; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p66, %p7, %p65; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p67, %p64, %p66; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.pred %p68, %p67, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r216, %r215, %r212; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r217, %r216, 0, %p68; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r218, %r217, %r209; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r219, %r44, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r220, %r219, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r221, %r218, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r222, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r223, %r221, %r222; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r224, %r218, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r226, %r224, %r225; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r227, %r220, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r228, %r227, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r229, %r227, %r228; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r230, %r220, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r232, %r230, %r231; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p69, %r223, %r226; + setp.ne.b32 %p70, %r223, %r226; + setp.le.s32 %p71, %r229, %r232; + or.pred %p72, %p70, %p71; + and.pred %p73, %p69, %p72; + xor.pred %p74, %p73, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r233, %r223, %r226; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r234, 0, %r233, %p74; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r235, %r234, %r218; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r236, %r229, %r232; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r237, 0, %r236, %p74; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r238, %r237, %r220; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r239, %r235, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r240, %r239, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r241, %r239, %r240; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r242, %r235, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r244, %r242, %r243; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r245, %r238, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r246, %r245, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r247, %r245, %r246; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r248, %r238, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r249, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r250, %r248, %r249; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p75, %r241, %r244; + setp.ne.b32 %p76, %r241, %r244; + setp.le.s32 %p77, %r247, %r250; + or.pred %p78, %p76, %p77; + and.pred %p79, %p75, %p78; + xor.pred %p80, %p79, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r251, %r241, %r244; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r252, 0, %r251, %p80; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r253, %r252, %r235; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r254, %r247, %r250; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r255, 0, %r254, %p80; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r256, %r255, %r238; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r257, %r253, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r258, %r257, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r259, %r257, %r258; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r260, %r253, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r261, %r260, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r262, %r260, %r261; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r263, %r256, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r264, %r263, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r265, %r263, %r264; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r266, %r256, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r267, %r266, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r268, %r266, %r267; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p81, %r259, %r262; + setp.ne.b32 %p82, %r259, %r262; + setp.le.s32 %p83, %r265, %r268; + or.pred %p84, %p82, %p83; + and.pred %p85, %p81, %p84; + xor.pred %p86, %p85, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r269, %r259, %r262; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r270, 0, %r269, %p86; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r271, %r270, %r253; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r272, %r265, %r268; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r273, 0, %r272, %p86; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r274, %r273, %r256; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r275, %r271, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r276, %r275, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r277, %r275, %r276; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r278, %r271, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r280, %r278, %r279; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r281, %r274, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r282, %r281, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r283, %r281, %r282; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r284, %r274, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r285, %r284, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r286, %r284, %r285; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p87, %r277, %r280; + setp.ne.b32 %p88, %r277, %r280; + setp.le.s32 %p89, %r283, %r286; + or.pred %p90, %p88, %p89; + and.pred %p91, %p87, %p90; + xor.pred %p92, %p91, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r287, %r277, %r280; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r288, 0, %r287, %p92; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r289, %r288, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r290, %r283, %r286; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r291, 0, %r290, %p92; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r292, %r291, %r274; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r293, %r289, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r294, %r293, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r295, %r293, %r294; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r296, %r289, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r297, %r296, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r298, %r296, %r297; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r299, %r292, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r300, %r299, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r301, %r299, %r300; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r302, %r292, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r303, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r304, %r302, %r303; + .loc 2 599 28 // triton_helpers.py:599:28 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.ge.s32 %p93, %r295, %r298; + setp.ne.b32 %p94, %r295, %r298; + setp.le.s32 %p95, %r301, %r304; + or.pred %p96, %p94, %p95; + and.pred %p97, %p93, %p96; + xor.pred %p98, %p97, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r305, %r295, %r298; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r306, 0, %r305, %p98; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r307, %r306, %r289; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r308, %r301, %r304; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r309, 0, %r308, %p98; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r310, %r309, %r292; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r311, %r307, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r312, %r311, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r313, %r311, %r312; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r314, %r307, %r23; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r315, %r314, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r316, %r314, %r315; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r317, %r310, %r27; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r318, %r317, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r319, %r317, %r318; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r320, %r310, %r23; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r321, %r320, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r322, %r320, %r321; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p99, %r313, %r316; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p100, %r313, %r316; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p101, %r319, %r322; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p102, %p100, %p101; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p103, %p99, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r323, %r313, %r316; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r324, %r323, 0, %p103; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r325, %r324, %r307; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r326, %r319, %r322; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r327, %r326, 0, %p103; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r328, %r327, %r310; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r329, %r325, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r330, %r329, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r331, %r329, %r330; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r332, %r325, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r332, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r334, %r332, %r333; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r335, %r328, %r26; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r336, %r335, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r337, %r335, %r336; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r338, %r328, %r21; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r338, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r340, %r338, %r339; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p104, %r331, %r334; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p105, %r331, %r334; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p106, %r337, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p107, %p105, %p106; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p108, %p104, %p107; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r341, %r331, %r334; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r342, %r341, 0, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r343, %r342, %r325; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r344, %r337, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r345, %r344, 0, %p108; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r346, %r345, %r328; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r347, %r343, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r348, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r349, %r347, %r348; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r350, %r343, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r350, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r352, %r350, %r351; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r353, %r346, %r25; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r354, %r353, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r355, %r353, %r354; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r356, %r346, %r18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r357, %r356, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + add.s32 %r358, %r356, %r357; + .loc 2 574 22 // triton_helpers.py:574:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.lt.s32 %p109, %r349, %r352; + .loc 2 591 21 // triton_helpers.py:591:21 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.eq.b32 %p110, %r349, %r352; + .loc 2 594 40 // triton_helpers.py:594:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + setp.gt.s32 %p111, %r355, %r358; + .loc 2 594 29 // triton_helpers.py:594:29 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + and.pred %p112, %p110, %p111; + .loc 2 594 23 // triton_helpers.py:594:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + or.pred %p113, %p109, %p112; + .loc 2 600 38 // triton_helpers.py:600:38 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r359, %r349, %r352; + .loc 2 600 46 // triton_helpers.py:600:46 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r360, %r359, 0, %p113; + .loc 2 600 15 // triton_helpers.py:600:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r361, %r360, %r343; + .loc 2 601 48 // triton_helpers.py:601:48 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r362, %r355, %r358; + .loc 2 601 59 // triton_helpers.py:601:59 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + selp.b32 %r363, %r362, 0, %p113; + .loc 2 601 22 // triton_helpers.py:601:22 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + xor.b32 %r364, %r363, %r346; + .loc 2 538 40 // triton_helpers.py:538:40 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r365, %r361, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r366, %r365, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r368, %r361, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r368, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r371, %r364, %r24; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r372, %r371, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + mul.lo.s32 %r374, %r364, %r19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r374, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:54:35 + and.pred %p117, %p2, %p3; + selp.b64 %rd18, 1, 0, %p117; + and.pred %p118, %p2, %p4; + selp.b64 %rd19, 1, 0, %p118; +$L__tmp5: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + selp.b32 %r381, 1, 0, %p117; + shfl.sync.bfly.b32 %r382, %r381, 8, 31, -1; + mov.b32 %r383, 0; + shfl.sync.bfly.b32 %r384, %r383, 8, 31, -1; + cvt.u64.u32 %rd20, %r382; + cvt.u64.u32 %rd21, %r384; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd24, %rd23, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r385}, %rd24; + cvt.u32.u64 %r386, %rd24; + shfl.sync.bfly.b32 %r387, %r386, 4, 31, -1; + shfl.sync.bfly.b32 %r388, %r385, 4, 31, -1; + cvt.u64.u32 %rd25, %r387; + cvt.u64.u32 %rd26, %r388; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r389}, %rd29; + cvt.u32.u64 %r390, %rd29; + shfl.sync.bfly.b32 %r391, %r390, 2, 31, -1; + shfl.sync.bfly.b32 %r392, %r389, 2, 31, -1; + cvt.u64.u32 %rd30, %r391; + cvt.u64.u32 %rd31, %r392; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r393}, %rd34; + cvt.u32.u64 %r394, %rd34; + shfl.sync.bfly.b32 %r395, %r394, 1, 31, -1; + shfl.sync.bfly.b32 %r396, %r393, 1, 31, -1; + cvt.u64.u32 %rd35, %r395; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd36, %rd34, %rd35; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + selp.b32 %r397, 1, 0, %p118; + shfl.sync.bfly.b32 %r398, %r397, 8, 31, -1; + shfl.sync.bfly.b32 %r399, %r383, 8, 31, -1; + cvt.u64.u32 %rd37, %r398; + cvt.u64.u32 %rd38, %r399; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd41, %rd40, %rd19; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r400}, %rd41; + cvt.u32.u64 %r401, %rd41; + shfl.sync.bfly.b32 %r402, %r401, 4, 31, -1; + shfl.sync.bfly.b32 %r403, %r400, 4, 31, -1; + cvt.u64.u32 %rd42, %r402; + cvt.u64.u32 %rd43, %r403; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd46, %rd41, %rd45; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r404}, %rd46; + cvt.u32.u64 %r405, %rd46; + shfl.sync.bfly.b32 %r406, %r405, 2, 31, -1; + shfl.sync.bfly.b32 %r407, %r404, 2, 31, -1; + cvt.u64.u32 %rd47, %r406; + cvt.u64.u32 %rd48, %r407; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd2, %rd46, %rd50; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + mov.b64 {_, %r408}, %rd2; + cvt.u32.u64 %r409, %rd2; + shfl.sync.bfly.b32 %r6, %r409, 1, 31, -1; + shfl.sync.bfly.b32 %r7, %r408, 1, 31, -1; +$L__tmp6: + .loc 1 58 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:58:35 + and.pred %p119, %p2, %p62; + selp.b64 %rd51, 1, 0, %p119; + and.pred %p120, %p2, %p63; + selp.b64 %rd52, 1, 0, %p120; +$L__tmp7: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + selp.b32 %r410, 1, 0, %p119; + shfl.sync.bfly.b32 %r411, %r410, 8, 31, -1; + shfl.sync.bfly.b32 %r412, %r383, 8, 31, -1; + cvt.u64.u32 %rd53, %r411; + cvt.u64.u32 %rd54, %r412; + shl.b64 %rd55, %rd54, 32; + or.b64 %rd56, %rd53, %rd55; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd57, %rd56, %rd51; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r413}, %rd57; + cvt.u32.u64 %r414, %rd57; + shfl.sync.bfly.b32 %r415, %r414, 4, 31, -1; + shfl.sync.bfly.b32 %r416, %r413, 4, 31, -1; + cvt.u64.u32 %rd58, %r415; + cvt.u64.u32 %rd59, %r416; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd62, %rd57, %rd61; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r417}, %rd62; + cvt.u32.u64 %r418, %rd62; + shfl.sync.bfly.b32 %r419, %r418, 2, 31, -1; + shfl.sync.bfly.b32 %r420, %r417, 2, 31, -1; + cvt.u64.u32 %rd63, %r419; + cvt.u64.u32 %rd64, %r420; + shl.b64 %rd65, %rd64, 32; + or.b64 %rd66, %rd63, %rd65; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd3, %rd62, %rd66; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r421}, %rd3; + cvt.u32.u64 %r422, %rd3; + shfl.sync.bfly.b32 %r8, %r422, 1, 31, -1; + shfl.sync.bfly.b32 %r9, %r421, 1, 31, -1; + selp.b32 %r423, 1, 0, %p120; + shfl.sync.bfly.b32 %r424, %r423, 8, 31, -1; + shfl.sync.bfly.b32 %r425, %r383, 8, 31, -1; + cvt.u64.u32 %rd67, %r424; + cvt.u64.u32 %rd68, %r425; + shl.b64 %rd69, %rd68, 32; + or.b64 %rd70, %rd67, %rd69; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd71, %rd70, %rd52; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r426}, %rd71; + cvt.u32.u64 %r427, %rd71; + shfl.sync.bfly.b32 %r428, %r427, 4, 31, -1; + shfl.sync.bfly.b32 %r429, %r426, 4, 31, -1; + cvt.u64.u32 %rd72, %r428; + cvt.u64.u32 %rd73, %r429; + shl.b64 %rd74, %rd73, 32; + or.b64 %rd75, %rd72, %rd74; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd76, %rd71, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r430}, %rd76; + cvt.u32.u64 %r431, %rd76; + shfl.sync.bfly.b32 %r432, %r431, 2, 31, -1; + shfl.sync.bfly.b32 %r433, %r430, 2, 31, -1; + cvt.u64.u32 %rd77, %r432; + cvt.u64.u32 %rd78, %r433; + shl.b64 %rd79, %rd78, 32; + or.b64 %rd80, %rd77, %rd79; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd4, %rd76, %rd80; + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + mov.b64 {_, %r434}, %rd4; + cvt.u32.u64 %r435, %rd4; + shfl.sync.bfly.b32 %r10, %r435, 1, 31, -1; + shfl.sync.bfly.b32 %r11, %r434, 1, 31, -1; +$L__tmp8: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r436, %rd36; + .loc 1 64 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:64:19 + setp.lt.s32 %p121, %r3, %r436; + .loc 1 66 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:66:35 + selp.b32 %r437, %r4, 16, %p121; + .loc 1 68 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:68:20 + add.s32 %r438, %r437, 17; + .loc 1 69 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:69:20 + setp.lt.s32 %p122, %r437, 0; + .loc 1 70 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:70:35 + selp.b32 %r12, %r438, %r437, %p122; + .loc 1 71 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:38 + setp.lt.u32 %p123, %r12, 17; + .loc 1 0 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 + setp.gt.u32 %p124, %r1, 127; + .loc 1 71 53 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:53 + or.pred %p125, %p124, %p123; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + @%p125 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: +$L__tmp9: + .loc 1 18 0 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:18 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0 ] + add.s32 %r367, %r365, %r366; + add.s32 %r370, %r368, %r369; + add.s32 %r373, %r371, %r372; + add.s32 %r376, %r374, %r375; + setp.lt.s32 %p114, %r367, %r370; + setp.eq.b32 %p115, %r367, %r370; + setp.gt.s32 %p116, %r373, %r376; + xor.b32 %r377, %r373, %r376; + selp.b32 %r378, %r377, 0, %p116; + selp.b32 %r379, %r378, 0, %p115; + selp.b32 %r380, %r377, %r379, %p114; + xor.b32 %r5, %r380, %r364; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + cvt.u64.u32 %rd87, %r8; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd88, %rd3, %rd87; +$L__tmp11: + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + cvt.u32.u64 %r439, %rd88; + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + bar.sync 0; + .loc 1 75 19 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:75:19 + setp.lt.s32 %p127, %r3, %r439; + .loc 1 76 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:76:35 + selp.b32 %r440, %r5, 16, %p127; + .loc 1 77 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:77:20 + add.s32 %r441, %r440, 17; + .loc 1 78 20 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:78:20 + setp.lt.s32 %p128, %r440, 0; + .loc 1 79 35 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:79:35 + selp.b32 %r13, %r441, %r440, %p128; + .loc 1 80 38 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:38 + setp.lt.u32 %p129, %r13, 17; + .loc 1 80 53 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:53 + or.pred %p130, %p124, %p129; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + @%p130 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r15; +$L__tmp12: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + cvt.u64.u32 %rd101, %r10; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:59:26 ] + add.s64 %rd102, %rd4, %rd101; +$L__tmp13: + .loc 1 61 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:61:21 + cvt.u32.u64 %r443, %rd102; +$L__tmp14: + .loc 3 291 36 // standard.py:291:36 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + cvt.u64.u32 %rd103, %r6; + .loc 3 261 15 // standard.py:261:15 @[ crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:55:26 ] + add.s64 %rd104, %rd2, %rd103; +$L__tmp15: + .loc 1 60 21 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:60:21 + cvt.u32.u64 %r442, %rd104; + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + bar.sync 0; + .loc 1 81 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:25 + mul.wide.u32 %rd105, %r1, 4; + add.s64 %rd95, %rd5, %rd105; + .loc 1 81 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:81:37 + and.b32 %r448, %r2, 63; + setp.eq.b32 %p138, %r448, 0; + and.pred %p131, %p2, %p138; + // begin inline asm + @%p131 st.global.b32 [ %rd95 + 0 ], { %r442 }; + // end inline asm + .loc 1 82 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:25 + add.s64 %rd96, %rd6, %rd105; + .loc 1 82 37 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:82:37 + // begin inline asm + @%p131 st.global.b32 [ %rd96 + 0 ], { %r443 }; + // end inline asm + .loc 1 83 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:25 + shl.b64 %rd106, %rd1, 2; + add.s64 %rd97, %rd7, %rd106; + .loc 1 83 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:83:47 + and.b32 %r449, %r2, 48; + setp.eq.b32 %p139, %r449, 0; + and.pred %p133, %p2, %p139; + // begin inline asm + @%p133 st.global.b32 [ %rd97 + 0 ], { %r4 }; + // end inline asm + .loc 1 84 52 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:52 + mul.lo.s32 %r450, %r1, 17; + .loc 1 84 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:49 + add.s32 %r451, %r12, %r450; + .loc 1 84 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:25 + mad.wide.s32 %rd98, %r451, 4, %rd8; + mov.b32 %r445, 1; + .loc 1 84 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:84:85 + // begin inline asm + @%p133 st.global.b32 [ %rd98 + 0 ], { %r445 }; + // end inline asm + .loc 1 85 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:25 + add.s64 %rd99, %rd9, %rd106; + .loc 1 85 47 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:85:47 + // begin inline asm + @%p133 st.global.b32 [ %rd99 + 0 ], { %r5 }; + // end inline asm + .loc 1 86 49 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:49 + add.s32 %r452, %r13, %r450; + .loc 1 86 25 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:25 + mad.wide.s32 %rd100, %r452, 4, %rd10; + .loc 1 86 85 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:85 + // begin inline asm + @%p133 st.global.b32 [ %rd100 + 0 ], { %r445 }; + // end inline asm + .loc 1 86 4 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:71:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd81, assertFunc_0; + cvta.global.u64 %rd82, %rd81; + st.param.b64 [param3], %rd82; + mov.b64 %rd83, assertFile_0; + cvta.global.u64 %rd84, %rd83; + st.param.b64 [param1], %rd84; + mov.b64 %rd85, assertMessage_0; + cvta.global.u64 %rd86, %rd85; + st.param.b64 [param0], %rd86; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_3: + .loc 1 80 63 // crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py:80:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd89, assertFunc_1; + cvta.global.u64 %rd90, %rd89; + st.param.b64 [param3], %rd90; + mov.b64 %rd91, assertFile_1; + cvta.global.u64 %rd92, %rd91; + st.param.b64 [param1], %rd92; + mov.b64 %rd93, assertMessage_1; + cvta.global.u64 %rd94, %rd93; + st.param.b64 [param0], %rd94; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__tmp16: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 399 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x188 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 114 +.b8 107 +.b8 120 +.b8 112 +.b8 119 +.b8 105 +.b8 119 +.b8 104 +.b8 122 +.b8 107 +.b8 118 +.b8 117 +.b8 110 +.b8 55 +.b8 105 +.b8 53 +.b8 100 +.b8 50 +.b8 112 +.b8 101 +.b8 103 +.b8 111 +.b8 102 +.b8 116 +.b8 104 +.b8 121 +.b8 102 +.b8 105 +.b8 106 +.b8 110 +.b8 53 +.b8 119 +.b8 121 +.b8 103 +.b8 119 +.b8 110 +.b8 97 +.b8 101 +.b8 118 +.b8 51 +.b8 116 +.b8 119 +.b8 119 +.b8 108 +.b8 114 +.b8 98 +.b8 117 +.b8 111 +.b8 106 +.b8 113 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 114 +.b8 107 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x8d DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp15 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17a:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..8e30c8fc6b9a182021a63e5f408449c63cc6ad2c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1397 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc140 = loc(unknown) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc192 = loc("in_ptr0"(#loc)) +#loc193 = loc("out_ptr4"(#loc)) +#loc194 = loc("out_ptr5"(#loc)) +#loc195 = loc("out_ptr6"(#loc)) +#loc196 = loc("out_ptr7"(#loc)) +#loc197 = loc("out_ptr8"(#loc)) +#loc198 = loc("out_ptr9"(#loc)) +#loc199 = loc("xnumel"(#loc)) +#loc200 = loc("r0_numel"(#loc)) +#loc255 = loc("x"(#loc90)) +#loc256 = loc("idxs"(#loc90)) +#loc257 = loc("x"(#loc94)) +#loc258 = loc("idxs"(#loc94)) +#loc263 = loc("x"(#loc102)) +#loc264 = loc("idxs"(#loc102)) +#loc265 = loc("flip"(#loc102)) +#loc321 = loc("input"(#loc165)) +#loc322 = loc("a"(#loc169)) +#loc323 = loc("b"(#loc169)) +#loc325 = loc("x"(#loc174)) +#loc326 = loc("x"(#loc178)) +#loc327 = loc("input"(#loc187)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc201) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc202) + %xoffset = tt.get_program_id x : i32 loc(#loc203) + %xoffset_2 = arith.constant 1 : i32 loc(#loc204) + %xoffset_3 = arith.constant 1 : i32 loc(#loc204) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc204) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc205) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc206) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc207) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc207) + %xmask = arith.constant dense<128> : tensor<1x1xi32> loc(#loc208) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc208) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc209) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc210) + %r0_offset = arith.constant 0 : i32 loc(#loc211) + %r0_mask = arith.constant true loc(#loc212) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc212) + %tmp0 = arith.constant 16 : i32 loc(#loc213) + %tmp0_11 = arith.constant 16 : i32 loc(#loc213) + %tmp0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc213) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<1x1xi32> loc(#loc213) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc214) + %tmp0_15 = arith.addi %r0_index_9, %tmp0_14 : tensor<1x16xi32> loc(#loc214) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc215) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc215) + %tmp0_18 = arith.constant 0.000000e+00 : f32 loc(#loc216) + %tmp0_19 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc216) + %tmp0_20 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc216) + %tmp0_21 = arith.fptosi %tmp0_20 : tensor<1x16xf32> to tensor<1x16xi64> loc(#loc216) + %tmp0_22 = tt.load %tmp0_17, %tmp0_19, %tmp0_21 : tensor<1x16x!tt.ptr> loc(#loc216) + %tmp1 = arith.constant 0 : i64 loc(#loc217) + %tmp1_23 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc217) + %tmp2 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc218) + %tmp2_24 = arith.cmpi sgt, %tmp0_22, %tmp2 : tensor<1x16xi64> loc(#loc218) + %tmp3 = arith.constant 16384 : i64 loc(#loc219) + %tmp3_25 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc219) + %tmp4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc220) + %tmp4_26 = arith.cmpi slt, %tmp0_22, %tmp4 : tensor<1x16xi64> loc(#loc220) + %tmp5 = arith.andi %tmp2_24, %tmp4_26 : tensor<1x16xi1> loc(#loc221) + %tmp6 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc222) + %tmp7 = arith.extsi %tmp6 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc223) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc224) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc25) + %tmp14 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc225) + %tmp14_27 = arith.cmpi eq, %tmp0_22, %tmp14 : tensor<1x16xi64> loc(#loc225) + %tmp15 = arith.extui %tmp14_27 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc226) + %tmp16 = arith.extsi %tmp15 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc227) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc29) + %tmp20 = arith.extsi %tmp7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc228) + %tmp23 = arith.constant 0 : i32 loc(#loc229) + %tmp23_28 = arith.constant 0 : i64 loc(#loc229) + %tmp23_29 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc229) + %tmp23_30 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc229) + %tmp23_31 = arith.select %tmp23_30, %tmp20, %tmp23_29 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc229) + %tmp24 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_31) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc230) + %tmp24_32 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc231) + %tmp25 = arith.extsi %tmp16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc232) + %tmp28 = arith.constant 0 : i32 loc(#loc233) + %tmp28_33 = arith.constant 0 : i64 loc(#loc233) + %tmp28_34 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc233) + %tmp28_35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc233) + %tmp28_36 = arith.select %tmp28_35, %tmp25, %tmp28_34 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc233) + %tmp29 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_36) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc234) + %tmp29_37 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc235) + %tmp30 = arith.trunci %tmp24_32 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc236) + %tmp31 = arith.trunci %tmp29_37 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc237) + %tmp32 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc238) + %tmp33 = arith.trunci %tmp32 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc239) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc240) + %tmp34_38 = arith.cmpi slt, %r0_index_9, %tmp34 : tensor<1x16xi32> loc(#loc240) + %tmp35 = arith.constant 16 : i32 loc(#loc241) + %tmp35_39 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc241) + %tmp36 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc242) + %tmp36_40 = arith.select %tmp34_38, %tmp33, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc242) + %tmp37 = arith.constant 17 : i32 loc(#loc243) + %tmp37_41 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc243) + %tmp38 = arith.addi %tmp36_40, %tmp37_41 : tensor<1x16xi32> loc(#loc244) + %tmp39 = arith.constant 0 : i32 loc(#loc245) + %tmp39_42 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc245) + %tmp39_43 = arith.cmpi slt, %tmp36_40, %tmp39_42 : tensor<1x16xi32> loc(#loc245) + %tmp40 = arith.select %tmp39_43, %tmp38, %tmp36_40 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc246) + %c0_i32 = arith.constant 0 : i32 loc(#loc49) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc49) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<1x16xi32> loc(#loc49) + %c17_i32 = arith.constant 17 : i32 loc(#loc50) + %cst_44 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc50) + %3 = arith.cmpi slt, %tmp40, %cst_44 : tensor<1x16xi32> loc(#loc50) + %4 = arith.andi %2, %3 : tensor<1x16xi1> loc(#loc51) + %true = arith.constant true loc(#loc52) + %cst_45 = arith.constant dense : tensor<1x1xi1> loc(#loc52) + %5 = arith.xori %xmask_8, %cst_45 : tensor<1x1xi1> loc(#loc52) + %6 = tt.broadcast %5 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc53) + %7 = arith.ori %4, %6 : tensor<1x16xi1> loc(#loc53) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc54) + %tmp42 = arith.constant 1 : i32 loc(#loc247) + %tmp42_46 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc247) + %tmp43 = arith.extsi %1#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc248) + %tmp44 = arith.trunci %tmp43 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc249) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc250) + %tmp45_47 = arith.cmpi slt, %r0_index_9, %tmp45 : tensor<1x16xi32> loc(#loc250) + %tmp46 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc251) + %tmp46_48 = arith.select %tmp45_47, %tmp44, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc251) + %tmp47 = arith.addi %tmp46_48, %tmp37_41 : tensor<1x16xi32> loc(#loc252) + %tmp48 = arith.constant 0 : i32 loc(#loc253) + %tmp48_49 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc253) + %tmp48_50 = arith.cmpi slt, %tmp46_48, %tmp48_49 : tensor<1x16xi32> loc(#loc253) + %tmp49 = arith.select %tmp48_50, %tmp47, %tmp46_48 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %c0_i32_51 = arith.constant 0 : i32 loc(#loc63) + %cst_52 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc63) + %8 = arith.cmpi sle, %cst_52, %tmp49 : tensor<1x16xi32> loc(#loc63) + %c17_i32_53 = arith.constant 17 : i32 loc(#loc64) + %cst_54 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc64) + %9 = arith.cmpi slt, %tmp49, %cst_54 : tensor<1x16xi32> loc(#loc64) + %10 = arith.andi %8, %9 : tensor<1x16xi1> loc(#loc65) + %true_55 = arith.constant true loc(#loc66) + %cst_56 = arith.constant dense : tensor<1x1xi1> loc(#loc66) + %11 = arith.xori %xmask_8, %cst_56 : tensor<1x1xi1> loc(#loc66) + %12 = tt.broadcast %11 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc67) + %13 = arith.ori %10, %12 : tensor<1x16xi1> loc(#loc67) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc68) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc69) + %15 = tt.addptr %14, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc69) + tt.store %15, %tmp30, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc70) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc71) + %17 = tt.addptr %16, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc71) + tt.store %17, %tmp31, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc72) + %c16_i32 = arith.constant 16 : i32 loc(#loc73) + %c16_i32_57 = arith.constant 16 : i32 loc(#loc73) + %cst_58 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc73) + %18 = arith.muli %cst_58, %xindex_7 : tensor<1x1xi32> loc(#loc73) + %19 = tt.broadcast %18 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc74) + %20 = arith.addi %r0_index_9, %19 : tensor<1x16xi32> loc(#loc74) + %21 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc75) + %22 = tt.addptr %21, %20 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc75) + %23 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc76) + tt.store %22, %tmp33, %23 : tensor<1x16x!tt.ptr> loc(#loc76) + %c17_i32_59 = arith.constant 17 : i32 loc(#loc77) + %c17_i32_60 = arith.constant 17 : i32 loc(#loc77) + %cst_61 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc77) + %24 = arith.muli %cst_61, %xindex_7 : tensor<1x1xi32> loc(#loc77) + %25 = tt.broadcast %24 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc78) + %26 = arith.addi %tmp40, %25 : tensor<1x16xi32> loc(#loc78) + %27 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc79) + %28 = tt.addptr %27, %26 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc79) + %cst_62 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc80) + %29 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc80) + tt.store %28, %cst_62, %29 : tensor<1x16x!tt.ptr> loc(#loc80) + %c16_i32_63 = arith.constant 16 : i32 loc(#loc81) + %c16_i32_64 = arith.constant 16 : i32 loc(#loc81) + %cst_65 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc81) + %30 = arith.muli %cst_65, %xindex_7 : tensor<1x1xi32> loc(#loc81) + %31 = tt.broadcast %30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc82) + %32 = arith.addi %r0_index_9, %31 : tensor<1x16xi32> loc(#loc82) + %33 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc83) + %34 = tt.addptr %33, %32 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc83) + %35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc84) + tt.store %34, %tmp44, %35 : tensor<1x16x!tt.ptr> loc(#loc84) + %c17_i32_66 = arith.constant 17 : i32 loc(#loc85) + %c17_i32_67 = arith.constant 17 : i32 loc(#loc85) + %cst_68 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc85) + %36 = arith.muli %cst_68, %xindex_7 : tensor<1x1xi32> loc(#loc85) + %37 = tt.broadcast %36 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc86) + %38 = arith.addi %tmp49, %37 : tensor<1x16xi32> loc(#loc86) + %39 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc87) + %40 = tt.addptr %39, %38 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc87) + %cst_69 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc88) + %41 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc88) + tt.store %40, %cst_69, %41 : tensor<1x16x!tt.ptr> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc90)), %idxs: tensor<1x16xi16> loc("idxs"(#loc90))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc92) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc93) + %5 = ub.poison : tensor<1x16xi32> loc(#loc93) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc93) + } loc(#loc90) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi16> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc101) + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi16> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc280) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc281) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc282) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc282) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc286) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc287) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc287) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc298) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc298) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc299) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc304) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc305) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc306) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc307) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc308) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc309) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc310) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc311) + %cond_38 = arith.constant 0 : i32 loc(#loc312) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc317) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc318) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc319) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc320) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc169)), %b: i32 loc("b"(#loc169))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc170) + tt.return %0 : i32 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc172) + tt.return %1 : i32 loc(#loc172) + } loc(#loc169) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc324) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc174))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc175) + %false = arith.constant false loc(#loc176) + tt.return %false : i1 loc(#loc176) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc177) + tt.return %1 : i1 loc(#loc177) + } loc(#loc174) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc178))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc179) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc180) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc180) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc180) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc180) + tt.return %4 : tensor<1x16xi32> loc(#loc181) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc182) + tt.return %5 : tensor<1x16xi32> loc(#loc182) + } loc(#loc178) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc184) + %cst = arith.constant dense : tensor<1xi1> loc(#loc184) + tt.return %cst : tensor<1xi1> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc186) + tt.return %0 : tensor<1xi1> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc187))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc188) + tt.return %0 : tensor<1x16xi32> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc190) + tt.return %1 : tensor<1x16xi32> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc184) + tt.return %cst : tensor<1x16xi32> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc186) + tt.return %0 : tensor<1x16xi32> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc187))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc188) + tt.return %0 : tensor<1x16xi16> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc190) + tt.return %1 : tensor<1x16xi16> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc184) + tt.return %cst : tensor<1x16xi16> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc186) + tt.return %0 : tensor<1x16xi16> loc(#loc186) + } loc(#loc183) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc165))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc166) + tt.return %0 : tensor<4x2xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc168) + tt.return %1 : tensor<4x2xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc165))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc166) + tt.return %0 : tensor<2x4xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc168) + tt.return %1 : tensor<2x4xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc328) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + %5 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc165))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc166) + tt.return %0 : tensor<1x8xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc168) + tt.return %1 : tensor<1x8xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc165))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc166) + tt.reduce.return %2 : i64 loc(#loc166) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc166) + tt.return %0 : tensor<1xi64> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc168) + tt.return %1 : tensor<1xi64> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc169)), %b: i64 loc("b"(#loc169))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc170) + tt.return %0 : i64 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc172) + tt.return %1 : i64 loc(#loc172) + } loc(#loc169) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":62:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":63:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":65:32) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":67:44) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":72:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":73:21) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":74:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:55) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:32) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:52) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc201 = loc("xnumel"(#loc1)) +#loc202 = loc("r0_numel"(#loc2)) +#loc203 = loc("xoffset"(#loc3)) +#loc204 = loc("xoffset"(#loc4)) +#loc205 = loc("xindex"(#loc5)) +#loc206 = loc("xindex"(#loc6)) +#loc207 = loc("xindex"(#loc7)) +#loc208 = loc("xmask"(#loc8)) +#loc209 = loc("r0_index"(#loc9)) +#loc210 = loc("r0_index"(#loc10)) +#loc211 = loc("r0_offset"(#loc11)) +#loc212 = loc("r0_mask"(#loc12)) +#loc213 = loc("tmp0"(#loc13)) +#loc214 = loc("tmp0"(#loc14)) +#loc215 = loc("tmp0"(#loc15)) +#loc216 = loc("tmp0"(#loc16)) +#loc217 = loc("tmp1"(#loc17)) +#loc218 = loc("tmp2"(#loc18)) +#loc219 = loc("tmp3"(#loc19)) +#loc220 = loc("tmp4"(#loc20)) +#loc221 = loc("tmp5"(#loc21)) +#loc222 = loc("tmp6"(#loc22)) +#loc223 = loc("tmp7"(#loc23)) +#loc224 = loc("tmp9"(#loc24)) +#loc225 = loc("tmp14"(#loc26)) +#loc226 = loc("tmp15"(#loc27)) +#loc227 = loc("tmp16"(#loc28)) +#loc228 = loc("tmp20"(#loc30)) +#loc229 = loc("tmp23"(#loc31)) +#loc230 = loc("tmp24"(#loc32)) +#loc231 = loc("tmp24"(#loc33)) +#loc232 = loc("tmp25"(#loc34)) +#loc233 = loc("tmp28"(#loc35)) +#loc234 = loc("tmp29"(#loc36)) +#loc235 = loc("tmp29"(#loc37)) +#loc236 = loc("tmp30"(#loc38)) +#loc237 = loc("tmp31"(#loc39)) +#loc238 = loc("tmp32"(#loc40)) +#loc239 = loc("tmp33"(#loc41)) +#loc240 = loc("tmp34"(#loc42)) +#loc241 = loc("tmp35"(#loc43)) +#loc242 = loc("tmp36"(#loc44)) +#loc243 = loc("tmp37"(#loc45)) +#loc244 = loc("tmp38"(#loc46)) +#loc245 = loc("tmp39"(#loc47)) +#loc246 = loc("tmp40"(#loc48)) +#loc247 = loc("tmp42"(#loc55)) +#loc248 = loc("tmp43"(#loc56)) +#loc249 = loc("tmp44"(#loc57)) +#loc250 = loc("tmp45"(#loc58)) +#loc251 = loc("tmp46"(#loc59)) +#loc252 = loc("tmp47"(#loc60)) +#loc253 = loc("tmp48"(#loc61)) +#loc254 = loc("tmp49"(#loc62)) +#loc259 = loc("flip"(#loc95)) +#loc260 = loc("flip"(#loc96)) +#loc261 = loc("flip"(#loc97)) +#loc262 = loc("flip"(#loc98)) +#loc266 = loc("y"(#loc103)) +#loc267 = loc("right_mask"(#loc104)) +#loc268 = loc("right_mask"(#loc105)) +#loc269 = loc("left_mask"(#loc106)) +#loc270 = loc("ileft"(#loc107)) +#loc271 = loc("ileft"(#loc108)) +#loc272 = loc("ileft"(#loc109)) +#loc273 = loc("ileft"(#loc110)) +#loc274 = loc("iright"(#loc111)) +#loc275 = loc("iright"(#loc112)) +#loc276 = loc("iright"(#loc113)) +#loc277 = loc("iright"(#loc114)) +#loc278 = loc("ileft"(#loc115)) +#loc279 = loc("iright"(#loc116)) +#loc280 = loc("y_idx"(#loc117)) +#loc281 = loc("left_idx"(#loc118)) +#loc282 = loc("left_idx"(#loc119)) +#loc283 = loc("left_idx"(#loc120)) +#loc284 = loc("left_idx"(#loc121)) +#loc285 = loc("left_idx"(#loc122)) +#loc286 = loc("right_idx"(#loc123)) +#loc287 = loc("right_idx"(#loc124)) +#loc288 = loc("right_idx"(#loc125)) +#loc289 = loc("right_idx"(#loc126)) +#loc290 = loc("right_idx"(#loc127)) +#loc291 = loc("left_idx"(#loc128)) +#loc292 = loc("right_idx"(#loc129)) +#loc293 = loc("left_valid_mask"(#loc130)) +#loc294 = loc("right_valid_mask"(#loc131)) +#loc295 = loc("left_isnan"(#loc132)) +#loc296 = loc("right_isnan"(#loc133)) +#loc297 = loc("cond"(#loc134)) +#loc298 = loc("cond"(#loc137)) +#loc299 = loc("cond"(#loc138)) +#loc300 = loc("cond"(#loc139)) +#loc301 = loc("eq"(#loc141)) +#loc302 = loc("eq"(#loc144)) +#loc303 = loc("eq"(#loc145)) +#loc304 = loc("cond"(#loc146)) +#loc305 = loc("cond"(#loc147)) +#loc306 = loc("cond"(#loc148)) +#loc307 = loc("cond"(#loc149)) +#loc308 = loc("cond"(#loc150)) +#loc309 = loc("cond"(#loc151)) +#loc310 = loc("cond"(#loc152)) +#loc311 = loc("cond"(#loc153)) +#loc312 = loc("cond"(#loc154)) +#loc313 = loc("ret"(#loc155)) +#loc314 = loc("ret"(#loc156)) +#loc315 = loc("ret"(#loc157)) +#loc316 = loc("ret"(#loc158)) +#loc317 = loc("new_idxs"(#loc159)) +#loc318 = loc("new_idxs"(#loc160)) +#loc319 = loc("new_idxs"(#loc161)) +#loc320 = loc("new_idxs"(#loc162)) +#loc324 = loc("input"(#loc173)) +#loc328 = loc("flip"(#loc191)) +#loc329 = loc("cond"(#loc297)) +#loc330 = loc("cond"(#loc300)) +#loc331 = loc("eq"(#loc301)) +#loc332 = loc("eq"(#loc303)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..b3974fc747af5a820e24abe3ddafd3f63c79019a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1487 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc113 = loc("in_ptr0"(#loc)) +#loc114 = loc("out_ptr4"(#loc)) +#loc115 = loc("out_ptr5"(#loc)) +#loc116 = loc("out_ptr6"(#loc)) +#loc117 = loc("out_ptr7"(#loc)) +#loc118 = loc("out_ptr8"(#loc)) +#loc119 = loc("out_ptr9"(#loc)) +#loc120 = loc("xnumel"(#loc)) +#loc121 = loc("r0_numel"(#loc)) +#loc136 = loc(callsite(#loc16 at #loc17)) +#loc142 = loc("ileft"(#loc25)) +#loc146 = loc("iright"(#loc30)) +#loc155 = loc("left_idx"(#loc39)) +#loc160 = loc("right_idx"(#loc44)) +#loc181 = loc(callsite(#loc16 at #loc65)) +#loc184 = loc("tmp24"(#loc68)) +#loc188 = loc("tmp29"(#loc72)) +#loc210 = loc(callsite(#loc21 at #loc136)) +#loc214 = loc(callsite(#loc21 at #loc181)) +#loc217 = loc(callsite(#loc1 at #loc184)) +#loc220 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc142 at #loc210)) +#loc229 = loc(callsite(#loc146 at #loc210)) +#loc237 = loc(callsite(#loc155 at #loc210)) +#loc242 = loc(callsite(#loc160 at #loc210)) +#loc262 = loc(callsite(#loc142 at #loc214)) +#loc266 = loc(callsite(#loc146 at #loc214)) +#loc284 = loc(callsite(#loc155 at #loc214)) +#loc288 = loc(callsite(#loc160 at #loc214)) +#loc298 = loc(callsite(#loc1 at #loc225)) +#loc300 = loc(callsite(#loc1 at #loc229)) +#loc303 = loc(callsite(#loc1 at #loc237)) +#loc306 = loc(callsite(#loc1 at #loc242)) +#loc308 = loc(callsite(#loc1 at #loc262)) +#loc310 = loc(callsite(#loc1 at #loc266)) +#loc312 = loc(callsite(#loc1 at #loc284)) +#loc314 = loc(callsite(#loc1 at #loc288)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<16384> : tensor<1x16xi64, #blocked5> loc(#loc1) + %cst_10 = arith.constant dense<0> : tensor<1x16xi64, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc122) + %xmask = arith.cmpi slt, %xoffset, %c128_i32 : i32 loc(#loc123) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc124) + %r0_index_11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc124) + %r0_index_12 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc124) + %r0_index_13 = tt.expand_dims %r0_index_11 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc124) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc125) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc204) + %tmp0_15 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc204) + %tmp0_16 = arith.addi %r0_index_12, %tmp0_14 : tensor<1x16xi32, #blocked5> loc(#loc126) + %tmp0_17 = arith.addi %r0_index_13, %tmp0_15 : tensor<1x16xi32, #blocked> loc(#loc126) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc127) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc127) + %tmp0_20 = tt.addptr %tmp0_18, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc127) + %tmp0_21 = tt.addptr %tmp0_19, %tmp0_17 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc127) + %tmp0_22 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc205) + %tmp0_23 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc205) + %tmp0_24 = tt.load %tmp0_20, %tmp0_22, %cst_10 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc128) + %tmp0_25 = tt.load %tmp0_21, %tmp0_23, %cst : tensor<1x16x!tt.ptr, #blocked> loc(#loc128) + %tmp2 = arith.cmpi sgt, %tmp0_24, %cst_10 : tensor<1x16xi64, #blocked5> loc(#loc129) + %tmp2_26 = arith.cmpi sgt, %tmp0_25, %cst : tensor<1x16xi64, #blocked> loc(#loc129) + %tmp4 = arith.cmpi slt, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc130) + %tmp4_27 = arith.cmpi slt, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc130) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1, #blocked5> loc(#loc131) + %tmp5_28 = arith.andi %tmp2_26, %tmp4_27 : tensor<1x16xi1, #blocked> loc(#loc131) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc206) + %tmp9 = arith.trunci %r0_index_12 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc134) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc207) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc207) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc207) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc207) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc207) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc207) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc207) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc207) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc207) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc207) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc207) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc207) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc208) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y = tt.reshape %tmp7 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %left_mask = arith.subi %cst_4, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc223) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc223) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc223) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc223) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc234) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc235) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc236) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc236) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc301) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc240) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc241) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc241) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc304) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_75 = arith.xori %tmp7, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_77 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc259) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc208) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc208) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc222) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc297) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc226) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc227) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc228) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc299) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc230) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc231) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc234) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc236) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc238) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc239) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc241) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc243) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc244) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc251) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc259) + %tmp14 = arith.cmpi eq, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc178) + %tmp14_395 = arith.cmpi eq, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc178) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc213) + %y_396 = tt.reshape %tmp16 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_397 = arith.muli %y_396, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_398 = "tt.reduce"(%ileft_397) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_399 = tt.expand_dims %ileft_398 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_400 = tt.broadcast %ileft_399 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_401 = arith.muli %y_396, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_402 = "tt.reduce"(%iright_401) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_403 = tt.expand_dims %iright_402 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_404 = tt.broadcast %iright_403 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_405 = tt.reshape %ileft_400 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_406 = tt.reshape %iright_404 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %cond_407 = arith.cmpi slt, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_408 = arith.cmpi eq, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_409 = arith.andi %eq_408, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_410 = arith.ori %cond_407, %cond_409 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_411 = arith.extui %cond_410 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_412 = arith.xori %cond_411, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_413 = arith.cmpi ne, %cond_412, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_414 = arith.xori %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_415 = arith.select %cond_413, %ret_414, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_416 = arith.xori %tmp16, %ret_415 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_417 = arith.select %cond_413, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_418 = arith.xori %new_idxs_77, %new_idxs_417 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_419 = tt.reshape %ret_416 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_420 = arith.muli %y_419, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_421 = "tt.reduce"(%ileft_420) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_422 = tt.expand_dims %ileft_421 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_423 = tt.broadcast %ileft_422 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_424 = arith.muli %y_419, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_425 = "tt.reduce"(%iright_424) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_426 = tt.expand_dims %iright_425 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_427 = tt.broadcast %iright_426 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_428 = tt.reshape %ileft_423 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_429 = tt.reshape %iright_427 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_430 = tt.reshape %new_idxs_418 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_431 = arith.muli %y_idx_430, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_432 = "tt.reduce"(%left_idx_431) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_433 = tt.expand_dims %left_idx_432 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = tt.broadcast %left_idx_433 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_435 = arith.muli %y_idx_430, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_436 = "tt.reduce"(%right_idx_435) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_437 = tt.expand_dims %right_idx_436 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = tt.broadcast %right_idx_437 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_439 = tt.reshape %left_idx_434 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_440 = tt.reshape %right_idx_438 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_441 = arith.cmpi slt, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_442 = arith.cmpi eq, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_443 = arith.cmpi sgt, %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_444 = arith.andi %eq_442, %cond_443 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_445 = arith.ori %cond_441, %cond_444 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_446 = arith.extui %cond_445 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_447 = arith.xori %cond_446, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_448 = arith.cmpi ne, %cond_447, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_449 = arith.xori %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_450 = arith.select %cond_448, %ret_449, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_451 = arith.xori %ret_416, %ret_450 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_452 = arith.xori %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_453 = arith.select %cond_448, %new_idxs_452, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_454 = arith.xori %new_idxs_418, %new_idxs_453 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_455 = tt.reshape %ret_451 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_456 = arith.muli %y_455, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_457 = "tt.reduce"(%ileft_456) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_458 = tt.expand_dims %ileft_457 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_459 = tt.broadcast %ileft_458 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_460 = arith.muli %y_455, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_461 = "tt.reduce"(%iright_460) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_462 = tt.expand_dims %iright_461 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_463 = tt.broadcast %iright_462 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_464 = tt.reshape %ileft_459 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_465 = tt.reshape %iright_463 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_466 = tt.reshape %new_idxs_454 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_467 = arith.muli %y_idx_466, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_468 = "tt.reduce"(%left_idx_467) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_469 = tt.expand_dims %left_idx_468 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = tt.broadcast %left_idx_469 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_471 = arith.muli %y_idx_466, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_472 = "tt.reduce"(%right_idx_471) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_473 = tt.expand_dims %right_idx_472 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = tt.broadcast %right_idx_473 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_475 = tt.reshape %left_idx_470 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_476 = tt.reshape %right_idx_474 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_477 = arith.cmpi slt, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_478 = arith.cmpi eq, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_479 = arith.cmpi sgt, %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_480 = arith.andi %eq_478, %cond_479 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_481 = arith.ori %cond_477, %cond_480 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_482 = arith.extui %cond_481 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_483 = arith.xori %cond_482, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_484 = arith.cmpi ne, %cond_483, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_485 = arith.xori %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_486 = arith.select %cond_484, %ret_485, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_487 = arith.xori %ret_451, %ret_486 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_488 = arith.xori %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_489 = arith.select %cond_484, %new_idxs_488, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_490 = arith.xori %new_idxs_454, %new_idxs_489 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_491 = tt.reshape %ret_487 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_492 = arith.muli %y_491, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_493 = "tt.reduce"(%ileft_492) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_494 = tt.expand_dims %ileft_493 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_495 = tt.broadcast %ileft_494 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_496 = arith.muli %y_491, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_497 = "tt.reduce"(%iright_496) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_498 = tt.expand_dims %iright_497 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_499 = tt.broadcast %iright_498 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_500 = tt.reshape %ileft_495 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_501 = tt.reshape %iright_499 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_502 = tt.reshape %new_idxs_490 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_503 = arith.muli %y_idx_502, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_504 = "tt.reduce"(%left_idx_503) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_505 = tt.expand_dims %left_idx_504 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = tt.broadcast %left_idx_505 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_507 = arith.muli %y_idx_502, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_508 = "tt.reduce"(%right_idx_507) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_509 = tt.expand_dims %right_idx_508 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = tt.broadcast %right_idx_509 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_511 = tt.reshape %left_idx_506 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_512 = tt.reshape %right_idx_510 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_513 = arith.cmpi slt, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_514 = arith.cmpi eq, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_515 = arith.cmpi sgt, %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_516 = arith.andi %eq_514, %cond_515 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_517 = arith.ori %cond_513, %cond_516 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_518 = arith.extui %cond_517 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_519 = arith.xori %cond_518, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_520 = arith.cmpi ne, %cond_519, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_521 = arith.xori %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_522 = arith.select %cond_520, %ret_521, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_523 = arith.xori %ret_487, %ret_522 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_524 = arith.xori %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_525 = arith.select %cond_520, %new_idxs_524, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_526 = arith.xori %new_idxs_490, %new_idxs_525 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_527 = tt.reshape %ret_523 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_528 = arith.muli %y_527, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_529 = "tt.reduce"(%ileft_528) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_530 = tt.expand_dims %ileft_529 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_531 = tt.broadcast %ileft_530 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_532 = arith.muli %y_527, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_533 = "tt.reduce"(%iright_532) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_534 = tt.expand_dims %iright_533 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_535 = tt.broadcast %iright_534 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_536 = tt.reshape %ileft_531 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_537 = tt.reshape %iright_535 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_538 = tt.reshape %new_idxs_526 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_539 = arith.muli %y_idx_538, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_540 = "tt.reduce"(%left_idx_539) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_541 = tt.expand_dims %left_idx_540 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = tt.broadcast %left_idx_541 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_543 = arith.muli %y_idx_538, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_544 = "tt.reduce"(%right_idx_543) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_545 = tt.expand_dims %right_idx_544 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = tt.broadcast %right_idx_545 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_547 = tt.reshape %left_idx_542 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_548 = tt.reshape %right_idx_546 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_549 = arith.cmpi slt, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_550 = arith.cmpi eq, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_551 = arith.cmpi sgt, %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_552 = arith.andi %eq_550, %cond_551 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_553 = arith.ori %cond_549, %cond_552 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_554 = arith.extui %cond_553 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_555 = arith.xori %cond_554, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_556 = arith.cmpi ne, %cond_555, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_557 = arith.xori %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_558 = arith.select %cond_556, %ret_557, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_559 = arith.xori %ret_523, %ret_558 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_560 = arith.xori %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_561 = arith.select %cond_556, %new_idxs_560, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_562 = arith.xori %new_idxs_526, %new_idxs_561 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_563 = tt.reshape %ret_559 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_564 = arith.muli %y_563, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_565 = "tt.reduce"(%ileft_564) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_566 = tt.expand_dims %ileft_565 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_567 = tt.broadcast %ileft_566 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_568 = arith.muli %y_563, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_569 = "tt.reduce"(%iright_568) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_570 = tt.expand_dims %iright_569 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_571 = tt.broadcast %iright_570 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_572 = tt.reshape %ileft_567 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_573 = tt.reshape %iright_571 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_574 = tt.reshape %new_idxs_562 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_575 = arith.muli %y_idx_574, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_576 = "tt.reduce"(%left_idx_575) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_577 = tt.expand_dims %left_idx_576 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = tt.broadcast %left_idx_577 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_579 = arith.muli %y_idx_574, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_580 = "tt.reduce"(%right_idx_579) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_581 = tt.expand_dims %right_idx_580 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = tt.broadcast %right_idx_581 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_583 = tt.reshape %left_idx_578 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_584 = tt.reshape %right_idx_582 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_585 = arith.cmpi slt, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_586 = arith.cmpi eq, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_587 = arith.cmpi sgt, %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_588 = arith.andi %eq_586, %cond_587 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_589 = arith.ori %cond_585, %cond_588 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_590 = arith.extui %cond_589 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_591 = arith.xori %cond_590, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_592 = arith.cmpi ne, %cond_591, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_593 = arith.xori %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_594 = arith.select %cond_592, %ret_593, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_595 = arith.xori %ret_559, %ret_594 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_596 = arith.xori %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_597 = arith.select %cond_592, %new_idxs_596, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_598 = arith.xori %new_idxs_562, %new_idxs_597 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_599 = tt.reshape %ret_595 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc260) + %ileft_600 = arith.muli %y_599, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc261) + %ileft_601 = "tt.reduce"(%ileft_600) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc307) + %ileft_602 = tt.expand_dims %ileft_601 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc263) + %ileft_603 = tt.broadcast %ileft_602 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc264) + %iright_604 = arith.muli %y_599, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc265) + %iright_605 = "tt.reduce"(%iright_604) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc309) + %iright_606 = tt.expand_dims %iright_605 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc267) + %iright_607 = tt.broadcast %iright_606 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc268) + %ileft_608 = tt.reshape %ileft_603 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_609 = tt.reshape %iright_607 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_610 = tt.reshape %new_idxs_598 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc282) + %left_idx_611 = arith.muli %y_idx_610, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc283) + %left_idx_612 = "tt.reduce"(%left_idx_611) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc311) + %left_idx_613 = tt.expand_dims %left_idx_612 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = tt.broadcast %left_idx_613 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc286) + %right_idx_615 = arith.muli %y_idx_610, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc287) + %right_idx_616 = "tt.reduce"(%right_idx_615) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc313) + %right_idx_617 = tt.expand_dims %right_idx_616 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = tt.broadcast %right_idx_617 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc290) + %left_idx_619 = tt.reshape %left_idx_614 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_620 = tt.reshape %right_idx_618 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_621 = arith.cmpi slt, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_622 = arith.cmpi eq, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_623 = arith.cmpi sgt, %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_624 = arith.andi %eq_622, %cond_623 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_625 = arith.ori %cond_621, %cond_624 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_626 = arith.xori %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_627 = arith.select %cond_625, %ret_626, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_628 = arith.xori %ret_595, %ret_627 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_629 = arith.xori %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_630 = arith.select %cond_625, %new_idxs_629, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_631 = arith.xori %new_idxs_598, %new_idxs_630 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_632 = tt.reshape %ret_628 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_633 = arith.muli %y_632, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_634 = "tt.reduce"(%ileft_633) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_635 = tt.expand_dims %ileft_634 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_636 = tt.broadcast %ileft_635 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_637 = arith.muli %y_632, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_638 = "tt.reduce"(%iright_637) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_639 = tt.expand_dims %iright_638 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_640 = tt.broadcast %iright_639 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_641 = tt.reshape %ileft_636 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_642 = tt.reshape %iright_640 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_643 = tt.reshape %new_idxs_631 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_644 = arith.muli %y_idx_643, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_645 = "tt.reduce"(%left_idx_644) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_646 = tt.expand_dims %left_idx_645 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = tt.broadcast %left_idx_646 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_648 = arith.muli %y_idx_643, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_649 = "tt.reduce"(%right_idx_648) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_650 = tt.expand_dims %right_idx_649 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = tt.broadcast %right_idx_650 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_652 = tt.reshape %left_idx_647 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_653 = tt.reshape %right_idx_651 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_654 = arith.cmpi slt, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_655 = arith.cmpi eq, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_656 = arith.cmpi sgt, %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_657 = arith.andi %eq_655, %cond_656 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_658 = arith.ori %cond_654, %cond_657 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_659 = arith.xori %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_660 = arith.select %cond_658, %ret_659, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_661 = arith.xori %ret_628, %ret_660 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_662 = arith.xori %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_663 = arith.select %cond_658, %new_idxs_662, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_664 = arith.xori %new_idxs_631, %new_idxs_663 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_665 = tt.reshape %ret_661 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_666 = arith.muli %y_665, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_667 = "tt.reduce"(%ileft_666) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_668 = tt.expand_dims %ileft_667 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_669 = tt.broadcast %ileft_668 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_670 = arith.muli %y_665, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_671 = "tt.reduce"(%iright_670) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_672 = tt.expand_dims %iright_671 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_673 = tt.broadcast %iright_672 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_674 = tt.reshape %ileft_669 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_675 = tt.reshape %iright_673 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_676 = tt.reshape %new_idxs_664 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_677 = arith.muli %y_idx_676, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_678 = "tt.reduce"(%left_idx_677) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_679 = tt.expand_dims %left_idx_678 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = tt.broadcast %left_idx_679 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_681 = arith.muli %y_idx_676, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_682 = "tt.reduce"(%right_idx_681) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_683 = tt.expand_dims %right_idx_682 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = tt.broadcast %right_idx_683 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_685 = tt.reshape %left_idx_680 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_686 = tt.reshape %right_idx_684 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_687 = arith.cmpi slt, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_688 = arith.cmpi eq, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_689 = arith.cmpi sgt, %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_690 = arith.andi %eq_688, %cond_689 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_691 = arith.ori %cond_687, %cond_690 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_692 = arith.xori %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_693 = arith.select %cond_691, %ret_692, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_694 = arith.xori %ret_661, %ret_693 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_695 = arith.xori %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_696 = arith.select %cond_691, %new_idxs_695, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_697 = arith.xori %new_idxs_664, %new_idxs_696 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_698 = tt.reshape %ret_694 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_699 = arith.muli %y_698, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_700 = "tt.reduce"(%ileft_699) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_701 = tt.expand_dims %ileft_700 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_702 = tt.broadcast %ileft_701 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_703 = arith.muli %y_698, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_704 = "tt.reduce"(%iright_703) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_705 = tt.expand_dims %iright_704 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_706 = tt.broadcast %iright_705 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_707 = tt.reshape %ileft_702 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_708 = tt.reshape %iright_706 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_709 = tt.reshape %new_idxs_697 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_710 = arith.muli %y_idx_709, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_711 = "tt.reduce"(%left_idx_710) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_712 = tt.expand_dims %left_idx_711 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = tt.broadcast %left_idx_712 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_714 = arith.muli %y_idx_709, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_715 = "tt.reduce"(%right_idx_714) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_716 = tt.expand_dims %right_idx_715 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = tt.broadcast %right_idx_716 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_718 = tt.reshape %left_idx_713 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_719 = tt.reshape %right_idx_717 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_720 = arith.cmpi slt, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_721 = arith.cmpi eq, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_722 = arith.cmpi sgt, %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_723 = arith.andi %eq_721, %cond_722 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_724 = arith.ori %cond_720, %cond_723 : tensor<1x16xi1, #blocked5> loc(#loc274) + %new_idxs_725 = arith.xori %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_726 = arith.select %cond_724, %new_idxs_725, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_727 = arith.xori %new_idxs_697, %new_idxs_726 : tensor<1x16xi32, #blocked5> loc(#loc281) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc215) + %tmp20_728 = arith.extui %tmp5_28 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc215) + %tmp23 = arith.select %tmp0_22, %tmp20, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc183) + %tmp23_729 = arith.select %tmp0_23, %tmp20_728, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc183) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc216) + %tmp24_730 = "tt.reduce"(%tmp23_729) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc216) + %tmp24_731 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc185) + %tmp24_732 = tt.expand_dims %tmp24_730 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc185) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc218) + %tmp25_733 = arith.extui %tmp14_395 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc218) + %tmp28 = arith.select %tmp0_22, %tmp25, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc187) + %tmp28_734 = arith.select %tmp0_23, %tmp25_733, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc187) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc219) + %tmp29_735 = "tt.reduce"(%tmp28_734) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc219) + %tmp29_736 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc189) + %tmp29_737 = tt.expand_dims %tmp29_735 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc189) + %tmp30 = arith.trunci %tmp24_731 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc190) + %tmp30_738 = arith.trunci %tmp24_732 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc190) + %tmp31 = arith.trunci %tmp29_736 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc191) + %tmp31_739 = arith.trunci %tmp29_737 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc191) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp34_740 = arith.cmpi slt, %r0_index_12, %tmp34 : tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp36 = arith.select %tmp34_740, %new_idxs_394, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc193) + %tmp38 = arith.addi %tmp36, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc194) + %tmp39 = arith.cmpi slt, %tmp36, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc195) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc196) + %0 = arith.cmpi sge, %tmp40, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc81) + %1 = arith.cmpi slt, %tmp40, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc82) + %2 = arith.andi %0, %1 : tensor<1x16xi1, #blocked5> loc(#loc83) + %xmask_741 = arith.cmpi sge, %xoffset, %c128_i32 : i32 loc(#loc221) + %3 = tt.splat %xmask_741 : i1 -> tensor<1x16xi1, #blocked5> loc(#loc197) + %4 = arith.ori %2, %3 : tensor<1x16xi1, #blocked5> loc(#loc85) + tt.assert %4, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1, #blocked5> loc(#loc86) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp45_742 = arith.cmpi slt, %r0_index_12, %tmp45 : tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp46 = arith.select %tmp45_742, %new_idxs_727, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc199) + %tmp47 = arith.addi %tmp46, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc200) + %tmp48 = arith.cmpi slt, %tmp46, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc201) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc202) + %5 = arith.cmpi sge, %tmp49, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc92) + %6 = arith.cmpi slt, %tmp49, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc93) + %7 = arith.andi %5, %6 : tensor<1x16xi1, #blocked5> loc(#loc94) + %8 = arith.ori %7, %3 : tensor<1x16xi1, #blocked5> loc(#loc95) + tt.assert %8, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1, #blocked5> loc(#loc96) + %9 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc97) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc98) + tt.store %10, %tmp30_738, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc99) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + tt.store %13, %tmp31_739, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc101) + %15 = tt.addptr %14, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc101) + tt.store %15, %new_idxs_394, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc102) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc103) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc203) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32, #blocked5> loc(#loc104) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc105) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc105) + %21 = ttg.convert_layout %20 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + tt.store %21, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc107) + %23 = tt.addptr %22, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc107) + tt.store %23, %new_idxs_727, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc108) + %24 = arith.addi %tmp49, %17 : tensor<1x16xi32, #blocked5> loc(#loc109) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc110) + %26 = tt.addptr %25, %24 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc110) + %27 = ttg.convert_layout %26 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.store %27, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.return loc(#loc112) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc122 = loc("xoffset"(#loc2)) +#loc123 = loc("xmask"(#loc3)) +#loc124 = loc("r0_index"(#loc4)) +#loc125 = loc("tmp0"(#loc5)) +#loc126 = loc("tmp0"(#loc6)) +#loc127 = loc("tmp0"(#loc7)) +#loc128 = loc("tmp0"(#loc8)) +#loc129 = loc("tmp2"(#loc9)) +#loc130 = loc("tmp4"(#loc10)) +#loc131 = loc("tmp5"(#loc11)) +#loc132 = loc("tmp7"(#loc12)) +#loc133 = loc("tmp6"(#loc13)) +#loc134 = loc("tmp9"(#loc14)) +#loc135 = loc("flip"(#loc15)) +#loc137 = loc("flip"(#loc18)) +#loc138 = loc("flip"(#loc19)) +#loc139 = loc("y"(#loc20)) +#loc140 = loc("left_mask"(#loc22)) +#loc141 = loc("ileft"(#loc23)) +#loc143 = loc("ileft"(#loc27)) +#loc144 = loc("ileft"(#loc28)) +#loc145 = loc("iright"(#loc29)) +#loc147 = loc("iright"(#loc31)) +#loc148 = loc("iright"(#loc32)) +#loc149 = loc("ileft"(#loc33)) +#loc150 = loc("iright"(#loc34)) +#loc151 = loc("y_idx"(#loc35)) +#loc152 = loc("left_idx"(#loc36)) +#loc153 = loc("left_idx"(#loc37)) +#loc154 = loc("input"(#loc38)) +#loc156 = loc("left_idx"(#loc40)) +#loc157 = loc("left_idx"(#loc41)) +#loc158 = loc("right_idx"(#loc42)) +#loc159 = loc("right_idx"(#loc43)) +#loc161 = loc("right_idx"(#loc45)) +#loc162 = loc("right_idx"(#loc46)) +#loc163 = loc("left_idx"(#loc47)) +#loc164 = loc("right_idx"(#loc48)) +#loc165 = loc("cond"(#loc49)) +#loc166 = loc("eq"(#loc50)) +#loc167 = loc("cond"(#loc51)) +#loc168 = loc("cond"(#loc52)) +#loc169 = loc("cond"(#loc53)) +#loc170 = loc("cond"(#loc54)) +#loc171 = loc("cond"(#loc55)) +#loc172 = loc("ret"(#loc56)) +#loc173 = loc("ret"(#loc57)) +#loc174 = loc("ret"(#loc58)) +#loc175 = loc("new_idxs"(#loc59)) +#loc176 = loc("new_idxs"(#loc60)) +#loc177 = loc("new_idxs"(#loc61)) +#loc178 = loc("tmp14"(#loc62)) +#loc179 = loc("tmp16"(#loc63)) +#loc180 = loc("tmp15"(#loc64)) +#loc182 = loc("tmp20"(#loc66)) +#loc183 = loc("tmp23"(#loc67)) +#loc185 = loc("tmp24"(#loc69)) +#loc186 = loc("tmp25"(#loc70)) +#loc187 = loc("tmp28"(#loc71)) +#loc189 = loc("tmp29"(#loc73)) +#loc190 = loc("tmp30"(#loc74)) +#loc191 = loc("tmp31"(#loc75)) +#loc192 = loc("tmp34"(#loc76)) +#loc193 = loc("tmp36"(#loc77)) +#loc194 = loc("tmp38"(#loc78)) +#loc195 = loc("tmp39"(#loc79)) +#loc196 = loc("tmp40"(#loc80)) +#loc197 = loc(fused[#loc85, #loc84]) +#loc198 = loc("tmp45"(#loc87)) +#loc199 = loc("tmp46"(#loc88)) +#loc200 = loc("tmp47"(#loc89)) +#loc201 = loc("tmp48"(#loc90)) +#loc202 = loc("tmp49"(#loc91)) +#loc203 = loc(fused[#loc104, #loc103]) +#loc204 = loc(fused[#loc126, #loc125]) +#loc205 = loc(fused[#loc128, #loc123]) +#loc206 = loc(fused[#loc132, #loc133]) +#loc207 = loc(callsite(#loc135 at #loc136)) +#loc208 = loc(callsite(#loc137 at #loc136)) +#loc209 = loc(callsite(#loc138 at #loc136)) +#loc211 = loc("cond"(#loc165)) +#loc212 = loc("eq"(#loc166)) +#loc213 = loc(fused[#loc179, #loc180]) +#loc215 = loc(fused[#loc182, #loc132, #loc133]) +#loc216 = loc(callsite(#loc24 at #loc184)) +#loc218 = loc(fused[#loc186, #loc179, #loc180]) +#loc219 = loc(callsite(#loc24 at #loc188)) +#loc221 = loc(fused[#loc84, #loc123]) +#loc222 = loc(callsite(#loc139 at #loc210)) +#loc223 = loc(callsite(#loc140 at #loc210)) +#loc224 = loc(callsite(#loc141 at #loc210)) +#loc226 = loc(callsite(#loc143 at #loc210)) +#loc227 = loc(callsite(#loc144 at #loc210)) +#loc228 = loc(callsite(#loc145 at #loc210)) +#loc230 = loc(callsite(#loc147 at #loc210)) +#loc231 = loc(callsite(#loc148 at #loc210)) +#loc232 = loc(callsite(#loc149 at #loc210)) +#loc233 = loc(callsite(#loc150 at #loc210)) +#loc234 = loc(callsite(#loc151 at #loc210)) +#loc235 = loc(callsite(#loc152 at #loc210)) +#loc236 = loc(callsite(#loc153 at #loc210)) +#loc238 = loc(callsite(#loc156 at #loc210)) +#loc239 = loc(callsite(#loc157 at #loc210)) +#loc240 = loc(callsite(#loc158 at #loc210)) +#loc241 = loc(callsite(#loc159 at #loc210)) +#loc243 = loc(callsite(#loc161 at #loc210)) +#loc244 = loc(callsite(#loc162 at #loc210)) +#loc245 = loc(callsite(#loc163 at #loc210)) +#loc246 = loc(callsite(#loc164 at #loc210)) +#loc247 = loc(callsite(#loc211 at #loc210)) +#loc248 = loc(callsite(#loc212 at #loc210)) +#loc249 = loc(callsite(#loc167 at #loc210)) +#loc250 = loc(callsite(#loc168 at #loc210)) +#loc251 = loc(callsite(#loc169 at #loc210)) +#loc252 = loc(callsite(#loc170 at #loc210)) +#loc253 = loc(callsite(#loc171 at #loc210)) +#loc254 = loc(callsite(#loc172 at #loc210)) +#loc255 = loc(callsite(#loc173 at #loc210)) +#loc256 = loc(callsite(#loc174 at #loc210)) +#loc257 = loc(callsite(#loc175 at #loc210)) +#loc258 = loc(callsite(#loc176 at #loc210)) +#loc259 = loc(callsite(#loc177 at #loc210)) +#loc260 = loc(callsite(#loc139 at #loc214)) +#loc261 = loc(callsite(#loc141 at #loc214)) +#loc263 = loc(callsite(#loc143 at #loc214)) +#loc264 = loc(callsite(#loc144 at #loc214)) +#loc265 = loc(callsite(#loc145 at #loc214)) +#loc267 = loc(callsite(#loc147 at #loc214)) +#loc268 = loc(callsite(#loc148 at #loc214)) +#loc269 = loc(callsite(#loc149 at #loc214)) +#loc270 = loc(callsite(#loc150 at #loc214)) +#loc271 = loc(callsite(#loc211 at #loc214)) +#loc272 = loc(callsite(#loc212 at #loc214)) +#loc273 = loc(callsite(#loc168 at #loc214)) +#loc274 = loc(callsite(#loc169 at #loc214)) +#loc275 = loc(callsite(#loc170 at #loc214)) +#loc276 = loc(callsite(#loc171 at #loc214)) +#loc277 = loc(callsite(#loc172 at #loc214)) +#loc278 = loc(callsite(#loc173 at #loc214)) +#loc279 = loc(callsite(#loc174 at #loc214)) +#loc280 = loc(callsite(#loc176 at #loc214)) +#loc281 = loc(callsite(#loc177 at #loc214)) +#loc282 = loc(callsite(#loc151 at #loc214)) +#loc283 = loc(callsite(#loc153 at #loc214)) +#loc285 = loc(callsite(#loc156 at #loc214)) +#loc286 = loc(callsite(#loc157 at #loc214)) +#loc287 = loc(callsite(#loc159 at #loc214)) +#loc289 = loc(callsite(#loc161 at #loc214)) +#loc290 = loc(callsite(#loc162 at #loc214)) +#loc291 = loc(callsite(#loc163 at #loc214)) +#loc292 = loc(callsite(#loc164 at #loc214)) +#loc293 = loc(callsite(#loc167 at #loc214)) +#loc294 = loc(callsite(#loc175 at #loc214)) +#loc295 = loc(callsite(#loc26 at #loc216)) +#loc296 = loc(callsite(#loc26 at #loc219)) +#loc297 = loc(callsite(#loc24 at #loc225)) +#loc299 = loc(callsite(#loc24 at #loc229)) +#loc301 = loc(callsite(#loc154 at #loc237)) +#loc302 = loc(callsite(#loc24 at #loc237)) +#loc304 = loc(callsite(#loc154 at #loc242)) +#loc305 = loc(callsite(#loc24 at #loc242)) +#loc307 = loc(callsite(#loc24 at #loc262)) +#loc309 = loc(callsite(#loc24 at #loc266)) +#loc311 = loc(callsite(#loc24 at #loc284)) +#loc313 = loc(callsite(#loc24 at #loc288)) +#loc315 = loc(callsite(#loc26 at #loc297)) +#loc316 = loc(callsite(#loc26 at #loc299)) +#loc317 = loc(callsite(#loc26 at #loc302)) +#loc318 = loc(callsite(#loc26 at #loc305)) +#loc319 = loc(callsite(#loc26 at #loc307)) +#loc320 = loc(callsite(#loc26 at #loc309)) +#loc321 = loc(callsite(#loc26 at #loc311)) +#loc322 = loc(callsite(#loc26 at #loc313)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2b773dddba229c6a42fc6cc5fe0e2d891d962a2e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MMGM2ESHRXPRFAROBBDYKTZUJ2JVVKU2TB5DVA3EL4OF2SOELPMQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1437 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":18:0) +#loc1 = loc(unknown) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":46:71) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":51:71) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:26) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:26) +#loc115 = loc("in_ptr0"(#loc)) +#loc116 = loc("out_ptr4"(#loc)) +#loc117 = loc("out_ptr5"(#loc)) +#loc118 = loc("out_ptr6"(#loc)) +#loc119 = loc("out_ptr7"(#loc)) +#loc120 = loc("out_ptr8"(#loc)) +#loc121 = loc("out_ptr9"(#loc)) +#loc122 = loc("xnumel"(#loc)) +#loc123 = loc("r0_numel"(#loc)) +#loc139 = loc(callsite(#loc17 at #loc18)) +#loc146 = loc("ileft"(#loc27)) +#loc150 = loc("iright"(#loc32)) +#loc159 = loc("left_idx"(#loc41)) +#loc164 = loc("right_idx"(#loc46)) +#loc185 = loc(callsite(#loc17 at #loc67)) +#loc188 = loc("tmp24"(#loc70)) +#loc192 = loc("tmp29"(#loc74)) +#loc215 = loc(callsite(#loc23 at #loc139)) +#loc219 = loc(callsite(#loc23 at #loc185)) +#loc222 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc1 at #loc192)) +#loc229 = loc(callsite(#loc146 at #loc215)) +#loc233 = loc(callsite(#loc150 at #loc215)) +#loc241 = loc(callsite(#loc159 at #loc215)) +#loc246 = loc(callsite(#loc164 at #loc215)) +#loc266 = loc(callsite(#loc146 at #loc219)) +#loc270 = loc(callsite(#loc150 at #loc219)) +#loc288 = loc(callsite(#loc159 at #loc219)) +#loc292 = loc(callsite(#loc164 at #loc219)) +#loc302 = loc(callsite(#loc1 at #loc229)) +#loc304 = loc(callsite(#loc1 at #loc233)) +#loc307 = loc(callsite(#loc1 at #loc241)) +#loc310 = loc(callsite(#loc1 at #loc246)) +#loc312 = loc(callsite(#loc1 at #loc266)) +#loc314 = loc(callsite(#loc1 at #loc270)) +#loc316 = loc(callsite(#loc1 at #loc288)) +#loc318 = loc(callsite(#loc1 at #loc292)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xmask = arith.constant 128 : i32 loc(#loc124) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %cst_2 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc125) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc124) + %xmask_7 = tt.splat %xmask_6 : i1 -> tensor<1x1xi1> loc(#loc124) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc126) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc127) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc128) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x16xi32> loc(#loc208) + %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x16xi32> loc(#loc129) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc130) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc130) + %tmp0_13 = tt.splat %xmask_6 : i1 -> tensor<1x16xi1> loc(#loc209) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_5 : tensor<1x16x!tt.ptr> loc(#loc131) + %tmp2 = arith.cmpi sgt, %tmp0_14, %cst_5 : tensor<1x16xi64> loc(#loc132) + %tmp4 = arith.cmpi slt, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc133) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1> loc(#loc134) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc210) + %tmp9 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc137) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc211) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc212) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc212) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc213) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc214) + %y = tt.reshape %tmp7 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc227) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc228) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc232) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc238) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc239) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc240) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc240) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc305) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc244) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc245) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc245) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc308) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc251) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc252) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc253) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc254) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc255) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc256) + %cond_47 = arith.cmpi ne, %cond_46, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc258) + %ret_48 = arith.select %cond_47, %ret, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_49 = arith.xori %tmp7, %ret_48 : tensor<1x16xi32> loc(#loc260) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc261) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_51 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc263) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc263) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc213) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc214) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc228) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc251) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc252) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc253) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc254) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc255) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_85 = arith.cmpi ne, %cond_84, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc258) + %ret_87 = arith.select %cond_85, %ret_86, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc260) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc261) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc263) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc251) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc252) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc253) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc254) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc255) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_121 = arith.cmpi ne, %cond_120, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc258) + %ret_123 = arith.select %cond_121, %ret_122, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc260) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc261) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc263) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc213) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc214) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc228) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc251) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc252) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc253) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc254) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc255) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_160 = arith.cmpi ne, %cond_159, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc258) + %ret_162 = arith.select %cond_160, %ret_161, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc260) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc261) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc263) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc251) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc252) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc253) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc254) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc255) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_196 = arith.cmpi ne, %cond_195, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc258) + %ret_198 = arith.select %cond_196, %ret_197, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc260) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc261) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc263) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc251) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc252) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc253) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc254) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc255) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_232 = arith.cmpi ne, %cond_231, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc258) + %ret_234 = arith.select %cond_232, %ret_233, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc260) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc261) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc263) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc226) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc228) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc228) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc301) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc230) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc231) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc232) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc303) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc234) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc235) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc238) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc240) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc306) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc242) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc243) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc245) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc309) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc247) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc248) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc251) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc252) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc253) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc254) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc255) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc258) + %ret_268 = arith.select %cond_266, %ret_267, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc260) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc261) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc263) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc251) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc252) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc253) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc254) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc255) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc258) + %ret_301 = arith.select %cond_299, %ret_300, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc260) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc261) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc263) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc251) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc252) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc253) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc254) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc255) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc258) + %ret_334 = arith.select %cond_332, %ret_333, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc260) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc261) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc263) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc251) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc252) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc253) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc254) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc255) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc261) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc263) + %tmp14 = arith.cmpi eq, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc182) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc218) + %y_369 = tt.reshape %tmp16 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_370 = arith.muli %y_369, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_373 = tt.broadcast %ileft_372 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_374 = arith.muli %y_369, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_377 = tt.broadcast %iright_376 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_378 = tt.reshape %ileft_373 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_379 = tt.reshape %iright_377 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %cond_380 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc275) + %eq_381 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc276) + %cond_382 = arith.andi %eq_381, %cond_42 : tensor<1x16xi1> loc(#loc277) + %cond_383 = arith.ori %cond_380, %cond_382 : tensor<1x16xi1> loc(#loc278) + %cond_384 = arith.extui %cond_383 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_385 = arith.xori %cond_384, %flip_18 : tensor<1x16xi32> loc(#loc279) + %cond_386 = arith.cmpi ne, %cond_385, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_387 = arith.xori %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc281) + %ret_388 = arith.select %cond_386, %ret_387, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_389 = arith.xori %tmp16, %ret_388 : tensor<1x16xi32> loc(#loc283) + %new_idxs_390 = arith.select %cond_386, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_391 = arith.xori %new_idxs_51, %new_idxs_390 : tensor<1x16xi32> loc(#loc285) + %y_392 = tt.reshape %ret_389 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_393 = arith.muli %y_392, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_394 = "tt.reduce"(%ileft_393) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_395 = tt.expand_dims %ileft_394 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_396 = tt.broadcast %ileft_395 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_397 = arith.muli %y_392, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_398 = "tt.reduce"(%iright_397) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_399 = tt.expand_dims %iright_398 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_400 = tt.broadcast %iright_399 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_401 = tt.reshape %ileft_396 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_402 = tt.reshape %iright_400 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_403 = tt.reshape %new_idxs_391 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_404 = arith.muli %y_idx_403, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_405 = "tt.reduce"(%left_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_406 = tt.expand_dims %left_idx_405 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_407 = tt.broadcast %left_idx_406 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_408 = arith.muli %y_idx_403, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_409 = "tt.reduce"(%right_idx_408) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_410 = tt.expand_dims %right_idx_409 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_411 = tt.broadcast %right_idx_410 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_412 = tt.reshape %left_idx_407 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_413 = tt.reshape %right_idx_411 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_414 = arith.cmpi slt, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc275) + %eq_415 = arith.cmpi eq, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc276) + %cond_416 = arith.cmpi sgt, %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc297) + %cond_417 = arith.andi %eq_415, %cond_416 : tensor<1x16xi1> loc(#loc277) + %cond_418 = arith.ori %cond_414, %cond_417 : tensor<1x16xi1> loc(#loc278) + %cond_419 = arith.extui %cond_418 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_420 = arith.xori %cond_419, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_421 = arith.cmpi ne, %cond_420, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_422 = arith.xori %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc281) + %ret_423 = arith.select %cond_421, %ret_422, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_424 = arith.xori %ret_389, %ret_423 : tensor<1x16xi32> loc(#loc283) + %new_idxs_425 = arith.xori %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc298) + %new_idxs_426 = arith.select %cond_421, %new_idxs_425, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_427 = arith.xori %new_idxs_391, %new_idxs_426 : tensor<1x16xi32> loc(#loc285) + %y_428 = tt.reshape %ret_424 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_429 = arith.muli %y_428, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_430 = "tt.reduce"(%ileft_429) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_431 = tt.expand_dims %ileft_430 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_432 = tt.broadcast %ileft_431 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_433 = arith.muli %y_428, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_434 = "tt.reduce"(%iright_433) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_435 = tt.expand_dims %iright_434 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_436 = tt.broadcast %iright_435 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_437 = tt.reshape %ileft_432 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_438 = tt.reshape %iright_436 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_439 = tt.reshape %new_idxs_427 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_440 = arith.muli %y_idx_439, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_441 = "tt.reduce"(%left_idx_440) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_442 = tt.expand_dims %left_idx_441 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_443 = tt.broadcast %left_idx_442 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_444 = arith.muli %y_idx_439, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_445 = "tt.reduce"(%right_idx_444) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_446 = tt.expand_dims %right_idx_445 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_447 = tt.broadcast %right_idx_446 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_448 = tt.reshape %left_idx_443 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_449 = tt.reshape %right_idx_447 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_450 = arith.cmpi slt, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc275) + %eq_451 = arith.cmpi eq, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc276) + %cond_452 = arith.cmpi sgt, %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc297) + %cond_453 = arith.andi %eq_451, %cond_452 : tensor<1x16xi1> loc(#loc277) + %cond_454 = arith.ori %cond_450, %cond_453 : tensor<1x16xi1> loc(#loc278) + %cond_455 = arith.extui %cond_454 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_456 = arith.xori %cond_455, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_457 = arith.cmpi ne, %cond_456, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_458 = arith.xori %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc281) + %ret_459 = arith.select %cond_457, %ret_458, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_460 = arith.xori %ret_424, %ret_459 : tensor<1x16xi32> loc(#loc283) + %new_idxs_461 = arith.xori %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc298) + %new_idxs_462 = arith.select %cond_457, %new_idxs_461, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_463 = arith.xori %new_idxs_427, %new_idxs_462 : tensor<1x16xi32> loc(#loc285) + %y_464 = tt.reshape %ret_460 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_465 = arith.muli %y_464, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_466 = "tt.reduce"(%ileft_465) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_467 = tt.expand_dims %ileft_466 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_468 = tt.broadcast %ileft_467 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_469 = arith.muli %y_464, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_470 = "tt.reduce"(%iright_469) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_471 = tt.expand_dims %iright_470 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_472 = tt.broadcast %iright_471 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_473 = tt.reshape %ileft_468 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_474 = tt.reshape %iright_472 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_475 = tt.reshape %new_idxs_463 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_476 = arith.muli %y_idx_475, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_477 = "tt.reduce"(%left_idx_476) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_478 = tt.expand_dims %left_idx_477 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_479 = tt.broadcast %left_idx_478 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_480 = arith.muli %y_idx_475, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_481 = "tt.reduce"(%right_idx_480) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_482 = tt.expand_dims %right_idx_481 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_483 = tt.broadcast %right_idx_482 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_484 = tt.reshape %left_idx_479 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_485 = tt.reshape %right_idx_483 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_486 = arith.cmpi slt, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc275) + %eq_487 = arith.cmpi eq, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc276) + %cond_488 = arith.cmpi sgt, %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc297) + %cond_489 = arith.andi %eq_487, %cond_488 : tensor<1x16xi1> loc(#loc277) + %cond_490 = arith.ori %cond_486, %cond_489 : tensor<1x16xi1> loc(#loc278) + %cond_491 = arith.extui %cond_490 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_492 = arith.xori %cond_491, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_493 = arith.cmpi ne, %cond_492, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_494 = arith.xori %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc281) + %ret_495 = arith.select %cond_493, %ret_494, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_496 = arith.xori %ret_460, %ret_495 : tensor<1x16xi32> loc(#loc283) + %new_idxs_497 = arith.xori %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc298) + %new_idxs_498 = arith.select %cond_493, %new_idxs_497, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_499 = arith.xori %new_idxs_463, %new_idxs_498 : tensor<1x16xi32> loc(#loc285) + %y_500 = tt.reshape %ret_496 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_501 = arith.muli %y_500, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_502 = "tt.reduce"(%ileft_501) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_503 = tt.expand_dims %ileft_502 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_504 = tt.broadcast %ileft_503 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_505 = arith.muli %y_500, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_506 = "tt.reduce"(%iright_505) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_507 = tt.expand_dims %iright_506 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_508 = tt.broadcast %iright_507 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_509 = tt.reshape %ileft_504 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_510 = tt.reshape %iright_508 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_511 = tt.reshape %new_idxs_499 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_512 = arith.muli %y_idx_511, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_513 = "tt.reduce"(%left_idx_512) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_514 = tt.expand_dims %left_idx_513 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_515 = tt.broadcast %left_idx_514 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_516 = arith.muli %y_idx_511, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_517 = "tt.reduce"(%right_idx_516) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_518 = tt.expand_dims %right_idx_517 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_519 = tt.broadcast %right_idx_518 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_520 = tt.reshape %left_idx_515 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_521 = tt.reshape %right_idx_519 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_522 = arith.cmpi slt, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc275) + %eq_523 = arith.cmpi eq, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc276) + %cond_524 = arith.cmpi sgt, %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc297) + %cond_525 = arith.andi %eq_523, %cond_524 : tensor<1x16xi1> loc(#loc277) + %cond_526 = arith.ori %cond_522, %cond_525 : tensor<1x16xi1> loc(#loc278) + %cond_527 = arith.extui %cond_526 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_528 = arith.xori %cond_527, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_529 = arith.cmpi ne, %cond_528, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_530 = arith.xori %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc281) + %ret_531 = arith.select %cond_529, %ret_530, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_532 = arith.xori %ret_496, %ret_531 : tensor<1x16xi32> loc(#loc283) + %new_idxs_533 = arith.xori %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc298) + %new_idxs_534 = arith.select %cond_529, %new_idxs_533, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_535 = arith.xori %new_idxs_499, %new_idxs_534 : tensor<1x16xi32> loc(#loc285) + %y_536 = tt.reshape %ret_532 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_537 = arith.muli %y_536, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_538 = "tt.reduce"(%ileft_537) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_539 = tt.expand_dims %ileft_538 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_540 = tt.broadcast %ileft_539 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_541 = arith.muli %y_536, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_542 = "tt.reduce"(%iright_541) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_543 = tt.expand_dims %iright_542 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_544 = tt.broadcast %iright_543 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_545 = tt.reshape %ileft_540 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_546 = tt.reshape %iright_544 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_547 = tt.reshape %new_idxs_535 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_548 = arith.muli %y_idx_547, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_549 = "tt.reduce"(%left_idx_548) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_550 = tt.expand_dims %left_idx_549 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_551 = tt.broadcast %left_idx_550 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_552 = arith.muli %y_idx_547, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_553 = "tt.reduce"(%right_idx_552) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_554 = tt.expand_dims %right_idx_553 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_555 = tt.broadcast %right_idx_554 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_556 = tt.reshape %left_idx_551 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_557 = tt.reshape %right_idx_555 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_558 = arith.cmpi slt, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc275) + %eq_559 = arith.cmpi eq, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc276) + %cond_560 = arith.cmpi sgt, %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc297) + %cond_561 = arith.andi %eq_559, %cond_560 : tensor<1x16xi1> loc(#loc277) + %cond_562 = arith.ori %cond_558, %cond_561 : tensor<1x16xi1> loc(#loc278) + %cond_563 = arith.extui %cond_562 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_564 = arith.xori %cond_563, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_565 = arith.cmpi ne, %cond_564, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_566 = arith.xori %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc281) + %ret_567 = arith.select %cond_565, %ret_566, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_568 = arith.xori %ret_532, %ret_567 : tensor<1x16xi32> loc(#loc283) + %new_idxs_569 = arith.xori %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc298) + %new_idxs_570 = arith.select %cond_565, %new_idxs_569, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_571 = arith.xori %new_idxs_535, %new_idxs_570 : tensor<1x16xi32> loc(#loc285) + %y_572 = tt.reshape %ret_568 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc264) + %ileft_573 = arith.muli %y_572, %ileft_240 : tensor<1x2x8xi32> loc(#loc265) + %ileft_574 = "tt.reduce"(%ileft_573) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc311) + %ileft_575 = tt.expand_dims %ileft_574 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc267) + %ileft_576 = tt.broadcast %ileft_575 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc268) + %iright_577 = arith.muli %y_572, %flip_128 : tensor<1x2x8xi32> loc(#loc269) + %iright_578 = "tt.reduce"(%iright_577) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc313) + %iright_579 = tt.expand_dims %iright_578 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc271) + %iright_580 = tt.broadcast %iright_579 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc272) + %ileft_581 = tt.reshape %ileft_576 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_582 = tt.reshape %iright_580 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_583 = tt.reshape %new_idxs_571 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc286) + %left_idx_584 = arith.muli %y_idx_583, %ileft_240 : tensor<1x2x8xi32> loc(#loc287) + %left_idx_585 = "tt.reduce"(%left_idx_584) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc315) + %left_idx_586 = tt.expand_dims %left_idx_585 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %left_idx_587 = tt.broadcast %left_idx_586 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %right_idx_588 = arith.muli %y_idx_583, %flip_128 : tensor<1x2x8xi32> loc(#loc291) + %right_idx_589 = "tt.reduce"(%right_idx_588) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc317) + %right_idx_590 = tt.expand_dims %right_idx_589 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc293) + %right_idx_591 = tt.broadcast %right_idx_590 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc294) + %left_idx_592 = tt.reshape %left_idx_587 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_593 = tt.reshape %right_idx_591 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_594 = arith.cmpi slt, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc275) + %eq_595 = arith.cmpi eq, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc276) + %cond_596 = arith.cmpi sgt, %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc297) + %cond_597 = arith.andi %eq_595, %cond_596 : tensor<1x16xi1> loc(#loc277) + %cond_598 = arith.ori %cond_594, %cond_597 : tensor<1x16xi1> loc(#loc278) + %ret_599 = arith.xori %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc281) + %ret_600 = arith.select %cond_598, %ret_599, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_601 = arith.xori %ret_568, %ret_600 : tensor<1x16xi32> loc(#loc283) + %new_idxs_602 = arith.xori %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc298) + %new_idxs_603 = arith.select %cond_598, %new_idxs_602, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_604 = arith.xori %new_idxs_571, %new_idxs_603 : tensor<1x16xi32> loc(#loc285) + %y_605 = tt.reshape %ret_601 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_606 = arith.muli %y_605, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_607 = "tt.reduce"(%ileft_606) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_608 = tt.expand_dims %ileft_607 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_609 = tt.broadcast %ileft_608 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_610 = arith.muli %y_605, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_611 = "tt.reduce"(%iright_610) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_612 = tt.expand_dims %iright_611 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_613 = tt.broadcast %iright_612 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_614 = tt.reshape %ileft_609 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_615 = tt.reshape %iright_613 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_616 = tt.reshape %new_idxs_604 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_617 = arith.muli %y_idx_616, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_618 = "tt.reduce"(%left_idx_617) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_619 = tt.expand_dims %left_idx_618 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_620 = tt.broadcast %left_idx_619 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_621 = arith.muli %y_idx_616, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_622 = "tt.reduce"(%right_idx_621) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_623 = tt.expand_dims %right_idx_622 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_624 = tt.broadcast %right_idx_623 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_625 = tt.reshape %left_idx_620 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_626 = tt.reshape %right_idx_624 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_627 = arith.cmpi slt, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc275) + %eq_628 = arith.cmpi eq, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc276) + %cond_629 = arith.cmpi sgt, %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc297) + %cond_630 = arith.andi %eq_628, %cond_629 : tensor<1x16xi1> loc(#loc277) + %cond_631 = arith.ori %cond_627, %cond_630 : tensor<1x16xi1> loc(#loc278) + %ret_632 = arith.xori %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc281) + %ret_633 = arith.select %cond_631, %ret_632, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_634 = arith.xori %ret_601, %ret_633 : tensor<1x16xi32> loc(#loc283) + %new_idxs_635 = arith.xori %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc298) + %new_idxs_636 = arith.select %cond_631, %new_idxs_635, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_637 = arith.xori %new_idxs_604, %new_idxs_636 : tensor<1x16xi32> loc(#loc285) + %y_638 = tt.reshape %ret_634 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_639 = arith.muli %y_638, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_640 = "tt.reduce"(%ileft_639) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_641 = tt.expand_dims %ileft_640 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_642 = tt.broadcast %ileft_641 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_643 = arith.muli %y_638, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_644 = "tt.reduce"(%iright_643) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_645 = tt.expand_dims %iright_644 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_646 = tt.broadcast %iright_645 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_647 = tt.reshape %ileft_642 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_648 = tt.reshape %iright_646 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_649 = tt.reshape %new_idxs_637 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_650 = arith.muli %y_idx_649, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_651 = "tt.reduce"(%left_idx_650) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_652 = tt.expand_dims %left_idx_651 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_653 = tt.broadcast %left_idx_652 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_654 = arith.muli %y_idx_649, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_655 = "tt.reduce"(%right_idx_654) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_656 = tt.expand_dims %right_idx_655 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_657 = tt.broadcast %right_idx_656 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_658 = tt.reshape %left_idx_653 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_659 = tt.reshape %right_idx_657 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_660 = arith.cmpi slt, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc275) + %eq_661 = arith.cmpi eq, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc276) + %cond_662 = arith.cmpi sgt, %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc297) + %cond_663 = arith.andi %eq_661, %cond_662 : tensor<1x16xi1> loc(#loc277) + %cond_664 = arith.ori %cond_660, %cond_663 : tensor<1x16xi1> loc(#loc278) + %ret_665 = arith.xori %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc281) + %ret_666 = arith.select %cond_664, %ret_665, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_667 = arith.xori %ret_634, %ret_666 : tensor<1x16xi32> loc(#loc283) + %new_idxs_668 = arith.xori %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc298) + %new_idxs_669 = arith.select %cond_664, %new_idxs_668, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_670 = arith.xori %new_idxs_637, %new_idxs_669 : tensor<1x16xi32> loc(#loc285) + %y_671 = tt.reshape %ret_667 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_672 = arith.muli %y_671, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_673 = "tt.reduce"(%ileft_672) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_674 = tt.expand_dims %ileft_673 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_675 = tt.broadcast %ileft_674 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_676 = arith.muli %y_671, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_677 = "tt.reduce"(%iright_676) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_678 = tt.expand_dims %iright_677 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_679 = tt.broadcast %iright_678 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_680 = tt.reshape %ileft_675 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_681 = tt.reshape %iright_679 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_682 = tt.reshape %new_idxs_670 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_683 = arith.muli %y_idx_682, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_684 = "tt.reduce"(%left_idx_683) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_685 = tt.expand_dims %left_idx_684 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_686 = tt.broadcast %left_idx_685 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_687 = arith.muli %y_idx_682, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_688 = "tt.reduce"(%right_idx_687) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_689 = tt.expand_dims %right_idx_688 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_690 = tt.broadcast %right_idx_689 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_691 = tt.reshape %left_idx_686 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_692 = tt.reshape %right_idx_690 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_693 = arith.cmpi slt, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc275) + %eq_694 = arith.cmpi eq, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc276) + %cond_695 = arith.cmpi sgt, %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc297) + %cond_696 = arith.andi %eq_694, %cond_695 : tensor<1x16xi1> loc(#loc277) + %cond_697 = arith.ori %cond_693, %cond_696 : tensor<1x16xi1> loc(#loc278) + %new_idxs_698 = arith.xori %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc298) + %new_idxs_699 = arith.select %cond_697, %new_idxs_698, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_700 = arith.xori %new_idxs_670, %new_idxs_699 : tensor<1x16xi32> loc(#loc285) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc220) + %tmp23 = arith.select %tmp0_13, %tmp20, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc187) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_705: i64 loc(callsite(#loc1 at #loc188)), %tmp24_706: i64 loc(callsite(#loc1 at #loc188))): + %tmp24_707 = arith.addi %tmp24_705, %tmp24_706 : i64 loc(#loc299) + tt.reduce.return %tmp24_707 : i64 loc(#loc221) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc221) + %tmp24_701 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc189) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc223) + %tmp28 = arith.select %tmp0_13, %tmp25, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc191) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_705: i64 loc(callsite(#loc1 at #loc192)), %tmp29_706: i64 loc(callsite(#loc1 at #loc192))): + %tmp29_707 = arith.addi %tmp29_705, %tmp29_706 : i64 loc(#loc300) + tt.reduce.return %tmp29_707 : i64 loc(#loc224) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc224) + %tmp29_702 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc193) + %tmp30 = arith.trunci %tmp24_701 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc194) + %tmp31 = arith.trunci %tmp29_702 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc195) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc196) + %tmp34_703 = arith.cmpi slt, %r0_index_8, %tmp34 : tensor<1x16xi32> loc(#loc196) + %tmp36 = arith.select %tmp34_703, %new_idxs_368, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc197) + %tmp38 = arith.addi %tmp36, %cst_2 : tensor<1x16xi32> loc(#loc198) + %tmp39 = arith.cmpi slt, %tmp36, %cst_1 : tensor<1x16xi32> loc(#loc199) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc200) + %0 = arith.cmpi sge, %tmp40, %cst_1 : tensor<1x16xi32> loc(#loc83) + %1 = arith.cmpi slt, %tmp40, %cst_2 : tensor<1x16xi32> loc(#loc84) + %2 = arith.andi %0, %1 : tensor<1x16xi1> loc(#loc85) + %3 = arith.xori %xmask_6, %true : i1 loc(#loc86) + %4 = tt.splat %3 : i1 -> tensor<1x16xi1> loc(#loc201) + %5 = arith.ori %2, %4 : tensor<1x16xi1> loc(#loc87) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc88) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc202) + %tmp45_704 = arith.cmpi slt, %r0_index_8, %tmp45 : tensor<1x16xi32> loc(#loc202) + %tmp46 = arith.select %tmp45_704, %new_idxs_700, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc203) + %tmp47 = arith.addi %tmp46, %cst_2 : tensor<1x16xi32> loc(#loc204) + %tmp48 = arith.cmpi slt, %tmp46, %cst_1 : tensor<1x16xi32> loc(#loc205) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc206) + %6 = arith.cmpi sge, %tmp49, %cst_1 : tensor<1x16xi32> loc(#loc94) + %7 = arith.cmpi slt, %tmp49, %cst_2 : tensor<1x16xi32> loc(#loc95) + %8 = arith.andi %6, %7 : tensor<1x16xi1> loc(#loc96) + %9 = arith.ori %8, %4 : tensor<1x16xi1> loc(#loc97) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc98) + %10 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc99) + %11 = tt.splat %10 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc99) + tt.store %11, %tmp30, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc100) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc101) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc101) + tt.store %13, %tmp31, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc102) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc103) + %15 = tt.addptr %14, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc103) + tt.store %15, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc104) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc105) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32> loc(#loc207) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32> loc(#loc106) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc107) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc107) + tt.store %20, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc108) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc109) + %22 = tt.addptr %21, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc109) + tt.store %22, %new_idxs_700, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc110) + %23 = arith.addi %tmp49, %17 : tensor<1x16xi32> loc(#loc111) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc112) + %25 = tt.addptr %24, %23 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc112) + tt.store %25, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc113) + tt.return loc(#loc114) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":27:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:30) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":34:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":36:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":38:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":39:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":41:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":40:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":43:19) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":47:20) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":49:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":52:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":54:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":55:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":56:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":58:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":59:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":60:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":61:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":64:19) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":66:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":68:20) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":69:20) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":70:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:28) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:46) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:38) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:55) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:53) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":71:63) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":75:19) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":76:35) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":77:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":78:20) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":79:35) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:28) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:46) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:38) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:53) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":80:63) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":81:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":82:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":83:47) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:52) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:49) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":84:85) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":85:47) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/rk/crkxpwiwhzkvun7i5d2pegofthyfijn5wygwnaev3twwlrbuojqe.py":86:4) +#loc124 = loc("xmask"(#loc2)) +#loc125 = loc("xoffset"(#loc3)) +#loc126 = loc("r0_index"(#loc4)) +#loc127 = loc("r0_index"(#loc5)) +#loc128 = loc("tmp0"(#loc6)) +#loc129 = loc("tmp0"(#loc7)) +#loc130 = loc("tmp0"(#loc8)) +#loc131 = loc("tmp0"(#loc9)) +#loc132 = loc("tmp2"(#loc10)) +#loc133 = loc("tmp4"(#loc11)) +#loc134 = loc("tmp5"(#loc12)) +#loc135 = loc("tmp7"(#loc13)) +#loc136 = loc("tmp6"(#loc14)) +#loc137 = loc("tmp9"(#loc15)) +#loc138 = loc("flip"(#loc16)) +#loc140 = loc("flip"(#loc19)) +#loc141 = loc("flip"(#loc20)) +#loc142 = loc("flip"(#loc21)) +#loc143 = loc("y"(#loc22)) +#loc144 = loc("left_mask"(#loc24)) +#loc145 = loc("ileft"(#loc25)) +#loc147 = loc("ileft"(#loc29)) +#loc148 = loc("ileft"(#loc30)) +#loc149 = loc("iright"(#loc31)) +#loc151 = loc("iright"(#loc33)) +#loc152 = loc("iright"(#loc34)) +#loc153 = loc("ileft"(#loc35)) +#loc154 = loc("iright"(#loc36)) +#loc155 = loc("y_idx"(#loc37)) +#loc156 = loc("left_idx"(#loc38)) +#loc157 = loc("left_idx"(#loc39)) +#loc158 = loc("input"(#loc40)) +#loc160 = loc("left_idx"(#loc42)) +#loc161 = loc("left_idx"(#loc43)) +#loc162 = loc("right_idx"(#loc44)) +#loc163 = loc("right_idx"(#loc45)) +#loc165 = loc("right_idx"(#loc47)) +#loc166 = loc("right_idx"(#loc48)) +#loc167 = loc("left_idx"(#loc49)) +#loc168 = loc("right_idx"(#loc50)) +#loc169 = loc("cond"(#loc51)) +#loc170 = loc("eq"(#loc52)) +#loc171 = loc("cond"(#loc53)) +#loc172 = loc("cond"(#loc54)) +#loc173 = loc("cond"(#loc55)) +#loc174 = loc("cond"(#loc56)) +#loc175 = loc("cond"(#loc57)) +#loc176 = loc("ret"(#loc58)) +#loc177 = loc("ret"(#loc59)) +#loc178 = loc("ret"(#loc60)) +#loc179 = loc("new_idxs"(#loc61)) +#loc180 = loc("new_idxs"(#loc62)) +#loc181 = loc("new_idxs"(#loc63)) +#loc182 = loc("tmp14"(#loc64)) +#loc183 = loc("tmp16"(#loc65)) +#loc184 = loc("tmp15"(#loc66)) +#loc186 = loc("tmp20"(#loc68)) +#loc187 = loc("tmp23"(#loc69)) +#loc189 = loc("tmp24"(#loc71)) +#loc190 = loc("tmp25"(#loc72)) +#loc191 = loc("tmp28"(#loc73)) +#loc193 = loc("tmp29"(#loc75)) +#loc194 = loc("tmp30"(#loc76)) +#loc195 = loc("tmp31"(#loc77)) +#loc196 = loc("tmp34"(#loc78)) +#loc197 = loc("tmp36"(#loc79)) +#loc198 = loc("tmp38"(#loc80)) +#loc199 = loc("tmp39"(#loc81)) +#loc200 = loc("tmp40"(#loc82)) +#loc201 = loc(fused[#loc87, #loc86]) +#loc202 = loc("tmp45"(#loc89)) +#loc203 = loc("tmp46"(#loc90)) +#loc204 = loc("tmp47"(#loc91)) +#loc205 = loc("tmp48"(#loc92)) +#loc206 = loc("tmp49"(#loc93)) +#loc207 = loc(fused[#loc106, #loc105]) +#loc208 = loc(fused[#loc129, #loc128]) +#loc209 = loc(fused[#loc131, #loc124]) +#loc210 = loc(fused[#loc135, #loc136]) +#loc211 = loc(callsite(#loc138 at #loc139)) +#loc212 = loc(callsite(#loc140 at #loc139)) +#loc213 = loc(callsite(#loc141 at #loc139)) +#loc214 = loc(callsite(#loc142 at #loc139)) +#loc216 = loc("cond"(#loc169)) +#loc217 = loc("eq"(#loc170)) +#loc218 = loc(fused[#loc183, #loc184]) +#loc220 = loc(fused[#loc186, #loc135, #loc136]) +#loc221 = loc(callsite(#loc26 at #loc188)) +#loc223 = loc(fused[#loc190, #loc183, #loc184]) +#loc224 = loc(callsite(#loc26 at #loc192)) +#loc226 = loc(callsite(#loc143 at #loc215)) +#loc227 = loc(callsite(#loc144 at #loc215)) +#loc228 = loc(callsite(#loc145 at #loc215)) +#loc230 = loc(callsite(#loc147 at #loc215)) +#loc231 = loc(callsite(#loc148 at #loc215)) +#loc232 = loc(callsite(#loc149 at #loc215)) +#loc234 = loc(callsite(#loc151 at #loc215)) +#loc235 = loc(callsite(#loc152 at #loc215)) +#loc236 = loc(callsite(#loc153 at #loc215)) +#loc237 = loc(callsite(#loc154 at #loc215)) +#loc238 = loc(callsite(#loc155 at #loc215)) +#loc239 = loc(callsite(#loc156 at #loc215)) +#loc240 = loc(callsite(#loc157 at #loc215)) +#loc242 = loc(callsite(#loc160 at #loc215)) +#loc243 = loc(callsite(#loc161 at #loc215)) +#loc244 = loc(callsite(#loc162 at #loc215)) +#loc245 = loc(callsite(#loc163 at #loc215)) +#loc247 = loc(callsite(#loc165 at #loc215)) +#loc248 = loc(callsite(#loc166 at #loc215)) +#loc249 = loc(callsite(#loc167 at #loc215)) +#loc250 = loc(callsite(#loc168 at #loc215)) +#loc251 = loc(callsite(#loc216 at #loc215)) +#loc252 = loc(callsite(#loc217 at #loc215)) +#loc253 = loc(callsite(#loc171 at #loc215)) +#loc254 = loc(callsite(#loc172 at #loc215)) +#loc255 = loc(callsite(#loc173 at #loc215)) +#loc256 = loc(callsite(#loc174 at #loc215)) +#loc257 = loc(callsite(#loc175 at #loc215)) +#loc258 = loc(callsite(#loc176 at #loc215)) +#loc259 = loc(callsite(#loc177 at #loc215)) +#loc260 = loc(callsite(#loc178 at #loc215)) +#loc261 = loc(callsite(#loc179 at #loc215)) +#loc262 = loc(callsite(#loc180 at #loc215)) +#loc263 = loc(callsite(#loc181 at #loc215)) +#loc264 = loc(callsite(#loc143 at #loc219)) +#loc265 = loc(callsite(#loc145 at #loc219)) +#loc267 = loc(callsite(#loc147 at #loc219)) +#loc268 = loc(callsite(#loc148 at #loc219)) +#loc269 = loc(callsite(#loc149 at #loc219)) +#loc271 = loc(callsite(#loc151 at #loc219)) +#loc272 = loc(callsite(#loc152 at #loc219)) +#loc273 = loc(callsite(#loc153 at #loc219)) +#loc274 = loc(callsite(#loc154 at #loc219)) +#loc275 = loc(callsite(#loc216 at #loc219)) +#loc276 = loc(callsite(#loc217 at #loc219)) +#loc277 = loc(callsite(#loc172 at #loc219)) +#loc278 = loc(callsite(#loc173 at #loc219)) +#loc279 = loc(callsite(#loc174 at #loc219)) +#loc280 = loc(callsite(#loc175 at #loc219)) +#loc281 = loc(callsite(#loc176 at #loc219)) +#loc282 = loc(callsite(#loc177 at #loc219)) +#loc283 = loc(callsite(#loc178 at #loc219)) +#loc284 = loc(callsite(#loc180 at #loc219)) +#loc285 = loc(callsite(#loc181 at #loc219)) +#loc286 = loc(callsite(#loc155 at #loc219)) +#loc287 = loc(callsite(#loc157 at #loc219)) +#loc289 = loc(callsite(#loc160 at #loc219)) +#loc290 = loc(callsite(#loc161 at #loc219)) +#loc291 = loc(callsite(#loc163 at #loc219)) +#loc293 = loc(callsite(#loc165 at #loc219)) +#loc294 = loc(callsite(#loc166 at #loc219)) +#loc295 = loc(callsite(#loc167 at #loc219)) +#loc296 = loc(callsite(#loc168 at #loc219)) +#loc297 = loc(callsite(#loc171 at #loc219)) +#loc298 = loc(callsite(#loc179 at #loc219)) +#loc299 = loc(callsite(#loc28 at #loc221)) +#loc300 = loc(callsite(#loc28 at #loc224)) +#loc301 = loc(callsite(#loc26 at #loc229)) +#loc303 = loc(callsite(#loc26 at #loc233)) +#loc305 = loc(callsite(#loc158 at #loc241)) +#loc306 = loc(callsite(#loc26 at #loc241)) +#loc308 = loc(callsite(#loc158 at #loc246)) +#loc309 = loc(callsite(#loc26 at #loc246)) +#loc311 = loc(callsite(#loc26 at #loc266)) +#loc313 = loc(callsite(#loc26 at #loc270)) +#loc315 = loc(callsite(#loc26 at #loc288)) +#loc317 = loc(callsite(#loc26 at #loc292)) +#loc319 = loc(callsite(#loc28 at #loc301)) +#loc320 = loc(callsite(#loc28 at #loc303)) +#loc321 = loc(callsite(#loc28 at #loc306)) +#loc322 = loc(callsite(#loc28 at #loc309)) +#loc323 = loc(callsite(#loc28 at #loc311)) +#loc324 = loc(callsite(#loc28 at #loc313)) +#loc325 = loc(callsite(#loc28 at #loc315)) +#loc326 = loc(callsite(#loc28 at #loc317)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9d4e6847fbda86b6d16d0cc9117746593a1c911b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/MRV2MIXSABLF33T7TRM5FUUZYRR2HVY4NSXB3BARCQ2ZKCXL2V4Q/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<1024xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1024xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c1024_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<1024xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<1024xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<1024xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1024xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<1024xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<1024xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<1024xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<1024xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<1024xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<1024xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<1024xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<1024xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<1024xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<1024xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<1024xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<1024xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<1024xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<1024xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<1024xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<1024xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<1024xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<1024xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<1024xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<1024xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<1024xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<1024xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<1024xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<1024xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<1024xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<1024xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<1024xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<1024xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<1024xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<1024xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<1024xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<1024xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<1024xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<1024xi1, #blocked>, tensor<1024xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<1024xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<1024xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<1024xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<1024xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<1024xi1, #blocked>, tensor<1024xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<1024xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<1024xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<1024xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<1024xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<1024xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<1024xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<1024xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<1024xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<1024x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<1024xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<1024xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<1024x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a9465acd6e3ab98c0f584c337181d28e5134007e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..ce628c07dc044100d78a1a4be146a3fc54209f04 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a8884b7445f0d8904819766b963bdaeaed6d5933 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "6eb6289585d11f6b33b4460a9035d6921b7db5ca516b4a0a583d17088d2d813a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..da827d1e0783ed7a307d4895df900765ace8fce7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.llir @@ -0,0 +1,391 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 3, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 448, !dbg !9 + %11 = lshr exact i32 %10, 6, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 2, !dbg !11 + %14 = and i32 %13, 252, !dbg !11 + %15 = sdiv i32 %12, 2048, !dbg !12 + %16 = mul i32 %12, 32000 + %17 = mul i32 %15, 224000 + %18 = add i32 %17, %16 + %19 = zext nneg i32 %14 to i64, !dbg !13 + br label %20, !dbg !13 + +20: ; preds = %6, %20 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ] + %21 = phi i32 [ 2147483647, %6 ], [ %97, %20 ] + %22 = phi i32 [ 2147483647, %6 ], [ %99, %20 ] + %23 = phi float [ 0xFFF0000000000000, %6 ], [ %89, %20 ] + %24 = phi float [ 0xFFF0000000000000, %6 ], [ %90, %20 ] + %25 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %88, %20 ] + %26 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %95, %20 ] + %27 = or disjoint i64 %indvars.iv, %19, !dbg !14 + %28 = or disjoint i64 %27, 2, !dbg !14 + %29 = or disjoint i64 %27, 3, !dbg !14 + %30 = trunc nuw nsw i64 %27 to i32, !dbg !15 + %31 = add i32 %18, %30, !dbg !15 + %32 = sext i32 %31 to i64, !dbg !16 + %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !16 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %35 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %33, i64 %34, i1 true) #4, !dbg !17 + %36 = extractvalue { i32, i32, i32, i32 } %35, 0, !dbg !17 + %37 = extractvalue { i32, i32, i32, i32 } %35, 1, !dbg !17 + %38 = extractvalue { i32, i32, i32, i32 } %35, 2, !dbg !17 + %39 = extractvalue { i32, i32, i32, i32 } %35, 3, !dbg !17 + %40 = bitcast i32 %38 to float, !dbg !17 + %41 = bitcast i32 %39 to float, !dbg !17 + %42 = fcmp ogt float %23, %40, !dbg !18 + %43 = fcmp ogt float %24, %41, !dbg !18 + %44 = fcmp oeq float %23, %40, !dbg !22 + %45 = fcmp oeq float %24, %41, !dbg !22 + %46 = fcmp uno <2 x float> %25, zeroinitializer, !dbg !23 + %47 = fcmp uno float %23, 0.000000e+00, !dbg !23 + %48 = fcmp uno float %24, 0.000000e+00, !dbg !23 + %49 = fcmp uno float %40, 0.000000e+00, !dbg !24 + %50 = fcmp uno float %41, 0.000000e+00, !dbg !24 + %51 = xor i1 %49, true, !dbg !25 + %52 = xor i1 %50, true, !dbg !25 + %53 = and i1 %47, %51, !dbg !26 + %54 = and i1 %48, %52, !dbg !26 + %55 = or i1 %42, %53, !dbg !27 + %56 = or i1 %43, %54, !dbg !27 + %57 = and i1 %47, %49, !dbg !28 + %58 = and i1 %48, %50, !dbg !28 + %59 = or i1 %44, %57, !dbg !29 + %60 = or i1 %45, %58, !dbg !29 + %61 = sext <2 x i32> %26 to <2 x i64>, !dbg !30 + %62 = sext i32 %21 to i64, !dbg !30 + %63 = icmp sgt i64 %28, %62, !dbg !30 + %64 = sext i32 %22 to i64, !dbg !30 + %65 = icmp sgt i64 %29, %64, !dbg !30 + %66 = and i1 %63, %59, !dbg !31 + %67 = and i1 %65, %60, !dbg !31 + %68 = insertelement <2 x i32> poison, i32 %36, i64 0, !dbg !17 + %69 = insertelement <2 x i32> %68, i32 %37, i64 1, !dbg !17 + %70 = bitcast <2 x i32> %69 to <2 x float>, !dbg !17 + %71 = fcmp ogt <2 x float> %25, %70, !dbg !18 + %72 = fcmp oeq <2 x float> %25, %70, !dbg !22 + %73 = fcmp uno <2 x float> %70, zeroinitializer, !dbg !24 + %74 = xor <2 x i1> %73, splat (i1 true), !dbg !25 + %75 = and <2 x i1> %46, %74, !dbg !26 + %76 = or <2 x i1> %71, %75, !dbg !27 + %77 = and <2 x i1> %46, %73, !dbg !28 + %78 = or <2 x i1> %72, %77, !dbg !29 + %79 = insertelement <2 x i64> poison, i64 %27, i64 0, !dbg !30 + %80 = shufflevector <2 x i64> %79, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !30 + %81 = icmp sgt <2 x i64> %80, %61, !dbg !30 + %82 = icmp sge <2 x i64> %80, %61, !dbg !30 + %83 = shufflevector <2 x i1> %81, <2 x i1> %82, <2 x i32> , !dbg !30 + %84 = and <2 x i1> %83, %78, !dbg !31 + %85 = or <2 x i1> %76, %84, !dbg !32 + %86 = or i1 %55, %66, !dbg !32 + %87 = or i1 %56, %67, !dbg !32 + %88 = select <2 x i1> %85, <2 x float> %25, <2 x float> %70, !dbg !33 + %89 = select i1 %86, float %23, float %40, !dbg !33 + %90 = select i1 %87, float %24, float %41, !dbg !33 + %91 = trunc nuw nsw i64 %27 to i32, !dbg !34 + %92 = or disjoint i32 %91, 1, !dbg !34 + %93 = insertelement <2 x i32> poison, i32 %30, i64 0, !dbg !34 + %94 = insertelement <2 x i32> %93, i32 %92, i64 1, !dbg !34 + %95 = select <2 x i1> %85, <2 x i32> %26, <2 x i32> %94, !dbg !34 + %96 = trunc nuw nsw i64 %28 to i32, !dbg !34 + %97 = select i1 %86, i32 %21, i32 %96, !dbg !34 + %98 = trunc nuw nsw i64 %29 to i32, !dbg !34 + %99 = select i1 %87, i32 %22, i32 %98, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 256, !dbg !13 + %100 = icmp samesign ult i64 %indvars.iv, 31744, !dbg !13 + br i1 %100, label %20, label %101, !dbg !13 + +101: ; preds = %20 + %102 = and i32 %9, 7, !dbg !9 + %103 = or disjoint i32 %8, %102, !dbg !10 + %104 = and i32 %9, 31, !dbg !9 + %105 = lshr i32 %9, 5, !dbg !9 + %106 = shufflevector <2 x float> %88, <2 x float> poison, <2 x i32> , !dbg !35 + %107 = fcmp ogt <2 x float> %88, %106, !dbg !35 + %108 = fcmp oeq <2 x float> %88, %106, !dbg !35 + %109 = shufflevector <2 x i1> %107, <2 x i1> %108, <2 x i32> , !dbg !35 + %110 = extractelement <2 x float> %88, i64 0, !dbg !37 + %111 = fcmp uno float %110, 0.000000e+00, !dbg !37 + %112 = extractelement <2 x float> %88, i64 1, !dbg !38 + %113 = fcmp uno float %112, 0.000000e+00, !dbg !38 + %114 = xor i1 %113, true, !dbg !39 + %115 = insertelement <2 x i1> poison, i1 %111, i64 0, !dbg !40 + %116 = shufflevector <2 x i1> %115, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %117 = insertelement <2 x i1> poison, i1 %114, i64 0, !dbg !40 + %118 = insertelement <2 x i1> %117, i1 %113, i64 1, !dbg !40 + %119 = and <2 x i1> %116, %118, !dbg !40 + %120 = or <2 x i1> %109, %119, !dbg !41 + %121 = extractelement <2 x i32> %95, i64 0, !dbg !42 + %122 = extractelement <2 x i32> %95, i64 1, !dbg !42 + %123 = icmp slt i32 %121, %122, !dbg !42 + %124 = extractelement <2 x i1> %120, i64 1, !dbg !43 + %125 = and i1 %123, %124, !dbg !43 + %126 = extractelement <2 x i1> %120, i64 0, !dbg !44 + %127 = or i1 %126, %125, !dbg !44 + %128 = select i1 %127, float %110, float %112, !dbg !45 + %129 = select i1 %127, i32 %121, i32 %122, !dbg !46 + %130 = fcmp ogt float %128, %89, !dbg !35 + %131 = fcmp oeq float %128, %89, !dbg !47 + %132 = fcmp uno float %128, 0.000000e+00, !dbg !37 + %133 = fcmp uno float %89, 0.000000e+00, !dbg !38 + %134 = xor i1 %133, true, !dbg !39 + %135 = and i1 %132, %134, !dbg !40 + %136 = or i1 %130, %135, !dbg !41 + %137 = and i1 %133, %132, !dbg !48 + %138 = or i1 %131, %137, !dbg !49 + %139 = icmp slt i32 %129, %97, !dbg !42 + %140 = and i1 %139, %138, !dbg !43 + %141 = or i1 %136, %140, !dbg !44 + %142 = select i1 %141, float %128, float %89, !dbg !45 + %143 = select i1 %141, i32 %129, i32 %97, !dbg !46 + %144 = fcmp ogt float %142, %90, !dbg !35 + %145 = fcmp oeq float %142, %90, !dbg !47 + %146 = fcmp uno float %142, 0.000000e+00, !dbg !37 + %147 = fcmp uno float %90, 0.000000e+00, !dbg !38 + %148 = xor i1 %147, true, !dbg !39 + %149 = and i1 %146, %148, !dbg !40 + %150 = or i1 %144, %149, !dbg !41 + %151 = and i1 %147, %146, !dbg !48 + %152 = or i1 %145, %151, !dbg !49 + %153 = icmp slt i32 %143, %99, !dbg !42 + %154 = and i1 %153, %152, !dbg !43 + %155 = or i1 %150, %154, !dbg !44 + %156 = select i1 %155, float %142, float %90, !dbg !45 + %157 = select i1 %155, i32 %143, i32 %99, !dbg !46 + %158 = bitcast float %156 to i32, !dbg !50 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !50 + %160 = bitcast i32 %159 to float, !dbg !50 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !50 + %162 = fcmp ogt float %156, %160, !dbg !35 + %163 = fcmp oeq float %156, %160, !dbg !47 + %164 = fcmp uno float %156, 0.000000e+00, !dbg !37 + %165 = fcmp uno float %160, 0.000000e+00, !dbg !38 + %166 = xor i1 %165, true, !dbg !39 + %167 = and i1 %164, %166, !dbg !40 + %168 = or i1 %162, %167, !dbg !41 + %169 = and i1 %164, %165, !dbg !48 + %170 = or i1 %163, %169, !dbg !49 + %171 = icmp slt i32 %157, %161, !dbg !42 + %172 = and i1 %171, %170, !dbg !43 + %173 = or i1 %168, %172, !dbg !44 + %174 = select i1 %173, float %156, float %160, !dbg !45 + %175 = select i1 %173, i32 %157, i32 %161, !dbg !46 + %176 = bitcast float %174 to i32, !dbg !50 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !50 + %178 = bitcast i32 %177 to float, !dbg !50 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 8, i32 31), !dbg !50 + %180 = fcmp ogt float %174, %178, !dbg !35 + %181 = fcmp oeq float %174, %178, !dbg !47 + %182 = fcmp uno float %174, 0.000000e+00, !dbg !37 + %183 = fcmp uno float %178, 0.000000e+00, !dbg !38 + %184 = xor i1 %183, true, !dbg !39 + %185 = and i1 %182, %184, !dbg !40 + %186 = or i1 %180, %185, !dbg !41 + %187 = and i1 %183, %182, !dbg !48 + %188 = or i1 %181, %187, !dbg !49 + %189 = icmp slt i32 %175, %179, !dbg !42 + %190 = and i1 %189, %188, !dbg !43 + %191 = or i1 %186, %190, !dbg !44 + %192 = select i1 %191, float %174, float %178, !dbg !45 + %193 = select i1 %191, i32 %175, i32 %179, !dbg !46 + %194 = bitcast float %192 to i32, !dbg !50 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !50 + %196 = bitcast i32 %195 to float, !dbg !50 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 4, i32 31), !dbg !50 + %198 = fcmp ogt float %192, %196, !dbg !35 + %199 = fcmp oeq float %192, %196, !dbg !47 + %200 = fcmp uno float %192, 0.000000e+00, !dbg !37 + %201 = fcmp uno float %196, 0.000000e+00, !dbg !38 + %202 = xor i1 %201, true, !dbg !39 + %203 = and i1 %200, %202, !dbg !40 + %204 = or i1 %198, %203, !dbg !41 + %205 = and i1 %201, %200, !dbg !48 + %206 = or i1 %199, %205, !dbg !49 + %207 = icmp slt i32 %193, %197, !dbg !42 + %208 = and i1 %207, %206, !dbg !43 + %209 = or i1 %204, %208, !dbg !44 + %210 = select i1 %209, float %192, float %196, !dbg !45 + %211 = select i1 %209, i32 %193, i32 %197, !dbg !46 + %212 = bitcast float %210 to i32, !dbg !50 + %213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 2, i32 31), !dbg !50 + %214 = bitcast i32 %213 to float, !dbg !50 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %211, i32 2, i32 31), !dbg !50 + %216 = fcmp ogt float %210, %214, !dbg !35 + %217 = fcmp oeq float %210, %214, !dbg !47 + %218 = fcmp uno float %210, 0.000000e+00, !dbg !37 + %219 = fcmp uno float %214, 0.000000e+00, !dbg !38 + %220 = xor i1 %219, true, !dbg !39 + %221 = and i1 %218, %220, !dbg !40 + %222 = or i1 %216, %221, !dbg !41 + %223 = and i1 %219, %218, !dbg !48 + %224 = or i1 %217, %223, !dbg !49 + %225 = icmp slt i32 %211, %215, !dbg !42 + %226 = and i1 %225, %224, !dbg !43 + %227 = or i1 %222, %226, !dbg !44 + %228 = select i1 %227, float %210, float %214, !dbg !45 + %229 = select i1 %227, i32 %211, i32 %215, !dbg !46 + %230 = bitcast float %228 to i32, !dbg !50 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !50 + %232 = bitcast i32 %231 to float, !dbg !50 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !50 + %234 = fcmp ogt float %228, %232, !dbg !35 + %235 = fcmp oeq float %228, %232, !dbg !47 + %236 = fcmp uno float %228, 0.000000e+00, !dbg !37 + %237 = fcmp uno float %232, 0.000000e+00, !dbg !38 + %238 = xor i1 %237, true, !dbg !39 + %239 = and i1 %236, %238, !dbg !40 + %240 = or i1 %234, %239, !dbg !41 + %241 = and i1 %237, %236, !dbg !48 + %242 = or i1 %235, %241, !dbg !49 + %243 = icmp slt i32 %229, %233, !dbg !42 + %244 = and i1 %243, %242, !dbg !43 + %245 = or i1 %240, %244, !dbg !44 + %246 = select i1 %245, i32 %229, i32 %233, !dbg !46 + %247 = and i32 %105, 1, !dbg !50 + %248 = icmp eq i32 %104, 0, !dbg !50 + %249 = lshr exact i32 %10, 5, !dbg !50 + %250 = or disjoint i32 %249, %247, !dbg !50 + %251 = getelementptr float, ptr addrspace(3) @global_smem, i32 %250, !dbg !50 + %252 = select i1 %245, i32 %230, i32 %231, !dbg !45 + %253 = insertelement <1 x i32> poison, i32 %252, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %253, i1 %248) #4, !dbg !50 + %254 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %250, !dbg !50 + %255 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %254, <1 x i32> %255, i1 %248) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %256 = icmp samesign ult i32 %9, 16, !dbg !50 + %257 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !50 + %258 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %257, i1 %256) #4, !dbg !50 + %259 = bitcast i32 %258 to float, !dbg !50 + %260 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %9, !dbg !50 + %261 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %260, i1 %256) #4, !dbg !50 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 1, i32 31), !dbg !50 + %263 = bitcast i32 %262 to float, !dbg !50 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 1, i32 31), !dbg !50 + %265 = fcmp ogt float %259, %263, !dbg !35 + %266 = fcmp oeq float %259, %263, !dbg !47 + %267 = fcmp uno float %259, 0.000000e+00, !dbg !37 + %268 = fcmp uno float %263, 0.000000e+00, !dbg !38 + %269 = xor i1 %268, true, !dbg !39 + %270 = and i1 %267, %269, !dbg !40 + %271 = or i1 %265, %270, !dbg !41 + %272 = and i1 %267, %268, !dbg !48 + %273 = or i1 %266, %272, !dbg !49 + %274 = icmp slt i32 %261, %264, !dbg !42 + %275 = and i1 %274, %273, !dbg !43 + %276 = or i1 %271, %275, !dbg !44 + %277 = select i1 %276, i32 %261, i32 %264, !dbg !46 + %278 = and i32 %9, 1009, !dbg !50 + %279 = icmp eq i32 %278, 0, !dbg !50 + %280 = select i1 %276, i32 %258, i32 %262, !dbg !45 + %281 = insertelement <1 x i32> poison, i32 %280, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %257, <1 x i32> %281, i1 %279) #4, !dbg !50 + %282 = insertelement <1 x i32> poison, i32 %277, i64 0, !dbg !50 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %260, <1 x i32> %282, i1 %279) #4, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %283 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %249, !dbg !50 + %284 = load i32, ptr addrspace(3) %283, align 8, !dbg !50 + %285 = sext i32 %103 to i64, !dbg !51 + %286 = getelementptr i64, ptr addrspace(1) %1, i64 %285, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %287 = lshr exact i32 %10, 4, !dbg !52 + %288 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %287, !dbg !52 + %289 = insertelement <1 x i32> poison, i32 %284, i64 0, !dbg !52 + store <1 x i32> %289, ptr addrspace(3) %288, align 4, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %290 = shl nuw nsw i32 %102, 2, !dbg !52 + %291 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %290, !dbg !52 + %292 = load i32, ptr addrspace(3) %291, align 4, !dbg !52 + %293 = sext i32 %292 to i64, !dbg !52 + %294 = and i32 %9, 504, !dbg !52 + %295 = icmp eq i32 %294, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %293, ptr addrspace(1) %286, i1 %295) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 33, column: 40, scope: !4) +!14 = !DILocation(line: 34, column: 31, scope: !4) +!15 = !DILocation(line: 39, column: 52, scope: !4) +!16 = !DILocation(line: 39, column: 34, scope: !4) +!17 = !DILocation(line: 39, column: 66, scope: !4) +!18 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 38, scope: !4) +!22 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !36) +!47 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !36) +!48 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !36) +!49 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !36) +!50 = !DILocation(line: 165, column: 42, scope: !19, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d1e7682cc2e8b6bd7dd8d011bea2e992223c8aab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ptx @@ -0,0 +1,765 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 512 +{ + .reg .pred %p<166>; + .reg .b32 %r<119>; + .reg .b64 %rd<31>; + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:28 + mov.u32 %r20, %ctaid.x; + .loc 1 23 33 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:33 + shl.b32 %r1, %r20, 3; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 448; + bfe.u32 %r21, %r2, 6, 3; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r22, %r21, %r1; + .loc 1 26 37 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:26:37 + shl.b32 %r23, %r2, 2; + and.b32 %r24, %r23, 252; + .loc 1 29 19 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:29:19 + bfe.s32 %r25, %r20, 28, 1; + shr.u32 %r26, %r25, 21; + add.s32 %r27, %r22, %r26; + shr.s32 %r28, %r27, 11; + mul.lo.s32 %r29, %r28, 224000; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + cvt.u64.u32 %rd1, %r24; + mad.lo.s32 %r30, %r20, 256000, %r29; + mad.lo.s32 %r31, %r21, 32000, %r30; + or.b32 %r32, %r31, %r24; + cvt.u64.u32 %rd2, %r32; + mov.b32 %r115, 0fFF800000; + mov.b64 %rd30, {%r115, %r115}; + mov.b32 %r113, 2147483647; + mov.b64 %rd29, -256; + mov.b32 %r114, %r113; + mov.b32 %r116, %r115; + mov.b32 %r117, %r113; + mov.b32 %r118, %r113; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 34 31 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:34:31 + add.s64 %rd14, %rd1, %rd29; + add.s64 %rd15, %rd14, 256; + add.s64 %rd16, %rd14, 258; + .loc 1 39 52 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:52 + add.s64 %rd17, %rd14, 259; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + add.s64 %rd18, %rd2, %rd29; + cvt.u32.u64 %r41, %rd18; + add.s32 %r42, %r41, 256; + mad.wide.s32 %rd12, %r42, 4, %rd7; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r37, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r33, %r37; + mov.u32 %r34, %r37; + mov.u32 %r35, %r37; + mov.u32 %r36, %r37; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd12 + 0 ], %rd11; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p2, %r115, %r35; + setp.gt.f32 %p3, %r116, %r36; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p4, %r115, %r35; + setp.eq.f32 %p5, %r116, %r36; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r43, %r44}, %rd30; + setp.nan.f32 %p6, %r44, %r44; + setp.nan.f32 %p7, %r43, %r43; + setp.nan.f32 %p8, %r115, %r115; + setp.nan.f32 %p9, %r116, %r116; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p10, %r35, %r35; + setp.num.f32 %p11, %r35, %r35; + setp.nan.f32 %p12, %r36, %r36; + setp.num.f32 %p13, %r36, %r36; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd19, %r118; + cvt.s64.s32 %rd20, %r117; + cvt.s64.s32 %rd21, %r113; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r114; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; +$L__tmp2: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd23, %r33; + cvt.u64.u32 %rd24, %r34; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r45, %r46}, %rd26; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p26, %r43, %r45; + setp.gt.f32 %p27, %r44, %r46; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p28, %r44, %r46; + setp.eq.f32 %p29, %r43, %r45; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p30, %r45, %r45; + setp.nan.f32 %p31, %r46, %r46; + setp.num.f32 %p32, %r46, %r46; + setp.num.f32 %p33, %r45, %r45; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p34, %p7, %p33; + and.pred %p35, %p6, %p32; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p36, %p27, %p35; + or.pred %p37, %p26, %p34; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p38, %p6, %p31; + and.pred %p39, %p7, %p30; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p40, %p29, %p39; + or.pred %p41, %p28, %p38; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p42, %rd15, %rd20; + setp.ge.s64 %p43, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p44, %p43, %p41; + and.pred %p45, %p42, %p40; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p46, %p37, %p45; + or.pred %p47, %p36, %p44; + or.pred %p48, %p16, %p24; + or.pred %p49, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r47, %r44, %r46, %p47; + selp.f32 %r48, %r43, %r45, %p46; + mov.b64 %rd30, {%r48, %r47}; + selp.f32 %r115, %r115, %r35, %p48; + selp.f32 %r116, %r116, %r36, %p49; + cvt.u32.u64 %r49, %rd15; + cvt.u32.u64 %r50, %rd14; + add.s32 %r51, %r50, 257; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r118, %r118, %r51, %p47; + selp.b32 %r117, %r117, %r49, %p46; + cvt.u32.u64 %r52, %rd16; + selp.b32 %r113, %r113, %r52, %p48; + cvt.u32.u64 %r53, %rd17; + selp.b32 %r114, %r114, %r53, %p49; +$L__tmp4: + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + add.s64 %rd29, %rd29, 256; + setp.lt.u64 %p50, %rd29, 31744; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r66, %r2, 7; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r67, %r1, %r66; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r68, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r69, %r70}, %rd30; + setp.gt.f32 %p58, %r69, %r70; + setp.eq.f32 %p59, %r70, %r69; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p60, %r69, %r69; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p61, %r70, %r70; + setp.nan.f32 %p62, %r70, %r70; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p63, %p60, %p62; + and.pred %p64, %p60, %p61; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p65, %p58, %p64; + or.pred %p66, %p59, %p63; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p67, %r117, %r118; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p68, %p67, %p66; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p69, %p65, %p68; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r71, %r69, %r70, %p69; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r72, %r117, %r118, %p69; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p70, %r71, %r115; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p71, %r71, %r115; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p72, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p73, %r115, %r115; + setp.num.f32 %p74, %r115, %r115; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p75, %p72, %p74; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p76, %p70, %p75; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p77, %p73, %p72; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p78, %p71, %p77; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p79, %r72, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p81, %p76, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r73, %r71, %r115, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r74, %r72, %r113, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p82, %r73, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p83, %r73, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p84, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p85, %r116, %r116; + setp.num.f32 %p86, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p91, %r74, %r114; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r75, %r73, %r116, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r76, %r74, %r114, %p93; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r77, %r75, 16, 31, -1; + shfl.sync.bfly.b32 %r78, %r76, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p94, %r75, %r77; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p95, %r75, %r77; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p96, %r75, %r75; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p97, %r77, %r77; + setp.num.f32 %p98, %r77, %r77; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p101, %p96, %p97; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p103, %r76, %r78; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r79, %r75, %r77, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r80, %r76, %r78, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r81, %r79, 8, 31, -1; + shfl.sync.bfly.b32 %r82, %r80, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p106, %r79, %r81; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p107, %r79, %r81; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p108, %r79, %r79; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p109, %r81, %r81; + setp.num.f32 %p110, %r81, %r81; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p113, %p109, %p108; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p115, %r80, %r82; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r83, %r79, %r81, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r84, %r80, %r82, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r85, %r83, 4, 31, -1; + shfl.sync.bfly.b32 %r86, %r84, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p118, %r83, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p119, %r83, %r85; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p120, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p121, %r85, %r85; + setp.num.f32 %p122, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p127, %r84, %r86; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r87, %r83, %r85, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r88, %r84, %r86, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r89, %r87, 2, 31, -1; + shfl.sync.bfly.b32 %r90, %r88, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p130, %r87, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p131, %r87, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p132, %r87, %r87; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p133, %r89, %r89; + setp.num.f32 %p134, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p139, %r88, %r90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r91, %r87, %r89, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r92, %r88, %r90, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r93, %r91, 1, 31, -1; + shfl.sync.bfly.b32 %r94, %r92, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p142, %r91, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p143, %r91, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p144, %r91, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p145, %r93, %r93; + setp.num.f32 %p146, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p151, %r92, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r57, %r92, %r94, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + bfe.u32 %r95, %r2, 5, 1; + setp.eq.b32 %p51, %r68, 0; + shr.u32 %r96, %r3, 5; + or.b32 %r97, %r96, %r95; + shl.b32 %r98, %r97, 2; + mov.b32 %r99, global_smem; + add.s32 %r54, %r99, %r98; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r55, %r91, %r93, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r54 + 0 ], %r55; + // end inline asm + add.s32 %r100, %r99, 64; + add.s32 %r56, %r100, %r98; + // begin inline asm + @%p51 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r59, %r99, %r23; + // begin inline asm + @%p53 ld.shared.b32 %r58, [ %r59 + 0 ]; + // end inline asm + add.s32 %r61, %r100, %r23; + // begin inline asm + @%p53 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r102, %r58, 1, 31, -1; + shfl.sync.bfly.b32 %r103, %r60, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p154, %r58, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p155, %r58, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p156, %r58, %r58; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p157, %r102, %r102; + setp.num.f32 %p158, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p161, %p156, %p157; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p163, %r60, %r103; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r65, %r60, %r103, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.b32 %r104, %r2, 1009; + setp.eq.b32 %p55, %r104, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r63, %r58, %r102, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r59 + 0 ], %r63; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + bar.sync 0; + shr.u32 %r105, %r3, 3; + add.s32 %r106, %r100, %r105; + ld.shared.b32 %r107, [%r106]; +$L__tmp6: + .loc 1 48 25 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:25 + mad.wide.s32 %rd28, %r67, 8, %rd8; + .loc 1 48 36 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:36 + bar.sync 0; + shr.u32 %r108, %r3, 4; + add.s32 %r109, %r99, %r108; + st.shared.b32 [%r109], %r107; + bar.sync 0; + shl.b32 %r110, %r66, 2; + add.s32 %r111, %r99, %r110; + ld.shared.s32 %rd27, [%r111]; + and.b32 %r112, %r2, 504; + setp.eq.b32 %p57, %r112, 0; + // begin inline asm + @%p57 st.global.b64 [ %rd28 + 0 ], { %rd27 }; + // end inline asm + .loc 1 48 4 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 53 +.b8 114 +.b8 111 +.b8 97 +.b8 116 +.b8 115 +.b8 101 +.b8 122 +.b8 111 +.b8 100 +.b8 113 +.b8 102 +.b8 53 +.b8 113 +.b8 112 +.b8 100 +.b8 51 +.b8 50 +.b8 104 +.b8 107 +.b8 115 +.b8 112 +.b8 116 +.b8 52 +.b8 102 +.b8 99 +.b8 102 +.b8 105 +.b8 113 +.b8 111 +.b8 55 +.b8 106 +.b8 111 +.b8 98 +.b8 111 +.b8 109 +.b8 55 +.b8 112 +.b8 122 +.b8 104 +.b8 115 +.b8 102 +.b8 116 +.b8 117 +.b8 107 +.b8 50 +.b8 55 +.b8 103 +.b8 106 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..53a483fa4807f3a33fa4e2f6ee2419577ade9a71 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.source @@ -0,0 +1,315 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 8 : i32 loc(#loc77) + %xoffset_3 = arith.constant 8 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<8x256xi1> loc(#loc81) + %r0_base = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<256xi32> -> tensor<1x256xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<8x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<8x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<8x256xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<8x256xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c256_i32 = arith.constant 256 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c256_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<8x256xf32>, tensor<8x256xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x256xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x256xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x256xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x256xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<8x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x256xi32> -> tensor<8x256xi32> loc(#loc92) + %tmp0_27 = tt.broadcast %tmp0_25 : tensor<8x1xi32> -> tensor<8x256xi32> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x256xi32> loc(#loc92) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_31 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc93) + %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<8x1xi32> loc(#loc93) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<8x1xi32> -> tensor<8x256xi32> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<8x256xi32> loc(#loc94) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x256x!tt.ptr> loc(#loc95) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<8x256x!tt.ptr>, tensor<8x256xi32> loc(#loc95) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x256xi1> -> tensor<8x256xi1> loc(#loc96) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<8x256xf32> loc(#loc96) + %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<8x256x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_256S_i32S8_256S_fp32S8_256S_i32S1_256S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<8x256xf32>, tensor<8x256xi32>, tensor<8x256xf32>, tensor<1x256xi32>) -> (tensor<8x256xf32>, tensor<8x256xi32>) loc(#loc24) + %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x256xi1> -> tensor<8x256xi1> loc(#loc97) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<8x256xi1>, tensor<8x256xf32> loc(#loc97) + %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x256xi1> -> tensor<8x256xi1> loc(#loc98) + %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<8x256xi1>, tensor<8x256xi32> loc(#loc98) + scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<8x256xf32>, tensor<8x256xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_256S_i32S8_256S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<8x256xf32>, tensor<8x256xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<8x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S8_256S_i32S8_256S_fp32S8_256S_i32S1_256S__(%a_value: tensor<8x256xf32> loc("a_value"(#loc33)), %a_index: tensor<8x256xi32> loc("a_index"(#loc33)), %b_value: tensor<8x256xf32> loc("b_value"(#loc33)), %b_index: tensor<1x256xi32> loc("b_index"(#loc33))) -> (tensor<8x256xf32>, tensor<8x256xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<8x256xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<8x256xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_256S__(%a_value) : (tensor<8x256xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<8x256xi1>, tensor<8x256xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<8x256xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<8x256xf32> loc(#loc107) + %mask_4 = arith.constant true loc(#loc108) + %mask_5 = arith.constant dense : tensor<8x256xi1> loc(#loc108) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<8x256xi1> loc(#loc108) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<8x256xi1> loc(#loc109) + %mask_8 = arith.ori %mask, %mask_7 : tensor<8x256xi1> loc(#loc123) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<8x256xi1> loc(#loc111) + %equal_10 = arith.ori %equal, %equal_9 : tensor<8x256xi1> loc(#loc124) + scf.yield %mask_8, %equal_10 : tensor<8x256xi1>, tensor<8x256xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<8x256xi1>, tensor<8x256xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = tt.broadcast %b_index : tensor<1x256xi32> -> tensor<8x256xi32> loc(#loc113) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<8x256xi32> loc(#loc113) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<8x256xi1> loc(#loc114) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<8x256xi1> loc(#loc115) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<8x256xi1>, tensor<8x256xf32> loc(#loc49) + %3 = tt.broadcast %b_index : tensor<1x256xi32> -> tensor<8x256xi32> loc(#loc50) + %4 = arith.select %mask_3, %a_index, %3 : tensor<8x256xi1>, tensor<8x256xi32> loc(#loc50) + tt.return %2, %4 : tensor<8x256xf32>, tensor<8x256xi32> loc(#loc51) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x256xf32> loc(#loc52) + %6 = ub.poison : tensor<8x256xi32> loc(#loc52) + tt.return %5, %6 : tensor<8x256xf32>, tensor<8x256xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S8_256S__(%x: tensor<8x256xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_256S__(%x) : (tensor<8x256xf32>) -> tensor<8x256xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S8_256S__(%x: tensor<8x256xf32> loc("x"(#loc57))) -> tensor<8x256xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<8x256xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<8x256xf32> loc(#loc59) + tt.return %4 : tensor<8x256xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x256xf32> loc(#loc61) + tt.return %5 : tensor<8x256xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S8_256S_i32S8_256S__(2,)cconstexpr_1_"(%value: tensor<8x256xf32> loc("value"(#loc66)), %index: tensor<8x256xi32> loc("index"(#loc66))) -> (tensor<8xf32>, tensor<8xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<8x256xf32>, tensor<8x256xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<8xf32>, tensor<8xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc69) + %2 = ub.poison : tensor<8xi32> loc(#loc69) + tt.return %1, %2 : tensor<8xf32>, tensor<8xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8546a3980cae6196154fb9102698dc2bb45be1ea --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,203 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 2], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc37)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<65760000> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x256xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<8x256xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_4 = arith.constant dense : tensor<8x256xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2147483647> : tensor<8x256xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0xFF800000> : tensor<8x256xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_7 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48) + %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc48) + %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc49) + %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked1> loc(#loc49) + %r0_base = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50) + %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<256xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x256xi32, #blocked> loc(#loc50) + %x0 = arith.remsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc51) + %x1 = arith.divsi %xindex_13, %cst_2 : tensor<8x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x0, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc53) + %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x256xi32, #blocked> loc(#loc54) + %tmp0_17 = arith.muli %x1, %cst : tensor<8x1xi32, #blocked> loc(#loc55) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32, #blocked> -> tensor<8x256xi32, #blocked> loc(#loc56) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x256x!tt.ptr, #blocked> loc(#loc57) + %_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c256_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<8x256xf32, #blocked>, tensor<8x256xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x256xi32, #blocked> loc(#loc59) + %r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x256xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x256xi32, #blocked> loc(#loc60) + %tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x256xi32, #blocked> -> tensor<8x256xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<8x256xi32, #blocked> loc(#loc54) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<8x256xi32, #blocked> loc(#loc56) + %tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<8x256x!tt.ptr, #blocked>, tensor<8x256xi32, #blocked> loc(#loc57) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x256xi1, #blocked> -> tensor<8x256xi1, #blocked> loc(#loc61) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<8x256x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<8x256xf32, #blocked> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<8x256xf32, #blocked> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<8x256xf32, #blocked> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<8x256xf32, #blocked> loc(#loc85) + %mask_29 = arith.xori %b_isnan, %cst_4 : tensor<8x256xi1, #blocked> loc(#loc86) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<8x256xi1, #blocked> loc(#loc87) + %mask_31 = arith.ori %mask, %mask_30 : tensor<8x256xi1, #blocked> loc(#loc106) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<8x256xi1, #blocked> loc(#loc89) + %equal_33 = arith.ori %equal, %equal_32 : tensor<8x256xi1, #blocked> loc(#loc107) + %mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<8x256xi32, #blocked> loc(#loc91) + %mask_35 = arith.andi %equal_33, %mask_34 : tensor<8x256xi1, #blocked> loc(#loc92) + %mask_36 = arith.ori %mask_31, %mask_35 : tensor<8x256xi1, #blocked> loc(#loc93) + %5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<8x256xi1, #blocked>, tensor<8x256xf32, #blocked> loc(#loc74) + %6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<8x256xi1, #blocked>, tensor<8x256xi32, #blocked> loc(#loc75) + %_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<8x256xi1, #blocked>, tensor<8x256xf32, #blocked> loc(#loc76) + %_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<8x256xi1, #blocked>, tensor<8x256xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<8x256xf32, #blocked>, tensor<8x256xi32, #blocked> loc(#loc35) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97) + %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110) + %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111) + %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100) + %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101) + %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102) + %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %5, %6 : f32, i32 loc(#loc78) + }) : (tensor<8x256xf32, #blocked>, tensor<8x256xi32, #blocked>) -> (tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc80) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc39) + %2 = tt.addptr %1, %xindex_14 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc39) + %3 = ttg.convert_layout %tmp2 : tensor<8x1xi32, #blocked> -> tensor<8x1xi32, #blocked1> loc(#loc40) + %4 = arith.extsi %3 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc40) + tt.store %2, %4 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("r0_base"(#loc6)) +#loc51 = loc("x0"(#loc7)) +#loc52 = loc("x1"(#loc8)) +#loc53 = loc("tmp0"(#loc9)) +#loc54 = loc("tmp0"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("_tmp2"(#loc14)) +#loc59 = loc("r0_index"(#loc15)) +#loc60 = loc("r0_mask"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("mask"(#loc18)) +#loc63 = loc("equal"(#loc20)) +#loc64 = loc("a_isnan"(#loc21)) +#loc65 = loc("b_isnan"(#loc22)) +#loc66 = loc("mask"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("equal"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("mask"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc(callsite(#loc31 at #loc19)) +#loc75 = loc(callsite(#loc32 at #loc19)) +#loc76 = loc("_tmp2"(#loc33)) +#loc77 = loc("_tmp2_index"(#loc34)) +#loc78 = loc(callsite(#loc36 at #loc37)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc58)) +#loc82 = loc("mask"(#loc62)) +#loc83 = loc("equal"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc19)) +#loc85 = loc(callsite(#loc65 at #loc19)) +#loc86 = loc(callsite(#loc66 at #loc19)) +#loc87 = loc(callsite(#loc67 at #loc19)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc(callsite(#loc69 at #loc19)) +#loc90 = loc("equal"(#loc70)) +#loc91 = loc(callsite(#loc71 at #loc19)) +#loc92 = loc(callsite(#loc72 at #loc19)) +#loc93 = loc(callsite(#loc73 at #loc19)) +#loc94 = loc(callsite(#loc64 at #loc78)) +#loc95 = loc(callsite(#loc65 at #loc78)) +#loc96 = loc(callsite(#loc66 at #loc78)) +#loc97 = loc(callsite(#loc67 at #loc78)) +#loc98 = loc(callsite(#loc69 at #loc78)) +#loc99 = loc(callsite(#loc71 at #loc78)) +#loc100 = loc(callsite(#loc72 at #loc78)) +#loc101 = loc(callsite(#loc73 at #loc78)) +#loc102 = loc(callsite(#loc31 at #loc78)) +#loc103 = loc(callsite(#loc32 at #loc78)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc78)) +#loc109 = loc(callsite(#loc83 at #loc78)) +#loc110 = loc(callsite(#loc88 at #loc78)) +#loc111 = loc(callsite(#loc90 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7d99137a2939ee2a9279a9930c22f010512484fd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/N23CRFMF2EPWWM5UIYFJANOWSINX3NOKKFVUUCSYHULQRDJNQE5A/triton_red_fused_argmax_1.ttir @@ -0,0 +1,204 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc46 = loc("in_ptr0"(#loc)) +#loc47 = loc("out_ptr0"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<8x256xi1> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x256xf32> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<8x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<32000> : tensor<1x256xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<8x256xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<8x256xf32> loc(#loc52) + %cst_4 = arith.constant dense<2048> : tensor<8x1xi32> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_5 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc54) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc55) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc56) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<8x1xi32> loc(#loc57) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<8x1xi32> loc(#loc57) + %r0_base = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc58) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<256xi32> -> tensor<1x256xi32> loc(#loc59) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc60) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<8x1xi32> loc(#loc61) + %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c256_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<8x256xf32>, tensor<8x256xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x256xi32> loc(#loc63) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x256xi32> loc(#loc63) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x256xi32> loc(#loc64) + %tmp0 = arith.muli %x0, %cst_2 : tensor<8x1xi32> loc(#loc65) + %tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x256xi32> -> tensor<8x256xi32> loc(#loc66) + %tmp0_15 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x256xi32> loc(#loc66) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x256xi32> loc(#loc66) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<8x1xi32> loc(#loc67) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<8x1xi32> -> tensor<8x256xi32> loc(#loc68) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<8x256xi32> loc(#loc68) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x256x!tt.ptr> loc(#loc69) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<8x256x!tt.ptr>, tensor<8x256xi32> loc(#loc69) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x256xi1> -> tensor<8x256xi1> loc(#loc70) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<8x256x!tt.ptr> loc(#loc70) + %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<8x256xf32> loc(#loc112) + %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<8x256xf32> loc(#loc113) + %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<8x256xf32> loc(#loc92) + %b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<8x256xf32> loc(#loc93) + %mask_24 = arith.xori %b_isnan, %cst : tensor<8x256xi1> loc(#loc94) + %mask_25 = arith.andi %a_isnan, %mask_24 : tensor<8x256xi1> loc(#loc95) + %mask_26 = arith.ori %mask, %mask_25 : tensor<8x256xi1> loc(#loc114) + %equal_27 = arith.andi %a_isnan, %b_isnan : tensor<8x256xi1> loc(#loc97) + %equal_28 = arith.ori %equal, %equal_27 : tensor<8x256xi1> loc(#loc115) + %mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<8x256xi32> loc(#loc99) + %mask_30 = arith.andi %equal_28, %mask_29 : tensor<8x256xi1> loc(#loc100) + %mask_31 = arith.ori %mask_26, %mask_30 : tensor<8x256xi1> loc(#loc101) + %4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<8x256xi1>, tensor<8x256xf32> loc(#loc83) + %5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<8x256xi1>, tensor<8x256xi32> loc(#loc84) + %_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<8x256xi1>, tensor<8x256xf32> loc(#loc85) + %_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<8x256xi1>, tensor<8x256xi32> loc(#loc86) + scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<8x256xf32>, tensor<8x256xi32> loc(#loc40) + } loc(#loc89) + %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119) + %mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109) + %4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110) + %5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111) + tt.reduce.return %4, %5 : f32, i32 loc(#loc87) + }) : (tensor<8x256xf32>, tensor<8x256xi32>) -> (tensor<8xf32>, tensor<8xi32>) loc(#loc87) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc88) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc43) + %2 = tt.addptr %1, %xindex_8 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc43) + %3 = arith.extsi %tmp2 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc44) + tt.store %2, %3 : tensor<8x1x!tt.ptr> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xoffset"(#loc7)) +#loc55 = loc("xindex"(#loc8)) +#loc56 = loc("xindex"(#loc9)) +#loc57 = loc("xindex"(#loc10)) +#loc58 = loc("r0_base"(#loc11)) +#loc59 = loc("r0_base"(#loc12)) +#loc60 = loc("x0"(#loc13)) +#loc61 = loc("x1"(#loc14)) +#loc62 = loc("_tmp2"(#loc3)) +#loc63 = loc("r0_index"(#loc15)) +#loc64 = loc("r0_mask"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("tmp0"(#loc21)) +#loc70 = loc("tmp0"(#loc22)) +#loc71 = loc("mask"(#loc23)) +#loc72 = loc("equal"(#loc25)) +#loc73 = loc("a_isnan"(#loc26)) +#loc74 = loc("b_isnan"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("mask"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("equal"(#loc31)) +#loc79 = loc("equal"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc("mask"(#loc34)) +#loc82 = loc("mask"(#loc35)) +#loc83 = loc(callsite(#loc36 at #loc24)) +#loc84 = loc(callsite(#loc37 at #loc24)) +#loc85 = loc("_tmp2"(#loc38)) +#loc86 = loc("_tmp2_index"(#loc39)) +#loc87 = loc(callsite(#loc41 at #loc2)) +#loc88 = loc("tmp2"(#loc42)) +#loc89 = loc("_tmp2_index"(#loc62)) +#loc90 = loc("mask"(#loc71)) +#loc91 = loc("equal"(#loc72)) +#loc92 = loc(callsite(#loc73 at #loc24)) +#loc93 = loc(callsite(#loc74 at #loc24)) +#loc94 = loc(callsite(#loc75 at #loc24)) +#loc95 = loc(callsite(#loc76 at #loc24)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc(callsite(#loc78 at #loc24)) +#loc98 = loc("equal"(#loc79)) +#loc99 = loc(callsite(#loc80 at #loc24)) +#loc100 = loc(callsite(#loc81 at #loc24)) +#loc101 = loc(callsite(#loc82 at #loc24)) +#loc102 = loc(callsite(#loc73 at #loc87)) +#loc103 = loc(callsite(#loc74 at #loc87)) +#loc104 = loc(callsite(#loc75 at #loc87)) +#loc105 = loc(callsite(#loc76 at #loc87)) +#loc106 = loc(callsite(#loc78 at #loc87)) +#loc107 = loc(callsite(#loc80 at #loc87)) +#loc108 = loc(callsite(#loc81 at #loc87)) +#loc109 = loc(callsite(#loc82 at #loc87)) +#loc110 = loc(callsite(#loc36 at #loc87)) +#loc111 = loc(callsite(#loc37 at #loc87)) +#loc112 = loc(callsite(#loc90 at #loc24)) +#loc113 = loc(callsite(#loc91 at #loc24)) +#loc114 = loc(callsite(#loc96 at #loc24)) +#loc115 = loc(callsite(#loc98 at #loc24)) +#loc116 = loc(callsite(#loc90 at #loc87)) +#loc117 = loc(callsite(#loc91 at #loc87)) +#loc118 = loc(callsite(#loc96 at #loc87)) +#loc119 = loc(callsite(#loc98 at #loc87)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3d51f3192d146856da2429f60833161d73cbcc33 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7501c35e7e85b8d212a64c92f4491c084a1401e1 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..519345855530a41d2e3c8c6dd0fe273acf2daf66 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "68c8e7f3cd56162d609f5944278cf25fbc666ee83e5effc55fab58cfccaadf6a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..9f529704f0f2ba8627fca399755fe5e13917febe --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.llir @@ -0,0 +1,140 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 252, !dbg !9 + %12 = lshr exact i32 %11, 2, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = and i32 %10, 3, !dbg !11 + %15 = sdiv i32 %13, 2048, !dbg !12 + %16 = mul i32 %15, 2048, !dbg !13 + %.decomposed = sub i32 %13, %16, !dbg !13 + %17 = srem i32 %15, 32, !dbg !14 + %18 = sdiv i32 %13, 65536, !dbg !15 + %19 = shl nsw i32 %17, 7, !dbg !16 + %20 = shl nsw i32 %.decomposed, 12, !dbg !17 + %21 = shl i32 %18, 23, !dbg !18 + %22 = shl i32 %13, 7, !dbg !19 + %23 = add i32 %21, %20 + %24 = add i32 %23, %19 + %25 = zext nneg i32 %14 to i64, !dbg !20 + %26 = sext i32 %22 to i64, !dbg !20 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !20 + br label %27, !dbg !20 + +27: ; preds = %7, %27 + %indvars.iv = phi i64 [ 0, %7 ], [ %indvars.iv.next, %27 ] + %28 = phi float [ 0.000000e+00, %7 ], [ %43, %27 ] + %29 = or disjoint i64 %indvars.iv, %25, !dbg !21 + %30 = trunc nuw nsw i64 %29 to i32, !dbg !22 + %31 = add i32 %24, %30, !dbg !22 + %32 = sext i32 %31 to i64, !dbg !23 + %33 = getelementptr bfloat, ptr addrspace(1) %0, i64 %32, !dbg !23 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !24 + %35 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %33, i64 %34, i1 true) #4, !dbg !24 + %36 = bitcast i16 %35 to bfloat, !dbg !24 + %37 = fpext bfloat %36 to float, !dbg !25 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %29, !dbg !26 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %39 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %gep, i64 %38, i1 true) #4, !dbg !27 + %40 = bitcast i16 %39 to bfloat, !dbg !27 + %41 = fpext bfloat %40 to float, !dbg !28 + %42 = fmul float %37, %41, !dbg !29 + %43 = fadd float %28, %42, !dbg !30 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4, !dbg !20 + %44 = icmp samesign ult i64 %indvars.iv, 124, !dbg !20 + br i1 %44, label %27, label %45, !dbg !20 + +45: ; preds = %27 + %46 = and i32 %10, 63, !dbg !9 + %47 = or disjoint i32 %9, %46, !dbg !10 + %48 = bitcast float %43 to i32, !dbg !31 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !31 + %50 = bitcast i32 %49 to float, !dbg !31 + %51 = fadd float %43, %50, !dbg !35 + %52 = bitcast float %51 to i32, !dbg !31 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !31 + %54 = bitcast i32 %53 to float, !dbg !31 + %55 = fadd float %51, %54, !dbg !35 + %56 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %11, !dbg !36 + store float %55, ptr addrspace(3) %56, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %57 = shl nuw nsw i32 %46, 2, !dbg !36 + %58 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %57, !dbg !36 + %59 = load i32, ptr addrspace(3) %58, align 4, !dbg !36 + %60 = sext i32 %47 to i64, !dbg !37 + %61 = getelementptr float, ptr addrspace(1) %2, i64 %60, !dbg !37 + %62 = and i32 %10, 192, !dbg !38 + %63 = icmp eq i32 %62, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %59, ptr addrspace(1) %61, i1 %63) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 34, column: 31, scope: !4) +!22 = !DILocation(line: 39, column: 60, scope: !4) +!23 = !DILocation(line: 39, column: 34, scope: !4) +!24 = !DILocation(line: 39, column: 73, scope: !4) +!25 = !DILocation(line: 39, column: 127, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = !DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3c247ac1c65bbff48bd5ebce7bbc4bf2dc2f1dc3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,391 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 256 +{ + .reg .pred %p<5>; + .reg .b16 %rs<5>; + .reg .b32 %r<52>; + .reg .b64 %rd<25>; + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd7, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28 + mov.u32 %r7, %ctaid.x; + .loc 1 23 33 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:33 + shl.b32 %r1, %r7, 6; + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r8, %r2, 2, 6; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r9, %r8, %r1; + .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37 + and.b32 %r10, %r2, 3; + .loc 1 29 21 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:21 + bfe.s32 %r11, %r7, 25, 1; + shr.u32 %r12, %r11, 21; + add.s32 %r13, %r9, %r12; + shr.s32 %r14, %r13, 11; + .loc 1 29 29 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:29 + shr.u32 %r15, %r14, 27; + add.s32 %r16, %r14, %r15; + and.b32 %r17, %r16, 33554400; + sub.s32 %r18, %r14, %r17; + .loc 1 30 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:30:19 + shr.u32 %r19, %r11, 16; + add.s32 %r20, %r9, %r19; + .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45 + shl.b32 %r21, %r18, 7; + .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68 + shl.b32 %r22, %r20, 7; + and.b32 %r23, %r22, -8388608; + .loc 1 33 40 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:33:40 + cvt.u64.u32 %rd11, %r10; + shl.b32 %r24, %r7, 13; + shl.b32 %r25, %r8, 7; + or.b32 %r26, %r24, %r25; + cvt.s64.s32 %rd12, %r26; + or.b64 %rd13, %rd12, %rd11; + shl.b64 %rd14, %rd13, 1; + add.s64 %rd23, %rd10, %rd14; + shl.b32 %r27, %r7, 18; + add.s32 %r28, %r23, %r27; + shl.b32 %r29, %r8, 12; + or.b32 %r30, %r28, %r29; + add.s32 %r31, %r30, %r21; + or.b32 %r32, %r31, %r10; + shl.b32 %r33, %r14, 23; + sub.s32 %r34, %r32, %r33; + cvt.u64.u32 %rd2, %r34; + mov.b32 %r51, 0f00000000; + mov.b64 %rd24, -4; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + add.s64 %rd21, %rd2, %rd24; + cvt.u32.u64 %r35, %rd21; + add.s32 %r36, %r35, 4; + mad.wide.s32 %rd16, %r36, 2, %rd7; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd16 + 0 ], %rd15; + // end inline asm + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + cvt.f32.bf16 %r37, %rs1; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs3 }, [ %rd23 + 0 ], %rd18; + // end inline asm + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + cvt.f32.bf16 %r38, %rs3; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r51, %r37, %r38, %r51; + .loc 1 33 40 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:33:40 + add.s64 %rd24, %rd24, 4; + add.s64 %rd23, %rd23, 8; + setp.lt.u64 %p3, %rd24, 124; + @%p3 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + and.b32 %r40, %r2, 63; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r41, %r1, %r40; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r51, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r43, %r51, %r42; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r45, %r43, %r44; +$L__tmp2: + .loc 1 45 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:28 + mov.b32 %r46, global_smem; + add.s32 %r47, %r46, %r3; + st.shared.b32 [%r47], %r45; + bar.sync 0; + shl.b32 %r48, %r40, 2; + add.s32 %r49, %r46, %r48; + ld.shared.b32 %r39, [%r49]; + .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25 + mad.wide.s32 %rd22, %r41, 4, %rd8; + .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36 + and.b32 %r50, %r2, 192; + setp.eq.b32 %p4, %r50, 0; + // begin inline asm + @%p4 st.global.b32 [ %rd22 + 0 ], { %r39 }; + // end inline asm + .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 103 +.b8 111 +.b8 108 +.b8 53 +.b8 53 +.b8 99 +.b8 116 +.b8 104 +.b8 107 +.b8 52 +.b8 122 +.b8 101 +.b8 118 +.b8 115 +.b8 121 +.b8 51 +.b8 100 +.b8 108 +.b8 113 +.b8 105 +.b8 121 +.b8 122 +.b8 105 +.b8 112 +.b8 101 +.b8 102 +.b8 118 +.b8 55 +.b8 51 +.b8 53 +.b8 103 +.b8 101 +.b8 50 +.b8 119 +.b8 116 +.b8 97 +.b8 100 +.b8 100 +.b8 118 +.b8 107 +.b8 52 +.b8 51 +.b8 54 +.b8 113 +.b8 104 +.b8 116 +.b8 53 +.b8 110 +.b8 111 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..aa4d720b83fdef413f546267f4445c995ee76362 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc65) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c4_i32 = arith.constant 4 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x4xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x4xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x4xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x4xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x4xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x4xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x4xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x4xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..95893a815ded96b37087fed5cdd15bcbe3c4a5b1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,155 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_28 = %cst_7) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst_6 : tensor<1x4xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x4xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %_tmp4_28, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %_tmp4_28 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x4xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..bd00bf09be0d91e43dd1f52b89a247956a46ec34 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NDEOP46NKYLC2YE7LFCCPDHSL66GM3XIHZPP7RK7VNMM7TFK35VA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,152 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("in_ptr1"(#loc)) +#loc43 = loc("out_ptr1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc77 = loc("tmp4"(#loc35)) +#loc80 = loc(callsite(#loc1 at #loc77)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc2) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x4xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc46) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc47) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc49) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc50) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc51) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc52) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc52) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc53) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc54) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc55) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc56) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<64x1xi32> loc(#loc47) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc46) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_15 = %cst_4) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc58) + %r0_index_16 = arith.addi %r0_index, %r0_base_10 : tensor<1x4xi32> loc(#loc58) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_3 : tensor<1x4xi32> loc(#loc59) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<64x1xi32> loc(#loc60) + %tmp0_17 = tt.broadcast %r0_index_16 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc61) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc61) + %tmp0_19 = arith.addi %tmp0_17, %tmp0_18 : tensor<64x4xi32> loc(#loc61) + %tmp0_20 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc62) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc63) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<64x4xi32> loc(#loc63) + %tmp0_23 = arith.muli %x2_13, %cst_0 : tensor<64x1xi32> loc(#loc64) + %tmp0_24 = tt.broadcast %tmp0_23 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc65) + %tmp0_25 = arith.addi %tmp0_22, %tmp0_24 : tensor<64x4xi32> loc(#loc65) + %tmp0_26 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc66) + %tmp0_27 = tt.addptr %tmp0_26, %tmp0_25 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc66) + %tmp0_28 = tt.broadcast %r0_mask : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc67) + %tmp0_29 = tt.load %tmp0_27, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc67) + %tmp0_30 = arith.extf %tmp0_29 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc68) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc69) + %tmp1_31 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc70) + %tmp1_32 = arith.addi %tmp0_17, %tmp1_31 : tensor<64x4xi32> loc(#loc70) + %tmp1_33 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc71) + %tmp1_34 = tt.addptr %tmp1_33, %tmp1_32 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc71) + %tmp1_35 = tt.load %tmp1_34, %tmp0_28, %cst evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc72) + %tmp1_36 = arith.extf %tmp1_35 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc73) + %tmp2 = arith.mulf %tmp0_30, %tmp1_36 : tensor<64x4xf32> loc(#loc74) + %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<64x4xf32> loc(#loc75) + %_tmp4_37 = arith.select %tmp0_28, %tmp5, %_tmp4_15 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc76) + scf.yield %_tmp4_37 : tensor<64x4xf32> loc(#loc33) + } loc(#loc57) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_15: f32 loc(callsite(#loc1 at #loc77)), %tmp4_16: f32 loc(callsite(#loc1 at #loc77))): + %tmp4_17 = arith.addf %tmp4_15, %tmp4_16 : f32 loc(#loc81) + tt.reduce.return %tmp4_17 : f32 loc(#loc79) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc79) + %tmp4_14 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc78) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc38) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc38) + tt.store %1, %tmp4_14 : tensor<64x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc46 = loc("x2"(#loc3)) +#loc47 = loc("x1"(#loc4)) +#loc48 = loc("xoffset"(#loc5)) +#loc49 = loc("xoffset"(#loc6)) +#loc50 = loc("xindex"(#loc7)) +#loc51 = loc("xindex"(#loc8)) +#loc52 = loc("xindex"(#loc9)) +#loc53 = loc("r0_base"(#loc10)) +#loc54 = loc("r0_base"(#loc11)) +#loc55 = loc("x0"(#loc12)) +#loc56 = loc("x1"(#loc13)) +#loc57 = loc("_tmp4"(#loc2)) +#loc58 = loc("r0_index"(#loc14)) +#loc59 = loc("r0_mask"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("tmp0"(#loc18)) +#loc63 = loc("tmp0"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp0"(#loc21)) +#loc66 = loc("tmp0"(#loc22)) +#loc67 = loc("tmp0"(#loc23)) +#loc68 = loc("tmp0"(#loc24)) +#loc69 = loc("tmp1"(#loc25)) +#loc70 = loc("tmp1"(#loc26)) +#loc71 = loc("tmp1"(#loc27)) +#loc72 = loc("tmp1"(#loc28)) +#loc73 = loc("tmp1"(#loc29)) +#loc74 = loc("tmp2"(#loc30)) +#loc75 = loc("tmp5"(#loc31)) +#loc76 = loc("_tmp4"(#loc32)) +#loc78 = loc("tmp4"(#loc37)) +#loc79 = loc(callsite(#loc34 at #loc77)) +#loc81 = loc(callsite(#loc36 at #loc79)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..741a2bdf810bfecd31550d436b5e7ee22fe19077 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..dfd51eacab63212ad7c8fac96d3c33bd693341ab Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8820752361df0653f35f90e16a52bcaf86e0ef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "694013ba914f7e2d2157b0d92d54f9ff9e2aa9351febe106c91ad93ece04e98a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..ccaa88035e108a7e5a3c4c19d995ae27d891e6e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.llir @@ -0,0 +1,271 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py\00" +@assertMessage_1 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp25 < ks4\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp5 < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #1 !dbg !9 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %15 = shl i32 %14, 9, !dbg !11 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %17 = shl nuw nsw i32 %16, 1, !dbg !12 + %18 = and i32 %17, 510, !dbg !12 + %19 = or disjoint i32 %18, %15, !dbg !13 + %20 = or disjoint i32 %19, 1, !dbg !13 + %21 = icmp slt i32 %19, %10, !dbg !14 + %22 = icmp slt i32 %20, %10, !dbg !14 + %23 = sext i32 %19 to i64, !dbg !15 + %24 = sext i32 %20 to i64, !dbg !15 + %25 = sdiv i64 %23, %5, !dbg !15 + %26 = sdiv i64 %24, %5, !dbg !15 + %27 = srem i64 %25, %6, !dbg !16 + %28 = srem i64 %26, %6, !dbg !16 + %29 = srem i64 %23, %8, !dbg !17 + %30 = srem i64 %24, %8, !dbg !17 + %31 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %32 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %34 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %31, i64 %33, i1 %21) #4, !dbg !19 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %36 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %32, i64 %35, i1 %22) #4, !dbg !19 + %37 = getelementptr i64, ptr addrspace(1) %1, i64 %27, !dbg !20 + %38 = getelementptr i64, ptr addrspace(1) %1, i64 %28, !dbg !20 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %39, i1 %21) #4, !dbg !21 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %38, i64 %41, i1 %22) #4, !dbg !21 + %43 = icmp slt i64 %40, 0, !dbg !22 + %44 = icmp slt i64 %42, 0, !dbg !22 + %45 = select i1 %43, i64 %7, i64 0, !dbg !23 + %46 = add i64 %45, %40, !dbg !23 + %47 = select i1 %44, i64 %7, i64 0, !dbg !23 + %48 = add i64 %47, %42, !dbg !23 + %49 = icmp slt i64 %46, 0, !dbg !24 + %50 = icmp slt i64 %48, 0, !dbg !24 + %51 = icmp sge i64 %46, %7, !dbg !25 + %52 = icmp sge i64 %48, %7, !dbg !25 + %.not4 = or i1 %49, %51, !dbg !26 + %.not8 = or i1 %50, %52, !dbg !27 + %.not1 = and i1 %21, %.not4, !dbg !28 + %.not5 = and i1 %22, %.not8, !dbg !29 + %53 = or i1 %.not1, %.not5, !dbg !29 + br i1 %53, label %54, label %55, !dbg !29 + +54: ; preds = %13 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 32, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + unreachable, !dbg !29 + +55: ; preds = %13 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %56 = mul i64 %46, %8, !dbg !30 + %57 = mul i64 %48, %8, !dbg !30 + %58 = getelementptr bfloat, ptr addrspace(1) %2, i64 %29, !dbg !31 + %59 = getelementptr bfloat, ptr addrspace(1) %58, i64 %56, !dbg !31 + %60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %30, !dbg !31 + %61 = getelementptr bfloat, ptr addrspace(1) %60, i64 %57, !dbg !31 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %63 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %59, i64 %62, i1 %21) #4, !dbg !32 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %61, i64 %64, i1 %22) #4, !dbg !32 + %66 = sdiv i64 %8, 2, !dbg !33 + %67 = sub i64 %8, %66, !dbg !34 + %68 = icmp slt i64 %29, %67, !dbg !35 + %69 = icmp slt i64 %30, %67, !dbg !35 + %70 = sub nsw i64 %23, %29, !dbg !36 + %71 = sub nsw i64 %24, %30, !dbg !36 + %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %70, !dbg !37 + %73 = getelementptr bfloat, ptr addrspace(1) %72, i64 %66, !dbg !37 + %74 = getelementptr bfloat, ptr addrspace(1) %73, i64 %29, !dbg !37 + %75 = getelementptr bfloat, ptr addrspace(1) %0, i64 %71, !dbg !37 + %76 = getelementptr bfloat, ptr addrspace(1) %75, i64 %66, !dbg !37 + %77 = getelementptr bfloat, ptr addrspace(1) %76, i64 %30, !dbg !37 + %78 = and i1 %21, %68, !dbg !38 + %79 = and i1 %22, %69, !dbg !38 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %74, i64 %80, i1 %78) #4, !dbg !39 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %77, i64 %82, i1 %79) #4, !dbg !39 + %84 = icmp sge i64 %29, %67, !dbg !40 + %85 = icmp sge i64 %30, %67, !dbg !40 + %86 = sub i64 %29, %8, !dbg !41 + %87 = sub i64 %30, %8, !dbg !41 + %88 = getelementptr bfloat, ptr addrspace(1) %72, i64 %86, !dbg !42 + %89 = getelementptr bfloat, ptr addrspace(1) %88, i64 %66, !dbg !42 + %90 = getelementptr bfloat, ptr addrspace(1) %75, i64 %87, !dbg !42 + %91 = getelementptr bfloat, ptr addrspace(1) %90, i64 %66, !dbg !42 + %92 = and i1 %21, %84, !dbg !43 + %93 = and i1 %22, %85, !dbg !43 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %89, i64 %94, i1 %92) #4, !dbg !44 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %97 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %96, i1 %93) #4, !dbg !44 + %98 = select i1 %43, i64 %9, i64 0, !dbg !45 + %99 = add i64 %98, %40, !dbg !45 + %100 = select i1 %44, i64 %9, i64 0, !dbg !45 + %101 = add i64 %100, %42, !dbg !45 + %102 = icmp slt i64 %99, 0, !dbg !46 + %103 = icmp slt i64 %101, 0, !dbg !46 + %104 = icmp sge i64 %99, %9, !dbg !47 + %105 = icmp sge i64 %101, %9, !dbg !47 + %.not12 = or i1 %102, %104, !dbg !48 + %.not16 = or i1 %103, %105, !dbg !49 + %.not9 = and i1 %21, %.not12, !dbg !49 + %.not13 = and i1 %22, %.not16, !dbg !50 + %106 = or i1 %.not9, %.not13, !dbg !50 + br i1 %106, label %107, label %108, !dbg !50 + +107: ; preds = %55 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 52, ptr nonnull @assertFunc_1, i64 1), !dbg !50 + unreachable, !dbg !50 + +108: ; preds = %55 + %109 = bitcast i16 %83 to bfloat, !dbg !39 + %110 = fpext bfloat %109 to float, !dbg !51 + %111 = fsub float 0.000000e+00, %110, !dbg !52 + %112 = bitcast i16 %97 to bfloat, !dbg !44 + %113 = fpext bfloat %112 to float, !dbg !53 + %114 = select i1 %69, float %111, float %113, !dbg !54 + %115 = bitcast i16 %81 to bfloat, !dbg !39 + %116 = fpext bfloat %115 to float, !dbg !51 + %117 = fsub float 0.000000e+00, %116, !dbg !52 + %118 = bitcast i16 %95 to bfloat, !dbg !44 + %119 = fpext bfloat %118 to float, !dbg !53 + %120 = select i1 %68, float %117, float %119, !dbg !54 + %121 = bitcast i16 %36 to bfloat, !dbg !19 + %122 = fpext bfloat %121 to float, !dbg !55 + %123 = bitcast i16 %34 to bfloat, !dbg !19 + %124 = fpext bfloat %123 to float, !dbg !55 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %125 = mul i64 %99, %8, !dbg !56 + %126 = mul i64 %101, %8, !dbg !56 + %127 = getelementptr bfloat, ptr addrspace(1) %3, i64 %29, !dbg !57 + %128 = getelementptr bfloat, ptr addrspace(1) %127, i64 %125, !dbg !57 + %129 = getelementptr bfloat, ptr addrspace(1) %3, i64 %30, !dbg !57 + %130 = getelementptr bfloat, ptr addrspace(1) %129, i64 %126, !dbg !57 + %131 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !58 + %132 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %128, i64 %131, i1 %21) #4, !dbg !58 + %133 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !58 + %134 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %130, i64 %133, i1 %22) #4, !dbg !58 + %135 = insertelement <2 x i16> poison, i16 %63, i64 0, !dbg !32 + %136 = insertelement <2 x i16> %135, i16 %132, i64 1, !dbg !32 + %137 = bitcast <2 x i16> %136 to <2 x bfloat>, !dbg !32 + %138 = fpext <2 x bfloat> %137 to <2 x float>, !dbg !59 + %139 = insertelement <2 x float> poison, float %124, i64 0, !dbg !60 + %140 = insertelement <2 x float> %139, float %120, i64 1, !dbg !60 + %141 = fmul <2 x float> %140, %138, !dbg !60 + %142 = insertelement <2 x i16> poison, i16 %65, i64 0, !dbg !32 + %143 = insertelement <2 x i16> %142, i16 %134, i64 1, !dbg !32 + %144 = bitcast <2 x i16> %143 to <2 x bfloat>, !dbg !32 + %145 = fpext <2 x bfloat> %144 to <2 x float>, !dbg !59 + %146 = insertelement <2 x float> poison, float %122, i64 0, !dbg !60 + %147 = insertelement <2 x float> %146, float %114, i64 1, !dbg !60 + %148 = fmul <2 x float> %147, %145, !dbg !60 + %shift = shufflevector <2 x float> %141, <2 x float> poison, <2 x i32> , !dbg !61 + %foldExtExtBinop = fadd <2 x float> %141, %shift, !dbg !61 + %149 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !61 + %shift18 = shufflevector <2 x float> %148, <2 x float> poison, <2 x i32> , !dbg !61 + %foldExtExtBinop19 = fadd <2 x float> %148, %shift18, !dbg !61 + %150 = extractelement <2 x float> %foldExtExtBinop19, i64 0, !dbg !61 + %151 = getelementptr bfloat, ptr addrspace(1) %4, i64 %23, !dbg !62 + %152 = getelementptr bfloat, ptr addrspace(1) %4, i64 %24, !dbg !62 + %153 = fptrunc float %149 to bfloat, !dbg !63 + %154 = fptrunc float %150 to bfloat, !dbg !63 + %155 = bitcast bfloat %153 to i16, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %155, ptr addrspace(1) %151, i1 %21) #4, !dbg !63 + %156 = bitcast bfloat %154 to i16, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %152, i1 %22) #4, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", linkageName: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 23, column: 21, scope: !9) +!16 = !DILocation(line: 23, column: 28, scope: !9) +!17 = !DILocation(line: 24, column: 19, scope: !9) +!18 = !DILocation(line: 26, column: 30, scope: !9) +!19 = !DILocation(line: 26, column: 35, scope: !9) +!20 = !DILocation(line: 27, column: 30, scope: !9) +!21 = !DILocation(line: 27, column: 35, scope: !9) +!22 = !DILocation(line: 30, column: 18, scope: !9) +!23 = !DILocation(line: 31, column: 32, scope: !9) +!24 = !DILocation(line: 32, column: 28, scope: !9) +!25 = !DILocation(line: 32, column: 44, scope: !9) +!26 = !DILocation(line: 32, column: 37, scope: !9) +!27 = !DILocation(line: 32, column: 54, scope: !9) +!28 = !DILocation(line: 32, column: 52, scope: !9) +!29 = !DILocation(line: 32, column: 62, scope: !9) +!30 = !DILocation(line: 33, column: 39, scope: !9) +!31 = !DILocation(line: 33, column: 30, scope: !9) +!32 = !DILocation(line: 33, column: 46, scope: !9) +!33 = !DILocation(line: 38, column: 31, scope: !9) +!34 = !DILocation(line: 38, column: 18, scope: !9) +!35 = !DILocation(line: 39, column: 19, scope: !9) +!36 = !DILocation(line: 40, column: 35, scope: !9) +!37 = !DILocation(line: 40, column: 31, scope: !9) +!38 = !DILocation(line: 40, column: 68, scope: !9) +!39 = !DILocation(line: 40, column: 60, scope: !9) +!40 = !DILocation(line: 44, column: 20, scope: !9) +!41 = !DILocation(line: 47, column: 47, scope: !9) +!42 = !DILocation(line: 47, column: 31, scope: !9) +!43 = !DILocation(line: 47, column: 81, scope: !9) +!44 = !DILocation(line: 47, column: 73, scope: !9) +!45 = !DILocation(line: 51, column: 34, scope: !9) +!46 = !DILocation(line: 52, column: 28, scope: !9) +!47 = !DILocation(line: 52, column: 46, scope: !9) +!48 = !DILocation(line: 52, column: 38, scope: !9) +!49 = !DILocation(line: 52, column: 54, scope: !9) +!50 = !DILocation(line: 52, column: 64, scope: !9) +!51 = !DILocation(line: 40, column: 119, scope: !9) +!52 = !DILocation(line: 41, column: 13, scope: !9) +!53 = !DILocation(line: 47, column: 132, scope: !9) +!54 = !DILocation(line: 0, scope: !9) +!55 = !DILocation(line: 26, column: 75, scope: !9) +!56 = !DILocation(line: 53, column: 40, scope: !9) +!57 = !DILocation(line: 53, column: 31, scope: !9) +!58 = !DILocation(line: 53, column: 48, scope: !9) +!59 = !DILocation(line: 33, column: 86, scope: !9) +!60 = !DILocation(line: 34, column: 18, scope: !9) +!61 = !DILocation(line: 55, column: 19, scope: !9) +!62 = !DILocation(line: 56, column: 25, scope: !9) +!63 = !DILocation(line: 56, column: 37, scope: !9) +!64 = !DILocation(line: 56, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..10f8c1d328a9487d68dfe8c08074e70369642f11 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ptx @@ -0,0 +1,652 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 53, 111, 47, 99, 53, 111, 115, 119, 110, 114, 55, 100, 112, 119, 119, 99, 113, 112, 53, 109, 55, 116, 104, 112, 97, 102, 52, 111, 119, 118, 112, 115, 101, 107, 97, 54, 97, 109, 100, 118, 100, 114, 111, 101, 119, 110, 111, 114, 122, 115, 114, 101, 54, 110, 50, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 50, 53, 32, 60, 32, 107, 115, 52}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 53, 111, 47, 99, 53, 111, 115, 119, 110, 114, 55, 100, 112, 119, 119, 99, 113, 112, 53, 109, 55, 116, 104, 112, 97, 102, 52, 111, 119, 118, 112, 115, 101, 107, 97, 54, 97, 109, 100, 118, 100, 114, 111, 101, 119, 110, 111, 114, 122, 115, 114, 101, 54, 110, 50, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 53, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9, + .param .u32 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_11, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_12 +) +.reqntid 256 +{ + .reg .pred %p<47>; + .reg .b16 %rs<29>; + .reg .b32 %r<58>; + .reg .b64 %rd<151>; + .loc 1 18 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:18:0 + +// %bb.0: + ld.param.b64 %rd33, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + or.b64 %rd38, %rd1, %rd33; + and.b64 %rd39, %rd38, -4294967296; + setp.ne.b64 %p1, %rd39, 0; + cvt.u32.u64 %r57, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd145, %rd1, %rd33; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd33; + div.u32 %r11, %r57, %r9; + cvt.u64.u32 %rd145, %r11; +$L__BB0_3: + .loc 1 0 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:21 + ld.param.b64 %rd34, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_6]; + .loc 1 23 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:21 + or.b64 %rd40, %rd2, %rd33; + and.b64 %rd41, %rd40, -4294967296; + setp.ne.b64 %p2, %rd41, 0; + cvt.u32.u64 %r56, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd146, %rd2, %rd33; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd33; + div.u32 %r14, %r56, %r12; + cvt.u64.u32 %rd146, %r14; +$L__BB0_6: + .loc 1 23 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:28 + or.b64 %rd42, %rd145, %rd34; + and.b64 %rd43, %rd42, -4294967296; + setp.ne.b64 %p3, %rd43, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd147, %rd145, %rd34; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd34; + cvt.u32.u64 %r16, %rd145; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd147, %r17; +$L__BB0_9: + .loc 1 0 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:28 + ld.param.b64 %rd36, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_8]; + .loc 1 23 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:23:28 + or.b64 %rd44, %rd146, %rd34; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p4, %rd45, 0; + @%p4 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd148, %rd146, %rd34; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd34; + cvt.u32.u64 %r19, %rd146; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd148, %r20; +$L__BB0_12: + .loc 1 24 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:24:19 + or.b64 %rd46, %rd1, %rd36; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p5, %rd47, 0; + @%p5 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + rem.s64 %rd149, %rd1, %rd36; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r21, %rd36; + rem.u32 %r23, %r57, %r21; + cvt.u64.u32 %rd149, %r23; +$L__BB0_15: + .loc 1 0 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:19 + ld.param.b32 %r1, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_10]; + ld.param.b64 %rd35, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd29, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd28, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_0]; + .loc 1 24 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:24:19 + or.b64 %rd48, %rd2, %rd36; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p6, %rd49, 0; + @%p6 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + rem.s64 %rd150, %rd2, %rd36; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r24, %rd36; + rem.u32 %r26, %r56, %r24; + cvt.u64.u32 %rd150, %r26; +$L__BB0_18: + .loc 1 21 21 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:21:21 + setp.lt.s32 %p10, %r56, %r1; + setp.lt.s32 %p9, %r57, %r1; + .loc 1 26 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:30 + shl.b64 %rd64, %rd1, 1; + add.s64 %rd51, %rd28, %rd64; + add.s64 %rd54, %rd51, 2; + .loc 1 26 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:35 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 27 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:27:30 + shl.b64 %rd65, %rd147, 3; + add.s64 %rd58, %rd29, %rd65; + shl.b64 %rd66, %rd148, 3; + add.s64 %rd62, %rd29, %rd66; + .loc 1 27 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:27:35 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd58 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd62 + 0 ], %rd60; + // end inline asm + .loc 1 31 32 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:31:32 + shr.s64 %rd67, %rd57, 63; + and.b64 %rd68, %rd67, %rd35; + add.s64 %rd23, %rd68, %rd57; + shr.s64 %rd69, %rd61, 63; + and.b64 %rd70, %rd69, %rd35; + add.s64 %rd24, %rd70, %rd61; + .loc 1 32 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:28 + setp.lt.s64 %p11, %rd23, 0; + setp.lt.s64 %p12, %rd24, 0; + .loc 1 32 44 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:44 + setp.ge.s64 %p13, %rd23, %rd35; + setp.ge.s64 %p14, %rd24, %rd35; + .loc 1 32 37 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:37 + or.pred %p15, %p11, %p13; + .loc 1 32 54 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:54 + or.pred %p16, %p12, %p14; + .loc 1 32 52 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:52 + and.pred %p17, %p9, %p15; + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + and.pred %p18, %p10, %p16; + or.pred %p19, %p17, %p18; + not.pred %p20, %p19; + @%p20 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + .loc 1 0 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:62 + ld.param.b64 %rd37, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd30, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_2]; + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + bar.sync 0; + .loc 1 33 39 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:39 + mul.lo.s64 %rd89, %rd23, %rd36; + mul.lo.s64 %rd90, %rd24, %rd36; + .loc 1 33 30 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:30 + shl.b64 %rd91, %rd149, 1; + add.s64 %rd92, %rd30, %rd91; + shl.b64 %rd93, %rd89, 1; + add.s64 %rd72, %rd92, %rd93; + shl.b64 %rd94, %rd150, 1; + add.s64 %rd95, %rd30, %rd94; + shl.b64 %rd96, %rd90, 1; + add.s64 %rd75, %rd95, %rd96; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd72 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd75 + 0 ], %rd74; + // end inline asm + .loc 1 38 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:38:31 + shr.u64 %rd97, %rd36, 63; + add.s64 %rd98, %rd36, %rd97; + shr.s64 %rd99, %rd98, 1; + .loc 1 38 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:38:18 + sub.s64 %rd25, %rd36, %rd99; + .loc 1 39 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:39:19 + setp.lt.s64 %p27, %rd149, %rd25; + setp.lt.s64 %p28, %rd150, %rd25; + .loc 1 40 35 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:35 + sub.s64 %rd100, %rd1, %rd149; + .loc 1 40 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:31 + shl.b64 %rd101, %rd100, 1; + add.s64 %rd102, %rd28, %rd101; + and.b64 %rd103, %rd98, -2; + add.s64 %rd104, %rd102, %rd103; + add.s64 %rd78, %rd104, %rd91; + sub.s64 %rd105, %rd1, %rd150; + shl.b64 %rd106, %rd105, 1; + add.s64 %rd107, %rd28, %rd106; + add.s64 %rd108, %rd107, %rd103; + add.s64 %rd109, %rd108, %rd94; + add.s64 %rd81, %rd109, 2; + .loc 1 40 68 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:68 + and.pred %p23, %p9, %p27; + and.pred %p24, %p10, %p28; + .loc 1 40 60 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:60 + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + mov.b16 %rs14, 0; + // begin inline asm + mov.u16 %rs13, %rs14; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd78 + 0 ], %rd77; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs14; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd81 + 0 ], %rd80; + // end inline asm + .loc 1 44 20 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:44:20 + setp.ge.s64 %p29, %rd149, %rd25; + setp.ge.s64 %p30, %rd150, %rd25; + .loc 1 47 47 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:47 + sub.s64 %rd110, %rd149, %rd36; + .loc 1 47 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:31 + shl.b64 %rd111, %rd110, 1; + add.s64 %rd84, %rd104, %rd111; + add.s64 %rd87, %rd84, 2; + .loc 1 47 81 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:81 + and.pred %p25, %p9, %p29; + and.pred %p26, %p10, %p30; + .loc 1 47 73 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:73 + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs14; + @%p25 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd84 + 0 ], %rd83; + // end inline asm + // begin inline asm + mov.u64 %rd86, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd86, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs14; + @%p26 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd87 + 0 ], %rd86; + // end inline asm + .loc 1 51 34 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:51:34 + and.b64 %rd113, %rd67, %rd37; + add.s64 %rd26, %rd113, %rd57; + and.b64 %rd115, %rd69, %rd37; + add.s64 %rd27, %rd115, %rd61; + .loc 1 52 28 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:28 + setp.lt.s64 %p31, %rd26, 0; + setp.lt.s64 %p32, %rd27, 0; + .loc 1 52 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:46 + setp.ge.s64 %p33, %rd26, %rd37; + setp.ge.s64 %p34, %rd27, %rd37; + .loc 1 52 38 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:38 + or.pred %p35, %p31, %p33; + .loc 1 52 54 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:54 + or.pred %p36, %p32, %p34; + and.pred %p37, %p9, %p35; + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + and.pred %p38, %p10, %p36; + or.pred %p39, %p37, %p38; + not.pred %p40, %p39; + @%p40 bra $L__BB0_22; + bra.uni $L__BB0_21; +$L__BB0_22: + .loc 1 0 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0:64 + ld.param.b64 %rd32, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd31, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0_param_3]; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r33, %rs15; + mov.b32 %r34, 0f00000000; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r35, %r34, %r33; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r36, %rs19; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r37, %r35, %r36, %p28; + .loc 1 40 119 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:40:119 + cvt.f32.bf16 %r38, %rs13; + .loc 1 41 13 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:41:13 + sub.f32 %r39, %r34, %r38; + .loc 1 47 132 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:47:132 + cvt.f32.bf16 %r40, %rs17; + .loc 1 0 0 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:0 + selp.f32 %r41, %r39, %r40, %p27; + .loc 1 26 75 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:26:75 + cvt.f32.bf16 %r42, %rs10; + cvt.f32.bf16 %r43, %rs9; + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + bar.sync 0; + .loc 1 53 40 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:40 + mul.lo.s64 %rd124, %rd26, %rd36; + mul.lo.s64 %rd125, %rd27, %rd36; + .loc 1 53 31 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:31 + add.s64 %rd127, %rd31, %rd91; + shl.b64 %rd128, %rd124, 1; + add.s64 %rd117, %rd127, %rd128; + add.s64 %rd130, %rd31, %rd94; + shl.b64 %rd131, %rd125, 1; + add.s64 %rd120, %rd130, %rd131; + .loc 1 53 48 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:53:48 + // begin inline asm + mov.u64 %rd118, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd118, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd117 + 0 ], %rd118; + // end inline asm + // begin inline asm + mov.u64 %rd121, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd121, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd120 + 0 ], %rd121; + // end inline asm + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r44, {%rs11, %rs21}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs25, %rs26}, %r44; + cvt.f32.bf16 %r45, %rs25; + cvt.f32.bf16 %r46, %rs26; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r47, %r41, %r46; + .loc 1 33 46 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:46 + mov.b32 %r48, {%rs12, %rs22}; + .loc 1 33 86 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:33:86 + mov.b32 {%rs27, %rs28}, %r48; + cvt.f32.bf16 %r49, %rs27; + cvt.f32.bf16 %r50, %rs28; + .loc 1 34 18 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:34:18 + mul.f32 %r51, %r37, %r50; + .loc 1 55 19 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:55:19 + fma.rn.f32 %r52, %r43, %r45, %r47; + fma.rn.f32 %r53, %r42, %r49, %r51; + .loc 1 56 25 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:25 + add.s64 %rd122, %rd32, %rd64; + add.s64 %rd123, %rd122, 2; + .loc 1 56 37 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:37 + cvt.rn.bf16.f32 %rs23, %r52; + cvt.rn.bf16.f32 %rs24, %r53; + // begin inline asm + @%p9 st.global.b16 [ %rd122 + 0 ], { %rs23 }; + // end inline asm + // begin inline asm + @%p10 st.global.b16 [ %rd123 + 0 ], { %rs24 }; + // end inline asm + .loc 1 56 4 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:56:4 + ret; +$L__BB0_19: + .loc 1 32 62 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:32:62 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd139, assertFunc_0; + cvta.global.u64 %rd140, %rd139; + st.param.b64 [param3], %rd140; + mov.b64 %rd141, assertFile_0; + cvta.global.u64 %rd142, %rd141; + st.param.b64 [param1], %rd142; + mov.b64 %rd143, assertMessage_0; + cvta.global.u64 %rd144, %rd143; + st.param.b64 [param0], %rd144; + st.param.b64 [param4], 1; + st.param.b32 [param2], 32; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_21: + .loc 1 52 64 // c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py:52:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd133, assertFunc_1; + cvta.global.u64 %rd134, %rd133; + st.param.b64 [param3], %rd134; + mov.b64 %rd135, assertFile_1; + cvta.global.u64 %rd136, %rd135; + st.param.b64 [param1], %rd136; + mov.b64 %rd137, assertMessage_1; + cvta.global.u64 %rd138, %rd137; + st.param.b64 [param0], %rd138; + st.param.b64 [param4], 1; + st.param.b32 [param2], 52; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 111 +.b8 115 +.b8 119 +.b8 110 +.b8 114 +.b8 55 +.b8 100 +.b8 112 +.b8 119 +.b8 119 +.b8 99 +.b8 113 +.b8 112 +.b8 53 +.b8 109 +.b8 55 +.b8 116 +.b8 104 +.b8 112 +.b8 97 +.b8 102 +.b8 52 +.b8 111 +.b8 119 +.b8 118 +.b8 112 +.b8 115 +.b8 101 +.b8 107 +.b8 97 +.b8 54 +.b8 97 +.b8 109 +.b8 100 +.b8 118 +.b8 100 +.b8 114 +.b8 111 +.b8 101 +.b8 119 +.b8 110 +.b8 111 +.b8 114 +.b8 122 +.b8 115 +.b8 114 +.b8 101 +.b8 54 +.b8 110 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 111 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..9615cf1d20fff7769af85b25bf86421bfe5df1eb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.source @@ -0,0 +1,299 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc78 = loc("in_ptr0"(#loc)) +#loc79 = loc("in_ptr1"(#loc)) +#loc80 = loc("in_ptr2"(#loc)) +#loc81 = loc("in_ptr3"(#loc)) +#loc82 = loc("out_ptr0"(#loc)) +#loc83 = loc("ks0"(#loc)) +#loc84 = loc("ks1"(#loc)) +#loc85 = loc("ks2"(#loc)) +#loc86 = loc("ks3"(#loc)) +#loc87 = loc("ks4"(#loc)) +#loc88 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_0 = arith.constant 512 : i32 loc(#loc90) + %xoffset_1 = arith.constant 512 : i32 loc(#loc90) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc90) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc91) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc92) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc92) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc93) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc93) + %x2 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc94) + %x2_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc94) + %x2_7 = arith.divsi %x2, %x2_6 : tensor<512xi64> loc(#loc94) + %x2_8 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc95) + %x2_9 = arith.remsi %x2_7, %x2_8 : tensor<512xi64> loc(#loc95) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc96) + %x0_10 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc96) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<512xi64> loc(#loc96) + %x5 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc97) + %x5_12 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc97) + %x5_13 = arith.divsi %x5, %x5_12 : tensor<512xi64> loc(#loc97) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc98) + %tmp0_14 = tt.addptr %tmp0, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc98) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc99) + %tmp0_16 = arith.extf %tmp0_15 : tensor<512xbf16> to tensor<512xf32> loc(#loc100) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc101) + %tmp1_17 = tt.addptr %tmp1, %x2_9 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc101) + %tmp1_18 = tt.load %tmp1_17, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc102) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc103) + %tmp3_19 = arith.addi %tmp1_18, %tmp3 : tensor<512xi64> loc(#loc103) + %tmp4 = arith.constant 0 : i32 loc(#loc104) + %tmp4_20 = arith.extsi %tmp4 : i32 to i64 loc(#loc104) + %tmp4_21 = tt.splat %tmp4_20 : i64 -> tensor<512xi64> loc(#loc104) + %tmp4_22 = arith.cmpi slt, %tmp1_18, %tmp4_21 : tensor<512xi64> loc(#loc104) + %tmp5 = arith.select %tmp4_22, %tmp3_19, %tmp1_18 : tensor<512xi1>, tensor<512xi64> loc(#loc105) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc18) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc18) + %2 = arith.cmpi sle, %1, %tmp5 : tensor<512xi64> loc(#loc18) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc19) + %4 = arith.cmpi slt, %tmp5, %3 : tensor<512xi64> loc(#loc19) + %5 = arith.andi %2, %4 : tensor<512xi1> loc(#loc20) + %true = arith.constant true loc(#loc21) + %cst = arith.constant dense : tensor<512xi1> loc(#loc21) + %6 = arith.xori %xmask_5, %cst : tensor<512xi1> loc(#loc21) + %7 = arith.ori %5, %6 : tensor<512xi1> loc(#loc22) + tt.assert %7, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1> loc(#loc23) + %tmp7 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc106) + %tmp7_23 = arith.muli %tmp7, %tmp5 : tensor<512xi64> loc(#loc106) + %tmp7_24 = arith.addi %x0_11, %tmp7_23 : tensor<512xi64> loc(#loc107) + %tmp7_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc108) + %tmp7_26 = tt.addptr %tmp7_25, %tmp7_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc108) + %tmp7_27 = tt.load %tmp7_26, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc109) + %tmp7_28 = arith.extf %tmp7_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc110) + %tmp8 = arith.mulf %tmp0_16, %tmp7_28 : tensor<512xf32> loc(#loc111) + %tmp10 = arith.constant 0 : i64 loc(#loc112) + %tmp10_29 = arith.constant dense<0> : tensor<1xi64> loc(#loc112) + %tmp11 = arith.constant dense<0> : tensor<512xi64> loc(#loc113) + %tmp11_30 = arith.cmpi sge, %x0_11, %tmp11 : tensor<512xi64> loc(#loc113) + %tmp12 = arith.constant 2 : i32 loc(#loc114) + %tmp12_31 = arith.constant 2 : i64 loc(#loc114) + %tmp12_32 = arith.divsi %ks3, %tmp12_31 : i64 loc(#loc114) + %tmp12_33 = arith.constant -1 : i32 loc(#loc115) + %tmp12_34 = arith.constant -1 : i64 loc(#loc115) + %tmp12_35 = arith.muli %tmp12_34, %tmp12_32 : i64 loc(#loc115) + %tmp12_36 = arith.addi %ks3, %tmp12_35 : i64 loc(#loc116) + %tmp13 = tt.splat %tmp12_36 : i64 -> tensor<512xi64> loc(#loc117) + %tmp13_37 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64> loc(#loc117) + %tmp14 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc118) + %tmp14_38 = arith.muli %tmp14, %x5_13 : tensor<512xi64> loc(#loc118) + %tmp14_39 = arith.constant 2 : i32 loc(#loc119) + %tmp14_40 = arith.constant 2 : i64 loc(#loc119) + %tmp14_41 = arith.divsi %ks3, %tmp14_40 : i64 loc(#loc119) + %tmp14_42 = tt.splat %tmp14_41 : i64 -> tensor<512xi64> loc(#loc120) + %tmp14_43 = arith.addi %tmp14_38, %tmp14_42 : tensor<512xi64> loc(#loc120) + %tmp14_44 = arith.addi %tmp14_43, %x0_11 : tensor<512xi64> loc(#loc121) + %tmp14_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc122) + %tmp14_46 = tt.addptr %tmp14_45, %tmp14_44 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc122) + %tmp14_47 = arith.andi %tmp13_37, %xmask_5 : tensor<512xi1> loc(#loc123) + %tmp14_48 = arith.constant 0.000000e+00 : f32 loc(#loc124) + %tmp14_49 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc124) + %tmp14_50 = arith.truncf %tmp14_49 : tensor<512xf32> to tensor<512xbf16> loc(#loc124) + %tmp14_51 = tt.load %tmp14_46, %tmp14_47, %tmp14_50 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc124) + %tmp14_52 = arith.extf %tmp14_51 : tensor<512xbf16> to tensor<512xf32> loc(#loc125) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp15_53 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc126) + %tmp15_54 = arith.subf %tmp15_53, %tmp14_52 : tensor<512xf32> loc(#loc126) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc127) + %tmp16_55 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc127) + %tmp17 = arith.select %tmp13_37, %tmp15_54, %tmp16_55 : tensor<512xi1>, tensor<512xf32> loc(#loc128) + %tmp18 = tt.splat %tmp12_36 : i64 -> tensor<512xi64> loc(#loc129) + %tmp18_56 = arith.cmpi sge, %x0_11, %tmp18 : tensor<512xi64> loc(#loc129) + %tmp20 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc130) + %tmp20_57 = arith.cmpi slt, %x0_11, %tmp20 : tensor<512xi64> loc(#loc130) + %tmp21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc131) + %tmp21_58 = arith.muli %tmp21, %x5_13 : tensor<512xi64> loc(#loc131) + %tmp21_59 = arith.constant -1 : i32 loc(#loc132) + %tmp21_60 = arith.constant -1 : i64 loc(#loc132) + %tmp21_61 = arith.muli %tmp21_60, %ks3 : i64 loc(#loc132) + %tmp21_62 = tt.splat %tmp21_61 : i64 -> tensor<512xi64> loc(#loc133) + %tmp21_63 = arith.addi %x0_11, %tmp21_62 : tensor<512xi64> loc(#loc133) + %tmp21_64 = arith.constant 2 : i32 loc(#loc134) + %tmp21_65 = arith.constant 2 : i64 loc(#loc134) + %tmp21_66 = arith.divsi %ks3, %tmp21_65 : i64 loc(#loc134) + %tmp21_67 = tt.splat %tmp21_66 : i64 -> tensor<512xi64> loc(#loc135) + %tmp21_68 = arith.addi %tmp21_63, %tmp21_67 : tensor<512xi64> loc(#loc135) + %tmp21_69 = arith.addi %tmp21_58, %tmp21_68 : tensor<512xi64> loc(#loc136) + %tmp21_70 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp21_71 = tt.addptr %tmp21_70, %tmp21_69 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp21_72 = arith.andi %tmp18_56, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp21_73 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp21_74 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp21_75 = arith.truncf %tmp21_74 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp21_76 = tt.load %tmp21_71, %tmp21_72, %tmp21_75 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp21_77 = arith.extf %tmp21_76 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp22 = arith.select %tmp13_37, %tmp17, %tmp21_77 : tensor<512xi1>, tensor<512xf32> loc(#loc141) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc142) + %tmp24_78 = arith.addi %tmp1_18, %tmp24 : tensor<512xi64> loc(#loc142) + %tmp25 = arith.select %tmp4_22, %tmp24_78, %tmp1_18 : tensor<512xi1>, tensor<512xi64> loc(#loc143) + %c0_i32_79 = arith.constant 0 : i32 loc(#loc62) + %8 = arith.extsi %c0_i32_79 : i32 to i64 loc(#loc62) + %9 = tt.splat %8 : i64 -> tensor<512xi64> loc(#loc62) + %10 = arith.cmpi sle, %9, %tmp25 : tensor<512xi64> loc(#loc62) + %11 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc63) + %12 = arith.cmpi slt, %tmp25, %11 : tensor<512xi64> loc(#loc63) + %13 = arith.andi %10, %12 : tensor<512xi1> loc(#loc64) + %true_80 = arith.constant true loc(#loc65) + %cst_81 = arith.constant dense : tensor<512xi1> loc(#loc65) + %14 = arith.xori %xmask_5, %cst_81 : tensor<512xi1> loc(#loc65) + %15 = arith.ori %13, %14 : tensor<512xi1> loc(#loc66) + tt.assert %15, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1> loc(#loc67) + %tmp27 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc144) + %tmp27_82 = arith.muli %tmp27, %tmp25 : tensor<512xi64> loc(#loc144) + %tmp27_83 = arith.addi %x0_11, %tmp27_82 : tensor<512xi64> loc(#loc145) + %tmp27_84 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc146) + %tmp27_85 = tt.addptr %tmp27_84, %tmp27_83 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc146) + %tmp27_86 = tt.load %tmp27_85, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc147) + %tmp27_87 = arith.extf %tmp27_86 : tensor<512xbf16> to tensor<512xf32> loc(#loc148) + %tmp28 = arith.mulf %tmp22, %tmp27_87 : tensor<512xf32> loc(#loc149) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32> loc(#loc150) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc75) + %17 = tt.addptr %16, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc75) + %18 = arith.truncf %tmp29 : tensor<512xf32> to tensor<512xbf16> loc(#loc76) + tt.store %17, %18, %xmask_5 : tensor<512x!tt.ptr> loc(#loc76) + tt.return loc(#loc77) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":37:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:48) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":42:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":46:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:67) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:56) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc89 = loc("xoffset"(#loc1)) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xindex"(#loc3)) +#loc92 = loc("xindex"(#loc4)) +#loc93 = loc("xmask"(#loc5)) +#loc94 = loc("x2"(#loc6)) +#loc95 = loc("x2"(#loc7)) +#loc96 = loc("x0"(#loc8)) +#loc97 = loc("x5"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp0"(#loc11)) +#loc100 = loc("tmp0"(#loc12)) +#loc101 = loc("tmp1"(#loc13)) +#loc102 = loc("tmp1"(#loc14)) +#loc103 = loc("tmp3"(#loc15)) +#loc104 = loc("tmp4"(#loc16)) +#loc105 = loc("tmp5"(#loc17)) +#loc106 = loc("tmp7"(#loc24)) +#loc107 = loc("tmp7"(#loc25)) +#loc108 = loc("tmp7"(#loc26)) +#loc109 = loc("tmp7"(#loc27)) +#loc110 = loc("tmp7"(#loc28)) +#loc111 = loc("tmp8"(#loc29)) +#loc112 = loc("tmp10"(#loc30)) +#loc113 = loc("tmp11"(#loc31)) +#loc114 = loc("tmp12"(#loc32)) +#loc115 = loc("tmp12"(#loc33)) +#loc116 = loc("tmp12"(#loc34)) +#loc117 = loc("tmp13"(#loc35)) +#loc118 = loc("tmp14"(#loc36)) +#loc119 = loc("tmp14"(#loc37)) +#loc120 = loc("tmp14"(#loc38)) +#loc121 = loc("tmp14"(#loc39)) +#loc122 = loc("tmp14"(#loc40)) +#loc123 = loc("tmp14"(#loc41)) +#loc124 = loc("tmp14"(#loc42)) +#loc125 = loc("tmp14"(#loc43)) +#loc126 = loc("tmp15"(#loc44)) +#loc127 = loc("tmp16"(#loc45)) +#loc128 = loc("tmp17"(#loc46)) +#loc129 = loc("tmp18"(#loc47)) +#loc130 = loc("tmp20"(#loc48)) +#loc131 = loc("tmp21"(#loc49)) +#loc132 = loc("tmp21"(#loc50)) +#loc133 = loc("tmp21"(#loc51)) +#loc134 = loc("tmp21"(#loc52)) +#loc135 = loc("tmp21"(#loc53)) +#loc136 = loc("tmp21"(#loc54)) +#loc137 = loc("tmp21"(#loc55)) +#loc138 = loc("tmp21"(#loc56)) +#loc139 = loc("tmp21"(#loc57)) +#loc140 = loc("tmp21"(#loc58)) +#loc141 = loc("tmp22"(#loc59)) +#loc142 = loc("tmp24"(#loc60)) +#loc143 = loc("tmp25"(#loc61)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp27"(#loc71)) +#loc148 = loc("tmp27"(#loc72)) +#loc149 = loc("tmp28"(#loc73)) +#loc150 = loc("tmp29"(#loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..b8f882d70f98bfe5bb22ef1e6167932220a2b6b9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,232 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<512xi64, #blocked> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<512xi64, #blocked> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<512xi64, #blocked> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<512xi64, #blocked> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<512xi64, #blocked> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_1 : tensor<512xi64, #blocked> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_1 : tensor<512xi64, #blocked> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<512xi64, #blocked> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc21) + %3 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1, #blocked> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<512xi64, #blocked> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<512xi64, #blocked> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<512xf32, #blocked> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64, #blocked> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<512xi64, #blocked> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<512xi64, #blocked> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<512xi64, #blocked> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<512xi1, #blocked> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc113) + %tmp15 = arith.subf %cst_2, %tmp14_31 : tensor<512xf32, #blocked> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<512xi64, #blocked> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<512xi64, #blocked> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<512xi64, #blocked> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<512xi64, #blocked> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<512xi1, #blocked> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64, #blocked> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<512xi64, #blocked> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<512xi1, #blocked> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1, #blocked> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<512xi64, #blocked> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<512xi64, #blocked> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<512xf32, #blocked> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32, #blocked> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7ddfbbb1ae37a849f37c60d6101e8b373c51359a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFABHOURJ57C2IKXWDMS2VHZ76PCVKJVD7V6CBWJDLMT5TQE5GFA/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0.ttir @@ -0,0 +1,231 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %cst_2 = arith.constant dense : tensor<512xi1> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<512xi64> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<512xi64> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<512xi64> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<512xi64> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<512xi64> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_0 : tensor<512xi64> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<512xi1>, tensor<512xi64> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_0 : tensor<512xi64> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<512xi64> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc21) + %3 = arith.xori %xmask_6, %cst_2 : tensor<512xi1> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<512xi64> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<512xi64> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<512xf32> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<512xi64> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<512xi64> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<512xi64> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<512xi64> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<512xi64> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<512xi1> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc113) + %tmp15 = arith.subf %cst_1, %tmp14_31 : tensor<512xf32> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<512xi64> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<512xi64> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<512xi64> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<512xi64> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<512xi64> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<512xi1> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<512xi1>, tensor<512xf32> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<512xi64> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<512xi1>, tensor<512xi64> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_0 : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<512xi1> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<512xi64> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<512xi64> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<512xf32> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<512xf32> to tensor<512xbf16> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<512x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/5o/c5oswnr7dpwwcqp5m7thpaf4owvpseka6amdvdroewnorzsre6n2.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..a159e122bc381482c04c247bd165a23bdb771952 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..9f8d9b90e9388567a6c49f5aa21c45f538650476 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7848edd58fa6cb351a065f121ef7f66022cbb319 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "696e31cf28240068f521b5857aff70ca1774b115aa6afc9f623811458c0d1f69", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..ab8e6ae0790b2bd516b39cffe2359a4ecb346601 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,840 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 32, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 15, !dbg !12 + %16 = shl i32 %12, 4, !dbg !13 + %17 = or disjoint i32 %15, %16, !dbg !14 + %18 = sext i32 %17 to i64, !dbg !15 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %22 = add i64 %20, -1, !dbg !17 + %23 = icmp ult i64 %22, 16383, !dbg !17 + %24 = add i64 %21, -1, !dbg !17 + %25 = icmp ult i64 %24, 16383, !dbg !17 + %26 = zext i1 %23 to i32, !dbg !18 + %27 = lshr i32 %14, 1, !dbg !19 + %.lobit = and i32 %27, 1, !dbg !19 + %28 = and i32 %14, 1, !dbg !19 + %29 = lshr i32 %14, 2, !dbg !19 + %.lobit1 = and i32 %29, 1, !dbg !19 + %30 = lshr i32 %14, 3, !dbg !19 + %.lobit2 = and i32 %30, 1, !dbg !19 + %31 = xor i32 %28, 1, !dbg !23 + %32 = xor i32 %.lobit, 1, !dbg !23 + %33 = xor i32 %.lobit1, 1, !dbg !23 + %34 = xor i32 %.lobit2, 1, !dbg !23 + %35 = select i1 %23, i32 %31, i32 0, !dbg !24 + %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 1, i32 31), !dbg !25 + %37 = add i32 %35, %36, !dbg !28 + %38 = select i1 %23, i32 %28, i32 0, !dbg !29 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %38, i32 1, i32 31), !dbg !25 + %40 = add i32 %38, %39, !dbg !28 + %41 = mul nuw nsw i32 %31, %15, !dbg !30 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !25 + %43 = add i32 %42, %41, !dbg !28 + %44 = mul nuw nsw i32 %15, %28, !dbg !31 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 1, i32 31), !dbg !25 + %46 = add i32 %45, %44, !dbg !28 + %47 = icmp slt i32 %37, %40, !dbg !32 + %48 = icmp eq i32 %37, %40, !dbg !33 + %49 = icmp sgt i32 %43, %46, !dbg !34 + %50 = and i1 %48, %49, !dbg !35 + %51 = or i1 %47, %50, !dbg !36 + %52 = trunc i32 %27 to i1, !dbg !37 + %53 = xor i1 %51, %52, !dbg !37 + %54 = xor i32 %37, %40, !dbg !38 + %55 = select i1 %53, i32 %54, i32 0, !dbg !39 + %56 = xor i32 %55, %26, !dbg !40 + %57 = xor i32 %46, %43, !dbg !41 + %58 = select i1 %53, i32 %57, i32 0, !dbg !42 + %59 = xor i32 %58, %15, !dbg !43 + %60 = mul nuw nsw i32 %56, %32, !dbg !24 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !25 + %62 = add i32 %60, %61, !dbg !28 + %63 = mul nuw nsw i32 %56, %.lobit, !dbg !29 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !25 + %65 = add i32 %63, %64, !dbg !28 + %66 = mul nuw nsw i32 %59, %32, !dbg !30 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 2, i32 31), !dbg !25 + %68 = add i32 %66, %67, !dbg !28 + %69 = mul nuw nsw i32 %59, %.lobit, !dbg !31 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 2, i32 31), !dbg !25 + %71 = add i32 %69, %70, !dbg !28 + %72 = trunc i32 %29 to i1, !dbg !37 + %73 = icmp sge i32 %62, %65, !dbg !37 + %74 = icmp ne i32 %62, %65, !dbg !37 + %75 = icmp sle i32 %68, %71, !dbg !37 + %76 = or i1 %74, %75, !dbg !37 + %77 = and i1 %73, %76, !dbg !37 + %.not = xor i1 %77, %72, !dbg !37 + %78 = xor i32 %62, %65, !dbg !38 + %79 = select i1 %.not, i32 0, i32 %78, !dbg !39 + %80 = xor i32 %79, %56, !dbg !40 + %81 = xor i32 %68, %71, !dbg !41 + %82 = select i1 %.not, i32 0, i32 %81, !dbg !42 + %83 = xor i32 %82, %59, !dbg !43 + %84 = mul nuw nsw i32 %80, %31, !dbg !24 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !25 + %86 = add i32 %84, %85, !dbg !28 + %87 = mul nuw nsw i32 %80, %28, !dbg !29 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !25 + %89 = add i32 %87, %88, !dbg !28 + %90 = mul nuw nsw i32 %83, %31, !dbg !30 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !25 + %92 = add i32 %90, %91, !dbg !28 + %93 = mul nuw nsw i32 %83, %28, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 1, i32 31), !dbg !25 + %95 = add i32 %93, %94, !dbg !28 + %96 = icmp sge i32 %86, %89, !dbg !37 + %97 = icmp ne i32 %86, %89, !dbg !37 + %98 = icmp sle i32 %92, %95, !dbg !37 + %99 = or i1 %97, %98, !dbg !37 + %100 = and i1 %96, %99, !dbg !37 + %.not3 = xor i1 %100, %72, !dbg !37 + %101 = xor i32 %86, %89, !dbg !38 + %102 = select i1 %.not3, i32 0, i32 %101, !dbg !39 + %103 = xor i32 %102, %80, !dbg !40 + %104 = xor i32 %92, %95, !dbg !41 + %105 = select i1 %.not3, i32 0, i32 %104, !dbg !42 + %106 = xor i32 %105, %83, !dbg !43 + %107 = mul nuw nsw i32 %103, %33, !dbg !24 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !25 + %109 = add i32 %107, %108, !dbg !28 + %110 = mul nuw nsw i32 %103, %.lobit1, !dbg !29 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !25 + %112 = add i32 %110, %111, !dbg !28 + %113 = mul nuw nsw i32 %106, %33, !dbg !30 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 4, i32 31), !dbg !25 + %115 = add i32 %113, %114, !dbg !28 + %116 = mul nuw nsw i32 %106, %.lobit1, !dbg !31 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !25 + %118 = add i32 %116, %117, !dbg !28 + %119 = trunc i32 %30 to i1, !dbg !37 + %120 = icmp sge i32 %109, %112, !dbg !37 + %121 = icmp ne i32 %109, %112, !dbg !37 + %122 = icmp sle i32 %115, %118, !dbg !37 + %123 = or i1 %121, %122, !dbg !37 + %124 = and i1 %120, %123, !dbg !37 + %.not4 = xor i1 %124, %119, !dbg !37 + %125 = xor i32 %109, %112, !dbg !38 + %126 = select i1 %.not4, i32 0, i32 %125, !dbg !39 + %127 = xor i32 %126, %103, !dbg !40 + %128 = xor i32 %115, %118, !dbg !41 + %129 = select i1 %.not4, i32 0, i32 %128, !dbg !42 + %130 = xor i32 %129, %106, !dbg !43 + %131 = mul nuw nsw i32 %127, %32, !dbg !24 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !25 + %133 = add i32 %131, %132, !dbg !28 + %134 = mul nuw nsw i32 %127, %.lobit, !dbg !29 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !25 + %136 = add i32 %134, %135, !dbg !28 + %137 = mul nuw nsw i32 %130, %32, !dbg !30 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 2, i32 31), !dbg !25 + %139 = add i32 %137, %138, !dbg !28 + %140 = mul nuw nsw i32 %130, %.lobit, !dbg !31 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !25 + %142 = add i32 %140, %141, !dbg !28 + %143 = icmp sge i32 %133, %136, !dbg !37 + %144 = icmp ne i32 %133, %136, !dbg !37 + %145 = icmp sle i32 %139, %142, !dbg !37 + %146 = or i1 %144, %145, !dbg !37 + %147 = and i1 %143, %146, !dbg !37 + %.not5 = xor i1 %147, %119, !dbg !37 + %148 = xor i32 %133, %136, !dbg !38 + %149 = select i1 %.not5, i32 0, i32 %148, !dbg !39 + %150 = xor i32 %149, %127, !dbg !40 + %151 = xor i32 %139, %142, !dbg !41 + %152 = select i1 %.not5, i32 0, i32 %151, !dbg !42 + %153 = xor i32 %152, %130, !dbg !43 + %154 = mul nuw nsw i32 %150, %31, !dbg !24 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !25 + %156 = add i32 %154, %155, !dbg !28 + %157 = mul nuw nsw i32 %150, %28, !dbg !29 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !25 + %159 = add i32 %157, %158, !dbg !28 + %160 = mul nuw nsw i32 %153, %31, !dbg !30 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !25 + %162 = add i32 %160, %161, !dbg !28 + %163 = mul nuw nsw i32 %153, %28, !dbg !31 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !25 + %165 = add i32 %163, %164, !dbg !28 + %166 = icmp sge i32 %156, %159, !dbg !37 + %167 = icmp ne i32 %156, %159, !dbg !37 + %168 = icmp sle i32 %162, %165, !dbg !37 + %169 = or i1 %167, %168, !dbg !37 + %170 = and i1 %166, %169, !dbg !37 + %.not6 = xor i1 %170, %119, !dbg !37 + %171 = xor i32 %156, %159, !dbg !38 + %172 = select i1 %.not6, i32 0, i32 %171, !dbg !39 + %173 = xor i32 %172, %150, !dbg !40 + %174 = xor i32 %162, %165, !dbg !41 + %175 = select i1 %.not6, i32 0, i32 %174, !dbg !42 + %176 = xor i32 %175, %153, !dbg !43 + %177 = mul nuw nsw i32 %173, %34, !dbg !24 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !25 + %179 = add i32 %177, %178, !dbg !28 + %180 = mul nuw nsw i32 %173, %.lobit2, !dbg !29 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !25 + %182 = add i32 %180, %181, !dbg !28 + %183 = mul nuw nsw i32 %176, %34, !dbg !30 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 8, i32 31), !dbg !25 + %185 = add i32 %183, %184, !dbg !28 + %186 = mul nuw nsw i32 %176, %.lobit2, !dbg !31 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 8, i32 31), !dbg !25 + %188 = add i32 %186, %187, !dbg !28 + %189 = icmp slt i32 %179, %182, !dbg !32 + %190 = icmp eq i32 %179, %182, !dbg !33 + %191 = icmp sgt i32 %185, %188, !dbg !34 + %192 = and i1 %190, %191, !dbg !35 + %193 = or i1 %189, %192, !dbg !36 + %194 = xor i32 %179, %182, !dbg !38 + %195 = select i1 %193, i32 %194, i32 0, !dbg !39 + %196 = xor i32 %195, %173, !dbg !40 + %197 = xor i32 %185, %188, !dbg !41 + %198 = select i1 %193, i32 %197, i32 0, !dbg !42 + %199 = xor i32 %198, %176, !dbg !43 + %200 = mul nuw nsw i32 %196, %33, !dbg !24 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !25 + %202 = add i32 %200, %201, !dbg !28 + %203 = mul nuw nsw i32 %196, %.lobit1, !dbg !29 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !25 + %205 = add i32 %203, %204, !dbg !28 + %206 = mul nuw nsw i32 %199, %33, !dbg !30 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 4, i32 31), !dbg !25 + %208 = add i32 %206, %207, !dbg !28 + %209 = mul nuw nsw i32 %199, %.lobit1, !dbg !31 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 4, i32 31), !dbg !25 + %211 = add i32 %209, %210, !dbg !28 + %212 = icmp slt i32 %202, %205, !dbg !32 + %213 = icmp eq i32 %202, %205, !dbg !33 + %214 = icmp sgt i32 %208, %211, !dbg !34 + %215 = and i1 %213, %214, !dbg !35 + %216 = or i1 %212, %215, !dbg !36 + %217 = xor i32 %202, %205, !dbg !38 + %218 = select i1 %216, i32 %217, i32 0, !dbg !39 + %219 = xor i32 %218, %196, !dbg !40 + %220 = xor i32 %208, %211, !dbg !41 + %221 = select i1 %216, i32 %220, i32 0, !dbg !42 + %222 = xor i32 %221, %199, !dbg !43 + %223 = mul nuw nsw i32 %219, %32, !dbg !24 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !25 + %225 = add i32 %223, %224, !dbg !28 + %226 = mul nuw nsw i32 %219, %.lobit, !dbg !29 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !25 + %228 = add i32 %226, %227, !dbg !28 + %229 = mul nuw nsw i32 %222, %32, !dbg !30 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 2, i32 31), !dbg !25 + %231 = add i32 %229, %230, !dbg !28 + %232 = mul nuw nsw i32 %222, %.lobit, !dbg !31 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 2, i32 31), !dbg !25 + %234 = add i32 %232, %233, !dbg !28 + %235 = icmp slt i32 %225, %228, !dbg !32 + %236 = icmp eq i32 %225, %228, !dbg !33 + %237 = icmp sgt i32 %231, %234, !dbg !34 + %238 = and i1 %236, %237, !dbg !35 + %239 = or i1 %235, %238, !dbg !36 + %240 = xor i32 %225, %228, !dbg !38 + %241 = select i1 %239, i32 %240, i32 0, !dbg !39 + %242 = xor i32 %241, %219, !dbg !40 + %243 = xor i32 %231, %234, !dbg !41 + %244 = select i1 %239, i32 %243, i32 0, !dbg !42 + %245 = xor i32 %244, %222, !dbg !43 + %246 = mul nuw nsw i32 %242, %31, !dbg !24 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !25 + %248 = add i32 %246, %247, !dbg !28 + %249 = mul nuw nsw i32 %242, %28, !dbg !29 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !25 + %251 = add i32 %249, %250, !dbg !28 + %252 = mul nuw nsw i32 %245, %31, !dbg !30 + %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !25 + %254 = add i32 %252, %253, !dbg !28 + %255 = mul nuw nsw i32 %245, %28, !dbg !31 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !25 + %257 = add i32 %255, %256, !dbg !28 + %258 = icmp slt i32 %248, %251, !dbg !32 + %259 = icmp eq i32 %248, %251, !dbg !33 + %260 = icmp sgt i32 %254, %257, !dbg !34 + %261 = and i1 %259, %260, !dbg !35 + %262 = or i1 %258, %261, !dbg !36 + %263 = xor i32 %254, %257, !dbg !41 + %264 = select i1 %262, i32 %263, i32 0, !dbg !42 + %265 = xor i32 %264, %245, !dbg !43 + %266 = icmp eq i64 %20, 16384, !dbg !44 + %267 = icmp eq i64 %21, 16384, !dbg !44 + %268 = zext i1 %266 to i32, !dbg !18 + %269 = select i1 %266, i32 %31, i32 0, !dbg !45 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !47 + %271 = add i32 %270, %269, !dbg !48 + %272 = select i1 %266, i32 %28, i32 0, !dbg !49 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !47 + %274 = add i32 %273, %272, !dbg !48 + %275 = icmp slt i32 %271, %274, !dbg !50 + %276 = icmp eq i32 %271, %274, !dbg !51 + %277 = and i1 %49, %276, !dbg !52 + %278 = or i1 %275, %277, !dbg !53 + %279 = xor i1 %278, %52, !dbg !54 + %280 = xor i32 %274, %271, !dbg !55 + %281 = select i1 %279, i32 %280, i32 0, !dbg !56 + %282 = xor i32 %281, %268, !dbg !57 + %283 = select i1 %279, i32 %57, i32 0, !dbg !58 + %284 = xor i32 %283, %15, !dbg !59 + %285 = mul nuw nsw i32 %282, %32, !dbg !45 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 2, i32 31), !dbg !47 + %287 = add i32 %285, %286, !dbg !48 + %288 = mul nuw nsw i32 %282, %.lobit, !dbg !49 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !47 + %290 = add i32 %288, %289, !dbg !48 + %291 = mul nuw nsw i32 %284, %32, !dbg !60 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 2, i32 31), !dbg !47 + %293 = add i32 %291, %292, !dbg !48 + %294 = mul nuw nsw i32 %284, %.lobit, !dbg !61 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !47 + %296 = add i32 %294, %295, !dbg !48 + %297 = icmp sge i32 %287, %290, !dbg !54 + %298 = icmp ne i32 %287, %290, !dbg !54 + %299 = icmp sle i32 %293, %296, !dbg !54 + %300 = or i1 %298, %299, !dbg !54 + %301 = and i1 %297, %300, !dbg !54 + %.not8 = xor i1 %301, %72, !dbg !54 + %302 = xor i32 %287, %290, !dbg !55 + %303 = select i1 %.not8, i32 0, i32 %302, !dbg !56 + %304 = xor i32 %303, %282, !dbg !57 + %305 = xor i32 %293, %296, !dbg !62 + %306 = select i1 %.not8, i32 0, i32 %305, !dbg !58 + %307 = xor i32 %306, %284, !dbg !59 + %308 = mul nuw nsw i32 %304, %31, !dbg !45 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !47 + %310 = add i32 %308, %309, !dbg !48 + %311 = mul nuw nsw i32 %304, %28, !dbg !49 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !47 + %313 = add i32 %311, %312, !dbg !48 + %314 = mul nuw nsw i32 %307, %31, !dbg !60 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !47 + %316 = add i32 %314, %315, !dbg !48 + %317 = mul nuw nsw i32 %307, %28, !dbg !61 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !47 + %319 = add i32 %317, %318, !dbg !48 + %320 = icmp sge i32 %310, %313, !dbg !54 + %321 = icmp ne i32 %310, %313, !dbg !54 + %322 = icmp sle i32 %316, %319, !dbg !54 + %323 = or i1 %321, %322, !dbg !54 + %324 = and i1 %320, %323, !dbg !54 + %.not9 = xor i1 %324, %72, !dbg !54 + %325 = xor i32 %310, %313, !dbg !55 + %326 = select i1 %.not9, i32 0, i32 %325, !dbg !56 + %327 = xor i32 %326, %304, !dbg !57 + %328 = xor i32 %316, %319, !dbg !62 + %329 = select i1 %.not9, i32 0, i32 %328, !dbg !58 + %330 = xor i32 %329, %307, !dbg !59 + %331 = mul nuw nsw i32 %327, %33, !dbg !45 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 4, i32 31), !dbg !47 + %333 = add i32 %331, %332, !dbg !48 + %334 = mul nuw nsw i32 %327, %.lobit1, !dbg !49 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 4, i32 31), !dbg !47 + %336 = add i32 %334, %335, !dbg !48 + %337 = mul nuw nsw i32 %330, %33, !dbg !60 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 4, i32 31), !dbg !47 + %339 = add i32 %337, %338, !dbg !48 + %340 = mul nuw nsw i32 %330, %.lobit1, !dbg !61 + %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 4, i32 31), !dbg !47 + %342 = add i32 %340, %341, !dbg !48 + %343 = icmp sge i32 %333, %336, !dbg !54 + %344 = icmp ne i32 %333, %336, !dbg !54 + %345 = icmp sle i32 %339, %342, !dbg !54 + %346 = or i1 %344, %345, !dbg !54 + %347 = and i1 %343, %346, !dbg !54 + %.not10 = xor i1 %347, %119, !dbg !54 + %348 = xor i32 %333, %336, !dbg !55 + %349 = select i1 %.not10, i32 0, i32 %348, !dbg !56 + %350 = xor i32 %349, %327, !dbg !57 + %351 = xor i32 %339, %342, !dbg !62 + %352 = select i1 %.not10, i32 0, i32 %351, !dbg !58 + %353 = xor i32 %352, %330, !dbg !59 + %354 = mul nuw nsw i32 %350, %32, !dbg !45 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 2, i32 31), !dbg !47 + %356 = add i32 %354, %355, !dbg !48 + %357 = mul nuw nsw i32 %350, %.lobit, !dbg !49 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !47 + %359 = add i32 %357, %358, !dbg !48 + %360 = mul nuw nsw i32 %353, %32, !dbg !60 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 2, i32 31), !dbg !47 + %362 = add i32 %360, %361, !dbg !48 + %363 = mul nuw nsw i32 %353, %.lobit, !dbg !61 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 2, i32 31), !dbg !47 + %365 = add i32 %363, %364, !dbg !48 + %366 = icmp sge i32 %356, %359, !dbg !54 + %367 = icmp ne i32 %356, %359, !dbg !54 + %368 = icmp sle i32 %362, %365, !dbg !54 + %369 = or i1 %367, %368, !dbg !54 + %370 = and i1 %366, %369, !dbg !54 + %.not11 = xor i1 %370, %119, !dbg !54 + %371 = xor i32 %356, %359, !dbg !55 + %372 = select i1 %.not11, i32 0, i32 %371, !dbg !56 + %373 = xor i32 %372, %350, !dbg !57 + %374 = xor i32 %362, %365, !dbg !62 + %375 = select i1 %.not11, i32 0, i32 %374, !dbg !58 + %376 = xor i32 %375, %353, !dbg !59 + %377 = mul nuw nsw i32 %373, %31, !dbg !45 + %378 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 1, i32 31), !dbg !47 + %379 = add i32 %377, %378, !dbg !48 + %380 = mul nuw nsw i32 %373, %28, !dbg !49 + %381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %380, i32 1, i32 31), !dbg !47 + %382 = add i32 %380, %381, !dbg !48 + %383 = mul nuw nsw i32 %376, %31, !dbg !60 + %384 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !47 + %385 = add i32 %383, %384, !dbg !48 + %386 = mul nuw nsw i32 %376, %28, !dbg !61 + %387 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 1, i32 31), !dbg !47 + %388 = add i32 %386, %387, !dbg !48 + %389 = icmp sge i32 %379, %382, !dbg !54 + %390 = icmp ne i32 %379, %382, !dbg !54 + %391 = icmp sle i32 %385, %388, !dbg !54 + %392 = or i1 %390, %391, !dbg !54 + %393 = and i1 %389, %392, !dbg !54 + %.not12 = xor i1 %393, %119, !dbg !54 + %394 = xor i32 %379, %382, !dbg !55 + %395 = select i1 %.not12, i32 0, i32 %394, !dbg !56 + %396 = xor i32 %395, %373, !dbg !57 + %397 = xor i32 %385, %388, !dbg !62 + %398 = select i1 %.not12, i32 0, i32 %397, !dbg !58 + %399 = xor i32 %398, %376, !dbg !59 + %400 = mul nuw nsw i32 %396, %34, !dbg !45 + %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 8, i32 31), !dbg !47 + %402 = add i32 %400, %401, !dbg !48 + %403 = mul nuw nsw i32 %396, %.lobit2, !dbg !49 + %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 8, i32 31), !dbg !47 + %405 = add i32 %403, %404, !dbg !48 + %406 = mul nuw nsw i32 %399, %34, !dbg !60 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 8, i32 31), !dbg !47 + %408 = add i32 %406, %407, !dbg !48 + %409 = mul nuw nsw i32 %399, %.lobit2, !dbg !61 + %410 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %409, i32 8, i32 31), !dbg !47 + %411 = add i32 %409, %410, !dbg !48 + %412 = icmp slt i32 %402, %405, !dbg !50 + %413 = icmp eq i32 %402, %405, !dbg !51 + %414 = icmp sgt i32 %408, %411, !dbg !63 + %415 = and i1 %413, %414, !dbg !52 + %416 = or i1 %412, %415, !dbg !53 + %417 = xor i32 %402, %405, !dbg !55 + %418 = select i1 %416, i32 %417, i32 0, !dbg !56 + %419 = xor i32 %418, %396, !dbg !57 + %420 = xor i32 %408, %411, !dbg !62 + %421 = select i1 %416, i32 %420, i32 0, !dbg !58 + %422 = xor i32 %421, %399, !dbg !59 + %423 = mul nuw nsw i32 %419, %33, !dbg !45 + %424 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %423, i32 4, i32 31), !dbg !47 + %425 = add i32 %423, %424, !dbg !48 + %426 = mul nuw nsw i32 %419, %.lobit1, !dbg !49 + %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 4, i32 31), !dbg !47 + %428 = add i32 %426, %427, !dbg !48 + %429 = mul nuw nsw i32 %422, %33, !dbg !60 + %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 4, i32 31), !dbg !47 + %431 = add i32 %429, %430, !dbg !48 + %432 = mul nuw nsw i32 %422, %.lobit1, !dbg !61 + %433 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %432, i32 4, i32 31), !dbg !47 + %434 = add i32 %432, %433, !dbg !48 + %435 = icmp slt i32 %425, %428, !dbg !50 + %436 = icmp eq i32 %425, %428, !dbg !51 + %437 = icmp sgt i32 %431, %434, !dbg !63 + %438 = and i1 %436, %437, !dbg !52 + %439 = or i1 %435, %438, !dbg !53 + %440 = xor i32 %425, %428, !dbg !55 + %441 = select i1 %439, i32 %440, i32 0, !dbg !56 + %442 = xor i32 %441, %419, !dbg !57 + %443 = xor i32 %431, %434, !dbg !62 + %444 = select i1 %439, i32 %443, i32 0, !dbg !58 + %445 = xor i32 %444, %422, !dbg !59 + %446 = mul nuw nsw i32 %442, %32, !dbg !45 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 2, i32 31), !dbg !47 + %448 = add i32 %446, %447, !dbg !48 + %449 = mul nuw nsw i32 %442, %.lobit, !dbg !49 + %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !47 + %451 = add i32 %449, %450, !dbg !48 + %452 = mul nuw nsw i32 %445, %32, !dbg !60 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !47 + %454 = add i32 %452, %453, !dbg !48 + %455 = mul nuw nsw i32 %445, %.lobit, !dbg !61 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 2, i32 31), !dbg !47 + %457 = add i32 %455, %456, !dbg !48 + %458 = icmp slt i32 %448, %451, !dbg !50 + %459 = icmp eq i32 %448, %451, !dbg !51 + %460 = icmp sgt i32 %454, %457, !dbg !63 + %461 = and i1 %459, %460, !dbg !52 + %462 = or i1 %458, %461, !dbg !53 + %463 = xor i32 %448, %451, !dbg !55 + %464 = select i1 %462, i32 %463, i32 0, !dbg !56 + %465 = xor i32 %464, %442, !dbg !57 + %466 = xor i32 %454, %457, !dbg !62 + %467 = select i1 %462, i32 %466, i32 0, !dbg !58 + %468 = xor i32 %467, %445, !dbg !59 + %469 = mul nuw nsw i32 %465, %31, !dbg !45 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !47 + %471 = add i32 %469, %470, !dbg !48 + %472 = mul nuw nsw i32 %465, %28, !dbg !49 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 1, i32 31), !dbg !47 + %474 = add i32 %472, %473, !dbg !48 + %475 = mul nuw nsw i32 %468, %31, !dbg !60 + %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 1, i32 31), !dbg !47 + %477 = add i32 %475, %476, !dbg !48 + %478 = mul nuw nsw i32 %468, %28, !dbg !61 + %479 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 1, i32 31), !dbg !47 + %480 = add i32 %478, %479, !dbg !48 + %481 = icmp slt i32 %471, %474, !dbg !50 + %482 = icmp eq i32 %471, %474, !dbg !51 + %483 = icmp sgt i32 %477, %480, !dbg !63 + %484 = and i1 %482, %483, !dbg !52 + %485 = or i1 %481, %484, !dbg !53 + %486 = xor i32 %477, %480, !dbg !62 + %487 = select i1 %485, i32 %486, i32 0, !dbg !58 + %488 = xor i32 %487, %468, !dbg !59 + %narrow = select i1 %13, i1 %23, i1 false, !dbg !64 + %489 = zext i1 %narrow to i64, !dbg !64 + %narrow13 = select i1 %13, i1 %25, i1 false, !dbg !64 + %490 = zext i1 %narrow13 to i64, !dbg !64 + %491 = zext i1 %narrow to i32, !dbg !65 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 8, i32 31), !dbg !65 + %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %494 = insertelement <2 x i32> poison, i32 %492, i64 0, !dbg !65 + %495 = insertelement <2 x i32> %494, i32 %493, i64 1, !dbg !65 + %496 = bitcast <2 x i32> %495 to i64, !dbg !65 + %497 = add i64 %496, %489, !dbg !67 + %extelt.offset = lshr i64 %497, 32, !dbg !65 + %498 = trunc nuw i64 %extelt.offset to i32, !dbg !65 + %499 = trunc i64 %497 to i32, !dbg !65 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 4, i32 31), !dbg !65 + %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !65 + %502 = insertelement <2 x i32> poison, i32 %500, i64 0, !dbg !65 + %503 = insertelement <2 x i32> %502, i32 %501, i64 1, !dbg !65 + %504 = bitcast <2 x i32> %503 to i64, !dbg !65 + %505 = add i64 %497, %504, !dbg !67 + %extelt.offset14 = lshr i64 %505, 32, !dbg !65 + %506 = trunc nuw i64 %extelt.offset14 to i32, !dbg !65 + %507 = trunc i64 %505 to i32, !dbg !65 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 2, i32 31), !dbg !65 + %509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 2, i32 31), !dbg !65 + %510 = insertelement <2 x i32> poison, i32 %508, i64 0, !dbg !65 + %511 = insertelement <2 x i32> %510, i32 %509, i64 1, !dbg !65 + %512 = bitcast <2 x i32> %511 to i64, !dbg !65 + %513 = add i64 %505, %512, !dbg !67 + %extelt.offset15 = lshr i64 %513, 32, !dbg !65 + %514 = trunc nuw i64 %extelt.offset15 to i32, !dbg !65 + %515 = trunc i64 %513 to i32, !dbg !65 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !65 + %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %514, i32 1, i32 31), !dbg !65 + %518 = insertelement <2 x i32> poison, i32 %516, i64 0, !dbg !65 + %519 = insertelement <2 x i32> %518, i32 %517, i64 1, !dbg !65 + %520 = bitcast <2 x i32> %519 to i64, !dbg !65 + %521 = add i64 %513, %520, !dbg !67 + %522 = zext i1 %narrow13 to i32, !dbg !65 + %523 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %522, i32 8, i32 31), !dbg !65 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %525 = insertelement <2 x i32> poison, i32 %523, i64 0, !dbg !65 + %526 = insertelement <2 x i32> %525, i32 %524, i64 1, !dbg !65 + %527 = bitcast <2 x i32> %526 to i64, !dbg !65 + %528 = add i64 %527, %490, !dbg !67 + %extelt.offset17 = lshr i64 %528, 32, !dbg !65 + %529 = trunc nuw i64 %extelt.offset17 to i32, !dbg !65 + %530 = trunc i64 %528 to i32, !dbg !65 + %531 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %530, i32 4, i32 31), !dbg !65 + %532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 4, i32 31), !dbg !65 + %533 = insertelement <2 x i32> poison, i32 %531, i64 0, !dbg !65 + %534 = insertelement <2 x i32> %533, i32 %532, i64 1, !dbg !65 + %535 = bitcast <2 x i32> %534 to i64, !dbg !65 + %536 = add i64 %528, %535, !dbg !67 + %extelt.offset18 = lshr i64 %536, 32, !dbg !65 + %537 = trunc nuw i64 %extelt.offset18 to i32, !dbg !65 + %538 = trunc i64 %536 to i32, !dbg !65 + %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 2, i32 31), !dbg !65 + %540 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %537, i32 2, i32 31), !dbg !65 + %541 = insertelement <2 x i32> poison, i32 %539, i64 0, !dbg !65 + %542 = insertelement <2 x i32> %541, i32 %540, i64 1, !dbg !65 + %543 = bitcast <2 x i32> %542 to i64, !dbg !65 + %544 = add i64 %536, %543, !dbg !67 + %extelt.offset19 = lshr i64 %544, 32, !dbg !65 + %545 = trunc nuw i64 %extelt.offset19 to i32, !dbg !65 + %546 = trunc i64 %544 to i32, !dbg !65 + %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 1, i32 31), !dbg !65 + %548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 1, i32 31), !dbg !65 + %narrow20 = select i1 %13, i1 %266, i1 false, !dbg !68 + %549 = zext i1 %narrow20 to i64, !dbg !68 + %narrow21 = select i1 %13, i1 %267, i1 false, !dbg !68 + %550 = zext i1 %narrow21 to i64, !dbg !68 + %551 = zext i1 %narrow20 to i32, !dbg !69 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %551, i32 8, i32 31), !dbg !69 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %554 = insertelement <2 x i32> poison, i32 %552, i64 0, !dbg !69 + %555 = insertelement <2 x i32> %554, i32 %553, i64 1, !dbg !69 + %556 = bitcast <2 x i32> %555 to i64, !dbg !69 + %557 = add i64 %556, %549, !dbg !71 + %extelt.offset23 = lshr i64 %557, 32, !dbg !69 + %558 = trunc nuw i64 %extelt.offset23 to i32, !dbg !69 + %559 = trunc i64 %557 to i32, !dbg !69 + %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !69 + %561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %558, i32 4, i32 31), !dbg !69 + %562 = insertelement <2 x i32> poison, i32 %560, i64 0, !dbg !69 + %563 = insertelement <2 x i32> %562, i32 %561, i64 1, !dbg !69 + %564 = bitcast <2 x i32> %563 to i64, !dbg !69 + %565 = add i64 %557, %564, !dbg !71 + %extelt.offset24 = lshr i64 %565, 32, !dbg !69 + %566 = trunc nuw i64 %extelt.offset24 to i32, !dbg !69 + %567 = trunc i64 %565 to i32, !dbg !69 + %568 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %567, i32 2, i32 31), !dbg !69 + %569 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %566, i32 2, i32 31), !dbg !69 + %570 = insertelement <2 x i32> poison, i32 %568, i64 0, !dbg !69 + %571 = insertelement <2 x i32> %570, i32 %569, i64 1, !dbg !69 + %572 = bitcast <2 x i32> %571 to i64, !dbg !69 + %573 = add i64 %565, %572, !dbg !71 + %extelt.offset25 = lshr i64 %573, 32, !dbg !69 + %574 = trunc nuw i64 %extelt.offset25 to i32, !dbg !69 + %575 = trunc i64 %573 to i32, !dbg !69 + %576 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %575, i32 1, i32 31), !dbg !69 + %577 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %574, i32 1, i32 31), !dbg !69 + %578 = zext i1 %narrow21 to i32, !dbg !69 + %579 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %578, i32 8, i32 31), !dbg !69 + %580 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %581 = insertelement <2 x i32> poison, i32 %579, i64 0, !dbg !69 + %582 = insertelement <2 x i32> %581, i32 %580, i64 1, !dbg !69 + %583 = bitcast <2 x i32> %582 to i64, !dbg !69 + %584 = add i64 %583, %550, !dbg !71 + %extelt.offset27 = lshr i64 %584, 32, !dbg !69 + %585 = trunc nuw i64 %extelt.offset27 to i32, !dbg !69 + %586 = trunc i64 %584 to i32, !dbg !69 + %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 4, i32 31), !dbg !69 + %588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 4, i32 31), !dbg !69 + %589 = insertelement <2 x i32> poison, i32 %587, i64 0, !dbg !69 + %590 = insertelement <2 x i32> %589, i32 %588, i64 1, !dbg !69 + %591 = bitcast <2 x i32> %590 to i64, !dbg !69 + %592 = add i64 %584, %591, !dbg !71 + %extelt.offset28 = lshr i64 %592, 32, !dbg !69 + %593 = trunc nuw i64 %extelt.offset28 to i32, !dbg !69 + %594 = trunc i64 %592 to i32, !dbg !69 + %595 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %594, i32 2, i32 31), !dbg !69 + %596 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %593, i32 2, i32 31), !dbg !69 + %597 = insertelement <2 x i32> poison, i32 %595, i64 0, !dbg !69 + %598 = insertelement <2 x i32> %597, i32 %596, i64 1, !dbg !69 + %599 = bitcast <2 x i32> %598 to i64, !dbg !69 + %600 = add i64 %592, %599, !dbg !71 + %extelt.offset29 = lshr i64 %600, 32, !dbg !69 + %601 = trunc nuw i64 %extelt.offset29 to i32, !dbg !69 + %602 = trunc i64 %600 to i32, !dbg !69 + %603 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %602, i32 1, i32 31), !dbg !69 + %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !69 + %605 = trunc i64 %521 to i32, !dbg !72 + %606 = icmp slt i32 %15, %605, !dbg !73 + %607 = select i1 %606, i32 %265, i32 16, !dbg !74 + %608 = add i32 %607, 17, !dbg !75 + %609 = icmp slt i32 %607, 0, !dbg !76 + %610 = select i1 %609, i32 %608, i32 %607, !dbg !77 + %611 = icmp ult i32 %610, 17, !dbg !78 + %612 = icmp samesign ugt i32 %12, 31, !dbg !18 + %613 = or i1 %612, %611, !dbg !79 + br i1 %613, label %615, label %614, !dbg !80 + +614: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !80 + unreachable, !dbg !80 + +615: ; preds = %11 + %616 = insertelement <2 x i32> poison, i32 %576, i64 0, !dbg !69 + %617 = insertelement <2 x i32> %616, i32 %577, i64 1, !dbg !69 + %618 = bitcast <2 x i32> %617 to i64, !dbg !69 + %619 = add i64 %573, %618, !dbg !71 + %620 = trunc i64 %619 to i32, !dbg !81 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !80 + %621 = icmp slt i32 %15, %620, !dbg !82 + %622 = select i1 %621, i32 %488, i32 16, !dbg !83 + %623 = add i32 %622, 17, !dbg !84 + %624 = icmp slt i32 %622, 0, !dbg !85 + %625 = select i1 %624, i32 %623, i32 %622, !dbg !86 + %626 = icmp ult i32 %625, 17, !dbg !87 + %627 = or i1 %612, %626, !dbg !88 + br i1 %627, label %629, label %628, !dbg !89 + +628: ; preds = %615 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !89 + unreachable, !dbg !89 + +629: ; preds = %615 + %630 = insertelement <2 x i32> poison, i32 %603, i64 0, !dbg !69 + %631 = insertelement <2 x i32> %630, i32 %604, i64 1, !dbg !69 + %632 = bitcast <2 x i32> %631 to i64, !dbg !69 + %633 = add i64 %600, %632, !dbg !71 + %634 = trunc i64 %633 to i32, !dbg !81 + %635 = insertelement <2 x i32> poison, i32 %547, i64 0, !dbg !65 + %636 = insertelement <2 x i32> %635, i32 %548, i64 1, !dbg !65 + %637 = bitcast <2 x i32> %636 to i64, !dbg !65 + %638 = add i64 %544, %637, !dbg !67 + %639 = trunc i64 %638 to i32, !dbg !72 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !89 + %640 = zext nneg i32 %12 to i64, !dbg !90 + %641 = getelementptr i32, ptr addrspace(1) %1, i64 %640, !dbg !90 + %642 = and i32 %14, 63, !dbg !91 + %643 = icmp eq i32 %642, 0, !dbg !91 + %644 = and i1 %13, %643, !dbg !91 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %639, ptr addrspace(1) %641, i1 %644) #5, !dbg !91 + %645 = getelementptr i32, ptr addrspace(1) %2, i64 %640, !dbg !92 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %634, ptr addrspace(1) %645, i1 %644) #5, !dbg !93 + %646 = getelementptr i32, ptr addrspace(1) %3, i64 %18, !dbg !94 + %647 = and i32 %14, 48, !dbg !95 + %648 = icmp eq i32 %647, 0, !dbg !95 + %649 = and i1 %13, %648, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %265, ptr addrspace(1) %646, i1 %649) #5, !dbg !95 + %650 = mul i32 %12, 17, !dbg !96 + %651 = add i32 %610, %650, !dbg !97 + %652 = sext i32 %651 to i64, !dbg !98 + %653 = getelementptr i32, ptr addrspace(1) %4, i64 %652, !dbg !98 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %653, i1 %649) #5, !dbg !99 + %654 = getelementptr i32, ptr addrspace(1) %5, i64 %18, !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %488, ptr addrspace(1) %654, i1 %649) #5, !dbg !101 + %655 = add i32 %625, %650, !dbg !102 + %656 = sext i32 %655 to i64, !dbg !103 + %657 = getelementptr i32, ptr addrspace(1) %6, i64 %656, !dbg !103 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %657, i1 %649) #5, !dbg !104 + ret void, !dbg !105 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 26, column: 21, scope: !9) +!12 = !DILocation(line: 27, column: 38, scope: !9) +!13 = !DILocation(line: 34, column: 40, scope: !9) +!14 = !DILocation(line: 34, column: 37, scope: !9) +!15 = !DILocation(line: 34, column: 30, scope: !9) +!16 = !DILocation(line: 34, column: 45, scope: !9) +!17 = !DILocation(line: 39, column: 18, scope: !9) +!18 = !DILocation(line: 0, scope: !9) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !9, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 46, column: 71, scope: !9) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !9, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 47, column: 20, scope: !9) +!45 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !46) +!46 = !DILocation(line: 51, column: 71, scope: !9) +!47 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !46) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !46) +!49 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !46) +!50 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !46) +!51 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !46) +!52 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !46) +!53 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !46) +!54 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !46) +!55 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !46) +!56 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !46) +!57 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !46) +!58 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !46) +!59 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !46) +!60 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !46) +!61 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !46) +!62 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !46) +!63 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !46) +!64 = !DILocation(line: 54, column: 35, scope: !9) +!65 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !66) +!66 = !DILocation(line: 55, column: 26, scope: !9) +!67 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !66) +!68 = !DILocation(line: 58, column: 35, scope: !9) +!69 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !70) +!70 = !DILocation(line: 59, column: 26, scope: !9) +!71 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !70) +!72 = !DILocation(line: 60, column: 21, scope: !9) +!73 = !DILocation(line: 64, column: 19, scope: !9) +!74 = !DILocation(line: 66, column: 35, scope: !9) +!75 = !DILocation(line: 68, column: 20, scope: !9) +!76 = !DILocation(line: 69, column: 20, scope: !9) +!77 = !DILocation(line: 70, column: 35, scope: !9) +!78 = !DILocation(line: 71, column: 38, scope: !9) +!79 = !DILocation(line: 71, column: 53, scope: !9) +!80 = !DILocation(line: 71, column: 63, scope: !9) +!81 = !DILocation(line: 61, column: 21, scope: !9) +!82 = !DILocation(line: 75, column: 19, scope: !9) +!83 = !DILocation(line: 76, column: 35, scope: !9) +!84 = !DILocation(line: 77, column: 20, scope: !9) +!85 = !DILocation(line: 78, column: 20, scope: !9) +!86 = !DILocation(line: 79, column: 35, scope: !9) +!87 = !DILocation(line: 80, column: 38, scope: !9) +!88 = !DILocation(line: 80, column: 53, scope: !9) +!89 = !DILocation(line: 80, column: 63, scope: !9) +!90 = !DILocation(line: 81, column: 25, scope: !9) +!91 = !DILocation(line: 81, column: 37, scope: !9) +!92 = !DILocation(line: 82, column: 25, scope: !9) +!93 = !DILocation(line: 82, column: 37, scope: !9) +!94 = !DILocation(line: 83, column: 25, scope: !9) +!95 = !DILocation(line: 83, column: 47, scope: !9) +!96 = !DILocation(line: 84, column: 52, scope: !9) +!97 = !DILocation(line: 84, column: 49, scope: !9) +!98 = !DILocation(line: 84, column: 25, scope: !9) +!99 = !DILocation(line: 84, column: 85, scope: !9) +!100 = !DILocation(line: 85, column: 25, scope: !9) +!101 = !DILocation(line: 85, column: 47, scope: !9) +!102 = !DILocation(line: 86, column: 49, scope: !9) +!103 = !DILocation(line: 86, column: 25, scope: !9) +!104 = !DILocation(line: 86, column: 85, scope: !9) +!105 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e8385d8e52e1f654652a7dca2c3b8edebb357221 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1673 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 55, 122, 47, 99, 55, 122, 50, 106, 98, 106, 117, 98, 51, 97, 117, 112, 103, 110, 101, 99, 104, 111, 108, 54, 53, 118, 107, 118, 105, 53, 114, 117, 119, 112, 121, 108, 122, 111, 115, 100, 98, 113, 118, 115, 99, 100, 121, 120, 109, 114, 101, 98, 51, 106, 121, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<140>; + .reg .b32 %r<453>; + .reg .b64 %rd<107>; + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:24:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:26:21 + setp.lt.u32 %p2, %r1, 32; + .loc 1 27 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:27:38 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 15; + .loc 1 34 40 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:40 + shl.b32 %r14, %r1, 4; + .loc 1 34 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:37 + or.b32 %r15, %r3, %r14; + .loc 1 34 30 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:30 + mad.wide.s32 %rd12, %r15, 8, %rd15; + .loc 1 34 45 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + @%p2 ld.global.b64 { %rd11 }, [ %rd12 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd13, 0x0; + @%p2 ld.global.b64 { %rd13 }, [ %rd12 + 0 ]; + // end inline asm + .loc 1 39 18 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:39:18 + add.s64 %rd16, %rd11, -1; + setp.lt.u64 %p3, %rd16, 16383; + add.s64 %rd17, %rd13, -1; + setp.lt.u64 %p4, %rd17, 16383; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r16, 1, 0, %p3; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shr.u32 %r17, %r2, 1; + bfe.u32 %r18, %r2, 1, 1; + and.b32 %r19, %r2, 1; + shr.u32 %r20, %r2, 2; + bfe.u32 %r21, %r2, 2, 1; + shr.u32 %r22, %r2, 3; + bfe.u32 %r23, %r2, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r24, %r19, 1; + xor.b32 %r25, %r18, 1; + xor.b32 %r26, %r21, 1; + xor.b32 %r27, %r23, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r28, %r24, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r29, %r28, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r30, %r28, %r29; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r31, %r19, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r33, %r31, %r32; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r34, %r24, %r3; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r35, %r34, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r36, %r35, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r37, %r3, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r38, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r39, %r38, %r37; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p5, %r30, %r33; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p6, %r30, %r33; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p7, %r36, %r39; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p8, %p6, %p7; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p9, %p5, %p8; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r40, %r17, 1; + setp.ne.b32 %p10, %r40, 0; + xor.pred %p11, %p9, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r41, %r30, %r33; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r42, %r41, 0, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r43, %r42, %r16; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r44, %r39, %r36; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r45, %r44, 0, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r46, %r45, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r47, %r43, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r49, %r47, %r48; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r50, %r43, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r51, %r50, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r52, %r50, %r51; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r53, %r46, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r53, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r55, %r53, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r56, %r46, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r58, %r56, %r57; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r59, %r20, 1; + setp.ne.b32 %p12, %r59, 0; + setp.ge.s32 %p13, %r49, %r52; + setp.ne.b32 %p14, %r49, %r52; + setp.le.s32 %p15, %r55, %r58; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r60, %r49, %r52; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r61, 0, %r60, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r62, %r61, %r43; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r63, %r55, %r58; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r64, 0, %r63, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r65, %r64, %r46; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r66, %r62, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r68, %r66, %r67; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r69, %r62, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r71, %r69, %r70; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r72, %r65, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r74, %r72, %r73; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r75, %r65, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r77, %r75, %r76; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p19, %r68, %r71; + setp.ne.b32 %p20, %r68, %r71; + setp.le.s32 %p21, %r74, %r77; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r78, %r68, %r71; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r79, 0, %r78, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r80, %r79, %r62; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r81, %r74, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r82, 0, %r81, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r83, %r82, %r65; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r84, %r80, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r86, %r84, %r85; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r87, %r80, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r88, %r87, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r89, %r87, %r88; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r90, %r83, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r91, %r90, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r92, %r90, %r91; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r93, %r83, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r94, %r93, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r95, %r93, %r94; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.b32 %r96, %r22, 1; + setp.ne.b32 %p25, %r96, 0; + setp.ge.s32 %p26, %r86, %r89; + setp.ne.b32 %p27, %r86, %r89; + setp.le.s32 %p28, %r92, %r95; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r97, %r86, %r89; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r98, 0, %r97, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r99, %r98, %r80; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r100, %r92, %r95; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r101, 0, %r100, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r102, %r101, %r83; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r103, %r99, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r105, %r103, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r106, %r99, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r107, %r106, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r108, %r106, %r107; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r109, %r102, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r109, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r111, %r109, %r110; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r112, %r102, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r113, %r112, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r114, %r112, %r113; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p32, %r105, %r108; + setp.ne.b32 %p33, %r105, %r108; + setp.le.s32 %p34, %r111, %r114; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r115, %r105, %r108; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r116, 0, %r115, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r117, %r116, %r99; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r118, %r111, %r114; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r119, 0, %r118, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r120, %r119, %r102; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r121, %r117, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r123, %r121, %r122; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r124, %r117, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r125, %r124, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r126, %r124, %r125; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r127, %r120, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r129, %r127, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r130, %r120, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r132, %r130, %r131; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.ge.s32 %p38, %r123, %r126; + setp.ne.b32 %p39, %r123, %r126; + setp.le.s32 %p40, %r129, %r132; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r133, %r123, %r126; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r134, 0, %r133, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r135, %r134, %r117; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r136, %r129, %r132; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r137, 0, %r136, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r138, %r137, %r120; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r139, %r135, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r141, %r139, %r140; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r142, %r135, %r23; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r143, %r142, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r144, %r142, %r143; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r145, %r138, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r146, %r145, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r147, %r145, %r146; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r148, %r138, %r23; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r149, %r148, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r150, %r148, %r149; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p44, %r141, %r144; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p45, %r141, %r144; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p46, %r147, %r150; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r151, %r141, %r144; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r152, %r151, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r153, %r152, %r135; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r154, %r147, %r150; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r155, %r154, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r156, %r155, %r138; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r157, %r153, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r158, %r157, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r159, %r157, %r158; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r160, %r153, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r161, %r160, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r162, %r160, %r161; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r163, %r156, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r164, %r163, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r165, %r163, %r164; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r166, %r156, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r168, %r166, %r167; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p49, %r159, %r162; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p50, %r159, %r162; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p51, %r165, %r168; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r169, %r159, %r162; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r170, %r169, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r171, %r170, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r172, %r165, %r168; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r173, %r172, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r174, %r173, %r156; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r175, %r171, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r176, %r175, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r177, %r175, %r176; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r178, %r171, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r180, %r178, %r179; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r181, %r174, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r182, %r181, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r183, %r181, %r182; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r184, %r174, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r185, %r184, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r186, %r184, %r185; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p54, %r177, %r180; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p55, %r177, %r180; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p56, %r183, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r187, %r177, %r180; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r188, %r187, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r189, %r188, %r171; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r190, %r183, %r186; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r191, %r190, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r192, %r191, %r174; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r193, %r189, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r194, %r193, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r195, %r193, %r194; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r196, %r189, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r197, %r196, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r198, %r196, %r197; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r199, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r200, %r199, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r201, %r199, %r200; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + mul.lo.s32 %r202, %r192, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + shfl.sync.bfly.b32 %r203, %r202, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + add.s32 %r204, %r202, %r203; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.lt.s32 %p59, %r195, %r198; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.eq.b32 %p60, %r195, %r198; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + setp.gt.s32 %p61, %r201, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r205, %r201, %r204; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + selp.b32 %r206, %r205, 0, %p61; + selp.b32 %r207, %r206, 0, %p60; + selp.b32 %r208, %r205, %r207, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:46:71 ] + xor.b32 %r4, %r208, %r192; +$L__tmp2: + .loc 1 47 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:47:20 + setp.eq.b64 %p62, %rd11, 16384; + setp.eq.b64 %p63, %rd13, 16384; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + selp.b32 %r209, 1, 0, %p62; +$L__tmp3: + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r210, %r24, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r212, %r211, %r210; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r213, %r19, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r214, %r213, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r215, %r214, %r213; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p64, %r212, %r215; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p65, %r212, %r215; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p66, %p7, %p65; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p67, %p64, %p66; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.pred %p68, %p67, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r216, %r215, %r212; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r217, %r216, 0, %p68; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r218, %r217, %r209; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r219, %r44, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r220, %r219, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r221, %r218, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r222, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r223, %r221, %r222; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r224, %r218, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r226, %r224, %r225; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r227, %r220, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r228, %r227, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r229, %r227, %r228; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r230, %r220, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r232, %r230, %r231; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p69, %r223, %r226; + setp.ne.b32 %p70, %r223, %r226; + setp.le.s32 %p71, %r229, %r232; + or.pred %p72, %p70, %p71; + and.pred %p73, %p69, %p72; + xor.pred %p74, %p73, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r233, %r223, %r226; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r234, 0, %r233, %p74; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r235, %r234, %r218; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r236, %r229, %r232; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r237, 0, %r236, %p74; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r238, %r237, %r220; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r239, %r235, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r240, %r239, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r241, %r239, %r240; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r242, %r235, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r244, %r242, %r243; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r245, %r238, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r246, %r245, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r247, %r245, %r246; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r248, %r238, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r249, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r250, %r248, %r249; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p75, %r241, %r244; + setp.ne.b32 %p76, %r241, %r244; + setp.le.s32 %p77, %r247, %r250; + or.pred %p78, %p76, %p77; + and.pred %p79, %p75, %p78; + xor.pred %p80, %p79, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r251, %r241, %r244; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r252, 0, %r251, %p80; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r253, %r252, %r235; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r254, %r247, %r250; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r255, 0, %r254, %p80; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r256, %r255, %r238; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r257, %r253, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r258, %r257, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r259, %r257, %r258; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r260, %r253, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r261, %r260, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r262, %r260, %r261; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r263, %r256, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r264, %r263, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r265, %r263, %r264; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r266, %r256, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r267, %r266, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r268, %r266, %r267; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p81, %r259, %r262; + setp.ne.b32 %p82, %r259, %r262; + setp.le.s32 %p83, %r265, %r268; + or.pred %p84, %p82, %p83; + and.pred %p85, %p81, %p84; + xor.pred %p86, %p85, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r269, %r259, %r262; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r270, 0, %r269, %p86; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r271, %r270, %r253; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r272, %r265, %r268; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r273, 0, %r272, %p86; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r274, %r273, %r256; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r275, %r271, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r276, %r275, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r277, %r275, %r276; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r278, %r271, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r280, %r278, %r279; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r281, %r274, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r282, %r281, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r283, %r281, %r282; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r284, %r274, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r285, %r284, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r286, %r284, %r285; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p87, %r277, %r280; + setp.ne.b32 %p88, %r277, %r280; + setp.le.s32 %p89, %r283, %r286; + or.pred %p90, %p88, %p89; + and.pred %p91, %p87, %p90; + xor.pred %p92, %p91, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r287, %r277, %r280; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r288, 0, %r287, %p92; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r289, %r288, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r290, %r283, %r286; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r291, 0, %r290, %p92; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r292, %r291, %r274; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r293, %r289, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r294, %r293, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r295, %r293, %r294; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r296, %r289, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r297, %r296, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r298, %r296, %r297; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r299, %r292, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r300, %r299, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r301, %r299, %r300; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r302, %r292, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r303, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r304, %r302, %r303; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.ge.s32 %p93, %r295, %r298; + setp.ne.b32 %p94, %r295, %r298; + setp.le.s32 %p95, %r301, %r304; + or.pred %p96, %p94, %p95; + and.pred %p97, %p93, %p96; + xor.pred %p98, %p97, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r305, %r295, %r298; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r306, 0, %r305, %p98; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r307, %r306, %r289; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r308, %r301, %r304; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r309, 0, %r308, %p98; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r310, %r309, %r292; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r311, %r307, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r312, %r311, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r313, %r311, %r312; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r314, %r307, %r23; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r315, %r314, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r316, %r314, %r315; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r317, %r310, %r27; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r318, %r317, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r319, %r317, %r318; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r320, %r310, %r23; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r321, %r320, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r322, %r320, %r321; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p99, %r313, %r316; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p100, %r313, %r316; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p101, %r319, %r322; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p102, %p100, %p101; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p103, %p99, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r323, %r313, %r316; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r324, %r323, 0, %p103; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r325, %r324, %r307; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r326, %r319, %r322; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r327, %r326, 0, %p103; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r328, %r327, %r310; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r329, %r325, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r330, %r329, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r331, %r329, %r330; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r332, %r325, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r332, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r334, %r332, %r333; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r335, %r328, %r26; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r336, %r335, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r337, %r335, %r336; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r338, %r328, %r21; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r338, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r340, %r338, %r339; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p104, %r331, %r334; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p105, %r331, %r334; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p106, %r337, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p107, %p105, %p106; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p108, %p104, %p107; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r341, %r331, %r334; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r342, %r341, 0, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r343, %r342, %r325; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r344, %r337, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r345, %r344, 0, %p108; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r346, %r345, %r328; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r347, %r343, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r348, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r349, %r347, %r348; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r350, %r343, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r350, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r352, %r350, %r351; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r353, %r346, %r25; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r354, %r353, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r355, %r353, %r354; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r356, %r346, %r18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r357, %r356, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + add.s32 %r358, %r356, %r357; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.lt.s32 %p109, %r349, %r352; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.eq.b32 %p110, %r349, %r352; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + setp.gt.s32 %p111, %r355, %r358; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + and.pred %p112, %p110, %p111; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + or.pred %p113, %p109, %p112; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r359, %r349, %r352; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r360, %r359, 0, %p113; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r361, %r360, %r343; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r362, %r355, %r358; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + selp.b32 %r363, %r362, 0, %p113; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + xor.b32 %r364, %r363, %r346; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r365, %r361, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r366, %r365, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r368, %r361, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r368, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r371, %r364, %r24; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r372, %r371, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + mul.lo.s32 %r374, %r364, %r19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r374, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:54:35 + and.pred %p117, %p2, %p3; + selp.b64 %rd18, 1, 0, %p117; + and.pred %p118, %p2, %p4; + selp.b64 %rd19, 1, 0, %p118; +$L__tmp5: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + selp.b32 %r381, 1, 0, %p117; + shfl.sync.bfly.b32 %r382, %r381, 8, 31, -1; + mov.b32 %r383, 0; + shfl.sync.bfly.b32 %r384, %r383, 8, 31, -1; + cvt.u64.u32 %rd20, %r382; + cvt.u64.u32 %rd21, %r384; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd24, %rd23, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r385}, %rd24; + cvt.u32.u64 %r386, %rd24; + shfl.sync.bfly.b32 %r387, %r386, 4, 31, -1; + shfl.sync.bfly.b32 %r388, %r385, 4, 31, -1; + cvt.u64.u32 %rd25, %r387; + cvt.u64.u32 %rd26, %r388; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r389}, %rd29; + cvt.u32.u64 %r390, %rd29; + shfl.sync.bfly.b32 %r391, %r390, 2, 31, -1; + shfl.sync.bfly.b32 %r392, %r389, 2, 31, -1; + cvt.u64.u32 %rd30, %r391; + cvt.u64.u32 %rd31, %r392; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r393}, %rd34; + cvt.u32.u64 %r394, %rd34; + shfl.sync.bfly.b32 %r395, %r394, 1, 31, -1; + shfl.sync.bfly.b32 %r396, %r393, 1, 31, -1; + cvt.u64.u32 %rd35, %r395; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd36, %rd34, %rd35; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + selp.b32 %r397, 1, 0, %p118; + shfl.sync.bfly.b32 %r398, %r397, 8, 31, -1; + shfl.sync.bfly.b32 %r399, %r383, 8, 31, -1; + cvt.u64.u32 %rd37, %r398; + cvt.u64.u32 %rd38, %r399; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd41, %rd40, %rd19; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r400}, %rd41; + cvt.u32.u64 %r401, %rd41; + shfl.sync.bfly.b32 %r402, %r401, 4, 31, -1; + shfl.sync.bfly.b32 %r403, %r400, 4, 31, -1; + cvt.u64.u32 %rd42, %r402; + cvt.u64.u32 %rd43, %r403; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd46, %rd41, %rd45; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r404}, %rd46; + cvt.u32.u64 %r405, %rd46; + shfl.sync.bfly.b32 %r406, %r405, 2, 31, -1; + shfl.sync.bfly.b32 %r407, %r404, 2, 31, -1; + cvt.u64.u32 %rd47, %r406; + cvt.u64.u32 %rd48, %r407; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd2, %rd46, %rd50; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + mov.b64 {_, %r408}, %rd2; + cvt.u32.u64 %r409, %rd2; + shfl.sync.bfly.b32 %r6, %r409, 1, 31, -1; + shfl.sync.bfly.b32 %r7, %r408, 1, 31, -1; +$L__tmp6: + .loc 1 58 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:58:35 + and.pred %p119, %p2, %p62; + selp.b64 %rd51, 1, 0, %p119; + and.pred %p120, %p2, %p63; + selp.b64 %rd52, 1, 0, %p120; +$L__tmp7: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + selp.b32 %r410, 1, 0, %p119; + shfl.sync.bfly.b32 %r411, %r410, 8, 31, -1; + shfl.sync.bfly.b32 %r412, %r383, 8, 31, -1; + cvt.u64.u32 %rd53, %r411; + cvt.u64.u32 %rd54, %r412; + shl.b64 %rd55, %rd54, 32; + or.b64 %rd56, %rd53, %rd55; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd57, %rd56, %rd51; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r413}, %rd57; + cvt.u32.u64 %r414, %rd57; + shfl.sync.bfly.b32 %r415, %r414, 4, 31, -1; + shfl.sync.bfly.b32 %r416, %r413, 4, 31, -1; + cvt.u64.u32 %rd58, %r415; + cvt.u64.u32 %rd59, %r416; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd62, %rd57, %rd61; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r417}, %rd62; + cvt.u32.u64 %r418, %rd62; + shfl.sync.bfly.b32 %r419, %r418, 2, 31, -1; + shfl.sync.bfly.b32 %r420, %r417, 2, 31, -1; + cvt.u64.u32 %rd63, %r419; + cvt.u64.u32 %rd64, %r420; + shl.b64 %rd65, %rd64, 32; + or.b64 %rd66, %rd63, %rd65; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd3, %rd62, %rd66; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r421}, %rd3; + cvt.u32.u64 %r422, %rd3; + shfl.sync.bfly.b32 %r8, %r422, 1, 31, -1; + shfl.sync.bfly.b32 %r9, %r421, 1, 31, -1; + selp.b32 %r423, 1, 0, %p120; + shfl.sync.bfly.b32 %r424, %r423, 8, 31, -1; + shfl.sync.bfly.b32 %r425, %r383, 8, 31, -1; + cvt.u64.u32 %rd67, %r424; + cvt.u64.u32 %rd68, %r425; + shl.b64 %rd69, %rd68, 32; + or.b64 %rd70, %rd67, %rd69; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd71, %rd70, %rd52; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r426}, %rd71; + cvt.u32.u64 %r427, %rd71; + shfl.sync.bfly.b32 %r428, %r427, 4, 31, -1; + shfl.sync.bfly.b32 %r429, %r426, 4, 31, -1; + cvt.u64.u32 %rd72, %r428; + cvt.u64.u32 %rd73, %r429; + shl.b64 %rd74, %rd73, 32; + or.b64 %rd75, %rd72, %rd74; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd76, %rd71, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r430}, %rd76; + cvt.u32.u64 %r431, %rd76; + shfl.sync.bfly.b32 %r432, %r431, 2, 31, -1; + shfl.sync.bfly.b32 %r433, %r430, 2, 31, -1; + cvt.u64.u32 %rd77, %r432; + cvt.u64.u32 %rd78, %r433; + shl.b64 %rd79, %rd78, 32; + or.b64 %rd80, %rd77, %rd79; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd4, %rd76, %rd80; + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + mov.b64 {_, %r434}, %rd4; + cvt.u32.u64 %r435, %rd4; + shfl.sync.bfly.b32 %r10, %r435, 1, 31, -1; + shfl.sync.bfly.b32 %r11, %r434, 1, 31, -1; +$L__tmp8: + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r436, %rd36; + .loc 1 64 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:64:19 + setp.lt.s32 %p121, %r3, %r436; + .loc 1 66 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:66:35 + selp.b32 %r437, %r4, 16, %p121; + .loc 1 68 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:68:20 + add.s32 %r438, %r437, 17; + .loc 1 69 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:69:20 + setp.lt.s32 %p122, %r437, 0; + .loc 1 70 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:70:35 + selp.b32 %r12, %r438, %r437, %p122; + .loc 1 71 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:38 + setp.lt.u32 %p123, %r12, 17; + .loc 1 0 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 + setp.gt.u32 %p124, %r1, 31; + .loc 1 71 53 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:53 + or.pred %p125, %p124, %p123; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + @%p125 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: +$L__tmp9: + .loc 1 18 0 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:18 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0 ] + add.s32 %r367, %r365, %r366; + add.s32 %r370, %r368, %r369; + add.s32 %r373, %r371, %r372; + add.s32 %r376, %r374, %r375; + setp.lt.s32 %p114, %r367, %r370; + setp.eq.b32 %p115, %r367, %r370; + setp.gt.s32 %p116, %r373, %r376; + xor.b32 %r377, %r373, %r376; + selp.b32 %r378, %r377, 0, %p116; + selp.b32 %r379, %r378, 0, %p115; + selp.b32 %r380, %r377, %r379, %p114; + xor.b32 %r5, %r380, %r364; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + cvt.u64.u32 %rd87, %r8; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd88, %rd3, %rd87; +$L__tmp11: + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + cvt.u32.u64 %r439, %rd88; + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + bar.sync 0; + .loc 1 75 19 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:75:19 + setp.lt.s32 %p127, %r3, %r439; + .loc 1 76 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:76:35 + selp.b32 %r440, %r5, 16, %p127; + .loc 1 77 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:77:20 + add.s32 %r441, %r440, 17; + .loc 1 78 20 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:78:20 + setp.lt.s32 %p128, %r440, 0; + .loc 1 79 35 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:79:35 + selp.b32 %r13, %r441, %r440, %p128; + .loc 1 80 38 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:38 + setp.lt.u32 %p129, %r13, 17; + .loc 1 80 53 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:53 + or.pred %p130, %p124, %p129; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + @%p130 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r15; +$L__tmp12: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + cvt.u64.u32 %rd101, %r10; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:59:26 ] + add.s64 %rd102, %rd4, %rd101; +$L__tmp13: + .loc 1 61 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:61:21 + cvt.u32.u64 %r443, %rd102; +$L__tmp14: + .loc 3 291 36 // standard.py:291:36 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + cvt.u64.u32 %rd103, %r6; + .loc 3 261 15 // standard.py:261:15 @[ c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:55:26 ] + add.s64 %rd104, %rd2, %rd103; +$L__tmp15: + .loc 1 60 21 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:60:21 + cvt.u32.u64 %r442, %rd104; + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + bar.sync 0; + .loc 1 81 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:25 + mul.wide.u32 %rd105, %r1, 4; + add.s64 %rd95, %rd5, %rd105; + .loc 1 81 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:81:37 + and.b32 %r448, %r2, 63; + setp.eq.b32 %p138, %r448, 0; + and.pred %p131, %p2, %p138; + // begin inline asm + @%p131 st.global.b32 [ %rd95 + 0 ], { %r442 }; + // end inline asm + .loc 1 82 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:25 + add.s64 %rd96, %rd6, %rd105; + .loc 1 82 37 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:82:37 + // begin inline asm + @%p131 st.global.b32 [ %rd96 + 0 ], { %r443 }; + // end inline asm + .loc 1 83 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:25 + shl.b64 %rd106, %rd1, 2; + add.s64 %rd97, %rd7, %rd106; + .loc 1 83 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:83:47 + and.b32 %r449, %r2, 48; + setp.eq.b32 %p139, %r449, 0; + and.pred %p133, %p2, %p139; + // begin inline asm + @%p133 st.global.b32 [ %rd97 + 0 ], { %r4 }; + // end inline asm + .loc 1 84 52 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:52 + mul.lo.s32 %r450, %r1, 17; + .loc 1 84 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:49 + add.s32 %r451, %r12, %r450; + .loc 1 84 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:25 + mad.wide.s32 %rd98, %r451, 4, %rd8; + mov.b32 %r445, 1; + .loc 1 84 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:84:85 + // begin inline asm + @%p133 st.global.b32 [ %rd98 + 0 ], { %r445 }; + // end inline asm + .loc 1 85 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:25 + add.s64 %rd99, %rd9, %rd106; + .loc 1 85 47 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:85:47 + // begin inline asm + @%p133 st.global.b32 [ %rd99 + 0 ], { %r5 }; + // end inline asm + .loc 1 86 49 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:49 + add.s32 %r452, %r13, %r450; + .loc 1 86 25 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:25 + mad.wide.s32 %rd100, %r452, 4, %rd10; + .loc 1 86 85 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:85 + // begin inline asm + @%p133 st.global.b32 [ %rd100 + 0 ], { %r445 }; + // end inline asm + .loc 1 86 4 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:71:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd81, assertFunc_0; + cvta.global.u64 %rd82, %rd81; + st.param.b64 [param3], %rd82; + mov.b64 %rd83, assertFile_0; + cvta.global.u64 %rd84, %rd83; + st.param.b64 [param1], %rd84; + mov.b64 %rd85, assertMessage_0; + cvta.global.u64 %rd86, %rd85; + st.param.b64 [param0], %rd86; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_3: + .loc 1 80 63 // c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py:80:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd89, assertFunc_1; + cvta.global.u64 %rd90, %rd89; + st.param.b64 [param3], %rd90; + mov.b64 %rd91, assertFile_1; + cvta.global.u64 %rd92, %rd91; + st.param.b64 [param1], %rd92; + mov.b64 %rd93, assertMessage_1; + cvta.global.u64 %rd94, %rd93; + st.param.b64 [param0], %rd94; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__tmp16: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 399 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x188 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 55 +.b8 122 +.b8 50 +.b8 106 +.b8 98 +.b8 106 +.b8 117 +.b8 98 +.b8 51 +.b8 97 +.b8 117 +.b8 112 +.b8 103 +.b8 110 +.b8 101 +.b8 99 +.b8 104 +.b8 111 +.b8 108 +.b8 54 +.b8 53 +.b8 118 +.b8 107 +.b8 118 +.b8 105 +.b8 53 +.b8 114 +.b8 117 +.b8 119 +.b8 112 +.b8 121 +.b8 108 +.b8 122 +.b8 111 +.b8 115 +.b8 100 +.b8 98 +.b8 113 +.b8 118 +.b8 115 +.b8 99 +.b8 100 +.b8 121 +.b8 120 +.b8 109 +.b8 114 +.b8 101 +.b8 98 +.b8 51 +.b8 106 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 55 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x8d DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp15 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17a:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..033d3ca61519bb6892146d7f3e9e77995dcb288b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1397 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc140 = loc(unknown) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc192 = loc("in_ptr0"(#loc)) +#loc193 = loc("out_ptr4"(#loc)) +#loc194 = loc("out_ptr5"(#loc)) +#loc195 = loc("out_ptr6"(#loc)) +#loc196 = loc("out_ptr7"(#loc)) +#loc197 = loc("out_ptr8"(#loc)) +#loc198 = loc("out_ptr9"(#loc)) +#loc199 = loc("xnumel"(#loc)) +#loc200 = loc("r0_numel"(#loc)) +#loc255 = loc("x"(#loc90)) +#loc256 = loc("idxs"(#loc90)) +#loc257 = loc("x"(#loc94)) +#loc258 = loc("idxs"(#loc94)) +#loc263 = loc("x"(#loc102)) +#loc264 = loc("idxs"(#loc102)) +#loc265 = loc("flip"(#loc102)) +#loc321 = loc("input"(#loc165)) +#loc322 = loc("a"(#loc169)) +#loc323 = loc("b"(#loc169)) +#loc325 = loc("x"(#loc174)) +#loc326 = loc("x"(#loc178)) +#loc327 = loc("input"(#loc187)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc201) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc202) + %xoffset = tt.get_program_id x : i32 loc(#loc203) + %xoffset_2 = arith.constant 1 : i32 loc(#loc204) + %xoffset_3 = arith.constant 1 : i32 loc(#loc204) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc204) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc205) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc206) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc207) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc207) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc208) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc208) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc209) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc210) + %r0_offset = arith.constant 0 : i32 loc(#loc211) + %r0_mask = arith.constant true loc(#loc212) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc212) + %tmp0 = arith.constant 16 : i32 loc(#loc213) + %tmp0_11 = arith.constant 16 : i32 loc(#loc213) + %tmp0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc213) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<1x1xi32> loc(#loc213) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc214) + %tmp0_15 = arith.addi %r0_index_9, %tmp0_14 : tensor<1x16xi32> loc(#loc214) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc215) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc215) + %tmp0_18 = arith.constant 0.000000e+00 : f32 loc(#loc216) + %tmp0_19 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc216) + %tmp0_20 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc216) + %tmp0_21 = arith.fptosi %tmp0_20 : tensor<1x16xf32> to tensor<1x16xi64> loc(#loc216) + %tmp0_22 = tt.load %tmp0_17, %tmp0_19, %tmp0_21 : tensor<1x16x!tt.ptr> loc(#loc216) + %tmp1 = arith.constant 0 : i64 loc(#loc217) + %tmp1_23 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc217) + %tmp2 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc218) + %tmp2_24 = arith.cmpi sgt, %tmp0_22, %tmp2 : tensor<1x16xi64> loc(#loc218) + %tmp3 = arith.constant 16384 : i64 loc(#loc219) + %tmp3_25 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc219) + %tmp4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc220) + %tmp4_26 = arith.cmpi slt, %tmp0_22, %tmp4 : tensor<1x16xi64> loc(#loc220) + %tmp5 = arith.andi %tmp2_24, %tmp4_26 : tensor<1x16xi1> loc(#loc221) + %tmp6 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc222) + %tmp7 = arith.extsi %tmp6 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc223) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc224) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc25) + %tmp14 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc225) + %tmp14_27 = arith.cmpi eq, %tmp0_22, %tmp14 : tensor<1x16xi64> loc(#loc225) + %tmp15 = arith.extui %tmp14_27 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc226) + %tmp16 = arith.extsi %tmp15 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc227) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc29) + %tmp20 = arith.extsi %tmp7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc228) + %tmp23 = arith.constant 0 : i32 loc(#loc229) + %tmp23_28 = arith.constant 0 : i64 loc(#loc229) + %tmp23_29 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc229) + %tmp23_30 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc229) + %tmp23_31 = arith.select %tmp23_30, %tmp20, %tmp23_29 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc229) + %tmp24 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_31) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc230) + %tmp24_32 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc231) + %tmp25 = arith.extsi %tmp16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc232) + %tmp28 = arith.constant 0 : i32 loc(#loc233) + %tmp28_33 = arith.constant 0 : i64 loc(#loc233) + %tmp28_34 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc233) + %tmp28_35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc233) + %tmp28_36 = arith.select %tmp28_35, %tmp25, %tmp28_34 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc233) + %tmp29 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_36) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc234) + %tmp29_37 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc235) + %tmp30 = arith.trunci %tmp24_32 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc236) + %tmp31 = arith.trunci %tmp29_37 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc237) + %tmp32 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc238) + %tmp33 = arith.trunci %tmp32 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc239) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc240) + %tmp34_38 = arith.cmpi slt, %r0_index_9, %tmp34 : tensor<1x16xi32> loc(#loc240) + %tmp35 = arith.constant 16 : i32 loc(#loc241) + %tmp35_39 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc241) + %tmp36 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc242) + %tmp36_40 = arith.select %tmp34_38, %tmp33, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc242) + %tmp37 = arith.constant 17 : i32 loc(#loc243) + %tmp37_41 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc243) + %tmp38 = arith.addi %tmp36_40, %tmp37_41 : tensor<1x16xi32> loc(#loc244) + %tmp39 = arith.constant 0 : i32 loc(#loc245) + %tmp39_42 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc245) + %tmp39_43 = arith.cmpi slt, %tmp36_40, %tmp39_42 : tensor<1x16xi32> loc(#loc245) + %tmp40 = arith.select %tmp39_43, %tmp38, %tmp36_40 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc246) + %c0_i32 = arith.constant 0 : i32 loc(#loc49) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc49) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<1x16xi32> loc(#loc49) + %c17_i32 = arith.constant 17 : i32 loc(#loc50) + %cst_44 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc50) + %3 = arith.cmpi slt, %tmp40, %cst_44 : tensor<1x16xi32> loc(#loc50) + %4 = arith.andi %2, %3 : tensor<1x16xi1> loc(#loc51) + %true = arith.constant true loc(#loc52) + %cst_45 = arith.constant dense : tensor<1x1xi1> loc(#loc52) + %5 = arith.xori %xmask_8, %cst_45 : tensor<1x1xi1> loc(#loc52) + %6 = tt.broadcast %5 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc53) + %7 = arith.ori %4, %6 : tensor<1x16xi1> loc(#loc53) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc54) + %tmp42 = arith.constant 1 : i32 loc(#loc247) + %tmp42_46 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc247) + %tmp43 = arith.extsi %1#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc248) + %tmp44 = arith.trunci %tmp43 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc249) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc250) + %tmp45_47 = arith.cmpi slt, %r0_index_9, %tmp45 : tensor<1x16xi32> loc(#loc250) + %tmp46 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc251) + %tmp46_48 = arith.select %tmp45_47, %tmp44, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc251) + %tmp47 = arith.addi %tmp46_48, %tmp37_41 : tensor<1x16xi32> loc(#loc252) + %tmp48 = arith.constant 0 : i32 loc(#loc253) + %tmp48_49 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc253) + %tmp48_50 = arith.cmpi slt, %tmp46_48, %tmp48_49 : tensor<1x16xi32> loc(#loc253) + %tmp49 = arith.select %tmp48_50, %tmp47, %tmp46_48 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %c0_i32_51 = arith.constant 0 : i32 loc(#loc63) + %cst_52 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc63) + %8 = arith.cmpi sle, %cst_52, %tmp49 : tensor<1x16xi32> loc(#loc63) + %c17_i32_53 = arith.constant 17 : i32 loc(#loc64) + %cst_54 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc64) + %9 = arith.cmpi slt, %tmp49, %cst_54 : tensor<1x16xi32> loc(#loc64) + %10 = arith.andi %8, %9 : tensor<1x16xi1> loc(#loc65) + %true_55 = arith.constant true loc(#loc66) + %cst_56 = arith.constant dense : tensor<1x1xi1> loc(#loc66) + %11 = arith.xori %xmask_8, %cst_56 : tensor<1x1xi1> loc(#loc66) + %12 = tt.broadcast %11 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc67) + %13 = arith.ori %10, %12 : tensor<1x16xi1> loc(#loc67) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc68) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc69) + %15 = tt.addptr %14, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc69) + tt.store %15, %tmp30, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc70) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc71) + %17 = tt.addptr %16, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc71) + tt.store %17, %tmp31, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc72) + %c16_i32 = arith.constant 16 : i32 loc(#loc73) + %c16_i32_57 = arith.constant 16 : i32 loc(#loc73) + %cst_58 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc73) + %18 = arith.muli %cst_58, %xindex_7 : tensor<1x1xi32> loc(#loc73) + %19 = tt.broadcast %18 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc74) + %20 = arith.addi %r0_index_9, %19 : tensor<1x16xi32> loc(#loc74) + %21 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc75) + %22 = tt.addptr %21, %20 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc75) + %23 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc76) + tt.store %22, %tmp33, %23 : tensor<1x16x!tt.ptr> loc(#loc76) + %c17_i32_59 = arith.constant 17 : i32 loc(#loc77) + %c17_i32_60 = arith.constant 17 : i32 loc(#loc77) + %cst_61 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc77) + %24 = arith.muli %cst_61, %xindex_7 : tensor<1x1xi32> loc(#loc77) + %25 = tt.broadcast %24 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc78) + %26 = arith.addi %tmp40, %25 : tensor<1x16xi32> loc(#loc78) + %27 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc79) + %28 = tt.addptr %27, %26 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc79) + %cst_62 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc80) + %29 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc80) + tt.store %28, %cst_62, %29 : tensor<1x16x!tt.ptr> loc(#loc80) + %c16_i32_63 = arith.constant 16 : i32 loc(#loc81) + %c16_i32_64 = arith.constant 16 : i32 loc(#loc81) + %cst_65 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc81) + %30 = arith.muli %cst_65, %xindex_7 : tensor<1x1xi32> loc(#loc81) + %31 = tt.broadcast %30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc82) + %32 = arith.addi %r0_index_9, %31 : tensor<1x16xi32> loc(#loc82) + %33 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc83) + %34 = tt.addptr %33, %32 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc83) + %35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc84) + tt.store %34, %tmp44, %35 : tensor<1x16x!tt.ptr> loc(#loc84) + %c17_i32_66 = arith.constant 17 : i32 loc(#loc85) + %c17_i32_67 = arith.constant 17 : i32 loc(#loc85) + %cst_68 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc85) + %36 = arith.muli %cst_68, %xindex_7 : tensor<1x1xi32> loc(#loc85) + %37 = tt.broadcast %36 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc86) + %38 = arith.addi %tmp49, %37 : tensor<1x16xi32> loc(#loc86) + %39 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc87) + %40 = tt.addptr %39, %38 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc87) + %cst_69 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc88) + %41 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc88) + tt.store %40, %cst_69, %41 : tensor<1x16x!tt.ptr> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc90)), %idxs: tensor<1x16xi16> loc("idxs"(#loc90))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc92) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc93) + %5 = ub.poison : tensor<1x16xi32> loc(#loc93) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc93) + } loc(#loc90) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi16> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc101) + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi16> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc280) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc281) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc282) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc282) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc286) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc287) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc287) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc298) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc298) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc299) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc304) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc305) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc306) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc307) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc308) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc309) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc310) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc311) + %cond_38 = arith.constant 0 : i32 loc(#loc312) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc317) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc318) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc319) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc320) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc169)), %b: i32 loc("b"(#loc169))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc170) + tt.return %0 : i32 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc172) + tt.return %1 : i32 loc(#loc172) + } loc(#loc169) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc324) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc174))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc175) + %false = arith.constant false loc(#loc176) + tt.return %false : i1 loc(#loc176) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc177) + tt.return %1 : i1 loc(#loc177) + } loc(#loc174) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc178))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc179) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc180) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc180) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc180) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc180) + tt.return %4 : tensor<1x16xi32> loc(#loc181) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc182) + tt.return %5 : tensor<1x16xi32> loc(#loc182) + } loc(#loc178) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc184) + %cst = arith.constant dense : tensor<1xi1> loc(#loc184) + tt.return %cst : tensor<1xi1> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc186) + tt.return %0 : tensor<1xi1> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc187))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc188) + tt.return %0 : tensor<1x16xi32> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc190) + tt.return %1 : tensor<1x16xi32> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc184) + tt.return %cst : tensor<1x16xi32> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc186) + tt.return %0 : tensor<1x16xi32> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc187))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc188) + tt.return %0 : tensor<1x16xi16> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc190) + tt.return %1 : tensor<1x16xi16> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc184) + tt.return %cst : tensor<1x16xi16> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc186) + tt.return %0 : tensor<1x16xi16> loc(#loc186) + } loc(#loc183) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc165))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc166) + tt.return %0 : tensor<4x2xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc168) + tt.return %1 : tensor<4x2xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc165))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc166) + tt.return %0 : tensor<2x4xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc168) + tt.return %1 : tensor<2x4xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc328) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + %5 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc165))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc166) + tt.return %0 : tensor<1x8xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc168) + tt.return %1 : tensor<1x8xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc165))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc166) + tt.reduce.return %2 : i64 loc(#loc166) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc166) + tt.return %0 : tensor<1xi64> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc168) + tt.return %1 : tensor<1xi64> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc169)), %b: i64 loc("b"(#loc169))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc170) + tt.return %0 : i64 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc172) + tt.return %1 : i64 loc(#loc172) + } loc(#loc169) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":62:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":63:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":65:32) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":67:44) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":72:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":73:21) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":74:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:55) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:32) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:52) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc201 = loc("xnumel"(#loc1)) +#loc202 = loc("r0_numel"(#loc2)) +#loc203 = loc("xoffset"(#loc3)) +#loc204 = loc("xoffset"(#loc4)) +#loc205 = loc("xindex"(#loc5)) +#loc206 = loc("xindex"(#loc6)) +#loc207 = loc("xindex"(#loc7)) +#loc208 = loc("xmask"(#loc8)) +#loc209 = loc("r0_index"(#loc9)) +#loc210 = loc("r0_index"(#loc10)) +#loc211 = loc("r0_offset"(#loc11)) +#loc212 = loc("r0_mask"(#loc12)) +#loc213 = loc("tmp0"(#loc13)) +#loc214 = loc("tmp0"(#loc14)) +#loc215 = loc("tmp0"(#loc15)) +#loc216 = loc("tmp0"(#loc16)) +#loc217 = loc("tmp1"(#loc17)) +#loc218 = loc("tmp2"(#loc18)) +#loc219 = loc("tmp3"(#loc19)) +#loc220 = loc("tmp4"(#loc20)) +#loc221 = loc("tmp5"(#loc21)) +#loc222 = loc("tmp6"(#loc22)) +#loc223 = loc("tmp7"(#loc23)) +#loc224 = loc("tmp9"(#loc24)) +#loc225 = loc("tmp14"(#loc26)) +#loc226 = loc("tmp15"(#loc27)) +#loc227 = loc("tmp16"(#loc28)) +#loc228 = loc("tmp20"(#loc30)) +#loc229 = loc("tmp23"(#loc31)) +#loc230 = loc("tmp24"(#loc32)) +#loc231 = loc("tmp24"(#loc33)) +#loc232 = loc("tmp25"(#loc34)) +#loc233 = loc("tmp28"(#loc35)) +#loc234 = loc("tmp29"(#loc36)) +#loc235 = loc("tmp29"(#loc37)) +#loc236 = loc("tmp30"(#loc38)) +#loc237 = loc("tmp31"(#loc39)) +#loc238 = loc("tmp32"(#loc40)) +#loc239 = loc("tmp33"(#loc41)) +#loc240 = loc("tmp34"(#loc42)) +#loc241 = loc("tmp35"(#loc43)) +#loc242 = loc("tmp36"(#loc44)) +#loc243 = loc("tmp37"(#loc45)) +#loc244 = loc("tmp38"(#loc46)) +#loc245 = loc("tmp39"(#loc47)) +#loc246 = loc("tmp40"(#loc48)) +#loc247 = loc("tmp42"(#loc55)) +#loc248 = loc("tmp43"(#loc56)) +#loc249 = loc("tmp44"(#loc57)) +#loc250 = loc("tmp45"(#loc58)) +#loc251 = loc("tmp46"(#loc59)) +#loc252 = loc("tmp47"(#loc60)) +#loc253 = loc("tmp48"(#loc61)) +#loc254 = loc("tmp49"(#loc62)) +#loc259 = loc("flip"(#loc95)) +#loc260 = loc("flip"(#loc96)) +#loc261 = loc("flip"(#loc97)) +#loc262 = loc("flip"(#loc98)) +#loc266 = loc("y"(#loc103)) +#loc267 = loc("right_mask"(#loc104)) +#loc268 = loc("right_mask"(#loc105)) +#loc269 = loc("left_mask"(#loc106)) +#loc270 = loc("ileft"(#loc107)) +#loc271 = loc("ileft"(#loc108)) +#loc272 = loc("ileft"(#loc109)) +#loc273 = loc("ileft"(#loc110)) +#loc274 = loc("iright"(#loc111)) +#loc275 = loc("iright"(#loc112)) +#loc276 = loc("iright"(#loc113)) +#loc277 = loc("iright"(#loc114)) +#loc278 = loc("ileft"(#loc115)) +#loc279 = loc("iright"(#loc116)) +#loc280 = loc("y_idx"(#loc117)) +#loc281 = loc("left_idx"(#loc118)) +#loc282 = loc("left_idx"(#loc119)) +#loc283 = loc("left_idx"(#loc120)) +#loc284 = loc("left_idx"(#loc121)) +#loc285 = loc("left_idx"(#loc122)) +#loc286 = loc("right_idx"(#loc123)) +#loc287 = loc("right_idx"(#loc124)) +#loc288 = loc("right_idx"(#loc125)) +#loc289 = loc("right_idx"(#loc126)) +#loc290 = loc("right_idx"(#loc127)) +#loc291 = loc("left_idx"(#loc128)) +#loc292 = loc("right_idx"(#loc129)) +#loc293 = loc("left_valid_mask"(#loc130)) +#loc294 = loc("right_valid_mask"(#loc131)) +#loc295 = loc("left_isnan"(#loc132)) +#loc296 = loc("right_isnan"(#loc133)) +#loc297 = loc("cond"(#loc134)) +#loc298 = loc("cond"(#loc137)) +#loc299 = loc("cond"(#loc138)) +#loc300 = loc("cond"(#loc139)) +#loc301 = loc("eq"(#loc141)) +#loc302 = loc("eq"(#loc144)) +#loc303 = loc("eq"(#loc145)) +#loc304 = loc("cond"(#loc146)) +#loc305 = loc("cond"(#loc147)) +#loc306 = loc("cond"(#loc148)) +#loc307 = loc("cond"(#loc149)) +#loc308 = loc("cond"(#loc150)) +#loc309 = loc("cond"(#loc151)) +#loc310 = loc("cond"(#loc152)) +#loc311 = loc("cond"(#loc153)) +#loc312 = loc("cond"(#loc154)) +#loc313 = loc("ret"(#loc155)) +#loc314 = loc("ret"(#loc156)) +#loc315 = loc("ret"(#loc157)) +#loc316 = loc("ret"(#loc158)) +#loc317 = loc("new_idxs"(#loc159)) +#loc318 = loc("new_idxs"(#loc160)) +#loc319 = loc("new_idxs"(#loc161)) +#loc320 = loc("new_idxs"(#loc162)) +#loc324 = loc("input"(#loc173)) +#loc328 = loc("flip"(#loc191)) +#loc329 = loc("cond"(#loc297)) +#loc330 = loc("cond"(#loc300)) +#loc331 = loc("eq"(#loc301)) +#loc332 = loc("eq"(#loc303)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8e2a204437ed70df10342c2f630e185cb5b76b03 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1487 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc113 = loc("in_ptr0"(#loc)) +#loc114 = loc("out_ptr4"(#loc)) +#loc115 = loc("out_ptr5"(#loc)) +#loc116 = loc("out_ptr6"(#loc)) +#loc117 = loc("out_ptr7"(#loc)) +#loc118 = loc("out_ptr8"(#loc)) +#loc119 = loc("out_ptr9"(#loc)) +#loc120 = loc("xnumel"(#loc)) +#loc121 = loc("r0_numel"(#loc)) +#loc136 = loc(callsite(#loc16 at #loc17)) +#loc142 = loc("ileft"(#loc25)) +#loc146 = loc("iright"(#loc30)) +#loc155 = loc("left_idx"(#loc39)) +#loc160 = loc("right_idx"(#loc44)) +#loc181 = loc(callsite(#loc16 at #loc65)) +#loc184 = loc("tmp24"(#loc68)) +#loc188 = loc("tmp29"(#loc72)) +#loc210 = loc(callsite(#loc21 at #loc136)) +#loc214 = loc(callsite(#loc21 at #loc181)) +#loc217 = loc(callsite(#loc1 at #loc184)) +#loc220 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc142 at #loc210)) +#loc229 = loc(callsite(#loc146 at #loc210)) +#loc237 = loc(callsite(#loc155 at #loc210)) +#loc242 = loc(callsite(#loc160 at #loc210)) +#loc262 = loc(callsite(#loc142 at #loc214)) +#loc266 = loc(callsite(#loc146 at #loc214)) +#loc284 = loc(callsite(#loc155 at #loc214)) +#loc288 = loc(callsite(#loc160 at #loc214)) +#loc298 = loc(callsite(#loc1 at #loc225)) +#loc300 = loc(callsite(#loc1 at #loc229)) +#loc303 = loc(callsite(#loc1 at #loc237)) +#loc306 = loc(callsite(#loc1 at #loc242)) +#loc308 = loc(callsite(#loc1 at #loc262)) +#loc310 = loc(callsite(#loc1 at #loc266)) +#loc312 = loc(callsite(#loc1 at #loc284)) +#loc314 = loc(callsite(#loc1 at #loc288)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<16384> : tensor<1x16xi64, #blocked5> loc(#loc1) + %cst_10 = arith.constant dense<0> : tensor<1x16xi64, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc122) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc123) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc124) + %r0_index_11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc124) + %r0_index_12 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc124) + %r0_index_13 = tt.expand_dims %r0_index_11 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc124) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc125) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc204) + %tmp0_15 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc204) + %tmp0_16 = arith.addi %r0_index_12, %tmp0_14 : tensor<1x16xi32, #blocked5> loc(#loc126) + %tmp0_17 = arith.addi %r0_index_13, %tmp0_15 : tensor<1x16xi32, #blocked> loc(#loc126) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc127) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc127) + %tmp0_20 = tt.addptr %tmp0_18, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc127) + %tmp0_21 = tt.addptr %tmp0_19, %tmp0_17 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc127) + %tmp0_22 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc205) + %tmp0_23 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc205) + %tmp0_24 = tt.load %tmp0_20, %tmp0_22, %cst_10 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc128) + %tmp0_25 = tt.load %tmp0_21, %tmp0_23, %cst : tensor<1x16x!tt.ptr, #blocked> loc(#loc128) + %tmp2 = arith.cmpi sgt, %tmp0_24, %cst_10 : tensor<1x16xi64, #blocked5> loc(#loc129) + %tmp2_26 = arith.cmpi sgt, %tmp0_25, %cst : tensor<1x16xi64, #blocked> loc(#loc129) + %tmp4 = arith.cmpi slt, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc130) + %tmp4_27 = arith.cmpi slt, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc130) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1, #blocked5> loc(#loc131) + %tmp5_28 = arith.andi %tmp2_26, %tmp4_27 : tensor<1x16xi1, #blocked> loc(#loc131) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc206) + %tmp9 = arith.trunci %r0_index_12 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc134) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc207) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc207) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc207) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc207) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc207) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc207) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc207) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc207) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc207) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc207) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc207) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc207) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc208) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y = tt.reshape %tmp7 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %left_mask = arith.subi %cst_4, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc223) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc223) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc223) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc223) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc234) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc235) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc236) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc236) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc301) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc240) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc241) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc241) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc304) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_75 = arith.xori %tmp7, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_77 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc259) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc208) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc208) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc222) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc297) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc226) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc227) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc228) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc299) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc230) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc231) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc234) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc236) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc238) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc239) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc241) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc243) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc244) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc251) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc259) + %tmp14 = arith.cmpi eq, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc178) + %tmp14_395 = arith.cmpi eq, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc178) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc213) + %y_396 = tt.reshape %tmp16 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_397 = arith.muli %y_396, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_398 = "tt.reduce"(%ileft_397) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_399 = tt.expand_dims %ileft_398 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_400 = tt.broadcast %ileft_399 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_401 = arith.muli %y_396, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_402 = "tt.reduce"(%iright_401) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_403 = tt.expand_dims %iright_402 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_404 = tt.broadcast %iright_403 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_405 = tt.reshape %ileft_400 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_406 = tt.reshape %iright_404 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %cond_407 = arith.cmpi slt, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_408 = arith.cmpi eq, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_409 = arith.andi %eq_408, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_410 = arith.ori %cond_407, %cond_409 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_411 = arith.extui %cond_410 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_412 = arith.xori %cond_411, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_413 = arith.cmpi ne, %cond_412, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_414 = arith.xori %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_415 = arith.select %cond_413, %ret_414, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_416 = arith.xori %tmp16, %ret_415 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_417 = arith.select %cond_413, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_418 = arith.xori %new_idxs_77, %new_idxs_417 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_419 = tt.reshape %ret_416 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_420 = arith.muli %y_419, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_421 = "tt.reduce"(%ileft_420) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_422 = tt.expand_dims %ileft_421 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_423 = tt.broadcast %ileft_422 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_424 = arith.muli %y_419, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_425 = "tt.reduce"(%iright_424) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_426 = tt.expand_dims %iright_425 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_427 = tt.broadcast %iright_426 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_428 = tt.reshape %ileft_423 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_429 = tt.reshape %iright_427 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_430 = tt.reshape %new_idxs_418 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_431 = arith.muli %y_idx_430, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_432 = "tt.reduce"(%left_idx_431) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_433 = tt.expand_dims %left_idx_432 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = tt.broadcast %left_idx_433 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_435 = arith.muli %y_idx_430, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_436 = "tt.reduce"(%right_idx_435) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_437 = tt.expand_dims %right_idx_436 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = tt.broadcast %right_idx_437 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_439 = tt.reshape %left_idx_434 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_440 = tt.reshape %right_idx_438 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_441 = arith.cmpi slt, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_442 = arith.cmpi eq, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_443 = arith.cmpi sgt, %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_444 = arith.andi %eq_442, %cond_443 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_445 = arith.ori %cond_441, %cond_444 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_446 = arith.extui %cond_445 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_447 = arith.xori %cond_446, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_448 = arith.cmpi ne, %cond_447, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_449 = arith.xori %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_450 = arith.select %cond_448, %ret_449, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_451 = arith.xori %ret_416, %ret_450 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_452 = arith.xori %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_453 = arith.select %cond_448, %new_idxs_452, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_454 = arith.xori %new_idxs_418, %new_idxs_453 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_455 = tt.reshape %ret_451 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_456 = arith.muli %y_455, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_457 = "tt.reduce"(%ileft_456) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_458 = tt.expand_dims %ileft_457 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_459 = tt.broadcast %ileft_458 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_460 = arith.muli %y_455, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_461 = "tt.reduce"(%iright_460) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_462 = tt.expand_dims %iright_461 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_463 = tt.broadcast %iright_462 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_464 = tt.reshape %ileft_459 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_465 = tt.reshape %iright_463 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_466 = tt.reshape %new_idxs_454 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_467 = arith.muli %y_idx_466, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_468 = "tt.reduce"(%left_idx_467) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_469 = tt.expand_dims %left_idx_468 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = tt.broadcast %left_idx_469 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_471 = arith.muli %y_idx_466, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_472 = "tt.reduce"(%right_idx_471) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_473 = tt.expand_dims %right_idx_472 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = tt.broadcast %right_idx_473 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_475 = tt.reshape %left_idx_470 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_476 = tt.reshape %right_idx_474 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_477 = arith.cmpi slt, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_478 = arith.cmpi eq, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_479 = arith.cmpi sgt, %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_480 = arith.andi %eq_478, %cond_479 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_481 = arith.ori %cond_477, %cond_480 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_482 = arith.extui %cond_481 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_483 = arith.xori %cond_482, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_484 = arith.cmpi ne, %cond_483, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_485 = arith.xori %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_486 = arith.select %cond_484, %ret_485, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_487 = arith.xori %ret_451, %ret_486 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_488 = arith.xori %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_489 = arith.select %cond_484, %new_idxs_488, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_490 = arith.xori %new_idxs_454, %new_idxs_489 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_491 = tt.reshape %ret_487 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_492 = arith.muli %y_491, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_493 = "tt.reduce"(%ileft_492) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_494 = tt.expand_dims %ileft_493 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_495 = tt.broadcast %ileft_494 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_496 = arith.muli %y_491, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_497 = "tt.reduce"(%iright_496) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_498 = tt.expand_dims %iright_497 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_499 = tt.broadcast %iright_498 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_500 = tt.reshape %ileft_495 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_501 = tt.reshape %iright_499 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_502 = tt.reshape %new_idxs_490 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_503 = arith.muli %y_idx_502, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_504 = "tt.reduce"(%left_idx_503) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_505 = tt.expand_dims %left_idx_504 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = tt.broadcast %left_idx_505 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_507 = arith.muli %y_idx_502, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_508 = "tt.reduce"(%right_idx_507) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_509 = tt.expand_dims %right_idx_508 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = tt.broadcast %right_idx_509 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_511 = tt.reshape %left_idx_506 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_512 = tt.reshape %right_idx_510 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_513 = arith.cmpi slt, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_514 = arith.cmpi eq, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_515 = arith.cmpi sgt, %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_516 = arith.andi %eq_514, %cond_515 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_517 = arith.ori %cond_513, %cond_516 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_518 = arith.extui %cond_517 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_519 = arith.xori %cond_518, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_520 = arith.cmpi ne, %cond_519, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_521 = arith.xori %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_522 = arith.select %cond_520, %ret_521, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_523 = arith.xori %ret_487, %ret_522 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_524 = arith.xori %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_525 = arith.select %cond_520, %new_idxs_524, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_526 = arith.xori %new_idxs_490, %new_idxs_525 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_527 = tt.reshape %ret_523 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_528 = arith.muli %y_527, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_529 = "tt.reduce"(%ileft_528) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_530 = tt.expand_dims %ileft_529 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_531 = tt.broadcast %ileft_530 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_532 = arith.muli %y_527, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_533 = "tt.reduce"(%iright_532) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_534 = tt.expand_dims %iright_533 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_535 = tt.broadcast %iright_534 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_536 = tt.reshape %ileft_531 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_537 = tt.reshape %iright_535 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_538 = tt.reshape %new_idxs_526 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_539 = arith.muli %y_idx_538, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_540 = "tt.reduce"(%left_idx_539) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_541 = tt.expand_dims %left_idx_540 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = tt.broadcast %left_idx_541 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_543 = arith.muli %y_idx_538, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_544 = "tt.reduce"(%right_idx_543) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_545 = tt.expand_dims %right_idx_544 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = tt.broadcast %right_idx_545 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_547 = tt.reshape %left_idx_542 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_548 = tt.reshape %right_idx_546 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_549 = arith.cmpi slt, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_550 = arith.cmpi eq, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_551 = arith.cmpi sgt, %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_552 = arith.andi %eq_550, %cond_551 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_553 = arith.ori %cond_549, %cond_552 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_554 = arith.extui %cond_553 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_555 = arith.xori %cond_554, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_556 = arith.cmpi ne, %cond_555, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_557 = arith.xori %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_558 = arith.select %cond_556, %ret_557, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_559 = arith.xori %ret_523, %ret_558 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_560 = arith.xori %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_561 = arith.select %cond_556, %new_idxs_560, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_562 = arith.xori %new_idxs_526, %new_idxs_561 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_563 = tt.reshape %ret_559 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_564 = arith.muli %y_563, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_565 = "tt.reduce"(%ileft_564) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_566 = tt.expand_dims %ileft_565 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_567 = tt.broadcast %ileft_566 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_568 = arith.muli %y_563, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_569 = "tt.reduce"(%iright_568) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_570 = tt.expand_dims %iright_569 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_571 = tt.broadcast %iright_570 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_572 = tt.reshape %ileft_567 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_573 = tt.reshape %iright_571 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_574 = tt.reshape %new_idxs_562 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_575 = arith.muli %y_idx_574, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_576 = "tt.reduce"(%left_idx_575) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_577 = tt.expand_dims %left_idx_576 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = tt.broadcast %left_idx_577 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_579 = arith.muli %y_idx_574, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_580 = "tt.reduce"(%right_idx_579) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_581 = tt.expand_dims %right_idx_580 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = tt.broadcast %right_idx_581 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_583 = tt.reshape %left_idx_578 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_584 = tt.reshape %right_idx_582 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_585 = arith.cmpi slt, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_586 = arith.cmpi eq, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_587 = arith.cmpi sgt, %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_588 = arith.andi %eq_586, %cond_587 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_589 = arith.ori %cond_585, %cond_588 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_590 = arith.extui %cond_589 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_591 = arith.xori %cond_590, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_592 = arith.cmpi ne, %cond_591, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_593 = arith.xori %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_594 = arith.select %cond_592, %ret_593, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_595 = arith.xori %ret_559, %ret_594 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_596 = arith.xori %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_597 = arith.select %cond_592, %new_idxs_596, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_598 = arith.xori %new_idxs_562, %new_idxs_597 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_599 = tt.reshape %ret_595 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc260) + %ileft_600 = arith.muli %y_599, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc261) + %ileft_601 = "tt.reduce"(%ileft_600) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc307) + %ileft_602 = tt.expand_dims %ileft_601 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc263) + %ileft_603 = tt.broadcast %ileft_602 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc264) + %iright_604 = arith.muli %y_599, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc265) + %iright_605 = "tt.reduce"(%iright_604) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc309) + %iright_606 = tt.expand_dims %iright_605 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc267) + %iright_607 = tt.broadcast %iright_606 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc268) + %ileft_608 = tt.reshape %ileft_603 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_609 = tt.reshape %iright_607 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_610 = tt.reshape %new_idxs_598 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc282) + %left_idx_611 = arith.muli %y_idx_610, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc283) + %left_idx_612 = "tt.reduce"(%left_idx_611) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc311) + %left_idx_613 = tt.expand_dims %left_idx_612 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = tt.broadcast %left_idx_613 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc286) + %right_idx_615 = arith.muli %y_idx_610, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc287) + %right_idx_616 = "tt.reduce"(%right_idx_615) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc313) + %right_idx_617 = tt.expand_dims %right_idx_616 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = tt.broadcast %right_idx_617 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc290) + %left_idx_619 = tt.reshape %left_idx_614 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_620 = tt.reshape %right_idx_618 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_621 = arith.cmpi slt, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_622 = arith.cmpi eq, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_623 = arith.cmpi sgt, %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_624 = arith.andi %eq_622, %cond_623 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_625 = arith.ori %cond_621, %cond_624 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_626 = arith.xori %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_627 = arith.select %cond_625, %ret_626, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_628 = arith.xori %ret_595, %ret_627 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_629 = arith.xori %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_630 = arith.select %cond_625, %new_idxs_629, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_631 = arith.xori %new_idxs_598, %new_idxs_630 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_632 = tt.reshape %ret_628 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_633 = arith.muli %y_632, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_634 = "tt.reduce"(%ileft_633) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_635 = tt.expand_dims %ileft_634 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_636 = tt.broadcast %ileft_635 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_637 = arith.muli %y_632, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_638 = "tt.reduce"(%iright_637) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_639 = tt.expand_dims %iright_638 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_640 = tt.broadcast %iright_639 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_641 = tt.reshape %ileft_636 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_642 = tt.reshape %iright_640 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_643 = tt.reshape %new_idxs_631 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_644 = arith.muli %y_idx_643, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_645 = "tt.reduce"(%left_idx_644) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_646 = tt.expand_dims %left_idx_645 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = tt.broadcast %left_idx_646 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_648 = arith.muli %y_idx_643, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_649 = "tt.reduce"(%right_idx_648) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_650 = tt.expand_dims %right_idx_649 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = tt.broadcast %right_idx_650 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_652 = tt.reshape %left_idx_647 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_653 = tt.reshape %right_idx_651 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_654 = arith.cmpi slt, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_655 = arith.cmpi eq, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_656 = arith.cmpi sgt, %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_657 = arith.andi %eq_655, %cond_656 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_658 = arith.ori %cond_654, %cond_657 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_659 = arith.xori %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_660 = arith.select %cond_658, %ret_659, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_661 = arith.xori %ret_628, %ret_660 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_662 = arith.xori %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_663 = arith.select %cond_658, %new_idxs_662, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_664 = arith.xori %new_idxs_631, %new_idxs_663 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_665 = tt.reshape %ret_661 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_666 = arith.muli %y_665, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_667 = "tt.reduce"(%ileft_666) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_668 = tt.expand_dims %ileft_667 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_669 = tt.broadcast %ileft_668 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_670 = arith.muli %y_665, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_671 = "tt.reduce"(%iright_670) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_672 = tt.expand_dims %iright_671 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_673 = tt.broadcast %iright_672 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_674 = tt.reshape %ileft_669 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_675 = tt.reshape %iright_673 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_676 = tt.reshape %new_idxs_664 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_677 = arith.muli %y_idx_676, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_678 = "tt.reduce"(%left_idx_677) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_679 = tt.expand_dims %left_idx_678 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = tt.broadcast %left_idx_679 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_681 = arith.muli %y_idx_676, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_682 = "tt.reduce"(%right_idx_681) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_683 = tt.expand_dims %right_idx_682 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = tt.broadcast %right_idx_683 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_685 = tt.reshape %left_idx_680 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_686 = tt.reshape %right_idx_684 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_687 = arith.cmpi slt, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_688 = arith.cmpi eq, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_689 = arith.cmpi sgt, %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_690 = arith.andi %eq_688, %cond_689 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_691 = arith.ori %cond_687, %cond_690 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_692 = arith.xori %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_693 = arith.select %cond_691, %ret_692, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_694 = arith.xori %ret_661, %ret_693 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_695 = arith.xori %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_696 = arith.select %cond_691, %new_idxs_695, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_697 = arith.xori %new_idxs_664, %new_idxs_696 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_698 = tt.reshape %ret_694 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_699 = arith.muli %y_698, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_700 = "tt.reduce"(%ileft_699) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_701 = tt.expand_dims %ileft_700 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_702 = tt.broadcast %ileft_701 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_703 = arith.muli %y_698, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_704 = "tt.reduce"(%iright_703) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_705 = tt.expand_dims %iright_704 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_706 = tt.broadcast %iright_705 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_707 = tt.reshape %ileft_702 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_708 = tt.reshape %iright_706 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_709 = tt.reshape %new_idxs_697 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_710 = arith.muli %y_idx_709, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_711 = "tt.reduce"(%left_idx_710) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_712 = tt.expand_dims %left_idx_711 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = tt.broadcast %left_idx_712 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_714 = arith.muli %y_idx_709, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_715 = "tt.reduce"(%right_idx_714) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_716 = tt.expand_dims %right_idx_715 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = tt.broadcast %right_idx_716 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_718 = tt.reshape %left_idx_713 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_719 = tt.reshape %right_idx_717 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_720 = arith.cmpi slt, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_721 = arith.cmpi eq, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_722 = arith.cmpi sgt, %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_723 = arith.andi %eq_721, %cond_722 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_724 = arith.ori %cond_720, %cond_723 : tensor<1x16xi1, #blocked5> loc(#loc274) + %new_idxs_725 = arith.xori %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_726 = arith.select %cond_724, %new_idxs_725, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_727 = arith.xori %new_idxs_697, %new_idxs_726 : tensor<1x16xi32, #blocked5> loc(#loc281) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc215) + %tmp20_728 = arith.extui %tmp5_28 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc215) + %tmp23 = arith.select %tmp0_22, %tmp20, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc183) + %tmp23_729 = arith.select %tmp0_23, %tmp20_728, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc183) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc216) + %tmp24_730 = "tt.reduce"(%tmp23_729) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc216) + %tmp24_731 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc185) + %tmp24_732 = tt.expand_dims %tmp24_730 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc185) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc218) + %tmp25_733 = arith.extui %tmp14_395 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc218) + %tmp28 = arith.select %tmp0_22, %tmp25, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc187) + %tmp28_734 = arith.select %tmp0_23, %tmp25_733, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc187) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc219) + %tmp29_735 = "tt.reduce"(%tmp28_734) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc219) + %tmp29_736 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc189) + %tmp29_737 = tt.expand_dims %tmp29_735 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc189) + %tmp30 = arith.trunci %tmp24_731 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc190) + %tmp30_738 = arith.trunci %tmp24_732 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc190) + %tmp31 = arith.trunci %tmp29_736 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc191) + %tmp31_739 = arith.trunci %tmp29_737 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc191) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp34_740 = arith.cmpi slt, %r0_index_12, %tmp34 : tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp36 = arith.select %tmp34_740, %new_idxs_394, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc193) + %tmp38 = arith.addi %tmp36, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc194) + %tmp39 = arith.cmpi slt, %tmp36, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc195) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc196) + %0 = arith.cmpi sge, %tmp40, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc81) + %1 = arith.cmpi slt, %tmp40, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc82) + %2 = arith.andi %0, %1 : tensor<1x16xi1, #blocked5> loc(#loc83) + %xmask_741 = arith.cmpi sge, %xoffset, %c32_i32 : i32 loc(#loc221) + %3 = tt.splat %xmask_741 : i1 -> tensor<1x16xi1, #blocked5> loc(#loc197) + %4 = arith.ori %2, %3 : tensor<1x16xi1, #blocked5> loc(#loc85) + tt.assert %4, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1, #blocked5> loc(#loc86) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp45_742 = arith.cmpi slt, %r0_index_12, %tmp45 : tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp46 = arith.select %tmp45_742, %new_idxs_727, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc199) + %tmp47 = arith.addi %tmp46, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc200) + %tmp48 = arith.cmpi slt, %tmp46, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc201) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc202) + %5 = arith.cmpi sge, %tmp49, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc92) + %6 = arith.cmpi slt, %tmp49, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc93) + %7 = arith.andi %5, %6 : tensor<1x16xi1, #blocked5> loc(#loc94) + %8 = arith.ori %7, %3 : tensor<1x16xi1, #blocked5> loc(#loc95) + tt.assert %8, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1, #blocked5> loc(#loc96) + %9 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc97) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc98) + tt.store %10, %tmp30_738, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc99) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + tt.store %13, %tmp31_739, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc101) + %15 = tt.addptr %14, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc101) + tt.store %15, %new_idxs_394, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc102) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc103) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc203) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32, #blocked5> loc(#loc104) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc105) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc105) + %21 = ttg.convert_layout %20 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + tt.store %21, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc107) + %23 = tt.addptr %22, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc107) + tt.store %23, %new_idxs_727, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc108) + %24 = arith.addi %tmp49, %17 : tensor<1x16xi32, #blocked5> loc(#loc109) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc110) + %26 = tt.addptr %25, %24 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc110) + %27 = ttg.convert_layout %26 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.store %27, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.return loc(#loc112) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc122 = loc("xoffset"(#loc2)) +#loc123 = loc("xmask"(#loc3)) +#loc124 = loc("r0_index"(#loc4)) +#loc125 = loc("tmp0"(#loc5)) +#loc126 = loc("tmp0"(#loc6)) +#loc127 = loc("tmp0"(#loc7)) +#loc128 = loc("tmp0"(#loc8)) +#loc129 = loc("tmp2"(#loc9)) +#loc130 = loc("tmp4"(#loc10)) +#loc131 = loc("tmp5"(#loc11)) +#loc132 = loc("tmp7"(#loc12)) +#loc133 = loc("tmp6"(#loc13)) +#loc134 = loc("tmp9"(#loc14)) +#loc135 = loc("flip"(#loc15)) +#loc137 = loc("flip"(#loc18)) +#loc138 = loc("flip"(#loc19)) +#loc139 = loc("y"(#loc20)) +#loc140 = loc("left_mask"(#loc22)) +#loc141 = loc("ileft"(#loc23)) +#loc143 = loc("ileft"(#loc27)) +#loc144 = loc("ileft"(#loc28)) +#loc145 = loc("iright"(#loc29)) +#loc147 = loc("iright"(#loc31)) +#loc148 = loc("iright"(#loc32)) +#loc149 = loc("ileft"(#loc33)) +#loc150 = loc("iright"(#loc34)) +#loc151 = loc("y_idx"(#loc35)) +#loc152 = loc("left_idx"(#loc36)) +#loc153 = loc("left_idx"(#loc37)) +#loc154 = loc("input"(#loc38)) +#loc156 = loc("left_idx"(#loc40)) +#loc157 = loc("left_idx"(#loc41)) +#loc158 = loc("right_idx"(#loc42)) +#loc159 = loc("right_idx"(#loc43)) +#loc161 = loc("right_idx"(#loc45)) +#loc162 = loc("right_idx"(#loc46)) +#loc163 = loc("left_idx"(#loc47)) +#loc164 = loc("right_idx"(#loc48)) +#loc165 = loc("cond"(#loc49)) +#loc166 = loc("eq"(#loc50)) +#loc167 = loc("cond"(#loc51)) +#loc168 = loc("cond"(#loc52)) +#loc169 = loc("cond"(#loc53)) +#loc170 = loc("cond"(#loc54)) +#loc171 = loc("cond"(#loc55)) +#loc172 = loc("ret"(#loc56)) +#loc173 = loc("ret"(#loc57)) +#loc174 = loc("ret"(#loc58)) +#loc175 = loc("new_idxs"(#loc59)) +#loc176 = loc("new_idxs"(#loc60)) +#loc177 = loc("new_idxs"(#loc61)) +#loc178 = loc("tmp14"(#loc62)) +#loc179 = loc("tmp16"(#loc63)) +#loc180 = loc("tmp15"(#loc64)) +#loc182 = loc("tmp20"(#loc66)) +#loc183 = loc("tmp23"(#loc67)) +#loc185 = loc("tmp24"(#loc69)) +#loc186 = loc("tmp25"(#loc70)) +#loc187 = loc("tmp28"(#loc71)) +#loc189 = loc("tmp29"(#loc73)) +#loc190 = loc("tmp30"(#loc74)) +#loc191 = loc("tmp31"(#loc75)) +#loc192 = loc("tmp34"(#loc76)) +#loc193 = loc("tmp36"(#loc77)) +#loc194 = loc("tmp38"(#loc78)) +#loc195 = loc("tmp39"(#loc79)) +#loc196 = loc("tmp40"(#loc80)) +#loc197 = loc(fused[#loc85, #loc84]) +#loc198 = loc("tmp45"(#loc87)) +#loc199 = loc("tmp46"(#loc88)) +#loc200 = loc("tmp47"(#loc89)) +#loc201 = loc("tmp48"(#loc90)) +#loc202 = loc("tmp49"(#loc91)) +#loc203 = loc(fused[#loc104, #loc103]) +#loc204 = loc(fused[#loc126, #loc125]) +#loc205 = loc(fused[#loc128, #loc123]) +#loc206 = loc(fused[#loc132, #loc133]) +#loc207 = loc(callsite(#loc135 at #loc136)) +#loc208 = loc(callsite(#loc137 at #loc136)) +#loc209 = loc(callsite(#loc138 at #loc136)) +#loc211 = loc("cond"(#loc165)) +#loc212 = loc("eq"(#loc166)) +#loc213 = loc(fused[#loc179, #loc180]) +#loc215 = loc(fused[#loc182, #loc132, #loc133]) +#loc216 = loc(callsite(#loc24 at #loc184)) +#loc218 = loc(fused[#loc186, #loc179, #loc180]) +#loc219 = loc(callsite(#loc24 at #loc188)) +#loc221 = loc(fused[#loc84, #loc123]) +#loc222 = loc(callsite(#loc139 at #loc210)) +#loc223 = loc(callsite(#loc140 at #loc210)) +#loc224 = loc(callsite(#loc141 at #loc210)) +#loc226 = loc(callsite(#loc143 at #loc210)) +#loc227 = loc(callsite(#loc144 at #loc210)) +#loc228 = loc(callsite(#loc145 at #loc210)) +#loc230 = loc(callsite(#loc147 at #loc210)) +#loc231 = loc(callsite(#loc148 at #loc210)) +#loc232 = loc(callsite(#loc149 at #loc210)) +#loc233 = loc(callsite(#loc150 at #loc210)) +#loc234 = loc(callsite(#loc151 at #loc210)) +#loc235 = loc(callsite(#loc152 at #loc210)) +#loc236 = loc(callsite(#loc153 at #loc210)) +#loc238 = loc(callsite(#loc156 at #loc210)) +#loc239 = loc(callsite(#loc157 at #loc210)) +#loc240 = loc(callsite(#loc158 at #loc210)) +#loc241 = loc(callsite(#loc159 at #loc210)) +#loc243 = loc(callsite(#loc161 at #loc210)) +#loc244 = loc(callsite(#loc162 at #loc210)) +#loc245 = loc(callsite(#loc163 at #loc210)) +#loc246 = loc(callsite(#loc164 at #loc210)) +#loc247 = loc(callsite(#loc211 at #loc210)) +#loc248 = loc(callsite(#loc212 at #loc210)) +#loc249 = loc(callsite(#loc167 at #loc210)) +#loc250 = loc(callsite(#loc168 at #loc210)) +#loc251 = loc(callsite(#loc169 at #loc210)) +#loc252 = loc(callsite(#loc170 at #loc210)) +#loc253 = loc(callsite(#loc171 at #loc210)) +#loc254 = loc(callsite(#loc172 at #loc210)) +#loc255 = loc(callsite(#loc173 at #loc210)) +#loc256 = loc(callsite(#loc174 at #loc210)) +#loc257 = loc(callsite(#loc175 at #loc210)) +#loc258 = loc(callsite(#loc176 at #loc210)) +#loc259 = loc(callsite(#loc177 at #loc210)) +#loc260 = loc(callsite(#loc139 at #loc214)) +#loc261 = loc(callsite(#loc141 at #loc214)) +#loc263 = loc(callsite(#loc143 at #loc214)) +#loc264 = loc(callsite(#loc144 at #loc214)) +#loc265 = loc(callsite(#loc145 at #loc214)) +#loc267 = loc(callsite(#loc147 at #loc214)) +#loc268 = loc(callsite(#loc148 at #loc214)) +#loc269 = loc(callsite(#loc149 at #loc214)) +#loc270 = loc(callsite(#loc150 at #loc214)) +#loc271 = loc(callsite(#loc211 at #loc214)) +#loc272 = loc(callsite(#loc212 at #loc214)) +#loc273 = loc(callsite(#loc168 at #loc214)) +#loc274 = loc(callsite(#loc169 at #loc214)) +#loc275 = loc(callsite(#loc170 at #loc214)) +#loc276 = loc(callsite(#loc171 at #loc214)) +#loc277 = loc(callsite(#loc172 at #loc214)) +#loc278 = loc(callsite(#loc173 at #loc214)) +#loc279 = loc(callsite(#loc174 at #loc214)) +#loc280 = loc(callsite(#loc176 at #loc214)) +#loc281 = loc(callsite(#loc177 at #loc214)) +#loc282 = loc(callsite(#loc151 at #loc214)) +#loc283 = loc(callsite(#loc153 at #loc214)) +#loc285 = loc(callsite(#loc156 at #loc214)) +#loc286 = loc(callsite(#loc157 at #loc214)) +#loc287 = loc(callsite(#loc159 at #loc214)) +#loc289 = loc(callsite(#loc161 at #loc214)) +#loc290 = loc(callsite(#loc162 at #loc214)) +#loc291 = loc(callsite(#loc163 at #loc214)) +#loc292 = loc(callsite(#loc164 at #loc214)) +#loc293 = loc(callsite(#loc167 at #loc214)) +#loc294 = loc(callsite(#loc175 at #loc214)) +#loc295 = loc(callsite(#loc26 at #loc216)) +#loc296 = loc(callsite(#loc26 at #loc219)) +#loc297 = loc(callsite(#loc24 at #loc225)) +#loc299 = loc(callsite(#loc24 at #loc229)) +#loc301 = loc(callsite(#loc154 at #loc237)) +#loc302 = loc(callsite(#loc24 at #loc237)) +#loc304 = loc(callsite(#loc154 at #loc242)) +#loc305 = loc(callsite(#loc24 at #loc242)) +#loc307 = loc(callsite(#loc24 at #loc262)) +#loc309 = loc(callsite(#loc24 at #loc266)) +#loc311 = loc(callsite(#loc24 at #loc284)) +#loc313 = loc(callsite(#loc24 at #loc288)) +#loc315 = loc(callsite(#loc26 at #loc297)) +#loc316 = loc(callsite(#loc26 at #loc299)) +#loc317 = loc(callsite(#loc26 at #loc302)) +#loc318 = loc(callsite(#loc26 at #loc305)) +#loc319 = loc(callsite(#loc26 at #loc307)) +#loc320 = loc(callsite(#loc26 at #loc309)) +#loc321 = loc(callsite(#loc26 at #loc311)) +#loc322 = loc(callsite(#loc26 at #loc313)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..213bf9f89cf7635420d2d0d5b3a76f6424eed3a6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1437 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":18:0) +#loc1 = loc(unknown) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":46:71) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":51:71) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:26) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:26) +#loc115 = loc("in_ptr0"(#loc)) +#loc116 = loc("out_ptr4"(#loc)) +#loc117 = loc("out_ptr5"(#loc)) +#loc118 = loc("out_ptr6"(#loc)) +#loc119 = loc("out_ptr7"(#loc)) +#loc120 = loc("out_ptr8"(#loc)) +#loc121 = loc("out_ptr9"(#loc)) +#loc122 = loc("xnumel"(#loc)) +#loc123 = loc("r0_numel"(#loc)) +#loc139 = loc(callsite(#loc17 at #loc18)) +#loc146 = loc("ileft"(#loc27)) +#loc150 = loc("iright"(#loc32)) +#loc159 = loc("left_idx"(#loc41)) +#loc164 = loc("right_idx"(#loc46)) +#loc185 = loc(callsite(#loc17 at #loc67)) +#loc188 = loc("tmp24"(#loc70)) +#loc192 = loc("tmp29"(#loc74)) +#loc215 = loc(callsite(#loc23 at #loc139)) +#loc219 = loc(callsite(#loc23 at #loc185)) +#loc222 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc1 at #loc192)) +#loc229 = loc(callsite(#loc146 at #loc215)) +#loc233 = loc(callsite(#loc150 at #loc215)) +#loc241 = loc(callsite(#loc159 at #loc215)) +#loc246 = loc(callsite(#loc164 at #loc215)) +#loc266 = loc(callsite(#loc146 at #loc219)) +#loc270 = loc(callsite(#loc150 at #loc219)) +#loc288 = loc(callsite(#loc159 at #loc219)) +#loc292 = loc(callsite(#loc164 at #loc219)) +#loc302 = loc(callsite(#loc1 at #loc229)) +#loc304 = loc(callsite(#loc1 at #loc233)) +#loc307 = loc(callsite(#loc1 at #loc241)) +#loc310 = loc(callsite(#loc1 at #loc246)) +#loc312 = loc(callsite(#loc1 at #loc266)) +#loc314 = loc(callsite(#loc1 at #loc270)) +#loc316 = loc(callsite(#loc1 at #loc288)) +#loc318 = loc(callsite(#loc1 at #loc292)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xmask = arith.constant 32 : i32 loc(#loc124) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %cst_2 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc125) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc124) + %xmask_7 = tt.splat %xmask_6 : i1 -> tensor<1x1xi1> loc(#loc124) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc126) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc127) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc128) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x16xi32> loc(#loc208) + %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x16xi32> loc(#loc129) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc130) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc130) + %tmp0_13 = tt.splat %xmask_6 : i1 -> tensor<1x16xi1> loc(#loc209) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_5 : tensor<1x16x!tt.ptr> loc(#loc131) + %tmp2 = arith.cmpi sgt, %tmp0_14, %cst_5 : tensor<1x16xi64> loc(#loc132) + %tmp4 = arith.cmpi slt, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc133) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1> loc(#loc134) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc210) + %tmp9 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc137) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc211) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc212) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc212) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc213) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc214) + %y = tt.reshape %tmp7 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc227) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc228) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc232) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc238) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc239) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc240) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc240) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc305) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc244) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc245) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc245) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc308) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc251) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc252) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc253) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc254) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc255) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc256) + %cond_47 = arith.cmpi ne, %cond_46, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc258) + %ret_48 = arith.select %cond_47, %ret, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_49 = arith.xori %tmp7, %ret_48 : tensor<1x16xi32> loc(#loc260) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc261) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_51 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc263) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc263) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc213) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc214) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc228) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc251) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc252) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc253) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc254) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc255) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_85 = arith.cmpi ne, %cond_84, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc258) + %ret_87 = arith.select %cond_85, %ret_86, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc260) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc261) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc263) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc251) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc252) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc253) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc254) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc255) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_121 = arith.cmpi ne, %cond_120, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc258) + %ret_123 = arith.select %cond_121, %ret_122, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc260) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc261) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc263) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc213) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc214) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc228) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc251) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc252) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc253) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc254) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc255) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_160 = arith.cmpi ne, %cond_159, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc258) + %ret_162 = arith.select %cond_160, %ret_161, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc260) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc261) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc263) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc251) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc252) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc253) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc254) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc255) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_196 = arith.cmpi ne, %cond_195, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc258) + %ret_198 = arith.select %cond_196, %ret_197, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc260) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc261) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc263) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc251) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc252) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc253) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc254) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc255) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_232 = arith.cmpi ne, %cond_231, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc258) + %ret_234 = arith.select %cond_232, %ret_233, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc260) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc261) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc263) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc226) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc228) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc228) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc301) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc230) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc231) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc232) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc303) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc234) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc235) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc238) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc240) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc306) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc242) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc243) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc245) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc309) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc247) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc248) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc251) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc252) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc253) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc254) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc255) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc258) + %ret_268 = arith.select %cond_266, %ret_267, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc260) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc261) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc263) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc251) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc252) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc253) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc254) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc255) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc258) + %ret_301 = arith.select %cond_299, %ret_300, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc260) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc261) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc263) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc251) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc252) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc253) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc254) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc255) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc258) + %ret_334 = arith.select %cond_332, %ret_333, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc260) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc261) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc263) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc251) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc252) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc253) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc254) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc255) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc261) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc263) + %tmp14 = arith.cmpi eq, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc182) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc218) + %y_369 = tt.reshape %tmp16 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_370 = arith.muli %y_369, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_373 = tt.broadcast %ileft_372 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_374 = arith.muli %y_369, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_377 = tt.broadcast %iright_376 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_378 = tt.reshape %ileft_373 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_379 = tt.reshape %iright_377 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %cond_380 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc275) + %eq_381 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc276) + %cond_382 = arith.andi %eq_381, %cond_42 : tensor<1x16xi1> loc(#loc277) + %cond_383 = arith.ori %cond_380, %cond_382 : tensor<1x16xi1> loc(#loc278) + %cond_384 = arith.extui %cond_383 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_385 = arith.xori %cond_384, %flip_18 : tensor<1x16xi32> loc(#loc279) + %cond_386 = arith.cmpi ne, %cond_385, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_387 = arith.xori %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc281) + %ret_388 = arith.select %cond_386, %ret_387, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_389 = arith.xori %tmp16, %ret_388 : tensor<1x16xi32> loc(#loc283) + %new_idxs_390 = arith.select %cond_386, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_391 = arith.xori %new_idxs_51, %new_idxs_390 : tensor<1x16xi32> loc(#loc285) + %y_392 = tt.reshape %ret_389 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_393 = arith.muli %y_392, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_394 = "tt.reduce"(%ileft_393) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_395 = tt.expand_dims %ileft_394 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_396 = tt.broadcast %ileft_395 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_397 = arith.muli %y_392, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_398 = "tt.reduce"(%iright_397) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_399 = tt.expand_dims %iright_398 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_400 = tt.broadcast %iright_399 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_401 = tt.reshape %ileft_396 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_402 = tt.reshape %iright_400 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_403 = tt.reshape %new_idxs_391 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_404 = arith.muli %y_idx_403, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_405 = "tt.reduce"(%left_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_406 = tt.expand_dims %left_idx_405 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_407 = tt.broadcast %left_idx_406 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_408 = arith.muli %y_idx_403, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_409 = "tt.reduce"(%right_idx_408) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_410 = tt.expand_dims %right_idx_409 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_411 = tt.broadcast %right_idx_410 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_412 = tt.reshape %left_idx_407 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_413 = tt.reshape %right_idx_411 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_414 = arith.cmpi slt, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc275) + %eq_415 = arith.cmpi eq, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc276) + %cond_416 = arith.cmpi sgt, %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc297) + %cond_417 = arith.andi %eq_415, %cond_416 : tensor<1x16xi1> loc(#loc277) + %cond_418 = arith.ori %cond_414, %cond_417 : tensor<1x16xi1> loc(#loc278) + %cond_419 = arith.extui %cond_418 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_420 = arith.xori %cond_419, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_421 = arith.cmpi ne, %cond_420, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_422 = arith.xori %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc281) + %ret_423 = arith.select %cond_421, %ret_422, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_424 = arith.xori %ret_389, %ret_423 : tensor<1x16xi32> loc(#loc283) + %new_idxs_425 = arith.xori %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc298) + %new_idxs_426 = arith.select %cond_421, %new_idxs_425, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_427 = arith.xori %new_idxs_391, %new_idxs_426 : tensor<1x16xi32> loc(#loc285) + %y_428 = tt.reshape %ret_424 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_429 = arith.muli %y_428, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_430 = "tt.reduce"(%ileft_429) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_431 = tt.expand_dims %ileft_430 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_432 = tt.broadcast %ileft_431 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_433 = arith.muli %y_428, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_434 = "tt.reduce"(%iright_433) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_435 = tt.expand_dims %iright_434 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_436 = tt.broadcast %iright_435 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_437 = tt.reshape %ileft_432 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_438 = tt.reshape %iright_436 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_439 = tt.reshape %new_idxs_427 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_440 = arith.muli %y_idx_439, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_441 = "tt.reduce"(%left_idx_440) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_442 = tt.expand_dims %left_idx_441 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_443 = tt.broadcast %left_idx_442 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_444 = arith.muli %y_idx_439, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_445 = "tt.reduce"(%right_idx_444) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_446 = tt.expand_dims %right_idx_445 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_447 = tt.broadcast %right_idx_446 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_448 = tt.reshape %left_idx_443 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_449 = tt.reshape %right_idx_447 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_450 = arith.cmpi slt, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc275) + %eq_451 = arith.cmpi eq, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc276) + %cond_452 = arith.cmpi sgt, %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc297) + %cond_453 = arith.andi %eq_451, %cond_452 : tensor<1x16xi1> loc(#loc277) + %cond_454 = arith.ori %cond_450, %cond_453 : tensor<1x16xi1> loc(#loc278) + %cond_455 = arith.extui %cond_454 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_456 = arith.xori %cond_455, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_457 = arith.cmpi ne, %cond_456, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_458 = arith.xori %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc281) + %ret_459 = arith.select %cond_457, %ret_458, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_460 = arith.xori %ret_424, %ret_459 : tensor<1x16xi32> loc(#loc283) + %new_idxs_461 = arith.xori %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc298) + %new_idxs_462 = arith.select %cond_457, %new_idxs_461, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_463 = arith.xori %new_idxs_427, %new_idxs_462 : tensor<1x16xi32> loc(#loc285) + %y_464 = tt.reshape %ret_460 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_465 = arith.muli %y_464, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_466 = "tt.reduce"(%ileft_465) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_467 = tt.expand_dims %ileft_466 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_468 = tt.broadcast %ileft_467 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_469 = arith.muli %y_464, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_470 = "tt.reduce"(%iright_469) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_471 = tt.expand_dims %iright_470 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_472 = tt.broadcast %iright_471 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_473 = tt.reshape %ileft_468 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_474 = tt.reshape %iright_472 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_475 = tt.reshape %new_idxs_463 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_476 = arith.muli %y_idx_475, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_477 = "tt.reduce"(%left_idx_476) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_478 = tt.expand_dims %left_idx_477 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_479 = tt.broadcast %left_idx_478 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_480 = arith.muli %y_idx_475, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_481 = "tt.reduce"(%right_idx_480) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_482 = tt.expand_dims %right_idx_481 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_483 = tt.broadcast %right_idx_482 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_484 = tt.reshape %left_idx_479 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_485 = tt.reshape %right_idx_483 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_486 = arith.cmpi slt, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc275) + %eq_487 = arith.cmpi eq, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc276) + %cond_488 = arith.cmpi sgt, %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc297) + %cond_489 = arith.andi %eq_487, %cond_488 : tensor<1x16xi1> loc(#loc277) + %cond_490 = arith.ori %cond_486, %cond_489 : tensor<1x16xi1> loc(#loc278) + %cond_491 = arith.extui %cond_490 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_492 = arith.xori %cond_491, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_493 = arith.cmpi ne, %cond_492, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_494 = arith.xori %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc281) + %ret_495 = arith.select %cond_493, %ret_494, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_496 = arith.xori %ret_460, %ret_495 : tensor<1x16xi32> loc(#loc283) + %new_idxs_497 = arith.xori %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc298) + %new_idxs_498 = arith.select %cond_493, %new_idxs_497, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_499 = arith.xori %new_idxs_463, %new_idxs_498 : tensor<1x16xi32> loc(#loc285) + %y_500 = tt.reshape %ret_496 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_501 = arith.muli %y_500, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_502 = "tt.reduce"(%ileft_501) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_503 = tt.expand_dims %ileft_502 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_504 = tt.broadcast %ileft_503 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_505 = arith.muli %y_500, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_506 = "tt.reduce"(%iright_505) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_507 = tt.expand_dims %iright_506 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_508 = tt.broadcast %iright_507 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_509 = tt.reshape %ileft_504 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_510 = tt.reshape %iright_508 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_511 = tt.reshape %new_idxs_499 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_512 = arith.muli %y_idx_511, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_513 = "tt.reduce"(%left_idx_512) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_514 = tt.expand_dims %left_idx_513 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_515 = tt.broadcast %left_idx_514 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_516 = arith.muli %y_idx_511, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_517 = "tt.reduce"(%right_idx_516) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_518 = tt.expand_dims %right_idx_517 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_519 = tt.broadcast %right_idx_518 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_520 = tt.reshape %left_idx_515 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_521 = tt.reshape %right_idx_519 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_522 = arith.cmpi slt, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc275) + %eq_523 = arith.cmpi eq, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc276) + %cond_524 = arith.cmpi sgt, %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc297) + %cond_525 = arith.andi %eq_523, %cond_524 : tensor<1x16xi1> loc(#loc277) + %cond_526 = arith.ori %cond_522, %cond_525 : tensor<1x16xi1> loc(#loc278) + %cond_527 = arith.extui %cond_526 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_528 = arith.xori %cond_527, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_529 = arith.cmpi ne, %cond_528, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_530 = arith.xori %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc281) + %ret_531 = arith.select %cond_529, %ret_530, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_532 = arith.xori %ret_496, %ret_531 : tensor<1x16xi32> loc(#loc283) + %new_idxs_533 = arith.xori %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc298) + %new_idxs_534 = arith.select %cond_529, %new_idxs_533, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_535 = arith.xori %new_idxs_499, %new_idxs_534 : tensor<1x16xi32> loc(#loc285) + %y_536 = tt.reshape %ret_532 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_537 = arith.muli %y_536, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_538 = "tt.reduce"(%ileft_537) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_539 = tt.expand_dims %ileft_538 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_540 = tt.broadcast %ileft_539 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_541 = arith.muli %y_536, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_542 = "tt.reduce"(%iright_541) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_543 = tt.expand_dims %iright_542 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_544 = tt.broadcast %iright_543 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_545 = tt.reshape %ileft_540 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_546 = tt.reshape %iright_544 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_547 = tt.reshape %new_idxs_535 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_548 = arith.muli %y_idx_547, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_549 = "tt.reduce"(%left_idx_548) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_550 = tt.expand_dims %left_idx_549 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_551 = tt.broadcast %left_idx_550 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_552 = arith.muli %y_idx_547, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_553 = "tt.reduce"(%right_idx_552) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_554 = tt.expand_dims %right_idx_553 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_555 = tt.broadcast %right_idx_554 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_556 = tt.reshape %left_idx_551 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_557 = tt.reshape %right_idx_555 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_558 = arith.cmpi slt, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc275) + %eq_559 = arith.cmpi eq, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc276) + %cond_560 = arith.cmpi sgt, %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc297) + %cond_561 = arith.andi %eq_559, %cond_560 : tensor<1x16xi1> loc(#loc277) + %cond_562 = arith.ori %cond_558, %cond_561 : tensor<1x16xi1> loc(#loc278) + %cond_563 = arith.extui %cond_562 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_564 = arith.xori %cond_563, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_565 = arith.cmpi ne, %cond_564, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_566 = arith.xori %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc281) + %ret_567 = arith.select %cond_565, %ret_566, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_568 = arith.xori %ret_532, %ret_567 : tensor<1x16xi32> loc(#loc283) + %new_idxs_569 = arith.xori %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc298) + %new_idxs_570 = arith.select %cond_565, %new_idxs_569, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_571 = arith.xori %new_idxs_535, %new_idxs_570 : tensor<1x16xi32> loc(#loc285) + %y_572 = tt.reshape %ret_568 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc264) + %ileft_573 = arith.muli %y_572, %ileft_240 : tensor<1x2x8xi32> loc(#loc265) + %ileft_574 = "tt.reduce"(%ileft_573) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc311) + %ileft_575 = tt.expand_dims %ileft_574 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc267) + %ileft_576 = tt.broadcast %ileft_575 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc268) + %iright_577 = arith.muli %y_572, %flip_128 : tensor<1x2x8xi32> loc(#loc269) + %iright_578 = "tt.reduce"(%iright_577) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc313) + %iright_579 = tt.expand_dims %iright_578 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc271) + %iright_580 = tt.broadcast %iright_579 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc272) + %ileft_581 = tt.reshape %ileft_576 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_582 = tt.reshape %iright_580 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_583 = tt.reshape %new_idxs_571 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc286) + %left_idx_584 = arith.muli %y_idx_583, %ileft_240 : tensor<1x2x8xi32> loc(#loc287) + %left_idx_585 = "tt.reduce"(%left_idx_584) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc315) + %left_idx_586 = tt.expand_dims %left_idx_585 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %left_idx_587 = tt.broadcast %left_idx_586 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %right_idx_588 = arith.muli %y_idx_583, %flip_128 : tensor<1x2x8xi32> loc(#loc291) + %right_idx_589 = "tt.reduce"(%right_idx_588) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc317) + %right_idx_590 = tt.expand_dims %right_idx_589 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc293) + %right_idx_591 = tt.broadcast %right_idx_590 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc294) + %left_idx_592 = tt.reshape %left_idx_587 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_593 = tt.reshape %right_idx_591 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_594 = arith.cmpi slt, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc275) + %eq_595 = arith.cmpi eq, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc276) + %cond_596 = arith.cmpi sgt, %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc297) + %cond_597 = arith.andi %eq_595, %cond_596 : tensor<1x16xi1> loc(#loc277) + %cond_598 = arith.ori %cond_594, %cond_597 : tensor<1x16xi1> loc(#loc278) + %ret_599 = arith.xori %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc281) + %ret_600 = arith.select %cond_598, %ret_599, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_601 = arith.xori %ret_568, %ret_600 : tensor<1x16xi32> loc(#loc283) + %new_idxs_602 = arith.xori %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc298) + %new_idxs_603 = arith.select %cond_598, %new_idxs_602, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_604 = arith.xori %new_idxs_571, %new_idxs_603 : tensor<1x16xi32> loc(#loc285) + %y_605 = tt.reshape %ret_601 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_606 = arith.muli %y_605, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_607 = "tt.reduce"(%ileft_606) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_608 = tt.expand_dims %ileft_607 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_609 = tt.broadcast %ileft_608 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_610 = arith.muli %y_605, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_611 = "tt.reduce"(%iright_610) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_612 = tt.expand_dims %iright_611 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_613 = tt.broadcast %iright_612 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_614 = tt.reshape %ileft_609 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_615 = tt.reshape %iright_613 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_616 = tt.reshape %new_idxs_604 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_617 = arith.muli %y_idx_616, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_618 = "tt.reduce"(%left_idx_617) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_619 = tt.expand_dims %left_idx_618 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_620 = tt.broadcast %left_idx_619 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_621 = arith.muli %y_idx_616, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_622 = "tt.reduce"(%right_idx_621) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_623 = tt.expand_dims %right_idx_622 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_624 = tt.broadcast %right_idx_623 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_625 = tt.reshape %left_idx_620 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_626 = tt.reshape %right_idx_624 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_627 = arith.cmpi slt, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc275) + %eq_628 = arith.cmpi eq, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc276) + %cond_629 = arith.cmpi sgt, %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc297) + %cond_630 = arith.andi %eq_628, %cond_629 : tensor<1x16xi1> loc(#loc277) + %cond_631 = arith.ori %cond_627, %cond_630 : tensor<1x16xi1> loc(#loc278) + %ret_632 = arith.xori %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc281) + %ret_633 = arith.select %cond_631, %ret_632, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_634 = arith.xori %ret_601, %ret_633 : tensor<1x16xi32> loc(#loc283) + %new_idxs_635 = arith.xori %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc298) + %new_idxs_636 = arith.select %cond_631, %new_idxs_635, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_637 = arith.xori %new_idxs_604, %new_idxs_636 : tensor<1x16xi32> loc(#loc285) + %y_638 = tt.reshape %ret_634 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_639 = arith.muli %y_638, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_640 = "tt.reduce"(%ileft_639) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_641 = tt.expand_dims %ileft_640 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_642 = tt.broadcast %ileft_641 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_643 = arith.muli %y_638, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_644 = "tt.reduce"(%iright_643) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_645 = tt.expand_dims %iright_644 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_646 = tt.broadcast %iright_645 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_647 = tt.reshape %ileft_642 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_648 = tt.reshape %iright_646 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_649 = tt.reshape %new_idxs_637 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_650 = arith.muli %y_idx_649, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_651 = "tt.reduce"(%left_idx_650) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_652 = tt.expand_dims %left_idx_651 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_653 = tt.broadcast %left_idx_652 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_654 = arith.muli %y_idx_649, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_655 = "tt.reduce"(%right_idx_654) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_656 = tt.expand_dims %right_idx_655 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_657 = tt.broadcast %right_idx_656 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_658 = tt.reshape %left_idx_653 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_659 = tt.reshape %right_idx_657 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_660 = arith.cmpi slt, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc275) + %eq_661 = arith.cmpi eq, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc276) + %cond_662 = arith.cmpi sgt, %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc297) + %cond_663 = arith.andi %eq_661, %cond_662 : tensor<1x16xi1> loc(#loc277) + %cond_664 = arith.ori %cond_660, %cond_663 : tensor<1x16xi1> loc(#loc278) + %ret_665 = arith.xori %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc281) + %ret_666 = arith.select %cond_664, %ret_665, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_667 = arith.xori %ret_634, %ret_666 : tensor<1x16xi32> loc(#loc283) + %new_idxs_668 = arith.xori %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc298) + %new_idxs_669 = arith.select %cond_664, %new_idxs_668, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_670 = arith.xori %new_idxs_637, %new_idxs_669 : tensor<1x16xi32> loc(#loc285) + %y_671 = tt.reshape %ret_667 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_672 = arith.muli %y_671, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_673 = "tt.reduce"(%ileft_672) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_674 = tt.expand_dims %ileft_673 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_675 = tt.broadcast %ileft_674 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_676 = arith.muli %y_671, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_677 = "tt.reduce"(%iright_676) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_678 = tt.expand_dims %iright_677 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_679 = tt.broadcast %iright_678 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_680 = tt.reshape %ileft_675 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_681 = tt.reshape %iright_679 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_682 = tt.reshape %new_idxs_670 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_683 = arith.muli %y_idx_682, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_684 = "tt.reduce"(%left_idx_683) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_685 = tt.expand_dims %left_idx_684 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_686 = tt.broadcast %left_idx_685 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_687 = arith.muli %y_idx_682, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_688 = "tt.reduce"(%right_idx_687) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_689 = tt.expand_dims %right_idx_688 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_690 = tt.broadcast %right_idx_689 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_691 = tt.reshape %left_idx_686 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_692 = tt.reshape %right_idx_690 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_693 = arith.cmpi slt, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc275) + %eq_694 = arith.cmpi eq, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc276) + %cond_695 = arith.cmpi sgt, %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc297) + %cond_696 = arith.andi %eq_694, %cond_695 : tensor<1x16xi1> loc(#loc277) + %cond_697 = arith.ori %cond_693, %cond_696 : tensor<1x16xi1> loc(#loc278) + %new_idxs_698 = arith.xori %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc298) + %new_idxs_699 = arith.select %cond_697, %new_idxs_698, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_700 = arith.xori %new_idxs_670, %new_idxs_699 : tensor<1x16xi32> loc(#loc285) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc220) + %tmp23 = arith.select %tmp0_13, %tmp20, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc187) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_705: i64 loc(callsite(#loc1 at #loc188)), %tmp24_706: i64 loc(callsite(#loc1 at #loc188))): + %tmp24_707 = arith.addi %tmp24_705, %tmp24_706 : i64 loc(#loc299) + tt.reduce.return %tmp24_707 : i64 loc(#loc221) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc221) + %tmp24_701 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc189) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc223) + %tmp28 = arith.select %tmp0_13, %tmp25, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc191) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_705: i64 loc(callsite(#loc1 at #loc192)), %tmp29_706: i64 loc(callsite(#loc1 at #loc192))): + %tmp29_707 = arith.addi %tmp29_705, %tmp29_706 : i64 loc(#loc300) + tt.reduce.return %tmp29_707 : i64 loc(#loc224) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc224) + %tmp29_702 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc193) + %tmp30 = arith.trunci %tmp24_701 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc194) + %tmp31 = arith.trunci %tmp29_702 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc195) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc196) + %tmp34_703 = arith.cmpi slt, %r0_index_8, %tmp34 : tensor<1x16xi32> loc(#loc196) + %tmp36 = arith.select %tmp34_703, %new_idxs_368, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc197) + %tmp38 = arith.addi %tmp36, %cst_2 : tensor<1x16xi32> loc(#loc198) + %tmp39 = arith.cmpi slt, %tmp36, %cst_1 : tensor<1x16xi32> loc(#loc199) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc200) + %0 = arith.cmpi sge, %tmp40, %cst_1 : tensor<1x16xi32> loc(#loc83) + %1 = arith.cmpi slt, %tmp40, %cst_2 : tensor<1x16xi32> loc(#loc84) + %2 = arith.andi %0, %1 : tensor<1x16xi1> loc(#loc85) + %3 = arith.xori %xmask_6, %true : i1 loc(#loc86) + %4 = tt.splat %3 : i1 -> tensor<1x16xi1> loc(#loc201) + %5 = arith.ori %2, %4 : tensor<1x16xi1> loc(#loc87) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc88) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc202) + %tmp45_704 = arith.cmpi slt, %r0_index_8, %tmp45 : tensor<1x16xi32> loc(#loc202) + %tmp46 = arith.select %tmp45_704, %new_idxs_700, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc203) + %tmp47 = arith.addi %tmp46, %cst_2 : tensor<1x16xi32> loc(#loc204) + %tmp48 = arith.cmpi slt, %tmp46, %cst_1 : tensor<1x16xi32> loc(#loc205) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc206) + %6 = arith.cmpi sge, %tmp49, %cst_1 : tensor<1x16xi32> loc(#loc94) + %7 = arith.cmpi slt, %tmp49, %cst_2 : tensor<1x16xi32> loc(#loc95) + %8 = arith.andi %6, %7 : tensor<1x16xi1> loc(#loc96) + %9 = arith.ori %8, %4 : tensor<1x16xi1> loc(#loc97) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc98) + %10 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc99) + %11 = tt.splat %10 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc99) + tt.store %11, %tmp30, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc100) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc101) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc101) + tt.store %13, %tmp31, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc102) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc103) + %15 = tt.addptr %14, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc103) + tt.store %15, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc104) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc105) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32> loc(#loc207) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32> loc(#loc106) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc107) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc107) + tt.store %20, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc108) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc109) + %22 = tt.addptr %21, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc109) + tt.store %22, %new_idxs_700, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc110) + %23 = arith.addi %tmp49, %17 : tensor<1x16xi32> loc(#loc111) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc112) + %25 = tt.addptr %24, %23 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc112) + tt.store %25, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc113) + tt.return loc(#loc114) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":27:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:30) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":34:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":36:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":38:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":39:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":41:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":40:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":43:19) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":47:20) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":49:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":52:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":54:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":55:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":56:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":58:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":59:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":60:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":61:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":64:19) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":66:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":68:20) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":69:20) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":70:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:28) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:46) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:38) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:55) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:53) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":71:63) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":75:19) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":76:35) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":77:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":78:20) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":79:35) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:28) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:46) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:38) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:53) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":80:63) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":81:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":82:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":83:47) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:52) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:49) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":84:85) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":85:47) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/7z/c7z2jbjub3aupgnechol65vkvi5ruwpylzosdbqvscdyxmreb3jy.py":86:4) +#loc124 = loc("xmask"(#loc2)) +#loc125 = loc("xoffset"(#loc3)) +#loc126 = loc("r0_index"(#loc4)) +#loc127 = loc("r0_index"(#loc5)) +#loc128 = loc("tmp0"(#loc6)) +#loc129 = loc("tmp0"(#loc7)) +#loc130 = loc("tmp0"(#loc8)) +#loc131 = loc("tmp0"(#loc9)) +#loc132 = loc("tmp2"(#loc10)) +#loc133 = loc("tmp4"(#loc11)) +#loc134 = loc("tmp5"(#loc12)) +#loc135 = loc("tmp7"(#loc13)) +#loc136 = loc("tmp6"(#loc14)) +#loc137 = loc("tmp9"(#loc15)) +#loc138 = loc("flip"(#loc16)) +#loc140 = loc("flip"(#loc19)) +#loc141 = loc("flip"(#loc20)) +#loc142 = loc("flip"(#loc21)) +#loc143 = loc("y"(#loc22)) +#loc144 = loc("left_mask"(#loc24)) +#loc145 = loc("ileft"(#loc25)) +#loc147 = loc("ileft"(#loc29)) +#loc148 = loc("ileft"(#loc30)) +#loc149 = loc("iright"(#loc31)) +#loc151 = loc("iright"(#loc33)) +#loc152 = loc("iright"(#loc34)) +#loc153 = loc("ileft"(#loc35)) +#loc154 = loc("iright"(#loc36)) +#loc155 = loc("y_idx"(#loc37)) +#loc156 = loc("left_idx"(#loc38)) +#loc157 = loc("left_idx"(#loc39)) +#loc158 = loc("input"(#loc40)) +#loc160 = loc("left_idx"(#loc42)) +#loc161 = loc("left_idx"(#loc43)) +#loc162 = loc("right_idx"(#loc44)) +#loc163 = loc("right_idx"(#loc45)) +#loc165 = loc("right_idx"(#loc47)) +#loc166 = loc("right_idx"(#loc48)) +#loc167 = loc("left_idx"(#loc49)) +#loc168 = loc("right_idx"(#loc50)) +#loc169 = loc("cond"(#loc51)) +#loc170 = loc("eq"(#loc52)) +#loc171 = loc("cond"(#loc53)) +#loc172 = loc("cond"(#loc54)) +#loc173 = loc("cond"(#loc55)) +#loc174 = loc("cond"(#loc56)) +#loc175 = loc("cond"(#loc57)) +#loc176 = loc("ret"(#loc58)) +#loc177 = loc("ret"(#loc59)) +#loc178 = loc("ret"(#loc60)) +#loc179 = loc("new_idxs"(#loc61)) +#loc180 = loc("new_idxs"(#loc62)) +#loc181 = loc("new_idxs"(#loc63)) +#loc182 = loc("tmp14"(#loc64)) +#loc183 = loc("tmp16"(#loc65)) +#loc184 = loc("tmp15"(#loc66)) +#loc186 = loc("tmp20"(#loc68)) +#loc187 = loc("tmp23"(#loc69)) +#loc189 = loc("tmp24"(#loc71)) +#loc190 = loc("tmp25"(#loc72)) +#loc191 = loc("tmp28"(#loc73)) +#loc193 = loc("tmp29"(#loc75)) +#loc194 = loc("tmp30"(#loc76)) +#loc195 = loc("tmp31"(#loc77)) +#loc196 = loc("tmp34"(#loc78)) +#loc197 = loc("tmp36"(#loc79)) +#loc198 = loc("tmp38"(#loc80)) +#loc199 = loc("tmp39"(#loc81)) +#loc200 = loc("tmp40"(#loc82)) +#loc201 = loc(fused[#loc87, #loc86]) +#loc202 = loc("tmp45"(#loc89)) +#loc203 = loc("tmp46"(#loc90)) +#loc204 = loc("tmp47"(#loc91)) +#loc205 = loc("tmp48"(#loc92)) +#loc206 = loc("tmp49"(#loc93)) +#loc207 = loc(fused[#loc106, #loc105]) +#loc208 = loc(fused[#loc129, #loc128]) +#loc209 = loc(fused[#loc131, #loc124]) +#loc210 = loc(fused[#loc135, #loc136]) +#loc211 = loc(callsite(#loc138 at #loc139)) +#loc212 = loc(callsite(#loc140 at #loc139)) +#loc213 = loc(callsite(#loc141 at #loc139)) +#loc214 = loc(callsite(#loc142 at #loc139)) +#loc216 = loc("cond"(#loc169)) +#loc217 = loc("eq"(#loc170)) +#loc218 = loc(fused[#loc183, #loc184]) +#loc220 = loc(fused[#loc186, #loc135, #loc136]) +#loc221 = loc(callsite(#loc26 at #loc188)) +#loc223 = loc(fused[#loc190, #loc183, #loc184]) +#loc224 = loc(callsite(#loc26 at #loc192)) +#loc226 = loc(callsite(#loc143 at #loc215)) +#loc227 = loc(callsite(#loc144 at #loc215)) +#loc228 = loc(callsite(#loc145 at #loc215)) +#loc230 = loc(callsite(#loc147 at #loc215)) +#loc231 = loc(callsite(#loc148 at #loc215)) +#loc232 = loc(callsite(#loc149 at #loc215)) +#loc234 = loc(callsite(#loc151 at #loc215)) +#loc235 = loc(callsite(#loc152 at #loc215)) +#loc236 = loc(callsite(#loc153 at #loc215)) +#loc237 = loc(callsite(#loc154 at #loc215)) +#loc238 = loc(callsite(#loc155 at #loc215)) +#loc239 = loc(callsite(#loc156 at #loc215)) +#loc240 = loc(callsite(#loc157 at #loc215)) +#loc242 = loc(callsite(#loc160 at #loc215)) +#loc243 = loc(callsite(#loc161 at #loc215)) +#loc244 = loc(callsite(#loc162 at #loc215)) +#loc245 = loc(callsite(#loc163 at #loc215)) +#loc247 = loc(callsite(#loc165 at #loc215)) +#loc248 = loc(callsite(#loc166 at #loc215)) +#loc249 = loc(callsite(#loc167 at #loc215)) +#loc250 = loc(callsite(#loc168 at #loc215)) +#loc251 = loc(callsite(#loc216 at #loc215)) +#loc252 = loc(callsite(#loc217 at #loc215)) +#loc253 = loc(callsite(#loc171 at #loc215)) +#loc254 = loc(callsite(#loc172 at #loc215)) +#loc255 = loc(callsite(#loc173 at #loc215)) +#loc256 = loc(callsite(#loc174 at #loc215)) +#loc257 = loc(callsite(#loc175 at #loc215)) +#loc258 = loc(callsite(#loc176 at #loc215)) +#loc259 = loc(callsite(#loc177 at #loc215)) +#loc260 = loc(callsite(#loc178 at #loc215)) +#loc261 = loc(callsite(#loc179 at #loc215)) +#loc262 = loc(callsite(#loc180 at #loc215)) +#loc263 = loc(callsite(#loc181 at #loc215)) +#loc264 = loc(callsite(#loc143 at #loc219)) +#loc265 = loc(callsite(#loc145 at #loc219)) +#loc267 = loc(callsite(#loc147 at #loc219)) +#loc268 = loc(callsite(#loc148 at #loc219)) +#loc269 = loc(callsite(#loc149 at #loc219)) +#loc271 = loc(callsite(#loc151 at #loc219)) +#loc272 = loc(callsite(#loc152 at #loc219)) +#loc273 = loc(callsite(#loc153 at #loc219)) +#loc274 = loc(callsite(#loc154 at #loc219)) +#loc275 = loc(callsite(#loc216 at #loc219)) +#loc276 = loc(callsite(#loc217 at #loc219)) +#loc277 = loc(callsite(#loc172 at #loc219)) +#loc278 = loc(callsite(#loc173 at #loc219)) +#loc279 = loc(callsite(#loc174 at #loc219)) +#loc280 = loc(callsite(#loc175 at #loc219)) +#loc281 = loc(callsite(#loc176 at #loc219)) +#loc282 = loc(callsite(#loc177 at #loc219)) +#loc283 = loc(callsite(#loc178 at #loc219)) +#loc284 = loc(callsite(#loc180 at #loc219)) +#loc285 = loc(callsite(#loc181 at #loc219)) +#loc286 = loc(callsite(#loc155 at #loc219)) +#loc287 = loc(callsite(#loc157 at #loc219)) +#loc289 = loc(callsite(#loc160 at #loc219)) +#loc290 = loc(callsite(#loc161 at #loc219)) +#loc291 = loc(callsite(#loc163 at #loc219)) +#loc293 = loc(callsite(#loc165 at #loc219)) +#loc294 = loc(callsite(#loc166 at #loc219)) +#loc295 = loc(callsite(#loc167 at #loc219)) +#loc296 = loc(callsite(#loc168 at #loc219)) +#loc297 = loc(callsite(#loc171 at #loc219)) +#loc298 = loc(callsite(#loc179 at #loc219)) +#loc299 = loc(callsite(#loc28 at #loc221)) +#loc300 = loc(callsite(#loc28 at #loc224)) +#loc301 = loc(callsite(#loc26 at #loc229)) +#loc303 = loc(callsite(#loc26 at #loc233)) +#loc305 = loc(callsite(#loc158 at #loc241)) +#loc306 = loc(callsite(#loc26 at #loc241)) +#loc308 = loc(callsite(#loc158 at #loc246)) +#loc309 = loc(callsite(#loc26 at #loc246)) +#loc311 = loc(callsite(#loc26 at #loc266)) +#loc313 = loc(callsite(#loc26 at #loc270)) +#loc315 = loc(callsite(#loc26 at #loc288)) +#loc317 = loc(callsite(#loc26 at #loc292)) +#loc319 = loc(callsite(#loc28 at #loc301)) +#loc320 = loc(callsite(#loc28 at #loc303)) +#loc321 = loc(callsite(#loc28 at #loc306)) +#loc322 = loc(callsite(#loc28 at #loc309)) +#loc323 = loc(callsite(#loc28 at #loc311)) +#loc324 = loc(callsite(#loc28 at #loc313)) +#loc325 = loc(callsite(#loc28 at #loc315)) +#loc326 = loc(callsite(#loc28 at #loc317)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b39d0acf31c17cad9409b50e21dfadd7389858ff --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..be6de662757432d08da0e6c69d02728ad15fb0a3 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cf3e86d59a02e44e959707a8c0a8ba5904a9a448 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json @@ -0,0 +1 @@ +{"hash": "6bc0a0a44b9f6f25ef0277ce55f93ab52e67f7a366e08ecb9da8e4dff0b5a738", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c3b4fb532dd2c92a161a5b4f60dab3bb66765eb1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir @@ -0,0 +1,424 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = icmp samesign ult i32 %7, 512, !dbg !8 + %9 = lshr i32 %7, 8, !dbg !9 + %10 = zext nneg i32 %9 to i64, !dbg !10 + %11 = getelementptr i64, ptr addrspace(1) %0, i64 %10, !dbg !10 + %12 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !11 + %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %11, i64 %12, i1 %8) #5, !dbg !11 + %14 = insertelement <4 x i64> poison, i64 %13, i64 0, !dbg !12 + %15 = shufflevector <4 x i64> %14, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !12 + %16 = shl i32 %7, 7, !dbg !13 + %17 = and i32 %16, 1920, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = and i32 %18, 127 + %20 = or disjoint i32 %17, %19 + %21 = zext nneg i32 %20 to i64 + %22 = icmp sgt i64 %13, %21 + %23 = insertelement <4 x i1> poison, i1 %22, i64 0, !dbg !15 + %24 = shufflevector <4 x i1> %23, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !15 + %25 = insertelement <4 x i1> poison, i1 %8, i64 0, !dbg !16 + %26 = shufflevector <4 x i1> %25, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !16 + %27 = zext nneg i32 %20 to i64, !dbg !17 + %28 = shl i32 %7, 3, !dbg !18 + %29 = and i32 %28, 1920, !dbg !18 + %30 = zext nneg i32 %29 to i64, !dbg !17 + %31 = and i32 %18, 511, !dbg !14 + %32 = zext nneg i32 %31 to i64, !dbg !17 + %33 = lshr i64 %32, 7, !dbg !19 + %34 = lshr i32 %18, 7, !dbg !19 + %35 = or disjoint i64 %33, %30, !dbg !20 + %36 = or disjoint i32 %29, %34, !dbg !20 + %37 = or i32 %36, 4, !dbg !20 + %38 = or disjoint i64 %35, 8, !dbg !20 + %39 = or i32 %36, 12, !dbg !20 + %40 = icmp samesign uge i64 %35, %27, !dbg !21 + %41 = icmp samesign uge i32 %37, %20, !dbg !21 + %42 = icmp samesign uge i64 %38, %27, !dbg !21 + %43 = icmp samesign uge i32 %39, %20, !dbg !21 + %44 = zext nneg i32 %37 to i64, !dbg !12 + %45 = zext nneg i32 %39 to i64, !dbg !12 + %46 = insertelement <4 x i64> poison, i64 %35, i64 0, !dbg !12 + %47 = insertelement <4 x i64> %46, i64 %44, i64 1, !dbg !12 + %48 = insertelement <4 x i64> %47, i64 %38, i64 2, !dbg !12 + %49 = insertelement <4 x i64> %48, i64 %45, i64 3, !dbg !12 + %50 = icmp sgt <4 x i64> %15, %49, !dbg !12 + %51 = insertelement <4 x i1> poison, i1 %40, i64 0, !dbg !15 + %52 = insertelement <4 x i1> %51, i1 %41, i64 1, !dbg !15 + %53 = insertelement <4 x i1> %52, i1 %42, i64 2, !dbg !15 + %54 = insertelement <4 x i1> %53, i1 %43, i64 3, !dbg !15 + %55 = and <4 x i1> %54, %50, !dbg !15 + %56 = and <4 x i1> %24, %55, !dbg !15 + %57 = select <4 x i1> %26, <4 x i1> %56, <4 x i1> zeroinitializer, !dbg !16 + %58 = zext <4 x i1> %57 to <4 x i64>, !dbg !16 + %59 = lshr i64 %32, 7, !dbg !19 + %60 = lshr i32 %18, 7, !dbg !19 + %61 = or disjoint i32 %60, 16, !dbg !19 + %62 = or disjoint i64 %59, %30, !dbg !20 + %63 = or disjoint i64 %62, 16, !dbg !20 + %64 = or disjoint i32 %29, %61, !dbg !20 + %65 = or i32 %64, 4, !dbg !20 + %66 = or disjoint i64 %62, 24, !dbg !20 + %67 = or i32 %64, 12, !dbg !20 + %68 = icmp samesign uge i64 %63, %27, !dbg !21 + %69 = icmp samesign uge i32 %65, %20, !dbg !21 + %70 = icmp samesign uge i64 %66, %27, !dbg !21 + %71 = icmp samesign uge i32 %67, %20, !dbg !21 + %72 = zext nneg i32 %65 to i64, !dbg !12 + %73 = zext nneg i32 %67 to i64, !dbg !12 + %74 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !12 + %75 = insertelement <4 x i64> %74, i64 %72, i64 1, !dbg !12 + %76 = insertelement <4 x i64> %75, i64 %66, i64 2, !dbg !12 + %77 = insertelement <4 x i64> %76, i64 %73, i64 3, !dbg !12 + %78 = icmp sgt <4 x i64> %15, %77, !dbg !12 + %79 = insertelement <4 x i1> poison, i1 %68, i64 0, !dbg !15 + %80 = insertelement <4 x i1> %79, i1 %69, i64 1, !dbg !15 + %81 = insertelement <4 x i1> %80, i1 %70, i64 2, !dbg !15 + %82 = insertelement <4 x i1> %81, i1 %71, i64 3, !dbg !15 + %83 = and <4 x i1> %82, %78, !dbg !15 + %84 = and <4 x i1> %24, %83, !dbg !15 + %85 = select <4 x i1> %26, <4 x i1> %84, <4 x i1> zeroinitializer, !dbg !16 + %86 = zext <4 x i1> %85 to <4 x i64>, !dbg !16 + %87 = add nuw nsw <4 x i64> %58, %86, !dbg !16 + %88 = lshr i64 %32, 7, !dbg !19 + %89 = lshr i32 %18, 7, !dbg !19 + %90 = or disjoint i32 %89, 32, !dbg !19 + %91 = or disjoint i64 %88, %30, !dbg !20 + %92 = or disjoint i64 %91, 32, !dbg !20 + %93 = or disjoint i32 %29, %90, !dbg !20 + %94 = or i32 %93, 4, !dbg !20 + %95 = or disjoint i64 %91, 40, !dbg !20 + %96 = or i32 %93, 12, !dbg !20 + %97 = icmp samesign uge i64 %92, %27, !dbg !21 + %98 = icmp samesign uge i32 %94, %20, !dbg !21 + %99 = icmp samesign uge i64 %95, %27, !dbg !21 + %100 = icmp samesign uge i32 %96, %20, !dbg !21 + %101 = zext nneg i32 %94 to i64, !dbg !12 + %102 = zext nneg i32 %96 to i64, !dbg !12 + %103 = insertelement <4 x i64> poison, i64 %92, i64 0, !dbg !12 + %104 = insertelement <4 x i64> %103, i64 %101, i64 1, !dbg !12 + %105 = insertelement <4 x i64> %104, i64 %95, i64 2, !dbg !12 + %106 = insertelement <4 x i64> %105, i64 %102, i64 3, !dbg !12 + %107 = icmp sgt <4 x i64> %15, %106, !dbg !12 + %108 = insertelement <4 x i1> poison, i1 %97, i64 0, !dbg !15 + %109 = insertelement <4 x i1> %108, i1 %98, i64 1, !dbg !15 + %110 = insertelement <4 x i1> %109, i1 %99, i64 2, !dbg !15 + %111 = insertelement <4 x i1> %110, i1 %100, i64 3, !dbg !15 + %112 = and <4 x i1> %111, %107, !dbg !15 + %113 = and <4 x i1> %24, %112, !dbg !15 + %114 = select <4 x i1> %26, <4 x i1> %113, <4 x i1> zeroinitializer, !dbg !16 + %115 = zext <4 x i1> %114 to <4 x i64>, !dbg !16 + %116 = add nuw nsw <4 x i64> %87, %115, !dbg !16 + %117 = lshr i64 %32, 7, !dbg !19 + %118 = lshr i32 %18, 7, !dbg !19 + %119 = or disjoint i32 %118, 48, !dbg !19 + %120 = or disjoint i64 %117, %30, !dbg !20 + %121 = or disjoint i64 %120, 48, !dbg !20 + %122 = or disjoint i32 %29, %119, !dbg !20 + %123 = or i32 %122, 4, !dbg !20 + %124 = or disjoint i64 %120, 56, !dbg !20 + %125 = or i32 %122, 12, !dbg !20 + %126 = icmp samesign uge i64 %121, %27, !dbg !21 + %127 = icmp samesign uge i32 %123, %20, !dbg !21 + %128 = icmp samesign uge i64 %124, %27, !dbg !21 + %129 = icmp samesign uge i32 %125, %20, !dbg !21 + %130 = zext nneg i32 %123 to i64, !dbg !12 + %131 = zext nneg i32 %125 to i64, !dbg !12 + %132 = insertelement <4 x i64> poison, i64 %121, i64 0, !dbg !12 + %133 = insertelement <4 x i64> %132, i64 %130, i64 1, !dbg !12 + %134 = insertelement <4 x i64> %133, i64 %124, i64 2, !dbg !12 + %135 = insertelement <4 x i64> %134, i64 %131, i64 3, !dbg !12 + %136 = icmp sgt <4 x i64> %15, %135, !dbg !12 + %137 = insertelement <4 x i1> poison, i1 %126, i64 0, !dbg !15 + %138 = insertelement <4 x i1> %137, i1 %127, i64 1, !dbg !15 + %139 = insertelement <4 x i1> %138, i1 %128, i64 2, !dbg !15 + %140 = insertelement <4 x i1> %139, i1 %129, i64 3, !dbg !15 + %141 = and <4 x i1> %140, %136, !dbg !15 + %142 = and <4 x i1> %24, %141, !dbg !15 + %143 = select <4 x i1> %26, <4 x i1> %142, <4 x i1> zeroinitializer, !dbg !16 + %144 = zext <4 x i1> %143 to <4 x i64>, !dbg !16 + %145 = add nuw nsw <4 x i64> %116, %144, !dbg !16 + %146 = lshr i64 %32, 7, !dbg !19 + %147 = lshr i32 %18, 7, !dbg !19 + %148 = or disjoint i32 %147, 64, !dbg !19 + %149 = or disjoint i64 %146, %30, !dbg !20 + %150 = or disjoint i64 %149, 64, !dbg !20 + %151 = or disjoint i32 %29, %148, !dbg !20 + %152 = or i32 %151, 4, !dbg !20 + %153 = or disjoint i64 %149, 72, !dbg !20 + %154 = or i32 %151, 12, !dbg !20 + %155 = icmp samesign uge i64 %150, %27, !dbg !21 + %156 = icmp samesign uge i32 %152, %20, !dbg !21 + %157 = icmp samesign uge i64 %153, %27, !dbg !21 + %158 = icmp samesign uge i32 %154, %20, !dbg !21 + %159 = zext nneg i32 %152 to i64, !dbg !12 + %160 = zext nneg i32 %154 to i64, !dbg !12 + %161 = insertelement <4 x i64> poison, i64 %150, i64 0, !dbg !12 + %162 = insertelement <4 x i64> %161, i64 %159, i64 1, !dbg !12 + %163 = insertelement <4 x i64> %162, i64 %153, i64 2, !dbg !12 + %164 = insertelement <4 x i64> %163, i64 %160, i64 3, !dbg !12 + %165 = icmp sgt <4 x i64> %15, %164, !dbg !12 + %166 = insertelement <4 x i1> poison, i1 %155, i64 0, !dbg !15 + %167 = insertelement <4 x i1> %166, i1 %156, i64 1, !dbg !15 + %168 = insertelement <4 x i1> %167, i1 %157, i64 2, !dbg !15 + %169 = insertelement <4 x i1> %168, i1 %158, i64 3, !dbg !15 + %170 = and <4 x i1> %169, %165, !dbg !15 + %171 = and <4 x i1> %24, %170, !dbg !15 + %172 = select <4 x i1> %26, <4 x i1> %171, <4 x i1> zeroinitializer, !dbg !16 + %173 = zext <4 x i1> %172 to <4 x i64>, !dbg !16 + %174 = add nuw nsw <4 x i64> %145, %173, !dbg !16 + %175 = lshr i64 %32, 7, !dbg !19 + %176 = lshr i32 %18, 7, !dbg !19 + %177 = or disjoint i32 %176, 80, !dbg !19 + %178 = or disjoint i64 %175, %30, !dbg !20 + %179 = or disjoint i64 %178, 80, !dbg !20 + %180 = or disjoint i32 %29, %177, !dbg !20 + %181 = or i32 %180, 4, !dbg !20 + %182 = or disjoint i64 %178, 88, !dbg !20 + %183 = or i32 %180, 12, !dbg !20 + %184 = icmp samesign uge i64 %179, %27, !dbg !21 + %185 = icmp samesign uge i32 %181, %20, !dbg !21 + %186 = icmp samesign uge i64 %182, %27, !dbg !21 + %187 = icmp samesign uge i32 %183, %20, !dbg !21 + %188 = zext nneg i32 %181 to i64, !dbg !12 + %189 = zext nneg i32 %183 to i64, !dbg !12 + %190 = insertelement <4 x i64> poison, i64 %179, i64 0, !dbg !12 + %191 = insertelement <4 x i64> %190, i64 %188, i64 1, !dbg !12 + %192 = insertelement <4 x i64> %191, i64 %182, i64 2, !dbg !12 + %193 = insertelement <4 x i64> %192, i64 %189, i64 3, !dbg !12 + %194 = icmp sgt <4 x i64> %15, %193, !dbg !12 + %195 = insertelement <4 x i1> poison, i1 %184, i64 0, !dbg !15 + %196 = insertelement <4 x i1> %195, i1 %185, i64 1, !dbg !15 + %197 = insertelement <4 x i1> %196, i1 %186, i64 2, !dbg !15 + %198 = insertelement <4 x i1> %197, i1 %187, i64 3, !dbg !15 + %199 = and <4 x i1> %198, %194, !dbg !15 + %200 = and <4 x i1> %24, %199, !dbg !15 + %201 = select <4 x i1> %26, <4 x i1> %200, <4 x i1> zeroinitializer, !dbg !16 + %202 = zext <4 x i1> %201 to <4 x i64>, !dbg !16 + %203 = add nuw nsw <4 x i64> %174, %202, !dbg !16 + %204 = lshr i64 %32, 7, !dbg !19 + %205 = lshr i32 %18, 7, !dbg !19 + %206 = or disjoint i32 %205, 96, !dbg !19 + %207 = or disjoint i64 %204, %30, !dbg !20 + %208 = or disjoint i64 %207, 96, !dbg !20 + %209 = or disjoint i32 %29, %206, !dbg !20 + %210 = or i32 %209, 4, !dbg !20 + %211 = or disjoint i64 %207, 104, !dbg !20 + %212 = or i32 %209, 12, !dbg !20 + %213 = icmp samesign uge i64 %208, %27, !dbg !21 + %214 = icmp samesign uge i32 %210, %20, !dbg !21 + %215 = icmp samesign uge i64 %211, %27, !dbg !21 + %216 = icmp samesign uge i32 %212, %20, !dbg !21 + %217 = zext nneg i32 %210 to i64, !dbg !12 + %218 = zext nneg i32 %212 to i64, !dbg !12 + %219 = insertelement <4 x i64> poison, i64 %208, i64 0, !dbg !12 + %220 = insertelement <4 x i64> %219, i64 %217, i64 1, !dbg !12 + %221 = insertelement <4 x i64> %220, i64 %211, i64 2, !dbg !12 + %222 = insertelement <4 x i64> %221, i64 %218, i64 3, !dbg !12 + %223 = icmp sgt <4 x i64> %15, %222, !dbg !12 + %224 = insertelement <4 x i1> poison, i1 %213, i64 0, !dbg !15 + %225 = insertelement <4 x i1> %224, i1 %214, i64 1, !dbg !15 + %226 = insertelement <4 x i1> %225, i1 %215, i64 2, !dbg !15 + %227 = insertelement <4 x i1> %226, i1 %216, i64 3, !dbg !15 + %228 = and <4 x i1> %227, %223, !dbg !15 + %229 = and <4 x i1> %24, %228, !dbg !15 + %230 = select <4 x i1> %26, <4 x i1> %229, <4 x i1> zeroinitializer, !dbg !16 + %231 = zext <4 x i1> %230 to <4 x i64>, !dbg !16 + %232 = add nuw nsw <4 x i64> %203, %231, !dbg !16 + %233 = lshr i64 %32, 7, !dbg !19 + %234 = lshr i32 %18, 7, !dbg !19 + %235 = or disjoint i32 %234, 112, !dbg !19 + %236 = or disjoint i64 %233, %30, !dbg !20 + %237 = or disjoint i64 %236, 112, !dbg !20 + %238 = or disjoint i32 %29, %235, !dbg !20 + %239 = or i32 %238, 4, !dbg !20 + %240 = or disjoint i64 %236, 120, !dbg !20 + %241 = or i32 %238, 12, !dbg !20 + %242 = icmp samesign uge i64 %237, %27, !dbg !21 + %243 = icmp samesign uge i32 %239, %20, !dbg !21 + %244 = icmp samesign uge i64 %240, %27, !dbg !21 + %245 = icmp samesign uge i32 %241, %20, !dbg !21 + %246 = zext nneg i32 %239 to i64, !dbg !12 + %247 = zext nneg i32 %241 to i64, !dbg !12 + %248 = insertelement <4 x i64> poison, i64 %237, i64 0, !dbg !12 + %249 = insertelement <4 x i64> %248, i64 %246, i64 1, !dbg !12 + %250 = insertelement <4 x i64> %249, i64 %240, i64 2, !dbg !12 + %251 = insertelement <4 x i64> %250, i64 %247, i64 3, !dbg !12 + %252 = icmp sgt <4 x i64> %15, %251, !dbg !12 + %253 = insertelement <4 x i1> poison, i1 %242, i64 0, !dbg !15 + %254 = insertelement <4 x i1> %253, i1 %243, i64 1, !dbg !15 + %255 = insertelement <4 x i1> %254, i1 %244, i64 2, !dbg !15 + %256 = insertelement <4 x i1> %255, i1 %245, i64 3, !dbg !15 + %257 = and <4 x i1> %256, %252, !dbg !15 + %258 = and <4 x i1> %24, %257, !dbg !15 + %259 = select <4 x i1> %26, <4 x i1> %258, <4 x i1> zeroinitializer, !dbg !16 + %260 = zext <4 x i1> %259 to <4 x i64>, !dbg !16 + %261 = add <4 x i64> %232, %260, !dbg !16 + %262 = and i32 %18, 31, !dbg !14 + %263 = lshr i32 %18, 5, !dbg !14 + %264 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %261), !dbg !22 + %extelt.offset = lshr i64 %264, 32, !dbg !26 + %265 = trunc nuw i64 %extelt.offset to i32, !dbg !26 + %266 = trunc i64 %264 to i32, !dbg !26 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 16, i32 31), !dbg !26 + %268 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 16, i32 31), !dbg !26 + %269 = insertelement <2 x i32> poison, i32 %267, i64 0, !dbg !26 + %270 = insertelement <2 x i32> %269, i32 %268, i64 1, !dbg !26 + %271 = bitcast <2 x i32> %270 to i64, !dbg !26 + %272 = add i64 %264, %271, !dbg !22 + %extelt.offset1 = lshr i64 %272, 32, !dbg !26 + %273 = trunc nuw i64 %extelt.offset1 to i32, !dbg !26 + %274 = trunc i64 %272 to i32, !dbg !26 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 8, i32 31), !dbg !26 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 8, i32 31), !dbg !26 + %277 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !26 + %278 = insertelement <2 x i32> %277, i32 %276, i64 1, !dbg !26 + %279 = bitcast <2 x i32> %278 to i64, !dbg !26 + %280 = add i64 %272, %279, !dbg !22 + %extelt.offset2 = lshr i64 %280, 32, !dbg !26 + %281 = trunc nuw i64 %extelt.offset2 to i32, !dbg !26 + %282 = trunc i64 %280 to i32, !dbg !26 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 4, i32 31), !dbg !26 + %284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 4, i32 31), !dbg !26 + %285 = insertelement <2 x i32> poison, i32 %283, i64 0, !dbg !26 + %286 = insertelement <2 x i32> %285, i32 %284, i64 1, !dbg !26 + %287 = bitcast <2 x i32> %286 to i64, !dbg !26 + %288 = add i64 %280, %287, !dbg !22 + %extelt.offset3 = lshr i64 %288, 32, !dbg !26 + %289 = trunc nuw i64 %extelt.offset3 to i32, !dbg !26 + %290 = trunc i64 %288 to i32, !dbg !26 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !26 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 2, i32 31), !dbg !26 + %293 = insertelement <2 x i32> poison, i32 %291, i64 0, !dbg !26 + %294 = insertelement <2 x i32> %293, i32 %292, i64 1, !dbg !26 + %295 = bitcast <2 x i32> %294 to i64, !dbg !26 + %296 = add i64 %288, %295, !dbg !22 + %extelt.offset4 = lshr i64 %296, 32, !dbg !26 + %297 = trunc nuw i64 %extelt.offset4 to i32, !dbg !26 + %298 = trunc i64 %296 to i32, !dbg !26 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 1, i32 31), !dbg !26 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 1, i32 31), !dbg !26 + %301 = insertelement <2 x i32> poison, i32 %299, i64 0, !dbg !26 + %302 = insertelement <2 x i32> %301, i32 %300, i64 1, !dbg !26 + %303 = bitcast <2 x i32> %302 to i64, !dbg !26 + %304 = add i64 %296, %303, !dbg !22 + %305 = and i32 %263, 15, !dbg !26 + %306 = icmp eq i32 %262, 0, !dbg !26 + %307 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %305, !dbg !26 + %308 = insertelement <1 x i64> poison, i64 %304, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %307, <1 x i64> %308, i1 %306) #5, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %309 = icmp samesign ult i32 %18, 16, !dbg !26 + %310 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %18, !dbg !26 + %311 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %310, i1 %309) #5, !dbg !26 + %extelt.offset5 = lshr i64 %311, 32, !dbg !26 + %312 = trunc nuw i64 %extelt.offset5 to i32, !dbg !26 + %313 = trunc i64 %311 to i32, !dbg !26 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 8, i32 31), !dbg !26 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 8, i32 31), !dbg !26 + %316 = insertelement <2 x i32> poison, i32 %314, i64 0, !dbg !26 + %317 = insertelement <2 x i32> %316, i32 %315, i64 1, !dbg !26 + %318 = bitcast <2 x i32> %317 to i64, !dbg !26 + %319 = add i64 %311, %318, !dbg !22 + %extelt.offset6 = lshr i64 %319, 32, !dbg !26 + %320 = trunc nuw i64 %extelt.offset6 to i32, !dbg !26 + %321 = trunc i64 %319 to i32, !dbg !26 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !26 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 4, i32 31), !dbg !26 + %324 = insertelement <2 x i32> poison, i32 %322, i64 0, !dbg !26 + %325 = insertelement <2 x i32> %324, i32 %323, i64 1, !dbg !26 + %326 = bitcast <2 x i32> %325 to i64, !dbg !26 + %327 = add i64 %319, %326, !dbg !22 + %extelt.offset7 = lshr i64 %327, 32, !dbg !26 + %328 = trunc nuw i64 %extelt.offset7 to i32, !dbg !26 + %329 = trunc i64 %327 to i32, !dbg !26 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 2, i32 31), !dbg !26 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 2, i32 31), !dbg !26 + %332 = insertelement <2 x i32> poison, i32 %330, i64 0, !dbg !26 + %333 = insertelement <2 x i32> %332, i32 %331, i64 1, !dbg !26 + %334 = bitcast <2 x i32> %333 to i64, !dbg !26 + %335 = add i64 %327, %334, !dbg !22 + %extelt.offset8 = lshr i64 %335, 32, !dbg !26 + %336 = trunc nuw i64 %extelt.offset8 to i32, !dbg !26 + %337 = trunc i64 %335 to i32, !dbg !26 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 1, i32 31), !dbg !26 + %339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 1, i32 31), !dbg !26 + %340 = insertelement <2 x i32> poison, i32 %338, i64 0, !dbg !26 + %341 = insertelement <2 x i32> %340, i32 %339, i64 1, !dbg !26 + %342 = bitcast <2 x i32> %341 to i64, !dbg !26 + %343 = add i64 %335, %342, !dbg !22 + %344 = icmp eq i32 %18, 0, !dbg !26 + %345 = insertelement <1 x i64> poison, i64 %343, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %310, <1 x i64> %345, i1 %344) #5, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %346 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !26 + %347 = zext nneg i32 %7 to i64, !dbg !27 + %348 = getelementptr i64, ptr addrspace(1) %1, i64 %347, !dbg !27 + %349 = icmp eq i32 %31, 0, !dbg !28 + %350 = and i1 %8, %349, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %346, ptr addrspace(1) %348, i1 %350) #5, !dbg !28 + ret void, !dbg !29 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0", linkageName: "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 25, column: 21, scope: !4) +!9 = !DILocation(line: 30, column: 19, scope: !4) +!10 = !DILocation(line: 31, column: 30, scope: !4) +!11 = !DILocation(line: 31, column: 35, scope: !4) +!12 = !DILocation(line: 45, column: 22, scope: !4) +!13 = !DILocation(line: 42, column: 26, scope: !4) +!14 = !DILocation(line: 26, column: 37, scope: !4) +!15 = !DILocation(line: 47, column: 22, scope: !4) +!16 = !DILocation(line: 70, column: 50, scope: !4) +!17 = !DILocation(line: 34, column: 40, scope: !4) +!18 = !DILocation(line: 41, column: 26, scope: !4) +!19 = !DILocation(line: 39, column: 27, scope: !4) +!20 = !DILocation(line: 41, column: 22, scope: !4) +!21 = !DILocation(line: 43, column: 23, scope: !4) +!22 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 71, column: 27, scope: !4) +!26 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 72, column: 25, scope: !4) +!28 = !DILocation(line: 72, column: 37, scope: !4) +!29 = !DILocation(line: 72, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6a6f3d2465dbcbc7d91ab8c8382e89d09a679026 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx @@ -0,0 +1,824 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 // -- Begin function triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 +.visible .entry triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_1, + .param .u32 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_2, + .param .u32 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<200>; + .reg .b32 %r<73>; + .reg .b64 %rd<154>; + .loc 1 18 0 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_0]; + ld.param.b64 %rd11, [triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 25 21 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:25:21 + setp.lt.u32 %p1, %r4, 512; + .loc 1 30 19 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:30:19 + shr.u32 %r5, %r4, 8; + .loc 1 31 30 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:31:30 + mad.wide.u32 %rd3, %r5, 8, %rd10; + .loc 1 31 35 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:31:35 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd2, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd2 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 42 26 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:42:26 + shl.b32 %r6, %r4, 7; + and.b32 %r7, %r6, 1920; + .loc 1 26 37 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:26:37 + mov.u32 %r8, %tid.x; + and.b32 %r9, %r8, 127; + or.b32 %r10, %r7, %r9; + cvt.u64.u32 %rd12, %r10; + setp.gt.s64 %p6, %rd2, %rd12; + .loc 1 41 26 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:26 + shl.b32 %r11, %r4, 3; + and.b32 %r12, %r11, 1920; + .loc 1 34 40 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:34:40 + cvt.u64.u32 %rd13, %r12; + .loc 1 26 37 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:26:37 + and.b32 %r13, %r8, 511; + .loc 1 34 40 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:34:40 + cvt.u64.u32 %rd14, %r13; + .loc 1 39 27 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:39:27 + shr.u64 %rd15, %rd14, 7; + shr.u32 %r14, %r8, 7; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd16, %rd15, %rd13; + or.b32 %r15, %r12, %r14; + or.b32 %r16, %r15, 4; + or.b64 %rd17, %rd16, 8; + or.b32 %r17, %r15, 12; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p7, %rd16, %rd12; + setp.ge.u32 %p8, %r16, %r10; + setp.ge.u64 %p9, %rd17, %rd12; + setp.ge.u32 %p10, %r17, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd18, %r16; + cvt.u64.u32 %rd19, %r17; + setp.gt.s64 %p11, %rd2, %rd18; + setp.gt.s64 %p12, %rd2, %rd19; + setp.gt.s64 %p13, %rd2, %rd16; + setp.gt.s64 %p14, %rd2, %rd17; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p15, %p14, %p6; + and.pred %p16, %p13, %p6; + and.pred %p17, %p12, %p6; + and.pred %p18, %p11, %p6; + and.pred %p19, %p18, %p8; + and.pred %p21, %p17, %p10; + and.pred %p23, %p16, %p7; + and.pred %p25, %p15, %p9; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p27, %p1, %p25; + and.pred %p28, %p1, %p23; + and.pred %p29, %p1, %p21; + and.pred %p30, %p1, %p19; + selp.b64 %rd20, 1, 0, %p30; + selp.b64 %rd21, 1, 0, %p29; + selp.b64 %rd22, 1, 0, %p28; + selp.b64 %rd23, 1, 0, %p27; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd24, %rd16, 16; + or.b32 %r18, %r15, 20; + or.b64 %rd25, %rd16, 24; + or.b32 %r19, %r15, 28; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p31, %rd24, %rd12; + setp.ge.u32 %p32, %r18, %r10; + setp.ge.u64 %p33, %rd25, %rd12; + setp.ge.u32 %p34, %r19, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd26, %r18; + cvt.u64.u32 %rd27, %r19; + setp.gt.s64 %p35, %rd2, %rd26; + setp.gt.s64 %p36, %rd2, %rd27; + setp.gt.s64 %p37, %rd2, %rd24; + setp.gt.s64 %p38, %rd2, %rd25; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p39, %p38, %p6; + and.pred %p40, %p37, %p6; + and.pred %p41, %p36, %p6; + and.pred %p42, %p35, %p6; + and.pred %p43, %p42, %p32; + and.pred %p45, %p41, %p34; + and.pred %p47, %p40, %p31; + and.pred %p49, %p39, %p33; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p51, %p1, %p49; + and.pred %p52, %p1, %p47; + and.pred %p53, %p1, %p45; + and.pred %p54, %p1, %p43; + selp.b64 %rd28, 1, 0, %p54; + selp.b64 %rd29, 1, 0, %p53; + selp.b64 %rd30, 1, 0, %p52; + selp.b64 %rd31, 1, 0, %p51; + add.s64 %rd32, %rd23, %rd31; + add.s64 %rd33, %rd22, %rd30; + add.s64 %rd34, %rd21, %rd29; + add.s64 %rd35, %rd20, %rd28; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd36, %rd16, 32; + or.b32 %r20, %r15, 36; + or.b64 %rd37, %rd16, 40; + or.b32 %r21, %r15, 44; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p55, %rd36, %rd12; + setp.ge.u32 %p56, %r20, %r10; + setp.ge.u64 %p57, %rd37, %rd12; + setp.ge.u32 %p58, %r21, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd38, %r20; + cvt.u64.u32 %rd39, %r21; + setp.gt.s64 %p59, %rd2, %rd37; + setp.gt.s64 %p60, %rd2, %rd36; + setp.gt.s64 %p61, %rd2, %rd39; + setp.gt.s64 %p62, %rd2, %rd38; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p63, %p62, %p6; + and.pred %p64, %p61, %p6; + and.pred %p65, %p60, %p6; + and.pred %p66, %p59, %p6; + and.pred %p67, %p66, %p57; + and.pred %p69, %p65, %p55; + and.pred %p71, %p64, %p58; + and.pred %p73, %p63, %p56; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p75, %p1, %p73; + and.pred %p76, %p1, %p71; + and.pred %p77, %p1, %p69; + and.pred %p78, %p1, %p67; + selp.b64 %rd40, 1, 0, %p78; + selp.b64 %rd41, 1, 0, %p77; + selp.b64 %rd42, 1, 0, %p76; + selp.b64 %rd43, 1, 0, %p75; + add.s64 %rd44, %rd35, %rd43; + add.s64 %rd45, %rd34, %rd42; + add.s64 %rd46, %rd33, %rd41; + add.s64 %rd47, %rd32, %rd40; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd48, %rd16, 48; + or.b32 %r22, %r15, 52; + or.b64 %rd49, %rd16, 56; + or.b32 %r23, %r15, 60; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p79, %rd48, %rd12; + setp.ge.u32 %p80, %r22, %r10; + setp.ge.u64 %p81, %rd49, %rd12; + setp.ge.u32 %p82, %r23, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd50, %r22; + cvt.u64.u32 %rd51, %r23; + setp.gt.s64 %p83, %rd2, %rd50; + setp.gt.s64 %p84, %rd2, %rd51; + setp.gt.s64 %p85, %rd2, %rd48; + setp.gt.s64 %p86, %rd2, %rd49; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p87, %p86, %p6; + and.pred %p88, %p85, %p6; + and.pred %p89, %p84, %p6; + and.pred %p90, %p83, %p6; + and.pred %p91, %p90, %p80; + and.pred %p93, %p89, %p82; + and.pred %p95, %p88, %p79; + and.pred %p97, %p87, %p81; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p99, %p1, %p97; + and.pred %p100, %p1, %p95; + and.pred %p101, %p1, %p93; + and.pred %p102, %p1, %p91; + selp.b64 %rd52, 1, 0, %p102; + selp.b64 %rd53, 1, 0, %p101; + selp.b64 %rd54, 1, 0, %p100; + selp.b64 %rd55, 1, 0, %p99; + add.s64 %rd56, %rd47, %rd55; + add.s64 %rd57, %rd46, %rd54; + add.s64 %rd58, %rd45, %rd53; + add.s64 %rd59, %rd44, %rd52; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd60, %rd16, 64; + or.b32 %r24, %r15, 68; + or.b64 %rd61, %rd16, 72; + or.b32 %r25, %r15, 76; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p103, %rd60, %rd12; + setp.ge.u32 %p104, %r24, %r10; + setp.ge.u64 %p105, %rd61, %rd12; + setp.ge.u32 %p106, %r25, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd62, %r24; + cvt.u64.u32 %rd63, %r25; + setp.gt.s64 %p107, %rd2, %rd61; + setp.gt.s64 %p108, %rd2, %rd60; + setp.gt.s64 %p109, %rd2, %rd63; + setp.gt.s64 %p110, %rd2, %rd62; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p111, %p110, %p6; + and.pred %p112, %p109, %p6; + and.pred %p113, %p108, %p6; + and.pred %p114, %p107, %p6; + and.pred %p115, %p114, %p105; + and.pred %p117, %p113, %p103; + and.pred %p119, %p112, %p106; + and.pred %p121, %p111, %p104; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p123, %p1, %p121; + and.pred %p124, %p1, %p119; + and.pred %p125, %p1, %p117; + and.pred %p126, %p1, %p115; + selp.b64 %rd64, 1, 0, %p126; + selp.b64 %rd65, 1, 0, %p125; + selp.b64 %rd66, 1, 0, %p124; + selp.b64 %rd67, 1, 0, %p123; + add.s64 %rd68, %rd59, %rd67; + add.s64 %rd69, %rd58, %rd66; + add.s64 %rd70, %rd57, %rd65; + add.s64 %rd71, %rd56, %rd64; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd72, %rd16, 80; + or.b32 %r26, %r15, 84; + or.b64 %rd73, %rd16, 88; + or.b32 %r27, %r15, 92; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p127, %rd72, %rd12; + setp.ge.u32 %p128, %r26, %r10; + setp.ge.u64 %p129, %rd73, %rd12; + setp.ge.u32 %p130, %r27, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd74, %r26; + cvt.u64.u32 %rd75, %r27; + setp.gt.s64 %p131, %rd2, %rd74; + setp.gt.s64 %p132, %rd2, %rd75; + setp.gt.s64 %p133, %rd2, %rd72; + setp.gt.s64 %p134, %rd2, %rd73; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p135, %p134, %p6; + and.pred %p136, %p133, %p6; + and.pred %p137, %p132, %p6; + and.pred %p138, %p131, %p6; + and.pred %p139, %p138, %p128; + and.pred %p141, %p137, %p130; + and.pred %p143, %p136, %p127; + and.pred %p145, %p135, %p129; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p147, %p1, %p145; + and.pred %p148, %p1, %p143; + and.pred %p149, %p1, %p141; + and.pred %p150, %p1, %p139; + selp.b64 %rd76, 1, 0, %p150; + selp.b64 %rd77, 1, 0, %p149; + selp.b64 %rd78, 1, 0, %p148; + selp.b64 %rd79, 1, 0, %p147; + add.s64 %rd80, %rd71, %rd79; + add.s64 %rd81, %rd70, %rd78; + add.s64 %rd82, %rd69, %rd77; + add.s64 %rd83, %rd68, %rd76; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd84, %rd16, 96; + or.b32 %r28, %r15, 100; + or.b64 %rd85, %rd16, 104; + or.b32 %r29, %r15, 108; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p151, %rd84, %rd12; + setp.ge.u32 %p152, %r28, %r10; + setp.ge.u64 %p153, %rd85, %rd12; + setp.ge.u32 %p154, %r29, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd86, %r28; + cvt.u64.u32 %rd87, %r29; + setp.gt.s64 %p155, %rd2, %rd85; + setp.gt.s64 %p156, %rd2, %rd84; + setp.gt.s64 %p157, %rd2, %rd87; + setp.gt.s64 %p158, %rd2, %rd86; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p159, %p158, %p6; + and.pred %p160, %p157, %p6; + and.pred %p161, %p156, %p6; + and.pred %p162, %p155, %p6; + and.pred %p163, %p162, %p153; + and.pred %p165, %p161, %p151; + and.pred %p167, %p160, %p154; + and.pred %p169, %p159, %p152; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p171, %p1, %p169; + and.pred %p172, %p1, %p167; + and.pred %p173, %p1, %p165; + and.pred %p174, %p1, %p163; + selp.b64 %rd88, 1, 0, %p174; + selp.b64 %rd89, 1, 0, %p173; + selp.b64 %rd90, 1, 0, %p172; + selp.b64 %rd91, 1, 0, %p171; + add.s64 %rd92, %rd83, %rd91; + add.s64 %rd93, %rd82, %rd90; + add.s64 %rd94, %rd81, %rd89; + add.s64 %rd95, %rd80, %rd88; + .loc 1 41 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:41:22 + or.b64 %rd96, %rd16, 112; + or.b32 %r30, %r15, 116; + or.b64 %rd97, %rd16, 120; + or.b32 %r31, %r15, 124; + .loc 1 43 23 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:43:23 + setp.ge.u64 %p175, %rd96, %rd12; + setp.ge.u32 %p176, %r30, %r10; + setp.ge.u64 %p177, %rd97, %rd12; + setp.ge.u32 %p178, %r31, %r10; + .loc 1 45 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:45:22 + cvt.u64.u32 %rd98, %r30; + cvt.u64.u32 %rd99, %r31; + setp.gt.s64 %p179, %rd2, %rd98; + setp.gt.s64 %p180, %rd2, %rd99; + setp.gt.s64 %p181, %rd2, %rd96; + setp.gt.s64 %p182, %rd2, %rd97; + .loc 1 47 22 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:47:22 + and.pred %p183, %p182, %p6; + and.pred %p184, %p181, %p6; + and.pred %p185, %p180, %p6; + and.pred %p186, %p179, %p6; + and.pred %p187, %p186, %p176; + and.pred %p189, %p185, %p178; + and.pred %p191, %p184, %p175; + and.pred %p193, %p183, %p177; + .loc 1 70 50 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:70:50 + and.pred %p195, %p1, %p193; + and.pred %p196, %p1, %p191; + and.pred %p197, %p1, %p189; + and.pred %p198, %p1, %p187; + selp.b64 %rd100, 1, 0, %p198; + selp.b64 %rd101, 1, 0, %p197; + selp.b64 %rd102, 1, 0, %p196; + selp.b64 %rd103, 1, 0, %p195; + add.s64 %rd104, %rd95, %rd103; + add.s64 %rd105, %rd94, %rd102; + add.s64 %rd106, %rd93, %rd101; + add.s64 %rd107, %rd92, %rd100; + .loc 1 26 37 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:26:37 + and.b32 %r32, %r8, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd108, %rd107, %rd106; + add.s64 %rd109, %rd105, %rd104; + add.s64 %rd110, %rd109, %rd108; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r33}, %rd110; + cvt.u32.u64 %r34, %rd110; + shfl.sync.bfly.b32 %r35, %r34, 16, 31, -1; + shfl.sync.bfly.b32 %r36, %r33, 16, 31, -1; + cvt.u64.u32 %rd111, %r35; + cvt.u64.u32 %rd112, %r36; + shl.b64 %rd113, %rd112, 32; + or.b64 %rd114, %rd111, %rd113; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd115, %rd110, %rd114; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r37}, %rd115; + cvt.u32.u64 %r38, %rd115; + shfl.sync.bfly.b32 %r39, %r38, 8, 31, -1; + shfl.sync.bfly.b32 %r40, %r37, 8, 31, -1; + cvt.u64.u32 %rd116, %r39; + cvt.u64.u32 %rd117, %r40; + shl.b64 %rd118, %rd117, 32; + or.b64 %rd119, %rd116, %rd118; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd120, %rd115, %rd119; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r41}, %rd120; + cvt.u32.u64 %r42, %rd120; + shfl.sync.bfly.b32 %r43, %r42, 4, 31, -1; + shfl.sync.bfly.b32 %r44, %r41, 4, 31, -1; + cvt.u64.u32 %rd121, %r43; + cvt.u64.u32 %rd122, %r44; + shl.b64 %rd123, %rd122, 32; + or.b64 %rd124, %rd121, %rd123; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd125, %rd120, %rd124; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r45}, %rd125; + cvt.u32.u64 %r46, %rd125; + shfl.sync.bfly.b32 %r47, %r46, 2, 31, -1; + shfl.sync.bfly.b32 %r48, %r45, 2, 31, -1; + cvt.u64.u32 %rd126, %r47; + cvt.u64.u32 %rd127, %r48; + shl.b64 %rd128, %rd127, 32; + or.b64 %rd129, %rd126, %rd128; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd130, %rd125, %rd129; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r49}, %rd130; + cvt.u32.u64 %r50, %rd130; + shfl.sync.bfly.b32 %r51, %r50, 1, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 1, 31, -1; + cvt.u64.u32 %rd131, %r51; + cvt.u64.u32 %rd132, %r52; + shl.b64 %rd133, %rd132, 32; + or.b64 %rd134, %rd131, %rd133; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd5, %rd130, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + setp.eq.b32 %p2, %r32, 0; + shr.u32 %r53, %r8, 2; + and.b32 %r54, %r53, 120; + mov.b32 %r55, global_smem; + add.s32 %r1, %r55, %r54; + // begin inline asm + @%p2 st.shared.b64 [ %r1 + 0 ], %rd5; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r8, 16; + shl.b32 %r56, %r8, 3; + add.s32 %r2, %r55, %r56; + // begin inline asm + @%p3 ld.shared.b64 %rd6, [ %r2 + 0 ]; + // end inline asm + mov.b64 {_, %r57}, %rd6; + cvt.u32.u64 %r58, %rd6; + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; + shfl.sync.bfly.b32 %r60, %r57, 8, 31, -1; + cvt.u64.u32 %rd135, %r59; + cvt.u64.u32 %rd136, %r60; + shl.b64 %rd137, %rd136, 32; + or.b64 %rd138, %rd135, %rd137; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd139, %rd6, %rd138; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r61}, %rd139; + cvt.u32.u64 %r62, %rd139; + shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1; + shfl.sync.bfly.b32 %r64, %r61, 4, 31, -1; + cvt.u64.u32 %rd140, %r63; + cvt.u64.u32 %rd141, %r64; + shl.b64 %rd142, %rd141, 32; + or.b64 %rd143, %rd140, %rd142; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd144, %rd139, %rd143; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r65}, %rd144; + cvt.u32.u64 %r66, %rd144; + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; + shfl.sync.bfly.b32 %r68, %r65, 2, 31, -1; + cvt.u64.u32 %rd145, %r67; + cvt.u64.u32 %rd146, %r68; + shl.b64 %rd147, %rd146, 32; + or.b64 %rd148, %rd145, %rd147; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd149, %rd144, %rd148; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + mov.b64 {_, %r69}, %rd149; + cvt.u32.u64 %r70, %rd149; + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + shfl.sync.bfly.b32 %r72, %r69, 1, 31, -1; + cvt.u64.u32 %rd150, %r71; + cvt.u64.u32 %rd151, %r72; + shl.b64 %rd152, %rd151, 32; + or.b64 %rd153, %rd150, %rd152; + .loc 2 261 15 // standard.py:261:15 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + add.s64 %rd7, %rd149, %rd153; + .loc 2 291 36 // standard.py:291:36 @[ cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:71:27 ] + setp.eq.b32 %p4, %r8, 0; + // begin inline asm + @%p4 st.shared.b64 [ %r2 + 0 ], %rd7; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd8, [global_smem]; +$L__tmp2: + .loc 1 72 25 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:72:25 + mad.wide.u32 %rd9, %r4, 8, %rd11; + .loc 1 72 37 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:72:37 + setp.eq.b32 %p199, %r13, 0; + and.pred %p5, %p1, %p199; + // begin inline asm + @%p5 st.global.b64 [ %rd9 + 0 ], { %rd8 }; + // end inline asm + .loc 1 72 4 // cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py:72:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 279 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x110 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 113 +.b8 55 +.b8 101 +.b8 100 +.b8 51 +.b8 110 +.b8 101 +.b8 100 +.b8 51 +.b8 98 +.b8 108 +.b8 55 +.b8 117 +.b8 109 +.b8 102 +.b8 103 +.b8 54 +.b8 101 +.b8 105 +.b8 97 +.b8 108 +.b8 97 +.b8 107 +.b8 106 +.b8 100 +.b8 119 +.b8 102 +.b8 55 +.b8 114 +.b8 104 +.b8 110 +.b8 111 +.b8 52 +.b8 109 +.b8 101 +.b8 97 +.b8 121 +.b8 119 +.b8 51 +.b8 52 +.b8 106 +.b8 97 +.b8 107 +.b8 107 +.b8 117 +.b8 122 +.b8 54 +.b8 98 +.b8 103 +.b8 97 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 113 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x61 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xec:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x101:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 71 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7755368d7c4ca8381e1213be625837170f2a9911 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source @@ -0,0 +1,308 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":18:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc68 = loc(unknown) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("xnumel"(#loc)) +#loc78 = loc("r0_numel"(#loc)) +#loc140 = loc("input"(#loc66)) +#loc141 = loc("a"(#loc71)) +#loc142 = loc("b"(#loc71)) +module { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 512 : i32 loc(#loc79) + %r0_numel_1 = arith.constant 16384 : i32 loc(#loc80) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_2 = arith.constant 1 : i32 loc(#loc82) + %xoffset_3 = arith.constant 1 : i32 loc(#loc82) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc83) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc84) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc85) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc85) + %xmask = arith.constant dense<512> : tensor<1x1xi32> loc(#loc86) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc86) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc87) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc88) + %x1 = arith.constant 16 : i32 loc(#loc89) + %x1_10 = arith.constant 16 : i32 loc(#loc89) + %x1_11 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc89) + %x1_12 = arith.divsi %xindex_7, %x1_11 : tensor<1x1xi32> loc(#loc89) + %x1_13 = arith.constant 16 : i32 loc(#loc90) + %x1_14 = arith.constant 16 : i32 loc(#loc90) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc90) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1x1xi32> loc(#loc90) + %x0 = arith.constant 16 : i32 loc(#loc91) + %x0_17 = arith.constant 16 : i32 loc(#loc91) + %x0_18 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc91) + %x0_19 = arith.remsi %xindex_7, %x0_18 : tensor<1x1xi32> loc(#loc91) + %x2 = arith.constant 256 : i32 loc(#loc92) + %x2_20 = arith.constant 256 : i32 loc(#loc92) + %x2_21 = arith.constant dense<256> : tensor<1x1xi32> loc(#loc92) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<1x1xi32> loc(#loc92) + %tmp3 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc93) + %tmp3_23 = tt.addptr %tmp3, %x2_22 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc93) + %tmp3_24 = tt.load %tmp3_23, %xmask_8 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc94) + %_tmp29 = arith.constant 0 : i64 loc(#loc95) + %_tmp29_25 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc95) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc18) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc18) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc18) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc18) + %3 = ub.poison : i32 loc(#loc18) + %_tmp29_26 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp29_28 = %_tmp29_25) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97) + %r0_index_29 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc97) + %r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc98) + %r0_mask_30 = arith.cmpi slt, %r0_index_29, %r0_mask : tensor<1x2048xi32> loc(#loc98) + %r0_4 = arith.constant 128 : i32 loc(#loc99) + %r0_4_31 = arith.constant 128 : i32 loc(#loc99) + %r0_4_32 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc99) + %r0_4_33 = arith.divsi %r0_index_29, %r0_4_32 : tensor<1x2048xi32> loc(#loc99) + %r0_3 = arith.constant 128 : i32 loc(#loc100) + %r0_3_34 = arith.constant 128 : i32 loc(#loc100) + %r0_3_35 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc100) + %r0_3_36 = arith.remsi %r0_index_29, %r0_3_35 : tensor<1x2048xi32> loc(#loc100) + %tmp0 = arith.constant 128 : i32 loc(#loc101) + %tmp0_37 = arith.constant 128 : i32 loc(#loc101) + %tmp0_38 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc101) + %tmp0_39 = arith.muli %tmp0_38, %x1_16 : tensor<1x1xi32> loc(#loc101) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc102) + %tmp0_41 = arith.addi %r0_4_33, %tmp0_40 : tensor<1x2048xi32> loc(#loc102) + %tmp1 = arith.constant 128 : i32 loc(#loc103) + %tmp1_42 = arith.constant 128 : i32 loc(#loc103) + %tmp1_43 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc103) + %tmp1_44 = arith.muli %tmp1_43, %x0_19 : tensor<1x1xi32> loc(#loc103) + %tmp1_45 = tt.broadcast %tmp1_44 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc104) + %tmp1_46 = arith.addi %r0_3_36, %tmp1_45 : tensor<1x2048xi32> loc(#loc104) + %tmp2 = arith.cmpi sge, %tmp0_41, %tmp1_46 : tensor<1x2048xi32> loc(#loc105) + %tmp4 = arith.extsi %tmp1_46 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc106) + %tmp4_47 = tt.broadcast %tmp3_24 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc106) + %tmp4_48 = arith.cmpi slt, %tmp4, %tmp4_47 : tensor<1x2048xi64> loc(#loc106) + %tmp5 = arith.extsi %tmp0_41 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc107) + %tmp5_49 = tt.broadcast %tmp3_24 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc107) + %tmp5_50 = arith.cmpi slt, %tmp5, %tmp5_49 : tensor<1x2048xi64> loc(#loc107) + %tmp6 = arith.andi %tmp4_48, %tmp5_50 : tensor<1x2048xi1> loc(#loc108) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1> loc(#loc109) + %tmp8 = arith.constant false loc(#loc110) + %tmp8_51 = arith.constant dense : tensor<1x1xi1> loc(#loc110) + %tmp9 = arith.constant dense : tensor<1x2048xi1> loc(#loc111) + %tmp9_52 = arith.ori %tmp9, %tmp7 : tensor<1x2048xi1> loc(#loc111) + %tmp10 = arith.constant 2048 : i64 loc(#loc112) + %tmp10_53 = arith.constant dense<2048> : tensor<1x1xi64> loc(#loc112) + %tmp11 = arith.extsi %tmp1_46 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc113) + %tmp11_54 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc113) + %tmp11_55 = arith.cmpi sge, %tmp11, %tmp11_54 : tensor<1x2048xi64> loc(#loc113) + %tmp12 = arith.andi %tmp11_55, %tmp4_48 : tensor<1x2048xi1> loc(#loc114) + %tmp13 = arith.constant -1 : i32 loc(#loc115) + %tmp13_56 = arith.constant -1 : i32 loc(#loc115) + %tmp13_57 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc115) + %tmp13_58 = arith.muli %tmp13_57, %r0_4_33 : tensor<1x2048xi32> loc(#loc115) + %tmp13_59 = arith.addi %r0_3_36, %tmp13_58 : tensor<1x2048xi32> loc(#loc116) + %tmp13_60 = arith.constant -128 : i32 loc(#loc117) + %tmp13_61 = arith.constant -128 : i32 loc(#loc117) + %tmp13_62 = arith.constant dense<-128> : tensor<1x1xi32> loc(#loc117) + %tmp13_63 = arith.muli %tmp13_62, %x1_16 : tensor<1x1xi32> loc(#loc117) + %tmp13_64 = tt.broadcast %tmp13_63 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc118) + %tmp13_65 = arith.addi %tmp13_59, %tmp13_64 : tensor<1x2048xi32> loc(#loc118) + %tmp13_66 = arith.constant 128 : i32 loc(#loc119) + %tmp13_67 = arith.constant 128 : i32 loc(#loc119) + %tmp13_68 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc119) + %tmp13_69 = arith.muli %tmp13_68, %x0_19 : tensor<1x1xi32> loc(#loc119) + %tmp13_70 = tt.broadcast %tmp13_69 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc120) + %tmp13_71 = arith.addi %tmp13_65, %tmp13_70 : tensor<1x2048xi32> loc(#loc120) + %tmp14 = arith.extsi %tmp13_71 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc121) + %tmp14_72 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc121) + %tmp14_73 = arith.remsi %tmp14, %tmp14_72 : tensor<1x2048xi64> loc(#loc121) + %tmp15 = arith.constant 0 : i32 loc(#loc122) + %tmp15_74 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc122) + %tmp16 = arith.extsi %tmp15_74 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc123) + %tmp16_75 = tt.broadcast %tmp16 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc123) + %tmp16_76 = arith.cmpi ne, %tmp14_73, %tmp16_75 : tensor<1x2048xi64> loc(#loc123) + %tmp17 = arith.constant 0 : i32 loc(#loc124) + %tmp17_77 = arith.extsi %tmp17 : i32 to i64 loc(#loc124) + %tmp17_78 = tt.splat %tmp17_77 : i64 -> tensor<1x2048xi64> loc(#loc124) + %tmp17_79 = arith.cmpi slt, %tmp14_73, %tmp17_78 : tensor<1x2048xi64> loc(#loc124) + %tmp18 = arith.constant 0 : i32 loc(#loc125) + %tmp18_80 = arith.extsi %tmp18 : i32 to i64 loc(#loc125) + %tmp18_81 = tt.splat %tmp18_80 : i64 -> tensor<1x1xi64> loc(#loc125) + %tmp18_82 = arith.cmpi slt, %tmp10_53, %tmp18_81 : tensor<1x1xi64> loc(#loc125) + %tmp19 = tt.broadcast %tmp18_82 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc126) + %tmp19_83 = arith.cmpi ne, %tmp17_79, %tmp19 : tensor<1x2048xi1> loc(#loc126) + %tmp20 = arith.andi %tmp16_76, %tmp19_83 : tensor<1x2048xi1> loc(#loc127) + %tmp21 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc128) + %tmp21_84 = arith.addi %tmp14_73, %tmp21 : tensor<1x2048xi64> loc(#loc128) + %tmp22 = arith.select %tmp20, %tmp21_84, %tmp14_73 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc129) + %tmp23 = arith.constant 0 : i64 loc(#loc130) + %tmp23_85 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc130) + %tmp24 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc131) + %tmp24_86 = arith.cmpi eq, %tmp22, %tmp24 : tensor<1x2048xi64> loc(#loc131) + %tmp25 = arith.andi %tmp12, %tmp24_86 : tensor<1x2048xi1> loc(#loc132) + %tmp26 = arith.ori %tmp9_52, %tmp25 : tensor<1x2048xi1> loc(#loc133) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc134) + %tmp30 = arith.addi %_tmp29_28, %tmp27 : tensor<1x2048xi64> loc(#loc135) + %_tmp29_87 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc136) + %_tmp29_88 = arith.andi %r0_mask_30, %_tmp29_87 : tensor<1x2048xi1> loc(#loc136) + %_tmp29_89 = arith.select %_tmp29_88, %tmp30, %_tmp29_28 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc137) + scf.yield %_tmp29_89 : tensor<1x2048xi64> loc(#loc60) + } loc(#loc96) + %tmp29 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp29_26) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc138) + %tmp29_27 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc139) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc63) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc63) + tt.store %5, %tmp29_27, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc64) + tt.return loc(#loc65) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc66))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc67) + tt.reduce.return %2 : i64 loc(#loc67) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc67) + tt.return %0 : tensor<1xi64> loc(#loc69) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc70) + tt.return %1 : tensor<1xi64> loc(#loc70) + } loc(#loc66) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc71)), %b: i64 loc("b"(#loc71))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc72) + tt.return %0 : i64 loc(#loc73) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc74) + tt.return %1 : i64 loc(#loc74) + } loc(#loc71) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":25:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":32:44) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":34:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":35:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":36:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":39:27) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":40:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":44:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":45:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":46:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":47:22) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":48:38) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":49:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":50:38) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":51:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":52:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:45) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:38) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:55) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:51) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":54:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":55:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":56:25) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":57:92) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":58:92) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":59:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":60:24) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":61:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":62:39) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":63:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":64:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":65:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":66:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":67:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":69:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:36) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:50) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:8) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:27) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc79 = loc("xnumel"(#loc1)) +#loc80 = loc("r0_numel"(#loc2)) +#loc81 = loc("xoffset"(#loc3)) +#loc82 = loc("xoffset"(#loc4)) +#loc83 = loc("xindex"(#loc5)) +#loc84 = loc("xindex"(#loc6)) +#loc85 = loc("xindex"(#loc7)) +#loc86 = loc("xmask"(#loc8)) +#loc87 = loc("r0_base"(#loc9)) +#loc88 = loc("r0_base"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("x1"(#loc12)) +#loc91 = loc("x0"(#loc13)) +#loc92 = loc("x2"(#loc14)) +#loc93 = loc("tmp3"(#loc15)) +#loc94 = loc("tmp3"(#loc16)) +#loc95 = loc("_tmp29"(#loc17)) +#loc96 = loc("_tmp29"(#loc18)) +#loc97 = loc("r0_index"(#loc19)) +#loc98 = loc("r0_mask"(#loc20)) +#loc99 = loc("r0_4"(#loc21)) +#loc100 = loc("r0_3"(#loc22)) +#loc101 = loc("tmp0"(#loc23)) +#loc102 = loc("tmp0"(#loc24)) +#loc103 = loc("tmp1"(#loc25)) +#loc104 = loc("tmp1"(#loc26)) +#loc105 = loc("tmp2"(#loc27)) +#loc106 = loc("tmp4"(#loc28)) +#loc107 = loc("tmp5"(#loc29)) +#loc108 = loc("tmp6"(#loc30)) +#loc109 = loc("tmp7"(#loc31)) +#loc110 = loc("tmp8"(#loc32)) +#loc111 = loc("tmp9"(#loc33)) +#loc112 = loc("tmp10"(#loc34)) +#loc113 = loc("tmp11"(#loc35)) +#loc114 = loc("tmp12"(#loc36)) +#loc115 = loc("tmp13"(#loc37)) +#loc116 = loc("tmp13"(#loc38)) +#loc117 = loc("tmp13"(#loc39)) +#loc118 = loc("tmp13"(#loc40)) +#loc119 = loc("tmp13"(#loc41)) +#loc120 = loc("tmp13"(#loc42)) +#loc121 = loc("tmp14"(#loc43)) +#loc122 = loc("tmp15"(#loc44)) +#loc123 = loc("tmp16"(#loc45)) +#loc124 = loc("tmp17"(#loc46)) +#loc125 = loc("tmp18"(#loc47)) +#loc126 = loc("tmp19"(#loc48)) +#loc127 = loc("tmp20"(#loc49)) +#loc128 = loc("tmp21"(#loc50)) +#loc129 = loc("tmp22"(#loc51)) +#loc130 = loc("tmp23"(#loc52)) +#loc131 = loc("tmp24"(#loc53)) +#loc132 = loc("tmp25"(#loc54)) +#loc133 = loc("tmp26"(#loc55)) +#loc134 = loc("tmp27"(#loc56)) +#loc135 = loc("tmp30"(#loc57)) +#loc136 = loc("_tmp29"(#loc58)) +#loc137 = loc("_tmp29"(#loc59)) +#loc138 = loc("tmp29"(#loc61)) +#loc139 = loc("tmp29"(#loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..50c7c0868bc56587cf4c8f0b9cf34244ae00ebc8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir @@ -0,0 +1,193 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":18:0) +#loc1 = loc(unknown) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:27) +#loc52 = loc("in_ptr0"(#loc)) +#loc53 = loc("out_ptr0"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc98 = loc("tmp29"(#loc46)) +#loc105 = loc(callsite(#loc1 at #loc98)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<1x2048xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c-128_i32 = arith.constant -128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xmask = arith.cmpi slt, %xoffset, %c512_i32 : i32 loc(#loc57) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc58) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc58) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc59) + %x1_4 = arith.remsi %x1, %c16_i32 : i32 loc(#loc60) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc61) + %x2 = arith.divsi %xoffset, %c256_i32 : i32 loc(#loc62) + %tmp3 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i32 loc(#loc63) + %tmp3_5 = tt.splat %tmp3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %tmp3_6 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc64) + %tmp3_7 = tt.load %tmp3_5, %tmp3_6 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %tmp0 = arith.muli %x1_4, %c128_i32 : i32 loc(#loc65) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc100) + %tmp1 = arith.muli %x0, %c128_i32 : i32 loc(#loc67) + %tmp1_9 = tt.splat %tmp1 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc101) + %tmp4 = tt.broadcast %tmp3_7 : tensor<1x1xi64, #blocked> -> tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp13 = arith.muli %x1_4, %c-128_i32 : i32 loc(#loc70) + %tmp13_10 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc102) + %_tmp29 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc103) + %_tmp29_11 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp29_13 = %cst) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc74) + %r0_index_14 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32, #blocked> loc(#loc74) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc75) + %r0_4 = arith.divsi %r0_index_14, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc76) + %r0_3 = arith.remsi %r0_index_14, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc77) + %tmp0_15 = arith.addi %r0_4, %tmp0_8 : tensor<1x2048xi32, #blocked> loc(#loc66) + %tmp1_16 = arith.addi %r0_3, %tmp1_9 : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp2 = arith.cmpi sge, %tmp0_15, %tmp1_16 : tensor<1x2048xi32, #blocked> loc(#loc78) + %tmp4_17 = arith.extsi %tmp1_16 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp4_18 = arith.cmpi slt, %tmp4_17, %tmp4 : tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp5 = arith.extsi %tmp0_15 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc79) + %tmp5_19 = arith.cmpi slt, %tmp5, %tmp4 : tensor<1x2048xi64, #blocked> loc(#loc79) + %tmp6 = arith.andi %tmp4_18, %tmp5_19 : tensor<1x2048xi1, #blocked> loc(#loc80) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1, #blocked> loc(#loc81) + %tmp11 = arith.cmpi sge, %tmp4_17, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp12 = arith.andi %tmp11, %tmp4_18 : tensor<1x2048xi1, #blocked> loc(#loc83) + %tmp13_20 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc84) + %tmp13_21 = arith.addi %tmp13_20, %tmp13_10 : tensor<1x2048xi32, #blocked> loc(#loc71) + %tmp13_22 = arith.addi %tmp13_21, %tmp1_9 : tensor<1x2048xi32, #blocked> loc(#loc85) + %tmp14 = arith.extsi %tmp13_22 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc86) + %tmp14_23 = arith.remsi %tmp14, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc86) + %tmp16 = arith.cmpi ne, %tmp14_23, %cst : tensor<1x2048xi64, #blocked> loc(#loc87) + %tmp17 = arith.cmpi slt, %tmp14_23, %cst : tensor<1x2048xi64, #blocked> loc(#loc88) + %tmp20 = arith.andi %tmp16, %tmp17 : tensor<1x2048xi1, #blocked> loc(#loc89) + %tmp21 = arith.addi %tmp14_23, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc90) + %tmp22 = arith.select %tmp20, %tmp21, %tmp14_23 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc91) + %tmp24 = arith.cmpi eq, %tmp22, %cst : tensor<1x2048xi64, #blocked> loc(#loc92) + %tmp25 = arith.andi %tmp12, %tmp24 : tensor<1x2048xi1, #blocked> loc(#loc93) + %tmp26 = arith.ori %tmp7, %tmp25 : tensor<1x2048xi1, #blocked> loc(#loc94) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc95) + %tmp30 = arith.addi %_tmp29_13, %tmp27 : tensor<1x2048xi64, #blocked> loc(#loc96) + %_tmp29_24 = arith.andi %r0_mask, %_tmp29 : tensor<1x2048xi1, #blocked> loc(#loc72) + %_tmp29_25 = arith.select %_tmp29_24, %tmp30, %_tmp29_13 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc97) + scf.yield %_tmp29_25 : tensor<1x2048xi64, #blocked> loc(#loc44) + } loc(#loc73) + %tmp29 = "tt.reduce"(%_tmp29_11) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_13: i64 loc(callsite(#loc1 at #loc98)), %tmp29_14: i64 loc(callsite(#loc1 at #loc98))): + %tmp29_15 = arith.addi %tmp29_13, %tmp29_14 : i64 loc(#loc106) + tt.reduce.return %tmp29_15 : i64 loc(#loc104) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc104) + %0 = ttg.convert_layout %tmp29 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %tmp29_12 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc99) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc50) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc48) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc48) + tt.store %2, %tmp29_12, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc48) + tt.return loc(#loc51) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":26:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":29:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":30:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:30) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:35) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:26) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:22) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:26) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":44:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:38) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:36) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":34:40) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":35:31) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":36:29) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":39:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":40:27) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":43:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":45:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":51:24) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":52:24) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:51) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":54:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":56:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":57:92) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":60:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":61:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":62:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":64:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":65:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":66:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":67:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":69:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:50) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:8) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:37) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:30) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:4) +#loc56 = loc("xoffset"(#loc2)) +#loc57 = loc("xmask"(#loc3)) +#loc58 = loc("r0_base"(#loc4)) +#loc59 = loc("x1"(#loc5)) +#loc60 = loc("x1"(#loc6)) +#loc61 = loc("x0"(#loc7)) +#loc62 = loc("x2"(#loc8)) +#loc63 = loc("tmp3"(#loc9)) +#loc64 = loc("tmp3"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp1"(#loc13)) +#loc68 = loc("tmp1"(#loc14)) +#loc69 = loc("tmp4"(#loc15)) +#loc70 = loc("tmp13"(#loc16)) +#loc71 = loc("tmp13"(#loc17)) +#loc72 = loc("_tmp29"(#loc18)) +#loc73 = loc("_tmp29"(#loc19)) +#loc74 = loc("r0_index"(#loc20)) +#loc75 = loc("r0_mask"(#loc21)) +#loc76 = loc("r0_4"(#loc22)) +#loc77 = loc("r0_3"(#loc23)) +#loc78 = loc("tmp2"(#loc24)) +#loc79 = loc("tmp5"(#loc25)) +#loc80 = loc("tmp6"(#loc26)) +#loc81 = loc("tmp7"(#loc27)) +#loc82 = loc("tmp11"(#loc28)) +#loc83 = loc("tmp12"(#loc29)) +#loc84 = loc("tmp13"(#loc30)) +#loc85 = loc("tmp13"(#loc31)) +#loc86 = loc("tmp14"(#loc32)) +#loc87 = loc("tmp16"(#loc33)) +#loc88 = loc("tmp17"(#loc34)) +#loc89 = loc("tmp20"(#loc35)) +#loc90 = loc("tmp21"(#loc36)) +#loc91 = loc("tmp22"(#loc37)) +#loc92 = loc("tmp24"(#loc38)) +#loc93 = loc("tmp25"(#loc39)) +#loc94 = loc("tmp26"(#loc40)) +#loc95 = loc("tmp27"(#loc41)) +#loc96 = loc("tmp30"(#loc42)) +#loc97 = loc("_tmp29"(#loc43)) +#loc99 = loc("tmp29"(#loc49)) +#loc100 = loc(fused[#loc66, #loc65]) +#loc101 = loc(fused[#loc68, #loc67]) +#loc102 = loc(fused[#loc71, #loc70]) +#loc103 = loc(fused[#loc72, #loc57]) +#loc104 = loc(callsite(#loc45 at #loc98)) +#loc106 = loc(callsite(#loc47 at #loc104)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..60a1ada789e59ce5cbfd8b9d05bd6c019d129ce7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/NPAKBJCLT5XSL3YCO7HFL6J2WUXGP55DM3QI5S45VDSN74FVU44A/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir @@ -0,0 +1,191 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":18:0) +#loc1 = loc(unknown) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:27) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("out_ptr0"(#loc)) +#loc55 = loc("xnumel"(#loc)) +#loc56 = loc("r0_numel"(#loc)) +#loc100 = loc("tmp29"(#loc47)) +#loc107 = loc(callsite(#loc1 at #loc100)) +module { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i32 = arith.constant -128 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %x2 = arith.constant 256 : i32 loc(#loc57) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xmask = arith.constant 512 : i32 loc(#loc58) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc4) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc4) + %c0_i32 = arith.constant 0 : i32 loc(#loc4) + %cst = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc59) + %xmask_3 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc58) + %xmask_4 = tt.splat %xmask_3 : i1 -> tensor<1x1xi1> loc(#loc58) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc60) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc61) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc62) + %x1_6 = arith.remsi %x1, %c16_i32 : i32 loc(#loc63) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc64) + %x2_7 = arith.divsi %xoffset, %x2 : i32 loc(#loc57) + %tmp3 = tt.addptr %in_ptr0, %x2_7 : !tt.ptr, i32 loc(#loc65) + %tmp3_8 = tt.splat %tmp3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc65) + %tmp3_9 = tt.load %tmp3_8, %xmask_4 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc66) + %_tmp29 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp29_11 = %cst_2) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68) + %r0_index_12 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_1 : tensor<1x2048xi32> loc(#loc69) + %r0_4 = arith.divsi %r0_index_12, %cst_0 : tensor<1x2048xi32> loc(#loc70) + %r0_3 = arith.remsi %r0_index_12, %cst_0 : tensor<1x2048xi32> loc(#loc71) + %tmp0 = arith.muli %x1_6, %c128_i32 : i32 loc(#loc72) + %tmp0_13 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc102) + %tmp0_14 = arith.addi %r0_4, %tmp0_13 : tensor<1x2048xi32> loc(#loc73) + %tmp1 = arith.muli %x0, %c128_i32 : i32 loc(#loc74) + %tmp1_15 = tt.splat %tmp1 : i32 -> tensor<1x2048xi32> loc(#loc103) + %tmp1_16 = arith.addi %r0_3, %tmp1_15 : tensor<1x2048xi32> loc(#loc75) + %tmp2 = arith.cmpi sge, %tmp0_14, %tmp1_16 : tensor<1x2048xi32> loc(#loc76) + %tmp4 = arith.extsi %tmp1_16 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc77) + %tmp4_17 = tt.broadcast %tmp3_9 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc77) + %tmp4_18 = arith.cmpi slt, %tmp4, %tmp4_17 : tensor<1x2048xi64> loc(#loc77) + %tmp5 = arith.extsi %tmp0_14 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc78) + %tmp5_19 = arith.cmpi slt, %tmp5, %tmp4_17 : tensor<1x2048xi64> loc(#loc78) + %tmp6 = arith.andi %tmp4_18, %tmp5_19 : tensor<1x2048xi1> loc(#loc79) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1> loc(#loc80) + %tmp11 = arith.cmpi sge, %tmp4, %cst : tensor<1x2048xi64> loc(#loc81) + %tmp12 = arith.andi %tmp11, %tmp4_18 : tensor<1x2048xi1> loc(#loc82) + %tmp13 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc83) + %tmp13_20 = arith.muli %x1_6, %c-128_i32 : i32 loc(#loc84) + %tmp13_21 = tt.splat %tmp13_20 : i32 -> tensor<1x2048xi32> loc(#loc104) + %tmp13_22 = arith.addi %tmp13, %tmp13_21 : tensor<1x2048xi32> loc(#loc85) + %tmp13_23 = arith.addi %tmp13_22, %tmp1_15 : tensor<1x2048xi32> loc(#loc86) + %tmp14 = arith.extsi %tmp13_23 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc87) + %tmp14_24 = arith.remsi %tmp14, %cst : tensor<1x2048xi64> loc(#loc87) + %tmp16 = arith.cmpi ne, %tmp14_24, %cst_2 : tensor<1x2048xi64> loc(#loc88) + %tmp17 = arith.cmpi slt, %tmp14_24, %cst_2 : tensor<1x2048xi64> loc(#loc89) + %tmp20 = arith.andi %tmp16, %tmp17 : tensor<1x2048xi1> loc(#loc90) + %tmp21 = arith.addi %tmp14_24, %cst : tensor<1x2048xi64> loc(#loc91) + %tmp22 = arith.select %tmp20, %tmp21, %tmp14_24 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc92) + %tmp24 = arith.cmpi eq, %tmp22, %cst_2 : tensor<1x2048xi64> loc(#loc93) + %tmp25 = arith.andi %tmp12, %tmp24 : tensor<1x2048xi1> loc(#loc94) + %tmp26 = arith.ori %tmp7, %tmp25 : tensor<1x2048xi1> loc(#loc95) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc96) + %tmp30 = arith.addi %_tmp29_11, %tmp27 : tensor<1x2048xi64> loc(#loc97) + %_tmp29_25 = tt.splat %xmask_3 : i1 -> tensor<1x2048xi1> loc(#loc105) + %_tmp29_26 = arith.andi %r0_mask, %_tmp29_25 : tensor<1x2048xi1> loc(#loc98) + %_tmp29_27 = arith.select %_tmp29_26, %tmp30, %_tmp29_11 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc99) + scf.yield %_tmp29_27 : tensor<1x2048xi64> loc(#loc45) + } loc(#loc67) + %tmp29 = "tt.reduce"(%_tmp29) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_11: i64 loc(callsite(#loc1 at #loc100)), %tmp29_12: i64 loc(callsite(#loc1 at #loc100))): + %tmp29_13 = arith.addi %tmp29_11, %tmp29_12 : i64 loc(#loc108) + tt.reduce.return %tmp29_13 : i64 loc(#loc106) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc106) + %tmp29_10 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc101) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc50) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc50) + tt.store %1, %tmp29_10, %xmask_4 : tensor<1x1x!tt.ptr> loc(#loc51) + tt.return loc(#loc52) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":34:40) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":26:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":26:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":28:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":29:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":31:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":35:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":36:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":39:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":40:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:26) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":43:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":44:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":45:22) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":46:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":47:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":51:24) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":52:24) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:24) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:38) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":53:51) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":54:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":56:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":57:92) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":60:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":61:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":62:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":64:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":65:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":66:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":67:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":69:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:36) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:50) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":70:8) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":71:30) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:37) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/q7/cq7ed3ned3bl7umfg6eialakjdwf7rhno4meayw34jakkuz6bga5.py":72:4) +#loc57 = loc("x2"(#loc2)) +#loc58 = loc("xmask"(#loc3)) +#loc59 = loc("xoffset"(#loc5)) +#loc60 = loc("r0_base"(#loc6)) +#loc61 = loc("r0_base"(#loc7)) +#loc62 = loc("x1"(#loc8)) +#loc63 = loc("x1"(#loc9)) +#loc64 = loc("x0"(#loc10)) +#loc65 = loc("tmp3"(#loc11)) +#loc66 = loc("tmp3"(#loc12)) +#loc67 = loc("_tmp29"(#loc4)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("r0_4"(#loc15)) +#loc71 = loc("r0_3"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp1"(#loc19)) +#loc75 = loc("tmp1"(#loc20)) +#loc76 = loc("tmp2"(#loc21)) +#loc77 = loc("tmp4"(#loc22)) +#loc78 = loc("tmp5"(#loc23)) +#loc79 = loc("tmp6"(#loc24)) +#loc80 = loc("tmp7"(#loc25)) +#loc81 = loc("tmp11"(#loc26)) +#loc82 = loc("tmp12"(#loc27)) +#loc83 = loc("tmp13"(#loc28)) +#loc84 = loc("tmp13"(#loc29)) +#loc85 = loc("tmp13"(#loc30)) +#loc86 = loc("tmp13"(#loc31)) +#loc87 = loc("tmp14"(#loc32)) +#loc88 = loc("tmp16"(#loc33)) +#loc89 = loc("tmp17"(#loc34)) +#loc90 = loc("tmp20"(#loc35)) +#loc91 = loc("tmp21"(#loc36)) +#loc92 = loc("tmp22"(#loc37)) +#loc93 = loc("tmp24"(#loc38)) +#loc94 = loc("tmp25"(#loc39)) +#loc95 = loc("tmp26"(#loc40)) +#loc96 = loc("tmp27"(#loc41)) +#loc97 = loc("tmp30"(#loc42)) +#loc98 = loc("_tmp29"(#loc43)) +#loc99 = loc("_tmp29"(#loc44)) +#loc101 = loc("tmp29"(#loc49)) +#loc102 = loc(fused[#loc73, #loc72]) +#loc103 = loc(fused[#loc75, #loc74]) +#loc104 = loc(fused[#loc85, #loc84]) +#loc105 = loc(fused[#loc98, #loc58]) +#loc106 = loc(callsite(#loc46 at #loc100)) +#loc108 = loc(callsite(#loc48 at #loc106)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/__grp__triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/__grp__triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..39918e8069ed607a25e5d8e81e9ed621d0d79d24 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/__grp__triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_6.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.source", "triton_poi_fused__to_copy_6.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttir", "triton_poi_fused__to_copy_6.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttgir", "triton_poi_fused__to_copy_6.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.llir", "triton_poi_fused__to_copy_6.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ptx", "triton_poi_fused__to_copy_6.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.cubin", "triton_poi_fused__to_copy_6.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.cubin new file mode 100644 index 0000000000000000000000000000000000000000..596b3f642391af8b85f1586327c4a336a97cabdb Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.json b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.json new file mode 100644 index 0000000000000000000000000000000000000000..07222745ad33697eea351ba1815ebe08dd1fbeec --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.json @@ -0,0 +1 @@ +{"hash": "7f9c8aa7eedbe585cc5b068fb965e1bfd3f89c3a015f7be4c9ecc1922aecec7c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_6"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.llir new file mode 100644 index 0000000000000000000000000000000000000000..25353843e8f4d7668f3e015fba9795aa8b1d05ae --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.llir @@ -0,0 +1,111 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused__to_copy_6(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i64 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 8, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 127, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = or disjoint i32 %13, 128, !dbg !10 + %15 = icmp slt i32 %13, %5, !dbg !11 + %16 = icmp slt i32 %14, %5, !dbg !11 + %17 = sext i32 %13 to i64, !dbg !12 + %18 = sext i32 %14 to i64, !dbg !12 + %.frozen = freeze i64 %2, !dbg !13 + %19 = sdiv i64 %17, %.frozen, !dbg !13 + %20 = mul i64 %19, %.frozen, !dbg !12 + %.decomposed = sub i64 %17, %20, !dbg !12 + %.frozen1 = freeze i64 %2, !dbg !13 + %21 = sdiv i64 %18, %.frozen1, !dbg !13 + %22 = mul i64 %21, %.frozen1, !dbg !12 + %.decomposed2 = sub i64 %18, %22, !dbg !12 + %23 = srem i64 %19, %3, !dbg !14 + %24 = srem i64 %21, %3, !dbg !14 + %25 = sdiv i64 %17, %4, !dbg !15 + %26 = sdiv i64 %18, %4, !dbg !15 + %27 = insertelement <2 x i64> poison, i64 %3, i64 0, !dbg !16 + %28 = insertelement <2 x i64> %27, i64 %2, i64 1, !dbg !16 + %29 = icmp slt <2 x i64> %28, splat (i64 2), !dbg !16 + %30 = icmp sgt <2 x i64> %28, splat (i64 1), !dbg !17 + %31 = select <2 x i1> %30, <2 x i64> %28, <2 x i64> zeroinitializer, !dbg !18 + %32 = zext <2 x i1> %29 to <2 x i64>, !dbg !19 + %33 = add <2 x i64> %31, %32, !dbg !20 + %34 = extractelement <2 x i64> %33, i64 0, !dbg !21 + %35 = mul i64 %.decomposed, %34, !dbg !22 + %36 = mul i64 %.decomposed2, %34, !dbg !22 + %37 = extractelement <2 x i64> %33, i64 1, !dbg !21 + %38 = mul i64 %34, %37, !dbg !21 + %39 = mul i64 %38, %25, !dbg !23 + %40 = mul i64 %38, %26, !dbg !23 + %41 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !24 + %42 = getelementptr i64, ptr addrspace(1) %41, i64 %35, !dbg !24 + %43 = getelementptr i64, ptr addrspace(1) %42, i64 %39, !dbg !24 + %44 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !24 + %45 = getelementptr i64, ptr addrspace(1) %44, i64 %36, !dbg !24 + %46 = getelementptr i64, ptr addrspace(1) %45, i64 %40, !dbg !24 + %47 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !25 + %48 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %43, i64 %47, i1 %15) #2, !dbg !25 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !25 + %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %46, i64 %49, i1 %16) #2, !dbg !25 + %51 = trunc i64 %48 to i32, !dbg !26 + %52 = trunc i64 %50 to i32, !dbg !26 + %53 = mul i64 %23, %37, !dbg !27 + %54 = mul i64 %24, %37, !dbg !27 + %55 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !28 + %56 = getelementptr i32, ptr addrspace(1) %55, i64 %53, !dbg !28 + %57 = getelementptr i32, ptr addrspace(1) %56, i64 %39, !dbg !28 + %58 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed2, !dbg !28 + %59 = getelementptr i32, ptr addrspace(1) %58, i64 %54, !dbg !28 + %60 = getelementptr i32, ptr addrspace(1) %59, i64 %40, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %51, ptr addrspace(1) %57, i1 %15) #2, !dbg !29 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %52, ptr addrspace(1) %60, i1 %16) #2, !dbg !29 + ret void, !dbg !30 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_6", linkageName: "triton_poi_fused__to_copy_6", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 22, column: 19, scope: !4) +!13 = !DILocation(line: 23, column: 21, scope: !4) +!14 = !DILocation(line: 23, column: 28, scope: !4) +!15 = !DILocation(line: 24, column: 19, scope: !4) +!16 = !DILocation(line: 25, column: 54, scope: !4) +!17 = !DILocation(line: 25, column: 80, scope: !4) +!18 = !DILocation(line: 25, column: 71, scope: !4) +!19 = !DILocation(line: 25, scope: !4) +!20 = !DILocation(line: 25, column: 62, scope: !4) +!21 = !DILocation(line: 25, column: 91, scope: !4) +!22 = !DILocation(line: 25, column: 39, scope: !4) +!23 = !DILocation(line: 25, column: 138, scope: !4) +!24 = !DILocation(line: 25, column: 30, scope: !4) +!25 = !DILocation(line: 25, column: 186, scope: !4) +!26 = !DILocation(line: 26, column: 19, scope: !4) +!27 = !DILocation(line: 27, column: 34, scope: !4) +!28 = !DILocation(line: 27, column: 25, scope: !4) +!29 = !DILocation(line: 27, column: 187, scope: !4) +!30 = !DILocation(line: 27, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a227761b824e5c565214c52127467a2fa0c20c72 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ptx @@ -0,0 +1,390 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused__to_copy_6 // -- Begin function triton_poi_fused__to_copy_6 + // @triton_poi_fused__to_copy_6 +.visible .entry triton_poi_fused__to_copy_6( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_1, + .param .u64 triton_poi_fused__to_copy_6_param_2, + .param .u64 triton_poi_fused__to_copy_6_param_3, + .param .u64 triton_poi_fused__to_copy_6_param_4, + .param .u32 triton_poi_fused__to_copy_6_param_5, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_6, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_6_param_7 +) +.reqntid 128 +{ + .reg .pred %p<15>; + .reg .b32 %r<34>; + .reg .b64 %rd<95>; + .loc 1 18 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:18:0 + +// %bb.0: + ld.param.b64 %rd27, [triton_poi_fused__to_copy_6_param_2]; +$L__tmp0: + .loc 1 19 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:19:33 + shl.b32 %r3, %r2, 8; + .loc 1 20 36 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:20:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 20 23 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:20:23 + or.b32 %r6, %r3, %r5; + or.b32 %r7, %r6, 128; + .loc 1 22 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:22:19 + cvt.s64.s32 %rd1, %r6; + cvt.s64.s32 %rd2, %r7; + .loc 1 23 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:21 + or.b64 %rd31, %rd1, %rd27; + and.b64 %rd32, %rd31, -4294967296; + setp.ne.b64 %p1, %rd32, 0; + cvt.u32.u64 %r33, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd89, %rd1, %rd27; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r8, %rd27; + div.u32 %r10, %r33, %r8; + cvt.u64.u32 %rd89, %r10; +$L__BB0_3: + .loc 1 0 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:21 + ld.param.b64 %rd28, [triton_poi_fused__to_copy_6_param_3]; + .loc 1 23 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:21 + or.b64 %rd34, %rd2, %rd27; + and.b64 %rd35, %rd34, -4294967296; + setp.ne.b64 %p2, %rd35, 0; + cvt.u32.u64 %r32, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd90, %rd2, %rd27; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r11, %rd27; + div.u32 %r13, %r32, %r11; + cvt.u64.u32 %rd90, %r13; +$L__BB0_6: + .loc 1 23 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:28 + or.b64 %rd36, %rd89, %rd28; + and.b64 %rd37, %rd36, -4294967296; + setp.ne.b64 %p3, %rd37, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd91, %rd89, %rd28; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r14, %rd28; + cvt.u32.u64 %r15, %rd89; + rem.u32 %r16, %r15, %r14; + cvt.u64.u32 %rd91, %r16; +$L__BB0_9: + .loc 1 0 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:28 + ld.param.b64 %rd29, [triton_poi_fused__to_copy_6_param_4]; + .loc 1 23 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:23:28 + or.b64 %rd38, %rd90, %rd28; + and.b64 %rd39, %rd38, -4294967296; + setp.ne.b64 %p4, %rd39, 0; + @%p4 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd92, %rd90, %rd28; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r17, %rd28; + cvt.u32.u64 %r18, %rd90; + rem.u32 %r19, %r18, %r17; + cvt.u64.u32 %rd92, %r19; +$L__BB0_12: + .loc 1 0 28 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:28 + mul.lo.s64 %rd33, %rd89, %rd27; + mul.lo.s64 %rd11, %rd90, %rd27; + .loc 1 24 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:24:19 + or.b64 %rd40, %rd1, %rd29; + and.b64 %rd41, %rd40, -4294967296; + setp.ne.b64 %p5, %rd41, 0; + @%p5 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + div.s64 %rd93, %rd1, %rd29; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r20, %rd29; + div.u32 %r22, %r33, %r20; + cvt.u64.u32 %rd93, %r22; +$L__BB0_15: + .loc 1 0 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:0:19 + ld.param.b32 %r1, [triton_poi_fused__to_copy_6_param_5]; + ld.param.b64 %rd26, [triton_poi_fused__to_copy_6_param_1]; + ld.param.b64 %rd25, [triton_poi_fused__to_copy_6_param_0]; + sub.s64 %rd7, %rd1, %rd33; + sub.s64 %rd12, %rd2, %rd11; + .loc 1 24 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:24:19 + or.b64 %rd42, %rd2, %rd29; + and.b64 %rd43, %rd42, -4294967296; + setp.ne.b64 %p6, %rd43, 0; + @%p6 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + div.s64 %rd94, %rd2, %rd29; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r23, %rd29; + div.u32 %r25, %r32, %r23; + cvt.u64.u32 %rd94, %r25; +$L__BB0_18: + .loc 1 21 21 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:21:21 + setp.lt.s32 %p8, %r32, %r1; + setp.lt.s32 %p7, %r33, %r1; + .loc 1 25 54 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:54 + setp.lt.s64 %p11, %rd27, 2; + setp.lt.s64 %p12, %rd28, 2; + .loc 1 25 80 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:80 + setp.gt.s64 %p13, %rd27, 1; + setp.gt.s64 %p14, %rd28, 1; + .loc 1 25 71 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:71 + selp.b64 %rd54, %rd28, 0, %p14; + selp.b64 %rd55, %rd27, 0, %p13; + .loc 1 25 0 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25 + selp.b64 %rd56, 1, 0, %p12; + selp.b64 %rd57, 1, 0, %p11; + .loc 1 25 62 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:62 + add.s64 %rd58, %rd55, %rd57; + add.s64 %rd59, %rd54, %rd56; + .loc 1 25 39 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:39 + mul.lo.s64 %rd60, %rd7, %rd59; + mul.lo.s64 %rd61, %rd12, %rd59; + .loc 1 25 91 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:91 + mul.lo.s64 %rd62, %rd59, %rd58; + .loc 1 25 138 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:138 + mul.lo.s64 %rd63, %rd62, %rd93; + mul.lo.s64 %rd64, %rd62, %rd94; + .loc 1 25 30 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:30 + shl.b64 %rd65, %rd91, 3; + add.s64 %rd66, %rd25, %rd65; + shl.b64 %rd67, %rd60, 3; + add.s64 %rd68, %rd66, %rd67; + shl.b64 %rd69, %rd63, 3; + add.s64 %rd46, %rd68, %rd69; + shl.b64 %rd70, %rd92, 3; + add.s64 %rd71, %rd25, %rd70; + shl.b64 %rd72, %rd61, 3; + add.s64 %rd73, %rd71, %rd72; + shl.b64 %rd74, %rd64, 3; + add.s64 %rd50, %rd73, %rd74; + .loc 1 25 186 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:25:186 + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd45 }, [ %rd46 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd51, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd51, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd49 }, [ %rd50 + 0 ], %rd51; + // end inline asm + .loc 1 26 19 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:26:19 + cvt.u32.u64 %r26, %rd45; + cvt.u32.u64 %r27, %rd49; + .loc 1 27 34 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:34 + mul.lo.s64 %rd75, %rd91, %rd58; + mul.lo.s64 %rd76, %rd92, %rd58; + .loc 1 27 25 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:25 + shl.b64 %rd77, %rd7, 2; + add.s64 %rd78, %rd26, %rd77; + shl.b64 %rd79, %rd75, 2; + add.s64 %rd80, %rd78, %rd79; + shl.b64 %rd81, %rd63, 2; + add.s64 %rd52, %rd80, %rd81; + sub.s64 %rd82, %rd1, %rd11; + shl.b64 %rd83, %rd82, 2; + add.s64 %rd84, %rd26, %rd83; + shl.b64 %rd85, %rd76, 2; + add.s64 %rd86, %rd84, %rd85; + shl.b64 %rd87, %rd64, 2; + add.s64 %rd88, %rd86, %rd87; + add.s64 %rd53, %rd88, 512; + .loc 1 27 187 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:187 + // begin inline asm + @%p7 st.global.b32 [ %rd52 + 0 ], { %r26 }; + // end inline asm + // begin inline asm + @%p8 st.global.b32 [ %rd53 + 0 ], { %r27 }; + // end inline asm + .loc 1 27 4 // c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py:27:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 51 +.b8 53 +.b8 104 +.b8 117 +.b8 113 +.b8 112 +.b8 54 +.b8 110 +.b8 103 +.b8 122 +.b8 104 +.b8 54 +.b8 55 +.b8 107 +.b8 116 +.b8 51 +.b8 50 +.b8 107 +.b8 117 +.b8 120 +.b8 111 +.b8 113 +.b8 112 +.b8 103 +.b8 104 +.b8 99 +.b8 51 +.b8 50 +.b8 102 +.b8 115 +.b8 116 +.b8 118 +.b8 52 +.b8 122 +.b8 111 +.b8 103 +.b8 99 +.b8 111 +.b8 117 +.b8 122 +.b8 97 +.b8 98 +.b8 100 +.b8 120 +.b8 120 +.b8 119 +.b8 116 +.b8 97 +.b8 51 +.b8 115 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 51 +.b8 53 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.source b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.source new file mode 100644 index 0000000000000000000000000000000000000000..180982542abf73379b1598065809bae068e69905 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.source @@ -0,0 +1,226 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0) +#loc56 = loc("in_ptr0"(#loc)) +#loc57 = loc("out_ptr0"(#loc)) +#loc58 = loc("ks0"(#loc)) +#loc59 = loc("ks1"(#loc)) +#loc60 = loc("ks2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xoffset_0 = arith.constant 256 : i32 loc(#loc63) + %xoffset_1 = arith.constant 256 : i32 loc(#loc63) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc63) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc64) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc65) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc65) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc66) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc66) + %x0 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc67) + %x0_6 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc67) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<256xi64> loc(#loc67) + %x1 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc68) + %x1_8 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc68) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<256xi64> loc(#loc68) + %x1_10 = tt.splat %ks1 : i64 -> tensor<256xi64> loc(#loc69) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<256xi64> loc(#loc69) + %x2 = arith.extsi %xindex_4 : tensor<256xi32> to tensor<256xi64> loc(#loc70) + %x2_12 = tt.splat %ks2 : i64 -> tensor<256xi64> loc(#loc70) + %x2_13 = arith.divsi %x2, %x2_12 : tensor<256xi64> loc(#loc70) + %tmp0 = arith.constant 1 : i32 loc(#loc71) + %tmp0_14 = arith.extsi %tmp0 : i32 to i64 loc(#loc71) + %tmp0_15 = arith.cmpi sge, %tmp0_14, %ks1 : i64 loc(#loc71) + %tmp0_16 = arith.constant 1 : i32 loc(#loc72) + %tmp0_17 = arith.constant 1 : i32 loc(#loc72) + %tmp0_18 = arith.extui %tmp0_15 : i1 to i32 loc(#loc72) + %tmp0_19 = arith.muli %tmp0_17, %tmp0_18 : i32 loc(#loc72) + %tmp0_20 = arith.constant 1 : i32 loc(#loc73) + %tmp0_21 = arith.extsi %tmp0_20 : i32 to i64 loc(#loc73) + %tmp0_22 = arith.cmpi sgt, %ks1, %tmp0_21 : i64 loc(#loc73) + %tmp0_23 = arith.extui %tmp0_22 : i1 to i64 loc(#loc74) + %tmp0_24 = arith.muli %ks1, %tmp0_23 : i64 loc(#loc74) + %tmp0_25 = arith.extsi %tmp0_19 : i32 to i64 loc(#loc75) + %tmp0_26 = arith.addi %tmp0_25, %tmp0_24 : i64 loc(#loc75) + %tmp0_27 = tt.splat %tmp0_26 : i64 -> tensor<256xi64> loc(#loc76) + %tmp0_28 = arith.muli %x0_7, %tmp0_27 : tensor<256xi64> loc(#loc76) + %tmp0_29 = arith.addi %x1_11, %tmp0_28 : tensor<256xi64> loc(#loc77) + %tmp0_30 = arith.constant 1 : i32 loc(#loc78) + %tmp0_31 = arith.extsi %tmp0_30 : i32 to i64 loc(#loc78) + %tmp0_32 = arith.cmpi sge, %tmp0_31, %ks0 : i64 loc(#loc78) + %tmp0_33 = arith.constant 1 : i32 loc(#loc79) + %tmp0_34 = arith.constant 1 : i32 loc(#loc79) + %tmp0_35 = arith.extui %tmp0_32 : i1 to i32 loc(#loc79) + %tmp0_36 = arith.muli %tmp0_34, %tmp0_35 : i32 loc(#loc79) + %tmp0_37 = arith.constant 1 : i32 loc(#loc80) + %tmp0_38 = arith.extsi %tmp0_37 : i32 to i64 loc(#loc80) + %tmp0_39 = arith.cmpi sgt, %ks0, %tmp0_38 : i64 loc(#loc80) + %tmp0_40 = arith.extui %tmp0_39 : i1 to i64 loc(#loc81) + %tmp0_41 = arith.muli %ks0, %tmp0_40 : i64 loc(#loc81) + %tmp0_42 = arith.extsi %tmp0_36 : i32 to i64 loc(#loc82) + %tmp0_43 = arith.addi %tmp0_42, %tmp0_41 : i64 loc(#loc82) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<256xi64> loc(#loc83) + %tmp0_45 = arith.muli %x2_13, %tmp0_44 : tensor<256xi64> loc(#loc83) + %tmp0_46 = arith.constant 1 : i32 loc(#loc84) + %tmp0_47 = arith.extsi %tmp0_46 : i32 to i64 loc(#loc84) + %tmp0_48 = arith.cmpi sge, %tmp0_47, %ks1 : i64 loc(#loc84) + %tmp0_49 = arith.constant 1 : i32 loc(#loc85) + %tmp0_50 = arith.constant 1 : i32 loc(#loc85) + %tmp0_51 = arith.extui %tmp0_48 : i1 to i32 loc(#loc85) + %tmp0_52 = arith.muli %tmp0_50, %tmp0_51 : i32 loc(#loc85) + %tmp0_53 = arith.constant 1 : i32 loc(#loc86) + %tmp0_54 = arith.extsi %tmp0_53 : i32 to i64 loc(#loc86) + %tmp0_55 = arith.cmpi sgt, %ks1, %tmp0_54 : i64 loc(#loc86) + %tmp0_56 = arith.extui %tmp0_55 : i1 to i64 loc(#loc87) + %tmp0_57 = arith.muli %ks1, %tmp0_56 : i64 loc(#loc87) + %tmp0_58 = arith.extsi %tmp0_52 : i32 to i64 loc(#loc88) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_57 : i64 loc(#loc88) + %tmp0_60 = tt.splat %tmp0_59 : i64 -> tensor<256xi64> loc(#loc89) + %tmp0_61 = arith.muli %tmp0_45, %tmp0_60 : tensor<256xi64> loc(#loc89) + %tmp0_62 = arith.addi %tmp0_29, %tmp0_61 : tensor<256xi64> loc(#loc90) + %tmp0_63 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc91) + %tmp0_64 = tt.addptr %tmp0_63, %tmp0_62 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc91) + %tmp0_65 = tt.load %tmp0_64, %xmask_5 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc92) + %tmp1 = arith.trunci %tmp0_65 : tensor<256xi64> to tensor<256xi32> loc(#loc93) + %c1_i32 = arith.constant 1 : i32 loc(#loc33) + %0 = arith.extsi %c1_i32 : i32 to i64 loc(#loc33) + %1 = arith.cmpi sge, %0, %ks0 : i64 loc(#loc33) + %c1_i32_66 = arith.constant 1 : i32 loc(#loc34) + %c1_i32_67 = arith.constant 1 : i32 loc(#loc34) + %2 = arith.extui %1 : i1 to i32 loc(#loc34) + %3 = arith.muli %c1_i32_67, %2 : i32 loc(#loc34) + %c1_i32_68 = arith.constant 1 : i32 loc(#loc35) + %4 = arith.extsi %c1_i32_68 : i32 to i64 loc(#loc35) + %5 = arith.cmpi sgt, %ks0, %4 : i64 loc(#loc35) + %6 = arith.extui %5 : i1 to i64 loc(#loc36) + %7 = arith.muli %ks0, %6 : i64 loc(#loc36) + %8 = arith.extsi %3 : i32 to i64 loc(#loc37) + %9 = arith.addi %8, %7 : i64 loc(#loc37) + %10 = tt.splat %9 : i64 -> tensor<256xi64> loc(#loc38) + %11 = arith.muli %x1_11, %10 : tensor<256xi64> loc(#loc38) + %12 = arith.addi %x0_7, %11 : tensor<256xi64> loc(#loc39) + %c1_i32_69 = arith.constant 1 : i32 loc(#loc40) + %13 = arith.extsi %c1_i32_69 : i32 to i64 loc(#loc40) + %14 = arith.cmpi sge, %13, %ks0 : i64 loc(#loc40) + %c1_i32_70 = arith.constant 1 : i32 loc(#loc41) + %c1_i32_71 = arith.constant 1 : i32 loc(#loc41) + %15 = arith.extui %14 : i1 to i32 loc(#loc41) + %16 = arith.muli %c1_i32_71, %15 : i32 loc(#loc41) + %c1_i32_72 = arith.constant 1 : i32 loc(#loc42) + %17 = arith.extsi %c1_i32_72 : i32 to i64 loc(#loc42) + %18 = arith.cmpi sgt, %ks0, %17 : i64 loc(#loc42) + %19 = arith.extui %18 : i1 to i64 loc(#loc43) + %20 = arith.muli %ks0, %19 : i64 loc(#loc43) + %21 = arith.extsi %16 : i32 to i64 loc(#loc44) + %22 = arith.addi %21, %20 : i64 loc(#loc44) + %23 = tt.splat %22 : i64 -> tensor<256xi64> loc(#loc45) + %24 = arith.muli %x2_13, %23 : tensor<256xi64> loc(#loc45) + %c1_i32_73 = arith.constant 1 : i32 loc(#loc46) + %25 = arith.extsi %c1_i32_73 : i32 to i64 loc(#loc46) + %26 = arith.cmpi sge, %25, %ks1 : i64 loc(#loc46) + %c1_i32_74 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_75 = arith.constant 1 : i32 loc(#loc47) + %27 = arith.extui %26 : i1 to i32 loc(#loc47) + %28 = arith.muli %c1_i32_75, %27 : i32 loc(#loc47) + %c1_i32_76 = arith.constant 1 : i32 loc(#loc48) + %29 = arith.extsi %c1_i32_76 : i32 to i64 loc(#loc48) + %30 = arith.cmpi sgt, %ks1, %29 : i64 loc(#loc48) + %31 = arith.extui %30 : i1 to i64 loc(#loc49) + %32 = arith.muli %ks1, %31 : i64 loc(#loc49) + %33 = arith.extsi %28 : i32 to i64 loc(#loc50) + %34 = arith.addi %33, %32 : i64 loc(#loc50) + %35 = tt.splat %34 : i64 -> tensor<256xi64> loc(#loc51) + %36 = arith.muli %24, %35 : tensor<256xi64> loc(#loc51) + %37 = arith.addi %12, %36 : tensor<256xi64> loc(#loc52) + %38 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc53) + %39 = tt.addptr %38, %37 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc53) + tt.store %39, %tmp1, %xmask_5 : tensor<256x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:153) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:145) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:179) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:170) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:161) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:75) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:66) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:57) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:101) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:93) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:127) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:118) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:109) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:86) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:148) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:140) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:174) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:165) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:156) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:133) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4) +#loc62 = loc("xoffset"(#loc1)) +#loc63 = loc("xoffset"(#loc2)) +#loc64 = loc("xindex"(#loc3)) +#loc65 = loc("xindex"(#loc4)) +#loc66 = loc("xmask"(#loc5)) +#loc67 = loc("x0"(#loc6)) +#loc68 = loc("x1"(#loc7)) +#loc69 = loc("x1"(#loc8)) +#loc70 = loc("x2"(#loc9)) +#loc71 = loc("tmp0"(#loc10)) +#loc72 = loc("tmp0"(#loc11)) +#loc73 = loc("tmp0"(#loc12)) +#loc74 = loc("tmp0"(#loc13)) +#loc75 = loc("tmp0"(#loc14)) +#loc76 = loc("tmp0"(#loc15)) +#loc77 = loc("tmp0"(#loc16)) +#loc78 = loc("tmp0"(#loc17)) +#loc79 = loc("tmp0"(#loc18)) +#loc80 = loc("tmp0"(#loc19)) +#loc81 = loc("tmp0"(#loc20)) +#loc82 = loc("tmp0"(#loc21)) +#loc83 = loc("tmp0"(#loc22)) +#loc84 = loc("tmp0"(#loc23)) +#loc85 = loc("tmp0"(#loc24)) +#loc86 = loc("tmp0"(#loc25)) +#loc87 = loc("tmp0"(#loc26)) +#loc88 = loc("tmp0"(#loc27)) +#loc89 = loc("tmp0"(#loc28)) +#loc90 = loc("tmp0"(#loc29)) +#loc91 = loc("tmp0"(#loc30)) +#loc92 = loc("tmp0"(#loc31)) +#loc93 = loc("tmp1"(#loc32)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..180527a3822a09b590cefeb22079268169fc53c9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttgir @@ -0,0 +1,122 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0) +#loc35 = loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<256xi64, #blocked> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<256xi64, #blocked> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<256xi64, #blocked> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<256xi64, #blocked> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<256xi64, #blocked> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<256xi64, #blocked> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<256xi64, #blocked> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<256xi64, #blocked> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<256xi64, #blocked> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<256xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<256xi64, #blocked> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<256xi64, #blocked> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<256xi64, #blocked> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<256xi64, #blocked> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<256x!tt.ptr, #blocked> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<256xi64, #blocked> to tensor<256xi32, #blocked> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<256xi64, #blocked> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<256xi64, #blocked> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<256xi64, #blocked> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<256x!tt.ptr, #blocked> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a1e71fdfa79338c8d1383c93eb0e678844433377 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/P6OIVJ7O3PSYLTC3A2H3SZPBX7J7RHB2AFPXXZGJ5TAZEKXM5R6A/triton_poi_fused__to_copy_6.ttir @@ -0,0 +1,121 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":18:0) +#loc35 = loc("in_ptr0"(#loc)) +#loc36 = loc("out_ptr0"(#loc)) +#loc37 = loc("ks0"(#loc)) +#loc38 = loc("ks1"(#loc)) +#loc39 = loc("ks2"(#loc)) +#loc40 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_6(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc43) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc44) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc44) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc45) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc45) + %x0 = arith.extsi %xindex_2 : tensor<256xi32> to tensor<256xi64> loc(#loc46) + %x0_4 = tt.splat %ks0 : i64 -> tensor<256xi64> loc(#loc46) + %x0_5 = arith.remsi %x0, %x0_4 : tensor<256xi64> loc(#loc46) + %x1 = arith.divsi %x0, %x0_4 : tensor<256xi64> loc(#loc47) + %x1_6 = tt.splat %ks1 : i64 -> tensor<256xi64> loc(#loc48) + %x1_7 = arith.remsi %x1, %x1_6 : tensor<256xi64> loc(#loc48) + %x2 = tt.splat %ks2 : i64 -> tensor<256xi64> loc(#loc49) + %x2_8 = arith.divsi %x0, %x2 : tensor<256xi64> loc(#loc49) + %tmp0 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc50) + %tmp0_9 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc51) + %tmp0_10 = arith.extui %tmp0_9 : i1 to i64 loc(#loc52) + %tmp0_11 = arith.muli %ks1, %tmp0_10 : i64 loc(#loc52) + %tmp0_12 = arith.extui %tmp0 : i1 to i64 loc(#loc68) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_11 : i64 loc(#loc53) + %tmp0_14 = tt.splat %tmp0_13 : i64 -> tensor<256xi64> loc(#loc55) + %tmp0_15 = arith.muli %x0_5, %tmp0_14 : tensor<256xi64> loc(#loc55) + %tmp0_16 = arith.addi %x1_7, %tmp0_15 : tensor<256xi64> loc(#loc56) + %tmp0_17 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc57) + %tmp0_18 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc58) + %tmp0_19 = arith.extui %tmp0_18 : i1 to i64 loc(#loc59) + %tmp0_20 = arith.muli %ks0, %tmp0_19 : i64 loc(#loc59) + %tmp0_21 = arith.extui %tmp0_17 : i1 to i64 loc(#loc69) + %tmp0_22 = arith.addi %tmp0_21, %tmp0_20 : i64 loc(#loc60) + %tmp0_23 = tt.splat %tmp0_22 : i64 -> tensor<256xi64> loc(#loc62) + %tmp0_24 = arith.muli %x2_8, %tmp0_23 : tensor<256xi64> loc(#loc62) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_14 : tensor<256xi64> loc(#loc63) + %tmp0_26 = arith.addi %tmp0_16, %tmp0_25 : tensor<256xi64> loc(#loc64) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc65) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc65) + %tmp0_29 = tt.load %tmp0_28, %xmask_3 evictionPolicy = evict_last : tensor<256x!tt.ptr> loc(#loc66) + %tmp1 = arith.trunci %tmp0_29 : tensor<256xi64> to tensor<256xi32> loc(#loc67) + %0 = arith.muli %x1_7, %tmp0_23 : tensor<256xi64> loc(#loc29) + %1 = arith.addi %x0_5, %0 : tensor<256xi64> loc(#loc30) + %2 = arith.addi %1, %tmp0_25 : tensor<256xi64> loc(#loc31) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc32) + %4 = tt.addptr %3, %2 : tensor<256x!tt.ptr>, tensor<256xi64> loc(#loc32) + tt.store %4, %tmp1, %xmask_3 : tensor<256x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:132) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:123) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:114) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:98) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":25:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":26:19) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:82) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:187) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/35/c35huqp6ngzh67kt32kuxoqpghc32fstv4zogcouzabdxxwta3sl.py":27:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("xmask"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("tmp0"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp0"(#loc21)) +#loc61 = loc("tmp0"(#loc22)) +#loc62 = loc("tmp0"(#loc23)) +#loc63 = loc("tmp0"(#loc24)) +#loc64 = loc("tmp0"(#loc25)) +#loc65 = loc("tmp0"(#loc26)) +#loc66 = loc("tmp0"(#loc27)) +#loc67 = loc("tmp1"(#loc28)) +#loc68 = loc(fused[#loc53, #loc54]) +#loc69 = loc(fused[#loc60, #loc61]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6344cd251368b59009f5537bb95f759c873f74 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/__grp__triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin", "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..67e5f1bf38ee4c566af3c5bc20b855b63dbc9763 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json new file mode 100644 index 0000000000000000000000000000000000000000..22bfc1518c83b6fd9604df97bcb9059f95b0d753 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.json @@ -0,0 +1 @@ +{"hash": "7ab17e986bfaecf6dc9d7cbf00c92242b35cb3cbc1c1350011e320aeb0988ac5", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..79d1166056ea87229e018a7f05455f8c8da8936d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.llir @@ -0,0 +1,295 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %12 = icmp slt i32 %11, %6, !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %14 = and i32 %13, 511, !dbg !9 + %15 = zext nneg i32 %11 to i64, !dbg !10 + %.frozen = freeze i64 %3, !dbg !11 + %16 = sdiv i64 %15, %.frozen, !dbg !11 + %17 = mul i64 %16, %.frozen, !dbg !10 + %.decomposed = sub i64 %15, %17, !dbg !10 + %.lhs.trunc = trunc nsw i64 %16 to i32, !dbg !12 + %18 = srem i32 %.lhs.trunc, 16, !dbg !12 + %.sext = sext i32 %18 to i64, !dbg !12 + %19 = sdiv i64 %15, %5, !dbg !13 + %20 = shl nuw nsw i64 %.decomposed, 7, !dbg !14 + %21 = shl nsw i64 %.sext, 7, !dbg !15 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !16 + %23 = and i32 %13, 127 + %24 = zext nneg i32 %23 to i64 + %25 = or disjoint i64 %20, %24 + %26 = icmp slt i64 %25, %4 + %27 = and i1 %12, %26 + %28 = icmp samesign ugt i64 %.decomposed, 15 + %29 = and i64 %25, 2047 + %30 = sub nsw i64 %.decomposed, %.sext + %31 = shl nsw i64 %30, 7 + %32 = select i1 %12, i1 %26, i1 false + %33 = zext nneg i32 %14 to i64, !dbg !17 + %34 = zext nneg i32 %13 to i64, !dbg !17 + %35 = insertelement <4 x i1> poison, i1 %32, i64 0, !dbg !18 + %36 = shufflevector <4 x i1> %35, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !18 + %37 = insertelement <4 x i64> poison, i64 %21, i64 0, !dbg !19 + %38 = shufflevector <4 x i64> %37, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !19 + %39 = insertelement <4 x i64> poison, i64 %25, i64 0, !dbg !20 + %40 = shufflevector <4 x i64> %39, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !20 + %41 = insertelement <4 x i1> poison, i1 %28, i64 0, !dbg !21 + %42 = shufflevector <4 x i1> %41, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !21 + %43 = insertelement <4 x i64> poison, i64 %31, i64 0, !dbg !22 + %44 = shufflevector <4 x i64> %43, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !22 + %45 = insertelement <4 x i32> poison, i32 %23, i64 0, !dbg !23 + %46 = shufflevector <4 x i32> %45, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !23 + %47 = insertelement <4 x i64> poison, i64 %29, i64 0, !dbg !24 + %48 = shufflevector <4 x i64> %47, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !24 + br label %49, !dbg !17 + +49: ; preds = %10, %49 + %indvars.iv = phi i64 [ 0, %10 ], [ %indvars.iv.next, %49 ] + %50 = phi <4 x i64> [ zeroinitializer, %10 ], [ %100, %49 ] + %51 = or disjoint i64 %indvars.iv, %33, !dbg !25 + %52 = or disjoint i64 %indvars.iv, %34, !dbg !25 + %53 = lshr i64 %51, 7, !dbg !26 + %54 = lshr i64 %52, 7, !dbg !26 + %55 = trunc nuw nsw i64 %54 to i32, !dbg !26 + %56 = or i32 %55, 4, !dbg !26 + %57 = or disjoint i64 %53, 8, !dbg !26 + %58 = or i32 %55, 12, !dbg !26 + %59 = zext nneg i32 %56 to i64, !dbg !19 + %60 = zext nneg i32 %58 to i64, !dbg !19 + %61 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !27 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %61, i1 %27) #5, !dbg !27 + %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !27 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %63, i1 %27) #5, !dbg !27 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !27 + %66 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %65, i1 %27) #5, !dbg !27 + %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !27 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %22, i64 %67, i1 %27) #5, !dbg !27 + %69 = trunc nuw nsw i64 %53 to i32, !dbg !23 + %70 = trunc nuw nsw i64 %57 to i32, !dbg !23 + %71 = insertelement <4 x i64> poison, i64 %53, i64 0, !dbg !19 + %72 = insertelement <4 x i64> %71, i64 %59, i64 1, !dbg !19 + %73 = insertelement <4 x i64> %72, i64 %57, i64 2, !dbg !19 + %74 = insertelement <4 x i64> %73, i64 %60, i64 3, !dbg !19 + %75 = or disjoint <4 x i64> %38, %74, !dbg !19 + %76 = icmp sge <4 x i64> %75, %40, !dbg !20 + %77 = insertelement <4 x i64> poison, i64 %62, i64 0, !dbg !28 + %78 = insertelement <4 x i64> %77, i64 %64, i64 1, !dbg !28 + %79 = insertelement <4 x i64> %78, i64 %66, i64 2, !dbg !28 + %80 = insertelement <4 x i64> %79, i64 %68, i64 3, !dbg !28 + %81 = icmp slt <4 x i64> %40, %80, !dbg !28 + %82 = icmp slt <4 x i64> %75, %80, !dbg !29 + %83 = and <4 x i1> %81, %82, !dbg !30 + %84 = and <4 x i1> %76, %83, !dbg !31 + %85 = icmp slt <4 x i64> %48, %80, !dbg !24 + %86 = insertelement <4 x i32> poison, i32 %69, i64 0, !dbg !23 + %87 = insertelement <4 x i32> %86, i32 %56, i64 1, !dbg !23 + %88 = insertelement <4 x i32> %87, i32 %70, i64 2, !dbg !23 + %89 = insertelement <4 x i32> %88, i32 %58, i64 3, !dbg !23 + %90 = sub nsw <4 x i32> %46, %89, !dbg !23 + %91 = zext <4 x i32> %90 to <4 x i64>, !dbg !23 + %92 = add nsw <4 x i64> %44, %91, !dbg !22 + %93 = and <4 x i64> %92, splat (i64 2047), !dbg !32 + %94 = icmp eq <4 x i64> %93, zeroinitializer, !dbg !33 + %95 = and <4 x i1> %94, %85, !dbg !21 + %96 = and <4 x i1> %42, %95, !dbg !21 + %97 = or <4 x i1> %84, %96, !dbg !34 + %98 = select <4 x i1> %36, <4 x i1> %97, <4 x i1> zeroinitializer, !dbg !18 + %99 = zext <4 x i1> %98 to <4 x i64>, !dbg !18 + %100 = add <4 x i64> %50, %99, !dbg !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !17 + %101 = icmp samesign ult i64 %indvars.iv, 14336, !dbg !17 + br i1 %101, label %49, label %102, !dbg !17 + +102: ; preds = %49 + %103 = and i32 %13, 31, !dbg !9 + %104 = lshr i32 %13, 5, !dbg !9 + %105 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %100), !dbg !35 + %extelt.offset = lshr i64 %105, 32, !dbg !39 + %106 = trunc nuw i64 %extelt.offset to i32, !dbg !39 + %107 = trunc i64 %105 to i32, !dbg !39 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 16, i32 31), !dbg !39 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !39 + %110 = insertelement <2 x i32> poison, i32 %108, i64 0, !dbg !39 + %111 = insertelement <2 x i32> %110, i32 %109, i64 1, !dbg !39 + %112 = bitcast <2 x i32> %111 to i64, !dbg !39 + %113 = add i64 %105, %112, !dbg !35 + %extelt.offset1 = lshr i64 %113, 32, !dbg !39 + %114 = trunc nuw i64 %extelt.offset1 to i32, !dbg !39 + %115 = trunc i64 %113 to i32, !dbg !39 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 8, i32 31), !dbg !39 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 8, i32 31), !dbg !39 + %118 = insertelement <2 x i32> poison, i32 %116, i64 0, !dbg !39 + %119 = insertelement <2 x i32> %118, i32 %117, i64 1, !dbg !39 + %120 = bitcast <2 x i32> %119 to i64, !dbg !39 + %121 = add i64 %113, %120, !dbg !35 + %extelt.offset2 = lshr i64 %121, 32, !dbg !39 + %122 = trunc nuw i64 %extelt.offset2 to i32, !dbg !39 + %123 = trunc i64 %121 to i32, !dbg !39 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 4, i32 31), !dbg !39 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 4, i32 31), !dbg !39 + %126 = insertelement <2 x i32> poison, i32 %124, i64 0, !dbg !39 + %127 = insertelement <2 x i32> %126, i32 %125, i64 1, !dbg !39 + %128 = bitcast <2 x i32> %127 to i64, !dbg !39 + %129 = add i64 %121, %128, !dbg !35 + %extelt.offset3 = lshr i64 %129, 32, !dbg !39 + %130 = trunc nuw i64 %extelt.offset3 to i32, !dbg !39 + %131 = trunc i64 %129 to i32, !dbg !39 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !39 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %130, i32 2, i32 31), !dbg !39 + %134 = insertelement <2 x i32> poison, i32 %132, i64 0, !dbg !39 + %135 = insertelement <2 x i32> %134, i32 %133, i64 1, !dbg !39 + %136 = bitcast <2 x i32> %135 to i64, !dbg !39 + %137 = add i64 %129, %136, !dbg !35 + %extelt.offset4 = lshr i64 %137, 32, !dbg !39 + %138 = trunc nuw i64 %extelt.offset4 to i32, !dbg !39 + %139 = trunc i64 %137 to i32, !dbg !39 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 1, i32 31), !dbg !39 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 1, i32 31), !dbg !39 + %142 = insertelement <2 x i32> poison, i32 %140, i64 0, !dbg !39 + %143 = insertelement <2 x i32> %142, i32 %141, i64 1, !dbg !39 + %144 = bitcast <2 x i32> %143 to i64, !dbg !39 + %145 = add i64 %137, %144, !dbg !35 + %146 = and i32 %104, 15, !dbg !39 + %147 = icmp eq i32 %103, 0, !dbg !39 + %148 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %146, !dbg !39 + %149 = insertelement <1 x i64> poison, i64 %145, i64 0, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %148, <1 x i64> %149, i1 %147) #5, !dbg !39 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !39 + %150 = icmp samesign ult i32 %13, 16, !dbg !39 + %151 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %13, !dbg !39 + %152 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %151, i1 %150) #5, !dbg !39 + %extelt.offset5 = lshr i64 %152, 32, !dbg !39 + %153 = trunc nuw i64 %extelt.offset5 to i32, !dbg !39 + %154 = trunc i64 %152 to i32, !dbg !39 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 8, i32 31), !dbg !39 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 8, i32 31), !dbg !39 + %157 = insertelement <2 x i32> poison, i32 %155, i64 0, !dbg !39 + %158 = insertelement <2 x i32> %157, i32 %156, i64 1, !dbg !39 + %159 = bitcast <2 x i32> %158 to i64, !dbg !39 + %160 = add i64 %152, %159, !dbg !35 + %extelt.offset6 = lshr i64 %160, 32, !dbg !39 + %161 = trunc nuw i64 %extelt.offset6 to i32, !dbg !39 + %162 = trunc i64 %160 to i32, !dbg !39 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 4, i32 31), !dbg !39 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !39 + %165 = insertelement <2 x i32> poison, i32 %163, i64 0, !dbg !39 + %166 = insertelement <2 x i32> %165, i32 %164, i64 1, !dbg !39 + %167 = bitcast <2 x i32> %166 to i64, !dbg !39 + %168 = add i64 %160, %167, !dbg !35 + %extelt.offset7 = lshr i64 %168, 32, !dbg !39 + %169 = trunc nuw i64 %extelt.offset7 to i32, !dbg !39 + %170 = trunc i64 %168 to i32, !dbg !39 + %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 2, i32 31), !dbg !39 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !39 + %173 = insertelement <2 x i32> poison, i32 %171, i64 0, !dbg !39 + %174 = insertelement <2 x i32> %173, i32 %172, i64 1, !dbg !39 + %175 = bitcast <2 x i32> %174 to i64, !dbg !39 + %176 = add i64 %168, %175, !dbg !35 + %extelt.offset8 = lshr i64 %176, 32, !dbg !39 + %177 = trunc nuw i64 %extelt.offset8 to i32, !dbg !39 + %178 = trunc i64 %176 to i32, !dbg !39 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !39 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !39 + %181 = insertelement <2 x i32> poison, i32 %179, i64 0, !dbg !39 + %182 = insertelement <2 x i32> %181, i32 %180, i64 1, !dbg !39 + %183 = bitcast <2 x i32> %182 to i64, !dbg !39 + %184 = add i64 %176, %183, !dbg !35 + %185 = icmp eq i32 %13, 0, !dbg !39 + %186 = insertelement <1 x i64> poison, i64 %184, i64 0, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %151, <1 x i64> %186, i1 %185) #5, !dbg !39 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !39 + %187 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !39 + %188 = add i64 %187, -1, !dbg !40 + %189 = icmp ult i64 %188, 16383, !dbg !40 + %190 = zext i1 %189 to i32, !dbg !41 + %191 = icmp eq i64 %187, 16384, !dbg !42 + %192 = zext i1 %191 to i32, !dbg !41 + %193 = getelementptr i32, ptr addrspace(1) %1, i64 %15, !dbg !43 + %194 = icmp eq i32 %14, 0, !dbg !44 + %195 = and i1 %194, %12, !dbg !44 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %190, ptr addrspace(1) %193, i1 %195) #5, !dbg !44 + %196 = getelementptr i32, ptr addrspace(1) %2, i64 %15, !dbg !45 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %192, ptr addrspace(1) %196, i1 %195) #5, !dbg !46 + ret void, !dbg !47 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", linkageName: "triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 21, scope: !4) +!12 = !DILocation(line: 28, column: 28, scope: !4) +!13 = !DILocation(line: 29, column: 19, scope: !4) +!14 = !DILocation(line: 39, column: 26, scope: !4) +!15 = !DILocation(line: 42, column: 26, scope: !4) +!16 = !DILocation(line: 45, column: 34, scope: !4) +!17 = !DILocation(line: 32, column: 40, scope: !4) +!18 = !DILocation(line: 76, column: 50, scope: !4) +!19 = !DILocation(line: 42, column: 22, scope: !4) +!20 = !DILocation(line: 44, column: 23, scope: !4) +!21 = !DILocation(line: 69, column: 24, scope: !4) +!22 = !DILocation(line: 57, column: 51, scope: !4) +!23 = !DILocation(line: 57, column: 38, scope: !4) +!24 = !DILocation(line: 55, column: 24, scope: !4) +!25 = !DILocation(line: 33, column: 31, scope: !4) +!26 = !DILocation(line: 38, column: 27, scope: !4) +!27 = !DILocation(line: 45, column: 76, scope: !4) +!28 = !DILocation(line: 46, column: 22, scope: !4) +!29 = !DILocation(line: 47, column: 22, scope: !4) +!30 = !DILocation(line: 48, column: 22, scope: !4) +!31 = !DILocation(line: 49, column: 23, scope: !4) +!32 = !DILocation(line: 66, column: 39, scope: !4) +!33 = !DILocation(line: 68, column: 25, scope: !4) +!34 = !DILocation(line: 70, column: 24, scope: !4) +!35 = !DILocation(line: 261, column: 15, scope: !36, inlinedAt: !38) +!36 = distinct !DILexicalBlockFile(scope: !4, file: !37, discriminator: 0) +!37 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!38 = !DILocation(line: 77, column: 27, scope: !4) +!39 = !DILocation(line: 291, column: 36, scope: !36, inlinedAt: !38) +!40 = !DILocation(line: 82, column: 20, scope: !4) +!41 = !DILocation(line: 0, scope: !4) +!42 = !DILocation(line: 85, column: 21, scope: !4) +!43 = !DILocation(line: 88, column: 25, scope: !4) +!44 = !DILocation(line: 88, column: 37, scope: !4) +!45 = !DILocation(line: 89, column: 25, scope: !4) +!46 = !DILocation(line: 89, column: 37, scope: !4) +!47 = !DILocation(line: 89, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f8decea442812913c9a69b2ce9bd3c4ae044fb8d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ptx @@ -0,0 +1,736 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 // -- Begin function triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1 +.visible .entry triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4, + .param .u64 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6, + .param .u32 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_9 +) +.reqntid 512 +{ + .reg .pred %p<68>; + .reg .b32 %r<83>; + .reg .b64 %rd<168>; + .loc 1 18 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:18:0 + +// %bb.0: + ld.param.b64 %rd45, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_5]; +$L__tmp0: + .loc 1 22 28 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:22:28 + mov.u32 %r11, %ctaid.x; + .loc 1 25 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:25:37 + mov.u32 %r1, %tid.x; + ld.param.b64 %rd46, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_3]; + .loc 1 27 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:27:19 + cvt.u64.u32 %rd1, %r11; + .loc 1 28 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:28:21 + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p10, %rd47, 0; + cvt.u32.u64 %r81, %rd1; + @%p10 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd160, %rd1, %rd46; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r12, %rd46; + div.u32 %r14, %r81, %r12; + cvt.u64.u32 %rd160, %r14; +$L__BB0_3: + .loc 1 0 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0:21 + ld.param.b32 %r10, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_6]; + ld.param.b64 %rd44, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_4]; + ld.param.b64 %rd41, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_0]; + and.b32 %r2, %r1, 511; + .loc 1 27 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:27:19 + mul.lo.s64 %rd48, %rd160, %rd46; + sub.s64 %rd6, %rd1, %rd48; + .loc 1 28 28 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:28:28 + cvt.u32.u64 %r15, %rd160; + shr.s32 %r16, %r15, 31; + shr.u32 %r17, %r16, 28; + add.s32 %r18, %r15, %r17; + and.b32 %r19, %r18, -16; + sub.s32 %r20, %r15, %r19; + cvt.s64.s32 %rd7, %r20; + .loc 1 29 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:29:19 + and.b64 %rd49, %rd45, -4294967296; + setp.ne.b64 %p11, %rd49, 0; + @%p11 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd161, %rd1, %rd45; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r21, %rd45; + div.u32 %r23, %r81, %r21; + cvt.u64.u32 %rd161, %r23; +$L__BB0_6: + .loc 1 0 19 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0:19 + ld.param.b64 %rd43, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_2]; + ld.param.b64 %rd42, [triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1_param_1]; + .loc 1 24 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:24:21 + setp.lt.s32 %p12, %r81, %r10; + .loc 1 39 26 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:39:26 + shl.b64 %rd55, %rd6, 7; + .loc 1 42 26 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:26 + shl.b64 %rd12, %rd7, 7; + .loc 1 45 34 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:45:34 + shl.b64 %rd56, %rd161, 3; + add.s64 %rd65, %rd41, %rd56; + and.b32 %r3, %r1, 127; + cvt.u64.u32 %rd57, %r3; + or.b64 %rd19, %rd55, %rd57; + setp.lt.s64 %p13, %rd19, %rd44; + and.pred %p14, %p12, %p13; + setp.gt.u64 %p6, %rd6, 15; + and.b64 %rd24, %rd19, 2047; + sub.s64 %rd59, %rd6, %rd7; + shl.b64 %rd20, %rd59, 7; + .loc 1 32 40 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:32:40 + cvt.u64.u32 %rd60, %r2; + cvt.u64.u32 %rd61, %r1; + shr.u64 %rd62, %rd61, 7; + cvt.u32.u64 %r82, %rd62; + shr.u64 %rd162, %rd60, 7; + mov.b64 %rd164, 0; + mov.b64 %rd163, -2048; + mov.b64 %rd165, %rd164; + mov.b64 %rd166, %rd164; + mov.b64 %rd167, %rd164; +$L__BB0_7: // =>This Inner Loop Header: Depth=1 + .loc 1 38 27 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:38:27 + or.b32 %r25, %r82, 4; + add.s64 %rd79, %rd162, 8; + or.b32 %r26, %r82, 12; + .loc 1 42 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:22 + cvt.u64.u32 %rd80, %r25; + cvt.u64.u32 %rd81, %r26; + .loc 1 45 76 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:45:76 + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd64, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd64 }, [ %rd65 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd68, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd68 }, [ %rd65 + 0 ], %rd67; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd72 }, [ %rd65 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd76, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd76 }, [ %rd65 + 0 ], %rd75; + // end inline asm + .loc 1 42 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:42:22 + or.b64 %rd82, %rd12, %rd80; + or.b64 %rd83, %rd12, %rd162; + or.b64 %rd84, %rd12, %rd79; + or.b64 %rd85, %rd12, %rd81; + .loc 1 44 23 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:44:23 + setp.ge.s64 %p18, %rd85, %rd19; + setp.ge.s64 %p19, %rd84, %rd19; + setp.ge.s64 %p20, %rd83, %rd19; + setp.ge.s64 %p21, %rd82, %rd19; + .loc 1 48 22 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:48:22 + max.s64 %rd86, %rd19, %rd85; + setp.lt.s64 %p22, %rd86, %rd76; + max.s64 %rd87, %rd19, %rd84; + setp.lt.s64 %p23, %rd87, %rd72; + max.s64 %rd88, %rd19, %rd83; + setp.lt.s64 %p24, %rd88, %rd64; + max.s64 %rd89, %rd19, %rd82; + setp.lt.s64 %p25, %rd89, %rd68; + .loc 1 49 23 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:49:23 + and.pred %p26, %p21, %p25; + and.pred %p27, %p20, %p24; + and.pred %p28, %p19, %p23; + and.pred %p29, %p18, %p22; + .loc 1 55 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:55:24 + setp.lt.s64 %p30, %rd24, %rd68; + setp.lt.s64 %p31, %rd24, %rd64; + setp.lt.s64 %p32, %rd24, %rd72; + setp.lt.s64 %p33, %rd24, %rd76; + cvt.u32.u64 %r27, %rd162; + cvt.u32.u64 %r28, %rd79; + .loc 1 57 38 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:57:38 + sub.s32 %r29, %r3, %r25; + sub.s32 %r30, %r3, %r27; + sub.s32 %r31, %r3, %r28; + sub.s32 %r32, %r3, %r26; + cvt.u64.u32 %rd90, %r32; + cvt.u64.u32 %rd91, %r31; + cvt.u64.u32 %rd92, %r30; + cvt.u64.u32 %rd93, %r29; + .loc 1 57 51 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:57:51 + add.s64 %rd94, %rd20, %rd93; + add.s64 %rd95, %rd20, %rd92; + add.s64 %rd96, %rd20, %rd91; + add.s64 %rd97, %rd20, %rd90; + .loc 1 66 39 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:66:39 + and.b64 %rd98, %rd97, 2047; + and.b64 %rd99, %rd96, 2047; + and.b64 %rd100, %rd95, 2047; + and.b64 %rd101, %rd94, 2047; + .loc 1 68 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:68:25 + setp.eq.b64 %p34, %rd101, 0; + setp.eq.b64 %p35, %rd100, 0; + setp.eq.b64 %p36, %rd99, 0; + setp.eq.b64 %p37, %rd98, 0; + .loc 1 69 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:69:24 + and.pred %p38, %p37, %p33; + and.pred %p39, %p36, %p32; + and.pred %p40, %p35, %p31; + and.pred %p41, %p34, %p30; + and.pred %p42, %p6, %p41; + and.pred %p43, %p6, %p40; + and.pred %p44, %p6, %p39; + and.pred %p45, %p6, %p38; + .loc 1 70 24 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:70:24 + or.pred %p46, %p29, %p45; + or.pred %p48, %p28, %p44; + or.pred %p50, %p27, %p43; + or.pred %p52, %p26, %p42; + .loc 1 76 50 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:76:50 + and.pred %p54, %p14, %p52; + and.pred %p55, %p14, %p50; + and.pred %p56, %p14, %p48; + and.pred %p57, %p14, %p46; + selp.b64 %rd102, 1, 0, %p57; + selp.b64 %rd103, 1, 0, %p56; + selp.b64 %rd104, 1, 0, %p55; + selp.b64 %rd105, 1, 0, %p54; + add.s64 %rd165, %rd165, %rd105; + add.s64 %rd164, %rd164, %rd104; + add.s64 %rd166, %rd166, %rd103; + add.s64 %rd167, %rd167, %rd102; + .loc 1 32 40 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:32:40 + add.s64 %rd163, %rd163, 2048; + add.s32 %r82, %r82, 16; + add.s64 %rd162, %rd162, 16; + setp.lt.u64 %p58, %rd163, 14336; + @%p58 bra $L__BB0_7; +// %bb.8: + .loc 1 25 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:25:37 + and.b32 %r39, %r1, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd111, %rd164, %rd166; + add.s64 %rd112, %rd165, %rd167; + add.s64 %rd113, %rd111, %rd112; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r40}, %rd113; + cvt.u32.u64 %r41, %rd113; + shfl.sync.bfly.b32 %r42, %r41, 16, 31, -1; + shfl.sync.bfly.b32 %r43, %r40, 16, 31, -1; + cvt.u64.u32 %rd114, %r42; + cvt.u64.u32 %rd115, %r43; + shl.b64 %rd116, %rd115, 32; + or.b64 %rd117, %rd114, %rd116; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd118, %rd113, %rd117; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r44}, %rd118; + cvt.u32.u64 %r45, %rd118; + shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1; + shfl.sync.bfly.b32 %r47, %r44, 8, 31, -1; + cvt.u64.u32 %rd119, %r46; + cvt.u64.u32 %rd120, %r47; + shl.b64 %rd121, %rd120, 32; + or.b64 %rd122, %rd119, %rd121; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd123, %rd118, %rd122; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r48}, %rd123; + cvt.u32.u64 %r49, %rd123; + shfl.sync.bfly.b32 %r50, %r49, 4, 31, -1; + shfl.sync.bfly.b32 %r51, %r48, 4, 31, -1; + cvt.u64.u32 %rd124, %r50; + cvt.u64.u32 %rd125, %r51; + shl.b64 %rd126, %rd125, 32; + or.b64 %rd127, %rd124, %rd126; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd128, %rd123, %rd127; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r52}, %rd128; + cvt.u32.u64 %r53, %rd128; + shfl.sync.bfly.b32 %r54, %r53, 2, 31, -1; + shfl.sync.bfly.b32 %r55, %r52, 2, 31, -1; + cvt.u64.u32 %rd129, %r54; + cvt.u64.u32 %rd130, %r55; + shl.b64 %rd131, %rd130, 32; + or.b64 %rd132, %rd129, %rd131; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd133, %rd128, %rd132; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r56}, %rd133; + cvt.u32.u64 %r57, %rd133; + shfl.sync.bfly.b32 %r58, %r57, 1, 31, -1; + shfl.sync.bfly.b32 %r59, %r56, 1, 31, -1; + cvt.u64.u32 %rd134, %r58; + cvt.u64.u32 %rd135, %r59; + shl.b64 %rd136, %rd135, 32; + or.b64 %rd137, %rd134, %rd136; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd106, %rd133, %rd137; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + setp.eq.b32 %p59, %r39, 0; + shr.u32 %r60, %r1, 2; + and.b32 %r61, %r60, 120; + mov.b32 %r62, global_smem; + add.s32 %r33, %r62, %r61; + // begin inline asm + @%p59 st.shared.b64 [ %r33 + 0 ], %rd106; + // end inline asm + bar.sync 0; + setp.lt.u32 %p60, %r1, 16; + shl.b32 %r63, %r1, 3; + add.s32 %r34, %r62, %r63; + // begin inline asm + @%p60 ld.shared.b64 %rd107, [ %r34 + 0 ]; + // end inline asm + mov.b64 {_, %r64}, %rd107; + cvt.u32.u64 %r65, %rd107; + shfl.sync.bfly.b32 %r66, %r65, 8, 31, -1; + shfl.sync.bfly.b32 %r67, %r64, 8, 31, -1; + cvt.u64.u32 %rd138, %r66; + cvt.u64.u32 %rd139, %r67; + shl.b64 %rd140, %rd139, 32; + or.b64 %rd141, %rd138, %rd140; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd142, %rd107, %rd141; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r68}, %rd142; + cvt.u32.u64 %r69, %rd142; + shfl.sync.bfly.b32 %r70, %r69, 4, 31, -1; + shfl.sync.bfly.b32 %r71, %r68, 4, 31, -1; + cvt.u64.u32 %rd143, %r70; + cvt.u64.u32 %rd144, %r71; + shl.b64 %rd145, %rd144, 32; + or.b64 %rd146, %rd143, %rd145; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd147, %rd142, %rd146; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r72}, %rd147; + cvt.u32.u64 %r73, %rd147; + shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1; + shfl.sync.bfly.b32 %r75, %r72, 2, 31, -1; + cvt.u64.u32 %rd148, %r74; + cvt.u64.u32 %rd149, %r75; + shl.b64 %rd150, %rd149, 32; + or.b64 %rd151, %rd148, %rd150; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd152, %rd147, %rd151; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + mov.b64 {_, %r76}, %rd152; + cvt.u32.u64 %r77, %rd152; + shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1; + shfl.sync.bfly.b32 %r79, %r76, 1, 31, -1; + cvt.u64.u32 %rd153, %r78; + cvt.u64.u32 %rd154, %r79; + shl.b64 %rd155, %rd154, 32; + or.b64 %rd156, %rd153, %rd155; + .loc 2 261 15 // standard.py:261:15 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + add.s64 %rd108, %rd152, %rd156; + .loc 2 291 36 // standard.py:291:36 @[ cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:77:27 ] + setp.eq.b32 %p61, %r1, 0; + // begin inline asm + @%p61 st.shared.b64 [ %r34 + 0 ], %rd108; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd157, [global_smem]; +$L__tmp2: + .loc 1 82 20 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:82:20 + add.s64 %rd158, %rd157, -1; + setp.lt.u64 %p65, %rd158, 16383; + .loc 1 0 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0 + selp.b32 %r36, 1, 0, %p65; + .loc 1 85 21 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:85:21 + setp.eq.b64 %p66, %rd157, 16384; + .loc 1 0 0 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:0 + selp.b32 %r37, 1, 0, %p66; + .loc 1 88 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:88:25 + shl.b64 %rd159, %rd1, 2; + add.s64 %rd109, %rd42, %rd159; + .loc 1 88 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:88:37 + setp.eq.b32 %p67, %r2, 0; + and.pred %p62, %p67, %p12; + // begin inline asm + @%p62 st.global.b32 [ %rd109 + 0 ], { %r36 }; + // end inline asm + .loc 1 89 25 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:25 + add.s64 %rd110, %rd43, %rd159; + .loc 1 89 37 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:37 + // begin inline asm + @%p62 st.global.b32 [ %rd110 + 0 ], { %r37 }; + // end inline asm + .loc 1 89 4 // cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py:89:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 307 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x12c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 105 +.b8 103 +.b8 101 +.b8 121 +.b8 110 +.b8 107 +.b8 109 +.b8 97 +.b8 109 +.b8 105 +.b8 114 +.b8 122 +.b8 114 +.b8 97 +.b8 53 +.b8 111 +.b8 99 +.b8 100 +.b8 101 +.b8 107 +.b8 52 +.b8 118 +.b8 102 +.b8 101 +.b8 51 +.b8 105 +.b8 100 +.b8 110 +.b8 104 +.b8 50 +.b8 107 +.b8 114 +.b8 50 +.b8 98 +.b8 102 +.b8 115 +.b8 99 +.b8 98 +.b8 120 +.b8 116 +.b8 105 +.b8 105 +.b8 109 +.b8 51 +.b8 114 +.b8 113 +.b8 53 +.b8 100 +.b8 102 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 98 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 99 +.b8 111 +.b8 110 +.b8 115 +.b8 116 +.b8 97 +.b8 110 +.b8 116 +.b8 95 +.b8 112 +.b8 97 +.b8 100 +.b8 95 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x108:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11d:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 77 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source new file mode 100644 index 0000000000000000000000000000000000000000..657b226042b434b9e99202f494479320717a7a81 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.source @@ -0,0 +1,395 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc91 = loc(unknown) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc98 = loc("in_ptr0"(#loc)) +#loc99 = loc("out_ptr1"(#loc)) +#loc100 = loc("out_ptr2"(#loc)) +#loc101 = loc("ks0"(#loc)) +#loc102 = loc("ks1"(#loc)) +#loc103 = loc("ks2"(#loc)) +#loc104 = loc("xnumel"(#loc)) +#loc105 = loc("r0_numel"(#loc)) +#loc188 = loc("input"(#loc89)) +#loc189 = loc("a"(#loc94)) +#loc190 = loc("b"(#loc94)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16384 : i32 loc(#loc106) + %xoffset = tt.get_program_id x : i32 loc(#loc107) + %xoffset_1 = arith.constant 1 : i32 loc(#loc108) + %xoffset_2 = arith.constant 1 : i32 loc(#loc108) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc108) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc109) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc110) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc111) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc111) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc112) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc112) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc113) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc114) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc115) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc115) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc115) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc116) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc116) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc116) + %x1_13 = arith.constant 16 : i32 loc(#loc117) + %x1_14 = arith.constant 16 : i64 loc(#loc117) + %x1_15 = arith.constant dense<16> : tensor<1x1xi64> loc(#loc117) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1x1xi64> loc(#loc117) + %x2 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc118) + %x2_17 = tt.splat %ks2 : i64 -> tensor<1x1xi64> loc(#loc118) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<1x1xi64> loc(#loc118) + %_tmp36 = arith.constant 0 : i64 loc(#loc119) + %_tmp36_19 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc119) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp36_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp36_24 = %_tmp36_19) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc121) + %r0_index_25 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc121) + %r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc122) + %r0_mask_26 = arith.cmpi slt, %r0_index_25, %r0_mask : tensor<1x2048xi32> loc(#loc122) + %r0_3 = arith.constant 128 : i32 loc(#loc123) + %r0_3_27 = arith.constant 128 : i32 loc(#loc123) + %r0_3_28 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc123) + %r0_3_29 = arith.remsi %r0_index_25, %r0_3_28 : tensor<1x2048xi32> loc(#loc123) + %r0_4 = arith.constant 128 : i32 loc(#loc124) + %r0_4_30 = arith.constant 128 : i32 loc(#loc124) + %r0_4_31 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc124) + %r0_4_32 = arith.divsi %r0_index_25, %r0_4_31 : tensor<1x2048xi32> loc(#loc124) + %tmp0 = arith.constant 128 : i32 loc(#loc125) + %tmp0_33 = arith.constant 128 : i64 loc(#loc125) + %tmp0_34 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc125) + %tmp0_35 = arith.muli %tmp0_34, %x0_10 : tensor<1x1xi64> loc(#loc125) + %tmp0_36 = arith.extsi %r0_3_29 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc126) + %tmp0_37 = tt.broadcast %tmp0_35 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc126) + %tmp0_38 = arith.addi %tmp0_36, %tmp0_37 : tensor<1x2048xi64> loc(#loc126) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x2048xi64> loc(#loc127) + %tmp2_39 = arith.cmpi slt, %tmp0_38, %tmp2 : tensor<1x2048xi64> loc(#loc127) + %tmp3 = arith.constant 128 : i32 loc(#loc128) + %tmp3_40 = arith.constant 128 : i64 loc(#loc128) + %tmp3_41 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc128) + %tmp3_42 = arith.muli %tmp3_41, %x1_16 : tensor<1x1xi64> loc(#loc128) + %tmp3_43 = arith.extsi %r0_4_32 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc129) + %tmp3_44 = tt.broadcast %tmp3_42 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc129) + %tmp3_45 = arith.addi %tmp3_43, %tmp3_44 : tensor<1x2048xi64> loc(#loc129) + %tmp4 = arith.constant 128 : i32 loc(#loc130) + %tmp4_46 = arith.constant 128 : i64 loc(#loc130) + %tmp4_47 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc130) + %tmp4_48 = arith.muli %tmp4_47, %x0_10 : tensor<1x1xi64> loc(#loc130) + %tmp4_49 = arith.extsi %r0_3_29 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc131) + %tmp4_50 = tt.broadcast %tmp4_48 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc131) + %tmp4_51 = arith.addi %tmp4_49, %tmp4_50 : tensor<1x2048xi64> loc(#loc131) + %tmp5 = arith.cmpi sge, %tmp3_45, %tmp4_51 : tensor<1x2048xi64> loc(#loc132) + %tmp6 = tt.broadcast %x2_18 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc133) + %tmp6_52 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc134) + %tmp6_53 = tt.addptr %tmp6_52, %tmp6 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc134) + %tmp6_54 = arith.andi %r0_mask_26, %tmp2_39 : tensor<1x2048xi1> loc(#loc135) + %tmp6_55 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc136) + %tmp6_56 = arith.andi %tmp6_54, %tmp6_55 : tensor<1x2048xi1> loc(#loc136) + %tmp6_57 = arith.constant 0.000000e+00 : f32 loc(#loc137) + %tmp6_58 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc137) + %tmp6_59 = arith.fptosi %tmp6_58 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc137) + %tmp6_60 = tt.load %tmp6_53, %tmp6_56, %tmp6_59 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc137) + %tmp7 = arith.cmpi slt, %tmp4_51, %tmp6_60 : tensor<1x2048xi64> loc(#loc138) + %tmp8 = arith.cmpi slt, %tmp3_45, %tmp6_60 : tensor<1x2048xi64> loc(#loc139) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x2048xi1> loc(#loc140) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x2048xi1> loc(#loc141) + %tmp11 = arith.constant false loc(#loc142) + %tmp11_61 = arith.constant dense : tensor<1x1xi1> loc(#loc142) + %tmp12 = arith.constant dense : tensor<1x2048xi1> loc(#loc143) + %tmp12_62 = arith.ori %tmp12, %tmp10 : tensor<1x2048xi1> loc(#loc143) + %tmp13 = arith.constant 2048 : i64 loc(#loc144) + %tmp13_63 = arith.constant dense<2048> : tensor<1x1xi64> loc(#loc144) + %tmp14 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc145) + %tmp14_64 = arith.cmpi sge, %tmp4_51, %tmp14 : tensor<1x2048xi64> loc(#loc145) + %tmp15 = arith.constant 128 : i32 loc(#loc146) + %tmp15_65 = arith.constant 128 : i64 loc(#loc146) + %tmp15_66 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc146) + %tmp15_67 = arith.muli %tmp15_66, %x0_10 : tensor<1x1xi64> loc(#loc146) + %tmp15_68 = arith.extsi %r0_3_29 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc147) + %tmp15_69 = tt.broadcast %tmp15_67 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc147) + %tmp15_70 = arith.addi %tmp15_68, %tmp15_69 : tensor<1x2048xi64> loc(#loc147) + %tmp15_71 = arith.constant 2048 : i32 loc(#loc148) + %tmp15_72 = arith.constant 2048 : i64 loc(#loc148) + %tmp15_73 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc148) + %tmp15_74 = arith.remsi %tmp15_70, %tmp15_73 : tensor<1x2048xi64> loc(#loc148) + %tmp16 = arith.cmpi slt, %tmp15_74, %tmp6_60 : tensor<1x2048xi64> loc(#loc149) + %tmp17 = arith.andi %tmp14_64, %tmp16 : tensor<1x2048xi1> loc(#loc150) + %tmp18 = arith.constant -1 : i32 loc(#loc151) + %tmp18_75 = arith.constant -1 : i32 loc(#loc151) + %tmp18_76 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc151) + %tmp18_77 = arith.muli %tmp18_76, %r0_4_32 : tensor<1x2048xi32> loc(#loc151) + %tmp18_78 = arith.addi %r0_3_29, %tmp18_77 : tensor<1x2048xi32> loc(#loc152) + %tmp18_79 = arith.constant -128 : i32 loc(#loc153) + %tmp18_80 = arith.constant -128 : i64 loc(#loc153) + %tmp18_81 = arith.constant dense<-128> : tensor<1x1xi64> loc(#loc153) + %tmp18_82 = arith.muli %tmp18_81, %x1_16 : tensor<1x1xi64> loc(#loc153) + %tmp18_83 = arith.extsi %tmp18_78 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc154) + %tmp18_84 = tt.broadcast %tmp18_82 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc154) + %tmp18_85 = arith.addi %tmp18_83, %tmp18_84 : tensor<1x2048xi64> loc(#loc154) + %tmp18_86 = arith.constant 128 : i32 loc(#loc155) + %tmp18_87 = arith.constant 128 : i64 loc(#loc155) + %tmp18_88 = arith.constant dense<128> : tensor<1x1xi64> loc(#loc155) + %tmp18_89 = arith.muli %tmp18_88, %x0_10 : tensor<1x1xi64> loc(#loc155) + %tmp18_90 = tt.broadcast %tmp18_89 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc156) + %tmp18_91 = arith.addi %tmp18_85, %tmp18_90 : tensor<1x2048xi64> loc(#loc156) + %tmp19 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc157) + %tmp19_92 = arith.remsi %tmp18_91, %tmp19 : tensor<1x2048xi64> loc(#loc157) + %tmp20 = arith.constant 0 : i32 loc(#loc158) + %tmp20_93 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc158) + %tmp21 = arith.extsi %tmp20_93 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc159) + %tmp21_94 = tt.broadcast %tmp21 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc159) + %tmp21_95 = arith.cmpi ne, %tmp19_92, %tmp21_94 : tensor<1x2048xi64> loc(#loc159) + %tmp22 = arith.constant 0 : i32 loc(#loc160) + %tmp22_96 = arith.extsi %tmp22 : i32 to i64 loc(#loc160) + %tmp22_97 = tt.splat %tmp22_96 : i64 -> tensor<1x2048xi64> loc(#loc160) + %tmp22_98 = arith.cmpi slt, %tmp19_92, %tmp22_97 : tensor<1x2048xi64> loc(#loc160) + %tmp23 = arith.constant 0 : i32 loc(#loc161) + %tmp23_99 = arith.extsi %tmp23 : i32 to i64 loc(#loc161) + %tmp23_100 = tt.splat %tmp23_99 : i64 -> tensor<1x1xi64> loc(#loc161) + %tmp23_101 = arith.cmpi slt, %tmp13_63, %tmp23_100 : tensor<1x1xi64> loc(#loc161) + %tmp24 = tt.broadcast %tmp23_101 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc162) + %tmp24_102 = arith.cmpi ne, %tmp22_98, %tmp24 : tensor<1x2048xi1> loc(#loc162) + %tmp25 = arith.andi %tmp21_95, %tmp24_102 : tensor<1x2048xi1> loc(#loc163) + %tmp26 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc164) + %tmp26_103 = arith.addi %tmp19_92, %tmp26 : tensor<1x2048xi64> loc(#loc164) + %tmp27 = arith.select %tmp25, %tmp26_103, %tmp19_92 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc165) + %tmp28 = arith.constant 0 : i64 loc(#loc166) + %tmp28_104 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc166) + %tmp29 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc167) + %tmp29_105 = arith.cmpi eq, %tmp27, %tmp29 : tensor<1x2048xi64> loc(#loc167) + %tmp30 = arith.andi %tmp17, %tmp29_105 : tensor<1x2048xi1> loc(#loc168) + %tmp31 = arith.ori %tmp12_62, %tmp30 : tensor<1x2048xi1> loc(#loc169) + %tmp32 = arith.constant false loc(#loc170) + %tmp32_106 = arith.constant dense : tensor<1x2048xi1> loc(#loc170) + %tmp33 = arith.select %tmp2_39, %tmp31, %tmp32_106 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc171) + %tmp34 = arith.extui %tmp33 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc172) + %tmp37 = arith.addi %_tmp36_24, %tmp34 : tensor<1x2048xi64> loc(#loc173) + %_tmp36_107 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc174) + %_tmp36_108 = arith.andi %r0_mask_26, %_tmp36_107 : tensor<1x2048xi1> loc(#loc174) + %_tmp36_109 = arith.select %_tmp36_108, %tmp37, %_tmp36_24 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc175) + scf.yield %_tmp36_109 : tensor<1x2048xi64> loc(#loc71) + } loc(#loc120) + %tmp36 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp36_20) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc176) + %tmp36_21 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc177) + %tmp38 = arith.constant 0 : i64 loc(#loc178) + %tmp38_22 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc178) + %tmp39 = arith.cmpi sgt, %tmp36_21, %tmp38_22 : tensor<1x1xi64> loc(#loc179) + %tmp40 = arith.constant 16384 : i64 loc(#loc180) + %tmp40_23 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc180) + %tmp41 = arith.cmpi slt, %tmp36_21, %tmp40_23 : tensor<1x1xi64> loc(#loc181) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1> loc(#loc182) + %tmp43 = arith.extui %tmp42 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc183) + %tmp44 = arith.extsi %tmp43 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc184) + %tmp45 = arith.cmpi eq, %tmp36_21, %tmp40_23 : tensor<1x1xi64> loc(#loc185) + %tmp46 = arith.extui %tmp45 : tensor<1x1xi1> to tensor<1x1xi8> loc(#loc186) + %tmp47 = arith.extsi %tmp46 : tensor<1x1xi8> to tensor<1x1xi32> loc(#loc187) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc84) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc84) + tt.store %5, %tmp44, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc85) + %6 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc86) + %7 = tt.addptr %6, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc86) + tt.store %7, %tmp47, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc87) + tt.return loc(#loc88) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc89))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc90) + tt.reduce.return %2 : i64 loc(#loc90) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc90) + tt.return %0 : tensor<1xi64> loc(#loc92) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc93) + tt.return %1 : tensor<1xi64> loc(#loc93) + } loc(#loc89) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc94)), %b: i64 loc("b"(#loc94))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc95) + tt.return %0 : i64 loc(#loc96) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc97) + tt.return %1 : i64 loc(#loc97) + } loc(#loc94) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":30:44) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":43:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":43:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":50:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":51:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":52:38) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:55) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":59:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":62:92) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":63:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":67:35) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":71:44) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":78:31) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":80:35) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc106 = loc("r0_numel"(#loc1)) +#loc107 = loc("xoffset"(#loc2)) +#loc108 = loc("xoffset"(#loc3)) +#loc109 = loc("xindex"(#loc4)) +#loc110 = loc("xindex"(#loc5)) +#loc111 = loc("xindex"(#loc6)) +#loc112 = loc("xmask"(#loc7)) +#loc113 = loc("r0_base"(#loc8)) +#loc114 = loc("r0_base"(#loc9)) +#loc115 = loc("x0"(#loc10)) +#loc116 = loc("x1"(#loc11)) +#loc117 = loc("x1"(#loc12)) +#loc118 = loc("x2"(#loc13)) +#loc119 = loc("_tmp36"(#loc14)) +#loc120 = loc("_tmp36"(#loc15)) +#loc121 = loc("r0_index"(#loc16)) +#loc122 = loc("r0_mask"(#loc17)) +#loc123 = loc("r0_3"(#loc18)) +#loc124 = loc("r0_4"(#loc19)) +#loc125 = loc("tmp0"(#loc20)) +#loc126 = loc("tmp0"(#loc21)) +#loc127 = loc("tmp2"(#loc22)) +#loc128 = loc("tmp3"(#loc23)) +#loc129 = loc("tmp3"(#loc24)) +#loc130 = loc("tmp4"(#loc25)) +#loc131 = loc("tmp4"(#loc26)) +#loc132 = loc("tmp5"(#loc27)) +#loc133 = loc("tmp6"(#loc28)) +#loc134 = loc("tmp6"(#loc29)) +#loc135 = loc("tmp6"(#loc30)) +#loc136 = loc("tmp6"(#loc31)) +#loc137 = loc("tmp6"(#loc32)) +#loc138 = loc("tmp7"(#loc33)) +#loc139 = loc("tmp8"(#loc34)) +#loc140 = loc("tmp9"(#loc35)) +#loc141 = loc("tmp10"(#loc36)) +#loc142 = loc("tmp11"(#loc37)) +#loc143 = loc("tmp12"(#loc38)) +#loc144 = loc("tmp13"(#loc39)) +#loc145 = loc("tmp14"(#loc40)) +#loc146 = loc("tmp15"(#loc41)) +#loc147 = loc("tmp15"(#loc42)) +#loc148 = loc("tmp15"(#loc43)) +#loc149 = loc("tmp16"(#loc44)) +#loc150 = loc("tmp17"(#loc45)) +#loc151 = loc("tmp18"(#loc46)) +#loc152 = loc("tmp18"(#loc47)) +#loc153 = loc("tmp18"(#loc48)) +#loc154 = loc("tmp18"(#loc49)) +#loc155 = loc("tmp18"(#loc50)) +#loc156 = loc("tmp18"(#loc51)) +#loc157 = loc("tmp19"(#loc52)) +#loc158 = loc("tmp20"(#loc53)) +#loc159 = loc("tmp21"(#loc54)) +#loc160 = loc("tmp22"(#loc55)) +#loc161 = loc("tmp23"(#loc56)) +#loc162 = loc("tmp24"(#loc57)) +#loc163 = loc("tmp25"(#loc58)) +#loc164 = loc("tmp26"(#loc59)) +#loc165 = loc("tmp27"(#loc60)) +#loc166 = loc("tmp28"(#loc61)) +#loc167 = loc("tmp29"(#loc62)) +#loc168 = loc("tmp30"(#loc63)) +#loc169 = loc("tmp31"(#loc64)) +#loc170 = loc("tmp32"(#loc65)) +#loc171 = loc("tmp33"(#loc66)) +#loc172 = loc("tmp34"(#loc67)) +#loc173 = loc("tmp37"(#loc68)) +#loc174 = loc("_tmp36"(#loc69)) +#loc175 = loc("_tmp36"(#loc70)) +#loc176 = loc("tmp36"(#loc72)) +#loc177 = loc("tmp36"(#loc73)) +#loc178 = loc("tmp38"(#loc74)) +#loc179 = loc("tmp39"(#loc75)) +#loc180 = loc("tmp40"(#loc76)) +#loc181 = loc("tmp41"(#loc77)) +#loc182 = loc("tmp42"(#loc78)) +#loc183 = loc("tmp43"(#loc79)) +#loc184 = loc("tmp44"(#loc80)) +#loc185 = loc("tmp45"(#loc81)) +#loc186 = loc("tmp46"(#loc82)) +#loc187 = loc("tmp47"(#loc83)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..374c279aaa66598206a4d8c8055b59076d020847 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttgir @@ -0,0 +1,243 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("out_ptr1"(#loc)) +#loc70 = loc("out_ptr2"(#loc)) +#loc71 = loc("ks0"(#loc)) +#loc72 = loc("ks1"(#loc)) +#loc73 = loc("ks2"(#loc)) +#loc74 = loc("xnumel"(#loc)) +#loc75 = loc("r0_numel"(#loc)) +#loc124 = loc("tmp36"(#loc52)) +#loc139 = loc(callsite(#loc1 at #loc124)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<1x2048xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1) + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_2 = arith.constant dense<16384> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<1x1xi64, #blocked> loc(#loc1) + %cst_4 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc77) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc78) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc78) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc79) + %x0_7 = arith.remsi %x0, %ks0 : i64 loc(#loc79) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc80) + %x1_8 = arith.remsi %x1, %c16_i64 : i64 loc(#loc81) + %x2 = arith.divsi %x0, %ks2 : i64 loc(#loc82) + %tmp0 = arith.muli %x0_7, %c128_i64 : i64 loc(#loc83) + %tmp0_9 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc134) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc85) + %tmp3 = arith.muli %x1_8, %c128_i64 : i64 loc(#loc86) + %tmp3_10 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc135) + %tmp6 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc88) + %tmp6_11 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc136) + %tmp6_12 = tt.splat %tmp6 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc90) + %tmp18 = arith.muli %x1_8, %c-128_i64 : i64 loc(#loc91) + %tmp18_13 = tt.splat %tmp18 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc137) + %_tmp36 = scf.for %_tmp36_15 = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%arg9 = %cst_5) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp36_15 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc94) + %r0_index_16 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32, #blocked> loc(#loc94) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc95) + %r0_3 = arith.remsi %r0_index_16, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc96) + %r0_4 = arith.divsi %r0_index_16, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc97) + %tmp0_17 = arith.extsi %r0_3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc84) + %tmp0_18 = arith.addi %tmp0_17, %tmp0_9 : tensor<1x2048xi64, #blocked> loc(#loc84) + %tmp2_19 = arith.cmpi slt, %tmp0_18, %tmp2 : tensor<1x2048xi64, #blocked> loc(#loc85) + %tmp3_20 = arith.extsi %r0_4 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc87) + %tmp3_21 = arith.addi %tmp3_20, %tmp3_10 : tensor<1x2048xi64, #blocked> loc(#loc87) + %tmp5 = arith.cmpi sge, %tmp3_21, %tmp0_18 : tensor<1x2048xi64, #blocked> loc(#loc98) + %tmp6_22 = arith.andi %r0_mask, %tmp2_19 : tensor<1x2048xi1, #blocked> loc(#loc99) + %tmp6_23 = arith.andi %tmp6_22, %tmp6_11 : tensor<1x2048xi1, #blocked> loc(#loc89) + %tmp6_24 = tt.load %tmp6_12, %tmp6_23, %cst_5 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc90) + %tmp7 = arith.cmpi slt, %tmp0_18, %tmp6_24 : tensor<1x2048xi64, #blocked> loc(#loc100) + %tmp8 = arith.cmpi slt, %tmp3_21, %tmp6_24 : tensor<1x2048xi64, #blocked> loc(#loc101) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x2048xi1, #blocked> loc(#loc102) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x2048xi1, #blocked> loc(#loc103) + %tmp14 = arith.cmpi sge, %tmp0_18, %cst : tensor<1x2048xi64, #blocked> loc(#loc104) + %tmp15 = arith.remsi %tmp0_18, %cst : tensor<1x2048xi64, #blocked> loc(#loc105) + %tmp16 = arith.cmpi slt, %tmp15, %tmp6_24 : tensor<1x2048xi64, #blocked> loc(#loc106) + %tmp17 = arith.andi %tmp14, %tmp16 : tensor<1x2048xi1, #blocked> loc(#loc107) + %tmp18_25 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc108) + %tmp18_26 = arith.extsi %tmp18_25 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc92) + %tmp18_27 = arith.addi %tmp18_26, %tmp18_13 : tensor<1x2048xi64, #blocked> loc(#loc92) + %tmp18_28 = arith.addi %tmp18_27, %tmp0_9 : tensor<1x2048xi64, #blocked> loc(#loc109) + %tmp19 = arith.remsi %tmp18_28, %cst : tensor<1x2048xi64, #blocked> loc(#loc110) + %tmp21 = arith.cmpi ne, %tmp19, %cst_5 : tensor<1x2048xi64, #blocked> loc(#loc111) + %tmp22 = arith.cmpi slt, %tmp19, %cst_5 : tensor<1x2048xi64, #blocked> loc(#loc112) + %tmp25 = arith.andi %tmp21, %tmp22 : tensor<1x2048xi1, #blocked> loc(#loc113) + %tmp26 = arith.addi %tmp19, %cst : tensor<1x2048xi64, #blocked> loc(#loc114) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc115) + %tmp29 = arith.cmpi eq, %tmp27, %cst_5 : tensor<1x2048xi64, #blocked> loc(#loc116) + %tmp30 = arith.andi %tmp17, %tmp29 : tensor<1x2048xi1, #blocked> loc(#loc117) + %tmp31 = arith.ori %tmp10, %tmp30 : tensor<1x2048xi1, #blocked> loc(#loc118) + %tmp33 = arith.select %tmp2_19, %tmp31, %cst_4 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi1, #blocked> loc(#loc119) + %tmp34 = arith.extui %tmp33 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc120) + %tmp37 = arith.addi %arg9, %tmp34 : tensor<1x2048xi64, #blocked> loc(#loc121) + %_tmp36_29 = arith.andi %r0_mask, %tmp6_11 : tensor<1x2048xi1, #blocked> loc(#loc122) + %_tmp36_30 = arith.select %_tmp36_29, %tmp37, %arg9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc123) + scf.yield %_tmp36_30 : tensor<1x2048xi64, #blocked> loc(#loc50) + } loc(#loc93) + %tmp36 = "tt.reduce"(%_tmp36) <{axis = 1 : i32}> ({ + ^bb0(%tmp36_15: i64 loc(callsite(#loc1 at #loc124)), %tmp36_16: i64 loc(callsite(#loc1 at #loc124))): + %tmp36_17 = arith.addi %tmp36_15, %tmp36_16 : i64 loc(#loc142) + tt.reduce.return %tmp36_17 : i64 loc(#loc138) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc138) + %tmp36_14 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc125) + %tmp39 = arith.cmpi sgt, %tmp36_14, %cst_3 : tensor<1x1xi64, #blocked> loc(#loc126) + %tmp41 = arith.cmpi slt, %tmp36_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc127) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1, #blocked> loc(#loc128) + %tmp44 = arith.extui %tmp42 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc140) + %tmp45 = arith.cmpi eq, %tmp36_14, %cst_2 : tensor<1x1xi64, #blocked> loc(#loc131) + %tmp47 = arith.extui %tmp45 : tensor<1x1xi1, #blocked> to tensor<1x1xi32, #blocked> loc(#loc141) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc63) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc64) + tt.store %1, %tmp44, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %3 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc65) + %4 = tt.splat %3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc66) + tt.store %4, %tmp47, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc66) + tt.return loc(#loc67) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xmask"(#loc3)) +#loc78 = loc("r0_base"(#loc4)) +#loc79 = loc("x0"(#loc5)) +#loc80 = loc("x1"(#loc6)) +#loc81 = loc("x1"(#loc7)) +#loc82 = loc("x2"(#loc8)) +#loc83 = loc("tmp0"(#loc9)) +#loc84 = loc("tmp0"(#loc10)) +#loc85 = loc("tmp2"(#loc11)) +#loc86 = loc("tmp3"(#loc12)) +#loc87 = loc("tmp3"(#loc13)) +#loc88 = loc("tmp6"(#loc14)) +#loc89 = loc("tmp6"(#loc15)) +#loc90 = loc("tmp6"(#loc16)) +#loc91 = loc("tmp18"(#loc17)) +#loc92 = loc("tmp18"(#loc18)) +#loc93 = loc("_tmp36"(#loc19)) +#loc94 = loc("r0_index"(#loc20)) +#loc95 = loc("r0_mask"(#loc21)) +#loc96 = loc("r0_3"(#loc22)) +#loc97 = loc("r0_4"(#loc23)) +#loc98 = loc("tmp5"(#loc24)) +#loc99 = loc("tmp6"(#loc25)) +#loc100 = loc("tmp7"(#loc26)) +#loc101 = loc("tmp8"(#loc27)) +#loc102 = loc("tmp9"(#loc28)) +#loc103 = loc("tmp10"(#loc29)) +#loc104 = loc("tmp14"(#loc30)) +#loc105 = loc("tmp15"(#loc31)) +#loc106 = loc("tmp16"(#loc32)) +#loc107 = loc("tmp17"(#loc33)) +#loc108 = loc("tmp18"(#loc34)) +#loc109 = loc("tmp18"(#loc35)) +#loc110 = loc("tmp19"(#loc36)) +#loc111 = loc("tmp21"(#loc37)) +#loc112 = loc("tmp22"(#loc38)) +#loc113 = loc("tmp25"(#loc39)) +#loc114 = loc("tmp26"(#loc40)) +#loc115 = loc("tmp27"(#loc41)) +#loc116 = loc("tmp29"(#loc42)) +#loc117 = loc("tmp30"(#loc43)) +#loc118 = loc("tmp31"(#loc44)) +#loc119 = loc("tmp33"(#loc45)) +#loc120 = loc("tmp34"(#loc46)) +#loc121 = loc("tmp37"(#loc47)) +#loc122 = loc("_tmp36"(#loc48)) +#loc123 = loc("_tmp36"(#loc49)) +#loc125 = loc("tmp36"(#loc54)) +#loc126 = loc("tmp39"(#loc55)) +#loc127 = loc("tmp41"(#loc56)) +#loc128 = loc("tmp42"(#loc57)) +#loc129 = loc("tmp44"(#loc58)) +#loc130 = loc("tmp43"(#loc59)) +#loc131 = loc("tmp45"(#loc60)) +#loc132 = loc("tmp47"(#loc61)) +#loc133 = loc("tmp46"(#loc62)) +#loc134 = loc(fused[#loc84, #loc83]) +#loc135 = loc(fused[#loc87, #loc86]) +#loc136 = loc(fused[#loc89, #loc77]) +#loc137 = loc(fused[#loc92, #loc91]) +#loc138 = loc(callsite(#loc51 at #loc124)) +#loc140 = loc(fused[#loc129, #loc130]) +#loc141 = loc(fused[#loc132, #loc133]) +#loc142 = loc(callsite(#loc53 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c842dd863db1d0710440e89388a792a97cadf365 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PKYX5GDL7LWPNXE5PS7QBSJCIKZVZM6LYHATKAAR4MQK5MEYRLCQ/triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":18:0) +#loc1 = loc(unknown) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:27) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr1"(#loc)) +#loc72 = loc("out_ptr2"(#loc)) +#loc73 = loc("ks0"(#loc)) +#loc74 = loc("ks1"(#loc)) +#loc75 = loc("ks2"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc128 = loc("tmp36"(#loc54)) +#loc143 = loc(callsite(#loc1 at #loc128)) +module { + tt.func public @triton_red_fused__to_copy_arange_bitwise_and_bitwise_or_constant_pad_nd_eq_ge_gt_index_lt_permute_remainder_sub_sum_view_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 {tt.divisibility = 16 : i32} loc("ks2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i64 = arith.constant -128 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %x1 = arith.constant 16 : i64 loc(#loc78) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %tmp40 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc79) + %cst = arith.constant dense<0> : tensor<1x1xi64> loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1) + %cst_3 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc80) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc81) + %xmask_5 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc81) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc82) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc83) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc84) + %x0_7 = arith.remsi %x0, %ks0 : i64 loc(#loc84) + %x1_8 = arith.divsi %x0, %ks0 : i64 loc(#loc85) + %x1_9 = arith.remsi %x1_8, %x1 : i64 loc(#loc78) + %x2 = arith.divsi %x0, %ks2 : i64 loc(#loc86) + %_tmp36 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp36_11 = %cst_4) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc88) + %r0_index_12 = arith.addi %r0_index, %r0_base_6 : tensor<1x2048xi32> loc(#loc88) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_3 : tensor<1x2048xi32> loc(#loc89) + %r0_3 = arith.remsi %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc90) + %r0_4 = arith.divsi %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc91) + %tmp0 = arith.muli %x0_7, %c128_i64 : i64 loc(#loc92) + %tmp0_13 = arith.extsi %r0_3 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc93) + %tmp0_14 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc138) + %tmp0_15 = arith.addi %tmp0_13, %tmp0_14 : tensor<1x2048xi64> loc(#loc93) + %tmp2 = tt.splat %ks1 : i64 -> tensor<1x2048xi64> loc(#loc94) + %tmp2_16 = arith.cmpi slt, %tmp0_15, %tmp2 : tensor<1x2048xi64> loc(#loc94) + %tmp3 = arith.muli %x1_9, %c128_i64 : i64 loc(#loc95) + %tmp3_17 = arith.extsi %r0_4 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc96) + %tmp3_18 = tt.splat %tmp3 : i64 -> tensor<1x2048xi64> loc(#loc139) + %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : tensor<1x2048xi64> loc(#loc96) + %tmp5 = arith.cmpi sge, %tmp3_19, %tmp0_15 : tensor<1x2048xi64> loc(#loc97) + %tmp6 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i64 loc(#loc98) + %tmp6_20 = tt.splat %tmp6 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp6_21 = arith.andi %r0_mask, %tmp2_16 : tensor<1x2048xi1> loc(#loc99) + %tmp6_22 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc140) + %tmp6_23 = arith.andi %tmp6_21, %tmp6_22 : tensor<1x2048xi1> loc(#loc100) + %tmp6_24 = tt.load %tmp6_20, %tmp6_23, %cst_4 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc101) + %tmp7 = arith.cmpi slt, %tmp0_15, %tmp6_24 : tensor<1x2048xi64> loc(#loc102) + %tmp8 = arith.cmpi slt, %tmp3_19, %tmp6_24 : tensor<1x2048xi64> loc(#loc103) + %tmp9 = arith.andi %tmp7, %tmp8 : tensor<1x2048xi1> loc(#loc104) + %tmp10 = arith.andi %tmp5, %tmp9 : tensor<1x2048xi1> loc(#loc105) + %tmp14 = arith.cmpi sge, %tmp0_15, %cst_0 : tensor<1x2048xi64> loc(#loc106) + %tmp15 = arith.remsi %tmp0_15, %cst_0 : tensor<1x2048xi64> loc(#loc107) + %tmp16 = arith.cmpi slt, %tmp15, %tmp6_24 : tensor<1x2048xi64> loc(#loc108) + %tmp17 = arith.andi %tmp14, %tmp16 : tensor<1x2048xi1> loc(#loc109) + %tmp18 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc110) + %tmp18_25 = arith.muli %x1_9, %c-128_i64 : i64 loc(#loc111) + %tmp18_26 = arith.extsi %tmp18 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc112) + %tmp18_27 = tt.splat %tmp18_25 : i64 -> tensor<1x2048xi64> loc(#loc141) + %tmp18_28 = arith.addi %tmp18_26, %tmp18_27 : tensor<1x2048xi64> loc(#loc112) + %tmp18_29 = arith.addi %tmp18_28, %tmp0_14 : tensor<1x2048xi64> loc(#loc113) + %tmp19 = arith.remsi %tmp18_29, %cst_0 : tensor<1x2048xi64> loc(#loc114) + %tmp21 = arith.cmpi ne, %tmp19, %cst_4 : tensor<1x2048xi64> loc(#loc115) + %tmp22 = arith.cmpi slt, %tmp19, %cst_4 : tensor<1x2048xi64> loc(#loc116) + %tmp25 = arith.andi %tmp21, %tmp22 : tensor<1x2048xi1> loc(#loc117) + %tmp26 = arith.addi %tmp19, %cst_0 : tensor<1x2048xi64> loc(#loc118) + %tmp27 = arith.select %tmp25, %tmp26, %tmp19 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc119) + %tmp29 = arith.cmpi eq, %tmp27, %cst_4 : tensor<1x2048xi64> loc(#loc120) + %tmp30 = arith.andi %tmp17, %tmp29 : tensor<1x2048xi1> loc(#loc121) + %tmp31 = arith.ori %tmp10, %tmp30 : tensor<1x2048xi1> loc(#loc122) + %tmp33 = arith.select %tmp2_16, %tmp31, %cst_1 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc123) + %tmp34 = arith.extui %tmp33 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc124) + %tmp37 = arith.addi %_tmp36_11, %tmp34 : tensor<1x2048xi64> loc(#loc125) + %_tmp36_30 = arith.andi %r0_mask, %tmp6_22 : tensor<1x2048xi1> loc(#loc126) + %_tmp36_31 = arith.select %_tmp36_30, %tmp37, %_tmp36_11 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc127) + scf.yield %_tmp36_31 : tensor<1x2048xi64> loc(#loc52) + } loc(#loc87) + %tmp36 = "tt.reduce"(%_tmp36) <{axis = 1 : i32}> ({ + ^bb0(%tmp36_11: i64 loc(callsite(#loc1 at #loc128)), %tmp36_12: i64 loc(callsite(#loc1 at #loc128))): + %tmp36_13 = arith.addi %tmp36_11, %tmp36_12 : i64 loc(#loc146) + tt.reduce.return %tmp36_13 : i64 loc(#loc142) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc142) + %tmp36_10 = tt.expand_dims %tmp36 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc129) + %tmp39 = arith.cmpi sgt, %tmp36_10, %cst : tensor<1x1xi64> loc(#loc130) + %tmp41 = arith.cmpi slt, %tmp36_10, %tmp40 : tensor<1x1xi64> loc(#loc131) + %tmp42 = arith.andi %tmp39, %tmp41 : tensor<1x1xi1> loc(#loc132) + %tmp44 = arith.extui %tmp42 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc144) + %tmp45 = arith.cmpi eq, %tmp36_10, %tmp40 : tensor<1x1xi64> loc(#loc135) + %tmp47 = arith.extui %tmp45 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc145) + %0 = tt.addptr %out_ptr1, %xoffset : !tt.ptr, i32 loc(#loc65) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc65) + tt.store %1, %tmp44, %xmask_5 : tensor<1x1x!tt.ptr> loc(#loc66) + %2 = tt.addptr %out_ptr2, %xoffset : !tt.ptr, i32 loc(#loc67) + %3 = tt.splat %2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc67) + tt.store %3, %tmp47, %xmask_5 : tensor<1x1x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":80:35) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":22:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":25:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":28:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":29:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":37:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":38:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:26) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":39:22) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":44:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:86) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:93) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":45:76) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":48:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":49:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":55:24) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":56:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:45) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:38) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":57:51) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":58:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":60:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":61:92) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":64:24) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":65:24) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":66:39) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":68:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":69:24) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":70:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":72:38) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":73:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":75:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:36) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:50) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":76:8) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":77:30) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":79:20) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":81:20) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":82:20) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":84:21) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":83:21) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":85:21) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":87:21) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":86:21) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:25) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":88:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bi/cbigeynkmamirzra5ocdek4vfe3idnh2kr2bfscbxtiim3rq5df5.py":89:4) +#loc78 = loc("x1"(#loc2)) +#loc79 = loc("tmp40"(#loc4)) +#loc80 = loc("xoffset"(#loc5)) +#loc81 = loc("xmask"(#loc6)) +#loc82 = loc("r0_base"(#loc7)) +#loc83 = loc("r0_base"(#loc8)) +#loc84 = loc("x0"(#loc9)) +#loc85 = loc("x1"(#loc10)) +#loc86 = loc("x2"(#loc11)) +#loc87 = loc("_tmp36"(#loc3)) +#loc88 = loc("r0_index"(#loc12)) +#loc89 = loc("r0_mask"(#loc13)) +#loc90 = loc("r0_3"(#loc14)) +#loc91 = loc("r0_4"(#loc15)) +#loc92 = loc("tmp0"(#loc16)) +#loc93 = loc("tmp0"(#loc17)) +#loc94 = loc("tmp2"(#loc18)) +#loc95 = loc("tmp3"(#loc19)) +#loc96 = loc("tmp3"(#loc20)) +#loc97 = loc("tmp5"(#loc21)) +#loc98 = loc("tmp6"(#loc22)) +#loc99 = loc("tmp6"(#loc23)) +#loc100 = loc("tmp6"(#loc24)) +#loc101 = loc("tmp6"(#loc25)) +#loc102 = loc("tmp7"(#loc26)) +#loc103 = loc("tmp8"(#loc27)) +#loc104 = loc("tmp9"(#loc28)) +#loc105 = loc("tmp10"(#loc29)) +#loc106 = loc("tmp14"(#loc30)) +#loc107 = loc("tmp15"(#loc31)) +#loc108 = loc("tmp16"(#loc32)) +#loc109 = loc("tmp17"(#loc33)) +#loc110 = loc("tmp18"(#loc34)) +#loc111 = loc("tmp18"(#loc35)) +#loc112 = loc("tmp18"(#loc36)) +#loc113 = loc("tmp18"(#loc37)) +#loc114 = loc("tmp19"(#loc38)) +#loc115 = loc("tmp21"(#loc39)) +#loc116 = loc("tmp22"(#loc40)) +#loc117 = loc("tmp25"(#loc41)) +#loc118 = loc("tmp26"(#loc42)) +#loc119 = loc("tmp27"(#loc43)) +#loc120 = loc("tmp29"(#loc44)) +#loc121 = loc("tmp30"(#loc45)) +#loc122 = loc("tmp31"(#loc46)) +#loc123 = loc("tmp33"(#loc47)) +#loc124 = loc("tmp34"(#loc48)) +#loc125 = loc("tmp37"(#loc49)) +#loc126 = loc("_tmp36"(#loc50)) +#loc127 = loc("_tmp36"(#loc51)) +#loc129 = loc("tmp36"(#loc56)) +#loc130 = loc("tmp39"(#loc57)) +#loc131 = loc("tmp41"(#loc58)) +#loc132 = loc("tmp42"(#loc59)) +#loc133 = loc("tmp44"(#loc60)) +#loc134 = loc("tmp43"(#loc61)) +#loc135 = loc("tmp45"(#loc62)) +#loc136 = loc("tmp47"(#loc63)) +#loc137 = loc("tmp46"(#loc64)) +#loc138 = loc(fused[#loc93, #loc92]) +#loc139 = loc(fused[#loc96, #loc95]) +#loc140 = loc(fused[#loc100, #loc81]) +#loc141 = loc(fused[#loc112, #loc111]) +#loc142 = loc(callsite(#loc53 at #loc128)) +#loc144 = loc(fused[#loc133, #loc134]) +#loc145 = loc(fused[#loc136, #loc137]) +#loc146 = loc(callsite(#loc55 at #loc142)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/__grp__triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/__grp__triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..983c903213e135109b5794e3bce5fadf25d1293c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/__grp__triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.source", "triton_tem_fused_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttir", "triton_tem_fused_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttgir", "triton_tem_fused_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.llir", "triton_tem_fused_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ptx", "triton_tem_fused_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.cubin", "triton_tem_fused_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5224593700065936b2d5abeb8064e7f8b0e92232 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.json @@ -0,0 +1 @@ +{"hash": "7cb195ae2f72d6b5fdac64176adb964b11654c7f88e5c7e542ca2985d90da540", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 173056, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..aca55970d92d5380d9a9f6f59668e9d03705a289 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.llir @@ -0,0 +1,13554 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_zeros_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, i32 %18, i32 %19, i32 %20, i32 %21, ptr addrspace(1) readnone captures(none) %22, ptr addrspace(1) readnone captures(none) %23) local_unnamed_addr #0 !dbg !5 { + %25 = shl i32 %18, 10, !dbg !8 + %26 = shl i32 %18, 7, !dbg !9 + %27 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %28 = add i32 %18, 127, !dbg !11 + %29 = sdiv i32 %28, 128, !dbg !15 + %30 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !16 + %31 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !17 + %32 = and i32 %30, 7, !dbg !18 + %33 = mul i32 %26, %31, !dbg !19 + %34 = mul i32 %25, %32, !dbg !20 + %35 = add i32 %33, %34, !dbg !21 + %36 = sext i32 %35 to i64, !dbg !22 + %37 = mul i32 %25, %30, !dbg !23 + %38 = add i32 %33, %37, !dbg !24 + %39 = sext i32 %38 to i64, !dbg !25 + %40 = getelementptr bfloat, ptr addrspace(1) %1, i64 %36, !dbg !26 + %41 = getelementptr bfloat, ptr addrspace(1) %2, i64 %36, !dbg !27 + %42 = getelementptr bfloat, ptr addrspace(1) %7, i64 %39, !dbg !28 + %43 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !29 + %44 = lshr i32 %43, 5, !dbg !29 + %45 = and i32 %43, 240, !dbg !29 + %46 = lshr exact i32 %45, 4, !dbg !29 + %47 = or disjoint i32 %46, 16, !dbg !29 + %48 = or disjoint i32 %46, 32, !dbg !29 + %49 = or disjoint i32 %46, 48, !dbg !29 + %50 = or disjoint i32 %46, 64, !dbg !29 + %51 = or disjoint i32 %46, 80, !dbg !29 + %52 = or disjoint i32 %46, 96, !dbg !29 + %53 = or disjoint i32 %46, 112, !dbg !29 + %54 = and i32 %43, 224, !dbg !29 + %.not = icmp slt i32 %27, %29, !dbg !30 + br i1 %.not, label %4664, label %55, !dbg !31 + +55: ; preds = %24 + %56 = lshr exact i32 %54, 1, !dbg !29 + %57 = lshr i32 %43, 2, !dbg !29 + %58 = and i32 %57, 7, !dbg !29 + %59 = or disjoint i32 %56, %58, !dbg !29 + %60 = sub i32 %27, %29, !dbg !32 + %.frozen = freeze i32 %60, !dbg !33 + %61 = sdiv i32 %.frozen, 16, !dbg !33 + %62 = shl nuw nsw i32 %31, 2, !dbg !34 + %63 = add nsw i32 %61, %62, !dbg !35 + %64 = mul i32 %61, 16, !dbg !36 + %.decomposed = sub i32 %.frozen, %64, !dbg !36 + %65 = shl nuw nsw i32 %32, 4, !dbg !37 + %66 = add nsw i32 %.decomposed, %65, !dbg !38 + %reass.mul = mul i32 %66, %19, !dbg !39 + %67 = shl i32 %63, 7, !dbg !40 + %68 = shl i32 %30, 23, !dbg !41 + %69 = add i32 %67, %68, !dbg !42 + %70 = sext i32 %69 to i64, !dbg !43 + %71 = shl i32 %63, 18, !dbg !44 + %72 = add i32 %71, %68, !dbg !45 + %73 = sext i32 %72 to i64, !dbg !46 + %74 = shl nuw i32 %30, 16, !dbg !47 + %75 = shl i32 %63, 11, !dbg !47 + %76 = add i32 %75, %74, !dbg !47 + %77 = sext i32 %76 to i64, !dbg !48 + %78 = getelementptr bfloat, ptr addrspace(1) %0, i64 %70, !dbg !49 + %79 = getelementptr bfloat, ptr addrspace(1) %5, i64 %73, !dbg !50 + %80 = getelementptr bfloat, ptr addrspace(1) %6, i64 %70, !dbg !51 + %81 = getelementptr float, ptr addrspace(1) %3, i64 %77, !dbg !52 + %82 = getelementptr float, ptr addrspace(1) %4, i64 %77, !dbg !53 + %83 = shl nsw i32 %.decomposed, 7, !dbg !54 + %84 = or disjoint i32 %83, %46, !dbg !55 + %85 = or disjoint i32 %83, %47, !dbg !55 + %86 = or disjoint i32 %83, %48, !dbg !55 + %87 = or disjoint i32 %83, %49, !dbg !55 + %88 = or disjoint i32 %83, %50, !dbg !55 + %89 = or disjoint i32 %83, %51, !dbg !55 + %90 = or disjoint i32 %83, %52, !dbg !55 + %91 = or disjoint i32 %83, %53, !dbg !55 + %92 = shl nsw i32 %84, 12, !dbg !56 + %93 = shl nsw i32 %85, 12, !dbg !56 + %94 = shl nsw i32 %86, 12, !dbg !56 + %95 = shl nsw i32 %87, 12, !dbg !56 + %96 = shl nsw i32 %88, 12, !dbg !56 + %97 = shl nsw i32 %89, 12, !dbg !56 + %98 = shl nsw i32 %90, 12, !dbg !56 + %99 = shl nsw i32 %91, 12, !dbg !56 + %100 = sext i32 %92 to i64, !dbg !59 + %101 = getelementptr bfloat, ptr addrspace(1) %78, i64 %100, !dbg !59 + %102 = sext i32 %93 to i64, !dbg !59 + %103 = getelementptr bfloat, ptr addrspace(1) %78, i64 %102, !dbg !59 + %104 = sext i32 %94 to i64, !dbg !59 + %105 = getelementptr bfloat, ptr addrspace(1) %78, i64 %104, !dbg !59 + %106 = sext i32 %95 to i64, !dbg !59 + %107 = getelementptr bfloat, ptr addrspace(1) %78, i64 %106, !dbg !59 + %108 = sext i32 %96 to i64, !dbg !59 + %109 = getelementptr bfloat, ptr addrspace(1) %78, i64 %108, !dbg !59 + %110 = sext i32 %97 to i64, !dbg !59 + %111 = getelementptr bfloat, ptr addrspace(1) %78, i64 %110, !dbg !59 + %112 = sext i32 %98 to i64, !dbg !59 + %113 = getelementptr bfloat, ptr addrspace(1) %78, i64 %112, !dbg !59 + %114 = sext i32 %99 to i64, !dbg !59 + %115 = getelementptr bfloat, ptr addrspace(1) %78, i64 %114, !dbg !59 + %116 = shl nuw nsw i32 %43, 3, !dbg !60 + %117 = and i32 %116, 120, !dbg !60 + %118 = zext nneg i32 %117 to i64, !dbg !61 + %119 = getelementptr bfloat, ptr addrspace(1) %101, i64 %118, !dbg !61 + %120 = getelementptr bfloat, ptr addrspace(1) %103, i64 %118, !dbg !61 + %121 = getelementptr bfloat, ptr addrspace(1) %105, i64 %118, !dbg !61 + %122 = getelementptr bfloat, ptr addrspace(1) %107, i64 %118, !dbg !61 + %123 = getelementptr bfloat, ptr addrspace(1) %109, i64 %118, !dbg !61 + %124 = getelementptr bfloat, ptr addrspace(1) %111, i64 %118, !dbg !61 + %125 = getelementptr bfloat, ptr addrspace(1) %113, i64 %118, !dbg !61 + %126 = getelementptr bfloat, ptr addrspace(1) %115, i64 %118, !dbg !61 + %127 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %119, i1 true) #3, !dbg !62 + %128 = extractvalue { i32, i32, i32, i32 } %127, 0, !dbg !62 + %129 = extractvalue { i32, i32, i32, i32 } %127, 1, !dbg !62 + %130 = extractvalue { i32, i32, i32, i32 } %127, 2, !dbg !62 + %131 = extractvalue { i32, i32, i32, i32 } %127, 3, !dbg !62 + %132 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %120, i1 true) #3, !dbg !62 + %133 = extractvalue { i32, i32, i32, i32 } %132, 0, !dbg !62 + %134 = extractvalue { i32, i32, i32, i32 } %132, 1, !dbg !62 + %135 = extractvalue { i32, i32, i32, i32 } %132, 2, !dbg !62 + %136 = extractvalue { i32, i32, i32, i32 } %132, 3, !dbg !62 + %137 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %121, i1 true) #3, !dbg !62 + %138 = extractvalue { i32, i32, i32, i32 } %137, 0, !dbg !62 + %139 = extractvalue { i32, i32, i32, i32 } %137, 1, !dbg !62 + %140 = extractvalue { i32, i32, i32, i32 } %137, 2, !dbg !62 + %141 = extractvalue { i32, i32, i32, i32 } %137, 3, !dbg !62 + %142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %122, i1 true) #3, !dbg !62 + %143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !62 + %144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !62 + %145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !62 + %146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !62 + %147 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %123, i1 true) #3, !dbg !62 + %148 = extractvalue { i32, i32, i32, i32 } %147, 0, !dbg !62 + %149 = extractvalue { i32, i32, i32, i32 } %147, 1, !dbg !62 + %150 = extractvalue { i32, i32, i32, i32 } %147, 2, !dbg !62 + %151 = extractvalue { i32, i32, i32, i32 } %147, 3, !dbg !62 + %152 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %124, i1 true) #3, !dbg !62 + %153 = extractvalue { i32, i32, i32, i32 } %152, 0, !dbg !62 + %154 = extractvalue { i32, i32, i32, i32 } %152, 1, !dbg !62 + %155 = extractvalue { i32, i32, i32, i32 } %152, 2, !dbg !62 + %156 = extractvalue { i32, i32, i32, i32 } %152, 3, !dbg !62 + %157 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %125, i1 true) #3, !dbg !62 + %158 = extractvalue { i32, i32, i32, i32 } %157, 0, !dbg !62 + %159 = extractvalue { i32, i32, i32, i32 } %157, 1, !dbg !62 + %160 = extractvalue { i32, i32, i32, i32 } %157, 2, !dbg !62 + %161 = extractvalue { i32, i32, i32, i32 } %157, 3, !dbg !62 + %162 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %126, i1 true) #3, !dbg !62 + %163 = extractvalue { i32, i32, i32, i32 } %162, 0, !dbg !62 + %164 = extractvalue { i32, i32, i32, i32 } %162, 1, !dbg !62 + %165 = extractvalue { i32, i32, i32, i32 } %162, 2, !dbg !62 + %166 = extractvalue { i32, i32, i32, i32 } %162, 3, !dbg !62 + %167 = shl nuw nsw i32 %43, 4, !dbg !62 + %168 = and i32 %167, 112, !dbg !62 + %169 = shl nuw nsw i32 %45, 3, !dbg !62 + %170 = and i32 %43, 112, !dbg !62 + %171 = and i32 %43, 8, !dbg !62 + %172 = shl nuw nsw i32 %171, 11, !dbg !62 + %173 = or disjoint i32 %168, %169, !dbg !62 + %174 = xor i32 %173, %170, !dbg !62 + %175 = or disjoint i32 %174, %172, !dbg !62 + %176 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %175, !dbg !62 + %177 = insertelement <4 x i32> poison, i32 %128, i64 0, !dbg !62 + %178 = insertelement <4 x i32> %177, i32 %129, i64 1, !dbg !62 + %179 = insertelement <4 x i32> %178, i32 %130, i64 2, !dbg !62 + %180 = insertelement <4 x i32> %179, i32 %131, i64 3, !dbg !62 + store <4 x i32> %180, ptr addrspace(3) %176, align 16, !dbg !62 + %181 = or disjoint i32 %175, 2048, !dbg !62 + %182 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %181, !dbg !62 + %183 = insertelement <4 x i32> poison, i32 %133, i64 0, !dbg !62 + %184 = insertelement <4 x i32> %183, i32 %134, i64 1, !dbg !62 + %185 = insertelement <4 x i32> %184, i32 %135, i64 2, !dbg !62 + %186 = insertelement <4 x i32> %185, i32 %136, i64 3, !dbg !62 + store <4 x i32> %186, ptr addrspace(3) %182, align 16, !dbg !62 + %187 = or disjoint i32 %175, 4096, !dbg !62 + %188 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %187, !dbg !62 + %189 = insertelement <4 x i32> poison, i32 %138, i64 0, !dbg !62 + %190 = insertelement <4 x i32> %189, i32 %139, i64 1, !dbg !62 + %191 = insertelement <4 x i32> %190, i32 %140, i64 2, !dbg !62 + %192 = insertelement <4 x i32> %191, i32 %141, i64 3, !dbg !62 + store <4 x i32> %192, ptr addrspace(3) %188, align 16, !dbg !62 + %193 = or disjoint i32 %175, 6144, !dbg !62 + %194 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %193, !dbg !62 + %195 = insertelement <4 x i32> poison, i32 %143, i64 0, !dbg !62 + %196 = insertelement <4 x i32> %195, i32 %144, i64 1, !dbg !62 + %197 = insertelement <4 x i32> %196, i32 %145, i64 2, !dbg !62 + %198 = insertelement <4 x i32> %197, i32 %146, i64 3, !dbg !62 + store <4 x i32> %198, ptr addrspace(3) %194, align 16, !dbg !62 + %199 = or disjoint i32 %175, 8192, !dbg !62 + %200 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %199, !dbg !62 + %201 = insertelement <4 x i32> poison, i32 %148, i64 0, !dbg !62 + %202 = insertelement <4 x i32> %201, i32 %149, i64 1, !dbg !62 + %203 = insertelement <4 x i32> %202, i32 %150, i64 2, !dbg !62 + %204 = insertelement <4 x i32> %203, i32 %151, i64 3, !dbg !62 + store <4 x i32> %204, ptr addrspace(3) %200, align 16, !dbg !62 + %205 = or disjoint i32 %175, 10240, !dbg !62 + %206 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %205, !dbg !62 + %207 = insertelement <4 x i32> poison, i32 %153, i64 0, !dbg !62 + %208 = insertelement <4 x i32> %207, i32 %154, i64 1, !dbg !62 + %209 = insertelement <4 x i32> %208, i32 %155, i64 2, !dbg !62 + %210 = insertelement <4 x i32> %209, i32 %156, i64 3, !dbg !62 + store <4 x i32> %210, ptr addrspace(3) %206, align 16, !dbg !62 + %211 = or disjoint i32 %175, 12288, !dbg !62 + %212 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %211, !dbg !62 + %213 = insertelement <4 x i32> poison, i32 %158, i64 0, !dbg !62 + %214 = insertelement <4 x i32> %213, i32 %159, i64 1, !dbg !62 + %215 = insertelement <4 x i32> %214, i32 %160, i64 2, !dbg !62 + %216 = insertelement <4 x i32> %215, i32 %161, i64 3, !dbg !62 + store <4 x i32> %216, ptr addrspace(3) %212, align 16, !dbg !62 + %217 = or disjoint i32 %175, 14336, !dbg !62 + %218 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %217, !dbg !62 + %219 = insertelement <4 x i32> poison, i32 %163, i64 0, !dbg !62 + %220 = insertelement <4 x i32> %219, i32 %164, i64 1, !dbg !62 + %221 = insertelement <4 x i32> %220, i32 %165, i64 2, !dbg !62 + %222 = insertelement <4 x i32> %221, i32 %166, i64 3, !dbg !62 + store <4 x i32> %222, ptr addrspace(3) %218, align 16, !dbg !62 + %223 = shl nsw i32 %84, 7, !dbg !63 + %224 = shl nsw i32 %85, 7, !dbg !63 + %225 = shl nsw i32 %86, 7, !dbg !63 + %226 = shl nsw i32 %87, 7, !dbg !63 + %227 = shl nsw i32 %88, 7, !dbg !63 + %228 = shl nsw i32 %89, 7, !dbg !63 + %229 = shl nsw i32 %90, 7, !dbg !63 + %230 = shl nsw i32 %91, 7, !dbg !63 + %231 = sext i32 %223 to i64, !dbg !65 + %232 = getelementptr bfloat, ptr addrspace(1) %79, i64 %231, !dbg !65 + %233 = sext i32 %224 to i64, !dbg !65 + %234 = getelementptr bfloat, ptr addrspace(1) %79, i64 %233, !dbg !65 + %235 = sext i32 %225 to i64, !dbg !65 + %236 = getelementptr bfloat, ptr addrspace(1) %79, i64 %235, !dbg !65 + %237 = sext i32 %226 to i64, !dbg !65 + %238 = getelementptr bfloat, ptr addrspace(1) %79, i64 %237, !dbg !65 + %239 = sext i32 %227 to i64, !dbg !65 + %240 = getelementptr bfloat, ptr addrspace(1) %79, i64 %239, !dbg !65 + %241 = sext i32 %228 to i64, !dbg !65 + %242 = getelementptr bfloat, ptr addrspace(1) %79, i64 %241, !dbg !65 + %243 = sext i32 %229 to i64, !dbg !65 + %244 = getelementptr bfloat, ptr addrspace(1) %79, i64 %243, !dbg !65 + %245 = sext i32 %230 to i64, !dbg !65 + %246 = getelementptr bfloat, ptr addrspace(1) %79, i64 %245, !dbg !65 + %247 = getelementptr bfloat, ptr addrspace(1) %232, i64 %118, !dbg !66 + %248 = getelementptr bfloat, ptr addrspace(1) %234, i64 %118, !dbg !66 + %249 = getelementptr bfloat, ptr addrspace(1) %236, i64 %118, !dbg !66 + %250 = getelementptr bfloat, ptr addrspace(1) %238, i64 %118, !dbg !66 + %251 = getelementptr bfloat, ptr addrspace(1) %240, i64 %118, !dbg !66 + %252 = getelementptr bfloat, ptr addrspace(1) %242, i64 %118, !dbg !66 + %253 = getelementptr bfloat, ptr addrspace(1) %244, i64 %118, !dbg !66 + %254 = getelementptr bfloat, ptr addrspace(1) %246, i64 %118, !dbg !66 + %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %247, i1 true) #3, !dbg !67 + %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !67 + %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !67 + %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !67 + %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !67 + %260 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %248, i1 true) #3, !dbg !67 + %261 = extractvalue { i32, i32, i32, i32 } %260, 0, !dbg !67 + %262 = extractvalue { i32, i32, i32, i32 } %260, 1, !dbg !67 + %263 = extractvalue { i32, i32, i32, i32 } %260, 2, !dbg !67 + %264 = extractvalue { i32, i32, i32, i32 } %260, 3, !dbg !67 + %265 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %249, i1 true) #3, !dbg !67 + %266 = extractvalue { i32, i32, i32, i32 } %265, 0, !dbg !67 + %267 = extractvalue { i32, i32, i32, i32 } %265, 1, !dbg !67 + %268 = extractvalue { i32, i32, i32, i32 } %265, 2, !dbg !67 + %269 = extractvalue { i32, i32, i32, i32 } %265, 3, !dbg !67 + %270 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %250, i1 true) #3, !dbg !67 + %271 = extractvalue { i32, i32, i32, i32 } %270, 0, !dbg !67 + %272 = extractvalue { i32, i32, i32, i32 } %270, 1, !dbg !67 + %273 = extractvalue { i32, i32, i32, i32 } %270, 2, !dbg !67 + %274 = extractvalue { i32, i32, i32, i32 } %270, 3, !dbg !67 + %275 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %251, i1 true) #3, !dbg !67 + %276 = extractvalue { i32, i32, i32, i32 } %275, 0, !dbg !67 + %277 = extractvalue { i32, i32, i32, i32 } %275, 1, !dbg !67 + %278 = extractvalue { i32, i32, i32, i32 } %275, 2, !dbg !67 + %279 = extractvalue { i32, i32, i32, i32 } %275, 3, !dbg !67 + %280 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %252, i1 true) #3, !dbg !67 + %281 = extractvalue { i32, i32, i32, i32 } %280, 0, !dbg !67 + %282 = extractvalue { i32, i32, i32, i32 } %280, 1, !dbg !67 + %283 = extractvalue { i32, i32, i32, i32 } %280, 2, !dbg !67 + %284 = extractvalue { i32, i32, i32, i32 } %280, 3, !dbg !67 + %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %253, i1 true) #3, !dbg !67 + %286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !67 + %287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !67 + %288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !67 + %289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !67 + %290 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %254, i1 true) #3, !dbg !67 + %291 = extractvalue { i32, i32, i32, i32 } %290, 0, !dbg !67 + %292 = extractvalue { i32, i32, i32, i32 } %290, 1, !dbg !67 + %293 = extractvalue { i32, i32, i32, i32 } %290, 2, !dbg !67 + %294 = extractvalue { i32, i32, i32, i32 } %290, 3, !dbg !67 + %295 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %175, !dbg !67 + %296 = insertelement <4 x i32> poison, i32 %256, i64 0, !dbg !67 + %297 = insertelement <4 x i32> %296, i32 %257, i64 1, !dbg !67 + %298 = insertelement <4 x i32> %297, i32 %258, i64 2, !dbg !67 + %299 = insertelement <4 x i32> %298, i32 %259, i64 3, !dbg !67 + store <4 x i32> %299, ptr addrspace(3) %295, align 16, !dbg !67 + %300 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %181, !dbg !67 + %301 = insertelement <4 x i32> poison, i32 %261, i64 0, !dbg !67 + %302 = insertelement <4 x i32> %301, i32 %262, i64 1, !dbg !67 + %303 = insertelement <4 x i32> %302, i32 %263, i64 2, !dbg !67 + %304 = insertelement <4 x i32> %303, i32 %264, i64 3, !dbg !67 + store <4 x i32> %304, ptr addrspace(3) %300, align 16, !dbg !67 + %305 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %187, !dbg !67 + %306 = insertelement <4 x i32> poison, i32 %266, i64 0, !dbg !67 + %307 = insertelement <4 x i32> %306, i32 %267, i64 1, !dbg !67 + %308 = insertelement <4 x i32> %307, i32 %268, i64 2, !dbg !67 + %309 = insertelement <4 x i32> %308, i32 %269, i64 3, !dbg !67 + store <4 x i32> %309, ptr addrspace(3) %305, align 16, !dbg !67 + %310 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %193, !dbg !67 + %311 = insertelement <4 x i32> poison, i32 %271, i64 0, !dbg !67 + %312 = insertelement <4 x i32> %311, i32 %272, i64 1, !dbg !67 + %313 = insertelement <4 x i32> %312, i32 %273, i64 2, !dbg !67 + %314 = insertelement <4 x i32> %313, i32 %274, i64 3, !dbg !67 + store <4 x i32> %314, ptr addrspace(3) %310, align 16, !dbg !67 + %315 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %199, !dbg !67 + %316 = insertelement <4 x i32> poison, i32 %276, i64 0, !dbg !67 + %317 = insertelement <4 x i32> %316, i32 %277, i64 1, !dbg !67 + %318 = insertelement <4 x i32> %317, i32 %278, i64 2, !dbg !67 + %319 = insertelement <4 x i32> %318, i32 %279, i64 3, !dbg !67 + store <4 x i32> %319, ptr addrspace(3) %315, align 16, !dbg !67 + %320 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %205, !dbg !67 + %321 = insertelement <4 x i32> poison, i32 %281, i64 0, !dbg !67 + %322 = insertelement <4 x i32> %321, i32 %282, i64 1, !dbg !67 + %323 = insertelement <4 x i32> %322, i32 %283, i64 2, !dbg !67 + %324 = insertelement <4 x i32> %323, i32 %284, i64 3, !dbg !67 + store <4 x i32> %324, ptr addrspace(3) %320, align 16, !dbg !67 + %325 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %211, !dbg !67 + %326 = insertelement <4 x i32> poison, i32 %286, i64 0, !dbg !67 + %327 = insertelement <4 x i32> %326, i32 %287, i64 1, !dbg !67 + %328 = insertelement <4 x i32> %327, i32 %288, i64 2, !dbg !67 + %329 = insertelement <4 x i32> %328, i32 %289, i64 3, !dbg !67 + store <4 x i32> %329, ptr addrspace(3) %325, align 16, !dbg !67 + %330 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 %217, !dbg !67 + %331 = insertelement <4 x i32> poison, i32 %291, i64 0, !dbg !67 + %332 = insertelement <4 x i32> %331, i32 %292, i64 1, !dbg !67 + %333 = insertelement <4 x i32> %332, i32 %293, i64 2, !dbg !67 + %334 = insertelement <4 x i32> %333, i32 %294, i64 3, !dbg !67 + store <4 x i32> %334, ptr addrspace(3) %330, align 16, !dbg !67 + %335 = sext i32 %reass.mul to i64, !dbg !68 + %336 = getelementptr i32, ptr addrspace(1) %9, i64 %335, !dbg !68 + %337 = sext i32 %66 to i64, !dbg !69 + %338 = getelementptr i32, ptr addrspace(1) %8, i64 %337, !dbg !69 + %339 = and i32 %43, 3, !dbg !70 + %340 = shl nuw nsw i32 %339, 1, !dbg !70 + %341 = or disjoint i32 %340, 1, !dbg !70 + %342 = insertelement <2 x i32> poison, i32 %340, i64 0, !dbg !70 + %343 = shufflevector <2 x i32> %342, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !70 + %344 = or disjoint <2 x i32> %343, , !dbg !70 + %345 = insertelement <4 x i32> poison, i32 %340, i64 0, !dbg !70 + %346 = shufflevector <4 x i32> %345, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !70 + %347 = or disjoint <4 x i32> %346, , !dbg !70 + %348 = insertelement <8 x i32> poison, i32 %340, i64 0, !dbg !70 + %349 = shufflevector <8 x i32> %348, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !70 + %350 = or disjoint <8 x i32> %349, , !dbg !70 + %351 = add i32 %18, 63, !dbg !71 + %352 = sdiv i32 %351, 64, !dbg !73 + %353 = tail call i32 @llvm.smax.i32(i32 %352, i32 1), !dbg !74 + %354 = zext nneg i32 %30 to i64, !dbg !75 + %355 = getelementptr i64, ptr addrspace(1) %16, i64 %354, !dbg !75 + %356 = or disjoint i32 %83, %59, !dbg !55 + %357 = or disjoint i32 %356, 8, !dbg !55 + %358 = sext i32 %356 to i64, !dbg !76 + %359 = getelementptr float, ptr addrspace(1) %82, i64 %358, !dbg !76 + %360 = sext i32 %357 to i64, !dbg !76 + %361 = getelementptr float, ptr addrspace(1) %82, i64 %360, !dbg !76 + %362 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %359, i1 true) #3, !dbg !77 + %363 = bitcast i32 %362 to float, !dbg !77 + %364 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %361, i1 true) #3, !dbg !77 + %365 = bitcast i32 %364 to float, !dbg !77 + %366 = getelementptr float, ptr addrspace(1) %81, i64 %358, !dbg !78 + %367 = getelementptr float, ptr addrspace(1) %81, i64 %360, !dbg !78 + %368 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %366, i1 true) #3, !dbg !79 + %369 = bitcast i32 %368 to float, !dbg !79 + %370 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %367, i1 true) #3, !dbg !79 + %371 = bitcast i32 %370 to float, !dbg !79 + %372 = fcmp oeq float %369, 0xFFF0000000000000, !dbg !80 + %373 = fcmp oeq float %371, 0xFFF0000000000000, !dbg !80 + %374 = select i1 %372, float 0.000000e+00, float %369, !dbg !81 + %375 = select i1 %373, float 0.000000e+00, float %371, !dbg !81 + %376 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %336) #3, !dbg !82 + %377 = shl i32 %376, 7, !dbg !83 + %378 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %338) #3, !dbg !84 + %379 = or disjoint i32 %377, %46, !dbg !85 + %380 = or disjoint i32 %377, %47, !dbg !85 + %381 = or disjoint i32 %377, %48, !dbg !85 + %382 = or disjoint i32 %377, %49, !dbg !85 + %383 = shl i32 %379, 7, !dbg !86 + %384 = shl i32 %380, 7, !dbg !86 + %385 = shl i32 %381, 7, !dbg !86 + %386 = shl i32 %382, 7, !dbg !86 + %387 = sext i32 %383 to i64, !dbg !87 + %388 = getelementptr bfloat, ptr addrspace(1) %40, i64 %387, !dbg !87 + %389 = sext i32 %384 to i64, !dbg !87 + %390 = getelementptr bfloat, ptr addrspace(1) %40, i64 %389, !dbg !87 + %391 = sext i32 %385 to i64, !dbg !87 + %392 = getelementptr bfloat, ptr addrspace(1) %40, i64 %391, !dbg !87 + %393 = sext i32 %386 to i64, !dbg !87 + %394 = getelementptr bfloat, ptr addrspace(1) %40, i64 %393, !dbg !87 + %395 = getelementptr bfloat, ptr addrspace(1) %388, i64 %118, !dbg !88 + %396 = getelementptr bfloat, ptr addrspace(1) %390, i64 %118, !dbg !88 + %397 = getelementptr bfloat, ptr addrspace(1) %392, i64 %118, !dbg !88 + %398 = getelementptr bfloat, ptr addrspace(1) %394, i64 %118, !dbg !88 + %399 = getelementptr bfloat, ptr addrspace(1) %41, i64 %387, !dbg !89 + %400 = getelementptr bfloat, ptr addrspace(1) %41, i64 %389, !dbg !89 + %401 = getelementptr bfloat, ptr addrspace(1) %41, i64 %391, !dbg !89 + %402 = getelementptr bfloat, ptr addrspace(1) %41, i64 %393, !dbg !89 + %403 = getelementptr bfloat, ptr addrspace(1) %399, i64 %118, !dbg !90 + %404 = getelementptr bfloat, ptr addrspace(1) %400, i64 %118, !dbg !90 + %405 = getelementptr bfloat, ptr addrspace(1) %401, i64 %118, !dbg !90 + %406 = getelementptr bfloat, ptr addrspace(1) %402, i64 %118, !dbg !90 + %407 = shl i32 %378, 1, !dbg !91 + %408 = tail call i32 @llvm.smin.i32(i32 %407, i32 %353), !dbg !92 + %409 = icmp sgt i32 %407, 0, !dbg !93 + %410 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %355, i1 %409) #3, !dbg !94 + %411 = insertelement <2 x i64> poison, i64 %410, i64 0, !dbg !95 + %412 = shufflevector <2 x i64> %411, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !95 + %413 = icmp slt i32 %379, %18, !dbg !96 + %414 = icmp slt i32 %380, %18, !dbg !96 + %415 = icmp slt i32 %381, %18, !dbg !96 + %416 = icmp slt i32 %382, %18, !dbg !96 + %417 = and i1 %409, %413, !dbg !93 + %418 = and i1 %409, %414, !dbg !93 + %419 = and i1 %409, %415, !dbg !93 + %420 = and i1 %409, %416, !dbg !93 + %421 = shl nuw nsw i32 %171, 10, !dbg !97 + %422 = or disjoint i32 %174, %421, !dbg !97 + %423 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %422, !dbg !97 + %424 = select i1 %417, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %423, ptr addrspace(1) %395, i32 %424) #3, !dbg !97 + %425 = or disjoint i32 %422, 2048, !dbg !97 + %426 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %425, !dbg !97 + %427 = select i1 %418, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %396, i32 %427) #3, !dbg !97 + %428 = or disjoint i32 %422, 4096, !dbg !97 + %429 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %428, !dbg !97 + %430 = select i1 %419, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %429, ptr addrspace(1) %397, i32 %430) #3, !dbg !97 + %431 = or disjoint i32 %422, 6144, !dbg !97 + %432 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %431, !dbg !97 + %433 = select i1 %420, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %398, i32 %433) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + %434 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %422, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %434, ptr addrspace(1) %403, i32 %424) #3, !dbg !97 + %435 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %425, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %435, ptr addrspace(1) %404, i32 %427) #3, !dbg !97 + %436 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %428, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %436, ptr addrspace(1) %405, i32 %430) #3, !dbg !97 + %437 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %431, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %437, ptr addrspace(1) %406, i32 %433) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + %438 = icmp sgt i32 %408, 1, !dbg !93 + %439 = getelementptr i8, ptr addrspace(1) %395, i64 16384, !dbg !98 + %440 = getelementptr i8, ptr addrspace(1) %396, i64 16384, !dbg !98 + %441 = getelementptr i8, ptr addrspace(1) %397, i64 16384, !dbg !98 + %442 = getelementptr i8, ptr addrspace(1) %398, i64 16384, !dbg !98 + %443 = getelementptr i8, ptr addrspace(1) %403, i64 16384, !dbg !99 + %444 = getelementptr i8, ptr addrspace(1) %404, i64 16384, !dbg !99 + %445 = getelementptr i8, ptr addrspace(1) %405, i64 16384, !dbg !99 + %446 = getelementptr i8, ptr addrspace(1) %406, i64 16384, !dbg !99 + %447 = or disjoint i32 %379, 64, !dbg !100 + %448 = or disjoint i32 %380, 64, !dbg !100 + %449 = or disjoint i32 %381, 64, !dbg !100 + %450 = or disjoint i32 %382, 64, !dbg !100 + %451 = icmp slt i32 %447, %18, !dbg !96 + %452 = icmp slt i32 %448, %18, !dbg !96 + %453 = icmp slt i32 %449, %18, !dbg !96 + %454 = icmp slt i32 %450, %18, !dbg !96 + %455 = and i1 %438, %451, !dbg !93 + %456 = and i1 %438, %452, !dbg !93 + %457 = and i1 %438, %453, !dbg !93 + %458 = and i1 %438, %454, !dbg !93 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !97 + %459 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %422, !dbg !97 + %460 = select i1 %455, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %459, ptr addrspace(1) %439, i32 %460) #3, !dbg !97 + %461 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %425, !dbg !97 + %462 = select i1 %456, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %440, i32 %462) #3, !dbg !97 + %463 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %428, !dbg !97 + %464 = select i1 %457, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %441, i32 %464) #3, !dbg !97 + %465 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %431, !dbg !97 + %466 = select i1 %458, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %465, ptr addrspace(1) %442, i32 %466) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + %467 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %422, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %467, ptr addrspace(1) %443, i32 %460) #3, !dbg !97 + %468 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %425, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %468, ptr addrspace(1) %444, i32 %462) #3, !dbg !97 + %469 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %428, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %469, ptr addrspace(1) %445, i32 %464) #3, !dbg !97 + %470 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %431, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %470, ptr addrspace(1) %446, i32 %466) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !101 + br i1 %409, label %.lr.ph, label %._crit_edge, !dbg !93 + +.lr.ph: ; preds = %55 + %471 = insertelement <2 x i32> poison, i32 %357, i64 0, !dbg !102 + %472 = insertelement <2 x i32> %471, i32 %356, i64 1, !dbg !102 + %473 = srem <2 x i32> %472, splat (i32 2048), !dbg !102 + %474 = sext <2 x i32> %473 to <2 x i64>, !dbg !95 + %475 = icmp sgt <2 x i64> %412, %474, !dbg !95 + %476 = insertelement <2 x i32> poison, i32 %377, i64 0, !dbg !85 + %477 = shufflevector <2 x i32> %476, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !85 + %478 = shufflevector <8 x i32> %350, <8 x i32> poison, <2 x i32> , !dbg !85 + %479 = or disjoint <2 x i32> %477, %478, !dbg !85 + %480 = shufflevector <8 x i32> %350, <8 x i32> poison, <2 x i32> , !dbg !85 + %481 = or disjoint <2 x i32> %477, %480, !dbg !85 + %482 = shufflevector <8 x i32> %350, <8 x i32> poison, <2 x i32> , !dbg !85 + %483 = or disjoint <2 x i32> %477, %482, !dbg !85 + %484 = shufflevector <8 x i32> %350, <8 x i32> poison, <2 x i32> , !dbg !85 + %485 = or disjoint <2 x i32> %477, %484, !dbg !85 + %486 = shufflevector <4 x i32> %347, <4 x i32> poison, <2 x i32> , !dbg !85 + %487 = or disjoint <2 x i32> %477, %486, !dbg !85 + %488 = shufflevector <4 x i32> %347, <4 x i32> poison, <2 x i32> , !dbg !85 + %489 = or disjoint <2 x i32> %477, %488, !dbg !85 + %490 = or disjoint <2 x i32> %477, %344, !dbg !85 + %491 = insertelement <2 x i32> %342, i32 %341, i64 1, !dbg !85 + %492 = or disjoint <2 x i32> %477, %491, !dbg !85 + %493 = add nsw i32 %408, -2 + %494 = add nsw i32 %408, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %408, i32 1), !dbg !93 + %495 = shufflevector <2 x i1> %475, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !103 + %496 = shufflevector <2 x i32> %473, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !104 + %497 = insertelement <2 x i32> poison, i32 %18, i64 0, !dbg !96 + %498 = shufflevector <2 x i32> %497, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !96 + %499 = insertelement <2 x float> poison, float %365, i64 0, !dbg !105 + %500 = shufflevector <2 x float> %499, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !105 + %501 = shufflevector <2 x i1> %475, <2 x i1> poison, <2 x i32> , !dbg !103 + %502 = shufflevector <2 x i32> %473, <2 x i32> poison, <2 x i32> , !dbg !104 + %503 = insertelement <2 x float> poison, float %363, i64 0, !dbg !105 + %504 = shufflevector <2 x float> %503, <2 x float> poison, <2 x i32> zeroinitializer, !dbg !105 + br label %505, !dbg !93 + +505: ; preds = %.lr.ph, %__nv_exp2f.exit1572 + %506 = phi i32 [ 64, %.lr.ph ], [ %2314, %__nv_exp2f.exit1572 ] + %507 = phi i32 [ -1, %.lr.ph ], [ %586, %__nv_exp2f.exit1572 ] + %508 = phi i32 [ 1, %.lr.ph ], [ %2331, %__nv_exp2f.exit1572 ] + %.pn9431591 = phi ptr addrspace(1) [ %446, %.lr.ph ], [ %2324, %__nv_exp2f.exit1572 ] + %.pn9591590 = phi ptr addrspace(1) [ %445, %.lr.ph ], [ %2323, %__nv_exp2f.exit1572 ] + %.pn9751589 = phi ptr addrspace(1) [ %444, %.lr.ph ], [ %2322, %__nv_exp2f.exit1572 ] + %.pn9911588 = phi ptr addrspace(1) [ %443, %.lr.ph ], [ %2321, %__nv_exp2f.exit1572 ] + %.pn9211587 = phi i32 [ %450, %.lr.ph ], [ %2328, %__nv_exp2f.exit1572 ] + %.pn9231586 = phi i32 [ %449, %.lr.ph ], [ %2327, %__nv_exp2f.exit1572 ] + %.pn9251585 = phi i32 [ %448, %.lr.ph ], [ %2326, %__nv_exp2f.exit1572 ] + %.pn9271584 = phi i32 [ %447, %.lr.ph ], [ %2325, %__nv_exp2f.exit1572 ] + %.pn8711583 = phi ptr addrspace(1) [ %442, %.lr.ph ], [ %2320, %__nv_exp2f.exit1572 ] + %.pn8871582 = phi ptr addrspace(1) [ %441, %.lr.ph ], [ %2319, %__nv_exp2f.exit1572 ] + %.pn9031581 = phi ptr addrspace(1) [ %440, %.lr.ph ], [ %2318, %__nv_exp2f.exit1572 ] + %.pn9191580 = phi ptr addrspace(1) [ %439, %.lr.ph ], [ %2317, %__nv_exp2f.exit1572 ] + %509 = phi float [ 0.000000e+00, %.lr.ph ], [ %2221, %__nv_exp2f.exit1572 ] + %510 = phi float [ 0.000000e+00, %.lr.ph ], [ %2222, %__nv_exp2f.exit1572 ] + %511 = phi float [ 0.000000e+00, %.lr.ph ], [ %2223, %__nv_exp2f.exit1572 ] + %512 = phi float [ 0.000000e+00, %.lr.ph ], [ %2224, %__nv_exp2f.exit1572 ] + %513 = phi float [ 0.000000e+00, %.lr.ph ], [ %2225, %__nv_exp2f.exit1572 ] + %514 = phi float [ 0.000000e+00, %.lr.ph ], [ %2226, %__nv_exp2f.exit1572 ] + %515 = phi float [ 0.000000e+00, %.lr.ph ], [ %2227, %__nv_exp2f.exit1572 ] + %516 = phi float [ 0.000000e+00, %.lr.ph ], [ %2228, %__nv_exp2f.exit1572 ] + %517 = phi float [ 0.000000e+00, %.lr.ph ], [ %2229, %__nv_exp2f.exit1572 ] + %518 = phi float [ 0.000000e+00, %.lr.ph ], [ %2230, %__nv_exp2f.exit1572 ] + %519 = phi float [ 0.000000e+00, %.lr.ph ], [ %2231, %__nv_exp2f.exit1572 ] + %520 = phi float [ 0.000000e+00, %.lr.ph ], [ %2232, %__nv_exp2f.exit1572 ] + %521 = phi float [ 0.000000e+00, %.lr.ph ], [ %2233, %__nv_exp2f.exit1572 ] + %522 = phi float [ 0.000000e+00, %.lr.ph ], [ %2234, %__nv_exp2f.exit1572 ] + %523 = phi float [ 0.000000e+00, %.lr.ph ], [ %2235, %__nv_exp2f.exit1572 ] + %524 = phi float [ 0.000000e+00, %.lr.ph ], [ %2236, %__nv_exp2f.exit1572 ] + %525 = phi float [ 0.000000e+00, %.lr.ph ], [ %2237, %__nv_exp2f.exit1572 ] + %526 = phi float [ 0.000000e+00, %.lr.ph ], [ %2238, %__nv_exp2f.exit1572 ] + %527 = phi float [ 0.000000e+00, %.lr.ph ], [ %2239, %__nv_exp2f.exit1572 ] + %528 = phi float [ 0.000000e+00, %.lr.ph ], [ %2240, %__nv_exp2f.exit1572 ] + %529 = phi float [ 0.000000e+00, %.lr.ph ], [ %2241, %__nv_exp2f.exit1572 ] + %530 = phi float [ 0.000000e+00, %.lr.ph ], [ %2242, %__nv_exp2f.exit1572 ] + %531 = phi float [ 0.000000e+00, %.lr.ph ], [ %2243, %__nv_exp2f.exit1572 ] + %532 = phi float [ 0.000000e+00, %.lr.ph ], [ %2244, %__nv_exp2f.exit1572 ] + %533 = phi float [ 0.000000e+00, %.lr.ph ], [ %2245, %__nv_exp2f.exit1572 ] + %534 = phi float [ 0.000000e+00, %.lr.ph ], [ %2246, %__nv_exp2f.exit1572 ] + %535 = phi float [ 0.000000e+00, %.lr.ph ], [ %2247, %__nv_exp2f.exit1572 ] + %536 = phi float [ 0.000000e+00, %.lr.ph ], [ %2248, %__nv_exp2f.exit1572 ] + %537 = phi float [ 0.000000e+00, %.lr.ph ], [ %2249, %__nv_exp2f.exit1572 ] + %538 = phi float [ 0.000000e+00, %.lr.ph ], [ %2250, %__nv_exp2f.exit1572 ] + %539 = phi float [ 0.000000e+00, %.lr.ph ], [ %2251, %__nv_exp2f.exit1572 ] + %540 = phi float [ 0.000000e+00, %.lr.ph ], [ %2252, %__nv_exp2f.exit1572 ] + %541 = phi float [ 0.000000e+00, %.lr.ph ], [ %2253, %__nv_exp2f.exit1572 ] + %542 = phi float [ 0.000000e+00, %.lr.ph ], [ %2254, %__nv_exp2f.exit1572 ] + %543 = phi float [ 0.000000e+00, %.lr.ph ], [ %2255, %__nv_exp2f.exit1572 ] + %544 = phi float [ 0.000000e+00, %.lr.ph ], [ %2256, %__nv_exp2f.exit1572 ] + %545 = phi float [ 0.000000e+00, %.lr.ph ], [ %2257, %__nv_exp2f.exit1572 ] + %546 = phi float [ 0.000000e+00, %.lr.ph ], [ %2258, %__nv_exp2f.exit1572 ] + %547 = phi float [ 0.000000e+00, %.lr.ph ], [ %2259, %__nv_exp2f.exit1572 ] + %548 = phi float [ 0.000000e+00, %.lr.ph ], [ %2260, %__nv_exp2f.exit1572 ] + %549 = phi float [ 0.000000e+00, %.lr.ph ], [ %2261, %__nv_exp2f.exit1572 ] + %550 = phi float [ 0.000000e+00, %.lr.ph ], [ %2262, %__nv_exp2f.exit1572 ] + %551 = phi float [ 0.000000e+00, %.lr.ph ], [ %2263, %__nv_exp2f.exit1572 ] + %552 = phi float [ 0.000000e+00, %.lr.ph ], [ %2264, %__nv_exp2f.exit1572 ] + %553 = phi float [ 0.000000e+00, %.lr.ph ], [ %2265, %__nv_exp2f.exit1572 ] + %554 = phi float [ 0.000000e+00, %.lr.ph ], [ %2266, %__nv_exp2f.exit1572 ] + %555 = phi float [ 0.000000e+00, %.lr.ph ], [ %2267, %__nv_exp2f.exit1572 ] + %556 = phi float [ 0.000000e+00, %.lr.ph ], [ %2268, %__nv_exp2f.exit1572 ] + %557 = phi float [ 0.000000e+00, %.lr.ph ], [ %2269, %__nv_exp2f.exit1572 ] + %558 = phi float [ 0.000000e+00, %.lr.ph ], [ %2270, %__nv_exp2f.exit1572 ] + %559 = phi float [ 0.000000e+00, %.lr.ph ], [ %2271, %__nv_exp2f.exit1572 ] + %560 = phi float [ 0.000000e+00, %.lr.ph ], [ %2272, %__nv_exp2f.exit1572 ] + %561 = phi float [ 0.000000e+00, %.lr.ph ], [ %2273, %__nv_exp2f.exit1572 ] + %562 = phi float [ 0.000000e+00, %.lr.ph ], [ %2274, %__nv_exp2f.exit1572 ] + %563 = phi float [ 0.000000e+00, %.lr.ph ], [ %2275, %__nv_exp2f.exit1572 ] + %564 = phi float [ 0.000000e+00, %.lr.ph ], [ %2276, %__nv_exp2f.exit1572 ] + %565 = phi float [ 0.000000e+00, %.lr.ph ], [ %2277, %__nv_exp2f.exit1572 ] + %566 = phi float [ 0.000000e+00, %.lr.ph ], [ %2278, %__nv_exp2f.exit1572 ] + %567 = phi float [ 0.000000e+00, %.lr.ph ], [ %2279, %__nv_exp2f.exit1572 ] + %568 = phi float [ 0.000000e+00, %.lr.ph ], [ %2280, %__nv_exp2f.exit1572 ] + %569 = phi float [ 0.000000e+00, %.lr.ph ], [ %2281, %__nv_exp2f.exit1572 ] + %570 = phi float [ 0.000000e+00, %.lr.ph ], [ %2282, %__nv_exp2f.exit1572 ] + %571 = phi float [ 0.000000e+00, %.lr.ph ], [ %2283, %__nv_exp2f.exit1572 ] + %572 = phi float [ 0.000000e+00, %.lr.ph ], [ %2284, %__nv_exp2f.exit1572 ] + %573 = phi i32 [ 0, %.lr.ph ], [ %2295, %__nv_exp2f.exit1572 ] + %574 = phi <2 x i32> [ %479, %.lr.ph ], [ %2294, %__nv_exp2f.exit1572 ] + %575 = phi <2 x i32> [ %481, %.lr.ph ], [ %2293, %__nv_exp2f.exit1572 ] + %576 = phi <2 x i32> [ %483, %.lr.ph ], [ %2292, %__nv_exp2f.exit1572 ] + %577 = phi <2 x i32> [ %485, %.lr.ph ], [ %2291, %__nv_exp2f.exit1572 ] + %578 = phi <2 x i32> [ %487, %.lr.ph ], [ %2290, %__nv_exp2f.exit1572 ] + %579 = phi <2 x i32> [ %489, %.lr.ph ], [ %2289, %__nv_exp2f.exit1572 ] + %580 = phi <2 x i32> [ %490, %.lr.ph ], [ %2288, %__nv_exp2f.exit1572 ] + %581 = phi <2 x i32> [ %492, %.lr.ph ], [ %2287, %__nv_exp2f.exit1572 ] + %582 = icmp slt i32 %573, %493, !dbg !93 + %583 = icmp slt i32 %573, %494, !dbg !93 + %584 = add i32 %507, 1, !dbg !93 + %585 = icmp sgt i32 %584, 2, !dbg !93 + %586 = select i1 %585, i32 0, i32 %584, !dbg !93 + %587 = icmp slt <2 x i32> %581, %498, !dbg !96 + %588 = icmp slt <2 x i32> %580, %498, !dbg !96 + %589 = icmp slt <2 x i32> %579, %498, !dbg !96 + %590 = icmp slt <2 x i32> %578, %498, !dbg !96 + %591 = icmp slt <2 x i32> %577, %498, !dbg !96 + %592 = icmp slt <2 x i32> %576, %498, !dbg !96 + %593 = icmp slt <2 x i32> %575, %498, !dbg !96 + %594 = icmp slt <2 x i32> %574, %498, !dbg !96 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !97 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !97 + %595 = shl i32 %586, 13, !dbg !97 + %596 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %595, !dbg !97 + %597 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %44, i32 0, i32 31), !dbg !101 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !101 + %598 = shl i32 %597, 11, !dbg !101 + %599 = and i32 %598, 8192, !dbg !101 + %600 = add i32 %599, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %601 = lshr exact i32 %600, 4, !dbg !101 + %602 = and i32 %601, 16383, !dbg !101 + %603 = zext nneg i32 %602 to i64, !dbg !101 + %604 = or disjoint i64 %603, 4611686293372403712, !dbg !101 + %605 = ptrtoint ptr addrspace(3) %596 to i32, !dbg !101 + %606 = lshr exact i32 %605, 4, !dbg !101 + %607 = and i32 %606, 16383, !dbg !101 + %608 = zext nneg i32 %607 to i64, !dbg !101 + %609 = or disjoint i64 %608, 4611686293338849280, !dbg !101 + %610 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %604, i64 %609) #3, !dbg !101 + %611 = or disjoint i32 %599, 32, !dbg !101 + %612 = add i32 %611, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %613 = lshr exact i32 %612, 4, !dbg !101 + %614 = and i32 %613, 16383, !dbg !101 + %615 = zext nneg i32 %614 to i64, !dbg !101 + %616 = or disjoint i64 %615, 4611686293372403712, !dbg !101 + %617 = add i32 %605, 32, !dbg !101 + %618 = lshr exact i32 %617, 4, !dbg !101 + %619 = and i32 %618, 16383, !dbg !101 + %620 = zext nneg i32 %619 to i64, !dbg !101 + %621 = or disjoint i64 %620, 4611686293338849280, !dbg !101 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 0, !dbg !101 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 1, !dbg !101 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 2, !dbg !101 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 3, !dbg !101 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 4, !dbg !101 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 5, !dbg !101 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 6, !dbg !101 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 7, !dbg !101 + %630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 8, !dbg !101 + %631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 9, !dbg !101 + %632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 10, !dbg !101 + %633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 11, !dbg !101 + %634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 12, !dbg !101 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 13, !dbg !101 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 14, !dbg !101 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 15, !dbg !101 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 16, !dbg !101 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 17, !dbg !101 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 18, !dbg !101 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 19, !dbg !101 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 20, !dbg !101 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 21, !dbg !101 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 22, !dbg !101 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 23, !dbg !101 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 24, !dbg !101 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 25, !dbg !101 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 26, !dbg !101 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 27, !dbg !101 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 28, !dbg !101 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 29, !dbg !101 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 30, !dbg !101 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %610, 31, !dbg !101 + %654 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %622, float %623, float %624, float %625, float %626, float %627, float %628, float %629, float %630, float %631, float %632, float %633, float %634, float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, i64 %616, i64 %621, i1 true) #3, !dbg !101 + %655 = or disjoint i32 %599, 64, !dbg !101 + %656 = add i32 %655, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %657 = lshr exact i32 %656, 4, !dbg !101 + %658 = and i32 %657, 16383, !dbg !101 + %659 = zext nneg i32 %658 to i64, !dbg !101 + %660 = or disjoint i64 %659, 4611686293372403712, !dbg !101 + %661 = add i32 %605, 64, !dbg !101 + %662 = lshr exact i32 %661, 4, !dbg !101 + %663 = and i32 %662, 16383, !dbg !101 + %664 = zext nneg i32 %663 to i64, !dbg !101 + %665 = or disjoint i64 %664, 4611686293338849280, !dbg !101 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 0, !dbg !101 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 1, !dbg !101 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 2, !dbg !101 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 3, !dbg !101 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 4, !dbg !101 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 5, !dbg !101 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 6, !dbg !101 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 7, !dbg !101 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 8, !dbg !101 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 9, !dbg !101 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 10, !dbg !101 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 11, !dbg !101 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 12, !dbg !101 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 13, !dbg !101 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 14, !dbg !101 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 15, !dbg !101 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 16, !dbg !101 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 17, !dbg !101 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 18, !dbg !101 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 19, !dbg !101 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 20, !dbg !101 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 21, !dbg !101 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 22, !dbg !101 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 23, !dbg !101 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 24, !dbg !101 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 25, !dbg !101 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 26, !dbg !101 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 27, !dbg !101 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 28, !dbg !101 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 29, !dbg !101 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 30, !dbg !101 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %654, 31, !dbg !101 + %698 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, i64 %660, i64 %665, i1 true) #3, !dbg !101 + %699 = or disjoint i32 %599, 96, !dbg !101 + %700 = add i32 %699, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %701 = lshr exact i32 %700, 4, !dbg !101 + %702 = and i32 %701, 16383, !dbg !101 + %703 = zext nneg i32 %702 to i64, !dbg !101 + %704 = or disjoint i64 %703, 4611686293372403712, !dbg !101 + %705 = add i32 %605, 96, !dbg !101 + %706 = lshr exact i32 %705, 4, !dbg !101 + %707 = and i32 %706, 16383, !dbg !101 + %708 = zext nneg i32 %707 to i64, !dbg !101 + %709 = or disjoint i64 %708, 4611686293338849280, !dbg !101 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 0, !dbg !101 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 1, !dbg !101 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 2, !dbg !101 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 3, !dbg !101 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 4, !dbg !101 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 5, !dbg !101 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 6, !dbg !101 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 7, !dbg !101 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 8, !dbg !101 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 9, !dbg !101 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 10, !dbg !101 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 11, !dbg !101 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 12, !dbg !101 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 13, !dbg !101 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 14, !dbg !101 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 15, !dbg !101 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 16, !dbg !101 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 17, !dbg !101 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 18, !dbg !101 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 19, !dbg !101 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 20, !dbg !101 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 21, !dbg !101 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 22, !dbg !101 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 23, !dbg !101 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 24, !dbg !101 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 25, !dbg !101 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 26, !dbg !101 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 27, !dbg !101 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 28, !dbg !101 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 29, !dbg !101 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 30, !dbg !101 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 31, !dbg !101 + %742 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, float %741, i64 %704, i64 %709, i1 true) #3, !dbg !101 + %743 = or disjoint i32 %599, 16384, !dbg !101 + %744 = add i32 %743, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %745 = lshr exact i32 %744, 4, !dbg !101 + %746 = and i32 %745, 16383, !dbg !101 + %747 = zext nneg i32 %746 to i64, !dbg !101 + %748 = or disjoint i64 %747, 4611686293372403712, !dbg !101 + %749 = add i32 %605, 8192, !dbg !101 + %750 = lshr exact i32 %749, 4, !dbg !101 + %751 = and i32 %750, 16383, !dbg !101 + %752 = zext nneg i32 %751 to i64, !dbg !101 + %753 = or disjoint i64 %752, 4611686293338849280, !dbg !101 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 0, !dbg !101 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 1, !dbg !101 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 2, !dbg !101 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 3, !dbg !101 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 4, !dbg !101 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 5, !dbg !101 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 6, !dbg !101 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 7, !dbg !101 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 8, !dbg !101 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 9, !dbg !101 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 10, !dbg !101 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 11, !dbg !101 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 12, !dbg !101 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 13, !dbg !101 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 14, !dbg !101 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 15, !dbg !101 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 16, !dbg !101 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 17, !dbg !101 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 18, !dbg !101 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 19, !dbg !101 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 20, !dbg !101 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 21, !dbg !101 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 22, !dbg !101 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 23, !dbg !101 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 24, !dbg !101 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 25, !dbg !101 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 26, !dbg !101 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 27, !dbg !101 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 28, !dbg !101 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 29, !dbg !101 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 30, !dbg !101 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %742, 31, !dbg !101 + %786 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, float %774, float %775, float %776, float %777, float %778, float %779, float %780, float %781, float %782, float %783, float %784, float %785, i64 %748, i64 %753, i1 true) #3, !dbg !101 + %787 = or disjoint i32 %599, 16416, !dbg !101 + %788 = add i32 %787, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %789 = lshr exact i32 %788, 4, !dbg !101 + %790 = and i32 %789, 16383, !dbg !101 + %791 = zext nneg i32 %790 to i64, !dbg !101 + %792 = or disjoint i64 %791, 4611686293372403712, !dbg !101 + %793 = add i32 %605, 8224, !dbg !101 + %794 = lshr exact i32 %793, 4, !dbg !101 + %795 = and i32 %794, 16383, !dbg !101 + %796 = zext nneg i32 %795 to i64, !dbg !101 + %797 = or disjoint i64 %796, 4611686293338849280, !dbg !101 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 0, !dbg !101 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 1, !dbg !101 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 2, !dbg !101 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 3, !dbg !101 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 4, !dbg !101 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 5, !dbg !101 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 6, !dbg !101 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 7, !dbg !101 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 8, !dbg !101 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 9, !dbg !101 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 10, !dbg !101 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 11, !dbg !101 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 12, !dbg !101 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 13, !dbg !101 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 14, !dbg !101 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 15, !dbg !101 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 16, !dbg !101 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 17, !dbg !101 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 18, !dbg !101 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 19, !dbg !101 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 20, !dbg !101 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 21, !dbg !101 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 22, !dbg !101 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 23, !dbg !101 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 24, !dbg !101 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 25, !dbg !101 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 26, !dbg !101 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 27, !dbg !101 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 28, !dbg !101 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 29, !dbg !101 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 30, !dbg !101 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 31, !dbg !101 + %830 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %798, float %799, float %800, float %801, float %802, float %803, float %804, float %805, float %806, float %807, float %808, float %809, float %810, float %811, float %812, float %813, float %814, float %815, float %816, float %817, float %818, float %819, float %820, float %821, float %822, float %823, float %824, float %825, float %826, float %827, float %828, float %829, i64 %792, i64 %797, i1 true) #3, !dbg !101 + %831 = or disjoint i32 %599, 16448, !dbg !101 + %832 = add i32 %831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %833 = lshr exact i32 %832, 4, !dbg !101 + %834 = and i32 %833, 16383, !dbg !101 + %835 = zext nneg i32 %834 to i64, !dbg !101 + %836 = or disjoint i64 %835, 4611686293372403712, !dbg !101 + %837 = add i32 %605, 8256, !dbg !101 + %838 = lshr exact i32 %837, 4, !dbg !101 + %839 = and i32 %838, 16383, !dbg !101 + %840 = zext nneg i32 %839 to i64, !dbg !101 + %841 = or disjoint i64 %840, 4611686293338849280, !dbg !101 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 0, !dbg !101 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 1, !dbg !101 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 2, !dbg !101 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 3, !dbg !101 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 4, !dbg !101 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 5, !dbg !101 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 6, !dbg !101 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 7, !dbg !101 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 8, !dbg !101 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 9, !dbg !101 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 10, !dbg !101 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 11, !dbg !101 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 12, !dbg !101 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 13, !dbg !101 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 14, !dbg !101 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 15, !dbg !101 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 16, !dbg !101 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 17, !dbg !101 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 18, !dbg !101 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 19, !dbg !101 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 20, !dbg !101 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 21, !dbg !101 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 22, !dbg !101 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 23, !dbg !101 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 24, !dbg !101 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 25, !dbg !101 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 26, !dbg !101 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 27, !dbg !101 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 28, !dbg !101 + %871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 29, !dbg !101 + %872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 30, !dbg !101 + %873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %830, 31, !dbg !101 + %874 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %842, float %843, float %844, float %845, float %846, float %847, float %848, float %849, float %850, float %851, float %852, float %853, float %854, float %855, float %856, float %857, float %858, float %859, float %860, float %861, float %862, float %863, float %864, float %865, float %866, float %867, float %868, float %869, float %870, float %871, float %872, float %873, i64 %836, i64 %841, i1 true) #3, !dbg !101 + %875 = or disjoint i32 %599, 16480, !dbg !101 + %876 = add i32 %875, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !101 + %877 = lshr exact i32 %876, 4, !dbg !101 + %878 = and i32 %877, 16383, !dbg !101 + %879 = zext nneg i32 %878 to i64, !dbg !101 + %880 = or disjoint i64 %879, 4611686293372403712, !dbg !101 + %881 = add i32 %605, 8288, !dbg !101 + %882 = lshr exact i32 %881, 4, !dbg !101 + %883 = and i32 %882, 16383, !dbg !101 + %884 = zext nneg i32 %883 to i64, !dbg !101 + %885 = or disjoint i64 %884, 4611686293338849280, !dbg !101 + %886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 0, !dbg !101 + %887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 1, !dbg !101 + %888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 2, !dbg !101 + %889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 3, !dbg !101 + %890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 4, !dbg !101 + %891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 5, !dbg !101 + %892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 6, !dbg !101 + %893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 7, !dbg !101 + %894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 8, !dbg !101 + %895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 9, !dbg !101 + %896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 10, !dbg !101 + %897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 11, !dbg !101 + %898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 12, !dbg !101 + %899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 13, !dbg !101 + %900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 14, !dbg !101 + %901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 15, !dbg !101 + %902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 16, !dbg !101 + %903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 17, !dbg !101 + %904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 18, !dbg !101 + %905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 19, !dbg !101 + %906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 20, !dbg !101 + %907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 21, !dbg !101 + %908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 22, !dbg !101 + %909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 23, !dbg !101 + %910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 24, !dbg !101 + %911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 25, !dbg !101 + %912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 26, !dbg !101 + %913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 27, !dbg !101 + %914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 28, !dbg !101 + %915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 29, !dbg !101 + %916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 30, !dbg !101 + %917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %874, 31, !dbg !101 + %918 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %886, float %887, float %888, float %889, float %890, float %891, float %892, float %893, float %894, float %895, float %896, float %897, float %898, float %899, float %900, float %901, float %902, float %903, float %904, float %905, float %906, float %907, float %908, float %909, float %910, float %911, float %912, float %913, float %914, float %915, float %916, float %917, i64 %880, i64 %885, i1 true) #3, !dbg !101 + %919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 0, !dbg !101 + %920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 1, !dbg !101 + %921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 2, !dbg !101 + %922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 3, !dbg !101 + %923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 4, !dbg !101 + %924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 5, !dbg !101 + %925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 6, !dbg !101 + %926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 7, !dbg !101 + %927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 8, !dbg !101 + %928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 9, !dbg !101 + %929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 10, !dbg !101 + %930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 11, !dbg !101 + %931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 12, !dbg !101 + %932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 13, !dbg !101 + %933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 14, !dbg !101 + %934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 15, !dbg !101 + %935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 16, !dbg !101 + %936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 17, !dbg !101 + %937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 18, !dbg !101 + %938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 19, !dbg !101 + %939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 20, !dbg !101 + %940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 21, !dbg !101 + %941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 22, !dbg !101 + %942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 23, !dbg !101 + %943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 24, !dbg !101 + %944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 25, !dbg !101 + %945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 26, !dbg !101 + %946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 27, !dbg !101 + %947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 28, !dbg !101 + %948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 29, !dbg !101 + %949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 30, !dbg !101 + %950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %918, 31, !dbg !101 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !101 + %951 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %919, float %920, float %921, float %922, float %923, float %924, float %925, float %926, float %927, float %928, float %929, float %930, float %931, float %932, float %933, float %934, float %935, float %936, float %937, float %938, float %939, float %940, float %941, float %942, float %943, float %944, float %945, float %946, float %947, float %948, float %949, float %950, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %596, i32 0, i32 0) #3, !dbg !101 + %952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 0, !dbg !101 + %953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 1, !dbg !101 + %954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 2, !dbg !101 + %955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 3, !dbg !101 + %956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 4, !dbg !101 + %957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 5, !dbg !101 + %958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 6, !dbg !101 + %959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 7, !dbg !101 + %960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 8, !dbg !101 + %961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 9, !dbg !101 + %962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 10, !dbg !101 + %963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 11, !dbg !101 + %964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 12, !dbg !101 + %965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 13, !dbg !101 + %966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 14, !dbg !101 + %967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 15, !dbg !101 + %968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 16, !dbg !101 + %969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 17, !dbg !101 + %970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 18, !dbg !101 + %971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 19, !dbg !101 + %972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 20, !dbg !101 + %973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 21, !dbg !101 + %974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 22, !dbg !101 + %975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 23, !dbg !101 + %976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 24, !dbg !101 + %977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 25, !dbg !101 + %978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 26, !dbg !101 + %979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 27, !dbg !101 + %980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 28, !dbg !101 + %981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 29, !dbg !101 + %982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 30, !dbg !101 + %983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %951, 31, !dbg !101 + %984 = fmul float %952, 0x3FB6A09E60000000, !dbg !106 + %985 = fmul float %953, 0x3FB6A09E60000000, !dbg !106 + %986 = fmul float %954, 0x3FB6A09E60000000, !dbg !106 + %987 = fmul float %955, 0x3FB6A09E60000000, !dbg !106 + %988 = fmul float %956, 0x3FB6A09E60000000, !dbg !106 + %989 = fmul float %957, 0x3FB6A09E60000000, !dbg !106 + %990 = fmul float %958, 0x3FB6A09E60000000, !dbg !106 + %991 = fmul float %959, 0x3FB6A09E60000000, !dbg !106 + %992 = fmul float %960, 0x3FB6A09E60000000, !dbg !106 + %993 = fmul float %961, 0x3FB6A09E60000000, !dbg !106 + %994 = fmul float %962, 0x3FB6A09E60000000, !dbg !106 + %995 = fmul float %963, 0x3FB6A09E60000000, !dbg !106 + %996 = fmul float %964, 0x3FB6A09E60000000, !dbg !106 + %997 = fmul float %965, 0x3FB6A09E60000000, !dbg !106 + %998 = fmul float %966, 0x3FB6A09E60000000, !dbg !106 + %999 = fmul float %967, 0x3FB6A09E60000000, !dbg !106 + %1000 = fmul float %968, 0x3FB6A09E60000000, !dbg !106 + %1001 = fmul float %969, 0x3FB6A09E60000000, !dbg !106 + %1002 = fmul float %970, 0x3FB6A09E60000000, !dbg !106 + %1003 = fmul float %971, 0x3FB6A09E60000000, !dbg !106 + %1004 = fmul float %972, 0x3FB6A09E60000000, !dbg !106 + %1005 = fmul float %973, 0x3FB6A09E60000000, !dbg !106 + %1006 = fmul float %974, 0x3FB6A09E60000000, !dbg !106 + %1007 = fmul float %975, 0x3FB6A09E60000000, !dbg !106 + %1008 = fmul float %976, 0x3FB6A09E60000000, !dbg !106 + %1009 = fmul float %977, 0x3FB6A09E60000000, !dbg !106 + %1010 = fmul float %978, 0x3FB6A09E60000000, !dbg !106 + %1011 = fmul float %979, 0x3FB6A09E60000000, !dbg !106 + %1012 = fmul float %980, 0x3FB6A09E60000000, !dbg !106 + %1013 = fmul float %981, 0x3FB6A09E60000000, !dbg !106 + %1014 = fmul float %982, 0x3FB6A09E60000000, !dbg !106 + %1015 = fmul float %983, 0x3FB6A09E60000000, !dbg !106 + %1016 = srem <2 x i32> %581, %498, !dbg !102 + %1017 = icmp sge <2 x i32> %496, %1016, !dbg !104 + %1018 = and <2 x i1> %495, %1017, !dbg !103 + %1019 = icmp sgt <2 x i32> %1016, splat (i32 2047), !dbg !107 + %1020 = trunc <2 x i32> %1016 to <2 x i16>, !dbg !108 + %1021 = and <2 x i16> %1020, splat (i16 2047), !dbg !108 + %1022 = zext nneg <2 x i16> %1021 to <2 x i64>, !dbg !109 + %1023 = icmp sgt <2 x i64> %412, %1022, !dbg !109 + %1024 = and <2 x i1> %1019, %1023, !dbg !110 + %1025 = sub <2 x i32> %1016, %496, !dbg !111 + %1026 = and <2 x i32> %1025, splat (i32 2047), !dbg !112 + %1027 = icmp eq <2 x i32> %1026, zeroinitializer, !dbg !113 + %1028 = and <2 x i1> %1027, %1024, !dbg !114 + %1029 = or <2 x i1> %1018, %1028, !dbg !115 + %1030 = icmp sge <2 x i32> %502, %1016, !dbg !104 + %1031 = and <2 x i1> %501, %1030, !dbg !103 + %1032 = sub <2 x i32> %1016, %502, !dbg !111 + %1033 = and <2 x i32> %1032, splat (i32 2047), !dbg !112 + %1034 = icmp eq <2 x i32> %1033, zeroinitializer, !dbg !113 + %1035 = and <2 x i1> %1034, %1024, !dbg !114 + %1036 = or <2 x i1> %1031, %1035, !dbg !115 + %1037 = select <2 x i1> %1036, <2 x i1> %587, <2 x i1> zeroinitializer, !dbg !116 + %1038 = select <2 x i1> %1029, <2 x i1> %587, <2 x i1> zeroinitializer, !dbg !116 + %1039 = srem <2 x i32> %580, %498, !dbg !102 + %1040 = icmp sge <2 x i32> %496, %1039, !dbg !104 + %1041 = and <2 x i1> %495, %1040, !dbg !103 + %1042 = icmp sgt <2 x i32> %1039, splat (i32 2047), !dbg !107 + %1043 = trunc <2 x i32> %1039 to <2 x i16>, !dbg !108 + %1044 = and <2 x i16> %1043, splat (i16 2047), !dbg !108 + %1045 = zext nneg <2 x i16> %1044 to <2 x i64>, !dbg !109 + %1046 = icmp sgt <2 x i64> %412, %1045, !dbg !109 + %1047 = and <2 x i1> %1042, %1046, !dbg !110 + %1048 = sub <2 x i32> %1039, %496, !dbg !111 + %1049 = and <2 x i32> %1048, splat (i32 2047), !dbg !112 + %1050 = icmp eq <2 x i32> %1049, zeroinitializer, !dbg !113 + %1051 = and <2 x i1> %1050, %1047, !dbg !114 + %1052 = or <2 x i1> %1041, %1051, !dbg !115 + %1053 = icmp sge <2 x i32> %502, %1039, !dbg !104 + %1054 = and <2 x i1> %501, %1053, !dbg !103 + %1055 = sub <2 x i32> %1039, %502, !dbg !111 + %1056 = and <2 x i32> %1055, splat (i32 2047), !dbg !112 + %1057 = icmp eq <2 x i32> %1056, zeroinitializer, !dbg !113 + %1058 = and <2 x i1> %1057, %1047, !dbg !114 + %1059 = or <2 x i1> %1054, %1058, !dbg !115 + %1060 = select <2 x i1> %1059, <2 x i1> %588, <2 x i1> zeroinitializer, !dbg !116 + %1061 = select <2 x i1> %1052, <2 x i1> %588, <2 x i1> zeroinitializer, !dbg !116 + %1062 = srem <2 x i32> %579, %498, !dbg !102 + %1063 = icmp sge <2 x i32> %496, %1062, !dbg !104 + %1064 = and <2 x i1> %495, %1063, !dbg !103 + %1065 = icmp sgt <2 x i32> %1062, splat (i32 2047), !dbg !107 + %1066 = trunc <2 x i32> %1062 to <2 x i16>, !dbg !108 + %1067 = and <2 x i16> %1066, splat (i16 2047), !dbg !108 + %1068 = zext nneg <2 x i16> %1067 to <2 x i64>, !dbg !109 + %1069 = icmp sgt <2 x i64> %412, %1068, !dbg !109 + %1070 = and <2 x i1> %1065, %1069, !dbg !110 + %1071 = sub <2 x i32> %1062, %496, !dbg !111 + %1072 = and <2 x i32> %1071, splat (i32 2047), !dbg !112 + %1073 = icmp eq <2 x i32> %1072, zeroinitializer, !dbg !113 + %1074 = and <2 x i1> %1073, %1070, !dbg !114 + %1075 = or <2 x i1> %1064, %1074, !dbg !115 + %1076 = icmp sge <2 x i32> %502, %1062, !dbg !104 + %1077 = and <2 x i1> %501, %1076, !dbg !103 + %1078 = sub <2 x i32> %1062, %502, !dbg !111 + %1079 = and <2 x i32> %1078, splat (i32 2047), !dbg !112 + %1080 = icmp eq <2 x i32> %1079, zeroinitializer, !dbg !113 + %1081 = and <2 x i1> %1080, %1070, !dbg !114 + %1082 = or <2 x i1> %1077, %1081, !dbg !115 + %1083 = select <2 x i1> %1082, <2 x i1> %589, <2 x i1> zeroinitializer, !dbg !116 + %1084 = select <2 x i1> %1075, <2 x i1> %589, <2 x i1> zeroinitializer, !dbg !116 + %1085 = srem <2 x i32> %578, %498, !dbg !102 + %1086 = icmp sge <2 x i32> %496, %1085, !dbg !104 + %1087 = and <2 x i1> %495, %1086, !dbg !103 + %1088 = icmp sgt <2 x i32> %1085, splat (i32 2047), !dbg !107 + %1089 = trunc <2 x i32> %1085 to <2 x i16>, !dbg !108 + %1090 = and <2 x i16> %1089, splat (i16 2047), !dbg !108 + %1091 = zext nneg <2 x i16> %1090 to <2 x i64>, !dbg !109 + %1092 = icmp sgt <2 x i64> %412, %1091, !dbg !109 + %1093 = and <2 x i1> %1088, %1092, !dbg !110 + %1094 = sub <2 x i32> %1085, %496, !dbg !111 + %1095 = and <2 x i32> %1094, splat (i32 2047), !dbg !112 + %1096 = icmp eq <2 x i32> %1095, zeroinitializer, !dbg !113 + %1097 = and <2 x i1> %1096, %1093, !dbg !114 + %1098 = or <2 x i1> %1087, %1097, !dbg !115 + %1099 = icmp sge <2 x i32> %502, %1085, !dbg !104 + %1100 = and <2 x i1> %501, %1099, !dbg !103 + %1101 = sub <2 x i32> %1085, %502, !dbg !111 + %1102 = and <2 x i32> %1101, splat (i32 2047), !dbg !112 + %1103 = icmp eq <2 x i32> %1102, zeroinitializer, !dbg !113 + %1104 = and <2 x i1> %1103, %1093, !dbg !114 + %1105 = or <2 x i1> %1100, %1104, !dbg !115 + %1106 = select <2 x i1> %1105, <2 x i1> %590, <2 x i1> zeroinitializer, !dbg !116 + %1107 = select <2 x i1> %1098, <2 x i1> %590, <2 x i1> zeroinitializer, !dbg !116 + %1108 = srem <2 x i32> %577, %498, !dbg !102 + %1109 = icmp sge <2 x i32> %496, %1108, !dbg !104 + %1110 = and <2 x i1> %495, %1109, !dbg !103 + %1111 = icmp sgt <2 x i32> %1108, splat (i32 2047), !dbg !107 + %1112 = trunc <2 x i32> %1108 to <2 x i16>, !dbg !108 + %1113 = and <2 x i16> %1112, splat (i16 2047), !dbg !108 + %1114 = zext nneg <2 x i16> %1113 to <2 x i64>, !dbg !109 + %1115 = icmp sgt <2 x i64> %412, %1114, !dbg !109 + %1116 = and <2 x i1> %1111, %1115, !dbg !110 + %1117 = sub <2 x i32> %1108, %496, !dbg !111 + %1118 = and <2 x i32> %1117, splat (i32 2047), !dbg !112 + %1119 = icmp eq <2 x i32> %1118, zeroinitializer, !dbg !113 + %1120 = and <2 x i1> %1119, %1116, !dbg !114 + %1121 = or <2 x i1> %1110, %1120, !dbg !115 + %1122 = icmp sge <2 x i32> %502, %1108, !dbg !104 + %1123 = and <2 x i1> %501, %1122, !dbg !103 + %1124 = sub <2 x i32> %1108, %502, !dbg !111 + %1125 = and <2 x i32> %1124, splat (i32 2047), !dbg !112 + %1126 = icmp eq <2 x i32> %1125, zeroinitializer, !dbg !113 + %1127 = and <2 x i1> %1126, %1116, !dbg !114 + %1128 = or <2 x i1> %1123, %1127, !dbg !115 + %1129 = select <2 x i1> %1128, <2 x i1> %591, <2 x i1> zeroinitializer, !dbg !116 + %1130 = select <2 x i1> %1121, <2 x i1> %591, <2 x i1> zeroinitializer, !dbg !116 + %1131 = srem <2 x i32> %576, %498, !dbg !102 + %1132 = icmp sge <2 x i32> %496, %1131, !dbg !104 + %1133 = and <2 x i1> %495, %1132, !dbg !103 + %1134 = icmp sgt <2 x i32> %1131, splat (i32 2047), !dbg !107 + %1135 = trunc <2 x i32> %1131 to <2 x i16>, !dbg !108 + %1136 = and <2 x i16> %1135, splat (i16 2047), !dbg !108 + %1137 = zext nneg <2 x i16> %1136 to <2 x i64>, !dbg !109 + %1138 = icmp sgt <2 x i64> %412, %1137, !dbg !109 + %1139 = and <2 x i1> %1134, %1138, !dbg !110 + %1140 = sub <2 x i32> %1131, %496, !dbg !111 + %1141 = and <2 x i32> %1140, splat (i32 2047), !dbg !112 + %1142 = icmp eq <2 x i32> %1141, zeroinitializer, !dbg !113 + %1143 = and <2 x i1> %1142, %1139, !dbg !114 + %1144 = or <2 x i1> %1133, %1143, !dbg !115 + %1145 = icmp sge <2 x i32> %502, %1131, !dbg !104 + %1146 = and <2 x i1> %501, %1145, !dbg !103 + %1147 = sub <2 x i32> %1131, %502, !dbg !111 + %1148 = and <2 x i32> %1147, splat (i32 2047), !dbg !112 + %1149 = icmp eq <2 x i32> %1148, zeroinitializer, !dbg !113 + %1150 = and <2 x i1> %1149, %1139, !dbg !114 + %1151 = or <2 x i1> %1146, %1150, !dbg !115 + %1152 = select <2 x i1> %1151, <2 x i1> %592, <2 x i1> zeroinitializer, !dbg !116 + %1153 = select <2 x i1> %1144, <2 x i1> %592, <2 x i1> zeroinitializer, !dbg !116 + %1154 = srem <2 x i32> %575, %498, !dbg !102 + %1155 = icmp sge <2 x i32> %496, %1154, !dbg !104 + %1156 = and <2 x i1> %495, %1155, !dbg !103 + %1157 = icmp sgt <2 x i32> %1154, splat (i32 2047), !dbg !107 + %1158 = trunc <2 x i32> %1154 to <2 x i16>, !dbg !108 + %1159 = and <2 x i16> %1158, splat (i16 2047), !dbg !108 + %1160 = zext nneg <2 x i16> %1159 to <2 x i64>, !dbg !109 + %1161 = icmp sgt <2 x i64> %412, %1160, !dbg !109 + %1162 = and <2 x i1> %1157, %1161, !dbg !110 + %1163 = sub <2 x i32> %1154, %496, !dbg !111 + %1164 = and <2 x i32> %1163, splat (i32 2047), !dbg !112 + %1165 = icmp eq <2 x i32> %1164, zeroinitializer, !dbg !113 + %1166 = and <2 x i1> %1165, %1162, !dbg !114 + %1167 = or <2 x i1> %1156, %1166, !dbg !115 + %1168 = icmp sge <2 x i32> %502, %1154, !dbg !104 + %1169 = and <2 x i1> %501, %1168, !dbg !103 + %1170 = sub <2 x i32> %1154, %502, !dbg !111 + %1171 = and <2 x i32> %1170, splat (i32 2047), !dbg !112 + %1172 = icmp eq <2 x i32> %1171, zeroinitializer, !dbg !113 + %1173 = and <2 x i1> %1172, %1162, !dbg !114 + %1174 = or <2 x i1> %1169, %1173, !dbg !115 + %1175 = select <2 x i1> %1174, <2 x i1> %593, <2 x i1> zeroinitializer, !dbg !116 + %1176 = select <2 x i1> %1167, <2 x i1> %593, <2 x i1> zeroinitializer, !dbg !116 + %1177 = srem <2 x i32> %574, %498, !dbg !102 + %1178 = icmp sge <2 x i32> %496, %1177, !dbg !104 + %1179 = and <2 x i1> %495, %1178, !dbg !103 + %1180 = icmp sgt <2 x i32> %1177, splat (i32 2047), !dbg !107 + %1181 = trunc <2 x i32> %1177 to <2 x i16>, !dbg !108 + %1182 = and <2 x i16> %1181, splat (i16 2047), !dbg !108 + %1183 = zext nneg <2 x i16> %1182 to <2 x i64>, !dbg !109 + %1184 = icmp sgt <2 x i64> %412, %1183, !dbg !109 + %1185 = and <2 x i1> %1180, %1184, !dbg !110 + %1186 = sub <2 x i32> %1177, %496, !dbg !111 + %1187 = and <2 x i32> %1186, splat (i32 2047), !dbg !112 + %1188 = icmp eq <2 x i32> %1187, zeroinitializer, !dbg !113 + %1189 = and <2 x i1> %1188, %1185, !dbg !114 + %1190 = or <2 x i1> %1179, %1189, !dbg !115 + %1191 = icmp sge <2 x i32> %502, %1177, !dbg !104 + %1192 = and <2 x i1> %501, %1191, !dbg !103 + %1193 = sub <2 x i32> %1177, %502, !dbg !111 + %1194 = and <2 x i32> %1193, splat (i32 2047), !dbg !112 + %1195 = icmp eq <2 x i32> %1194, zeroinitializer, !dbg !113 + %1196 = and <2 x i1> %1195, %1185, !dbg !114 + %1197 = or <2 x i1> %1192, %1196, !dbg !115 + %1198 = select <2 x i1> %1197, <2 x i1> %594, <2 x i1> zeroinitializer, !dbg !116 + %1199 = select <2 x i1> %1190, <2 x i1> %594, <2 x i1> zeroinitializer, !dbg !116 + %1200 = fmul float %984, 0x3FF7154760000000, !dbg !117 + %1201 = extractelement <2 x i1> %1037, i64 0, !dbg !116 + %1202 = select i1 %1201, float %1200, float 0xFFF0000000000000, !dbg !116 + %1203 = fmul float %985, 0x3FF7154760000000, !dbg !117 + %1204 = extractelement <2 x i1> %1037, i64 1, !dbg !116 + %1205 = select i1 %1204, float %1203, float 0xFFF0000000000000, !dbg !116 + %1206 = fmul float %986, 0x3FF7154760000000, !dbg !117 + %1207 = extractelement <2 x i1> %1038, i64 0, !dbg !116 + %1208 = select i1 %1207, float %1206, float 0xFFF0000000000000, !dbg !116 + %1209 = fmul float %987, 0x3FF7154760000000, !dbg !117 + %1210 = extractelement <2 x i1> %1038, i64 1, !dbg !116 + %1211 = select i1 %1210, float %1209, float 0xFFF0000000000000, !dbg !116 + %1212 = fmul float %988, 0x3FF7154760000000, !dbg !117 + %1213 = extractelement <2 x i1> %1060, i64 0, !dbg !116 + %1214 = select i1 %1213, float %1212, float 0xFFF0000000000000, !dbg !116 + %1215 = fmul float %989, 0x3FF7154760000000, !dbg !117 + %1216 = extractelement <2 x i1> %1060, i64 1, !dbg !116 + %1217 = select i1 %1216, float %1215, float 0xFFF0000000000000, !dbg !116 + %1218 = fmul float %990, 0x3FF7154760000000, !dbg !117 + %1219 = extractelement <2 x i1> %1061, i64 0, !dbg !116 + %1220 = select i1 %1219, float %1218, float 0xFFF0000000000000, !dbg !116 + %1221 = fmul float %991, 0x3FF7154760000000, !dbg !117 + %1222 = extractelement <2 x i1> %1061, i64 1, !dbg !116 + %1223 = select i1 %1222, float %1221, float 0xFFF0000000000000, !dbg !116 + %1224 = fmul float %992, 0x3FF7154760000000, !dbg !117 + %1225 = extractelement <2 x i1> %1083, i64 0, !dbg !116 + %1226 = select i1 %1225, float %1224, float 0xFFF0000000000000, !dbg !116 + %1227 = fmul float %993, 0x3FF7154760000000, !dbg !117 + %1228 = extractelement <2 x i1> %1083, i64 1, !dbg !116 + %1229 = select i1 %1228, float %1227, float 0xFFF0000000000000, !dbg !116 + %1230 = fmul float %994, 0x3FF7154760000000, !dbg !117 + %1231 = extractelement <2 x i1> %1084, i64 0, !dbg !116 + %1232 = select i1 %1231, float %1230, float 0xFFF0000000000000, !dbg !116 + %1233 = fmul float %995, 0x3FF7154760000000, !dbg !117 + %1234 = extractelement <2 x i1> %1084, i64 1, !dbg !116 + %1235 = select i1 %1234, float %1233, float 0xFFF0000000000000, !dbg !116 + %1236 = fmul float %996, 0x3FF7154760000000, !dbg !117 + %1237 = extractelement <2 x i1> %1106, i64 0, !dbg !116 + %1238 = select i1 %1237, float %1236, float 0xFFF0000000000000, !dbg !116 + %1239 = fmul float %997, 0x3FF7154760000000, !dbg !117 + %1240 = extractelement <2 x i1> %1106, i64 1, !dbg !116 + %1241 = select i1 %1240, float %1239, float 0xFFF0000000000000, !dbg !116 + %1242 = fmul float %998, 0x3FF7154760000000, !dbg !117 + %1243 = extractelement <2 x i1> %1107, i64 0, !dbg !116 + %1244 = select i1 %1243, float %1242, float 0xFFF0000000000000, !dbg !116 + %1245 = fmul float %999, 0x3FF7154760000000, !dbg !117 + %1246 = extractelement <2 x i1> %1107, i64 1, !dbg !116 + %1247 = select i1 %1246, float %1245, float 0xFFF0000000000000, !dbg !116 + %1248 = fmul float %1000, 0x3FF7154760000000, !dbg !117 + %1249 = extractelement <2 x i1> %1129, i64 0, !dbg !116 + %1250 = select i1 %1249, float %1248, float 0xFFF0000000000000, !dbg !116 + %1251 = fmul float %1001, 0x3FF7154760000000, !dbg !117 + %1252 = extractelement <2 x i1> %1129, i64 1, !dbg !116 + %1253 = select i1 %1252, float %1251, float 0xFFF0000000000000, !dbg !116 + %1254 = fmul float %1002, 0x3FF7154760000000, !dbg !117 + %1255 = extractelement <2 x i1> %1130, i64 0, !dbg !116 + %1256 = select i1 %1255, float %1254, float 0xFFF0000000000000, !dbg !116 + %1257 = fmul float %1003, 0x3FF7154760000000, !dbg !117 + %1258 = extractelement <2 x i1> %1130, i64 1, !dbg !116 + %1259 = select i1 %1258, float %1257, float 0xFFF0000000000000, !dbg !116 + %1260 = fmul float %1004, 0x3FF7154760000000, !dbg !117 + %1261 = extractelement <2 x i1> %1152, i64 0, !dbg !116 + %1262 = select i1 %1261, float %1260, float 0xFFF0000000000000, !dbg !116 + %1263 = fmul float %1005, 0x3FF7154760000000, !dbg !117 + %1264 = extractelement <2 x i1> %1152, i64 1, !dbg !116 + %1265 = select i1 %1264, float %1263, float 0xFFF0000000000000, !dbg !116 + %1266 = fmul float %1006, 0x3FF7154760000000, !dbg !117 + %1267 = extractelement <2 x i1> %1153, i64 0, !dbg !116 + %1268 = select i1 %1267, float %1266, float 0xFFF0000000000000, !dbg !116 + %1269 = fmul float %1007, 0x3FF7154760000000, !dbg !117 + %1270 = extractelement <2 x i1> %1153, i64 1, !dbg !116 + %1271 = select i1 %1270, float %1269, float 0xFFF0000000000000, !dbg !116 + %1272 = fmul float %1008, 0x3FF7154760000000, !dbg !117 + %1273 = extractelement <2 x i1> %1175, i64 0, !dbg !116 + %1274 = select i1 %1273, float %1272, float 0xFFF0000000000000, !dbg !116 + %1275 = fmul float %1009, 0x3FF7154760000000, !dbg !117 + %1276 = extractelement <2 x i1> %1175, i64 1, !dbg !116 + %1277 = select i1 %1276, float %1275, float 0xFFF0000000000000, !dbg !116 + %1278 = fmul float %1010, 0x3FF7154760000000, !dbg !117 + %1279 = extractelement <2 x i1> %1176, i64 0, !dbg !116 + %1280 = select i1 %1279, float %1278, float 0xFFF0000000000000, !dbg !116 + %1281 = fmul float %1011, 0x3FF7154760000000, !dbg !117 + %1282 = extractelement <2 x i1> %1176, i64 1, !dbg !116 + %1283 = select i1 %1282, float %1281, float 0xFFF0000000000000, !dbg !116 + %1284 = fmul float %1012, 0x3FF7154760000000, !dbg !117 + %1285 = extractelement <2 x i1> %1198, i64 0, !dbg !116 + %1286 = select i1 %1285, float %1284, float 0xFFF0000000000000, !dbg !116 + %1287 = fmul float %1013, 0x3FF7154760000000, !dbg !117 + %1288 = extractelement <2 x i1> %1198, i64 1, !dbg !116 + %1289 = select i1 %1288, float %1287, float 0xFFF0000000000000, !dbg !116 + %1290 = fmul float %1014, 0x3FF7154760000000, !dbg !117 + %1291 = extractelement <2 x i1> %1199, i64 0, !dbg !116 + %1292 = select i1 %1291, float %1290, float 0xFFF0000000000000, !dbg !116 + %1293 = fmul float %1015, 0x3FF7154760000000, !dbg !117 + %1294 = extractelement <2 x i1> %1199, i64 1, !dbg !116 + %1295 = select i1 %1294, float %1293, float 0xFFF0000000000000, !dbg !116 + %1296 = fsub float %1202, %374, !dbg !118 + %1297 = fsub float %1205, %374, !dbg !118 + %1298 = fsub float %1208, %375, !dbg !118 + %1299 = fsub float %1211, %375, !dbg !118 + %1300 = fsub float %1214, %374, !dbg !118 + %1301 = fsub float %1217, %374, !dbg !118 + %1302 = fsub float %1220, %375, !dbg !118 + %1303 = fsub float %1223, %375, !dbg !118 + %1304 = fsub float %1226, %374, !dbg !118 + %1305 = fsub float %1229, %374, !dbg !118 + %1306 = fsub float %1232, %375, !dbg !118 + %1307 = fsub float %1235, %375, !dbg !118 + %1308 = fsub float %1238, %374, !dbg !118 + %1309 = fsub float %1241, %374, !dbg !118 + %1310 = fsub float %1244, %375, !dbg !118 + %1311 = fsub float %1247, %375, !dbg !118 + %1312 = fsub float %1250, %374, !dbg !118 + %1313 = fsub float %1253, %374, !dbg !118 + %1314 = fsub float %1256, %375, !dbg !118 + %1315 = fsub float %1259, %375, !dbg !118 + %1316 = fsub float %1262, %374, !dbg !118 + %1317 = fsub float %1265, %374, !dbg !118 + %1318 = fsub float %1268, %375, !dbg !118 + %1319 = fsub float %1271, %375, !dbg !118 + %1320 = fsub float %1274, %374, !dbg !118 + %1321 = fsub float %1277, %374, !dbg !118 + %1322 = fsub float %1280, %375, !dbg !118 + %1323 = fsub float %1283, %375, !dbg !118 + %1324 = fsub float %1286, %374, !dbg !118 + %1325 = fsub float %1289, %374, !dbg !118 + %1326 = fsub float %1292, %375, !dbg !118 + %1327 = fsub float %1295, %375, !dbg !118 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1477 = icmp eq i32 %1328, 0, !dbg !119 + br i1 %.not.i1477, label %1331, label %1329, !dbg !119 + +1329: ; preds = %505 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1296) #3, !dbg !119 + br label %__nv_exp2f.exit1479, !dbg !119 + +1331: ; preds = %505 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1296) #3, !dbg !119 + br label %__nv_exp2f.exit1479, !dbg !119 + +__nv_exp2f.exit1479: ; preds = %1329, %1331 + %.0.i1478 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !119 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1480 = icmp eq i32 %1333, 0, !dbg !119 + br i1 %.not.i1480, label %1336, label %1334, !dbg !119 + +1334: ; preds = %__nv_exp2f.exit1479 + %1335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1297) #3, !dbg !119 + br label %__nv_exp2f.exit1482, !dbg !119 + +1336: ; preds = %__nv_exp2f.exit1479 + %1337 = tail call float @llvm.nvvm.ex2.approx.f(float %1297) #3, !dbg !119 + br label %__nv_exp2f.exit1482, !dbg !119 + +__nv_exp2f.exit1482: ; preds = %1334, %1336 + %.0.i1481 = phi float [ %1335, %1334 ], [ %1337, %1336 ], !dbg !119 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1483 = icmp eq i32 %1338, 0, !dbg !119 + br i1 %.not.i1483, label %1341, label %1339, !dbg !119 + +1339: ; preds = %__nv_exp2f.exit1482 + %1340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1298) #3, !dbg !119 + br label %__nv_exp2f.exit1485, !dbg !119 + +1341: ; preds = %__nv_exp2f.exit1482 + %1342 = tail call float @llvm.nvvm.ex2.approx.f(float %1298) #3, !dbg !119 + br label %__nv_exp2f.exit1485, !dbg !119 + +__nv_exp2f.exit1485: ; preds = %1339, %1341 + %.0.i1484 = phi float [ %1340, %1339 ], [ %1342, %1341 ], !dbg !119 + %1343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1486 = icmp eq i32 %1343, 0, !dbg !119 + br i1 %.not.i1486, label %1346, label %1344, !dbg !119 + +1344: ; preds = %__nv_exp2f.exit1485 + %1345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1299) #3, !dbg !119 + br label %__nv_exp2f.exit1488, !dbg !119 + +1346: ; preds = %__nv_exp2f.exit1485 + %1347 = tail call float @llvm.nvvm.ex2.approx.f(float %1299) #3, !dbg !119 + br label %__nv_exp2f.exit1488, !dbg !119 + +__nv_exp2f.exit1488: ; preds = %1344, %1346 + %.0.i1487 = phi float [ %1345, %1344 ], [ %1347, %1346 ], !dbg !119 + %1348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1489 = icmp eq i32 %1348, 0, !dbg !119 + br i1 %.not.i1489, label %1351, label %1349, !dbg !119 + +1349: ; preds = %__nv_exp2f.exit1488 + %1350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1300) #3, !dbg !119 + br label %__nv_exp2f.exit1491, !dbg !119 + +1351: ; preds = %__nv_exp2f.exit1488 + %1352 = tail call float @llvm.nvvm.ex2.approx.f(float %1300) #3, !dbg !119 + br label %__nv_exp2f.exit1491, !dbg !119 + +__nv_exp2f.exit1491: ; preds = %1349, %1351 + %.0.i1490 = phi float [ %1350, %1349 ], [ %1352, %1351 ], !dbg !119 + %1353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1492 = icmp eq i32 %1353, 0, !dbg !119 + br i1 %.not.i1492, label %1356, label %1354, !dbg !119 + +1354: ; preds = %__nv_exp2f.exit1491 + %1355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1301) #3, !dbg !119 + br label %__nv_exp2f.exit1494, !dbg !119 + +1356: ; preds = %__nv_exp2f.exit1491 + %1357 = tail call float @llvm.nvvm.ex2.approx.f(float %1301) #3, !dbg !119 + br label %__nv_exp2f.exit1494, !dbg !119 + +__nv_exp2f.exit1494: ; preds = %1354, %1356 + %.0.i1493 = phi float [ %1355, %1354 ], [ %1357, %1356 ], !dbg !119 + %1358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1495 = icmp eq i32 %1358, 0, !dbg !119 + br i1 %.not.i1495, label %1361, label %1359, !dbg !119 + +1359: ; preds = %__nv_exp2f.exit1494 + %1360 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1302) #3, !dbg !119 + br label %__nv_exp2f.exit1497, !dbg !119 + +1361: ; preds = %__nv_exp2f.exit1494 + %1362 = tail call float @llvm.nvvm.ex2.approx.f(float %1302) #3, !dbg !119 + br label %__nv_exp2f.exit1497, !dbg !119 + +__nv_exp2f.exit1497: ; preds = %1359, %1361 + %.0.i1496 = phi float [ %1360, %1359 ], [ %1362, %1361 ], !dbg !119 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1498 = icmp eq i32 %1363, 0, !dbg !119 + br i1 %.not.i1498, label %1366, label %1364, !dbg !119 + +1364: ; preds = %__nv_exp2f.exit1497 + %1365 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1303) #3, !dbg !119 + br label %__nv_exp2f.exit1500, !dbg !119 + +1366: ; preds = %__nv_exp2f.exit1497 + %1367 = tail call float @llvm.nvvm.ex2.approx.f(float %1303) #3, !dbg !119 + br label %__nv_exp2f.exit1500, !dbg !119 + +__nv_exp2f.exit1500: ; preds = %1364, %1366 + %.0.i1499 = phi float [ %1365, %1364 ], [ %1367, %1366 ], !dbg !119 + %1368 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1501 = icmp eq i32 %1368, 0, !dbg !119 + br i1 %.not.i1501, label %1371, label %1369, !dbg !119 + +1369: ; preds = %__nv_exp2f.exit1500 + %1370 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1304) #3, !dbg !119 + br label %__nv_exp2f.exit1503, !dbg !119 + +1371: ; preds = %__nv_exp2f.exit1500 + %1372 = tail call float @llvm.nvvm.ex2.approx.f(float %1304) #3, !dbg !119 + br label %__nv_exp2f.exit1503, !dbg !119 + +__nv_exp2f.exit1503: ; preds = %1369, %1371 + %.0.i1502 = phi float [ %1370, %1369 ], [ %1372, %1371 ], !dbg !119 + %1373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1504 = icmp eq i32 %1373, 0, !dbg !119 + br i1 %.not.i1504, label %1376, label %1374, !dbg !119 + +1374: ; preds = %__nv_exp2f.exit1503 + %1375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1305) #3, !dbg !119 + br label %__nv_exp2f.exit1506, !dbg !119 + +1376: ; preds = %__nv_exp2f.exit1503 + %1377 = tail call float @llvm.nvvm.ex2.approx.f(float %1305) #3, !dbg !119 + br label %__nv_exp2f.exit1506, !dbg !119 + +__nv_exp2f.exit1506: ; preds = %1374, %1376 + %.0.i1505 = phi float [ %1375, %1374 ], [ %1377, %1376 ], !dbg !119 + %1378 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1507 = icmp eq i32 %1378, 0, !dbg !119 + br i1 %.not.i1507, label %1381, label %1379, !dbg !119 + +1379: ; preds = %__nv_exp2f.exit1506 + %1380 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1306) #3, !dbg !119 + br label %__nv_exp2f.exit1509, !dbg !119 + +1381: ; preds = %__nv_exp2f.exit1506 + %1382 = tail call float @llvm.nvvm.ex2.approx.f(float %1306) #3, !dbg !119 + br label %__nv_exp2f.exit1509, !dbg !119 + +__nv_exp2f.exit1509: ; preds = %1379, %1381 + %.0.i1508 = phi float [ %1380, %1379 ], [ %1382, %1381 ], !dbg !119 + %1383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1510 = icmp eq i32 %1383, 0, !dbg !119 + br i1 %.not.i1510, label %1386, label %1384, !dbg !119 + +1384: ; preds = %__nv_exp2f.exit1509 + %1385 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1307) #3, !dbg !119 + br label %__nv_exp2f.exit1512, !dbg !119 + +1386: ; preds = %__nv_exp2f.exit1509 + %1387 = tail call float @llvm.nvvm.ex2.approx.f(float %1307) #3, !dbg !119 + br label %__nv_exp2f.exit1512, !dbg !119 + +__nv_exp2f.exit1512: ; preds = %1384, %1386 + %.0.i1511 = phi float [ %1385, %1384 ], [ %1387, %1386 ], !dbg !119 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1513 = icmp eq i32 %1388, 0, !dbg !119 + br i1 %.not.i1513, label %1391, label %1389, !dbg !119 + +1389: ; preds = %__nv_exp2f.exit1512 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1308) #3, !dbg !119 + br label %__nv_exp2f.exit1515, !dbg !119 + +1391: ; preds = %__nv_exp2f.exit1512 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1308) #3, !dbg !119 + br label %__nv_exp2f.exit1515, !dbg !119 + +__nv_exp2f.exit1515: ; preds = %1389, %1391 + %.0.i1514 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !119 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1516 = icmp eq i32 %1393, 0, !dbg !119 + br i1 %.not.i1516, label %1396, label %1394, !dbg !119 + +1394: ; preds = %__nv_exp2f.exit1515 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1309) #3, !dbg !119 + br label %__nv_exp2f.exit1518, !dbg !119 + +1396: ; preds = %__nv_exp2f.exit1515 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1309) #3, !dbg !119 + br label %__nv_exp2f.exit1518, !dbg !119 + +__nv_exp2f.exit1518: ; preds = %1394, %1396 + %.0.i1517 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !119 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1519 = icmp eq i32 %1398, 0, !dbg !119 + br i1 %.not.i1519, label %1401, label %1399, !dbg !119 + +1399: ; preds = %__nv_exp2f.exit1518 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1310) #3, !dbg !119 + br label %__nv_exp2f.exit1521, !dbg !119 + +1401: ; preds = %__nv_exp2f.exit1518 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1310) #3, !dbg !119 + br label %__nv_exp2f.exit1521, !dbg !119 + +__nv_exp2f.exit1521: ; preds = %1399, %1401 + %.0.i1520 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !119 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1522 = icmp eq i32 %1403, 0, !dbg !119 + br i1 %.not.i1522, label %1406, label %1404, !dbg !119 + +1404: ; preds = %__nv_exp2f.exit1521 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1311) #3, !dbg !119 + br label %__nv_exp2f.exit1524, !dbg !119 + +1406: ; preds = %__nv_exp2f.exit1521 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1311) #3, !dbg !119 + br label %__nv_exp2f.exit1524, !dbg !119 + +__nv_exp2f.exit1524: ; preds = %1404, %1406 + %.0.i1523 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !119 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1525 = icmp eq i32 %1408, 0, !dbg !119 + br i1 %.not.i1525, label %1411, label %1409, !dbg !119 + +1409: ; preds = %__nv_exp2f.exit1524 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1312) #3, !dbg !119 + br label %__nv_exp2f.exit1527, !dbg !119 + +1411: ; preds = %__nv_exp2f.exit1524 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1312) #3, !dbg !119 + br label %__nv_exp2f.exit1527, !dbg !119 + +__nv_exp2f.exit1527: ; preds = %1409, %1411 + %.0.i1526 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !119 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1528 = icmp eq i32 %1413, 0, !dbg !119 + br i1 %.not.i1528, label %1416, label %1414, !dbg !119 + +1414: ; preds = %__nv_exp2f.exit1527 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1313) #3, !dbg !119 + br label %__nv_exp2f.exit1530, !dbg !119 + +1416: ; preds = %__nv_exp2f.exit1527 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1313) #3, !dbg !119 + br label %__nv_exp2f.exit1530, !dbg !119 + +__nv_exp2f.exit1530: ; preds = %1414, %1416 + %.0.i1529 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !119 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1531 = icmp eq i32 %1418, 0, !dbg !119 + br i1 %.not.i1531, label %1421, label %1419, !dbg !119 + +1419: ; preds = %__nv_exp2f.exit1530 + %1420 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1314) #3, !dbg !119 + br label %__nv_exp2f.exit1533, !dbg !119 + +1421: ; preds = %__nv_exp2f.exit1530 + %1422 = tail call float @llvm.nvvm.ex2.approx.f(float %1314) #3, !dbg !119 + br label %__nv_exp2f.exit1533, !dbg !119 + +__nv_exp2f.exit1533: ; preds = %1419, %1421 + %.0.i1532 = phi float [ %1420, %1419 ], [ %1422, %1421 ], !dbg !119 + %1423 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1534 = icmp eq i32 %1423, 0, !dbg !119 + br i1 %.not.i1534, label %1426, label %1424, !dbg !119 + +1424: ; preds = %__nv_exp2f.exit1533 + %1425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1315) #3, !dbg !119 + br label %__nv_exp2f.exit1536, !dbg !119 + +1426: ; preds = %__nv_exp2f.exit1533 + %1427 = tail call float @llvm.nvvm.ex2.approx.f(float %1315) #3, !dbg !119 + br label %__nv_exp2f.exit1536, !dbg !119 + +__nv_exp2f.exit1536: ; preds = %1424, %1426 + %.0.i1535 = phi float [ %1425, %1424 ], [ %1427, %1426 ], !dbg !119 + %1428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1537 = icmp eq i32 %1428, 0, !dbg !119 + br i1 %.not.i1537, label %1431, label %1429, !dbg !119 + +1429: ; preds = %__nv_exp2f.exit1536 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1316) #3, !dbg !119 + br label %__nv_exp2f.exit1539, !dbg !119 + +1431: ; preds = %__nv_exp2f.exit1536 + %1432 = tail call float @llvm.nvvm.ex2.approx.f(float %1316) #3, !dbg !119 + br label %__nv_exp2f.exit1539, !dbg !119 + +__nv_exp2f.exit1539: ; preds = %1429, %1431 + %.0.i1538 = phi float [ %1430, %1429 ], [ %1432, %1431 ], !dbg !119 + %1433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1540 = icmp eq i32 %1433, 0, !dbg !119 + br i1 %.not.i1540, label %1436, label %1434, !dbg !119 + +1434: ; preds = %__nv_exp2f.exit1539 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1317) #3, !dbg !119 + br label %__nv_exp2f.exit1542, !dbg !119 + +1436: ; preds = %__nv_exp2f.exit1539 + %1437 = tail call float @llvm.nvvm.ex2.approx.f(float %1317) #3, !dbg !119 + br label %__nv_exp2f.exit1542, !dbg !119 + +__nv_exp2f.exit1542: ; preds = %1434, %1436 + %.0.i1541 = phi float [ %1435, %1434 ], [ %1437, %1436 ], !dbg !119 + %1438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1543 = icmp eq i32 %1438, 0, !dbg !119 + br i1 %.not.i1543, label %1441, label %1439, !dbg !119 + +1439: ; preds = %__nv_exp2f.exit1542 + %1440 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1318) #3, !dbg !119 + br label %__nv_exp2f.exit1545, !dbg !119 + +1441: ; preds = %__nv_exp2f.exit1542 + %1442 = tail call float @llvm.nvvm.ex2.approx.f(float %1318) #3, !dbg !119 + br label %__nv_exp2f.exit1545, !dbg !119 + +__nv_exp2f.exit1545: ; preds = %1439, %1441 + %.0.i1544 = phi float [ %1440, %1439 ], [ %1442, %1441 ], !dbg !119 + %1443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1546 = icmp eq i32 %1443, 0, !dbg !119 + br i1 %.not.i1546, label %1446, label %1444, !dbg !119 + +1444: ; preds = %__nv_exp2f.exit1545 + %1445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1319) #3, !dbg !119 + br label %__nv_exp2f.exit1548, !dbg !119 + +1446: ; preds = %__nv_exp2f.exit1545 + %1447 = tail call float @llvm.nvvm.ex2.approx.f(float %1319) #3, !dbg !119 + br label %__nv_exp2f.exit1548, !dbg !119 + +__nv_exp2f.exit1548: ; preds = %1444, %1446 + %.0.i1547 = phi float [ %1445, %1444 ], [ %1447, %1446 ], !dbg !119 + %1448 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1549 = icmp eq i32 %1448, 0, !dbg !119 + br i1 %.not.i1549, label %1451, label %1449, !dbg !119 + +1449: ; preds = %__nv_exp2f.exit1548 + %1450 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1320) #3, !dbg !119 + br label %__nv_exp2f.exit1551, !dbg !119 + +1451: ; preds = %__nv_exp2f.exit1548 + %1452 = tail call float @llvm.nvvm.ex2.approx.f(float %1320) #3, !dbg !119 + br label %__nv_exp2f.exit1551, !dbg !119 + +__nv_exp2f.exit1551: ; preds = %1449, %1451 + %.0.i1550 = phi float [ %1450, %1449 ], [ %1452, %1451 ], !dbg !119 + %1453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1552 = icmp eq i32 %1453, 0, !dbg !119 + br i1 %.not.i1552, label %1456, label %1454, !dbg !119 + +1454: ; preds = %__nv_exp2f.exit1551 + %1455 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1321) #3, !dbg !119 + br label %__nv_exp2f.exit1554, !dbg !119 + +1456: ; preds = %__nv_exp2f.exit1551 + %1457 = tail call float @llvm.nvvm.ex2.approx.f(float %1321) #3, !dbg !119 + br label %__nv_exp2f.exit1554, !dbg !119 + +__nv_exp2f.exit1554: ; preds = %1454, %1456 + %.0.i1553 = phi float [ %1455, %1454 ], [ %1457, %1456 ], !dbg !119 + %1458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1555 = icmp eq i32 %1458, 0, !dbg !119 + br i1 %.not.i1555, label %1461, label %1459, !dbg !119 + +1459: ; preds = %__nv_exp2f.exit1554 + %1460 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1322) #3, !dbg !119 + br label %__nv_exp2f.exit1557, !dbg !119 + +1461: ; preds = %__nv_exp2f.exit1554 + %1462 = tail call float @llvm.nvvm.ex2.approx.f(float %1322) #3, !dbg !119 + br label %__nv_exp2f.exit1557, !dbg !119 + +__nv_exp2f.exit1557: ; preds = %1459, %1461 + %.0.i1556 = phi float [ %1460, %1459 ], [ %1462, %1461 ], !dbg !119 + %1463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1558 = icmp eq i32 %1463, 0, !dbg !119 + br i1 %.not.i1558, label %1466, label %1464, !dbg !119 + +1464: ; preds = %__nv_exp2f.exit1557 + %1465 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1323) #3, !dbg !119 + br label %__nv_exp2f.exit1560, !dbg !119 + +1466: ; preds = %__nv_exp2f.exit1557 + %1467 = tail call float @llvm.nvvm.ex2.approx.f(float %1323) #3, !dbg !119 + br label %__nv_exp2f.exit1560, !dbg !119 + +__nv_exp2f.exit1560: ; preds = %1464, %1466 + %.0.i1559 = phi float [ %1465, %1464 ], [ %1467, %1466 ], !dbg !119 + %1468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1561 = icmp eq i32 %1468, 0, !dbg !119 + br i1 %.not.i1561, label %1471, label %1469, !dbg !119 + +1469: ; preds = %__nv_exp2f.exit1560 + %1470 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1324) #3, !dbg !119 + br label %__nv_exp2f.exit1563, !dbg !119 + +1471: ; preds = %__nv_exp2f.exit1560 + %1472 = tail call float @llvm.nvvm.ex2.approx.f(float %1324) #3, !dbg !119 + br label %__nv_exp2f.exit1563, !dbg !119 + +__nv_exp2f.exit1563: ; preds = %1469, %1471 + %.0.i1562 = phi float [ %1470, %1469 ], [ %1472, %1471 ], !dbg !119 + %1473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1564 = icmp eq i32 %1473, 0, !dbg !119 + br i1 %.not.i1564, label %1476, label %1474, !dbg !119 + +1474: ; preds = %__nv_exp2f.exit1563 + %1475 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1325) #3, !dbg !119 + br label %__nv_exp2f.exit1566, !dbg !119 + +1476: ; preds = %__nv_exp2f.exit1563 + %1477 = tail call float @llvm.nvvm.ex2.approx.f(float %1325) #3, !dbg !119 + br label %__nv_exp2f.exit1566, !dbg !119 + +__nv_exp2f.exit1566: ; preds = %1474, %1476 + %.0.i1565 = phi float [ %1475, %1474 ], [ %1477, %1476 ], !dbg !119 + %1478 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1567 = icmp eq i32 %1478, 0, !dbg !119 + br i1 %.not.i1567, label %1481, label %1479, !dbg !119 + +1479: ; preds = %__nv_exp2f.exit1566 + %1480 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1326) #3, !dbg !119 + br label %__nv_exp2f.exit1569, !dbg !119 + +1481: ; preds = %__nv_exp2f.exit1566 + %1482 = tail call float @llvm.nvvm.ex2.approx.f(float %1326) #3, !dbg !119 + br label %__nv_exp2f.exit1569, !dbg !119 + +__nv_exp2f.exit1569: ; preds = %1479, %1481 + %.0.i1568 = phi float [ %1480, %1479 ], [ %1482, %1481 ], !dbg !119 + %1483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !119 + %.not.i1570 = icmp eq i32 %1483, 0, !dbg !119 + br i1 %.not.i1570, label %1486, label %1484, !dbg !119 + +1484: ; preds = %__nv_exp2f.exit1569 + %1485 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1327) #3, !dbg !119 + br label %__nv_exp2f.exit1572, !dbg !119 + +1486: ; preds = %__nv_exp2f.exit1569 + %1487 = tail call float @llvm.nvvm.ex2.approx.f(float %1327) #3, !dbg !119 + br label %__nv_exp2f.exit1572, !dbg !119 + +__nv_exp2f.exit1572: ; preds = %1484, %1486 + %.0.i1571 = phi float [ %1485, %1484 ], [ %1487, %1486 ], !dbg !119 + %1488 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %595, !dbg !97 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !120 + %1489 = add i32 %599, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1490 = lshr exact i32 %1489, 4, !dbg !120 + %1491 = and i32 %1490, 16383, !dbg !120 + %1492 = zext nneg i32 %1491 to i64, !dbg !120 + %1493 = or disjoint i64 %1492, 4611686293372403712, !dbg !120 + %1494 = ptrtoint ptr addrspace(3) %1488 to i32, !dbg !120 + %1495 = lshr exact i32 %1494, 4, !dbg !120 + %1496 = and i32 %1495, 16383, !dbg !120 + %1497 = zext nneg i32 %1496 to i64, !dbg !120 + %1498 = or disjoint i64 %1497, 4611686293338849280, !dbg !120 + %1499 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %1493, i64 %1498) #3, !dbg !120 + %1500 = add i32 %611, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1501 = lshr exact i32 %1500, 4, !dbg !120 + %1502 = and i32 %1501, 16383, !dbg !120 + %1503 = zext nneg i32 %1502 to i64, !dbg !120 + %1504 = or disjoint i64 %1503, 4611686293372403712, !dbg !120 + %1505 = add i32 %1494, 32, !dbg !120 + %1506 = lshr exact i32 %1505, 4, !dbg !120 + %1507 = and i32 %1506, 16383, !dbg !120 + %1508 = zext nneg i32 %1507 to i64, !dbg !120 + %1509 = or disjoint i64 %1508, 4611686293338849280, !dbg !120 + %1510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 0, !dbg !120 + %1511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 1, !dbg !120 + %1512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 2, !dbg !120 + %1513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 3, !dbg !120 + %1514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 4, !dbg !120 + %1515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 5, !dbg !120 + %1516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 6, !dbg !120 + %1517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 7, !dbg !120 + %1518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 8, !dbg !120 + %1519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 9, !dbg !120 + %1520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 10, !dbg !120 + %1521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 11, !dbg !120 + %1522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 12, !dbg !120 + %1523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 13, !dbg !120 + %1524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 14, !dbg !120 + %1525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 15, !dbg !120 + %1526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 16, !dbg !120 + %1527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 17, !dbg !120 + %1528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 18, !dbg !120 + %1529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 19, !dbg !120 + %1530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 20, !dbg !120 + %1531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 21, !dbg !120 + %1532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 22, !dbg !120 + %1533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 23, !dbg !120 + %1534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 24, !dbg !120 + %1535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 25, !dbg !120 + %1536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 26, !dbg !120 + %1537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 27, !dbg !120 + %1538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 28, !dbg !120 + %1539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 29, !dbg !120 + %1540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 30, !dbg !120 + %1541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1499, 31, !dbg !120 + %1542 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1510, float %1511, float %1512, float %1513, float %1514, float %1515, float %1516, float %1517, float %1518, float %1519, float %1520, float %1521, float %1522, float %1523, float %1524, float %1525, float %1526, float %1527, float %1528, float %1529, float %1530, float %1531, float %1532, float %1533, float %1534, float %1535, float %1536, float %1537, float %1538, float %1539, float %1540, float %1541, i64 %1504, i64 %1509, i1 true) #3, !dbg !120 + %1543 = add i32 %655, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1544 = lshr exact i32 %1543, 4, !dbg !120 + %1545 = and i32 %1544, 16383, !dbg !120 + %1546 = zext nneg i32 %1545 to i64, !dbg !120 + %1547 = or disjoint i64 %1546, 4611686293372403712, !dbg !120 + %1548 = add i32 %1494, 64, !dbg !120 + %1549 = lshr exact i32 %1548, 4, !dbg !120 + %1550 = and i32 %1549, 16383, !dbg !120 + %1551 = zext nneg i32 %1550 to i64, !dbg !120 + %1552 = or disjoint i64 %1551, 4611686293338849280, !dbg !120 + %1553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 0, !dbg !120 + %1554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 1, !dbg !120 + %1555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 2, !dbg !120 + %1556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 3, !dbg !120 + %1557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 4, !dbg !120 + %1558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 5, !dbg !120 + %1559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 6, !dbg !120 + %1560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 7, !dbg !120 + %1561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 8, !dbg !120 + %1562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 9, !dbg !120 + %1563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 10, !dbg !120 + %1564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 11, !dbg !120 + %1565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 12, !dbg !120 + %1566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 13, !dbg !120 + %1567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 14, !dbg !120 + %1568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 15, !dbg !120 + %1569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 16, !dbg !120 + %1570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 17, !dbg !120 + %1571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 18, !dbg !120 + %1572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 19, !dbg !120 + %1573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 20, !dbg !120 + %1574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 21, !dbg !120 + %1575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 22, !dbg !120 + %1576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 23, !dbg !120 + %1577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 24, !dbg !120 + %1578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 25, !dbg !120 + %1579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 26, !dbg !120 + %1580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 27, !dbg !120 + %1581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 28, !dbg !120 + %1582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 29, !dbg !120 + %1583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 30, !dbg !120 + %1584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1542, 31, !dbg !120 + %1585 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1553, float %1554, float %1555, float %1556, float %1557, float %1558, float %1559, float %1560, float %1561, float %1562, float %1563, float %1564, float %1565, float %1566, float %1567, float %1568, float %1569, float %1570, float %1571, float %1572, float %1573, float %1574, float %1575, float %1576, float %1577, float %1578, float %1579, float %1580, float %1581, float %1582, float %1583, float %1584, i64 %1547, i64 %1552, i1 true) #3, !dbg !120 + %1586 = add i32 %699, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1587 = lshr exact i32 %1586, 4, !dbg !120 + %1588 = and i32 %1587, 16383, !dbg !120 + %1589 = zext nneg i32 %1588 to i64, !dbg !120 + %1590 = or disjoint i64 %1589, 4611686293372403712, !dbg !120 + %1591 = add i32 %1494, 96, !dbg !120 + %1592 = lshr exact i32 %1591, 4, !dbg !120 + %1593 = and i32 %1592, 16383, !dbg !120 + %1594 = zext nneg i32 %1593 to i64, !dbg !120 + %1595 = or disjoint i64 %1594, 4611686293338849280, !dbg !120 + %1596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 0, !dbg !120 + %1597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 1, !dbg !120 + %1598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 2, !dbg !120 + %1599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 3, !dbg !120 + %1600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 4, !dbg !120 + %1601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 5, !dbg !120 + %1602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 6, !dbg !120 + %1603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 7, !dbg !120 + %1604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 8, !dbg !120 + %1605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 9, !dbg !120 + %1606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 10, !dbg !120 + %1607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 11, !dbg !120 + %1608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 12, !dbg !120 + %1609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 13, !dbg !120 + %1610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 14, !dbg !120 + %1611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 15, !dbg !120 + %1612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 16, !dbg !120 + %1613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 17, !dbg !120 + %1614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 18, !dbg !120 + %1615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 19, !dbg !120 + %1616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 20, !dbg !120 + %1617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 21, !dbg !120 + %1618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 22, !dbg !120 + %1619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 23, !dbg !120 + %1620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 24, !dbg !120 + %1621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 25, !dbg !120 + %1622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 26, !dbg !120 + %1623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 27, !dbg !120 + %1624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 28, !dbg !120 + %1625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 29, !dbg !120 + %1626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 30, !dbg !120 + %1627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1585, 31, !dbg !120 + %1628 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1596, float %1597, float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, i64 %1590, i64 %1595, i1 true) #3, !dbg !120 + %1629 = add i32 %743, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1630 = lshr exact i32 %1629, 4, !dbg !120 + %1631 = and i32 %1630, 16383, !dbg !120 + %1632 = zext nneg i32 %1631 to i64, !dbg !120 + %1633 = or disjoint i64 %1632, 4611686293372403712, !dbg !120 + %1634 = add i32 %1494, 8192, !dbg !120 + %1635 = lshr exact i32 %1634, 4, !dbg !120 + %1636 = and i32 %1635, 16383, !dbg !120 + %1637 = zext nneg i32 %1636 to i64, !dbg !120 + %1638 = or disjoint i64 %1637, 4611686293338849280, !dbg !120 + %1639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 0, !dbg !120 + %1640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 1, !dbg !120 + %1641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 2, !dbg !120 + %1642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 3, !dbg !120 + %1643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 4, !dbg !120 + %1644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 5, !dbg !120 + %1645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 6, !dbg !120 + %1646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 7, !dbg !120 + %1647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 8, !dbg !120 + %1648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 9, !dbg !120 + %1649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 10, !dbg !120 + %1650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 11, !dbg !120 + %1651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 12, !dbg !120 + %1652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 13, !dbg !120 + %1653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 14, !dbg !120 + %1654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 15, !dbg !120 + %1655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 16, !dbg !120 + %1656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 17, !dbg !120 + %1657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 18, !dbg !120 + %1658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 19, !dbg !120 + %1659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 20, !dbg !120 + %1660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 21, !dbg !120 + %1661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 22, !dbg !120 + %1662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 23, !dbg !120 + %1663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 24, !dbg !120 + %1664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 25, !dbg !120 + %1665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 26, !dbg !120 + %1666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 27, !dbg !120 + %1667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 28, !dbg !120 + %1668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 29, !dbg !120 + %1669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 30, !dbg !120 + %1670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1628, 31, !dbg !120 + %1671 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, float %1662, float %1663, float %1664, float %1665, float %1666, float %1667, float %1668, float %1669, float %1670, i64 %1633, i64 %1638, i1 true) #3, !dbg !120 + %1672 = add i32 %787, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1673 = lshr exact i32 %1672, 4, !dbg !120 + %1674 = and i32 %1673, 16383, !dbg !120 + %1675 = zext nneg i32 %1674 to i64, !dbg !120 + %1676 = or disjoint i64 %1675, 4611686293372403712, !dbg !120 + %1677 = add i32 %1494, 8224, !dbg !120 + %1678 = lshr exact i32 %1677, 4, !dbg !120 + %1679 = and i32 %1678, 16383, !dbg !120 + %1680 = zext nneg i32 %1679 to i64, !dbg !120 + %1681 = or disjoint i64 %1680, 4611686293338849280, !dbg !120 + %1682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 0, !dbg !120 + %1683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 1, !dbg !120 + %1684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 2, !dbg !120 + %1685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 3, !dbg !120 + %1686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 4, !dbg !120 + %1687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 5, !dbg !120 + %1688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 6, !dbg !120 + %1689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 7, !dbg !120 + %1690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 8, !dbg !120 + %1691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 9, !dbg !120 + %1692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 10, !dbg !120 + %1693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 11, !dbg !120 + %1694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 12, !dbg !120 + %1695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 13, !dbg !120 + %1696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 14, !dbg !120 + %1697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 15, !dbg !120 + %1698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 16, !dbg !120 + %1699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 17, !dbg !120 + %1700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 18, !dbg !120 + %1701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 19, !dbg !120 + %1702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 20, !dbg !120 + %1703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 21, !dbg !120 + %1704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 22, !dbg !120 + %1705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 23, !dbg !120 + %1706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 24, !dbg !120 + %1707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 25, !dbg !120 + %1708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 26, !dbg !120 + %1709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 27, !dbg !120 + %1710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 28, !dbg !120 + %1711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 29, !dbg !120 + %1712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 30, !dbg !120 + %1713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1671, 31, !dbg !120 + %1714 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1682, float %1683, float %1684, float %1685, float %1686, float %1687, float %1688, float %1689, float %1690, float %1691, float %1692, float %1693, float %1694, float %1695, float %1696, float %1697, float %1698, float %1699, float %1700, float %1701, float %1702, float %1703, float %1704, float %1705, float %1706, float %1707, float %1708, float %1709, float %1710, float %1711, float %1712, float %1713, i64 %1676, i64 %1681, i1 true) #3, !dbg !120 + %1715 = add i32 %831, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1716 = lshr exact i32 %1715, 4, !dbg !120 + %1717 = and i32 %1716, 16383, !dbg !120 + %1718 = zext nneg i32 %1717 to i64, !dbg !120 + %1719 = or disjoint i64 %1718, 4611686293372403712, !dbg !120 + %1720 = add i32 %1494, 8256, !dbg !120 + %1721 = lshr exact i32 %1720, 4, !dbg !120 + %1722 = and i32 %1721, 16383, !dbg !120 + %1723 = zext nneg i32 %1722 to i64, !dbg !120 + %1724 = or disjoint i64 %1723, 4611686293338849280, !dbg !120 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 0, !dbg !120 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 1, !dbg !120 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 2, !dbg !120 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 3, !dbg !120 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 4, !dbg !120 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 5, !dbg !120 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 6, !dbg !120 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 7, !dbg !120 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 8, !dbg !120 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 9, !dbg !120 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 10, !dbg !120 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 11, !dbg !120 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 12, !dbg !120 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 13, !dbg !120 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 14, !dbg !120 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 15, !dbg !120 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 16, !dbg !120 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 17, !dbg !120 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 18, !dbg !120 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 19, !dbg !120 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 20, !dbg !120 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 21, !dbg !120 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 22, !dbg !120 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 23, !dbg !120 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 24, !dbg !120 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 25, !dbg !120 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 26, !dbg !120 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 27, !dbg !120 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 28, !dbg !120 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 29, !dbg !120 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 30, !dbg !120 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1714, 31, !dbg !120 + %1757 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, i64 %1719, i64 %1724, i1 true) #3, !dbg !120 + %1758 = add i32 %875, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !120 + %1759 = lshr exact i32 %1758, 4, !dbg !120 + %1760 = and i32 %1759, 16383, !dbg !120 + %1761 = zext nneg i32 %1760 to i64, !dbg !120 + %1762 = or disjoint i64 %1761, 4611686293372403712, !dbg !120 + %1763 = add i32 %1494, 8288, !dbg !120 + %1764 = lshr exact i32 %1763, 4, !dbg !120 + %1765 = and i32 %1764, 16383, !dbg !120 + %1766 = zext nneg i32 %1765 to i64, !dbg !120 + %1767 = or disjoint i64 %1766, 4611686293338849280, !dbg !120 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 0, !dbg !120 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 1, !dbg !120 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 2, !dbg !120 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 3, !dbg !120 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 4, !dbg !120 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 5, !dbg !120 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 6, !dbg !120 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 7, !dbg !120 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 8, !dbg !120 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 9, !dbg !120 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 10, !dbg !120 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 11, !dbg !120 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 12, !dbg !120 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 13, !dbg !120 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 14, !dbg !120 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 15, !dbg !120 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 16, !dbg !120 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 17, !dbg !120 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 18, !dbg !120 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 19, !dbg !120 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 20, !dbg !120 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 21, !dbg !120 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 22, !dbg !120 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 23, !dbg !120 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 24, !dbg !120 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 25, !dbg !120 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 26, !dbg !120 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 27, !dbg !120 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 28, !dbg !120 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 29, !dbg !120 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 30, !dbg !120 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1757, 31, !dbg !120 + %1800 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, i64 %1762, i64 %1767, i1 true) #3, !dbg !120 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 0, !dbg !120 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 1, !dbg !120 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 2, !dbg !120 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 3, !dbg !120 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 4, !dbg !120 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 5, !dbg !120 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 6, !dbg !120 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 7, !dbg !120 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 8, !dbg !120 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 9, !dbg !120 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 10, !dbg !120 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 11, !dbg !120 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 12, !dbg !120 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 13, !dbg !120 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 14, !dbg !120 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 15, !dbg !120 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 16, !dbg !120 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 17, !dbg !120 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 18, !dbg !120 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 19, !dbg !120 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 20, !dbg !120 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 21, !dbg !120 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 22, !dbg !120 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 23, !dbg !120 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 24, !dbg !120 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 25, !dbg !120 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 26, !dbg !120 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 27, !dbg !120 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 28, !dbg !120 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 29, !dbg !120 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 30, !dbg !120 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1800, 31, !dbg !120 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !120 + %1833 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %1801, float %1802, float %1803, float %1804, float %1805, float %1806, float %1807, float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float %1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %1488, i32 0, i32 0) #3, !dbg !120 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 0, !dbg !120 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 1, !dbg !120 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 2, !dbg !120 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 3, !dbg !120 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 4, !dbg !120 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 5, !dbg !120 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 6, !dbg !120 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 7, !dbg !120 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 8, !dbg !120 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 9, !dbg !120 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 10, !dbg !120 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 11, !dbg !120 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 12, !dbg !120 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 13, !dbg !120 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 14, !dbg !120 + %1849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 15, !dbg !120 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 16, !dbg !120 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 17, !dbg !120 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 18, !dbg !120 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 19, !dbg !120 + %1854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 20, !dbg !120 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 21, !dbg !120 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 22, !dbg !120 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 23, !dbg !120 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 24, !dbg !120 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 25, !dbg !120 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 26, !dbg !120 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 27, !dbg !120 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 28, !dbg !120 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 29, !dbg !120 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 30, !dbg !120 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %1833, 31, !dbg !120 + %1866 = insertelement <2 x float> poison, float %1834, i64 0, !dbg !105 + %1867 = insertelement <2 x float> %1866, float %1835, i64 1, !dbg !105 + %1868 = fsub <2 x float> %1867, %504, !dbg !105 + %1869 = insertelement <2 x float> poison, float %.0.i1478, i64 0, !dbg !121 + %1870 = insertelement <2 x float> %1869, float %.0.i1481, i64 1, !dbg !121 + %1871 = fmul <2 x float> %1870, %1868, !dbg !121 + %1872 = fptrunc <2 x float> %1871 to <2 x bfloat>, !dbg !122 + %1873 = select <2 x i1> %1037, <2 x bfloat> %1872, <2 x bfloat> zeroinitializer, !dbg !123 + %1874 = insertelement <2 x float> poison, float %1836, i64 0, !dbg !105 + %1875 = insertelement <2 x float> %1874, float %1837, i64 1, !dbg !105 + %1876 = fsub <2 x float> %1875, %500, !dbg !105 + %1877 = insertelement <2 x float> poison, float %.0.i1484, i64 0, !dbg !121 + %1878 = insertelement <2 x float> %1877, float %.0.i1487, i64 1, !dbg !121 + %1879 = fmul <2 x float> %1878, %1876, !dbg !121 + %1880 = fptrunc <2 x float> %1879 to <2 x bfloat>, !dbg !122 + %1881 = select <2 x i1> %1038, <2 x bfloat> %1880, <2 x bfloat> zeroinitializer, !dbg !123 + %1882 = insertelement <2 x float> poison, float %1838, i64 0, !dbg !105 + %1883 = insertelement <2 x float> %1882, float %1839, i64 1, !dbg !105 + %1884 = fsub <2 x float> %1883, %504, !dbg !105 + %1885 = insertelement <2 x float> poison, float %.0.i1490, i64 0, !dbg !121 + %1886 = insertelement <2 x float> %1885, float %.0.i1493, i64 1, !dbg !121 + %1887 = fmul <2 x float> %1886, %1884, !dbg !121 + %1888 = fptrunc <2 x float> %1887 to <2 x bfloat>, !dbg !122 + %1889 = select <2 x i1> %1060, <2 x bfloat> %1888, <2 x bfloat> zeroinitializer, !dbg !123 + %1890 = insertelement <2 x float> poison, float %1840, i64 0, !dbg !105 + %1891 = insertelement <2 x float> %1890, float %1841, i64 1, !dbg !105 + %1892 = fsub <2 x float> %1891, %500, !dbg !105 + %1893 = insertelement <2 x float> poison, float %.0.i1496, i64 0, !dbg !121 + %1894 = insertelement <2 x float> %1893, float %.0.i1499, i64 1, !dbg !121 + %1895 = fmul <2 x float> %1894, %1892, !dbg !121 + %1896 = fptrunc <2 x float> %1895 to <2 x bfloat>, !dbg !122 + %1897 = select <2 x i1> %1061, <2 x bfloat> %1896, <2 x bfloat> zeroinitializer, !dbg !123 + %1898 = insertelement <2 x float> poison, float %1842, i64 0, !dbg !105 + %1899 = insertelement <2 x float> %1898, float %1843, i64 1, !dbg !105 + %1900 = fsub <2 x float> %1899, %504, !dbg !105 + %1901 = insertelement <2 x float> poison, float %.0.i1502, i64 0, !dbg !121 + %1902 = insertelement <2 x float> %1901, float %.0.i1505, i64 1, !dbg !121 + %1903 = fmul <2 x float> %1902, %1900, !dbg !121 + %1904 = fptrunc <2 x float> %1903 to <2 x bfloat>, !dbg !122 + %1905 = select <2 x i1> %1083, <2 x bfloat> %1904, <2 x bfloat> zeroinitializer, !dbg !123 + %1906 = insertelement <2 x float> poison, float %1844, i64 0, !dbg !105 + %1907 = insertelement <2 x float> %1906, float %1845, i64 1, !dbg !105 + %1908 = fsub <2 x float> %1907, %500, !dbg !105 + %1909 = insertelement <2 x float> poison, float %.0.i1508, i64 0, !dbg !121 + %1910 = insertelement <2 x float> %1909, float %.0.i1511, i64 1, !dbg !121 + %1911 = fmul <2 x float> %1910, %1908, !dbg !121 + %1912 = fptrunc <2 x float> %1911 to <2 x bfloat>, !dbg !122 + %1913 = select <2 x i1> %1084, <2 x bfloat> %1912, <2 x bfloat> zeroinitializer, !dbg !123 + %1914 = insertelement <2 x float> poison, float %1846, i64 0, !dbg !105 + %1915 = insertelement <2 x float> %1914, float %1847, i64 1, !dbg !105 + %1916 = fsub <2 x float> %1915, %504, !dbg !105 + %1917 = insertelement <2 x float> poison, float %.0.i1514, i64 0, !dbg !121 + %1918 = insertelement <2 x float> %1917, float %.0.i1517, i64 1, !dbg !121 + %1919 = fmul <2 x float> %1918, %1916, !dbg !121 + %1920 = fptrunc <2 x float> %1919 to <2 x bfloat>, !dbg !122 + %1921 = select <2 x i1> %1106, <2 x bfloat> %1920, <2 x bfloat> zeroinitializer, !dbg !123 + %1922 = insertelement <2 x float> poison, float %1848, i64 0, !dbg !105 + %1923 = insertelement <2 x float> %1922, float %1849, i64 1, !dbg !105 + %1924 = fsub <2 x float> %1923, %500, !dbg !105 + %1925 = insertelement <2 x float> poison, float %.0.i1520, i64 0, !dbg !121 + %1926 = insertelement <2 x float> %1925, float %.0.i1523, i64 1, !dbg !121 + %1927 = fmul <2 x float> %1926, %1924, !dbg !121 + %1928 = fptrunc <2 x float> %1927 to <2 x bfloat>, !dbg !122 + %1929 = select <2 x i1> %1107, <2 x bfloat> %1928, <2 x bfloat> zeroinitializer, !dbg !123 + %1930 = insertelement <2 x float> poison, float %1850, i64 0, !dbg !105 + %1931 = insertelement <2 x float> %1930, float %1851, i64 1, !dbg !105 + %1932 = fsub <2 x float> %1931, %504, !dbg !105 + %1933 = insertelement <2 x float> poison, float %.0.i1526, i64 0, !dbg !121 + %1934 = insertelement <2 x float> %1933, float %.0.i1529, i64 1, !dbg !121 + %1935 = fmul <2 x float> %1934, %1932, !dbg !121 + %1936 = fptrunc <2 x float> %1935 to <2 x bfloat>, !dbg !122 + %1937 = select <2 x i1> %1129, <2 x bfloat> %1936, <2 x bfloat> zeroinitializer, !dbg !123 + %1938 = insertelement <2 x float> poison, float %1852, i64 0, !dbg !105 + %1939 = insertelement <2 x float> %1938, float %1853, i64 1, !dbg !105 + %1940 = fsub <2 x float> %1939, %500, !dbg !105 + %1941 = insertelement <2 x float> poison, float %.0.i1532, i64 0, !dbg !121 + %1942 = insertelement <2 x float> %1941, float %.0.i1535, i64 1, !dbg !121 + %1943 = fmul <2 x float> %1942, %1940, !dbg !121 + %1944 = fptrunc <2 x float> %1943 to <2 x bfloat>, !dbg !122 + %1945 = select <2 x i1> %1130, <2 x bfloat> %1944, <2 x bfloat> zeroinitializer, !dbg !123 + %1946 = insertelement <2 x float> poison, float %1854, i64 0, !dbg !105 + %1947 = insertelement <2 x float> %1946, float %1855, i64 1, !dbg !105 + %1948 = fsub <2 x float> %1947, %504, !dbg !105 + %1949 = insertelement <2 x float> poison, float %.0.i1538, i64 0, !dbg !121 + %1950 = insertelement <2 x float> %1949, float %.0.i1541, i64 1, !dbg !121 + %1951 = fmul <2 x float> %1950, %1948, !dbg !121 + %1952 = fptrunc <2 x float> %1951 to <2 x bfloat>, !dbg !122 + %1953 = select <2 x i1> %1152, <2 x bfloat> %1952, <2 x bfloat> zeroinitializer, !dbg !123 + %1954 = insertelement <2 x float> poison, float %1856, i64 0, !dbg !105 + %1955 = insertelement <2 x float> %1954, float %1857, i64 1, !dbg !105 + %1956 = fsub <2 x float> %1955, %500, !dbg !105 + %1957 = insertelement <2 x float> poison, float %.0.i1544, i64 0, !dbg !121 + %1958 = insertelement <2 x float> %1957, float %.0.i1547, i64 1, !dbg !121 + %1959 = fmul <2 x float> %1958, %1956, !dbg !121 + %1960 = fptrunc <2 x float> %1959 to <2 x bfloat>, !dbg !122 + %1961 = select <2 x i1> %1153, <2 x bfloat> %1960, <2 x bfloat> zeroinitializer, !dbg !123 + %1962 = insertelement <2 x float> poison, float %1858, i64 0, !dbg !105 + %1963 = insertelement <2 x float> %1962, float %1859, i64 1, !dbg !105 + %1964 = fsub <2 x float> %1963, %504, !dbg !105 + %1965 = insertelement <2 x float> poison, float %.0.i1550, i64 0, !dbg !121 + %1966 = insertelement <2 x float> %1965, float %.0.i1553, i64 1, !dbg !121 + %1967 = fmul <2 x float> %1966, %1964, !dbg !121 + %1968 = fptrunc <2 x float> %1967 to <2 x bfloat>, !dbg !122 + %1969 = select <2 x i1> %1175, <2 x bfloat> %1968, <2 x bfloat> zeroinitializer, !dbg !123 + %1970 = insertelement <2 x float> poison, float %1860, i64 0, !dbg !105 + %1971 = insertelement <2 x float> %1970, float %1861, i64 1, !dbg !105 + %1972 = fsub <2 x float> %1971, %500, !dbg !105 + %1973 = insertelement <2 x float> poison, float %.0.i1556, i64 0, !dbg !121 + %1974 = insertelement <2 x float> %1973, float %.0.i1559, i64 1, !dbg !121 + %1975 = fmul <2 x float> %1974, %1972, !dbg !121 + %1976 = fptrunc <2 x float> %1975 to <2 x bfloat>, !dbg !122 + %1977 = select <2 x i1> %1176, <2 x bfloat> %1976, <2 x bfloat> zeroinitializer, !dbg !123 + %1978 = insertelement <2 x float> poison, float %1862, i64 0, !dbg !105 + %1979 = insertelement <2 x float> %1978, float %1863, i64 1, !dbg !105 + %1980 = fsub <2 x float> %1979, %504, !dbg !105 + %1981 = insertelement <2 x float> poison, float %.0.i1562, i64 0, !dbg !121 + %1982 = insertelement <2 x float> %1981, float %.0.i1565, i64 1, !dbg !121 + %1983 = fmul <2 x float> %1982, %1980, !dbg !121 + %1984 = fptrunc <2 x float> %1983 to <2 x bfloat>, !dbg !122 + %1985 = select <2 x i1> %1198, <2 x bfloat> %1984, <2 x bfloat> zeroinitializer, !dbg !123 + %1986 = insertelement <2 x float> poison, float %1864, i64 0, !dbg !105 + %1987 = insertelement <2 x float> %1986, float %1865, i64 1, !dbg !105 + %1988 = fsub <2 x float> %1987, %500, !dbg !105 + %1989 = insertelement <2 x float> poison, float %.0.i1568, i64 0, !dbg !121 + %1990 = insertelement <2 x float> %1989, float %.0.i1571, i64 1, !dbg !121 + %1991 = fmul <2 x float> %1990, %1988, !dbg !121 + %1992 = fptrunc <2 x float> %1991 to <2 x bfloat>, !dbg !122 + %1993 = select <2 x i1> %1199, <2 x bfloat> %1992, <2 x bfloat> zeroinitializer, !dbg !123 + %1994 = bitcast <2 x bfloat> %1873 to i32, !dbg !124 + %1995 = bitcast <2 x bfloat> %1881 to i32, !dbg !124 + %1996 = bitcast <2 x bfloat> %1889 to i32, !dbg !124 + %1997 = bitcast <2 x bfloat> %1897 to i32, !dbg !124 + %1998 = bitcast <2 x bfloat> %1905 to i32, !dbg !124 + %1999 = bitcast <2 x bfloat> %1913 to i32, !dbg !124 + %2000 = bitcast <2 x bfloat> %1921 to i32, !dbg !124 + %2001 = bitcast <2 x bfloat> %1929 to i32, !dbg !124 + %2002 = bitcast <2 x bfloat> %1937 to i32, !dbg !124 + %2003 = bitcast <2 x bfloat> %1945 to i32, !dbg !124 + %2004 = bitcast <2 x bfloat> %1953 to i32, !dbg !124 + %2005 = bitcast <2 x bfloat> %1961 to i32, !dbg !124 + %2006 = bitcast <2 x bfloat> %1969 to i32, !dbg !124 + %2007 = bitcast <2 x bfloat> %1977 to i32, !dbg !124 + %2008 = bitcast <2 x bfloat> %1985 to i32, !dbg !124 + %2009 = bitcast <2 x bfloat> %1993 to i32, !dbg !124 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !124 + %2010 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, float %525, float %526, float %527, float %528, float %529, float %530, float %531, float %532, float %533, float %534, float %535, float %536, float %537, float %538, float %539, float %540, float %541, float %542, float %543, float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, i32 %1994, i32 %1995, i32 %1996, i32 %1997, i64 %609, i1 true) #3, !dbg !124 + %2011 = add i32 %605, 2048, !dbg !124 + %2012 = lshr exact i32 %2011, 4, !dbg !124 + %2013 = and i32 %2012, 16383, !dbg !124 + %2014 = zext nneg i32 %2013 to i64, !dbg !124 + %2015 = or disjoint i64 %2014, 4611686293338849280, !dbg !124 + %2016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 0, !dbg !124 + %2017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 1, !dbg !124 + %2018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 2, !dbg !124 + %2019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 3, !dbg !124 + %2020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 4, !dbg !124 + %2021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 5, !dbg !124 + %2022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 6, !dbg !124 + %2023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 7, !dbg !124 + %2024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 8, !dbg !124 + %2025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 9, !dbg !124 + %2026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 10, !dbg !124 + %2027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 11, !dbg !124 + %2028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 12, !dbg !124 + %2029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 13, !dbg !124 + %2030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 14, !dbg !124 + %2031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 15, !dbg !124 + %2032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 16, !dbg !124 + %2033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 17, !dbg !124 + %2034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 18, !dbg !124 + %2035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 19, !dbg !124 + %2036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 20, !dbg !124 + %2037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 21, !dbg !124 + %2038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 22, !dbg !124 + %2039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 23, !dbg !124 + %2040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 24, !dbg !124 + %2041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 25, !dbg !124 + %2042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 26, !dbg !124 + %2043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 27, !dbg !124 + %2044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 28, !dbg !124 + %2045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 29, !dbg !124 + %2046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 30, !dbg !124 + %2047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 31, !dbg !124 + %2048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 32, !dbg !124 + %2049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 33, !dbg !124 + %2050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 34, !dbg !124 + %2051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 35, !dbg !124 + %2052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 36, !dbg !124 + %2053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 37, !dbg !124 + %2054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 38, !dbg !124 + %2055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 39, !dbg !124 + %2056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 40, !dbg !124 + %2057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 41, !dbg !124 + %2058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 42, !dbg !124 + %2059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 43, !dbg !124 + %2060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 44, !dbg !124 + %2061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 45, !dbg !124 + %2062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 46, !dbg !124 + %2063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 47, !dbg !124 + %2064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 48, !dbg !124 + %2065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 49, !dbg !124 + %2066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 50, !dbg !124 + %2067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 51, !dbg !124 + %2068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 52, !dbg !124 + %2069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 53, !dbg !124 + %2070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 54, !dbg !124 + %2071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 55, !dbg !124 + %2072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 56, !dbg !124 + %2073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 57, !dbg !124 + %2074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 58, !dbg !124 + %2075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 59, !dbg !124 + %2076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 60, !dbg !124 + %2077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 61, !dbg !124 + %2078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 62, !dbg !124 + %2079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2010, 63, !dbg !124 + %2080 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2016, float %2017, float %2018, float %2019, float %2020, float %2021, float %2022, float %2023, float %2024, float %2025, float %2026, float %2027, float %2028, float %2029, float %2030, float %2031, float %2032, float %2033, float %2034, float %2035, float %2036, float %2037, float %2038, float %2039, float %2040, float %2041, float %2042, float %2043, float %2044, float %2045, float %2046, float %2047, float %2048, float %2049, float %2050, float %2051, float %2052, float %2053, float %2054, float %2055, float %2056, float %2057, float %2058, float %2059, float %2060, float %2061, float %2062, float %2063, float %2064, float %2065, float %2066, float %2067, float %2068, float %2069, float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, i32 %1998, i32 %1999, i32 %2000, i32 %2001, i64 %2015, i1 true) #3, !dbg !124 + %2081 = add i32 %605, 4096, !dbg !124 + %2082 = lshr exact i32 %2081, 4, !dbg !124 + %2083 = and i32 %2082, 16383, !dbg !124 + %2084 = zext nneg i32 %2083 to i64, !dbg !124 + %2085 = or disjoint i64 %2084, 4611686293338849280, !dbg !124 + %2086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 0, !dbg !124 + %2087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 1, !dbg !124 + %2088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 2, !dbg !124 + %2089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 3, !dbg !124 + %2090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 4, !dbg !124 + %2091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 5, !dbg !124 + %2092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 6, !dbg !124 + %2093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 7, !dbg !124 + %2094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 8, !dbg !124 + %2095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 9, !dbg !124 + %2096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 10, !dbg !124 + %2097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 11, !dbg !124 + %2098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 12, !dbg !124 + %2099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 13, !dbg !124 + %2100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 14, !dbg !124 + %2101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 15, !dbg !124 + %2102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 16, !dbg !124 + %2103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 17, !dbg !124 + %2104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 18, !dbg !124 + %2105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 19, !dbg !124 + %2106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 20, !dbg !124 + %2107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 21, !dbg !124 + %2108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 22, !dbg !124 + %2109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 23, !dbg !124 + %2110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 24, !dbg !124 + %2111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 25, !dbg !124 + %2112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 26, !dbg !124 + %2113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 27, !dbg !124 + %2114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 28, !dbg !124 + %2115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 29, !dbg !124 + %2116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 30, !dbg !124 + %2117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 31, !dbg !124 + %2118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 32, !dbg !124 + %2119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 33, !dbg !124 + %2120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 34, !dbg !124 + %2121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 35, !dbg !124 + %2122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 36, !dbg !124 + %2123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 37, !dbg !124 + %2124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 38, !dbg !124 + %2125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 39, !dbg !124 + %2126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 40, !dbg !124 + %2127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 41, !dbg !124 + %2128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 42, !dbg !124 + %2129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 43, !dbg !124 + %2130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 44, !dbg !124 + %2131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 45, !dbg !124 + %2132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 46, !dbg !124 + %2133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 47, !dbg !124 + %2134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 48, !dbg !124 + %2135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 49, !dbg !124 + %2136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 50, !dbg !124 + %2137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 51, !dbg !124 + %2138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 52, !dbg !124 + %2139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 53, !dbg !124 + %2140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 54, !dbg !124 + %2141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 55, !dbg !124 + %2142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 56, !dbg !124 + %2143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 57, !dbg !124 + %2144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 58, !dbg !124 + %2145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 59, !dbg !124 + %2146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 60, !dbg !124 + %2147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 61, !dbg !124 + %2148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 62, !dbg !124 + %2149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2080, 63, !dbg !124 + %2150 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110, float %2111, float %2112, float %2113, float %2114, float %2115, float %2116, float %2117, float %2118, float %2119, float %2120, float %2121, float %2122, float %2123, float %2124, float %2125, float %2126, float %2127, float %2128, float %2129, float %2130, float %2131, float %2132, float %2133, float %2134, float %2135, float %2136, float %2137, float %2138, float %2139, float %2140, float %2141, float %2142, float %2143, float %2144, float %2145, float %2146, float %2147, float %2148, float %2149, i32 %2002, i32 %2003, i32 %2004, i32 %2005, i64 %2085, i1 true) #3, !dbg !124 + %2151 = add i32 %605, 6144, !dbg !124 + %2152 = lshr exact i32 %2151, 4, !dbg !124 + %2153 = and i32 %2152, 16383, !dbg !124 + %2154 = zext nneg i32 %2153 to i64, !dbg !124 + %2155 = or disjoint i64 %2154, 4611686293338849280, !dbg !124 + %2156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 0, !dbg !124 + %2157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 1, !dbg !124 + %2158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 2, !dbg !124 + %2159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 3, !dbg !124 + %2160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 4, !dbg !124 + %2161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 5, !dbg !124 + %2162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 6, !dbg !124 + %2163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 7, !dbg !124 + %2164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 8, !dbg !124 + %2165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 9, !dbg !124 + %2166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 10, !dbg !124 + %2167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 11, !dbg !124 + %2168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 12, !dbg !124 + %2169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 13, !dbg !124 + %2170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 14, !dbg !124 + %2171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 15, !dbg !124 + %2172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 16, !dbg !124 + %2173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 17, !dbg !124 + %2174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 18, !dbg !124 + %2175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 19, !dbg !124 + %2176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 20, !dbg !124 + %2177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 21, !dbg !124 + %2178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 22, !dbg !124 + %2179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 23, !dbg !124 + %2180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 24, !dbg !124 + %2181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 25, !dbg !124 + %2182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 26, !dbg !124 + %2183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 27, !dbg !124 + %2184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 28, !dbg !124 + %2185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 29, !dbg !124 + %2186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 30, !dbg !124 + %2187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 31, !dbg !124 + %2188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 32, !dbg !124 + %2189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 33, !dbg !124 + %2190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 34, !dbg !124 + %2191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 35, !dbg !124 + %2192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 36, !dbg !124 + %2193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 37, !dbg !124 + %2194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 38, !dbg !124 + %2195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 39, !dbg !124 + %2196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 40, !dbg !124 + %2197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 41, !dbg !124 + %2198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 42, !dbg !124 + %2199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 43, !dbg !124 + %2200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 44, !dbg !124 + %2201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 45, !dbg !124 + %2202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 46, !dbg !124 + %2203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 47, !dbg !124 + %2204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 48, !dbg !124 + %2205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 49, !dbg !124 + %2206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 50, !dbg !124 + %2207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 51, !dbg !124 + %2208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 52, !dbg !124 + %2209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 53, !dbg !124 + %2210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 54, !dbg !124 + %2211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 55, !dbg !124 + %2212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 56, !dbg !124 + %2213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 57, !dbg !124 + %2214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 58, !dbg !124 + %2215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 59, !dbg !124 + %2216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 60, !dbg !124 + %2217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 61, !dbg !124 + %2218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 62, !dbg !124 + %2219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2150, 63, !dbg !124 + %2220 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %2156, float %2157, float %2158, float %2159, float %2160, float %2161, float %2162, float %2163, float %2164, float %2165, float %2166, float %2167, float %2168, float %2169, float %2170, float %2171, float %2172, float %2173, float %2174, float %2175, float %2176, float %2177, float %2178, float %2179, float %2180, float %2181, float %2182, float %2183, float %2184, float %2185, float %2186, float %2187, float %2188, float %2189, float %2190, float %2191, float %2192, float %2193, float %2194, float %2195, float %2196, float %2197, float %2198, float %2199, float %2200, float %2201, float %2202, float %2203, float %2204, float %2205, float %2206, float %2207, float %2208, float %2209, float %2210, float %2211, float %2212, float %2213, float %2214, float %2215, float %2216, float %2217, float %2218, float %2219, i32 %2006, i32 %2007, i32 %2008, i32 %2009, i64 %2155, i1 true) #3, !dbg !124 + %2221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 0, !dbg !124 + %2222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 1, !dbg !124 + %2223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 2, !dbg !124 + %2224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 3, !dbg !124 + %2225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 4, !dbg !124 + %2226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 5, !dbg !124 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 6, !dbg !124 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 7, !dbg !124 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 8, !dbg !124 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 9, !dbg !124 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 10, !dbg !124 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 11, !dbg !124 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 12, !dbg !124 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 13, !dbg !124 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 14, !dbg !124 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 15, !dbg !124 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 16, !dbg !124 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 17, !dbg !124 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 18, !dbg !124 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 19, !dbg !124 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 20, !dbg !124 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 21, !dbg !124 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 22, !dbg !124 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 23, !dbg !124 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 24, !dbg !124 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 25, !dbg !124 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 26, !dbg !124 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 27, !dbg !124 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 28, !dbg !124 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 29, !dbg !124 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 30, !dbg !124 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 31, !dbg !124 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 32, !dbg !124 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 33, !dbg !124 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 34, !dbg !124 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 35, !dbg !124 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 36, !dbg !124 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 37, !dbg !124 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 38, !dbg !124 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 39, !dbg !124 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 40, !dbg !124 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 41, !dbg !124 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 42, !dbg !124 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 43, !dbg !124 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 44, !dbg !124 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 45, !dbg !124 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 46, !dbg !124 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 47, !dbg !124 + %2269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 48, !dbg !124 + %2270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 49, !dbg !124 + %2271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 50, !dbg !124 + %2272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 51, !dbg !124 + %2273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 52, !dbg !124 + %2274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 53, !dbg !124 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 54, !dbg !124 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 55, !dbg !124 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 56, !dbg !124 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 57, !dbg !124 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 58, !dbg !124 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 59, !dbg !124 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 60, !dbg !124 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 61, !dbg !124 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 62, !dbg !124 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2220, 63, !dbg !124 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !124 + %2285 = insertelement <2 x i32> poison, i32 %506, i64 0, !dbg !100 + %2286 = shufflevector <2 x i32> %2285, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !100 + %2287 = add <2 x i32> %2286, %581, !dbg !100 + %2288 = add <2 x i32> %2286, %580, !dbg !100 + %2289 = add <2 x i32> %2286, %579, !dbg !100 + %2290 = add <2 x i32> %2286, %578, !dbg !100 + %2291 = add <2 x i32> %2286, %577, !dbg !100 + %2292 = add <2 x i32> %2286, %576, !dbg !100 + %2293 = add <2 x i32> %2286, %575, !dbg !100 + %2294 = add <2 x i32> %2286, %574, !dbg !100 + %2295 = add nuw nsw i32 %573, 1, !dbg !93 + %2296 = lshr i32 %2295, 1, !dbg !125 + %2297 = zext nneg i32 %2296 to i64, !dbg !126 + %2298 = getelementptr i32, ptr addrspace(1) %336, i64 %2297, !dbg !126 + %2299 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !127 + %2300 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2298, i64 %2299, i1 %583) #3, !dbg !127 + %2301 = add nuw nsw i32 %2296, 1, !dbg !128 + %2302 = icmp slt i32 %2301, %378, !dbg !129 + %2303 = getelementptr i8, ptr addrspace(1) %2298, i64 4, !dbg !130 + %2304 = and i1 %583, %2302, !dbg !93 + %2305 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !131 + %2306 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2303, i64 %2305, i1 %2304) #3, !dbg !131 + %2307 = and i32 %573, 1, !dbg !132 + %2308 = sub i32 %2306, %2300, !dbg !133 + %2309 = shl i32 %2308, 7, !dbg !134 + %2310 = add i32 %2309, -64, !dbg !135 + %2311 = xor i32 %2307, 1, !dbg !136 + %2312 = mul nuw nsw i32 %2310, %2311, !dbg !136 + %2313 = shl nuw nsw i32 %2307, 6, !dbg !137 + %2314 = add i32 %2312, %2313, !dbg !138 + %2315 = shl i32 %2314, 7, !dbg !139 + %2316 = sext i32 %2315 to i64, !dbg !98 + %2317 = getelementptr bfloat, ptr addrspace(1) %.pn9191580, i64 %2316, !dbg !98 + %2318 = getelementptr bfloat, ptr addrspace(1) %.pn9031581, i64 %2316, !dbg !98 + %2319 = getelementptr bfloat, ptr addrspace(1) %.pn8871582, i64 %2316, !dbg !98 + %2320 = getelementptr bfloat, ptr addrspace(1) %.pn8711583, i64 %2316, !dbg !98 + %2321 = getelementptr bfloat, ptr addrspace(1) %.pn9911588, i64 %2316, !dbg !99 + %2322 = getelementptr bfloat, ptr addrspace(1) %.pn9751589, i64 %2316, !dbg !99 + %2323 = getelementptr bfloat, ptr addrspace(1) %.pn9591590, i64 %2316, !dbg !99 + %2324 = getelementptr bfloat, ptr addrspace(1) %.pn9431591, i64 %2316, !dbg !99 + %2325 = add i32 %2314, %.pn9271584, !dbg !100 + %2326 = add i32 %2314, %.pn9251585, !dbg !100 + %2327 = add i32 %2314, %.pn9231586, !dbg !100 + %2328 = add i32 %2314, %.pn9211587, !dbg !100 + %2329 = add i32 %508, 1, !dbg !93 + %2330 = icmp sgt i32 %2329, 2, !dbg !93 + %2331 = select i1 %2330, i32 0, i32 %2329, !dbg !93 + %2332 = icmp slt i32 %2325, %18, !dbg !96 + %2333 = icmp slt i32 %2326, %18, !dbg !96 + %2334 = icmp slt i32 %2327, %18, !dbg !96 + %2335 = icmp slt i32 %2328, %18, !dbg !96 + %2336 = shl i32 %2331, 13, !dbg !97 + %2337 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2336, !dbg !97 + %2338 = and i1 %582, %2332, !dbg !93 + %2339 = and i1 %582, %2333, !dbg !93 + %2340 = and i1 %582, %2334, !dbg !93 + %2341 = and i1 %582, %2335, !dbg !93 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !97 + %2342 = getelementptr inbounds nuw i8, ptr addrspace(3) %2337, i32 %422, !dbg !97 + %2343 = select i1 %2338, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2342, ptr addrspace(1) %2317, i32 %2343) #3, !dbg !97 + %2344 = getelementptr inbounds nuw i8, ptr addrspace(3) %2337, i32 %425, !dbg !97 + %2345 = select i1 %2339, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2344, ptr addrspace(1) %2318, i32 %2345) #3, !dbg !97 + %2346 = getelementptr inbounds nuw i8, ptr addrspace(3) %2337, i32 %428, !dbg !97 + %2347 = select i1 %2340, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2346, ptr addrspace(1) %2319, i32 %2347) #3, !dbg !97 + %2348 = getelementptr inbounds nuw i8, ptr addrspace(3) %2337, i32 %431, !dbg !97 + %2349 = select i1 %2341, i32 16, i32 0, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2348, ptr addrspace(1) %2320, i32 %2349) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + %2350 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2336, !dbg !97 + %2351 = getelementptr inbounds nuw i8, ptr addrspace(3) %2350, i32 %422, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2351, ptr addrspace(1) %2321, i32 %2343) #3, !dbg !97 + %2352 = getelementptr inbounds nuw i8, ptr addrspace(3) %2350, i32 %425, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2352, ptr addrspace(1) %2322, i32 %2345) #3, !dbg !97 + %2353 = getelementptr inbounds nuw i8, ptr addrspace(3) %2350, i32 %428, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2353, ptr addrspace(1) %2323, i32 %2347) #3, !dbg !97 + %2354 = getelementptr inbounds nuw i8, ptr addrspace(3) %2350, i32 %431, !dbg !97 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2354, ptr addrspace(1) %2324, i32 %2349) #3, !dbg !97 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !97 + %exitcond.not = icmp eq i32 %2295, %smax, !dbg !93 + br i1 %exitcond.not, label %._crit_edge, label %505, !dbg !93 + +._crit_edge: ; preds = %__nv_exp2f.exit1572, %55 + %2355 = phi float [ 0.000000e+00, %55 ], [ %2221, %__nv_exp2f.exit1572 ] + %2356 = phi float [ 0.000000e+00, %55 ], [ %2222, %__nv_exp2f.exit1572 ] + %2357 = phi float [ 0.000000e+00, %55 ], [ %2223, %__nv_exp2f.exit1572 ] + %2358 = phi float [ 0.000000e+00, %55 ], [ %2224, %__nv_exp2f.exit1572 ] + %2359 = phi float [ 0.000000e+00, %55 ], [ %2225, %__nv_exp2f.exit1572 ] + %2360 = phi float [ 0.000000e+00, %55 ], [ %2226, %__nv_exp2f.exit1572 ] + %2361 = phi float [ 0.000000e+00, %55 ], [ %2227, %__nv_exp2f.exit1572 ] + %2362 = phi float [ 0.000000e+00, %55 ], [ %2228, %__nv_exp2f.exit1572 ] + %2363 = phi float [ 0.000000e+00, %55 ], [ %2229, %__nv_exp2f.exit1572 ] + %2364 = phi float [ 0.000000e+00, %55 ], [ %2230, %__nv_exp2f.exit1572 ] + %2365 = phi float [ 0.000000e+00, %55 ], [ %2231, %__nv_exp2f.exit1572 ] + %2366 = phi float [ 0.000000e+00, %55 ], [ %2232, %__nv_exp2f.exit1572 ] + %2367 = phi float [ 0.000000e+00, %55 ], [ %2233, %__nv_exp2f.exit1572 ] + %2368 = phi float [ 0.000000e+00, %55 ], [ %2234, %__nv_exp2f.exit1572 ] + %2369 = phi float [ 0.000000e+00, %55 ], [ %2235, %__nv_exp2f.exit1572 ] + %2370 = phi float [ 0.000000e+00, %55 ], [ %2236, %__nv_exp2f.exit1572 ] + %2371 = phi float [ 0.000000e+00, %55 ], [ %2237, %__nv_exp2f.exit1572 ] + %2372 = phi float [ 0.000000e+00, %55 ], [ %2238, %__nv_exp2f.exit1572 ] + %2373 = phi float [ 0.000000e+00, %55 ], [ %2239, %__nv_exp2f.exit1572 ] + %2374 = phi float [ 0.000000e+00, %55 ], [ %2240, %__nv_exp2f.exit1572 ] + %2375 = phi float [ 0.000000e+00, %55 ], [ %2241, %__nv_exp2f.exit1572 ] + %2376 = phi float [ 0.000000e+00, %55 ], [ %2242, %__nv_exp2f.exit1572 ] + %2377 = phi float [ 0.000000e+00, %55 ], [ %2243, %__nv_exp2f.exit1572 ] + %2378 = phi float [ 0.000000e+00, %55 ], [ %2244, %__nv_exp2f.exit1572 ] + %2379 = phi float [ 0.000000e+00, %55 ], [ %2245, %__nv_exp2f.exit1572 ] + %2380 = phi float [ 0.000000e+00, %55 ], [ %2246, %__nv_exp2f.exit1572 ] + %2381 = phi float [ 0.000000e+00, %55 ], [ %2247, %__nv_exp2f.exit1572 ] + %2382 = phi float [ 0.000000e+00, %55 ], [ %2248, %__nv_exp2f.exit1572 ] + %2383 = phi float [ 0.000000e+00, %55 ], [ %2249, %__nv_exp2f.exit1572 ] + %2384 = phi float [ 0.000000e+00, %55 ], [ %2250, %__nv_exp2f.exit1572 ] + %2385 = phi float [ 0.000000e+00, %55 ], [ %2251, %__nv_exp2f.exit1572 ] + %2386 = phi float [ 0.000000e+00, %55 ], [ %2252, %__nv_exp2f.exit1572 ] + %2387 = phi float [ 0.000000e+00, %55 ], [ %2253, %__nv_exp2f.exit1572 ] + %2388 = phi float [ 0.000000e+00, %55 ], [ %2254, %__nv_exp2f.exit1572 ] + %2389 = phi float [ 0.000000e+00, %55 ], [ %2255, %__nv_exp2f.exit1572 ] + %2390 = phi float [ 0.000000e+00, %55 ], [ %2256, %__nv_exp2f.exit1572 ] + %2391 = phi float [ 0.000000e+00, %55 ], [ %2257, %__nv_exp2f.exit1572 ] + %2392 = phi float [ 0.000000e+00, %55 ], [ %2258, %__nv_exp2f.exit1572 ] + %2393 = phi float [ 0.000000e+00, %55 ], [ %2259, %__nv_exp2f.exit1572 ] + %2394 = phi float [ 0.000000e+00, %55 ], [ %2260, %__nv_exp2f.exit1572 ] + %2395 = phi float [ 0.000000e+00, %55 ], [ %2261, %__nv_exp2f.exit1572 ] + %2396 = phi float [ 0.000000e+00, %55 ], [ %2262, %__nv_exp2f.exit1572 ] + %2397 = phi float [ 0.000000e+00, %55 ], [ %2263, %__nv_exp2f.exit1572 ] + %2398 = phi float [ 0.000000e+00, %55 ], [ %2264, %__nv_exp2f.exit1572 ] + %2399 = phi float [ 0.000000e+00, %55 ], [ %2265, %__nv_exp2f.exit1572 ] + %2400 = phi float [ 0.000000e+00, %55 ], [ %2266, %__nv_exp2f.exit1572 ] + %2401 = phi float [ 0.000000e+00, %55 ], [ %2267, %__nv_exp2f.exit1572 ] + %2402 = phi float [ 0.000000e+00, %55 ], [ %2268, %__nv_exp2f.exit1572 ] + %2403 = phi float [ 0.000000e+00, %55 ], [ %2269, %__nv_exp2f.exit1572 ] + %2404 = phi float [ 0.000000e+00, %55 ], [ %2270, %__nv_exp2f.exit1572 ] + %2405 = phi float [ 0.000000e+00, %55 ], [ %2271, %__nv_exp2f.exit1572 ] + %2406 = phi float [ 0.000000e+00, %55 ], [ %2272, %__nv_exp2f.exit1572 ] + %2407 = phi float [ 0.000000e+00, %55 ], [ %2273, %__nv_exp2f.exit1572 ] + %2408 = phi float [ 0.000000e+00, %55 ], [ %2274, %__nv_exp2f.exit1572 ] + %2409 = phi float [ 0.000000e+00, %55 ], [ %2275, %__nv_exp2f.exit1572 ] + %2410 = phi float [ 0.000000e+00, %55 ], [ %2276, %__nv_exp2f.exit1572 ] + %2411 = phi float [ 0.000000e+00, %55 ], [ %2277, %__nv_exp2f.exit1572 ] + %2412 = phi float [ 0.000000e+00, %55 ], [ %2278, %__nv_exp2f.exit1572 ] + %2413 = phi float [ 0.000000e+00, %55 ], [ %2279, %__nv_exp2f.exit1572 ] + %2414 = phi float [ 0.000000e+00, %55 ], [ %2280, %__nv_exp2f.exit1572 ] + %2415 = phi float [ 0.000000e+00, %55 ], [ %2281, %__nv_exp2f.exit1572 ] + %2416 = phi float [ 0.000000e+00, %55 ], [ %2282, %__nv_exp2f.exit1572 ] + %2417 = phi float [ 0.000000e+00, %55 ], [ %2283, %__nv_exp2f.exit1572 ] + %2418 = phi float [ 0.000000e+00, %55 ], [ %2284, %__nv_exp2f.exit1572 ] + %2419 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2355, float %2356, float %2357, float %2358, float %2359, float %2360, float %2361, float %2362, float %2363, float %2364, float %2365, float %2366, float %2367, float %2368, float %2369, float %2370, float %2371, float %2372, float %2373, float %2374, float %2375, float %2376, float %2377, float %2378, float %2379, float %2380, float %2381, float %2382, float %2383, float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, float %2393, float %2394, float %2395, float %2396, float %2397, float %2398, float %2399, float %2400, float %2401, float %2402, float %2403, float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418) #3, !dbg !93 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !93 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !93 + %2420 = getelementptr i32, ptr addrspace(1) %13, i64 %335, !dbg !140 + %2421 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2420) #3, !dbg !141 + %2422 = shl i32 %2421, 7, !dbg !142 + %2423 = getelementptr i32, ptr addrspace(1) %12, i64 %337, !dbg !143 + %2424 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2423) #3, !dbg !144 + %2425 = or disjoint i32 %2422, %46, !dbg !145 + %2426 = or disjoint i32 %2422, %47, !dbg !145 + %2427 = or disjoint i32 %2422, %48, !dbg !145 + %2428 = or disjoint i32 %2422, %49, !dbg !145 + %2429 = shl i32 %2425, 7, !dbg !146 + %2430 = shl i32 %2426, 7, !dbg !146 + %2431 = shl i32 %2427, 7, !dbg !146 + %2432 = shl i32 %2428, 7, !dbg !146 + %2433 = sext i32 %2429 to i64, !dbg !148 + %2434 = getelementptr bfloat, ptr addrspace(1) %40, i64 %2433, !dbg !148 + %2435 = sext i32 %2430 to i64, !dbg !148 + %2436 = getelementptr bfloat, ptr addrspace(1) %40, i64 %2435, !dbg !148 + %2437 = sext i32 %2431 to i64, !dbg !148 + %2438 = getelementptr bfloat, ptr addrspace(1) %40, i64 %2437, !dbg !148 + %2439 = sext i32 %2432 to i64, !dbg !148 + %2440 = getelementptr bfloat, ptr addrspace(1) %40, i64 %2439, !dbg !148 + %2441 = getelementptr bfloat, ptr addrspace(1) %2434, i64 %118, !dbg !149 + %2442 = getelementptr bfloat, ptr addrspace(1) %2436, i64 %118, !dbg !149 + %2443 = getelementptr bfloat, ptr addrspace(1) %2438, i64 %118, !dbg !149 + %2444 = getelementptr bfloat, ptr addrspace(1) %2440, i64 %118, !dbg !149 + %2445 = getelementptr bfloat, ptr addrspace(1) %41, i64 %2433, !dbg !150 + %2446 = getelementptr bfloat, ptr addrspace(1) %41, i64 %2435, !dbg !150 + %2447 = getelementptr bfloat, ptr addrspace(1) %41, i64 %2437, !dbg !150 + %2448 = getelementptr bfloat, ptr addrspace(1) %41, i64 %2439, !dbg !150 + %2449 = getelementptr bfloat, ptr addrspace(1) %2445, i64 %118, !dbg !151 + %2450 = getelementptr bfloat, ptr addrspace(1) %2446, i64 %118, !dbg !151 + %2451 = getelementptr bfloat, ptr addrspace(1) %2447, i64 %118, !dbg !151 + %2452 = getelementptr bfloat, ptr addrspace(1) %2448, i64 %118, !dbg !151 + %2453 = shl i32 %2424, 1, !dbg !152 + %2454 = tail call i32 @llvm.smin.i32(i32 %2453, i32 %353), !dbg !153 + %2455 = icmp sgt i32 %2453, 0, !dbg !154 + %2456 = icmp slt i32 %2425, %18, !dbg !155 + %2457 = icmp slt i32 %2426, %18, !dbg !155 + %2458 = icmp slt i32 %2427, %18, !dbg !155 + %2459 = icmp slt i32 %2428, %18, !dbg !155 + %2460 = and i1 %2455, %2456, !dbg !154 + %2461 = and i1 %2455, %2457, !dbg !154 + %2462 = and i1 %2455, %2458, !dbg !154 + %2463 = and i1 %2455, %2459, !dbg !154 + %2464 = select i1 %2460, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %423, ptr addrspace(1) %2441, i32 %2464) #3, !dbg !156 + %2465 = select i1 %2461, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %426, ptr addrspace(1) %2442, i32 %2465) #3, !dbg !156 + %2466 = select i1 %2462, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %429, ptr addrspace(1) %2443, i32 %2466) #3, !dbg !156 + %2467 = select i1 %2463, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %432, ptr addrspace(1) %2444, i32 %2467) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %434, ptr addrspace(1) %2449, i32 %2464) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %435, ptr addrspace(1) %2450, i32 %2465) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %436, ptr addrspace(1) %2451, i32 %2466) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %437, ptr addrspace(1) %2452, i32 %2467) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + %2468 = icmp sgt i32 %2454, 1, !dbg !154 + %2469 = getelementptr i8, ptr addrspace(1) %2441, i64 16384, !dbg !157 + %2470 = getelementptr i8, ptr addrspace(1) %2442, i64 16384, !dbg !157 + %2471 = getelementptr i8, ptr addrspace(1) %2443, i64 16384, !dbg !157 + %2472 = getelementptr i8, ptr addrspace(1) %2444, i64 16384, !dbg !157 + %2473 = getelementptr i8, ptr addrspace(1) %2449, i64 16384, !dbg !158 + %2474 = getelementptr i8, ptr addrspace(1) %2450, i64 16384, !dbg !158 + %2475 = getelementptr i8, ptr addrspace(1) %2451, i64 16384, !dbg !158 + %2476 = getelementptr i8, ptr addrspace(1) %2452, i64 16384, !dbg !158 + %2477 = or disjoint i32 %2425, 64, !dbg !159 + %2478 = or disjoint i32 %2426, 64, !dbg !159 + %2479 = or disjoint i32 %2427, 64, !dbg !159 + %2480 = or disjoint i32 %2428, 64, !dbg !159 + %2481 = icmp slt i32 %2477, %18, !dbg !155 + %2482 = icmp slt i32 %2478, %18, !dbg !155 + %2483 = icmp slt i32 %2479, %18, !dbg !155 + %2484 = icmp slt i32 %2480, %18, !dbg !155 + %2485 = and i1 %2468, %2481, !dbg !154 + %2486 = and i1 %2468, %2482, !dbg !154 + %2487 = and i1 %2468, %2483, !dbg !154 + %2488 = and i1 %2468, %2484, !dbg !154 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !156 + %2489 = select i1 %2485, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %459, ptr addrspace(1) %2469, i32 %2489) #3, !dbg !156 + %2490 = select i1 %2486, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %461, ptr addrspace(1) %2470, i32 %2490) #3, !dbg !156 + %2491 = select i1 %2487, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %463, ptr addrspace(1) %2471, i32 %2491) #3, !dbg !156 + %2492 = select i1 %2488, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %465, ptr addrspace(1) %2472, i32 %2492) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %467, ptr addrspace(1) %2473, i32 %2489) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %468, ptr addrspace(1) %2474, i32 %2490) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %469, ptr addrspace(1) %2475, i32 %2491) #3, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %470, ptr addrspace(1) %2476, i32 %2492) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !160 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 0, !dbg !154 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 1, !dbg !154 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 2, !dbg !154 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 3, !dbg !154 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 4, !dbg !154 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 5, !dbg !154 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 6, !dbg !154 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 7, !dbg !154 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 8, !dbg !154 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 9, !dbg !154 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 10, !dbg !154 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 11, !dbg !154 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 12, !dbg !154 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 13, !dbg !154 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 14, !dbg !154 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 15, !dbg !154 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 16, !dbg !154 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 17, !dbg !154 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 18, !dbg !154 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 19, !dbg !154 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 20, !dbg !154 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 21, !dbg !154 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 22, !dbg !154 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 23, !dbg !154 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 24, !dbg !154 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 25, !dbg !154 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 26, !dbg !154 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 27, !dbg !154 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 28, !dbg !154 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 29, !dbg !154 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 30, !dbg !154 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 31, !dbg !154 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 32, !dbg !154 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 33, !dbg !154 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 34, !dbg !154 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 35, !dbg !154 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 36, !dbg !154 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 37, !dbg !154 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 38, !dbg !154 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 39, !dbg !154 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 40, !dbg !154 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 41, !dbg !154 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 42, !dbg !154 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 43, !dbg !154 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 44, !dbg !154 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 45, !dbg !154 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 46, !dbg !154 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 47, !dbg !154 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 48, !dbg !154 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 49, !dbg !154 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 50, !dbg !154 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 51, !dbg !154 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 52, !dbg !154 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 53, !dbg !154 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 54, !dbg !154 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 55, !dbg !154 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 56, !dbg !154 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 57, !dbg !154 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 58, !dbg !154 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 59, !dbg !154 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 60, !dbg !154 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 61, !dbg !154 + %2555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 62, !dbg !154 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 63, !dbg !154 + br i1 %2455, label %.lr.ph1637, label %._crit_edge1638, !dbg !154 + +.lr.ph1637: ; preds = %._crit_edge + %2557 = insertelement <16 x i32> poison, i32 %2422, i64 0, !dbg !145 + %2558 = shufflevector <16 x i32> %2557, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !145 + %2559 = shufflevector <2 x i32> %344, <2 x i32> poison, <16 x i32> , !dbg !145 + %2560 = insertelement <16 x i32> %2559, i32 %341, i64 14, !dbg !145 + %2561 = insertelement <16 x i32> %2560, i32 %340, i64 15, !dbg !145 + %2562 = shufflevector <8 x i32> %350, <8 x i32> poison, <16 x i32> , !dbg !145 + %2563 = shufflevector <16 x i32> %2562, <16 x i32> %2561, <16 x i32> , !dbg !145 + %2564 = shufflevector <4 x i32> %347, <4 x i32> poison, <16 x i32> , !dbg !145 + %2565 = shufflevector <16 x i32> %2563, <16 x i32> %2564, <16 x i32> , !dbg !145 + %2566 = or disjoint <16 x i32> %2558, %2565, !dbg !145 + %2567 = add nsw i32 %2454, -2 + %2568 = add nsw i32 %2454, -1 + %smax2209 = tail call i32 @llvm.smax.i32(i32 %2454, i32 1), !dbg !154 + %2569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 0, !dbg !154 + %2570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 1, !dbg !154 + %2571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 2, !dbg !154 + %2572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 3, !dbg !154 + %2573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 4, !dbg !154 + %2574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 5, !dbg !154 + %2575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 6, !dbg !154 + %2576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 7, !dbg !154 + %2577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 8, !dbg !154 + %2578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 9, !dbg !154 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 10, !dbg !154 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 11, !dbg !154 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 12, !dbg !154 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 13, !dbg !154 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 14, !dbg !154 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 15, !dbg !154 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 16, !dbg !154 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 17, !dbg !154 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 18, !dbg !154 + %2588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 19, !dbg !154 + %2589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 20, !dbg !154 + %2590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 21, !dbg !154 + %2591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 22, !dbg !154 + %2592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 23, !dbg !154 + %2593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 24, !dbg !154 + %2594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 25, !dbg !154 + %2595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 26, !dbg !154 + %2596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 27, !dbg !154 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 28, !dbg !154 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 29, !dbg !154 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 30, !dbg !154 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 31, !dbg !154 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 32, !dbg !154 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 33, !dbg !154 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 34, !dbg !154 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 35, !dbg !154 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 36, !dbg !154 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 37, !dbg !154 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 38, !dbg !154 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 39, !dbg !154 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 40, !dbg !154 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 41, !dbg !154 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 42, !dbg !154 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 43, !dbg !154 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 44, !dbg !154 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 45, !dbg !154 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 46, !dbg !154 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 47, !dbg !154 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 48, !dbg !154 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 49, !dbg !154 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 50, !dbg !154 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 51, !dbg !154 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 52, !dbg !154 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 53, !dbg !154 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 54, !dbg !154 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 55, !dbg !154 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 56, !dbg !154 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 57, !dbg !154 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 58, !dbg !154 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 59, !dbg !154 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 60, !dbg !154 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 61, !dbg !154 + %2631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 62, !dbg !154 + %2632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2419, 63, !dbg !154 + br label %2633, !dbg !154 + +2633: ; preds = %.lr.ph1637, %__nv_exp2f.exit1476 + %2634 = phi i32 [ 64, %.lr.ph1637 ], [ %4204, %__nv_exp2f.exit1476 ] + %2635 = phi i32 [ -1, %.lr.ph1637 ], [ %2643, %__nv_exp2f.exit1476 ] + %2636 = phi i32 [ 1, %.lr.ph1637 ], [ %4221, %__nv_exp2f.exit1476 ] + %.pn11111619 = phi ptr addrspace(1) [ %2476, %.lr.ph1637 ], [ %4214, %__nv_exp2f.exit1476 ] + %.pn11271618 = phi ptr addrspace(1) [ %2475, %.lr.ph1637 ], [ %4213, %__nv_exp2f.exit1476 ] + %.pn11431617 = phi ptr addrspace(1) [ %2474, %.lr.ph1637 ], [ %4212, %__nv_exp2f.exit1476 ] + %.pn11591616 = phi ptr addrspace(1) [ %2473, %.lr.ph1637 ], [ %4211, %__nv_exp2f.exit1476 ] + %.pn10891615 = phi i32 [ %2480, %.lr.ph1637 ], [ %4218, %__nv_exp2f.exit1476 ] + %.pn10911614 = phi i32 [ %2479, %.lr.ph1637 ], [ %4217, %__nv_exp2f.exit1476 ] + %.pn10931613 = phi i32 [ %2478, %.lr.ph1637 ], [ %4216, %__nv_exp2f.exit1476 ] + %.pn10951612 = phi i32 [ %2477, %.lr.ph1637 ], [ %4215, %__nv_exp2f.exit1476 ] + %.pn10391611 = phi ptr addrspace(1) [ %2472, %.lr.ph1637 ], [ %4210, %__nv_exp2f.exit1476 ] + %.pn10551610 = phi ptr addrspace(1) [ %2471, %.lr.ph1637 ], [ %4209, %__nv_exp2f.exit1476 ] + %.pn10711609 = phi ptr addrspace(1) [ %2470, %.lr.ph1637 ], [ %4208, %__nv_exp2f.exit1476 ] + %.pn10871608 = phi ptr addrspace(1) [ %2469, %.lr.ph1637 ], [ %4207, %__nv_exp2f.exit1476 ] + %.pn = phi float [ %2569, %.lr.ph1637 ], [ %4118, %__nv_exp2f.exit1476 ] + %.pn2477 = phi float [ %2570, %.lr.ph1637 ], [ %4119, %__nv_exp2f.exit1476 ] + %.pn2478 = phi float [ %2571, %.lr.ph1637 ], [ %4120, %__nv_exp2f.exit1476 ] + %.pn2479 = phi float [ %2572, %.lr.ph1637 ], [ %4121, %__nv_exp2f.exit1476 ] + %.pn2480 = phi float [ %2573, %.lr.ph1637 ], [ %4122, %__nv_exp2f.exit1476 ] + %.pn2481 = phi float [ %2574, %.lr.ph1637 ], [ %4123, %__nv_exp2f.exit1476 ] + %.pn2482 = phi float [ %2575, %.lr.ph1637 ], [ %4124, %__nv_exp2f.exit1476 ] + %.pn2483 = phi float [ %2576, %.lr.ph1637 ], [ %4125, %__nv_exp2f.exit1476 ] + %.pn2484 = phi float [ %2577, %.lr.ph1637 ], [ %4126, %__nv_exp2f.exit1476 ] + %.pn2485 = phi float [ %2578, %.lr.ph1637 ], [ %4127, %__nv_exp2f.exit1476 ] + %.pn2486 = phi float [ %2579, %.lr.ph1637 ], [ %4128, %__nv_exp2f.exit1476 ] + %.pn2487 = phi float [ %2580, %.lr.ph1637 ], [ %4129, %__nv_exp2f.exit1476 ] + %.pn2488 = phi float [ %2581, %.lr.ph1637 ], [ %4130, %__nv_exp2f.exit1476 ] + %.pn2489 = phi float [ %2582, %.lr.ph1637 ], [ %4131, %__nv_exp2f.exit1476 ] + %.pn2490 = phi float [ %2583, %.lr.ph1637 ], [ %4132, %__nv_exp2f.exit1476 ] + %.pn2491 = phi float [ %2584, %.lr.ph1637 ], [ %4133, %__nv_exp2f.exit1476 ] + %.pn2492 = phi float [ %2585, %.lr.ph1637 ], [ %4134, %__nv_exp2f.exit1476 ] + %.pn2493 = phi float [ %2586, %.lr.ph1637 ], [ %4135, %__nv_exp2f.exit1476 ] + %.pn2494 = phi float [ %2587, %.lr.ph1637 ], [ %4136, %__nv_exp2f.exit1476 ] + %.pn2495 = phi float [ %2588, %.lr.ph1637 ], [ %4137, %__nv_exp2f.exit1476 ] + %.pn2496 = phi float [ %2589, %.lr.ph1637 ], [ %4138, %__nv_exp2f.exit1476 ] + %.pn2497 = phi float [ %2590, %.lr.ph1637 ], [ %4139, %__nv_exp2f.exit1476 ] + %.pn2498 = phi float [ %2591, %.lr.ph1637 ], [ %4140, %__nv_exp2f.exit1476 ] + %.pn2499 = phi float [ %2592, %.lr.ph1637 ], [ %4141, %__nv_exp2f.exit1476 ] + %.pn2500 = phi float [ %2593, %.lr.ph1637 ], [ %4142, %__nv_exp2f.exit1476 ] + %.pn2501 = phi float [ %2594, %.lr.ph1637 ], [ %4143, %__nv_exp2f.exit1476 ] + %.pn2502 = phi float [ %2595, %.lr.ph1637 ], [ %4144, %__nv_exp2f.exit1476 ] + %.pn2503 = phi float [ %2596, %.lr.ph1637 ], [ %4145, %__nv_exp2f.exit1476 ] + %.pn2504 = phi float [ %2597, %.lr.ph1637 ], [ %4146, %__nv_exp2f.exit1476 ] + %.pn2505 = phi float [ %2598, %.lr.ph1637 ], [ %4147, %__nv_exp2f.exit1476 ] + %.pn2506 = phi float [ %2599, %.lr.ph1637 ], [ %4148, %__nv_exp2f.exit1476 ] + %.pn2507 = phi float [ %2600, %.lr.ph1637 ], [ %4149, %__nv_exp2f.exit1476 ] + %.pn2508 = phi float [ %2601, %.lr.ph1637 ], [ %4150, %__nv_exp2f.exit1476 ] + %.pn2509 = phi float [ %2602, %.lr.ph1637 ], [ %4151, %__nv_exp2f.exit1476 ] + %.pn2510 = phi float [ %2603, %.lr.ph1637 ], [ %4152, %__nv_exp2f.exit1476 ] + %.pn2511 = phi float [ %2604, %.lr.ph1637 ], [ %4153, %__nv_exp2f.exit1476 ] + %.pn2512 = phi float [ %2605, %.lr.ph1637 ], [ %4154, %__nv_exp2f.exit1476 ] + %.pn2513 = phi float [ %2606, %.lr.ph1637 ], [ %4155, %__nv_exp2f.exit1476 ] + %.pn2514 = phi float [ %2607, %.lr.ph1637 ], [ %4156, %__nv_exp2f.exit1476 ] + %.pn2515 = phi float [ %2608, %.lr.ph1637 ], [ %4157, %__nv_exp2f.exit1476 ] + %.pn2516 = phi float [ %2609, %.lr.ph1637 ], [ %4158, %__nv_exp2f.exit1476 ] + %.pn2517 = phi float [ %2610, %.lr.ph1637 ], [ %4159, %__nv_exp2f.exit1476 ] + %.pn2518 = phi float [ %2611, %.lr.ph1637 ], [ %4160, %__nv_exp2f.exit1476 ] + %.pn2519 = phi float [ %2612, %.lr.ph1637 ], [ %4161, %__nv_exp2f.exit1476 ] + %.pn2520 = phi float [ %2613, %.lr.ph1637 ], [ %4162, %__nv_exp2f.exit1476 ] + %.pn2521 = phi float [ %2614, %.lr.ph1637 ], [ %4163, %__nv_exp2f.exit1476 ] + %.pn2522 = phi float [ %2615, %.lr.ph1637 ], [ %4164, %__nv_exp2f.exit1476 ] + %.pn2523 = phi float [ %2616, %.lr.ph1637 ], [ %4165, %__nv_exp2f.exit1476 ] + %.pn2524 = phi float [ %2617, %.lr.ph1637 ], [ %4166, %__nv_exp2f.exit1476 ] + %.pn2525 = phi float [ %2618, %.lr.ph1637 ], [ %4167, %__nv_exp2f.exit1476 ] + %.pn2526 = phi float [ %2619, %.lr.ph1637 ], [ %4168, %__nv_exp2f.exit1476 ] + %.pn2527 = phi float [ %2620, %.lr.ph1637 ], [ %4169, %__nv_exp2f.exit1476 ] + %.pn2528 = phi float [ %2621, %.lr.ph1637 ], [ %4170, %__nv_exp2f.exit1476 ] + %.pn2529 = phi float [ %2622, %.lr.ph1637 ], [ %4171, %__nv_exp2f.exit1476 ] + %.pn2530 = phi float [ %2623, %.lr.ph1637 ], [ %4172, %__nv_exp2f.exit1476 ] + %.pn2531 = phi float [ %2624, %.lr.ph1637 ], [ %4173, %__nv_exp2f.exit1476 ] + %.pn2532 = phi float [ %2625, %.lr.ph1637 ], [ %4174, %__nv_exp2f.exit1476 ] + %.pn2533 = phi float [ %2626, %.lr.ph1637 ], [ %4175, %__nv_exp2f.exit1476 ] + %.pn2534 = phi float [ %2627, %.lr.ph1637 ], [ %4176, %__nv_exp2f.exit1476 ] + %.pn2535 = phi float [ %2628, %.lr.ph1637 ], [ %4177, %__nv_exp2f.exit1476 ] + %.pn2536 = phi float [ %2629, %.lr.ph1637 ], [ %4178, %__nv_exp2f.exit1476 ] + %.pn2537 = phi float [ %2630, %.lr.ph1637 ], [ %4179, %__nv_exp2f.exit1476 ] + %.pn2538 = phi float [ %2631, %.lr.ph1637 ], [ %4180, %__nv_exp2f.exit1476 ] + %.pn2539 = phi float [ %2632, %.lr.ph1637 ], [ %4181, %__nv_exp2f.exit1476 ] + %2637 = phi i32 [ 0, %.lr.ph1637 ], [ %4185, %__nv_exp2f.exit1476 ] + %2638 = phi <16 x i32> [ %2566, %.lr.ph1637 ], [ %4184, %__nv_exp2f.exit1476 ] + %2639 = icmp slt i32 %2637, %2567, !dbg !154 + %2640 = icmp slt i32 %2637, %2568, !dbg !154 + %2641 = add i32 %2635, 1, !dbg !154 + %2642 = icmp sgt i32 %2641, 2, !dbg !154 + %2643 = select i1 %2642, i32 0, i32 %2641, !dbg !154 + %2644 = extractelement <16 x i32> %2638, i64 15, !dbg !155 + %2645 = icmp slt i32 %2644, %18, !dbg !155 + %2646 = extractelement <16 x i32> %2638, i64 14, !dbg !155 + %2647 = icmp slt i32 %2646, %18, !dbg !155 + %2648 = extractelement <16 x i32> %2638, i64 13, !dbg !155 + %2649 = icmp slt i32 %2648, %18, !dbg !155 + %2650 = extractelement <16 x i32> %2638, i64 12, !dbg !155 + %2651 = icmp slt i32 %2650, %18, !dbg !155 + %2652 = extractelement <16 x i32> %2638, i64 11, !dbg !155 + %2653 = icmp slt i32 %2652, %18, !dbg !155 + %2654 = extractelement <16 x i32> %2638, i64 10, !dbg !155 + %2655 = icmp slt i32 %2654, %18, !dbg !155 + %2656 = extractelement <16 x i32> %2638, i64 9, !dbg !155 + %2657 = icmp slt i32 %2656, %18, !dbg !155 + %2658 = extractelement <16 x i32> %2638, i64 8, !dbg !155 + %2659 = icmp slt i32 %2658, %18, !dbg !155 + %2660 = extractelement <16 x i32> %2638, i64 7, !dbg !155 + %2661 = icmp slt i32 %2660, %18, !dbg !155 + %2662 = extractelement <16 x i32> %2638, i64 6, !dbg !155 + %2663 = icmp slt i32 %2662, %18, !dbg !155 + %2664 = extractelement <16 x i32> %2638, i64 5, !dbg !155 + %2665 = icmp slt i32 %2664, %18, !dbg !155 + %2666 = extractelement <16 x i32> %2638, i64 4, !dbg !155 + %2667 = icmp slt i32 %2666, %18, !dbg !155 + %2668 = extractelement <16 x i32> %2638, i64 3, !dbg !155 + %2669 = icmp slt i32 %2668, %18, !dbg !155 + %2670 = extractelement <16 x i32> %2638, i64 2, !dbg !155 + %2671 = icmp slt i32 %2670, %18, !dbg !155 + %2672 = extractelement <16 x i32> %2638, i64 1, !dbg !155 + %2673 = icmp slt i32 %2672, %18, !dbg !155 + %2674 = extractelement <16 x i32> %2638, i64 0, !dbg !155 + %2675 = icmp slt i32 %2674, %18, !dbg !155 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !156 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !156 + %2676 = shl i32 %2643, 13, !dbg !156 + %2677 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2676, !dbg !156 + %2678 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %44, i32 0, i32 31), !dbg !160 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !160 + %2679 = shl i32 %2678, 11, !dbg !160 + %2680 = and i32 %2679, 8192, !dbg !160 + %2681 = add i32 %2680, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2682 = lshr exact i32 %2681, 4, !dbg !160 + %2683 = and i32 %2682, 16383, !dbg !160 + %2684 = zext nneg i32 %2683 to i64, !dbg !160 + %2685 = or disjoint i64 %2684, 4611686293372403712, !dbg !160 + %2686 = ptrtoint ptr addrspace(3) %2677 to i32, !dbg !160 + %2687 = lshr exact i32 %2686, 4, !dbg !160 + %2688 = and i32 %2687, 16383, !dbg !160 + %2689 = zext nneg i32 %2688 to i64, !dbg !160 + %2690 = or disjoint i64 %2689, 4611686293338849280, !dbg !160 + %2691 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2685, i64 %2690) #3, !dbg !160 + %2692 = or disjoint i32 %2680, 32, !dbg !160 + %2693 = add i32 %2692, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2694 = lshr exact i32 %2693, 4, !dbg !160 + %2695 = and i32 %2694, 16383, !dbg !160 + %2696 = zext nneg i32 %2695 to i64, !dbg !160 + %2697 = or disjoint i64 %2696, 4611686293372403712, !dbg !160 + %2698 = add i32 %2686, 32, !dbg !160 + %2699 = lshr exact i32 %2698, 4, !dbg !160 + %2700 = and i32 %2699, 16383, !dbg !160 + %2701 = zext nneg i32 %2700 to i64, !dbg !160 + %2702 = or disjoint i64 %2701, 4611686293338849280, !dbg !160 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 0, !dbg !160 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 1, !dbg !160 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 2, !dbg !160 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 3, !dbg !160 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 4, !dbg !160 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 5, !dbg !160 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 6, !dbg !160 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 7, !dbg !160 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 8, !dbg !160 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 9, !dbg !160 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 10, !dbg !160 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 11, !dbg !160 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 12, !dbg !160 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 13, !dbg !160 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 14, !dbg !160 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 15, !dbg !160 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 16, !dbg !160 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 17, !dbg !160 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 18, !dbg !160 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 19, !dbg !160 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 20, !dbg !160 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 21, !dbg !160 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 22, !dbg !160 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 23, !dbg !160 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 24, !dbg !160 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 25, !dbg !160 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 26, !dbg !160 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 27, !dbg !160 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 28, !dbg !160 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 29, !dbg !160 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 30, !dbg !160 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2691, 31, !dbg !160 + %2735 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2703, float %2704, float %2705, float %2706, float %2707, float %2708, float %2709, float %2710, float %2711, float %2712, float %2713, float %2714, float %2715, float %2716, float %2717, float %2718, float %2719, float %2720, float %2721, float %2722, float %2723, float %2724, float %2725, float %2726, float %2727, float %2728, float %2729, float %2730, float %2731, float %2732, float %2733, float %2734, i64 %2697, i64 %2702, i1 true) #3, !dbg !160 + %2736 = or disjoint i32 %2680, 64, !dbg !160 + %2737 = add i32 %2736, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2738 = lshr exact i32 %2737, 4, !dbg !160 + %2739 = and i32 %2738, 16383, !dbg !160 + %2740 = zext nneg i32 %2739 to i64, !dbg !160 + %2741 = or disjoint i64 %2740, 4611686293372403712, !dbg !160 + %2742 = add i32 %2686, 64, !dbg !160 + %2743 = lshr exact i32 %2742, 4, !dbg !160 + %2744 = and i32 %2743, 16383, !dbg !160 + %2745 = zext nneg i32 %2744 to i64, !dbg !160 + %2746 = or disjoint i64 %2745, 4611686293338849280, !dbg !160 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 0, !dbg !160 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 1, !dbg !160 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 2, !dbg !160 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 3, !dbg !160 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 4, !dbg !160 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 5, !dbg !160 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 6, !dbg !160 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 7, !dbg !160 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 8, !dbg !160 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 9, !dbg !160 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 10, !dbg !160 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 11, !dbg !160 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 12, !dbg !160 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 13, !dbg !160 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 14, !dbg !160 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 15, !dbg !160 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 16, !dbg !160 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 17, !dbg !160 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 18, !dbg !160 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 19, !dbg !160 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 20, !dbg !160 + %2768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 21, !dbg !160 + %2769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 22, !dbg !160 + %2770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 23, !dbg !160 + %2771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 24, !dbg !160 + %2772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 25, !dbg !160 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 26, !dbg !160 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 27, !dbg !160 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 28, !dbg !160 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 29, !dbg !160 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 30, !dbg !160 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2735, 31, !dbg !160 + %2779 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2747, float %2748, float %2749, float %2750, float %2751, float %2752, float %2753, float %2754, float %2755, float %2756, float %2757, float %2758, float %2759, float %2760, float %2761, float %2762, float %2763, float %2764, float %2765, float %2766, float %2767, float %2768, float %2769, float %2770, float %2771, float %2772, float %2773, float %2774, float %2775, float %2776, float %2777, float %2778, i64 %2741, i64 %2746, i1 true) #3, !dbg !160 + %2780 = or disjoint i32 %2680, 96, !dbg !160 + %2781 = add i32 %2780, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2782 = lshr exact i32 %2781, 4, !dbg !160 + %2783 = and i32 %2782, 16383, !dbg !160 + %2784 = zext nneg i32 %2783 to i64, !dbg !160 + %2785 = or disjoint i64 %2784, 4611686293372403712, !dbg !160 + %2786 = add i32 %2686, 96, !dbg !160 + %2787 = lshr exact i32 %2786, 4, !dbg !160 + %2788 = and i32 %2787, 16383, !dbg !160 + %2789 = zext nneg i32 %2788 to i64, !dbg !160 + %2790 = or disjoint i64 %2789, 4611686293338849280, !dbg !160 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 0, !dbg !160 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 1, !dbg !160 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 2, !dbg !160 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 3, !dbg !160 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 4, !dbg !160 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 5, !dbg !160 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 6, !dbg !160 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 7, !dbg !160 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 8, !dbg !160 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 9, !dbg !160 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 10, !dbg !160 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 11, !dbg !160 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 12, !dbg !160 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 13, !dbg !160 + %2805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 14, !dbg !160 + %2806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 15, !dbg !160 + %2807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 16, !dbg !160 + %2808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 17, !dbg !160 + %2809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 18, !dbg !160 + %2810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 19, !dbg !160 + %2811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 20, !dbg !160 + %2812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 21, !dbg !160 + %2813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 22, !dbg !160 + %2814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 23, !dbg !160 + %2815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 24, !dbg !160 + %2816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 25, !dbg !160 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 26, !dbg !160 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 27, !dbg !160 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 28, !dbg !160 + %2820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 29, !dbg !160 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 30, !dbg !160 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2779, 31, !dbg !160 + %2823 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2791, float %2792, float %2793, float %2794, float %2795, float %2796, float %2797, float %2798, float %2799, float %2800, float %2801, float %2802, float %2803, float %2804, float %2805, float %2806, float %2807, float %2808, float %2809, float %2810, float %2811, float %2812, float %2813, float %2814, float %2815, float %2816, float %2817, float %2818, float %2819, float %2820, float %2821, float %2822, i64 %2785, i64 %2790, i1 true) #3, !dbg !160 + %2824 = or disjoint i32 %2680, 16384, !dbg !160 + %2825 = add i32 %2824, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2826 = lshr exact i32 %2825, 4, !dbg !160 + %2827 = and i32 %2826, 16383, !dbg !160 + %2828 = zext nneg i32 %2827 to i64, !dbg !160 + %2829 = or disjoint i64 %2828, 4611686293372403712, !dbg !160 + %2830 = add i32 %2686, 8192, !dbg !160 + %2831 = lshr exact i32 %2830, 4, !dbg !160 + %2832 = and i32 %2831, 16383, !dbg !160 + %2833 = zext nneg i32 %2832 to i64, !dbg !160 + %2834 = or disjoint i64 %2833, 4611686293338849280, !dbg !160 + %2835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 0, !dbg !160 + %2836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 1, !dbg !160 + %2837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 2, !dbg !160 + %2838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 3, !dbg !160 + %2839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 4, !dbg !160 + %2840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 5, !dbg !160 + %2841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 6, !dbg !160 + %2842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 7, !dbg !160 + %2843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 8, !dbg !160 + %2844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 9, !dbg !160 + %2845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 10, !dbg !160 + %2846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 11, !dbg !160 + %2847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 12, !dbg !160 + %2848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 13, !dbg !160 + %2849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 14, !dbg !160 + %2850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 15, !dbg !160 + %2851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 16, !dbg !160 + %2852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 17, !dbg !160 + %2853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 18, !dbg !160 + %2854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 19, !dbg !160 + %2855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 20, !dbg !160 + %2856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 21, !dbg !160 + %2857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 22, !dbg !160 + %2858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 23, !dbg !160 + %2859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 24, !dbg !160 + %2860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 25, !dbg !160 + %2861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 26, !dbg !160 + %2862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 27, !dbg !160 + %2863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 28, !dbg !160 + %2864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 29, !dbg !160 + %2865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 30, !dbg !160 + %2866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2823, 31, !dbg !160 + %2867 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2835, float %2836, float %2837, float %2838, float %2839, float %2840, float %2841, float %2842, float %2843, float %2844, float %2845, float %2846, float %2847, float %2848, float %2849, float %2850, float %2851, float %2852, float %2853, float %2854, float %2855, float %2856, float %2857, float %2858, float %2859, float %2860, float %2861, float %2862, float %2863, float %2864, float %2865, float %2866, i64 %2829, i64 %2834, i1 true) #3, !dbg !160 + %2868 = or disjoint i32 %2680, 16416, !dbg !160 + %2869 = add i32 %2868, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2870 = lshr exact i32 %2869, 4, !dbg !160 + %2871 = and i32 %2870, 16383, !dbg !160 + %2872 = zext nneg i32 %2871 to i64, !dbg !160 + %2873 = or disjoint i64 %2872, 4611686293372403712, !dbg !160 + %2874 = add i32 %2686, 8224, !dbg !160 + %2875 = lshr exact i32 %2874, 4, !dbg !160 + %2876 = and i32 %2875, 16383, !dbg !160 + %2877 = zext nneg i32 %2876 to i64, !dbg !160 + %2878 = or disjoint i64 %2877, 4611686293338849280, !dbg !160 + %2879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 0, !dbg !160 + %2880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 1, !dbg !160 + %2881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 2, !dbg !160 + %2882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 3, !dbg !160 + %2883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 4, !dbg !160 + %2884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 5, !dbg !160 + %2885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 6, !dbg !160 + %2886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 7, !dbg !160 + %2887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 8, !dbg !160 + %2888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 9, !dbg !160 + %2889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 10, !dbg !160 + %2890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 11, !dbg !160 + %2891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 12, !dbg !160 + %2892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 13, !dbg !160 + %2893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 14, !dbg !160 + %2894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 15, !dbg !160 + %2895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 16, !dbg !160 + %2896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 17, !dbg !160 + %2897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 18, !dbg !160 + %2898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 19, !dbg !160 + %2899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 20, !dbg !160 + %2900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 21, !dbg !160 + %2901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 22, !dbg !160 + %2902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 23, !dbg !160 + %2903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 24, !dbg !160 + %2904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 25, !dbg !160 + %2905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 26, !dbg !160 + %2906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 27, !dbg !160 + %2907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 28, !dbg !160 + %2908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 29, !dbg !160 + %2909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 30, !dbg !160 + %2910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2867, 31, !dbg !160 + %2911 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2879, float %2880, float %2881, float %2882, float %2883, float %2884, float %2885, float %2886, float %2887, float %2888, float %2889, float %2890, float %2891, float %2892, float %2893, float %2894, float %2895, float %2896, float %2897, float %2898, float %2899, float %2900, float %2901, float %2902, float %2903, float %2904, float %2905, float %2906, float %2907, float %2908, float %2909, float %2910, i64 %2873, i64 %2878, i1 true) #3, !dbg !160 + %2912 = or disjoint i32 %2680, 16448, !dbg !160 + %2913 = add i32 %2912, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2914 = lshr exact i32 %2913, 4, !dbg !160 + %2915 = and i32 %2914, 16383, !dbg !160 + %2916 = zext nneg i32 %2915 to i64, !dbg !160 + %2917 = or disjoint i64 %2916, 4611686293372403712, !dbg !160 + %2918 = add i32 %2686, 8256, !dbg !160 + %2919 = lshr exact i32 %2918, 4, !dbg !160 + %2920 = and i32 %2919, 16383, !dbg !160 + %2921 = zext nneg i32 %2920 to i64, !dbg !160 + %2922 = or disjoint i64 %2921, 4611686293338849280, !dbg !160 + %2923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 0, !dbg !160 + %2924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 1, !dbg !160 + %2925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 2, !dbg !160 + %2926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 3, !dbg !160 + %2927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 4, !dbg !160 + %2928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 5, !dbg !160 + %2929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 6, !dbg !160 + %2930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 7, !dbg !160 + %2931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 8, !dbg !160 + %2932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 9, !dbg !160 + %2933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 10, !dbg !160 + %2934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 11, !dbg !160 + %2935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 12, !dbg !160 + %2936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 13, !dbg !160 + %2937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 14, !dbg !160 + %2938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 15, !dbg !160 + %2939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 16, !dbg !160 + %2940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 17, !dbg !160 + %2941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 18, !dbg !160 + %2942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 19, !dbg !160 + %2943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 20, !dbg !160 + %2944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 21, !dbg !160 + %2945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 22, !dbg !160 + %2946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 23, !dbg !160 + %2947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 24, !dbg !160 + %2948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 25, !dbg !160 + %2949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 26, !dbg !160 + %2950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 27, !dbg !160 + %2951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 28, !dbg !160 + %2952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 29, !dbg !160 + %2953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 30, !dbg !160 + %2954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2911, 31, !dbg !160 + %2955 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2923, float %2924, float %2925, float %2926, float %2927, float %2928, float %2929, float %2930, float %2931, float %2932, float %2933, float %2934, float %2935, float %2936, float %2937, float %2938, float %2939, float %2940, float %2941, float %2942, float %2943, float %2944, float %2945, float %2946, float %2947, float %2948, float %2949, float %2950, float %2951, float %2952, float %2953, float %2954, i64 %2917, i64 %2922, i1 true) #3, !dbg !160 + %2956 = or disjoint i32 %2680, 16480, !dbg !160 + %2957 = add i32 %2956, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !160 + %2958 = lshr exact i32 %2957, 4, !dbg !160 + %2959 = and i32 %2958, 16383, !dbg !160 + %2960 = zext nneg i32 %2959 to i64, !dbg !160 + %2961 = or disjoint i64 %2960, 4611686293372403712, !dbg !160 + %2962 = add i32 %2686, 8288, !dbg !160 + %2963 = lshr exact i32 %2962, 4, !dbg !160 + %2964 = and i32 %2963, 16383, !dbg !160 + %2965 = zext nneg i32 %2964 to i64, !dbg !160 + %2966 = or disjoint i64 %2965, 4611686293338849280, !dbg !160 + %2967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 0, !dbg !160 + %2968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 1, !dbg !160 + %2969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 2, !dbg !160 + %2970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 3, !dbg !160 + %2971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 4, !dbg !160 + %2972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 5, !dbg !160 + %2973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 6, !dbg !160 + %2974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 7, !dbg !160 + %2975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 8, !dbg !160 + %2976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 9, !dbg !160 + %2977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 10, !dbg !160 + %2978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 11, !dbg !160 + %2979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 12, !dbg !160 + %2980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 13, !dbg !160 + %2981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 14, !dbg !160 + %2982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 15, !dbg !160 + %2983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 16, !dbg !160 + %2984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 17, !dbg !160 + %2985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 18, !dbg !160 + %2986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 19, !dbg !160 + %2987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 20, !dbg !160 + %2988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 21, !dbg !160 + %2989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 22, !dbg !160 + %2990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 23, !dbg !160 + %2991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 24, !dbg !160 + %2992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 25, !dbg !160 + %2993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 26, !dbg !160 + %2994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 27, !dbg !160 + %2995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 28, !dbg !160 + %2996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 29, !dbg !160 + %2997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 30, !dbg !160 + %2998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2955, 31, !dbg !160 + %2999 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2967, float %2968, float %2969, float %2970, float %2971, float %2972, float %2973, float %2974, float %2975, float %2976, float %2977, float %2978, float %2979, float %2980, float %2981, float %2982, float %2983, float %2984, float %2985, float %2986, float %2987, float %2988, float %2989, float %2990, float %2991, float %2992, float %2993, float %2994, float %2995, float %2996, float %2997, float %2998, i64 %2961, i64 %2966, i1 true) #3, !dbg !160 + %3000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 0, !dbg !160 + %3001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 1, !dbg !160 + %3002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 2, !dbg !160 + %3003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 3, !dbg !160 + %3004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 4, !dbg !160 + %3005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 5, !dbg !160 + %3006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 6, !dbg !160 + %3007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 7, !dbg !160 + %3008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 8, !dbg !160 + %3009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 9, !dbg !160 + %3010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 10, !dbg !160 + %3011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 11, !dbg !160 + %3012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 12, !dbg !160 + %3013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 13, !dbg !160 + %3014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 14, !dbg !160 + %3015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 15, !dbg !160 + %3016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 16, !dbg !160 + %3017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 17, !dbg !160 + %3018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 18, !dbg !160 + %3019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 19, !dbg !160 + %3020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 20, !dbg !160 + %3021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 21, !dbg !160 + %3022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 22, !dbg !160 + %3023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 23, !dbg !160 + %3024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 24, !dbg !160 + %3025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 25, !dbg !160 + %3026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 26, !dbg !160 + %3027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 27, !dbg !160 + %3028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 28, !dbg !160 + %3029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 29, !dbg !160 + %3030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 30, !dbg !160 + %3031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2999, 31, !dbg !160 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !160 + %3032 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3000, float %3001, float %3002, float %3003, float %3004, float %3005, float %3006, float %3007, float %3008, float %3009, float %3010, float %3011, float %3012, float %3013, float %3014, float %3015, float %3016, float %3017, float %3018, float %3019, float %3020, float %3021, float %3022, float %3023, float %3024, float %3025, float %3026, float %3027, float %3028, float %3029, float %3030, float %3031, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2677, i32 0, i32 0) #3, !dbg !160 + %3033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 0, !dbg !160 + %3034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 1, !dbg !160 + %3035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 2, !dbg !160 + %3036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 3, !dbg !160 + %3037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 4, !dbg !160 + %3038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 5, !dbg !160 + %3039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 6, !dbg !160 + %3040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 7, !dbg !160 + %3041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 8, !dbg !160 + %3042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 9, !dbg !160 + %3043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 10, !dbg !160 + %3044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 11, !dbg !160 + %3045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 12, !dbg !160 + %3046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 13, !dbg !160 + %3047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 14, !dbg !160 + %3048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 15, !dbg !160 + %3049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 16, !dbg !160 + %3050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 17, !dbg !160 + %3051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 18, !dbg !160 + %3052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 19, !dbg !160 + %3053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 20, !dbg !160 + %3054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 21, !dbg !160 + %3055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 22, !dbg !160 + %3056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 23, !dbg !160 + %3057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 24, !dbg !160 + %3058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 25, !dbg !160 + %3059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 26, !dbg !160 + %3060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 27, !dbg !160 + %3061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 28, !dbg !160 + %3062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 29, !dbg !160 + %3063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 30, !dbg !160 + %3064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3032, 31, !dbg !160 + %3065 = fmul float %3033, 0x3FB6A09E60000000, !dbg !161 + %3066 = fmul float %3034, 0x3FB6A09E60000000, !dbg !161 + %3067 = fmul float %3035, 0x3FB6A09E60000000, !dbg !161 + %3068 = fmul float %3036, 0x3FB6A09E60000000, !dbg !161 + %3069 = fmul float %3037, 0x3FB6A09E60000000, !dbg !161 + %3070 = fmul float %3038, 0x3FB6A09E60000000, !dbg !161 + %3071 = fmul float %3039, 0x3FB6A09E60000000, !dbg !161 + %3072 = fmul float %3040, 0x3FB6A09E60000000, !dbg !161 + %3073 = fmul float %3041, 0x3FB6A09E60000000, !dbg !161 + %3074 = fmul float %3042, 0x3FB6A09E60000000, !dbg !161 + %3075 = fmul float %3043, 0x3FB6A09E60000000, !dbg !161 + %3076 = fmul float %3044, 0x3FB6A09E60000000, !dbg !161 + %3077 = fmul float %3045, 0x3FB6A09E60000000, !dbg !161 + %3078 = fmul float %3046, 0x3FB6A09E60000000, !dbg !161 + %3079 = fmul float %3047, 0x3FB6A09E60000000, !dbg !161 + %3080 = fmul float %3048, 0x3FB6A09E60000000, !dbg !161 + %3081 = fmul float %3049, 0x3FB6A09E60000000, !dbg !161 + %3082 = fmul float %3050, 0x3FB6A09E60000000, !dbg !161 + %3083 = fmul float %3051, 0x3FB6A09E60000000, !dbg !161 + %3084 = fmul float %3052, 0x3FB6A09E60000000, !dbg !161 + %3085 = fmul float %3053, 0x3FB6A09E60000000, !dbg !161 + %3086 = fmul float %3054, 0x3FB6A09E60000000, !dbg !161 + %3087 = fmul float %3055, 0x3FB6A09E60000000, !dbg !161 + %3088 = fmul float %3056, 0x3FB6A09E60000000, !dbg !161 + %3089 = fmul float %3057, 0x3FB6A09E60000000, !dbg !161 + %3090 = fmul float %3058, 0x3FB6A09E60000000, !dbg !161 + %3091 = fmul float %3059, 0x3FB6A09E60000000, !dbg !161 + %3092 = fmul float %3060, 0x3FB6A09E60000000, !dbg !161 + %3093 = fmul float %3061, 0x3FB6A09E60000000, !dbg !161 + %3094 = fmul float %3062, 0x3FB6A09E60000000, !dbg !161 + %3095 = fmul float %3063, 0x3FB6A09E60000000, !dbg !161 + %3096 = fmul float %3064, 0x3FB6A09E60000000, !dbg !161 + %3097 = fmul float %3065, 0x3FF7154760000000, !dbg !162 + %3098 = select i1 %2645, float %3097, float 0xFFF0000000000000, !dbg !163 + %3099 = fmul float %3066, 0x3FF7154760000000, !dbg !162 + %3100 = select i1 %2647, float %3099, float 0xFFF0000000000000, !dbg !163 + %3101 = fmul float %3067, 0x3FF7154760000000, !dbg !162 + %3102 = select i1 %2645, float %3101, float 0xFFF0000000000000, !dbg !163 + %3103 = fmul float %3068, 0x3FF7154760000000, !dbg !162 + %3104 = select i1 %2647, float %3103, float 0xFFF0000000000000, !dbg !163 + %3105 = fmul float %3069, 0x3FF7154760000000, !dbg !162 + %3106 = select i1 %2649, float %3105, float 0xFFF0000000000000, !dbg !163 + %3107 = fmul float %3070, 0x3FF7154760000000, !dbg !162 + %3108 = select i1 %2651, float %3107, float 0xFFF0000000000000, !dbg !163 + %3109 = fmul float %3071, 0x3FF7154760000000, !dbg !162 + %3110 = select i1 %2649, float %3109, float 0xFFF0000000000000, !dbg !163 + %3111 = fmul float %3072, 0x3FF7154760000000, !dbg !162 + %3112 = select i1 %2651, float %3111, float 0xFFF0000000000000, !dbg !163 + %3113 = fmul float %3073, 0x3FF7154760000000, !dbg !162 + %3114 = select i1 %2653, float %3113, float 0xFFF0000000000000, !dbg !163 + %3115 = fmul float %3074, 0x3FF7154760000000, !dbg !162 + %3116 = select i1 %2655, float %3115, float 0xFFF0000000000000, !dbg !163 + %3117 = fmul float %3075, 0x3FF7154760000000, !dbg !162 + %3118 = select i1 %2653, float %3117, float 0xFFF0000000000000, !dbg !163 + %3119 = fmul float %3076, 0x3FF7154760000000, !dbg !162 + %3120 = select i1 %2655, float %3119, float 0xFFF0000000000000, !dbg !163 + %3121 = fmul float %3077, 0x3FF7154760000000, !dbg !162 + %3122 = select i1 %2657, float %3121, float 0xFFF0000000000000, !dbg !163 + %3123 = fmul float %3078, 0x3FF7154760000000, !dbg !162 + %3124 = select i1 %2659, float %3123, float 0xFFF0000000000000, !dbg !163 + %3125 = fmul float %3079, 0x3FF7154760000000, !dbg !162 + %3126 = select i1 %2657, float %3125, float 0xFFF0000000000000, !dbg !163 + %3127 = fmul float %3080, 0x3FF7154760000000, !dbg !162 + %3128 = select i1 %2659, float %3127, float 0xFFF0000000000000, !dbg !163 + %3129 = fmul float %3081, 0x3FF7154760000000, !dbg !162 + %3130 = select i1 %2661, float %3129, float 0xFFF0000000000000, !dbg !163 + %3131 = fmul float %3082, 0x3FF7154760000000, !dbg !162 + %3132 = select i1 %2663, float %3131, float 0xFFF0000000000000, !dbg !163 + %3133 = fmul float %3083, 0x3FF7154760000000, !dbg !162 + %3134 = select i1 %2661, float %3133, float 0xFFF0000000000000, !dbg !163 + %3135 = fmul float %3084, 0x3FF7154760000000, !dbg !162 + %3136 = select i1 %2663, float %3135, float 0xFFF0000000000000, !dbg !163 + %3137 = fmul float %3085, 0x3FF7154760000000, !dbg !162 + %3138 = select i1 %2665, float %3137, float 0xFFF0000000000000, !dbg !163 + %3139 = fmul float %3086, 0x3FF7154760000000, !dbg !162 + %3140 = select i1 %2667, float %3139, float 0xFFF0000000000000, !dbg !163 + %3141 = fmul float %3087, 0x3FF7154760000000, !dbg !162 + %3142 = select i1 %2665, float %3141, float 0xFFF0000000000000, !dbg !163 + %3143 = fmul float %3088, 0x3FF7154760000000, !dbg !162 + %3144 = select i1 %2667, float %3143, float 0xFFF0000000000000, !dbg !163 + %3145 = fmul float %3089, 0x3FF7154760000000, !dbg !162 + %3146 = select i1 %2669, float %3145, float 0xFFF0000000000000, !dbg !163 + %3147 = fmul float %3090, 0x3FF7154760000000, !dbg !162 + %3148 = select i1 %2671, float %3147, float 0xFFF0000000000000, !dbg !163 + %3149 = fmul float %3091, 0x3FF7154760000000, !dbg !162 + %3150 = select i1 %2669, float %3149, float 0xFFF0000000000000, !dbg !163 + %3151 = fmul float %3092, 0x3FF7154760000000, !dbg !162 + %3152 = select i1 %2671, float %3151, float 0xFFF0000000000000, !dbg !163 + %3153 = fmul float %3093, 0x3FF7154760000000, !dbg !162 + %3154 = select i1 %2673, float %3153, float 0xFFF0000000000000, !dbg !163 + %3155 = fmul float %3094, 0x3FF7154760000000, !dbg !162 + %3156 = select i1 %2675, float %3155, float 0xFFF0000000000000, !dbg !163 + %3157 = fmul float %3095, 0x3FF7154760000000, !dbg !162 + %3158 = select i1 %2673, float %3157, float 0xFFF0000000000000, !dbg !163 + %3159 = fmul float %3096, 0x3FF7154760000000, !dbg !162 + %3160 = select i1 %2675, float %3159, float 0xFFF0000000000000, !dbg !163 + %3161 = fsub float %3098, %374, !dbg !164 + %3162 = fsub float %3100, %374, !dbg !164 + %3163 = fsub float %3102, %375, !dbg !164 + %3164 = fsub float %3104, %375, !dbg !164 + %3165 = fsub float %3106, %374, !dbg !164 + %3166 = fsub float %3108, %374, !dbg !164 + %3167 = fsub float %3110, %375, !dbg !164 + %3168 = fsub float %3112, %375, !dbg !164 + %3169 = fsub float %3114, %374, !dbg !164 + %3170 = fsub float %3116, %374, !dbg !164 + %3171 = fsub float %3118, %375, !dbg !164 + %3172 = fsub float %3120, %375, !dbg !164 + %3173 = fsub float %3122, %374, !dbg !164 + %3174 = fsub float %3124, %374, !dbg !164 + %3175 = fsub float %3126, %375, !dbg !164 + %3176 = fsub float %3128, %375, !dbg !164 + %3177 = fsub float %3130, %374, !dbg !164 + %3178 = fsub float %3132, %374, !dbg !164 + %3179 = fsub float %3134, %375, !dbg !164 + %3180 = fsub float %3136, %375, !dbg !164 + %3181 = fsub float %3138, %374, !dbg !164 + %3182 = fsub float %3140, %374, !dbg !164 + %3183 = fsub float %3142, %375, !dbg !164 + %3184 = fsub float %3144, %375, !dbg !164 + %3185 = fsub float %3146, %374, !dbg !164 + %3186 = fsub float %3148, %374, !dbg !164 + %3187 = fsub float %3150, %375, !dbg !164 + %3188 = fsub float %3152, %375, !dbg !164 + %3189 = fsub float %3154, %374, !dbg !164 + %3190 = fsub float %3156, %374, !dbg !164 + %3191 = fsub float %3158, %375, !dbg !164 + %3192 = fsub float %3160, %375, !dbg !164 + %3193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1381 = icmp eq i32 %3193, 0, !dbg !165 + br i1 %.not.i1381, label %3196, label %3194, !dbg !165 + +3194: ; preds = %2633 + %3195 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3161) #3, !dbg !165 + br label %__nv_exp2f.exit1383, !dbg !165 + +3196: ; preds = %2633 + %3197 = tail call float @llvm.nvvm.ex2.approx.f(float %3161) #3, !dbg !165 + br label %__nv_exp2f.exit1383, !dbg !165 + +__nv_exp2f.exit1383: ; preds = %3194, %3196 + %.0.i1382 = phi float [ %3195, %3194 ], [ %3197, %3196 ], !dbg !165 + %3198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1384 = icmp eq i32 %3198, 0, !dbg !165 + br i1 %.not.i1384, label %3201, label %3199, !dbg !165 + +3199: ; preds = %__nv_exp2f.exit1383 + %3200 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3162) #3, !dbg !165 + br label %__nv_exp2f.exit1386, !dbg !165 + +3201: ; preds = %__nv_exp2f.exit1383 + %3202 = tail call float @llvm.nvvm.ex2.approx.f(float %3162) #3, !dbg !165 + br label %__nv_exp2f.exit1386, !dbg !165 + +__nv_exp2f.exit1386: ; preds = %3199, %3201 + %.0.i1385 = phi float [ %3200, %3199 ], [ %3202, %3201 ], !dbg !165 + %3203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1387 = icmp eq i32 %3203, 0, !dbg !165 + br i1 %.not.i1387, label %3206, label %3204, !dbg !165 + +3204: ; preds = %__nv_exp2f.exit1386 + %3205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3163) #3, !dbg !165 + br label %__nv_exp2f.exit1389, !dbg !165 + +3206: ; preds = %__nv_exp2f.exit1386 + %3207 = tail call float @llvm.nvvm.ex2.approx.f(float %3163) #3, !dbg !165 + br label %__nv_exp2f.exit1389, !dbg !165 + +__nv_exp2f.exit1389: ; preds = %3204, %3206 + %.0.i1388 = phi float [ %3205, %3204 ], [ %3207, %3206 ], !dbg !165 + %3208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1390 = icmp eq i32 %3208, 0, !dbg !165 + br i1 %.not.i1390, label %3211, label %3209, !dbg !165 + +3209: ; preds = %__nv_exp2f.exit1389 + %3210 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3164) #3, !dbg !165 + br label %__nv_exp2f.exit1392, !dbg !165 + +3211: ; preds = %__nv_exp2f.exit1389 + %3212 = tail call float @llvm.nvvm.ex2.approx.f(float %3164) #3, !dbg !165 + br label %__nv_exp2f.exit1392, !dbg !165 + +__nv_exp2f.exit1392: ; preds = %3209, %3211 + %.0.i1391 = phi float [ %3210, %3209 ], [ %3212, %3211 ], !dbg !165 + %3213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1393 = icmp eq i32 %3213, 0, !dbg !165 + br i1 %.not.i1393, label %3216, label %3214, !dbg !165 + +3214: ; preds = %__nv_exp2f.exit1392 + %3215 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3165) #3, !dbg !165 + br label %__nv_exp2f.exit1395, !dbg !165 + +3216: ; preds = %__nv_exp2f.exit1392 + %3217 = tail call float @llvm.nvvm.ex2.approx.f(float %3165) #3, !dbg !165 + br label %__nv_exp2f.exit1395, !dbg !165 + +__nv_exp2f.exit1395: ; preds = %3214, %3216 + %.0.i1394 = phi float [ %3215, %3214 ], [ %3217, %3216 ], !dbg !165 + %3218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1396 = icmp eq i32 %3218, 0, !dbg !165 + br i1 %.not.i1396, label %3221, label %3219, !dbg !165 + +3219: ; preds = %__nv_exp2f.exit1395 + %3220 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3166) #3, !dbg !165 + br label %__nv_exp2f.exit1398, !dbg !165 + +3221: ; preds = %__nv_exp2f.exit1395 + %3222 = tail call float @llvm.nvvm.ex2.approx.f(float %3166) #3, !dbg !165 + br label %__nv_exp2f.exit1398, !dbg !165 + +__nv_exp2f.exit1398: ; preds = %3219, %3221 + %.0.i1397 = phi float [ %3220, %3219 ], [ %3222, %3221 ], !dbg !165 + %3223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1399 = icmp eq i32 %3223, 0, !dbg !165 + br i1 %.not.i1399, label %3226, label %3224, !dbg !165 + +3224: ; preds = %__nv_exp2f.exit1398 + %3225 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3167) #3, !dbg !165 + br label %__nv_exp2f.exit1401, !dbg !165 + +3226: ; preds = %__nv_exp2f.exit1398 + %3227 = tail call float @llvm.nvvm.ex2.approx.f(float %3167) #3, !dbg !165 + br label %__nv_exp2f.exit1401, !dbg !165 + +__nv_exp2f.exit1401: ; preds = %3224, %3226 + %.0.i1400 = phi float [ %3225, %3224 ], [ %3227, %3226 ], !dbg !165 + %3228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1402 = icmp eq i32 %3228, 0, !dbg !165 + br i1 %.not.i1402, label %3231, label %3229, !dbg !165 + +3229: ; preds = %__nv_exp2f.exit1401 + %3230 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3168) #3, !dbg !165 + br label %__nv_exp2f.exit1404, !dbg !165 + +3231: ; preds = %__nv_exp2f.exit1401 + %3232 = tail call float @llvm.nvvm.ex2.approx.f(float %3168) #3, !dbg !165 + br label %__nv_exp2f.exit1404, !dbg !165 + +__nv_exp2f.exit1404: ; preds = %3229, %3231 + %.0.i1403 = phi float [ %3230, %3229 ], [ %3232, %3231 ], !dbg !165 + %3233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1405 = icmp eq i32 %3233, 0, !dbg !165 + br i1 %.not.i1405, label %3236, label %3234, !dbg !165 + +3234: ; preds = %__nv_exp2f.exit1404 + %3235 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3169) #3, !dbg !165 + br label %__nv_exp2f.exit1407, !dbg !165 + +3236: ; preds = %__nv_exp2f.exit1404 + %3237 = tail call float @llvm.nvvm.ex2.approx.f(float %3169) #3, !dbg !165 + br label %__nv_exp2f.exit1407, !dbg !165 + +__nv_exp2f.exit1407: ; preds = %3234, %3236 + %.0.i1406 = phi float [ %3235, %3234 ], [ %3237, %3236 ], !dbg !165 + %3238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1408 = icmp eq i32 %3238, 0, !dbg !165 + br i1 %.not.i1408, label %3241, label %3239, !dbg !165 + +3239: ; preds = %__nv_exp2f.exit1407 + %3240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3170) #3, !dbg !165 + br label %__nv_exp2f.exit1410, !dbg !165 + +3241: ; preds = %__nv_exp2f.exit1407 + %3242 = tail call float @llvm.nvvm.ex2.approx.f(float %3170) #3, !dbg !165 + br label %__nv_exp2f.exit1410, !dbg !165 + +__nv_exp2f.exit1410: ; preds = %3239, %3241 + %.0.i1409 = phi float [ %3240, %3239 ], [ %3242, %3241 ], !dbg !165 + %3243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1411 = icmp eq i32 %3243, 0, !dbg !165 + br i1 %.not.i1411, label %3246, label %3244, !dbg !165 + +3244: ; preds = %__nv_exp2f.exit1410 + %3245 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3171) #3, !dbg !165 + br label %__nv_exp2f.exit1413, !dbg !165 + +3246: ; preds = %__nv_exp2f.exit1410 + %3247 = tail call float @llvm.nvvm.ex2.approx.f(float %3171) #3, !dbg !165 + br label %__nv_exp2f.exit1413, !dbg !165 + +__nv_exp2f.exit1413: ; preds = %3244, %3246 + %.0.i1412 = phi float [ %3245, %3244 ], [ %3247, %3246 ], !dbg !165 + %3248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1414 = icmp eq i32 %3248, 0, !dbg !165 + br i1 %.not.i1414, label %3251, label %3249, !dbg !165 + +3249: ; preds = %__nv_exp2f.exit1413 + %3250 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3172) #3, !dbg !165 + br label %__nv_exp2f.exit1416, !dbg !165 + +3251: ; preds = %__nv_exp2f.exit1413 + %3252 = tail call float @llvm.nvvm.ex2.approx.f(float %3172) #3, !dbg !165 + br label %__nv_exp2f.exit1416, !dbg !165 + +__nv_exp2f.exit1416: ; preds = %3249, %3251 + %.0.i1415 = phi float [ %3250, %3249 ], [ %3252, %3251 ], !dbg !165 + %3253 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1417 = icmp eq i32 %3253, 0, !dbg !165 + br i1 %.not.i1417, label %3256, label %3254, !dbg !165 + +3254: ; preds = %__nv_exp2f.exit1416 + %3255 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3173) #3, !dbg !165 + br label %__nv_exp2f.exit1419, !dbg !165 + +3256: ; preds = %__nv_exp2f.exit1416 + %3257 = tail call float @llvm.nvvm.ex2.approx.f(float %3173) #3, !dbg !165 + br label %__nv_exp2f.exit1419, !dbg !165 + +__nv_exp2f.exit1419: ; preds = %3254, %3256 + %.0.i1418 = phi float [ %3255, %3254 ], [ %3257, %3256 ], !dbg !165 + %3258 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1420 = icmp eq i32 %3258, 0, !dbg !165 + br i1 %.not.i1420, label %3261, label %3259, !dbg !165 + +3259: ; preds = %__nv_exp2f.exit1419 + %3260 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3174) #3, !dbg !165 + br label %__nv_exp2f.exit1422, !dbg !165 + +3261: ; preds = %__nv_exp2f.exit1419 + %3262 = tail call float @llvm.nvvm.ex2.approx.f(float %3174) #3, !dbg !165 + br label %__nv_exp2f.exit1422, !dbg !165 + +__nv_exp2f.exit1422: ; preds = %3259, %3261 + %.0.i1421 = phi float [ %3260, %3259 ], [ %3262, %3261 ], !dbg !165 + %3263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1423 = icmp eq i32 %3263, 0, !dbg !165 + br i1 %.not.i1423, label %3266, label %3264, !dbg !165 + +3264: ; preds = %__nv_exp2f.exit1422 + %3265 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3175) #3, !dbg !165 + br label %__nv_exp2f.exit1425, !dbg !165 + +3266: ; preds = %__nv_exp2f.exit1422 + %3267 = tail call float @llvm.nvvm.ex2.approx.f(float %3175) #3, !dbg !165 + br label %__nv_exp2f.exit1425, !dbg !165 + +__nv_exp2f.exit1425: ; preds = %3264, %3266 + %.0.i1424 = phi float [ %3265, %3264 ], [ %3267, %3266 ], !dbg !165 + %3268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1426 = icmp eq i32 %3268, 0, !dbg !165 + br i1 %.not.i1426, label %3271, label %3269, !dbg !165 + +3269: ; preds = %__nv_exp2f.exit1425 + %3270 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3176) #3, !dbg !165 + br label %__nv_exp2f.exit1428, !dbg !165 + +3271: ; preds = %__nv_exp2f.exit1425 + %3272 = tail call float @llvm.nvvm.ex2.approx.f(float %3176) #3, !dbg !165 + br label %__nv_exp2f.exit1428, !dbg !165 + +__nv_exp2f.exit1428: ; preds = %3269, %3271 + %.0.i1427 = phi float [ %3270, %3269 ], [ %3272, %3271 ], !dbg !165 + %3273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1429 = icmp eq i32 %3273, 0, !dbg !165 + br i1 %.not.i1429, label %3276, label %3274, !dbg !165 + +3274: ; preds = %__nv_exp2f.exit1428 + %3275 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3177) #3, !dbg !165 + br label %__nv_exp2f.exit1431, !dbg !165 + +3276: ; preds = %__nv_exp2f.exit1428 + %3277 = tail call float @llvm.nvvm.ex2.approx.f(float %3177) #3, !dbg !165 + br label %__nv_exp2f.exit1431, !dbg !165 + +__nv_exp2f.exit1431: ; preds = %3274, %3276 + %.0.i1430 = phi float [ %3275, %3274 ], [ %3277, %3276 ], !dbg !165 + %3278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1432 = icmp eq i32 %3278, 0, !dbg !165 + br i1 %.not.i1432, label %3281, label %3279, !dbg !165 + +3279: ; preds = %__nv_exp2f.exit1431 + %3280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3178) #3, !dbg !165 + br label %__nv_exp2f.exit1434, !dbg !165 + +3281: ; preds = %__nv_exp2f.exit1431 + %3282 = tail call float @llvm.nvvm.ex2.approx.f(float %3178) #3, !dbg !165 + br label %__nv_exp2f.exit1434, !dbg !165 + +__nv_exp2f.exit1434: ; preds = %3279, %3281 + %.0.i1433 = phi float [ %3280, %3279 ], [ %3282, %3281 ], !dbg !165 + %3283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1435 = icmp eq i32 %3283, 0, !dbg !165 + br i1 %.not.i1435, label %3286, label %3284, !dbg !165 + +3284: ; preds = %__nv_exp2f.exit1434 + %3285 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3179) #3, !dbg !165 + br label %__nv_exp2f.exit1437, !dbg !165 + +3286: ; preds = %__nv_exp2f.exit1434 + %3287 = tail call float @llvm.nvvm.ex2.approx.f(float %3179) #3, !dbg !165 + br label %__nv_exp2f.exit1437, !dbg !165 + +__nv_exp2f.exit1437: ; preds = %3284, %3286 + %.0.i1436 = phi float [ %3285, %3284 ], [ %3287, %3286 ], !dbg !165 + %3288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1438 = icmp eq i32 %3288, 0, !dbg !165 + br i1 %.not.i1438, label %3291, label %3289, !dbg !165 + +3289: ; preds = %__nv_exp2f.exit1437 + %3290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3180) #3, !dbg !165 + br label %__nv_exp2f.exit1440, !dbg !165 + +3291: ; preds = %__nv_exp2f.exit1437 + %3292 = tail call float @llvm.nvvm.ex2.approx.f(float %3180) #3, !dbg !165 + br label %__nv_exp2f.exit1440, !dbg !165 + +__nv_exp2f.exit1440: ; preds = %3289, %3291 + %.0.i1439 = phi float [ %3290, %3289 ], [ %3292, %3291 ], !dbg !165 + %3293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1441 = icmp eq i32 %3293, 0, !dbg !165 + br i1 %.not.i1441, label %3296, label %3294, !dbg !165 + +3294: ; preds = %__nv_exp2f.exit1440 + %3295 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3181) #3, !dbg !165 + br label %__nv_exp2f.exit1443, !dbg !165 + +3296: ; preds = %__nv_exp2f.exit1440 + %3297 = tail call float @llvm.nvvm.ex2.approx.f(float %3181) #3, !dbg !165 + br label %__nv_exp2f.exit1443, !dbg !165 + +__nv_exp2f.exit1443: ; preds = %3294, %3296 + %.0.i1442 = phi float [ %3295, %3294 ], [ %3297, %3296 ], !dbg !165 + %3298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1444 = icmp eq i32 %3298, 0, !dbg !165 + br i1 %.not.i1444, label %3301, label %3299, !dbg !165 + +3299: ; preds = %__nv_exp2f.exit1443 + %3300 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3182) #3, !dbg !165 + br label %__nv_exp2f.exit1446, !dbg !165 + +3301: ; preds = %__nv_exp2f.exit1443 + %3302 = tail call float @llvm.nvvm.ex2.approx.f(float %3182) #3, !dbg !165 + br label %__nv_exp2f.exit1446, !dbg !165 + +__nv_exp2f.exit1446: ; preds = %3299, %3301 + %.0.i1445 = phi float [ %3300, %3299 ], [ %3302, %3301 ], !dbg !165 + %3303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1447 = icmp eq i32 %3303, 0, !dbg !165 + br i1 %.not.i1447, label %3306, label %3304, !dbg !165 + +3304: ; preds = %__nv_exp2f.exit1446 + %3305 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3183) #3, !dbg !165 + br label %__nv_exp2f.exit1449, !dbg !165 + +3306: ; preds = %__nv_exp2f.exit1446 + %3307 = tail call float @llvm.nvvm.ex2.approx.f(float %3183) #3, !dbg !165 + br label %__nv_exp2f.exit1449, !dbg !165 + +__nv_exp2f.exit1449: ; preds = %3304, %3306 + %.0.i1448 = phi float [ %3305, %3304 ], [ %3307, %3306 ], !dbg !165 + %3308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1450 = icmp eq i32 %3308, 0, !dbg !165 + br i1 %.not.i1450, label %3311, label %3309, !dbg !165 + +3309: ; preds = %__nv_exp2f.exit1449 + %3310 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3184) #3, !dbg !165 + br label %__nv_exp2f.exit1452, !dbg !165 + +3311: ; preds = %__nv_exp2f.exit1449 + %3312 = tail call float @llvm.nvvm.ex2.approx.f(float %3184) #3, !dbg !165 + br label %__nv_exp2f.exit1452, !dbg !165 + +__nv_exp2f.exit1452: ; preds = %3309, %3311 + %.0.i1451 = phi float [ %3310, %3309 ], [ %3312, %3311 ], !dbg !165 + %3313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1453 = icmp eq i32 %3313, 0, !dbg !165 + br i1 %.not.i1453, label %3316, label %3314, !dbg !165 + +3314: ; preds = %__nv_exp2f.exit1452 + %3315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3185) #3, !dbg !165 + br label %__nv_exp2f.exit1455, !dbg !165 + +3316: ; preds = %__nv_exp2f.exit1452 + %3317 = tail call float @llvm.nvvm.ex2.approx.f(float %3185) #3, !dbg !165 + br label %__nv_exp2f.exit1455, !dbg !165 + +__nv_exp2f.exit1455: ; preds = %3314, %3316 + %.0.i1454 = phi float [ %3315, %3314 ], [ %3317, %3316 ], !dbg !165 + %3318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1456 = icmp eq i32 %3318, 0, !dbg !165 + br i1 %.not.i1456, label %3321, label %3319, !dbg !165 + +3319: ; preds = %__nv_exp2f.exit1455 + %3320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3186) #3, !dbg !165 + br label %__nv_exp2f.exit1458, !dbg !165 + +3321: ; preds = %__nv_exp2f.exit1455 + %3322 = tail call float @llvm.nvvm.ex2.approx.f(float %3186) #3, !dbg !165 + br label %__nv_exp2f.exit1458, !dbg !165 + +__nv_exp2f.exit1458: ; preds = %3319, %3321 + %.0.i1457 = phi float [ %3320, %3319 ], [ %3322, %3321 ], !dbg !165 + %3323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1459 = icmp eq i32 %3323, 0, !dbg !165 + br i1 %.not.i1459, label %3326, label %3324, !dbg !165 + +3324: ; preds = %__nv_exp2f.exit1458 + %3325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3187) #3, !dbg !165 + br label %__nv_exp2f.exit1461, !dbg !165 + +3326: ; preds = %__nv_exp2f.exit1458 + %3327 = tail call float @llvm.nvvm.ex2.approx.f(float %3187) #3, !dbg !165 + br label %__nv_exp2f.exit1461, !dbg !165 + +__nv_exp2f.exit1461: ; preds = %3324, %3326 + %.0.i1460 = phi float [ %3325, %3324 ], [ %3327, %3326 ], !dbg !165 + %3328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1462 = icmp eq i32 %3328, 0, !dbg !165 + br i1 %.not.i1462, label %3331, label %3329, !dbg !165 + +3329: ; preds = %__nv_exp2f.exit1461 + %3330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3188) #3, !dbg !165 + br label %__nv_exp2f.exit1464, !dbg !165 + +3331: ; preds = %__nv_exp2f.exit1461 + %3332 = tail call float @llvm.nvvm.ex2.approx.f(float %3188) #3, !dbg !165 + br label %__nv_exp2f.exit1464, !dbg !165 + +__nv_exp2f.exit1464: ; preds = %3329, %3331 + %.0.i1463 = phi float [ %3330, %3329 ], [ %3332, %3331 ], !dbg !165 + %3333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1465 = icmp eq i32 %3333, 0, !dbg !165 + br i1 %.not.i1465, label %3336, label %3334, !dbg !165 + +3334: ; preds = %__nv_exp2f.exit1464 + %3335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3189) #3, !dbg !165 + br label %__nv_exp2f.exit1467, !dbg !165 + +3336: ; preds = %__nv_exp2f.exit1464 + %3337 = tail call float @llvm.nvvm.ex2.approx.f(float %3189) #3, !dbg !165 + br label %__nv_exp2f.exit1467, !dbg !165 + +__nv_exp2f.exit1467: ; preds = %3334, %3336 + %.0.i1466 = phi float [ %3335, %3334 ], [ %3337, %3336 ], !dbg !165 + %3338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1468 = icmp eq i32 %3338, 0, !dbg !165 + br i1 %.not.i1468, label %3341, label %3339, !dbg !165 + +3339: ; preds = %__nv_exp2f.exit1467 + %3340 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3190) #3, !dbg !165 + br label %__nv_exp2f.exit1470, !dbg !165 + +3341: ; preds = %__nv_exp2f.exit1467 + %3342 = tail call float @llvm.nvvm.ex2.approx.f(float %3190) #3, !dbg !165 + br label %__nv_exp2f.exit1470, !dbg !165 + +__nv_exp2f.exit1470: ; preds = %3339, %3341 + %.0.i1469 = phi float [ %3340, %3339 ], [ %3342, %3341 ], !dbg !165 + %3343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1471 = icmp eq i32 %3343, 0, !dbg !165 + br i1 %.not.i1471, label %3346, label %3344, !dbg !165 + +3344: ; preds = %__nv_exp2f.exit1470 + %3345 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3191) #3, !dbg !165 + br label %__nv_exp2f.exit1473, !dbg !165 + +3346: ; preds = %__nv_exp2f.exit1470 + %3347 = tail call float @llvm.nvvm.ex2.approx.f(float %3191) #3, !dbg !165 + br label %__nv_exp2f.exit1473, !dbg !165 + +__nv_exp2f.exit1473: ; preds = %3344, %3346 + %.0.i1472 = phi float [ %3345, %3344 ], [ %3347, %3346 ], !dbg !165 + %3348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !165 + %.not.i1474 = icmp eq i32 %3348, 0, !dbg !165 + br i1 %.not.i1474, label %3351, label %3349, !dbg !165 + +3349: ; preds = %__nv_exp2f.exit1473 + %3350 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3192) #3, !dbg !165 + br label %__nv_exp2f.exit1476, !dbg !165 + +3351: ; preds = %__nv_exp2f.exit1473 + %3352 = tail call float @llvm.nvvm.ex2.approx.f(float %3192) #3, !dbg !165 + br label %__nv_exp2f.exit1476, !dbg !165 + +__nv_exp2f.exit1476: ; preds = %3349, %3351 + %.0.i1475 = phi float [ %3350, %3349 ], [ %3352, %3351 ], !dbg !165 + %3353 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2676, !dbg !156 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !166 + %3354 = add i32 %2680, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3355 = lshr exact i32 %3354, 4, !dbg !166 + %3356 = and i32 %3355, 16383, !dbg !166 + %3357 = zext nneg i32 %3356 to i64, !dbg !166 + %3358 = or disjoint i64 %3357, 4611686293372403712, !dbg !166 + %3359 = ptrtoint ptr addrspace(3) %3353 to i32, !dbg !166 + %3360 = lshr exact i32 %3359, 4, !dbg !166 + %3361 = and i32 %3360, 16383, !dbg !166 + %3362 = zext nneg i32 %3361 to i64, !dbg !166 + %3363 = or disjoint i64 %3362, 4611686293338849280, !dbg !166 + %3364 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %3358, i64 %3363) #3, !dbg !166 + %3365 = add i32 %2692, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3366 = lshr exact i32 %3365, 4, !dbg !166 + %3367 = and i32 %3366, 16383, !dbg !166 + %3368 = zext nneg i32 %3367 to i64, !dbg !166 + %3369 = or disjoint i64 %3368, 4611686293372403712, !dbg !166 + %3370 = add i32 %3359, 32, !dbg !166 + %3371 = lshr exact i32 %3370, 4, !dbg !166 + %3372 = and i32 %3371, 16383, !dbg !166 + %3373 = zext nneg i32 %3372 to i64, !dbg !166 + %3374 = or disjoint i64 %3373, 4611686293338849280, !dbg !166 + %3375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 0, !dbg !166 + %3376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 1, !dbg !166 + %3377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 2, !dbg !166 + %3378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 3, !dbg !166 + %3379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 4, !dbg !166 + %3380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 5, !dbg !166 + %3381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 6, !dbg !166 + %3382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 7, !dbg !166 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 8, !dbg !166 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 9, !dbg !166 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 10, !dbg !166 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 11, !dbg !166 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 12, !dbg !166 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 13, !dbg !166 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 14, !dbg !166 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 15, !dbg !166 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 16, !dbg !166 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 17, !dbg !166 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 18, !dbg !166 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 19, !dbg !166 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 20, !dbg !166 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 21, !dbg !166 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 22, !dbg !166 + %3398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 23, !dbg !166 + %3399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 24, !dbg !166 + %3400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 25, !dbg !166 + %3401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 26, !dbg !166 + %3402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 27, !dbg !166 + %3403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 28, !dbg !166 + %3404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 29, !dbg !166 + %3405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 30, !dbg !166 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3364, 31, !dbg !166 + %3407 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3375, float %3376, float %3377, float %3378, float %3379, float %3380, float %3381, float %3382, float %3383, float %3384, float %3385, float %3386, float %3387, float %3388, float %3389, float %3390, float %3391, float %3392, float %3393, float %3394, float %3395, float %3396, float %3397, float %3398, float %3399, float %3400, float %3401, float %3402, float %3403, float %3404, float %3405, float %3406, i64 %3369, i64 %3374, i1 true) #3, !dbg !166 + %3408 = add i32 %2736, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3409 = lshr exact i32 %3408, 4, !dbg !166 + %3410 = and i32 %3409, 16383, !dbg !166 + %3411 = zext nneg i32 %3410 to i64, !dbg !166 + %3412 = or disjoint i64 %3411, 4611686293372403712, !dbg !166 + %3413 = add i32 %3359, 64, !dbg !166 + %3414 = lshr exact i32 %3413, 4, !dbg !166 + %3415 = and i32 %3414, 16383, !dbg !166 + %3416 = zext nneg i32 %3415 to i64, !dbg !166 + %3417 = or disjoint i64 %3416, 4611686293338849280, !dbg !166 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 0, !dbg !166 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 1, !dbg !166 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 2, !dbg !166 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 3, !dbg !166 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 4, !dbg !166 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 5, !dbg !166 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 6, !dbg !166 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 7, !dbg !166 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 8, !dbg !166 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 9, !dbg !166 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 10, !dbg !166 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 11, !dbg !166 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 12, !dbg !166 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 13, !dbg !166 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 14, !dbg !166 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 15, !dbg !166 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 16, !dbg !166 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 17, !dbg !166 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 18, !dbg !166 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 19, !dbg !166 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 20, !dbg !166 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 21, !dbg !166 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 22, !dbg !166 + %3441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 23, !dbg !166 + %3442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 24, !dbg !166 + %3443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 25, !dbg !166 + %3444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 26, !dbg !166 + %3445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 27, !dbg !166 + %3446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 28, !dbg !166 + %3447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 29, !dbg !166 + %3448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 30, !dbg !166 + %3449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3407, 31, !dbg !166 + %3450 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, float %3441, float %3442, float %3443, float %3444, float %3445, float %3446, float %3447, float %3448, float %3449, i64 %3412, i64 %3417, i1 true) #3, !dbg !166 + %3451 = add i32 %2780, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3452 = lshr exact i32 %3451, 4, !dbg !166 + %3453 = and i32 %3452, 16383, !dbg !166 + %3454 = zext nneg i32 %3453 to i64, !dbg !166 + %3455 = or disjoint i64 %3454, 4611686293372403712, !dbg !166 + %3456 = add i32 %3359, 96, !dbg !166 + %3457 = lshr exact i32 %3456, 4, !dbg !166 + %3458 = and i32 %3457, 16383, !dbg !166 + %3459 = zext nneg i32 %3458 to i64, !dbg !166 + %3460 = or disjoint i64 %3459, 4611686293338849280, !dbg !166 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 0, !dbg !166 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 1, !dbg !166 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 2, !dbg !166 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 3, !dbg !166 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 4, !dbg !166 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 5, !dbg !166 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 6, !dbg !166 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 7, !dbg !166 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 8, !dbg !166 + %3470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 9, !dbg !166 + %3471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 10, !dbg !166 + %3472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 11, !dbg !166 + %3473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 12, !dbg !166 + %3474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 13, !dbg !166 + %3475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 14, !dbg !166 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 15, !dbg !166 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 16, !dbg !166 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 17, !dbg !166 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 18, !dbg !166 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 19, !dbg !166 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 20, !dbg !166 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 21, !dbg !166 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 22, !dbg !166 + %3484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 23, !dbg !166 + %3485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 24, !dbg !166 + %3486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 25, !dbg !166 + %3487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 26, !dbg !166 + %3488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 27, !dbg !166 + %3489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 28, !dbg !166 + %3490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 29, !dbg !166 + %3491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 30, !dbg !166 + %3492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3450, 31, !dbg !166 + %3493 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, float %3470, float %3471, float %3472, float %3473, float %3474, float %3475, float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, float %3484, float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, i64 %3455, i64 %3460, i1 true) #3, !dbg !166 + %3494 = add i32 %2824, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3495 = lshr exact i32 %3494, 4, !dbg !166 + %3496 = and i32 %3495, 16383, !dbg !166 + %3497 = zext nneg i32 %3496 to i64, !dbg !166 + %3498 = or disjoint i64 %3497, 4611686293372403712, !dbg !166 + %3499 = add i32 %3359, 8192, !dbg !166 + %3500 = lshr exact i32 %3499, 4, !dbg !166 + %3501 = and i32 %3500, 16383, !dbg !166 + %3502 = zext nneg i32 %3501 to i64, !dbg !166 + %3503 = or disjoint i64 %3502, 4611686293338849280, !dbg !166 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 0, !dbg !166 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 1, !dbg !166 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 2, !dbg !166 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 3, !dbg !166 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 4, !dbg !166 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 5, !dbg !166 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 6, !dbg !166 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 7, !dbg !166 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 8, !dbg !166 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 9, !dbg !166 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 10, !dbg !166 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 11, !dbg !166 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 12, !dbg !166 + %3517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 13, !dbg !166 + %3518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 14, !dbg !166 + %3519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 15, !dbg !166 + %3520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 16, !dbg !166 + %3521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 17, !dbg !166 + %3522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 18, !dbg !166 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 19, !dbg !166 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 20, !dbg !166 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 21, !dbg !166 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 22, !dbg !166 + %3527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 23, !dbg !166 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 24, !dbg !166 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 25, !dbg !166 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 26, !dbg !166 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 27, !dbg !166 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 28, !dbg !166 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 29, !dbg !166 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 30, !dbg !166 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3493, 31, !dbg !166 + %3536 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, i64 %3498, i64 %3503, i1 true) #3, !dbg !166 + %3537 = add i32 %2868, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3538 = lshr exact i32 %3537, 4, !dbg !166 + %3539 = and i32 %3538, 16383, !dbg !166 + %3540 = zext nneg i32 %3539 to i64, !dbg !166 + %3541 = or disjoint i64 %3540, 4611686293372403712, !dbg !166 + %3542 = add i32 %3359, 8224, !dbg !166 + %3543 = lshr exact i32 %3542, 4, !dbg !166 + %3544 = and i32 %3543, 16383, !dbg !166 + %3545 = zext nneg i32 %3544 to i64, !dbg !166 + %3546 = or disjoint i64 %3545, 4611686293338849280, !dbg !166 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 0, !dbg !166 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 1, !dbg !166 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 2, !dbg !166 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 3, !dbg !166 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 4, !dbg !166 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 5, !dbg !166 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 6, !dbg !166 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 7, !dbg !166 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 8, !dbg !166 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 9, !dbg !166 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 10, !dbg !166 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 11, !dbg !166 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 12, !dbg !166 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 13, !dbg !166 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 14, !dbg !166 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 15, !dbg !166 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 16, !dbg !166 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 17, !dbg !166 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 18, !dbg !166 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 19, !dbg !166 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 20, !dbg !166 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 21, !dbg !166 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 22, !dbg !166 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 23, !dbg !166 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 24, !dbg !166 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 25, !dbg !166 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 26, !dbg !166 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 27, !dbg !166 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 28, !dbg !166 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 29, !dbg !166 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 30, !dbg !166 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3536, 31, !dbg !166 + %3579 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, float %3560, float %3561, float %3562, float %3563, float %3564, float %3565, float %3566, float %3567, float %3568, float %3569, float %3570, float %3571, float %3572, float %3573, float %3574, float %3575, float %3576, float %3577, float %3578, i64 %3541, i64 %3546, i1 true) #3, !dbg !166 + %3580 = add i32 %2912, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3581 = lshr exact i32 %3580, 4, !dbg !166 + %3582 = and i32 %3581, 16383, !dbg !166 + %3583 = zext nneg i32 %3582 to i64, !dbg !166 + %3584 = or disjoint i64 %3583, 4611686293372403712, !dbg !166 + %3585 = add i32 %3359, 8256, !dbg !166 + %3586 = lshr exact i32 %3585, 4, !dbg !166 + %3587 = and i32 %3586, 16383, !dbg !166 + %3588 = zext nneg i32 %3587 to i64, !dbg !166 + %3589 = or disjoint i64 %3588, 4611686293338849280, !dbg !166 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 0, !dbg !166 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 1, !dbg !166 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 2, !dbg !166 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 3, !dbg !166 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 4, !dbg !166 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 5, !dbg !166 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 6, !dbg !166 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 7, !dbg !166 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 8, !dbg !166 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 9, !dbg !166 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 10, !dbg !166 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 11, !dbg !166 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 12, !dbg !166 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 13, !dbg !166 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 14, !dbg !166 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 15, !dbg !166 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 16, !dbg !166 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 17, !dbg !166 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 18, !dbg !166 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 19, !dbg !166 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 20, !dbg !166 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 21, !dbg !166 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 22, !dbg !166 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 23, !dbg !166 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 24, !dbg !166 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 25, !dbg !166 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 26, !dbg !166 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 27, !dbg !166 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 28, !dbg !166 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 29, !dbg !166 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 30, !dbg !166 + %3621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3579, 31, !dbg !166 + %3622 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3590, float %3591, float %3592, float %3593, float %3594, float %3595, float %3596, float %3597, float %3598, float %3599, float %3600, float %3601, float %3602, float %3603, float %3604, float %3605, float %3606, float %3607, float %3608, float %3609, float %3610, float %3611, float %3612, float %3613, float %3614, float %3615, float %3616, float %3617, float %3618, float %3619, float %3620, float %3621, i64 %3584, i64 %3589, i1 true) #3, !dbg !166 + %3623 = add i32 %2956, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072) to i32), !dbg !166 + %3624 = lshr exact i32 %3623, 4, !dbg !166 + %3625 = and i32 %3624, 16383, !dbg !166 + %3626 = zext nneg i32 %3625 to i64, !dbg !166 + %3627 = or disjoint i64 %3626, 4611686293372403712, !dbg !166 + %3628 = add i32 %3359, 8288, !dbg !166 + %3629 = lshr exact i32 %3628, 4, !dbg !166 + %3630 = and i32 %3629, 16383, !dbg !166 + %3631 = zext nneg i32 %3630 to i64, !dbg !166 + %3632 = or disjoint i64 %3631, 4611686293338849280, !dbg !166 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 0, !dbg !166 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 1, !dbg !166 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 2, !dbg !166 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 3, !dbg !166 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 4, !dbg !166 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 5, !dbg !166 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 6, !dbg !166 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 7, !dbg !166 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 8, !dbg !166 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 9, !dbg !166 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 10, !dbg !166 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 11, !dbg !166 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 12, !dbg !166 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 13, !dbg !166 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 14, !dbg !166 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 15, !dbg !166 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 16, !dbg !166 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 17, !dbg !166 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 18, !dbg !166 + %3652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 19, !dbg !166 + %3653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 20, !dbg !166 + %3654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 21, !dbg !166 + %3655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 22, !dbg !166 + %3656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 23, !dbg !166 + %3657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 24, !dbg !166 + %3658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 25, !dbg !166 + %3659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 26, !dbg !166 + %3660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 27, !dbg !166 + %3661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 28, !dbg !166 + %3662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 29, !dbg !166 + %3663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 30, !dbg !166 + %3664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3622, 31, !dbg !166 + %3665 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %3633, float %3634, float %3635, float %3636, float %3637, float %3638, float %3639, float %3640, float %3641, float %3642, float %3643, float %3644, float %3645, float %3646, float %3647, float %3648, float %3649, float %3650, float %3651, float %3652, float %3653, float %3654, float %3655, float %3656, float %3657, float %3658, float %3659, float %3660, float %3661, float %3662, float %3663, float %3664, i64 %3627, i64 %3632, i1 true) #3, !dbg !166 + %3666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 0, !dbg !166 + %3667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 1, !dbg !166 + %3668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 2, !dbg !166 + %3669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 3, !dbg !166 + %3670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 4, !dbg !166 + %3671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 5, !dbg !166 + %3672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 6, !dbg !166 + %3673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 7, !dbg !166 + %3674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 8, !dbg !166 + %3675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 9, !dbg !166 + %3676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 10, !dbg !166 + %3677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 11, !dbg !166 + %3678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 12, !dbg !166 + %3679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 13, !dbg !166 + %3680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 14, !dbg !166 + %3681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 15, !dbg !166 + %3682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 16, !dbg !166 + %3683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 17, !dbg !166 + %3684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 18, !dbg !166 + %3685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 19, !dbg !166 + %3686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 20, !dbg !166 + %3687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 21, !dbg !166 + %3688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 22, !dbg !166 + %3689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 23, !dbg !166 + %3690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 24, !dbg !166 + %3691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 25, !dbg !166 + %3692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 26, !dbg !166 + %3693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 27, !dbg !166 + %3694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 28, !dbg !166 + %3695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 29, !dbg !166 + %3696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 30, !dbg !166 + %3697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3665, 31, !dbg !166 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !166 + %3698 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %3666, float %3667, float %3668, float %3669, float %3670, float %3671, float %3672, float %3673, float %3674, float %3675, float %3676, float %3677, float %3678, float %3679, float %3680, float %3681, float %3682, float %3683, float %3684, float %3685, float %3686, float %3687, float %3688, float %3689, float %3690, float %3691, float %3692, float %3693, float %3694, float %3695, float %3696, float %3697, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 131072), i32 0, i32 0, ptr addrspace(3) %3353, i32 0, i32 0) #3, !dbg !166 + %3699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 0, !dbg !166 + %3700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 1, !dbg !166 + %3701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 2, !dbg !166 + %3702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 3, !dbg !166 + %3703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 4, !dbg !166 + %3704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 5, !dbg !166 + %3705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 6, !dbg !166 + %3706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 7, !dbg !166 + %3707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 8, !dbg !166 + %3708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 9, !dbg !166 + %3709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 10, !dbg !166 + %3710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 11, !dbg !166 + %3711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 12, !dbg !166 + %3712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 13, !dbg !166 + %3713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 14, !dbg !166 + %3714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 15, !dbg !166 + %3715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 16, !dbg !166 + %3716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 17, !dbg !166 + %3717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 18, !dbg !166 + %3718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 19, !dbg !166 + %3719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 20, !dbg !166 + %3720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 21, !dbg !166 + %3721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 22, !dbg !166 + %3722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 23, !dbg !166 + %3723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 24, !dbg !166 + %3724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 25, !dbg !166 + %3725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 26, !dbg !166 + %3726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 27, !dbg !166 + %3727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 28, !dbg !166 + %3728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 29, !dbg !166 + %3729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 30, !dbg !166 + %3730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %3698, 31, !dbg !166 + %3731 = fsub float %3699, %363, !dbg !167 + %3732 = fsub float %3700, %363, !dbg !167 + %3733 = fsub float %3701, %365, !dbg !167 + %3734 = fsub float %3702, %365, !dbg !167 + %3735 = fsub float %3703, %363, !dbg !167 + %3736 = fsub float %3704, %363, !dbg !167 + %3737 = fsub float %3705, %365, !dbg !167 + %3738 = fsub float %3706, %365, !dbg !167 + %3739 = fsub float %3707, %363, !dbg !167 + %3740 = fsub float %3708, %363, !dbg !167 + %3741 = fsub float %3709, %365, !dbg !167 + %3742 = fsub float %3710, %365, !dbg !167 + %3743 = fsub float %3711, %363, !dbg !167 + %3744 = fsub float %3712, %363, !dbg !167 + %3745 = fsub float %3713, %365, !dbg !167 + %3746 = fsub float %3714, %365, !dbg !167 + %3747 = fsub float %3715, %363, !dbg !167 + %3748 = fsub float %3716, %363, !dbg !167 + %3749 = fsub float %3717, %365, !dbg !167 + %3750 = fsub float %3718, %365, !dbg !167 + %3751 = fsub float %3719, %363, !dbg !167 + %3752 = fsub float %3720, %363, !dbg !167 + %3753 = fsub float %3721, %365, !dbg !167 + %3754 = fsub float %3722, %365, !dbg !167 + %3755 = fsub float %3723, %363, !dbg !167 + %3756 = fsub float %3724, %363, !dbg !167 + %3757 = fsub float %3725, %365, !dbg !167 + %3758 = fsub float %3726, %365, !dbg !167 + %3759 = fsub float %3727, %363, !dbg !167 + %3760 = fsub float %3728, %363, !dbg !167 + %3761 = fsub float %3729, %365, !dbg !167 + %3762 = fsub float %3730, %365, !dbg !167 + %3763 = fmul float %.0.i1382, %3731, !dbg !168 + %3764 = fmul float %.0.i1385, %3732, !dbg !168 + %3765 = fmul float %.0.i1388, %3733, !dbg !168 + %3766 = fmul float %.0.i1391, %3734, !dbg !168 + %3767 = fmul float %.0.i1394, %3735, !dbg !168 + %3768 = fmul float %.0.i1397, %3736, !dbg !168 + %3769 = fmul float %.0.i1400, %3737, !dbg !168 + %3770 = fmul float %.0.i1403, %3738, !dbg !168 + %3771 = fmul float %.0.i1406, %3739, !dbg !168 + %3772 = fmul float %.0.i1409, %3740, !dbg !168 + %3773 = fmul float %.0.i1412, %3741, !dbg !168 + %3774 = fmul float %.0.i1415, %3742, !dbg !168 + %3775 = fmul float %.0.i1418, %3743, !dbg !168 + %3776 = fmul float %.0.i1421, %3744, !dbg !168 + %3777 = fmul float %.0.i1424, %3745, !dbg !168 + %3778 = fmul float %.0.i1427, %3746, !dbg !168 + %3779 = fmul float %.0.i1430, %3747, !dbg !168 + %3780 = fmul float %.0.i1433, %3748, !dbg !168 + %3781 = fmul float %.0.i1436, %3749, !dbg !168 + %3782 = fmul float %.0.i1439, %3750, !dbg !168 + %3783 = fmul float %.0.i1442, %3751, !dbg !168 + %3784 = fmul float %.0.i1445, %3752, !dbg !168 + %3785 = fmul float %.0.i1448, %3753, !dbg !168 + %3786 = fmul float %.0.i1451, %3754, !dbg !168 + %3787 = fmul float %.0.i1454, %3755, !dbg !168 + %3788 = fmul float %.0.i1457, %3756, !dbg !168 + %3789 = fmul float %.0.i1460, %3757, !dbg !168 + %3790 = fmul float %.0.i1463, %3758, !dbg !168 + %3791 = fmul float %.0.i1466, %3759, !dbg !168 + %3792 = fmul float %.0.i1469, %3760, !dbg !168 + %3793 = fmul float %.0.i1472, %3761, !dbg !168 + %3794 = fmul float %.0.i1475, %3762, !dbg !168 + %3795 = fptrunc float %3763 to bfloat, !dbg !169 + %3796 = select i1 %2645, bfloat %3795, bfloat 0xR0000, !dbg !170 + %3797 = fptrunc float %3764 to bfloat, !dbg !169 + %3798 = select i1 %2647, bfloat %3797, bfloat 0xR0000, !dbg !170 + %3799 = fptrunc float %3765 to bfloat, !dbg !169 + %3800 = select i1 %2645, bfloat %3799, bfloat 0xR0000, !dbg !170 + %3801 = fptrunc float %3766 to bfloat, !dbg !169 + %3802 = select i1 %2647, bfloat %3801, bfloat 0xR0000, !dbg !170 + %3803 = fptrunc float %3767 to bfloat, !dbg !169 + %3804 = select i1 %2649, bfloat %3803, bfloat 0xR0000, !dbg !170 + %3805 = fptrunc float %3768 to bfloat, !dbg !169 + %3806 = select i1 %2651, bfloat %3805, bfloat 0xR0000, !dbg !170 + %3807 = fptrunc float %3769 to bfloat, !dbg !169 + %3808 = select i1 %2649, bfloat %3807, bfloat 0xR0000, !dbg !170 + %3809 = fptrunc float %3770 to bfloat, !dbg !169 + %3810 = select i1 %2651, bfloat %3809, bfloat 0xR0000, !dbg !170 + %3811 = fptrunc float %3771 to bfloat, !dbg !169 + %3812 = select i1 %2653, bfloat %3811, bfloat 0xR0000, !dbg !170 + %3813 = fptrunc float %3772 to bfloat, !dbg !169 + %3814 = select i1 %2655, bfloat %3813, bfloat 0xR0000, !dbg !170 + %3815 = fptrunc float %3773 to bfloat, !dbg !169 + %3816 = select i1 %2653, bfloat %3815, bfloat 0xR0000, !dbg !170 + %3817 = fptrunc float %3774 to bfloat, !dbg !169 + %3818 = select i1 %2655, bfloat %3817, bfloat 0xR0000, !dbg !170 + %3819 = fptrunc float %3775 to bfloat, !dbg !169 + %3820 = select i1 %2657, bfloat %3819, bfloat 0xR0000, !dbg !170 + %3821 = fptrunc float %3776 to bfloat, !dbg !169 + %3822 = select i1 %2659, bfloat %3821, bfloat 0xR0000, !dbg !170 + %3823 = fptrunc float %3777 to bfloat, !dbg !169 + %3824 = select i1 %2657, bfloat %3823, bfloat 0xR0000, !dbg !170 + %3825 = fptrunc float %3778 to bfloat, !dbg !169 + %3826 = select i1 %2659, bfloat %3825, bfloat 0xR0000, !dbg !170 + %3827 = fptrunc float %3779 to bfloat, !dbg !169 + %3828 = select i1 %2661, bfloat %3827, bfloat 0xR0000, !dbg !170 + %3829 = fptrunc float %3780 to bfloat, !dbg !169 + %3830 = select i1 %2663, bfloat %3829, bfloat 0xR0000, !dbg !170 + %3831 = fptrunc float %3781 to bfloat, !dbg !169 + %3832 = select i1 %2661, bfloat %3831, bfloat 0xR0000, !dbg !170 + %3833 = fptrunc float %3782 to bfloat, !dbg !169 + %3834 = select i1 %2663, bfloat %3833, bfloat 0xR0000, !dbg !170 + %3835 = fptrunc float %3783 to bfloat, !dbg !169 + %3836 = select i1 %2665, bfloat %3835, bfloat 0xR0000, !dbg !170 + %3837 = fptrunc float %3784 to bfloat, !dbg !169 + %3838 = select i1 %2667, bfloat %3837, bfloat 0xR0000, !dbg !170 + %3839 = fptrunc float %3785 to bfloat, !dbg !169 + %3840 = select i1 %2665, bfloat %3839, bfloat 0xR0000, !dbg !170 + %3841 = fptrunc float %3786 to bfloat, !dbg !169 + %3842 = select i1 %2667, bfloat %3841, bfloat 0xR0000, !dbg !170 + %3843 = fptrunc float %3787 to bfloat, !dbg !169 + %3844 = select i1 %2669, bfloat %3843, bfloat 0xR0000, !dbg !170 + %3845 = fptrunc float %3788 to bfloat, !dbg !169 + %3846 = select i1 %2671, bfloat %3845, bfloat 0xR0000, !dbg !170 + %3847 = fptrunc float %3789 to bfloat, !dbg !169 + %3848 = select i1 %2669, bfloat %3847, bfloat 0xR0000, !dbg !170 + %3849 = fptrunc float %3790 to bfloat, !dbg !169 + %3850 = select i1 %2671, bfloat %3849, bfloat 0xR0000, !dbg !170 + %3851 = fptrunc float %3791 to bfloat, !dbg !169 + %3852 = select i1 %2673, bfloat %3851, bfloat 0xR0000, !dbg !170 + %3853 = fptrunc float %3792 to bfloat, !dbg !169 + %3854 = select i1 %2675, bfloat %3853, bfloat 0xR0000, !dbg !170 + %3855 = fptrunc float %3793 to bfloat, !dbg !169 + %3856 = select i1 %2673, bfloat %3855, bfloat 0xR0000, !dbg !170 + %3857 = fptrunc float %3794 to bfloat, !dbg !169 + %3858 = select i1 %2675, bfloat %3857, bfloat 0xR0000, !dbg !170 + %3859 = insertelement <2 x bfloat> poison, bfloat %3796, i64 0, !dbg !171 + %3860 = insertelement <2 x bfloat> %3859, bfloat %3798, i64 1, !dbg !171 + %3861 = bitcast <2 x bfloat> %3860 to i32, !dbg !171 + %3862 = insertelement <2 x bfloat> poison, bfloat %3800, i64 0, !dbg !171 + %3863 = insertelement <2 x bfloat> %3862, bfloat %3802, i64 1, !dbg !171 + %3864 = bitcast <2 x bfloat> %3863 to i32, !dbg !171 + %3865 = insertelement <2 x bfloat> poison, bfloat %3804, i64 0, !dbg !171 + %3866 = insertelement <2 x bfloat> %3865, bfloat %3806, i64 1, !dbg !171 + %3867 = bitcast <2 x bfloat> %3866 to i32, !dbg !171 + %3868 = insertelement <2 x bfloat> poison, bfloat %3808, i64 0, !dbg !171 + %3869 = insertelement <2 x bfloat> %3868, bfloat %3810, i64 1, !dbg !171 + %3870 = bitcast <2 x bfloat> %3869 to i32, !dbg !171 + %3871 = insertelement <2 x bfloat> poison, bfloat %3812, i64 0, !dbg !171 + %3872 = insertelement <2 x bfloat> %3871, bfloat %3814, i64 1, !dbg !171 + %3873 = bitcast <2 x bfloat> %3872 to i32, !dbg !171 + %3874 = insertelement <2 x bfloat> poison, bfloat %3816, i64 0, !dbg !171 + %3875 = insertelement <2 x bfloat> %3874, bfloat %3818, i64 1, !dbg !171 + %3876 = bitcast <2 x bfloat> %3875 to i32, !dbg !171 + %3877 = insertelement <2 x bfloat> poison, bfloat %3820, i64 0, !dbg !171 + %3878 = insertelement <2 x bfloat> %3877, bfloat %3822, i64 1, !dbg !171 + %3879 = bitcast <2 x bfloat> %3878 to i32, !dbg !171 + %3880 = insertelement <2 x bfloat> poison, bfloat %3824, i64 0, !dbg !171 + %3881 = insertelement <2 x bfloat> %3880, bfloat %3826, i64 1, !dbg !171 + %3882 = bitcast <2 x bfloat> %3881 to i32, !dbg !171 + %3883 = insertelement <2 x bfloat> poison, bfloat %3828, i64 0, !dbg !171 + %3884 = insertelement <2 x bfloat> %3883, bfloat %3830, i64 1, !dbg !171 + %3885 = bitcast <2 x bfloat> %3884 to i32, !dbg !171 + %3886 = insertelement <2 x bfloat> poison, bfloat %3832, i64 0, !dbg !171 + %3887 = insertelement <2 x bfloat> %3886, bfloat %3834, i64 1, !dbg !171 + %3888 = bitcast <2 x bfloat> %3887 to i32, !dbg !171 + %3889 = insertelement <2 x bfloat> poison, bfloat %3836, i64 0, !dbg !171 + %3890 = insertelement <2 x bfloat> %3889, bfloat %3838, i64 1, !dbg !171 + %3891 = bitcast <2 x bfloat> %3890 to i32, !dbg !171 + %3892 = insertelement <2 x bfloat> poison, bfloat %3840, i64 0, !dbg !171 + %3893 = insertelement <2 x bfloat> %3892, bfloat %3842, i64 1, !dbg !171 + %3894 = bitcast <2 x bfloat> %3893 to i32, !dbg !171 + %3895 = insertelement <2 x bfloat> poison, bfloat %3844, i64 0, !dbg !171 + %3896 = insertelement <2 x bfloat> %3895, bfloat %3846, i64 1, !dbg !171 + %3897 = bitcast <2 x bfloat> %3896 to i32, !dbg !171 + %3898 = insertelement <2 x bfloat> poison, bfloat %3848, i64 0, !dbg !171 + %3899 = insertelement <2 x bfloat> %3898, bfloat %3850, i64 1, !dbg !171 + %3900 = bitcast <2 x bfloat> %3899 to i32, !dbg !171 + %3901 = insertelement <2 x bfloat> poison, bfloat %3852, i64 0, !dbg !171 + %3902 = insertelement <2 x bfloat> %3901, bfloat %3854, i64 1, !dbg !171 + %3903 = bitcast <2 x bfloat> %3902 to i32, !dbg !171 + %3904 = insertelement <2 x bfloat> poison, bfloat %3856, i64 0, !dbg !171 + %3905 = insertelement <2 x bfloat> %3904, bfloat %3858, i64 1, !dbg !171 + %3906 = bitcast <2 x bfloat> %3905 to i32, !dbg !171 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !171 + %3907 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn, float %.pn2477, float %.pn2478, float %.pn2479, float %.pn2480, float %.pn2481, float %.pn2482, float %.pn2483, float %.pn2484, float %.pn2485, float %.pn2486, float %.pn2487, float %.pn2488, float %.pn2489, float %.pn2490, float %.pn2491, float %.pn2492, float %.pn2493, float %.pn2494, float %.pn2495, float %.pn2496, float %.pn2497, float %.pn2498, float %.pn2499, float %.pn2500, float %.pn2501, float %.pn2502, float %.pn2503, float %.pn2504, float %.pn2505, float %.pn2506, float %.pn2507, float %.pn2508, float %.pn2509, float %.pn2510, float %.pn2511, float %.pn2512, float %.pn2513, float %.pn2514, float %.pn2515, float %.pn2516, float %.pn2517, float %.pn2518, float %.pn2519, float %.pn2520, float %.pn2521, float %.pn2522, float %.pn2523, float %.pn2524, float %.pn2525, float %.pn2526, float %.pn2527, float %.pn2528, float %.pn2529, float %.pn2530, float %.pn2531, float %.pn2532, float %.pn2533, float %.pn2534, float %.pn2535, float %.pn2536, float %.pn2537, float %.pn2538, float %.pn2539, i32 %3861, i32 %3864, i32 %3867, i32 %3870, i64 %2690, i1 true) #3, !dbg !171 + %3908 = add i32 %2686, 2048, !dbg !171 + %3909 = lshr exact i32 %3908, 4, !dbg !171 + %3910 = and i32 %3909, 16383, !dbg !171 + %3911 = zext nneg i32 %3910 to i64, !dbg !171 + %3912 = or disjoint i64 %3911, 4611686293338849280, !dbg !171 + %3913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 0, !dbg !171 + %3914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 1, !dbg !171 + %3915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 2, !dbg !171 + %3916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 3, !dbg !171 + %3917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 4, !dbg !171 + %3918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 5, !dbg !171 + %3919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 6, !dbg !171 + %3920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 7, !dbg !171 + %3921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 8, !dbg !171 + %3922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 9, !dbg !171 + %3923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 10, !dbg !171 + %3924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 11, !dbg !171 + %3925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 12, !dbg !171 + %3926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 13, !dbg !171 + %3927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 14, !dbg !171 + %3928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 15, !dbg !171 + %3929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 16, !dbg !171 + %3930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 17, !dbg !171 + %3931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 18, !dbg !171 + %3932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 19, !dbg !171 + %3933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 20, !dbg !171 + %3934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 21, !dbg !171 + %3935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 22, !dbg !171 + %3936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 23, !dbg !171 + %3937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 24, !dbg !171 + %3938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 25, !dbg !171 + %3939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 26, !dbg !171 + %3940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 27, !dbg !171 + %3941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 28, !dbg !171 + %3942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 29, !dbg !171 + %3943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 30, !dbg !171 + %3944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 31, !dbg !171 + %3945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 32, !dbg !171 + %3946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 33, !dbg !171 + %3947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 34, !dbg !171 + %3948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 35, !dbg !171 + %3949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 36, !dbg !171 + %3950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 37, !dbg !171 + %3951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 38, !dbg !171 + %3952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 39, !dbg !171 + %3953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 40, !dbg !171 + %3954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 41, !dbg !171 + %3955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 42, !dbg !171 + %3956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 43, !dbg !171 + %3957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 44, !dbg !171 + %3958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 45, !dbg !171 + %3959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 46, !dbg !171 + %3960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 47, !dbg !171 + %3961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 48, !dbg !171 + %3962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 49, !dbg !171 + %3963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 50, !dbg !171 + %3964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 51, !dbg !171 + %3965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 52, !dbg !171 + %3966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 53, !dbg !171 + %3967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 54, !dbg !171 + %3968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 55, !dbg !171 + %3969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 56, !dbg !171 + %3970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 57, !dbg !171 + %3971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 58, !dbg !171 + %3972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 59, !dbg !171 + %3973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 60, !dbg !171 + %3974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 61, !dbg !171 + %3975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 62, !dbg !171 + %3976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3907, 63, !dbg !171 + %3977 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3913, float %3914, float %3915, float %3916, float %3917, float %3918, float %3919, float %3920, float %3921, float %3922, float %3923, float %3924, float %3925, float %3926, float %3927, float %3928, float %3929, float %3930, float %3931, float %3932, float %3933, float %3934, float %3935, float %3936, float %3937, float %3938, float %3939, float %3940, float %3941, float %3942, float %3943, float %3944, float %3945, float %3946, float %3947, float %3948, float %3949, float %3950, float %3951, float %3952, float %3953, float %3954, float %3955, float %3956, float %3957, float %3958, float %3959, float %3960, float %3961, float %3962, float %3963, float %3964, float %3965, float %3966, float %3967, float %3968, float %3969, float %3970, float %3971, float %3972, float %3973, float %3974, float %3975, float %3976, i32 %3873, i32 %3876, i32 %3879, i32 %3882, i64 %3912, i1 true) #3, !dbg !171 + %3978 = add i32 %2686, 4096, !dbg !171 + %3979 = lshr exact i32 %3978, 4, !dbg !171 + %3980 = and i32 %3979, 16383, !dbg !171 + %3981 = zext nneg i32 %3980 to i64, !dbg !171 + %3982 = or disjoint i64 %3981, 4611686293338849280, !dbg !171 + %3983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 0, !dbg !171 + %3984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 1, !dbg !171 + %3985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 2, !dbg !171 + %3986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 3, !dbg !171 + %3987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 4, !dbg !171 + %3988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 5, !dbg !171 + %3989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 6, !dbg !171 + %3990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 7, !dbg !171 + %3991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 8, !dbg !171 + %3992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 9, !dbg !171 + %3993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 10, !dbg !171 + %3994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 11, !dbg !171 + %3995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 12, !dbg !171 + %3996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 13, !dbg !171 + %3997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 14, !dbg !171 + %3998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 15, !dbg !171 + %3999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 16, !dbg !171 + %4000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 17, !dbg !171 + %4001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 18, !dbg !171 + %4002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 19, !dbg !171 + %4003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 20, !dbg !171 + %4004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 21, !dbg !171 + %4005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 22, !dbg !171 + %4006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 23, !dbg !171 + %4007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 24, !dbg !171 + %4008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 25, !dbg !171 + %4009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 26, !dbg !171 + %4010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 27, !dbg !171 + %4011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 28, !dbg !171 + %4012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 29, !dbg !171 + %4013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 30, !dbg !171 + %4014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 31, !dbg !171 + %4015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 32, !dbg !171 + %4016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 33, !dbg !171 + %4017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 34, !dbg !171 + %4018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 35, !dbg !171 + %4019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 36, !dbg !171 + %4020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 37, !dbg !171 + %4021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 38, !dbg !171 + %4022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 39, !dbg !171 + %4023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 40, !dbg !171 + %4024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 41, !dbg !171 + %4025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 42, !dbg !171 + %4026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 43, !dbg !171 + %4027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 44, !dbg !171 + %4028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 45, !dbg !171 + %4029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 46, !dbg !171 + %4030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 47, !dbg !171 + %4031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 48, !dbg !171 + %4032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 49, !dbg !171 + %4033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 50, !dbg !171 + %4034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 51, !dbg !171 + %4035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 52, !dbg !171 + %4036 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 53, !dbg !171 + %4037 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 54, !dbg !171 + %4038 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 55, !dbg !171 + %4039 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 56, !dbg !171 + %4040 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 57, !dbg !171 + %4041 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 58, !dbg !171 + %4042 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 59, !dbg !171 + %4043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 60, !dbg !171 + %4044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 61, !dbg !171 + %4045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 62, !dbg !171 + %4046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3977, 63, !dbg !171 + %4047 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3983, float %3984, float %3985, float %3986, float %3987, float %3988, float %3989, float %3990, float %3991, float %3992, float %3993, float %3994, float %3995, float %3996, float %3997, float %3998, float %3999, float %4000, float %4001, float %4002, float %4003, float %4004, float %4005, float %4006, float %4007, float %4008, float %4009, float %4010, float %4011, float %4012, float %4013, float %4014, float %4015, float %4016, float %4017, float %4018, float %4019, float %4020, float %4021, float %4022, float %4023, float %4024, float %4025, float %4026, float %4027, float %4028, float %4029, float %4030, float %4031, float %4032, float %4033, float %4034, float %4035, float %4036, float %4037, float %4038, float %4039, float %4040, float %4041, float %4042, float %4043, float %4044, float %4045, float %4046, i32 %3885, i32 %3888, i32 %3891, i32 %3894, i64 %3982, i1 true) #3, !dbg !171 + %4048 = add i32 %2686, 6144, !dbg !171 + %4049 = lshr exact i32 %4048, 4, !dbg !171 + %4050 = and i32 %4049, 16383, !dbg !171 + %4051 = zext nneg i32 %4050 to i64, !dbg !171 + %4052 = or disjoint i64 %4051, 4611686293338849280, !dbg !171 + %4053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 0, !dbg !171 + %4054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 1, !dbg !171 + %4055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 2, !dbg !171 + %4056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 3, !dbg !171 + %4057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 4, !dbg !171 + %4058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 5, !dbg !171 + %4059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 6, !dbg !171 + %4060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 7, !dbg !171 + %4061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 8, !dbg !171 + %4062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 9, !dbg !171 + %4063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 10, !dbg !171 + %4064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 11, !dbg !171 + %4065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 12, !dbg !171 + %4066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 13, !dbg !171 + %4067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 14, !dbg !171 + %4068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 15, !dbg !171 + %4069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 16, !dbg !171 + %4070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 17, !dbg !171 + %4071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 18, !dbg !171 + %4072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 19, !dbg !171 + %4073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 20, !dbg !171 + %4074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 21, !dbg !171 + %4075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 22, !dbg !171 + %4076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 23, !dbg !171 + %4077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 24, !dbg !171 + %4078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 25, !dbg !171 + %4079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 26, !dbg !171 + %4080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 27, !dbg !171 + %4081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 28, !dbg !171 + %4082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 29, !dbg !171 + %4083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 30, !dbg !171 + %4084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 31, !dbg !171 + %4085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 32, !dbg !171 + %4086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 33, !dbg !171 + %4087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 34, !dbg !171 + %4088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 35, !dbg !171 + %4089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 36, !dbg !171 + %4090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 37, !dbg !171 + %4091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 38, !dbg !171 + %4092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 39, !dbg !171 + %4093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 40, !dbg !171 + %4094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 41, !dbg !171 + %4095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 42, !dbg !171 + %4096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 43, !dbg !171 + %4097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 44, !dbg !171 + %4098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 45, !dbg !171 + %4099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 46, !dbg !171 + %4100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 47, !dbg !171 + %4101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 48, !dbg !171 + %4102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 49, !dbg !171 + %4103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 50, !dbg !171 + %4104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 51, !dbg !171 + %4105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 52, !dbg !171 + %4106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 53, !dbg !171 + %4107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 54, !dbg !171 + %4108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 55, !dbg !171 + %4109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 56, !dbg !171 + %4110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 57, !dbg !171 + %4111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 58, !dbg !171 + %4112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 59, !dbg !171 + %4113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 60, !dbg !171 + %4114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 61, !dbg !171 + %4115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 62, !dbg !171 + %4116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4047, 63, !dbg !171 + %4117 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %4053, float %4054, float %4055, float %4056, float %4057, float %4058, float %4059, float %4060, float %4061, float %4062, float %4063, float %4064, float %4065, float %4066, float %4067, float %4068, float %4069, float %4070, float %4071, float %4072, float %4073, float %4074, float %4075, float %4076, float %4077, float %4078, float %4079, float %4080, float %4081, float %4082, float %4083, float %4084, float %4085, float %4086, float %4087, float %4088, float %4089, float %4090, float %4091, float %4092, float %4093, float %4094, float %4095, float %4096, float %4097, float %4098, float %4099, float %4100, float %4101, float %4102, float %4103, float %4104, float %4105, float %4106, float %4107, float %4108, float %4109, float %4110, float %4111, float %4112, float %4113, float %4114, float %4115, float %4116, i32 %3897, i32 %3900, i32 %3903, i32 %3906, i64 %4052, i1 true) #3, !dbg !171 + %4118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 0, !dbg !171 + %4119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 1, !dbg !171 + %4120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 2, !dbg !171 + %4121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 3, !dbg !171 + %4122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 4, !dbg !171 + %4123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 5, !dbg !171 + %4124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 6, !dbg !171 + %4125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 7, !dbg !171 + %4126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 8, !dbg !171 + %4127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 9, !dbg !171 + %4128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 10, !dbg !171 + %4129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 11, !dbg !171 + %4130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 12, !dbg !171 + %4131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 13, !dbg !171 + %4132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 14, !dbg !171 + %4133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 15, !dbg !171 + %4134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 16, !dbg !171 + %4135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 17, !dbg !171 + %4136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 18, !dbg !171 + %4137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 19, !dbg !171 + %4138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 20, !dbg !171 + %4139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 21, !dbg !171 + %4140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 22, !dbg !171 + %4141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 23, !dbg !171 + %4142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 24, !dbg !171 + %4143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 25, !dbg !171 + %4144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 26, !dbg !171 + %4145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 27, !dbg !171 + %4146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 28, !dbg !171 + %4147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 29, !dbg !171 + %4148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 30, !dbg !171 + %4149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 31, !dbg !171 + %4150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 32, !dbg !171 + %4151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 33, !dbg !171 + %4152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 34, !dbg !171 + %4153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 35, !dbg !171 + %4154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 36, !dbg !171 + %4155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 37, !dbg !171 + %4156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 38, !dbg !171 + %4157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 39, !dbg !171 + %4158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 40, !dbg !171 + %4159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 41, !dbg !171 + %4160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 42, !dbg !171 + %4161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 43, !dbg !171 + %4162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 44, !dbg !171 + %4163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 45, !dbg !171 + %4164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 46, !dbg !171 + %4165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 47, !dbg !171 + %4166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 48, !dbg !171 + %4167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 49, !dbg !171 + %4168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 50, !dbg !171 + %4169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 51, !dbg !171 + %4170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 52, !dbg !171 + %4171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 53, !dbg !171 + %4172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 54, !dbg !171 + %4173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 55, !dbg !171 + %4174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 56, !dbg !171 + %4175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 57, !dbg !171 + %4176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 58, !dbg !171 + %4177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 59, !dbg !171 + %4178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 60, !dbg !171 + %4179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 61, !dbg !171 + %4180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 62, !dbg !171 + %4181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4117, 63, !dbg !171 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !171 + %4182 = insertelement <16 x i32> poison, i32 %2634, i64 0, !dbg !159 + %4183 = shufflevector <16 x i32> %4182, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !159 + %4184 = add <16 x i32> %4183, %2638, !dbg !159 + %4185 = add nuw nsw i32 %2637, 1, !dbg !154 + %4186 = lshr i32 %4185, 1, !dbg !172 + %4187 = zext nneg i32 %4186 to i64, !dbg !173 + %4188 = getelementptr i32, ptr addrspace(1) %2420, i64 %4187, !dbg !173 + %4189 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !174 + %4190 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4188, i64 %4189, i1 %2640) #3, !dbg !174 + %4191 = add nuw nsw i32 %4186, 1, !dbg !175 + %4192 = icmp slt i32 %4191, %2424, !dbg !176 + %4193 = getelementptr i8, ptr addrspace(1) %4188, i64 4, !dbg !177 + %4194 = and i1 %2640, %4192, !dbg !154 + %4195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !178 + %4196 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %4193, i64 %4195, i1 %4194) #3, !dbg !178 + %4197 = and i32 %2637, 1, !dbg !179 + %4198 = sub i32 %4196, %4190, !dbg !180 + %4199 = shl i32 %4198, 7, !dbg !181 + %4200 = add i32 %4199, -64, !dbg !182 + %4201 = xor i32 %4197, 1, !dbg !183 + %4202 = mul nuw nsw i32 %4200, %4201, !dbg !183 + %4203 = shl nuw nsw i32 %4197, 6, !dbg !184 + %4204 = add i32 %4202, %4203, !dbg !185 + %4205 = shl i32 %4204, 7, !dbg !186 + %4206 = sext i32 %4205 to i64, !dbg !157 + %4207 = getelementptr bfloat, ptr addrspace(1) %.pn10871608, i64 %4206, !dbg !157 + %4208 = getelementptr bfloat, ptr addrspace(1) %.pn10711609, i64 %4206, !dbg !157 + %4209 = getelementptr bfloat, ptr addrspace(1) %.pn10551610, i64 %4206, !dbg !157 + %4210 = getelementptr bfloat, ptr addrspace(1) %.pn10391611, i64 %4206, !dbg !157 + %4211 = getelementptr bfloat, ptr addrspace(1) %.pn11591616, i64 %4206, !dbg !158 + %4212 = getelementptr bfloat, ptr addrspace(1) %.pn11431617, i64 %4206, !dbg !158 + %4213 = getelementptr bfloat, ptr addrspace(1) %.pn11271618, i64 %4206, !dbg !158 + %4214 = getelementptr bfloat, ptr addrspace(1) %.pn11111619, i64 %4206, !dbg !158 + %4215 = add i32 %4204, %.pn10951612, !dbg !159 + %4216 = add i32 %4204, %.pn10931613, !dbg !159 + %4217 = add i32 %4204, %.pn10911614, !dbg !159 + %4218 = add i32 %4204, %.pn10891615, !dbg !159 + %4219 = add i32 %2636, 1, !dbg !154 + %4220 = icmp sgt i32 %4219, 2, !dbg !154 + %4221 = select i1 %4220, i32 0, i32 %4219, !dbg !154 + %4222 = icmp slt i32 %4215, %18, !dbg !155 + %4223 = icmp slt i32 %4216, %18, !dbg !155 + %4224 = icmp slt i32 %4217, %18, !dbg !155 + %4225 = icmp slt i32 %4218, %18, !dbg !155 + %4226 = shl i32 %4221, 13, !dbg !156 + %4227 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %4226, !dbg !156 + %4228 = and i1 %2639, %4222, !dbg !154 + %4229 = and i1 %2639, %4223, !dbg !154 + %4230 = and i1 %2639, %4224, !dbg !154 + %4231 = and i1 %2639, %4225, !dbg !154 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !156 + %4232 = getelementptr inbounds nuw i8, ptr addrspace(3) %4227, i32 %422, !dbg !156 + %4233 = select i1 %4228, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4232, ptr addrspace(1) %4207, i32 %4233) #3, !dbg !156 + %4234 = getelementptr inbounds nuw i8, ptr addrspace(3) %4227, i32 %425, !dbg !156 + %4235 = select i1 %4229, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4234, ptr addrspace(1) %4208, i32 %4235) #3, !dbg !156 + %4236 = getelementptr inbounds nuw i8, ptr addrspace(3) %4227, i32 %428, !dbg !156 + %4237 = select i1 %4230, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4236, ptr addrspace(1) %4209, i32 %4237) #3, !dbg !156 + %4238 = getelementptr inbounds nuw i8, ptr addrspace(3) %4227, i32 %431, !dbg !156 + %4239 = select i1 %4231, i32 16, i32 0, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4238, ptr addrspace(1) %4210, i32 %4239) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + %4240 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %4226, !dbg !156 + %4241 = getelementptr inbounds nuw i8, ptr addrspace(3) %4240, i32 %422, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %4241, ptr addrspace(1) %4211, i32 %4233) #3, !dbg !156 + %4242 = getelementptr inbounds nuw i8, ptr addrspace(3) %4240, i32 %425, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4242, ptr addrspace(1) %4212, i32 %4235) #3, !dbg !156 + %4243 = getelementptr inbounds nuw i8, ptr addrspace(3) %4240, i32 %428, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4243, ptr addrspace(1) %4213, i32 %4237) #3, !dbg !156 + %4244 = getelementptr inbounds nuw i8, ptr addrspace(3) %4240, i32 %431, !dbg !156 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %4244, ptr addrspace(1) %4214, i32 %4239) #3, !dbg !156 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !156 + %exitcond2210.not = icmp eq i32 %4185, %smax2209, !dbg !154 + br i1 %exitcond2210.not, label %._crit_edge1638, label %2633, !dbg !154 + +._crit_edge1638: ; preds = %__nv_exp2f.exit1476, %._crit_edge + %4245 = phi float [ %2493, %._crit_edge ], [ %4118, %__nv_exp2f.exit1476 ], !dbg !72 + %4246 = phi float [ %2494, %._crit_edge ], [ %4119, %__nv_exp2f.exit1476 ], !dbg !72 + %4247 = phi float [ %2495, %._crit_edge ], [ %4120, %__nv_exp2f.exit1476 ], !dbg !72 + %4248 = phi float [ %2496, %._crit_edge ], [ %4121, %__nv_exp2f.exit1476 ], !dbg !72 + %4249 = phi float [ %2497, %._crit_edge ], [ %4122, %__nv_exp2f.exit1476 ], !dbg !72 + %4250 = phi float [ %2498, %._crit_edge ], [ %4123, %__nv_exp2f.exit1476 ], !dbg !72 + %4251 = phi float [ %2499, %._crit_edge ], [ %4124, %__nv_exp2f.exit1476 ], !dbg !72 + %4252 = phi float [ %2500, %._crit_edge ], [ %4125, %__nv_exp2f.exit1476 ], !dbg !72 + %4253 = phi float [ %2501, %._crit_edge ], [ %4126, %__nv_exp2f.exit1476 ], !dbg !72 + %4254 = phi float [ %2502, %._crit_edge ], [ %4127, %__nv_exp2f.exit1476 ], !dbg !72 + %4255 = phi float [ %2503, %._crit_edge ], [ %4128, %__nv_exp2f.exit1476 ], !dbg !72 + %4256 = phi float [ %2504, %._crit_edge ], [ %4129, %__nv_exp2f.exit1476 ], !dbg !72 + %4257 = phi float [ %2505, %._crit_edge ], [ %4130, %__nv_exp2f.exit1476 ], !dbg !72 + %4258 = phi float [ %2506, %._crit_edge ], [ %4131, %__nv_exp2f.exit1476 ], !dbg !72 + %4259 = phi float [ %2507, %._crit_edge ], [ %4132, %__nv_exp2f.exit1476 ], !dbg !72 + %4260 = phi float [ %2508, %._crit_edge ], [ %4133, %__nv_exp2f.exit1476 ], !dbg !72 + %4261 = phi float [ %2509, %._crit_edge ], [ %4134, %__nv_exp2f.exit1476 ], !dbg !72 + %4262 = phi float [ %2510, %._crit_edge ], [ %4135, %__nv_exp2f.exit1476 ], !dbg !72 + %4263 = phi float [ %2511, %._crit_edge ], [ %4136, %__nv_exp2f.exit1476 ], !dbg !72 + %4264 = phi float [ %2512, %._crit_edge ], [ %4137, %__nv_exp2f.exit1476 ], !dbg !72 + %4265 = phi float [ %2513, %._crit_edge ], [ %4138, %__nv_exp2f.exit1476 ], !dbg !72 + %4266 = phi float [ %2514, %._crit_edge ], [ %4139, %__nv_exp2f.exit1476 ], !dbg !72 + %4267 = phi float [ %2515, %._crit_edge ], [ %4140, %__nv_exp2f.exit1476 ], !dbg !72 + %4268 = phi float [ %2516, %._crit_edge ], [ %4141, %__nv_exp2f.exit1476 ], !dbg !72 + %4269 = phi float [ %2517, %._crit_edge ], [ %4142, %__nv_exp2f.exit1476 ], !dbg !72 + %4270 = phi float [ %2518, %._crit_edge ], [ %4143, %__nv_exp2f.exit1476 ], !dbg !72 + %4271 = phi float [ %2519, %._crit_edge ], [ %4144, %__nv_exp2f.exit1476 ], !dbg !72 + %4272 = phi float [ %2520, %._crit_edge ], [ %4145, %__nv_exp2f.exit1476 ], !dbg !72 + %4273 = phi float [ %2521, %._crit_edge ], [ %4146, %__nv_exp2f.exit1476 ], !dbg !72 + %4274 = phi float [ %2522, %._crit_edge ], [ %4147, %__nv_exp2f.exit1476 ], !dbg !72 + %4275 = phi float [ %2523, %._crit_edge ], [ %4148, %__nv_exp2f.exit1476 ], !dbg !72 + %4276 = phi float [ %2524, %._crit_edge ], [ %4149, %__nv_exp2f.exit1476 ], !dbg !72 + %4277 = phi float [ %2525, %._crit_edge ], [ %4150, %__nv_exp2f.exit1476 ], !dbg !72 + %4278 = phi float [ %2526, %._crit_edge ], [ %4151, %__nv_exp2f.exit1476 ], !dbg !72 + %4279 = phi float [ %2527, %._crit_edge ], [ %4152, %__nv_exp2f.exit1476 ], !dbg !72 + %4280 = phi float [ %2528, %._crit_edge ], [ %4153, %__nv_exp2f.exit1476 ], !dbg !72 + %4281 = phi float [ %2529, %._crit_edge ], [ %4154, %__nv_exp2f.exit1476 ], !dbg !72 + %4282 = phi float [ %2530, %._crit_edge ], [ %4155, %__nv_exp2f.exit1476 ], !dbg !72 + %4283 = phi float [ %2531, %._crit_edge ], [ %4156, %__nv_exp2f.exit1476 ], !dbg !72 + %4284 = phi float [ %2532, %._crit_edge ], [ %4157, %__nv_exp2f.exit1476 ], !dbg !72 + %4285 = phi float [ %2533, %._crit_edge ], [ %4158, %__nv_exp2f.exit1476 ], !dbg !72 + %4286 = phi float [ %2534, %._crit_edge ], [ %4159, %__nv_exp2f.exit1476 ], !dbg !72 + %4287 = phi float [ %2535, %._crit_edge ], [ %4160, %__nv_exp2f.exit1476 ], !dbg !72 + %4288 = phi float [ %2536, %._crit_edge ], [ %4161, %__nv_exp2f.exit1476 ], !dbg !72 + %4289 = phi float [ %2537, %._crit_edge ], [ %4162, %__nv_exp2f.exit1476 ], !dbg !72 + %4290 = phi float [ %2538, %._crit_edge ], [ %4163, %__nv_exp2f.exit1476 ], !dbg !72 + %4291 = phi float [ %2539, %._crit_edge ], [ %4164, %__nv_exp2f.exit1476 ], !dbg !72 + %4292 = phi float [ %2540, %._crit_edge ], [ %4165, %__nv_exp2f.exit1476 ], !dbg !72 + %4293 = phi float [ %2541, %._crit_edge ], [ %4166, %__nv_exp2f.exit1476 ], !dbg !72 + %4294 = phi float [ %2542, %._crit_edge ], [ %4167, %__nv_exp2f.exit1476 ], !dbg !72 + %4295 = phi float [ %2543, %._crit_edge ], [ %4168, %__nv_exp2f.exit1476 ], !dbg !72 + %4296 = phi float [ %2544, %._crit_edge ], [ %4169, %__nv_exp2f.exit1476 ], !dbg !72 + %4297 = phi float [ %2545, %._crit_edge ], [ %4170, %__nv_exp2f.exit1476 ], !dbg !72 + %4298 = phi float [ %2546, %._crit_edge ], [ %4171, %__nv_exp2f.exit1476 ], !dbg !72 + %4299 = phi float [ %2547, %._crit_edge ], [ %4172, %__nv_exp2f.exit1476 ], !dbg !72 + %4300 = phi float [ %2548, %._crit_edge ], [ %4173, %__nv_exp2f.exit1476 ], !dbg !72 + %4301 = phi float [ %2549, %._crit_edge ], [ %4174, %__nv_exp2f.exit1476 ], !dbg !72 + %4302 = phi float [ %2550, %._crit_edge ], [ %4175, %__nv_exp2f.exit1476 ], !dbg !72 + %4303 = phi float [ %2551, %._crit_edge ], [ %4176, %__nv_exp2f.exit1476 ], !dbg !72 + %4304 = phi float [ %2552, %._crit_edge ], [ %4177, %__nv_exp2f.exit1476 ], !dbg !72 + %4305 = phi float [ %2553, %._crit_edge ], [ %4178, %__nv_exp2f.exit1476 ], !dbg !72 + %4306 = phi float [ %2554, %._crit_edge ], [ %4179, %__nv_exp2f.exit1476 ], !dbg !72 + %4307 = phi float [ %2555, %._crit_edge ], [ %4180, %__nv_exp2f.exit1476 ], !dbg !72 + %4308 = phi float [ %2556, %._crit_edge ], [ %4181, %__nv_exp2f.exit1476 ], !dbg !72 + %4309 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %4245, float %4246, float %4247, float %4248, float %4249, float %4250, float %4251, float %4252, float %4253, float %4254, float %4255, float %4256, float %4257, float %4258, float %4259, float %4260, float %4261, float %4262, float %4263, float %4264, float %4265, float %4266, float %4267, float %4268, float %4269, float %4270, float %4271, float %4272, float %4273, float %4274, float %4275, float %4276, float %4277, float %4278, float %4279, float %4280, float %4281, float %4282, float %4283, float %4284, float %4285, float %4286, float %4287, float %4288, float %4289, float %4290, float %4291, float %4292, float %4293, float %4294, float %4295, float %4296, float %4297, float %4298, float %4299, float %4300, float %4301, float %4302, float %4303, float %4304, float %4305, float %4306, float %4307, float %4308) #3, !dbg !154 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !154 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !154 + %4310 = getelementptr bfloat, ptr addrspace(1) %80, i64 %100, !dbg !187 + %4311 = getelementptr bfloat, ptr addrspace(1) %80, i64 %102, !dbg !187 + %4312 = getelementptr bfloat, ptr addrspace(1) %80, i64 %104, !dbg !187 + %4313 = getelementptr bfloat, ptr addrspace(1) %80, i64 %106, !dbg !187 + %4314 = getelementptr bfloat, ptr addrspace(1) %80, i64 %108, !dbg !187 + %4315 = getelementptr bfloat, ptr addrspace(1) %80, i64 %110, !dbg !187 + %4316 = getelementptr bfloat, ptr addrspace(1) %80, i64 %112, !dbg !187 + %4317 = getelementptr bfloat, ptr addrspace(1) %80, i64 %114, !dbg !187 + %4318 = getelementptr bfloat, ptr addrspace(1) %4310, i64 %118, !dbg !188 + %4319 = getelementptr bfloat, ptr addrspace(1) %4311, i64 %118, !dbg !188 + %4320 = getelementptr bfloat, ptr addrspace(1) %4312, i64 %118, !dbg !188 + %4321 = getelementptr bfloat, ptr addrspace(1) %4313, i64 %118, !dbg !188 + %4322 = getelementptr bfloat, ptr addrspace(1) %4314, i64 %118, !dbg !188 + %4323 = getelementptr bfloat, ptr addrspace(1) %4315, i64 %118, !dbg !188 + %4324 = getelementptr bfloat, ptr addrspace(1) %4316, i64 %118, !dbg !188 + %4325 = getelementptr bfloat, ptr addrspace(1) %4317, i64 %118, !dbg !188 + %4326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 0, !dbg !189 + %4327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 1, !dbg !189 + %4328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 2, !dbg !189 + %4329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 3, !dbg !189 + %4330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 4, !dbg !189 + %4331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 5, !dbg !189 + %4332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 6, !dbg !189 + %4333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 7, !dbg !189 + %4334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 8, !dbg !189 + %4335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 9, !dbg !189 + %4336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 10, !dbg !189 + %4337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 11, !dbg !189 + %4338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 12, !dbg !189 + %4339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 13, !dbg !189 + %4340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 14, !dbg !189 + %4341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 15, !dbg !189 + %4342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 16, !dbg !189 + %4343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 17, !dbg !189 + %4344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 18, !dbg !189 + %4345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 19, !dbg !189 + %4346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 20, !dbg !189 + %4347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 21, !dbg !189 + %4348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 22, !dbg !189 + %4349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 23, !dbg !189 + %4350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 24, !dbg !189 + %4351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 25, !dbg !189 + %4352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 26, !dbg !189 + %4353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 27, !dbg !189 + %4354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 28, !dbg !189 + %4355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 29, !dbg !189 + %4356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 30, !dbg !189 + %4357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 31, !dbg !189 + %4358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 32, !dbg !189 + %4359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 33, !dbg !189 + %4360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 34, !dbg !189 + %4361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 35, !dbg !189 + %4362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 36, !dbg !189 + %4363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 37, !dbg !189 + %4364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 38, !dbg !189 + %4365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 39, !dbg !189 + %4366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 40, !dbg !189 + %4367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 41, !dbg !189 + %4368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 42, !dbg !189 + %4369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 43, !dbg !189 + %4370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 44, !dbg !189 + %4371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 45, !dbg !189 + %4372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 46, !dbg !189 + %4373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 47, !dbg !189 + %4374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 48, !dbg !189 + %4375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 49, !dbg !189 + %4376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 50, !dbg !189 + %4377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 51, !dbg !189 + %4378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 52, !dbg !189 + %4379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 53, !dbg !189 + %4380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 54, !dbg !189 + %4381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 55, !dbg !189 + %4382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 56, !dbg !189 + %4383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 57, !dbg !189 + %4384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 58, !dbg !189 + %4385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 59, !dbg !189 + %4386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 60, !dbg !189 + %4387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 61, !dbg !189 + %4388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 62, !dbg !189 + %4389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %4309, 63, !dbg !189 + %4390 = insertelement <2 x float> poison, float %4326, i64 0, !dbg !189 + %4391 = insertelement <2 x float> %4390, float %4327, i64 1, !dbg !189 + %4392 = fmul <2 x float> %4391, splat (float 0x3FB6A09E60000000), !dbg !189 + %4393 = fptrunc <2 x float> %4392 to <2 x bfloat>, !dbg !190 + %4394 = insertelement <2 x float> poison, float %4328, i64 0, !dbg !189 + %4395 = insertelement <2 x float> %4394, float %4329, i64 1, !dbg !189 + %4396 = fmul <2 x float> %4395, splat (float 0x3FB6A09E60000000), !dbg !189 + %4397 = fptrunc <2 x float> %4396 to <2 x bfloat>, !dbg !190 + %4398 = insertelement <2 x float> poison, float %4330, i64 0, !dbg !189 + %4399 = insertelement <2 x float> %4398, float %4331, i64 1, !dbg !189 + %4400 = fmul <2 x float> %4399, splat (float 0x3FB6A09E60000000), !dbg !189 + %4401 = fptrunc <2 x float> %4400 to <2 x bfloat>, !dbg !190 + %4402 = insertelement <2 x float> poison, float %4332, i64 0, !dbg !189 + %4403 = insertelement <2 x float> %4402, float %4333, i64 1, !dbg !189 + %4404 = fmul <2 x float> %4403, splat (float 0x3FB6A09E60000000), !dbg !189 + %4405 = fptrunc <2 x float> %4404 to <2 x bfloat>, !dbg !190 + %4406 = insertelement <2 x float> poison, float %4334, i64 0, !dbg !189 + %4407 = insertelement <2 x float> %4406, float %4335, i64 1, !dbg !189 + %4408 = fmul <2 x float> %4407, splat (float 0x3FB6A09E60000000), !dbg !189 + %4409 = fptrunc <2 x float> %4408 to <2 x bfloat>, !dbg !190 + %4410 = insertelement <2 x float> poison, float %4336, i64 0, !dbg !189 + %4411 = insertelement <2 x float> %4410, float %4337, i64 1, !dbg !189 + %4412 = fmul <2 x float> %4411, splat (float 0x3FB6A09E60000000), !dbg !189 + %4413 = fptrunc <2 x float> %4412 to <2 x bfloat>, !dbg !190 + %4414 = insertelement <2 x float> poison, float %4338, i64 0, !dbg !189 + %4415 = insertelement <2 x float> %4414, float %4339, i64 1, !dbg !189 + %4416 = fmul <2 x float> %4415, splat (float 0x3FB6A09E60000000), !dbg !189 + %4417 = fptrunc <2 x float> %4416 to <2 x bfloat>, !dbg !190 + %4418 = insertelement <2 x float> poison, float %4340, i64 0, !dbg !189 + %4419 = insertelement <2 x float> %4418, float %4341, i64 1, !dbg !189 + %4420 = fmul <2 x float> %4419, splat (float 0x3FB6A09E60000000), !dbg !189 + %4421 = fptrunc <2 x float> %4420 to <2 x bfloat>, !dbg !190 + %4422 = insertelement <2 x float> poison, float %4342, i64 0, !dbg !189 + %4423 = insertelement <2 x float> %4422, float %4343, i64 1, !dbg !189 + %4424 = fmul <2 x float> %4423, splat (float 0x3FB6A09E60000000), !dbg !189 + %4425 = fptrunc <2 x float> %4424 to <2 x bfloat>, !dbg !190 + %4426 = insertelement <2 x float> poison, float %4344, i64 0, !dbg !189 + %4427 = insertelement <2 x float> %4426, float %4345, i64 1, !dbg !189 + %4428 = fmul <2 x float> %4427, splat (float 0x3FB6A09E60000000), !dbg !189 + %4429 = fptrunc <2 x float> %4428 to <2 x bfloat>, !dbg !190 + %4430 = insertelement <2 x float> poison, float %4346, i64 0, !dbg !189 + %4431 = insertelement <2 x float> %4430, float %4347, i64 1, !dbg !189 + %4432 = fmul <2 x float> %4431, splat (float 0x3FB6A09E60000000), !dbg !189 + %4433 = fptrunc <2 x float> %4432 to <2 x bfloat>, !dbg !190 + %4434 = insertelement <2 x float> poison, float %4348, i64 0, !dbg !189 + %4435 = insertelement <2 x float> %4434, float %4349, i64 1, !dbg !189 + %4436 = fmul <2 x float> %4435, splat (float 0x3FB6A09E60000000), !dbg !189 + %4437 = fptrunc <2 x float> %4436 to <2 x bfloat>, !dbg !190 + %4438 = insertelement <2 x float> poison, float %4350, i64 0, !dbg !189 + %4439 = insertelement <2 x float> %4438, float %4351, i64 1, !dbg !189 + %4440 = fmul <2 x float> %4439, splat (float 0x3FB6A09E60000000), !dbg !189 + %4441 = fptrunc <2 x float> %4440 to <2 x bfloat>, !dbg !190 + %4442 = insertelement <2 x float> poison, float %4352, i64 0, !dbg !189 + %4443 = insertelement <2 x float> %4442, float %4353, i64 1, !dbg !189 + %4444 = fmul <2 x float> %4443, splat (float 0x3FB6A09E60000000), !dbg !189 + %4445 = fptrunc <2 x float> %4444 to <2 x bfloat>, !dbg !190 + %4446 = insertelement <2 x float> poison, float %4354, i64 0, !dbg !189 + %4447 = insertelement <2 x float> %4446, float %4355, i64 1, !dbg !189 + %4448 = fmul <2 x float> %4447, splat (float 0x3FB6A09E60000000), !dbg !189 + %4449 = fptrunc <2 x float> %4448 to <2 x bfloat>, !dbg !190 + %4450 = insertelement <2 x float> poison, float %4356, i64 0, !dbg !189 + %4451 = insertelement <2 x float> %4450, float %4357, i64 1, !dbg !189 + %4452 = fmul <2 x float> %4451, splat (float 0x3FB6A09E60000000), !dbg !189 + %4453 = fptrunc <2 x float> %4452 to <2 x bfloat>, !dbg !190 + %4454 = insertelement <2 x float> poison, float %4358, i64 0, !dbg !189 + %4455 = insertelement <2 x float> %4454, float %4359, i64 1, !dbg !189 + %4456 = fmul <2 x float> %4455, splat (float 0x3FB6A09E60000000), !dbg !189 + %4457 = fptrunc <2 x float> %4456 to <2 x bfloat>, !dbg !190 + %4458 = insertelement <2 x float> poison, float %4360, i64 0, !dbg !189 + %4459 = insertelement <2 x float> %4458, float %4361, i64 1, !dbg !189 + %4460 = fmul <2 x float> %4459, splat (float 0x3FB6A09E60000000), !dbg !189 + %4461 = fptrunc <2 x float> %4460 to <2 x bfloat>, !dbg !190 + %4462 = insertelement <2 x float> poison, float %4362, i64 0, !dbg !189 + %4463 = insertelement <2 x float> %4462, float %4363, i64 1, !dbg !189 + %4464 = fmul <2 x float> %4463, splat (float 0x3FB6A09E60000000), !dbg !189 + %4465 = fptrunc <2 x float> %4464 to <2 x bfloat>, !dbg !190 + %4466 = insertelement <2 x float> poison, float %4364, i64 0, !dbg !189 + %4467 = insertelement <2 x float> %4466, float %4365, i64 1, !dbg !189 + %4468 = fmul <2 x float> %4467, splat (float 0x3FB6A09E60000000), !dbg !189 + %4469 = fptrunc <2 x float> %4468 to <2 x bfloat>, !dbg !190 + %4470 = insertelement <2 x float> poison, float %4366, i64 0, !dbg !189 + %4471 = insertelement <2 x float> %4470, float %4367, i64 1, !dbg !189 + %4472 = fmul <2 x float> %4471, splat (float 0x3FB6A09E60000000), !dbg !189 + %4473 = fptrunc <2 x float> %4472 to <2 x bfloat>, !dbg !190 + %4474 = insertelement <2 x float> poison, float %4368, i64 0, !dbg !189 + %4475 = insertelement <2 x float> %4474, float %4369, i64 1, !dbg !189 + %4476 = fmul <2 x float> %4475, splat (float 0x3FB6A09E60000000), !dbg !189 + %4477 = fptrunc <2 x float> %4476 to <2 x bfloat>, !dbg !190 + %4478 = insertelement <2 x float> poison, float %4370, i64 0, !dbg !189 + %4479 = insertelement <2 x float> %4478, float %4371, i64 1, !dbg !189 + %4480 = fmul <2 x float> %4479, splat (float 0x3FB6A09E60000000), !dbg !189 + %4481 = fptrunc <2 x float> %4480 to <2 x bfloat>, !dbg !190 + %4482 = insertelement <2 x float> poison, float %4372, i64 0, !dbg !189 + %4483 = insertelement <2 x float> %4482, float %4373, i64 1, !dbg !189 + %4484 = fmul <2 x float> %4483, splat (float 0x3FB6A09E60000000), !dbg !189 + %4485 = fptrunc <2 x float> %4484 to <2 x bfloat>, !dbg !190 + %4486 = insertelement <2 x float> poison, float %4374, i64 0, !dbg !189 + %4487 = insertelement <2 x float> %4486, float %4375, i64 1, !dbg !189 + %4488 = fmul <2 x float> %4487, splat (float 0x3FB6A09E60000000), !dbg !189 + %4489 = fptrunc <2 x float> %4488 to <2 x bfloat>, !dbg !190 + %4490 = insertelement <2 x float> poison, float %4376, i64 0, !dbg !189 + %4491 = insertelement <2 x float> %4490, float %4377, i64 1, !dbg !189 + %4492 = fmul <2 x float> %4491, splat (float 0x3FB6A09E60000000), !dbg !189 + %4493 = fptrunc <2 x float> %4492 to <2 x bfloat>, !dbg !190 + %4494 = insertelement <2 x float> poison, float %4378, i64 0, !dbg !189 + %4495 = insertelement <2 x float> %4494, float %4379, i64 1, !dbg !189 + %4496 = fmul <2 x float> %4495, splat (float 0x3FB6A09E60000000), !dbg !189 + %4497 = fptrunc <2 x float> %4496 to <2 x bfloat>, !dbg !190 + %4498 = insertelement <2 x float> poison, float %4380, i64 0, !dbg !189 + %4499 = insertelement <2 x float> %4498, float %4381, i64 1, !dbg !189 + %4500 = fmul <2 x float> %4499, splat (float 0x3FB6A09E60000000), !dbg !189 + %4501 = fptrunc <2 x float> %4500 to <2 x bfloat>, !dbg !190 + %4502 = insertelement <2 x float> poison, float %4382, i64 0, !dbg !189 + %4503 = insertelement <2 x float> %4502, float %4383, i64 1, !dbg !189 + %4504 = fmul <2 x float> %4503, splat (float 0x3FB6A09E60000000), !dbg !189 + %4505 = fptrunc <2 x float> %4504 to <2 x bfloat>, !dbg !190 + %4506 = insertelement <2 x float> poison, float %4384, i64 0, !dbg !189 + %4507 = insertelement <2 x float> %4506, float %4385, i64 1, !dbg !189 + %4508 = fmul <2 x float> %4507, splat (float 0x3FB6A09E60000000), !dbg !189 + %4509 = fptrunc <2 x float> %4508 to <2 x bfloat>, !dbg !190 + %4510 = insertelement <2 x float> poison, float %4386, i64 0, !dbg !189 + %4511 = insertelement <2 x float> %4510, float %4387, i64 1, !dbg !189 + %4512 = fmul <2 x float> %4511, splat (float 0x3FB6A09E60000000), !dbg !189 + %4513 = fptrunc <2 x float> %4512 to <2 x bfloat>, !dbg !190 + %4514 = insertelement <2 x float> poison, float %4388, i64 0, !dbg !189 + %4515 = insertelement <2 x float> %4514, float %4389, i64 1, !dbg !189 + %4516 = fmul <2 x float> %4515, splat (float 0x3FB6A09E60000000), !dbg !189 + %4517 = fptrunc <2 x float> %4516 to <2 x bfloat>, !dbg !190 + %4518 = shl nuw nsw i32 %339, 13, !dbg !190 + %4519 = shl nuw nsw i32 %43, 5, !dbg !190 + %4520 = and i32 %4519, 7264, !dbg !190 + %4521 = and i32 %43, 24, !dbg !190 + %4522 = shl nuw nsw i32 %4521, 4, !dbg !190 + %4523 = shl nuw nsw i32 %43, 2, !dbg !190 + %4524 = and i32 %4523, 16, !dbg !190 + %4525 = or disjoint i32 %4518, %4524, !dbg !190 + %4526 = or disjoint i32 %4520, %4522, !dbg !190 + %4527 = or disjoint i32 %4525, %4526, !dbg !190 + %4528 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4527, !dbg !190 + %4529 = bitcast <2 x bfloat> %4393 to i32, !dbg !190 + %4530 = bitcast <2 x bfloat> %4401 to i32, !dbg !190 + %4531 = bitcast <2 x bfloat> %4409 to i32, !dbg !190 + %4532 = bitcast <2 x bfloat> %4417 to i32, !dbg !190 + %4533 = insertelement <4 x i32> poison, i32 %4529, i64 0, !dbg !190 + %4534 = insertelement <4 x i32> %4533, i32 %4530, i64 1, !dbg !190 + %4535 = insertelement <4 x i32> %4534, i32 %4531, i64 2, !dbg !190 + %4536 = insertelement <4 x i32> %4535, i32 %4532, i64 3, !dbg !190 + store <4 x i32> %4536, ptr addrspace(3) %4528, align 16, !dbg !190 + %4537 = getelementptr inbounds nuw i8, ptr addrspace(3) %4528, i32 512, !dbg !190 + %4538 = bitcast <2 x bfloat> %4397 to i32, !dbg !190 + %4539 = bitcast <2 x bfloat> %4405 to i32, !dbg !190 + %4540 = bitcast <2 x bfloat> %4413 to i32, !dbg !190 + %4541 = bitcast <2 x bfloat> %4421 to i32, !dbg !190 + %4542 = insertelement <4 x i32> poison, i32 %4538, i64 0, !dbg !190 + %4543 = insertelement <4 x i32> %4542, i32 %4539, i64 1, !dbg !190 + %4544 = insertelement <4 x i32> %4543, i32 %4540, i64 2, !dbg !190 + %4545 = insertelement <4 x i32> %4544, i32 %4541, i64 3, !dbg !190 + store <4 x i32> %4545, ptr addrspace(3) %4537, align 16, !dbg !190 + %4546 = xor i32 %4527, 32, !dbg !190 + %4547 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4546, !dbg !190 + %4548 = bitcast <2 x bfloat> %4425 to i32, !dbg !190 + %4549 = bitcast <2 x bfloat> %4433 to i32, !dbg !190 + %4550 = bitcast <2 x bfloat> %4441 to i32, !dbg !190 + %4551 = bitcast <2 x bfloat> %4449 to i32, !dbg !190 + %4552 = insertelement <4 x i32> poison, i32 %4548, i64 0, !dbg !190 + %4553 = insertelement <4 x i32> %4552, i32 %4549, i64 1, !dbg !190 + %4554 = insertelement <4 x i32> %4553, i32 %4550, i64 2, !dbg !190 + %4555 = insertelement <4 x i32> %4554, i32 %4551, i64 3, !dbg !190 + store <4 x i32> %4555, ptr addrspace(3) %4547, align 16, !dbg !190 + %4556 = getelementptr inbounds nuw i8, ptr addrspace(3) %4547, i32 512, !dbg !190 + %4557 = bitcast <2 x bfloat> %4429 to i32, !dbg !190 + %4558 = bitcast <2 x bfloat> %4437 to i32, !dbg !190 + %4559 = bitcast <2 x bfloat> %4445 to i32, !dbg !190 + %4560 = bitcast <2 x bfloat> %4453 to i32, !dbg !190 + %4561 = insertelement <4 x i32> poison, i32 %4557, i64 0, !dbg !190 + %4562 = insertelement <4 x i32> %4561, i32 %4558, i64 1, !dbg !190 + %4563 = insertelement <4 x i32> %4562, i32 %4559, i64 2, !dbg !190 + %4564 = insertelement <4 x i32> %4563, i32 %4560, i64 3, !dbg !190 + store <4 x i32> %4564, ptr addrspace(3) %4556, align 16, !dbg !190 + %4565 = xor i32 %4527, 64, !dbg !190 + %4566 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4565, !dbg !190 + %4567 = bitcast <2 x bfloat> %4457 to i32, !dbg !190 + %4568 = bitcast <2 x bfloat> %4465 to i32, !dbg !190 + %4569 = bitcast <2 x bfloat> %4473 to i32, !dbg !190 + %4570 = bitcast <2 x bfloat> %4481 to i32, !dbg !190 + %4571 = insertelement <4 x i32> poison, i32 %4567, i64 0, !dbg !190 + %4572 = insertelement <4 x i32> %4571, i32 %4568, i64 1, !dbg !190 + %4573 = insertelement <4 x i32> %4572, i32 %4569, i64 2, !dbg !190 + %4574 = insertelement <4 x i32> %4573, i32 %4570, i64 3, !dbg !190 + store <4 x i32> %4574, ptr addrspace(3) %4566, align 16, !dbg !190 + %4575 = getelementptr inbounds nuw i8, ptr addrspace(3) %4566, i32 512, !dbg !190 + %4576 = bitcast <2 x bfloat> %4461 to i32, !dbg !190 + %4577 = bitcast <2 x bfloat> %4469 to i32, !dbg !190 + %4578 = bitcast <2 x bfloat> %4477 to i32, !dbg !190 + %4579 = bitcast <2 x bfloat> %4485 to i32, !dbg !190 + %4580 = insertelement <4 x i32> poison, i32 %4576, i64 0, !dbg !190 + %4581 = insertelement <4 x i32> %4580, i32 %4577, i64 1, !dbg !190 + %4582 = insertelement <4 x i32> %4581, i32 %4578, i64 2, !dbg !190 + %4583 = insertelement <4 x i32> %4582, i32 %4579, i64 3, !dbg !190 + store <4 x i32> %4583, ptr addrspace(3) %4575, align 16, !dbg !190 + %4584 = xor i32 %4527, 96, !dbg !190 + %4585 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4584, !dbg !190 + %4586 = bitcast <2 x bfloat> %4489 to i32, !dbg !190 + %4587 = bitcast <2 x bfloat> %4497 to i32, !dbg !190 + %4588 = bitcast <2 x bfloat> %4505 to i32, !dbg !190 + %4589 = bitcast <2 x bfloat> %4513 to i32, !dbg !190 + %4590 = insertelement <4 x i32> poison, i32 %4586, i64 0, !dbg !190 + %4591 = insertelement <4 x i32> %4590, i32 %4587, i64 1, !dbg !190 + %4592 = insertelement <4 x i32> %4591, i32 %4588, i64 2, !dbg !190 + %4593 = insertelement <4 x i32> %4592, i32 %4589, i64 3, !dbg !190 + store <4 x i32> %4593, ptr addrspace(3) %4585, align 16, !dbg !190 + %4594 = getelementptr inbounds nuw i8, ptr addrspace(3) %4585, i32 512, !dbg !190 + %4595 = bitcast <2 x bfloat> %4493 to i32, !dbg !190 + %4596 = bitcast <2 x bfloat> %4501 to i32, !dbg !190 + %4597 = bitcast <2 x bfloat> %4509 to i32, !dbg !190 + %4598 = bitcast <2 x bfloat> %4517 to i32, !dbg !190 + %4599 = insertelement <4 x i32> poison, i32 %4595, i64 0, !dbg !190 + %4600 = insertelement <4 x i32> %4599, i32 %4596, i64 1, !dbg !190 + %4601 = insertelement <4 x i32> %4600, i32 %4597, i64 2, !dbg !190 + %4602 = insertelement <4 x i32> %4601, i32 %4598, i64 3, !dbg !190 + store <4 x i32> %4602, ptr addrspace(3) %4594, align 16, !dbg !190 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !190 + %4603 = shl nuw nsw i32 %4521, 10, !dbg !190 + %4604 = shl nuw nsw i32 %339, 5, !dbg !190 + %4605 = and i32 %4523, 1008, !dbg !190 + %4606 = or disjoint i32 %4603, %4604, !dbg !190 + %4607 = xor i32 %4606, %4605, !dbg !190 + %4608 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4607, !dbg !190 + %4609 = ptrtoint ptr addrspace(3) %4608 to i32, !dbg !190 + %4610 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4609) #3, !dbg !190 + %4611 = extractvalue { i32, i32, i32, i32 } %4610, 0, !dbg !190 + %4612 = extractvalue { i32, i32, i32, i32 } %4610, 1, !dbg !190 + %4613 = extractvalue { i32, i32, i32, i32 } %4610, 2, !dbg !190 + %4614 = extractvalue { i32, i32, i32, i32 } %4610, 3, !dbg !190 + %4615 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 1024, !dbg !190 + %4616 = ptrtoint ptr addrspace(3) %4615 to i32, !dbg !190 + %4617 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4616) #3, !dbg !190 + %4618 = extractvalue { i32, i32, i32, i32 } %4617, 0, !dbg !190 + %4619 = extractvalue { i32, i32, i32, i32 } %4617, 1, !dbg !190 + %4620 = extractvalue { i32, i32, i32, i32 } %4617, 2, !dbg !190 + %4621 = extractvalue { i32, i32, i32, i32 } %4617, 3, !dbg !190 + %4622 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 2048, !dbg !190 + %4623 = ptrtoint ptr addrspace(3) %4622 to i32, !dbg !190 + %4624 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4623) #3, !dbg !190 + %4625 = extractvalue { i32, i32, i32, i32 } %4624, 0, !dbg !190 + %4626 = extractvalue { i32, i32, i32, i32 } %4624, 1, !dbg !190 + %4627 = extractvalue { i32, i32, i32, i32 } %4624, 2, !dbg !190 + %4628 = extractvalue { i32, i32, i32, i32 } %4624, 3, !dbg !190 + %4629 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 3072, !dbg !190 + %4630 = ptrtoint ptr addrspace(3) %4629 to i32, !dbg !190 + %4631 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4630) #3, !dbg !190 + %4632 = extractvalue { i32, i32, i32, i32 } %4631, 0, !dbg !190 + %4633 = extractvalue { i32, i32, i32, i32 } %4631, 1, !dbg !190 + %4634 = extractvalue { i32, i32, i32, i32 } %4631, 2, !dbg !190 + %4635 = extractvalue { i32, i32, i32, i32 } %4631, 3, !dbg !190 + %4636 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 4096, !dbg !190 + %4637 = ptrtoint ptr addrspace(3) %4636 to i32, !dbg !190 + %4638 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4637) #3, !dbg !190 + %4639 = extractvalue { i32, i32, i32, i32 } %4638, 0, !dbg !190 + %4640 = extractvalue { i32, i32, i32, i32 } %4638, 1, !dbg !190 + %4641 = extractvalue { i32, i32, i32, i32 } %4638, 2, !dbg !190 + %4642 = extractvalue { i32, i32, i32, i32 } %4638, 3, !dbg !190 + %4643 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 5120, !dbg !190 + %4644 = ptrtoint ptr addrspace(3) %4643 to i32, !dbg !190 + %4645 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4644) #3, !dbg !190 + %4646 = extractvalue { i32, i32, i32, i32 } %4645, 0, !dbg !190 + %4647 = extractvalue { i32, i32, i32, i32 } %4645, 1, !dbg !190 + %4648 = extractvalue { i32, i32, i32, i32 } %4645, 2, !dbg !190 + %4649 = extractvalue { i32, i32, i32, i32 } %4645, 3, !dbg !190 + %4650 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 6144, !dbg !190 + %4651 = ptrtoint ptr addrspace(3) %4650 to i32, !dbg !190 + %4652 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4651) #3, !dbg !190 + %4653 = extractvalue { i32, i32, i32, i32 } %4652, 0, !dbg !190 + %4654 = extractvalue { i32, i32, i32, i32 } %4652, 1, !dbg !190 + %4655 = extractvalue { i32, i32, i32, i32 } %4652, 2, !dbg !190 + %4656 = extractvalue { i32, i32, i32, i32 } %4652, 3, !dbg !190 + %4657 = getelementptr inbounds nuw i8, ptr addrspace(3) %4608, i32 7168, !dbg !190 + %4658 = ptrtoint ptr addrspace(3) %4657 to i32, !dbg !190 + %4659 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4658) #3, !dbg !190 + %4660 = extractvalue { i32, i32, i32, i32 } %4659, 0, !dbg !190 + %4661 = extractvalue { i32, i32, i32, i32 } %4659, 1, !dbg !190 + %4662 = extractvalue { i32, i32, i32, i32 } %4659, 2, !dbg !190 + %4663 = extractvalue { i32, i32, i32, i32 } %4659, 3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4611, i32 %4612, i32 %4613, i32 %4614, ptr addrspace(1) %4318, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4618, i32 %4619, i32 %4620, i32 %4621, ptr addrspace(1) %4319, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4625, i32 %4626, i32 %4627, i32 %4628, ptr addrspace(1) %4320, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4632, i32 %4633, i32 %4634, i32 %4635, ptr addrspace(1) %4321, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4639, i32 %4640, i32 %4641, i32 %4642, ptr addrspace(1) %4322, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4646, i32 %4647, i32 %4648, i32 %4649, ptr addrspace(1) %4323, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4653, i32 %4654, i32 %4655, i32 %4656, ptr addrspace(1) %4324, i1 true) #3, !dbg !190 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4660, i32 %4661, i32 %4662, i32 %4663, ptr addrspace(1) %4325, i1 true) #3, !dbg !190 + br label %11132, !dbg !31 + +4664: ; preds = %24 + %4665 = lshr exact i32 %45, 2, !dbg !29 + %4666 = shl nuw nsw i32 %27, 7, !dbg !191 + %4667 = or disjoint i32 %46, %4666, !dbg !192 + %4668 = or disjoint i32 %47, %4666, !dbg !192 + %4669 = or disjoint i32 %48, %4666, !dbg !192 + %4670 = or disjoint i32 %49, %4666, !dbg !192 + %4671 = or disjoint i32 %50, %4666, !dbg !192 + %4672 = or disjoint i32 %51, %4666, !dbg !192 + %4673 = or disjoint i32 %52, %4666, !dbg !192 + %4674 = or disjoint i32 %53, %4666, !dbg !192 + %4675 = shl i32 %4667, 7, !dbg !193 + %4676 = shl i32 %4668, 7, !dbg !193 + %4677 = shl i32 %4669, 7, !dbg !193 + %4678 = shl i32 %4670, 7, !dbg !193 + %4679 = shl i32 %4671, 7, !dbg !193 + %4680 = shl i32 %4672, 7, !dbg !193 + %4681 = shl i32 %4673, 7, !dbg !193 + %4682 = shl i32 %4674, 7, !dbg !193 + %4683 = sext i32 %4675 to i64, !dbg !195 + %4684 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4683, !dbg !195 + %4685 = sext i32 %4676 to i64, !dbg !195 + %4686 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4685, !dbg !195 + %4687 = sext i32 %4677 to i64, !dbg !195 + %4688 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4687, !dbg !195 + %4689 = sext i32 %4678 to i64, !dbg !195 + %4690 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4689, !dbg !195 + %4691 = sext i32 %4679 to i64, !dbg !195 + %4692 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4691, !dbg !195 + %4693 = sext i32 %4680 to i64, !dbg !195 + %4694 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4693, !dbg !195 + %4695 = sext i32 %4681 to i64, !dbg !195 + %4696 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4695, !dbg !195 + %4697 = sext i32 %4682 to i64, !dbg !195 + %4698 = getelementptr bfloat, ptr addrspace(1) %40, i64 %4697, !dbg !195 + %4699 = and i32 %43, 15, !dbg !196 + %4700 = shl nuw nsw i32 %4699, 3, !dbg !196 + %4701 = zext nneg i32 %4700 to i64, !dbg !197 + %4702 = getelementptr bfloat, ptr addrspace(1) %4684, i64 %4701, !dbg !197 + %4703 = getelementptr bfloat, ptr addrspace(1) %4686, i64 %4701, !dbg !197 + %4704 = getelementptr bfloat, ptr addrspace(1) %4688, i64 %4701, !dbg !197 + %4705 = getelementptr bfloat, ptr addrspace(1) %4690, i64 %4701, !dbg !197 + %4706 = getelementptr bfloat, ptr addrspace(1) %4692, i64 %4701, !dbg !197 + %4707 = getelementptr bfloat, ptr addrspace(1) %4694, i64 %4701, !dbg !197 + %4708 = getelementptr bfloat, ptr addrspace(1) %4696, i64 %4701, !dbg !197 + %4709 = getelementptr bfloat, ptr addrspace(1) %4698, i64 %4701, !dbg !197 + %4710 = icmp slt i32 %4667, %18, !dbg !198 + %4711 = icmp slt i32 %4668, %18, !dbg !198 + %4712 = icmp slt i32 %4669, %18, !dbg !198 + %4713 = icmp slt i32 %4670, %18, !dbg !198 + %4714 = icmp slt i32 %4671, %18, !dbg !198 + %4715 = icmp slt i32 %4672, %18, !dbg !198 + %4716 = icmp slt i32 %4673, %18, !dbg !198 + %4717 = icmp slt i32 %4674, %18, !dbg !198 + %4718 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4702, i1 %4710) #3, !dbg !199 + %4719 = extractvalue { i32, i32, i32, i32 } %4718, 0, !dbg !199 + %4720 = extractvalue { i32, i32, i32, i32 } %4718, 1, !dbg !199 + %4721 = extractvalue { i32, i32, i32, i32 } %4718, 2, !dbg !199 + %4722 = extractvalue { i32, i32, i32, i32 } %4718, 3, !dbg !199 + %4723 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4703, i1 %4711) #3, !dbg !199 + %4724 = extractvalue { i32, i32, i32, i32 } %4723, 0, !dbg !199 + %4725 = extractvalue { i32, i32, i32, i32 } %4723, 1, !dbg !199 + %4726 = extractvalue { i32, i32, i32, i32 } %4723, 2, !dbg !199 + %4727 = extractvalue { i32, i32, i32, i32 } %4723, 3, !dbg !199 + %4728 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4704, i1 %4712) #3, !dbg !199 + %4729 = extractvalue { i32, i32, i32, i32 } %4728, 0, !dbg !199 + %4730 = extractvalue { i32, i32, i32, i32 } %4728, 1, !dbg !199 + %4731 = extractvalue { i32, i32, i32, i32 } %4728, 2, !dbg !199 + %4732 = extractvalue { i32, i32, i32, i32 } %4728, 3, !dbg !199 + %4733 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4705, i1 %4713) #3, !dbg !199 + %4734 = extractvalue { i32, i32, i32, i32 } %4733, 0, !dbg !199 + %4735 = extractvalue { i32, i32, i32, i32 } %4733, 1, !dbg !199 + %4736 = extractvalue { i32, i32, i32, i32 } %4733, 2, !dbg !199 + %4737 = extractvalue { i32, i32, i32, i32 } %4733, 3, !dbg !199 + %4738 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4706, i1 %4714) #3, !dbg !199 + %4739 = extractvalue { i32, i32, i32, i32 } %4738, 0, !dbg !199 + %4740 = extractvalue { i32, i32, i32, i32 } %4738, 1, !dbg !199 + %4741 = extractvalue { i32, i32, i32, i32 } %4738, 2, !dbg !199 + %4742 = extractvalue { i32, i32, i32, i32 } %4738, 3, !dbg !199 + %4743 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4707, i1 %4715) #3, !dbg !199 + %4744 = extractvalue { i32, i32, i32, i32 } %4743, 0, !dbg !199 + %4745 = extractvalue { i32, i32, i32, i32 } %4743, 1, !dbg !199 + %4746 = extractvalue { i32, i32, i32, i32 } %4743, 2, !dbg !199 + %4747 = extractvalue { i32, i32, i32, i32 } %4743, 3, !dbg !199 + %4748 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4708, i1 %4716) #3, !dbg !199 + %4749 = extractvalue { i32, i32, i32, i32 } %4748, 0, !dbg !199 + %4750 = extractvalue { i32, i32, i32, i32 } %4748, 1, !dbg !199 + %4751 = extractvalue { i32, i32, i32, i32 } %4748, 2, !dbg !199 + %4752 = extractvalue { i32, i32, i32, i32 } %4748, 3, !dbg !199 + %4753 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4709, i1 %4717) #3, !dbg !199 + %4754 = extractvalue { i32, i32, i32, i32 } %4753, 0, !dbg !199 + %4755 = extractvalue { i32, i32, i32, i32 } %4753, 1, !dbg !199 + %4756 = extractvalue { i32, i32, i32, i32 } %4753, 2, !dbg !199 + %4757 = extractvalue { i32, i32, i32, i32 } %4753, 3, !dbg !199 + %4758 = shl nuw nsw i32 %43, 4, !dbg !199 + %4759 = and i32 %4758, 112, !dbg !199 + %4760 = shl nuw nsw i32 %45, 3, !dbg !199 + %4761 = and i32 %43, 112, !dbg !199 + %4762 = and i32 %43, 8, !dbg !199 + %4763 = shl nuw nsw i32 %4762, 11, !dbg !199 + %4764 = or disjoint i32 %4759, %4760, !dbg !199 + %4765 = xor i32 %4764, %4761, !dbg !199 + %4766 = or disjoint i32 %4765, %4763, !dbg !199 + %4767 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4766, !dbg !199 + %4768 = insertelement <4 x i32> poison, i32 %4719, i64 0, !dbg !199 + %4769 = insertelement <4 x i32> %4768, i32 %4720, i64 1, !dbg !199 + %4770 = insertelement <4 x i32> %4769, i32 %4721, i64 2, !dbg !199 + %4771 = insertelement <4 x i32> %4770, i32 %4722, i64 3, !dbg !199 + store <4 x i32> %4771, ptr addrspace(3) %4767, align 16, !dbg !199 + %4772 = or disjoint i32 %4766, 2048, !dbg !199 + %4773 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4772, !dbg !199 + %4774 = insertelement <4 x i32> poison, i32 %4724, i64 0, !dbg !199 + %4775 = insertelement <4 x i32> %4774, i32 %4725, i64 1, !dbg !199 + %4776 = insertelement <4 x i32> %4775, i32 %4726, i64 2, !dbg !199 + %4777 = insertelement <4 x i32> %4776, i32 %4727, i64 3, !dbg !199 + store <4 x i32> %4777, ptr addrspace(3) %4773, align 16, !dbg !199 + %4778 = or disjoint i32 %4766, 4096, !dbg !199 + %4779 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4778, !dbg !199 + %4780 = insertelement <4 x i32> poison, i32 %4729, i64 0, !dbg !199 + %4781 = insertelement <4 x i32> %4780, i32 %4730, i64 1, !dbg !199 + %4782 = insertelement <4 x i32> %4781, i32 %4731, i64 2, !dbg !199 + %4783 = insertelement <4 x i32> %4782, i32 %4732, i64 3, !dbg !199 + store <4 x i32> %4783, ptr addrspace(3) %4779, align 16, !dbg !199 + %4784 = or disjoint i32 %4766, 6144, !dbg !199 + %4785 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4784, !dbg !199 + %4786 = insertelement <4 x i32> poison, i32 %4734, i64 0, !dbg !199 + %4787 = insertelement <4 x i32> %4786, i32 %4735, i64 1, !dbg !199 + %4788 = insertelement <4 x i32> %4787, i32 %4736, i64 2, !dbg !199 + %4789 = insertelement <4 x i32> %4788, i32 %4737, i64 3, !dbg !199 + store <4 x i32> %4789, ptr addrspace(3) %4785, align 16, !dbg !199 + %4790 = or disjoint i32 %4766, 8192, !dbg !199 + %4791 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4790, !dbg !199 + %4792 = insertelement <4 x i32> poison, i32 %4739, i64 0, !dbg !199 + %4793 = insertelement <4 x i32> %4792, i32 %4740, i64 1, !dbg !199 + %4794 = insertelement <4 x i32> %4793, i32 %4741, i64 2, !dbg !199 + %4795 = insertelement <4 x i32> %4794, i32 %4742, i64 3, !dbg !199 + store <4 x i32> %4795, ptr addrspace(3) %4791, align 16, !dbg !199 + %4796 = or disjoint i32 %4766, 10240, !dbg !199 + %4797 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4796, !dbg !199 + %4798 = insertelement <4 x i32> poison, i32 %4744, i64 0, !dbg !199 + %4799 = insertelement <4 x i32> %4798, i32 %4745, i64 1, !dbg !199 + %4800 = insertelement <4 x i32> %4799, i32 %4746, i64 2, !dbg !199 + %4801 = insertelement <4 x i32> %4800, i32 %4747, i64 3, !dbg !199 + store <4 x i32> %4801, ptr addrspace(3) %4797, align 16, !dbg !199 + %4802 = or disjoint i32 %4766, 12288, !dbg !199 + %4803 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4802, !dbg !199 + %4804 = insertelement <4 x i32> poison, i32 %4749, i64 0, !dbg !199 + %4805 = insertelement <4 x i32> %4804, i32 %4750, i64 1, !dbg !199 + %4806 = insertelement <4 x i32> %4805, i32 %4751, i64 2, !dbg !199 + %4807 = insertelement <4 x i32> %4806, i32 %4752, i64 3, !dbg !199 + store <4 x i32> %4807, ptr addrspace(3) %4803, align 16, !dbg !199 + %4808 = or disjoint i32 %4766, 14336, !dbg !199 + %4809 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 %4808, !dbg !199 + %4810 = insertelement <4 x i32> poison, i32 %4754, i64 0, !dbg !199 + %4811 = insertelement <4 x i32> %4810, i32 %4755, i64 1, !dbg !199 + %4812 = insertelement <4 x i32> %4811, i32 %4756, i64 2, !dbg !199 + %4813 = insertelement <4 x i32> %4812, i32 %4757, i64 3, !dbg !199 + store <4 x i32> %4813, ptr addrspace(3) %4809, align 16, !dbg !199 + %4814 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4683, !dbg !200 + %4815 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4685, !dbg !200 + %4816 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4687, !dbg !200 + %4817 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4689, !dbg !200 + %4818 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4691, !dbg !200 + %4819 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4693, !dbg !200 + %4820 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4695, !dbg !200 + %4821 = getelementptr bfloat, ptr addrspace(1) %41, i64 %4697, !dbg !200 + %4822 = getelementptr bfloat, ptr addrspace(1) %4814, i64 %4701, !dbg !202 + %4823 = getelementptr bfloat, ptr addrspace(1) %4815, i64 %4701, !dbg !202 + %4824 = getelementptr bfloat, ptr addrspace(1) %4816, i64 %4701, !dbg !202 + %4825 = getelementptr bfloat, ptr addrspace(1) %4817, i64 %4701, !dbg !202 + %4826 = getelementptr bfloat, ptr addrspace(1) %4818, i64 %4701, !dbg !202 + %4827 = getelementptr bfloat, ptr addrspace(1) %4819, i64 %4701, !dbg !202 + %4828 = getelementptr bfloat, ptr addrspace(1) %4820, i64 %4701, !dbg !202 + %4829 = getelementptr bfloat, ptr addrspace(1) %4821, i64 %4701, !dbg !202 + %4830 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4822, i1 %4710) #3, !dbg !203 + %4831 = extractvalue { i32, i32, i32, i32 } %4830, 0, !dbg !203 + %4832 = extractvalue { i32, i32, i32, i32 } %4830, 1, !dbg !203 + %4833 = extractvalue { i32, i32, i32, i32 } %4830, 2, !dbg !203 + %4834 = extractvalue { i32, i32, i32, i32 } %4830, 3, !dbg !203 + %4835 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4823, i1 %4711) #3, !dbg !203 + %4836 = extractvalue { i32, i32, i32, i32 } %4835, 0, !dbg !203 + %4837 = extractvalue { i32, i32, i32, i32 } %4835, 1, !dbg !203 + %4838 = extractvalue { i32, i32, i32, i32 } %4835, 2, !dbg !203 + %4839 = extractvalue { i32, i32, i32, i32 } %4835, 3, !dbg !203 + %4840 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4824, i1 %4712) #3, !dbg !203 + %4841 = extractvalue { i32, i32, i32, i32 } %4840, 0, !dbg !203 + %4842 = extractvalue { i32, i32, i32, i32 } %4840, 1, !dbg !203 + %4843 = extractvalue { i32, i32, i32, i32 } %4840, 2, !dbg !203 + %4844 = extractvalue { i32, i32, i32, i32 } %4840, 3, !dbg !203 + %4845 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4825, i1 %4713) #3, !dbg !203 + %4846 = extractvalue { i32, i32, i32, i32 } %4845, 0, !dbg !203 + %4847 = extractvalue { i32, i32, i32, i32 } %4845, 1, !dbg !203 + %4848 = extractvalue { i32, i32, i32, i32 } %4845, 2, !dbg !203 + %4849 = extractvalue { i32, i32, i32, i32 } %4845, 3, !dbg !203 + %4850 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4826, i1 %4714) #3, !dbg !203 + %4851 = extractvalue { i32, i32, i32, i32 } %4850, 0, !dbg !203 + %4852 = extractvalue { i32, i32, i32, i32 } %4850, 1, !dbg !203 + %4853 = extractvalue { i32, i32, i32, i32 } %4850, 2, !dbg !203 + %4854 = extractvalue { i32, i32, i32, i32 } %4850, 3, !dbg !203 + %4855 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4827, i1 %4715) #3, !dbg !203 + %4856 = extractvalue { i32, i32, i32, i32 } %4855, 0, !dbg !203 + %4857 = extractvalue { i32, i32, i32, i32 } %4855, 1, !dbg !203 + %4858 = extractvalue { i32, i32, i32, i32 } %4855, 2, !dbg !203 + %4859 = extractvalue { i32, i32, i32, i32 } %4855, 3, !dbg !203 + %4860 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4828, i1 %4716) #3, !dbg !203 + %4861 = extractvalue { i32, i32, i32, i32 } %4860, 0, !dbg !203 + %4862 = extractvalue { i32, i32, i32, i32 } %4860, 1, !dbg !203 + %4863 = extractvalue { i32, i32, i32, i32 } %4860, 2, !dbg !203 + %4864 = extractvalue { i32, i32, i32, i32 } %4860, 3, !dbg !203 + %4865 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %4829, i1 %4717) #3, !dbg !203 + %4866 = extractvalue { i32, i32, i32, i32 } %4865, 0, !dbg !203 + %4867 = extractvalue { i32, i32, i32, i32 } %4865, 1, !dbg !203 + %4868 = extractvalue { i32, i32, i32, i32 } %4865, 2, !dbg !203 + %4869 = extractvalue { i32, i32, i32, i32 } %4865, 3, !dbg !203 + %4870 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4766, !dbg !203 + %4871 = insertelement <4 x i32> poison, i32 %4831, i64 0, !dbg !203 + %4872 = insertelement <4 x i32> %4871, i32 %4832, i64 1, !dbg !203 + %4873 = insertelement <4 x i32> %4872, i32 %4833, i64 2, !dbg !203 + %4874 = insertelement <4 x i32> %4873, i32 %4834, i64 3, !dbg !203 + store <4 x i32> %4874, ptr addrspace(3) %4870, align 16, !dbg !203 + %4875 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4772, !dbg !203 + %4876 = insertelement <4 x i32> poison, i32 %4836, i64 0, !dbg !203 + %4877 = insertelement <4 x i32> %4876, i32 %4837, i64 1, !dbg !203 + %4878 = insertelement <4 x i32> %4877, i32 %4838, i64 2, !dbg !203 + %4879 = insertelement <4 x i32> %4878, i32 %4839, i64 3, !dbg !203 + store <4 x i32> %4879, ptr addrspace(3) %4875, align 16, !dbg !203 + %4880 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4778, !dbg !203 + %4881 = insertelement <4 x i32> poison, i32 %4841, i64 0, !dbg !203 + %4882 = insertelement <4 x i32> %4881, i32 %4842, i64 1, !dbg !203 + %4883 = insertelement <4 x i32> %4882, i32 %4843, i64 2, !dbg !203 + %4884 = insertelement <4 x i32> %4883, i32 %4844, i64 3, !dbg !203 + store <4 x i32> %4884, ptr addrspace(3) %4880, align 16, !dbg !203 + %4885 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4784, !dbg !203 + %4886 = insertelement <4 x i32> poison, i32 %4846, i64 0, !dbg !203 + %4887 = insertelement <4 x i32> %4886, i32 %4847, i64 1, !dbg !203 + %4888 = insertelement <4 x i32> %4887, i32 %4848, i64 2, !dbg !203 + %4889 = insertelement <4 x i32> %4888, i32 %4849, i64 3, !dbg !203 + store <4 x i32> %4889, ptr addrspace(3) %4885, align 16, !dbg !203 + %4890 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4790, !dbg !203 + %4891 = insertelement <4 x i32> poison, i32 %4851, i64 0, !dbg !203 + %4892 = insertelement <4 x i32> %4891, i32 %4852, i64 1, !dbg !203 + %4893 = insertelement <4 x i32> %4892, i32 %4853, i64 2, !dbg !203 + %4894 = insertelement <4 x i32> %4893, i32 %4854, i64 3, !dbg !203 + store <4 x i32> %4894, ptr addrspace(3) %4890, align 16, !dbg !203 + %4895 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4796, !dbg !203 + %4896 = insertelement <4 x i32> poison, i32 %4856, i64 0, !dbg !203 + %4897 = insertelement <4 x i32> %4896, i32 %4857, i64 1, !dbg !203 + %4898 = insertelement <4 x i32> %4897, i32 %4858, i64 2, !dbg !203 + %4899 = insertelement <4 x i32> %4898, i32 %4859, i64 3, !dbg !203 + store <4 x i32> %4899, ptr addrspace(3) %4895, align 16, !dbg !203 + %4900 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4802, !dbg !203 + %4901 = insertelement <4 x i32> poison, i32 %4861, i64 0, !dbg !203 + %4902 = insertelement <4 x i32> %4901, i32 %4862, i64 1, !dbg !203 + %4903 = insertelement <4 x i32> %4902, i32 %4863, i64 2, !dbg !203 + %4904 = insertelement <4 x i32> %4903, i32 %4864, i64 3, !dbg !203 + store <4 x i32> %4904, ptr addrspace(3) %4900, align 16, !dbg !203 + %4905 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 %4808, !dbg !203 + %4906 = insertelement <4 x i32> poison, i32 %4866, i64 0, !dbg !203 + %4907 = insertelement <4 x i32> %4906, i32 %4867, i64 1, !dbg !203 + %4908 = insertelement <4 x i32> %4907, i32 %4868, i64 2, !dbg !203 + %4909 = insertelement <4 x i32> %4908, i32 %4869, i64 3, !dbg !203 + store <4 x i32> %4909, ptr addrspace(3) %4905, align 16, !dbg !203 + %4910 = shl nuw nsw i32 %31, 2, !dbg !204 + %4911 = shl i32 %30, 23, !dbg !205 + %4912 = mul i32 %20, %32, !dbg !206 + %4913 = add i32 %4912, %27, !dbg !207 + %4914 = mul i32 %21, %32, !dbg !208 + %reass.add1573 = add i32 %4914, %27, !dbg !209 + %reass.mul1574 = shl i32 %reass.add1573, 4, !dbg !209 + %4915 = sext i32 %reass.mul1574 to i64, !dbg !210 + %4916 = getelementptr i32, ptr addrspace(1) %11, i64 %4915, !dbg !210 + %4917 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4916, i1 true) #3, !dbg !211 + %4918 = shl i32 %4917, 7, !dbg !212 + %4919 = sext i32 %4913 to i64, !dbg !213 + %4920 = getelementptr i32, ptr addrspace(1) %10, i64 %4919, !dbg !213 + %4921 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4920, i1 true) #3, !dbg !214 + %4922 = and i32 %43, 3, !dbg !215 + %4923 = shl nuw nsw i32 %4922, 1, !dbg !215 + %4924 = or disjoint i32 %4923, 8, !dbg !215 + %4925 = or disjoint i32 %4923, 16, !dbg !215 + %4926 = or disjoint i32 %4923, 24, !dbg !215 + %4927 = or disjoint i32 %4923, 32, !dbg !215 + %4928 = or disjoint i32 %4923, 40, !dbg !215 + %4929 = or disjoint i32 %4923, 48, !dbg !215 + %4930 = or disjoint i32 %4923, 56, !dbg !215 + %4931 = shl nuw nsw i32 %4699, 2, !dbg !215 + %4932 = or disjoint i32 %4918, %4923, !dbg !216 + %4933 = or disjoint i32 %4918, %4924, !dbg !216 + %4934 = or disjoint i32 %4918, %4925, !dbg !216 + %4935 = or disjoint i32 %4918, %4926, !dbg !216 + %4936 = or disjoint i32 %4918, %4927, !dbg !216 + %4937 = or disjoint i32 %4918, %4928, !dbg !216 + %4938 = or disjoint i32 %4918, %4929, !dbg !216 + %4939 = or disjoint i32 %4918, %4930, !dbg !216 + %4940 = or disjoint i32 %4918, %4931, !dbg !216 + %4941 = or disjoint i32 %4940, 1, !dbg !216 + %4942 = insertelement <2 x i32> poison, i32 %4940, i64 0, !dbg !216 + %4943 = shufflevector <2 x i32> %4942, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !216 + %4944 = or disjoint <2 x i32> %4943, , !dbg !216 + %4945 = or disjoint i32 %4918, %46, !dbg !216 + %4946 = or disjoint i32 %4918, %47, !dbg !216 + %4947 = or disjoint i32 %4918, %48, !dbg !216 + %4948 = or disjoint i32 %4918, %49, !dbg !216 + %4949 = shl i32 %4945, 12, !dbg !217 + %4950 = shl i32 %4946, 12, !dbg !217 + %4951 = shl i32 %4947, 12, !dbg !217 + %4952 = shl i32 %4948, 12, !dbg !217 + %4953 = shl i32 %4945, 7, !dbg !219 + %4954 = shl i32 %4946, 7, !dbg !219 + %4955 = shl i32 %4947, 7, !dbg !219 + %4956 = shl i32 %4948, 7, !dbg !219 + %4957 = shl i32 %4921, 1, !dbg !220 + %4958 = tail call i32 @llvm.smin.i32(i32 %4957, i32 32), !dbg !221 + %4959 = zext nneg i32 %30 to i64, !dbg !222 + %4960 = getelementptr i64, ptr addrspace(1) %16, i64 %4959, !dbg !222 + %4961 = icmp sgt i32 %4957, 0, !dbg !223 + %4962 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %4960, i1 %4961) #3, !dbg !224 + %4963 = or disjoint i32 %4665, %4666, !dbg !192 + %4964 = insertelement <4 x i32> poison, i32 %4963, i64 0, !dbg !192 + %4965 = shufflevector <4 x i32> %4964, <4 x i32> poison, <8 x i32> , !dbg !225 + %4966 = insertelement <8 x i32> %4965, i32 %4963, i64 4, !dbg !225 + %4967 = insertelement <8 x i32> %4966, i32 %4963, i64 5, !dbg !225 + %4968 = insertelement <8 x i32> %4967, i32 %4963, i64 6, !dbg !225 + %4969 = or disjoint <8 x i32> %4968, , !dbg !225 + %4970 = insertelement <8 x i32> %4969, i32 %4963, i64 7, !dbg !225 + %4971 = insertelement <8 x i32> poison, i32 %18, i64 0, !dbg !225 + %4972 = shufflevector <8 x i32> %4971, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !225 + %4973 = srem <8 x i32> %4970, %4972, !dbg !225 + %4974 = icmp ugt <8 x i32> %4973, splat (i32 2047), !dbg !226 + %4975 = trunc <8 x i32> %4973 to <8 x i16>, !dbg !227 + %4976 = and <8 x i16> %4975, splat (i16 2047), !dbg !227 + %4977 = insertelement <8 x i64> poison, i64 %4962, i64 0, !dbg !228 + %4978 = shufflevector <8 x i64> %4977, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !228 + %4979 = zext nneg <8 x i16> %4976 to <8 x i64>, !dbg !228 + %4980 = icmp sgt <8 x i64> %4978, %4979, !dbg !228 + %foldExtExtBinop = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3052 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3054 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3056 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3058 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3060 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3062 = and <8 x i1> %4974, %4980, !dbg !229 + %foldExtExtBinop3064 = and <8 x i1> %4974, %4980, !dbg !229 + %4981 = getelementptr i32, ptr addrspace(1) %15, i64 %4915, !dbg !230 + %4982 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4981, i1 true) #3, !dbg !231 + %4983 = shl i32 %4982, 7, !dbg !232 + %4984 = getelementptr i32, ptr addrspace(1) %14, i64 %4919, !dbg !233 + %4985 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %4984, i1 true) #3, !dbg !234 + %4986 = or disjoint i32 %4983, %4923, !dbg !235 + %4987 = or disjoint i32 %4983, %4924, !dbg !235 + %4988 = or disjoint i32 %4983, %4925, !dbg !235 + %4989 = or disjoint i32 %4983, %4926, !dbg !235 + %4990 = or disjoint i32 %4983, %4927, !dbg !235 + %4991 = or disjoint i32 %4983, %4928, !dbg !235 + %4992 = or disjoint i32 %4983, %4929, !dbg !235 + %4993 = or disjoint i32 %4983, %4930, !dbg !235 + %4994 = or disjoint i32 %4983, %4931, !dbg !235 + %4995 = or disjoint i32 %4983, %46, !dbg !235 + %4996 = or disjoint i32 %4983, %47, !dbg !235 + %4997 = or disjoint i32 %4983, %48, !dbg !235 + %4998 = or disjoint i32 %4983, %49, !dbg !235 + %4999 = shl i32 %4995, 12, !dbg !236 + %5000 = shl i32 %4996, 12, !dbg !236 + %5001 = shl i32 %4997, 12, !dbg !236 + %5002 = shl i32 %4998, 12, !dbg !236 + %5003 = shl i32 %4995, 7, !dbg !238 + %5004 = shl i32 %4996, 7, !dbg !238 + %5005 = shl i32 %4997, 7, !dbg !238 + %5006 = shl i32 %4998, 7, !dbg !238 + %5007 = shl i32 %4985, 1, !dbg !239 + %5008 = tail call i32 @llvm.smin.i32(i32 %5007, i32 32), !dbg !240 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #3, !dbg !241 + %5009 = shl nuw i32 %30, 16 + %5010 = sext i32 %4949 to i64 + %5011 = sext i32 %4950 to i64 + %5012 = sext i32 %4951 to i64 + %5013 = sext i32 %4952 to i64 + %5014 = sext i32 %4953 to i64 + %5015 = sext i32 %4954 to i64 + %5016 = sext i32 %4955 to i64 + %5017 = sext i32 %4956 to i64 + %5018 = icmp slt i32 %4945, 2048 + %5019 = icmp slt i32 %4946, 2048 + %5020 = icmp slt i32 %4947, 2048 + %5021 = icmp slt i32 %4948, 2048 + %5022 = and i1 %4961, %5018 + %5023 = and i1 %4961, %5019 + %5024 = and i1 %4961, %5020 + %5025 = and i1 %4961, %5021 + %5026 = shl nuw nsw i32 %4762, 10 + %5027 = or disjoint i32 %4765, %5026 + %5028 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5027 + %5029 = select i1 %5022, i32 16, i32 0 + %5030 = or disjoint i32 %5027, 2048 + %5031 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5030 + %5032 = select i1 %5023, i32 16, i32 0 + %5033 = or disjoint i32 %5027, 4096 + %5034 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5033 + %5035 = select i1 %5024, i32 16, i32 0 + %5036 = or disjoint i32 %5027, 6144 + %5037 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %5036 + %5038 = select i1 %5025, i32 16, i32 0 + %5039 = icmp slt i32 %4932, 2048 + %5040 = icmp slt i32 %4933, 2048 + %5041 = icmp slt i32 %4934, 2048 + %5042 = icmp slt i32 %4935, 2048 + %5043 = icmp slt i32 %4936, 2048 + %5044 = icmp slt i32 %4937, 2048 + %5045 = icmp slt i32 %4938, 2048 + %5046 = icmp slt i32 %4939, 2048 + %5047 = sext i32 %4932 to i64 + %5048 = sext i32 %4933 to i64 + %5049 = sext i32 %4934 to i64 + %5050 = sext i32 %4935 to i64 + %5051 = sext i32 %4936 to i64 + %5052 = sext i32 %4937 to i64 + %5053 = sext i32 %4938 to i64 + %5054 = sext i32 %4939 to i64 + %5055 = and i1 %4961, %5039 + %5056 = and i1 %4961, %5040 + %5057 = and i1 %4961, %5041 + %5058 = and i1 %4961, %5042 + %5059 = and i1 %4961, %5043 + %5060 = and i1 %4961, %5044 + %5061 = and i1 %4961, %5045 + %5062 = and i1 %4961, %5046 + %5063 = and i32 %43, 252 + %5064 = icmp eq i32 %5063, 0 + %5065 = shl nuw nsw i32 %4922, 3 + %5066 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5065 + %5067 = select i1 %5055, i32 8, i32 0 + %5068 = or disjoint i32 %5065, 32 + %5069 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5068 + %5070 = select i1 %5056, i32 8, i32 0 + %5071 = or disjoint i32 %5065, 64 + %5072 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5071 + %5073 = select i1 %5057, i32 8, i32 0 + %5074 = or disjoint i32 %5065, 96 + %5075 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5074 + %5076 = select i1 %5058, i32 8, i32 0 + %5077 = or disjoint i32 %5065, 128 + %5078 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5077 + %5079 = select i1 %5059, i32 8, i32 0 + %5080 = or disjoint i32 %5065, 160 + %5081 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5080 + %5082 = select i1 %5060, i32 8, i32 0 + %5083 = or disjoint i32 %5065, 192 + %5084 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5083 + %5085 = select i1 %5061, i32 8, i32 0 + %5086 = or disjoint i32 %5065, 224 + %5087 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5086 + %5088 = select i1 %5062, i32 8, i32 0 + %5089 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5027 + %5090 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5030 + %5091 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5033 + %5092 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5036 + %5093 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5065 + %5094 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5068 + %5095 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5071 + %5096 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5074 + %5097 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5077 + %5098 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5080 + %5099 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5083 + %5100 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5086 + %5101 = icmp sgt i32 %4957, 1 + %5102 = or disjoint i32 %4932, 64 + %5103 = or disjoint i32 %4933, 64 + %5104 = or disjoint i32 %4934, 64 + %5105 = or disjoint i32 %4935, 64 + %5106 = or disjoint i32 %4936, 64 + %5107 = or disjoint i32 %4937, 64 + %5108 = or disjoint i32 %4938, 64 + %5109 = or disjoint i32 %4939, 64 + %5110 = or disjoint i32 %4945, 64 + %5111 = or disjoint i32 %4946, 64 + %5112 = or disjoint i32 %4947, 64 + %5113 = or disjoint i32 %4948, 64 + %5114 = icmp slt i32 %5110, 2048 + %5115 = icmp slt i32 %5111, 2048 + %5116 = icmp slt i32 %5112, 2048 + %5117 = icmp slt i32 %5113, 2048 + %5118 = and i1 %5101, %5114 + %5119 = and i1 %5101, %5115 + %5120 = and i1 %5101, %5116 + %5121 = and i1 %5101, %5117 + %5122 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5027 + %5123 = select i1 %5118, i32 16, i32 0 + %5124 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5030 + %5125 = select i1 %5119, i32 16, i32 0 + %5126 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5033 + %5127 = select i1 %5120, i32 16, i32 0 + %5128 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %5036 + %5129 = select i1 %5121, i32 16, i32 0 + %5130 = icmp slt i32 %5102, 2048 + %5131 = icmp slt i32 %5103, 2048 + %5132 = icmp slt i32 %5104, 2048 + %5133 = icmp slt i32 %5105, 2048 + %5134 = icmp slt i32 %5106, 2048 + %5135 = icmp slt i32 %5107, 2048 + %5136 = icmp slt i32 %5108, 2048 + %5137 = icmp slt i32 %5109, 2048 + %5138 = sext i32 %5102 to i64 + %5139 = sext i32 %5103 to i64 + %5140 = sext i32 %5104 to i64 + %5141 = sext i32 %5105 to i64 + %5142 = sext i32 %5106 to i64 + %5143 = sext i32 %5107 to i64 + %5144 = sext i32 %5108 to i64 + %5145 = sext i32 %5109 to i64 + %5146 = and i1 %5101, %5130 + %5147 = and i1 %5101, %5131 + %5148 = and i1 %5101, %5132 + %5149 = and i1 %5101, %5133 + %5150 = and i1 %5101, %5134 + %5151 = and i1 %5101, %5135 + %5152 = and i1 %5101, %5136 + %5153 = and i1 %5101, %5137 + %5154 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5065 + %5155 = select i1 %5146, i32 8, i32 0 + %5156 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5068 + %5157 = select i1 %5147, i32 8, i32 0 + %5158 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5071 + %5159 = select i1 %5148, i32 8, i32 0 + %5160 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5074 + %5161 = select i1 %5149, i32 8, i32 0 + %5162 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5077 + %5163 = select i1 %5150, i32 8, i32 0 + %5164 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5080 + %5165 = select i1 %5151, i32 8, i32 0 + %5166 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5083 + %5167 = select i1 %5152, i32 8, i32 0 + %5168 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106752), i32 %5086 + %5169 = select i1 %5153, i32 8, i32 0 + %5170 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5027 + %5171 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5030 + %5172 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5033 + %5173 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %5036 + %5174 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5065 + %5175 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5068 + %5176 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5071 + %5177 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5074 + %5178 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5077 + %5179 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5080 + %5180 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5083 + %5181 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107264), i32 %5086 + %5182 = add i32 %4958, -2 + %5183 = add nsw i32 %4958, -1 + %5184 = shl nuw nsw i32 %43, 1 + %5185 = and i32 %5184, 444 + %5186 = and i32 %43, 1 + %5187 = icmp eq i32 %5186, 0 + %5188 = shl nuw nsw i32 %5186, 6 + %5189 = lshr i32 %43, 4 + %5190 = and i32 %5189, 2 + %5191 = or disjoint i32 %5185, %5188 + %5192 = or disjoint i32 %5191, %5190 + %5193 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5192 + %5194 = getelementptr inbounds nuw i8, ptr addrspace(3) %5193, i32 512 + %5195 = xor i32 %5192, 4112 + %5196 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5195 + %5197 = getelementptr inbounds nuw i8, ptr addrspace(3) %5196, i32 512 + %5198 = xor i32 %5192, 1028 + %5199 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5198 + %5200 = getelementptr inbounds nuw i8, ptr addrspace(3) %5199, i32 512 + %5201 = xor i32 %5192, 5140 + %5202 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5201 + %5203 = getelementptr inbounds nuw i8, ptr addrspace(3) %5202, i32 512 + %5204 = xor i32 %5192, 2056 + %5205 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5204 + %5206 = getelementptr inbounds nuw i8, ptr addrspace(3) %5205, i32 512 + %5207 = xor i32 %5192, 6168 + %5208 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5207 + %5209 = getelementptr inbounds nuw i8, ptr addrspace(3) %5208, i32 512 + %5210 = xor i32 %5192, 3084 + %5211 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5210 + %5212 = getelementptr inbounds nuw i8, ptr addrspace(3) %5211, i32 512 + %5213 = xor i32 %5192, 7196 + %5214 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5213 + %5215 = getelementptr inbounds nuw i8, ptr addrspace(3) %5214, i32 512 + %5216 = and i32 %43, 12 + %5217 = shl nuw nsw i32 %5216, 8 + %5218 = shl nuw nsw i32 %54, 2 + %5219 = select i1 %5187, i32 0, i32 4112 + %5220 = shl nuw nsw i32 %43, 5 + %5221 = and i32 %5220, 64 + %5222 = and i32 %5184, 32 + %5223 = or disjoint i32 %5217, %5218 + %5224 = or disjoint i32 %5221, %5219 + %5225 = or disjoint i32 %5224, %5223 + %5226 = or disjoint i32 %5225, %5222 + %5227 = or disjoint i32 %5226, %5216 + %5228 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5227 + %5229 = xor i32 %5227, 4 + %5230 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5229 + %5231 = xor i32 %5227, 8 + %5232 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5231 + %5233 = xor i32 %5227, 12 + %5234 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5233 + %5235 = xor i32 %5227, 16 + %5236 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5235 + %5237 = xor i32 %5227, 20 + %5238 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5237 + %5239 = xor i32 %5227, 24 + %5240 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5239 + %5241 = xor i32 %5227, 28 + %5242 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %5241 + %5243 = sext i32 %4999 to i64 + %5244 = sext i32 %5000 to i64 + %5245 = sext i32 %5001 to i64 + %5246 = sext i32 %5002 to i64 + %5247 = sext i32 %5003 to i64 + %5248 = sext i32 %5004 to i64 + %5249 = sext i32 %5005 to i64 + %5250 = sext i32 %5006 to i64 + %5251 = icmp sgt i32 %5007, 0 + %5252 = icmp slt i32 %4995, 2048 + %5253 = icmp slt i32 %4996, 2048 + %5254 = icmp slt i32 %4997, 2048 + %5255 = icmp slt i32 %4998, 2048 + %5256 = and i1 %5251, %5252 + %5257 = and i1 %5251, %5253 + %5258 = and i1 %5251, %5254 + %5259 = and i1 %5251, %5255 + %5260 = select i1 %5256, i32 16, i32 0 + %5261 = select i1 %5257, i32 16, i32 0 + %5262 = select i1 %5258, i32 16, i32 0 + %5263 = select i1 %5259, i32 16, i32 0 + %5264 = icmp slt i32 %4986, 2048 + %5265 = icmp slt i32 %4987, 2048 + %5266 = icmp slt i32 %4988, 2048 + %5267 = icmp slt i32 %4989, 2048 + %5268 = icmp slt i32 %4990, 2048 + %5269 = icmp slt i32 %4991, 2048 + %5270 = icmp slt i32 %4992, 2048 + %5271 = icmp slt i32 %4993, 2048 + %5272 = sext i32 %4986 to i64 + %5273 = sext i32 %4987 to i64 + %5274 = sext i32 %4988 to i64 + %5275 = sext i32 %4989 to i64 + %5276 = sext i32 %4990 to i64 + %5277 = sext i32 %4991 to i64 + %5278 = sext i32 %4992 to i64 + %5279 = sext i32 %4993 to i64 + %5280 = and i1 %5251, %5264 + %5281 = and i1 %5251, %5265 + %5282 = and i1 %5251, %5266 + %5283 = and i1 %5251, %5267 + %5284 = and i1 %5251, %5268 + %5285 = and i1 %5251, %5269 + %5286 = and i1 %5251, %5270 + %5287 = and i1 %5251, %5271 + %5288 = select i1 %5280, i32 8, i32 0 + %5289 = select i1 %5281, i32 8, i32 0 + %5290 = select i1 %5282, i32 8, i32 0 + %5291 = select i1 %5283, i32 8, i32 0 + %5292 = select i1 %5284, i32 8, i32 0 + %5293 = select i1 %5285, i32 8, i32 0 + %5294 = select i1 %5286, i32 8, i32 0 + %5295 = select i1 %5287, i32 8, i32 0 + %5296 = icmp sgt i32 %5007, 1 + %5297 = or disjoint i32 %4986, 64 + %5298 = or disjoint i32 %4987, 64 + %5299 = or disjoint i32 %4988, 64 + %5300 = or disjoint i32 %4989, 64 + %5301 = or disjoint i32 %4990, 64 + %5302 = or disjoint i32 %4991, 64 + %5303 = or disjoint i32 %4992, 64 + %5304 = or disjoint i32 %4993, 64 + %5305 = or disjoint i32 %4995, 64 + %5306 = or disjoint i32 %4996, 64 + %5307 = or disjoint i32 %4997, 64 + %5308 = or disjoint i32 %4998, 64 + %5309 = icmp slt i32 %5305, 2048 + %5310 = icmp slt i32 %5306, 2048 + %5311 = icmp slt i32 %5307, 2048 + %5312 = icmp slt i32 %5308, 2048 + %5313 = and i1 %5296, %5309 + %5314 = and i1 %5296, %5310 + %5315 = and i1 %5296, %5311 + %5316 = and i1 %5296, %5312 + %5317 = select i1 %5313, i32 16, i32 0 + %5318 = select i1 %5314, i32 16, i32 0 + %5319 = select i1 %5315, i32 16, i32 0 + %5320 = select i1 %5316, i32 16, i32 0 + %5321 = icmp slt i32 %5297, 2048 + %5322 = icmp slt i32 %5298, 2048 + %5323 = icmp slt i32 %5299, 2048 + %5324 = icmp slt i32 %5300, 2048 + %5325 = icmp slt i32 %5301, 2048 + %5326 = icmp slt i32 %5302, 2048 + %5327 = icmp slt i32 %5303, 2048 + %5328 = icmp slt i32 %5304, 2048 + %5329 = sext i32 %5297 to i64 + %5330 = sext i32 %5298 to i64 + %5331 = sext i32 %5299 to i64 + %5332 = sext i32 %5300 to i64 + %5333 = sext i32 %5301 to i64 + %5334 = sext i32 %5302 to i64 + %5335 = sext i32 %5303 to i64 + %5336 = sext i32 %5304 to i64 + %5337 = and i1 %5296, %5321 + %5338 = and i1 %5296, %5322 + %5339 = and i1 %5296, %5323 + %5340 = and i1 %5296, %5324 + %5341 = and i1 %5296, %5325 + %5342 = and i1 %5296, %5326 + %5343 = and i1 %5296, %5327 + %5344 = and i1 %5296, %5328 + %5345 = select i1 %5337, i32 8, i32 0 + %5346 = select i1 %5338, i32 8, i32 0 + %5347 = select i1 %5339, i32 8, i32 0 + %5348 = select i1 %5340, i32 8, i32 0 + %5349 = select i1 %5341, i32 8, i32 0 + %5350 = select i1 %5342, i32 8, i32 0 + %5351 = select i1 %5343, i32 8, i32 0 + %5352 = select i1 %5344, i32 8, i32 0 + %5353 = add i32 %5008, -2 + %5354 = add nsw i32 %5008, -1 + %smax2211 = tail call i32 @llvm.smax.i32(i32 %4958, i32 1), !dbg !242 + %smax2213 = tail call i32 @llvm.smax.i32(i32 %5008, i32 1), !dbg !242 + %5355 = zext nneg i32 %4910 to i64, !dbg !242 + %5356 = insertelement <2 x i32> %4942, i32 %4941, i64 1 + %5357 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5358 = insertelement <2 x i64> poison, i64 %4962, i64 0 + %5359 = shufflevector <2 x i64> %5358, <2 x i64> poison, <2 x i32> zeroinitializer + %5360 = shufflevector <8 x i1> %foldExtExtBinop, <8 x i1> poison, <2 x i32> + %5361 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5362 = shufflevector <8 x i1> %foldExtExtBinop3052, <8 x i1> poison, <2 x i32> + %5363 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5364 = shufflevector <8 x i1> %foldExtExtBinop3054, <8 x i1> poison, <2 x i32> + %5365 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5366 = shufflevector <8 x i1> %foldExtExtBinop3056, <8 x i1> poison, <2 x i32> + %5367 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5368 = shufflevector <8 x i1> %foldExtExtBinop3058, <8 x i1> poison, <2 x i32> + %5369 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5370 = shufflevector <8 x i1> %foldExtExtBinop3060, <8 x i1> poison, <2 x i32> + %5371 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> + %5372 = shufflevector <8 x i1> %foldExtExtBinop3062, <8 x i1> poison, <2 x i32> + %5373 = shufflevector <8 x i32> %4973, <8 x i32> poison, <2 x i32> zeroinitializer + %5374 = shufflevector <8 x i1> %foldExtExtBinop3064, <8 x i1> poison, <2 x i32> zeroinitializer + br label %5375, !dbg !242 + +5375: ; preds = %4664, %._crit_edge1820 + %indvars.iv = phi i64 [ 0, %4664 ], [ %indvars.iv.next, %._crit_edge1820 ] + %5376 = phi float [ 0.000000e+00, %4664 ], [ %10534, %._crit_edge1820 ] + %5377 = phi float [ 0.000000e+00, %4664 ], [ %10535, %._crit_edge1820 ] + %5378 = phi float [ 0.000000e+00, %4664 ], [ %10536, %._crit_edge1820 ] + %5379 = phi float [ 0.000000e+00, %4664 ], [ %10537, %._crit_edge1820 ] + %5380 = phi float [ 0.000000e+00, %4664 ], [ %10538, %._crit_edge1820 ] + %5381 = phi float [ 0.000000e+00, %4664 ], [ %10539, %._crit_edge1820 ] + %5382 = phi float [ 0.000000e+00, %4664 ], [ %10540, %._crit_edge1820 ] + %5383 = phi float [ 0.000000e+00, %4664 ], [ %10541, %._crit_edge1820 ] + %5384 = phi float [ 0.000000e+00, %4664 ], [ %10542, %._crit_edge1820 ] + %5385 = phi float [ 0.000000e+00, %4664 ], [ %10543, %._crit_edge1820 ] + %5386 = phi float [ 0.000000e+00, %4664 ], [ %10544, %._crit_edge1820 ] + %5387 = phi float [ 0.000000e+00, %4664 ], [ %10545, %._crit_edge1820 ] + %5388 = phi float [ 0.000000e+00, %4664 ], [ %10546, %._crit_edge1820 ] + %5389 = phi float [ 0.000000e+00, %4664 ], [ %10547, %._crit_edge1820 ] + %5390 = phi float [ 0.000000e+00, %4664 ], [ %10548, %._crit_edge1820 ] + %5391 = phi float [ 0.000000e+00, %4664 ], [ %10549, %._crit_edge1820 ] + %5392 = phi float [ 0.000000e+00, %4664 ], [ %10550, %._crit_edge1820 ] + %5393 = phi float [ 0.000000e+00, %4664 ], [ %10551, %._crit_edge1820 ] + %5394 = phi float [ 0.000000e+00, %4664 ], [ %10552, %._crit_edge1820 ] + %5395 = phi float [ 0.000000e+00, %4664 ], [ %10553, %._crit_edge1820 ] + %5396 = phi float [ 0.000000e+00, %4664 ], [ %10554, %._crit_edge1820 ] + %5397 = phi float [ 0.000000e+00, %4664 ], [ %10555, %._crit_edge1820 ] + %5398 = phi float [ 0.000000e+00, %4664 ], [ %10556, %._crit_edge1820 ] + %5399 = phi float [ 0.000000e+00, %4664 ], [ %10557, %._crit_edge1820 ] + %5400 = phi float [ 0.000000e+00, %4664 ], [ %10558, %._crit_edge1820 ] + %5401 = phi float [ 0.000000e+00, %4664 ], [ %10559, %._crit_edge1820 ] + %5402 = phi float [ 0.000000e+00, %4664 ], [ %10560, %._crit_edge1820 ] + %5403 = phi float [ 0.000000e+00, %4664 ], [ %10561, %._crit_edge1820 ] + %5404 = phi float [ 0.000000e+00, %4664 ], [ %10562, %._crit_edge1820 ] + %5405 = phi float [ 0.000000e+00, %4664 ], [ %10563, %._crit_edge1820 ] + %5406 = phi float [ 0.000000e+00, %4664 ], [ %10564, %._crit_edge1820 ] + %5407 = phi float [ 0.000000e+00, %4664 ], [ %10565, %._crit_edge1820 ] + %5408 = phi float [ 0.000000e+00, %4664 ], [ %10566, %._crit_edge1820 ] + %5409 = phi float [ 0.000000e+00, %4664 ], [ %10567, %._crit_edge1820 ] + %5410 = phi float [ 0.000000e+00, %4664 ], [ %10568, %._crit_edge1820 ] + %5411 = phi float [ 0.000000e+00, %4664 ], [ %10569, %._crit_edge1820 ] + %5412 = phi float [ 0.000000e+00, %4664 ], [ %10570, %._crit_edge1820 ] + %5413 = phi float [ 0.000000e+00, %4664 ], [ %10571, %._crit_edge1820 ] + %5414 = phi float [ 0.000000e+00, %4664 ], [ %10572, %._crit_edge1820 ] + %5415 = phi float [ 0.000000e+00, %4664 ], [ %10573, %._crit_edge1820 ] + %5416 = phi float [ 0.000000e+00, %4664 ], [ %10574, %._crit_edge1820 ] + %5417 = phi float [ 0.000000e+00, %4664 ], [ %10575, %._crit_edge1820 ] + %5418 = phi float [ 0.000000e+00, %4664 ], [ %10576, %._crit_edge1820 ] + %5419 = phi float [ 0.000000e+00, %4664 ], [ %10577, %._crit_edge1820 ] + %5420 = phi float [ 0.000000e+00, %4664 ], [ %10578, %._crit_edge1820 ] + %5421 = phi float [ 0.000000e+00, %4664 ], [ %10579, %._crit_edge1820 ] + %5422 = phi float [ 0.000000e+00, %4664 ], [ %10580, %._crit_edge1820 ] + %5423 = phi float [ 0.000000e+00, %4664 ], [ %10581, %._crit_edge1820 ] + %5424 = phi float [ 0.000000e+00, %4664 ], [ %10582, %._crit_edge1820 ] + %5425 = phi float [ 0.000000e+00, %4664 ], [ %10583, %._crit_edge1820 ] + %5426 = phi float [ 0.000000e+00, %4664 ], [ %10584, %._crit_edge1820 ] + %5427 = phi float [ 0.000000e+00, %4664 ], [ %10585, %._crit_edge1820 ] + %5428 = phi float [ 0.000000e+00, %4664 ], [ %10586, %._crit_edge1820 ] + %5429 = phi float [ 0.000000e+00, %4664 ], [ %10587, %._crit_edge1820 ] + %5430 = phi float [ 0.000000e+00, %4664 ], [ %10588, %._crit_edge1820 ] + %5431 = phi float [ 0.000000e+00, %4664 ], [ %10589, %._crit_edge1820 ] + %5432 = phi float [ 0.000000e+00, %4664 ], [ %10590, %._crit_edge1820 ] + %5433 = phi float [ 0.000000e+00, %4664 ], [ %10591, %._crit_edge1820 ] + %5434 = phi float [ 0.000000e+00, %4664 ], [ %10592, %._crit_edge1820 ] + %5435 = phi float [ 0.000000e+00, %4664 ], [ %10593, %._crit_edge1820 ] + %5436 = phi float [ 0.000000e+00, %4664 ], [ %10594, %._crit_edge1820 ] + %5437 = phi float [ 0.000000e+00, %4664 ], [ %10595, %._crit_edge1820 ] + %5438 = phi float [ 0.000000e+00, %4664 ], [ %10596, %._crit_edge1820 ] + %5439 = phi float [ 0.000000e+00, %4664 ], [ %10597, %._crit_edge1820 ] + %5440 = phi float [ 0.000000e+00, %4664 ], [ %10470, %._crit_edge1820 ] + %5441 = phi float [ 0.000000e+00, %4664 ], [ %10471, %._crit_edge1820 ] + %5442 = phi float [ 0.000000e+00, %4664 ], [ %10472, %._crit_edge1820 ] + %5443 = phi float [ 0.000000e+00, %4664 ], [ %10473, %._crit_edge1820 ] + %5444 = phi float [ 0.000000e+00, %4664 ], [ %10474, %._crit_edge1820 ] + %5445 = phi float [ 0.000000e+00, %4664 ], [ %10475, %._crit_edge1820 ] + %5446 = phi float [ 0.000000e+00, %4664 ], [ %10476, %._crit_edge1820 ] + %5447 = phi float [ 0.000000e+00, %4664 ], [ %10477, %._crit_edge1820 ] + %5448 = phi float [ 0.000000e+00, %4664 ], [ %10478, %._crit_edge1820 ] + %5449 = phi float [ 0.000000e+00, %4664 ], [ %10479, %._crit_edge1820 ] + %5450 = phi float [ 0.000000e+00, %4664 ], [ %10480, %._crit_edge1820 ] + %5451 = phi float [ 0.000000e+00, %4664 ], [ %10481, %._crit_edge1820 ] + %5452 = phi float [ 0.000000e+00, %4664 ], [ %10482, %._crit_edge1820 ] + %5453 = phi float [ 0.000000e+00, %4664 ], [ %10483, %._crit_edge1820 ] + %5454 = phi float [ 0.000000e+00, %4664 ], [ %10484, %._crit_edge1820 ] + %5455 = phi float [ 0.000000e+00, %4664 ], [ %10485, %._crit_edge1820 ] + %5456 = phi float [ 0.000000e+00, %4664 ], [ %10486, %._crit_edge1820 ] + %5457 = phi float [ 0.000000e+00, %4664 ], [ %10487, %._crit_edge1820 ] + %5458 = phi float [ 0.000000e+00, %4664 ], [ %10488, %._crit_edge1820 ] + %5459 = phi float [ 0.000000e+00, %4664 ], [ %10489, %._crit_edge1820 ] + %5460 = phi float [ 0.000000e+00, %4664 ], [ %10490, %._crit_edge1820 ] + %5461 = phi float [ 0.000000e+00, %4664 ], [ %10491, %._crit_edge1820 ] + %5462 = phi float [ 0.000000e+00, %4664 ], [ %10492, %._crit_edge1820 ] + %5463 = phi float [ 0.000000e+00, %4664 ], [ %10493, %._crit_edge1820 ] + %5464 = phi float [ 0.000000e+00, %4664 ], [ %10494, %._crit_edge1820 ] + %5465 = phi float [ 0.000000e+00, %4664 ], [ %10495, %._crit_edge1820 ] + %5466 = phi float [ 0.000000e+00, %4664 ], [ %10496, %._crit_edge1820 ] + %5467 = phi float [ 0.000000e+00, %4664 ], [ %10497, %._crit_edge1820 ] + %5468 = phi float [ 0.000000e+00, %4664 ], [ %10498, %._crit_edge1820 ] + %5469 = phi float [ 0.000000e+00, %4664 ], [ %10499, %._crit_edge1820 ] + %5470 = phi float [ 0.000000e+00, %4664 ], [ %10500, %._crit_edge1820 ] + %5471 = phi float [ 0.000000e+00, %4664 ], [ %10501, %._crit_edge1820 ] + %5472 = phi float [ 0.000000e+00, %4664 ], [ %10502, %._crit_edge1820 ] + %5473 = phi float [ 0.000000e+00, %4664 ], [ %10503, %._crit_edge1820 ] + %5474 = phi float [ 0.000000e+00, %4664 ], [ %10504, %._crit_edge1820 ] + %5475 = phi float [ 0.000000e+00, %4664 ], [ %10505, %._crit_edge1820 ] + %5476 = phi float [ 0.000000e+00, %4664 ], [ %10506, %._crit_edge1820 ] + %5477 = phi float [ 0.000000e+00, %4664 ], [ %10507, %._crit_edge1820 ] + %5478 = phi float [ 0.000000e+00, %4664 ], [ %10508, %._crit_edge1820 ] + %5479 = phi float [ 0.000000e+00, %4664 ], [ %10509, %._crit_edge1820 ] + %5480 = phi float [ 0.000000e+00, %4664 ], [ %10510, %._crit_edge1820 ] + %5481 = phi float [ 0.000000e+00, %4664 ], [ %10511, %._crit_edge1820 ] + %5482 = phi float [ 0.000000e+00, %4664 ], [ %10512, %._crit_edge1820 ] + %5483 = phi float [ 0.000000e+00, %4664 ], [ %10513, %._crit_edge1820 ] + %5484 = phi float [ 0.000000e+00, %4664 ], [ %10514, %._crit_edge1820 ] + %5485 = phi float [ 0.000000e+00, %4664 ], [ %10515, %._crit_edge1820 ] + %5486 = phi float [ 0.000000e+00, %4664 ], [ %10516, %._crit_edge1820 ] + %5487 = phi float [ 0.000000e+00, %4664 ], [ %10517, %._crit_edge1820 ] + %5488 = phi float [ 0.000000e+00, %4664 ], [ %10518, %._crit_edge1820 ] + %5489 = phi float [ 0.000000e+00, %4664 ], [ %10519, %._crit_edge1820 ] + %5490 = phi float [ 0.000000e+00, %4664 ], [ %10520, %._crit_edge1820 ] + %5491 = phi float [ 0.000000e+00, %4664 ], [ %10521, %._crit_edge1820 ] + %5492 = phi float [ 0.000000e+00, %4664 ], [ %10522, %._crit_edge1820 ] + %5493 = phi float [ 0.000000e+00, %4664 ], [ %10523, %._crit_edge1820 ] + %5494 = phi float [ 0.000000e+00, %4664 ], [ %10524, %._crit_edge1820 ] + %5495 = phi float [ 0.000000e+00, %4664 ], [ %10525, %._crit_edge1820 ] + %5496 = phi float [ 0.000000e+00, %4664 ], [ %10526, %._crit_edge1820 ] + %5497 = phi float [ 0.000000e+00, %4664 ], [ %10527, %._crit_edge1820 ] + %5498 = phi float [ 0.000000e+00, %4664 ], [ %10528, %._crit_edge1820 ] + %5499 = phi float [ 0.000000e+00, %4664 ], [ %10529, %._crit_edge1820 ] + %5500 = phi float [ 0.000000e+00, %4664 ], [ %10530, %._crit_edge1820 ] + %5501 = phi float [ 0.000000e+00, %4664 ], [ %10531, %._crit_edge1820 ] + %5502 = phi float [ 0.000000e+00, %4664 ], [ %10532, %._crit_edge1820 ] + %5503 = phi float [ 0.000000e+00, %4664 ], [ %10533, %._crit_edge1820 ] + %5504 = add nuw nsw i64 %indvars.iv, %5355, !dbg !243 + %.tr = trunc i64 %5504 to i32, !dbg !244 + %5505 = shl i32 %.tr, 7, !dbg !244 + %5506 = add i32 %5505, %4911, !dbg !244 + %5507 = sext i32 %5506 to i64, !dbg !245 + %5508 = trunc nuw nsw i64 %5504 to i32, !dbg !246 + %5509 = shl i32 %5508, 18, !dbg !246 + %5510 = add i32 %5509, %4911, !dbg !247 + %5511 = sext i32 %5510 to i64, !dbg !248 + %.tr2216 = trunc i64 %5504 to i32, !dbg !249 + %5512 = shl i32 %.tr2216, 11, !dbg !249 + %5513 = add i32 %5512, %5009, !dbg !249 + %5514 = sext i32 %5513 to i64, !dbg !250 + %5515 = getelementptr bfloat, ptr addrspace(1) %0, i64 %5507, !dbg !251 + %5516 = getelementptr bfloat, ptr addrspace(1) %5, i64 %5511, !dbg !252 + %5517 = getelementptr float, ptr addrspace(1) %3, i64 %5514, !dbg !253 + %5518 = getelementptr float, ptr addrspace(1) %4, i64 %5514, !dbg !254 + %5519 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5010, !dbg !255 + %5520 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5011, !dbg !255 + %5521 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5012, !dbg !255 + %5522 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5013, !dbg !255 + %5523 = getelementptr bfloat, ptr addrspace(1) %5519, i64 %4701, !dbg !256 + %5524 = getelementptr bfloat, ptr addrspace(1) %5520, i64 %4701, !dbg !256 + %5525 = getelementptr bfloat, ptr addrspace(1) %5521, i64 %4701, !dbg !256 + %5526 = getelementptr bfloat, ptr addrspace(1) %5522, i64 %4701, !dbg !256 + %5527 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5014, !dbg !257 + %5528 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5015, !dbg !257 + %5529 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5016, !dbg !257 + %5530 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5017, !dbg !257 + %5531 = getelementptr bfloat, ptr addrspace(1) %5527, i64 %4701, !dbg !258 + %5532 = getelementptr bfloat, ptr addrspace(1) %5528, i64 %4701, !dbg !258 + %5533 = getelementptr bfloat, ptr addrspace(1) %5529, i64 %4701, !dbg !258 + %5534 = getelementptr bfloat, ptr addrspace(1) %5530, i64 %4701, !dbg !258 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5028, ptr addrspace(1) %5523, i32 %5029) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5031, ptr addrspace(1) %5524, i32 %5032) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5034, ptr addrspace(1) %5525, i32 %5035) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5037, ptr addrspace(1) %5526, i32 %5038) #3, !dbg !259 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !259 + %5535 = getelementptr float, ptr addrspace(1) %5517, i64 %5047, !dbg !260 + %5536 = getelementptr float, ptr addrspace(1) %5517, i64 %5048, !dbg !260 + %5537 = getelementptr float, ptr addrspace(1) %5517, i64 %5049, !dbg !260 + %5538 = getelementptr float, ptr addrspace(1) %5517, i64 %5050, !dbg !260 + %5539 = getelementptr float, ptr addrspace(1) %5517, i64 %5051, !dbg !260 + %5540 = getelementptr float, ptr addrspace(1) %5517, i64 %5052, !dbg !260 + %5541 = getelementptr float, ptr addrspace(1) %5517, i64 %5053, !dbg !260 + %5542 = getelementptr float, ptr addrspace(1) %5517, i64 %5054, !dbg !260 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5066, ptr addrspace(1) %5535, i32 %5067, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5069, ptr addrspace(1) %5536, i32 %5070, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5072, ptr addrspace(1) %5537, i32 %5073, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5075, ptr addrspace(1) %5538, i32 %5076, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5078, ptr addrspace(1) %5539, i32 %5079, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5081, ptr addrspace(1) %5540, i32 %5082, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5084, ptr addrspace(1) %5541, i32 %5085, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5087, ptr addrspace(1) %5542, i32 %5088, i1 %5064) #3, !dbg !261 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !261 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5089, ptr addrspace(1) %5531, i32 %5029) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5090, ptr addrspace(1) %5532, i32 %5032) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5091, ptr addrspace(1) %5533, i32 %5035) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5092, ptr addrspace(1) %5534, i32 %5038) #3, !dbg !262 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !262 + %5543 = getelementptr float, ptr addrspace(1) %5518, i64 %5047, !dbg !263 + %5544 = getelementptr float, ptr addrspace(1) %5518, i64 %5048, !dbg !263 + %5545 = getelementptr float, ptr addrspace(1) %5518, i64 %5049, !dbg !263 + %5546 = getelementptr float, ptr addrspace(1) %5518, i64 %5050, !dbg !263 + %5547 = getelementptr float, ptr addrspace(1) %5518, i64 %5051, !dbg !263 + %5548 = getelementptr float, ptr addrspace(1) %5518, i64 %5052, !dbg !263 + %5549 = getelementptr float, ptr addrspace(1) %5518, i64 %5053, !dbg !263 + %5550 = getelementptr float, ptr addrspace(1) %5518, i64 %5054, !dbg !263 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5093, ptr addrspace(1) %5543, i32 %5067, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5094, ptr addrspace(1) %5544, i32 %5070, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5095, ptr addrspace(1) %5545, i32 %5073, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5096, ptr addrspace(1) %5546, i32 %5076, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5097, ptr addrspace(1) %5547, i32 %5079, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5098, ptr addrspace(1) %5548, i32 %5082, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5099, ptr addrspace(1) %5549, i32 %5085, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5100, ptr addrspace(1) %5550, i32 %5088, i1 %5064) #3, !dbg !264 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !264 + %5551 = getelementptr i8, ptr addrspace(1) %5523, i64 524288, !dbg !265 + %5552 = getelementptr i8, ptr addrspace(1) %5524, i64 524288, !dbg !265 + %5553 = getelementptr i8, ptr addrspace(1) %5525, i64 524288, !dbg !265 + %5554 = getelementptr i8, ptr addrspace(1) %5526, i64 524288, !dbg !265 + %5555 = getelementptr i8, ptr addrspace(1) %5531, i64 16384, !dbg !266 + %5556 = getelementptr i8, ptr addrspace(1) %5532, i64 16384, !dbg !266 + %5557 = getelementptr i8, ptr addrspace(1) %5533, i64 16384, !dbg !266 + %5558 = getelementptr i8, ptr addrspace(1) %5534, i64 16384, !dbg !266 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5122, ptr addrspace(1) %5551, i32 %5123) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5124, ptr addrspace(1) %5552, i32 %5125) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5126, ptr addrspace(1) %5553, i32 %5127) #3, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5128, ptr addrspace(1) %5554, i32 %5129) #3, !dbg !259 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !259 + %5559 = getelementptr float, ptr addrspace(1) %5517, i64 %5138, !dbg !260 + %5560 = getelementptr float, ptr addrspace(1) %5517, i64 %5139, !dbg !260 + %5561 = getelementptr float, ptr addrspace(1) %5517, i64 %5140, !dbg !260 + %5562 = getelementptr float, ptr addrspace(1) %5517, i64 %5141, !dbg !260 + %5563 = getelementptr float, ptr addrspace(1) %5517, i64 %5142, !dbg !260 + %5564 = getelementptr float, ptr addrspace(1) %5517, i64 %5143, !dbg !260 + %5565 = getelementptr float, ptr addrspace(1) %5517, i64 %5144, !dbg !260 + %5566 = getelementptr float, ptr addrspace(1) %5517, i64 %5145, !dbg !260 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5154, ptr addrspace(1) %5559, i32 %5155, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5156, ptr addrspace(1) %5560, i32 %5157, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5158, ptr addrspace(1) %5561, i32 %5159, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5160, ptr addrspace(1) %5562, i32 %5161, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5162, ptr addrspace(1) %5563, i32 %5163, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5164, ptr addrspace(1) %5564, i32 %5165, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5166, ptr addrspace(1) %5565, i32 %5167, i1 %5064) #3, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5168, ptr addrspace(1) %5566, i32 %5169, i1 %5064) #3, !dbg !261 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !261 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5170, ptr addrspace(1) %5555, i32 %5123) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5171, ptr addrspace(1) %5556, i32 %5125) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5172, ptr addrspace(1) %5557, i32 %5127) #3, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5173, ptr addrspace(1) %5558, i32 %5129) #3, !dbg !262 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !262 + %5567 = getelementptr float, ptr addrspace(1) %5518, i64 %5138, !dbg !263 + %5568 = getelementptr float, ptr addrspace(1) %5518, i64 %5139, !dbg !263 + %5569 = getelementptr float, ptr addrspace(1) %5518, i64 %5140, !dbg !263 + %5570 = getelementptr float, ptr addrspace(1) %5518, i64 %5141, !dbg !263 + %5571 = getelementptr float, ptr addrspace(1) %5518, i64 %5142, !dbg !263 + %5572 = getelementptr float, ptr addrspace(1) %5518, i64 %5143, !dbg !263 + %5573 = getelementptr float, ptr addrspace(1) %5518, i64 %5144, !dbg !263 + %5574 = getelementptr float, ptr addrspace(1) %5518, i64 %5145, !dbg !263 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5174, ptr addrspace(1) %5567, i32 %5155, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5175, ptr addrspace(1) %5568, i32 %5157, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5176, ptr addrspace(1) %5569, i32 %5159, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5177, ptr addrspace(1) %5570, i32 %5161, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5178, ptr addrspace(1) %5571, i32 %5163, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5179, ptr addrspace(1) %5572, i32 %5165, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5180, ptr addrspace(1) %5573, i32 %5167, i1 %5064) #3, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5181, ptr addrspace(1) %5574, i32 %5169, i1 %5064) #3, !dbg !264 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !264 + br i1 %4961, label %.lr.ph1669, label %._crit_edge1670, !dbg !223 + +.lr.ph1669: ; preds = %5375, %__nv_exp2f.exit1380 + %.pn2291667 = phi i32 [ %7892, %__nv_exp2f.exit1380 ], [ %4939, %5375 ] + %.pn2331666 = phi i32 [ %7891, %__nv_exp2f.exit1380 ], [ %4938, %5375 ] + %.pn2371665 = phi i32 [ %7890, %__nv_exp2f.exit1380 ], [ %4937, %5375 ] + %.pn2411664 = phi i32 [ %7889, %__nv_exp2f.exit1380 ], [ %4936, %5375 ] + %.pn2451663 = phi i32 [ %7888, %__nv_exp2f.exit1380 ], [ %4935, %5375 ] + %.pn2491662 = phi i32 [ %7887, %__nv_exp2f.exit1380 ], [ %4934, %5375 ] + %.pn2531661 = phi i32 [ %7886, %__nv_exp2f.exit1380 ], [ %4933, %5375 ] + %.pn2571660 = phi i32 [ %7885, %__nv_exp2f.exit1380 ], [ %4932, %5375 ] + %5575 = phi i32 [ %7916, %__nv_exp2f.exit1380 ], [ 64, %5375 ] + %5576 = phi i32 [ %5723, %__nv_exp2f.exit1380 ], [ -1, %5375 ] + %5577 = phi i32 [ %7947, %__nv_exp2f.exit1380 ], [ 1, %5375 ] + %5578 = phi i32 [ %5726, %__nv_exp2f.exit1380 ], [ -1, %5375 ] + %5579 = phi i32 [ %7950, %__nv_exp2f.exit1380 ], [ 1, %5375 ] + %.pn1971659 = phi i32 [ %7936, %__nv_exp2f.exit1380 ], [ %5109, %5375 ] + %.pn2011658 = phi i32 [ %7935, %__nv_exp2f.exit1380 ], [ %5108, %5375 ] + %.pn2051657 = phi i32 [ %7934, %__nv_exp2f.exit1380 ], [ %5107, %5375 ] + %.pn2091656 = phi i32 [ %7933, %__nv_exp2f.exit1380 ], [ %5106, %5375 ] + %.pn2131655 = phi i32 [ %7932, %__nv_exp2f.exit1380 ], [ %5105, %5375 ] + %.pn2171654 = phi i32 [ %7931, %__nv_exp2f.exit1380 ], [ %5104, %5375 ] + %.pn2211653 = phi i32 [ %7930, %__nv_exp2f.exit1380 ], [ %5103, %5375 ] + %.pn2251652 = phi i32 [ %7929, %__nv_exp2f.exit1380 ], [ %5102, %5375 ] + %5580 = phi i32 [ %7941, %__nv_exp2f.exit1380 ], [ %5110, %5375 ] + %5581 = phi i32 [ %7942, %__nv_exp2f.exit1380 ], [ %5111, %5375 ] + %5582 = phi i32 [ %7943, %__nv_exp2f.exit1380 ], [ %5112, %5375 ] + %5583 = phi i32 [ %7944, %__nv_exp2f.exit1380 ], [ %5113, %5375 ] + %.pn1451651 = phi ptr addrspace(1) [ %7928, %__nv_exp2f.exit1380 ], [ %5558, %5375 ] + %.pn1611650 = phi ptr addrspace(1) [ %7927, %__nv_exp2f.exit1380 ], [ %5557, %5375 ] + %.pn1771649 = phi ptr addrspace(1) [ %7926, %__nv_exp2f.exit1380 ], [ %5556, %5375 ] + %.pn1931648 = phi ptr addrspace(1) [ %7925, %__nv_exp2f.exit1380 ], [ %5555, %5375 ] + %5584 = phi i32 [ %7937, %__nv_exp2f.exit1380 ], [ %5110, %5375 ] + %5585 = phi i32 [ %7938, %__nv_exp2f.exit1380 ], [ %5111, %5375 ] + %5586 = phi i32 [ %7939, %__nv_exp2f.exit1380 ], [ %5112, %5375 ] + %5587 = phi i32 [ %7940, %__nv_exp2f.exit1380 ], [ %5113, %5375 ] + %.pn811647 = phi ptr addrspace(1) [ %7922, %__nv_exp2f.exit1380 ], [ %5554, %5375 ] + %.pn971646 = phi ptr addrspace(1) [ %7921, %__nv_exp2f.exit1380 ], [ %5553, %5375 ] + %.pn1131645 = phi ptr addrspace(1) [ %7920, %__nv_exp2f.exit1380 ], [ %5552, %5375 ] + %.pn1291644 = phi ptr addrspace(1) [ %7919, %__nv_exp2f.exit1380 ], [ %5551, %5375 ] + %5588 = phi float [ %6965, %__nv_exp2f.exit1380 ], [ %5440, %5375 ] + %5589 = phi float [ %6966, %__nv_exp2f.exit1380 ], [ %5441, %5375 ] + %5590 = phi float [ %6967, %__nv_exp2f.exit1380 ], [ %5442, %5375 ] + %5591 = phi float [ %6968, %__nv_exp2f.exit1380 ], [ %5443, %5375 ] + %5592 = phi float [ %6969, %__nv_exp2f.exit1380 ], [ %5444, %5375 ] + %5593 = phi float [ %6970, %__nv_exp2f.exit1380 ], [ %5445, %5375 ] + %5594 = phi float [ %6971, %__nv_exp2f.exit1380 ], [ %5446, %5375 ] + %5595 = phi float [ %6972, %__nv_exp2f.exit1380 ], [ %5447, %5375 ] + %5596 = phi float [ %6973, %__nv_exp2f.exit1380 ], [ %5448, %5375 ] + %5597 = phi float [ %6974, %__nv_exp2f.exit1380 ], [ %5449, %5375 ] + %5598 = phi float [ %6975, %__nv_exp2f.exit1380 ], [ %5450, %5375 ] + %5599 = phi float [ %6976, %__nv_exp2f.exit1380 ], [ %5451, %5375 ] + %5600 = phi float [ %6977, %__nv_exp2f.exit1380 ], [ %5452, %5375 ] + %5601 = phi float [ %6978, %__nv_exp2f.exit1380 ], [ %5453, %5375 ] + %5602 = phi float [ %6979, %__nv_exp2f.exit1380 ], [ %5454, %5375 ] + %5603 = phi float [ %6980, %__nv_exp2f.exit1380 ], [ %5455, %5375 ] + %5604 = phi float [ %6981, %__nv_exp2f.exit1380 ], [ %5456, %5375 ] + %5605 = phi float [ %6982, %__nv_exp2f.exit1380 ], [ %5457, %5375 ] + %5606 = phi float [ %6983, %__nv_exp2f.exit1380 ], [ %5458, %5375 ] + %5607 = phi float [ %6984, %__nv_exp2f.exit1380 ], [ %5459, %5375 ] + %5608 = phi float [ %6985, %__nv_exp2f.exit1380 ], [ %5460, %5375 ] + %5609 = phi float [ %6986, %__nv_exp2f.exit1380 ], [ %5461, %5375 ] + %5610 = phi float [ %6987, %__nv_exp2f.exit1380 ], [ %5462, %5375 ] + %5611 = phi float [ %6988, %__nv_exp2f.exit1380 ], [ %5463, %5375 ] + %5612 = phi float [ %6989, %__nv_exp2f.exit1380 ], [ %5464, %5375 ] + %5613 = phi float [ %6990, %__nv_exp2f.exit1380 ], [ %5465, %5375 ] + %5614 = phi float [ %6991, %__nv_exp2f.exit1380 ], [ %5466, %5375 ] + %5615 = phi float [ %6992, %__nv_exp2f.exit1380 ], [ %5467, %5375 ] + %5616 = phi float [ %6993, %__nv_exp2f.exit1380 ], [ %5468, %5375 ] + %5617 = phi float [ %6994, %__nv_exp2f.exit1380 ], [ %5469, %5375 ] + %5618 = phi float [ %6995, %__nv_exp2f.exit1380 ], [ %5470, %5375 ] + %5619 = phi float [ %6996, %__nv_exp2f.exit1380 ], [ %5471, %5375 ] + %5620 = phi float [ %6997, %__nv_exp2f.exit1380 ], [ %5472, %5375 ] + %5621 = phi float [ %6998, %__nv_exp2f.exit1380 ], [ %5473, %5375 ] + %5622 = phi float [ %6999, %__nv_exp2f.exit1380 ], [ %5474, %5375 ] + %5623 = phi float [ %7000, %__nv_exp2f.exit1380 ], [ %5475, %5375 ] + %5624 = phi float [ %7001, %__nv_exp2f.exit1380 ], [ %5476, %5375 ] + %5625 = phi float [ %7002, %__nv_exp2f.exit1380 ], [ %5477, %5375 ] + %5626 = phi float [ %7003, %__nv_exp2f.exit1380 ], [ %5478, %5375 ] + %5627 = phi float [ %7004, %__nv_exp2f.exit1380 ], [ %5479, %5375 ] + %5628 = phi float [ %7005, %__nv_exp2f.exit1380 ], [ %5480, %5375 ] + %5629 = phi float [ %7006, %__nv_exp2f.exit1380 ], [ %5481, %5375 ] + %5630 = phi float [ %7007, %__nv_exp2f.exit1380 ], [ %5482, %5375 ] + %5631 = phi float [ %7008, %__nv_exp2f.exit1380 ], [ %5483, %5375 ] + %5632 = phi float [ %7009, %__nv_exp2f.exit1380 ], [ %5484, %5375 ] + %5633 = phi float [ %7010, %__nv_exp2f.exit1380 ], [ %5485, %5375 ] + %5634 = phi float [ %7011, %__nv_exp2f.exit1380 ], [ %5486, %5375 ] + %5635 = phi float [ %7012, %__nv_exp2f.exit1380 ], [ %5487, %5375 ] + %5636 = phi float [ %7013, %__nv_exp2f.exit1380 ], [ %5488, %5375 ] + %5637 = phi float [ %7014, %__nv_exp2f.exit1380 ], [ %5489, %5375 ] + %5638 = phi float [ %7015, %__nv_exp2f.exit1380 ], [ %5490, %5375 ] + %5639 = phi float [ %7016, %__nv_exp2f.exit1380 ], [ %5491, %5375 ] + %5640 = phi float [ %7017, %__nv_exp2f.exit1380 ], [ %5492, %5375 ] + %5641 = phi float [ %7018, %__nv_exp2f.exit1380 ], [ %5493, %5375 ] + %5642 = phi float [ %7019, %__nv_exp2f.exit1380 ], [ %5494, %5375 ] + %5643 = phi float [ %7020, %__nv_exp2f.exit1380 ], [ %5495, %5375 ] + %5644 = phi float [ %7021, %__nv_exp2f.exit1380 ], [ %5496, %5375 ] + %5645 = phi float [ %7022, %__nv_exp2f.exit1380 ], [ %5497, %5375 ] + %5646 = phi float [ %7023, %__nv_exp2f.exit1380 ], [ %5498, %5375 ] + %5647 = phi float [ %7024, %__nv_exp2f.exit1380 ], [ %5499, %5375 ] + %5648 = phi float [ %7025, %__nv_exp2f.exit1380 ], [ %5500, %5375 ] + %5649 = phi float [ %7026, %__nv_exp2f.exit1380 ], [ %5501, %5375 ] + %5650 = phi float [ %7027, %__nv_exp2f.exit1380 ], [ %5502, %5375 ] + %5651 = phi float [ %7028, %__nv_exp2f.exit1380 ], [ %5503, %5375 ] + %5652 = phi float [ %7821, %__nv_exp2f.exit1380 ], [ %5376, %5375 ] + %5653 = phi float [ %7822, %__nv_exp2f.exit1380 ], [ %5377, %5375 ] + %5654 = phi float [ %7823, %__nv_exp2f.exit1380 ], [ %5378, %5375 ] + %5655 = phi float [ %7824, %__nv_exp2f.exit1380 ], [ %5379, %5375 ] + %5656 = phi float [ %7825, %__nv_exp2f.exit1380 ], [ %5380, %5375 ] + %5657 = phi float [ %7826, %__nv_exp2f.exit1380 ], [ %5381, %5375 ] + %5658 = phi float [ %7827, %__nv_exp2f.exit1380 ], [ %5382, %5375 ] + %5659 = phi float [ %7828, %__nv_exp2f.exit1380 ], [ %5383, %5375 ] + %5660 = phi float [ %7829, %__nv_exp2f.exit1380 ], [ %5384, %5375 ] + %5661 = phi float [ %7830, %__nv_exp2f.exit1380 ], [ %5385, %5375 ] + %5662 = phi float [ %7831, %__nv_exp2f.exit1380 ], [ %5386, %5375 ] + %5663 = phi float [ %7832, %__nv_exp2f.exit1380 ], [ %5387, %5375 ] + %5664 = phi float [ %7833, %__nv_exp2f.exit1380 ], [ %5388, %5375 ] + %5665 = phi float [ %7834, %__nv_exp2f.exit1380 ], [ %5389, %5375 ] + %5666 = phi float [ %7835, %__nv_exp2f.exit1380 ], [ %5390, %5375 ] + %5667 = phi float [ %7836, %__nv_exp2f.exit1380 ], [ %5391, %5375 ] + %5668 = phi float [ %7837, %__nv_exp2f.exit1380 ], [ %5392, %5375 ] + %5669 = phi float [ %7838, %__nv_exp2f.exit1380 ], [ %5393, %5375 ] + %5670 = phi float [ %7839, %__nv_exp2f.exit1380 ], [ %5394, %5375 ] + %5671 = phi float [ %7840, %__nv_exp2f.exit1380 ], [ %5395, %5375 ] + %5672 = phi float [ %7841, %__nv_exp2f.exit1380 ], [ %5396, %5375 ] + %5673 = phi float [ %7842, %__nv_exp2f.exit1380 ], [ %5397, %5375 ] + %5674 = phi float [ %7843, %__nv_exp2f.exit1380 ], [ %5398, %5375 ] + %5675 = phi float [ %7844, %__nv_exp2f.exit1380 ], [ %5399, %5375 ] + %5676 = phi float [ %7845, %__nv_exp2f.exit1380 ], [ %5400, %5375 ] + %5677 = phi float [ %7846, %__nv_exp2f.exit1380 ], [ %5401, %5375 ] + %5678 = phi float [ %7847, %__nv_exp2f.exit1380 ], [ %5402, %5375 ] + %5679 = phi float [ %7848, %__nv_exp2f.exit1380 ], [ %5403, %5375 ] + %5680 = phi float [ %7849, %__nv_exp2f.exit1380 ], [ %5404, %5375 ] + %5681 = phi float [ %7850, %__nv_exp2f.exit1380 ], [ %5405, %5375 ] + %5682 = phi float [ %7851, %__nv_exp2f.exit1380 ], [ %5406, %5375 ] + %5683 = phi float [ %7852, %__nv_exp2f.exit1380 ], [ %5407, %5375 ] + %5684 = phi float [ %7853, %__nv_exp2f.exit1380 ], [ %5408, %5375 ] + %5685 = phi float [ %7854, %__nv_exp2f.exit1380 ], [ %5409, %5375 ] + %5686 = phi float [ %7855, %__nv_exp2f.exit1380 ], [ %5410, %5375 ] + %5687 = phi float [ %7856, %__nv_exp2f.exit1380 ], [ %5411, %5375 ] + %5688 = phi float [ %7857, %__nv_exp2f.exit1380 ], [ %5412, %5375 ] + %5689 = phi float [ %7858, %__nv_exp2f.exit1380 ], [ %5413, %5375 ] + %5690 = phi float [ %7859, %__nv_exp2f.exit1380 ], [ %5414, %5375 ] + %5691 = phi float [ %7860, %__nv_exp2f.exit1380 ], [ %5415, %5375 ] + %5692 = phi float [ %7861, %__nv_exp2f.exit1380 ], [ %5416, %5375 ] + %5693 = phi float [ %7862, %__nv_exp2f.exit1380 ], [ %5417, %5375 ] + %5694 = phi float [ %7863, %__nv_exp2f.exit1380 ], [ %5418, %5375 ] + %5695 = phi float [ %7864, %__nv_exp2f.exit1380 ], [ %5419, %5375 ] + %5696 = phi float [ %7865, %__nv_exp2f.exit1380 ], [ %5420, %5375 ] + %5697 = phi float [ %7866, %__nv_exp2f.exit1380 ], [ %5421, %5375 ] + %5698 = phi float [ %7867, %__nv_exp2f.exit1380 ], [ %5422, %5375 ] + %5699 = phi float [ %7868, %__nv_exp2f.exit1380 ], [ %5423, %5375 ] + %5700 = phi float [ %7869, %__nv_exp2f.exit1380 ], [ %5424, %5375 ] + %5701 = phi float [ %7870, %__nv_exp2f.exit1380 ], [ %5425, %5375 ] + %5702 = phi float [ %7871, %__nv_exp2f.exit1380 ], [ %5426, %5375 ] + %5703 = phi float [ %7872, %__nv_exp2f.exit1380 ], [ %5427, %5375 ] + %5704 = phi float [ %7873, %__nv_exp2f.exit1380 ], [ %5428, %5375 ] + %5705 = phi float [ %7874, %__nv_exp2f.exit1380 ], [ %5429, %5375 ] + %5706 = phi float [ %7875, %__nv_exp2f.exit1380 ], [ %5430, %5375 ] + %5707 = phi float [ %7876, %__nv_exp2f.exit1380 ], [ %5431, %5375 ] + %5708 = phi float [ %7877, %__nv_exp2f.exit1380 ], [ %5432, %5375 ] + %5709 = phi float [ %7878, %__nv_exp2f.exit1380 ], [ %5433, %5375 ] + %5710 = phi float [ %7879, %__nv_exp2f.exit1380 ], [ %5434, %5375 ] + %5711 = phi float [ %7880, %__nv_exp2f.exit1380 ], [ %5435, %5375 ] + %5712 = phi float [ %7881, %__nv_exp2f.exit1380 ], [ %5436, %5375 ] + %5713 = phi float [ %7882, %__nv_exp2f.exit1380 ], [ %5437, %5375 ] + %5714 = phi float [ %7883, %__nv_exp2f.exit1380 ], [ %5438, %5375 ] + %5715 = phi float [ %7884, %__nv_exp2f.exit1380 ], [ %5439, %5375 ] + %5716 = phi i32 [ %7897, %__nv_exp2f.exit1380 ], [ 0, %5375 ] + %5717 = phi <2 x i32> [ %7895, %__nv_exp2f.exit1380 ], [ %5356, %5375 ] + %5718 = phi <2 x i32> [ %7896, %__nv_exp2f.exit1380 ], [ %4944, %5375 ] + %5719 = icmp slt i32 %5716, %5182, !dbg !223 + %5720 = icmp slt i32 %5716, %5183, !dbg !223 + %5721 = add i32 %5576, 1, !dbg !223 + %5722 = icmp sgt i32 %5721, 1, !dbg !223 + %5723 = select i1 %5722, i32 0, i32 %5721, !dbg !223 + %5724 = add i32 %5578, 1, !dbg !223 + %5725 = icmp sgt i32 %5724, 2, !dbg !223 + %5726 = select i1 %5725, i32 0, i32 %5724, !dbg !223 + %5727 = icmp slt i32 %.pn2571660, 2048, !dbg !267 + %5728 = icmp slt i32 %.pn2531661, 2048, !dbg !267 + %5729 = icmp slt i32 %.pn2491662, 2048, !dbg !267 + %5730 = icmp slt i32 %.pn2451663, 2048, !dbg !267 + %5731 = icmp slt i32 %.pn2411664, 2048, !dbg !267 + %5732 = icmp slt i32 %.pn2371665, 2048, !dbg !267 + %5733 = icmp slt i32 %.pn2331666, 2048, !dbg !267 + %5734 = icmp slt i32 %.pn2291667, 2048, !dbg !267 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !259 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !259 + %5735 = shl i32 %5726, 13, !dbg !259 + %5736 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %5735, !dbg !259 + %5737 = shl i32 %5723, 6, !dbg !261 + %5738 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %5737, !dbg !261 + %5739 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5065, !dbg !261 + %5740 = load float, ptr addrspace(3) %5739, align 8, !dbg !261 + %5741 = getelementptr inbounds nuw i8, ptr addrspace(3) %5739, i32 4, !dbg !261 + %5742 = load float, ptr addrspace(3) %5741, align 4, !dbg !261 + %5743 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5068, !dbg !261 + %5744 = load float, ptr addrspace(3) %5743, align 8, !dbg !261 + %5745 = getelementptr inbounds nuw i8, ptr addrspace(3) %5743, i32 4, !dbg !261 + %5746 = load float, ptr addrspace(3) %5745, align 4, !dbg !261 + %5747 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5071, !dbg !261 + %5748 = load float, ptr addrspace(3) %5747, align 8, !dbg !261 + %5749 = getelementptr inbounds nuw i8, ptr addrspace(3) %5747, i32 4, !dbg !261 + %5750 = load float, ptr addrspace(3) %5749, align 4, !dbg !261 + %5751 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5074, !dbg !261 + %5752 = load float, ptr addrspace(3) %5751, align 8, !dbg !261 + %5753 = getelementptr inbounds nuw i8, ptr addrspace(3) %5751, i32 4, !dbg !261 + %5754 = load float, ptr addrspace(3) %5753, align 4, !dbg !261 + %5755 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5077, !dbg !261 + %5756 = load float, ptr addrspace(3) %5755, align 8, !dbg !261 + %5757 = getelementptr inbounds nuw i8, ptr addrspace(3) %5755, i32 4, !dbg !261 + %5758 = load float, ptr addrspace(3) %5757, align 4, !dbg !261 + %5759 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5080, !dbg !261 + %5760 = load float, ptr addrspace(3) %5759, align 8, !dbg !261 + %5761 = getelementptr inbounds nuw i8, ptr addrspace(3) %5759, i32 4, !dbg !261 + %5762 = load float, ptr addrspace(3) %5761, align 4, !dbg !261 + %5763 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5083, !dbg !261 + %5764 = load float, ptr addrspace(3) %5763, align 8, !dbg !261 + %5765 = getelementptr inbounds nuw i8, ptr addrspace(3) %5763, i32 4, !dbg !261 + %5766 = load float, ptr addrspace(3) %5765, align 4, !dbg !261 + %5767 = getelementptr inbounds nuw i8, ptr addrspace(3) %5738, i32 %5086, !dbg !261 + %5768 = load float, ptr addrspace(3) %5767, align 8, !dbg !261 + %5769 = getelementptr inbounds nuw i8, ptr addrspace(3) %5767, i32 4, !dbg !261 + %5770 = load float, ptr addrspace(3) %5769, align 4, !dbg !261 + %5771 = fcmp oeq float %5740, 0xFFF0000000000000, !dbg !268 + %5772 = fcmp oeq float %5742, 0xFFF0000000000000, !dbg !268 + %5773 = fcmp oeq float %5744, 0xFFF0000000000000, !dbg !268 + %5774 = fcmp oeq float %5746, 0xFFF0000000000000, !dbg !268 + %5775 = fcmp oeq float %5748, 0xFFF0000000000000, !dbg !268 + %5776 = fcmp oeq float %5750, 0xFFF0000000000000, !dbg !268 + %5777 = fcmp oeq float %5752, 0xFFF0000000000000, !dbg !268 + %5778 = fcmp oeq float %5754, 0xFFF0000000000000, !dbg !268 + %5779 = fcmp oeq float %5756, 0xFFF0000000000000, !dbg !268 + %5780 = fcmp oeq float %5758, 0xFFF0000000000000, !dbg !268 + %5781 = fcmp oeq float %5760, 0xFFF0000000000000, !dbg !268 + %5782 = fcmp oeq float %5762, 0xFFF0000000000000, !dbg !268 + %5783 = fcmp oeq float %5764, 0xFFF0000000000000, !dbg !268 + %5784 = fcmp oeq float %5766, 0xFFF0000000000000, !dbg !268 + %5785 = fcmp oeq float %5768, 0xFFF0000000000000, !dbg !268 + %5786 = fcmp oeq float %5770, 0xFFF0000000000000, !dbg !268 + %5787 = select i1 %5771, float 0.000000e+00, float %5740, !dbg !269 + %5788 = select i1 %5772, float 0.000000e+00, float %5742, !dbg !269 + %5789 = select i1 %5773, float 0.000000e+00, float %5744, !dbg !269 + %5790 = select i1 %5774, float 0.000000e+00, float %5746, !dbg !269 + %5791 = select i1 %5775, float 0.000000e+00, float %5748, !dbg !269 + %5792 = select i1 %5776, float 0.000000e+00, float %5750, !dbg !269 + %5793 = select i1 %5777, float 0.000000e+00, float %5752, !dbg !269 + %5794 = select i1 %5778, float 0.000000e+00, float %5754, !dbg !269 + %5795 = select i1 %5779, float 0.000000e+00, float %5756, !dbg !269 + %5796 = select i1 %5780, float 0.000000e+00, float %5758, !dbg !269 + %5797 = select i1 %5781, float 0.000000e+00, float %5760, !dbg !269 + %5798 = select i1 %5782, float 0.000000e+00, float %5762, !dbg !269 + %5799 = select i1 %5783, float 0.000000e+00, float %5764, !dbg !269 + %5800 = select i1 %5784, float 0.000000e+00, float %5766, !dbg !269 + %5801 = select i1 %5785, float 0.000000e+00, float %5768, !dbg !269 + %5802 = select i1 %5786, float 0.000000e+00, float %5770, !dbg !269 + %5803 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %44, i32 0, i32 31), !dbg !241 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !241 + %5804 = shl i32 %5803, 11, !dbg !241 + %5805 = and i32 %5804, 8192, !dbg !241 + %5806 = add i32 %5805, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5807 = lshr exact i32 %5806, 4, !dbg !241 + %5808 = and i32 %5807, 16383, !dbg !241 + %5809 = zext nneg i32 %5808 to i64, !dbg !241 + %5810 = or disjoint i64 %5809, 4611686293372403712, !dbg !241 + %5811 = ptrtoint ptr addrspace(3) %5736 to i32, !dbg !241 + %5812 = lshr exact i32 %5811, 4, !dbg !241 + %5813 = and i32 %5812, 16383, !dbg !241 + %5814 = zext nneg i32 %5813 to i64, !dbg !241 + %5815 = or disjoint i64 %5814, 4611686293338849280, !dbg !241 + %5816 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %5810, i64 %5815) #3, !dbg !241 + %5817 = or disjoint i32 %5805, 32, !dbg !241 + %5818 = add i32 %5817, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5819 = lshr exact i32 %5818, 4, !dbg !241 + %5820 = and i32 %5819, 16383, !dbg !241 + %5821 = zext nneg i32 %5820 to i64, !dbg !241 + %5822 = or disjoint i64 %5821, 4611686293372403712, !dbg !241 + %5823 = add i32 %5811, 32, !dbg !241 + %5824 = lshr exact i32 %5823, 4, !dbg !241 + %5825 = and i32 %5824, 16383, !dbg !241 + %5826 = zext nneg i32 %5825 to i64, !dbg !241 + %5827 = or disjoint i64 %5826, 4611686293338849280, !dbg !241 + %5828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 0, !dbg !241 + %5829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 1, !dbg !241 + %5830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 2, !dbg !241 + %5831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 3, !dbg !241 + %5832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 4, !dbg !241 + %5833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 5, !dbg !241 + %5834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 6, !dbg !241 + %5835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 7, !dbg !241 + %5836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 8, !dbg !241 + %5837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 9, !dbg !241 + %5838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 10, !dbg !241 + %5839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 11, !dbg !241 + %5840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 12, !dbg !241 + %5841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 13, !dbg !241 + %5842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 14, !dbg !241 + %5843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 15, !dbg !241 + %5844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 16, !dbg !241 + %5845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 17, !dbg !241 + %5846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 18, !dbg !241 + %5847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 19, !dbg !241 + %5848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 20, !dbg !241 + %5849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 21, !dbg !241 + %5850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 22, !dbg !241 + %5851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 23, !dbg !241 + %5852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 24, !dbg !241 + %5853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 25, !dbg !241 + %5854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 26, !dbg !241 + %5855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 27, !dbg !241 + %5856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 28, !dbg !241 + %5857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 29, !dbg !241 + %5858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 30, !dbg !241 + %5859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5816, 31, !dbg !241 + %5860 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5828, float %5829, float %5830, float %5831, float %5832, float %5833, float %5834, float %5835, float %5836, float %5837, float %5838, float %5839, float %5840, float %5841, float %5842, float %5843, float %5844, float %5845, float %5846, float %5847, float %5848, float %5849, float %5850, float %5851, float %5852, float %5853, float %5854, float %5855, float %5856, float %5857, float %5858, float %5859, i64 %5822, i64 %5827, i1 true) #3, !dbg !241 + %5861 = or disjoint i32 %5805, 64, !dbg !241 + %5862 = add i32 %5861, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5863 = lshr exact i32 %5862, 4, !dbg !241 + %5864 = and i32 %5863, 16383, !dbg !241 + %5865 = zext nneg i32 %5864 to i64, !dbg !241 + %5866 = or disjoint i64 %5865, 4611686293372403712, !dbg !241 + %5867 = add i32 %5811, 64, !dbg !241 + %5868 = lshr exact i32 %5867, 4, !dbg !241 + %5869 = and i32 %5868, 16383, !dbg !241 + %5870 = zext nneg i32 %5869 to i64, !dbg !241 + %5871 = or disjoint i64 %5870, 4611686293338849280, !dbg !241 + %5872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 0, !dbg !241 + %5873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 1, !dbg !241 + %5874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 2, !dbg !241 + %5875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 3, !dbg !241 + %5876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 4, !dbg !241 + %5877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 5, !dbg !241 + %5878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 6, !dbg !241 + %5879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 7, !dbg !241 + %5880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 8, !dbg !241 + %5881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 9, !dbg !241 + %5882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 10, !dbg !241 + %5883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 11, !dbg !241 + %5884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 12, !dbg !241 + %5885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 13, !dbg !241 + %5886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 14, !dbg !241 + %5887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 15, !dbg !241 + %5888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 16, !dbg !241 + %5889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 17, !dbg !241 + %5890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 18, !dbg !241 + %5891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 19, !dbg !241 + %5892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 20, !dbg !241 + %5893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 21, !dbg !241 + %5894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 22, !dbg !241 + %5895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 23, !dbg !241 + %5896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 24, !dbg !241 + %5897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 25, !dbg !241 + %5898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 26, !dbg !241 + %5899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 27, !dbg !241 + %5900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 28, !dbg !241 + %5901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 29, !dbg !241 + %5902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 30, !dbg !241 + %5903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5860, 31, !dbg !241 + %5904 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5872, float %5873, float %5874, float %5875, float %5876, float %5877, float %5878, float %5879, float %5880, float %5881, float %5882, float %5883, float %5884, float %5885, float %5886, float %5887, float %5888, float %5889, float %5890, float %5891, float %5892, float %5893, float %5894, float %5895, float %5896, float %5897, float %5898, float %5899, float %5900, float %5901, float %5902, float %5903, i64 %5866, i64 %5871, i1 true) #3, !dbg !241 + %5905 = or disjoint i32 %5805, 96, !dbg !241 + %5906 = add i32 %5905, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5907 = lshr exact i32 %5906, 4, !dbg !241 + %5908 = and i32 %5907, 16383, !dbg !241 + %5909 = zext nneg i32 %5908 to i64, !dbg !241 + %5910 = or disjoint i64 %5909, 4611686293372403712, !dbg !241 + %5911 = add i32 %5811, 96, !dbg !241 + %5912 = lshr exact i32 %5911, 4, !dbg !241 + %5913 = and i32 %5912, 16383, !dbg !241 + %5914 = zext nneg i32 %5913 to i64, !dbg !241 + %5915 = or disjoint i64 %5914, 4611686293338849280, !dbg !241 + %5916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 0, !dbg !241 + %5917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 1, !dbg !241 + %5918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 2, !dbg !241 + %5919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 3, !dbg !241 + %5920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 4, !dbg !241 + %5921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 5, !dbg !241 + %5922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 6, !dbg !241 + %5923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 7, !dbg !241 + %5924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 8, !dbg !241 + %5925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 9, !dbg !241 + %5926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 10, !dbg !241 + %5927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 11, !dbg !241 + %5928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 12, !dbg !241 + %5929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 13, !dbg !241 + %5930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 14, !dbg !241 + %5931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 15, !dbg !241 + %5932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 16, !dbg !241 + %5933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 17, !dbg !241 + %5934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 18, !dbg !241 + %5935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 19, !dbg !241 + %5936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 20, !dbg !241 + %5937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 21, !dbg !241 + %5938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 22, !dbg !241 + %5939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 23, !dbg !241 + %5940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 24, !dbg !241 + %5941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 25, !dbg !241 + %5942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 26, !dbg !241 + %5943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 27, !dbg !241 + %5944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 28, !dbg !241 + %5945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 29, !dbg !241 + %5946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 30, !dbg !241 + %5947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5904, 31, !dbg !241 + %5948 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5916, float %5917, float %5918, float %5919, float %5920, float %5921, float %5922, float %5923, float %5924, float %5925, float %5926, float %5927, float %5928, float %5929, float %5930, float %5931, float %5932, float %5933, float %5934, float %5935, float %5936, float %5937, float %5938, float %5939, float %5940, float %5941, float %5942, float %5943, float %5944, float %5945, float %5946, float %5947, i64 %5910, i64 %5915, i1 true) #3, !dbg !241 + %5949 = or disjoint i32 %5805, 16384, !dbg !241 + %5950 = add i32 %5949, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5951 = lshr exact i32 %5950, 4, !dbg !241 + %5952 = and i32 %5951, 16383, !dbg !241 + %5953 = zext nneg i32 %5952 to i64, !dbg !241 + %5954 = or disjoint i64 %5953, 4611686293372403712, !dbg !241 + %5955 = add i32 %5811, 8192, !dbg !241 + %5956 = lshr exact i32 %5955, 4, !dbg !241 + %5957 = and i32 %5956, 16383, !dbg !241 + %5958 = zext nneg i32 %5957 to i64, !dbg !241 + %5959 = or disjoint i64 %5958, 4611686293338849280, !dbg !241 + %5960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 0, !dbg !241 + %5961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 1, !dbg !241 + %5962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 2, !dbg !241 + %5963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 3, !dbg !241 + %5964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 4, !dbg !241 + %5965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 5, !dbg !241 + %5966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 6, !dbg !241 + %5967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 7, !dbg !241 + %5968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 8, !dbg !241 + %5969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 9, !dbg !241 + %5970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 10, !dbg !241 + %5971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 11, !dbg !241 + %5972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 12, !dbg !241 + %5973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 13, !dbg !241 + %5974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 14, !dbg !241 + %5975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 15, !dbg !241 + %5976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 16, !dbg !241 + %5977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 17, !dbg !241 + %5978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 18, !dbg !241 + %5979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 19, !dbg !241 + %5980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 20, !dbg !241 + %5981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 21, !dbg !241 + %5982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 22, !dbg !241 + %5983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 23, !dbg !241 + %5984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 24, !dbg !241 + %5985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 25, !dbg !241 + %5986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 26, !dbg !241 + %5987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 27, !dbg !241 + %5988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 28, !dbg !241 + %5989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 29, !dbg !241 + %5990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 30, !dbg !241 + %5991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5948, 31, !dbg !241 + %5992 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %5960, float %5961, float %5962, float %5963, float %5964, float %5965, float %5966, float %5967, float %5968, float %5969, float %5970, float %5971, float %5972, float %5973, float %5974, float %5975, float %5976, float %5977, float %5978, float %5979, float %5980, float %5981, float %5982, float %5983, float %5984, float %5985, float %5986, float %5987, float %5988, float %5989, float %5990, float %5991, i64 %5954, i64 %5959, i1 true) #3, !dbg !241 + %5993 = or disjoint i32 %5805, 16416, !dbg !241 + %5994 = add i32 %5993, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %5995 = lshr exact i32 %5994, 4, !dbg !241 + %5996 = and i32 %5995, 16383, !dbg !241 + %5997 = zext nneg i32 %5996 to i64, !dbg !241 + %5998 = or disjoint i64 %5997, 4611686293372403712, !dbg !241 + %5999 = add i32 %5811, 8224, !dbg !241 + %6000 = lshr exact i32 %5999, 4, !dbg !241 + %6001 = and i32 %6000, 16383, !dbg !241 + %6002 = zext nneg i32 %6001 to i64, !dbg !241 + %6003 = or disjoint i64 %6002, 4611686293338849280, !dbg !241 + %6004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 0, !dbg !241 + %6005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 1, !dbg !241 + %6006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 2, !dbg !241 + %6007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 3, !dbg !241 + %6008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 4, !dbg !241 + %6009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 5, !dbg !241 + %6010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 6, !dbg !241 + %6011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 7, !dbg !241 + %6012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 8, !dbg !241 + %6013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 9, !dbg !241 + %6014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 10, !dbg !241 + %6015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 11, !dbg !241 + %6016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 12, !dbg !241 + %6017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 13, !dbg !241 + %6018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 14, !dbg !241 + %6019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 15, !dbg !241 + %6020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 16, !dbg !241 + %6021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 17, !dbg !241 + %6022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 18, !dbg !241 + %6023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 19, !dbg !241 + %6024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 20, !dbg !241 + %6025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 21, !dbg !241 + %6026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 22, !dbg !241 + %6027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 23, !dbg !241 + %6028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 24, !dbg !241 + %6029 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 25, !dbg !241 + %6030 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 26, !dbg !241 + %6031 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 27, !dbg !241 + %6032 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 28, !dbg !241 + %6033 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 29, !dbg !241 + %6034 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 30, !dbg !241 + %6035 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %5992, 31, !dbg !241 + %6036 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6004, float %6005, float %6006, float %6007, float %6008, float %6009, float %6010, float %6011, float %6012, float %6013, float %6014, float %6015, float %6016, float %6017, float %6018, float %6019, float %6020, float %6021, float %6022, float %6023, float %6024, float %6025, float %6026, float %6027, float %6028, float %6029, float %6030, float %6031, float %6032, float %6033, float %6034, float %6035, i64 %5998, i64 %6003, i1 true) #3, !dbg !241 + %6037 = or disjoint i32 %5805, 16448, !dbg !241 + %6038 = add i32 %6037, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %6039 = lshr exact i32 %6038, 4, !dbg !241 + %6040 = and i32 %6039, 16383, !dbg !241 + %6041 = zext nneg i32 %6040 to i64, !dbg !241 + %6042 = or disjoint i64 %6041, 4611686293372403712, !dbg !241 + %6043 = add i32 %5811, 8256, !dbg !241 + %6044 = lshr exact i32 %6043, 4, !dbg !241 + %6045 = and i32 %6044, 16383, !dbg !241 + %6046 = zext nneg i32 %6045 to i64, !dbg !241 + %6047 = or disjoint i64 %6046, 4611686293338849280, !dbg !241 + %6048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 0, !dbg !241 + %6049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 1, !dbg !241 + %6050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 2, !dbg !241 + %6051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 3, !dbg !241 + %6052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 4, !dbg !241 + %6053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 5, !dbg !241 + %6054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 6, !dbg !241 + %6055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 7, !dbg !241 + %6056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 8, !dbg !241 + %6057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 9, !dbg !241 + %6058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 10, !dbg !241 + %6059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 11, !dbg !241 + %6060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 12, !dbg !241 + %6061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 13, !dbg !241 + %6062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 14, !dbg !241 + %6063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 15, !dbg !241 + %6064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 16, !dbg !241 + %6065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 17, !dbg !241 + %6066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 18, !dbg !241 + %6067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 19, !dbg !241 + %6068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 20, !dbg !241 + %6069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 21, !dbg !241 + %6070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 22, !dbg !241 + %6071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 23, !dbg !241 + %6072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 24, !dbg !241 + %6073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 25, !dbg !241 + %6074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 26, !dbg !241 + %6075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 27, !dbg !241 + %6076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 28, !dbg !241 + %6077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 29, !dbg !241 + %6078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 30, !dbg !241 + %6079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6036, 31, !dbg !241 + %6080 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6048, float %6049, float %6050, float %6051, float %6052, float %6053, float %6054, float %6055, float %6056, float %6057, float %6058, float %6059, float %6060, float %6061, float %6062, float %6063, float %6064, float %6065, float %6066, float %6067, float %6068, float %6069, float %6070, float %6071, float %6072, float %6073, float %6074, float %6075, float %6076, float %6077, float %6078, float %6079, i64 %6042, i64 %6047, i1 true) #3, !dbg !241 + %6081 = or disjoint i32 %5805, 16480, !dbg !241 + %6082 = add i32 %6081, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !241 + %6083 = lshr exact i32 %6082, 4, !dbg !241 + %6084 = and i32 %6083, 16383, !dbg !241 + %6085 = zext nneg i32 %6084 to i64, !dbg !241 + %6086 = or disjoint i64 %6085, 4611686293372403712, !dbg !241 + %6087 = add i32 %5811, 8288, !dbg !241 + %6088 = lshr exact i32 %6087, 4, !dbg !241 + %6089 = and i32 %6088, 16383, !dbg !241 + %6090 = zext nneg i32 %6089 to i64, !dbg !241 + %6091 = or disjoint i64 %6090, 4611686293338849280, !dbg !241 + %6092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 0, !dbg !241 + %6093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 1, !dbg !241 + %6094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 2, !dbg !241 + %6095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 3, !dbg !241 + %6096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 4, !dbg !241 + %6097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 5, !dbg !241 + %6098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 6, !dbg !241 + %6099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 7, !dbg !241 + %6100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 8, !dbg !241 + %6101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 9, !dbg !241 + %6102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 10, !dbg !241 + %6103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 11, !dbg !241 + %6104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 12, !dbg !241 + %6105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 13, !dbg !241 + %6106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 14, !dbg !241 + %6107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 15, !dbg !241 + %6108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 16, !dbg !241 + %6109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 17, !dbg !241 + %6110 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 18, !dbg !241 + %6111 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 19, !dbg !241 + %6112 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 20, !dbg !241 + %6113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 21, !dbg !241 + %6114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 22, !dbg !241 + %6115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 23, !dbg !241 + %6116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 24, !dbg !241 + %6117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 25, !dbg !241 + %6118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 26, !dbg !241 + %6119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 27, !dbg !241 + %6120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 28, !dbg !241 + %6121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 29, !dbg !241 + %6122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 30, !dbg !241 + %6123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6080, 31, !dbg !241 + %6124 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %6092, float %6093, float %6094, float %6095, float %6096, float %6097, float %6098, float %6099, float %6100, float %6101, float %6102, float %6103, float %6104, float %6105, float %6106, float %6107, float %6108, float %6109, float %6110, float %6111, float %6112, float %6113, float %6114, float %6115, float %6116, float %6117, float %6118, float %6119, float %6120, float %6121, float %6122, float %6123, i64 %6086, i64 %6091, i1 true) #3, !dbg !241 + %6125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 0, !dbg !241 + %6126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 1, !dbg !241 + %6127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 2, !dbg !241 + %6128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 3, !dbg !241 + %6129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 4, !dbg !241 + %6130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 5, !dbg !241 + %6131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 6, !dbg !241 + %6132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 7, !dbg !241 + %6133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 8, !dbg !241 + %6134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 9, !dbg !241 + %6135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 10, !dbg !241 + %6136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 11, !dbg !241 + %6137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 12, !dbg !241 + %6138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 13, !dbg !241 + %6139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 14, !dbg !241 + %6140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 15, !dbg !241 + %6141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 16, !dbg !241 + %6142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 17, !dbg !241 + %6143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 18, !dbg !241 + %6144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 19, !dbg !241 + %6145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 20, !dbg !241 + %6146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 21, !dbg !241 + %6147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 22, !dbg !241 + %6148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 23, !dbg !241 + %6149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 24, !dbg !241 + %6150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 25, !dbg !241 + %6151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 26, !dbg !241 + %6152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 27, !dbg !241 + %6153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 28, !dbg !241 + %6154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 29, !dbg !241 + %6155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 30, !dbg !241 + %6156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6124, 31, !dbg !241 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !241 + %6157 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %6125, float %6126, float %6127, float %6128, float %6129, float %6130, float %6131, float %6132, float %6133, float %6134, float %6135, float %6136, float %6137, float %6138, float %6139, float %6140, float %6141, float %6142, float %6143, float %6144, float %6145, float %6146, float %6147, float %6148, float %6149, float %6150, float %6151, float %6152, float %6153, float %6154, float %6155, float %6156, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 0, i32 0, ptr addrspace(3) %5736, i32 0, i32 0) #3, !dbg !241 + %6158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 0, !dbg !241 + %6159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 1, !dbg !241 + %6160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 2, !dbg !241 + %6161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 3, !dbg !241 + %6162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 4, !dbg !241 + %6163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 5, !dbg !241 + %6164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 6, !dbg !241 + %6165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 7, !dbg !241 + %6166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 8, !dbg !241 + %6167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 9, !dbg !241 + %6168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 10, !dbg !241 + %6169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 11, !dbg !241 + %6170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 12, !dbg !241 + %6171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 13, !dbg !241 + %6172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 14, !dbg !241 + %6173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 15, !dbg !241 + %6174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 16, !dbg !241 + %6175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 17, !dbg !241 + %6176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 18, !dbg !241 + %6177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 19, !dbg !241 + %6178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 20, !dbg !241 + %6179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 21, !dbg !241 + %6180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 22, !dbg !241 + %6181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 23, !dbg !241 + %6182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 24, !dbg !241 + %6183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 25, !dbg !241 + %6184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 26, !dbg !241 + %6185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 27, !dbg !241 + %6186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 28, !dbg !241 + %6187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 29, !dbg !241 + %6188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 30, !dbg !241 + %6189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %6157, 31, !dbg !241 + %6190 = fmul float %6158, 0x3FB6A09E60000000, !dbg !270 + %6191 = fmul float %6159, 0x3FB6A09E60000000, !dbg !270 + %6192 = fmul float %6160, 0x3FB6A09E60000000, !dbg !270 + %6193 = fmul float %6161, 0x3FB6A09E60000000, !dbg !270 + %6194 = fmul float %6162, 0x3FB6A09E60000000, !dbg !270 + %6195 = fmul float %6163, 0x3FB6A09E60000000, !dbg !270 + %6196 = fmul float %6164, 0x3FB6A09E60000000, !dbg !270 + %6197 = fmul float %6165, 0x3FB6A09E60000000, !dbg !270 + %6198 = fmul float %6166, 0x3FB6A09E60000000, !dbg !270 + %6199 = fmul float %6167, 0x3FB6A09E60000000, !dbg !270 + %6200 = fmul float %6168, 0x3FB6A09E60000000, !dbg !270 + %6201 = fmul float %6169, 0x3FB6A09E60000000, !dbg !270 + %6202 = fmul float %6170, 0x3FB6A09E60000000, !dbg !270 + %6203 = fmul float %6171, 0x3FB6A09E60000000, !dbg !270 + %6204 = fmul float %6172, 0x3FB6A09E60000000, !dbg !270 + %6205 = fmul float %6173, 0x3FB6A09E60000000, !dbg !270 + %6206 = fmul float %6174, 0x3FB6A09E60000000, !dbg !270 + %6207 = fmul float %6175, 0x3FB6A09E60000000, !dbg !270 + %6208 = fmul float %6176, 0x3FB6A09E60000000, !dbg !270 + %6209 = fmul float %6177, 0x3FB6A09E60000000, !dbg !270 + %6210 = fmul float %6178, 0x3FB6A09E60000000, !dbg !270 + %6211 = fmul float %6179, 0x3FB6A09E60000000, !dbg !270 + %6212 = fmul float %6180, 0x3FB6A09E60000000, !dbg !270 + %6213 = fmul float %6181, 0x3FB6A09E60000000, !dbg !270 + %6214 = fmul float %6182, 0x3FB6A09E60000000, !dbg !270 + %6215 = fmul float %6183, 0x3FB6A09E60000000, !dbg !270 + %6216 = fmul float %6184, 0x3FB6A09E60000000, !dbg !270 + %6217 = fmul float %6185, 0x3FB6A09E60000000, !dbg !270 + %6218 = fmul float %6186, 0x3FB6A09E60000000, !dbg !270 + %6219 = fmul float %6187, 0x3FB6A09E60000000, !dbg !270 + %6220 = fmul float %6188, 0x3FB6A09E60000000, !dbg !270 + %6221 = fmul float %6189, 0x3FB6A09E60000000, !dbg !270 + %6222 = srem <2 x i32> %5717, splat (i32 2048), !dbg !225 + %6223 = srem <2 x i32> %5718, splat (i32 2048), !dbg !225 + %6224 = icmp sge <2 x i32> %6222, %5357, !dbg !271 + %6225 = sext <2 x i32> %6222 to <2 x i64>, !dbg !272 + %6226 = icmp sgt <2 x i64> %5359, %6225, !dbg !273 + %6227 = and <2 x i1> %6224, %6226, !dbg !274 + %6228 = sub <2 x i32> %5357, %6222, !dbg !275 + %6229 = and <2 x i32> %6228, splat (i32 2047), !dbg !276 + %6230 = icmp eq <2 x i32> %6229, zeroinitializer, !dbg !277 + %6231 = and <2 x i1> %5360, %6230, !dbg !278 + %6232 = or <2 x i1> %6227, %6231, !dbg !279 + %6233 = icmp sge <2 x i32> %6223, %5357, !dbg !271 + %6234 = sext <2 x i32> %6223 to <2 x i64>, !dbg !272 + %6235 = icmp sgt <2 x i64> %5359, %6234, !dbg !273 + %6236 = and <2 x i1> %6233, %6235, !dbg !274 + %6237 = sub <2 x i32> %5357, %6223, !dbg !275 + %6238 = and <2 x i32> %6237, splat (i32 2047), !dbg !276 + %6239 = icmp eq <2 x i32> %6238, zeroinitializer, !dbg !277 + %6240 = and <2 x i1> %5360, %6239, !dbg !278 + %6241 = or <2 x i1> %6236, %6240, !dbg !279 + %6242 = icmp sge <2 x i32> %6222, %5361, !dbg !271 + %6243 = sub <2 x i32> %5361, %6222, !dbg !275 + %6244 = and <2 x i32> %6243, splat (i32 2047), !dbg !276 + %6245 = icmp eq <2 x i32> %6244, zeroinitializer, !dbg !277 + %6246 = and <2 x i1> %5362, %6245, !dbg !278 + %6247 = and <2 x i1> %6242, %6226, !dbg !274 + %6248 = or <2 x i1> %6247, %6246, !dbg !279 + %6249 = icmp sge <2 x i32> %6223, %5361, !dbg !271 + %6250 = sub <2 x i32> %5361, %6223, !dbg !275 + %6251 = and <2 x i32> %6250, splat (i32 2047), !dbg !276 + %6252 = icmp eq <2 x i32> %6251, zeroinitializer, !dbg !277 + %6253 = and <2 x i1> %5362, %6252, !dbg !278 + %6254 = and <2 x i1> %6249, %6235, !dbg !274 + %6255 = or <2 x i1> %6254, %6253, !dbg !279 + %6256 = icmp sge <2 x i32> %6222, %5363, !dbg !271 + %6257 = sub <2 x i32> %5363, %6222, !dbg !275 + %6258 = and <2 x i32> %6257, splat (i32 2047), !dbg !276 + %6259 = icmp eq <2 x i32> %6258, zeroinitializer, !dbg !277 + %6260 = and <2 x i1> %5364, %6259, !dbg !278 + %6261 = and <2 x i1> %6256, %6226, !dbg !274 + %6262 = or <2 x i1> %6261, %6260, !dbg !279 + %6263 = icmp sge <2 x i32> %6223, %5363, !dbg !271 + %6264 = sub <2 x i32> %5363, %6223, !dbg !275 + %6265 = and <2 x i32> %6264, splat (i32 2047), !dbg !276 + %6266 = icmp eq <2 x i32> %6265, zeroinitializer, !dbg !277 + %6267 = and <2 x i1> %5364, %6266, !dbg !278 + %6268 = and <2 x i1> %6263, %6235, !dbg !274 + %6269 = or <2 x i1> %6268, %6267, !dbg !279 + %6270 = icmp sge <2 x i32> %6222, %5365, !dbg !271 + %6271 = sub <2 x i32> %5365, %6222, !dbg !275 + %6272 = and <2 x i32> %6271, splat (i32 2047), !dbg !276 + %6273 = icmp eq <2 x i32> %6272, zeroinitializer, !dbg !277 + %6274 = and <2 x i1> %5366, %6273, !dbg !278 + %6275 = and <2 x i1> %6270, %6226, !dbg !274 + %6276 = or <2 x i1> %6275, %6274, !dbg !279 + %6277 = icmp sge <2 x i32> %6223, %5365, !dbg !271 + %6278 = sub <2 x i32> %5365, %6223, !dbg !275 + %6279 = and <2 x i32> %6278, splat (i32 2047), !dbg !276 + %6280 = icmp eq <2 x i32> %6279, zeroinitializer, !dbg !277 + %6281 = and <2 x i1> %5366, %6280, !dbg !278 + %6282 = and <2 x i1> %6277, %6235, !dbg !274 + %6283 = or <2 x i1> %6282, %6281, !dbg !279 + %6284 = icmp sge <2 x i32> %6222, %5367, !dbg !271 + %6285 = sub <2 x i32> %5367, %6222, !dbg !275 + %6286 = and <2 x i32> %6285, splat (i32 2047), !dbg !276 + %6287 = icmp eq <2 x i32> %6286, zeroinitializer, !dbg !277 + %6288 = and <2 x i1> %5368, %6287, !dbg !278 + %6289 = and <2 x i1> %6284, %6226, !dbg !274 + %6290 = or <2 x i1> %6289, %6288, !dbg !279 + %6291 = icmp sge <2 x i32> %6223, %5367, !dbg !271 + %6292 = sub <2 x i32> %5367, %6223, !dbg !275 + %6293 = and <2 x i32> %6292, splat (i32 2047), !dbg !276 + %6294 = icmp eq <2 x i32> %6293, zeroinitializer, !dbg !277 + %6295 = and <2 x i1> %5368, %6294, !dbg !278 + %6296 = and <2 x i1> %6291, %6235, !dbg !274 + %6297 = or <2 x i1> %6296, %6295, !dbg !279 + %6298 = icmp sge <2 x i32> %6222, %5369, !dbg !271 + %6299 = sub <2 x i32> %5369, %6222, !dbg !275 + %6300 = and <2 x i32> %6299, splat (i32 2047), !dbg !276 + %6301 = icmp eq <2 x i32> %6300, zeroinitializer, !dbg !277 + %6302 = and <2 x i1> %5370, %6301, !dbg !278 + %6303 = and <2 x i1> %6298, %6226, !dbg !274 + %6304 = or <2 x i1> %6303, %6302, !dbg !279 + %6305 = icmp sge <2 x i32> %6223, %5369, !dbg !271 + %6306 = sub <2 x i32> %5369, %6223, !dbg !275 + %6307 = and <2 x i32> %6306, splat (i32 2047), !dbg !276 + %6308 = icmp eq <2 x i32> %6307, zeroinitializer, !dbg !277 + %6309 = and <2 x i1> %5370, %6308, !dbg !278 + %6310 = and <2 x i1> %6305, %6235, !dbg !274 + %6311 = or <2 x i1> %6310, %6309, !dbg !279 + %6312 = icmp sge <2 x i32> %6222, %5371, !dbg !271 + %6313 = sub <2 x i32> %5371, %6222, !dbg !275 + %6314 = and <2 x i32> %6313, splat (i32 2047), !dbg !276 + %6315 = icmp eq <2 x i32> %6314, zeroinitializer, !dbg !277 + %6316 = and <2 x i1> %5372, %6315, !dbg !278 + %6317 = and <2 x i1> %6312, %6226, !dbg !274 + %6318 = or <2 x i1> %6317, %6316, !dbg !279 + %6319 = icmp sge <2 x i32> %6223, %5371, !dbg !271 + %6320 = sub <2 x i32> %5371, %6223, !dbg !275 + %6321 = and <2 x i32> %6320, splat (i32 2047), !dbg !276 + %6322 = icmp eq <2 x i32> %6321, zeroinitializer, !dbg !277 + %6323 = and <2 x i1> %5372, %6322, !dbg !278 + %6324 = and <2 x i1> %6319, %6235, !dbg !274 + %6325 = or <2 x i1> %6324, %6323, !dbg !279 + %6326 = icmp sge <2 x i32> %6222, %5373, !dbg !271 + %6327 = sub <2 x i32> %5373, %6222, !dbg !275 + %6328 = and <2 x i32> %6327, splat (i32 2047), !dbg !276 + %6329 = icmp eq <2 x i32> %6328, zeroinitializer, !dbg !277 + %6330 = and <2 x i1> %5374, %6329, !dbg !278 + %6331 = and <2 x i1> %6326, %6226, !dbg !274 + %6332 = or <2 x i1> %6331, %6330, !dbg !279 + %6333 = icmp sge <2 x i32> %6223, %5373, !dbg !271 + %6334 = sub <2 x i32> %5373, %6223, !dbg !275 + %6335 = and <2 x i32> %6334, splat (i32 2047), !dbg !276 + %6336 = icmp eq <2 x i32> %6335, zeroinitializer, !dbg !277 + %6337 = and <2 x i1> %5374, %6336, !dbg !278 + %6338 = and <2 x i1> %6333, %6235, !dbg !274 + %6339 = or <2 x i1> %6338, %6337, !dbg !279 + %6340 = zext <2 x i1> %6232 to <2 x i8>, !dbg !279 + store <2 x i8> %6340, ptr addrspace(3) %5193, align 2, !dbg !279 + %6341 = zext <2 x i1> %6290 to <2 x i8>, !dbg !279 + store <2 x i8> %6341, ptr addrspace(3) %5194, align 2, !dbg !279 + %6342 = zext <2 x i1> %6241 to <2 x i8>, !dbg !279 + store <2 x i8> %6342, ptr addrspace(3) %5196, align 2, !dbg !279 + %6343 = zext <2 x i1> %6297 to <2 x i8>, !dbg !279 + store <2 x i8> %6343, ptr addrspace(3) %5197, align 2, !dbg !279 + %6344 = zext <2 x i1> %6248 to <2 x i8>, !dbg !279 + store <2 x i8> %6344, ptr addrspace(3) %5199, align 2, !dbg !279 + %6345 = zext <2 x i1> %6304 to <2 x i8>, !dbg !279 + store <2 x i8> %6345, ptr addrspace(3) %5200, align 2, !dbg !279 + %6346 = zext <2 x i1> %6255 to <2 x i8>, !dbg !279 + store <2 x i8> %6346, ptr addrspace(3) %5202, align 2, !dbg !279 + %6347 = zext <2 x i1> %6311 to <2 x i8>, !dbg !279 + store <2 x i8> %6347, ptr addrspace(3) %5203, align 2, !dbg !279 + %6348 = zext <2 x i1> %6262 to <2 x i8>, !dbg !279 + store <2 x i8> %6348, ptr addrspace(3) %5205, align 2, !dbg !279 + %6349 = zext <2 x i1> %6318 to <2 x i8>, !dbg !279 + store <2 x i8> %6349, ptr addrspace(3) %5206, align 2, !dbg !279 + %6350 = zext <2 x i1> %6269 to <2 x i8>, !dbg !279 + store <2 x i8> %6350, ptr addrspace(3) %5208, align 2, !dbg !279 + %6351 = zext <2 x i1> %6325 to <2 x i8>, !dbg !279 + store <2 x i8> %6351, ptr addrspace(3) %5209, align 2, !dbg !279 + %6352 = zext <2 x i1> %6276 to <2 x i8>, !dbg !279 + store <2 x i8> %6352, ptr addrspace(3) %5211, align 2, !dbg !279 + %6353 = zext <2 x i1> %6332 to <2 x i8>, !dbg !279 + store <2 x i8> %6353, ptr addrspace(3) %5212, align 2, !dbg !279 + %6354 = zext <2 x i1> %6283 to <2 x i8>, !dbg !279 + store <2 x i8> %6354, ptr addrspace(3) %5214, align 2, !dbg !279 + %6355 = zext <2 x i1> %6339 to <2 x i8>, !dbg !279 + store <2 x i8> %6355, ptr addrspace(3) %5215, align 2, !dbg !279 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !279 + %6356 = load <32 x i1>, ptr addrspace(3) %5228, align 4, !dbg !279 + %6357 = load <32 x i1>, ptr addrspace(3) %5230, align 4, !dbg !279 + %6358 = load <32 x i1>, ptr addrspace(3) %5232, align 4, !dbg !279 + %6359 = load <32 x i1>, ptr addrspace(3) %5234, align 4, !dbg !279 + %6360 = load <32 x i1>, ptr addrspace(3) %5236, align 4, !dbg !279 + %6361 = load <32 x i1>, ptr addrspace(3) %5238, align 4, !dbg !279 + %6362 = load <32 x i1>, ptr addrspace(3) %5240, align 4, !dbg !279 + %6363 = load <32 x i1>, ptr addrspace(3) %5242, align 4, !dbg !279 + %6364 = extractelement <32 x i1> %6356, i64 0, !dbg !279 + %6365 = extractelement <32 x i1> %6356, i64 8, !dbg !279 + %6366 = extractelement <32 x i1> %6356, i64 16, !dbg !279 + %6367 = extractelement <32 x i1> %6356, i64 24, !dbg !279 + %6368 = extractelement <32 x i1> %6357, i64 0, !dbg !279 + %6369 = extractelement <32 x i1> %6357, i64 8, !dbg !279 + %6370 = extractelement <32 x i1> %6357, i64 16, !dbg !279 + %6371 = extractelement <32 x i1> %6357, i64 24, !dbg !279 + %6372 = extractelement <32 x i1> %6358, i64 0, !dbg !279 + %6373 = extractelement <32 x i1> %6358, i64 8, !dbg !279 + %6374 = extractelement <32 x i1> %6358, i64 16, !dbg !279 + %6375 = extractelement <32 x i1> %6358, i64 24, !dbg !279 + %6376 = extractelement <32 x i1> %6359, i64 0, !dbg !279 + %6377 = extractelement <32 x i1> %6359, i64 8, !dbg !279 + %6378 = extractelement <32 x i1> %6359, i64 16, !dbg !279 + %6379 = extractelement <32 x i1> %6359, i64 24, !dbg !279 + %6380 = extractelement <32 x i1> %6360, i64 0, !dbg !279 + %6381 = extractelement <32 x i1> %6360, i64 8, !dbg !279 + %6382 = extractelement <32 x i1> %6360, i64 16, !dbg !279 + %6383 = extractelement <32 x i1> %6360, i64 24, !dbg !279 + %6384 = extractelement <32 x i1> %6361, i64 0, !dbg !279 + %6385 = extractelement <32 x i1> %6361, i64 8, !dbg !279 + %6386 = extractelement <32 x i1> %6361, i64 16, !dbg !279 + %6387 = extractelement <32 x i1> %6361, i64 24, !dbg !279 + %6388 = extractelement <32 x i1> %6362, i64 0, !dbg !279 + %6389 = extractelement <32 x i1> %6362, i64 8, !dbg !279 + %6390 = extractelement <32 x i1> %6362, i64 16, !dbg !279 + %6391 = extractelement <32 x i1> %6362, i64 24, !dbg !279 + %6392 = extractelement <32 x i1> %6363, i64 0, !dbg !279 + %6393 = extractelement <32 x i1> %6363, i64 8, !dbg !279 + %6394 = extractelement <32 x i1> %6363, i64 16, !dbg !279 + %6395 = extractelement <32 x i1> %6363, i64 24, !dbg !279 + %6396 = select i1 %6364, i1 %5727, i1 false, !dbg !280 + %6397 = select i1 %6365, i1 %5727, i1 false, !dbg !280 + %6398 = select i1 %6366, i1 %5727, i1 false, !dbg !280 + %6399 = select i1 %6367, i1 %5727, i1 false, !dbg !280 + %6400 = select i1 %6368, i1 %5728, i1 false, !dbg !280 + %6401 = select i1 %6369, i1 %5728, i1 false, !dbg !280 + %6402 = select i1 %6370, i1 %5728, i1 false, !dbg !280 + %6403 = select i1 %6371, i1 %5728, i1 false, !dbg !280 + %6404 = select i1 %6372, i1 %5729, i1 false, !dbg !280 + %6405 = select i1 %6373, i1 %5729, i1 false, !dbg !280 + %6406 = select i1 %6374, i1 %5729, i1 false, !dbg !280 + %6407 = select i1 %6375, i1 %5729, i1 false, !dbg !280 + %6408 = select i1 %6376, i1 %5730, i1 false, !dbg !280 + %6409 = select i1 %6377, i1 %5730, i1 false, !dbg !280 + %6410 = select i1 %6378, i1 %5730, i1 false, !dbg !280 + %6411 = select i1 %6379, i1 %5730, i1 false, !dbg !280 + %6412 = select i1 %6380, i1 %5731, i1 false, !dbg !280 + %6413 = select i1 %6381, i1 %5731, i1 false, !dbg !280 + %6414 = select i1 %6382, i1 %5731, i1 false, !dbg !280 + %6415 = select i1 %6383, i1 %5731, i1 false, !dbg !280 + %6416 = select i1 %6384, i1 %5732, i1 false, !dbg !280 + %6417 = select i1 %6385, i1 %5732, i1 false, !dbg !280 + %6418 = select i1 %6386, i1 %5732, i1 false, !dbg !280 + %6419 = select i1 %6387, i1 %5732, i1 false, !dbg !280 + %6420 = select i1 %6388, i1 %5733, i1 false, !dbg !280 + %6421 = select i1 %6389, i1 %5733, i1 false, !dbg !280 + %6422 = select i1 %6390, i1 %5733, i1 false, !dbg !280 + %6423 = select i1 %6391, i1 %5733, i1 false, !dbg !280 + %6424 = select i1 %6392, i1 %5734, i1 false, !dbg !280 + %6425 = select i1 %6393, i1 %5734, i1 false, !dbg !280 + %6426 = select i1 %6394, i1 %5734, i1 false, !dbg !280 + %6427 = select i1 %6395, i1 %5734, i1 false, !dbg !280 + %6428 = fmul float %6190, 0x3FF7154760000000, !dbg !281 + %6429 = select i1 %6396, float %6428, float 0xFFF0000000000000, !dbg !280 + %6430 = fmul float %6191, 0x3FF7154760000000, !dbg !281 + %6431 = select i1 %6397, float %6430, float 0xFFF0000000000000, !dbg !280 + %6432 = fmul float %6192, 0x3FF7154760000000, !dbg !281 + %6433 = select i1 %6398, float %6432, float 0xFFF0000000000000, !dbg !280 + %6434 = fmul float %6193, 0x3FF7154760000000, !dbg !281 + %6435 = select i1 %6399, float %6434, float 0xFFF0000000000000, !dbg !280 + %6436 = fmul float %6194, 0x3FF7154760000000, !dbg !281 + %6437 = select i1 %6400, float %6436, float 0xFFF0000000000000, !dbg !280 + %6438 = fmul float %6195, 0x3FF7154760000000, !dbg !281 + %6439 = select i1 %6401, float %6438, float 0xFFF0000000000000, !dbg !280 + %6440 = fmul float %6196, 0x3FF7154760000000, !dbg !281 + %6441 = select i1 %6402, float %6440, float 0xFFF0000000000000, !dbg !280 + %6442 = fmul float %6197, 0x3FF7154760000000, !dbg !281 + %6443 = select i1 %6403, float %6442, float 0xFFF0000000000000, !dbg !280 + %6444 = fmul float %6198, 0x3FF7154760000000, !dbg !281 + %6445 = select i1 %6404, float %6444, float 0xFFF0000000000000, !dbg !280 + %6446 = fmul float %6199, 0x3FF7154760000000, !dbg !281 + %6447 = select i1 %6405, float %6446, float 0xFFF0000000000000, !dbg !280 + %6448 = fmul float %6200, 0x3FF7154760000000, !dbg !281 + %6449 = select i1 %6406, float %6448, float 0xFFF0000000000000, !dbg !280 + %6450 = fmul float %6201, 0x3FF7154760000000, !dbg !281 + %6451 = select i1 %6407, float %6450, float 0xFFF0000000000000, !dbg !280 + %6452 = fmul float %6202, 0x3FF7154760000000, !dbg !281 + %6453 = select i1 %6408, float %6452, float 0xFFF0000000000000, !dbg !280 + %6454 = fmul float %6203, 0x3FF7154760000000, !dbg !281 + %6455 = select i1 %6409, float %6454, float 0xFFF0000000000000, !dbg !280 + %6456 = fmul float %6204, 0x3FF7154760000000, !dbg !281 + %6457 = select i1 %6410, float %6456, float 0xFFF0000000000000, !dbg !280 + %6458 = fmul float %6205, 0x3FF7154760000000, !dbg !281 + %6459 = select i1 %6411, float %6458, float 0xFFF0000000000000, !dbg !280 + %6460 = fmul float %6206, 0x3FF7154760000000, !dbg !281 + %6461 = select i1 %6412, float %6460, float 0xFFF0000000000000, !dbg !280 + %6462 = fmul float %6207, 0x3FF7154760000000, !dbg !281 + %6463 = select i1 %6413, float %6462, float 0xFFF0000000000000, !dbg !280 + %6464 = fmul float %6208, 0x3FF7154760000000, !dbg !281 + %6465 = select i1 %6414, float %6464, float 0xFFF0000000000000, !dbg !280 + %6466 = fmul float %6209, 0x3FF7154760000000, !dbg !281 + %6467 = select i1 %6415, float %6466, float 0xFFF0000000000000, !dbg !280 + %6468 = fmul float %6210, 0x3FF7154760000000, !dbg !281 + %6469 = select i1 %6416, float %6468, float 0xFFF0000000000000, !dbg !280 + %6470 = fmul float %6211, 0x3FF7154760000000, !dbg !281 + %6471 = select i1 %6417, float %6470, float 0xFFF0000000000000, !dbg !280 + %6472 = fmul float %6212, 0x3FF7154760000000, !dbg !281 + %6473 = select i1 %6418, float %6472, float 0xFFF0000000000000, !dbg !280 + %6474 = fmul float %6213, 0x3FF7154760000000, !dbg !281 + %6475 = select i1 %6419, float %6474, float 0xFFF0000000000000, !dbg !280 + %6476 = fmul float %6214, 0x3FF7154760000000, !dbg !281 + %6477 = select i1 %6420, float %6476, float 0xFFF0000000000000, !dbg !280 + %6478 = fmul float %6215, 0x3FF7154760000000, !dbg !281 + %6479 = select i1 %6421, float %6478, float 0xFFF0000000000000, !dbg !280 + %6480 = fmul float %6216, 0x3FF7154760000000, !dbg !281 + %6481 = select i1 %6422, float %6480, float 0xFFF0000000000000, !dbg !280 + %6482 = fmul float %6217, 0x3FF7154760000000, !dbg !281 + %6483 = select i1 %6423, float %6482, float 0xFFF0000000000000, !dbg !280 + %6484 = fmul float %6218, 0x3FF7154760000000, !dbg !281 + %6485 = select i1 %6424, float %6484, float 0xFFF0000000000000, !dbg !280 + %6486 = fmul float %6219, 0x3FF7154760000000, !dbg !281 + %6487 = select i1 %6425, float %6486, float 0xFFF0000000000000, !dbg !280 + %6488 = fmul float %6220, 0x3FF7154760000000, !dbg !281 + %6489 = select i1 %6426, float %6488, float 0xFFF0000000000000, !dbg !280 + %6490 = fmul float %6221, 0x3FF7154760000000, !dbg !281 + %6491 = select i1 %6427, float %6490, float 0xFFF0000000000000, !dbg !280 + %6492 = fsub float %6429, %5787, !dbg !282 + %6493 = fsub float %6431, %5788, !dbg !282 + %6494 = fsub float %6433, %5787, !dbg !282 + %6495 = fsub float %6435, %5788, !dbg !282 + %6496 = fsub float %6437, %5789, !dbg !282 + %6497 = fsub float %6439, %5790, !dbg !282 + %6498 = fsub float %6441, %5789, !dbg !282 + %6499 = fsub float %6443, %5790, !dbg !282 + %6500 = fsub float %6445, %5791, !dbg !282 + %6501 = fsub float %6447, %5792, !dbg !282 + %6502 = fsub float %6449, %5791, !dbg !282 + %6503 = fsub float %6451, %5792, !dbg !282 + %6504 = fsub float %6453, %5793, !dbg !282 + %6505 = fsub float %6455, %5794, !dbg !282 + %6506 = fsub float %6457, %5793, !dbg !282 + %6507 = fsub float %6459, %5794, !dbg !282 + %6508 = fsub float %6461, %5795, !dbg !282 + %6509 = fsub float %6463, %5796, !dbg !282 + %6510 = fsub float %6465, %5795, !dbg !282 + %6511 = fsub float %6467, %5796, !dbg !282 + %6512 = fsub float %6469, %5797, !dbg !282 + %6513 = fsub float %6471, %5798, !dbg !282 + %6514 = fsub float %6473, %5797, !dbg !282 + %6515 = fsub float %6475, %5798, !dbg !282 + %6516 = fsub float %6477, %5799, !dbg !282 + %6517 = fsub float %6479, %5800, !dbg !282 + %6518 = fsub float %6481, %5799, !dbg !282 + %6519 = fsub float %6483, %5800, !dbg !282 + %6520 = fsub float %6485, %5801, !dbg !282 + %6521 = fsub float %6487, %5802, !dbg !282 + %6522 = fsub float %6489, %5801, !dbg !282 + %6523 = fsub float %6491, %5802, !dbg !282 + %6524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1285 = icmp eq i32 %6524, 0, !dbg !283 + br i1 %.not.i1285, label %6527, label %6525, !dbg !283 + +6525: ; preds = %.lr.ph1669 + %6526 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6492) #3, !dbg !283 + br label %__nv_exp2f.exit1287, !dbg !283 + +6527: ; preds = %.lr.ph1669 + %6528 = tail call float @llvm.nvvm.ex2.approx.f(float %6492) #3, !dbg !283 + br label %__nv_exp2f.exit1287, !dbg !283 + +__nv_exp2f.exit1287: ; preds = %6525, %6527 + %.0.i1286 = phi float [ %6526, %6525 ], [ %6528, %6527 ], !dbg !283 + %6529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1288 = icmp eq i32 %6529, 0, !dbg !283 + br i1 %.not.i1288, label %6532, label %6530, !dbg !283 + +6530: ; preds = %__nv_exp2f.exit1287 + %6531 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6493) #3, !dbg !283 + br label %__nv_exp2f.exit1290, !dbg !283 + +6532: ; preds = %__nv_exp2f.exit1287 + %6533 = tail call float @llvm.nvvm.ex2.approx.f(float %6493) #3, !dbg !283 + br label %__nv_exp2f.exit1290, !dbg !283 + +__nv_exp2f.exit1290: ; preds = %6530, %6532 + %.0.i1289 = phi float [ %6531, %6530 ], [ %6533, %6532 ], !dbg !283 + %6534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1291 = icmp eq i32 %6534, 0, !dbg !283 + br i1 %.not.i1291, label %6537, label %6535, !dbg !283 + +6535: ; preds = %__nv_exp2f.exit1290 + %6536 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6494) #3, !dbg !283 + br label %__nv_exp2f.exit1293, !dbg !283 + +6537: ; preds = %__nv_exp2f.exit1290 + %6538 = tail call float @llvm.nvvm.ex2.approx.f(float %6494) #3, !dbg !283 + br label %__nv_exp2f.exit1293, !dbg !283 + +__nv_exp2f.exit1293: ; preds = %6535, %6537 + %.0.i1292 = phi float [ %6536, %6535 ], [ %6538, %6537 ], !dbg !283 + %6539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1294 = icmp eq i32 %6539, 0, !dbg !283 + br i1 %.not.i1294, label %6542, label %6540, !dbg !283 + +6540: ; preds = %__nv_exp2f.exit1293 + %6541 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6495) #3, !dbg !283 + br label %__nv_exp2f.exit1296, !dbg !283 + +6542: ; preds = %__nv_exp2f.exit1293 + %6543 = tail call float @llvm.nvvm.ex2.approx.f(float %6495) #3, !dbg !283 + br label %__nv_exp2f.exit1296, !dbg !283 + +__nv_exp2f.exit1296: ; preds = %6540, %6542 + %.0.i1295 = phi float [ %6541, %6540 ], [ %6543, %6542 ], !dbg !283 + %6544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1297 = icmp eq i32 %6544, 0, !dbg !283 + br i1 %.not.i1297, label %6547, label %6545, !dbg !283 + +6545: ; preds = %__nv_exp2f.exit1296 + %6546 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6496) #3, !dbg !283 + br label %__nv_exp2f.exit1299, !dbg !283 + +6547: ; preds = %__nv_exp2f.exit1296 + %6548 = tail call float @llvm.nvvm.ex2.approx.f(float %6496) #3, !dbg !283 + br label %__nv_exp2f.exit1299, !dbg !283 + +__nv_exp2f.exit1299: ; preds = %6545, %6547 + %.0.i1298 = phi float [ %6546, %6545 ], [ %6548, %6547 ], !dbg !283 + %6549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1300 = icmp eq i32 %6549, 0, !dbg !283 + br i1 %.not.i1300, label %6552, label %6550, !dbg !283 + +6550: ; preds = %__nv_exp2f.exit1299 + %6551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6497) #3, !dbg !283 + br label %__nv_exp2f.exit1302, !dbg !283 + +6552: ; preds = %__nv_exp2f.exit1299 + %6553 = tail call float @llvm.nvvm.ex2.approx.f(float %6497) #3, !dbg !283 + br label %__nv_exp2f.exit1302, !dbg !283 + +__nv_exp2f.exit1302: ; preds = %6550, %6552 + %.0.i1301 = phi float [ %6551, %6550 ], [ %6553, %6552 ], !dbg !283 + %6554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1303 = icmp eq i32 %6554, 0, !dbg !283 + br i1 %.not.i1303, label %6557, label %6555, !dbg !283 + +6555: ; preds = %__nv_exp2f.exit1302 + %6556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6498) #3, !dbg !283 + br label %__nv_exp2f.exit1305, !dbg !283 + +6557: ; preds = %__nv_exp2f.exit1302 + %6558 = tail call float @llvm.nvvm.ex2.approx.f(float %6498) #3, !dbg !283 + br label %__nv_exp2f.exit1305, !dbg !283 + +__nv_exp2f.exit1305: ; preds = %6555, %6557 + %.0.i1304 = phi float [ %6556, %6555 ], [ %6558, %6557 ], !dbg !283 + %6559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1306 = icmp eq i32 %6559, 0, !dbg !283 + br i1 %.not.i1306, label %6562, label %6560, !dbg !283 + +6560: ; preds = %__nv_exp2f.exit1305 + %6561 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6499) #3, !dbg !283 + br label %__nv_exp2f.exit1308, !dbg !283 + +6562: ; preds = %__nv_exp2f.exit1305 + %6563 = tail call float @llvm.nvvm.ex2.approx.f(float %6499) #3, !dbg !283 + br label %__nv_exp2f.exit1308, !dbg !283 + +__nv_exp2f.exit1308: ; preds = %6560, %6562 + %.0.i1307 = phi float [ %6561, %6560 ], [ %6563, %6562 ], !dbg !283 + %6564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1309 = icmp eq i32 %6564, 0, !dbg !283 + br i1 %.not.i1309, label %6567, label %6565, !dbg !283 + +6565: ; preds = %__nv_exp2f.exit1308 + %6566 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6500) #3, !dbg !283 + br label %__nv_exp2f.exit1311, !dbg !283 + +6567: ; preds = %__nv_exp2f.exit1308 + %6568 = tail call float @llvm.nvvm.ex2.approx.f(float %6500) #3, !dbg !283 + br label %__nv_exp2f.exit1311, !dbg !283 + +__nv_exp2f.exit1311: ; preds = %6565, %6567 + %.0.i1310 = phi float [ %6566, %6565 ], [ %6568, %6567 ], !dbg !283 + %6569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1312 = icmp eq i32 %6569, 0, !dbg !283 + br i1 %.not.i1312, label %6572, label %6570, !dbg !283 + +6570: ; preds = %__nv_exp2f.exit1311 + %6571 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6501) #3, !dbg !283 + br label %__nv_exp2f.exit1314, !dbg !283 + +6572: ; preds = %__nv_exp2f.exit1311 + %6573 = tail call float @llvm.nvvm.ex2.approx.f(float %6501) #3, !dbg !283 + br label %__nv_exp2f.exit1314, !dbg !283 + +__nv_exp2f.exit1314: ; preds = %6570, %6572 + %.0.i1313 = phi float [ %6571, %6570 ], [ %6573, %6572 ], !dbg !283 + %6574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1315 = icmp eq i32 %6574, 0, !dbg !283 + br i1 %.not.i1315, label %6577, label %6575, !dbg !283 + +6575: ; preds = %__nv_exp2f.exit1314 + %6576 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6502) #3, !dbg !283 + br label %__nv_exp2f.exit1317, !dbg !283 + +6577: ; preds = %__nv_exp2f.exit1314 + %6578 = tail call float @llvm.nvvm.ex2.approx.f(float %6502) #3, !dbg !283 + br label %__nv_exp2f.exit1317, !dbg !283 + +__nv_exp2f.exit1317: ; preds = %6575, %6577 + %.0.i1316 = phi float [ %6576, %6575 ], [ %6578, %6577 ], !dbg !283 + %6579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1318 = icmp eq i32 %6579, 0, !dbg !283 + br i1 %.not.i1318, label %6582, label %6580, !dbg !283 + +6580: ; preds = %__nv_exp2f.exit1317 + %6581 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6503) #3, !dbg !283 + br label %__nv_exp2f.exit1320, !dbg !283 + +6582: ; preds = %__nv_exp2f.exit1317 + %6583 = tail call float @llvm.nvvm.ex2.approx.f(float %6503) #3, !dbg !283 + br label %__nv_exp2f.exit1320, !dbg !283 + +__nv_exp2f.exit1320: ; preds = %6580, %6582 + %.0.i1319 = phi float [ %6581, %6580 ], [ %6583, %6582 ], !dbg !283 + %6584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1321 = icmp eq i32 %6584, 0, !dbg !283 + br i1 %.not.i1321, label %6587, label %6585, !dbg !283 + +6585: ; preds = %__nv_exp2f.exit1320 + %6586 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6504) #3, !dbg !283 + br label %__nv_exp2f.exit1323, !dbg !283 + +6587: ; preds = %__nv_exp2f.exit1320 + %6588 = tail call float @llvm.nvvm.ex2.approx.f(float %6504) #3, !dbg !283 + br label %__nv_exp2f.exit1323, !dbg !283 + +__nv_exp2f.exit1323: ; preds = %6585, %6587 + %.0.i1322 = phi float [ %6586, %6585 ], [ %6588, %6587 ], !dbg !283 + %6589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1324 = icmp eq i32 %6589, 0, !dbg !283 + br i1 %.not.i1324, label %6592, label %6590, !dbg !283 + +6590: ; preds = %__nv_exp2f.exit1323 + %6591 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6505) #3, !dbg !283 + br label %__nv_exp2f.exit1326, !dbg !283 + +6592: ; preds = %__nv_exp2f.exit1323 + %6593 = tail call float @llvm.nvvm.ex2.approx.f(float %6505) #3, !dbg !283 + br label %__nv_exp2f.exit1326, !dbg !283 + +__nv_exp2f.exit1326: ; preds = %6590, %6592 + %.0.i1325 = phi float [ %6591, %6590 ], [ %6593, %6592 ], !dbg !283 + %6594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1327 = icmp eq i32 %6594, 0, !dbg !283 + br i1 %.not.i1327, label %6597, label %6595, !dbg !283 + +6595: ; preds = %__nv_exp2f.exit1326 + %6596 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6506) #3, !dbg !283 + br label %__nv_exp2f.exit1329, !dbg !283 + +6597: ; preds = %__nv_exp2f.exit1326 + %6598 = tail call float @llvm.nvvm.ex2.approx.f(float %6506) #3, !dbg !283 + br label %__nv_exp2f.exit1329, !dbg !283 + +__nv_exp2f.exit1329: ; preds = %6595, %6597 + %.0.i1328 = phi float [ %6596, %6595 ], [ %6598, %6597 ], !dbg !283 + %6599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1330 = icmp eq i32 %6599, 0, !dbg !283 + br i1 %.not.i1330, label %6602, label %6600, !dbg !283 + +6600: ; preds = %__nv_exp2f.exit1329 + %6601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6507) #3, !dbg !283 + br label %__nv_exp2f.exit1332, !dbg !283 + +6602: ; preds = %__nv_exp2f.exit1329 + %6603 = tail call float @llvm.nvvm.ex2.approx.f(float %6507) #3, !dbg !283 + br label %__nv_exp2f.exit1332, !dbg !283 + +__nv_exp2f.exit1332: ; preds = %6600, %6602 + %.0.i1331 = phi float [ %6601, %6600 ], [ %6603, %6602 ], !dbg !283 + %6604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1333 = icmp eq i32 %6604, 0, !dbg !283 + br i1 %.not.i1333, label %6607, label %6605, !dbg !283 + +6605: ; preds = %__nv_exp2f.exit1332 + %6606 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6508) #3, !dbg !283 + br label %__nv_exp2f.exit1335, !dbg !283 + +6607: ; preds = %__nv_exp2f.exit1332 + %6608 = tail call float @llvm.nvvm.ex2.approx.f(float %6508) #3, !dbg !283 + br label %__nv_exp2f.exit1335, !dbg !283 + +__nv_exp2f.exit1335: ; preds = %6605, %6607 + %.0.i1334 = phi float [ %6606, %6605 ], [ %6608, %6607 ], !dbg !283 + %6609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1336 = icmp eq i32 %6609, 0, !dbg !283 + br i1 %.not.i1336, label %6612, label %6610, !dbg !283 + +6610: ; preds = %__nv_exp2f.exit1335 + %6611 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6509) #3, !dbg !283 + br label %__nv_exp2f.exit1338, !dbg !283 + +6612: ; preds = %__nv_exp2f.exit1335 + %6613 = tail call float @llvm.nvvm.ex2.approx.f(float %6509) #3, !dbg !283 + br label %__nv_exp2f.exit1338, !dbg !283 + +__nv_exp2f.exit1338: ; preds = %6610, %6612 + %.0.i1337 = phi float [ %6611, %6610 ], [ %6613, %6612 ], !dbg !283 + %6614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1339 = icmp eq i32 %6614, 0, !dbg !283 + br i1 %.not.i1339, label %6617, label %6615, !dbg !283 + +6615: ; preds = %__nv_exp2f.exit1338 + %6616 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6510) #3, !dbg !283 + br label %__nv_exp2f.exit1341, !dbg !283 + +6617: ; preds = %__nv_exp2f.exit1338 + %6618 = tail call float @llvm.nvvm.ex2.approx.f(float %6510) #3, !dbg !283 + br label %__nv_exp2f.exit1341, !dbg !283 + +__nv_exp2f.exit1341: ; preds = %6615, %6617 + %.0.i1340 = phi float [ %6616, %6615 ], [ %6618, %6617 ], !dbg !283 + %6619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1342 = icmp eq i32 %6619, 0, !dbg !283 + br i1 %.not.i1342, label %6622, label %6620, !dbg !283 + +6620: ; preds = %__nv_exp2f.exit1341 + %6621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6511) #3, !dbg !283 + br label %__nv_exp2f.exit1344, !dbg !283 + +6622: ; preds = %__nv_exp2f.exit1341 + %6623 = tail call float @llvm.nvvm.ex2.approx.f(float %6511) #3, !dbg !283 + br label %__nv_exp2f.exit1344, !dbg !283 + +__nv_exp2f.exit1344: ; preds = %6620, %6622 + %.0.i1343 = phi float [ %6621, %6620 ], [ %6623, %6622 ], !dbg !283 + %6624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1345 = icmp eq i32 %6624, 0, !dbg !283 + br i1 %.not.i1345, label %6627, label %6625, !dbg !283 + +6625: ; preds = %__nv_exp2f.exit1344 + %6626 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6512) #3, !dbg !283 + br label %__nv_exp2f.exit1347, !dbg !283 + +6627: ; preds = %__nv_exp2f.exit1344 + %6628 = tail call float @llvm.nvvm.ex2.approx.f(float %6512) #3, !dbg !283 + br label %__nv_exp2f.exit1347, !dbg !283 + +__nv_exp2f.exit1347: ; preds = %6625, %6627 + %.0.i1346 = phi float [ %6626, %6625 ], [ %6628, %6627 ], !dbg !283 + %6629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1348 = icmp eq i32 %6629, 0, !dbg !283 + br i1 %.not.i1348, label %6632, label %6630, !dbg !283 + +6630: ; preds = %__nv_exp2f.exit1347 + %6631 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6513) #3, !dbg !283 + br label %__nv_exp2f.exit1350, !dbg !283 + +6632: ; preds = %__nv_exp2f.exit1347 + %6633 = tail call float @llvm.nvvm.ex2.approx.f(float %6513) #3, !dbg !283 + br label %__nv_exp2f.exit1350, !dbg !283 + +__nv_exp2f.exit1350: ; preds = %6630, %6632 + %.0.i1349 = phi float [ %6631, %6630 ], [ %6633, %6632 ], !dbg !283 + %6634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1351 = icmp eq i32 %6634, 0, !dbg !283 + br i1 %.not.i1351, label %6637, label %6635, !dbg !283 + +6635: ; preds = %__nv_exp2f.exit1350 + %6636 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6514) #3, !dbg !283 + br label %__nv_exp2f.exit1353, !dbg !283 + +6637: ; preds = %__nv_exp2f.exit1350 + %6638 = tail call float @llvm.nvvm.ex2.approx.f(float %6514) #3, !dbg !283 + br label %__nv_exp2f.exit1353, !dbg !283 + +__nv_exp2f.exit1353: ; preds = %6635, %6637 + %.0.i1352 = phi float [ %6636, %6635 ], [ %6638, %6637 ], !dbg !283 + %6639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1354 = icmp eq i32 %6639, 0, !dbg !283 + br i1 %.not.i1354, label %6642, label %6640, !dbg !283 + +6640: ; preds = %__nv_exp2f.exit1353 + %6641 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6515) #3, !dbg !283 + br label %__nv_exp2f.exit1356, !dbg !283 + +6642: ; preds = %__nv_exp2f.exit1353 + %6643 = tail call float @llvm.nvvm.ex2.approx.f(float %6515) #3, !dbg !283 + br label %__nv_exp2f.exit1356, !dbg !283 + +__nv_exp2f.exit1356: ; preds = %6640, %6642 + %.0.i1355 = phi float [ %6641, %6640 ], [ %6643, %6642 ], !dbg !283 + %6644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1357 = icmp eq i32 %6644, 0, !dbg !283 + br i1 %.not.i1357, label %6647, label %6645, !dbg !283 + +6645: ; preds = %__nv_exp2f.exit1356 + %6646 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6516) #3, !dbg !283 + br label %__nv_exp2f.exit1359, !dbg !283 + +6647: ; preds = %__nv_exp2f.exit1356 + %6648 = tail call float @llvm.nvvm.ex2.approx.f(float %6516) #3, !dbg !283 + br label %__nv_exp2f.exit1359, !dbg !283 + +__nv_exp2f.exit1359: ; preds = %6645, %6647 + %.0.i1358 = phi float [ %6646, %6645 ], [ %6648, %6647 ], !dbg !283 + %6649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1360 = icmp eq i32 %6649, 0, !dbg !283 + br i1 %.not.i1360, label %6652, label %6650, !dbg !283 + +6650: ; preds = %__nv_exp2f.exit1359 + %6651 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6517) #3, !dbg !283 + br label %__nv_exp2f.exit1362, !dbg !283 + +6652: ; preds = %__nv_exp2f.exit1359 + %6653 = tail call float @llvm.nvvm.ex2.approx.f(float %6517) #3, !dbg !283 + br label %__nv_exp2f.exit1362, !dbg !283 + +__nv_exp2f.exit1362: ; preds = %6650, %6652 + %.0.i1361 = phi float [ %6651, %6650 ], [ %6653, %6652 ], !dbg !283 + %6654 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1363 = icmp eq i32 %6654, 0, !dbg !283 + br i1 %.not.i1363, label %6657, label %6655, !dbg !283 + +6655: ; preds = %__nv_exp2f.exit1362 + %6656 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6518) #3, !dbg !283 + br label %__nv_exp2f.exit1365, !dbg !283 + +6657: ; preds = %__nv_exp2f.exit1362 + %6658 = tail call float @llvm.nvvm.ex2.approx.f(float %6518) #3, !dbg !283 + br label %__nv_exp2f.exit1365, !dbg !283 + +__nv_exp2f.exit1365: ; preds = %6655, %6657 + %.0.i1364 = phi float [ %6656, %6655 ], [ %6658, %6657 ], !dbg !283 + %6659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1366 = icmp eq i32 %6659, 0, !dbg !283 + br i1 %.not.i1366, label %6662, label %6660, !dbg !283 + +6660: ; preds = %__nv_exp2f.exit1365 + %6661 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6519) #3, !dbg !283 + br label %__nv_exp2f.exit1368, !dbg !283 + +6662: ; preds = %__nv_exp2f.exit1365 + %6663 = tail call float @llvm.nvvm.ex2.approx.f(float %6519) #3, !dbg !283 + br label %__nv_exp2f.exit1368, !dbg !283 + +__nv_exp2f.exit1368: ; preds = %6660, %6662 + %.0.i1367 = phi float [ %6661, %6660 ], [ %6663, %6662 ], !dbg !283 + %6664 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1369 = icmp eq i32 %6664, 0, !dbg !283 + br i1 %.not.i1369, label %6667, label %6665, !dbg !283 + +6665: ; preds = %__nv_exp2f.exit1368 + %6666 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6520) #3, !dbg !283 + br label %__nv_exp2f.exit1371, !dbg !283 + +6667: ; preds = %__nv_exp2f.exit1368 + %6668 = tail call float @llvm.nvvm.ex2.approx.f(float %6520) #3, !dbg !283 + br label %__nv_exp2f.exit1371, !dbg !283 + +__nv_exp2f.exit1371: ; preds = %6665, %6667 + %.0.i1370 = phi float [ %6666, %6665 ], [ %6668, %6667 ], !dbg !283 + %6669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1372 = icmp eq i32 %6669, 0, !dbg !283 + br i1 %.not.i1372, label %6672, label %6670, !dbg !283 + +6670: ; preds = %__nv_exp2f.exit1371 + %6671 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6521) #3, !dbg !283 + br label %__nv_exp2f.exit1374, !dbg !283 + +6672: ; preds = %__nv_exp2f.exit1371 + %6673 = tail call float @llvm.nvvm.ex2.approx.f(float %6521) #3, !dbg !283 + br label %__nv_exp2f.exit1374, !dbg !283 + +__nv_exp2f.exit1374: ; preds = %6670, %6672 + %.0.i1373 = phi float [ %6671, %6670 ], [ %6673, %6672 ], !dbg !283 + %6674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1375 = icmp eq i32 %6674, 0, !dbg !283 + br i1 %.not.i1375, label %6677, label %6675, !dbg !283 + +6675: ; preds = %__nv_exp2f.exit1374 + %6676 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6522) #3, !dbg !283 + br label %__nv_exp2f.exit1377, !dbg !283 + +6677: ; preds = %__nv_exp2f.exit1374 + %6678 = tail call float @llvm.nvvm.ex2.approx.f(float %6522) #3, !dbg !283 + br label %__nv_exp2f.exit1377, !dbg !283 + +__nv_exp2f.exit1377: ; preds = %6675, %6677 + %.0.i1376 = phi float [ %6676, %6675 ], [ %6678, %6677 ], !dbg !283 + %6679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !283 + %.not.i1378 = icmp eq i32 %6679, 0, !dbg !283 + br i1 %.not.i1378, label %6682, label %6680, !dbg !283 + +6680: ; preds = %__nv_exp2f.exit1377 + %6681 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %6523) #3, !dbg !283 + br label %__nv_exp2f.exit1380, !dbg !283 + +6682: ; preds = %__nv_exp2f.exit1377 + %6683 = tail call float @llvm.nvvm.ex2.approx.f(float %6523) #3, !dbg !283 + br label %__nv_exp2f.exit1380, !dbg !283 + +__nv_exp2f.exit1380: ; preds = %6680, %6682 + %.0.i1379 = phi float [ %6681, %6680 ], [ %6683, %6682 ], !dbg !283 + %6684 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %5735, !dbg !262 + %6685 = insertelement <2 x float> poison, float %.0.i1286, i64 0, !dbg !284 + %6686 = insertelement <2 x float> %6685, float %.0.i1289, i64 1, !dbg !284 + %6687 = fptrunc <2 x float> %6686 to <2 x bfloat>, !dbg !284 + %6688 = insertelement <2 x float> poison, float %.0.i1292, i64 0, !dbg !284 + %6689 = insertelement <2 x float> %6688, float %.0.i1295, i64 1, !dbg !284 + %6690 = fptrunc <2 x float> %6689 to <2 x bfloat>, !dbg !284 + %6691 = insertelement <2 x float> poison, float %.0.i1298, i64 0, !dbg !284 + %6692 = insertelement <2 x float> %6691, float %.0.i1301, i64 1, !dbg !284 + %6693 = fptrunc <2 x float> %6692 to <2 x bfloat>, !dbg !284 + %6694 = insertelement <2 x float> poison, float %.0.i1304, i64 0, !dbg !284 + %6695 = insertelement <2 x float> %6694, float %.0.i1307, i64 1, !dbg !284 + %6696 = fptrunc <2 x float> %6695 to <2 x bfloat>, !dbg !284 + %6697 = insertelement <2 x float> poison, float %.0.i1310, i64 0, !dbg !284 + %6698 = insertelement <2 x float> %6697, float %.0.i1313, i64 1, !dbg !284 + %6699 = fptrunc <2 x float> %6698 to <2 x bfloat>, !dbg !284 + %6700 = insertelement <2 x float> poison, float %.0.i1316, i64 0, !dbg !284 + %6701 = insertelement <2 x float> %6700, float %.0.i1319, i64 1, !dbg !284 + %6702 = fptrunc <2 x float> %6701 to <2 x bfloat>, !dbg !284 + %6703 = insertelement <2 x float> poison, float %.0.i1322, i64 0, !dbg !284 + %6704 = insertelement <2 x float> %6703, float %.0.i1325, i64 1, !dbg !284 + %6705 = fptrunc <2 x float> %6704 to <2 x bfloat>, !dbg !284 + %6706 = insertelement <2 x float> poison, float %.0.i1328, i64 0, !dbg !284 + %6707 = insertelement <2 x float> %6706, float %.0.i1331, i64 1, !dbg !284 + %6708 = fptrunc <2 x float> %6707 to <2 x bfloat>, !dbg !284 + %6709 = insertelement <2 x float> poison, float %.0.i1334, i64 0, !dbg !284 + %6710 = insertelement <2 x float> %6709, float %.0.i1337, i64 1, !dbg !284 + %6711 = fptrunc <2 x float> %6710 to <2 x bfloat>, !dbg !284 + %6712 = insertelement <2 x float> poison, float %.0.i1340, i64 0, !dbg !284 + %6713 = insertelement <2 x float> %6712, float %.0.i1343, i64 1, !dbg !284 + %6714 = fptrunc <2 x float> %6713 to <2 x bfloat>, !dbg !284 + %6715 = insertelement <2 x float> poison, float %.0.i1346, i64 0, !dbg !284 + %6716 = insertelement <2 x float> %6715, float %.0.i1349, i64 1, !dbg !284 + %6717 = fptrunc <2 x float> %6716 to <2 x bfloat>, !dbg !284 + %6718 = insertelement <2 x float> poison, float %.0.i1352, i64 0, !dbg !284 + %6719 = insertelement <2 x float> %6718, float %.0.i1355, i64 1, !dbg !284 + %6720 = fptrunc <2 x float> %6719 to <2 x bfloat>, !dbg !284 + %6721 = insertelement <2 x float> poison, float %.0.i1358, i64 0, !dbg !284 + %6722 = insertelement <2 x float> %6721, float %.0.i1361, i64 1, !dbg !284 + %6723 = fptrunc <2 x float> %6722 to <2 x bfloat>, !dbg !284 + %6724 = insertelement <2 x float> poison, float %.0.i1364, i64 0, !dbg !284 + %6725 = insertelement <2 x float> %6724, float %.0.i1367, i64 1, !dbg !284 + %6726 = fptrunc <2 x float> %6725 to <2 x bfloat>, !dbg !284 + %6727 = insertelement <2 x float> poison, float %.0.i1370, i64 0, !dbg !284 + %6728 = insertelement <2 x float> %6727, float %.0.i1373, i64 1, !dbg !284 + %6729 = fptrunc <2 x float> %6728 to <2 x bfloat>, !dbg !284 + %6730 = insertelement <2 x float> poison, float %.0.i1376, i64 0, !dbg !284 + %6731 = insertelement <2 x float> %6730, float %.0.i1379, i64 1, !dbg !284 + %6732 = fptrunc <2 x float> %6731 to <2 x bfloat>, !dbg !284 + %6733 = bitcast <2 x bfloat> %6687 to i32, !dbg !285 + %6734 = bitcast <2 x bfloat> %6690 to i32, !dbg !285 + %6735 = bitcast <2 x bfloat> %6693 to i32, !dbg !285 + %6736 = bitcast <2 x bfloat> %6696 to i32, !dbg !285 + %6737 = bitcast <2 x bfloat> %6699 to i32, !dbg !285 + %6738 = bitcast <2 x bfloat> %6702 to i32, !dbg !285 + %6739 = bitcast <2 x bfloat> %6705 to i32, !dbg !285 + %6740 = bitcast <2 x bfloat> %6708 to i32, !dbg !285 + %6741 = bitcast <2 x bfloat> %6711 to i32, !dbg !285 + %6742 = bitcast <2 x bfloat> %6714 to i32, !dbg !285 + %6743 = bitcast <2 x bfloat> %6717 to i32, !dbg !285 + %6744 = bitcast <2 x bfloat> %6720 to i32, !dbg !285 + %6745 = bitcast <2 x bfloat> %6723 to i32, !dbg !285 + %6746 = bitcast <2 x bfloat> %6726 to i32, !dbg !285 + %6747 = bitcast <2 x bfloat> %6729 to i32, !dbg !285 + %6748 = bitcast <2 x bfloat> %6732 to i32, !dbg !285 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !285 + %6749 = ptrtoint ptr addrspace(3) %6684 to i32, !dbg !285 + %6750 = lshr exact i32 %6749, 4, !dbg !285 + %6751 = and i32 %6750, 16383, !dbg !285 + %6752 = zext nneg i32 %6751 to i64, !dbg !285 + %6753 = or disjoint i64 %6752, 4611686293338849280, !dbg !285 + %6754 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5588, float %5589, float %5590, float %5591, float %5592, float %5593, float %5594, float %5595, float %5596, float %5597, float %5598, float %5599, float %5600, float %5601, float %5602, float %5603, float %5604, float %5605, float %5606, float %5607, float %5608, float %5609, float %5610, float %5611, float %5612, float %5613, float %5614, float %5615, float %5616, float %5617, float %5618, float %5619, float %5620, float %5621, float %5622, float %5623, float %5624, float %5625, float %5626, float %5627, float %5628, float %5629, float %5630, float %5631, float %5632, float %5633, float %5634, float %5635, float %5636, float %5637, float %5638, float %5639, float %5640, float %5641, float %5642, float %5643, float %5644, float %5645, float %5646, float %5647, float %5648, float %5649, float %5650, float %5651, i32 %6733, i32 %6734, i32 %6735, i32 %6736, i64 %6753, i1 true) #3, !dbg !285 + %6755 = add i32 %6749, 2048, !dbg !285 + %6756 = lshr exact i32 %6755, 4, !dbg !285 + %6757 = and i32 %6756, 16383, !dbg !285 + %6758 = zext nneg i32 %6757 to i64, !dbg !285 + %6759 = or disjoint i64 %6758, 4611686293338849280, !dbg !285 + %6760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 0, !dbg !285 + %6761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 1, !dbg !285 + %6762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 2, !dbg !285 + %6763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 3, !dbg !285 + %6764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 4, !dbg !285 + %6765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 5, !dbg !285 + %6766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 6, !dbg !285 + %6767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 7, !dbg !285 + %6768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 8, !dbg !285 + %6769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 9, !dbg !285 + %6770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 10, !dbg !285 + %6771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 11, !dbg !285 + %6772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 12, !dbg !285 + %6773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 13, !dbg !285 + %6774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 14, !dbg !285 + %6775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 15, !dbg !285 + %6776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 16, !dbg !285 + %6777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 17, !dbg !285 + %6778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 18, !dbg !285 + %6779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 19, !dbg !285 + %6780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 20, !dbg !285 + %6781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 21, !dbg !285 + %6782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 22, !dbg !285 + %6783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 23, !dbg !285 + %6784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 24, !dbg !285 + %6785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 25, !dbg !285 + %6786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 26, !dbg !285 + %6787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 27, !dbg !285 + %6788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 28, !dbg !285 + %6789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 29, !dbg !285 + %6790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 30, !dbg !285 + %6791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 31, !dbg !285 + %6792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 32, !dbg !285 + %6793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 33, !dbg !285 + %6794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 34, !dbg !285 + %6795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 35, !dbg !285 + %6796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 36, !dbg !285 + %6797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 37, !dbg !285 + %6798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 38, !dbg !285 + %6799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 39, !dbg !285 + %6800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 40, !dbg !285 + %6801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 41, !dbg !285 + %6802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 42, !dbg !285 + %6803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 43, !dbg !285 + %6804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 44, !dbg !285 + %6805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 45, !dbg !285 + %6806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 46, !dbg !285 + %6807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 47, !dbg !285 + %6808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 48, !dbg !285 + %6809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 49, !dbg !285 + %6810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 50, !dbg !285 + %6811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 51, !dbg !285 + %6812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 52, !dbg !285 + %6813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 53, !dbg !285 + %6814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 54, !dbg !285 + %6815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 55, !dbg !285 + %6816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 56, !dbg !285 + %6817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 57, !dbg !285 + %6818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 58, !dbg !285 + %6819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 59, !dbg !285 + %6820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 60, !dbg !285 + %6821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 61, !dbg !285 + %6822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 62, !dbg !285 + %6823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6754, 63, !dbg !285 + %6824 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6760, float %6761, float %6762, float %6763, float %6764, float %6765, float %6766, float %6767, float %6768, float %6769, float %6770, float %6771, float %6772, float %6773, float %6774, float %6775, float %6776, float %6777, float %6778, float %6779, float %6780, float %6781, float %6782, float %6783, float %6784, float %6785, float %6786, float %6787, float %6788, float %6789, float %6790, float %6791, float %6792, float %6793, float %6794, float %6795, float %6796, float %6797, float %6798, float %6799, float %6800, float %6801, float %6802, float %6803, float %6804, float %6805, float %6806, float %6807, float %6808, float %6809, float %6810, float %6811, float %6812, float %6813, float %6814, float %6815, float %6816, float %6817, float %6818, float %6819, float %6820, float %6821, float %6822, float %6823, i32 %6737, i32 %6738, i32 %6739, i32 %6740, i64 %6759, i1 true) #3, !dbg !285 + %6825 = add i32 %6749, 4096, !dbg !285 + %6826 = lshr exact i32 %6825, 4, !dbg !285 + %6827 = and i32 %6826, 16383, !dbg !285 + %6828 = zext nneg i32 %6827 to i64, !dbg !285 + %6829 = or disjoint i64 %6828, 4611686293338849280, !dbg !285 + %6830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 0, !dbg !285 + %6831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 1, !dbg !285 + %6832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 2, !dbg !285 + %6833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 3, !dbg !285 + %6834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 4, !dbg !285 + %6835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 5, !dbg !285 + %6836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 6, !dbg !285 + %6837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 7, !dbg !285 + %6838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 8, !dbg !285 + %6839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 9, !dbg !285 + %6840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 10, !dbg !285 + %6841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 11, !dbg !285 + %6842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 12, !dbg !285 + %6843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 13, !dbg !285 + %6844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 14, !dbg !285 + %6845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 15, !dbg !285 + %6846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 16, !dbg !285 + %6847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 17, !dbg !285 + %6848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 18, !dbg !285 + %6849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 19, !dbg !285 + %6850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 20, !dbg !285 + %6851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 21, !dbg !285 + %6852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 22, !dbg !285 + %6853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 23, !dbg !285 + %6854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 24, !dbg !285 + %6855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 25, !dbg !285 + %6856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 26, !dbg !285 + %6857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 27, !dbg !285 + %6858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 28, !dbg !285 + %6859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 29, !dbg !285 + %6860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 30, !dbg !285 + %6861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 31, !dbg !285 + %6862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 32, !dbg !285 + %6863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 33, !dbg !285 + %6864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 34, !dbg !285 + %6865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 35, !dbg !285 + %6866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 36, !dbg !285 + %6867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 37, !dbg !285 + %6868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 38, !dbg !285 + %6869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 39, !dbg !285 + %6870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 40, !dbg !285 + %6871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 41, !dbg !285 + %6872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 42, !dbg !285 + %6873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 43, !dbg !285 + %6874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 44, !dbg !285 + %6875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 45, !dbg !285 + %6876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 46, !dbg !285 + %6877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 47, !dbg !285 + %6878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 48, !dbg !285 + %6879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 49, !dbg !285 + %6880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 50, !dbg !285 + %6881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 51, !dbg !285 + %6882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 52, !dbg !285 + %6883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 53, !dbg !285 + %6884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 54, !dbg !285 + %6885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 55, !dbg !285 + %6886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 56, !dbg !285 + %6887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 57, !dbg !285 + %6888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 58, !dbg !285 + %6889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 59, !dbg !285 + %6890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 60, !dbg !285 + %6891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 61, !dbg !285 + %6892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 62, !dbg !285 + %6893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6824, 63, !dbg !285 + %6894 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6830, float %6831, float %6832, float %6833, float %6834, float %6835, float %6836, float %6837, float %6838, float %6839, float %6840, float %6841, float %6842, float %6843, float %6844, float %6845, float %6846, float %6847, float %6848, float %6849, float %6850, float %6851, float %6852, float %6853, float %6854, float %6855, float %6856, float %6857, float %6858, float %6859, float %6860, float %6861, float %6862, float %6863, float %6864, float %6865, float %6866, float %6867, float %6868, float %6869, float %6870, float %6871, float %6872, float %6873, float %6874, float %6875, float %6876, float %6877, float %6878, float %6879, float %6880, float %6881, float %6882, float %6883, float %6884, float %6885, float %6886, float %6887, float %6888, float %6889, float %6890, float %6891, float %6892, float %6893, i32 %6741, i32 %6742, i32 %6743, i32 %6744, i64 %6829, i1 true) #3, !dbg !285 + %6895 = add i32 %6749, 6144, !dbg !285 + %6896 = lshr exact i32 %6895, 4, !dbg !285 + %6897 = and i32 %6896, 16383, !dbg !285 + %6898 = zext nneg i32 %6897 to i64, !dbg !285 + %6899 = or disjoint i64 %6898, 4611686293338849280, !dbg !285 + %6900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 0, !dbg !285 + %6901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 1, !dbg !285 + %6902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 2, !dbg !285 + %6903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 3, !dbg !285 + %6904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 4, !dbg !285 + %6905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 5, !dbg !285 + %6906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 6, !dbg !285 + %6907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 7, !dbg !285 + %6908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 8, !dbg !285 + %6909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 9, !dbg !285 + %6910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 10, !dbg !285 + %6911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 11, !dbg !285 + %6912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 12, !dbg !285 + %6913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 13, !dbg !285 + %6914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 14, !dbg !285 + %6915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 15, !dbg !285 + %6916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 16, !dbg !285 + %6917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 17, !dbg !285 + %6918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 18, !dbg !285 + %6919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 19, !dbg !285 + %6920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 20, !dbg !285 + %6921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 21, !dbg !285 + %6922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 22, !dbg !285 + %6923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 23, !dbg !285 + %6924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 24, !dbg !285 + %6925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 25, !dbg !285 + %6926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 26, !dbg !285 + %6927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 27, !dbg !285 + %6928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 28, !dbg !285 + %6929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 29, !dbg !285 + %6930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 30, !dbg !285 + %6931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 31, !dbg !285 + %6932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 32, !dbg !285 + %6933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 33, !dbg !285 + %6934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 34, !dbg !285 + %6935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 35, !dbg !285 + %6936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 36, !dbg !285 + %6937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 37, !dbg !285 + %6938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 38, !dbg !285 + %6939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 39, !dbg !285 + %6940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 40, !dbg !285 + %6941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 41, !dbg !285 + %6942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 42, !dbg !285 + %6943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 43, !dbg !285 + %6944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 44, !dbg !285 + %6945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 45, !dbg !285 + %6946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 46, !dbg !285 + %6947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 47, !dbg !285 + %6948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 48, !dbg !285 + %6949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 49, !dbg !285 + %6950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 50, !dbg !285 + %6951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 51, !dbg !285 + %6952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 52, !dbg !285 + %6953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 53, !dbg !285 + %6954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 54, !dbg !285 + %6955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 55, !dbg !285 + %6956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 56, !dbg !285 + %6957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 57, !dbg !285 + %6958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 58, !dbg !285 + %6959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 59, !dbg !285 + %6960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 60, !dbg !285 + %6961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 61, !dbg !285 + %6962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 62, !dbg !285 + %6963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6894, 63, !dbg !285 + %6964 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %6900, float %6901, float %6902, float %6903, float %6904, float %6905, float %6906, float %6907, float %6908, float %6909, float %6910, float %6911, float %6912, float %6913, float %6914, float %6915, float %6916, float %6917, float %6918, float %6919, float %6920, float %6921, float %6922, float %6923, float %6924, float %6925, float %6926, float %6927, float %6928, float %6929, float %6930, float %6931, float %6932, float %6933, float %6934, float %6935, float %6936, float %6937, float %6938, float %6939, float %6940, float %6941, float %6942, float %6943, float %6944, float %6945, float %6946, float %6947, float %6948, float %6949, float %6950, float %6951, float %6952, float %6953, float %6954, float %6955, float %6956, float %6957, float %6958, float %6959, float %6960, float %6961, float %6962, float %6963, i32 %6745, i32 %6746, i32 %6747, i32 %6748, i64 %6899, i1 true) #3, !dbg !285 + %6965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 0, !dbg !285 + %6966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 1, !dbg !285 + %6967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 2, !dbg !285 + %6968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 3, !dbg !285 + %6969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 4, !dbg !285 + %6970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 5, !dbg !285 + %6971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 6, !dbg !285 + %6972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 7, !dbg !285 + %6973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 8, !dbg !285 + %6974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 9, !dbg !285 + %6975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 10, !dbg !285 + %6976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 11, !dbg !285 + %6977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 12, !dbg !285 + %6978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 13, !dbg !285 + %6979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 14, !dbg !285 + %6980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 15, !dbg !285 + %6981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 16, !dbg !285 + %6982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 17, !dbg !285 + %6983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 18, !dbg !285 + %6984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 19, !dbg !285 + %6985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 20, !dbg !285 + %6986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 21, !dbg !285 + %6987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 22, !dbg !285 + %6988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 23, !dbg !285 + %6989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 24, !dbg !285 + %6990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 25, !dbg !285 + %6991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 26, !dbg !285 + %6992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 27, !dbg !285 + %6993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 28, !dbg !285 + %6994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 29, !dbg !285 + %6995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 30, !dbg !285 + %6996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 31, !dbg !285 + %6997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 32, !dbg !285 + %6998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 33, !dbg !285 + %6999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 34, !dbg !285 + %7000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 35, !dbg !285 + %7001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 36, !dbg !285 + %7002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 37, !dbg !285 + %7003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 38, !dbg !285 + %7004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 39, !dbg !285 + %7005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 40, !dbg !285 + %7006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 41, !dbg !285 + %7007 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 42, !dbg !285 + %7008 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 43, !dbg !285 + %7009 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 44, !dbg !285 + %7010 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 45, !dbg !285 + %7011 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 46, !dbg !285 + %7012 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 47, !dbg !285 + %7013 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 48, !dbg !285 + %7014 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 49, !dbg !285 + %7015 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 50, !dbg !285 + %7016 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 51, !dbg !285 + %7017 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 52, !dbg !285 + %7018 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 53, !dbg !285 + %7019 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 54, !dbg !285 + %7020 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 55, !dbg !285 + %7021 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 56, !dbg !285 + %7022 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 57, !dbg !285 + %7023 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 58, !dbg !285 + %7024 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 59, !dbg !285 + %7025 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 60, !dbg !285 + %7026 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 61, !dbg !285 + %7027 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 62, !dbg !285 + %7028 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %6964, 63, !dbg !285 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !285 + %7029 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %5737, !dbg !264 + %7030 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5065, !dbg !264 + %7031 = load float, ptr addrspace(3) %7030, align 8, !dbg !264 + %7032 = getelementptr inbounds nuw i8, ptr addrspace(3) %7030, i32 4, !dbg !264 + %7033 = load float, ptr addrspace(3) %7032, align 4, !dbg !264 + %7034 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5068, !dbg !264 + %7035 = load float, ptr addrspace(3) %7034, align 8, !dbg !264 + %7036 = getelementptr inbounds nuw i8, ptr addrspace(3) %7034, i32 4, !dbg !264 + %7037 = load float, ptr addrspace(3) %7036, align 4, !dbg !264 + %7038 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5071, !dbg !264 + %7039 = load float, ptr addrspace(3) %7038, align 8, !dbg !264 + %7040 = getelementptr inbounds nuw i8, ptr addrspace(3) %7038, i32 4, !dbg !264 + %7041 = load float, ptr addrspace(3) %7040, align 4, !dbg !264 + %7042 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5074, !dbg !264 + %7043 = load float, ptr addrspace(3) %7042, align 8, !dbg !264 + %7044 = getelementptr inbounds nuw i8, ptr addrspace(3) %7042, i32 4, !dbg !264 + %7045 = load float, ptr addrspace(3) %7044, align 4, !dbg !264 + %7046 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5077, !dbg !264 + %7047 = load float, ptr addrspace(3) %7046, align 8, !dbg !264 + %7048 = getelementptr inbounds nuw i8, ptr addrspace(3) %7046, i32 4, !dbg !264 + %7049 = load float, ptr addrspace(3) %7048, align 4, !dbg !264 + %7050 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5080, !dbg !264 + %7051 = load float, ptr addrspace(3) %7050, align 8, !dbg !264 + %7052 = getelementptr inbounds nuw i8, ptr addrspace(3) %7050, i32 4, !dbg !264 + %7053 = load float, ptr addrspace(3) %7052, align 4, !dbg !264 + %7054 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5083, !dbg !264 + %7055 = load float, ptr addrspace(3) %7054, align 8, !dbg !264 + %7056 = getelementptr inbounds nuw i8, ptr addrspace(3) %7054, i32 4, !dbg !264 + %7057 = load float, ptr addrspace(3) %7056, align 4, !dbg !264 + %7058 = getelementptr inbounds nuw i8, ptr addrspace(3) %7029, i32 %5086, !dbg !264 + %7059 = load float, ptr addrspace(3) %7058, align 8, !dbg !264 + %7060 = getelementptr inbounds nuw i8, ptr addrspace(3) %7058, i32 4, !dbg !264 + %7061 = load float, ptr addrspace(3) %7060, align 4, !dbg !264 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !286 + %7062 = add i32 %5805, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7063 = lshr exact i32 %7062, 4, !dbg !286 + %7064 = and i32 %7063, 16383, !dbg !286 + %7065 = zext nneg i32 %7064 to i64, !dbg !286 + %7066 = or disjoint i64 %7065, 4611686293372403712, !dbg !286 + %7067 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %7066, i64 %6753) #3, !dbg !286 + %7068 = add i32 %5817, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7069 = lshr exact i32 %7068, 4, !dbg !286 + %7070 = and i32 %7069, 16383, !dbg !286 + %7071 = zext nneg i32 %7070 to i64, !dbg !286 + %7072 = or disjoint i64 %7071, 4611686293372403712, !dbg !286 + %7073 = add i32 %6749, 32, !dbg !286 + %7074 = lshr exact i32 %7073, 4, !dbg !286 + %7075 = and i32 %7074, 16383, !dbg !286 + %7076 = zext nneg i32 %7075 to i64, !dbg !286 + %7077 = or disjoint i64 %7076, 4611686293338849280, !dbg !286 + %7078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 0, !dbg !286 + %7079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 1, !dbg !286 + %7080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 2, !dbg !286 + %7081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 3, !dbg !286 + %7082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 4, !dbg !286 + %7083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 5, !dbg !286 + %7084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 6, !dbg !286 + %7085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 7, !dbg !286 + %7086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 8, !dbg !286 + %7087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 9, !dbg !286 + %7088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 10, !dbg !286 + %7089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 11, !dbg !286 + %7090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 12, !dbg !286 + %7091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 13, !dbg !286 + %7092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 14, !dbg !286 + %7093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 15, !dbg !286 + %7094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 16, !dbg !286 + %7095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 17, !dbg !286 + %7096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 18, !dbg !286 + %7097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 19, !dbg !286 + %7098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 20, !dbg !286 + %7099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 21, !dbg !286 + %7100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 22, !dbg !286 + %7101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 23, !dbg !286 + %7102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 24, !dbg !286 + %7103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 25, !dbg !286 + %7104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 26, !dbg !286 + %7105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 27, !dbg !286 + %7106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 28, !dbg !286 + %7107 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 29, !dbg !286 + %7108 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 30, !dbg !286 + %7109 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7067, 31, !dbg !286 + %7110 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7078, float %7079, float %7080, float %7081, float %7082, float %7083, float %7084, float %7085, float %7086, float %7087, float %7088, float %7089, float %7090, float %7091, float %7092, float %7093, float %7094, float %7095, float %7096, float %7097, float %7098, float %7099, float %7100, float %7101, float %7102, float %7103, float %7104, float %7105, float %7106, float %7107, float %7108, float %7109, i64 %7072, i64 %7077, i1 true) #3, !dbg !286 + %7111 = add i32 %5861, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7112 = lshr exact i32 %7111, 4, !dbg !286 + %7113 = and i32 %7112, 16383, !dbg !286 + %7114 = zext nneg i32 %7113 to i64, !dbg !286 + %7115 = or disjoint i64 %7114, 4611686293372403712, !dbg !286 + %7116 = add i32 %6749, 64, !dbg !286 + %7117 = lshr exact i32 %7116, 4, !dbg !286 + %7118 = and i32 %7117, 16383, !dbg !286 + %7119 = zext nneg i32 %7118 to i64, !dbg !286 + %7120 = or disjoint i64 %7119, 4611686293338849280, !dbg !286 + %7121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 0, !dbg !286 + %7122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 1, !dbg !286 + %7123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 2, !dbg !286 + %7124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 3, !dbg !286 + %7125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 4, !dbg !286 + %7126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 5, !dbg !286 + %7127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 6, !dbg !286 + %7128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 7, !dbg !286 + %7129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 8, !dbg !286 + %7130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 9, !dbg !286 + %7131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 10, !dbg !286 + %7132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 11, !dbg !286 + %7133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 12, !dbg !286 + %7134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 13, !dbg !286 + %7135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 14, !dbg !286 + %7136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 15, !dbg !286 + %7137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 16, !dbg !286 + %7138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 17, !dbg !286 + %7139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 18, !dbg !286 + %7140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 19, !dbg !286 + %7141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 20, !dbg !286 + %7142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 21, !dbg !286 + %7143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 22, !dbg !286 + %7144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 23, !dbg !286 + %7145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 24, !dbg !286 + %7146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 25, !dbg !286 + %7147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 26, !dbg !286 + %7148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 27, !dbg !286 + %7149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 28, !dbg !286 + %7150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 29, !dbg !286 + %7151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 30, !dbg !286 + %7152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7110, 31, !dbg !286 + %7153 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7121, float %7122, float %7123, float %7124, float %7125, float %7126, float %7127, float %7128, float %7129, float %7130, float %7131, float %7132, float %7133, float %7134, float %7135, float %7136, float %7137, float %7138, float %7139, float %7140, float %7141, float %7142, float %7143, float %7144, float %7145, float %7146, float %7147, float %7148, float %7149, float %7150, float %7151, float %7152, i64 %7115, i64 %7120, i1 true) #3, !dbg !286 + %7154 = add i32 %5905, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7155 = lshr exact i32 %7154, 4, !dbg !286 + %7156 = and i32 %7155, 16383, !dbg !286 + %7157 = zext nneg i32 %7156 to i64, !dbg !286 + %7158 = or disjoint i64 %7157, 4611686293372403712, !dbg !286 + %7159 = add i32 %6749, 96, !dbg !286 + %7160 = lshr exact i32 %7159, 4, !dbg !286 + %7161 = and i32 %7160, 16383, !dbg !286 + %7162 = zext nneg i32 %7161 to i64, !dbg !286 + %7163 = or disjoint i64 %7162, 4611686293338849280, !dbg !286 + %7164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 0, !dbg !286 + %7165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 1, !dbg !286 + %7166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 2, !dbg !286 + %7167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 3, !dbg !286 + %7168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 4, !dbg !286 + %7169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 5, !dbg !286 + %7170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 6, !dbg !286 + %7171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 7, !dbg !286 + %7172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 8, !dbg !286 + %7173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 9, !dbg !286 + %7174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 10, !dbg !286 + %7175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 11, !dbg !286 + %7176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 12, !dbg !286 + %7177 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 13, !dbg !286 + %7178 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 14, !dbg !286 + %7179 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 15, !dbg !286 + %7180 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 16, !dbg !286 + %7181 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 17, !dbg !286 + %7182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 18, !dbg !286 + %7183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 19, !dbg !286 + %7184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 20, !dbg !286 + %7185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 21, !dbg !286 + %7186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 22, !dbg !286 + %7187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 23, !dbg !286 + %7188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 24, !dbg !286 + %7189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 25, !dbg !286 + %7190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 26, !dbg !286 + %7191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 27, !dbg !286 + %7192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 28, !dbg !286 + %7193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 29, !dbg !286 + %7194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 30, !dbg !286 + %7195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7153, 31, !dbg !286 + %7196 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7164, float %7165, float %7166, float %7167, float %7168, float %7169, float %7170, float %7171, float %7172, float %7173, float %7174, float %7175, float %7176, float %7177, float %7178, float %7179, float %7180, float %7181, float %7182, float %7183, float %7184, float %7185, float %7186, float %7187, float %7188, float %7189, float %7190, float %7191, float %7192, float %7193, float %7194, float %7195, i64 %7158, i64 %7163, i1 true) #3, !dbg !286 + %7197 = add i32 %5949, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7198 = lshr exact i32 %7197, 4, !dbg !286 + %7199 = and i32 %7198, 16383, !dbg !286 + %7200 = zext nneg i32 %7199 to i64, !dbg !286 + %7201 = or disjoint i64 %7200, 4611686293372403712, !dbg !286 + %7202 = add i32 %6749, 8192, !dbg !286 + %7203 = lshr exact i32 %7202, 4, !dbg !286 + %7204 = and i32 %7203, 16383, !dbg !286 + %7205 = zext nneg i32 %7204 to i64, !dbg !286 + %7206 = or disjoint i64 %7205, 4611686293338849280, !dbg !286 + %7207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 0, !dbg !286 + %7208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 1, !dbg !286 + %7209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 2, !dbg !286 + %7210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 3, !dbg !286 + %7211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 4, !dbg !286 + %7212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 5, !dbg !286 + %7213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 6, !dbg !286 + %7214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 7, !dbg !286 + %7215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 8, !dbg !286 + %7216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 9, !dbg !286 + %7217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 10, !dbg !286 + %7218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 11, !dbg !286 + %7219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 12, !dbg !286 + %7220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 13, !dbg !286 + %7221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 14, !dbg !286 + %7222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 15, !dbg !286 + %7223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 16, !dbg !286 + %7224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 17, !dbg !286 + %7225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 18, !dbg !286 + %7226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 19, !dbg !286 + %7227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 20, !dbg !286 + %7228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 21, !dbg !286 + %7229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 22, !dbg !286 + %7230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 23, !dbg !286 + %7231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 24, !dbg !286 + %7232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 25, !dbg !286 + %7233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 26, !dbg !286 + %7234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 27, !dbg !286 + %7235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 28, !dbg !286 + %7236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 29, !dbg !286 + %7237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 30, !dbg !286 + %7238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7196, 31, !dbg !286 + %7239 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7207, float %7208, float %7209, float %7210, float %7211, float %7212, float %7213, float %7214, float %7215, float %7216, float %7217, float %7218, float %7219, float %7220, float %7221, float %7222, float %7223, float %7224, float %7225, float %7226, float %7227, float %7228, float %7229, float %7230, float %7231, float %7232, float %7233, float %7234, float %7235, float %7236, float %7237, float %7238, i64 %7201, i64 %7206, i1 true) #3, !dbg !286 + %7240 = add i32 %5993, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7241 = lshr exact i32 %7240, 4, !dbg !286 + %7242 = and i32 %7241, 16383, !dbg !286 + %7243 = zext nneg i32 %7242 to i64, !dbg !286 + %7244 = or disjoint i64 %7243, 4611686293372403712, !dbg !286 + %7245 = add i32 %6749, 8224, !dbg !286 + %7246 = lshr exact i32 %7245, 4, !dbg !286 + %7247 = and i32 %7246, 16383, !dbg !286 + %7248 = zext nneg i32 %7247 to i64, !dbg !286 + %7249 = or disjoint i64 %7248, 4611686293338849280, !dbg !286 + %7250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 0, !dbg !286 + %7251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 1, !dbg !286 + %7252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 2, !dbg !286 + %7253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 3, !dbg !286 + %7254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 4, !dbg !286 + %7255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 5, !dbg !286 + %7256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 6, !dbg !286 + %7257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 7, !dbg !286 + %7258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 8, !dbg !286 + %7259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 9, !dbg !286 + %7260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 10, !dbg !286 + %7261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 11, !dbg !286 + %7262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 12, !dbg !286 + %7263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 13, !dbg !286 + %7264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 14, !dbg !286 + %7265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 15, !dbg !286 + %7266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 16, !dbg !286 + %7267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 17, !dbg !286 + %7268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 18, !dbg !286 + %7269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 19, !dbg !286 + %7270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 20, !dbg !286 + %7271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 21, !dbg !286 + %7272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 22, !dbg !286 + %7273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 23, !dbg !286 + %7274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 24, !dbg !286 + %7275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 25, !dbg !286 + %7276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 26, !dbg !286 + %7277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 27, !dbg !286 + %7278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 28, !dbg !286 + %7279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 29, !dbg !286 + %7280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 30, !dbg !286 + %7281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7239, 31, !dbg !286 + %7282 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7250, float %7251, float %7252, float %7253, float %7254, float %7255, float %7256, float %7257, float %7258, float %7259, float %7260, float %7261, float %7262, float %7263, float %7264, float %7265, float %7266, float %7267, float %7268, float %7269, float %7270, float %7271, float %7272, float %7273, float %7274, float %7275, float %7276, float %7277, float %7278, float %7279, float %7280, float %7281, i64 %7244, i64 %7249, i1 true) #3, !dbg !286 + %7283 = add i32 %6037, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7284 = lshr exact i32 %7283, 4, !dbg !286 + %7285 = and i32 %7284, 16383, !dbg !286 + %7286 = zext nneg i32 %7285 to i64, !dbg !286 + %7287 = or disjoint i64 %7286, 4611686293372403712, !dbg !286 + %7288 = add i32 %6749, 8256, !dbg !286 + %7289 = lshr exact i32 %7288, 4, !dbg !286 + %7290 = and i32 %7289, 16383, !dbg !286 + %7291 = zext nneg i32 %7290 to i64, !dbg !286 + %7292 = or disjoint i64 %7291, 4611686293338849280, !dbg !286 + %7293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 0, !dbg !286 + %7294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 1, !dbg !286 + %7295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 2, !dbg !286 + %7296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 3, !dbg !286 + %7297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 4, !dbg !286 + %7298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 5, !dbg !286 + %7299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 6, !dbg !286 + %7300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 7, !dbg !286 + %7301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 8, !dbg !286 + %7302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 9, !dbg !286 + %7303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 10, !dbg !286 + %7304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 11, !dbg !286 + %7305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 12, !dbg !286 + %7306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 13, !dbg !286 + %7307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 14, !dbg !286 + %7308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 15, !dbg !286 + %7309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 16, !dbg !286 + %7310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 17, !dbg !286 + %7311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 18, !dbg !286 + %7312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 19, !dbg !286 + %7313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 20, !dbg !286 + %7314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 21, !dbg !286 + %7315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 22, !dbg !286 + %7316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 23, !dbg !286 + %7317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 24, !dbg !286 + %7318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 25, !dbg !286 + %7319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 26, !dbg !286 + %7320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 27, !dbg !286 + %7321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 28, !dbg !286 + %7322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 29, !dbg !286 + %7323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 30, !dbg !286 + %7324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7282, 31, !dbg !286 + %7325 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7293, float %7294, float %7295, float %7296, float %7297, float %7298, float %7299, float %7300, float %7301, float %7302, float %7303, float %7304, float %7305, float %7306, float %7307, float %7308, float %7309, float %7310, float %7311, float %7312, float %7313, float %7314, float %7315, float %7316, float %7317, float %7318, float %7319, float %7320, float %7321, float %7322, float %7323, float %7324, i64 %7287, i64 %7292, i1 true) #3, !dbg !286 + %7326 = add i32 %6081, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !286 + %7327 = lshr exact i32 %7326, 4, !dbg !286 + %7328 = and i32 %7327, 16383, !dbg !286 + %7329 = zext nneg i32 %7328 to i64, !dbg !286 + %7330 = or disjoint i64 %7329, 4611686293372403712, !dbg !286 + %7331 = add i32 %6749, 8288, !dbg !286 + %7332 = lshr exact i32 %7331, 4, !dbg !286 + %7333 = and i32 %7332, 16383, !dbg !286 + %7334 = zext nneg i32 %7333 to i64, !dbg !286 + %7335 = or disjoint i64 %7334, 4611686293338849280, !dbg !286 + %7336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 0, !dbg !286 + %7337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 1, !dbg !286 + %7338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 2, !dbg !286 + %7339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 3, !dbg !286 + %7340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 4, !dbg !286 + %7341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 5, !dbg !286 + %7342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 6, !dbg !286 + %7343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 7, !dbg !286 + %7344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 8, !dbg !286 + %7345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 9, !dbg !286 + %7346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 10, !dbg !286 + %7347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 11, !dbg !286 + %7348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 12, !dbg !286 + %7349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 13, !dbg !286 + %7350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 14, !dbg !286 + %7351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 15, !dbg !286 + %7352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 16, !dbg !286 + %7353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 17, !dbg !286 + %7354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 18, !dbg !286 + %7355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 19, !dbg !286 + %7356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 20, !dbg !286 + %7357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 21, !dbg !286 + %7358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 22, !dbg !286 + %7359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 23, !dbg !286 + %7360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 24, !dbg !286 + %7361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 25, !dbg !286 + %7362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 26, !dbg !286 + %7363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 27, !dbg !286 + %7364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 28, !dbg !286 + %7365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 29, !dbg !286 + %7366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 30, !dbg !286 + %7367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7325, 31, !dbg !286 + %7368 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %7336, float %7337, float %7338, float %7339, float %7340, float %7341, float %7342, float %7343, float %7344, float %7345, float %7346, float %7347, float %7348, float %7349, float %7350, float %7351, float %7352, float %7353, float %7354, float %7355, float %7356, float %7357, float %7358, float %7359, float %7360, float %7361, float %7362, float %7363, float %7364, float %7365, float %7366, float %7367, i64 %7330, i64 %7335, i1 true) #3, !dbg !286 + %7369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 0, !dbg !286 + %7370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 1, !dbg !286 + %7371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 2, !dbg !286 + %7372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 3, !dbg !286 + %7373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 4, !dbg !286 + %7374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 5, !dbg !286 + %7375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 6, !dbg !286 + %7376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 7, !dbg !286 + %7377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 8, !dbg !286 + %7378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 9, !dbg !286 + %7379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 10, !dbg !286 + %7380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 11, !dbg !286 + %7381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 12, !dbg !286 + %7382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 13, !dbg !286 + %7383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 14, !dbg !286 + %7384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 15, !dbg !286 + %7385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 16, !dbg !286 + %7386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 17, !dbg !286 + %7387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 18, !dbg !286 + %7388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 19, !dbg !286 + %7389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 20, !dbg !286 + %7390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 21, !dbg !286 + %7391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 22, !dbg !286 + %7392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 23, !dbg !286 + %7393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 24, !dbg !286 + %7394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 25, !dbg !286 + %7395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 26, !dbg !286 + %7396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 27, !dbg !286 + %7397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 28, !dbg !286 + %7398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 29, !dbg !286 + %7399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 30, !dbg !286 + %7400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7368, 31, !dbg !286 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !286 + %7401 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %7369, float %7370, float %7371, float %7372, float %7373, float %7374, float %7375, float %7376, float %7377, float %7378, float %7379, float %7380, float %7381, float %7382, float %7383, float %7384, float %7385, float %7386, float %7387, float %7388, float %7389, float %7390, float %7391, float %7392, float %7393, float %7394, float %7395, float %7396, float %7397, float %7398, float %7399, float %7400, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 0, i32 0, ptr addrspace(3) %6684, i32 0, i32 0) #3, !dbg !286 + %7402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 0, !dbg !286 + %7403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 1, !dbg !286 + %7404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 2, !dbg !286 + %7405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 3, !dbg !286 + %7406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 4, !dbg !286 + %7407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 5, !dbg !286 + %7408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 6, !dbg !286 + %7409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 7, !dbg !286 + %7410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 8, !dbg !286 + %7411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 9, !dbg !286 + %7412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 10, !dbg !286 + %7413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 11, !dbg !286 + %7414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 12, !dbg !286 + %7415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 13, !dbg !286 + %7416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 14, !dbg !286 + %7417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 15, !dbg !286 + %7418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 16, !dbg !286 + %7419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 17, !dbg !286 + %7420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 18, !dbg !286 + %7421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 19, !dbg !286 + %7422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 20, !dbg !286 + %7423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 21, !dbg !286 + %7424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 22, !dbg !286 + %7425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 23, !dbg !286 + %7426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 24, !dbg !286 + %7427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 25, !dbg !286 + %7428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 26, !dbg !286 + %7429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 27, !dbg !286 + %7430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 28, !dbg !286 + %7431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 29, !dbg !286 + %7432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 30, !dbg !286 + %7433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %7401, 31, !dbg !286 + %7434 = fsub float %7402, %7031, !dbg !287 + %7435 = fsub float %7403, %7033, !dbg !287 + %7436 = fsub float %7404, %7031, !dbg !287 + %7437 = fsub float %7405, %7033, !dbg !287 + %7438 = fsub float %7406, %7035, !dbg !287 + %7439 = fsub float %7407, %7037, !dbg !287 + %7440 = fsub float %7408, %7035, !dbg !287 + %7441 = fsub float %7409, %7037, !dbg !287 + %7442 = fsub float %7410, %7039, !dbg !287 + %7443 = fsub float %7411, %7041, !dbg !287 + %7444 = fsub float %7412, %7039, !dbg !287 + %7445 = fsub float %7413, %7041, !dbg !287 + %7446 = fsub float %7414, %7043, !dbg !287 + %7447 = fsub float %7415, %7045, !dbg !287 + %7448 = fsub float %7416, %7043, !dbg !287 + %7449 = fsub float %7417, %7045, !dbg !287 + %7450 = fsub float %7418, %7047, !dbg !287 + %7451 = fsub float %7419, %7049, !dbg !287 + %7452 = fsub float %7420, %7047, !dbg !287 + %7453 = fsub float %7421, %7049, !dbg !287 + %7454 = fsub float %7422, %7051, !dbg !287 + %7455 = fsub float %7423, %7053, !dbg !287 + %7456 = fsub float %7424, %7051, !dbg !287 + %7457 = fsub float %7425, %7053, !dbg !287 + %7458 = fsub float %7426, %7055, !dbg !287 + %7459 = fsub float %7427, %7057, !dbg !287 + %7460 = fsub float %7428, %7055, !dbg !287 + %7461 = fsub float %7429, %7057, !dbg !287 + %7462 = fsub float %7430, %7059, !dbg !287 + %7463 = fsub float %7431, %7061, !dbg !287 + %7464 = fsub float %7432, %7059, !dbg !287 + %7465 = fsub float %7433, %7061, !dbg !287 + %7466 = fmul float %.0.i1286, %7434, !dbg !288 + %7467 = fmul float %.0.i1289, %7435, !dbg !288 + %7468 = fmul float %.0.i1292, %7436, !dbg !288 + %7469 = fmul float %.0.i1295, %7437, !dbg !288 + %7470 = fmul float %.0.i1298, %7438, !dbg !288 + %7471 = fmul float %.0.i1301, %7439, !dbg !288 + %7472 = fmul float %.0.i1304, %7440, !dbg !288 + %7473 = fmul float %.0.i1307, %7441, !dbg !288 + %7474 = fmul float %.0.i1310, %7442, !dbg !288 + %7475 = fmul float %.0.i1313, %7443, !dbg !288 + %7476 = fmul float %.0.i1316, %7444, !dbg !288 + %7477 = fmul float %.0.i1319, %7445, !dbg !288 + %7478 = fmul float %.0.i1322, %7446, !dbg !288 + %7479 = fmul float %.0.i1325, %7447, !dbg !288 + %7480 = fmul float %.0.i1328, %7448, !dbg !288 + %7481 = fmul float %.0.i1331, %7449, !dbg !288 + %7482 = fmul float %.0.i1334, %7450, !dbg !288 + %7483 = fmul float %.0.i1337, %7451, !dbg !288 + %7484 = fmul float %.0.i1340, %7452, !dbg !288 + %7485 = fmul float %.0.i1343, %7453, !dbg !288 + %7486 = fmul float %.0.i1346, %7454, !dbg !288 + %7487 = fmul float %.0.i1349, %7455, !dbg !288 + %7488 = fmul float %.0.i1352, %7456, !dbg !288 + %7489 = fmul float %.0.i1355, %7457, !dbg !288 + %7490 = fmul float %.0.i1358, %7458, !dbg !288 + %7491 = fmul float %.0.i1361, %7459, !dbg !288 + %7492 = fmul float %.0.i1364, %7460, !dbg !288 + %7493 = fmul float %.0.i1367, %7461, !dbg !288 + %7494 = fmul float %.0.i1370, %7462, !dbg !288 + %7495 = fmul float %.0.i1373, %7463, !dbg !288 + %7496 = fmul float %.0.i1376, %7464, !dbg !288 + %7497 = fmul float %.0.i1379, %7465, !dbg !288 + %7498 = fptrunc float %7466 to bfloat, !dbg !289 + %7499 = select i1 %6396, bfloat %7498, bfloat 0xR0000, !dbg !290 + %7500 = fptrunc float %7467 to bfloat, !dbg !289 + %7501 = select i1 %6397, bfloat %7500, bfloat 0xR0000, !dbg !290 + %7502 = fptrunc float %7468 to bfloat, !dbg !289 + %7503 = select i1 %6398, bfloat %7502, bfloat 0xR0000, !dbg !290 + %7504 = fptrunc float %7469 to bfloat, !dbg !289 + %7505 = select i1 %6399, bfloat %7504, bfloat 0xR0000, !dbg !290 + %7506 = fptrunc float %7470 to bfloat, !dbg !289 + %7507 = select i1 %6400, bfloat %7506, bfloat 0xR0000, !dbg !290 + %7508 = fptrunc float %7471 to bfloat, !dbg !289 + %7509 = select i1 %6401, bfloat %7508, bfloat 0xR0000, !dbg !290 + %7510 = fptrunc float %7472 to bfloat, !dbg !289 + %7511 = select i1 %6402, bfloat %7510, bfloat 0xR0000, !dbg !290 + %7512 = fptrunc float %7473 to bfloat, !dbg !289 + %7513 = select i1 %6403, bfloat %7512, bfloat 0xR0000, !dbg !290 + %7514 = fptrunc float %7474 to bfloat, !dbg !289 + %7515 = select i1 %6404, bfloat %7514, bfloat 0xR0000, !dbg !290 + %7516 = fptrunc float %7475 to bfloat, !dbg !289 + %7517 = select i1 %6405, bfloat %7516, bfloat 0xR0000, !dbg !290 + %7518 = fptrunc float %7476 to bfloat, !dbg !289 + %7519 = select i1 %6406, bfloat %7518, bfloat 0xR0000, !dbg !290 + %7520 = fptrunc float %7477 to bfloat, !dbg !289 + %7521 = select i1 %6407, bfloat %7520, bfloat 0xR0000, !dbg !290 + %7522 = fptrunc float %7478 to bfloat, !dbg !289 + %7523 = select i1 %6408, bfloat %7522, bfloat 0xR0000, !dbg !290 + %7524 = fptrunc float %7479 to bfloat, !dbg !289 + %7525 = select i1 %6409, bfloat %7524, bfloat 0xR0000, !dbg !290 + %7526 = fptrunc float %7480 to bfloat, !dbg !289 + %7527 = select i1 %6410, bfloat %7526, bfloat 0xR0000, !dbg !290 + %7528 = fptrunc float %7481 to bfloat, !dbg !289 + %7529 = select i1 %6411, bfloat %7528, bfloat 0xR0000, !dbg !290 + %7530 = fptrunc float %7482 to bfloat, !dbg !289 + %7531 = select i1 %6412, bfloat %7530, bfloat 0xR0000, !dbg !290 + %7532 = fptrunc float %7483 to bfloat, !dbg !289 + %7533 = select i1 %6413, bfloat %7532, bfloat 0xR0000, !dbg !290 + %7534 = fptrunc float %7484 to bfloat, !dbg !289 + %7535 = select i1 %6414, bfloat %7534, bfloat 0xR0000, !dbg !290 + %7536 = fptrunc float %7485 to bfloat, !dbg !289 + %7537 = select i1 %6415, bfloat %7536, bfloat 0xR0000, !dbg !290 + %7538 = fptrunc float %7486 to bfloat, !dbg !289 + %7539 = select i1 %6416, bfloat %7538, bfloat 0xR0000, !dbg !290 + %7540 = fptrunc float %7487 to bfloat, !dbg !289 + %7541 = select i1 %6417, bfloat %7540, bfloat 0xR0000, !dbg !290 + %7542 = fptrunc float %7488 to bfloat, !dbg !289 + %7543 = select i1 %6418, bfloat %7542, bfloat 0xR0000, !dbg !290 + %7544 = fptrunc float %7489 to bfloat, !dbg !289 + %7545 = select i1 %6419, bfloat %7544, bfloat 0xR0000, !dbg !290 + %7546 = fptrunc float %7490 to bfloat, !dbg !289 + %7547 = select i1 %6420, bfloat %7546, bfloat 0xR0000, !dbg !290 + %7548 = fptrunc float %7491 to bfloat, !dbg !289 + %7549 = select i1 %6421, bfloat %7548, bfloat 0xR0000, !dbg !290 + %7550 = fptrunc float %7492 to bfloat, !dbg !289 + %7551 = select i1 %6422, bfloat %7550, bfloat 0xR0000, !dbg !290 + %7552 = fptrunc float %7493 to bfloat, !dbg !289 + %7553 = select i1 %6423, bfloat %7552, bfloat 0xR0000, !dbg !290 + %7554 = fptrunc float %7494 to bfloat, !dbg !289 + %7555 = select i1 %6424, bfloat %7554, bfloat 0xR0000, !dbg !290 + %7556 = fptrunc float %7495 to bfloat, !dbg !289 + %7557 = select i1 %6425, bfloat %7556, bfloat 0xR0000, !dbg !290 + %7558 = fptrunc float %7496 to bfloat, !dbg !289 + %7559 = select i1 %6426, bfloat %7558, bfloat 0xR0000, !dbg !290 + %7560 = fptrunc float %7497 to bfloat, !dbg !289 + %7561 = select i1 %6427, bfloat %7560, bfloat 0xR0000, !dbg !290 + %7562 = insertelement <2 x bfloat> poison, bfloat %7499, i64 0, !dbg !291 + %7563 = insertelement <2 x bfloat> %7562, bfloat %7501, i64 1, !dbg !291 + %7564 = bitcast <2 x bfloat> %7563 to i32, !dbg !291 + %7565 = insertelement <2 x bfloat> poison, bfloat %7503, i64 0, !dbg !291 + %7566 = insertelement <2 x bfloat> %7565, bfloat %7505, i64 1, !dbg !291 + %7567 = bitcast <2 x bfloat> %7566 to i32, !dbg !291 + %7568 = insertelement <2 x bfloat> poison, bfloat %7507, i64 0, !dbg !291 + %7569 = insertelement <2 x bfloat> %7568, bfloat %7509, i64 1, !dbg !291 + %7570 = bitcast <2 x bfloat> %7569 to i32, !dbg !291 + %7571 = insertelement <2 x bfloat> poison, bfloat %7511, i64 0, !dbg !291 + %7572 = insertelement <2 x bfloat> %7571, bfloat %7513, i64 1, !dbg !291 + %7573 = bitcast <2 x bfloat> %7572 to i32, !dbg !291 + %7574 = insertelement <2 x bfloat> poison, bfloat %7515, i64 0, !dbg !291 + %7575 = insertelement <2 x bfloat> %7574, bfloat %7517, i64 1, !dbg !291 + %7576 = bitcast <2 x bfloat> %7575 to i32, !dbg !291 + %7577 = insertelement <2 x bfloat> poison, bfloat %7519, i64 0, !dbg !291 + %7578 = insertelement <2 x bfloat> %7577, bfloat %7521, i64 1, !dbg !291 + %7579 = bitcast <2 x bfloat> %7578 to i32, !dbg !291 + %7580 = insertelement <2 x bfloat> poison, bfloat %7523, i64 0, !dbg !291 + %7581 = insertelement <2 x bfloat> %7580, bfloat %7525, i64 1, !dbg !291 + %7582 = bitcast <2 x bfloat> %7581 to i32, !dbg !291 + %7583 = insertelement <2 x bfloat> poison, bfloat %7527, i64 0, !dbg !291 + %7584 = insertelement <2 x bfloat> %7583, bfloat %7529, i64 1, !dbg !291 + %7585 = bitcast <2 x bfloat> %7584 to i32, !dbg !291 + %7586 = insertelement <2 x bfloat> poison, bfloat %7531, i64 0, !dbg !291 + %7587 = insertelement <2 x bfloat> %7586, bfloat %7533, i64 1, !dbg !291 + %7588 = bitcast <2 x bfloat> %7587 to i32, !dbg !291 + %7589 = insertelement <2 x bfloat> poison, bfloat %7535, i64 0, !dbg !291 + %7590 = insertelement <2 x bfloat> %7589, bfloat %7537, i64 1, !dbg !291 + %7591 = bitcast <2 x bfloat> %7590 to i32, !dbg !291 + %7592 = insertelement <2 x bfloat> poison, bfloat %7539, i64 0, !dbg !291 + %7593 = insertelement <2 x bfloat> %7592, bfloat %7541, i64 1, !dbg !291 + %7594 = bitcast <2 x bfloat> %7593 to i32, !dbg !291 + %7595 = insertelement <2 x bfloat> poison, bfloat %7543, i64 0, !dbg !291 + %7596 = insertelement <2 x bfloat> %7595, bfloat %7545, i64 1, !dbg !291 + %7597 = bitcast <2 x bfloat> %7596 to i32, !dbg !291 + %7598 = insertelement <2 x bfloat> poison, bfloat %7547, i64 0, !dbg !291 + %7599 = insertelement <2 x bfloat> %7598, bfloat %7549, i64 1, !dbg !291 + %7600 = bitcast <2 x bfloat> %7599 to i32, !dbg !291 + %7601 = insertelement <2 x bfloat> poison, bfloat %7551, i64 0, !dbg !291 + %7602 = insertelement <2 x bfloat> %7601, bfloat %7553, i64 1, !dbg !291 + %7603 = bitcast <2 x bfloat> %7602 to i32, !dbg !291 + %7604 = insertelement <2 x bfloat> poison, bfloat %7555, i64 0, !dbg !291 + %7605 = insertelement <2 x bfloat> %7604, bfloat %7557, i64 1, !dbg !291 + %7606 = bitcast <2 x bfloat> %7605 to i32, !dbg !291 + %7607 = insertelement <2 x bfloat> poison, bfloat %7559, i64 0, !dbg !291 + %7608 = insertelement <2 x bfloat> %7607, bfloat %7561, i64 1, !dbg !291 + %7609 = bitcast <2 x bfloat> %7608 to i32, !dbg !291 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !291 + %7610 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %5652, float %5653, float %5654, float %5655, float %5656, float %5657, float %5658, float %5659, float %5660, float %5661, float %5662, float %5663, float %5664, float %5665, float %5666, float %5667, float %5668, float %5669, float %5670, float %5671, float %5672, float %5673, float %5674, float %5675, float %5676, float %5677, float %5678, float %5679, float %5680, float %5681, float %5682, float %5683, float %5684, float %5685, float %5686, float %5687, float %5688, float %5689, float %5690, float %5691, float %5692, float %5693, float %5694, float %5695, float %5696, float %5697, float %5698, float %5699, float %5700, float %5701, float %5702, float %5703, float %5704, float %5705, float %5706, float %5707, float %5708, float %5709, float %5710, float %5711, float %5712, float %5713, float %5714, float %5715, i32 %7564, i32 %7567, i32 %7570, i32 %7573, i64 %5815, i1 true) #3, !dbg !291 + %7611 = add i32 %5811, 2048, !dbg !291 + %7612 = lshr exact i32 %7611, 4, !dbg !291 + %7613 = and i32 %7612, 16383, !dbg !291 + %7614 = zext nneg i32 %7613 to i64, !dbg !291 + %7615 = or disjoint i64 %7614, 4611686293338849280, !dbg !291 + %7616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 0, !dbg !291 + %7617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 1, !dbg !291 + %7618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 2, !dbg !291 + %7619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 3, !dbg !291 + %7620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 4, !dbg !291 + %7621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 5, !dbg !291 + %7622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 6, !dbg !291 + %7623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 7, !dbg !291 + %7624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 8, !dbg !291 + %7625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 9, !dbg !291 + %7626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 10, !dbg !291 + %7627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 11, !dbg !291 + %7628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 12, !dbg !291 + %7629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 13, !dbg !291 + %7630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 14, !dbg !291 + %7631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 15, !dbg !291 + %7632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 16, !dbg !291 + %7633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 17, !dbg !291 + %7634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 18, !dbg !291 + %7635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 19, !dbg !291 + %7636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 20, !dbg !291 + %7637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 21, !dbg !291 + %7638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 22, !dbg !291 + %7639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 23, !dbg !291 + %7640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 24, !dbg !291 + %7641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 25, !dbg !291 + %7642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 26, !dbg !291 + %7643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 27, !dbg !291 + %7644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 28, !dbg !291 + %7645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 29, !dbg !291 + %7646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 30, !dbg !291 + %7647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 31, !dbg !291 + %7648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 32, !dbg !291 + %7649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 33, !dbg !291 + %7650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 34, !dbg !291 + %7651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 35, !dbg !291 + %7652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 36, !dbg !291 + %7653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 37, !dbg !291 + %7654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 38, !dbg !291 + %7655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 39, !dbg !291 + %7656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 40, !dbg !291 + %7657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 41, !dbg !291 + %7658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 42, !dbg !291 + %7659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 43, !dbg !291 + %7660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 44, !dbg !291 + %7661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 45, !dbg !291 + %7662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 46, !dbg !291 + %7663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 47, !dbg !291 + %7664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 48, !dbg !291 + %7665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 49, !dbg !291 + %7666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 50, !dbg !291 + %7667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 51, !dbg !291 + %7668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 52, !dbg !291 + %7669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 53, !dbg !291 + %7670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 54, !dbg !291 + %7671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 55, !dbg !291 + %7672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 56, !dbg !291 + %7673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 57, !dbg !291 + %7674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 58, !dbg !291 + %7675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 59, !dbg !291 + %7676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 60, !dbg !291 + %7677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 61, !dbg !291 + %7678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 62, !dbg !291 + %7679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7610, 63, !dbg !291 + %7680 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7616, float %7617, float %7618, float %7619, float %7620, float %7621, float %7622, float %7623, float %7624, float %7625, float %7626, float %7627, float %7628, float %7629, float %7630, float %7631, float %7632, float %7633, float %7634, float %7635, float %7636, float %7637, float %7638, float %7639, float %7640, float %7641, float %7642, float %7643, float %7644, float %7645, float %7646, float %7647, float %7648, float %7649, float %7650, float %7651, float %7652, float %7653, float %7654, float %7655, float %7656, float %7657, float %7658, float %7659, float %7660, float %7661, float %7662, float %7663, float %7664, float %7665, float %7666, float %7667, float %7668, float %7669, float %7670, float %7671, float %7672, float %7673, float %7674, float %7675, float %7676, float %7677, float %7678, float %7679, i32 %7576, i32 %7579, i32 %7582, i32 %7585, i64 %7615, i1 true) #3, !dbg !291 + %7681 = add i32 %5811, 4096, !dbg !291 + %7682 = lshr exact i32 %7681, 4, !dbg !291 + %7683 = and i32 %7682, 16383, !dbg !291 + %7684 = zext nneg i32 %7683 to i64, !dbg !291 + %7685 = or disjoint i64 %7684, 4611686293338849280, !dbg !291 + %7686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 0, !dbg !291 + %7687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 1, !dbg !291 + %7688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 2, !dbg !291 + %7689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 3, !dbg !291 + %7690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 4, !dbg !291 + %7691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 5, !dbg !291 + %7692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 6, !dbg !291 + %7693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 7, !dbg !291 + %7694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 8, !dbg !291 + %7695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 9, !dbg !291 + %7696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 10, !dbg !291 + %7697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 11, !dbg !291 + %7698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 12, !dbg !291 + %7699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 13, !dbg !291 + %7700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 14, !dbg !291 + %7701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 15, !dbg !291 + %7702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 16, !dbg !291 + %7703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 17, !dbg !291 + %7704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 18, !dbg !291 + %7705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 19, !dbg !291 + %7706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 20, !dbg !291 + %7707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 21, !dbg !291 + %7708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 22, !dbg !291 + %7709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 23, !dbg !291 + %7710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 24, !dbg !291 + %7711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 25, !dbg !291 + %7712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 26, !dbg !291 + %7713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 27, !dbg !291 + %7714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 28, !dbg !291 + %7715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 29, !dbg !291 + %7716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 30, !dbg !291 + %7717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 31, !dbg !291 + %7718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 32, !dbg !291 + %7719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 33, !dbg !291 + %7720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 34, !dbg !291 + %7721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 35, !dbg !291 + %7722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 36, !dbg !291 + %7723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 37, !dbg !291 + %7724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 38, !dbg !291 + %7725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 39, !dbg !291 + %7726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 40, !dbg !291 + %7727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 41, !dbg !291 + %7728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 42, !dbg !291 + %7729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 43, !dbg !291 + %7730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 44, !dbg !291 + %7731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 45, !dbg !291 + %7732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 46, !dbg !291 + %7733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 47, !dbg !291 + %7734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 48, !dbg !291 + %7735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 49, !dbg !291 + %7736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 50, !dbg !291 + %7737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 51, !dbg !291 + %7738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 52, !dbg !291 + %7739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 53, !dbg !291 + %7740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 54, !dbg !291 + %7741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 55, !dbg !291 + %7742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 56, !dbg !291 + %7743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 57, !dbg !291 + %7744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 58, !dbg !291 + %7745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 59, !dbg !291 + %7746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 60, !dbg !291 + %7747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 61, !dbg !291 + %7748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 62, !dbg !291 + %7749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7680, 63, !dbg !291 + %7750 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7686, float %7687, float %7688, float %7689, float %7690, float %7691, float %7692, float %7693, float %7694, float %7695, float %7696, float %7697, float %7698, float %7699, float %7700, float %7701, float %7702, float %7703, float %7704, float %7705, float %7706, float %7707, float %7708, float %7709, float %7710, float %7711, float %7712, float %7713, float %7714, float %7715, float %7716, float %7717, float %7718, float %7719, float %7720, float %7721, float %7722, float %7723, float %7724, float %7725, float %7726, float %7727, float %7728, float %7729, float %7730, float %7731, float %7732, float %7733, float %7734, float %7735, float %7736, float %7737, float %7738, float %7739, float %7740, float %7741, float %7742, float %7743, float %7744, float %7745, float %7746, float %7747, float %7748, float %7749, i32 %7588, i32 %7591, i32 %7594, i32 %7597, i64 %7685, i1 true) #3, !dbg !291 + %7751 = add i32 %5811, 6144, !dbg !291 + %7752 = lshr exact i32 %7751, 4, !dbg !291 + %7753 = and i32 %7752, 16383, !dbg !291 + %7754 = zext nneg i32 %7753 to i64, !dbg !291 + %7755 = or disjoint i64 %7754, 4611686293338849280, !dbg !291 + %7756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 0, !dbg !291 + %7757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 1, !dbg !291 + %7758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 2, !dbg !291 + %7759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 3, !dbg !291 + %7760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 4, !dbg !291 + %7761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 5, !dbg !291 + %7762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 6, !dbg !291 + %7763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 7, !dbg !291 + %7764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 8, !dbg !291 + %7765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 9, !dbg !291 + %7766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 10, !dbg !291 + %7767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 11, !dbg !291 + %7768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 12, !dbg !291 + %7769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 13, !dbg !291 + %7770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 14, !dbg !291 + %7771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 15, !dbg !291 + %7772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 16, !dbg !291 + %7773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 17, !dbg !291 + %7774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 18, !dbg !291 + %7775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 19, !dbg !291 + %7776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 20, !dbg !291 + %7777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 21, !dbg !291 + %7778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 22, !dbg !291 + %7779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 23, !dbg !291 + %7780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 24, !dbg !291 + %7781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 25, !dbg !291 + %7782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 26, !dbg !291 + %7783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 27, !dbg !291 + %7784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 28, !dbg !291 + %7785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 29, !dbg !291 + %7786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 30, !dbg !291 + %7787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 31, !dbg !291 + %7788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 32, !dbg !291 + %7789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 33, !dbg !291 + %7790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 34, !dbg !291 + %7791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 35, !dbg !291 + %7792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 36, !dbg !291 + %7793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 37, !dbg !291 + %7794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 38, !dbg !291 + %7795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 39, !dbg !291 + %7796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 40, !dbg !291 + %7797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 41, !dbg !291 + %7798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 42, !dbg !291 + %7799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 43, !dbg !291 + %7800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 44, !dbg !291 + %7801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 45, !dbg !291 + %7802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 46, !dbg !291 + %7803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 47, !dbg !291 + %7804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 48, !dbg !291 + %7805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 49, !dbg !291 + %7806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 50, !dbg !291 + %7807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 51, !dbg !291 + %7808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 52, !dbg !291 + %7809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 53, !dbg !291 + %7810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 54, !dbg !291 + %7811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 55, !dbg !291 + %7812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 56, !dbg !291 + %7813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 57, !dbg !291 + %7814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 58, !dbg !291 + %7815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 59, !dbg !291 + %7816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 60, !dbg !291 + %7817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 61, !dbg !291 + %7818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 62, !dbg !291 + %7819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7750, 63, !dbg !291 + %7820 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %7756, float %7757, float %7758, float %7759, float %7760, float %7761, float %7762, float %7763, float %7764, float %7765, float %7766, float %7767, float %7768, float %7769, float %7770, float %7771, float %7772, float %7773, float %7774, float %7775, float %7776, float %7777, float %7778, float %7779, float %7780, float %7781, float %7782, float %7783, float %7784, float %7785, float %7786, float %7787, float %7788, float %7789, float %7790, float %7791, float %7792, float %7793, float %7794, float %7795, float %7796, float %7797, float %7798, float %7799, float %7800, float %7801, float %7802, float %7803, float %7804, float %7805, float %7806, float %7807, float %7808, float %7809, float %7810, float %7811, float %7812, float %7813, float %7814, float %7815, float %7816, float %7817, float %7818, float %7819, i32 %7600, i32 %7603, i32 %7606, i32 %7609, i64 %7755, i1 true) #3, !dbg !291 + %7821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 0, !dbg !291 + %7822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 1, !dbg !291 + %7823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 2, !dbg !291 + %7824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 3, !dbg !291 + %7825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 4, !dbg !291 + %7826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 5, !dbg !291 + %7827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 6, !dbg !291 + %7828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 7, !dbg !291 + %7829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 8, !dbg !291 + %7830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 9, !dbg !291 + %7831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 10, !dbg !291 + %7832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 11, !dbg !291 + %7833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 12, !dbg !291 + %7834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 13, !dbg !291 + %7835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 14, !dbg !291 + %7836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 15, !dbg !291 + %7837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 16, !dbg !291 + %7838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 17, !dbg !291 + %7839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 18, !dbg !291 + %7840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 19, !dbg !291 + %7841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 20, !dbg !291 + %7842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 21, !dbg !291 + %7843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 22, !dbg !291 + %7844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 23, !dbg !291 + %7845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 24, !dbg !291 + %7846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 25, !dbg !291 + %7847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 26, !dbg !291 + %7848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 27, !dbg !291 + %7849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 28, !dbg !291 + %7850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 29, !dbg !291 + %7851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 30, !dbg !291 + %7852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 31, !dbg !291 + %7853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 32, !dbg !291 + %7854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 33, !dbg !291 + %7855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 34, !dbg !291 + %7856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 35, !dbg !291 + %7857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 36, !dbg !291 + %7858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 37, !dbg !291 + %7859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 38, !dbg !291 + %7860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 39, !dbg !291 + %7861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 40, !dbg !291 + %7862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 41, !dbg !291 + %7863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 42, !dbg !291 + %7864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 43, !dbg !291 + %7865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 44, !dbg !291 + %7866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 45, !dbg !291 + %7867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 46, !dbg !291 + %7868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 47, !dbg !291 + %7869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 48, !dbg !291 + %7870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 49, !dbg !291 + %7871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 50, !dbg !291 + %7872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 51, !dbg !291 + %7873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 52, !dbg !291 + %7874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 53, !dbg !291 + %7875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 54, !dbg !291 + %7876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 55, !dbg !291 + %7877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 56, !dbg !291 + %7878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 57, !dbg !291 + %7879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 58, !dbg !291 + %7880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 59, !dbg !291 + %7881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 60, !dbg !291 + %7882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 61, !dbg !291 + %7883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 62, !dbg !291 + %7884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %7820, 63, !dbg !291 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !291 + %7885 = add i32 %.pn2571660, %5575, !dbg !292 + %7886 = add i32 %.pn2531661, %5575, !dbg !292 + %7887 = add i32 %.pn2491662, %5575, !dbg !292 + %7888 = add i32 %.pn2451663, %5575, !dbg !292 + %7889 = add i32 %.pn2411664, %5575, !dbg !292 + %7890 = add i32 %.pn2371665, %5575, !dbg !292 + %7891 = add i32 %.pn2331666, %5575, !dbg !292 + %7892 = add i32 %.pn2291667, %5575, !dbg !292 + %7893 = insertelement <2 x i32> poison, i32 %5575, i64 0, !dbg !292 + %7894 = shufflevector <2 x i32> %7893, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !292 + %7895 = add <2 x i32> %7894, %5717, !dbg !292 + %7896 = add <2 x i32> %7894, %5718, !dbg !292 + %7897 = add nuw nsw i32 %5716, 1, !dbg !223 + %7898 = lshr i32 %7897, 1, !dbg !293 + %7899 = zext nneg i32 %7898 to i64, !dbg !294 + %7900 = getelementptr i32, ptr addrspace(1) %4916, i64 %7899, !dbg !294 + %7901 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !295 + %7902 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7900, i64 %7901, i1 %5720) #3, !dbg !295 + %7903 = add nuw nsw i32 %7898, 1, !dbg !296 + %7904 = icmp slt i32 %7903, %4921, !dbg !297 + %7905 = getelementptr i8, ptr addrspace(1) %7900, i64 4, !dbg !298 + %7906 = and i1 %5720, %7904, !dbg !223 + %7907 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !299 + %7908 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %7905, i64 %7907, i1 %7906) #3, !dbg !299 + %7909 = and i32 %5716, 1, !dbg !300 + %7910 = sub i32 %7908, %7902, !dbg !301 + %7911 = shl i32 %7910, 7, !dbg !302 + %7912 = add i32 %7911, -64, !dbg !303 + %7913 = xor i32 %7909, 1, !dbg !304 + %7914 = mul nuw nsw i32 %7912, %7913, !dbg !304 + %7915 = shl nuw nsw i32 %7909, 6, !dbg !305 + %7916 = add i32 %7914, %7915, !dbg !306 + %7917 = shl i32 %7916, 12, !dbg !307 + %7918 = sext i32 %7917 to i64, !dbg !265 + %7919 = getelementptr bfloat, ptr addrspace(1) %.pn1291644, i64 %7918, !dbg !265 + %7920 = getelementptr bfloat, ptr addrspace(1) %.pn1131645, i64 %7918, !dbg !265 + %7921 = getelementptr bfloat, ptr addrspace(1) %.pn971646, i64 %7918, !dbg !265 + %7922 = getelementptr bfloat, ptr addrspace(1) %.pn811647, i64 %7918, !dbg !265 + %7923 = shl i32 %7916, 7, !dbg !308 + %7924 = sext i32 %7923 to i64, !dbg !266 + %7925 = getelementptr bfloat, ptr addrspace(1) %.pn1931648, i64 %7924, !dbg !266 + %7926 = getelementptr bfloat, ptr addrspace(1) %.pn1771649, i64 %7924, !dbg !266 + %7927 = getelementptr bfloat, ptr addrspace(1) %.pn1611650, i64 %7924, !dbg !266 + %7928 = getelementptr bfloat, ptr addrspace(1) %.pn1451651, i64 %7924, !dbg !266 + %7929 = add i32 %7916, %.pn2251652, !dbg !292 + %7930 = add i32 %7916, %.pn2211653, !dbg !292 + %7931 = add i32 %7916, %.pn2171654, !dbg !292 + %7932 = add i32 %7916, %.pn2131655, !dbg !292 + %7933 = add i32 %7916, %.pn2091656, !dbg !292 + %7934 = add i32 %7916, %.pn2051657, !dbg !292 + %7935 = add i32 %7916, %.pn2011658, !dbg !292 + %7936 = add i32 %7916, %.pn1971659, !dbg !292 + %7937 = add i32 %7916, %5584, !dbg !292 + %7938 = add i32 %7916, %5585, !dbg !292 + %7939 = add i32 %7916, %5586, !dbg !292 + %7940 = add i32 %7916, %5587, !dbg !292 + %7941 = add i32 %7916, %5580, !dbg !292 + %7942 = add i32 %7916, %5581, !dbg !292 + %7943 = add i32 %7916, %5582, !dbg !292 + %7944 = add i32 %7916, %5583, !dbg !292 + %7945 = add i32 %5577, 1, !dbg !223 + %7946 = icmp sgt i32 %7945, 1, !dbg !223 + %7947 = select i1 %7946, i32 0, i32 %7945, !dbg !223 + %7948 = add i32 %5579, 1, !dbg !223 + %7949 = icmp sgt i32 %7948, 2, !dbg !223 + %7950 = select i1 %7949, i32 0, i32 %7948, !dbg !223 + %7951 = icmp slt i32 %7937, 2048, !dbg !267 + %7952 = icmp slt i32 %7938, 2048, !dbg !267 + %7953 = icmp slt i32 %7939, 2048, !dbg !267 + %7954 = icmp slt i32 %7940, 2048, !dbg !267 + %7955 = shl i32 %7950, 13, !dbg !259 + %7956 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %7955, !dbg !259 + %7957 = and i1 %5719, %7951, !dbg !223 + %7958 = and i1 %5719, %7952, !dbg !223 + %7959 = and i1 %5719, %7953, !dbg !223 + %7960 = and i1 %5719, %7954, !dbg !223 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !259 + %7961 = getelementptr inbounds nuw i8, ptr addrspace(3) %7956, i32 %5027, !dbg !259 + %7962 = select i1 %7957, i32 16, i32 0, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %7961, ptr addrspace(1) %7919, i32 %7962) #3, !dbg !259 + %7963 = getelementptr inbounds nuw i8, ptr addrspace(3) %7956, i32 %5030, !dbg !259 + %7964 = select i1 %7958, i32 16, i32 0, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7963, ptr addrspace(1) %7920, i32 %7964) #3, !dbg !259 + %7965 = getelementptr inbounds nuw i8, ptr addrspace(3) %7956, i32 %5033, !dbg !259 + %7966 = select i1 %7959, i32 16, i32 0, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7965, ptr addrspace(1) %7921, i32 %7966) #3, !dbg !259 + %7967 = getelementptr inbounds nuw i8, ptr addrspace(3) %7956, i32 %5036, !dbg !259 + %7968 = select i1 %7960, i32 16, i32 0, !dbg !259 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %7967, ptr addrspace(1) %7922, i32 %7968) #3, !dbg !259 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !259 + %7969 = icmp slt i32 %7929, 2048, !dbg !309 + %7970 = icmp slt i32 %7930, 2048, !dbg !309 + %7971 = icmp slt i32 %7931, 2048, !dbg !309 + %7972 = icmp slt i32 %7932, 2048, !dbg !309 + %7973 = icmp slt i32 %7933, 2048, !dbg !309 + %7974 = icmp slt i32 %7934, 2048, !dbg !309 + %7975 = icmp slt i32 %7935, 2048, !dbg !309 + %7976 = icmp slt i32 %7936, 2048, !dbg !309 + %7977 = sext i32 %7929 to i64, !dbg !260 + %7978 = getelementptr float, ptr addrspace(1) %5517, i64 %7977, !dbg !260 + %7979 = sext i32 %7930 to i64, !dbg !260 + %7980 = getelementptr float, ptr addrspace(1) %5517, i64 %7979, !dbg !260 + %7981 = sext i32 %7931 to i64, !dbg !260 + %7982 = getelementptr float, ptr addrspace(1) %5517, i64 %7981, !dbg !260 + %7983 = sext i32 %7932 to i64, !dbg !260 + %7984 = getelementptr float, ptr addrspace(1) %5517, i64 %7983, !dbg !260 + %7985 = sext i32 %7933 to i64, !dbg !260 + %7986 = getelementptr float, ptr addrspace(1) %5517, i64 %7985, !dbg !260 + %7987 = sext i32 %7934 to i64, !dbg !260 + %7988 = getelementptr float, ptr addrspace(1) %5517, i64 %7987, !dbg !260 + %7989 = sext i32 %7935 to i64, !dbg !260 + %7990 = getelementptr float, ptr addrspace(1) %5517, i64 %7989, !dbg !260 + %7991 = sext i32 %7936 to i64, !dbg !260 + %7992 = getelementptr float, ptr addrspace(1) %5517, i64 %7991, !dbg !260 + %7993 = shl i32 %7947, 6, !dbg !261 + %7994 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %7993, !dbg !261 + %7995 = and i1 %5719, %7969, !dbg !223 + %7996 = and i1 %5719, %7970, !dbg !223 + %7997 = and i1 %5719, %7971, !dbg !223 + %7998 = and i1 %5719, %7972, !dbg !223 + %7999 = and i1 %5719, %7973, !dbg !223 + %8000 = and i1 %5719, %7974, !dbg !223 + %8001 = and i1 %5719, %7975, !dbg !223 + %8002 = and i1 %5719, %7976, !dbg !223 + %8003 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5065, !dbg !261 + %8004 = select i1 %7995, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %8003, ptr addrspace(1) %7978, i32 %8004, i1 %5064) #3, !dbg !261 + %8005 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5068, !dbg !261 + %8006 = select i1 %7996, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8005, ptr addrspace(1) %7980, i32 %8006, i1 %5064) #3, !dbg !261 + %8007 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5071, !dbg !261 + %8008 = select i1 %7997, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8007, ptr addrspace(1) %7982, i32 %8008, i1 %5064) #3, !dbg !261 + %8009 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5074, !dbg !261 + %8010 = select i1 %7998, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8009, ptr addrspace(1) %7984, i32 %8010, i1 %5064) #3, !dbg !261 + %8011 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5077, !dbg !261 + %8012 = select i1 %7999, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8011, ptr addrspace(1) %7986, i32 %8012, i1 %5064) #3, !dbg !261 + %8013 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5080, !dbg !261 + %8014 = select i1 %8000, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8013, ptr addrspace(1) %7988, i32 %8014, i1 %5064) #3, !dbg !261 + %8015 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5083, !dbg !261 + %8016 = select i1 %8001, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8015, ptr addrspace(1) %7990, i32 %8016, i1 %5064) #3, !dbg !261 + %8017 = getelementptr inbounds nuw i8, ptr addrspace(3) %7994, i32 %5086, !dbg !261 + %8018 = select i1 %8002, i32 8, i32 0, !dbg !261 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8017, ptr addrspace(1) %7992, i32 %8018, i1 %5064) #3, !dbg !261 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !261 + %8019 = icmp slt i32 %7941, 2048, !dbg !310 + %8020 = icmp slt i32 %7942, 2048, !dbg !310 + %8021 = icmp slt i32 %7943, 2048, !dbg !310 + %8022 = icmp slt i32 %7944, 2048, !dbg !310 + %8023 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %7955, !dbg !262 + %8024 = and i1 %5719, %8019, !dbg !223 + %8025 = and i1 %5719, %8020, !dbg !223 + %8026 = and i1 %5719, %8021, !dbg !223 + %8027 = and i1 %5719, %8022, !dbg !223 + %8028 = getelementptr inbounds nuw i8, ptr addrspace(3) %8023, i32 %5027, !dbg !262 + %8029 = select i1 %8024, i32 16, i32 0, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %8028, ptr addrspace(1) %7925, i32 %8029) #3, !dbg !262 + %8030 = getelementptr inbounds nuw i8, ptr addrspace(3) %8023, i32 %5030, !dbg !262 + %8031 = select i1 %8025, i32 16, i32 0, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8030, ptr addrspace(1) %7926, i32 %8031) #3, !dbg !262 + %8032 = getelementptr inbounds nuw i8, ptr addrspace(3) %8023, i32 %5033, !dbg !262 + %8033 = select i1 %8026, i32 16, i32 0, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8032, ptr addrspace(1) %7927, i32 %8033) #3, !dbg !262 + %8034 = getelementptr inbounds nuw i8, ptr addrspace(3) %8023, i32 %5036, !dbg !262 + %8035 = select i1 %8027, i32 16, i32 0, !dbg !262 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %8034, ptr addrspace(1) %7928, i32 %8035) #3, !dbg !262 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !262 + %8036 = getelementptr float, ptr addrspace(1) %5518, i64 %7977, !dbg !263 + %8037 = getelementptr float, ptr addrspace(1) %5518, i64 %7979, !dbg !263 + %8038 = getelementptr float, ptr addrspace(1) %5518, i64 %7981, !dbg !263 + %8039 = getelementptr float, ptr addrspace(1) %5518, i64 %7983, !dbg !263 + %8040 = getelementptr float, ptr addrspace(1) %5518, i64 %7985, !dbg !263 + %8041 = getelementptr float, ptr addrspace(1) %5518, i64 %7987, !dbg !263 + %8042 = getelementptr float, ptr addrspace(1) %5518, i64 %7989, !dbg !263 + %8043 = getelementptr float, ptr addrspace(1) %5518, i64 %7991, !dbg !263 + %8044 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %7993, !dbg !264 + %8045 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5065, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %8045, ptr addrspace(1) %8036, i32 %8004, i1 %5064) #3, !dbg !264 + %8046 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5068, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8046, ptr addrspace(1) %8037, i32 %8006, i1 %5064) #3, !dbg !264 + %8047 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5071, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8047, ptr addrspace(1) %8038, i32 %8008, i1 %5064) #3, !dbg !264 + %8048 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5074, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8048, ptr addrspace(1) %8039, i32 %8010, i1 %5064) #3, !dbg !264 + %8049 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5077, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8049, ptr addrspace(1) %8040, i32 %8012, i1 %5064) #3, !dbg !264 + %8050 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5080, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8050, ptr addrspace(1) %8041, i32 %8014, i1 %5064) #3, !dbg !264 + %8051 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5083, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8051, ptr addrspace(1) %8042, i32 %8016, i1 %5064) #3, !dbg !264 + %8052 = getelementptr inbounds nuw i8, ptr addrspace(3) %8044, i32 %5086, !dbg !264 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %8052, ptr addrspace(1) %8043, i32 %8018, i1 %5064) #3, !dbg !264 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !264 + %exitcond2212.not = icmp eq i32 %7897, %smax2211, !dbg !223 + br i1 %exitcond2212.not, label %._crit_edge1670, label %.lr.ph1669, !dbg !223 + +._crit_edge1670: ; preds = %__nv_exp2f.exit1380, %5375 + %8053 = phi float [ %5376, %5375 ], [ %7821, %__nv_exp2f.exit1380 ] + %8054 = phi float [ %5377, %5375 ], [ %7822, %__nv_exp2f.exit1380 ] + %8055 = phi float [ %5378, %5375 ], [ %7823, %__nv_exp2f.exit1380 ] + %8056 = phi float [ %5379, %5375 ], [ %7824, %__nv_exp2f.exit1380 ] + %8057 = phi float [ %5380, %5375 ], [ %7825, %__nv_exp2f.exit1380 ] + %8058 = phi float [ %5381, %5375 ], [ %7826, %__nv_exp2f.exit1380 ] + %8059 = phi float [ %5382, %5375 ], [ %7827, %__nv_exp2f.exit1380 ] + %8060 = phi float [ %5383, %5375 ], [ %7828, %__nv_exp2f.exit1380 ] + %8061 = phi float [ %5384, %5375 ], [ %7829, %__nv_exp2f.exit1380 ] + %8062 = phi float [ %5385, %5375 ], [ %7830, %__nv_exp2f.exit1380 ] + %8063 = phi float [ %5386, %5375 ], [ %7831, %__nv_exp2f.exit1380 ] + %8064 = phi float [ %5387, %5375 ], [ %7832, %__nv_exp2f.exit1380 ] + %8065 = phi float [ %5388, %5375 ], [ %7833, %__nv_exp2f.exit1380 ] + %8066 = phi float [ %5389, %5375 ], [ %7834, %__nv_exp2f.exit1380 ] + %8067 = phi float [ %5390, %5375 ], [ %7835, %__nv_exp2f.exit1380 ] + %8068 = phi float [ %5391, %5375 ], [ %7836, %__nv_exp2f.exit1380 ] + %8069 = phi float [ %5392, %5375 ], [ %7837, %__nv_exp2f.exit1380 ] + %8070 = phi float [ %5393, %5375 ], [ %7838, %__nv_exp2f.exit1380 ] + %8071 = phi float [ %5394, %5375 ], [ %7839, %__nv_exp2f.exit1380 ] + %8072 = phi float [ %5395, %5375 ], [ %7840, %__nv_exp2f.exit1380 ] + %8073 = phi float [ %5396, %5375 ], [ %7841, %__nv_exp2f.exit1380 ] + %8074 = phi float [ %5397, %5375 ], [ %7842, %__nv_exp2f.exit1380 ] + %8075 = phi float [ %5398, %5375 ], [ %7843, %__nv_exp2f.exit1380 ] + %8076 = phi float [ %5399, %5375 ], [ %7844, %__nv_exp2f.exit1380 ] + %8077 = phi float [ %5400, %5375 ], [ %7845, %__nv_exp2f.exit1380 ] + %8078 = phi float [ %5401, %5375 ], [ %7846, %__nv_exp2f.exit1380 ] + %8079 = phi float [ %5402, %5375 ], [ %7847, %__nv_exp2f.exit1380 ] + %8080 = phi float [ %5403, %5375 ], [ %7848, %__nv_exp2f.exit1380 ] + %8081 = phi float [ %5404, %5375 ], [ %7849, %__nv_exp2f.exit1380 ] + %8082 = phi float [ %5405, %5375 ], [ %7850, %__nv_exp2f.exit1380 ] + %8083 = phi float [ %5406, %5375 ], [ %7851, %__nv_exp2f.exit1380 ] + %8084 = phi float [ %5407, %5375 ], [ %7852, %__nv_exp2f.exit1380 ] + %8085 = phi float [ %5408, %5375 ], [ %7853, %__nv_exp2f.exit1380 ] + %8086 = phi float [ %5409, %5375 ], [ %7854, %__nv_exp2f.exit1380 ] + %8087 = phi float [ %5410, %5375 ], [ %7855, %__nv_exp2f.exit1380 ] + %8088 = phi float [ %5411, %5375 ], [ %7856, %__nv_exp2f.exit1380 ] + %8089 = phi float [ %5412, %5375 ], [ %7857, %__nv_exp2f.exit1380 ] + %8090 = phi float [ %5413, %5375 ], [ %7858, %__nv_exp2f.exit1380 ] + %8091 = phi float [ %5414, %5375 ], [ %7859, %__nv_exp2f.exit1380 ] + %8092 = phi float [ %5415, %5375 ], [ %7860, %__nv_exp2f.exit1380 ] + %8093 = phi float [ %5416, %5375 ], [ %7861, %__nv_exp2f.exit1380 ] + %8094 = phi float [ %5417, %5375 ], [ %7862, %__nv_exp2f.exit1380 ] + %8095 = phi float [ %5418, %5375 ], [ %7863, %__nv_exp2f.exit1380 ] + %8096 = phi float [ %5419, %5375 ], [ %7864, %__nv_exp2f.exit1380 ] + %8097 = phi float [ %5420, %5375 ], [ %7865, %__nv_exp2f.exit1380 ] + %8098 = phi float [ %5421, %5375 ], [ %7866, %__nv_exp2f.exit1380 ] + %8099 = phi float [ %5422, %5375 ], [ %7867, %__nv_exp2f.exit1380 ] + %8100 = phi float [ %5423, %5375 ], [ %7868, %__nv_exp2f.exit1380 ] + %8101 = phi float [ %5424, %5375 ], [ %7869, %__nv_exp2f.exit1380 ] + %8102 = phi float [ %5425, %5375 ], [ %7870, %__nv_exp2f.exit1380 ] + %8103 = phi float [ %5426, %5375 ], [ %7871, %__nv_exp2f.exit1380 ] + %8104 = phi float [ %5427, %5375 ], [ %7872, %__nv_exp2f.exit1380 ] + %8105 = phi float [ %5428, %5375 ], [ %7873, %__nv_exp2f.exit1380 ] + %8106 = phi float [ %5429, %5375 ], [ %7874, %__nv_exp2f.exit1380 ] + %8107 = phi float [ %5430, %5375 ], [ %7875, %__nv_exp2f.exit1380 ] + %8108 = phi float [ %5431, %5375 ], [ %7876, %__nv_exp2f.exit1380 ] + %8109 = phi float [ %5432, %5375 ], [ %7877, %__nv_exp2f.exit1380 ] + %8110 = phi float [ %5433, %5375 ], [ %7878, %__nv_exp2f.exit1380 ] + %8111 = phi float [ %5434, %5375 ], [ %7879, %__nv_exp2f.exit1380 ] + %8112 = phi float [ %5435, %5375 ], [ %7880, %__nv_exp2f.exit1380 ] + %8113 = phi float [ %5436, %5375 ], [ %7881, %__nv_exp2f.exit1380 ] + %8114 = phi float [ %5437, %5375 ], [ %7882, %__nv_exp2f.exit1380 ] + %8115 = phi float [ %5438, %5375 ], [ %7883, %__nv_exp2f.exit1380 ] + %8116 = phi float [ %5439, %5375 ], [ %7884, %__nv_exp2f.exit1380 ] + %8117 = phi float [ %5440, %5375 ], [ %6965, %__nv_exp2f.exit1380 ] + %8118 = phi float [ %5441, %5375 ], [ %6966, %__nv_exp2f.exit1380 ] + %8119 = phi float [ %5442, %5375 ], [ %6967, %__nv_exp2f.exit1380 ] + %8120 = phi float [ %5443, %5375 ], [ %6968, %__nv_exp2f.exit1380 ] + %8121 = phi float [ %5444, %5375 ], [ %6969, %__nv_exp2f.exit1380 ] + %8122 = phi float [ %5445, %5375 ], [ %6970, %__nv_exp2f.exit1380 ] + %8123 = phi float [ %5446, %5375 ], [ %6971, %__nv_exp2f.exit1380 ] + %8124 = phi float [ %5447, %5375 ], [ %6972, %__nv_exp2f.exit1380 ] + %8125 = phi float [ %5448, %5375 ], [ %6973, %__nv_exp2f.exit1380 ] + %8126 = phi float [ %5449, %5375 ], [ %6974, %__nv_exp2f.exit1380 ] + %8127 = phi float [ %5450, %5375 ], [ %6975, %__nv_exp2f.exit1380 ] + %8128 = phi float [ %5451, %5375 ], [ %6976, %__nv_exp2f.exit1380 ] + %8129 = phi float [ %5452, %5375 ], [ %6977, %__nv_exp2f.exit1380 ] + %8130 = phi float [ %5453, %5375 ], [ %6978, %__nv_exp2f.exit1380 ] + %8131 = phi float [ %5454, %5375 ], [ %6979, %__nv_exp2f.exit1380 ] + %8132 = phi float [ %5455, %5375 ], [ %6980, %__nv_exp2f.exit1380 ] + %8133 = phi float [ %5456, %5375 ], [ %6981, %__nv_exp2f.exit1380 ] + %8134 = phi float [ %5457, %5375 ], [ %6982, %__nv_exp2f.exit1380 ] + %8135 = phi float [ %5458, %5375 ], [ %6983, %__nv_exp2f.exit1380 ] + %8136 = phi float [ %5459, %5375 ], [ %6984, %__nv_exp2f.exit1380 ] + %8137 = phi float [ %5460, %5375 ], [ %6985, %__nv_exp2f.exit1380 ] + %8138 = phi float [ %5461, %5375 ], [ %6986, %__nv_exp2f.exit1380 ] + %8139 = phi float [ %5462, %5375 ], [ %6987, %__nv_exp2f.exit1380 ] + %8140 = phi float [ %5463, %5375 ], [ %6988, %__nv_exp2f.exit1380 ] + %8141 = phi float [ %5464, %5375 ], [ %6989, %__nv_exp2f.exit1380 ] + %8142 = phi float [ %5465, %5375 ], [ %6990, %__nv_exp2f.exit1380 ] + %8143 = phi float [ %5466, %5375 ], [ %6991, %__nv_exp2f.exit1380 ] + %8144 = phi float [ %5467, %5375 ], [ %6992, %__nv_exp2f.exit1380 ] + %8145 = phi float [ %5468, %5375 ], [ %6993, %__nv_exp2f.exit1380 ] + %8146 = phi float [ %5469, %5375 ], [ %6994, %__nv_exp2f.exit1380 ] + %8147 = phi float [ %5470, %5375 ], [ %6995, %__nv_exp2f.exit1380 ] + %8148 = phi float [ %5471, %5375 ], [ %6996, %__nv_exp2f.exit1380 ] + %8149 = phi float [ %5472, %5375 ], [ %6997, %__nv_exp2f.exit1380 ] + %8150 = phi float [ %5473, %5375 ], [ %6998, %__nv_exp2f.exit1380 ] + %8151 = phi float [ %5474, %5375 ], [ %6999, %__nv_exp2f.exit1380 ] + %8152 = phi float [ %5475, %5375 ], [ %7000, %__nv_exp2f.exit1380 ] + %8153 = phi float [ %5476, %5375 ], [ %7001, %__nv_exp2f.exit1380 ] + %8154 = phi float [ %5477, %5375 ], [ %7002, %__nv_exp2f.exit1380 ] + %8155 = phi float [ %5478, %5375 ], [ %7003, %__nv_exp2f.exit1380 ] + %8156 = phi float [ %5479, %5375 ], [ %7004, %__nv_exp2f.exit1380 ] + %8157 = phi float [ %5480, %5375 ], [ %7005, %__nv_exp2f.exit1380 ] + %8158 = phi float [ %5481, %5375 ], [ %7006, %__nv_exp2f.exit1380 ] + %8159 = phi float [ %5482, %5375 ], [ %7007, %__nv_exp2f.exit1380 ] + %8160 = phi float [ %5483, %5375 ], [ %7008, %__nv_exp2f.exit1380 ] + %8161 = phi float [ %5484, %5375 ], [ %7009, %__nv_exp2f.exit1380 ] + %8162 = phi float [ %5485, %5375 ], [ %7010, %__nv_exp2f.exit1380 ] + %8163 = phi float [ %5486, %5375 ], [ %7011, %__nv_exp2f.exit1380 ] + %8164 = phi float [ %5487, %5375 ], [ %7012, %__nv_exp2f.exit1380 ] + %8165 = phi float [ %5488, %5375 ], [ %7013, %__nv_exp2f.exit1380 ] + %8166 = phi float [ %5489, %5375 ], [ %7014, %__nv_exp2f.exit1380 ] + %8167 = phi float [ %5490, %5375 ], [ %7015, %__nv_exp2f.exit1380 ] + %8168 = phi float [ %5491, %5375 ], [ %7016, %__nv_exp2f.exit1380 ] + %8169 = phi float [ %5492, %5375 ], [ %7017, %__nv_exp2f.exit1380 ] + %8170 = phi float [ %5493, %5375 ], [ %7018, %__nv_exp2f.exit1380 ] + %8171 = phi float [ %5494, %5375 ], [ %7019, %__nv_exp2f.exit1380 ] + %8172 = phi float [ %5495, %5375 ], [ %7020, %__nv_exp2f.exit1380 ] + %8173 = phi float [ %5496, %5375 ], [ %7021, %__nv_exp2f.exit1380 ] + %8174 = phi float [ %5497, %5375 ], [ %7022, %__nv_exp2f.exit1380 ] + %8175 = phi float [ %5498, %5375 ], [ %7023, %__nv_exp2f.exit1380 ] + %8176 = phi float [ %5499, %5375 ], [ %7024, %__nv_exp2f.exit1380 ] + %8177 = phi float [ %5500, %5375 ], [ %7025, %__nv_exp2f.exit1380 ] + %8178 = phi float [ %5501, %5375 ], [ %7026, %__nv_exp2f.exit1380 ] + %8179 = phi float [ %5502, %5375 ], [ %7027, %__nv_exp2f.exit1380 ] + %8180 = phi float [ %5503, %5375 ], [ %7028, %__nv_exp2f.exit1380 ] + %8181 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %8117, float %8118, float %8119, float %8120, float %8121, float %8122, float %8123, float %8124, float %8125, float %8126, float %8127, float %8128, float %8129, float %8130, float %8131, float %8132, float %8133, float %8134, float %8135, float %8136, float %8137, float %8138, float %8139, float %8140, float %8141, float %8142, float %8143, float %8144, float %8145, float %8146, float %8147, float %8148, float %8149, float %8150, float %8151, float %8152, float %8153, float %8154, float %8155, float %8156, float %8157, float %8158, float %8159, float %8160, float %8161, float %8162, float %8163, float %8164, float %8165, float %8166, float %8167, float %8168, float %8169, float %8170, float %8171, float %8172, float %8173, float %8174, float %8175, float %8176, float %8177, float %8178, float %8179, float %8180, float %8053, float %8054, float %8055, float %8056, float %8057, float %8058, float %8059, float %8060, float %8061, float %8062, float %8063, float %8064, float %8065, float %8066, float %8067, float %8068, float %8069, float %8070, float %8071, float %8072, float %8073, float %8074, float %8075, float %8076, float %8077, float %8078, float %8079, float %8080, float %8081, float %8082, float %8083, float %8084, float %8085, float %8086, float %8087, float %8088, float %8089, float %8090, float %8091, float %8092, float %8093, float %8094, float %8095, float %8096, float %8097, float %8098, float %8099, float %8100, float %8101, float %8102, float %8103, float %8104, float %8105, float %8106, float %8107, float %8108, float %8109, float %8110, float %8111, float %8112, float %8113, float %8114, float %8115, float %8116) #3, !dbg !223 + %8182 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 0, !dbg !223 + %8183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 1, !dbg !223 + %8184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 2, !dbg !223 + %8185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 3, !dbg !223 + %8186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 4, !dbg !223 + %8187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 5, !dbg !223 + %8188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 6, !dbg !223 + %8189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 7, !dbg !223 + %8190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 8, !dbg !223 + %8191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 9, !dbg !223 + %8192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 10, !dbg !223 + %8193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 11, !dbg !223 + %8194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 12, !dbg !223 + %8195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 13, !dbg !223 + %8196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 14, !dbg !223 + %8197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 15, !dbg !223 + %8198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 16, !dbg !223 + %8199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 17, !dbg !223 + %8200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 18, !dbg !223 + %8201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 19, !dbg !223 + %8202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 20, !dbg !223 + %8203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 21, !dbg !223 + %8204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 22, !dbg !223 + %8205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 23, !dbg !223 + %8206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 24, !dbg !223 + %8207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 25, !dbg !223 + %8208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 26, !dbg !223 + %8209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 27, !dbg !223 + %8210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 28, !dbg !223 + %8211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 29, !dbg !223 + %8212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 30, !dbg !223 + %8213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 31, !dbg !223 + %8214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 32, !dbg !223 + %8215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 33, !dbg !223 + %8216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 34, !dbg !223 + %8217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 35, !dbg !223 + %8218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 36, !dbg !223 + %8219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 37, !dbg !223 + %8220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 38, !dbg !223 + %8221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 39, !dbg !223 + %8222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 40, !dbg !223 + %8223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 41, !dbg !223 + %8224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 42, !dbg !223 + %8225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 43, !dbg !223 + %8226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 44, !dbg !223 + %8227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 45, !dbg !223 + %8228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 46, !dbg !223 + %8229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 47, !dbg !223 + %8230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 48, !dbg !223 + %8231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 49, !dbg !223 + %8232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 50, !dbg !223 + %8233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 51, !dbg !223 + %8234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 52, !dbg !223 + %8235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 53, !dbg !223 + %8236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 54, !dbg !223 + %8237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 55, !dbg !223 + %8238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 56, !dbg !223 + %8239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 57, !dbg !223 + %8240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 58, !dbg !223 + %8241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 59, !dbg !223 + %8242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 60, !dbg !223 + %8243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 61, !dbg !223 + %8244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 62, !dbg !223 + %8245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 63, !dbg !223 + %8246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 64, !dbg !223 + %8247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 65, !dbg !223 + %8248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 66, !dbg !223 + %8249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 67, !dbg !223 + %8250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 68, !dbg !223 + %8251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 69, !dbg !223 + %8252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 70, !dbg !223 + %8253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 71, !dbg !223 + %8254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 72, !dbg !223 + %8255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 73, !dbg !223 + %8256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 74, !dbg !223 + %8257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 75, !dbg !223 + %8258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 76, !dbg !223 + %8259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 77, !dbg !223 + %8260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 78, !dbg !223 + %8261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 79, !dbg !223 + %8262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 80, !dbg !223 + %8263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 81, !dbg !223 + %8264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 82, !dbg !223 + %8265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 83, !dbg !223 + %8266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 84, !dbg !223 + %8267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 85, !dbg !223 + %8268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 86, !dbg !223 + %8269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 87, !dbg !223 + %8270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 88, !dbg !223 + %8271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 89, !dbg !223 + %8272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 90, !dbg !223 + %8273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 91, !dbg !223 + %8274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 92, !dbg !223 + %8275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 93, !dbg !223 + %8276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 94, !dbg !223 + %8277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 95, !dbg !223 + %8278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 96, !dbg !223 + %8279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 97, !dbg !223 + %8280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 98, !dbg !223 + %8281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 99, !dbg !223 + %8282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 100, !dbg !223 + %8283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 101, !dbg !223 + %8284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 102, !dbg !223 + %8285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 103, !dbg !223 + %8286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 104, !dbg !223 + %8287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 105, !dbg !223 + %8288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 106, !dbg !223 + %8289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 107, !dbg !223 + %8290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 108, !dbg !223 + %8291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 109, !dbg !223 + %8292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 110, !dbg !223 + %8293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 111, !dbg !223 + %8294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 112, !dbg !223 + %8295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 113, !dbg !223 + %8296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 114, !dbg !223 + %8297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 115, !dbg !223 + %8298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 116, !dbg !223 + %8299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 117, !dbg !223 + %8300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 118, !dbg !223 + %8301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 119, !dbg !223 + %8302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 120, !dbg !223 + %8303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 121, !dbg !223 + %8304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 122, !dbg !223 + %8305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 123, !dbg !223 + %8306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 124, !dbg !223 + %8307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 125, !dbg !223 + %8308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 126, !dbg !223 + %8309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8181, 127, !dbg !223 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !223 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !223 + %8310 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5243, !dbg !311 + %8311 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5244, !dbg !311 + %8312 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5245, !dbg !311 + %8313 = getelementptr bfloat, ptr addrspace(1) %5515, i64 %5246, !dbg !311 + %8314 = getelementptr bfloat, ptr addrspace(1) %8310, i64 %4701, !dbg !312 + %8315 = getelementptr bfloat, ptr addrspace(1) %8311, i64 %4701, !dbg !312 + %8316 = getelementptr bfloat, ptr addrspace(1) %8312, i64 %4701, !dbg !312 + %8317 = getelementptr bfloat, ptr addrspace(1) %8313, i64 %4701, !dbg !312 + %8318 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5247, !dbg !313 + %8319 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5248, !dbg !313 + %8320 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5249, !dbg !313 + %8321 = getelementptr bfloat, ptr addrspace(1) %5516, i64 %5250, !dbg !313 + %8322 = getelementptr bfloat, ptr addrspace(1) %8318, i64 %4701, !dbg !314 + %8323 = getelementptr bfloat, ptr addrspace(1) %8319, i64 %4701, !dbg !314 + %8324 = getelementptr bfloat, ptr addrspace(1) %8320, i64 %4701, !dbg !314 + %8325 = getelementptr bfloat, ptr addrspace(1) %8321, i64 %4701, !dbg !314 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5028, ptr addrspace(1) %8314, i32 %5260) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5031, ptr addrspace(1) %8315, i32 %5261) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5034, ptr addrspace(1) %8316, i32 %5262) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5037, ptr addrspace(1) %8317, i32 %5263) #3, !dbg !315 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !315 + %8326 = getelementptr float, ptr addrspace(1) %5517, i64 %5272, !dbg !316 + %8327 = getelementptr float, ptr addrspace(1) %5517, i64 %5273, !dbg !316 + %8328 = getelementptr float, ptr addrspace(1) %5517, i64 %5274, !dbg !316 + %8329 = getelementptr float, ptr addrspace(1) %5517, i64 %5275, !dbg !316 + %8330 = getelementptr float, ptr addrspace(1) %5517, i64 %5276, !dbg !316 + %8331 = getelementptr float, ptr addrspace(1) %5517, i64 %5277, !dbg !316 + %8332 = getelementptr float, ptr addrspace(1) %5517, i64 %5278, !dbg !316 + %8333 = getelementptr float, ptr addrspace(1) %5517, i64 %5279, !dbg !316 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5066, ptr addrspace(1) %8326, i32 %5288, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5069, ptr addrspace(1) %8327, i32 %5289, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5072, ptr addrspace(1) %8328, i32 %5290, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5075, ptr addrspace(1) %8329, i32 %5291, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5078, ptr addrspace(1) %8330, i32 %5292, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5081, ptr addrspace(1) %8331, i32 %5293, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5084, ptr addrspace(1) %8332, i32 %5294, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5087, ptr addrspace(1) %8333, i32 %5295, i1 %5064) #3, !dbg !317 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !317 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5089, ptr addrspace(1) %8322, i32 %5260) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5090, ptr addrspace(1) %8323, i32 %5261) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5091, ptr addrspace(1) %8324, i32 %5262) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5092, ptr addrspace(1) %8325, i32 %5263) #3, !dbg !318 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !318 + %8334 = getelementptr float, ptr addrspace(1) %5518, i64 %5272, !dbg !319 + %8335 = getelementptr float, ptr addrspace(1) %5518, i64 %5273, !dbg !319 + %8336 = getelementptr float, ptr addrspace(1) %5518, i64 %5274, !dbg !319 + %8337 = getelementptr float, ptr addrspace(1) %5518, i64 %5275, !dbg !319 + %8338 = getelementptr float, ptr addrspace(1) %5518, i64 %5276, !dbg !319 + %8339 = getelementptr float, ptr addrspace(1) %5518, i64 %5277, !dbg !319 + %8340 = getelementptr float, ptr addrspace(1) %5518, i64 %5278, !dbg !319 + %8341 = getelementptr float, ptr addrspace(1) %5518, i64 %5279, !dbg !319 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5093, ptr addrspace(1) %8334, i32 %5288, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5094, ptr addrspace(1) %8335, i32 %5289, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5095, ptr addrspace(1) %8336, i32 %5290, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5096, ptr addrspace(1) %8337, i32 %5291, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5097, ptr addrspace(1) %8338, i32 %5292, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5098, ptr addrspace(1) %8339, i32 %5293, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5099, ptr addrspace(1) %8340, i32 %5294, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5100, ptr addrspace(1) %8341, i32 %5295, i1 %5064) #3, !dbg !320 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !320 + %8342 = getelementptr i8, ptr addrspace(1) %8314, i64 524288, !dbg !321 + %8343 = getelementptr i8, ptr addrspace(1) %8315, i64 524288, !dbg !321 + %8344 = getelementptr i8, ptr addrspace(1) %8316, i64 524288, !dbg !321 + %8345 = getelementptr i8, ptr addrspace(1) %8317, i64 524288, !dbg !321 + %8346 = getelementptr i8, ptr addrspace(1) %8322, i64 16384, !dbg !322 + %8347 = getelementptr i8, ptr addrspace(1) %8323, i64 16384, !dbg !322 + %8348 = getelementptr i8, ptr addrspace(1) %8324, i64 16384, !dbg !322 + %8349 = getelementptr i8, ptr addrspace(1) %8325, i64 16384, !dbg !322 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5122, ptr addrspace(1) %8342, i32 %5317) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5124, ptr addrspace(1) %8343, i32 %5318) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5126, ptr addrspace(1) %8344, i32 %5319) #3, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5128, ptr addrspace(1) %8345, i32 %5320) #3, !dbg !315 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !315 + %8350 = getelementptr float, ptr addrspace(1) %5517, i64 %5329, !dbg !316 + %8351 = getelementptr float, ptr addrspace(1) %5517, i64 %5330, !dbg !316 + %8352 = getelementptr float, ptr addrspace(1) %5517, i64 %5331, !dbg !316 + %8353 = getelementptr float, ptr addrspace(1) %5517, i64 %5332, !dbg !316 + %8354 = getelementptr float, ptr addrspace(1) %5517, i64 %5333, !dbg !316 + %8355 = getelementptr float, ptr addrspace(1) %5517, i64 %5334, !dbg !316 + %8356 = getelementptr float, ptr addrspace(1) %5517, i64 %5335, !dbg !316 + %8357 = getelementptr float, ptr addrspace(1) %5517, i64 %5336, !dbg !316 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5154, ptr addrspace(1) %8350, i32 %5345, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5156, ptr addrspace(1) %8351, i32 %5346, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5158, ptr addrspace(1) %8352, i32 %5347, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5160, ptr addrspace(1) %8353, i32 %5348, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5162, ptr addrspace(1) %8354, i32 %5349, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5164, ptr addrspace(1) %8355, i32 %5350, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5166, ptr addrspace(1) %8356, i32 %5351, i1 %5064) #3, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5168, ptr addrspace(1) %8357, i32 %5352, i1 %5064) #3, !dbg !317 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !317 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %5170, ptr addrspace(1) %8346, i32 %5317) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5171, ptr addrspace(1) %8347, i32 %5318) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5172, ptr addrspace(1) %8348, i32 %5319) #3, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %5173, ptr addrspace(1) %8349, i32 %5320) #3, !dbg !318 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !318 + %8358 = getelementptr float, ptr addrspace(1) %5518, i64 %5329, !dbg !319 + %8359 = getelementptr float, ptr addrspace(1) %5518, i64 %5330, !dbg !319 + %8360 = getelementptr float, ptr addrspace(1) %5518, i64 %5331, !dbg !319 + %8361 = getelementptr float, ptr addrspace(1) %5518, i64 %5332, !dbg !319 + %8362 = getelementptr float, ptr addrspace(1) %5518, i64 %5333, !dbg !319 + %8363 = getelementptr float, ptr addrspace(1) %5518, i64 %5334, !dbg !319 + %8364 = getelementptr float, ptr addrspace(1) %5518, i64 %5335, !dbg !319 + %8365 = getelementptr float, ptr addrspace(1) %5518, i64 %5336, !dbg !319 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %5174, ptr addrspace(1) %8358, i32 %5345, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5175, ptr addrspace(1) %8359, i32 %5346, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5176, ptr addrspace(1) %8360, i32 %5347, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5177, ptr addrspace(1) %8361, i32 %5348, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5178, ptr addrspace(1) %8362, i32 %5349, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5179, ptr addrspace(1) %8363, i32 %5350, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5180, ptr addrspace(1) %8364, i32 %5351, i1 %5064) #3, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %5181, ptr addrspace(1) %8365, i32 %5352, i1 %5064) #3, !dbg !320 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !320 + br i1 %5251, label %.lr.ph1819, label %._crit_edge1820, !dbg !323 + +.lr.ph1819: ; preds = %._crit_edge1670, %__nv_exp2f.exit1284 + %8366 = phi i32 [ %10332, %__nv_exp2f.exit1284 ], [ 64, %._crit_edge1670 ] + %8367 = phi i32 [ %8384, %__nv_exp2f.exit1284 ], [ -1, %._crit_edge1670 ] + %8368 = phi i32 [ %10363, %__nv_exp2f.exit1284 ], [ 1, %._crit_edge1670 ] + %8369 = phi i32 [ %8387, %__nv_exp2f.exit1284 ], [ -1, %._crit_edge1670 ] + %8370 = phi i32 [ %10366, %__nv_exp2f.exit1284 ], [ 1, %._crit_edge1670 ] + %.pn6531817 = phi i32 [ %10352, %__nv_exp2f.exit1284 ], [ %5304, %._crit_edge1670 ] + %.pn6571816 = phi i32 [ %10351, %__nv_exp2f.exit1284 ], [ %5303, %._crit_edge1670 ] + %.pn6611815 = phi i32 [ %10350, %__nv_exp2f.exit1284 ], [ %5302, %._crit_edge1670 ] + %.pn6651814 = phi i32 [ %10349, %__nv_exp2f.exit1284 ], [ %5301, %._crit_edge1670 ] + %.pn6691813 = phi i32 [ %10348, %__nv_exp2f.exit1284 ], [ %5300, %._crit_edge1670 ] + %.pn6731812 = phi i32 [ %10347, %__nv_exp2f.exit1284 ], [ %5299, %._crit_edge1670 ] + %.pn6771811 = phi i32 [ %10346, %__nv_exp2f.exit1284 ], [ %5298, %._crit_edge1670 ] + %.pn6811810 = phi i32 [ %10345, %__nv_exp2f.exit1284 ], [ %5297, %._crit_edge1670 ] + %8371 = phi i32 [ %10357, %__nv_exp2f.exit1284 ], [ %5305, %._crit_edge1670 ] + %8372 = phi i32 [ %10358, %__nv_exp2f.exit1284 ], [ %5306, %._crit_edge1670 ] + %8373 = phi i32 [ %10359, %__nv_exp2f.exit1284 ], [ %5307, %._crit_edge1670 ] + %8374 = phi i32 [ %10360, %__nv_exp2f.exit1284 ], [ %5308, %._crit_edge1670 ] + %.pn6011809 = phi ptr addrspace(1) [ %10344, %__nv_exp2f.exit1284 ], [ %8349, %._crit_edge1670 ] + %.pn6171808 = phi ptr addrspace(1) [ %10343, %__nv_exp2f.exit1284 ], [ %8348, %._crit_edge1670 ] + %.pn6331807 = phi ptr addrspace(1) [ %10342, %__nv_exp2f.exit1284 ], [ %8347, %._crit_edge1670 ] + %.pn6491806 = phi ptr addrspace(1) [ %10341, %__nv_exp2f.exit1284 ], [ %8346, %._crit_edge1670 ] + %8375 = phi i32 [ %10353, %__nv_exp2f.exit1284 ], [ %5305, %._crit_edge1670 ] + %8376 = phi i32 [ %10354, %__nv_exp2f.exit1284 ], [ %5306, %._crit_edge1670 ] + %8377 = phi i32 [ %10355, %__nv_exp2f.exit1284 ], [ %5307, %._crit_edge1670 ] + %8378 = phi i32 [ %10356, %__nv_exp2f.exit1284 ], [ %5308, %._crit_edge1670 ] + %.pn5371805 = phi ptr addrspace(1) [ %10338, %__nv_exp2f.exit1284 ], [ %8345, %._crit_edge1670 ] + %.pn5531804 = phi ptr addrspace(1) [ %10337, %__nv_exp2f.exit1284 ], [ %8344, %._crit_edge1670 ] + %.pn5691803 = phi ptr addrspace(1) [ %10336, %__nv_exp2f.exit1284 ], [ %8343, %._crit_edge1670 ] + %.pn5851802 = phi ptr addrspace(1) [ %10335, %__nv_exp2f.exit1284 ], [ %8342, %._crit_edge1670 ] + %.pn5211801 = phi i32 [ %10312, %__nv_exp2f.exit1284 ], [ %4994, %._crit_edge1670 ] + %.pn3871800 = phi float [ %9519, %__nv_exp2f.exit1284 ], [ %8245, %._crit_edge1670 ] + %.pn3891799 = phi float [ %9518, %__nv_exp2f.exit1284 ], [ %8244, %._crit_edge1670 ] + %.pn3911798 = phi float [ %9517, %__nv_exp2f.exit1284 ], [ %8243, %._crit_edge1670 ] + %.pn3931797 = phi float [ %9516, %__nv_exp2f.exit1284 ], [ %8242, %._crit_edge1670 ] + %.pn3951796 = phi float [ %9515, %__nv_exp2f.exit1284 ], [ %8241, %._crit_edge1670 ] + %.pn3971795 = phi float [ %9514, %__nv_exp2f.exit1284 ], [ %8240, %._crit_edge1670 ] + %.pn3991794 = phi float [ %9513, %__nv_exp2f.exit1284 ], [ %8239, %._crit_edge1670 ] + %.pn4011793 = phi float [ %9512, %__nv_exp2f.exit1284 ], [ %8238, %._crit_edge1670 ] + %.pn4031792 = phi float [ %9511, %__nv_exp2f.exit1284 ], [ %8237, %._crit_edge1670 ] + %.pn4051791 = phi float [ %9510, %__nv_exp2f.exit1284 ], [ %8236, %._crit_edge1670 ] + %.pn4071790 = phi float [ %9509, %__nv_exp2f.exit1284 ], [ %8235, %._crit_edge1670 ] + %.pn4091789 = phi float [ %9508, %__nv_exp2f.exit1284 ], [ %8234, %._crit_edge1670 ] + %.pn4111788 = phi float [ %9507, %__nv_exp2f.exit1284 ], [ %8233, %._crit_edge1670 ] + %.pn4131787 = phi float [ %9506, %__nv_exp2f.exit1284 ], [ %8232, %._crit_edge1670 ] + %.pn4151786 = phi float [ %9505, %__nv_exp2f.exit1284 ], [ %8231, %._crit_edge1670 ] + %.pn4171785 = phi float [ %9504, %__nv_exp2f.exit1284 ], [ %8230, %._crit_edge1670 ] + %.pn4191784 = phi float [ %9503, %__nv_exp2f.exit1284 ], [ %8229, %._crit_edge1670 ] + %.pn4211783 = phi float [ %9502, %__nv_exp2f.exit1284 ], [ %8228, %._crit_edge1670 ] + %.pn4231782 = phi float [ %9501, %__nv_exp2f.exit1284 ], [ %8227, %._crit_edge1670 ] + %.pn4251781 = phi float [ %9500, %__nv_exp2f.exit1284 ], [ %8226, %._crit_edge1670 ] + %.pn4271780 = phi float [ %9499, %__nv_exp2f.exit1284 ], [ %8225, %._crit_edge1670 ] + %.pn4291779 = phi float [ %9498, %__nv_exp2f.exit1284 ], [ %8224, %._crit_edge1670 ] + %.pn4311778 = phi float [ %9497, %__nv_exp2f.exit1284 ], [ %8223, %._crit_edge1670 ] + %.pn4331777 = phi float [ %9496, %__nv_exp2f.exit1284 ], [ %8222, %._crit_edge1670 ] + %.pn4351776 = phi float [ %9495, %__nv_exp2f.exit1284 ], [ %8221, %._crit_edge1670 ] + %.pn4371775 = phi float [ %9494, %__nv_exp2f.exit1284 ], [ %8220, %._crit_edge1670 ] + %.pn4391774 = phi float [ %9493, %__nv_exp2f.exit1284 ], [ %8219, %._crit_edge1670 ] + %.pn4411773 = phi float [ %9492, %__nv_exp2f.exit1284 ], [ %8218, %._crit_edge1670 ] + %.pn4431772 = phi float [ %9491, %__nv_exp2f.exit1284 ], [ %8217, %._crit_edge1670 ] + %.pn4451771 = phi float [ %9490, %__nv_exp2f.exit1284 ], [ %8216, %._crit_edge1670 ] + %.pn4471770 = phi float [ %9489, %__nv_exp2f.exit1284 ], [ %8215, %._crit_edge1670 ] + %.pn4491769 = phi float [ %9488, %__nv_exp2f.exit1284 ], [ %8214, %._crit_edge1670 ] + %.pn4511768 = phi float [ %9487, %__nv_exp2f.exit1284 ], [ %8213, %._crit_edge1670 ] + %.pn4531767 = phi float [ %9486, %__nv_exp2f.exit1284 ], [ %8212, %._crit_edge1670 ] + %.pn4551766 = phi float [ %9485, %__nv_exp2f.exit1284 ], [ %8211, %._crit_edge1670 ] + %.pn4571765 = phi float [ %9484, %__nv_exp2f.exit1284 ], [ %8210, %._crit_edge1670 ] + %.pn4591764 = phi float [ %9483, %__nv_exp2f.exit1284 ], [ %8209, %._crit_edge1670 ] + %.pn4611763 = phi float [ %9482, %__nv_exp2f.exit1284 ], [ %8208, %._crit_edge1670 ] + %.pn4631762 = phi float [ %9481, %__nv_exp2f.exit1284 ], [ %8207, %._crit_edge1670 ] + %.pn4651761 = phi float [ %9480, %__nv_exp2f.exit1284 ], [ %8206, %._crit_edge1670 ] + %.pn4671760 = phi float [ %9479, %__nv_exp2f.exit1284 ], [ %8205, %._crit_edge1670 ] + %.pn4691759 = phi float [ %9478, %__nv_exp2f.exit1284 ], [ %8204, %._crit_edge1670 ] + %.pn4711758 = phi float [ %9477, %__nv_exp2f.exit1284 ], [ %8203, %._crit_edge1670 ] + %.pn4731757 = phi float [ %9476, %__nv_exp2f.exit1284 ], [ %8202, %._crit_edge1670 ] + %.pn4751756 = phi float [ %9475, %__nv_exp2f.exit1284 ], [ %8201, %._crit_edge1670 ] + %.pn4771755 = phi float [ %9474, %__nv_exp2f.exit1284 ], [ %8200, %._crit_edge1670 ] + %.pn4791754 = phi float [ %9473, %__nv_exp2f.exit1284 ], [ %8199, %._crit_edge1670 ] + %.pn4811753 = phi float [ %9472, %__nv_exp2f.exit1284 ], [ %8198, %._crit_edge1670 ] + %.pn4831752 = phi float [ %9471, %__nv_exp2f.exit1284 ], [ %8197, %._crit_edge1670 ] + %.pn4851751 = phi float [ %9470, %__nv_exp2f.exit1284 ], [ %8196, %._crit_edge1670 ] + %.pn4871750 = phi float [ %9469, %__nv_exp2f.exit1284 ], [ %8195, %._crit_edge1670 ] + %.pn4891749 = phi float [ %9468, %__nv_exp2f.exit1284 ], [ %8194, %._crit_edge1670 ] + %.pn4911748 = phi float [ %9467, %__nv_exp2f.exit1284 ], [ %8193, %._crit_edge1670 ] + %.pn4931747 = phi float [ %9466, %__nv_exp2f.exit1284 ], [ %8192, %._crit_edge1670 ] + %.pn4951746 = phi float [ %9465, %__nv_exp2f.exit1284 ], [ %8191, %._crit_edge1670 ] + %.pn4971745 = phi float [ %9464, %__nv_exp2f.exit1284 ], [ %8190, %._crit_edge1670 ] + %.pn4991744 = phi float [ %9463, %__nv_exp2f.exit1284 ], [ %8189, %._crit_edge1670 ] + %.pn5011743 = phi float [ %9462, %__nv_exp2f.exit1284 ], [ %8188, %._crit_edge1670 ] + %.pn5031742 = phi float [ %9461, %__nv_exp2f.exit1284 ], [ %8187, %._crit_edge1670 ] + %.pn5051741 = phi float [ %9460, %__nv_exp2f.exit1284 ], [ %8186, %._crit_edge1670 ] + %.pn5071740 = phi float [ %9459, %__nv_exp2f.exit1284 ], [ %8185, %._crit_edge1670 ] + %.pn5091739 = phi float [ %9458, %__nv_exp2f.exit1284 ], [ %8184, %._crit_edge1670 ] + %.pn5111738 = phi float [ %9457, %__nv_exp2f.exit1284 ], [ %8183, %._crit_edge1670 ] + %.pn5131737 = phi float [ %9456, %__nv_exp2f.exit1284 ], [ %8182, %._crit_edge1670 ] + %.pn2591736 = phi float [ %10311, %__nv_exp2f.exit1284 ], [ %8309, %._crit_edge1670 ] + %.pn2611735 = phi float [ %10310, %__nv_exp2f.exit1284 ], [ %8308, %._crit_edge1670 ] + %.pn2631734 = phi float [ %10309, %__nv_exp2f.exit1284 ], [ %8307, %._crit_edge1670 ] + %.pn2651733 = phi float [ %10308, %__nv_exp2f.exit1284 ], [ %8306, %._crit_edge1670 ] + %.pn2671732 = phi float [ %10307, %__nv_exp2f.exit1284 ], [ %8305, %._crit_edge1670 ] + %.pn2691731 = phi float [ %10306, %__nv_exp2f.exit1284 ], [ %8304, %._crit_edge1670 ] + %.pn2711730 = phi float [ %10305, %__nv_exp2f.exit1284 ], [ %8303, %._crit_edge1670 ] + %.pn2731729 = phi float [ %10304, %__nv_exp2f.exit1284 ], [ %8302, %._crit_edge1670 ] + %.pn2751728 = phi float [ %10303, %__nv_exp2f.exit1284 ], [ %8301, %._crit_edge1670 ] + %.pn2771727 = phi float [ %10302, %__nv_exp2f.exit1284 ], [ %8300, %._crit_edge1670 ] + %.pn2791726 = phi float [ %10301, %__nv_exp2f.exit1284 ], [ %8299, %._crit_edge1670 ] + %.pn2811725 = phi float [ %10300, %__nv_exp2f.exit1284 ], [ %8298, %._crit_edge1670 ] + %.pn2831724 = phi float [ %10299, %__nv_exp2f.exit1284 ], [ %8297, %._crit_edge1670 ] + %.pn2851723 = phi float [ %10298, %__nv_exp2f.exit1284 ], [ %8296, %._crit_edge1670 ] + %.pn2871722 = phi float [ %10297, %__nv_exp2f.exit1284 ], [ %8295, %._crit_edge1670 ] + %.pn2891721 = phi float [ %10296, %__nv_exp2f.exit1284 ], [ %8294, %._crit_edge1670 ] + %.pn2911720 = phi float [ %10295, %__nv_exp2f.exit1284 ], [ %8293, %._crit_edge1670 ] + %.pn2931719 = phi float [ %10294, %__nv_exp2f.exit1284 ], [ %8292, %._crit_edge1670 ] + %.pn2951718 = phi float [ %10293, %__nv_exp2f.exit1284 ], [ %8291, %._crit_edge1670 ] + %.pn2971717 = phi float [ %10292, %__nv_exp2f.exit1284 ], [ %8290, %._crit_edge1670 ] + %.pn2991716 = phi float [ %10291, %__nv_exp2f.exit1284 ], [ %8289, %._crit_edge1670 ] + %.pn3011715 = phi float [ %10290, %__nv_exp2f.exit1284 ], [ %8288, %._crit_edge1670 ] + %.pn3031714 = phi float [ %10289, %__nv_exp2f.exit1284 ], [ %8287, %._crit_edge1670 ] + %.pn3051713 = phi float [ %10288, %__nv_exp2f.exit1284 ], [ %8286, %._crit_edge1670 ] + %.pn3071712 = phi float [ %10287, %__nv_exp2f.exit1284 ], [ %8285, %._crit_edge1670 ] + %.pn3091711 = phi float [ %10286, %__nv_exp2f.exit1284 ], [ %8284, %._crit_edge1670 ] + %.pn3111710 = phi float [ %10285, %__nv_exp2f.exit1284 ], [ %8283, %._crit_edge1670 ] + %.pn3131709 = phi float [ %10284, %__nv_exp2f.exit1284 ], [ %8282, %._crit_edge1670 ] + %.pn3151708 = phi float [ %10283, %__nv_exp2f.exit1284 ], [ %8281, %._crit_edge1670 ] + %.pn3171707 = phi float [ %10282, %__nv_exp2f.exit1284 ], [ %8280, %._crit_edge1670 ] + %.pn3191706 = phi float [ %10281, %__nv_exp2f.exit1284 ], [ %8279, %._crit_edge1670 ] + %.pn3211705 = phi float [ %10280, %__nv_exp2f.exit1284 ], [ %8278, %._crit_edge1670 ] + %.pn3231704 = phi float [ %10279, %__nv_exp2f.exit1284 ], [ %8277, %._crit_edge1670 ] + %.pn3251703 = phi float [ %10278, %__nv_exp2f.exit1284 ], [ %8276, %._crit_edge1670 ] + %.pn3271702 = phi float [ %10277, %__nv_exp2f.exit1284 ], [ %8275, %._crit_edge1670 ] + %.pn3291701 = phi float [ %10276, %__nv_exp2f.exit1284 ], [ %8274, %._crit_edge1670 ] + %.pn3311700 = phi float [ %10275, %__nv_exp2f.exit1284 ], [ %8273, %._crit_edge1670 ] + %.pn3331699 = phi float [ %10274, %__nv_exp2f.exit1284 ], [ %8272, %._crit_edge1670 ] + %.pn3351698 = phi float [ %10273, %__nv_exp2f.exit1284 ], [ %8271, %._crit_edge1670 ] + %.pn3371697 = phi float [ %10272, %__nv_exp2f.exit1284 ], [ %8270, %._crit_edge1670 ] + %.pn3391696 = phi float [ %10271, %__nv_exp2f.exit1284 ], [ %8269, %._crit_edge1670 ] + %.pn3411695 = phi float [ %10270, %__nv_exp2f.exit1284 ], [ %8268, %._crit_edge1670 ] + %.pn3431694 = phi float [ %10269, %__nv_exp2f.exit1284 ], [ %8267, %._crit_edge1670 ] + %.pn3451693 = phi float [ %10268, %__nv_exp2f.exit1284 ], [ %8266, %._crit_edge1670 ] + %.pn3471692 = phi float [ %10267, %__nv_exp2f.exit1284 ], [ %8265, %._crit_edge1670 ] + %.pn3491691 = phi float [ %10266, %__nv_exp2f.exit1284 ], [ %8264, %._crit_edge1670 ] + %.pn3511690 = phi float [ %10265, %__nv_exp2f.exit1284 ], [ %8263, %._crit_edge1670 ] + %.pn3531689 = phi float [ %10264, %__nv_exp2f.exit1284 ], [ %8262, %._crit_edge1670 ] + %.pn3551688 = phi float [ %10263, %__nv_exp2f.exit1284 ], [ %8261, %._crit_edge1670 ] + %.pn3571687 = phi float [ %10262, %__nv_exp2f.exit1284 ], [ %8260, %._crit_edge1670 ] + %.pn3591686 = phi float [ %10261, %__nv_exp2f.exit1284 ], [ %8259, %._crit_edge1670 ] + %.pn3611685 = phi float [ %10260, %__nv_exp2f.exit1284 ], [ %8258, %._crit_edge1670 ] + %.pn3631684 = phi float [ %10259, %__nv_exp2f.exit1284 ], [ %8257, %._crit_edge1670 ] + %.pn3651683 = phi float [ %10258, %__nv_exp2f.exit1284 ], [ %8256, %._crit_edge1670 ] + %.pn3671682 = phi float [ %10257, %__nv_exp2f.exit1284 ], [ %8255, %._crit_edge1670 ] + %.pn3691681 = phi float [ %10256, %__nv_exp2f.exit1284 ], [ %8254, %._crit_edge1670 ] + %.pn3711680 = phi float [ %10255, %__nv_exp2f.exit1284 ], [ %8253, %._crit_edge1670 ] + %.pn3731679 = phi float [ %10254, %__nv_exp2f.exit1284 ], [ %8252, %._crit_edge1670 ] + %.pn3751678 = phi float [ %10253, %__nv_exp2f.exit1284 ], [ %8251, %._crit_edge1670 ] + %.pn3771677 = phi float [ %10252, %__nv_exp2f.exit1284 ], [ %8250, %._crit_edge1670 ] + %.pn3791676 = phi float [ %10251, %__nv_exp2f.exit1284 ], [ %8249, %._crit_edge1670 ] + %.pn3811675 = phi float [ %10250, %__nv_exp2f.exit1284 ], [ %8248, %._crit_edge1670 ] + %.pn3831674 = phi float [ %10249, %__nv_exp2f.exit1284 ], [ %8247, %._crit_edge1670 ] + %.pn3851673 = phi float [ %10248, %__nv_exp2f.exit1284 ], [ %8246, %._crit_edge1670 ] + %8379 = phi i32 [ %10313, %__nv_exp2f.exit1284 ], [ 0, %._crit_edge1670 ] + %8380 = icmp slt i32 %8379, %5353, !dbg !323 + %8381 = icmp slt i32 %8379, %5354, !dbg !323 + %8382 = add i32 %8367, 1, !dbg !323 + %8383 = icmp sgt i32 %8382, 1, !dbg !323 + %8384 = select i1 %8383, i32 0, i32 %8382, !dbg !323 + %8385 = add i32 %8369, 1, !dbg !323 + %8386 = icmp sgt i32 %8385, 2, !dbg !323 + %8387 = select i1 %8386, i32 0, i32 %8385, !dbg !323 + %8388 = icmp slt i32 %.pn5211801, 2048, !dbg !324 + %8389 = zext i1 %8388 to i8, !dbg !315 + %8390 = insertelement <2 x i8> poison, i8 %8389, i64 0, !dbg !315 + %8391 = shufflevector <2 x i8> %8390, <2 x i8> poison, <2 x i32> zeroinitializer, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5193, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5194, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5196, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5197, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5199, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5200, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5202, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5203, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5205, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5206, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5208, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5209, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5211, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5212, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5214, align 2, !dbg !315 + store <2 x i8> %8391, ptr addrspace(3) %5215, align 2, !dbg !315 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !315 + %8392 = load <32 x i1>, ptr addrspace(3) %5228, align 4, !dbg !315 + %8393 = load <32 x i1>, ptr addrspace(3) %5230, align 4, !dbg !315 + %8394 = load <32 x i1>, ptr addrspace(3) %5232, align 4, !dbg !315 + %8395 = load <32 x i1>, ptr addrspace(3) %5234, align 4, !dbg !315 + %8396 = load <32 x i1>, ptr addrspace(3) %5236, align 4, !dbg !315 + %8397 = load <32 x i1>, ptr addrspace(3) %5238, align 4, !dbg !315 + %8398 = load <32 x i1>, ptr addrspace(3) %5240, align 4, !dbg !315 + %8399 = load <32 x i1>, ptr addrspace(3) %5242, align 4, !dbg !315 + %8400 = extractelement <32 x i1> %8392, i64 0, !dbg !315 + %8401 = extractelement <32 x i1> %8392, i64 8, !dbg !315 + %8402 = extractelement <32 x i1> %8392, i64 16, !dbg !315 + %8403 = extractelement <32 x i1> %8392, i64 24, !dbg !315 + %8404 = extractelement <32 x i1> %8393, i64 0, !dbg !315 + %8405 = extractelement <32 x i1> %8393, i64 8, !dbg !315 + %8406 = extractelement <32 x i1> %8393, i64 16, !dbg !315 + %8407 = extractelement <32 x i1> %8393, i64 24, !dbg !315 + %8408 = extractelement <32 x i1> %8394, i64 0, !dbg !315 + %8409 = extractelement <32 x i1> %8394, i64 8, !dbg !315 + %8410 = extractelement <32 x i1> %8394, i64 16, !dbg !315 + %8411 = extractelement <32 x i1> %8394, i64 24, !dbg !315 + %8412 = extractelement <32 x i1> %8395, i64 0, !dbg !315 + %8413 = extractelement <32 x i1> %8395, i64 8, !dbg !315 + %8414 = extractelement <32 x i1> %8395, i64 16, !dbg !315 + %8415 = extractelement <32 x i1> %8395, i64 24, !dbg !315 + %8416 = extractelement <32 x i1> %8396, i64 0, !dbg !315 + %8417 = extractelement <32 x i1> %8396, i64 8, !dbg !315 + %8418 = extractelement <32 x i1> %8396, i64 16, !dbg !315 + %8419 = extractelement <32 x i1> %8396, i64 24, !dbg !315 + %8420 = extractelement <32 x i1> %8397, i64 0, !dbg !315 + %8421 = extractelement <32 x i1> %8397, i64 8, !dbg !315 + %8422 = extractelement <32 x i1> %8397, i64 16, !dbg !315 + %8423 = extractelement <32 x i1> %8397, i64 24, !dbg !315 + %8424 = extractelement <32 x i1> %8398, i64 0, !dbg !315 + %8425 = extractelement <32 x i1> %8398, i64 8, !dbg !315 + %8426 = extractelement <32 x i1> %8398, i64 16, !dbg !315 + %8427 = extractelement <32 x i1> %8398, i64 24, !dbg !315 + %8428 = extractelement <32 x i1> %8399, i64 0, !dbg !315 + %8429 = extractelement <32 x i1> %8399, i64 8, !dbg !315 + %8430 = extractelement <32 x i1> %8399, i64 16, !dbg !315 + %8431 = extractelement <32 x i1> %8399, i64 24, !dbg !315 + tail call void @llvm.nvvm.cp.async.wait.group(i32 4), !dbg !315 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !315 + %8432 = shl i32 %8387, 13, !dbg !315 + %8433 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %8432, !dbg !315 + %8434 = shl i32 %8384, 6, !dbg !317 + %8435 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %8434, !dbg !317 + %8436 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5065, !dbg !317 + %8437 = load float, ptr addrspace(3) %8436, align 8, !dbg !317 + %8438 = getelementptr inbounds nuw i8, ptr addrspace(3) %8436, i32 4, !dbg !317 + %8439 = load float, ptr addrspace(3) %8438, align 4, !dbg !317 + %8440 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5068, !dbg !317 + %8441 = load float, ptr addrspace(3) %8440, align 8, !dbg !317 + %8442 = getelementptr inbounds nuw i8, ptr addrspace(3) %8440, i32 4, !dbg !317 + %8443 = load float, ptr addrspace(3) %8442, align 4, !dbg !317 + %8444 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5071, !dbg !317 + %8445 = load float, ptr addrspace(3) %8444, align 8, !dbg !317 + %8446 = getelementptr inbounds nuw i8, ptr addrspace(3) %8444, i32 4, !dbg !317 + %8447 = load float, ptr addrspace(3) %8446, align 4, !dbg !317 + %8448 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5074, !dbg !317 + %8449 = load float, ptr addrspace(3) %8448, align 8, !dbg !317 + %8450 = getelementptr inbounds nuw i8, ptr addrspace(3) %8448, i32 4, !dbg !317 + %8451 = load float, ptr addrspace(3) %8450, align 4, !dbg !317 + %8452 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5077, !dbg !317 + %8453 = load float, ptr addrspace(3) %8452, align 8, !dbg !317 + %8454 = getelementptr inbounds nuw i8, ptr addrspace(3) %8452, i32 4, !dbg !317 + %8455 = load float, ptr addrspace(3) %8454, align 4, !dbg !317 + %8456 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5080, !dbg !317 + %8457 = load float, ptr addrspace(3) %8456, align 8, !dbg !317 + %8458 = getelementptr inbounds nuw i8, ptr addrspace(3) %8456, i32 4, !dbg !317 + %8459 = load float, ptr addrspace(3) %8458, align 4, !dbg !317 + %8460 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5083, !dbg !317 + %8461 = load float, ptr addrspace(3) %8460, align 8, !dbg !317 + %8462 = getelementptr inbounds nuw i8, ptr addrspace(3) %8460, i32 4, !dbg !317 + %8463 = load float, ptr addrspace(3) %8462, align 4, !dbg !317 + %8464 = getelementptr inbounds nuw i8, ptr addrspace(3) %8435, i32 %5086, !dbg !317 + %8465 = load float, ptr addrspace(3) %8464, align 8, !dbg !317 + %8466 = getelementptr inbounds nuw i8, ptr addrspace(3) %8464, i32 4, !dbg !317 + %8467 = load float, ptr addrspace(3) %8466, align 4, !dbg !317 + %8468 = fcmp oeq float %8437, 0xFFF0000000000000, !dbg !325 + %8469 = fcmp oeq float %8439, 0xFFF0000000000000, !dbg !325 + %8470 = fcmp oeq float %8441, 0xFFF0000000000000, !dbg !325 + %8471 = fcmp oeq float %8443, 0xFFF0000000000000, !dbg !325 + %8472 = fcmp oeq float %8445, 0xFFF0000000000000, !dbg !325 + %8473 = fcmp oeq float %8447, 0xFFF0000000000000, !dbg !325 + %8474 = fcmp oeq float %8449, 0xFFF0000000000000, !dbg !325 + %8475 = fcmp oeq float %8451, 0xFFF0000000000000, !dbg !325 + %8476 = fcmp oeq float %8453, 0xFFF0000000000000, !dbg !325 + %8477 = fcmp oeq float %8455, 0xFFF0000000000000, !dbg !325 + %8478 = fcmp oeq float %8457, 0xFFF0000000000000, !dbg !325 + %8479 = fcmp oeq float %8459, 0xFFF0000000000000, !dbg !325 + %8480 = fcmp oeq float %8461, 0xFFF0000000000000, !dbg !325 + %8481 = fcmp oeq float %8463, 0xFFF0000000000000, !dbg !325 + %8482 = fcmp oeq float %8465, 0xFFF0000000000000, !dbg !325 + %8483 = fcmp oeq float %8467, 0xFFF0000000000000, !dbg !325 + %8484 = select i1 %8468, float 0.000000e+00, float %8437, !dbg !326 + %8485 = select i1 %8469, float 0.000000e+00, float %8439, !dbg !326 + %8486 = select i1 %8470, float 0.000000e+00, float %8441, !dbg !326 + %8487 = select i1 %8471, float 0.000000e+00, float %8443, !dbg !326 + %8488 = select i1 %8472, float 0.000000e+00, float %8445, !dbg !326 + %8489 = select i1 %8473, float 0.000000e+00, float %8447, !dbg !326 + %8490 = select i1 %8474, float 0.000000e+00, float %8449, !dbg !326 + %8491 = select i1 %8475, float 0.000000e+00, float %8451, !dbg !326 + %8492 = select i1 %8476, float 0.000000e+00, float %8453, !dbg !326 + %8493 = select i1 %8477, float 0.000000e+00, float %8455, !dbg !326 + %8494 = select i1 %8478, float 0.000000e+00, float %8457, !dbg !326 + %8495 = select i1 %8479, float 0.000000e+00, float %8459, !dbg !326 + %8496 = select i1 %8480, float 0.000000e+00, float %8461, !dbg !326 + %8497 = select i1 %8481, float 0.000000e+00, float %8463, !dbg !326 + %8498 = select i1 %8482, float 0.000000e+00, float %8465, !dbg !326 + %8499 = select i1 %8483, float 0.000000e+00, float %8467, !dbg !326 + %8500 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %44, i32 0, i32 31), !dbg !327 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !327 + %8501 = shl i32 %8500, 11, !dbg !327 + %8502 = and i32 %8501, 8192, !dbg !327 + %8503 = add i32 %8502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8504 = lshr exact i32 %8503, 4, !dbg !327 + %8505 = and i32 %8504, 16383, !dbg !327 + %8506 = zext nneg i32 %8505 to i64, !dbg !327 + %8507 = or disjoint i64 %8506, 4611686293372403712, !dbg !327 + %8508 = ptrtoint ptr addrspace(3) %8433 to i32, !dbg !327 + %8509 = lshr exact i32 %8508, 4, !dbg !327 + %8510 = and i32 %8509, 16383, !dbg !327 + %8511 = zext nneg i32 %8510 to i64, !dbg !327 + %8512 = or disjoint i64 %8511, 4611686293338849280, !dbg !327 + %8513 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %8507, i64 %8512) #3, !dbg !327 + %8514 = or disjoint i32 %8502, 32, !dbg !327 + %8515 = add i32 %8514, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8516 = lshr exact i32 %8515, 4, !dbg !327 + %8517 = and i32 %8516, 16383, !dbg !327 + %8518 = zext nneg i32 %8517 to i64, !dbg !327 + %8519 = or disjoint i64 %8518, 4611686293372403712, !dbg !327 + %8520 = add i32 %8508, 32, !dbg !327 + %8521 = lshr exact i32 %8520, 4, !dbg !327 + %8522 = and i32 %8521, 16383, !dbg !327 + %8523 = zext nneg i32 %8522 to i64, !dbg !327 + %8524 = or disjoint i64 %8523, 4611686293338849280, !dbg !327 + %8525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 0, !dbg !327 + %8526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 1, !dbg !327 + %8527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 2, !dbg !327 + %8528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 3, !dbg !327 + %8529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 4, !dbg !327 + %8530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 5, !dbg !327 + %8531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 6, !dbg !327 + %8532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 7, !dbg !327 + %8533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 8, !dbg !327 + %8534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 9, !dbg !327 + %8535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 10, !dbg !327 + %8536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 11, !dbg !327 + %8537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 12, !dbg !327 + %8538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 13, !dbg !327 + %8539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 14, !dbg !327 + %8540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 15, !dbg !327 + %8541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 16, !dbg !327 + %8542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 17, !dbg !327 + %8543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 18, !dbg !327 + %8544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 19, !dbg !327 + %8545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 20, !dbg !327 + %8546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 21, !dbg !327 + %8547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 22, !dbg !327 + %8548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 23, !dbg !327 + %8549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 24, !dbg !327 + %8550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 25, !dbg !327 + %8551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 26, !dbg !327 + %8552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 27, !dbg !327 + %8553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 28, !dbg !327 + %8554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 29, !dbg !327 + %8555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 30, !dbg !327 + %8556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8513, 31, !dbg !327 + %8557 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8525, float %8526, float %8527, float %8528, float %8529, float %8530, float %8531, float %8532, float %8533, float %8534, float %8535, float %8536, float %8537, float %8538, float %8539, float %8540, float %8541, float %8542, float %8543, float %8544, float %8545, float %8546, float %8547, float %8548, float %8549, float %8550, float %8551, float %8552, float %8553, float %8554, float %8555, float %8556, i64 %8519, i64 %8524, i1 true) #3, !dbg !327 + %8558 = or disjoint i32 %8502, 64, !dbg !327 + %8559 = add i32 %8558, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8560 = lshr exact i32 %8559, 4, !dbg !327 + %8561 = and i32 %8560, 16383, !dbg !327 + %8562 = zext nneg i32 %8561 to i64, !dbg !327 + %8563 = or disjoint i64 %8562, 4611686293372403712, !dbg !327 + %8564 = add i32 %8508, 64, !dbg !327 + %8565 = lshr exact i32 %8564, 4, !dbg !327 + %8566 = and i32 %8565, 16383, !dbg !327 + %8567 = zext nneg i32 %8566 to i64, !dbg !327 + %8568 = or disjoint i64 %8567, 4611686293338849280, !dbg !327 + %8569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 0, !dbg !327 + %8570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 1, !dbg !327 + %8571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 2, !dbg !327 + %8572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 3, !dbg !327 + %8573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 4, !dbg !327 + %8574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 5, !dbg !327 + %8575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 6, !dbg !327 + %8576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 7, !dbg !327 + %8577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 8, !dbg !327 + %8578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 9, !dbg !327 + %8579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 10, !dbg !327 + %8580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 11, !dbg !327 + %8581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 12, !dbg !327 + %8582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 13, !dbg !327 + %8583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 14, !dbg !327 + %8584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 15, !dbg !327 + %8585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 16, !dbg !327 + %8586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 17, !dbg !327 + %8587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 18, !dbg !327 + %8588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 19, !dbg !327 + %8589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 20, !dbg !327 + %8590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 21, !dbg !327 + %8591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 22, !dbg !327 + %8592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 23, !dbg !327 + %8593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 24, !dbg !327 + %8594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 25, !dbg !327 + %8595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 26, !dbg !327 + %8596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 27, !dbg !327 + %8597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 28, !dbg !327 + %8598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 29, !dbg !327 + %8599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 30, !dbg !327 + %8600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8557, 31, !dbg !327 + %8601 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8569, float %8570, float %8571, float %8572, float %8573, float %8574, float %8575, float %8576, float %8577, float %8578, float %8579, float %8580, float %8581, float %8582, float %8583, float %8584, float %8585, float %8586, float %8587, float %8588, float %8589, float %8590, float %8591, float %8592, float %8593, float %8594, float %8595, float %8596, float %8597, float %8598, float %8599, float %8600, i64 %8563, i64 %8568, i1 true) #3, !dbg !327 + %8602 = or disjoint i32 %8502, 96, !dbg !327 + %8603 = add i32 %8602, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8604 = lshr exact i32 %8603, 4, !dbg !327 + %8605 = and i32 %8604, 16383, !dbg !327 + %8606 = zext nneg i32 %8605 to i64, !dbg !327 + %8607 = or disjoint i64 %8606, 4611686293372403712, !dbg !327 + %8608 = add i32 %8508, 96, !dbg !327 + %8609 = lshr exact i32 %8608, 4, !dbg !327 + %8610 = and i32 %8609, 16383, !dbg !327 + %8611 = zext nneg i32 %8610 to i64, !dbg !327 + %8612 = or disjoint i64 %8611, 4611686293338849280, !dbg !327 + %8613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 0, !dbg !327 + %8614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 1, !dbg !327 + %8615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 2, !dbg !327 + %8616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 3, !dbg !327 + %8617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 4, !dbg !327 + %8618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 5, !dbg !327 + %8619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 6, !dbg !327 + %8620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 7, !dbg !327 + %8621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 8, !dbg !327 + %8622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 9, !dbg !327 + %8623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 10, !dbg !327 + %8624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 11, !dbg !327 + %8625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 12, !dbg !327 + %8626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 13, !dbg !327 + %8627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 14, !dbg !327 + %8628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 15, !dbg !327 + %8629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 16, !dbg !327 + %8630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 17, !dbg !327 + %8631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 18, !dbg !327 + %8632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 19, !dbg !327 + %8633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 20, !dbg !327 + %8634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 21, !dbg !327 + %8635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 22, !dbg !327 + %8636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 23, !dbg !327 + %8637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 24, !dbg !327 + %8638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 25, !dbg !327 + %8639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 26, !dbg !327 + %8640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 27, !dbg !327 + %8641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 28, !dbg !327 + %8642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 29, !dbg !327 + %8643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 30, !dbg !327 + %8644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8601, 31, !dbg !327 + %8645 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8613, float %8614, float %8615, float %8616, float %8617, float %8618, float %8619, float %8620, float %8621, float %8622, float %8623, float %8624, float %8625, float %8626, float %8627, float %8628, float %8629, float %8630, float %8631, float %8632, float %8633, float %8634, float %8635, float %8636, float %8637, float %8638, float %8639, float %8640, float %8641, float %8642, float %8643, float %8644, i64 %8607, i64 %8612, i1 true) #3, !dbg !327 + %8646 = or disjoint i32 %8502, 16384, !dbg !327 + %8647 = add i32 %8646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8648 = lshr exact i32 %8647, 4, !dbg !327 + %8649 = and i32 %8648, 16383, !dbg !327 + %8650 = zext nneg i32 %8649 to i64, !dbg !327 + %8651 = or disjoint i64 %8650, 4611686293372403712, !dbg !327 + %8652 = add i32 %8508, 8192, !dbg !327 + %8653 = lshr exact i32 %8652, 4, !dbg !327 + %8654 = and i32 %8653, 16383, !dbg !327 + %8655 = zext nneg i32 %8654 to i64, !dbg !327 + %8656 = or disjoint i64 %8655, 4611686293338849280, !dbg !327 + %8657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 0, !dbg !327 + %8658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 1, !dbg !327 + %8659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 2, !dbg !327 + %8660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 3, !dbg !327 + %8661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 4, !dbg !327 + %8662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 5, !dbg !327 + %8663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 6, !dbg !327 + %8664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 7, !dbg !327 + %8665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 8, !dbg !327 + %8666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 9, !dbg !327 + %8667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 10, !dbg !327 + %8668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 11, !dbg !327 + %8669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 12, !dbg !327 + %8670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 13, !dbg !327 + %8671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 14, !dbg !327 + %8672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 15, !dbg !327 + %8673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 16, !dbg !327 + %8674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 17, !dbg !327 + %8675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 18, !dbg !327 + %8676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 19, !dbg !327 + %8677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 20, !dbg !327 + %8678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 21, !dbg !327 + %8679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 22, !dbg !327 + %8680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 23, !dbg !327 + %8681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 24, !dbg !327 + %8682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 25, !dbg !327 + %8683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 26, !dbg !327 + %8684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 27, !dbg !327 + %8685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 28, !dbg !327 + %8686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 29, !dbg !327 + %8687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 30, !dbg !327 + %8688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8645, 31, !dbg !327 + %8689 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8657, float %8658, float %8659, float %8660, float %8661, float %8662, float %8663, float %8664, float %8665, float %8666, float %8667, float %8668, float %8669, float %8670, float %8671, float %8672, float %8673, float %8674, float %8675, float %8676, float %8677, float %8678, float %8679, float %8680, float %8681, float %8682, float %8683, float %8684, float %8685, float %8686, float %8687, float %8688, i64 %8651, i64 %8656, i1 true) #3, !dbg !327 + %8690 = or disjoint i32 %8502, 16416, !dbg !327 + %8691 = add i32 %8690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8692 = lshr exact i32 %8691, 4, !dbg !327 + %8693 = and i32 %8692, 16383, !dbg !327 + %8694 = zext nneg i32 %8693 to i64, !dbg !327 + %8695 = or disjoint i64 %8694, 4611686293372403712, !dbg !327 + %8696 = add i32 %8508, 8224, !dbg !327 + %8697 = lshr exact i32 %8696, 4, !dbg !327 + %8698 = and i32 %8697, 16383, !dbg !327 + %8699 = zext nneg i32 %8698 to i64, !dbg !327 + %8700 = or disjoint i64 %8699, 4611686293338849280, !dbg !327 + %8701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 0, !dbg !327 + %8702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 1, !dbg !327 + %8703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 2, !dbg !327 + %8704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 3, !dbg !327 + %8705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 4, !dbg !327 + %8706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 5, !dbg !327 + %8707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 6, !dbg !327 + %8708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 7, !dbg !327 + %8709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 8, !dbg !327 + %8710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 9, !dbg !327 + %8711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 10, !dbg !327 + %8712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 11, !dbg !327 + %8713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 12, !dbg !327 + %8714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 13, !dbg !327 + %8715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 14, !dbg !327 + %8716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 15, !dbg !327 + %8717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 16, !dbg !327 + %8718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 17, !dbg !327 + %8719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 18, !dbg !327 + %8720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 19, !dbg !327 + %8721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 20, !dbg !327 + %8722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 21, !dbg !327 + %8723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 22, !dbg !327 + %8724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 23, !dbg !327 + %8725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 24, !dbg !327 + %8726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 25, !dbg !327 + %8727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 26, !dbg !327 + %8728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 27, !dbg !327 + %8729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 28, !dbg !327 + %8730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 29, !dbg !327 + %8731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 30, !dbg !327 + %8732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8689, 31, !dbg !327 + %8733 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8701, float %8702, float %8703, float %8704, float %8705, float %8706, float %8707, float %8708, float %8709, float %8710, float %8711, float %8712, float %8713, float %8714, float %8715, float %8716, float %8717, float %8718, float %8719, float %8720, float %8721, float %8722, float %8723, float %8724, float %8725, float %8726, float %8727, float %8728, float %8729, float %8730, float %8731, float %8732, i64 %8695, i64 %8700, i1 true) #3, !dbg !327 + %8734 = or disjoint i32 %8502, 16448, !dbg !327 + %8735 = add i32 %8734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8736 = lshr exact i32 %8735, 4, !dbg !327 + %8737 = and i32 %8736, 16383, !dbg !327 + %8738 = zext nneg i32 %8737 to i64, !dbg !327 + %8739 = or disjoint i64 %8738, 4611686293372403712, !dbg !327 + %8740 = add i32 %8508, 8256, !dbg !327 + %8741 = lshr exact i32 %8740, 4, !dbg !327 + %8742 = and i32 %8741, 16383, !dbg !327 + %8743 = zext nneg i32 %8742 to i64, !dbg !327 + %8744 = or disjoint i64 %8743, 4611686293338849280, !dbg !327 + %8745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 0, !dbg !327 + %8746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 1, !dbg !327 + %8747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 2, !dbg !327 + %8748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 3, !dbg !327 + %8749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 4, !dbg !327 + %8750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 5, !dbg !327 + %8751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 6, !dbg !327 + %8752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 7, !dbg !327 + %8753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 8, !dbg !327 + %8754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 9, !dbg !327 + %8755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 10, !dbg !327 + %8756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 11, !dbg !327 + %8757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 12, !dbg !327 + %8758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 13, !dbg !327 + %8759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 14, !dbg !327 + %8760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 15, !dbg !327 + %8761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 16, !dbg !327 + %8762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 17, !dbg !327 + %8763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 18, !dbg !327 + %8764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 19, !dbg !327 + %8765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 20, !dbg !327 + %8766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 21, !dbg !327 + %8767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 22, !dbg !327 + %8768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 23, !dbg !327 + %8769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 24, !dbg !327 + %8770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 25, !dbg !327 + %8771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 26, !dbg !327 + %8772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 27, !dbg !327 + %8773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 28, !dbg !327 + %8774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 29, !dbg !327 + %8775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 30, !dbg !327 + %8776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8733, 31, !dbg !327 + %8777 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8745, float %8746, float %8747, float %8748, float %8749, float %8750, float %8751, float %8752, float %8753, float %8754, float %8755, float %8756, float %8757, float %8758, float %8759, float %8760, float %8761, float %8762, float %8763, float %8764, float %8765, float %8766, float %8767, float %8768, float %8769, float %8770, float %8771, float %8772, float %8773, float %8774, float %8775, float %8776, i64 %8739, i64 %8744, i1 true) #3, !dbg !327 + %8778 = or disjoint i32 %8502, 16480, !dbg !327 + %8779 = add i32 %8778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520) to i32), !dbg !327 + %8780 = lshr exact i32 %8779, 4, !dbg !327 + %8781 = and i32 %8780, 16383, !dbg !327 + %8782 = zext nneg i32 %8781 to i64, !dbg !327 + %8783 = or disjoint i64 %8782, 4611686293372403712, !dbg !327 + %8784 = add i32 %8508, 8288, !dbg !327 + %8785 = lshr exact i32 %8784, 4, !dbg !327 + %8786 = and i32 %8785, 16383, !dbg !327 + %8787 = zext nneg i32 %8786 to i64, !dbg !327 + %8788 = or disjoint i64 %8787, 4611686293338849280, !dbg !327 + %8789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 0, !dbg !327 + %8790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 1, !dbg !327 + %8791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 2, !dbg !327 + %8792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 3, !dbg !327 + %8793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 4, !dbg !327 + %8794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 5, !dbg !327 + %8795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 6, !dbg !327 + %8796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 7, !dbg !327 + %8797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 8, !dbg !327 + %8798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 9, !dbg !327 + %8799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 10, !dbg !327 + %8800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 11, !dbg !327 + %8801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 12, !dbg !327 + %8802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 13, !dbg !327 + %8803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 14, !dbg !327 + %8804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 15, !dbg !327 + %8805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 16, !dbg !327 + %8806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 17, !dbg !327 + %8807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 18, !dbg !327 + %8808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 19, !dbg !327 + %8809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 20, !dbg !327 + %8810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 21, !dbg !327 + %8811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 22, !dbg !327 + %8812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 23, !dbg !327 + %8813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 24, !dbg !327 + %8814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 25, !dbg !327 + %8815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 26, !dbg !327 + %8816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 27, !dbg !327 + %8817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 28, !dbg !327 + %8818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 29, !dbg !327 + %8819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 30, !dbg !327 + %8820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8777, 31, !dbg !327 + %8821 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %8789, float %8790, float %8791, float %8792, float %8793, float %8794, float %8795, float %8796, float %8797, float %8798, float %8799, float %8800, float %8801, float %8802, float %8803, float %8804, float %8805, float %8806, float %8807, float %8808, float %8809, float %8810, float %8811, float %8812, float %8813, float %8814, float %8815, float %8816, float %8817, float %8818, float %8819, float %8820, i64 %8783, i64 %8788, i1 true) #3, !dbg !327 + %8822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 0, !dbg !327 + %8823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 1, !dbg !327 + %8824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 2, !dbg !327 + %8825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 3, !dbg !327 + %8826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 4, !dbg !327 + %8827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 5, !dbg !327 + %8828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 6, !dbg !327 + %8829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 7, !dbg !327 + %8830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 8, !dbg !327 + %8831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 9, !dbg !327 + %8832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 10, !dbg !327 + %8833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 11, !dbg !327 + %8834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 12, !dbg !327 + %8835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 13, !dbg !327 + %8836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 14, !dbg !327 + %8837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 15, !dbg !327 + %8838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 16, !dbg !327 + %8839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 17, !dbg !327 + %8840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 18, !dbg !327 + %8841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 19, !dbg !327 + %8842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 20, !dbg !327 + %8843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 21, !dbg !327 + %8844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 22, !dbg !327 + %8845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 23, !dbg !327 + %8846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 24, !dbg !327 + %8847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 25, !dbg !327 + %8848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 26, !dbg !327 + %8849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 27, !dbg !327 + %8850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 28, !dbg !327 + %8851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 29, !dbg !327 + %8852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 30, !dbg !327 + %8853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %8821, 31, !dbg !327 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !327 + %8854 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %8822, float %8823, float %8824, float %8825, float %8826, float %8827, float %8828, float %8829, float %8830, float %8831, float %8832, float %8833, float %8834, float %8835, float %8836, float %8837, float %8838, float %8839, float %8840, float %8841, float %8842, float %8843, float %8844, float %8845, float %8846, float %8847, float %8848, float %8849, float %8850, float %8851, float %8852, float %8853, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107520), i32 0, i32 0, ptr addrspace(3) %8433, i32 0, i32 0) #3, !dbg !327 + %8855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 0, !dbg !327 + %8856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 1, !dbg !327 + %8857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 2, !dbg !327 + %8858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 3, !dbg !327 + %8859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 4, !dbg !327 + %8860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 5, !dbg !327 + %8861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 6, !dbg !327 + %8862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 7, !dbg !327 + %8863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 8, !dbg !327 + %8864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 9, !dbg !327 + %8865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 10, !dbg !327 + %8866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 11, !dbg !327 + %8867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 12, !dbg !327 + %8868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 13, !dbg !327 + %8869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 14, !dbg !327 + %8870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 15, !dbg !327 + %8871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 16, !dbg !327 + %8872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 17, !dbg !327 + %8873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 18, !dbg !327 + %8874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 19, !dbg !327 + %8875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 20, !dbg !327 + %8876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 21, !dbg !327 + %8877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 22, !dbg !327 + %8878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 23, !dbg !327 + %8879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 24, !dbg !327 + %8880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 25, !dbg !327 + %8881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 26, !dbg !327 + %8882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 27, !dbg !327 + %8883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 28, !dbg !327 + %8884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 29, !dbg !327 + %8885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 30, !dbg !327 + %8886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %8854, 31, !dbg !327 + %8887 = fmul float %8855, 0x3FB6A09E60000000, !dbg !328 + %8888 = fmul float %8856, 0x3FB6A09E60000000, !dbg !328 + %8889 = fmul float %8857, 0x3FB6A09E60000000, !dbg !328 + %8890 = fmul float %8858, 0x3FB6A09E60000000, !dbg !328 + %8891 = fmul float %8859, 0x3FB6A09E60000000, !dbg !328 + %8892 = fmul float %8860, 0x3FB6A09E60000000, !dbg !328 + %8893 = fmul float %8861, 0x3FB6A09E60000000, !dbg !328 + %8894 = fmul float %8862, 0x3FB6A09E60000000, !dbg !328 + %8895 = fmul float %8863, 0x3FB6A09E60000000, !dbg !328 + %8896 = fmul float %8864, 0x3FB6A09E60000000, !dbg !328 + %8897 = fmul float %8865, 0x3FB6A09E60000000, !dbg !328 + %8898 = fmul float %8866, 0x3FB6A09E60000000, !dbg !328 + %8899 = fmul float %8867, 0x3FB6A09E60000000, !dbg !328 + %8900 = fmul float %8868, 0x3FB6A09E60000000, !dbg !328 + %8901 = fmul float %8869, 0x3FB6A09E60000000, !dbg !328 + %8902 = fmul float %8870, 0x3FB6A09E60000000, !dbg !328 + %8903 = fmul float %8871, 0x3FB6A09E60000000, !dbg !328 + %8904 = fmul float %8872, 0x3FB6A09E60000000, !dbg !328 + %8905 = fmul float %8873, 0x3FB6A09E60000000, !dbg !328 + %8906 = fmul float %8874, 0x3FB6A09E60000000, !dbg !328 + %8907 = fmul float %8875, 0x3FB6A09E60000000, !dbg !328 + %8908 = fmul float %8876, 0x3FB6A09E60000000, !dbg !328 + %8909 = fmul float %8877, 0x3FB6A09E60000000, !dbg !328 + %8910 = fmul float %8878, 0x3FB6A09E60000000, !dbg !328 + %8911 = fmul float %8879, 0x3FB6A09E60000000, !dbg !328 + %8912 = fmul float %8880, 0x3FB6A09E60000000, !dbg !328 + %8913 = fmul float %8881, 0x3FB6A09E60000000, !dbg !328 + %8914 = fmul float %8882, 0x3FB6A09E60000000, !dbg !328 + %8915 = fmul float %8883, 0x3FB6A09E60000000, !dbg !328 + %8916 = fmul float %8884, 0x3FB6A09E60000000, !dbg !328 + %8917 = fmul float %8885, 0x3FB6A09E60000000, !dbg !328 + %8918 = fmul float %8886, 0x3FB6A09E60000000, !dbg !328 + %8919 = fmul float %8887, 0x3FF7154760000000, !dbg !329 + %8920 = select i1 %8400, float %8919, float 0xFFF0000000000000, !dbg !330 + %8921 = fmul float %8888, 0x3FF7154760000000, !dbg !329 + %8922 = select i1 %8401, float %8921, float 0xFFF0000000000000, !dbg !330 + %8923 = fmul float %8889, 0x3FF7154760000000, !dbg !329 + %8924 = select i1 %8402, float %8923, float 0xFFF0000000000000, !dbg !330 + %8925 = fmul float %8890, 0x3FF7154760000000, !dbg !329 + %8926 = select i1 %8403, float %8925, float 0xFFF0000000000000, !dbg !330 + %8927 = fmul float %8891, 0x3FF7154760000000, !dbg !329 + %8928 = select i1 %8404, float %8927, float 0xFFF0000000000000, !dbg !330 + %8929 = fmul float %8892, 0x3FF7154760000000, !dbg !329 + %8930 = select i1 %8405, float %8929, float 0xFFF0000000000000, !dbg !330 + %8931 = fmul float %8893, 0x3FF7154760000000, !dbg !329 + %8932 = select i1 %8406, float %8931, float 0xFFF0000000000000, !dbg !330 + %8933 = fmul float %8894, 0x3FF7154760000000, !dbg !329 + %8934 = select i1 %8407, float %8933, float 0xFFF0000000000000, !dbg !330 + %8935 = fmul float %8895, 0x3FF7154760000000, !dbg !329 + %8936 = select i1 %8408, float %8935, float 0xFFF0000000000000, !dbg !330 + %8937 = fmul float %8896, 0x3FF7154760000000, !dbg !329 + %8938 = select i1 %8409, float %8937, float 0xFFF0000000000000, !dbg !330 + %8939 = fmul float %8897, 0x3FF7154760000000, !dbg !329 + %8940 = select i1 %8410, float %8939, float 0xFFF0000000000000, !dbg !330 + %8941 = fmul float %8898, 0x3FF7154760000000, !dbg !329 + %8942 = select i1 %8411, float %8941, float 0xFFF0000000000000, !dbg !330 + %8943 = fmul float %8899, 0x3FF7154760000000, !dbg !329 + %8944 = select i1 %8412, float %8943, float 0xFFF0000000000000, !dbg !330 + %8945 = fmul float %8900, 0x3FF7154760000000, !dbg !329 + %8946 = select i1 %8413, float %8945, float 0xFFF0000000000000, !dbg !330 + %8947 = fmul float %8901, 0x3FF7154760000000, !dbg !329 + %8948 = select i1 %8414, float %8947, float 0xFFF0000000000000, !dbg !330 + %8949 = fmul float %8902, 0x3FF7154760000000, !dbg !329 + %8950 = select i1 %8415, float %8949, float 0xFFF0000000000000, !dbg !330 + %8951 = fmul float %8903, 0x3FF7154760000000, !dbg !329 + %8952 = select i1 %8416, float %8951, float 0xFFF0000000000000, !dbg !330 + %8953 = fmul float %8904, 0x3FF7154760000000, !dbg !329 + %8954 = select i1 %8417, float %8953, float 0xFFF0000000000000, !dbg !330 + %8955 = fmul float %8905, 0x3FF7154760000000, !dbg !329 + %8956 = select i1 %8418, float %8955, float 0xFFF0000000000000, !dbg !330 + %8957 = fmul float %8906, 0x3FF7154760000000, !dbg !329 + %8958 = select i1 %8419, float %8957, float 0xFFF0000000000000, !dbg !330 + %8959 = fmul float %8907, 0x3FF7154760000000, !dbg !329 + %8960 = select i1 %8420, float %8959, float 0xFFF0000000000000, !dbg !330 + %8961 = fmul float %8908, 0x3FF7154760000000, !dbg !329 + %8962 = select i1 %8421, float %8961, float 0xFFF0000000000000, !dbg !330 + %8963 = fmul float %8909, 0x3FF7154760000000, !dbg !329 + %8964 = select i1 %8422, float %8963, float 0xFFF0000000000000, !dbg !330 + %8965 = fmul float %8910, 0x3FF7154760000000, !dbg !329 + %8966 = select i1 %8423, float %8965, float 0xFFF0000000000000, !dbg !330 + %8967 = fmul float %8911, 0x3FF7154760000000, !dbg !329 + %8968 = select i1 %8424, float %8967, float 0xFFF0000000000000, !dbg !330 + %8969 = fmul float %8912, 0x3FF7154760000000, !dbg !329 + %8970 = select i1 %8425, float %8969, float 0xFFF0000000000000, !dbg !330 + %8971 = fmul float %8913, 0x3FF7154760000000, !dbg !329 + %8972 = select i1 %8426, float %8971, float 0xFFF0000000000000, !dbg !330 + %8973 = fmul float %8914, 0x3FF7154760000000, !dbg !329 + %8974 = select i1 %8427, float %8973, float 0xFFF0000000000000, !dbg !330 + %8975 = fmul float %8915, 0x3FF7154760000000, !dbg !329 + %8976 = select i1 %8428, float %8975, float 0xFFF0000000000000, !dbg !330 + %8977 = fmul float %8916, 0x3FF7154760000000, !dbg !329 + %8978 = select i1 %8429, float %8977, float 0xFFF0000000000000, !dbg !330 + %8979 = fmul float %8917, 0x3FF7154760000000, !dbg !329 + %8980 = select i1 %8430, float %8979, float 0xFFF0000000000000, !dbg !330 + %8981 = fmul float %8918, 0x3FF7154760000000, !dbg !329 + %8982 = select i1 %8431, float %8981, float 0xFFF0000000000000, !dbg !330 + %8983 = fsub float %8920, %8484, !dbg !331 + %8984 = fsub float %8922, %8485, !dbg !331 + %8985 = fsub float %8924, %8484, !dbg !331 + %8986 = fsub float %8926, %8485, !dbg !331 + %8987 = fsub float %8928, %8486, !dbg !331 + %8988 = fsub float %8930, %8487, !dbg !331 + %8989 = fsub float %8932, %8486, !dbg !331 + %8990 = fsub float %8934, %8487, !dbg !331 + %8991 = fsub float %8936, %8488, !dbg !331 + %8992 = fsub float %8938, %8489, !dbg !331 + %8993 = fsub float %8940, %8488, !dbg !331 + %8994 = fsub float %8942, %8489, !dbg !331 + %8995 = fsub float %8944, %8490, !dbg !331 + %8996 = fsub float %8946, %8491, !dbg !331 + %8997 = fsub float %8948, %8490, !dbg !331 + %8998 = fsub float %8950, %8491, !dbg !331 + %8999 = fsub float %8952, %8492, !dbg !331 + %9000 = fsub float %8954, %8493, !dbg !331 + %9001 = fsub float %8956, %8492, !dbg !331 + %9002 = fsub float %8958, %8493, !dbg !331 + %9003 = fsub float %8960, %8494, !dbg !331 + %9004 = fsub float %8962, %8495, !dbg !331 + %9005 = fsub float %8964, %8494, !dbg !331 + %9006 = fsub float %8966, %8495, !dbg !331 + %9007 = fsub float %8968, %8496, !dbg !331 + %9008 = fsub float %8970, %8497, !dbg !331 + %9009 = fsub float %8972, %8496, !dbg !331 + %9010 = fsub float %8974, %8497, !dbg !331 + %9011 = fsub float %8976, %8498, !dbg !331 + %9012 = fsub float %8978, %8499, !dbg !331 + %9013 = fsub float %8980, %8498, !dbg !331 + %9014 = fsub float %8982, %8499, !dbg !331 + %9015 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i = icmp eq i32 %9015, 0, !dbg !332 + br i1 %.not.i, label %9018, label %9016, !dbg !332 + +9016: ; preds = %.lr.ph1819 + %9017 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8983) #3, !dbg !332 + br label %__nv_exp2f.exit, !dbg !332 + +9018: ; preds = %.lr.ph1819 + %9019 = tail call float @llvm.nvvm.ex2.approx.f(float %8983) #3, !dbg !332 + br label %__nv_exp2f.exit, !dbg !332 + +__nv_exp2f.exit: ; preds = %9016, %9018 + %.0.i = phi float [ %9017, %9016 ], [ %9019, %9018 ], !dbg !332 + %9020 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1192 = icmp eq i32 %9020, 0, !dbg !332 + br i1 %.not.i1192, label %9023, label %9021, !dbg !332 + +9021: ; preds = %__nv_exp2f.exit + %9022 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8984) #3, !dbg !332 + br label %__nv_exp2f.exit1194, !dbg !332 + +9023: ; preds = %__nv_exp2f.exit + %9024 = tail call float @llvm.nvvm.ex2.approx.f(float %8984) #3, !dbg !332 + br label %__nv_exp2f.exit1194, !dbg !332 + +__nv_exp2f.exit1194: ; preds = %9021, %9023 + %.0.i1193 = phi float [ %9022, %9021 ], [ %9024, %9023 ], !dbg !332 + %9025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1195 = icmp eq i32 %9025, 0, !dbg !332 + br i1 %.not.i1195, label %9028, label %9026, !dbg !332 + +9026: ; preds = %__nv_exp2f.exit1194 + %9027 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8985) #3, !dbg !332 + br label %__nv_exp2f.exit1197, !dbg !332 + +9028: ; preds = %__nv_exp2f.exit1194 + %9029 = tail call float @llvm.nvvm.ex2.approx.f(float %8985) #3, !dbg !332 + br label %__nv_exp2f.exit1197, !dbg !332 + +__nv_exp2f.exit1197: ; preds = %9026, %9028 + %.0.i1196 = phi float [ %9027, %9026 ], [ %9029, %9028 ], !dbg !332 + %9030 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1198 = icmp eq i32 %9030, 0, !dbg !332 + br i1 %.not.i1198, label %9033, label %9031, !dbg !332 + +9031: ; preds = %__nv_exp2f.exit1197 + %9032 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8986) #3, !dbg !332 + br label %__nv_exp2f.exit1200, !dbg !332 + +9033: ; preds = %__nv_exp2f.exit1197 + %9034 = tail call float @llvm.nvvm.ex2.approx.f(float %8986) #3, !dbg !332 + br label %__nv_exp2f.exit1200, !dbg !332 + +__nv_exp2f.exit1200: ; preds = %9031, %9033 + %.0.i1199 = phi float [ %9032, %9031 ], [ %9034, %9033 ], !dbg !332 + %9035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1201 = icmp eq i32 %9035, 0, !dbg !332 + br i1 %.not.i1201, label %9038, label %9036, !dbg !332 + +9036: ; preds = %__nv_exp2f.exit1200 + %9037 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8987) #3, !dbg !332 + br label %__nv_exp2f.exit1203, !dbg !332 + +9038: ; preds = %__nv_exp2f.exit1200 + %9039 = tail call float @llvm.nvvm.ex2.approx.f(float %8987) #3, !dbg !332 + br label %__nv_exp2f.exit1203, !dbg !332 + +__nv_exp2f.exit1203: ; preds = %9036, %9038 + %.0.i1202 = phi float [ %9037, %9036 ], [ %9039, %9038 ], !dbg !332 + %9040 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1204 = icmp eq i32 %9040, 0, !dbg !332 + br i1 %.not.i1204, label %9043, label %9041, !dbg !332 + +9041: ; preds = %__nv_exp2f.exit1203 + %9042 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8988) #3, !dbg !332 + br label %__nv_exp2f.exit1206, !dbg !332 + +9043: ; preds = %__nv_exp2f.exit1203 + %9044 = tail call float @llvm.nvvm.ex2.approx.f(float %8988) #3, !dbg !332 + br label %__nv_exp2f.exit1206, !dbg !332 + +__nv_exp2f.exit1206: ; preds = %9041, %9043 + %.0.i1205 = phi float [ %9042, %9041 ], [ %9044, %9043 ], !dbg !332 + %9045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1207 = icmp eq i32 %9045, 0, !dbg !332 + br i1 %.not.i1207, label %9048, label %9046, !dbg !332 + +9046: ; preds = %__nv_exp2f.exit1206 + %9047 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8989) #3, !dbg !332 + br label %__nv_exp2f.exit1209, !dbg !332 + +9048: ; preds = %__nv_exp2f.exit1206 + %9049 = tail call float @llvm.nvvm.ex2.approx.f(float %8989) #3, !dbg !332 + br label %__nv_exp2f.exit1209, !dbg !332 + +__nv_exp2f.exit1209: ; preds = %9046, %9048 + %.0.i1208 = phi float [ %9047, %9046 ], [ %9049, %9048 ], !dbg !332 + %9050 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1210 = icmp eq i32 %9050, 0, !dbg !332 + br i1 %.not.i1210, label %9053, label %9051, !dbg !332 + +9051: ; preds = %__nv_exp2f.exit1209 + %9052 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8990) #3, !dbg !332 + br label %__nv_exp2f.exit1212, !dbg !332 + +9053: ; preds = %__nv_exp2f.exit1209 + %9054 = tail call float @llvm.nvvm.ex2.approx.f(float %8990) #3, !dbg !332 + br label %__nv_exp2f.exit1212, !dbg !332 + +__nv_exp2f.exit1212: ; preds = %9051, %9053 + %.0.i1211 = phi float [ %9052, %9051 ], [ %9054, %9053 ], !dbg !332 + %9055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1213 = icmp eq i32 %9055, 0, !dbg !332 + br i1 %.not.i1213, label %9058, label %9056, !dbg !332 + +9056: ; preds = %__nv_exp2f.exit1212 + %9057 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8991) #3, !dbg !332 + br label %__nv_exp2f.exit1215, !dbg !332 + +9058: ; preds = %__nv_exp2f.exit1212 + %9059 = tail call float @llvm.nvvm.ex2.approx.f(float %8991) #3, !dbg !332 + br label %__nv_exp2f.exit1215, !dbg !332 + +__nv_exp2f.exit1215: ; preds = %9056, %9058 + %.0.i1214 = phi float [ %9057, %9056 ], [ %9059, %9058 ], !dbg !332 + %9060 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1216 = icmp eq i32 %9060, 0, !dbg !332 + br i1 %.not.i1216, label %9063, label %9061, !dbg !332 + +9061: ; preds = %__nv_exp2f.exit1215 + %9062 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8992) #3, !dbg !332 + br label %__nv_exp2f.exit1218, !dbg !332 + +9063: ; preds = %__nv_exp2f.exit1215 + %9064 = tail call float @llvm.nvvm.ex2.approx.f(float %8992) #3, !dbg !332 + br label %__nv_exp2f.exit1218, !dbg !332 + +__nv_exp2f.exit1218: ; preds = %9061, %9063 + %.0.i1217 = phi float [ %9062, %9061 ], [ %9064, %9063 ], !dbg !332 + %9065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1219 = icmp eq i32 %9065, 0, !dbg !332 + br i1 %.not.i1219, label %9068, label %9066, !dbg !332 + +9066: ; preds = %__nv_exp2f.exit1218 + %9067 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8993) #3, !dbg !332 + br label %__nv_exp2f.exit1221, !dbg !332 + +9068: ; preds = %__nv_exp2f.exit1218 + %9069 = tail call float @llvm.nvvm.ex2.approx.f(float %8993) #3, !dbg !332 + br label %__nv_exp2f.exit1221, !dbg !332 + +__nv_exp2f.exit1221: ; preds = %9066, %9068 + %.0.i1220 = phi float [ %9067, %9066 ], [ %9069, %9068 ], !dbg !332 + %9070 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1222 = icmp eq i32 %9070, 0, !dbg !332 + br i1 %.not.i1222, label %9073, label %9071, !dbg !332 + +9071: ; preds = %__nv_exp2f.exit1221 + %9072 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8994) #3, !dbg !332 + br label %__nv_exp2f.exit1224, !dbg !332 + +9073: ; preds = %__nv_exp2f.exit1221 + %9074 = tail call float @llvm.nvvm.ex2.approx.f(float %8994) #3, !dbg !332 + br label %__nv_exp2f.exit1224, !dbg !332 + +__nv_exp2f.exit1224: ; preds = %9071, %9073 + %.0.i1223 = phi float [ %9072, %9071 ], [ %9074, %9073 ], !dbg !332 + %9075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1225 = icmp eq i32 %9075, 0, !dbg !332 + br i1 %.not.i1225, label %9078, label %9076, !dbg !332 + +9076: ; preds = %__nv_exp2f.exit1224 + %9077 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8995) #3, !dbg !332 + br label %__nv_exp2f.exit1227, !dbg !332 + +9078: ; preds = %__nv_exp2f.exit1224 + %9079 = tail call float @llvm.nvvm.ex2.approx.f(float %8995) #3, !dbg !332 + br label %__nv_exp2f.exit1227, !dbg !332 + +__nv_exp2f.exit1227: ; preds = %9076, %9078 + %.0.i1226 = phi float [ %9077, %9076 ], [ %9079, %9078 ], !dbg !332 + %9080 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1228 = icmp eq i32 %9080, 0, !dbg !332 + br i1 %.not.i1228, label %9083, label %9081, !dbg !332 + +9081: ; preds = %__nv_exp2f.exit1227 + %9082 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8996) #3, !dbg !332 + br label %__nv_exp2f.exit1230, !dbg !332 + +9083: ; preds = %__nv_exp2f.exit1227 + %9084 = tail call float @llvm.nvvm.ex2.approx.f(float %8996) #3, !dbg !332 + br label %__nv_exp2f.exit1230, !dbg !332 + +__nv_exp2f.exit1230: ; preds = %9081, %9083 + %.0.i1229 = phi float [ %9082, %9081 ], [ %9084, %9083 ], !dbg !332 + %9085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1231 = icmp eq i32 %9085, 0, !dbg !332 + br i1 %.not.i1231, label %9088, label %9086, !dbg !332 + +9086: ; preds = %__nv_exp2f.exit1230 + %9087 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8997) #3, !dbg !332 + br label %__nv_exp2f.exit1233, !dbg !332 + +9088: ; preds = %__nv_exp2f.exit1230 + %9089 = tail call float @llvm.nvvm.ex2.approx.f(float %8997) #3, !dbg !332 + br label %__nv_exp2f.exit1233, !dbg !332 + +__nv_exp2f.exit1233: ; preds = %9086, %9088 + %.0.i1232 = phi float [ %9087, %9086 ], [ %9089, %9088 ], !dbg !332 + %9090 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1234 = icmp eq i32 %9090, 0, !dbg !332 + br i1 %.not.i1234, label %9093, label %9091, !dbg !332 + +9091: ; preds = %__nv_exp2f.exit1233 + %9092 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8998) #3, !dbg !332 + br label %__nv_exp2f.exit1236, !dbg !332 + +9093: ; preds = %__nv_exp2f.exit1233 + %9094 = tail call float @llvm.nvvm.ex2.approx.f(float %8998) #3, !dbg !332 + br label %__nv_exp2f.exit1236, !dbg !332 + +__nv_exp2f.exit1236: ; preds = %9091, %9093 + %.0.i1235 = phi float [ %9092, %9091 ], [ %9094, %9093 ], !dbg !332 + %9095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1237 = icmp eq i32 %9095, 0, !dbg !332 + br i1 %.not.i1237, label %9098, label %9096, !dbg !332 + +9096: ; preds = %__nv_exp2f.exit1236 + %9097 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %8999) #3, !dbg !332 + br label %__nv_exp2f.exit1239, !dbg !332 + +9098: ; preds = %__nv_exp2f.exit1236 + %9099 = tail call float @llvm.nvvm.ex2.approx.f(float %8999) #3, !dbg !332 + br label %__nv_exp2f.exit1239, !dbg !332 + +__nv_exp2f.exit1239: ; preds = %9096, %9098 + %.0.i1238 = phi float [ %9097, %9096 ], [ %9099, %9098 ], !dbg !332 + %9100 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1240 = icmp eq i32 %9100, 0, !dbg !332 + br i1 %.not.i1240, label %9103, label %9101, !dbg !332 + +9101: ; preds = %__nv_exp2f.exit1239 + %9102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9000) #3, !dbg !332 + br label %__nv_exp2f.exit1242, !dbg !332 + +9103: ; preds = %__nv_exp2f.exit1239 + %9104 = tail call float @llvm.nvvm.ex2.approx.f(float %9000) #3, !dbg !332 + br label %__nv_exp2f.exit1242, !dbg !332 + +__nv_exp2f.exit1242: ; preds = %9101, %9103 + %.0.i1241 = phi float [ %9102, %9101 ], [ %9104, %9103 ], !dbg !332 + %9105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1243 = icmp eq i32 %9105, 0, !dbg !332 + br i1 %.not.i1243, label %9108, label %9106, !dbg !332 + +9106: ; preds = %__nv_exp2f.exit1242 + %9107 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9001) #3, !dbg !332 + br label %__nv_exp2f.exit1245, !dbg !332 + +9108: ; preds = %__nv_exp2f.exit1242 + %9109 = tail call float @llvm.nvvm.ex2.approx.f(float %9001) #3, !dbg !332 + br label %__nv_exp2f.exit1245, !dbg !332 + +__nv_exp2f.exit1245: ; preds = %9106, %9108 + %.0.i1244 = phi float [ %9107, %9106 ], [ %9109, %9108 ], !dbg !332 + %9110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1246 = icmp eq i32 %9110, 0, !dbg !332 + br i1 %.not.i1246, label %9113, label %9111, !dbg !332 + +9111: ; preds = %__nv_exp2f.exit1245 + %9112 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9002) #3, !dbg !332 + br label %__nv_exp2f.exit1248, !dbg !332 + +9113: ; preds = %__nv_exp2f.exit1245 + %9114 = tail call float @llvm.nvvm.ex2.approx.f(float %9002) #3, !dbg !332 + br label %__nv_exp2f.exit1248, !dbg !332 + +__nv_exp2f.exit1248: ; preds = %9111, %9113 + %.0.i1247 = phi float [ %9112, %9111 ], [ %9114, %9113 ], !dbg !332 + %9115 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1249 = icmp eq i32 %9115, 0, !dbg !332 + br i1 %.not.i1249, label %9118, label %9116, !dbg !332 + +9116: ; preds = %__nv_exp2f.exit1248 + %9117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9003) #3, !dbg !332 + br label %__nv_exp2f.exit1251, !dbg !332 + +9118: ; preds = %__nv_exp2f.exit1248 + %9119 = tail call float @llvm.nvvm.ex2.approx.f(float %9003) #3, !dbg !332 + br label %__nv_exp2f.exit1251, !dbg !332 + +__nv_exp2f.exit1251: ; preds = %9116, %9118 + %.0.i1250 = phi float [ %9117, %9116 ], [ %9119, %9118 ], !dbg !332 + %9120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1252 = icmp eq i32 %9120, 0, !dbg !332 + br i1 %.not.i1252, label %9123, label %9121, !dbg !332 + +9121: ; preds = %__nv_exp2f.exit1251 + %9122 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9004) #3, !dbg !332 + br label %__nv_exp2f.exit1254, !dbg !332 + +9123: ; preds = %__nv_exp2f.exit1251 + %9124 = tail call float @llvm.nvvm.ex2.approx.f(float %9004) #3, !dbg !332 + br label %__nv_exp2f.exit1254, !dbg !332 + +__nv_exp2f.exit1254: ; preds = %9121, %9123 + %.0.i1253 = phi float [ %9122, %9121 ], [ %9124, %9123 ], !dbg !332 + %9125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1255 = icmp eq i32 %9125, 0, !dbg !332 + br i1 %.not.i1255, label %9128, label %9126, !dbg !332 + +9126: ; preds = %__nv_exp2f.exit1254 + %9127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9005) #3, !dbg !332 + br label %__nv_exp2f.exit1257, !dbg !332 + +9128: ; preds = %__nv_exp2f.exit1254 + %9129 = tail call float @llvm.nvvm.ex2.approx.f(float %9005) #3, !dbg !332 + br label %__nv_exp2f.exit1257, !dbg !332 + +__nv_exp2f.exit1257: ; preds = %9126, %9128 + %.0.i1256 = phi float [ %9127, %9126 ], [ %9129, %9128 ], !dbg !332 + %9130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1258 = icmp eq i32 %9130, 0, !dbg !332 + br i1 %.not.i1258, label %9133, label %9131, !dbg !332 + +9131: ; preds = %__nv_exp2f.exit1257 + %9132 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9006) #3, !dbg !332 + br label %__nv_exp2f.exit1260, !dbg !332 + +9133: ; preds = %__nv_exp2f.exit1257 + %9134 = tail call float @llvm.nvvm.ex2.approx.f(float %9006) #3, !dbg !332 + br label %__nv_exp2f.exit1260, !dbg !332 + +__nv_exp2f.exit1260: ; preds = %9131, %9133 + %.0.i1259 = phi float [ %9132, %9131 ], [ %9134, %9133 ], !dbg !332 + %9135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1261 = icmp eq i32 %9135, 0, !dbg !332 + br i1 %.not.i1261, label %9138, label %9136, !dbg !332 + +9136: ; preds = %__nv_exp2f.exit1260 + %9137 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9007) #3, !dbg !332 + br label %__nv_exp2f.exit1263, !dbg !332 + +9138: ; preds = %__nv_exp2f.exit1260 + %9139 = tail call float @llvm.nvvm.ex2.approx.f(float %9007) #3, !dbg !332 + br label %__nv_exp2f.exit1263, !dbg !332 + +__nv_exp2f.exit1263: ; preds = %9136, %9138 + %.0.i1262 = phi float [ %9137, %9136 ], [ %9139, %9138 ], !dbg !332 + %9140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1264 = icmp eq i32 %9140, 0, !dbg !332 + br i1 %.not.i1264, label %9143, label %9141, !dbg !332 + +9141: ; preds = %__nv_exp2f.exit1263 + %9142 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9008) #3, !dbg !332 + br label %__nv_exp2f.exit1266, !dbg !332 + +9143: ; preds = %__nv_exp2f.exit1263 + %9144 = tail call float @llvm.nvvm.ex2.approx.f(float %9008) #3, !dbg !332 + br label %__nv_exp2f.exit1266, !dbg !332 + +__nv_exp2f.exit1266: ; preds = %9141, %9143 + %.0.i1265 = phi float [ %9142, %9141 ], [ %9144, %9143 ], !dbg !332 + %9145 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1267 = icmp eq i32 %9145, 0, !dbg !332 + br i1 %.not.i1267, label %9148, label %9146, !dbg !332 + +9146: ; preds = %__nv_exp2f.exit1266 + %9147 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9009) #3, !dbg !332 + br label %__nv_exp2f.exit1269, !dbg !332 + +9148: ; preds = %__nv_exp2f.exit1266 + %9149 = tail call float @llvm.nvvm.ex2.approx.f(float %9009) #3, !dbg !332 + br label %__nv_exp2f.exit1269, !dbg !332 + +__nv_exp2f.exit1269: ; preds = %9146, %9148 + %.0.i1268 = phi float [ %9147, %9146 ], [ %9149, %9148 ], !dbg !332 + %9150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1270 = icmp eq i32 %9150, 0, !dbg !332 + br i1 %.not.i1270, label %9153, label %9151, !dbg !332 + +9151: ; preds = %__nv_exp2f.exit1269 + %9152 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9010) #3, !dbg !332 + br label %__nv_exp2f.exit1272, !dbg !332 + +9153: ; preds = %__nv_exp2f.exit1269 + %9154 = tail call float @llvm.nvvm.ex2.approx.f(float %9010) #3, !dbg !332 + br label %__nv_exp2f.exit1272, !dbg !332 + +__nv_exp2f.exit1272: ; preds = %9151, %9153 + %.0.i1271 = phi float [ %9152, %9151 ], [ %9154, %9153 ], !dbg !332 + %9155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1273 = icmp eq i32 %9155, 0, !dbg !332 + br i1 %.not.i1273, label %9158, label %9156, !dbg !332 + +9156: ; preds = %__nv_exp2f.exit1272 + %9157 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9011) #3, !dbg !332 + br label %__nv_exp2f.exit1275, !dbg !332 + +9158: ; preds = %__nv_exp2f.exit1272 + %9159 = tail call float @llvm.nvvm.ex2.approx.f(float %9011) #3, !dbg !332 + br label %__nv_exp2f.exit1275, !dbg !332 + +__nv_exp2f.exit1275: ; preds = %9156, %9158 + %.0.i1274 = phi float [ %9157, %9156 ], [ %9159, %9158 ], !dbg !332 + %9160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1276 = icmp eq i32 %9160, 0, !dbg !332 + br i1 %.not.i1276, label %9163, label %9161, !dbg !332 + +9161: ; preds = %__nv_exp2f.exit1275 + %9162 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9012) #3, !dbg !332 + br label %__nv_exp2f.exit1278, !dbg !332 + +9163: ; preds = %__nv_exp2f.exit1275 + %9164 = tail call float @llvm.nvvm.ex2.approx.f(float %9012) #3, !dbg !332 + br label %__nv_exp2f.exit1278, !dbg !332 + +__nv_exp2f.exit1278: ; preds = %9161, %9163 + %.0.i1277 = phi float [ %9162, %9161 ], [ %9164, %9163 ], !dbg !332 + %9165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1279 = icmp eq i32 %9165, 0, !dbg !332 + br i1 %.not.i1279, label %9168, label %9166, !dbg !332 + +9166: ; preds = %__nv_exp2f.exit1278 + %9167 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9013) #3, !dbg !332 + br label %__nv_exp2f.exit1281, !dbg !332 + +9168: ; preds = %__nv_exp2f.exit1278 + %9169 = tail call float @llvm.nvvm.ex2.approx.f(float %9013) #3, !dbg !332 + br label %__nv_exp2f.exit1281, !dbg !332 + +__nv_exp2f.exit1281: ; preds = %9166, %9168 + %.0.i1280 = phi float [ %9167, %9166 ], [ %9169, %9168 ], !dbg !332 + %9170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #3, !dbg !332 + %.not.i1282 = icmp eq i32 %9170, 0, !dbg !332 + br i1 %.not.i1282, label %9173, label %9171, !dbg !332 + +9171: ; preds = %__nv_exp2f.exit1281 + %9172 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %9014) #3, !dbg !332 + br label %__nv_exp2f.exit1284, !dbg !332 + +9173: ; preds = %__nv_exp2f.exit1281 + %9174 = tail call float @llvm.nvvm.ex2.approx.f(float %9014) #3, !dbg !332 + br label %__nv_exp2f.exit1284, !dbg !332 + +__nv_exp2f.exit1284: ; preds = %9171, %9173 + %.0.i1283 = phi float [ %9172, %9171 ], [ %9174, %9173 ], !dbg !332 + %9175 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %8432, !dbg !318 + %9176 = insertelement <2 x float> poison, float %.0.i, i64 0, !dbg !333 + %9177 = insertelement <2 x float> %9176, float %.0.i1193, i64 1, !dbg !333 + %9178 = fptrunc <2 x float> %9177 to <2 x bfloat>, !dbg !333 + %9179 = insertelement <2 x float> poison, float %.0.i1196, i64 0, !dbg !333 + %9180 = insertelement <2 x float> %9179, float %.0.i1199, i64 1, !dbg !333 + %9181 = fptrunc <2 x float> %9180 to <2 x bfloat>, !dbg !333 + %9182 = insertelement <2 x float> poison, float %.0.i1202, i64 0, !dbg !333 + %9183 = insertelement <2 x float> %9182, float %.0.i1205, i64 1, !dbg !333 + %9184 = fptrunc <2 x float> %9183 to <2 x bfloat>, !dbg !333 + %9185 = insertelement <2 x float> poison, float %.0.i1208, i64 0, !dbg !333 + %9186 = insertelement <2 x float> %9185, float %.0.i1211, i64 1, !dbg !333 + %9187 = fptrunc <2 x float> %9186 to <2 x bfloat>, !dbg !333 + %9188 = insertelement <2 x float> poison, float %.0.i1214, i64 0, !dbg !333 + %9189 = insertelement <2 x float> %9188, float %.0.i1217, i64 1, !dbg !333 + %9190 = fptrunc <2 x float> %9189 to <2 x bfloat>, !dbg !333 + %9191 = insertelement <2 x float> poison, float %.0.i1220, i64 0, !dbg !333 + %9192 = insertelement <2 x float> %9191, float %.0.i1223, i64 1, !dbg !333 + %9193 = fptrunc <2 x float> %9192 to <2 x bfloat>, !dbg !333 + %9194 = insertelement <2 x float> poison, float %.0.i1226, i64 0, !dbg !333 + %9195 = insertelement <2 x float> %9194, float %.0.i1229, i64 1, !dbg !333 + %9196 = fptrunc <2 x float> %9195 to <2 x bfloat>, !dbg !333 + %9197 = insertelement <2 x float> poison, float %.0.i1232, i64 0, !dbg !333 + %9198 = insertelement <2 x float> %9197, float %.0.i1235, i64 1, !dbg !333 + %9199 = fptrunc <2 x float> %9198 to <2 x bfloat>, !dbg !333 + %9200 = insertelement <2 x float> poison, float %.0.i1238, i64 0, !dbg !333 + %9201 = insertelement <2 x float> %9200, float %.0.i1241, i64 1, !dbg !333 + %9202 = fptrunc <2 x float> %9201 to <2 x bfloat>, !dbg !333 + %9203 = insertelement <2 x float> poison, float %.0.i1244, i64 0, !dbg !333 + %9204 = insertelement <2 x float> %9203, float %.0.i1247, i64 1, !dbg !333 + %9205 = fptrunc <2 x float> %9204 to <2 x bfloat>, !dbg !333 + %9206 = insertelement <2 x float> poison, float %.0.i1250, i64 0, !dbg !333 + %9207 = insertelement <2 x float> %9206, float %.0.i1253, i64 1, !dbg !333 + %9208 = fptrunc <2 x float> %9207 to <2 x bfloat>, !dbg !333 + %9209 = insertelement <2 x float> poison, float %.0.i1256, i64 0, !dbg !333 + %9210 = insertelement <2 x float> %9209, float %.0.i1259, i64 1, !dbg !333 + %9211 = fptrunc <2 x float> %9210 to <2 x bfloat>, !dbg !333 + %9212 = insertelement <2 x float> poison, float %.0.i1262, i64 0, !dbg !333 + %9213 = insertelement <2 x float> %9212, float %.0.i1265, i64 1, !dbg !333 + %9214 = fptrunc <2 x float> %9213 to <2 x bfloat>, !dbg !333 + %9215 = insertelement <2 x float> poison, float %.0.i1268, i64 0, !dbg !333 + %9216 = insertelement <2 x float> %9215, float %.0.i1271, i64 1, !dbg !333 + %9217 = fptrunc <2 x float> %9216 to <2 x bfloat>, !dbg !333 + %9218 = insertelement <2 x float> poison, float %.0.i1274, i64 0, !dbg !333 + %9219 = insertelement <2 x float> %9218, float %.0.i1277, i64 1, !dbg !333 + %9220 = fptrunc <2 x float> %9219 to <2 x bfloat>, !dbg !333 + %9221 = insertelement <2 x float> poison, float %.0.i1280, i64 0, !dbg !333 + %9222 = insertelement <2 x float> %9221, float %.0.i1283, i64 1, !dbg !333 + %9223 = fptrunc <2 x float> %9222 to <2 x bfloat>, !dbg !333 + %9224 = bitcast <2 x bfloat> %9178 to i32, !dbg !334 + %9225 = bitcast <2 x bfloat> %9181 to i32, !dbg !334 + %9226 = bitcast <2 x bfloat> %9184 to i32, !dbg !334 + %9227 = bitcast <2 x bfloat> %9187 to i32, !dbg !334 + %9228 = bitcast <2 x bfloat> %9190 to i32, !dbg !334 + %9229 = bitcast <2 x bfloat> %9193 to i32, !dbg !334 + %9230 = bitcast <2 x bfloat> %9196 to i32, !dbg !334 + %9231 = bitcast <2 x bfloat> %9199 to i32, !dbg !334 + %9232 = bitcast <2 x bfloat> %9202 to i32, !dbg !334 + %9233 = bitcast <2 x bfloat> %9205 to i32, !dbg !334 + %9234 = bitcast <2 x bfloat> %9208 to i32, !dbg !334 + %9235 = bitcast <2 x bfloat> %9211 to i32, !dbg !334 + %9236 = bitcast <2 x bfloat> %9214 to i32, !dbg !334 + %9237 = bitcast <2 x bfloat> %9217 to i32, !dbg !334 + %9238 = bitcast <2 x bfloat> %9220 to i32, !dbg !334 + %9239 = bitcast <2 x bfloat> %9223 to i32, !dbg !334 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !334 + %9240 = ptrtoint ptr addrspace(3) %9175 to i32, !dbg !334 + %9241 = lshr exact i32 %9240, 4, !dbg !334 + %9242 = and i32 %9241, 16383, !dbg !334 + %9243 = zext nneg i32 %9242 to i64, !dbg !334 + %9244 = or disjoint i64 %9243, 4611686293338849280, !dbg !334 + %9245 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn5131737, float %.pn5111738, float %.pn5091739, float %.pn5071740, float %.pn5051741, float %.pn5031742, float %.pn5011743, float %.pn4991744, float %.pn4971745, float %.pn4951746, float %.pn4931747, float %.pn4911748, float %.pn4891749, float %.pn4871750, float %.pn4851751, float %.pn4831752, float %.pn4811753, float %.pn4791754, float %.pn4771755, float %.pn4751756, float %.pn4731757, float %.pn4711758, float %.pn4691759, float %.pn4671760, float %.pn4651761, float %.pn4631762, float %.pn4611763, float %.pn4591764, float %.pn4571765, float %.pn4551766, float %.pn4531767, float %.pn4511768, float %.pn4491769, float %.pn4471770, float %.pn4451771, float %.pn4431772, float %.pn4411773, float %.pn4391774, float %.pn4371775, float %.pn4351776, float %.pn4331777, float %.pn4311778, float %.pn4291779, float %.pn4271780, float %.pn4251781, float %.pn4231782, float %.pn4211783, float %.pn4191784, float %.pn4171785, float %.pn4151786, float %.pn4131787, float %.pn4111788, float %.pn4091789, float %.pn4071790, float %.pn4051791, float %.pn4031792, float %.pn4011793, float %.pn3991794, float %.pn3971795, float %.pn3951796, float %.pn3931797, float %.pn3911798, float %.pn3891799, float %.pn3871800, i32 %9224, i32 %9225, i32 %9226, i32 %9227, i64 %9244, i1 true) #3, !dbg !334 + %9246 = add i32 %9240, 2048, !dbg !334 + %9247 = lshr exact i32 %9246, 4, !dbg !334 + %9248 = and i32 %9247, 16383, !dbg !334 + %9249 = zext nneg i32 %9248 to i64, !dbg !334 + %9250 = or disjoint i64 %9249, 4611686293338849280, !dbg !334 + %9251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 0, !dbg !334 + %9252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 1, !dbg !334 + %9253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 2, !dbg !334 + %9254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 3, !dbg !334 + %9255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 4, !dbg !334 + %9256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 5, !dbg !334 + %9257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 6, !dbg !334 + %9258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 7, !dbg !334 + %9259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 8, !dbg !334 + %9260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 9, !dbg !334 + %9261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 10, !dbg !334 + %9262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 11, !dbg !334 + %9263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 12, !dbg !334 + %9264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 13, !dbg !334 + %9265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 14, !dbg !334 + %9266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 15, !dbg !334 + %9267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 16, !dbg !334 + %9268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 17, !dbg !334 + %9269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 18, !dbg !334 + %9270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 19, !dbg !334 + %9271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 20, !dbg !334 + %9272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 21, !dbg !334 + %9273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 22, !dbg !334 + %9274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 23, !dbg !334 + %9275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 24, !dbg !334 + %9276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 25, !dbg !334 + %9277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 26, !dbg !334 + %9278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 27, !dbg !334 + %9279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 28, !dbg !334 + %9280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 29, !dbg !334 + %9281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 30, !dbg !334 + %9282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 31, !dbg !334 + %9283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 32, !dbg !334 + %9284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 33, !dbg !334 + %9285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 34, !dbg !334 + %9286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 35, !dbg !334 + %9287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 36, !dbg !334 + %9288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 37, !dbg !334 + %9289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 38, !dbg !334 + %9290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 39, !dbg !334 + %9291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 40, !dbg !334 + %9292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 41, !dbg !334 + %9293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 42, !dbg !334 + %9294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 43, !dbg !334 + %9295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 44, !dbg !334 + %9296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 45, !dbg !334 + %9297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 46, !dbg !334 + %9298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 47, !dbg !334 + %9299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 48, !dbg !334 + %9300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 49, !dbg !334 + %9301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 50, !dbg !334 + %9302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 51, !dbg !334 + %9303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 52, !dbg !334 + %9304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 53, !dbg !334 + %9305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 54, !dbg !334 + %9306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 55, !dbg !334 + %9307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 56, !dbg !334 + %9308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 57, !dbg !334 + %9309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 58, !dbg !334 + %9310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 59, !dbg !334 + %9311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 60, !dbg !334 + %9312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 61, !dbg !334 + %9313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 62, !dbg !334 + %9314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9245, 63, !dbg !334 + %9315 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9251, float %9252, float %9253, float %9254, float %9255, float %9256, float %9257, float %9258, float %9259, float %9260, float %9261, float %9262, float %9263, float %9264, float %9265, float %9266, float %9267, float %9268, float %9269, float %9270, float %9271, float %9272, float %9273, float %9274, float %9275, float %9276, float %9277, float %9278, float %9279, float %9280, float %9281, float %9282, float %9283, float %9284, float %9285, float %9286, float %9287, float %9288, float %9289, float %9290, float %9291, float %9292, float %9293, float %9294, float %9295, float %9296, float %9297, float %9298, float %9299, float %9300, float %9301, float %9302, float %9303, float %9304, float %9305, float %9306, float %9307, float %9308, float %9309, float %9310, float %9311, float %9312, float %9313, float %9314, i32 %9228, i32 %9229, i32 %9230, i32 %9231, i64 %9250, i1 true) #3, !dbg !334 + %9316 = add i32 %9240, 4096, !dbg !334 + %9317 = lshr exact i32 %9316, 4, !dbg !334 + %9318 = and i32 %9317, 16383, !dbg !334 + %9319 = zext nneg i32 %9318 to i64, !dbg !334 + %9320 = or disjoint i64 %9319, 4611686293338849280, !dbg !334 + %9321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 0, !dbg !334 + %9322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 1, !dbg !334 + %9323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 2, !dbg !334 + %9324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 3, !dbg !334 + %9325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 4, !dbg !334 + %9326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 5, !dbg !334 + %9327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 6, !dbg !334 + %9328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 7, !dbg !334 + %9329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 8, !dbg !334 + %9330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 9, !dbg !334 + %9331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 10, !dbg !334 + %9332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 11, !dbg !334 + %9333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 12, !dbg !334 + %9334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 13, !dbg !334 + %9335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 14, !dbg !334 + %9336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 15, !dbg !334 + %9337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 16, !dbg !334 + %9338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 17, !dbg !334 + %9339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 18, !dbg !334 + %9340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 19, !dbg !334 + %9341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 20, !dbg !334 + %9342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 21, !dbg !334 + %9343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 22, !dbg !334 + %9344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 23, !dbg !334 + %9345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 24, !dbg !334 + %9346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 25, !dbg !334 + %9347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 26, !dbg !334 + %9348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 27, !dbg !334 + %9349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 28, !dbg !334 + %9350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 29, !dbg !334 + %9351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 30, !dbg !334 + %9352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 31, !dbg !334 + %9353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 32, !dbg !334 + %9354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 33, !dbg !334 + %9355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 34, !dbg !334 + %9356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 35, !dbg !334 + %9357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 36, !dbg !334 + %9358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 37, !dbg !334 + %9359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 38, !dbg !334 + %9360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 39, !dbg !334 + %9361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 40, !dbg !334 + %9362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 41, !dbg !334 + %9363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 42, !dbg !334 + %9364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 43, !dbg !334 + %9365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 44, !dbg !334 + %9366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 45, !dbg !334 + %9367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 46, !dbg !334 + %9368 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 47, !dbg !334 + %9369 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 48, !dbg !334 + %9370 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 49, !dbg !334 + %9371 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 50, !dbg !334 + %9372 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 51, !dbg !334 + %9373 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 52, !dbg !334 + %9374 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 53, !dbg !334 + %9375 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 54, !dbg !334 + %9376 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 55, !dbg !334 + %9377 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 56, !dbg !334 + %9378 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 57, !dbg !334 + %9379 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 58, !dbg !334 + %9380 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 59, !dbg !334 + %9381 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 60, !dbg !334 + %9382 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 61, !dbg !334 + %9383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 62, !dbg !334 + %9384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9315, 63, !dbg !334 + %9385 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9321, float %9322, float %9323, float %9324, float %9325, float %9326, float %9327, float %9328, float %9329, float %9330, float %9331, float %9332, float %9333, float %9334, float %9335, float %9336, float %9337, float %9338, float %9339, float %9340, float %9341, float %9342, float %9343, float %9344, float %9345, float %9346, float %9347, float %9348, float %9349, float %9350, float %9351, float %9352, float %9353, float %9354, float %9355, float %9356, float %9357, float %9358, float %9359, float %9360, float %9361, float %9362, float %9363, float %9364, float %9365, float %9366, float %9367, float %9368, float %9369, float %9370, float %9371, float %9372, float %9373, float %9374, float %9375, float %9376, float %9377, float %9378, float %9379, float %9380, float %9381, float %9382, float %9383, float %9384, i32 %9232, i32 %9233, i32 %9234, i32 %9235, i64 %9320, i1 true) #3, !dbg !334 + %9386 = add i32 %9240, 6144, !dbg !334 + %9387 = lshr exact i32 %9386, 4, !dbg !334 + %9388 = and i32 %9387, 16383, !dbg !334 + %9389 = zext nneg i32 %9388 to i64, !dbg !334 + %9390 = or disjoint i64 %9389, 4611686293338849280, !dbg !334 + %9391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 0, !dbg !334 + %9392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 1, !dbg !334 + %9393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 2, !dbg !334 + %9394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 3, !dbg !334 + %9395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 4, !dbg !334 + %9396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 5, !dbg !334 + %9397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 6, !dbg !334 + %9398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 7, !dbg !334 + %9399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 8, !dbg !334 + %9400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 9, !dbg !334 + %9401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 10, !dbg !334 + %9402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 11, !dbg !334 + %9403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 12, !dbg !334 + %9404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 13, !dbg !334 + %9405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 14, !dbg !334 + %9406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 15, !dbg !334 + %9407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 16, !dbg !334 + %9408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 17, !dbg !334 + %9409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 18, !dbg !334 + %9410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 19, !dbg !334 + %9411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 20, !dbg !334 + %9412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 21, !dbg !334 + %9413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 22, !dbg !334 + %9414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 23, !dbg !334 + %9415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 24, !dbg !334 + %9416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 25, !dbg !334 + %9417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 26, !dbg !334 + %9418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 27, !dbg !334 + %9419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 28, !dbg !334 + %9420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 29, !dbg !334 + %9421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 30, !dbg !334 + %9422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 31, !dbg !334 + %9423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 32, !dbg !334 + %9424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 33, !dbg !334 + %9425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 34, !dbg !334 + %9426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 35, !dbg !334 + %9427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 36, !dbg !334 + %9428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 37, !dbg !334 + %9429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 38, !dbg !334 + %9430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 39, !dbg !334 + %9431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 40, !dbg !334 + %9432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 41, !dbg !334 + %9433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 42, !dbg !334 + %9434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 43, !dbg !334 + %9435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 44, !dbg !334 + %9436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 45, !dbg !334 + %9437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 46, !dbg !334 + %9438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 47, !dbg !334 + %9439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 48, !dbg !334 + %9440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 49, !dbg !334 + %9441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 50, !dbg !334 + %9442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 51, !dbg !334 + %9443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 52, !dbg !334 + %9444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 53, !dbg !334 + %9445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 54, !dbg !334 + %9446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 55, !dbg !334 + %9447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 56, !dbg !334 + %9448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 57, !dbg !334 + %9449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 58, !dbg !334 + %9450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 59, !dbg !334 + %9451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 60, !dbg !334 + %9452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 61, !dbg !334 + %9453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 62, !dbg !334 + %9454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9385, 63, !dbg !334 + %9455 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %9391, float %9392, float %9393, float %9394, float %9395, float %9396, float %9397, float %9398, float %9399, float %9400, float %9401, float %9402, float %9403, float %9404, float %9405, float %9406, float %9407, float %9408, float %9409, float %9410, float %9411, float %9412, float %9413, float %9414, float %9415, float %9416, float %9417, float %9418, float %9419, float %9420, float %9421, float %9422, float %9423, float %9424, float %9425, float %9426, float %9427, float %9428, float %9429, float %9430, float %9431, float %9432, float %9433, float %9434, float %9435, float %9436, float %9437, float %9438, float %9439, float %9440, float %9441, float %9442, float %9443, float %9444, float %9445, float %9446, float %9447, float %9448, float %9449, float %9450, float %9451, float %9452, float %9453, float %9454, i32 %9236, i32 %9237, i32 %9238, i32 %9239, i64 %9390, i1 true) #3, !dbg !334 + %9456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 0, !dbg !334 + %9457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 1, !dbg !334 + %9458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 2, !dbg !334 + %9459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 3, !dbg !334 + %9460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 4, !dbg !334 + %9461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 5, !dbg !334 + %9462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 6, !dbg !334 + %9463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 7, !dbg !334 + %9464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 8, !dbg !334 + %9465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 9, !dbg !334 + %9466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 10, !dbg !334 + %9467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 11, !dbg !334 + %9468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 12, !dbg !334 + %9469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 13, !dbg !334 + %9470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 14, !dbg !334 + %9471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 15, !dbg !334 + %9472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 16, !dbg !334 + %9473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 17, !dbg !334 + %9474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 18, !dbg !334 + %9475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 19, !dbg !334 + %9476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 20, !dbg !334 + %9477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 21, !dbg !334 + %9478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 22, !dbg !334 + %9479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 23, !dbg !334 + %9480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 24, !dbg !334 + %9481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 25, !dbg !334 + %9482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 26, !dbg !334 + %9483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 27, !dbg !334 + %9484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 28, !dbg !334 + %9485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 29, !dbg !334 + %9486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 30, !dbg !334 + %9487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 31, !dbg !334 + %9488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 32, !dbg !334 + %9489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 33, !dbg !334 + %9490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 34, !dbg !334 + %9491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 35, !dbg !334 + %9492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 36, !dbg !334 + %9493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 37, !dbg !334 + %9494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 38, !dbg !334 + %9495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 39, !dbg !334 + %9496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 40, !dbg !334 + %9497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 41, !dbg !334 + %9498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 42, !dbg !334 + %9499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 43, !dbg !334 + %9500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 44, !dbg !334 + %9501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 45, !dbg !334 + %9502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 46, !dbg !334 + %9503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 47, !dbg !334 + %9504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 48, !dbg !334 + %9505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 49, !dbg !334 + %9506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 50, !dbg !334 + %9507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 51, !dbg !334 + %9508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 52, !dbg !334 + %9509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 53, !dbg !334 + %9510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 54, !dbg !334 + %9511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 55, !dbg !334 + %9512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 56, !dbg !334 + %9513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 57, !dbg !334 + %9514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 58, !dbg !334 + %9515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 59, !dbg !334 + %9516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 60, !dbg !334 + %9517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 61, !dbg !334 + %9518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 62, !dbg !334 + %9519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9455, 63, !dbg !334 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !334 + %9520 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %8434, !dbg !320 + %9521 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5065, !dbg !320 + %9522 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5068, !dbg !320 + %9523 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5071, !dbg !320 + %9524 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5074, !dbg !320 + %9525 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5077, !dbg !320 + %9526 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5080, !dbg !320 + %9527 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5083, !dbg !320 + %9528 = getelementptr inbounds nuw i8, ptr addrspace(3) %9520, i32 %5086, !dbg !320 + %9529 = add i32 %8502, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9530 = lshr exact i32 %9529, 4, !dbg !335 + %9531 = and i32 %9530, 16383, !dbg !335 + %9532 = zext nneg i32 %9531 to i64, !dbg !335 + %9533 = or disjoint i64 %9532, 4611686293372403712, !dbg !335 + %9534 = add i32 %8514, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9535 = lshr exact i32 %9534, 4, !dbg !335 + %9536 = and i32 %9535, 16383, !dbg !335 + %9537 = zext nneg i32 %9536 to i64, !dbg !335 + %9538 = or disjoint i64 %9537, 4611686293372403712, !dbg !335 + %9539 = add i32 %9240, 32, !dbg !335 + %9540 = lshr exact i32 %9539, 4, !dbg !335 + %9541 = and i32 %9540, 16383, !dbg !335 + %9542 = zext nneg i32 %9541 to i64, !dbg !335 + %9543 = or disjoint i64 %9542, 4611686293338849280, !dbg !335 + %9544 = add i32 %8558, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9545 = lshr exact i32 %9544, 4, !dbg !335 + %9546 = and i32 %9545, 16383, !dbg !335 + %9547 = zext nneg i32 %9546 to i64, !dbg !335 + %9548 = or disjoint i64 %9547, 4611686293372403712, !dbg !335 + %9549 = add i32 %9240, 64, !dbg !335 + %9550 = lshr exact i32 %9549, 4, !dbg !335 + %9551 = and i32 %9550, 16383, !dbg !335 + %9552 = zext nneg i32 %9551 to i64, !dbg !335 + %9553 = or disjoint i64 %9552, 4611686293338849280, !dbg !335 + %9554 = add i32 %8602, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9555 = lshr exact i32 %9554, 4, !dbg !335 + %9556 = and i32 %9555, 16383, !dbg !335 + %9557 = zext nneg i32 %9556 to i64, !dbg !335 + %9558 = or disjoint i64 %9557, 4611686293372403712, !dbg !335 + %9559 = add i32 %9240, 96, !dbg !335 + %9560 = lshr exact i32 %9559, 4, !dbg !335 + %9561 = and i32 %9560, 16383, !dbg !335 + %9562 = zext nneg i32 %9561 to i64, !dbg !335 + %9563 = or disjoint i64 %9562, 4611686293338849280, !dbg !335 + %9564 = add i32 %8646, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9565 = lshr exact i32 %9564, 4, !dbg !335 + %9566 = and i32 %9565, 16383, !dbg !335 + %9567 = zext nneg i32 %9566 to i64, !dbg !335 + %9568 = or disjoint i64 %9567, 4611686293372403712, !dbg !335 + %9569 = add i32 %9240, 8192, !dbg !335 + %9570 = lshr exact i32 %9569, 4, !dbg !335 + %9571 = and i32 %9570, 16383, !dbg !335 + %9572 = zext nneg i32 %9571 to i64, !dbg !335 + %9573 = or disjoint i64 %9572, 4611686293338849280, !dbg !335 + %9574 = add i32 %8690, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9575 = lshr exact i32 %9574, 4, !dbg !335 + %9576 = and i32 %9575, 16383, !dbg !335 + %9577 = zext nneg i32 %9576 to i64, !dbg !335 + %9578 = or disjoint i64 %9577, 4611686293372403712, !dbg !335 + %9579 = add i32 %9240, 8224, !dbg !335 + %9580 = lshr exact i32 %9579, 4, !dbg !335 + %9581 = and i32 %9580, 16383, !dbg !335 + %9582 = zext nneg i32 %9581 to i64, !dbg !335 + %9583 = or disjoint i64 %9582, 4611686293338849280, !dbg !335 + %9584 = add i32 %8734, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9585 = lshr exact i32 %9584, 4, !dbg !335 + %9586 = and i32 %9585, 16383, !dbg !335 + %9587 = zext nneg i32 %9586 to i64, !dbg !335 + %9588 = or disjoint i64 %9587, 4611686293372403712, !dbg !335 + %9589 = add i32 %9240, 8256, !dbg !335 + %9590 = lshr exact i32 %9589, 4, !dbg !335 + %9591 = and i32 %9590, 16383, !dbg !335 + %9592 = zext nneg i32 %9591 to i64, !dbg !335 + %9593 = or disjoint i64 %9592, 4611686293338849280, !dbg !335 + %9594 = add i32 %8778, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288) to i32), !dbg !335 + %9595 = lshr exact i32 %9594, 4, !dbg !335 + %9596 = and i32 %9595, 16383, !dbg !335 + %9597 = zext nneg i32 %9596 to i64, !dbg !335 + %9598 = or disjoint i64 %9597, 4611686293372403712, !dbg !335 + %9599 = add i32 %9240, 8288, !dbg !335 + %9600 = lshr exact i32 %9599, 4, !dbg !335 + %9601 = and i32 %9600, 16383, !dbg !335 + %9602 = zext nneg i32 %9601 to i64, !dbg !335 + %9603 = or disjoint i64 %9602, 4611686293338849280, !dbg !335 + %9604 = load <2 x float>, ptr addrspace(3) %9528, align 8, !dbg !320 + %9605 = load <2 x float>, ptr addrspace(3) %9527, align 8, !dbg !320 + %9606 = load <2 x float>, ptr addrspace(3) %9526, align 8, !dbg !320 + %9607 = load <2 x float>, ptr addrspace(3) %9525, align 8, !dbg !320 + %9608 = load <2 x float>, ptr addrspace(3) %9524, align 8, !dbg !320 + %9609 = load <2 x float>, ptr addrspace(3) %9523, align 8, !dbg !320 + %9610 = load <2 x float>, ptr addrspace(3) %9522, align 8, !dbg !320 + %9611 = load <2 x float>, ptr addrspace(3) %9521, align 8, !dbg !320 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !335 + %9612 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %9533, i64 %9244) #3, !dbg !335 + %9613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 0, !dbg !335 + %9614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 1, !dbg !335 + %9615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 2, !dbg !335 + %9616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 3, !dbg !335 + %9617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 4, !dbg !335 + %9618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 5, !dbg !335 + %9619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 6, !dbg !335 + %9620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 7, !dbg !335 + %9621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 8, !dbg !335 + %9622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 9, !dbg !335 + %9623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 10, !dbg !335 + %9624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 11, !dbg !335 + %9625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 12, !dbg !335 + %9626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 13, !dbg !335 + %9627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 14, !dbg !335 + %9628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 15, !dbg !335 + %9629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 16, !dbg !335 + %9630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 17, !dbg !335 + %9631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 18, !dbg !335 + %9632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 19, !dbg !335 + %9633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 20, !dbg !335 + %9634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 21, !dbg !335 + %9635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 22, !dbg !335 + %9636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 23, !dbg !335 + %9637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 24, !dbg !335 + %9638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 25, !dbg !335 + %9639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 26, !dbg !335 + %9640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 27, !dbg !335 + %9641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 28, !dbg !335 + %9642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 29, !dbg !335 + %9643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 30, !dbg !335 + %9644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9612, 31, !dbg !335 + %9645 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9613, float %9614, float %9615, float %9616, float %9617, float %9618, float %9619, float %9620, float %9621, float %9622, float %9623, float %9624, float %9625, float %9626, float %9627, float %9628, float %9629, float %9630, float %9631, float %9632, float %9633, float %9634, float %9635, float %9636, float %9637, float %9638, float %9639, float %9640, float %9641, float %9642, float %9643, float %9644, i64 %9538, i64 %9543, i1 true) #3, !dbg !335 + %9646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 0, !dbg !335 + %9647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 1, !dbg !335 + %9648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 2, !dbg !335 + %9649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 3, !dbg !335 + %9650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 4, !dbg !335 + %9651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 5, !dbg !335 + %9652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 6, !dbg !335 + %9653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 7, !dbg !335 + %9654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 8, !dbg !335 + %9655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 9, !dbg !335 + %9656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 10, !dbg !335 + %9657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 11, !dbg !335 + %9658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 12, !dbg !335 + %9659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 13, !dbg !335 + %9660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 14, !dbg !335 + %9661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 15, !dbg !335 + %9662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 16, !dbg !335 + %9663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 17, !dbg !335 + %9664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 18, !dbg !335 + %9665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 19, !dbg !335 + %9666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 20, !dbg !335 + %9667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 21, !dbg !335 + %9668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 22, !dbg !335 + %9669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 23, !dbg !335 + %9670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 24, !dbg !335 + %9671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 25, !dbg !335 + %9672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 26, !dbg !335 + %9673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 27, !dbg !335 + %9674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 28, !dbg !335 + %9675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 29, !dbg !335 + %9676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 30, !dbg !335 + %9677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9645, 31, !dbg !335 + %9678 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9646, float %9647, float %9648, float %9649, float %9650, float %9651, float %9652, float %9653, float %9654, float %9655, float %9656, float %9657, float %9658, float %9659, float %9660, float %9661, float %9662, float %9663, float %9664, float %9665, float %9666, float %9667, float %9668, float %9669, float %9670, float %9671, float %9672, float %9673, float %9674, float %9675, float %9676, float %9677, i64 %9548, i64 %9553, i1 true) #3, !dbg !335 + %9679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 0, !dbg !335 + %9680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 1, !dbg !335 + %9681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 2, !dbg !335 + %9682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 3, !dbg !335 + %9683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 4, !dbg !335 + %9684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 5, !dbg !335 + %9685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 6, !dbg !335 + %9686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 7, !dbg !335 + %9687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 8, !dbg !335 + %9688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 9, !dbg !335 + %9689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 10, !dbg !335 + %9690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 11, !dbg !335 + %9691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 12, !dbg !335 + %9692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 13, !dbg !335 + %9693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 14, !dbg !335 + %9694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 15, !dbg !335 + %9695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 16, !dbg !335 + %9696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 17, !dbg !335 + %9697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 18, !dbg !335 + %9698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 19, !dbg !335 + %9699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 20, !dbg !335 + %9700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 21, !dbg !335 + %9701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 22, !dbg !335 + %9702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 23, !dbg !335 + %9703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 24, !dbg !335 + %9704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 25, !dbg !335 + %9705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 26, !dbg !335 + %9706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 27, !dbg !335 + %9707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 28, !dbg !335 + %9708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 29, !dbg !335 + %9709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 30, !dbg !335 + %9710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9678, 31, !dbg !335 + %9711 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9679, float %9680, float %9681, float %9682, float %9683, float %9684, float %9685, float %9686, float %9687, float %9688, float %9689, float %9690, float %9691, float %9692, float %9693, float %9694, float %9695, float %9696, float %9697, float %9698, float %9699, float %9700, float %9701, float %9702, float %9703, float %9704, float %9705, float %9706, float %9707, float %9708, float %9709, float %9710, i64 %9558, i64 %9563, i1 true) #3, !dbg !335 + %9712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 0, !dbg !335 + %9713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 1, !dbg !335 + %9714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 2, !dbg !335 + %9715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 3, !dbg !335 + %9716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 4, !dbg !335 + %9717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 5, !dbg !335 + %9718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 6, !dbg !335 + %9719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 7, !dbg !335 + %9720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 8, !dbg !335 + %9721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 9, !dbg !335 + %9722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 10, !dbg !335 + %9723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 11, !dbg !335 + %9724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 12, !dbg !335 + %9725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 13, !dbg !335 + %9726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 14, !dbg !335 + %9727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 15, !dbg !335 + %9728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 16, !dbg !335 + %9729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 17, !dbg !335 + %9730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 18, !dbg !335 + %9731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 19, !dbg !335 + %9732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 20, !dbg !335 + %9733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 21, !dbg !335 + %9734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 22, !dbg !335 + %9735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 23, !dbg !335 + %9736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 24, !dbg !335 + %9737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 25, !dbg !335 + %9738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 26, !dbg !335 + %9739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 27, !dbg !335 + %9740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 28, !dbg !335 + %9741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 29, !dbg !335 + %9742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 30, !dbg !335 + %9743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9711, 31, !dbg !335 + %9744 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9712, float %9713, float %9714, float %9715, float %9716, float %9717, float %9718, float %9719, float %9720, float %9721, float %9722, float %9723, float %9724, float %9725, float %9726, float %9727, float %9728, float %9729, float %9730, float %9731, float %9732, float %9733, float %9734, float %9735, float %9736, float %9737, float %9738, float %9739, float %9740, float %9741, float %9742, float %9743, i64 %9568, i64 %9573, i1 true) #3, !dbg !335 + %9745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 0, !dbg !335 + %9746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 1, !dbg !335 + %9747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 2, !dbg !335 + %9748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 3, !dbg !335 + %9749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 4, !dbg !335 + %9750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 5, !dbg !335 + %9751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 6, !dbg !335 + %9752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 7, !dbg !335 + %9753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 8, !dbg !335 + %9754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 9, !dbg !335 + %9755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 10, !dbg !335 + %9756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 11, !dbg !335 + %9757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 12, !dbg !335 + %9758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 13, !dbg !335 + %9759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 14, !dbg !335 + %9760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 15, !dbg !335 + %9761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 16, !dbg !335 + %9762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 17, !dbg !335 + %9763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 18, !dbg !335 + %9764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 19, !dbg !335 + %9765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 20, !dbg !335 + %9766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 21, !dbg !335 + %9767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 22, !dbg !335 + %9768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 23, !dbg !335 + %9769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 24, !dbg !335 + %9770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 25, !dbg !335 + %9771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 26, !dbg !335 + %9772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 27, !dbg !335 + %9773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 28, !dbg !335 + %9774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 29, !dbg !335 + %9775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 30, !dbg !335 + %9776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9744, 31, !dbg !335 + %9777 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9745, float %9746, float %9747, float %9748, float %9749, float %9750, float %9751, float %9752, float %9753, float %9754, float %9755, float %9756, float %9757, float %9758, float %9759, float %9760, float %9761, float %9762, float %9763, float %9764, float %9765, float %9766, float %9767, float %9768, float %9769, float %9770, float %9771, float %9772, float %9773, float %9774, float %9775, float %9776, i64 %9578, i64 %9583, i1 true) #3, !dbg !335 + %9778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 0, !dbg !335 + %9779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 1, !dbg !335 + %9780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 2, !dbg !335 + %9781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 3, !dbg !335 + %9782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 4, !dbg !335 + %9783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 5, !dbg !335 + %9784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 6, !dbg !335 + %9785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 7, !dbg !335 + %9786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 8, !dbg !335 + %9787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 9, !dbg !335 + %9788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 10, !dbg !335 + %9789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 11, !dbg !335 + %9790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 12, !dbg !335 + %9791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 13, !dbg !335 + %9792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 14, !dbg !335 + %9793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 15, !dbg !335 + %9794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 16, !dbg !335 + %9795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 17, !dbg !335 + %9796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 18, !dbg !335 + %9797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 19, !dbg !335 + %9798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 20, !dbg !335 + %9799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 21, !dbg !335 + %9800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 22, !dbg !335 + %9801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 23, !dbg !335 + %9802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 24, !dbg !335 + %9803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 25, !dbg !335 + %9804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 26, !dbg !335 + %9805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 27, !dbg !335 + %9806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 28, !dbg !335 + %9807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 29, !dbg !335 + %9808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 30, !dbg !335 + %9809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9777, 31, !dbg !335 + %9810 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9778, float %9779, float %9780, float %9781, float %9782, float %9783, float %9784, float %9785, float %9786, float %9787, float %9788, float %9789, float %9790, float %9791, float %9792, float %9793, float %9794, float %9795, float %9796, float %9797, float %9798, float %9799, float %9800, float %9801, float %9802, float %9803, float %9804, float %9805, float %9806, float %9807, float %9808, float %9809, i64 %9588, i64 %9593, i1 true) #3, !dbg !335 + %9811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 0, !dbg !335 + %9812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 1, !dbg !335 + %9813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 2, !dbg !335 + %9814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 3, !dbg !335 + %9815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 4, !dbg !335 + %9816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 5, !dbg !335 + %9817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 6, !dbg !335 + %9818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 7, !dbg !335 + %9819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 8, !dbg !335 + %9820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 9, !dbg !335 + %9821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 10, !dbg !335 + %9822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 11, !dbg !335 + %9823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 12, !dbg !335 + %9824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 13, !dbg !335 + %9825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 14, !dbg !335 + %9826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 15, !dbg !335 + %9827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 16, !dbg !335 + %9828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 17, !dbg !335 + %9829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 18, !dbg !335 + %9830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 19, !dbg !335 + %9831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 20, !dbg !335 + %9832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 21, !dbg !335 + %9833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 22, !dbg !335 + %9834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 23, !dbg !335 + %9835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 24, !dbg !335 + %9836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 25, !dbg !335 + %9837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 26, !dbg !335 + %9838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 27, !dbg !335 + %9839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 28, !dbg !335 + %9840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 29, !dbg !335 + %9841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 30, !dbg !335 + %9842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9810, 31, !dbg !335 + %9843 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %9811, float %9812, float %9813, float %9814, float %9815, float %9816, float %9817, float %9818, float %9819, float %9820, float %9821, float %9822, float %9823, float %9824, float %9825, float %9826, float %9827, float %9828, float %9829, float %9830, float %9831, float %9832, float %9833, float %9834, float %9835, float %9836, float %9837, float %9838, float %9839, float %9840, float %9841, float %9842, i64 %9598, i64 %9603, i1 true) #3, !dbg !335 + %9844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 0, !dbg !335 + %9845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 1, !dbg !335 + %9846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 2, !dbg !335 + %9847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 3, !dbg !335 + %9848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 4, !dbg !335 + %9849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 5, !dbg !335 + %9850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 6, !dbg !335 + %9851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 7, !dbg !335 + %9852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 8, !dbg !335 + %9853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 9, !dbg !335 + %9854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 10, !dbg !335 + %9855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 11, !dbg !335 + %9856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 12, !dbg !335 + %9857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 13, !dbg !335 + %9858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 14, !dbg !335 + %9859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 15, !dbg !335 + %9860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 16, !dbg !335 + %9861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 17, !dbg !335 + %9862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 18, !dbg !335 + %9863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 19, !dbg !335 + %9864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 20, !dbg !335 + %9865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 21, !dbg !335 + %9866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 22, !dbg !335 + %9867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 23, !dbg !335 + %9868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 24, !dbg !335 + %9869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 25, !dbg !335 + %9870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 26, !dbg !335 + %9871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 27, !dbg !335 + %9872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 28, !dbg !335 + %9873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 29, !dbg !335 + %9874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 30, !dbg !335 + %9875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %9843, 31, !dbg !335 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !335 + %9876 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"(float %9844, float %9845, float %9846, float %9847, float %9848, float %9849, float %9850, float %9851, float %9852, float %9853, float %9854, float %9855, float %9856, float %9857, float %9858, float %9859, float %9860, float %9861, float %9862, float %9863, float %9864, float %9865, float %9866, float %9867, float %9868, float %9869, float %9870, float %9871, float %9872, float %9873, float %9874, float %9875, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 140288), i32 0, i32 0, ptr addrspace(3) %9175, i32 0, i32 0) #3, !dbg !335 + %9877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 0, !dbg !335 + %9878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 1, !dbg !335 + %9879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 2, !dbg !335 + %9880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 3, !dbg !335 + %9881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 4, !dbg !335 + %9882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 5, !dbg !335 + %9883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 6, !dbg !335 + %9884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 7, !dbg !335 + %9885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 8, !dbg !335 + %9886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 9, !dbg !335 + %9887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 10, !dbg !335 + %9888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 11, !dbg !335 + %9889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 12, !dbg !335 + %9890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 13, !dbg !335 + %9891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 14, !dbg !335 + %9892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 15, !dbg !335 + %9893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 16, !dbg !335 + %9894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 17, !dbg !335 + %9895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 18, !dbg !335 + %9896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 19, !dbg !335 + %9897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 20, !dbg !335 + %9898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 21, !dbg !335 + %9899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 22, !dbg !335 + %9900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 23, !dbg !335 + %9901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 24, !dbg !335 + %9902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 25, !dbg !335 + %9903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 26, !dbg !335 + %9904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 27, !dbg !335 + %9905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 28, !dbg !335 + %9906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 29, !dbg !335 + %9907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 30, !dbg !335 + %9908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32 } %9876, 31, !dbg !335 + %9909 = insertelement <2 x float> poison, float %9879, i64 0, !dbg !336 + %9910 = insertelement <2 x float> %9909, float %9880, i64 1, !dbg !336 + %9911 = fsub <2 x float> %9910, %9611, !dbg !336 + %9912 = insertelement <2 x float> poison, float %9883, i64 0, !dbg !336 + %9913 = insertelement <2 x float> %9912, float %9884, i64 1, !dbg !336 + %9914 = fsub <2 x float> %9913, %9610, !dbg !336 + %9915 = insertelement <2 x float> poison, float %9887, i64 0, !dbg !336 + %9916 = insertelement <2 x float> %9915, float %9888, i64 1, !dbg !336 + %9917 = fsub <2 x float> %9916, %9609, !dbg !336 + %9918 = insertelement <2 x float> poison, float %9891, i64 0, !dbg !336 + %9919 = insertelement <2 x float> %9918, float %9892, i64 1, !dbg !336 + %9920 = fsub <2 x float> %9919, %9608, !dbg !336 + %9921 = insertelement <2 x float> poison, float %9895, i64 0, !dbg !336 + %9922 = insertelement <2 x float> %9921, float %9896, i64 1, !dbg !336 + %9923 = fsub <2 x float> %9922, %9607, !dbg !336 + %9924 = insertelement <2 x float> poison, float %9899, i64 0, !dbg !336 + %9925 = insertelement <2 x float> %9924, float %9900, i64 1, !dbg !336 + %9926 = fsub <2 x float> %9925, %9606, !dbg !336 + %9927 = insertelement <2 x float> poison, float %9903, i64 0, !dbg !336 + %9928 = insertelement <2 x float> %9927, float %9904, i64 1, !dbg !336 + %9929 = fsub <2 x float> %9928, %9605, !dbg !336 + %9930 = insertelement <2 x float> poison, float %9907, i64 0, !dbg !336 + %9931 = insertelement <2 x float> %9930, float %9908, i64 1, !dbg !336 + %9932 = fsub <2 x float> %9931, %9604, !dbg !336 + %9933 = fmul <2 x float> %9180, %9911, !dbg !337 + %9934 = fmul <2 x float> %9186, %9914, !dbg !337 + %9935 = fmul <2 x float> %9192, %9917, !dbg !337 + %9936 = fmul <2 x float> %9198, %9920, !dbg !337 + %9937 = fmul <2 x float> %9204, %9923, !dbg !337 + %9938 = fmul <2 x float> %9210, %9926, !dbg !337 + %9939 = fmul <2 x float> %9216, %9929, !dbg !337 + %9940 = fmul <2 x float> %9222, %9932, !dbg !337 + %9941 = insertelement <2 x float> poison, float %9877, i64 0, !dbg !336 + %9942 = insertelement <2 x float> %9941, float %9878, i64 1, !dbg !336 + %9943 = fsub <2 x float> %9942, %9611, !dbg !336 + %9944 = fmul <2 x float> %9177, %9943, !dbg !337 + %9945 = fptrunc <2 x float> %9944 to <2 x bfloat>, !dbg !338 + %9946 = shufflevector <32 x i1> %8392, <32 x i1> poison, <2 x i32> , !dbg !339 + %9947 = select <2 x i1> %9946, <2 x bfloat> %9945, <2 x bfloat> zeroinitializer, !dbg !339 + %9948 = fptrunc <2 x float> %9933 to <2 x bfloat>, !dbg !338 + %9949 = shufflevector <32 x i1> %8392, <32 x i1> poison, <2 x i32> , !dbg !339 + %9950 = select <2 x i1> %9949, <2 x bfloat> %9948, <2 x bfloat> zeroinitializer, !dbg !339 + %9951 = insertelement <2 x float> poison, float %9881, i64 0, !dbg !336 + %9952 = insertelement <2 x float> %9951, float %9882, i64 1, !dbg !336 + %9953 = fsub <2 x float> %9952, %9610, !dbg !336 + %9954 = fmul <2 x float> %9183, %9953, !dbg !337 + %9955 = fptrunc <2 x float> %9954 to <2 x bfloat>, !dbg !338 + %9956 = shufflevector <32 x i1> %8393, <32 x i1> poison, <2 x i32> , !dbg !339 + %9957 = select <2 x i1> %9956, <2 x bfloat> %9955, <2 x bfloat> zeroinitializer, !dbg !339 + %9958 = fptrunc <2 x float> %9934 to <2 x bfloat>, !dbg !338 + %9959 = shufflevector <32 x i1> %8393, <32 x i1> poison, <2 x i32> , !dbg !339 + %9960 = select <2 x i1> %9959, <2 x bfloat> %9958, <2 x bfloat> zeroinitializer, !dbg !339 + %9961 = insertelement <2 x float> poison, float %9885, i64 0, !dbg !336 + %9962 = insertelement <2 x float> %9961, float %9886, i64 1, !dbg !336 + %9963 = fsub <2 x float> %9962, %9609, !dbg !336 + %9964 = fmul <2 x float> %9189, %9963, !dbg !337 + %9965 = fptrunc <2 x float> %9964 to <2 x bfloat>, !dbg !338 + %9966 = shufflevector <32 x i1> %8394, <32 x i1> poison, <2 x i32> , !dbg !339 + %9967 = select <2 x i1> %9966, <2 x bfloat> %9965, <2 x bfloat> zeroinitializer, !dbg !339 + %9968 = fptrunc <2 x float> %9935 to <2 x bfloat>, !dbg !338 + %9969 = shufflevector <32 x i1> %8394, <32 x i1> poison, <2 x i32> , !dbg !339 + %9970 = select <2 x i1> %9969, <2 x bfloat> %9968, <2 x bfloat> zeroinitializer, !dbg !339 + %9971 = insertelement <2 x float> poison, float %9889, i64 0, !dbg !336 + %9972 = insertelement <2 x float> %9971, float %9890, i64 1, !dbg !336 + %9973 = fsub <2 x float> %9972, %9608, !dbg !336 + %9974 = fmul <2 x float> %9195, %9973, !dbg !337 + %9975 = fptrunc <2 x float> %9974 to <2 x bfloat>, !dbg !338 + %9976 = shufflevector <32 x i1> %8395, <32 x i1> poison, <2 x i32> , !dbg !339 + %9977 = select <2 x i1> %9976, <2 x bfloat> %9975, <2 x bfloat> zeroinitializer, !dbg !339 + %9978 = fptrunc <2 x float> %9936 to <2 x bfloat>, !dbg !338 + %9979 = shufflevector <32 x i1> %8395, <32 x i1> poison, <2 x i32> , !dbg !339 + %9980 = select <2 x i1> %9979, <2 x bfloat> %9978, <2 x bfloat> zeroinitializer, !dbg !339 + %9981 = insertelement <2 x float> poison, float %9893, i64 0, !dbg !336 + %9982 = insertelement <2 x float> %9981, float %9894, i64 1, !dbg !336 + %9983 = fsub <2 x float> %9982, %9607, !dbg !336 + %9984 = fmul <2 x float> %9201, %9983, !dbg !337 + %9985 = fptrunc <2 x float> %9984 to <2 x bfloat>, !dbg !338 + %9986 = shufflevector <32 x i1> %8396, <32 x i1> poison, <2 x i32> , !dbg !339 + %9987 = select <2 x i1> %9986, <2 x bfloat> %9985, <2 x bfloat> zeroinitializer, !dbg !339 + %9988 = fptrunc <2 x float> %9937 to <2 x bfloat>, !dbg !338 + %9989 = shufflevector <32 x i1> %8396, <32 x i1> poison, <2 x i32> , !dbg !339 + %9990 = select <2 x i1> %9989, <2 x bfloat> %9988, <2 x bfloat> zeroinitializer, !dbg !339 + %9991 = insertelement <2 x float> poison, float %9897, i64 0, !dbg !336 + %9992 = insertelement <2 x float> %9991, float %9898, i64 1, !dbg !336 + %9993 = fsub <2 x float> %9992, %9606, !dbg !336 + %9994 = fmul <2 x float> %9207, %9993, !dbg !337 + %9995 = fptrunc <2 x float> %9994 to <2 x bfloat>, !dbg !338 + %9996 = shufflevector <32 x i1> %8397, <32 x i1> poison, <2 x i32> , !dbg !339 + %9997 = select <2 x i1> %9996, <2 x bfloat> %9995, <2 x bfloat> zeroinitializer, !dbg !339 + %9998 = fptrunc <2 x float> %9938 to <2 x bfloat>, !dbg !338 + %9999 = shufflevector <32 x i1> %8397, <32 x i1> poison, <2 x i32> , !dbg !339 + %10000 = select <2 x i1> %9999, <2 x bfloat> %9998, <2 x bfloat> zeroinitializer, !dbg !339 + %10001 = insertelement <2 x float> poison, float %9901, i64 0, !dbg !336 + %10002 = insertelement <2 x float> %10001, float %9902, i64 1, !dbg !336 + %10003 = fsub <2 x float> %10002, %9605, !dbg !336 + %10004 = fmul <2 x float> %9213, %10003, !dbg !337 + %10005 = fptrunc <2 x float> %10004 to <2 x bfloat>, !dbg !338 + %10006 = shufflevector <32 x i1> %8398, <32 x i1> poison, <2 x i32> , !dbg !339 + %10007 = select <2 x i1> %10006, <2 x bfloat> %10005, <2 x bfloat> zeroinitializer, !dbg !339 + %10008 = fptrunc <2 x float> %9939 to <2 x bfloat>, !dbg !338 + %10009 = shufflevector <32 x i1> %8398, <32 x i1> poison, <2 x i32> , !dbg !339 + %10010 = select <2 x i1> %10009, <2 x bfloat> %10008, <2 x bfloat> zeroinitializer, !dbg !339 + %10011 = insertelement <2 x float> poison, float %9905, i64 0, !dbg !336 + %10012 = insertelement <2 x float> %10011, float %9906, i64 1, !dbg !336 + %10013 = fsub <2 x float> %10012, %9604, !dbg !336 + %10014 = fmul <2 x float> %9219, %10013, !dbg !337 + %10015 = fptrunc <2 x float> %10014 to <2 x bfloat>, !dbg !338 + %10016 = shufflevector <32 x i1> %8399, <32 x i1> poison, <2 x i32> , !dbg !339 + %10017 = select <2 x i1> %10016, <2 x bfloat> %10015, <2 x bfloat> zeroinitializer, !dbg !339 + %10018 = fptrunc <2 x float> %9940 to <2 x bfloat>, !dbg !338 + %10019 = shufflevector <32 x i1> %8399, <32 x i1> poison, <2 x i32> , !dbg !339 + %10020 = select <2 x i1> %10019, <2 x bfloat> %10018, <2 x bfloat> zeroinitializer, !dbg !339 + %10021 = bitcast <2 x bfloat> %9947 to i32, !dbg !340 + %10022 = bitcast <2 x bfloat> %9950 to i32, !dbg !340 + %10023 = bitcast <2 x bfloat> %9957 to i32, !dbg !340 + %10024 = bitcast <2 x bfloat> %9960 to i32, !dbg !340 + %10025 = bitcast <2 x bfloat> %9967 to i32, !dbg !340 + %10026 = bitcast <2 x bfloat> %9970 to i32, !dbg !340 + %10027 = bitcast <2 x bfloat> %9977 to i32, !dbg !340 + %10028 = bitcast <2 x bfloat> %9980 to i32, !dbg !340 + %10029 = bitcast <2 x bfloat> %9987 to i32, !dbg !340 + %10030 = bitcast <2 x bfloat> %9990 to i32, !dbg !340 + %10031 = bitcast <2 x bfloat> %9997 to i32, !dbg !340 + %10032 = bitcast <2 x bfloat> %10000 to i32, !dbg !340 + %10033 = bitcast <2 x bfloat> %10007 to i32, !dbg !340 + %10034 = bitcast <2 x bfloat> %10010 to i32, !dbg !340 + %10035 = bitcast <2 x bfloat> %10017 to i32, !dbg !340 + %10036 = bitcast <2 x bfloat> %10020 to i32, !dbg !340 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !340 + %10037 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %.pn3851673, float %.pn3831674, float %.pn3811675, float %.pn3791676, float %.pn3771677, float %.pn3751678, float %.pn3731679, float %.pn3711680, float %.pn3691681, float %.pn3671682, float %.pn3651683, float %.pn3631684, float %.pn3611685, float %.pn3591686, float %.pn3571687, float %.pn3551688, float %.pn3531689, float %.pn3511690, float %.pn3491691, float %.pn3471692, float %.pn3451693, float %.pn3431694, float %.pn3411695, float %.pn3391696, float %.pn3371697, float %.pn3351698, float %.pn3331699, float %.pn3311700, float %.pn3291701, float %.pn3271702, float %.pn3251703, float %.pn3231704, float %.pn3211705, float %.pn3191706, float %.pn3171707, float %.pn3151708, float %.pn3131709, float %.pn3111710, float %.pn3091711, float %.pn3071712, float %.pn3051713, float %.pn3031714, float %.pn3011715, float %.pn2991716, float %.pn2971717, float %.pn2951718, float %.pn2931719, float %.pn2911720, float %.pn2891721, float %.pn2871722, float %.pn2851723, float %.pn2831724, float %.pn2811725, float %.pn2791726, float %.pn2771727, float %.pn2751728, float %.pn2731729, float %.pn2711730, float %.pn2691731, float %.pn2671732, float %.pn2651733, float %.pn2631734, float %.pn2611735, float %.pn2591736, i32 %10021, i32 %10022, i32 %10023, i32 %10024, i64 %8512, i1 true) #3, !dbg !340 + %10038 = add i32 %8508, 2048, !dbg !340 + %10039 = lshr exact i32 %10038, 4, !dbg !340 + %10040 = and i32 %10039, 16383, !dbg !340 + %10041 = zext nneg i32 %10040 to i64, !dbg !340 + %10042 = or disjoint i64 %10041, 4611686293338849280, !dbg !340 + %10043 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 0, !dbg !340 + %10044 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 1, !dbg !340 + %10045 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 2, !dbg !340 + %10046 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 3, !dbg !340 + %10047 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 4, !dbg !340 + %10048 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 5, !dbg !340 + %10049 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 6, !dbg !340 + %10050 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 7, !dbg !340 + %10051 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 8, !dbg !340 + %10052 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 9, !dbg !340 + %10053 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 10, !dbg !340 + %10054 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 11, !dbg !340 + %10055 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 12, !dbg !340 + %10056 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 13, !dbg !340 + %10057 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 14, !dbg !340 + %10058 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 15, !dbg !340 + %10059 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 16, !dbg !340 + %10060 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 17, !dbg !340 + %10061 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 18, !dbg !340 + %10062 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 19, !dbg !340 + %10063 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 20, !dbg !340 + %10064 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 21, !dbg !340 + %10065 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 22, !dbg !340 + %10066 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 23, !dbg !340 + %10067 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 24, !dbg !340 + %10068 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 25, !dbg !340 + %10069 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 26, !dbg !340 + %10070 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 27, !dbg !340 + %10071 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 28, !dbg !340 + %10072 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 29, !dbg !340 + %10073 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 30, !dbg !340 + %10074 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 31, !dbg !340 + %10075 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 32, !dbg !340 + %10076 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 33, !dbg !340 + %10077 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 34, !dbg !340 + %10078 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 35, !dbg !340 + %10079 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 36, !dbg !340 + %10080 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 37, !dbg !340 + %10081 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 38, !dbg !340 + %10082 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 39, !dbg !340 + %10083 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 40, !dbg !340 + %10084 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 41, !dbg !340 + %10085 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 42, !dbg !340 + %10086 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 43, !dbg !340 + %10087 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 44, !dbg !340 + %10088 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 45, !dbg !340 + %10089 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 46, !dbg !340 + %10090 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 47, !dbg !340 + %10091 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 48, !dbg !340 + %10092 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 49, !dbg !340 + %10093 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 50, !dbg !340 + %10094 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 51, !dbg !340 + %10095 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 52, !dbg !340 + %10096 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 53, !dbg !340 + %10097 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 54, !dbg !340 + %10098 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 55, !dbg !340 + %10099 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 56, !dbg !340 + %10100 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 57, !dbg !340 + %10101 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 58, !dbg !340 + %10102 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 59, !dbg !340 + %10103 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 60, !dbg !340 + %10104 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 61, !dbg !340 + %10105 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 62, !dbg !340 + %10106 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10037, 63, !dbg !340 + %10107 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10043, float %10044, float %10045, float %10046, float %10047, float %10048, float %10049, float %10050, float %10051, float %10052, float %10053, float %10054, float %10055, float %10056, float %10057, float %10058, float %10059, float %10060, float %10061, float %10062, float %10063, float %10064, float %10065, float %10066, float %10067, float %10068, float %10069, float %10070, float %10071, float %10072, float %10073, float %10074, float %10075, float %10076, float %10077, float %10078, float %10079, float %10080, float %10081, float %10082, float %10083, float %10084, float %10085, float %10086, float %10087, float %10088, float %10089, float %10090, float %10091, float %10092, float %10093, float %10094, float %10095, float %10096, float %10097, float %10098, float %10099, float %10100, float %10101, float %10102, float %10103, float %10104, float %10105, float %10106, i32 %10025, i32 %10026, i32 %10027, i32 %10028, i64 %10042, i1 true) #3, !dbg !340 + %10108 = add i32 %8508, 4096, !dbg !340 + %10109 = lshr exact i32 %10108, 4, !dbg !340 + %10110 = and i32 %10109, 16383, !dbg !340 + %10111 = zext nneg i32 %10110 to i64, !dbg !340 + %10112 = or disjoint i64 %10111, 4611686293338849280, !dbg !340 + %10113 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 0, !dbg !340 + %10114 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 1, !dbg !340 + %10115 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 2, !dbg !340 + %10116 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 3, !dbg !340 + %10117 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 4, !dbg !340 + %10118 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 5, !dbg !340 + %10119 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 6, !dbg !340 + %10120 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 7, !dbg !340 + %10121 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 8, !dbg !340 + %10122 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 9, !dbg !340 + %10123 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 10, !dbg !340 + %10124 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 11, !dbg !340 + %10125 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 12, !dbg !340 + %10126 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 13, !dbg !340 + %10127 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 14, !dbg !340 + %10128 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 15, !dbg !340 + %10129 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 16, !dbg !340 + %10130 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 17, !dbg !340 + %10131 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 18, !dbg !340 + %10132 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 19, !dbg !340 + %10133 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 20, !dbg !340 + %10134 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 21, !dbg !340 + %10135 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 22, !dbg !340 + %10136 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 23, !dbg !340 + %10137 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 24, !dbg !340 + %10138 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 25, !dbg !340 + %10139 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 26, !dbg !340 + %10140 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 27, !dbg !340 + %10141 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 28, !dbg !340 + %10142 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 29, !dbg !340 + %10143 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 30, !dbg !340 + %10144 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 31, !dbg !340 + %10145 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 32, !dbg !340 + %10146 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 33, !dbg !340 + %10147 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 34, !dbg !340 + %10148 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 35, !dbg !340 + %10149 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 36, !dbg !340 + %10150 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 37, !dbg !340 + %10151 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 38, !dbg !340 + %10152 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 39, !dbg !340 + %10153 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 40, !dbg !340 + %10154 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 41, !dbg !340 + %10155 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 42, !dbg !340 + %10156 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 43, !dbg !340 + %10157 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 44, !dbg !340 + %10158 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 45, !dbg !340 + %10159 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 46, !dbg !340 + %10160 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 47, !dbg !340 + %10161 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 48, !dbg !340 + %10162 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 49, !dbg !340 + %10163 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 50, !dbg !340 + %10164 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 51, !dbg !340 + %10165 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 52, !dbg !340 + %10166 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 53, !dbg !340 + %10167 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 54, !dbg !340 + %10168 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 55, !dbg !340 + %10169 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 56, !dbg !340 + %10170 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 57, !dbg !340 + %10171 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 58, !dbg !340 + %10172 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 59, !dbg !340 + %10173 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 60, !dbg !340 + %10174 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 61, !dbg !340 + %10175 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 62, !dbg !340 + %10176 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10107, 63, !dbg !340 + %10177 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10113, float %10114, float %10115, float %10116, float %10117, float %10118, float %10119, float %10120, float %10121, float %10122, float %10123, float %10124, float %10125, float %10126, float %10127, float %10128, float %10129, float %10130, float %10131, float %10132, float %10133, float %10134, float %10135, float %10136, float %10137, float %10138, float %10139, float %10140, float %10141, float %10142, float %10143, float %10144, float %10145, float %10146, float %10147, float %10148, float %10149, float %10150, float %10151, float %10152, float %10153, float %10154, float %10155, float %10156, float %10157, float %10158, float %10159, float %10160, float %10161, float %10162, float %10163, float %10164, float %10165, float %10166, float %10167, float %10168, float %10169, float %10170, float %10171, float %10172, float %10173, float %10174, float %10175, float %10176, i32 %10029, i32 %10030, i32 %10031, i32 %10032, i64 %10112, i1 true) #3, !dbg !340 + %10178 = add i32 %8508, 6144, !dbg !340 + %10179 = lshr exact i32 %10178, 4, !dbg !340 + %10180 = and i32 %10179, 16383, !dbg !340 + %10181 = zext nneg i32 %10180 to i64, !dbg !340 + %10182 = or disjoint i64 %10181, 4611686293338849280, !dbg !340 + %10183 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 0, !dbg !340 + %10184 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 1, !dbg !340 + %10185 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 2, !dbg !340 + %10186 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 3, !dbg !340 + %10187 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 4, !dbg !340 + %10188 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 5, !dbg !340 + %10189 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 6, !dbg !340 + %10190 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 7, !dbg !340 + %10191 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 8, !dbg !340 + %10192 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 9, !dbg !340 + %10193 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 10, !dbg !340 + %10194 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 11, !dbg !340 + %10195 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 12, !dbg !340 + %10196 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 13, !dbg !340 + %10197 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 14, !dbg !340 + %10198 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 15, !dbg !340 + %10199 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 16, !dbg !340 + %10200 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 17, !dbg !340 + %10201 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 18, !dbg !340 + %10202 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 19, !dbg !340 + %10203 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 20, !dbg !340 + %10204 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 21, !dbg !340 + %10205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 22, !dbg !340 + %10206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 23, !dbg !340 + %10207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 24, !dbg !340 + %10208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 25, !dbg !340 + %10209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 26, !dbg !340 + %10210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 27, !dbg !340 + %10211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 28, !dbg !340 + %10212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 29, !dbg !340 + %10213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 30, !dbg !340 + %10214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 31, !dbg !340 + %10215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 32, !dbg !340 + %10216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 33, !dbg !340 + %10217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 34, !dbg !340 + %10218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 35, !dbg !340 + %10219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 36, !dbg !340 + %10220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 37, !dbg !340 + %10221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 38, !dbg !340 + %10222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 39, !dbg !340 + %10223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 40, !dbg !340 + %10224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 41, !dbg !340 + %10225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 42, !dbg !340 + %10226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 43, !dbg !340 + %10227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 44, !dbg !340 + %10228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 45, !dbg !340 + %10229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 46, !dbg !340 + %10230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 47, !dbg !340 + %10231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 48, !dbg !340 + %10232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 49, !dbg !340 + %10233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 50, !dbg !340 + %10234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 51, !dbg !340 + %10235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 52, !dbg !340 + %10236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 53, !dbg !340 + %10237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 54, !dbg !340 + %10238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 55, !dbg !340 + %10239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 56, !dbg !340 + %10240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 57, !dbg !340 + %10241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 58, !dbg !340 + %10242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 59, !dbg !340 + %10243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 60, !dbg !340 + %10244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 61, !dbg !340 + %10245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 62, !dbg !340 + %10246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10177, 63, !dbg !340 + %10247 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %10183, float %10184, float %10185, float %10186, float %10187, float %10188, float %10189, float %10190, float %10191, float %10192, float %10193, float %10194, float %10195, float %10196, float %10197, float %10198, float %10199, float %10200, float %10201, float %10202, float %10203, float %10204, float %10205, float %10206, float %10207, float %10208, float %10209, float %10210, float %10211, float %10212, float %10213, float %10214, float %10215, float %10216, float %10217, float %10218, float %10219, float %10220, float %10221, float %10222, float %10223, float %10224, float %10225, float %10226, float %10227, float %10228, float %10229, float %10230, float %10231, float %10232, float %10233, float %10234, float %10235, float %10236, float %10237, float %10238, float %10239, float %10240, float %10241, float %10242, float %10243, float %10244, float %10245, float %10246, i32 %10033, i32 %10034, i32 %10035, i32 %10036, i64 %10182, i1 true) #3, !dbg !340 + %10248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 0, !dbg !340 + %10249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 1, !dbg !340 + %10250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 2, !dbg !340 + %10251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 3, !dbg !340 + %10252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 4, !dbg !340 + %10253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 5, !dbg !340 + %10254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 6, !dbg !340 + %10255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 7, !dbg !340 + %10256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 8, !dbg !340 + %10257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 9, !dbg !340 + %10258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 10, !dbg !340 + %10259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 11, !dbg !340 + %10260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 12, !dbg !340 + %10261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 13, !dbg !340 + %10262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 14, !dbg !340 + %10263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 15, !dbg !340 + %10264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 16, !dbg !340 + %10265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 17, !dbg !340 + %10266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 18, !dbg !340 + %10267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 19, !dbg !340 + %10268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 20, !dbg !340 + %10269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 21, !dbg !340 + %10270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 22, !dbg !340 + %10271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 23, !dbg !340 + %10272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 24, !dbg !340 + %10273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 25, !dbg !340 + %10274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 26, !dbg !340 + %10275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 27, !dbg !340 + %10276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 28, !dbg !340 + %10277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 29, !dbg !340 + %10278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 30, !dbg !340 + %10279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 31, !dbg !340 + %10280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 32, !dbg !340 + %10281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 33, !dbg !340 + %10282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 34, !dbg !340 + %10283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 35, !dbg !340 + %10284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 36, !dbg !340 + %10285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 37, !dbg !340 + %10286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 38, !dbg !340 + %10287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 39, !dbg !340 + %10288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 40, !dbg !340 + %10289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 41, !dbg !340 + %10290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 42, !dbg !340 + %10291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 43, !dbg !340 + %10292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 44, !dbg !340 + %10293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 45, !dbg !340 + %10294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 46, !dbg !340 + %10295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 47, !dbg !340 + %10296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 48, !dbg !340 + %10297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 49, !dbg !340 + %10298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 50, !dbg !340 + %10299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 51, !dbg !340 + %10300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 52, !dbg !340 + %10301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 53, !dbg !340 + %10302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 54, !dbg !340 + %10303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 55, !dbg !340 + %10304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 56, !dbg !340 + %10305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 57, !dbg !340 + %10306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 58, !dbg !340 + %10307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 59, !dbg !340 + %10308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 60, !dbg !340 + %10309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 61, !dbg !340 + %10310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 62, !dbg !340 + %10311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10247, 63, !dbg !340 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !340 + %10312 = add i32 %8366, %.pn5211801, !dbg !341 + %10313 = add nuw nsw i32 %8379, 1, !dbg !323 + %10314 = lshr i32 %10313, 1, !dbg !342 + %10315 = zext nneg i32 %10314 to i64, !dbg !343 + %10316 = getelementptr i32, ptr addrspace(1) %4981, i64 %10315, !dbg !343 + %10317 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !344 + %10318 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %10316, i64 %10317, i1 %8381) #3, !dbg !344 + %10319 = add nuw nsw i32 %10314, 1, !dbg !345 + %10320 = icmp slt i32 %10319, %4985, !dbg !346 + %10321 = getelementptr i8, ptr addrspace(1) %10316, i64 4, !dbg !347 + %10322 = and i1 %8381, %10320, !dbg !323 + %10323 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #3, !dbg !348 + %10324 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %10321, i64 %10323, i1 %10322) #3, !dbg !348 + %10325 = and i32 %8379, 1, !dbg !349 + %10326 = sub i32 %10324, %10318, !dbg !350 + %10327 = shl i32 %10326, 7, !dbg !351 + %10328 = add i32 %10327, -64, !dbg !352 + %10329 = xor i32 %10325, 1, !dbg !353 + %10330 = mul nuw nsw i32 %10328, %10329, !dbg !353 + %10331 = shl nuw nsw i32 %10325, 6, !dbg !354 + %10332 = add i32 %10330, %10331, !dbg !355 + %10333 = shl i32 %10332, 12, !dbg !356 + %10334 = sext i32 %10333 to i64, !dbg !321 + %10335 = getelementptr bfloat, ptr addrspace(1) %.pn5851802, i64 %10334, !dbg !321 + %10336 = getelementptr bfloat, ptr addrspace(1) %.pn5691803, i64 %10334, !dbg !321 + %10337 = getelementptr bfloat, ptr addrspace(1) %.pn5531804, i64 %10334, !dbg !321 + %10338 = getelementptr bfloat, ptr addrspace(1) %.pn5371805, i64 %10334, !dbg !321 + %10339 = shl i32 %10332, 7, !dbg !357 + %10340 = sext i32 %10339 to i64, !dbg !322 + %10341 = getelementptr bfloat, ptr addrspace(1) %.pn6491806, i64 %10340, !dbg !322 + %10342 = getelementptr bfloat, ptr addrspace(1) %.pn6331807, i64 %10340, !dbg !322 + %10343 = getelementptr bfloat, ptr addrspace(1) %.pn6171808, i64 %10340, !dbg !322 + %10344 = getelementptr bfloat, ptr addrspace(1) %.pn6011809, i64 %10340, !dbg !322 + %10345 = add i32 %10332, %.pn6811810, !dbg !341 + %10346 = add i32 %10332, %.pn6771811, !dbg !341 + %10347 = add i32 %10332, %.pn6731812, !dbg !341 + %10348 = add i32 %10332, %.pn6691813, !dbg !341 + %10349 = add i32 %10332, %.pn6651814, !dbg !341 + %10350 = add i32 %10332, %.pn6611815, !dbg !341 + %10351 = add i32 %10332, %.pn6571816, !dbg !341 + %10352 = add i32 %10332, %.pn6531817, !dbg !341 + %10353 = add i32 %10332, %8375, !dbg !341 + %10354 = add i32 %10332, %8376, !dbg !341 + %10355 = add i32 %10332, %8377, !dbg !341 + %10356 = add i32 %10332, %8378, !dbg !341 + %10357 = add i32 %10332, %8371, !dbg !341 + %10358 = add i32 %10332, %8372, !dbg !341 + %10359 = add i32 %10332, %8373, !dbg !341 + %10360 = add i32 %10332, %8374, !dbg !341 + %10361 = add i32 %8368, 1, !dbg !323 + %10362 = icmp sgt i32 %10361, 1, !dbg !323 + %10363 = select i1 %10362, i32 0, i32 %10361, !dbg !323 + %10364 = add i32 %8370, 1, !dbg !323 + %10365 = icmp sgt i32 %10364, 2, !dbg !323 + %10366 = select i1 %10365, i32 0, i32 %10364, !dbg !323 + %10367 = icmp slt i32 %10353, 2048, !dbg !324 + %10368 = icmp slt i32 %10354, 2048, !dbg !324 + %10369 = icmp slt i32 %10355, 2048, !dbg !324 + %10370 = icmp slt i32 %10356, 2048, !dbg !324 + %10371 = shl i32 %10366, 13, !dbg !315 + %10372 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %10371, !dbg !315 + %10373 = and i1 %8380, %10367, !dbg !323 + %10374 = and i1 %8380, %10368, !dbg !323 + %10375 = and i1 %8380, %10369, !dbg !323 + %10376 = and i1 %8380, %10370, !dbg !323 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !315 + %10377 = getelementptr inbounds nuw i8, ptr addrspace(3) %10372, i32 %5027, !dbg !315 + %10378 = select i1 %10373, i32 16, i32 0, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %10377, ptr addrspace(1) %10335, i32 %10378) #3, !dbg !315 + %10379 = getelementptr inbounds nuw i8, ptr addrspace(3) %10372, i32 %5030, !dbg !315 + %10380 = select i1 %10374, i32 16, i32 0, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10379, ptr addrspace(1) %10336, i32 %10380) #3, !dbg !315 + %10381 = getelementptr inbounds nuw i8, ptr addrspace(3) %10372, i32 %5033, !dbg !315 + %10382 = select i1 %10375, i32 16, i32 0, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10381, ptr addrspace(1) %10337, i32 %10382) #3, !dbg !315 + %10383 = getelementptr inbounds nuw i8, ptr addrspace(3) %10372, i32 %5036, !dbg !315 + %10384 = select i1 %10376, i32 16, i32 0, !dbg !315 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10383, ptr addrspace(1) %10338, i32 %10384) #3, !dbg !315 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !315 + %10385 = icmp slt i32 %10345, 2048, !dbg !358 + %10386 = icmp slt i32 %10346, 2048, !dbg !358 + %10387 = icmp slt i32 %10347, 2048, !dbg !358 + %10388 = icmp slt i32 %10348, 2048, !dbg !358 + %10389 = icmp slt i32 %10349, 2048, !dbg !358 + %10390 = icmp slt i32 %10350, 2048, !dbg !358 + %10391 = icmp slt i32 %10351, 2048, !dbg !358 + %10392 = icmp slt i32 %10352, 2048, !dbg !358 + %10393 = sext i32 %10345 to i64, !dbg !316 + %10394 = getelementptr float, ptr addrspace(1) %5517, i64 %10393, !dbg !316 + %10395 = sext i32 %10346 to i64, !dbg !316 + %10396 = getelementptr float, ptr addrspace(1) %5517, i64 %10395, !dbg !316 + %10397 = sext i32 %10347 to i64, !dbg !316 + %10398 = getelementptr float, ptr addrspace(1) %5517, i64 %10397, !dbg !316 + %10399 = sext i32 %10348 to i64, !dbg !316 + %10400 = getelementptr float, ptr addrspace(1) %5517, i64 %10399, !dbg !316 + %10401 = sext i32 %10349 to i64, !dbg !316 + %10402 = getelementptr float, ptr addrspace(1) %5517, i64 %10401, !dbg !316 + %10403 = sext i32 %10350 to i64, !dbg !316 + %10404 = getelementptr float, ptr addrspace(1) %5517, i64 %10403, !dbg !316 + %10405 = sext i32 %10351 to i64, !dbg !316 + %10406 = getelementptr float, ptr addrspace(1) %5517, i64 %10405, !dbg !316 + %10407 = sext i32 %10352 to i64, !dbg !316 + %10408 = getelementptr float, ptr addrspace(1) %5517, i64 %10407, !dbg !316 + %10409 = shl i32 %10363, 6, !dbg !317 + %10410 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 106496), i32 %10409, !dbg !317 + %10411 = and i1 %8380, %10385, !dbg !323 + %10412 = and i1 %8380, %10386, !dbg !323 + %10413 = and i1 %8380, %10387, !dbg !323 + %10414 = and i1 %8380, %10388, !dbg !323 + %10415 = and i1 %8380, %10389, !dbg !323 + %10416 = and i1 %8380, %10390, !dbg !323 + %10417 = and i1 %8380, %10391, !dbg !323 + %10418 = and i1 %8380, %10392, !dbg !323 + %10419 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5065, !dbg !317 + %10420 = select i1 %10411, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %10419, ptr addrspace(1) %10394, i32 %10420, i1 %5064) #3, !dbg !317 + %10421 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5068, !dbg !317 + %10422 = select i1 %10412, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10421, ptr addrspace(1) %10396, i32 %10422, i1 %5064) #3, !dbg !317 + %10423 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5071, !dbg !317 + %10424 = select i1 %10413, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10423, ptr addrspace(1) %10398, i32 %10424, i1 %5064) #3, !dbg !317 + %10425 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5074, !dbg !317 + %10426 = select i1 %10414, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10425, ptr addrspace(1) %10400, i32 %10426, i1 %5064) #3, !dbg !317 + %10427 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5077, !dbg !317 + %10428 = select i1 %10415, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10427, ptr addrspace(1) %10402, i32 %10428, i1 %5064) #3, !dbg !317 + %10429 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5080, !dbg !317 + %10430 = select i1 %10416, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10429, ptr addrspace(1) %10404, i32 %10430, i1 %5064) #3, !dbg !317 + %10431 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5083, !dbg !317 + %10432 = select i1 %10417, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10431, ptr addrspace(1) %10406, i32 %10432, i1 %5064) #3, !dbg !317 + %10433 = getelementptr inbounds nuw i8, ptr addrspace(3) %10410, i32 %5086, !dbg !317 + %10434 = select i1 %10418, i32 8, i32 0, !dbg !317 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10433, ptr addrspace(1) %10408, i32 %10434, i1 %5064) #3, !dbg !317 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !317 + %10435 = icmp slt i32 %10357, 2048, !dbg !359 + %10436 = icmp slt i32 %10358, 2048, !dbg !359 + %10437 = icmp slt i32 %10359, 2048, !dbg !359 + %10438 = icmp slt i32 %10360, 2048, !dbg !359 + %10439 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %10371, !dbg !318 + %10440 = and i1 %8380, %10435, !dbg !323 + %10441 = and i1 %8380, %10436, !dbg !323 + %10442 = and i1 %8380, %10437, !dbg !323 + %10443 = and i1 %8380, %10438, !dbg !323 + %10444 = getelementptr inbounds nuw i8, ptr addrspace(3) %10439, i32 %5027, !dbg !318 + %10445 = select i1 %10440, i32 16, i32 0, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %10444, ptr addrspace(1) %10341, i32 %10445) #3, !dbg !318 + %10446 = getelementptr inbounds nuw i8, ptr addrspace(3) %10439, i32 %5030, !dbg !318 + %10447 = select i1 %10441, i32 16, i32 0, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10446, ptr addrspace(1) %10342, i32 %10447) #3, !dbg !318 + %10448 = getelementptr inbounds nuw i8, ptr addrspace(3) %10439, i32 %5033, !dbg !318 + %10449 = select i1 %10442, i32 16, i32 0, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10448, ptr addrspace(1) %10343, i32 %10449) #3, !dbg !318 + %10450 = getelementptr inbounds nuw i8, ptr addrspace(3) %10439, i32 %5036, !dbg !318 + %10451 = select i1 %10443, i32 16, i32 0, !dbg !318 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %10450, ptr addrspace(1) %10344, i32 %10451) #3, !dbg !318 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !318 + %10452 = getelementptr float, ptr addrspace(1) %5518, i64 %10393, !dbg !319 + %10453 = getelementptr float, ptr addrspace(1) %5518, i64 %10395, !dbg !319 + %10454 = getelementptr float, ptr addrspace(1) %5518, i64 %10397, !dbg !319 + %10455 = getelementptr float, ptr addrspace(1) %5518, i64 %10399, !dbg !319 + %10456 = getelementptr float, ptr addrspace(1) %5518, i64 %10401, !dbg !319 + %10457 = getelementptr float, ptr addrspace(1) %5518, i64 %10403, !dbg !319 + %10458 = getelementptr float, ptr addrspace(1) %5518, i64 %10405, !dbg !319 + %10459 = getelementptr float, ptr addrspace(1) %5518, i64 %10407, !dbg !319 + %10460 = getelementptr float, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 107008), i32 %10409, !dbg !320 + %10461 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5065, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) %10461, ptr addrspace(1) %10452, i32 %10420, i1 %5064) #3, !dbg !320 + %10462 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5068, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10462, ptr addrspace(1) %10453, i32 %10422, i1 %5064) #3, !dbg !320 + %10463 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5071, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10463, ptr addrspace(1) %10454, i32 %10424, i1 %5064) #3, !dbg !320 + %10464 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5074, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10464, ptr addrspace(1) %10455, i32 %10426, i1 %5064) #3, !dbg !320 + %10465 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5077, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10465, ptr addrspace(1) %10456, i32 %10428, i1 %5064) #3, !dbg !320 + %10466 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5080, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10466, ptr addrspace(1) %10457, i32 %10430, i1 %5064) #3, !dbg !320 + %10467 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5083, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10467, ptr addrspace(1) %10458, i32 %10432, i1 %5064) #3, !dbg !320 + %10468 = getelementptr inbounds nuw i8, ptr addrspace(3) %10460, i32 %5086, !dbg !320 + tail call void asm sideeffect "@$3 cp.async.ca.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x8, $2;", "r,l,r,b"(ptr addrspace(3) nonnull %10468, ptr addrspace(1) %10459, i32 %10434, i1 %5064) #3, !dbg !320 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !320 + %exitcond2214.not = icmp eq i32 %10313, %smax2213, !dbg !323 + br i1 %exitcond2214.not, label %._crit_edge1820, label %.lr.ph1819, !dbg !323 + +._crit_edge1820: ; preds = %__nv_exp2f.exit1284, %._crit_edge1670 + %.pn385.lcssa = phi float [ %8246, %._crit_edge1670 ], [ %10248, %__nv_exp2f.exit1284 ] + %.pn383.lcssa = phi float [ %8247, %._crit_edge1670 ], [ %10249, %__nv_exp2f.exit1284 ] + %.pn381.lcssa = phi float [ %8248, %._crit_edge1670 ], [ %10250, %__nv_exp2f.exit1284 ] + %.pn379.lcssa = phi float [ %8249, %._crit_edge1670 ], [ %10251, %__nv_exp2f.exit1284 ] + %.pn377.lcssa = phi float [ %8250, %._crit_edge1670 ], [ %10252, %__nv_exp2f.exit1284 ] + %.pn375.lcssa = phi float [ %8251, %._crit_edge1670 ], [ %10253, %__nv_exp2f.exit1284 ] + %.pn373.lcssa = phi float [ %8252, %._crit_edge1670 ], [ %10254, %__nv_exp2f.exit1284 ] + %.pn371.lcssa = phi float [ %8253, %._crit_edge1670 ], [ %10255, %__nv_exp2f.exit1284 ] + %.pn369.lcssa = phi float [ %8254, %._crit_edge1670 ], [ %10256, %__nv_exp2f.exit1284 ] + %.pn367.lcssa = phi float [ %8255, %._crit_edge1670 ], [ %10257, %__nv_exp2f.exit1284 ] + %.pn365.lcssa = phi float [ %8256, %._crit_edge1670 ], [ %10258, %__nv_exp2f.exit1284 ] + %.pn363.lcssa = phi float [ %8257, %._crit_edge1670 ], [ %10259, %__nv_exp2f.exit1284 ] + %.pn361.lcssa = phi float [ %8258, %._crit_edge1670 ], [ %10260, %__nv_exp2f.exit1284 ] + %.pn359.lcssa = phi float [ %8259, %._crit_edge1670 ], [ %10261, %__nv_exp2f.exit1284 ] + %.pn357.lcssa = phi float [ %8260, %._crit_edge1670 ], [ %10262, %__nv_exp2f.exit1284 ] + %.pn355.lcssa = phi float [ %8261, %._crit_edge1670 ], [ %10263, %__nv_exp2f.exit1284 ] + %.pn353.lcssa = phi float [ %8262, %._crit_edge1670 ], [ %10264, %__nv_exp2f.exit1284 ] + %.pn351.lcssa = phi float [ %8263, %._crit_edge1670 ], [ %10265, %__nv_exp2f.exit1284 ] + %.pn349.lcssa = phi float [ %8264, %._crit_edge1670 ], [ %10266, %__nv_exp2f.exit1284 ] + %.pn347.lcssa = phi float [ %8265, %._crit_edge1670 ], [ %10267, %__nv_exp2f.exit1284 ] + %.pn345.lcssa = phi float [ %8266, %._crit_edge1670 ], [ %10268, %__nv_exp2f.exit1284 ] + %.pn343.lcssa = phi float [ %8267, %._crit_edge1670 ], [ %10269, %__nv_exp2f.exit1284 ] + %.pn341.lcssa = phi float [ %8268, %._crit_edge1670 ], [ %10270, %__nv_exp2f.exit1284 ] + %.pn339.lcssa = phi float [ %8269, %._crit_edge1670 ], [ %10271, %__nv_exp2f.exit1284 ] + %.pn337.lcssa = phi float [ %8270, %._crit_edge1670 ], [ %10272, %__nv_exp2f.exit1284 ] + %.pn335.lcssa = phi float [ %8271, %._crit_edge1670 ], [ %10273, %__nv_exp2f.exit1284 ] + %.pn333.lcssa = phi float [ %8272, %._crit_edge1670 ], [ %10274, %__nv_exp2f.exit1284 ] + %.pn331.lcssa = phi float [ %8273, %._crit_edge1670 ], [ %10275, %__nv_exp2f.exit1284 ] + %.pn329.lcssa = phi float [ %8274, %._crit_edge1670 ], [ %10276, %__nv_exp2f.exit1284 ] + %.pn327.lcssa = phi float [ %8275, %._crit_edge1670 ], [ %10277, %__nv_exp2f.exit1284 ] + %.pn325.lcssa = phi float [ %8276, %._crit_edge1670 ], [ %10278, %__nv_exp2f.exit1284 ] + %.pn323.lcssa = phi float [ %8277, %._crit_edge1670 ], [ %10279, %__nv_exp2f.exit1284 ] + %.pn321.lcssa = phi float [ %8278, %._crit_edge1670 ], [ %10280, %__nv_exp2f.exit1284 ] + %.pn319.lcssa = phi float [ %8279, %._crit_edge1670 ], [ %10281, %__nv_exp2f.exit1284 ] + %.pn317.lcssa = phi float [ %8280, %._crit_edge1670 ], [ %10282, %__nv_exp2f.exit1284 ] + %.pn315.lcssa = phi float [ %8281, %._crit_edge1670 ], [ %10283, %__nv_exp2f.exit1284 ] + %.pn313.lcssa = phi float [ %8282, %._crit_edge1670 ], [ %10284, %__nv_exp2f.exit1284 ] + %.pn311.lcssa = phi float [ %8283, %._crit_edge1670 ], [ %10285, %__nv_exp2f.exit1284 ] + %.pn309.lcssa = phi float [ %8284, %._crit_edge1670 ], [ %10286, %__nv_exp2f.exit1284 ] + %.pn307.lcssa = phi float [ %8285, %._crit_edge1670 ], [ %10287, %__nv_exp2f.exit1284 ] + %.pn305.lcssa = phi float [ %8286, %._crit_edge1670 ], [ %10288, %__nv_exp2f.exit1284 ] + %.pn303.lcssa = phi float [ %8287, %._crit_edge1670 ], [ %10289, %__nv_exp2f.exit1284 ] + %.pn301.lcssa = phi float [ %8288, %._crit_edge1670 ], [ %10290, %__nv_exp2f.exit1284 ] + %.pn299.lcssa = phi float [ %8289, %._crit_edge1670 ], [ %10291, %__nv_exp2f.exit1284 ] + %.pn297.lcssa = phi float [ %8290, %._crit_edge1670 ], [ %10292, %__nv_exp2f.exit1284 ] + %.pn295.lcssa = phi float [ %8291, %._crit_edge1670 ], [ %10293, %__nv_exp2f.exit1284 ] + %.pn293.lcssa = phi float [ %8292, %._crit_edge1670 ], [ %10294, %__nv_exp2f.exit1284 ] + %.pn291.lcssa = phi float [ %8293, %._crit_edge1670 ], [ %10295, %__nv_exp2f.exit1284 ] + %.pn289.lcssa = phi float [ %8294, %._crit_edge1670 ], [ %10296, %__nv_exp2f.exit1284 ] + %.pn287.lcssa = phi float [ %8295, %._crit_edge1670 ], [ %10297, %__nv_exp2f.exit1284 ] + %.pn285.lcssa = phi float [ %8296, %._crit_edge1670 ], [ %10298, %__nv_exp2f.exit1284 ] + %.pn283.lcssa = phi float [ %8297, %._crit_edge1670 ], [ %10299, %__nv_exp2f.exit1284 ] + %.pn281.lcssa = phi float [ %8298, %._crit_edge1670 ], [ %10300, %__nv_exp2f.exit1284 ] + %.pn279.lcssa = phi float [ %8299, %._crit_edge1670 ], [ %10301, %__nv_exp2f.exit1284 ] + %.pn277.lcssa = phi float [ %8300, %._crit_edge1670 ], [ %10302, %__nv_exp2f.exit1284 ] + %.pn275.lcssa = phi float [ %8301, %._crit_edge1670 ], [ %10303, %__nv_exp2f.exit1284 ] + %.pn273.lcssa = phi float [ %8302, %._crit_edge1670 ], [ %10304, %__nv_exp2f.exit1284 ] + %.pn271.lcssa = phi float [ %8303, %._crit_edge1670 ], [ %10305, %__nv_exp2f.exit1284 ] + %.pn269.lcssa = phi float [ %8304, %._crit_edge1670 ], [ %10306, %__nv_exp2f.exit1284 ] + %.pn267.lcssa = phi float [ %8305, %._crit_edge1670 ], [ %10307, %__nv_exp2f.exit1284 ] + %.pn265.lcssa = phi float [ %8306, %._crit_edge1670 ], [ %10308, %__nv_exp2f.exit1284 ] + %.pn263.lcssa = phi float [ %8307, %._crit_edge1670 ], [ %10309, %__nv_exp2f.exit1284 ] + %.pn261.lcssa = phi float [ %8308, %._crit_edge1670 ], [ %10310, %__nv_exp2f.exit1284 ] + %.pn259.lcssa = phi float [ %8309, %._crit_edge1670 ], [ %10311, %__nv_exp2f.exit1284 ] + %.pn513.lcssa = phi float [ %8182, %._crit_edge1670 ], [ %9456, %__nv_exp2f.exit1284 ] + %.pn511.lcssa = phi float [ %8183, %._crit_edge1670 ], [ %9457, %__nv_exp2f.exit1284 ] + %.pn509.lcssa = phi float [ %8184, %._crit_edge1670 ], [ %9458, %__nv_exp2f.exit1284 ] + %.pn507.lcssa = phi float [ %8185, %._crit_edge1670 ], [ %9459, %__nv_exp2f.exit1284 ] + %.pn505.lcssa = phi float [ %8186, %._crit_edge1670 ], [ %9460, %__nv_exp2f.exit1284 ] + %.pn503.lcssa = phi float [ %8187, %._crit_edge1670 ], [ %9461, %__nv_exp2f.exit1284 ] + %.pn501.lcssa = phi float [ %8188, %._crit_edge1670 ], [ %9462, %__nv_exp2f.exit1284 ] + %.pn499.lcssa = phi float [ %8189, %._crit_edge1670 ], [ %9463, %__nv_exp2f.exit1284 ] + %.pn497.lcssa = phi float [ %8190, %._crit_edge1670 ], [ %9464, %__nv_exp2f.exit1284 ] + %.pn495.lcssa = phi float [ %8191, %._crit_edge1670 ], [ %9465, %__nv_exp2f.exit1284 ] + %.pn493.lcssa = phi float [ %8192, %._crit_edge1670 ], [ %9466, %__nv_exp2f.exit1284 ] + %.pn491.lcssa = phi float [ %8193, %._crit_edge1670 ], [ %9467, %__nv_exp2f.exit1284 ] + %.pn489.lcssa = phi float [ %8194, %._crit_edge1670 ], [ %9468, %__nv_exp2f.exit1284 ] + %.pn487.lcssa = phi float [ %8195, %._crit_edge1670 ], [ %9469, %__nv_exp2f.exit1284 ] + %.pn485.lcssa = phi float [ %8196, %._crit_edge1670 ], [ %9470, %__nv_exp2f.exit1284 ] + %.pn483.lcssa = phi float [ %8197, %._crit_edge1670 ], [ %9471, %__nv_exp2f.exit1284 ] + %.pn481.lcssa = phi float [ %8198, %._crit_edge1670 ], [ %9472, %__nv_exp2f.exit1284 ] + %.pn479.lcssa = phi float [ %8199, %._crit_edge1670 ], [ %9473, %__nv_exp2f.exit1284 ] + %.pn477.lcssa = phi float [ %8200, %._crit_edge1670 ], [ %9474, %__nv_exp2f.exit1284 ] + %.pn475.lcssa = phi float [ %8201, %._crit_edge1670 ], [ %9475, %__nv_exp2f.exit1284 ] + %.pn473.lcssa = phi float [ %8202, %._crit_edge1670 ], [ %9476, %__nv_exp2f.exit1284 ] + %.pn471.lcssa = phi float [ %8203, %._crit_edge1670 ], [ %9477, %__nv_exp2f.exit1284 ] + %.pn469.lcssa = phi float [ %8204, %._crit_edge1670 ], [ %9478, %__nv_exp2f.exit1284 ] + %.pn467.lcssa = phi float [ %8205, %._crit_edge1670 ], [ %9479, %__nv_exp2f.exit1284 ] + %.pn465.lcssa = phi float [ %8206, %._crit_edge1670 ], [ %9480, %__nv_exp2f.exit1284 ] + %.pn463.lcssa = phi float [ %8207, %._crit_edge1670 ], [ %9481, %__nv_exp2f.exit1284 ] + %.pn461.lcssa = phi float [ %8208, %._crit_edge1670 ], [ %9482, %__nv_exp2f.exit1284 ] + %.pn459.lcssa = phi float [ %8209, %._crit_edge1670 ], [ %9483, %__nv_exp2f.exit1284 ] + %.pn457.lcssa = phi float [ %8210, %._crit_edge1670 ], [ %9484, %__nv_exp2f.exit1284 ] + %.pn455.lcssa = phi float [ %8211, %._crit_edge1670 ], [ %9485, %__nv_exp2f.exit1284 ] + %.pn453.lcssa = phi float [ %8212, %._crit_edge1670 ], [ %9486, %__nv_exp2f.exit1284 ] + %.pn451.lcssa = phi float [ %8213, %._crit_edge1670 ], [ %9487, %__nv_exp2f.exit1284 ] + %.pn449.lcssa = phi float [ %8214, %._crit_edge1670 ], [ %9488, %__nv_exp2f.exit1284 ] + %.pn447.lcssa = phi float [ %8215, %._crit_edge1670 ], [ %9489, %__nv_exp2f.exit1284 ] + %.pn445.lcssa = phi float [ %8216, %._crit_edge1670 ], [ %9490, %__nv_exp2f.exit1284 ] + %.pn443.lcssa = phi float [ %8217, %._crit_edge1670 ], [ %9491, %__nv_exp2f.exit1284 ] + %.pn441.lcssa = phi float [ %8218, %._crit_edge1670 ], [ %9492, %__nv_exp2f.exit1284 ] + %.pn439.lcssa = phi float [ %8219, %._crit_edge1670 ], [ %9493, %__nv_exp2f.exit1284 ] + %.pn437.lcssa = phi float [ %8220, %._crit_edge1670 ], [ %9494, %__nv_exp2f.exit1284 ] + %.pn435.lcssa = phi float [ %8221, %._crit_edge1670 ], [ %9495, %__nv_exp2f.exit1284 ] + %.pn433.lcssa = phi float [ %8222, %._crit_edge1670 ], [ %9496, %__nv_exp2f.exit1284 ] + %.pn431.lcssa = phi float [ %8223, %._crit_edge1670 ], [ %9497, %__nv_exp2f.exit1284 ] + %.pn429.lcssa = phi float [ %8224, %._crit_edge1670 ], [ %9498, %__nv_exp2f.exit1284 ] + %.pn427.lcssa = phi float [ %8225, %._crit_edge1670 ], [ %9499, %__nv_exp2f.exit1284 ] + %.pn425.lcssa = phi float [ %8226, %._crit_edge1670 ], [ %9500, %__nv_exp2f.exit1284 ] + %.pn423.lcssa = phi float [ %8227, %._crit_edge1670 ], [ %9501, %__nv_exp2f.exit1284 ] + %.pn421.lcssa = phi float [ %8228, %._crit_edge1670 ], [ %9502, %__nv_exp2f.exit1284 ] + %.pn419.lcssa = phi float [ %8229, %._crit_edge1670 ], [ %9503, %__nv_exp2f.exit1284 ] + %.pn417.lcssa = phi float [ %8230, %._crit_edge1670 ], [ %9504, %__nv_exp2f.exit1284 ] + %.pn415.lcssa = phi float [ %8231, %._crit_edge1670 ], [ %9505, %__nv_exp2f.exit1284 ] + %.pn413.lcssa = phi float [ %8232, %._crit_edge1670 ], [ %9506, %__nv_exp2f.exit1284 ] + %.pn411.lcssa = phi float [ %8233, %._crit_edge1670 ], [ %9507, %__nv_exp2f.exit1284 ] + %.pn409.lcssa = phi float [ %8234, %._crit_edge1670 ], [ %9508, %__nv_exp2f.exit1284 ] + %.pn407.lcssa = phi float [ %8235, %._crit_edge1670 ], [ %9509, %__nv_exp2f.exit1284 ] + %.pn405.lcssa = phi float [ %8236, %._crit_edge1670 ], [ %9510, %__nv_exp2f.exit1284 ] + %.pn403.lcssa = phi float [ %8237, %._crit_edge1670 ], [ %9511, %__nv_exp2f.exit1284 ] + %.pn401.lcssa = phi float [ %8238, %._crit_edge1670 ], [ %9512, %__nv_exp2f.exit1284 ] + %.pn399.lcssa = phi float [ %8239, %._crit_edge1670 ], [ %9513, %__nv_exp2f.exit1284 ] + %.pn397.lcssa = phi float [ %8240, %._crit_edge1670 ], [ %9514, %__nv_exp2f.exit1284 ] + %.pn395.lcssa = phi float [ %8241, %._crit_edge1670 ], [ %9515, %__nv_exp2f.exit1284 ] + %.pn393.lcssa = phi float [ %8242, %._crit_edge1670 ], [ %9516, %__nv_exp2f.exit1284 ] + %.pn391.lcssa = phi float [ %8243, %._crit_edge1670 ], [ %9517, %__nv_exp2f.exit1284 ] + %.pn389.lcssa = phi float [ %8244, %._crit_edge1670 ], [ %9518, %__nv_exp2f.exit1284 ] + %.pn387.lcssa = phi float [ %8245, %._crit_edge1670 ], [ %9519, %__nv_exp2f.exit1284 ] + %10469 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101,$102,$103,$104,$105,$106,$107,$108,$109,$110,$111,$112,$113,$114,$115,$116,$117,$118,$119,$120,$121,$122,$123,$124,$125,$126,$127\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127"(float %.pn513.lcssa, float %.pn511.lcssa, float %.pn509.lcssa, float %.pn507.lcssa, float %.pn505.lcssa, float %.pn503.lcssa, float %.pn501.lcssa, float %.pn499.lcssa, float %.pn497.lcssa, float %.pn495.lcssa, float %.pn493.lcssa, float %.pn491.lcssa, float %.pn489.lcssa, float %.pn487.lcssa, float %.pn485.lcssa, float %.pn483.lcssa, float %.pn481.lcssa, float %.pn479.lcssa, float %.pn477.lcssa, float %.pn475.lcssa, float %.pn473.lcssa, float %.pn471.lcssa, float %.pn469.lcssa, float %.pn467.lcssa, float %.pn465.lcssa, float %.pn463.lcssa, float %.pn461.lcssa, float %.pn459.lcssa, float %.pn457.lcssa, float %.pn455.lcssa, float %.pn453.lcssa, float %.pn451.lcssa, float %.pn449.lcssa, float %.pn447.lcssa, float %.pn445.lcssa, float %.pn443.lcssa, float %.pn441.lcssa, float %.pn439.lcssa, float %.pn437.lcssa, float %.pn435.lcssa, float %.pn433.lcssa, float %.pn431.lcssa, float %.pn429.lcssa, float %.pn427.lcssa, float %.pn425.lcssa, float %.pn423.lcssa, float %.pn421.lcssa, float %.pn419.lcssa, float %.pn417.lcssa, float %.pn415.lcssa, float %.pn413.lcssa, float %.pn411.lcssa, float %.pn409.lcssa, float %.pn407.lcssa, float %.pn405.lcssa, float %.pn403.lcssa, float %.pn401.lcssa, float %.pn399.lcssa, float %.pn397.lcssa, float %.pn395.lcssa, float %.pn393.lcssa, float %.pn391.lcssa, float %.pn389.lcssa, float %.pn387.lcssa, float %.pn385.lcssa, float %.pn383.lcssa, float %.pn381.lcssa, float %.pn379.lcssa, float %.pn377.lcssa, float %.pn375.lcssa, float %.pn373.lcssa, float %.pn371.lcssa, float %.pn369.lcssa, float %.pn367.lcssa, float %.pn365.lcssa, float %.pn363.lcssa, float %.pn361.lcssa, float %.pn359.lcssa, float %.pn357.lcssa, float %.pn355.lcssa, float %.pn353.lcssa, float %.pn351.lcssa, float %.pn349.lcssa, float %.pn347.lcssa, float %.pn345.lcssa, float %.pn343.lcssa, float %.pn341.lcssa, float %.pn339.lcssa, float %.pn337.lcssa, float %.pn335.lcssa, float %.pn333.lcssa, float %.pn331.lcssa, float %.pn329.lcssa, float %.pn327.lcssa, float %.pn325.lcssa, float %.pn323.lcssa, float %.pn321.lcssa, float %.pn319.lcssa, float %.pn317.lcssa, float %.pn315.lcssa, float %.pn313.lcssa, float %.pn311.lcssa, float %.pn309.lcssa, float %.pn307.lcssa, float %.pn305.lcssa, float %.pn303.lcssa, float %.pn301.lcssa, float %.pn299.lcssa, float %.pn297.lcssa, float %.pn295.lcssa, float %.pn293.lcssa, float %.pn291.lcssa, float %.pn289.lcssa, float %.pn287.lcssa, float %.pn285.lcssa, float %.pn283.lcssa, float %.pn281.lcssa, float %.pn279.lcssa, float %.pn277.lcssa, float %.pn275.lcssa, float %.pn273.lcssa, float %.pn271.lcssa, float %.pn269.lcssa, float %.pn267.lcssa, float %.pn265.lcssa, float %.pn263.lcssa, float %.pn261.lcssa, float %.pn259.lcssa) #3, !dbg !323 + %10470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 0, !dbg !323 + %10471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 1, !dbg !323 + %10472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 2, !dbg !323 + %10473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 3, !dbg !323 + %10474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 4, !dbg !323 + %10475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 5, !dbg !323 + %10476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 6, !dbg !323 + %10477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 7, !dbg !323 + %10478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 8, !dbg !323 + %10479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 9, !dbg !323 + %10480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 10, !dbg !323 + %10481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 11, !dbg !323 + %10482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 12, !dbg !323 + %10483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 13, !dbg !323 + %10484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 14, !dbg !323 + %10485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 15, !dbg !323 + %10486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 16, !dbg !323 + %10487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 17, !dbg !323 + %10488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 18, !dbg !323 + %10489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 19, !dbg !323 + %10490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 20, !dbg !323 + %10491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 21, !dbg !323 + %10492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 22, !dbg !323 + %10493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 23, !dbg !323 + %10494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 24, !dbg !323 + %10495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 25, !dbg !323 + %10496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 26, !dbg !323 + %10497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 27, !dbg !323 + %10498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 28, !dbg !323 + %10499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 29, !dbg !323 + %10500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 30, !dbg !323 + %10501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 31, !dbg !323 + %10502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 32, !dbg !323 + %10503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 33, !dbg !323 + %10504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 34, !dbg !323 + %10505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 35, !dbg !323 + %10506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 36, !dbg !323 + %10507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 37, !dbg !323 + %10508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 38, !dbg !323 + %10509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 39, !dbg !323 + %10510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 40, !dbg !323 + %10511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 41, !dbg !323 + %10512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 42, !dbg !323 + %10513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 43, !dbg !323 + %10514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 44, !dbg !323 + %10515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 45, !dbg !323 + %10516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 46, !dbg !323 + %10517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 47, !dbg !323 + %10518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 48, !dbg !323 + %10519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 49, !dbg !323 + %10520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 50, !dbg !323 + %10521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 51, !dbg !323 + %10522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 52, !dbg !323 + %10523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 53, !dbg !323 + %10524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 54, !dbg !323 + %10525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 55, !dbg !323 + %10526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 56, !dbg !323 + %10527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 57, !dbg !323 + %10528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 58, !dbg !323 + %10529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 59, !dbg !323 + %10530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 60, !dbg !323 + %10531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 61, !dbg !323 + %10532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 62, !dbg !323 + %10533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 63, !dbg !323 + %10534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 64, !dbg !323 + %10535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 65, !dbg !323 + %10536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 66, !dbg !323 + %10537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 67, !dbg !323 + %10538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 68, !dbg !323 + %10539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 69, !dbg !323 + %10540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 70, !dbg !323 + %10541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 71, !dbg !323 + %10542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 72, !dbg !323 + %10543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 73, !dbg !323 + %10544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 74, !dbg !323 + %10545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 75, !dbg !323 + %10546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 76, !dbg !323 + %10547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 77, !dbg !323 + %10548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 78, !dbg !323 + %10549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 79, !dbg !323 + %10550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 80, !dbg !323 + %10551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 81, !dbg !323 + %10552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 82, !dbg !323 + %10553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 83, !dbg !323 + %10554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 84, !dbg !323 + %10555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 85, !dbg !323 + %10556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 86, !dbg !323 + %10557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 87, !dbg !323 + %10558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 88, !dbg !323 + %10559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 89, !dbg !323 + %10560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 90, !dbg !323 + %10561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 91, !dbg !323 + %10562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 92, !dbg !323 + %10563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 93, !dbg !323 + %10564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 94, !dbg !323 + %10565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 95, !dbg !323 + %10566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 96, !dbg !323 + %10567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 97, !dbg !323 + %10568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 98, !dbg !323 + %10569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 99, !dbg !323 + %10570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 100, !dbg !323 + %10571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 101, !dbg !323 + %10572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 102, !dbg !323 + %10573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 103, !dbg !323 + %10574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 104, !dbg !323 + %10575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 105, !dbg !323 + %10576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 106, !dbg !323 + %10577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 107, !dbg !323 + %10578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 108, !dbg !323 + %10579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 109, !dbg !323 + %10580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 110, !dbg !323 + %10581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 111, !dbg !323 + %10582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 112, !dbg !323 + %10583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 113, !dbg !323 + %10584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 114, !dbg !323 + %10585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 115, !dbg !323 + %10586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 116, !dbg !323 + %10587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 117, !dbg !323 + %10588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 118, !dbg !323 + %10589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 119, !dbg !323 + %10590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 120, !dbg !323 + %10591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 121, !dbg !323 + %10592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 122, !dbg !323 + %10593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 123, !dbg !323 + %10594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 124, !dbg !323 + %10595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 125, !dbg !323 + %10596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 126, !dbg !323 + %10597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %10469, 127, !dbg !323 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !323 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !323 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !242 + %exitcond2215.not = icmp eq i64 %indvars.iv.next, 4, !dbg !242 + br i1 %exitcond2215.not, label %10598, label %5375, !dbg !242 + +10598: ; preds = %._crit_edge1820 + %10599 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4683, !dbg !360 + %10600 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4685, !dbg !360 + %10601 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4687, !dbg !360 + %10602 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4689, !dbg !360 + %10603 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4691, !dbg !360 + %10604 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4693, !dbg !360 + %10605 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4695, !dbg !360 + %10606 = getelementptr bfloat, ptr addrspace(1) %42, i64 %4697, !dbg !360 + %10607 = getelementptr bfloat, ptr addrspace(1) %10599, i64 %4701, !dbg !361 + %10608 = getelementptr bfloat, ptr addrspace(1) %10600, i64 %4701, !dbg !361 + %10609 = getelementptr bfloat, ptr addrspace(1) %10601, i64 %4701, !dbg !361 + %10610 = getelementptr bfloat, ptr addrspace(1) %10602, i64 %4701, !dbg !361 + %10611 = getelementptr bfloat, ptr addrspace(1) %10603, i64 %4701, !dbg !361 + %10612 = getelementptr bfloat, ptr addrspace(1) %10604, i64 %4701, !dbg !361 + %10613 = getelementptr bfloat, ptr addrspace(1) %10605, i64 %4701, !dbg !361 + %10614 = getelementptr bfloat, ptr addrspace(1) %10606, i64 %4701, !dbg !361 + %10615 = insertelement <2 x float> poison, float %10470, i64 0, !dbg !362 + %10616 = insertelement <2 x float> %10615, float %10471, i64 1, !dbg !362 + %10617 = fptrunc <2 x float> %10616 to <2 x bfloat>, !dbg !362 + %10618 = insertelement <2 x float> poison, float %10472, i64 0, !dbg !362 + %10619 = insertelement <2 x float> %10618, float %10473, i64 1, !dbg !362 + %10620 = fptrunc <2 x float> %10619 to <2 x bfloat>, !dbg !362 + %10621 = insertelement <2 x float> poison, float %10474, i64 0, !dbg !362 + %10622 = insertelement <2 x float> %10621, float %10475, i64 1, !dbg !362 + %10623 = fptrunc <2 x float> %10622 to <2 x bfloat>, !dbg !362 + %10624 = insertelement <2 x float> poison, float %10476, i64 0, !dbg !362 + %10625 = insertelement <2 x float> %10624, float %10477, i64 1, !dbg !362 + %10626 = fptrunc <2 x float> %10625 to <2 x bfloat>, !dbg !362 + %10627 = insertelement <2 x float> poison, float %10478, i64 0, !dbg !362 + %10628 = insertelement <2 x float> %10627, float %10479, i64 1, !dbg !362 + %10629 = fptrunc <2 x float> %10628 to <2 x bfloat>, !dbg !362 + %10630 = insertelement <2 x float> poison, float %10480, i64 0, !dbg !362 + %10631 = insertelement <2 x float> %10630, float %10481, i64 1, !dbg !362 + %10632 = fptrunc <2 x float> %10631 to <2 x bfloat>, !dbg !362 + %10633 = insertelement <2 x float> poison, float %10482, i64 0, !dbg !362 + %10634 = insertelement <2 x float> %10633, float %10483, i64 1, !dbg !362 + %10635 = fptrunc <2 x float> %10634 to <2 x bfloat>, !dbg !362 + %10636 = insertelement <2 x float> poison, float %10484, i64 0, !dbg !362 + %10637 = insertelement <2 x float> %10636, float %10485, i64 1, !dbg !362 + %10638 = fptrunc <2 x float> %10637 to <2 x bfloat>, !dbg !362 + %10639 = insertelement <2 x float> poison, float %10486, i64 0, !dbg !362 + %10640 = insertelement <2 x float> %10639, float %10487, i64 1, !dbg !362 + %10641 = fptrunc <2 x float> %10640 to <2 x bfloat>, !dbg !362 + %10642 = insertelement <2 x float> poison, float %10488, i64 0, !dbg !362 + %10643 = insertelement <2 x float> %10642, float %10489, i64 1, !dbg !362 + %10644 = fptrunc <2 x float> %10643 to <2 x bfloat>, !dbg !362 + %10645 = insertelement <2 x float> poison, float %10490, i64 0, !dbg !362 + %10646 = insertelement <2 x float> %10645, float %10491, i64 1, !dbg !362 + %10647 = fptrunc <2 x float> %10646 to <2 x bfloat>, !dbg !362 + %10648 = insertelement <2 x float> poison, float %10492, i64 0, !dbg !362 + %10649 = insertelement <2 x float> %10648, float %10493, i64 1, !dbg !362 + %10650 = fptrunc <2 x float> %10649 to <2 x bfloat>, !dbg !362 + %10651 = insertelement <2 x float> poison, float %10494, i64 0, !dbg !362 + %10652 = insertelement <2 x float> %10651, float %10495, i64 1, !dbg !362 + %10653 = fptrunc <2 x float> %10652 to <2 x bfloat>, !dbg !362 + %10654 = insertelement <2 x float> poison, float %10496, i64 0, !dbg !362 + %10655 = insertelement <2 x float> %10654, float %10497, i64 1, !dbg !362 + %10656 = fptrunc <2 x float> %10655 to <2 x bfloat>, !dbg !362 + %10657 = insertelement <2 x float> poison, float %10498, i64 0, !dbg !362 + %10658 = insertelement <2 x float> %10657, float %10499, i64 1, !dbg !362 + %10659 = fptrunc <2 x float> %10658 to <2 x bfloat>, !dbg !362 + %10660 = insertelement <2 x float> poison, float %10500, i64 0, !dbg !362 + %10661 = insertelement <2 x float> %10660, float %10501, i64 1, !dbg !362 + %10662 = fptrunc <2 x float> %10661 to <2 x bfloat>, !dbg !362 + %10663 = insertelement <2 x float> poison, float %10502, i64 0, !dbg !362 + %10664 = insertelement <2 x float> %10663, float %10503, i64 1, !dbg !362 + %10665 = fptrunc <2 x float> %10664 to <2 x bfloat>, !dbg !362 + %10666 = insertelement <2 x float> poison, float %10504, i64 0, !dbg !362 + %10667 = insertelement <2 x float> %10666, float %10505, i64 1, !dbg !362 + %10668 = fptrunc <2 x float> %10667 to <2 x bfloat>, !dbg !362 + %10669 = insertelement <2 x float> poison, float %10506, i64 0, !dbg !362 + %10670 = insertelement <2 x float> %10669, float %10507, i64 1, !dbg !362 + %10671 = fptrunc <2 x float> %10670 to <2 x bfloat>, !dbg !362 + %10672 = insertelement <2 x float> poison, float %10508, i64 0, !dbg !362 + %10673 = insertelement <2 x float> %10672, float %10509, i64 1, !dbg !362 + %10674 = fptrunc <2 x float> %10673 to <2 x bfloat>, !dbg !362 + %10675 = insertelement <2 x float> poison, float %10510, i64 0, !dbg !362 + %10676 = insertelement <2 x float> %10675, float %10511, i64 1, !dbg !362 + %10677 = fptrunc <2 x float> %10676 to <2 x bfloat>, !dbg !362 + %10678 = insertelement <2 x float> poison, float %10512, i64 0, !dbg !362 + %10679 = insertelement <2 x float> %10678, float %10513, i64 1, !dbg !362 + %10680 = fptrunc <2 x float> %10679 to <2 x bfloat>, !dbg !362 + %10681 = insertelement <2 x float> poison, float %10514, i64 0, !dbg !362 + %10682 = insertelement <2 x float> %10681, float %10515, i64 1, !dbg !362 + %10683 = fptrunc <2 x float> %10682 to <2 x bfloat>, !dbg !362 + %10684 = insertelement <2 x float> poison, float %10516, i64 0, !dbg !362 + %10685 = insertelement <2 x float> %10684, float %10517, i64 1, !dbg !362 + %10686 = fptrunc <2 x float> %10685 to <2 x bfloat>, !dbg !362 + %10687 = insertelement <2 x float> poison, float %10518, i64 0, !dbg !362 + %10688 = insertelement <2 x float> %10687, float %10519, i64 1, !dbg !362 + %10689 = fptrunc <2 x float> %10688 to <2 x bfloat>, !dbg !362 + %10690 = insertelement <2 x float> poison, float %10520, i64 0, !dbg !362 + %10691 = insertelement <2 x float> %10690, float %10521, i64 1, !dbg !362 + %10692 = fptrunc <2 x float> %10691 to <2 x bfloat>, !dbg !362 + %10693 = insertelement <2 x float> poison, float %10522, i64 0, !dbg !362 + %10694 = insertelement <2 x float> %10693, float %10523, i64 1, !dbg !362 + %10695 = fptrunc <2 x float> %10694 to <2 x bfloat>, !dbg !362 + %10696 = insertelement <2 x float> poison, float %10524, i64 0, !dbg !362 + %10697 = insertelement <2 x float> %10696, float %10525, i64 1, !dbg !362 + %10698 = fptrunc <2 x float> %10697 to <2 x bfloat>, !dbg !362 + %10699 = insertelement <2 x float> poison, float %10526, i64 0, !dbg !362 + %10700 = insertelement <2 x float> %10699, float %10527, i64 1, !dbg !362 + %10701 = fptrunc <2 x float> %10700 to <2 x bfloat>, !dbg !362 + %10702 = insertelement <2 x float> poison, float %10528, i64 0, !dbg !362 + %10703 = insertelement <2 x float> %10702, float %10529, i64 1, !dbg !362 + %10704 = fptrunc <2 x float> %10703 to <2 x bfloat>, !dbg !362 + %10705 = insertelement <2 x float> poison, float %10530, i64 0, !dbg !362 + %10706 = insertelement <2 x float> %10705, float %10531, i64 1, !dbg !362 + %10707 = fptrunc <2 x float> %10706 to <2 x bfloat>, !dbg !362 + %10708 = insertelement <2 x float> poison, float %10532, i64 0, !dbg !362 + %10709 = insertelement <2 x float> %10708, float %10533, i64 1, !dbg !362 + %10710 = fptrunc <2 x float> %10709 to <2 x bfloat>, !dbg !362 + %10711 = shl nuw nsw i32 %4922, 13, !dbg !362 + %10712 = and i32 %5220, 7264, !dbg !362 + %10713 = and i32 %43, 24, !dbg !362 + %10714 = shl nuw nsw i32 %10713, 4, !dbg !362 + %10715 = shl nuw nsw i32 %43, 2, !dbg !362 + %10716 = and i32 %10715, 16, !dbg !362 + %10717 = or disjoint i32 %10711, %10716, !dbg !362 + %10718 = or disjoint i32 %10712, %10714, !dbg !362 + %10719 = or disjoint i32 %10717, %10718, !dbg !362 + %10720 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10719, !dbg !362 + %10721 = bitcast <2 x bfloat> %10617 to i32, !dbg !362 + %10722 = bitcast <2 x bfloat> %10623 to i32, !dbg !362 + %10723 = bitcast <2 x bfloat> %10629 to i32, !dbg !362 + %10724 = bitcast <2 x bfloat> %10635 to i32, !dbg !362 + %10725 = insertelement <4 x i32> poison, i32 %10721, i64 0, !dbg !362 + %10726 = insertelement <4 x i32> %10725, i32 %10722, i64 1, !dbg !362 + %10727 = insertelement <4 x i32> %10726, i32 %10723, i64 2, !dbg !362 + %10728 = insertelement <4 x i32> %10727, i32 %10724, i64 3, !dbg !362 + store <4 x i32> %10728, ptr addrspace(3) %10720, align 16, !dbg !362 + %10729 = getelementptr inbounds nuw i8, ptr addrspace(3) %10720, i32 512, !dbg !362 + %10730 = bitcast <2 x bfloat> %10620 to i32, !dbg !362 + %10731 = bitcast <2 x bfloat> %10626 to i32, !dbg !362 + %10732 = bitcast <2 x bfloat> %10632 to i32, !dbg !362 + %10733 = bitcast <2 x bfloat> %10638 to i32, !dbg !362 + %10734 = insertelement <4 x i32> poison, i32 %10730, i64 0, !dbg !362 + %10735 = insertelement <4 x i32> %10734, i32 %10731, i64 1, !dbg !362 + %10736 = insertelement <4 x i32> %10735, i32 %10732, i64 2, !dbg !362 + %10737 = insertelement <4 x i32> %10736, i32 %10733, i64 3, !dbg !362 + store <4 x i32> %10737, ptr addrspace(3) %10729, align 16, !dbg !362 + %10738 = xor i32 %10719, 32, !dbg !362 + %10739 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10738, !dbg !362 + %10740 = bitcast <2 x bfloat> %10641 to i32, !dbg !362 + %10741 = bitcast <2 x bfloat> %10647 to i32, !dbg !362 + %10742 = bitcast <2 x bfloat> %10653 to i32, !dbg !362 + %10743 = bitcast <2 x bfloat> %10659 to i32, !dbg !362 + %10744 = insertelement <4 x i32> poison, i32 %10740, i64 0, !dbg !362 + %10745 = insertelement <4 x i32> %10744, i32 %10741, i64 1, !dbg !362 + %10746 = insertelement <4 x i32> %10745, i32 %10742, i64 2, !dbg !362 + %10747 = insertelement <4 x i32> %10746, i32 %10743, i64 3, !dbg !362 + store <4 x i32> %10747, ptr addrspace(3) %10739, align 16, !dbg !362 + %10748 = getelementptr inbounds nuw i8, ptr addrspace(3) %10739, i32 512, !dbg !362 + %10749 = bitcast <2 x bfloat> %10644 to i32, !dbg !362 + %10750 = bitcast <2 x bfloat> %10650 to i32, !dbg !362 + %10751 = bitcast <2 x bfloat> %10656 to i32, !dbg !362 + %10752 = bitcast <2 x bfloat> %10662 to i32, !dbg !362 + %10753 = insertelement <4 x i32> poison, i32 %10749, i64 0, !dbg !362 + %10754 = insertelement <4 x i32> %10753, i32 %10750, i64 1, !dbg !362 + %10755 = insertelement <4 x i32> %10754, i32 %10751, i64 2, !dbg !362 + %10756 = insertelement <4 x i32> %10755, i32 %10752, i64 3, !dbg !362 + store <4 x i32> %10756, ptr addrspace(3) %10748, align 16, !dbg !362 + %10757 = xor i32 %10719, 64, !dbg !362 + %10758 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10757, !dbg !362 + %10759 = bitcast <2 x bfloat> %10665 to i32, !dbg !362 + %10760 = bitcast <2 x bfloat> %10671 to i32, !dbg !362 + %10761 = bitcast <2 x bfloat> %10677 to i32, !dbg !362 + %10762 = bitcast <2 x bfloat> %10683 to i32, !dbg !362 + %10763 = insertelement <4 x i32> poison, i32 %10759, i64 0, !dbg !362 + %10764 = insertelement <4 x i32> %10763, i32 %10760, i64 1, !dbg !362 + %10765 = insertelement <4 x i32> %10764, i32 %10761, i64 2, !dbg !362 + %10766 = insertelement <4 x i32> %10765, i32 %10762, i64 3, !dbg !362 + store <4 x i32> %10766, ptr addrspace(3) %10758, align 16, !dbg !362 + %10767 = getelementptr inbounds nuw i8, ptr addrspace(3) %10758, i32 512, !dbg !362 + %10768 = bitcast <2 x bfloat> %10668 to i32, !dbg !362 + %10769 = bitcast <2 x bfloat> %10674 to i32, !dbg !362 + %10770 = bitcast <2 x bfloat> %10680 to i32, !dbg !362 + %10771 = bitcast <2 x bfloat> %10686 to i32, !dbg !362 + %10772 = insertelement <4 x i32> poison, i32 %10768, i64 0, !dbg !362 + %10773 = insertelement <4 x i32> %10772, i32 %10769, i64 1, !dbg !362 + %10774 = insertelement <4 x i32> %10773, i32 %10770, i64 2, !dbg !362 + %10775 = insertelement <4 x i32> %10774, i32 %10771, i64 3, !dbg !362 + store <4 x i32> %10775, ptr addrspace(3) %10767, align 16, !dbg !362 + %10776 = xor i32 %10719, 96, !dbg !362 + %10777 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10776, !dbg !362 + %10778 = bitcast <2 x bfloat> %10689 to i32, !dbg !362 + %10779 = bitcast <2 x bfloat> %10695 to i32, !dbg !362 + %10780 = bitcast <2 x bfloat> %10701 to i32, !dbg !362 + %10781 = bitcast <2 x bfloat> %10707 to i32, !dbg !362 + %10782 = insertelement <4 x i32> poison, i32 %10778, i64 0, !dbg !362 + %10783 = insertelement <4 x i32> %10782, i32 %10779, i64 1, !dbg !362 + %10784 = insertelement <4 x i32> %10783, i32 %10780, i64 2, !dbg !362 + %10785 = insertelement <4 x i32> %10784, i32 %10781, i64 3, !dbg !362 + store <4 x i32> %10785, ptr addrspace(3) %10777, align 16, !dbg !362 + %10786 = getelementptr inbounds nuw i8, ptr addrspace(3) %10777, i32 512, !dbg !362 + %10787 = bitcast <2 x bfloat> %10692 to i32, !dbg !362 + %10788 = bitcast <2 x bfloat> %10698 to i32, !dbg !362 + %10789 = bitcast <2 x bfloat> %10704 to i32, !dbg !362 + %10790 = bitcast <2 x bfloat> %10710 to i32, !dbg !362 + %10791 = insertelement <4 x i32> poison, i32 %10787, i64 0, !dbg !362 + %10792 = insertelement <4 x i32> %10791, i32 %10788, i64 1, !dbg !362 + %10793 = insertelement <4 x i32> %10792, i32 %10789, i64 2, !dbg !362 + %10794 = insertelement <4 x i32> %10793, i32 %10790, i64 3, !dbg !362 + store <4 x i32> %10794, ptr addrspace(3) %10786, align 16, !dbg !362 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !362 + %10795 = shl nuw nsw i32 %10713, 10, !dbg !362 + %10796 = shl nuw nsw i32 %4922, 5, !dbg !362 + %10797 = and i32 %10715, 1008, !dbg !362 + %10798 = or disjoint i32 %10795, %10796, !dbg !362 + %10799 = xor i32 %10798, %10797, !dbg !362 + %10800 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10799, !dbg !362 + %10801 = ptrtoint ptr addrspace(3) %10800 to i32, !dbg !362 + %10802 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10801) #3, !dbg !362 + %10803 = extractvalue { i32, i32, i32, i32 } %10802, 0, !dbg !362 + %10804 = extractvalue { i32, i32, i32, i32 } %10802, 1, !dbg !362 + %10805 = extractvalue { i32, i32, i32, i32 } %10802, 2, !dbg !362 + %10806 = extractvalue { i32, i32, i32, i32 } %10802, 3, !dbg !362 + %10807 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 1024, !dbg !362 + %10808 = ptrtoint ptr addrspace(3) %10807 to i32, !dbg !362 + %10809 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10808) #3, !dbg !362 + %10810 = extractvalue { i32, i32, i32, i32 } %10809, 0, !dbg !362 + %10811 = extractvalue { i32, i32, i32, i32 } %10809, 1, !dbg !362 + %10812 = extractvalue { i32, i32, i32, i32 } %10809, 2, !dbg !362 + %10813 = extractvalue { i32, i32, i32, i32 } %10809, 3, !dbg !362 + %10814 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 2048, !dbg !362 + %10815 = ptrtoint ptr addrspace(3) %10814 to i32, !dbg !362 + %10816 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10815) #3, !dbg !362 + %10817 = extractvalue { i32, i32, i32, i32 } %10816, 0, !dbg !362 + %10818 = extractvalue { i32, i32, i32, i32 } %10816, 1, !dbg !362 + %10819 = extractvalue { i32, i32, i32, i32 } %10816, 2, !dbg !362 + %10820 = extractvalue { i32, i32, i32, i32 } %10816, 3, !dbg !362 + %10821 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 3072, !dbg !362 + %10822 = ptrtoint ptr addrspace(3) %10821 to i32, !dbg !362 + %10823 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10822) #3, !dbg !362 + %10824 = extractvalue { i32, i32, i32, i32 } %10823, 0, !dbg !362 + %10825 = extractvalue { i32, i32, i32, i32 } %10823, 1, !dbg !362 + %10826 = extractvalue { i32, i32, i32, i32 } %10823, 2, !dbg !362 + %10827 = extractvalue { i32, i32, i32, i32 } %10823, 3, !dbg !362 + %10828 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 4096, !dbg !362 + %10829 = ptrtoint ptr addrspace(3) %10828 to i32, !dbg !362 + %10830 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10829) #3, !dbg !362 + %10831 = extractvalue { i32, i32, i32, i32 } %10830, 0, !dbg !362 + %10832 = extractvalue { i32, i32, i32, i32 } %10830, 1, !dbg !362 + %10833 = extractvalue { i32, i32, i32, i32 } %10830, 2, !dbg !362 + %10834 = extractvalue { i32, i32, i32, i32 } %10830, 3, !dbg !362 + %10835 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 5120, !dbg !362 + %10836 = ptrtoint ptr addrspace(3) %10835 to i32, !dbg !362 + %10837 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10836) #3, !dbg !362 + %10838 = extractvalue { i32, i32, i32, i32 } %10837, 0, !dbg !362 + %10839 = extractvalue { i32, i32, i32, i32 } %10837, 1, !dbg !362 + %10840 = extractvalue { i32, i32, i32, i32 } %10837, 2, !dbg !362 + %10841 = extractvalue { i32, i32, i32, i32 } %10837, 3, !dbg !362 + %10842 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 6144, !dbg !362 + %10843 = ptrtoint ptr addrspace(3) %10842 to i32, !dbg !362 + %10844 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10843) #3, !dbg !362 + %10845 = extractvalue { i32, i32, i32, i32 } %10844, 0, !dbg !362 + %10846 = extractvalue { i32, i32, i32, i32 } %10844, 1, !dbg !362 + %10847 = extractvalue { i32, i32, i32, i32 } %10844, 2, !dbg !362 + %10848 = extractvalue { i32, i32, i32, i32 } %10844, 3, !dbg !362 + %10849 = getelementptr inbounds nuw i8, ptr addrspace(3) %10800, i32 7168, !dbg !362 + %10850 = ptrtoint ptr addrspace(3) %10849 to i32, !dbg !362 + %10851 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10850) #3, !dbg !362 + %10852 = extractvalue { i32, i32, i32, i32 } %10851, 0, !dbg !362 + %10853 = extractvalue { i32, i32, i32, i32 } %10851, 1, !dbg !362 + %10854 = extractvalue { i32, i32, i32, i32 } %10851, 2, !dbg !362 + %10855 = extractvalue { i32, i32, i32, i32 } %10851, 3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10803, i32 %10804, i32 %10805, i32 %10806, ptr addrspace(1) %10607, i1 %4710) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10810, i32 %10811, i32 %10812, i32 %10813, ptr addrspace(1) %10608, i1 %4711) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10817, i32 %10818, i32 %10819, i32 %10820, ptr addrspace(1) %10609, i1 %4712) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10824, i32 %10825, i32 %10826, i32 %10827, ptr addrspace(1) %10610, i1 %4713) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10831, i32 %10832, i32 %10833, i32 %10834, ptr addrspace(1) %10611, i1 %4714) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10838, i32 %10839, i32 %10840, i32 %10841, ptr addrspace(1) %10612, i1 %4715) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10845, i32 %10846, i32 %10847, i32 %10848, ptr addrspace(1) %10613, i1 %4716) #3, !dbg !362 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %10852, i32 %10853, i32 %10854, i32 %10855, ptr addrspace(1) %10614, i1 %4717) #3, !dbg !362 + %10856 = insertelement <2 x float> poison, float %10534, i64 0, !dbg !363 + %10857 = insertelement <2 x float> %10856, float %10535, i64 1, !dbg !363 + %10858 = fmul <2 x float> %10857, splat (float 0x3FB6A09E60000000), !dbg !363 + %10859 = insertelement <2 x float> poison, float %10536, i64 0, !dbg !363 + %10860 = insertelement <2 x float> %10859, float %10537, i64 1, !dbg !363 + %10861 = fmul <2 x float> %10860, splat (float 0x3FB6A09E60000000), !dbg !363 + %10862 = insertelement <2 x float> poison, float %10538, i64 0, !dbg !363 + %10863 = insertelement <2 x float> %10862, float %10539, i64 1, !dbg !363 + %10864 = fmul <2 x float> %10863, splat (float 0x3FB6A09E60000000), !dbg !363 + %10865 = insertelement <2 x float> poison, float %10540, i64 0, !dbg !363 + %10866 = insertelement <2 x float> %10865, float %10541, i64 1, !dbg !363 + %10867 = fmul <2 x float> %10866, splat (float 0x3FB6A09E60000000), !dbg !363 + %10868 = insertelement <2 x float> poison, float %10542, i64 0, !dbg !363 + %10869 = insertelement <2 x float> %10868, float %10543, i64 1, !dbg !363 + %10870 = fmul <2 x float> %10869, splat (float 0x3FB6A09E60000000), !dbg !363 + %10871 = insertelement <2 x float> poison, float %10544, i64 0, !dbg !363 + %10872 = insertelement <2 x float> %10871, float %10545, i64 1, !dbg !363 + %10873 = fmul <2 x float> %10872, splat (float 0x3FB6A09E60000000), !dbg !363 + %10874 = insertelement <2 x float> poison, float %10546, i64 0, !dbg !363 + %10875 = insertelement <2 x float> %10874, float %10547, i64 1, !dbg !363 + %10876 = fmul <2 x float> %10875, splat (float 0x3FB6A09E60000000), !dbg !363 + %10877 = insertelement <2 x float> poison, float %10548, i64 0, !dbg !363 + %10878 = insertelement <2 x float> %10877, float %10549, i64 1, !dbg !363 + %10879 = fmul <2 x float> %10878, splat (float 0x3FB6A09E60000000), !dbg !363 + %10880 = insertelement <2 x float> poison, float %10550, i64 0, !dbg !363 + %10881 = insertelement <2 x float> %10880, float %10551, i64 1, !dbg !363 + %10882 = fmul <2 x float> %10881, splat (float 0x3FB6A09E60000000), !dbg !363 + %10883 = insertelement <2 x float> poison, float %10552, i64 0, !dbg !363 + %10884 = insertelement <2 x float> %10883, float %10553, i64 1, !dbg !363 + %10885 = fmul <2 x float> %10884, splat (float 0x3FB6A09E60000000), !dbg !363 + %10886 = insertelement <2 x float> poison, float %10554, i64 0, !dbg !363 + %10887 = insertelement <2 x float> %10886, float %10555, i64 1, !dbg !363 + %10888 = fmul <2 x float> %10887, splat (float 0x3FB6A09E60000000), !dbg !363 + %10889 = insertelement <2 x float> poison, float %10556, i64 0, !dbg !363 + %10890 = insertelement <2 x float> %10889, float %10557, i64 1, !dbg !363 + %10891 = fmul <2 x float> %10890, splat (float 0x3FB6A09E60000000), !dbg !363 + %10892 = insertelement <2 x float> poison, float %10558, i64 0, !dbg !363 + %10893 = insertelement <2 x float> %10892, float %10559, i64 1, !dbg !363 + %10894 = fmul <2 x float> %10893, splat (float 0x3FB6A09E60000000), !dbg !363 + %10895 = insertelement <2 x float> poison, float %10560, i64 0, !dbg !363 + %10896 = insertelement <2 x float> %10895, float %10561, i64 1, !dbg !363 + %10897 = fmul <2 x float> %10896, splat (float 0x3FB6A09E60000000), !dbg !363 + %10898 = insertelement <2 x float> poison, float %10562, i64 0, !dbg !363 + %10899 = insertelement <2 x float> %10898, float %10563, i64 1, !dbg !363 + %10900 = fmul <2 x float> %10899, splat (float 0x3FB6A09E60000000), !dbg !363 + %10901 = insertelement <2 x float> poison, float %10564, i64 0, !dbg !363 + %10902 = insertelement <2 x float> %10901, float %10565, i64 1, !dbg !363 + %10903 = fmul <2 x float> %10902, splat (float 0x3FB6A09E60000000), !dbg !363 + %10904 = insertelement <2 x float> poison, float %10566, i64 0, !dbg !363 + %10905 = insertelement <2 x float> %10904, float %10567, i64 1, !dbg !363 + %10906 = fmul <2 x float> %10905, splat (float 0x3FB6A09E60000000), !dbg !363 + %10907 = insertelement <2 x float> poison, float %10568, i64 0, !dbg !363 + %10908 = insertelement <2 x float> %10907, float %10569, i64 1, !dbg !363 + %10909 = fmul <2 x float> %10908, splat (float 0x3FB6A09E60000000), !dbg !363 + %10910 = insertelement <2 x float> poison, float %10570, i64 0, !dbg !363 + %10911 = insertelement <2 x float> %10910, float %10571, i64 1, !dbg !363 + %10912 = fmul <2 x float> %10911, splat (float 0x3FB6A09E60000000), !dbg !363 + %10913 = insertelement <2 x float> poison, float %10572, i64 0, !dbg !363 + %10914 = insertelement <2 x float> %10913, float %10573, i64 1, !dbg !363 + %10915 = fmul <2 x float> %10914, splat (float 0x3FB6A09E60000000), !dbg !363 + %10916 = insertelement <2 x float> poison, float %10574, i64 0, !dbg !363 + %10917 = insertelement <2 x float> %10916, float %10575, i64 1, !dbg !363 + %10918 = fmul <2 x float> %10917, splat (float 0x3FB6A09E60000000), !dbg !363 + %10919 = insertelement <2 x float> poison, float %10576, i64 0, !dbg !363 + %10920 = insertelement <2 x float> %10919, float %10577, i64 1, !dbg !363 + %10921 = fmul <2 x float> %10920, splat (float 0x3FB6A09E60000000), !dbg !363 + %10922 = insertelement <2 x float> poison, float %10578, i64 0, !dbg !363 + %10923 = insertelement <2 x float> %10922, float %10579, i64 1, !dbg !363 + %10924 = fmul <2 x float> %10923, splat (float 0x3FB6A09E60000000), !dbg !363 + %10925 = insertelement <2 x float> poison, float %10580, i64 0, !dbg !363 + %10926 = insertelement <2 x float> %10925, float %10581, i64 1, !dbg !363 + %10927 = fmul <2 x float> %10926, splat (float 0x3FB6A09E60000000), !dbg !363 + %10928 = insertelement <2 x float> poison, float %10582, i64 0, !dbg !363 + %10929 = insertelement <2 x float> %10928, float %10583, i64 1, !dbg !363 + %10930 = fmul <2 x float> %10929, splat (float 0x3FB6A09E60000000), !dbg !363 + %10931 = insertelement <2 x float> poison, float %10584, i64 0, !dbg !363 + %10932 = insertelement <2 x float> %10931, float %10585, i64 1, !dbg !363 + %10933 = fmul <2 x float> %10932, splat (float 0x3FB6A09E60000000), !dbg !363 + %10934 = insertelement <2 x float> poison, float %10586, i64 0, !dbg !363 + %10935 = insertelement <2 x float> %10934, float %10587, i64 1, !dbg !363 + %10936 = fmul <2 x float> %10935, splat (float 0x3FB6A09E60000000), !dbg !363 + %10937 = insertelement <2 x float> poison, float %10588, i64 0, !dbg !363 + %10938 = insertelement <2 x float> %10937, float %10589, i64 1, !dbg !363 + %10939 = fmul <2 x float> %10938, splat (float 0x3FB6A09E60000000), !dbg !363 + %10940 = insertelement <2 x float> poison, float %10590, i64 0, !dbg !363 + %10941 = insertelement <2 x float> %10940, float %10591, i64 1, !dbg !363 + %10942 = fmul <2 x float> %10941, splat (float 0x3FB6A09E60000000), !dbg !363 + %10943 = insertelement <2 x float> poison, float %10592, i64 0, !dbg !363 + %10944 = insertelement <2 x float> %10943, float %10593, i64 1, !dbg !363 + %10945 = fmul <2 x float> %10944, splat (float 0x3FB6A09E60000000), !dbg !363 + %10946 = insertelement <2 x float> poison, float %10594, i64 0, !dbg !363 + %10947 = insertelement <2 x float> %10946, float %10595, i64 1, !dbg !363 + %10948 = fmul <2 x float> %10947, splat (float 0x3FB6A09E60000000), !dbg !363 + %10949 = insertelement <2 x float> poison, float %10596, i64 0, !dbg !363 + %10950 = insertelement <2 x float> %10949, float %10597, i64 1, !dbg !363 + %10951 = fmul <2 x float> %10950, splat (float 0x3FB6A09E60000000), !dbg !363 + %10952 = or disjoint i32 %4675, %4700, !dbg !364 + %10953 = or disjoint i32 %4676, %4700, !dbg !364 + %10954 = or disjoint i32 %4677, %4700, !dbg !364 + %10955 = or disjoint i32 %4678, %4700, !dbg !364 + %10956 = or disjoint i32 %4679, %4700, !dbg !364 + %10957 = or disjoint i32 %4680, %4700, !dbg !364 + %10958 = or disjoint i32 %4681, %4700, !dbg !364 + %10959 = or disjoint i32 %4682, %4700, !dbg !364 + %10960 = shl nuw nsw i32 %31, 7, !dbg !365 + %10961 = mul i32 %18, %10960, !dbg !366 + %10962 = add i32 %10952, %10961, !dbg !367 + %10963 = add i32 %10953, %10961, !dbg !367 + %10964 = add i32 %10954, %10961, !dbg !367 + %10965 = add i32 %10955, %10961, !dbg !367 + %10966 = add i32 %10956, %10961, !dbg !367 + %10967 = add i32 %10957, %10961, !dbg !367 + %10968 = add i32 %10958, %10961, !dbg !367 + %10969 = add i32 %10959, %10961, !dbg !367 + %10970 = shl nuw nsw i32 %30, 10, !dbg !368 + %10971 = mul i32 %18, %10970, !dbg !369 + %10972 = add i32 %10962, %10971, !dbg !370 + %10973 = add i32 %10963, %10971, !dbg !370 + %10974 = add i32 %10964, %10971, !dbg !370 + %10975 = add i32 %10965, %10971, !dbg !370 + %10976 = add i32 %10966, %10971, !dbg !370 + %10977 = add i32 %10967, %10971, !dbg !370 + %10978 = add i32 %10968, %10971, !dbg !370 + %10979 = add i32 %10969, %10971, !dbg !370 + %10980 = sext i32 %10972 to i64, !dbg !371 + %10981 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10980, !dbg !371 + %10982 = sext i32 %10973 to i64, !dbg !371 + %10983 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10982, !dbg !371 + %10984 = sext i32 %10974 to i64, !dbg !371 + %10985 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10984, !dbg !371 + %10986 = sext i32 %10975 to i64, !dbg !371 + %10987 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10986, !dbg !371 + %10988 = sext i32 %10976 to i64, !dbg !371 + %10989 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10988, !dbg !371 + %10990 = sext i32 %10977 to i64, !dbg !371 + %10991 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10990, !dbg !371 + %10992 = sext i32 %10978 to i64, !dbg !371 + %10993 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10992, !dbg !371 + %10994 = sext i32 %10979 to i64, !dbg !371 + %10995 = getelementptr bfloat, ptr addrspace(1) %17, i64 %10994, !dbg !371 + %10996 = fptrunc <2 x float> %10858 to <2 x bfloat>, !dbg !372 + %10997 = fptrunc <2 x float> %10861 to <2 x bfloat>, !dbg !372 + %10998 = fptrunc <2 x float> %10864 to <2 x bfloat>, !dbg !372 + %10999 = fptrunc <2 x float> %10867 to <2 x bfloat>, !dbg !372 + %11000 = fptrunc <2 x float> %10870 to <2 x bfloat>, !dbg !372 + %11001 = fptrunc <2 x float> %10873 to <2 x bfloat>, !dbg !372 + %11002 = fptrunc <2 x float> %10876 to <2 x bfloat>, !dbg !372 + %11003 = fptrunc <2 x float> %10879 to <2 x bfloat>, !dbg !372 + %11004 = fptrunc <2 x float> %10882 to <2 x bfloat>, !dbg !372 + %11005 = fptrunc <2 x float> %10885 to <2 x bfloat>, !dbg !372 + %11006 = fptrunc <2 x float> %10888 to <2 x bfloat>, !dbg !372 + %11007 = fptrunc <2 x float> %10891 to <2 x bfloat>, !dbg !372 + %11008 = fptrunc <2 x float> %10894 to <2 x bfloat>, !dbg !372 + %11009 = fptrunc <2 x float> %10897 to <2 x bfloat>, !dbg !372 + %11010 = fptrunc <2 x float> %10900 to <2 x bfloat>, !dbg !372 + %11011 = fptrunc <2 x float> %10903 to <2 x bfloat>, !dbg !372 + %11012 = fptrunc <2 x float> %10906 to <2 x bfloat>, !dbg !372 + %11013 = fptrunc <2 x float> %10909 to <2 x bfloat>, !dbg !372 + %11014 = fptrunc <2 x float> %10912 to <2 x bfloat>, !dbg !372 + %11015 = fptrunc <2 x float> %10915 to <2 x bfloat>, !dbg !372 + %11016 = fptrunc <2 x float> %10918 to <2 x bfloat>, !dbg !372 + %11017 = fptrunc <2 x float> %10921 to <2 x bfloat>, !dbg !372 + %11018 = fptrunc <2 x float> %10924 to <2 x bfloat>, !dbg !372 + %11019 = fptrunc <2 x float> %10927 to <2 x bfloat>, !dbg !372 + %11020 = fptrunc <2 x float> %10930 to <2 x bfloat>, !dbg !372 + %11021 = fptrunc <2 x float> %10933 to <2 x bfloat>, !dbg !372 + %11022 = fptrunc <2 x float> %10936 to <2 x bfloat>, !dbg !372 + %11023 = fptrunc <2 x float> %10939 to <2 x bfloat>, !dbg !372 + %11024 = fptrunc <2 x float> %10942 to <2 x bfloat>, !dbg !372 + %11025 = fptrunc <2 x float> %10945 to <2 x bfloat>, !dbg !372 + %11026 = fptrunc <2 x float> %10948 to <2 x bfloat>, !dbg !372 + %11027 = fptrunc <2 x float> %10951 to <2 x bfloat>, !dbg !372 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !372 + %11028 = bitcast <2 x bfloat> %10996 to i32, !dbg !372 + %11029 = bitcast <2 x bfloat> %10998 to i32, !dbg !372 + %11030 = bitcast <2 x bfloat> %11000 to i32, !dbg !372 + %11031 = bitcast <2 x bfloat> %11002 to i32, !dbg !372 + %11032 = insertelement <4 x i32> poison, i32 %11028, i64 0, !dbg !372 + %11033 = insertelement <4 x i32> %11032, i32 %11029, i64 1, !dbg !372 + %11034 = insertelement <4 x i32> %11033, i32 %11030, i64 2, !dbg !372 + %11035 = insertelement <4 x i32> %11034, i32 %11031, i64 3, !dbg !372 + store <4 x i32> %11035, ptr addrspace(3) %10720, align 16, !dbg !372 + %11036 = bitcast <2 x bfloat> %10997 to i32, !dbg !372 + %11037 = bitcast <2 x bfloat> %10999 to i32, !dbg !372 + %11038 = bitcast <2 x bfloat> %11001 to i32, !dbg !372 + %11039 = bitcast <2 x bfloat> %11003 to i32, !dbg !372 + %11040 = insertelement <4 x i32> poison, i32 %11036, i64 0, !dbg !372 + %11041 = insertelement <4 x i32> %11040, i32 %11037, i64 1, !dbg !372 + %11042 = insertelement <4 x i32> %11041, i32 %11038, i64 2, !dbg !372 + %11043 = insertelement <4 x i32> %11042, i32 %11039, i64 3, !dbg !372 + store <4 x i32> %11043, ptr addrspace(3) %10729, align 16, !dbg !372 + %11044 = bitcast <2 x bfloat> %11004 to i32, !dbg !372 + %11045 = bitcast <2 x bfloat> %11006 to i32, !dbg !372 + %11046 = bitcast <2 x bfloat> %11008 to i32, !dbg !372 + %11047 = bitcast <2 x bfloat> %11010 to i32, !dbg !372 + %11048 = insertelement <4 x i32> poison, i32 %11044, i64 0, !dbg !372 + %11049 = insertelement <4 x i32> %11048, i32 %11045, i64 1, !dbg !372 + %11050 = insertelement <4 x i32> %11049, i32 %11046, i64 2, !dbg !372 + %11051 = insertelement <4 x i32> %11050, i32 %11047, i64 3, !dbg !372 + store <4 x i32> %11051, ptr addrspace(3) %10739, align 16, !dbg !372 + %11052 = bitcast <2 x bfloat> %11005 to i32, !dbg !372 + %11053 = bitcast <2 x bfloat> %11007 to i32, !dbg !372 + %11054 = bitcast <2 x bfloat> %11009 to i32, !dbg !372 + %11055 = bitcast <2 x bfloat> %11011 to i32, !dbg !372 + %11056 = insertelement <4 x i32> poison, i32 %11052, i64 0, !dbg !372 + %11057 = insertelement <4 x i32> %11056, i32 %11053, i64 1, !dbg !372 + %11058 = insertelement <4 x i32> %11057, i32 %11054, i64 2, !dbg !372 + %11059 = insertelement <4 x i32> %11058, i32 %11055, i64 3, !dbg !372 + store <4 x i32> %11059, ptr addrspace(3) %10748, align 16, !dbg !372 + %11060 = bitcast <2 x bfloat> %11012 to i32, !dbg !372 + %11061 = bitcast <2 x bfloat> %11014 to i32, !dbg !372 + %11062 = bitcast <2 x bfloat> %11016 to i32, !dbg !372 + %11063 = bitcast <2 x bfloat> %11018 to i32, !dbg !372 + %11064 = insertelement <4 x i32> poison, i32 %11060, i64 0, !dbg !372 + %11065 = insertelement <4 x i32> %11064, i32 %11061, i64 1, !dbg !372 + %11066 = insertelement <4 x i32> %11065, i32 %11062, i64 2, !dbg !372 + %11067 = insertelement <4 x i32> %11066, i32 %11063, i64 3, !dbg !372 + store <4 x i32> %11067, ptr addrspace(3) %10758, align 16, !dbg !372 + %11068 = bitcast <2 x bfloat> %11013 to i32, !dbg !372 + %11069 = bitcast <2 x bfloat> %11015 to i32, !dbg !372 + %11070 = bitcast <2 x bfloat> %11017 to i32, !dbg !372 + %11071 = bitcast <2 x bfloat> %11019 to i32, !dbg !372 + %11072 = insertelement <4 x i32> poison, i32 %11068, i64 0, !dbg !372 + %11073 = insertelement <4 x i32> %11072, i32 %11069, i64 1, !dbg !372 + %11074 = insertelement <4 x i32> %11073, i32 %11070, i64 2, !dbg !372 + %11075 = insertelement <4 x i32> %11074, i32 %11071, i64 3, !dbg !372 + store <4 x i32> %11075, ptr addrspace(3) %10767, align 16, !dbg !372 + %11076 = bitcast <2 x bfloat> %11020 to i32, !dbg !372 + %11077 = bitcast <2 x bfloat> %11022 to i32, !dbg !372 + %11078 = bitcast <2 x bfloat> %11024 to i32, !dbg !372 + %11079 = bitcast <2 x bfloat> %11026 to i32, !dbg !372 + %11080 = insertelement <4 x i32> poison, i32 %11076, i64 0, !dbg !372 + %11081 = insertelement <4 x i32> %11080, i32 %11077, i64 1, !dbg !372 + %11082 = insertelement <4 x i32> %11081, i32 %11078, i64 2, !dbg !372 + %11083 = insertelement <4 x i32> %11082, i32 %11079, i64 3, !dbg !372 + store <4 x i32> %11083, ptr addrspace(3) %10777, align 16, !dbg !372 + %11084 = bitcast <2 x bfloat> %11021 to i32, !dbg !372 + %11085 = bitcast <2 x bfloat> %11023 to i32, !dbg !372 + %11086 = bitcast <2 x bfloat> %11025 to i32, !dbg !372 + %11087 = bitcast <2 x bfloat> %11027 to i32, !dbg !372 + %11088 = insertelement <4 x i32> poison, i32 %11084, i64 0, !dbg !372 + %11089 = insertelement <4 x i32> %11088, i32 %11085, i64 1, !dbg !372 + %11090 = insertelement <4 x i32> %11089, i32 %11086, i64 2, !dbg !372 + %11091 = insertelement <4 x i32> %11090, i32 %11087, i64 3, !dbg !372 + store <4 x i32> %11091, ptr addrspace(3) %10786, align 16, !dbg !372 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !372 + %11092 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10801) #3, !dbg !372 + %11093 = extractvalue { i32, i32, i32, i32 } %11092, 0, !dbg !372 + %11094 = extractvalue { i32, i32, i32, i32 } %11092, 1, !dbg !372 + %11095 = extractvalue { i32, i32, i32, i32 } %11092, 2, !dbg !372 + %11096 = extractvalue { i32, i32, i32, i32 } %11092, 3, !dbg !372 + %11097 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10808) #3, !dbg !372 + %11098 = extractvalue { i32, i32, i32, i32 } %11097, 0, !dbg !372 + %11099 = extractvalue { i32, i32, i32, i32 } %11097, 1, !dbg !372 + %11100 = extractvalue { i32, i32, i32, i32 } %11097, 2, !dbg !372 + %11101 = extractvalue { i32, i32, i32, i32 } %11097, 3, !dbg !372 + %11102 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10815) #3, !dbg !372 + %11103 = extractvalue { i32, i32, i32, i32 } %11102, 0, !dbg !372 + %11104 = extractvalue { i32, i32, i32, i32 } %11102, 1, !dbg !372 + %11105 = extractvalue { i32, i32, i32, i32 } %11102, 2, !dbg !372 + %11106 = extractvalue { i32, i32, i32, i32 } %11102, 3, !dbg !372 + %11107 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10822) #3, !dbg !372 + %11108 = extractvalue { i32, i32, i32, i32 } %11107, 0, !dbg !372 + %11109 = extractvalue { i32, i32, i32, i32 } %11107, 1, !dbg !372 + %11110 = extractvalue { i32, i32, i32, i32 } %11107, 2, !dbg !372 + %11111 = extractvalue { i32, i32, i32, i32 } %11107, 3, !dbg !372 + %11112 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10829) #3, !dbg !372 + %11113 = extractvalue { i32, i32, i32, i32 } %11112, 0, !dbg !372 + %11114 = extractvalue { i32, i32, i32, i32 } %11112, 1, !dbg !372 + %11115 = extractvalue { i32, i32, i32, i32 } %11112, 2, !dbg !372 + %11116 = extractvalue { i32, i32, i32, i32 } %11112, 3, !dbg !372 + %11117 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10836) #3, !dbg !372 + %11118 = extractvalue { i32, i32, i32, i32 } %11117, 0, !dbg !372 + %11119 = extractvalue { i32, i32, i32, i32 } %11117, 1, !dbg !372 + %11120 = extractvalue { i32, i32, i32, i32 } %11117, 2, !dbg !372 + %11121 = extractvalue { i32, i32, i32, i32 } %11117, 3, !dbg !372 + %11122 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10843) #3, !dbg !372 + %11123 = extractvalue { i32, i32, i32, i32 } %11122, 0, !dbg !372 + %11124 = extractvalue { i32, i32, i32, i32 } %11122, 1, !dbg !372 + %11125 = extractvalue { i32, i32, i32, i32 } %11122, 2, !dbg !372 + %11126 = extractvalue { i32, i32, i32, i32 } %11122, 3, !dbg !372 + %11127 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %10850) #3, !dbg !372 + %11128 = extractvalue { i32, i32, i32, i32 } %11127, 0, !dbg !372 + %11129 = extractvalue { i32, i32, i32, i32 } %11127, 1, !dbg !372 + %11130 = extractvalue { i32, i32, i32, i32 } %11127, 2, !dbg !372 + %11131 = extractvalue { i32, i32, i32, i32 } %11127, 3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11093, i32 %11094, i32 %11095, i32 %11096, ptr addrspace(1) %10981, i1 %4710) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11098, i32 %11099, i32 %11100, i32 %11101, ptr addrspace(1) %10983, i1 %4711) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11103, i32 %11104, i32 %11105, i32 %11106, ptr addrspace(1) %10985, i1 %4712) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11108, i32 %11109, i32 %11110, i32 %11111, ptr addrspace(1) %10987, i1 %4713) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11113, i32 %11114, i32 %11115, i32 %11116, ptr addrspace(1) %10989, i1 %4714) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11118, i32 %11119, i32 %11120, i32 %11121, ptr addrspace(1) %10991, i1 %4715) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11123, i32 %11124, i32 %11125, i32 %11126, ptr addrspace(1) %10993, i1 %4716) #3, !dbg !372 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %11128, i32 %11129, i32 %11130, i32 %11131, ptr addrspace(1) %10995, i1 %4717) #3, !dbg !372 + br label %11132, !dbg !31 + +11132: ; preds = %._crit_edge1638, %10598 + ret void, !dbg !373 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #3 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #4 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #7 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } +attributes #4 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #5 = { convergent nounwind } +attributes #6 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_zeros_1", linkageName: "triton_tem_fused_zeros_1", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 95, column: 54, scope: !5) +!9 = !DILocation(line: 95, column: 63, scope: !5) +!10 = !DILocation(line: 111, column: 24, scope: !5) +!11 = !DILocation(line: 41, column: 22, scope: !12, inlinedAt: !14) +!12 = distinct !DILexicalBlockFile(scope: !5, file: !13, discriminator: 0) +!13 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!14 = !DILocation(line: 112, column: 36, scope: !5) +!15 = !DILocation(line: 41, column: 28, scope: !12, inlinedAt: !14) +!16 = !DILocation(line: 115, column: 27, scope: !5) +!17 = !DILocation(line: 116, column: 28, scope: !5) +!18 = !DILocation(line: 117, column: 23, scope: !5) +!19 = !DILocation(line: 124, column: 25, scope: !5) +!20 = !DILocation(line: 124, column: 47, scope: !5) +!21 = !DILocation(line: 124, column: 35, scope: !5) +!22 = !DILocation(line: 124, column: 59, scope: !5) +!23 = !DILocation(line: 128, column: 50, scope: !5) +!24 = !DILocation(line: 128, column: 37, scope: !5) +!25 = !DILocation(line: 128, column: 61, scope: !5) +!26 = !DILocation(line: 131, column: 9, scope: !5) +!27 = !DILocation(line: 132, column: 9, scope: !5) +!28 = !DILocation(line: 133, column: 10, scope: !5) +!29 = !DILocation(line: 136, column: 26, scope: !5) +!30 = !DILocation(line: 139, column: 14, scope: !5) +!31 = !DILocation(line: 139, column: 7, scope: !5) +!32 = !DILocation(line: 140, column: 24, scope: !5) +!33 = !DILocation(line: 144, column: 29, scope: !5) +!34 = !DILocation(line: 144, column: 54, scope: !5) +!35 = !DILocation(line: 144, column: 44, scope: !5) +!36 = !DILocation(line: 145, column: 35, scope: !5) +!37 = !DILocation(line: 154, column: 55, scope: !5) +!38 = !DILocation(line: 154, column: 78, scope: !5) +!39 = !DILocation(line: 155, column: 68, scope: !5) +!40 = !DILocation(line: 158, column: 30, scope: !5) +!41 = !DILocation(line: 158, column: 52, scope: !5) +!42 = !DILocation(line: 158, column: 40, scope: !5) +!43 = !DILocation(line: 158, column: 63, scope: !5) +!44 = !DILocation(line: 159, column: 32, scope: !5) +!45 = !DILocation(line: 159, column: 42, scope: !5) +!46 = !DILocation(line: 159, column: 66, scope: !5) +!47 = !DILocation(line: 161, column: 46, scope: !5) +!48 = !DILocation(line: 161, column: 56, scope: !5) +!49 = !DILocation(line: 163, column: 17, scope: !5) +!50 = !DILocation(line: 164, column: 19, scope: !5) +!51 = !DILocation(line: 167, column: 19, scope: !5) +!52 = !DILocation(line: 168, column: 21, scope: !5) +!53 = !DILocation(line: 169, column: 25, scope: !5) +!54 = !DILocation(line: 174, column: 36, scope: !5) +!55 = !DILocation(line: 175, column: 29, scope: !5) +!56 = !DILocation(line: 825, column: 38, scope: !57, inlinedAt: !58) +!57 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!58 = !DILocation(line: 178, column: 107, scope: !5) +!59 = !DILocation(line: 825, column: 20, scope: !57, inlinedAt: !58) +!60 = !DILocation(line: 825, column: 56, scope: !57, inlinedAt: !58) +!61 = !DILocation(line: 825, column: 49, scope: !57, inlinedAt: !58) +!62 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !58) +!63 = !DILocation(line: 825, column: 38, scope: !57, inlinedAt: !64) +!64 = !DILocation(line: 179, column: 111, scope: !5) +!65 = !DILocation(line: 825, column: 20, scope: !57, inlinedAt: !64) +!66 = !DILocation(line: 825, column: 49, scope: !57, inlinedAt: !64) +!67 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !64) +!68 = !DILocation(line: 195, column: 30, scope: !5) +!69 = !DILocation(line: 197, column: 53, scope: !5) +!70 = !DILocation(line: 199, column: 42, scope: !5) +!71 = !DILocation(line: 41, column: 22, scope: !12, inlinedAt: !72) +!72 = !DILocation(line: 207, column: 12, scope: !5) +!73 = !DILocation(line: 41, column: 28, scope: !12, inlinedAt: !72) +!74 = !DILocation(line: 395, column: 101, scope: !57, inlinedAt: !72) +!75 = !DILocation(line: 485, column: 34, scope: !57, inlinedAt: !72) +!76 = !DILocation(line: 188, column: 34, scope: !5) +!77 = !DILocation(line: 188, column: 25, scope: !5) +!78 = !DILocation(line: 189, column: 33, scope: !5) +!79 = !DILocation(line: 189, column: 26, scope: !5) +!80 = !DILocation(line: 190, column: 30, scope: !5) +!81 = !DILocation(line: 190, column: 50, scope: !5) +!82 = !DILocation(line: 196, column: 27, scope: !5) +!83 = !DILocation(line: 196, column: 41, scope: !5) +!84 = !DILocation(line: 197, column: 39, scope: !5) +!85 = !DILocation(line: 199, column: 29, scope: !5) +!86 = !DILocation(line: 390, column: 37, scope: !57, inlinedAt: !72) +!87 = !DILocation(line: 390, column: 18, scope: !57, inlinedAt: !72) +!88 = !DILocation(line: 390, column: 49, scope: !57, inlinedAt: !72) +!89 = !DILocation(line: 391, column: 18, scope: !57, inlinedAt: !72) +!90 = !DILocation(line: 391, column: 49, scope: !57, inlinedAt: !72) +!91 = !DILocation(line: 395, column: 43, scope: !57, inlinedAt: !72) +!92 = !DILocation(line: 395, column: 63, scope: !57, inlinedAt: !72) +!93 = !DILocation(line: 397, column: 28, scope: !57, inlinedAt: !72) +!94 = !DILocation(line: 485, column: 23, scope: !57, inlinedAt: !72) +!95 = !DILocation(line: 488, column: 23, scope: !57, inlinedAt: !72) +!96 = !DILocation(line: 831, column: 52, scope: !57, inlinedAt: !72) +!97 = !DILocation(line: 831, column: 23, scope: !57, inlinedAt: !72) +!98 = !DILocation(line: 414, column: 19, scope: !57, inlinedAt: !72) +!99 = !DILocation(line: 415, column: 19, scope: !57, inlinedAt: !72) +!100 = !DILocation(line: 417, column: 19, scope: !57, inlinedAt: !72) +!101 = !DILocation(line: 459, column: 19, scope: !57, inlinedAt: !72) +!102 = !DILocation(line: 798, column: 21, scope: !57, inlinedAt: !72) +!103 = !DILocation(line: 490, column: 23, scope: !57, inlinedAt: !72) +!104 = !DILocation(line: 482, column: 23, scope: !57, inlinedAt: !72) +!105 = !DILocation(line: 531, column: 19, scope: !57, inlinedAt: !72) +!106 = !DILocation(line: 461, column: 14, scope: !57, inlinedAt: !72) +!107 = !DILocation(line: 493, column: 24, scope: !57, inlinedAt: !72) +!108 = !DILocation(line: 503, column: 25, scope: !57, inlinedAt: !72) +!109 = !DILocation(line: 504, column: 24, scope: !57, inlinedAt: !72) +!110 = !DILocation(line: 505, column: 24, scope: !57, inlinedAt: !72) +!111 = !DILocation(line: 506, column: 23, scope: !57, inlinedAt: !72) +!112 = !DILocation(line: 513, column: 39, scope: !57, inlinedAt: !72) +!113 = !DILocation(line: 514, column: 25, scope: !57, inlinedAt: !72) +!114 = !DILocation(line: 515, column: 24, scope: !57, inlinedAt: !72) +!115 = !DILocation(line: 516, column: 24, scope: !57, inlinedAt: !72) +!116 = !DILocation(line: 521, column: 69, scope: !57, inlinedAt: !72) +!117 = !DILocation(line: 524, column: 27, scope: !57, inlinedAt: !72) +!118 = !DILocation(line: 525, column: 39, scope: !57, inlinedAt: !72) +!119 = !DILocation(line: 525, column: 21, scope: !57, inlinedAt: !72) +!120 = !DILocation(line: 530, column: 20, scope: !57, inlinedAt: !72) +!121 = !DILocation(line: 531, column: 14, scope: !57, inlinedAt: !72) +!122 = !DILocation(line: 551, column: 15, scope: !57, inlinedAt: !72) +!123 = !DILocation(line: 549, column: 43, scope: !57, inlinedAt: !72) +!124 = !DILocation(line: 553, column: 21, scope: !57, inlinedAt: !72) +!125 = !DILocation(line: 788, column: 33, scope: !57, inlinedAt: !72) +!126 = !DILocation(line: 789, column: 38, scope: !57, inlinedAt: !72) +!127 = !DILocation(line: 789, column: 24, scope: !57, inlinedAt: !72) +!128 = !DILocation(line: 790, column: 109, scope: !57, inlinedAt: !72) +!129 = !DILocation(line: 790, column: 113, scope: !57, inlinedAt: !72) +!130 = !DILocation(line: 790, column: 55, scope: !57, inlinedAt: !72) +!131 = !DILocation(line: 790, column: 25, scope: !57, inlinedAt: !72) +!132 = !DILocation(line: 791, column: 35, scope: !57, inlinedAt: !72) +!133 = !DILocation(line: 792, column: 34, scope: !57, inlinedAt: !72) +!134 = !DILocation(line: 792, column: 48, scope: !57, inlinedAt: !72) +!135 = !DILocation(line: 792, column: 63, scope: !57, inlinedAt: !72) +!136 = !DILocation(line: 793, column: 29, scope: !57, inlinedAt: !72) +!137 = !DILocation(line: 793, column: 61, scope: !57, inlinedAt: !72) +!138 = !DILocation(line: 793, column: 42, scope: !57, inlinedAt: !72) +!139 = !DILocation(line: 414, column: 28, scope: !57, inlinedAt: !72) +!140 = !DILocation(line: 214, column: 39, scope: !5) +!141 = !DILocation(line: 215, column: 31, scope: !5) +!142 = !DILocation(line: 215, column: 45, scope: !5) +!143 = !DILocation(line: 216, column: 62, scope: !5) +!144 = !DILocation(line: 216, column: 43, scope: !5) +!145 = !DILocation(line: 218, column: 33, scope: !5) +!146 = !DILocation(line: 390, column: 37, scope: !57, inlinedAt: !147) +!147 = !DILocation(line: 226, column: 16, scope: !5) +!148 = !DILocation(line: 390, column: 18, scope: !57, inlinedAt: !147) +!149 = !DILocation(line: 390, column: 49, scope: !57, inlinedAt: !147) +!150 = !DILocation(line: 391, column: 18, scope: !57, inlinedAt: !147) +!151 = !DILocation(line: 391, column: 49, scope: !57, inlinedAt: !147) +!152 = !DILocation(line: 395, column: 43, scope: !57, inlinedAt: !147) +!153 = !DILocation(line: 395, column: 63, scope: !57, inlinedAt: !147) +!154 = !DILocation(line: 397, column: 28, scope: !57, inlinedAt: !147) +!155 = !DILocation(line: 831, column: 52, scope: !57, inlinedAt: !147) +!156 = !DILocation(line: 831, column: 23, scope: !57, inlinedAt: !147) +!157 = !DILocation(line: 414, column: 19, scope: !57, inlinedAt: !147) +!158 = !DILocation(line: 415, column: 19, scope: !57, inlinedAt: !147) +!159 = !DILocation(line: 417, column: 19, scope: !57, inlinedAt: !147) +!160 = !DILocation(line: 459, column: 19, scope: !57, inlinedAt: !147) +!161 = !DILocation(line: 461, column: 14, scope: !57, inlinedAt: !147) +!162 = !DILocation(line: 524, column: 27, scope: !57, inlinedAt: !147) +!163 = !DILocation(line: 476, column: 79, scope: !57, inlinedAt: !147) +!164 = !DILocation(line: 525, column: 39, scope: !57, inlinedAt: !147) +!165 = !DILocation(line: 525, column: 21, scope: !57, inlinedAt: !147) +!166 = !DILocation(line: 530, column: 20, scope: !57, inlinedAt: !147) +!167 = !DILocation(line: 531, column: 19, scope: !57, inlinedAt: !147) +!168 = !DILocation(line: 531, column: 14, scope: !57, inlinedAt: !147) +!169 = !DILocation(line: 551, column: 15, scope: !57, inlinedAt: !147) +!170 = !DILocation(line: 538, column: 71, scope: !57, inlinedAt: !147) +!171 = !DILocation(line: 553, column: 21, scope: !57, inlinedAt: !147) +!172 = !DILocation(line: 788, column: 33, scope: !57, inlinedAt: !147) +!173 = !DILocation(line: 789, column: 38, scope: !57, inlinedAt: !147) +!174 = !DILocation(line: 789, column: 24, scope: !57, inlinedAt: !147) +!175 = !DILocation(line: 790, column: 109, scope: !57, inlinedAt: !147) +!176 = !DILocation(line: 790, column: 113, scope: !57, inlinedAt: !147) +!177 = !DILocation(line: 790, column: 55, scope: !57, inlinedAt: !147) +!178 = !DILocation(line: 790, column: 25, scope: !57, inlinedAt: !147) +!179 = !DILocation(line: 791, column: 35, scope: !57, inlinedAt: !147) +!180 = !DILocation(line: 792, column: 34, scope: !57, inlinedAt: !147) +!181 = !DILocation(line: 792, column: 48, scope: !57, inlinedAt: !147) +!182 = !DILocation(line: 792, column: 63, scope: !57, inlinedAt: !147) +!183 = !DILocation(line: 793, column: 29, scope: !57, inlinedAt: !147) +!184 = !DILocation(line: 793, column: 61, scope: !57, inlinedAt: !147) +!185 = !DILocation(line: 793, column: 42, scope: !57, inlinedAt: !147) +!186 = !DILocation(line: 414, column: 28, scope: !57, inlinedAt: !147) +!187 = !DILocation(line: 231, column: 24, scope: !5) +!188 = !DILocation(line: 231, column: 56, scope: !5) +!189 = !DILocation(line: 232, column: 14, scope: !5) +!190 = !DILocation(line: 236, column: 30, scope: !5) +!191 = !DILocation(line: 252, column: 25, scope: !5) +!192 = !DILocation(line: 253, column: 29, scope: !5) +!193 = !DILocation(line: 825, column: 38, scope: !57, inlinedAt: !194) +!194 = !DILocation(line: 256, column: 107, scope: !5) +!195 = !DILocation(line: 825, column: 20, scope: !57, inlinedAt: !194) +!196 = !DILocation(line: 825, column: 56, scope: !57, inlinedAt: !194) +!197 = !DILocation(line: 825, column: 49, scope: !57, inlinedAt: !194) +!198 = !DILocation(line: 833, column: 52, scope: !57, inlinedAt: !194) +!199 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !194) +!200 = !DILocation(line: 825, column: 20, scope: !57, inlinedAt: !201) +!201 = !DILocation(line: 257, column: 107, scope: !5) +!202 = !DILocation(line: 825, column: 49, scope: !57, inlinedAt: !201) +!203 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !201) +!204 = !DILocation(line: 263, column: 32, scope: !5) +!205 = !DILocation(line: 266, column: 56, scope: !5) +!206 = !DILocation(line: 281, column: 58, scope: !5) +!207 = !DILocation(line: 281, column: 80, scope: !5) +!208 = !DILocation(line: 282, column: 53, scope: !5) +!209 = !DILocation(line: 282, column: 70, scope: !5) +!210 = !DILocation(line: 286, column: 32, scope: !5) +!211 = !DILocation(line: 287, column: 30, scope: !5) +!212 = !DILocation(line: 287, column: 43, scope: !5) +!213 = !DILocation(line: 288, column: 55, scope: !5) +!214 = !DILocation(line: 288, column: 42, scope: !5) +!215 = !DILocation(line: 290, column: 45, scope: !5) +!216 = !DILocation(line: 290, column: 32, scope: !5) +!217 = !DILocation(line: 601, column: 37, scope: !57, inlinedAt: !218) +!218 = !DILocation(line: 298, column: 16, scope: !5) +!219 = !DILocation(line: 602, column: 38, scope: !57, inlinedAt: !218) +!220 = !DILocation(line: 608, column: 42, scope: !57, inlinedAt: !218) +!221 = !DILocation(line: 608, column: 61, scope: !57, inlinedAt: !218) +!222 = !DILocation(line: 701, column: 35, scope: !57, inlinedAt: !218) +!223 = !DILocation(line: 610, column: 28, scope: !57, inlinedAt: !218) +!224 = !DILocation(line: 701, column: 24, scope: !57, inlinedAt: !218) +!225 = !DILocation(line: 798, column: 21, scope: !57, inlinedAt: !218) +!226 = !DILocation(line: 709, column: 25, scope: !57, inlinedAt: !218) +!227 = !DILocation(line: 719, column: 25, scope: !57, inlinedAt: !218) +!228 = !DILocation(line: 720, column: 24, scope: !57, inlinedAt: !218) +!229 = !DILocation(line: 721, column: 24, scope: !57, inlinedAt: !218) +!230 = !DILocation(line: 306, column: 41, scope: !5) +!231 = !DILocation(line: 307, column: 34, scope: !5) +!232 = !DILocation(line: 307, column: 47, scope: !5) +!233 = !DILocation(line: 308, column: 64, scope: !5) +!234 = !DILocation(line: 308, column: 46, scope: !5) +!235 = !DILocation(line: 310, column: 36, scope: !5) +!236 = !DILocation(line: 601, column: 37, scope: !57, inlinedAt: !237) +!237 = !DILocation(line: 318, column: 20, scope: !5) +!238 = !DILocation(line: 602, column: 38, scope: !57, inlinedAt: !237) +!239 = !DILocation(line: 608, column: 42, scope: !57, inlinedAt: !237) +!240 = !DILocation(line: 608, column: 61, scope: !57, inlinedAt: !237) +!241 = !DILocation(line: 676, column: 20, scope: !57, inlinedAt: !218) +!242 = !DILocation(line: 262, column: 30, scope: !5) +!243 = !DILocation(line: 263, column: 51, scope: !5) +!244 = !DILocation(line: 266, column: 44, scope: !5) +!245 = !DILocation(line: 266, column: 67, scope: !5) +!246 = !DILocation(line: 267, column: 36, scope: !5) +!247 = !DILocation(line: 267, column: 46, scope: !5) +!248 = !DILocation(line: 267, column: 70, scope: !5) +!249 = !DILocation(line: 269, column: 50, scope: !5) +!250 = !DILocation(line: 269, column: 60, scope: !5) +!251 = !DILocation(line: 271, column: 21, scope: !5) +!252 = !DILocation(line: 272, column: 23, scope: !5) +!253 = !DILocation(line: 275, column: 25, scope: !5) +!254 = !DILocation(line: 276, column: 29, scope: !5) +!255 = !DILocation(line: 601, column: 18, scope: !57, inlinedAt: !218) +!256 = !DILocation(line: 601, column: 49, scope: !57, inlinedAt: !218) +!257 = !DILocation(line: 602, column: 19, scope: !57, inlinedAt: !218) +!258 = !DILocation(line: 602, column: 51, scope: !57, inlinedAt: !218) +!259 = !DILocation(line: 831, column: 23, scope: !57, inlinedAt: !218) +!260 = !DILocation(line: 674, column: 28, scope: !57, inlinedAt: !218) +!261 = !DILocation(line: 674, column: 22, scope: !57, inlinedAt: !218) +!262 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !218) +!263 = !DILocation(line: 748, column: 29, scope: !57, inlinedAt: !218) +!264 = !DILocation(line: 748, column: 21, scope: !57, inlinedAt: !218) +!265 = !DILocation(line: 626, column: 19, scope: !57, inlinedAt: !218) +!266 = !DILocation(line: 627, column: 19, scope: !57, inlinedAt: !218) +!267 = !DILocation(line: 831, column: 52, scope: !57, inlinedAt: !218) +!268 = !DILocation(line: 675, column: 26, scope: !57, inlinedAt: !218) +!269 = !DILocation(line: 675, column: 46, scope: !57, inlinedAt: !218) +!270 = !DILocation(line: 678, column: 15, scope: !57, inlinedAt: !218) +!271 = !DILocation(line: 698, column: 25, scope: !57, inlinedAt: !218) +!272 = !DILocation(line: 703, column: 25, scope: !57, inlinedAt: !218) +!273 = !DILocation(line: 704, column: 24, scope: !57, inlinedAt: !218) +!274 = !DILocation(line: 706, column: 24, scope: !57, inlinedAt: !218) +!275 = !DILocation(line: 722, column: 24, scope: !57, inlinedAt: !218) +!276 = !DILocation(line: 729, column: 39, scope: !57, inlinedAt: !218) +!277 = !DILocation(line: 730, column: 25, scope: !57, inlinedAt: !218) +!278 = !DILocation(line: 731, column: 24, scope: !57, inlinedAt: !218) +!279 = !DILocation(line: 732, column: 24, scope: !57, inlinedAt: !218) +!280 = !DILocation(line: 736, column: 69, scope: !57, inlinedAt: !218) +!281 = !DILocation(line: 739, column: 27, scope: !57, inlinedAt: !218) +!282 = !DILocation(line: 740, column: 40, scope: !57, inlinedAt: !218) +!283 = !DILocation(line: 740, column: 22, scope: !57, inlinedAt: !218) +!284 = !DILocation(line: 744, column: 24, scope: !57, inlinedAt: !218) +!285 = !DILocation(line: 744, column: 43, scope: !57, inlinedAt: !218) +!286 = !DILocation(line: 750, column: 20, scope: !57, inlinedAt: !218) +!287 = !DILocation(line: 751, column: 22, scope: !57, inlinedAt: !218) +!288 = !DILocation(line: 751, column: 16, scope: !57, inlinedAt: !218) +!289 = !DILocation(line: 775, column: 24, scope: !57, inlinedAt: !218) +!290 = !DILocation(line: 773, column: 45, scope: !57, inlinedAt: !218) +!291 = !DILocation(line: 775, column: 43, scope: !57, inlinedAt: !218) +!292 = !DILocation(line: 628, column: 19, scope: !57, inlinedAt: !218) +!293 = !DILocation(line: 788, column: 33, scope: !57, inlinedAt: !218) +!294 = !DILocation(line: 789, column: 38, scope: !57, inlinedAt: !218) +!295 = !DILocation(line: 789, column: 24, scope: !57, inlinedAt: !218) +!296 = !DILocation(line: 790, column: 109, scope: !57, inlinedAt: !218) +!297 = !DILocation(line: 790, column: 113, scope: !57, inlinedAt: !218) +!298 = !DILocation(line: 790, column: 55, scope: !57, inlinedAt: !218) +!299 = !DILocation(line: 790, column: 25, scope: !57, inlinedAt: !218) +!300 = !DILocation(line: 791, column: 35, scope: !57, inlinedAt: !218) +!301 = !DILocation(line: 792, column: 34, scope: !57, inlinedAt: !218) +!302 = !DILocation(line: 792, column: 48, scope: !57, inlinedAt: !218) +!303 = !DILocation(line: 792, column: 63, scope: !57, inlinedAt: !218) +!304 = !DILocation(line: 793, column: 29, scope: !57, inlinedAt: !218) +!305 = !DILocation(line: 793, column: 61, scope: !57, inlinedAt: !218) +!306 = !DILocation(line: 793, column: 42, scope: !57, inlinedAt: !218) +!307 = !DILocation(line: 626, column: 28, scope: !57, inlinedAt: !218) +!308 = !DILocation(line: 627, column: 28, scope: !57, inlinedAt: !218) +!309 = !DILocation(line: 674, column: 52, scope: !57, inlinedAt: !218) +!310 = !DILocation(line: 833, column: 52, scope: !57, inlinedAt: !218) +!311 = !DILocation(line: 601, column: 18, scope: !57, inlinedAt: !237) +!312 = !DILocation(line: 601, column: 49, scope: !57, inlinedAt: !237) +!313 = !DILocation(line: 602, column: 19, scope: !57, inlinedAt: !237) +!314 = !DILocation(line: 602, column: 51, scope: !57, inlinedAt: !237) +!315 = !DILocation(line: 831, column: 23, scope: !57, inlinedAt: !237) +!316 = !DILocation(line: 674, column: 28, scope: !57, inlinedAt: !237) +!317 = !DILocation(line: 674, column: 22, scope: !57, inlinedAt: !237) +!318 = !DILocation(line: 833, column: 23, scope: !57, inlinedAt: !237) +!319 = !DILocation(line: 748, column: 29, scope: !57, inlinedAt: !237) +!320 = !DILocation(line: 748, column: 21, scope: !57, inlinedAt: !237) +!321 = !DILocation(line: 626, column: 19, scope: !57, inlinedAt: !237) +!322 = !DILocation(line: 627, column: 19, scope: !57, inlinedAt: !237) +!323 = !DILocation(line: 610, column: 28, scope: !57, inlinedAt: !237) +!324 = !DILocation(line: 831, column: 52, scope: !57, inlinedAt: !237) +!325 = !DILocation(line: 675, column: 26, scope: !57, inlinedAt: !237) +!326 = !DILocation(line: 675, column: 46, scope: !57, inlinedAt: !237) +!327 = !DILocation(line: 676, column: 20, scope: !57, inlinedAt: !237) +!328 = !DILocation(line: 678, column: 15, scope: !57, inlinedAt: !237) +!329 = !DILocation(line: 739, column: 27, scope: !57, inlinedAt: !237) +!330 = !DILocation(line: 692, column: 78, scope: !57, inlinedAt: !237) +!331 = !DILocation(line: 740, column: 40, scope: !57, inlinedAt: !237) +!332 = !DILocation(line: 740, column: 22, scope: !57, inlinedAt: !237) +!333 = !DILocation(line: 744, column: 24, scope: !57, inlinedAt: !237) +!334 = !DILocation(line: 744, column: 43, scope: !57, inlinedAt: !237) +!335 = !DILocation(line: 750, column: 20, scope: !57, inlinedAt: !237) +!336 = !DILocation(line: 751, column: 22, scope: !57, inlinedAt: !237) +!337 = !DILocation(line: 751, column: 16, scope: !57, inlinedAt: !237) +!338 = !DILocation(line: 775, column: 24, scope: !57, inlinedAt: !237) +!339 = !DILocation(line: 759, column: 70, scope: !57, inlinedAt: !237) +!340 = !DILocation(line: 775, column: 43, scope: !57, inlinedAt: !237) +!341 = !DILocation(line: 628, column: 19, scope: !57, inlinedAt: !237) +!342 = !DILocation(line: 788, column: 33, scope: !57, inlinedAt: !237) +!343 = !DILocation(line: 789, column: 38, scope: !57, inlinedAt: !237) +!344 = !DILocation(line: 789, column: 24, scope: !57, inlinedAt: !237) +!345 = !DILocation(line: 790, column: 109, scope: !57, inlinedAt: !237) +!346 = !DILocation(line: 790, column: 113, scope: !57, inlinedAt: !237) +!347 = !DILocation(line: 790, column: 55, scope: !57, inlinedAt: !237) +!348 = !DILocation(line: 790, column: 25, scope: !57, inlinedAt: !237) +!349 = !DILocation(line: 791, column: 35, scope: !57, inlinedAt: !237) +!350 = !DILocation(line: 792, column: 34, scope: !57, inlinedAt: !237) +!351 = !DILocation(line: 792, column: 48, scope: !57, inlinedAt: !237) +!352 = !DILocation(line: 792, column: 63, scope: !57, inlinedAt: !237) +!353 = !DILocation(line: 793, column: 29, scope: !57, inlinedAt: !237) +!354 = !DILocation(line: 793, column: 61, scope: !57, inlinedAt: !237) +!355 = !DILocation(line: 793, column: 42, scope: !57, inlinedAt: !237) +!356 = !DILocation(line: 626, column: 28, scope: !57, inlinedAt: !237) +!357 = !DILocation(line: 627, column: 28, scope: !57, inlinedAt: !237) +!358 = !DILocation(line: 674, column: 52, scope: !57, inlinedAt: !237) +!359 = !DILocation(line: 833, column: 52, scope: !57, inlinedAt: !237) +!360 = !DILocation(line: 323, column: 23, scope: !5) +!361 = !DILocation(line: 323, column: 55, scope: !5) +!362 = !DILocation(line: 332, column: 30, scope: !5) +!363 = !DILocation(line: 334, column: 14, scope: !5) +!364 = !DILocation(line: 344, column: 27, scope: !5) +!365 = !DILocation(line: 344, column: 45, scope: !5) +!366 = !DILocation(line: 344, column: 53, scope: !5) +!367 = !DILocation(line: 344, column: 41, scope: !5) +!368 = !DILocation(line: 344, column: 64, scope: !5) +!369 = !DILocation(line: 344, column: 71, scope: !5) +!370 = !DILocation(line: 344, column: 59, scope: !5) +!371 = !DILocation(line: 345, column: 29, scope: !5) +!372 = !DILocation(line: 345, column: 69, scope: !5) +!373 = !DILocation(line: 139, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2009f2a77286114ef8c34778db457936559d0953 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ptx @@ -0,0 +1,8513 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_zeros_1 // -- Begin function triton_tem_fused_zeros_1 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_zeros_1 +.visible .entry triton_tem_fused_zeros_1( + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_10, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_11, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_14, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_15, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_17, + .param .u32 triton_tem_fused_zeros_1_param_18, + .param .u32 triton_tem_fused_zeros_1_param_19, + .param .u32 triton_tem_fused_zeros_1_param_20, + .param .u32 triton_tem_fused_zeros_1_param_21, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_22, + .param .u64 .ptr .global .align 1 triton_tem_fused_zeros_1_param_23 +) +.reqntid 256 +{ + .reg .pred %p<1002>; + .reg .b16 %rs<378>; + .reg .b32 %r<14683>; + .reg .b64 %rd<1022>; + .loc 1 18 0 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:18:0 + +// %bb.0: + ld.param.b32 %r2123, [triton_tem_fused_zeros_1_param_18]; + ld.param.b64 %rd170, [triton_tem_fused_zeros_1_param_16]; + ld.param.b64 %rd160, [triton_tem_fused_zeros_1_param_5]; + ld.param.b64 %rd159, [triton_tem_fused_zeros_1_param_4]; + ld.param.b64 %rd158, [triton_tem_fused_zeros_1_param_3]; + ld.param.b64 %rd157, [triton_tem_fused_zeros_1_param_0]; +$L__tmp0: + .loc 1 95 54 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:95:54 + shl.b32 %r2127, %r2123, 10; + ld.param.b64 %rd172, [triton_tem_fused_zeros_1_param_1]; + ld.param.b64 %rd173, [triton_tem_fused_zeros_1_param_2]; + .loc 1 111 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:111:24 + mov.u32 %r1, %ctaid.x; +$L__tmp1: + .loc 2 41 22 // standard.py:41:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:112:36 ] + add.s32 %r2128, %r2123, 127; + .loc 2 41 28 // standard.py:41:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:112:36 ] + shr.s32 %r2129, %r2128, 31; + shr.u32 %r2130, %r2129, 25; + add.s32 %r2131, %r2128, %r2130; + shr.s32 %r2, %r2131, 7; +$L__tmp2: + .loc 1 115 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:115:27 + mov.u32 %r3, %ctaid.y; + .loc 1 116 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:116:28 + mov.u32 %r4, %ctaid.z; + .loc 1 117 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:117:23 + and.b32 %r5, %r3, 7; + .loc 1 124 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:124:25 + mul.lo.s32 %r2132, %r2123, %r4; + shl.b32 %r2133, %r2132, 7; + .loc 1 124 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:124:35 + mad.lo.s32 %r2134, %r2127, %r5, %r2133; + .loc 1 131 9 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:131:9 + mul.wide.s32 %rd175, %r2134, 2; + add.s64 %rd1, %rd172, %rd175; + .loc 1 132 9 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:132:9 + add.s64 %rd2, %rd173, %rd175; + .loc 1 136 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:136:26 + mov.u32 %r7, %tid.x; + shr.u32 %r8, %r7, 5; + and.b32 %r9, %r7, 240; + bfe.u32 %r10, %r7, 4, 4; + or.b32 %r11, %r10, 16; + or.b32 %r12, %r10, 32; + or.b32 %r13, %r10, 48; + or.b32 %r14, %r10, 64; + or.b32 %r15, %r10, 80; + or.b32 %r16, %r10, 96; + or.b32 %r17, %r10, 112; + and.b32 %r18, %r7, 224; + .loc 1 139 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:139:14 + setp.lt.s32 %p21, %r1, %r2; + .loc 1 139 7 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:139:7 + @%p21 bra $L__BB0_8; + bra.uni $L__BB0_1; +$L__BB0_8: + .loc 1 0 7 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:7 + ld.param.b32 %r2126, [triton_tem_fused_zeros_1_param_21]; + ld.param.b32 %r2125, [triton_tem_fused_zeros_1_param_20]; + ld.param.b64 %rd171, [triton_tem_fused_zeros_1_param_17]; + ld.param.b64 %rd169, [triton_tem_fused_zeros_1_param_15]; + ld.param.b64 %rd168, [triton_tem_fused_zeros_1_param_14]; + ld.param.b64 %rd165, [triton_tem_fused_zeros_1_param_11]; + ld.param.b64 %rd164, [triton_tem_fused_zeros_1_param_10]; + ld.param.b64 %rd174, [triton_tem_fused_zeros_1_param_7]; + mad.lo.s32 %r6, %r2127, %r3, %r2133; + mad.wide.s32 %rd3, %r6, 2, %rd174; + .loc 1 136 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:136:26 + shr.u32 %r7096, %r9, 2; + .loc 1 252 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:252:25 + shl.b32 %r7097, %r1, 7; + .loc 1 253 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:253:29 + or.b32 %r664, %r10, %r7097; + or.b32 %r665, %r11, %r7097; + or.b32 %r666, %r12, %r7097; + or.b32 %r667, %r13, %r7097; + or.b32 %r668, %r14, %r7097; + or.b32 %r669, %r15, %r7097; + or.b32 %r670, %r16, %r7097; + or.b32 %r671, %r17, %r7097; +$L__tmp3: + .loc 1 825 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + shl.b32 %r7098, %r664, 7; + shl.b32 %r7099, %r665, 7; + shl.b32 %r7100, %r666, 7; + shl.b32 %r7101, %r667, 7; + shl.b32 %r7102, %r668, 7; + shl.b32 %r7103, %r669, 7; + shl.b32 %r7104, %r670, 7; + shl.b32 %r7105, %r671, 7; + .loc 1 825 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + cvt.s64.s32 %rd70, %r7098; + mul.wide.s32 %rd523, %r7098, 2; + add.s64 %rd524, %rd1, %rd523; + cvt.s64.s32 %rd71, %r7099; + mul.wide.s32 %rd525, %r7099, 2; + add.s64 %rd526, %rd1, %rd525; + cvt.s64.s32 %rd72, %r7100; + mul.wide.s32 %rd527, %r7100, 2; + add.s64 %rd528, %rd1, %rd527; + cvt.s64.s32 %rd73, %r7101; + mul.wide.s32 %rd529, %r7101, 2; + add.s64 %rd530, %rd1, %rd529; + cvt.s64.s32 %rd74, %r7102; + mul.wide.s32 %rd531, %r7102, 2; + add.s64 %rd532, %rd1, %rd531; + cvt.s64.s32 %rd75, %r7103; + mul.wide.s32 %rd533, %r7103, 2; + add.s64 %rd534, %rd1, %rd533; + cvt.s64.s32 %rd76, %r7104; + mul.wide.s32 %rd535, %r7104, 2; + add.s64 %rd536, %rd1, %rd535; + cvt.s64.s32 %rd77, %r7105; + mul.wide.s32 %rd537, %r7105, 2; + add.s64 %rd538, %rd1, %rd537; + .loc 1 825 56 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:56 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + and.b32 %r7106, %r7, 15; + shl.b32 %r7107, %r7106, 3; + .loc 1 825 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + cvt.u64.u32 %rd78, %r7107; + mul.wide.u32 %rd539, %r7107, 2; + add.s64 %rd500, %rd524, %rd539; + add.s64 %rd501, %rd526, %rd539; + add.s64 %rd502, %rd528, %rd539; + add.s64 %rd503, %rd530, %rd539; + add.s64 %rd504, %rd532, %rd539; + add.s64 %rd505, %rd534, %rd539; + add.s64 %rd506, %rd536, %rd539; + add.s64 %rd507, %rd538, %rd539; + .loc 1 833 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + setp.lt.s32 %p986, %r664, %r2123; + setp.lt.s32 %p987, %r665, %r2123; + setp.lt.s32 %p988, %r666, %r2123; + setp.lt.s32 %p989, %r667, %r2123; + setp.lt.s32 %p990, %r668, %r2123; + setp.lt.s32 %p991, %r669, %r2123; + setp.lt.s32 %p992, %r670, %r2123; + setp.lt.s32 %p993, %r671, %r2123; + mov.b32 %r6967, 0; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:256:107 ] + // begin inline asm + mov.u32 %r6963, %r6967; + mov.u32 %r6964, %r6967; + mov.u32 %r6965, %r6967; + mov.u32 %r6966, %r6967; + @%p986 ld.global.v4.b32 { %r6963, %r6964, %r6965, %r6966 }, [ %rd500 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r6971, %r6967; + mov.u32 %r6972, %r6967; + mov.u32 %r6973, %r6967; + mov.u32 %r6974, %r6967; + @%p987 ld.global.v4.b32 { %r6971, %r6972, %r6973, %r6974 }, [ %rd501 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r6979, %r6967; + mov.u32 %r6980, %r6967; + mov.u32 %r6981, %r6967; + mov.u32 %r6982, %r6967; + @%p988 ld.global.v4.b32 { %r6979, %r6980, %r6981, %r6982 }, [ %rd502 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r6987, %r6967; + mov.u32 %r6988, %r6967; + mov.u32 %r6989, %r6967; + mov.u32 %r6990, %r6967; + @%p989 ld.global.v4.b32 { %r6987, %r6988, %r6989, %r6990 }, [ %rd503 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r6995, %r6967; + mov.u32 %r6996, %r6967; + mov.u32 %r6997, %r6967; + mov.u32 %r6998, %r6967; + @%p990 ld.global.v4.b32 { %r6995, %r6996, %r6997, %r6998 }, [ %rd504 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7003, %r6967; + mov.u32 %r7004, %r6967; + mov.u32 %r7005, %r6967; + mov.u32 %r7006, %r6967; + @%p991 ld.global.v4.b32 { %r7003, %r7004, %r7005, %r7006 }, [ %rd505 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7011, %r6967; + mov.u32 %r7012, %r6967; + mov.u32 %r7013, %r6967; + mov.u32 %r7014, %r6967; + @%p992 ld.global.v4.b32 { %r7011, %r7012, %r7013, %r7014 }, [ %rd506 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7019, %r6967; + mov.u32 %r7020, %r6967; + mov.u32 %r7021, %r6967; + mov.u32 %r7022, %r6967; + @%p993 ld.global.v4.b32 { %r7019, %r7020, %r7021, %r7022 }, [ %rd507 + 0 ]; + // end inline asm + shl.b32 %r7108, %r7, 4; + and.b32 %r7109, %r7108, 112; + shl.b32 %r7110, %r9, 3; + and.b32 %r7111, %r7, 112; + and.b32 %r7112, %r7, 8; + shl.b32 %r7113, %r7112, 11; + or.b32 %r7114, %r7109, %r7110; + xor.b32 %r7115, %r7114, %r7111; + or.b32 %r7116, %r7115, %r7113; + mov.b32 %r7117, global_smem; + add.s32 %r7118, %r7117, %r7116; + st.shared.v4.b32 [%r7118+107520], {%r6963, %r6964, %r6965, %r6966}; + st.shared.v4.b32 [%r7118+109568], {%r6971, %r6972, %r6973, %r6974}; + st.shared.v4.b32 [%r7118+111616], {%r6979, %r6980, %r6981, %r6982}; + st.shared.v4.b32 [%r7118+113664], {%r6987, %r6988, %r6989, %r6990}; + st.shared.v4.b32 [%r7118+115712], {%r6995, %r6996, %r6997, %r6998}; + st.shared.v4.b32 [%r7118+117760], {%r7003, %r7004, %r7005, %r7006}; + st.shared.v4.b32 [%r7118+119808], {%r7011, %r7012, %r7013, %r7014}; + st.shared.v4.b32 [%r7118+121856], {%r7019, %r7020, %r7021, %r7022}; +$L__tmp4: + .loc 1 825 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:257:107 ] + add.s64 %rd540, %rd2, %rd523; + add.s64 %rd541, %rd2, %rd525; + add.s64 %rd542, %rd2, %rd527; + add.s64 %rd543, %rd2, %rd529; + add.s64 %rd544, %rd2, %rd531; + add.s64 %rd545, %rd2, %rd533; + add.s64 %rd546, %rd2, %rd535; + add.s64 %rd547, %rd2, %rd537; + .loc 1 825 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:257:107 ] + add.s64 %rd508, %rd540, %rd539; + add.s64 %rd509, %rd541, %rd539; + add.s64 %rd510, %rd542, %rd539; + add.s64 %rd511, %rd543, %rd539; + add.s64 %rd512, %rd544, %rd539; + add.s64 %rd513, %rd545, %rd539; + add.s64 %rd514, %rd546, %rd539; + add.s64 %rd515, %rd547, %rd539; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:257:107 ] + // begin inline asm + mov.u32 %r7027, %r6967; + mov.u32 %r7028, %r6967; + mov.u32 %r7029, %r6967; + mov.u32 %r7030, %r6967; + @%p986 ld.global.v4.b32 { %r7027, %r7028, %r7029, %r7030 }, [ %rd508 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7035, %r6967; + mov.u32 %r7036, %r6967; + mov.u32 %r7037, %r6967; + mov.u32 %r7038, %r6967; + @%p987 ld.global.v4.b32 { %r7035, %r7036, %r7037, %r7038 }, [ %rd509 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7043, %r6967; + mov.u32 %r7044, %r6967; + mov.u32 %r7045, %r6967; + mov.u32 %r7046, %r6967; + @%p988 ld.global.v4.b32 { %r7043, %r7044, %r7045, %r7046 }, [ %rd510 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7051, %r6967; + mov.u32 %r7052, %r6967; + mov.u32 %r7053, %r6967; + mov.u32 %r7054, %r6967; + @%p989 ld.global.v4.b32 { %r7051, %r7052, %r7053, %r7054 }, [ %rd511 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7059, %r6967; + mov.u32 %r7060, %r6967; + mov.u32 %r7061, %r6967; + mov.u32 %r7062, %r6967; + @%p990 ld.global.v4.b32 { %r7059, %r7060, %r7061, %r7062 }, [ %rd512 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7067, %r6967; + mov.u32 %r7068, %r6967; + mov.u32 %r7069, %r6967; + mov.u32 %r7070, %r6967; + @%p991 ld.global.v4.b32 { %r7067, %r7068, %r7069, %r7070 }, [ %rd513 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7075, %r6967; + mov.u32 %r7076, %r6967; + mov.u32 %r7077, %r6967; + mov.u32 %r7078, %r6967; + @%p992 ld.global.v4.b32 { %r7075, %r7076, %r7077, %r7078 }, [ %rd514 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7083, %r6967; + mov.u32 %r7084, %r6967; + mov.u32 %r7085, %r6967; + mov.u32 %r7086, %r6967; + @%p993 ld.global.v4.b32 { %r7083, %r7084, %r7085, %r7086 }, [ %rd515 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r7118+140288], {%r7027, %r7028, %r7029, %r7030}; + st.shared.v4.b32 [%r7118+142336], {%r7035, %r7036, %r7037, %r7038}; + st.shared.v4.b32 [%r7118+144384], {%r7043, %r7044, %r7045, %r7046}; + st.shared.v4.b32 [%r7118+146432], {%r7051, %r7052, %r7053, %r7054}; + st.shared.v4.b32 [%r7118+148480], {%r7059, %r7060, %r7061, %r7062}; + st.shared.v4.b32 [%r7118+150528], {%r7067, %r7068, %r7069, %r7070}; + st.shared.v4.b32 [%r7118+152576], {%r7075, %r7076, %r7077, %r7078}; + st.shared.v4.b32 [%r7118+154624], {%r7083, %r7084, %r7085, %r7086}; +$L__tmp5: + .loc 1 266 56 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:266:56 + shl.b32 %r672, %r3, 23; + .loc 1 281 80 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:281:80 + mad.lo.s32 %r7119, %r2125, %r5, %r1; + .loc 1 282 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:282:70 + mad.lo.s32 %r7120, %r2126, %r5, %r1; + shl.b32 %r7121, %r7120, 4; + .loc 1 286 32 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:286:32 + mul.wide.s32 %rd548, %r7121, 4; + add.s64 %rd516, %rd165, %rd548; + mov.pred %p420, -1; + .loc 1 287 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:287:30 + // begin inline asm + mov.u32 %r7091, 0x0; + @%p420 ld.global.b32 { %r7091 }, [ %rd516 + 0 ]; + // end inline asm + .loc 1 287 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:287:43 + shl.b32 %r673, %r7091, 7; + .loc 1 288 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:288:55 + mul.wide.s32 %rd549, %r7119, 4; + add.s64 %rd517, %rd164, %rd549; + .loc 1 288 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:288:42 + // begin inline asm + mov.u32 %r7092, 0x0; + @%p420 ld.global.b32 { %r7092 }, [ %rd517 + 0 ]; + // end inline asm + .loc 1 290 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:290:45 + and.b32 %r675, %r7, 3; + shl.b32 %r676, %r675, 1; + or.b32 %r7122, %r676, 8; + or.b32 %r7123, %r676, 16; + or.b32 %r7124, %r676, 24; + or.b32 %r7125, %r676, 32; + or.b32 %r7126, %r676, 40; + or.b32 %r7127, %r676, 48; + or.b32 %r7128, %r676, 56; + shl.b32 %r7129, %r7106, 2; + .loc 1 290 32 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:290:32 + or.b32 %r677, %r673, %r676; + or.b32 %r678, %r673, %r7122; + or.b32 %r679, %r673, %r7123; + or.b32 %r680, %r673, %r7124; + or.b32 %r681, %r673, %r7125; + or.b32 %r682, %r673, %r7126; + or.b32 %r683, %r673, %r7127; + or.b32 %r684, %r673, %r7128; + or.b32 %r839, %r673, %r7129; + or.b32 %r840, %r839, 1; + or.b32 %r686, %r839, 3; + or.b32 %r685, %r839, 2; + or.b32 %r7130, %r673, %r10; + or.b32 %r7131, %r673, %r11; + or.b32 %r7132, %r673, %r12; + or.b32 %r7133, %r673, %r13; +$L__tmp6: + .loc 1 601 37 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:37 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r7134, %r7130, 12; + shl.b32 %r7135, %r7131, 12; + shl.b32 %r7136, %r7132, 12; + shl.b32 %r7137, %r7133, 12; + .loc 1 602 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r7138, %r7130, 7; + shl.b32 %r7139, %r7131, 7; + shl.b32 %r7140, %r7132, 7; + shl.b32 %r7141, %r7133, 7; + .loc 1 608 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:608:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r687, %r7092, 1; + .loc 1 608 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:608:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + min.s32 %r7142, %r687, 32; + .loc 1 701 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:701:35 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mad.wide.u32 %rd519, %r3, 8, %rd170; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.gt.s32 %p422, %r687, 0; + .loc 1 701 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:701:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + mov.u64 %rd518, 0x0; + @%p422 ld.global.b64 { %rd518 }, [ %rd519 + 0 ]; + // end inline asm +$L__tmp7: + .loc 1 253 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:253:29 + or.b32 %r7143, %r7096, %r7097; +$L__tmp8: + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.b32 %r7144, %r7143, 67; + or.b32 %r7145, %r7143, 66; + or.b32 %r7146, %r7143, 65; + or.b32 %r7147, %r7143, 64; + or.b32 %r7148, %r7143, 3; + or.b32 %r7149, %r7143, 2; + or.b32 %r7150, %r7143, 1; + rem.s32 %r842, %r7143, %r2123; + rem.s32 %r843, %r7150, %r2123; + rem.s32 %r846, %r7149, %r2123; + rem.s32 %r847, %r7148, %r2123; + rem.s32 %r850, %r7147, %r2123; + rem.s32 %r851, %r7146, %r2123; + rem.s32 %r854, %r7145, %r2123; + rem.s32 %r855, %r7144, %r2123; + .loc 1 709 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:709:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.gt.u32 %p425, %r855, 2047; + setp.gt.u32 %p426, %r854, 2047; + setp.gt.u32 %p427, %r851, 2047; + setp.gt.u32 %p428, %r850, 2047; + setp.gt.u32 %p429, %r847, 2047; + setp.gt.u32 %p430, %r846, 2047; + setp.gt.u32 %p431, %r843, 2047; + setp.gt.u32 %p432, %r842, 2047; + .loc 1 719 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:719:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + prmt.b32 %r7151, %r843, %r842, 0x5410U; + prmt.b32 %r7152, %r847, %r846, 0x5410U; + prmt.b32 %r7153, %r851, %r850, 0x5410U; + prmt.b32 %r7154, %r855, %r854, 0x5410U; + and.b32 %r7155, %r7154, 134154239; + and.b32 %r7156, %r7153, 134154239; + and.b32 %r7157, %r7152, 134154239; + and.b32 %r7158, %r7151, 134154239; + .loc 1 720 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:720:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mov.b32 {%rs145, %rs146}, %r7158; + cvt.u64.u16 %rd550, %rs146; + cvt.u64.u16 %rd551, %rs145; + mov.b32 {%rs147, %rs148}, %r7157; + cvt.u64.u16 %rd552, %rs148; + cvt.u64.u16 %rd553, %rs147; + mov.b32 {%rs149, %rs150}, %r7156; + cvt.u64.u16 %rd554, %rs150; + cvt.u64.u16 %rd555, %rs149; + mov.b32 {%rs151, %rs152}, %r7155; + cvt.u64.u16 %rd556, %rs152; + cvt.u64.u16 %rd557, %rs151; + setp.gt.s64 %p433, %rd518, %rd557; + setp.gt.s64 %p434, %rd518, %rd556; + setp.gt.s64 %p435, %rd518, %rd555; + setp.gt.s64 %p436, %rd518, %rd554; + setp.gt.s64 %p437, %rd518, %rd553; + setp.gt.s64 %p438, %rd518, %rd552; + setp.gt.s64 %p439, %rd518, %rd551; + setp.gt.s64 %p440, %rd518, %rd550; + .loc 1 721 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:721:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p5, %p432, %p440; + and.pred %p7, %p431, %p439; + and.pred %p9, %p430, %p438; + and.pred %p11, %p429, %p437; + and.pred %p13, %p428, %p436; + and.pred %p15, %p427, %p435; + and.pred %p17, %p426, %p434; + and.pred %p19, %p425, %p433; +$L__tmp9: + .loc 1 306 41 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:306:41 + add.s64 %rd520, %rd169, %rd548; + .loc 1 307 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:307:34 + // begin inline asm + mov.u32 %r7093, 0x0; + @%p420 ld.global.b32 { %r7093 }, [ %rd520 + 0 ]; + // end inline asm + .loc 1 307 47 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:307:47 + shl.b32 %r688, %r7093, 7; + .loc 1 308 64 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:308:64 + add.s64 %rd521, %rd168, %rd549; + .loc 1 308 46 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:308:46 + // begin inline asm + mov.u32 %r7094, 0x0; + @%p420 ld.global.b32 { %r7094 }, [ %rd521 + 0 ]; + // end inline asm + .loc 1 310 36 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:310:36 + or.b32 %r7159, %r688, %r676; + or.b32 %r7160, %r688, %r7122; + or.b32 %r7161, %r688, %r7123; + or.b32 %r7162, %r688, %r7124; + or.b32 %r7163, %r688, %r7125; + or.b32 %r7164, %r688, %r7126; + or.b32 %r7165, %r688, %r7127; + or.b32 %r7166, %r688, %r7128; + or.b32 %r690, %r688, %r7129; + or.b32 %r7167, %r688, %r10; + or.b32 %r7168, %r688, %r11; + or.b32 %r7169, %r688, %r12; + or.b32 %r7170, %r688, %r13; +$L__tmp10: + .loc 1 601 37 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:37 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r7171, %r7167, 12; + shl.b32 %r7172, %r7168, 12; + shl.b32 %r7173, %r7169, 12; + shl.b32 %r7174, %r7170, 12; + .loc 1 602 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r7175, %r7167, 7; + shl.b32 %r7176, %r7168, 7; + shl.b32 %r7177, %r7169, 7; + shl.b32 %r7178, %r7170, 7; + .loc 1 608 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:608:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r691, %r7094, 1; + .loc 1 608 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:608:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + min.s32 %r7179, %r691, 32; +$L__tmp11: + .loc 1 676 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:676:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + shl.b32 %r692, %r3, 16; + cvt.s64.s32 %rd81, %r7134; + cvt.s64.s32 %rd82, %r7135; + cvt.s64.s32 %rd83, %r7136; + cvt.s64.s32 %rd84, %r7137; + cvt.s64.s32 %rd85, %r7138; + cvt.s64.s32 %rd86, %r7139; + cvt.s64.s32 %rd87, %r7140; + cvt.s64.s32 %rd88, %r7141; + setp.lt.s32 %p441, %r7130, 2048; + setp.lt.s32 %p442, %r7131, 2048; + setp.lt.s32 %p443, %r7132, 2048; + setp.lt.s32 %p444, %r7133, 2048; + shl.b32 %r7180, %r7112, 10; + or.b32 %r693, %r7115, %r7180; + add.s32 %r10364, %r7117, %r693; + selp.b32 %r7181, 16, 0, %p441; + selp.b32 %r7266, %r7181, 0, %p422; + add.s32 %r10366, %r10364, 2048; + selp.b32 %r7182, 16, 0, %p442; + selp.b32 %r7268, %r7182, 0, %p422; + add.s32 %r10368, %r10364, 4096; + selp.b32 %r7183, 16, 0, %p443; + selp.b32 %r7270, %r7183, 0, %p422; + add.s32 %r10370, %r10364, 6144; + selp.b32 %r7184, 16, 0, %p444; + selp.b32 %r7272, %r7184, 0, %p422; + setp.lt.s32 %p445, %r677, 2048; + setp.lt.s32 %p446, %r678, 2048; + setp.lt.s32 %p447, %r679, 2048; + setp.lt.s32 %p448, %r680, 2048; + setp.lt.s32 %p449, %r681, 2048; + setp.lt.s32 %p450, %r682, 2048; + setp.lt.s32 %p451, %r683, 2048; + setp.lt.s32 %p452, %r684, 2048; + cvt.s64.s32 %rd89, %r677; + and.b32 %r702, %r7, 252; + shl.b32 %r703, %r675, 3; + add.s32 %r7185, %r7117, %r703; + add.s32 %r10372, %r7185, 106496; + selp.b32 %r7186, 8, 0, %p445; + selp.b32 %r7274, %r7186, 0, %p422; + add.s32 %r10374, %r7185, 106528; + selp.b32 %r7187, 8, 0, %p446; + selp.b32 %r7276, %r7187, 0, %p422; + add.s32 %r10376, %r7185, 106560; + selp.b32 %r7188, 8, 0, %p447; + selp.b32 %r7278, %r7188, 0, %p422; + add.s32 %r10378, %r7185, 106592; + selp.b32 %r7189, 8, 0, %p448; + selp.b32 %r7280, %r7189, 0, %p422; + add.s32 %r10380, %r7185, 106624; + selp.b32 %r7190, 8, 0, %p449; + selp.b32 %r7282, %r7190, 0, %p422; + add.s32 %r10382, %r7185, 106656; + selp.b32 %r7191, 8, 0, %p450; + selp.b32 %r7284, %r7191, 0, %p422; + add.s32 %r10384, %r7185, 106688; + selp.b32 %r7192, 8, 0, %p451; + selp.b32 %r7286, %r7192, 0, %p422; + add.s32 %r10386, %r7185, 106720; + selp.b32 %r7193, 8, 0, %p452; + selp.b32 %r7288, %r7193, 0, %p422; + add.s32 %r10388, %r10364, 49152; + add.s32 %r10390, %r10364, 51200; + add.s32 %r10392, %r10364, 53248; + add.s32 %r10394, %r10364, 55296; + add.s32 %r10396, %r7185, 107008; + add.s32 %r10398, %r7185, 107040; + add.s32 %r10400, %r7185, 107072; + add.s32 %r10402, %r7185, 107104; + add.s32 %r10404, %r7185, 107136; + add.s32 %r10406, %r7185, 107168; + add.s32 %r10408, %r7185, 107200; + add.s32 %r10410, %r7185, 107232; + setp.gt.s32 %p453, %r687, 1; + or.b32 %r732, %r677, 64; + or.b32 %r733, %r678, 64; + or.b32 %r734, %r679, 64; + or.b32 %r735, %r680, 64; + or.b32 %r736, %r681, 64; + or.b32 %r737, %r682, 64; + or.b32 %r738, %r683, 64; + or.b32 %r739, %r684, 64; + or.b32 %r740, %r7130, 64; + or.b32 %r741, %r7131, 64; + or.b32 %r742, %r7132, 64; + or.b32 %r743, %r7133, 64; + setp.lt.s32 %p454, %r740, 2048; + setp.lt.s32 %p455, %r741, 2048; + setp.lt.s32 %p456, %r742, 2048; + setp.lt.s32 %p457, %r743, 2048; + add.s32 %r10412, %r10364, 16384; + selp.b32 %r7194, 16, 0, %p454; + selp.b32 %r7314, %r7194, 0, %p453; + add.s32 %r10414, %r10364, 18432; + selp.b32 %r7195, 16, 0, %p455; + selp.b32 %r7316, %r7195, 0, %p453; + add.s32 %r10416, %r10364, 20480; + selp.b32 %r7196, 16, 0, %p456; + selp.b32 %r7318, %r7196, 0, %p453; + add.s32 %r10418, %r10364, 22528; + selp.b32 %r7197, 16, 0, %p457; + selp.b32 %r7320, %r7197, 0, %p453; + setp.lt.s32 %p458, %r732, 2048; + setp.lt.s32 %p459, %r733, 2048; + setp.lt.s32 %p460, %r734, 2048; + setp.lt.s32 %p461, %r735, 2048; + setp.lt.s32 %p462, %r736, 2048; + setp.lt.s32 %p463, %r737, 2048; + setp.lt.s32 %p464, %r738, 2048; + setp.lt.s32 %p465, %r739, 2048; + add.s32 %r10420, %r7185, 106752; + selp.b32 %r7198, 8, 0, %p458; + selp.b32 %r7322, %r7198, 0, %p453; + add.s32 %r10422, %r7185, 106784; + selp.b32 %r7199, 8, 0, %p459; + selp.b32 %r7324, %r7199, 0, %p453; + add.s32 %r10424, %r7185, 106816; + selp.b32 %r7200, 8, 0, %p460; + selp.b32 %r7326, %r7200, 0, %p453; + add.s32 %r10426, %r7185, 106848; + selp.b32 %r7201, 8, 0, %p461; + selp.b32 %r7328, %r7201, 0, %p453; + add.s32 %r10428, %r7185, 106880; + selp.b32 %r7202, 8, 0, %p462; + selp.b32 %r7330, %r7202, 0, %p453; + add.s32 %r10430, %r7185, 106912; + selp.b32 %r7203, 8, 0, %p463; + selp.b32 %r7332, %r7203, 0, %p453; + add.s32 %r10432, %r7185, 106944; + selp.b32 %r7204, 8, 0, %p464; + selp.b32 %r7334, %r7204, 0, %p453; + add.s32 %r10434, %r7185, 106976; + selp.b32 %r7205, 8, 0, %p465; + selp.b32 %r7336, %r7205, 0, %p453; + add.s32 %r10436, %r10364, 65536; + add.s32 %r10438, %r10364, 67584; + add.s32 %r10440, %r10364, 69632; + add.s32 %r10442, %r10364, 71680; + add.s32 %r10444, %r7185, 107264; + add.s32 %r10446, %r7185, 107296; + add.s32 %r10448, %r7185, 107328; + add.s32 %r10450, %r7185, 107360; + add.s32 %r10452, %r7185, 107392; + add.s32 %r10454, %r7185, 107424; + add.s32 %r10456, %r7185, 107456; + add.s32 %r10458, %r7185, 107488; + add.s32 %r780, %r7142, -2; + add.s32 %r781, %r7142, -1; + shl.b32 %r7206, %r7, 1; + and.b32 %r7207, %r7206, 444; + and.b32 %r7208, %r7, 1; + neg.s32 %r7209, %r7208; + shl.b32 %r7210, %r7208, 6; + shr.u32 %r7211, %r7, 4; + and.b32 %r7212, %r7211, 2; + or.b32 %r7213, %r7207, %r7210; + or.b32 %r7214, %r7213, %r7212; + add.s32 %r7215, %r7117, 98304; + add.s32 %r782, %r7215, %r7214; + xor.b32 %r7216, %r7214, 16; + add.s32 %r783, %r7215, %r7216; + xor.b32 %r7217, %r7214, 4; + add.s32 %r784, %r7215, %r7217; + xor.b32 %r7218, %r7214, 20; + add.s32 %r785, %r7215, %r7218; + xor.b32 %r7219, %r7214, 8; + add.s32 %r786, %r7215, %r7219; + xor.b32 %r7220, %r7214, 24; + add.s32 %r787, %r7215, %r7220; + xor.b32 %r7221, %r7214, 12; + add.s32 %r788, %r7215, %r7221; + xor.b32 %r7222, %r7214, 28; + add.s32 %r789, %r7215, %r7222; + and.b32 %r7223, %r7, 12; + shl.b32 %r7224, %r7223, 8; + shl.b32 %r7225, %r18, 2; + and.b32 %r7226, %r7209, 4112; + shl.b32 %r790, %r7, 5; + and.b32 %r7227, %r790, 64; + and.b32 %r7228, %r7206, 32; + or.b32 %r7229, %r7224, %r7225; + or.b32 %r7230, %r7227, %r7226; + or.b32 %r7231, %r7230, %r7229; + or.b32 %r7232, %r7231, %r7228; + or.b32 %r7233, %r7232, %r7223; + add.s32 %r791, %r7215, %r7233; + xor.b32 %r7234, %r7233, 4; + add.s32 %r792, %r7215, %r7234; + xor.b32 %r7235, %r7233, 8; + add.s32 %r793, %r7215, %r7235; + xor.b32 %r7236, %r7233, 12; + add.s32 %r794, %r7215, %r7236; + xor.b32 %r7237, %r7233, 16; + add.s32 %r795, %r7215, %r7237; + xor.b32 %r7238, %r7233, 20; + add.s32 %r796, %r7215, %r7238; + xor.b32 %r7239, %r7233, 24; + add.s32 %r797, %r7215, %r7239; + xor.b32 %r7240, %r7233, 28; + add.s32 %r798, %r7215, %r7240; + cvt.s64.s32 %rd90, %r7171; + cvt.s64.s32 %rd91, %r7172; + cvt.s64.s32 %rd92, %r7173; + cvt.s64.s32 %rd93, %r7174; + cvt.s64.s32 %rd94, %r7175; + cvt.s64.s32 %rd95, %r7176; + cvt.s64.s32 %rd96, %r7177; + cvt.s64.s32 %rd97, %r7178; + setp.gt.s32 %p466, %r691, 0; + setp.lt.s32 %p467, %r7167, 2048; + setp.lt.s32 %p468, %r7168, 2048; + setp.lt.s32 %p469, %r7169, 2048; + setp.lt.s32 %p470, %r7170, 2048; + selp.b32 %r7241, 16, 0, %p467; + selp.b32 %r10365, %r7241, 0, %p466; + selp.b32 %r7242, 16, 0, %p468; + selp.b32 %r10367, %r7242, 0, %p466; + selp.b32 %r7243, 16, 0, %p469; + selp.b32 %r10369, %r7243, 0, %p466; + selp.b32 %r7244, 16, 0, %p470; + selp.b32 %r10371, %r7244, 0, %p466; + setp.lt.s32 %p471, %r7159, 2048; + setp.lt.s32 %p472, %r7160, 2048; + setp.lt.s32 %p473, %r7161, 2048; + setp.lt.s32 %p474, %r7162, 2048; + setp.lt.s32 %p475, %r7163, 2048; + setp.lt.s32 %p476, %r7164, 2048; + setp.lt.s32 %p477, %r7165, 2048; + setp.lt.s32 %p478, %r7166, 2048; + cvt.s64.s32 %rd98, %r7159; + selp.b32 %r7245, 8, 0, %p471; + selp.b32 %r10373, %r7245, 0, %p466; + selp.b32 %r7246, 8, 0, %p472; + selp.b32 %r10375, %r7246, 0, %p466; + selp.b32 %r7247, 8, 0, %p473; + selp.b32 %r10377, %r7247, 0, %p466; + selp.b32 %r7248, 8, 0, %p474; + selp.b32 %r10379, %r7248, 0, %p466; + selp.b32 %r7249, 8, 0, %p475; + selp.b32 %r10381, %r7249, 0, %p466; + selp.b32 %r7250, 8, 0, %p476; + selp.b32 %r10383, %r7250, 0, %p466; + selp.b32 %r7251, 8, 0, %p477; + selp.b32 %r10385, %r7251, 0, %p466; + selp.b32 %r7252, 8, 0, %p478; + selp.b32 %r10387, %r7252, 0, %p466; + setp.gt.s32 %p479, %r691, 1; + or.b32 %r811, %r7159, 64; + or.b32 %r812, %r7160, 64; + or.b32 %r813, %r7161, 64; + or.b32 %r814, %r7162, 64; + or.b32 %r815, %r7163, 64; + or.b32 %r816, %r7164, 64; + or.b32 %r817, %r7165, 64; + or.b32 %r818, %r7166, 64; + or.b32 %r819, %r7167, 64; + or.b32 %r820, %r7168, 64; + or.b32 %r821, %r7169, 64; + or.b32 %r822, %r7170, 64; + setp.lt.s32 %p480, %r819, 2048; + setp.lt.s32 %p481, %r820, 2048; + setp.lt.s32 %p482, %r821, 2048; + setp.lt.s32 %p483, %r822, 2048; + selp.b32 %r7253, 16, 0, %p480; + selp.b32 %r10413, %r7253, 0, %p479; + selp.b32 %r7254, 16, 0, %p481; + selp.b32 %r10415, %r7254, 0, %p479; + selp.b32 %r7255, 16, 0, %p482; + selp.b32 %r10417, %r7255, 0, %p479; + selp.b32 %r7256, 16, 0, %p483; + selp.b32 %r10419, %r7256, 0, %p479; + setp.lt.s32 %p484, %r811, 2048; + setp.lt.s32 %p485, %r812, 2048; + setp.lt.s32 %p486, %r813, 2048; + setp.lt.s32 %p487, %r814, 2048; + setp.lt.s32 %p488, %r815, 2048; + setp.lt.s32 %p489, %r816, 2048; + setp.lt.s32 %p490, %r817, 2048; + setp.lt.s32 %p491, %r818, 2048; + selp.b32 %r7257, 8, 0, %p484; + selp.b32 %r10421, %r7257, 0, %p479; + selp.b32 %r7258, 8, 0, %p485; + selp.b32 %r10423, %r7258, 0, %p479; + selp.b32 %r7259, 8, 0, %p486; + selp.b32 %r10425, %r7259, 0, %p479; + selp.b32 %r7260, 8, 0, %p487; + selp.b32 %r10427, %r7260, 0, %p479; + selp.b32 %r7261, 8, 0, %p488; + selp.b32 %r10429, %r7261, 0, %p479; + selp.b32 %r7262, 8, 0, %p489; + selp.b32 %r10431, %r7262, 0, %p479; + selp.b32 %r7263, 8, 0, %p490; + selp.b32 %r10433, %r7263, 0, %p479; + selp.b32 %r7264, 8, 0, %p491; + selp.b32 %r10435, %r7264, 0, %p479; + add.s32 %r835, %r7179, -2; + add.s32 %r836, %r7179, -1; +$L__tmp12: + .loc 1 262 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:262:30 + max.s32 %r837, %r7142, 1; + max.s32 %r838, %r7179, 1; + mul.wide.u32 %rd99, %r4, 4; + mov.b32 %r14207, 0f00000000; + mov.b64 %rd1005, 0; + shl.b64 %rd608, %rd81, 1; + shl.b64 %rd610, %rd82, 1; + shl.b64 %rd612, %rd83, 1; + shl.b64 %rd614, %rd84, 1; + shl.b64 %rd617, %rd85, 1; + shl.b64 %rd619, %rd86, 1; + shl.b64 %rd621, %rd87, 1; + shl.b64 %rd623, %rd88, 1; + shl.b64 %rd625, %rd89, 2; + shl.b64 %rd804, %rd90, 1; + shl.b64 %rd806, %rd91, 1; + shl.b64 %rd808, %rd92, 1; + shl.b64 %rd810, %rd93, 1; + shl.b64 %rd813, %rd94, 1; + shl.b64 %rd815, %rd95, 1; + shl.b64 %rd817, %rd96, 1; + shl.b64 %rd819, %rd97, 1; + shl.b64 %rd821, %rd98, 2; + mov.b32 %r14208, %r14207; + mov.b32 %r14209, %r14207; + mov.b32 %r14210, %r14207; + mov.b32 %r14211, %r14207; + mov.b32 %r14212, %r14207; + mov.b32 %r14213, %r14207; + mov.b32 %r14214, %r14207; + mov.b32 %r14215, %r14207; + mov.b32 %r14216, %r14207; + mov.b32 %r14217, %r14207; + mov.b32 %r14218, %r14207; + mov.b32 %r14219, %r14207; + mov.b32 %r14220, %r14207; + mov.b32 %r14221, %r14207; + mov.b32 %r14222, %r14207; + mov.b32 %r14223, %r14207; + mov.b32 %r14224, %r14207; + mov.b32 %r14225, %r14207; + mov.b32 %r14226, %r14207; + mov.b32 %r14227, %r14207; + mov.b32 %r14228, %r14207; + mov.b32 %r14229, %r14207; + mov.b32 %r14230, %r14207; + mov.b32 %r14231, %r14207; + mov.b32 %r14232, %r14207; + mov.b32 %r14233, %r14207; + mov.b32 %r14234, %r14207; + mov.b32 %r14235, %r14207; + mov.b32 %r14236, %r14207; + mov.b32 %r14237, %r14207; + mov.b32 %r14238, %r14207; + mov.b32 %r14239, %r14207; + mov.b32 %r14240, %r14207; + mov.b32 %r14241, %r14207; + mov.b32 %r14242, %r14207; + mov.b32 %r14243, %r14207; + mov.b32 %r14244, %r14207; + mov.b32 %r14245, %r14207; + mov.b32 %r14246, %r14207; + mov.b32 %r14247, %r14207; + mov.b32 %r14248, %r14207; + mov.b32 %r14249, %r14207; + mov.b32 %r14250, %r14207; + mov.b32 %r14251, %r14207; + mov.b32 %r14252, %r14207; + mov.b32 %r14253, %r14207; + mov.b32 %r14254, %r14207; + mov.b32 %r14255, %r14207; + mov.b32 %r14256, %r14207; + mov.b32 %r14257, %r14207; + mov.b32 %r14258, %r14207; + mov.b32 %r14259, %r14207; + mov.b32 %r14260, %r14207; + mov.b32 %r14261, %r14207; + mov.b32 %r14262, %r14207; + mov.b32 %r14263, %r14207; + mov.b32 %r14264, %r14207; + mov.b32 %r14265, %r14207; + mov.b32 %r14266, %r14207; + mov.b32 %r14267, %r14207; + mov.b32 %r14268, %r14207; + mov.b32 %r14269, %r14207; + mov.b32 %r14270, %r14207; + mov.b32 %r14143, %r14207; + mov.b32 %r14144, %r14207; + mov.b32 %r14145, %r14207; + mov.b32 %r14146, %r14207; + mov.b32 %r14147, %r14207; + mov.b32 %r14148, %r14207; + mov.b32 %r14149, %r14207; + mov.b32 %r14150, %r14207; + mov.b32 %r14151, %r14207; + mov.b32 %r14152, %r14207; + mov.b32 %r14153, %r14207; + mov.b32 %r14154, %r14207; + mov.b32 %r14155, %r14207; + mov.b32 %r14156, %r14207; + mov.b32 %r14157, %r14207; + mov.b32 %r14158, %r14207; + mov.b32 %r14159, %r14207; + mov.b32 %r14160, %r14207; + mov.b32 %r14161, %r14207; + mov.b32 %r14162, %r14207; + mov.b32 %r14163, %r14207; + mov.b32 %r14164, %r14207; + mov.b32 %r14165, %r14207; + mov.b32 %r14166, %r14207; + mov.b32 %r14167, %r14207; + mov.b32 %r14168, %r14207; + mov.b32 %r14169, %r14207; + mov.b32 %r14170, %r14207; + mov.b32 %r14171, %r14207; + mov.b32 %r14172, %r14207; + mov.b32 %r14173, %r14207; + mov.b32 %r14174, %r14207; + mov.b32 %r14175, %r14207; + mov.b32 %r14176, %r14207; + mov.b32 %r14177, %r14207; + mov.b32 %r14178, %r14207; + mov.b32 %r14179, %r14207; + mov.b32 %r14180, %r14207; + mov.b32 %r14181, %r14207; + mov.b32 %r14182, %r14207; + mov.b32 %r14183, %r14207; + mov.b32 %r14184, %r14207; + mov.b32 %r14185, %r14207; + mov.b32 %r14186, %r14207; + mov.b32 %r14187, %r14207; + mov.b32 %r14188, %r14207; + mov.b32 %r14189, %r14207; + mov.b32 %r14190, %r14207; + mov.b32 %r14191, %r14207; + mov.b32 %r14192, %r14207; + mov.b32 %r14193, %r14207; + mov.b32 %r14194, %r14207; + mov.b32 %r14195, %r14207; + mov.b32 %r14196, %r14207; + mov.b32 %r14197, %r14207; + mov.b32 %r14198, %r14207; + mov.b32 %r14199, %r14207; + mov.b32 %r14200, %r14207; + mov.b32 %r14201, %r14207; + mov.b32 %r14202, %r14207; + mov.b32 %r14203, %r14207; + mov.b32 %r14204, %r14207; + mov.b32 %r14205, %r14207; + mov.b32 %r14206, %r14207; + bra.uni $L__BB0_9; +$L__BB0_15: // %._crit_edge1820 + // in Loop: Header=BB0_9 Depth=1 +$L__tmp13: + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + // wait for regs: %r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206,%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp14: + .loc 1 262 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:262:30 + add.s64 %rd1005, %rd1005, 1; + setp.ne.b64 %p985, %rd1005, 4; + @%p985 bra $L__BB0_9; + bra.uni $L__BB0_16; +$L__BB0_9: // =>This Loop Header: Depth=1 + // Child Loop BB0_11 Depth 2 + // Child Loop BB0_14 Depth 2 + .loc 1 0 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:30 + setp.eq.b32 %p840, %r702, 0; +$L__tmp15: + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p524, %r687, 1; +$L__tmp16: + .loc 1 263 51 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:263:51 + add.s64 %rd606, %rd1005, %rd99; + .loc 1 266 44 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:266:44 + cvt.u32.u64 %r7361, %rd606; + shl.b32 %r7362, %r7361, 7; + add.s32 %r7363, %r7362, %r672; + .loc 1 267 36 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:267:36 + shl.b32 %r7364, %r7361, 18; + .loc 1 267 46 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:267:46 + add.s32 %r7365, %r7364, %r672; + .loc 1 269 50 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:269:50 + shl.b32 %r7366, %r7361, 11; + add.s32 %r7367, %r7366, %r692; + .loc 1 271 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:271:21 + mad.wide.s32 %rd103, %r7363, 2, %rd157; + .loc 1 272 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:272:23 + mad.wide.s32 %rd104, %r7365, 2, %rd160; + .loc 1 275 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:275:25 + mul.wide.s32 %rd607, %r7367, 4; + add.s64 %rd105, %rd158, %rd607; + .loc 1 276 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:276:29 + add.s64 %rd106, %rd159, %rd607; +$L__tmp17: + .loc 1 601 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd609, %rd103, %rd608; + add.s64 %rd611, %rd103, %rd610; + add.s64 %rd613, %rd103, %rd612; + add.s64 %rd615, %rd103, %rd614; + .loc 1 601 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b64 %rd616, %rd78, 1; + add.s64 %rd558, %rd609, %rd616; + add.s64 %rd559, %rd611, %rd616; + add.s64 %rd560, %rd613, %rd616; + add.s64 %rd561, %rd615, %rd616; + .loc 1 602 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd618, %rd104, %rd617; + add.s64 %rd620, %rd104, %rd619; + add.s64 %rd622, %rd104, %rd621; + add.s64 %rd624, %rd104, %rd623; + .loc 1 602 51 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:51 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd570, %rd618, %rd616; + add.s64 %rd571, %rd620, %rd616; + add.s64 %rd572, %rd622, %rd616; + add.s64 %rd573, %rd624, %rd616; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r10364 + 0 ], [ %rd558 + 0 ], 0x10, %r7266; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10366 + 0 ], [ %rd559 + 0 ], 0x10, %r7268; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10368 + 0 ], [ %rd560 + 0 ], 0x10, %r7270; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10370 + 0 ], [ %rd561 + 0 ], 0x10, %r7272; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd562, %rd105, %rd625; + cvt.s64.s32 %rd626, %r673; + cvt.u64.u32 %rd107, %r676; + add.s64 %rd627, %rd626, %rd107; + shl.b64 %rd628, %rd627, 2; + add.s64 %rd629, %rd105, %rd628; + add.s64 %rd563, %rd629, 32; + add.s64 %rd564, %rd629, 64; + add.s64 %rd565, %rd629, 96; + add.s64 %rd566, %rd629, 128; + add.s64 %rd567, %rd629, 160; + add.s64 %rd568, %rd629, 192; + add.s64 %rd569, %rd629, 224; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10372 + 0 ], [ %rd562 + 0 ], 0x8, %r7274; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10374 + 0 ], [ %rd563 + 0 ], 0x8, %r7276; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10376 + 0 ], [ %rd564 + 0 ], 0x8, %r7278; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10378 + 0 ], [ %rd565 + 0 ], 0x8, %r7280; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10380 + 0 ], [ %rd566 + 0 ], 0x8, %r7282; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10382 + 0 ], [ %rd567 + 0 ], 0x8, %r7284; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10384 + 0 ], [ %rd568 + 0 ], 0x8, %r7286; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10386 + 0 ], [ %rd569 + 0 ], 0x8, %r7288; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r10388 + 0 ], [ %rd570 + 0 ], 0x10, %r7266; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10390 + 0 ], [ %rd571 + 0 ], 0x10, %r7268; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10392 + 0 ], [ %rd572 + 0 ], 0x10, %r7270; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10394 + 0 ], [ %rd573 + 0 ], 0x10, %r7272; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd574, %rd106, %rd625; + add.s64 %rd630, %rd106, %rd628; + add.s64 %rd575, %rd630, 32; + add.s64 %rd576, %rd630, 64; + add.s64 %rd577, %rd630, 96; + add.s64 %rd578, %rd630, 128; + add.s64 %rd579, %rd630, 160; + add.s64 %rd580, %rd630, 192; + add.s64 %rd581, %rd630, 224; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10396 + 0 ], [ %rd574 + 0 ], 0x8, %r7274; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10398 + 0 ], [ %rd575 + 0 ], 0x8, %r7276; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10400 + 0 ], [ %rd576 + 0 ], 0x8, %r7278; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10402 + 0 ], [ %rd577 + 0 ], 0x8, %r7280; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10404 + 0 ], [ %rd578 + 0 ], 0x8, %r7282; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10406 + 0 ], [ %rd579 + 0 ], 0x8, %r7284; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10408 + 0 ], [ %rd580 + 0 ], 0x8, %r7286; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10410 + 0 ], [ %rd581 + 0 ], 0x8, %r7288; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd1013, %rd558, 524288; + add.s64 %rd1012, %rd559, 524288; + add.s64 %rd1011, %rd560, 524288; + add.s64 %rd1010, %rd561, 524288; + .loc 1 627 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd1009, %rd570, 16384; + add.s64 %rd1008, %rd571, 16384; + add.s64 %rd1007, %rd572, 16384; + add.s64 %rd1006, %rd573, 16384; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r10412 + 0 ], [ %rd1013 + 0 ], 0x10, %r7314; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10414 + 0 ], [ %rd1012 + 0 ], 0x10, %r7316; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10416 + 0 ], [ %rd1011 + 0 ], 0x10, %r7318; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10418 + 0 ], [ %rd1010 + 0 ], 0x10, %r7320; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd586, %rd562, 256; + add.s64 %rd631, %rd628, 256; + add.s64 %rd632, %rd105, %rd631; + add.s64 %rd587, %rd632, 32; + add.s64 %rd588, %rd632, 64; + add.s64 %rd589, %rd632, 96; + add.s64 %rd590, %rd632, 128; + add.s64 %rd591, %rd632, 160; + add.s64 %rd592, %rd632, 192; + add.s64 %rd593, %rd632, 224; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10420 + 0 ], [ %rd586 + 0 ], 0x8, %r7322; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10422 + 0 ], [ %rd587 + 0 ], 0x8, %r7324; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10424 + 0 ], [ %rd588 + 0 ], 0x8, %r7326; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10426 + 0 ], [ %rd589 + 0 ], 0x8, %r7328; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10428 + 0 ], [ %rd590 + 0 ], 0x8, %r7330; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10430 + 0 ], [ %rd591 + 0 ], 0x8, %r7332; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10432 + 0 ], [ %rd592 + 0 ], 0x8, %r7334; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10434 + 0 ], [ %rd593 + 0 ], 0x8, %r7336; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + cp.async.cg.shared.global [ %r10436 + 0 ], [ %rd1009 + 0 ], 0x10, %r7314; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10438 + 0 ], [ %rd1008 + 0 ], 0x10, %r7316; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10440 + 0 ], [ %rd1007 + 0 ], 0x10, %r7318; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10442 + 0 ], [ %rd1006 + 0 ], 0x10, %r7320; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd598, %rd574, 256; + add.s64 %rd633, %rd106, %rd631; + add.s64 %rd599, %rd633, 32; + add.s64 %rd600, %rd633, 64; + add.s64 %rd601, %rd633, 96; + add.s64 %rd602, %rd633, 128; + add.s64 %rd603, %rd633, 160; + add.s64 %rd604, %rd633, 192; + add.s64 %rd605, %rd633, 224; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10444 + 0 ], [ %rd598 + 0 ], 0x8, %r7322; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10446 + 0 ], [ %rd599 + 0 ], 0x8, %r7324; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10448 + 0 ], [ %rd600 + 0 ], 0x8, %r7326; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10450 + 0 ], [ %rd601 + 0 ], 0x8, %r7328; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10452 + 0 ], [ %rd602 + 0 ], 0x8, %r7330; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10454 + 0 ], [ %rd603 + 0 ], 0x8, %r7332; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10456 + 0 ], [ %rd604 + 0 ], 0x8, %r7334; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10458 + 0 ], [ %rd605 + 0 ], 0x8, %r7336; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + @%p524 bra $L__BB0_12; +// %bb.10: // %.lr.ph1669.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:28 + mov.b32 %r7923, 0; + mov.b32 %r14124, 1; + mov.b32 %r14123, -1; + mov.b32 %r14122, 64; + mov.b32 %r14114, %r684; + mov.b32 %r14115, %r683; + mov.b32 %r14116, %r682; + mov.b32 %r14117, %r681; + mov.b32 %r14118, %r680; + mov.b32 %r14119, %r679; + mov.b32 %r14120, %r678; + mov.b32 %r14121, %r677; + mov.b32 %r14125, %r14123; + mov.b32 %r14126, %r14124; + mov.b32 %r14127, %r739; + mov.b32 %r14128, %r738; + mov.b32 %r14129, %r737; + mov.b32 %r14130, %r736; + mov.b32 %r14131, %r735; + mov.b32 %r14132, %r734; + mov.b32 %r14133, %r733; + mov.b32 %r14134, %r732; + mov.b32 %r14135, %r740; + mov.b32 %r14136, %r741; + mov.b32 %r14137, %r742; + mov.b32 %r14138, %r743; + mov.b32 %r14139, %r740; + mov.b32 %r14140, %r741; + mov.b32 %r14141, %r742; + mov.b32 %r14142, %r743; + mov.b32 %r14271, %r7923; + mov.b32 %r14272, %r839; + mov.b32 %r14273, %r840; + mov.b32 %r14274, %r685; + mov.b32 %r14275, %r686; +$L__BB0_11: // %.lr.ph1669 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p565, %r14271, %r780; + setp.lt.s32 %p547, %r14271, %r781; + add.s32 %r9590, %r14123, 1; + setp.gt.s32 %p566, %r9590, 1; + selp.b32 %r14123, 0, %r9590, %p566; + add.s32 %r9591, %r14125, 1; + setp.gt.s32 %p567, %r9591, 2; + selp.b32 %r14125, 0, %r9591, %p567; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p568, %r14121, 2048; + setp.lt.s32 %p569, %r14120, 2048; + setp.lt.s32 %p570, %r14119, 2048; + setp.lt.s32 %p571, %r14118, 2048; + setp.lt.s32 %p572, %r14117, 2048; + setp.lt.s32 %p573, %r14116, 2048; + setp.lt.s32 %p574, %r14115, 2048; + setp.lt.s32 %p575, %r14114, 2048; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r9592, %r14125, 14; + add.s32 %r7925, %r7117, %r9592; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r9594, %r14123, 8; + add.s32 %r9595, %r7117, 106496; + add.s32 %r9596, %r9595, %r9594; + add.s32 %r9597, %r9596, %r703; + ld.shared.v2.b32 {%r9598, %r9599}, [%r9597]; + ld.shared.v2.b32 {%r9600, %r9601}, [%r9597+32]; + ld.shared.v2.b32 {%r9602, %r9603}, [%r9597+64]; + ld.shared.v2.b32 {%r9604, %r9605}, [%r9597+96]; + ld.shared.v2.b32 {%r9606, %r9607}, [%r9597+128]; + ld.shared.v2.b32 {%r9608, %r9609}, [%r9597+160]; + ld.shared.v2.b32 {%r9610, %r9611}, [%r9597+192]; + ld.shared.v2.b32 {%r9612, %r9613}, [%r9597+224]; + .loc 1 675 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:675:26 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.f32 %p576, %r9598, 0fFF800000; + setp.eq.f32 %p577, %r9599, 0fFF800000; + setp.eq.f32 %p578, %r9600, 0fFF800000; + setp.eq.f32 %p579, %r9601, 0fFF800000; + setp.eq.f32 %p580, %r9602, 0fFF800000; + setp.eq.f32 %p581, %r9603, 0fFF800000; + setp.eq.f32 %p582, %r9604, 0fFF800000; + setp.eq.f32 %p583, %r9605, 0fFF800000; + setp.eq.f32 %p584, %r9606, 0fFF800000; + setp.eq.f32 %p585, %r9607, 0fFF800000; + setp.eq.f32 %p586, %r9608, 0fFF800000; + setp.eq.f32 %p587, %r9609, 0fFF800000; + setp.eq.f32 %p588, %r9610, 0fFF800000; + setp.eq.f32 %p589, %r9611, 0fFF800000; + setp.eq.f32 %p590, %r9612, 0fFF800000; + setp.eq.f32 %p591, %r9613, 0fFF800000; + .loc 1 675 46 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:675:46 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9614, 0f00000000, %r9598, %p576; + selp.f32 %r9615, 0f00000000, %r9599, %p577; + selp.f32 %r9616, 0f00000000, %r9600, %p578; + selp.f32 %r9617, 0f00000000, %r9601, %p579; + selp.f32 %r9618, 0f00000000, %r9602, %p580; + selp.f32 %r9619, 0f00000000, %r9603, %p581; + selp.f32 %r9620, 0f00000000, %r9604, %p582; + selp.f32 %r9621, 0f00000000, %r9605, %p583; + selp.f32 %r9622, 0f00000000, %r9606, %p584; + selp.f32 %r9623, 0f00000000, %r9607, %p585; + selp.f32 %r9624, 0f00000000, %r9608, %p586; + selp.f32 %r9625, 0f00000000, %r9609, %p587; + selp.f32 %r9626, 0f00000000, %r9610, %p588; + selp.f32 %r9627, 0f00000000, %r9611, %p589; + selp.f32 %r9628, 0f00000000, %r9612, %p590; + selp.f32 %r9629, 0f00000000, %r9613, %p591; + .loc 1 676 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:676:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shfl.sync.idx.b32 %r9630, %r8, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r9631, %r9630, 11; + and.b32 %r9632, %r9631, 8192; + add.s32 %r7884, %r7117, 107520; + add.s32 %r9633, %r9632, %r7884; + bfe.u32 %r9634, %r9633, 4, 14; + cvt.u64.u32 %rd704, %r9634; + or.b64 %rd634, %rd704, 4611686293372403712; + bfe.u32 %r9635, %r7925, 4, 14; + cvt.u64.u32 %rd705, %r9635; + or.b64 %rd635, %rd705, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd634, %rd635, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r9636, %r9632, 32; + add.s32 %r9637, %r9636, %r7884; + bfe.u32 %r9638, %r9637, 4, 14; + cvt.u64.u32 %rd706, %r9638; + or.b64 %rd636, %rd706, 4611686293372403712; + add.s32 %r9639, %r7925, 32; + bfe.u32 %r9640, %r9639, 4, 14; + cvt.u64.u32 %rd707, %r9640; + or.b64 %rd637, %rd707, 4611686293338849280; + mov.pred %p525, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd636, %rd637, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9641, %r9632, 64; + add.s32 %r9642, %r9641, %r7884; + bfe.u32 %r9643, %r9642, 4, 14; + cvt.u64.u32 %rd708, %r9643; + or.b64 %rd638, %rd708, 4611686293372403712; + add.s32 %r9644, %r7925, 64; + bfe.u32 %r9645, %r9644, 4, 14; + cvt.u64.u32 %rd709, %r9645; + or.b64 %rd639, %rd709, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd638, %rd639, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9646, %r9632, 96; + add.s32 %r9647, %r9646, %r7884; + bfe.u32 %r9648, %r9647, 4, 14; + cvt.u64.u32 %rd710, %r9648; + or.b64 %rd640, %rd710, 4611686293372403712; + add.s32 %r9649, %r7925, 96; + bfe.u32 %r9650, %r9649, 4, 14; + cvt.u64.u32 %rd711, %r9650; + or.b64 %rd641, %rd711, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd640, %rd641, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9651, %r9632, 16384; + add.s32 %r9652, %r9651, %r7884; + bfe.u32 %r9653, %r9652, 4, 14; + cvt.u64.u32 %rd712, %r9653; + or.b64 %rd642, %rd712, 4611686293372403712; + add.s32 %r9654, %r7925, 8192; + bfe.u32 %r9655, %r9654, 4, 14; + cvt.u64.u32 %rd713, %r9655; + or.b64 %rd643, %rd713, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd642, %rd643, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9656, %r9632, 16416; + add.s32 %r9657, %r9656, %r7884; + bfe.u32 %r9658, %r9657, 4, 14; + cvt.u64.u32 %rd714, %r9658; + or.b64 %rd644, %rd714, 4611686293372403712; + add.s32 %r9659, %r7925, 8224; + bfe.u32 %r9660, %r9659, 4, 14; + cvt.u64.u32 %rd715, %r9660; + or.b64 %rd645, %rd715, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd644, %rd645, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9661, %r9632, 16448; + add.s32 %r9662, %r9661, %r7884; + bfe.u32 %r9663, %r9662, 4, 14; + cvt.u64.u32 %rd716, %r9663; + or.b64 %rd646, %rd716, 4611686293372403712; + add.s32 %r9664, %r7925, 8256; + bfe.u32 %r9665, %r9664, 4, 14; + cvt.u64.u32 %rd717, %r9665; + or.b64 %rd647, %rd717, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd646, %rd647, %p525, 1, 1, 0, 0; + // end inline asm + or.b32 %r9666, %r9632, 16480; + add.s32 %r9667, %r9666, %r7884; + bfe.u32 %r9668, %r9667, 4, 14; + cvt.u64.u32 %rd718, %r9668; + or.b64 %rd648, %rd718, 4611686293372403712; + add.s32 %r9669, %r7925, 8288; + bfe.u32 %r9670, %r9669, 4, 14; + cvt.u64.u32 %rd719, %r9670; + or.b64 %rd649, %rd719, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499}, %rd648, %rd649, %p525, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r7887, %r7925; + mov.b32 %r7885, %r7923; + mov.b32 %r7886, %r7923; + mov.b32 %r7888, %r7923; + mov.b32 %r7889, %r7923; + // begin inline asm + // wait for regs: %r7468,%r7469,%r7470,%r7471,%r7472,%r7473,%r7474,%r7475,%r7476,%r7477,%r7478,%r7479,%r7480,%r7481,%r7482,%r7483,%r7484,%r7485,%r7486,%r7487,%r7488,%r7489,%r7490,%r7491,%r7492,%r7493,%r7494,%r7495,%r7496,%r7497,%r7498,%r7499,%r7884,%r7885,%r7886,%r7887,%r7888,%r7889 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:678:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9671, %r7468, 0f3DB504F3; + mul.f32 %r9672, %r7469, 0f3DB504F3; + mul.f32 %r9673, %r7470, 0f3DB504F3; + mul.f32 %r9674, %r7471, 0f3DB504F3; + mul.f32 %r9675, %r7472, 0f3DB504F3; + mul.f32 %r9676, %r7473, 0f3DB504F3; + mul.f32 %r9677, %r7474, 0f3DB504F3; + mul.f32 %r9678, %r7475, 0f3DB504F3; + mul.f32 %r9679, %r7476, 0f3DB504F3; + mul.f32 %r9680, %r7477, 0f3DB504F3; + mul.f32 %r9681, %r7478, 0f3DB504F3; + mul.f32 %r9682, %r7479, 0f3DB504F3; + mul.f32 %r9683, %r7480, 0f3DB504F3; + mul.f32 %r9684, %r7481, 0f3DB504F3; + mul.f32 %r9685, %r7482, 0f3DB504F3; + mul.f32 %r9686, %r7483, 0f3DB504F3; + mul.f32 %r9687, %r7484, 0f3DB504F3; + mul.f32 %r9688, %r7485, 0f3DB504F3; + mul.f32 %r9689, %r7486, 0f3DB504F3; + mul.f32 %r9690, %r7487, 0f3DB504F3; + mul.f32 %r9691, %r7488, 0f3DB504F3; + mul.f32 %r9692, %r7489, 0f3DB504F3; + mul.f32 %r9693, %r7490, 0f3DB504F3; + mul.f32 %r9694, %r7491, 0f3DB504F3; + mul.f32 %r9695, %r7492, 0f3DB504F3; + mul.f32 %r9696, %r7493, 0f3DB504F3; + mul.f32 %r9697, %r7494, 0f3DB504F3; + mul.f32 %r9698, %r7495, 0f3DB504F3; + mul.f32 %r9699, %r7496, 0f3DB504F3; + mul.f32 %r9700, %r7497, 0f3DB504F3; + mul.f32 %r9701, %r7498, 0f3DB504F3; + mul.f32 %r9702, %r7499, 0f3DB504F3; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shr.s32 %r9703, %r14272, 31; + shr.u32 %r9704, %r9703, 21; + add.s32 %r9705, %r14272, %r9704; + and.b32 %r9706, %r9705, -2048; + sub.s32 %r9707, %r14272, %r9706; + shr.s32 %r9708, %r14273, 31; + shr.u32 %r9709, %r9708, 21; + add.s32 %r9710, %r14273, %r9709; + and.b32 %r9711, %r9710, -2048; + sub.s32 %r9712, %r14273, %r9711; + shr.s32 %r9713, %r14274, 31; + shr.u32 %r9714, %r9713, 21; + add.s32 %r9715, %r14274, %r9714; + and.b32 %r9716, %r9715, -2048; + sub.s32 %r9717, %r14274, %r9716; + shr.s32 %r9718, %r14275, 31; + shr.u32 %r9719, %r9718, 21; + add.s32 %r9720, %r14275, %r9719; + and.b32 %r9721, %r9720, -2048; + sub.s32 %r9722, %r14275, %r9721; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p592, %r842, %r9712; + setp.le.s32 %p593, %r842, %r9707; + .loc 1 703 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:703:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.s64.s32 %rd720, %r9707; + cvt.s64.s32 %rd721, %r9712; + .loc 1 704 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:704:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.gt.s64 %p594, %rd518, %rd721; + setp.gt.s64 %p595, %rd518, %rd720; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p596, %p593, %p595; + and.pred %p597, %p592, %p594; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9723, %r842, %r9712; + sub.s32 %r9724, %r842, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9725, %r9724, 2047; + and.b32 %r9726, %r9723, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p598, %r9726, 0; + setp.eq.b32 %p599, %r9725, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p600, %p5, %p599; + and.pred %p601, %p5, %p598; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p602, %p597, %p601; + or.pred %p603, %p596, %p600; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p604, %r842, %r9722; + setp.le.s32 %p605, %r842, %r9717; + .loc 1 703 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:703:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.s64.s32 %rd722, %r9717; + cvt.s64.s32 %rd723, %r9722; + .loc 1 704 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:704:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.gt.s64 %p606, %rd518, %rd723; + setp.gt.s64 %p607, %rd518, %rd722; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p608, %p605, %p607; + and.pred %p609, %p604, %p606; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9727, %r842, %r9722; + sub.s32 %r9728, %r842, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9729, %r9728, 2047; + and.b32 %r9730, %r9727, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p610, %r9730, 0; + setp.eq.b32 %p611, %r9729, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p612, %p5, %p611; + and.pred %p613, %p5, %p610; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p614, %p609, %p613; + or.pred %p615, %p608, %p612; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p616, %r843, %r9712; + setp.le.s32 %p617, %r843, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9731, %r843, %r9712; + sub.s32 %r9732, %r843, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9733, %r9732, 2047; + and.b32 %r9734, %r9731, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p618, %r9734, 0; + setp.eq.b32 %p619, %r9733, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p620, %p7, %p619; + and.pred %p621, %p7, %p618; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p622, %p617, %p595; + and.pred %p623, %p616, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p624, %p623, %p621; + or.pred %p625, %p622, %p620; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p626, %r843, %r9722; + setp.le.s32 %p627, %r843, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9735, %r843, %r9722; + sub.s32 %r9736, %r843, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9737, %r9736, 2047; + and.b32 %r9738, %r9735, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p628, %r9738, 0; + setp.eq.b32 %p629, %r9737, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p630, %p7, %p629; + and.pred %p631, %p7, %p628; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p632, %p627, %p607; + and.pred %p633, %p626, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p634, %p633, %p631; + or.pred %p635, %p632, %p630; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p636, %r846, %r9712; + setp.le.s32 %p637, %r846, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9739, %r846, %r9712; + sub.s32 %r9740, %r846, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9741, %r9740, 2047; + and.b32 %r9742, %r9739, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p638, %r9742, 0; + setp.eq.b32 %p639, %r9741, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p640, %p9, %p639; + and.pred %p641, %p9, %p638; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p642, %p637, %p595; + and.pred %p643, %p636, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p644, %p643, %p641; + or.pred %p645, %p642, %p640; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p646, %r846, %r9722; + setp.le.s32 %p647, %r846, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9743, %r846, %r9722; + sub.s32 %r9744, %r846, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9745, %r9744, 2047; + and.b32 %r9746, %r9743, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p648, %r9746, 0; + setp.eq.b32 %p649, %r9745, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p650, %p9, %p649; + and.pred %p651, %p9, %p648; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p652, %p647, %p607; + and.pred %p653, %p646, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p654, %p653, %p651; + or.pred %p655, %p652, %p650; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p656, %r847, %r9712; + setp.le.s32 %p657, %r847, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9747, %r847, %r9712; + sub.s32 %r9748, %r847, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9749, %r9748, 2047; + and.b32 %r9750, %r9747, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p658, %r9750, 0; + setp.eq.b32 %p659, %r9749, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p660, %p11, %p659; + and.pred %p661, %p11, %p658; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p662, %p657, %p595; + and.pred %p663, %p656, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p664, %p663, %p661; + or.pred %p665, %p662, %p660; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p666, %r847, %r9722; + setp.le.s32 %p667, %r847, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9751, %r847, %r9722; + sub.s32 %r9752, %r847, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9753, %r9752, 2047; + and.b32 %r9754, %r9751, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p668, %r9754, 0; + setp.eq.b32 %p669, %r9753, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p670, %p11, %p669; + and.pred %p671, %p11, %p668; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p672, %p667, %p607; + and.pred %p673, %p666, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p674, %p673, %p671; + or.pred %p675, %p672, %p670; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p676, %r850, %r9712; + setp.le.s32 %p677, %r850, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9755, %r850, %r9712; + sub.s32 %r9756, %r850, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9757, %r9756, 2047; + and.b32 %r9758, %r9755, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p678, %r9758, 0; + setp.eq.b32 %p679, %r9757, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p680, %p13, %p679; + and.pred %p681, %p13, %p678; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p682, %p677, %p595; + and.pred %p683, %p676, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p684, %p683, %p681; + or.pred %p685, %p682, %p680; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p686, %r850, %r9722; + setp.le.s32 %p687, %r850, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9759, %r850, %r9722; + sub.s32 %r9760, %r850, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9761, %r9760, 2047; + and.b32 %r9762, %r9759, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p688, %r9762, 0; + setp.eq.b32 %p689, %r9761, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p690, %p13, %p689; + and.pred %p691, %p13, %p688; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p692, %p687, %p607; + and.pred %p693, %p686, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p694, %p693, %p691; + or.pred %p695, %p692, %p690; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p696, %r851, %r9712; + setp.le.s32 %p697, %r851, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9763, %r851, %r9712; + sub.s32 %r9764, %r851, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9765, %r9764, 2047; + and.b32 %r9766, %r9763, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p698, %r9766, 0; + setp.eq.b32 %p699, %r9765, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p700, %p15, %p699; + and.pred %p701, %p15, %p698; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p702, %p697, %p595; + and.pred %p703, %p696, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p704, %p703, %p701; + or.pred %p705, %p702, %p700; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p706, %r851, %r9722; + setp.le.s32 %p707, %r851, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9767, %r851, %r9722; + sub.s32 %r9768, %r851, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9769, %r9768, 2047; + and.b32 %r9770, %r9767, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p708, %r9770, 0; + setp.eq.b32 %p709, %r9769, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p710, %p15, %p709; + and.pred %p711, %p15, %p708; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p712, %p707, %p607; + and.pred %p713, %p706, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p714, %p713, %p711; + or.pred %p715, %p712, %p710; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p716, %r854, %r9712; + setp.le.s32 %p717, %r854, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9771, %r854, %r9712; + sub.s32 %r9772, %r854, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9773, %r9772, 2047; + and.b32 %r9774, %r9771, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p718, %r9774, 0; + setp.eq.b32 %p719, %r9773, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p720, %p17, %p719; + and.pred %p721, %p17, %p718; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p722, %p717, %p595; + and.pred %p723, %p716, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p724, %p723, %p721; + or.pred %p725, %p722, %p720; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p726, %r854, %r9722; + setp.le.s32 %p727, %r854, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9775, %r854, %r9722; + sub.s32 %r9776, %r854, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9777, %r9776, 2047; + and.b32 %r9778, %r9775, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p728, %r9778, 0; + setp.eq.b32 %p729, %r9777, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p730, %p17, %p729; + and.pred %p731, %p17, %p728; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p732, %p727, %p607; + and.pred %p733, %p726, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p734, %p733, %p731; + or.pred %p735, %p732, %p730; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p736, %r855, %r9712; + setp.le.s32 %p737, %r855, %r9707; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9779, %r855, %r9712; + sub.s32 %r9780, %r855, %r9707; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9781, %r9780, 2047; + and.b32 %r9782, %r9779, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p738, %r9782, 0; + setp.eq.b32 %p739, %r9781, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p740, %p19, %p739; + and.pred %p741, %p19, %p738; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p742, %p737, %p595; + and.pred %p743, %p736, %p594; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p744, %p743, %p741; + or.pred %p745, %p742, %p740; + .loc 1 698 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:698:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.le.s32 %p746, %r855, %r9722; + setp.le.s32 %p747, %r855, %r9717; + .loc 1 722 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:722:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r9783, %r855, %r9722; + sub.s32 %r9784, %r855, %r9717; + .loc 1 729 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:729:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r9785, %r9784, 2047; + and.b32 %r9786, %r9783, 2047; + .loc 1 730 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:730:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.eq.b32 %p748, %r9786, 0; + setp.eq.b32 %p749, %r9785, 0; + .loc 1 731 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:731:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p750, %p19, %p749; + and.pred %p751, %p19, %p748; + .loc 1 706 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:706:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p752, %p747, %p607; + and.pred %p753, %p746, %p606; + .loc 1 732 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:732:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + or.pred %p754, %p753, %p751; + or.pred %p755, %p752, %p750; + selp.b16 %rs153, 1, 0, %p603; + selp.b16 %rs154, 1, 0, %p602; + st.shared.v2.b8 [%r782], {%rs153, %rs154}; + selp.b16 %rs155, 1, 0, %p685; + selp.b16 %rs156, 1, 0, %p684; + st.shared.v2.b8 [%r782+512], {%rs155, %rs156}; + selp.b16 %rs157, 1, 0, %p615; + selp.b16 %rs158, 1, 0, %p614; + st.shared.v2.b8 [%r783+4096], {%rs157, %rs158}; + selp.b16 %rs159, 1, 0, %p695; + selp.b16 %rs160, 1, 0, %p694; + st.shared.v2.b8 [%r783+4608], {%rs159, %rs160}; + selp.b16 %rs161, 1, 0, %p625; + selp.b16 %rs162, 1, 0, %p624; + st.shared.v2.b8 [%r784+1024], {%rs161, %rs162}; + selp.b16 %rs163, 1, 0, %p705; + selp.b16 %rs164, 1, 0, %p704; + st.shared.v2.b8 [%r784+1536], {%rs163, %rs164}; + selp.b16 %rs165, 1, 0, %p635; + selp.b16 %rs166, 1, 0, %p634; + st.shared.v2.b8 [%r785+5120], {%rs165, %rs166}; + selp.b16 %rs167, 1, 0, %p715; + selp.b16 %rs168, 1, 0, %p714; + st.shared.v2.b8 [%r785+5632], {%rs167, %rs168}; + selp.b16 %rs169, 1, 0, %p645; + selp.b16 %rs170, 1, 0, %p644; + st.shared.v2.b8 [%r786+2048], {%rs169, %rs170}; + selp.b16 %rs171, 1, 0, %p725; + selp.b16 %rs172, 1, 0, %p724; + st.shared.v2.b8 [%r786+2560], {%rs171, %rs172}; + selp.b16 %rs173, 1, 0, %p655; + selp.b16 %rs174, 1, 0, %p654; + st.shared.v2.b8 [%r787+6144], {%rs173, %rs174}; + selp.b16 %rs175, 1, 0, %p735; + selp.b16 %rs176, 1, 0, %p734; + st.shared.v2.b8 [%r787+6656], {%rs175, %rs176}; + selp.b16 %rs177, 1, 0, %p665; + selp.b16 %rs178, 1, 0, %p664; + st.shared.v2.b8 [%r788+3072], {%rs177, %rs178}; + selp.b16 %rs179, 1, 0, %p745; + selp.b16 %rs180, 1, 0, %p744; + st.shared.v2.b8 [%r788+3584], {%rs179, %rs180}; + selp.b16 %rs181, 1, 0, %p675; + selp.b16 %rs182, 1, 0, %p674; + st.shared.v2.b8 [%r789+7168], {%rs181, %rs182}; + selp.b16 %rs183, 1, 0, %p755; + selp.b16 %rs184, 1, 0, %p754; + st.shared.v2.b8 [%r789+7680], {%rs183, %rs184}; + bar.sync 0; + ld.shared.b8 %rs185, [%r791]; + and.b16 %rs186, %rs185, 1; + setp.ne.b16 %p756, %rs186, 0; + ld.shared.b16 %r9787, [%r791+2]; + ld.shared.b8 %rs187, [%r791+1]; + ld.shared.b8 %rs188, [%r792]; + and.b16 %rs189, %rs188, 1; + setp.ne.b16 %p757, %rs189, 0; + ld.shared.b16 %r9788, [%r792+2]; + ld.shared.b8 %rs190, [%r792+1]; + ld.shared.b8 %rs191, [%r793]; + and.b16 %rs192, %rs191, 1; + setp.ne.b16 %p758, %rs192, 0; + ld.shared.b16 %r9789, [%r793+2]; + ld.shared.b8 %rs193, [%r793+1]; + ld.shared.b8 %rs194, [%r794]; + and.b16 %rs195, %rs194, 1; + setp.ne.b16 %p759, %rs195, 0; + ld.shared.b16 %r9790, [%r794+2]; + ld.shared.b8 %rs196, [%r794+1]; + ld.shared.b8 %rs197, [%r795]; + and.b16 %rs198, %rs197, 1; + setp.ne.b16 %p760, %rs198, 0; + ld.shared.b16 %r9791, [%r795+2]; + ld.shared.b8 %rs199, [%r795+1]; + ld.shared.b8 %rs200, [%r796]; + and.b16 %rs201, %rs200, 1; + setp.ne.b16 %p761, %rs201, 0; + ld.shared.b16 %r9792, [%r796+2]; + ld.shared.b8 %rs202, [%r796+1]; + ld.shared.b8 %rs203, [%r797]; + and.b16 %rs204, %rs203, 1; + setp.ne.b16 %p762, %rs204, 0; + ld.shared.b16 %r9793, [%r797+2]; + ld.shared.b8 %rs205, [%r797+1]; + ld.shared.b8 %rs206, [%r798]; + and.b16 %rs207, %rs206, 1; + setp.ne.b16 %p763, %rs207, 0; + ld.shared.b16 %r9794, [%r798+2]; + ld.shared.b8 %rs208, [%r798+1]; + and.b16 %rs209, %rs187, 1; + setp.ne.b16 %p764, %rs209, 0; + and.b32 %r9795, %r9787, 1; + setp.ne.b32 %p765, %r9795, 0; + shr.u32 %r9796, %r9787, 8; + and.b32 %r9797, %r9796, 1; + setp.ne.b32 %p766, %r9797, 0; + and.b16 %rs210, %rs190, 1; + setp.ne.b16 %p767, %rs210, 0; + and.b32 %r9798, %r9788, 1; + setp.ne.b32 %p768, %r9798, 0; + shr.u32 %r9799, %r9788, 8; + and.b32 %r9800, %r9799, 1; + setp.ne.b32 %p769, %r9800, 0; + and.b16 %rs211, %rs193, 1; + setp.ne.b16 %p770, %rs211, 0; + and.b32 %r9801, %r9789, 1; + setp.ne.b32 %p771, %r9801, 0; + shr.u32 %r9802, %r9789, 8; + and.b32 %r9803, %r9802, 1; + setp.ne.b32 %p772, %r9803, 0; + and.b16 %rs212, %rs196, 1; + setp.ne.b16 %p773, %rs212, 0; + and.b32 %r9804, %r9790, 1; + setp.ne.b32 %p774, %r9804, 0; + shr.u32 %r9805, %r9790, 8; + and.b32 %r9806, %r9805, 1; + setp.ne.b32 %p775, %r9806, 0; + and.b16 %rs213, %rs199, 1; + setp.ne.b16 %p776, %rs213, 0; + and.b32 %r9807, %r9791, 1; + setp.ne.b32 %p777, %r9807, 0; + shr.u32 %r9808, %r9791, 8; + and.b32 %r9809, %r9808, 1; + setp.ne.b32 %p778, %r9809, 0; + and.b16 %rs214, %rs202, 1; + setp.ne.b16 %p779, %rs214, 0; + and.b32 %r9810, %r9792, 1; + setp.ne.b32 %p780, %r9810, 0; + shr.u32 %r9811, %r9792, 8; + and.b32 %r9812, %r9811, 1; + setp.ne.b32 %p781, %r9812, 0; + and.b16 %rs215, %rs205, 1; + setp.ne.b16 %p782, %rs215, 0; + and.b32 %r9813, %r9793, 1; + setp.ne.b32 %p783, %r9813, 0; + shr.u32 %r9814, %r9793, 8; + and.b32 %r9815, %r9814, 1; + setp.ne.b32 %p784, %r9815, 0; + and.b16 %rs216, %rs208, 1; + setp.ne.b16 %p785, %rs216, 0; + and.b32 %r9816, %r9794, 1; + setp.ne.b32 %p786, %r9816, 0; + shr.u32 %r9817, %r9794, 8; + and.b32 %r9818, %r9817, 1; + setp.ne.b32 %p787, %r9818, 0; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p788, %p756, %p568; + and.pred %p789, %p764, %p568; + and.pred %p790, %p765, %p568; + and.pred %p791, %p766, %p568; + and.pred %p792, %p757, %p569; + and.pred %p793, %p767, %p569; + and.pred %p794, %p768, %p569; + and.pred %p795, %p769, %p569; + and.pred %p796, %p758, %p570; + and.pred %p797, %p770, %p570; + and.pred %p798, %p771, %p570; + and.pred %p799, %p772, %p570; + and.pred %p800, %p759, %p571; + and.pred %p801, %p773, %p571; + and.pred %p802, %p774, %p571; + and.pred %p803, %p775, %p571; + and.pred %p804, %p760, %p572; + and.pred %p805, %p776, %p572; + and.pred %p806, %p777, %p572; + and.pred %p807, %p778, %p572; + and.pred %p808, %p761, %p573; + and.pred %p809, %p779, %p573; + and.pred %p810, %p780, %p573; + and.pred %p811, %p781, %p573; + and.pred %p812, %p762, %p574; + and.pred %p813, %p782, %p574; + and.pred %p814, %p783, %p574; + and.pred %p815, %p784, %p574; + and.pred %p816, %p763, %p575; + and.pred %p817, %p785, %p575; + and.pred %p818, %p786, %p575; + and.pred %p819, %p787, %p575; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9819, %r9671, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9820, %r9819, 0fFF800000, %p788; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9821, %r9672, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9822, %r9821, 0fFF800000, %p789; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9823, %r9673, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9824, %r9823, 0fFF800000, %p790; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9825, %r9674, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9826, %r9825, 0fFF800000, %p791; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9827, %r9675, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9828, %r9827, 0fFF800000, %p792; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9829, %r9676, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9830, %r9829, 0fFF800000, %p793; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9831, %r9677, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9832, %r9831, 0fFF800000, %p794; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9833, %r9678, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9834, %r9833, 0fFF800000, %p795; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9835, %r9679, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9836, %r9835, 0fFF800000, %p796; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9837, %r9680, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9838, %r9837, 0fFF800000, %p797; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9839, %r9681, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9840, %r9839, 0fFF800000, %p798; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9841, %r9682, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9842, %r9841, 0fFF800000, %p799; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9843, %r9683, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9844, %r9843, 0fFF800000, %p800; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9845, %r9684, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9846, %r9845, 0fFF800000, %p801; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9847, %r9685, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9848, %r9847, 0fFF800000, %p802; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9849, %r9686, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9850, %r9849, 0fFF800000, %p803; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9851, %r9687, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9852, %r9851, 0fFF800000, %p804; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9853, %r9688, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9854, %r9853, 0fFF800000, %p805; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9855, %r9689, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9856, %r9855, 0fFF800000, %p806; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9857, %r9690, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9858, %r9857, 0fFF800000, %p807; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9859, %r9691, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9860, %r9859, 0fFF800000, %p808; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9861, %r9692, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9862, %r9861, 0fFF800000, %p809; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9863, %r9693, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9864, %r9863, 0fFF800000, %p810; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9865, %r9694, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9866, %r9865, 0fFF800000, %p811; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9867, %r9695, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9868, %r9867, 0fFF800000, %p812; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9869, %r9696, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9870, %r9869, 0fFF800000, %p813; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9871, %r9697, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9872, %r9871, 0fFF800000, %p814; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9873, %r9698, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9874, %r9873, 0fFF800000, %p815; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9875, %r9699, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9876, %r9875, 0fFF800000, %p816; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9877, %r9700, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9878, %r9877, 0fFF800000, %p817; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9879, %r9701, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9880, %r9879, 0fFF800000, %p818; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r9881, %r9702, 0f3FB8AA3B; + .loc 1 736 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:736:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.f32 %r9882, %r9881, 0fFF800000, %p819; + .loc 1 740 40 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:740:40 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.f32 %r9883, %r9820, %r9614; + sub.f32 %r9884, %r9822, %r9615; + sub.f32 %r9885, %r9824, %r9614; + sub.f32 %r9886, %r9826, %r9615; + sub.f32 %r9887, %r9828, %r9616; + sub.f32 %r9888, %r9830, %r9617; + sub.f32 %r9889, %r9832, %r9616; + sub.f32 %r9890, %r9834, %r9617; + sub.f32 %r9891, %r9836, %r9618; + sub.f32 %r9892, %r9838, %r9619; + sub.f32 %r9893, %r9840, %r9618; + sub.f32 %r9894, %r9842, %r9619; + sub.f32 %r9895, %r9844, %r9620; + sub.f32 %r9896, %r9846, %r9621; + sub.f32 %r9897, %r9848, %r9620; + sub.f32 %r9898, %r9850, %r9621; + sub.f32 %r9899, %r9852, %r9622; + sub.f32 %r9900, %r9854, %r9623; + sub.f32 %r9901, %r9856, %r9622; + sub.f32 %r9902, %r9858, %r9623; + sub.f32 %r9903, %r9860, %r9624; + sub.f32 %r9904, %r9862, %r9625; + sub.f32 %r9905, %r9864, %r9624; + sub.f32 %r9906, %r9866, %r9625; + sub.f32 %r9907, %r9868, %r9626; + sub.f32 %r9908, %r9870, %r9627; + sub.f32 %r9909, %r9872, %r9626; + sub.f32 %r9910, %r9874, %r9627; + sub.f32 %r9911, %r9876, %r9628; + sub.f32 %r9912, %r9878, %r9629; + sub.f32 %r9913, %r9880, %r9628; + sub.f32 %r9914, %r9882, %r9629; + .loc 1 740 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:740:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + ex2.approx.ftz.f32 %r9915, %r9883; + ex2.approx.ftz.f32 %r9916, %r9884; + ex2.approx.ftz.f32 %r9917, %r9885; + ex2.approx.ftz.f32 %r9918, %r9886; + ex2.approx.ftz.f32 %r9919, %r9887; + ex2.approx.ftz.f32 %r9920, %r9888; + ex2.approx.ftz.f32 %r9921, %r9889; + ex2.approx.ftz.f32 %r9922, %r9890; + ex2.approx.ftz.f32 %r9923, %r9891; + ex2.approx.ftz.f32 %r9924, %r9892; + ex2.approx.ftz.f32 %r9925, %r9893; + ex2.approx.ftz.f32 %r9926, %r9894; + ex2.approx.ftz.f32 %r9927, %r9895; + ex2.approx.ftz.f32 %r9928, %r9896; + ex2.approx.ftz.f32 %r9929, %r9897; + ex2.approx.ftz.f32 %r9930, %r9898; + ex2.approx.ftz.f32 %r9931, %r9899; + ex2.approx.ftz.f32 %r9932, %r9900; + ex2.approx.ftz.f32 %r9933, %r9901; + ex2.approx.ftz.f32 %r9934, %r9902; + ex2.approx.ftz.f32 %r9935, %r9903; + ex2.approx.ftz.f32 %r9936, %r9904; + ex2.approx.ftz.f32 %r9937, %r9905; + ex2.approx.ftz.f32 %r9938, %r9906; + ex2.approx.ftz.f32 %r9939, %r9907; + ex2.approx.ftz.f32 %r9940, %r9908; + ex2.approx.ftz.f32 %r9941, %r9909; + ex2.approx.ftz.f32 %r9942, %r9910; + ex2.approx.ftz.f32 %r9943, %r9911; + ex2.approx.ftz.f32 %r9944, %r9912; + ex2.approx.ftz.f32 %r9945, %r9913; + ex2.approx.ftz.f32 %r9946, %r9914; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r9947, %r7117, 49152; + add.s32 %r8971, %r9947, %r9592; + .loc 1 744 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:744:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16x2.f32 %r8056, %r9916, %r9915; + cvt.rn.bf16x2.f32 %r8057, %r9918, %r9917; + cvt.rn.bf16x2.f32 %r8058, %r9920, %r9919; + cvt.rn.bf16x2.f32 %r8059, %r9922, %r9921; + cvt.rn.bf16x2.f32 %r8188, %r9924, %r9923; + cvt.rn.bf16x2.f32 %r8189, %r9926, %r9925; + cvt.rn.bf16x2.f32 %r8190, %r9928, %r9927; + cvt.rn.bf16x2.f32 %r8191, %r9930, %r9929; + cvt.rn.bf16x2.f32 %r8320, %r9932, %r9931; + cvt.rn.bf16x2.f32 %r8321, %r9934, %r9933; + cvt.rn.bf16x2.f32 %r8322, %r9936, %r9935; + cvt.rn.bf16x2.f32 %r8323, %r9938, %r9937; + cvt.rn.bf16x2.f32 %r8452, %r9940, %r9939; + cvt.rn.bf16x2.f32 %r8453, %r9942, %r9941; + cvt.rn.bf16x2.f32 %r8454, %r9944, %r9943; + cvt.rn.bf16x2.f32 %r8455, %r9946, %r9945; + .loc 1 744 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:744:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + wgmma.fence.sync.aligned; + bfe.u32 %r9948, %r8971, 4, 14; + cvt.u64.u32 %rd724, %r9948; + or.b64 %rd650, %rd724, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r8056,%r8057,%r8058,%r8059}, %rd650, %p525, 1, 1, 1; + // end inline asm + add.s32 %r9949, %r8971, 2048; + bfe.u32 %r9950, %r9949, 4, 14; + cvt.u64.u32 %rd725, %r9950; + or.b64 %rd651, %rd725, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r8188,%r8189,%r8190,%r8191}, %rd651, %p525, 1, 1, 1; + // end inline asm + add.s32 %r9951, %r8971, 4096; + bfe.u32 %r9952, %r9951, 4, 14; + cvt.u64.u32 %rd726, %r9952; + or.b64 %rd652, %rd726, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r8320,%r8321,%r8322,%r8323}, %rd652, %p525, 1, 1, 1; + // end inline asm + add.s32 %r9953, %r8971, 6144; + bfe.u32 %r9954, %r9953, 4, 14; + cvt.u64.u32 %rd727, %r9954; + or.b64 %rd653, %rd727, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r8452,%r8453,%r8454,%r8455}, %rd653, %p525, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r9955, %r7117, 107008; + add.s32 %r9956, %r9955, %r9594; + add.s32 %r9957, %r9956, %r703; + ld.shared.v2.b32 {%r9958, %r9959}, [%r9957]; + ld.shared.v2.b32 {%r9960, %r9961}, [%r9957+32]; + ld.shared.v2.b32 {%r9962, %r9963}, [%r9957+64]; + ld.shared.v2.b32 {%r9964, %r9965}, [%r9957+96]; + ld.shared.v2.b32 {%r9966, %r9967}, [%r9957+128]; + ld.shared.v2.b32 {%r9968, %r9969}, [%r9957+160]; + ld.shared.v2.b32 {%r9970, %r9971}, [%r9957+192]; + ld.shared.v2.b32 {%r9972, %r9973}, [%r9957+224]; + .loc 1 750 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:750:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + wgmma.fence.sync.aligned; + add.s32 %r8968, %r7117, 140288; + add.s32 %r9974, %r9632, %r8968; + bfe.u32 %r9975, %r9974, 4, 14; + cvt.u64.u32 %rd728, %r9975; + or.b64 %rd654, %rd728, 4611686293372403712; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd654, %rd650, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r9976, %r9636, %r8968; + bfe.u32 %r9977, %r9976, 4, 14; + cvt.u64.u32 %rd729, %r9977; + or.b64 %rd656, %rd729, 4611686293372403712; + add.s32 %r9978, %r8971, 32; + bfe.u32 %r9979, %r9978, 4, 14; + cvt.u64.u32 %rd730, %r9979; + or.b64 %rd657, %rd730, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd656, %rd657, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r9980, %r9641, %r8968; + bfe.u32 %r9981, %r9980, 4, 14; + cvt.u64.u32 %rd731, %r9981; + or.b64 %rd658, %rd731, 4611686293372403712; + add.s32 %r9982, %r8971, 64; + bfe.u32 %r9983, %r9982, 4, 14; + cvt.u64.u32 %rd732, %r9983; + or.b64 %rd659, %rd732, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd658, %rd659, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r9984, %r9646, %r8968; + bfe.u32 %r9985, %r9984, 4, 14; + cvt.u64.u32 %rd733, %r9985; + or.b64 %rd660, %rd733, 4611686293372403712; + add.s32 %r9986, %r8971, 96; + bfe.u32 %r9987, %r9986, 4, 14; + cvt.u64.u32 %rd734, %r9987; + or.b64 %rd661, %rd734, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd660, %rd661, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r9988, %r9651, %r8968; + bfe.u32 %r9989, %r9988, 4, 14; + cvt.u64.u32 %rd735, %r9989; + or.b64 %rd662, %rd735, 4611686293372403712; + add.s32 %r9990, %r8971, 8192; + bfe.u32 %r9991, %r9990, 4, 14; + cvt.u64.u32 %rd736, %r9991; + or.b64 %rd663, %rd736, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd662, %rd663, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r9992, %r9656, %r8968; + bfe.u32 %r9993, %r9992, 4, 14; + cvt.u64.u32 %rd737, %r9993; + or.b64 %rd664, %rd737, 4611686293372403712; + add.s32 %r9994, %r8971, 8224; + bfe.u32 %r9995, %r9994, 4, 14; + cvt.u64.u32 %rd738, %r9995; + or.b64 %rd665, %rd738, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd664, %rd665, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r9996, %r9661, %r8968; + bfe.u32 %r9997, %r9996, 4, 14; + cvt.u64.u32 %rd739, %r9997; + or.b64 %rd666, %rd739, 4611686293372403712; + add.s32 %r9998, %r8971, 8256; + bfe.u32 %r9999, %r9998, 4, 14; + cvt.u64.u32 %rd740, %r9999; + or.b64 %rd667, %rd740, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd666, %rd667, %p525, 1, 1, 0, 0; + // end inline asm + add.s32 %r10000, %r9666, %r8968; + bfe.u32 %r10001, %r10000, 4, 14; + cvt.u64.u32 %rd741, %r10001; + or.b64 %rd668, %rd741, 4611686293372403712; + add.s32 %r10002, %r8971, 8288; + bfe.u32 %r10003, %r10002, 4, 14; + cvt.u64.u32 %rd742, %r10003; + or.b64 %rd669, %rd742, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583}, %rd668, %rd669, %p525, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r8973, %r7923; + mov.b32 %r8969, %r7923; + mov.b32 %r8970, %r7923; + mov.b32 %r8972, %r7923; + // begin inline asm + // wait for regs: %r8552,%r8553,%r8554,%r8555,%r8556,%r8557,%r8558,%r8559,%r8560,%r8561,%r8562,%r8563,%r8564,%r8565,%r8566,%r8567,%r8568,%r8569,%r8570,%r8571,%r8572,%r8573,%r8574,%r8575,%r8576,%r8577,%r8578,%r8579,%r8580,%r8581,%r8582,%r8583,%r8968,%r8969,%r8970,%r8971,%r8972,%r8973 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.f32 %r10004, %r8552, %r9958; + sub.f32 %r10005, %r8553, %r9959; + sub.f32 %r10006, %r8554, %r9958; + sub.f32 %r10007, %r8555, %r9959; + sub.f32 %r10008, %r8556, %r9960; + sub.f32 %r10009, %r8557, %r9961; + sub.f32 %r10010, %r8558, %r9960; + sub.f32 %r10011, %r8559, %r9961; + sub.f32 %r10012, %r8560, %r9962; + sub.f32 %r10013, %r8561, %r9963; + sub.f32 %r10014, %r8562, %r9962; + sub.f32 %r10015, %r8563, %r9963; + sub.f32 %r10016, %r8564, %r9964; + sub.f32 %r10017, %r8565, %r9965; + sub.f32 %r10018, %r8566, %r9964; + sub.f32 %r10019, %r8567, %r9965; + sub.f32 %r10020, %r8568, %r9966; + sub.f32 %r10021, %r8569, %r9967; + sub.f32 %r10022, %r8570, %r9966; + sub.f32 %r10023, %r8571, %r9967; + sub.f32 %r10024, %r8572, %r9968; + sub.f32 %r10025, %r8573, %r9969; + sub.f32 %r10026, %r8574, %r9968; + sub.f32 %r10027, %r8575, %r9969; + sub.f32 %r10028, %r8576, %r9970; + sub.f32 %r10029, %r8577, %r9971; + sub.f32 %r10030, %r8578, %r9970; + sub.f32 %r10031, %r8579, %r9971; + sub.f32 %r10032, %r8580, %r9972; + sub.f32 %r10033, %r8581, %r9973; + sub.f32 %r10034, %r8582, %r9972; + sub.f32 %r10035, %r8583, %r9973; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.f32 %r10036, %r9915, %r10004; + mul.f32 %r10037, %r9916, %r10005; + mul.f32 %r10038, %r9917, %r10006; + mul.f32 %r10039, %r9918, %r10007; + mul.f32 %r10040, %r9919, %r10008; + mul.f32 %r10041, %r9920, %r10009; + mul.f32 %r10042, %r9921, %r10010; + mul.f32 %r10043, %r9922, %r10011; + mul.f32 %r10044, %r9923, %r10012; + mul.f32 %r10045, %r9924, %r10013; + mul.f32 %r10046, %r9925, %r10014; + mul.f32 %r10047, %r9926, %r10015; + mul.f32 %r10048, %r9927, %r10016; + mul.f32 %r10049, %r9928, %r10017; + mul.f32 %r10050, %r9929, %r10018; + mul.f32 %r10051, %r9930, %r10019; + mul.f32 %r10052, %r9931, %r10020; + mul.f32 %r10053, %r9932, %r10021; + mul.f32 %r10054, %r9933, %r10022; + mul.f32 %r10055, %r9934, %r10023; + mul.f32 %r10056, %r9935, %r10024; + mul.f32 %r10057, %r9936, %r10025; + mul.f32 %r10058, %r9937, %r10026; + mul.f32 %r10059, %r9938, %r10027; + mul.f32 %r10060, %r9939, %r10028; + mul.f32 %r10061, %r9940, %r10029; + mul.f32 %r10062, %r9941, %r10030; + mul.f32 %r10063, %r9942, %r10031; + mul.f32 %r10064, %r9943, %r10032; + mul.f32 %r10065, %r9944, %r10033; + mul.f32 %r10066, %r9945, %r10034; + mul.f32 %r10067, %r9946, %r10035; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs217, %r10036; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs218, %rs217, 0x0000, %p788; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs219, %r10037; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs220, %rs219, 0x0000, %p789; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs221, %r10038; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs222, %rs221, 0x0000, %p790; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs223, %r10039; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs224, %rs223, 0x0000, %p791; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs225, %r10040; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs226, %rs225, 0x0000, %p792; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs227, %r10041; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs228, %rs227, 0x0000, %p793; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs229, %r10042; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs230, %rs229, 0x0000, %p794; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs231, %r10043; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs232, %rs231, 0x0000, %p795; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs233, %r10044; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs234, %rs233, 0x0000, %p796; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs235, %r10045; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs236, %rs235, 0x0000, %p797; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs237, %r10046; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs238, %rs237, 0x0000, %p798; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs239, %r10047; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs240, %rs239, 0x0000, %p799; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs241, %r10048; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs242, %rs241, 0x0000, %p800; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs243, %r10049; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs244, %rs243, 0x0000, %p801; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs245, %r10050; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs246, %rs245, 0x0000, %p802; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs247, %r10051; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs248, %rs247, 0x0000, %p803; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs249, %r10052; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs250, %rs249, 0x0000, %p804; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs251, %r10053; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs252, %rs251, 0x0000, %p805; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs253, %r10054; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs254, %rs253, 0x0000, %p806; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs255, %r10055; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs256, %rs255, 0x0000, %p807; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs257, %r10056; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs258, %rs257, 0x0000, %p808; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs259, %r10057; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs260, %rs259, 0x0000, %p809; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs261, %r10058; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs262, %rs261, 0x0000, %p810; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs263, %r10059; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs264, %rs263, 0x0000, %p811; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs265, %r10060; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs266, %rs265, 0x0000, %p812; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs267, %r10061; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs268, %rs267, 0x0000, %p813; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs269, %r10062; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs270, %rs269, 0x0000, %p814; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs271, %r10063; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs272, %rs271, 0x0000, %p815; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs273, %r10064; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs274, %rs273, 0x0000, %p816; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs275, %r10065; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs276, %rs275, 0x0000, %p817; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs277, %r10066; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs278, %rs277, 0x0000, %p818; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + cvt.rn.bf16.f32 %rs279, %r10067; + .loc 1 773 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:773:45 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + selp.b16 %rs280, %rs279, 0x0000, %p819; + .loc 1 775 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mov.b32 %r9140, {%rs218, %rs220}; + mov.b32 %r9141, {%rs222, %rs224}; + mov.b32 %r9142, {%rs226, %rs228}; + mov.b32 %r9143, {%rs230, %rs232}; + mov.b32 %r9272, {%rs234, %rs236}; + mov.b32 %r9273, {%rs238, %rs240}; + mov.b32 %r9274, {%rs242, %rs244}; + mov.b32 %r9275, {%rs246, %rs248}; + mov.b32 %r9404, {%rs250, %rs252}; + mov.b32 %r9405, {%rs254, %rs256}; + mov.b32 %r9406, {%rs258, %rs260}; + mov.b32 %r9407, {%rs262, %rs264}; + mov.b32 %r9536, {%rs266, %rs268}; + mov.b32 %r9537, {%rs270, %rs272}; + mov.b32 %r9538, {%rs274, %rs276}; + mov.b32 %r9539, {%rs278, %rs280}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r9140,%r9141,%r9142,%r9143}, %rd635, %p525, 1, 1, 1; + // end inline asm + add.s32 %r10068, %r7925, 2048; + bfe.u32 %r10069, %r10068, 4, 14; + cvt.u64.u32 %rd743, %r10069; + or.b64 %rd671, %rd743, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r9272,%r9273,%r9274,%r9275}, %rd671, %p525, 1, 1, 1; + // end inline asm + add.s32 %r10070, %r7925, 4096; + bfe.u32 %r10071, %r10070, 4, 14; + cvt.u64.u32 %rd744, %r10071; + or.b64 %rd672, %rd744, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r9404,%r9405,%r9406,%r9407}, %rd672, %p525, 1, 1, 1; + // end inline asm + add.s32 %r10072, %r7925, 6144; + bfe.u32 %r10073, %r10072, 4, 14; + cvt.u64.u32 %rd745, %r10073; + or.b64 %rd673, %rd745, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r9536,%r9537,%r9538,%r9539}, %rd673, %p525, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 628 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:628:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r14121, %r14121, %r14122; + add.s32 %r14120, %r14120, %r14122; + add.s32 %r14119, %r14119, %r14122; + add.s32 %r14118, %r14118, %r14122; + add.s32 %r14117, %r14117, %r14122; + add.s32 %r14116, %r14116, %r14122; + add.s32 %r14115, %r14115, %r14122; + add.s32 %r14114, %r14114, %r14122; + add.s32 %r14272, %r14122, %r14272; + add.s32 %r14273, %r14122, %r14273; + add.s32 %r14274, %r14122, %r14274; + add.s32 %r14275, %r14122, %r14275; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r1289, %r14271, 1; + .loc 1 788 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:788:33 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shr.u32 %r10074, %r1289, 1; + .loc 1 789 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mad.wide.u32 %rd675, %r10074, 4, %rd516; + .loc 1 789 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + mov.u64 %rd674, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd674, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9540, 0x0; + @%p547 ld.global.L1::evict_last.L2::cache_hint.b32 { %r9540 }, [ %rd675 + 0 ], %rd674; + // end inline asm + .loc 1 790 109 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:109 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r10075, %r10074, 1; + .loc 1 790 113 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:113 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p820, %r10075, %r7092; + .loc 1 790 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:55 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd678, %rd675, 4; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.pred %p548, %p547, %p820; + .loc 1 790 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + mov.u64 %rd677, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd677, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9541, 0x0; + @%p548 ld.global.L1::evict_last.L2::cache_hint.b32 { %r9541 }, [ %rd678 + 0 ], %rd677; + // end inline asm + .loc 1 791 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:791:35 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + and.b32 %r10076, %r14271, 1; + .loc 1 792 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:34 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + sub.s32 %r10077, %r9541, %r9540; + .loc 1 792 48 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:48 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10078, %r10077, 7; + .loc 1 792 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r10079, %r10078, -64; + .loc 1 793 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + xor.b32 %r10080, %r10076, 1; + .loc 1 793 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10081, %r10076, 6; + .loc 1 793 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mad.lo.s32 %r14122, %r10079, %r10080, %r10081; + .loc 1 626 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10082, %r14122, 12; + .loc 1 626 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.wide.s32 %rd746, %r10082, 2; + add.s64 %rd1013, %rd1013, %rd746; + add.s64 %rd1012, %rd1012, %rd746; + add.s64 %rd1011, %rd1011, %rd746; + add.s64 %rd1010, %rd1010, %rd746; + .loc 1 627 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10083, %r14122, 7; + .loc 1 627 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.wide.s32 %rd747, %r10083, 2; + add.s64 %rd1009, %rd1009, %rd747; + add.s64 %rd1008, %rd1008, %rd747; + add.s64 %rd1007, %rd1007, %rd747; + add.s64 %rd1006, %rd1006, %rd747; + .loc 1 628 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:628:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r14134, %r14122, %r14134; + add.s32 %r14133, %r14122, %r14133; + add.s32 %r14132, %r14122, %r14132; + add.s32 %r14131, %r14122, %r14131; + add.s32 %r14130, %r14122, %r14130; + add.s32 %r14129, %r14122, %r14129; + add.s32 %r14128, %r14122, %r14128; + add.s32 %r14127, %r14122, %r14127; + add.s32 %r14139, %r14122, %r14139; + add.s32 %r14140, %r14122, %r14140; + add.s32 %r14141, %r14122, %r14141; + add.s32 %r14142, %r14122, %r14142; + add.s32 %r14135, %r14122, %r14135; + add.s32 %r14136, %r14122, %r14136; + add.s32 %r14137, %r14122, %r14137; + add.s32 %r14138, %r14122, %r14138; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r10084, %r14124, 1; + setp.gt.s32 %p821, %r10084, 1; + selp.b32 %r14124, 0, %r10084, %p821; + add.s32 %r10085, %r14126, 1; + setp.gt.s32 %p822, %r10085, 2; + selp.b32 %r14126, 0, %r10085, %p822; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p823, %r14139, 2048; + setp.lt.s32 %p824, %r14140, 2048; + setp.lt.s32 %p825, %r14141, 2048; + setp.lt.s32 %p826, %r14142, 2048; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10086, %r14126, 14; + add.s32 %r10087, %r7117, %r10086; + bar.sync 0; + add.s32 %r9542, %r10087, %r693; + selp.b32 %r10088, 16, 0, %p823; + selp.b32 %r9543, %r10088, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9542 + 0 ], [ %rd1013 + 0 ], 0x10, %r9543; + // end inline asm + add.s32 %r9544, %r9542, 2048; + selp.b32 %r10089, 16, 0, %p824; + selp.b32 %r9545, %r10089, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9544 + 0 ], [ %rd1012 + 0 ], 0x10, %r9545; + // end inline asm + add.s32 %r9546, %r9542, 4096; + selp.b32 %r10090, 16, 0, %p825; + selp.b32 %r9547, %r10090, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9546 + 0 ], [ %rd1011 + 0 ], 0x10, %r9547; + // end inline asm + add.s32 %r9548, %r9542, 6144; + selp.b32 %r10091, 16, 0, %p826; + selp.b32 %r9549, %r10091, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9548 + 0 ], [ %rd1010 + 0 ], 0x10, %r9549; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p827, %r14134, 2048; + setp.lt.s32 %p828, %r14133, 2048; + setp.lt.s32 %p829, %r14132, 2048; + setp.lt.s32 %p830, %r14131, 2048; + setp.lt.s32 %p831, %r14130, 2048; + setp.lt.s32 %p832, %r14129, 2048; + setp.lt.s32 %p833, %r14128, 2048; + setp.lt.s32 %p834, %r14127, 2048; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + mul.wide.s32 %rd748, %r14134, 4; + add.s64 %rd684, %rd105, %rd748; + mul.wide.s32 %rd749, %r14133, 4; + add.s64 %rd685, %rd105, %rd749; + mul.wide.s32 %rd750, %r14132, 4; + add.s64 %rd686, %rd105, %rd750; + mul.wide.s32 %rd751, %r14131, 4; + add.s64 %rd687, %rd105, %rd751; + mul.wide.s32 %rd752, %r14130, 4; + add.s64 %rd688, %rd105, %rd752; + mul.wide.s32 %rd753, %r14129, 4; + add.s64 %rd689, %rd105, %rd753; + mul.wide.s32 %rd754, %r14128, 4; + add.s64 %rd690, %rd105, %rd754; + mul.wide.s32 %rd755, %r14127, 4; + add.s64 %rd691, %rd105, %rd755; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + shl.b32 %r10092, %r14124, 8; + add.s32 %r10093, %r9595, %r10092; + add.s32 %r9550, %r10093, %r703; + selp.b32 %r10094, 8, 0, %p827; + selp.b32 %r9575, %r10094, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9550 + 0 ], [ %rd684 + 0 ], 0x8, %r9575; + // end inline asm + add.s32 %r9552, %r9550, 32; + selp.b32 %r10095, 8, 0, %p828; + selp.b32 %r9577, %r10095, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9552 + 0 ], [ %rd685 + 0 ], 0x8, %r9577; + // end inline asm + add.s32 %r9554, %r9550, 64; + selp.b32 %r10096, 8, 0, %p829; + selp.b32 %r9579, %r10096, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9554 + 0 ], [ %rd686 + 0 ], 0x8, %r9579; + // end inline asm + add.s32 %r9556, %r9550, 96; + selp.b32 %r10097, 8, 0, %p830; + selp.b32 %r9581, %r10097, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9556 + 0 ], [ %rd687 + 0 ], 0x8, %r9581; + // end inline asm + add.s32 %r9558, %r9550, 128; + selp.b32 %r10098, 8, 0, %p831; + selp.b32 %r9583, %r10098, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9558 + 0 ], [ %rd688 + 0 ], 0x8, %r9583; + // end inline asm + add.s32 %r9560, %r9550, 160; + selp.b32 %r10099, 8, 0, %p832; + selp.b32 %r9585, %r10099, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9560 + 0 ], [ %rd689 + 0 ], 0x8, %r9585; + // end inline asm + add.s32 %r9562, %r9550, 192; + selp.b32 %r10100, 8, 0, %p833; + selp.b32 %r9587, %r10100, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9562 + 0 ], [ %rd690 + 0 ], 0x8, %r9587; + // end inline asm + add.s32 %r9564, %r9550, 224; + selp.b32 %r10101, 8, 0, %p834; + selp.b32 %r9589, %r10101, 0, %p565; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9564 + 0 ], [ %rd691 + 0 ], 0x8, %r9589; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.lt.s32 %p835, %r14135, 2048; + setp.lt.s32 %p836, %r14136, 2048; + setp.lt.s32 %p837, %r14137, 2048; + setp.lt.s32 %p838, %r14138, 2048; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r10102, %r9947, %r10086; + add.s32 %r9566, %r10102, %r693; + selp.b32 %r10103, 16, 0, %p835; + selp.b32 %r9567, %r10103, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9566 + 0 ], [ %rd1009 + 0 ], 0x10, %r9567; + // end inline asm + add.s32 %r9568, %r9566, 2048; + selp.b32 %r10104, 16, 0, %p836; + selp.b32 %r9569, %r10104, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9568 + 0 ], [ %rd1008 + 0 ], 0x10, %r9569; + // end inline asm + add.s32 %r9570, %r9566, 4096; + selp.b32 %r10105, 16, 0, %p837; + selp.b32 %r9571, %r10105, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9570 + 0 ], [ %rd1007 + 0 ], 0x10, %r9571; + // end inline asm + add.s32 %r9572, %r9566, 6144; + selp.b32 %r10106, 16, 0, %p838; + selp.b32 %r9573, %r10106, 0, %p565; + // begin inline asm + cp.async.cg.shared.global [ %r9572 + 0 ], [ %rd1006 + 0 ], 0x10, %r9573; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s64 %rd696, %rd106, %rd748; + add.s64 %rd697, %rd106, %rd749; + add.s64 %rd698, %rd106, %rd750; + add.s64 %rd699, %rd106, %rd751; + add.s64 %rd700, %rd106, %rd752; + add.s64 %rd701, %rd106, %rd753; + add.s64 %rd702, %rd106, %rd754; + add.s64 %rd703, %rd106, %rd755; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + add.s32 %r10107, %r9955, %r10092; + add.s32 %r9574, %r10107, %r703; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9574 + 0 ], [ %rd696 + 0 ], 0x8, %r9575; + // end inline asm + add.s32 %r9576, %r9574, 32; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9576 + 0 ], [ %rd697 + 0 ], 0x8, %r9577; + // end inline asm + add.s32 %r9578, %r9574, 64; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9578 + 0 ], [ %rd698 + 0 ], 0x8, %r9579; + // end inline asm + add.s32 %r9580, %r9574, 96; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9580 + 0 ], [ %rd699 + 0 ], 0x8, %r9581; + // end inline asm + add.s32 %r9582, %r9574, 128; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9582 + 0 ], [ %rd700 + 0 ], 0x8, %r9583; + // end inline asm + add.s32 %r9584, %r9574, 160; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9584 + 0 ], [ %rd701 + 0 ], 0x8, %r9585; + // end inline asm + add.s32 %r9586, %r9574, 192; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9586 + 0 ], [ %rd702 + 0 ], 0x8, %r9587; + // end inline asm + add.s32 %r9588, %r9574, 224; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r9588 + 0 ], [ %rd703 + 0 ], 0x8, %r9589; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + setp.ne.b32 %p839, %r837, %r1289; + mov.b32 %r14271, %r1289; + @%p839 bra $L__BB0_11; +$L__BB0_12: // %._crit_edge1670 + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:28 + setp.lt.s32 %p872, %r691, 1; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:298:16 ] + // begin inline asm + // wait for regs: %r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206,%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp18: + .loc 1 601 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd805, %rd103, %rd804; + add.s64 %rd807, %rd103, %rd806; + add.s64 %rd809, %rd103, %rd808; + add.s64 %rd811, %rd103, %rd810; + .loc 1 601 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:601:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd756, %rd805, %rd616; + add.s64 %rd757, %rd807, %rd616; + add.s64 %rd758, %rd809, %rd616; + add.s64 %rd759, %rd811, %rd616; + .loc 1 602 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd814, %rd104, %rd813; + add.s64 %rd816, %rd104, %rd815; + add.s64 %rd818, %rd104, %rd817; + add.s64 %rd820, %rd104, %rd819; + .loc 1 602 51 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:602:51 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd768, %rd814, %rd616; + add.s64 %rd769, %rd816, %rd616; + add.s64 %rd770, %rd818, %rd616; + add.s64 %rd771, %rd820, %rd616; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r10364 + 0 ], [ %rd756 + 0 ], 0x10, %r10365; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10366 + 0 ], [ %rd757 + 0 ], 0x10, %r10367; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10368 + 0 ], [ %rd758 + 0 ], 0x10, %r10369; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10370 + 0 ], [ %rd759 + 0 ], 0x10, %r10371; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd760, %rd105, %rd821; + cvt.s64.s32 %rd822, %r688; + add.s64 %rd823, %rd822, %rd107; + shl.b64 %rd824, %rd823, 2; + add.s64 %rd825, %rd105, %rd824; + add.s64 %rd761, %rd825, 32; + add.s64 %rd762, %rd825, 64; + add.s64 %rd763, %rd825, 96; + add.s64 %rd764, %rd825, 128; + add.s64 %rd765, %rd825, 160; + add.s64 %rd766, %rd825, 192; + add.s64 %rd767, %rd825, 224; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10372 + 0 ], [ %rd760 + 0 ], 0x8, %r10373; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10374 + 0 ], [ %rd761 + 0 ], 0x8, %r10375; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10376 + 0 ], [ %rd762 + 0 ], 0x8, %r10377; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10378 + 0 ], [ %rd763 + 0 ], 0x8, %r10379; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10380 + 0 ], [ %rd764 + 0 ], 0x8, %r10381; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10382 + 0 ], [ %rd765 + 0 ], 0x8, %r10383; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10384 + 0 ], [ %rd766 + 0 ], 0x8, %r10385; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10386 + 0 ], [ %rd767 + 0 ], 0x8, %r10387; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r10388 + 0 ], [ %rd768 + 0 ], 0x10, %r10365; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10390 + 0 ], [ %rd769 + 0 ], 0x10, %r10367; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10392 + 0 ], [ %rd770 + 0 ], 0x10, %r10369; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10394 + 0 ], [ %rd771 + 0 ], 0x10, %r10371; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd772, %rd106, %rd821; + add.s64 %rd826, %rd106, %rd824; + add.s64 %rd773, %rd826, 32; + add.s64 %rd774, %rd826, 64; + add.s64 %rd775, %rd826, 96; + add.s64 %rd776, %rd826, 128; + add.s64 %rd777, %rd826, 160; + add.s64 %rd778, %rd826, 192; + add.s64 %rd779, %rd826, 224; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10396 + 0 ], [ %rd772 + 0 ], 0x8, %r10373; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10398 + 0 ], [ %rd773 + 0 ], 0x8, %r10375; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10400 + 0 ], [ %rd774 + 0 ], 0x8, %r10377; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10402 + 0 ], [ %rd775 + 0 ], 0x8, %r10379; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10404 + 0 ], [ %rd776 + 0 ], 0x8, %r10381; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10406 + 0 ], [ %rd777 + 0 ], 0x8, %r10383; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10408 + 0 ], [ %rd778 + 0 ], 0x8, %r10385; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10410 + 0 ], [ %rd779 + 0 ], 0x8, %r10387; + // end inline asm + cp.async.commit_group; + .loc 1 626 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd1021, %rd756, 524288; + add.s64 %rd1020, %rd757, 524288; + add.s64 %rd1019, %rd758, 524288; + add.s64 %rd1018, %rd759, 524288; + .loc 1 627 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd1017, %rd768, 16384; + add.s64 %rd1016, %rd769, 16384; + add.s64 %rd1015, %rd770, 16384; + add.s64 %rd1014, %rd771, 16384; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + bar.sync 0; + // begin inline asm + cp.async.cg.shared.global [ %r10412 + 0 ], [ %rd1021 + 0 ], 0x10, %r10413; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10414 + 0 ], [ %rd1020 + 0 ], 0x10, %r10415; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10416 + 0 ], [ %rd1019 + 0 ], 0x10, %r10417; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10418 + 0 ], [ %rd1018 + 0 ], 0x10, %r10419; + // end inline asm + cp.async.commit_group; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd784, %rd760, 256; + add.s64 %rd827, %rd824, 256; + add.s64 %rd828, %rd105, %rd827; + add.s64 %rd785, %rd828, 32; + add.s64 %rd786, %rd828, 64; + add.s64 %rd787, %rd828, 96; + add.s64 %rd788, %rd828, 128; + add.s64 %rd789, %rd828, 160; + add.s64 %rd790, %rd828, 192; + add.s64 %rd791, %rd828, 224; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10420 + 0 ], [ %rd784 + 0 ], 0x8, %r10421; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10422 + 0 ], [ %rd785 + 0 ], 0x8, %r10423; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10424 + 0 ], [ %rd786 + 0 ], 0x8, %r10425; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10426 + 0 ], [ %rd787 + 0 ], 0x8, %r10427; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10428 + 0 ], [ %rd788 + 0 ], 0x8, %r10429; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10430 + 0 ], [ %rd789 + 0 ], 0x8, %r10431; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10432 + 0 ], [ %rd790 + 0 ], 0x8, %r10433; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10434 + 0 ], [ %rd791 + 0 ], 0x8, %r10435; + // end inline asm + cp.async.commit_group; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + cp.async.cg.shared.global [ %r10436 + 0 ], [ %rd1017 + 0 ], 0x10, %r10413; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10438 + 0 ], [ %rd1016 + 0 ], 0x10, %r10415; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10440 + 0 ], [ %rd1015 + 0 ], 0x10, %r10417; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r10442 + 0 ], [ %rd1014 + 0 ], 0x10, %r10419; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd796, %rd772, 256; + add.s64 %rd829, %rd106, %rd827; + add.s64 %rd797, %rd829, 32; + add.s64 %rd798, %rd829, 64; + add.s64 %rd799, %rd829, 96; + add.s64 %rd800, %rd829, 128; + add.s64 %rd801, %rd829, 160; + add.s64 %rd802, %rd829, 192; + add.s64 %rd803, %rd829, 224; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10444 + 0 ], [ %rd796 + 0 ], 0x8, %r10421; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10446 + 0 ], [ %rd797 + 0 ], 0x8, %r10423; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10448 + 0 ], [ %rd798 + 0 ], 0x8, %r10425; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10450 + 0 ], [ %rd799 + 0 ], 0x8, %r10427; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10452 + 0 ], [ %rd800 + 0 ], 0x8, %r10429; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10454 + 0 ], [ %rd801 + 0 ], 0x8, %r10431; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10456 + 0 ], [ %rd802 + 0 ], 0x8, %r10433; + // end inline asm + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r10458 + 0 ], [ %rd803 + 0 ], 0x8, %r10435; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + @%p872 bra $L__BB0_15; +// %bb.13: // %.lr.ph1819.preheader + // in Loop: Header=BB0_9 Depth=1 + .loc 1 0 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:28 + mov.b32 %r11015, 0; + mov.b32 %r14406, 1; + mov.b32 %r14405, -1; + mov.b32 %r14404, 64; + mov.b32 %r14407, %r14405; + mov.b32 %r14408, %r14406; + mov.b32 %r14409, %r818; + mov.b32 %r14410, %r817; + mov.b32 %r14411, %r816; + mov.b32 %r14412, %r815; + mov.b32 %r14413, %r814; + mov.b32 %r14414, %r813; + mov.b32 %r14415, %r812; + mov.b32 %r14416, %r811; + mov.b32 %r14417, %r819; + mov.b32 %r14418, %r820; + mov.b32 %r14419, %r821; + mov.b32 %r14420, %r822; + mov.b32 %r14421, %r819; + mov.b32 %r14422, %r820; + mov.b32 %r14423, %r821; + mov.b32 %r14424, %r822; + mov.b32 %r14425, %r690; + mov.b32 %r14554, %r11015; +$L__BB0_14: // %.lr.ph1819 + // Parent Loop BB0_9 Depth=1 + // => This Inner Loop Header: Depth=2 + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p913, %r14554, %r835; + setp.lt.s32 %p895, %r14554, %r836; + add.s32 %r12682, %r14405, 1; + setp.gt.s32 %p914, %r12682, 1; + selp.b32 %r14405, 0, %r12682, %p914; + add.s32 %r12683, %r14407, 1; + setp.gt.s32 %p915, %r12683, 2; + selp.b32 %r14407, 0, %r12683, %p915; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p916, %r14425, 2048; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs281, 1, 0, %p916; + st.shared.v2.b8 [%r782], {%rs281, %rs281}; + st.shared.v2.b8 [%r782+512], {%rs281, %rs281}; + st.shared.v2.b8 [%r783+4096], {%rs281, %rs281}; + st.shared.v2.b8 [%r783+4608], {%rs281, %rs281}; + st.shared.v2.b8 [%r784+1024], {%rs281, %rs281}; + st.shared.v2.b8 [%r784+1536], {%rs281, %rs281}; + st.shared.v2.b8 [%r785+5120], {%rs281, %rs281}; + st.shared.v2.b8 [%r785+5632], {%rs281, %rs281}; + st.shared.v2.b8 [%r786+2048], {%rs281, %rs281}; + st.shared.v2.b8 [%r786+2560], {%rs281, %rs281}; + st.shared.v2.b8 [%r787+6144], {%rs281, %rs281}; + st.shared.v2.b8 [%r787+6656], {%rs281, %rs281}; + st.shared.v2.b8 [%r788+3072], {%rs281, %rs281}; + st.shared.v2.b8 [%r788+3584], {%rs281, %rs281}; + st.shared.v2.b8 [%r789+7168], {%rs281, %rs281}; + st.shared.v2.b8 [%r789+7680], {%rs281, %rs281}; + bar.sync 0; + ld.shared.b8 %rs282, [%r791]; + and.b16 %rs283, %rs282, 1; + setp.ne.b16 %p917, %rs283, 0; + ld.shared.b16 %r12684, [%r791+2]; + ld.shared.b8 %rs284, [%r791+1]; + ld.shared.b8 %rs285, [%r792]; + and.b16 %rs286, %rs285, 1; + setp.ne.b16 %p918, %rs286, 0; + ld.shared.b16 %r12685, [%r792+2]; + ld.shared.b8 %rs287, [%r792+1]; + ld.shared.b8 %rs288, [%r793]; + and.b16 %rs289, %rs288, 1; + setp.ne.b16 %p919, %rs289, 0; + ld.shared.b16 %r12686, [%r793+2]; + ld.shared.b8 %rs290, [%r793+1]; + ld.shared.b8 %rs291, [%r794]; + and.b16 %rs292, %rs291, 1; + setp.ne.b16 %p920, %rs292, 0; + ld.shared.b16 %r12687, [%r794+2]; + ld.shared.b8 %rs293, [%r794+1]; + ld.shared.b8 %rs294, [%r795]; + and.b16 %rs295, %rs294, 1; + setp.ne.b16 %p921, %rs295, 0; + ld.shared.b16 %r12688, [%r795+2]; + ld.shared.b8 %rs296, [%r795+1]; + ld.shared.b8 %rs297, [%r796]; + and.b16 %rs298, %rs297, 1; + setp.ne.b16 %p922, %rs298, 0; + ld.shared.b16 %r12689, [%r796+2]; + ld.shared.b8 %rs299, [%r796+1]; + ld.shared.b8 %rs300, [%r797]; + and.b16 %rs301, %rs300, 1; + setp.ne.b16 %p923, %rs301, 0; + ld.shared.b16 %r12690, [%r797+2]; + ld.shared.b8 %rs302, [%r797+1]; + ld.shared.b8 %rs303, [%r798]; + and.b16 %rs304, %rs303, 1; + setp.ne.b16 %p924, %rs304, 0; + ld.shared.b16 %r12691, [%r798+2]; + ld.shared.b8 %rs305, [%r798+1]; + and.b16 %rs306, %rs284, 1; + setp.ne.b16 %p925, %rs306, 0; + and.b32 %r12692, %r12684, 1; + setp.ne.b32 %p926, %r12692, 0; + shr.u32 %r12693, %r12684, 8; + and.b32 %r12694, %r12693, 1; + setp.ne.b32 %p927, %r12694, 0; + and.b16 %rs307, %rs287, 1; + setp.ne.b16 %p928, %rs307, 0; + and.b32 %r12695, %r12685, 1; + setp.ne.b32 %p929, %r12695, 0; + shr.u32 %r12696, %r12685, 8; + and.b32 %r12697, %r12696, 1; + setp.ne.b32 %p930, %r12697, 0; + and.b16 %rs308, %rs290, 1; + setp.ne.b16 %p931, %rs308, 0; + and.b32 %r12698, %r12686, 1; + setp.ne.b32 %p932, %r12698, 0; + shr.u32 %r12699, %r12686, 8; + and.b32 %r12700, %r12699, 1; + setp.ne.b32 %p933, %r12700, 0; + and.b16 %rs309, %rs293, 1; + setp.ne.b16 %p934, %rs309, 0; + and.b32 %r12701, %r12687, 1; + setp.ne.b32 %p935, %r12701, 0; + shr.u32 %r12702, %r12687, 8; + and.b32 %r12703, %r12702, 1; + setp.ne.b32 %p936, %r12703, 0; + and.b16 %rs310, %rs296, 1; + setp.ne.b16 %p937, %rs310, 0; + and.b32 %r12704, %r12688, 1; + setp.ne.b32 %p938, %r12704, 0; + shr.u32 %r12705, %r12688, 8; + and.b32 %r12706, %r12705, 1; + setp.ne.b32 %p939, %r12706, 0; + and.b16 %rs311, %rs299, 1; + setp.ne.b16 %p940, %rs311, 0; + and.b32 %r12707, %r12689, 1; + setp.ne.b32 %p941, %r12707, 0; + shr.u32 %r12708, %r12689, 8; + and.b32 %r12709, %r12708, 1; + setp.ne.b32 %p942, %r12709, 0; + and.b16 %rs312, %rs302, 1; + setp.ne.b16 %p943, %rs312, 0; + and.b32 %r12710, %r12690, 1; + setp.ne.b32 %p944, %r12710, 0; + shr.u32 %r12711, %r12690, 8; + and.b32 %r12712, %r12711, 1; + setp.ne.b32 %p945, %r12712, 0; + and.b16 %rs313, %rs305, 1; + setp.ne.b16 %p946, %rs313, 0; + and.b32 %r12713, %r12691, 1; + setp.ne.b32 %p947, %r12713, 0; + shr.u32 %r12714, %r12691, 8; + and.b32 %r12715, %r12714, 1; + setp.ne.b32 %p948, %r12715, 0; + cp.async.wait_group 4; + bar.sync 0; + shl.b32 %r12716, %r14407, 14; + add.s32 %r11017, %r7117, %r12716; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r12718, %r14405, 8; + add.s32 %r12719, %r7117, 106496; + add.s32 %r12720, %r12719, %r12718; + add.s32 %r12721, %r12720, %r703; + ld.shared.v2.b32 {%r12722, %r12723}, [%r12721]; + ld.shared.v2.b32 {%r12724, %r12725}, [%r12721+32]; + ld.shared.v2.b32 {%r12726, %r12727}, [%r12721+64]; + ld.shared.v2.b32 {%r12728, %r12729}, [%r12721+96]; + ld.shared.v2.b32 {%r12730, %r12731}, [%r12721+128]; + ld.shared.v2.b32 {%r12732, %r12733}, [%r12721+160]; + ld.shared.v2.b32 {%r12734, %r12735}, [%r12721+192]; + ld.shared.v2.b32 {%r12736, %r12737}, [%r12721+224]; + .loc 1 675 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:675:26 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.eq.f32 %p949, %r12722, 0fFF800000; + setp.eq.f32 %p950, %r12723, 0fFF800000; + setp.eq.f32 %p951, %r12724, 0fFF800000; + setp.eq.f32 %p952, %r12725, 0fFF800000; + setp.eq.f32 %p953, %r12726, 0fFF800000; + setp.eq.f32 %p954, %r12727, 0fFF800000; + setp.eq.f32 %p955, %r12728, 0fFF800000; + setp.eq.f32 %p956, %r12729, 0fFF800000; + setp.eq.f32 %p957, %r12730, 0fFF800000; + setp.eq.f32 %p958, %r12731, 0fFF800000; + setp.eq.f32 %p959, %r12732, 0fFF800000; + setp.eq.f32 %p960, %r12733, 0fFF800000; + setp.eq.f32 %p961, %r12734, 0fFF800000; + setp.eq.f32 %p962, %r12735, 0fFF800000; + setp.eq.f32 %p963, %r12736, 0fFF800000; + setp.eq.f32 %p964, %r12737, 0fFF800000; + .loc 1 675 46 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:675:46 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12738, 0f00000000, %r12722, %p949; + selp.f32 %r12739, 0f00000000, %r12723, %p950; + selp.f32 %r12740, 0f00000000, %r12724, %p951; + selp.f32 %r12741, 0f00000000, %r12725, %p952; + selp.f32 %r12742, 0f00000000, %r12726, %p953; + selp.f32 %r12743, 0f00000000, %r12727, %p954; + selp.f32 %r12744, 0f00000000, %r12728, %p955; + selp.f32 %r12745, 0f00000000, %r12729, %p956; + selp.f32 %r12746, 0f00000000, %r12730, %p957; + selp.f32 %r12747, 0f00000000, %r12731, %p958; + selp.f32 %r12748, 0f00000000, %r12732, %p959; + selp.f32 %r12749, 0f00000000, %r12733, %p960; + selp.f32 %r12750, 0f00000000, %r12734, %p961; + selp.f32 %r12751, 0f00000000, %r12735, %p962; + selp.f32 %r12752, 0f00000000, %r12736, %p963; + selp.f32 %r12753, 0f00000000, %r12737, %p964; + .loc 1 676 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:676:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shfl.sync.idx.b32 %r12754, %r8, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r12755, %r12754, 11; + and.b32 %r12756, %r12755, 8192; + add.s32 %r10976, %r7117, 107520; + add.s32 %r12757, %r12756, %r10976; + bfe.u32 %r12758, %r12757, 4, 14; + cvt.u64.u32 %rd900, %r12758; + or.b64 %rd830, %rd900, 4611686293372403712; + bfe.u32 %r12759, %r11017, 4, 14; + cvt.u64.u32 %rd901, %r12759; + or.b64 %rd831, %rd901, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd830, %rd831, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r12760, %r12756, 32; + add.s32 %r12761, %r12760, %r10976; + bfe.u32 %r12762, %r12761, 4, 14; + cvt.u64.u32 %rd902, %r12762; + or.b64 %rd832, %rd902, 4611686293372403712; + add.s32 %r12763, %r11017, 32; + bfe.u32 %r12764, %r12763, 4, 14; + cvt.u64.u32 %rd903, %r12764; + or.b64 %rd833, %rd903, 4611686293338849280; + mov.pred %p873, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd832, %rd833, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12765, %r12756, 64; + add.s32 %r12766, %r12765, %r10976; + bfe.u32 %r12767, %r12766, 4, 14; + cvt.u64.u32 %rd904, %r12767; + or.b64 %rd834, %rd904, 4611686293372403712; + add.s32 %r12768, %r11017, 64; + bfe.u32 %r12769, %r12768, 4, 14; + cvt.u64.u32 %rd905, %r12769; + or.b64 %rd835, %rd905, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd834, %rd835, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12770, %r12756, 96; + add.s32 %r12771, %r12770, %r10976; + bfe.u32 %r12772, %r12771, 4, 14; + cvt.u64.u32 %rd906, %r12772; + or.b64 %rd836, %rd906, 4611686293372403712; + add.s32 %r12773, %r11017, 96; + bfe.u32 %r12774, %r12773, 4, 14; + cvt.u64.u32 %rd907, %r12774; + or.b64 %rd837, %rd907, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd836, %rd837, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12775, %r12756, 16384; + add.s32 %r12776, %r12775, %r10976; + bfe.u32 %r12777, %r12776, 4, 14; + cvt.u64.u32 %rd908, %r12777; + or.b64 %rd838, %rd908, 4611686293372403712; + add.s32 %r12778, %r11017, 8192; + bfe.u32 %r12779, %r12778, 4, 14; + cvt.u64.u32 %rd909, %r12779; + or.b64 %rd839, %rd909, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd838, %rd839, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12780, %r12756, 16416; + add.s32 %r12781, %r12780, %r10976; + bfe.u32 %r12782, %r12781, 4, 14; + cvt.u64.u32 %rd910, %r12782; + or.b64 %rd840, %rd910, 4611686293372403712; + add.s32 %r12783, %r11017, 8224; + bfe.u32 %r12784, %r12783, 4, 14; + cvt.u64.u32 %rd911, %r12784; + or.b64 %rd841, %rd911, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd840, %rd841, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12785, %r12756, 16448; + add.s32 %r12786, %r12785, %r10976; + bfe.u32 %r12787, %r12786, 4, 14; + cvt.u64.u32 %rd912, %r12787; + or.b64 %rd842, %rd912, 4611686293372403712; + add.s32 %r12788, %r11017, 8256; + bfe.u32 %r12789, %r12788, 4, 14; + cvt.u64.u32 %rd913, %r12789; + or.b64 %rd843, %rd913, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd842, %rd843, %p873, 1, 1, 0, 0; + // end inline asm + or.b32 %r12790, %r12756, 16480; + add.s32 %r12791, %r12790, %r10976; + bfe.u32 %r12792, %r12791, 4, 14; + cvt.u64.u32 %rd914, %r12792; + or.b64 %rd844, %rd914, 4611686293372403712; + add.s32 %r12793, %r11017, 8288; + bfe.u32 %r12794, %r12793, 4, 14; + cvt.u64.u32 %rd915, %r12794; + or.b64 %rd845, %rd915, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591}, %rd844, %rd845, %p873, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r10977, %r11015; + mov.b32 %r10978, %r11015; + mov.b32 %r10980, %r11015; + mov.b32 %r10981, %r11015; + mov.b32 %r10979, %r11017; + // begin inline asm + // wait for regs: %r10560,%r10561,%r10562,%r10563,%r10564,%r10565,%r10566,%r10567,%r10568,%r10569,%r10570,%r10571,%r10572,%r10573,%r10574,%r10575,%r10576,%r10577,%r10578,%r10579,%r10580,%r10581,%r10582,%r10583,%r10584,%r10585,%r10586,%r10587,%r10588,%r10589,%r10590,%r10591,%r10976,%r10977,%r10978,%r10979,%r10980,%r10981 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 678 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:678:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12795, %r10560, 0f3DB504F3; + mul.f32 %r12796, %r10561, 0f3DB504F3; + mul.f32 %r12797, %r10562, 0f3DB504F3; + mul.f32 %r12798, %r10563, 0f3DB504F3; + mul.f32 %r12799, %r10564, 0f3DB504F3; + mul.f32 %r12800, %r10565, 0f3DB504F3; + mul.f32 %r12801, %r10566, 0f3DB504F3; + mul.f32 %r12802, %r10567, 0f3DB504F3; + mul.f32 %r12803, %r10568, 0f3DB504F3; + mul.f32 %r12804, %r10569, 0f3DB504F3; + mul.f32 %r12805, %r10570, 0f3DB504F3; + mul.f32 %r12806, %r10571, 0f3DB504F3; + mul.f32 %r12807, %r10572, 0f3DB504F3; + mul.f32 %r12808, %r10573, 0f3DB504F3; + mul.f32 %r12809, %r10574, 0f3DB504F3; + mul.f32 %r12810, %r10575, 0f3DB504F3; + mul.f32 %r12811, %r10576, 0f3DB504F3; + mul.f32 %r12812, %r10577, 0f3DB504F3; + mul.f32 %r12813, %r10578, 0f3DB504F3; + mul.f32 %r12814, %r10579, 0f3DB504F3; + mul.f32 %r12815, %r10580, 0f3DB504F3; + mul.f32 %r12816, %r10581, 0f3DB504F3; + mul.f32 %r12817, %r10582, 0f3DB504F3; + mul.f32 %r12818, %r10583, 0f3DB504F3; + mul.f32 %r12819, %r10584, 0f3DB504F3; + mul.f32 %r12820, %r10585, 0f3DB504F3; + mul.f32 %r12821, %r10586, 0f3DB504F3; + mul.f32 %r12822, %r10587, 0f3DB504F3; + mul.f32 %r12823, %r10588, 0f3DB504F3; + mul.f32 %r12824, %r10589, 0f3DB504F3; + mul.f32 %r12825, %r10590, 0f3DB504F3; + mul.f32 %r12826, %r10591, 0f3DB504F3; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12827, %r12795, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12828, %r12827, 0fFF800000, %p917; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12829, %r12796, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12830, %r12829, 0fFF800000, %p925; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12831, %r12797, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12832, %r12831, 0fFF800000, %p926; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12833, %r12798, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12834, %r12833, 0fFF800000, %p927; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12835, %r12799, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12836, %r12835, 0fFF800000, %p918; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12837, %r12800, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12838, %r12837, 0fFF800000, %p928; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12839, %r12801, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12840, %r12839, 0fFF800000, %p929; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12841, %r12802, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12842, %r12841, 0fFF800000, %p930; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12843, %r12803, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12844, %r12843, 0fFF800000, %p919; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12845, %r12804, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12846, %r12845, 0fFF800000, %p931; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12847, %r12805, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12848, %r12847, 0fFF800000, %p932; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12849, %r12806, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12850, %r12849, 0fFF800000, %p933; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12851, %r12807, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12852, %r12851, 0fFF800000, %p920; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12853, %r12808, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12854, %r12853, 0fFF800000, %p934; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12855, %r12809, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12856, %r12855, 0fFF800000, %p935; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12857, %r12810, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12858, %r12857, 0fFF800000, %p936; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12859, %r12811, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12860, %r12859, 0fFF800000, %p921; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12861, %r12812, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12862, %r12861, 0fFF800000, %p937; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12863, %r12813, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12864, %r12863, 0fFF800000, %p938; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12865, %r12814, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12866, %r12865, 0fFF800000, %p939; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12867, %r12815, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12868, %r12867, 0fFF800000, %p922; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12869, %r12816, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12870, %r12869, 0fFF800000, %p940; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12871, %r12817, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12872, %r12871, 0fFF800000, %p941; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12873, %r12818, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12874, %r12873, 0fFF800000, %p942; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12875, %r12819, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12876, %r12875, 0fFF800000, %p923; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12877, %r12820, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12878, %r12877, 0fFF800000, %p943; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12879, %r12821, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12880, %r12879, 0fFF800000, %p944; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12881, %r12822, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12882, %r12881, 0fFF800000, %p945; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12883, %r12823, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12884, %r12883, 0fFF800000, %p924; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12885, %r12824, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12886, %r12885, 0fFF800000, %p946; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12887, %r12825, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12888, %r12887, 0fFF800000, %p947; + .loc 1 739 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:739:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r12889, %r12826, 0f3FB8AA3B; + .loc 1 692 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:692:78 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.f32 %r12890, %r12889, 0fFF800000, %p948; + .loc 1 740 40 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:740:40 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r12891, %r12828, %r12738; + sub.f32 %r12892, %r12830, %r12739; + sub.f32 %r12893, %r12832, %r12738; + sub.f32 %r12894, %r12834, %r12739; + sub.f32 %r12895, %r12836, %r12740; + sub.f32 %r12896, %r12838, %r12741; + sub.f32 %r12897, %r12840, %r12740; + sub.f32 %r12898, %r12842, %r12741; + sub.f32 %r12899, %r12844, %r12742; + sub.f32 %r12900, %r12846, %r12743; + sub.f32 %r12901, %r12848, %r12742; + sub.f32 %r12902, %r12850, %r12743; + sub.f32 %r12903, %r12852, %r12744; + sub.f32 %r12904, %r12854, %r12745; + sub.f32 %r12905, %r12856, %r12744; + sub.f32 %r12906, %r12858, %r12745; + sub.f32 %r12907, %r12860, %r12746; + sub.f32 %r12908, %r12862, %r12747; + sub.f32 %r12909, %r12864, %r12746; + sub.f32 %r12910, %r12866, %r12747; + sub.f32 %r12911, %r12868, %r12748; + sub.f32 %r12912, %r12870, %r12749; + sub.f32 %r12913, %r12872, %r12748; + sub.f32 %r12914, %r12874, %r12749; + sub.f32 %r12915, %r12876, %r12750; + sub.f32 %r12916, %r12878, %r12751; + sub.f32 %r12917, %r12880, %r12750; + sub.f32 %r12918, %r12882, %r12751; + sub.f32 %r12919, %r12884, %r12752; + sub.f32 %r12920, %r12886, %r12753; + sub.f32 %r12921, %r12888, %r12752; + sub.f32 %r12922, %r12890, %r12753; + .loc 1 740 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:740:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + ex2.approx.ftz.f32 %r12923, %r12891; + ex2.approx.ftz.f32 %r12924, %r12892; + ex2.approx.ftz.f32 %r12925, %r12893; + ex2.approx.ftz.f32 %r12926, %r12894; + ex2.approx.ftz.f32 %r12927, %r12895; + ex2.approx.ftz.f32 %r12928, %r12896; + ex2.approx.ftz.f32 %r12929, %r12897; + ex2.approx.ftz.f32 %r12930, %r12898; + ex2.approx.ftz.f32 %r12931, %r12899; + ex2.approx.ftz.f32 %r12932, %r12900; + ex2.approx.ftz.f32 %r12933, %r12901; + ex2.approx.ftz.f32 %r12934, %r12902; + ex2.approx.ftz.f32 %r12935, %r12903; + ex2.approx.ftz.f32 %r12936, %r12904; + ex2.approx.ftz.f32 %r12937, %r12905; + ex2.approx.ftz.f32 %r12938, %r12906; + ex2.approx.ftz.f32 %r12939, %r12907; + ex2.approx.ftz.f32 %r12940, %r12908; + ex2.approx.ftz.f32 %r12941, %r12909; + ex2.approx.ftz.f32 %r12942, %r12910; + ex2.approx.ftz.f32 %r12943, %r12911; + ex2.approx.ftz.f32 %r12944, %r12912; + ex2.approx.ftz.f32 %r12945, %r12913; + ex2.approx.ftz.f32 %r12946, %r12914; + ex2.approx.ftz.f32 %r12947, %r12915; + ex2.approx.ftz.f32 %r12948, %r12916; + ex2.approx.ftz.f32 %r12949, %r12917; + ex2.approx.ftz.f32 %r12950, %r12918; + ex2.approx.ftz.f32 %r12951, %r12919; + ex2.approx.ftz.f32 %r12952, %r12920; + ex2.approx.ftz.f32 %r12953, %r12921; + ex2.approx.ftz.f32 %r12954, %r12922; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r12955, %r7117, 49152; + add.s32 %r12063, %r12955, %r12716; + .loc 1 744 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:744:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16x2.f32 %r11148, %r12924, %r12923; + cvt.rn.bf16x2.f32 %r11149, %r12926, %r12925; + cvt.rn.bf16x2.f32 %r11150, %r12928, %r12927; + cvt.rn.bf16x2.f32 %r11151, %r12930, %r12929; + cvt.rn.bf16x2.f32 %r11280, %r12932, %r12931; + cvt.rn.bf16x2.f32 %r11281, %r12934, %r12933; + cvt.rn.bf16x2.f32 %r11282, %r12936, %r12935; + cvt.rn.bf16x2.f32 %r11283, %r12938, %r12937; + cvt.rn.bf16x2.f32 %r11412, %r12940, %r12939; + cvt.rn.bf16x2.f32 %r11413, %r12942, %r12941; + cvt.rn.bf16x2.f32 %r11414, %r12944, %r12943; + cvt.rn.bf16x2.f32 %r11415, %r12946, %r12945; + cvt.rn.bf16x2.f32 %r11544, %r12948, %r12947; + cvt.rn.bf16x2.f32 %r11545, %r12950, %r12949; + cvt.rn.bf16x2.f32 %r11546, %r12952, %r12951; + cvt.rn.bf16x2.f32 %r11547, %r12954, %r12953; + .loc 1 744 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:744:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + wgmma.fence.sync.aligned; + bfe.u32 %r12956, %r12063, 4, 14; + cvt.u64.u32 %rd916, %r12956; + or.b64 %rd846, %rd916, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r11148,%r11149,%r11150,%r11151}, %rd846, %p873, 1, 1, 1; + // end inline asm + add.s32 %r12957, %r12063, 2048; + bfe.u32 %r12958, %r12957, 4, 14; + cvt.u64.u32 %rd917, %r12958; + or.b64 %rd847, %rd917, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r11280,%r11281,%r11282,%r11283}, %rd847, %p873, 1, 1, 1; + // end inline asm + add.s32 %r12959, %r12063, 4096; + bfe.u32 %r12960, %r12959, 4, 14; + cvt.u64.u32 %rd918, %r12960; + or.b64 %rd848, %rd918, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r11412,%r11413,%r11414,%r11415}, %rd848, %p873, 1, 1, 1; + // end inline asm + add.s32 %r12961, %r12063, 6144; + bfe.u32 %r12962, %r12961, 4, 14; + cvt.u64.u32 %rd919, %r12962; + or.b64 %rd849, %rd919, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14143,%r14144,%r14145,%r14146,%r14147,%r14148,%r14149,%r14150,%r14151,%r14152,%r14153,%r14154,%r14155,%r14156,%r14157,%r14158,%r14159,%r14160,%r14161,%r14162,%r14163,%r14164,%r14165,%r14166,%r14167,%r14168,%r14169,%r14170,%r14171,%r14172,%r14173,%r14174,%r14175,%r14176,%r14177,%r14178,%r14179,%r14180,%r14181,%r14182,%r14183,%r14184,%r14185,%r14186,%r14187,%r14188,%r14189,%r14190,%r14191,%r14192,%r14193,%r14194,%r14195,%r14196,%r14197,%r14198,%r14199,%r14200,%r14201,%r14202,%r14203,%r14204,%r14205,%r14206}, {%r11544,%r11545,%r11546,%r11547}, %rd849, %p873, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r12963, %r7117, 107008; + add.s32 %r12964, %r12963, %r12718; + add.s32 %r12965, %r12964, %r703; + .loc 1 750 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:750:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r12060, %r7117, 140288; + add.s32 %r12966, %r12756, %r12060; + bfe.u32 %r12967, %r12966, 4, 14; + cvt.u64.u32 %rd920, %r12967; + or.b64 %rd850, %rd920, 4611686293372403712; + add.s32 %r12968, %r12760, %r12060; + bfe.u32 %r12969, %r12968, 4, 14; + cvt.u64.u32 %rd921, %r12969; + or.b64 %rd852, %rd921, 4611686293372403712; + add.s32 %r12970, %r12063, 32; + bfe.u32 %r12971, %r12970, 4, 14; + cvt.u64.u32 %rd922, %r12971; + or.b64 %rd853, %rd922, 4611686293338849280; + add.s32 %r12972, %r12765, %r12060; + bfe.u32 %r12973, %r12972, 4, 14; + cvt.u64.u32 %rd923, %r12973; + or.b64 %rd854, %rd923, 4611686293372403712; + add.s32 %r12974, %r12063, 64; + bfe.u32 %r12975, %r12974, 4, 14; + cvt.u64.u32 %rd924, %r12975; + or.b64 %rd855, %rd924, 4611686293338849280; + add.s32 %r12976, %r12770, %r12060; + bfe.u32 %r12977, %r12976, 4, 14; + cvt.u64.u32 %rd925, %r12977; + or.b64 %rd856, %rd925, 4611686293372403712; + add.s32 %r12978, %r12063, 96; + bfe.u32 %r12979, %r12978, 4, 14; + cvt.u64.u32 %rd926, %r12979; + or.b64 %rd857, %rd926, 4611686293338849280; + add.s32 %r12980, %r12775, %r12060; + bfe.u32 %r12981, %r12980, 4, 14; + cvt.u64.u32 %rd927, %r12981; + or.b64 %rd858, %rd927, 4611686293372403712; + add.s32 %r12982, %r12063, 8192; + bfe.u32 %r12983, %r12982, 4, 14; + cvt.u64.u32 %rd928, %r12983; + or.b64 %rd859, %rd928, 4611686293338849280; + add.s32 %r12984, %r12780, %r12060; + bfe.u32 %r12985, %r12984, 4, 14; + cvt.u64.u32 %rd929, %r12985; + or.b64 %rd860, %rd929, 4611686293372403712; + add.s32 %r12986, %r12063, 8224; + bfe.u32 %r12987, %r12986, 4, 14; + cvt.u64.u32 %rd930, %r12987; + or.b64 %rd861, %rd930, 4611686293338849280; + add.s32 %r12988, %r12785, %r12060; + bfe.u32 %r12989, %r12988, 4, 14; + cvt.u64.u32 %rd931, %r12989; + or.b64 %rd862, %rd931, 4611686293372403712; + add.s32 %r12990, %r12063, 8256; + bfe.u32 %r12991, %r12990, 4, 14; + cvt.u64.u32 %rd932, %r12991; + or.b64 %rd863, %rd932, 4611686293338849280; + add.s32 %r12992, %r12790, %r12060; + bfe.u32 %r12993, %r12992, 4, 14; + cvt.u64.u32 %rd933, %r12993; + or.b64 %rd864, %rd933, 4611686293372403712; + add.s32 %r12994, %r12063, 8288; + bfe.u32 %r12995, %r12994, 4, 14; + cvt.u64.u32 %rd934, %r12995; + or.b64 %rd865, %rd934, 4611686293338849280; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + ld.shared.v2.b32 {%r12996, %r12997}, [%r12965+224]; + ld.shared.v2.b32 {%r12998, %r12999}, [%r12965+192]; + ld.shared.v2.b32 {%r13000, %r13001}, [%r12965+160]; + ld.shared.v2.b32 {%r13002, %r13003}, [%r12965+128]; + ld.shared.v2.b32 {%r13004, %r13005}, [%r12965+96]; + ld.shared.v2.b32 {%r13006, %r13007}, [%r12965+64]; + ld.shared.v2.b32 {%r13008, %r13009}, [%r12965+32]; + ld.shared.v2.b32 {%r13010, %r13011}, [%r12965]; + .loc 1 750 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:750:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd850, %rd846, 0, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd852, %rd853, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd854, %rd855, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd856, %rd857, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd858, %rd859, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd860, %rd861, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd862, %rd863, %p873, 1, 1, 0, 0; + // end inline asm + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675}, %rd864, %rd865, %p873, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r12061, %r11015; + mov.b32 %r12062, %r11015; + mov.b32 %r12064, %r11015; + mov.b32 %r12065, %r11015; + // begin inline asm + // wait for regs: %r11644,%r11645,%r11646,%r11647,%r11648,%r11649,%r11650,%r11651,%r11652,%r11653,%r11654,%r11655,%r11656,%r11657,%r11658,%r11659,%r11660,%r11661,%r11662,%r11663,%r11664,%r11665,%r11666,%r11667,%r11668,%r11669,%r11670,%r11671,%r11672,%r11673,%r11674,%r11675,%r12060,%r12061,%r12062,%r12063,%r12064,%r12065 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13012, %r11646, %r13010; + sub.f32 %r13013, %r11647, %r13011; + sub.f32 %r13014, %r11650, %r13008; + sub.f32 %r13015, %r11651, %r13009; + sub.f32 %r13016, %r11654, %r13006; + sub.f32 %r13017, %r11655, %r13007; + sub.f32 %r13018, %r11658, %r13004; + sub.f32 %r13019, %r11659, %r13005; + sub.f32 %r13020, %r11662, %r13002; + sub.f32 %r13021, %r11663, %r13003; + sub.f32 %r13022, %r11666, %r13000; + sub.f32 %r13023, %r11667, %r13001; + sub.f32 %r13024, %r11670, %r12998; + sub.f32 %r13025, %r11671, %r12999; + sub.f32 %r13026, %r11674, %r12996; + sub.f32 %r13027, %r11675, %r12997; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13028, %r12926, %r13013; + mul.f32 %r13029, %r12925, %r13012; + mul.f32 %r13030, %r12930, %r13015; + mul.f32 %r13031, %r12929, %r13014; + mul.f32 %r13032, %r12934, %r13017; + mul.f32 %r13033, %r12933, %r13016; + mul.f32 %r13034, %r12938, %r13019; + mul.f32 %r13035, %r12937, %r13018; + mul.f32 %r13036, %r12942, %r13021; + mul.f32 %r13037, %r12941, %r13020; + mul.f32 %r13038, %r12946, %r13023; + mul.f32 %r13039, %r12945, %r13022; + mul.f32 %r13040, %r12950, %r13025; + mul.f32 %r13041, %r12949, %r13024; + mul.f32 %r13042, %r12954, %r13027; + mul.f32 %r13043, %r12953, %r13026; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13044, %r11644, %r13010; + sub.f32 %r13045, %r11645, %r13011; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13046, %r12924, %r13045; + mul.f32 %r13047, %r12923, %r13044; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs314, %r13047; + cvt.rn.bf16.f32 %rs315, %r13046; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs316, %rs315, 0x0000, %p925; + selp.b16 %rs317, %rs314, 0x0000, %p917; + mov.b32 %r12232, {%rs317, %rs316}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs318, %r13029; + cvt.rn.bf16.f32 %rs319, %r13028; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs320, %rs319, 0x0000, %p927; + selp.b16 %rs321, %rs318, 0x0000, %p926; + mov.b32 %r12233, {%rs321, %rs320}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13048, %r11648, %r13008; + sub.f32 %r13049, %r11649, %r13009; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13050, %r12928, %r13049; + mul.f32 %r13051, %r12927, %r13048; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs322, %r13051; + cvt.rn.bf16.f32 %rs323, %r13050; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs324, %rs323, 0x0000, %p928; + selp.b16 %rs325, %rs322, 0x0000, %p918; + mov.b32 %r12234, {%rs325, %rs324}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs326, %r13031; + cvt.rn.bf16.f32 %rs327, %r13030; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs328, %rs327, 0x0000, %p930; + selp.b16 %rs329, %rs326, 0x0000, %p929; + mov.b32 %r12235, {%rs329, %rs328}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13052, %r11652, %r13006; + sub.f32 %r13053, %r11653, %r13007; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13054, %r12932, %r13053; + mul.f32 %r13055, %r12931, %r13052; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs330, %r13055; + cvt.rn.bf16.f32 %rs331, %r13054; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs332, %rs331, 0x0000, %p931; + selp.b16 %rs333, %rs330, 0x0000, %p919; + mov.b32 %r12364, {%rs333, %rs332}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs334, %r13033; + cvt.rn.bf16.f32 %rs335, %r13032; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs336, %rs335, 0x0000, %p933; + selp.b16 %rs337, %rs334, 0x0000, %p932; + mov.b32 %r12365, {%rs337, %rs336}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13056, %r11656, %r13004; + sub.f32 %r13057, %r11657, %r13005; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13058, %r12936, %r13057; + mul.f32 %r13059, %r12935, %r13056; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs338, %r13059; + cvt.rn.bf16.f32 %rs339, %r13058; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs340, %rs339, 0x0000, %p934; + selp.b16 %rs341, %rs338, 0x0000, %p920; + mov.b32 %r12366, {%rs341, %rs340}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs342, %r13035; + cvt.rn.bf16.f32 %rs343, %r13034; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs344, %rs343, 0x0000, %p936; + selp.b16 %rs345, %rs342, 0x0000, %p935; + mov.b32 %r12367, {%rs345, %rs344}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13060, %r11660, %r13002; + sub.f32 %r13061, %r11661, %r13003; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13062, %r12940, %r13061; + mul.f32 %r13063, %r12939, %r13060; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs346, %r13063; + cvt.rn.bf16.f32 %rs347, %r13062; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs348, %rs347, 0x0000, %p937; + selp.b16 %rs349, %rs346, 0x0000, %p921; + mov.b32 %r12496, {%rs349, %rs348}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs350, %r13037; + cvt.rn.bf16.f32 %rs351, %r13036; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs352, %rs351, 0x0000, %p939; + selp.b16 %rs353, %rs350, 0x0000, %p938; + mov.b32 %r12497, {%rs353, %rs352}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13064, %r11664, %r13000; + sub.f32 %r13065, %r11665, %r13001; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13066, %r12944, %r13065; + mul.f32 %r13067, %r12943, %r13064; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs354, %r13067; + cvt.rn.bf16.f32 %rs355, %r13066; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs356, %rs355, 0x0000, %p940; + selp.b16 %rs357, %rs354, 0x0000, %p922; + mov.b32 %r12498, {%rs357, %rs356}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs358, %r13039; + cvt.rn.bf16.f32 %rs359, %r13038; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs360, %rs359, 0x0000, %p942; + selp.b16 %rs361, %rs358, 0x0000, %p941; + mov.b32 %r12499, {%rs361, %rs360}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13068, %r11668, %r12998; + sub.f32 %r13069, %r11669, %r12999; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13070, %r12948, %r13069; + mul.f32 %r13071, %r12947, %r13068; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs362, %r13071; + cvt.rn.bf16.f32 %rs363, %r13070; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs364, %rs363, 0x0000, %p943; + selp.b16 %rs365, %rs362, 0x0000, %p923; + mov.b32 %r12628, {%rs365, %rs364}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs366, %r13041; + cvt.rn.bf16.f32 %rs367, %r13040; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs368, %rs367, 0x0000, %p945; + selp.b16 %rs369, %rs366, 0x0000, %p944; + mov.b32 %r12629, {%rs369, %rs368}; + .loc 1 751 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.f32 %r13072, %r11672, %r12996; + sub.f32 %r13073, %r11673, %r12997; + .loc 1 751 16 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:751:16 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.f32 %r13074, %r12952, %r13073; + mul.f32 %r13075, %r12951, %r13072; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs370, %r13075; + cvt.rn.bf16.f32 %rs371, %r13074; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs372, %rs371, 0x0000, %p946; + selp.b16 %rs373, %rs370, 0x0000, %p924; + mov.b32 %r12630, {%rs373, %rs372}; + .loc 1 775 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + cvt.rn.bf16.f32 %rs374, %r13043; + cvt.rn.bf16.f32 %rs375, %r13042; + .loc 1 759 70 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:759:70 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + selp.b16 %rs376, %rs375, 0x0000, %p948; + selp.b16 %rs377, %rs374, 0x0000, %p947; + mov.b32 %r12631, {%rs377, %rs376}; + .loc 1 775 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:775:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r12232,%r12233,%r12234,%r12235}, %rd831, %p873, 1, 1, 1; + // end inline asm + add.s32 %r13076, %r11017, 2048; + bfe.u32 %r13077, %r13076, 4, 14; + cvt.u64.u32 %rd935, %r13077; + or.b64 %rd867, %rd935, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r12364,%r12365,%r12366,%r12367}, %rd867, %p873, 1, 1, 1; + // end inline asm + add.s32 %r13078, %r11017, 4096; + bfe.u32 %r13079, %r13078, 4, 14; + cvt.u64.u32 %rd936, %r13079; + or.b64 %rd868, %rd936, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r12496,%r12497,%r12498,%r12499}, %rd868, %p873, 1, 1, 1; + // end inline asm + add.s32 %r13080, %r11017, 6144; + bfe.u32 %r13081, %r13080, 4, 14; + cvt.u64.u32 %rd937, %r13081; + or.b64 %rd869, %rd937, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r14207,%r14208,%r14209,%r14210,%r14211,%r14212,%r14213,%r14214,%r14215,%r14216,%r14217,%r14218,%r14219,%r14220,%r14221,%r14222,%r14223,%r14224,%r14225,%r14226,%r14227,%r14228,%r14229,%r14230,%r14231,%r14232,%r14233,%r14234,%r14235,%r14236,%r14237,%r14238,%r14239,%r14240,%r14241,%r14242,%r14243,%r14244,%r14245,%r14246,%r14247,%r14248,%r14249,%r14250,%r14251,%r14252,%r14253,%r14254,%r14255,%r14256,%r14257,%r14258,%r14259,%r14260,%r14261,%r14262,%r14263,%r14264,%r14265,%r14266,%r14267,%r14268,%r14269,%r14270}, {%r12628,%r12629,%r12630,%r12631}, %rd869, %p873, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 628 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:628:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r14425, %r14404, %r14425; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r1847, %r14554, 1; + .loc 1 788 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:788:33 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shr.u32 %r13082, %r1847, 1; + .loc 1 789 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mad.wide.u32 %rd871, %r13082, 4, %rd520; + .loc 1 789 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + mov.u64 %rd870, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd870, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12632, 0x0; + @%p895 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12632 }, [ %rd871 + 0 ], %rd870; + // end inline asm + .loc 1 790 109 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:109 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r13083, %r13082, 1; + .loc 1 790 113 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:113 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p965, %r13083, %r7094; + .loc 1 790 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:55 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd874, %rd871, 4; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + and.pred %p896, %p895, %p965; + .loc 1 790 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + // begin inline asm + mov.u64 %rd873, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd873, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r12633, 0x0; + @%p896 ld.global.L1::evict_last.L2::cache_hint.b32 { %r12633 }, [ %rd874 + 0 ], %rd873; + // end inline asm + .loc 1 791 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:791:35 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + and.b32 %r13084, %r14554, 1; + .loc 1 792 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:34 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + sub.s32 %r13085, %r12633, %r12632; + .loc 1 792 48 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:48 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13086, %r13085, 7; + .loc 1 792 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r13087, %r13086, -64; + .loc 1 793 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + xor.b32 %r13088, %r13084, 1; + .loc 1 793 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13089, %r13084, 6; + .loc 1 793 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mad.lo.s32 %r14404, %r13087, %r13088, %r13089; + .loc 1 626 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13090, %r14404, 12; + .loc 1 626 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:626:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.wide.s32 %rd938, %r13090, 2; + add.s64 %rd1021, %rd1021, %rd938; + add.s64 %rd1020, %rd1020, %rd938; + add.s64 %rd1019, %rd1019, %rd938; + add.s64 %rd1018, %rd1018, %rd938; + .loc 1 627 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13091, %r14404, 7; + .loc 1 627 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:627:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.wide.s32 %rd939, %r13091, 2; + add.s64 %rd1017, %rd1017, %rd939; + add.s64 %rd1016, %rd1016, %rd939; + add.s64 %rd1015, %rd1015, %rd939; + add.s64 %rd1014, %rd1014, %rd939; + .loc 1 628 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:628:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r14416, %r14404, %r14416; + add.s32 %r14415, %r14404, %r14415; + add.s32 %r14414, %r14404, %r14414; + add.s32 %r14413, %r14404, %r14413; + add.s32 %r14412, %r14404, %r14412; + add.s32 %r14411, %r14404, %r14411; + add.s32 %r14410, %r14404, %r14410; + add.s32 %r14409, %r14404, %r14409; + add.s32 %r14421, %r14404, %r14421; + add.s32 %r14422, %r14404, %r14422; + add.s32 %r14423, %r14404, %r14423; + add.s32 %r14424, %r14404, %r14424; + add.s32 %r14417, %r14404, %r14417; + add.s32 %r14418, %r14404, %r14418; + add.s32 %r14419, %r14404, %r14419; + add.s32 %r14420, %r14404, %r14420; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r13092, %r14406, 1; + setp.gt.s32 %p966, %r13092, 1; + selp.b32 %r14406, 0, %r13092, %p966; + add.s32 %r13093, %r14408, 1; + setp.gt.s32 %p967, %r13093, 2; + selp.b32 %r14408, 0, %r13093, %p967; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p968, %r14421, 2048; + setp.lt.s32 %p969, %r14422, 2048; + setp.lt.s32 %p970, %r14423, 2048; + setp.lt.s32 %p971, %r14424, 2048; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13094, %r14408, 14; + add.s32 %r13095, %r7117, %r13094; + bar.sync 0; + add.s32 %r12634, %r13095, %r693; + selp.b32 %r13096, 16, 0, %p968; + selp.b32 %r12635, %r13096, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12634 + 0 ], [ %rd1021 + 0 ], 0x10, %r12635; + // end inline asm + add.s32 %r12636, %r12634, 2048; + selp.b32 %r13097, 16, 0, %p969; + selp.b32 %r12637, %r13097, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12636 + 0 ], [ %rd1020 + 0 ], 0x10, %r12637; + // end inline asm + add.s32 %r12638, %r12634, 4096; + selp.b32 %r13098, 16, 0, %p970; + selp.b32 %r12639, %r13098, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12638 + 0 ], [ %rd1019 + 0 ], 0x10, %r12639; + // end inline asm + add.s32 %r12640, %r12634, 6144; + selp.b32 %r13099, 16, 0, %p971; + selp.b32 %r12641, %r13099, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12640 + 0 ], [ %rd1018 + 0 ], 0x10, %r12641; + // end inline asm + cp.async.commit_group; + .loc 1 674 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p972, %r14416, 2048; + setp.lt.s32 %p973, %r14415, 2048; + setp.lt.s32 %p974, %r14414, 2048; + setp.lt.s32 %p975, %r14413, 2048; + setp.lt.s32 %p976, %r14412, 2048; + setp.lt.s32 %p977, %r14411, 2048; + setp.lt.s32 %p978, %r14410, 2048; + setp.lt.s32 %p979, %r14409, 2048; + .loc 1 674 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + mul.wide.s32 %rd940, %r14416, 4; + add.s64 %rd880, %rd105, %rd940; + mul.wide.s32 %rd941, %r14415, 4; + add.s64 %rd881, %rd105, %rd941; + mul.wide.s32 %rd942, %r14414, 4; + add.s64 %rd882, %rd105, %rd942; + mul.wide.s32 %rd943, %r14413, 4; + add.s64 %rd883, %rd105, %rd943; + mul.wide.s32 %rd944, %r14412, 4; + add.s64 %rd884, %rd105, %rd944; + mul.wide.s32 %rd945, %r14411, 4; + add.s64 %rd885, %rd105, %rd945; + mul.wide.s32 %rd946, %r14410, 4; + add.s64 %rd886, %rd105, %rd946; + mul.wide.s32 %rd947, %r14409, 4; + add.s64 %rd887, %rd105, %rd947; + .loc 1 674 22 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:674:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + shl.b32 %r13100, %r14406, 8; + add.s32 %r13101, %r12719, %r13100; + add.s32 %r12642, %r13101, %r703; + selp.b32 %r13102, 8, 0, %p972; + selp.b32 %r12667, %r13102, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12642 + 0 ], [ %rd880 + 0 ], 0x8, %r12667; + // end inline asm + add.s32 %r12644, %r12642, 32; + selp.b32 %r13103, 8, 0, %p973; + selp.b32 %r12669, %r13103, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12644 + 0 ], [ %rd881 + 0 ], 0x8, %r12669; + // end inline asm + add.s32 %r12646, %r12642, 64; + selp.b32 %r13104, 8, 0, %p974; + selp.b32 %r12671, %r13104, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12646 + 0 ], [ %rd882 + 0 ], 0x8, %r12671; + // end inline asm + add.s32 %r12648, %r12642, 96; + selp.b32 %r13105, 8, 0, %p975; + selp.b32 %r12673, %r13105, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12648 + 0 ], [ %rd883 + 0 ], 0x8, %r12673; + // end inline asm + add.s32 %r12650, %r12642, 128; + selp.b32 %r13106, 8, 0, %p976; + selp.b32 %r12675, %r13106, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12650 + 0 ], [ %rd884 + 0 ], 0x8, %r12675; + // end inline asm + add.s32 %r12652, %r12642, 160; + selp.b32 %r13107, 8, 0, %p977; + selp.b32 %r12677, %r13107, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12652 + 0 ], [ %rd885 + 0 ], 0x8, %r12677; + // end inline asm + add.s32 %r12654, %r12642, 192; + selp.b32 %r13108, 8, 0, %p978; + selp.b32 %r12679, %r13108, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12654 + 0 ], [ %rd886 + 0 ], 0x8, %r12679; + // end inline asm + add.s32 %r12656, %r12642, 224; + selp.b32 %r13109, 8, 0, %p979; + selp.b32 %r12681, %r13109, 0, %p913; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12656 + 0 ], [ %rd887 + 0 ], 0x8, %r12681; + // end inline asm + cp.async.commit_group; + .loc 1 833 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.lt.s32 %p980, %r14417, 2048; + setp.lt.s32 %p981, %r14418, 2048; + setp.lt.s32 %p982, %r14419, 2048; + setp.lt.s32 %p983, %r14420, 2048; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r13110, %r12955, %r13094; + add.s32 %r12658, %r13110, %r693; + selp.b32 %r13111, 16, 0, %p980; + selp.b32 %r12659, %r13111, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12658 + 0 ], [ %rd1017 + 0 ], 0x10, %r12659; + // end inline asm + add.s32 %r12660, %r12658, 2048; + selp.b32 %r13112, 16, 0, %p981; + selp.b32 %r12661, %r13112, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12660 + 0 ], [ %rd1016 + 0 ], 0x10, %r12661; + // end inline asm + add.s32 %r12662, %r12658, 4096; + selp.b32 %r13113, 16, 0, %p982; + selp.b32 %r12663, %r13113, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12662 + 0 ], [ %rd1015 + 0 ], 0x10, %r12663; + // end inline asm + add.s32 %r12664, %r12658, 6144; + selp.b32 %r13114, 16, 0, %p983; + selp.b32 %r12665, %r13114, 0, %p913; + // begin inline asm + cp.async.cg.shared.global [ %r12664 + 0 ], [ %rd1014 + 0 ], 0x10, %r12665; + // end inline asm + cp.async.commit_group; + .loc 1 748 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s64 %rd892, %rd106, %rd940; + add.s64 %rd893, %rd106, %rd941; + add.s64 %rd894, %rd106, %rd942; + add.s64 %rd895, %rd106, %rd943; + add.s64 %rd896, %rd106, %rd944; + add.s64 %rd897, %rd106, %rd945; + add.s64 %rd898, %rd106, %rd946; + add.s64 %rd899, %rd106, %rd947; + .loc 1 748 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:748:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + add.s32 %r13115, %r12963, %r13100; + add.s32 %r12666, %r13115, %r703; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12666 + 0 ], [ %rd892 + 0 ], 0x8, %r12667; + // end inline asm + add.s32 %r12668, %r12666, 32; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12668 + 0 ], [ %rd893 + 0 ], 0x8, %r12669; + // end inline asm + add.s32 %r12670, %r12666, 64; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12670 + 0 ], [ %rd894 + 0 ], 0x8, %r12671; + // end inline asm + add.s32 %r12672, %r12666, 96; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12672 + 0 ], [ %rd895 + 0 ], 0x8, %r12673; + // end inline asm + add.s32 %r12674, %r12666, 128; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12674 + 0 ], [ %rd896 + 0 ], 0x8, %r12675; + // end inline asm + add.s32 %r12676, %r12666, 160; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12676 + 0 ], [ %rd897 + 0 ], 0x8, %r12677; + // end inline asm + add.s32 %r12678, %r12666, 192; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12678 + 0 ], [ %rd898 + 0 ], 0x8, %r12679; + // end inline asm + add.s32 %r12680, %r12666, 224; + // begin inline asm + @%p840 cp.async.ca.shared.global [ %r12680 + 0 ], [ %rd899 + 0 ], 0x8, %r12681; + // end inline asm + cp.async.commit_group; + .loc 1 610 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:610:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:318:20 ] + setp.ne.b32 %p984, %r838, %r1847; + mov.b32 %r14554, %r1847; + @%p984 bra $L__BB0_14; + bra.uni $L__BB0_15; +$L__tmp19: +$L__BB0_1: + .loc 1 0 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0:28 + ld.param.b32 %r2124, [triton_tem_fused_zeros_1_param_19]; + ld.param.b64 %rd167, [triton_tem_fused_zeros_1_param_13]; + ld.param.b64 %rd166, [triton_tem_fused_zeros_1_param_12]; + ld.param.b64 %rd163, [triton_tem_fused_zeros_1_param_9]; + ld.param.b64 %rd162, [triton_tem_fused_zeros_1_param_8]; + ld.param.b64 %rd161, [triton_tem_fused_zeros_1_param_6]; + .loc 1 136 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:136:26 + shr.u32 %r2302, %r18, 1; + bfe.u32 %r2303, %r7, 2, 3; + or.b32 %r2304, %r2302, %r2303; + .loc 1 140 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:140:24 + sub.s32 %r2305, %r1, %r2; + .loc 1 144 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:144:29 + shr.s32 %r2307, %r2305, 31; + shr.u32 %r2308, %r2307, 28; + add.s32 %r2309, %r2305, %r2308; + shr.s32 %r2310, %r2309, 4; + .loc 1 144 54 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:144:54 + shl.b32 %r2311, %r4, 2; + .loc 1 144 44 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:144:44 + add.s32 %r2312, %r2310, %r2311; + .loc 1 145 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:145:35 + and.b32 %r2313, %r2309, -16; + sub.s32 %r2314, %r2305, %r2313; + .loc 1 154 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:154:55 + shl.b32 %r2315, %r5, 4; + .loc 1 154 78 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:154:78 + add.s32 %r2316, %r2314, %r2315; + .loc 1 155 68 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:155:68 + mul.lo.s32 %r2317, %r2316, %r2124; + .loc 1 158 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:158:30 + shl.b32 %r2318, %r2312, 7; + .loc 1 158 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:158:52 + shl.b32 %r2319, %r3, 23; + .loc 1 158 40 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:158:40 + add.s32 %r2320, %r2318, %r2319; + .loc 1 159 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:159:42 + mad.lo.s32 %r2321, %r2312, 262016, %r2320; + .loc 1 161 46 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:161:46 + shl.b32 %r2322, %r3, 16; + shl.b32 %r2323, %r2312, 11; + add.s32 %r2324, %r2323, %r2322; + .loc 1 163 17 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:163:17 + mul.wide.s32 %rd216, %r2320, 2; + add.s64 %rd217, %rd157, %rd216; + .loc 1 164 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:164:19 + mad.wide.s32 %rd218, %r2321, 2, %rd160; + .loc 1 168 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:168:21 + mul.wide.s32 %rd219, %r2324, 4; + add.s64 %rd220, %rd158, %rd219; + .loc 1 169 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:169:25 + add.s64 %rd221, %rd159, %rd219; + .loc 1 174 36 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:174:36 + shl.b32 %r2325, %r2314, 7; + .loc 1 175 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:175:29 + or.b32 %r2326, %r2325, %r10; + or.b32 %r2327, %r2325, %r11; + or.b32 %r2328, %r2325, %r12; + or.b32 %r2329, %r2325, %r13; + or.b32 %r2330, %r2325, %r14; + or.b32 %r2331, %r2325, %r15; + or.b32 %r2332, %r2325, %r16; + or.b32 %r2333, %r2325, %r17; +$L__tmp20: + .loc 1 825 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:178:107 ] + shl.b32 %r2334, %r2326, 12; + shl.b32 %r2335, %r2327, 12; + shl.b32 %r2336, %r2328, 12; + shl.b32 %r2337, %r2329, 12; + shl.b32 %r2338, %r2330, 12; + shl.b32 %r2339, %r2331, 12; + shl.b32 %r2340, %r2332, 12; + shl.b32 %r2341, %r2333, 12; + .loc 1 825 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:178:107 ] + mad.wide.s32 %rd222, %r2334, 2, %rd217; + mad.wide.s32 %rd223, %r2335, 2, %rd217; + mad.wide.s32 %rd224, %r2336, 2, %rd217; + mad.wide.s32 %rd225, %r2337, 2, %rd217; + mad.wide.s32 %rd226, %r2338, 2, %rd217; + mad.wide.s32 %rd227, %r2339, 2, %rd217; + mad.wide.s32 %rd228, %r2340, 2, %rd217; + mad.wide.s32 %rd229, %r2341, 2, %rd217; + .loc 1 825 56 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:56 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:178:107 ] + shl.b32 %r2342, %r7, 3; + and.b32 %r2343, %r2342, 120; + .loc 1 825 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:178:107 ] + cvt.u64.u32 %rd13, %r2343; + mul.wide.u32 %rd230, %r2343, 2; + add.s64 %rd176, %rd222, %rd230; + add.s64 %rd177, %rd223, %rd230; + add.s64 %rd178, %rd224, %rd230; + add.s64 %rd179, %rd225, %rd230; + add.s64 %rd180, %rd226, %rd230; + add.s64 %rd181, %rd227, %rd230; + add.s64 %rd182, %rd228, %rd230; + add.s64 %rd183, %rd229, %rd230; + mov.b32 %r13753, 0; + mov.pred %p22, -1; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:178:107 ] + // begin inline asm + mov.u32 %r2135, %r13753; + mov.u32 %r2136, %r13753; + mov.u32 %r2137, %r13753; + mov.u32 %r2138, %r13753; + @%p22 ld.global.v4.b32 { %r2135, %r2136, %r2137, %r2138 }, [ %rd176 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2143, %r13753; + mov.u32 %r2144, %r13753; + mov.u32 %r2145, %r13753; + mov.u32 %r2146, %r13753; + @%p22 ld.global.v4.b32 { %r2143, %r2144, %r2145, %r2146 }, [ %rd177 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2151, %r13753; + mov.u32 %r2152, %r13753; + mov.u32 %r2153, %r13753; + mov.u32 %r2154, %r13753; + @%p22 ld.global.v4.b32 { %r2151, %r2152, %r2153, %r2154 }, [ %rd178 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2159, %r13753; + mov.u32 %r2160, %r13753; + mov.u32 %r2161, %r13753; + mov.u32 %r2162, %r13753; + @%p22 ld.global.v4.b32 { %r2159, %r2160, %r2161, %r2162 }, [ %rd179 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2167, %r13753; + mov.u32 %r2168, %r13753; + mov.u32 %r2169, %r13753; + mov.u32 %r2170, %r13753; + @%p22 ld.global.v4.b32 { %r2167, %r2168, %r2169, %r2170 }, [ %rd180 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2175, %r13753; + mov.u32 %r2176, %r13753; + mov.u32 %r2177, %r13753; + mov.u32 %r2178, %r13753; + @%p22 ld.global.v4.b32 { %r2175, %r2176, %r2177, %r2178 }, [ %rd181 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2183, %r13753; + mov.u32 %r2184, %r13753; + mov.u32 %r2185, %r13753; + mov.u32 %r2186, %r13753; + @%p22 ld.global.v4.b32 { %r2183, %r2184, %r2185, %r2186 }, [ %rd182 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2191, %r13753; + mov.u32 %r2192, %r13753; + mov.u32 %r2193, %r13753; + mov.u32 %r2194, %r13753; + @%p22 ld.global.v4.b32 { %r2191, %r2192, %r2193, %r2194 }, [ %rd183 + 0 ]; + // end inline asm + shl.b32 %r2344, %r7, 4; + and.b32 %r2345, %r2344, 112; + shl.b32 %r2346, %r9, 3; + and.b32 %r2347, %r7, 112; + and.b32 %r2348, %r7, 8; + shl.b32 %r2349, %r2348, 11; + or.b32 %r2350, %r2345, %r2346; + xor.b32 %r2351, %r2350, %r2347; + or.b32 %r2352, %r2351, %r2349; + mov.b32 %r2353, global_smem; + add.s32 %r2354, %r2353, %r2352; + st.shared.v4.b32 [%r2354+98304], {%r2135, %r2136, %r2137, %r2138}; + st.shared.v4.b32 [%r2354+100352], {%r2143, %r2144, %r2145, %r2146}; + st.shared.v4.b32 [%r2354+102400], {%r2151, %r2152, %r2153, %r2154}; + st.shared.v4.b32 [%r2354+104448], {%r2159, %r2160, %r2161, %r2162}; + st.shared.v4.b32 [%r2354+106496], {%r2167, %r2168, %r2169, %r2170}; + st.shared.v4.b32 [%r2354+108544], {%r2175, %r2176, %r2177, %r2178}; + st.shared.v4.b32 [%r2354+110592], {%r2183, %r2184, %r2185, %r2186}; + st.shared.v4.b32 [%r2354+112640], {%r2191, %r2192, %r2193, %r2194}; +$L__tmp21: + .loc 1 825 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:179:111 ] + shl.b32 %r2355, %r2326, 7; + shl.b32 %r2356, %r2327, 7; + shl.b32 %r2357, %r2328, 7; + shl.b32 %r2358, %r2329, 7; + shl.b32 %r2359, %r2330, 7; + shl.b32 %r2360, %r2331, 7; + shl.b32 %r2361, %r2332, 7; + shl.b32 %r2362, %r2333, 7; + .loc 1 825 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:179:111 ] + mad.wide.s32 %rd231, %r2355, 2, %rd218; + mad.wide.s32 %rd232, %r2356, 2, %rd218; + mad.wide.s32 %rd233, %r2357, 2, %rd218; + mad.wide.s32 %rd234, %r2358, 2, %rd218; + mad.wide.s32 %rd235, %r2359, 2, %rd218; + mad.wide.s32 %rd236, %r2360, 2, %rd218; + mad.wide.s32 %rd237, %r2361, 2, %rd218; + mad.wide.s32 %rd238, %r2362, 2, %rd218; + .loc 1 825 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:825:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:179:111 ] + add.s64 %rd184, %rd231, %rd230; + add.s64 %rd185, %rd232, %rd230; + add.s64 %rd186, %rd233, %rd230; + add.s64 %rd187, %rd234, %rd230; + add.s64 %rd188, %rd235, %rd230; + add.s64 %rd189, %rd236, %rd230; + add.s64 %rd190, %rd237, %rd230; + add.s64 %rd191, %rd238, %rd230; + .loc 1 833 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:833:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:179:111 ] + // begin inline asm + mov.u32 %r2199, %r13753; + mov.u32 %r2200, %r13753; + mov.u32 %r2201, %r13753; + mov.u32 %r2202, %r13753; + @%p22 ld.global.v4.b32 { %r2199, %r2200, %r2201, %r2202 }, [ %rd184 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2207, %r13753; + mov.u32 %r2208, %r13753; + mov.u32 %r2209, %r13753; + mov.u32 %r2210, %r13753; + @%p22 ld.global.v4.b32 { %r2207, %r2208, %r2209, %r2210 }, [ %rd185 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2215, %r13753; + mov.u32 %r2216, %r13753; + mov.u32 %r2217, %r13753; + mov.u32 %r2218, %r13753; + @%p22 ld.global.v4.b32 { %r2215, %r2216, %r2217, %r2218 }, [ %rd186 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2223, %r13753; + mov.u32 %r2224, %r13753; + mov.u32 %r2225, %r13753; + mov.u32 %r2226, %r13753; + @%p22 ld.global.v4.b32 { %r2223, %r2224, %r2225, %r2226 }, [ %rd187 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2231, %r13753; + mov.u32 %r2232, %r13753; + mov.u32 %r2233, %r13753; + mov.u32 %r2234, %r13753; + @%p22 ld.global.v4.b32 { %r2231, %r2232, %r2233, %r2234 }, [ %rd188 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2239, %r13753; + mov.u32 %r2240, %r13753; + mov.u32 %r2241, %r13753; + mov.u32 %r2242, %r13753; + @%p22 ld.global.v4.b32 { %r2239, %r2240, %r2241, %r2242 }, [ %rd189 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2247, %r13753; + mov.u32 %r2248, %r13753; + mov.u32 %r2249, %r13753; + mov.u32 %r2250, %r13753; + @%p22 ld.global.v4.b32 { %r2247, %r2248, %r2249, %r2250 }, [ %rd190 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2255, %r13753; + mov.u32 %r2256, %r13753; + mov.u32 %r2257, %r13753; + mov.u32 %r2258, %r13753; + @%p22 ld.global.v4.b32 { %r2255, %r2256, %r2257, %r2258 }, [ %rd191 + 0 ]; + // end inline asm + st.shared.v4.b32 [%r2354+131072], {%r2199, %r2200, %r2201, %r2202}; + st.shared.v4.b32 [%r2354+133120], {%r2207, %r2208, %r2209, %r2210}; + st.shared.v4.b32 [%r2354+135168], {%r2215, %r2216, %r2217, %r2218}; + st.shared.v4.b32 [%r2354+137216], {%r2223, %r2224, %r2225, %r2226}; + st.shared.v4.b32 [%r2354+139264], {%r2231, %r2232, %r2233, %r2234}; + st.shared.v4.b32 [%r2354+141312], {%r2239, %r2240, %r2241, %r2242}; + st.shared.v4.b32 [%r2354+143360], {%r2247, %r2248, %r2249, %r2250}; + st.shared.v4.b32 [%r2354+145408], {%r2255, %r2256, %r2257, %r2258}; +$L__tmp22: + .loc 1 195 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:195:30 + cvt.s64.s32 %rd14, %r2317; + mad.wide.s32 %rd196, %r2317, 4, %rd163; + .loc 1 197 53 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:197:53 + cvt.s64.s32 %rd16, %r2316; + mad.wide.s32 %rd197, %r2316, 4, %rd162; + .loc 1 199 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:199:42 + and.b32 %r19, %r7, 3; + shl.b32 %r22, %r19, 1; + or.b32 %r21, %r22, 1; + or.b32 %r25, %r22, 9; + or.b32 %r24, %r22, 8; + or.b32 %r29, %r22, 16; + or.b32 %r28, %r22, 17; + or.b32 %r27, %r22, 24; + or.b32 %r26, %r22, 25; + or.b32 %r37, %r22, 32; + or.b32 %r36, %r22, 33; + or.b32 %r35, %r22, 40; + or.b32 %r34, %r22, 41; + or.b32 %r33, %r22, 48; + or.b32 %r32, %r22, 49; + or.b32 %r31, %r22, 56; + or.b32 %r30, %r22, 57; +$L__tmp23: + .loc 2 41 22 // standard.py:41:22 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r2363, %r2123, 63; + .loc 2 41 28 // standard.py:41:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shr.s32 %r2364, %r2363, 31; + shr.u32 %r2365, %r2364, 26; + add.s32 %r2366, %r2363, %r2365; + shr.s32 %r2367, %r2366, 6; + .loc 1 395 101 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:395:101 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + max.s32 %r38, %r2367, 1; + .loc 1 485 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:485:34 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mad.wide.u32 %rd199, %r3, 8, %rd170; +$L__tmp24: + .loc 1 175 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:175:29 + or.b32 %r39, %r2325, %r2304; + .loc 1 188 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:188:34 + mul.wide.s32 %rd239, %r39, 4; + add.s64 %rd192, %rd221, %rd239; + add.s64 %rd193, %rd192, 32; + .loc 1 188 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:188:25 + // begin inline asm + mov.u32 %r2263, 0x0; + @%p22 ld.global.b32 { %r2263 }, [ %rd192 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2264, 0x0; + @%p22 ld.global.b32 { %r2264 }, [ %rd193 + 0 ]; + // end inline asm + .loc 1 189 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:189:33 + add.s64 %rd194, %rd220, %rd239; + add.s64 %rd195, %rd194, 32; + .loc 1 189 26 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:189:26 + // begin inline asm + mov.u32 %r2265, 0x0; + @%p22 ld.global.b32 { %r2265 }, [ %rd194 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2266, 0x0; + @%p22 ld.global.b32 { %r2266 }, [ %rd195 + 0 ]; + // end inline asm + .loc 1 190 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:190:30 + setp.eq.f32 %p43, %r2265, 0fFF800000; + setp.eq.f32 %p44, %r2266, 0fFF800000; + .loc 1 190 50 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:190:50 + selp.f32 %r43, 0f00000000, %r2265, %p43; + selp.f32 %r44, 0f00000000, %r2266, %p44; + .loc 1 196 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:196:27 + // begin inline asm + mov.u32 %r2267, 0x0; + ld.global.b32 { %r2267 }, [ %rd196 + 0 ]; + // end inline asm + .loc 1 196 41 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:196:41 + shl.b32 %r45, %r2267, 7; + .loc 1 197 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:197:39 + // begin inline asm + mov.u32 %r2268, 0x0; + ld.global.b32 { %r2268 }, [ %rd197 + 0 ]; + // end inline asm + .loc 1 199 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:199:29 + or.b32 %r2368, %r45, %r10; + or.b32 %r2369, %r45, %r11; + or.b32 %r2370, %r45, %r12; + or.b32 %r2371, %r45, %r13; +$L__tmp25: + .loc 1 390 37 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:37 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r2372, %r2368, 7; + shl.b32 %r2373, %r2369, 7; + shl.b32 %r2374, %r2370, 7; + shl.b32 %r2375, %r2371, 7; + .loc 1 390 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.wide.s32 %rd240, %r2372, 2; + add.s64 %rd241, %rd1, %rd240; + mul.wide.s32 %rd242, %r2373, 2; + add.s64 %rd243, %rd1, %rd242; + mul.wide.s32 %rd244, %r2374, 2; + add.s64 %rd245, %rd1, %rd244; + mul.wide.s32 %rd246, %r2375, 2; + add.s64 %rd247, %rd1, %rd246; + .loc 1 390 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd200, %rd241, %rd230; + add.s64 %rd201, %rd243, %rd230; + add.s64 %rd202, %rd245, %rd230; + add.s64 %rd203, %rd247, %rd230; + .loc 1 391 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:391:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd248, %rd2, %rd240; + add.s64 %rd249, %rd2, %rd242; + add.s64 %rd250, %rd2, %rd244; + add.s64 %rd251, %rd2, %rd246; + .loc 1 391 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:391:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd204, %rd248, %rd230; + add.s64 %rd205, %rd249, %rd230; + add.s64 %rd206, %rd250, %rd230; + add.s64 %rd207, %rd251, %rd230; + .loc 1 395 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:395:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r2376, %r2268, 1; + .loc 1 395 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:395:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + min.s32 %r47, %r2376, %r38; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p45, %r2376, 1; + setp.gt.s32 %p42, %r2376, 0; + .loc 1 485 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:485:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + // begin inline asm + mov.u64 %rd198, 0x0; + @%p42 ld.global.b64 { %rd198 }, [ %rd199 + 0 ]; + // end inline asm + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p46, %r2368, %r2123; + setp.lt.s32 %p47, %r2369, %r2123; + setp.lt.s32 %p48, %r2370, %r2123; + setp.lt.s32 %p49, %r2371, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r2377, %r2348, 10; + or.b32 %r48, %r2351, %r2377; + add.s32 %r4610, %r2353, %r48; + selp.b32 %r2378, 16, 0, %p46; + selp.b32 %r2278, %r2378, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r4610 + 0 ], [ %rd200 + 0 ], 0x10, %r2278; + // end inline asm + add.s32 %r4612, %r4610, 2048; + selp.b32 %r2379, 16, 0, %p47; + selp.b32 %r2280, %r2379, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r4612 + 0 ], [ %rd201 + 0 ], 0x10, %r2280; + // end inline asm + add.s32 %r4614, %r4610, 4096; + selp.b32 %r2380, 16, 0, %p48; + selp.b32 %r2282, %r2380, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r4614 + 0 ], [ %rd202 + 0 ], 0x10, %r2282; + // end inline asm + add.s32 %r4616, %r4610, 6144; + selp.b32 %r2381, 16, 0, %p49; + selp.b32 %r2284, %r2381, 0, %p42; + // begin inline asm + cp.async.cg.shared.global [ %r4616 + 0 ], [ %rd203 + 0 ], 0x10, %r2284; + // end inline asm + cp.async.commit_group; + add.s32 %r2277, %r4610, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r2277 + 0 ], [ %rd204 + 0 ], 0x10, %r2278; + // end inline asm + add.s32 %r2279, %r4610, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r2279 + 0 ], [ %rd205 + 0 ], 0x10, %r2280; + // end inline asm + add.s32 %r2281, %r4610, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r2281 + 0 ], [ %rd206 + 0 ], 0x10, %r2282; + // end inline asm + add.s32 %r2283, %r4610, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r2283 + 0 ], [ %rd207 + 0 ], 0x10, %r2284; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p50, %r47, 1; + .loc 1 414 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd996, %rd200, 16384; + add.s64 %rd995, %rd201, 16384; + add.s64 %rd994, %rd202, 16384; + add.s64 %rd993, %rd203, 16384; + .loc 1 415 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:415:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd992, %rd204, 16384; + add.s64 %rd991, %rd205, 16384; + add.s64 %rd990, %rd206, 16384; + add.s64 %rd989, %rd207, 16384; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.b32 %r13688, %r2368, 64; + or.b32 %r13687, %r2369, 64; + or.b32 %r13686, %r2370, 64; + or.b32 %r13685, %r2371, 64; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p51, %r13688, %r2123; + setp.lt.s32 %p52, %r13687, %r2123; + setp.lt.s32 %p53, %r13686, %r2123; + setp.lt.s32 %p54, %r13685, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + bar.sync 0; + add.s32 %r2285, %r4610, 16384; + selp.b32 %r2382, 16, 0, %p51; + selp.b32 %r2294, %r2382, 0, %p50; + // begin inline asm + cp.async.cg.shared.global [ %r2285 + 0 ], [ %rd996 + 0 ], 0x10, %r2294; + // end inline asm + add.s32 %r2287, %r4610, 18432; + selp.b32 %r2383, 16, 0, %p52; + selp.b32 %r2296, %r2383, 0, %p50; + // begin inline asm + cp.async.cg.shared.global [ %r2287 + 0 ], [ %rd995 + 0 ], 0x10, %r2296; + // end inline asm + add.s32 %r2289, %r4610, 20480; + selp.b32 %r2384, 16, 0, %p53; + selp.b32 %r2298, %r2384, 0, %p50; + // begin inline asm + cp.async.cg.shared.global [ %r2289 + 0 ], [ %rd994 + 0 ], 0x10, %r2298; + // end inline asm + add.s32 %r2291, %r4610, 22528; + selp.b32 %r2385, 16, 0, %p54; + selp.b32 %r2300, %r2385, 0, %p50; + // begin inline asm + cp.async.cg.shared.global [ %r2291 + 0 ], [ %rd993 + 0 ], 0x10, %r2300; + // end inline asm + cp.async.commit_group; + add.s32 %r2293, %r4610, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r2293 + 0 ], [ %rd992 + 0 ], 0x10, %r2294; + // end inline asm + add.s32 %r2295, %r4610, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r2295 + 0 ], [ %rd991 + 0 ], 0x10, %r2296; + // end inline asm + add.s32 %r2297, %r4610, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r2297 + 0 ], [ %rd990 + 0 ], 0x10, %r2298; + // end inline asm + add.s32 %r2299, %r4610, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r2299 + 0 ], [ %rd989 + 0 ], 0x10, %r2300; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:459:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r13689, 0f00000000; + mov.b32 %r13690, %r13689; + mov.b32 %r13691, %r13689; + mov.b32 %r13692, %r13689; + mov.b32 %r13693, %r13689; + mov.b32 %r13694, %r13689; + mov.b32 %r13695, %r13689; + mov.b32 %r13696, %r13689; + mov.b32 %r13697, %r13689; + mov.b32 %r13698, %r13689; + mov.b32 %r13699, %r13689; + mov.b32 %r13700, %r13689; + mov.b32 %r13701, %r13689; + mov.b32 %r13702, %r13689; + mov.b32 %r13703, %r13689; + mov.b32 %r13704, %r13689; + mov.b32 %r13705, %r13689; + mov.b32 %r13706, %r13689; + mov.b32 %r13707, %r13689; + mov.b32 %r13708, %r13689; + mov.b32 %r13709, %r13689; + mov.b32 %r13710, %r13689; + mov.b32 %r13711, %r13689; + mov.b32 %r13712, %r13689; + mov.b32 %r13713, %r13689; + mov.b32 %r13714, %r13689; + mov.b32 %r13715, %r13689; + mov.b32 %r13716, %r13689; + mov.b32 %r13717, %r13689; + mov.b32 %r13718, %r13689; + mov.b32 %r13719, %r13689; + mov.b32 %r13720, %r13689; + mov.b32 %r13721, %r13689; + mov.b32 %r13722, %r13689; + mov.b32 %r13723, %r13689; + mov.b32 %r13724, %r13689; + mov.b32 %r13725, %r13689; + mov.b32 %r13726, %r13689; + mov.b32 %r13727, %r13689; + mov.b32 %r13728, %r13689; + mov.b32 %r13729, %r13689; + mov.b32 %r13730, %r13689; + mov.b32 %r13731, %r13689; + mov.b32 %r13732, %r13689; + mov.b32 %r13733, %r13689; + mov.b32 %r13734, %r13689; + mov.b32 %r13735, %r13689; + mov.b32 %r13736, %r13689; + mov.b32 %r13737, %r13689; + mov.b32 %r13738, %r13689; + mov.b32 %r13739, %r13689; + mov.b32 %r13740, %r13689; + mov.b32 %r13741, %r13689; + mov.b32 %r13742, %r13689; + mov.b32 %r13743, %r13689; + mov.b32 %r13744, %r13689; + mov.b32 %r13745, %r13689; + mov.b32 %r13746, %r13689; + mov.b32 %r13747, %r13689; + mov.b32 %r13748, %r13689; + mov.b32 %r13749, %r13689; + mov.b32 %r13750, %r13689; + mov.b32 %r13751, %r13689; + mov.b32 %r13752, %r13689; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + @%p45 bra $L__BB0_4; +$L__tmp26: +// %bb.2: // %.lr.ph + .loc 1 0 0 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0 + or.b32 %r40, %r39, 8; +$L__tmp27: + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shr.s32 %r2391, %r40, 31; + shr.u32 %r2392, %r2391, 21; + add.s32 %r2393, %r40, %r2392; + and.b32 %r2394, %r2393, -2048; + sub.s32 %r89, %r40, %r2394; + shr.s32 %r2395, %r39, 31; + shr.u32 %r2396, %r2395, 21; + add.s32 %r2397, %r39, %r2396; + and.b32 %r2398, %r2397, -2048; + sub.s32 %r93, %r39, %r2398; + .loc 1 488 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:488:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.s64.s32 %rd252, %r93; + cvt.s64.s32 %rd253, %r89; + setp.gt.s64 %p2, %rd198, %rd253; + setp.gt.s64 %p4, %rd198, %rd252; +$L__tmp28: + .loc 1 199 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:199:29 + or.b32 %r13755, %r45, %r30; + or.b32 %r13754, %r45, %r31; + or.b32 %r13757, %r45, %r32; + or.b32 %r13756, %r45, %r33; + or.b32 %r13759, %r45, %r34; + or.b32 %r13758, %r45, %r35; + or.b32 %r13761, %r45, %r36; + or.b32 %r13760, %r45, %r37; + or.b32 %r13763, %r45, %r26; + or.b32 %r13762, %r45, %r27; + or.b32 %r13765, %r45, %r28; + or.b32 %r13764, %r45, %r29; + or.b32 %r13766, %r45, %r24; + or.b32 %r13767, %r45, %r25; + or.b32 %r13769, %r45, %r21; + or.b32 %r13768, %r45, %r22; + add.s32 %r85, %r47, -2; + add.s32 %r86, %r47, -1; +$L__tmp29: + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + max.s32 %r87, %r47, 1; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b64 %rd27, {%r2264, %r2264}; + mov.b64 %rd28, {%r2263, %r2263}; + mov.b32 %r13689, 0f00000000; + mov.b32 %r13684, 1; + mov.b32 %r13683, -1; + mov.b32 %r13682, 64; + mov.b32 %r13690, %r13689; + mov.b32 %r13691, %r13689; + mov.b32 %r13692, %r13689; + mov.b32 %r13693, %r13689; + mov.b32 %r13694, %r13689; + mov.b32 %r13695, %r13689; + mov.b32 %r13696, %r13689; + mov.b32 %r13697, %r13689; + mov.b32 %r13698, %r13689; + mov.b32 %r13699, %r13689; + mov.b32 %r13700, %r13689; + mov.b32 %r13701, %r13689; + mov.b32 %r13702, %r13689; + mov.b32 %r13703, %r13689; + mov.b32 %r13704, %r13689; + mov.b32 %r13705, %r13689; + mov.b32 %r13706, %r13689; + mov.b32 %r13707, %r13689; + mov.b32 %r13708, %r13689; + mov.b32 %r13709, %r13689; + mov.b32 %r13710, %r13689; + mov.b32 %r13711, %r13689; + mov.b32 %r13712, %r13689; + mov.b32 %r13713, %r13689; + mov.b32 %r13714, %r13689; + mov.b32 %r13715, %r13689; + mov.b32 %r13716, %r13689; + mov.b32 %r13717, %r13689; + mov.b32 %r13718, %r13689; + mov.b32 %r13719, %r13689; + mov.b32 %r13720, %r13689; + mov.b32 %r13721, %r13689; + mov.b32 %r13722, %r13689; + mov.b32 %r13723, %r13689; + mov.b32 %r13724, %r13689; + mov.b32 %r13725, %r13689; + mov.b32 %r13726, %r13689; + mov.b32 %r13727, %r13689; + mov.b32 %r13728, %r13689; + mov.b32 %r13729, %r13689; + mov.b32 %r13730, %r13689; + mov.b32 %r13731, %r13689; + mov.b32 %r13732, %r13689; + mov.b32 %r13733, %r13689; + mov.b32 %r13734, %r13689; + mov.b32 %r13735, %r13689; + mov.b32 %r13736, %r13689; + mov.b32 %r13737, %r13689; + mov.b32 %r13738, %r13689; + mov.b32 %r13739, %r13689; + mov.b32 %r13740, %r13689; + mov.b32 %r13741, %r13689; + mov.b32 %r13742, %r13689; + mov.b32 %r13743, %r13689; + mov.b32 %r13744, %r13689; + mov.b32 %r13745, %r13689; + mov.b32 %r13746, %r13689; + mov.b32 %r13747, %r13689; + mov.b32 %r13748, %r13689; + mov.b32 %r13749, %r13689; + mov.b32 %r13750, %r13689; + mov.b32 %r13751, %r13689; + mov.b32 %r13752, %r13689; +$L__BB0_3: // %__nv_exp2f.exit1479 + // =>This Inner Loop Header: Depth=1 + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p75, %r13753, %r85; + setp.lt.s32 %p73, %r13753, %r86; + add.s32 %r4057, %r13683, 1; + setp.gt.s32 %p76, %r4057, 2; + selp.b32 %r13683, 0, %r4057, %p76; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p77, %r13768, %r2123; + setp.lt.s32 %p78, %r13769, %r2123; + setp.lt.s32 %p79, %r13766, %r2123; + setp.lt.s32 %p80, %r13767, %r2123; + setp.lt.s32 %p81, %r13764, %r2123; + setp.lt.s32 %p82, %r13765, %r2123; + setp.lt.s32 %p83, %r13762, %r2123; + setp.lt.s32 %p84, %r13763, %r2123; + setp.lt.s32 %p85, %r13760, %r2123; + setp.lt.s32 %p86, %r13761, %r2123; + setp.lt.s32 %p87, %r13758, %r2123; + setp.lt.s32 %p88, %r13759, %r2123; + setp.lt.s32 %p89, %r13756, %r2123; + setp.lt.s32 %p90, %r13757, %r2123; + setp.lt.s32 %p91, %r13754, %r2123; + setp.lt.s32 %p92, %r13755, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4058, %r13683, 14; + add.s32 %r2952, %r2353, %r4058; + .loc 1 459 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:459:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shfl.sync.idx.b32 %r4060, %r8, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4061, %r4060, 11; + and.b32 %r4062, %r4061, 8192; + add.s32 %r2911, %r2353, 98304; + add.s32 %r4063, %r4062, %r2911; + bfe.u32 %r4064, %r4063, 4, 14; + cvt.u64.u32 %rd304, %r4064; + or.b64 %rd254, %rd304, 4611686293372403712; + bfe.u32 %r4065, %r2952, 4, 14; + cvt.u64.u32 %rd305, %r4065; + or.b64 %rd255, %rd305, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd254, %rd255, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r4066, %r4062, 32; + add.s32 %r4067, %r4066, %r2911; + bfe.u32 %r4068, %r4067, 4, 14; + cvt.u64.u32 %rd306, %r4068; + or.b64 %rd256, %rd306, 4611686293372403712; + add.s32 %r4069, %r2952, 32; + bfe.u32 %r4070, %r4069, 4, 14; + cvt.u64.u32 %rd307, %r4070; + or.b64 %rd257, %rd307, 4611686293338849280; + mov.pred %p55, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd256, %rd257, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4071, %r4062, 64; + add.s32 %r4072, %r4071, %r2911; + bfe.u32 %r4073, %r4072, 4, 14; + cvt.u64.u32 %rd308, %r4073; + or.b64 %rd258, %rd308, 4611686293372403712; + add.s32 %r4074, %r2952, 64; + bfe.u32 %r4075, %r4074, 4, 14; + cvt.u64.u32 %rd309, %r4075; + or.b64 %rd259, %rd309, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd258, %rd259, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4076, %r4062, 96; + add.s32 %r4077, %r4076, %r2911; + bfe.u32 %r4078, %r4077, 4, 14; + cvt.u64.u32 %rd310, %r4078; + or.b64 %rd260, %rd310, 4611686293372403712; + add.s32 %r4079, %r2952, 96; + bfe.u32 %r4080, %r4079, 4, 14; + cvt.u64.u32 %rd311, %r4080; + or.b64 %rd261, %rd311, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd260, %rd261, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4081, %r4062, 16384; + add.s32 %r4082, %r4081, %r2911; + bfe.u32 %r4083, %r4082, 4, 14; + cvt.u64.u32 %rd312, %r4083; + or.b64 %rd262, %rd312, 4611686293372403712; + add.s32 %r4084, %r2952, 8192; + bfe.u32 %r4085, %r4084, 4, 14; + cvt.u64.u32 %rd313, %r4085; + or.b64 %rd263, %rd313, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd262, %rd263, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4086, %r4062, 16416; + add.s32 %r4087, %r4086, %r2911; + bfe.u32 %r4088, %r4087, 4, 14; + cvt.u64.u32 %rd314, %r4088; + or.b64 %rd264, %rd314, 4611686293372403712; + add.s32 %r4089, %r2952, 8224; + bfe.u32 %r4090, %r4089, 4, 14; + cvt.u64.u32 %rd315, %r4090; + or.b64 %rd265, %rd315, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd264, %rd265, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4091, %r4062, 16448; + add.s32 %r4092, %r4091, %r2911; + bfe.u32 %r4093, %r4092, 4, 14; + cvt.u64.u32 %rd316, %r4093; + or.b64 %rd266, %rd316, 4611686293372403712; + add.s32 %r4094, %r2952, 8256; + bfe.u32 %r4095, %r4094, 4, 14; + cvt.u64.u32 %rd317, %r4095; + or.b64 %rd267, %rd317, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd266, %rd267, %p55, 1, 1, 0, 0; + // end inline asm + or.b32 %r4096, %r4062, 16480; + add.s32 %r4097, %r4096, %r2911; + bfe.u32 %r4098, %r4097, 4, 14; + cvt.u64.u32 %rd318, %r4098; + or.b64 %rd268, %rd318, 4611686293372403712; + add.s32 %r4099, %r2952, 8288; + bfe.u32 %r4100, %r4099, 4, 14; + cvt.u64.u32 %rd319, %r4100; + or.b64 %rd269, %rd319, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526}, %rd268, %rd269, %p55, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3468, 0; + mov.b32 %r2912, %r3468; + mov.b32 %r2913, %r3468; + mov.b32 %r2915, %r3468; + mov.b32 %r2916, %r3468; + mov.b32 %r2914, %r2952; + // begin inline asm + // wait for regs: %r2495,%r2496,%r2497,%r2498,%r2499,%r2500,%r2501,%r2502,%r2503,%r2504,%r2505,%r2506,%r2507,%r2508,%r2509,%r2510,%r2511,%r2512,%r2513,%r2514,%r2515,%r2516,%r2517,%r2518,%r2519,%r2520,%r2521,%r2522,%r2523,%r2524,%r2525,%r2526,%r2911,%r2912,%r2913,%r2914,%r2915,%r2916 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:461:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4101, %r2495, 0f3DB504F3; + mul.f32 %r4102, %r2496, 0f3DB504F3; + mul.f32 %r4103, %r2497, 0f3DB504F3; + mul.f32 %r4104, %r2498, 0f3DB504F3; + mul.f32 %r4105, %r2499, 0f3DB504F3; + mul.f32 %r4106, %r2500, 0f3DB504F3; + mul.f32 %r4107, %r2501, 0f3DB504F3; + mul.f32 %r4108, %r2502, 0f3DB504F3; + mul.f32 %r4109, %r2503, 0f3DB504F3; + mul.f32 %r4110, %r2504, 0f3DB504F3; + mul.f32 %r4111, %r2505, 0f3DB504F3; + mul.f32 %r4112, %r2506, 0f3DB504F3; + mul.f32 %r4113, %r2507, 0f3DB504F3; + mul.f32 %r4114, %r2508, 0f3DB504F3; + mul.f32 %r4115, %r2509, 0f3DB504F3; + mul.f32 %r4116, %r2510, 0f3DB504F3; + mul.f32 %r4117, %r2511, 0f3DB504F3; + mul.f32 %r4118, %r2512, 0f3DB504F3; + mul.f32 %r4119, %r2513, 0f3DB504F3; + mul.f32 %r4120, %r2514, 0f3DB504F3; + mul.f32 %r4121, %r2515, 0f3DB504F3; + mul.f32 %r4122, %r2516, 0f3DB504F3; + mul.f32 %r4123, %r2517, 0f3DB504F3; + mul.f32 %r4124, %r2518, 0f3DB504F3; + mul.f32 %r4125, %r2519, 0f3DB504F3; + mul.f32 %r4126, %r2520, 0f3DB504F3; + mul.f32 %r4127, %r2521, 0f3DB504F3; + mul.f32 %r4128, %r2522, 0f3DB504F3; + mul.f32 %r4129, %r2523, 0f3DB504F3; + mul.f32 %r4130, %r2524, 0f3DB504F3; + mul.f32 %r4131, %r2525, 0f3DB504F3; + mul.f32 %r4132, %r2526, 0f3DB504F3; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4133, %r13769, %r2123; + rem.s32 %r4134, %r13768, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p93, %r4134, %r89; + setp.le.s32 %p94, %r4133, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p95, %p2, %p94; + and.pred %p96, %p2, %p93; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p97, %r4133, 2047; + setp.gt.s32 %p98, %r4134, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4135, %r4134, %r4133, 0x5410U; + and.b32 %r4136, %r4135, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs1, %rs2}, %r4136; + cvt.u64.u16 %rd320, %rs1; + cvt.u64.u16 %rd321, %rs2; + setp.gt.s64 %p99, %rd198, %rd321; + setp.gt.s64 %p100, %rd198, %rd320; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p101, %p98, %p100; + and.pred %p102, %p97, %p99; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4137, %r4134, %r89; + sub.s32 %r4138, %r4133, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4139, %r4138, 2047; + and.b32 %r4140, %r4137, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p103, %r4140, 0; + setp.eq.b32 %p104, %r4139, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p105, %p104, %p102; + and.pred %p106, %p103, %p101; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p107, %p96, %p106; + or.pred %p108, %p95, %p105; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p109, %r4134, %r93; + setp.le.s32 %p110, %r4133, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p111, %p4, %p110; + and.pred %p112, %p4, %p109; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4141, %r4134, %r93; + sub.s32 %r4142, %r4133, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4143, %r4142, 2047; + and.b32 %r4144, %r4141, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p113, %r4144, 0; + setp.eq.b32 %p114, %r4143, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p115, %p114, %p102; + and.pred %p116, %p113, %p101; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p117, %p112, %p116; + or.pred %p118, %p111, %p115; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p119, %p118, %p78; + and.pred %p120, %p117, %p77; + and.pred %p121, %p108, %p78; + and.pred %p122, %p107, %p77; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4145, %r13767, %r2123; + rem.s32 %r4146, %r13766, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p123, %r4146, %r89; + setp.le.s32 %p124, %r4145, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p125, %p2, %p124; + and.pred %p126, %p2, %p123; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p127, %r4145, 2047; + setp.gt.s32 %p128, %r4146, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4147, %r4146, %r4145, 0x5410U; + and.b32 %r4148, %r4147, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs3, %rs4}, %r4148; + cvt.u64.u16 %rd322, %rs3; + cvt.u64.u16 %rd323, %rs4; + setp.gt.s64 %p129, %rd198, %rd323; + setp.gt.s64 %p130, %rd198, %rd322; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p131, %p128, %p130; + and.pred %p132, %p127, %p129; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4149, %r4146, %r89; + sub.s32 %r4150, %r4145, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4151, %r4150, 2047; + and.b32 %r4152, %r4149, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p133, %r4152, 0; + setp.eq.b32 %p134, %r4151, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p135, %p134, %p132; + and.pred %p136, %p133, %p131; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p137, %p126, %p136; + or.pred %p138, %p125, %p135; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p139, %r4146, %r93; + setp.le.s32 %p140, %r4145, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p141, %p4, %p140; + and.pred %p142, %p4, %p139; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4153, %r4146, %r93; + sub.s32 %r4154, %r4145, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4155, %r4154, 2047; + and.b32 %r4156, %r4153, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p143, %r4156, 0; + setp.eq.b32 %p144, %r4155, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p145, %p144, %p132; + and.pred %p146, %p143, %p131; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p147, %p142, %p146; + or.pred %p148, %p141, %p145; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p149, %p148, %p80; + and.pred %p150, %p147, %p79; + and.pred %p151, %p138, %p80; + and.pred %p152, %p137, %p79; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4157, %r13765, %r2123; + rem.s32 %r4158, %r13764, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p153, %r4158, %r89; + setp.le.s32 %p154, %r4157, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p155, %p2, %p154; + and.pred %p156, %p2, %p153; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p157, %r4157, 2047; + setp.gt.s32 %p158, %r4158, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4159, %r4158, %r4157, 0x5410U; + and.b32 %r4160, %r4159, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs5, %rs6}, %r4160; + cvt.u64.u16 %rd324, %rs5; + cvt.u64.u16 %rd325, %rs6; + setp.gt.s64 %p159, %rd198, %rd325; + setp.gt.s64 %p160, %rd198, %rd324; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p161, %p158, %p160; + and.pred %p162, %p157, %p159; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4161, %r4158, %r89; + sub.s32 %r4162, %r4157, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4163, %r4162, 2047; + and.b32 %r4164, %r4161, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p163, %r4164, 0; + setp.eq.b32 %p164, %r4163, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p165, %p164, %p162; + and.pred %p166, %p163, %p161; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p167, %p156, %p166; + or.pred %p168, %p155, %p165; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p169, %r4158, %r93; + setp.le.s32 %p170, %r4157, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p171, %p4, %p170; + and.pred %p172, %p4, %p169; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4165, %r4158, %r93; + sub.s32 %r4166, %r4157, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4167, %r4166, 2047; + and.b32 %r4168, %r4165, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p173, %r4168, 0; + setp.eq.b32 %p174, %r4167, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p175, %p174, %p162; + and.pred %p176, %p173, %p161; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p177, %p172, %p176; + or.pred %p178, %p171, %p175; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p179, %p178, %p82; + and.pred %p180, %p177, %p81; + and.pred %p181, %p168, %p82; + and.pred %p182, %p167, %p81; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4169, %r13763, %r2123; + rem.s32 %r4170, %r13762, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p183, %r4170, %r89; + setp.le.s32 %p184, %r4169, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p185, %p2, %p184; + and.pred %p186, %p2, %p183; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p187, %r4169, 2047; + setp.gt.s32 %p188, %r4170, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4171, %r4170, %r4169, 0x5410U; + and.b32 %r4172, %r4171, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs7, %rs8}, %r4172; + cvt.u64.u16 %rd326, %rs7; + cvt.u64.u16 %rd327, %rs8; + setp.gt.s64 %p189, %rd198, %rd327; + setp.gt.s64 %p190, %rd198, %rd326; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p191, %p188, %p190; + and.pred %p192, %p187, %p189; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4173, %r4170, %r89; + sub.s32 %r4174, %r4169, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4175, %r4174, 2047; + and.b32 %r4176, %r4173, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p193, %r4176, 0; + setp.eq.b32 %p194, %r4175, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p195, %p194, %p192; + and.pred %p196, %p193, %p191; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p197, %p186, %p196; + or.pred %p198, %p185, %p195; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p199, %r4170, %r93; + setp.le.s32 %p200, %r4169, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p201, %p4, %p200; + and.pred %p202, %p4, %p199; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4177, %r4170, %r93; + sub.s32 %r4178, %r4169, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4179, %r4178, 2047; + and.b32 %r4180, %r4177, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p203, %r4180, 0; + setp.eq.b32 %p204, %r4179, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p205, %p204, %p192; + and.pred %p206, %p203, %p191; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p207, %p202, %p206; + or.pred %p208, %p201, %p205; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p209, %p208, %p84; + and.pred %p210, %p207, %p83; + and.pred %p211, %p198, %p84; + and.pred %p212, %p197, %p83; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4181, %r13761, %r2123; + rem.s32 %r4182, %r13760, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p213, %r4182, %r89; + setp.le.s32 %p214, %r4181, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p215, %p2, %p214; + and.pred %p216, %p2, %p213; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p217, %r4181, 2047; + setp.gt.s32 %p218, %r4182, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4183, %r4182, %r4181, 0x5410U; + and.b32 %r4184, %r4183, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs9, %rs10}, %r4184; + cvt.u64.u16 %rd328, %rs9; + cvt.u64.u16 %rd329, %rs10; + setp.gt.s64 %p219, %rd198, %rd329; + setp.gt.s64 %p220, %rd198, %rd328; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p221, %p218, %p220; + and.pred %p222, %p217, %p219; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4185, %r4182, %r89; + sub.s32 %r4186, %r4181, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4187, %r4186, 2047; + and.b32 %r4188, %r4185, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p223, %r4188, 0; + setp.eq.b32 %p224, %r4187, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p225, %p224, %p222; + and.pred %p226, %p223, %p221; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p227, %p216, %p226; + or.pred %p228, %p215, %p225; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p229, %r4182, %r93; + setp.le.s32 %p230, %r4181, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p231, %p4, %p230; + and.pred %p232, %p4, %p229; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4189, %r4182, %r93; + sub.s32 %r4190, %r4181, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4191, %r4190, 2047; + and.b32 %r4192, %r4189, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p233, %r4192, 0; + setp.eq.b32 %p234, %r4191, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p235, %p234, %p222; + and.pred %p236, %p233, %p221; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p237, %p232, %p236; + or.pred %p238, %p231, %p235; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p239, %p238, %p86; + and.pred %p240, %p237, %p85; + and.pred %p241, %p228, %p86; + and.pred %p242, %p227, %p85; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4193, %r13759, %r2123; + rem.s32 %r4194, %r13758, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p243, %r4194, %r89; + setp.le.s32 %p244, %r4193, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p245, %p2, %p244; + and.pred %p246, %p2, %p243; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p247, %r4193, 2047; + setp.gt.s32 %p248, %r4194, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4195, %r4194, %r4193, 0x5410U; + and.b32 %r4196, %r4195, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs11, %rs12}, %r4196; + cvt.u64.u16 %rd330, %rs11; + cvt.u64.u16 %rd331, %rs12; + setp.gt.s64 %p249, %rd198, %rd331; + setp.gt.s64 %p250, %rd198, %rd330; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p251, %p248, %p250; + and.pred %p252, %p247, %p249; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4197, %r4194, %r89; + sub.s32 %r4198, %r4193, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4199, %r4198, 2047; + and.b32 %r4200, %r4197, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p253, %r4200, 0; + setp.eq.b32 %p254, %r4199, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p255, %p254, %p252; + and.pred %p256, %p253, %p251; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p257, %p246, %p256; + or.pred %p258, %p245, %p255; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p259, %r4194, %r93; + setp.le.s32 %p260, %r4193, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p261, %p4, %p260; + and.pred %p262, %p4, %p259; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4201, %r4194, %r93; + sub.s32 %r4202, %r4193, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4203, %r4202, 2047; + and.b32 %r4204, %r4201, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p263, %r4204, 0; + setp.eq.b32 %p264, %r4203, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p265, %p264, %p252; + and.pred %p266, %p263, %p251; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p267, %p262, %p266; + or.pred %p268, %p261, %p265; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p269, %p268, %p88; + and.pred %p270, %p267, %p87; + and.pred %p271, %p258, %p88; + and.pred %p272, %p257, %p87; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4205, %r13757, %r2123; + rem.s32 %r4206, %r13756, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p273, %r4206, %r89; + setp.le.s32 %p274, %r4205, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p275, %p2, %p274; + and.pred %p276, %p2, %p273; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p277, %r4205, 2047; + setp.gt.s32 %p278, %r4206, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4207, %r4206, %r4205, 0x5410U; + and.b32 %r4208, %r4207, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs13, %rs14}, %r4208; + cvt.u64.u16 %rd332, %rs13; + cvt.u64.u16 %rd333, %rs14; + setp.gt.s64 %p279, %rd198, %rd333; + setp.gt.s64 %p280, %rd198, %rd332; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p281, %p278, %p280; + and.pred %p282, %p277, %p279; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4209, %r4206, %r89; + sub.s32 %r4210, %r4205, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4211, %r4210, 2047; + and.b32 %r4212, %r4209, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p283, %r4212, 0; + setp.eq.b32 %p284, %r4211, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p285, %p284, %p282; + and.pred %p286, %p283, %p281; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p287, %p276, %p286; + or.pred %p288, %p275, %p285; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p289, %r4206, %r93; + setp.le.s32 %p290, %r4205, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p291, %p4, %p290; + and.pred %p292, %p4, %p289; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4213, %r4206, %r93; + sub.s32 %r4214, %r4205, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4215, %r4214, 2047; + and.b32 %r4216, %r4213, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p293, %r4216, 0; + setp.eq.b32 %p294, %r4215, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p295, %p294, %p282; + and.pred %p296, %p293, %p281; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p297, %p292, %p296; + or.pred %p298, %p291, %p295; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p299, %p298, %p90; + and.pred %p300, %p297, %p89; + and.pred %p301, %p288, %p90; + and.pred %p302, %p287, %p89; + .loc 1 798 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:798:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + rem.s32 %r4217, %r13755, %r2123; + rem.s32 %r4218, %r13754, %r2123; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p303, %r4218, %r89; + setp.le.s32 %p304, %r4217, %r89; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p305, %p2, %p304; + and.pred %p306, %p2, %p303; + .loc 1 493 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:493:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.gt.s32 %p307, %r4217, 2047; + setp.gt.s32 %p308, %r4218, 2047; + .loc 1 503 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:503:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + prmt.b32 %r4219, %r4218, %r4217, 0x5410U; + and.b32 %r4220, %r4219, 134154239; + .loc 1 504 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:504:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b32 {%rs15, %rs16}, %r4220; + cvt.u64.u16 %rd334, %rs15; + cvt.u64.u16 %rd335, %rs16; + setp.gt.s64 %p309, %rd198, %rd335; + setp.gt.s64 %p310, %rd198, %rd334; + .loc 1 505 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:505:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p311, %p308, %p310; + and.pred %p312, %p307, %p309; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4221, %r4218, %r89; + sub.s32 %r4222, %r4217, %r89; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4223, %r4222, 2047; + and.b32 %r4224, %r4221, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p313, %r4224, 0; + setp.eq.b32 %p314, %r4223, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p315, %p314, %p312; + and.pred %p316, %p313, %p311; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p317, %p306, %p316; + or.pred %p318, %p305, %p315; + .loc 1 482 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:482:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.le.s32 %p319, %r4218, %r93; + setp.le.s32 %p320, %r4217, %r93; + .loc 1 490 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:490:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p321, %p4, %p320; + and.pred %p322, %p4, %p319; + .loc 1 506 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:506:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4225, %r4218, %r93; + sub.s32 %r4226, %r4217, %r93; + .loc 1 513 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:513:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4227, %r4226, 2047; + and.b32 %r4228, %r4225, 2047; + .loc 1 514 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:514:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.eq.b32 %p323, %r4228, 0; + setp.eq.b32 %p324, %r4227, 0; + .loc 1 515 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:515:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p325, %p324, %p312; + and.pred %p326, %p323, %p311; + .loc 1 516 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:516:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + or.pred %p327, %p322, %p326; + or.pred %p328, %p321, %p325; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p329, %p328, %p92; + and.pred %p330, %p327, %p91; + and.pred %p331, %p318, %p92; + and.pred %p332, %p317, %p91; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4229, %r4101, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4230, %r4229, 0fFF800000, %p120; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4231, %r4102, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4232, %r4231, 0fFF800000, %p119; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4233, %r4103, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4234, %r4233, 0fFF800000, %p122; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4235, %r4104, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4236, %r4235, 0fFF800000, %p121; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4237, %r4105, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4238, %r4237, 0fFF800000, %p150; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4239, %r4106, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4240, %r4239, 0fFF800000, %p149; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4241, %r4107, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4242, %r4241, 0fFF800000, %p152; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4243, %r4108, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4244, %r4243, 0fFF800000, %p151; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4245, %r4109, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4246, %r4245, 0fFF800000, %p180; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4247, %r4110, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4248, %r4247, 0fFF800000, %p179; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4249, %r4111, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4250, %r4249, 0fFF800000, %p182; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4251, %r4112, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4252, %r4251, 0fFF800000, %p181; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4253, %r4113, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4254, %r4253, 0fFF800000, %p210; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4255, %r4114, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4256, %r4255, 0fFF800000, %p209; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4257, %r4115, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4258, %r4257, 0fFF800000, %p212; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4259, %r4116, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4260, %r4259, 0fFF800000, %p211; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4261, %r4117, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4262, %r4261, 0fFF800000, %p240; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4263, %r4118, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4264, %r4263, 0fFF800000, %p239; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4265, %r4119, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4266, %r4265, 0fFF800000, %p242; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4267, %r4120, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4268, %r4267, 0fFF800000, %p241; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4269, %r4121, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4270, %r4269, 0fFF800000, %p270; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4271, %r4122, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4272, %r4271, 0fFF800000, %p269; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4273, %r4123, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4274, %r4273, 0fFF800000, %p272; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4275, %r4124, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4276, %r4275, 0fFF800000, %p271; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4277, %r4125, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4278, %r4277, 0fFF800000, %p300; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4279, %r4126, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4280, %r4279, 0fFF800000, %p299; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4281, %r4127, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4282, %r4281, 0fFF800000, %p302; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4283, %r4128, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4284, %r4283, 0fFF800000, %p301; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4285, %r4129, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4286, %r4285, 0fFF800000, %p330; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4287, %r4130, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4288, %r4287, 0fFF800000, %p329; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4289, %r4131, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4290, %r4289, 0fFF800000, %p332; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4291, %r4132, 0f3FB8AA3B; + .loc 1 521 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:521:69 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.f32 %r4292, %r4291, 0fFF800000, %p331; + .loc 1 525 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:525:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4293, %r4230, %r43; + sub.f32 %r4294, %r4232, %r43; + sub.f32 %r4295, %r4234, %r44; + sub.f32 %r4296, %r4236, %r44; + sub.f32 %r4297, %r4238, %r43; + sub.f32 %r4298, %r4240, %r43; + sub.f32 %r4299, %r4242, %r44; + sub.f32 %r4300, %r4244, %r44; + sub.f32 %r4301, %r4246, %r43; + sub.f32 %r4302, %r4248, %r43; + sub.f32 %r4303, %r4250, %r44; + sub.f32 %r4304, %r4252, %r44; + sub.f32 %r4305, %r4254, %r43; + sub.f32 %r4306, %r4256, %r43; + sub.f32 %r4307, %r4258, %r44; + sub.f32 %r4308, %r4260, %r44; + sub.f32 %r4309, %r4262, %r43; + sub.f32 %r4310, %r4264, %r43; + sub.f32 %r4311, %r4266, %r44; + sub.f32 %r4312, %r4268, %r44; + sub.f32 %r4313, %r4270, %r43; + sub.f32 %r4314, %r4272, %r43; + sub.f32 %r4315, %r4274, %r44; + sub.f32 %r4316, %r4276, %r44; + sub.f32 %r4317, %r4278, %r43; + sub.f32 %r4318, %r4280, %r43; + sub.f32 %r4319, %r4282, %r44; + sub.f32 %r4320, %r4284, %r44; + sub.f32 %r4321, %r4286, %r43; + sub.f32 %r4322, %r4288, %r43; + sub.f32 %r4323, %r4290, %r44; + sub.f32 %r4324, %r4292, %r44; + .loc 1 525 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:525:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + ex2.approx.ftz.f32 %r4325, %r4293; + ex2.approx.ftz.f32 %r4326, %r4294; + ex2.approx.ftz.f32 %r4327, %r4295; + ex2.approx.ftz.f32 %r4328, %r4296; + ex2.approx.ftz.f32 %r4329, %r4297; + ex2.approx.ftz.f32 %r4330, %r4298; + ex2.approx.ftz.f32 %r4331, %r4299; + ex2.approx.ftz.f32 %r4332, %r4300; + ex2.approx.ftz.f32 %r4333, %r4301; + ex2.approx.ftz.f32 %r4334, %r4302; + ex2.approx.ftz.f32 %r4335, %r4303; + ex2.approx.ftz.f32 %r4336, %r4304; + ex2.approx.ftz.f32 %r4337, %r4305; + ex2.approx.ftz.f32 %r4338, %r4306; + ex2.approx.ftz.f32 %r4339, %r4307; + ex2.approx.ftz.f32 %r4340, %r4308; + ex2.approx.ftz.f32 %r4341, %r4309; + ex2.approx.ftz.f32 %r4342, %r4310; + ex2.approx.ftz.f32 %r4343, %r4311; + ex2.approx.ftz.f32 %r4344, %r4312; + ex2.approx.ftz.f32 %r4345, %r4313; + ex2.approx.ftz.f32 %r4346, %r4314; + ex2.approx.ftz.f32 %r4347, %r4315; + ex2.approx.ftz.f32 %r4348, %r4316; + ex2.approx.ftz.f32 %r4349, %r4317; + ex2.approx.ftz.f32 %r4350, %r4318; + ex2.approx.ftz.f32 %r4351, %r4319; + ex2.approx.ftz.f32 %r4352, %r4320; + ex2.approx.ftz.f32 %r4353, %r4321; + ex2.approx.ftz.f32 %r4354, %r4322; + ex2.approx.ftz.f32 %r4355, %r4323; + ex2.approx.ftz.f32 %r4356, %r4324; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r4357, %r2353, 49152; + add.s32 %r3470, %r4357, %r4058; + .loc 1 530 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:530:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + wgmma.fence.sync.aligned; + add.s32 %r3467, %r2353, 131072; + add.s32 %r4358, %r4062, %r3467; + bfe.u32 %r4359, %r4358, 4, 14; + cvt.u64.u32 %rd336, %r4359; + or.b64 %rd270, %rd336, 4611686293372403712; + bfe.u32 %r4360, %r3470, 4, 14; + cvt.u64.u32 %rd337, %r4360; + or.b64 %rd271, %rd337, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd270, %rd271, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4361, %r4066, %r3467; + bfe.u32 %r4362, %r4361, 4, 14; + cvt.u64.u32 %rd338, %r4362; + or.b64 %rd272, %rd338, 4611686293372403712; + add.s32 %r4363, %r3470, 32; + bfe.u32 %r4364, %r4363, 4, 14; + cvt.u64.u32 %rd339, %r4364; + or.b64 %rd273, %rd339, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd272, %rd273, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4365, %r4071, %r3467; + bfe.u32 %r4366, %r4365, 4, 14; + cvt.u64.u32 %rd340, %r4366; + or.b64 %rd274, %rd340, 4611686293372403712; + add.s32 %r4367, %r3470, 64; + bfe.u32 %r4368, %r4367, 4, 14; + cvt.u64.u32 %rd341, %r4368; + or.b64 %rd275, %rd341, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd274, %rd275, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4369, %r4076, %r3467; + bfe.u32 %r4370, %r4369, 4, 14; + cvt.u64.u32 %rd342, %r4370; + or.b64 %rd276, %rd342, 4611686293372403712; + add.s32 %r4371, %r3470, 96; + bfe.u32 %r4372, %r4371, 4, 14; + cvt.u64.u32 %rd343, %r4372; + or.b64 %rd277, %rd343, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd276, %rd277, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4373, %r4081, %r3467; + bfe.u32 %r4374, %r4373, 4, 14; + cvt.u64.u32 %rd344, %r4374; + or.b64 %rd278, %rd344, 4611686293372403712; + add.s32 %r4375, %r3470, 8192; + bfe.u32 %r4376, %r4375, 4, 14; + cvt.u64.u32 %rd345, %r4376; + or.b64 %rd279, %rd345, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd278, %rd279, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4377, %r4086, %r3467; + bfe.u32 %r4378, %r4377, 4, 14; + cvt.u64.u32 %rd346, %r4378; + or.b64 %rd280, %rd346, 4611686293372403712; + add.s32 %r4379, %r3470, 8224; + bfe.u32 %r4380, %r4379, 4, 14; + cvt.u64.u32 %rd347, %r4380; + or.b64 %rd281, %rd347, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd280, %rd281, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4381, %r4091, %r3467; + bfe.u32 %r4382, %r4381, 4, 14; + cvt.u64.u32 %rd348, %r4382; + or.b64 %rd282, %rd348, 4611686293372403712; + add.s32 %r4383, %r3470, 8256; + bfe.u32 %r4384, %r4383, 4, 14; + cvt.u64.u32 %rd349, %r4384; + or.b64 %rd283, %rd349, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd282, %rd283, %p55, 1, 1, 0, 0; + // end inline asm + add.s32 %r4385, %r4096, %r3467; + bfe.u32 %r4386, %r4385, 4, 14; + cvt.u64.u32 %rd350, %r4386; + or.b64 %rd284, %rd350, 4611686293372403712; + add.s32 %r4387, %r3470, 8288; + bfe.u32 %r4388, %r4387, 4, 14; + cvt.u64.u32 %rd351, %r4388; + or.b64 %rd285, %rd351, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082}, %rd284, %rd285, %p55, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3469, %r3468; + mov.b32 %r3471, %r3468; + mov.b32 %r3472, %r3468; + // begin inline asm + // wait for regs: %r3051,%r3052,%r3053,%r3054,%r3055,%r3056,%r3057,%r3058,%r3059,%r3060,%r3061,%r3062,%r3063,%r3064,%r3065,%r3066,%r3067,%r3068,%r3069,%r3070,%r3071,%r3072,%r3073,%r3074,%r3075,%r3076,%r3077,%r3078,%r3079,%r3080,%r3081,%r3082,%r3467,%r3468,%r3469,%r3470,%r3471,%r3472 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b64 {%r4389, %r4390}, %rd28; + sub.f32 %r4391, %r3051, %r4389; + sub.f32 %r4392, %r3052, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4393, %r4326, %r4392; + mul.f32 %r4394, %r4325, %r4391; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs17, %r4394; + cvt.rn.bf16.f32 %rs18, %r4393; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs19, %rs18, 0x0000, %p119; + selp.b16 %rs20, %rs17, 0x0000, %p120; + mov.b32 %r3639, {%rs20, %rs19}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mov.b64 {%r4395, %r4396}, %rd27; + sub.f32 %r4397, %r3053, %r4395; + sub.f32 %r4398, %r3054, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4399, %r4328, %r4398; + mul.f32 %r4400, %r4327, %r4397; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs21, %r4400; + cvt.rn.bf16.f32 %rs22, %r4399; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs23, %rs22, 0x0000, %p121; + selp.b16 %rs24, %rs21, 0x0000, %p122; + mov.b32 %r3640, {%rs24, %rs23}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4401, %r3055, %r4389; + sub.f32 %r4402, %r3056, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4403, %r4330, %r4402; + mul.f32 %r4404, %r4329, %r4401; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs25, %r4404; + cvt.rn.bf16.f32 %rs26, %r4403; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs27, %rs26, 0x0000, %p149; + selp.b16 %rs28, %rs25, 0x0000, %p150; + mov.b32 %r3641, {%rs28, %rs27}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4405, %r3057, %r4395; + sub.f32 %r4406, %r3058, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4407, %r4332, %r4406; + mul.f32 %r4408, %r4331, %r4405; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs29, %r4408; + cvt.rn.bf16.f32 %rs30, %r4407; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs31, %rs30, 0x0000, %p151; + selp.b16 %rs32, %rs29, 0x0000, %p152; + mov.b32 %r3642, {%rs32, %rs31}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4409, %r3059, %r4389; + sub.f32 %r4410, %r3060, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4411, %r4334, %r4410; + mul.f32 %r4412, %r4333, %r4409; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs33, %r4412; + cvt.rn.bf16.f32 %rs34, %r4411; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs35, %rs34, 0x0000, %p179; + selp.b16 %rs36, %rs33, 0x0000, %p180; + mov.b32 %r3771, {%rs36, %rs35}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4413, %r3061, %r4395; + sub.f32 %r4414, %r3062, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4415, %r4336, %r4414; + mul.f32 %r4416, %r4335, %r4413; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs37, %r4416; + cvt.rn.bf16.f32 %rs38, %r4415; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs39, %rs38, 0x0000, %p181; + selp.b16 %rs40, %rs37, 0x0000, %p182; + mov.b32 %r3772, {%rs40, %rs39}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4417, %r3063, %r4389; + sub.f32 %r4418, %r3064, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4419, %r4338, %r4418; + mul.f32 %r4420, %r4337, %r4417; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs41, %r4420; + cvt.rn.bf16.f32 %rs42, %r4419; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs43, %rs42, 0x0000, %p209; + selp.b16 %rs44, %rs41, 0x0000, %p210; + mov.b32 %r3773, {%rs44, %rs43}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4421, %r3065, %r4395; + sub.f32 %r4422, %r3066, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4423, %r4340, %r4422; + mul.f32 %r4424, %r4339, %r4421; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs45, %r4424; + cvt.rn.bf16.f32 %rs46, %r4423; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs47, %rs46, 0x0000, %p211; + selp.b16 %rs48, %rs45, 0x0000, %p212; + mov.b32 %r3774, {%rs48, %rs47}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4425, %r3067, %r4389; + sub.f32 %r4426, %r3068, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4427, %r4342, %r4426; + mul.f32 %r4428, %r4341, %r4425; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs49, %r4428; + cvt.rn.bf16.f32 %rs50, %r4427; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs51, %rs50, 0x0000, %p239; + selp.b16 %rs52, %rs49, 0x0000, %p240; + mov.b32 %r3903, {%rs52, %rs51}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4429, %r3069, %r4395; + sub.f32 %r4430, %r3070, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4431, %r4344, %r4430; + mul.f32 %r4432, %r4343, %r4429; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs53, %r4432; + cvt.rn.bf16.f32 %rs54, %r4431; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs55, %rs54, 0x0000, %p241; + selp.b16 %rs56, %rs53, 0x0000, %p242; + mov.b32 %r3904, {%rs56, %rs55}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4433, %r3071, %r4389; + sub.f32 %r4434, %r3072, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4435, %r4346, %r4434; + mul.f32 %r4436, %r4345, %r4433; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs57, %r4436; + cvt.rn.bf16.f32 %rs58, %r4435; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs59, %rs58, 0x0000, %p269; + selp.b16 %rs60, %rs57, 0x0000, %p270; + mov.b32 %r3905, {%rs60, %rs59}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4437, %r3073, %r4395; + sub.f32 %r4438, %r3074, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4439, %r4348, %r4438; + mul.f32 %r4440, %r4347, %r4437; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs61, %r4440; + cvt.rn.bf16.f32 %rs62, %r4439; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs63, %rs62, 0x0000, %p271; + selp.b16 %rs64, %rs61, 0x0000, %p272; + mov.b32 %r3906, {%rs64, %rs63}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4441, %r3075, %r4389; + sub.f32 %r4442, %r3076, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4443, %r4350, %r4442; + mul.f32 %r4444, %r4349, %r4441; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs65, %r4444; + cvt.rn.bf16.f32 %rs66, %r4443; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs67, %rs66, 0x0000, %p299; + selp.b16 %rs68, %rs65, 0x0000, %p300; + mov.b32 %r4035, {%rs68, %rs67}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4445, %r3077, %r4395; + sub.f32 %r4446, %r3078, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4447, %r4352, %r4446; + mul.f32 %r4448, %r4351, %r4445; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs69, %r4448; + cvt.rn.bf16.f32 %rs70, %r4447; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs71, %rs70, 0x0000, %p301; + selp.b16 %rs72, %rs69, 0x0000, %p302; + mov.b32 %r4036, {%rs72, %rs71}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4449, %r3079, %r4389; + sub.f32 %r4450, %r3080, %r4390; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4451, %r4354, %r4450; + mul.f32 %r4452, %r4353, %r4449; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs73, %r4452; + cvt.rn.bf16.f32 %rs74, %r4451; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs75, %rs74, 0x0000, %p329; + selp.b16 %rs76, %rs73, 0x0000, %p330; + mov.b32 %r4037, {%rs76, %rs75}; + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.f32 %r4453, %r3081, %r4395; + sub.f32 %r4454, %r3082, %r4396; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.f32 %r4455, %r4356, %r4454; + mul.f32 %r4456, %r4355, %r4453; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + cvt.rn.bf16.f32 %rs77, %r4456; + cvt.rn.bf16.f32 %rs78, %r4455; + .loc 1 549 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:549:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + selp.b16 %rs79, %rs78, 0x0000, %p331; + selp.b16 %rs80, %rs77, 0x0000, %p332; + mov.b32 %r4038, {%rs80, %rs79}; + .loc 1 553 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:553:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r3639,%r3640,%r3641,%r3642}, %rd255, %p55, 1, 1, 1; + // end inline asm + add.s32 %r4457, %r2952, 2048; + bfe.u32 %r4458, %r4457, 4, 14; + cvt.u64.u32 %rd352, %r4458; + or.b64 %rd287, %rd352, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r3771,%r3772,%r3773,%r3774}, %rd287, %p55, 1, 1, 1; + // end inline asm + add.s32 %r4459, %r2952, 4096; + bfe.u32 %r4460, %r4459, 4, 14; + cvt.u64.u32 %rd353, %r4460; + or.b64 %rd288, %rd353, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r3903,%r3904,%r3905,%r3906}, %rd288, %p55, 1, 1, 1; + // end inline asm + add.s32 %r4461, %r2952, 6144; + bfe.u32 %r4462, %r4461, 4, 14; + cvt.u64.u32 %rd354, %r4462; + or.b64 %rd289, %rd354, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r4035,%r4036,%r4037,%r4038}, %rd289, %p55, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r13768, %r13682, %r13768; + add.s32 %r13769, %r13682, %r13769; + add.s32 %r13766, %r13682, %r13766; + add.s32 %r13767, %r13682, %r13767; + add.s32 %r13764, %r13682, %r13764; + add.s32 %r13765, %r13682, %r13765; + add.s32 %r13762, %r13682, %r13762; + add.s32 %r13763, %r13682, %r13763; + add.s32 %r13760, %r13682, %r13760; + add.s32 %r13761, %r13682, %r13761; + add.s32 %r13758, %r13682, %r13758; + add.s32 %r13759, %r13682, %r13759; + add.s32 %r13756, %r13682, %r13756; + add.s32 %r13757, %r13682, %r13757; + add.s32 %r13754, %r13682, %r13754; + add.s32 %r13755, %r13682, %r13755; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r263, %r13753, 1; + .loc 1 788 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:788:33 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shr.u32 %r4463, %r263, 1; + .loc 1 789 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mad.wide.u32 %rd291, %r4463, 4, %rd196; + .loc 1 789 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + // begin inline asm + mov.u64 %rd290, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd290, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4039, 0x0; + @%p73 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4039 }, [ %rd291 + 0 ], %rd290; + // end inline asm + .loc 1 790 109 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:109 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r4464, %r4463, 1; + .loc 1 790 113 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:113 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p333, %r4464, %r2268; + .loc 1 790 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:55 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd294, %rd291, 4; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.pred %p74, %p73, %p333; + .loc 1 790 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + // begin inline asm + mov.u64 %rd293, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd293, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4040, 0x0; + @%p74 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4040 }, [ %rd294 + 0 ], %rd293; + // end inline asm + .loc 1 791 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:791:35 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + and.b32 %r4465, %r13753, 1; + .loc 1 792 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:34 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + sub.s32 %r4466, %r4040, %r4039; + .loc 1 792 48 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:48 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r4467, %r4466, 7; + .loc 1 792 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r4468, %r4467, -64; + .loc 1 793 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + xor.b32 %r4469, %r4465, 1; + .loc 1 793 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r4470, %r4465, 6; + .loc 1 793 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mad.lo.s32 %r13682, %r4468, %r4469, %r4470; + .loc 1 414 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r4471, %r13682, 7; + .loc 1 414 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + mul.wide.s32 %rd355, %r4471, 2; + add.s64 %rd996, %rd996, %rd355; + add.s64 %rd995, %rd995, %rd355; + add.s64 %rd994, %rd994, %rd355; + add.s64 %rd993, %rd993, %rd355; + .loc 1 415 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:415:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s64 %rd992, %rd992, %rd355; + add.s64 %rd991, %rd991, %rd355; + add.s64 %rd990, %rd990, %rd355; + add.s64 %rd989, %rd989, %rd355; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r13688, %r13682, %r13688; + add.s32 %r13687, %r13682, %r13687; + add.s32 %r13686, %r13682, %r13686; + add.s32 %r13685, %r13682, %r13685; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + add.s32 %r4472, %r13684, 1; + setp.gt.s32 %p334, %r4472, 2; + selp.b32 %r13684, 0, %r4472, %p334; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.lt.s32 %p335, %r13688, %r2123; + setp.lt.s32 %p336, %r13687, %r2123; + setp.lt.s32 %p337, %r13686, %r2123; + setp.lt.s32 %p338, %r13685, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + shl.b32 %r4473, %r13684, 14; + add.s32 %r4474, %r2353, %r4473; + bar.sync 0; + add.s32 %r4041, %r4474, %r48; + selp.b32 %r4475, 16, 0, %p335; + selp.b32 %r4050, %r4475, 0, %p75; + // begin inline asm + cp.async.cg.shared.global [ %r4041 + 0 ], [ %rd996 + 0 ], 0x10, %r4050; + // end inline asm + add.s32 %r4043, %r4041, 2048; + selp.b32 %r4476, 16, 0, %p336; + selp.b32 %r4052, %r4476, 0, %p75; + // begin inline asm + cp.async.cg.shared.global [ %r4043 + 0 ], [ %rd995 + 0 ], 0x10, %r4052; + // end inline asm + add.s32 %r4045, %r4041, 4096; + selp.b32 %r4477, 16, 0, %p337; + selp.b32 %r4054, %r4477, 0, %p75; + // begin inline asm + cp.async.cg.shared.global [ %r4045 + 0 ], [ %rd994 + 0 ], 0x10, %r4054; + // end inline asm + add.s32 %r4047, %r4041, 6144; + selp.b32 %r4478, 16, 0, %p338; + selp.b32 %r4056, %r4478, 0, %p75; + // begin inline asm + cp.async.cg.shared.global [ %r4047 + 0 ], [ %rd993 + 0 ], 0x10, %r4056; + // end inline asm + cp.async.commit_group; + add.s32 %r4479, %r4357, %r4473; + add.s32 %r4049, %r4479, %r48; + // begin inline asm + cp.async.cg.shared.global [ %r4049 + 0 ], [ %rd992 + 0 ], 0x10, %r4050; + // end inline asm + add.s32 %r4051, %r4049, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4051 + 0 ], [ %rd991 + 0 ], 0x10, %r4052; + // end inline asm + add.s32 %r4053, %r4049, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4053 + 0 ], [ %rd990 + 0 ], 0x10, %r4054; + // end inline asm + add.s32 %r4055, %r4049, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4055 + 0 ], [ %rd989 + 0 ], 0x10, %r4056; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + setp.ne.b32 %p339, %r87, %r263; + mov.b32 %r13753, %r263; + @%p339 bra $L__BB0_3; +$L__tmp30: +$L__BB0_4: // %._crit_edge + .loc 1 0 0 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:0 + add.s64 %rd4, %rd161, %rd216; +$L__tmp31: + cvt.s64.s32 %rd5, %r2334; + cvt.s64.s32 %rd6, %r2335; + cvt.s64.s32 %rd7, %r2336; + cvt.s64.s32 %rd8, %r2337; + cvt.s64.s32 %rd9, %r2338; + cvt.s64.s32 %rd10, %r2339; + cvt.s64.s32 %rd11, %r2340; + cvt.s64.s32 %rd12, %r2341; +$L__tmp32: + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:207:12 ] + // begin inline asm + // wait for regs: %r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp33: + .loc 1 214 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:214:39 + shl.b64 %rd374, %rd14, 2; + add.s64 %rd356, %rd167, %rd374; + .loc 1 215 31 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:215:31 + // begin inline asm + mov.u32 %r4608, 0x0; + ld.global.b32 { %r4608 }, [ %rd356 + 0 ]; + // end inline asm + .loc 1 215 45 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:215:45 + shl.b32 %r334, %r4608, 7; + .loc 1 216 62 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:216:62 + shl.b64 %rd375, %rd16, 2; + add.s64 %rd357, %rd166, %rd375; + .loc 1 216 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:216:43 + // begin inline asm + mov.u32 %r4609, 0x0; + ld.global.b32 { %r4609 }, [ %rd357 + 0 ]; + // end inline asm + .loc 1 218 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:218:33 + or.b32 %r4642, %r334, %r10; + or.b32 %r4643, %r334, %r11; + or.b32 %r4644, %r334, %r12; + or.b32 %r4645, %r334, %r13; +$L__tmp34: + .loc 1 390 37 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:37 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r4646, %r4642, 7; + shl.b32 %r4647, %r4643, 7; + shl.b32 %r4648, %r4644, 7; + shl.b32 %r4649, %r4645, 7; + .loc 1 390 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.wide.s32 %rd376, %r4646, 2; + add.s64 %rd377, %rd1, %rd376; + mul.wide.s32 %rd378, %r4647, 2; + add.s64 %rd379, %rd1, %rd378; + mul.wide.s32 %rd380, %r4648, 2; + add.s64 %rd381, %rd1, %rd380; + mul.wide.s32 %rd382, %r4649, 2; + add.s64 %rd383, %rd1, %rd382; + .loc 1 390 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:390:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b64 %rd384, %rd13, 1; + add.s64 %rd358, %rd377, %rd384; + add.s64 %rd359, %rd379, %rd384; + add.s64 %rd360, %rd381, %rd384; + add.s64 %rd361, %rd383, %rd384; + .loc 1 391 18 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:391:18 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd385, %rd2, %rd376; + add.s64 %rd386, %rd2, %rd378; + add.s64 %rd387, %rd2, %rd380; + add.s64 %rd388, %rd2, %rd382; + .loc 1 391 49 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:391:49 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd362, %rd385, %rd384; + add.s64 %rd363, %rd386, %rd384; + add.s64 %rd364, %rd387, %rd384; + add.s64 %rd365, %rd388, %rd384; + .loc 1 395 43 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:395:43 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r4650, %r4609, 1; + .loc 1 395 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:395:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + min.s32 %r336, %r4650, %r38; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p340, %r4650, 1; + setp.gt.s32 %p341, %r4650, 0; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p342, %r4642, %r2123; + setp.lt.s32 %p343, %r4643, %r2123; + setp.lt.s32 %p344, %r4644, %r2123; + setp.lt.s32 %p345, %r4645, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b32 %r4651, 16, 0, %p342; + selp.b32 %r4619, %r4651, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4610 + 0 ], [ %rd358 + 0 ], 0x10, %r4619; + // end inline asm + selp.b32 %r4652, 16, 0, %p343; + selp.b32 %r4621, %r4652, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4612 + 0 ], [ %rd359 + 0 ], 0x10, %r4621; + // end inline asm + selp.b32 %r4653, 16, 0, %p344; + selp.b32 %r4623, %r4653, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4614 + 0 ], [ %rd360 + 0 ], 0x10, %r4623; + // end inline asm + selp.b32 %r4654, 16, 0, %p345; + selp.b32 %r4625, %r4654, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4616 + 0 ], [ %rd361 + 0 ], 0x10, %r4625; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2277 + 0 ], [ %rd362 + 0 ], 0x10, %r4619; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2279 + 0 ], [ %rd363 + 0 ], 0x10, %r4621; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2281 + 0 ], [ %rd364 + 0 ], 0x10, %r4623; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2283 + 0 ], [ %rd365 + 0 ], 0x10, %r4625; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.gt.s32 %p346, %r336, 1; + .loc 1 414 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd1004, %rd358, 16384; + add.s64 %rd1003, %rd359, 16384; + add.s64 %rd1002, %rd360, 16384; + add.s64 %rd1001, %rd361, 16384; + .loc 1 415 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:415:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd1000, %rd362, 16384; + add.s64 %rd999, %rd363, 16384; + add.s64 %rd998, %rd364, 16384; + add.s64 %rd997, %rd365, 16384; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + or.b32 %r13840, %r4642, 64; + or.b32 %r13839, %r4643, 64; + or.b32 %r13838, %r4644, 64; + or.b32 %r13837, %r4645, 64; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p347, %r13840, %r2123; + setp.lt.s32 %p348, %r13839, %r2123; + setp.lt.s32 %p349, %r13838, %r2123; + setp.lt.s32 %p350, %r13837, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + bar.sync 0; + selp.b32 %r4655, 16, 0, %p347; + selp.b32 %r4635, %r4655, 0, %p346; + // begin inline asm + cp.async.cg.shared.global [ %r2285 + 0 ], [ %rd1004 + 0 ], 0x10, %r4635; + // end inline asm + selp.b32 %r4656, 16, 0, %p348; + selp.b32 %r4637, %r4656, 0, %p346; + // begin inline asm + cp.async.cg.shared.global [ %r2287 + 0 ], [ %rd1003 + 0 ], 0x10, %r4637; + // end inline asm + selp.b32 %r4657, 16, 0, %p349; + selp.b32 %r4639, %r4657, 0, %p346; + // begin inline asm + cp.async.cg.shared.global [ %r2289 + 0 ], [ %rd1002 + 0 ], 0x10, %r4639; + // end inline asm + selp.b32 %r4658, 16, 0, %p350; + selp.b32 %r4641, %r4658, 0, %p346; + // begin inline asm + cp.async.cg.shared.global [ %r2291 + 0 ], [ %rd1001 + 0 ], 0x10, %r4641; + // end inline asm + cp.async.commit_group; + // begin inline asm + cp.async.cg.shared.global [ %r2293 + 0 ], [ %rd1000 + 0 ], 0x10, %r4635; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2295 + 0 ], [ %rd999 + 0 ], 0x10, %r4637; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2297 + 0 ], [ %rd998 + 0 ], 0x10, %r4639; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r2299 + 0 ], [ %rd997 + 0 ], 0x10, %r4641; + // end inline asm + cp.async.commit_group; + .loc 1 459 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:459:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + @%p340 bra $L__BB0_7; +$L__tmp35: +// %bb.5: // %.lr.ph1637 + .loc 1 218 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:218:33 + or.b32 %r13921, %r334, %r22; + or.b32 %r13920, %r334, %r21; + or.b32 %r13919, %r334, %r24; + or.b32 %r13914, %r334, %r26; + or.b32 %r13906, %r334, %r30; + or.b32 %r13918, %r334, %r25; + or.b32 %r13915, %r334, %r27; + or.b32 %r13907, %r334, %r31; + or.b32 %r13916, %r334, %r28; + or.b32 %r13908, %r334, %r32; + or.b32 %r13917, %r334, %r29; + or.b32 %r13909, %r334, %r33; + or.b32 %r13910, %r334, %r34; + or.b32 %r13911, %r334, %r35; + or.b32 %r13912, %r334, %r36; + or.b32 %r13913, %r334, %r37; + add.s32 %r421, %r336, -2; + add.s32 %r422, %r336, -1; +$L__tmp36: + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + max.s32 %r423, %r336, 1; + mov.b32 %r5214, 0; + mov.b32 %r13836, 1; + mov.b32 %r13835, -1; + mov.b32 %r13834, 64; + mov.b32 %r13905, %r5214; +$L__BB0_6: // %__nv_exp2f.exit1383 + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p371, %r13905, %r421; + setp.lt.s32 %p369, %r13905, %r422; + add.s32 %r6321, %r13835, 1; + setp.gt.s32 %p372, %r6321, 2; + selp.b32 %r13835, 0, %r6321, %p372; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p373, %r13921, %r2123; + setp.lt.s32 %p374, %r13920, %r2123; + setp.lt.s32 %p375, %r13919, %r2123; + setp.lt.s32 %p376, %r13918, %r2123; + setp.lt.s32 %p377, %r13917, %r2123; + setp.lt.s32 %p378, %r13916, %r2123; + setp.lt.s32 %p379, %r13915, %r2123; + setp.lt.s32 %p380, %r13914, %r2123; + setp.lt.s32 %p381, %r13913, %r2123; + setp.lt.s32 %p382, %r13912, %r2123; + setp.lt.s32 %p383, %r13911, %r2123; + setp.lt.s32 %p384, %r13910, %r2123; + setp.lt.s32 %p385, %r13909, %r2123; + setp.lt.s32 %p386, %r13908, %r2123; + setp.lt.s32 %p387, %r13907, %r2123; + setp.lt.s32 %p388, %r13906, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r6322, %r13835, 14; + add.s32 %r5216, %r2353, %r6322; + .loc 1 459 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:459:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shfl.sync.idx.b32 %r6324, %r8, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r6325, %r6324, 11; + and.b32 %r6326, %r6325, 8192; + add.s32 %r5175, %r2353, 98304; + add.s32 %r6327, %r6326, %r5175; + bfe.u32 %r6328, %r6327, 4, 14; + cvt.u64.u32 %rd439, %r6328; + or.b64 %rd389, %rd439, 4611686293372403712; + bfe.u32 %r6329, %r5216, 4, 14; + cvt.u64.u32 %rd440, %r6329; + or.b64 %rd390, %rd440, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd389, %rd390, 0, 1, 1, 0, 0; + // end inline asm + or.b32 %r6330, %r6326, 32; + add.s32 %r6331, %r6330, %r5175; + bfe.u32 %r6332, %r6331, 4, 14; + cvt.u64.u32 %rd441, %r6332; + or.b64 %rd391, %rd441, 4611686293372403712; + add.s32 %r6333, %r5216, 32; + bfe.u32 %r6334, %r6333, 4, 14; + cvt.u64.u32 %rd442, %r6334; + or.b64 %rd392, %rd442, 4611686293338849280; + mov.pred %p351, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd391, %rd392, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6335, %r6326, 64; + add.s32 %r6336, %r6335, %r5175; + bfe.u32 %r6337, %r6336, 4, 14; + cvt.u64.u32 %rd443, %r6337; + or.b64 %rd393, %rd443, 4611686293372403712; + add.s32 %r6338, %r5216, 64; + bfe.u32 %r6339, %r6338, 4, 14; + cvt.u64.u32 %rd444, %r6339; + or.b64 %rd394, %rd444, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd393, %rd394, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6340, %r6326, 96; + add.s32 %r6341, %r6340, %r5175; + bfe.u32 %r6342, %r6341, 4, 14; + cvt.u64.u32 %rd445, %r6342; + or.b64 %rd395, %rd445, 4611686293372403712; + add.s32 %r6343, %r5216, 96; + bfe.u32 %r6344, %r6343, 4, 14; + cvt.u64.u32 %rd446, %r6344; + or.b64 %rd396, %rd446, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd395, %rd396, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6345, %r6326, 16384; + add.s32 %r6346, %r6345, %r5175; + bfe.u32 %r6347, %r6346, 4, 14; + cvt.u64.u32 %rd447, %r6347; + or.b64 %rd397, %rd447, 4611686293372403712; + add.s32 %r6348, %r5216, 8192; + bfe.u32 %r6349, %r6348, 4, 14; + cvt.u64.u32 %rd448, %r6349; + or.b64 %rd398, %rd448, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd397, %rd398, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6350, %r6326, 16416; + add.s32 %r6351, %r6350, %r5175; + bfe.u32 %r6352, %r6351, 4, 14; + cvt.u64.u32 %rd449, %r6352; + or.b64 %rd399, %rd449, 4611686293372403712; + add.s32 %r6353, %r5216, 8224; + bfe.u32 %r6354, %r6353, 4, 14; + cvt.u64.u32 %rd450, %r6354; + or.b64 %rd400, %rd450, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd399, %rd400, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6355, %r6326, 16448; + add.s32 %r6356, %r6355, %r5175; + bfe.u32 %r6357, %r6356, 4, 14; + cvt.u64.u32 %rd451, %r6357; + or.b64 %rd401, %rd451, 4611686293372403712; + add.s32 %r6358, %r5216, 8256; + bfe.u32 %r6359, %r6358, 4, 14; + cvt.u64.u32 %rd452, %r6359; + or.b64 %rd402, %rd452, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd401, %rd402, %p351, 1, 1, 0, 0; + // end inline asm + or.b32 %r6360, %r6326, 16480; + add.s32 %r6361, %r6360, %r5175; + bfe.u32 %r6362, %r6361, 4, 14; + cvt.u64.u32 %rd453, %r6362; + or.b64 %rd403, %rd453, 4611686293372403712; + add.s32 %r6363, %r5216, 8288; + bfe.u32 %r6364, %r6363, 4, 14; + cvt.u64.u32 %rd454, %r6364; + or.b64 %rd404, %rd454, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790}, %rd403, %rd404, %p351, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5178, %r5216; + mov.b32 %r5176, %r5214; + mov.b32 %r5177, %r5214; + mov.b32 %r5179, %r5214; + mov.b32 %r5180, %r5214; + // begin inline asm + // wait for regs: %r4759,%r4760,%r4761,%r4762,%r4763,%r4764,%r4765,%r4766,%r4767,%r4768,%r4769,%r4770,%r4771,%r4772,%r4773,%r4774,%r4775,%r4776,%r4777,%r4778,%r4779,%r4780,%r4781,%r4782,%r4783,%r4784,%r4785,%r4786,%r4787,%r4788,%r4789,%r4790,%r5175,%r5176,%r5177,%r5178,%r5179,%r5180 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 461 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:461:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6365, %r4759, 0f3DB504F3; + mul.f32 %r6366, %r4760, 0f3DB504F3; + mul.f32 %r6367, %r4761, 0f3DB504F3; + mul.f32 %r6368, %r4762, 0f3DB504F3; + mul.f32 %r6369, %r4763, 0f3DB504F3; + mul.f32 %r6370, %r4764, 0f3DB504F3; + mul.f32 %r6371, %r4765, 0f3DB504F3; + mul.f32 %r6372, %r4766, 0f3DB504F3; + mul.f32 %r6373, %r4767, 0f3DB504F3; + mul.f32 %r6374, %r4768, 0f3DB504F3; + mul.f32 %r6375, %r4769, 0f3DB504F3; + mul.f32 %r6376, %r4770, 0f3DB504F3; + mul.f32 %r6377, %r4771, 0f3DB504F3; + mul.f32 %r6378, %r4772, 0f3DB504F3; + mul.f32 %r6379, %r4773, 0f3DB504F3; + mul.f32 %r6380, %r4774, 0f3DB504F3; + mul.f32 %r6381, %r4775, 0f3DB504F3; + mul.f32 %r6382, %r4776, 0f3DB504F3; + mul.f32 %r6383, %r4777, 0f3DB504F3; + mul.f32 %r6384, %r4778, 0f3DB504F3; + mul.f32 %r6385, %r4779, 0f3DB504F3; + mul.f32 %r6386, %r4780, 0f3DB504F3; + mul.f32 %r6387, %r4781, 0f3DB504F3; + mul.f32 %r6388, %r4782, 0f3DB504F3; + mul.f32 %r6389, %r4783, 0f3DB504F3; + mul.f32 %r6390, %r4784, 0f3DB504F3; + mul.f32 %r6391, %r4785, 0f3DB504F3; + mul.f32 %r6392, %r4786, 0f3DB504F3; + mul.f32 %r6393, %r4787, 0f3DB504F3; + mul.f32 %r6394, %r4788, 0f3DB504F3; + mul.f32 %r6395, %r4789, 0f3DB504F3; + mul.f32 %r6396, %r4790, 0f3DB504F3; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6397, %r6365, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6398, %r6397, 0fFF800000, %p373; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6399, %r6366, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6400, %r6399, 0fFF800000, %p374; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6401, %r6367, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6402, %r6401, 0fFF800000, %p373; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6403, %r6368, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6404, %r6403, 0fFF800000, %p374; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6405, %r6369, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6406, %r6405, 0fFF800000, %p375; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6407, %r6370, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6408, %r6407, 0fFF800000, %p376; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6409, %r6371, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6410, %r6409, 0fFF800000, %p375; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6411, %r6372, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6412, %r6411, 0fFF800000, %p376; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6413, %r6373, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6414, %r6413, 0fFF800000, %p377; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6415, %r6374, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6416, %r6415, 0fFF800000, %p378; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6417, %r6375, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6418, %r6417, 0fFF800000, %p377; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6419, %r6376, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6420, %r6419, 0fFF800000, %p378; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6421, %r6377, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6422, %r6421, 0fFF800000, %p379; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6423, %r6378, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6424, %r6423, 0fFF800000, %p380; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6425, %r6379, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6426, %r6425, 0fFF800000, %p379; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6427, %r6380, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6428, %r6427, 0fFF800000, %p380; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6429, %r6381, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6430, %r6429, 0fFF800000, %p381; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6431, %r6382, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6432, %r6431, 0fFF800000, %p382; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6433, %r6383, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6434, %r6433, 0fFF800000, %p381; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6435, %r6384, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6436, %r6435, 0fFF800000, %p382; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6437, %r6385, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6438, %r6437, 0fFF800000, %p383; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6439, %r6386, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6440, %r6439, 0fFF800000, %p384; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6441, %r6387, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6442, %r6441, 0fFF800000, %p383; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6443, %r6388, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6444, %r6443, 0fFF800000, %p384; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6445, %r6389, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6446, %r6445, 0fFF800000, %p385; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6447, %r6390, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6448, %r6447, 0fFF800000, %p386; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6449, %r6391, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6450, %r6449, 0fFF800000, %p385; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6451, %r6392, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6452, %r6451, 0fFF800000, %p386; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6453, %r6393, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6454, %r6453, 0fFF800000, %p387; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6455, %r6394, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6456, %r6455, 0fFF800000, %p388; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6457, %r6395, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6458, %r6457, 0fFF800000, %p387; + .loc 1 524 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:524:27 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6459, %r6396, 0f3FB8AA3B; + .loc 1 476 79 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:476:79 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.f32 %r6460, %r6459, 0fFF800000, %p388; + .loc 1 525 39 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:525:39 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + sub.f32 %r6461, %r6398, %r43; + sub.f32 %r6462, %r6400, %r43; + sub.f32 %r6463, %r6402, %r44; + sub.f32 %r6464, %r6404, %r44; + sub.f32 %r6465, %r6406, %r43; + sub.f32 %r6466, %r6408, %r43; + sub.f32 %r6467, %r6410, %r44; + sub.f32 %r6468, %r6412, %r44; + sub.f32 %r6469, %r6414, %r43; + sub.f32 %r6470, %r6416, %r43; + sub.f32 %r6471, %r6418, %r44; + sub.f32 %r6472, %r6420, %r44; + sub.f32 %r6473, %r6422, %r43; + sub.f32 %r6474, %r6424, %r43; + sub.f32 %r6475, %r6426, %r44; + sub.f32 %r6476, %r6428, %r44; + sub.f32 %r6477, %r6430, %r43; + sub.f32 %r6478, %r6432, %r43; + sub.f32 %r6479, %r6434, %r44; + sub.f32 %r6480, %r6436, %r44; + sub.f32 %r6481, %r6438, %r43; + sub.f32 %r6482, %r6440, %r43; + sub.f32 %r6483, %r6442, %r44; + sub.f32 %r6484, %r6444, %r44; + sub.f32 %r6485, %r6446, %r43; + sub.f32 %r6486, %r6448, %r43; + sub.f32 %r6487, %r6450, %r44; + sub.f32 %r6488, %r6452, %r44; + sub.f32 %r6489, %r6454, %r43; + sub.f32 %r6490, %r6456, %r43; + sub.f32 %r6491, %r6458, %r44; + sub.f32 %r6492, %r6460, %r44; + .loc 1 525 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:525:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + ex2.approx.ftz.f32 %r6493, %r6461; + ex2.approx.ftz.f32 %r6494, %r6462; + ex2.approx.ftz.f32 %r6495, %r6463; + ex2.approx.ftz.f32 %r6496, %r6464; + ex2.approx.ftz.f32 %r6497, %r6465; + ex2.approx.ftz.f32 %r6498, %r6466; + ex2.approx.ftz.f32 %r6499, %r6467; + ex2.approx.ftz.f32 %r6500, %r6468; + ex2.approx.ftz.f32 %r6501, %r6469; + ex2.approx.ftz.f32 %r6502, %r6470; + ex2.approx.ftz.f32 %r6503, %r6471; + ex2.approx.ftz.f32 %r6504, %r6472; + ex2.approx.ftz.f32 %r6505, %r6473; + ex2.approx.ftz.f32 %r6506, %r6474; + ex2.approx.ftz.f32 %r6507, %r6475; + ex2.approx.ftz.f32 %r6508, %r6476; + ex2.approx.ftz.f32 %r6509, %r6477; + ex2.approx.ftz.f32 %r6510, %r6478; + ex2.approx.ftz.f32 %r6511, %r6479; + ex2.approx.ftz.f32 %r6512, %r6480; + ex2.approx.ftz.f32 %r6513, %r6481; + ex2.approx.ftz.f32 %r6514, %r6482; + ex2.approx.ftz.f32 %r6515, %r6483; + ex2.approx.ftz.f32 %r6516, %r6484; + ex2.approx.ftz.f32 %r6517, %r6485; + ex2.approx.ftz.f32 %r6518, %r6486; + ex2.approx.ftz.f32 %r6519, %r6487; + ex2.approx.ftz.f32 %r6520, %r6488; + ex2.approx.ftz.f32 %r6521, %r6489; + ex2.approx.ftz.f32 %r6522, %r6490; + ex2.approx.ftz.f32 %r6523, %r6491; + ex2.approx.ftz.f32 %r6524, %r6492; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r6525, %r2353, 49152; + add.s32 %r5734, %r6525, %r6322; + .loc 1 530 20 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:530:20 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + wgmma.fence.sync.aligned; + add.s32 %r5731, %r2353, 131072; + add.s32 %r6526, %r6326, %r5731; + bfe.u32 %r6527, %r6526, 4, 14; + cvt.u64.u32 %rd455, %r6527; + or.b64 %rd405, %rd455, 4611686293372403712; + bfe.u32 %r6528, %r5734, 4, 14; + cvt.u64.u32 %rd456, %r6528; + or.b64 %rd406, %rd456, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd405, %rd406, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r6529, %r6330, %r5731; + bfe.u32 %r6530, %r6529, 4, 14; + cvt.u64.u32 %rd457, %r6530; + or.b64 %rd407, %rd457, 4611686293372403712; + add.s32 %r6531, %r5734, 32; + bfe.u32 %r6532, %r6531, 4, 14; + cvt.u64.u32 %rd458, %r6532; + or.b64 %rd408, %rd458, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd407, %rd408, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6533, %r6335, %r5731; + bfe.u32 %r6534, %r6533, 4, 14; + cvt.u64.u32 %rd459, %r6534; + or.b64 %rd409, %rd459, 4611686293372403712; + add.s32 %r6535, %r5734, 64; + bfe.u32 %r6536, %r6535, 4, 14; + cvt.u64.u32 %rd460, %r6536; + or.b64 %rd410, %rd460, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd409, %rd410, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6537, %r6340, %r5731; + bfe.u32 %r6538, %r6537, 4, 14; + cvt.u64.u32 %rd461, %r6538; + or.b64 %rd411, %rd461, 4611686293372403712; + add.s32 %r6539, %r5734, 96; + bfe.u32 %r6540, %r6539, 4, 14; + cvt.u64.u32 %rd462, %r6540; + or.b64 %rd412, %rd462, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd411, %rd412, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6541, %r6345, %r5731; + bfe.u32 %r6542, %r6541, 4, 14; + cvt.u64.u32 %rd463, %r6542; + or.b64 %rd413, %rd463, 4611686293372403712; + add.s32 %r6543, %r5734, 8192; + bfe.u32 %r6544, %r6543, 4, 14; + cvt.u64.u32 %rd464, %r6544; + or.b64 %rd414, %rd464, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd413, %rd414, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6545, %r6350, %r5731; + bfe.u32 %r6546, %r6545, 4, 14; + cvt.u64.u32 %rd465, %r6546; + or.b64 %rd415, %rd465, 4611686293372403712; + add.s32 %r6547, %r5734, 8224; + bfe.u32 %r6548, %r6547, 4, 14; + cvt.u64.u32 %rd466, %r6548; + or.b64 %rd416, %rd466, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd415, %rd416, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6549, %r6355, %r5731; + bfe.u32 %r6550, %r6549, 4, 14; + cvt.u64.u32 %rd467, %r6550; + or.b64 %rd417, %rd467, 4611686293372403712; + add.s32 %r6551, %r5734, 8256; + bfe.u32 %r6552, %r6551, 4, 14; + cvt.u64.u32 %rd468, %r6552; + or.b64 %rd418, %rd468, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd417, %rd418, %p351, 1, 1, 0, 0; + // end inline asm + add.s32 %r6553, %r6360, %r5731; + bfe.u32 %r6554, %r6553, 4, 14; + cvt.u64.u32 %rd469, %r6554; + or.b64 %rd419, %rd469, 4611686293372403712; + add.s32 %r6555, %r5734, 8288; + bfe.u32 %r6556, %r6555, 4, 14; + cvt.u64.u32 %rd470, %r6556; + or.b64 %rd420, %rd470, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346}, %rd419, %rd420, %p351, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r5736, %r5214; + mov.b32 %r5732, %r5214; + mov.b32 %r5733, %r5214; + mov.b32 %r5735, %r5214; + // begin inline asm + // wait for regs: %r5315,%r5316,%r5317,%r5318,%r5319,%r5320,%r5321,%r5322,%r5323,%r5324,%r5325,%r5326,%r5327,%r5328,%r5329,%r5330,%r5331,%r5332,%r5333,%r5334,%r5335,%r5336,%r5337,%r5338,%r5339,%r5340,%r5341,%r5342,%r5343,%r5344,%r5345,%r5346,%r5731,%r5732,%r5733,%r5734,%r5735,%r5736 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 531 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + sub.f32 %r6557, %r5315, %r2263; + sub.f32 %r6558, %r5316, %r2263; + sub.f32 %r6559, %r5317, %r2264; + sub.f32 %r6560, %r5318, %r2264; + sub.f32 %r6561, %r5319, %r2263; + sub.f32 %r6562, %r5320, %r2263; + sub.f32 %r6563, %r5321, %r2264; + sub.f32 %r6564, %r5322, %r2264; + sub.f32 %r6565, %r5323, %r2263; + sub.f32 %r6566, %r5324, %r2263; + sub.f32 %r6567, %r5325, %r2264; + sub.f32 %r6568, %r5326, %r2264; + sub.f32 %r6569, %r5327, %r2263; + sub.f32 %r6570, %r5328, %r2263; + sub.f32 %r6571, %r5329, %r2264; + sub.f32 %r6572, %r5330, %r2264; + sub.f32 %r6573, %r5331, %r2263; + sub.f32 %r6574, %r5332, %r2263; + sub.f32 %r6575, %r5333, %r2264; + sub.f32 %r6576, %r5334, %r2264; + sub.f32 %r6577, %r5335, %r2263; + sub.f32 %r6578, %r5336, %r2263; + sub.f32 %r6579, %r5337, %r2264; + sub.f32 %r6580, %r5338, %r2264; + sub.f32 %r6581, %r5339, %r2263; + sub.f32 %r6582, %r5340, %r2263; + sub.f32 %r6583, %r5341, %r2264; + sub.f32 %r6584, %r5342, %r2264; + sub.f32 %r6585, %r5343, %r2263; + sub.f32 %r6586, %r5344, %r2263; + sub.f32 %r6587, %r5345, %r2264; + sub.f32 %r6588, %r5346, %r2264; + .loc 1 531 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:531:14 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.f32 %r6589, %r6493, %r6557; + mul.f32 %r6590, %r6494, %r6558; + mul.f32 %r6591, %r6495, %r6559; + mul.f32 %r6592, %r6496, %r6560; + mul.f32 %r6593, %r6497, %r6561; + mul.f32 %r6594, %r6498, %r6562; + mul.f32 %r6595, %r6499, %r6563; + mul.f32 %r6596, %r6500, %r6564; + mul.f32 %r6597, %r6501, %r6565; + mul.f32 %r6598, %r6502, %r6566; + mul.f32 %r6599, %r6503, %r6567; + mul.f32 %r6600, %r6504, %r6568; + mul.f32 %r6601, %r6505, %r6569; + mul.f32 %r6602, %r6506, %r6570; + mul.f32 %r6603, %r6507, %r6571; + mul.f32 %r6604, %r6508, %r6572; + mul.f32 %r6605, %r6509, %r6573; + mul.f32 %r6606, %r6510, %r6574; + mul.f32 %r6607, %r6511, %r6575; + mul.f32 %r6608, %r6512, %r6576; + mul.f32 %r6609, %r6513, %r6577; + mul.f32 %r6610, %r6514, %r6578; + mul.f32 %r6611, %r6515, %r6579; + mul.f32 %r6612, %r6516, %r6580; + mul.f32 %r6613, %r6517, %r6581; + mul.f32 %r6614, %r6518, %r6582; + mul.f32 %r6615, %r6519, %r6583; + mul.f32 %r6616, %r6520, %r6584; + mul.f32 %r6617, %r6521, %r6585; + mul.f32 %r6618, %r6522, %r6586; + mul.f32 %r6619, %r6523, %r6587; + mul.f32 %r6620, %r6524, %r6588; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs81, %r6589; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs82, %rs81, 0x0000, %p373; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs83, %r6590; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs84, %rs83, 0x0000, %p374; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs85, %r6591; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs86, %rs85, 0x0000, %p373; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs87, %r6592; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs88, %rs87, 0x0000, %p374; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs89, %r6593; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs90, %rs89, 0x0000, %p375; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs91, %r6594; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs92, %rs91, 0x0000, %p376; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs93, %r6595; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs94, %rs93, 0x0000, %p375; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs95, %r6596; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs96, %rs95, 0x0000, %p376; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs97, %r6597; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs98, %rs97, 0x0000, %p377; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs99, %r6598; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs100, %rs99, 0x0000, %p378; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs101, %r6599; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs102, %rs101, 0x0000, %p377; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs103, %r6600; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs104, %rs103, 0x0000, %p378; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs105, %r6601; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs106, %rs105, 0x0000, %p379; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs107, %r6602; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs108, %rs107, 0x0000, %p380; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs109, %r6603; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs110, %rs109, 0x0000, %p379; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs111, %r6604; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs112, %rs111, 0x0000, %p380; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs113, %r6605; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs114, %rs113, 0x0000, %p381; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs115, %r6606; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs116, %rs115, 0x0000, %p382; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs117, %r6607; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs118, %rs117, 0x0000, %p381; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs119, %r6608; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs120, %rs119, 0x0000, %p382; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs121, %r6609; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs122, %rs121, 0x0000, %p383; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs123, %r6610; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs124, %rs123, 0x0000, %p384; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs125, %r6611; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs126, %rs125, 0x0000, %p383; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs127, %r6612; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs128, %rs127, 0x0000, %p384; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs129, %r6613; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs130, %rs129, 0x0000, %p385; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs131, %r6614; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs132, %rs131, 0x0000, %p386; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs133, %r6615; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs134, %rs133, 0x0000, %p385; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs135, %r6616; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs136, %rs135, 0x0000, %p386; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs137, %r6617; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs138, %rs137, 0x0000, %p387; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs139, %r6618; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs140, %rs139, 0x0000, %p388; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs141, %r6619; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs142, %rs141, 0x0000, %p387; + .loc 1 551 15 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:551:15 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + cvt.rn.bf16.f32 %rs143, %r6620; + .loc 1 538 71 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:538:71 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + selp.b16 %rs144, %rs143, 0x0000, %p388; + .loc 1 553 21 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:553:21 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mov.b32 %r5903, {%rs82, %rs84}; + mov.b32 %r5904, {%rs86, %rs88}; + mov.b32 %r5905, {%rs90, %rs92}; + mov.b32 %r5906, {%rs94, %rs96}; + mov.b32 %r6035, {%rs98, %rs100}; + mov.b32 %r6036, {%rs102, %rs104}; + mov.b32 %r6037, {%rs106, %rs108}; + mov.b32 %r6038, {%rs110, %rs112}; + mov.b32 %r6167, {%rs114, %rs116}; + mov.b32 %r6168, {%rs118, %rs120}; + mov.b32 %r6169, {%rs122, %rs124}; + mov.b32 %r6170, {%rs126, %rs128}; + mov.b32 %r6299, {%rs130, %rs132}; + mov.b32 %r6300, {%rs134, %rs136}; + mov.b32 %r6301, {%rs138, %rs140}; + mov.b32 %r6302, {%rs142, %rs144}; + wgmma.fence.sync.aligned; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r5903,%r5904,%r5905,%r5906}, %rd390, %p351, 1, 1, 1; + // end inline asm + add.s32 %r6621, %r5216, 2048; + bfe.u32 %r6622, %r6621, 4, 14; + cvt.u64.u32 %rd471, %r6622; + or.b64 %rd422, %rd471, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r6035,%r6036,%r6037,%r6038}, %rd422, %p351, 1, 1, 1; + // end inline asm + add.s32 %r6623, %r5216, 4096; + bfe.u32 %r6624, %r6623, 4, 14; + cvt.u64.u32 %rd472, %r6624; + or.b64 %rd423, %rd472, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r6167,%r6168,%r6169,%r6170}, %rd423, %p351, 1, 1, 1; + // end inline asm + add.s32 %r6625, %r5216, 6144; + bfe.u32 %r6626, %r6625, 4, 14; + cvt.u64.u32 %rd473, %r6626; + or.b64 %rd424, %rd473, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752}, {%r6299,%r6300,%r6301,%r6302}, %rd424, %p351, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r13906, %r13834, %r13906; + add.s32 %r13907, %r13834, %r13907; + add.s32 %r13908, %r13834, %r13908; + add.s32 %r13909, %r13834, %r13909; + add.s32 %r13910, %r13834, %r13910; + add.s32 %r13911, %r13834, %r13911; + add.s32 %r13912, %r13834, %r13912; + add.s32 %r13913, %r13834, %r13913; + add.s32 %r13914, %r13834, %r13914; + add.s32 %r13915, %r13834, %r13915; + add.s32 %r13916, %r13834, %r13916; + add.s32 %r13917, %r13834, %r13917; + add.s32 %r13918, %r13834, %r13918; + add.s32 %r13919, %r13834, %r13919; + add.s32 %r13920, %r13834, %r13920; + add.s32 %r13921, %r13834, %r13921; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r593, %r13905, 1; + .loc 1 788 33 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:788:33 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shr.u32 %r6627, %r593, 1; + .loc 1 789 38 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:38 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mad.wide.u32 %rd426, %r6627, 4, %rd356; + .loc 1 789 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:789:24 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + // begin inline asm + mov.u64 %rd425, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd425, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6303, 0x0; + @%p369 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6303 }, [ %rd426 + 0 ], %rd425; + // end inline asm + .loc 1 790 109 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:109 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r6628, %r6627, 1; + .loc 1 790 113 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:113 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p389, %r6628, %r4609; + .loc 1 790 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:55 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd429, %rd426, 4; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + and.pred %p370, %p369, %p389; + .loc 1 790 25 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:790:25 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + // begin inline asm + mov.u64 %rd428, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd428, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r6304, 0x0; + @%p370 ld.global.L1::evict_last.L2::cache_hint.b32 { %r6304 }, [ %rd429 + 0 ], %rd428; + // end inline asm + .loc 1 791 35 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:791:35 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + and.b32 %r6629, %r13905, 1; + .loc 1 792 34 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:34 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + sub.s32 %r6630, %r6304, %r6303; + .loc 1 792 48 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:48 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r6631, %r6630, 7; + .loc 1 792 63 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:792:63 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r6632, %r6631, -64; + .loc 1 793 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:29 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + xor.b32 %r6633, %r6629, 1; + .loc 1 793 61 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:61 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r6634, %r6629, 6; + .loc 1 793 42 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:793:42 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mad.lo.s32 %r13834, %r6632, %r6633, %r6634; + .loc 1 414 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r6635, %r13834, 7; + .loc 1 414 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:414:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + mul.wide.s32 %rd474, %r6635, 2; + add.s64 %rd1004, %rd1004, %rd474; + add.s64 %rd1003, %rd1003, %rd474; + add.s64 %rd1002, %rd1002, %rd474; + add.s64 %rd1001, %rd1001, %rd474; + .loc 1 415 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:415:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s64 %rd1000, %rd1000, %rd474; + add.s64 %rd999, %rd999, %rd474; + add.s64 %rd998, %rd998, %rd474; + add.s64 %rd997, %rd997, %rd474; + .loc 1 417 19 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:417:19 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r13840, %r13834, %r13840; + add.s32 %r13839, %r13834, %r13839; + add.s32 %r13838, %r13834, %r13838; + add.s32 %r13837, %r13834, %r13837; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + add.s32 %r6636, %r13836, 1; + setp.gt.s32 %p390, %r6636, 2; + selp.b32 %r13836, 0, %r6636, %p390; + .loc 1 831 52 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:52 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.lt.s32 %p391, %r13840, %r2123; + setp.lt.s32 %p392, %r13839, %r2123; + setp.lt.s32 %p393, %r13838, %r2123; + setp.lt.s32 %p394, %r13837, %r2123; + .loc 1 831 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:831:23 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + shl.b32 %r6637, %r13836, 14; + add.s32 %r6638, %r2353, %r6637; + bar.sync 0; + add.s32 %r6305, %r6638, %r48; + selp.b32 %r6639, 16, 0, %p391; + selp.b32 %r6314, %r6639, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r6305 + 0 ], [ %rd1004 + 0 ], 0x10, %r6314; + // end inline asm + add.s32 %r6307, %r6305, 2048; + selp.b32 %r6640, 16, 0, %p392; + selp.b32 %r6316, %r6640, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r6307 + 0 ], [ %rd1003 + 0 ], 0x10, %r6316; + // end inline asm + add.s32 %r6309, %r6305, 4096; + selp.b32 %r6641, 16, 0, %p393; + selp.b32 %r6318, %r6641, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r6309 + 0 ], [ %rd1002 + 0 ], 0x10, %r6318; + // end inline asm + add.s32 %r6311, %r6305, 6144; + selp.b32 %r6642, 16, 0, %p394; + selp.b32 %r6320, %r6642, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r6311 + 0 ], [ %rd1001 + 0 ], 0x10, %r6320; + // end inline asm + cp.async.commit_group; + add.s32 %r6643, %r6525, %r6637; + add.s32 %r6313, %r6643, %r48; + // begin inline asm + cp.async.cg.shared.global [ %r6313 + 0 ], [ %rd1000 + 0 ], 0x10, %r6314; + // end inline asm + add.s32 %r6315, %r6313, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r6315 + 0 ], [ %rd999 + 0 ], 0x10, %r6316; + // end inline asm + add.s32 %r6317, %r6313, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r6317 + 0 ], [ %rd998 + 0 ], 0x10, %r6318; + // end inline asm + add.s32 %r6319, %r6313, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r6319 + 0 ], [ %rd997 + 0 ], 0x10, %r6320; + // end inline asm + cp.async.commit_group; + .loc 1 397 28 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:397:28 @[ cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:226:16 ] + setp.ne.b32 %p395, %r423, %r593; + mov.b32 %r13905, %r593; + @%p395 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge1638 + // begin inline asm + // wait for regs: %r13689,%r13690,%r13691,%r13692,%r13693,%r13694,%r13695,%r13696,%r13697,%r13698,%r13699,%r13700,%r13701,%r13702,%r13703,%r13704,%r13705,%r13706,%r13707,%r13708,%r13709,%r13710,%r13711,%r13712,%r13713,%r13714,%r13715,%r13716,%r13717,%r13718,%r13719,%r13720,%r13721,%r13722,%r13723,%r13724,%r13725,%r13726,%r13727,%r13728,%r13729,%r13730,%r13731,%r13732,%r13733,%r13734,%r13735,%r13736,%r13737,%r13738,%r13739,%r13740,%r13741,%r13742,%r13743,%r13744,%r13745,%r13746,%r13747,%r13748,%r13749,%r13750,%r13751,%r13752 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp37: + .loc 1 231 24 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:231:24 + shl.b64 %rd483, %rd5, 1; + add.s64 %rd484, %rd4, %rd483; + shl.b64 %rd485, %rd6, 1; + add.s64 %rd486, %rd4, %rd485; + shl.b64 %rd487, %rd7, 1; + add.s64 %rd488, %rd4, %rd487; + shl.b64 %rd489, %rd8, 1; + add.s64 %rd490, %rd4, %rd489; + shl.b64 %rd491, %rd9, 1; + add.s64 %rd492, %rd4, %rd491; + shl.b64 %rd493, %rd10, 1; + add.s64 %rd494, %rd4, %rd493; + shl.b64 %rd495, %rd11, 1; + add.s64 %rd496, %rd4, %rd495; + shl.b64 %rd497, %rd12, 1; + add.s64 %rd498, %rd4, %rd497; + .loc 1 231 56 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:231:56 + add.s64 %rd475, %rd484, %rd384; + add.s64 %rd476, %rd486, %rd384; + add.s64 %rd477, %rd488, %rd384; + add.s64 %rd478, %rd490, %rd384; + add.s64 %rd479, %rd492, %rd384; + add.s64 %rd480, %rd494, %rd384; + add.s64 %rd481, %rd496, %rd384; + add.s64 %rd482, %rd498, %rd384; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6844, %r13689, 0f3DB504F3; + mul.f32 %r6845, %r13690, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6846, %r6845, %r6844; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6847, %r13691, 0f3DB504F3; + mul.f32 %r6848, %r13692, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6849, %r6848, %r6847; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6850, %r13693, 0f3DB504F3; + mul.f32 %r6851, %r13694, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6852, %r6851, %r6850; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6853, %r13695, 0f3DB504F3; + mul.f32 %r6854, %r13696, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6855, %r6854, %r6853; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6856, %r13697, 0f3DB504F3; + mul.f32 %r6857, %r13698, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6858, %r6857, %r6856; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6859, %r13699, 0f3DB504F3; + mul.f32 %r6860, %r13700, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6861, %r6860, %r6859; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6862, %r13701, 0f3DB504F3; + mul.f32 %r6863, %r13702, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6864, %r6863, %r6862; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6865, %r13703, 0f3DB504F3; + mul.f32 %r6866, %r13704, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6867, %r6866, %r6865; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6868, %r13705, 0f3DB504F3; + mul.f32 %r6869, %r13706, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6870, %r6869, %r6868; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6871, %r13707, 0f3DB504F3; + mul.f32 %r6872, %r13708, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6873, %r6872, %r6871; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6874, %r13709, 0f3DB504F3; + mul.f32 %r6875, %r13710, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6876, %r6875, %r6874; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6877, %r13711, 0f3DB504F3; + mul.f32 %r6878, %r13712, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6879, %r6878, %r6877; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6880, %r13713, 0f3DB504F3; + mul.f32 %r6881, %r13714, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6882, %r6881, %r6880; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6883, %r13715, 0f3DB504F3; + mul.f32 %r6884, %r13716, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6885, %r6884, %r6883; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6886, %r13717, 0f3DB504F3; + mul.f32 %r6887, %r13718, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6888, %r6887, %r6886; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6889, %r13719, 0f3DB504F3; + mul.f32 %r6890, %r13720, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6891, %r6890, %r6889; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6892, %r13721, 0f3DB504F3; + mul.f32 %r6893, %r13722, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6894, %r6893, %r6892; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6895, %r13723, 0f3DB504F3; + mul.f32 %r6896, %r13724, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6897, %r6896, %r6895; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6898, %r13725, 0f3DB504F3; + mul.f32 %r6899, %r13726, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6900, %r6899, %r6898; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6901, %r13727, 0f3DB504F3; + mul.f32 %r6902, %r13728, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6903, %r6902, %r6901; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6904, %r13729, 0f3DB504F3; + mul.f32 %r6905, %r13730, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6906, %r6905, %r6904; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6907, %r13731, 0f3DB504F3; + mul.f32 %r6908, %r13732, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6909, %r6908, %r6907; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6910, %r13733, 0f3DB504F3; + mul.f32 %r6911, %r13734, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6912, %r6911, %r6910; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6913, %r13735, 0f3DB504F3; + mul.f32 %r6914, %r13736, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6915, %r6914, %r6913; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6916, %r13737, 0f3DB504F3; + mul.f32 %r6917, %r13738, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6918, %r6917, %r6916; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6919, %r13739, 0f3DB504F3; + mul.f32 %r6920, %r13740, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6921, %r6920, %r6919; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6922, %r13741, 0f3DB504F3; + mul.f32 %r6923, %r13742, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6924, %r6923, %r6922; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6925, %r13743, 0f3DB504F3; + mul.f32 %r6926, %r13744, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6927, %r6926, %r6925; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6928, %r13745, 0f3DB504F3; + mul.f32 %r6929, %r13746, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6930, %r6929, %r6928; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6931, %r13747, 0f3DB504F3; + mul.f32 %r6932, %r13748, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6933, %r6932, %r6931; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6934, %r13749, 0f3DB504F3; + mul.f32 %r6935, %r13750, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6936, %r6935, %r6934; + .loc 1 232 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:232:14 + mul.f32 %r6937, %r13751, 0f3DB504F3; + mul.f32 %r6938, %r13752, 0f3DB504F3; + .loc 1 236 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:236:30 + cvt.rn.bf16x2.f32 %r6939, %r6938, %r6937; + shl.b32 %r6940, %r19, 13; + shl.b32 %r6941, %r7, 5; + and.b32 %r6942, %r6941, 7264; + and.b32 %r6943, %r7, 24; + shl.b32 %r6944, %r6943, 4; + shl.b32 %r6945, %r7, 2; + and.b32 %r6946, %r6945, 16; + or.b32 %r6947, %r6940, %r6946; + or.b32 %r6948, %r6942, %r6944; + or.b32 %r6949, %r6947, %r6948; + add.s32 %r6951, %r2353, %r6949; + st.shared.v4.b32 [%r6951], {%r6846, %r6852, %r6858, %r6864}; + st.shared.v4.b32 [%r6951+512], {%r6849, %r6855, %r6861, %r6867}; + xor.b32 %r6952, %r6949, 32; + add.s32 %r6953, %r2353, %r6952; + st.shared.v4.b32 [%r6953], {%r6870, %r6876, %r6882, %r6888}; + st.shared.v4.b32 [%r6953+512], {%r6873, %r6879, %r6885, %r6891}; + xor.b32 %r6954, %r6949, 64; + add.s32 %r6955, %r2353, %r6954; + st.shared.v4.b32 [%r6955], {%r6894, %r6900, %r6906, %r6912}; + st.shared.v4.b32 [%r6955+512], {%r6897, %r6903, %r6909, %r6915}; + xor.b32 %r6956, %r6949, 96; + add.s32 %r6957, %r2353, %r6956; + st.shared.v4.b32 [%r6957], {%r6918, %r6924, %r6930, %r6936}; + st.shared.v4.b32 [%r6957+512], {%r6921, %r6927, %r6933, %r6939}; + bar.sync 0; + shl.b32 %r6958, %r6943, 10; + shl.b32 %r6959, %r19, 5; + and.b32 %r6960, %r6945, 1008; + or.b32 %r6961, %r6958, %r6959; + xor.b32 %r6962, %r6961, %r6960; + add.s32 %r6776, %r2353, %r6962; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6812, %r6813, %r6814, %r6815}, [%r6776]; + // end inline asm + add.s32 %r6781, %r6776, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6816, %r6817, %r6818, %r6819}, [%r6781]; + // end inline asm + add.s32 %r6786, %r6776, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6820, %r6821, %r6822, %r6823}, [%r6786]; + // end inline asm + add.s32 %r6791, %r6776, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6824, %r6825, %r6826, %r6827}, [%r6791]; + // end inline asm + add.s32 %r6796, %r6776, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6828, %r6829, %r6830, %r6831}, [%r6796]; + // end inline asm + add.s32 %r6801, %r6776, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6832, %r6833, %r6834, %r6835}, [%r6801]; + // end inline asm + add.s32 %r6806, %r6776, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6836, %r6837, %r6838, %r6839}, [%r6806]; + // end inline asm + add.s32 %r6811, %r6776, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r6840, %r6841, %r6842, %r6843}, [%r6811]; + // end inline asm + mov.pred %p396, -1; + // begin inline asm + @%p396 st.global.v4.b32 [ %rd475 + 0 ], { %r6812, %r6813, %r6814, %r6815 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd476 + 0 ], { %r6816, %r6817, %r6818, %r6819 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd477 + 0 ], { %r6820, %r6821, %r6822, %r6823 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd478 + 0 ], { %r6824, %r6825, %r6826, %r6827 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd479 + 0 ], { %r6828, %r6829, %r6830, %r6831 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd480 + 0 ], { %r6832, %r6833, %r6834, %r6835 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd481 + 0 ], { %r6836, %r6837, %r6838, %r6839 }; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd482 + 0 ], { %r6840, %r6841, %r6842, %r6843 }; + // end inline asm + .loc 1 139 7 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:139:7 + bra.uni $L__BB0_17; +$L__BB0_16: + .loc 1 323 23 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:323:23 + shl.b64 %rd964, %rd70, 1; + add.s64 %rd965, %rd3, %rd964; + shl.b64 %rd966, %rd71, 1; + add.s64 %rd967, %rd3, %rd966; + shl.b64 %rd968, %rd72, 1; + add.s64 %rd969, %rd3, %rd968; + shl.b64 %rd970, %rd73, 1; + add.s64 %rd971, %rd3, %rd970; + shl.b64 %rd972, %rd74, 1; + add.s64 %rd973, %rd3, %rd972; + shl.b64 %rd974, %rd75, 1; + add.s64 %rd975, %rd3, %rd974; + shl.b64 %rd976, %rd76, 1; + add.s64 %rd977, %rd3, %rd976; + shl.b64 %rd978, %rd77, 1; + add.s64 %rd979, %rd3, %rd978; + .loc 1 323 55 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:323:55 + add.s64 %rd948, %rd965, %rd616; + add.s64 %rd949, %rd967, %rd616; + add.s64 %rd950, %rd969, %rd616; + add.s64 %rd951, %rd971, %rd616; + add.s64 %rd952, %rd973, %rd616; + add.s64 %rd953, %rd975, %rd616; + add.s64 %rd954, %rd977, %rd616; + add.s64 %rd955, %rd979, %rd616; + .loc 1 332 30 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:332:30 + cvt.rn.bf16x2.f32 %r13516, %r14144, %r14143; + cvt.rn.bf16x2.f32 %r13517, %r14146, %r14145; + cvt.rn.bf16x2.f32 %r13518, %r14148, %r14147; + cvt.rn.bf16x2.f32 %r13519, %r14150, %r14149; + cvt.rn.bf16x2.f32 %r13520, %r14152, %r14151; + cvt.rn.bf16x2.f32 %r13521, %r14154, %r14153; + cvt.rn.bf16x2.f32 %r13522, %r14156, %r14155; + cvt.rn.bf16x2.f32 %r13523, %r14158, %r14157; + cvt.rn.bf16x2.f32 %r13524, %r14160, %r14159; + cvt.rn.bf16x2.f32 %r13525, %r14162, %r14161; + cvt.rn.bf16x2.f32 %r13526, %r14164, %r14163; + cvt.rn.bf16x2.f32 %r13527, %r14166, %r14165; + cvt.rn.bf16x2.f32 %r13528, %r14168, %r14167; + cvt.rn.bf16x2.f32 %r13529, %r14170, %r14169; + cvt.rn.bf16x2.f32 %r13530, %r14172, %r14171; + cvt.rn.bf16x2.f32 %r13531, %r14174, %r14173; + cvt.rn.bf16x2.f32 %r13532, %r14176, %r14175; + cvt.rn.bf16x2.f32 %r13533, %r14178, %r14177; + cvt.rn.bf16x2.f32 %r13534, %r14180, %r14179; + cvt.rn.bf16x2.f32 %r13535, %r14182, %r14181; + cvt.rn.bf16x2.f32 %r13536, %r14184, %r14183; + cvt.rn.bf16x2.f32 %r13537, %r14186, %r14185; + cvt.rn.bf16x2.f32 %r13538, %r14188, %r14187; + cvt.rn.bf16x2.f32 %r13539, %r14190, %r14189; + cvt.rn.bf16x2.f32 %r13540, %r14192, %r14191; + cvt.rn.bf16x2.f32 %r13541, %r14194, %r14193; + cvt.rn.bf16x2.f32 %r13542, %r14196, %r14195; + cvt.rn.bf16x2.f32 %r13543, %r14198, %r14197; + cvt.rn.bf16x2.f32 %r13544, %r14200, %r14199; + cvt.rn.bf16x2.f32 %r13545, %r14202, %r14201; + cvt.rn.bf16x2.f32 %r13546, %r14204, %r14203; + cvt.rn.bf16x2.f32 %r13547, %r14206, %r14205; + shl.b32 %r13548, %r675, 13; + and.b32 %r13549, %r790, 7264; + and.b32 %r13550, %r7, 24; + shl.b32 %r13551, %r13550, 4; + shl.b32 %r13552, %r7, 2; + and.b32 %r13553, %r13552, 16; + or.b32 %r13554, %r13548, %r13553; + or.b32 %r13555, %r13549, %r13551; + or.b32 %r13556, %r13554, %r13555; + add.s32 %r13558, %r7117, %r13556; + st.shared.v4.b32 [%r13558], {%r13516, %r13518, %r13520, %r13522}; + st.shared.v4.b32 [%r13558+512], {%r13517, %r13519, %r13521, %r13523}; + xor.b32 %r13559, %r13556, 32; + add.s32 %r13560, %r7117, %r13559; + st.shared.v4.b32 [%r13560], {%r13524, %r13526, %r13528, %r13530}; + st.shared.v4.b32 [%r13560+512], {%r13525, %r13527, %r13529, %r13531}; + xor.b32 %r13561, %r13556, 64; + add.s32 %r13562, %r7117, %r13561; + st.shared.v4.b32 [%r13562], {%r13532, %r13534, %r13536, %r13538}; + st.shared.v4.b32 [%r13562+512], {%r13533, %r13535, %r13537, %r13539}; + xor.b32 %r13563, %r13556, 96; + add.s32 %r13564, %r7117, %r13563; + st.shared.v4.b32 [%r13564], {%r13540, %r13542, %r13544, %r13546}; + st.shared.v4.b32 [%r13564+512], {%r13541, %r13543, %r13545, %r13547}; + bar.sync 0; + shl.b32 %r13565, %r13550, 10; + shl.b32 %r13566, %r675, 5; + and.b32 %r13567, %r13552, 1008; + or.b32 %r13568, %r13565, %r13566; + xor.b32 %r13569, %r13568, %r13567; + add.s32 %r13376, %r7117, %r13569; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13412, %r13413, %r13414, %r13415}, [%r13376]; + // end inline asm + add.s32 %r13381, %r13376, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13416, %r13417, %r13418, %r13419}, [%r13381]; + // end inline asm + add.s32 %r13386, %r13376, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13420, %r13421, %r13422, %r13423}, [%r13386]; + // end inline asm + add.s32 %r13391, %r13376, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13424, %r13425, %r13426, %r13427}, [%r13391]; + // end inline asm + add.s32 %r13396, %r13376, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13428, %r13429, %r13430, %r13431}, [%r13396]; + // end inline asm + add.s32 %r13401, %r13376, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13432, %r13433, %r13434, %r13435}, [%r13401]; + // end inline asm + add.s32 %r13406, %r13376, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13436, %r13437, %r13438, %r13439}, [%r13406]; + // end inline asm + add.s32 %r13411, %r13376, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13440, %r13441, %r13442, %r13443}, [%r13411]; + // end inline asm + // begin inline asm + @%p986 st.global.v4.b32 [ %rd948 + 0 ], { %r13412, %r13413, %r13414, %r13415 }; + // end inline asm + // begin inline asm + @%p987 st.global.v4.b32 [ %rd949 + 0 ], { %r13416, %r13417, %r13418, %r13419 }; + // end inline asm + // begin inline asm + @%p988 st.global.v4.b32 [ %rd950 + 0 ], { %r13420, %r13421, %r13422, %r13423 }; + // end inline asm + // begin inline asm + @%p989 st.global.v4.b32 [ %rd951 + 0 ], { %r13424, %r13425, %r13426, %r13427 }; + // end inline asm + // begin inline asm + @%p990 st.global.v4.b32 [ %rd952 + 0 ], { %r13428, %r13429, %r13430, %r13431 }; + // end inline asm + // begin inline asm + @%p991 st.global.v4.b32 [ %rd953 + 0 ], { %r13432, %r13433, %r13434, %r13435 }; + // end inline asm + // begin inline asm + @%p992 st.global.v4.b32 [ %rd954 + 0 ], { %r13436, %r13437, %r13438, %r13439 }; + // end inline asm + // begin inline asm + @%p993 st.global.v4.b32 [ %rd955 + 0 ], { %r13440, %r13441, %r13442, %r13443 }; + // end inline asm + .loc 1 334 14 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:334:14 + mul.f32 %r13570, %r14207, 0f3DB504F3; + mul.f32 %r13571, %r14208, 0f3DB504F3; + mul.f32 %r13572, %r14209, 0f3DB504F3; + mul.f32 %r13573, %r14210, 0f3DB504F3; + mul.f32 %r13574, %r14211, 0f3DB504F3; + mul.f32 %r13575, %r14212, 0f3DB504F3; + mul.f32 %r13576, %r14213, 0f3DB504F3; + mul.f32 %r13577, %r14214, 0f3DB504F3; + mul.f32 %r13578, %r14215, 0f3DB504F3; + mul.f32 %r13579, %r14216, 0f3DB504F3; + mul.f32 %r13580, %r14217, 0f3DB504F3; + mul.f32 %r13581, %r14218, 0f3DB504F3; + mul.f32 %r13582, %r14219, 0f3DB504F3; + mul.f32 %r13583, %r14220, 0f3DB504F3; + mul.f32 %r13584, %r14221, 0f3DB504F3; + mul.f32 %r13585, %r14222, 0f3DB504F3; + mul.f32 %r13586, %r14223, 0f3DB504F3; + mul.f32 %r13587, %r14224, 0f3DB504F3; + mul.f32 %r13588, %r14225, 0f3DB504F3; + mul.f32 %r13589, %r14226, 0f3DB504F3; + mul.f32 %r13590, %r14227, 0f3DB504F3; + mul.f32 %r13591, %r14228, 0f3DB504F3; + mul.f32 %r13592, %r14229, 0f3DB504F3; + mul.f32 %r13593, %r14230, 0f3DB504F3; + mul.f32 %r13594, %r14231, 0f3DB504F3; + mul.f32 %r13595, %r14232, 0f3DB504F3; + mul.f32 %r13596, %r14233, 0f3DB504F3; + mul.f32 %r13597, %r14234, 0f3DB504F3; + mul.f32 %r13598, %r14235, 0f3DB504F3; + mul.f32 %r13599, %r14236, 0f3DB504F3; + mul.f32 %r13600, %r14237, 0f3DB504F3; + mul.f32 %r13601, %r14238, 0f3DB504F3; + mul.f32 %r13602, %r14239, 0f3DB504F3; + mul.f32 %r13603, %r14240, 0f3DB504F3; + mul.f32 %r13604, %r14241, 0f3DB504F3; + mul.f32 %r13605, %r14242, 0f3DB504F3; + mul.f32 %r13606, %r14243, 0f3DB504F3; + mul.f32 %r13607, %r14244, 0f3DB504F3; + mul.f32 %r13608, %r14245, 0f3DB504F3; + mul.f32 %r13609, %r14246, 0f3DB504F3; + mul.f32 %r13610, %r14247, 0f3DB504F3; + mul.f32 %r13611, %r14248, 0f3DB504F3; + mul.f32 %r13612, %r14249, 0f3DB504F3; + mul.f32 %r13613, %r14250, 0f3DB504F3; + mul.f32 %r13614, %r14251, 0f3DB504F3; + mul.f32 %r13615, %r14252, 0f3DB504F3; + mul.f32 %r13616, %r14253, 0f3DB504F3; + mul.f32 %r13617, %r14254, 0f3DB504F3; + mul.f32 %r13618, %r14255, 0f3DB504F3; + mul.f32 %r13619, %r14256, 0f3DB504F3; + mul.f32 %r13620, %r14257, 0f3DB504F3; + mul.f32 %r13621, %r14258, 0f3DB504F3; + mul.f32 %r13622, %r14259, 0f3DB504F3; + mul.f32 %r13623, %r14260, 0f3DB504F3; + mul.f32 %r13624, %r14261, 0f3DB504F3; + mul.f32 %r13625, %r14262, 0f3DB504F3; + mul.f32 %r13626, %r14263, 0f3DB504F3; + mul.f32 %r13627, %r14264, 0f3DB504F3; + mul.f32 %r13628, %r14265, 0f3DB504F3; + mul.f32 %r13629, %r14266, 0f3DB504F3; + mul.f32 %r13630, %r14267, 0f3DB504F3; + mul.f32 %r13631, %r14268, 0f3DB504F3; + mul.f32 %r13632, %r14269, 0f3DB504F3; + mul.f32 %r13633, %r14270, 0f3DB504F3; + .loc 1 344 27 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:344:27 + or.b64 %rd981, %rd70, %rd78; + cvt.u32.u64 %r13634, %rd981; + or.b64 %rd982, %rd71, %rd78; + cvt.u32.u64 %r13635, %rd982; + or.b64 %rd983, %rd72, %rd78; + cvt.u32.u64 %r13636, %rd983; + or.b64 %rd984, %rd73, %rd78; + cvt.u32.u64 %r13637, %rd984; + or.b64 %rd985, %rd74, %rd78; + cvt.u32.u64 %r13638, %rd985; + or.b64 %rd986, %rd75, %rd78; + cvt.u32.u64 %r13639, %rd986; + or.b64 %rd987, %rd76, %rd78; + cvt.u32.u64 %r13640, %rd987; + or.b64 %rd988, %rd77, %rd78; + cvt.u32.u64 %r13641, %rd988; + .loc 1 344 59 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:344:59 + add.s32 %r13642, %r6, %r13634; + add.s32 %r13643, %r6, %r13635; + add.s32 %r13644, %r6, %r13636; + add.s32 %r13645, %r6, %r13637; + add.s32 %r13646, %r6, %r13638; + add.s32 %r13647, %r6, %r13639; + add.s32 %r13648, %r6, %r13640; + add.s32 %r13649, %r6, %r13641; + .loc 1 345 29 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:345:29 + mad.wide.s32 %rd956, %r13642, 2, %rd171; + mad.wide.s32 %rd957, %r13643, 2, %rd171; + mad.wide.s32 %rd958, %r13644, 2, %rd171; + mad.wide.s32 %rd959, %r13645, 2, %rd171; + mad.wide.s32 %rd960, %r13646, 2, %rd171; + mad.wide.s32 %rd961, %r13647, 2, %rd171; + mad.wide.s32 %rd962, %r13648, 2, %rd171; + mad.wide.s32 %rd963, %r13649, 2, %rd171; + .loc 1 345 69 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:345:69 + cvt.rn.bf16x2.f32 %r13650, %r13571, %r13570; + cvt.rn.bf16x2.f32 %r13651, %r13573, %r13572; + cvt.rn.bf16x2.f32 %r13652, %r13575, %r13574; + cvt.rn.bf16x2.f32 %r13653, %r13577, %r13576; + cvt.rn.bf16x2.f32 %r13654, %r13579, %r13578; + cvt.rn.bf16x2.f32 %r13655, %r13581, %r13580; + cvt.rn.bf16x2.f32 %r13656, %r13583, %r13582; + cvt.rn.bf16x2.f32 %r13657, %r13585, %r13584; + cvt.rn.bf16x2.f32 %r13658, %r13587, %r13586; + cvt.rn.bf16x2.f32 %r13659, %r13589, %r13588; + cvt.rn.bf16x2.f32 %r13660, %r13591, %r13590; + cvt.rn.bf16x2.f32 %r13661, %r13593, %r13592; + cvt.rn.bf16x2.f32 %r13662, %r13595, %r13594; + cvt.rn.bf16x2.f32 %r13663, %r13597, %r13596; + cvt.rn.bf16x2.f32 %r13664, %r13599, %r13598; + cvt.rn.bf16x2.f32 %r13665, %r13601, %r13600; + cvt.rn.bf16x2.f32 %r13666, %r13603, %r13602; + cvt.rn.bf16x2.f32 %r13667, %r13605, %r13604; + cvt.rn.bf16x2.f32 %r13668, %r13607, %r13606; + cvt.rn.bf16x2.f32 %r13669, %r13609, %r13608; + cvt.rn.bf16x2.f32 %r13670, %r13611, %r13610; + cvt.rn.bf16x2.f32 %r13671, %r13613, %r13612; + cvt.rn.bf16x2.f32 %r13672, %r13615, %r13614; + cvt.rn.bf16x2.f32 %r13673, %r13617, %r13616; + cvt.rn.bf16x2.f32 %r13674, %r13619, %r13618; + cvt.rn.bf16x2.f32 %r13675, %r13621, %r13620; + cvt.rn.bf16x2.f32 %r13676, %r13623, %r13622; + cvt.rn.bf16x2.f32 %r13677, %r13625, %r13624; + cvt.rn.bf16x2.f32 %r13678, %r13627, %r13626; + cvt.rn.bf16x2.f32 %r13679, %r13629, %r13628; + cvt.rn.bf16x2.f32 %r13680, %r13631, %r13630; + cvt.rn.bf16x2.f32 %r13681, %r13633, %r13632; + bar.sync 0; + st.shared.v4.b32 [%r13558], {%r13650, %r13652, %r13654, %r13656}; + st.shared.v4.b32 [%r13558+512], {%r13651, %r13653, %r13655, %r13657}; + st.shared.v4.b32 [%r13560], {%r13658, %r13660, %r13662, %r13664}; + st.shared.v4.b32 [%r13560+512], {%r13659, %r13661, %r13663, %r13665}; + st.shared.v4.b32 [%r13562], {%r13666, %r13668, %r13670, %r13672}; + st.shared.v4.b32 [%r13562+512], {%r13667, %r13669, %r13671, %r13673}; + st.shared.v4.b32 [%r13564], {%r13674, %r13676, %r13678, %r13680}; + st.shared.v4.b32 [%r13564+512], {%r13675, %r13677, %r13679, %r13681}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13484, %r13485, %r13486, %r13487}, [%r13376]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13488, %r13489, %r13490, %r13491}, [%r13381]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13492, %r13493, %r13494, %r13495}, [%r13386]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13496, %r13497, %r13498, %r13499}, [%r13391]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13500, %r13501, %r13502, %r13503}, [%r13396]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13504, %r13505, %r13506, %r13507}, [%r13401]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13508, %r13509, %r13510, %r13511}, [%r13406]; + // end inline asm + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r13512, %r13513, %r13514, %r13515}, [%r13411]; + // end inline asm + // begin inline asm + @%p986 st.global.v4.b32 [ %rd956 + 0 ], { %r13484, %r13485, %r13486, %r13487 }; + // end inline asm + // begin inline asm + @%p987 st.global.v4.b32 [ %rd957 + 0 ], { %r13488, %r13489, %r13490, %r13491 }; + // end inline asm + // begin inline asm + @%p988 st.global.v4.b32 [ %rd958 + 0 ], { %r13492, %r13493, %r13494, %r13495 }; + // end inline asm + // begin inline asm + @%p989 st.global.v4.b32 [ %rd959 + 0 ], { %r13496, %r13497, %r13498, %r13499 }; + // end inline asm + // begin inline asm + @%p990 st.global.v4.b32 [ %rd960 + 0 ], { %r13500, %r13501, %r13502, %r13503 }; + // end inline asm + // begin inline asm + @%p991 st.global.v4.b32 [ %rd961 + 0 ], { %r13504, %r13505, %r13506, %r13507 }; + // end inline asm + // begin inline asm + @%p992 st.global.v4.b32 [ %rd962 + 0 ], { %r13508, %r13509, %r13510, %r13511 }; + // end inline asm + // begin inline asm + @%p993 st.global.v4.b32 [ %rd963 + 0 ], { %r13512, %r13513, %r13514, %r13515 }; + // end inline asm +$L__BB0_17: + .loc 1 139 4 // cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py:139:4 + ret; +$L__tmp38: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 5 // DW_FORM_data2 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 6 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 428 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x1a5 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 110 +.b8 106 +.b8 117 +.b8 112 +.b8 114 +.b8 55 +.b8 104 +.b8 50 +.b8 55 +.b8 110 +.b8 116 +.b8 120 +.b8 102 +.b8 117 +.b8 113 +.b8 121 +.b8 55 +.b8 102 +.b8 102 +.b8 111 +.b8 113 +.b8 110 +.b8 55 +.b8 50 +.b8 115 +.b8 105 +.b8 110 +.b8 97 +.b8 50 +.b8 114 +.b8 122 +.b8 121 +.b8 52 +.b8 102 +.b8 120 +.b8 115 +.b8 101 +.b8 120 +.b8 112 +.b8 50 +.b8 115 +.b8 116 +.b8 51 +.b8 100 +.b8 115 +.b8 115 +.b8 50 +.b8 106 +.b8 122 +.b8 101 +.b8 119 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 110 +.b8 106 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x109 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 112 // DW_AT_call_line +.b8 36 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xd3:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0xec:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp4 // DW_AT_low_pc +.b64 $L__tmp5 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 1 // DW_AT_call_line +.b8 1 +.b8 107 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x105:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp6 // DW_AT_low_pc +.b64 $L__tmp18 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 1 +.b8 16 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x11e:0x19 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp10 // DW_AT_low_pc +.b64 $L__tmp19 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 62 // DW_AT_call_line +.b8 1 +.b8 20 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x137:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp20 // DW_AT_low_pc +.b64 $L__tmp21 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 178 // DW_AT_call_line +.b8 107 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14f:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp21 // DW_AT_low_pc +.b64 $L__tmp22 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 179 // DW_AT_call_line +.b8 111 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x167:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp23 // DW_AT_low_pc +.b64 $L__tmp33 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 207 // DW_AT_call_line +.b8 12 // DW_AT_call_column +.b8 6 // Abbrev [6] 0x17f:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp31 // DW_AT_low_pc +.b64 $L__tmp32 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 4 // Abbrev [4] 0x196:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp34 // DW_AT_low_pc +.b64 $L__tmp37 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 226 // DW_AT_call_line +.b8 16 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..8ec0ba6b105d1c5de7b4d47fb35925a5c24f80ab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.source @@ -0,0 +1,2270 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":18:0) +#loc216 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":812:0) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":348:0) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":423:0) +#loc344 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":797:0) +#loc348 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":781:0) +#loc369 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":559:0) +#loc400 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":634:0) +#loc479 = loc("arg_Q"(#loc)) +#loc480 = loc("arg_K"(#loc)) +#loc481 = loc("arg_V"(#loc)) +#loc482 = loc("arg_LSE"(#loc)) +#loc483 = loc("arg_DELTA"(#loc)) +#loc484 = loc("arg_DO"(#loc)) +#loc485 = loc("arg_DQ"(#loc)) +#loc486 = loc("arg_DV"(#loc)) +#loc487 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc488 = loc("arg_KV_IDX"(#loc)) +#loc489 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc490 = loc("arg_Q_IDX"(#loc)) +#loc491 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc492 = loc("arg_FULL_KV_IDX"(#loc)) +#loc493 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc494 = loc("arg_FULL_Q_IDX"(#loc)) +#loc495 = loc("in_ptr16"(#loc)) +#loc496 = loc("out_ptr0"(#loc)) +#loc497 = loc("ks0"(#loc)) +#loc498 = loc("ks1"(#loc)) +#loc499 = loc("ks2"(#loc)) +#loc500 = loc("ks3"(#loc)) +#loc686 = loc("x"(#loc216)) +#loc687 = loc("ptr"(#loc226)) +#loc688 = loc("offs_m"(#loc226)) +#loc689 = loc("offs_n"(#loc226)) +#loc690 = loc("stride_m"(#loc226)) +#loc691 = loc("stride_n"(#loc226)) +#loc692 = loc("M_LEN"(#loc226)) +#loc699 = loc("arg_Q"(#loc238)) +#loc700 = loc("arg_K"(#loc238)) +#loc701 = loc("arg_V"(#loc238)) +#loc702 = loc("arg_LSE"(#loc238)) +#loc703 = loc("arg_DELTA"(#loc238)) +#loc704 = loc("arg_DO"(#loc238)) +#loc705 = loc("arg_DQ"(#loc238)) +#loc706 = loc("arg_DV"(#loc238)) +#loc707 = loc("arg_KV_NUM_BLKS"(#loc238)) +#loc708 = loc("arg_KV_IDX"(#loc238)) +#loc709 = loc("arg_Q_NUM_BLKS"(#loc238)) +#loc710 = loc("arg_Q_IDX"(#loc238)) +#loc711 = loc("arg_FULL_KV_NUM_BLKS"(#loc238)) +#loc712 = loc("arg_FULL_KV_IDX"(#loc238)) +#loc713 = loc("arg_FULL_Q_NUM_BLKS"(#loc238)) +#loc714 = loc("arg_FULL_Q_IDX"(#loc238)) +#loc715 = loc("in_ptr16"(#loc238)) +#loc716 = loc("out_ptr0"(#loc238)) +#loc717 = loc("ks0"(#loc238)) +#loc718 = loc("ks1"(#loc238)) +#loc719 = loc("ks2"(#loc238)) +#loc720 = loc("ks3"(#loc238)) +#loc721 = loc("K"(#loc238)) +#loc722 = loc("V"(#loc238)) +#loc723 = loc("dq"(#loc238)) +#loc724 = loc("q"(#loc238)) +#loc725 = loc("do"(#loc238)) +#loc726 = loc("Di"(#loc238)) +#loc727 = loc("lse"(#loc238)) +#loc728 = loc("off_z"(#loc238)) +#loc729 = loc("off_hq"(#loc238)) +#loc730 = loc("offs_m2"(#loc238)) +#loc731 = loc("offs_n2"(#loc238)) +#loc732 = loc("stride_kn"(#loc238)) +#loc733 = loc("stride_kd"(#loc238)) +#loc734 = loc("stride_vn"(#loc238)) +#loc735 = loc("stride_vd"(#loc238)) +#loc736 = loc("kv_indices"(#loc238)) +#loc737 = loc("sparse_kv_num_blocks"(#loc238)) +#loc765 = loc("arg_Q"(#loc269)) +#loc766 = loc("arg_K"(#loc269)) +#loc767 = loc("arg_V"(#loc269)) +#loc768 = loc("arg_LSE"(#loc269)) +#loc769 = loc("arg_DELTA"(#loc269)) +#loc770 = loc("arg_DO"(#loc269)) +#loc771 = loc("arg_DQ"(#loc269)) +#loc772 = loc("arg_DV"(#loc269)) +#loc773 = loc("arg_KV_NUM_BLKS"(#loc269)) +#loc774 = loc("arg_KV_IDX"(#loc269)) +#loc775 = loc("arg_Q_NUM_BLKS"(#loc269)) +#loc776 = loc("arg_Q_IDX"(#loc269)) +#loc777 = loc("arg_FULL_KV_NUM_BLKS"(#loc269)) +#loc778 = loc("arg_FULL_KV_IDX"(#loc269)) +#loc779 = loc("arg_FULL_Q_NUM_BLKS"(#loc269)) +#loc780 = loc("arg_FULL_Q_IDX"(#loc269)) +#loc781 = loc("in_ptr16"(#loc269)) +#loc782 = loc("out_ptr0"(#loc269)) +#loc783 = loc("ks0"(#loc269)) +#loc784 = loc("ks1"(#loc269)) +#loc785 = loc("ks2"(#loc269)) +#loc786 = loc("ks3"(#loc269)) +#loc787 = loc("dq"(#loc269)) +#loc788 = loc("q"(#loc269)) +#loc789 = loc("kT_ptrs"(#loc269)) +#loc790 = loc("vT_ptrs"(#loc269)) +#loc791 = loc("do"(#loc269)) +#loc792 = loc("Di"(#loc269)) +#loc793 = loc("lse"(#loc269)) +#loc794 = loc("Q_LEN"(#loc269)) +#loc795 = loc("KV_LEN"(#loc269)) +#loc796 = loc("off_z"(#loc269)) +#loc797 = loc("off_hq"(#loc269)) +#loc798 = loc("offs_m2"(#loc269)) +#loc799 = loc("offs_n2"(#loc269)) +#loc800 = loc("offs_k"(#loc269)) +#loc801 = loc("offs_v"(#loc269)) +#loc802 = loc("stride_kn"(#loc269)) +#loc803 = loc("stride_kd"(#loc269)) +#loc804 = loc("stride_vn"(#loc269)) +#loc805 = loc("stride_vd"(#loc269)) +#loc806 = loc("kv_indices"(#loc269)) +#loc807 = loc("sparse_kv_num_blocks"(#loc269)) +#loc876 = loc("N_LEN"(#loc226)) +#loc877 = loc("indices"(#loc344)) +#loc878 = loc("max_len"(#loc344)) +#loc879 = loc("loop_iter"(#loc348)) +#loc880 = loc("col_indices"(#loc348)) +#loc881 = loc("total_blocks"(#loc348)) +#loc900 = loc("arg_Q"(#loc369)) +#loc901 = loc("arg_K"(#loc369)) +#loc902 = loc("arg_V"(#loc369)) +#loc903 = loc("arg_LSE"(#loc369)) +#loc904 = loc("arg_DELTA"(#loc369)) +#loc905 = loc("arg_DO"(#loc369)) +#loc906 = loc("arg_DQ"(#loc369)) +#loc907 = loc("arg_DV"(#loc369)) +#loc908 = loc("arg_KV_NUM_BLKS"(#loc369)) +#loc909 = loc("arg_KV_IDX"(#loc369)) +#loc910 = loc("arg_Q_NUM_BLKS"(#loc369)) +#loc911 = loc("arg_Q_IDX"(#loc369)) +#loc912 = loc("arg_FULL_KV_NUM_BLKS"(#loc369)) +#loc913 = loc("arg_FULL_KV_IDX"(#loc369)) +#loc914 = loc("arg_FULL_Q_NUM_BLKS"(#loc369)) +#loc915 = loc("arg_FULL_Q_IDX"(#loc369)) +#loc916 = loc("in_ptr16"(#loc369)) +#loc917 = loc("out_ptr0"(#loc369)) +#loc918 = loc("ks0"(#loc369)) +#loc919 = loc("ks1"(#loc369)) +#loc920 = loc("ks2"(#loc369)) +#loc921 = loc("ks3"(#loc369)) +#loc922 = loc("Q"(#loc369)) +#loc923 = loc("DO"(#loc369)) +#loc924 = loc("DELTA"(#loc369)) +#loc925 = loc("LSE"(#loc369)) +#loc926 = loc("dk"(#loc369)) +#loc927 = loc("dv"(#loc369)) +#loc928 = loc("k"(#loc369)) +#loc929 = loc("v"(#loc369)) +#loc930 = loc("off_z"(#loc369)) +#loc931 = loc("off_hq"(#loc369)) +#loc932 = loc("offs_n1"(#loc369)) +#loc933 = loc("offs_m1"(#loc369)) +#loc934 = loc("stride_qm"(#loc369)) +#loc935 = loc("stride_qd"(#loc369)) +#loc936 = loc("stride_dom"(#loc369)) +#loc937 = loc("stride_dod"(#loc369)) +#loc938 = loc("q_indices"(#loc369)) +#loc939 = loc("sparse_q_num_blocks"(#loc369)) +#loc966 = loc("arg_Q"(#loc400)) +#loc967 = loc("arg_K"(#loc400)) +#loc968 = loc("arg_V"(#loc400)) +#loc969 = loc("arg_LSE"(#loc400)) +#loc970 = loc("arg_DELTA"(#loc400)) +#loc971 = loc("arg_DO"(#loc400)) +#loc972 = loc("arg_DQ"(#loc400)) +#loc973 = loc("arg_DV"(#loc400)) +#loc974 = loc("arg_KV_NUM_BLKS"(#loc400)) +#loc975 = loc("arg_KV_IDX"(#loc400)) +#loc976 = loc("arg_Q_NUM_BLKS"(#loc400)) +#loc977 = loc("arg_Q_IDX"(#loc400)) +#loc978 = loc("arg_FULL_KV_NUM_BLKS"(#loc400)) +#loc979 = loc("arg_FULL_KV_IDX"(#loc400)) +#loc980 = loc("arg_FULL_Q_NUM_BLKS"(#loc400)) +#loc981 = loc("arg_FULL_Q_IDX"(#loc400)) +#loc982 = loc("in_ptr16"(#loc400)) +#loc983 = loc("out_ptr0"(#loc400)) +#loc984 = loc("ks0"(#loc400)) +#loc985 = loc("ks1"(#loc400)) +#loc986 = loc("ks2"(#loc400)) +#loc987 = loc("ks3"(#loc400)) +#loc988 = loc("dk"(#loc400)) +#loc989 = loc("dv"(#loc400)) +#loc990 = loc("qT_ptrs"(#loc400)) +#loc991 = loc("k"(#loc400)) +#loc992 = loc("v"(#loc400)) +#loc993 = loc("do_ptrs"(#loc400)) +#loc994 = loc("DELTA"(#loc400)) +#loc995 = loc("LSE"(#loc400)) +#loc996 = loc("Q_LEN"(#loc400)) +#loc997 = loc("KV_LEN"(#loc400)) +#loc998 = loc("off_z"(#loc400)) +#loc999 = loc("off_hq"(#loc400)) +#loc1000 = loc("offs_n1"(#loc400)) +#loc1001 = loc("offs_m1"(#loc400)) +#loc1002 = loc("offs_k"(#loc400)) +#loc1003 = loc("offs_v"(#loc400)) +#loc1004 = loc("stride_qm"(#loc400)) +#loc1005 = loc("stride_qd"(#loc400)) +#loc1006 = loc("stride_dom"(#loc400)) +#loc1007 = loc("stride_dod"(#loc400)) +#loc1008 = loc("q_indices"(#loc400)) +#loc1009 = loc("sparse_q_num_blocks"(#loc400)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c1024_i32_0 = arith.constant 1024 : i32 loc(#loc2) + %0 = arith.muli %c1024_i32_0, %ks0 : i32 loc(#loc2) + %c128_i32_1 = arith.constant 128 : i32 loc(#loc3) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc3) + %1 = arith.muli %c128_i32_2, %ks0 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c1_i32_4 = arith.constant 1 : i32 loc(#loc4) + %c1024_i32_5 = arith.constant 1024 : i32 loc(#loc5) + %c1024_i32_6 = arith.constant 1024 : i32 loc(#loc5) + %2 = arith.muli %c1024_i32_6, %ks0 : i32 loc(#loc5) + %c128_i32_7 = arith.constant 128 : i32 loc(#loc6) + %c128_i32_8 = arith.constant 128 : i32 loc(#loc6) + %3 = arith.muli %c128_i32_8, %ks0 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c1_i32_10 = arith.constant 1 : i32 loc(#loc7) + %c8388608_i32_11 = arith.constant 8388608 : i32 loc(#loc8) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc8) + %c128_i32_12 = arith.constant 128 : i32 loc(#loc8) + %c1_i32_13 = arith.constant 1 : i32 loc(#loc8) + %c8388608_i32_14 = arith.constant 8388608 : i32 loc(#loc9) + %c128_i32_15 = arith.constant 128 : i32 loc(#loc9) + %c4096_i32_16 = arith.constant 4096 : i32 loc(#loc9) + %c1_i32_17 = arith.constant 1 : i32 loc(#loc9) + %c1024_i32_18 = arith.constant 1024 : i32 loc(#loc10) + %c1024_i32_19 = arith.constant 1024 : i32 loc(#loc10) + %4 = arith.muli %c1024_i32_19, %ks0 : i32 loc(#loc10) + %c128_i32_20 = arith.constant 128 : i32 loc(#loc11) + %c128_i32_21 = arith.constant 128 : i32 loc(#loc11) + %5 = arith.muli %c128_i32_21, %ks0 : i32 loc(#loc11) + %c128_i32_22 = arith.constant 128 : i32 loc(#loc12) + %c1_i32_23 = arith.constant 1 : i32 loc(#loc12) + %ZQ = arith.constant 8 : i32 loc(#loc501) + %HQ = arith.constant 32 : i32 loc(#loc502) + %HKV = arith.constant 8 : i32 loc(#loc503) + %Q_LEN = arith.constant 2048 : i32 loc(#loc504) + %ZKV = arith.constant 8 : i32 loc(#loc505) + %pid = tt.get_program_id x : i32 loc(#loc506) + %NUM_KV_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%ks0) : (i32) -> i32 loc(#loc507) + %NUM_Q_BLOCKS = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%Q_LEN) : (i32) -> i32 loc(#loc508) + %off_zq = tt.get_program_id y : i32 loc(#loc509) + %off_hkv = tt.get_program_id z : i32 loc(#loc510) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc511) + %SPARSE_Z = arith.constant 8 : i32 loc(#loc512) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc513) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc514) + %k_adj = arith.muli %1, %off_hkv : i32 loc(#loc515) + %k_adj_24 = arith.muli %0, %off_zkv : i32 loc(#loc516) + %k_adj_25 = arith.addi %k_adj, %k_adj_24 : i32 loc(#loc517) + %k_adj_26 = arith.extsi %k_adj_25 : i32 to i64 loc(#loc518) + %v_adj = arith.muli %3, %off_hkv : i32 loc(#loc519) + %v_adj_27 = arith.muli %2, %off_zkv : i32 loc(#loc520) + %v_adj_28 = arith.addi %v_adj, %v_adj_27 : i32 loc(#loc521) + %v_adj_29 = arith.extsi %v_adj_28 : i32 to i64 loc(#loc522) + %dv_adj = arith.muli %5, %off_hkv : i32 loc(#loc523) + %dv_adj_30 = arith.muli %4, %off_zq : i32 loc(#loc524) + %dv_adj_31 = arith.addi %dv_adj, %dv_adj_30 : i32 loc(#loc525) + %dv_adj_32 = arith.extsi %dv_adj_31 : i32 to i64 loc(#loc526) + %K = tt.addptr %arg_K, %k_adj_26 : !tt.ptr, i64 loc(#loc527) + %V = tt.addptr %arg_V, %v_adj_29 : !tt.ptr, i64 loc(#loc528) + %DV = tt.addptr %arg_DV, %dv_adj_32 : !tt.ptr, i64 loc(#loc529) + %RCP_LN2 = arith.constant 1.44269502 : f32 loc(#loc530) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc531) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc532) + %6 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS : i32 loc(#loc45) + %7:2 = scf.if %6 -> (i32, i32) { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS : i32 loc(#loc533) + %SPARSE_Q_MULTIPLE = arith.constant 1 : i32 loc(#loc1086) + %SPARSE_KV_MULTIPLE = arith.constant 2 : i32 loc(#loc1087) + %off_hq2 = arith.divsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc536) + %off_hq2_33 = arith.constant 4 : i32 loc(#loc537) + %off_hq2_34 = arith.constant 4 : i32 loc(#loc537) + %off_hq2_35 = arith.muli %off_hkv, %off_hq2_34 : i32 loc(#loc537) + %off_hq2_36 = arith.addi %off_hq2, %off_hq2_35 : i32 loc(#loc538) + %start_m2_block = arith.remsi %off_pid, %NUM_Q_BLOCKS : i32 loc(#loc539) + %off_pid_mask = arith.divsi %start_m2_block, %SPARSE_Q_MULTIPLE : i32 loc(#loc540) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc541) + %stride_kv_idx_h = arith.constant 16 : i32 loc(#loc542) + %stride_kv_idx_h_37 = arith.constant 16 : i32 loc(#loc542) + %stride_kv_idx_h_38 = arith.muli %stride_kv_idx_h_37, %ks1 : i32 loc(#loc542) + %sparse_idx_hq2 = arith.remsi %off_hq2_36, %SPARSE_HQ : i32 loc(#loc543) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc544) + %sparse_hz_offset_39 = arith.addi %sparse_hz_offset, %sparse_idx_hq2 : i32 loc(#loc545) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_39, %stride_kv_num_blks_h : i32 loc(#loc546) + %sparse_kv_num_blks_offset_40 = arith.addi %sparse_kv_num_blks_offset, %off_pid_mask : i32 loc(#loc547) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_39, %stride_kv_idx_h_38 : i32 loc(#loc548) + %sparse_kv_idx_offset_41 = arith.muli %off_pid_mask, %ks1 : i32 loc(#loc549) + %sparse_kv_idx_offset_42 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_41 : i32 loc(#loc550) + %q_adj2 = arith.muli %c128_i32, %off_hq2_36 : i32 loc(#loc551) + %q_adj2_43 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc552) + %q_adj2_44 = arith.addi %q_adj2, %q_adj2_43 : i32 loc(#loc553) + %q_adj2_45 = arith.extsi %q_adj2_44 : i32 to i64 loc(#loc554) + %do_adj2 = arith.muli %c262144_i32, %off_hq2_36 : i32 loc(#loc555) + %do_adj2_46 = arith.muli %c8388608_i32_11, %off_zq : i32 loc(#loc556) + %do_adj2_47 = arith.addi %do_adj2, %do_adj2_46 : i32 loc(#loc557) + %do_adj2_48 = arith.extsi %do_adj2_47 : i32 to i64 loc(#loc558) + %dq_adj2 = arith.muli %c128_i32_15, %off_hq2_36 : i32 loc(#loc559) + %dq_adj2_49 = arith.muli %c8388608_i32_14, %off_zq : i32 loc(#loc560) + %dq_adj2_50 = arith.addi %dq_adj2, %dq_adj2_49 : i32 loc(#loc561) + %dq_adj2_51 = arith.extsi %dq_adj2_50 : i32 to i64 loc(#loc562) + %off_chz2 = arith.muli %off_zq, %HQ : i32 loc(#loc563) + %off_chz2_52 = arith.addi %off_chz2, %off_hq2_36 : i32 loc(#loc564) + %off_chz2_53 = arith.muli %off_chz2_52, %Q_LEN : i32 loc(#loc565) + %off_chz2_54 = arith.extsi %off_chz2_53 : i32 to i64 loc(#loc566) + %Q2 = tt.addptr %arg_Q, %q_adj2_45 : !tt.ptr, i64 loc(#loc567) + %DO2 = tt.addptr %arg_DO, %do_adj2_48 : !tt.ptr, i64 loc(#loc568) + %DQ2 = tt.addptr %arg_DQ, %dq_adj2_51 : !tt.ptr, i64 loc(#loc569) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_54 : !tt.ptr, i64 loc(#loc570) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_54 : !tt.ptr, i64 loc(#loc571) + %dq = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc572) + %start_m2 = arith.constant 128 : i32 loc(#loc573) + %start_m2_55 = arith.constant 128 : i32 loc(#loc573) + %start_m2_56 = arith.muli %start_m2_block, %start_m2_55 : i32 loc(#loc573) + %offs_m2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc574) + %offs_m2_57 = tt.splat %start_m2_56 : i32 -> tensor<128xi32> loc(#loc575) + %offs_m2_58 = arith.addi %offs_m2_57, %offs_m2 : tensor<128xi32> loc(#loc575) + %q = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q2, %offs_m2_58, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc576) + %do = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%DO2, %offs_m2_58, %offs_v, %c128_i32_12, %c1_i32_13, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc577) + %Di = arith.constant dense<2048> : tensor<128xi32> loc(#loc578) + %Di_59 = arith.cmpi slt, %offs_m2_58, %Di : tensor<128xi32> loc(#loc578) + %Di_60 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc579) + %Di_61 = tt.addptr %Di_60, %offs_m2_58 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc579) + %Di_62 = tt.load %Di_61, %Di_59 : tensor<128x!tt.ptr> loc(#loc580) + %lse = arith.constant dense<2048> : tensor<128xi32> loc(#loc581) + %lse_63 = arith.cmpi slt, %offs_m2_58, %lse : tensor<128xi32> loc(#loc581) + %lse_64 = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc582) + %lse_65 = tt.addptr %lse_64, %offs_m2_58 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc582) + %lse_66 = tt.load %lse_65, %lse_63 : tensor<128x!tt.ptr> loc(#loc583) + %lse_67 = arith.constant 0xFF800000 : f32 loc(#loc584) + %lse_68 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc584) + %lse_69 = arith.cmpf oeq, %lse_66, %lse_68 : tensor<128xf32> loc(#loc584) + %lse_70 = arith.constant 0.000000e+00 : f32 loc(#loc585) + %lse_71 = arith.constant 0.000000e+00 : f32 loc(#loc585) + %lse_72 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc585) + %lse_73 = arith.select %lse_69, %lse_72, %lse_66 : tensor<128xi1>, tensor<128xf32> loc(#loc585) + %lse_74 = tt.expand_dims %lse_73 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc586) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_42 : !tt.ptr, i32 loc(#loc587) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc588) + %kv_start_75 = arith.constant 128 : i32 loc(#loc589) + %kv_start_76 = arith.constant 128 : i32 loc(#loc589) + %kv_start_77 = arith.muli %kv_start, %kv_start_76 : i32 loc(#loc589) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_40 : !tt.ptr, i32 loc(#loc590) + %sparse_kv_num_blocks_78 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc591) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc592) + %offs_n2_79 = tt.splat %kv_start_77 : i32 -> tensor<64xi32> loc(#loc593) + %offs_n2_80 = arith.addi %offs_n2_79, %offs_n2 : tensor<64xi32> loc(#loc593) + %dq_81 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %K, %V, %dq, %q, %do, %Di_62, %lse_74, %off_zq, %off_hq2_36, %offs_m2_58, %offs_n2_80, %c128_i32_3, %c1_i32_4, %c128_i32_9, %c1_i32_10, %kv_indices, %sparse_kv_num_blocks_78) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc594) + %kv_indices_82 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_42 : !tt.ptr, i32 loc(#loc595) + %kv_start_83 = tt.load %kv_indices_82 : !tt.ptr loc(#loc596) + %kv_start_84 = arith.constant 128 : i32 loc(#loc597) + %kv_start_85 = arith.constant 128 : i32 loc(#loc597) + %kv_start_86 = arith.muli %kv_start_83, %kv_start_85 : i32 loc(#loc597) + %sparse_kv_num_blocks_87 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_40 : !tt.ptr, i32 loc(#loc598) + %sparse_kv_num_blocks_88 = tt.load %sparse_kv_num_blocks_87 : !tt.ptr loc(#loc599) + %offs_n2_89 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc600) + %offs_n2_90 = tt.splat %kv_start_86 : i32 -> tensor<64xi32> loc(#loc601) + %offs_n2_91 = arith.addi %offs_n2_90, %offs_n2_89 : tensor<64xi32> loc(#loc601) + %dq_92 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %K, %V, %dq_81, %q, %do, %Di_62, %lse_74, %off_zq, %off_hq2_36, %offs_m2_58, %offs_n2_91, %c128_i32_3, %c1_i32_4, %c128_i32_9, %c1_i32_10, %kv_indices_82, %sparse_kv_num_blocks_88) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc602) + %dq_ptrs = tt.expand_dims %offs_m2_58 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc603) + %dq_ptrs_93 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc604) + %dq_ptrs_94 = arith.muli %dq_ptrs, %dq_ptrs_93 : tensor<128x1xi32> loc(#loc604) + %dq_ptrs_95 = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc605) + %dq_ptrs_96 = tt.addptr %dq_ptrs_95, %dq_ptrs_94 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc605) + %dq_ptrs_97 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc606) + %dq_ptrs_98 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc607) + %dq_ptrs_99 = arith.muli %dq_ptrs_97, %dq_ptrs_98 : tensor<1x128xi32> loc(#loc607) + %dq_ptrs_100 = tt.broadcast %dq_ptrs_96 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc608) + %dq_ptrs_101 = tt.broadcast %dq_ptrs_99 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc608) + %dq_ptrs_102 = tt.addptr %dq_ptrs_100, %dq_ptrs_101 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc608) + %dq_103 = arith.constant 0.0883883461 : f32 loc(#loc609) + %dq_104 = arith.constant 0.0883883461 : f32 loc(#loc609) + %dq_105 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc609) + %dq_106 = arith.mulf %dq_92, %dq_105 : tensor<128x128xf32> loc(#loc609) + %8 = tt.expand_dims %offs_m2_58 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc124) + %cst = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc125) + %9 = arith.cmpi slt, %8, %cst : tensor<128x1xi32> loc(#loc125) + %10 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc126) + %c128_i32_107 = arith.constant 128 : i32 loc(#loc127) + %cst_108 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc127) + %11 = arith.cmpi slt, %10, %cst_108 : tensor<1x128xi32> loc(#loc127) + %12 = tt.broadcast %9 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc128) + %13 = tt.broadcast %11 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc128) + %14 = arith.andi %12, %13 : tensor<128x128xi1> loc(#loc128) + %15 = arith.truncf %dq_106 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc129) + tt.store %dq_ptrs_102, %15, %14 : tensor<128x128x!tt.ptr> loc(#loc129) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc129) + } else { + %SPARSE_Q_MULTIPLE = arith.constant 2 : i32 loc(#loc1088) + %SPARSE_KV_MULTIPLE = arith.constant 1 : i32 loc(#loc1089) + %pid_mask = arith.divsi %pid, %SPARSE_KV_MULTIPLE : i32 loc(#loc612) + %stride_q_idx_h = arith.constant 16 : i32 loc(#loc613) + %stride_q_idx_h_33 = arith.constant 16 : i32 loc(#loc613) + %stride_q_idx_h_34 = arith.muli %stride_q_idx_h_33, %ks3 : i32 loc(#loc613) + %stride_q_idx_n = arith.constant 16 : i32 loc(#loc614) + %dv = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc615) + %dk = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc616) + %start_n1 = arith.constant 128 : i32 loc(#loc617) + %start_n1_35 = arith.constant 128 : i32 loc(#loc617) + %start_n1_36 = arith.muli %pid, %start_n1_35 : i32 loc(#loc617) + %offs_n1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc618) + %offs_n1_37 = tt.splat %start_n1_36 : i32 -> tensor<128xi32> loc(#loc619) + %offs_n1_38 = arith.addi %offs_n1_37, %offs_n1 : tensor<128xi32> loc(#loc619) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n1_38, %offs_k, %c128_i32_3, %c1_i32_4, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc620) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n1_38, %offs_v, %c128_i32_9, %c1_i32_10, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc621) + %c0_i32 = arith.constant 0 : i32 loc(#loc142) + %c4_i32 = arith.constant 4 : i32 loc(#loc142) + %c1_i32_39 = arith.constant 1 : i32 loc(#loc142) + %8 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc142) + %9 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc142) + %10 = arith.bitcast %c1_i32_39 : i32 to i32 loc(#loc142) + %11 = ub.poison : i32 loc(#loc142) + %dk_40:2 = scf.for %off_g = %8 to %9 step %10 iter_args(%dv_75 = %dv, %dk_76 = %dk) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.constant 4 : i32 loc(#loc623) + %off_hq1_77 = arith.constant 4 : i32 loc(#loc623) + %off_hq1_78 = arith.muli %off_hkv, %off_hq1_77 : i32 loc(#loc623) + %off_hq1_79 = arith.addi %off_hq1_78, %off_g : i32 loc(#loc624) + %q_adj1 = arith.muli %c128_i32, %off_hq1_79 : i32 loc(#loc625) + %q_adj1_80 = arith.muli %c8388608_i32, %off_zq : i32 loc(#loc626) + %q_adj1_81 = arith.addi %q_adj1, %q_adj1_80 : i32 loc(#loc627) + %q_adj1_82 = arith.extsi %q_adj1_81 : i32 to i64 loc(#loc628) + %do_adj1 = arith.muli %c262144_i32, %off_hq1_79 : i32 loc(#loc629) + %do_adj1_83 = arith.muli %c8388608_i32_11, %off_zq : i32 loc(#loc630) + %do_adj1_84 = arith.addi %do_adj1, %do_adj1_83 : i32 loc(#loc631) + %do_adj1_85 = arith.extsi %do_adj1_84 : i32 to i64 loc(#loc632) + %dq_adj1 = arith.muli %c128_i32_15, %off_hq1_79 : i32 loc(#loc633) + %dq_adj1_86 = arith.muli %c8388608_i32_14, %off_zq : i32 loc(#loc634) + %dq_adj1_87 = arith.addi %dq_adj1, %dq_adj1_86 : i32 loc(#loc635) + %dq_adj1_88 = arith.extsi %dq_adj1_87 : i32 to i64 loc(#loc636) + %off_chz1 = arith.muli %off_zq, %HQ : i32 loc(#loc637) + %off_chz1_89 = arith.addi %off_chz1, %off_hq1_79 : i32 loc(#loc638) + %off_chz1_90 = arith.muli %off_chz1_89, %Q_LEN : i32 loc(#loc639) + %off_chz1_91 = arith.extsi %off_chz1_90 : i32 to i64 loc(#loc640) + %Q1 = tt.addptr %arg_Q, %q_adj1_82 : !tt.ptr, i64 loc(#loc641) + %DO1 = tt.addptr %arg_DO, %do_adj1_85 : !tt.ptr, i64 loc(#loc642) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_91 : !tt.ptr, i64 loc(#loc643) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_91 : !tt.ptr, i64 loc(#loc644) + %sparse_idx_hq1 = arith.remsi %off_hq1_79, %SPARSE_HQ : i32 loc(#loc645) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc646) + %sparse_hz_offset_92 = arith.addi %sparse_hz_offset, %sparse_idx_hq1 : i32 loc(#loc647) + %sparse_q_num_blks_offset = arith.muli %sparse_hz_offset_92, %ks2 : i32 loc(#loc648) + %sparse_q_num_blks_offset_93 = arith.addi %sparse_q_num_blks_offset, %pid_mask : i32 loc(#loc649) + %sparse_q_idx_offset = arith.muli %sparse_hz_offset_92, %stride_q_idx_h_34 : i32 loc(#loc650) + %sparse_q_idx_offset_94 = arith.muli %pid_mask, %stride_q_idx_n : i32 loc(#loc651) + %sparse_q_idx_offset_95 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_94 : i32 loc(#loc652) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_95 : !tt.ptr, i32 loc(#loc653) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc654) + %q_start_96 = arith.constant 128 : i32 loc(#loc655) + %q_start_97 = arith.constant 128 : i32 loc(#loc655) + %q_start_98 = arith.muli %q_start, %q_start_97 : i32 loc(#loc655) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_93 : !tt.ptr, i32 loc(#loc656) + %sparse_q_num_blocks_99 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc657) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc658) + %offs_m1_100 = tt.splat %q_start_98 : i32 -> tensor<64xi32> loc(#loc659) + %offs_m1_101 = arith.addi %offs_m1_100, %offs_m1 : tensor<64xi32> loc(#loc659) + %23:2 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %Q1, %DO1, %DELTA1, %LSE1, %dk_76, %dv_75, %k, %v, %off_zq, %off_hq1_79, %offs_n1_38, %offs_m1_101, %c4096_i32, %c1_i32, %c128_i32_12, %c1_i32_13, %q_indices, %sparse_q_num_blocks_99) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc180) + %q_indices_102 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_95 : !tt.ptr, i32 loc(#loc660) + %q_start_103 = tt.load %q_indices_102 : !tt.ptr loc(#loc661) + %q_start_104 = arith.constant 128 : i32 loc(#loc662) + %q_start_105 = arith.constant 128 : i32 loc(#loc662) + %q_start_106 = arith.muli %q_start_103, %q_start_105 : i32 loc(#loc662) + %sparse_q_num_blocks_107 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_93 : !tt.ptr, i32 loc(#loc663) + %sparse_q_num_blocks_108 = tt.load %sparse_q_num_blocks_107 : !tt.ptr loc(#loc664) + %offs_m1_109 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc665) + %offs_m1_110 = tt.splat %q_start_106 : i32 -> tensor<64xi32> loc(#loc666) + %offs_m1_111 = arith.addi %offs_m1_110, %offs_m1_109 : tensor<64xi32> loc(#loc666) + %24:2 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %Q1, %DO1, %DELTA1, %LSE1, %23#0, %23#1, %k, %v, %off_zq, %off_hq1_79, %offs_n1_38, %offs_m1_111, %c4096_i32, %c1_i32, %c128_i32_12, %c1_i32_13, %q_indices_102, %sparse_q_num_blocks_108) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x128xbf16>, i32, i32, tensor<128xi32>, tensor<64xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc188) + scf.yield %24#1, %24#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc189) + } loc(#loc1090) + %dv_ptrs = tt.expand_dims %offs_n1_38 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc667) + %dv_ptrs_41 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc668) + %dv_ptrs_42 = arith.muli %dv_ptrs, %dv_ptrs_41 : tensor<128x1xi32> loc(#loc668) + %dv_ptrs_43 = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc669) + %dv_ptrs_44 = tt.addptr %dv_ptrs_43, %dv_ptrs_42 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc669) + %dv_ptrs_45 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc670) + %dv_ptrs_46 = arith.constant dense<1> : tensor<1x128xi32> loc(#loc671) + %dv_ptrs_47 = arith.muli %dv_ptrs_45, %dv_ptrs_46 : tensor<1x128xi32> loc(#loc671) + %dv_ptrs_48 = tt.broadcast %dv_ptrs_44 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc672) + %dv_ptrs_49 = tt.broadcast %dv_ptrs_47 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc672) + %dv_ptrs_50 = tt.addptr %dv_ptrs_48, %dv_ptrs_49 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc672) + %index_n = tt.expand_dims %offs_n1_38 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc673) + %index_k = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc674) + %index_v = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc675) + %12 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc199) + %13 = arith.cmpi slt, %index_n, %12 : tensor<128x1xi32> loc(#loc199) + %c128_i32_51 = arith.constant 128 : i32 loc(#loc200) + %cst = arith.constant dense<128> : tensor<1x128xi32> loc(#loc200) + %14 = arith.cmpi slt, %index_v, %cst : tensor<1x128xi32> loc(#loc200) + %15 = tt.broadcast %13 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc201) + %16 = tt.broadcast %14 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc201) + %17 = arith.andi %15, %16 : tensor<128x128xi1> loc(#loc201) + %18 = arith.truncf %dk_40#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc202) + tt.store %dv_ptrs_50, %18, %17 : tensor<128x128x!tt.ptr> loc(#loc202) + %dk_52 = arith.constant 0.0883883461 : f32 loc(#loc676) + %dk_53 = arith.constant 0.0883883461 : f32 loc(#loc676) + %dk_54 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc676) + %dk_55 = arith.mulf %dk_40#1, %dk_54 : tensor<128x128xf32> loc(#loc676) + %mask = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc677) + %mask_56 = arith.cmpi slt, %index_n, %mask : tensor<128x1xi32> loc(#loc677) + %xindex = arith.constant 128 : i32 loc(#loc678) + %xindex_57 = arith.constant 128 : i32 loc(#loc678) + %xindex_58 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc678) + %xindex_59 = arith.muli %xindex_58, %index_n : tensor<128x1xi32> loc(#loc678) + %xindex_60 = tt.broadcast %index_k : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc679) + %xindex_61 = tt.broadcast %xindex_59 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc679) + %xindex_62 = arith.addi %xindex_60, %xindex_61 : tensor<128x128xi32> loc(#loc679) + %xindex_63 = arith.constant 128 : i32 loc(#loc680) + %xindex_64 = arith.constant 128 : i32 loc(#loc680) + %xindex_65 = arith.muli %xindex_64, %off_hkv : i32 loc(#loc680) + %xindex_66 = arith.muli %xindex_65, %ks0 : i32 loc(#loc681) + %xindex_67 = tt.splat %xindex_66 : i32 -> tensor<128x128xi32> loc(#loc682) + %xindex_68 = arith.addi %xindex_62, %xindex_67 : tensor<128x128xi32> loc(#loc682) + %xindex_69 = arith.constant 1024 : i32 loc(#loc683) + %xindex_70 = arith.constant 1024 : i32 loc(#loc683) + %xindex_71 = arith.muli %xindex_70, %off_zq : i32 loc(#loc683) + %xindex_72 = arith.muli %xindex_71, %ks0 : i32 loc(#loc684) + %xindex_73 = tt.splat %xindex_72 : i32 -> tensor<128x128xi32> loc(#loc685) + %xindex_74 = arith.addi %xindex_68, %xindex_73 : tensor<128x128xi32> loc(#loc685) + %19 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc213) + %20 = tt.addptr %19, %xindex_74 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc213) + %21 = tt.broadcast %mask_56 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc214) + %22 = arith.truncf %dk_55 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc214) + tt.store %20, %22, %21 : tensor<128x128x!tt.ptr> loc(#loc214) + scf.yield %SPARSE_KV_MULTIPLE, %SPARSE_Q_MULTIPLE : i32, i32 loc(#loc214) + } loc(#loc46) + tt.return loc(#loc215) + } loc(#loc) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_128_"(%x: i32 loc("x"(#loc216))) -> i32 attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc217) + %c128_i32_0 = arith.constant 128 : i32 loc(#loc217) + %0 = arith.addi %x, %c128_i32_0 : i32 loc(#loc217) + %c1_i32 = arith.constant 1 : i32 loc(#loc218) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc218) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc218) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc219) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc219) + %2 = arith.divsi %1, %c128_i32_3 : i32 loc(#loc219) + tt.return %2 : i32 loc(#loc220) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc221) + tt.return %3 : i32 loc(#loc221) + } loc(#loc216) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc223) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc223) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc224) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc225) + tt.return %0 : tensor<128x128xf32> loc(#loc225) + } loc(#loc222) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc226)), %offs_m: tensor<128xi32> loc("offs_m"(#loc226)), %offs_n: tensor<128xi32> loc("offs_n"(#loc226)), %stride_m: i32 loc("stride_m"(#loc226)), %stride_n: i32 loc("stride_n"(#loc226)), %M_LEN: i32 loc("M_LEN"(#loc226))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc693) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc694) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc694) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc695) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc695) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc696) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc697) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc697) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc698) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc698) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc698) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc233) + %1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc234) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc234) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc235) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc235) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc235) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc235) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc235) + tt.return %5 : tensor<128x128xbf16> loc(#loc236) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc237) + tt.return %6 : tensor<128x128xbf16> loc(#loc237) + } loc(#loc226) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc238)), %arg_K: !tt.ptr loc("arg_K"(#loc238)), %arg_V: !tt.ptr loc("arg_V"(#loc238)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc238)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc238)), %arg_DO: !tt.ptr loc("arg_DO"(#loc238)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc238)), %arg_DV: !tt.ptr loc("arg_DV"(#loc238)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc238)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc238)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc238)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc238)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc238)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc238)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc238)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc238)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc238)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc238)), %ks0: i32 loc("ks0"(#loc238)), %ks1: i32 loc("ks1"(#loc238)), %ks2: i32 loc("ks2"(#loc238)), %ks3: i32 loc("ks3"(#loc238)), %K: !tt.ptr loc("K"(#loc238)), %V: !tt.ptr loc("V"(#loc238)), %dq: tensor<128x128xf32> loc("dq"(#loc238)), %q: tensor<128x128xbf16> loc("q"(#loc238)), %do: tensor<128x128xbf16> loc("do"(#loc238)), %Di: tensor<128xf32> loc("Di"(#loc238)), %lse: tensor<128x1xf32> loc("lse"(#loc238)), %off_z: i32 loc("off_z"(#loc238)), %off_hq: i32 loc("off_hq"(#loc238)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc238)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc238)), %stride_kn: i32 loc("stride_kn"(#loc238)), %stride_kd: i32 loc("stride_kd"(#loc238)), %stride_vn: i32 loc("stride_vn"(#loc238)), %stride_vd: i32 loc("stride_vd"(#loc238)), %kv_indices: !tt.ptr loc("kv_indices"(#loc238)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc238))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc738) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc739) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc740) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc741) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc742) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc742) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc743) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc743) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc744) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc745) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc745) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc746) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc746) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc746) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc747) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc748) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc748) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc749) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc749) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc750) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc751) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc751) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc752) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc752) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc752) + %hi = arith.constant 2 : i32 loc(#loc753) + %hi_20 = arith.constant 2 : i32 loc(#loc753) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc753) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc754) + %hi_23 = arith.constant 1 : i32 loc(#loc755) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc755) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc756) + %c0_i32 = arith.constant 0 : i32 loc(#loc258) + %c1_i32 = arith.constant 1 : i32 loc(#loc258) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc258) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc258) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc258) + %3 = ub.poison : i32 loc(#loc258) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(43,)cconstexpr_bf16__(44,)cconstexpr_1_d_44269504__(45,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %ks0, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc758) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc759) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc760) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc761) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc761) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc762) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc763) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc763) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc764) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc764) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc266) + } loc(#loc1095) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc267) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc268) + tt.return %4 : tensor<128x128xf32> loc(#loc268) + } loc(#loc238) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc216))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc217) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc217) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc217) + %c1_i32 = arith.constant 1 : i32 loc(#loc218) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc218) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc218) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc219) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc219) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc219) + tt.return %2 : i32 loc(#loc220) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc221) + tt.return %3 : i32 loc(#loc221) + } loc(#loc216) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(43,)cconstexpr_bf16__(44,)cconstexpr_1_d_44269504__(45,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc269)), %arg_K: !tt.ptr loc("arg_K"(#loc269)), %arg_V: !tt.ptr loc("arg_V"(#loc269)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc269)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc269)), %arg_DO: !tt.ptr loc("arg_DO"(#loc269)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc269)), %arg_DV: !tt.ptr loc("arg_DV"(#loc269)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc269)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc269)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc269)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc269)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc269)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc269)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc269)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc269)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc269)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc269)), %ks0: i32 loc("ks0"(#loc269)), %ks1: i32 loc("ks1"(#loc269)), %ks2: i32 loc("ks2"(#loc269)), %ks3: i32 loc("ks3"(#loc269)), %dq: tensor<128x128xf32> loc("dq"(#loc269)), %q: tensor<128x128xbf16> loc("q"(#loc269)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc269)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc269)), %do: tensor<128x128xbf16> loc("do"(#loc269)), %Di: tensor<128xf32> loc("Di"(#loc269)), %lse: tensor<128x1xf32> loc("lse"(#loc269)), %Q_LEN: i32 loc("Q_LEN"(#loc269)), %KV_LEN: i32 loc("KV_LEN"(#loc269)), %off_z: i32 loc("off_z"(#loc269)), %off_hq: i32 loc("off_hq"(#loc269)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc269)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc269)), %offs_k: tensor<128xi32> loc("offs_k"(#loc269)), %offs_v: tensor<128xi32> loc("offs_v"(#loc269)), %stride_kn: i32 loc("stride_kn"(#loc269)), %stride_kd: i32 loc("stride_kd"(#loc269)), %stride_vn: i32 loc("stride_vn"(#loc269)), %stride_vd: i32 loc("stride_vd"(#loc269)), %kv_indices: !tt.ptr loc("kv_indices"(#loc269)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc269))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc808) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc809) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc809) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc809) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc810) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc810) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc810) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc810) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc811) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S1_64S_i32__(%n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc812) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc813) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc814) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc815) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc816) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc816) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc817) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc817) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc817) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc817) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc817) + %tmp1 = arith.constant false loc(#loc818) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc818) + %tmp4 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc819) + %tmp4_16 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc819) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc819) + %tmp5 = arith.extsi %n_6 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc820) + %tmp7 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc821) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc822) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc823) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc823) + %tmp9 = arith.extsi %m_7 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc824) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> loc(#loc825) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc825) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc826) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc826) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc826) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc827) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc828) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc828) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc828) + %tmp14 = arith.constant 2048 : i32 loc(#loc829) + %tmp14_25 = arith.constant dense<2048> : tensor<1xi32> loc(#loc829) + %tmp15 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc830) + %tmp15_26 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc830) + %tmp15_27 = arith.cmpi sge, %n_6, %tmp15_26 : tensor<1x64xi32> loc(#loc830) + %tmp16 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc831) + %tmp16_28 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc831) + %tmp16_29 = arith.remsi %n_6, %tmp16_28 : tensor<1x64xi32> loc(#loc831) + %tmp17 = arith.constant 0 : i32 loc(#loc832) + %tmp17_30 = arith.constant dense<0> : tensor<1xi32> loc(#loc832) + %tmp18 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc833) + %tmp18_31 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc833) + %tmp18_32 = arith.cmpi ne, %tmp16_29, %tmp18_31 : tensor<1x64xi32> loc(#loc833) + %tmp19 = arith.constant 0 : i32 loc(#loc834) + %tmp19_33 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc834) + %tmp19_34 = arith.cmpi slt, %tmp16_29, %tmp19_33 : tensor<1x64xi32> loc(#loc834) + %tmp20 = arith.constant 0 : i32 loc(#loc835) + %tmp20_35 = arith.constant dense<0> : tensor<1xi32> loc(#loc835) + %tmp20_36 = arith.cmpi slt, %tmp14_25, %tmp20_35 : tensor<1xi32> loc(#loc835) + %tmp21 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc836) + %tmp21_37 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc836) + %tmp21_38 = arith.cmpi ne, %tmp19_34, %tmp21_37 : tensor<1x64xi1> loc(#loc836) + %tmp22 = arith.andi %tmp18_32, %tmp21_38 : tensor<1x64xi1> loc(#loc837) + %tmp23 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc838) + %tmp23_39 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc838) + %tmp23_40 = arith.addi %tmp16_29, %tmp23_39 : tensor<1x64xi32> loc(#loc838) + %tmp24 = arith.select %tmp22, %tmp23_40, %tmp16_29 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc839) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc840) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc841) + %tmp26_41 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc841) + %tmp27 = arith.andi %tmp15_27, %tmp26_41 : tensor<1x64xi1> loc(#loc842) + %tmp28 = tt.broadcast %n_6 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc843) + %tmp28_42 = tt.broadcast %m_7 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc843) + %tmp28_43 = arith.subi %tmp28, %tmp28_42 : tensor<128x64xi32> loc(#loc843) + %tmp29 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc844) + %tmp29_44 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc844) + %tmp29_45 = arith.remsi %tmp28_43, %tmp29_44 : tensor<128x64xi32> loc(#loc844) + %tmp30 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc845) + %tmp30_46 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc845) + %tmp30_47 = arith.cmpi ne, %tmp29_45, %tmp30_46 : tensor<128x64xi32> loc(#loc845) + %tmp31 = arith.constant 0 : i32 loc(#loc846) + %tmp31_48 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc846) + %tmp31_49 = arith.cmpi slt, %tmp29_45, %tmp31_48 : tensor<128x64xi32> loc(#loc846) + %tmp32 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc847) + %tmp32_50 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc847) + %tmp32_51 = arith.cmpi ne, %tmp31_49, %tmp32_50 : tensor<128x64xi1> loc(#loc847) + %tmp33 = arith.andi %tmp30_47, %tmp32_51 : tensor<128x64xi1> loc(#loc848) + %tmp34 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc849) + %tmp34_52 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc849) + %tmp34_53 = arith.addi %tmp29_45, %tmp34_52 : tensor<128x64xi32> loc(#loc849) + %tmp35 = arith.select %tmp33, %tmp34_53, %tmp29_45 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc850) + %tmp36 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc851) + %tmp36_54 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc851) + %tmp36_55 = arith.cmpi eq, %tmp35, %tmp36_54 : tensor<128x64xi32> loc(#loc851) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc852) + %tmp37_56 = arith.andi %tmp37, %tmp36_55 : tensor<128x64xi1> loc(#loc852) + %tmp38 = arith.ori %tmp13_24, %tmp37_56 : tensor<128x64xi1> loc(#loc853) + %post_mod_scores_57 = arith.constant 0xFF800000 : f32 loc(#loc854) + %post_mod_scores_58 = arith.constant 0xFF800000 : f32 loc(#loc854) + %post_mod_scores_59 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc854) + %post_mod_scores_60 = arith.select %tmp38, %post_mod_scores_14, %post_mod_scores_59 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc854) + %post_mod_scores_61 = arith.constant 1.44269502 : f32 loc(#loc855) + %post_mod_scores_62 = arith.constant 1.44269502 : f32 loc(#loc855) + %post_mod_scores_63 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc855) + %post_mod_scores_64 = arith.mulf %post_mod_scores_60, %post_mod_scores_63 : tensor<128x64xf32> loc(#loc855) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc856) + %p_65 = arith.subf %post_mod_scores_64, %p : tensor<128x64xf32> loc(#loc856) + %p_66 = math.exp2 %p_65 : tensor<128x64xf32> loc(#loc857) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc858) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc859) + %dp_67 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc859) + %dp_68 = tt.dot %do, %vT, %dp_67, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc859) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc860) + %ds_69 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc861) + %ds_70 = arith.subf %dp_68, %ds_69 : tensor<128x64xf32> loc(#loc861) + %ds_71 = arith.mulf %p_66, %ds_70 : tensor<128x64xf32> loc(#loc862) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc863) + %grad_scores_72 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc864) + %grad_scores_73 = arith.cmpi slt, %grad_scores, %grad_scores_72 : tensor<1x64xi32> loc(#loc864) + %grad_scores_74 = arith.constant 0.000000e+00 : f32 loc(#loc865) + %grad_scores_75 = arith.constant 0.000000e+00 : f32 loc(#loc865) + %grad_scores_76 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc865) + %grad_scores_77 = tt.broadcast %grad_scores_73 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc865) + %grad_scores_78 = arith.select %grad_scores_77, %ds_71, %grad_scores_76 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc865) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc866) + %scatter_mask_79 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc867) + %scatter_mask_80 = arith.cmpi slt, %scatter_mask, %scatter_mask_79 : tensor<128x1xi32> loc(#loc867) + %scatter_mask_81 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc868) + %scatter_mask_82 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc869) + %scatter_mask_83 = arith.cmpi slt, %scatter_mask_81, %scatter_mask_82 : tensor<1x64xi32> loc(#loc869) + %scatter_mask_84 = tt.broadcast %scatter_mask_80 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc870) + %scatter_mask_85 = tt.broadcast %scatter_mask_83 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc870) + %scatter_mask_86 = arith.andi %scatter_mask_84, %scatter_mask_85 : tensor<128x64xi1> loc(#loc870) + %ds_87 = arith.constant 0.000000e+00 : f32 loc(#loc871) + %ds_88 = arith.constant 0.000000e+00 : f32 loc(#loc871) + %ds_89 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc871) + %ds_90 = arith.select %tmp38, %grad_scores_78, %ds_89 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc871) + %ds_91 = arith.truncf %ds_90 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc872) + %dq_92 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc873) + %dq_93 = arith.constant 0.000000e+00 : f32 loc(#loc874) + %dq_94 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc874) + %dq_95 = tt.dot %ds_91, %dq_92, %dq_94, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc874) + %dq_96 = arith.addf %dq, %dq_95 : tensor<128x128xf32> loc(#loc875) + tt.return %dq_96 : tensor<128x128xf32> loc(#loc338) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc339) + tt.return %0 : tensor<128x128xf32> loc(#loc339) + } loc(#loc269) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%ptr: tensor<128x64x!tt.ptr> loc("ptr"(#loc226)), %offs_m: tensor<128xi32> loc("offs_m"(#loc226)), %offs_n: tensor<64xi32> loc("offs_n"(#loc226)), %N_LEN: i32 loc("N_LEN"(#loc226))) -> tensor<128x64xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc340) + %1 = tt.splat %N_LEN : i32 -> tensor<1x64xi32> loc(#loc341) + %2 = arith.cmpi slt, %0, %1 : tensor<1x64xi32> loc(#loc341) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc342) + %3 = tt.broadcast %2 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc342) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc342) + %4 = arith.truncf %cst_0 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc342) + %5 = tt.load %ptr, %3, %4 : tensor<128x64x!tt.ptr> loc(#loc342) + tt.return %5 : tensor<128x64xbf16> loc(#loc343) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x64xbf16> loc(#loc237) + tt.return %6 : tensor<128x64xbf16> loc(#loc237) + } loc(#loc226) + tt.func private @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc344)), %max_len: i32 loc("max_len"(#loc344))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc345) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc345) + tt.return %1 : tensor<1x64xi32> loc(#loc346) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc347) + tt.return %2 : tensor<1x64xi32> loc(#loc347) + } loc(#loc344) + tt.func private @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc344)), %max_len: i32 loc("max_len"(#loc344))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc345) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc345) + tt.return %1 : tensor<128x1xi32> loc(#loc346) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc347) + tt.return %2 : tensor<128x1xi32> loc(#loc347) + } loc(#loc344) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc348)), %col_indices: !tt.ptr loc("col_indices"(#loc348)), %total_blocks: i32 loc("total_blocks"(#loc348))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc882) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc882) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc882) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc883) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc884) + %next_block = arith.constant 1 : i32 loc(#loc885) + %next_block_3 = arith.constant 1 : i32 loc(#loc885) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc885) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc886) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc887) + %next_block_7 = arith.constant 1 : i32 loc(#loc888) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc888) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc889) + %needs_jump = arith.constant 1 : i32 loc(#loc890) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc890) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc890) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc891) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc891) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc891) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc892) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc892) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc893) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc894) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc894) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc894) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc895) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc895) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc895) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc896) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc896) + %offset_24 = arith.constant 1 : i32 loc(#loc897) + %offset_25 = arith.constant 1 : i32 loc(#loc897) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc897) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc897) + %offset_28 = arith.constant 64 : i32 loc(#loc898) + %offset_29 = arith.constant 64 : i32 loc(#loc898) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc898) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc899) + tt.return %offset_31 : i32 loc(#loc367) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc368) + tt.return %0 : i32 loc(#loc368) + } loc(#loc348) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_fp32S128_128S_bf16S128_128S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(39,)cconstexpr_bf16__(40,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc238)), %arg_K: !tt.ptr loc("arg_K"(#loc238)), %arg_V: !tt.ptr loc("arg_V"(#loc238)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc238)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc238)), %arg_DO: !tt.ptr loc("arg_DO"(#loc238)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc238)), %arg_DV: !tt.ptr loc("arg_DV"(#loc238)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc238)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc238)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc238)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc238)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc238)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc238)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc238)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc238)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc238)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc238)), %ks0: i32 loc("ks0"(#loc238)), %ks1: i32 loc("ks1"(#loc238)), %ks2: i32 loc("ks2"(#loc238)), %ks3: i32 loc("ks3"(#loc238)), %K: !tt.ptr loc("K"(#loc238)), %V: !tt.ptr loc("V"(#loc238)), %dq: tensor<128x128xf32> loc("dq"(#loc238)), %q: tensor<128x128xbf16> loc("q"(#loc238)), %do: tensor<128x128xbf16> loc("do"(#loc238)), %Di: tensor<128xf32> loc("Di"(#loc238)), %lse: tensor<128x1xf32> loc("lse"(#loc238)), %off_z: i32 loc("off_z"(#loc238)), %off_hq: i32 loc("off_hq"(#loc238)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc238)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc238)), %stride_kn: i32 loc("stride_kn"(#loc238)), %stride_kd: i32 loc("stride_kd"(#loc238)), %stride_vn: i32 loc("stride_vn"(#loc238)), %stride_vd: i32 loc("stride_vd"(#loc238)), %kv_indices: !tt.ptr loc("kv_indices"(#loc238)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc238))) -> tensor<128x128xf32> attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc738) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc739) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc740) + %kT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc741) + %kT_ptrs_0 = tt.splat %stride_kn : i32 -> tensor<1x64xi32> loc(#loc742) + %kT_ptrs_1 = arith.muli %kT_ptrs, %kT_ptrs_0 : tensor<1x64xi32> loc(#loc742) + %kT_ptrs_2 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc743) + %kT_ptrs_3 = tt.addptr %kT_ptrs_2, %kT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc743) + %kT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc744) + %kT_ptrs_5 = tt.splat %stride_kd : i32 -> tensor<128x1xi32> loc(#loc745) + %kT_ptrs_6 = arith.muli %kT_ptrs_4, %kT_ptrs_5 : tensor<128x1xi32> loc(#loc745) + %kT_ptrs_7 = tt.broadcast %kT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc746) + %kT_ptrs_8 = tt.broadcast %kT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc746) + %kT_ptrs_9 = tt.addptr %kT_ptrs_7, %kT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc746) + %vT_ptrs = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc747) + %vT_ptrs_10 = tt.splat %stride_vn : i32 -> tensor<1x64xi32> loc(#loc748) + %vT_ptrs_11 = arith.muli %vT_ptrs, %vT_ptrs_10 : tensor<1x64xi32> loc(#loc748) + %vT_ptrs_12 = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc749) + %vT_ptrs_13 = tt.addptr %vT_ptrs_12, %vT_ptrs_11 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc749) + %vT_ptrs_14 = tt.expand_dims %offs_v {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc750) + %vT_ptrs_15 = tt.splat %stride_vd : i32 -> tensor<128x1xi32> loc(#loc751) + %vT_ptrs_16 = arith.muli %vT_ptrs_14, %vT_ptrs_15 : tensor<128x1xi32> loc(#loc751) + %vT_ptrs_17 = tt.broadcast %vT_ptrs_13 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc752) + %vT_ptrs_18 = tt.broadcast %vT_ptrs_16 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc752) + %vT_ptrs_19 = tt.addptr %vT_ptrs_17, %vT_ptrs_18 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc752) + %hi = arith.constant 2 : i32 loc(#loc753) + %hi_20 = arith.constant 2 : i32 loc(#loc753) + %hi_21 = arith.muli %sparse_kv_num_blocks, %hi_20 : i32 loc(#loc753) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc754) + %hi_23 = arith.constant 1 : i32 loc(#loc755) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc755) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc756) + %c0_i32 = arith.constant 0 : i32 loc(#loc258) + %c1_i32 = arith.constant 1 : i32 loc(#loc258) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc258) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc258) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc258) + %3 = ub.poison : i32 loc(#loc258) + %vT_ptrs_26:4 = scf.for %start_n = %0 to %1 step %2 iter_args(%dq_27 = %dq, %offs_n2_28 = %offs_n2, %kT_ptrs_29 = %kT_ptrs_9, %vT_ptrs_30 = %vT_ptrs_19) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %dq_31 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(43,)cconstexpr_bf16__(44,)cconstexpr_1_d_44269504__(45,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %dq_27, %q, %kT_ptrs_29, %vT_ptrs_30, %do, %Di, %lse, %Q_LEN, %ks0, %off_z, %off_hq, %offs_m2, %offs_n2_28, %offs_k, %offs_v, %stride_kn, %stride_kd, %stride_vn, %stride_vd, %kv_indices, %sparse_kv_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xbf16>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128xf32>, tensor<128x1xf32>, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> tensor<128x128xf32> loc(#loc758) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %sparse_kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc759) + %kT_ptrs_32 = arith.muli %offset, %stride_kn : i32 loc(#loc760) + %kT_ptrs_33 = tt.splat %kT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc761) + %kT_ptrs_34 = tt.addptr %kT_ptrs_29, %kT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc761) + %vT_ptrs_35 = arith.muli %offset, %stride_vn : i32 loc(#loc762) + %vT_ptrs_36 = tt.splat %vT_ptrs_35 : i32 -> tensor<128x64xi32> loc(#loc763) + %vT_ptrs_37 = tt.addptr %vT_ptrs_30, %vT_ptrs_36 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc763) + %offs_n2_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc764) + %offs_n2_39 = arith.addi %offs_n2_28, %offs_n2_38 : tensor<64xi32> loc(#loc764) + scf.yield %dq_31, %offs_n2_39, %kT_ptrs_34, %vT_ptrs_37 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc266) + } loc(#loc1095) + tt.return %vT_ptrs_26#0 : tensor<128x128xf32> loc(#loc267) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc268) + tt.return %4 : tensor<128x128xf32> loc(#loc268) + } loc(#loc238) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dq_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_bf16S128_128S_Pbf16S128_64S_Pbf16S128_64S_bf16S128_128S_fp32S128S_fp32S128_1S_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(43,)cconstexpr_bf16__(44,)cconstexpr_1_d_44269504__(45,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc269)), %arg_K: !tt.ptr loc("arg_K"(#loc269)), %arg_V: !tt.ptr loc("arg_V"(#loc269)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc269)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc269)), %arg_DO: !tt.ptr loc("arg_DO"(#loc269)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc269)), %arg_DV: !tt.ptr loc("arg_DV"(#loc269)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc269)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc269)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc269)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc269)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc269)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc269)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc269)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc269)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc269)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc269)), %ks0: i32 loc("ks0"(#loc269)), %ks1: i32 loc("ks1"(#loc269)), %ks2: i32 loc("ks2"(#loc269)), %ks3: i32 loc("ks3"(#loc269)), %dq: tensor<128x128xf32> loc("dq"(#loc269)), %q: tensor<128x128xbf16> loc("q"(#loc269)), %kT_ptrs: tensor<128x64x!tt.ptr> loc("kT_ptrs"(#loc269)), %vT_ptrs: tensor<128x64x!tt.ptr> loc("vT_ptrs"(#loc269)), %do: tensor<128x128xbf16> loc("do"(#loc269)), %Di: tensor<128xf32> loc("Di"(#loc269)), %lse: tensor<128x1xf32> loc("lse"(#loc269)), %Q_LEN: i32 loc("Q_LEN"(#loc269)), %KV_LEN: i32 loc("KV_LEN"(#loc269)), %off_z: i32 loc("off_z"(#loc269)), %off_hq: i32 loc("off_hq"(#loc269)), %offs_m2: tensor<128xi32> loc("offs_m2"(#loc269)), %offs_n2: tensor<64xi32> loc("offs_n2"(#loc269)), %offs_k: tensor<128xi32> loc("offs_k"(#loc269)), %offs_v: tensor<128xi32> loc("offs_v"(#loc269)), %stride_kn: i32 loc("stride_kn"(#loc269)), %stride_kd: i32 loc("stride_kd"(#loc269)), %stride_vn: i32 loc("stride_vn"(#loc269)), %stride_vd: i32 loc("stride_vd"(#loc269)), %kv_indices: !tt.ptr loc("kv_indices"(#loc269)), %sparse_kv_num_blocks: i32 loc("sparse_kv_num_blocks"(#loc269))) -> tensor<128x128xf32> attributes {noinline = false} { + %kT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%kT_ptrs, %offs_k, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc808) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc809) + %qk_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc809) + %qk_1 = tt.dot %q, %kT, %qk_0, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc809) + %qk_2 = arith.constant 0.0883883461 : f32 loc(#loc810) + %qk_3 = arith.constant 0.0883883461 : f32 loc(#loc810) + %qk_4 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc810) + %qk_5 = arith.mulf %qk_1, %qk_4 : tensor<128x64xf32> loc(#loc810) + %n = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc811) + %n_6 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S1_64S_i32__(%n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc812) + %m = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc813) + %m_7 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S128_1S_i32__(%m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc814) + %post_mod_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc815) + %post_mod_scores_8 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc816) + %post_mod_scores_9 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_8 : tensor<1x64xi32> loc(#loc816) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc817) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc817) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc817) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc817) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_5, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc817) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc855) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc855) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc855) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc855) + %p = tt.broadcast %lse : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc856) + %p_19 = arith.subf %post_mod_scores_18, %p : tensor<128x64xf32> loc(#loc856) + %p_20 = math.exp2 %p_19 : tensor<128x64xf32> loc(#loc857) + %vT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%vT_ptrs, %offs_v, %offs_n2, %KV_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc858) + %dp = arith.constant 0.000000e+00 : f32 loc(#loc859) + %dp_21 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc859) + %dp_22 = tt.dot %do, %vT, %dp_21, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc859) + %ds = tt.expand_dims %Di {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc860) + %ds_23 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc861) + %ds_24 = arith.subf %dp_22, %ds_23 : tensor<128x64xf32> loc(#loc861) + %ds_25 = arith.mulf %p_20, %ds_24 : tensor<128x64xf32> loc(#loc862) + %grad_scores = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc863) + %grad_scores_26 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc864) + %grad_scores_27 = arith.cmpi slt, %grad_scores, %grad_scores_26 : tensor<1x64xi32> loc(#loc864) + %grad_scores_28 = arith.constant 0.000000e+00 : f32 loc(#loc865) + %grad_scores_29 = arith.constant 0.000000e+00 : f32 loc(#loc865) + %grad_scores_30 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc865) + %grad_scores_31 = tt.broadcast %grad_scores_27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc865) + %grad_scores_32 = arith.select %grad_scores_31, %ds_25, %grad_scores_30 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc865) + %scatter_mask = tt.expand_dims %offs_m2 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc866) + %scatter_mask_33 = tt.splat %Q_LEN : i32 -> tensor<128x1xi32> loc(#loc867) + %scatter_mask_34 = arith.cmpi slt, %scatter_mask, %scatter_mask_33 : tensor<128x1xi32> loc(#loc867) + %scatter_mask_35 = tt.expand_dims %offs_n2 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc868) + %scatter_mask_36 = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc869) + %scatter_mask_37 = arith.cmpi slt, %scatter_mask_35, %scatter_mask_36 : tensor<1x64xi32> loc(#loc869) + %scatter_mask_38 = tt.broadcast %scatter_mask_34 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc870) + %scatter_mask_39 = tt.broadcast %scatter_mask_37 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc870) + %scatter_mask_40 = arith.andi %scatter_mask_38, %scatter_mask_39 : tensor<128x64xi1> loc(#loc870) + %ds_41 = arith.truncf %grad_scores_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc872) + %dq_42 = tt.trans %kT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc873) + %dq_43 = arith.constant 0.000000e+00 : f32 loc(#loc874) + %dq_44 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc874) + %dq_45 = tt.dot %ds_41, %dq_42, %dq_44, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc874) + %dq_46 = arith.addf %dq, %dq_45 : tensor<128x128xf32> loc(#loc875) + tt.return %dq_46 : tensor<128x128xf32> loc(#loc338) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc339) + tt.return %0 : tensor<128x128xf32> loc(#loc339) + } loc(#loc269) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc369)), %arg_K: !tt.ptr loc("arg_K"(#loc369)), %arg_V: !tt.ptr loc("arg_V"(#loc369)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc369)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc369)), %arg_DO: !tt.ptr loc("arg_DO"(#loc369)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc369)), %arg_DV: !tt.ptr loc("arg_DV"(#loc369)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc369)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc369)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc369)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc369)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc369)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc369)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc369)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc369)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc369)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc369)), %ks0: i32 loc("ks0"(#loc369)), %ks1: i32 loc("ks1"(#loc369)), %ks2: i32 loc("ks2"(#loc369)), %ks3: i32 loc("ks3"(#loc369)), %Q: !tt.ptr loc("Q"(#loc369)), %DO: !tt.ptr loc("DO"(#loc369)), %DELTA: !tt.ptr loc("DELTA"(#loc369)), %LSE: !tt.ptr loc("LSE"(#loc369)), %dk: tensor<128x128xf32> loc("dk"(#loc369)), %dv: tensor<128x128xf32> loc("dv"(#loc369)), %k: tensor<128x128xbf16> loc("k"(#loc369)), %v: tensor<128x128xbf16> loc("v"(#loc369)), %off_z: i32 loc("off_z"(#loc369)), %off_hq: i32 loc("off_hq"(#loc369)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc369)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc369)), %stride_qm: i32 loc("stride_qm"(#loc369)), %stride_qd: i32 loc("stride_qd"(#loc369)), %stride_dom: i32 loc("stride_dom"(#loc369)), %stride_dod: i32 loc("stride_dod"(#loc369)), %q_indices: !tt.ptr loc("q_indices"(#loc369)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc369))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc940) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc941) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc942) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc943) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc944) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc944) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc945) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc945) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc946) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc947) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc947) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc948) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc948) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc948) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc949) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc950) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc950) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc951) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc951) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc952) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc953) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc953) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc954) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc954) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc954) + %hi = arith.constant 2 : i32 loc(#loc955) + %hi_20 = arith.constant 2 : i32 loc(#loc955) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc955) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc956) + %hi_23 = arith.constant 1 : i32 loc(#loc957) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc957) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc958) + %c0_i32 = arith.constant 0 : i32 loc(#loc389) + %c1_i32 = arith.constant 1 : i32 loc(#loc389) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc389) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc389) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc389) + %3 = ub.poison : i32 loc(#loc389) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_1_d_44269504__(46,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %ks0, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc390) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc960) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc961) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc962) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc962) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc963) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc964) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc964) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc965) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc965) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc397) + } loc(#loc1097) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc398) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc399) + %5 = ub.poison : tensor<128x128xf32> loc(#loc399) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc399) + } loc(#loc369) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_1_d_44269504__(46,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc400)), %arg_K: !tt.ptr loc("arg_K"(#loc400)), %arg_V: !tt.ptr loc("arg_V"(#loc400)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc400)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc400)), %arg_DO: !tt.ptr loc("arg_DO"(#loc400)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc400)), %arg_DV: !tt.ptr loc("arg_DV"(#loc400)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc400)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc400)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc400)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc400)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc400)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc400)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc400)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc400)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc400)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc400)), %ks0: i32 loc("ks0"(#loc400)), %ks1: i32 loc("ks1"(#loc400)), %ks2: i32 loc("ks2"(#loc400)), %ks3: i32 loc("ks3"(#loc400)), %dk: tensor<128x128xf32> loc("dk"(#loc400)), %dv: tensor<128x128xf32> loc("dv"(#loc400)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc400)), %k: tensor<128x128xbf16> loc("k"(#loc400)), %v: tensor<128x128xbf16> loc("v"(#loc400)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc400)), %DELTA: !tt.ptr loc("DELTA"(#loc400)), %LSE: !tt.ptr loc("LSE"(#loc400)), %Q_LEN: i32 loc("Q_LEN"(#loc400)), %KV_LEN: i32 loc("KV_LEN"(#loc400)), %off_z: i32 loc("off_z"(#loc400)), %off_hq: i32 loc("off_hq"(#loc400)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc400)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc400)), %offs_k: tensor<128xi32> loc("offs_k"(#loc400)), %offs_v: tensor<128xi32> loc("offs_v"(#loc400)), %stride_qm: i32 loc("stride_qm"(#loc400)), %stride_qd: i32 loc("stride_qd"(#loc400)), %stride_dom: i32 loc("stride_dom"(#loc400)), %stride_dod: i32 loc("stride_dod"(#loc400)), %q_indices: !tt.ptr loc("q_indices"(#loc400)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc400))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1010) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1011) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1011) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1012) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1012) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1013) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1014) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1014) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1014) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1015) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1015) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1015) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1015) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1016) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1016) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1017) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1017) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1017) + %qkT_16 = arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1017) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1018) + %m_17 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1019) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1020) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1021) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1022) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1023) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1023) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1024) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1024) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1024) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1024) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1024) + %tmp41 = arith.constant false loc(#loc1025) + %tmp41_26 = arith.constant dense : tensor<1xi1> loc(#loc1025) + %tmp44 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1026) + %tmp44_27 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1026) + %tmp44_28 = arith.cmpi sge, %tmp44, %tmp44_27 : tensor<128x64xi32> loc(#loc1026) + %tmp45 = arith.extsi %n_18 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1027) + %tmp47 = tt.addptr %in_ptr16, %off_z : !tt.ptr, i32 loc(#loc1028) + %tmp47_29 = tt.load %tmp47 : !tt.ptr loc(#loc1029) + %tmp48 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1030) + %tmp48_30 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc1030) + %tmp49 = arith.extsi %m_17 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc1031) + %tmp50 = tt.splat %tmp47_29 : i64 -> tensor<1x64xi64> loc(#loc1032) + %tmp50_31 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc1032) + %tmp51 = tt.broadcast %tmp48_30 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1033) + %tmp51_32 = tt.broadcast %tmp50_31 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1033) + %tmp51_33 = arith.andi %tmp51, %tmp51_32 : tensor<128x64xi1> loc(#loc1033) + %tmp52 = arith.andi %tmp44_28, %tmp51_33 : tensor<128x64xi1> loc(#loc1034) + %tmp53 = tt.expand_dims %tmp41_26 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc1035) + %tmp53_34 = tt.broadcast %tmp53 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc1035) + %tmp53_35 = arith.ori %tmp53_34, %tmp52 : tensor<128x64xi1> loc(#loc1035) + %tmp54 = arith.constant 2048 : i32 loc(#loc1036) + %tmp54_36 = arith.constant dense<2048> : tensor<1xi32> loc(#loc1036) + %tmp55 = tt.expand_dims %tmp54_36 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1037) + %tmp55_37 = tt.broadcast %tmp55 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1037) + %tmp55_38 = arith.cmpi sge, %n_18, %tmp55_37 : tensor<128x1xi32> loc(#loc1037) + %tmp56 = tt.expand_dims %tmp54_36 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1038) + %tmp56_39 = tt.broadcast %tmp56 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1038) + %tmp56_40 = arith.remsi %n_18, %tmp56_39 : tensor<128x1xi32> loc(#loc1038) + %tmp57 = arith.constant 0 : i32 loc(#loc1039) + %tmp57_41 = arith.constant dense<0> : tensor<1xi32> loc(#loc1039) + %tmp58 = tt.expand_dims %tmp57_41 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1040) + %tmp58_42 = tt.broadcast %tmp58 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1040) + %tmp58_43 = arith.cmpi ne, %tmp56_40, %tmp58_42 : tensor<128x1xi32> loc(#loc1040) + %tmp59 = arith.constant 0 : i32 loc(#loc1041) + %tmp59_44 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1041) + %tmp59_45 = arith.cmpi slt, %tmp56_40, %tmp59_44 : tensor<128x1xi32> loc(#loc1041) + %tmp60 = arith.constant 0 : i32 loc(#loc1042) + %tmp60_46 = arith.constant dense<0> : tensor<1xi32> loc(#loc1042) + %tmp60_47 = arith.cmpi slt, %tmp54_36, %tmp60_46 : tensor<1xi32> loc(#loc1042) + %tmp61 = tt.expand_dims %tmp60_47 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc1043) + %tmp61_48 = tt.broadcast %tmp61 : tensor<1x1xi1> -> tensor<128x1xi1> loc(#loc1043) + %tmp61_49 = arith.cmpi ne, %tmp59_45, %tmp61_48 : tensor<128x1xi1> loc(#loc1043) + %tmp62 = arith.andi %tmp58_43, %tmp61_49 : tensor<128x1xi1> loc(#loc1044) + %tmp63 = tt.expand_dims %tmp54_36 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1045) + %tmp63_50 = tt.broadcast %tmp63 : tensor<1x1xi32> -> tensor<128x1xi32> loc(#loc1045) + %tmp63_51 = arith.addi %tmp56_40, %tmp63_50 : tensor<128x1xi32> loc(#loc1045) + %tmp64 = arith.select %tmp62, %tmp63_51, %tmp56_40 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc1046) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc1047) + %tmp66 = tt.splat %tmp47_29 : i64 -> tensor<128x1xi64> loc(#loc1048) + %tmp66_52 = arith.cmpi slt, %tmp65, %tmp66 : tensor<128x1xi64> loc(#loc1048) + %tmp67 = arith.andi %tmp55_38, %tmp66_52 : tensor<128x1xi1> loc(#loc1049) + %tmp68 = tt.broadcast %n_18 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc1050) + %tmp68_53 = tt.broadcast %m_17 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc1050) + %tmp68_54 = arith.subi %tmp68, %tmp68_53 : tensor<128x64xi32> loc(#loc1050) + %tmp69 = tt.expand_dims %tmp54_36 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1051) + %tmp69_55 = tt.broadcast %tmp69 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1051) + %tmp69_56 = arith.remsi %tmp68_54, %tmp69_55 : tensor<128x64xi32> loc(#loc1051) + %tmp70 = tt.expand_dims %tmp57_41 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1052) + %tmp70_57 = tt.broadcast %tmp70 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1052) + %tmp70_58 = arith.cmpi ne, %tmp69_56, %tmp70_57 : tensor<128x64xi32> loc(#loc1052) + %tmp71 = arith.constant 0 : i32 loc(#loc1053) + %tmp71_59 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1053) + %tmp71_60 = arith.cmpi slt, %tmp69_56, %tmp71_59 : tensor<128x64xi32> loc(#loc1053) + %tmp72 = tt.expand_dims %tmp60_47 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc1054) + %tmp72_61 = tt.broadcast %tmp72 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc1054) + %tmp72_62 = arith.cmpi ne, %tmp71_60, %tmp72_61 : tensor<128x64xi1> loc(#loc1054) + %tmp73 = arith.andi %tmp70_58, %tmp72_62 : tensor<128x64xi1> loc(#loc1055) + %tmp74 = tt.expand_dims %tmp54_36 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1056) + %tmp74_63 = tt.broadcast %tmp74 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1056) + %tmp74_64 = arith.addi %tmp69_56, %tmp74_63 : tensor<128x64xi32> loc(#loc1056) + %tmp75 = arith.select %tmp73, %tmp74_64, %tmp69_56 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc1057) + %tmp76 = tt.expand_dims %tmp57_41 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc1058) + %tmp76_65 = tt.broadcast %tmp76 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc1058) + %tmp76_66 = arith.cmpi eq, %tmp75, %tmp76_65 : tensor<128x64xi32> loc(#loc1058) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc1059) + %tmp77_67 = arith.andi %tmp77, %tmp76_66 : tensor<128x64xi1> loc(#loc1059) + %tmp78 = arith.ori %tmp53_35, %tmp77_67 : tensor<128x64xi1> loc(#loc1060) + %post_mod_scores_68 = arith.constant 0xFF800000 : f32 loc(#loc1061) + %post_mod_scores_69 = arith.constant 0xFF800000 : f32 loc(#loc1061) + %post_mod_scores_70 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1061) + %post_mod_scores_71 = arith.select %tmp78, %post_mod_scores_25, %post_mod_scores_70 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1061) + %post_mod_scores_72 = arith.constant 1.44269502 : f32 loc(#loc1062) + %post_mod_scores_73 = arith.constant 1.44269502 : f32 loc(#loc1062) + %post_mod_scores_74 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1062) + %post_mod_scores_75 = arith.mulf %post_mod_scores_71, %post_mod_scores_74 : tensor<128x64xf32> loc(#loc1062) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1063) + %pT_76 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1064) + %pT_77 = arith.subf %post_mod_scores_75, %pT_76 : tensor<128x64xf32> loc(#loc1064) + %pT_78 = math.exp2 %pT_77 : tensor<128x64xf32> loc(#loc1065) + %do = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1066) + %dv_79 = arith.truncf %pT_78 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1067) + %dv_80 = arith.constant 0.000000e+00 : f32 loc(#loc1068) + %dv_81 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1068) + %dv_82 = tt.dot %dv_79, %do, %dv_81, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1068) + %dv_83 = arith.addf %dv, %dv_82 : tensor<128x128xf32> loc(#loc1069) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1070) + %Di_84 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1070) + %Di_85 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1071) + %Di_86 = tt.addptr %Di_85, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1071) + %Di_87 = tt.load %Di_86, %Di_84 : tensor<64x!tt.ptr> loc(#loc1072) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1073) + %dpT_88 = arith.constant 0.000000e+00 : f32 loc(#loc1074) + %dpT_89 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1074) + %dpT_90 = tt.dot %v, %dpT, %dpT_89, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1074) + %dsT = tt.expand_dims %Di_87 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1075) + %dsT_91 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1076) + %dsT_92 = arith.subf %dpT_90, %dsT_91 : tensor<128x64xf32> loc(#loc1076) + %dsT_93 = arith.mulf %pT_78, %dsT_92 : tensor<128x64xf32> loc(#loc1077) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1078) + %grad_scores_94 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1079) + %grad_scores_95 = arith.cmpi slt, %grad_scores, %grad_scores_94 : tensor<1x64xi32> loc(#loc1079) + %grad_scores_96 = arith.constant 0.000000e+00 : f32 loc(#loc1080) + %grad_scores_97 = arith.constant 0.000000e+00 : f32 loc(#loc1080) + %grad_scores_98 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1080) + %grad_scores_99 = tt.broadcast %grad_scores_95 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1080) + %grad_scores_100 = arith.select %grad_scores_99, %dsT_93, %grad_scores_98 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1080) + %dsT_101 = arith.constant 0.000000e+00 : f32 loc(#loc1081) + %dsT_102 = arith.constant 0.000000e+00 : f32 loc(#loc1081) + %dsT_103 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1081) + %dsT_104 = arith.select %tmp78, %grad_scores_100, %dsT_103 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1081) + %dk_105 = arith.truncf %dsT_104 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1082) + %dk_106 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1083) + %dk_107 = arith.constant 0.000000e+00 : f32 loc(#loc1084) + %dk_108 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1084) + %dk_109 = tt.dot %dk_105, %dk_106, %dk_108, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1084) + %dk_110 = arith.addf %dk, %dk_109 : tensor<128x128xf32> loc(#loc1085) + tt.return %dk_110, %dv_83 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc477) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc478) + %1 = ub.poison : tensor<128x128xf32> loc(#loc478) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc478) + } loc(#loc400) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: tensor<64x128x!tt.ptr> loc("ptr"(#loc226)), %offs_m: tensor<64xi32> loc("offs_m"(#loc226)), %offs_n: tensor<128xi32> loc("offs_n"(#loc226)), %M_LEN: i32 loc("M_LEN"(#loc226))) -> tensor<64x128xbf16> attributes {noinline = false} { + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc233) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc234) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc234) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc235) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc235) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc235) + %4 = arith.truncf %cst_0 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc235) + %5 = tt.load %ptr, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc235) + tt.return %5 : tensor<64x128xbf16> loc(#loc236) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc237) + tt.return %6 : tensor<64x128xbf16> loc(#loc237) + } loc(#loc226) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_Pbf16_Pbf16_Pfp32_Pfp32_fp32S128_128S_fp32S128_128S_bf16S128_128S_bf16S128_128S_i32_i32_i32S128S_i32S64S_i32_i32_i32_i32_Pi32_i32__(40,)cconstexpr_bf16__(41,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc369)), %arg_K: !tt.ptr loc("arg_K"(#loc369)), %arg_V: !tt.ptr loc("arg_V"(#loc369)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc369)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc369)), %arg_DO: !tt.ptr loc("arg_DO"(#loc369)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc369)), %arg_DV: !tt.ptr loc("arg_DV"(#loc369)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc369)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc369)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc369)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc369)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc369)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc369)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc369)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc369)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc369)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc369)), %ks0: i32 loc("ks0"(#loc369)), %ks1: i32 loc("ks1"(#loc369)), %ks2: i32 loc("ks2"(#loc369)), %ks3: i32 loc("ks3"(#loc369)), %Q: !tt.ptr loc("Q"(#loc369)), %DO: !tt.ptr loc("DO"(#loc369)), %DELTA: !tt.ptr loc("DELTA"(#loc369)), %LSE: !tt.ptr loc("LSE"(#loc369)), %dk: tensor<128x128xf32> loc("dk"(#loc369)), %dv: tensor<128x128xf32> loc("dv"(#loc369)), %k: tensor<128x128xbf16> loc("k"(#loc369)), %v: tensor<128x128xbf16> loc("v"(#loc369)), %off_z: i32 loc("off_z"(#loc369)), %off_hq: i32 loc("off_hq"(#loc369)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc369)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc369)), %stride_qm: i32 loc("stride_qm"(#loc369)), %stride_qd: i32 loc("stride_qd"(#loc369)), %stride_dom: i32 loc("stride_dom"(#loc369)), %stride_dod: i32 loc("stride_dod"(#loc369)), %q_indices: !tt.ptr loc("q_indices"(#loc369)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc369))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %Q_LEN = arith.constant 2048 : i32 loc(#loc940) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc941) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc942) + %qT_ptrs = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc943) + %qT_ptrs_0 = tt.splat %stride_qm : i32 -> tensor<1x64xi32> loc(#loc944) + %qT_ptrs_1 = arith.muli %qT_ptrs, %qT_ptrs_0 : tensor<1x64xi32> loc(#loc944) + %qT_ptrs_2 = tt.splat %Q : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc945) + %qT_ptrs_3 = tt.addptr %qT_ptrs_2, %qT_ptrs_1 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc945) + %qT_ptrs_4 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc946) + %qT_ptrs_5 = tt.splat %stride_qd : i32 -> tensor<128x1xi32> loc(#loc947) + %qT_ptrs_6 = arith.muli %qT_ptrs_4, %qT_ptrs_5 : tensor<128x1xi32> loc(#loc947) + %qT_ptrs_7 = tt.broadcast %qT_ptrs_3 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc948) + %qT_ptrs_8 = tt.broadcast %qT_ptrs_6 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc948) + %qT_ptrs_9 = tt.addptr %qT_ptrs_7, %qT_ptrs_8 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc948) + %do_ptrs = tt.expand_dims %offs_m1 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc949) + %do_ptrs_10 = tt.splat %stride_dom : i32 -> tensor<64x1xi32> loc(#loc950) + %do_ptrs_11 = arith.muli %do_ptrs, %do_ptrs_10 : tensor<64x1xi32> loc(#loc950) + %do_ptrs_12 = tt.splat %DO : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc951) + %do_ptrs_13 = tt.addptr %do_ptrs_12, %do_ptrs_11 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc951) + %do_ptrs_14 = tt.expand_dims %offs_v {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc952) + %do_ptrs_15 = tt.splat %stride_dod : i32 -> tensor<1x128xi32> loc(#loc953) + %do_ptrs_16 = arith.muli %do_ptrs_14, %do_ptrs_15 : tensor<1x128xi32> loc(#loc953) + %do_ptrs_17 = tt.broadcast %do_ptrs_13 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc954) + %do_ptrs_18 = tt.broadcast %do_ptrs_16 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc954) + %do_ptrs_19 = tt.addptr %do_ptrs_17, %do_ptrs_18 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc954) + %hi = arith.constant 2 : i32 loc(#loc955) + %hi_20 = arith.constant 2 : i32 loc(#loc955) + %hi_21 = arith.muli %sparse_q_num_blocks, %hi_20 : i32 loc(#loc955) + %hi_22 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%Q_LEN) : (i32) -> i32 loc(#loc956) + %hi_23 = arith.constant 1 : i32 loc(#loc957) + %hi_24 = arith.maxsi %hi_22, %hi_23 : i32 loc(#loc957) + %hi_25 = arith.minsi %hi_21, %hi_24 : i32 loc(#loc958) + %c0_i32 = arith.constant 0 : i32 loc(#loc389) + %c1_i32 = arith.constant 1 : i32 loc(#loc389) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc389) + %1 = arith.bitcast %hi_25 : i32 to i32 loc(#loc389) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc389) + %3 = ub.poison : i32 loc(#loc389) + %do_ptrs_26:5 = scf.for %start_m = %0 to %1 step %2 iter_args(%dk_27 = %dk, %dv_28 = %dv, %offs_m1_29 = %offs_m1, %qT_ptrs_30 = %qT_ptrs_9, %do_ptrs_31 = %do_ptrs_19) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %6:2 = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_1_d_44269504__(46,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_DELTA, %arg_DO, %arg_DQ, %arg_DV, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_Q_NUM_BLKS, %arg_Q_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %arg_FULL_Q_NUM_BLKS, %arg_FULL_Q_IDX, %in_ptr16, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %dk_27, %dv_28, %qT_ptrs_30, %k, %v, %do_ptrs_31, %DELTA, %LSE, %Q_LEN, %ks0, %off_z, %off_hq, %offs_n1, %offs_m1_29, %offs_k, %offs_v, %stride_qm, %stride_qd, %stride_dom, %stride_dod, %q_indices, %sparse_q_num_blocks) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128x128xf32>, tensor<128x128xf32>, tensor<128x64x!tt.ptr>, tensor<128x128xbf16>, tensor<128x128xbf16>, tensor<64x128x!tt.ptr>, !tt.ptr, !tt.ptr, i32, i32, i32, i32, tensor<128xi32>, tensor<64xi32>, tensor<128xi32>, tensor<128xi32>, i32, i32, i32, i32, !tt.ptr, i32) -> (tensor<128x128xf32>, tensor<128x128xf32>) loc(#loc390) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_m, %q_indices, %sparse_q_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc960) + %qT_ptrs_32 = arith.muli %offset, %stride_qm : i32 loc(#loc961) + %qT_ptrs_33 = tt.splat %qT_ptrs_32 : i32 -> tensor<128x64xi32> loc(#loc962) + %qT_ptrs_34 = tt.addptr %qT_ptrs_30, %qT_ptrs_33 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc962) + %do_ptrs_35 = arith.muli %offset, %stride_dom : i32 loc(#loc963) + %do_ptrs_36 = tt.splat %do_ptrs_35 : i32 -> tensor<64x128xi32> loc(#loc964) + %do_ptrs_37 = tt.addptr %do_ptrs_31, %do_ptrs_36 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc964) + %offs_m1_38 = tt.splat %offset : i32 -> tensor<64xi32> loc(#loc965) + %offs_m1_39 = arith.addi %offs_m1_29, %offs_m1_38 : tensor<64xi32> loc(#loc965) + scf.yield %6#0, %6#1, %offs_m1_39, %qT_ptrs_34, %do_ptrs_37 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc397) + } loc(#loc1097) + tt.return %do_ptrs_26#0, %do_ptrs_26#1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc398) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc399) + %5 = ub.poison : tensor<128x128xf32> loc(#loc399) + tt.return %4, %5 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc399) + } loc(#loc369) + tt.func private @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.bwd_dkdv_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pbf16_Pbf16_Pbf16_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_fp32S128_128S_fp32S128_128S_Pbf16S128_64S_bf16S128_128S_bf16S128_128S_Pbf16S64_128S_Pfp32_Pfp32_i32_i32_i32_i32_i32S128S_i32S64S_i32S128S_i32S128S_i32_i32_i32_i32_Pi32_i32__(44,)cconstexpr_bf16__(45,)cconstexpr_1_d_44269504__(46,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc400)), %arg_K: !tt.ptr loc("arg_K"(#loc400)), %arg_V: !tt.ptr loc("arg_V"(#loc400)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc400)), %arg_DELTA: !tt.ptr loc("arg_DELTA"(#loc400)), %arg_DO: !tt.ptr loc("arg_DO"(#loc400)), %arg_DQ: !tt.ptr loc("arg_DQ"(#loc400)), %arg_DV: !tt.ptr loc("arg_DV"(#loc400)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc400)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc400)), %arg_Q_NUM_BLKS: !tt.ptr loc("arg_Q_NUM_BLKS"(#loc400)), %arg_Q_IDX: !tt.ptr loc("arg_Q_IDX"(#loc400)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc400)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc400)), %arg_FULL_Q_NUM_BLKS: !tt.ptr loc("arg_FULL_Q_NUM_BLKS"(#loc400)), %arg_FULL_Q_IDX: !tt.ptr loc("arg_FULL_Q_IDX"(#loc400)), %in_ptr16: !tt.ptr loc("in_ptr16"(#loc400)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc400)), %ks0: i32 loc("ks0"(#loc400)), %ks1: i32 loc("ks1"(#loc400)), %ks2: i32 loc("ks2"(#loc400)), %ks3: i32 loc("ks3"(#loc400)), %dk: tensor<128x128xf32> loc("dk"(#loc400)), %dv: tensor<128x128xf32> loc("dv"(#loc400)), %qT_ptrs: tensor<128x64x!tt.ptr> loc("qT_ptrs"(#loc400)), %k: tensor<128x128xbf16> loc("k"(#loc400)), %v: tensor<128x128xbf16> loc("v"(#loc400)), %do_ptrs: tensor<64x128x!tt.ptr> loc("do_ptrs"(#loc400)), %DELTA: !tt.ptr loc("DELTA"(#loc400)), %LSE: !tt.ptr loc("LSE"(#loc400)), %Q_LEN: i32 loc("Q_LEN"(#loc400)), %KV_LEN: i32 loc("KV_LEN"(#loc400)), %off_z: i32 loc("off_z"(#loc400)), %off_hq: i32 loc("off_hq"(#loc400)), %offs_n1: tensor<128xi32> loc("offs_n1"(#loc400)), %offs_m1: tensor<64xi32> loc("offs_m1"(#loc400)), %offs_k: tensor<128xi32> loc("offs_k"(#loc400)), %offs_v: tensor<128xi32> loc("offs_v"(#loc400)), %stride_qm: i32 loc("stride_qm"(#loc400)), %stride_qd: i32 loc("stride_qd"(#loc400)), %stride_dom: i32 loc("stride_dom"(#loc400)), %stride_dod: i32 loc("stride_dod"(#loc400)), %q_indices: !tt.ptr loc("q_indices"(#loc400)), %sparse_q_num_blocks: i32 loc("sparse_q_num_blocks"(#loc400))) -> (tensor<128x128xf32>, tensor<128x128xf32>) attributes {noinline = false} { + %qT = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S128_64S_i32S128S_i32S64S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_True__(6,)cconstexpr_False__(7,)cconstexpr_128_"(%qT_ptrs, %offs_k, %offs_m1, %Q_LEN) : (tensor<128x64x!tt.ptr>, tensor<128xi32>, tensor<64xi32>, i32) -> tensor<128x64xbf16> loc(#loc1010) + %lse = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1011) + %lse_0 = arith.cmpi slt, %offs_m1, %lse : tensor<64xi32> loc(#loc1011) + %lse_1 = tt.splat %LSE : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1012) + %lse_2 = tt.addptr %lse_1, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1012) + %lse_3 = tt.load %lse_2, %lse_0 : tensor<64x!tt.ptr> loc(#loc1013) + %lse_4 = arith.constant 0xFF800000 : f32 loc(#loc1014) + %lse_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1014) + %lse_6 = arith.cmpf oeq, %lse_3, %lse_5 : tensor<64xf32> loc(#loc1014) + %lse_7 = arith.constant 0.000000e+00 : f32 loc(#loc1015) + %lse_8 = arith.constant 0.000000e+00 : f32 loc(#loc1015) + %lse_9 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1015) + %lse_10 = arith.select %lse_6, %lse_9, %lse_3 : tensor<64xi1>, tensor<64xf32> loc(#loc1015) + %qkT = arith.constant 0.000000e+00 : f32 loc(#loc1016) + %qkT_11 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1016) + %qkT_12 = tt.dot %k, %qT, %qkT_11, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1016) + %qkT_13 = arith.constant 0.0883883461 : f32 loc(#loc1017) + %qkT_14 = arith.constant 0.0883883461 : f32 loc(#loc1017) + %qkT_15 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1017) + %qkT_16 = arith.mulf %qkT_12, %qkT_15 : tensor<128x64xf32> loc(#loc1017) + %m = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1018) + %m_17 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S1_64S_i32__(%m, %Q_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc1019) + %n = tt.expand_dims %offs_n1 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc1020) + %n_18 = tt.call @torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.get_bounded_indices__i32S128_1S_i32__(%n, %KV_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc1021) + %post_mod_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1022) + %post_mod_scores_19 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1023) + %post_mod_scores_20 = arith.cmpi slt, %post_mod_scores, %post_mod_scores_19 : tensor<1x64xi32> loc(#loc1023) + %post_mod_scores_21 = arith.constant 0xFF800000 : f32 loc(#loc1024) + %post_mod_scores_22 = arith.constant 0xFF800000 : f32 loc(#loc1024) + %post_mod_scores_23 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1024) + %post_mod_scores_24 = tt.broadcast %post_mod_scores_20 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1024) + %post_mod_scores_25 = arith.select %post_mod_scores_24, %qkT_16, %post_mod_scores_23 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1024) + %post_mod_scores_26 = arith.constant 1.44269502 : f32 loc(#loc1062) + %post_mod_scores_27 = arith.constant 1.44269502 : f32 loc(#loc1062) + %post_mod_scores_28 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1062) + %post_mod_scores_29 = arith.mulf %post_mod_scores_25, %post_mod_scores_28 : tensor<128x64xf32> loc(#loc1062) + %pT = tt.expand_dims %lse_10 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1063) + %pT_30 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1064) + %pT_31 = arith.subf %post_mod_scores_29, %pT_30 : tensor<128x64xf32> loc(#loc1064) + %pT_32 = math.exp2 %pT_31 : tensor<128x64xf32> loc(#loc1065) + %do = tt.call @"torch._inductor.runtime.compile_tasks.cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.load_checked_2d__Pbf16S64_128S_i32S64S_i32S128S_i32__(3,)cconstexpr_None__(4,)cconstexpr_None__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%do_ptrs, %offs_m1, %offs_v, %Q_LEN) : (tensor<64x128x!tt.ptr>, tensor<64xi32>, tensor<128xi32>, i32) -> tensor<64x128xbf16> loc(#loc1066) + %dv_33 = arith.truncf %pT_32 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1067) + %dv_34 = arith.constant 0.000000e+00 : f32 loc(#loc1068) + %dv_35 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1068) + %dv_36 = tt.dot %dv_33, %do, %dv_35, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1068) + %dv_37 = arith.addf %dv, %dv_36 : tensor<128x128xf32> loc(#loc1069) + %Di = tt.splat %Q_LEN : i32 -> tensor<64xi32> loc(#loc1070) + %Di_38 = arith.cmpi slt, %offs_m1, %Di : tensor<64xi32> loc(#loc1070) + %Di_39 = tt.splat %DELTA : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc1071) + %Di_40 = tt.addptr %Di_39, %offs_m1 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc1071) + %Di_41 = tt.load %Di_40, %Di_38 : tensor<64x!tt.ptr> loc(#loc1072) + %dpT = tt.trans %do {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc1073) + %dpT_42 = arith.constant 0.000000e+00 : f32 loc(#loc1074) + %dpT_43 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1074) + %dpT_44 = tt.dot %v, %dpT, %dpT_43, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc1074) + %dsT = tt.expand_dims %Di_41 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc1075) + %dsT_45 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc1076) + %dsT_46 = arith.subf %dpT_44, %dsT_45 : tensor<128x64xf32> loc(#loc1076) + %dsT_47 = arith.mulf %pT_32, %dsT_46 : tensor<128x64xf32> loc(#loc1077) + %grad_scores = tt.expand_dims %offs_m1 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc1078) + %grad_scores_48 = tt.splat %Q_LEN : i32 -> tensor<1x64xi32> loc(#loc1079) + %grad_scores_49 = arith.cmpi slt, %grad_scores, %grad_scores_48 : tensor<1x64xi32> loc(#loc1079) + %grad_scores_50 = arith.constant 0.000000e+00 : f32 loc(#loc1080) + %grad_scores_51 = arith.constant 0.000000e+00 : f32 loc(#loc1080) + %grad_scores_52 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1080) + %grad_scores_53 = tt.broadcast %grad_scores_49 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc1080) + %grad_scores_54 = arith.select %grad_scores_53, %dsT_47, %grad_scores_52 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc1080) + %dk_55 = arith.truncf %grad_scores_54 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc1082) + %dk_56 = tt.trans %qT {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc1083) + %dk_57 = arith.constant 0.000000e+00 : f32 loc(#loc1084) + %dk_58 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1084) + %dk_59 = tt.dot %dk_55, %dk_56, %dk_58, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc1084) + %dk_60 = arith.addf %dk, %dk_59 : tensor<128x128xf32> loc(#loc1085) + tt.return %dk_60, %dv_37 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc477) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc478) + %1 = ub.poison : tensor<128x128xf32> loc(#loc478) + tt.return %0, %1 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc478) + } loc(#loc400) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":94:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":96:54) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":96:63) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":96:49) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":97:53) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":99:53) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":100:58) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":100:67) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":100:53) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":102:9) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":103:9) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":104:10) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":105:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":106:10) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":111:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":112:36) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":113:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":115:27) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":116:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":117:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":119:15) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":120:16) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":122:28) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:47) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:35) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:59) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":125:25) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":125:47) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":125:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":125:59) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:50) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:37) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:61) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":131:9) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":132:9) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":133:10) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":135:14) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":136:26) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":137:26) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:14) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:7) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":140:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":142:29) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":143:30) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:54) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:44) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":145:35) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":146:41) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":147:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":148:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":151:35) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":152:42) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":152:54) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:55) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:78) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:50) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:83) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:68) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:30) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:52) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:40) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:63) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:32) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:55) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:42) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:66) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":160:32) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":160:55) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":160:42) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":160:66) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:30) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:35) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:46) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:56) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":163:17) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":164:19) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":167:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":168:21) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":169:25) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":172:22) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":174:36) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":175:42) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":175:29) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":178:107) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":179:111) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:58) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:34) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:57) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:33) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:26) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:30) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:50) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":191:18) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":195:30) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:27) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:41) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:53) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:39) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:42) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:29) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":207:12) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":214:39) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:31) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:45) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:62) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:43) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":218:46) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":218:33) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":226:16) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:32) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:43) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:24) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:63) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:74) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:56) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":232:14) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:48) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:59) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:76) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:87) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:69) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:30) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":239:29) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":240:30) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":242:26) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":245:28) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":246:25) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":249:22) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":250:22) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":252:25) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":253:42) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":253:29) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":256:107) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":257:107) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":262:30) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:32) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:51) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:34) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:56) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:44) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:67) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:36) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:59) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:46) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:70) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":268:36) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":268:59) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":268:46) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":268:70) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:34) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:39) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:50) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:60) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":271:21) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":272:23) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":275:25) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":276:29) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":278:39) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":279:46) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":279:58) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:58) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:80) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:53) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:81) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:70) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":286:32) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:30) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:43) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:55) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:42) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:45) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:32) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":298:16) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":306:41) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:34) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:47) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:64) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:46) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":310:49) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":310:36) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":318:20) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":303:12) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:31) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:42) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:23) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:62) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:73) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:55) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":325:26) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":326:25) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":327:25) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:50) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:71) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:61) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:30) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":334:14) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":337:29) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:31) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:27) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:45) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:53) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:41) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:64) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:71) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:59) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:29) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:69) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:4) +#loc217 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc218 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc219 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc220 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc221 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc222 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc223 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc224 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc225 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:27) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:38) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:20) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:56) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:67) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:49) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:41) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:52) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:23) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:15) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":828:4) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":384:12) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":387:26) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":388:26) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:26) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:37) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:18) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:56) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:67) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:49) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:26) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:37) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:18) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:56) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:67) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:49) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:43) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:90) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:101) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:63) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":397:28) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":405:12) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":411:64) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:28) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:19) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":415:28) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":415:19) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":417:19) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":417:8) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":419:11) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":419:4) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":458:105) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":459:19) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":461:14) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":464:36) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":464:46) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":467:36) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":467:46) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":476:43) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":476:54) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":476:79) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":479:35) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":482:23) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":483:23) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:34) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:23) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":486:22) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":487:23) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":488:23) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":489:23) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":490:23) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":491:23) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":492:35) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":493:24) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":494:24) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":495:32) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":496:25) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":497:92) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":498:92) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":499:25) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":500:24) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":501:24) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":502:39) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":503:25) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":504:24) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":505:24) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":506:23) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":507:25) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":508:25) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":509:92) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":510:25) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":511:24) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":512:24) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":513:39) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":514:25) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":515:24) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":516:24) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":521:69) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":524:27) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:39) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:21) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":528:104) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":530:20) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:22) +#loc323 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:19) +#loc324 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:14) +#loc325 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":538:39) +#loc326 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":538:50) +#loc327 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":538:71) +#loc328 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":542:32) +#loc329 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":542:43) +#loc330 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":542:62) +#loc331 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":542:73) +#loc332 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":542:54) +#loc333 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":549:43) +#loc334 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":551:15) +#loc335 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:30) +#loc336 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:21) +#loc337 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:10) +#loc338 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":555:11) +#loc339 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":555:4) +#loc340 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:41) +#loc341 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:52) +#loc342 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:23) +#loc343 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:15) +#loc345 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":798:21) +#loc346 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":798:11) +#loc347 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":798:4) +#loc349 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":788:33) +#loc350 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:38) +#loc351 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:24) +#loc352 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:109) +#loc353 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:113) +#loc354 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:39) +#loc355 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:55) +#loc356 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:25) +#loc357 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:30) +#loc358 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:35) +#loc359 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:60) +#loc360 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:34) +#loc361 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:48) +#loc362 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:63) +#loc363 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:29) +#loc364 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:47) +#loc365 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:61) +#loc366 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:42) +#loc367 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":794:11) +#loc368 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":794:4) +#loc370 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":595:12) +#loc371 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":598:26) +#loc372 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":599:26) +#loc373 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:26) +#loc374 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:37) +#loc375 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:18) +#loc376 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:56) +#loc377 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:67) +#loc378 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:49) +#loc379 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:27) +#loc380 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:38) +#loc381 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:19) +#loc382 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:58) +#loc383 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:69) +#loc384 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:51) +#loc385 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:42) +#loc386 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:87) +#loc387 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:98) +#loc388 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:61) +#loc389 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":610:28) +#loc390 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":618:12) +#loc391 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":623:62) +#loc392 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:28) +#loc393 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:19) +#loc394 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:28) +#loc395 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:19) +#loc396 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":628:19) +#loc397 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":628:8) +#loc398 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":630:11) +#loc399 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":630:4) +#loc401 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":669:105) +#loc402 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:52) +#loc403 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:28) +#loc404 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:22) +#loc405 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:26) +#loc406 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:46) +#loc407 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":676:20) +#loc408 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":678:15) +#loc409 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":680:36) +#loc410 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":680:46) +#loc411 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":683:36) +#loc412 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":683:46) +#loc413 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":692:43) +#loc414 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":692:54) +#loc415 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":692:78) +#loc416 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":695:36) +#loc417 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":698:25) +#loc418 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":699:25) +#loc419 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:35) +#loc420 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:24) +#loc421 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":702:24) +#loc422 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":703:25) +#loc423 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":704:24) +#loc424 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":705:24) +#loc425 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":706:24) +#loc426 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":707:24) +#loc427 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":708:35) +#loc428 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":709:25) +#loc429 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":710:25) +#loc430 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":711:32) +#loc431 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":712:25) +#loc432 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":713:92) +#loc433 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":714:92) +#loc434 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":715:25) +#loc435 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":716:24) +#loc436 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":717:24) +#loc437 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":718:39) +#loc438 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":719:25) +#loc439 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":720:24) +#loc440 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":721:24) +#loc441 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":722:24) +#loc442 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":723:25) +#loc443 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":724:25) +#loc444 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":725:92) +#loc445 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":726:25) +#loc446 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":727:24) +#loc447 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":728:24) +#loc448 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":729:39) +#loc449 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":730:25) +#loc450 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":731:24) +#loc451 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":732:24) +#loc452 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":736:69) +#loc453 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":739:27) +#loc454 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:44) +#loc455 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:40) +#loc456 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:22) +#loc457 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":741:99) +#loc458 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:24) +#loc459 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:43) +#loc460 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:10) +#loc461 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:53) +#loc462 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:29) +#loc463 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:21) +#loc464 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:29) +#loc465 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:20) +#loc466 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:25) +#loc467 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:22) +#loc468 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:16) +#loc469 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":759:39) +#loc470 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":759:50) +#loc471 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":759:70) +#loc472 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":773:45) +#loc473 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:24) +#loc474 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:52) +#loc475 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:43) +#loc476 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:10) +#loc477 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":777:11) +#loc478 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":777:4) +#loc501 = loc("ZQ"(#loc13)) +#loc502 = loc("HQ"(#loc14)) +#loc503 = loc("HKV"(#loc15)) +#loc504 = loc("Q_LEN"(#loc16)) +#loc505 = loc("ZKV"(#loc17)) +#loc506 = loc("pid"(#loc18)) +#loc507 = loc("NUM_KV_BLOCKS"(#loc19)) +#loc508 = loc("NUM_Q_BLOCKS"(#loc20)) +#loc509 = loc("off_zq"(#loc21)) +#loc510 = loc("off_hkv"(#loc22)) +#loc511 = loc("off_zkv"(#loc23)) +#loc512 = loc("SPARSE_Z"(#loc24)) +#loc513 = loc("SPARSE_HQ"(#loc25)) +#loc514 = loc("sparse_idx_z"(#loc26)) +#loc515 = loc("k_adj"(#loc27)) +#loc516 = loc("k_adj"(#loc28)) +#loc517 = loc("k_adj"(#loc29)) +#loc518 = loc("k_adj"(#loc30)) +#loc519 = loc("v_adj"(#loc31)) +#loc520 = loc("v_adj"(#loc32)) +#loc521 = loc("v_adj"(#loc33)) +#loc522 = loc("v_adj"(#loc34)) +#loc523 = loc("dv_adj"(#loc35)) +#loc524 = loc("dv_adj"(#loc36)) +#loc525 = loc("dv_adj"(#loc37)) +#loc526 = loc("dv_adj"(#loc38)) +#loc527 = loc("K"(#loc39)) +#loc528 = loc("V"(#loc40)) +#loc529 = loc("DV"(#loc41)) +#loc530 = loc("RCP_LN2"(#loc42)) +#loc531 = loc("offs_k"(#loc43)) +#loc532 = loc("offs_v"(#loc44)) +#loc533 = loc("off_pid"(#loc47)) +#loc534 = loc("SPARSE_Q_MULTIPLE"(#loc48)) +#loc535 = loc("SPARSE_KV_MULTIPLE"(#loc49)) +#loc536 = loc("off_hq2"(#loc50)) +#loc537 = loc("off_hq2"(#loc51)) +#loc538 = loc("off_hq2"(#loc52)) +#loc539 = loc("start_m2_block"(#loc53)) +#loc540 = loc("off_pid_mask"(#loc54)) +#loc541 = loc("stride_kv_num_blks_h"(#loc55)) +#loc542 = loc("stride_kv_idx_h"(#loc56)) +#loc543 = loc("sparse_idx_hq2"(#loc57)) +#loc544 = loc("sparse_hz_offset"(#loc58)) +#loc545 = loc("sparse_hz_offset"(#loc59)) +#loc546 = loc("sparse_kv_num_blks_offset"(#loc60)) +#loc547 = loc("sparse_kv_num_blks_offset"(#loc61)) +#loc548 = loc("sparse_kv_idx_offset"(#loc62)) +#loc549 = loc("sparse_kv_idx_offset"(#loc63)) +#loc550 = loc("sparse_kv_idx_offset"(#loc64)) +#loc551 = loc("q_adj2"(#loc65)) +#loc552 = loc("q_adj2"(#loc66)) +#loc553 = loc("q_adj2"(#loc67)) +#loc554 = loc("q_adj2"(#loc68)) +#loc555 = loc("do_adj2"(#loc69)) +#loc556 = loc("do_adj2"(#loc70)) +#loc557 = loc("do_adj2"(#loc71)) +#loc558 = loc("do_adj2"(#loc72)) +#loc559 = loc("dq_adj2"(#loc73)) +#loc560 = loc("dq_adj2"(#loc74)) +#loc561 = loc("dq_adj2"(#loc75)) +#loc562 = loc("dq_adj2"(#loc76)) +#loc563 = loc("off_chz2"(#loc77)) +#loc564 = loc("off_chz2"(#loc78)) +#loc565 = loc("off_chz2"(#loc79)) +#loc566 = loc("off_chz2"(#loc80)) +#loc567 = loc("Q2"(#loc81)) +#loc568 = loc("DO2"(#loc82)) +#loc569 = loc("DQ2"(#loc83)) +#loc570 = loc("LSE2"(#loc84)) +#loc571 = loc("DELTA2"(#loc85)) +#loc572 = loc("dq"(#loc86)) +#loc573 = loc("start_m2"(#loc87)) +#loc574 = loc("offs_m2"(#loc88)) +#loc575 = loc("offs_m2"(#loc89)) +#loc576 = loc("q"(#loc90)) +#loc577 = loc("do"(#loc91)) +#loc578 = loc("Di"(#loc92)) +#loc579 = loc("Di"(#loc93)) +#loc580 = loc("Di"(#loc94)) +#loc581 = loc("lse"(#loc95)) +#loc582 = loc("lse"(#loc96)) +#loc583 = loc("lse"(#loc97)) +#loc584 = loc("lse"(#loc98)) +#loc585 = loc("lse"(#loc99)) +#loc586 = loc("lse"(#loc100)) +#loc587 = loc("kv_indices"(#loc101)) +#loc588 = loc("kv_start"(#loc102)) +#loc589 = loc("kv_start"(#loc103)) +#loc590 = loc("sparse_kv_num_blocks"(#loc104)) +#loc591 = loc("sparse_kv_num_blocks"(#loc105)) +#loc592 = loc("offs_n2"(#loc106)) +#loc593 = loc("offs_n2"(#loc107)) +#loc594 = loc("dq"(#loc108)) +#loc595 = loc("kv_indices"(#loc109)) +#loc596 = loc("kv_start"(#loc110)) +#loc597 = loc("kv_start"(#loc111)) +#loc598 = loc("sparse_kv_num_blocks"(#loc112)) +#loc599 = loc("sparse_kv_num_blocks"(#loc113)) +#loc600 = loc("offs_n2"(#loc114)) +#loc601 = loc("offs_n2"(#loc115)) +#loc602 = loc("dq"(#loc116)) +#loc603 = loc("dq_ptrs"(#loc117)) +#loc604 = loc("dq_ptrs"(#loc118)) +#loc605 = loc("dq_ptrs"(#loc119)) +#loc606 = loc("dq_ptrs"(#loc120)) +#loc607 = loc("dq_ptrs"(#loc121)) +#loc608 = loc("dq_ptrs"(#loc122)) +#loc609 = loc("dq"(#loc123)) +#loc610 = loc("SPARSE_Q_MULTIPLE"(#loc130)) +#loc611 = loc("SPARSE_KV_MULTIPLE"(#loc131)) +#loc612 = loc("pid_mask"(#loc132)) +#loc613 = loc("stride_q_idx_h"(#loc133)) +#loc614 = loc("stride_q_idx_n"(#loc134)) +#loc615 = loc("dv"(#loc135)) +#loc616 = loc("dk"(#loc136)) +#loc617 = loc("start_n1"(#loc137)) +#loc618 = loc("offs_n1"(#loc138)) +#loc619 = loc("offs_n1"(#loc139)) +#loc620 = loc("k"(#loc140)) +#loc621 = loc("v"(#loc141)) +#loc622 = loc("dv"(#loc142)) +#loc623 = loc("off_hq1"(#loc143)) +#loc624 = loc("off_hq1"(#loc144)) +#loc625 = loc("q_adj1"(#loc145)) +#loc626 = loc("q_adj1"(#loc146)) +#loc627 = loc("q_adj1"(#loc147)) +#loc628 = loc("q_adj1"(#loc148)) +#loc629 = loc("do_adj1"(#loc149)) +#loc630 = loc("do_adj1"(#loc150)) +#loc631 = loc("do_adj1"(#loc151)) +#loc632 = loc("do_adj1"(#loc152)) +#loc633 = loc("dq_adj1"(#loc153)) +#loc634 = loc("dq_adj1"(#loc154)) +#loc635 = loc("dq_adj1"(#loc155)) +#loc636 = loc("dq_adj1"(#loc156)) +#loc637 = loc("off_chz1"(#loc157)) +#loc638 = loc("off_chz1"(#loc158)) +#loc639 = loc("off_chz1"(#loc159)) +#loc640 = loc("off_chz1"(#loc160)) +#loc641 = loc("Q1"(#loc161)) +#loc642 = loc("DO1"(#loc162)) +#loc643 = loc("LSE1"(#loc163)) +#loc644 = loc("DELTA1"(#loc164)) +#loc645 = loc("sparse_idx_hq1"(#loc165)) +#loc646 = loc("sparse_hz_offset"(#loc166)) +#loc647 = loc("sparse_hz_offset"(#loc167)) +#loc648 = loc("sparse_q_num_blks_offset"(#loc168)) +#loc649 = loc("sparse_q_num_blks_offset"(#loc169)) +#loc650 = loc("sparse_q_idx_offset"(#loc170)) +#loc651 = loc("sparse_q_idx_offset"(#loc171)) +#loc652 = loc("sparse_q_idx_offset"(#loc172)) +#loc653 = loc("q_indices"(#loc173)) +#loc654 = loc("q_start"(#loc174)) +#loc655 = loc("q_start"(#loc175)) +#loc656 = loc("sparse_q_num_blocks"(#loc176)) +#loc657 = loc("sparse_q_num_blocks"(#loc177)) +#loc658 = loc("offs_m1"(#loc178)) +#loc659 = loc("offs_m1"(#loc179)) +#loc660 = loc("q_indices"(#loc181)) +#loc661 = loc("q_start"(#loc182)) +#loc662 = loc("q_start"(#loc183)) +#loc663 = loc("sparse_q_num_blocks"(#loc184)) +#loc664 = loc("sparse_q_num_blocks"(#loc185)) +#loc665 = loc("offs_m1"(#loc186)) +#loc666 = loc("offs_m1"(#loc187)) +#loc667 = loc("dv_ptrs"(#loc190)) +#loc668 = loc("dv_ptrs"(#loc191)) +#loc669 = loc("dv_ptrs"(#loc192)) +#loc670 = loc("dv_ptrs"(#loc193)) +#loc671 = loc("dv_ptrs"(#loc194)) +#loc672 = loc("dv_ptrs"(#loc195)) +#loc673 = loc("index_n"(#loc196)) +#loc674 = loc("index_k"(#loc197)) +#loc675 = loc("index_v"(#loc198)) +#loc676 = loc("dk"(#loc203)) +#loc677 = loc("mask"(#loc204)) +#loc678 = loc("xindex"(#loc205)) +#loc679 = loc("xindex"(#loc206)) +#loc680 = loc("xindex"(#loc207)) +#loc681 = loc("xindex"(#loc208)) +#loc682 = loc("xindex"(#loc209)) +#loc683 = loc("xindex"(#loc210)) +#loc684 = loc("xindex"(#loc211)) +#loc685 = loc("xindex"(#loc212)) +#loc693 = loc("ptr"(#loc227)) +#loc694 = loc("ptr"(#loc228)) +#loc695 = loc("ptr"(#loc229)) +#loc696 = loc("ptr"(#loc230)) +#loc697 = loc("ptr"(#loc231)) +#loc698 = loc("ptr"(#loc232)) +#loc738 = loc("Q_LEN"(#loc239)) +#loc739 = loc("offs_k"(#loc240)) +#loc740 = loc("offs_v"(#loc241)) +#loc741 = loc("kT_ptrs"(#loc242)) +#loc742 = loc("kT_ptrs"(#loc243)) +#loc743 = loc("kT_ptrs"(#loc244)) +#loc744 = loc("kT_ptrs"(#loc245)) +#loc745 = loc("kT_ptrs"(#loc246)) +#loc746 = loc("kT_ptrs"(#loc247)) +#loc747 = loc("vT_ptrs"(#loc248)) +#loc748 = loc("vT_ptrs"(#loc249)) +#loc749 = loc("vT_ptrs"(#loc250)) +#loc750 = loc("vT_ptrs"(#loc251)) +#loc751 = loc("vT_ptrs"(#loc252)) +#loc752 = loc("vT_ptrs"(#loc253)) +#loc753 = loc("hi"(#loc254)) +#loc754 = loc("hi"(#loc255)) +#loc755 = loc("hi"(#loc256)) +#loc756 = loc("hi"(#loc257)) +#loc757 = loc("dq"(#loc258)) +#loc758 = loc("dq"(#loc259)) +#loc759 = loc("offset"(#loc260)) +#loc760 = loc("kT_ptrs"(#loc261)) +#loc761 = loc("kT_ptrs"(#loc262)) +#loc762 = loc("vT_ptrs"(#loc263)) +#loc763 = loc("vT_ptrs"(#loc264)) +#loc764 = loc("offs_n2"(#loc265)) +#loc808 = loc("kT"(#loc270)) +#loc809 = loc("qk"(#loc271)) +#loc810 = loc("qk"(#loc272)) +#loc811 = loc("n"(#loc273)) +#loc812 = loc("n"(#loc274)) +#loc813 = loc("m"(#loc275)) +#loc814 = loc("m"(#loc276)) +#loc815 = loc("post_mod_scores"(#loc277)) +#loc816 = loc("post_mod_scores"(#loc278)) +#loc817 = loc("post_mod_scores"(#loc279)) +#loc818 = loc("tmp1"(#loc280)) +#loc819 = loc("tmp4"(#loc281)) +#loc820 = loc("tmp5"(#loc282)) +#loc821 = loc("tmp7"(#loc283)) +#loc822 = loc("tmp7"(#loc284)) +#loc823 = loc("tmp8"(#loc285)) +#loc824 = loc("tmp9"(#loc286)) +#loc825 = loc("tmp10"(#loc287)) +#loc826 = loc("tmp11"(#loc288)) +#loc827 = loc("tmp12"(#loc289)) +#loc828 = loc("tmp13"(#loc290)) +#loc829 = loc("tmp14"(#loc291)) +#loc830 = loc("tmp15"(#loc292)) +#loc831 = loc("tmp16"(#loc293)) +#loc832 = loc("tmp17"(#loc294)) +#loc833 = loc("tmp18"(#loc295)) +#loc834 = loc("tmp19"(#loc296)) +#loc835 = loc("tmp20"(#loc297)) +#loc836 = loc("tmp21"(#loc298)) +#loc837 = loc("tmp22"(#loc299)) +#loc838 = loc("tmp23"(#loc300)) +#loc839 = loc("tmp24"(#loc301)) +#loc840 = loc("tmp25"(#loc302)) +#loc841 = loc("tmp26"(#loc303)) +#loc842 = loc("tmp27"(#loc304)) +#loc843 = loc("tmp28"(#loc305)) +#loc844 = loc("tmp29"(#loc306)) +#loc845 = loc("tmp30"(#loc307)) +#loc846 = loc("tmp31"(#loc308)) +#loc847 = loc("tmp32"(#loc309)) +#loc848 = loc("tmp33"(#loc310)) +#loc849 = loc("tmp34"(#loc311)) +#loc850 = loc("tmp35"(#loc312)) +#loc851 = loc("tmp36"(#loc313)) +#loc852 = loc("tmp37"(#loc314)) +#loc853 = loc("tmp38"(#loc315)) +#loc854 = loc("post_mod_scores"(#loc316)) +#loc855 = loc("post_mod_scores"(#loc317)) +#loc856 = loc("p"(#loc318)) +#loc857 = loc("p"(#loc319)) +#loc858 = loc("vT"(#loc320)) +#loc859 = loc("dp"(#loc321)) +#loc860 = loc("ds"(#loc322)) +#loc861 = loc("ds"(#loc323)) +#loc862 = loc("ds"(#loc324)) +#loc863 = loc("grad_scores"(#loc325)) +#loc864 = loc("grad_scores"(#loc326)) +#loc865 = loc("grad_scores"(#loc327)) +#loc866 = loc("scatter_mask"(#loc328)) +#loc867 = loc("scatter_mask"(#loc329)) +#loc868 = loc("scatter_mask"(#loc330)) +#loc869 = loc("scatter_mask"(#loc331)) +#loc870 = loc("scatter_mask"(#loc332)) +#loc871 = loc("ds"(#loc333)) +#loc872 = loc("ds"(#loc334)) +#loc873 = loc("dq"(#loc335)) +#loc874 = loc("dq"(#loc336)) +#loc875 = loc("dq"(#loc337)) +#loc882 = loc("cur_block_idx"(#loc349)) +#loc883 = loc("cur_block"(#loc350)) +#loc884 = loc("cur_block"(#loc351)) +#loc885 = loc("next_block"(#loc352)) +#loc886 = loc("next_block"(#loc353)) +#loc887 = loc("next_block"(#loc354)) +#loc888 = loc("next_block"(#loc355)) +#loc889 = loc("next_block"(#loc356)) +#loc890 = loc("needs_jump"(#loc357)) +#loc891 = loc("needs_jump"(#loc358)) +#loc892 = loc("needs_jump"(#loc359)) +#loc893 = loc("jump_to_block"(#loc360)) +#loc894 = loc("jump_to_block"(#loc361)) +#loc895 = loc("jump_to_block"(#loc362)) +#loc896 = loc("offset"(#loc363)) +#loc897 = loc("offset"(#loc364)) +#loc898 = loc("offset"(#loc365)) +#loc899 = loc("offset"(#loc366)) +#loc940 = loc("Q_LEN"(#loc370)) +#loc941 = loc("offs_k"(#loc371)) +#loc942 = loc("offs_v"(#loc372)) +#loc943 = loc("qT_ptrs"(#loc373)) +#loc944 = loc("qT_ptrs"(#loc374)) +#loc945 = loc("qT_ptrs"(#loc375)) +#loc946 = loc("qT_ptrs"(#loc376)) +#loc947 = loc("qT_ptrs"(#loc377)) +#loc948 = loc("qT_ptrs"(#loc378)) +#loc949 = loc("do_ptrs"(#loc379)) +#loc950 = loc("do_ptrs"(#loc380)) +#loc951 = loc("do_ptrs"(#loc381)) +#loc952 = loc("do_ptrs"(#loc382)) +#loc953 = loc("do_ptrs"(#loc383)) +#loc954 = loc("do_ptrs"(#loc384)) +#loc955 = loc("hi"(#loc385)) +#loc956 = loc("hi"(#loc386)) +#loc957 = loc("hi"(#loc387)) +#loc958 = loc("hi"(#loc388)) +#loc959 = loc("dk"(#loc389)) +#loc960 = loc("offset"(#loc391)) +#loc961 = loc("qT_ptrs"(#loc392)) +#loc962 = loc("qT_ptrs"(#loc393)) +#loc963 = loc("do_ptrs"(#loc394)) +#loc964 = loc("do_ptrs"(#loc395)) +#loc965 = loc("offs_m1"(#loc396)) +#loc1010 = loc("qT"(#loc401)) +#loc1011 = loc("lse"(#loc402)) +#loc1012 = loc("lse"(#loc403)) +#loc1013 = loc("lse"(#loc404)) +#loc1014 = loc("lse"(#loc405)) +#loc1015 = loc("lse"(#loc406)) +#loc1016 = loc("qkT"(#loc407)) +#loc1017 = loc("qkT"(#loc408)) +#loc1018 = loc("m"(#loc409)) +#loc1019 = loc("m"(#loc410)) +#loc1020 = loc("n"(#loc411)) +#loc1021 = loc("n"(#loc412)) +#loc1022 = loc("post_mod_scores"(#loc413)) +#loc1023 = loc("post_mod_scores"(#loc414)) +#loc1024 = loc("post_mod_scores"(#loc415)) +#loc1025 = loc("tmp41"(#loc416)) +#loc1026 = loc("tmp44"(#loc417)) +#loc1027 = loc("tmp45"(#loc418)) +#loc1028 = loc("tmp47"(#loc419)) +#loc1029 = loc("tmp47"(#loc420)) +#loc1030 = loc("tmp48"(#loc421)) +#loc1031 = loc("tmp49"(#loc422)) +#loc1032 = loc("tmp50"(#loc423)) +#loc1033 = loc("tmp51"(#loc424)) +#loc1034 = loc("tmp52"(#loc425)) +#loc1035 = loc("tmp53"(#loc426)) +#loc1036 = loc("tmp54"(#loc427)) +#loc1037 = loc("tmp55"(#loc428)) +#loc1038 = loc("tmp56"(#loc429)) +#loc1039 = loc("tmp57"(#loc430)) +#loc1040 = loc("tmp58"(#loc431)) +#loc1041 = loc("tmp59"(#loc432)) +#loc1042 = loc("tmp60"(#loc433)) +#loc1043 = loc("tmp61"(#loc434)) +#loc1044 = loc("tmp62"(#loc435)) +#loc1045 = loc("tmp63"(#loc436)) +#loc1046 = loc("tmp64"(#loc437)) +#loc1047 = loc("tmp65"(#loc438)) +#loc1048 = loc("tmp66"(#loc439)) +#loc1049 = loc("tmp67"(#loc440)) +#loc1050 = loc("tmp68"(#loc441)) +#loc1051 = loc("tmp69"(#loc442)) +#loc1052 = loc("tmp70"(#loc443)) +#loc1053 = loc("tmp71"(#loc444)) +#loc1054 = loc("tmp72"(#loc445)) +#loc1055 = loc("tmp73"(#loc446)) +#loc1056 = loc("tmp74"(#loc447)) +#loc1057 = loc("tmp75"(#loc448)) +#loc1058 = loc("tmp76"(#loc449)) +#loc1059 = loc("tmp77"(#loc450)) +#loc1060 = loc("tmp78"(#loc451)) +#loc1061 = loc("post_mod_scores"(#loc452)) +#loc1062 = loc("post_mod_scores"(#loc453)) +#loc1063 = loc("pT"(#loc454)) +#loc1064 = loc("pT"(#loc455)) +#loc1065 = loc("pT"(#loc456)) +#loc1066 = loc("do"(#loc457)) +#loc1067 = loc("dv"(#loc458)) +#loc1068 = loc("dv"(#loc459)) +#loc1069 = loc("dv"(#loc460)) +#loc1070 = loc("Di"(#loc461)) +#loc1071 = loc("Di"(#loc462)) +#loc1072 = loc("Di"(#loc463)) +#loc1073 = loc("dpT"(#loc464)) +#loc1074 = loc("dpT"(#loc465)) +#loc1075 = loc("dsT"(#loc466)) +#loc1076 = loc("dsT"(#loc467)) +#loc1077 = loc("dsT"(#loc468)) +#loc1078 = loc("grad_scores"(#loc469)) +#loc1079 = loc("grad_scores"(#loc470)) +#loc1080 = loc("grad_scores"(#loc471)) +#loc1081 = loc("dsT"(#loc472)) +#loc1082 = loc("dk"(#loc473)) +#loc1083 = loc("dk"(#loc474)) +#loc1084 = loc("dk"(#loc475)) +#loc1085 = loc("dk"(#loc476)) +#loc1086 = loc("SPARSE_Q_MULTIPLE"(#loc534)) +#loc1087 = loc("SPARSE_KV_MULTIPLE"(#loc535)) +#loc1088 = loc("SPARSE_Q_MULTIPLE"(#loc610)) +#loc1089 = loc("SPARSE_KV_MULTIPLE"(#loc611)) +#loc1090 = loc("dk"(#loc622)) +#loc1091 = loc("offs_n2"(#loc757)) +#loc1092 = loc("dv"(#loc959)) +#loc1093 = loc("kT_ptrs"(#loc1091)) +#loc1094 = loc("offs_m1"(#loc1092)) +#loc1095 = loc("vT_ptrs"(#loc1093)) +#loc1096 = loc("qT_ptrs"(#loc1094)) +#loc1097 = loc("do_ptrs"(#loc1096)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5f850e9676289167a4c4e9d5754686200be3abf4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttgir @@ -0,0 +1,2014 @@ +#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":18:0) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> +#smem = #ttg.shared_memory +#loc319 = loc("arg_Q"(#loc)) +#loc320 = loc("arg_K"(#loc)) +#loc321 = loc("arg_V"(#loc)) +#loc322 = loc("arg_LSE"(#loc)) +#loc323 = loc("arg_DELTA"(#loc)) +#loc324 = loc("arg_DO"(#loc)) +#loc325 = loc("arg_DQ"(#loc)) +#loc326 = loc("arg_DV"(#loc)) +#loc327 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc328 = loc("arg_KV_IDX"(#loc)) +#loc329 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc330 = loc("arg_Q_IDX"(#loc)) +#loc331 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc332 = loc("arg_FULL_KV_IDX"(#loc)) +#loc333 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc334 = loc("arg_FULL_Q_IDX"(#loc)) +#loc335 = loc("in_ptr16"(#loc)) +#loc336 = loc("out_ptr0"(#loc)) +#loc337 = loc("ks0"(#loc)) +#loc338 = loc("ks1"(#loc)) +#loc339 = loc("ks2"(#loc)) +#loc340 = loc("ks3"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2048> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<2048> : tensor<128x64xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<64x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<128> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<4096> : tensor<1x64xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked2> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<128x64xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<128x1xi32, #blocked2> loc(#loc1) + %cst_9 = arith.constant dense<128> : tensor<1x128xi32, #blocked2> loc(#loc1) + %cst_10 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<2048> : tensor<128x1xi32, #blocked2> loc(#loc1) + %cst_12 = arith.constant dense<4096> : tensor<128x1xi32, #blocked2> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked2> loc(#loc1) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16, #blocked1> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked2> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_16 = arith.constant dense<0.0883883461> : tensor<128x128xf32, #mma> loc(#loc1) + %cst_17 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> loc(#loc1) + %cst_19 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma1> loc(#loc1) + %cst_20 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma1> loc(#loc1) + %cst_21 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma1> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %cst_22 = arith.constant dense<8192> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_23 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc1) + %cst_24 = arith.constant dense<262144> : tensor<128x64xi32, #blocked1> loc(#loc1) + %cst_25 = arith.constant dense<8192> : tensor<64x128xi32, #blocked2> loc(#loc1) + %cst_26 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc1) + %cst_27 = arith.constant dense<64> : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc1) + %cst_28 = arith.constant dense<2048> : tensor<1x64xi32, #mma1> loc(#loc1) + %cst_29 = arith.constant dense<2048> : tensor<128x64xi32, #mma1> loc(#loc1) + %cst_30 = arith.constant dense<2048> : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc1) + %cst_31 = arith.constant dense<0xFF800000> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc1) + %cst_32 = arith.constant dense<0.000000e+00> : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc1) + %cst_33 = arith.constant dense<0> : tensor<128x64xi32, #mma1> loc(#loc1) + %cst_34 = arith.constant dense<0> : tensor<1x64xi32, #mma1> loc(#loc1) + %cst_35 = arith.constant dense<2048> : tensor<128x1xi32, #mma1> loc(#loc1) + %cst_36 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc1) + %cst_37 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc1) + %cst_38 = arith.constant dense<2048> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc2) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc3) + %pid = tt.get_program_id x : i32 loc(#loc341) + %NUM_KV_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc634) + %NUM_KV_BLOCKS_39 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc635) + %off_zq = tt.get_program_id y : i32 loc(#loc343) + %off_hkv = tt.get_program_id z : i32 loc(#loc344) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc345) + %k_adj = arith.muli %1, %off_hkv : i32 loc(#loc346) + %k_adj_40 = arith.muli %0, %off_zkv : i32 loc(#loc347) + %k_adj_41 = arith.addi %k_adj, %k_adj_40 : i32 loc(#loc348) + %k_adj_42 = arith.extsi %k_adj_41 : i32 to i64 loc(#loc349) + %dv_adj = arith.muli %0, %off_zq : i32 loc(#loc350) + %dv_adj_43 = arith.addi %k_adj, %dv_adj : i32 loc(#loc351) + %dv_adj_44 = arith.extsi %dv_adj_43 : i32 to i64 loc(#loc352) + %K = tt.addptr %arg_K, %k_adj_42 : !tt.ptr, i64 loc(#loc353) + %V = tt.addptr %arg_V, %k_adj_42 : !tt.ptr, i64 loc(#loc354) + %DV = tt.addptr %arg_DV, %dv_adj_44 : !tt.ptr, i64 loc(#loc355) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc356) + %offs_k_45 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc356) + %offs_k_46 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc356) + %2 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_39 : i32 loc(#loc22) + scf.if %2 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_39 : i32 loc(#loc357) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc358) + %off_hq2_47 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc359) + %off_hq2_48 = arith.addi %off_hq2, %off_hq2_47 : i32 loc(#loc360) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc361) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc362) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc363) + %sparse_kv_num_blks_offset_49 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc364) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc365) + %sparse_kv_idx_offset_50 = arith.muli %start_m2_block, %ks1 : i32 loc(#loc366) + %sparse_kv_idx_offset_51 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_50 : i32 loc(#loc367) + %q_adj2 = arith.muli %off_hq2_48, %c128_i32 : i32 loc(#loc368) + %q_adj2_52 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc369) + %q_adj2_53 = arith.addi %q_adj2, %q_adj2_52 : i32 loc(#loc370) + %q_adj2_54 = arith.extsi %q_adj2_53 : i32 to i64 loc(#loc371) + %do_adj2 = arith.muli %off_hq2_48, %c262144_i32 : i32 loc(#loc372) + %do_adj2_55 = arith.addi %do_adj2, %q_adj2_52 : i32 loc(#loc373) + %do_adj2_56 = arith.extsi %do_adj2_55 : i32 to i64 loc(#loc374) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc375) + %off_chz2_57 = arith.addi %off_chz2, %off_hq2_48 : i32 loc(#loc376) + %off_chz2_58 = arith.muli %off_chz2_57, %c2048_i32 : i32 loc(#loc377) + %off_chz2_59 = arith.extsi %off_chz2_58 : i32 to i64 loc(#loc378) + %Q2 = tt.addptr %arg_Q, %q_adj2_54 : !tt.ptr, i64 loc(#loc379) + %DO2 = tt.addptr %arg_DO, %do_adj2_56 : !tt.ptr, i64 loc(#loc380) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_54 : !tt.ptr, i64 loc(#loc381) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_59 : !tt.ptr, i64 loc(#loc382) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_59 : !tt.ptr, i64 loc(#loc383) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc384) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc385) + %offs_m2_60 = tt.splat %start_m2 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc385) + %offs_m2_61 = arith.addi %offs_m2, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc385) + %offs_m2_62 = arith.addi %offs_m2_60, %offs_k_45 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc385) + %ptr = tt.expand_dims %offs_m2_61 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> loc(#loc636) + %ptr_63 = tt.expand_dims %offs_m2_62 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> -> tensor<128x1xi32, #mma1> loc(#loc636) + %ptr_64 = arith.muli %ptr, %cst_12 : tensor<128x1xi32, #blocked2> loc(#loc637) + %ptr_65 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc638) + %ptr_66 = tt.addptr %ptr_65, %ptr_64 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc638) + %ptr_67 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc639) + %ptr_68 = tt.expand_dims %ptr_67 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc639) + %ptr_69 = tt.broadcast %ptr_66 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc640) + %ptr_70 = tt.broadcast %ptr_68 : tensor<1x128xi32, #blocked2> -> tensor<128x128xi32, #blocked2> loc(#loc640) + %ptr_71 = tt.addptr %ptr_69, %ptr_70 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc640) + %q = arith.cmpi slt, %ptr, %cst_11 : tensor<128x1xi32, #blocked2> loc(#loc641) + %q_72 = tt.broadcast %q : tensor<128x1xi1, #blocked2> -> tensor<128x128xi1, #blocked2> loc(#loc642) + %q_73 = tt.load %ptr_71, %q_72, %cst_13 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc642) + %q_74 = ttg.local_alloc %q_73 : (tensor<128x128xbf16, #blocked2>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc642) + %ptr_75 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked2> loc(#loc643) + %ptr_76 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc644) + %ptr_77 = tt.addptr %ptr_76, %ptr_75 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc644) + %ptr_78 = tt.broadcast %ptr_77 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc645) + %ptr_79 = tt.addptr %ptr_78, %ptr_70 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc645) + %do = tt.load %ptr_79, %q_72, %cst_13 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc646) + %do_80 = ttg.local_alloc %do : (tensor<128x128xbf16, #blocked2>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc646) + %Di = arith.cmpi slt, %offs_m2_62, %cst_38 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc393) + %Di_81 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc394) + %Di_82 = tt.addptr %Di_81, %offs_m2_62 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc394) + %Di_83 = tt.load %Di_82, %Di : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc395) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc396) + %lse_84 = tt.addptr %lse, %offs_m2_62 : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc396) + %lse_85 = tt.load %lse_84, %Di : tensor<128x!tt.ptr, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc397) + %lse_86 = arith.cmpf oeq, %lse_85, %cst_37 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc398) + %lse_87 = arith.select %lse_86, %cst_36, %lse_85 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma1}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> loc(#loc399) + %lse_88 = tt.expand_dims %lse_87 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> -> tensor<128x1xf32, #mma1> loc(#loc400) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_51 : !tt.ptr, i32 loc(#loc401) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc402) + %kv_start_89 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc403) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_49 : !tt.ptr, i32 loc(#loc404) + %sparse_kv_num_blocks_90 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc405) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc406) + %offs_n2_91 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc406) + %offs_n2_92 = tt.splat %kv_start_89 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc407) + %offs_n2_93 = tt.splat %kv_start_89 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc407) + %offs_n2_94 = arith.addi %offs_n2_92, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc407) + %offs_n2_95 = arith.addi %offs_n2_93, %offs_n2_91 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc407) + %kT_ptrs = tt.expand_dims %offs_n2_95 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc647) + %kT_ptrs_96 = arith.muli %kT_ptrs, %cst_4 : tensor<1x64xi32, #blocked1> loc(#loc648) + %kT_ptrs_97 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc649) + %kT_ptrs_98 = tt.addptr %kT_ptrs_97, %kT_ptrs_96 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc649) + %kT_ptrs_99 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc650) + %kT_ptrs_100 = tt.expand_dims %kT_ptrs_99 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc650) + %kT_ptrs_101 = tt.broadcast %kT_ptrs_98 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc651) + %kT_ptrs_102 = tt.broadcast %kT_ptrs_100 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc651) + %kT_ptrs_103 = tt.addptr %kT_ptrs_101, %kT_ptrs_102 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc651) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc652) + %vT_ptrs_104 = tt.addptr %vT_ptrs, %kT_ptrs_96 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc652) + %vT_ptrs_105 = tt.broadcast %vT_ptrs_104 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc653) + %vT_ptrs_106 = tt.addptr %vT_ptrs_105, %kT_ptrs_102 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc653) + %hi = arith.muli %sparse_kv_num_blocks_90, %c2_i32 : i32 loc(#loc654) + %hi_107 = arith.addi %ks0, %c63_i32 : i32 loc(#loc811) + %hi_108 = arith.divsi %hi_107, %c64_i32 : i32 loc(#loc812) + %hi_109 = arith.maxsi %hi_108, %c1_i32 : i32 loc(#loc656) + %hi_110 = arith.minsi %hi, %hi_109 : i32 loc(#loc657) + %kT = tt.splat %ks0 : i32 -> tensor<1x64xi32, #mma1> loc(#loc961) + %kT_111 = tt.splat %ks0 : i32 -> tensor<1x64xi32, #blocked1> loc(#loc961) + %m = arith.remsi %ptr_63, %cst_35 : tensor<128x1xi32, #mma1> loc(#loc962) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma1> -> tensor<128x64xi32, #mma1> loc(#loc815) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc816) + %vT_ptrs_112 = arith.cmpi sgt, %hi_110, %c0_i32 : i32 loc(#loc973) + %tmp7_113 = tt.load %tmp7, %vT_ptrs_112 : !tt.ptr loc(#loc818) + %tmp8 = tt.splat %tmp7_113 : i64 -> tensor<1x64xi64, #mma1> loc(#loc819) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma1> to tensor<128x1xi64, #mma1> loc(#loc820) + %tmp10 = tt.splat %tmp7_113 : i64 -> tensor<128x1xi64, #mma1> loc(#loc821) + %tmp10_114 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma1> loc(#loc821) + %tmp11 = tt.broadcast %tmp10_114 : tensor<128x1xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc822) + %p = tt.broadcast %lse_88 : tensor<128x1xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc823) + %ds = tt.expand_dims %Di_83 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma1}>> -> tensor<128x1xf32, #mma1> loc(#loc824) + %ds_115 = tt.broadcast %ds : tensor<128x1xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc825) + %kT_116 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc964) + %vT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc965) + %kT_117 = arith.cmpi slt, %kT_ptrs, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc961) + %kT_118 = tt.broadcast %kT_117 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc964) + %kT_119 = ttg.memdesc_index %kT_116[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %vT_ptrs_120 = tt.splat %vT_ptrs_112 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc973) + %vT_ptrs_121 = arith.andi %vT_ptrs_120, %kT_118 : tensor<128x64xi1, #blocked1> loc(#loc973) + %kT_122 = ttg.async_copy_global_to_local %kT_ptrs_103, %kT_119 mask %vT_ptrs_121 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %kT_123 = ttg.async_commit_group tokens %kT_122 loc(#loc964) + %vT_124 = ttg.memdesc_index %vT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_125 = ttg.async_copy_global_to_local %vT_ptrs_106, %vT_124 mask %vT_ptrs_121 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_126 = ttg.async_commit_group tokens %vT_125 loc(#loc965) + %vT_ptrs_127 = arith.cmpi sgt, %hi_110, %c1_i32 : i32 loc(#loc973) + %kT_ptrs_128 = tt.addptr %kT_ptrs_103, %cst_22 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc660) + %vT_ptrs_129 = tt.addptr %vT_ptrs_106, %cst_22 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc661) + %offs_n2_130 = arith.addi %offs_n2_95, %cst_23 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc662) + %kT_131 = tt.expand_dims %offs_n2_130 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc966) + %kT_132 = arith.cmpi slt, %kT_131, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc961) + %kT_133 = tt.broadcast %kT_132 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc964) + %kT_134 = ttg.memdesc_index %kT_116[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %vT_ptrs_135 = tt.splat %vT_ptrs_127 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc973) + %vT_ptrs_136 = arith.andi %vT_ptrs_135, %kT_133 : tensor<128x64xi1, #blocked1> loc(#loc973) + %kT_137 = ttg.async_copy_global_to_local %kT_ptrs_128, %kT_134 mask %vT_ptrs_136 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %kT_138 = ttg.async_commit_group tokens %kT_137 loc(#loc964) + %vT_139 = ttg.memdesc_index %vT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_140 = ttg.async_copy_global_to_local %vT_ptrs_129, %vT_139 mask %vT_ptrs_136 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_141 = ttg.async_commit_group tokens %vT_140 loc(#loc965) + ttng.fence_async_shared {bCluster = false} loc(#loc827) + %vT_ptrs_142:12 = scf.for %vT_ptrs_198 = %c0_i32 to %hi_110 step %c1_i32 iter_args(%arg23 = %cst_17, %kT_ptrs_199 = %kT_ptrs_128, %offs_n2_200 = %offs_n2_130, %vT_ptrs_201 = %vT_ptrs_129, %offs_n2_202 = %offs_n2_94, %arg28 = %c1_i32, %arg29 = %c-1_i32, %kT_203 = %kT_123, %kT_204 = %kT_138, %vT_205 = %vT_126, %vT_206 = %vT_141, %arg34 = %c64_i32) -> (tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_207 = arith.subi %hi_110, %c2_i32 : i32 loc(#loc973) + %vT_ptrs_208 = arith.cmpi slt, %vT_ptrs_198, %vT_ptrs_207 : i32 loc(#loc973) + %vT_ptrs_209 = arith.subi %hi_110, %c1_i32 : i32 loc(#loc973) + %vT_ptrs_210 = arith.cmpi slt, %vT_ptrs_198, %vT_ptrs_209 : i32 loc(#loc973) + %vT_ptrs_211 = arith.addi %arg29, %c1_i32 : i32 loc(#loc973) + %vT_ptrs_212 = arith.cmpi sge, %vT_ptrs_211, %c3_i32 : i32 loc(#loc973) + %vT_ptrs_213 = arith.select %vT_ptrs_212, %c0_i32, %vT_ptrs_211 : i32 loc(#loc973) + %kT_214 = tt.expand_dims %offs_n2_202 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xi32, #mma1> loc(#loc966) + %kT_215 = arith.cmpi slt, %kT_214, %kT : tensor<1x64xi32, #mma1> loc(#loc961) + %kT_216 = tt.broadcast %kT_215 : tensor<1x64xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc964) + %kT_217 = ttg.async_wait %kT_203, %vT_205 {num = 2 : i32} loc(#loc964) + %kT_218 = ttg.memdesc_index %kT_116[%vT_ptrs_213] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %dq_219 = ttg.memdesc_trans %kT_218 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc828) + %qk = ttng.warp_group_dot %q_74, %kT_218, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc827) + %qk_220:3 = ttng.warp_group_dot_wait %qk, %q_74, %kT_218 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc827) + %qk_221 = arith.mulf %qk_220#0, %cst_19 : tensor<128x64xf32, #mma1> loc(#loc829) + %n = arith.remsi %kT_214, %kT : tensor<1x64xi32, #mma1> loc(#loc967) + %post_mod_scores = arith.select %kT_216, %qk_221, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc831) + %tmp4_222 = tt.broadcast %n : tensor<1x64xi32, #mma1> -> tensor<128x64xi32, #mma1> loc(#loc815) + %tmp4_223 = arith.cmpi sge, %tmp4, %tmp4_222 : tensor<128x64xi32, #mma1> loc(#loc815) + %tmp5 = arith.extsi %n : tensor<1x64xi32, #mma1> to tensor<1x64xi64, #mma1> loc(#loc832) + %tmp8_224 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma1> loc(#loc819) + %tmp11_225 = tt.broadcast %tmp8_224 : tensor<1x64xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc822) + %tmp11_226 = arith.andi %tmp11_225, %tmp11 : tensor<128x64xi1, #mma1> loc(#loc822) + %tmp12 = arith.andi %tmp4_223, %tmp11_226 : tensor<128x64xi1, #mma1> loc(#loc833) + %tmp15 = arith.cmpi sge, %n, %cst_28 : tensor<1x64xi32, #mma1> loc(#loc834) + %tmp16 = arith.remsi %n, %cst_28 : tensor<1x64xi32, #mma1> loc(#loc835) + %tmp18 = arith.cmpi ne, %tmp16, %cst_34 : tensor<1x64xi32, #mma1> loc(#loc836) + %tmp19 = arith.cmpi slt, %tmp16, %cst_34 : tensor<1x64xi32, #mma1> loc(#loc837) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma1> loc(#loc838) + %tmp23 = arith.addi %tmp16, %cst_28 : tensor<1x64xi32, #mma1> loc(#loc839) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma1>, tensor<1x64xi32, #mma1> loc(#loc840) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma1> to tensor<1x64xi64, #mma1> loc(#loc841) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma1> loc(#loc842) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma1> loc(#loc843) + %tmp28 = arith.subi %tmp4_222, %tmp4 : tensor<128x64xi32, #mma1> loc(#loc844) + %tmp29 = arith.remsi %tmp28, %cst_29 : tensor<128x64xi32, #mma1> loc(#loc845) + %tmp30 = arith.cmpi ne, %tmp29, %cst_33 : tensor<128x64xi32, #mma1> loc(#loc846) + %tmp31 = arith.cmpi slt, %tmp29, %cst_33 : tensor<128x64xi32, #mma1> loc(#loc847) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma1> loc(#loc848) + %tmp34 = arith.addi %tmp29, %cst_29 : tensor<128x64xi32, #mma1> loc(#loc849) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma1>, tensor<128x64xi32, #mma1> loc(#loc850) + %tmp36 = arith.cmpi eq, %tmp35, %cst_33 : tensor<128x64xi32, #mma1> loc(#loc851) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc852) + %tmp37_227 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma1> loc(#loc852) + %tmp38 = arith.ori %tmp12, %tmp37_227 : tensor<128x64xi1, #mma1> loc(#loc853) + %post_mod_scores_228 = arith.select %tmp38, %post_mod_scores, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc854) + %post_mod_scores_229 = arith.mulf %post_mod_scores_228, %cst_21 : tensor<128x64xf32, #mma1> loc(#loc855) + %p_230 = arith.subf %post_mod_scores_229, %p : tensor<128x64xf32, #mma1> loc(#loc823) + %p_231 = math.exp2 %p_230 : tensor<128x64xf32, #mma1> loc(#loc856) + %vT_232 = ttg.memdesc_index %vT[%vT_ptrs_213] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %dp = ttng.warp_group_dot %do_80, %vT_232, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc857) + %dp_233:3 = ttng.warp_group_dot_wait %dp, %do_80, %vT_232 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc857) + %ds_234 = arith.subf %dp_233#0, %ds_115 : tensor<128x64xf32, #mma1> loc(#loc825) + %ds_235 = arith.mulf %p_231, %ds_234 : tensor<128x64xf32, #mma1> loc(#loc858) + %grad_scores = arith.select %kT_216, %ds_235, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc859) + %ds_236 = arith.select %tmp38, %grad_scores, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc860) + %ds_237 = arith.truncf %ds_236 : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc861) + %ds_238 = ttg.convert_layout %ds_237 : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc861) + %dq_239 = ttng.warp_group_dot %ds_238, %dq_219, %arg23 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc862) + %offs_n2_240 = tt.splat %arg34 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc662) + %offs_n2_241 = arith.addi %offs_n2_202, %offs_n2_240 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc662) + %vT_ptrs_242 = arith.addi %vT_ptrs_198, %c1_i32 : i32 loc(#loc973) + %cur_block_idx = arith.divsi %vT_ptrs_242, %c2_i32 : i32 loc(#loc863) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc864) + %cur_block_243 = tt.load %cur_block, %vT_ptrs_210 evictionPolicy = evict_last : !tt.ptr loc(#loc865) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc866) + %next_block_244 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_90 : i32 loc(#loc867) + %next_block_245 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc868) + %vT_ptrs_246 = arith.andi %vT_ptrs_210, %next_block_244 : i1 loc(#loc973) + %next_block_247 = tt.load %next_block_245, %vT_ptrs_246 evictionPolicy = evict_last : !tt.ptr loc(#loc869) + %needs_jump = arith.addi %vT_ptrs_198, %c2_i32 : i32 loc(#loc870) + %needs_jump_248 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc871) + %needs_jump_249 = arith.cmpi eq, %needs_jump_248, %c0_i32 : i32 loc(#loc872) + %jump_to_block = arith.subi %next_block_247, %cur_block_243 : i32 loc(#loc873) + %jump_to_block_250 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc874) + %jump_to_block_251 = arith.subi %jump_to_block_250, %c64_i32 : i32 loc(#loc875) + %offset = arith.extui %needs_jump_249 : i1 to i32 loc(#loc876) + %offset_252 = arith.muli %jump_to_block_251, %offset : i32 loc(#loc876) + %offset_253 = arith.subi %c1_i32, %offset : i32 loc(#loc877) + %offset_254 = arith.muli %offset_253, %c64_i32 : i32 loc(#loc878) + %offset_255 = arith.addi %offset_252, %offset_254 : i32 loc(#loc879) + %kT_ptrs_256 = arith.muli %offset_255, %c128_i32 : i32 loc(#loc664) + %kT_ptrs_257 = tt.splat %kT_ptrs_256 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc660) + %kT_ptrs_258 = tt.addptr %kT_ptrs_199, %kT_ptrs_257 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc660) + %vT_ptrs_259 = tt.addptr %vT_ptrs_201, %kT_ptrs_257 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc661) + %offs_n2_260 = tt.splat %offset_255 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc662) + %offs_n2_261 = arith.addi %offs_n2_200, %offs_n2_260 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc662) + %vT_ptrs_262 = arith.addi %arg28, %c1_i32 : i32 loc(#loc973) + %vT_ptrs_263 = arith.cmpi sge, %vT_ptrs_262, %c3_i32 : i32 loc(#loc973) + %vT_ptrs_264 = arith.select %vT_ptrs_263, %c0_i32, %vT_ptrs_262 : i32 loc(#loc973) + %kT_265 = tt.expand_dims %offs_n2_261 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc966) + %kT_266 = arith.cmpi slt, %kT_265, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc961) + %kT_267 = tt.broadcast %kT_266 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc964) + %kT_268 = ttg.memdesc_index %kT_116[%vT_ptrs_264] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %vT_ptrs_269 = tt.splat %vT_ptrs_208 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc973) + %vT_ptrs_270 = arith.andi %vT_ptrs_269, %kT_267 : tensor<128x64xi1, #blocked1> loc(#loc973) + %kT_271 = ttg.async_copy_global_to_local %kT_ptrs_258, %kT_268 mask %vT_ptrs_270 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc964) + %kT_272 = ttg.async_commit_group tokens %kT_271 loc(#loc964) + %vT_273 = ttg.memdesc_index %vT[%vT_ptrs_264] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_274 = ttg.async_copy_global_to_local %vT_ptrs_259, %vT_273 mask %vT_ptrs_270 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc965) + %vT_275 = ttg.async_commit_group tokens %vT_274 loc(#loc965) + scf.yield %dq_239, %kT_ptrs_258, %offs_n2_261, %vT_ptrs_259, %offs_n2_241, %vT_ptrs_264, %vT_ptrs_213, %kT_204, %kT_272, %vT_206, %vT_275, %offset_255 : tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc973) + } loc(#loc973) + %vT_ptrs_143 = ttng.warp_group_dot_wait %vT_ptrs_142#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma> loc(#loc973) + %vT_ptrs_144 = ttg.async_wait {num = 0 : i32} loc(#loc973) + ttg.local_dealloc %vT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc973) + ttg.local_dealloc %kT_116 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc973) + %kv_indices_145 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_51 : !tt.ptr, i32 loc(#loc493) + %kv_start_146 = tt.load %kv_indices_145 : !tt.ptr loc(#loc494) + %kv_start_147 = arith.muli %kv_start_146, %c128_i32 : i32 loc(#loc495) + %sparse_kv_num_blocks_148 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_49 : !tt.ptr, i32 loc(#loc496) + %sparse_kv_num_blocks_149 = tt.load %sparse_kv_num_blocks_148 : !tt.ptr loc(#loc497) + %offs_n2_150 = tt.splat %kv_start_147 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc498) + %offs_n2_151 = tt.splat %kv_start_147 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc498) + %offs_n2_152 = arith.addi %offs_n2_150, %offs_n2 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc498) + %offs_n2_153 = arith.addi %offs_n2_151, %offs_n2_91 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc498) + %kT_ptrs_154 = tt.expand_dims %offs_n2_153 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc665) + %kT_ptrs_155 = arith.muli %kT_ptrs_154, %cst_4 : tensor<1x64xi32, #blocked1> loc(#loc666) + %kT_ptrs_156 = tt.addptr %kT_ptrs_97, %kT_ptrs_155 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc667) + %kT_ptrs_157 = tt.broadcast %kT_ptrs_156 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc668) + %kT_ptrs_158 = tt.addptr %kT_ptrs_157, %kT_ptrs_102 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc668) + %vT_ptrs_159 = tt.addptr %vT_ptrs, %kT_ptrs_155 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc669) + %vT_ptrs_160 = tt.broadcast %vT_ptrs_159 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc670) + %vT_ptrs_161 = tt.addptr %vT_ptrs_160, %kT_ptrs_102 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc670) + %hi_162 = arith.muli %sparse_kv_num_blocks_149, %c2_i32 : i32 loc(#loc671) + %hi_163 = arith.minsi %hi_162, %hi_109 : i32 loc(#loc672) + %kT_164 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc968) + %vT_165 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc969) + %vT_ptrs_166 = arith.cmpi sgt, %hi_163, %c0_i32 : i32 loc(#loc974) + %kT_167 = arith.cmpi slt, %kT_ptrs_154, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc970) + %kT_168 = tt.broadcast %kT_167 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc968) + %kT_169 = ttg.memdesc_index %kT_164[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %vT_ptrs_170 = tt.splat %vT_ptrs_166 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc974) + %vT_ptrs_171 = arith.andi %vT_ptrs_170, %kT_168 : tensor<128x64xi1, #blocked1> loc(#loc974) + %kT_172 = ttg.async_copy_global_to_local %kT_ptrs_158, %kT_169 mask %vT_ptrs_171 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %kT_173 = ttg.async_commit_group tokens %kT_172 loc(#loc968) + %vT_174 = ttg.memdesc_index %vT_165[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_175 = ttg.async_copy_global_to_local %vT_ptrs_161, %vT_174 mask %vT_ptrs_171 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_176 = ttg.async_commit_group tokens %vT_175 loc(#loc969) + %vT_ptrs_177 = arith.cmpi sgt, %hi_163, %c1_i32 : i32 loc(#loc974) + %kT_ptrs_178 = tt.addptr %kT_ptrs_158, %cst_22 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc674) + %vT_ptrs_179 = tt.addptr %vT_ptrs_161, %cst_22 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc675) + %offs_n2_180 = arith.addi %offs_n2_153, %cst_23 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc676) + %kT_181 = tt.expand_dims %offs_n2_180 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc971) + %kT_182 = arith.cmpi slt, %kT_181, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc970) + %kT_183 = tt.broadcast %kT_182 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc968) + %kT_184 = ttg.memdesc_index %kT_164[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %vT_ptrs_185 = tt.splat %vT_ptrs_177 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc974) + %vT_ptrs_186 = arith.andi %vT_ptrs_185, %kT_183 : tensor<128x64xi1, #blocked1> loc(#loc974) + %kT_187 = ttg.async_copy_global_to_local %kT_ptrs_178, %kT_184 mask %vT_ptrs_186 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %kT_188 = ttg.async_commit_group tokens %kT_187 loc(#loc968) + %vT_189 = ttg.memdesc_index %vT_165[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_190 = ttg.async_copy_global_to_local %vT_ptrs_179, %vT_189 mask %vT_ptrs_186 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_191 = ttg.async_commit_group tokens %vT_190 loc(#loc969) + ttng.fence_async_shared {bCluster = false} loc(#loc882) + %vT_ptrs_192:12 = scf.for %vT_ptrs_198 = %c0_i32 to %hi_163 step %c1_i32 iter_args(%vT_ptrs_199 = %vT_ptrs_143, %kT_ptrs_200 = %kT_ptrs_178, %offs_n2_201 = %offs_n2_180, %vT_ptrs_202 = %vT_ptrs_179, %offs_n2_203 = %offs_n2_152, %arg28 = %c1_i32, %arg29 = %c-1_i32, %kT_204 = %kT_173, %kT_205 = %kT_188, %vT_206 = %vT_176, %vT_207 = %vT_191, %arg34 = %c64_i32) -> (tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %vT_ptrs_208 = arith.subi %hi_163, %c2_i32 : i32 loc(#loc974) + %vT_ptrs_209 = arith.cmpi slt, %vT_ptrs_198, %vT_ptrs_208 : i32 loc(#loc974) + %vT_ptrs_210 = arith.subi %hi_163, %c1_i32 : i32 loc(#loc974) + %vT_ptrs_211 = arith.cmpi slt, %vT_ptrs_198, %vT_ptrs_210 : i32 loc(#loc974) + %vT_ptrs_212 = arith.addi %arg29, %c1_i32 : i32 loc(#loc974) + %vT_ptrs_213 = arith.cmpi sge, %vT_ptrs_212, %c3_i32 : i32 loc(#loc974) + %vT_ptrs_214 = arith.select %vT_ptrs_213, %c0_i32, %vT_ptrs_212 : i32 loc(#loc974) + %kT_215 = tt.expand_dims %offs_n2_203 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xi32, #mma1> loc(#loc971) + %kT_216 = arith.cmpi slt, %kT_215, %kT : tensor<1x64xi32, #mma1> loc(#loc970) + %kT_217 = tt.broadcast %kT_216 : tensor<1x64xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc968) + %kT_218 = ttg.async_wait %kT_204, %vT_206 {num = 2 : i32} loc(#loc968) + %kT_219 = ttg.memdesc_index %kT_164[%vT_ptrs_214] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %dq_220 = ttg.memdesc_trans %kT_219 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc883) + %qk = ttng.warp_group_dot %q_74, %kT_219, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc882) + %qk_221:3 = ttng.warp_group_dot_wait %qk, %q_74, %kT_219 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc882) + %qk_222 = arith.mulf %qk_221#0, %cst_19 : tensor<128x64xf32, #mma1> loc(#loc884) + %post_mod_scores = arith.select %kT_217, %qk_222, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc885) + %post_mod_scores_223 = arith.mulf %post_mod_scores, %cst_21 : tensor<128x64xf32, #mma1> loc(#loc886) + %p_224 = arith.subf %post_mod_scores_223, %p : tensor<128x64xf32, #mma1> loc(#loc887) + %p_225 = math.exp2 %p_224 : tensor<128x64xf32, #mma1> loc(#loc888) + %vT_226 = ttg.memdesc_index %vT_165[%vT_ptrs_214] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %dp = ttng.warp_group_dot %do_80, %vT_226, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc889) + %dp_227:3 = ttng.warp_group_dot_wait %dp, %do_80, %vT_226 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc889) + %ds_228 = arith.subf %dp_227#0, %ds_115 : tensor<128x64xf32, #mma1> loc(#loc890) + %ds_229 = arith.mulf %p_225, %ds_228 : tensor<128x64xf32, #mma1> loc(#loc891) + %grad_scores = arith.select %kT_217, %ds_229, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc892) + %ds_230 = arith.truncf %grad_scores : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc893) + %ds_231 = ttg.convert_layout %ds_230 : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc893) + %dq_232 = ttng.warp_group_dot %ds_231, %dq_220, %vT_ptrs_199 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc894) + %offs_n2_233 = tt.splat %arg34 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc676) + %offs_n2_234 = arith.addi %offs_n2_203, %offs_n2_233 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc676) + %vT_ptrs_235 = arith.addi %vT_ptrs_198, %c1_i32 : i32 loc(#loc974) + %cur_block_idx = arith.divsi %vT_ptrs_235, %c2_i32 : i32 loc(#loc895) + %cur_block = tt.addptr %kv_indices_145, %cur_block_idx : !tt.ptr, i32 loc(#loc896) + %cur_block_236 = tt.load %cur_block, %vT_ptrs_211 evictionPolicy = evict_last : !tt.ptr loc(#loc897) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc898) + %next_block_237 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_149 : i32 loc(#loc899) + %next_block_238 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc900) + %vT_ptrs_239 = arith.andi %vT_ptrs_211, %next_block_237 : i1 loc(#loc974) + %next_block_240 = tt.load %next_block_238, %vT_ptrs_239 evictionPolicy = evict_last : !tt.ptr loc(#loc901) + %needs_jump = arith.addi %vT_ptrs_198, %c2_i32 : i32 loc(#loc902) + %needs_jump_241 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc903) + %needs_jump_242 = arith.cmpi eq, %needs_jump_241, %c0_i32 : i32 loc(#loc904) + %jump_to_block = arith.subi %next_block_240, %cur_block_236 : i32 loc(#loc905) + %jump_to_block_243 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc906) + %jump_to_block_244 = arith.subi %jump_to_block_243, %c64_i32 : i32 loc(#loc907) + %offset = arith.extui %needs_jump_242 : i1 to i32 loc(#loc908) + %offset_245 = arith.muli %jump_to_block_244, %offset : i32 loc(#loc908) + %offset_246 = arith.subi %c1_i32, %offset : i32 loc(#loc909) + %offset_247 = arith.muli %offset_246, %c64_i32 : i32 loc(#loc910) + %offset_248 = arith.addi %offset_245, %offset_247 : i32 loc(#loc911) + %kT_ptrs_249 = arith.muli %offset_248, %c128_i32 : i32 loc(#loc678) + %kT_ptrs_250 = tt.splat %kT_ptrs_249 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc674) + %kT_ptrs_251 = tt.addptr %kT_ptrs_200, %kT_ptrs_250 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc674) + %vT_ptrs_252 = tt.addptr %vT_ptrs_202, %kT_ptrs_250 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc675) + %offs_n2_253 = tt.splat %offset_248 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc676) + %offs_n2_254 = arith.addi %offs_n2_201, %offs_n2_253 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc676) + %vT_ptrs_255 = arith.addi %arg28, %c1_i32 : i32 loc(#loc974) + %vT_ptrs_256 = arith.cmpi sge, %vT_ptrs_255, %c3_i32 : i32 loc(#loc974) + %vT_ptrs_257 = arith.select %vT_ptrs_256, %c0_i32, %vT_ptrs_255 : i32 loc(#loc974) + %kT_258 = tt.expand_dims %offs_n2_254 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc971) + %kT_259 = arith.cmpi slt, %kT_258, %kT_111 : tensor<1x64xi32, #blocked1> loc(#loc970) + %kT_260 = tt.broadcast %kT_259 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc968) + %kT_261 = ttg.memdesc_index %kT_164[%vT_ptrs_257] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %vT_ptrs_262 = tt.splat %vT_ptrs_209 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc974) + %vT_ptrs_263 = arith.andi %vT_ptrs_262, %kT_260 : tensor<128x64xi1, #blocked1> loc(#loc974) + %kT_264 = ttg.async_copy_global_to_local %kT_ptrs_251, %kT_261 mask %vT_ptrs_263 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc968) + %kT_265 = ttg.async_commit_group tokens %kT_264 loc(#loc968) + %vT_266 = ttg.memdesc_index %vT_165[%vT_ptrs_257] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_267 = ttg.async_copy_global_to_local %vT_ptrs_252, %vT_266 mask %vT_ptrs_263 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc969) + %vT_268 = ttg.async_commit_group tokens %vT_267 loc(#loc969) + scf.yield %dq_232, %kT_ptrs_251, %offs_n2_254, %vT_ptrs_252, %offs_n2_234, %vT_ptrs_257, %vT_ptrs_214, %kT_205, %kT_265, %vT_207, %vT_268, %offset_248 : tensor<128x128xf32, #mma>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc974) + } loc(#loc974) + %vT_ptrs_193 = ttng.warp_group_dot_wait %vT_ptrs_192#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma> loc(#loc974) + %vT_ptrs_194 = ttg.async_wait {num = 0 : i32} loc(#loc974) + ttg.local_dealloc %vT_165 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc974) + ttg.local_dealloc %kT_164 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc974) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc500) + %dq_ptrs_195 = tt.addptr %dq_ptrs, %ptr_64 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc500) + %dq_ptrs_196 = tt.broadcast %dq_ptrs_195 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc501) + %dq_ptrs_197 = tt.addptr %dq_ptrs_196, %ptr_70 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc501) + %dq = arith.mulf %vT_ptrs_193, %cst_16 : tensor<128x128xf32, #mma> loc(#loc502) + %3 = arith.cmpi slt, %ptr_68, %cst_9 : tensor<1x128xi32, #blocked2> loc(#loc176) + %4 = tt.broadcast %3 : tensor<1x128xi1, #blocked2> -> tensor<128x128xi1, #blocked2> loc(#loc177) + %5 = arith.andi %q_72, %4 : tensor<128x128xi1, #blocked2> loc(#loc177) + %6 = arith.truncf %dq : tensor<128x128xf32, #mma> to tensor<128x128xbf16, #mma> loc(#loc178) + %7 = ttg.convert_layout %6 : tensor<128x128xbf16, #mma> -> tensor<128x128xbf16, #blocked2> loc(#loc178) + tt.store %dq_ptrs_197, %7, %5 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc178) + } else { + %stride_q_idx_h = arith.muli %ks3, %c16_i32 : i32 loc(#loc503) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc504) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc505) + %offs_n1_47 = tt.splat %start_n1 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc505) + %offs_n1_48 = arith.addi %offs_n1, %offs_k : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc505) + %offs_n1_49 = arith.addi %offs_n1_47, %offs_k_46 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc505) + %ptr = tt.expand_dims %offs_n1_48 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<128x1xi32, #blocked2> loc(#loc679) + %ptr_50 = tt.expand_dims %offs_n1_49 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc679) + %ptr_51 = arith.muli %ptr, %cst_8 : tensor<128x1xi32, #blocked2> loc(#loc680) + %ptr_52 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc681) + %ptr_53 = tt.addptr %ptr_52, %ptr_51 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc681) + %ptr_54 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> loc(#loc682) + %ptr_55 = tt.expand_dims %ptr_54 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x128xi32, #blocked2> loc(#loc682) + %ptr_56 = tt.broadcast %ptr_53 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc683) + %ptr_57 = tt.broadcast %ptr_55 : tensor<1x128xi32, #blocked2> -> tensor<128x128xi32, #blocked2> loc(#loc683) + %ptr_58 = tt.addptr %ptr_56, %ptr_57 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc683) + %k = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked2> loc(#loc684) + %k_59 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc684) + %k_60 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32, #blocked2> loc(#loc684) + %k_61 = tt.broadcast %k_60 : tensor<128x1xi1, #blocked2> -> tensor<128x128xi1, #blocked2> loc(#loc685) + %k_62 = tt.load %ptr_58, %k_61, %cst_13 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc685) + %k_63 = ttg.local_alloc %k_62 : (tensor<128x128xbf16, #blocked2>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc685) + %ptr_64 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc686) + %ptr_65 = tt.addptr %ptr_64, %ptr_51 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc686) + %ptr_66 = tt.broadcast %ptr_65 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc687) + %ptr_67 = tt.addptr %ptr_66, %ptr_57 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc687) + %v = tt.load %ptr_67, %k_61, %cst_13 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc688) + %v_68 = ttg.local_alloc %v : (tensor<128x128xbf16, #blocked2>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc688) + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc508) + %q_adj1 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc509) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc510) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc511) + %sparse_q_num_blks_offset_69 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc512) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc513) + %sparse_q_idx_offset_70 = arith.muli %pid, %c16_i32 : i32 loc(#loc514) + %sparse_q_idx_offset_71 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_70 : i32 loc(#loc515) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_71 : !tt.ptr, i32 loc(#loc516) + %q_start = tt.load %q_indices, %true : !tt.ptr loc(#loc517) + %q_start_72 = arith.muli %q_start, %c128_i32 : i32 loc(#loc518) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_69 : !tt.ptr, i32 loc(#loc519) + %sparse_q_num_blocks_73 = tt.load %sparse_q_num_blocks, %true : !tt.ptr loc(#loc520) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc521) + %offs_m1_74 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc521) + %offs_m1_75 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc521) + %offs_m1_76 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc521) + %offs_m1_77 = tt.splat %q_start_72 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc522) + %offs_m1_78 = tt.splat %q_start_72 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc522) + %offs_m1_79 = tt.splat %q_start_72 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc522) + %offs_m1_80 = tt.splat %q_start_72 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc522) + %offs_m1_81 = arith.addi %offs_m1_77, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc522) + %offs_m1_82 = arith.addi %offs_m1_78, %offs_m1_74 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc522) + %offs_m1_83 = arith.addi %offs_m1_79, %offs_m1_75 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc522) + %offs_m1_84 = arith.addi %offs_m1_80, %offs_m1_76 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc522) + %qT_ptrs = tt.expand_dims %offs_m1_83 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc689) + %qT_ptrs_85 = arith.muli %qT_ptrs, %cst_5 : tensor<1x64xi32, #blocked1> loc(#loc690) + %qT_ptrs_86 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc691) + %qT_ptrs_87 = tt.expand_dims %qT_ptrs_86 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc691) + %qT_ptrs_88 = tt.broadcast %qT_ptrs_87 : tensor<128x1xi32, #blocked1> -> tensor<128x64xi32, #blocked1> loc(#loc692) + %do_ptrs = tt.expand_dims %offs_m1_84 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc693) + %do_ptrs_89 = arith.muli %do_ptrs, %cst_6 : tensor<64x1xi32, #blocked2> loc(#loc694) + %do_ptrs_90 = tt.broadcast %ptr_55 : tensor<1x128xi32, #blocked2> -> tensor<64x128xi32, #blocked2> loc(#loc695) + %hi = arith.muli %sparse_q_num_blocks_73, %c2_i32 : i32 loc(#loc696) + %hi_91 = arith.minsi %hi, %c32_i32 : i32 loc(#loc697) + %n = arith.remsi %ptr_50, %k_59 : tensor<128x1xi32, #blocked> loc(#loc912) + %tmp44 = tt.broadcast %n : tensor<128x1xi32, #blocked> -> tensor<128x64xi32, #blocked> loc(#loc699) + %tmp45 = arith.extsi %n : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc700) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc701) + %do_ptrs_92 = arith.cmpi sgt, %hi_91, %c0_i32 : i32 loc(#loc976) + %tmp47_93 = tt.load %tmp47, %do_ptrs_92 : !tt.ptr loc(#loc703) + %tmp48 = tt.splat %tmp47_93 : i64 -> tensor<128x1xi64, #blocked> loc(#loc704) + %tmp48_94 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64, #blocked> loc(#loc704) + %tmp50 = tt.splat %tmp47_93 : i64 -> tensor<1x64xi64, #blocked> loc(#loc705) + %tmp51 = tt.broadcast %tmp48_94 : tensor<128x1xi1, #blocked> -> tensor<128x64xi1, #blocked> loc(#loc706) + %tmp55 = arith.cmpi sge, %n, %cst_10 : tensor<128x1xi32, #blocked> loc(#loc707) + %tmp56 = arith.remsi %n, %cst_10 : tensor<128x1xi32, #blocked> loc(#loc708) + %tmp58 = arith.cmpi ne, %tmp56, %cst_3 : tensor<128x1xi32, #blocked> loc(#loc709) + %tmp59 = arith.cmpi slt, %tmp56, %cst_3 : tensor<128x1xi32, #blocked> loc(#loc710) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1, #blocked> loc(#loc711) + %tmp63 = arith.addi %tmp56, %cst_10 : tensor<128x1xi32, #blocked> loc(#loc712) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1, #blocked>, tensor<128x1xi32, #blocked> loc(#loc713) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32, #blocked> to tensor<128x1xi64, #blocked> loc(#loc714) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64, #blocked> loc(#loc715) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1, #blocked> loc(#loc716) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1, #blocked> -> tensor<128x64xi1, #blocked> loc(#loc717) + %q_indices_95 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_71 : !tt.ptr, i32 loc(#loc553) + %q_start_96 = tt.load %q_indices_95, %true : !tt.ptr loc(#loc554) + %q_start_97 = arith.muli %q_start_96, %c128_i32 : i32 loc(#loc555) + %sparse_q_num_blocks_98 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_69 : !tt.ptr, i32 loc(#loc556) + %sparse_q_num_blocks_99 = tt.load %sparse_q_num_blocks_98, %true : !tt.ptr loc(#loc557) + %offs_m1_100 = tt.splat %q_start_97 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc558) + %offs_m1_101 = tt.splat %q_start_97 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc558) + %offs_m1_102 = tt.splat %q_start_97 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc558) + %offs_m1_103 = tt.splat %q_start_97 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc558) + %offs_m1_104 = arith.addi %offs_m1_100, %offs_m1 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc558) + %offs_m1_105 = arith.addi %offs_m1_101, %offs_m1_74 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc558) + %offs_m1_106 = arith.addi %offs_m1_102, %offs_m1_75 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc558) + %offs_m1_107 = arith.addi %offs_m1_103, %offs_m1_76 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc558) + %qT_ptrs_108 = tt.expand_dims %offs_m1_106 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc718) + %qT_ptrs_109 = arith.muli %qT_ptrs_108, %cst_5 : tensor<1x64xi32, #blocked1> loc(#loc719) + %do_ptrs_110 = tt.expand_dims %offs_m1_107 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc720) + %do_ptrs_111 = arith.muli %do_ptrs_110, %cst_6 : tensor<64x1xi32, #blocked2> loc(#loc721) + %hi_112 = arith.muli %sparse_q_num_blocks_99, %c2_i32 : i32 loc(#loc722) + %hi_113 = arith.minsi %hi_112, %c32_i32 : i32 loc(#loc723) + ttng.fence_async_shared {bCluster = false} loc(#loc724) + %dk:2 = scf.for %dk_127 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg23 = %cst_17, %arg24 = %cst_17) -> (tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>) : i32 { + %off_hq1_128 = arith.addi %off_hq1, %dk_127 : i32 loc(#loc561) + %q_adj1_129 = arith.muli %off_hq1_128, %c128_i32 : i32 loc(#loc562) + %q_adj1_130 = arith.addi %q_adj1_129, %q_adj1 : i32 loc(#loc563) + %q_adj1_131 = arith.extsi %q_adj1_130 : i32 to i64 loc(#loc564) + %do_adj1 = arith.muli %off_hq1_128, %c262144_i32 : i32 loc(#loc565) + %do_adj1_132 = arith.addi %do_adj1, %q_adj1 : i32 loc(#loc566) + %do_adj1_133 = arith.extsi %do_adj1_132 : i32 to i64 loc(#loc567) + %off_chz1_134 = arith.addi %off_chz1, %off_hq1_128 : i32 loc(#loc568) + %off_chz1_135 = arith.muli %off_chz1_134, %c2048_i32 : i32 loc(#loc569) + %off_chz1_136 = arith.extsi %off_chz1_135 : i32 to i64 loc(#loc570) + %Q1 = tt.addptr %arg_Q, %q_adj1_131 : !tt.ptr, i64 loc(#loc571) + %DO1 = tt.addptr %arg_DO, %do_adj1_133 : !tt.ptr, i64 loc(#loc572) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_136 : !tt.ptr, i64 loc(#loc573) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_136 : !tt.ptr, i64 loc(#loc574) + %qT_ptrs_137 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr, #blocked1> loc(#loc726) + %qT_ptrs_138 = tt.addptr %qT_ptrs_137, %qT_ptrs_85 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc726) + %qT_ptrs_139 = tt.broadcast %qT_ptrs_138 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc692) + %qT_ptrs_140 = tt.addptr %qT_ptrs_139, %qT_ptrs_88 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc692) + %do_ptrs_141 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked2> loc(#loc727) + %do_ptrs_142 = tt.addptr %do_ptrs_141, %do_ptrs_89 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> loc(#loc727) + %do_ptrs_143 = tt.broadcast %do_ptrs_142 : tensor<64x1x!tt.ptr, #blocked2> -> tensor<64x128x!tt.ptr, #blocked2> loc(#loc695) + %do_ptrs_144 = tt.addptr %do_ptrs_143, %do_ptrs_90 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc695) + %lse = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc728) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc729) + %qT = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc914) + %lse_145 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc731) + %do = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc915) + %Di_146 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc733) + %qT_147 = arith.cmpi slt, %qT_ptrs, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc916) + %qT_148 = tt.broadcast %qT_147 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc914) + %qT_149 = ttg.memdesc_index %qT[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %do_ptrs_150 = tt.splat %do_ptrs_92 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc976) + %do_ptrs_151 = arith.andi %do_ptrs_150, %qT_148 : tensor<128x64xi1, #blocked1> loc(#loc976) + %qT_152 = ttg.async_copy_global_to_local %qT_ptrs_140, %qT_149 mask %do_ptrs_151 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %qT_153 = ttg.async_commit_group tokens %qT_152 loc(#loc914) + %lse_154 = arith.cmpi slt, %offs_m1_81, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc734) + %lse_155 = tt.addptr %lse, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc728) + %lse_156 = ttg.memdesc_index %lse_145[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %do_ptrs_157 = tt.splat %do_ptrs_92 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %do_ptrs_158 = arith.andi %do_ptrs_157, %lse_154 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %lse_159 = ttg.async_copy_global_to_local %lse_155, %lse_156 mask %do_ptrs_158 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %lse_160 = ttg.async_commit_group tokens %lse_159 loc(#loc731) + %do_161 = arith.cmpi slt, %do_ptrs, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc917) + %do_162 = tt.broadcast %do_161 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc915) + %do_163 = ttg.memdesc_index %do[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_ptrs_164 = tt.splat %do_ptrs_92 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc976) + %do_ptrs_165 = arith.andi %do_ptrs_164, %do_162 : tensor<64x128xi1, #blocked2> loc(#loc976) + %do_166 = ttg.async_copy_global_to_local %do_ptrs_144, %do_163 mask %do_ptrs_165 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_167 = ttg.async_commit_group tokens %do_166 loc(#loc915) + %Di_168 = tt.addptr %Di, %offs_m1_81 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc729) + %Di_169 = ttg.memdesc_index %Di_146[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_170 = ttg.async_copy_global_to_local %Di_168, %Di_169 mask %do_ptrs_158 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_171 = ttg.async_commit_group tokens %Di_170 loc(#loc733) + %do_ptrs_172 = arith.cmpi sgt, %hi_91, %c1_i32 : i32 loc(#loc976) + %qT_ptrs_173 = tt.addptr %qT_ptrs_140, %cst_24 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc735) + %do_ptrs_174 = tt.addptr %do_ptrs_144, %cst_25 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc736) + %offs_m1_175 = arith.addi %offs_m1_81, %cst_26 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc737) + %offs_m1_176 = arith.addi %offs_m1_83, %cst_23 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc737) + %offs_m1_177 = arith.addi %offs_m1_84, %cst_27 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc737) + %qT_178 = tt.expand_dims %offs_m1_176 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc918) + %qT_179 = arith.cmpi slt, %qT_178, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc916) + %qT_180 = tt.broadcast %qT_179 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc914) + %qT_181 = ttg.memdesc_index %qT[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %do_ptrs_182 = tt.splat %do_ptrs_172 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc976) + %do_ptrs_183 = arith.andi %do_ptrs_182, %qT_180 : tensor<128x64xi1, #blocked1> loc(#loc976) + %qT_184 = ttg.async_copy_global_to_local %qT_ptrs_173, %qT_181 mask %do_ptrs_183 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %qT_185 = ttg.async_commit_group tokens %qT_184 loc(#loc914) + %lse_186 = arith.cmpi slt, %offs_m1_175, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc734) + %lse_187 = tt.addptr %lse, %offs_m1_175 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc728) + %lse_188 = ttg.memdesc_index %lse_145[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %do_ptrs_189 = tt.splat %do_ptrs_172 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %do_ptrs_190 = arith.andi %do_ptrs_189, %lse_186 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %lse_191 = ttg.async_copy_global_to_local %lse_187, %lse_188 mask %do_ptrs_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %lse_192 = ttg.async_commit_group tokens %lse_191 loc(#loc731) + %do_193 = tt.expand_dims %offs_m1_177 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc919) + %do_194 = arith.cmpi slt, %do_193, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc917) + %do_195 = tt.broadcast %do_194 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc915) + %do_196 = ttg.memdesc_index %do[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_ptrs_197 = tt.splat %do_ptrs_172 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc976) + %do_ptrs_198 = arith.andi %do_ptrs_197, %do_195 : tensor<64x128xi1, #blocked2> loc(#loc976) + %do_199 = ttg.async_copy_global_to_local %do_ptrs_174, %do_196 mask %do_ptrs_198 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_200 = ttg.async_commit_group tokens %do_199 loc(#loc915) + %Di_201 = tt.addptr %Di, %offs_m1_175 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc729) + %Di_202 = ttg.memdesc_index %Di_146[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_203 = ttg.async_copy_global_to_local %Di_201, %Di_202 mask %do_ptrs_190 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_204 = ttg.async_commit_group tokens %Di_203 loc(#loc733) + %do_ptrs_205:22 = scf.for %do_ptrs_280 = %c0_i32 to %hi_91 step %c1_i32 iter_args(%arg26 = %arg24, %arg27 = %arg23, %offs_m1_281 = %offs_m1_82, %qT_ptrs_282 = %qT_ptrs_173, %offs_m1_283 = %offs_m1_176, %do_ptrs_284 = %do_ptrs_174, %offs_m1_285 = %offs_m1_177, %offs_m1_286 = %offs_m1_175, %arg34 = %c1_i32, %arg35 = %c-1_i32, %arg36 = %c1_i32, %arg37 = %c-1_i32, %qT_287 = %qT_153, %qT_288 = %qT_185, %lse_289 = %lse_160, %lse_290 = %lse_192, %do_291 = %do_167, %do_292 = %do_200, %Di_293 = %Di_171, %Di_294 = %Di_204, %arg46 = %c64_i32, %offs_m1_295 = %offs_m1_81) -> (tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>) : i32 { + %do_ptrs_296 = arith.subi %hi_91, %c2_i32 : i32 loc(#loc976) + %do_ptrs_297 = arith.cmpi slt, %do_ptrs_280, %do_ptrs_296 : i32 loc(#loc976) + %do_ptrs_298 = arith.subi %hi_91, %c1_i32 : i32 loc(#loc976) + %do_ptrs_299 = arith.cmpi slt, %do_ptrs_280, %do_ptrs_298 : i32 loc(#loc976) + %do_ptrs_300 = arith.addi %arg37, %c1_i32 : i32 loc(#loc976) + %do_ptrs_301 = arith.cmpi sge, %do_ptrs_300, %c2_i32 : i32 loc(#loc976) + %do_ptrs_302 = arith.select %do_ptrs_301, %c0_i32, %do_ptrs_300 : i32 loc(#loc976) + %do_ptrs_303 = arith.addi %arg35, %c1_i32 : i32 loc(#loc976) + %do_ptrs_304 = arith.cmpi sge, %do_ptrs_303, %c3_i32 : i32 loc(#loc976) + %do_ptrs_305 = arith.select %do_ptrs_304, %c0_i32, %do_ptrs_303 : i32 loc(#loc976) + %qT_306 = tt.expand_dims %offs_m1_295 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xi32, #mma1> loc(#loc918) + %qT_307 = tt.expand_dims %offs_m1_281 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc918) + %qT_308 = arith.cmpi slt, %qT_306, %cst_28 : tensor<1x64xi32, #mma1> loc(#loc916) + %qT_309 = tt.broadcast %qT_308 : tensor<1x64xi1, #mma1> -> tensor<128x64xi1, #mma1> loc(#loc914) + %qT_310 = ttg.async_wait %qT_287, %lse_289, %do_291, %Di_293 {num = 4 : i32} loc(#loc914) + %qT_311 = ttg.memdesc_index %qT[%do_ptrs_305] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %dk_312 = ttg.memdesc_trans %qT_311 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc738) + %lse_313 = ttg.memdesc_index %lse_145[%do_ptrs_302] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %lse_314 = ttg.local_load %lse_313 token %qT_310 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc731) + %lse_315 = arith.cmpf oeq, %lse_314, %cst_31 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc739) + %lse_316 = arith.select %lse_315, %cst_32, %lse_314 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc740) + %qkT = ttng.warp_group_dot %k_63, %qT_311, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc724) + %qkT_317:3 = ttng.warp_group_dot_wait %qkT, %k_63, %qT_311 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc724) + %qkT_318 = arith.mulf %qkT_317#0, %cst_19 : tensor<128x64xf32, #mma1> loc(#loc741) + %m = arith.remsi %qT_307, %cst : tensor<1x64xi32, #blocked> loc(#loc920) + %post_mod_scores = arith.select %qT_309, %qkT_318, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc743) + %tmp44_319 = tt.broadcast %m : tensor<1x64xi32, #blocked> -> tensor<128x64xi32, #blocked> loc(#loc699) + %tmp44_320 = arith.cmpi sge, %tmp44_319, %tmp44 : tensor<128x64xi32, #blocked> loc(#loc699) + %tmp49 = arith.extsi %m : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> loc(#loc744) + %tmp50_321 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64, #blocked> loc(#loc705) + %tmp51_322 = tt.broadcast %tmp50_321 : tensor<1x64xi1, #blocked> -> tensor<128x64xi1, #blocked> loc(#loc706) + %tmp51_323 = arith.andi %tmp51, %tmp51_322 : tensor<128x64xi1, #blocked> loc(#loc706) + %tmp52 = arith.andi %tmp44_320, %tmp51_323 : tensor<128x64xi1, #blocked> loc(#loc745) + %tmp68 = arith.subi %tmp44, %tmp44_319 : tensor<128x64xi32, #blocked> loc(#loc746) + %tmp69 = arith.remsi %tmp68, %cst_1 : tensor<128x64xi32, #blocked> loc(#loc747) + %tmp70 = arith.cmpi ne, %tmp69, %cst_7 : tensor<128x64xi32, #blocked> loc(#loc748) + %tmp71 = arith.cmpi slt, %tmp69, %cst_7 : tensor<128x64xi32, #blocked> loc(#loc749) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1, #blocked> loc(#loc750) + %tmp74 = arith.addi %tmp69, %cst_1 : tensor<128x64xi32, #blocked> loc(#loc751) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1, #blocked>, tensor<128x64xi32, #blocked> loc(#loc752) + %tmp76 = arith.cmpi eq, %tmp75, %cst_7 : tensor<128x64xi32, #blocked> loc(#loc753) + %tmp77_324 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1, #blocked> loc(#loc717) + %tmp78 = arith.ori %tmp52, %tmp77_324 : tensor<128x64xi1, #blocked> loc(#loc754) + %tmp78_325 = ttg.convert_layout %tmp78 : tensor<128x64xi1, #blocked> -> tensor<128x64xi1, #mma1> loc(#loc754) + %post_mod_scores_326 = arith.select %tmp78_325, %post_mod_scores, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc755) + %post_mod_scores_327 = arith.mulf %post_mod_scores_326, %cst_21 : tensor<128x64xf32, #mma1> loc(#loc756) + %pT = tt.expand_dims %lse_316 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xf32, #mma1> loc(#loc757) + %pT_328 = tt.broadcast %pT : tensor<1x64xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc758) + %pT_329 = arith.subf %post_mod_scores_327, %pT_328 : tensor<128x64xf32, #mma1> loc(#loc758) + %pT_330 = math.exp2 %pT_329 : tensor<128x64xf32, #mma1> loc(#loc759) + %do_331 = ttg.memdesc_index %do[%do_ptrs_305] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %dpT = ttg.memdesc_trans %do_331 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc760) + %dv = arith.truncf %pT_330 : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc761) + %dv_332 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc761) + %dv_333 = ttng.warp_group_dot %dv_332, %do_331, %arg27 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc762) + %Di_334 = ttg.memdesc_index %Di_146[%do_ptrs_302] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_335 = ttg.local_load %Di_334 token %qT_310 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc733) + %dpT_336 = ttng.warp_group_dot %v_68, %dpT, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc763) + %dpT_337:3 = ttng.warp_group_dot_wait %dpT_336, %v_68, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc763) + %dsT = tt.expand_dims %Di_335 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xf32, #mma1> loc(#loc764) + %dsT_338 = tt.broadcast %dsT : tensor<1x64xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc765) + %dsT_339 = arith.subf %dpT_337#0, %dsT_338 : tensor<128x64xf32, #mma1> loc(#loc765) + %dsT_340 = arith.mulf %pT_330, %dsT_339 : tensor<128x64xf32, #mma1> loc(#loc766) + %grad_scores = arith.select %qT_309, %dsT_340, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc767) + %dsT_341 = arith.select %tmp78_325, %grad_scores, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc768) + %dk_342 = arith.truncf %dsT_341 : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc769) + %dk_343 = ttg.convert_layout %dk_342 : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc769) + %dk_344 = ttng.warp_group_dot %dk_343, %dk_312, %arg26 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc770) + %offs_m1_345 = tt.splat %arg46 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc737) + %offs_m1_346 = tt.splat %arg46 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc737) + %offs_m1_347 = arith.addi %offs_m1_295, %offs_m1_345 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc737) + %offs_m1_348 = arith.addi %offs_m1_281, %offs_m1_346 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc737) + %do_ptrs_349 = arith.addi %do_ptrs_280, %c1_i32 : i32 loc(#loc976) + %cur_block_idx = arith.divsi %do_ptrs_349, %c2_i32 : i32 loc(#loc921) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc922) + %cur_block_350 = tt.load %cur_block, %do_ptrs_299 evictionPolicy = evict_last : !tt.ptr loc(#loc923) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc924) + %next_block_351 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_73 : i32 loc(#loc925) + %next_block_352 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc926) + %do_ptrs_353 = arith.andi %do_ptrs_299, %next_block_351 : i1 loc(#loc976) + %next_block_354 = tt.load %next_block_352, %do_ptrs_353 evictionPolicy = evict_last : !tt.ptr loc(#loc927) + %needs_jump = arith.addi %do_ptrs_280, %c2_i32 : i32 loc(#loc928) + %needs_jump_355 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc929) + %needs_jump_356 = arith.cmpi eq, %needs_jump_355, %c0_i32 : i32 loc(#loc930) + %jump_to_block = arith.subi %next_block_354, %cur_block_350 : i32 loc(#loc931) + %jump_to_block_357 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc932) + %jump_to_block_358 = arith.subi %jump_to_block_357, %c64_i32 : i32 loc(#loc933) + %offset = arith.extui %needs_jump_356 : i1 to i32 loc(#loc934) + %offset_359 = arith.muli %jump_to_block_358, %offset : i32 loc(#loc934) + %offset_360 = arith.subi %c1_i32, %offset : i32 loc(#loc935) + %offset_361 = arith.muli %offset_360, %c64_i32 : i32 loc(#loc936) + %offset_362 = arith.addi %offset_359, %offset_361 : i32 loc(#loc937) + %qT_ptrs_363 = arith.muli %offset_362, %c4096_i32 : i32 loc(#loc772) + %qT_ptrs_364 = tt.splat %qT_ptrs_363 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc735) + %qT_ptrs_365 = tt.addptr %qT_ptrs_282, %qT_ptrs_364 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc735) + %do_ptrs_366 = arith.muli %offset_362, %c128_i32 : i32 loc(#loc773) + %do_ptrs_367 = tt.splat %do_ptrs_366 : i32 -> tensor<64x128xi32, #blocked2> loc(#loc736) + %do_ptrs_368 = tt.addptr %do_ptrs_284, %do_ptrs_367 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc736) + %offs_m1_369 = tt.splat %offset_362 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc737) + %offs_m1_370 = tt.splat %offset_362 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc737) + %offs_m1_371 = tt.splat %offset_362 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc737) + %offs_m1_372 = arith.addi %offs_m1_286, %offs_m1_369 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc737) + %offs_m1_373 = arith.addi %offs_m1_283, %offs_m1_370 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc737) + %offs_m1_374 = arith.addi %offs_m1_285, %offs_m1_371 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc737) + %do_ptrs_375 = arith.addi %arg36, %c1_i32 : i32 loc(#loc976) + %do_ptrs_376 = arith.cmpi sge, %do_ptrs_375, %c2_i32 : i32 loc(#loc976) + %do_ptrs_377 = arith.select %do_ptrs_376, %c0_i32, %do_ptrs_375 : i32 loc(#loc976) + %do_ptrs_378 = arith.addi %arg34, %c1_i32 : i32 loc(#loc976) + %do_ptrs_379 = arith.cmpi sge, %do_ptrs_378, %c3_i32 : i32 loc(#loc976) + %do_ptrs_380 = arith.select %do_ptrs_379, %c0_i32, %do_ptrs_378 : i32 loc(#loc976) + %qT_381 = tt.expand_dims %offs_m1_373 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc918) + %qT_382 = arith.cmpi slt, %qT_381, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc916) + %qT_383 = tt.broadcast %qT_382 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc914) + %qT_384 = ttg.memdesc_index %qT[%do_ptrs_380] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %do_ptrs_385 = tt.splat %do_ptrs_297 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc976) + %do_ptrs_386 = arith.andi %do_ptrs_385, %qT_383 : tensor<128x64xi1, #blocked1> loc(#loc976) + %qT_387 = ttg.async_copy_global_to_local %qT_ptrs_365, %qT_384 mask %do_ptrs_386 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc914) + %qT_388 = ttg.async_commit_group tokens %qT_387 loc(#loc914) + %lse_389 = arith.cmpi slt, %offs_m1_372, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc734) + %lse_390 = tt.addptr %lse, %offs_m1_372 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc728) + %lse_391 = ttg.memdesc_index %lse_145[%do_ptrs_377] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %do_ptrs_392 = tt.splat %do_ptrs_297 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %do_ptrs_393 = arith.andi %do_ptrs_392, %lse_389 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + %lse_394 = ttg.async_copy_global_to_local %lse_390, %lse_391 mask %do_ptrs_393 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc731) + %lse_395 = ttg.async_commit_group tokens %lse_394 loc(#loc731) + %do_396 = tt.expand_dims %offs_m1_374 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc919) + %do_397 = arith.cmpi slt, %do_396, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc917) + %do_398 = tt.broadcast %do_397 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc915) + %do_399 = ttg.memdesc_index %do[%do_ptrs_380] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_ptrs_400 = tt.splat %do_ptrs_297 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc976) + %do_ptrs_401 = arith.andi %do_ptrs_400, %do_398 : tensor<64x128xi1, #blocked2> loc(#loc976) + %do_402 = ttg.async_copy_global_to_local %do_ptrs_368, %do_399 mask %do_ptrs_401 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc915) + %do_403 = ttg.async_commit_group tokens %do_402 loc(#loc915) + %Di_404 = tt.addptr %Di, %offs_m1_372 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc729) + %Di_405 = ttg.memdesc_index %Di_146[%do_ptrs_377] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_406 = ttg.async_copy_global_to_local %Di_404, %Di_405 mask %do_ptrs_393 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc733) + %Di_407 = ttg.async_commit_group tokens %Di_406 loc(#loc733) + scf.yield %dk_344, %dv_333, %offs_m1_348, %qT_ptrs_365, %offs_m1_373, %do_ptrs_368, %offs_m1_374, %offs_m1_372, %do_ptrs_380, %do_ptrs_305, %do_ptrs_377, %do_ptrs_302, %qT_288, %qT_388, %lse_290, %lse_395, %do_292, %do_403, %Di_294, %Di_407, %offset_362, %offs_m1_347 : tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc976) + } loc(#loc976) + %do_ptrs_206:2 = ttng.warp_group_dot_wait %do_ptrs_205#1, %do_ptrs_205#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma> loc(#loc976) + %do_ptrs_207 = ttg.async_wait {num = 0 : i32} loc(#loc976) + ttg.local_dealloc %Di_146 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc976) + ttg.local_dealloc %do : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc976) + ttg.local_dealloc %lse_145 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc976) + ttg.local_dealloc %qT : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc976) + %qT_ptrs_208 = tt.addptr %qT_ptrs_137, %qT_ptrs_109 : tensor<1x64x!tt.ptr, #blocked1>, tensor<1x64xi32, #blocked1> loc(#loc774) + %qT_ptrs_209 = tt.broadcast %qT_ptrs_208 : tensor<1x64x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> loc(#loc775) + %qT_ptrs_210 = tt.addptr %qT_ptrs_209, %qT_ptrs_88 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc775) + %do_ptrs_211 = tt.addptr %do_ptrs_141, %do_ptrs_111 : tensor<64x1x!tt.ptr, #blocked2>, tensor<64x1xi32, #blocked2> loc(#loc776) + %do_ptrs_212 = tt.broadcast %do_ptrs_211 : tensor<64x1x!tt.ptr, #blocked2> -> tensor<64x128x!tt.ptr, #blocked2> loc(#loc777) + %do_ptrs_213 = tt.addptr %do_ptrs_212, %do_ptrs_90 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc777) + %qT_214 = ttg.local_alloc : () -> !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc938) + %lse_215 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc779) + %do_216 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc939) + %Di_217 = ttg.local_alloc : () -> !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc781) + %do_ptrs_218 = arith.cmpi sgt, %hi_113, %c0_i32 : i32 loc(#loc977) + %qT_219 = arith.cmpi slt, %qT_ptrs_108, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc940) + %qT_220 = tt.broadcast %qT_219 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc938) + %qT_221 = ttg.memdesc_index %qT_214[%c0_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %do_ptrs_222 = tt.splat %do_ptrs_218 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc977) + %do_ptrs_223 = arith.andi %do_ptrs_222, %qT_220 : tensor<128x64xi1, #blocked1> loc(#loc977) + %qT_224 = ttg.async_copy_global_to_local %qT_ptrs_210, %qT_221 mask %do_ptrs_223 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %qT_225 = ttg.async_commit_group tokens %qT_224 loc(#loc938) + %lse_226 = arith.cmpi slt, %offs_m1_104, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc782) + %lse_227 = tt.addptr %lse, %offs_m1_104 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc783) + %lse_228 = ttg.memdesc_index %lse_215[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %do_ptrs_229 = tt.splat %do_ptrs_218 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %do_ptrs_230 = arith.andi %do_ptrs_229, %lse_226 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %lse_231 = ttg.async_copy_global_to_local %lse_227, %lse_228 mask %do_ptrs_230 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %lse_232 = ttg.async_commit_group tokens %lse_231 loc(#loc779) + %do_233 = arith.cmpi slt, %do_ptrs_110, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc941) + %do_234 = tt.broadcast %do_233 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc939) + %do_235 = ttg.memdesc_index %do_216[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_ptrs_236 = tt.splat %do_ptrs_218 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc977) + %do_ptrs_237 = arith.andi %do_ptrs_236, %do_234 : tensor<64x128xi1, #blocked2> loc(#loc977) + %do_238 = ttg.async_copy_global_to_local %do_ptrs_213, %do_235 mask %do_ptrs_237 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_239 = ttg.async_commit_group tokens %do_238 loc(#loc939) + %Di_240 = tt.addptr %Di, %offs_m1_104 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc784) + %Di_241 = ttg.memdesc_index %Di_217[%c0_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_242 = ttg.async_copy_global_to_local %Di_240, %Di_241 mask %do_ptrs_230 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_243 = ttg.async_commit_group tokens %Di_242 loc(#loc781) + %do_ptrs_244 = arith.cmpi sgt, %hi_113, %c1_i32 : i32 loc(#loc977) + %qT_ptrs_245 = tt.addptr %qT_ptrs_210, %cst_24 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc785) + %do_ptrs_246 = tt.addptr %do_ptrs_213, %cst_25 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc786) + %offs_m1_247 = arith.addi %offs_m1_104, %cst_26 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc787) + %offs_m1_248 = arith.addi %offs_m1_106, %cst_23 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc787) + %offs_m1_249 = arith.addi %offs_m1_107, %cst_27 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc787) + %qT_250 = tt.expand_dims %offs_m1_248 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc942) + %qT_251 = arith.cmpi slt, %qT_250, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc940) + %qT_252 = tt.broadcast %qT_251 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc938) + %qT_253 = ttg.memdesc_index %qT_214[%c1_i32] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %do_ptrs_254 = tt.splat %do_ptrs_244 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc977) + %do_ptrs_255 = arith.andi %do_ptrs_254, %qT_252 : tensor<128x64xi1, #blocked1> loc(#loc977) + %qT_256 = ttg.async_copy_global_to_local %qT_ptrs_245, %qT_253 mask %do_ptrs_255 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %qT_257 = ttg.async_commit_group tokens %qT_256 loc(#loc938) + %lse_258 = arith.cmpi slt, %offs_m1_247, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc782) + %lse_259 = tt.addptr %lse, %offs_m1_247 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc783) + %lse_260 = ttg.memdesc_index %lse_215[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %do_ptrs_261 = tt.splat %do_ptrs_244 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %do_ptrs_262 = arith.andi %do_ptrs_261, %lse_258 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %lse_263 = ttg.async_copy_global_to_local %lse_259, %lse_260 mask %do_ptrs_262 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %lse_264 = ttg.async_commit_group tokens %lse_263 loc(#loc779) + %do_265 = tt.expand_dims %offs_m1_249 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc943) + %do_266 = arith.cmpi slt, %do_265, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc941) + %do_267 = tt.broadcast %do_266 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc939) + %do_268 = ttg.memdesc_index %do_216[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_ptrs_269 = tt.splat %do_ptrs_244 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc977) + %do_ptrs_270 = arith.andi %do_ptrs_269, %do_267 : tensor<64x128xi1, #blocked2> loc(#loc977) + %do_271 = ttg.async_copy_global_to_local %do_ptrs_246, %do_268 mask %do_ptrs_270 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_272 = ttg.async_commit_group tokens %do_271 loc(#loc939) + %Di_273 = tt.addptr %Di, %offs_m1_247 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc784) + %Di_274 = ttg.memdesc_index %Di_217[%c1_i32] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_275 = ttg.async_copy_global_to_local %Di_273, %Di_274 mask %do_ptrs_262 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_276 = ttg.async_commit_group tokens %Di_275 loc(#loc781) + %do_ptrs_277:21 = scf.for %do_ptrs_280 = %c0_i32 to %hi_113 step %c1_i32 iter_args(%do_ptrs_281 = %do_ptrs_206#1, %do_ptrs_282 = %do_ptrs_206#0, %offs_m1_283 = %offs_m1_105, %qT_ptrs_284 = %qT_ptrs_245, %offs_m1_285 = %offs_m1_248, %do_ptrs_286 = %do_ptrs_246, %offs_m1_287 = %offs_m1_249, %offs_m1_288 = %offs_m1_247, %arg34 = %c1_i32, %arg35 = %c-1_i32, %arg36 = %c1_i32, %arg37 = %c-1_i32, %qT_289 = %qT_225, %qT_290 = %qT_257, %lse_291 = %lse_232, %lse_292 = %lse_264, %do_293 = %do_239, %do_294 = %do_272, %Di_295 = %Di_243, %Di_296 = %Di_276, %arg46 = %c64_i32) -> (tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %do_ptrs_297 = arith.subi %hi_113, %c2_i32 : i32 loc(#loc977) + %do_ptrs_298 = arith.cmpi slt, %do_ptrs_280, %do_ptrs_297 : i32 loc(#loc977) + %do_ptrs_299 = arith.subi %hi_113, %c1_i32 : i32 loc(#loc977) + %do_ptrs_300 = arith.cmpi slt, %do_ptrs_280, %do_ptrs_299 : i32 loc(#loc977) + %do_ptrs_301 = arith.addi %arg37, %c1_i32 : i32 loc(#loc977) + %do_ptrs_302 = arith.cmpi sge, %do_ptrs_301, %c2_i32 : i32 loc(#loc977) + %do_ptrs_303 = arith.select %do_ptrs_302, %c0_i32, %do_ptrs_301 : i32 loc(#loc977) + %do_ptrs_304 = arith.addi %arg35, %c1_i32 : i32 loc(#loc977) + %do_ptrs_305 = arith.cmpi sge, %do_ptrs_304, %c3_i32 : i32 loc(#loc977) + %do_ptrs_306 = arith.select %do_ptrs_305, %c0_i32, %do_ptrs_304 : i32 loc(#loc977) + %qT_307 = tt.expand_dims %offs_m1_283 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc942) + %qT_308 = arith.cmpi slt, %qT_307, %cst : tensor<1x64xi32, #blocked> loc(#loc940) + %qT_309 = tt.broadcast %qT_308 : tensor<1x64xi1, #blocked> -> tensor<128x64xi1, #blocked> loc(#loc938) + %qT_310 = ttg.convert_layout %qT_309 : tensor<128x64xi1, #blocked> -> tensor<128x64xi1, #mma1> loc(#loc938) + %qT_311 = ttg.async_wait %qT_289, %lse_291, %do_293, %Di_295 {num = 4 : i32} loc(#loc938) + %qT_312 = ttg.memdesc_index %qT_214[%do_ptrs_306] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %dk_313 = ttg.memdesc_trans %qT_312 {order = array} : !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc788) + %lse_314 = ttg.memdesc_index %lse_215[%do_ptrs_303] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %lse_315 = ttg.local_load %lse_314 token %qT_311 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc779) + %lse_316 = arith.cmpf oeq, %lse_315, %cst_31 : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc789) + %lse_317 = arith.select %lse_316, %cst_32, %lse_315 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc790) + %qkT = ttng.warp_group_dot %k_63, %qT_312, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc791) + %qkT_318:3 = ttng.warp_group_dot_wait %qkT, %k_63, %qT_312 {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc791) + %qkT_319 = arith.mulf %qkT_318#0, %cst_19 : tensor<128x64xf32, #mma1> loc(#loc792) + %post_mod_scores = arith.select %qT_310, %qkT_319, %cst_20 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc793) + %post_mod_scores_320 = arith.mulf %post_mod_scores, %cst_21 : tensor<128x64xf32, #mma1> loc(#loc794) + %pT = tt.expand_dims %lse_317 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xf32, #mma1> loc(#loc795) + %pT_321 = tt.broadcast %pT : tensor<1x64xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc796) + %pT_322 = arith.subf %post_mod_scores_320, %pT_321 : tensor<128x64xf32, #mma1> loc(#loc796) + %pT_323 = math.exp2 %pT_322 : tensor<128x64xf32, #mma1> loc(#loc797) + %do_324 = ttg.memdesc_index %do_216[%do_ptrs_306] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %dpT = ttg.memdesc_trans %do_324 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc798) + %dv = arith.truncf %pT_323 : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc799) + %dv_325 = ttg.convert_layout %dv : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc799) + %dv_326 = ttng.warp_group_dot %dv_325, %do_324, %do_ptrs_282 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc800) + %Di_327 = ttg.memdesc_index %Di_217[%do_ptrs_303] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_328 = ttg.local_load %Di_327 token %qT_311 : !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> -> tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc781) + %dpT_329 = ttng.warp_group_dot %v_68, %dpT, %cst_18 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma1> loc(#loc801) + %dpT_330:3 = ttng.warp_group_dot_wait %dpT_329, %v_68, %dpT {pendings = 0 : i32} : tensor<128x64xf32, #mma1>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc801) + %dsT = tt.expand_dims %Di_328 {axis = 0 : i32} : tensor<64xf32, #ttg.slice<{dim = 0, parent = #mma1}>> -> tensor<1x64xf32, #mma1> loc(#loc802) + %dsT_331 = tt.broadcast %dsT : tensor<1x64xf32, #mma1> -> tensor<128x64xf32, #mma1> loc(#loc803) + %dsT_332 = arith.subf %dpT_330#0, %dsT_331 : tensor<128x64xf32, #mma1> loc(#loc803) + %dsT_333 = arith.mulf %pT_323, %dsT_332 : tensor<128x64xf32, #mma1> loc(#loc804) + %grad_scores = arith.select %qT_310, %dsT_333, %cst_18 : tensor<128x64xi1, #mma1>, tensor<128x64xf32, #mma1> loc(#loc805) + %dk_334 = arith.truncf %grad_scores : tensor<128x64xf32, #mma1> to tensor<128x64xbf16, #mma1> loc(#loc806) + %dk_335 = ttg.convert_layout %dk_334 : tensor<128x64xbf16, #mma1> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> loc(#loc806) + %dk_336 = ttng.warp_group_dot %dk_335, %dk_313, %do_ptrs_281 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma> loc(#loc807) + %offs_m1_337 = tt.splat %arg46 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc787) + %offs_m1_338 = arith.addi %offs_m1_283, %offs_m1_337 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc787) + %do_ptrs_339 = arith.addi %do_ptrs_280, %c1_i32 : i32 loc(#loc977) + %cur_block_idx = arith.divsi %do_ptrs_339, %c2_i32 : i32 loc(#loc944) + %cur_block = tt.addptr %q_indices_95, %cur_block_idx : !tt.ptr, i32 loc(#loc945) + %cur_block_340 = tt.load %cur_block, %do_ptrs_300 evictionPolicy = evict_last : !tt.ptr loc(#loc946) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc947) + %next_block_341 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_99 : i32 loc(#loc948) + %next_block_342 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc949) + %do_ptrs_343 = arith.andi %do_ptrs_300, %next_block_341 : i1 loc(#loc977) + %next_block_344 = tt.load %next_block_342, %do_ptrs_343 evictionPolicy = evict_last : !tt.ptr loc(#loc950) + %needs_jump = arith.addi %do_ptrs_280, %c2_i32 : i32 loc(#loc951) + %needs_jump_345 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc952) + %needs_jump_346 = arith.cmpi eq, %needs_jump_345, %c0_i32 : i32 loc(#loc953) + %jump_to_block = arith.subi %next_block_344, %cur_block_340 : i32 loc(#loc954) + %jump_to_block_347 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc955) + %jump_to_block_348 = arith.subi %jump_to_block_347, %c64_i32 : i32 loc(#loc956) + %offset = arith.extui %needs_jump_346 : i1 to i32 loc(#loc957) + %offset_349 = arith.muli %jump_to_block_348, %offset : i32 loc(#loc957) + %offset_350 = arith.subi %c1_i32, %offset : i32 loc(#loc958) + %offset_351 = arith.muli %offset_350, %c64_i32 : i32 loc(#loc959) + %offset_352 = arith.addi %offset_349, %offset_351 : i32 loc(#loc960) + %qT_ptrs_353 = arith.muli %offset_352, %c4096_i32 : i32 loc(#loc809) + %qT_ptrs_354 = tt.splat %qT_ptrs_353 : i32 -> tensor<128x64xi32, #blocked1> loc(#loc785) + %qT_ptrs_355 = tt.addptr %qT_ptrs_284, %qT_ptrs_354 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> loc(#loc785) + %do_ptrs_356 = arith.muli %offset_352, %c128_i32 : i32 loc(#loc810) + %do_ptrs_357 = tt.splat %do_ptrs_356 : i32 -> tensor<64x128xi32, #blocked2> loc(#loc786) + %do_ptrs_358 = tt.addptr %do_ptrs_286, %do_ptrs_357 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> loc(#loc786) + %offs_m1_359 = tt.splat %offset_352 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc787) + %offs_m1_360 = tt.splat %offset_352 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc787) + %offs_m1_361 = tt.splat %offset_352 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc787) + %offs_m1_362 = arith.addi %offs_m1_288, %offs_m1_359 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc787) + %offs_m1_363 = arith.addi %offs_m1_285, %offs_m1_360 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc787) + %offs_m1_364 = arith.addi %offs_m1_287, %offs_m1_361 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc787) + %do_ptrs_365 = arith.addi %arg36, %c1_i32 : i32 loc(#loc977) + %do_ptrs_366 = arith.cmpi sge, %do_ptrs_365, %c2_i32 : i32 loc(#loc977) + %do_ptrs_367 = arith.select %do_ptrs_366, %c0_i32, %do_ptrs_365 : i32 loc(#loc977) + %do_ptrs_368 = arith.addi %arg34, %c1_i32 : i32 loc(#loc977) + %do_ptrs_369 = arith.cmpi sge, %do_ptrs_368, %c3_i32 : i32 loc(#loc977) + %do_ptrs_370 = arith.select %do_ptrs_369, %c0_i32, %do_ptrs_368 : i32 loc(#loc977) + %qT_371 = tt.expand_dims %offs_m1_363 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> loc(#loc942) + %qT_372 = arith.cmpi slt, %qT_371, %cst_0 : tensor<1x64xi32, #blocked1> loc(#loc940) + %qT_373 = tt.broadcast %qT_372 : tensor<1x64xi1, #blocked1> -> tensor<128x64xi1, #blocked1> loc(#loc938) + %qT_374 = ttg.memdesc_index %qT_214[%do_ptrs_370] : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %do_ptrs_375 = tt.splat %do_ptrs_298 : i1 -> tensor<128x64xi1, #blocked1> loc(#loc977) + %do_ptrs_376 = arith.andi %do_ptrs_375, %qT_373 : tensor<128x64xi1, #blocked1> loc(#loc977) + %qT_377 = ttg.async_copy_global_to_local %qT_ptrs_355, %qT_374 mask %do_ptrs_376 other %cst_14 : tensor<128x64x!tt.ptr, #blocked1> -> <128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc938) + %qT_378 = ttg.async_commit_group tokens %qT_377 loc(#loc938) + %lse_379 = arith.cmpi slt, %offs_m1_362, %cst_30 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc782) + %lse_380 = tt.addptr %lse, %offs_m1_362 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc783) + %lse_381 = ttg.memdesc_index %lse_215[%do_ptrs_367] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %do_ptrs_382 = tt.splat %do_ptrs_298 : i1 -> tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %do_ptrs_383 = arith.andi %do_ptrs_382, %lse_379 : tensor<64xi1, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc977) + %lse_384 = ttg.async_copy_global_to_local %lse_380, %lse_381 mask %do_ptrs_383 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc779) + %lse_385 = ttg.async_commit_group tokens %lse_384 loc(#loc779) + %do_386 = tt.expand_dims %offs_m1_364 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1xi32, #blocked2> loc(#loc943) + %do_387 = arith.cmpi slt, %do_386, %cst_2 : tensor<64x1xi32, #blocked2> loc(#loc941) + %do_388 = tt.broadcast %do_387 : tensor<64x1xi1, #blocked2> -> tensor<64x128xi1, #blocked2> loc(#loc939) + %do_389 = ttg.memdesc_index %do_216[%do_ptrs_370] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_ptrs_390 = tt.splat %do_ptrs_298 : i1 -> tensor<64x128xi1, #blocked2> loc(#loc977) + %do_ptrs_391 = arith.andi %do_ptrs_390, %do_388 : tensor<64x128xi1, #blocked2> loc(#loc977) + %do_392 = ttg.async_copy_global_to_local %do_ptrs_358, %do_389 mask %do_ptrs_391 other %cst_15 : tensor<64x128x!tt.ptr, #blocked2> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc939) + %do_393 = ttg.async_commit_group tokens %do_392 loc(#loc939) + %Di_394 = tt.addptr %Di, %offs_m1_362 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>> loc(#loc784) + %Di_395 = ttg.memdesc_index %Di_217[%do_ptrs_367] : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> -> !ttg.memdesc<64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_396 = ttg.async_copy_global_to_local %Di_394, %Di_395 mask %do_ptrs_383 : tensor<64x!tt.ptr, #ttg.slice<{dim = 0, parent = #mma1}>> -> <64xf32, #shared2, #smem, mutable, 2x64> loc(#loc781) + %Di_397 = ttg.async_commit_group tokens %Di_396 loc(#loc781) + scf.yield %dk_336, %dv_326, %offs_m1_338, %qT_ptrs_355, %offs_m1_363, %do_ptrs_358, %offs_m1_364, %offs_m1_362, %do_ptrs_370, %do_ptrs_306, %do_ptrs_367, %do_ptrs_303, %qT_290, %qT_378, %lse_292, %lse_385, %do_294, %do_393, %Di_296, %Di_397, %offset_352 : tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, tensor<128x64x!tt.ptr, #blocked1>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma1}>>, i32, i32, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc977) + } loc(#loc977) + %do_ptrs_278:2 = ttng.warp_group_dot_wait %do_ptrs_277#1, %do_ptrs_277#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma> loc(#loc977) + %do_ptrs_279 = ttg.async_wait {num = 0 : i32} loc(#loc977) + ttg.local_dealloc %Di_217 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc977) + ttg.local_dealloc %do_216 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc977) + ttg.local_dealloc %lse_215 : !ttg.memdesc<2x64xf32, #shared2, #smem, mutable> loc(#loc977) + ttg.local_dealloc %qT_214 : !ttg.memdesc<3x128x64xbf16, #shared1, #smem, mutable> loc(#loc977) + scf.yield %do_ptrs_278#0, %do_ptrs_278#1 : tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma> loc(#loc302) + } loc(#loc725) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked2> loc(#loc624) + %dv_ptrs_114 = tt.addptr %dv_ptrs, %ptr_51 : tensor<128x1x!tt.ptr, #blocked2>, tensor<128x1xi32, #blocked2> loc(#loc624) + %dv_ptrs_115 = tt.broadcast %dv_ptrs_114 : tensor<128x1x!tt.ptr, #blocked2> -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc625) + %dv_ptrs_116 = tt.addptr %dv_ptrs_115, %ptr_57 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc625) + %3 = arith.cmpi slt, %ptr_55, %cst_9 : tensor<1x128xi32, #blocked2> loc(#loc305) + %4 = tt.broadcast %3 : tensor<1x128xi1, #blocked2> -> tensor<128x128xi1, #blocked2> loc(#loc306) + %5 = arith.andi %k_61, %4 : tensor<128x128xi1, #blocked2> loc(#loc306) + %6 = arith.truncf %dk#0 : tensor<128x128xf32, #mma> to tensor<128x128xbf16, #mma> loc(#loc307) + %7 = ttg.convert_layout %6 : tensor<128x128xbf16, #mma> -> tensor<128x128xbf16, #blocked2> loc(#loc307) + tt.store %dv_ptrs_116, %7, %5 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc307) + %dk_117 = arith.mulf %dk#1, %cst_16 : tensor<128x128xf32, #mma> loc(#loc626) + %xindex = tt.broadcast %ptr_51 : tensor<128x1xi32, #blocked2> -> tensor<128x128xi32, #blocked2> loc(#loc627) + %xindex_118 = arith.addi %ptr_57, %xindex : tensor<128x128xi32, #blocked2> loc(#loc627) + %xindex_119 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc628) + %xindex_120 = arith.muli %xindex_119, %ks0 : i32 loc(#loc629) + %xindex_121 = tt.splat %xindex_120 : i32 -> tensor<128x128xi32, #blocked2> loc(#loc630) + %xindex_122 = arith.addi %xindex_118, %xindex_121 : tensor<128x128xi32, #blocked2> loc(#loc630) + %xindex_123 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc631) + %xindex_124 = arith.muli %xindex_123, %ks0 : i32 loc(#loc632) + %xindex_125 = tt.splat %xindex_124 : i32 -> tensor<128x128xi32, #blocked2> loc(#loc633) + %xindex_126 = arith.addi %xindex_122, %xindex_125 : tensor<128x128xi32, #blocked2> loc(#loc633) + %8 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked2> loc(#loc316) + %9 = tt.addptr %8, %xindex_126 : tensor<128x128x!tt.ptr, #blocked2>, tensor<128x128xi32, #blocked2> loc(#loc316) + %10 = arith.truncf %dk_117 : tensor<128x128xf32, #mma> to tensor<128x128xbf16, #mma> loc(#loc317) + %11 = ttg.convert_layout %10 : tensor<128x128xbf16, #mma> -> tensor<128x128xbf16, #blocked2> loc(#loc317) + tt.store %9, %11, %k_61 : tensor<128x128x!tt.ptr, #blocked2> loc(#loc317) + } loc(#loc23) + tt.return loc(#loc318) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":111:24) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":112:36) +#loc7 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":115:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":116:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":117:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:25) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:47) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:35) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:59) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:37) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:61) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":131:9) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":132:9) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":133:10) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":136:26) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:14) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:7) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":140:24) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:29) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:44) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":145:35) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":148:29) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:55) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:78) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:50) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:83) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:68) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:30) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:52) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:40) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:63) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:32) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:42) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:66) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:30) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:35) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:46) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:56) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":163:17) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":164:19) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":167:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":168:21) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":169:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":174:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":175:29) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:27) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":178:107) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:38) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:20) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:56) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:49) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:52) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:23) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":179:111) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:58) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:34) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:25) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:33) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:26) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:30) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:50) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":191:18) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":195:30) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:27) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:41) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:53) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:39) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:42) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:26) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":207:12) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:37) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:18) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:56) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:49) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:18) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:49) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:43) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:90) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:101) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:63) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:52) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":458:105) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":405:12) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":798:21) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":467:46) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":482:23) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:34) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":397:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:23) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":486:22) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":487:23) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":488:23) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":489:23) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:39) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:22) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:19) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:23) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":528:104) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:19) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":415:19) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":417:19) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:41) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":459:19) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:30) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":461:14) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":464:46) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":476:79) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":483:23) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":490:23) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":493:24) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":494:24) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":496:25) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":497:92) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":500:24) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":501:24) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":502:39) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":503:25) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":504:24) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":505:24) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":506:23) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":507:25) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":508:25) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":509:92) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":511:24) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":512:24) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":513:39) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":514:25) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":515:24) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":516:24) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":521:69) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":524:27) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:21) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":530:20) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:14) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":538:71) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":549:43) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":551:15) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:21) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":788:33) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":411:64) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:38) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:24) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:109) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:113) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:55) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:25) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:30) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:35) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:60) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:34) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:48) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:63) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:29) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:47) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:61) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:42) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:28) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":214:39) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:31) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:45) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:62) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:43) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":218:33) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":226:16) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:24) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:56) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":232:14) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:87) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:69) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:30) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":245:28) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":252:25) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":253:29) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":256:107) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":257:107) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:32) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:56) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:34) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:58) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:80) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:53) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:81) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:70) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":286:32) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:30) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:43) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:55) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:42) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:45) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:32) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:26) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":298:16) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:37) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:56) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:49) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:27) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:38) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:51) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:42) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:61) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":683:46) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":618:12) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":698:25) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":699:25) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:35) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":610:28) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:24) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":702:24) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":704:24) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":705:24) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":709:25) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":710:25) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":712:25) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":713:92) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":716:24) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":717:24) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":718:39) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":719:25) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":720:24) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":721:24) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":731:24) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":306:41) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:34) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:47) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:64) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:46) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":310:36) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":318:20) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":676:20) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":262:30) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:51) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:34) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:44) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:67) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:36) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:46) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:70) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:39) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:50) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:60) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":271:21) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":272:23) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":275:25) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":276:29) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:18) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:19) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:28) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:29) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":669:105) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:22) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":741:99) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:21) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:52) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:19) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:19) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":628:19) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:41) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:52) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:26) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:46) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":678:15) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":680:46) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":692:78) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":703:25) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":706:24) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":722:24) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":723:25) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":724:25) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":725:92) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":727:24) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":728:24) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":729:39) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":730:25) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":732:24) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":736:69) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":739:27) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:44) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:40) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:22) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:29) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:24) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:43) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:20) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:25) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:22) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:16) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":759:70) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":773:45) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:24) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:43) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":623:62) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:28) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:28) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":303:12) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:23) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:55) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:71) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:61) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:30) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":334:14) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:27) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:45) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:53) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:41) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:64) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:71) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:59) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:29) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:69) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:4) +#loc341 = loc("pid"(#loc4)) +#loc342 = loc("NUM_KV_BLOCKS"(#loc6)) +#loc343 = loc("off_zq"(#loc8)) +#loc344 = loc("off_hkv"(#loc9)) +#loc345 = loc("off_zkv"(#loc10)) +#loc346 = loc("k_adj"(#loc11)) +#loc347 = loc("k_adj"(#loc12)) +#loc348 = loc("k_adj"(#loc13)) +#loc349 = loc("k_adj"(#loc14)) +#loc350 = loc("dv_adj"(#loc15)) +#loc351 = loc("dv_adj"(#loc16)) +#loc352 = loc("dv_adj"(#loc17)) +#loc353 = loc("K"(#loc18)) +#loc354 = loc("V"(#loc19)) +#loc355 = loc("DV"(#loc20)) +#loc356 = loc("offs_k"(#loc21)) +#loc357 = loc("off_pid"(#loc24)) +#loc358 = loc("off_hq2"(#loc25)) +#loc359 = loc("off_hq2"(#loc26)) +#loc360 = loc("off_hq2"(#loc27)) +#loc361 = loc("start_m2_block"(#loc28)) +#loc362 = loc("stride_kv_idx_h"(#loc29)) +#loc363 = loc("sparse_kv_num_blks_offset"(#loc30)) +#loc364 = loc("sparse_kv_num_blks_offset"(#loc31)) +#loc365 = loc("sparse_kv_idx_offset"(#loc32)) +#loc366 = loc("sparse_kv_idx_offset"(#loc33)) +#loc367 = loc("sparse_kv_idx_offset"(#loc34)) +#loc368 = loc("q_adj2"(#loc35)) +#loc369 = loc("q_adj2"(#loc36)) +#loc370 = loc("q_adj2"(#loc37)) +#loc371 = loc("q_adj2"(#loc38)) +#loc372 = loc("do_adj2"(#loc39)) +#loc373 = loc("do_adj2"(#loc40)) +#loc374 = loc("do_adj2"(#loc41)) +#loc375 = loc("off_chz2"(#loc42)) +#loc376 = loc("off_chz2"(#loc43)) +#loc377 = loc("off_chz2"(#loc44)) +#loc378 = loc("off_chz2"(#loc45)) +#loc379 = loc("Q2"(#loc46)) +#loc380 = loc("DO2"(#loc47)) +#loc381 = loc("DQ2"(#loc48)) +#loc382 = loc("LSE2"(#loc49)) +#loc383 = loc("DELTA2"(#loc50)) +#loc384 = loc("start_m2"(#loc51)) +#loc385 = loc("offs_m2"(#loc52)) +#loc386 = loc("ptr"(#loc53)) +#loc387 = loc("q"(#loc54)) +#loc388 = loc("ptr"(#loc55)) +#loc389 = loc("ptr"(#loc56)) +#loc390 = loc("ptr"(#loc57)) +#loc391 = loc("ptr"(#loc58)) +#loc392 = loc("do"(#loc61)) +#loc393 = loc("Di"(#loc62)) +#loc394 = loc("Di"(#loc63)) +#loc395 = loc("Di"(#loc64)) +#loc396 = loc("lse"(#loc65)) +#loc397 = loc("lse"(#loc66)) +#loc398 = loc("lse"(#loc67)) +#loc399 = loc("lse"(#loc68)) +#loc400 = loc("lse"(#loc69)) +#loc401 = loc("kv_indices"(#loc70)) +#loc402 = loc("kv_start"(#loc71)) +#loc403 = loc("kv_start"(#loc72)) +#loc404 = loc("sparse_kv_num_blocks"(#loc73)) +#loc405 = loc("sparse_kv_num_blocks"(#loc74)) +#loc406 = loc("offs_n2"(#loc75)) +#loc407 = loc("offs_n2"(#loc76)) +#loc408 = loc("kT_ptrs"(#loc77)) +#loc409 = loc("dq"(#loc78)) +#loc410 = loc("kT_ptrs"(#loc79)) +#loc411 = loc("kT_ptrs"(#loc80)) +#loc412 = loc("kT_ptrs"(#loc81)) +#loc413 = loc("kT_ptrs"(#loc82)) +#loc414 = loc("vT_ptrs"(#loc83)) +#loc415 = loc("vT_ptrs"(#loc84)) +#loc416 = loc("hi"(#loc85)) +#loc417 = loc("hi"(#loc86)) +#loc418 = loc("hi"(#loc87)) +#loc419 = loc("hi"(#loc88)) +#loc420 = loc("kT"(#loc90)) +#loc421 = loc("dq"(#loc91)) +#loc422 = loc("m"(#loc93)) +#loc423 = loc("tmp4"(#loc94)) +#loc424 = loc("tmp7"(#loc95)) +#loc425 = loc("dq"(#loc96)) +#loc426 = loc("tmp7"(#loc97)) +#loc427 = loc("tmp8"(#loc98)) +#loc428 = loc("tmp9"(#loc99)) +#loc429 = loc("tmp10"(#loc100)) +#loc430 = loc("tmp11"(#loc101)) +#loc431 = loc("p"(#loc102)) +#loc432 = loc("ds"(#loc103)) +#loc433 = loc("ds"(#loc104)) +#loc434 = loc("vT"(#loc106)) +#loc435 = loc("kT_ptrs"(#loc107)) +#loc436 = loc("vT_ptrs"(#loc108)) +#loc437 = loc("offs_n2"(#loc109)) +#loc438 = loc("qk"(#loc111)) +#loc439 = loc("dq"(#loc112)) +#loc440 = loc("qk"(#loc113)) +#loc441 = loc("n"(#loc114)) +#loc442 = loc("post_mod_scores"(#loc115)) +#loc443 = loc("tmp5"(#loc116)) +#loc444 = loc("tmp12"(#loc117)) +#loc445 = loc("tmp15"(#loc118)) +#loc446 = loc("tmp16"(#loc119)) +#loc447 = loc("tmp18"(#loc120)) +#loc448 = loc("tmp19"(#loc121)) +#loc449 = loc("tmp22"(#loc122)) +#loc450 = loc("tmp23"(#loc123)) +#loc451 = loc("tmp24"(#loc124)) +#loc452 = loc("tmp25"(#loc125)) +#loc453 = loc("tmp26"(#loc126)) +#loc454 = loc("tmp27"(#loc127)) +#loc455 = loc("tmp28"(#loc128)) +#loc456 = loc("tmp29"(#loc129)) +#loc457 = loc("tmp30"(#loc130)) +#loc458 = loc("tmp31"(#loc131)) +#loc459 = loc("tmp33"(#loc132)) +#loc460 = loc("tmp34"(#loc133)) +#loc461 = loc("tmp35"(#loc134)) +#loc462 = loc("tmp36"(#loc135)) +#loc463 = loc("tmp37"(#loc136)) +#loc464 = loc("tmp38"(#loc137)) +#loc465 = loc("post_mod_scores"(#loc138)) +#loc466 = loc("post_mod_scores"(#loc139)) +#loc467 = loc("p"(#loc140)) +#loc468 = loc("dp"(#loc141)) +#loc469 = loc("ds"(#loc142)) +#loc470 = loc("grad_scores"(#loc143)) +#loc471 = loc("ds"(#loc144)) +#loc472 = loc("ds"(#loc145)) +#loc473 = loc("dq"(#loc146)) +#loc474 = loc("cur_block_idx"(#loc147)) +#loc475 = loc("offset"(#loc148)) +#loc476 = loc("cur_block"(#loc149)) +#loc477 = loc("cur_block"(#loc150)) +#loc478 = loc("next_block"(#loc151)) +#loc479 = loc("next_block"(#loc152)) +#loc480 = loc("next_block"(#loc153)) +#loc481 = loc("next_block"(#loc154)) +#loc482 = loc("needs_jump"(#loc155)) +#loc483 = loc("needs_jump"(#loc156)) +#loc484 = loc("needs_jump"(#loc157)) +#loc485 = loc("jump_to_block"(#loc158)) +#loc486 = loc("jump_to_block"(#loc159)) +#loc487 = loc("jump_to_block"(#loc160)) +#loc488 = loc("offset"(#loc161)) +#loc489 = loc("offset"(#loc162)) +#loc490 = loc("offset"(#loc163)) +#loc491 = loc("offset"(#loc164)) +#loc492 = loc("kT_ptrs"(#loc165)) +#loc493 = loc("kv_indices"(#loc166)) +#loc494 = loc("kv_start"(#loc167)) +#loc495 = loc("kv_start"(#loc168)) +#loc496 = loc("sparse_kv_num_blocks"(#loc169)) +#loc497 = loc("sparse_kv_num_blocks"(#loc170)) +#loc498 = loc("offs_n2"(#loc171)) +#loc499 = loc("dq"(#loc172)) +#loc500 = loc("dq_ptrs"(#loc173)) +#loc501 = loc("dq_ptrs"(#loc174)) +#loc502 = loc("dq"(#loc175)) +#loc503 = loc("stride_q_idx_h"(#loc179)) +#loc504 = loc("start_n1"(#loc180)) +#loc505 = loc("offs_n1"(#loc181)) +#loc506 = loc("k"(#loc182)) +#loc507 = loc("v"(#loc183)) +#loc508 = loc("off_hq1"(#loc184)) +#loc509 = loc("q_adj1"(#loc185)) +#loc510 = loc("off_chz1"(#loc186)) +#loc511 = loc("sparse_q_num_blks_offset"(#loc187)) +#loc512 = loc("sparse_q_num_blks_offset"(#loc188)) +#loc513 = loc("sparse_q_idx_offset"(#loc189)) +#loc514 = loc("sparse_q_idx_offset"(#loc190)) +#loc515 = loc("sparse_q_idx_offset"(#loc191)) +#loc516 = loc("q_indices"(#loc192)) +#loc517 = loc("q_start"(#loc193)) +#loc518 = loc("q_start"(#loc194)) +#loc519 = loc("sparse_q_num_blocks"(#loc195)) +#loc520 = loc("sparse_q_num_blocks"(#loc196)) +#loc521 = loc("offs_m1"(#loc197)) +#loc522 = loc("offs_m1"(#loc198)) +#loc523 = loc("qT_ptrs"(#loc199)) +#loc524 = loc("qT_ptrs"(#loc201)) +#loc525 = loc("qT_ptrs"(#loc202)) +#loc526 = loc("qT_ptrs"(#loc203)) +#loc527 = loc("do_ptrs"(#loc204)) +#loc528 = loc("do_ptrs"(#loc205)) +#loc529 = loc("do_ptrs"(#loc206)) +#loc530 = loc("hi"(#loc207)) +#loc531 = loc("hi"(#loc208)) +#loc532 = loc("n"(#loc209)) +#loc533 = loc(callsite(#loc210 at #loc200)) +#loc534 = loc("tmp44"(#loc211)) +#loc535 = loc("tmp45"(#loc212)) +#loc536 = loc("tmp47"(#loc213)) +#loc537 = loc("dk"(#loc214)) +#loc538 = loc("tmp47"(#loc215)) +#loc539 = loc("tmp48"(#loc216)) +#loc540 = loc("tmp50"(#loc217)) +#loc541 = loc("tmp51"(#loc218)) +#loc542 = loc("tmp55"(#loc219)) +#loc543 = loc("tmp56"(#loc220)) +#loc544 = loc("tmp58"(#loc221)) +#loc545 = loc("tmp59"(#loc222)) +#loc546 = loc("tmp62"(#loc223)) +#loc547 = loc("tmp63"(#loc224)) +#loc548 = loc("tmp64"(#loc225)) +#loc549 = loc("tmp65"(#loc226)) +#loc550 = loc("tmp66"(#loc227)) +#loc551 = loc("tmp67"(#loc228)) +#loc552 = loc("tmp77"(#loc229)) +#loc553 = loc("q_indices"(#loc230)) +#loc554 = loc("q_start"(#loc231)) +#loc555 = loc("q_start"(#loc232)) +#loc556 = loc("sparse_q_num_blocks"(#loc233)) +#loc557 = loc("sparse_q_num_blocks"(#loc234)) +#loc558 = loc("offs_m1"(#loc235)) +#loc559 = loc("qkT"(#loc237)) +#loc560 = loc("dv"(#loc238)) +#loc561 = loc("off_hq1"(#loc239)) +#loc562 = loc("q_adj1"(#loc240)) +#loc563 = loc("q_adj1"(#loc241)) +#loc564 = loc("q_adj1"(#loc242)) +#loc565 = loc("do_adj1"(#loc243)) +#loc566 = loc("do_adj1"(#loc244)) +#loc567 = loc("do_adj1"(#loc245)) +#loc568 = loc("off_chz1"(#loc246)) +#loc569 = loc("off_chz1"(#loc247)) +#loc570 = loc("off_chz1"(#loc248)) +#loc571 = loc("Q1"(#loc249)) +#loc572 = loc("DO1"(#loc250)) +#loc573 = loc("LSE1"(#loc251)) +#loc574 = loc("DELTA1"(#loc252)) +#loc575 = loc("qT_ptrs"(#loc253)) +#loc576 = loc("do_ptrs"(#loc254)) +#loc577 = loc("lse"(#loc255)) +#loc578 = loc("Di"(#loc256)) +#loc579 = loc("qT"(#loc257)) +#loc580 = loc("lse"(#loc258)) +#loc581 = loc("do"(#loc259)) +#loc582 = loc("Di"(#loc260)) +#loc583 = loc("lse"(#loc261)) +#loc584 = loc("qT_ptrs"(#loc262)) +#loc585 = loc("do_ptrs"(#loc263)) +#loc586 = loc("offs_m1"(#loc264)) +#loc587 = loc("dk"(#loc266)) +#loc588 = loc("lse"(#loc267)) +#loc589 = loc("lse"(#loc268)) +#loc590 = loc("qkT"(#loc269)) +#loc591 = loc("m"(#loc270)) +#loc592 = loc("post_mod_scores"(#loc271)) +#loc593 = loc("tmp49"(#loc272)) +#loc594 = loc("tmp52"(#loc273)) +#loc595 = loc("tmp68"(#loc274)) +#loc596 = loc("tmp69"(#loc275)) +#loc597 = loc("tmp70"(#loc276)) +#loc598 = loc("tmp71"(#loc277)) +#loc599 = loc("tmp73"(#loc278)) +#loc600 = loc("tmp74"(#loc279)) +#loc601 = loc("tmp75"(#loc280)) +#loc602 = loc("tmp76"(#loc281)) +#loc603 = loc("tmp78"(#loc282)) +#loc604 = loc("post_mod_scores"(#loc283)) +#loc605 = loc("post_mod_scores"(#loc284)) +#loc606 = loc("pT"(#loc285)) +#loc607 = loc("pT"(#loc286)) +#loc608 = loc("pT"(#loc287)) +#loc609 = loc("dpT"(#loc288)) +#loc610 = loc("dv"(#loc289)) +#loc611 = loc("dv"(#loc290)) +#loc612 = loc("dpT"(#loc291)) +#loc613 = loc("dsT"(#loc292)) +#loc614 = loc("dsT"(#loc293)) +#loc615 = loc("dsT"(#loc294)) +#loc616 = loc("grad_scores"(#loc295)) +#loc617 = loc("dsT"(#loc296)) +#loc618 = loc("dk"(#loc297)) +#loc619 = loc("dk"(#loc298)) +#loc620 = loc("offset"(#loc299)) +#loc621 = loc("qT_ptrs"(#loc300)) +#loc622 = loc("do_ptrs"(#loc301)) +#loc623 = loc(callsite(#loc210 at #loc236)) +#loc624 = loc("dv_ptrs"(#loc303)) +#loc625 = loc("dv_ptrs"(#loc304)) +#loc626 = loc("dk"(#loc308)) +#loc627 = loc("xindex"(#loc309)) +#loc628 = loc("xindex"(#loc310)) +#loc629 = loc("xindex"(#loc311)) +#loc630 = loc("xindex"(#loc312)) +#loc631 = loc("xindex"(#loc313)) +#loc632 = loc("xindex"(#loc314)) +#loc633 = loc("xindex"(#loc315)) +#loc634 = loc(callsite(#loc5 at #loc342)) +#loc635 = loc(callsite(#loc7 at #loc342)) +#loc636 = loc(callsite(#loc386 at #loc387)) +#loc637 = loc(callsite(#loc388 at #loc387)) +#loc638 = loc(callsite(#loc389 at #loc387)) +#loc639 = loc(callsite(#loc390 at #loc387)) +#loc640 = loc(callsite(#loc391 at #loc387)) +#loc641 = loc(callsite(#loc59 at #loc387)) +#loc642 = loc(callsite(#loc60 at #loc387)) +#loc643 = loc(callsite(#loc388 at #loc392)) +#loc644 = loc(callsite(#loc389 at #loc392)) +#loc645 = loc(callsite(#loc391 at #loc392)) +#loc646 = loc(callsite(#loc60 at #loc392)) +#loc647 = loc(callsite(#loc408 at #loc409)) +#loc648 = loc(callsite(#loc410 at #loc409)) +#loc649 = loc(callsite(#loc411 at #loc409)) +#loc650 = loc(callsite(#loc412 at #loc409)) +#loc651 = loc(callsite(#loc413 at #loc409)) +#loc652 = loc(callsite(#loc414 at #loc409)) +#loc653 = loc(callsite(#loc415 at #loc409)) +#loc654 = loc(callsite(#loc416 at #loc409)) +#loc655 = loc(callsite(#loc417 at #loc409)) +#loc656 = loc(callsite(#loc418 at #loc409)) +#loc657 = loc(callsite(#loc419 at #loc409)) +#loc658 = loc(callsite(#loc421 at #loc409)) +#loc659 = loc("offs_n2"(#loc425)) +#loc660 = loc(callsite(#loc435 at #loc409)) +#loc661 = loc(callsite(#loc436 at #loc409)) +#loc662 = loc(callsite(#loc437 at #loc409)) +#loc663 = loc(callsite(#loc475 at #loc409)) +#loc664 = loc(callsite(#loc492 at #loc409)) +#loc665 = loc(callsite(#loc408 at #loc499)) +#loc666 = loc(callsite(#loc410 at #loc499)) +#loc667 = loc(callsite(#loc411 at #loc499)) +#loc668 = loc(callsite(#loc413 at #loc499)) +#loc669 = loc(callsite(#loc414 at #loc499)) +#loc670 = loc(callsite(#loc415 at #loc499)) +#loc671 = loc(callsite(#loc416 at #loc499)) +#loc672 = loc(callsite(#loc419 at #loc499)) +#loc673 = loc(callsite(#loc421 at #loc499)) +#loc674 = loc(callsite(#loc435 at #loc499)) +#loc675 = loc(callsite(#loc436 at #loc499)) +#loc676 = loc(callsite(#loc437 at #loc499)) +#loc677 = loc(callsite(#loc475 at #loc499)) +#loc678 = loc(callsite(#loc492 at #loc499)) +#loc679 = loc(callsite(#loc386 at #loc506)) +#loc680 = loc(callsite(#loc388 at #loc506)) +#loc681 = loc(callsite(#loc389 at #loc506)) +#loc682 = loc(callsite(#loc390 at #loc506)) +#loc683 = loc(callsite(#loc391 at #loc506)) +#loc684 = loc(callsite(#loc59 at #loc506)) +#loc685 = loc(callsite(#loc60 at #loc506)) +#loc686 = loc(callsite(#loc389 at #loc507)) +#loc687 = loc(callsite(#loc391 at #loc507)) +#loc688 = loc(callsite(#loc60 at #loc507)) +#loc689 = loc(callsite(#loc523 at #loc200)) +#loc690 = loc(callsite(#loc524 at #loc200)) +#loc691 = loc(callsite(#loc525 at #loc200)) +#loc692 = loc(callsite(#loc526 at #loc200)) +#loc693 = loc(callsite(#loc527 at #loc200)) +#loc694 = loc(callsite(#loc528 at #loc200)) +#loc695 = loc(callsite(#loc529 at #loc200)) +#loc696 = loc(callsite(#loc530 at #loc200)) +#loc697 = loc(callsite(#loc531 at #loc200)) +#loc698 = loc(callsite(#loc532 at #loc533)) +#loc699 = loc(callsite(#loc534 at #loc533)) +#loc700 = loc(callsite(#loc535 at #loc533)) +#loc701 = loc(callsite(#loc536 at #loc533)) +#loc702 = loc("dv"(#loc537)) +#loc703 = loc(callsite(#loc538 at #loc533)) +#loc704 = loc(callsite(#loc539 at #loc533)) +#loc705 = loc(callsite(#loc540 at #loc533)) +#loc706 = loc(callsite(#loc541 at #loc533)) +#loc707 = loc(callsite(#loc542 at #loc533)) +#loc708 = loc(callsite(#loc543 at #loc533)) +#loc709 = loc(callsite(#loc544 at #loc533)) +#loc710 = loc(callsite(#loc545 at #loc533)) +#loc711 = loc(callsite(#loc546 at #loc533)) +#loc712 = loc(callsite(#loc547 at #loc533)) +#loc713 = loc(callsite(#loc548 at #loc533)) +#loc714 = loc(callsite(#loc549 at #loc533)) +#loc715 = loc(callsite(#loc550 at #loc533)) +#loc716 = loc(callsite(#loc551 at #loc533)) +#loc717 = loc(callsite(#loc552 at #loc533)) +#loc718 = loc(callsite(#loc523 at #loc236)) +#loc719 = loc(callsite(#loc524 at #loc236)) +#loc720 = loc(callsite(#loc527 at #loc236)) +#loc721 = loc(callsite(#loc528 at #loc236)) +#loc722 = loc(callsite(#loc530 at #loc236)) +#loc723 = loc(callsite(#loc531 at #loc236)) +#loc724 = loc(callsite(#loc559 at #loc533)) +#loc725 = loc("dk"(#loc560)) +#loc726 = loc(callsite(#loc575 at #loc200)) +#loc727 = loc(callsite(#loc576 at #loc200)) +#loc728 = loc(callsite(#loc577 at #loc533)) +#loc729 = loc(callsite(#loc578 at #loc533)) +#loc730 = loc(callsite(#loc579 at #loc533)) +#loc731 = loc(callsite(#loc580 at #loc533)) +#loc732 = loc(callsite(#loc581 at #loc533)) +#loc733 = loc(callsite(#loc582 at #loc533)) +#loc734 = loc(callsite(#loc583 at #loc533)) +#loc735 = loc(callsite(#loc584 at #loc200)) +#loc736 = loc(callsite(#loc585 at #loc200)) +#loc737 = loc(callsite(#loc586 at #loc200)) +#loc738 = loc(callsite(#loc587 at #loc533)) +#loc739 = loc(callsite(#loc588 at #loc533)) +#loc740 = loc(callsite(#loc589 at #loc533)) +#loc741 = loc(callsite(#loc590 at #loc533)) +#loc742 = loc(callsite(#loc591 at #loc533)) +#loc743 = loc(callsite(#loc592 at #loc533)) +#loc744 = loc(callsite(#loc593 at #loc533)) +#loc745 = loc(callsite(#loc594 at #loc533)) +#loc746 = loc(callsite(#loc595 at #loc533)) +#loc747 = loc(callsite(#loc596 at #loc533)) +#loc748 = loc(callsite(#loc597 at #loc533)) +#loc749 = loc(callsite(#loc598 at #loc533)) +#loc750 = loc(callsite(#loc599 at #loc533)) +#loc751 = loc(callsite(#loc600 at #loc533)) +#loc752 = loc(callsite(#loc601 at #loc533)) +#loc753 = loc(callsite(#loc602 at #loc533)) +#loc754 = loc(callsite(#loc603 at #loc533)) +#loc755 = loc(callsite(#loc604 at #loc533)) +#loc756 = loc(callsite(#loc605 at #loc533)) +#loc757 = loc(callsite(#loc606 at #loc533)) +#loc758 = loc(callsite(#loc607 at #loc533)) +#loc759 = loc(callsite(#loc608 at #loc533)) +#loc760 = loc(callsite(#loc609 at #loc533)) +#loc761 = loc(callsite(#loc610 at #loc533)) +#loc762 = loc(callsite(#loc611 at #loc533)) +#loc763 = loc(callsite(#loc612 at #loc533)) +#loc764 = loc(callsite(#loc613 at #loc533)) +#loc765 = loc(callsite(#loc614 at #loc533)) +#loc766 = loc(callsite(#loc615 at #loc533)) +#loc767 = loc(callsite(#loc616 at #loc533)) +#loc768 = loc(callsite(#loc617 at #loc533)) +#loc769 = loc(callsite(#loc618 at #loc533)) +#loc770 = loc(callsite(#loc619 at #loc533)) +#loc771 = loc(callsite(#loc620 at #loc200)) +#loc772 = loc(callsite(#loc621 at #loc200)) +#loc773 = loc(callsite(#loc622 at #loc200)) +#loc774 = loc(callsite(#loc575 at #loc236)) +#loc775 = loc(callsite(#loc526 at #loc236)) +#loc776 = loc(callsite(#loc576 at #loc236)) +#loc777 = loc(callsite(#loc529 at #loc236)) +#loc778 = loc(callsite(#loc579 at #loc623)) +#loc779 = loc(callsite(#loc580 at #loc623)) +#loc780 = loc(callsite(#loc581 at #loc623)) +#loc781 = loc(callsite(#loc582 at #loc623)) +#loc782 = loc(callsite(#loc583 at #loc623)) +#loc783 = loc(callsite(#loc577 at #loc623)) +#loc784 = loc(callsite(#loc578 at #loc623)) +#loc785 = loc(callsite(#loc584 at #loc236)) +#loc786 = loc(callsite(#loc585 at #loc236)) +#loc787 = loc(callsite(#loc586 at #loc236)) +#loc788 = loc(callsite(#loc587 at #loc623)) +#loc789 = loc(callsite(#loc588 at #loc623)) +#loc790 = loc(callsite(#loc589 at #loc623)) +#loc791 = loc(callsite(#loc559 at #loc623)) +#loc792 = loc(callsite(#loc590 at #loc623)) +#loc793 = loc(callsite(#loc592 at #loc623)) +#loc794 = loc(callsite(#loc605 at #loc623)) +#loc795 = loc(callsite(#loc606 at #loc623)) +#loc796 = loc(callsite(#loc607 at #loc623)) +#loc797 = loc(callsite(#loc608 at #loc623)) +#loc798 = loc(callsite(#loc609 at #loc623)) +#loc799 = loc(callsite(#loc610 at #loc623)) +#loc800 = loc(callsite(#loc611 at #loc623)) +#loc801 = loc(callsite(#loc612 at #loc623)) +#loc802 = loc(callsite(#loc613 at #loc623)) +#loc803 = loc(callsite(#loc614 at #loc623)) +#loc804 = loc(callsite(#loc615 at #loc623)) +#loc805 = loc(callsite(#loc616 at #loc623)) +#loc806 = loc(callsite(#loc618 at #loc623)) +#loc807 = loc(callsite(#loc619 at #loc623)) +#loc808 = loc(callsite(#loc620 at #loc236)) +#loc809 = loc(callsite(#loc621 at #loc236)) +#loc810 = loc(callsite(#loc622 at #loc236)) +#loc811 = loc(callsite(#loc5 at #loc655)) +#loc812 = loc(callsite(#loc7 at #loc655)) +#loc813 = loc(callsite(#loc420 at #loc658)) +#loc814 = loc(callsite(#loc422 at #loc658)) +#loc815 = loc(callsite(#loc423 at #loc658)) +#loc816 = loc(callsite(#loc424 at #loc658)) +#loc817 = loc("kT_ptrs"(#loc659)) +#loc818 = loc(callsite(#loc426 at #loc658)) +#loc819 = loc(callsite(#loc427 at #loc658)) +#loc820 = loc(callsite(#loc428 at #loc658)) +#loc821 = loc(callsite(#loc429 at #loc658)) +#loc822 = loc(callsite(#loc430 at #loc658)) +#loc823 = loc(callsite(#loc431 at #loc658)) +#loc824 = loc(callsite(#loc432 at #loc658)) +#loc825 = loc(callsite(#loc433 at #loc658)) +#loc826 = loc(callsite(#loc434 at #loc658)) +#loc827 = loc(callsite(#loc438 at #loc658)) +#loc828 = loc(callsite(#loc439 at #loc658)) +#loc829 = loc(callsite(#loc440 at #loc658)) +#loc830 = loc(callsite(#loc441 at #loc658)) +#loc831 = loc(callsite(#loc442 at #loc658)) +#loc832 = loc(callsite(#loc443 at #loc658)) +#loc833 = loc(callsite(#loc444 at #loc658)) +#loc834 = loc(callsite(#loc445 at #loc658)) +#loc835 = loc(callsite(#loc446 at #loc658)) +#loc836 = loc(callsite(#loc447 at #loc658)) +#loc837 = loc(callsite(#loc448 at #loc658)) +#loc838 = loc(callsite(#loc449 at #loc658)) +#loc839 = loc(callsite(#loc450 at #loc658)) +#loc840 = loc(callsite(#loc451 at #loc658)) +#loc841 = loc(callsite(#loc452 at #loc658)) +#loc842 = loc(callsite(#loc453 at #loc658)) +#loc843 = loc(callsite(#loc454 at #loc658)) +#loc844 = loc(callsite(#loc455 at #loc658)) +#loc845 = loc(callsite(#loc456 at #loc658)) +#loc846 = loc(callsite(#loc457 at #loc658)) +#loc847 = loc(callsite(#loc458 at #loc658)) +#loc848 = loc(callsite(#loc459 at #loc658)) +#loc849 = loc(callsite(#loc460 at #loc658)) +#loc850 = loc(callsite(#loc461 at #loc658)) +#loc851 = loc(callsite(#loc462 at #loc658)) +#loc852 = loc(callsite(#loc463 at #loc658)) +#loc853 = loc(callsite(#loc464 at #loc658)) +#loc854 = loc(callsite(#loc465 at #loc658)) +#loc855 = loc(callsite(#loc466 at #loc658)) +#loc856 = loc(callsite(#loc467 at #loc658)) +#loc857 = loc(callsite(#loc468 at #loc658)) +#loc858 = loc(callsite(#loc469 at #loc658)) +#loc859 = loc(callsite(#loc470 at #loc658)) +#loc860 = loc(callsite(#loc471 at #loc658)) +#loc861 = loc(callsite(#loc472 at #loc658)) +#loc862 = loc(callsite(#loc473 at #loc658)) +#loc863 = loc(callsite(#loc474 at #loc663)) +#loc864 = loc(callsite(#loc476 at #loc663)) +#loc865 = loc(callsite(#loc477 at #loc663)) +#loc866 = loc(callsite(#loc478 at #loc663)) +#loc867 = loc(callsite(#loc479 at #loc663)) +#loc868 = loc(callsite(#loc480 at #loc663)) +#loc869 = loc(callsite(#loc481 at #loc663)) +#loc870 = loc(callsite(#loc482 at #loc663)) +#loc871 = loc(callsite(#loc483 at #loc663)) +#loc872 = loc(callsite(#loc484 at #loc663)) +#loc873 = loc(callsite(#loc485 at #loc663)) +#loc874 = loc(callsite(#loc486 at #loc663)) +#loc875 = loc(callsite(#loc487 at #loc663)) +#loc876 = loc(callsite(#loc488 at #loc663)) +#loc877 = loc(callsite(#loc489 at #loc663)) +#loc878 = loc(callsite(#loc490 at #loc663)) +#loc879 = loc(callsite(#loc491 at #loc663)) +#loc880 = loc(callsite(#loc420 at #loc673)) +#loc881 = loc(callsite(#loc434 at #loc673)) +#loc882 = loc(callsite(#loc438 at #loc673)) +#loc883 = loc(callsite(#loc439 at #loc673)) +#loc884 = loc(callsite(#loc440 at #loc673)) +#loc885 = loc(callsite(#loc442 at #loc673)) +#loc886 = loc(callsite(#loc466 at #loc673)) +#loc887 = loc(callsite(#loc431 at #loc673)) +#loc888 = loc(callsite(#loc467 at #loc673)) +#loc889 = loc(callsite(#loc468 at #loc673)) +#loc890 = loc(callsite(#loc433 at #loc673)) +#loc891 = loc(callsite(#loc469 at #loc673)) +#loc892 = loc(callsite(#loc470 at #loc673)) +#loc893 = loc(callsite(#loc472 at #loc673)) +#loc894 = loc(callsite(#loc473 at #loc673)) +#loc895 = loc(callsite(#loc474 at #loc677)) +#loc896 = loc(callsite(#loc476 at #loc677)) +#loc897 = loc(callsite(#loc477 at #loc677)) +#loc898 = loc(callsite(#loc478 at #loc677)) +#loc899 = loc(callsite(#loc479 at #loc677)) +#loc900 = loc(callsite(#loc480 at #loc677)) +#loc901 = loc(callsite(#loc481 at #loc677)) +#loc902 = loc(callsite(#loc482 at #loc677)) +#loc903 = loc(callsite(#loc483 at #loc677)) +#loc904 = loc(callsite(#loc484 at #loc677)) +#loc905 = loc(callsite(#loc485 at #loc677)) +#loc906 = loc(callsite(#loc486 at #loc677)) +#loc907 = loc(callsite(#loc487 at #loc677)) +#loc908 = loc(callsite(#loc488 at #loc677)) +#loc909 = loc(callsite(#loc489 at #loc677)) +#loc910 = loc(callsite(#loc490 at #loc677)) +#loc911 = loc(callsite(#loc491 at #loc677)) +#loc912 = loc(callsite(#loc92 at #loc698)) +#loc913 = loc("offs_m1"(#loc702)) +#loc914 = loc(callsite(#loc105 at #loc730)) +#loc915 = loc(callsite(#loc60 at #loc732)) +#loc916 = loc(callsite(#loc89 at #loc730)) +#loc917 = loc(callsite(#loc59 at #loc732)) +#loc918 = loc(callsite(#loc110 at #loc730)) +#loc919 = loc(callsite(#loc265 at #loc732)) +#loc920 = loc(callsite(#loc92 at #loc742)) +#loc921 = loc(callsite(#loc474 at #loc771)) +#loc922 = loc(callsite(#loc476 at #loc771)) +#loc923 = loc(callsite(#loc477 at #loc771)) +#loc924 = loc(callsite(#loc478 at #loc771)) +#loc925 = loc(callsite(#loc479 at #loc771)) +#loc926 = loc(callsite(#loc480 at #loc771)) +#loc927 = loc(callsite(#loc481 at #loc771)) +#loc928 = loc(callsite(#loc482 at #loc771)) +#loc929 = loc(callsite(#loc483 at #loc771)) +#loc930 = loc(callsite(#loc484 at #loc771)) +#loc931 = loc(callsite(#loc485 at #loc771)) +#loc932 = loc(callsite(#loc486 at #loc771)) +#loc933 = loc(callsite(#loc487 at #loc771)) +#loc934 = loc(callsite(#loc488 at #loc771)) +#loc935 = loc(callsite(#loc489 at #loc771)) +#loc936 = loc(callsite(#loc490 at #loc771)) +#loc937 = loc(callsite(#loc491 at #loc771)) +#loc938 = loc(callsite(#loc105 at #loc778)) +#loc939 = loc(callsite(#loc60 at #loc780)) +#loc940 = loc(callsite(#loc89 at #loc778)) +#loc941 = loc(callsite(#loc59 at #loc780)) +#loc942 = loc(callsite(#loc110 at #loc778)) +#loc943 = loc(callsite(#loc265 at #loc780)) +#loc944 = loc(callsite(#loc474 at #loc808)) +#loc945 = loc(callsite(#loc476 at #loc808)) +#loc946 = loc(callsite(#loc477 at #loc808)) +#loc947 = loc(callsite(#loc478 at #loc808)) +#loc948 = loc(callsite(#loc479 at #loc808)) +#loc949 = loc(callsite(#loc480 at #loc808)) +#loc950 = loc(callsite(#loc481 at #loc808)) +#loc951 = loc(callsite(#loc482 at #loc808)) +#loc952 = loc(callsite(#loc483 at #loc808)) +#loc953 = loc(callsite(#loc484 at #loc808)) +#loc954 = loc(callsite(#loc485 at #loc808)) +#loc955 = loc(callsite(#loc486 at #loc808)) +#loc956 = loc(callsite(#loc487 at #loc808)) +#loc957 = loc(callsite(#loc488 at #loc808)) +#loc958 = loc(callsite(#loc489 at #loc808)) +#loc959 = loc(callsite(#loc490 at #loc808)) +#loc960 = loc(callsite(#loc491 at #loc808)) +#loc961 = loc(callsite(#loc89 at #loc813)) +#loc962 = loc(callsite(#loc92 at #loc814)) +#loc963 = loc("vT_ptrs"(#loc817)) +#loc964 = loc(callsite(#loc105 at #loc813)) +#loc965 = loc(callsite(#loc105 at #loc826)) +#loc966 = loc(callsite(#loc110 at #loc813)) +#loc967 = loc(callsite(#loc92 at #loc830)) +#loc968 = loc(callsite(#loc105 at #loc880)) +#loc969 = loc(callsite(#loc105 at #loc881)) +#loc970 = loc(callsite(#loc89 at #loc880)) +#loc971 = loc(callsite(#loc110 at #loc880)) +#loc972 = loc("qT_ptrs"(#loc913)) +#loc973 = loc(callsite(#loc963 at #loc409)) +#loc974 = loc(callsite(#loc963 at #loc499)) +#loc975 = loc("do_ptrs"(#loc972)) +#loc976 = loc(callsite(#loc975 at #loc200)) +#loc977 = loc(callsite(#loc975 at #loc236)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..481d0d939a1d761b95dbd6a9a8ce0d9f584b7119 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/PSYZLLRPOLLLL7NMMQLWVW4WJMIWKTD7RDS4PZKCZIUYLWINUVAA/triton_tem_fused_zeros_1.ttir @@ -0,0 +1,1593 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":18:0) +#loc323 = loc("arg_Q"(#loc)) +#loc324 = loc("arg_K"(#loc)) +#loc325 = loc("arg_V"(#loc)) +#loc326 = loc("arg_LSE"(#loc)) +#loc327 = loc("arg_DELTA"(#loc)) +#loc328 = loc("arg_DO"(#loc)) +#loc329 = loc("arg_DQ"(#loc)) +#loc330 = loc("arg_DV"(#loc)) +#loc331 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc332 = loc("arg_KV_IDX"(#loc)) +#loc333 = loc("arg_Q_NUM_BLKS"(#loc)) +#loc334 = loc("arg_Q_IDX"(#loc)) +#loc335 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc336 = loc("arg_FULL_KV_IDX"(#loc)) +#loc337 = loc("arg_FULL_Q_NUM_BLKS"(#loc)) +#loc338 = loc("arg_FULL_Q_IDX"(#loc)) +#loc339 = loc("in_ptr16"(#loc)) +#loc340 = loc("out_ptr0"(#loc)) +#loc341 = loc("ks0"(#loc)) +#loc342 = loc("ks1"(#loc)) +#loc343 = loc("ks2"(#loc)) +#loc344 = loc("ks3"(#loc)) +module { + tt.func public @triton_tem_fused_zeros_1(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_DELTA: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DELTA"(#loc)), %arg_DO: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DO"(#loc)), %arg_DQ: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DQ"(#loc)), %arg_DV: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_DV"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_NUM_BLKS"(#loc)), %arg_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %arg_FULL_Q_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_NUM_BLKS"(#loc)), %arg_FULL_Q_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_Q_IDX"(#loc)), %in_ptr16: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr16"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<4096> : tensor<1x64xi32> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<128x1xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64xf32> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<64xf32> loc(#loc1) + %cst_6 = arith.constant dense<2048> : tensor<64xi32> loc(#loc1) + %cst_7 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %cst_8 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc1) + %cst_9 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc1) + %cst_10 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc1) + %cst_12 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc1) + %cst_13 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_14 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %cst_16 = arith.constant dense<0.000000e+00> : tensor<128x64xbf16> loc(#loc1) + %cst_17 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc1) + %cst_18 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc1) + %c127_i32 = arith.constant 127 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_19 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc1) + %cst_20 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_21 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %cst_22 = arith.constant dense<0.0883883461> : tensor<128x128xf32> loc(#loc1) + %cst_23 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %cst_24 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %cst_25 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_26 = arith.constant dense<2048> : tensor<128xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %Q_LEN = arith.constant 2048 : i32 loc(#loc345) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c262144_i32 = arith.constant 262144 : i32 loc(#loc3) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc4) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc5) + %pid = tt.get_program_id x : i32 loc(#loc346) + %NUM_KV_BLOCKS = arith.addi %ks0, %c127_i32 : i32 loc(#loc641) + %NUM_KV_BLOCKS_27 = arith.divsi %NUM_KV_BLOCKS, %c128_i32 : i32 loc(#loc642) + %off_zq = tt.get_program_id y : i32 loc(#loc348) + %off_hkv = tt.get_program_id z : i32 loc(#loc349) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc350) + %k_adj = arith.muli %1, %off_hkv : i32 loc(#loc351) + %k_adj_28 = arith.muli %0, %off_zkv : i32 loc(#loc352) + %k_adj_29 = arith.addi %k_adj, %k_adj_28 : i32 loc(#loc353) + %k_adj_30 = arith.extsi %k_adj_29 : i32 to i64 loc(#loc354) + %dv_adj = arith.muli %0, %off_zq : i32 loc(#loc355) + %dv_adj_31 = arith.addi %k_adj, %dv_adj : i32 loc(#loc356) + %dv_adj_32 = arith.extsi %dv_adj_31 : i32 to i64 loc(#loc357) + %K = tt.addptr %arg_K, %k_adj_30 : !tt.ptr, i64 loc(#loc358) + %V = tt.addptr %arg_V, %k_adj_30 : !tt.ptr, i64 loc(#loc359) + %DV = tt.addptr %arg_DV, %dv_adj_32 : !tt.ptr, i64 loc(#loc360) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc361) + %2 = arith.cmpi sge, %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc24) + scf.if %2 { + %off_pid = arith.subi %pid, %NUM_KV_BLOCKS_27 : i32 loc(#loc362) + %off_hq2 = arith.divsi %off_pid, %c16_i32 : i32 loc(#loc363) + %off_hq2_33 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc364) + %off_hq2_34 = arith.addi %off_hq2, %off_hq2_33 : i32 loc(#loc365) + %start_m2_block = arith.remsi %off_pid, %c16_i32 : i32 loc(#loc366) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc367) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc368) + %sparse_kv_num_blks_offset_35 = arith.addi %sparse_kv_num_blks_offset, %start_m2_block : i32 loc(#loc369) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc370) + %sparse_kv_idx_offset_36 = arith.muli %start_m2_block, %ks1 : i32 loc(#loc371) + %sparse_kv_idx_offset_37 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_36 : i32 loc(#loc372) + %q_adj2 = arith.muli %off_hq2_34, %c128_i32 : i32 loc(#loc373) + %q_adj2_38 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc374) + %q_adj2_39 = arith.addi %q_adj2, %q_adj2_38 : i32 loc(#loc375) + %q_adj2_40 = arith.extsi %q_adj2_39 : i32 to i64 loc(#loc376) + %do_adj2 = arith.muli %off_hq2_34, %c262144_i32 : i32 loc(#loc377) + %do_adj2_41 = arith.addi %do_adj2, %q_adj2_38 : i32 loc(#loc378) + %do_adj2_42 = arith.extsi %do_adj2_41 : i32 to i64 loc(#loc379) + %off_chz2 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc380) + %off_chz2_43 = arith.addi %off_chz2, %off_hq2_34 : i32 loc(#loc381) + %off_chz2_44 = arith.muli %off_chz2_43, %Q_LEN : i32 loc(#loc382) + %off_chz2_45 = arith.extsi %off_chz2_44 : i32 to i64 loc(#loc383) + %Q2 = tt.addptr %arg_Q, %q_adj2_40 : !tt.ptr, i64 loc(#loc384) + %DO2 = tt.addptr %arg_DO, %do_adj2_42 : !tt.ptr, i64 loc(#loc385) + %DQ2 = tt.addptr %arg_DQ, %q_adj2_40 : !tt.ptr, i64 loc(#loc386) + %LSE2 = tt.addptr %arg_LSE, %off_chz2_45 : !tt.ptr, i64 loc(#loc387) + %DELTA2 = tt.addptr %arg_DELTA, %off_chz2_45 : !tt.ptr, i64 loc(#loc388) + %start_m2 = arith.muli %start_m2_block, %c128_i32 : i32 loc(#loc389) + %offs_m2 = tt.splat %start_m2 : i32 -> tensor<128xi32> loc(#loc390) + %offs_m2_46 = arith.addi %offs_m2, %offs_k : tensor<128xi32> loc(#loc390) + %ptr = tt.expand_dims %offs_m2_46 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc643) + %ptr_47 = arith.muli %ptr, %cst_23 : tensor<128x1xi32> loc(#loc644) + %ptr_48 = tt.splat %Q2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc645) + %ptr_49 = tt.addptr %ptr_48, %ptr_47 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc645) + %ptr_50 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc646) + %ptr_51 = tt.broadcast %ptr_49 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc647) + %ptr_52 = tt.broadcast %ptr_50 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc647) + %ptr_53 = tt.addptr %ptr_51, %ptr_52 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc647) + %q = arith.cmpi slt, %ptr, %cst_21 : tensor<128x1xi32> loc(#loc648) + %q_54 = tt.broadcast %q : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc649) + %q_55 = tt.load %ptr_53, %q_54, %cst_17 : tensor<128x128x!tt.ptr> loc(#loc649) + %ptr_56 = arith.muli %ptr, %cst_19 : tensor<128x1xi32> loc(#loc650) + %ptr_57 = tt.splat %DO2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc651) + %ptr_58 = tt.addptr %ptr_57, %ptr_56 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc651) + %ptr_59 = tt.broadcast %ptr_58 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc652) + %ptr_60 = tt.addptr %ptr_59, %ptr_52 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc652) + %do = tt.load %ptr_60, %q_54, %cst_17 : tensor<128x128x!tt.ptr> loc(#loc653) + %Di = arith.cmpi slt, %offs_m2_46, %cst_26 : tensor<128xi32> loc(#loc398) + %Di_61 = tt.splat %DELTA2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc399) + %Di_62 = tt.addptr %Di_61, %offs_m2_46 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc399) + %Di_63 = tt.load %Di_62, %Di : tensor<128x!tt.ptr> loc(#loc400) + %lse = tt.splat %LSE2 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc401) + %lse_64 = tt.addptr %lse, %offs_m2_46 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc401) + %lse_65 = tt.load %lse_64, %Di : tensor<128x!tt.ptr> loc(#loc402) + %lse_66 = arith.cmpf oeq, %lse_65, %cst_25 : tensor<128xf32> loc(#loc403) + %lse_67 = arith.select %lse_66, %cst_24, %lse_65 : tensor<128xi1>, tensor<128xf32> loc(#loc404) + %lse_68 = tt.expand_dims %lse_67 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc405) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_37 : !tt.ptr, i32 loc(#loc406) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc407) + %kv_start_69 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc408) + %sparse_kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_35 : !tt.ptr, i32 loc(#loc409) + %sparse_kv_num_blocks_70 = tt.load %sparse_kv_num_blocks : !tt.ptr loc(#loc410) + %offs_n2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc411) + %offs_n2_71 = tt.splat %kv_start_69 : i32 -> tensor<64xi32> loc(#loc412) + %offs_n2_72 = arith.addi %offs_n2_71, %offs_n2 : tensor<64xi32> loc(#loc412) + %kT_ptrs = tt.expand_dims %offs_n2_72 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc654) + %kT_ptrs_73 = arith.muli %kT_ptrs, %cst_1 : tensor<1x64xi32> loc(#loc655) + %kT_ptrs_74 = tt.splat %K : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc656) + %kT_ptrs_75 = tt.addptr %kT_ptrs_74, %kT_ptrs_73 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc656) + %kT_ptrs_76 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc657) + %kT_ptrs_77 = tt.broadcast %kT_ptrs_75 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc658) + %kT_ptrs_78 = tt.broadcast %kT_ptrs_76 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc658) + %kT_ptrs_79 = tt.addptr %kT_ptrs_77, %kT_ptrs_78 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc658) + %vT_ptrs = tt.splat %V : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc659) + %vT_ptrs_80 = tt.addptr %vT_ptrs, %kT_ptrs_73 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc659) + %vT_ptrs_81 = tt.broadcast %vT_ptrs_80 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc660) + %vT_ptrs_82 = tt.addptr %vT_ptrs_81, %kT_ptrs_78 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc660) + %hi = arith.muli %sparse_kv_num_blocks_70, %c2_i32 : i32 loc(#loc661) + %hi_83 = arith.addi %ks0, %c63_i32 : i32 loc(#loc820) + %hi_84 = arith.divsi %hi_83, %c64_i32 : i32 loc(#loc821) + %hi_85 = arith.maxsi %hi_84, %c1_i32 : i32 loc(#loc663) + %hi_86 = arith.minsi %hi, %hi_85 : i32 loc(#loc664) + %vT_ptrs_87:4 = scf.for %start_n = %c0_i32 to %hi_86 step %c1_i32 iter_args(%dq_109 = %cst_18, %offs_n2_110 = %offs_n2_72, %kT_ptrs_111 = %kT_ptrs_79, %vT_ptrs_112 = %vT_ptrs_82) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_110 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc972) + %kT_113 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc973) + %kT_114 = arith.cmpi slt, %kT, %kT_113 : tensor<1x64xi32> loc(#loc973) + %kT_115 = tt.broadcast %kT_114 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc974) + %kT_116 = tt.load %kT_ptrs_111, %kT_115, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc974) + %qk = tt.dot %q_55, %kT_116, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc824) + %qk_117 = arith.mulf %qk, %cst_14 : tensor<128x64xf32> loc(#loc825) + %n = arith.remsi %kT, %kT_113 : tensor<1x64xi32> loc(#loc975) + %m = arith.remsi %ptr, %cst_21 : tensor<128x1xi32> loc(#loc976) + %post_mod_scores = arith.select %kT_115, %qk_117, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc828) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc829) + %tmp4_118 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc829) + %tmp4_119 = arith.cmpi sge, %tmp4, %tmp4_118 : tensor<128x64xi32> loc(#loc829) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc830) + %tmp7 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc831) + %tmp7_120 = tt.load %tmp7 : !tt.ptr loc(#loc832) + %tmp8 = tt.splat %tmp7_120 : i64 -> tensor<1x64xi64> loc(#loc833) + %tmp8_121 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc833) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc834) + %tmp10 = tt.splat %tmp7_120 : i64 -> tensor<128x1xi64> loc(#loc835) + %tmp10_122 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc835) + %tmp11 = tt.broadcast %tmp8_121 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc836) + %tmp11_123 = tt.broadcast %tmp10_122 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc836) + %tmp11_124 = arith.andi %tmp11, %tmp11_123 : tensor<128x64xi1> loc(#loc836) + %tmp12 = arith.andi %tmp4_119, %tmp11_124 : tensor<128x64xi1> loc(#loc837) + %tmp15 = arith.cmpi sge, %n, %cst_9 : tensor<1x64xi32> loc(#loc838) + %tmp16 = arith.remsi %n, %cst_9 : tensor<1x64xi32> loc(#loc839) + %tmp18 = arith.cmpi ne, %tmp16, %cst_12 : tensor<1x64xi32> loc(#loc840) + %tmp19 = arith.cmpi slt, %tmp16, %cst_12 : tensor<1x64xi32> loc(#loc841) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc842) + %tmp23 = arith.addi %tmp16, %cst_9 : tensor<1x64xi32> loc(#loc843) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc844) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc845) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc846) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc847) + %tmp28 = arith.subi %tmp4_118, %tmp4 : tensor<128x64xi32> loc(#loc848) + %tmp29 = arith.remsi %tmp28, %cst_8 : tensor<128x64xi32> loc(#loc849) + %tmp30 = arith.cmpi ne, %tmp29, %cst_11 : tensor<128x64xi32> loc(#loc850) + %tmp31 = arith.cmpi slt, %tmp29, %cst_11 : tensor<128x64xi32> loc(#loc851) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc852) + %tmp34 = arith.addi %tmp29, %cst_8 : tensor<128x64xi32> loc(#loc853) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc854) + %tmp36 = arith.cmpi eq, %tmp35, %cst_11 : tensor<128x64xi32> loc(#loc855) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc856) + %tmp37_125 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc856) + %tmp38 = arith.ori %tmp12, %tmp37_125 : tensor<128x64xi1> loc(#loc857) + %post_mod_scores_126 = arith.select %tmp38, %post_mod_scores, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc858) + %post_mod_scores_127 = arith.mulf %post_mod_scores_126, %cst_10 : tensor<128x64xf32> loc(#loc859) + %p = tt.broadcast %lse_68 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc860) + %p_128 = arith.subf %post_mod_scores_127, %p : tensor<128x64xf32> loc(#loc860) + %p_129 = math.exp2 %p_128 : tensor<128x64xf32> loc(#loc861) + %vT = tt.load %vT_ptrs_112, %kT_115, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc977) + %dp = tt.dot %do, %vT, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc863) + %ds = tt.expand_dims %Di_63 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc864) + %ds_130 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc865) + %ds_131 = arith.subf %dp, %ds_130 : tensor<128x64xf32> loc(#loc865) + %ds_132 = arith.mulf %p_129, %ds_131 : tensor<128x64xf32> loc(#loc866) + %grad_scores = arith.select %kT_115, %ds_132, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc867) + %ds_133 = arith.select %tmp38, %grad_scores, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc868) + %ds_134 = arith.truncf %ds_133 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc869) + %dq_135 = tt.trans %kT_116 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc870) + %dq_136 = tt.dot %ds_134, %dq_135, %dq_109, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc871) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc872) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc873) + %cur_block_137 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc874) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc875) + %next_block_138 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_70 : i32 loc(#loc876) + %next_block_139 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc877) + %next_block_140 = tt.load %next_block_139, %next_block_138 evictionPolicy = evict_last : !tt.ptr loc(#loc878) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc879) + %needs_jump_141 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc880) + %needs_jump_142 = arith.cmpi eq, %needs_jump_141, %c0_i32 : i32 loc(#loc881) + %jump_to_block = arith.subi %next_block_140, %cur_block_137 : i32 loc(#loc882) + %jump_to_block_143 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc883) + %jump_to_block_144 = arith.subi %jump_to_block_143, %c64_i32 : i32 loc(#loc884) + %offset = arith.extui %needs_jump_142 : i1 to i32 loc(#loc885) + %offset_145 = arith.muli %jump_to_block_144, %offset : i32 loc(#loc885) + %offset_146 = arith.subi %c1_i32, %offset : i32 loc(#loc886) + %offset_147 = arith.muli %offset_146, %c64_i32 : i32 loc(#loc887) + %offset_148 = arith.addi %offset_145, %offset_147 : i32 loc(#loc888) + %kT_ptrs_149 = arith.muli %offset_148, %c128_i32 : i32 loc(#loc668) + %kT_ptrs_150 = tt.splat %kT_ptrs_149 : i32 -> tensor<128x64xi32> loc(#loc669) + %kT_ptrs_151 = tt.addptr %kT_ptrs_111, %kT_ptrs_150 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc669) + %vT_ptrs_152 = tt.addptr %vT_ptrs_112, %kT_ptrs_150 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc670) + %offs_n2_153 = tt.splat %offset_148 : i32 -> tensor<64xi32> loc(#loc671) + %offs_n2_154 = arith.addi %offs_n2_110, %offs_n2_153 : tensor<64xi32> loc(#loc671) + scf.yield %dq_136, %offs_n2_154, %kT_ptrs_151, %vT_ptrs_152 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc672) + } loc(#loc983) + %kv_indices_88 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_37 : !tt.ptr, i32 loc(#loc498) + %kv_start_89 = tt.load %kv_indices_88 : !tt.ptr loc(#loc499) + %kv_start_90 = arith.muli %kv_start_89, %c128_i32 : i32 loc(#loc500) + %sparse_kv_num_blocks_91 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_35 : !tt.ptr, i32 loc(#loc501) + %sparse_kv_num_blocks_92 = tt.load %sparse_kv_num_blocks_91 : !tt.ptr loc(#loc502) + %offs_n2_93 = tt.splat %kv_start_90 : i32 -> tensor<64xi32> loc(#loc503) + %offs_n2_94 = arith.addi %offs_n2_93, %offs_n2 : tensor<64xi32> loc(#loc503) + %kT_ptrs_95 = tt.expand_dims %offs_n2_94 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc673) + %kT_ptrs_96 = arith.muli %kT_ptrs_95, %cst_1 : tensor<1x64xi32> loc(#loc674) + %kT_ptrs_97 = tt.addptr %kT_ptrs_74, %kT_ptrs_96 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc675) + %kT_ptrs_98 = tt.broadcast %kT_ptrs_97 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc676) + %kT_ptrs_99 = tt.addptr %kT_ptrs_98, %kT_ptrs_78 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc676) + %vT_ptrs_100 = tt.addptr %vT_ptrs, %kT_ptrs_96 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc677) + %vT_ptrs_101 = tt.broadcast %vT_ptrs_100 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc678) + %vT_ptrs_102 = tt.addptr %vT_ptrs_101, %kT_ptrs_78 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc678) + %hi_103 = arith.muli %sparse_kv_num_blocks_92, %c2_i32 : i32 loc(#loc679) + %hi_104 = arith.minsi %hi_103, %hi_85 : i32 loc(#loc680) + %vT_ptrs_105:4 = scf.for %start_n = %c0_i32 to %hi_104 step %c1_i32 iter_args(%dq_109 = %vT_ptrs_87#0, %offs_n2_110 = %offs_n2_94, %kT_ptrs_111 = %kT_ptrs_99, %vT_ptrs_112 = %vT_ptrs_102) -> (tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr>) : i32 { + %kT = tt.expand_dims %offs_n2_110 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc978) + %kT_113 = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc979) + %kT_114 = arith.cmpi slt, %kT, %kT_113 : tensor<1x64xi32> loc(#loc979) + %kT_115 = tt.broadcast %kT_114 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc980) + %kT_116 = tt.load %kT_ptrs_111, %kT_115, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc980) + %qk = tt.dot %q_55, %kT_116, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc890) + %qk_117 = arith.mulf %qk, %cst_14 : tensor<128x64xf32> loc(#loc891) + %post_mod_scores = arith.select %kT_115, %qk_117, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc892) + %post_mod_scores_118 = arith.mulf %post_mod_scores, %cst_10 : tensor<128x64xf32> loc(#loc893) + %p = tt.broadcast %lse_68 : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc894) + %p_119 = arith.subf %post_mod_scores_118, %p : tensor<128x64xf32> loc(#loc894) + %p_120 = math.exp2 %p_119 : tensor<128x64xf32> loc(#loc895) + %vT = tt.load %vT_ptrs_112, %kT_115, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc981) + %dp = tt.dot %do, %vT, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc897) + %ds = tt.expand_dims %Di_63 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc898) + %ds_121 = tt.broadcast %ds : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc899) + %ds_122 = arith.subf %dp, %ds_121 : tensor<128x64xf32> loc(#loc899) + %ds_123 = arith.mulf %p_120, %ds_122 : tensor<128x64xf32> loc(#loc900) + %grad_scores = arith.select %kT_115, %ds_123, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc901) + %ds_124 = arith.truncf %grad_scores : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc902) + %dq_125 = tt.trans %kT_116 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc903) + %dq_126 = tt.dot %ds_124, %dq_125, %dq_109, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc904) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc905) + %cur_block = tt.addptr %kv_indices_88, %cur_block_idx : !tt.ptr, i32 loc(#loc906) + %cur_block_127 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc907) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc908) + %next_block_128 = arith.cmpi slt, %next_block, %sparse_kv_num_blocks_92 : i32 loc(#loc909) + %next_block_129 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc910) + %next_block_130 = tt.load %next_block_129, %next_block_128 evictionPolicy = evict_last : !tt.ptr loc(#loc911) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc912) + %needs_jump_131 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc913) + %needs_jump_132 = arith.cmpi eq, %needs_jump_131, %c0_i32 : i32 loc(#loc914) + %jump_to_block = arith.subi %next_block_130, %cur_block_127 : i32 loc(#loc915) + %jump_to_block_133 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc916) + %jump_to_block_134 = arith.subi %jump_to_block_133, %c64_i32 : i32 loc(#loc917) + %offset = arith.extui %needs_jump_132 : i1 to i32 loc(#loc918) + %offset_135 = arith.muli %jump_to_block_134, %offset : i32 loc(#loc918) + %offset_136 = arith.subi %c1_i32, %offset : i32 loc(#loc919) + %offset_137 = arith.muli %offset_136, %c64_i32 : i32 loc(#loc920) + %offset_138 = arith.addi %offset_135, %offset_137 : i32 loc(#loc921) + %kT_ptrs_139 = arith.muli %offset_138, %c128_i32 : i32 loc(#loc683) + %kT_ptrs_140 = tt.splat %kT_ptrs_139 : i32 -> tensor<128x64xi32> loc(#loc684) + %kT_ptrs_141 = tt.addptr %kT_ptrs_111, %kT_ptrs_140 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc684) + %vT_ptrs_142 = tt.addptr %vT_ptrs_112, %kT_ptrs_140 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc685) + %offs_n2_143 = tt.splat %offset_138 : i32 -> tensor<64xi32> loc(#loc686) + %offs_n2_144 = arith.addi %offs_n2_110, %offs_n2_143 : tensor<64xi32> loc(#loc686) + scf.yield %dq_126, %offs_n2_144, %kT_ptrs_141, %vT_ptrs_142 : tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<128x64x!tt.ptr> loc(#loc687) + } loc(#loc984) + %dq_ptrs = tt.splat %DQ2 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc505) + %dq_ptrs_106 = tt.addptr %dq_ptrs, %ptr_47 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc505) + %dq_ptrs_107 = tt.broadcast %dq_ptrs_106 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc506) + %dq_ptrs_108 = tt.addptr %dq_ptrs_107, %ptr_52 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc506) + %dq = arith.mulf %vT_ptrs_105#0, %cst_22 : tensor<128x128xf32> loc(#loc507) + %3 = arith.cmpi slt, %ptr_50, %cst_20 : tensor<1x128xi32> loc(#loc179) + %4 = tt.broadcast %3 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc180) + %5 = arith.andi %q_54, %4 : tensor<128x128xi1> loc(#loc180) + %6 = arith.truncf %dq : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc181) + tt.store %dq_ptrs_108, %6, %5 : tensor<128x128x!tt.ptr> loc(#loc181) + } else { + %stride_q_idx_h = arith.muli %ks3, %c16_i32 : i32 loc(#loc508) + %start_n1 = arith.muli %pid, %c128_i32 : i32 loc(#loc509) + %offs_n1 = tt.splat %start_n1 : i32 -> tensor<128xi32> loc(#loc510) + %offs_n1_33 = arith.addi %offs_n1, %offs_k : tensor<128xi32> loc(#loc510) + %ptr = tt.expand_dims %offs_n1_33 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc688) + %ptr_34 = arith.muli %ptr, %cst_19 : tensor<128x1xi32> loc(#loc689) + %ptr_35 = tt.splat %K : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc690) + %ptr_36 = tt.addptr %ptr_35, %ptr_34 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc690) + %ptr_37 = tt.expand_dims %offs_k {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc691) + %ptr_38 = tt.broadcast %ptr_36 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc692) + %ptr_39 = tt.broadcast %ptr_37 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc692) + %ptr_40 = tt.addptr %ptr_38, %ptr_39 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc692) + %k = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc693) + %k_41 = arith.cmpi slt, %ptr, %k : tensor<128x1xi32> loc(#loc693) + %k_42 = tt.broadcast %k_41 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc694) + %k_43 = tt.load %ptr_40, %k_42, %cst_17 : tensor<128x128x!tt.ptr> loc(#loc694) + %ptr_44 = tt.splat %V : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc695) + %ptr_45 = tt.addptr %ptr_44, %ptr_34 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc695) + %ptr_46 = tt.broadcast %ptr_45 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc696) + %ptr_47 = tt.addptr %ptr_46, %ptr_39 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc696) + %v = tt.load %ptr_47, %k_42, %cst_17 : tensor<128x128x!tt.ptr> loc(#loc697) + %dk:2 = scf.for %off_g = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%dv = %cst_18, %dk_61 = %cst_18) -> (tensor<128x128xf32>, tensor<128x128xf32>) : i32 { + %off_hq1 = arith.muli %off_hkv, %c4_i32 : i32 loc(#loc514) + %off_hq1_62 = arith.addi %off_hq1, %off_g : i32 loc(#loc515) + %q_adj1 = arith.muli %off_hq1_62, %c128_i32 : i32 loc(#loc516) + %q_adj1_63 = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc517) + %q_adj1_64 = arith.addi %q_adj1, %q_adj1_63 : i32 loc(#loc518) + %q_adj1_65 = arith.extsi %q_adj1_64 : i32 to i64 loc(#loc519) + %do_adj1 = arith.muli %off_hq1_62, %c262144_i32 : i32 loc(#loc520) + %do_adj1_66 = arith.addi %do_adj1, %q_adj1_63 : i32 loc(#loc521) + %do_adj1_67 = arith.extsi %do_adj1_66 : i32 to i64 loc(#loc522) + %off_chz1 = arith.muli %off_zq, %c32_i32 : i32 loc(#loc523) + %off_chz1_68 = arith.addi %off_chz1, %off_hq1_62 : i32 loc(#loc524) + %off_chz1_69 = arith.muli %off_chz1_68, %Q_LEN : i32 loc(#loc525) + %off_chz1_70 = arith.extsi %off_chz1_69 : i32 to i64 loc(#loc526) + %Q1 = tt.addptr %arg_Q, %q_adj1_65 : !tt.ptr, i64 loc(#loc527) + %DO1 = tt.addptr %arg_DO, %do_adj1_67 : !tt.ptr, i64 loc(#loc528) + %LSE1 = tt.addptr %arg_LSE, %off_chz1_70 : !tt.ptr, i64 loc(#loc529) + %DELTA1 = tt.addptr %arg_DELTA, %off_chz1_70 : !tt.ptr, i64 loc(#loc530) + %sparse_q_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc531) + %sparse_q_num_blks_offset_71 = arith.addi %sparse_q_num_blks_offset, %pid : i32 loc(#loc532) + %sparse_q_idx_offset = arith.muli %off_zkv, %stride_q_idx_h : i32 loc(#loc533) + %sparse_q_idx_offset_72 = arith.muli %pid, %c16_i32 : i32 loc(#loc534) + %sparse_q_idx_offset_73 = arith.addi %sparse_q_idx_offset, %sparse_q_idx_offset_72 : i32 loc(#loc535) + %q_indices = tt.addptr %arg_Q_IDX, %sparse_q_idx_offset_73 : !tt.ptr, i32 loc(#loc536) + %q_start = tt.load %q_indices : !tt.ptr loc(#loc537) + %q_start_74 = arith.muli %q_start, %c128_i32 : i32 loc(#loc538) + %sparse_q_num_blocks = tt.addptr %arg_Q_NUM_BLKS, %sparse_q_num_blks_offset_71 : !tt.ptr, i32 loc(#loc539) + %sparse_q_num_blocks_75 = tt.load %sparse_q_num_blocks : !tt.ptr loc(#loc540) + %offs_m1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc541) + %offs_m1_76 = tt.splat %q_start_74 : i32 -> tensor<64xi32> loc(#loc542) + %offs_m1_77 = arith.addi %offs_m1_76, %offs_m1 : tensor<64xi32> loc(#loc542) + %qT_ptrs = tt.expand_dims %offs_m1_77 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc699) + %qT_ptrs_78 = arith.muli %qT_ptrs, %cst_0 : tensor<1x64xi32> loc(#loc700) + %qT_ptrs_79 = tt.splat %Q1 : !tt.ptr -> tensor<1x64x!tt.ptr> loc(#loc701) + %qT_ptrs_80 = tt.addptr %qT_ptrs_79, %qT_ptrs_78 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc701) + %qT_ptrs_81 = tt.expand_dims %offs_k {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc702) + %qT_ptrs_82 = tt.broadcast %qT_ptrs_80 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc703) + %qT_ptrs_83 = tt.broadcast %qT_ptrs_81 : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc703) + %qT_ptrs_84 = tt.addptr %qT_ptrs_82, %qT_ptrs_83 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc703) + %do_ptrs = tt.expand_dims %offs_m1_77 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc704) + %do_ptrs_85 = arith.muli %do_ptrs, %cst : tensor<64x1xi32> loc(#loc705) + %do_ptrs_86 = tt.splat %DO1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc706) + %do_ptrs_87 = tt.addptr %do_ptrs_86, %do_ptrs_85 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc706) + %do_ptrs_88 = tt.broadcast %do_ptrs_87 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc707) + %do_ptrs_89 = tt.broadcast %ptr_37 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc707) + %do_ptrs_90 = tt.addptr %do_ptrs_88, %do_ptrs_89 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc707) + %hi = arith.muli %sparse_q_num_blocks_75, %c2_i32 : i32 loc(#loc708) + %hi_91 = arith.minsi %hi, %c32_i32 : i32 loc(#loc709) + %do_ptrs_92:5 = scf.for %start_m = %c0_i32 to %hi_91 step %c1_i32 iter_args(%dk_113 = %dk_61, %dv_114 = %dv, %offs_m1_115 = %offs_m1_77, %qT_ptrs_116 = %qT_ptrs_84, %do_ptrs_117 = %do_ptrs_90) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_115 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc923) + %qT_118 = arith.cmpi slt, %qT, %cst_9 : tensor<1x64xi32> loc(#loc924) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc925) + %qT_120 = tt.load %qT_ptrs_116, %qT_119, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc925) + %lse = arith.cmpi slt, %offs_m1_115, %cst_6 : tensor<64xi32> loc(#loc712) + %lse_121 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc713) + %lse_122 = tt.addptr %lse_121, %offs_m1_115 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc713) + %lse_123 = tt.load %lse_122, %lse : tensor<64x!tt.ptr> loc(#loc714) + %lse_124 = arith.cmpf oeq, %lse_123, %cst_5 : tensor<64xf32> loc(#loc715) + %lse_125 = arith.select %lse_124, %cst_4, %lse_123 : tensor<64xi1>, tensor<64xf32> loc(#loc716) + %qkT = tt.dot %k_43, %qT_120, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc717) + %qkT_126 = arith.mulf %qkT, %cst_14 : tensor<128x64xf32> loc(#loc718) + %m = arith.remsi %qT, %cst_9 : tensor<1x64xi32> loc(#loc926) + %n = arith.remsi %ptr, %k : tensor<128x1xi32> loc(#loc927) + %post_mod_scores = arith.select %qT_119, %qkT_126, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc721) + %tmp44 = tt.broadcast %m : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc722) + %tmp44_127 = tt.broadcast %n : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc722) + %tmp44_128 = arith.cmpi sge, %tmp44, %tmp44_127 : tensor<128x64xi32> loc(#loc722) + %tmp45 = arith.extsi %n : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc723) + %tmp47 = tt.addptr %in_ptr16, %off_zq : !tt.ptr, i32 loc(#loc724) + %tmp47_129 = tt.load %tmp47 : !tt.ptr loc(#loc725) + %tmp48 = tt.splat %tmp47_129 : i64 -> tensor<128x1xi64> loc(#loc726) + %tmp48_130 = arith.cmpi slt, %tmp45, %tmp48 : tensor<128x1xi64> loc(#loc726) + %tmp49 = arith.extsi %m : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc727) + %tmp50 = tt.splat %tmp47_129 : i64 -> tensor<1x64xi64> loc(#loc728) + %tmp50_131 = arith.cmpi slt, %tmp49, %tmp50 : tensor<1x64xi64> loc(#loc728) + %tmp51 = tt.broadcast %tmp48_130 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc729) + %tmp51_132 = tt.broadcast %tmp50_131 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc729) + %tmp51_133 = arith.andi %tmp51, %tmp51_132 : tensor<128x64xi1> loc(#loc729) + %tmp52 = arith.andi %tmp44_128, %tmp51_133 : tensor<128x64xi1> loc(#loc730) + %tmp55 = arith.cmpi sge, %n, %cst_21 : tensor<128x1xi32> loc(#loc731) + %tmp56 = arith.remsi %n, %cst_21 : tensor<128x1xi32> loc(#loc732) + %tmp58 = arith.cmpi ne, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc733) + %tmp59 = arith.cmpi slt, %tmp56, %cst_3 : tensor<128x1xi32> loc(#loc734) + %tmp62 = arith.andi %tmp58, %tmp59 : tensor<128x1xi1> loc(#loc735) + %tmp63 = arith.addi %tmp56, %cst_21 : tensor<128x1xi32> loc(#loc736) + %tmp64 = arith.select %tmp62, %tmp63, %tmp56 : tensor<128x1xi1>, tensor<128x1xi32> loc(#loc737) + %tmp65 = arith.extsi %tmp64 : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc738) + %tmp66 = arith.cmpi slt, %tmp65, %tmp48 : tensor<128x1xi64> loc(#loc739) + %tmp67 = arith.andi %tmp55, %tmp66 : tensor<128x1xi1> loc(#loc740) + %tmp68 = arith.subi %tmp44_127, %tmp44 : tensor<128x64xi32> loc(#loc741) + %tmp69 = arith.remsi %tmp68, %cst_8 : tensor<128x64xi32> loc(#loc742) + %tmp70 = arith.cmpi ne, %tmp69, %cst_11 : tensor<128x64xi32> loc(#loc743) + %tmp71 = arith.cmpi slt, %tmp69, %cst_11 : tensor<128x64xi32> loc(#loc744) + %tmp73 = arith.andi %tmp70, %tmp71 : tensor<128x64xi1> loc(#loc745) + %tmp74 = arith.addi %tmp69, %cst_8 : tensor<128x64xi32> loc(#loc746) + %tmp75 = arith.select %tmp73, %tmp74, %tmp69 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc747) + %tmp76 = arith.cmpi eq, %tmp75, %cst_11 : tensor<128x64xi32> loc(#loc748) + %tmp77 = tt.broadcast %tmp67 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc749) + %tmp77_134 = arith.andi %tmp77, %tmp76 : tensor<128x64xi1> loc(#loc749) + %tmp78 = arith.ori %tmp52, %tmp77_134 : tensor<128x64xi1> loc(#loc750) + %post_mod_scores_135 = arith.select %tmp78, %post_mod_scores, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc751) + %post_mod_scores_136 = arith.mulf %post_mod_scores_135, %cst_10 : tensor<128x64xf32> loc(#loc752) + %pT = tt.expand_dims %lse_125 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc753) + %pT_137 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc754) + %pT_138 = arith.subf %post_mod_scores_136, %pT_137 : tensor<128x64xf32> loc(#loc754) + %pT_139 = math.exp2 %pT_138 : tensor<128x64xf32> loc(#loc755) + %do = tt.expand_dims %offs_m1_115 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc928) + %do_140 = arith.cmpi slt, %do, %cst_7 : tensor<64x1xi32> loc(#loc929) + %do_141 = tt.broadcast %do_140 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc930) + %do_142 = tt.load %do_ptrs_117, %do_141, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc930) + %dv_143 = arith.truncf %pT_139 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc757) + %dv_144 = tt.dot %dv_143, %do_142, %dv_114, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc758) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc759) + %Di_145 = tt.addptr %Di, %offs_m1_115 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc759) + %Di_146 = tt.load %Di_145, %lse : tensor<64x!tt.ptr> loc(#loc760) + %dpT = tt.trans %do_142 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc761) + %dpT_147 = tt.dot %v, %dpT, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc762) + %dsT = tt.expand_dims %Di_146 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc763) + %dsT_148 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc764) + %dsT_149 = arith.subf %dpT_147, %dsT_148 : tensor<128x64xf32> loc(#loc764) + %dsT_150 = arith.mulf %pT_139, %dsT_149 : tensor<128x64xf32> loc(#loc765) + %grad_scores = arith.select %qT_119, %dsT_150, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc766) + %dsT_151 = arith.select %tmp78, %grad_scores, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc767) + %dk_152 = arith.truncf %dsT_151 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc768) + %dk_153 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc769) + %dk_154 = tt.dot %dk_152, %dk_153, %dk_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc770) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc931) + %cur_block = tt.addptr %q_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc932) + %cur_block_155 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc933) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc934) + %next_block_156 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_75 : i32 loc(#loc935) + %next_block_157 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc936) + %next_block_158 = tt.load %next_block_157, %next_block_156 evictionPolicy = evict_last : !tt.ptr loc(#loc937) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc938) + %needs_jump_159 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc939) + %needs_jump_160 = arith.cmpi eq, %needs_jump_159, %c0_i32 : i32 loc(#loc940) + %jump_to_block = arith.subi %next_block_158, %cur_block_155 : i32 loc(#loc941) + %jump_to_block_161 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc942) + %jump_to_block_162 = arith.subi %jump_to_block_161, %c64_i32 : i32 loc(#loc943) + %offset = arith.extui %needs_jump_160 : i1 to i32 loc(#loc944) + %offset_163 = arith.muli %jump_to_block_162, %offset : i32 loc(#loc944) + %offset_164 = arith.subi %c1_i32, %offset : i32 loc(#loc945) + %offset_165 = arith.muli %offset_164, %c64_i32 : i32 loc(#loc946) + %offset_166 = arith.addi %offset_163, %offset_165 : i32 loc(#loc947) + %qT_ptrs_167 = arith.muli %offset_166, %c4096_i32 : i32 loc(#loc772) + %qT_ptrs_168 = tt.splat %qT_ptrs_167 : i32 -> tensor<128x64xi32> loc(#loc773) + %qT_ptrs_169 = tt.addptr %qT_ptrs_116, %qT_ptrs_168 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc773) + %do_ptrs_170 = arith.muli %offset_166, %c128_i32 : i32 loc(#loc774) + %do_ptrs_171 = tt.splat %do_ptrs_170 : i32 -> tensor<64x128xi32> loc(#loc775) + %do_ptrs_172 = tt.addptr %do_ptrs_117, %do_ptrs_171 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc775) + %offs_m1_173 = tt.splat %offset_166 : i32 -> tensor<64xi32> loc(#loc776) + %offs_m1_174 = arith.addi %offs_m1_115, %offs_m1_173 : tensor<64xi32> loc(#loc776) + scf.yield %dk_154, %dv_144, %offs_m1_174, %qT_ptrs_169, %do_ptrs_172 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc622) + } loc(#loc986) + %q_indices_93 = tt.addptr %arg_FULL_Q_IDX, %sparse_q_idx_offset_73 : !tt.ptr, i32 loc(#loc623) + %q_start_94 = tt.load %q_indices_93 : !tt.ptr loc(#loc624) + %q_start_95 = arith.muli %q_start_94, %c128_i32 : i32 loc(#loc625) + %sparse_q_num_blocks_96 = tt.addptr %arg_FULL_Q_NUM_BLKS, %sparse_q_num_blks_offset_71 : !tt.ptr, i32 loc(#loc626) + %sparse_q_num_blocks_97 = tt.load %sparse_q_num_blocks_96 : !tt.ptr loc(#loc627) + %offs_m1_98 = tt.splat %q_start_95 : i32 -> tensor<64xi32> loc(#loc628) + %offs_m1_99 = arith.addi %offs_m1_98, %offs_m1 : tensor<64xi32> loc(#loc628) + %qT_ptrs_100 = tt.expand_dims %offs_m1_99 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc777) + %qT_ptrs_101 = arith.muli %qT_ptrs_100, %cst_0 : tensor<1x64xi32> loc(#loc778) + %qT_ptrs_102 = tt.addptr %qT_ptrs_79, %qT_ptrs_101 : tensor<1x64x!tt.ptr>, tensor<1x64xi32> loc(#loc779) + %qT_ptrs_103 = tt.broadcast %qT_ptrs_102 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr> loc(#loc780) + %qT_ptrs_104 = tt.addptr %qT_ptrs_103, %qT_ptrs_83 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc780) + %do_ptrs_105 = tt.expand_dims %offs_m1_99 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc781) + %do_ptrs_106 = arith.muli %do_ptrs_105, %cst : tensor<64x1xi32> loc(#loc782) + %do_ptrs_107 = tt.addptr %do_ptrs_86, %do_ptrs_106 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc783) + %do_ptrs_108 = tt.broadcast %do_ptrs_107 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc784) + %do_ptrs_109 = tt.addptr %do_ptrs_108, %do_ptrs_89 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc784) + %hi_110 = arith.muli %sparse_q_num_blocks_97, %c2_i32 : i32 loc(#loc785) + %hi_111 = arith.minsi %hi_110, %c32_i32 : i32 loc(#loc786) + %do_ptrs_112:5 = scf.for %start_m = %c0_i32 to %hi_111 step %c1_i32 iter_args(%dk_113 = %do_ptrs_92#0, %dv_114 = %do_ptrs_92#1, %offs_m1_115 = %offs_m1_99, %qT_ptrs_116 = %qT_ptrs_104, %do_ptrs_117 = %do_ptrs_109) -> (tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr>) : i32 { + %qT = tt.expand_dims %offs_m1_115 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc948) + %qT_118 = arith.cmpi slt, %qT, %cst_9 : tensor<1x64xi32> loc(#loc949) + %qT_119 = tt.broadcast %qT_118 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc950) + %qT_120 = tt.load %qT_ptrs_116, %qT_119, %cst_16 : tensor<128x64x!tt.ptr> loc(#loc950) + %lse = arith.cmpi slt, %offs_m1_115, %cst_6 : tensor<64xi32> loc(#loc788) + %lse_121 = tt.splat %LSE1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc789) + %lse_122 = tt.addptr %lse_121, %offs_m1_115 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc789) + %lse_123 = tt.load %lse_122, %lse : tensor<64x!tt.ptr> loc(#loc790) + %lse_124 = arith.cmpf oeq, %lse_123, %cst_5 : tensor<64xf32> loc(#loc791) + %lse_125 = arith.select %lse_124, %cst_4, %lse_123 : tensor<64xi1>, tensor<64xf32> loc(#loc792) + %qkT = tt.dot %k_43, %qT_120, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc793) + %qkT_126 = arith.mulf %qkT, %cst_14 : tensor<128x64xf32> loc(#loc794) + %post_mod_scores = arith.select %qT_119, %qkT_126, %cst_13 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc795) + %post_mod_scores_127 = arith.mulf %post_mod_scores, %cst_10 : tensor<128x64xf32> loc(#loc796) + %pT = tt.expand_dims %lse_125 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc797) + %pT_128 = tt.broadcast %pT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc798) + %pT_129 = arith.subf %post_mod_scores_127, %pT_128 : tensor<128x64xf32> loc(#loc798) + %pT_130 = math.exp2 %pT_129 : tensor<128x64xf32> loc(#loc799) + %do = tt.expand_dims %offs_m1_115 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc951) + %do_131 = arith.cmpi slt, %do, %cst_7 : tensor<64x1xi32> loc(#loc952) + %do_132 = tt.broadcast %do_131 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc953) + %do_133 = tt.load %do_ptrs_117, %do_132, %cst_2 : tensor<64x128x!tt.ptr> loc(#loc953) + %dv_134 = arith.truncf %pT_130 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc801) + %dv_135 = tt.dot %dv_134, %do_133, %dv_114, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc802) + %Di = tt.splat %DELTA1 : !tt.ptr -> tensor<64x!tt.ptr> loc(#loc803) + %Di_136 = tt.addptr %Di, %offs_m1_115 : tensor<64x!tt.ptr>, tensor<64xi32> loc(#loc803) + %Di_137 = tt.load %Di_136, %lse : tensor<64x!tt.ptr> loc(#loc804) + %dpT = tt.trans %do_133 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc805) + %dpT_138 = tt.dot %v, %dpT, %cst_15, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc806) + %dsT = tt.expand_dims %Di_137 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> loc(#loc807) + %dsT_139 = tt.broadcast %dsT : tensor<1x64xf32> -> tensor<128x64xf32> loc(#loc808) + %dsT_140 = arith.subf %dpT_138, %dsT_139 : tensor<128x64xf32> loc(#loc808) + %dsT_141 = arith.mulf %pT_130, %dsT_140 : tensor<128x64xf32> loc(#loc809) + %grad_scores = arith.select %qT_119, %dsT_141, %cst_15 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc810) + %dk_142 = arith.truncf %grad_scores : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc811) + %dk_143 = tt.trans %qT_120 {order = array} : tensor<128x64xbf16> -> tensor<64x128xbf16> loc(#loc812) + %dk_144 = tt.dot %dk_142, %dk_143, %dk_113, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc813) + %cur_block_idx = arith.divsi %start_m, %c2_i32 : i32 loc(#loc954) + %cur_block = tt.addptr %q_indices_93, %cur_block_idx : !tt.ptr, i32 loc(#loc955) + %cur_block_145 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc956) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc957) + %next_block_146 = arith.cmpi slt, %next_block, %sparse_q_num_blocks_97 : i32 loc(#loc958) + %next_block_147 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc959) + %next_block_148 = tt.load %next_block_147, %next_block_146 evictionPolicy = evict_last : !tt.ptr loc(#loc960) + %needs_jump = arith.addi %start_m, %c1_i32 : i32 loc(#loc961) + %needs_jump_149 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc962) + %needs_jump_150 = arith.cmpi eq, %needs_jump_149, %c0_i32 : i32 loc(#loc963) + %jump_to_block = arith.subi %next_block_148, %cur_block_145 : i32 loc(#loc964) + %jump_to_block_151 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc965) + %jump_to_block_152 = arith.subi %jump_to_block_151, %c64_i32 : i32 loc(#loc966) + %offset = arith.extui %needs_jump_150 : i1 to i32 loc(#loc967) + %offset_153 = arith.muli %jump_to_block_152, %offset : i32 loc(#loc967) + %offset_154 = arith.subi %c1_i32, %offset : i32 loc(#loc968) + %offset_155 = arith.muli %offset_154, %c64_i32 : i32 loc(#loc969) + %offset_156 = arith.addi %offset_153, %offset_155 : i32 loc(#loc970) + %qT_ptrs_157 = arith.muli %offset_156, %c4096_i32 : i32 loc(#loc815) + %qT_ptrs_158 = tt.splat %qT_ptrs_157 : i32 -> tensor<128x64xi32> loc(#loc816) + %qT_ptrs_159 = tt.addptr %qT_ptrs_116, %qT_ptrs_158 : tensor<128x64x!tt.ptr>, tensor<128x64xi32> loc(#loc816) + %do_ptrs_160 = arith.muli %offset_156, %c128_i32 : i32 loc(#loc817) + %do_ptrs_161 = tt.splat %do_ptrs_160 : i32 -> tensor<64x128xi32> loc(#loc818) + %do_ptrs_162 = tt.addptr %do_ptrs_117, %do_ptrs_161 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc818) + %offs_m1_163 = tt.splat %offset_156 : i32 -> tensor<64xi32> loc(#loc819) + %offs_m1_164 = arith.addi %offs_m1_115, %offs_m1_163 : tensor<64xi32> loc(#loc819) + scf.yield %dk_144, %dv_135, %offs_m1_164, %qT_ptrs_159, %do_ptrs_162 : tensor<128x128xf32>, tensor<128x128xf32>, tensor<64xi32>, tensor<128x64x!tt.ptr>, tensor<64x128x!tt.ptr> loc(#loc630) + } loc(#loc987) + scf.yield %do_ptrs_112#1, %do_ptrs_112#0 : tensor<128x128xf32>, tensor<128x128xf32> loc(#loc306) + } loc(#loc698) + %dv_ptrs = tt.splat %DV : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc631) + %dv_ptrs_48 = tt.addptr %dv_ptrs, %ptr_34 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc631) + %dv_ptrs_49 = tt.broadcast %dv_ptrs_48 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc632) + %dv_ptrs_50 = tt.addptr %dv_ptrs_49, %ptr_39 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc632) + %3 = arith.cmpi slt, %ptr_37, %cst_20 : tensor<1x128xi32> loc(#loc309) + %4 = tt.broadcast %3 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc310) + %5 = arith.andi %k_42, %4 : tensor<128x128xi1> loc(#loc310) + %6 = arith.truncf %dk#0 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc311) + tt.store %dv_ptrs_50, %6, %5 : tensor<128x128x!tt.ptr> loc(#loc311) + %dk_51 = arith.mulf %dk#1, %cst_22 : tensor<128x128xf32> loc(#loc633) + %xindex = tt.broadcast %ptr_34 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc634) + %xindex_52 = arith.addi %ptr_39, %xindex : tensor<128x128xi32> loc(#loc634) + %xindex_53 = arith.muli %off_hkv, %c128_i32 : i32 loc(#loc635) + %xindex_54 = arith.muli %xindex_53, %ks0 : i32 loc(#loc636) + %xindex_55 = tt.splat %xindex_54 : i32 -> tensor<128x128xi32> loc(#loc637) + %xindex_56 = arith.addi %xindex_52, %xindex_55 : tensor<128x128xi32> loc(#loc637) + %xindex_57 = arith.muli %off_zq, %c1024_i32 : i32 loc(#loc638) + %xindex_58 = arith.muli %xindex_57, %ks0 : i32 loc(#loc639) + %xindex_59 = tt.splat %xindex_58 : i32 -> tensor<128x128xi32> loc(#loc640) + %xindex_60 = arith.addi %xindex_56, %xindex_59 : tensor<128x128xi32> loc(#loc640) + %7 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc320) + %8 = tt.addptr %7, %xindex_60 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc320) + %9 = arith.truncf %dk_51 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc321) + tt.store %8, %9, %k_42 : tensor<128x128x!tt.ptr> loc(#loc321) + } loc(#loc25) + tt.return loc(#loc322) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":105:12) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":97:53) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:54) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":95:63) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":111:24) +#loc7 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":112:36) +#loc9 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":115:27) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":116:28) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":117:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:25) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":124:59) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:37) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":128:61) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":131:9) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":132:9) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":133:10) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":136:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:14) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:7) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":140:24) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":144:44) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":145:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":148:29) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:55) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":154:78) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:50) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:83) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":155:68) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:52) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:40) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":158:63) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:32) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:42) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":159:66) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:30) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:46) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":161:56) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":163:17) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":164:19) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":167:19) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":168:21) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":169:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":174:36) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":175:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:27) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":178:107) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:20) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:56) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":825:49) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:52) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":179:111) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:58) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:34) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":188:25) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:33) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":189:26) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:30) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":190:50) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":191:18) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":195:30) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:27) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":196:41) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:53) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":197:39) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:42) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":199:29) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:26) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":207:12) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:37) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:18) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:56) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":390:49) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:18) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":391:49) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:43) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:90) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:101) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":395:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":397:28) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:41) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":458:105) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":405:12) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:52) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":831:23) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":459:19) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":461:14) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":798:21) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":464:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":467:46) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":476:79) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":482:23) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":483:23) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:34) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":485:23) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":486:22) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":487:23) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":488:23) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":489:23) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":490:23) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":493:24) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":494:24) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":496:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":497:92) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":500:24) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":501:24) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":502:39) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":503:25) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":504:24) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":505:24) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":506:23) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":507:25) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":508:25) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":509:92) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":511:24) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":512:24) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":513:39) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":514:25) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":515:24) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":516:24) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":521:69) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":524:27) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:39) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":525:21) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":528:104) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":530:20) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:22) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:19) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":531:14) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":538:71) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":549:43) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":551:15) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:30) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":553:21) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":788:33) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":411:64) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:38) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":789:24) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:109) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:113) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:55) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":790:25) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:30) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:35) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":791:60) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:34) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:48) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":792:63) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:29) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:47) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:61) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":793:42) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:28) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":414:19) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":415:19) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":417:19) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":417:8) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":214:39) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:31) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":215:45) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:62) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":216:43) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":218:33) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":226:16) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:24) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":231:56) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":232:14) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:87) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:69) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":236:30) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":245:28) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":252:25) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":253:29) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":256:107) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":257:107) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":262:30) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:32) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":263:51) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:34) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:56) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:44) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":266:67) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:36) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:46) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":267:70) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:34) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:39) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:50) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":269:60) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":271:21) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":272:23) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":275:25) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":276:29) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:58) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":281:80) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:53) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:81) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":282:70) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":286:32) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:30) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":287:43) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:55) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":288:42) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:45) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":290:32) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:26) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":298:16) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:37) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:18) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:56) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":601:49) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:27) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:38) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:19) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":602:51) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:42) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":608:61) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":610:28) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":669:105) +#loc231 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":618:12) +#loc232 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:52) +#loc233 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:28) +#loc234 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":674:22) +#loc235 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:26) +#loc236 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":675:46) +#loc237 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":676:20) +#loc238 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":678:15) +#loc239 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":680:46) +#loc240 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":683:46) +#loc241 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":692:78) +#loc242 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":698:25) +#loc243 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":699:25) +#loc244 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:35) +#loc245 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":701:24) +#loc246 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":702:24) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":703:25) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":704:24) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":705:24) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":706:24) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":709:25) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":710:25) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":712:25) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":713:92) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":716:24) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":717:24) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":718:39) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":719:25) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":720:24) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":721:24) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":722:24) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":723:25) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":724:25) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":725:92) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":727:24) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":728:24) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":729:39) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":730:25) +#loc269 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":731:24) +#loc270 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":732:24) +#loc271 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":736:69) +#loc272 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":739:27) +#loc273 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:44) +#loc274 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:40) +#loc275 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":740:22) +#loc276 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":833:41) +#loc277 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":741:99) +#loc278 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:24) +#loc279 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":744:43) +#loc280 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:29) +#loc281 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":748:21) +#loc282 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:29) +#loc283 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":750:20) +#loc284 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:25) +#loc285 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:22) +#loc286 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":751:16) +#loc287 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":759:70) +#loc288 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":773:45) +#loc289 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:24) +#loc290 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:52) +#loc291 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":775:43) +#loc292 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":623:62) +#loc293 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:28) +#loc294 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":626:19) +#loc295 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:28) +#loc296 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":627:19) +#loc297 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":628:19) +#loc298 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":628:8) +#loc299 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":306:41) +#loc300 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:34) +#loc301 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":307:47) +#loc302 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:64) +#loc303 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":308:46) +#loc304 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":310:36) +#loc305 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":318:20) +#loc306 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":303:12) +#loc307 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:23) +#loc308 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":323:55) +#loc309 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:71) +#loc310 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:61) +#loc311 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":332:30) +#loc312 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":334:14) +#loc313 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:27) +#loc314 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:45) +#loc315 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:53) +#loc316 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:41) +#loc317 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:64) +#loc318 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:71) +#loc319 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":344:59) +#loc320 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:29) +#loc321 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":345:69) +#loc322 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/nj/cnjupr7h27ntxfuqy7ffoqn72sina2rzy4fxsexp2st3dss2jzew.py":139:4) +#loc345 = loc("Q_LEN"(#loc2)) +#loc346 = loc("pid"(#loc6)) +#loc347 = loc("NUM_KV_BLOCKS"(#loc8)) +#loc348 = loc("off_zq"(#loc10)) +#loc349 = loc("off_hkv"(#loc11)) +#loc350 = loc("off_zkv"(#loc12)) +#loc351 = loc("k_adj"(#loc13)) +#loc352 = loc("k_adj"(#loc14)) +#loc353 = loc("k_adj"(#loc15)) +#loc354 = loc("k_adj"(#loc16)) +#loc355 = loc("dv_adj"(#loc17)) +#loc356 = loc("dv_adj"(#loc18)) +#loc357 = loc("dv_adj"(#loc19)) +#loc358 = loc("K"(#loc20)) +#loc359 = loc("V"(#loc21)) +#loc360 = loc("DV"(#loc22)) +#loc361 = loc("offs_k"(#loc23)) +#loc362 = loc("off_pid"(#loc26)) +#loc363 = loc("off_hq2"(#loc27)) +#loc364 = loc("off_hq2"(#loc28)) +#loc365 = loc("off_hq2"(#loc29)) +#loc366 = loc("start_m2_block"(#loc30)) +#loc367 = loc("stride_kv_idx_h"(#loc31)) +#loc368 = loc("sparse_kv_num_blks_offset"(#loc32)) +#loc369 = loc("sparse_kv_num_blks_offset"(#loc33)) +#loc370 = loc("sparse_kv_idx_offset"(#loc34)) +#loc371 = loc("sparse_kv_idx_offset"(#loc35)) +#loc372 = loc("sparse_kv_idx_offset"(#loc36)) +#loc373 = loc("q_adj2"(#loc37)) +#loc374 = loc("q_adj2"(#loc38)) +#loc375 = loc("q_adj2"(#loc39)) +#loc376 = loc("q_adj2"(#loc40)) +#loc377 = loc("do_adj2"(#loc41)) +#loc378 = loc("do_adj2"(#loc42)) +#loc379 = loc("do_adj2"(#loc43)) +#loc380 = loc("off_chz2"(#loc44)) +#loc381 = loc("off_chz2"(#loc45)) +#loc382 = loc("off_chz2"(#loc46)) +#loc383 = loc("off_chz2"(#loc47)) +#loc384 = loc("Q2"(#loc48)) +#loc385 = loc("DO2"(#loc49)) +#loc386 = loc("DQ2"(#loc50)) +#loc387 = loc("LSE2"(#loc51)) +#loc388 = loc("DELTA2"(#loc52)) +#loc389 = loc("start_m2"(#loc53)) +#loc390 = loc("offs_m2"(#loc54)) +#loc391 = loc("ptr"(#loc55)) +#loc392 = loc("q"(#loc56)) +#loc393 = loc("ptr"(#loc57)) +#loc394 = loc("ptr"(#loc58)) +#loc395 = loc("ptr"(#loc59)) +#loc396 = loc("ptr"(#loc60)) +#loc397 = loc("do"(#loc63)) +#loc398 = loc("Di"(#loc64)) +#loc399 = loc("Di"(#loc65)) +#loc400 = loc("Di"(#loc66)) +#loc401 = loc("lse"(#loc67)) +#loc402 = loc("lse"(#loc68)) +#loc403 = loc("lse"(#loc69)) +#loc404 = loc("lse"(#loc70)) +#loc405 = loc("lse"(#loc71)) +#loc406 = loc("kv_indices"(#loc72)) +#loc407 = loc("kv_start"(#loc73)) +#loc408 = loc("kv_start"(#loc74)) +#loc409 = loc("sparse_kv_num_blocks"(#loc75)) +#loc410 = loc("sparse_kv_num_blocks"(#loc76)) +#loc411 = loc("offs_n2"(#loc77)) +#loc412 = loc("offs_n2"(#loc78)) +#loc413 = loc("kT_ptrs"(#loc79)) +#loc414 = loc("dq"(#loc80)) +#loc415 = loc("kT_ptrs"(#loc81)) +#loc416 = loc("kT_ptrs"(#loc82)) +#loc417 = loc("kT_ptrs"(#loc83)) +#loc418 = loc("kT_ptrs"(#loc84)) +#loc419 = loc("vT_ptrs"(#loc85)) +#loc420 = loc("vT_ptrs"(#loc86)) +#loc421 = loc("hi"(#loc87)) +#loc422 = loc("hi"(#loc88)) +#loc423 = loc("hi"(#loc89)) +#loc424 = loc("hi"(#loc90)) +#loc425 = loc("dq"(#loc91)) +#loc426 = loc("kT"(#loc93)) +#loc427 = loc("dq"(#loc94)) +#loc428 = loc("qk"(#loc97)) +#loc429 = loc("qk"(#loc98)) +#loc430 = loc("n"(#loc100)) +#loc431 = loc("m"(#loc101)) +#loc432 = loc("post_mod_scores"(#loc102)) +#loc433 = loc("tmp4"(#loc103)) +#loc434 = loc("tmp5"(#loc104)) +#loc435 = loc("tmp7"(#loc105)) +#loc436 = loc("tmp7"(#loc106)) +#loc437 = loc("tmp8"(#loc107)) +#loc438 = loc("tmp9"(#loc108)) +#loc439 = loc("tmp10"(#loc109)) +#loc440 = loc("tmp11"(#loc110)) +#loc441 = loc("tmp12"(#loc111)) +#loc442 = loc("tmp15"(#loc112)) +#loc443 = loc("tmp16"(#loc113)) +#loc444 = loc("tmp18"(#loc114)) +#loc445 = loc("tmp19"(#loc115)) +#loc446 = loc("tmp22"(#loc116)) +#loc447 = loc("tmp23"(#loc117)) +#loc448 = loc("tmp24"(#loc118)) +#loc449 = loc("tmp25"(#loc119)) +#loc450 = loc("tmp26"(#loc120)) +#loc451 = loc("tmp27"(#loc121)) +#loc452 = loc("tmp28"(#loc122)) +#loc453 = loc("tmp29"(#loc123)) +#loc454 = loc("tmp30"(#loc124)) +#loc455 = loc("tmp31"(#loc125)) +#loc456 = loc("tmp33"(#loc126)) +#loc457 = loc("tmp34"(#loc127)) +#loc458 = loc("tmp35"(#loc128)) +#loc459 = loc("tmp36"(#loc129)) +#loc460 = loc("tmp37"(#loc130)) +#loc461 = loc("tmp38"(#loc131)) +#loc462 = loc("post_mod_scores"(#loc132)) +#loc463 = loc("post_mod_scores"(#loc133)) +#loc464 = loc("p"(#loc134)) +#loc465 = loc("p"(#loc135)) +#loc466 = loc("vT"(#loc136)) +#loc467 = loc("dp"(#loc137)) +#loc468 = loc("ds"(#loc138)) +#loc469 = loc("ds"(#loc139)) +#loc470 = loc("ds"(#loc140)) +#loc471 = loc("grad_scores"(#loc141)) +#loc472 = loc("ds"(#loc142)) +#loc473 = loc("ds"(#loc143)) +#loc474 = loc("dq"(#loc144)) +#loc475 = loc("dq"(#loc145)) +#loc476 = loc("cur_block_idx"(#loc146)) +#loc477 = loc("offset"(#loc147)) +#loc478 = loc("cur_block"(#loc148)) +#loc479 = loc("cur_block"(#loc149)) +#loc480 = loc("next_block"(#loc150)) +#loc481 = loc("next_block"(#loc151)) +#loc482 = loc("next_block"(#loc152)) +#loc483 = loc("next_block"(#loc153)) +#loc484 = loc("needs_jump"(#loc154)) +#loc485 = loc("needs_jump"(#loc155)) +#loc486 = loc("needs_jump"(#loc156)) +#loc487 = loc("jump_to_block"(#loc157)) +#loc488 = loc("jump_to_block"(#loc158)) +#loc489 = loc("jump_to_block"(#loc159)) +#loc490 = loc("offset"(#loc160)) +#loc491 = loc("offset"(#loc161)) +#loc492 = loc("offset"(#loc162)) +#loc493 = loc("offset"(#loc163)) +#loc494 = loc("kT_ptrs"(#loc164)) +#loc495 = loc("kT_ptrs"(#loc165)) +#loc496 = loc("vT_ptrs"(#loc166)) +#loc497 = loc("offs_n2"(#loc167)) +#loc498 = loc("kv_indices"(#loc169)) +#loc499 = loc("kv_start"(#loc170)) +#loc500 = loc("kv_start"(#loc171)) +#loc501 = loc("sparse_kv_num_blocks"(#loc172)) +#loc502 = loc("sparse_kv_num_blocks"(#loc173)) +#loc503 = loc("offs_n2"(#loc174)) +#loc504 = loc("dq"(#loc175)) +#loc505 = loc("dq_ptrs"(#loc176)) +#loc506 = loc("dq_ptrs"(#loc177)) +#loc507 = loc("dq"(#loc178)) +#loc508 = loc("stride_q_idx_h"(#loc182)) +#loc509 = loc("start_n1"(#loc183)) +#loc510 = loc("offs_n1"(#loc184)) +#loc511 = loc("k"(#loc185)) +#loc512 = loc("v"(#loc186)) +#loc513 = loc("dv"(#loc187)) +#loc514 = loc("off_hq1"(#loc188)) +#loc515 = loc("off_hq1"(#loc189)) +#loc516 = loc("q_adj1"(#loc190)) +#loc517 = loc("q_adj1"(#loc191)) +#loc518 = loc("q_adj1"(#loc192)) +#loc519 = loc("q_adj1"(#loc193)) +#loc520 = loc("do_adj1"(#loc194)) +#loc521 = loc("do_adj1"(#loc195)) +#loc522 = loc("do_adj1"(#loc196)) +#loc523 = loc("off_chz1"(#loc197)) +#loc524 = loc("off_chz1"(#loc198)) +#loc525 = loc("off_chz1"(#loc199)) +#loc526 = loc("off_chz1"(#loc200)) +#loc527 = loc("Q1"(#loc201)) +#loc528 = loc("DO1"(#loc202)) +#loc529 = loc("LSE1"(#loc203)) +#loc530 = loc("DELTA1"(#loc204)) +#loc531 = loc("sparse_q_num_blks_offset"(#loc205)) +#loc532 = loc("sparse_q_num_blks_offset"(#loc206)) +#loc533 = loc("sparse_q_idx_offset"(#loc207)) +#loc534 = loc("sparse_q_idx_offset"(#loc208)) +#loc535 = loc("sparse_q_idx_offset"(#loc209)) +#loc536 = loc("q_indices"(#loc210)) +#loc537 = loc("q_start"(#loc211)) +#loc538 = loc("q_start"(#loc212)) +#loc539 = loc("sparse_q_num_blocks"(#loc213)) +#loc540 = loc("sparse_q_num_blocks"(#loc214)) +#loc541 = loc("offs_m1"(#loc215)) +#loc542 = loc("offs_m1"(#loc216)) +#loc543 = loc("qT_ptrs"(#loc217)) +#loc544 = loc("qT_ptrs"(#loc219)) +#loc545 = loc("qT_ptrs"(#loc220)) +#loc546 = loc("qT_ptrs"(#loc221)) +#loc547 = loc("qT_ptrs"(#loc222)) +#loc548 = loc("do_ptrs"(#loc223)) +#loc549 = loc("do_ptrs"(#loc224)) +#loc550 = loc("do_ptrs"(#loc225)) +#loc551 = loc("do_ptrs"(#loc226)) +#loc552 = loc("hi"(#loc227)) +#loc553 = loc("hi"(#loc228)) +#loc554 = loc("dk"(#loc229)) +#loc555 = loc("qT"(#loc230)) +#loc556 = loc(callsite(#loc231 at #loc218)) +#loc557 = loc("lse"(#loc232)) +#loc558 = loc("lse"(#loc233)) +#loc559 = loc("lse"(#loc234)) +#loc560 = loc("lse"(#loc235)) +#loc561 = loc("lse"(#loc236)) +#loc562 = loc("qkT"(#loc237)) +#loc563 = loc("qkT"(#loc238)) +#loc564 = loc("m"(#loc239)) +#loc565 = loc("n"(#loc240)) +#loc566 = loc("post_mod_scores"(#loc241)) +#loc567 = loc("tmp44"(#loc242)) +#loc568 = loc("tmp45"(#loc243)) +#loc569 = loc("tmp47"(#loc244)) +#loc570 = loc("tmp47"(#loc245)) +#loc571 = loc("tmp48"(#loc246)) +#loc572 = loc("tmp49"(#loc247)) +#loc573 = loc("tmp50"(#loc248)) +#loc574 = loc("tmp51"(#loc249)) +#loc575 = loc("tmp52"(#loc250)) +#loc576 = loc("tmp55"(#loc251)) +#loc577 = loc("tmp56"(#loc252)) +#loc578 = loc("tmp58"(#loc253)) +#loc579 = loc("tmp59"(#loc254)) +#loc580 = loc("tmp62"(#loc255)) +#loc581 = loc("tmp63"(#loc256)) +#loc582 = loc("tmp64"(#loc257)) +#loc583 = loc("tmp65"(#loc258)) +#loc584 = loc("tmp66"(#loc259)) +#loc585 = loc("tmp67"(#loc260)) +#loc586 = loc("tmp68"(#loc261)) +#loc587 = loc("tmp69"(#loc262)) +#loc588 = loc("tmp70"(#loc263)) +#loc589 = loc("tmp71"(#loc264)) +#loc590 = loc("tmp73"(#loc265)) +#loc591 = loc("tmp74"(#loc266)) +#loc592 = loc("tmp75"(#loc267)) +#loc593 = loc("tmp76"(#loc268)) +#loc594 = loc("tmp77"(#loc269)) +#loc595 = loc("tmp78"(#loc270)) +#loc596 = loc("post_mod_scores"(#loc271)) +#loc597 = loc("post_mod_scores"(#loc272)) +#loc598 = loc("pT"(#loc273)) +#loc599 = loc("pT"(#loc274)) +#loc600 = loc("pT"(#loc275)) +#loc601 = loc("do"(#loc277)) +#loc602 = loc("dv"(#loc278)) +#loc603 = loc("dv"(#loc279)) +#loc604 = loc("Di"(#loc280)) +#loc605 = loc("Di"(#loc281)) +#loc606 = loc("dpT"(#loc282)) +#loc607 = loc("dpT"(#loc283)) +#loc608 = loc("dsT"(#loc284)) +#loc609 = loc("dsT"(#loc285)) +#loc610 = loc("dsT"(#loc286)) +#loc611 = loc("grad_scores"(#loc287)) +#loc612 = loc("dsT"(#loc288)) +#loc613 = loc("dk"(#loc289)) +#loc614 = loc("dk"(#loc290)) +#loc615 = loc("dk"(#loc291)) +#loc616 = loc("offset"(#loc292)) +#loc617 = loc("qT_ptrs"(#loc293)) +#loc618 = loc("qT_ptrs"(#loc294)) +#loc619 = loc("do_ptrs"(#loc295)) +#loc620 = loc("do_ptrs"(#loc296)) +#loc621 = loc("offs_m1"(#loc297)) +#loc622 = loc(callsite(#loc298 at #loc218)) +#loc623 = loc("q_indices"(#loc299)) +#loc624 = loc("q_start"(#loc300)) +#loc625 = loc("q_start"(#loc301)) +#loc626 = loc("sparse_q_num_blocks"(#loc302)) +#loc627 = loc("sparse_q_num_blocks"(#loc303)) +#loc628 = loc("offs_m1"(#loc304)) +#loc629 = loc(callsite(#loc231 at #loc305)) +#loc630 = loc(callsite(#loc298 at #loc305)) +#loc631 = loc("dv_ptrs"(#loc307)) +#loc632 = loc("dv_ptrs"(#loc308)) +#loc633 = loc("dk"(#loc312)) +#loc634 = loc("xindex"(#loc313)) +#loc635 = loc("xindex"(#loc314)) +#loc636 = loc("xindex"(#loc315)) +#loc637 = loc("xindex"(#loc316)) +#loc638 = loc("xindex"(#loc317)) +#loc639 = loc("xindex"(#loc318)) +#loc640 = loc("xindex"(#loc319)) +#loc641 = loc(callsite(#loc7 at #loc347)) +#loc642 = loc(callsite(#loc9 at #loc347)) +#loc643 = loc(callsite(#loc391 at #loc392)) +#loc644 = loc(callsite(#loc393 at #loc392)) +#loc645 = loc(callsite(#loc394 at #loc392)) +#loc646 = loc(callsite(#loc395 at #loc392)) +#loc647 = loc(callsite(#loc396 at #loc392)) +#loc648 = loc(callsite(#loc61 at #loc392)) +#loc649 = loc(callsite(#loc62 at #loc392)) +#loc650 = loc(callsite(#loc393 at #loc397)) +#loc651 = loc(callsite(#loc394 at #loc397)) +#loc652 = loc(callsite(#loc396 at #loc397)) +#loc653 = loc(callsite(#loc62 at #loc397)) +#loc654 = loc(callsite(#loc413 at #loc414)) +#loc655 = loc(callsite(#loc415 at #loc414)) +#loc656 = loc(callsite(#loc416 at #loc414)) +#loc657 = loc(callsite(#loc417 at #loc414)) +#loc658 = loc(callsite(#loc418 at #loc414)) +#loc659 = loc(callsite(#loc419 at #loc414)) +#loc660 = loc(callsite(#loc420 at #loc414)) +#loc661 = loc(callsite(#loc421 at #loc414)) +#loc662 = loc(callsite(#loc422 at #loc414)) +#loc663 = loc(callsite(#loc423 at #loc414)) +#loc664 = loc(callsite(#loc424 at #loc414)) +#loc665 = loc("offs_n2"(#loc425)) +#loc666 = loc(callsite(#loc427 at #loc414)) +#loc667 = loc(callsite(#loc477 at #loc414)) +#loc668 = loc(callsite(#loc494 at #loc414)) +#loc669 = loc(callsite(#loc495 at #loc414)) +#loc670 = loc(callsite(#loc496 at #loc414)) +#loc671 = loc(callsite(#loc497 at #loc414)) +#loc672 = loc(callsite(#loc168 at #loc414)) +#loc673 = loc(callsite(#loc413 at #loc504)) +#loc674 = loc(callsite(#loc415 at #loc504)) +#loc675 = loc(callsite(#loc416 at #loc504)) +#loc676 = loc(callsite(#loc418 at #loc504)) +#loc677 = loc(callsite(#loc419 at #loc504)) +#loc678 = loc(callsite(#loc420 at #loc504)) +#loc679 = loc(callsite(#loc421 at #loc504)) +#loc680 = loc(callsite(#loc424 at #loc504)) +#loc681 = loc(callsite(#loc427 at #loc504)) +#loc682 = loc(callsite(#loc477 at #loc504)) +#loc683 = loc(callsite(#loc494 at #loc504)) +#loc684 = loc(callsite(#loc495 at #loc504)) +#loc685 = loc(callsite(#loc496 at #loc504)) +#loc686 = loc(callsite(#loc497 at #loc504)) +#loc687 = loc(callsite(#loc168 at #loc504)) +#loc688 = loc(callsite(#loc391 at #loc511)) +#loc689 = loc(callsite(#loc393 at #loc511)) +#loc690 = loc(callsite(#loc394 at #loc511)) +#loc691 = loc(callsite(#loc395 at #loc511)) +#loc692 = loc(callsite(#loc396 at #loc511)) +#loc693 = loc(callsite(#loc61 at #loc511)) +#loc694 = loc(callsite(#loc62 at #loc511)) +#loc695 = loc(callsite(#loc394 at #loc512)) +#loc696 = loc(callsite(#loc396 at #loc512)) +#loc697 = loc(callsite(#loc62 at #loc512)) +#loc698 = loc("dk"(#loc513)) +#loc699 = loc(callsite(#loc543 at #loc218)) +#loc700 = loc(callsite(#loc544 at #loc218)) +#loc701 = loc(callsite(#loc545 at #loc218)) +#loc702 = loc(callsite(#loc546 at #loc218)) +#loc703 = loc(callsite(#loc547 at #loc218)) +#loc704 = loc(callsite(#loc548 at #loc218)) +#loc705 = loc(callsite(#loc549 at #loc218)) +#loc706 = loc(callsite(#loc550 at #loc218)) +#loc707 = loc(callsite(#loc551 at #loc218)) +#loc708 = loc(callsite(#loc552 at #loc218)) +#loc709 = loc(callsite(#loc553 at #loc218)) +#loc710 = loc("dv"(#loc554)) +#loc711 = loc(callsite(#loc555 at #loc556)) +#loc712 = loc(callsite(#loc557 at #loc556)) +#loc713 = loc(callsite(#loc558 at #loc556)) +#loc714 = loc(callsite(#loc559 at #loc556)) +#loc715 = loc(callsite(#loc560 at #loc556)) +#loc716 = loc(callsite(#loc561 at #loc556)) +#loc717 = loc(callsite(#loc562 at #loc556)) +#loc718 = loc(callsite(#loc563 at #loc556)) +#loc719 = loc(callsite(#loc564 at #loc556)) +#loc720 = loc(callsite(#loc565 at #loc556)) +#loc721 = loc(callsite(#loc566 at #loc556)) +#loc722 = loc(callsite(#loc567 at #loc556)) +#loc723 = loc(callsite(#loc568 at #loc556)) +#loc724 = loc(callsite(#loc569 at #loc556)) +#loc725 = loc(callsite(#loc570 at #loc556)) +#loc726 = loc(callsite(#loc571 at #loc556)) +#loc727 = loc(callsite(#loc572 at #loc556)) +#loc728 = loc(callsite(#loc573 at #loc556)) +#loc729 = loc(callsite(#loc574 at #loc556)) +#loc730 = loc(callsite(#loc575 at #loc556)) +#loc731 = loc(callsite(#loc576 at #loc556)) +#loc732 = loc(callsite(#loc577 at #loc556)) +#loc733 = loc(callsite(#loc578 at #loc556)) +#loc734 = loc(callsite(#loc579 at #loc556)) +#loc735 = loc(callsite(#loc580 at #loc556)) +#loc736 = loc(callsite(#loc581 at #loc556)) +#loc737 = loc(callsite(#loc582 at #loc556)) +#loc738 = loc(callsite(#loc583 at #loc556)) +#loc739 = loc(callsite(#loc584 at #loc556)) +#loc740 = loc(callsite(#loc585 at #loc556)) +#loc741 = loc(callsite(#loc586 at #loc556)) +#loc742 = loc(callsite(#loc587 at #loc556)) +#loc743 = loc(callsite(#loc588 at #loc556)) +#loc744 = loc(callsite(#loc589 at #loc556)) +#loc745 = loc(callsite(#loc590 at #loc556)) +#loc746 = loc(callsite(#loc591 at #loc556)) +#loc747 = loc(callsite(#loc592 at #loc556)) +#loc748 = loc(callsite(#loc593 at #loc556)) +#loc749 = loc(callsite(#loc594 at #loc556)) +#loc750 = loc(callsite(#loc595 at #loc556)) +#loc751 = loc(callsite(#loc596 at #loc556)) +#loc752 = loc(callsite(#loc597 at #loc556)) +#loc753 = loc(callsite(#loc598 at #loc556)) +#loc754 = loc(callsite(#loc599 at #loc556)) +#loc755 = loc(callsite(#loc600 at #loc556)) +#loc756 = loc(callsite(#loc601 at #loc556)) +#loc757 = loc(callsite(#loc602 at #loc556)) +#loc758 = loc(callsite(#loc603 at #loc556)) +#loc759 = loc(callsite(#loc604 at #loc556)) +#loc760 = loc(callsite(#loc605 at #loc556)) +#loc761 = loc(callsite(#loc606 at #loc556)) +#loc762 = loc(callsite(#loc607 at #loc556)) +#loc763 = loc(callsite(#loc608 at #loc556)) +#loc764 = loc(callsite(#loc609 at #loc556)) +#loc765 = loc(callsite(#loc610 at #loc556)) +#loc766 = loc(callsite(#loc611 at #loc556)) +#loc767 = loc(callsite(#loc612 at #loc556)) +#loc768 = loc(callsite(#loc613 at #loc556)) +#loc769 = loc(callsite(#loc614 at #loc556)) +#loc770 = loc(callsite(#loc615 at #loc556)) +#loc771 = loc(callsite(#loc616 at #loc218)) +#loc772 = loc(callsite(#loc617 at #loc218)) +#loc773 = loc(callsite(#loc618 at #loc218)) +#loc774 = loc(callsite(#loc619 at #loc218)) +#loc775 = loc(callsite(#loc620 at #loc218)) +#loc776 = loc(callsite(#loc621 at #loc218)) +#loc777 = loc(callsite(#loc543 at #loc305)) +#loc778 = loc(callsite(#loc544 at #loc305)) +#loc779 = loc(callsite(#loc545 at #loc305)) +#loc780 = loc(callsite(#loc547 at #loc305)) +#loc781 = loc(callsite(#loc548 at #loc305)) +#loc782 = loc(callsite(#loc549 at #loc305)) +#loc783 = loc(callsite(#loc550 at #loc305)) +#loc784 = loc(callsite(#loc551 at #loc305)) +#loc785 = loc(callsite(#loc552 at #loc305)) +#loc786 = loc(callsite(#loc553 at #loc305)) +#loc787 = loc(callsite(#loc555 at #loc629)) +#loc788 = loc(callsite(#loc557 at #loc629)) +#loc789 = loc(callsite(#loc558 at #loc629)) +#loc790 = loc(callsite(#loc559 at #loc629)) +#loc791 = loc(callsite(#loc560 at #loc629)) +#loc792 = loc(callsite(#loc561 at #loc629)) +#loc793 = loc(callsite(#loc562 at #loc629)) +#loc794 = loc(callsite(#loc563 at #loc629)) +#loc795 = loc(callsite(#loc566 at #loc629)) +#loc796 = loc(callsite(#loc597 at #loc629)) +#loc797 = loc(callsite(#loc598 at #loc629)) +#loc798 = loc(callsite(#loc599 at #loc629)) +#loc799 = loc(callsite(#loc600 at #loc629)) +#loc800 = loc(callsite(#loc601 at #loc629)) +#loc801 = loc(callsite(#loc602 at #loc629)) +#loc802 = loc(callsite(#loc603 at #loc629)) +#loc803 = loc(callsite(#loc604 at #loc629)) +#loc804 = loc(callsite(#loc605 at #loc629)) +#loc805 = loc(callsite(#loc606 at #loc629)) +#loc806 = loc(callsite(#loc607 at #loc629)) +#loc807 = loc(callsite(#loc608 at #loc629)) +#loc808 = loc(callsite(#loc609 at #loc629)) +#loc809 = loc(callsite(#loc610 at #loc629)) +#loc810 = loc(callsite(#loc611 at #loc629)) +#loc811 = loc(callsite(#loc613 at #loc629)) +#loc812 = loc(callsite(#loc614 at #loc629)) +#loc813 = loc(callsite(#loc615 at #loc629)) +#loc814 = loc(callsite(#loc616 at #loc305)) +#loc815 = loc(callsite(#loc617 at #loc305)) +#loc816 = loc(callsite(#loc618 at #loc305)) +#loc817 = loc(callsite(#loc619 at #loc305)) +#loc818 = loc(callsite(#loc620 at #loc305)) +#loc819 = loc(callsite(#loc621 at #loc305)) +#loc820 = loc(callsite(#loc7 at #loc662)) +#loc821 = loc(callsite(#loc9 at #loc662)) +#loc822 = loc("kT_ptrs"(#loc665)) +#loc823 = loc(callsite(#loc426 at #loc666)) +#loc824 = loc(callsite(#loc428 at #loc666)) +#loc825 = loc(callsite(#loc429 at #loc666)) +#loc826 = loc(callsite(#loc430 at #loc666)) +#loc827 = loc(callsite(#loc431 at #loc666)) +#loc828 = loc(callsite(#loc432 at #loc666)) +#loc829 = loc(callsite(#loc433 at #loc666)) +#loc830 = loc(callsite(#loc434 at #loc666)) +#loc831 = loc(callsite(#loc435 at #loc666)) +#loc832 = loc(callsite(#loc436 at #loc666)) +#loc833 = loc(callsite(#loc437 at #loc666)) +#loc834 = loc(callsite(#loc438 at #loc666)) +#loc835 = loc(callsite(#loc439 at #loc666)) +#loc836 = loc(callsite(#loc440 at #loc666)) +#loc837 = loc(callsite(#loc441 at #loc666)) +#loc838 = loc(callsite(#loc442 at #loc666)) +#loc839 = loc(callsite(#loc443 at #loc666)) +#loc840 = loc(callsite(#loc444 at #loc666)) +#loc841 = loc(callsite(#loc445 at #loc666)) +#loc842 = loc(callsite(#loc446 at #loc666)) +#loc843 = loc(callsite(#loc447 at #loc666)) +#loc844 = loc(callsite(#loc448 at #loc666)) +#loc845 = loc(callsite(#loc449 at #loc666)) +#loc846 = loc(callsite(#loc450 at #loc666)) +#loc847 = loc(callsite(#loc451 at #loc666)) +#loc848 = loc(callsite(#loc452 at #loc666)) +#loc849 = loc(callsite(#loc453 at #loc666)) +#loc850 = loc(callsite(#loc454 at #loc666)) +#loc851 = loc(callsite(#loc455 at #loc666)) +#loc852 = loc(callsite(#loc456 at #loc666)) +#loc853 = loc(callsite(#loc457 at #loc666)) +#loc854 = loc(callsite(#loc458 at #loc666)) +#loc855 = loc(callsite(#loc459 at #loc666)) +#loc856 = loc(callsite(#loc460 at #loc666)) +#loc857 = loc(callsite(#loc461 at #loc666)) +#loc858 = loc(callsite(#loc462 at #loc666)) +#loc859 = loc(callsite(#loc463 at #loc666)) +#loc860 = loc(callsite(#loc464 at #loc666)) +#loc861 = loc(callsite(#loc465 at #loc666)) +#loc862 = loc(callsite(#loc466 at #loc666)) +#loc863 = loc(callsite(#loc467 at #loc666)) +#loc864 = loc(callsite(#loc468 at #loc666)) +#loc865 = loc(callsite(#loc469 at #loc666)) +#loc866 = loc(callsite(#loc470 at #loc666)) +#loc867 = loc(callsite(#loc471 at #loc666)) +#loc868 = loc(callsite(#loc472 at #loc666)) +#loc869 = loc(callsite(#loc473 at #loc666)) +#loc870 = loc(callsite(#loc474 at #loc666)) +#loc871 = loc(callsite(#loc475 at #loc666)) +#loc872 = loc(callsite(#loc476 at #loc667)) +#loc873 = loc(callsite(#loc478 at #loc667)) +#loc874 = loc(callsite(#loc479 at #loc667)) +#loc875 = loc(callsite(#loc480 at #loc667)) +#loc876 = loc(callsite(#loc481 at #loc667)) +#loc877 = loc(callsite(#loc482 at #loc667)) +#loc878 = loc(callsite(#loc483 at #loc667)) +#loc879 = loc(callsite(#loc484 at #loc667)) +#loc880 = loc(callsite(#loc485 at #loc667)) +#loc881 = loc(callsite(#loc486 at #loc667)) +#loc882 = loc(callsite(#loc487 at #loc667)) +#loc883 = loc(callsite(#loc488 at #loc667)) +#loc884 = loc(callsite(#loc489 at #loc667)) +#loc885 = loc(callsite(#loc490 at #loc667)) +#loc886 = loc(callsite(#loc491 at #loc667)) +#loc887 = loc(callsite(#loc492 at #loc667)) +#loc888 = loc(callsite(#loc493 at #loc667)) +#loc889 = loc(callsite(#loc426 at #loc681)) +#loc890 = loc(callsite(#loc428 at #loc681)) +#loc891 = loc(callsite(#loc429 at #loc681)) +#loc892 = loc(callsite(#loc432 at #loc681)) +#loc893 = loc(callsite(#loc463 at #loc681)) +#loc894 = loc(callsite(#loc464 at #loc681)) +#loc895 = loc(callsite(#loc465 at #loc681)) +#loc896 = loc(callsite(#loc466 at #loc681)) +#loc897 = loc(callsite(#loc467 at #loc681)) +#loc898 = loc(callsite(#loc468 at #loc681)) +#loc899 = loc(callsite(#loc469 at #loc681)) +#loc900 = loc(callsite(#loc470 at #loc681)) +#loc901 = loc(callsite(#loc471 at #loc681)) +#loc902 = loc(callsite(#loc473 at #loc681)) +#loc903 = loc(callsite(#loc474 at #loc681)) +#loc904 = loc(callsite(#loc475 at #loc681)) +#loc905 = loc(callsite(#loc476 at #loc682)) +#loc906 = loc(callsite(#loc478 at #loc682)) +#loc907 = loc(callsite(#loc479 at #loc682)) +#loc908 = loc(callsite(#loc480 at #loc682)) +#loc909 = loc(callsite(#loc481 at #loc682)) +#loc910 = loc(callsite(#loc482 at #loc682)) +#loc911 = loc(callsite(#loc483 at #loc682)) +#loc912 = loc(callsite(#loc484 at #loc682)) +#loc913 = loc(callsite(#loc485 at #loc682)) +#loc914 = loc(callsite(#loc486 at #loc682)) +#loc915 = loc(callsite(#loc487 at #loc682)) +#loc916 = loc(callsite(#loc488 at #loc682)) +#loc917 = loc(callsite(#loc489 at #loc682)) +#loc918 = loc(callsite(#loc490 at #loc682)) +#loc919 = loc(callsite(#loc491 at #loc682)) +#loc920 = loc(callsite(#loc492 at #loc682)) +#loc921 = loc(callsite(#loc493 at #loc682)) +#loc922 = loc("offs_m1"(#loc710)) +#loc923 = loc(callsite(#loc92 at #loc711)) +#loc924 = loc(callsite(#loc95 at #loc711)) +#loc925 = loc(callsite(#loc96 at #loc711)) +#loc926 = loc(callsite(#loc99 at #loc719)) +#loc927 = loc(callsite(#loc99 at #loc720)) +#loc928 = loc(callsite(#loc276 at #loc756)) +#loc929 = loc(callsite(#loc61 at #loc756)) +#loc930 = loc(callsite(#loc62 at #loc756)) +#loc931 = loc(callsite(#loc476 at #loc771)) +#loc932 = loc(callsite(#loc478 at #loc771)) +#loc933 = loc(callsite(#loc479 at #loc771)) +#loc934 = loc(callsite(#loc480 at #loc771)) +#loc935 = loc(callsite(#loc481 at #loc771)) +#loc936 = loc(callsite(#loc482 at #loc771)) +#loc937 = loc(callsite(#loc483 at #loc771)) +#loc938 = loc(callsite(#loc484 at #loc771)) +#loc939 = loc(callsite(#loc485 at #loc771)) +#loc940 = loc(callsite(#loc486 at #loc771)) +#loc941 = loc(callsite(#loc487 at #loc771)) +#loc942 = loc(callsite(#loc488 at #loc771)) +#loc943 = loc(callsite(#loc489 at #loc771)) +#loc944 = loc(callsite(#loc490 at #loc771)) +#loc945 = loc(callsite(#loc491 at #loc771)) +#loc946 = loc(callsite(#loc492 at #loc771)) +#loc947 = loc(callsite(#loc493 at #loc771)) +#loc948 = loc(callsite(#loc92 at #loc787)) +#loc949 = loc(callsite(#loc95 at #loc787)) +#loc950 = loc(callsite(#loc96 at #loc787)) +#loc951 = loc(callsite(#loc276 at #loc800)) +#loc952 = loc(callsite(#loc61 at #loc800)) +#loc953 = loc(callsite(#loc62 at #loc800)) +#loc954 = loc(callsite(#loc476 at #loc814)) +#loc955 = loc(callsite(#loc478 at #loc814)) +#loc956 = loc(callsite(#loc479 at #loc814)) +#loc957 = loc(callsite(#loc480 at #loc814)) +#loc958 = loc(callsite(#loc481 at #loc814)) +#loc959 = loc(callsite(#loc482 at #loc814)) +#loc960 = loc(callsite(#loc483 at #loc814)) +#loc961 = loc(callsite(#loc484 at #loc814)) +#loc962 = loc(callsite(#loc485 at #loc814)) +#loc963 = loc(callsite(#loc486 at #loc814)) +#loc964 = loc(callsite(#loc487 at #loc814)) +#loc965 = loc(callsite(#loc488 at #loc814)) +#loc966 = loc(callsite(#loc489 at #loc814)) +#loc967 = loc(callsite(#loc490 at #loc814)) +#loc968 = loc(callsite(#loc491 at #loc814)) +#loc969 = loc(callsite(#loc492 at #loc814)) +#loc970 = loc(callsite(#loc493 at #loc814)) +#loc971 = loc("vT_ptrs"(#loc822)) +#loc972 = loc(callsite(#loc92 at #loc823)) +#loc973 = loc(callsite(#loc95 at #loc823)) +#loc974 = loc(callsite(#loc96 at #loc823)) +#loc975 = loc(callsite(#loc99 at #loc826)) +#loc976 = loc(callsite(#loc99 at #loc827)) +#loc977 = loc(callsite(#loc96 at #loc862)) +#loc978 = loc(callsite(#loc92 at #loc889)) +#loc979 = loc(callsite(#loc95 at #loc889)) +#loc980 = loc(callsite(#loc96 at #loc889)) +#loc981 = loc(callsite(#loc96 at #loc896)) +#loc982 = loc("qT_ptrs"(#loc922)) +#loc983 = loc(callsite(#loc971 at #loc414)) +#loc984 = loc(callsite(#loc971 at #loc504)) +#loc985 = loc("do_ptrs"(#loc982)) +#loc986 = loc(callsite(#loc985 at #loc218)) +#loc987 = loc(callsite(#loc985 at #loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json new file mode 100644 index 0000000000000000000000000000000000000000..66a0758e7b9c6e3d7165485f360bc678675a9d10 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.json @@ -0,0 +1 @@ +{"hash": "815544e5ace0796cfb02bac4b9dec6a2a110a987d2e79f99b0e8bb667807ed7c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 32, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": false, "backend_name": "cuda", "sanitize_overflow": true, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "log_softmax_backward_kernel"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source new file mode 100644 index 0000000000000000000000000000000000000000..a4cd6994ae4e49ea89eadbe375c5bf395e5b9573 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.source @@ -0,0 +1,292 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc17 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc63 = loc("logits_ptr"(#loc)) +#loc64 = loc("logits_stride"(#loc)) +#loc65 = loc("target_ptr"(#loc)) +#loc66 = loc("target_stride"(#loc)) +#loc67 = loc("position_mask_ptr"(#loc)) +#loc68 = loc("grad_output_ptr"(#loc)) +#loc69 = loc("scaling_factor"(#loc)) +#loc70 = loc("m_ptr"(#loc)) +#loc71 = loc("d_ptr"(#loc)) +#loc72 = loc("n_cols"(#loc)) +#loc116 = loc("input"(#loc55)) +#loc117 = loc("a"(#loc59)) +#loc118 = loc("b"(#loc59)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %program_id = tt.get_program_id x : i32 loc(#loc73) + %program_id_0 = arith.extsi %program_id : i32 to i64 loc(#loc74) + %logits_ptr_1 = arith.extsi %logits_stride : i32 to i64 loc(#loc75) + %logits_ptr_2 = arith.muli %program_id_0, %logits_ptr_1 : i64 loc(#loc75) + %logits_ptr_3 = tt.addptr %logits_ptr, %logits_ptr_2 : !tt.ptr, i64 loc(#loc76) + %target_ptr_4 = arith.extsi %target_stride : i32 to i64 loc(#loc77) + %target_ptr_5 = arith.muli %program_id_0, %target_ptr_4 : i64 loc(#loc77) + %target_ptr_6 = tt.addptr %target_ptr, %target_ptr_5 : !tt.ptr, i64 loc(#loc78) + %position_mask_ptr_7 = tt.addptr %position_mask_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc79) + %position_mask = tt.bitcast %position_mask_ptr_7 : !tt.ptr -> !tt.ptr loc(#loc80) + %position_mask_8 = tt.load %position_mask : !tt.ptr loc(#loc80) + %position_mask_9 = arith.constant 0 : i8 loc(#loc80) + %position_mask_10 = arith.cmpi ne, %position_mask_8, %position_mask_9 : i8 loc(#loc80) + %c0_i32 = arith.constant 0 : i32 loc(#loc9) + %0 = arith.extui %position_mask_10 : i1 to i32 loc(#loc9) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc9) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc9) + ^bb1: // pred: ^bb0 + %c0_i32_11 = arith.constant 0 : i32 loc(#loc10) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc10) + %2 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc10) + %3 = arith.bitcast %n_cols : i32 to i32 loc(#loc10) + %4 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc10) + %5 = ub.poison : i32 loc(#loc10) + scf.for %i = %2 to %3 step %4 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc81) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc82) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc82) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc82) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc82) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc82) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc82) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc82) + %offsets_28 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc82) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc82) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc82) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc82) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc83) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc83) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc14) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc14) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc15) + %cst_33 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc15) + %16 = arith.truncf %cst_33 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc15) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc15) + } loc(#loc10) + tt.return loc(#loc16) + ^bb2: // pred: ^bb0 + cf.br ^bb4 loc(#loc17) + ^bb3: // no predecessors + cf.br ^bb4 loc(#loc17) + ^bb4: // 2 preds: ^bb2, ^bb3 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc84) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_0 : !tt.ptr, i64 loc(#loc85) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc86) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc87) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc88) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc89) + %target_grad_sum = arith.constant 0.000000e+00 : f32 loc(#loc90) + %c0_i32_15 = arith.constant 0 : i32 loc(#loc25) + %c32768_i32_16 = arith.constant 32768 : i32 loc(#loc25) + %6 = arith.bitcast %c0_i32_15 : i32 to i32 loc(#loc25) + %7 = arith.bitcast %n_cols : i32 to i32 loc(#loc25) + %8 = arith.bitcast %c32768_i32_16 : i32 to i32 loc(#loc25) + %9 = ub.poison : i32 loc(#loc25) + %target_grad_sum_17 = scf.for %i = %6 to %7 step %8 iter_args(%target_grad_sum_20 = %target_grad_sum) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_21 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_22 = arith.extsi %offsets_21 : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_23 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc93) + %offsets_24 = arith.addi %offsets_22, %offsets_23 : tensor<32768xi64> loc(#loc93) + %offsets_25 = arith.constant 2147483647 : i64 loc(#loc93) + %offsets_26 = arith.constant -2147483648 : i64 loc(#loc93) + %offsets_27 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc93) + %offsets_28 = arith.cmpi sle, %offsets_24, %offsets_27 : tensor<32768xi64> loc(#loc93) + %offsets_29 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc93) + %offsets_30 = arith.cmpi sge, %offsets_24, %offsets_29 : tensor<32768xi64> loc(#loc93) + %offsets_31 = arith.andi %offsets_28, %offsets_30 : tensor<32768xi1> loc(#loc93) + %offsets_32 = arith.addi %offsets_21, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_33 = arith.cmpi slt, %offsets_32, %mask : tensor<32768xi32> loc(#loc94) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %target_block_34 = tt.addptr %target_block, %offsets_32 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %target_block_35 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %target_block_36 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc96) + %target_block_37 = tt.load %target_block_34, %mask_33, %target_block_36 : tensor<32768x!tt.ptr> loc(#loc96) + %target_grad_sum_38 = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc97) + %target_grad_sum_39 = arith.mulf %target_block_37, %target_grad_sum_38 : tensor<32768xf32> loc(#loc97) + %target_grad_sum_40 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_41 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %target_grad_sum_42 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc98) + %target_grad_sum_43 = arith.select %mask_33, %target_grad_sum_39, %target_grad_sum_42 : tensor<32768xi1>, tensor<32768xf32> loc(#loc98) + %target_grad_sum_44 = tt.call @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%target_grad_sum_43) : (tensor<32768xf32>) -> f32 loc(#loc99) + %target_grad_sum_45 = arith.addf %target_grad_sum_20, %target_grad_sum_44 : f32 loc(#loc100) + scf.yield %target_grad_sum_45 : f32 loc(#loc35) + } loc(#loc91) + %c0_i32_18 = arith.constant 0 : i32 loc(#loc36) + %c32768_i32_19 = arith.constant 32768 : i32 loc(#loc36) + %10 = arith.bitcast %c0_i32_18 : i32 to i32 loc(#loc36) + %11 = arith.bitcast %n_cols : i32 to i32 loc(#loc36) + %12 = arith.bitcast %c32768_i32_19 : i32 to i32 loc(#loc36) + %13 = ub.poison : i32 loc(#loc36) + scf.for %i = %10 to %11 step %12 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc101) + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc102) + %offsets_21 = arith.extsi %offsets_20 : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_22 = arith.extsi %offsets : tensor<32768xi32> to tensor<32768xi64> loc(#loc102) + %offsets_23 = arith.addi %offsets_21, %offsets_22 : tensor<32768xi64> loc(#loc102) + %offsets_24 = arith.constant 2147483647 : i64 loc(#loc102) + %offsets_25 = arith.constant -2147483648 : i64 loc(#loc102) + %offsets_26 = arith.constant dense<2147483647> : tensor<32768xi64> loc(#loc102) + %offsets_27 = arith.cmpi sle, %offsets_23, %offsets_26 : tensor<32768xi64> loc(#loc102) + %offsets_28 = arith.constant dense<-2147483648> : tensor<32768xi64> loc(#loc102) + %offsets_29 = arith.cmpi sge, %offsets_23, %offsets_28 : tensor<32768xi64> loc(#loc102) + %offsets_30 = arith.andi %offsets_27, %offsets_29 : tensor<32768xi1> loc(#loc102) + %offsets_31 = arith.addi %offsets_20, %offsets : tensor<32768xi32> loc(#loc102) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc103) + %mask_32 = arith.cmpi slt, %offsets_31, %mask : tensor<32768xi32> loc(#loc103) + %logits_block = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc104) + %logits_block_33 = tt.addptr %logits_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc104) + %logits_block_34 = arith.constant 0.000000e+00 : f32 loc(#loc105) + %logits_block_35 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc105) + %logits_block_36 = arith.truncf %logits_block_35 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc105) + %logits_block_37 = tt.load %logits_block_33, %mask_32, %logits_block_36 : tensor<32768x!tt.ptr> loc(#loc105) + %logits_block_38 = arith.extf %logits_block_37 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc106) + %target_block = tt.splat %target_ptr_6 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc107) + %target_block_39 = tt.addptr %target_block, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc107) + %target_block_40 = arith.constant 0.000000e+00 : f32 loc(#loc108) + %target_block_41 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc108) + %target_block_42 = tt.load %target_block_39, %mask_32, %target_block_41 : tensor<32768x!tt.ptr> loc(#loc108) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc109) + %softmax_prob_43 = arith.subf %logits_block_38, %softmax_prob : tensor<32768xf32> loc(#loc109) + %softmax_prob_44 = math.exp %softmax_prob_43 : tensor<32768xf32> loc(#loc110) + %softmax_prob_45 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc111) + %softmax_prob_46 = arith.divf %softmax_prob_44, %softmax_prob_45 : tensor<32768xf32> loc(#loc111) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32> loc(#loc112) + %normalized_grad_47 = arith.mulf %softmax_prob_46, %normalized_grad : tensor<32768xf32> loc(#loc112) + %grad_block = tt.splat %grad_output_14 : f32 -> tensor<32768xf32> loc(#loc113) + %grad_block_48 = arith.mulf %target_block_42, %grad_block : tensor<32768xf32> loc(#loc113) + %grad_block_49 = arith.subf %grad_block_48, %normalized_grad_47 : tensor<32768xf32> loc(#loc114) + %grad_block_50 = arith.constant 0.000000e+00 : f32 loc(#loc115) + %grad_block_51 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc115) + %grad_block_52 = arith.subf %grad_block_51, %grad_block_49 : tensor<32768xf32> loc(#loc115) + %14 = tt.splat %logits_ptr_3 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc52) + %15 = tt.addptr %14, %offsets_31 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc52) + %16 = arith.truncf %grad_block_52 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %15, %16, %mask_32 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc36) + tt.return loc(#loc54) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S32768S__(1,)cNone_(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32768xf32> loc("input"(#loc55))) -> f32 attributes {noinline = false} { + %0 = tt.reshape %input allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc56) + %1 = "tt.reduce"(%0) <{axis = 0 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %3 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc56) + tt.reduce.return %3 : f32 loc(#loc56) + }) : (tensor<32768xf32>) -> f32 loc(#loc56) + tt.return %1 : f32 loc(#loc57) + ^bb1: // no predecessors + %2 = ub.poison : f32 loc(#loc58) + tt.return %2 : f32 loc(#loc58) + } loc(#loc55) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc59)), %b: f32 loc("b"(#loc59))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc60) + tt.return %0 : f32 loc(#loc61) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc62) + tt.return %1 : f32 loc(#loc62) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":148:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc73 = loc("program_id"(#loc1)) +#loc74 = loc("program_id"(#loc2)) +#loc75 = loc("logits_ptr"(#loc3)) +#loc76 = loc("logits_ptr"(#loc4)) +#loc77 = loc("target_ptr"(#loc5)) +#loc78 = loc("target_ptr"(#loc6)) +#loc79 = loc("position_mask_ptr"(#loc7)) +#loc80 = loc("position_mask"(#loc8)) +#loc81 = loc("offsets"(#loc11)) +#loc82 = loc("offsets"(#loc12)) +#loc83 = loc("mask"(#loc13)) +#loc84 = loc("m_ptr"(#loc18)) +#loc85 = loc("d_ptr"(#loc19)) +#loc86 = loc("m"(#loc20)) +#loc87 = loc("d"(#loc21)) +#loc88 = loc("grad_output"(#loc22)) +#loc89 = loc("grad_output"(#loc23)) +#loc90 = loc("target_grad_sum"(#loc24)) +#loc91 = loc("target_grad_sum"(#loc25)) +#loc92 = loc("offsets"(#loc26)) +#loc93 = loc("offsets"(#loc27)) +#loc94 = loc("mask"(#loc28)) +#loc95 = loc("target_block"(#loc29)) +#loc96 = loc("target_block"(#loc30)) +#loc97 = loc("target_grad_sum"(#loc31)) +#loc98 = loc("target_grad_sum"(#loc32)) +#loc99 = loc("target_grad_sum"(#loc33)) +#loc100 = loc("target_grad_sum"(#loc34)) +#loc101 = loc("offsets"(#loc37)) +#loc102 = loc("offsets"(#loc38)) +#loc103 = loc("mask"(#loc39)) +#loc104 = loc("logits_block"(#loc40)) +#loc105 = loc("logits_block"(#loc41)) +#loc106 = loc("logits_block"(#loc42)) +#loc107 = loc("target_block"(#loc43)) +#loc108 = loc("target_block"(#loc44)) +#loc109 = loc("softmax_prob"(#loc45)) +#loc110 = loc("softmax_prob"(#loc46)) +#loc111 = loc("softmax_prob"(#loc47)) +#loc112 = loc("normalized_grad"(#loc48)) +#loc113 = loc("grad_block"(#loc49)) +#loc114 = loc("grad_block"(#loc50)) +#loc115 = loc("grad_block"(#loc51)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ddf881e117876196c4d5005c7f734e00878bc4e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttgir @@ -0,0 +1,199 @@ +#blocked = #ttg.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [32], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc54 = loc("logits_ptr"(#loc)) +#loc55 = loc("logits_stride"(#loc)) +#loc56 = loc("target_ptr"(#loc)) +#loc57 = loc("target_stride"(#loc)) +#loc58 = loc("position_mask_ptr"(#loc)) +#loc59 = loc("grad_output_ptr"(#loc)) +#loc60 = loc("scaling_factor"(#loc)) +#loc61 = loc("m_ptr"(#loc)) +#loc62 = loc("d_ptr"(#loc)) +#loc63 = loc("n_cols"(#loc)) +#loc89 = loc("target_grad_sum"(#loc33)) +#loc106 = loc(callsite(#loc1 at #loc89)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xf32, #blocked> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c0_i8 = arith.constant 0 : i8 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32768xbf16, #blocked> loc(#loc1) + %program_id = tt.get_program_id x : i32 loc(#loc64) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc65) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc66) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc66) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc67) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc68) + %target_ptr_7 = arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc68) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc69) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc70) + %position_mask = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc71) + %position_mask_10 = tt.load %position_mask : !tt.ptr loc(#loc71) + %position_mask_11 = arith.cmpi ne, %position_mask_10, %c0_i8 : i8 loc(#loc71) + %0 = arith.extui %position_mask_11 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc72) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc73) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc13) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc74) + %offsets_20 = arith.addi %offsets_19, %offsets : tensor<32768xi32, #blocked> loc(#loc74) + %mask_21 = arith.cmpi slt, %offsets_20, %mask : tensor<32768xi32, #blocked> loc(#loc73) + %3 = tt.addptr %2, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc13) + tt.store %3, %cst_1, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc16) + } loc(#loc14) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_12 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc75) + %d_ptr_13 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %m = tt.load %m_ptr_12 : !tt.ptr loc(#loc77) + %d = tt.load %d_ptr_13 : !tt.ptr loc(#loc78) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc79) + %grad_output_14 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc80) + %offsets_15 = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #blocked> loc(#loc81) + %mask_16 = tt.splat %n_cols : i32 -> tensor<32768xi32, #blocked> loc(#loc82) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc83) + %target_grad_sum = tt.splat %grad_output_14 : f32 -> tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_17 = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_19 = %cst_0) -> (f32) : i32 { + %offsets_20 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc86) + %offsets_21 = arith.addi %offsets_20, %offsets_15 : tensor<32768xi32, #blocked> loc(#loc86) + %mask_22 = arith.cmpi slt, %offsets_21, %mask_16 : tensor<32768xi32, #blocked> loc(#loc82) + %target_block_23 = tt.addptr %target_block, %offsets_21 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc83) + %target_block_24 = tt.load %target_block_23, %mask_22, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc87) + %target_grad_sum_25 = arith.mulf %target_block_24, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc84) + %target_grad_sum_26 = arith.select %mask_22, %target_grad_sum_25, %cst : tensor<32768xi1, #blocked>, tensor<32768xf32, #blocked> loc(#loc88) + %target_grad_sum_27 = tt.reshape %target_grad_sum_26 allow_reorder : tensor<32768xf32, #blocked> -> tensor<32768xf32, #blocked1> loc(#loc105) + %target_grad_sum_28 = "tt.reduce"(%target_grad_sum_27) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_30: f32 loc(callsite(#loc1 at #loc89)), %target_grad_sum_31: f32 loc(callsite(#loc1 at #loc89))): + %target_grad_sum_32 = arith.addf %target_grad_sum_30, %target_grad_sum_31 : f32 loc(#loc107) + tt.reduce.return %target_grad_sum_32 : f32 loc(#loc105) + }) : (tensor<32768xf32, #blocked1>) -> f32 loc(#loc105) + %target_grad_sum_29 = arith.addf %target_grad_sum_19, %target_grad_sum_28 : f32 loc(#loc90) + scf.yield %target_grad_sum_29 : f32 loc(#loc36) + } loc(#loc85) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr, #blocked> loc(#loc91) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_18 = tt.splat %d : f32 -> tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad = tt.splat %target_grad_sum_17 : f32 -> tensor<32768xf32, #blocked> loc(#loc94) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets_19 = tt.splat %i : i32 -> tensor<32768xi32, #blocked> loc(#loc95) + %offsets_20 = arith.addi %offsets_19, %offsets_15 : tensor<32768xi32, #blocked> loc(#loc95) + %mask_21 = arith.cmpi slt, %offsets_20, %mask_16 : tensor<32768xi32, #blocked> loc(#loc96) + %logits_block_22 = tt.addptr %logits_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc91) + %logits_block_23 = tt.load %logits_block_22, %mask_21, %cst_1 : tensor<32768x!tt.ptr, #blocked> loc(#loc97) + %logits_block_24 = arith.extf %logits_block_23 : tensor<32768xbf16, #blocked> to tensor<32768xf32, #blocked> loc(#loc98) + %target_block_25 = tt.addptr %target_block, %offsets_20 : tensor<32768x!tt.ptr, #blocked>, tensor<32768xi32, #blocked> loc(#loc99) + %target_block_26 = tt.load %target_block_25, %mask_21, %cst : tensor<32768x!tt.ptr, #blocked> loc(#loc100) + %softmax_prob_27 = arith.subf %logits_block_24, %softmax_prob : tensor<32768xf32, #blocked> loc(#loc92) + %softmax_prob_28 = math.exp %softmax_prob_27 : tensor<32768xf32, #blocked> loc(#loc101) + %softmax_prob_29 = arith.divf %softmax_prob_28, %softmax_prob_18 : tensor<32768xf32, #blocked> loc(#loc93) + %normalized_grad_30 = arith.mulf %softmax_prob_29, %normalized_grad : tensor<32768xf32, #blocked> loc(#loc94) + %grad_block = arith.mulf %target_block_26, %target_grad_sum : tensor<32768xf32, #blocked> loc(#loc102) + %grad_block_31 = arith.subf %grad_block, %normalized_grad_30 : tensor<32768xf32, #blocked> loc(#loc103) + %grad_block_32 = arith.subf %cst, %grad_block_31 : tensor<32768xf32, #blocked> loc(#loc104) + %3 = arith.truncf %grad_block_32 : tensor<32768xf32, #blocked> to tensor<32768xbf16, #blocked> loc(#loc52) + tt.store %logits_block_22, %3, %mask_21 : tensor<32768x!tt.ptr, #blocked> loc(#loc52) + } loc(#loc41) + tt.return loc(#loc53) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc64 = loc("program_id"(#loc2)) +#loc65 = loc("program_id"(#loc3)) +#loc66 = loc("logits_ptr"(#loc4)) +#loc67 = loc("logits_ptr"(#loc5)) +#loc68 = loc("target_ptr"(#loc6)) +#loc69 = loc("target_ptr"(#loc7)) +#loc70 = loc("position_mask_ptr"(#loc8)) +#loc71 = loc("position_mask"(#loc9)) +#loc72 = loc("offsets"(#loc11)) +#loc73 = loc("mask"(#loc12)) +#loc74 = loc("offsets"(#loc15)) +#loc75 = loc("m_ptr"(#loc18)) +#loc76 = loc("d_ptr"(#loc19)) +#loc77 = loc("m"(#loc20)) +#loc78 = loc("d"(#loc21)) +#loc79 = loc("grad_output"(#loc22)) +#loc80 = loc("grad_output"(#loc23)) +#loc81 = loc("offsets"(#loc24)) +#loc82 = loc("mask"(#loc25)) +#loc83 = loc("target_block"(#loc26)) +#loc84 = loc("target_grad_sum"(#loc27)) +#loc85 = loc("target_grad_sum"(#loc28)) +#loc86 = loc("offsets"(#loc29)) +#loc87 = loc("target_block"(#loc30)) +#loc88 = loc("target_grad_sum"(#loc31)) +#loc90 = loc("target_grad_sum"(#loc35)) +#loc91 = loc("logits_block"(#loc37)) +#loc92 = loc("softmax_prob"(#loc38)) +#loc93 = loc("softmax_prob"(#loc39)) +#loc94 = loc("normalized_grad"(#loc40)) +#loc95 = loc("offsets"(#loc42)) +#loc96 = loc("mask"(#loc43)) +#loc97 = loc("logits_block"(#loc44)) +#loc98 = loc("logits_block"(#loc45)) +#loc99 = loc("target_block"(#loc46)) +#loc100 = loc("target_block"(#loc47)) +#loc101 = loc("softmax_prob"(#loc48)) +#loc102 = loc("grad_block"(#loc49)) +#loc103 = loc("grad_block"(#loc50)) +#loc104 = loc("grad_block"(#loc51)) +#loc105 = loc(callsite(#loc32 at #loc89)) +#loc107 = loc(callsite(#loc34 at #loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir new file mode 100644 index 0000000000000000000000000000000000000000..b5bc4459f3fc35c5d5c780663109ce8a4c361d3d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QFKUJZNM4B4WZ6YCXLCLTXWGUKQRBKMH2LTZ7GNQ5C5WM6AH5V6A/log_softmax_backward_kernel.ttir @@ -0,0 +1,203 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":114:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:34) +#loc55 = loc("logits_ptr"(#loc)) +#loc56 = loc("logits_stride"(#loc)) +#loc57 = loc("target_ptr"(#loc)) +#loc58 = loc("target_stride"(#loc)) +#loc59 = loc("position_mask_ptr"(#loc)) +#loc60 = loc("grad_output_ptr"(#loc)) +#loc61 = loc("scaling_factor"(#loc)) +#loc62 = loc("m_ptr"(#loc)) +#loc63 = loc("d_ptr"(#loc)) +#loc64 = loc("n_cols"(#loc)) +#loc90 = loc("target_grad_sum"(#loc33)) +#loc108 = loc(callsite(#loc1 at #loc90)) +module { + tt.func public @log_softmax_backward_kernel(%logits_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("logits_ptr"(#loc)), %logits_stride: i32 {tt.divisibility = 16 : i32} loc("logits_stride"(#loc)), %target_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("target_ptr"(#loc)), %target_stride: i32 {tt.divisibility = 16 : i32} loc("target_stride"(#loc)), %position_mask_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("position_mask_ptr"(#loc)), %grad_output_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("grad_output_ptr"(#loc)), %scaling_factor: f32 loc("scaling_factor"(#loc)), %m_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("m_ptr"(#loc)), %d_ptr: !tt.ptr {tt.divisibility = 16 : i32} loc("d_ptr"(#loc)), %n_cols: i32 {tt.divisibility = 16 : i32} loc("n_cols"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32768xbf16> loc(#loc1) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32768xf32> loc(#loc1) + %cst_1 = arith.constant 0.000000e+00 : f32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %position_mask = arith.constant 0 : i8 loc(#loc65) + %program_id = tt.get_program_id x : i32 loc(#loc66) + %program_id_2 = arith.extsi %program_id : i32 to i64 loc(#loc67) + %logits_ptr_3 = arith.extsi %logits_stride : i32 to i64 loc(#loc68) + %logits_ptr_4 = arith.muli %program_id_2, %logits_ptr_3 : i64 loc(#loc68) + %logits_ptr_5 = tt.addptr %logits_ptr, %logits_ptr_4 : !tt.ptr, i64 loc(#loc69) + %target_ptr_6 = arith.extsi %target_stride : i32 to i64 loc(#loc70) + %target_ptr_7 = arith.muli %program_id_2, %target_ptr_6 : i64 loc(#loc70) + %target_ptr_8 = tt.addptr %target_ptr, %target_ptr_7 : !tt.ptr, i64 loc(#loc71) + %position_mask_ptr_9 = tt.addptr %position_mask_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc72) + %position_mask_10 = tt.bitcast %position_mask_ptr_9 : !tt.ptr -> !tt.ptr loc(#loc65) + %position_mask_11 = tt.load %position_mask_10 : !tt.ptr loc(#loc65) + %position_mask_12 = arith.cmpi ne, %position_mask_11, %position_mask : i8 loc(#loc65) + %0 = arith.extui %position_mask_12 : i1 to i32 loc(#loc10) + %1 = arith.cmpi eq, %0, %c0_i32 : i32 loc(#loc10) + cf.cond_br %1, ^bb1, ^bb2 loc(#loc10) + ^bb1: // pred: ^bb0 + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc73) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc74) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc74) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc75) + %mask_18 = arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc75) + %2 = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc15) + %3 = tt.addptr %2, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc15) + tt.store %3, %cst, %mask_18 : tensor<32768x!tt.ptr> loc(#loc16) + } loc(#loc11) + tt.return loc(#loc17) + ^bb2: // pred: ^bb0 + %m_ptr_13 = tt.addptr %m_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc76) + %d_ptr_14 = tt.addptr %d_ptr, %program_id_2 : !tt.ptr, i64 loc(#loc77) + %m = tt.load %m_ptr_13 : !tt.ptr loc(#loc78) + %d = tt.load %d_ptr_14 : !tt.ptr loc(#loc79) + %grad_output = tt.load %grad_output_ptr : !tt.ptr loc(#loc80) + %grad_output_15 = arith.mulf %grad_output, %scaling_factor : f32 loc(#loc81) + %target_grad_sum = scf.for %i = %c0_i32 to %n_cols step %c32768_i32 iter_args(%target_grad_sum_16 = %cst_1) -> (f32) : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc83) + %offsets_17 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc84) + %offsets_18 = arith.addi %offsets_17, %offsets : tensor<32768xi32> loc(#loc84) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc85) + %mask_19 = arith.cmpi slt, %offsets_18, %mask : tensor<32768xi32> loc(#loc85) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc86) + %target_block_20 = tt.addptr %target_block, %offsets_18 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc86) + %target_block_21 = tt.load %target_block_20, %mask_19, %cst_0 : tensor<32768x!tt.ptr> loc(#loc87) + %target_grad_sum_22 = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc88) + %target_grad_sum_23 = arith.mulf %target_block_21, %target_grad_sum_22 : tensor<32768xf32> loc(#loc88) + %target_grad_sum_24 = arith.select %mask_19, %target_grad_sum_23, %cst_0 : tensor<32768xi1>, tensor<32768xf32> loc(#loc89) + %target_grad_sum_25 = tt.reshape %target_grad_sum_24 allow_reorder : tensor<32768xf32> -> tensor<32768xf32> loc(#loc107) + %target_grad_sum_26 = "tt.reduce"(%target_grad_sum_25) <{axis = 0 : i32}> ({ + ^bb0(%target_grad_sum_28: f32 loc(callsite(#loc1 at #loc90)), %target_grad_sum_29: f32 loc(callsite(#loc1 at #loc90))): + %target_grad_sum_30 = arith.addf %target_grad_sum_28, %target_grad_sum_29 : f32 loc(#loc109) + tt.reduce.return %target_grad_sum_30 : f32 loc(#loc107) + }) : (tensor<32768xf32>) -> f32 loc(#loc107) + %target_grad_sum_27 = arith.addf %target_grad_sum_16, %target_grad_sum_26 : f32 loc(#loc91) + scf.yield %target_grad_sum_27 : f32 loc(#loc36) + } loc(#loc82) + scf.for %i = %c0_i32 to %n_cols step %c32768_i32 : i32 { + %offsets = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc92) + %offsets_16 = tt.splat %i : i32 -> tensor<32768xi32> loc(#loc93) + %offsets_17 = arith.addi %offsets_16, %offsets : tensor<32768xi32> loc(#loc93) + %mask = tt.splat %n_cols : i32 -> tensor<32768xi32> loc(#loc94) + %mask_18 = arith.cmpi slt, %offsets_17, %mask : tensor<32768xi32> loc(#loc94) + %logits_block = tt.splat %logits_ptr_5 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc95) + %logits_block_19 = tt.addptr %logits_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc95) + %logits_block_20 = tt.load %logits_block_19, %mask_18, %cst : tensor<32768x!tt.ptr> loc(#loc96) + %logits_block_21 = arith.extf %logits_block_20 : tensor<32768xbf16> to tensor<32768xf32> loc(#loc97) + %target_block = tt.splat %target_ptr_8 : !tt.ptr -> tensor<32768x!tt.ptr> loc(#loc98) + %target_block_22 = tt.addptr %target_block, %offsets_17 : tensor<32768x!tt.ptr>, tensor<32768xi32> loc(#loc98) + %target_block_23 = tt.load %target_block_22, %mask_18, %cst_0 : tensor<32768x!tt.ptr> loc(#loc99) + %softmax_prob = tt.splat %m : f32 -> tensor<32768xf32> loc(#loc100) + %softmax_prob_24 = arith.subf %logits_block_21, %softmax_prob : tensor<32768xf32> loc(#loc100) + %softmax_prob_25 = math.exp %softmax_prob_24 : tensor<32768xf32> loc(#loc101) + %softmax_prob_26 = tt.splat %d : f32 -> tensor<32768xf32> loc(#loc102) + %softmax_prob_27 = arith.divf %softmax_prob_25, %softmax_prob_26 : tensor<32768xf32> loc(#loc102) + %normalized_grad = tt.splat %target_grad_sum : f32 -> tensor<32768xf32> loc(#loc103) + %normalized_grad_28 = arith.mulf %softmax_prob_27, %normalized_grad : tensor<32768xf32> loc(#loc103) + %grad_block = tt.splat %grad_output_15 : f32 -> tensor<32768xf32> loc(#loc104) + %grad_block_29 = arith.mulf %target_block_23, %grad_block : tensor<32768xf32> loc(#loc104) + %grad_block_30 = arith.subf %grad_block_29, %normalized_grad_28 : tensor<32768xf32> loc(#loc105) + %grad_block_31 = arith.subf %cst_0, %grad_block_30 : tensor<32768xf32> loc(#loc106) + %2 = arith.truncf %grad_block_31 : tensor<32768xf32> to tensor<32768xbf16> loc(#loc53) + tt.store %logits_block_19, %2, %mask_18 : tensor<32768x!tt.ptr> loc(#loc53) + } loc(#loc37) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":132:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:31) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":127:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":128:18) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":129:18) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":130:25) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":133:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":134:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":135:26) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":136:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":137:43) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":138:8) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":140:13) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":141:13) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":142:16) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":143:16) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":144:26) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":145:32) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":149:30) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":150:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":151:25) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:44) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":152:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:77) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:27) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":155:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:35) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":159:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":160:25) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:44) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":161:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":162:12) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:44) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":164:31) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:45) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:30) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":167:50) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":168:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:38) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":169:23) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":170:39) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/specforge/core/loss.py":158:4) +#loc65 = loc("position_mask"(#loc2)) +#loc66 = loc("program_id"(#loc3)) +#loc67 = loc("program_id"(#loc4)) +#loc68 = loc("logits_ptr"(#loc5)) +#loc69 = loc("logits_ptr"(#loc6)) +#loc70 = loc("target_ptr"(#loc7)) +#loc71 = loc("target_ptr"(#loc8)) +#loc72 = loc("position_mask_ptr"(#loc9)) +#loc73 = loc("offsets"(#loc12)) +#loc74 = loc("offsets"(#loc13)) +#loc75 = loc("mask"(#loc14)) +#loc76 = loc("m_ptr"(#loc18)) +#loc77 = loc("d_ptr"(#loc19)) +#loc78 = loc("m"(#loc20)) +#loc79 = loc("d"(#loc21)) +#loc80 = loc("grad_output"(#loc22)) +#loc81 = loc("grad_output"(#loc23)) +#loc82 = loc("target_grad_sum"(#loc24)) +#loc83 = loc("offsets"(#loc25)) +#loc84 = loc("offsets"(#loc26)) +#loc85 = loc("mask"(#loc27)) +#loc86 = loc("target_block"(#loc28)) +#loc87 = loc("target_block"(#loc29)) +#loc88 = loc("target_grad_sum"(#loc30)) +#loc89 = loc("target_grad_sum"(#loc31)) +#loc91 = loc("target_grad_sum"(#loc35)) +#loc92 = loc("offsets"(#loc38)) +#loc93 = loc("offsets"(#loc39)) +#loc94 = loc("mask"(#loc40)) +#loc95 = loc("logits_block"(#loc41)) +#loc96 = loc("logits_block"(#loc42)) +#loc97 = loc("logits_block"(#loc43)) +#loc98 = loc("target_block"(#loc44)) +#loc99 = loc("target_block"(#loc45)) +#loc100 = loc("softmax_prob"(#loc46)) +#loc101 = loc("softmax_prob"(#loc47)) +#loc102 = loc("softmax_prob"(#loc48)) +#loc103 = loc("normalized_grad"(#loc49)) +#loc104 = loc("grad_block"(#loc50)) +#loc105 = loc("grad_block"(#loc51)) +#loc106 = loc("grad_block"(#loc52)) +#loc107 = loc(callsite(#loc32 at #loc90)) +#loc109 = loc(callsite(#loc34 at #loc107)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/__grp__triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/__grp__triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0d4f915eb061d745fac67cd9f5a33692907d5337 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/__grp__triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.source", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttir", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttgir", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.llir", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ptx", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.cubin", "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..f91758581449479c75ec3af701ce59120275e6b3 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a38cb7d0e582842731e23fcc50beb6f2f1ac9c46 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.json @@ -0,0 +1 @@ +{"hash": "823fe8850cf0ea453e93cad016b9654ca148cb065d57c0e118ad0c6e309ee921", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..e448a5e20e95447d7b1154a5cf93f6702c44bc82 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.llir @@ -0,0 +1,363 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_mean_mul_pow_rsqrt_0(ptr addrspace(1) %0, ptr addrspace(1) %1, double %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !5 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %12 = icmp slt i32 %11, %6, !dbg !9 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %14 = and i32 %13, 31, !dbg !10 + %15 = lshr i32 %13, 5, !dbg !10 + %16 = and i32 %13, 511, !dbg !10 + %17 = or i32 %13, 512, !dbg !10 + %18 = or disjoint i32 %16, 1024, !dbg !10 + %19 = or i32 %13, 1536, !dbg !10 + %20 = zext nneg i32 %11 to i64, !dbg !11 + %21 = mul i64 %5, %20, !dbg !11 + %22 = icmp sgt i32 %7, 0, !dbg !12 + br i1 %22, label %.lr.ph, label %._crit_edge, !dbg !12 + +.lr.ph: ; preds = %10 + %23 = getelementptr bfloat, ptr addrspace(1) %1, i64 %21 + %24 = insertelement <4 x i1> poison, i1 %12, i64 0, !dbg !13 + %25 = shufflevector <4 x i1> %24, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !13 + %26 = insertelement <4 x i32> poison, i32 %16, i64 0, !dbg !14 + %27 = insertelement <4 x i32> %26, i32 %17, i64 1, !dbg !14 + %28 = insertelement <4 x i32> %27, i32 %18, i64 2, !dbg !14 + %29 = insertelement <4 x i32> %28, i32 %19, i64 3, !dbg !14 + %30 = insertelement <4 x i32> poison, i32 %7, i64 0, !dbg !15 + %31 = shufflevector <4 x i32> %30, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !15 + br label %32, !dbg !12 + +32: ; preds = %.lr.ph, %32 + %33 = phi i32 [ 0, %.lr.ph ], [ %73, %32 ] + %34 = phi <4 x float> [ zeroinitializer, %.lr.ph ], [ %72, %32 ] + %35 = insertelement <4 x i32> poison, i32 %33, i64 0, !dbg !14 + %36 = shufflevector <4 x i32> %35, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !14 + %37 = or disjoint <4 x i32> %36, %29, !dbg !14 + %38 = extractelement <4 x i32> %37, i64 0, !dbg !16 + %39 = sext i32 %38 to i64, !dbg !16 + %40 = extractelement <4 x i32> %37, i64 1, !dbg !16 + %41 = sext i32 %40 to i64, !dbg !16 + %42 = extractelement <4 x i32> %37, i64 2, !dbg !16 + %43 = sext i32 %42 to i64, !dbg !16 + %44 = extractelement <4 x i32> %37, i64 3, !dbg !16 + %45 = sext i32 %44 to i64, !dbg !16 + %46 = getelementptr bfloat, ptr addrspace(1) %23, i64 %39, !dbg !17 + %47 = getelementptr bfloat, ptr addrspace(1) %23, i64 %41, !dbg !17 + %48 = getelementptr bfloat, ptr addrspace(1) %23, i64 %43, !dbg !17 + %49 = getelementptr bfloat, ptr addrspace(1) %23, i64 %45, !dbg !17 + %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %51 = icmp slt <4 x i32> %37, %31, !dbg !15 + %52 = and <4 x i1> %25, %51, !dbg !13 + %53 = extractelement <4 x i1> %52, i64 0, !dbg !18 + %54 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %46, i64 %50, i1 %53) #6, !dbg !18 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %56 = extractelement <4 x i1> %52, i64 1, !dbg !18 + %57 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %47, i64 %55, i1 %56) #6, !dbg !18 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %59 = extractelement <4 x i1> %52, i64 2, !dbg !18 + %60 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %48, i64 %58, i1 %59) #6, !dbg !18 + %61 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %62 = extractelement <4 x i1> %52, i64 3, !dbg !18 + %63 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %49, i64 %61, i1 %62) #6, !dbg !18 + %64 = insertelement <4 x i16> poison, i16 %54, i64 0, !dbg !18 + %65 = insertelement <4 x i16> %64, i16 %57, i64 1, !dbg !18 + %66 = insertelement <4 x i16> %65, i16 %60, i64 2, !dbg !18 + %67 = insertelement <4 x i16> %66, i16 %63, i64 3, !dbg !18 + %68 = bitcast <4 x i16> %67 to <4 x bfloat>, !dbg !18 + %69 = fpext <4 x bfloat> %68 to <4 x float>, !dbg !19 + %70 = fmul <4 x float> %69, %69, !dbg !20 + %71 = fadd <4 x float> %34, %70, !dbg !21 + %72 = select <4 x i1> %52, <4 x float> %71, <4 x float> %34, !dbg !22 + %73 = add i32 %33, 2048, !dbg !12 + %74 = icmp slt i32 %73, %7, !dbg !12 + br i1 %74, label %32, label %._crit_edge.loopexit, !dbg !12 + +._crit_edge.loopexit: ; preds = %32 + %shift = shufflevector <4 x float> %72, <4 x float> poison, <4 x i32> , !dbg !23 + %foldExtExtBinop = fadd <4 x float> %72, %shift, !dbg !23 + %shift14 = shufflevector <4 x float> %72, <4 x float> poison, <4 x i32> , !dbg !23 + %foldExtExtBinop15 = fadd <4 x float> %shift14, %foldExtExtBinop, !dbg !23 + %shift17 = shufflevector <4 x float> %72, <4 x float> poison, <4 x i32> , !dbg !23 + %foldExtExtBinop18 = fadd <4 x float> %shift17, %foldExtExtBinop15, !dbg !23 + %75 = extractelement <4 x float> %foldExtExtBinop18, i64 0, !dbg !23 + br label %._crit_edge, !dbg !23 + +._crit_edge: ; preds = %._crit_edge.loopexit, %10 + %76 = phi float [ 0.000000e+00, %10 ], [ %75, %._crit_edge.loopexit ], !dbg !23 + %77 = bitcast float %76 to i32, !dbg !27 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !27 + %79 = bitcast i32 %78 to float, !dbg !27 + %80 = fadd float %76, %79, !dbg !23 + %81 = bitcast float %80 to i32, !dbg !27 + %82 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %81, i32 8, i32 31), !dbg !27 + %83 = bitcast i32 %82 to float, !dbg !27 + %84 = fadd float %80, %83, !dbg !23 + %85 = bitcast float %84 to i32, !dbg !27 + %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 4, i32 31), !dbg !27 + %87 = bitcast i32 %86 to float, !dbg !27 + %88 = fadd float %84, %87, !dbg !23 + %89 = bitcast float %88 to i32, !dbg !27 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 2, i32 31), !dbg !27 + %91 = bitcast i32 %90 to float, !dbg !27 + %92 = fadd float %88, %91, !dbg !23 + %93 = bitcast float %92 to i32, !dbg !27 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 1, i32 31), !dbg !27 + %95 = bitcast i32 %94 to float, !dbg !27 + %96 = fadd float %92, %95, !dbg !23 + %97 = and i32 %15, 15, !dbg !27 + %98 = icmp eq i32 %14, 0, !dbg !27 + %99 = getelementptr float, ptr addrspace(3) @global_smem, i32 %97, !dbg !27 + %100 = bitcast float %96 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %99, <1 x i32> %100, i1 %98) #6, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %101 = icmp samesign ult i32 %13, 16, !dbg !27 + %102 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !27 + %103 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %102, i1 %101) #6, !dbg !27 + %104 = bitcast i32 %103 to float, !dbg !27 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 8, i32 31), !dbg !27 + %106 = bitcast i32 %105 to float, !dbg !27 + %107 = fadd float %104, %106, !dbg !23 + %108 = bitcast float %107 to i32, !dbg !27 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 4, i32 31), !dbg !27 + %110 = bitcast i32 %109 to float, !dbg !27 + %111 = fadd float %107, %110, !dbg !23 + %112 = bitcast float %111 to i32, !dbg !27 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 2, i32 31), !dbg !27 + %114 = bitcast i32 %113 to float, !dbg !27 + %115 = fadd float %111, %114, !dbg !23 + %116 = bitcast float %115 to i32, !dbg !27 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 1, i32 31), !dbg !27 + %118 = bitcast i32 %117 to float, !dbg !27 + %119 = fadd float %115, %118, !dbg !23 + %120 = icmp eq i32 %13, 0, !dbg !27 + %121 = bitcast float %119 to <1 x i32>, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, <1 x i32> %121, i1 %120) #6, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !27 + %122 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !27 + %123 = sitofp i64 %5 to float, !dbg !28 + %124 = tail call float @llvm.nvvm.div.full(float %122, float %123), !dbg !29 + %125 = fptrunc double %2 to float, !dbg !30 + %126 = fadd float %124, %125, !dbg !31 + %127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !32 + %.not.i = icmp eq i32 %127, 0, !dbg !32 + br i1 %.not.i, label %130, label %128, !dbg !32 + +128: ; preds = %._crit_edge + %129 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %126), !dbg !32 + br label %__nv_rsqrtf.exit, !dbg !32 + +130: ; preds = %._crit_edge + %131 = tail call float @llvm.nvvm.rsqrt.approx.f(float %126), !dbg !32 + br label %__nv_rsqrtf.exit, !dbg !32 + +__nv_rsqrtf.exit: ; preds = %128, %130 + %.0.i = phi float [ %129, %128 ], [ %131, %130 ], !dbg !32 + %132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !32 + %.not.i3 = icmp eq i32 %132, 0, !dbg !32 + br i1 %.not.i3, label %135, label %133, !dbg !32 + +133: ; preds = %__nv_rsqrtf.exit + %134 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %126), !dbg !32 + br label %__nv_rsqrtf.exit5, !dbg !32 + +135: ; preds = %__nv_rsqrtf.exit + %136 = tail call float @llvm.nvvm.rsqrt.approx.f(float %126), !dbg !32 + br label %__nv_rsqrtf.exit5, !dbg !32 + +__nv_rsqrtf.exit5: ; preds = %133, %135 + %.0.i4 = phi float [ %134, %133 ], [ %136, %135 ], !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33 + %137 = getelementptr float, ptr addrspace(1) %0, i64 %20, !dbg !34 + %138 = icmp eq i32 %16, 0, !dbg !35 + %139 = bitcast float %.0.i to i32, !dbg !35 + %140 = and i1 %138, %12, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %139, ptr addrspace(1) %137, i1 %140) #6, !dbg !35 + br i1 %22, label %.lr.ph6, label %._crit_edge7, !dbg !36 + +.lr.ph6: ; preds = %__nv_rsqrtf.exit5, %.lr.ph6 + %141 = phi i32 [ %222, %.lr.ph6 ], [ 0, %__nv_rsqrtf.exit5 ] + %142 = or disjoint i32 %141, %16, !dbg !37 + %143 = or disjoint i32 %141, %17, !dbg !37 + %144 = or disjoint i32 %141, %18, !dbg !37 + %145 = or disjoint i32 %141, %19, !dbg !37 + %146 = icmp slt i32 %142, %7, !dbg !38 + %147 = icmp slt i32 %143, %7, !dbg !38 + %148 = icmp slt i32 %144, %7, !dbg !38 + %149 = icmp slt i32 %145, %7, !dbg !38 + %150 = sext i32 %142 to i64, !dbg !39 + %151 = getelementptr bfloat, ptr addrspace(1) %3, i64 %150, !dbg !39 + %152 = sext i32 %143 to i64, !dbg !39 + %153 = getelementptr bfloat, ptr addrspace(1) %3, i64 %152, !dbg !39 + %154 = sext i32 %144 to i64, !dbg !39 + %155 = getelementptr bfloat, ptr addrspace(1) %3, i64 %154, !dbg !39 + %156 = sext i32 %145 to i64, !dbg !39 + %157 = getelementptr bfloat, ptr addrspace(1) %3, i64 %156, !dbg !39 + %158 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %159 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %151, i64 %158, i1 %146) #6, !dbg !40 + %160 = bitcast i16 %159 to bfloat, !dbg !40 + %161 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %162 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %153, i64 %161, i1 %147) #6, !dbg !40 + %163 = bitcast i16 %162 to bfloat, !dbg !40 + %164 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %165 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %155, i64 %164, i1 %148) #6, !dbg !40 + %166 = bitcast i16 %165 to bfloat, !dbg !40 + %167 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !40 + %168 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %157, i64 %167, i1 %149) #6, !dbg !40 + %169 = bitcast i16 %168 to bfloat, !dbg !40 + %170 = fpext bfloat %160 to float, !dbg !41 + %171 = fpext bfloat %163 to float, !dbg !41 + %172 = fpext bfloat %166 to float, !dbg !41 + %173 = fpext bfloat %169 to float, !dbg !41 + %174 = add i64 %21, %150, !dbg !42 + %175 = add i64 %21, %152, !dbg !42 + %176 = add i64 %21, %154, !dbg !42 + %177 = add i64 %21, %156, !dbg !42 + %178 = getelementptr bfloat, ptr addrspace(1) %1, i64 %174, !dbg !43 + %179 = getelementptr bfloat, ptr addrspace(1) %1, i64 %175, !dbg !43 + %180 = getelementptr bfloat, ptr addrspace(1) %1, i64 %176, !dbg !43 + %181 = getelementptr bfloat, ptr addrspace(1) %1, i64 %177, !dbg !43 + %182 = and i1 %12, %146, !dbg !44 + %183 = and i1 %12, %147, !dbg !44 + %184 = and i1 %12, %148, !dbg !44 + %185 = and i1 %12, %149, !dbg !44 + %186 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %187 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %178, i64 %186, i1 %182) #6, !dbg !45 + %188 = bitcast i16 %187 to bfloat, !dbg !45 + %189 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %190 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %179, i64 %189, i1 %183) #6, !dbg !45 + %191 = bitcast i16 %190 to bfloat, !dbg !45 + %192 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %193 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %180, i64 %192, i1 %184) #6, !dbg !45 + %194 = bitcast i16 %193 to bfloat, !dbg !45 + %195 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !45 + %196 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %181, i64 %195, i1 %185) #6, !dbg !45 + %197 = bitcast i16 %196 to bfloat, !dbg !45 + %198 = fpext bfloat %188 to float, !dbg !46 + %199 = fpext bfloat %191 to float, !dbg !46 + %200 = fpext bfloat %194 to float, !dbg !46 + %201 = fpext bfloat %197 to float, !dbg !46 + %202 = fmul float %.0.i4, %198, !dbg !47 + %203 = fmul float %.0.i4, %199, !dbg !47 + %204 = fmul float %.0.i4, %200, !dbg !47 + %205 = fmul float %.0.i4, %201, !dbg !47 + %206 = fmul float %202, %170, !dbg !48 + %207 = fmul float %203, %171, !dbg !48 + %208 = fmul float %204, %172, !dbg !48 + %209 = fmul float %205, %173, !dbg !48 + %210 = getelementptr bfloat, ptr addrspace(1) %4, i64 %174, !dbg !49 + %211 = getelementptr bfloat, ptr addrspace(1) %4, i64 %175, !dbg !49 + %212 = getelementptr bfloat, ptr addrspace(1) %4, i64 %176, !dbg !49 + %213 = getelementptr bfloat, ptr addrspace(1) %4, i64 %177, !dbg !49 + %214 = fptrunc float %206 to bfloat, !dbg !50 + %215 = fptrunc float %207 to bfloat, !dbg !50 + %216 = fptrunc float %208 to bfloat, !dbg !50 + %217 = fptrunc float %209 to bfloat, !dbg !50 + %218 = bitcast bfloat %214 to i16, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %218, ptr addrspace(1) %210, i1 %182) #6, !dbg !50 + %219 = bitcast bfloat %215 to i16, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %219, ptr addrspace(1) %211, i1 %183) #6, !dbg !50 + %220 = bitcast bfloat %216 to i16, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %220, ptr addrspace(1) %212, i1 %184) #6, !dbg !50 + %221 = bitcast bfloat %217 to i16, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %221, ptr addrspace(1) %213, i1 %185) #6, !dbg !50 + %222 = add i32 %141, 2048, !dbg !36 + %223 = icmp slt i32 %222, %7, !dbg !36 + br i1 %223, label %.lr.ph6, label %._crit_edge7, !dbg !36 + +._crit_edge7: ; preds = %.lr.ph6, %__nv_rsqrtf.exit5 + ret void, !dbg !51 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0", linkageName: "triton_red_fused__to_copy_mean_mul_pow_rsqrt_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 28, scope: !5) +!9 = !DILocation(line: 23, column: 21, scope: !5) +!10 = !DILocation(line: 24, column: 37, scope: !5) +!11 = !DILocation(line: 34, column: 45, scope: !5) +!12 = !DILocation(line: 28, column: 40, scope: !5) +!13 = !DILocation(line: 34, column: 60, scope: !5) +!14 = !DILocation(line: 29, column: 31, scope: !5) +!15 = !DILocation(line: 30, column: 29, scope: !5) +!16 = !DILocation(line: 34, column: 41, scope: !5) +!17 = !DILocation(line: 34, column: 34, scope: !5) +!18 = !DILocation(line: 34, column: 50, scope: !5) +!19 = !DILocation(line: 34, column: 111, scope: !5) +!20 = !DILocation(line: 36, column: 22, scope: !5) +!21 = !DILocation(line: 38, column: 23, scope: !5) +!22 = !DILocation(line: 39, column: 48, scope: !5) +!23 = !DILocation(line: 261, column: 15, scope: !24, inlinedAt: !26) +!24 = distinct !DILexicalBlockFile(scope: !5, file: !25, discriminator: 0) +!25 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!26 = !DILocation(line: 40, column: 25, scope: !5) +!27 = !DILocation(line: 291, column: 36, scope: !24, inlinedAt: !26) +!28 = !DILocation(line: 43, column: 19, scope: !5) +!29 = !DILocation(line: 44, column: 19, scope: !5) +!30 = !DILocation(line: 45, column: 20, scope: !5) +!31 = !DILocation(line: 46, column: 19, scope: !5) +!32 = !DILocation(line: 47, column: 28, scope: !5) +!33 = !DILocation(line: 48, column: 4, scope: !5) +!34 = !DILocation(line: 49, column: 28, scope: !5) +!35 = !DILocation(line: 49, column: 40, scope: !5) +!36 = !DILocation(line: 50, column: 40, scope: !5) +!37 = !DILocation(line: 51, column: 31, scope: !5) +!38 = !DILocation(line: 52, column: 29, scope: !5) +!39 = !DILocation(line: 56, column: 35, scope: !5) +!40 = !DILocation(line: 56, column: 42, scope: !5) +!41 = !DILocation(line: 56, column: 95, scope: !5) +!42 = !DILocation(line: 57, column: 42, scope: !5) +!43 = !DILocation(line: 57, column: 35, scope: !5) +!44 = !DILocation(line: 57, column: 61, scope: !5) +!45 = !DILocation(line: 57, column: 51, scope: !5) +!46 = !DILocation(line: 57, column: 113, scope: !5) +!47 = !DILocation(line: 59, column: 24, scope: !5) +!48 = !DILocation(line: 61, column: 24, scope: !5) +!49 = !DILocation(line: 62, column: 29, scope: !5) +!50 = !DILocation(line: 62, column: 52, scope: !5) +!51 = !DILocation(line: 50, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6d08190502d7eb09470ce5916e84b25b7a5bed51 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ptx @@ -0,0 +1,673 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_mean_mul_pow_rsqrt_0 // -- Begin function triton_red_fused__to_copy_mean_mul_pow_rsqrt_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__to_copy_mean_mul_pow_rsqrt_0 +.visible .entry triton_red_fused__to_copy_mean_mul_pow_rsqrt_0( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_1, + .param .f64 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_4, + .param .u64 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_5, + .param .u32 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_6, + .param .u32 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_9 +) +.reqntid 512 +{ + .reg .pred %p<36>; + .reg .b16 %rs<33>; + .reg .b32 %r<117>; + .reg .b64 %rd<79>; + .loc 1 18 0 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:18:0 + +// %bb.0: + ld.param.b32 %r24, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_7]; + ld.param.b32 %r23, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_6]; + ld.param.b64 %rd13, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_5]; + ld.param.b64 %rd10, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_2]; + ld.param.b64 %rd9, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_1]; + ld.param.b64 %rd8, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_0]; +$L__tmp0: + .loc 1 21 28 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:21:28 + mov.u32 %r26, %ctaid.x; + .loc 1 24 37 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:24:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shr.u32 %r3, %r1, 5; + and.b32 %r4, %r1, 511; + or.b32 %r5, %r1, 512; + or.b32 %r7, %r1, 1536; + .loc 1 34 45 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:45 + cvt.u64.u32 %rd1, %r26; + mul.lo.s64 %rd2, %rd13, %rd1; + .loc 1 28 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:28:40 + setp.lt.s32 %p5, %r24, 1; + mov.b32 %r115, 0f00000000; + cvt.u32.u64 %r113, %rd1; + @%p5 bra $L__BB0_4; +// %bb.1: // %.lr.ph + .loc 1 0 0 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:0 + or.b32 %r6, %r4, 1024; + .loc 1 23 21 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:23:21 + setp.lt.s32 %p1, %r113, %r23; + shl.b64 %rd16, %rd2, 1; + add.s64 %rd3, %rd9, %rd16; + mov.b32 %r29, 0f00000000; + mov.b64 %rd77, {%r29, %r29}; + mov.b32 %r114, 0; + mov.b64 %rd78, %rd77; +$L__BB0_2: // =>This Inner Loop Header: Depth=1 + .loc 1 29 31 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:29:31 + or.b32 %r30, %r114, %r7; + or.b32 %r31, %r114, %r6; + or.b32 %r32, %r114, %r5; + or.b32 %r33, %r114, %r4; + .loc 1 34 34 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:34 + mad.wide.s32 %rd18, %r33, 2, %rd3; + mad.wide.s32 %rd21, %r32, 2, %rd3; + mad.wide.s32 %rd24, %r31, 2, %rd3; + mad.wide.s32 %rd27, %r30, 2, %rd3; + .loc 1 34 50 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:50 + // begin inline asm + mov.u64 %rd17, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd17, 1.0; + // end inline asm + .loc 1 30 29 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:30:29 + setp.lt.s32 %p10, %r33, %r24; + setp.lt.s32 %p11, %r32, %r24; + setp.lt.s32 %p12, %r31, %r24; + setp.lt.s32 %p13, %r30, %r24; + .loc 1 34 60 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:60 + and.pred %p9, %p1, %p13; + and.pred %p8, %p1, %p12; + and.pred %p7, %p1, %p11; + and.pred %p6, %p1, %p10; + mov.b16 %rs2, 0; + .loc 1 34 50 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:50 + // begin inline asm + mov.u16 %rs1, %rs2; + @%p6 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd18 + 0 ], %rd17; + // end inline asm + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd21 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd24 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd26, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd27 + 0 ], %rd26; + // end inline asm + mov.b32 %r34, {%rs5, %rs7}; + mov.b32 %r35, {%rs1, %rs3}; + .loc 1 34 111 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:34:111 + mov.b32 {%rs9, %rs10}, %r35; + cvt.f32.bf16 %r36, %rs10; + cvt.f32.bf16 %r37, %rs9; + mov.b32 {%rs11, %rs12}, %r34; + cvt.f32.bf16 %r38, %rs12; + cvt.f32.bf16 %r39, %rs11; + .loc 1 38 23 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:38:23 + mov.b64 {%r40, %r41}, %rd78; + fma.rn.f32 %r42, %r39, %r39, %r40; + fma.rn.f32 %r43, %r38, %r38, %r41; + mov.b64 {%r44, %r45}, %rd77; + fma.rn.f32 %r46, %r37, %r37, %r44; + fma.rn.f32 %r47, %r36, %r36, %r45; + .loc 1 39 48 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:39:48 + selp.f32 %r48, %r47, %r45, %p7; + selp.f32 %r49, %r46, %r44, %p6; + mov.b64 %rd77, {%r49, %r48}; + selp.f32 %r50, %r43, %r41, %p9; + selp.f32 %r51, %r42, %r40, %p8; + mov.b64 %rd78, {%r51, %r50}; + .loc 1 28 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:28:40 + add.s32 %r114, %r114, 2048; + setp.lt.s32 %p14, %r114, %r24; + @%p14 bra $L__BB0_2; +// %bb.3: // %._crit_edge.loopexit +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + mov.b64 {%r52, %r53}, %rd77; + add.f32 %r54, %r52, %r53; + mov.b64 {%r55, %r56}, %rd78; + add.f32 %r57, %r55, %r54; + add.f32 %r115, %r56, %r57; +$L__tmp2: +$L__BB0_4: // %._crit_edge + .loc 1 23 21 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:23:21 + setp.lt.s32 %p20, %r113, %r23; +$L__tmp3: + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r66, %r115, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r67, %r115, %r66; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r68, %r67, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r69, %r67, %r68; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r70, %r69, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r71, %r69, %r70; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r72, %r71, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r73, %r71, %r72; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r59, %r73, %r74; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + and.b32 %r75, %r3, 15; + setp.eq.b32 %p15, %r2, 0; + shl.b32 %r76, %r75, 2; + mov.b32 %r77, global_smem; + add.s32 %r58, %r77, %r76; + // begin inline asm + @%p15 st.shared.b32 [ %r58 + 0 ], %r59; + // end inline asm + bar.sync 0; + setp.lt.u32 %p16, %r1, 16; + shl.b32 %r78, %r1, 2; + add.s32 %r61, %r77, %r78; + // begin inline asm + @%p16 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r79, %r60, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r80, %r60, %r79; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r81, %r80, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r82, %r80, %r81; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r83, %r82, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r84, %r82, %r83; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + shfl.sync.bfly.b32 %r85, %r84, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + add.f32 %r63, %r84, %r85; + .loc 2 291 36 // standard.py:291:36 @[ chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:40:25 ] + setp.eq.b32 %p17, %r1, 0; + // begin inline asm + @%p17 st.shared.b32 [ %r61 + 0 ], %r63; + // end inline asm + bar.sync 0; + ld.shared.b32 %r86, [global_smem]; +$L__tmp4: + .loc 1 43 19 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:43:19 + cvt.rn.f32.s64 %r87, %rd13; + .loc 1 44 19 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:44:19 + div.full.f32 %r88, %r86, %r87; + .loc 1 45 20 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:45:20 + cvt.rn.f32.f64 %r89, %rd10; + .loc 1 46 19 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:46:19 + add.f32 %r90, %r88, %r89; + .loc 1 47 28 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:47:28 + rsqrt.approx.ftz.f32 %r64, %r90; + .loc 1 48 4 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:48:4 + bar.sync 0; + .loc 1 49 28 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:49:28 + shl.b64 %rd30, %rd1, 2; + add.s64 %rd29, %rd8, %rd30; + .loc 1 49 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:49:40 + setp.eq.b32 %p21, %r4, 0; + and.pred %p18, %p21, %p20; + // begin inline asm + @%p18 st.global.b32 [ %rd29 + 0 ], { %r64 }; + // end inline asm + .loc 1 50 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:50:40 + @%p5 bra $L__BB0_7; +// %bb.5: // %.lr.ph6.preheader + .loc 1 0 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:0:40 + ld.param.b64 %rd12, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_4]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_mean_mul_pow_rsqrt_0_param_3]; + mov.b32 %r116, 0; + cvt.u64.u32 %rd62, %r4; +$L__BB0_6: // %.lr.ph6 + // =>This Inner Loop Header: Depth=1 + .loc 1 51 31 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:51:31 + add.s32 %r93, %r4, %r116; + add.s32 %r94, %r5, %r116; + add.s32 %r95, %r93, 1024; + .loc 1 52 29 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:52:29 + add.s32 %r96, %r7, %r116; + setp.lt.s32 %p22, %r93, %r24; + setp.lt.s32 %p23, %r94, %r24; + setp.lt.s32 %p24, %r95, %r24; + setp.lt.s32 %p25, %r96, %r24; + .loc 1 56 35 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:56:35 + cvt.s64.s32 %rd59, %r93; + mad.wide.s32 %rd32, %r93, 2, %rd11; + cvt.s64.s32 %rd60, %r94; + mad.wide.s32 %rd35, %r94, 2, %rd11; + cvt.s64.s32 %rd61, %r116; + add.s64 %rd63, %rd61, %rd62; + shl.b64 %rd64, %rd63, 1; + add.s64 %rd65, %rd11, %rd64; + add.s64 %rd38, %rd65, 2048; + cvt.s64.s32 %rd66, %r96; + mad.wide.s32 %rd41, %r96, 2, %rd11; + .loc 1 56 42 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:56:42 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + mov.b16 %rs14, 0; + // begin inline asm + mov.u16 %rs13, %rs14; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd32 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs14; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd35 + 0 ], %rd34; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs14; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd38 + 0 ], %rd37; + // end inline asm + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs14; + @%p25 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd41 + 0 ], %rd40; + // end inline asm + .loc 1 56 95 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:56:95 + cvt.f32.bf16 %r97, %rs13; + cvt.f32.bf16 %r98, %rs15; + cvt.f32.bf16 %r99, %rs17; + cvt.f32.bf16 %r100, %rs19; + .loc 1 57 42 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:57:42 + add.s64 %rd67, %rd2, %rd59; + add.s64 %rd68, %rd2, %rd60; + add.s64 %rd69, %rd2, %rd66; + .loc 1 57 35 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:57:35 + shl.b64 %rd70, %rd67, 1; + add.s64 %rd44, %rd9, %rd70; + shl.b64 %rd71, %rd68, 1; + add.s64 %rd47, %rd9, %rd71; + add.s64 %rd72, %rd2, %rd63; + shl.b64 %rd73, %rd72, 1; + add.s64 %rd74, %rd9, %rd73; + add.s64 %rd50, %rd74, 2048; + shl.b64 %rd75, %rd69, 1; + add.s64 %rd53, %rd9, %rd75; + .loc 1 57 61 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:57:61 + and.pred %p26, %p20, %p22; + and.pred %p27, %p20, %p23; + and.pred %p28, %p20, %p24; + and.pred %p29, %p20, %p25; + .loc 1 57 51 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:57:51 + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs14; + @%p26 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs21 }, [ %rd44 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs14; + @%p27 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs23 }, [ %rd47 + 0 ], %rd46; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs25, %rs14; + @%p28 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs25 }, [ %rd50 + 0 ], %rd49; + // end inline asm + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs14; + @%p29 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs27 }, [ %rd53 + 0 ], %rd52; + // end inline asm + .loc 1 57 113 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:57:113 + cvt.f32.bf16 %r101, %rs21; + cvt.f32.bf16 %r102, %rs23; + cvt.f32.bf16 %r103, %rs25; + cvt.f32.bf16 %r104, %rs27; + .loc 1 59 24 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:59:24 + mul.f32 %r105, %r64, %r101; + mul.f32 %r106, %r64, %r102; + mul.f32 %r107, %r64, %r103; + mul.f32 %r108, %r64, %r104; + .loc 1 61 24 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:61:24 + mul.f32 %r109, %r105, %r97; + mul.f32 %r110, %r106, %r98; + mul.f32 %r111, %r107, %r99; + mul.f32 %r112, %r108, %r100; + .loc 1 62 29 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:62:29 + add.s64 %rd55, %rd12, %rd70; + add.s64 %rd56, %rd12, %rd71; + add.s64 %rd76, %rd12, %rd73; + add.s64 %rd57, %rd76, 2048; + add.s64 %rd58, %rd12, %rd75; + .loc 1 62 52 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:62:52 + cvt.rn.bf16.f32 %rs29, %r109; + cvt.rn.bf16.f32 %rs30, %r110; + cvt.rn.bf16.f32 %rs31, %r111; + cvt.rn.bf16.f32 %rs32, %r112; + // begin inline asm + @%p26 st.global.b16 [ %rd55 + 0 ], { %rs29 }; + // end inline asm + // begin inline asm + @%p27 st.global.b16 [ %rd56 + 0 ], { %rs30 }; + // end inline asm + // begin inline asm + @%p28 st.global.b16 [ %rd57 + 0 ], { %rs31 }; + // end inline asm + // begin inline asm + @%p29 st.global.b16 [ %rd58 + 0 ], { %rs32 }; + // end inline asm + .loc 1 50 40 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:50:40 + add.s32 %r116, %r116, 2048; + setp.lt.s32 %p35, %r116, %r24; + @%p35 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge7 + .loc 1 50 4 // chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py:50:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 231 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe0 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 121 +.b8 50 +.b8 122 +.b8 51 +.b8 112 +.b8 110 +.b8 102 +.b8 120 +.b8 102 +.b8 98 +.b8 113 +.b8 117 +.b8 106 +.b8 103 +.b8 101 +.b8 120 +.b8 55 +.b8 119 +.b8 50 +.b8 116 +.b8 50 +.b8 114 +.b8 50 +.b8 109 +.b8 98 +.b8 55 +.b8 103 +.b8 107 +.b8 102 +.b8 53 +.b8 52 +.b8 98 +.b8 108 +.b8 101 +.b8 54 +.b8 114 +.b8 110 +.b8 116 +.b8 115 +.b8 121 +.b8 55 +.b8 97 +.b8 53 +.b8 50 +.b8 50 +.b8 117 +.b8 121 +.b8 108 +.b8 114 +.b8 117 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 121 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x31 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 109 +.b8 101 +.b8 97 +.b8 110 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 112 +.b8 111 +.b8 119 +.b8 95 +.b8 114 +.b8 115 +.b8 113 +.b8 114 +.b8 116 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xbc:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd1:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.source new file mode 100644 index 0000000000000000000000000000000000000000..18652bcac6023d2617bf8d127a7c1440f9b49a87 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.source @@ -0,0 +1,248 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":18:0) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc56 = loc(unknown) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc63 = loc("in_out_ptr0"(#loc)) +#loc64 = loc("in_ptr0"(#loc)) +#loc65 = loc("in_ptr1"(#loc)) +#loc66 = loc("in_ptr2"(#loc)) +#loc67 = loc("out_ptr0"(#loc)) +#loc68 = loc("ks0"(#loc)) +#loc69 = loc("xnumel"(#loc)) +#loc70 = loc("r0_numel"(#loc)) +#loc113 = loc("input"(#loc54)) +#loc114 = loc("a"(#loc59)) +#loc115 = loc("b"(#loc59)) +module { + tt.func public @triton_red_fused__to_copy_mean_mul_pow_rsqrt_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: f64 loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xoffset_0 = arith.constant 1 : i32 loc(#loc72) + %xoffset_1 = arith.constant 1 : i32 loc(#loc72) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc72) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc73) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc74) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc75) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc75) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc76) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc76) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc77) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc78) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc79) + %_tmp4_8 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc79) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc10) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc10) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc10) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc10) + %3 = ub.poison : i32 loc(#loc10) + %_tmp4_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_15 = %_tmp4_8) -> (tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc81) + %r0_index_16 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc81) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc82) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc82) + %tmp0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc83) + %tmp0_18 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc83) + %tmp0_19 = arith.muli %tmp0_18, %tmp0 : tensor<1x1xi64> loc(#loc83) + %tmp0_20 = arith.extsi %r0_index_16 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc84) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc84) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<1x2048xi64> loc(#loc84) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc85) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc85) + %tmp0_25 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc86) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc86) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc87) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc87) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc87) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc87) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc88) + %tmp2 = arith.mulf %tmp0_31, %tmp0_31 : tensor<1x2048xf32> loc(#loc89) + %tmp5 = arith.addf %_tmp4_15, %tmp2 : tensor<1x2048xf32> loc(#loc90) + %_tmp4_32 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc91) + %_tmp4_33 = arith.andi %r0_mask_17, %_tmp4_32 : tensor<1x2048xi1> loc(#loc91) + %_tmp4_34 = arith.select %_tmp4_33, %tmp5, %_tmp4_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc92) + scf.yield %_tmp4_34 : tensor<1x2048xf32> loc(#loc23) + } loc(#loc80) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc93) + %tmp4_10 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc94) + %tmp7 = arith.sitofp %ks0 : i64 to f32 loc(#loc95) + %tmp8 = tt.splat %tmp7 : f32 -> tensor<1x1xf32> loc(#loc96) + %tmp8_11 = arith.divf %tmp4_10, %tmp8 : tensor<1x1xf32> loc(#loc96) + %tmp10 = arith.truncf %in_ptr1 : f64 to f32 loc(#loc97) + %tmp11 = tt.splat %tmp10 : f32 -> tensor<1x1xf32> loc(#loc98) + %tmp11_12 = arith.addf %tmp8_11, %tmp11 : tensor<1x1xf32> loc(#loc98) + %tmp12 = tt.extern_elementwise %tmp11_12 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc99) + gpu.barrier loc(#loc31) + %4 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc32) + %5 = tt.addptr %4, %xindex_5 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc32) + tt.store %5, %tmp12, %xmask_6 : tensor<1x1x!tt.ptr> loc(#loc33) + %c0_i32_13 = arith.constant 0 : i32 loc(#loc34) + %c2048_i32_14 = arith.constant 2048 : i32 loc(#loc34) + %6 = arith.bitcast %c0_i32_13 : i32 to i32 loc(#loc34) + %7 = arith.bitcast %r0_numel : i32 to i32 loc(#loc34) + %8 = arith.bitcast %c2048_i32_14 : i32 to i32 loc(#loc34) + %9 = ub.poison : i32 loc(#loc34) + scf.for %r0_offset = %6 to %7 step %8 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc100) + %r0_index_15 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc100) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc101) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x2048xi32> loc(#loc101) + %tmp13 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc102) + %tmp13_17 = tt.addptr %tmp13, %r0_index_15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc102) + %tmp13_18 = arith.constant 0.000000e+00 : f32 loc(#loc103) + %tmp13_19 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc103) + %tmp13_20 = arith.truncf %tmp13_19 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc103) + %tmp13_21 = tt.load %tmp13_17, %r0_mask_16, %tmp13_20 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc103) + %tmp13_22 = arith.extf %tmp13_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc104) + %tmp14 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc105) + %tmp14_23 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc105) + %tmp14_24 = arith.muli %tmp14_23, %tmp14 : tensor<1x1xi64> loc(#loc105) + %tmp14_25 = arith.extsi %r0_index_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc106) + %tmp14_26 = tt.broadcast %tmp14_24 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc106) + %tmp14_27 = arith.addi %tmp14_25, %tmp14_26 : tensor<1x2048xi64> loc(#loc106) + %tmp14_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc107) + %tmp14_29 = tt.addptr %tmp14_28, %tmp14_27 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc107) + %tmp14_30 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc108) + %tmp14_31 = arith.andi %r0_mask_16, %tmp14_30 : tensor<1x2048xi1> loc(#loc108) + %tmp14_32 = arith.constant 0.000000e+00 : f32 loc(#loc109) + %tmp14_33 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc109) + %tmp14_34 = arith.truncf %tmp14_33 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc109) + %tmp14_35 = tt.load %tmp14_29, %tmp14_31, %tmp14_34 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp14_36 = arith.extf %tmp14_35 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110) + %tmp16 = tt.broadcast %tmp12 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc111) + %tmp16_37 = arith.mulf %tmp14_36, %tmp16 : tensor<1x2048xf32> loc(#loc111) + %tmp18 = arith.mulf %tmp13_22, %tmp16_37 : tensor<1x2048xf32> loc(#loc112) + %10 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc48) + %11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc48) + %12 = arith.muli %11, %10 : tensor<1x1xi64> loc(#loc48) + %13 = arith.extsi %r0_index_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc49) + %14 = tt.broadcast %12 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc49) + %15 = arith.addi %13, %14 : tensor<1x2048xi64> loc(#loc49) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc50) + %17 = tt.addptr %16, %15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc50) + %18 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc51) + %19 = arith.andi %r0_mask_16, %18 : tensor<1x2048xi1> loc(#loc51) + %20 = arith.truncf %tmp18 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc52) + tt.store %17, %20, %19 : tensor<1x2048x!tt.ptr> loc(#loc52) + } loc(#loc34) + tt.return loc(#loc53) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc54))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc55) + tt.reduce.return %2 : f32 loc(#loc55) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc55) + tt.return %0 : tensor<1xf32> loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc58) + tt.return %1 : tensor<1xf32> loc(#loc58) + } loc(#loc54) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc59)), %b: f32 loc("b"(#loc59))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc60) + tt.return %0 : f32 loc(#loc61) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc62) + tt.return %1 : f32 loc(#loc62) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":27:43) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":30:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:111) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":36:22) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":38:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:48) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:8) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:28) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":43:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":44:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":45:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":46:19) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":47:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":48:4) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:40) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:40) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":51:31) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":52:29) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:95) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:46) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:35) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:61) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:51) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:113) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":59:24) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":61:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:36) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:62) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:52) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:4) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc71 = loc("xoffset"(#loc1)) +#loc72 = loc("xoffset"(#loc2)) +#loc73 = loc("xindex"(#loc3)) +#loc74 = loc("xindex"(#loc4)) +#loc75 = loc("xindex"(#loc5)) +#loc76 = loc("xmask"(#loc6)) +#loc77 = loc("r0_base"(#loc7)) +#loc78 = loc("r0_base"(#loc8)) +#loc79 = loc("_tmp4"(#loc9)) +#loc80 = loc("_tmp4"(#loc10)) +#loc81 = loc("r0_index"(#loc11)) +#loc82 = loc("r0_mask"(#loc12)) +#loc83 = loc("tmp0"(#loc13)) +#loc84 = loc("tmp0"(#loc14)) +#loc85 = loc("tmp0"(#loc15)) +#loc86 = loc("tmp0"(#loc16)) +#loc87 = loc("tmp0"(#loc17)) +#loc88 = loc("tmp0"(#loc18)) +#loc89 = loc("tmp2"(#loc19)) +#loc90 = loc("tmp5"(#loc20)) +#loc91 = loc("_tmp4"(#loc21)) +#loc92 = loc("_tmp4"(#loc22)) +#loc93 = loc("tmp4"(#loc24)) +#loc94 = loc("tmp4"(#loc25)) +#loc95 = loc("tmp7"(#loc26)) +#loc96 = loc("tmp8"(#loc27)) +#loc97 = loc("tmp10"(#loc28)) +#loc98 = loc("tmp11"(#loc29)) +#loc99 = loc("tmp12"(#loc30)) +#loc100 = loc("r0_index"(#loc35)) +#loc101 = loc("r0_mask"(#loc36)) +#loc102 = loc("tmp13"(#loc37)) +#loc103 = loc("tmp13"(#loc38)) +#loc104 = loc("tmp13"(#loc39)) +#loc105 = loc("tmp14"(#loc40)) +#loc106 = loc("tmp14"(#loc41)) +#loc107 = loc("tmp14"(#loc42)) +#loc108 = loc("tmp14"(#loc43)) +#loc109 = loc("tmp14"(#loc44)) +#loc110 = loc("tmp14"(#loc45)) +#loc111 = loc("tmp16"(#loc46)) +#loc112 = loc("tmp18"(#loc47)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fe2ea8453a965342719a5017d56cbc4d3cc56892 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttgir @@ -0,0 +1,176 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:25) +#loc46 = loc("in_out_ptr0"(#loc)) +#loc47 = loc("in_ptr0"(#loc)) +#loc48 = loc("in_ptr1"(#loc)) +#loc49 = loc("in_ptr2"(#loc)) +#loc50 = loc("out_ptr0"(#loc)) +#loc51 = loc("ks0"(#loc)) +#loc52 = loc("xnumel"(#loc)) +#loc53 = loc("r0_numel"(#loc)) +#loc69 = loc("tmp4"(#loc19)) +#loc91 = loc(callsite(#loc1 at #loc69)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_mean_mul_pow_rsqrt_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: f64 loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc54) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc55) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc56) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc56) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32, #blocked> loc(#loc57) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc58) + %tmp0_2 = arith.muli %ks0, %tmp0 : i64 loc(#loc58) + %tmp0_3 = tt.splat %tmp0_2 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc88) + %tmp0_4 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc60) + %tmp0_5 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc89) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp4_16 = %cst) -> (tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc63) + %r0_index_17 = arith.addi %r0_index, %r0_base_1 : tensor<1x2048xi32, #blocked> loc(#loc63) + %r0_mask_18 = arith.cmpi slt, %r0_index_17, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc57) + %tmp0_19 = arith.extsi %r0_index_17 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc59) + %tmp0_20 = arith.addi %tmp0_19, %tmp0_3 : tensor<1x2048xi64, #blocked> loc(#loc59) + %tmp0_21 = tt.addptr %tmp0_4, %tmp0_20 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc60) + %tmp0_22 = arith.andi %r0_mask_18, %tmp0_5 : tensor<1x2048xi1, #blocked> loc(#loc61) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc64) + %tmp0_24 = arith.extf %tmp0_23 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc65) + %tmp2 = arith.mulf %tmp0_24, %tmp0_24 : tensor<1x2048xf32, #blocked> loc(#loc66) + %tmp5 = arith.addf %_tmp4_16, %tmp2 : tensor<1x2048xf32, #blocked> loc(#loc67) + %_tmp4_25 = arith.select %tmp0_22, %tmp5, %_tmp4_16 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc68) + scf.yield %_tmp4_25 : tensor<1x2048xf32, #blocked> loc(#loc17) + } loc(#loc62) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_16: f32 loc(callsite(#loc1 at #loc69)), %tmp4_17: f32 loc(callsite(#loc1 at #loc69))): + %tmp4_18 = arith.addf %tmp4_16, %tmp4_17 : f32 loc(#loc92) + tt.reduce.return %tmp4_18 : f32 loc(#loc90) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc90) + %tmp12 = ttg.convert_layout %tmp4 : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc70) + %tmp4_6 = tt.expand_dims %tmp12 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc71) + %tmp4_7 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc71) + %tmp7 = arith.sitofp %ks0 : i64 to f32 loc(#loc72) + %tmp8 = tt.splat %tmp7 : f32 -> tensor<1x1xf32, #blocked1> loc(#loc73) + %tmp8_8 = tt.splat %tmp7 : f32 -> tensor<1x1xf32, #blocked> loc(#loc73) + %tmp8_9 = arith.divf %tmp4_6, %tmp8 : tensor<1x1xf32, #blocked1> loc(#loc73) + %tmp8_10 = arith.divf %tmp4_7, %tmp8_8 : tensor<1x1xf32, #blocked> loc(#loc73) + %tmp10 = arith.truncf %in_ptr1 : f64 to f32 loc(#loc74) + %tmp11 = tt.splat %tmp10 : f32 -> tensor<1x1xf32, #blocked1> loc(#loc75) + %tmp11_11 = tt.splat %tmp10 : f32 -> tensor<1x1xf32, #blocked> loc(#loc75) + %tmp11_12 = arith.addf %tmp8_9, %tmp11 : tensor<1x1xf32, #blocked1> loc(#loc75) + %tmp11_13 = arith.addf %tmp8_10, %tmp11_11 : tensor<1x1xf32, #blocked> loc(#loc75) + %tmp12_14 = tt.extern_elementwise %tmp11_12 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked1>) -> tensor<1x1xf32, #blocked1> loc(#loc70) + %tmp12_15 = tt.extern_elementwise %tmp11_13 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> loc(#loc70) + gpu.barrier loc(#loc27) + %0 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc28) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc29) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc29) + tt.store %1, %tmp12_14, %2 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc29) + %tmp13 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc76) + %tmp16 = tt.broadcast %tmp12_15 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc77) + %3 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc32) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc78) + %r0_index_16 = arith.addi %r0_index, %r0_base_1 : tensor<1x2048xi32, #blocked> loc(#loc78) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc79) + %tmp13_18 = tt.addptr %tmp13, %r0_index_16 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc76) + %tmp13_19 = tt.load %tmp13_18, %r0_mask_17, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc80) + %tmp13_20 = arith.extf %tmp13_19 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc81) + %tmp14 = arith.extsi %r0_index_16 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp14_21 = arith.addi %tmp14, %tmp0_3 : tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp14_22 = tt.addptr %tmp0_4, %tmp14_21 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc83) + %tmp14_23 = arith.andi %r0_mask_17, %tmp0_5 : tensor<1x2048xi1, #blocked> loc(#loc84) + %tmp14_24 = tt.load %tmp14_22, %tmp14_23, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc85) + %tmp14_25 = arith.extf %tmp14_24 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc86) + %tmp16_26 = arith.mulf %tmp14_25, %tmp16 : tensor<1x2048xf32, #blocked> loc(#loc77) + %tmp18 = arith.mulf %tmp13_20, %tmp16_26 : tensor<1x2048xf32, #blocked> loc(#loc87) + %4 = tt.addptr %3, %tmp14_21 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc32) + %5 = arith.truncf %tmp18 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc44) + tt.store %4, %5, %tmp14_23 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc44) + } loc(#loc33) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":24:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":30:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:50) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:111) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":36:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":38:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":47:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":43:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":44:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":45:20) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":46:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":48:4) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:35) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":59:24) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:29) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:40) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":51:31) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":52:29) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:95) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:35) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:61) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:51) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:113) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":61:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:52) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:4) +#loc54 = loc("xoffset"(#loc2)) +#loc55 = loc("xmask"(#loc3)) +#loc56 = loc("r0_base"(#loc4)) +#loc57 = loc("r0_mask"(#loc5)) +#loc58 = loc("tmp0"(#loc6)) +#loc59 = loc("tmp0"(#loc7)) +#loc60 = loc("tmp0"(#loc8)) +#loc61 = loc("tmp0"(#loc9)) +#loc62 = loc("_tmp4"(#loc10)) +#loc63 = loc("r0_index"(#loc11)) +#loc64 = loc("tmp0"(#loc12)) +#loc65 = loc("tmp0"(#loc13)) +#loc66 = loc("tmp2"(#loc14)) +#loc67 = loc("tmp5"(#loc15)) +#loc68 = loc("_tmp4"(#loc16)) +#loc70 = loc("tmp12"(#loc21)) +#loc71 = loc("tmp4"(#loc22)) +#loc72 = loc("tmp7"(#loc23)) +#loc73 = loc("tmp8"(#loc24)) +#loc74 = loc("tmp10"(#loc25)) +#loc75 = loc("tmp11"(#loc26)) +#loc76 = loc("tmp13"(#loc30)) +#loc77 = loc("tmp16"(#loc31)) +#loc78 = loc("r0_index"(#loc34)) +#loc79 = loc("r0_mask"(#loc35)) +#loc80 = loc("tmp13"(#loc36)) +#loc81 = loc("tmp13"(#loc37)) +#loc82 = loc("tmp14"(#loc38)) +#loc83 = loc("tmp14"(#loc39)) +#loc84 = loc("tmp14"(#loc40)) +#loc85 = loc("tmp14"(#loc41)) +#loc86 = loc("tmp14"(#loc42)) +#loc87 = loc("tmp18"(#loc43)) +#loc88 = loc(fused[#loc59, #loc58]) +#loc89 = loc(fused[#loc61, #loc55]) +#loc90 = loc(callsite(#loc18 at #loc69)) +#loc92 = loc(callsite(#loc20 at #loc90)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e960da6bab3f9c96878df452611299f756c9a51a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/QI76RBIM6DVEKPUTZLIBNOLFJSQURSYGLVL4BYIYVUGG4ME65EQQ/triton_red_fused__to_copy_mean_mul_pow_rsqrt_0.ttir @@ -0,0 +1,179 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:25) +#loc48 = loc("in_out_ptr0"(#loc)) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("in_ptr1"(#loc)) +#loc51 = loc("in_ptr2"(#loc)) +#loc52 = loc("out_ptr0"(#loc)) +#loc53 = loc("ks0"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc20)) +#loc95 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused__to_copy_mean_mul_pow_rsqrt_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: f64 loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc57) + %xmask_1 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc57) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc58) + %r0_base_2 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc59) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp4_6 = %cst_0) -> (tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc61) + %r0_index_7 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc61) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc62) + %r0_mask_8 = arith.cmpi slt, %r0_index_7, %r0_mask : tensor<1x2048xi32> loc(#loc62) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc63) + %tmp0_9 = arith.muli %ks0, %tmp0 : i64 loc(#loc63) + %tmp0_10 = arith.extsi %r0_index_7 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc64) + %tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x2048xi64> loc(#loc92) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x2048xi64> loc(#loc64) + %tmp0_13 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc65) + %tmp0_14 = tt.addptr %tmp0_13, %tmp0_12 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc65) + %tmp0_15 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc93) + %tmp0_16 = arith.andi %r0_mask_8, %tmp0_15 : tensor<1x2048xi1> loc(#loc66) + %tmp0_17 = tt.load %tmp0_14, %tmp0_16, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc67) + %tmp0_18 = arith.extf %tmp0_17 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_18, %tmp0_18 : tensor<1x2048xf32> loc(#loc69) + %tmp5 = arith.addf %_tmp4_6, %tmp2 : tensor<1x2048xf32> loc(#loc70) + %_tmp4_19 = arith.select %tmp0_16, %tmp5, %_tmp4_6 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc71) + scf.yield %_tmp4_19 : tensor<1x2048xf32> loc(#loc18) + } loc(#loc60) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_6: f32 loc(callsite(#loc1 at #loc72)), %tmp4_7: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_8 = arith.addf %tmp4_6, %tmp4_7 : f32 loc(#loc98) + tt.reduce.return %tmp4_8 : f32 loc(#loc94) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc94) + %tmp4_3 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc73) + %tmp7 = arith.sitofp %ks0 : i64 to f32 loc(#loc74) + %tmp8 = tt.splat %tmp7 : f32 -> tensor<1x1xf32> loc(#loc75) + %tmp8_4 = arith.divf %tmp4_3, %tmp8 : tensor<1x1xf32> loc(#loc75) + %tmp10 = arith.truncf %in_ptr1 : f64 to f32 loc(#loc76) + %tmp11 = tt.splat %tmp10 : f32 -> tensor<1x1xf32> loc(#loc77) + %tmp11_5 = arith.addf %tmp8_4, %tmp11 : tensor<1x1xf32> loc(#loc77) + %tmp12 = tt.extern_elementwise %tmp11_5 {libname = "", libpath = "", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> loc(#loc78) + gpu.barrier loc(#loc28) + %0 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc29) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc29) + tt.store %1, %tmp12, %xmask_1 : tensor<1x1x!tt.ptr> loc(#loc30) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc79) + %r0_index_6 = arith.addi %r0_index, %r0_base_2 : tensor<1x2048xi32> loc(#loc79) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc80) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x2048xi32> loc(#loc80) + %tmp13 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc81) + %tmp13_8 = tt.addptr %tmp13, %r0_index_6 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc81) + %tmp13_9 = tt.load %tmp13_8, %r0_mask_7, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc82) + %tmp13_10 = arith.extf %tmp13_9 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc83) + %tmp14 = arith.extsi %xoffset : i32 to i64 loc(#loc84) + %tmp14_11 = arith.muli %ks0, %tmp14 : i64 loc(#loc84) + %tmp14_12 = arith.extsi %r0_index_6 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc85) + %tmp14_13 = tt.splat %tmp14_11 : i64 -> tensor<1x2048xi64> loc(#loc96) + %tmp14_14 = arith.addi %tmp14_12, %tmp14_13 : tensor<1x2048xi64> loc(#loc85) + %tmp14_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc86) + %tmp14_16 = tt.addptr %tmp14_15, %tmp14_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc86) + %tmp14_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc97) + %tmp14_18 = arith.andi %r0_mask_7, %tmp14_17 : tensor<1x2048xi1> loc(#loc87) + %tmp14_19 = tt.load %tmp14_16, %tmp14_18, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc88) + %tmp14_20 = arith.extf %tmp14_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc89) + %tmp16 = tt.broadcast %tmp12 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc90) + %tmp16_21 = arith.mulf %tmp14_20, %tmp16 : tensor<1x2048xf32> loc(#loc90) + %tmp18 = arith.mulf %tmp13_10, %tmp16_21 : tensor<1x2048xf32> loc(#loc91) + %2 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc45) + %3 = tt.addptr %2, %tmp14_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc45) + %4 = arith.truncf %tmp18 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc46) + tt.store %3, %4, %tmp14_18 : tensor<1x2048x!tt.ptr> loc(#loc46) + } loc(#loc31) + tt.return loc(#loc47) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":24:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":24:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":28:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":29:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":30:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":34:111) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":36:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":38:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:48) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":39:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":40:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":43:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":44:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":45:20) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":46:19) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":47:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":48:4) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":49:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":51:31) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":52:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:42) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":56:95) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:46) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:35) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:61) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:51) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":57:113) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":59:24) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":61:24) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":62:52) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hy/chy2z3pnfxfbqujgex7w2t2r2mb7gkf54ble6rntsy7a522uylru.py":50:4) +#loc56 = loc("xoffset"(#loc2)) +#loc57 = loc("xmask"(#loc3)) +#loc58 = loc("r0_base"(#loc4)) +#loc59 = loc("r0_base"(#loc5)) +#loc60 = loc("_tmp4"(#loc6)) +#loc61 = loc("r0_index"(#loc7)) +#loc62 = loc("r0_mask"(#loc8)) +#loc63 = loc("tmp0"(#loc9)) +#loc64 = loc("tmp0"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp0"(#loc13)) +#loc68 = loc("tmp0"(#loc14)) +#loc69 = loc("tmp2"(#loc15)) +#loc70 = loc("tmp5"(#loc16)) +#loc71 = loc("_tmp4"(#loc17)) +#loc73 = loc("tmp4"(#loc22)) +#loc74 = loc("tmp7"(#loc23)) +#loc75 = loc("tmp8"(#loc24)) +#loc76 = loc("tmp10"(#loc25)) +#loc77 = loc("tmp11"(#loc26)) +#loc78 = loc("tmp12"(#loc27)) +#loc79 = loc("r0_index"(#loc32)) +#loc80 = loc("r0_mask"(#loc33)) +#loc81 = loc("tmp13"(#loc34)) +#loc82 = loc("tmp13"(#loc35)) +#loc83 = loc("tmp13"(#loc36)) +#loc84 = loc("tmp14"(#loc37)) +#loc85 = loc("tmp14"(#loc38)) +#loc86 = loc("tmp14"(#loc39)) +#loc87 = loc("tmp14"(#loc40)) +#loc88 = loc("tmp14"(#loc41)) +#loc89 = loc("tmp14"(#loc42)) +#loc90 = loc("tmp16"(#loc43)) +#loc91 = loc("tmp18"(#loc44)) +#loc92 = loc(fused[#loc64, #loc63]) +#loc93 = loc(fused[#loc66, #loc57]) +#loc94 = loc(callsite(#loc19 at #loc72)) +#loc96 = loc(fused[#loc85, #loc84]) +#loc97 = loc(fused[#loc87, #loc57]) +#loc98 = loc(callsite(#loc21 at #loc94)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c00fa6457ef867dbffafdbc848337a3646e08f5d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1c33eb9005dc6bc3dcc347b09e5dd4174128fc8e Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..662886a077c39e0c593a49265f74272afae8679c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "8ac911daa2a533d701fdd1240df5c99a179eb64893d2fcd04f63beccf895f168", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c0b00bab5c1fb39793a0297f62178964dc3342c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.llir @@ -0,0 +1,215 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 504, !dbg !9 + %12 = lshr exact i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !11 + %15 = and i32 %14, 56, !dbg !11 + %16 = sdiv i32 %13, 2048, !dbg !12 + %17 = mul i32 %16, 2048, !dbg !13 + %.decomposed = sub i32 %13, %17, !dbg !13 + %18 = srem i32 %16, 32, !dbg !14 + %19 = sdiv i32 %13, 65536, !dbg !15 + %20 = shl nsw i32 %18, 7, !dbg !16 + %21 = shl nsw i32 %.decomposed, 12, !dbg !17 + %22 = shl i32 %19, 23, !dbg !18 + %23 = shl i32 %13, 7, !dbg !19 + %24 = add i32 %22, %21 + %25 = add i32 %24, %20 + %26 = zext nneg i32 %15 to i64, !dbg !20 + %27 = sext i32 %23 to i64, !dbg !20 + %28 = or disjoint i32 %25, %15, !dbg !21 + %29 = sext i32 %28 to i64, !dbg !22 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !22 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !23 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !23 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !23 + %35 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !23 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !23 + %37 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !23 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !23 + %39 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !23 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23 + %41 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !24 + %42 = getelementptr bfloat, ptr addrspace(1) %41, i64 %27, !dbg !24 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %42, i64 %43, i1 true) #4, !dbg !25 + %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !25 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !25 + %47 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !25 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !25 + %49 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !25 + %50 = bitcast i32 %49 to <2 x bfloat>, !dbg !25 + %51 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !25 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !25 + %53 = or disjoint i64 %26, 64, !dbg !26 + %54 = trunc nuw nsw i64 %53 to i32, !dbg !21 + %55 = or disjoint i32 %25, %54, !dbg !21 + %56 = sext i32 %55 to i64, !dbg !22 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !22 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #4, !dbg !23 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !23 + %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !23 + %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !23 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !23 + %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !23 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !23 + %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !23 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !23 + %68 = getelementptr bfloat, ptr addrspace(1) %1, i64 %53, !dbg !24 + %69 = getelementptr bfloat, ptr addrspace(1) %68, i64 %27, !dbg !24 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %69, i64 %70, i1 true) #4, !dbg !25 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !25 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !25 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !25 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !25 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !25 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !25 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !25 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !25 + %80 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !27 + %81 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !28 + %82 = fmul <2 x float> %80, %81, !dbg !29 + %83 = fadd <2 x float> %82, zeroinitializer, !dbg !30 + %84 = fpext <2 x bfloat> %61 to <2 x float>, !dbg !27 + %85 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !28 + %86 = fmul <2 x float> %84, %85, !dbg !29 + %87 = fadd <2 x float> %83, %86, !dbg !30 + %88 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !27 + %89 = fpext <2 x bfloat> %48 to <2 x float>, !dbg !28 + %90 = fmul <2 x float> %88, %89, !dbg !29 + %91 = fadd <2 x float> %90, zeroinitializer, !dbg !30 + %92 = fpext <2 x bfloat> %63 to <2 x float>, !dbg !27 + %93 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !28 + %94 = fmul <2 x float> %92, %93, !dbg !29 + %95 = fadd <2 x float> %91, %94, !dbg !30 + %96 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !27 + %97 = fpext <2 x bfloat> %50 to <2 x float>, !dbg !28 + %98 = fmul <2 x float> %96, %97, !dbg !29 + %99 = fadd <2 x float> %98, zeroinitializer, !dbg !30 + %100 = fpext <2 x bfloat> %65 to <2 x float>, !dbg !27 + %101 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !28 + %102 = fmul <2 x float> %100, %101, !dbg !29 + %103 = fadd <2 x float> %99, %102, !dbg !30 + %104 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !27 + %105 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !28 + %106 = fmul <2 x float> %104, %105, !dbg !29 + %107 = fadd <2 x float> %106, zeroinitializer, !dbg !30 + %108 = fpext <2 x bfloat> %67 to <2 x float>, !dbg !27 + %109 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !28 + %110 = fmul <2 x float> %108, %109, !dbg !29 + %111 = fadd <2 x float> %107, %110, !dbg !30 + %112 = and i32 %10, 63, !dbg !9 + %113 = or disjoint i32 %9, %112, !dbg !10 + %shift = shufflevector <2 x float> %87, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop = fadd <2 x float> %87, %shift, !dbg !31 + %foldExtExtBinop9 = fadd <2 x float> %95, %foldExtExtBinop, !dbg !31 + %shift11 = shufflevector <2 x float> %95, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !31 + %foldExtExtBinop14 = fadd <2 x float> %103, %foldExtExtBinop12, !dbg !31 + %shift16 = shufflevector <2 x float> %103, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !31 + %foldExtExtBinop19 = fadd <2 x float> %111, %foldExtExtBinop17, !dbg !31 + %shift21 = shufflevector <2 x float> %111, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !31 + %114 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !31 + %115 = bitcast float %114 to i32, !dbg !35 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 4, i32 31), !dbg !35 + %117 = bitcast i32 %116 to float, !dbg !35 + %118 = fadd float %114, %117, !dbg !31 + %119 = bitcast float %118 to i32, !dbg !35 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !35 + %121 = bitcast i32 %120 to float, !dbg !35 + %122 = fadd float %118, %121, !dbg !31 + %123 = bitcast float %122 to i32, !dbg !35 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !35 + %125 = bitcast i32 %124 to float, !dbg !35 + %126 = fadd float %122, %125, !dbg !31 + %127 = lshr exact i32 %11, 1, !dbg !36 + %128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %127, !dbg !36 + store float %126, ptr addrspace(3) %128, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %129 = shl nuw nsw i32 %112, 2, !dbg !36 + %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129, !dbg !36 + %131 = load i32, ptr addrspace(3) %130, align 4, !dbg !36 + %132 = sext i32 %113 to i64, !dbg !37 + %133 = getelementptr float, ptr addrspace(1) %2, i64 %132, !dbg !37 + %134 = and i32 %10, 448, !dbg !38 + %135 = icmp eq i32 %134, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %131, ptr addrspace(1) %133, i1 %135) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 34, scope: !4) +!25 = !DILocation(line: 40, column: 50, scope: !4) +!26 = !DILocation(line: 34, column: 31, scope: !4) +!27 = !DILocation(line: 39, column: 127, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = !DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f7ff410af3e9c8ac7e61ba0154f24f5ec806d772 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,511 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 512 +{ + .reg .pred %p<6>; + .reg .b16 %rs<33>; + .reg .b32 %r<131>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:28 + mov.u32 %r34, %ctaid.x; + .loc 1 23 33 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:23:33 + shl.b32 %r35, %r34, 6; + ld.param.b64 %rd16, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + mov.u32 %r36, %tid.x; + and.b32 %r37, %r36, 504; + bfe.u32 %r38, %r36, 3, 6; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r39, %r38, %r35; + .loc 1 26 37 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:26:37 + shl.b32 %r40, %r36, 3; + and.b32 %r41, %r40, 56; + .loc 1 29 21 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:21 + bfe.s32 %r42, %r34, 25, 1; + shr.u32 %r43, %r42, 21; + add.s32 %r44, %r39, %r43; + shr.s32 %r45, %r44, 11; + .loc 1 28 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:28:19 + and.b32 %r46, %r44, 1046528; + sub.s32 %r47, %r39, %r46; + .loc 1 29 29 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:29:29 + shr.u32 %r48, %r45, 27; + add.s32 %r49, %r45, %r48; + and.b32 %r50, %r49, 33554400; + sub.s32 %r51, %r45, %r50; + .loc 1 30 19 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:30:19 + shr.u32 %r52, %r42, 16; + add.s32 %r53, %r39, %r52; + .loc 1 39 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:45 + shl.b32 %r54, %r51, 7; + .loc 1 39 55 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:55 + shl.b32 %r55, %r47, 12; + .loc 1 39 68 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:68 + shl.b32 %r56, %r53, 7; + and.b32 %r57, %r56, -8388608; + .loc 1 40 45 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:45 + shl.b32 %r58, %r39, 7; + add.s32 %r59, %r57, %r55; + add.s32 %r60, %r59, %r54; + .loc 1 33 40 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:33:40 + cvt.u64.u32 %rd17, %r41; + .loc 1 39 60 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:60 + or.b32 %r61, %r60, %r41; + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + mad.wide.s32 %rd2, %r61, 2, %rd14; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:34 + mad.wide.u32 %rd18, %r41, 2, %rd15; + mad.wide.s32 %rd5, %r58, 2, %rd18; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:34 + cvt.s64.s32 %rd19, %r60; + or.b64 %rd20, %rd19, %rd17; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd14, %rd21; + add.s64 %rd8, %rd22, 128; + .loc 1 39 73 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:73 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 40 34 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:34 + add.s64 %rd11, %rd5, 128; + .loc 1 40 50 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:50 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r62, %rs2; + cvt.f32.bf16 %r63, %rs1; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs3, %rs4}, %r9; + cvt.f32.bf16 %r64, %rs4; + cvt.f32.bf16 %r65, %rs3; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r66, %r63, %r65, 0f00000000; + fma.rn.f32 %r67, %r62, %r64, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs5, %rs6}, %r17; + cvt.f32.bf16 %r68, %rs5; + cvt.f32.bf16 %r69, %rs6; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs7, %rs8}, %r25; + cvt.f32.bf16 %r70, %rs7; + cvt.f32.bf16 %r71, %rs8; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r72, %r69, %r71, %r67; + fma.rn.f32 %r73, %r68, %r70, %r66; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs9, %rs10}, %r2; + cvt.f32.bf16 %r74, %rs10; + cvt.f32.bf16 %r75, %rs9; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r76, %rs12; + cvt.f32.bf16 %r77, %rs11; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r78, %r75, %r77, 0f00000000; + fma.rn.f32 %r79, %r74, %r76, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs13, %rs14}, %r18; + cvt.f32.bf16 %r80, %rs13; + cvt.f32.bf16 %r81, %rs14; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r82, %rs15; + cvt.f32.bf16 %r83, %rs16; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs17, %rs18}, %r3; + cvt.f32.bf16 %r86, %rs18; + cvt.f32.bf16 %r87, %rs17; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r88, %rs20; + cvt.f32.bf16 %r89, %rs19; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r90, %r87, %r89, 0f00000000; + fma.rn.f32 %r91, %r86, %r88, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs21, %rs22}, %r19; + cvt.f32.bf16 %r92, %rs21; + cvt.f32.bf16 %r93, %rs22; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs23, %rs24}, %r27; + cvt.f32.bf16 %r94, %rs23; + cvt.f32.bf16 %r95, %rs24; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r96, %r93, %r95, %r91; + fma.rn.f32 %r97, %r92, %r94, %r90; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs25, %rs26}, %r4; + cvt.f32.bf16 %r98, %rs26; + cvt.f32.bf16 %r99, %rs25; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs27, %rs28}, %r12; + cvt.f32.bf16 %r100, %rs28; + cvt.f32.bf16 %r101, %rs27; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r102, %r99, %r101, 0f00000000; + fma.rn.f32 %r103, %r98, %r100, 0f00000000; + .loc 1 39 127 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:39:127 + mov.b32 {%rs29, %rs30}, %r20; + cvt.f32.bf16 %r104, %rs29; + cvt.f32.bf16 %r105, %rs30; + .loc 1 40 104 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:40:104 + mov.b32 {%rs31, %rs32}, %r28; + cvt.f32.bf16 %r106, %rs31; + cvt.f32.bf16 %r107, %rs32; + .loc 1 43 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:43:23 + fma.rn.f32 %r108, %r105, %r107, %r103; + fma.rn.f32 %r109, %r104, %r106, %r102; + .loc 1 24 44 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:44 + and.b32 %r110, %r36, 63; + .loc 1 24 23 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:24:23 + or.b32 %r111, %r35, %r110; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r112, %r73, %r72; + add.f32 %r113, %r85, %r112; + add.f32 %r114, %r84, %r113; + add.f32 %r115, %r97, %r114; + add.f32 %r116, %r96, %r115; + add.f32 %r117, %r109, %r116; + add.f32 %r118, %r108, %r117; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r119, %r118, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r120, %r118, %r119; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r121, %r120, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r122, %r120, %r121; + .loc 2 291 36 // standard.py:291:36 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:25 ] + add.f32 %r124, %r122, %r123; +$L__tmp2: + .loc 1 45 28 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:45:28 + shr.u32 %r125, %r37, 1; + mov.b32 %r126, global_smem; + add.s32 %r127, %r126, %r125; + st.shared.b32 [%r127], %r124; + bar.sync 0; + shl.b32 %r128, %r110, 2; + add.s32 %r129, %r126, %r128; + ld.shared.b32 %r33, [%r129]; + .loc 1 49 25 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:25 + mad.wide.s32 %rd13, %r111, 4, %rd16; + .loc 1 49 36 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:36 + and.b32 %r130, %r36, 448; + setp.eq.b32 %p5, %r130, 0; + // begin inline asm + @%p5 st.global.b32 [ %rd13 + 0 ], { %r33 }; + // end inline asm + .loc 1 49 4 // cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 111 +.b8 103 +.b8 111 +.b8 108 +.b8 53 +.b8 53 +.b8 99 +.b8 116 +.b8 104 +.b8 107 +.b8 52 +.b8 122 +.b8 101 +.b8 118 +.b8 115 +.b8 121 +.b8 51 +.b8 100 +.b8 108 +.b8 113 +.b8 105 +.b8 121 +.b8 122 +.b8 105 +.b8 112 +.b8 101 +.b8 102 +.b8 118 +.b8 55 +.b8 51 +.b8 53 +.b8 103 +.b8 101 +.b8 50 +.b8 119 +.b8 116 +.b8 97 +.b8 100 +.b8 100 +.b8 118 +.b8 107 +.b8 52 +.b8 51 +.b8 54 +.b8 113 +.b8 104 +.b8 116 +.b8 53 +.b8 110 +.b8 111 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 111 +.b8 103 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..73075f756b92d4b3ae3bcd5e0f9cf2719b2ce5e7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 131072 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc65) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c64_i32 = arith.constant 64 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x64xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x64xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x64xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x64xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x64xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x64xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x64xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x64xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2b6e0586b5a77d9d4973fc911f8cb4a0f225470d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,154 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg6 = %cst_7) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x64xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x64xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x64xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %arg6, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x64xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..52832d343547c0481fab97ff6d3da90b304f4c40 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RLERDWVCUUZ5OAP52ESA35OJTILZ5NSISPJPZUCPMO7MZ6EV6FUA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,148 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("in_ptr1"(#loc)) +#loc42 = loc("out_ptr1"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc75 = loc("tmp4"(#loc34)) +#loc78 = loc(callsite(#loc1 at #loc75)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc45) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc46) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc48) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc49) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc50) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc51) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc51) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc52) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc53) + %x1_10 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc54) + %x1_11 = arith.remsi %x1_10, %x1 : tensor<64x1xi32> loc(#loc46) + %x2_12 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_14 = %cst_4) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc56) + %r0_index_15 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc56) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst_3 : tensor<1x64xi32> loc(#loc57) + %tmp0 = arith.muli %x1_11, %cst_2 : tensor<64x1xi32> loc(#loc58) + %tmp0_16 = tt.broadcast %r0_index_15 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<64x64xi32> loc(#loc59) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc60) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc61) + %tmp0_21 = arith.addi %tmp0_18, %tmp0_20 : tensor<64x64xi32> loc(#loc61) + %tmp0_22 = arith.muli %x2_12, %cst_0 : tensor<64x1xi32> loc(#loc62) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<64x64xi32> loc(#loc63) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc64) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc64) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc65) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc65) + %tmp0_29 = arith.extf %tmp0_28 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc66) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc67) + %tmp1_30 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc68) + %tmp1_31 = arith.addi %tmp0_16, %tmp1_30 : tensor<64x64xi32> loc(#loc68) + %tmp1_32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc69) + %tmp1_33 = tt.addptr %tmp1_32, %tmp1_31 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc69) + %tmp1_34 = tt.load %tmp1_33, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc70) + %tmp1_35 = arith.extf %tmp1_34 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc71) + %tmp2 = arith.mulf %tmp0_29, %tmp1_35 : tensor<64x64xf32> loc(#loc72) + %tmp5 = arith.addf %_tmp4_14, %tmp2 : tensor<64x64xf32> loc(#loc73) + %_tmp4_36 = arith.select %tmp0_27, %tmp5, %_tmp4_14 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc74) + scf.yield %_tmp4_36 : tensor<64x64xf32> loc(#loc32) + } loc(#loc55) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_14: f32 loc(callsite(#loc1 at #loc75)), %tmp4_15: f32 loc(callsite(#loc1 at #loc75))): + %tmp4_16 = arith.addf %tmp4_14, %tmp4_15 : f32 loc(#loc79) + tt.reduce.return %tmp4_16 : f32 loc(#loc77) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc77) + %tmp4_13 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc76) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc37) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc37) + tt.store %1, %tmp4_13 : tensor<64x1x!tt.ptr> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":34:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":35:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:55) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:68) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":39:127) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:45) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:41) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:50) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":40:104) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":41:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":43:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":44:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":45:28) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/og/cogol55cthk4zevsy3dlqiyzipefv735ge2wtaddvk436qht5nox.py":49:4) +#loc45 = loc("x2"(#loc3)) +#loc46 = loc("x1"(#loc4)) +#loc47 = loc("xoffset"(#loc5)) +#loc48 = loc("xoffset"(#loc6)) +#loc49 = loc("xindex"(#loc7)) +#loc50 = loc("xindex"(#loc8)) +#loc51 = loc("xindex"(#loc9)) +#loc52 = loc("r0_base"(#loc10)) +#loc53 = loc("x0"(#loc11)) +#loc54 = loc("x1"(#loc12)) +#loc55 = loc("_tmp4"(#loc2)) +#loc56 = loc("r0_index"(#loc13)) +#loc57 = loc("r0_mask"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("tmp0"(#loc18)) +#loc62 = loc("tmp0"(#loc19)) +#loc63 = loc("tmp0"(#loc20)) +#loc64 = loc("tmp0"(#loc21)) +#loc65 = loc("tmp0"(#loc22)) +#loc66 = loc("tmp0"(#loc23)) +#loc67 = loc("tmp1"(#loc24)) +#loc68 = loc("tmp1"(#loc25)) +#loc69 = loc("tmp1"(#loc26)) +#loc70 = loc("tmp1"(#loc27)) +#loc71 = loc("tmp1"(#loc28)) +#loc72 = loc("tmp2"(#loc29)) +#loc73 = loc("tmp5"(#loc30)) +#loc74 = loc("_tmp4"(#loc31)) +#loc76 = loc("tmp4"(#loc36)) +#loc77 = loc(callsite(#loc33 at #loc75)) +#loc79 = loc(callsite(#loc35 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/__grp__triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/__grp__triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b6446abae360c5175caa9749e90365614c4fc14d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/__grp__triton_tem_fused_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..aa3e28416bf1fc91f111c1438025b07beb2c2064 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.json @@ -0,0 +1 @@ +{"hash": "8dd156316023205489e65663346e9227992331971feb7fac9c2418e56e4c905a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..e292669e18255c4651e1ecfd6d4bccdb357c8c24 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.llir @@ -0,0 +1,5464 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, i32 %11, i32 %12, ptr addrspace(1) readnone captures(none) %13, ptr addrspace(1) readnone captures(none) %14) local_unnamed_addr #0 !dbg !5 { + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %17 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %19 = and i32 %17, 1, !dbg !11 + %20 = shl i32 %17, 23, !dbg !12 + %21 = shl nuw nsw i32 %18, 7, !dbg !13 + %22 = or disjoint i32 %20, %21, !dbg !14 + %23 = shl nuw nsw i32 %19, 10, !dbg !15 + %24 = shl nuw nsw i32 %18, 5, !dbg !16 + %25 = and i32 %24, 2097024, !dbg !16 + %reass.add = add nuw nsw i32 %23, %25, !dbg !17 + %reass.mul = mul i32 %reass.add, %11, !dbg !17 + %26 = sext i32 %22 to i64, !dbg !18 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !18 + %28 = sext i32 %reass.mul to i64, !dbg !19 + %29 = getelementptr bfloat, ptr addrspace(1) %1, i64 %28, !dbg !19 + %30 = getelementptr bfloat, ptr addrspace(1) %2, i64 %28, !dbg !20 + %31 = shl nuw nsw i32 %19, 4, !dbg !21 + %32 = add nuw i32 %31, %16, !dbg !22 + %reass.mul389 = mul i32 %32, %12, !dbg !23 + %33 = shl i32 %16, 7, !dbg !24 + %34 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !25 + %35 = lshr i32 %34, 5, !dbg !25 + %36 = and i32 %34, 240, !dbg !25 + %37 = lshr exact i32 %36, 4, !dbg !25 + %38 = or disjoint i32 %37, 16, !dbg !25 + %39 = or disjoint i32 %37, 32, !dbg !25 + %40 = or disjoint i32 %37, 48, !dbg !25 + %41 = lshr i32 %34, 1, !dbg !25 + %42 = and i32 %41, 112, !dbg !25 + %43 = lshr i32 %34, 2, !dbg !25 + %44 = and i32 %43, 7, !dbg !25 + %45 = or disjoint i32 %42, %44, !dbg !25 + %46 = or disjoint i32 %37, %33, !dbg !26 + %47 = or disjoint i32 %38, %33, !dbg !26 + %48 = or disjoint i32 %39, %33, !dbg !26 + %49 = or disjoint i32 %40, %33, !dbg !26 + %50 = or disjoint i32 %46, 64, !dbg !26 + %51 = or disjoint i32 %46, 80, !dbg !26 + %52 = or disjoint i32 %46, 96, !dbg !26 + %53 = or disjoint i32 %46, 112, !dbg !26 + %54 = shl i32 %46, 12, !dbg !27 + %55 = shl i32 %47, 12, !dbg !27 + %56 = shl i32 %48, 12, !dbg !27 + %57 = shl i32 %49, 12, !dbg !27 + %58 = shl i32 %50, 12, !dbg !27 + %59 = shl i32 %51, 12, !dbg !27 + %60 = shl i32 %52, 12, !dbg !27 + %61 = shl i32 %53, 12, !dbg !27 + %62 = sext i32 %54 to i64, !dbg !30 + %63 = getelementptr bfloat, ptr addrspace(1) %27, i64 %62, !dbg !30 + %64 = sext i32 %55 to i64, !dbg !30 + %65 = getelementptr bfloat, ptr addrspace(1) %27, i64 %64, !dbg !30 + %66 = sext i32 %56 to i64, !dbg !30 + %67 = getelementptr bfloat, ptr addrspace(1) %27, i64 %66, !dbg !30 + %68 = sext i32 %57 to i64, !dbg !30 + %69 = getelementptr bfloat, ptr addrspace(1) %27, i64 %68, !dbg !30 + %70 = sext i32 %58 to i64, !dbg !30 + %71 = getelementptr bfloat, ptr addrspace(1) %27, i64 %70, !dbg !30 + %72 = sext i32 %59 to i64, !dbg !30 + %73 = getelementptr bfloat, ptr addrspace(1) %27, i64 %72, !dbg !30 + %74 = sext i32 %60 to i64, !dbg !30 + %75 = getelementptr bfloat, ptr addrspace(1) %27, i64 %74, !dbg !30 + %76 = sext i32 %61 to i64, !dbg !30 + %77 = getelementptr bfloat, ptr addrspace(1) %27, i64 %76, !dbg !30 + %78 = shl nuw nsw i32 %34, 3, !dbg !31 + %79 = and i32 %78, 120, !dbg !31 + %80 = zext nneg i32 %79 to i64, !dbg !32 + %81 = getelementptr bfloat, ptr addrspace(1) %63, i64 %80, !dbg !32 + %82 = getelementptr bfloat, ptr addrspace(1) %65, i64 %80, !dbg !32 + %83 = getelementptr bfloat, ptr addrspace(1) %67, i64 %80, !dbg !32 + %84 = getelementptr bfloat, ptr addrspace(1) %69, i64 %80, !dbg !32 + %85 = getelementptr bfloat, ptr addrspace(1) %71, i64 %80, !dbg !32 + %86 = getelementptr bfloat, ptr addrspace(1) %73, i64 %80, !dbg !32 + %87 = getelementptr bfloat, ptr addrspace(1) %75, i64 %80, !dbg !32 + %88 = getelementptr bfloat, ptr addrspace(1) %77, i64 %80, !dbg !32 + %89 = icmp slt i32 %46, 2048, !dbg !33 + %90 = icmp slt i32 %47, 2048, !dbg !33 + %91 = icmp slt i32 %48, 2048, !dbg !33 + %92 = icmp slt i32 %49, 2048, !dbg !33 + %93 = icmp slt i32 %50, 2048, !dbg !33 + %94 = icmp slt i32 %51, 2048, !dbg !33 + %95 = icmp slt i32 %52, 2048, !dbg !33 + %96 = icmp slt i32 %53, 2048, !dbg !33 + %97 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %81, i1 %89) #2, !dbg !34 + %98 = extractvalue { i32, i32, i32, i32 } %97, 0, !dbg !34 + %99 = extractvalue { i32, i32, i32, i32 } %97, 1, !dbg !34 + %100 = extractvalue { i32, i32, i32, i32 } %97, 2, !dbg !34 + %101 = extractvalue { i32, i32, i32, i32 } %97, 3, !dbg !34 + %102 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %82, i1 %90) #2, !dbg !34 + %103 = extractvalue { i32, i32, i32, i32 } %102, 0, !dbg !34 + %104 = extractvalue { i32, i32, i32, i32 } %102, 1, !dbg !34 + %105 = extractvalue { i32, i32, i32, i32 } %102, 2, !dbg !34 + %106 = extractvalue { i32, i32, i32, i32 } %102, 3, !dbg !34 + %107 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %83, i1 %91) #2, !dbg !34 + %108 = extractvalue { i32, i32, i32, i32 } %107, 0, !dbg !34 + %109 = extractvalue { i32, i32, i32, i32 } %107, 1, !dbg !34 + %110 = extractvalue { i32, i32, i32, i32 } %107, 2, !dbg !34 + %111 = extractvalue { i32, i32, i32, i32 } %107, 3, !dbg !34 + %112 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %84, i1 %92) #2, !dbg !34 + %113 = extractvalue { i32, i32, i32, i32 } %112, 0, !dbg !34 + %114 = extractvalue { i32, i32, i32, i32 } %112, 1, !dbg !34 + %115 = extractvalue { i32, i32, i32, i32 } %112, 2, !dbg !34 + %116 = extractvalue { i32, i32, i32, i32 } %112, 3, !dbg !34 + %117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %85, i1 %93) #2, !dbg !34 + %118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !34 + %119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !34 + %120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !34 + %121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !34 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %86, i1 %94) #2, !dbg !34 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !34 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !34 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !34 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !34 + %127 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %87, i1 %95) #2, !dbg !34 + %128 = extractvalue { i32, i32, i32, i32 } %127, 0, !dbg !34 + %129 = extractvalue { i32, i32, i32, i32 } %127, 1, !dbg !34 + %130 = extractvalue { i32, i32, i32, i32 } %127, 2, !dbg !34 + %131 = extractvalue { i32, i32, i32, i32 } %127, 3, !dbg !34 + %132 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %88, i1 %96) #2, !dbg !34 + %133 = extractvalue { i32, i32, i32, i32 } %132, 0, !dbg !34 + %134 = extractvalue { i32, i32, i32, i32 } %132, 1, !dbg !34 + %135 = extractvalue { i32, i32, i32, i32 } %132, 2, !dbg !34 + %136 = extractvalue { i32, i32, i32, i32 } %132, 3, !dbg !34 + %137 = and i32 %34, 7, !dbg !34 + %138 = shl nuw nsw i32 %137, 4, !dbg !34 + %139 = shl nuw nsw i32 %36, 3, !dbg !34 + %140 = and i32 %34, 112, !dbg !34 + %141 = and i32 %34, 8, !dbg !34 + %142 = shl nuw nsw i32 %141, 11, !dbg !34 + %143 = or disjoint i32 %138, %139, !dbg !34 + %144 = xor i32 %143, %140, !dbg !34 + %145 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %142, !dbg !34 + %146 = getelementptr inbounds nuw i8, ptr addrspace(3) %145, i32 %144, !dbg !34 + %147 = insertelement <4 x i32> poison, i32 %98, i64 0, !dbg !34 + %148 = insertelement <4 x i32> %147, i32 %99, i64 1, !dbg !34 + %149 = insertelement <4 x i32> %148, i32 %100, i64 2, !dbg !34 + %150 = insertelement <4 x i32> %149, i32 %101, i64 3, !dbg !34 + store <4 x i32> %150, ptr addrspace(3) %146, align 16, !dbg !34 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 2048, !dbg !34 + %152 = insertelement <4 x i32> poison, i32 %103, i64 0, !dbg !34 + %153 = insertelement <4 x i32> %152, i32 %104, i64 1, !dbg !34 + %154 = insertelement <4 x i32> %153, i32 %105, i64 2, !dbg !34 + %155 = insertelement <4 x i32> %154, i32 %106, i64 3, !dbg !34 + store <4 x i32> %155, ptr addrspace(3) %151, align 16, !dbg !34 + %156 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 4096, !dbg !34 + %157 = insertelement <4 x i32> poison, i32 %108, i64 0, !dbg !34 + %158 = insertelement <4 x i32> %157, i32 %109, i64 1, !dbg !34 + %159 = insertelement <4 x i32> %158, i32 %110, i64 2, !dbg !34 + %160 = insertelement <4 x i32> %159, i32 %111, i64 3, !dbg !34 + store <4 x i32> %160, ptr addrspace(3) %156, align 16, !dbg !34 + %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 6144, !dbg !34 + %162 = insertelement <4 x i32> poison, i32 %113, i64 0, !dbg !34 + %163 = insertelement <4 x i32> %162, i32 %114, i64 1, !dbg !34 + %164 = insertelement <4 x i32> %163, i32 %115, i64 2, !dbg !34 + %165 = insertelement <4 x i32> %164, i32 %116, i64 3, !dbg !34 + store <4 x i32> %165, ptr addrspace(3) %161, align 16, !dbg !34 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 8192, !dbg !34 + %167 = insertelement <4 x i32> poison, i32 %118, i64 0, !dbg !34 + %168 = insertelement <4 x i32> %167, i32 %119, i64 1, !dbg !34 + %169 = insertelement <4 x i32> %168, i32 %120, i64 2, !dbg !34 + %170 = insertelement <4 x i32> %169, i32 %121, i64 3, !dbg !34 + store <4 x i32> %170, ptr addrspace(3) %166, align 16, !dbg !34 + %171 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 10240, !dbg !34 + %172 = insertelement <4 x i32> poison, i32 %123, i64 0, !dbg !34 + %173 = insertelement <4 x i32> %172, i32 %124, i64 1, !dbg !34 + %174 = insertelement <4 x i32> %173, i32 %125, i64 2, !dbg !34 + %175 = insertelement <4 x i32> %174, i32 %126, i64 3, !dbg !34 + store <4 x i32> %175, ptr addrspace(3) %171, align 16, !dbg !34 + %176 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 12288, !dbg !34 + %177 = insertelement <4 x i32> poison, i32 %128, i64 0, !dbg !34 + %178 = insertelement <4 x i32> %177, i32 %129, i64 1, !dbg !34 + %179 = insertelement <4 x i32> %178, i32 %130, i64 2, !dbg !34 + %180 = insertelement <4 x i32> %179, i32 %131, i64 3, !dbg !34 + store <4 x i32> %180, ptr addrspace(3) %176, align 16, !dbg !34 + %181 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 14336, !dbg !34 + %182 = insertelement <4 x i32> poison, i32 %133, i64 0, !dbg !34 + %183 = insertelement <4 x i32> %182, i32 %134, i64 1, !dbg !34 + %184 = insertelement <4 x i32> %183, i32 %135, i64 2, !dbg !34 + %185 = insertelement <4 x i32> %184, i32 %136, i64 3, !dbg !34 + store <4 x i32> %185, ptr addrspace(3) %181, align 16, !dbg !34 + %186 = sext i32 %reass.mul389 to i64, !dbg !35 + %187 = getelementptr i32, ptr addrspace(1) %6, i64 %186, !dbg !35 + %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %187) #2, !dbg !36 + %189 = shl i32 %188, 7, !dbg !37 + %190 = sext i32 %32 to i64, !dbg !38 + %191 = getelementptr i32, ptr addrspace(1) %5, i64 %190, !dbg !38 + %192 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %191) #2, !dbg !39 + %193 = shl i32 %192, 1, !dbg !40 + %194 = add i32 %11, 63, !dbg !41 + %195 = sdiv i32 %194, 64, !dbg !45 + %196 = tail call i32 @llvm.smax.i32(i32 %195, i32 1), !dbg !46 + %197 = tail call i32 @llvm.smin.i32(i32 %193, i32 %196), !dbg !47 + %198 = and i32 %34, 3, !dbg !48 + %199 = shl nuw nsw i32 %198, 1, !dbg !48 + %200 = or disjoint i32 %199, 1, !dbg !48 + %201 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !48 + %202 = shufflevector <2 x i32> %201, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !48 + %203 = or disjoint <2 x i32> %202, , !dbg !48 + %204 = insertelement <4 x i32> poison, i32 %199, i64 0, !dbg !48 + %205 = shufflevector <4 x i32> %204, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !48 + %206 = or disjoint <4 x i32> %205, , !dbg !48 + %207 = insertelement <8 x i32> poison, i32 %199, i64 0, !dbg !48 + %208 = shufflevector <8 x i32> %207, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !48 + %209 = or disjoint <8 x i32> %208, , !dbg !48 + %210 = or disjoint i32 %45, %33, !dbg !26 + %211 = or disjoint i32 %210, 8, !dbg !26 + %212 = insertelement <2 x i32> poison, i32 %211, i64 0, !dbg !49 + %213 = insertelement <2 x i32> %212, i32 %210, i64 1, !dbg !49 + %214 = srem <2 x i32> %213, splat (i32 2048), !dbg !49 + %215 = shufflevector <2 x i32> %214, <2 x i32> poison, <32 x i32> , !dbg !49 + %216 = zext nneg i32 %17 to i64, !dbg !51 + %217 = getelementptr i64, ptr addrspace(1) %9, i64 %216, !dbg !51 + %218 = icmp sgt i32 %193, 0, !dbg !52 + %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %217, i1 %218) #2, !dbg !53 + %220 = extractelement <2 x i32> %214, i64 1, !dbg !54 + %221 = sext i32 %220 to i64, !dbg !54 + %222 = extractelement <2 x i32> %214, i64 0, !dbg !54 + %223 = sext i32 %222 to i64, !dbg !54 + %224 = icmp sgt i64 %219, %221, !dbg !55 + %225 = icmp sgt i64 %219, %223, !dbg !55 + %226 = or disjoint i32 %189, %37, !dbg !56 + %227 = or disjoint i32 %189, %38, !dbg !56 + %228 = or disjoint i32 %189, %39, !dbg !56 + %229 = or disjoint i32 %189, %40, !dbg !56 + %230 = shl i32 %226, 7, !dbg !57 + %231 = shl i32 %227, 7, !dbg !57 + %232 = shl i32 %228, 7, !dbg !57 + %233 = shl i32 %229, 7, !dbg !57 + %234 = sext i32 %230 to i64, !dbg !58 + %235 = getelementptr bfloat, ptr addrspace(1) %29, i64 %234, !dbg !58 + %236 = sext i32 %231 to i64, !dbg !58 + %237 = getelementptr bfloat, ptr addrspace(1) %29, i64 %236, !dbg !58 + %238 = sext i32 %232 to i64, !dbg !58 + %239 = getelementptr bfloat, ptr addrspace(1) %29, i64 %238, !dbg !58 + %240 = sext i32 %233 to i64, !dbg !58 + %241 = getelementptr bfloat, ptr addrspace(1) %29, i64 %240, !dbg !58 + %242 = getelementptr bfloat, ptr addrspace(1) %235, i64 %80, !dbg !59 + %243 = getelementptr bfloat, ptr addrspace(1) %237, i64 %80, !dbg !59 + %244 = getelementptr bfloat, ptr addrspace(1) %239, i64 %80, !dbg !59 + %245 = getelementptr bfloat, ptr addrspace(1) %241, i64 %80, !dbg !59 + %246 = icmp slt i32 %226, %11, !dbg !60 + %247 = icmp slt i32 %227, %11, !dbg !60 + %248 = icmp slt i32 %228, %11, !dbg !60 + %249 = icmp slt i32 %229, %11, !dbg !60 + %250 = and i1 %218, %246, !dbg !52 + %251 = and i1 %218, %247, !dbg !52 + %252 = and i1 %218, %248, !dbg !52 + %253 = and i1 %218, %249, !dbg !52 + %254 = shl nuw nsw i32 %141, 10, !dbg !61 + %255 = or disjoint i32 %144, %254, !dbg !61 + %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !61 + %257 = select i1 %250, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %256, ptr addrspace(1) %242, i32 %257) #2, !dbg !61 + %258 = or disjoint i32 %255, 2048, !dbg !61 + %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !61 + %260 = select i1 %251, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %259, ptr addrspace(1) %243, i32 %260) #2, !dbg !61 + %261 = or disjoint i32 %255, 4096, !dbg !61 + %262 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %261, !dbg !61 + %263 = select i1 %252, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %262, ptr addrspace(1) %244, i32 %263) #2, !dbg !61 + %264 = or disjoint i32 %255, 6144, !dbg !61 + %265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %264, !dbg !61 + %266 = select i1 %253, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %245, i32 %266) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %267 = getelementptr bfloat, ptr addrspace(1) %30, i64 %234, !dbg !58 + %268 = getelementptr bfloat, ptr addrspace(1) %30, i64 %236, !dbg !58 + %269 = getelementptr bfloat, ptr addrspace(1) %30, i64 %238, !dbg !58 + %270 = getelementptr bfloat, ptr addrspace(1) %30, i64 %240, !dbg !58 + %271 = getelementptr bfloat, ptr addrspace(1) %267, i64 %80, !dbg !59 + %272 = getelementptr bfloat, ptr addrspace(1) %268, i64 %80, !dbg !59 + %273 = getelementptr bfloat, ptr addrspace(1) %269, i64 %80, !dbg !59 + %274 = getelementptr bfloat, ptr addrspace(1) %270, i64 %80, !dbg !59 + %275 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %275, ptr addrspace(1) %271, i32 %257) #2, !dbg !61 + %276 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %276, ptr addrspace(1) %272, i32 %260) #2, !dbg !61 + %277 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %273, i32 %263) #2, !dbg !61 + %278 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %274, i32 %266) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %279 = icmp sgt i32 %197, 1, !dbg !52 + %280 = or disjoint i32 %189, 64, !dbg !62 + %281 = or disjoint i32 %280, %37, !dbg !56 + %282 = or disjoint i32 %280, %38, !dbg !56 + %283 = or disjoint i32 %280, %39, !dbg !56 + %284 = or disjoint i32 %280, %40, !dbg !56 + %285 = shl i32 %281, 7, !dbg !57 + %286 = shl i32 %282, 7, !dbg !57 + %287 = shl i32 %283, 7, !dbg !57 + %288 = shl i32 %284, 7, !dbg !57 + %289 = sext i32 %285 to i64, !dbg !58 + %290 = getelementptr bfloat, ptr addrspace(1) %29, i64 %289, !dbg !58 + %291 = sext i32 %286 to i64, !dbg !58 + %292 = getelementptr bfloat, ptr addrspace(1) %29, i64 %291, !dbg !58 + %293 = sext i32 %287 to i64, !dbg !58 + %294 = getelementptr bfloat, ptr addrspace(1) %29, i64 %293, !dbg !58 + %295 = sext i32 %288 to i64, !dbg !58 + %296 = getelementptr bfloat, ptr addrspace(1) %29, i64 %295, !dbg !58 + %297 = getelementptr bfloat, ptr addrspace(1) %290, i64 %80, !dbg !59 + %298 = getelementptr bfloat, ptr addrspace(1) %292, i64 %80, !dbg !59 + %299 = getelementptr bfloat, ptr addrspace(1) %294, i64 %80, !dbg !59 + %300 = getelementptr bfloat, ptr addrspace(1) %296, i64 %80, !dbg !59 + %301 = icmp slt i32 %281, %11, !dbg !60 + %302 = icmp slt i32 %282, %11, !dbg !60 + %303 = icmp slt i32 %283, %11, !dbg !60 + %304 = icmp slt i32 %284, %11, !dbg !60 + %305 = and i1 %279, %301, !dbg !52 + %306 = and i1 %279, %302, !dbg !52 + %307 = and i1 %279, %303, !dbg !52 + %308 = and i1 %279, %304, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %309 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %255, !dbg !61 + %310 = select i1 %305, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %309, ptr addrspace(1) %297, i32 %310) #2, !dbg !61 + %311 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %258, !dbg !61 + %312 = select i1 %306, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %311, ptr addrspace(1) %298, i32 %312) #2, !dbg !61 + %313 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %261, !dbg !61 + %314 = select i1 %307, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %313, ptr addrspace(1) %299, i32 %314) #2, !dbg !61 + %315 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %264, !dbg !61 + %316 = select i1 %308, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %315, ptr addrspace(1) %300, i32 %316) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %317 = getelementptr bfloat, ptr addrspace(1) %30, i64 %289, !dbg !58 + %318 = getelementptr bfloat, ptr addrspace(1) %30, i64 %291, !dbg !58 + %319 = getelementptr bfloat, ptr addrspace(1) %30, i64 %293, !dbg !58 + %320 = getelementptr bfloat, ptr addrspace(1) %30, i64 %295, !dbg !58 + %321 = getelementptr bfloat, ptr addrspace(1) %317, i64 %80, !dbg !59 + %322 = getelementptr bfloat, ptr addrspace(1) %318, i64 %80, !dbg !59 + %323 = getelementptr bfloat, ptr addrspace(1) %319, i64 %80, !dbg !59 + %324 = getelementptr bfloat, ptr addrspace(1) %320, i64 %80, !dbg !59 + %325 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %325, ptr addrspace(1) %321, i32 %310) #2, !dbg !61 + %326 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %326, ptr addrspace(1) %322, i32 %312) #2, !dbg !61 + %327 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %327, ptr addrspace(1) %323, i32 %314) #2, !dbg !61 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %328, ptr addrspace(1) %324, i32 %316) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !63 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %29, i64 %80, !dbg !52 + %invariant.gep401 = getelementptr bfloat, ptr addrspace(1) %30, i64 %80, !dbg !52 + br i1 %218, label %.lr.ph, label %._crit_edge, !dbg !52 + +.lr.ph: ; preds = %15 + %329 = insertelement <16 x i32> poison, i32 %189, i64 0, !dbg !64 + %330 = shufflevector <16 x i32> %329, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !64 + %331 = insertelement <16 x i32> poison, i32 %200, i64 14, !dbg !64 + %332 = insertelement <16 x i32> %331, i32 %199, i64 15, !dbg !64 + %333 = shufflevector <8 x i32> %209, <8 x i32> poison, <16 x i32> , !dbg !64 + %334 = shufflevector <16 x i32> %333, <16 x i32> %332, <16 x i32> , !dbg !64 + %335 = shufflevector <4 x i32> %206, <4 x i32> poison, <16 x i32> , !dbg !64 + %336 = shufflevector <16 x i32> %334, <16 x i32> %335, <16 x i32> , !dbg !64 + %337 = shufflevector <2 x i32> %203, <2 x i32> poison, <16 x i32> , !dbg !64 + %338 = shufflevector <16 x i32> %336, <16 x i32> %337, <16 x i32> , !dbg !64 + %339 = or disjoint <16 x i32> %330, %338, !dbg !64 + %340 = add nsw i32 %197, -2 + %341 = add nsw i32 %197, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %197, i32 1), !dbg !52 + %342 = insertelement <16 x i32> poison, i32 %11, i64 0, !dbg !49 + %343 = shufflevector <16 x i32> %342, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !49 + %344 = insertelement <16 x i64> poison, i64 %219, i64 0, !dbg !65 + %345 = shufflevector <16 x i64> %344, <16 x i64> poison, <16 x i32> zeroinitializer, !dbg !65 + br label %346, !dbg !52 + +346: ; preds = %.lr.ph, %__nv_exp2f.exit355 + %347 = phi i32 [ 64, %.lr.ph ], [ %2006, %__nv_exp2f.exit355 ] + %348 = phi i32 [ -1, %.lr.ph ], [ %424, %__nv_exp2f.exit355 ] + %349 = phi i32 [ 1, %.lr.ph ], [ %2010, %__nv_exp2f.exit355 ] + %350 = phi i32 [ 64, %.lr.ph ], [ %2007, %__nv_exp2f.exit355 ] + %351 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %__nv_exp2f.exit355 ] + %352 = phi float [ 0.000000e+00, %.lr.ph ], [ %1574, %__nv_exp2f.exit355 ] + %353 = phi float [ 0.000000e+00, %.lr.ph ], [ %1920, %__nv_exp2f.exit355 ] + %354 = phi float [ 0.000000e+00, %.lr.ph ], [ %1921, %__nv_exp2f.exit355 ] + %355 = phi float [ 0.000000e+00, %.lr.ph ], [ %1922, %__nv_exp2f.exit355 ] + %356 = phi float [ 0.000000e+00, %.lr.ph ], [ %1923, %__nv_exp2f.exit355 ] + %357 = phi float [ 0.000000e+00, %.lr.ph ], [ %1924, %__nv_exp2f.exit355 ] + %358 = phi float [ 0.000000e+00, %.lr.ph ], [ %1925, %__nv_exp2f.exit355 ] + %359 = phi float [ 0.000000e+00, %.lr.ph ], [ %1926, %__nv_exp2f.exit355 ] + %360 = phi float [ 0.000000e+00, %.lr.ph ], [ %1927, %__nv_exp2f.exit355 ] + %361 = phi float [ 0.000000e+00, %.lr.ph ], [ %1928, %__nv_exp2f.exit355 ] + %362 = phi float [ 0.000000e+00, %.lr.ph ], [ %1929, %__nv_exp2f.exit355 ] + %363 = phi float [ 0.000000e+00, %.lr.ph ], [ %1930, %__nv_exp2f.exit355 ] + %364 = phi float [ 0.000000e+00, %.lr.ph ], [ %1931, %__nv_exp2f.exit355 ] + %365 = phi float [ 0.000000e+00, %.lr.ph ], [ %1932, %__nv_exp2f.exit355 ] + %366 = phi float [ 0.000000e+00, %.lr.ph ], [ %1933, %__nv_exp2f.exit355 ] + %367 = phi float [ 0.000000e+00, %.lr.ph ], [ %1934, %__nv_exp2f.exit355 ] + %368 = phi float [ 0.000000e+00, %.lr.ph ], [ %1935, %__nv_exp2f.exit355 ] + %369 = phi float [ 0.000000e+00, %.lr.ph ], [ %1936, %__nv_exp2f.exit355 ] + %370 = phi float [ 0.000000e+00, %.lr.ph ], [ %1937, %__nv_exp2f.exit355 ] + %371 = phi float [ 0.000000e+00, %.lr.ph ], [ %1938, %__nv_exp2f.exit355 ] + %372 = phi float [ 0.000000e+00, %.lr.ph ], [ %1939, %__nv_exp2f.exit355 ] + %373 = phi float [ 0.000000e+00, %.lr.ph ], [ %1940, %__nv_exp2f.exit355 ] + %374 = phi float [ 0.000000e+00, %.lr.ph ], [ %1941, %__nv_exp2f.exit355 ] + %375 = phi float [ 0.000000e+00, %.lr.ph ], [ %1942, %__nv_exp2f.exit355 ] + %376 = phi float [ 0.000000e+00, %.lr.ph ], [ %1943, %__nv_exp2f.exit355 ] + %377 = phi float [ 0.000000e+00, %.lr.ph ], [ %1944, %__nv_exp2f.exit355 ] + %378 = phi float [ 0.000000e+00, %.lr.ph ], [ %1945, %__nv_exp2f.exit355 ] + %379 = phi float [ 0.000000e+00, %.lr.ph ], [ %1946, %__nv_exp2f.exit355 ] + %380 = phi float [ 0.000000e+00, %.lr.ph ], [ %1947, %__nv_exp2f.exit355 ] + %381 = phi float [ 0.000000e+00, %.lr.ph ], [ %1948, %__nv_exp2f.exit355 ] + %382 = phi float [ 0.000000e+00, %.lr.ph ], [ %1949, %__nv_exp2f.exit355 ] + %383 = phi float [ 0.000000e+00, %.lr.ph ], [ %1950, %__nv_exp2f.exit355 ] + %384 = phi float [ 0.000000e+00, %.lr.ph ], [ %1951, %__nv_exp2f.exit355 ] + %385 = phi float [ 0.000000e+00, %.lr.ph ], [ %1952, %__nv_exp2f.exit355 ] + %386 = phi float [ 0.000000e+00, %.lr.ph ], [ %1953, %__nv_exp2f.exit355 ] + %387 = phi float [ 0.000000e+00, %.lr.ph ], [ %1954, %__nv_exp2f.exit355 ] + %388 = phi float [ 0.000000e+00, %.lr.ph ], [ %1955, %__nv_exp2f.exit355 ] + %389 = phi float [ 0.000000e+00, %.lr.ph ], [ %1956, %__nv_exp2f.exit355 ] + %390 = phi float [ 0.000000e+00, %.lr.ph ], [ %1957, %__nv_exp2f.exit355 ] + %391 = phi float [ 0.000000e+00, %.lr.ph ], [ %1958, %__nv_exp2f.exit355 ] + %392 = phi float [ 0.000000e+00, %.lr.ph ], [ %1959, %__nv_exp2f.exit355 ] + %393 = phi float [ 0.000000e+00, %.lr.ph ], [ %1960, %__nv_exp2f.exit355 ] + %394 = phi float [ 0.000000e+00, %.lr.ph ], [ %1961, %__nv_exp2f.exit355 ] + %395 = phi float [ 0.000000e+00, %.lr.ph ], [ %1962, %__nv_exp2f.exit355 ] + %396 = phi float [ 0.000000e+00, %.lr.ph ], [ %1963, %__nv_exp2f.exit355 ] + %397 = phi float [ 0.000000e+00, %.lr.ph ], [ %1964, %__nv_exp2f.exit355 ] + %398 = phi float [ 0.000000e+00, %.lr.ph ], [ %1965, %__nv_exp2f.exit355 ] + %399 = phi float [ 0.000000e+00, %.lr.ph ], [ %1966, %__nv_exp2f.exit355 ] + %400 = phi float [ 0.000000e+00, %.lr.ph ], [ %1967, %__nv_exp2f.exit355 ] + %401 = phi float [ 0.000000e+00, %.lr.ph ], [ %1968, %__nv_exp2f.exit355 ] + %402 = phi float [ 0.000000e+00, %.lr.ph ], [ %1969, %__nv_exp2f.exit355 ] + %403 = phi float [ 0.000000e+00, %.lr.ph ], [ %1970, %__nv_exp2f.exit355 ] + %404 = phi float [ 0.000000e+00, %.lr.ph ], [ %1971, %__nv_exp2f.exit355 ] + %405 = phi float [ 0.000000e+00, %.lr.ph ], [ %1972, %__nv_exp2f.exit355 ] + %406 = phi float [ 0.000000e+00, %.lr.ph ], [ %1973, %__nv_exp2f.exit355 ] + %407 = phi float [ 0.000000e+00, %.lr.ph ], [ %1974, %__nv_exp2f.exit355 ] + %408 = phi float [ 0.000000e+00, %.lr.ph ], [ %1975, %__nv_exp2f.exit355 ] + %409 = phi float [ 0.000000e+00, %.lr.ph ], [ %1976, %__nv_exp2f.exit355 ] + %410 = phi float [ 0.000000e+00, %.lr.ph ], [ %1977, %__nv_exp2f.exit355 ] + %411 = phi float [ 0.000000e+00, %.lr.ph ], [ %1978, %__nv_exp2f.exit355 ] + %412 = phi float [ 0.000000e+00, %.lr.ph ], [ %1979, %__nv_exp2f.exit355 ] + %413 = phi float [ 0.000000e+00, %.lr.ph ], [ %1980, %__nv_exp2f.exit355 ] + %414 = phi float [ 0.000000e+00, %.lr.ph ], [ %1981, %__nv_exp2f.exit355 ] + %415 = phi float [ 0.000000e+00, %.lr.ph ], [ %1982, %__nv_exp2f.exit355 ] + %416 = phi float [ 0.000000e+00, %.lr.ph ], [ %1983, %__nv_exp2f.exit355 ] + %417 = phi i32 [ 0, %.lr.ph ], [ %1987, %__nv_exp2f.exit355 ] + %418 = phi <2 x float> [ splat (float 0xFFF0000000000000), %.lr.ph ], [ %1312, %__nv_exp2f.exit355 ] + %419 = phi <16 x i32> [ %339, %.lr.ph ], [ %1986, %__nv_exp2f.exit355 ] + %420 = icmp slt i32 %417, %340, !dbg !52 + %421 = icmp slt i32 %417, %341, !dbg !52 + %422 = add i32 %348, 1, !dbg !52 + %423 = icmp sgt i32 %422, 2, !dbg !52 + %424 = select i1 %423, i32 0, i32 %422, !dbg !52 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %425 = shl i32 %424, 13, !dbg !61 + %426 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %425, !dbg !61 + %427 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %35, i32 0, i32 31), !dbg !63 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !63 + %428 = shl i32 %427, 11, !dbg !63 + %429 = and i32 %428, 8192, !dbg !63 + %430 = add i32 %429, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !63 + %431 = lshr exact i32 %430, 4, !dbg !63 + %432 = and i32 %431, 16383, !dbg !63 + %433 = zext nneg i32 %432 to i64, !dbg !63 + %434 = or disjoint i64 %433, 4611686293372403712, !dbg !63 + %435 = ptrtoint ptr addrspace(3) %426 to i32, !dbg !63 + %436 = lshr exact i32 %435, 4, !dbg !63 + %437 = and i32 %436, 16383, !dbg !63 + %438 = zext nneg i32 %437 to i64, !dbg !63 + %439 = or disjoint i64 %438, 4611686293338849280, !dbg !63 + %440 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %434, i64 %439) #2, !dbg !63 + %441 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !63 + %442 = lshr exact i32 %441, 4, !dbg !63 + %443 = and i32 %442, 16383, !dbg !63 + %444 = zext nneg i32 %443 to i64, !dbg !63 + %445 = or disjoint i64 %444, 4611686293372403712, !dbg !63 + %446 = add i32 %435, 32, !dbg !63 + %447 = lshr exact i32 %446, 4, !dbg !63 + %448 = and i32 %447, 16383, !dbg !63 + %449 = zext nneg i32 %448 to i64, !dbg !63 + %450 = or disjoint i64 %449, 4611686293338849280, !dbg !63 + %451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 0, !dbg !63 + %452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 1, !dbg !63 + %453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 2, !dbg !63 + %454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 3, !dbg !63 + %455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 4, !dbg !63 + %456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 5, !dbg !63 + %457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 6, !dbg !63 + %458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 7, !dbg !63 + %459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 8, !dbg !63 + %460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 9, !dbg !63 + %461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 10, !dbg !63 + %462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 11, !dbg !63 + %463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 12, !dbg !63 + %464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 13, !dbg !63 + %465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 14, !dbg !63 + %466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 15, !dbg !63 + %467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 16, !dbg !63 + %468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 17, !dbg !63 + %469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 18, !dbg !63 + %470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 19, !dbg !63 + %471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 20, !dbg !63 + %472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 21, !dbg !63 + %473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 22, !dbg !63 + %474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 23, !dbg !63 + %475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 24, !dbg !63 + %476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 25, !dbg !63 + %477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 26, !dbg !63 + %478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 27, !dbg !63 + %479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 28, !dbg !63 + %480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 29, !dbg !63 + %481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 30, !dbg !63 + %482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 31, !dbg !63 + %483 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %451, float %452, float %453, float %454, float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, float %482, i64 %445, i64 %450, i1 true) #2, !dbg !63 + %484 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !63 + %485 = lshr exact i32 %484, 4, !dbg !63 + %486 = and i32 %485, 16383, !dbg !63 + %487 = zext nneg i32 %486 to i64, !dbg !63 + %488 = or disjoint i64 %487, 4611686293372403712, !dbg !63 + %489 = add i32 %435, 64, !dbg !63 + %490 = lshr exact i32 %489, 4, !dbg !63 + %491 = and i32 %490, 16383, !dbg !63 + %492 = zext nneg i32 %491 to i64, !dbg !63 + %493 = or disjoint i64 %492, 4611686293338849280, !dbg !63 + %494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 0, !dbg !63 + %495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 1, !dbg !63 + %496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 2, !dbg !63 + %497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 3, !dbg !63 + %498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 4, !dbg !63 + %499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 5, !dbg !63 + %500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 6, !dbg !63 + %501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 7, !dbg !63 + %502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 8, !dbg !63 + %503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 9, !dbg !63 + %504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 10, !dbg !63 + %505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 11, !dbg !63 + %506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 12, !dbg !63 + %507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 13, !dbg !63 + %508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 14, !dbg !63 + %509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 15, !dbg !63 + %510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 16, !dbg !63 + %511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 17, !dbg !63 + %512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 18, !dbg !63 + %513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 19, !dbg !63 + %514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 20, !dbg !63 + %515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 21, !dbg !63 + %516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 22, !dbg !63 + %517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 23, !dbg !63 + %518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 24, !dbg !63 + %519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 25, !dbg !63 + %520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 26, !dbg !63 + %521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 27, !dbg !63 + %522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 28, !dbg !63 + %523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 29, !dbg !63 + %524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 30, !dbg !63 + %525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 31, !dbg !63 + %526 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, float %525, i64 %488, i64 %493, i1 true) #2, !dbg !63 + %527 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !63 + %528 = lshr exact i32 %527, 4, !dbg !63 + %529 = and i32 %528, 16383, !dbg !63 + %530 = zext nneg i32 %529 to i64, !dbg !63 + %531 = or disjoint i64 %530, 4611686293372403712, !dbg !63 + %532 = add i32 %435, 96, !dbg !63 + %533 = lshr exact i32 %532, 4, !dbg !63 + %534 = and i32 %533, 16383, !dbg !63 + %535 = zext nneg i32 %534 to i64, !dbg !63 + %536 = or disjoint i64 %535, 4611686293338849280, !dbg !63 + %537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 0, !dbg !63 + %538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 1, !dbg !63 + %539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 2, !dbg !63 + %540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 3, !dbg !63 + %541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 4, !dbg !63 + %542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 5, !dbg !63 + %543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 6, !dbg !63 + %544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 7, !dbg !63 + %545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 8, !dbg !63 + %546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 9, !dbg !63 + %547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 10, !dbg !63 + %548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 11, !dbg !63 + %549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 12, !dbg !63 + %550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 13, !dbg !63 + %551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 14, !dbg !63 + %552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 15, !dbg !63 + %553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 16, !dbg !63 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 17, !dbg !63 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 18, !dbg !63 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 19, !dbg !63 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 20, !dbg !63 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 21, !dbg !63 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 22, !dbg !63 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 23, !dbg !63 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 24, !dbg !63 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 25, !dbg !63 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 26, !dbg !63 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 27, !dbg !63 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 28, !dbg !63 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 29, !dbg !63 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 30, !dbg !63 + %568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 31, !dbg !63 + %569 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %537, float %538, float %539, float %540, float %541, float %542, float %543, float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, i64 %531, i64 %536, i1 true) #2, !dbg !63 + %570 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !63 + %571 = lshr exact i32 %570, 4, !dbg !63 + %572 = and i32 %571, 16383, !dbg !63 + %573 = zext nneg i32 %572 to i64, !dbg !63 + %574 = or disjoint i64 %573, 4611686293372403712, !dbg !63 + %575 = add i32 %435, 8192, !dbg !63 + %576 = lshr exact i32 %575, 4, !dbg !63 + %577 = and i32 %576, 16383, !dbg !63 + %578 = zext nneg i32 %577 to i64, !dbg !63 + %579 = or disjoint i64 %578, 4611686293338849280, !dbg !63 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 0, !dbg !63 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 1, !dbg !63 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 2, !dbg !63 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 3, !dbg !63 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 4, !dbg !63 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 5, !dbg !63 + %586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 6, !dbg !63 + %587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 7, !dbg !63 + %588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 8, !dbg !63 + %589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 9, !dbg !63 + %590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 10, !dbg !63 + %591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 11, !dbg !63 + %592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 12, !dbg !63 + %593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 13, !dbg !63 + %594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 14, !dbg !63 + %595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 15, !dbg !63 + %596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 16, !dbg !63 + %597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 17, !dbg !63 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 18, !dbg !63 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 19, !dbg !63 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 20, !dbg !63 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 21, !dbg !63 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 22, !dbg !63 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 23, !dbg !63 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 24, !dbg !63 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 25, !dbg !63 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 26, !dbg !63 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 27, !dbg !63 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 28, !dbg !63 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 29, !dbg !63 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 30, !dbg !63 + %611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 31, !dbg !63 + %612 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, float %611, i64 %574, i64 %579, i1 true) #2, !dbg !63 + %613 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !63 + %614 = lshr exact i32 %613, 4, !dbg !63 + %615 = and i32 %614, 16383, !dbg !63 + %616 = zext nneg i32 %615 to i64, !dbg !63 + %617 = or disjoint i64 %616, 4611686293372403712, !dbg !63 + %618 = add i32 %435, 8224, !dbg !63 + %619 = lshr exact i32 %618, 4, !dbg !63 + %620 = and i32 %619, 16383, !dbg !63 + %621 = zext nneg i32 %620 to i64, !dbg !63 + %622 = or disjoint i64 %621, 4611686293338849280, !dbg !63 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 0, !dbg !63 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 1, !dbg !63 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 2, !dbg !63 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 3, !dbg !63 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 4, !dbg !63 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 5, !dbg !63 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 6, !dbg !63 + %630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 7, !dbg !63 + %631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 8, !dbg !63 + %632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 9, !dbg !63 + %633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 10, !dbg !63 + %634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 11, !dbg !63 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 12, !dbg !63 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 13, !dbg !63 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 14, !dbg !63 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 15, !dbg !63 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 16, !dbg !63 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 17, !dbg !63 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 18, !dbg !63 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 19, !dbg !63 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 20, !dbg !63 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 21, !dbg !63 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 22, !dbg !63 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 23, !dbg !63 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 24, !dbg !63 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 25, !dbg !63 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 26, !dbg !63 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 27, !dbg !63 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 28, !dbg !63 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 29, !dbg !63 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 30, !dbg !63 + %654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 31, !dbg !63 + %655 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %623, float %624, float %625, float %626, float %627, float %628, float %629, float %630, float %631, float %632, float %633, float %634, float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, float %654, i64 %617, i64 %622, i1 true) #2, !dbg !63 + %656 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !63 + %657 = lshr exact i32 %656, 4, !dbg !63 + %658 = and i32 %657, 16383, !dbg !63 + %659 = zext nneg i32 %658 to i64, !dbg !63 + %660 = or disjoint i64 %659, 4611686293372403712, !dbg !63 + %661 = add i32 %435, 8256, !dbg !63 + %662 = lshr exact i32 %661, 4, !dbg !63 + %663 = and i32 %662, 16383, !dbg !63 + %664 = zext nneg i32 %663 to i64, !dbg !63 + %665 = or disjoint i64 %664, 4611686293338849280, !dbg !63 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 0, !dbg !63 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 1, !dbg !63 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 2, !dbg !63 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 3, !dbg !63 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 4, !dbg !63 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 5, !dbg !63 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 6, !dbg !63 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 7, !dbg !63 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 8, !dbg !63 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 9, !dbg !63 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 10, !dbg !63 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 11, !dbg !63 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 12, !dbg !63 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 13, !dbg !63 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 14, !dbg !63 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 15, !dbg !63 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 16, !dbg !63 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 17, !dbg !63 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 18, !dbg !63 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 19, !dbg !63 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 20, !dbg !63 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 21, !dbg !63 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 22, !dbg !63 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 23, !dbg !63 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 24, !dbg !63 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 25, !dbg !63 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 26, !dbg !63 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 27, !dbg !63 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 28, !dbg !63 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 29, !dbg !63 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 30, !dbg !63 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 31, !dbg !63 + %698 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, i64 %660, i64 %665, i1 true) #2, !dbg !63 + %699 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !63 + %700 = lshr exact i32 %699, 4, !dbg !63 + %701 = and i32 %700, 16383, !dbg !63 + %702 = zext nneg i32 %701 to i64, !dbg !63 + %703 = or disjoint i64 %702, 4611686293372403712, !dbg !63 + %704 = add i32 %435, 8288, !dbg !63 + %705 = lshr exact i32 %704, 4, !dbg !63 + %706 = and i32 %705, 16383, !dbg !63 + %707 = zext nneg i32 %706 to i64, !dbg !63 + %708 = or disjoint i64 %707, 4611686293338849280, !dbg !63 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 0, !dbg !63 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 1, !dbg !63 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 2, !dbg !63 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 3, !dbg !63 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 4, !dbg !63 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 5, !dbg !63 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 6, !dbg !63 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 7, !dbg !63 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 8, !dbg !63 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 9, !dbg !63 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 10, !dbg !63 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 11, !dbg !63 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 12, !dbg !63 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 13, !dbg !63 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 14, !dbg !63 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 15, !dbg !63 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 16, !dbg !63 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 17, !dbg !63 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 18, !dbg !63 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 19, !dbg !63 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 20, !dbg !63 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 21, !dbg !63 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 22, !dbg !63 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 23, !dbg !63 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 24, !dbg !63 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 25, !dbg !63 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 26, !dbg !63 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 27, !dbg !63 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 28, !dbg !63 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 29, !dbg !63 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 30, !dbg !63 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 31, !dbg !63 + %741 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, i64 %703, i64 %708, i1 true) #2, !dbg !63 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 0, !dbg !63 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 1, !dbg !63 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 2, !dbg !63 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 3, !dbg !63 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 4, !dbg !63 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 5, !dbg !63 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 6, !dbg !63 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 7, !dbg !63 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 8, !dbg !63 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 9, !dbg !63 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 10, !dbg !63 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 11, !dbg !63 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 12, !dbg !63 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 13, !dbg !63 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 14, !dbg !63 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 15, !dbg !63 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 16, !dbg !63 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 17, !dbg !63 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 18, !dbg !63 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 19, !dbg !63 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 20, !dbg !63 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 21, !dbg !63 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 22, !dbg !63 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 23, !dbg !63 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 24, !dbg !63 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 25, !dbg !63 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 26, !dbg !63 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 27, !dbg !63 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 28, !dbg !63 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 29, !dbg !63 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 30, !dbg !63 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 31, !dbg !63 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !63 + %774 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %742, float %743, float %744, float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %426, i32 0, i32 0, float %353, float %354, float %355, float %356, float %357, float %358, float %359, float %360, float %361, float %362, float %363, float %364, float %365, float %366, float %367, float %368, float %369, float %370, float %371, float %372, float %373, float %374, float %375, float %376, float %377, float %378, float %379, float %380, float %381, float %382, float %383, float %384, float %385, float %386, float %387, float %388, float %389, float %390, float %391, float %392, float %393, float %394, float %395, float %396, float %397, float %398, float %399, float %400, float %401, float %402, float %403, float %404, float %405, float %406, float %407, float %408, float %409, float %410, float %411, float %412, float %413, float %414, float %415, float %416) #2, !dbg !63 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 0, !dbg !63 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 1, !dbg !63 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 2, !dbg !63 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 3, !dbg !63 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 4, !dbg !63 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 5, !dbg !63 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 6, !dbg !63 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 7, !dbg !63 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 8, !dbg !63 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 9, !dbg !63 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 10, !dbg !63 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 11, !dbg !63 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 12, !dbg !63 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 13, !dbg !63 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 14, !dbg !63 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 15, !dbg !63 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 16, !dbg !63 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 17, !dbg !63 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 18, !dbg !63 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 19, !dbg !63 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 20, !dbg !63 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 21, !dbg !63 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 22, !dbg !63 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 23, !dbg !63 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 24, !dbg !63 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 25, !dbg !63 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 26, !dbg !63 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 27, !dbg !63 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 28, !dbg !63 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 29, !dbg !63 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 30, !dbg !63 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 31, !dbg !63 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 38, !dbg !63 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 39, !dbg !63 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 40, !dbg !63 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 41, !dbg !63 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 42, !dbg !63 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 43, !dbg !63 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 44, !dbg !63 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 45, !dbg !63 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 46, !dbg !63 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 47, !dbg !63 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 48, !dbg !63 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 49, !dbg !63 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 50, !dbg !63 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 51, !dbg !63 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 52, !dbg !63 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 53, !dbg !63 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 54, !dbg !63 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 55, !dbg !63 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 56, !dbg !63 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 57, !dbg !63 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 58, !dbg !63 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 59, !dbg !63 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 60, !dbg !63 + %830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 61, !dbg !63 + %831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 62, !dbg !63 + %832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 63, !dbg !63 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 64, !dbg !63 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 65, !dbg !63 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 66, !dbg !63 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 67, !dbg !63 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 68, !dbg !63 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 69, !dbg !63 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 70, !dbg !63 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 71, !dbg !63 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 72, !dbg !63 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 73, !dbg !63 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 74, !dbg !63 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 75, !dbg !63 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 76, !dbg !63 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 77, !dbg !63 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 78, !dbg !63 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 79, !dbg !63 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 80, !dbg !63 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 81, !dbg !63 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 82, !dbg !63 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 83, !dbg !63 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 84, !dbg !63 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 85, !dbg !63 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 86, !dbg !63 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 87, !dbg !63 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 88, !dbg !63 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 89, !dbg !63 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 90, !dbg !63 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 91, !dbg !63 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 92, !dbg !63 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 93, !dbg !63 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 94, !dbg !63 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 95, !dbg !63 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 96, !dbg !63 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 97, !dbg !63 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 98, !dbg !63 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 99, !dbg !63 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 100, !dbg !63 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 101, !dbg !63 + %871 = fmul float %775, 0x3FB6A09E60000000, !dbg !66 + %872 = fmul float %776, 0x3FB6A09E60000000, !dbg !66 + %873 = fmul float %777, 0x3FB6A09E60000000, !dbg !66 + %874 = fmul float %778, 0x3FB6A09E60000000, !dbg !66 + %875 = fmul float %779, 0x3FB6A09E60000000, !dbg !66 + %876 = fmul float %780, 0x3FB6A09E60000000, !dbg !66 + %877 = fmul float %781, 0x3FB6A09E60000000, !dbg !66 + %878 = fmul float %782, 0x3FB6A09E60000000, !dbg !66 + %879 = fmul float %783, 0x3FB6A09E60000000, !dbg !66 + %880 = fmul float %784, 0x3FB6A09E60000000, !dbg !66 + %881 = fmul float %785, 0x3FB6A09E60000000, !dbg !66 + %882 = fmul float %786, 0x3FB6A09E60000000, !dbg !66 + %883 = fmul float %787, 0x3FB6A09E60000000, !dbg !66 + %884 = fmul float %788, 0x3FB6A09E60000000, !dbg !66 + %885 = fmul float %789, 0x3FB6A09E60000000, !dbg !66 + %886 = fmul float %790, 0x3FB6A09E60000000, !dbg !66 + %887 = fmul float %791, 0x3FB6A09E60000000, !dbg !66 + %888 = fmul float %792, 0x3FB6A09E60000000, !dbg !66 + %889 = fmul float %793, 0x3FB6A09E60000000, !dbg !66 + %890 = fmul float %794, 0x3FB6A09E60000000, !dbg !66 + %891 = fmul float %795, 0x3FB6A09E60000000, !dbg !66 + %892 = fmul float %796, 0x3FB6A09E60000000, !dbg !66 + %893 = fmul float %797, 0x3FB6A09E60000000, !dbg !66 + %894 = fmul float %798, 0x3FB6A09E60000000, !dbg !66 + %895 = fmul float %799, 0x3FB6A09E60000000, !dbg !66 + %896 = fmul float %800, 0x3FB6A09E60000000, !dbg !66 + %897 = fmul float %801, 0x3FB6A09E60000000, !dbg !66 + %898 = fmul float %802, 0x3FB6A09E60000000, !dbg !66 + %899 = fmul float %803, 0x3FB6A09E60000000, !dbg !66 + %900 = fmul float %804, 0x3FB6A09E60000000, !dbg !66 + %901 = fmul float %805, 0x3FB6A09E60000000, !dbg !66 + %902 = fmul float %806, 0x3FB6A09E60000000, !dbg !66 + %903 = extractelement <16 x i32> %419, i64 15, !dbg !67 + %904 = icmp slt i32 %903, %11, !dbg !67 + %905 = extractelement <16 x i32> %419, i64 14, !dbg !67 + %906 = icmp slt i32 %905, %11, !dbg !67 + %907 = extractelement <16 x i32> %419, i64 13, !dbg !67 + %908 = icmp slt i32 %907, %11, !dbg !67 + %909 = extractelement <16 x i32> %419, i64 12, !dbg !67 + %910 = icmp slt i32 %909, %11, !dbg !67 + %911 = extractelement <16 x i32> %419, i64 11, !dbg !67 + %912 = icmp slt i32 %911, %11, !dbg !67 + %913 = extractelement <16 x i32> %419, i64 10, !dbg !67 + %914 = icmp slt i32 %913, %11, !dbg !67 + %915 = extractelement <16 x i32> %419, i64 9, !dbg !67 + %916 = icmp slt i32 %915, %11, !dbg !67 + %917 = extractelement <16 x i32> %419, i64 8, !dbg !67 + %918 = icmp slt i32 %917, %11, !dbg !67 + %919 = extractelement <16 x i32> %419, i64 7, !dbg !67 + %920 = icmp slt i32 %919, %11, !dbg !67 + %921 = extractelement <16 x i32> %419, i64 6, !dbg !67 + %922 = icmp slt i32 %921, %11, !dbg !67 + %923 = extractelement <16 x i32> %419, i64 5, !dbg !67 + %924 = icmp slt i32 %923, %11, !dbg !67 + %925 = extractelement <16 x i32> %419, i64 4, !dbg !67 + %926 = icmp slt i32 %925, %11, !dbg !67 + %927 = extractelement <16 x i32> %419, i64 3, !dbg !67 + %928 = icmp slt i32 %927, %11, !dbg !67 + %929 = extractelement <16 x i32> %419, i64 2, !dbg !67 + %930 = icmp slt i32 %929, %11, !dbg !67 + %931 = extractelement <16 x i32> %419, i64 1, !dbg !67 + %932 = icmp slt i32 %931, %11, !dbg !67 + %933 = extractelement <16 x i32> %419, i64 0, !dbg !67 + %934 = icmp slt i32 %933, %11, !dbg !67 + %935 = srem <16 x i32> %419, %343, !dbg !49 + %936 = shufflevector <16 x i32> %935, <16 x i32> poison, <32 x i32> , !dbg !49 + %937 = extractelement <16 x i32> %935, i64 15, !dbg !68 + %938 = icmp sge i32 %220, %937, !dbg !69 + %939 = extractelement <16 x i32> %935, i64 14, !dbg !68 + %940 = icmp sge i32 %220, %939, !dbg !69 + %941 = icmp sge i32 %222, %937, !dbg !69 + %942 = icmp sge i32 %222, %939, !dbg !69 + %943 = extractelement <16 x i32> %935, i64 13, !dbg !68 + %944 = icmp sge i32 %220, %943, !dbg !69 + %945 = extractelement <16 x i32> %935, i64 12, !dbg !68 + %946 = icmp sge i32 %220, %945, !dbg !69 + %947 = icmp sge i32 %222, %943, !dbg !69 + %948 = icmp sge i32 %222, %945, !dbg !69 + %949 = extractelement <16 x i32> %935, i64 11, !dbg !68 + %950 = icmp sge i32 %220, %949, !dbg !69 + %951 = extractelement <16 x i32> %935, i64 10, !dbg !68 + %952 = icmp sge i32 %220, %951, !dbg !69 + %953 = icmp sge i32 %222, %949, !dbg !69 + %954 = icmp sge i32 %222, %951, !dbg !69 + %955 = extractelement <16 x i32> %935, i64 9, !dbg !68 + %956 = icmp sge i32 %220, %955, !dbg !69 + %957 = extractelement <16 x i32> %935, i64 8, !dbg !68 + %958 = icmp sge i32 %220, %957, !dbg !69 + %959 = icmp sge i32 %222, %955, !dbg !69 + %960 = icmp sge i32 %222, %957, !dbg !69 + %961 = extractelement <16 x i32> %935, i64 7, !dbg !68 + %962 = icmp sge i32 %220, %961, !dbg !69 + %963 = extractelement <16 x i32> %935, i64 6, !dbg !68 + %964 = icmp sge i32 %220, %963, !dbg !69 + %965 = icmp sge i32 %222, %961, !dbg !69 + %966 = icmp sge i32 %222, %963, !dbg !69 + %967 = extractelement <16 x i32> %935, i64 5, !dbg !68 + %968 = icmp sge i32 %220, %967, !dbg !69 + %969 = extractelement <16 x i32> %935, i64 4, !dbg !68 + %970 = icmp sge i32 %220, %969, !dbg !69 + %971 = icmp sge i32 %222, %967, !dbg !69 + %972 = icmp sge i32 %222, %969, !dbg !69 + %973 = extractelement <16 x i32> %935, i64 3, !dbg !68 + %974 = icmp sge i32 %220, %973, !dbg !69 + %975 = extractelement <16 x i32> %935, i64 2, !dbg !68 + %976 = icmp sge i32 %220, %975, !dbg !69 + %977 = icmp sge i32 %222, %973, !dbg !69 + %978 = icmp sge i32 %222, %975, !dbg !69 + %979 = extractelement <16 x i32> %935, i64 1, !dbg !68 + %980 = icmp sge i32 %220, %979, !dbg !69 + %981 = extractelement <16 x i32> %935, i64 0, !dbg !68 + %982 = icmp sge i32 %220, %981, !dbg !69 + %983 = icmp sge i32 %222, %979, !dbg !69 + %984 = icmp sge i32 %222, %981, !dbg !69 + %985 = and i1 %224, %938, !dbg !70 + %986 = and i1 %224, %940, !dbg !70 + %987 = and i1 %225, %941, !dbg !70 + %988 = and i1 %225, %942, !dbg !70 + %989 = and i1 %224, %944, !dbg !70 + %990 = and i1 %224, %946, !dbg !70 + %991 = and i1 %225, %947, !dbg !70 + %992 = and i1 %225, %948, !dbg !70 + %993 = and i1 %224, %950, !dbg !70 + %994 = and i1 %224, %952, !dbg !70 + %995 = and i1 %225, %953, !dbg !70 + %996 = and i1 %225, %954, !dbg !70 + %997 = and i1 %224, %956, !dbg !70 + %998 = and i1 %224, %958, !dbg !70 + %999 = and i1 %225, %959, !dbg !70 + %1000 = and i1 %225, %960, !dbg !70 + %1001 = and i1 %224, %962, !dbg !70 + %1002 = and i1 %224, %964, !dbg !70 + %1003 = and i1 %225, %965, !dbg !70 + %1004 = and i1 %225, %966, !dbg !70 + %1005 = and i1 %224, %968, !dbg !70 + %1006 = and i1 %224, %970, !dbg !70 + %1007 = and i1 %225, %971, !dbg !70 + %1008 = and i1 %225, %972, !dbg !70 + %1009 = and i1 %224, %974, !dbg !70 + %1010 = and i1 %224, %976, !dbg !70 + %1011 = and i1 %225, %977, !dbg !70 + %1012 = and i1 %225, %978, !dbg !70 + %1013 = and i1 %224, %980, !dbg !70 + %1014 = and i1 %224, %982, !dbg !70 + %1015 = and i1 %225, %983, !dbg !70 + %1016 = and i1 %225, %984, !dbg !70 + %1017 = icmp sgt i32 %937, 2047, !dbg !71 + %1018 = icmp sgt i32 %939, 2047, !dbg !71 + %1019 = icmp sgt i32 %943, 2047, !dbg !71 + %1020 = icmp sgt i32 %945, 2047, !dbg !71 + %1021 = icmp sgt i32 %949, 2047, !dbg !71 + %1022 = icmp sgt i32 %951, 2047, !dbg !71 + %1023 = icmp sgt i32 %955, 2047, !dbg !71 + %1024 = icmp sgt i32 %957, 2047, !dbg !71 + %1025 = icmp sgt i32 %961, 2047, !dbg !71 + %1026 = icmp sgt i32 %963, 2047, !dbg !71 + %1027 = icmp sgt i32 %967, 2047, !dbg !71 + %1028 = icmp sgt i32 %969, 2047, !dbg !71 + %1029 = icmp sgt i32 %973, 2047, !dbg !71 + %1030 = icmp sgt i32 %975, 2047, !dbg !71 + %1031 = icmp sgt i32 %979, 2047, !dbg !71 + %1032 = icmp sgt i32 %981, 2047, !dbg !71 + %1033 = trunc <16 x i32> %935 to <16 x i16>, !dbg !72 + %1034 = and <16 x i16> %1033, splat (i16 2047), !dbg !72 + %1035 = zext nneg <16 x i16> %1034 to <16 x i64>, !dbg !65 + %1036 = icmp sgt <16 x i64> %345, %1035, !dbg !65 + %1037 = extractelement <16 x i1> %1036, i64 15, !dbg !73 + %1038 = and i1 %1017, %1037, !dbg !73 + %1039 = extractelement <16 x i1> %1036, i64 14, !dbg !73 + %1040 = and i1 %1018, %1039, !dbg !73 + %1041 = extractelement <16 x i1> %1036, i64 13, !dbg !73 + %1042 = and i1 %1019, %1041, !dbg !73 + %1043 = extractelement <16 x i1> %1036, i64 12, !dbg !73 + %1044 = and i1 %1020, %1043, !dbg !73 + %1045 = extractelement <16 x i1> %1036, i64 11, !dbg !73 + %1046 = and i1 %1021, %1045, !dbg !73 + %1047 = extractelement <16 x i1> %1036, i64 10, !dbg !73 + %1048 = and i1 %1022, %1047, !dbg !73 + %1049 = extractelement <16 x i1> %1036, i64 9, !dbg !73 + %1050 = and i1 %1023, %1049, !dbg !73 + %1051 = extractelement <16 x i1> %1036, i64 8, !dbg !73 + %1052 = and i1 %1024, %1051, !dbg !73 + %1053 = extractelement <16 x i1> %1036, i64 7, !dbg !73 + %1054 = and i1 %1025, %1053, !dbg !73 + %1055 = extractelement <16 x i1> %1036, i64 6, !dbg !73 + %1056 = and i1 %1026, %1055, !dbg !73 + %1057 = extractelement <16 x i1> %1036, i64 5, !dbg !73 + %1058 = and i1 %1027, %1057, !dbg !73 + %1059 = extractelement <16 x i1> %1036, i64 4, !dbg !73 + %1060 = and i1 %1028, %1059, !dbg !73 + %1061 = extractelement <16 x i1> %1036, i64 3, !dbg !73 + %1062 = and i1 %1029, %1061, !dbg !73 + %1063 = extractelement <16 x i1> %1036, i64 2, !dbg !73 + %1064 = and i1 %1030, %1063, !dbg !73 + %1065 = extractelement <16 x i1> %1036, i64 1, !dbg !73 + %1066 = and i1 %1031, %1065, !dbg !73 + %1067 = extractelement <16 x i1> %1036, i64 0, !dbg !73 + %1068 = and i1 %1032, %1067, !dbg !73 + %1069 = sub <32 x i32> %936, %215, !dbg !74 + %1070 = and <32 x i32> %1069, splat (i32 2047), !dbg !75 + %1071 = icmp eq <32 x i32> %1070, zeroinitializer, !dbg !76 + %1072 = extractelement <32 x i1> %1071, i64 31, !dbg !77 + %1073 = and i1 %1072, %1038, !dbg !77 + %1074 = extractelement <32 x i1> %1071, i64 30, !dbg !77 + %1075 = and i1 %1074, %1040, !dbg !77 + %1076 = extractelement <32 x i1> %1071, i64 29, !dbg !77 + %1077 = and i1 %1076, %1038, !dbg !77 + %1078 = extractelement <32 x i1> %1071, i64 28, !dbg !77 + %1079 = and i1 %1078, %1040, !dbg !77 + %1080 = extractelement <32 x i1> %1071, i64 27, !dbg !77 + %1081 = and i1 %1080, %1042, !dbg !77 + %1082 = extractelement <32 x i1> %1071, i64 26, !dbg !77 + %1083 = and i1 %1082, %1044, !dbg !77 + %1084 = extractelement <32 x i1> %1071, i64 25, !dbg !77 + %1085 = and i1 %1084, %1042, !dbg !77 + %1086 = extractelement <32 x i1> %1071, i64 24, !dbg !77 + %1087 = and i1 %1086, %1044, !dbg !77 + %1088 = extractelement <32 x i1> %1071, i64 23, !dbg !77 + %1089 = and i1 %1088, %1046, !dbg !77 + %1090 = extractelement <32 x i1> %1071, i64 22, !dbg !77 + %1091 = and i1 %1090, %1048, !dbg !77 + %1092 = extractelement <32 x i1> %1071, i64 21, !dbg !77 + %1093 = and i1 %1092, %1046, !dbg !77 + %1094 = extractelement <32 x i1> %1071, i64 20, !dbg !77 + %1095 = and i1 %1094, %1048, !dbg !77 + %1096 = extractelement <32 x i1> %1071, i64 19, !dbg !77 + %1097 = and i1 %1096, %1050, !dbg !77 + %1098 = extractelement <32 x i1> %1071, i64 18, !dbg !77 + %1099 = and i1 %1098, %1052, !dbg !77 + %1100 = extractelement <32 x i1> %1071, i64 17, !dbg !77 + %1101 = and i1 %1100, %1050, !dbg !77 + %1102 = extractelement <32 x i1> %1071, i64 16, !dbg !77 + %1103 = and i1 %1102, %1052, !dbg !77 + %1104 = extractelement <32 x i1> %1071, i64 15, !dbg !77 + %1105 = and i1 %1104, %1054, !dbg !77 + %1106 = extractelement <32 x i1> %1071, i64 14, !dbg !77 + %1107 = and i1 %1106, %1056, !dbg !77 + %1108 = extractelement <32 x i1> %1071, i64 13, !dbg !77 + %1109 = and i1 %1108, %1054, !dbg !77 + %1110 = extractelement <32 x i1> %1071, i64 12, !dbg !77 + %1111 = and i1 %1110, %1056, !dbg !77 + %1112 = extractelement <32 x i1> %1071, i64 11, !dbg !77 + %1113 = and i1 %1112, %1058, !dbg !77 + %1114 = extractelement <32 x i1> %1071, i64 10, !dbg !77 + %1115 = and i1 %1114, %1060, !dbg !77 + %1116 = extractelement <32 x i1> %1071, i64 9, !dbg !77 + %1117 = and i1 %1116, %1058, !dbg !77 + %1118 = extractelement <32 x i1> %1071, i64 8, !dbg !77 + %1119 = and i1 %1118, %1060, !dbg !77 + %1120 = extractelement <32 x i1> %1071, i64 7, !dbg !77 + %1121 = and i1 %1120, %1062, !dbg !77 + %1122 = extractelement <32 x i1> %1071, i64 6, !dbg !77 + %1123 = and i1 %1122, %1064, !dbg !77 + %1124 = extractelement <32 x i1> %1071, i64 5, !dbg !77 + %1125 = and i1 %1124, %1062, !dbg !77 + %1126 = extractelement <32 x i1> %1071, i64 4, !dbg !77 + %1127 = and i1 %1126, %1064, !dbg !77 + %1128 = extractelement <32 x i1> %1071, i64 3, !dbg !77 + %1129 = and i1 %1128, %1066, !dbg !77 + %1130 = extractelement <32 x i1> %1071, i64 2, !dbg !77 + %1131 = and i1 %1130, %1068, !dbg !77 + %1132 = extractelement <32 x i1> %1071, i64 1, !dbg !77 + %1133 = and i1 %1132, %1066, !dbg !77 + %1134 = extractelement <32 x i1> %1071, i64 0, !dbg !77 + %1135 = and i1 %1134, %1068, !dbg !77 + %1136 = or i1 %985, %1073, !dbg !78 + %1137 = or i1 %986, %1075, !dbg !78 + %1138 = or i1 %987, %1077, !dbg !78 + %1139 = or i1 %988, %1079, !dbg !78 + %1140 = or i1 %989, %1081, !dbg !78 + %1141 = or i1 %990, %1083, !dbg !78 + %1142 = or i1 %991, %1085, !dbg !78 + %1143 = or i1 %992, %1087, !dbg !78 + %1144 = or i1 %993, %1089, !dbg !78 + %1145 = or i1 %994, %1091, !dbg !78 + %1146 = or i1 %995, %1093, !dbg !78 + %1147 = or i1 %996, %1095, !dbg !78 + %1148 = or i1 %997, %1097, !dbg !78 + %1149 = or i1 %998, %1099, !dbg !78 + %1150 = or i1 %999, %1101, !dbg !78 + %1151 = or i1 %1000, %1103, !dbg !78 + %1152 = or i1 %1001, %1105, !dbg !78 + %1153 = or i1 %1002, %1107, !dbg !78 + %1154 = or i1 %1003, %1109, !dbg !78 + %1155 = or i1 %1004, %1111, !dbg !78 + %1156 = or i1 %1005, %1113, !dbg !78 + %1157 = or i1 %1006, %1115, !dbg !78 + %1158 = or i1 %1007, %1117, !dbg !78 + %1159 = or i1 %1008, %1119, !dbg !78 + %1160 = or i1 %1009, %1121, !dbg !78 + %1161 = or i1 %1010, %1123, !dbg !78 + %1162 = or i1 %1011, %1125, !dbg !78 + %1163 = or i1 %1012, %1127, !dbg !78 + %1164 = or i1 %1013, %1129, !dbg !78 + %1165 = or i1 %1014, %1131, !dbg !78 + %1166 = or i1 %1015, %1133, !dbg !78 + %1167 = or i1 %1016, %1135, !dbg !78 + %1168 = select i1 %904, i1 %1136, i1 false, !dbg !79 + %1169 = select i1 %906, i1 %1137, i1 false, !dbg !79 + %1170 = select i1 %904, i1 %1138, i1 false, !dbg !79 + %1171 = select i1 %906, i1 %1139, i1 false, !dbg !79 + %1172 = select i1 %908, i1 %1140, i1 false, !dbg !79 + %1173 = select i1 %910, i1 %1141, i1 false, !dbg !79 + %1174 = select i1 %908, i1 %1142, i1 false, !dbg !79 + %1175 = select i1 %910, i1 %1143, i1 false, !dbg !79 + %1176 = select i1 %912, i1 %1144, i1 false, !dbg !79 + %1177 = select i1 %914, i1 %1145, i1 false, !dbg !79 + %1178 = select i1 %912, i1 %1146, i1 false, !dbg !79 + %1179 = select i1 %914, i1 %1147, i1 false, !dbg !79 + %1180 = select i1 %916, i1 %1148, i1 false, !dbg !79 + %1181 = select i1 %918, i1 %1149, i1 false, !dbg !79 + %1182 = select i1 %916, i1 %1150, i1 false, !dbg !79 + %1183 = select i1 %918, i1 %1151, i1 false, !dbg !79 + %1184 = select i1 %920, i1 %1152, i1 false, !dbg !79 + %1185 = select i1 %922, i1 %1153, i1 false, !dbg !79 + %1186 = select i1 %920, i1 %1154, i1 false, !dbg !79 + %1187 = select i1 %922, i1 %1155, i1 false, !dbg !79 + %1188 = select i1 %924, i1 %1156, i1 false, !dbg !79 + %1189 = select i1 %926, i1 %1157, i1 false, !dbg !79 + %1190 = select i1 %924, i1 %1158, i1 false, !dbg !79 + %1191 = select i1 %926, i1 %1159, i1 false, !dbg !79 + %1192 = select i1 %928, i1 %1160, i1 false, !dbg !79 + %1193 = select i1 %930, i1 %1161, i1 false, !dbg !79 + %1194 = select i1 %928, i1 %1162, i1 false, !dbg !79 + %1195 = select i1 %930, i1 %1163, i1 false, !dbg !79 + %1196 = select i1 %932, i1 %1164, i1 false, !dbg !79 + %1197 = select i1 %934, i1 %1165, i1 false, !dbg !79 + %1198 = select i1 %932, i1 %1166, i1 false, !dbg !79 + %1199 = select i1 %934, i1 %1167, i1 false, !dbg !79 + %1200 = fmul float %871, 0x3FF7154760000000, !dbg !80 + %1201 = select i1 %1168, float %1200, float 0xFFF0000000000000, !dbg !81 + %1202 = fmul float %872, 0x3FF7154760000000, !dbg !80 + %1203 = select i1 %1169, float %1202, float 0xFFF0000000000000, !dbg !81 + %1204 = fmul float %873, 0x3FF7154760000000, !dbg !80 + %1205 = select i1 %1170, float %1204, float 0xFFF0000000000000, !dbg !81 + %1206 = fmul float %874, 0x3FF7154760000000, !dbg !80 + %1207 = select i1 %1171, float %1206, float 0xFFF0000000000000, !dbg !81 + %1208 = fmul float %875, 0x3FF7154760000000, !dbg !80 + %1209 = select i1 %1172, float %1208, float 0xFFF0000000000000, !dbg !81 + %1210 = fmul float %876, 0x3FF7154760000000, !dbg !80 + %1211 = select i1 %1173, float %1210, float 0xFFF0000000000000, !dbg !81 + %1212 = fmul float %877, 0x3FF7154760000000, !dbg !80 + %1213 = select i1 %1174, float %1212, float 0xFFF0000000000000, !dbg !81 + %1214 = fmul float %878, 0x3FF7154760000000, !dbg !80 + %1215 = select i1 %1175, float %1214, float 0xFFF0000000000000, !dbg !81 + %1216 = fmul float %879, 0x3FF7154760000000, !dbg !80 + %1217 = select i1 %1176, float %1216, float 0xFFF0000000000000, !dbg !81 + %1218 = fmul float %880, 0x3FF7154760000000, !dbg !80 + %1219 = select i1 %1177, float %1218, float 0xFFF0000000000000, !dbg !81 + %1220 = fmul float %881, 0x3FF7154760000000, !dbg !80 + %1221 = select i1 %1178, float %1220, float 0xFFF0000000000000, !dbg !81 + %1222 = fmul float %882, 0x3FF7154760000000, !dbg !80 + %1223 = select i1 %1179, float %1222, float 0xFFF0000000000000, !dbg !81 + %1224 = fmul float %883, 0x3FF7154760000000, !dbg !80 + %1225 = select i1 %1180, float %1224, float 0xFFF0000000000000, !dbg !81 + %1226 = fmul float %884, 0x3FF7154760000000, !dbg !80 + %1227 = select i1 %1181, float %1226, float 0xFFF0000000000000, !dbg !81 + %1228 = fmul float %885, 0x3FF7154760000000, !dbg !80 + %1229 = select i1 %1182, float %1228, float 0xFFF0000000000000, !dbg !81 + %1230 = fmul float %886, 0x3FF7154760000000, !dbg !80 + %1231 = select i1 %1183, float %1230, float 0xFFF0000000000000, !dbg !81 + %1232 = fmul float %887, 0x3FF7154760000000, !dbg !80 + %1233 = select i1 %1184, float %1232, float 0xFFF0000000000000, !dbg !81 + %1234 = fmul float %888, 0x3FF7154760000000, !dbg !80 + %1235 = select i1 %1185, float %1234, float 0xFFF0000000000000, !dbg !81 + %1236 = fmul float %889, 0x3FF7154760000000, !dbg !80 + %1237 = select i1 %1186, float %1236, float 0xFFF0000000000000, !dbg !81 + %1238 = fmul float %890, 0x3FF7154760000000, !dbg !80 + %1239 = select i1 %1187, float %1238, float 0xFFF0000000000000, !dbg !81 + %1240 = fmul float %891, 0x3FF7154760000000, !dbg !80 + %1241 = select i1 %1188, float %1240, float 0xFFF0000000000000, !dbg !81 + %1242 = fmul float %892, 0x3FF7154760000000, !dbg !80 + %1243 = select i1 %1189, float %1242, float 0xFFF0000000000000, !dbg !81 + %1244 = fmul float %893, 0x3FF7154760000000, !dbg !80 + %1245 = select i1 %1190, float %1244, float 0xFFF0000000000000, !dbg !81 + %1246 = fmul float %894, 0x3FF7154760000000, !dbg !80 + %1247 = select i1 %1191, float %1246, float 0xFFF0000000000000, !dbg !81 + %1248 = fmul float %895, 0x3FF7154760000000, !dbg !80 + %1249 = select i1 %1192, float %1248, float 0xFFF0000000000000, !dbg !81 + %1250 = fmul float %896, 0x3FF7154760000000, !dbg !80 + %1251 = select i1 %1193, float %1250, float 0xFFF0000000000000, !dbg !81 + %1252 = fmul float %897, 0x3FF7154760000000, !dbg !80 + %1253 = select i1 %1194, float %1252, float 0xFFF0000000000000, !dbg !81 + %1254 = fmul float %898, 0x3FF7154760000000, !dbg !80 + %1255 = select i1 %1195, float %1254, float 0xFFF0000000000000, !dbg !81 + %1256 = fmul float %899, 0x3FF7154760000000, !dbg !80 + %1257 = select i1 %1196, float %1256, float 0xFFF0000000000000, !dbg !81 + %1258 = fmul float %900, 0x3FF7154760000000, !dbg !80 + %1259 = select i1 %1197, float %1258, float 0xFFF0000000000000, !dbg !81 + %1260 = fmul float %901, 0x3FF7154760000000, !dbg !80 + %1261 = select i1 %1198, float %1260, float 0xFFF0000000000000, !dbg !81 + %1262 = fmul float %902, 0x3FF7154760000000, !dbg !80 + %1263 = select i1 %1199, float %1262, float 0xFFF0000000000000, !dbg !81 + %1264 = tail call float @llvm.maxnum.f32(float %1201, float %1203), !dbg !82 + %1265 = tail call float @llvm.maxnum.f32(float %1205, float %1207), !dbg !82 + %1266 = tail call float @llvm.maxnum.f32(float %1264, float %1209), !dbg !82 + %1267 = tail call float @llvm.maxnum.f32(float %1266, float %1211), !dbg !82 + %1268 = tail call float @llvm.maxnum.f32(float %1265, float %1213), !dbg !82 + %1269 = tail call float @llvm.maxnum.f32(float %1268, float %1215), !dbg !82 + %1270 = tail call float @llvm.maxnum.f32(float %1267, float %1217), !dbg !82 + %1271 = tail call float @llvm.maxnum.f32(float %1270, float %1219), !dbg !82 + %1272 = tail call float @llvm.maxnum.f32(float %1269, float %1221), !dbg !82 + %1273 = tail call float @llvm.maxnum.f32(float %1272, float %1223), !dbg !82 + %1274 = tail call float @llvm.maxnum.f32(float %1271, float %1225), !dbg !82 + %1275 = tail call float @llvm.maxnum.f32(float %1274, float %1227), !dbg !82 + %1276 = tail call float @llvm.maxnum.f32(float %1273, float %1229), !dbg !82 + %1277 = tail call float @llvm.maxnum.f32(float %1276, float %1231), !dbg !82 + %1278 = tail call float @llvm.maxnum.f32(float %1275, float %1233), !dbg !82 + %1279 = tail call float @llvm.maxnum.f32(float %1278, float %1235), !dbg !82 + %1280 = tail call float @llvm.maxnum.f32(float %1277, float %1237), !dbg !82 + %1281 = tail call float @llvm.maxnum.f32(float %1280, float %1239), !dbg !82 + %1282 = tail call float @llvm.maxnum.f32(float %1279, float %1241), !dbg !82 + %1283 = tail call float @llvm.maxnum.f32(float %1282, float %1243), !dbg !82 + %1284 = tail call float @llvm.maxnum.f32(float %1281, float %1245), !dbg !82 + %1285 = tail call float @llvm.maxnum.f32(float %1284, float %1247), !dbg !82 + %1286 = tail call float @llvm.maxnum.f32(float %1283, float %1249), !dbg !82 + %1287 = tail call float @llvm.maxnum.f32(float %1286, float %1251), !dbg !82 + %1288 = tail call float @llvm.maxnum.f32(float %1285, float %1253), !dbg !82 + %1289 = tail call float @llvm.maxnum.f32(float %1288, float %1255), !dbg !82 + %1290 = tail call float @llvm.maxnum.f32(float %1287, float %1257), !dbg !82 + %1291 = tail call float @llvm.maxnum.f32(float %1290, float %1259), !dbg !82 + %1292 = tail call float @llvm.maxnum.f32(float %1289, float %1261), !dbg !82 + %1293 = tail call float @llvm.maxnum.f32(float %1292, float %1263), !dbg !82 + %1294 = bitcast float %1291 to i32, !dbg !83 + %1295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1294, i32 2, i32 31), !dbg !83 + %1296 = bitcast i32 %1295 to float, !dbg !83 + %1297 = bitcast float %1293 to i32, !dbg !83 + %1298 = tail call float @llvm.maxnum.f32(float %1291, float %1296), !dbg !82 + %1299 = bitcast float %1298 to i32, !dbg !83 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1299, i32 1, i32 31), !dbg !83 + %1301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1297, i32 2, i32 31), !dbg !83 + %1302 = bitcast i32 %1301 to float, !dbg !83 + %1303 = tail call float @llvm.maxnum.f32(float %1293, float %1302), !dbg !82 + %1304 = bitcast float %1303 to i32, !dbg !83 + %1305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1304, i32 1, i32 31), !dbg !83 + %1306 = insertelement <2 x i32> poison, i32 %1300, i64 0, !dbg !83 + %1307 = insertelement <2 x i32> %1306, i32 %1305, i64 1, !dbg !83 + %1308 = bitcast <2 x i32> %1307 to <2 x float>, !dbg !83 + %1309 = insertelement <2 x float> poison, float %1298, i64 0, !dbg !82 + %1310 = insertelement <2 x float> %1309, float %1303, i64 1, !dbg !82 + %1311 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %1310, <2 x float> %1308), !dbg !82 + %1312 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %418, <2 x float> %1311), !dbg !84 + %1313 = extractelement <2 x float> %1312, i64 0, !dbg !85 + %1314 = fcmp oeq float %1313, 0xFFF0000000000000, !dbg !86 + %1315 = extractelement <2 x float> %1312, i64 1, !dbg !85 + %1316 = fcmp oeq float %1315, 0xFFF0000000000000, !dbg !86 + %1317 = select i1 %1314, float 0.000000e+00, float %1313, !dbg !85 + %1318 = select i1 %1316, float 0.000000e+00, float %1315, !dbg !85 + %1319 = extractelement <2 x float> %418, i64 0, !dbg !87 + %1320 = fsub float %1319, %1317, !dbg !87 + %1321 = extractelement <2 x float> %418, i64 1, !dbg !87 + %1322 = fsub float %1321, %1318, !dbg !87 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i254 = icmp eq i32 %1323, 0, !dbg !88 + br i1 %.not.i254, label %1326, label %1324, !dbg !88 + +1324: ; preds = %346 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1320) #2, !dbg !88 + br label %__nv_exp2f.exit256, !dbg !88 + +1326: ; preds = %346 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1320) #2, !dbg !88 + br label %__nv_exp2f.exit256, !dbg !88 + +__nv_exp2f.exit256: ; preds = %1324, %1326 + %.0.i255 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !88 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i257 = icmp eq i32 %1328, 0, !dbg !88 + br i1 %.not.i257, label %1331, label %1329, !dbg !88 + +1329: ; preds = %__nv_exp2f.exit256 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1322) #2, !dbg !88 + br label %__nv_exp2f.exit259, !dbg !88 + +1331: ; preds = %__nv_exp2f.exit256 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1322) #2, !dbg !88 + br label %__nv_exp2f.exit259, !dbg !88 + +__nv_exp2f.exit259: ; preds = %1329, %1331 + %.0.i258 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !88 + %1333 = fsub float %1201, %1317, !dbg !89 + %1334 = fsub float %1203, %1317, !dbg !89 + %1335 = fsub float %1205, %1318, !dbg !89 + %1336 = fsub float %1207, %1318, !dbg !89 + %1337 = fsub float %1209, %1317, !dbg !89 + %1338 = fsub float %1211, %1317, !dbg !89 + %1339 = fsub float %1213, %1318, !dbg !89 + %1340 = fsub float %1215, %1318, !dbg !89 + %1341 = fsub float %1217, %1317, !dbg !89 + %1342 = fsub float %1219, %1317, !dbg !89 + %1343 = fsub float %1221, %1318, !dbg !89 + %1344 = fsub float %1223, %1318, !dbg !89 + %1345 = fsub float %1225, %1317, !dbg !89 + %1346 = fsub float %1227, %1317, !dbg !89 + %1347 = fsub float %1229, %1318, !dbg !89 + %1348 = fsub float %1231, %1318, !dbg !89 + %1349 = fsub float %1233, %1317, !dbg !89 + %1350 = fsub float %1235, %1317, !dbg !89 + %1351 = fsub float %1237, %1318, !dbg !89 + %1352 = fsub float %1239, %1318, !dbg !89 + %1353 = fsub float %1241, %1317, !dbg !89 + %1354 = fsub float %1243, %1317, !dbg !89 + %1355 = fsub float %1245, %1318, !dbg !89 + %1356 = fsub float %1247, %1318, !dbg !89 + %1357 = fsub float %1249, %1317, !dbg !89 + %1358 = fsub float %1251, %1317, !dbg !89 + %1359 = fsub float %1253, %1318, !dbg !89 + %1360 = fsub float %1255, %1318, !dbg !89 + %1361 = fsub float %1257, %1317, !dbg !89 + %1362 = fsub float %1259, %1317, !dbg !89 + %1363 = fsub float %1261, %1318, !dbg !89 + %1364 = fsub float %1263, %1318, !dbg !89 + %1365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i260 = icmp eq i32 %1365, 0, !dbg !90 + br i1 %.not.i260, label %1368, label %1366, !dbg !90 + +1366: ; preds = %__nv_exp2f.exit259 + %1367 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1333) #2, !dbg !90 + br label %__nv_exp2f.exit262, !dbg !90 + +1368: ; preds = %__nv_exp2f.exit259 + %1369 = tail call float @llvm.nvvm.ex2.approx.f(float %1333) #2, !dbg !90 + br label %__nv_exp2f.exit262, !dbg !90 + +__nv_exp2f.exit262: ; preds = %1366, %1368 + %.0.i261 = phi float [ %1367, %1366 ], [ %1369, %1368 ], !dbg !90 + %1370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i263 = icmp eq i32 %1370, 0, !dbg !90 + br i1 %.not.i263, label %1373, label %1371, !dbg !90 + +1371: ; preds = %__nv_exp2f.exit262 + %1372 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1334) #2, !dbg !90 + br label %__nv_exp2f.exit265, !dbg !90 + +1373: ; preds = %__nv_exp2f.exit262 + %1374 = tail call float @llvm.nvvm.ex2.approx.f(float %1334) #2, !dbg !90 + br label %__nv_exp2f.exit265, !dbg !90 + +__nv_exp2f.exit265: ; preds = %1371, %1373 + %.0.i264 = phi float [ %1372, %1371 ], [ %1374, %1373 ], !dbg !90 + %1375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i266 = icmp eq i32 %1375, 0, !dbg !90 + br i1 %.not.i266, label %1378, label %1376, !dbg !90 + +1376: ; preds = %__nv_exp2f.exit265 + %1377 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1335) #2, !dbg !90 + br label %__nv_exp2f.exit268, !dbg !90 + +1378: ; preds = %__nv_exp2f.exit265 + %1379 = tail call float @llvm.nvvm.ex2.approx.f(float %1335) #2, !dbg !90 + br label %__nv_exp2f.exit268, !dbg !90 + +__nv_exp2f.exit268: ; preds = %1376, %1378 + %.0.i267 = phi float [ %1377, %1376 ], [ %1379, %1378 ], !dbg !90 + %1380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i269 = icmp eq i32 %1380, 0, !dbg !90 + br i1 %.not.i269, label %1383, label %1381, !dbg !90 + +1381: ; preds = %__nv_exp2f.exit268 + %1382 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1336) #2, !dbg !90 + br label %__nv_exp2f.exit271, !dbg !90 + +1383: ; preds = %__nv_exp2f.exit268 + %1384 = tail call float @llvm.nvvm.ex2.approx.f(float %1336) #2, !dbg !90 + br label %__nv_exp2f.exit271, !dbg !90 + +__nv_exp2f.exit271: ; preds = %1381, %1383 + %.0.i270 = phi float [ %1382, %1381 ], [ %1384, %1383 ], !dbg !90 + %1385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i272 = icmp eq i32 %1385, 0, !dbg !90 + br i1 %.not.i272, label %1388, label %1386, !dbg !90 + +1386: ; preds = %__nv_exp2f.exit271 + %1387 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1337) #2, !dbg !90 + br label %__nv_exp2f.exit274, !dbg !90 + +1388: ; preds = %__nv_exp2f.exit271 + %1389 = tail call float @llvm.nvvm.ex2.approx.f(float %1337) #2, !dbg !90 + br label %__nv_exp2f.exit274, !dbg !90 + +__nv_exp2f.exit274: ; preds = %1386, %1388 + %.0.i273 = phi float [ %1387, %1386 ], [ %1389, %1388 ], !dbg !90 + %1390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i275 = icmp eq i32 %1390, 0, !dbg !90 + br i1 %.not.i275, label %1393, label %1391, !dbg !90 + +1391: ; preds = %__nv_exp2f.exit274 + %1392 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1338) #2, !dbg !90 + br label %__nv_exp2f.exit277, !dbg !90 + +1393: ; preds = %__nv_exp2f.exit274 + %1394 = tail call float @llvm.nvvm.ex2.approx.f(float %1338) #2, !dbg !90 + br label %__nv_exp2f.exit277, !dbg !90 + +__nv_exp2f.exit277: ; preds = %1391, %1393 + %.0.i276 = phi float [ %1392, %1391 ], [ %1394, %1393 ], !dbg !90 + %1395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i278 = icmp eq i32 %1395, 0, !dbg !90 + br i1 %.not.i278, label %1398, label %1396, !dbg !90 + +1396: ; preds = %__nv_exp2f.exit277 + %1397 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1339) #2, !dbg !90 + br label %__nv_exp2f.exit280, !dbg !90 + +1398: ; preds = %__nv_exp2f.exit277 + %1399 = tail call float @llvm.nvvm.ex2.approx.f(float %1339) #2, !dbg !90 + br label %__nv_exp2f.exit280, !dbg !90 + +__nv_exp2f.exit280: ; preds = %1396, %1398 + %.0.i279 = phi float [ %1397, %1396 ], [ %1399, %1398 ], !dbg !90 + %1400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i281 = icmp eq i32 %1400, 0, !dbg !90 + br i1 %.not.i281, label %1403, label %1401, !dbg !90 + +1401: ; preds = %__nv_exp2f.exit280 + %1402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1340) #2, !dbg !90 + br label %__nv_exp2f.exit283, !dbg !90 + +1403: ; preds = %__nv_exp2f.exit280 + %1404 = tail call float @llvm.nvvm.ex2.approx.f(float %1340) #2, !dbg !90 + br label %__nv_exp2f.exit283, !dbg !90 + +__nv_exp2f.exit283: ; preds = %1401, %1403 + %.0.i282 = phi float [ %1402, %1401 ], [ %1404, %1403 ], !dbg !90 + %1405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i284 = icmp eq i32 %1405, 0, !dbg !90 + br i1 %.not.i284, label %1408, label %1406, !dbg !90 + +1406: ; preds = %__nv_exp2f.exit283 + %1407 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1341) #2, !dbg !90 + br label %__nv_exp2f.exit286, !dbg !90 + +1408: ; preds = %__nv_exp2f.exit283 + %1409 = tail call float @llvm.nvvm.ex2.approx.f(float %1341) #2, !dbg !90 + br label %__nv_exp2f.exit286, !dbg !90 + +__nv_exp2f.exit286: ; preds = %1406, %1408 + %.0.i285 = phi float [ %1407, %1406 ], [ %1409, %1408 ], !dbg !90 + %1410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i287 = icmp eq i32 %1410, 0, !dbg !90 + br i1 %.not.i287, label %1413, label %1411, !dbg !90 + +1411: ; preds = %__nv_exp2f.exit286 + %1412 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1342) #2, !dbg !90 + br label %__nv_exp2f.exit289, !dbg !90 + +1413: ; preds = %__nv_exp2f.exit286 + %1414 = tail call float @llvm.nvvm.ex2.approx.f(float %1342) #2, !dbg !90 + br label %__nv_exp2f.exit289, !dbg !90 + +__nv_exp2f.exit289: ; preds = %1411, %1413 + %.0.i288 = phi float [ %1412, %1411 ], [ %1414, %1413 ], !dbg !90 + %1415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i290 = icmp eq i32 %1415, 0, !dbg !90 + br i1 %.not.i290, label %1418, label %1416, !dbg !90 + +1416: ; preds = %__nv_exp2f.exit289 + %1417 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1343) #2, !dbg !90 + br label %__nv_exp2f.exit292, !dbg !90 + +1418: ; preds = %__nv_exp2f.exit289 + %1419 = tail call float @llvm.nvvm.ex2.approx.f(float %1343) #2, !dbg !90 + br label %__nv_exp2f.exit292, !dbg !90 + +__nv_exp2f.exit292: ; preds = %1416, %1418 + %.0.i291 = phi float [ %1417, %1416 ], [ %1419, %1418 ], !dbg !90 + %1420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i293 = icmp eq i32 %1420, 0, !dbg !90 + br i1 %.not.i293, label %1423, label %1421, !dbg !90 + +1421: ; preds = %__nv_exp2f.exit292 + %1422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1344) #2, !dbg !90 + br label %__nv_exp2f.exit295, !dbg !90 + +1423: ; preds = %__nv_exp2f.exit292 + %1424 = tail call float @llvm.nvvm.ex2.approx.f(float %1344) #2, !dbg !90 + br label %__nv_exp2f.exit295, !dbg !90 + +__nv_exp2f.exit295: ; preds = %1421, %1423 + %.0.i294 = phi float [ %1422, %1421 ], [ %1424, %1423 ], !dbg !90 + %1425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i296 = icmp eq i32 %1425, 0, !dbg !90 + br i1 %.not.i296, label %1428, label %1426, !dbg !90 + +1426: ; preds = %__nv_exp2f.exit295 + %1427 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1345) #2, !dbg !90 + br label %__nv_exp2f.exit298, !dbg !90 + +1428: ; preds = %__nv_exp2f.exit295 + %1429 = tail call float @llvm.nvvm.ex2.approx.f(float %1345) #2, !dbg !90 + br label %__nv_exp2f.exit298, !dbg !90 + +__nv_exp2f.exit298: ; preds = %1426, %1428 + %.0.i297 = phi float [ %1427, %1426 ], [ %1429, %1428 ], !dbg !90 + %1430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i299 = icmp eq i32 %1430, 0, !dbg !90 + br i1 %.not.i299, label %1433, label %1431, !dbg !90 + +1431: ; preds = %__nv_exp2f.exit298 + %1432 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1346) #2, !dbg !90 + br label %__nv_exp2f.exit301, !dbg !90 + +1433: ; preds = %__nv_exp2f.exit298 + %1434 = tail call float @llvm.nvvm.ex2.approx.f(float %1346) #2, !dbg !90 + br label %__nv_exp2f.exit301, !dbg !90 + +__nv_exp2f.exit301: ; preds = %1431, %1433 + %.0.i300 = phi float [ %1432, %1431 ], [ %1434, %1433 ], !dbg !90 + %1435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i302 = icmp eq i32 %1435, 0, !dbg !90 + br i1 %.not.i302, label %1438, label %1436, !dbg !90 + +1436: ; preds = %__nv_exp2f.exit301 + %1437 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1347) #2, !dbg !90 + br label %__nv_exp2f.exit304, !dbg !90 + +1438: ; preds = %__nv_exp2f.exit301 + %1439 = tail call float @llvm.nvvm.ex2.approx.f(float %1347) #2, !dbg !90 + br label %__nv_exp2f.exit304, !dbg !90 + +__nv_exp2f.exit304: ; preds = %1436, %1438 + %.0.i303 = phi float [ %1437, %1436 ], [ %1439, %1438 ], !dbg !90 + %1440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i305 = icmp eq i32 %1440, 0, !dbg !90 + br i1 %.not.i305, label %1443, label %1441, !dbg !90 + +1441: ; preds = %__nv_exp2f.exit304 + %1442 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1348) #2, !dbg !90 + br label %__nv_exp2f.exit307, !dbg !90 + +1443: ; preds = %__nv_exp2f.exit304 + %1444 = tail call float @llvm.nvvm.ex2.approx.f(float %1348) #2, !dbg !90 + br label %__nv_exp2f.exit307, !dbg !90 + +__nv_exp2f.exit307: ; preds = %1441, %1443 + %.0.i306 = phi float [ %1442, %1441 ], [ %1444, %1443 ], !dbg !90 + %1445 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i308 = icmp eq i32 %1445, 0, !dbg !90 + br i1 %.not.i308, label %1448, label %1446, !dbg !90 + +1446: ; preds = %__nv_exp2f.exit307 + %1447 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1349) #2, !dbg !90 + br label %__nv_exp2f.exit310, !dbg !90 + +1448: ; preds = %__nv_exp2f.exit307 + %1449 = tail call float @llvm.nvvm.ex2.approx.f(float %1349) #2, !dbg !90 + br label %__nv_exp2f.exit310, !dbg !90 + +__nv_exp2f.exit310: ; preds = %1446, %1448 + %.0.i309 = phi float [ %1447, %1446 ], [ %1449, %1448 ], !dbg !90 + %1450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i311 = icmp eq i32 %1450, 0, !dbg !90 + br i1 %.not.i311, label %1453, label %1451, !dbg !90 + +1451: ; preds = %__nv_exp2f.exit310 + %1452 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1350) #2, !dbg !90 + br label %__nv_exp2f.exit313, !dbg !90 + +1453: ; preds = %__nv_exp2f.exit310 + %1454 = tail call float @llvm.nvvm.ex2.approx.f(float %1350) #2, !dbg !90 + br label %__nv_exp2f.exit313, !dbg !90 + +__nv_exp2f.exit313: ; preds = %1451, %1453 + %.0.i312 = phi float [ %1452, %1451 ], [ %1454, %1453 ], !dbg !90 + %1455 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i314 = icmp eq i32 %1455, 0, !dbg !90 + br i1 %.not.i314, label %1458, label %1456, !dbg !90 + +1456: ; preds = %__nv_exp2f.exit313 + %1457 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1351) #2, !dbg !90 + br label %__nv_exp2f.exit316, !dbg !90 + +1458: ; preds = %__nv_exp2f.exit313 + %1459 = tail call float @llvm.nvvm.ex2.approx.f(float %1351) #2, !dbg !90 + br label %__nv_exp2f.exit316, !dbg !90 + +__nv_exp2f.exit316: ; preds = %1456, %1458 + %.0.i315 = phi float [ %1457, %1456 ], [ %1459, %1458 ], !dbg !90 + %1460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i317 = icmp eq i32 %1460, 0, !dbg !90 + br i1 %.not.i317, label %1463, label %1461, !dbg !90 + +1461: ; preds = %__nv_exp2f.exit316 + %1462 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1352) #2, !dbg !90 + br label %__nv_exp2f.exit319, !dbg !90 + +1463: ; preds = %__nv_exp2f.exit316 + %1464 = tail call float @llvm.nvvm.ex2.approx.f(float %1352) #2, !dbg !90 + br label %__nv_exp2f.exit319, !dbg !90 + +__nv_exp2f.exit319: ; preds = %1461, %1463 + %.0.i318 = phi float [ %1462, %1461 ], [ %1464, %1463 ], !dbg !90 + %1465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i320 = icmp eq i32 %1465, 0, !dbg !90 + br i1 %.not.i320, label %1468, label %1466, !dbg !90 + +1466: ; preds = %__nv_exp2f.exit319 + %1467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1353) #2, !dbg !90 + br label %__nv_exp2f.exit322, !dbg !90 + +1468: ; preds = %__nv_exp2f.exit319 + %1469 = tail call float @llvm.nvvm.ex2.approx.f(float %1353) #2, !dbg !90 + br label %__nv_exp2f.exit322, !dbg !90 + +__nv_exp2f.exit322: ; preds = %1466, %1468 + %.0.i321 = phi float [ %1467, %1466 ], [ %1469, %1468 ], !dbg !90 + %1470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i323 = icmp eq i32 %1470, 0, !dbg !90 + br i1 %.not.i323, label %1473, label %1471, !dbg !90 + +1471: ; preds = %__nv_exp2f.exit322 + %1472 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1354) #2, !dbg !90 + br label %__nv_exp2f.exit325, !dbg !90 + +1473: ; preds = %__nv_exp2f.exit322 + %1474 = tail call float @llvm.nvvm.ex2.approx.f(float %1354) #2, !dbg !90 + br label %__nv_exp2f.exit325, !dbg !90 + +__nv_exp2f.exit325: ; preds = %1471, %1473 + %.0.i324 = phi float [ %1472, %1471 ], [ %1474, %1473 ], !dbg !90 + %1475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i326 = icmp eq i32 %1475, 0, !dbg !90 + br i1 %.not.i326, label %1478, label %1476, !dbg !90 + +1476: ; preds = %__nv_exp2f.exit325 + %1477 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1355) #2, !dbg !90 + br label %__nv_exp2f.exit328, !dbg !90 + +1478: ; preds = %__nv_exp2f.exit325 + %1479 = tail call float @llvm.nvvm.ex2.approx.f(float %1355) #2, !dbg !90 + br label %__nv_exp2f.exit328, !dbg !90 + +__nv_exp2f.exit328: ; preds = %1476, %1478 + %.0.i327 = phi float [ %1477, %1476 ], [ %1479, %1478 ], !dbg !90 + %1480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i329 = icmp eq i32 %1480, 0, !dbg !90 + br i1 %.not.i329, label %1483, label %1481, !dbg !90 + +1481: ; preds = %__nv_exp2f.exit328 + %1482 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1356) #2, !dbg !90 + br label %__nv_exp2f.exit331, !dbg !90 + +1483: ; preds = %__nv_exp2f.exit328 + %1484 = tail call float @llvm.nvvm.ex2.approx.f(float %1356) #2, !dbg !90 + br label %__nv_exp2f.exit331, !dbg !90 + +__nv_exp2f.exit331: ; preds = %1481, %1483 + %.0.i330 = phi float [ %1482, %1481 ], [ %1484, %1483 ], !dbg !90 + %1485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i332 = icmp eq i32 %1485, 0, !dbg !90 + br i1 %.not.i332, label %1488, label %1486, !dbg !90 + +1486: ; preds = %__nv_exp2f.exit331 + %1487 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1357) #2, !dbg !90 + br label %__nv_exp2f.exit334, !dbg !90 + +1488: ; preds = %__nv_exp2f.exit331 + %1489 = tail call float @llvm.nvvm.ex2.approx.f(float %1357) #2, !dbg !90 + br label %__nv_exp2f.exit334, !dbg !90 + +__nv_exp2f.exit334: ; preds = %1486, %1488 + %.0.i333 = phi float [ %1487, %1486 ], [ %1489, %1488 ], !dbg !90 + %1490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i335 = icmp eq i32 %1490, 0, !dbg !90 + br i1 %.not.i335, label %1493, label %1491, !dbg !90 + +1491: ; preds = %__nv_exp2f.exit334 + %1492 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1358) #2, !dbg !90 + br label %__nv_exp2f.exit337, !dbg !90 + +1493: ; preds = %__nv_exp2f.exit334 + %1494 = tail call float @llvm.nvvm.ex2.approx.f(float %1358) #2, !dbg !90 + br label %__nv_exp2f.exit337, !dbg !90 + +__nv_exp2f.exit337: ; preds = %1491, %1493 + %.0.i336 = phi float [ %1492, %1491 ], [ %1494, %1493 ], !dbg !90 + %1495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i338 = icmp eq i32 %1495, 0, !dbg !90 + br i1 %.not.i338, label %1498, label %1496, !dbg !90 + +1496: ; preds = %__nv_exp2f.exit337 + %1497 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1359) #2, !dbg !90 + br label %__nv_exp2f.exit340, !dbg !90 + +1498: ; preds = %__nv_exp2f.exit337 + %1499 = tail call float @llvm.nvvm.ex2.approx.f(float %1359) #2, !dbg !90 + br label %__nv_exp2f.exit340, !dbg !90 + +__nv_exp2f.exit340: ; preds = %1496, %1498 + %.0.i339 = phi float [ %1497, %1496 ], [ %1499, %1498 ], !dbg !90 + %1500 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i341 = icmp eq i32 %1500, 0, !dbg !90 + br i1 %.not.i341, label %1503, label %1501, !dbg !90 + +1501: ; preds = %__nv_exp2f.exit340 + %1502 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1360) #2, !dbg !90 + br label %__nv_exp2f.exit343, !dbg !90 + +1503: ; preds = %__nv_exp2f.exit340 + %1504 = tail call float @llvm.nvvm.ex2.approx.f(float %1360) #2, !dbg !90 + br label %__nv_exp2f.exit343, !dbg !90 + +__nv_exp2f.exit343: ; preds = %1501, %1503 + %.0.i342 = phi float [ %1502, %1501 ], [ %1504, %1503 ], !dbg !90 + %1505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i344 = icmp eq i32 %1505, 0, !dbg !90 + br i1 %.not.i344, label %1508, label %1506, !dbg !90 + +1506: ; preds = %__nv_exp2f.exit343 + %1507 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1361) #2, !dbg !90 + br label %__nv_exp2f.exit346, !dbg !90 + +1508: ; preds = %__nv_exp2f.exit343 + %1509 = tail call float @llvm.nvvm.ex2.approx.f(float %1361) #2, !dbg !90 + br label %__nv_exp2f.exit346, !dbg !90 + +__nv_exp2f.exit346: ; preds = %1506, %1508 + %.0.i345 = phi float [ %1507, %1506 ], [ %1509, %1508 ], !dbg !90 + %1510 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i347 = icmp eq i32 %1510, 0, !dbg !90 + br i1 %.not.i347, label %1513, label %1511, !dbg !90 + +1511: ; preds = %__nv_exp2f.exit346 + %1512 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1362) #2, !dbg !90 + br label %__nv_exp2f.exit349, !dbg !90 + +1513: ; preds = %__nv_exp2f.exit346 + %1514 = tail call float @llvm.nvvm.ex2.approx.f(float %1362) #2, !dbg !90 + br label %__nv_exp2f.exit349, !dbg !90 + +__nv_exp2f.exit349: ; preds = %1511, %1513 + %.0.i348 = phi float [ %1512, %1511 ], [ %1514, %1513 ], !dbg !90 + %1515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i350 = icmp eq i32 %1515, 0, !dbg !90 + br i1 %.not.i350, label %1518, label %1516, !dbg !90 + +1516: ; preds = %__nv_exp2f.exit349 + %1517 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1363) #2, !dbg !90 + br label %__nv_exp2f.exit352, !dbg !90 + +1518: ; preds = %__nv_exp2f.exit349 + %1519 = tail call float @llvm.nvvm.ex2.approx.f(float %1363) #2, !dbg !90 + br label %__nv_exp2f.exit352, !dbg !90 + +__nv_exp2f.exit352: ; preds = %1516, %1518 + %.0.i351 = phi float [ %1517, %1516 ], [ %1519, %1518 ], !dbg !90 + %1520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i353 = icmp eq i32 %1520, 0, !dbg !90 + br i1 %.not.i353, label %1523, label %1521, !dbg !90 + +1521: ; preds = %__nv_exp2f.exit352 + %1522 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1364) #2, !dbg !90 + br label %__nv_exp2f.exit355, !dbg !90 + +1523: ; preds = %__nv_exp2f.exit352 + %1524 = tail call float @llvm.nvvm.ex2.approx.f(float %1364) #2, !dbg !90 + br label %__nv_exp2f.exit355, !dbg !90 + +__nv_exp2f.exit355: ; preds = %1521, %1523 + %.0.i354 = phi float [ %1522, %1521 ], [ %1524, %1523 ], !dbg !90 + %1525 = fmul float %351, %.0.i255, !dbg !91 + %1526 = fmul float %352, %.0.i258, !dbg !91 + %1527 = fadd float %.0.i261, %.0.i264, !dbg !92 + %1528 = fadd float %.0.i267, %.0.i270, !dbg !92 + %1529 = fadd float %1527, %.0.i273, !dbg !92 + %1530 = fadd float %1529, %.0.i276, !dbg !92 + %1531 = fadd float %1528, %.0.i279, !dbg !92 + %1532 = fadd float %1531, %.0.i282, !dbg !92 + %1533 = fadd float %1530, %.0.i285, !dbg !92 + %1534 = fadd float %1533, %.0.i288, !dbg !92 + %1535 = fadd float %1532, %.0.i291, !dbg !92 + %1536 = fadd float %1535, %.0.i294, !dbg !92 + %1537 = fadd float %1534, %.0.i297, !dbg !92 + %1538 = fadd float %1537, %.0.i300, !dbg !92 + %1539 = fadd float %1536, %.0.i303, !dbg !92 + %1540 = fadd float %1539, %.0.i306, !dbg !92 + %1541 = fadd float %1538, %.0.i309, !dbg !92 + %1542 = fadd float %1541, %.0.i312, !dbg !92 + %1543 = fadd float %1540, %.0.i315, !dbg !92 + %1544 = fadd float %1543, %.0.i318, !dbg !92 + %1545 = fadd float %1542, %.0.i321, !dbg !92 + %1546 = fadd float %1545, %.0.i324, !dbg !92 + %1547 = fadd float %1544, %.0.i327, !dbg !92 + %1548 = fadd float %1547, %.0.i330, !dbg !92 + %1549 = fadd float %1546, %.0.i333, !dbg !92 + %1550 = fadd float %1549, %.0.i336, !dbg !92 + %1551 = fadd float %1548, %.0.i339, !dbg !92 + %1552 = fadd float %1551, %.0.i342, !dbg !92 + %1553 = fadd float %1550, %.0.i345, !dbg !92 + %1554 = fadd float %1553, %.0.i348, !dbg !92 + %1555 = fadd float %1552, %.0.i351, !dbg !92 + %1556 = fadd float %1555, %.0.i354, !dbg !92 + %1557 = bitcast float %1554 to i32, !dbg !93 + %1558 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1557, i32 2, i32 31), !dbg !93 + %1559 = bitcast i32 %1558 to float, !dbg !93 + %1560 = fadd float %1554, %1559, !dbg !92 + %1561 = bitcast float %1560 to i32, !dbg !93 + %1562 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1561, i32 1, i32 31), !dbg !93 + %1563 = bitcast i32 %1562 to float, !dbg !93 + %1564 = fadd float %1560, %1563, !dbg !92 + %1565 = bitcast float %1556 to i32, !dbg !93 + %1566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1565, i32 2, i32 31), !dbg !93 + %1567 = bitcast i32 %1566 to float, !dbg !93 + %1568 = fadd float %1556, %1567, !dbg !92 + %1569 = bitcast float %1568 to i32, !dbg !93 + %1570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1569, i32 1, i32 31), !dbg !93 + %1571 = bitcast i32 %1570 to float, !dbg !93 + %1572 = fadd float %1568, %1571, !dbg !92 + %1573 = fadd float %1525, %1564, !dbg !94 + %1574 = fadd float %1526, %1572, !dbg !94 + %1575 = fmul float %807, %.0.i255, !dbg !95 + %1576 = fmul float %808, %.0.i255, !dbg !95 + %1577 = fmul float %809, %.0.i258, !dbg !95 + %1578 = fmul float %810, %.0.i258, !dbg !95 + %1579 = fmul float %811, %.0.i255, !dbg !95 + %1580 = fmul float %812, %.0.i255, !dbg !95 + %1581 = fmul float %813, %.0.i258, !dbg !95 + %1582 = fmul float %814, %.0.i258, !dbg !95 + %1583 = fmul float %815, %.0.i255, !dbg !95 + %1584 = fmul float %816, %.0.i255, !dbg !95 + %1585 = fmul float %817, %.0.i258, !dbg !95 + %1586 = fmul float %818, %.0.i258, !dbg !95 + %1587 = fmul float %819, %.0.i255, !dbg !95 + %1588 = fmul float %820, %.0.i255, !dbg !95 + %1589 = fmul float %821, %.0.i258, !dbg !95 + %1590 = fmul float %822, %.0.i258, !dbg !95 + %1591 = fmul float %823, %.0.i255, !dbg !95 + %1592 = fmul float %824, %.0.i255, !dbg !95 + %1593 = fmul float %825, %.0.i258, !dbg !95 + %1594 = fmul float %826, %.0.i258, !dbg !95 + %1595 = fmul float %827, %.0.i255, !dbg !95 + %1596 = fmul float %828, %.0.i255, !dbg !95 + %1597 = fmul float %829, %.0.i258, !dbg !95 + %1598 = fmul float %830, %.0.i258, !dbg !95 + %1599 = fmul float %831, %.0.i255, !dbg !95 + %1600 = fmul float %832, %.0.i255, !dbg !95 + %1601 = fmul float %833, %.0.i258, !dbg !95 + %1602 = fmul float %834, %.0.i258, !dbg !95 + %1603 = fmul float %835, %.0.i255, !dbg !95 + %1604 = fmul float %836, %.0.i255, !dbg !95 + %1605 = fmul float %837, %.0.i258, !dbg !95 + %1606 = fmul float %838, %.0.i258, !dbg !95 + %1607 = fmul float %839, %.0.i255, !dbg !95 + %1608 = fmul float %840, %.0.i255, !dbg !95 + %1609 = fmul float %841, %.0.i258, !dbg !95 + %1610 = fmul float %842, %.0.i258, !dbg !95 + %1611 = fmul float %843, %.0.i255, !dbg !95 + %1612 = fmul float %844, %.0.i255, !dbg !95 + %1613 = fmul float %845, %.0.i258, !dbg !95 + %1614 = fmul float %846, %.0.i258, !dbg !95 + %1615 = fmul float %847, %.0.i255, !dbg !95 + %1616 = fmul float %848, %.0.i255, !dbg !95 + %1617 = fmul float %849, %.0.i258, !dbg !95 + %1618 = fmul float %850, %.0.i258, !dbg !95 + %1619 = fmul float %851, %.0.i255, !dbg !95 + %1620 = fmul float %852, %.0.i255, !dbg !95 + %1621 = fmul float %853, %.0.i258, !dbg !95 + %1622 = fmul float %854, %.0.i258, !dbg !95 + %1623 = fmul float %855, %.0.i255, !dbg !95 + %1624 = fmul float %856, %.0.i255, !dbg !95 + %1625 = fmul float %857, %.0.i258, !dbg !95 + %1626 = fmul float %858, %.0.i258, !dbg !95 + %1627 = fmul float %859, %.0.i255, !dbg !95 + %1628 = fmul float %860, %.0.i255, !dbg !95 + %1629 = fmul float %861, %.0.i258, !dbg !95 + %1630 = fmul float %862, %.0.i258, !dbg !95 + %1631 = fmul float %863, %.0.i255, !dbg !95 + %1632 = fmul float %864, %.0.i255, !dbg !95 + %1633 = fmul float %865, %.0.i258, !dbg !95 + %1634 = fmul float %866, %.0.i258, !dbg !95 + %1635 = fmul float %867, %.0.i255, !dbg !95 + %1636 = fmul float %868, %.0.i255, !dbg !95 + %1637 = fmul float %869, %.0.i258, !dbg !95 + %1638 = fmul float %870, %.0.i258, !dbg !95 + %1639 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %425, !dbg !61 + %1640 = insertelement <2 x float> poison, float %.0.i261, i64 0, !dbg !96 + %1641 = insertelement <2 x float> %1640, float %.0.i264, i64 1, !dbg !96 + %1642 = fptrunc <2 x float> %1641 to <2 x bfloat>, !dbg !96 + %1643 = insertelement <2 x float> poison, float %.0.i267, i64 0, !dbg !96 + %1644 = insertelement <2 x float> %1643, float %.0.i270, i64 1, !dbg !96 + %1645 = fptrunc <2 x float> %1644 to <2 x bfloat>, !dbg !96 + %1646 = insertelement <2 x float> poison, float %.0.i273, i64 0, !dbg !96 + %1647 = insertelement <2 x float> %1646, float %.0.i276, i64 1, !dbg !96 + %1648 = fptrunc <2 x float> %1647 to <2 x bfloat>, !dbg !96 + %1649 = insertelement <2 x float> poison, float %.0.i279, i64 0, !dbg !96 + %1650 = insertelement <2 x float> %1649, float %.0.i282, i64 1, !dbg !96 + %1651 = fptrunc <2 x float> %1650 to <2 x bfloat>, !dbg !96 + %1652 = insertelement <2 x float> poison, float %.0.i285, i64 0, !dbg !96 + %1653 = insertelement <2 x float> %1652, float %.0.i288, i64 1, !dbg !96 + %1654 = fptrunc <2 x float> %1653 to <2 x bfloat>, !dbg !96 + %1655 = insertelement <2 x float> poison, float %.0.i291, i64 0, !dbg !96 + %1656 = insertelement <2 x float> %1655, float %.0.i294, i64 1, !dbg !96 + %1657 = fptrunc <2 x float> %1656 to <2 x bfloat>, !dbg !96 + %1658 = insertelement <2 x float> poison, float %.0.i297, i64 0, !dbg !96 + %1659 = insertelement <2 x float> %1658, float %.0.i300, i64 1, !dbg !96 + %1660 = fptrunc <2 x float> %1659 to <2 x bfloat>, !dbg !96 + %1661 = insertelement <2 x float> poison, float %.0.i303, i64 0, !dbg !96 + %1662 = insertelement <2 x float> %1661, float %.0.i306, i64 1, !dbg !96 + %1663 = fptrunc <2 x float> %1662 to <2 x bfloat>, !dbg !96 + %1664 = insertelement <2 x float> poison, float %.0.i309, i64 0, !dbg !96 + %1665 = insertelement <2 x float> %1664, float %.0.i312, i64 1, !dbg !96 + %1666 = fptrunc <2 x float> %1665 to <2 x bfloat>, !dbg !96 + %1667 = insertelement <2 x float> poison, float %.0.i315, i64 0, !dbg !96 + %1668 = insertelement <2 x float> %1667, float %.0.i318, i64 1, !dbg !96 + %1669 = fptrunc <2 x float> %1668 to <2 x bfloat>, !dbg !96 + %1670 = insertelement <2 x float> poison, float %.0.i321, i64 0, !dbg !96 + %1671 = insertelement <2 x float> %1670, float %.0.i324, i64 1, !dbg !96 + %1672 = fptrunc <2 x float> %1671 to <2 x bfloat>, !dbg !96 + %1673 = insertelement <2 x float> poison, float %.0.i327, i64 0, !dbg !96 + %1674 = insertelement <2 x float> %1673, float %.0.i330, i64 1, !dbg !96 + %1675 = fptrunc <2 x float> %1674 to <2 x bfloat>, !dbg !96 + %1676 = insertelement <2 x float> poison, float %.0.i333, i64 0, !dbg !96 + %1677 = insertelement <2 x float> %1676, float %.0.i336, i64 1, !dbg !96 + %1678 = fptrunc <2 x float> %1677 to <2 x bfloat>, !dbg !96 + %1679 = insertelement <2 x float> poison, float %.0.i339, i64 0, !dbg !96 + %1680 = insertelement <2 x float> %1679, float %.0.i342, i64 1, !dbg !96 + %1681 = fptrunc <2 x float> %1680 to <2 x bfloat>, !dbg !96 + %1682 = insertelement <2 x float> poison, float %.0.i345, i64 0, !dbg !96 + %1683 = insertelement <2 x float> %1682, float %.0.i348, i64 1, !dbg !96 + %1684 = fptrunc <2 x float> %1683 to <2 x bfloat>, !dbg !96 + %1685 = insertelement <2 x float> poison, float %.0.i351, i64 0, !dbg !96 + %1686 = insertelement <2 x float> %1685, float %.0.i354, i64 1, !dbg !96 + %1687 = fptrunc <2 x float> %1686 to <2 x bfloat>, !dbg !96 + %1688 = bitcast <2 x bfloat> %1642 to i32, !dbg !97 + %1689 = bitcast <2 x bfloat> %1645 to i32, !dbg !97 + %1690 = bitcast <2 x bfloat> %1648 to i32, !dbg !97 + %1691 = bitcast <2 x bfloat> %1651 to i32, !dbg !97 + %1692 = bitcast <2 x bfloat> %1654 to i32, !dbg !97 + %1693 = bitcast <2 x bfloat> %1657 to i32, !dbg !97 + %1694 = bitcast <2 x bfloat> %1660 to i32, !dbg !97 + %1695 = bitcast <2 x bfloat> %1663 to i32, !dbg !97 + %1696 = bitcast <2 x bfloat> %1666 to i32, !dbg !97 + %1697 = bitcast <2 x bfloat> %1669 to i32, !dbg !97 + %1698 = bitcast <2 x bfloat> %1672 to i32, !dbg !97 + %1699 = bitcast <2 x bfloat> %1675 to i32, !dbg !97 + %1700 = bitcast <2 x bfloat> %1678 to i32, !dbg !97 + %1701 = bitcast <2 x bfloat> %1681 to i32, !dbg !97 + %1702 = bitcast <2 x bfloat> %1684 to i32, !dbg !97 + %1703 = bitcast <2 x bfloat> %1687 to i32, !dbg !97 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !97 + %1704 = ptrtoint ptr addrspace(3) %1639 to i32, !dbg !97 + %1705 = lshr exact i32 %1704, 4, !dbg !97 + %1706 = and i32 %1705, 16383, !dbg !97 + %1707 = zext nneg i32 %1706 to i64, !dbg !97 + %1708 = or disjoint i64 %1707, 4611686293338849280, !dbg !97 + %1709 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1575, float %1576, float %1577, float %1578, float %1579, float %1580, float %1581, float %1582, float %1583, float %1584, float %1585, float %1586, float %1587, float %1588, float %1589, float %1590, float %1591, float %1592, float %1593, float %1594, float %1595, float %1596, float %1597, float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, float %1628, float %1629, float %1630, float %1631, float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, i32 %1688, i32 %1689, i32 %1690, i32 %1691, i64 %1708, i1 true) #2, !dbg !97 + %1710 = add i32 %1704, 2048, !dbg !97 + %1711 = lshr exact i32 %1710, 4, !dbg !97 + %1712 = and i32 %1711, 16383, !dbg !97 + %1713 = zext nneg i32 %1712 to i64, !dbg !97 + %1714 = or disjoint i64 %1713, 4611686293338849280, !dbg !97 + %1715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 0, !dbg !97 + %1716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 1, !dbg !97 + %1717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 2, !dbg !97 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 3, !dbg !97 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 4, !dbg !97 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 5, !dbg !97 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 6, !dbg !97 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 7, !dbg !97 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 8, !dbg !97 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 9, !dbg !97 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 10, !dbg !97 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 11, !dbg !97 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 12, !dbg !97 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 13, !dbg !97 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 14, !dbg !97 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 15, !dbg !97 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 16, !dbg !97 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 17, !dbg !97 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 18, !dbg !97 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 19, !dbg !97 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 20, !dbg !97 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 21, !dbg !97 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 22, !dbg !97 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 23, !dbg !97 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 24, !dbg !97 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 25, !dbg !97 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 26, !dbg !97 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 27, !dbg !97 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 28, !dbg !97 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 29, !dbg !97 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 30, !dbg !97 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 31, !dbg !97 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 32, !dbg !97 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 33, !dbg !97 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 34, !dbg !97 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 35, !dbg !97 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 36, !dbg !97 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 37, !dbg !97 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 38, !dbg !97 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 39, !dbg !97 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 40, !dbg !97 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 41, !dbg !97 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 42, !dbg !97 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 43, !dbg !97 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 44, !dbg !97 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 45, !dbg !97 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 46, !dbg !97 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 47, !dbg !97 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 48, !dbg !97 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 49, !dbg !97 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 50, !dbg !97 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 51, !dbg !97 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 52, !dbg !97 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 53, !dbg !97 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 54, !dbg !97 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 55, !dbg !97 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 56, !dbg !97 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 57, !dbg !97 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 58, !dbg !97 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 59, !dbg !97 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 60, !dbg !97 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 61, !dbg !97 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 62, !dbg !97 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 63, !dbg !97 + %1779 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1715, float %1716, float %1717, float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, i32 %1692, i32 %1693, i32 %1694, i32 %1695, i64 %1714, i1 true) #2, !dbg !97 + %1780 = add i32 %1704, 4096, !dbg !97 + %1781 = lshr exact i32 %1780, 4, !dbg !97 + %1782 = and i32 %1781, 16383, !dbg !97 + %1783 = zext nneg i32 %1782 to i64, !dbg !97 + %1784 = or disjoint i64 %1783, 4611686293338849280, !dbg !97 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 0, !dbg !97 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 1, !dbg !97 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 2, !dbg !97 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 3, !dbg !97 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 4, !dbg !97 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 5, !dbg !97 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 6, !dbg !97 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 7, !dbg !97 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 8, !dbg !97 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 9, !dbg !97 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 10, !dbg !97 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 11, !dbg !97 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 12, !dbg !97 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 13, !dbg !97 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 14, !dbg !97 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 15, !dbg !97 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 16, !dbg !97 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 17, !dbg !97 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 18, !dbg !97 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 19, !dbg !97 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 20, !dbg !97 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 21, !dbg !97 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 22, !dbg !97 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 23, !dbg !97 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 24, !dbg !97 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 25, !dbg !97 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 26, !dbg !97 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 27, !dbg !97 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 28, !dbg !97 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 29, !dbg !97 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 30, !dbg !97 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 31, !dbg !97 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 32, !dbg !97 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 33, !dbg !97 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 34, !dbg !97 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 35, !dbg !97 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 36, !dbg !97 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 37, !dbg !97 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 38, !dbg !97 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 39, !dbg !97 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 40, !dbg !97 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 41, !dbg !97 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 42, !dbg !97 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 43, !dbg !97 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 44, !dbg !97 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 45, !dbg !97 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 46, !dbg !97 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 47, !dbg !97 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 48, !dbg !97 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 49, !dbg !97 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 50, !dbg !97 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 51, !dbg !97 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 52, !dbg !97 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 53, !dbg !97 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 54, !dbg !97 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 55, !dbg !97 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 56, !dbg !97 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 57, !dbg !97 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 58, !dbg !97 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 59, !dbg !97 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 60, !dbg !97 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 61, !dbg !97 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 62, !dbg !97 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 63, !dbg !97 + %1849 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, float %1800, float %1801, float %1802, float %1803, float %1804, float %1805, float %1806, float %1807, float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float %1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, float %1833, float %1834, float %1835, float %1836, float %1837, float %1838, float %1839, float %1840, float %1841, float %1842, float %1843, float %1844, float %1845, float %1846, float %1847, float %1848, i32 %1696, i32 %1697, i32 %1698, i32 %1699, i64 %1784, i1 true) #2, !dbg !97 + %1850 = add i32 %1704, 6144, !dbg !97 + %1851 = lshr exact i32 %1850, 4, !dbg !97 + %1852 = and i32 %1851, 16383, !dbg !97 + %1853 = zext nneg i32 %1852 to i64, !dbg !97 + %1854 = or disjoint i64 %1853, 4611686293338849280, !dbg !97 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 0, !dbg !97 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 1, !dbg !97 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 2, !dbg !97 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 3, !dbg !97 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 4, !dbg !97 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 5, !dbg !97 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 6, !dbg !97 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 7, !dbg !97 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 8, !dbg !97 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 9, !dbg !97 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 10, !dbg !97 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 11, !dbg !97 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 12, !dbg !97 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 13, !dbg !97 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 14, !dbg !97 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 15, !dbg !97 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 16, !dbg !97 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 17, !dbg !97 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 18, !dbg !97 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 19, !dbg !97 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 20, !dbg !97 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 21, !dbg !97 + %1877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 22, !dbg !97 + %1878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 23, !dbg !97 + %1879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 24, !dbg !97 + %1880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 25, !dbg !97 + %1881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 26, !dbg !97 + %1882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 27, !dbg !97 + %1883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 28, !dbg !97 + %1884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 29, !dbg !97 + %1885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 30, !dbg !97 + %1886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 31, !dbg !97 + %1887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 32, !dbg !97 + %1888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 33, !dbg !97 + %1889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 34, !dbg !97 + %1890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 35, !dbg !97 + %1891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 36, !dbg !97 + %1892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 37, !dbg !97 + %1893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 38, !dbg !97 + %1894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 39, !dbg !97 + %1895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 40, !dbg !97 + %1896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 41, !dbg !97 + %1897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 42, !dbg !97 + %1898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 43, !dbg !97 + %1899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 44, !dbg !97 + %1900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 45, !dbg !97 + %1901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 46, !dbg !97 + %1902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 47, !dbg !97 + %1903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 48, !dbg !97 + %1904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 49, !dbg !97 + %1905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 50, !dbg !97 + %1906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 51, !dbg !97 + %1907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 52, !dbg !97 + %1908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 53, !dbg !97 + %1909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 54, !dbg !97 + %1910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 55, !dbg !97 + %1911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 56, !dbg !97 + %1912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 57, !dbg !97 + %1913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 58, !dbg !97 + %1914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 59, !dbg !97 + %1915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 60, !dbg !97 + %1916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 61, !dbg !97 + %1917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 62, !dbg !97 + %1918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 63, !dbg !97 + %1919 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1855, float %1856, float %1857, float %1858, float %1859, float %1860, float %1861, float %1862, float %1863, float %1864, float %1865, float %1866, float %1867, float %1868, float %1869, float %1870, float %1871, float %1872, float %1873, float %1874, float %1875, float %1876, float %1877, float %1878, float %1879, float %1880, float %1881, float %1882, float %1883, float %1884, float %1885, float %1886, float %1887, float %1888, float %1889, float %1890, float %1891, float %1892, float %1893, float %1894, float %1895, float %1896, float %1897, float %1898, float %1899, float %1900, float %1901, float %1902, float %1903, float %1904, float %1905, float %1906, float %1907, float %1908, float %1909, float %1910, float %1911, float %1912, float %1913, float %1914, float %1915, float %1916, float %1917, float %1918, i32 %1700, i32 %1701, i32 %1702, i32 %1703, i64 %1854, i1 true) #2, !dbg !97 + %1920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 0, !dbg !97 + %1921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 1, !dbg !97 + %1922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 2, !dbg !97 + %1923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 3, !dbg !97 + %1924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 4, !dbg !97 + %1925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 5, !dbg !97 + %1926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 6, !dbg !97 + %1927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 7, !dbg !97 + %1928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 8, !dbg !97 + %1929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 9, !dbg !97 + %1930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 10, !dbg !97 + %1931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 11, !dbg !97 + %1932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 12, !dbg !97 + %1933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 13, !dbg !97 + %1934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 14, !dbg !97 + %1935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 15, !dbg !97 + %1936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 16, !dbg !97 + %1937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 17, !dbg !97 + %1938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 18, !dbg !97 + %1939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 19, !dbg !97 + %1940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 20, !dbg !97 + %1941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 21, !dbg !97 + %1942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 22, !dbg !97 + %1943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 23, !dbg !97 + %1944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 24, !dbg !97 + %1945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 25, !dbg !97 + %1946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 26, !dbg !97 + %1947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 27, !dbg !97 + %1948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 28, !dbg !97 + %1949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 29, !dbg !97 + %1950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 30, !dbg !97 + %1951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 31, !dbg !97 + %1952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 32, !dbg !97 + %1953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 33, !dbg !97 + %1954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 34, !dbg !97 + %1955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 35, !dbg !97 + %1956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 36, !dbg !97 + %1957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 37, !dbg !97 + %1958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 38, !dbg !97 + %1959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 39, !dbg !97 + %1960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 40, !dbg !97 + %1961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 41, !dbg !97 + %1962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 42, !dbg !97 + %1963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 43, !dbg !97 + %1964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 44, !dbg !97 + %1965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 45, !dbg !97 + %1966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 46, !dbg !97 + %1967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 47, !dbg !97 + %1968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 48, !dbg !97 + %1969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 49, !dbg !97 + %1970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 50, !dbg !97 + %1971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 51, !dbg !97 + %1972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 52, !dbg !97 + %1973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 53, !dbg !97 + %1974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 54, !dbg !97 + %1975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 55, !dbg !97 + %1976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 56, !dbg !97 + %1977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 57, !dbg !97 + %1978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 58, !dbg !97 + %1979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 59, !dbg !97 + %1980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 60, !dbg !97 + %1981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 61, !dbg !97 + %1982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 62, !dbg !97 + %1983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 63, !dbg !97 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !97 + %1984 = insertelement <16 x i32> poison, i32 %347, i64 0, !dbg !98 + %1985 = shufflevector <16 x i32> %1984, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !98 + %1986 = add <16 x i32> %1985, %419, !dbg !98 + %1987 = add nuw nsw i32 %417, 1, !dbg !52 + %1988 = lshr i32 %1987, 1, !dbg !99 + %1989 = zext nneg i32 %1988 to i64, !dbg !100 + %1990 = getelementptr i32, ptr addrspace(1) %187, i64 %1989, !dbg !100 + %1991 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !101 + %1992 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1990, i64 %1991, i1 %421) #2, !dbg !101 + %1993 = add nuw nsw i32 %1988, 1, !dbg !102 + %1994 = icmp slt i32 %1993, %192, !dbg !103 + %1995 = getelementptr i8, ptr addrspace(1) %1990, i64 4, !dbg !104 + %1996 = and i1 %421, %1994, !dbg !52 + %1997 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !105 + %1998 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1995, i64 %1997, i1 %1996) #2, !dbg !105 + %1999 = and i32 %417, 1, !dbg !106 + %2000 = sub i32 %1998, %1992, !dbg !107 + %2001 = shl i32 %2000, 7, !dbg !108 + %2002 = add i32 %2001, -64, !dbg !109 + %2003 = xor i32 %1999, 1, !dbg !110 + %2004 = mul nuw nsw i32 %2002, %2003, !dbg !110 + %2005 = shl nuw nsw i32 %1999, 6, !dbg !111 + %2006 = add i32 %2004, %2005, !dbg !112 + %2007 = add i32 %2006, %350, !dbg !113 + %2008 = add i32 %349, 1, !dbg !52 + %2009 = icmp sgt i32 %2008, 2, !dbg !52 + %2010 = select i1 %2009, i32 0, i32 %2008, !dbg !52 + %2011 = add i32 %2007, %189, !dbg !62 + %2012 = add i32 %2011, %37, !dbg !56 + %2013 = add i32 %2011, %38, !dbg !56 + %2014 = add i32 %2011, %39, !dbg !56 + %2015 = add i32 %2011, %40, !dbg !56 + %2016 = shl i32 %2012, 7, !dbg !57 + %2017 = shl i32 %2013, 7, !dbg !57 + %2018 = shl i32 %2014, 7, !dbg !57 + %2019 = shl i32 %2015, 7, !dbg !57 + %2020 = sext i32 %2016 to i64, !dbg !58 + %2021 = sext i32 %2017 to i64, !dbg !58 + %2022 = sext i32 %2018 to i64, !dbg !58 + %2023 = sext i32 %2019 to i64, !dbg !58 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2020, !dbg !59 + %gep396 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2021, !dbg !59 + %gep398 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2022, !dbg !59 + %gep400 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2023, !dbg !59 + %2024 = icmp slt i32 %2012, %11, !dbg !60 + %2025 = icmp slt i32 %2013, %11, !dbg !60 + %2026 = icmp slt i32 %2014, %11, !dbg !60 + %2027 = icmp slt i32 %2015, %11, !dbg !60 + %2028 = shl i32 %2010, 13, !dbg !61 + %2029 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2028, !dbg !61 + %2030 = and i1 %420, %2024, !dbg !52 + %2031 = and i1 %420, %2025, !dbg !52 + %2032 = and i1 %420, %2026, !dbg !52 + %2033 = and i1 %420, %2027, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %2034 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %255, !dbg !61 + %2035 = select i1 %2030, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2034, ptr addrspace(1) %gep, i32 %2035) #2, !dbg !61 + %2036 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %258, !dbg !61 + %2037 = select i1 %2031, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2036, ptr addrspace(1) %gep396, i32 %2037) #2, !dbg !61 + %2038 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %261, !dbg !61 + %2039 = select i1 %2032, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2038, ptr addrspace(1) %gep398, i32 %2039) #2, !dbg !61 + %2040 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %264, !dbg !61 + %2041 = select i1 %2033, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2040, ptr addrspace(1) %gep400, i32 %2041) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %gep402 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2020, !dbg !59 + %gep404 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2021, !dbg !59 + %gep406 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2022, !dbg !59 + %gep408 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2023, !dbg !59 + %2042 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2028, !dbg !61 + %2043 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2043, ptr addrspace(1) %gep402, i32 %2035) #2, !dbg !61 + %2044 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2044, ptr addrspace(1) %gep404, i32 %2037) #2, !dbg !61 + %2045 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2045, ptr addrspace(1) %gep406, i32 %2039) #2, !dbg !61 + %2046 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2046, ptr addrspace(1) %gep408, i32 %2041) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %exitcond.not = icmp eq i32 %1987, %smax, !dbg !52 + br i1 %exitcond.not, label %._crit_edge, label %346, !dbg !52 + +._crit_edge: ; preds = %__nv_exp2f.exit355, %15 + %2047 = phi float [ 0.000000e+00, %15 ], [ %1920, %__nv_exp2f.exit355 ], !dbg !114 + %2048 = phi float [ 0.000000e+00, %15 ], [ %1921, %__nv_exp2f.exit355 ], !dbg !114 + %2049 = phi float [ 0.000000e+00, %15 ], [ %1922, %__nv_exp2f.exit355 ], !dbg !114 + %2050 = phi float [ 0.000000e+00, %15 ], [ %1923, %__nv_exp2f.exit355 ], !dbg !114 + %2051 = phi float [ 0.000000e+00, %15 ], [ %1924, %__nv_exp2f.exit355 ], !dbg !114 + %2052 = phi float [ 0.000000e+00, %15 ], [ %1925, %__nv_exp2f.exit355 ], !dbg !114 + %2053 = phi float [ 0.000000e+00, %15 ], [ %1926, %__nv_exp2f.exit355 ], !dbg !114 + %2054 = phi float [ 0.000000e+00, %15 ], [ %1927, %__nv_exp2f.exit355 ], !dbg !114 + %2055 = phi float [ 0.000000e+00, %15 ], [ %1928, %__nv_exp2f.exit355 ], !dbg !114 + %2056 = phi float [ 0.000000e+00, %15 ], [ %1929, %__nv_exp2f.exit355 ], !dbg !114 + %2057 = phi float [ 0.000000e+00, %15 ], [ %1930, %__nv_exp2f.exit355 ], !dbg !114 + %2058 = phi float [ 0.000000e+00, %15 ], [ %1931, %__nv_exp2f.exit355 ], !dbg !114 + %2059 = phi float [ 0.000000e+00, %15 ], [ %1932, %__nv_exp2f.exit355 ], !dbg !114 + %2060 = phi float [ 0.000000e+00, %15 ], [ %1933, %__nv_exp2f.exit355 ], !dbg !114 + %2061 = phi float [ 0.000000e+00, %15 ], [ %1934, %__nv_exp2f.exit355 ], !dbg !114 + %2062 = phi float [ 0.000000e+00, %15 ], [ %1935, %__nv_exp2f.exit355 ], !dbg !114 + %2063 = phi float [ 0.000000e+00, %15 ], [ %1936, %__nv_exp2f.exit355 ], !dbg !114 + %2064 = phi float [ 0.000000e+00, %15 ], [ %1937, %__nv_exp2f.exit355 ], !dbg !114 + %2065 = phi float [ 0.000000e+00, %15 ], [ %1938, %__nv_exp2f.exit355 ], !dbg !114 + %2066 = phi float [ 0.000000e+00, %15 ], [ %1939, %__nv_exp2f.exit355 ], !dbg !114 + %2067 = phi float [ 0.000000e+00, %15 ], [ %1940, %__nv_exp2f.exit355 ], !dbg !114 + %2068 = phi float [ 0.000000e+00, %15 ], [ %1941, %__nv_exp2f.exit355 ], !dbg !114 + %2069 = phi float [ 0.000000e+00, %15 ], [ %1942, %__nv_exp2f.exit355 ], !dbg !114 + %2070 = phi float [ 0.000000e+00, %15 ], [ %1943, %__nv_exp2f.exit355 ], !dbg !114 + %2071 = phi float [ 0.000000e+00, %15 ], [ %1944, %__nv_exp2f.exit355 ], !dbg !114 + %2072 = phi float [ 0.000000e+00, %15 ], [ %1945, %__nv_exp2f.exit355 ], !dbg !114 + %2073 = phi float [ 0.000000e+00, %15 ], [ %1946, %__nv_exp2f.exit355 ], !dbg !114 + %2074 = phi float [ 0.000000e+00, %15 ], [ %1947, %__nv_exp2f.exit355 ], !dbg !114 + %2075 = phi float [ 0.000000e+00, %15 ], [ %1948, %__nv_exp2f.exit355 ], !dbg !114 + %2076 = phi float [ 0.000000e+00, %15 ], [ %1949, %__nv_exp2f.exit355 ], !dbg !114 + %2077 = phi float [ 0.000000e+00, %15 ], [ %1950, %__nv_exp2f.exit355 ], !dbg !114 + %2078 = phi float [ 0.000000e+00, %15 ], [ %1951, %__nv_exp2f.exit355 ], !dbg !114 + %2079 = phi float [ 0.000000e+00, %15 ], [ %1952, %__nv_exp2f.exit355 ], !dbg !114 + %2080 = phi float [ 0.000000e+00, %15 ], [ %1953, %__nv_exp2f.exit355 ], !dbg !114 + %2081 = phi float [ 0.000000e+00, %15 ], [ %1954, %__nv_exp2f.exit355 ], !dbg !114 + %2082 = phi float [ 0.000000e+00, %15 ], [ %1955, %__nv_exp2f.exit355 ], !dbg !114 + %2083 = phi float [ 0.000000e+00, %15 ], [ %1956, %__nv_exp2f.exit355 ], !dbg !114 + %2084 = phi float [ 0.000000e+00, %15 ], [ %1957, %__nv_exp2f.exit355 ], !dbg !114 + %2085 = phi float [ 0.000000e+00, %15 ], [ %1958, %__nv_exp2f.exit355 ], !dbg !114 + %2086 = phi float [ 0.000000e+00, %15 ], [ %1959, %__nv_exp2f.exit355 ], !dbg !114 + %2087 = phi float [ 0.000000e+00, %15 ], [ %1960, %__nv_exp2f.exit355 ], !dbg !114 + %2088 = phi float [ 0.000000e+00, %15 ], [ %1961, %__nv_exp2f.exit355 ], !dbg !114 + %2089 = phi float [ 0.000000e+00, %15 ], [ %1962, %__nv_exp2f.exit355 ], !dbg !114 + %2090 = phi float [ 0.000000e+00, %15 ], [ %1963, %__nv_exp2f.exit355 ], !dbg !114 + %2091 = phi float [ 0.000000e+00, %15 ], [ %1964, %__nv_exp2f.exit355 ], !dbg !114 + %2092 = phi float [ 0.000000e+00, %15 ], [ %1965, %__nv_exp2f.exit355 ], !dbg !114 + %2093 = phi float [ 0.000000e+00, %15 ], [ %1966, %__nv_exp2f.exit355 ], !dbg !114 + %2094 = phi float [ 0.000000e+00, %15 ], [ %1967, %__nv_exp2f.exit355 ], !dbg !114 + %2095 = phi float [ 0.000000e+00, %15 ], [ %1968, %__nv_exp2f.exit355 ], !dbg !114 + %2096 = phi float [ 0.000000e+00, %15 ], [ %1969, %__nv_exp2f.exit355 ], !dbg !114 + %2097 = phi float [ 0.000000e+00, %15 ], [ %1970, %__nv_exp2f.exit355 ], !dbg !114 + %2098 = phi float [ 0.000000e+00, %15 ], [ %1971, %__nv_exp2f.exit355 ], !dbg !114 + %2099 = phi float [ 0.000000e+00, %15 ], [ %1972, %__nv_exp2f.exit355 ], !dbg !114 + %2100 = phi float [ 0.000000e+00, %15 ], [ %1973, %__nv_exp2f.exit355 ], !dbg !114 + %2101 = phi float [ 0.000000e+00, %15 ], [ %1974, %__nv_exp2f.exit355 ], !dbg !114 + %2102 = phi float [ 0.000000e+00, %15 ], [ %1975, %__nv_exp2f.exit355 ], !dbg !114 + %2103 = phi float [ 0.000000e+00, %15 ], [ %1976, %__nv_exp2f.exit355 ], !dbg !114 + %2104 = phi float [ 0.000000e+00, %15 ], [ %1977, %__nv_exp2f.exit355 ], !dbg !114 + %2105 = phi float [ 0.000000e+00, %15 ], [ %1978, %__nv_exp2f.exit355 ], !dbg !114 + %2106 = phi float [ 0.000000e+00, %15 ], [ %1979, %__nv_exp2f.exit355 ], !dbg !114 + %2107 = phi float [ 0.000000e+00, %15 ], [ %1980, %__nv_exp2f.exit355 ], !dbg !114 + %2108 = phi float [ 0.000000e+00, %15 ], [ %1981, %__nv_exp2f.exit355 ], !dbg !114 + %2109 = phi float [ 0.000000e+00, %15 ], [ %1982, %__nv_exp2f.exit355 ], !dbg !114 + %2110 = phi float [ 0.000000e+00, %15 ], [ %1983, %__nv_exp2f.exit355 ], !dbg !114 + %2111 = phi float [ 0.000000e+00, %15 ], [ %1573, %__nv_exp2f.exit355 ] + %2112 = phi float [ 0.000000e+00, %15 ], [ %1574, %__nv_exp2f.exit355 ] + %2113 = phi <2 x float> [ splat (float 0xFFF0000000000000), %15 ], [ %1312, %__nv_exp2f.exit355 ] + %2114 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2047, float %2048, float %2049, float %2050, float %2051, float %2052, float %2053, float %2054, float %2055, float %2056, float %2057, float %2058, float %2059, float %2060, float %2061, float %2062, float %2063, float %2064, float %2065, float %2066, float %2067, float %2068, float %2069, float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, float %2080, float %2081, float %2082, float %2083, float %2084, float %2085, float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110) #2, !dbg !52 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %2115 = getelementptr i32, ptr addrspace(1) %8, i64 %186, !dbg !115 + %2116 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2115) #2, !dbg !116 + %2117 = shl i32 %2116, 7, !dbg !117 + %2118 = getelementptr i32, ptr addrspace(1) %7, i64 %190, !dbg !118 + %2119 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2118) #2, !dbg !119 + %2120 = shl i32 %2119, 1, !dbg !120 + %2121 = tail call i32 @llvm.smin.i32(i32 %2120, i32 %196), !dbg !121 + %2122 = icmp sgt i32 %2120, 0, !dbg !122 + %2123 = or disjoint i32 %2117, %37, !dbg !124 + %2124 = or disjoint i32 %2117, %38, !dbg !124 + %2125 = or disjoint i32 %2117, %39, !dbg !124 + %2126 = or disjoint i32 %2117, %40, !dbg !124 + %2127 = shl i32 %2123, 7, !dbg !125 + %2128 = shl i32 %2124, 7, !dbg !125 + %2129 = shl i32 %2125, 7, !dbg !125 + %2130 = shl i32 %2126, 7, !dbg !125 + %2131 = sext i32 %2127 to i64, !dbg !126 + %2132 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2131, !dbg !126 + %2133 = sext i32 %2128 to i64, !dbg !126 + %2134 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2133, !dbg !126 + %2135 = sext i32 %2129 to i64, !dbg !126 + %2136 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2135, !dbg !126 + %2137 = sext i32 %2130 to i64, !dbg !126 + %2138 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2137, !dbg !126 + %2139 = getelementptr bfloat, ptr addrspace(1) %2132, i64 %80, !dbg !127 + %2140 = getelementptr bfloat, ptr addrspace(1) %2134, i64 %80, !dbg !127 + %2141 = getelementptr bfloat, ptr addrspace(1) %2136, i64 %80, !dbg !127 + %2142 = getelementptr bfloat, ptr addrspace(1) %2138, i64 %80, !dbg !127 + %2143 = icmp slt i32 %2123, %11, !dbg !128 + %2144 = icmp slt i32 %2124, %11, !dbg !128 + %2145 = icmp slt i32 %2125, %11, !dbg !128 + %2146 = icmp slt i32 %2126, %11, !dbg !128 + %2147 = and i1 %2122, %2143, !dbg !122 + %2148 = and i1 %2122, %2144, !dbg !122 + %2149 = and i1 %2122, %2145, !dbg !122 + %2150 = and i1 %2122, %2146, !dbg !122 + %2151 = select i1 %2147, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %256, ptr addrspace(1) %2139, i32 %2151) #2, !dbg !129 + %2152 = select i1 %2148, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %259, ptr addrspace(1) %2140, i32 %2152) #2, !dbg !129 + %2153 = select i1 %2149, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %262, ptr addrspace(1) %2141, i32 %2153) #2, !dbg !129 + %2154 = select i1 %2150, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %2142, i32 %2154) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2155 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2131, !dbg !126 + %2156 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2133, !dbg !126 + %2157 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2135, !dbg !126 + %2158 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2137, !dbg !126 + %2159 = getelementptr bfloat, ptr addrspace(1) %2155, i64 %80, !dbg !127 + %2160 = getelementptr bfloat, ptr addrspace(1) %2156, i64 %80, !dbg !127 + %2161 = getelementptr bfloat, ptr addrspace(1) %2157, i64 %80, !dbg !127 + %2162 = getelementptr bfloat, ptr addrspace(1) %2158, i64 %80, !dbg !127 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %275, ptr addrspace(1) %2159, i32 %2151) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %276, ptr addrspace(1) %2160, i32 %2152) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %2161, i32 %2153) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %2162, i32 %2154) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2163 = icmp sgt i32 %2121, 1, !dbg !122 + %2164 = or disjoint i32 %2117, 64, !dbg !130 + %2165 = or disjoint i32 %2164, %37, !dbg !124 + %2166 = or disjoint i32 %2164, %38, !dbg !124 + %2167 = or disjoint i32 %2164, %39, !dbg !124 + %2168 = or disjoint i32 %2164, %40, !dbg !124 + %2169 = shl i32 %2165, 7, !dbg !125 + %2170 = shl i32 %2166, 7, !dbg !125 + %2171 = shl i32 %2167, 7, !dbg !125 + %2172 = shl i32 %2168, 7, !dbg !125 + %2173 = sext i32 %2169 to i64, !dbg !126 + %2174 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2173, !dbg !126 + %2175 = sext i32 %2170 to i64, !dbg !126 + %2176 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2175, !dbg !126 + %2177 = sext i32 %2171 to i64, !dbg !126 + %2178 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2177, !dbg !126 + %2179 = sext i32 %2172 to i64, !dbg !126 + %2180 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2179, !dbg !126 + %2181 = getelementptr bfloat, ptr addrspace(1) %2174, i64 %80, !dbg !127 + %2182 = getelementptr bfloat, ptr addrspace(1) %2176, i64 %80, !dbg !127 + %2183 = getelementptr bfloat, ptr addrspace(1) %2178, i64 %80, !dbg !127 + %2184 = getelementptr bfloat, ptr addrspace(1) %2180, i64 %80, !dbg !127 + %2185 = icmp slt i32 %2165, %11, !dbg !128 + %2186 = icmp slt i32 %2166, %11, !dbg !128 + %2187 = icmp slt i32 %2167, %11, !dbg !128 + %2188 = icmp slt i32 %2168, %11, !dbg !128 + %2189 = and i1 %2163, %2185, !dbg !122 + %2190 = and i1 %2163, %2186, !dbg !122 + %2191 = and i1 %2163, %2187, !dbg !122 + %2192 = and i1 %2163, %2188, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %2193 = select i1 %2189, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %309, ptr addrspace(1) %2181, i32 %2193) #2, !dbg !129 + %2194 = select i1 %2190, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %311, ptr addrspace(1) %2182, i32 %2194) #2, !dbg !129 + %2195 = select i1 %2191, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %313, ptr addrspace(1) %2183, i32 %2195) #2, !dbg !129 + %2196 = select i1 %2192, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %315, ptr addrspace(1) %2184, i32 %2196) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2197 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2173, !dbg !126 + %2198 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2175, !dbg !126 + %2199 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2177, !dbg !126 + %2200 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2179, !dbg !126 + %2201 = getelementptr bfloat, ptr addrspace(1) %2197, i64 %80, !dbg !127 + %2202 = getelementptr bfloat, ptr addrspace(1) %2198, i64 %80, !dbg !127 + %2203 = getelementptr bfloat, ptr addrspace(1) %2199, i64 %80, !dbg !127 + %2204 = getelementptr bfloat, ptr addrspace(1) %2200, i64 %80, !dbg !127 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %325, ptr addrspace(1) %2201, i32 %2193) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %326, ptr addrspace(1) %2202, i32 %2194) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %327, ptr addrspace(1) %2203, i32 %2195) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %328, ptr addrspace(1) %2204, i32 %2196) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !131 + %2205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 0, !dbg !122 + %2206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 1, !dbg !122 + %2207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 2, !dbg !122 + %2208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 3, !dbg !122 + %2209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 4, !dbg !122 + %2210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 5, !dbg !122 + %2211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 6, !dbg !122 + %2212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 7, !dbg !122 + %2213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 8, !dbg !122 + %2214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 9, !dbg !122 + %2215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 10, !dbg !122 + %2216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 11, !dbg !122 + %2217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 12, !dbg !122 + %2218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 13, !dbg !122 + %2219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 14, !dbg !122 + %2220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 15, !dbg !122 + %2221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 16, !dbg !122 + %2222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 17, !dbg !122 + %2223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 18, !dbg !122 + %2224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 19, !dbg !122 + %2225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 20, !dbg !122 + %2226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 21, !dbg !122 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 22, !dbg !122 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 23, !dbg !122 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 24, !dbg !122 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 25, !dbg !122 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 26, !dbg !122 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 27, !dbg !122 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 28, !dbg !122 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 29, !dbg !122 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 30, !dbg !122 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 31, !dbg !122 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 32, !dbg !122 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 33, !dbg !122 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 34, !dbg !122 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 35, !dbg !122 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 36, !dbg !122 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 37, !dbg !122 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 38, !dbg !122 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 39, !dbg !122 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 40, !dbg !122 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 41, !dbg !122 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 42, !dbg !122 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 43, !dbg !122 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 44, !dbg !122 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 45, !dbg !122 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 46, !dbg !122 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 47, !dbg !122 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 48, !dbg !122 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 49, !dbg !122 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 50, !dbg !122 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 51, !dbg !122 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 52, !dbg !122 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 53, !dbg !122 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 54, !dbg !122 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 55, !dbg !122 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 56, !dbg !122 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 57, !dbg !122 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 58, !dbg !122 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 59, !dbg !122 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 60, !dbg !122 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 61, !dbg !122 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 62, !dbg !122 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 63, !dbg !122 + br i1 %2122, label %.lr.ph460, label %._crit_edge461, !dbg !122 + +.lr.ph460: ; preds = %._crit_edge + %2269 = insertelement <16 x i32> poison, i32 %2117, i64 0, !dbg !132 + %2270 = shufflevector <16 x i32> %2269, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !132 + %2271 = shufflevector <2 x i32> %203, <2 x i32> poison, <16 x i32> , !dbg !132 + %2272 = insertelement <16 x i32> %2271, i32 %200, i64 14, !dbg !132 + %2273 = insertelement <16 x i32> %2272, i32 %199, i64 15, !dbg !132 + %2274 = shufflevector <8 x i32> %209, <8 x i32> poison, <16 x i32> , !dbg !132 + %2275 = shufflevector <16 x i32> %2274, <16 x i32> %2273, <16 x i32> , !dbg !132 + %2276 = shufflevector <4 x i32> %206, <4 x i32> poison, <16 x i32> , !dbg !132 + %2277 = shufflevector <16 x i32> %2275, <16 x i32> %2276, <16 x i32> , !dbg !132 + %2278 = or disjoint <16 x i32> %2270, %2277, !dbg !132 + %2279 = add nsw i32 %2121, -2 + %2280 = add nsw i32 %2121, -1 + %smax471 = tail call i32 @llvm.smax.i32(i32 %2121, i32 1), !dbg !122 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 0, !dbg !122 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 1, !dbg !122 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 2, !dbg !122 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 3, !dbg !122 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 4, !dbg !122 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 5, !dbg !122 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 6, !dbg !122 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 7, !dbg !122 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 8, !dbg !122 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 9, !dbg !122 + %2291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 10, !dbg !122 + %2292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 11, !dbg !122 + %2293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 12, !dbg !122 + %2294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 13, !dbg !122 + %2295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 14, !dbg !122 + %2296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 15, !dbg !122 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 16, !dbg !122 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 17, !dbg !122 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 18, !dbg !122 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 19, !dbg !122 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 20, !dbg !122 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 21, !dbg !122 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 22, !dbg !122 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 23, !dbg !122 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 24, !dbg !122 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 25, !dbg !122 + %2307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 26, !dbg !122 + %2308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 27, !dbg !122 + %2309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 28, !dbg !122 + %2310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 29, !dbg !122 + %2311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 30, !dbg !122 + %2312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 31, !dbg !122 + %2313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 32, !dbg !122 + %2314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 33, !dbg !122 + %2315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 34, !dbg !122 + %2316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 35, !dbg !122 + %2317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 36, !dbg !122 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 37, !dbg !122 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 38, !dbg !122 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 39, !dbg !122 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 40, !dbg !122 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 41, !dbg !122 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 42, !dbg !122 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 43, !dbg !122 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 44, !dbg !122 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 45, !dbg !122 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 46, !dbg !122 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 47, !dbg !122 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 48, !dbg !122 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 49, !dbg !122 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 50, !dbg !122 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 51, !dbg !122 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 52, !dbg !122 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 53, !dbg !122 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 54, !dbg !122 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 55, !dbg !122 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 56, !dbg !122 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 57, !dbg !122 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 58, !dbg !122 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 59, !dbg !122 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 60, !dbg !122 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 61, !dbg !122 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 62, !dbg !122 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 63, !dbg !122 + br label %2345, !dbg !122 + +2345: ; preds = %.lr.ph460, %__nv_exp2f.exit253 + %2346 = phi i32 [ 64, %.lr.ph460 ], [ %3674, %__nv_exp2f.exit253 ] + %2347 = phi i32 [ -1, %.lr.ph460 ], [ %2357, %__nv_exp2f.exit253 ] + %2348 = phi i32 [ 1, %.lr.ph460 ], [ %3678, %__nv_exp2f.exit253 ] + %2349 = phi i32 [ 64, %.lr.ph460 ], [ %3675, %__nv_exp2f.exit253 ] + %.pn = phi float [ %2281, %.lr.ph460 ], [ %3588, %__nv_exp2f.exit253 ] + %.pn479 = phi float [ %2282, %.lr.ph460 ], [ %3589, %__nv_exp2f.exit253 ] + %.pn480 = phi float [ %2283, %.lr.ph460 ], [ %3590, %__nv_exp2f.exit253 ] + %.pn481 = phi float [ %2284, %.lr.ph460 ], [ %3591, %__nv_exp2f.exit253 ] + %.pn482 = phi float [ %2285, %.lr.ph460 ], [ %3592, %__nv_exp2f.exit253 ] + %.pn483 = phi float [ %2286, %.lr.ph460 ], [ %3593, %__nv_exp2f.exit253 ] + %.pn484 = phi float [ %2287, %.lr.ph460 ], [ %3594, %__nv_exp2f.exit253 ] + %.pn485 = phi float [ %2288, %.lr.ph460 ], [ %3595, %__nv_exp2f.exit253 ] + %.pn486 = phi float [ %2289, %.lr.ph460 ], [ %3596, %__nv_exp2f.exit253 ] + %.pn487 = phi float [ %2290, %.lr.ph460 ], [ %3597, %__nv_exp2f.exit253 ] + %.pn488 = phi float [ %2291, %.lr.ph460 ], [ %3598, %__nv_exp2f.exit253 ] + %.pn489 = phi float [ %2292, %.lr.ph460 ], [ %3599, %__nv_exp2f.exit253 ] + %.pn490 = phi float [ %2293, %.lr.ph460 ], [ %3600, %__nv_exp2f.exit253 ] + %.pn491 = phi float [ %2294, %.lr.ph460 ], [ %3601, %__nv_exp2f.exit253 ] + %.pn492 = phi float [ %2295, %.lr.ph460 ], [ %3602, %__nv_exp2f.exit253 ] + %.pn493 = phi float [ %2296, %.lr.ph460 ], [ %3603, %__nv_exp2f.exit253 ] + %.pn494 = phi float [ %2297, %.lr.ph460 ], [ %3604, %__nv_exp2f.exit253 ] + %.pn495 = phi float [ %2298, %.lr.ph460 ], [ %3605, %__nv_exp2f.exit253 ] + %.pn496 = phi float [ %2299, %.lr.ph460 ], [ %3606, %__nv_exp2f.exit253 ] + %.pn497 = phi float [ %2300, %.lr.ph460 ], [ %3607, %__nv_exp2f.exit253 ] + %.pn498 = phi float [ %2301, %.lr.ph460 ], [ %3608, %__nv_exp2f.exit253 ] + %.pn499 = phi float [ %2302, %.lr.ph460 ], [ %3609, %__nv_exp2f.exit253 ] + %.pn500 = phi float [ %2303, %.lr.ph460 ], [ %3610, %__nv_exp2f.exit253 ] + %.pn501 = phi float [ %2304, %.lr.ph460 ], [ %3611, %__nv_exp2f.exit253 ] + %.pn502 = phi float [ %2305, %.lr.ph460 ], [ %3612, %__nv_exp2f.exit253 ] + %.pn503 = phi float [ %2306, %.lr.ph460 ], [ %3613, %__nv_exp2f.exit253 ] + %.pn504 = phi float [ %2307, %.lr.ph460 ], [ %3614, %__nv_exp2f.exit253 ] + %.pn505 = phi float [ %2308, %.lr.ph460 ], [ %3615, %__nv_exp2f.exit253 ] + %.pn506 = phi float [ %2309, %.lr.ph460 ], [ %3616, %__nv_exp2f.exit253 ] + %.pn507 = phi float [ %2310, %.lr.ph460 ], [ %3617, %__nv_exp2f.exit253 ] + %.pn508 = phi float [ %2311, %.lr.ph460 ], [ %3618, %__nv_exp2f.exit253 ] + %.pn509 = phi float [ %2312, %.lr.ph460 ], [ %3619, %__nv_exp2f.exit253 ] + %.pn510 = phi float [ %2313, %.lr.ph460 ], [ %3620, %__nv_exp2f.exit253 ] + %.pn511 = phi float [ %2314, %.lr.ph460 ], [ %3621, %__nv_exp2f.exit253 ] + %.pn512 = phi float [ %2315, %.lr.ph460 ], [ %3622, %__nv_exp2f.exit253 ] + %.pn513 = phi float [ %2316, %.lr.ph460 ], [ %3623, %__nv_exp2f.exit253 ] + %.pn514 = phi float [ %2317, %.lr.ph460 ], [ %3624, %__nv_exp2f.exit253 ] + %.pn515 = phi float [ %2318, %.lr.ph460 ], [ %3625, %__nv_exp2f.exit253 ] + %.pn516 = phi float [ %2319, %.lr.ph460 ], [ %3626, %__nv_exp2f.exit253 ] + %.pn517 = phi float [ %2320, %.lr.ph460 ], [ %3627, %__nv_exp2f.exit253 ] + %.pn518 = phi float [ %2321, %.lr.ph460 ], [ %3628, %__nv_exp2f.exit253 ] + %.pn519 = phi float [ %2322, %.lr.ph460 ], [ %3629, %__nv_exp2f.exit253 ] + %.pn520 = phi float [ %2323, %.lr.ph460 ], [ %3630, %__nv_exp2f.exit253 ] + %.pn521 = phi float [ %2324, %.lr.ph460 ], [ %3631, %__nv_exp2f.exit253 ] + %.pn522 = phi float [ %2325, %.lr.ph460 ], [ %3632, %__nv_exp2f.exit253 ] + %.pn523 = phi float [ %2326, %.lr.ph460 ], [ %3633, %__nv_exp2f.exit253 ] + %.pn524 = phi float [ %2327, %.lr.ph460 ], [ %3634, %__nv_exp2f.exit253 ] + %.pn525 = phi float [ %2328, %.lr.ph460 ], [ %3635, %__nv_exp2f.exit253 ] + %.pn526 = phi float [ %2329, %.lr.ph460 ], [ %3636, %__nv_exp2f.exit253 ] + %.pn527 = phi float [ %2330, %.lr.ph460 ], [ %3637, %__nv_exp2f.exit253 ] + %.pn528 = phi float [ %2331, %.lr.ph460 ], [ %3638, %__nv_exp2f.exit253 ] + %.pn529 = phi float [ %2332, %.lr.ph460 ], [ %3639, %__nv_exp2f.exit253 ] + %.pn530 = phi float [ %2333, %.lr.ph460 ], [ %3640, %__nv_exp2f.exit253 ] + %.pn531 = phi float [ %2334, %.lr.ph460 ], [ %3641, %__nv_exp2f.exit253 ] + %.pn532 = phi float [ %2335, %.lr.ph460 ], [ %3642, %__nv_exp2f.exit253 ] + %.pn533 = phi float [ %2336, %.lr.ph460 ], [ %3643, %__nv_exp2f.exit253 ] + %.pn534 = phi float [ %2337, %.lr.ph460 ], [ %3644, %__nv_exp2f.exit253 ] + %.pn535 = phi float [ %2338, %.lr.ph460 ], [ %3645, %__nv_exp2f.exit253 ] + %.pn536 = phi float [ %2339, %.lr.ph460 ], [ %3646, %__nv_exp2f.exit253 ] + %.pn537 = phi float [ %2340, %.lr.ph460 ], [ %3647, %__nv_exp2f.exit253 ] + %.pn538 = phi float [ %2341, %.lr.ph460 ], [ %3648, %__nv_exp2f.exit253 ] + %.pn539 = phi float [ %2342, %.lr.ph460 ], [ %3649, %__nv_exp2f.exit253 ] + %.pn540 = phi float [ %2343, %.lr.ph460 ], [ %3650, %__nv_exp2f.exit253 ] + %.pn541 = phi float [ %2344, %.lr.ph460 ], [ %3651, %__nv_exp2f.exit253 ] + %2350 = phi i32 [ 0, %.lr.ph460 ], [ %3655, %__nv_exp2f.exit253 ] + %.pn546 = phi float [ %2111, %.lr.ph460 ], [ %3241, %__nv_exp2f.exit253 ] + %.pn544 = phi float [ %2112, %.lr.ph460 ], [ %3242, %__nv_exp2f.exit253 ] + %2351 = phi <2 x float> [ %2113, %.lr.ph460 ], [ %2980, %__nv_exp2f.exit253 ] + %2352 = phi <16 x i32> [ %2278, %.lr.ph460 ], [ %3654, %__nv_exp2f.exit253 ] + %2353 = icmp slt i32 %2350, %2279, !dbg !122 + %2354 = icmp slt i32 %2350, %2280, !dbg !122 + %2355 = add i32 %2347, 1, !dbg !122 + %2356 = icmp sgt i32 %2355, 2, !dbg !122 + %2357 = select i1 %2356, i32 0, i32 %2355, !dbg !122 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !129 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %2358 = shl i32 %2357, 13, !dbg !129 + %2359 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2358, !dbg !129 + %2360 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %35, i32 0, i32 31), !dbg !131 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !131 + %2361 = shl i32 %2360, 11, !dbg !131 + %2362 = and i32 %2361, 8192, !dbg !131 + %2363 = add i32 %2362, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !131 + %2364 = lshr exact i32 %2363, 4, !dbg !131 + %2365 = and i32 %2364, 16383, !dbg !131 + %2366 = zext nneg i32 %2365 to i64, !dbg !131 + %2367 = or disjoint i64 %2366, 4611686293372403712, !dbg !131 + %2368 = ptrtoint ptr addrspace(3) %2359 to i32, !dbg !131 + %2369 = lshr exact i32 %2368, 4, !dbg !131 + %2370 = and i32 %2369, 16383, !dbg !131 + %2371 = zext nneg i32 %2370 to i64, !dbg !131 + %2372 = or disjoint i64 %2371, 4611686293338849280, !dbg !131 + %2373 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2367, i64 %2372) #2, !dbg !131 + %2374 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !131 + %2375 = lshr exact i32 %2374, 4, !dbg !131 + %2376 = and i32 %2375, 16383, !dbg !131 + %2377 = zext nneg i32 %2376 to i64, !dbg !131 + %2378 = or disjoint i64 %2377, 4611686293372403712, !dbg !131 + %2379 = add i32 %2368, 32, !dbg !131 + %2380 = lshr exact i32 %2379, 4, !dbg !131 + %2381 = and i32 %2380, 16383, !dbg !131 + %2382 = zext nneg i32 %2381 to i64, !dbg !131 + %2383 = or disjoint i64 %2382, 4611686293338849280, !dbg !131 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 0, !dbg !131 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 1, !dbg !131 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 2, !dbg !131 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 3, !dbg !131 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 4, !dbg !131 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 5, !dbg !131 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 6, !dbg !131 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 7, !dbg !131 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 8, !dbg !131 + %2393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 9, !dbg !131 + %2394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 10, !dbg !131 + %2395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 11, !dbg !131 + %2396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 12, !dbg !131 + %2397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 13, !dbg !131 + %2398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 14, !dbg !131 + %2399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 15, !dbg !131 + %2400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 16, !dbg !131 + %2401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 17, !dbg !131 + %2402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 18, !dbg !131 + %2403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 19, !dbg !131 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 20, !dbg !131 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 21, !dbg !131 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 22, !dbg !131 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 23, !dbg !131 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 24, !dbg !131 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 25, !dbg !131 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 26, !dbg !131 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 27, !dbg !131 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 28, !dbg !131 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 29, !dbg !131 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 30, !dbg !131 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 31, !dbg !131 + %2416 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, float %2393, float %2394, float %2395, float %2396, float %2397, float %2398, float %2399, float %2400, float %2401, float %2402, float %2403, float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, i64 %2378, i64 %2383, i1 true) #2, !dbg !131 + %2417 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !131 + %2418 = lshr exact i32 %2417, 4, !dbg !131 + %2419 = and i32 %2418, 16383, !dbg !131 + %2420 = zext nneg i32 %2419 to i64, !dbg !131 + %2421 = or disjoint i64 %2420, 4611686293372403712, !dbg !131 + %2422 = add i32 %2368, 64, !dbg !131 + %2423 = lshr exact i32 %2422, 4, !dbg !131 + %2424 = and i32 %2423, 16383, !dbg !131 + %2425 = zext nneg i32 %2424 to i64, !dbg !131 + %2426 = or disjoint i64 %2425, 4611686293338849280, !dbg !131 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 0, !dbg !131 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 1, !dbg !131 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 2, !dbg !131 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 3, !dbg !131 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 4, !dbg !131 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 5, !dbg !131 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 6, !dbg !131 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 7, !dbg !131 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 8, !dbg !131 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 9, !dbg !131 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 10, !dbg !131 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 11, !dbg !131 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 12, !dbg !131 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 13, !dbg !131 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 14, !dbg !131 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 15, !dbg !131 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 16, !dbg !131 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 17, !dbg !131 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 18, !dbg !131 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 19, !dbg !131 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 20, !dbg !131 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 21, !dbg !131 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 22, !dbg !131 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 23, !dbg !131 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 24, !dbg !131 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 25, !dbg !131 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 26, !dbg !131 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 27, !dbg !131 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 28, !dbg !131 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 29, !dbg !131 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 30, !dbg !131 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 31, !dbg !131 + %2459 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2427, float %2428, float %2429, float %2430, float %2431, float %2432, float %2433, float %2434, float %2435, float %2436, float %2437, float %2438, float %2439, float %2440, float %2441, float %2442, float %2443, float %2444, float %2445, float %2446, float %2447, float %2448, float %2449, float %2450, float %2451, float %2452, float %2453, float %2454, float %2455, float %2456, float %2457, float %2458, i64 %2421, i64 %2426, i1 true) #2, !dbg !131 + %2460 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !131 + %2461 = lshr exact i32 %2460, 4, !dbg !131 + %2462 = and i32 %2461, 16383, !dbg !131 + %2463 = zext nneg i32 %2462 to i64, !dbg !131 + %2464 = or disjoint i64 %2463, 4611686293372403712, !dbg !131 + %2465 = add i32 %2368, 96, !dbg !131 + %2466 = lshr exact i32 %2465, 4, !dbg !131 + %2467 = and i32 %2466, 16383, !dbg !131 + %2468 = zext nneg i32 %2467 to i64, !dbg !131 + %2469 = or disjoint i64 %2468, 4611686293338849280, !dbg !131 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 0, !dbg !131 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 1, !dbg !131 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 2, !dbg !131 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 3, !dbg !131 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 4, !dbg !131 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 5, !dbg !131 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 6, !dbg !131 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 7, !dbg !131 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 8, !dbg !131 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 9, !dbg !131 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 10, !dbg !131 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 11, !dbg !131 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 12, !dbg !131 + %2483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 13, !dbg !131 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 14, !dbg !131 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 15, !dbg !131 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 16, !dbg !131 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 17, !dbg !131 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 18, !dbg !131 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 19, !dbg !131 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 20, !dbg !131 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 21, !dbg !131 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 22, !dbg !131 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 23, !dbg !131 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 24, !dbg !131 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 25, !dbg !131 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 26, !dbg !131 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 27, !dbg !131 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 28, !dbg !131 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 29, !dbg !131 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 30, !dbg !131 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 31, !dbg !131 + %2502 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2470, float %2471, float %2472, float %2473, float %2474, float %2475, float %2476, float %2477, float %2478, float %2479, float %2480, float %2481, float %2482, float %2483, float %2484, float %2485, float %2486, float %2487, float %2488, float %2489, float %2490, float %2491, float %2492, float %2493, float %2494, float %2495, float %2496, float %2497, float %2498, float %2499, float %2500, float %2501, i64 %2464, i64 %2469, i1 true) #2, !dbg !131 + %2503 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !131 + %2504 = lshr exact i32 %2503, 4, !dbg !131 + %2505 = and i32 %2504, 16383, !dbg !131 + %2506 = zext nneg i32 %2505 to i64, !dbg !131 + %2507 = or disjoint i64 %2506, 4611686293372403712, !dbg !131 + %2508 = add i32 %2368, 8192, !dbg !131 + %2509 = lshr exact i32 %2508, 4, !dbg !131 + %2510 = and i32 %2509, 16383, !dbg !131 + %2511 = zext nneg i32 %2510 to i64, !dbg !131 + %2512 = or disjoint i64 %2511, 4611686293338849280, !dbg !131 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 0, !dbg !131 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 1, !dbg !131 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 2, !dbg !131 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 3, !dbg !131 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 4, !dbg !131 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 5, !dbg !131 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 6, !dbg !131 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 7, !dbg !131 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 8, !dbg !131 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 9, !dbg !131 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 10, !dbg !131 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 11, !dbg !131 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 12, !dbg !131 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 13, !dbg !131 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 14, !dbg !131 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 15, !dbg !131 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 16, !dbg !131 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 17, !dbg !131 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 18, !dbg !131 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 19, !dbg !131 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 20, !dbg !131 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 21, !dbg !131 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 22, !dbg !131 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 23, !dbg !131 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 24, !dbg !131 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 25, !dbg !131 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 26, !dbg !131 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 27, !dbg !131 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 28, !dbg !131 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 29, !dbg !131 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 30, !dbg !131 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 31, !dbg !131 + %2545 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2513, float %2514, float %2515, float %2516, float %2517, float %2518, float %2519, float %2520, float %2521, float %2522, float %2523, float %2524, float %2525, float %2526, float %2527, float %2528, float %2529, float %2530, float %2531, float %2532, float %2533, float %2534, float %2535, float %2536, float %2537, float %2538, float %2539, float %2540, float %2541, float %2542, float %2543, float %2544, i64 %2507, i64 %2512, i1 true) #2, !dbg !131 + %2546 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !131 + %2547 = lshr exact i32 %2546, 4, !dbg !131 + %2548 = and i32 %2547, 16383, !dbg !131 + %2549 = zext nneg i32 %2548 to i64, !dbg !131 + %2550 = or disjoint i64 %2549, 4611686293372403712, !dbg !131 + %2551 = add i32 %2368, 8224, !dbg !131 + %2552 = lshr exact i32 %2551, 4, !dbg !131 + %2553 = and i32 %2552, 16383, !dbg !131 + %2554 = zext nneg i32 %2553 to i64, !dbg !131 + %2555 = or disjoint i64 %2554, 4611686293338849280, !dbg !131 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 0, !dbg !131 + %2557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 1, !dbg !131 + %2558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 2, !dbg !131 + %2559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 3, !dbg !131 + %2560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 4, !dbg !131 + %2561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 5, !dbg !131 + %2562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 6, !dbg !131 + %2563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 7, !dbg !131 + %2564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 8, !dbg !131 + %2565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 9, !dbg !131 + %2566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 10, !dbg !131 + %2567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 11, !dbg !131 + %2568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 12, !dbg !131 + %2569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 13, !dbg !131 + %2570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 14, !dbg !131 + %2571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 15, !dbg !131 + %2572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 16, !dbg !131 + %2573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 17, !dbg !131 + %2574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 18, !dbg !131 + %2575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 19, !dbg !131 + %2576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 20, !dbg !131 + %2577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 21, !dbg !131 + %2578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 22, !dbg !131 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 23, !dbg !131 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 24, !dbg !131 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 25, !dbg !131 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 26, !dbg !131 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 27, !dbg !131 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 28, !dbg !131 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 29, !dbg !131 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 30, !dbg !131 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 31, !dbg !131 + %2588 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2556, float %2557, float %2558, float %2559, float %2560, float %2561, float %2562, float %2563, float %2564, float %2565, float %2566, float %2567, float %2568, float %2569, float %2570, float %2571, float %2572, float %2573, float %2574, float %2575, float %2576, float %2577, float %2578, float %2579, float %2580, float %2581, float %2582, float %2583, float %2584, float %2585, float %2586, float %2587, i64 %2550, i64 %2555, i1 true) #2, !dbg !131 + %2589 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !131 + %2590 = lshr exact i32 %2589, 4, !dbg !131 + %2591 = and i32 %2590, 16383, !dbg !131 + %2592 = zext nneg i32 %2591 to i64, !dbg !131 + %2593 = or disjoint i64 %2592, 4611686293372403712, !dbg !131 + %2594 = add i32 %2368, 8256, !dbg !131 + %2595 = lshr exact i32 %2594, 4, !dbg !131 + %2596 = and i32 %2595, 16383, !dbg !131 + %2597 = zext nneg i32 %2596 to i64, !dbg !131 + %2598 = or disjoint i64 %2597, 4611686293338849280, !dbg !131 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 0, !dbg !131 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 1, !dbg !131 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 2, !dbg !131 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 3, !dbg !131 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 4, !dbg !131 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 5, !dbg !131 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 6, !dbg !131 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 7, !dbg !131 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 8, !dbg !131 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 9, !dbg !131 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 10, !dbg !131 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 11, !dbg !131 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 12, !dbg !131 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 13, !dbg !131 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 14, !dbg !131 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 15, !dbg !131 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 16, !dbg !131 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 17, !dbg !131 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 18, !dbg !131 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 19, !dbg !131 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 20, !dbg !131 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 21, !dbg !131 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 22, !dbg !131 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 23, !dbg !131 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 24, !dbg !131 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 25, !dbg !131 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 26, !dbg !131 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 27, !dbg !131 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 28, !dbg !131 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 29, !dbg !131 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 30, !dbg !131 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 31, !dbg !131 + %2631 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, float %2629, float %2630, i64 %2593, i64 %2598, i1 true) #2, !dbg !131 + %2632 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !131 + %2633 = lshr exact i32 %2632, 4, !dbg !131 + %2634 = and i32 %2633, 16383, !dbg !131 + %2635 = zext nneg i32 %2634 to i64, !dbg !131 + %2636 = or disjoint i64 %2635, 4611686293372403712, !dbg !131 + %2637 = add i32 %2368, 8288, !dbg !131 + %2638 = lshr exact i32 %2637, 4, !dbg !131 + %2639 = and i32 %2638, 16383, !dbg !131 + %2640 = zext nneg i32 %2639 to i64, !dbg !131 + %2641 = or disjoint i64 %2640, 4611686293338849280, !dbg !131 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 0, !dbg !131 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 1, !dbg !131 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 2, !dbg !131 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 3, !dbg !131 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 4, !dbg !131 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 5, !dbg !131 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 6, !dbg !131 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 7, !dbg !131 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 8, !dbg !131 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 9, !dbg !131 + %2652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 10, !dbg !131 + %2653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 11, !dbg !131 + %2654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 12, !dbg !131 + %2655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 13, !dbg !131 + %2656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 14, !dbg !131 + %2657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 15, !dbg !131 + %2658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 16, !dbg !131 + %2659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 17, !dbg !131 + %2660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 18, !dbg !131 + %2661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 19, !dbg !131 + %2662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 20, !dbg !131 + %2663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 21, !dbg !131 + %2664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 22, !dbg !131 + %2665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 23, !dbg !131 + %2666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 24, !dbg !131 + %2667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 25, !dbg !131 + %2668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 26, !dbg !131 + %2669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 27, !dbg !131 + %2670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 28, !dbg !131 + %2671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 29, !dbg !131 + %2672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 30, !dbg !131 + %2673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 31, !dbg !131 + %2674 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2642, float %2643, float %2644, float %2645, float %2646, float %2647, float %2648, float %2649, float %2650, float %2651, float %2652, float %2653, float %2654, float %2655, float %2656, float %2657, float %2658, float %2659, float %2660, float %2661, float %2662, float %2663, float %2664, float %2665, float %2666, float %2667, float %2668, float %2669, float %2670, float %2671, float %2672, float %2673, i64 %2636, i64 %2641, i1 true) #2, !dbg !131 + %2675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 0, !dbg !131 + %2676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 1, !dbg !131 + %2677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 2, !dbg !131 + %2678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 3, !dbg !131 + %2679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 4, !dbg !131 + %2680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 5, !dbg !131 + %2681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 6, !dbg !131 + %2682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 7, !dbg !131 + %2683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 8, !dbg !131 + %2684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 9, !dbg !131 + %2685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 10, !dbg !131 + %2686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 11, !dbg !131 + %2687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 12, !dbg !131 + %2688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 13, !dbg !131 + %2689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 14, !dbg !131 + %2690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 15, !dbg !131 + %2691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 16, !dbg !131 + %2692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 17, !dbg !131 + %2693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 18, !dbg !131 + %2694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 19, !dbg !131 + %2695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 20, !dbg !131 + %2696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 21, !dbg !131 + %2697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 22, !dbg !131 + %2698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 23, !dbg !131 + %2699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 24, !dbg !131 + %2700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 25, !dbg !131 + %2701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 26, !dbg !131 + %2702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 27, !dbg !131 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 28, !dbg !131 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 29, !dbg !131 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 30, !dbg !131 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 31, !dbg !131 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !131 + %2707 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %2675, float %2676, float %2677, float %2678, float %2679, float %2680, float %2681, float %2682, float %2683, float %2684, float %2685, float %2686, float %2687, float %2688, float %2689, float %2690, float %2691, float %2692, float %2693, float %2694, float %2695, float %2696, float %2697, float %2698, float %2699, float %2700, float %2701, float %2702, float %2703, float %2704, float %2705, float %2706, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2359, i32 0, i32 0, float %.pn, float %.pn479, float %.pn480, float %.pn481, float %.pn482, float %.pn483, float %.pn484, float %.pn485, float %.pn486, float %.pn487, float %.pn488, float %.pn489, float %.pn490, float %.pn491, float %.pn492, float %.pn493, float %.pn494, float %.pn495, float %.pn496, float %.pn497, float %.pn498, float %.pn499, float %.pn500, float %.pn501, float %.pn502, float %.pn503, float %.pn504, float %.pn505, float %.pn506, float %.pn507, float %.pn508, float %.pn509, float %.pn510, float %.pn511, float %.pn512, float %.pn513, float %.pn514, float %.pn515, float %.pn516, float %.pn517, float %.pn518, float %.pn519, float %.pn520, float %.pn521, float %.pn522, float %.pn523, float %.pn524, float %.pn525, float %.pn526, float %.pn527, float %.pn528, float %.pn529, float %.pn530, float %.pn531, float %.pn532, float %.pn533, float %.pn534, float %.pn535, float %.pn536, float %.pn537, float %.pn538, float %.pn539, float %.pn540, float %.pn541) #2, !dbg !131 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 0, !dbg !131 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 1, !dbg !131 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 2, !dbg !131 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 3, !dbg !131 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 4, !dbg !131 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 5, !dbg !131 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 6, !dbg !131 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 7, !dbg !131 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 8, !dbg !131 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 9, !dbg !131 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 10, !dbg !131 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 11, !dbg !131 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 12, !dbg !131 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 13, !dbg !131 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 14, !dbg !131 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 15, !dbg !131 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 16, !dbg !131 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 17, !dbg !131 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 18, !dbg !131 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 19, !dbg !131 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 20, !dbg !131 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 21, !dbg !131 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 22, !dbg !131 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 23, !dbg !131 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 24, !dbg !131 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 25, !dbg !131 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 26, !dbg !131 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 27, !dbg !131 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 28, !dbg !131 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 29, !dbg !131 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 30, !dbg !131 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 31, !dbg !131 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 38, !dbg !131 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 39, !dbg !131 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 40, !dbg !131 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 41, !dbg !131 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 42, !dbg !131 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 43, !dbg !131 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 44, !dbg !131 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 45, !dbg !131 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 46, !dbg !131 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 47, !dbg !131 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 48, !dbg !131 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 49, !dbg !131 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 50, !dbg !131 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 51, !dbg !131 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 52, !dbg !131 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 53, !dbg !131 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 54, !dbg !131 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 55, !dbg !131 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 56, !dbg !131 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 57, !dbg !131 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 58, !dbg !131 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 59, !dbg !131 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 60, !dbg !131 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 61, !dbg !131 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 62, !dbg !131 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 63, !dbg !131 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 64, !dbg !131 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 65, !dbg !131 + %2768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 66, !dbg !131 + %2769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 67, !dbg !131 + %2770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 68, !dbg !131 + %2771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 69, !dbg !131 + %2772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 70, !dbg !131 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 71, !dbg !131 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 72, !dbg !131 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 73, !dbg !131 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 74, !dbg !131 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 75, !dbg !131 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 76, !dbg !131 + %2779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 77, !dbg !131 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 78, !dbg !131 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 79, !dbg !131 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 80, !dbg !131 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 81, !dbg !131 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 82, !dbg !131 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 83, !dbg !131 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 84, !dbg !131 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 85, !dbg !131 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 86, !dbg !131 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 87, !dbg !131 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 88, !dbg !131 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 89, !dbg !131 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 90, !dbg !131 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 91, !dbg !131 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 92, !dbg !131 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 93, !dbg !131 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 94, !dbg !131 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 95, !dbg !131 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 96, !dbg !131 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 97, !dbg !131 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 98, !dbg !131 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 99, !dbg !131 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 100, !dbg !131 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 101, !dbg !131 + %2804 = fmul float %2708, 0x3FB6A09E60000000, !dbg !133 + %2805 = fmul float %2709, 0x3FB6A09E60000000, !dbg !133 + %2806 = fmul float %2710, 0x3FB6A09E60000000, !dbg !133 + %2807 = fmul float %2711, 0x3FB6A09E60000000, !dbg !133 + %2808 = fmul float %2712, 0x3FB6A09E60000000, !dbg !133 + %2809 = fmul float %2713, 0x3FB6A09E60000000, !dbg !133 + %2810 = fmul float %2714, 0x3FB6A09E60000000, !dbg !133 + %2811 = fmul float %2715, 0x3FB6A09E60000000, !dbg !133 + %2812 = fmul float %2716, 0x3FB6A09E60000000, !dbg !133 + %2813 = fmul float %2717, 0x3FB6A09E60000000, !dbg !133 + %2814 = fmul float %2718, 0x3FB6A09E60000000, !dbg !133 + %2815 = fmul float %2719, 0x3FB6A09E60000000, !dbg !133 + %2816 = fmul float %2720, 0x3FB6A09E60000000, !dbg !133 + %2817 = fmul float %2721, 0x3FB6A09E60000000, !dbg !133 + %2818 = fmul float %2722, 0x3FB6A09E60000000, !dbg !133 + %2819 = fmul float %2723, 0x3FB6A09E60000000, !dbg !133 + %2820 = fmul float %2724, 0x3FB6A09E60000000, !dbg !133 + %2821 = fmul float %2725, 0x3FB6A09E60000000, !dbg !133 + %2822 = fmul float %2726, 0x3FB6A09E60000000, !dbg !133 + %2823 = fmul float %2727, 0x3FB6A09E60000000, !dbg !133 + %2824 = fmul float %2728, 0x3FB6A09E60000000, !dbg !133 + %2825 = fmul float %2729, 0x3FB6A09E60000000, !dbg !133 + %2826 = fmul float %2730, 0x3FB6A09E60000000, !dbg !133 + %2827 = fmul float %2731, 0x3FB6A09E60000000, !dbg !133 + %2828 = fmul float %2732, 0x3FB6A09E60000000, !dbg !133 + %2829 = fmul float %2733, 0x3FB6A09E60000000, !dbg !133 + %2830 = fmul float %2734, 0x3FB6A09E60000000, !dbg !133 + %2831 = fmul float %2735, 0x3FB6A09E60000000, !dbg !133 + %2832 = fmul float %2736, 0x3FB6A09E60000000, !dbg !133 + %2833 = fmul float %2737, 0x3FB6A09E60000000, !dbg !133 + %2834 = fmul float %2738, 0x3FB6A09E60000000, !dbg !133 + %2835 = fmul float %2739, 0x3FB6A09E60000000, !dbg !133 + %2836 = extractelement <16 x i32> %2352, i64 15, !dbg !134 + %2837 = icmp slt i32 %2836, %11, !dbg !134 + %2838 = extractelement <16 x i32> %2352, i64 14, !dbg !134 + %2839 = icmp slt i32 %2838, %11, !dbg !134 + %2840 = extractelement <16 x i32> %2352, i64 13, !dbg !134 + %2841 = icmp slt i32 %2840, %11, !dbg !134 + %2842 = extractelement <16 x i32> %2352, i64 12, !dbg !134 + %2843 = icmp slt i32 %2842, %11, !dbg !134 + %2844 = extractelement <16 x i32> %2352, i64 11, !dbg !134 + %2845 = icmp slt i32 %2844, %11, !dbg !134 + %2846 = extractelement <16 x i32> %2352, i64 10, !dbg !134 + %2847 = icmp slt i32 %2846, %11, !dbg !134 + %2848 = extractelement <16 x i32> %2352, i64 9, !dbg !134 + %2849 = icmp slt i32 %2848, %11, !dbg !134 + %2850 = extractelement <16 x i32> %2352, i64 8, !dbg !134 + %2851 = icmp slt i32 %2850, %11, !dbg !134 + %2852 = extractelement <16 x i32> %2352, i64 7, !dbg !134 + %2853 = icmp slt i32 %2852, %11, !dbg !134 + %2854 = extractelement <16 x i32> %2352, i64 6, !dbg !134 + %2855 = icmp slt i32 %2854, %11, !dbg !134 + %2856 = extractelement <16 x i32> %2352, i64 5, !dbg !134 + %2857 = icmp slt i32 %2856, %11, !dbg !134 + %2858 = extractelement <16 x i32> %2352, i64 4, !dbg !134 + %2859 = icmp slt i32 %2858, %11, !dbg !134 + %2860 = extractelement <16 x i32> %2352, i64 3, !dbg !134 + %2861 = icmp slt i32 %2860, %11, !dbg !134 + %2862 = extractelement <16 x i32> %2352, i64 2, !dbg !134 + %2863 = icmp slt i32 %2862, %11, !dbg !134 + %2864 = extractelement <16 x i32> %2352, i64 1, !dbg !134 + %2865 = icmp slt i32 %2864, %11, !dbg !134 + %2866 = extractelement <16 x i32> %2352, i64 0, !dbg !134 + %2867 = icmp slt i32 %2866, %11, !dbg !134 + %2868 = fmul float %2804, 0x3FF7154760000000, !dbg !135 + %2869 = select i1 %2837, float %2868, float 0xFFF0000000000000, !dbg !136 + %2870 = fmul float %2805, 0x3FF7154760000000, !dbg !135 + %2871 = select i1 %2839, float %2870, float 0xFFF0000000000000, !dbg !136 + %2872 = fmul float %2806, 0x3FF7154760000000, !dbg !135 + %2873 = select i1 %2837, float %2872, float 0xFFF0000000000000, !dbg !136 + %2874 = fmul float %2807, 0x3FF7154760000000, !dbg !135 + %2875 = select i1 %2839, float %2874, float 0xFFF0000000000000, !dbg !136 + %2876 = fmul float %2808, 0x3FF7154760000000, !dbg !135 + %2877 = select i1 %2841, float %2876, float 0xFFF0000000000000, !dbg !136 + %2878 = fmul float %2809, 0x3FF7154760000000, !dbg !135 + %2879 = select i1 %2843, float %2878, float 0xFFF0000000000000, !dbg !136 + %2880 = fmul float %2810, 0x3FF7154760000000, !dbg !135 + %2881 = select i1 %2841, float %2880, float 0xFFF0000000000000, !dbg !136 + %2882 = fmul float %2811, 0x3FF7154760000000, !dbg !135 + %2883 = select i1 %2843, float %2882, float 0xFFF0000000000000, !dbg !136 + %2884 = fmul float %2812, 0x3FF7154760000000, !dbg !135 + %2885 = select i1 %2845, float %2884, float 0xFFF0000000000000, !dbg !136 + %2886 = fmul float %2813, 0x3FF7154760000000, !dbg !135 + %2887 = select i1 %2847, float %2886, float 0xFFF0000000000000, !dbg !136 + %2888 = fmul float %2814, 0x3FF7154760000000, !dbg !135 + %2889 = select i1 %2845, float %2888, float 0xFFF0000000000000, !dbg !136 + %2890 = fmul float %2815, 0x3FF7154760000000, !dbg !135 + %2891 = select i1 %2847, float %2890, float 0xFFF0000000000000, !dbg !136 + %2892 = fmul float %2816, 0x3FF7154760000000, !dbg !135 + %2893 = select i1 %2849, float %2892, float 0xFFF0000000000000, !dbg !136 + %2894 = fmul float %2817, 0x3FF7154760000000, !dbg !135 + %2895 = select i1 %2851, float %2894, float 0xFFF0000000000000, !dbg !136 + %2896 = fmul float %2818, 0x3FF7154760000000, !dbg !135 + %2897 = select i1 %2849, float %2896, float 0xFFF0000000000000, !dbg !136 + %2898 = fmul float %2819, 0x3FF7154760000000, !dbg !135 + %2899 = select i1 %2851, float %2898, float 0xFFF0000000000000, !dbg !136 + %2900 = fmul float %2820, 0x3FF7154760000000, !dbg !135 + %2901 = select i1 %2853, float %2900, float 0xFFF0000000000000, !dbg !136 + %2902 = fmul float %2821, 0x3FF7154760000000, !dbg !135 + %2903 = select i1 %2855, float %2902, float 0xFFF0000000000000, !dbg !136 + %2904 = fmul float %2822, 0x3FF7154760000000, !dbg !135 + %2905 = select i1 %2853, float %2904, float 0xFFF0000000000000, !dbg !136 + %2906 = fmul float %2823, 0x3FF7154760000000, !dbg !135 + %2907 = select i1 %2855, float %2906, float 0xFFF0000000000000, !dbg !136 + %2908 = fmul float %2824, 0x3FF7154760000000, !dbg !135 + %2909 = select i1 %2857, float %2908, float 0xFFF0000000000000, !dbg !136 + %2910 = fmul float %2825, 0x3FF7154760000000, !dbg !135 + %2911 = select i1 %2859, float %2910, float 0xFFF0000000000000, !dbg !136 + %2912 = fmul float %2826, 0x3FF7154760000000, !dbg !135 + %2913 = select i1 %2857, float %2912, float 0xFFF0000000000000, !dbg !136 + %2914 = fmul float %2827, 0x3FF7154760000000, !dbg !135 + %2915 = select i1 %2859, float %2914, float 0xFFF0000000000000, !dbg !136 + %2916 = fmul float %2828, 0x3FF7154760000000, !dbg !135 + %2917 = select i1 %2861, float %2916, float 0xFFF0000000000000, !dbg !136 + %2918 = fmul float %2829, 0x3FF7154760000000, !dbg !135 + %2919 = select i1 %2863, float %2918, float 0xFFF0000000000000, !dbg !136 + %2920 = fmul float %2830, 0x3FF7154760000000, !dbg !135 + %2921 = select i1 %2861, float %2920, float 0xFFF0000000000000, !dbg !136 + %2922 = fmul float %2831, 0x3FF7154760000000, !dbg !135 + %2923 = select i1 %2863, float %2922, float 0xFFF0000000000000, !dbg !136 + %2924 = fmul float %2832, 0x3FF7154760000000, !dbg !135 + %2925 = select i1 %2865, float %2924, float 0xFFF0000000000000, !dbg !136 + %2926 = fmul float %2833, 0x3FF7154760000000, !dbg !135 + %2927 = select i1 %2867, float %2926, float 0xFFF0000000000000, !dbg !136 + %2928 = fmul float %2834, 0x3FF7154760000000, !dbg !135 + %2929 = select i1 %2865, float %2928, float 0xFFF0000000000000, !dbg !136 + %2930 = fmul float %2835, 0x3FF7154760000000, !dbg !135 + %2931 = select i1 %2867, float %2930, float 0xFFF0000000000000, !dbg !136 + %2932 = tail call float @llvm.maxnum.f32(float %2869, float %2871), !dbg !137 + %2933 = tail call float @llvm.maxnum.f32(float %2873, float %2875), !dbg !137 + %2934 = tail call float @llvm.maxnum.f32(float %2932, float %2877), !dbg !137 + %2935 = tail call float @llvm.maxnum.f32(float %2934, float %2879), !dbg !137 + %2936 = tail call float @llvm.maxnum.f32(float %2933, float %2881), !dbg !137 + %2937 = tail call float @llvm.maxnum.f32(float %2936, float %2883), !dbg !137 + %2938 = tail call float @llvm.maxnum.f32(float %2935, float %2885), !dbg !137 + %2939 = tail call float @llvm.maxnum.f32(float %2938, float %2887), !dbg !137 + %2940 = tail call float @llvm.maxnum.f32(float %2937, float %2889), !dbg !137 + %2941 = tail call float @llvm.maxnum.f32(float %2940, float %2891), !dbg !137 + %2942 = tail call float @llvm.maxnum.f32(float %2939, float %2893), !dbg !137 + %2943 = tail call float @llvm.maxnum.f32(float %2942, float %2895), !dbg !137 + %2944 = tail call float @llvm.maxnum.f32(float %2941, float %2897), !dbg !137 + %2945 = tail call float @llvm.maxnum.f32(float %2944, float %2899), !dbg !137 + %2946 = tail call float @llvm.maxnum.f32(float %2943, float %2901), !dbg !137 + %2947 = tail call float @llvm.maxnum.f32(float %2946, float %2903), !dbg !137 + %2948 = tail call float @llvm.maxnum.f32(float %2945, float %2905), !dbg !137 + %2949 = tail call float @llvm.maxnum.f32(float %2948, float %2907), !dbg !137 + %2950 = tail call float @llvm.maxnum.f32(float %2947, float %2909), !dbg !137 + %2951 = tail call float @llvm.maxnum.f32(float %2950, float %2911), !dbg !137 + %2952 = tail call float @llvm.maxnum.f32(float %2949, float %2913), !dbg !137 + %2953 = tail call float @llvm.maxnum.f32(float %2952, float %2915), !dbg !137 + %2954 = tail call float @llvm.maxnum.f32(float %2951, float %2917), !dbg !137 + %2955 = tail call float @llvm.maxnum.f32(float %2954, float %2919), !dbg !137 + %2956 = tail call float @llvm.maxnum.f32(float %2953, float %2921), !dbg !137 + %2957 = tail call float @llvm.maxnum.f32(float %2956, float %2923), !dbg !137 + %2958 = tail call float @llvm.maxnum.f32(float %2955, float %2925), !dbg !137 + %2959 = tail call float @llvm.maxnum.f32(float %2958, float %2927), !dbg !137 + %2960 = tail call float @llvm.maxnum.f32(float %2957, float %2929), !dbg !137 + %2961 = tail call float @llvm.maxnum.f32(float %2960, float %2931), !dbg !137 + %2962 = bitcast float %2959 to i32, !dbg !138 + %2963 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2962, i32 2, i32 31), !dbg !138 + %2964 = bitcast i32 %2963 to float, !dbg !138 + %2965 = bitcast float %2961 to i32, !dbg !138 + %2966 = tail call float @llvm.maxnum.f32(float %2959, float %2964), !dbg !137 + %2967 = bitcast float %2966 to i32, !dbg !138 + %2968 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2967, i32 1, i32 31), !dbg !138 + %2969 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2965, i32 2, i32 31), !dbg !138 + %2970 = bitcast i32 %2969 to float, !dbg !138 + %2971 = tail call float @llvm.maxnum.f32(float %2961, float %2970), !dbg !137 + %2972 = bitcast float %2971 to i32, !dbg !138 + %2973 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2972, i32 1, i32 31), !dbg !138 + %2974 = insertelement <2 x i32> poison, i32 %2968, i64 0, !dbg !138 + %2975 = insertelement <2 x i32> %2974, i32 %2973, i64 1, !dbg !138 + %2976 = bitcast <2 x i32> %2975 to <2 x float>, !dbg !138 + %2977 = insertelement <2 x float> poison, float %2966, i64 0, !dbg !137 + %2978 = insertelement <2 x float> %2977, float %2971, i64 1, !dbg !137 + %2979 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2978, <2 x float> %2976), !dbg !137 + %2980 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2351, <2 x float> %2979), !dbg !139 + %2981 = extractelement <2 x float> %2980, i64 0, !dbg !140 + %2982 = fcmp oeq float %2981, 0xFFF0000000000000, !dbg !141 + %2983 = extractelement <2 x float> %2980, i64 1, !dbg !140 + %2984 = fcmp oeq float %2983, 0xFFF0000000000000, !dbg !141 + %2985 = select i1 %2982, float 0.000000e+00, float %2981, !dbg !140 + %2986 = select i1 %2984, float 0.000000e+00, float %2983, !dbg !140 + %2987 = extractelement <2 x float> %2351, i64 0, !dbg !142 + %2988 = fsub float %2987, %2985, !dbg !142 + %2989 = extractelement <2 x float> %2351, i64 1, !dbg !142 + %2990 = fsub float %2989, %2986, !dbg !142 + %2991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !143 + %.not.i = icmp eq i32 %2991, 0, !dbg !143 + br i1 %.not.i, label %2994, label %2992, !dbg !143 + +2992: ; preds = %2345 + %2993 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2988) #2, !dbg !143 + br label %__nv_exp2f.exit, !dbg !143 + +2994: ; preds = %2345 + %2995 = tail call float @llvm.nvvm.ex2.approx.f(float %2988) #2, !dbg !143 + br label %__nv_exp2f.exit, !dbg !143 + +__nv_exp2f.exit: ; preds = %2992, %2994 + %.0.i = phi float [ %2993, %2992 ], [ %2995, %2994 ], !dbg !143 + %2996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !143 + %.not.i155 = icmp eq i32 %2996, 0, !dbg !143 + br i1 %.not.i155, label %2999, label %2997, !dbg !143 + +2997: ; preds = %__nv_exp2f.exit + %2998 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2990) #2, !dbg !143 + br label %__nv_exp2f.exit157, !dbg !143 + +2999: ; preds = %__nv_exp2f.exit + %3000 = tail call float @llvm.nvvm.ex2.approx.f(float %2990) #2, !dbg !143 + br label %__nv_exp2f.exit157, !dbg !143 + +__nv_exp2f.exit157: ; preds = %2997, %2999 + %.0.i156 = phi float [ %2998, %2997 ], [ %3000, %2999 ], !dbg !143 + %3001 = fsub float %2869, %2985, !dbg !144 + %3002 = fsub float %2871, %2985, !dbg !144 + %3003 = fsub float %2873, %2986, !dbg !144 + %3004 = fsub float %2875, %2986, !dbg !144 + %3005 = fsub float %2877, %2985, !dbg !144 + %3006 = fsub float %2879, %2985, !dbg !144 + %3007 = fsub float %2881, %2986, !dbg !144 + %3008 = fsub float %2883, %2986, !dbg !144 + %3009 = fsub float %2885, %2985, !dbg !144 + %3010 = fsub float %2887, %2985, !dbg !144 + %3011 = fsub float %2889, %2986, !dbg !144 + %3012 = fsub float %2891, %2986, !dbg !144 + %3013 = fsub float %2893, %2985, !dbg !144 + %3014 = fsub float %2895, %2985, !dbg !144 + %3015 = fsub float %2897, %2986, !dbg !144 + %3016 = fsub float %2899, %2986, !dbg !144 + %3017 = fsub float %2901, %2985, !dbg !144 + %3018 = fsub float %2903, %2985, !dbg !144 + %3019 = fsub float %2905, %2986, !dbg !144 + %3020 = fsub float %2907, %2986, !dbg !144 + %3021 = fsub float %2909, %2985, !dbg !144 + %3022 = fsub float %2911, %2985, !dbg !144 + %3023 = fsub float %2913, %2986, !dbg !144 + %3024 = fsub float %2915, %2986, !dbg !144 + %3025 = fsub float %2917, %2985, !dbg !144 + %3026 = fsub float %2919, %2985, !dbg !144 + %3027 = fsub float %2921, %2986, !dbg !144 + %3028 = fsub float %2923, %2986, !dbg !144 + %3029 = fsub float %2925, %2985, !dbg !144 + %3030 = fsub float %2927, %2985, !dbg !144 + %3031 = fsub float %2929, %2986, !dbg !144 + %3032 = fsub float %2931, %2986, !dbg !144 + %3033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i158 = icmp eq i32 %3033, 0, !dbg !145 + br i1 %.not.i158, label %3036, label %3034, !dbg !145 + +3034: ; preds = %__nv_exp2f.exit157 + %3035 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3001) #2, !dbg !145 + br label %__nv_exp2f.exit160, !dbg !145 + +3036: ; preds = %__nv_exp2f.exit157 + %3037 = tail call float @llvm.nvvm.ex2.approx.f(float %3001) #2, !dbg !145 + br label %__nv_exp2f.exit160, !dbg !145 + +__nv_exp2f.exit160: ; preds = %3034, %3036 + %.0.i159 = phi float [ %3035, %3034 ], [ %3037, %3036 ], !dbg !145 + %3038 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i161 = icmp eq i32 %3038, 0, !dbg !145 + br i1 %.not.i161, label %3041, label %3039, !dbg !145 + +3039: ; preds = %__nv_exp2f.exit160 + %3040 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3002) #2, !dbg !145 + br label %__nv_exp2f.exit163, !dbg !145 + +3041: ; preds = %__nv_exp2f.exit160 + %3042 = tail call float @llvm.nvvm.ex2.approx.f(float %3002) #2, !dbg !145 + br label %__nv_exp2f.exit163, !dbg !145 + +__nv_exp2f.exit163: ; preds = %3039, %3041 + %.0.i162 = phi float [ %3040, %3039 ], [ %3042, %3041 ], !dbg !145 + %3043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i164 = icmp eq i32 %3043, 0, !dbg !145 + br i1 %.not.i164, label %3046, label %3044, !dbg !145 + +3044: ; preds = %__nv_exp2f.exit163 + %3045 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3003) #2, !dbg !145 + br label %__nv_exp2f.exit166, !dbg !145 + +3046: ; preds = %__nv_exp2f.exit163 + %3047 = tail call float @llvm.nvvm.ex2.approx.f(float %3003) #2, !dbg !145 + br label %__nv_exp2f.exit166, !dbg !145 + +__nv_exp2f.exit166: ; preds = %3044, %3046 + %.0.i165 = phi float [ %3045, %3044 ], [ %3047, %3046 ], !dbg !145 + %3048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i167 = icmp eq i32 %3048, 0, !dbg !145 + br i1 %.not.i167, label %3051, label %3049, !dbg !145 + +3049: ; preds = %__nv_exp2f.exit166 + %3050 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3004) #2, !dbg !145 + br label %__nv_exp2f.exit169, !dbg !145 + +3051: ; preds = %__nv_exp2f.exit166 + %3052 = tail call float @llvm.nvvm.ex2.approx.f(float %3004) #2, !dbg !145 + br label %__nv_exp2f.exit169, !dbg !145 + +__nv_exp2f.exit169: ; preds = %3049, %3051 + %.0.i168 = phi float [ %3050, %3049 ], [ %3052, %3051 ], !dbg !145 + %3053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i170 = icmp eq i32 %3053, 0, !dbg !145 + br i1 %.not.i170, label %3056, label %3054, !dbg !145 + +3054: ; preds = %__nv_exp2f.exit169 + %3055 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3005) #2, !dbg !145 + br label %__nv_exp2f.exit172, !dbg !145 + +3056: ; preds = %__nv_exp2f.exit169 + %3057 = tail call float @llvm.nvvm.ex2.approx.f(float %3005) #2, !dbg !145 + br label %__nv_exp2f.exit172, !dbg !145 + +__nv_exp2f.exit172: ; preds = %3054, %3056 + %.0.i171 = phi float [ %3055, %3054 ], [ %3057, %3056 ], !dbg !145 + %3058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i173 = icmp eq i32 %3058, 0, !dbg !145 + br i1 %.not.i173, label %3061, label %3059, !dbg !145 + +3059: ; preds = %__nv_exp2f.exit172 + %3060 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3006) #2, !dbg !145 + br label %__nv_exp2f.exit175, !dbg !145 + +3061: ; preds = %__nv_exp2f.exit172 + %3062 = tail call float @llvm.nvvm.ex2.approx.f(float %3006) #2, !dbg !145 + br label %__nv_exp2f.exit175, !dbg !145 + +__nv_exp2f.exit175: ; preds = %3059, %3061 + %.0.i174 = phi float [ %3060, %3059 ], [ %3062, %3061 ], !dbg !145 + %3063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i176 = icmp eq i32 %3063, 0, !dbg !145 + br i1 %.not.i176, label %3066, label %3064, !dbg !145 + +3064: ; preds = %__nv_exp2f.exit175 + %3065 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3007) #2, !dbg !145 + br label %__nv_exp2f.exit178, !dbg !145 + +3066: ; preds = %__nv_exp2f.exit175 + %3067 = tail call float @llvm.nvvm.ex2.approx.f(float %3007) #2, !dbg !145 + br label %__nv_exp2f.exit178, !dbg !145 + +__nv_exp2f.exit178: ; preds = %3064, %3066 + %.0.i177 = phi float [ %3065, %3064 ], [ %3067, %3066 ], !dbg !145 + %3068 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i179 = icmp eq i32 %3068, 0, !dbg !145 + br i1 %.not.i179, label %3071, label %3069, !dbg !145 + +3069: ; preds = %__nv_exp2f.exit178 + %3070 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3008) #2, !dbg !145 + br label %__nv_exp2f.exit181, !dbg !145 + +3071: ; preds = %__nv_exp2f.exit178 + %3072 = tail call float @llvm.nvvm.ex2.approx.f(float %3008) #2, !dbg !145 + br label %__nv_exp2f.exit181, !dbg !145 + +__nv_exp2f.exit181: ; preds = %3069, %3071 + %.0.i180 = phi float [ %3070, %3069 ], [ %3072, %3071 ], !dbg !145 + %3073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i182 = icmp eq i32 %3073, 0, !dbg !145 + br i1 %.not.i182, label %3076, label %3074, !dbg !145 + +3074: ; preds = %__nv_exp2f.exit181 + %3075 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3009) #2, !dbg !145 + br label %__nv_exp2f.exit184, !dbg !145 + +3076: ; preds = %__nv_exp2f.exit181 + %3077 = tail call float @llvm.nvvm.ex2.approx.f(float %3009) #2, !dbg !145 + br label %__nv_exp2f.exit184, !dbg !145 + +__nv_exp2f.exit184: ; preds = %3074, %3076 + %.0.i183 = phi float [ %3075, %3074 ], [ %3077, %3076 ], !dbg !145 + %3078 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i185 = icmp eq i32 %3078, 0, !dbg !145 + br i1 %.not.i185, label %3081, label %3079, !dbg !145 + +3079: ; preds = %__nv_exp2f.exit184 + %3080 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3010) #2, !dbg !145 + br label %__nv_exp2f.exit187, !dbg !145 + +3081: ; preds = %__nv_exp2f.exit184 + %3082 = tail call float @llvm.nvvm.ex2.approx.f(float %3010) #2, !dbg !145 + br label %__nv_exp2f.exit187, !dbg !145 + +__nv_exp2f.exit187: ; preds = %3079, %3081 + %.0.i186 = phi float [ %3080, %3079 ], [ %3082, %3081 ], !dbg !145 + %3083 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i188 = icmp eq i32 %3083, 0, !dbg !145 + br i1 %.not.i188, label %3086, label %3084, !dbg !145 + +3084: ; preds = %__nv_exp2f.exit187 + %3085 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3011) #2, !dbg !145 + br label %__nv_exp2f.exit190, !dbg !145 + +3086: ; preds = %__nv_exp2f.exit187 + %3087 = tail call float @llvm.nvvm.ex2.approx.f(float %3011) #2, !dbg !145 + br label %__nv_exp2f.exit190, !dbg !145 + +__nv_exp2f.exit190: ; preds = %3084, %3086 + %.0.i189 = phi float [ %3085, %3084 ], [ %3087, %3086 ], !dbg !145 + %3088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i191 = icmp eq i32 %3088, 0, !dbg !145 + br i1 %.not.i191, label %3091, label %3089, !dbg !145 + +3089: ; preds = %__nv_exp2f.exit190 + %3090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3012) #2, !dbg !145 + br label %__nv_exp2f.exit193, !dbg !145 + +3091: ; preds = %__nv_exp2f.exit190 + %3092 = tail call float @llvm.nvvm.ex2.approx.f(float %3012) #2, !dbg !145 + br label %__nv_exp2f.exit193, !dbg !145 + +__nv_exp2f.exit193: ; preds = %3089, %3091 + %.0.i192 = phi float [ %3090, %3089 ], [ %3092, %3091 ], !dbg !145 + %3093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i194 = icmp eq i32 %3093, 0, !dbg !145 + br i1 %.not.i194, label %3096, label %3094, !dbg !145 + +3094: ; preds = %__nv_exp2f.exit193 + %3095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3013) #2, !dbg !145 + br label %__nv_exp2f.exit196, !dbg !145 + +3096: ; preds = %__nv_exp2f.exit193 + %3097 = tail call float @llvm.nvvm.ex2.approx.f(float %3013) #2, !dbg !145 + br label %__nv_exp2f.exit196, !dbg !145 + +__nv_exp2f.exit196: ; preds = %3094, %3096 + %.0.i195 = phi float [ %3095, %3094 ], [ %3097, %3096 ], !dbg !145 + %3098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i197 = icmp eq i32 %3098, 0, !dbg !145 + br i1 %.not.i197, label %3101, label %3099, !dbg !145 + +3099: ; preds = %__nv_exp2f.exit196 + %3100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3014) #2, !dbg !145 + br label %__nv_exp2f.exit199, !dbg !145 + +3101: ; preds = %__nv_exp2f.exit196 + %3102 = tail call float @llvm.nvvm.ex2.approx.f(float %3014) #2, !dbg !145 + br label %__nv_exp2f.exit199, !dbg !145 + +__nv_exp2f.exit199: ; preds = %3099, %3101 + %.0.i198 = phi float [ %3100, %3099 ], [ %3102, %3101 ], !dbg !145 + %3103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i200 = icmp eq i32 %3103, 0, !dbg !145 + br i1 %.not.i200, label %3106, label %3104, !dbg !145 + +3104: ; preds = %__nv_exp2f.exit199 + %3105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3015) #2, !dbg !145 + br label %__nv_exp2f.exit202, !dbg !145 + +3106: ; preds = %__nv_exp2f.exit199 + %3107 = tail call float @llvm.nvvm.ex2.approx.f(float %3015) #2, !dbg !145 + br label %__nv_exp2f.exit202, !dbg !145 + +__nv_exp2f.exit202: ; preds = %3104, %3106 + %.0.i201 = phi float [ %3105, %3104 ], [ %3107, %3106 ], !dbg !145 + %3108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i203 = icmp eq i32 %3108, 0, !dbg !145 + br i1 %.not.i203, label %3111, label %3109, !dbg !145 + +3109: ; preds = %__nv_exp2f.exit202 + %3110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3016) #2, !dbg !145 + br label %__nv_exp2f.exit205, !dbg !145 + +3111: ; preds = %__nv_exp2f.exit202 + %3112 = tail call float @llvm.nvvm.ex2.approx.f(float %3016) #2, !dbg !145 + br label %__nv_exp2f.exit205, !dbg !145 + +__nv_exp2f.exit205: ; preds = %3109, %3111 + %.0.i204 = phi float [ %3110, %3109 ], [ %3112, %3111 ], !dbg !145 + %3113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i206 = icmp eq i32 %3113, 0, !dbg !145 + br i1 %.not.i206, label %3116, label %3114, !dbg !145 + +3114: ; preds = %__nv_exp2f.exit205 + %3115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3017) #2, !dbg !145 + br label %__nv_exp2f.exit208, !dbg !145 + +3116: ; preds = %__nv_exp2f.exit205 + %3117 = tail call float @llvm.nvvm.ex2.approx.f(float %3017) #2, !dbg !145 + br label %__nv_exp2f.exit208, !dbg !145 + +__nv_exp2f.exit208: ; preds = %3114, %3116 + %.0.i207 = phi float [ %3115, %3114 ], [ %3117, %3116 ], !dbg !145 + %3118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i209 = icmp eq i32 %3118, 0, !dbg !145 + br i1 %.not.i209, label %3121, label %3119, !dbg !145 + +3119: ; preds = %__nv_exp2f.exit208 + %3120 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3018) #2, !dbg !145 + br label %__nv_exp2f.exit211, !dbg !145 + +3121: ; preds = %__nv_exp2f.exit208 + %3122 = tail call float @llvm.nvvm.ex2.approx.f(float %3018) #2, !dbg !145 + br label %__nv_exp2f.exit211, !dbg !145 + +__nv_exp2f.exit211: ; preds = %3119, %3121 + %.0.i210 = phi float [ %3120, %3119 ], [ %3122, %3121 ], !dbg !145 + %3123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i212 = icmp eq i32 %3123, 0, !dbg !145 + br i1 %.not.i212, label %3126, label %3124, !dbg !145 + +3124: ; preds = %__nv_exp2f.exit211 + %3125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3019) #2, !dbg !145 + br label %__nv_exp2f.exit214, !dbg !145 + +3126: ; preds = %__nv_exp2f.exit211 + %3127 = tail call float @llvm.nvvm.ex2.approx.f(float %3019) #2, !dbg !145 + br label %__nv_exp2f.exit214, !dbg !145 + +__nv_exp2f.exit214: ; preds = %3124, %3126 + %.0.i213 = phi float [ %3125, %3124 ], [ %3127, %3126 ], !dbg !145 + %3128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i215 = icmp eq i32 %3128, 0, !dbg !145 + br i1 %.not.i215, label %3131, label %3129, !dbg !145 + +3129: ; preds = %__nv_exp2f.exit214 + %3130 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3020) #2, !dbg !145 + br label %__nv_exp2f.exit217, !dbg !145 + +3131: ; preds = %__nv_exp2f.exit214 + %3132 = tail call float @llvm.nvvm.ex2.approx.f(float %3020) #2, !dbg !145 + br label %__nv_exp2f.exit217, !dbg !145 + +__nv_exp2f.exit217: ; preds = %3129, %3131 + %.0.i216 = phi float [ %3130, %3129 ], [ %3132, %3131 ], !dbg !145 + %3133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i218 = icmp eq i32 %3133, 0, !dbg !145 + br i1 %.not.i218, label %3136, label %3134, !dbg !145 + +3134: ; preds = %__nv_exp2f.exit217 + %3135 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3021) #2, !dbg !145 + br label %__nv_exp2f.exit220, !dbg !145 + +3136: ; preds = %__nv_exp2f.exit217 + %3137 = tail call float @llvm.nvvm.ex2.approx.f(float %3021) #2, !dbg !145 + br label %__nv_exp2f.exit220, !dbg !145 + +__nv_exp2f.exit220: ; preds = %3134, %3136 + %.0.i219 = phi float [ %3135, %3134 ], [ %3137, %3136 ], !dbg !145 + %3138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i221 = icmp eq i32 %3138, 0, !dbg !145 + br i1 %.not.i221, label %3141, label %3139, !dbg !145 + +3139: ; preds = %__nv_exp2f.exit220 + %3140 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3022) #2, !dbg !145 + br label %__nv_exp2f.exit223, !dbg !145 + +3141: ; preds = %__nv_exp2f.exit220 + %3142 = tail call float @llvm.nvvm.ex2.approx.f(float %3022) #2, !dbg !145 + br label %__nv_exp2f.exit223, !dbg !145 + +__nv_exp2f.exit223: ; preds = %3139, %3141 + %.0.i222 = phi float [ %3140, %3139 ], [ %3142, %3141 ], !dbg !145 + %3143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i224 = icmp eq i32 %3143, 0, !dbg !145 + br i1 %.not.i224, label %3146, label %3144, !dbg !145 + +3144: ; preds = %__nv_exp2f.exit223 + %3145 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3023) #2, !dbg !145 + br label %__nv_exp2f.exit226, !dbg !145 + +3146: ; preds = %__nv_exp2f.exit223 + %3147 = tail call float @llvm.nvvm.ex2.approx.f(float %3023) #2, !dbg !145 + br label %__nv_exp2f.exit226, !dbg !145 + +__nv_exp2f.exit226: ; preds = %3144, %3146 + %.0.i225 = phi float [ %3145, %3144 ], [ %3147, %3146 ], !dbg !145 + %3148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i227 = icmp eq i32 %3148, 0, !dbg !145 + br i1 %.not.i227, label %3151, label %3149, !dbg !145 + +3149: ; preds = %__nv_exp2f.exit226 + %3150 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3024) #2, !dbg !145 + br label %__nv_exp2f.exit229, !dbg !145 + +3151: ; preds = %__nv_exp2f.exit226 + %3152 = tail call float @llvm.nvvm.ex2.approx.f(float %3024) #2, !dbg !145 + br label %__nv_exp2f.exit229, !dbg !145 + +__nv_exp2f.exit229: ; preds = %3149, %3151 + %.0.i228 = phi float [ %3150, %3149 ], [ %3152, %3151 ], !dbg !145 + %3153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i230 = icmp eq i32 %3153, 0, !dbg !145 + br i1 %.not.i230, label %3156, label %3154, !dbg !145 + +3154: ; preds = %__nv_exp2f.exit229 + %3155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3025) #2, !dbg !145 + br label %__nv_exp2f.exit232, !dbg !145 + +3156: ; preds = %__nv_exp2f.exit229 + %3157 = tail call float @llvm.nvvm.ex2.approx.f(float %3025) #2, !dbg !145 + br label %__nv_exp2f.exit232, !dbg !145 + +__nv_exp2f.exit232: ; preds = %3154, %3156 + %.0.i231 = phi float [ %3155, %3154 ], [ %3157, %3156 ], !dbg !145 + %3158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i233 = icmp eq i32 %3158, 0, !dbg !145 + br i1 %.not.i233, label %3161, label %3159, !dbg !145 + +3159: ; preds = %__nv_exp2f.exit232 + %3160 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3026) #2, !dbg !145 + br label %__nv_exp2f.exit235, !dbg !145 + +3161: ; preds = %__nv_exp2f.exit232 + %3162 = tail call float @llvm.nvvm.ex2.approx.f(float %3026) #2, !dbg !145 + br label %__nv_exp2f.exit235, !dbg !145 + +__nv_exp2f.exit235: ; preds = %3159, %3161 + %.0.i234 = phi float [ %3160, %3159 ], [ %3162, %3161 ], !dbg !145 + %3163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i236 = icmp eq i32 %3163, 0, !dbg !145 + br i1 %.not.i236, label %3166, label %3164, !dbg !145 + +3164: ; preds = %__nv_exp2f.exit235 + %3165 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3027) #2, !dbg !145 + br label %__nv_exp2f.exit238, !dbg !145 + +3166: ; preds = %__nv_exp2f.exit235 + %3167 = tail call float @llvm.nvvm.ex2.approx.f(float %3027) #2, !dbg !145 + br label %__nv_exp2f.exit238, !dbg !145 + +__nv_exp2f.exit238: ; preds = %3164, %3166 + %.0.i237 = phi float [ %3165, %3164 ], [ %3167, %3166 ], !dbg !145 + %3168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i239 = icmp eq i32 %3168, 0, !dbg !145 + br i1 %.not.i239, label %3171, label %3169, !dbg !145 + +3169: ; preds = %__nv_exp2f.exit238 + %3170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3028) #2, !dbg !145 + br label %__nv_exp2f.exit241, !dbg !145 + +3171: ; preds = %__nv_exp2f.exit238 + %3172 = tail call float @llvm.nvvm.ex2.approx.f(float %3028) #2, !dbg !145 + br label %__nv_exp2f.exit241, !dbg !145 + +__nv_exp2f.exit241: ; preds = %3169, %3171 + %.0.i240 = phi float [ %3170, %3169 ], [ %3172, %3171 ], !dbg !145 + %3173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i242 = icmp eq i32 %3173, 0, !dbg !145 + br i1 %.not.i242, label %3176, label %3174, !dbg !145 + +3174: ; preds = %__nv_exp2f.exit241 + %3175 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3029) #2, !dbg !145 + br label %__nv_exp2f.exit244, !dbg !145 + +3176: ; preds = %__nv_exp2f.exit241 + %3177 = tail call float @llvm.nvvm.ex2.approx.f(float %3029) #2, !dbg !145 + br label %__nv_exp2f.exit244, !dbg !145 + +__nv_exp2f.exit244: ; preds = %3174, %3176 + %.0.i243 = phi float [ %3175, %3174 ], [ %3177, %3176 ], !dbg !145 + %3178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i245 = icmp eq i32 %3178, 0, !dbg !145 + br i1 %.not.i245, label %3181, label %3179, !dbg !145 + +3179: ; preds = %__nv_exp2f.exit244 + %3180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3030) #2, !dbg !145 + br label %__nv_exp2f.exit247, !dbg !145 + +3181: ; preds = %__nv_exp2f.exit244 + %3182 = tail call float @llvm.nvvm.ex2.approx.f(float %3030) #2, !dbg !145 + br label %__nv_exp2f.exit247, !dbg !145 + +__nv_exp2f.exit247: ; preds = %3179, %3181 + %.0.i246 = phi float [ %3180, %3179 ], [ %3182, %3181 ], !dbg !145 + %3183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i248 = icmp eq i32 %3183, 0, !dbg !145 + br i1 %.not.i248, label %3186, label %3184, !dbg !145 + +3184: ; preds = %__nv_exp2f.exit247 + %3185 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3031) #2, !dbg !145 + br label %__nv_exp2f.exit250, !dbg !145 + +3186: ; preds = %__nv_exp2f.exit247 + %3187 = tail call float @llvm.nvvm.ex2.approx.f(float %3031) #2, !dbg !145 + br label %__nv_exp2f.exit250, !dbg !145 + +__nv_exp2f.exit250: ; preds = %3184, %3186 + %.0.i249 = phi float [ %3185, %3184 ], [ %3187, %3186 ], !dbg !145 + %3188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i251 = icmp eq i32 %3188, 0, !dbg !145 + br i1 %.not.i251, label %3191, label %3189, !dbg !145 + +3189: ; preds = %__nv_exp2f.exit250 + %3190 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3032) #2, !dbg !145 + br label %__nv_exp2f.exit253, !dbg !145 + +3191: ; preds = %__nv_exp2f.exit250 + %3192 = tail call float @llvm.nvvm.ex2.approx.f(float %3032) #2, !dbg !145 + br label %__nv_exp2f.exit253, !dbg !145 + +__nv_exp2f.exit253: ; preds = %3189, %3191 + %.0.i252 = phi float [ %3190, %3189 ], [ %3192, %3191 ], !dbg !145 + %3193 = fmul float %.pn546, %.0.i, !dbg !146 + %3194 = fmul float %.pn544, %.0.i156, !dbg !146 + %3195 = fadd float %.0.i159, %.0.i162, !dbg !147 + %3196 = fadd float %.0.i165, %.0.i168, !dbg !147 + %3197 = fadd float %3195, %.0.i171, !dbg !147 + %3198 = fadd float %3197, %.0.i174, !dbg !147 + %3199 = fadd float %3196, %.0.i177, !dbg !147 + %3200 = fadd float %3199, %.0.i180, !dbg !147 + %3201 = fadd float %3198, %.0.i183, !dbg !147 + %3202 = fadd float %3201, %.0.i186, !dbg !147 + %3203 = fadd float %3200, %.0.i189, !dbg !147 + %3204 = fadd float %3203, %.0.i192, !dbg !147 + %3205 = fadd float %3202, %.0.i195, !dbg !147 + %3206 = fadd float %3205, %.0.i198, !dbg !147 + %3207 = fadd float %3204, %.0.i201, !dbg !147 + %3208 = fadd float %3207, %.0.i204, !dbg !147 + %3209 = fadd float %3206, %.0.i207, !dbg !147 + %3210 = fadd float %3209, %.0.i210, !dbg !147 + %3211 = fadd float %3208, %.0.i213, !dbg !147 + %3212 = fadd float %3211, %.0.i216, !dbg !147 + %3213 = fadd float %3210, %.0.i219, !dbg !147 + %3214 = fadd float %3213, %.0.i222, !dbg !147 + %3215 = fadd float %3212, %.0.i225, !dbg !147 + %3216 = fadd float %3215, %.0.i228, !dbg !147 + %3217 = fadd float %3214, %.0.i231, !dbg !147 + %3218 = fadd float %3217, %.0.i234, !dbg !147 + %3219 = fadd float %3216, %.0.i237, !dbg !147 + %3220 = fadd float %3219, %.0.i240, !dbg !147 + %3221 = fadd float %3218, %.0.i243, !dbg !147 + %3222 = fadd float %3221, %.0.i246, !dbg !147 + %3223 = fadd float %3220, %.0.i249, !dbg !147 + %3224 = fadd float %3223, %.0.i252, !dbg !147 + %3225 = bitcast float %3222 to i32, !dbg !148 + %3226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3225, i32 2, i32 31), !dbg !148 + %3227 = bitcast i32 %3226 to float, !dbg !148 + %3228 = fadd float %3222, %3227, !dbg !147 + %3229 = bitcast float %3228 to i32, !dbg !148 + %3230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3229, i32 1, i32 31), !dbg !148 + %3231 = bitcast i32 %3230 to float, !dbg !148 + %3232 = fadd float %3228, %3231, !dbg !147 + %3233 = bitcast float %3224 to i32, !dbg !148 + %3234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3233, i32 2, i32 31), !dbg !148 + %3235 = bitcast i32 %3234 to float, !dbg !148 + %3236 = fadd float %3224, %3235, !dbg !147 + %3237 = bitcast float %3236 to i32, !dbg !148 + %3238 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3237, i32 1, i32 31), !dbg !148 + %3239 = bitcast i32 %3238 to float, !dbg !148 + %3240 = fadd float %3236, %3239, !dbg !147 + %3241 = fadd float %3193, %3232, !dbg !149 + %3242 = fadd float %3194, %3240, !dbg !149 + %3243 = fmul float %2740, %.0.i, !dbg !150 + %3244 = fmul float %2741, %.0.i, !dbg !150 + %3245 = fmul float %2742, %.0.i156, !dbg !150 + %3246 = fmul float %2743, %.0.i156, !dbg !150 + %3247 = fmul float %2744, %.0.i, !dbg !150 + %3248 = fmul float %2745, %.0.i, !dbg !150 + %3249 = fmul float %2746, %.0.i156, !dbg !150 + %3250 = fmul float %2747, %.0.i156, !dbg !150 + %3251 = fmul float %2748, %.0.i, !dbg !150 + %3252 = fmul float %2749, %.0.i, !dbg !150 + %3253 = fmul float %2750, %.0.i156, !dbg !150 + %3254 = fmul float %2751, %.0.i156, !dbg !150 + %3255 = fmul float %2752, %.0.i, !dbg !150 + %3256 = fmul float %2753, %.0.i, !dbg !150 + %3257 = fmul float %2754, %.0.i156, !dbg !150 + %3258 = fmul float %2755, %.0.i156, !dbg !150 + %3259 = fmul float %2756, %.0.i, !dbg !150 + %3260 = fmul float %2757, %.0.i, !dbg !150 + %3261 = fmul float %2758, %.0.i156, !dbg !150 + %3262 = fmul float %2759, %.0.i156, !dbg !150 + %3263 = fmul float %2760, %.0.i, !dbg !150 + %3264 = fmul float %2761, %.0.i, !dbg !150 + %3265 = fmul float %2762, %.0.i156, !dbg !150 + %3266 = fmul float %2763, %.0.i156, !dbg !150 + %3267 = fmul float %2764, %.0.i, !dbg !150 + %3268 = fmul float %2765, %.0.i, !dbg !150 + %3269 = fmul float %2766, %.0.i156, !dbg !150 + %3270 = fmul float %2767, %.0.i156, !dbg !150 + %3271 = fmul float %2768, %.0.i, !dbg !150 + %3272 = fmul float %2769, %.0.i, !dbg !150 + %3273 = fmul float %2770, %.0.i156, !dbg !150 + %3274 = fmul float %2771, %.0.i156, !dbg !150 + %3275 = fmul float %2772, %.0.i, !dbg !150 + %3276 = fmul float %2773, %.0.i, !dbg !150 + %3277 = fmul float %2774, %.0.i156, !dbg !150 + %3278 = fmul float %2775, %.0.i156, !dbg !150 + %3279 = fmul float %2776, %.0.i, !dbg !150 + %3280 = fmul float %2777, %.0.i, !dbg !150 + %3281 = fmul float %2778, %.0.i156, !dbg !150 + %3282 = fmul float %2779, %.0.i156, !dbg !150 + %3283 = fmul float %2780, %.0.i, !dbg !150 + %3284 = fmul float %2781, %.0.i, !dbg !150 + %3285 = fmul float %2782, %.0.i156, !dbg !150 + %3286 = fmul float %2783, %.0.i156, !dbg !150 + %3287 = fmul float %2784, %.0.i, !dbg !150 + %3288 = fmul float %2785, %.0.i, !dbg !150 + %3289 = fmul float %2786, %.0.i156, !dbg !150 + %3290 = fmul float %2787, %.0.i156, !dbg !150 + %3291 = fmul float %2788, %.0.i, !dbg !150 + %3292 = fmul float %2789, %.0.i, !dbg !150 + %3293 = fmul float %2790, %.0.i156, !dbg !150 + %3294 = fmul float %2791, %.0.i156, !dbg !150 + %3295 = fmul float %2792, %.0.i, !dbg !150 + %3296 = fmul float %2793, %.0.i, !dbg !150 + %3297 = fmul float %2794, %.0.i156, !dbg !150 + %3298 = fmul float %2795, %.0.i156, !dbg !150 + %3299 = fmul float %2796, %.0.i, !dbg !150 + %3300 = fmul float %2797, %.0.i, !dbg !150 + %3301 = fmul float %2798, %.0.i156, !dbg !150 + %3302 = fmul float %2799, %.0.i156, !dbg !150 + %3303 = fmul float %2800, %.0.i, !dbg !150 + %3304 = fmul float %2801, %.0.i, !dbg !150 + %3305 = fmul float %2802, %.0.i156, !dbg !150 + %3306 = fmul float %2803, %.0.i156, !dbg !150 + %3307 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2358, !dbg !129 + %3308 = insertelement <2 x float> poison, float %.0.i159, i64 0, !dbg !151 + %3309 = insertelement <2 x float> %3308, float %.0.i162, i64 1, !dbg !151 + %3310 = fptrunc <2 x float> %3309 to <2 x bfloat>, !dbg !151 + %3311 = insertelement <2 x float> poison, float %.0.i165, i64 0, !dbg !151 + %3312 = insertelement <2 x float> %3311, float %.0.i168, i64 1, !dbg !151 + %3313 = fptrunc <2 x float> %3312 to <2 x bfloat>, !dbg !151 + %3314 = insertelement <2 x float> poison, float %.0.i171, i64 0, !dbg !151 + %3315 = insertelement <2 x float> %3314, float %.0.i174, i64 1, !dbg !151 + %3316 = fptrunc <2 x float> %3315 to <2 x bfloat>, !dbg !151 + %3317 = insertelement <2 x float> poison, float %.0.i177, i64 0, !dbg !151 + %3318 = insertelement <2 x float> %3317, float %.0.i180, i64 1, !dbg !151 + %3319 = fptrunc <2 x float> %3318 to <2 x bfloat>, !dbg !151 + %3320 = insertelement <2 x float> poison, float %.0.i183, i64 0, !dbg !151 + %3321 = insertelement <2 x float> %3320, float %.0.i186, i64 1, !dbg !151 + %3322 = fptrunc <2 x float> %3321 to <2 x bfloat>, !dbg !151 + %3323 = insertelement <2 x float> poison, float %.0.i189, i64 0, !dbg !151 + %3324 = insertelement <2 x float> %3323, float %.0.i192, i64 1, !dbg !151 + %3325 = fptrunc <2 x float> %3324 to <2 x bfloat>, !dbg !151 + %3326 = insertelement <2 x float> poison, float %.0.i195, i64 0, !dbg !151 + %3327 = insertelement <2 x float> %3326, float %.0.i198, i64 1, !dbg !151 + %3328 = fptrunc <2 x float> %3327 to <2 x bfloat>, !dbg !151 + %3329 = insertelement <2 x float> poison, float %.0.i201, i64 0, !dbg !151 + %3330 = insertelement <2 x float> %3329, float %.0.i204, i64 1, !dbg !151 + %3331 = fptrunc <2 x float> %3330 to <2 x bfloat>, !dbg !151 + %3332 = insertelement <2 x float> poison, float %.0.i207, i64 0, !dbg !151 + %3333 = insertelement <2 x float> %3332, float %.0.i210, i64 1, !dbg !151 + %3334 = fptrunc <2 x float> %3333 to <2 x bfloat>, !dbg !151 + %3335 = insertelement <2 x float> poison, float %.0.i213, i64 0, !dbg !151 + %3336 = insertelement <2 x float> %3335, float %.0.i216, i64 1, !dbg !151 + %3337 = fptrunc <2 x float> %3336 to <2 x bfloat>, !dbg !151 + %3338 = insertelement <2 x float> poison, float %.0.i219, i64 0, !dbg !151 + %3339 = insertelement <2 x float> %3338, float %.0.i222, i64 1, !dbg !151 + %3340 = fptrunc <2 x float> %3339 to <2 x bfloat>, !dbg !151 + %3341 = insertelement <2 x float> poison, float %.0.i225, i64 0, !dbg !151 + %3342 = insertelement <2 x float> %3341, float %.0.i228, i64 1, !dbg !151 + %3343 = fptrunc <2 x float> %3342 to <2 x bfloat>, !dbg !151 + %3344 = insertelement <2 x float> poison, float %.0.i231, i64 0, !dbg !151 + %3345 = insertelement <2 x float> %3344, float %.0.i234, i64 1, !dbg !151 + %3346 = fptrunc <2 x float> %3345 to <2 x bfloat>, !dbg !151 + %3347 = insertelement <2 x float> poison, float %.0.i237, i64 0, !dbg !151 + %3348 = insertelement <2 x float> %3347, float %.0.i240, i64 1, !dbg !151 + %3349 = fptrunc <2 x float> %3348 to <2 x bfloat>, !dbg !151 + %3350 = insertelement <2 x float> poison, float %.0.i243, i64 0, !dbg !151 + %3351 = insertelement <2 x float> %3350, float %.0.i246, i64 1, !dbg !151 + %3352 = fptrunc <2 x float> %3351 to <2 x bfloat>, !dbg !151 + %3353 = insertelement <2 x float> poison, float %.0.i249, i64 0, !dbg !151 + %3354 = insertelement <2 x float> %3353, float %.0.i252, i64 1, !dbg !151 + %3355 = fptrunc <2 x float> %3354 to <2 x bfloat>, !dbg !151 + %3356 = bitcast <2 x bfloat> %3310 to i32, !dbg !152 + %3357 = bitcast <2 x bfloat> %3313 to i32, !dbg !152 + %3358 = bitcast <2 x bfloat> %3316 to i32, !dbg !152 + %3359 = bitcast <2 x bfloat> %3319 to i32, !dbg !152 + %3360 = bitcast <2 x bfloat> %3322 to i32, !dbg !152 + %3361 = bitcast <2 x bfloat> %3325 to i32, !dbg !152 + %3362 = bitcast <2 x bfloat> %3328 to i32, !dbg !152 + %3363 = bitcast <2 x bfloat> %3331 to i32, !dbg !152 + %3364 = bitcast <2 x bfloat> %3334 to i32, !dbg !152 + %3365 = bitcast <2 x bfloat> %3337 to i32, !dbg !152 + %3366 = bitcast <2 x bfloat> %3340 to i32, !dbg !152 + %3367 = bitcast <2 x bfloat> %3343 to i32, !dbg !152 + %3368 = bitcast <2 x bfloat> %3346 to i32, !dbg !152 + %3369 = bitcast <2 x bfloat> %3349 to i32, !dbg !152 + %3370 = bitcast <2 x bfloat> %3352 to i32, !dbg !152 + %3371 = bitcast <2 x bfloat> %3355 to i32, !dbg !152 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !152 + %3372 = ptrtoint ptr addrspace(3) %3307 to i32, !dbg !152 + %3373 = lshr exact i32 %3372, 4, !dbg !152 + %3374 = and i32 %3373, 16383, !dbg !152 + %3375 = zext nneg i32 %3374 to i64, !dbg !152 + %3376 = or disjoint i64 %3375, 4611686293338849280, !dbg !152 + %3377 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, float %3269, float %3270, float %3271, float %3272, float %3273, float %3274, float %3275, float %3276, float %3277, float %3278, float %3279, float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, float %3301, float %3302, float %3303, float %3304, float %3305, float %3306, i32 %3356, i32 %3357, i32 %3358, i32 %3359, i64 %3376, i1 true) #2, !dbg !152 + %3378 = add i32 %3372, 2048, !dbg !152 + %3379 = lshr exact i32 %3378, 4, !dbg !152 + %3380 = and i32 %3379, 16383, !dbg !152 + %3381 = zext nneg i32 %3380 to i64, !dbg !152 + %3382 = or disjoint i64 %3381, 4611686293338849280, !dbg !152 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 0, !dbg !152 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 1, !dbg !152 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 2, !dbg !152 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 3, !dbg !152 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 4, !dbg !152 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 5, !dbg !152 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 6, !dbg !152 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 7, !dbg !152 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 8, !dbg !152 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 9, !dbg !152 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 10, !dbg !152 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 11, !dbg !152 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 12, !dbg !152 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 13, !dbg !152 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 14, !dbg !152 + %3398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 15, !dbg !152 + %3399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 16, !dbg !152 + %3400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 17, !dbg !152 + %3401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 18, !dbg !152 + %3402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 19, !dbg !152 + %3403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 20, !dbg !152 + %3404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 21, !dbg !152 + %3405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 22, !dbg !152 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 23, !dbg !152 + %3407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 24, !dbg !152 + %3408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 25, !dbg !152 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 26, !dbg !152 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 27, !dbg !152 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 28, !dbg !152 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 29, !dbg !152 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 30, !dbg !152 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 31, !dbg !152 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 32, !dbg !152 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 33, !dbg !152 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 34, !dbg !152 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 35, !dbg !152 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 36, !dbg !152 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 37, !dbg !152 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 38, !dbg !152 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 39, !dbg !152 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 40, !dbg !152 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 41, !dbg !152 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 42, !dbg !152 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 43, !dbg !152 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 44, !dbg !152 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 45, !dbg !152 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 46, !dbg !152 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 47, !dbg !152 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 48, !dbg !152 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 49, !dbg !152 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 50, !dbg !152 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 51, !dbg !152 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 52, !dbg !152 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 53, !dbg !152 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 54, !dbg !152 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 55, !dbg !152 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 56, !dbg !152 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 57, !dbg !152 + %3441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 58, !dbg !152 + %3442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 59, !dbg !152 + %3443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 60, !dbg !152 + %3444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 61, !dbg !152 + %3445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 62, !dbg !152 + %3446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 63, !dbg !152 + %3447 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3383, float %3384, float %3385, float %3386, float %3387, float %3388, float %3389, float %3390, float %3391, float %3392, float %3393, float %3394, float %3395, float %3396, float %3397, float %3398, float %3399, float %3400, float %3401, float %3402, float %3403, float %3404, float %3405, float %3406, float %3407, float %3408, float %3409, float %3410, float %3411, float %3412, float %3413, float %3414, float %3415, float %3416, float %3417, float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, float %3441, float %3442, float %3443, float %3444, float %3445, float %3446, i32 %3360, i32 %3361, i32 %3362, i32 %3363, i64 %3382, i1 true) #2, !dbg !152 + %3448 = add i32 %3372, 4096, !dbg !152 + %3449 = lshr exact i32 %3448, 4, !dbg !152 + %3450 = and i32 %3449, 16383, !dbg !152 + %3451 = zext nneg i32 %3450 to i64, !dbg !152 + %3452 = or disjoint i64 %3451, 4611686293338849280, !dbg !152 + %3453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 0, !dbg !152 + %3454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 1, !dbg !152 + %3455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 2, !dbg !152 + %3456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 3, !dbg !152 + %3457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 4, !dbg !152 + %3458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 5, !dbg !152 + %3459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 6, !dbg !152 + %3460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 7, !dbg !152 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 8, !dbg !152 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 9, !dbg !152 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 10, !dbg !152 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 11, !dbg !152 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 12, !dbg !152 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 13, !dbg !152 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 14, !dbg !152 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 15, !dbg !152 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 16, !dbg !152 + %3470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 17, !dbg !152 + %3471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 18, !dbg !152 + %3472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 19, !dbg !152 + %3473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 20, !dbg !152 + %3474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 21, !dbg !152 + %3475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 22, !dbg !152 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 23, !dbg !152 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 24, !dbg !152 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 25, !dbg !152 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 26, !dbg !152 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 27, !dbg !152 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 28, !dbg !152 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 29, !dbg !152 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 30, !dbg !152 + %3484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 31, !dbg !152 + %3485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 32, !dbg !152 + %3486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 33, !dbg !152 + %3487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 34, !dbg !152 + %3488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 35, !dbg !152 + %3489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 36, !dbg !152 + %3490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 37, !dbg !152 + %3491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 38, !dbg !152 + %3492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 39, !dbg !152 + %3493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 40, !dbg !152 + %3494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 41, !dbg !152 + %3495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 42, !dbg !152 + %3496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 43, !dbg !152 + %3497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 44, !dbg !152 + %3498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 45, !dbg !152 + %3499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 46, !dbg !152 + %3500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 47, !dbg !152 + %3501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 48, !dbg !152 + %3502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 49, !dbg !152 + %3503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 50, !dbg !152 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 51, !dbg !152 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 52, !dbg !152 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 53, !dbg !152 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 54, !dbg !152 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 55, !dbg !152 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 56, !dbg !152 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 57, !dbg !152 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 58, !dbg !152 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 59, !dbg !152 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 60, !dbg !152 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 61, !dbg !152 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 62, !dbg !152 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 63, !dbg !152 + %3517 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3453, float %3454, float %3455, float %3456, float %3457, float %3458, float %3459, float %3460, float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, float %3470, float %3471, float %3472, float %3473, float %3474, float %3475, float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, float %3484, float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, float %3493, float %3494, float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, i32 %3364, i32 %3365, i32 %3366, i32 %3367, i64 %3452, i1 true) #2, !dbg !152 + %3518 = add i32 %3372, 6144, !dbg !152 + %3519 = lshr exact i32 %3518, 4, !dbg !152 + %3520 = and i32 %3519, 16383, !dbg !152 + %3521 = zext nneg i32 %3520 to i64, !dbg !152 + %3522 = or disjoint i64 %3521, 4611686293338849280, !dbg !152 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 0, !dbg !152 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 1, !dbg !152 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 2, !dbg !152 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 3, !dbg !152 + %3527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 4, !dbg !152 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 5, !dbg !152 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 6, !dbg !152 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 7, !dbg !152 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 8, !dbg !152 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 9, !dbg !152 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 10, !dbg !152 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 11, !dbg !152 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 12, !dbg !152 + %3536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 13, !dbg !152 + %3537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 14, !dbg !152 + %3538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 15, !dbg !152 + %3539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 16, !dbg !152 + %3540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 17, !dbg !152 + %3541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 18, !dbg !152 + %3542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 19, !dbg !152 + %3543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 20, !dbg !152 + %3544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 21, !dbg !152 + %3545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 22, !dbg !152 + %3546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 23, !dbg !152 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 24, !dbg !152 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 25, !dbg !152 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 26, !dbg !152 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 27, !dbg !152 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 28, !dbg !152 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 29, !dbg !152 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 30, !dbg !152 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 31, !dbg !152 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 32, !dbg !152 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 33, !dbg !152 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 34, !dbg !152 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 35, !dbg !152 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 36, !dbg !152 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 37, !dbg !152 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 38, !dbg !152 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 39, !dbg !152 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 40, !dbg !152 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 41, !dbg !152 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 42, !dbg !152 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 43, !dbg !152 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 44, !dbg !152 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 45, !dbg !152 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 46, !dbg !152 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 47, !dbg !152 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 48, !dbg !152 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 49, !dbg !152 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 50, !dbg !152 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 51, !dbg !152 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 52, !dbg !152 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 53, !dbg !152 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 54, !dbg !152 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 55, !dbg !152 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 56, !dbg !152 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 57, !dbg !152 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 58, !dbg !152 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 59, !dbg !152 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 60, !dbg !152 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 61, !dbg !152 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 62, !dbg !152 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 63, !dbg !152 + %3587 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, float %3560, float %3561, float %3562, float %3563, float %3564, float %3565, float %3566, float %3567, float %3568, float %3569, float %3570, float %3571, float %3572, float %3573, float %3574, float %3575, float %3576, float %3577, float %3578, float %3579, float %3580, float %3581, float %3582, float %3583, float %3584, float %3585, float %3586, i32 %3368, i32 %3369, i32 %3370, i32 %3371, i64 %3522, i1 true) #2, !dbg !152 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 0, !dbg !152 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 1, !dbg !152 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 2, !dbg !152 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 3, !dbg !152 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 4, !dbg !152 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 5, !dbg !152 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 6, !dbg !152 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 7, !dbg !152 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 8, !dbg !152 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 9, !dbg !152 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 10, !dbg !152 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 11, !dbg !152 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 12, !dbg !152 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 13, !dbg !152 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 14, !dbg !152 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 15, !dbg !152 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 16, !dbg !152 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 17, !dbg !152 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 18, !dbg !152 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 19, !dbg !152 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 20, !dbg !152 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 21, !dbg !152 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 22, !dbg !152 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 23, !dbg !152 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 24, !dbg !152 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 25, !dbg !152 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 26, !dbg !152 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 27, !dbg !152 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 28, !dbg !152 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 29, !dbg !152 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 30, !dbg !152 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 31, !dbg !152 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 32, !dbg !152 + %3621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 33, !dbg !152 + %3622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 34, !dbg !152 + %3623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 35, !dbg !152 + %3624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 36, !dbg !152 + %3625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 37, !dbg !152 + %3626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 38, !dbg !152 + %3627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 39, !dbg !152 + %3628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 40, !dbg !152 + %3629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 41, !dbg !152 + %3630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 42, !dbg !152 + %3631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 43, !dbg !152 + %3632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 44, !dbg !152 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 45, !dbg !152 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 46, !dbg !152 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 47, !dbg !152 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 48, !dbg !152 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 49, !dbg !152 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 50, !dbg !152 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 51, !dbg !152 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 52, !dbg !152 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 53, !dbg !152 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 54, !dbg !152 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 55, !dbg !152 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 56, !dbg !152 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 57, !dbg !152 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 58, !dbg !152 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 59, !dbg !152 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 60, !dbg !152 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 61, !dbg !152 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 62, !dbg !152 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 63, !dbg !152 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !152 + %3652 = insertelement <16 x i32> poison, i32 %2346, i64 0, !dbg !153 + %3653 = shufflevector <16 x i32> %3652, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !153 + %3654 = add <16 x i32> %3653, %2352, !dbg !153 + %3655 = add nuw nsw i32 %2350, 1, !dbg !122 + %3656 = lshr i32 %3655, 1, !dbg !154 + %3657 = zext nneg i32 %3656 to i64, !dbg !155 + %3658 = getelementptr i32, ptr addrspace(1) %2115, i64 %3657, !dbg !155 + %3659 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !156 + %3660 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3658, i64 %3659, i1 %2354) #2, !dbg !156 + %3661 = add nuw nsw i32 %3656, 1, !dbg !157 + %3662 = icmp slt i32 %3661, %2119, !dbg !158 + %3663 = getelementptr i8, ptr addrspace(1) %3658, i64 4, !dbg !159 + %3664 = and i1 %2354, %3662, !dbg !122 + %3665 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !160 + %3666 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3663, i64 %3665, i1 %3664) #2, !dbg !160 + %3667 = and i32 %2350, 1, !dbg !161 + %3668 = sub i32 %3666, %3660, !dbg !162 + %3669 = shl i32 %3668, 7, !dbg !163 + %3670 = add i32 %3669, -64, !dbg !164 + %3671 = xor i32 %3667, 1, !dbg !165 + %3672 = mul nuw nsw i32 %3670, %3671, !dbg !165 + %3673 = shl nuw nsw i32 %3667, 6, !dbg !166 + %3674 = add i32 %3672, %3673, !dbg !167 + %3675 = add i32 %3674, %2349, !dbg !168 + %3676 = add i32 %2348, 1, !dbg !122 + %3677 = icmp sgt i32 %3676, 2, !dbg !122 + %3678 = select i1 %3677, i32 0, i32 %3676, !dbg !122 + %3679 = add i32 %3675, %2117, !dbg !130 + %3680 = add i32 %3679, %37, !dbg !124 + %3681 = add i32 %3679, %38, !dbg !124 + %3682 = add i32 %3679, %39, !dbg !124 + %3683 = add i32 %3679, %40, !dbg !124 + %3684 = shl i32 %3680, 7, !dbg !125 + %3685 = shl i32 %3681, 7, !dbg !125 + %3686 = shl i32 %3682, 7, !dbg !125 + %3687 = shl i32 %3683, 7, !dbg !125 + %3688 = sext i32 %3684 to i64, !dbg !126 + %3689 = sext i32 %3685 to i64, !dbg !126 + %3690 = sext i32 %3686 to i64, !dbg !126 + %3691 = sext i32 %3687 to i64, !dbg !126 + %gep428 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3688, !dbg !127 + %gep430 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3689, !dbg !127 + %gep432 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3690, !dbg !127 + %gep434 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3691, !dbg !127 + %3692 = icmp slt i32 %3680, %11, !dbg !128 + %3693 = icmp slt i32 %3681, %11, !dbg !128 + %3694 = icmp slt i32 %3682, %11, !dbg !128 + %3695 = icmp slt i32 %3683, %11, !dbg !128 + %3696 = shl i32 %3678, 13, !dbg !129 + %3697 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %3696, !dbg !129 + %3698 = and i1 %2353, %3692, !dbg !122 + %3699 = and i1 %2353, %3693, !dbg !122 + %3700 = and i1 %2353, %3694, !dbg !122 + %3701 = and i1 %2353, %3695, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %3702 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %255, !dbg !129 + %3703 = select i1 %3698, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3702, ptr addrspace(1) %gep428, i32 %3703) #2, !dbg !129 + %3704 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %258, !dbg !129 + %3705 = select i1 %3699, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3704, ptr addrspace(1) %gep430, i32 %3705) #2, !dbg !129 + %3706 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %261, !dbg !129 + %3707 = select i1 %3700, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3706, ptr addrspace(1) %gep432, i32 %3707) #2, !dbg !129 + %3708 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %264, !dbg !129 + %3709 = select i1 %3701, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3708, ptr addrspace(1) %gep434, i32 %3709) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %gep436 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3688, !dbg !127 + %gep438 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3689, !dbg !127 + %gep440 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3690, !dbg !127 + %gep442 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3691, !dbg !127 + %3710 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %3696, !dbg !129 + %3711 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %255, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3711, ptr addrspace(1) %gep436, i32 %3703) #2, !dbg !129 + %3712 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %258, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3712, ptr addrspace(1) %gep438, i32 %3705) #2, !dbg !129 + %3713 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %261, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3713, ptr addrspace(1) %gep440, i32 %3707) #2, !dbg !129 + %3714 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %264, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3714, ptr addrspace(1) %gep442, i32 %3709) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %exitcond472.not = icmp eq i32 %3655, %smax471, !dbg !122 + br i1 %exitcond472.not, label %._crit_edge461, label %2345, !dbg !122 + +._crit_edge461: ; preds = %__nv_exp2f.exit253, %._crit_edge + %3715 = phi float [ %2205, %._crit_edge ], [ %3588, %__nv_exp2f.exit253 ], !dbg !50 + %3716 = phi float [ %2206, %._crit_edge ], [ %3589, %__nv_exp2f.exit253 ], !dbg !50 + %3717 = phi float [ %2207, %._crit_edge ], [ %3590, %__nv_exp2f.exit253 ], !dbg !50 + %3718 = phi float [ %2208, %._crit_edge ], [ %3591, %__nv_exp2f.exit253 ], !dbg !50 + %3719 = phi float [ %2209, %._crit_edge ], [ %3592, %__nv_exp2f.exit253 ], !dbg !50 + %3720 = phi float [ %2210, %._crit_edge ], [ %3593, %__nv_exp2f.exit253 ], !dbg !50 + %3721 = phi float [ %2211, %._crit_edge ], [ %3594, %__nv_exp2f.exit253 ], !dbg !50 + %3722 = phi float [ %2212, %._crit_edge ], [ %3595, %__nv_exp2f.exit253 ], !dbg !50 + %3723 = phi float [ %2213, %._crit_edge ], [ %3596, %__nv_exp2f.exit253 ], !dbg !50 + %3724 = phi float [ %2214, %._crit_edge ], [ %3597, %__nv_exp2f.exit253 ], !dbg !50 + %3725 = phi float [ %2215, %._crit_edge ], [ %3598, %__nv_exp2f.exit253 ], !dbg !50 + %3726 = phi float [ %2216, %._crit_edge ], [ %3599, %__nv_exp2f.exit253 ], !dbg !50 + %3727 = phi float [ %2217, %._crit_edge ], [ %3600, %__nv_exp2f.exit253 ], !dbg !50 + %3728 = phi float [ %2218, %._crit_edge ], [ %3601, %__nv_exp2f.exit253 ], !dbg !50 + %3729 = phi float [ %2219, %._crit_edge ], [ %3602, %__nv_exp2f.exit253 ], !dbg !50 + %3730 = phi float [ %2220, %._crit_edge ], [ %3603, %__nv_exp2f.exit253 ], !dbg !50 + %3731 = phi float [ %2221, %._crit_edge ], [ %3604, %__nv_exp2f.exit253 ], !dbg !50 + %3732 = phi float [ %2222, %._crit_edge ], [ %3605, %__nv_exp2f.exit253 ], !dbg !50 + %3733 = phi float [ %2223, %._crit_edge ], [ %3606, %__nv_exp2f.exit253 ], !dbg !50 + %3734 = phi float [ %2224, %._crit_edge ], [ %3607, %__nv_exp2f.exit253 ], !dbg !50 + %3735 = phi float [ %2225, %._crit_edge ], [ %3608, %__nv_exp2f.exit253 ], !dbg !50 + %3736 = phi float [ %2226, %._crit_edge ], [ %3609, %__nv_exp2f.exit253 ], !dbg !50 + %3737 = phi float [ %2227, %._crit_edge ], [ %3610, %__nv_exp2f.exit253 ], !dbg !50 + %3738 = phi float [ %2228, %._crit_edge ], [ %3611, %__nv_exp2f.exit253 ], !dbg !50 + %3739 = phi float [ %2229, %._crit_edge ], [ %3612, %__nv_exp2f.exit253 ], !dbg !50 + %3740 = phi float [ %2230, %._crit_edge ], [ %3613, %__nv_exp2f.exit253 ], !dbg !50 + %3741 = phi float [ %2231, %._crit_edge ], [ %3614, %__nv_exp2f.exit253 ], !dbg !50 + %3742 = phi float [ %2232, %._crit_edge ], [ %3615, %__nv_exp2f.exit253 ], !dbg !50 + %3743 = phi float [ %2233, %._crit_edge ], [ %3616, %__nv_exp2f.exit253 ], !dbg !50 + %3744 = phi float [ %2234, %._crit_edge ], [ %3617, %__nv_exp2f.exit253 ], !dbg !50 + %3745 = phi float [ %2235, %._crit_edge ], [ %3618, %__nv_exp2f.exit253 ], !dbg !50 + %3746 = phi float [ %2236, %._crit_edge ], [ %3619, %__nv_exp2f.exit253 ], !dbg !50 + %3747 = phi float [ %2237, %._crit_edge ], [ %3620, %__nv_exp2f.exit253 ], !dbg !50 + %3748 = phi float [ %2238, %._crit_edge ], [ %3621, %__nv_exp2f.exit253 ], !dbg !50 + %3749 = phi float [ %2239, %._crit_edge ], [ %3622, %__nv_exp2f.exit253 ], !dbg !50 + %3750 = phi float [ %2240, %._crit_edge ], [ %3623, %__nv_exp2f.exit253 ], !dbg !50 + %3751 = phi float [ %2241, %._crit_edge ], [ %3624, %__nv_exp2f.exit253 ], !dbg !50 + %3752 = phi float [ %2242, %._crit_edge ], [ %3625, %__nv_exp2f.exit253 ], !dbg !50 + %3753 = phi float [ %2243, %._crit_edge ], [ %3626, %__nv_exp2f.exit253 ], !dbg !50 + %3754 = phi float [ %2244, %._crit_edge ], [ %3627, %__nv_exp2f.exit253 ], !dbg !50 + %3755 = phi float [ %2245, %._crit_edge ], [ %3628, %__nv_exp2f.exit253 ], !dbg !50 + %3756 = phi float [ %2246, %._crit_edge ], [ %3629, %__nv_exp2f.exit253 ], !dbg !50 + %3757 = phi float [ %2247, %._crit_edge ], [ %3630, %__nv_exp2f.exit253 ], !dbg !50 + %3758 = phi float [ %2248, %._crit_edge ], [ %3631, %__nv_exp2f.exit253 ], !dbg !50 + %3759 = phi float [ %2249, %._crit_edge ], [ %3632, %__nv_exp2f.exit253 ], !dbg !50 + %3760 = phi float [ %2250, %._crit_edge ], [ %3633, %__nv_exp2f.exit253 ], !dbg !50 + %3761 = phi float [ %2251, %._crit_edge ], [ %3634, %__nv_exp2f.exit253 ], !dbg !50 + %3762 = phi float [ %2252, %._crit_edge ], [ %3635, %__nv_exp2f.exit253 ], !dbg !50 + %3763 = phi float [ %2253, %._crit_edge ], [ %3636, %__nv_exp2f.exit253 ], !dbg !50 + %3764 = phi float [ %2254, %._crit_edge ], [ %3637, %__nv_exp2f.exit253 ], !dbg !50 + %3765 = phi float [ %2255, %._crit_edge ], [ %3638, %__nv_exp2f.exit253 ], !dbg !50 + %3766 = phi float [ %2256, %._crit_edge ], [ %3639, %__nv_exp2f.exit253 ], !dbg !50 + %3767 = phi float [ %2257, %._crit_edge ], [ %3640, %__nv_exp2f.exit253 ], !dbg !50 + %3768 = phi float [ %2258, %._crit_edge ], [ %3641, %__nv_exp2f.exit253 ], !dbg !50 + %3769 = phi float [ %2259, %._crit_edge ], [ %3642, %__nv_exp2f.exit253 ], !dbg !50 + %3770 = phi float [ %2260, %._crit_edge ], [ %3643, %__nv_exp2f.exit253 ], !dbg !50 + %3771 = phi float [ %2261, %._crit_edge ], [ %3644, %__nv_exp2f.exit253 ], !dbg !50 + %3772 = phi float [ %2262, %._crit_edge ], [ %3645, %__nv_exp2f.exit253 ], !dbg !50 + %3773 = phi float [ %2263, %._crit_edge ], [ %3646, %__nv_exp2f.exit253 ], !dbg !50 + %3774 = phi float [ %2264, %._crit_edge ], [ %3647, %__nv_exp2f.exit253 ], !dbg !50 + %3775 = phi float [ %2265, %._crit_edge ], [ %3648, %__nv_exp2f.exit253 ], !dbg !50 + %3776 = phi float [ %2266, %._crit_edge ], [ %3649, %__nv_exp2f.exit253 ], !dbg !50 + %3777 = phi float [ %2267, %._crit_edge ], [ %3650, %__nv_exp2f.exit253 ], !dbg !50 + %3778 = phi float [ %2268, %._crit_edge ], [ %3651, %__nv_exp2f.exit253 ], !dbg !50 + %3779 = phi float [ %2111, %._crit_edge ], [ %3241, %__nv_exp2f.exit253 ], !dbg !50 + %3780 = phi float [ %2112, %._crit_edge ], [ %3242, %__nv_exp2f.exit253 ], !dbg !50 + %3781 = phi <2 x float> [ %2113, %._crit_edge ], [ %2980, %__nv_exp2f.exit253 ], !dbg !50 + %3782 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %3715, float %3716, float %3717, float %3718, float %3719, float %3720, float %3721, float %3722, float %3723, float %3724, float %3725, float %3726, float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, float %3747, float %3748, float %3749, float %3750, float %3751, float %3752, float %3753, float %3754, float %3755, float %3756, float %3757, float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778) #2, !dbg !122 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122 + %3783 = fcmp oeq float %3779, 0.000000e+00, !dbg !169 + %3784 = fcmp oeq float %3780, 0.000000e+00, !dbg !169 + %3785 = select i1 %3783, float 1.000000e+00, float %3779, !dbg !170 + %3786 = select i1 %3784, float 1.000000e+00, float %3780, !dbg !170 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 0, !dbg !171 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 1, !dbg !171 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 2, !dbg !171 + %3790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 3, !dbg !171 + %3791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 4, !dbg !171 + %3792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 5, !dbg !171 + %3793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 6, !dbg !171 + %3794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 7, !dbg !171 + %3795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 8, !dbg !171 + %3796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 9, !dbg !171 + %3797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 10, !dbg !171 + %3798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 11, !dbg !171 + %3799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 12, !dbg !171 + %3800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 13, !dbg !171 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 14, !dbg !171 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 15, !dbg !171 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 16, !dbg !171 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 17, !dbg !171 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 18, !dbg !171 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 19, !dbg !171 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 20, !dbg !171 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 21, !dbg !171 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 22, !dbg !171 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 23, !dbg !171 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 24, !dbg !171 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 25, !dbg !171 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 26, !dbg !171 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 27, !dbg !171 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 28, !dbg !171 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 29, !dbg !171 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 30, !dbg !171 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 31, !dbg !171 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 32, !dbg !171 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 33, !dbg !171 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 34, !dbg !171 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 35, !dbg !171 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 36, !dbg !171 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 37, !dbg !171 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 38, !dbg !171 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 39, !dbg !171 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 40, !dbg !171 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 41, !dbg !171 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 42, !dbg !171 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 43, !dbg !171 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 44, !dbg !171 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 45, !dbg !171 + %3833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 46, !dbg !171 + %3834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 47, !dbg !171 + %3835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 48, !dbg !171 + %3836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 49, !dbg !171 + %3837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 50, !dbg !171 + %3838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 51, !dbg !171 + %3839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 52, !dbg !171 + %3840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 53, !dbg !171 + %3841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 54, !dbg !171 + %3842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 55, !dbg !171 + %3843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 56, !dbg !171 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 57, !dbg !171 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 58, !dbg !171 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 59, !dbg !171 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 60, !dbg !171 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 61, !dbg !171 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 62, !dbg !171 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 63, !dbg !171 + %3851 = tail call float @llvm.nvvm.div.full(float %3787, float %3785), !dbg !171 + %3852 = tail call float @llvm.nvvm.div.full(float %3788, float %3785), !dbg !171 + %3853 = tail call float @llvm.nvvm.div.full(float %3789, float %3786), !dbg !171 + %3854 = tail call float @llvm.nvvm.div.full(float %3790, float %3786), !dbg !171 + %3855 = tail call float @llvm.nvvm.div.full(float %3791, float %3785), !dbg !171 + %3856 = tail call float @llvm.nvvm.div.full(float %3792, float %3785), !dbg !171 + %3857 = tail call float @llvm.nvvm.div.full(float %3793, float %3786), !dbg !171 + %3858 = tail call float @llvm.nvvm.div.full(float %3794, float %3786), !dbg !171 + %3859 = tail call float @llvm.nvvm.div.full(float %3795, float %3785), !dbg !171 + %3860 = tail call float @llvm.nvvm.div.full(float %3796, float %3785), !dbg !171 + %3861 = tail call float @llvm.nvvm.div.full(float %3797, float %3786), !dbg !171 + %3862 = tail call float @llvm.nvvm.div.full(float %3798, float %3786), !dbg !171 + %3863 = tail call float @llvm.nvvm.div.full(float %3799, float %3785), !dbg !171 + %3864 = tail call float @llvm.nvvm.div.full(float %3800, float %3785), !dbg !171 + %3865 = tail call float @llvm.nvvm.div.full(float %3801, float %3786), !dbg !171 + %3866 = tail call float @llvm.nvvm.div.full(float %3802, float %3786), !dbg !171 + %3867 = tail call float @llvm.nvvm.div.full(float %3803, float %3785), !dbg !171 + %3868 = tail call float @llvm.nvvm.div.full(float %3804, float %3785), !dbg !171 + %3869 = tail call float @llvm.nvvm.div.full(float %3805, float %3786), !dbg !171 + %3870 = tail call float @llvm.nvvm.div.full(float %3806, float %3786), !dbg !171 + %3871 = tail call float @llvm.nvvm.div.full(float %3807, float %3785), !dbg !171 + %3872 = tail call float @llvm.nvvm.div.full(float %3808, float %3785), !dbg !171 + %3873 = tail call float @llvm.nvvm.div.full(float %3809, float %3786), !dbg !171 + %3874 = tail call float @llvm.nvvm.div.full(float %3810, float %3786), !dbg !171 + %3875 = tail call float @llvm.nvvm.div.full(float %3811, float %3785), !dbg !171 + %3876 = tail call float @llvm.nvvm.div.full(float %3812, float %3785), !dbg !171 + %3877 = tail call float @llvm.nvvm.div.full(float %3813, float %3786), !dbg !171 + %3878 = tail call float @llvm.nvvm.div.full(float %3814, float %3786), !dbg !171 + %3879 = tail call float @llvm.nvvm.div.full(float %3815, float %3785), !dbg !171 + %3880 = tail call float @llvm.nvvm.div.full(float %3816, float %3785), !dbg !171 + %3881 = tail call float @llvm.nvvm.div.full(float %3817, float %3786), !dbg !171 + %3882 = tail call float @llvm.nvvm.div.full(float %3818, float %3786), !dbg !171 + %3883 = tail call float @llvm.nvvm.div.full(float %3819, float %3785), !dbg !171 + %3884 = tail call float @llvm.nvvm.div.full(float %3820, float %3785), !dbg !171 + %3885 = tail call float @llvm.nvvm.div.full(float %3821, float %3786), !dbg !171 + %3886 = tail call float @llvm.nvvm.div.full(float %3822, float %3786), !dbg !171 + %3887 = tail call float @llvm.nvvm.div.full(float %3823, float %3785), !dbg !171 + %3888 = tail call float @llvm.nvvm.div.full(float %3824, float %3785), !dbg !171 + %3889 = tail call float @llvm.nvvm.div.full(float %3825, float %3786), !dbg !171 + %3890 = tail call float @llvm.nvvm.div.full(float %3826, float %3786), !dbg !171 + %3891 = tail call float @llvm.nvvm.div.full(float %3827, float %3785), !dbg !171 + %3892 = tail call float @llvm.nvvm.div.full(float %3828, float %3785), !dbg !171 + %3893 = tail call float @llvm.nvvm.div.full(float %3829, float %3786), !dbg !171 + %3894 = tail call float @llvm.nvvm.div.full(float %3830, float %3786), !dbg !171 + %3895 = tail call float @llvm.nvvm.div.full(float %3831, float %3785), !dbg !171 + %3896 = tail call float @llvm.nvvm.div.full(float %3832, float %3785), !dbg !171 + %3897 = tail call float @llvm.nvvm.div.full(float %3833, float %3786), !dbg !171 + %3898 = tail call float @llvm.nvvm.div.full(float %3834, float %3786), !dbg !171 + %3899 = tail call float @llvm.nvvm.div.full(float %3835, float %3785), !dbg !171 + %3900 = tail call float @llvm.nvvm.div.full(float %3836, float %3785), !dbg !171 + %3901 = tail call float @llvm.nvvm.div.full(float %3837, float %3786), !dbg !171 + %3902 = tail call float @llvm.nvvm.div.full(float %3838, float %3786), !dbg !171 + %3903 = tail call float @llvm.nvvm.div.full(float %3839, float %3785), !dbg !171 + %3904 = tail call float @llvm.nvvm.div.full(float %3840, float %3785), !dbg !171 + %3905 = tail call float @llvm.nvvm.div.full(float %3841, float %3786), !dbg !171 + %3906 = tail call float @llvm.nvvm.div.full(float %3842, float %3786), !dbg !171 + %3907 = tail call float @llvm.nvvm.div.full(float %3843, float %3785), !dbg !171 + %3908 = tail call float @llvm.nvvm.div.full(float %3844, float %3785), !dbg !171 + %3909 = tail call float @llvm.nvvm.div.full(float %3845, float %3786), !dbg !171 + %3910 = tail call float @llvm.nvvm.div.full(float %3846, float %3786), !dbg !171 + %3911 = tail call float @llvm.nvvm.div.full(float %3847, float %3785), !dbg !171 + %3912 = tail call float @llvm.nvvm.div.full(float %3848, float %3785), !dbg !171 + %3913 = tail call float @llvm.nvvm.div.full(float %3849, float %3786), !dbg !171 + %3914 = tail call float @llvm.nvvm.div.full(float %3850, float %3786), !dbg !171 + %3915 = or disjoint i32 %79, %21, !dbg !172 + %3916 = or disjoint i32 %3915, %20, !dbg !173 + %3917 = add i32 %3916, %54, !dbg !174 + %3918 = add i32 %3916, %55, !dbg !174 + %3919 = add i32 %3916, %56, !dbg !174 + %3920 = add i32 %3916, %57, !dbg !174 + %3921 = add i32 %3916, %58, !dbg !174 + %3922 = add i32 %3916, %59, !dbg !174 + %3923 = add i32 %3916, %60, !dbg !174 + %3924 = add i32 %3916, %61, !dbg !174 + %3925 = sext i32 %3917 to i64, !dbg !175 + %3926 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3925, !dbg !175 + %3927 = sext i32 %3918 to i64, !dbg !175 + %3928 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3927, !dbg !175 + %3929 = sext i32 %3919 to i64, !dbg !175 + %3930 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3929, !dbg !175 + %3931 = sext i32 %3920 to i64, !dbg !175 + %3932 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3931, !dbg !175 + %3933 = sext i32 %3921 to i64, !dbg !175 + %3934 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3933, !dbg !175 + %3935 = sext i32 %3922 to i64, !dbg !175 + %3936 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3935, !dbg !175 + %3937 = sext i32 %3923 to i64, !dbg !175 + %3938 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3937, !dbg !175 + %3939 = sext i32 %3924 to i64, !dbg !175 + %3940 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3939, !dbg !175 + %3941 = insertelement <2 x float> poison, float %3851, i64 0, !dbg !176 + %3942 = insertelement <2 x float> %3941, float %3852, i64 1, !dbg !176 + %3943 = fptrunc <2 x float> %3942 to <2 x bfloat>, !dbg !176 + %3944 = insertelement <2 x float> poison, float %3853, i64 0, !dbg !176 + %3945 = insertelement <2 x float> %3944, float %3854, i64 1, !dbg !176 + %3946 = fptrunc <2 x float> %3945 to <2 x bfloat>, !dbg !176 + %3947 = insertelement <2 x float> poison, float %3855, i64 0, !dbg !176 + %3948 = insertelement <2 x float> %3947, float %3856, i64 1, !dbg !176 + %3949 = fptrunc <2 x float> %3948 to <2 x bfloat>, !dbg !176 + %3950 = insertelement <2 x float> poison, float %3857, i64 0, !dbg !176 + %3951 = insertelement <2 x float> %3950, float %3858, i64 1, !dbg !176 + %3952 = fptrunc <2 x float> %3951 to <2 x bfloat>, !dbg !176 + %3953 = insertelement <2 x float> poison, float %3859, i64 0, !dbg !176 + %3954 = insertelement <2 x float> %3953, float %3860, i64 1, !dbg !176 + %3955 = fptrunc <2 x float> %3954 to <2 x bfloat>, !dbg !176 + %3956 = insertelement <2 x float> poison, float %3861, i64 0, !dbg !176 + %3957 = insertelement <2 x float> %3956, float %3862, i64 1, !dbg !176 + %3958 = fptrunc <2 x float> %3957 to <2 x bfloat>, !dbg !176 + %3959 = insertelement <2 x float> poison, float %3863, i64 0, !dbg !176 + %3960 = insertelement <2 x float> %3959, float %3864, i64 1, !dbg !176 + %3961 = fptrunc <2 x float> %3960 to <2 x bfloat>, !dbg !176 + %3962 = insertelement <2 x float> poison, float %3865, i64 0, !dbg !176 + %3963 = insertelement <2 x float> %3962, float %3866, i64 1, !dbg !176 + %3964 = fptrunc <2 x float> %3963 to <2 x bfloat>, !dbg !176 + %3965 = insertelement <2 x float> poison, float %3867, i64 0, !dbg !176 + %3966 = insertelement <2 x float> %3965, float %3868, i64 1, !dbg !176 + %3967 = fptrunc <2 x float> %3966 to <2 x bfloat>, !dbg !176 + %3968 = insertelement <2 x float> poison, float %3869, i64 0, !dbg !176 + %3969 = insertelement <2 x float> %3968, float %3870, i64 1, !dbg !176 + %3970 = fptrunc <2 x float> %3969 to <2 x bfloat>, !dbg !176 + %3971 = insertelement <2 x float> poison, float %3871, i64 0, !dbg !176 + %3972 = insertelement <2 x float> %3971, float %3872, i64 1, !dbg !176 + %3973 = fptrunc <2 x float> %3972 to <2 x bfloat>, !dbg !176 + %3974 = insertelement <2 x float> poison, float %3873, i64 0, !dbg !176 + %3975 = insertelement <2 x float> %3974, float %3874, i64 1, !dbg !176 + %3976 = fptrunc <2 x float> %3975 to <2 x bfloat>, !dbg !176 + %3977 = insertelement <2 x float> poison, float %3875, i64 0, !dbg !176 + %3978 = insertelement <2 x float> %3977, float %3876, i64 1, !dbg !176 + %3979 = fptrunc <2 x float> %3978 to <2 x bfloat>, !dbg !176 + %3980 = insertelement <2 x float> poison, float %3877, i64 0, !dbg !176 + %3981 = insertelement <2 x float> %3980, float %3878, i64 1, !dbg !176 + %3982 = fptrunc <2 x float> %3981 to <2 x bfloat>, !dbg !176 + %3983 = insertelement <2 x float> poison, float %3879, i64 0, !dbg !176 + %3984 = insertelement <2 x float> %3983, float %3880, i64 1, !dbg !176 + %3985 = fptrunc <2 x float> %3984 to <2 x bfloat>, !dbg !176 + %3986 = insertelement <2 x float> poison, float %3881, i64 0, !dbg !176 + %3987 = insertelement <2 x float> %3986, float %3882, i64 1, !dbg !176 + %3988 = fptrunc <2 x float> %3987 to <2 x bfloat>, !dbg !176 + %3989 = insertelement <2 x float> poison, float %3883, i64 0, !dbg !176 + %3990 = insertelement <2 x float> %3989, float %3884, i64 1, !dbg !176 + %3991 = fptrunc <2 x float> %3990 to <2 x bfloat>, !dbg !176 + %3992 = insertelement <2 x float> poison, float %3885, i64 0, !dbg !176 + %3993 = insertelement <2 x float> %3992, float %3886, i64 1, !dbg !176 + %3994 = fptrunc <2 x float> %3993 to <2 x bfloat>, !dbg !176 + %3995 = insertelement <2 x float> poison, float %3887, i64 0, !dbg !176 + %3996 = insertelement <2 x float> %3995, float %3888, i64 1, !dbg !176 + %3997 = fptrunc <2 x float> %3996 to <2 x bfloat>, !dbg !176 + %3998 = insertelement <2 x float> poison, float %3889, i64 0, !dbg !176 + %3999 = insertelement <2 x float> %3998, float %3890, i64 1, !dbg !176 + %4000 = fptrunc <2 x float> %3999 to <2 x bfloat>, !dbg !176 + %4001 = insertelement <2 x float> poison, float %3891, i64 0, !dbg !176 + %4002 = insertelement <2 x float> %4001, float %3892, i64 1, !dbg !176 + %4003 = fptrunc <2 x float> %4002 to <2 x bfloat>, !dbg !176 + %4004 = insertelement <2 x float> poison, float %3893, i64 0, !dbg !176 + %4005 = insertelement <2 x float> %4004, float %3894, i64 1, !dbg !176 + %4006 = fptrunc <2 x float> %4005 to <2 x bfloat>, !dbg !176 + %4007 = insertelement <2 x float> poison, float %3895, i64 0, !dbg !176 + %4008 = insertelement <2 x float> %4007, float %3896, i64 1, !dbg !176 + %4009 = fptrunc <2 x float> %4008 to <2 x bfloat>, !dbg !176 + %4010 = insertelement <2 x float> poison, float %3897, i64 0, !dbg !176 + %4011 = insertelement <2 x float> %4010, float %3898, i64 1, !dbg !176 + %4012 = fptrunc <2 x float> %4011 to <2 x bfloat>, !dbg !176 + %4013 = insertelement <2 x float> poison, float %3899, i64 0, !dbg !176 + %4014 = insertelement <2 x float> %4013, float %3900, i64 1, !dbg !176 + %4015 = fptrunc <2 x float> %4014 to <2 x bfloat>, !dbg !176 + %4016 = insertelement <2 x float> poison, float %3901, i64 0, !dbg !176 + %4017 = insertelement <2 x float> %4016, float %3902, i64 1, !dbg !176 + %4018 = fptrunc <2 x float> %4017 to <2 x bfloat>, !dbg !176 + %4019 = insertelement <2 x float> poison, float %3903, i64 0, !dbg !176 + %4020 = insertelement <2 x float> %4019, float %3904, i64 1, !dbg !176 + %4021 = fptrunc <2 x float> %4020 to <2 x bfloat>, !dbg !176 + %4022 = insertelement <2 x float> poison, float %3905, i64 0, !dbg !176 + %4023 = insertelement <2 x float> %4022, float %3906, i64 1, !dbg !176 + %4024 = fptrunc <2 x float> %4023 to <2 x bfloat>, !dbg !176 + %4025 = insertelement <2 x float> poison, float %3907, i64 0, !dbg !176 + %4026 = insertelement <2 x float> %4025, float %3908, i64 1, !dbg !176 + %4027 = fptrunc <2 x float> %4026 to <2 x bfloat>, !dbg !176 + %4028 = insertelement <2 x float> poison, float %3909, i64 0, !dbg !176 + %4029 = insertelement <2 x float> %4028, float %3910, i64 1, !dbg !176 + %4030 = fptrunc <2 x float> %4029 to <2 x bfloat>, !dbg !176 + %4031 = insertelement <2 x float> poison, float %3911, i64 0, !dbg !176 + %4032 = insertelement <2 x float> %4031, float %3912, i64 1, !dbg !176 + %4033 = fptrunc <2 x float> %4032 to <2 x bfloat>, !dbg !176 + %4034 = insertelement <2 x float> poison, float %3913, i64 0, !dbg !176 + %4035 = insertelement <2 x float> %4034, float %3914, i64 1, !dbg !176 + %4036 = fptrunc <2 x float> %4035 to <2 x bfloat>, !dbg !176 + %4037 = shl nuw nsw i32 %198, 13, !dbg !176 + %4038 = shl nuw nsw i32 %34, 5, !dbg !176 + %4039 = and i32 %4038, 7264, !dbg !176 + %4040 = and i32 %34, 24, !dbg !176 + %4041 = shl nuw nsw i32 %4040, 4, !dbg !176 + %4042 = shl nuw nsw i32 %34, 2, !dbg !176 + %4043 = and i32 %4042, 16, !dbg !176 + %4044 = or disjoint i32 %4037, %4043, !dbg !176 + %4045 = or disjoint i32 %4039, %4041, !dbg !176 + %4046 = or disjoint i32 %4044, %4045, !dbg !176 + %4047 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4046, !dbg !176 + %4048 = bitcast <2 x bfloat> %3943 to i32, !dbg !176 + %4049 = bitcast <2 x bfloat> %3949 to i32, !dbg !176 + %4050 = bitcast <2 x bfloat> %3955 to i32, !dbg !176 + %4051 = bitcast <2 x bfloat> %3961 to i32, !dbg !176 + %4052 = insertelement <4 x i32> poison, i32 %4048, i64 0, !dbg !176 + %4053 = insertelement <4 x i32> %4052, i32 %4049, i64 1, !dbg !176 + %4054 = insertelement <4 x i32> %4053, i32 %4050, i64 2, !dbg !176 + %4055 = insertelement <4 x i32> %4054, i32 %4051, i64 3, !dbg !176 + store <4 x i32> %4055, ptr addrspace(3) %4047, align 16, !dbg !176 + %4056 = getelementptr inbounds nuw i8, ptr addrspace(3) %4047, i32 512, !dbg !176 + %4057 = bitcast <2 x bfloat> %3946 to i32, !dbg !176 + %4058 = bitcast <2 x bfloat> %3952 to i32, !dbg !176 + %4059 = bitcast <2 x bfloat> %3958 to i32, !dbg !176 + %4060 = bitcast <2 x bfloat> %3964 to i32, !dbg !176 + %4061 = insertelement <4 x i32> poison, i32 %4057, i64 0, !dbg !176 + %4062 = insertelement <4 x i32> %4061, i32 %4058, i64 1, !dbg !176 + %4063 = insertelement <4 x i32> %4062, i32 %4059, i64 2, !dbg !176 + %4064 = insertelement <4 x i32> %4063, i32 %4060, i64 3, !dbg !176 + store <4 x i32> %4064, ptr addrspace(3) %4056, align 16, !dbg !176 + %4065 = xor i32 %4046, 32, !dbg !176 + %4066 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4065, !dbg !176 + %4067 = bitcast <2 x bfloat> %3967 to i32, !dbg !176 + %4068 = bitcast <2 x bfloat> %3973 to i32, !dbg !176 + %4069 = bitcast <2 x bfloat> %3979 to i32, !dbg !176 + %4070 = bitcast <2 x bfloat> %3985 to i32, !dbg !176 + %4071 = insertelement <4 x i32> poison, i32 %4067, i64 0, !dbg !176 + %4072 = insertelement <4 x i32> %4071, i32 %4068, i64 1, !dbg !176 + %4073 = insertelement <4 x i32> %4072, i32 %4069, i64 2, !dbg !176 + %4074 = insertelement <4 x i32> %4073, i32 %4070, i64 3, !dbg !176 + store <4 x i32> %4074, ptr addrspace(3) %4066, align 16, !dbg !176 + %4075 = getelementptr inbounds nuw i8, ptr addrspace(3) %4066, i32 512, !dbg !176 + %4076 = bitcast <2 x bfloat> %3970 to i32, !dbg !176 + %4077 = bitcast <2 x bfloat> %3976 to i32, !dbg !176 + %4078 = bitcast <2 x bfloat> %3982 to i32, !dbg !176 + %4079 = bitcast <2 x bfloat> %3988 to i32, !dbg !176 + %4080 = insertelement <4 x i32> poison, i32 %4076, i64 0, !dbg !176 + %4081 = insertelement <4 x i32> %4080, i32 %4077, i64 1, !dbg !176 + %4082 = insertelement <4 x i32> %4081, i32 %4078, i64 2, !dbg !176 + %4083 = insertelement <4 x i32> %4082, i32 %4079, i64 3, !dbg !176 + store <4 x i32> %4083, ptr addrspace(3) %4075, align 16, !dbg !176 + %4084 = xor i32 %4046, 64, !dbg !176 + %4085 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4084, !dbg !176 + %4086 = bitcast <2 x bfloat> %3991 to i32, !dbg !176 + %4087 = bitcast <2 x bfloat> %3997 to i32, !dbg !176 + %4088 = bitcast <2 x bfloat> %4003 to i32, !dbg !176 + %4089 = bitcast <2 x bfloat> %4009 to i32, !dbg !176 + %4090 = insertelement <4 x i32> poison, i32 %4086, i64 0, !dbg !176 + %4091 = insertelement <4 x i32> %4090, i32 %4087, i64 1, !dbg !176 + %4092 = insertelement <4 x i32> %4091, i32 %4088, i64 2, !dbg !176 + %4093 = insertelement <4 x i32> %4092, i32 %4089, i64 3, !dbg !176 + store <4 x i32> %4093, ptr addrspace(3) %4085, align 16, !dbg !176 + %4094 = getelementptr inbounds nuw i8, ptr addrspace(3) %4085, i32 512, !dbg !176 + %4095 = bitcast <2 x bfloat> %3994 to i32, !dbg !176 + %4096 = bitcast <2 x bfloat> %4000 to i32, !dbg !176 + %4097 = bitcast <2 x bfloat> %4006 to i32, !dbg !176 + %4098 = bitcast <2 x bfloat> %4012 to i32, !dbg !176 + %4099 = insertelement <4 x i32> poison, i32 %4095, i64 0, !dbg !176 + %4100 = insertelement <4 x i32> %4099, i32 %4096, i64 1, !dbg !176 + %4101 = insertelement <4 x i32> %4100, i32 %4097, i64 2, !dbg !176 + %4102 = insertelement <4 x i32> %4101, i32 %4098, i64 3, !dbg !176 + store <4 x i32> %4102, ptr addrspace(3) %4094, align 16, !dbg !176 + %4103 = xor i32 %4046, 96, !dbg !176 + %4104 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4103, !dbg !176 + %4105 = bitcast <2 x bfloat> %4015 to i32, !dbg !176 + %4106 = bitcast <2 x bfloat> %4021 to i32, !dbg !176 + %4107 = bitcast <2 x bfloat> %4027 to i32, !dbg !176 + %4108 = bitcast <2 x bfloat> %4033 to i32, !dbg !176 + %4109 = insertelement <4 x i32> poison, i32 %4105, i64 0, !dbg !176 + %4110 = insertelement <4 x i32> %4109, i32 %4106, i64 1, !dbg !176 + %4111 = insertelement <4 x i32> %4110, i32 %4107, i64 2, !dbg !176 + %4112 = insertelement <4 x i32> %4111, i32 %4108, i64 3, !dbg !176 + store <4 x i32> %4112, ptr addrspace(3) %4104, align 16, !dbg !176 + %4113 = getelementptr inbounds nuw i8, ptr addrspace(3) %4104, i32 512, !dbg !176 + %4114 = bitcast <2 x bfloat> %4018 to i32, !dbg !176 + %4115 = bitcast <2 x bfloat> %4024 to i32, !dbg !176 + %4116 = bitcast <2 x bfloat> %4030 to i32, !dbg !176 + %4117 = bitcast <2 x bfloat> %4036 to i32, !dbg !176 + %4118 = insertelement <4 x i32> poison, i32 %4114, i64 0, !dbg !176 + %4119 = insertelement <4 x i32> %4118, i32 %4115, i64 1, !dbg !176 + %4120 = insertelement <4 x i32> %4119, i32 %4116, i64 2, !dbg !176 + %4121 = insertelement <4 x i32> %4120, i32 %4117, i64 3, !dbg !176 + store <4 x i32> %4121, ptr addrspace(3) %4113, align 16, !dbg !176 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4122 = shl nuw nsw i32 %4040, 10, !dbg !176 + %4123 = shl nuw nsw i32 %198, 5, !dbg !176 + %4124 = and i32 %34, 252, !dbg !176 + %4125 = shl nuw nsw i32 %4124, 2, !dbg !176 + %4126 = or disjoint i32 %4122, %4123, !dbg !176 + %4127 = xor i32 %4126, %4125, !dbg !176 + %4128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4127, !dbg !176 + %4129 = ptrtoint ptr addrspace(3) %4128 to i32, !dbg !176 + %4130 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4129) #2, !dbg !176 + %4131 = extractvalue { i32, i32, i32, i32 } %4130, 0, !dbg !176 + %4132 = extractvalue { i32, i32, i32, i32 } %4130, 1, !dbg !176 + %4133 = extractvalue { i32, i32, i32, i32 } %4130, 2, !dbg !176 + %4134 = extractvalue { i32, i32, i32, i32 } %4130, 3, !dbg !176 + %4135 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 1024, !dbg !176 + %4136 = ptrtoint ptr addrspace(3) %4135 to i32, !dbg !176 + %4137 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4136) #2, !dbg !176 + %4138 = extractvalue { i32, i32, i32, i32 } %4137, 0, !dbg !176 + %4139 = extractvalue { i32, i32, i32, i32 } %4137, 1, !dbg !176 + %4140 = extractvalue { i32, i32, i32, i32 } %4137, 2, !dbg !176 + %4141 = extractvalue { i32, i32, i32, i32 } %4137, 3, !dbg !176 + %4142 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 2048, !dbg !176 + %4143 = ptrtoint ptr addrspace(3) %4142 to i32, !dbg !176 + %4144 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4143) #2, !dbg !176 + %4145 = extractvalue { i32, i32, i32, i32 } %4144, 0, !dbg !176 + %4146 = extractvalue { i32, i32, i32, i32 } %4144, 1, !dbg !176 + %4147 = extractvalue { i32, i32, i32, i32 } %4144, 2, !dbg !176 + %4148 = extractvalue { i32, i32, i32, i32 } %4144, 3, !dbg !176 + %4149 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 3072, !dbg !176 + %4150 = ptrtoint ptr addrspace(3) %4149 to i32, !dbg !176 + %4151 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4150) #2, !dbg !176 + %4152 = extractvalue { i32, i32, i32, i32 } %4151, 0, !dbg !176 + %4153 = extractvalue { i32, i32, i32, i32 } %4151, 1, !dbg !176 + %4154 = extractvalue { i32, i32, i32, i32 } %4151, 2, !dbg !176 + %4155 = extractvalue { i32, i32, i32, i32 } %4151, 3, !dbg !176 + %4156 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 4096, !dbg !176 + %4157 = ptrtoint ptr addrspace(3) %4156 to i32, !dbg !176 + %4158 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4157) #2, !dbg !176 + %4159 = extractvalue { i32, i32, i32, i32 } %4158, 0, !dbg !176 + %4160 = extractvalue { i32, i32, i32, i32 } %4158, 1, !dbg !176 + %4161 = extractvalue { i32, i32, i32, i32 } %4158, 2, !dbg !176 + %4162 = extractvalue { i32, i32, i32, i32 } %4158, 3, !dbg !176 + %4163 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 5120, !dbg !176 + %4164 = ptrtoint ptr addrspace(3) %4163 to i32, !dbg !176 + %4165 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4164) #2, !dbg !176 + %4166 = extractvalue { i32, i32, i32, i32 } %4165, 0, !dbg !176 + %4167 = extractvalue { i32, i32, i32, i32 } %4165, 1, !dbg !176 + %4168 = extractvalue { i32, i32, i32, i32 } %4165, 2, !dbg !176 + %4169 = extractvalue { i32, i32, i32, i32 } %4165, 3, !dbg !176 + %4170 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 6144, !dbg !176 + %4171 = ptrtoint ptr addrspace(3) %4170 to i32, !dbg !176 + %4172 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4171) #2, !dbg !176 + %4173 = extractvalue { i32, i32, i32, i32 } %4172, 0, !dbg !176 + %4174 = extractvalue { i32, i32, i32, i32 } %4172, 1, !dbg !176 + %4175 = extractvalue { i32, i32, i32, i32 } %4172, 2, !dbg !176 + %4176 = extractvalue { i32, i32, i32, i32 } %4172, 3, !dbg !176 + %4177 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 7168, !dbg !176 + %4178 = ptrtoint ptr addrspace(3) %4177 to i32, !dbg !176 + %4179 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4178) #2, !dbg !176 + %4180 = extractvalue { i32, i32, i32, i32 } %4179, 0, !dbg !176 + %4181 = extractvalue { i32, i32, i32, i32 } %4179, 1, !dbg !176 + %4182 = extractvalue { i32, i32, i32, i32 } %4179, 2, !dbg !176 + %4183 = extractvalue { i32, i32, i32, i32 } %4179, 3, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4131, i32 %4132, i32 %4133, i32 %4134, ptr addrspace(1) %3926, i1 %89) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4138, i32 %4139, i32 %4140, i32 %4141, ptr addrspace(1) %3928, i1 %90) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4145, i32 %4146, i32 %4147, i32 %4148, ptr addrspace(1) %3930, i1 %91) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4152, i32 %4153, i32 %4154, i32 %4155, ptr addrspace(1) %3932, i1 %92) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4159, i32 %4160, i32 %4161, i32 %4162, ptr addrspace(1) %3934, i1 %93) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4166, i32 %4167, i32 %4168, i32 %4169, ptr addrspace(1) %3936, i1 %94) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4173, i32 %4174, i32 %4175, i32 %4176, ptr addrspace(1) %3938, i1 %95) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4180, i32 %4181, i32 %4182, i32 %4183, ptr addrspace(1) %3940, i1 %96) #2, !dbg !176 + %4184 = fcmp olt float %3785, 0x3810000000000000, !dbg !177 + %4185 = fmul float %3785, 0x4160000000000000, !dbg !177 + %.02.i = select i1 %4184, float %4185, float %3785, !dbg !177 + %i.i.0.i = select i1 %4184, float -2.300000e+01, float 0.000000e+00, !dbg !177 + %4186 = bitcast float %.02.i to i32, !dbg !177 + %4187 = add i32 %4186, -1060439283, !dbg !177 + %4188 = and i32 %4187, -8388608, !dbg !177 + %4189 = sub i32 %4186, %4188, !dbg !177 + %4190 = bitcast i32 %4189 to float, !dbg !177 + %4191 = sitofp i32 %4188 to float, !dbg !177 + %4192 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not.i356 = icmp eq i32 %4192, 0, !dbg !177 + %4193 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4191, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !177 + %4194 = tail call float @llvm.nvvm.fma.rn.f(float %4191, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !177 + %.08.i = select i1 %.not.i356, float %4194, float %4193, !dbg !177 + %4195 = fadd float %4190, -1.000000e+00, !dbg !177 + %4196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not1.i = icmp eq i32 %4196, 0, !dbg !177 + %4197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4195, float 0xBFC58FE600000000) #2, !dbg !177 + %4198 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4195, float 0xBFC58FE600000000) #2, !dbg !177 + %.010.i = select i1 %.not1.i, float %4198, float %4197, !dbg !177 + %4199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not2.i = icmp eq i32 %4199, 0, !dbg !177 + %4200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %4195, float 0x3FC5F9E540000000) #2, !dbg !177 + %4201 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %4195, float 0x3FC5F9E540000000) #2, !dbg !177 + %.011.i = select i1 %.not2.i, float %4201, float %4200, !dbg !177 + %4202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not3.i = icmp eq i32 %4202, 0, !dbg !177 + %4203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %4195, float 0xBFC6E9C860000000) #2, !dbg !177 + %4204 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %4195, float 0xBFC6E9C860000000) #2, !dbg !177 + %.012.i = select i1 %.not3.i, float %4204, float %4203, !dbg !177 + %4205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not4.i = icmp eq i32 %4205, 0, !dbg !177 + %4206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %4195, float 0x3FCA417E80000000) #2, !dbg !177 + %4207 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %4195, float 0x3FCA417E80000000) #2, !dbg !177 + %.09.i = select i1 %.not4.i, float %4207, float %4206, !dbg !177 + %4208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not5.i = icmp eq i32 %4208, 0, !dbg !177 + %4209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %4195, float 0xBFCEC79160000000) #2, !dbg !177 + %4210 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %4195, float 0xBFCEC79160000000) #2, !dbg !177 + %.05.i = select i1 %.not5.i, float %4210, float %4209, !dbg !177 + %4211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not6.i = icmp eq i32 %4211, 0, !dbg !177 + %4212 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %4195, float 0x3FD277F320000000) #2, !dbg !177 + %4213 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %4195, float 0x3FD277F320000000) #2, !dbg !177 + %.01.i = select i1 %.not6.i, float %4213, float %4212, !dbg !177 + %4214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not7.i = icmp eq i32 %4214, 0, !dbg !177 + %4215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %4195, float 0xBFD7154920000000) #2, !dbg !177 + %4216 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %4195, float 0xBFD7154920000000) #2, !dbg !177 + %.0.i357 = select i1 %.not7.i, float %4216, float %4215, !dbg !177 + %4217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not8.i = icmp eq i32 %4217, 0, !dbg !177 + %4218 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i357, float %4195, float 0x3FDEC70940000000) #2, !dbg !177 + %4219 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i357, float %4195, float 0x3FDEC70940000000) #2, !dbg !177 + %.07.i = select i1 %.not8.i, float %4219, float %4218, !dbg !177 + %4220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not9.i = icmp eq i32 %4220, 0, !dbg !177 + %4221 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %4195, float 0xBFE7154760000000) #2, !dbg !177 + %4222 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %4195, float 0xBFE7154760000000) #2, !dbg !177 + %.06.i = select i1 %.not9.i, float %4222, float %4221, !dbg !177 + %4223 = fmul float %4195, %.06.i, !dbg !177 + %4224 = fmul float %4195, %4223, !dbg !177 + %4225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not10.i = icmp eq i32 %4225, 0, !dbg !177 + %4226 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4195, float 0x3FF7154760000000, float %4224) #2, !dbg !177 + %4227 = tail call float @llvm.nvvm.fma.rn.f(float %4195, float 0x3FF7154760000000, float %4224) #2, !dbg !177 + %.04.i = select i1 %.not10.i, float %4227, float %4226, !dbg !177 + %4228 = fadd float %.08.i, %.04.i, !dbg !177 + %4229 = icmp ugt i32 %4186, 2139095039, !dbg !177 + br i1 %4229, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !177 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge461 + %4230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not11.i = icmp eq i32 %4230, 0, !dbg !177 + %4231 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %4232 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %.03.i = select i1 %.not11.i, float %4232, float %4231, !dbg !177 + br label %__nv_log2f.exit, !dbg !177 + +__nv_log2f.exit: ; preds = %._crit_edge461, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %4228, %._crit_edge461 ], !dbg !177 + %4233 = fcmp olt float %3786, 0x3810000000000000, !dbg !177 + %4234 = fmul float %3786, 0x4160000000000000, !dbg !177 + %.02.i358 = select i1 %4233, float %4234, float %3786, !dbg !177 + %i.i.0.i359 = select i1 %4233, float -2.300000e+01, float 0.000000e+00, !dbg !177 + %4235 = bitcast float %.02.i358 to i32, !dbg !177 + %4236 = add i32 %4235, -1060439283, !dbg !177 + %4237 = and i32 %4236, -8388608, !dbg !177 + %4238 = sub i32 %4235, %4237, !dbg !177 + %4239 = bitcast i32 %4238 to float, !dbg !177 + %4240 = sitofp i32 %4237 to float, !dbg !177 + %4241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not.i360 = icmp eq i32 %4241, 0, !dbg !177 + %4242 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4240, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !177 + %4243 = tail call float @llvm.nvvm.fma.rn.f(float %4240, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !177 + %.08.i361 = select i1 %.not.i360, float %4243, float %4242, !dbg !177 + %4244 = fadd float %4239, -1.000000e+00, !dbg !177 + %4245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not1.i362 = icmp eq i32 %4245, 0, !dbg !177 + %4246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4244, float 0xBFC58FE600000000) #2, !dbg !177 + %4247 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4244, float 0xBFC58FE600000000) #2, !dbg !177 + %.010.i363 = select i1 %.not1.i362, float %4247, float %4246, !dbg !177 + %4248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not2.i364 = icmp eq i32 %4248, 0, !dbg !177 + %4249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i363, float %4244, float 0x3FC5F9E540000000) #2, !dbg !177 + %4250 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i363, float %4244, float 0x3FC5F9E540000000) #2, !dbg !177 + %.011.i365 = select i1 %.not2.i364, float %4250, float %4249, !dbg !177 + %4251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not3.i366 = icmp eq i32 %4251, 0, !dbg !177 + %4252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i365, float %4244, float 0xBFC6E9C860000000) #2, !dbg !177 + %4253 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i365, float %4244, float 0xBFC6E9C860000000) #2, !dbg !177 + %.012.i367 = select i1 %.not3.i366, float %4253, float %4252, !dbg !177 + %4254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not4.i368 = icmp eq i32 %4254, 0, !dbg !177 + %4255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i367, float %4244, float 0x3FCA417E80000000) #2, !dbg !177 + %4256 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i367, float %4244, float 0x3FCA417E80000000) #2, !dbg !177 + %.09.i369 = select i1 %.not4.i368, float %4256, float %4255, !dbg !177 + %4257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not5.i370 = icmp eq i32 %4257, 0, !dbg !177 + %4258 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i369, float %4244, float 0xBFCEC79160000000) #2, !dbg !177 + %4259 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i369, float %4244, float 0xBFCEC79160000000) #2, !dbg !177 + %.05.i371 = select i1 %.not5.i370, float %4259, float %4258, !dbg !177 + %4260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not6.i372 = icmp eq i32 %4260, 0, !dbg !177 + %4261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i371, float %4244, float 0x3FD277F320000000) #2, !dbg !177 + %4262 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i371, float %4244, float 0x3FD277F320000000) #2, !dbg !177 + %.01.i373 = select i1 %.not6.i372, float %4262, float %4261, !dbg !177 + %4263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not7.i374 = icmp eq i32 %4263, 0, !dbg !177 + %4264 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i373, float %4244, float 0xBFD7154920000000) #2, !dbg !177 + %4265 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i373, float %4244, float 0xBFD7154920000000) #2, !dbg !177 + %.0.i375 = select i1 %.not7.i374, float %4265, float %4264, !dbg !177 + %4266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not8.i376 = icmp eq i32 %4266, 0, !dbg !177 + %4267 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i375, float %4244, float 0x3FDEC70940000000) #2, !dbg !177 + %4268 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i375, float %4244, float 0x3FDEC70940000000) #2, !dbg !177 + %.07.i377 = select i1 %.not8.i376, float %4268, float %4267, !dbg !177 + %4269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not9.i378 = icmp eq i32 %4269, 0, !dbg !177 + %4270 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i377, float %4244, float 0xBFE7154760000000) #2, !dbg !177 + %4271 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i377, float %4244, float 0xBFE7154760000000) #2, !dbg !177 + %.06.i379 = select i1 %.not9.i378, float %4271, float %4270, !dbg !177 + %4272 = fmul float %4244, %.06.i379, !dbg !177 + %4273 = fmul float %4244, %4272, !dbg !177 + %4274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not10.i380 = icmp eq i32 %4274, 0, !dbg !177 + %4275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4244, float 0x3FF7154760000000, float %4273) #2, !dbg !177 + %4276 = tail call float @llvm.nvvm.fma.rn.f(float %4244, float 0x3FF7154760000000, float %4273) #2, !dbg !177 + %.04.i381 = select i1 %.not10.i380, float %4276, float %4275, !dbg !177 + %4277 = fadd float %.08.i361, %.04.i381, !dbg !177 + %4278 = icmp ugt i32 %4235, 2139095039, !dbg !177 + br i1 %4278, label %__nv_fmaf_rn.exit.i.i384, label %__nv_log2f.exit387, !dbg !177 + +__nv_fmaf_rn.exit.i.i384: ; preds = %__nv_log2f.exit + %4279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not11.i385 = icmp eq i32 %4279, 0, !dbg !177 + %4280 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %4281 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %.03.i386 = select i1 %.not11.i385, float %4281, float %4280, !dbg !177 + br label %__nv_log2f.exit387, !dbg !177 + +__nv_log2f.exit387: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i384 + %r.i.0.i382 = phi float [ %.03.i386, %__nv_fmaf_rn.exit.i.i384 ], [ %4277, %__nv_log2f.exit ], !dbg !177 + %4282 = insertelement <2 x float> poison, float %.02.i, i64 0, !dbg !177 + %4283 = insertelement <2 x float> %4282, float %.02.i358, i64 1, !dbg !177 + %4284 = fcmp oeq <2 x float> %4283, zeroinitializer, !dbg !177 + %4285 = shl nuw i32 %17, 16, !dbg !178 + %4286 = shl nuw nsw i32 %18, 11, !dbg !178 + %4287 = add i32 %4285, %4286, !dbg !178 + %4288 = sext i32 %4287 to i64, !dbg !179 + %4289 = getelementptr float, ptr addrspace(1) %3, i64 %4288, !dbg !179 + %4290 = and i32 %34, 127, !dbg !25 + %4291 = or disjoint i32 %33, %4290, !dbg !26 + %4292 = sext i32 %4291 to i64, !dbg !180 + %4293 = getelementptr float, ptr addrspace(1) %4289, i64 %4292, !dbg !180 + %4294 = insertelement <2 x float> poison, float %r.i.0.i, i64 0, !dbg !177 + %4295 = insertelement <2 x float> %4294, float %r.i.0.i382, i64 1, !dbg !177 + %4296 = select <2 x i1> %4284, <2 x float> splat (float 0xFFF0000000000000), <2 x float> %4295, !dbg !177 + %4297 = fadd <2 x float> %3781, %4296, !dbg !181 + %4298 = icmp slt i32 %4291, 2048, !dbg !182 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !183 + %4299 = shl nuw nsw i32 %4124, 1, !dbg !183 + %4300 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4299, !dbg !183 + store <2 x float> %4297, ptr addrspace(3) %4300, align 8, !dbg !183 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !183 + %4301 = shl nuw nsw i32 %137, 3, !dbg !183 + %4302 = shl nuw nsw i32 %140, 2, !dbg !183 + %4303 = lshr exact i32 %141, 1, !dbg !183 + %4304 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4301, !dbg !183 + %4305 = getelementptr inbounds nuw i8, ptr addrspace(3) %4304, i32 %4302, !dbg !183 + %4306 = getelementptr inbounds nuw i8, ptr addrspace(3) %4305, i32 %4303, !dbg !183 + %4307 = load i32, ptr addrspace(3) %4306, align 4, !dbg !183 + %4308 = and i32 %34, 128, !dbg !183 + %4309 = icmp eq i32 %4308, 0, !dbg !183 + %4310 = and i1 %4309, %4298, !dbg !183 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %4307, ptr addrspace(1) %4293, i1 %4310) #2, !dbg !183 + ret void, !dbg !184 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #6 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #6 = { convergent nounwind } +attributes #7 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_0", linkageName: "triton_tem_fused_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 97, column: 28, scope: !5) +!9 = !DILocation(line: 98, column: 27, scope: !5) +!10 = !DILocation(line: 99, column: 27, scope: !5) +!11 = !DILocation(line: 103, column: 23, scope: !5) +!12 = !DILocation(line: 107, column: 24, scope: !5) +!13 = !DILocation(line: 107, column: 45, scope: !5) +!14 = !DILocation(line: 107, column: 36, scope: !5) +!15 = !DILocation(line: 108, column: 25, scope: !5) +!16 = !DILocation(line: 108, column: 47, scope: !5) +!17 = !DILocation(line: 108, column: 37, scope: !5) +!18 = !DILocation(line: 111, column: 12, scope: !5) +!19 = !DILocation(line: 112, column: 12, scope: !5) +!20 = !DILocation(line: 113, column: 12, scope: !5) +!21 = !DILocation(line: 142, column: 51, scope: !5) +!22 = !DILocation(line: 142, column: 74, scope: !5) +!23 = !DILocation(line: 143, column: 64, scope: !5) +!24 = !DILocation(line: 144, column: 23, scope: !5) +!25 = !DILocation(line: 144, column: 46, scope: !5) +!26 = !DILocation(line: 144, column: 33, scope: !5) +!27 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !29) +!28 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!29 = !DILocation(line: 146, column: 101, scope: !5) +!30 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !29) +!31 = !DILocation(line: 284, column: 56, scope: !28, inlinedAt: !29) +!32 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !29) +!33 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !29) +!34 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !29) +!35 = !DILocation(line: 151, column: 26, scope: !5) +!36 = !DILocation(line: 152, column: 23, scope: !5) +!37 = !DILocation(line: 152, column: 37, scope: !5) +!38 = !DILocation(line: 153, column: 42, scope: !5) +!39 = !DILocation(line: 153, column: 28, scope: !5) +!40 = !DILocation(line: 154, column: 45, scope: !5) +!41 = !DILocation(line: 41, column: 22, scope: !42, inlinedAt: !44) +!42 = distinct !DILexicalBlockFile(scope: !5, file: !43, discriminator: 0) +!43 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!44 = !DILocation(line: 154, column: 92, scope: !5) +!45 = !DILocation(line: 41, column: 28, scope: !42, inlinedAt: !44) +!46 = !DILocation(line: 154, column: 102, scope: !5) +!47 = !DILocation(line: 154, column: 65, scope: !5) +!48 = !DILocation(line: 159, column: 37, scope: !5) +!49 = !DILocation(line: 257, column: 21, scope: !28, inlinedAt: !50) +!50 = !DILocation(line: 172, column: 41, scope: !5) +!51 = !DILocation(line: 376, column: 33, scope: !28, inlinedAt: !50) +!52 = !DILocation(line: 502, column: 40, scope: !28, inlinedAt: !50) +!53 = !DILocation(line: 376, column: 23, scope: !28, inlinedAt: !50) +!54 = !DILocation(line: 378, column: 23, scope: !28, inlinedAt: !50) +!55 = !DILocation(line: 379, column: 23, scope: !28, inlinedAt: !50) +!56 = !DILocation(line: 346, column: 35, scope: !28, inlinedAt: !50) +!57 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !50) +!58 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !50) +!59 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !50) +!60 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !50) +!61 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !50) +!62 = !DILocation(line: 342, column: 32, scope: !28, inlinedAt: !50) +!63 = !DILocation(line: 351, column: 19, scope: !28, inlinedAt: !50) +!64 = !DILocation(line: 159, column: 24, scope: !5) +!65 = !DILocation(line: 395, column: 24, scope: !28, inlinedAt: !50) +!66 = !DILocation(line: 353, column: 14, scope: !28, inlinedAt: !50) +!67 = !DILocation(line: 367, column: 44, scope: !28, inlinedAt: !50) +!68 = !DILocation(line: 393, column: 39, scope: !28, inlinedAt: !50) +!69 = !DILocation(line: 373, column: 23, scope: !28, inlinedAt: !50) +!70 = !DILocation(line: 381, column: 23, scope: !28, inlinedAt: !50) +!71 = !DILocation(line: 384, column: 24, scope: !28, inlinedAt: !50) +!72 = !DILocation(line: 394, column: 25, scope: !28, inlinedAt: !50) +!73 = !DILocation(line: 396, column: 24, scope: !28, inlinedAt: !50) +!74 = !DILocation(line: 397, column: 23, scope: !28, inlinedAt: !50) +!75 = !DILocation(line: 404, column: 39, scope: !28, inlinedAt: !50) +!76 = !DILocation(line: 405, column: 25, scope: !28, inlinedAt: !50) +!77 = !DILocation(line: 406, column: 24, scope: !28, inlinedAt: !50) +!78 = !DILocation(line: 407, column: 24, scope: !28, inlinedAt: !50) +!79 = !DILocation(line: 412, column: 73, scope: !28, inlinedAt: !50) +!80 = !DILocation(line: 417, column: 27, scope: !28, inlinedAt: !50) +!81 = !DILocation(line: 414, column: 69, scope: !28, inlinedAt: !50) +!82 = !DILocation(line: 168, column: 27, scope: !42, inlinedAt: !50) +!83 = !DILocation(line: 189, column: 40, scope: !42, inlinedAt: !50) +!84 = !DILocation(line: 421, column: 27, scope: !28, inlinedAt: !50) +!85 = !DILocation(line: 424, column: 51, scope: !28, inlinedAt: !50) +!86 = !DILocation(line: 423, column: 35, scope: !28, inlinedAt: !50) +!87 = !DILocation(line: 428, column: 31, scope: !28, inlinedAt: !50) +!88 = !DILocation(line: 428, column: 25, scope: !28, inlinedAt: !50) +!89 = !DILocation(line: 429, column: 39, scope: !28, inlinedAt: !50) +!90 = !DILocation(line: 429, column: 21, scope: !28, inlinedAt: !50) +!91 = !DILocation(line: 434, column: 16, scope: !28, inlinedAt: !50) +!92 = !DILocation(line: 261, column: 15, scope: !42, inlinedAt: !50) +!93 = !DILocation(line: 291, column: 36, scope: !42, inlinedAt: !50) +!94 = !DILocation(line: 434, column: 24, scope: !28, inlinedAt: !50) +!95 = !DILocation(line: 436, column: 16, scope: !28, inlinedAt: !50) +!96 = !DILocation(line: 440, column: 22, scope: !28, inlinedAt: !50) +!97 = !DILocation(line: 440, column: 44, scope: !28, inlinedAt: !50) +!98 = !DILocation(line: 548, column: 26, scope: !28, inlinedAt: !50) +!99 = !DILocation(line: 247, column: 33, scope: !28, inlinedAt: !50) +!100 = !DILocation(line: 248, column: 38, scope: !28, inlinedAt: !50) +!101 = !DILocation(line: 248, column: 24, scope: !28, inlinedAt: !50) +!102 = !DILocation(line: 249, column: 109, scope: !28, inlinedAt: !50) +!103 = !DILocation(line: 249, column: 113, scope: !28, inlinedAt: !50) +!104 = !DILocation(line: 249, column: 55, scope: !28, inlinedAt: !50) +!105 = !DILocation(line: 249, column: 25, scope: !28, inlinedAt: !50) +!106 = !DILocation(line: 250, column: 35, scope: !28, inlinedAt: !50) +!107 = !DILocation(line: 251, column: 34, scope: !28, inlinedAt: !50) +!108 = !DILocation(line: 251, column: 48, scope: !28, inlinedAt: !50) +!109 = !DILocation(line: 251, column: 63, scope: !28, inlinedAt: !50) +!110 = !DILocation(line: 252, column: 29, scope: !28, inlinedAt: !50) +!111 = !DILocation(line: 252, column: 61, scope: !28, inlinedAt: !50) +!112 = !DILocation(line: 252, column: 42, scope: !28, inlinedAt: !50) +!113 = !DILocation(line: 549, column: 21, scope: !28, inlinedAt: !50) +!114 = !DILocation(line: 136, column: 19, scope: !5) +!115 = !DILocation(line: 181, column: 35, scope: !5) +!116 = !DILocation(line: 182, column: 27, scope: !5) +!117 = !DILocation(line: 182, column: 41, scope: !5) +!118 = !DILocation(line: 183, column: 51, scope: !5) +!119 = !DILocation(line: 183, column: 32, scope: !5) +!120 = !DILocation(line: 184, column: 49, scope: !5) +!121 = !DILocation(line: 184, column: 69, scope: !5) +!122 = !DILocation(line: 502, column: 40, scope: !28, inlinedAt: !123) +!123 = !DILocation(line: 198, column: 45, scope: !5) +!124 = !DILocation(line: 346, column: 35, scope: !28, inlinedAt: !123) +!125 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !123) +!126 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !123) +!127 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !123) +!128 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !123) +!129 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !123) +!130 = !DILocation(line: 342, column: 32, scope: !28, inlinedAt: !123) +!131 = !DILocation(line: 351, column: 19, scope: !28, inlinedAt: !123) +!132 = !DILocation(line: 186, column: 28, scope: !5) +!133 = !DILocation(line: 353, column: 14, scope: !28, inlinedAt: !123) +!134 = !DILocation(line: 367, column: 44, scope: !28, inlinedAt: !123) +!135 = !DILocation(line: 417, column: 27, scope: !28, inlinedAt: !123) +!136 = !DILocation(line: 367, column: 69, scope: !28, inlinedAt: !123) +!137 = !DILocation(line: 168, column: 27, scope: !42, inlinedAt: !123) +!138 = !DILocation(line: 189, column: 40, scope: !42, inlinedAt: !123) +!139 = !DILocation(line: 421, column: 27, scope: !28, inlinedAt: !123) +!140 = !DILocation(line: 424, column: 51, scope: !28, inlinedAt: !123) +!141 = !DILocation(line: 423, column: 35, scope: !28, inlinedAt: !123) +!142 = !DILocation(line: 428, column: 31, scope: !28, inlinedAt: !123) +!143 = !DILocation(line: 428, column: 25, scope: !28, inlinedAt: !123) +!144 = !DILocation(line: 429, column: 39, scope: !28, inlinedAt: !123) +!145 = !DILocation(line: 429, column: 21, scope: !28, inlinedAt: !123) +!146 = !DILocation(line: 434, column: 16, scope: !28, inlinedAt: !123) +!147 = !DILocation(line: 261, column: 15, scope: !42, inlinedAt: !123) +!148 = !DILocation(line: 291, column: 36, scope: !42, inlinedAt: !123) +!149 = !DILocation(line: 434, column: 24, scope: !28, inlinedAt: !123) +!150 = !DILocation(line: 436, column: 16, scope: !28, inlinedAt: !123) +!151 = !DILocation(line: 440, column: 22, scope: !28, inlinedAt: !123) +!152 = !DILocation(line: 440, column: 44, scope: !28, inlinedAt: !123) +!153 = !DILocation(line: 548, column: 26, scope: !28, inlinedAt: !123) +!154 = !DILocation(line: 247, column: 33, scope: !28, inlinedAt: !123) +!155 = !DILocation(line: 248, column: 38, scope: !28, inlinedAt: !123) +!156 = !DILocation(line: 248, column: 24, scope: !28, inlinedAt: !123) +!157 = !DILocation(line: 249, column: 109, scope: !28, inlinedAt: !123) +!158 = !DILocation(line: 249, column: 113, scope: !28, inlinedAt: !123) +!159 = !DILocation(line: 249, column: 55, scope: !28, inlinedAt: !123) +!160 = !DILocation(line: 249, column: 25, scope: !28, inlinedAt: !123) +!161 = !DILocation(line: 250, column: 35, scope: !28, inlinedAt: !123) +!162 = !DILocation(line: 251, column: 34, scope: !28, inlinedAt: !123) +!163 = !DILocation(line: 251, column: 48, scope: !28, inlinedAt: !123) +!164 = !DILocation(line: 251, column: 63, scope: !28, inlinedAt: !123) +!165 = !DILocation(line: 252, column: 29, scope: !28, inlinedAt: !123) +!166 = !DILocation(line: 252, column: 61, scope: !28, inlinedAt: !123) +!167 = !DILocation(line: 252, column: 42, scope: !28, inlinedAt: !123) +!168 = !DILocation(line: 549, column: 21, scope: !28, inlinedAt: !123) +!169 = !DILocation(line: 206, column: 26, scope: !5) +!170 = !DILocation(line: 206, column: 34, scope: !5) +!171 = !DILocation(line: 208, column: 16, scope: !5) +!172 = !DILocation(line: 218, column: 49, scope: !5) +!173 = !DILocation(line: 218, column: 62, scope: !5) +!174 = !DILocation(line: 218, column: 75, scope: !5) +!175 = !DILocation(line: 218, column: 25, scope: !5) +!176 = !DILocation(line: 218, column: 109, scope: !5) +!177 = !DILocation(line: 223, column: 33, scope: !5) +!178 = !DILocation(line: 222, column: 32, scope: !5) +!179 = !DILocation(line: 222, column: 23, scope: !5) +!180 = !DILocation(line: 222, column: 40, scope: !5) +!181 = !DILocation(line: 223, column: 20, scope: !5) +!182 = !DILocation(line: 227, column: 48, scope: !5) +!183 = !DILocation(line: 227, column: 29, scope: !5) +!184 = !DILocation(line: 229, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c6b0bc6fc3df428ad47e28d7c8a9963c06aa1daa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ptx @@ -0,0 +1,3420 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_0 // -- Begin function triton_tem_fused_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_0 +.visible .entry triton_tem_fused_0( + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_10, + .param .u32 triton_tem_fused_0_param_11, + .param .u32 triton_tem_fused_0_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_14 +) +.reqntid 256 +{ + .reg .pred %p<387>; + .reg .b16 %rs<70>; + .reg .b32 %r<5064>; + .reg .b64 %rd<293>; + .loc 1 18 0 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:18:0 + +// %bb.0: + ld.param.b32 %r712, [triton_tem_fused_0_param_11]; + ld.param.b64 %rd37, [triton_tem_fused_0_param_8]; + ld.param.b64 %rd36, [triton_tem_fused_0_param_7]; + ld.param.b64 %rd68, [triton_tem_fused_0_param_0]; +$L__tmp0: + .loc 1 97 28 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:97:28 + mov.u32 %r812, %ctaid.x; + ld.param.b64 %rd69, [triton_tem_fused_0_param_1]; + .loc 1 98 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:98:27 + mov.u32 %r1, %ctaid.y; + ld.param.b64 %rd70, [triton_tem_fused_0_param_2]; + .loc 1 99 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:99:27 + mov.u32 %r2, %ctaid.z; + .loc 1 103 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:103:23 + and.b32 %r813, %r1, 1; + .loc 1 107 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:107:24 + shl.b32 %r3, %r1, 23; + ld.param.b64 %rd71, [triton_tem_fused_0_param_5]; + .loc 1 107 45 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:107:45 + shl.b32 %r4, %r2, 7; + ld.param.b64 %rd72, [triton_tem_fused_0_param_6]; + .loc 1 107 36 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:107:36 + or.b32 %r814, %r3, %r4; + .loc 1 108 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:108:25 + shl.b32 %r815, %r813, 10; + .loc 1 108 47 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:108:47 + shl.b32 %r816, %r2, 5; + ld.param.b64 %rd73, [triton_tem_fused_0_param_9]; + and.b32 %r817, %r816, 2097024; + .loc 1 108 37 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:108:37 + add.s32 %r818, %r815, %r817; + mul.lo.s32 %r819, %r818, %r712; + ld.param.b32 %r820, [triton_tem_fused_0_param_12]; + .loc 1 111 12 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:111:12 + mad.wide.s32 %rd74, %r814, 2, %rd68; + .loc 1 112 12 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:112:12 + mul.wide.s32 %rd75, %r819, 2; + add.s64 %rd1, %rd69, %rd75; + .loc 1 113 12 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:113:12 + add.s64 %rd2, %rd70, %rd75; + .loc 1 142 51 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:142:51 + shl.b32 %r821, %r813, 4; + .loc 1 142 74 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:142:74 + add.s32 %r822, %r821, %r812; + .loc 1 143 64 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:143:64 + mul.lo.s32 %r823, %r822, %r820; + .loc 1 144 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:144:23 + shl.b32 %r5, %r812, 7; + .loc 1 144 46 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:144:46 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r824, %r6, 240; + bfe.u32 %r8, %r6, 4, 4; + or.b32 %r9, %r8, 16; + or.b32 %r10, %r8, 32; + or.b32 %r11, %r8, 48; + .loc 1 144 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:144:33 + or.b32 %r12, %r8, %r5; + or.b32 %r13, %r9, %r5; + or.b32 %r14, %r10, %r5; + or.b32 %r15, %r11, %r5; + or.b32 %r16, %r12, 64; + or.b32 %r17, %r12, 80; + or.b32 %r18, %r12, 96; + or.b32 %r19, %r12, 112; +$L__tmp1: + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + shl.b32 %r20, %r12, 12; + shl.b32 %r21, %r13, 12; + shl.b32 %r22, %r14, 12; + shl.b32 %r23, %r15, 12; + shl.b32 %r24, %r16, 12; + shl.b32 %r25, %r17, 12; + shl.b32 %r26, %r18, 12; + shl.b32 %r27, %r19, 12; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + mad.wide.s32 %rd76, %r20, 2, %rd74; + mad.wide.s32 %rd77, %r21, 2, %rd74; + mad.wide.s32 %rd78, %r22, 2, %rd74; + mad.wide.s32 %rd79, %r23, 2, %rd74; + mad.wide.s32 %rd80, %r24, 2, %rd74; + mad.wide.s32 %rd81, %r25, 2, %rd74; + mad.wide.s32 %rd82, %r26, 2, %rd74; + mad.wide.s32 %rd83, %r27, 2, %rd74; + .loc 1 284 56 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:56 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + shl.b32 %r829, %r6, 3; + and.b32 %r830, %r829, 120; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + cvt.u64.u32 %rd3, %r830; + mul.wide.u32 %rd84, %r830, 2; + add.s64 %rd39, %rd76, %rd84; + add.s64 %rd40, %rd77, %rd84; + add.s64 %rd41, %rd78, %rd84; + add.s64 %rd42, %rd79, %rd84; + add.s64 %rd43, %rd80, %rd84; + add.s64 %rd44, %rd81, %rd84; + add.s64 %rd45, %rd82, %rd84; + add.s64 %rd46, %rd83, %rd84; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + setp.lt.s32 %p1, %r12, 2048; + setp.lt.s32 %p2, %r13, 2048; + setp.lt.s32 %p3, %r14, 2048; + setp.lt.s32 %p4, %r15, 2048; + setp.lt.s32 %p5, %r16, 2048; + setp.lt.s32 %p6, %r17, 2048; + setp.lt.s32 %p7, %r18, 2048; + setp.lt.s32 %p8, %r19, 2048; + mov.b32 %r4826, 0; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:146:101 ] + // begin inline asm + mov.u32 %r713, %r4826; + mov.u32 %r714, %r4826; + mov.u32 %r715, %r4826; + mov.u32 %r716, %r4826; + @%p1 ld.global.v4.b32 { %r713, %r714, %r715, %r716 }, [ %rd39 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r721, %r4826; + mov.u32 %r722, %r4826; + mov.u32 %r723, %r4826; + mov.u32 %r724, %r4826; + @%p2 ld.global.v4.b32 { %r721, %r722, %r723, %r724 }, [ %rd40 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r729, %r4826; + mov.u32 %r730, %r4826; + mov.u32 %r731, %r4826; + mov.u32 %r732, %r4826; + @%p3 ld.global.v4.b32 { %r729, %r730, %r731, %r732 }, [ %rd41 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r737, %r4826; + mov.u32 %r738, %r4826; + mov.u32 %r739, %r4826; + mov.u32 %r740, %r4826; + @%p4 ld.global.v4.b32 { %r737, %r738, %r739, %r740 }, [ %rd42 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r745, %r4826; + mov.u32 %r746, %r4826; + mov.u32 %r747, %r4826; + mov.u32 %r748, %r4826; + @%p5 ld.global.v4.b32 { %r745, %r746, %r747, %r748 }, [ %rd43 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r753, %r4826; + mov.u32 %r754, %r4826; + mov.u32 %r755, %r4826; + mov.u32 %r756, %r4826; + @%p6 ld.global.v4.b32 { %r753, %r754, %r755, %r756 }, [ %rd44 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r761, %r4826; + mov.u32 %r762, %r4826; + mov.u32 %r763, %r4826; + mov.u32 %r764, %r4826; + @%p7 ld.global.v4.b32 { %r761, %r762, %r763, %r764 }, [ %rd45 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r769, %r4826; + mov.u32 %r770, %r4826; + mov.u32 %r771, %r4826; + mov.u32 %r772, %r4826; + @%p8 ld.global.v4.b32 { %r769, %r770, %r771, %r772 }, [ %rd46 + 0 ]; + // end inline asm + and.b32 %r28, %r6, 7; + shl.b32 %r831, %r28, 4; + shl.b32 %r832, %r824, 3; + and.b32 %r29, %r6, 112; + and.b32 %r30, %r6, 8; + shl.b32 %r833, %r30, 11; + or.b32 %r834, %r831, %r832; + xor.b32 %r835, %r834, %r29; + mov.b32 %r836, global_smem; + add.s32 %r837, %r836, %r833; + add.s32 %r838, %r837, %r835; + st.shared.v4.b32 [%r838+98304], {%r713, %r714, %r715, %r716}; + st.shared.v4.b32 [%r838+100352], {%r721, %r722, %r723, %r724}; + st.shared.v4.b32 [%r838+102400], {%r729, %r730, %r731, %r732}; + st.shared.v4.b32 [%r838+104448], {%r737, %r738, %r739, %r740}; + st.shared.v4.b32 [%r838+106496], {%r745, %r746, %r747, %r748}; + st.shared.v4.b32 [%r838+108544], {%r753, %r754, %r755, %r756}; + st.shared.v4.b32 [%r838+110592], {%r761, %r762, %r763, %r764}; + st.shared.v4.b32 [%r838+112640], {%r769, %r770, %r771, %r772}; +$L__tmp2: + .loc 1 151 26 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:151:26 + cvt.s64.s32 %rd4, %r823; + mad.wide.s32 %rd47, %r823, 4, %rd72; + .loc 1 152 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:152:23 + // begin inline asm + mov.u32 %r777, 0x0; + ld.global.b32 { %r777 }, [ %rd47 + 0 ]; + // end inline asm + .loc 1 152 37 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:152:37 + shl.b32 %r31, %r777, 7; + .loc 1 153 42 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:153:42 + cvt.s64.s32 %rd6, %r822; + mad.wide.s32 %rd48, %r822, 4, %rd71; + .loc 1 153 28 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:153:28 + // begin inline asm + mov.u32 %r778, 0x0; + ld.global.b32 { %r778 }, [ %rd48 + 0 ]; + // end inline asm + .loc 1 154 45 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:154:45 + shl.b32 %r839, %r778, 1; +$L__tmp3: + .loc 2 41 22 // standard.py:41:22 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:154:92 ] + add.s32 %r840, %r712, 63; + .loc 2 41 28 // standard.py:41:28 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:154:92 ] + shr.s32 %r841, %r840, 31; + shr.u32 %r842, %r841, 26; + add.s32 %r843, %r840, %r842; + shr.s32 %r844, %r843, 6; +$L__tmp4: + .loc 1 154 102 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:154:102 + max.s32 %r33, %r844, 1; + .loc 1 154 65 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:154:65 + min.s32 %r34, %r839, %r33; + .loc 1 159 37 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:159:37 + and.b32 %r35, %r6, 3; + shl.b32 %r36, %r35, 1; + or.b32 %r37, %r36, 1; + or.b32 %r39, %r36, 8; + or.b32 %r38, %r36, 9; + or.b32 %r43, %r36, 16; + or.b32 %r42, %r36, 17; + or.b32 %r41, %r36, 24; + or.b32 %r40, %r36, 25; + or.b32 %r51, %r36, 32; + or.b32 %r50, %r36, 33; + or.b32 %r49, %r36, 40; + or.b32 %r48, %r36, 41; + or.b32 %r47, %r36, 48; + or.b32 %r46, %r36, 49; + or.b32 %r45, %r36, 56; + or.b32 %r44, %r36, 57; +$L__tmp5: + .loc 1 376 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:376:33 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mad.wide.u32 %rd50, %r1, 8, %rd73; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p10, %r839, 1; + setp.gt.s32 %p9, %r839, 0; + .loc 1 376 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:376:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + // begin inline asm + mov.u64 %rd7, 0x0; + @%p9 ld.global.b64 { %rd7 }, [ %rd50 + 0 ]; + // end inline asm + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + or.b32 %r853, %r31, %r8; + or.b32 %r854, %r31, %r9; + or.b32 %r855, %r31, %r10; + or.b32 %r856, %r31, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r857, %r853, 7; + shl.b32 %r858, %r854, 7; + shl.b32 %r859, %r855, 7; + shl.b32 %r860, %r856, 7; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.wide.s32 %rd85, %r857, 2; + mul.wide.s32 %rd86, %r858, 2; + mul.wide.s32 %rd87, %r859, 2; + mul.wide.s32 %rd88, %r860, 2; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd10, %rd1, %rd84; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd51, %rd10, %rd85; + add.s64 %rd52, %rd10, %rd86; + add.s64 %rd53, %rd10, %rd87; + add.s64 %rd54, %rd10, %rd88; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p11, %r853, %r712; + setp.lt.s32 %p12, %r854, %r712; + setp.lt.s32 %p13, %r855, %r712; + setp.lt.s32 %p14, %r856, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r861, %r30, 10; + or.b32 %r84, %r835, %r861; + add.s32 %r2726, %r836, %r84; + selp.b32 %r862, 16, 0, %p11; + selp.b32 %r788, %r862, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2726 + 0 ], [ %rd51 + 0 ], 0x10, %r788; + // end inline asm + add.s32 %r2728, %r2726, 2048; + selp.b32 %r863, 16, 0, %p12; + selp.b32 %r790, %r863, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2728 + 0 ], [ %rd52 + 0 ], 0x10, %r790; + // end inline asm + add.s32 %r2730, %r2726, 4096; + selp.b32 %r864, 16, 0, %p13; + selp.b32 %r792, %r864, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2730 + 0 ], [ %rd53 + 0 ], 0x10, %r792; + // end inline asm + add.s32 %r2732, %r2726, 6144; + selp.b32 %r865, 16, 0, %p14; + selp.b32 %r794, %r865, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2732 + 0 ], [ %rd54 + 0 ], 0x10, %r794; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd11, %rd2, %rd84; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd55, %rd11, %rd85; + add.s64 %rd56, %rd11, %rd86; + add.s64 %rd57, %rd11, %rd87; + add.s64 %rd58, %rd11, %rd88; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r787, %r2726, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r787 + 0 ], [ %rd55 + 0 ], 0x10, %r788; + // end inline asm + add.s32 %r789, %r2726, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r789 + 0 ], [ %rd56 + 0 ], 0x10, %r790; + // end inline asm + add.s32 %r791, %r2726, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r791 + 0 ], [ %rd57 + 0 ], 0x10, %r792; + // end inline asm + add.s32 %r793, %r2726, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r793 + 0 ], [ %rd58 + 0 ], 0x10, %r794; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.gt.s32 %p15, %r34, 1; + .loc 1 342 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:342:32 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + or.b32 %r866, %r31, 64; + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + or.b32 %r867, %r866, %r8; + or.b32 %r868, %r866, %r9; + or.b32 %r869, %r866, %r10; + or.b32 %r870, %r866, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r871, %r867, 7; + shl.b32 %r872, %r868, 7; + shl.b32 %r873, %r869, 7; + shl.b32 %r874, %r870, 7; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.wide.s32 %rd89, %r871, 2; + mul.wide.s32 %rd90, %r872, 2; + mul.wide.s32 %rd91, %r873, 2; + mul.wide.s32 %rd92, %r874, 2; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd59, %rd10, %rd89; + add.s64 %rd60, %rd10, %rd90; + add.s64 %rd61, %rd10, %rd91; + add.s64 %rd62, %rd10, %rd92; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p16, %r867, %r712; + setp.lt.s32 %p17, %r868, %r712; + setp.lt.s32 %p18, %r869, %r712; + setp.lt.s32 %p19, %r870, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + bar.sync 0; + add.s32 %r795, %r2726, 16384; + selp.b32 %r875, 16, 0, %p16; + selp.b32 %r804, %r875, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r795 + 0 ], [ %rd59 + 0 ], 0x10, %r804; + // end inline asm + add.s32 %r797, %r2726, 18432; + selp.b32 %r876, 16, 0, %p17; + selp.b32 %r806, %r876, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r797 + 0 ], [ %rd60 + 0 ], 0x10, %r806; + // end inline asm + add.s32 %r799, %r2726, 20480; + selp.b32 %r877, 16, 0, %p18; + selp.b32 %r808, %r877, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r799 + 0 ], [ %rd61 + 0 ], 0x10, %r808; + // end inline asm + add.s32 %r801, %r2726, 22528; + selp.b32 %r878, 16, 0, %p19; + selp.b32 %r810, %r878, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r801 + 0 ], [ %rd62 + 0 ], 0x10, %r810; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd63, %rd11, %rd89; + add.s64 %rd64, %rd11, %rd90; + add.s64 %rd65, %rd11, %rd91; + add.s64 %rd66, %rd11, %rd92; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r803, %r2726, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r803 + 0 ], [ %rd63 + 0 ], 0x10, %r804; + // end inline asm + add.s32 %r805, %r2726, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r805 + 0 ], [ %rd64 + 0 ], 0x10, %r806; + // end inline asm + add.s32 %r807, %r2726, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r807 + 0 ], [ %rd65 + 0 ], 0x10, %r808; + // end inline asm + add.s32 %r809, %r2726, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r809 + 0 ], [ %rd66 + 0 ], 0x10, %r810; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:351:19 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r879, 0fFF800000; + mov.b64 %rd292, {%r879, %r879}; + mov.b32 %r1702, 0f00000000; + mov.b32 %r1703, %r1702; + mov.b32 %r1704, %r1702; + mov.b32 %r1705, %r1702; + mov.b32 %r1706, %r1702; + mov.b32 %r1707, %r1702; + mov.b32 %r1708, %r1702; + mov.b32 %r1709, %r1702; + mov.b32 %r1710, %r1702; + mov.b32 %r1711, %r1702; + mov.b32 %r1712, %r1702; + mov.b32 %r1713, %r1702; + mov.b32 %r1714, %r1702; + mov.b32 %r1715, %r1702; + mov.b32 %r1716, %r1702; + mov.b32 %r1717, %r1702; + mov.b32 %r1718, %r1702; + mov.b32 %r1719, %r1702; + mov.b32 %r1720, %r1702; + mov.b32 %r1721, %r1702; + mov.b32 %r1722, %r1702; + mov.b32 %r1723, %r1702; + mov.b32 %r1724, %r1702; + mov.b32 %r1725, %r1702; + mov.b32 %r1726, %r1702; + mov.b32 %r1727, %r1702; + mov.b32 %r1728, %r1702; + mov.b32 %r1729, %r1702; + mov.b32 %r1730, %r1702; + mov.b32 %r1731, %r1702; + mov.b32 %r1732, %r1702; + mov.b32 %r1733, %r1702; + mov.b32 %r1734, %r1702; + mov.b32 %r1735, %r1702; + mov.b32 %r1736, %r1702; + mov.b32 %r1737, %r1702; + mov.b32 %r1738, %r1702; + mov.b32 %r1739, %r1702; + mov.b32 %r1740, %r1702; + mov.b32 %r1741, %r1702; + mov.b32 %r1742, %r1702; + mov.b32 %r1743, %r1702; + mov.b32 %r1744, %r1702; + mov.b32 %r1745, %r1702; + mov.b32 %r1746, %r1702; + mov.b32 %r1747, %r1702; + mov.b32 %r1748, %r1702; + mov.b32 %r1749, %r1702; + mov.b32 %r1750, %r1702; + mov.b32 %r1751, %r1702; + mov.b32 %r1752, %r1702; + mov.b32 %r1753, %r1702; + mov.b32 %r1754, %r1702; + mov.b32 %r1755, %r1702; + mov.b32 %r1756, %r1702; + mov.b32 %r1757, %r1702; + mov.b32 %r1758, %r1702; + mov.b32 %r1759, %r1702; + mov.b32 %r1760, %r1702; + mov.b32 %r1761, %r1702; + mov.b32 %r1762, %r1702; + mov.b32 %r1763, %r1702; + mov.b32 %r1764, %r1702; + mov.b32 %r1765, %r1702; + mov.b32 %r5060, %r1702; + mov.b32 %r5061, %r1702; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + @%p10 bra $L__BB0_3; +$L__tmp6: +// %bb.1: // %.lr.ph + .loc 1 0 0 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0 + shr.u32 %r825, %r6, 1; + and.b32 %r826, %r825, 112; + bfe.u32 %r827, %r6, 2, 3; + or.b32 %r828, %r826, %r827; + or.b32 %r845, %r828, %r5; + or.b32 %r846, %r845, 8; + bfe.s32 %r847, %r812, 24, 1; + shr.u32 %r848, %r847, 21; + add.s32 %r849, %r846, %r848; + and.b32 %r850, %r849, -2048; + sub.s32 %r81, %r846, %r850; + add.s32 %r851, %r845, %r848; + and.b32 %r852, %r851, -2048; + sub.s32 %r83, %r845, %r852; + cvt.s64.s32 %rd8, %r83; + cvt.s64.s32 %rd9, %r81; + .loc 1 159 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:159:24 + or.b32 %r4842, %r31, %r36; + or.b32 %r4841, %r31, %r37; + or.b32 %r4839, %r31, %r38; + or.b32 %r4835, %r31, %r40; + or.b32 %r4827, %r31, %r44; + or.b32 %r4840, %r31, %r39; + or.b32 %r4836, %r31, %r41; + or.b32 %r4828, %r31, %r45; + or.b32 %r4837, %r31, %r42; + or.b32 %r4829, %r31, %r46; + or.b32 %r4838, %r31, %r43; + or.b32 %r4830, %r31, %r47; + or.b32 %r4831, %r31, %r48; + or.b32 %r4832, %r31, %r49; + or.b32 %r4833, %r31, %r50; + or.b32 %r4834, %r31, %r51; + add.s32 %r117, %r34, -2; + add.s32 %r118, %r34, -1; +$L__tmp7: + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + max.s32 %r119, %r34, 1; + mov.b64 %rd292, {%r879, %r879}; + mov.b32 %r5060, 0f00000000; + mov.b32 %r4758, 1; + mov.b32 %r4757, -1; + mov.b32 %r4756, 64; + mov.b32 %r4759, %r4756; + mov.b32 %r5061, %r5060; + mov.b32 %r1702, %r5060; + mov.b32 %r1703, %r5060; + mov.b32 %r1704, %r5060; + mov.b32 %r1705, %r5060; + mov.b32 %r1706, %r5060; + mov.b32 %r1707, %r5060; + mov.b32 %r1708, %r5060; + mov.b32 %r1709, %r5060; + mov.b32 %r1710, %r5060; + mov.b32 %r1711, %r5060; + mov.b32 %r1712, %r5060; + mov.b32 %r1713, %r5060; + mov.b32 %r1714, %r5060; + mov.b32 %r1715, %r5060; + mov.b32 %r1716, %r5060; + mov.b32 %r1717, %r5060; + mov.b32 %r1718, %r5060; + mov.b32 %r1719, %r5060; + mov.b32 %r1720, %r5060; + mov.b32 %r1721, %r5060; + mov.b32 %r1722, %r5060; + mov.b32 %r1723, %r5060; + mov.b32 %r1724, %r5060; + mov.b32 %r1725, %r5060; + mov.b32 %r1726, %r5060; + mov.b32 %r1727, %r5060; + mov.b32 %r1728, %r5060; + mov.b32 %r1729, %r5060; + mov.b32 %r1730, %r5060; + mov.b32 %r1731, %r5060; + mov.b32 %r1732, %r5060; + mov.b32 %r1733, %r5060; + mov.b32 %r1734, %r5060; + mov.b32 %r1735, %r5060; + mov.b32 %r1736, %r5060; + mov.b32 %r1737, %r5060; + mov.b32 %r1738, %r5060; + mov.b32 %r1739, %r5060; + mov.b32 %r1740, %r5060; + mov.b32 %r1741, %r5060; + mov.b32 %r1742, %r5060; + mov.b32 %r1743, %r5060; + mov.b32 %r1744, %r5060; + mov.b32 %r1745, %r5060; + mov.b32 %r1746, %r5060; + mov.b32 %r1747, %r5060; + mov.b32 %r1748, %r5060; + mov.b32 %r1749, %r5060; + mov.b32 %r1750, %r5060; + mov.b32 %r1751, %r5060; + mov.b32 %r1752, %r5060; + mov.b32 %r1753, %r5060; + mov.b32 %r1754, %r5060; + mov.b32 %r1755, %r5060; + mov.b32 %r1756, %r5060; + mov.b32 %r1757, %r5060; + mov.b32 %r1758, %r5060; + mov.b32 %r1759, %r5060; + mov.b32 %r1760, %r5060; + mov.b32 %r1761, %r5060; + mov.b32 %r1762, %r5060; + mov.b32 %r1763, %r5060; + mov.b32 %r1764, %r5060; + mov.b32 %r1765, %r5060; +$L__BB0_2: // %__nv_exp2f.exit256 + // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0:40 + cvt.u32.u64 %r2116, %rd9; + cvt.u32.u64 %r2117, %rd8; + .loc 1 379 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:379:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.gt.s64 %p33, %rd7, %rd9; + setp.gt.s64 %p34, %rd7, %rd8; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p35, %r4826, %r117; + setp.lt.s32 %p31, %r4826, %r118; + add.s32 %r2118, %r4757, 1; + setp.gt.s32 %p36, %r2118, 2; + selp.b32 %r4757, 0, %r2118, %p36; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r2119, %r4757, 14; + add.s32 %r1401, %r836, %r2119; + .loc 1 351 19 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:351:19 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.idx.b32 %r2121, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r2122, %r2121, 11; + and.b32 %r2123, %r2122, 8192; + add.s32 %r1398, %r836, 98304; + add.s32 %r2124, %r2123, %r1398; + bfe.u32 %r2125, %r2124, 4, 14; + cvt.u64.u32 %rd128, %r2125; + or.b64 %rd94, %rd128, 4611686293372403712; + bfe.u32 %r2126, %r1401, 4, 14; + cvt.u64.u32 %rd129, %r2126; + or.b64 %rd95, %rd129, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd94, %rd95, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r2127, %r2124, 32; + bfe.u32 %r2128, %r2127, 4, 14; + cvt.u64.u32 %rd130, %r2128; + or.b64 %rd96, %rd130, 4611686293372403712; + add.s32 %r2129, %r1401, 32; + bfe.u32 %r2130, %r2129, 4, 14; + cvt.u64.u32 %rd131, %r2130; + or.b64 %rd97, %rd131, 4611686293338849280; + mov.pred %p20, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd96, %rd97, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2131, %r2124, 64; + bfe.u32 %r2132, %r2131, 4, 14; + cvt.u64.u32 %rd132, %r2132; + or.b64 %rd98, %rd132, 4611686293372403712; + add.s32 %r2133, %r1401, 64; + bfe.u32 %r2134, %r2133, 4, 14; + cvt.u64.u32 %rd133, %r2134; + or.b64 %rd99, %rd133, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd98, %rd99, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2135, %r2124, 96; + bfe.u32 %r2136, %r2135, 4, 14; + cvt.u64.u32 %rd134, %r2136; + or.b64 %rd100, %rd134, 4611686293372403712; + add.s32 %r2137, %r1401, 96; + bfe.u32 %r2138, %r2137, 4, 14; + cvt.u64.u32 %rd135, %r2138; + or.b64 %rd101, %rd135, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd100, %rd101, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2139, %r2124, 16384; + bfe.u32 %r2140, %r2139, 4, 14; + cvt.u64.u32 %rd136, %r2140; + or.b64 %rd102, %rd136, 4611686293372403712; + add.s32 %r2141, %r1401, 8192; + bfe.u32 %r2142, %r2141, 4, 14; + cvt.u64.u32 %rd137, %r2142; + or.b64 %rd103, %rd137, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd102, %rd103, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2143, %r2124, 16416; + bfe.u32 %r2144, %r2143, 4, 14; + cvt.u64.u32 %rd138, %r2144; + or.b64 %rd104, %rd138, 4611686293372403712; + add.s32 %r2145, %r1401, 8224; + bfe.u32 %r2146, %r2145, 4, 14; + cvt.u64.u32 %rd139, %r2146; + or.b64 %rd105, %rd139, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd104, %rd105, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2147, %r2124, 16448; + bfe.u32 %r2148, %r2147, 4, 14; + cvt.u64.u32 %rd140, %r2148; + or.b64 %rd106, %rd140, 4611686293372403712; + add.s32 %r2149, %r1401, 8256; + bfe.u32 %r2150, %r2149, 4, 14; + cvt.u64.u32 %rd141, %r2150; + or.b64 %rd107, %rd141, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd106, %rd107, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2151, %r2124, 16480; + bfe.u32 %r2152, %r2151, 4, 14; + cvt.u64.u32 %rd142, %r2152; + or.b64 %rd108, %rd142, 4611686293372403712; + add.s32 %r2153, %r1401, 8288; + bfe.u32 %r2154, %r2153, 4, 14; + cvt.u64.u32 %rd143, %r2154; + or.b64 %rd109, %rd143, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd108, %rd109, %p20, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r1399, 0; + mov.b32 %r1400, %r1399; + mov.b32 %r1402, %r1399; + mov.b32 %r1403, %r1399; + // begin inline asm + // wait for regs: %r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013,%r1398,%r1399,%r1400,%r1401,%r1402,%r1403,%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:353:14 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2155, %r982, 0f3DB504F3; + mul.f32 %r2156, %r983, 0f3DB504F3; + mul.f32 %r2157, %r984, 0f3DB504F3; + mul.f32 %r2158, %r985, 0f3DB504F3; + mul.f32 %r2159, %r986, 0f3DB504F3; + mul.f32 %r2160, %r987, 0f3DB504F3; + mul.f32 %r2161, %r988, 0f3DB504F3; + mul.f32 %r2162, %r989, 0f3DB504F3; + mul.f32 %r2163, %r990, 0f3DB504F3; + mul.f32 %r2164, %r991, 0f3DB504F3; + mul.f32 %r2165, %r992, 0f3DB504F3; + mul.f32 %r2166, %r993, 0f3DB504F3; + mul.f32 %r2167, %r994, 0f3DB504F3; + mul.f32 %r2168, %r995, 0f3DB504F3; + mul.f32 %r2169, %r996, 0f3DB504F3; + mul.f32 %r2170, %r997, 0f3DB504F3; + mul.f32 %r2171, %r998, 0f3DB504F3; + mul.f32 %r2172, %r999, 0f3DB504F3; + mul.f32 %r2173, %r1000, 0f3DB504F3; + mul.f32 %r2174, %r1001, 0f3DB504F3; + mul.f32 %r2175, %r1002, 0f3DB504F3; + mul.f32 %r2176, %r1003, 0f3DB504F3; + mul.f32 %r2177, %r1004, 0f3DB504F3; + mul.f32 %r2178, %r1005, 0f3DB504F3; + mul.f32 %r2179, %r1006, 0f3DB504F3; + mul.f32 %r2180, %r1007, 0f3DB504F3; + mul.f32 %r2181, %r1008, 0f3DB504F3; + mul.f32 %r2182, %r1009, 0f3DB504F3; + mul.f32 %r2183, %r1010, 0f3DB504F3; + mul.f32 %r2184, %r1011, 0f3DB504F3; + mul.f32 %r2185, %r1012, 0f3DB504F3; + mul.f32 %r2186, %r1013, 0f3DB504F3; + .loc 1 367 44 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:44 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p37, %r4842, %r712; + setp.lt.s32 %p38, %r4841, %r712; + setp.lt.s32 %p39, %r4840, %r712; + setp.lt.s32 %p40, %r4839, %r712; + setp.lt.s32 %p41, %r4838, %r712; + setp.lt.s32 %p42, %r4837, %r712; + setp.lt.s32 %p43, %r4836, %r712; + setp.lt.s32 %p44, %r4835, %r712; + setp.lt.s32 %p45, %r4834, %r712; + setp.lt.s32 %p46, %r4833, %r712; + setp.lt.s32 %p47, %r4832, %r712; + setp.lt.s32 %p48, %r4831, %r712; + setp.lt.s32 %p49, %r4830, %r712; + setp.lt.s32 %p50, %r4829, %r712; + setp.lt.s32 %p51, %r4828, %r712; + setp.lt.s32 %p52, %r4827, %r712; + .loc 1 257 21 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:257:21 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + rem.s32 %r2187, %r4827, %r712; + rem.s32 %r2188, %r4828, %r712; + rem.s32 %r2189, %r4829, %r712; + rem.s32 %r2190, %r4830, %r712; + rem.s32 %r2191, %r4831, %r712; + rem.s32 %r2192, %r4832, %r712; + rem.s32 %r2193, %r4833, %r712; + rem.s32 %r2194, %r4834, %r712; + rem.s32 %r2195, %r4835, %r712; + rem.s32 %r2196, %r4836, %r712; + rem.s32 %r2197, %r4837, %r712; + rem.s32 %r2198, %r4838, %r712; + rem.s32 %r2199, %r4839, %r712; + rem.s32 %r2200, %r4840, %r712; + rem.s32 %r2201, %r4841, %r712; + rem.s32 %r2202, %r4842, %r712; + .loc 1 373 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:373:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.ge.s32 %p53, %r2117, %r2202; + setp.ge.s32 %p54, %r2117, %r2201; + setp.ge.s32 %p55, %r2116, %r2202; + setp.ge.s32 %p56, %r2116, %r2201; + setp.ge.s32 %p57, %r2117, %r2200; + setp.ge.s32 %p58, %r2117, %r2199; + setp.ge.s32 %p59, %r2116, %r2200; + setp.ge.s32 %p60, %r2116, %r2199; + setp.ge.s32 %p61, %r2117, %r2198; + setp.ge.s32 %p62, %r2117, %r2197; + setp.ge.s32 %p63, %r2116, %r2198; + setp.ge.s32 %p64, %r2116, %r2197; + setp.ge.s32 %p65, %r2117, %r2196; + setp.ge.s32 %p66, %r2117, %r2195; + setp.ge.s32 %p67, %r2116, %r2196; + setp.ge.s32 %p68, %r2116, %r2195; + setp.ge.s32 %p69, %r2117, %r2194; + setp.ge.s32 %p70, %r2117, %r2193; + setp.ge.s32 %p71, %r2116, %r2194; + setp.ge.s32 %p72, %r2116, %r2193; + setp.ge.s32 %p73, %r2117, %r2192; + setp.ge.s32 %p74, %r2117, %r2191; + setp.ge.s32 %p75, %r2116, %r2192; + setp.ge.s32 %p76, %r2116, %r2191; + setp.ge.s32 %p77, %r2117, %r2190; + setp.ge.s32 %p78, %r2117, %r2189; + setp.ge.s32 %p79, %r2116, %r2190; + setp.ge.s32 %p80, %r2116, %r2189; + setp.ge.s32 %p81, %r2117, %r2188; + setp.ge.s32 %p82, %r2117, %r2187; + setp.ge.s32 %p83, %r2116, %r2188; + setp.ge.s32 %p84, %r2116, %r2187; + .loc 1 381 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:381:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.pred %p85, %p34, %p53; + and.pred %p86, %p34, %p54; + and.pred %p87, %p33, %p55; + and.pred %p88, %p33, %p56; + and.pred %p89, %p34, %p57; + and.pred %p90, %p34, %p58; + and.pred %p91, %p33, %p59; + and.pred %p92, %p33, %p60; + and.pred %p93, %p34, %p61; + and.pred %p94, %p34, %p62; + and.pred %p95, %p33, %p63; + and.pred %p96, %p33, %p64; + and.pred %p97, %p34, %p65; + and.pred %p98, %p34, %p66; + and.pred %p99, %p33, %p67; + and.pred %p100, %p33, %p68; + and.pred %p101, %p34, %p69; + and.pred %p102, %p34, %p70; + and.pred %p103, %p33, %p71; + and.pred %p104, %p33, %p72; + and.pred %p105, %p34, %p73; + and.pred %p106, %p34, %p74; + and.pred %p107, %p33, %p75; + and.pred %p108, %p33, %p76; + and.pred %p109, %p34, %p77; + and.pred %p110, %p34, %p78; + and.pred %p111, %p33, %p79; + and.pred %p112, %p33, %p80; + and.pred %p113, %p34, %p81; + and.pred %p114, %p34, %p82; + and.pred %p115, %p33, %p83; + and.pred %p116, %p33, %p84; + .loc 1 384 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:384:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.gt.s32 %p117, %r2202, 2047; + setp.gt.s32 %p118, %r2201, 2047; + setp.gt.s32 %p119, %r2200, 2047; + setp.gt.s32 %p120, %r2199, 2047; + setp.gt.s32 %p121, %r2198, 2047; + setp.gt.s32 %p122, %r2197, 2047; + setp.gt.s32 %p123, %r2196, 2047; + setp.gt.s32 %p124, %r2195, 2047; + setp.gt.s32 %p125, %r2194, 2047; + setp.gt.s32 %p126, %r2193, 2047; + setp.gt.s32 %p127, %r2192, 2047; + setp.gt.s32 %p128, %r2191, 2047; + setp.gt.s32 %p129, %r2190, 2047; + setp.gt.s32 %p130, %r2189, 2047; + setp.gt.s32 %p131, %r2188, 2047; + setp.gt.s32 %p132, %r2187, 2047; + .loc 1 394 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:394:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + prmt.b32 %r2203, %r2201, %r2202, 0x5410U; + prmt.b32 %r2204, %r2199, %r2200, 0x5410U; + prmt.b32 %r2205, %r2197, %r2198, 0x5410U; + prmt.b32 %r2206, %r2195, %r2196, 0x5410U; + prmt.b32 %r2207, %r2193, %r2194, 0x5410U; + prmt.b32 %r2208, %r2191, %r2192, 0x5410U; + prmt.b32 %r2209, %r2189, %r2190, 0x5410U; + prmt.b32 %r2210, %r2187, %r2188, 0x5410U; + and.b32 %r2211, %r2210, 134154239; + and.b32 %r2212, %r2209, 134154239; + and.b32 %r2213, %r2208, 134154239; + and.b32 %r2214, %r2207, 134154239; + and.b32 %r2215, %r2206, 134154239; + and.b32 %r2216, %r2205, 134154239; + and.b32 %r2217, %r2204, 134154239; + and.b32 %r2218, %r2203, 134154239; + .loc 1 395 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:395:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mov.b32 {%rs1, %rs2}, %r2218; + cvt.u64.u16 %rd144, %rs2; + cvt.u64.u16 %rd145, %rs1; + mov.b32 {%rs3, %rs4}, %r2217; + cvt.u64.u16 %rd146, %rs4; + cvt.u64.u16 %rd147, %rs3; + mov.b32 {%rs5, %rs6}, %r2216; + cvt.u64.u16 %rd148, %rs6; + cvt.u64.u16 %rd149, %rs5; + mov.b32 {%rs7, %rs8}, %r2215; + cvt.u64.u16 %rd150, %rs8; + cvt.u64.u16 %rd151, %rs7; + mov.b32 {%rs9, %rs10}, %r2214; + cvt.u64.u16 %rd152, %rs10; + cvt.u64.u16 %rd153, %rs9; + mov.b32 {%rs11, %rs12}, %r2213; + cvt.u64.u16 %rd154, %rs12; + cvt.u64.u16 %rd155, %rs11; + mov.b32 {%rs13, %rs14}, %r2212; + cvt.u64.u16 %rd156, %rs14; + cvt.u64.u16 %rd157, %rs13; + mov.b32 {%rs15, %rs16}, %r2211; + cvt.u64.u16 %rd158, %rs16; + cvt.u64.u16 %rd159, %rs15; + setp.gt.s64 %p133, %rd7, %rd159; + setp.gt.s64 %p134, %rd7, %rd158; + setp.gt.s64 %p135, %rd7, %rd157; + setp.gt.s64 %p136, %rd7, %rd156; + setp.gt.s64 %p137, %rd7, %rd155; + setp.gt.s64 %p138, %rd7, %rd154; + setp.gt.s64 %p139, %rd7, %rd153; + setp.gt.s64 %p140, %rd7, %rd152; + setp.gt.s64 %p141, %rd7, %rd151; + setp.gt.s64 %p142, %rd7, %rd150; + setp.gt.s64 %p143, %rd7, %rd149; + setp.gt.s64 %p144, %rd7, %rd148; + setp.gt.s64 %p145, %rd7, %rd147; + setp.gt.s64 %p146, %rd7, %rd146; + setp.gt.s64 %p147, %rd7, %rd145; + setp.gt.s64 %p148, %rd7, %rd144; + .loc 1 396 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:396:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.pred %p149, %p117, %p148; + and.pred %p150, %p118, %p147; + and.pred %p151, %p119, %p146; + and.pred %p152, %p120, %p145; + and.pred %p153, %p121, %p144; + and.pred %p154, %p122, %p143; + and.pred %p155, %p123, %p142; + and.pred %p156, %p124, %p141; + and.pred %p157, %p125, %p140; + and.pred %p158, %p126, %p139; + and.pred %p159, %p127, %p138; + and.pred %p160, %p128, %p137; + and.pred %p161, %p129, %p136; + and.pred %p162, %p130, %p135; + and.pred %p163, %p131, %p134; + and.pred %p164, %p132, %p133; + .loc 1 397 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:397:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + sub.s32 %r2219, %r2191, %r83; + sub.s32 %r2220, %r2192, %r83; + sub.s32 %r2221, %r2191, %r81; + sub.s32 %r2222, %r2192, %r81; + sub.s32 %r2223, %r2193, %r83; + sub.s32 %r2224, %r2194, %r83; + sub.s32 %r2225, %r2193, %r81; + sub.s32 %r2226, %r2194, %r81; + sub.s32 %r2227, %r2187, %r83; + sub.s32 %r2228, %r2188, %r83; + sub.s32 %r2229, %r2187, %r81; + sub.s32 %r2230, %r2188, %r81; + sub.s32 %r2231, %r2189, %r83; + sub.s32 %r2232, %r2190, %r83; + sub.s32 %r2233, %r2189, %r81; + sub.s32 %r2234, %r2190, %r81; + sub.s32 %r2235, %r2195, %r81; + sub.s32 %r2236, %r2196, %r81; + sub.s32 %r2237, %r2195, %r83; + sub.s32 %r2238, %r2196, %r83; + sub.s32 %r2239, %r2197, %r81; + sub.s32 %r2240, %r2198, %r81; + sub.s32 %r2241, %r2197, %r83; + sub.s32 %r2242, %r2198, %r83; + sub.s32 %r2243, %r2199, %r81; + sub.s32 %r2244, %r2200, %r81; + sub.s32 %r2245, %r2199, %r83; + sub.s32 %r2246, %r2200, %r83; + sub.s32 %r2247, %r2201, %r81; + sub.s32 %r2248, %r2202, %r81; + sub.s32 %r2249, %r2201, %r83; + sub.s32 %r2250, %r2202, %r83; + .loc 1 404 39 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:404:39 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.b32 %r2251, %r2250, 2047; + and.b32 %r2252, %r2249, 2047; + and.b32 %r2253, %r2248, 2047; + and.b32 %r2254, %r2247, 2047; + and.b32 %r2255, %r2246, 2047; + and.b32 %r2256, %r2245, 2047; + and.b32 %r2257, %r2244, 2047; + and.b32 %r2258, %r2243, 2047; + and.b32 %r2259, %r2242, 2047; + and.b32 %r2260, %r2241, 2047; + and.b32 %r2261, %r2240, 2047; + and.b32 %r2262, %r2239, 2047; + and.b32 %r2263, %r2238, 2047; + and.b32 %r2264, %r2237, 2047; + and.b32 %r2265, %r2236, 2047; + and.b32 %r2266, %r2235, 2047; + and.b32 %r2267, %r2234, 2047; + and.b32 %r2268, %r2233, 2047; + and.b32 %r2269, %r2232, 2047; + and.b32 %r2270, %r2231, 2047; + and.b32 %r2271, %r2230, 2047; + and.b32 %r2272, %r2229, 2047; + and.b32 %r2273, %r2228, 2047; + and.b32 %r2274, %r2227, 2047; + and.b32 %r2275, %r2226, 2047; + and.b32 %r2276, %r2225, 2047; + and.b32 %r2277, %r2224, 2047; + and.b32 %r2278, %r2223, 2047; + and.b32 %r2279, %r2222, 2047; + and.b32 %r2280, %r2221, 2047; + and.b32 %r2281, %r2220, 2047; + and.b32 %r2282, %r2219, 2047; + .loc 1 405 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:405:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.eq.b32 %p165, %r2282, 0; + selp.b16 %rs17, 1, 0, %p165; + shl.b16 %rs18, %rs17, 2; + setp.eq.b32 %p166, %r2281, 0; + selp.b16 %rs19, -1, 0, %p166; + shl.b16 %rs20, %rs19, 3; + or.b16 %rs21, %rs20, %rs18; + setp.eq.b32 %p167, %r2280, 0; + selp.b16 %rs22, 1, 0, %p167; + setp.eq.b32 %p168, %r2279, 0; + selp.b16 %rs23, -1, 0, %p168; + shl.b16 %rs24, %rs23, 1; + or.b16 %rs25, %rs22, %rs24; + and.b16 %rs26, %rs25, 3; + or.b16 %rs27, %rs26, %rs21; + and.b16 %rs28, %rs27, 15; + shl.b16 %rs29, %rs28, 8; + setp.eq.b32 %p169, %r2278, 0; + selp.b16 %rs30, 1, 0, %p169; + shl.b16 %rs31, %rs30, 2; + setp.eq.b32 %p170, %r2277, 0; + selp.b16 %rs32, -1, 0, %p170; + shl.b16 %rs33, %rs32, 3; + or.b16 %rs34, %rs33, %rs31; + setp.eq.b32 %p171, %r2276, 0; + selp.b16 %rs35, 1, 0, %p171; + setp.eq.b32 %p172, %r2275, 0; + selp.b16 %rs36, -1, 0, %p172; + shl.b16 %rs37, %rs36, 1; + or.b16 %rs38, %rs35, %rs37; + and.b16 %rs39, %rs38, 3; + or.b16 %rs40, %rs39, %rs34; + shl.b16 %rs41, %rs40, 12; + or.b16 %rs42, %rs41, %rs29; + setp.eq.b32 %p173, %r2274, 0; + selp.b16 %rs43, 1, 0, %p173; + shl.b16 %rs44, %rs43, 2; + setp.eq.b32 %p174, %r2273, 0; + selp.b16 %rs45, -1, 0, %p174; + shl.b16 %rs46, %rs45, 3; + or.b16 %rs47, %rs46, %rs44; + setp.eq.b32 %p175, %r2272, 0; + selp.b16 %rs48, 1, 0, %p175; + setp.eq.b32 %p176, %r2271, 0; + selp.b16 %rs49, -1, 0, %p176; + shl.b16 %rs50, %rs49, 1; + or.b16 %rs51, %rs48, %rs50; + and.b16 %rs52, %rs51, 3; + or.b16 %rs53, %rs52, %rs47; + and.b16 %rs54, %rs53, 15; + setp.eq.b32 %p177, %r2270, 0; + selp.b16 %rs55, 1, 0, %p177; + shl.b16 %rs56, %rs55, 2; + setp.eq.b32 %p178, %r2269, 0; + selp.b16 %rs57, -1, 0, %p178; + shl.b16 %rs58, %rs57, 3; + or.b16 %rs59, %rs58, %rs56; + setp.eq.b32 %p179, %r2268, 0; + selp.b16 %rs60, 1, 0, %p179; + setp.eq.b32 %p180, %r2267, 0; + selp.b16 %rs61, -1, 0, %p180; + shl.b16 %rs62, %rs61, 1; + or.b16 %rs63, %rs60, %rs62; + and.b16 %rs64, %rs63, 3; + or.b16 %rs65, %rs64, %rs59; + shl.b16 %rs66, %rs65, 4; + or.b16 %rs67, %rs54, %rs66; + and.b16 %rs68, %rs67, 255; + or.b16 %rs69, %rs68, %rs42; + cvt.u32.u16 %r2283, %rs69; + setp.eq.b32 %p181, %r2266, 0; + setp.eq.b32 %p182, %r2265, 0; + setp.eq.b32 %p183, %r2264, 0; + setp.eq.b32 %p184, %r2263, 0; + setp.eq.b32 %p185, %r2262, 0; + setp.eq.b32 %p186, %r2261, 0; + setp.eq.b32 %p187, %r2260, 0; + setp.eq.b32 %p188, %r2259, 0; + setp.eq.b32 %p189, %r2258, 0; + setp.eq.b32 %p190, %r2257, 0; + setp.eq.b32 %p191, %r2256, 0; + setp.eq.b32 %p192, %r2255, 0; + setp.eq.b32 %p193, %r2254, 0; + setp.eq.b32 %p194, %r2253, 0; + setp.eq.b32 %p195, %r2252, 0; + setp.eq.b32 %p196, %r2251, 0; + .loc 1 406 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:406:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.pred %p197, %p196, %p149; + and.pred %p198, %p195, %p150; + and.pred %p199, %p194, %p149; + and.pred %p200, %p193, %p150; + and.pred %p201, %p192, %p151; + and.pred %p202, %p191, %p152; + and.pred %p203, %p190, %p151; + and.pred %p204, %p189, %p152; + and.pred %p205, %p188, %p153; + and.pred %p206, %p187, %p154; + and.pred %p207, %p186, %p153; + and.pred %p208, %p185, %p154; + and.pred %p209, %p184, %p155; + and.pred %p210, %p183, %p156; + and.pred %p211, %p182, %p155; + and.pred %p212, %p181, %p156; + shr.u32 %r2284, %r2283, 15; + and.b32 %r2285, %r2284, 1; + setp.ne.b32 %p213, %r2285, 0; + and.pred %p214, %p213, %p157; + shr.u32 %r2286, %r2283, 14; + and.b32 %r2287, %r2286, 1; + setp.ne.b32 %p215, %r2287, 0; + and.pred %p216, %p215, %p158; + shr.u32 %r2288, %r2283, 13; + and.b32 %r2289, %r2288, 1; + setp.ne.b32 %p217, %r2289, 0; + and.pred %p218, %p217, %p157; + shr.u32 %r2290, %r2283, 12; + and.b32 %r2291, %r2290, 1; + setp.ne.b32 %p219, %r2291, 0; + and.pred %p220, %p219, %p158; + shr.u32 %r2292, %r2283, 11; + and.b32 %r2293, %r2292, 1; + setp.ne.b32 %p221, %r2293, 0; + and.pred %p222, %p221, %p159; + shr.u32 %r2294, %r2283, 10; + and.b32 %r2295, %r2294, 1; + setp.ne.b32 %p223, %r2295, 0; + and.pred %p224, %p223, %p160; + shr.u32 %r2296, %r2283, 9; + and.b32 %r2297, %r2296, 1; + setp.ne.b32 %p225, %r2297, 0; + and.pred %p226, %p225, %p159; + shr.u32 %r2298, %r2283, 8; + and.b32 %r2299, %r2298, 1; + setp.ne.b32 %p227, %r2299, 0; + and.pred %p228, %p227, %p160; + shr.u32 %r2300, %r2283, 7; + and.b32 %r2301, %r2300, 1; + setp.ne.b32 %p229, %r2301, 0; + and.pred %p230, %p229, %p161; + shr.u32 %r2302, %r2283, 6; + and.b32 %r2303, %r2302, 1; + setp.ne.b32 %p231, %r2303, 0; + and.pred %p232, %p231, %p162; + shr.u32 %r2304, %r2283, 5; + and.b32 %r2305, %r2304, 1; + setp.ne.b32 %p233, %r2305, 0; + and.pred %p234, %p233, %p161; + shr.u32 %r2306, %r2283, 4; + and.b32 %r2307, %r2306, 1; + setp.ne.b32 %p235, %r2307, 0; + and.pred %p236, %p235, %p162; + shr.u32 %r2308, %r2283, 3; + and.b32 %r2309, %r2308, 1; + setp.ne.b32 %p237, %r2309, 0; + and.pred %p238, %p237, %p163; + shr.u32 %r2310, %r2283, 2; + and.b32 %r2311, %r2310, 1; + setp.ne.b32 %p239, %r2311, 0; + and.pred %p240, %p239, %p164; + shr.u32 %r2312, %r2283, 1; + and.b32 %r2313, %r2312, 1; + setp.ne.b32 %p241, %r2313, 0; + and.pred %p242, %p241, %p163; + and.pred %p243, %p175, %p164; + .loc 1 407 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:407:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + or.pred %p244, %p85, %p197; + or.pred %p246, %p86, %p198; + or.pred %p248, %p87, %p199; + or.pred %p250, %p88, %p200; + or.pred %p252, %p89, %p201; + or.pred %p254, %p90, %p202; + or.pred %p256, %p91, %p203; + or.pred %p258, %p92, %p204; + or.pred %p260, %p93, %p205; + or.pred %p262, %p94, %p206; + or.pred %p264, %p95, %p207; + or.pred %p266, %p96, %p208; + or.pred %p268, %p97, %p209; + or.pred %p270, %p98, %p210; + or.pred %p272, %p99, %p211; + or.pred %p274, %p100, %p212; + or.pred %p276, %p101, %p214; + or.pred %p278, %p102, %p216; + or.pred %p280, %p103, %p218; + or.pred %p282, %p104, %p220; + or.pred %p284, %p105, %p222; + or.pred %p286, %p106, %p224; + or.pred %p288, %p107, %p226; + or.pred %p290, %p108, %p228; + or.pred %p292, %p109, %p230; + or.pred %p294, %p110, %p232; + or.pred %p296, %p111, %p234; + or.pred %p298, %p112, %p236; + or.pred %p300, %p113, %p238; + or.pred %p302, %p114, %p240; + or.pred %p304, %p115, %p242; + or.pred %p306, %p116, %p243; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2314, %r2155, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2315, %r2314, 0fFF800000, %p244; + selp.f32 %r2316, %r2315, 0fFF800000, %p37; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2317, %r2156, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2318, %r2317, 0fFF800000, %p246; + selp.f32 %r2319, %r2318, 0fFF800000, %p38; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2320, %r2157, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2321, %r2320, 0fFF800000, %p248; + selp.f32 %r2322, %r2321, 0fFF800000, %p37; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2323, %r2158, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2324, %r2323, 0fFF800000, %p250; + selp.f32 %r2325, %r2324, 0fFF800000, %p38; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2326, %r2159, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2327, %r2326, 0fFF800000, %p252; + selp.f32 %r2328, %r2327, 0fFF800000, %p39; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2329, %r2160, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2330, %r2329, 0fFF800000, %p254; + selp.f32 %r2331, %r2330, 0fFF800000, %p40; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2332, %r2161, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2333, %r2332, 0fFF800000, %p256; + selp.f32 %r2334, %r2333, 0fFF800000, %p39; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2335, %r2162, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2336, %r2335, 0fFF800000, %p258; + selp.f32 %r2337, %r2336, 0fFF800000, %p40; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2338, %r2163, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2339, %r2338, 0fFF800000, %p260; + selp.f32 %r2340, %r2339, 0fFF800000, %p41; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2341, %r2164, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2342, %r2341, 0fFF800000, %p262; + selp.f32 %r2343, %r2342, 0fFF800000, %p42; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2344, %r2165, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2345, %r2344, 0fFF800000, %p264; + selp.f32 %r2346, %r2345, 0fFF800000, %p41; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2347, %r2166, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2348, %r2347, 0fFF800000, %p266; + selp.f32 %r2349, %r2348, 0fFF800000, %p42; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2350, %r2167, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2351, %r2350, 0fFF800000, %p268; + selp.f32 %r2352, %r2351, 0fFF800000, %p43; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2353, %r2168, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2354, %r2353, 0fFF800000, %p270; + selp.f32 %r2355, %r2354, 0fFF800000, %p44; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2356, %r2169, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2357, %r2356, 0fFF800000, %p272; + selp.f32 %r2358, %r2357, 0fFF800000, %p43; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2359, %r2170, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2360, %r2359, 0fFF800000, %p274; + selp.f32 %r2361, %r2360, 0fFF800000, %p44; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2362, %r2171, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2363, %r2362, 0fFF800000, %p276; + selp.f32 %r2364, %r2363, 0fFF800000, %p45; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2365, %r2172, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2366, %r2365, 0fFF800000, %p278; + selp.f32 %r2367, %r2366, 0fFF800000, %p46; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2368, %r2173, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2369, %r2368, 0fFF800000, %p280; + selp.f32 %r2370, %r2369, 0fFF800000, %p45; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2371, %r2174, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2372, %r2371, 0fFF800000, %p282; + selp.f32 %r2373, %r2372, 0fFF800000, %p46; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2374, %r2175, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2375, %r2374, 0fFF800000, %p284; + selp.f32 %r2376, %r2375, 0fFF800000, %p47; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2377, %r2176, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2378, %r2377, 0fFF800000, %p286; + selp.f32 %r2379, %r2378, 0fFF800000, %p48; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2380, %r2177, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2381, %r2380, 0fFF800000, %p288; + selp.f32 %r2382, %r2381, 0fFF800000, %p47; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2383, %r2178, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2384, %r2383, 0fFF800000, %p290; + selp.f32 %r2385, %r2384, 0fFF800000, %p48; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2386, %r2179, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2387, %r2386, 0fFF800000, %p292; + selp.f32 %r2388, %r2387, 0fFF800000, %p49; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2389, %r2180, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2390, %r2389, 0fFF800000, %p294; + selp.f32 %r2391, %r2390, 0fFF800000, %p50; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2392, %r2181, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2393, %r2392, 0fFF800000, %p296; + selp.f32 %r2394, %r2393, 0fFF800000, %p49; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2395, %r2182, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2396, %r2395, 0fFF800000, %p298; + selp.f32 %r2397, %r2396, 0fFF800000, %p50; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2398, %r2183, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2399, %r2398, 0fFF800000, %p300; + selp.f32 %r2400, %r2399, 0fFF800000, %p51; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2401, %r2184, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2402, %r2401, 0fFF800000, %p302; + selp.f32 %r2403, %r2402, 0fFF800000, %p52; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2404, %r2185, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2405, %r2404, 0fFF800000, %p304; + selp.f32 %r2406, %r2405, 0fFF800000, %p51; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r2407, %r2186, 0f3FB8AA3B; + .loc 1 414 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:414:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2408, %r2407, 0fFF800000, %p306; + selp.f32 %r2409, %r2408, 0fFF800000, %p52; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + max.f32 %r2410, %r2316, %r2319; + max.f32 %r2411, %r2322, %r2325; + max.f32 %r2412, %r2410, %r2328; + max.f32 %r2413, %r2412, %r2331; + max.f32 %r2414, %r2411, %r2334; + max.f32 %r2415, %r2414, %r2337; + max.f32 %r2416, %r2413, %r2340; + max.f32 %r2417, %r2416, %r2343; + max.f32 %r2418, %r2415, %r2346; + max.f32 %r2419, %r2418, %r2349; + max.f32 %r2420, %r2417, %r2352; + max.f32 %r2421, %r2420, %r2355; + max.f32 %r2422, %r2419, %r2358; + max.f32 %r2423, %r2422, %r2361; + max.f32 %r2424, %r2421, %r2364; + max.f32 %r2425, %r2424, %r2367; + max.f32 %r2426, %r2423, %r2370; + max.f32 %r2427, %r2426, %r2373; + max.f32 %r2428, %r2425, %r2376; + max.f32 %r2429, %r2428, %r2379; + max.f32 %r2430, %r2427, %r2382; + max.f32 %r2431, %r2430, %r2385; + max.f32 %r2432, %r2429, %r2388; + max.f32 %r2433, %r2432, %r2391; + max.f32 %r2434, %r2431, %r2394; + max.f32 %r2435, %r2434, %r2397; + max.f32 %r2436, %r2433, %r2400; + max.f32 %r2437, %r2436, %r2403; + max.f32 %r2438, %r2435, %r2406; + max.f32 %r2439, %r2438, %r2409; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2440, %r2437, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + max.f32 %r2441, %r2437, %r2440; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2442, %r2441, 1, 31, -1; + shfl.sync.bfly.b32 %r2443, %r2439, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + max.f32 %r2444, %r2439, %r2443; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2445, %r2444, 1, 31, -1; + cvt.u64.u32 %rd160, %r2442; + cvt.u64.u32 %rd161, %r2445; + shl.b64 %rd162, %rd161, 32; + or.b64 %rd163, %rd160, %rd162; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mov.b64 {%r2446, %r2447}, %rd163; + max.f32 %r2448, %r2441, %r2446; + max.f32 %r2449, %r2444, %r2447; + .loc 1 421 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:421:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mov.b64 {%r2450, %r2451}, %rd292; + max.f32 %r2452, %r2451, %r2449; + max.f32 %r2453, %r2450, %r2448; + mov.b64 %rd292, {%r2453, %r2452}; + .loc 1 423 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:423:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.eq.f32 %p308, %r2453, 0fFF800000; + setp.eq.f32 %p309, %r2452, 0fFF800000; + .loc 1 424 51 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:424:51 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + selp.f32 %r2454, 0f00000000, %r2453, %p308; + selp.f32 %r2455, 0f00000000, %r2452, %p309; + .loc 1 428 31 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:428:31 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + sub.f32 %r2456, %r2450, %r2454; + sub.f32 %r2457, %r2451, %r2455; + .loc 1 428 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:428:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + ex2.approx.ftz.f32 %r2458, %r2456; + ex2.approx.ftz.f32 %r2459, %r2457; + .loc 1 429 39 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:429:39 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + sub.f32 %r2460, %r2316, %r2454; + sub.f32 %r2461, %r2319, %r2454; + sub.f32 %r2462, %r2322, %r2455; + sub.f32 %r2463, %r2325, %r2455; + sub.f32 %r2464, %r2328, %r2454; + sub.f32 %r2465, %r2331, %r2454; + sub.f32 %r2466, %r2334, %r2455; + sub.f32 %r2467, %r2337, %r2455; + sub.f32 %r2468, %r2340, %r2454; + sub.f32 %r2469, %r2343, %r2454; + sub.f32 %r2470, %r2346, %r2455; + sub.f32 %r2471, %r2349, %r2455; + sub.f32 %r2472, %r2352, %r2454; + sub.f32 %r2473, %r2355, %r2454; + sub.f32 %r2474, %r2358, %r2455; + sub.f32 %r2475, %r2361, %r2455; + sub.f32 %r2476, %r2364, %r2454; + sub.f32 %r2477, %r2367, %r2454; + sub.f32 %r2478, %r2370, %r2455; + sub.f32 %r2479, %r2373, %r2455; + sub.f32 %r2480, %r2376, %r2454; + sub.f32 %r2481, %r2379, %r2454; + sub.f32 %r2482, %r2382, %r2455; + sub.f32 %r2483, %r2385, %r2455; + sub.f32 %r2484, %r2388, %r2454; + sub.f32 %r2485, %r2391, %r2454; + sub.f32 %r2486, %r2394, %r2455; + sub.f32 %r2487, %r2397, %r2455; + sub.f32 %r2488, %r2400, %r2454; + sub.f32 %r2489, %r2403, %r2454; + sub.f32 %r2490, %r2406, %r2455; + sub.f32 %r2491, %r2409, %r2455; + .loc 1 429 21 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:429:21 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + ex2.approx.ftz.f32 %r2492, %r2460; + ex2.approx.ftz.f32 %r2493, %r2461; + ex2.approx.ftz.f32 %r2494, %r2462; + ex2.approx.ftz.f32 %r2495, %r2463; + ex2.approx.ftz.f32 %r2496, %r2464; + ex2.approx.ftz.f32 %r2497, %r2465; + ex2.approx.ftz.f32 %r2498, %r2466; + ex2.approx.ftz.f32 %r2499, %r2467; + ex2.approx.ftz.f32 %r2500, %r2468; + ex2.approx.ftz.f32 %r2501, %r2469; + ex2.approx.ftz.f32 %r2502, %r2470; + ex2.approx.ftz.f32 %r2503, %r2471; + ex2.approx.ftz.f32 %r2504, %r2472; + ex2.approx.ftz.f32 %r2505, %r2473; + ex2.approx.ftz.f32 %r2506, %r2474; + ex2.approx.ftz.f32 %r2507, %r2475; + ex2.approx.ftz.f32 %r2508, %r2476; + ex2.approx.ftz.f32 %r2509, %r2477; + ex2.approx.ftz.f32 %r2510, %r2478; + ex2.approx.ftz.f32 %r2511, %r2479; + ex2.approx.ftz.f32 %r2512, %r2480; + ex2.approx.ftz.f32 %r2513, %r2481; + ex2.approx.ftz.f32 %r2514, %r2482; + ex2.approx.ftz.f32 %r2515, %r2483; + ex2.approx.ftz.f32 %r2516, %r2484; + ex2.approx.ftz.f32 %r2517, %r2485; + ex2.approx.ftz.f32 %r2518, %r2486; + ex2.approx.ftz.f32 %r2519, %r2487; + ex2.approx.ftz.f32 %r2520, %r2488; + ex2.approx.ftz.f32 %r2521, %r2489; + ex2.approx.ftz.f32 %r2522, %r2490; + ex2.approx.ftz.f32 %r2523, %r2491; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.f32 %r2524, %r2492, %r2493; + add.f32 %r2525, %r2494, %r2495; + add.f32 %r2526, %r2524, %r2496; + add.f32 %r2527, %r2526, %r2497; + add.f32 %r2528, %r2525, %r2498; + add.f32 %r2529, %r2528, %r2499; + add.f32 %r2530, %r2527, %r2500; + add.f32 %r2531, %r2530, %r2501; + add.f32 %r2532, %r2529, %r2502; + add.f32 %r2533, %r2532, %r2503; + add.f32 %r2534, %r2531, %r2504; + add.f32 %r2535, %r2534, %r2505; + add.f32 %r2536, %r2533, %r2506; + add.f32 %r2537, %r2536, %r2507; + add.f32 %r2538, %r2535, %r2508; + add.f32 %r2539, %r2538, %r2509; + add.f32 %r2540, %r2537, %r2510; + add.f32 %r2541, %r2540, %r2511; + add.f32 %r2542, %r2539, %r2512; + add.f32 %r2543, %r2542, %r2513; + add.f32 %r2544, %r2541, %r2514; + add.f32 %r2545, %r2544, %r2515; + add.f32 %r2546, %r2543, %r2516; + add.f32 %r2547, %r2546, %r2517; + add.f32 %r2548, %r2545, %r2518; + add.f32 %r2549, %r2548, %r2519; + add.f32 %r2550, %r2547, %r2520; + add.f32 %r2551, %r2550, %r2521; + add.f32 %r2552, %r2549, %r2522; + add.f32 %r2553, %r2552, %r2523; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2554, %r2551, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.f32 %r2555, %r2551, %r2554; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2556, %r2555, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.f32 %r2557, %r2555, %r2556; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2558, %r2553, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.f32 %r2559, %r2553, %r2558; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shfl.sync.bfly.b32 %r2560, %r2559, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.f32 %r2561, %r2559, %r2560; + .loc 1 434 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:434:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + fma.rn.f32 %r5060, %r5060, %r2458, %r2557; + fma.rn.f32 %r5061, %r5061, %r2459, %r2561; + .loc 1 436 16 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:436:16 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.f32 %r1702, %r1702, %r2458; + mul.f32 %r1703, %r1703, %r2458; + mul.f32 %r1704, %r1704, %r2459; + mul.f32 %r1705, %r1705, %r2459; + mul.f32 %r1706, %r1706, %r2458; + mul.f32 %r1707, %r1707, %r2458; + mul.f32 %r1708, %r1708, %r2459; + mul.f32 %r1709, %r1709, %r2459; + mul.f32 %r1710, %r1710, %r2458; + mul.f32 %r1711, %r1711, %r2458; + mul.f32 %r1712, %r1712, %r2459; + mul.f32 %r1713, %r1713, %r2459; + mul.f32 %r1714, %r1714, %r2458; + mul.f32 %r1715, %r1715, %r2458; + mul.f32 %r1716, %r1716, %r2459; + mul.f32 %r1717, %r1717, %r2459; + mul.f32 %r1718, %r1718, %r2458; + mul.f32 %r1719, %r1719, %r2458; + mul.f32 %r1720, %r1720, %r2459; + mul.f32 %r1721, %r1721, %r2459; + mul.f32 %r1722, %r1722, %r2458; + mul.f32 %r1723, %r1723, %r2458; + mul.f32 %r1724, %r1724, %r2459; + mul.f32 %r1725, %r1725, %r2459; + mul.f32 %r1726, %r1726, %r2458; + mul.f32 %r1727, %r1727, %r2458; + mul.f32 %r1728, %r1728, %r2459; + mul.f32 %r1729, %r1729, %r2459; + mul.f32 %r1730, %r1730, %r2458; + mul.f32 %r1731, %r1731, %r2458; + mul.f32 %r1732, %r1732, %r2459; + mul.f32 %r1733, %r1733, %r2459; + mul.f32 %r1734, %r1734, %r2458; + mul.f32 %r1735, %r1735, %r2458; + mul.f32 %r1736, %r1736, %r2459; + mul.f32 %r1737, %r1737, %r2459; + mul.f32 %r1738, %r1738, %r2458; + mul.f32 %r1739, %r1739, %r2458; + mul.f32 %r1740, %r1740, %r2459; + mul.f32 %r1741, %r1741, %r2459; + mul.f32 %r1742, %r1742, %r2458; + mul.f32 %r1743, %r1743, %r2458; + mul.f32 %r1744, %r1744, %r2459; + mul.f32 %r1745, %r1745, %r2459; + mul.f32 %r1746, %r1746, %r2458; + mul.f32 %r1747, %r1747, %r2458; + mul.f32 %r1748, %r1748, %r2459; + mul.f32 %r1749, %r1749, %r2459; + mul.f32 %r1750, %r1750, %r2458; + mul.f32 %r1751, %r1751, %r2458; + mul.f32 %r1752, %r1752, %r2459; + mul.f32 %r1753, %r1753, %r2459; + mul.f32 %r1754, %r1754, %r2458; + mul.f32 %r1755, %r1755, %r2458; + mul.f32 %r1756, %r1756, %r2459; + mul.f32 %r1757, %r1757, %r2459; + mul.f32 %r1758, %r1758, %r2458; + mul.f32 %r1759, %r1759, %r2458; + mul.f32 %r1760, %r1760, %r2459; + mul.f32 %r1761, %r1761, %r2459; + mul.f32 %r1762, %r1762, %r2458; + mul.f32 %r1763, %r1763, %r2458; + mul.f32 %r1764, %r1764, %r2459; + mul.f32 %r1765, %r1765, %r2459; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2562, %r836, 49152; + add.s32 %r2563, %r2562, %r2119; + .loc 1 440 22 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:440:22 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + cvt.rn.bf16x2.f32 %r1698, %r2493, %r2492; + cvt.rn.bf16x2.f32 %r1699, %r2495, %r2494; + cvt.rn.bf16x2.f32 %r1700, %r2497, %r2496; + cvt.rn.bf16x2.f32 %r1701, %r2499, %r2498; + cvt.rn.bf16x2.f32 %r1830, %r2501, %r2500; + cvt.rn.bf16x2.f32 %r1831, %r2503, %r2502; + cvt.rn.bf16x2.f32 %r1832, %r2505, %r2504; + cvt.rn.bf16x2.f32 %r1833, %r2507, %r2506; + cvt.rn.bf16x2.f32 %r1962, %r2509, %r2508; + cvt.rn.bf16x2.f32 %r1963, %r2511, %r2510; + cvt.rn.bf16x2.f32 %r1964, %r2513, %r2512; + cvt.rn.bf16x2.f32 %r1965, %r2515, %r2514; + cvt.rn.bf16x2.f32 %r2094, %r2517, %r2516; + cvt.rn.bf16x2.f32 %r2095, %r2519, %r2518; + cvt.rn.bf16x2.f32 %r2096, %r2521, %r2520; + cvt.rn.bf16x2.f32 %r2097, %r2523, %r2522; + .loc 1 440 44 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:440:44 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + wgmma.fence.sync.aligned; + bfe.u32 %r2564, %r2563, 4, 14; + cvt.u64.u32 %rd164, %r2564; + or.b64 %rd110, %rd164, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1698,%r1699,%r1700,%r1701}, %rd110, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2565, %r2563, 2048; + bfe.u32 %r2566, %r2565, 4, 14; + cvt.u64.u32 %rd165, %r2566; + or.b64 %rd111, %rd165, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1830,%r1831,%r1832,%r1833}, %rd111, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2567, %r2563, 4096; + bfe.u32 %r2568, %r2567, 4, 14; + cvt.u64.u32 %rd166, %r2568; + or.b64 %rd112, %rd166, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1962,%r1963,%r1964,%r1965}, %rd112, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2569, %r2563, 6144; + bfe.u32 %r2570, %r2569, 4, 14; + cvt.u64.u32 %rd167, %r2570; + or.b64 %rd113, %rd167, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r2094,%r2095,%r2096,%r2097}, %rd113, %p20, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:548:26 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r4827, %r4756, %r4827; + add.s32 %r4828, %r4756, %r4828; + add.s32 %r4829, %r4756, %r4829; + add.s32 %r4830, %r4756, %r4830; + add.s32 %r4831, %r4756, %r4831; + add.s32 %r4832, %r4756, %r4832; + add.s32 %r4833, %r4756, %r4833; + add.s32 %r4834, %r4756, %r4834; + add.s32 %r4835, %r4756, %r4835; + add.s32 %r4836, %r4756, %r4836; + add.s32 %r4837, %r4756, %r4837; + add.s32 %r4838, %r4756, %r4838; + add.s32 %r4839, %r4756, %r4839; + add.s32 %r4840, %r4756, %r4840; + add.s32 %r4841, %r4756, %r4841; + add.s32 %r4842, %r4756, %r4842; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r306, %r4826, 1; + .loc 1 247 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:247:33 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shr.u32 %r2571, %r306, 1; + .loc 1 248 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:248:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mad.wide.u32 %rd115, %r2571, 4, %rd47; + .loc 1 248 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:248:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + // begin inline asm + mov.u64 %rd114, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd114, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2098, 0x0; + @%p31 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2098 }, [ %rd115 + 0 ], %rd114; + // end inline asm + .loc 1 249 109 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:109 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2572, %r2571, 1; + .loc 1 249 113 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:113 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p310, %r2572, %r778; + .loc 1 249 55 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:55 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd118, %rd115, 4; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.pred %p32, %p31, %p310; + .loc 1 249 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + // begin inline asm + mov.u64 %rd117, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd117, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2099, 0x0; + @%p32 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2099 }, [ %rd118 + 0 ], %rd117; + // end inline asm + .loc 1 250 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:250:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + and.b32 %r2573, %r4826, 1; + .loc 1 251 34 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:34 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + sub.s32 %r2574, %r2099, %r2098; + .loc 1 251 48 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:48 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r2575, %r2574, 7; + .loc 1 251 63 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:63 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2576, %r2575, -64; + .loc 1 252 29 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:29 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + xor.b32 %r2577, %r2573, 1; + .loc 1 252 61 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:61 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r2578, %r2573, 6; + .loc 1 252 42 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:42 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mad.lo.s32 %r4756, %r2576, %r2577, %r2578; + .loc 1 549 21 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:549:21 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r4759, %r4756, %r4759; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2579, %r4758, 1; + setp.gt.s32 %p311, %r2579, 2; + selp.b32 %r4758, 0, %r2579, %p311; + .loc 1 342 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:342:32 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2580, %r4759, %r31; + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2581, %r2580, %r8; + add.s32 %r2582, %r2580, %r9; + add.s32 %r2583, %r2580, %r10; + add.s32 %r2584, %r2580, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r2585, %r2581, 7; + shl.b32 %r2586, %r2582, 7; + shl.b32 %r2587, %r2583, 7; + shl.b32 %r2588, %r2584, 7; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + mul.wide.s32 %rd168, %r2585, 2; + add.s64 %rd120, %rd10, %rd168; + mul.wide.s32 %rd169, %r2586, 2; + add.s64 %rd121, %rd10, %rd169; + mul.wide.s32 %rd170, %r2587, 2; + add.s64 %rd122, %rd10, %rd170; + mul.wide.s32 %rd171, %r2588, 2; + add.s64 %rd123, %rd10, %rd171; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.lt.s32 %p312, %r2581, %r712; + setp.lt.s32 %p313, %r2582, %r712; + setp.lt.s32 %p314, %r2583, %r712; + setp.lt.s32 %p315, %r2584, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + shl.b32 %r2589, %r4758, 14; + add.s32 %r2590, %r836, %r2589; + bar.sync 0; + add.s32 %r2100, %r2590, %r84; + selp.b32 %r2591, 16, 0, %p312; + selp.b32 %r2109, %r2591, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2100 + 0 ], [ %rd120 + 0 ], 0x10, %r2109; + // end inline asm + add.s32 %r2102, %r2100, 2048; + selp.b32 %r2592, 16, 0, %p313; + selp.b32 %r2111, %r2592, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2102 + 0 ], [ %rd121 + 0 ], 0x10, %r2111; + // end inline asm + add.s32 %r2104, %r2100, 4096; + selp.b32 %r2593, 16, 0, %p314; + selp.b32 %r2113, %r2593, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2104 + 0 ], [ %rd122 + 0 ], 0x10, %r2113; + // end inline asm + add.s32 %r2106, %r2100, 6144; + selp.b32 %r2594, 16, 0, %p315; + selp.b32 %r2115, %r2594, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2106 + 0 ], [ %rd123 + 0 ], 0x10, %r2115; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s64 %rd124, %rd11, %rd168; + add.s64 %rd125, %rd11, %rd169; + add.s64 %rd126, %rd11, %rd170; + add.s64 %rd127, %rd11, %rd171; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + add.s32 %r2595, %r2562, %r2589; + add.s32 %r2108, %r2595, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r2108 + 0 ], [ %rd124 + 0 ], 0x10, %r2109; + // end inline asm + add.s32 %r2110, %r2108, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r2110 + 0 ], [ %rd125 + 0 ], 0x10, %r2111; + // end inline asm + add.s32 %r2112, %r2108, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r2112 + 0 ], [ %rd126 + 0 ], 0x10, %r2113; + // end inline asm + add.s32 %r2114, %r2108, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r2114 + 0 ], [ %rd127 + 0 ], 0x10, %r2115; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + setp.ne.b32 %p316, %r119, %r306; + mov.b32 %r4826, %r306; + @%p316 bra $L__BB0_2; +$L__BB0_3: // %._crit_edge + .loc 1 0 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0:40 + ld.param.b64 %rd38, [triton_tem_fused_0_param_10]; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:172:41 ] + // begin inline asm + // wait for regs: %r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp8: + .loc 1 181 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:181:35 + shl.b64 %rd190, %rd4, 2; + add.s64 %rd172, %rd37, %rd190; + .loc 1 182 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:182:27 + // begin inline asm + mov.u32 %r2724, 0x0; + ld.global.b32 { %r2724 }, [ %rd172 + 0 ]; + // end inline asm + .loc 1 182 41 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:182:41 + shl.b32 %r376, %r2724, 7; + .loc 1 183 51 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:183:51 + shl.b64 %rd191, %rd6, 2; + add.s64 %rd173, %rd36, %rd191; + .loc 1 183 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:183:32 + // begin inline asm + mov.u32 %r2725, 0x0; + ld.global.b32 { %r2725 }, [ %rd173 + 0 ]; + // end inline asm + .loc 1 184 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:184:49 + shl.b32 %r2758, %r2725, 1; + .loc 1 184 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:184:69 + min.s32 %r378, %r2758, %r33; +$L__tmp9: + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p317, %r2758, 1; + setp.gt.s32 %p318, %r2758, 0; + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + or.b32 %r2759, %r376, %r8; + or.b32 %r2760, %r376, %r9; + or.b32 %r2761, %r376, %r10; + or.b32 %r2762, %r376, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r2763, %r2759, 7; + shl.b32 %r2764, %r2760, 7; + shl.b32 %r2765, %r2761, 7; + shl.b32 %r2766, %r2762, 7; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.wide.s32 %rd192, %r2763, 2; + add.s64 %rd193, %rd1, %rd192; + mul.wide.s32 %rd194, %r2764, 2; + add.s64 %rd195, %rd1, %rd194; + mul.wide.s32 %rd196, %r2765, 2; + add.s64 %rd197, %rd1, %rd196; + mul.wide.s32 %rd198, %r2766, 2; + add.s64 %rd199, %rd1, %rd198; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b64 %rd200, %rd3, 1; + add.s64 %rd174, %rd193, %rd200; + add.s64 %rd175, %rd195, %rd200; + add.s64 %rd176, %rd197, %rd200; + add.s64 %rd177, %rd199, %rd200; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p319, %r2759, %r712; + setp.lt.s32 %p320, %r2760, %r712; + setp.lt.s32 %p321, %r2761, %r712; + setp.lt.s32 %p322, %r2762, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.b32 %r2767, 16, 0, %p319; + selp.b32 %r2735, %r2767, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2726 + 0 ], [ %rd174 + 0 ], 0x10, %r2735; + // end inline asm + selp.b32 %r2768, 16, 0, %p320; + selp.b32 %r2737, %r2768, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2728 + 0 ], [ %rd175 + 0 ], 0x10, %r2737; + // end inline asm + selp.b32 %r2769, 16, 0, %p321; + selp.b32 %r2739, %r2769, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2730 + 0 ], [ %rd176 + 0 ], 0x10, %r2739; + // end inline asm + selp.b32 %r2770, 16, 0, %p322; + selp.b32 %r2741, %r2770, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2732 + 0 ], [ %rd177 + 0 ], 0x10, %r2741; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd201, %rd2, %rd192; + add.s64 %rd202, %rd2, %rd194; + add.s64 %rd203, %rd2, %rd196; + add.s64 %rd204, %rd2, %rd198; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd178, %rd201, %rd200; + add.s64 %rd179, %rd202, %rd200; + add.s64 %rd180, %rd203, %rd200; + add.s64 %rd181, %rd204, %rd200; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r787 + 0 ], [ %rd178 + 0 ], 0x10, %r2735; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r789 + 0 ], [ %rd179 + 0 ], 0x10, %r2737; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r791 + 0 ], [ %rd180 + 0 ], 0x10, %r2739; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r793 + 0 ], [ %rd181 + 0 ], 0x10, %r2741; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.gt.s32 %p323, %r378, 1; + .loc 1 342 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:342:32 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + or.b32 %r2771, %r376, 64; + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + or.b32 %r2772, %r2771, %r8; + or.b32 %r2773, %r2771, %r9; + or.b32 %r2774, %r2771, %r10; + or.b32 %r2775, %r2771, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r2776, %r2772, 7; + shl.b32 %r2777, %r2773, 7; + shl.b32 %r2778, %r2774, 7; + shl.b32 %r2779, %r2775, 7; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.wide.s32 %rd205, %r2776, 2; + add.s64 %rd206, %rd1, %rd205; + mul.wide.s32 %rd207, %r2777, 2; + add.s64 %rd208, %rd1, %rd207; + mul.wide.s32 %rd209, %r2778, 2; + add.s64 %rd210, %rd1, %rd209; + mul.wide.s32 %rd211, %r2779, 2; + add.s64 %rd212, %rd1, %rd211; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd182, %rd206, %rd200; + add.s64 %rd183, %rd208, %rd200; + add.s64 %rd184, %rd210, %rd200; + add.s64 %rd185, %rd212, %rd200; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p324, %r2772, %r712; + setp.lt.s32 %p325, %r2773, %r712; + setp.lt.s32 %p326, %r2774, %r712; + setp.lt.s32 %p327, %r2775, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + bar.sync 0; + selp.b32 %r2780, 16, 0, %p324; + selp.b32 %r2751, %r2780, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r795 + 0 ], [ %rd182 + 0 ], 0x10, %r2751; + // end inline asm + selp.b32 %r2781, 16, 0, %p325; + selp.b32 %r2753, %r2781, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r797 + 0 ], [ %rd183 + 0 ], 0x10, %r2753; + // end inline asm + selp.b32 %r2782, 16, 0, %p326; + selp.b32 %r2755, %r2782, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r799 + 0 ], [ %rd184 + 0 ], 0x10, %r2755; + // end inline asm + selp.b32 %r2783, 16, 0, %p327; + selp.b32 %r2757, %r2783, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r801 + 0 ], [ %rd185 + 0 ], 0x10, %r2757; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:20 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd213, %rd2, %rd205; + add.s64 %rd214, %rd2, %rd207; + add.s64 %rd215, %rd2, %rd209; + add.s64 %rd216, %rd2, %rd211; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd186, %rd213, %rd200; + add.s64 %rd187, %rd214, %rd200; + add.s64 %rd188, %rd215, %rd200; + add.s64 %rd189, %rd216, %rd200; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r803 + 0 ], [ %rd186 + 0 ], 0x10, %r2751; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r805 + 0 ], [ %rd187 + 0 ], 0x10, %r2753; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r807 + 0 ], [ %rd188 + 0 ], 0x10, %r2755; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r809 + 0 ], [ %rd189 + 0 ], 0x10, %r2757; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:351:19 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + @%p317 bra $L__BB0_6; +$L__tmp10: +// %bb.4: // %.lr.ph460 + .loc 1 186 28 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:186:28 + or.b32 %r4995, %r376, %r36; + or.b32 %r4994, %r376, %r37; + or.b32 %r4992, %r376, %r38; + or.b32 %r4988, %r376, %r40; + or.b32 %r4980, %r376, %r44; + or.b32 %r4993, %r376, %r39; + or.b32 %r4989, %r376, %r41; + or.b32 %r4981, %r376, %r45; + or.b32 %r4990, %r376, %r42; + or.b32 %r4982, %r376, %r46; + or.b32 %r4991, %r376, %r43; + or.b32 %r4983, %r376, %r47; + or.b32 %r4984, %r376, %r48; + or.b32 %r4985, %r376, %r49; + or.b32 %r4986, %r376, %r50; + or.b32 %r4987, %r376, %r51; + add.s32 %r459, %r378, -2; + add.s32 %r460, %r378, -1; +$L__tmp11: + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + max.s32 %r461, %r378, 1; + mov.b32 %r3403, 0; + mov.b32 %r4911, 1; + mov.b32 %r4910, -1; + mov.b32 %r4909, 64; + mov.b32 %r4912, %r4909; + mov.b32 %r4977, %r3403; +$L__BB0_5: // %__nv_exp2f.exit + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p341, %r4977, %r459; + setp.lt.s32 %p339, %r4977, %r460; + add.s32 %r4018, %r4910, 1; + setp.gt.s32 %p342, %r4018, 2; + selp.b32 %r4910, 0, %r4018, %p342; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4019, %r4910, 14; + add.s32 %r3303, %r836, %r4019; + .loc 1 351 19 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:351:19 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.idx.b32 %r4021, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4022, %r4021, 11; + and.b32 %r4023, %r4022, 8192; + add.s32 %r3300, %r836, 98304; + add.s32 %r4024, %r4023, %r3300; + bfe.u32 %r4025, %r4024, 4, 14; + cvt.u64.u32 %rd251, %r4025; + or.b64 %rd217, %rd251, 4611686293372403712; + bfe.u32 %r4026, %r3303, 4, 14; + cvt.u64.u32 %rd252, %r4026; + or.b64 %rd218, %rd252, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd217, %rd218, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4027, %r4024, 32; + bfe.u32 %r4028, %r4027, 4, 14; + cvt.u64.u32 %rd253, %r4028; + or.b64 %rd219, %rd253, 4611686293372403712; + add.s32 %r4029, %r3303, 32; + bfe.u32 %r4030, %r4029, 4, 14; + cvt.u64.u32 %rd254, %r4030; + or.b64 %rd220, %rd254, 4611686293338849280; + mov.pred %p328, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd219, %rd220, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4031, %r4024, 64; + bfe.u32 %r4032, %r4031, 4, 14; + cvt.u64.u32 %rd255, %r4032; + or.b64 %rd221, %rd255, 4611686293372403712; + add.s32 %r4033, %r3303, 64; + bfe.u32 %r4034, %r4033, 4, 14; + cvt.u64.u32 %rd256, %r4034; + or.b64 %rd222, %rd256, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd221, %rd222, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4035, %r4024, 96; + bfe.u32 %r4036, %r4035, 4, 14; + cvt.u64.u32 %rd257, %r4036; + or.b64 %rd223, %rd257, 4611686293372403712; + add.s32 %r4037, %r3303, 96; + bfe.u32 %r4038, %r4037, 4, 14; + cvt.u64.u32 %rd258, %r4038; + or.b64 %rd224, %rd258, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd223, %rd224, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4039, %r4024, 16384; + bfe.u32 %r4040, %r4039, 4, 14; + cvt.u64.u32 %rd259, %r4040; + or.b64 %rd225, %rd259, 4611686293372403712; + add.s32 %r4041, %r3303, 8192; + bfe.u32 %r4042, %r4041, 4, 14; + cvt.u64.u32 %rd260, %r4042; + or.b64 %rd226, %rd260, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd225, %rd226, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4043, %r4024, 16416; + bfe.u32 %r4044, %r4043, 4, 14; + cvt.u64.u32 %rd261, %r4044; + or.b64 %rd227, %rd261, 4611686293372403712; + add.s32 %r4045, %r3303, 8224; + bfe.u32 %r4046, %r4045, 4, 14; + cvt.u64.u32 %rd262, %r4046; + or.b64 %rd228, %rd262, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd227, %rd228, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4047, %r4024, 16448; + bfe.u32 %r4048, %r4047, 4, 14; + cvt.u64.u32 %rd263, %r4048; + or.b64 %rd229, %rd263, 4611686293372403712; + add.s32 %r4049, %r3303, 8256; + bfe.u32 %r4050, %r4049, 4, 14; + cvt.u64.u32 %rd264, %r4050; + or.b64 %rd230, %rd264, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd229, %rd230, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4051, %r4024, 16480; + bfe.u32 %r4052, %r4051, 4, 14; + cvt.u64.u32 %rd265, %r4052; + or.b64 %rd231, %rd265, 4611686293372403712; + add.s32 %r4053, %r3303, 8288; + bfe.u32 %r4054, %r4053, 4, 14; + cvt.u64.u32 %rd266, %r4054; + or.b64 %rd232, %rd266, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd231, %rd232, %p328, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3305, %r3403; + mov.b32 %r3301, %r3403; + mov.b32 %r3302, %r3403; + mov.b32 %r3304, %r3403; + // begin inline asm + // wait for regs: %r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:353:14 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4055, %r2884, 0f3DB504F3; + mul.f32 %r4056, %r2885, 0f3DB504F3; + mul.f32 %r4057, %r2886, 0f3DB504F3; + mul.f32 %r4058, %r2887, 0f3DB504F3; + mul.f32 %r4059, %r2888, 0f3DB504F3; + mul.f32 %r4060, %r2889, 0f3DB504F3; + mul.f32 %r4061, %r2890, 0f3DB504F3; + mul.f32 %r4062, %r2891, 0f3DB504F3; + mul.f32 %r4063, %r2892, 0f3DB504F3; + mul.f32 %r4064, %r2893, 0f3DB504F3; + mul.f32 %r4065, %r2894, 0f3DB504F3; + mul.f32 %r4066, %r2895, 0f3DB504F3; + mul.f32 %r4067, %r2896, 0f3DB504F3; + mul.f32 %r4068, %r2897, 0f3DB504F3; + mul.f32 %r4069, %r2898, 0f3DB504F3; + mul.f32 %r4070, %r2899, 0f3DB504F3; + mul.f32 %r4071, %r2900, 0f3DB504F3; + mul.f32 %r4072, %r2901, 0f3DB504F3; + mul.f32 %r4073, %r2902, 0f3DB504F3; + mul.f32 %r4074, %r2903, 0f3DB504F3; + mul.f32 %r4075, %r2904, 0f3DB504F3; + mul.f32 %r4076, %r2905, 0f3DB504F3; + mul.f32 %r4077, %r2906, 0f3DB504F3; + mul.f32 %r4078, %r2907, 0f3DB504F3; + mul.f32 %r4079, %r2908, 0f3DB504F3; + mul.f32 %r4080, %r2909, 0f3DB504F3; + mul.f32 %r4081, %r2910, 0f3DB504F3; + mul.f32 %r4082, %r2911, 0f3DB504F3; + mul.f32 %r4083, %r2912, 0f3DB504F3; + mul.f32 %r4084, %r2913, 0f3DB504F3; + mul.f32 %r4085, %r2914, 0f3DB504F3; + mul.f32 %r4086, %r2915, 0f3DB504F3; + .loc 1 367 44 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:44 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p343, %r4995, %r712; + setp.lt.s32 %p344, %r4994, %r712; + setp.lt.s32 %p345, %r4993, %r712; + setp.lt.s32 %p346, %r4992, %r712; + setp.lt.s32 %p347, %r4991, %r712; + setp.lt.s32 %p348, %r4990, %r712; + setp.lt.s32 %p349, %r4989, %r712; + setp.lt.s32 %p350, %r4988, %r712; + setp.lt.s32 %p351, %r4987, %r712; + setp.lt.s32 %p352, %r4986, %r712; + setp.lt.s32 %p353, %r4985, %r712; + setp.lt.s32 %p354, %r4984, %r712; + setp.lt.s32 %p355, %r4983, %r712; + setp.lt.s32 %p356, %r4982, %r712; + setp.lt.s32 %p357, %r4981, %r712; + setp.lt.s32 %p358, %r4980, %r712; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4087, %r4055, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4088, %r4087, 0fFF800000, %p343; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4089, %r4056, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4090, %r4089, 0fFF800000, %p344; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4091, %r4057, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4092, %r4091, 0fFF800000, %p343; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4093, %r4058, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4094, %r4093, 0fFF800000, %p344; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4095, %r4059, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4096, %r4095, 0fFF800000, %p345; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4097, %r4060, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4098, %r4097, 0fFF800000, %p346; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4099, %r4061, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4100, %r4099, 0fFF800000, %p345; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4101, %r4062, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4102, %r4101, 0fFF800000, %p346; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4103, %r4063, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4104, %r4103, 0fFF800000, %p347; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4105, %r4064, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4106, %r4105, 0fFF800000, %p348; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4107, %r4065, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4108, %r4107, 0fFF800000, %p347; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4109, %r4066, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4110, %r4109, 0fFF800000, %p348; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4111, %r4067, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4112, %r4111, 0fFF800000, %p349; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4113, %r4068, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4114, %r4113, 0fFF800000, %p350; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4115, %r4069, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4116, %r4115, 0fFF800000, %p349; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4117, %r4070, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4118, %r4117, 0fFF800000, %p350; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4119, %r4071, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4120, %r4119, 0fFF800000, %p351; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4121, %r4072, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4122, %r4121, 0fFF800000, %p352; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4123, %r4073, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4124, %r4123, 0fFF800000, %p351; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4125, %r4074, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4126, %r4125, 0fFF800000, %p352; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4127, %r4075, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4128, %r4127, 0fFF800000, %p353; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4129, %r4076, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4130, %r4129, 0fFF800000, %p354; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4131, %r4077, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4132, %r4131, 0fFF800000, %p353; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4133, %r4078, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4134, %r4133, 0fFF800000, %p354; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4135, %r4079, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4136, %r4135, 0fFF800000, %p355; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4137, %r4080, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4138, %r4137, 0fFF800000, %p356; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4139, %r4081, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4140, %r4139, 0fFF800000, %p355; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4141, %r4082, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4142, %r4141, 0fFF800000, %p356; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4143, %r4083, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4144, %r4143, 0fFF800000, %p357; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4145, %r4084, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4146, %r4145, 0fFF800000, %p358; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4147, %r4085, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4148, %r4147, 0fFF800000, %p357; + .loc 1 417 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:417:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r4149, %r4086, 0f3FB8AA3B; + .loc 1 367 69 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:367:69 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4150, %r4149, 0fFF800000, %p358; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + max.f32 %r4151, %r4088, %r4090; + max.f32 %r4152, %r4092, %r4094; + max.f32 %r4153, %r4151, %r4096; + max.f32 %r4154, %r4153, %r4098; + max.f32 %r4155, %r4152, %r4100; + max.f32 %r4156, %r4155, %r4102; + max.f32 %r4157, %r4154, %r4104; + max.f32 %r4158, %r4157, %r4106; + max.f32 %r4159, %r4156, %r4108; + max.f32 %r4160, %r4159, %r4110; + max.f32 %r4161, %r4158, %r4112; + max.f32 %r4162, %r4161, %r4114; + max.f32 %r4163, %r4160, %r4116; + max.f32 %r4164, %r4163, %r4118; + max.f32 %r4165, %r4162, %r4120; + max.f32 %r4166, %r4165, %r4122; + max.f32 %r4167, %r4164, %r4124; + max.f32 %r4168, %r4167, %r4126; + max.f32 %r4169, %r4166, %r4128; + max.f32 %r4170, %r4169, %r4130; + max.f32 %r4171, %r4168, %r4132; + max.f32 %r4172, %r4171, %r4134; + max.f32 %r4173, %r4170, %r4136; + max.f32 %r4174, %r4173, %r4138; + max.f32 %r4175, %r4172, %r4140; + max.f32 %r4176, %r4175, %r4142; + max.f32 %r4177, %r4174, %r4144; + max.f32 %r4178, %r4177, %r4146; + max.f32 %r4179, %r4176, %r4148; + max.f32 %r4180, %r4179, %r4150; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4181, %r4178, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + max.f32 %r4182, %r4178, %r4181; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4183, %r4182, 1, 31, -1; + shfl.sync.bfly.b32 %r4184, %r4180, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + max.f32 %r4185, %r4180, %r4184; + .loc 2 189 40 // standard.py:189:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4186, %r4185, 1, 31, -1; + cvt.u64.u32 %rd267, %r4183; + cvt.u64.u32 %rd268, %r4186; + shl.b64 %rd269, %rd268, 32; + or.b64 %rd270, %rd267, %rd269; + .loc 2 168 27 // standard.py:168:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mov.b64 {%r4187, %r4188}, %rd270; + max.f32 %r4189, %r4182, %r4187; + max.f32 %r4190, %r4185, %r4188; + .loc 1 421 27 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:421:27 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mov.b64 {%r4191, %r4192}, %rd292; + max.f32 %r4193, %r4192, %r4190; + max.f32 %r4194, %r4191, %r4189; + mov.b64 %rd292, {%r4194, %r4193}; + .loc 1 423 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:423:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.eq.f32 %p359, %r4194, 0fFF800000; + setp.eq.f32 %p360, %r4193, 0fFF800000; + .loc 1 424 51 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:424:51 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + selp.f32 %r4195, 0f00000000, %r4194, %p359; + selp.f32 %r4196, 0f00000000, %r4193, %p360; + .loc 1 428 31 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:428:31 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + sub.f32 %r4197, %r4191, %r4195; + sub.f32 %r4198, %r4192, %r4196; + .loc 1 428 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:428:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + ex2.approx.ftz.f32 %r4199, %r4197; + ex2.approx.ftz.f32 %r4200, %r4198; + .loc 1 429 39 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:429:39 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + sub.f32 %r4201, %r4088, %r4195; + sub.f32 %r4202, %r4090, %r4195; + sub.f32 %r4203, %r4092, %r4196; + sub.f32 %r4204, %r4094, %r4196; + sub.f32 %r4205, %r4096, %r4195; + sub.f32 %r4206, %r4098, %r4195; + sub.f32 %r4207, %r4100, %r4196; + sub.f32 %r4208, %r4102, %r4196; + sub.f32 %r4209, %r4104, %r4195; + sub.f32 %r4210, %r4106, %r4195; + sub.f32 %r4211, %r4108, %r4196; + sub.f32 %r4212, %r4110, %r4196; + sub.f32 %r4213, %r4112, %r4195; + sub.f32 %r4214, %r4114, %r4195; + sub.f32 %r4215, %r4116, %r4196; + sub.f32 %r4216, %r4118, %r4196; + sub.f32 %r4217, %r4120, %r4195; + sub.f32 %r4218, %r4122, %r4195; + sub.f32 %r4219, %r4124, %r4196; + sub.f32 %r4220, %r4126, %r4196; + sub.f32 %r4221, %r4128, %r4195; + sub.f32 %r4222, %r4130, %r4195; + sub.f32 %r4223, %r4132, %r4196; + sub.f32 %r4224, %r4134, %r4196; + sub.f32 %r4225, %r4136, %r4195; + sub.f32 %r4226, %r4138, %r4195; + sub.f32 %r4227, %r4140, %r4196; + sub.f32 %r4228, %r4142, %r4196; + sub.f32 %r4229, %r4144, %r4195; + sub.f32 %r4230, %r4146, %r4195; + sub.f32 %r4231, %r4148, %r4196; + sub.f32 %r4232, %r4150, %r4196; + .loc 1 429 21 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:429:21 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + ex2.approx.ftz.f32 %r4233, %r4201; + ex2.approx.ftz.f32 %r4234, %r4202; + ex2.approx.ftz.f32 %r4235, %r4203; + ex2.approx.ftz.f32 %r4236, %r4204; + ex2.approx.ftz.f32 %r4237, %r4205; + ex2.approx.ftz.f32 %r4238, %r4206; + ex2.approx.ftz.f32 %r4239, %r4207; + ex2.approx.ftz.f32 %r4240, %r4208; + ex2.approx.ftz.f32 %r4241, %r4209; + ex2.approx.ftz.f32 %r4242, %r4210; + ex2.approx.ftz.f32 %r4243, %r4211; + ex2.approx.ftz.f32 %r4244, %r4212; + ex2.approx.ftz.f32 %r4245, %r4213; + ex2.approx.ftz.f32 %r4246, %r4214; + ex2.approx.ftz.f32 %r4247, %r4215; + ex2.approx.ftz.f32 %r4248, %r4216; + ex2.approx.ftz.f32 %r4249, %r4217; + ex2.approx.ftz.f32 %r4250, %r4218; + ex2.approx.ftz.f32 %r4251, %r4219; + ex2.approx.ftz.f32 %r4252, %r4220; + ex2.approx.ftz.f32 %r4253, %r4221; + ex2.approx.ftz.f32 %r4254, %r4222; + ex2.approx.ftz.f32 %r4255, %r4223; + ex2.approx.ftz.f32 %r4256, %r4224; + ex2.approx.ftz.f32 %r4257, %r4225; + ex2.approx.ftz.f32 %r4258, %r4226; + ex2.approx.ftz.f32 %r4259, %r4227; + ex2.approx.ftz.f32 %r4260, %r4228; + ex2.approx.ftz.f32 %r4261, %r4229; + ex2.approx.ftz.f32 %r4262, %r4230; + ex2.approx.ftz.f32 %r4263, %r4231; + ex2.approx.ftz.f32 %r4264, %r4232; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.f32 %r4265, %r4233, %r4234; + add.f32 %r4266, %r4235, %r4236; + add.f32 %r4267, %r4265, %r4237; + add.f32 %r4268, %r4267, %r4238; + add.f32 %r4269, %r4266, %r4239; + add.f32 %r4270, %r4269, %r4240; + add.f32 %r4271, %r4268, %r4241; + add.f32 %r4272, %r4271, %r4242; + add.f32 %r4273, %r4270, %r4243; + add.f32 %r4274, %r4273, %r4244; + add.f32 %r4275, %r4272, %r4245; + add.f32 %r4276, %r4275, %r4246; + add.f32 %r4277, %r4274, %r4247; + add.f32 %r4278, %r4277, %r4248; + add.f32 %r4279, %r4276, %r4249; + add.f32 %r4280, %r4279, %r4250; + add.f32 %r4281, %r4278, %r4251; + add.f32 %r4282, %r4281, %r4252; + add.f32 %r4283, %r4280, %r4253; + add.f32 %r4284, %r4283, %r4254; + add.f32 %r4285, %r4282, %r4255; + add.f32 %r4286, %r4285, %r4256; + add.f32 %r4287, %r4284, %r4257; + add.f32 %r4288, %r4287, %r4258; + add.f32 %r4289, %r4286, %r4259; + add.f32 %r4290, %r4289, %r4260; + add.f32 %r4291, %r4288, %r4261; + add.f32 %r4292, %r4291, %r4262; + add.f32 %r4293, %r4290, %r4263; + add.f32 %r4294, %r4293, %r4264; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4295, %r4292, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.f32 %r4296, %r4292, %r4295; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4297, %r4296, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.f32 %r4298, %r4296, %r4297; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4299, %r4294, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.f32 %r4300, %r4294, %r4299; + .loc 2 291 36 // standard.py:291:36 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shfl.sync.bfly.b32 %r4301, %r4300, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.f32 %r4302, %r4300, %r4301; + .loc 1 434 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:434:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + fma.rn.f32 %r5060, %r5060, %r4199, %r4298; + fma.rn.f32 %r5061, %r5061, %r4200, %r4302; + .loc 1 436 16 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:436:16 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.f32 %r1702, %r1702, %r4199; + mul.f32 %r1703, %r1703, %r4199; + mul.f32 %r1704, %r1704, %r4200; + mul.f32 %r1705, %r1705, %r4200; + mul.f32 %r1706, %r1706, %r4199; + mul.f32 %r1707, %r1707, %r4199; + mul.f32 %r1708, %r1708, %r4200; + mul.f32 %r1709, %r1709, %r4200; + mul.f32 %r1710, %r1710, %r4199; + mul.f32 %r1711, %r1711, %r4199; + mul.f32 %r1712, %r1712, %r4200; + mul.f32 %r1713, %r1713, %r4200; + mul.f32 %r1714, %r1714, %r4199; + mul.f32 %r1715, %r1715, %r4199; + mul.f32 %r1716, %r1716, %r4200; + mul.f32 %r1717, %r1717, %r4200; + mul.f32 %r1718, %r1718, %r4199; + mul.f32 %r1719, %r1719, %r4199; + mul.f32 %r1720, %r1720, %r4200; + mul.f32 %r1721, %r1721, %r4200; + mul.f32 %r1722, %r1722, %r4199; + mul.f32 %r1723, %r1723, %r4199; + mul.f32 %r1724, %r1724, %r4200; + mul.f32 %r1725, %r1725, %r4200; + mul.f32 %r1726, %r1726, %r4199; + mul.f32 %r1727, %r1727, %r4199; + mul.f32 %r1728, %r1728, %r4200; + mul.f32 %r1729, %r1729, %r4200; + mul.f32 %r1730, %r1730, %r4199; + mul.f32 %r1731, %r1731, %r4199; + mul.f32 %r1732, %r1732, %r4200; + mul.f32 %r1733, %r1733, %r4200; + mul.f32 %r1734, %r1734, %r4199; + mul.f32 %r1735, %r1735, %r4199; + mul.f32 %r1736, %r1736, %r4200; + mul.f32 %r1737, %r1737, %r4200; + mul.f32 %r1738, %r1738, %r4199; + mul.f32 %r1739, %r1739, %r4199; + mul.f32 %r1740, %r1740, %r4200; + mul.f32 %r1741, %r1741, %r4200; + mul.f32 %r1742, %r1742, %r4199; + mul.f32 %r1743, %r1743, %r4199; + mul.f32 %r1744, %r1744, %r4200; + mul.f32 %r1745, %r1745, %r4200; + mul.f32 %r1746, %r1746, %r4199; + mul.f32 %r1747, %r1747, %r4199; + mul.f32 %r1748, %r1748, %r4200; + mul.f32 %r1749, %r1749, %r4200; + mul.f32 %r1750, %r1750, %r4199; + mul.f32 %r1751, %r1751, %r4199; + mul.f32 %r1752, %r1752, %r4200; + mul.f32 %r1753, %r1753, %r4200; + mul.f32 %r1754, %r1754, %r4199; + mul.f32 %r1755, %r1755, %r4199; + mul.f32 %r1756, %r1756, %r4200; + mul.f32 %r1757, %r1757, %r4200; + mul.f32 %r1758, %r1758, %r4199; + mul.f32 %r1759, %r1759, %r4199; + mul.f32 %r1760, %r1760, %r4200; + mul.f32 %r1761, %r1761, %r4200; + mul.f32 %r1762, %r1762, %r4199; + mul.f32 %r1763, %r1763, %r4199; + mul.f32 %r1764, %r1764, %r4200; + mul.f32 %r1765, %r1765, %r4200; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4303, %r836, 49152; + add.s32 %r4304, %r4303, %r4019; + .loc 1 440 22 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:440:22 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + cvt.rn.bf16x2.f32 %r3600, %r4234, %r4233; + cvt.rn.bf16x2.f32 %r3601, %r4236, %r4235; + cvt.rn.bf16x2.f32 %r3602, %r4238, %r4237; + cvt.rn.bf16x2.f32 %r3603, %r4240, %r4239; + cvt.rn.bf16x2.f32 %r3732, %r4242, %r4241; + cvt.rn.bf16x2.f32 %r3733, %r4244, %r4243; + cvt.rn.bf16x2.f32 %r3734, %r4246, %r4245; + cvt.rn.bf16x2.f32 %r3735, %r4248, %r4247; + cvt.rn.bf16x2.f32 %r3864, %r4250, %r4249; + cvt.rn.bf16x2.f32 %r3865, %r4252, %r4251; + cvt.rn.bf16x2.f32 %r3866, %r4254, %r4253; + cvt.rn.bf16x2.f32 %r3867, %r4256, %r4255; + cvt.rn.bf16x2.f32 %r3996, %r4258, %r4257; + cvt.rn.bf16x2.f32 %r3997, %r4260, %r4259; + cvt.rn.bf16x2.f32 %r3998, %r4262, %r4261; + cvt.rn.bf16x2.f32 %r3999, %r4264, %r4263; + .loc 1 440 44 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:440:44 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4305, %r4304, 4, 14; + cvt.u64.u32 %rd271, %r4305; + or.b64 %rd233, %rd271, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3600,%r3601,%r3602,%r3603}, %rd233, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4306, %r4304, 2048; + bfe.u32 %r4307, %r4306, 4, 14; + cvt.u64.u32 %rd272, %r4307; + or.b64 %rd234, %rd272, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3732,%r3733,%r3734,%r3735}, %rd234, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4308, %r4304, 4096; + bfe.u32 %r4309, %r4308, 4, 14; + cvt.u64.u32 %rd273, %r4309; + or.b64 %rd235, %rd273, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3864,%r3865,%r3866,%r3867}, %rd235, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4310, %r4304, 6144; + bfe.u32 %r4311, %r4310, 4, 14; + cvt.u64.u32 %rd274, %r4311; + or.b64 %rd236, %rd274, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3996,%r3997,%r3998,%r3999}, %rd236, %p328, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:548:26 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4980, %r4909, %r4980; + add.s32 %r4981, %r4909, %r4981; + add.s32 %r4982, %r4909, %r4982; + add.s32 %r4983, %r4909, %r4983; + add.s32 %r4984, %r4909, %r4984; + add.s32 %r4985, %r4909, %r4985; + add.s32 %r4986, %r4909, %r4986; + add.s32 %r4987, %r4909, %r4987; + add.s32 %r4988, %r4909, %r4988; + add.s32 %r4989, %r4909, %r4989; + add.s32 %r4990, %r4909, %r4990; + add.s32 %r4991, %r4909, %r4991; + add.s32 %r4992, %r4909, %r4992; + add.s32 %r4993, %r4909, %r4993; + add.s32 %r4994, %r4909, %r4994; + add.s32 %r4995, %r4909, %r4995; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r632, %r4977, 1; + .loc 1 247 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:247:33 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shr.u32 %r4312, %r632, 1; + .loc 1 248 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:248:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mad.wide.u32 %rd238, %r4312, 4, %rd172; + .loc 1 248 24 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:248:24 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + mov.u64 %rd237, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd237, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4000, 0x0; + @%p339 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4000 }, [ %rd238 + 0 ], %rd237; + // end inline asm + .loc 1 249 109 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:109 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4313, %r4312, 1; + .loc 1 249 113 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:113 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p361, %r4313, %r2725; + .loc 1 249 55 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:55 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd241, %rd238, 4; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + and.pred %p340, %p339, %p361; + .loc 1 249 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:249:25 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + mov.u64 %rd240, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd240, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4001, 0x0; + @%p340 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4001 }, [ %rd241 + 0 ], %rd240; + // end inline asm + .loc 1 250 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:250:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + and.b32 %r4314, %r4977, 1; + .loc 1 251 34 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:34 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + sub.s32 %r4315, %r4001, %r4000; + .loc 1 251 48 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:48 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r4316, %r4315, 7; + .loc 1 251 63 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:251:63 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4317, %r4316, -64; + .loc 1 252 29 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:29 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + xor.b32 %r4318, %r4314, 1; + .loc 1 252 61 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:61 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r4319, %r4314, 6; + .loc 1 252 42 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:252:42 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mad.lo.s32 %r4909, %r4317, %r4318, %r4319; + .loc 1 549 21 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:549:21 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4912, %r4909, %r4912; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4320, %r4911, 1; + setp.gt.s32 %p362, %r4320, 2; + selp.b32 %r4911, 0, %r4320, %p362; + .loc 1 342 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:342:32 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4321, %r4912, %r376; + .loc 1 346 35 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:346:35 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4322, %r4321, %r8; + add.s32 %r4323, %r4321, %r9; + add.s32 %r4324, %r4321, %r10; + add.s32 %r4325, %r4321, %r11; + .loc 1 284 38 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:38 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r4326, %r4322, 7; + shl.b32 %r4327, %r4323, 7; + shl.b32 %r4328, %r4324, 7; + shl.b32 %r4329, %r4325, 7; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + mul.wide.s32 %rd275, %r4326, 2; + add.s64 %rd243, %rd10, %rd275; + mul.wide.s32 %rd276, %r4327, 2; + add.s64 %rd244, %rd10, %rd276; + mul.wide.s32 %rd277, %r4328, 2; + add.s64 %rd245, %rd10, %rd277; + mul.wide.s32 %rd278, %r4329, 2; + add.s64 %rd246, %rd10, %rd278; + .loc 1 292 52 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:52 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.lt.s32 %p363, %r4322, %r712; + setp.lt.s32 %p364, %r4323, %r712; + setp.lt.s32 %p365, %r4324, %r712; + setp.lt.s32 %p366, %r4325, %r712; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + shl.b32 %r4330, %r4911, 14; + add.s32 %r4331, %r836, %r4330; + bar.sync 0; + add.s32 %r4002, %r4331, %r84; + selp.b32 %r4332, 16, 0, %p363; + selp.b32 %r4011, %r4332, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4002 + 0 ], [ %rd243 + 0 ], 0x10, %r4011; + // end inline asm + add.s32 %r4004, %r4002, 2048; + selp.b32 %r4333, 16, 0, %p364; + selp.b32 %r4013, %r4333, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4004 + 0 ], [ %rd244 + 0 ], 0x10, %r4013; + // end inline asm + add.s32 %r4006, %r4002, 4096; + selp.b32 %r4334, 16, 0, %p365; + selp.b32 %r4015, %r4334, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4006 + 0 ], [ %rd245 + 0 ], 0x10, %r4015; + // end inline asm + add.s32 %r4008, %r4002, 6144; + selp.b32 %r4335, 16, 0, %p366; + selp.b32 %r4017, %r4335, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4008 + 0 ], [ %rd246 + 0 ], 0x10, %r4017; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:284:49 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s64 %rd247, %rd11, %rd275; + add.s64 %rd248, %rd11, %rd276; + add.s64 %rd249, %rd11, %rd277; + add.s64 %rd250, %rd11, %rd278; + .loc 1 292 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:292:23 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + add.s32 %r4336, %r4303, %r4330; + add.s32 %r4010, %r4336, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r4010 + 0 ], [ %rd247 + 0 ], 0x10, %r4011; + // end inline asm + add.s32 %r4012, %r4010, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4012 + 0 ], [ %rd248 + 0 ], 0x10, %r4013; + // end inline asm + add.s32 %r4014, %r4010, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4014 + 0 ], [ %rd249 + 0 ], 0x10, %r4015; + // end inline asm + add.s32 %r4016, %r4010, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4016 + 0 ], [ %rd250 + 0 ], 0x10, %r4017; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + setp.ne.b32 %p367, %r461, %r632; + mov.b32 %r4977, %r632; + @%p367 bra $L__BB0_5; +$L__BB0_6: // %._crit_edge461 + .loc 1 0 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0:40 + cvt.u32.u64 %r4537, %rd3; + .loc 1 502 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:502:40 @[ cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:198:45 ] + // begin inline asm + // wait for regs: %r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp12: + .loc 1 206 26 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:206:26 + setp.eq.f32 %p376, %r5060, 0f00000000; + setp.eq.f32 %p377, %r5061, 0f00000000; + .loc 1 206 34 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:206:34 + selp.f32 %r4538, 0f3F800000, %r5060, %p376; + selp.f32 %r702, 0f3F800000, %r5061, %p377; + .loc 1 208 16 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:208:16 + div.full.f32 %r4539, %r1702, %r4538; + div.full.f32 %r4540, %r1703, %r4538; + div.full.f32 %r4541, %r1704, %r702; + div.full.f32 %r4542, %r1705, %r702; + div.full.f32 %r4543, %r1706, %r4538; + div.full.f32 %r4544, %r1707, %r4538; + div.full.f32 %r4545, %r1708, %r702; + div.full.f32 %r4546, %r1709, %r702; + div.full.f32 %r4547, %r1710, %r4538; + div.full.f32 %r4548, %r1711, %r4538; + div.full.f32 %r4549, %r1712, %r702; + div.full.f32 %r4550, %r1713, %r702; + div.full.f32 %r4551, %r1714, %r4538; + div.full.f32 %r4552, %r1715, %r4538; + div.full.f32 %r4553, %r1716, %r702; + div.full.f32 %r4554, %r1717, %r702; + div.full.f32 %r4555, %r1718, %r4538; + div.full.f32 %r4556, %r1719, %r4538; + div.full.f32 %r4557, %r1720, %r702; + div.full.f32 %r4558, %r1721, %r702; + div.full.f32 %r4559, %r1722, %r4538; + div.full.f32 %r4560, %r1723, %r4538; + div.full.f32 %r4561, %r1724, %r702; + div.full.f32 %r4562, %r1725, %r702; + div.full.f32 %r4563, %r1726, %r4538; + div.full.f32 %r4564, %r1727, %r4538; + div.full.f32 %r4565, %r1728, %r702; + div.full.f32 %r4566, %r1729, %r702; + div.full.f32 %r4567, %r1730, %r4538; + div.full.f32 %r4568, %r1731, %r4538; + div.full.f32 %r4569, %r1732, %r702; + div.full.f32 %r4570, %r1733, %r702; + div.full.f32 %r4571, %r1734, %r4538; + div.full.f32 %r4572, %r1735, %r4538; + div.full.f32 %r4573, %r1736, %r702; + div.full.f32 %r4574, %r1737, %r702; + div.full.f32 %r4575, %r1738, %r4538; + div.full.f32 %r4576, %r1739, %r4538; + div.full.f32 %r4577, %r1740, %r702; + div.full.f32 %r4578, %r1741, %r702; + div.full.f32 %r4579, %r1742, %r4538; + div.full.f32 %r4580, %r1743, %r4538; + div.full.f32 %r4581, %r1744, %r702; + div.full.f32 %r4582, %r1745, %r702; + div.full.f32 %r4583, %r1746, %r4538; + div.full.f32 %r4584, %r1747, %r4538; + div.full.f32 %r4585, %r1748, %r702; + div.full.f32 %r4586, %r1749, %r702; + div.full.f32 %r4587, %r1750, %r4538; + div.full.f32 %r4588, %r1751, %r4538; + div.full.f32 %r4589, %r1752, %r702; + div.full.f32 %r4590, %r1753, %r702; + div.full.f32 %r4591, %r1754, %r4538; + div.full.f32 %r4592, %r1755, %r4538; + div.full.f32 %r4593, %r1756, %r702; + div.full.f32 %r4594, %r1757, %r702; + div.full.f32 %r4595, %r1758, %r4538; + div.full.f32 %r4596, %r1759, %r4538; + div.full.f32 %r4597, %r1760, %r702; + div.full.f32 %r4598, %r1761, %r702; + div.full.f32 %r4599, %r1762, %r4538; + div.full.f32 %r4600, %r1763, %r4538; + div.full.f32 %r4601, %r1764, %r702; + div.full.f32 %r4602, %r1765, %r702; + .loc 1 218 49 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:218:49 + or.b32 %r4603, %r4537, %r4; + .loc 1 218 62 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:218:62 + or.b32 %r4604, %r4603, %r3; + .loc 1 218 75 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:218:75 + add.s32 %r4605, %r4604, %r20; + add.s32 %r4606, %r4604, %r21; + add.s32 %r4607, %r4604, %r22; + add.s32 %r4608, %r4604, %r23; + add.s32 %r4609, %r4604, %r24; + add.s32 %r4610, %r4604, %r25; + add.s32 %r4611, %r4604, %r26; + add.s32 %r4612, %r4604, %r27; + .loc 1 218 25 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:218:25 + mad.wide.s32 %rd279, %r4605, 2, %rd38; + mad.wide.s32 %rd280, %r4606, 2, %rd38; + mad.wide.s32 %rd281, %r4607, 2, %rd38; + mad.wide.s32 %rd282, %r4608, 2, %rd38; + mad.wide.s32 %rd283, %r4609, 2, %rd38; + mad.wide.s32 %rd284, %r4610, 2, %rd38; + mad.wide.s32 %rd285, %r4611, 2, %rd38; + mad.wide.s32 %rd286, %r4612, 2, %rd38; + .loc 1 218 109 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:218:109 + cvt.rn.bf16x2.f32 %r4613, %r4540, %r4539; + cvt.rn.bf16x2.f32 %r4614, %r4542, %r4541; + cvt.rn.bf16x2.f32 %r4615, %r4544, %r4543; + cvt.rn.bf16x2.f32 %r4616, %r4546, %r4545; + cvt.rn.bf16x2.f32 %r4617, %r4548, %r4547; + cvt.rn.bf16x2.f32 %r4618, %r4550, %r4549; + cvt.rn.bf16x2.f32 %r4619, %r4552, %r4551; + cvt.rn.bf16x2.f32 %r4620, %r4554, %r4553; + cvt.rn.bf16x2.f32 %r4621, %r4556, %r4555; + cvt.rn.bf16x2.f32 %r4622, %r4558, %r4557; + cvt.rn.bf16x2.f32 %r4623, %r4560, %r4559; + cvt.rn.bf16x2.f32 %r4624, %r4562, %r4561; + cvt.rn.bf16x2.f32 %r4625, %r4564, %r4563; + cvt.rn.bf16x2.f32 %r4626, %r4566, %r4565; + cvt.rn.bf16x2.f32 %r4627, %r4568, %r4567; + cvt.rn.bf16x2.f32 %r4628, %r4570, %r4569; + cvt.rn.bf16x2.f32 %r4629, %r4572, %r4571; + cvt.rn.bf16x2.f32 %r4630, %r4574, %r4573; + cvt.rn.bf16x2.f32 %r4631, %r4576, %r4575; + cvt.rn.bf16x2.f32 %r4632, %r4578, %r4577; + cvt.rn.bf16x2.f32 %r4633, %r4580, %r4579; + cvt.rn.bf16x2.f32 %r4634, %r4582, %r4581; + cvt.rn.bf16x2.f32 %r4635, %r4584, %r4583; + cvt.rn.bf16x2.f32 %r4636, %r4586, %r4585; + cvt.rn.bf16x2.f32 %r4637, %r4588, %r4587; + cvt.rn.bf16x2.f32 %r4638, %r4590, %r4589; + cvt.rn.bf16x2.f32 %r4639, %r4592, %r4591; + cvt.rn.bf16x2.f32 %r4640, %r4594, %r4593; + cvt.rn.bf16x2.f32 %r4641, %r4596, %r4595; + cvt.rn.bf16x2.f32 %r4642, %r4598, %r4597; + cvt.rn.bf16x2.f32 %r4643, %r4600, %r4599; + cvt.rn.bf16x2.f32 %r4644, %r4602, %r4601; + shl.b32 %r4645, %r35, 13; + shl.b32 %r4646, %r6, 5; + and.b32 %r4647, %r4646, 7264; + and.b32 %r4648, %r6, 24; + shl.b32 %r4649, %r4648, 4; + shl.b32 %r4650, %r6, 2; + and.b32 %r4651, %r4650, 16; + or.b32 %r4652, %r4645, %r4651; + or.b32 %r4653, %r4647, %r4649; + or.b32 %r4654, %r4652, %r4653; + add.s32 %r4656, %r836, %r4654; + st.shared.v4.b32 [%r4656], {%r4613, %r4615, %r4617, %r4619}; + st.shared.v4.b32 [%r4656+512], {%r4614, %r4616, %r4618, %r4620}; + xor.b32 %r4657, %r4654, 32; + add.s32 %r4658, %r836, %r4657; + st.shared.v4.b32 [%r4658], {%r4621, %r4623, %r4625, %r4627}; + st.shared.v4.b32 [%r4658+512], {%r4622, %r4624, %r4626, %r4628}; + xor.b32 %r4659, %r4654, 64; + add.s32 %r4660, %r836, %r4659; + st.shared.v4.b32 [%r4660], {%r4629, %r4631, %r4633, %r4635}; + st.shared.v4.b32 [%r4660+512], {%r4630, %r4632, %r4634, %r4636}; + xor.b32 %r4661, %r4654, 96; + add.s32 %r4662, %r836, %r4661; + st.shared.v4.b32 [%r4662], {%r4637, %r4639, %r4641, %r4643}; + st.shared.v4.b32 [%r4662+512], {%r4638, %r4640, %r4642, %r4644}; + bar.sync 0; + shl.b32 %r4663, %r4648, 10; + shl.b32 %r4664, %r35, 5; + and.b32 %r703, %r6, 252; + shl.b32 %r4665, %r703, 2; + or.b32 %r4666, %r4663, %r4664; + xor.b32 %r4667, %r4666, %r4665; + add.s32 %r4469, %r836, %r4667; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4465, %r4466, %r4467, %r4468}, [%r4469]; + // end inline asm + add.s32 %r4474, %r4469, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4470, %r4471, %r4472, %r4473}, [%r4474]; + // end inline asm + add.s32 %r4479, %r4469, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4475, %r4476, %r4477, %r4478}, [%r4479]; + // end inline asm + add.s32 %r4484, %r4469, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4480, %r4481, %r4482, %r4483}, [%r4484]; + // end inline asm + add.s32 %r4489, %r4469, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4485, %r4486, %r4487, %r4488}, [%r4489]; + // end inline asm + add.s32 %r4494, %r4469, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4490, %r4491, %r4492, %r4493}, [%r4494]; + // end inline asm + add.s32 %r4499, %r4469, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4495, %r4496, %r4497, %r4498}, [%r4499]; + // end inline asm + add.s32 %r4504, %r4469, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4500, %r4501, %r4502, %r4503}, [%r4504]; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd279 + 0 ], { %r4465, %r4466, %r4467, %r4468 }; + // end inline asm + // begin inline asm + @%p2 st.global.v4.b32 [ %rd280 + 0 ], { %r4470, %r4471, %r4472, %r4473 }; + // end inline asm + // begin inline asm + @%p3 st.global.v4.b32 [ %rd281 + 0 ], { %r4475, %r4476, %r4477, %r4478 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd282 + 0 ], { %r4480, %r4481, %r4482, %r4483 }; + // end inline asm + // begin inline asm + @%p5 st.global.v4.b32 [ %rd283 + 0 ], { %r4485, %r4486, %r4487, %r4488 }; + // end inline asm + // begin inline asm + @%p6 st.global.v4.b32 [ %rd284 + 0 ], { %r4490, %r4491, %r4492, %r4493 }; + // end inline asm + // begin inline asm + @%p7 st.global.v4.b32 [ %rd285 + 0 ], { %r4495, %r4496, %r4497, %r4498 }; + // end inline asm + // begin inline asm + @%p8 st.global.v4.b32 [ %rd286 + 0 ], { %r4500, %r4501, %r4502, %r4503 }; + // end inline asm + .loc 1 223 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:223:33 + setp.lt.f32 %p378, %r4538, 0f00800000; + mul.f32 %r4668, %r4538, 0f4B000000; + selp.f32 %r704, %r4668, %r4538, %p378; + selp.f32 %r4669, 0fC1B80000, 0f00000000, %p378; + add.s32 %r4670, %r704, -1060439283; + and.b32 %r4671, %r4670, -8388608; + sub.s32 %r4672, %r704, %r4671; + cvt.rn.f32.s32 %r4673, %r4671; + mov.b32 %r4674, 0f34000000; + fma.rn.ftz.f32 %r4675, %r4673, %r4674, %r4669; + add.f32 %r4676, %r4672, 0fBF800000; + mov.b32 %r4677, 0fBE2C7F30; + mov.b32 %r4678, 0f3DC6B27F; + fma.rn.ftz.f32 %r4679, %r4678, %r4676, %r4677; + mov.b32 %r4680, 0f3E2FCF2A; + fma.rn.ftz.f32 %r4681, %r4679, %r4676, %r4680; + mov.b32 %r4682, 0fBE374E43; + fma.rn.ftz.f32 %r4683, %r4681, %r4676, %r4682; + mov.b32 %r4684, 0f3E520BF4; + fma.rn.ftz.f32 %r4685, %r4683, %r4676, %r4684; + mov.b32 %r4686, 0fBE763C8B; + fma.rn.ftz.f32 %r4687, %r4685, %r4676, %r4686; + mov.b32 %r4688, 0f3E93BF99; + fma.rn.ftz.f32 %r4689, %r4687, %r4676, %r4688; + mov.b32 %r4690, 0fBEB8AA49; + fma.rn.ftz.f32 %r4691, %r4689, %r4676, %r4690; + mov.b32 %r4692, 0f3EF6384A; + fma.rn.ftz.f32 %r4693, %r4691, %r4676, %r4692; + mov.b32 %r4694, 0fBF38AA3B; + fma.rn.ftz.f32 %r4695, %r4693, %r4676, %r4694; + mul.f32 %r4696, %r4676, %r4695; + mul.f32 %r4697, %r4676, %r4696; + mov.b32 %r4698, 0f3FB8AA3B; + fma.rn.ftz.f32 %r4699, %r4676, %r4698, %r4697; + add.f32 %r5062, %r4675, %r4699; + setp.lt.u32 %p379, %r704, 2139095040; + mov.b32 %r4700, 0f7F800000; + @%p379 bra $L__BB0_8; +// %bb.7: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0:33 + fma.rn.ftz.f32 %r5062, %r704, %r4700, %r4700; +$L__BB0_8: // %__nv_log2f.exit + ld.param.b64 %rd35, [triton_tem_fused_0_param_3]; + .loc 1 223 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:223:33 + setp.lt.f32 %p380, %r702, 0f00800000; + mul.f32 %r4701, %r702, 0f4B000000; + selp.f32 %r708, %r4701, %r702, %p380; + selp.f32 %r4702, 0fC1B80000, 0f00000000, %p380; + add.s32 %r4703, %r708, -1060439283; + and.b32 %r4704, %r4703, -8388608; + sub.s32 %r4705, %r708, %r4704; + cvt.rn.f32.s32 %r4706, %r4704; + fma.rn.ftz.f32 %r4708, %r4706, %r4674, %r4702; + add.f32 %r4709, %r4705, 0fBF800000; + fma.rn.ftz.f32 %r4712, %r4678, %r4709, %r4677; + fma.rn.ftz.f32 %r4714, %r4712, %r4709, %r4680; + fma.rn.ftz.f32 %r4716, %r4714, %r4709, %r4682; + fma.rn.ftz.f32 %r4718, %r4716, %r4709, %r4684; + fma.rn.ftz.f32 %r4720, %r4718, %r4709, %r4686; + fma.rn.ftz.f32 %r4722, %r4720, %r4709, %r4688; + fma.rn.ftz.f32 %r4724, %r4722, %r4709, %r4690; + fma.rn.ftz.f32 %r4726, %r4724, %r4709, %r4692; + fma.rn.ftz.f32 %r4728, %r4726, %r4709, %r4694; + mul.f32 %r4729, %r4709, %r4728; + mul.f32 %r4730, %r4709, %r4729; + fma.rn.ftz.f32 %r4732, %r4709, %r4698, %r4730; + add.f32 %r5063, %r4708, %r4732; + setp.lt.u32 %p381, %r708, 2139095040; + @%p381 bra $L__BB0_10; +// %bb.9: // %__nv_fmaf_rn.exit.i.i384 + .loc 1 0 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:0:33 + fma.rn.ftz.f32 %r5063, %r708, %r4700, %r4700; +$L__BB0_10: // %__nv_log2f.exit387 + .loc 1 223 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:223:33 + setp.eq.f32 %p383, %r708, 0f00000000; + setp.eq.f32 %p384, %r704, 0f00000000; + .loc 1 222 32 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:222:32 + shl.b32 %r4735, %r1, 16; + shl.b32 %r4736, %r2, 11; + add.s32 %r4737, %r4735, %r4736; + .loc 1 222 23 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:222:23 + mad.wide.s32 %rd288, %r4737, 4, %rd35; + .loc 1 144 46 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:144:46 + and.b32 %r4738, %r6, 127; + .loc 1 144 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:144:33 + or.b32 %r4739, %r5, %r4738; + .loc 1 222 40 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:222:40 + mad.wide.s32 %rd287, %r4739, 4, %rd288; + .loc 1 223 33 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:223:33 + selp.f32 %r4740, 0fFF800000, %r5062, %p384; + selp.f32 %r4741, 0fFF800000, %r5063, %p383; + .loc 1 223 20 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:223:20 + mov.b64 {%r4742, %r4743}, %rd292; + add.f32 %r4744, %r4743, %r4741; + add.f32 %r4745, %r4742, %r4740; + .loc 1 227 48 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:227:48 + setp.lt.s32 %p385, %r4739, 2048; + .loc 1 227 29 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:227:29 + bar.sync 0; + shl.b32 %r4746, %r703, 1; + add.s32 %r4748, %r836, %r4746; + st.shared.v2.b32 [%r4748], {%r4745, %r4744}; + bar.sync 0; + shl.b32 %r4749, %r28, 3; + shl.b32 %r4750, %r29, 2; + shr.u32 %r4751, %r30, 1; + add.s32 %r4752, %r836, %r4749; + add.s32 %r4753, %r4752, %r4750; + add.s32 %r4754, %r4753, %r4751; + ld.shared.b32 %r4734, [%r4754]; + and.b32 %r4755, %r6, 128; + setp.eq.b32 %p386, %r4755, 0; + and.pred %p382, %p386, %p385; + // begin inline asm + @%p382 st.global.b32 [ %rd287 + 0 ], { %r4734 }; + // end inline asm + .loc 1 229 4 // cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py:229:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 275 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 115 +.b8 55 +.b8 113 +.b8 104 +.b8 106 +.b8 108 +.b8 116 +.b8 51 +.b8 113 +.b8 97 +.b8 103 +.b8 119 +.b8 121 +.b8 121 +.b8 105 +.b8 99 +.b8 50 +.b8 111 +.b8 105 +.b8 121 +.b8 111 +.b8 115 +.b8 116 +.b8 52 +.b8 109 +.b8 122 +.b8 106 +.b8 116 +.b8 98 +.b8 121 +.b8 113 +.b8 117 +.b8 99 +.b8 54 +.b8 109 +.b8 117 +.b8 103 +.b8 103 +.b8 121 +.b8 122 +.b8 117 +.b8 100 +.b8 119 +.b8 122 +.b8 103 +.b8 50 +.b8 117 +.b8 52 +.b8 118 +.b8 98 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 115 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x15 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa0:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 146 // DW_AT_call_line +.b8 101 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xcd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 154 // DW_AT_call_line +.b8 92 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xe5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 172 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 198 // DW_AT_call_line +.b8 45 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.source new file mode 100644 index 0000000000000000000000000000000000000000..e3b41721e9d46e66096919ea541d3db59eb7c0a9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.source @@ -0,0 +1,1200 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":18:0) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":271:0) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":448:0) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":299:0) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":256:0) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc232 = loc(unknown) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc239 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc243 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":240:0) +#loc268 = loc("arg_Q"(#loc)) +#loc269 = loc("arg_K"(#loc)) +#loc270 = loc("arg_V"(#loc)) +#loc271 = loc("arg_LSE"(#loc)) +#loc272 = loc("arg_MAX"(#loc)) +#loc273 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc274 = loc("arg_KV_IDX"(#loc)) +#loc275 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc276 = loc("arg_FULL_KV_IDX"(#loc)) +#loc277 = loc("in_ptr9"(#loc)) +#loc278 = loc("out_ptr0"(#loc)) +#loc279 = loc("ks0"(#loc)) +#loc280 = loc("ks1"(#loc)) +#loc377 = loc("ptr"(#loc125)) +#loc378 = loc("offs_m"(#loc125)) +#loc379 = loc("offs_n"(#loc125)) +#loc380 = loc("stride_m"(#loc125)) +#loc381 = loc("stride_n"(#loc125)) +#loc382 = loc("M_LEN"(#loc125)) +#loc389 = loc("x"(#loc137)) +#loc390 = loc("arg_Q"(#loc143)) +#loc391 = loc("arg_K"(#loc143)) +#loc392 = loc("arg_V"(#loc143)) +#loc393 = loc("arg_LSE"(#loc143)) +#loc394 = loc("arg_MAX"(#loc143)) +#loc395 = loc("arg_KV_NUM_BLKS"(#loc143)) +#loc396 = loc("arg_KV_IDX"(#loc143)) +#loc397 = loc("arg_FULL_KV_NUM_BLKS"(#loc143)) +#loc398 = loc("arg_FULL_KV_IDX"(#loc143)) +#loc399 = loc("in_ptr9"(#loc143)) +#loc400 = loc("out_ptr0"(#loc143)) +#loc401 = loc("ks0"(#loc143)) +#loc402 = loc("ks1"(#loc143)) +#loc403 = loc("q"(#loc143)) +#loc404 = loc("K"(#loc143)) +#loc405 = loc("V"(#loc143)) +#loc406 = loc("Q_LEN"(#loc143)) +#loc407 = loc("KV_LEN"(#loc143)) +#loc408 = loc("acc"(#loc143)) +#loc409 = loc("l_i"(#loc143)) +#loc410 = loc("m_i"(#loc143)) +#loc411 = loc("off_z"(#loc143)) +#loc412 = loc("off_h"(#loc143)) +#loc413 = loc("offs_m"(#loc143)) +#loc414 = loc("offs_n"(#loc143)) +#loc415 = loc("kv_start"(#loc143)) +#loc416 = loc("kv_indices"(#loc143)) +#loc417 = loc("kv_num_blocks"(#loc143)) +#loc418 = loc("block_n_end"(#loc143)) +#loc419 = loc("stride_kk"(#loc143)) +#loc420 = loc("stride_kn"(#loc143)) +#loc421 = loc("stride_vn"(#loc143)) +#loc422 = loc("stride_vk"(#loc143)) +#loc428 = loc("arg_Q"(#loc153)) +#loc429 = loc("arg_K"(#loc153)) +#loc430 = loc("arg_V"(#loc153)) +#loc431 = loc("arg_LSE"(#loc153)) +#loc432 = loc("arg_MAX"(#loc153)) +#loc433 = loc("arg_KV_NUM_BLKS"(#loc153)) +#loc434 = loc("arg_KV_IDX"(#loc153)) +#loc435 = loc("arg_FULL_KV_NUM_BLKS"(#loc153)) +#loc436 = loc("arg_FULL_KV_IDX"(#loc153)) +#loc437 = loc("in_ptr9"(#loc153)) +#loc438 = loc("out_ptr0"(#loc153)) +#loc439 = loc("ks0"(#loc153)) +#loc440 = loc("ks1"(#loc153)) +#loc441 = loc("q"(#loc153)) +#loc442 = loc("K"(#loc153)) +#loc443 = loc("V"(#loc153)) +#loc444 = loc("Q_LEN"(#loc153)) +#loc445 = loc("KV_LEN"(#loc153)) +#loc446 = loc("acc"(#loc153)) +#loc447 = loc("l_i"(#loc153)) +#loc448 = loc("m_i"(#loc153)) +#loc449 = loc("off_z"(#loc153)) +#loc450 = loc("off_h"(#loc153)) +#loc451 = loc("offs_m"(#loc153)) +#loc452 = loc("offs_n"(#loc153)) +#loc453 = loc("kv_start"(#loc153)) +#loc454 = loc("kv_offset"(#loc153)) +#loc455 = loc("stride_kk"(#loc153)) +#loc456 = loc("stride_kn"(#loc153)) +#loc457 = loc("stride_vn"(#loc153)) +#loc458 = loc("stride_vk"(#loc153)) +#loc529 = loc("indices"(#loc226)) +#loc530 = loc("max_len"(#loc226)) +#loc531 = loc("input"(#loc230)) +#loc532 = loc("a"(#loc235)) +#loc533 = loc("b"(#loc235)) +#loc534 = loc("input"(#loc239)) +#loc535 = loc("a"(#loc243)) +#loc536 = loc("b"(#loc243)) +#loc537 = loc("loop_iter"(#loc247)) +#loc538 = loc("col_indices"(#loc247)) +#loc539 = loc("total_blocks"(#loc247)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c1024_i32_0 = arith.constant 1024 : i32 loc(#loc2) + %0 = arith.muli %c1024_i32_0, %ks0 : i32 loc(#loc2) + %c128_i32_1 = arith.constant 128 : i32 loc(#loc3) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc3) + %1 = arith.muli %c128_i32_2, %ks0 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c1_i32_4 = arith.constant 1 : i32 loc(#loc4) + %c1024_i32_5 = arith.constant 1024 : i32 loc(#loc5) + %c1024_i32_6 = arith.constant 1024 : i32 loc(#loc5) + %2 = arith.muli %c1024_i32_6, %ks0 : i32 loc(#loc5) + %c128_i32_7 = arith.constant 128 : i32 loc(#loc6) + %c128_i32_8 = arith.constant 128 : i32 loc(#loc6) + %3 = arith.muli %c128_i32_8, %ks0 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c1_i32_10 = arith.constant 1 : i32 loc(#loc7) + %ZQ = arith.constant 2 : i32 loc(#loc281) + %HQ = arith.constant 32 : i32 loc(#loc282) + %Q_LEN = arith.constant 2048 : i32 loc(#loc283) + %ZKV = arith.constant 2 : i32 loc(#loc284) + %q_start = tt.get_program_id x : i32 loc(#loc285) + %off_zq = tt.get_program_id y : i32 loc(#loc286) + %off_hq = tt.get_program_id z : i32 loc(#loc287) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc288) + %off_hkv = arith.constant 4 : i32 loc(#loc289) + %off_hkv_11 = arith.constant 4 : i32 loc(#loc289) + %off_hkv_12 = arith.divsi %off_hq, %off_hkv_11 : i32 loc(#loc289) + %off_g = arith.constant 4 : i32 loc(#loc290) + %off_g_13 = arith.constant 4 : i32 loc(#loc290) + %off_g_14 = arith.remsi %off_hq, %off_g_13 : i32 loc(#loc290) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc291) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc292) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc293) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc294) + %k_offset_17 = arith.muli %off_hkv_12, %1 : i32 loc(#loc295) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc296) + %v_offset = arith.muli %off_zkv, %2 : i32 loc(#loc297) + %v_offset_19 = arith.muli %off_hkv_12, %3 : i32 loc(#loc298) + %v_offset_20 = arith.addi %v_offset, %v_offset_19 : i32 loc(#loc299) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc300) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc301) + %V = tt.addptr %arg_V, %v_offset_20 : !tt.ptr, i32 loc(#loc302) + %SPARSE_Z = arith.constant 2 : i32 loc(#loc303) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc304) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc305) + %sparse_idx_hq = arith.remsi %off_hq, %SPARSE_HQ : i32 loc(#loc306) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc307) + %stride_kv_idx_h = arith.constant 16 : i32 loc(#loc308) + %stride_kv_idx_h_21 = arith.constant 16 : i32 loc(#loc308) + %stride_kv_idx_h_22 = arith.muli %stride_kv_idx_h_21, %ks1 : i32 loc(#loc308) + %m_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc309) + %m_i_23 = arith.constant 0x7F800000 : f32 loc(#loc310) + %m_i_24 = arith.constant 0x7F800000 : f32 loc(#loc310) + %m_i_25 = arith.constant dense<0x7F800000> : tensor<128xf32> loc(#loc310) + %m_i_26 = arith.subf %m_i, %m_i_25 : tensor<128xf32> loc(#loc310) + %l_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc311) + %acc = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc312) + %offs_m = arith.constant 128 : i32 loc(#loc313) + %offs_m_27 = arith.constant 128 : i32 loc(#loc313) + %offs_m_28 = arith.muli %q_start, %offs_m_27 : i32 loc(#loc313) + %offs_m_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc314) + %offs_m_30 = tt.splat %offs_m_28 : i32 -> tensor<128xi32> loc(#loc315) + %offs_m_31 = arith.addi %offs_m_30, %offs_m_29 : tensor<128xi32> loc(#loc315) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc316) + %sparse_hz_offset_32 = arith.addi %sparse_hz_offset, %sparse_idx_hq : i32 loc(#loc317) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_32, %stride_kv_num_blks_h : i32 loc(#loc318) + %sparse_kv_num_blks_offset_33 = arith.constant 1 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_34 = arith.constant 1 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_35 = arith.divsi %q_start, %sparse_kv_num_blks_offset_34 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_36 = arith.addi %sparse_kv_num_blks_offset, %sparse_kv_num_blks_offset_35 : i32 loc(#loc320) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_32, %stride_kv_idx_h_22 : i32 loc(#loc321) + %sparse_kv_idx_offset_37 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_idx_offset_38 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_idx_offset_39 = arith.divsi %q_start, %sparse_kv_idx_offset_38 : i32 loc(#loc322) + %sparse_kv_idx_offset_40 = arith.muli %sparse_kv_idx_offset_39, %ks1 : i32 loc(#loc323) + %sparse_kv_idx_offset_41 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_40 : i32 loc(#loc324) + %offs_m_42 = arith.constant 128 : i32 loc(#loc325) + %offs_m_43 = arith.constant 128 : i32 loc(#loc325) + %offs_m_44 = arith.muli %q_start, %offs_m_43 : i32 loc(#loc325) + %offs_m_45 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc326) + %offs_m_46 = tt.splat %offs_m_44 : i32 -> tensor<128xi32> loc(#loc327) + %offs_m_47 = arith.addi %offs_m_46, %offs_m_45 : tensor<128xi32> loc(#loc327) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc328) + %q = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q, %offs_m_47, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc329) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc330) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc331) + %kv_start_48 = arith.constant 128 : i32 loc(#loc332) + %kv_start_49 = arith.constant 128 : i32 loc(#loc332) + %kv_start_50 = arith.muli %kv_start, %kv_start_49 : i32 loc(#loc332) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc333) + %kv_num_blocks_51 = tt.load %kv_num_blocks : !tt.ptr loc(#loc334) + %block_n_end = arith.constant 2 : i32 loc(#loc335) + %block_n_end_52 = arith.constant 2 : i32 loc(#loc335) + %block_n_end_53 = arith.muli %kv_num_blocks_51, %block_n_end_52 : i32 loc(#loc335) + %block_n_end_54 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc336) + %block_n_end_55 = arith.constant 1 : i32 loc(#loc337) + %block_n_end_56 = arith.maxsi %block_n_end_54, %block_n_end_55 : i32 loc(#loc337) + %block_n_end_57 = arith.minsi %block_n_end_53, %block_n_end_56 : i32 loc(#loc338) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc339) + %offs_n_58 = tt.splat %kv_start_50 : i32 -> tensor<64xi32> loc(#loc340) + %offs_n_59 = arith.addi %offs_n_58, %offs_n : tensor<64xi32> loc(#loc340) + %4 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc68) + %5 = tt.expand_dims %offs_n_59 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc69) + %6:3 = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %ks0, %acc, %l_i, %m_i_26, %off_zq, %off_hq, %4, %5, %kv_start_50, %kv_indices, %kv_num_blocks_51, %block_n_end_57, %c1_i32_4, %c128_i32_3, %c128_i32_9, %c1_i32_10) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc70) + %kv_indices_60 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc341) + %kv_start_61 = tt.load %kv_indices_60 : !tt.ptr loc(#loc342) + %kv_start_62 = arith.constant 128 : i32 loc(#loc343) + %kv_start_63 = arith.constant 128 : i32 loc(#loc343) + %kv_start_64 = arith.muli %kv_start_61, %kv_start_63 : i32 loc(#loc343) + %kv_num_blocks_65 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc344) + %kv_num_blocks_66 = tt.load %kv_num_blocks_65 : !tt.ptr loc(#loc345) + %block_n_end_67 = arith.constant 2 : i32 loc(#loc346) + %block_n_end_68 = arith.constant 2 : i32 loc(#loc346) + %block_n_end_69 = arith.muli %kv_num_blocks_66, %block_n_end_68 : i32 loc(#loc346) + %block_n_end_70 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc347) + %block_n_end_71 = arith.constant 1 : i32 loc(#loc348) + %block_n_end_72 = arith.maxsi %block_n_end_70, %block_n_end_71 : i32 loc(#loc348) + %block_n_end_73 = arith.minsi %block_n_end_69, %block_n_end_72 : i32 loc(#loc349) + %offs_n_74 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc350) + %offs_n_75 = tt.splat %kv_start_64 : i32 -> tensor<64xi32> loc(#loc351) + %offs_n_76 = arith.addi %offs_n_75, %offs_n_74 : tensor<64xi32> loc(#loc351) + %7 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc82) + %8 = tt.expand_dims %offs_n_76 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc83) + %9:3 = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %ks0, %6#0, %6#1, %6#2, %off_zq, %off_hq, %7, %8, %kv_start_64, %kv_indices_60, %kv_num_blocks_66, %block_n_end_73, %c1_i32_4, %c128_i32_3, %c128_i32_9, %c1_i32_10) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc84) + %l_i_77 = arith.constant 0.000000e+00 : f32 loc(#loc352) + %l_i_78 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc352) + %l_i_79 = arith.cmpf oeq, %9#1, %l_i_78 : tensor<128xf32> loc(#loc352) + %l_i_80 = arith.constant 1 : i32 loc(#loc353) + %l_i_81 = arith.constant 1.000000e+00 : f32 loc(#loc353) + %l_i_82 = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc353) + %l_i_83 = arith.select %l_i_79, %l_i_82, %9#1 : tensor<128xi1>, tensor<128xf32> loc(#loc353) + %acc_84 = tt.expand_dims %l_i_83 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc354) + %acc_85 = tt.broadcast %acc_84 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc355) + %acc_86 = arith.divf %9#0, %acc_85 : tensor<128x128xf32> loc(#loc355) + %idx_zq = tt.get_program_id y : i32 loc(#loc356) + %idx_hq = tt.get_program_id z : i32 loc(#loc357) + %idx_m = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc358) + %idx_d = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc359) + %idx_d_87 = tt.expand_dims %idx_d {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc360) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc361) + %mask_88 = arith.cmpi slt, %idx_m, %mask : tensor<128x1xi32> loc(#loc361) + %mask_89 = arith.constant 128 : i32 loc(#loc362) + %mask_90 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc362) + %mask_91 = arith.cmpi slt, %idx_d_87, %mask_90 : tensor<1x128xi32> loc(#loc362) + %mask_92 = tt.broadcast %mask_88 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc363) + %mask_93 = tt.broadcast %mask_91 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc363) + %mask_94 = arith.andi %mask_92, %mask_93 : tensor<128x128xi1> loc(#loc363) + %xindex = arith.constant 128 : i32 loc(#loc364) + %xindex_95 = arith.constant 128 : i32 loc(#loc364) + %xindex_96 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc364) + %xindex_97 = arith.muli %xindex_96, %idx_m : tensor<128x1xi32> loc(#loc364) + %xindex_98 = tt.broadcast %idx_d_87 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc365) + %xindex_99 = tt.broadcast %xindex_97 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc365) + %xindex_100 = arith.addi %xindex_98, %xindex_99 : tensor<128x128xi32> loc(#loc365) + %xindex_101 = arith.constant 262144 : i32 loc(#loc366) + %xindex_102 = arith.constant 262144 : i32 loc(#loc366) + %xindex_103 = arith.muli %xindex_102, %idx_hq : i32 loc(#loc366) + %xindex_104 = tt.splat %xindex_103 : i32 -> tensor<128x128xi32> loc(#loc367) + %xindex_105 = arith.addi %xindex_100, %xindex_104 : tensor<128x128xi32> loc(#loc367) + %xindex_106 = arith.constant 8388608 : i32 loc(#loc368) + %xindex_107 = arith.constant 8388608 : i32 loc(#loc368) + %xindex_108 = arith.muli %xindex_107, %idx_zq : i32 loc(#loc368) + %xindex_109 = tt.splat %xindex_108 : i32 -> tensor<128x128xi32> loc(#loc369) + %xindex_110 = arith.addi %xindex_105, %xindex_109 : tensor<128x128xi32> loc(#loc369) + %c128_i32_111 = arith.constant 128 : i32 loc(#loc103) + %c128_i32_112 = arith.constant 128 : i32 loc(#loc103) + %10 = arith.muli %c128_i32_112, %idx_hq : i32 loc(#loc103) + %11 = tt.splat %10 : i32 -> tensor<1x128xi32> loc(#loc104) + %12 = arith.addi %idx_d_87, %11 : tensor<1x128xi32> loc(#loc104) + %c4096_i32_113 = arith.constant 4096 : i32 loc(#loc105) + %c4096_i32_114 = arith.constant 4096 : i32 loc(#loc105) + %cst = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc105) + %13 = arith.muli %cst, %idx_m : tensor<128x1xi32> loc(#loc105) + %14 = tt.broadcast %12 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc106) + %15 = tt.broadcast %13 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc106) + %16 = arith.addi %14, %15 : tensor<128x128xi32> loc(#loc106) + %c8388608_i32_115 = arith.constant 8388608 : i32 loc(#loc107) + %c8388608_i32_116 = arith.constant 8388608 : i32 loc(#loc107) + %17 = arith.muli %c8388608_i32_116, %idx_zq : i32 loc(#loc107) + %18 = tt.splat %17 : i32 -> tensor<128x128xi32> loc(#loc108) + %19 = arith.addi %16, %18 : tensor<128x128xi32> loc(#loc108) + %20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc109) + %21 = tt.addptr %20, %19 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc109) + %22 = arith.truncf %acc_86 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc110) + tt.store %21, %22, %mask_94 : tensor<128x128x!tt.ptr> loc(#loc110) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc370) + %off_hz_117 = arith.addi %off_hz, %off_hq : i32 loc(#loc371) + %l_ptrs = arith.muli %off_hz_117, %Q_LEN : i32 loc(#loc372) + %l_ptrs_118 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc373) + %l_ptrs_119 = tt.splat %l_ptrs_118 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc374) + %l_ptrs_120 = tt.addptr %l_ptrs_119, %offs_m_47 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc374) + %lse = math.log2 %l_i_83 : tensor<128xf32> loc(#loc375) + %lse_121 = arith.addf %9#2, %lse : tensor<128xf32> loc(#loc376) + %cst_122 = arith.constant dense<2048> : tensor<128xi32> loc(#loc118) + %23 = arith.cmpi slt, %offs_m_47, %cst_122 : tensor<128xi32> loc(#loc118) + tt.store %l_ptrs_120, %lse_121, %23 : tensor<128x!tt.ptr> loc(#loc119) + tt.return loc(#loc120) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc122) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc122) + tt.return %cst_0 : tensor<128xf32> loc(#loc123) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128xf32> loc(#loc124) + tt.return %0 : tensor<128xf32> loc(#loc124) + } loc(#loc121) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc122) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc122) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc123) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc124) + tt.return %0 : tensor<128x128xf32> loc(#loc124) + } loc(#loc121) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc125)), %offs_m: tensor<128xi32> loc("offs_m"(#loc125)), %offs_n: tensor<128xi32> loc("offs_n"(#loc125)), %stride_m: i32 loc("stride_m"(#loc125)), %stride_n: i32 loc("stride_n"(#loc125)), %M_LEN: i32 loc("M_LEN"(#loc125))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc383) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc384) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc384) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc385) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc385) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc386) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc387) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc387) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc388) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc388) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc388) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc132) + %1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc133) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc133) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc134) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc134) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc134) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc134) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc134) + tt.return %5 : tensor<128x128xbf16> loc(#loc135) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc136) + tt.return %6 : tensor<128x128xbf16> loc(#loc136) + } loc(#loc125) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc137))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc138) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc138) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc138) + %c1_i32 = arith.constant 1 : i32 loc(#loc139) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc139) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc139) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc140) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc140) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc140) + tt.return %2 : i32 loc(#loc141) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc142) + tt.return %3 : i32 loc(#loc142) + } loc(#loc137) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc143)), %arg_K: !tt.ptr loc("arg_K"(#loc143)), %arg_V: !tt.ptr loc("arg_V"(#loc143)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc143)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc143)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc143)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc143)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc143)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc143)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc143)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc143)), %ks0: i32 loc("ks0"(#loc143)), %ks1: i32 loc("ks1"(#loc143)), %q: tensor<128x128xbf16> loc("q"(#loc143)), %K: !tt.ptr loc("K"(#loc143)), %V: !tt.ptr loc("V"(#loc143)), %Q_LEN: i32 loc("Q_LEN"(#loc143)), %KV_LEN: i32 loc("KV_LEN"(#loc143)), %acc: tensor<128x128xf32> loc("acc"(#loc143)), %l_i: tensor<128xf32> loc("l_i"(#loc143)), %m_i: tensor<128xf32> loc("m_i"(#loc143)), %off_z: i32 loc("off_z"(#loc143)), %off_h: i32 loc("off_h"(#loc143)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc143)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc143)), %kv_start: i32 loc("kv_start"(#loc143)), %kv_indices: !tt.ptr loc("kv_indices"(#loc143)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc143)), %block_n_end: i32 loc("block_n_end"(#loc143)), %stride_kk: i32 loc("stride_kk"(#loc143)), %stride_kn: i32 loc("stride_kn"(#loc143)), %stride_vn: i32 loc("stride_vn"(#loc143)), %stride_vk: i32 loc("stride_vk"(#loc143))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc423) + %c0_i32 = arith.constant 0 : i32 loc(#loc145) + %c1_i32 = arith.constant 1 : i32 loc(#loc145) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc145) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc145) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc145) + %3 = ub.poison : i32 loc(#loc145) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_False__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc146) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc425) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc426) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc426) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc427) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc150) + } loc(#loc561) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc151) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc152) + %5 = ub.poison : tensor<128xf32> loc(#loc152) + %6 = ub.poison : tensor<128xf32> loc(#loc152) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc152) + } loc(#loc143) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_False__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc153)), %arg_K: !tt.ptr loc("arg_K"(#loc153)), %arg_V: !tt.ptr loc("arg_V"(#loc153)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc153)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc153)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc153)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc153)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc153)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc153)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc153)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc153)), %ks0: i32 loc("ks0"(#loc153)), %ks1: i32 loc("ks1"(#loc153)), %q: tensor<128x128xbf16> loc("q"(#loc153)), %K: !tt.ptr loc("K"(#loc153)), %V: !tt.ptr loc("V"(#loc153)), %Q_LEN: i32 loc("Q_LEN"(#loc153)), %KV_LEN: i32 loc("KV_LEN"(#loc153)), %acc: tensor<128x128xf32> loc("acc"(#loc153)), %l_i: tensor<128xf32> loc("l_i"(#loc153)), %m_i: tensor<128xf32> loc("m_i"(#loc153)), %off_z: i32 loc("off_z"(#loc153)), %off_h: i32 loc("off_h"(#loc153)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc153)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc153)), %kv_start: i32 loc("kv_start"(#loc153)), %kv_offset: i32 loc("kv_offset"(#loc153)), %stride_kk: i32 loc("stride_kk"(#loc153)), %stride_kn: i32 loc("stride_kn"(#loc153)), %stride_vn: i32 loc("stride_vn"(#loc153)), %stride_vk: i32 loc("stride_vk"(#loc153))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc459) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc460) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc461) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc462) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc462) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc463) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc464) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc465) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc465) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc465) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc466) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc466) + %m = tt.call @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc467) + %n = tt.call @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc468) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc469) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc469) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc470) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc470) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc470) + %tmp1 = arith.constant false loc(#loc471) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc471) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc472) + %tmp4_16 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc472) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc472) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc473) + %tmp7 = tt.addptr %in_ptr9, %off_z : !tt.ptr, i32 loc(#loc474) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc475) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc476) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc476) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc477) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> loc(#loc478) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc478) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc479) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc479) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc479) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc480) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc481) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc481) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc481) + %tmp14 = arith.constant 2048 : i32 loc(#loc482) + %tmp14_25 = arith.constant dense<2048> : tensor<1xi32> loc(#loc482) + %tmp15 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc483) + %tmp15_26 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc483) + %tmp15_27 = arith.cmpi sge, %n, %tmp15_26 : tensor<1x64xi32> loc(#loc483) + %tmp16 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc484) + %tmp16_28 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc484) + %tmp16_29 = arith.remsi %n, %tmp16_28 : tensor<1x64xi32> loc(#loc484) + %tmp17 = arith.constant 0 : i32 loc(#loc485) + %tmp17_30 = arith.constant dense<0> : tensor<1xi32> loc(#loc485) + %tmp18 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc486) + %tmp18_31 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc486) + %tmp18_32 = arith.cmpi ne, %tmp16_29, %tmp18_31 : tensor<1x64xi32> loc(#loc486) + %tmp19 = arith.constant 0 : i32 loc(#loc487) + %tmp19_33 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc487) + %tmp19_34 = arith.cmpi slt, %tmp16_29, %tmp19_33 : tensor<1x64xi32> loc(#loc487) + %tmp20 = arith.constant 0 : i32 loc(#loc488) + %tmp20_35 = arith.constant dense<0> : tensor<1xi32> loc(#loc488) + %tmp20_36 = arith.cmpi slt, %tmp14_25, %tmp20_35 : tensor<1xi32> loc(#loc488) + %tmp21 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc489) + %tmp21_37 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc489) + %tmp21_38 = arith.cmpi ne, %tmp19_34, %tmp21_37 : tensor<1x64xi1> loc(#loc489) + %tmp22 = arith.andi %tmp18_32, %tmp21_38 : tensor<1x64xi1> loc(#loc490) + %tmp23 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc491) + %tmp23_39 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc491) + %tmp23_40 = arith.addi %tmp16_29, %tmp23_39 : tensor<1x64xi32> loc(#loc491) + %tmp24 = arith.select %tmp22, %tmp23_40, %tmp16_29 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc492) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc493) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc494) + %tmp26_41 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc494) + %tmp27 = arith.andi %tmp15_27, %tmp26_41 : tensor<1x64xi1> loc(#loc495) + %tmp28 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc496) + %tmp28_42 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc496) + %tmp28_43 = arith.subi %tmp28, %tmp28_42 : tensor<128x64xi32> loc(#loc496) + %tmp29 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc497) + %tmp29_44 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc497) + %tmp29_45 = arith.remsi %tmp28_43, %tmp29_44 : tensor<128x64xi32> loc(#loc497) + %tmp30 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc498) + %tmp30_46 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc498) + %tmp30_47 = arith.cmpi ne, %tmp29_45, %tmp30_46 : tensor<128x64xi32> loc(#loc498) + %tmp31 = arith.constant 0 : i32 loc(#loc499) + %tmp31_48 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc499) + %tmp31_49 = arith.cmpi slt, %tmp29_45, %tmp31_48 : tensor<128x64xi32> loc(#loc499) + %tmp32 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc500) + %tmp32_50 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc500) + %tmp32_51 = arith.cmpi ne, %tmp31_49, %tmp32_50 : tensor<128x64xi1> loc(#loc500) + %tmp33 = arith.andi %tmp30_47, %tmp32_51 : tensor<128x64xi1> loc(#loc501) + %tmp34 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc502) + %tmp34_52 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc502) + %tmp34_53 = arith.addi %tmp29_45, %tmp34_52 : tensor<128x64xi32> loc(#loc502) + %tmp35 = arith.select %tmp33, %tmp34_53, %tmp29_45 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc503) + %tmp36 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc504) + %tmp36_54 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc504) + %tmp36_55 = arith.cmpi eq, %tmp35, %tmp36_54 : tensor<128x64xi32> loc(#loc504) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc505) + %tmp37_56 = arith.andi %tmp37, %tmp36_55 : tensor<128x64xi1> loc(#loc505) + %tmp38 = arith.ori %tmp13_24, %tmp37_56 : tensor<128x64xi1> loc(#loc506) + %mask_mod_output = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc507) + %mask_mod_output_57 = arith.cmpi slt, %offs_n, %mask_mod_output : tensor<1x64xi32> loc(#loc507) + %mask_mod_output_58 = arith.constant false loc(#loc508) + %mask_mod_output_59 = arith.constant false loc(#loc508) + %mask_mod_output_60 = arith.constant dense : tensor<128x64xi1> loc(#loc508) + %mask_mod_output_61 = tt.broadcast %mask_mod_output_57 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc508) + %mask_mod_output_62 = arith.select %mask_mod_output_61, %tmp38, %mask_mod_output_60 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc508) + %post_mod_scores_63 = arith.constant 0xFF800000 : f32 loc(#loc509) + %post_mod_scores_64 = arith.constant 0xFF800000 : f32 loc(#loc509) + %post_mod_scores_65 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc509) + %post_mod_scores_66 = arith.select %mask_mod_output_62, %post_mod_scores_14, %post_mod_scores_65 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc509) + %post_mod_scores_67 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_68 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_69 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc510) + %post_mod_scores_70 = arith.mulf %post_mod_scores_66, %post_mod_scores_69 : tensor<128x64xf32> loc(#loc510) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_70) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc511) + %m_ij_71 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc512) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc513) + %masked_out_rows_72 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc513) + %masked_out_rows_73 = arith.cmpf oeq, %m_ij_71, %masked_out_rows_72 : tensor<128xf32> loc(#loc513) + %m_ij_masked = arith.constant 0 : i32 loc(#loc514) + %m_ij_masked_74 = arith.constant 0.000000e+00 : f32 loc(#loc514) + %m_ij_masked_75 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc514) + %m_ij_masked_76 = arith.select %masked_out_rows_73, %m_ij_masked_75, %m_ij_71 : tensor<128xi1>, tensor<128xf32> loc(#loc514) + %alpha = arith.subf %m_i, %m_ij_masked_76 : tensor<128xf32> loc(#loc515) + %alpha_77 = math.exp2 %alpha : tensor<128xf32> loc(#loc516) + %p = tt.expand_dims %m_ij_masked_76 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc517) + %p_78 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc518) + %p_79 = arith.subf %post_mod_scores_70, %p_78 : tensor<128x64xf32> loc(#loc518) + %p_80 = math.exp2 %p_79 : tensor<128x64xf32> loc(#loc519) + %l_i_81 = arith.mulf %l_i, %alpha_77 : tensor<128xf32> loc(#loc520) + %l_i_82 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_80) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc521) + %l_i_83 = arith.addf %l_i_81, %l_i_82 : tensor<128xf32> loc(#loc522) + %acc_84 = tt.expand_dims %alpha_77 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc523) + %acc_85 = tt.broadcast %acc_84 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc524) + %acc_86 = arith.mulf %acc, %acc_85 : tensor<128x128xf32> loc(#loc524) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc525) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc526) + %acc_87 = arith.truncf %p_80 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc527) + %acc_88 = arith.constant 0.000000e+00 : f32 loc(#loc528) + %acc_89 = tt.dot %acc_87, %v, %acc_86, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc528) + tt.return %acc_89, %l_i_83, %m_ij_71 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc224) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc225) + %1 = ub.poison : tensor<128xf32> loc(#loc225) + %2 = ub.poison : tensor<128xf32> loc(#loc225) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + } loc(#loc153) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc125)), %offs_m: tensor<64xi32> loc("offs_m"(#loc125)), %offs_n: tensor<128xi32> loc("offs_n"(#loc125)), %stride_m: i32 loc("stride_m"(#loc125)), %stride_n: i32 loc("stride_n"(#loc125)), %M_LEN: i32 loc("M_LEN"(#loc125))) -> tensor<64x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc383) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<64x1xi32> loc(#loc384) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<64x1xi32> loc(#loc384) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc385) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc385) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc386) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc387) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc387) + %ptr_8 = tt.broadcast %ptr_4 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc388) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc388) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc388) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc132) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc133) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc133) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc134) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc134) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc134) + %4 = arith.truncf %cst_11 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc134) + %5 = tt.load %ptr_10, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc134) + tt.return %5 : tensor<64x128xbf16> loc(#loc135) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc136) + tt.return %6 : tensor<64x128xbf16> loc(#loc136) + } loc(#loc125) + tt.func private @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc226)), %max_len: i32 loc("max_len"(#loc226))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc227) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc227) + tt.return %1 : tensor<128x1xi32> loc(#loc228) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc229) + tt.return %2 : tensor<128x1xi32> loc(#loc229) + } loc(#loc226) + tt.func private @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc226)), %max_len: i32 loc("max_len"(#loc226))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc227) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc227) + tt.return %1 : tensor<1x64xi32> loc(#loc228) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc229) + tt.return %2 : tensor<1x64xi32> loc(#loc229) + } loc(#loc226) + tt.func private @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<128x64xf32> loc("input"(#loc230))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc231) + tt.reduce.return %2 : f32 loc(#loc231) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc231) + tt.return %0 : tensor<128xf32> loc(#loc233) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc234) + tt.return %1 : tensor<128xf32> loc(#loc234) + } loc(#loc230) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc235)), %b: f32 loc("b"(#loc235))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc236) + tt.return %0 : f32 loc(#loc237) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc238) + tt.return %1 : f32 loc(#loc238) + } loc(#loc235) + tt.func private @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x64xf32> loc("input"(#loc239))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc240) + tt.reduce.return %2 : f32 loc(#loc240) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc240) + tt.return %0 : tensor<128xf32> loc(#loc241) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc242) + tt.return %1 : tensor<128xf32> loc(#loc242) + } loc(#loc239) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc243)), %b: f32 loc("b"(#loc243))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc244) + tt.return %0 : f32 loc(#loc245) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc246) + tt.return %1 : f32 loc(#loc246) + } loc(#loc243) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc247)), %col_indices: !tt.ptr loc("col_indices"(#loc247)), %total_blocks: i32 loc("total_blocks"(#loc247))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc540) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc540) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc540) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc541) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc542) + %next_block = arith.constant 1 : i32 loc(#loc543) + %next_block_3 = arith.constant 1 : i32 loc(#loc543) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc543) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc544) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc545) + %next_block_7 = arith.constant 1 : i32 loc(#loc546) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc546) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc547) + %needs_jump = arith.constant 1 : i32 loc(#loc548) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc548) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc548) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc549) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc549) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc549) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc550) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc550) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc551) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc552) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc552) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc552) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc553) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc553) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc553) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc554) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc554) + %offset_24 = arith.constant 1 : i32 loc(#loc555) + %offset_25 = arith.constant 1 : i32 loc(#loc555) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc555) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc555) + %offset_28 = arith.constant 64 : i32 loc(#loc556) + %offset_29 = arith.constant 64 : i32 loc(#loc556) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc556) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc557) + tt.return %offset_31 : i32 loc(#loc266) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc267) + tt.return %0 : i32 loc(#loc267) + } loc(#loc247) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc143)), %arg_K: !tt.ptr loc("arg_K"(#loc143)), %arg_V: !tt.ptr loc("arg_V"(#loc143)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc143)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc143)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc143)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc143)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc143)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc143)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc143)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc143)), %ks0: i32 loc("ks0"(#loc143)), %ks1: i32 loc("ks1"(#loc143)), %q: tensor<128x128xbf16> loc("q"(#loc143)), %K: !tt.ptr loc("K"(#loc143)), %V: !tt.ptr loc("V"(#loc143)), %Q_LEN: i32 loc("Q_LEN"(#loc143)), %KV_LEN: i32 loc("KV_LEN"(#loc143)), %acc: tensor<128x128xf32> loc("acc"(#loc143)), %l_i: tensor<128xf32> loc("l_i"(#loc143)), %m_i: tensor<128xf32> loc("m_i"(#loc143)), %off_z: i32 loc("off_z"(#loc143)), %off_h: i32 loc("off_h"(#loc143)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc143)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc143)), %kv_start: i32 loc("kv_start"(#loc143)), %kv_indices: !tt.ptr loc("kv_indices"(#loc143)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc143)), %block_n_end: i32 loc("block_n_end"(#loc143)), %stride_kk: i32 loc("stride_kk"(#loc143)), %stride_kn: i32 loc("stride_kn"(#loc143)), %stride_vn: i32 loc("stride_vn"(#loc143)), %stride_vk: i32 loc("stride_vk"(#loc143))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc423) + %c0_i32 = arith.constant 0 : i32 loc(#loc145) + %c1_i32 = arith.constant 1 : i32 loc(#loc145) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc145) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc145) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc145) + %3 = ub.poison : i32 loc(#loc145) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_True__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc146) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc425) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc426) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc426) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc427) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc150) + } loc(#loc561) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc151) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc152) + %5 = ub.poison : tensor<128xf32> loc(#loc152) + %6 = ub.poison : tensor<128xf32> loc(#loc152) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc152) + } loc(#loc143) + tt.func private @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_True__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc153)), %arg_K: !tt.ptr loc("arg_K"(#loc153)), %arg_V: !tt.ptr loc("arg_V"(#loc153)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc153)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc153)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc153)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc153)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc153)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc153)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc153)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc153)), %ks0: i32 loc("ks0"(#loc153)), %ks1: i32 loc("ks1"(#loc153)), %q: tensor<128x128xbf16> loc("q"(#loc153)), %K: !tt.ptr loc("K"(#loc153)), %V: !tt.ptr loc("V"(#loc153)), %Q_LEN: i32 loc("Q_LEN"(#loc153)), %KV_LEN: i32 loc("KV_LEN"(#loc153)), %acc: tensor<128x128xf32> loc("acc"(#loc153)), %l_i: tensor<128xf32> loc("l_i"(#loc153)), %m_i: tensor<128xf32> loc("m_i"(#loc153)), %off_z: i32 loc("off_z"(#loc153)), %off_h: i32 loc("off_h"(#loc153)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc153)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc153)), %kv_start: i32 loc("kv_start"(#loc153)), %kv_offset: i32 loc("kv_offset"(#loc153)), %stride_kk: i32 loc("stride_kk"(#loc153)), %stride_kn: i32 loc("stride_kn"(#loc153)), %stride_vn: i32 loc("stride_vn"(#loc153)), %stride_vk: i32 loc("stride_vk"(#loc153))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc459) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc460) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc461) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc462) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc462) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc463) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc464) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc465) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc465) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc465) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc466) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc466) + %m = tt.call @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc467) + %n = tt.call @torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc468) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc469) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc469) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc470) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc470) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc470) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc510) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc510) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_18) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc511) + %m_ij_19 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc512) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc513) + %masked_out_rows_20 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc513) + %masked_out_rows_21 = arith.cmpf oeq, %m_ij_19, %masked_out_rows_20 : tensor<128xf32> loc(#loc513) + %m_ij_masked = arith.constant 0 : i32 loc(#loc514) + %m_ij_masked_22 = arith.constant 0.000000e+00 : f32 loc(#loc514) + %m_ij_masked_23 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc514) + %m_ij_masked_24 = arith.select %masked_out_rows_21, %m_ij_masked_23, %m_ij_19 : tensor<128xi1>, tensor<128xf32> loc(#loc514) + %alpha = arith.subf %m_i, %m_ij_masked_24 : tensor<128xf32> loc(#loc515) + %alpha_25 = math.exp2 %alpha : tensor<128xf32> loc(#loc516) + %p = tt.expand_dims %m_ij_masked_24 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc517) + %p_26 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc518) + %p_27 = arith.subf %post_mod_scores_18, %p_26 : tensor<128x64xf32> loc(#loc518) + %p_28 = math.exp2 %p_27 : tensor<128x64xf32> loc(#loc519) + %l_i_29 = arith.mulf %l_i, %alpha_25 : tensor<128xf32> loc(#loc520) + %l_i_30 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_28) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc521) + %l_i_31 = arith.addf %l_i_29, %l_i_30 : tensor<128xf32> loc(#loc522) + %acc_32 = tt.expand_dims %alpha_25 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc523) + %acc_33 = tt.broadcast %acc_32 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc524) + %acc_34 = arith.mulf %acc, %acc_33 : tensor<128x128xf32> loc(#loc524) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc525) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc526) + %acc_35 = arith.truncf %p_28 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc527) + %acc_36 = arith.constant 0.000000e+00 : f32 loc(#loc528) + %acc_37 = tt.dot %acc_35, %v, %acc_34, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc528) + tt.return %acc_37, %l_i_31, %m_ij_19 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc224) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc225) + %1 = ub.poison : tensor<128xf32> loc(#loc225) + %2 = ub.poison : tensor<128xf32> loc(#loc225) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + } loc(#loc153) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":85:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":87:54) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":87:63) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":87:49) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":89:9) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":90:9) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":91:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":92:10) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":97:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":98:27) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":99:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":103:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":104:24) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":105:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:36) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":109:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":109:47) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":109:37) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":111:12) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":112:12) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":113:12) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":120:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":121:16) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":123:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":124:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":129:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":130:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":134:19) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":134:50) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":135:19) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":136:19) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":138:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":138:46) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":138:33) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":141:38) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":141:50) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:85) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:74) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:46) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:76) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:97) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:64) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:23) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:46) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:33) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":145:26) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":146:101) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":151:26) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:37) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:42) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:28) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:45) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:92) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:102) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:65) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":167:31) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":167:48) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":172:41) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":181:35) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:27) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:41) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:51) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:96) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:106) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:69) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":186:41) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":186:28) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":193:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":193:52) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":198:45) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:26) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:34) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:16) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":209:27) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":210:27) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":211:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":212:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":212:45) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:30) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:21) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:33) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:57) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":217:49) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:53) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:49) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:67) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:62) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:83) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:75) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:109) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:26) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:31) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:32) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:23) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:40) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:33) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:20) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:48) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":229:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:27) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:38) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:20) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:56) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:67) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:49) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:41) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:52) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:23) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:15) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":287:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":499:16) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":502:40) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":538:16) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":545:63) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":548:26) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":549:21) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":549:8) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":552:11) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":552:4) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":342:32) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":345:26) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":346:48) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":346:35) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":347:107) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":349:17) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":351:19) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":353:14) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":358:36) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":359:36) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:44) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:69) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":370:35) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":373:23) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":374:23) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:33) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:23) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":377:22) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":378:23) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":379:23) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":380:23) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":381:23) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":382:23) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":383:35) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":384:24) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":385:24) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":386:32) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":387:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":388:92) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":389:92) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":390:25) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":391:24) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":392:24) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":393:39) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":394:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":395:24) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":396:24) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":397:23) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":398:25) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":399:25) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":400:92) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":401:25) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":402:24) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":403:24) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":404:39) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":405:25) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":406:24) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":407:24) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":412:48) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":412:73) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":414:69) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":417:27) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:51) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:27) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":423:35) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":424:51) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:31) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:25) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:51) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:39) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:21) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:16) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:34) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:24) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:22) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:16) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":438:26) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":439:107) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:22) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:44) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":445:11) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":445:4) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":257:21) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":257:11) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":257:4) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc237 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc238 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc240 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc241 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc242 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc244 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc245 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc246 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":247:33) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:38) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:24) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:109) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:113) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:39) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:55) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:25) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:30) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:35) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:60) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:34) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:48) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:63) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:29) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:47) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:61) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:42) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":253:11) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":253:4) +#loc281 = loc("ZQ"(#loc8)) +#loc282 = loc("HQ"(#loc9)) +#loc283 = loc("Q_LEN"(#loc10)) +#loc284 = loc("ZKV"(#loc11)) +#loc285 = loc("q_start"(#loc12)) +#loc286 = loc("off_zq"(#loc13)) +#loc287 = loc("off_hq"(#loc14)) +#loc288 = loc("off_zkv"(#loc15)) +#loc289 = loc("off_hkv"(#loc16)) +#loc290 = loc("off_g"(#loc17)) +#loc291 = loc("q_offset"(#loc18)) +#loc292 = loc("q_offset"(#loc19)) +#loc293 = loc("q_offset"(#loc20)) +#loc294 = loc("k_offset"(#loc21)) +#loc295 = loc("k_offset"(#loc22)) +#loc296 = loc("k_offset"(#loc23)) +#loc297 = loc("v_offset"(#loc24)) +#loc298 = loc("v_offset"(#loc25)) +#loc299 = loc("v_offset"(#loc26)) +#loc300 = loc("Q"(#loc27)) +#loc301 = loc("K"(#loc28)) +#loc302 = loc("V"(#loc29)) +#loc303 = loc("SPARSE_Z"(#loc30)) +#loc304 = loc("SPARSE_HQ"(#loc31)) +#loc305 = loc("sparse_idx_z"(#loc32)) +#loc306 = loc("sparse_idx_hq"(#loc33)) +#loc307 = loc("stride_kv_num_blks_h"(#loc34)) +#loc308 = loc("stride_kv_idx_h"(#loc35)) +#loc309 = loc("m_i"(#loc36)) +#loc310 = loc("m_i"(#loc37)) +#loc311 = loc("l_i"(#loc38)) +#loc312 = loc("acc"(#loc39)) +#loc313 = loc("offs_m"(#loc40)) +#loc314 = loc("offs_m"(#loc41)) +#loc315 = loc("offs_m"(#loc42)) +#loc316 = loc("sparse_hz_offset"(#loc43)) +#loc317 = loc("sparse_hz_offset"(#loc44)) +#loc318 = loc("sparse_kv_num_blks_offset"(#loc45)) +#loc319 = loc("sparse_kv_num_blks_offset"(#loc46)) +#loc320 = loc("sparse_kv_num_blks_offset"(#loc47)) +#loc321 = loc("sparse_kv_idx_offset"(#loc48)) +#loc322 = loc("sparse_kv_idx_offset"(#loc49)) +#loc323 = loc("sparse_kv_idx_offset"(#loc50)) +#loc324 = loc("sparse_kv_idx_offset"(#loc51)) +#loc325 = loc("offs_m"(#loc52)) +#loc326 = loc("offs_m"(#loc53)) +#loc327 = loc("offs_m"(#loc54)) +#loc328 = loc("offs_k"(#loc55)) +#loc329 = loc("q"(#loc56)) +#loc330 = loc("kv_indices"(#loc57)) +#loc331 = loc("kv_start"(#loc58)) +#loc332 = loc("kv_start"(#loc59)) +#loc333 = loc("kv_num_blocks"(#loc60)) +#loc334 = loc("kv_num_blocks"(#loc61)) +#loc335 = loc("block_n_end"(#loc62)) +#loc336 = loc("block_n_end"(#loc63)) +#loc337 = loc("block_n_end"(#loc64)) +#loc338 = loc("block_n_end"(#loc65)) +#loc339 = loc("offs_n"(#loc66)) +#loc340 = loc("offs_n"(#loc67)) +#loc341 = loc("kv_indices"(#loc71)) +#loc342 = loc("kv_start"(#loc72)) +#loc343 = loc("kv_start"(#loc73)) +#loc344 = loc("kv_num_blocks"(#loc74)) +#loc345 = loc("kv_num_blocks"(#loc75)) +#loc346 = loc("block_n_end"(#loc76)) +#loc347 = loc("block_n_end"(#loc77)) +#loc348 = loc("block_n_end"(#loc78)) +#loc349 = loc("block_n_end"(#loc79)) +#loc350 = loc("offs_n"(#loc80)) +#loc351 = loc("offs_n"(#loc81)) +#loc352 = loc("l_i"(#loc85)) +#loc353 = loc("l_i"(#loc86)) +#loc354 = loc("acc"(#loc87)) +#loc355 = loc("acc"(#loc88)) +#loc356 = loc("idx_zq"(#loc89)) +#loc357 = loc("idx_hq"(#loc90)) +#loc358 = loc("idx_m"(#loc91)) +#loc359 = loc("idx_d"(#loc92)) +#loc360 = loc("idx_d"(#loc93)) +#loc361 = loc("mask"(#loc94)) +#loc362 = loc("mask"(#loc95)) +#loc363 = loc("mask"(#loc96)) +#loc364 = loc("xindex"(#loc97)) +#loc365 = loc("xindex"(#loc98)) +#loc366 = loc("xindex"(#loc99)) +#loc367 = loc("xindex"(#loc100)) +#loc368 = loc("xindex"(#loc101)) +#loc369 = loc("xindex"(#loc102)) +#loc370 = loc("off_hz"(#loc111)) +#loc371 = loc("off_hz"(#loc112)) +#loc372 = loc("l_ptrs"(#loc113)) +#loc373 = loc("l_ptrs"(#loc114)) +#loc374 = loc("l_ptrs"(#loc115)) +#loc375 = loc("lse"(#loc116)) +#loc376 = loc("lse"(#loc117)) +#loc383 = loc("ptr"(#loc126)) +#loc384 = loc("ptr"(#loc127)) +#loc385 = loc("ptr"(#loc128)) +#loc386 = loc("ptr"(#loc129)) +#loc387 = loc("ptr"(#loc130)) +#loc388 = loc("ptr"(#loc131)) +#loc423 = loc("kv_offset"(#loc144)) +#loc424 = loc("acc"(#loc145)) +#loc425 = loc("offset"(#loc147)) +#loc426 = loc("offs_n"(#loc148)) +#loc427 = loc("kv_offset"(#loc149)) +#loc459 = loc("kv_base_offset"(#loc154)) +#loc460 = loc("offs_k"(#loc155)) +#loc461 = loc("offs_n_load"(#loc156)) +#loc462 = loc("offs_n_load"(#loc157)) +#loc463 = loc("k"(#loc158)) +#loc464 = loc("k"(#loc159)) +#loc465 = loc("qk"(#loc160)) +#loc466 = loc("qk"(#loc161)) +#loc467 = loc("m"(#loc162)) +#loc468 = loc("n"(#loc163)) +#loc469 = loc("post_mod_scores"(#loc164)) +#loc470 = loc("post_mod_scores"(#loc165)) +#loc471 = loc("tmp1"(#loc166)) +#loc472 = loc("tmp4"(#loc167)) +#loc473 = loc("tmp5"(#loc168)) +#loc474 = loc("tmp7"(#loc169)) +#loc475 = loc("tmp7"(#loc170)) +#loc476 = loc("tmp8"(#loc171)) +#loc477 = loc("tmp9"(#loc172)) +#loc478 = loc("tmp10"(#loc173)) +#loc479 = loc("tmp11"(#loc174)) +#loc480 = loc("tmp12"(#loc175)) +#loc481 = loc("tmp13"(#loc176)) +#loc482 = loc("tmp14"(#loc177)) +#loc483 = loc("tmp15"(#loc178)) +#loc484 = loc("tmp16"(#loc179)) +#loc485 = loc("tmp17"(#loc180)) +#loc486 = loc("tmp18"(#loc181)) +#loc487 = loc("tmp19"(#loc182)) +#loc488 = loc("tmp20"(#loc183)) +#loc489 = loc("tmp21"(#loc184)) +#loc490 = loc("tmp22"(#loc185)) +#loc491 = loc("tmp23"(#loc186)) +#loc492 = loc("tmp24"(#loc187)) +#loc493 = loc("tmp25"(#loc188)) +#loc494 = loc("tmp26"(#loc189)) +#loc495 = loc("tmp27"(#loc190)) +#loc496 = loc("tmp28"(#loc191)) +#loc497 = loc("tmp29"(#loc192)) +#loc498 = loc("tmp30"(#loc193)) +#loc499 = loc("tmp31"(#loc194)) +#loc500 = loc("tmp32"(#loc195)) +#loc501 = loc("tmp33"(#loc196)) +#loc502 = loc("tmp34"(#loc197)) +#loc503 = loc("tmp35"(#loc198)) +#loc504 = loc("tmp36"(#loc199)) +#loc505 = loc("tmp37"(#loc200)) +#loc506 = loc("tmp38"(#loc201)) +#loc507 = loc("mask_mod_output"(#loc202)) +#loc508 = loc("mask_mod_output"(#loc203)) +#loc509 = loc("post_mod_scores"(#loc204)) +#loc510 = loc("post_mod_scores"(#loc205)) +#loc511 = loc("m_ij"(#loc206)) +#loc512 = loc("m_ij"(#loc207)) +#loc513 = loc("masked_out_rows"(#loc208)) +#loc514 = loc("m_ij_masked"(#loc209)) +#loc515 = loc("alpha"(#loc210)) +#loc516 = loc("alpha"(#loc211)) +#loc517 = loc("p"(#loc212)) +#loc518 = loc("p"(#loc213)) +#loc519 = loc("p"(#loc214)) +#loc520 = loc("l_i"(#loc215)) +#loc521 = loc("l_i"(#loc216)) +#loc522 = loc("l_i"(#loc217)) +#loc523 = loc("acc"(#loc218)) +#loc524 = loc("acc"(#loc219)) +#loc525 = loc("offs_v"(#loc220)) +#loc526 = loc("v"(#loc221)) +#loc527 = loc("acc"(#loc222)) +#loc528 = loc("acc"(#loc223)) +#loc540 = loc("cur_block_idx"(#loc248)) +#loc541 = loc("cur_block"(#loc249)) +#loc542 = loc("cur_block"(#loc250)) +#loc543 = loc("next_block"(#loc251)) +#loc544 = loc("next_block"(#loc252)) +#loc545 = loc("next_block"(#loc253)) +#loc546 = loc("next_block"(#loc254)) +#loc547 = loc("next_block"(#loc255)) +#loc548 = loc("needs_jump"(#loc256)) +#loc549 = loc("needs_jump"(#loc257)) +#loc550 = loc("needs_jump"(#loc258)) +#loc551 = loc("jump_to_block"(#loc259)) +#loc552 = loc("jump_to_block"(#loc260)) +#loc553 = loc("jump_to_block"(#loc261)) +#loc554 = loc("offset"(#loc262)) +#loc555 = loc("offset"(#loc263)) +#loc556 = loc("offset"(#loc264)) +#loc557 = loc("offset"(#loc265)) +#loc558 = loc("l_i"(#loc424)) +#loc559 = loc("m_i"(#loc558)) +#loc560 = loc("offs_n"(#loc559)) +#loc561 = loc("kv_offset"(#loc560)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..dae64d3399def85c8e9bd5373a5c589fcf5c6e5a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttgir @@ -0,0 +1,982 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":18:0) +#loc1 = loc(unknown) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":538:16) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":172:41) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:51) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:34) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":198:45) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#smem = #ttg.shared_memory +#loc167 = loc("arg_Q"(#loc)) +#loc168 = loc("arg_K"(#loc)) +#loc169 = loc("arg_V"(#loc)) +#loc170 = loc("arg_LSE"(#loc)) +#loc171 = loc("arg_MAX"(#loc)) +#loc172 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc173 = loc("arg_KV_IDX"(#loc)) +#loc174 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc175 = loc("arg_FULL_KV_IDX"(#loc)) +#loc176 = loc("in_ptr9"(#loc)) +#loc177 = loc("out_ptr0"(#loc)) +#loc178 = loc("ks0"(#loc)) +#loc179 = loc("ks1"(#loc)) +#loc221 = loc(callsite(#loc50 at #loc51)) +#loc265 = loc("m_ij"(#loc97)) +#loc275 = loc("l_i"(#loc109)) +#loc309 = loc(callsite(#loc50 at #loc145)) +#loc376 = loc(callsite(#loc265 at #loc221)) +#loc386 = loc(callsite(#loc275 at #loc221)) +#loc405 = loc(callsite(#loc265 at #loc309)) +#loc415 = loc(callsite(#loc275 at #loc309)) +#loc437 = loc(callsite(#loc1 at #loc376)) +#loc439 = loc(callsite(#loc1 at #loc386)) +#loc467 = loc(callsite(#loc1 at #loc405)) +#loc469 = loc(callsite(#loc1 at #loc415)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_2 = arith.constant dense : tensor<128x64xi1, #mma> loc(#loc1) + %cst_3 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_4 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<2048> : tensor<128xi32, #blocked1> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_14 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_16 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_17 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_18 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc2) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc3) + %q_start = tt.get_program_id x : i32 loc(#loc180) + %off_zq = tt.get_program_id y : i32 loc(#loc181) + %off_hq = tt.get_program_id z : i32 loc(#loc182) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc183) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc184) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc185) + %q_offset_20 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc186) + %q_offset_21 = arith.addi %q_offset, %q_offset_20 : i32 loc(#loc187) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc188) + %k_offset_22 = arith.muli %off_hkv, %1 : i32 loc(#loc189) + %k_offset_23 = arith.addi %k_offset, %k_offset_22 : i32 loc(#loc190) + %Q = tt.addptr %arg_Q, %q_offset_21 : !tt.ptr, i32 loc(#loc191) + %K = tt.addptr %arg_K, %k_offset_23 : !tt.ptr, i32 loc(#loc192) + %V = tt.addptr %arg_V, %k_offset_23 : !tt.ptr, i32 loc(#loc193) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc194) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc195) + %sparse_kv_num_blks_offset_24 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc196) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc197) + %sparse_kv_idx_offset_25 = arith.muli %q_start, %ks1 : i32 loc(#loc198) + %sparse_kv_idx_offset_26 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_25 : i32 loc(#loc199) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc200) + %offs_m_27 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc201) + %offs_m_28 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc201) + %offs_m_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc201) + %offs_m_30 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc202) + %offs_m_31 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc202) + %offs_m_32 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc202) + %offs_m_33 = arith.addi %offs_m_30, %offs_m_27 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc202) + %offs_m_34 = arith.addi %offs_m_31, %offs_m_28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc202) + %offs_m_35 = arith.addi %offs_m_32, %offs_m_29 : tensor<128xi32, #blocked1> loc(#loc202) + %ptr = tt.expand_dims %offs_m_33 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc323) + %ptr_36 = tt.expand_dims %offs_m_34 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc323) + %ptr_37 = arith.muli %ptr, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc324) + %ptr_38 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc325) + %ptr_39 = tt.addptr %ptr_38, %ptr_37 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc325) + %ptr_40 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc326) + %ptr_41 = tt.expand_dims %ptr_40 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc326) + %ptr_42 = tt.broadcast %ptr_39 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc327) + %ptr_43 = tt.broadcast %ptr_41 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc327) + %ptr_44 = tt.addptr %ptr_42, %ptr_43 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc327) + %q = arith.cmpi slt, %ptr, %cst_9 : tensor<128x1xi32, #blocked> loc(#loc328) + %q_45 = tt.broadcast %q : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc329) + %q_46 = tt.load %ptr_44, %q_45, %cst_10 : tensor<128x128x!tt.ptr, #blocked> loc(#loc329) + %q_47 = ttg.local_alloc %q_46 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc329) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_26 : !tt.ptr, i32 loc(#loc209) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc210) + %kv_start_48 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc211) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_24 : !tt.ptr, i32 loc(#loc212) + %kv_num_blocks_49 = tt.load %kv_num_blocks : !tt.ptr loc(#loc213) + %block_n_end = arith.muli %kv_num_blocks_49, %c2_i32 : i32 loc(#loc214) + %block_n_end_50 = arith.addi %ks0, %c63_i32 : i32 loc(#loc330) + %block_n_end_51 = arith.divsi %block_n_end_50, %c64_i32 : i32 loc(#loc331) + %block_n_end_52 = arith.maxsi %block_n_end_51, %c1_i32 : i32 loc(#loc216) + %block_n_end_53 = arith.minsi %block_n_end, %block_n_end_52 : i32 loc(#loc217) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc218) + %offs_n_54 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc218) + %offs_n_55 = tt.splat %kv_start_48 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc219) + %offs_n_56 = arith.addi %offs_n_55, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc219) + %2 = tt.expand_dims %offs_n_56 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc48) + %ptr_57 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc424) + %ptr_58 = tt.broadcast %ptr_41 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc425) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32, #blocked> loc(#loc426) + %m = arith.remsi %ptr_36, %cst : tensor<128x1xi32, #mma> loc(#loc427) + %n = tt.splat %ks0 : i32 -> tensor<1x64xi32, #mma> loc(#loc428) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc335) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc336) + %kv_offset = arith.cmpi sgt, %block_n_end_53, %c0_i32 : i32 loc(#loc493) + %tmp7_59 = tt.load %tmp7, %kv_offset : !tt.ptr loc(#loc338) + %tmp8 = tt.splat %tmp7_59 : i64 -> tensor<1x64xi64, #mma> loc(#loc339) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc340) + %tmp10 = tt.splat %tmp7_59 : i64 -> tensor<128x1xi64, #mma> loc(#loc341) + %tmp10_60 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc341) + %tmp11 = tt.broadcast %tmp10_60 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc342) + %ptr_61 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc430) + %k_62 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc431) + %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc432) + %offs_n_load = tt.splat %kv_start_48 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_65 = arith.muli %ptr_64, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_66 = tt.addptr %ptr_57, %ptr_65 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_67 = tt.broadcast %ptr_66 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_68 = tt.addptr %ptr_67, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_69 = arith.cmpi slt, %ptr_64, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_70 = tt.broadcast %k_69 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_71 = ttg.memdesc_index %k_62[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_72 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_73 = arith.andi %kv_offset_72, %k_70 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_74 = ttg.async_copy_global_to_local %ptr_68, %k_71 mask %kv_offset_73 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_75 = ttg.async_commit_group tokens %k_74 loc(#loc431) + %ptr_76 = tt.addptr %ptr_61, %ptr_65 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_77 = tt.broadcast %ptr_76 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_78 = tt.addptr %ptr_77, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_79 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_80 = ttg.async_copy_global_to_local %ptr_78, %v_79 mask %kv_offset_73 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_81 = ttg.async_commit_group tokens %v_80 loc(#loc432) + %kv_offset_82 = arith.cmpi sgt, %block_n_end_53, %c1_i32 : i32 loc(#loc493) + %kv_base_offset = arith.addi %kv_start_48, %c64_i32 : i32 loc(#loc345) + %offs_n_load_83 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_84 = arith.addi %offs_n_load_83, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_85 = tt.expand_dims %offs_n_load_84 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_86 = arith.muli %ptr_85, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_87 = tt.addptr %ptr_57, %ptr_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_88 = tt.broadcast %ptr_87 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_89 = tt.addptr %ptr_88, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_90 = arith.cmpi slt, %ptr_85, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_91 = tt.broadcast %k_90 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_92 = ttg.memdesc_index %k_62[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_93 = tt.splat %kv_offset_82 : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_94 = arith.andi %kv_offset_93, %k_91 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_95 = ttg.async_copy_global_to_local %ptr_89, %k_92 mask %kv_offset_94 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_96 = ttg.async_commit_group tokens %k_95 loc(#loc431) + %ptr_97 = tt.addptr %ptr_61, %ptr_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_98 = tt.broadcast %ptr_97 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_99 = tt.addptr %ptr_98, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_100 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_101 = ttg.async_copy_global_to_local %ptr_99, %v_100 mask %kv_offset_94 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_102 = ttg.async_commit_group tokens %v_101 loc(#loc432) + ttng.fence_async_shared {bCluster = false} loc(#loc346) + %kv_offset_103:12 = scf.for %kv_offset_174 = %c0_i32 to %block_n_end_53 step %c1_i32 iter_args(%acc_175 = %cst_15, %arg15 = %cst_13, %arg16 = %cst_19, %arg17 = %c64_i32, %arg18 = %2, %arg19 = %c1_i32, %arg20 = %c-1_i32, %k_176 = %k_75, %k_177 = %k_96, %v_178 = %v_81, %v_179 = %v_102, %arg25 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_180 = arith.subi %block_n_end_53, %c2_i32 : i32 loc(#loc493) + %kv_offset_181 = arith.cmpi slt, %kv_offset_174, %kv_offset_180 : i32 loc(#loc493) + %kv_offset_182 = arith.subi %block_n_end_53, %c1_i32 : i32 loc(#loc493) + %kv_offset_183 = arith.cmpi slt, %kv_offset_174, %kv_offset_182 : i32 loc(#loc493) + %kv_offset_184 = arith.addi %arg20, %c1_i32 : i32 loc(#loc493) + %kv_offset_185 = arith.cmpi sge, %kv_offset_184, %c3_i32 : i32 loc(#loc493) + %kv_offset_186 = arith.select %kv_offset_185, %c0_i32, %kv_offset_184 : i32 loc(#loc493) + %k_187 = ttg.async_wait %k_176, %v_178 {num = 2 : i32} loc(#loc431) + %k_188 = ttg.memdesc_index %k_62[%kv_offset_186] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_189 = ttg.memdesc_trans %k_188 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc347) + %qk = ttng.warp_group_dot %q_47, %k_189, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc346) + %qk_190:4 = ttng.warp_group_dot_wait %qk, %q_47, %k_189, %acc_175 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc346) + %qk_191 = arith.mulf %qk_190#0, %cst_16 : tensor<128x64xf32, #mma> loc(#loc348) + %n_192 = arith.remsi %arg18, %n : tensor<1x64xi32, #mma> loc(#loc428) + %post_mod_scores = arith.cmpi slt, %arg18, %n : tensor<1x64xi32, #mma> loc(#loc349) + %post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc350) + %post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_191, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc350) + %tmp4_195 = tt.broadcast %n_192 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc335) + %tmp4_196 = arith.cmpi sge, %tmp4, %tmp4_195 : tensor<128x64xi32, #mma> loc(#loc335) + %tmp5 = arith.extsi %n_192 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc351) + %tmp8_197 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc339) + %tmp11_198 = tt.broadcast %tmp8_197 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc342) + %tmp11_199 = arith.andi %tmp11_198, %tmp11 : tensor<128x64xi1, #mma> loc(#loc342) + %tmp12 = arith.andi %tmp4_196, %tmp11_199 : tensor<128x64xi1, #mma> loc(#loc352) + %tmp15 = arith.cmpi sge, %n_192, %cst_4 : tensor<1x64xi32, #mma> loc(#loc353) + %tmp16 = arith.remsi %n_192, %cst_4 : tensor<1x64xi32, #mma> loc(#loc354) + %tmp18 = arith.cmpi ne, %tmp16, %cst_0 : tensor<1x64xi32, #mma> loc(#loc355) + %tmp19 = arith.cmpi slt, %tmp16, %cst_0 : tensor<1x64xi32, #mma> loc(#loc356) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc357) + %tmp23 = arith.addi %tmp16, %cst_4 : tensor<1x64xi32, #mma> loc(#loc358) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc359) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc360) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc361) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc362) + %tmp28 = arith.subi %tmp4_195, %tmp4 : tensor<128x64xi32, #mma> loc(#loc363) + %tmp29 = arith.remsi %tmp28, %cst_3 : tensor<128x64xi32, #mma> loc(#loc364) + %tmp30 = arith.cmpi ne, %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc365) + %tmp31 = arith.cmpi slt, %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc366) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc367) + %tmp34 = arith.addi %tmp29, %cst_3 : tensor<128x64xi32, #mma> loc(#loc368) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc369) + %tmp36 = arith.cmpi eq, %tmp35, %cst_1 : tensor<128x64xi32, #mma> loc(#loc370) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc371) + %tmp37_200 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc371) + %tmp38 = arith.ori %tmp12, %tmp37_200 : tensor<128x64xi1, #mma> loc(#loc372) + %mask_mod_output = arith.select %post_mod_scores_193, %tmp38, %cst_2 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc373) + %post_mod_scores_201 = arith.select %mask_mod_output, %post_mod_scores_194, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc374) + %post_mod_scores_202 = arith.mulf %post_mod_scores_201, %cst_18 : tensor<128x64xf32, #mma> loc(#loc375) + %m_ij = "tt.reduce"(%post_mod_scores_202) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_260: f32 loc(callsite(#loc1 at #loc376)), %m_ij_261: f32 loc(callsite(#loc1 at #loc376))): + %m_ij_262 = arith.maxnumf %m_ij_260, %m_ij_261 : f32 loc(#loc488) + tt.reduce.return %m_ij_262 : f32 loc(#loc436) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc436) + %m_ij_203 = arith.maxnumf %arg16, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377) + %masked_out_rows = arith.cmpf oeq, %m_ij_203, %cst_19 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378) + %m_ij_masked = arith.select %masked_out_rows, %cst_13, %m_ij_203 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379) + %alpha = arith.subf %arg16, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc380) + %alpha_204 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc381) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc382) + %p_205 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc383) + %p_206 = arith.subf %post_mod_scores_202, %p_205 : tensor<128x64xf32, #mma> loc(#loc383) + %p_207 = math.exp2 %p_206 : tensor<128x64xf32, #mma> loc(#loc384) + %l_i_208 = arith.mulf %arg15, %alpha_204 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385) + %l_i_209 = "tt.reduce"(%p_207) <{axis = 1 : i32}> ({ + ^bb0(%l_i_260: f32 loc(callsite(#loc1 at #loc386)), %l_i_261: f32 loc(callsite(#loc1 at #loc386))): + %l_i_262 = arith.addf %l_i_260, %l_i_261 : f32 loc(#loc489) + tt.reduce.return %l_i_262 : f32 loc(#loc438) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc438) + %l_i_210 = arith.addf %l_i_208, %l_i_209 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc387) + %acc_211 = tt.expand_dims %alpha_204 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc388) + %acc_212 = ttg.convert_layout %acc_211 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc389) + %acc_213 = tt.broadcast %acc_212 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc389) + %acc_214 = arith.mulf %qk_190#3, %acc_213 : tensor<128x128xf32, #mma1> loc(#loc389) + %v_215 = ttg.memdesc_index %v[%kv_offset_186] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %acc_216 = arith.truncf %p_207 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc390) + %acc_217 = ttg.convert_layout %acc_216 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc390) + %acc_218 = ttng.warp_group_dot %acc_217, %v_215, %acc_214 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc391) + %offs_n_219 = tt.splat %arg25 : i32 -> tensor<1x64xi32, #mma> loc(#loc392) + %offs_n_220 = arith.addi %arg18, %offs_n_219 : tensor<1x64xi32, #mma> loc(#loc392) + %kv_offset_221 = arith.addi %kv_offset_174, %c1_i32 : i32 loc(#loc493) + %cur_block_idx = arith.divsi %kv_offset_221, %c2_i32 : i32 loc(#loc440) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc441) + %cur_block_222 = tt.load %cur_block, %kv_offset_183 evictionPolicy = evict_last : !tt.ptr loc(#loc442) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc443) + %next_block_223 = arith.cmpi slt, %next_block, %kv_num_blocks_49 : i32 loc(#loc444) + %next_block_224 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc445) + %kv_offset_225 = arith.andi %kv_offset_183, %next_block_223 : i1 loc(#loc493) + %next_block_226 = tt.load %next_block_224, %kv_offset_225 evictionPolicy = evict_last : !tt.ptr loc(#loc446) + %needs_jump = arith.addi %kv_offset_174, %c2_i32 : i32 loc(#loc447) + %needs_jump_227 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc448) + %needs_jump_228 = arith.cmpi eq, %needs_jump_227, %c0_i32 : i32 loc(#loc449) + %jump_to_block = arith.subi %next_block_226, %cur_block_222 : i32 loc(#loc450) + %jump_to_block_229 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc451) + %jump_to_block_230 = arith.subi %jump_to_block_229, %c64_i32 : i32 loc(#loc452) + %offset = arith.extui %needs_jump_228 : i1 to i32 loc(#loc453) + %offset_231 = arith.muli %jump_to_block_230, %offset : i32 loc(#loc453) + %offset_232 = arith.subi %c1_i32, %offset : i32 loc(#loc454) + %offset_233 = arith.muli %offset_232, %c64_i32 : i32 loc(#loc455) + %offset_234 = arith.addi %offset_231, %offset_233 : i32 loc(#loc456) + %kv_offset_235 = arith.addi %arg17, %offset_234 : i32 loc(#loc394) + %kv_offset_236 = arith.addi %arg19, %c1_i32 : i32 loc(#loc493) + %kv_offset_237 = arith.cmpi sge, %kv_offset_236, %c3_i32 : i32 loc(#loc493) + %kv_offset_238 = arith.select %kv_offset_237, %c0_i32, %kv_offset_236 : i32 loc(#loc493) + %kv_base_offset_239 = arith.addi %kv_start_48, %kv_offset_235 : i32 loc(#loc345) + %offs_n_load_240 = tt.splat %kv_base_offset_239 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_241 = arith.addi %offs_n_load_240, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_242 = tt.expand_dims %offs_n_load_241 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_243 = arith.muli %ptr_242, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_244 = tt.addptr %ptr_57, %ptr_243 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_245 = tt.broadcast %ptr_244 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_246 = tt.addptr %ptr_245, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_247 = arith.cmpi slt, %ptr_242, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_248 = tt.broadcast %k_247 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_249 = ttg.memdesc_index %k_62[%kv_offset_238] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_250 = tt.splat %kv_offset_181 : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_251 = arith.andi %kv_offset_250, %k_248 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_252 = ttg.async_copy_global_to_local %ptr_246, %k_249 mask %kv_offset_251 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_253 = ttg.async_commit_group tokens %k_252 loc(#loc431) + %ptr_254 = tt.addptr %ptr_61, %ptr_243 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_255 = tt.broadcast %ptr_254 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_256 = tt.addptr %ptr_255, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_257 = ttg.memdesc_index %v[%kv_offset_238] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_258 = ttg.async_copy_global_to_local %ptr_256, %v_257 mask %kv_offset_251 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_259 = ttg.async_commit_group tokens %v_258 loc(#loc432) + scf.yield %acc_218, %l_i_210, %m_ij_203, %kv_offset_235, %offs_n_220, %kv_offset_238, %kv_offset_186, %k_177, %k_253, %v_179, %v_259, %offset_234 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc493) + } loc(#loc493) + %kv_offset_104 = ttng.warp_group_dot_wait %kv_offset_103#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc493) + %kv_offset_105 = ttg.async_wait {num = 0 : i32} loc(#loc493) + ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc493) + ttg.local_dealloc %k_62 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc493) + %kv_indices_106 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_26 : !tt.ptr, i32 loc(#loc301) + %kv_start_107 = tt.load %kv_indices_106 : !tt.ptr loc(#loc302) + %kv_start_108 = arith.muli %kv_start_107, %c128_i32 : i32 loc(#loc303) + %kv_num_blocks_109 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_24 : !tt.ptr, i32 loc(#loc304) + %kv_num_blocks_110 = tt.load %kv_num_blocks_109 : !tt.ptr loc(#loc305) + %block_n_end_111 = arith.muli %kv_num_blocks_110, %c2_i32 : i32 loc(#loc306) + %block_n_end_112 = arith.minsi %block_n_end_111, %block_n_end_52 : i32 loc(#loc307) + %offs_n_113 = tt.splat %kv_start_108 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc308) + %offs_n_114 = arith.addi %offs_n_113, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc308) + %3 = tt.expand_dims %offs_n_114 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc144) + %k_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc457) + %v_116 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc458) + %kv_offset_117 = arith.cmpi sgt, %block_n_end_112, %c0_i32 : i32 loc(#loc494) + %offs_n_load_118 = tt.splat %kv_start_108 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_119 = arith.addi %offs_n_load_118, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_120 = tt.expand_dims %offs_n_load_119 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_121 = arith.muli %ptr_120, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_122 = tt.addptr %ptr_57, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_123 = tt.broadcast %ptr_122 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_124 = tt.addptr %ptr_123, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_125 = arith.cmpi slt, %ptr_120, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_126 = tt.broadcast %k_125 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_127 = ttg.memdesc_index %k_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_128 = tt.splat %kv_offset_117 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_129 = arith.andi %kv_offset_128, %k_126 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_130 = ttg.async_copy_global_to_local %ptr_124, %k_127 mask %kv_offset_129 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_131 = ttg.async_commit_group tokens %k_130 loc(#loc457) + %ptr_132 = tt.addptr %ptr_61, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_133 = tt.broadcast %ptr_132 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_134 = tt.addptr %ptr_133, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_135 = ttg.memdesc_index %v_116[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_136 = ttg.async_copy_global_to_local %ptr_134, %v_135 mask %kv_offset_129 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_137 = ttg.async_commit_group tokens %v_136 loc(#loc458) + %kv_offset_138 = arith.cmpi sgt, %block_n_end_112, %c1_i32 : i32 loc(#loc494) + %kv_base_offset_139 = arith.addi %kv_start_108, %c64_i32 : i32 loc(#loc398) + %offs_n_load_140 = tt.splat %kv_base_offset_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_141 = arith.addi %offs_n_load_140, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_142 = tt.expand_dims %offs_n_load_141 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_143 = arith.muli %ptr_142, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_144 = tt.addptr %ptr_57, %ptr_143 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_145 = tt.broadcast %ptr_144 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_146 = tt.addptr %ptr_145, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_147 = arith.cmpi slt, %ptr_142, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_148 = tt.broadcast %k_147 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_149 = ttg.memdesc_index %k_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_150 = tt.splat %kv_offset_138 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_151 = arith.andi %kv_offset_150, %k_148 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_152 = ttg.async_copy_global_to_local %ptr_146, %k_149 mask %kv_offset_151 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_153 = ttg.async_commit_group tokens %k_152 loc(#loc457) + %ptr_154 = tt.addptr %ptr_61, %ptr_143 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_155 = tt.broadcast %ptr_154 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_156 = tt.addptr %ptr_155, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_157 = ttg.memdesc_index %v_116[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_158 = ttg.async_copy_global_to_local %ptr_156, %v_157 mask %kv_offset_151 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_159 = ttg.async_commit_group tokens %v_158 loc(#loc458) + ttng.fence_async_shared {bCluster = false} loc(#loc399) + %kv_offset_160:12 = scf.for %kv_offset_174 = %c0_i32 to %block_n_end_112 step %c1_i32 iter_args(%kv_offset_175 = %kv_offset_104, %kv_offset_176 = %kv_offset_103#1, %kv_offset_177 = %kv_offset_103#2, %arg17 = %c64_i32, %arg18 = %3, %arg19 = %c1_i32, %arg20 = %c-1_i32, %k_178 = %k_131, %k_179 = %k_153, %v_180 = %v_137, %v_181 = %v_159, %arg25 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_182 = arith.subi %block_n_end_112, %c2_i32 : i32 loc(#loc494) + %kv_offset_183 = arith.cmpi slt, %kv_offset_174, %kv_offset_182 : i32 loc(#loc494) + %kv_offset_184 = arith.subi %block_n_end_112, %c1_i32 : i32 loc(#loc494) + %kv_offset_185 = arith.cmpi slt, %kv_offset_174, %kv_offset_184 : i32 loc(#loc494) + %kv_offset_186 = arith.addi %arg20, %c1_i32 : i32 loc(#loc494) + %kv_offset_187 = arith.cmpi sge, %kv_offset_186, %c3_i32 : i32 loc(#loc494) + %kv_offset_188 = arith.select %kv_offset_187, %c0_i32, %kv_offset_186 : i32 loc(#loc494) + %k_189 = ttg.async_wait %k_178, %v_180 {num = 2 : i32} loc(#loc457) + %k_190 = ttg.memdesc_index %k_115[%kv_offset_188] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_191 = ttg.memdesc_trans %k_190 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc400) + %qk = ttng.warp_group_dot %q_47, %k_191, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc399) + %qk_192:4 = ttng.warp_group_dot_wait %qk, %q_47, %k_191, %kv_offset_175 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc399) + %qk_193 = arith.mulf %qk_192#0, %cst_16 : tensor<128x64xf32, #mma> loc(#loc401) + %post_mod_scores = arith.cmpi slt, %arg18, %n : tensor<1x64xi32, #mma> loc(#loc402) + %post_mod_scores_194 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc403) + %post_mod_scores_195 = arith.select %post_mod_scores_194, %qk_193, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc403) + %post_mod_scores_196 = arith.mulf %post_mod_scores_195, %cst_18 : tensor<128x64xf32, #mma> loc(#loc404) + %m_ij = "tt.reduce"(%post_mod_scores_196) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_254: f32 loc(callsite(#loc1 at #loc405)), %m_ij_255: f32 loc(callsite(#loc1 at #loc405))): + %m_ij_256 = arith.maxnumf %m_ij_254, %m_ij_255 : f32 loc(#loc490) + tt.reduce.return %m_ij_256 : f32 loc(#loc466) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc466) + %m_ij_197 = arith.maxnumf %kv_offset_177, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc406) + %masked_out_rows = arith.cmpf oeq, %m_ij_197, %cst_19 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407) + %m_ij_masked = arith.select %masked_out_rows, %cst_13, %m_ij_197 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc408) + %alpha = arith.subf %kv_offset_177, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc409) + %alpha_198 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc410) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc411) + %p_199 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc412) + %p_200 = arith.subf %post_mod_scores_196, %p_199 : tensor<128x64xf32, #mma> loc(#loc412) + %p_201 = math.exp2 %p_200 : tensor<128x64xf32, #mma> loc(#loc413) + %l_i_202 = arith.mulf %kv_offset_176, %alpha_198 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc414) + %l_i_203 = "tt.reduce"(%p_201) <{axis = 1 : i32}> ({ + ^bb0(%l_i_254: f32 loc(callsite(#loc1 at #loc415)), %l_i_255: f32 loc(callsite(#loc1 at #loc415))): + %l_i_256 = arith.addf %l_i_254, %l_i_255 : f32 loc(#loc491) + tt.reduce.return %l_i_256 : f32 loc(#loc468) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc468) + %l_i_204 = arith.addf %l_i_202, %l_i_203 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc416) + %acc_205 = tt.expand_dims %alpha_198 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc417) + %acc_206 = ttg.convert_layout %acc_205 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc418) + %acc_207 = tt.broadcast %acc_206 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc418) + %acc_208 = arith.mulf %qk_192#3, %acc_207 : tensor<128x128xf32, #mma1> loc(#loc418) + %v_209 = ttg.memdesc_index %v_116[%kv_offset_188] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %acc_210 = arith.truncf %p_201 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc419) + %acc_211 = ttg.convert_layout %acc_210 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc419) + %acc_212 = ttng.warp_group_dot %acc_211, %v_209, %acc_208 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc420) + %offs_n_213 = tt.splat %arg25 : i32 -> tensor<1x64xi32, #mma> loc(#loc421) + %offs_n_214 = arith.addi %arg18, %offs_n_213 : tensor<1x64xi32, #mma> loc(#loc421) + %kv_offset_215 = arith.addi %kv_offset_174, %c1_i32 : i32 loc(#loc494) + %cur_block_idx = arith.divsi %kv_offset_215, %c2_i32 : i32 loc(#loc470) + %cur_block = tt.addptr %kv_indices_106, %cur_block_idx : !tt.ptr, i32 loc(#loc471) + %cur_block_216 = tt.load %cur_block, %kv_offset_185 evictionPolicy = evict_last : !tt.ptr loc(#loc472) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc473) + %next_block_217 = arith.cmpi slt, %next_block, %kv_num_blocks_110 : i32 loc(#loc474) + %next_block_218 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc475) + %kv_offset_219 = arith.andi %kv_offset_185, %next_block_217 : i1 loc(#loc494) + %next_block_220 = tt.load %next_block_218, %kv_offset_219 evictionPolicy = evict_last : !tt.ptr loc(#loc476) + %needs_jump = arith.addi %kv_offset_174, %c2_i32 : i32 loc(#loc477) + %needs_jump_221 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc478) + %needs_jump_222 = arith.cmpi eq, %needs_jump_221, %c0_i32 : i32 loc(#loc479) + %jump_to_block = arith.subi %next_block_220, %cur_block_216 : i32 loc(#loc480) + %jump_to_block_223 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc481) + %jump_to_block_224 = arith.subi %jump_to_block_223, %c64_i32 : i32 loc(#loc482) + %offset = arith.extui %needs_jump_222 : i1 to i32 loc(#loc483) + %offset_225 = arith.muli %jump_to_block_224, %offset : i32 loc(#loc483) + %offset_226 = arith.subi %c1_i32, %offset : i32 loc(#loc484) + %offset_227 = arith.muli %offset_226, %c64_i32 : i32 loc(#loc485) + %offset_228 = arith.addi %offset_225, %offset_227 : i32 loc(#loc486) + %kv_offset_229 = arith.addi %arg17, %offset_228 : i32 loc(#loc423) + %kv_offset_230 = arith.addi %arg19, %c1_i32 : i32 loc(#loc494) + %kv_offset_231 = arith.cmpi sge, %kv_offset_230, %c3_i32 : i32 loc(#loc494) + %kv_offset_232 = arith.select %kv_offset_231, %c0_i32, %kv_offset_230 : i32 loc(#loc494) + %kv_base_offset_233 = arith.addi %kv_start_108, %kv_offset_229 : i32 loc(#loc398) + %offs_n_load_234 = tt.splat %kv_base_offset_233 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_235 = arith.addi %offs_n_load_234, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_236 = tt.expand_dims %offs_n_load_235 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_237 = arith.muli %ptr_236, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_238 = tt.addptr %ptr_57, %ptr_237 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_239 = tt.broadcast %ptr_238 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_240 = tt.addptr %ptr_239, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_241 = arith.cmpi slt, %ptr_236, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_242 = tt.broadcast %k_241 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_243 = ttg.memdesc_index %k_115[%kv_offset_232] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_244 = tt.splat %kv_offset_183 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_245 = arith.andi %kv_offset_244, %k_242 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_246 = ttg.async_copy_global_to_local %ptr_240, %k_243 mask %kv_offset_245 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_247 = ttg.async_commit_group tokens %k_246 loc(#loc457) + %ptr_248 = tt.addptr %ptr_61, %ptr_237 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_249 = tt.broadcast %ptr_248 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_250 = tt.addptr %ptr_249, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_251 = ttg.memdesc_index %v_116[%kv_offset_232] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_252 = ttg.async_copy_global_to_local %ptr_250, %v_251 mask %kv_offset_245 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_253 = ttg.async_commit_group tokens %v_252 loc(#loc458) + scf.yield %acc_212, %l_i_204, %m_ij_197, %kv_offset_229, %offs_n_214, %kv_offset_232, %kv_offset_188, %k_179, %k_247, %v_181, %v_253, %offset_228 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc494) + } loc(#loc494) + %kv_offset_161 = ttng.warp_group_dot_wait %kv_offset_160#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc494) + %kv_offset_162 = ttg.async_wait {num = 0 : i32} loc(#loc494) + ttg.local_dealloc %v_116 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc494) + ttg.local_dealloc %k_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc494) + %l_i = arith.cmpf oeq, %kv_offset_160#1, %cst_13 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc310) + %l_i_163 = arith.select %l_i, %cst_14, %kv_offset_160#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc311) + %acc = tt.expand_dims %l_i_163 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc312) + %acc_164 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc313) + %acc_165 = tt.broadcast %acc_164 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc313) + %acc_166 = arith.divf %kv_offset_161, %acc_165 : tensor<128x128xf32, #mma1> loc(#loc313) + %mask = arith.cmpi slt, %ptr_41, %cst_8 : tensor<1x128xi32, #blocked> loc(#loc314) + %mask_167 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc315) + %mask_168 = arith.andi %q_45, %mask_167 : tensor<128x128xi1, #blocked> loc(#loc315) + %4 = tt.splat %q_offset_20 : i32 -> tensor<1x128xi32, #blocked> loc(#loc152) + %5 = arith.addi %ptr_41, %4 : tensor<1x128xi32, #blocked> loc(#loc152) + %6 = tt.broadcast %5 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc153) + %7 = tt.broadcast %ptr_37 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc153) + %8 = arith.addi %6, %7 : tensor<128x128xi32, #blocked> loc(#loc153) + %9 = tt.splat %q_offset : i32 -> tensor<128x128xi32, #blocked> loc(#loc154) + %10 = arith.addi %8, %9 : tensor<128x128xi32, #blocked> loc(#loc154) + %11 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc155) + %12 = tt.addptr %11, %10 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc155) + %13 = arith.truncf %acc_166 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc156) + %14 = ttg.convert_layout %13 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc156) + tt.store %12, %14, %mask_168 : tensor<128x128x!tt.ptr, #blocked> loc(#loc156) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc316) + %off_hz_169 = arith.addi %off_hz, %off_hq : i32 loc(#loc317) + %l_ptrs = arith.muli %off_hz_169, %c2048_i32 : i32 loc(#loc318) + %l_ptrs_170 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc319) + %l_ptrs_171 = tt.splat %l_ptrs_170 : !tt.ptr -> tensor<128x!tt.ptr, #blocked1> loc(#loc320) + %l_ptrs_172 = tt.addptr %l_ptrs_171, %offs_m_35 : tensor<128x!tt.ptr, #blocked1>, tensor<128xi32, #blocked1> loc(#loc320) + %lse = math.log2 %l_i_163 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc321) + %lse_173 = arith.addf %kv_offset_160#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc322) + %15 = arith.cmpi slt, %offs_m_35, %cst_12 : tensor<128xi32, #blocked1> loc(#loc164) + %16 = ttg.convert_layout %lse_173 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc165) + tt.store %l_ptrs_172, %16, %15 : tensor<128x!tt.ptr, #blocked1> loc(#loc165) + tt.return loc(#loc166) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":97:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":98:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":99:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":103:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":104:24) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:25) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:47) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":111:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":112:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":113:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":130:25) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:51) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:74) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:46) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:97) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:64) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:33) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:27) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":146:101) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:38) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:56) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:49) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":151:26) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:37) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:28) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:45) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:92) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:102) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:65) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:37) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":167:48) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":347:107) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":257:21) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":358:36) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":359:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":373:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:33) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":502:40) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":377:22) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":378:23) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":379:23) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":380:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":439:107) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":346:35) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":342:32) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":351:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":349:17) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":353:14) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:44) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:69) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":374:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":381:23) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":384:24) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":385:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":387:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":388:92) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":391:24) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":392:24) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":393:39) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":394:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":395:24) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":396:24) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":397:23) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":398:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":399:25) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":400:92) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":402:24) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":403:24) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":404:39) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":405:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":406:24) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":407:24) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":412:73) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":414:69) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":417:27) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:27) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":423:35) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":424:51) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:31) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:51) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:39) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:21) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:16) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:24) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:22) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:16) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:22) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:44) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":548:26) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":247:33) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":545:63) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:38) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:24) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:109) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:113) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:55) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:25) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:30) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:35) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:60) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:34) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:48) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:63) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:29) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:47) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:61) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:42) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":549:21) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":181:35) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:27) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:41) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:51) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:32) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:49) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:69) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":186:28) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":193:52) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:26) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:34) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:20) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:16) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:38) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:30) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:49) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:62) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:75) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:25) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:109) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:26) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:31) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:32) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:23) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:40) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:33) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:20) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:48) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:29) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":229:4) +#loc180 = loc("q_start"(#loc4)) +#loc181 = loc("off_zq"(#loc5)) +#loc182 = loc("off_hq"(#loc6)) +#loc183 = loc("off_zkv"(#loc7)) +#loc184 = loc("off_hkv"(#loc8)) +#loc185 = loc("q_offset"(#loc9)) +#loc186 = loc("q_offset"(#loc10)) +#loc187 = loc("q_offset"(#loc11)) +#loc188 = loc("k_offset"(#loc12)) +#loc189 = loc("k_offset"(#loc13)) +#loc190 = loc("k_offset"(#loc14)) +#loc191 = loc("Q"(#loc15)) +#loc192 = loc("K"(#loc16)) +#loc193 = loc("V"(#loc17)) +#loc194 = loc("stride_kv_idx_h"(#loc18)) +#loc195 = loc("sparse_kv_num_blks_offset"(#loc19)) +#loc196 = loc("sparse_kv_num_blks_offset"(#loc20)) +#loc197 = loc("sparse_kv_idx_offset"(#loc21)) +#loc198 = loc("sparse_kv_idx_offset"(#loc22)) +#loc199 = loc("sparse_kv_idx_offset"(#loc23)) +#loc200 = loc("offs_m"(#loc24)) +#loc201 = loc("offs_m"(#loc25)) +#loc202 = loc("offs_m"(#loc26)) +#loc203 = loc("ptr"(#loc27)) +#loc204 = loc("q"(#loc28)) +#loc205 = loc("ptr"(#loc29)) +#loc206 = loc("ptr"(#loc30)) +#loc207 = loc("ptr"(#loc31)) +#loc208 = loc("ptr"(#loc32)) +#loc209 = loc("kv_indices"(#loc35)) +#loc210 = loc("kv_start"(#loc36)) +#loc211 = loc("kv_start"(#loc37)) +#loc212 = loc("kv_num_blocks"(#loc38)) +#loc213 = loc("kv_num_blocks"(#loc39)) +#loc214 = loc("block_n_end"(#loc40)) +#loc215 = loc("block_n_end"(#loc42)) +#loc216 = loc("block_n_end"(#loc44)) +#loc217 = loc("block_n_end"(#loc45)) +#loc218 = loc("offs_n"(#loc46)) +#loc219 = loc("offs_n"(#loc47)) +#loc220 = loc("k"(#loc49)) +#loc222 = loc("m"(#loc53)) +#loc223 = loc("n"(#loc54)) +#loc224 = loc("tmp4"(#loc55)) +#loc225 = loc("tmp7"(#loc56)) +#loc226 = loc("acc"(#loc57)) +#loc227 = loc("tmp7"(#loc58)) +#loc228 = loc("tmp8"(#loc59)) +#loc229 = loc("tmp9"(#loc60)) +#loc230 = loc("tmp10"(#loc61)) +#loc231 = loc("tmp11"(#loc62)) +#loc232 = loc("v"(#loc63)) +#loc233 = loc("offs_n_load"(#loc64)) +#loc234 = loc("kv_base_offset"(#loc65)) +#loc235 = loc("qk"(#loc66)) +#loc236 = loc("k"(#loc67)) +#loc237 = loc("qk"(#loc68)) +#loc238 = loc("post_mod_scores"(#loc69)) +#loc239 = loc("post_mod_scores"(#loc70)) +#loc240 = loc("tmp5"(#loc71)) +#loc241 = loc("tmp12"(#loc72)) +#loc242 = loc("tmp15"(#loc73)) +#loc243 = loc("tmp16"(#loc74)) +#loc244 = loc("tmp18"(#loc75)) +#loc245 = loc("tmp19"(#loc76)) +#loc246 = loc("tmp22"(#loc77)) +#loc247 = loc("tmp23"(#loc78)) +#loc248 = loc("tmp24"(#loc79)) +#loc249 = loc("tmp25"(#loc80)) +#loc250 = loc("tmp26"(#loc81)) +#loc251 = loc("tmp27"(#loc82)) +#loc252 = loc("tmp28"(#loc83)) +#loc253 = loc("tmp29"(#loc84)) +#loc254 = loc("tmp30"(#loc85)) +#loc255 = loc("tmp31"(#loc86)) +#loc256 = loc("tmp33"(#loc87)) +#loc257 = loc("tmp34"(#loc88)) +#loc258 = loc("tmp35"(#loc89)) +#loc259 = loc("tmp36"(#loc90)) +#loc260 = loc("tmp37"(#loc91)) +#loc261 = loc("tmp38"(#loc92)) +#loc262 = loc("mask_mod_output"(#loc93)) +#loc263 = loc("post_mod_scores"(#loc94)) +#loc264 = loc("post_mod_scores"(#loc95)) +#loc266 = loc("m_ij"(#loc99)) +#loc267 = loc("masked_out_rows"(#loc100)) +#loc268 = loc("m_ij_masked"(#loc101)) +#loc269 = loc("alpha"(#loc102)) +#loc270 = loc("alpha"(#loc103)) +#loc271 = loc("p"(#loc104)) +#loc272 = loc("p"(#loc105)) +#loc273 = loc("p"(#loc106)) +#loc274 = loc("l_i"(#loc107)) +#loc276 = loc("l_i"(#loc111)) +#loc277 = loc("acc"(#loc112)) +#loc278 = loc("acc"(#loc113)) +#loc279 = loc("acc"(#loc114)) +#loc280 = loc("acc"(#loc115)) +#loc281 = loc("offs_n"(#loc116)) +#loc282 = loc("cur_block_idx"(#loc117)) +#loc283 = loc("offset"(#loc118)) +#loc284 = loc("cur_block"(#loc119)) +#loc285 = loc("cur_block"(#loc120)) +#loc286 = loc("next_block"(#loc121)) +#loc287 = loc("next_block"(#loc122)) +#loc288 = loc("next_block"(#loc123)) +#loc289 = loc("next_block"(#loc124)) +#loc290 = loc("needs_jump"(#loc125)) +#loc291 = loc("needs_jump"(#loc126)) +#loc292 = loc("needs_jump"(#loc127)) +#loc293 = loc("jump_to_block"(#loc128)) +#loc294 = loc("jump_to_block"(#loc129)) +#loc295 = loc("jump_to_block"(#loc130)) +#loc296 = loc("offset"(#loc131)) +#loc297 = loc("offset"(#loc132)) +#loc298 = loc("offset"(#loc133)) +#loc299 = loc("offset"(#loc134)) +#loc300 = loc("kv_offset"(#loc135)) +#loc301 = loc("kv_indices"(#loc136)) +#loc302 = loc("kv_start"(#loc137)) +#loc303 = loc("kv_start"(#loc138)) +#loc304 = loc("kv_num_blocks"(#loc139)) +#loc305 = loc("kv_num_blocks"(#loc140)) +#loc306 = loc("block_n_end"(#loc141)) +#loc307 = loc("block_n_end"(#loc142)) +#loc308 = loc("offs_n"(#loc143)) +#loc310 = loc("l_i"(#loc146)) +#loc311 = loc("l_i"(#loc147)) +#loc312 = loc("acc"(#loc148)) +#loc313 = loc("acc"(#loc149)) +#loc314 = loc("mask"(#loc150)) +#loc315 = loc("mask"(#loc151)) +#loc316 = loc("off_hz"(#loc157)) +#loc317 = loc("off_hz"(#loc158)) +#loc318 = loc("l_ptrs"(#loc159)) +#loc319 = loc("l_ptrs"(#loc160)) +#loc320 = loc("l_ptrs"(#loc161)) +#loc321 = loc("lse"(#loc162)) +#loc322 = loc("lse"(#loc163)) +#loc323 = loc(callsite(#loc203 at #loc204)) +#loc324 = loc(callsite(#loc205 at #loc204)) +#loc325 = loc(callsite(#loc206 at #loc204)) +#loc326 = loc(callsite(#loc207 at #loc204)) +#loc327 = loc(callsite(#loc208 at #loc204)) +#loc328 = loc(callsite(#loc33 at #loc204)) +#loc329 = loc(callsite(#loc34 at #loc204)) +#loc330 = loc(callsite(#loc41 at #loc215)) +#loc331 = loc(callsite(#loc43 at #loc215)) +#loc332 = loc(callsite(#loc220 at #loc221)) +#loc333 = loc(callsite(#loc222 at #loc221)) +#loc334 = loc(callsite(#loc223 at #loc221)) +#loc335 = loc(callsite(#loc224 at #loc221)) +#loc336 = loc(callsite(#loc225 at #loc221)) +#loc337 = loc("l_i"(#loc226)) +#loc338 = loc(callsite(#loc227 at #loc221)) +#loc339 = loc(callsite(#loc228 at #loc221)) +#loc340 = loc(callsite(#loc229 at #loc221)) +#loc341 = loc(callsite(#loc230 at #loc221)) +#loc342 = loc(callsite(#loc231 at #loc221)) +#loc343 = loc(callsite(#loc232 at #loc221)) +#loc344 = loc(callsite(#loc233 at #loc221)) +#loc345 = loc(callsite(#loc234 at #loc221)) +#loc346 = loc(callsite(#loc235 at #loc221)) +#loc347 = loc(callsite(#loc236 at #loc221)) +#loc348 = loc(callsite(#loc237 at #loc221)) +#loc349 = loc(callsite(#loc238 at #loc221)) +#loc350 = loc(callsite(#loc239 at #loc221)) +#loc351 = loc(callsite(#loc240 at #loc221)) +#loc352 = loc(callsite(#loc241 at #loc221)) +#loc353 = loc(callsite(#loc242 at #loc221)) +#loc354 = loc(callsite(#loc243 at #loc221)) +#loc355 = loc(callsite(#loc244 at #loc221)) +#loc356 = loc(callsite(#loc245 at #loc221)) +#loc357 = loc(callsite(#loc246 at #loc221)) +#loc358 = loc(callsite(#loc247 at #loc221)) +#loc359 = loc(callsite(#loc248 at #loc221)) +#loc360 = loc(callsite(#loc249 at #loc221)) +#loc361 = loc(callsite(#loc250 at #loc221)) +#loc362 = loc(callsite(#loc251 at #loc221)) +#loc363 = loc(callsite(#loc252 at #loc221)) +#loc364 = loc(callsite(#loc253 at #loc221)) +#loc365 = loc(callsite(#loc254 at #loc221)) +#loc366 = loc(callsite(#loc255 at #loc221)) +#loc367 = loc(callsite(#loc256 at #loc221)) +#loc368 = loc(callsite(#loc257 at #loc221)) +#loc369 = loc(callsite(#loc258 at #loc221)) +#loc370 = loc(callsite(#loc259 at #loc221)) +#loc371 = loc(callsite(#loc260 at #loc221)) +#loc372 = loc(callsite(#loc261 at #loc221)) +#loc373 = loc(callsite(#loc262 at #loc221)) +#loc374 = loc(callsite(#loc263 at #loc221)) +#loc375 = loc(callsite(#loc264 at #loc221)) +#loc377 = loc(callsite(#loc266 at #loc221)) +#loc378 = loc(callsite(#loc267 at #loc221)) +#loc379 = loc(callsite(#loc268 at #loc221)) +#loc380 = loc(callsite(#loc269 at #loc221)) +#loc381 = loc(callsite(#loc270 at #loc221)) +#loc382 = loc(callsite(#loc271 at #loc221)) +#loc383 = loc(callsite(#loc272 at #loc221)) +#loc384 = loc(callsite(#loc273 at #loc221)) +#loc385 = loc(callsite(#loc274 at #loc221)) +#loc387 = loc(callsite(#loc276 at #loc221)) +#loc388 = loc(callsite(#loc277 at #loc221)) +#loc389 = loc(callsite(#loc278 at #loc221)) +#loc390 = loc(callsite(#loc279 at #loc221)) +#loc391 = loc(callsite(#loc280 at #loc221)) +#loc392 = loc(callsite(#loc281 at #loc51)) +#loc393 = loc(callsite(#loc283 at #loc51)) +#loc394 = loc(callsite(#loc300 at #loc51)) +#loc395 = loc(callsite(#loc220 at #loc309)) +#loc396 = loc(callsite(#loc232 at #loc309)) +#loc397 = loc(callsite(#loc233 at #loc309)) +#loc398 = loc(callsite(#loc234 at #loc309)) +#loc399 = loc(callsite(#loc235 at #loc309)) +#loc400 = loc(callsite(#loc236 at #loc309)) +#loc401 = loc(callsite(#loc237 at #loc309)) +#loc402 = loc(callsite(#loc238 at #loc309)) +#loc403 = loc(callsite(#loc239 at #loc309)) +#loc404 = loc(callsite(#loc264 at #loc309)) +#loc406 = loc(callsite(#loc266 at #loc309)) +#loc407 = loc(callsite(#loc267 at #loc309)) +#loc408 = loc(callsite(#loc268 at #loc309)) +#loc409 = loc(callsite(#loc269 at #loc309)) +#loc410 = loc(callsite(#loc270 at #loc309)) +#loc411 = loc(callsite(#loc271 at #loc309)) +#loc412 = loc(callsite(#loc272 at #loc309)) +#loc413 = loc(callsite(#loc273 at #loc309)) +#loc414 = loc(callsite(#loc274 at #loc309)) +#loc416 = loc(callsite(#loc276 at #loc309)) +#loc417 = loc(callsite(#loc277 at #loc309)) +#loc418 = loc(callsite(#loc278 at #loc309)) +#loc419 = loc(callsite(#loc279 at #loc309)) +#loc420 = loc(callsite(#loc280 at #loc309)) +#loc421 = loc(callsite(#loc281 at #loc145)) +#loc422 = loc(callsite(#loc283 at #loc145)) +#loc423 = loc(callsite(#loc300 at #loc145)) +#loc424 = loc(callsite(#loc206 at #loc332)) +#loc425 = loc(callsite(#loc208 at #loc332)) +#loc426 = loc(callsite(#loc33 at #loc332)) +#loc427 = loc(callsite(#loc52 at #loc333)) +#loc428 = loc(callsite(#loc52 at #loc334)) +#loc429 = loc("m_i"(#loc337)) +#loc430 = loc(callsite(#loc206 at #loc343)) +#loc431 = loc(callsite(#loc34 at #loc332)) +#loc432 = loc(callsite(#loc34 at #loc343)) +#loc433 = loc(callsite(#loc203 at #loc332)) +#loc434 = loc(callsite(#loc205 at #loc332)) +#loc435 = loc(callsite(#loc208 at #loc343)) +#loc436 = loc(callsite(#loc96 at #loc376)) +#loc438 = loc(callsite(#loc108 at #loc386)) +#loc440 = loc(callsite(#loc282 at #loc393)) +#loc441 = loc(callsite(#loc284 at #loc393)) +#loc442 = loc(callsite(#loc285 at #loc393)) +#loc443 = loc(callsite(#loc286 at #loc393)) +#loc444 = loc(callsite(#loc287 at #loc393)) +#loc445 = loc(callsite(#loc288 at #loc393)) +#loc446 = loc(callsite(#loc289 at #loc393)) +#loc447 = loc(callsite(#loc290 at #loc393)) +#loc448 = loc(callsite(#loc291 at #loc393)) +#loc449 = loc(callsite(#loc292 at #loc393)) +#loc450 = loc(callsite(#loc293 at #loc393)) +#loc451 = loc(callsite(#loc294 at #loc393)) +#loc452 = loc(callsite(#loc295 at #loc393)) +#loc453 = loc(callsite(#loc296 at #loc393)) +#loc454 = loc(callsite(#loc297 at #loc393)) +#loc455 = loc(callsite(#loc298 at #loc393)) +#loc456 = loc(callsite(#loc299 at #loc393)) +#loc457 = loc(callsite(#loc34 at #loc395)) +#loc458 = loc(callsite(#loc34 at #loc396)) +#loc459 = loc(callsite(#loc203 at #loc395)) +#loc460 = loc(callsite(#loc205 at #loc395)) +#loc461 = loc(callsite(#loc206 at #loc395)) +#loc462 = loc(callsite(#loc208 at #loc395)) +#loc463 = loc(callsite(#loc33 at #loc395)) +#loc464 = loc(callsite(#loc206 at #loc396)) +#loc465 = loc(callsite(#loc208 at #loc396)) +#loc466 = loc(callsite(#loc96 at #loc405)) +#loc468 = loc(callsite(#loc108 at #loc415)) +#loc470 = loc(callsite(#loc282 at #loc422)) +#loc471 = loc(callsite(#loc284 at #loc422)) +#loc472 = loc(callsite(#loc285 at #loc422)) +#loc473 = loc(callsite(#loc286 at #loc422)) +#loc474 = loc(callsite(#loc287 at #loc422)) +#loc475 = loc(callsite(#loc288 at #loc422)) +#loc476 = loc(callsite(#loc289 at #loc422)) +#loc477 = loc(callsite(#loc290 at #loc422)) +#loc478 = loc(callsite(#loc291 at #loc422)) +#loc479 = loc(callsite(#loc292 at #loc422)) +#loc480 = loc(callsite(#loc293 at #loc422)) +#loc481 = loc(callsite(#loc294 at #loc422)) +#loc482 = loc(callsite(#loc295 at #loc422)) +#loc483 = loc(callsite(#loc296 at #loc422)) +#loc484 = loc(callsite(#loc297 at #loc422)) +#loc485 = loc(callsite(#loc298 at #loc422)) +#loc486 = loc(callsite(#loc299 at #loc422)) +#loc487 = loc("offs_n"(#loc429)) +#loc488 = loc(callsite(#loc98 at #loc436)) +#loc489 = loc(callsite(#loc110 at #loc438)) +#loc490 = loc(callsite(#loc98 at #loc466)) +#loc491 = loc(callsite(#loc110 at #loc468)) +#loc492 = loc("kv_offset"(#loc487)) +#loc493 = loc(callsite(#loc492 at #loc51)) +#loc494 = loc(callsite(#loc492 at #loc145)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1be3875c01f147319204b41e92cdb302fd5ddf2e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/RXIVMMLAEMQFJCPGKZRTI3USE6MSGMMXD7VX7LE4EQMOK3SMSBNA/triton_tem_fused_0.ttir @@ -0,0 +1,828 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":172:41) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":538:16) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:51) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:34) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":198:45) +#loc172 = loc("arg_Q"(#loc)) +#loc173 = loc("arg_K"(#loc)) +#loc174 = loc("arg_V"(#loc)) +#loc175 = loc("arg_LSE"(#loc)) +#loc176 = loc("arg_MAX"(#loc)) +#loc177 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc178 = loc("arg_KV_IDX"(#loc)) +#loc179 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc180 = loc("arg_FULL_KV_IDX"(#loc)) +#loc181 = loc("in_ptr9"(#loc)) +#loc182 = loc("out_ptr0"(#loc)) +#loc183 = loc("ks0"(#loc)) +#loc184 = loc("ks1"(#loc)) +#loc233 = loc(callsite(#loc59 at #loc2)) +#loc275 = loc("m_ij"(#loc103)) +#loc285 = loc("l_i"(#loc115)) +#loc321 = loc(callsite(#loc59 at #loc153)) +#loc387 = loc(callsite(#loc275 at #loc233)) +#loc397 = loc(callsite(#loc285 at #loc233)) +#loc416 = loc(callsite(#loc275 at #loc321)) +#loc426 = loc(callsite(#loc285 at #loc321)) +#loc446 = loc(callsite(#loc1 at #loc387)) +#loc448 = loc(callsite(#loc1 at #loc397)) +#loc476 = loc(callsite(#loc1 at #loc416)) +#loc478 = loc(callsite(#loc1 at #loc426)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc185) + %cst_2 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc185) + %cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_5 = arith.constant dense : tensor<128x64xi1> loc(#loc185) + %cst_6 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc185) + %cst_7 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc185) + %cst_8 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_9 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc334) + %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc335) + %cst_11 = arith.constant dense<2048> : tensor<128xi32> loc(#loc7) + %cst_12 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc188) + %cst_13 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc189) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %Q_LEN = arith.constant 2048 : i32 loc(#loc190) + %HQ = arith.constant 32 : i32 loc(#loc191) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc12) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc13) + %q_start = tt.get_program_id x : i32 loc(#loc192) + %off_zq = tt.get_program_id y : i32 loc(#loc193) + %off_hq = tt.get_program_id z : i32 loc(#loc194) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc195) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc196) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc197) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc198) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc199) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc200) + %k_offset_17 = arith.muli %off_hkv, %1 : i32 loc(#loc201) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc202) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc203) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc204) + %V = tt.addptr %arg_V, %k_offset_18 : !tt.ptr, i32 loc(#loc205) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc206) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc207) + %sparse_kv_num_blks_offset_19 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc208) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc209) + %sparse_kv_idx_offset_20 = arith.muli %q_start, %ks1 : i32 loc(#loc210) + %sparse_kv_idx_offset_21 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_20 : i32 loc(#loc211) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc212) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc213) + %offs_m_23 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc214) + %offs_m_24 = arith.addi %offs_m_23, %offs_m_22 : tensor<128xi32> loc(#loc214) + %ptr = tt.expand_dims %offs_m_24 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc336) + %ptr_25 = arith.muli %ptr, %cst_12 : tensor<128x1xi32> loc(#loc337) + %ptr_26 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc338) + %ptr_27 = tt.addptr %ptr_26, %ptr_25 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc338) + %ptr_28 = tt.expand_dims %offs_m_22 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc339) + %ptr_29 = tt.broadcast %ptr_27 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc340) + %ptr_30 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc340) + %ptr_31 = tt.addptr %ptr_29, %ptr_30 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc340) + %q_32 = arith.cmpi slt, %ptr, %cst_13 : tensor<128x1xi32> loc(#loc341) + %q_33 = tt.broadcast %q_32 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc334) + %q_34 = tt.load %ptr_31, %q_33, %q : tensor<128x128x!tt.ptr> loc(#loc334) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc220) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc221) + %kv_start_35 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc222) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc223) + %kv_num_blocks_36 = tt.load %kv_num_blocks : !tt.ptr loc(#loc224) + %block_n_end = arith.muli %kv_num_blocks_36, %c2_i32 : i32 loc(#loc225) + %block_n_end_37 = arith.addi %ks0, %c63_i32 : i32 loc(#loc342) + %block_n_end_38 = arith.divsi %block_n_end_37, %c64_i32 : i32 loc(#loc343) + %block_n_end_39 = arith.maxsi %block_n_end_38, %c1_i32 : i32 loc(#loc227) + %block_n_end_40 = arith.minsi %block_n_end, %block_n_end_39 : i32 loc(#loc228) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc229) + %offs_n_41 = tt.splat %kv_start_35 : i32 -> tensor<64xi32> loc(#loc230) + %offs_n_42 = arith.addi %offs_n_41, %offs_n : tensor<64xi32> loc(#loc230) + %2 = tt.expand_dims %offs_n_42 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc56) + %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_40 step %c1_i32 iter_args(%acc_66 = %acc, %l_i_67 = %cst_14, %m_i = %cst_3, %offs_n_68 = %2, %kv_offset_69 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_35, %kv_offset_69 : i32 loc(#loc345) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc346) + %offs_n_load_70 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc346) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc437) + %ptr_72 = arith.muli %ptr_71, %cst : tensor<64x1xi32> loc(#loc438) + %ptr_73 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc439) + %ptr_74 = tt.addptr %ptr_73, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc439) + %ptr_75 = tt.broadcast %ptr_74 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc440) + %ptr_76 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc440) + %ptr_77 = tt.addptr %ptr_75, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc440) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc441) + %k_78 = arith.cmpi slt, %ptr_71, %k : tensor<64x1xi32> loc(#loc441) + %k_79 = tt.broadcast %k_78 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc442) + %k_80 = tt.load %ptr_77, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc442) + %k_81 = tt.trans %k_80 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc348) + %qk = tt.dot %q_34, %k_81, %cst_10, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc349) + %qk_82 = arith.mulf %qk, %cst_9 : tensor<128x64xf32> loc(#loc350) + %m = arith.remsi %ptr, %cst_13 : tensor<128x1xi32> loc(#loc443) + %n = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc444) + %n_83 = arith.remsi %offs_n_68, %n : tensor<1x64xi32> loc(#loc444) + %post_mod_scores = arith.cmpi slt, %offs_n_68, %n : tensor<1x64xi32> loc(#loc353) + %post_mod_scores_84 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc354) + %post_mod_scores_85 = arith.select %post_mod_scores_84, %qk_82, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc354) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc355) + %tmp4_86 = tt.broadcast %n_83 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc355) + %tmp4_87 = arith.cmpi sge, %tmp4, %tmp4_86 : tensor<128x64xi32> loc(#loc355) + %tmp5 = arith.extsi %n_83 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc356) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc357) + %tmp7_88 = tt.load %tmp7 : !tt.ptr loc(#loc358) + %tmp8 = tt.splat %tmp7_88 : i64 -> tensor<1x64xi64> loc(#loc359) + %tmp8_89 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc359) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc360) + %tmp10 = tt.splat %tmp7_88 : i64 -> tensor<128x1xi64> loc(#loc361) + %tmp10_90 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc361) + %tmp11 = tt.broadcast %tmp8_89 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc362) + %tmp11_91 = tt.broadcast %tmp10_90 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc362) + %tmp11_92 = arith.andi %tmp11, %tmp11_91 : tensor<128x64xi1> loc(#loc362) + %tmp12 = arith.andi %tmp4_87, %tmp11_92 : tensor<128x64xi1> loc(#loc363) + %tmp15 = arith.cmpi sge, %n_83, %cst_2 : tensor<1x64xi32> loc(#loc364) + %tmp16 = arith.remsi %n_83, %cst_2 : tensor<1x64xi32> loc(#loc365) + %tmp18 = arith.cmpi ne, %tmp16, %cst_7 : tensor<1x64xi32> loc(#loc366) + %tmp19 = arith.cmpi slt, %tmp16, %cst_7 : tensor<1x64xi32> loc(#loc367) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc368) + %tmp23 = arith.addi %tmp16, %cst_2 : tensor<1x64xi32> loc(#loc369) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc370) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc371) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc372) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc373) + %tmp28 = arith.subi %tmp4_86, %tmp4 : tensor<128x64xi32> loc(#loc374) + %tmp29 = arith.remsi %tmp28, %cst_1 : tensor<128x64xi32> loc(#loc375) + %tmp30 = arith.cmpi ne, %tmp29, %cst_6 : tensor<128x64xi32> loc(#loc376) + %tmp31 = arith.cmpi slt, %tmp29, %cst_6 : tensor<128x64xi32> loc(#loc377) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc378) + %tmp34 = arith.addi %tmp29, %cst_1 : tensor<128x64xi32> loc(#loc379) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc380) + %tmp36 = arith.cmpi eq, %tmp35, %cst_6 : tensor<128x64xi32> loc(#loc381) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc382) + %tmp37_93 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc382) + %tmp38 = arith.ori %tmp12, %tmp37_93 : tensor<128x64xi1> loc(#loc383) + %mask_mod_output = arith.select %post_mod_scores_84, %tmp38, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc384) + %post_mod_scores_94 = arith.select %mask_mod_output, %post_mod_scores_85, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc385) + %post_mod_scores_95 = arith.mulf %post_mod_scores_94, %cst_4 : tensor<128x64xf32> loc(#loc386) + %m_ij = "tt.reduce"(%post_mod_scores_95) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_128: f32 loc(callsite(#loc1 at #loc387)), %m_ij_129: f32 loc(callsite(#loc1 at #loc387))): + %m_ij_130 = arith.maxnumf %m_ij_128, %m_ij_129 : f32 loc(#loc500) + tt.reduce.return %m_ij_130 : f32 loc(#loc445) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc445) + %m_ij_96 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc388) + %masked_out_rows = arith.cmpf oeq, %m_ij_96, %cst_3 : tensor<128xf32> loc(#loc389) + %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_96 : tensor<128xi1>, tensor<128xf32> loc(#loc390) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc391) + %alpha_97 = math.exp2 %alpha : tensor<128xf32> loc(#loc392) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc393) + %p_98 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc394) + %p_99 = arith.subf %post_mod_scores_95, %p_98 : tensor<128x64xf32> loc(#loc394) + %p_100 = math.exp2 %p_99 : tensor<128x64xf32> loc(#loc395) + %l_i_101 = arith.mulf %l_i_67, %alpha_97 : tensor<128xf32> loc(#loc396) + %l_i_102 = "tt.reduce"(%p_100) <{axis = 1 : i32}> ({ + ^bb0(%l_i_128: f32 loc(callsite(#loc1 at #loc397)), %l_i_129: f32 loc(callsite(#loc1 at #loc397))): + %l_i_130 = arith.addf %l_i_128, %l_i_129 : f32 loc(#loc501) + tt.reduce.return %l_i_130 : f32 loc(#loc447) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc447) + %l_i_103 = arith.addf %l_i_101, %l_i_102 : tensor<128xf32> loc(#loc398) + %acc_104 = tt.expand_dims %alpha_97 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc399) + %acc_105 = tt.broadcast %acc_104 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc400) + %acc_106 = arith.mulf %acc_66, %acc_105 : tensor<128x128xf32> loc(#loc400) + %ptr_107 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc449) + %ptr_108 = tt.addptr %ptr_107, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc449) + %ptr_109 = tt.broadcast %ptr_108 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc450) + %ptr_110 = tt.addptr %ptr_109, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc450) + %v = tt.load %ptr_110, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc451) + %acc_111 = arith.truncf %p_100 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc402) + %acc_112 = tt.dot %acc_111, %v, %acc_106, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc403) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc452) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc453) + %cur_block_113 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc454) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc455) + %next_block_114 = arith.cmpi slt, %next_block, %kv_num_blocks_36 : i32 loc(#loc456) + %next_block_115 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc457) + %next_block_116 = tt.load %next_block_115, %next_block_114 evictionPolicy = evict_last : !tt.ptr loc(#loc458) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc459) + %needs_jump_117 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc460) + %needs_jump_118 = arith.cmpi eq, %needs_jump_117, %c0_i32 : i32 loc(#loc461) + %jump_to_block = arith.subi %next_block_116, %cur_block_113 : i32 loc(#loc462) + %jump_to_block_119 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc463) + %jump_to_block_120 = arith.subi %jump_to_block_119, %c64_i32 : i32 loc(#loc464) + %offset = arith.extui %needs_jump_118 : i1 to i32 loc(#loc465) + %offset_121 = arith.muli %jump_to_block_120, %offset : i32 loc(#loc465) + %offset_122 = arith.subi %c1_i32, %offset : i32 loc(#loc466) + %offset_123 = arith.muli %offset_122, %c64_i32 : i32 loc(#loc467) + %offset_124 = arith.addi %offset_121, %offset_123 : i32 loc(#loc468) + %offs_n_125 = tt.splat %offset_124 : i32 -> tensor<1x64xi32> loc(#loc405) + %offs_n_126 = arith.addi %offs_n_68, %offs_n_125 : tensor<1x64xi32> loc(#loc405) + %kv_offset_127 = arith.addi %kv_offset_69, %offset_124 : i32 loc(#loc406) + scf.yield %acc_112, %l_i_103, %m_ij_96, %offs_n_126, %kv_offset_127 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc312) + } loc(#loc505) + %kv_indices_43 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc313) + %kv_start_44 = tt.load %kv_indices_43 : !tt.ptr loc(#loc314) + %kv_start_45 = arith.muli %kv_start_44, %c128_i32 : i32 loc(#loc315) + %kv_num_blocks_46 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc316) + %kv_num_blocks_47 = tt.load %kv_num_blocks_46 : !tt.ptr loc(#loc317) + %block_n_end_48 = arith.muli %kv_num_blocks_47, %c2_i32 : i32 loc(#loc318) + %block_n_end_49 = arith.minsi %block_n_end_48, %block_n_end_39 : i32 loc(#loc319) + %offs_n_50 = tt.splat %kv_start_45 : i32 -> tensor<64xi32> loc(#loc320) + %offs_n_51 = arith.addi %offs_n_50, %offs_n : tensor<64xi32> loc(#loc320) + %3 = tt.expand_dims %offs_n_51 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc152) + %kv_offset_52:5 = scf.for %start_n = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_66 = %kv_offset#0, %l_i_67 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_68 = %3, %kv_offset_69 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_45, %kv_offset_69 : i32 loc(#loc407) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc408) + %offs_n_load_70 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc408) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc469) + %ptr_72 = arith.muli %ptr_71, %cst : tensor<64x1xi32> loc(#loc470) + %ptr_73 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc471) + %ptr_74 = tt.addptr %ptr_73, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc471) + %ptr_75 = tt.broadcast %ptr_74 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc472) + %ptr_76 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc472) + %ptr_77 = tt.addptr %ptr_75, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc472) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc473) + %k_78 = arith.cmpi slt, %ptr_71, %k : tensor<64x1xi32> loc(#loc473) + %k_79 = tt.broadcast %k_78 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc474) + %k_80 = tt.load %ptr_77, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc474) + %k_81 = tt.trans %k_80 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc410) + %qk = tt.dot %q_34, %k_81, %cst_10, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc411) + %qk_82 = arith.mulf %qk, %cst_9 : tensor<128x64xf32> loc(#loc412) + %post_mod_scores = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc413) + %post_mod_scores_83 = arith.cmpi slt, %offs_n_68, %post_mod_scores : tensor<1x64xi32> loc(#loc413) + %post_mod_scores_84 = tt.broadcast %post_mod_scores_83 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc414) + %post_mod_scores_85 = arith.select %post_mod_scores_84, %qk_82, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc414) + %post_mod_scores_86 = arith.mulf %post_mod_scores_85, %cst_4 : tensor<128x64xf32> loc(#loc415) + %m_ij = "tt.reduce"(%post_mod_scores_86) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_119: f32 loc(callsite(#loc1 at #loc416)), %m_ij_120: f32 loc(callsite(#loc1 at #loc416))): + %m_ij_121 = arith.maxnumf %m_ij_119, %m_ij_120 : f32 loc(#loc502) + tt.reduce.return %m_ij_121 : f32 loc(#loc475) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc475) + %m_ij_87 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc417) + %masked_out_rows = arith.cmpf oeq, %m_ij_87, %cst_3 : tensor<128xf32> loc(#loc418) + %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_87 : tensor<128xi1>, tensor<128xf32> loc(#loc419) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc420) + %alpha_88 = math.exp2 %alpha : tensor<128xf32> loc(#loc421) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc422) + %p_89 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc423) + %p_90 = arith.subf %post_mod_scores_86, %p_89 : tensor<128x64xf32> loc(#loc423) + %p_91 = math.exp2 %p_90 : tensor<128x64xf32> loc(#loc424) + %l_i_92 = arith.mulf %l_i_67, %alpha_88 : tensor<128xf32> loc(#loc425) + %l_i_93 = "tt.reduce"(%p_91) <{axis = 1 : i32}> ({ + ^bb0(%l_i_119: f32 loc(callsite(#loc1 at #loc426)), %l_i_120: f32 loc(callsite(#loc1 at #loc426))): + %l_i_121 = arith.addf %l_i_119, %l_i_120 : f32 loc(#loc503) + tt.reduce.return %l_i_121 : f32 loc(#loc477) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc477) + %l_i_94 = arith.addf %l_i_92, %l_i_93 : tensor<128xf32> loc(#loc427) + %acc_95 = tt.expand_dims %alpha_88 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc428) + %acc_96 = tt.broadcast %acc_95 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc429) + %acc_97 = arith.mulf %acc_66, %acc_96 : tensor<128x128xf32> loc(#loc429) + %ptr_98 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc479) + %ptr_99 = tt.addptr %ptr_98, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc479) + %ptr_100 = tt.broadcast %ptr_99 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc480) + %ptr_101 = tt.addptr %ptr_100, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc480) + %v = tt.load %ptr_101, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc481) + %acc_102 = arith.truncf %p_91 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc431) + %acc_103 = tt.dot %acc_102, %v, %acc_97, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc432) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc482) + %cur_block = tt.addptr %kv_indices_43, %cur_block_idx : !tt.ptr, i32 loc(#loc483) + %cur_block_104 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc484) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc485) + %next_block_105 = arith.cmpi slt, %next_block, %kv_num_blocks_47 : i32 loc(#loc486) + %next_block_106 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc487) + %next_block_107 = tt.load %next_block_106, %next_block_105 evictionPolicy = evict_last : !tt.ptr loc(#loc488) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc489) + %needs_jump_108 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc490) + %needs_jump_109 = arith.cmpi eq, %needs_jump_108, %c0_i32 : i32 loc(#loc491) + %jump_to_block = arith.subi %next_block_107, %cur_block_104 : i32 loc(#loc492) + %jump_to_block_110 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc493) + %jump_to_block_111 = arith.subi %jump_to_block_110, %c64_i32 : i32 loc(#loc494) + %offset = arith.extui %needs_jump_109 : i1 to i32 loc(#loc495) + %offset_112 = arith.muli %jump_to_block_111, %offset : i32 loc(#loc495) + %offset_113 = arith.subi %c1_i32, %offset : i32 loc(#loc496) + %offset_114 = arith.muli %offset_113, %c64_i32 : i32 loc(#loc497) + %offset_115 = arith.addi %offset_112, %offset_114 : i32 loc(#loc498) + %offs_n_116 = tt.splat %offset_115 : i32 -> tensor<1x64xi32> loc(#loc434) + %offs_n_117 = arith.addi %offs_n_68, %offs_n_116 : tensor<1x64xi32> loc(#loc434) + %kv_offset_118 = arith.addi %kv_offset_69, %offset_115 : i32 loc(#loc435) + scf.yield %acc_103, %l_i_94, %m_ij_87, %offs_n_117, %kv_offset_118 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc322) + } loc(#loc506) + %l_i_53 = arith.cmpf oeq, %kv_offset_52#1, %cst_14 : tensor<128xf32> loc(#loc323) + %l_i_54 = arith.select %l_i_53, %l_i, %kv_offset_52#1 : tensor<128xi1>, tensor<128xf32> loc(#loc189) + %acc_55 = tt.expand_dims %l_i_54 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc324) + %acc_56 = tt.broadcast %acc_55 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc325) + %acc_57 = arith.divf %kv_offset_52#0, %acc_56 : tensor<128x128xf32> loc(#loc325) + %mask_58 = arith.cmpi slt, %ptr_28, %mask : tensor<1x128xi32> loc(#loc188) + %mask_59 = tt.broadcast %mask_58 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc326) + %mask_60 = arith.andi %q_33, %mask_59 : tensor<128x128xi1> loc(#loc326) + %4 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc158) + %5 = arith.addi %ptr_28, %4 : tensor<1x128xi32> loc(#loc158) + %6 = tt.broadcast %5 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc159) + %7 = tt.broadcast %ptr_25 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc159) + %8 = arith.addi %6, %7 : tensor<128x128xi32> loc(#loc159) + %9 = tt.splat %q_offset : i32 -> tensor<128x128xi32> loc(#loc160) + %10 = arith.addi %8, %9 : tensor<128x128xi32> loc(#loc160) + %11 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc161) + %12 = tt.addptr %11, %10 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc161) + %13 = arith.truncf %acc_57 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc162) + tt.store %12, %13, %mask_60 : tensor<128x128x!tt.ptr> loc(#loc162) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc327) + %off_hz_61 = arith.addi %off_hz, %off_hq : i32 loc(#loc328) + %l_ptrs = arith.muli %off_hz_61, %Q_LEN : i32 loc(#loc329) + %l_ptrs_62 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc330) + %l_ptrs_63 = tt.splat %l_ptrs_62 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc331) + %l_ptrs_64 = tt.addptr %l_ptrs_63, %offs_m_24 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc331) + %lse = math.log2 %l_i_54 : tensor<128xf32> loc(#loc332) + %lse_65 = arith.addf %kv_offset_52#2, %lse : tensor<128xf32> loc(#loc333) + %14 = arith.cmpi slt, %offs_m_24, %cst_11 : tensor<128xi32> loc(#loc7) + tt.store %l_ptrs_64, %lse_65, %14 : tensor<128x!tt.ptr> loc(#loc170) + tt.return loc(#loc171) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:23) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":146:101) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":136:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:48) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:38) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":91:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":90:9) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":86:63) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":97:28) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":98:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":99:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":103:23) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":104:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":107:36) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:25) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:47) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":108:37) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":111:12) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":112:12) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":113:12) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":130:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:51) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":142:74) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:46) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:97) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":143:64) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:46) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":144:33) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:27) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:38) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:20) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:56) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":284:49) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":292:52) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":151:26) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":152:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":153:28) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:45) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:92) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:102) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":154:65) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:37) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":159:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":167:48) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":502:40) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":342:32) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":346:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":347:107) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":349:17) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":351:19) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":353:14) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":257:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":358:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":359:36) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:44) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":367:69) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":373:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":374:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:33) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":376:23) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":377:22) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":378:23) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":379:23) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":380:23) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":381:23) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":384:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":385:24) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":387:25) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":388:92) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":391:24) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":392:24) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":393:39) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":394:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":395:24) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":396:24) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":397:23) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":398:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":399:25) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":400:92) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":402:24) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":403:24) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":404:39) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":405:25) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":406:24) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":407:24) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":412:73) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":414:69) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":417:27) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":421:27) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":423:35) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":424:51) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:31) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":428:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:51) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:39) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":429:21) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:16) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":434:24) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:22) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":436:16) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":439:107) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:22) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":440:44) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":247:33) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":545:63) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:38) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":248:24) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:109) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:113) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:55) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":249:25) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:30) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:35) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":250:60) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:34) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:48) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":251:63) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:29) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:47) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:61) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":252:42) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":548:26) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":549:21) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":549:8) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":181:35) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:27) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":182:41) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:51) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":183:32) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:49) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":184:69) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":186:28) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":193:52) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":206:26) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:20) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":208:16) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":214:30) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:49) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:62) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:75) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:25) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":218:109) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:26) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":221:31) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:32) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:23) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":222:40) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:33) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":223:20) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":227:29) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/s7/cs7qhjlt3qagwyyic2oiyost4mzjtbyquc6muggyzudwzg2u4vbt.py":229:4) +#loc185 = loc(callsite(#loc1 at #loc2)) +#loc186 = loc("q"(#loc4)) +#loc187 = loc("acc"(#loc6)) +#loc188 = loc("mask"(#loc8)) +#loc189 = loc("l_i"(#loc9)) +#loc190 = loc("Q_LEN"(#loc10)) +#loc191 = loc("HQ"(#loc11)) +#loc192 = loc("q_start"(#loc14)) +#loc193 = loc("off_zq"(#loc15)) +#loc194 = loc("off_hq"(#loc16)) +#loc195 = loc("off_zkv"(#loc17)) +#loc196 = loc("off_hkv"(#loc18)) +#loc197 = loc("q_offset"(#loc19)) +#loc198 = loc("q_offset"(#loc20)) +#loc199 = loc("q_offset"(#loc21)) +#loc200 = loc("k_offset"(#loc22)) +#loc201 = loc("k_offset"(#loc23)) +#loc202 = loc("k_offset"(#loc24)) +#loc203 = loc("Q"(#loc25)) +#loc204 = loc("K"(#loc26)) +#loc205 = loc("V"(#loc27)) +#loc206 = loc("stride_kv_idx_h"(#loc28)) +#loc207 = loc("sparse_kv_num_blks_offset"(#loc29)) +#loc208 = loc("sparse_kv_num_blks_offset"(#loc30)) +#loc209 = loc("sparse_kv_idx_offset"(#loc31)) +#loc210 = loc("sparse_kv_idx_offset"(#loc32)) +#loc211 = loc("sparse_kv_idx_offset"(#loc33)) +#loc212 = loc("offs_m"(#loc34)) +#loc213 = loc("offs_m"(#loc35)) +#loc214 = loc("offs_m"(#loc36)) +#loc215 = loc("ptr"(#loc37)) +#loc216 = loc("ptr"(#loc38)) +#loc217 = loc("ptr"(#loc39)) +#loc218 = loc("ptr"(#loc40)) +#loc219 = loc("ptr"(#loc41)) +#loc220 = loc("kv_indices"(#loc43)) +#loc221 = loc("kv_start"(#loc44)) +#loc222 = loc("kv_start"(#loc45)) +#loc223 = loc("kv_num_blocks"(#loc46)) +#loc224 = loc("kv_num_blocks"(#loc47)) +#loc225 = loc("block_n_end"(#loc48)) +#loc226 = loc("block_n_end"(#loc50)) +#loc227 = loc("block_n_end"(#loc52)) +#loc228 = loc("block_n_end"(#loc53)) +#loc229 = loc("offs_n"(#loc54)) +#loc230 = loc("offs_n"(#loc55)) +#loc231 = loc("acc"(#loc57)) +#loc232 = loc("kv_base_offset"(#loc58)) +#loc234 = loc("offs_n_load"(#loc60)) +#loc235 = loc("k"(#loc61)) +#loc236 = loc("k"(#loc62)) +#loc237 = loc("qk"(#loc63)) +#loc238 = loc("qk"(#loc64)) +#loc239 = loc("m"(#loc66)) +#loc240 = loc("n"(#loc67)) +#loc241 = loc("post_mod_scores"(#loc68)) +#loc242 = loc("post_mod_scores"(#loc69)) +#loc243 = loc("tmp4"(#loc70)) +#loc244 = loc("tmp5"(#loc71)) +#loc245 = loc("tmp7"(#loc72)) +#loc246 = loc("tmp7"(#loc73)) +#loc247 = loc("tmp8"(#loc74)) +#loc248 = loc("tmp9"(#loc75)) +#loc249 = loc("tmp10"(#loc76)) +#loc250 = loc("tmp11"(#loc77)) +#loc251 = loc("tmp12"(#loc78)) +#loc252 = loc("tmp15"(#loc79)) +#loc253 = loc("tmp16"(#loc80)) +#loc254 = loc("tmp18"(#loc81)) +#loc255 = loc("tmp19"(#loc82)) +#loc256 = loc("tmp22"(#loc83)) +#loc257 = loc("tmp23"(#loc84)) +#loc258 = loc("tmp24"(#loc85)) +#loc259 = loc("tmp25"(#loc86)) +#loc260 = loc("tmp26"(#loc87)) +#loc261 = loc("tmp27"(#loc88)) +#loc262 = loc("tmp28"(#loc89)) +#loc263 = loc("tmp29"(#loc90)) +#loc264 = loc("tmp30"(#loc91)) +#loc265 = loc("tmp31"(#loc92)) +#loc266 = loc("tmp33"(#loc93)) +#loc267 = loc("tmp34"(#loc94)) +#loc268 = loc("tmp35"(#loc95)) +#loc269 = loc("tmp36"(#loc96)) +#loc270 = loc("tmp37"(#loc97)) +#loc271 = loc("tmp38"(#loc98)) +#loc272 = loc("mask_mod_output"(#loc99)) +#loc273 = loc("post_mod_scores"(#loc100)) +#loc274 = loc("post_mod_scores"(#loc101)) +#loc276 = loc("m_ij"(#loc105)) +#loc277 = loc("masked_out_rows"(#loc106)) +#loc278 = loc("m_ij_masked"(#loc107)) +#loc279 = loc("alpha"(#loc108)) +#loc280 = loc("alpha"(#loc109)) +#loc281 = loc("p"(#loc110)) +#loc282 = loc("p"(#loc111)) +#loc283 = loc("p"(#loc112)) +#loc284 = loc("l_i"(#loc113)) +#loc286 = loc("l_i"(#loc117)) +#loc287 = loc("acc"(#loc118)) +#loc288 = loc("acc"(#loc119)) +#loc289 = loc("v"(#loc120)) +#loc290 = loc("acc"(#loc121)) +#loc291 = loc("acc"(#loc122)) +#loc292 = loc("cur_block_idx"(#loc123)) +#loc293 = loc("offset"(#loc124)) +#loc294 = loc("cur_block"(#loc125)) +#loc295 = loc("cur_block"(#loc126)) +#loc296 = loc("next_block"(#loc127)) +#loc297 = loc("next_block"(#loc128)) +#loc298 = loc("next_block"(#loc129)) +#loc299 = loc("next_block"(#loc130)) +#loc300 = loc("needs_jump"(#loc131)) +#loc301 = loc("needs_jump"(#loc132)) +#loc302 = loc("needs_jump"(#loc133)) +#loc303 = loc("jump_to_block"(#loc134)) +#loc304 = loc("jump_to_block"(#loc135)) +#loc305 = loc("jump_to_block"(#loc136)) +#loc306 = loc("offset"(#loc137)) +#loc307 = loc("offset"(#loc138)) +#loc308 = loc("offset"(#loc139)) +#loc309 = loc("offset"(#loc140)) +#loc310 = loc("offs_n"(#loc141)) +#loc311 = loc("kv_offset"(#loc142)) +#loc312 = loc(callsite(#loc143 at #loc2)) +#loc313 = loc("kv_indices"(#loc144)) +#loc314 = loc("kv_start"(#loc145)) +#loc315 = loc("kv_start"(#loc146)) +#loc316 = loc("kv_num_blocks"(#loc147)) +#loc317 = loc("kv_num_blocks"(#loc148)) +#loc318 = loc("block_n_end"(#loc149)) +#loc319 = loc("block_n_end"(#loc150)) +#loc320 = loc("offs_n"(#loc151)) +#loc322 = loc(callsite(#loc143 at #loc153)) +#loc323 = loc("l_i"(#loc154)) +#loc324 = loc("acc"(#loc155)) +#loc325 = loc("acc"(#loc156)) +#loc326 = loc("mask"(#loc157)) +#loc327 = loc("off_hz"(#loc163)) +#loc328 = loc("off_hz"(#loc164)) +#loc329 = loc("l_ptrs"(#loc165)) +#loc330 = loc("l_ptrs"(#loc166)) +#loc331 = loc("l_ptrs"(#loc167)) +#loc332 = loc("lse"(#loc168)) +#loc333 = loc("lse"(#loc169)) +#loc334 = loc(callsite(#loc3 at #loc186)) +#loc335 = loc(callsite(#loc5 at #loc187)) +#loc336 = loc(callsite(#loc215 at #loc186)) +#loc337 = loc(callsite(#loc216 at #loc186)) +#loc338 = loc(callsite(#loc217 at #loc186)) +#loc339 = loc(callsite(#loc218 at #loc186)) +#loc340 = loc(callsite(#loc219 at #loc186)) +#loc341 = loc(callsite(#loc42 at #loc186)) +#loc342 = loc(callsite(#loc49 at #loc226)) +#loc343 = loc(callsite(#loc51 at #loc226)) +#loc344 = loc("l_i"(#loc231)) +#loc345 = loc(callsite(#loc232 at #loc233)) +#loc346 = loc(callsite(#loc234 at #loc233)) +#loc347 = loc(callsite(#loc235 at #loc233)) +#loc348 = loc(callsite(#loc236 at #loc233)) +#loc349 = loc(callsite(#loc237 at #loc233)) +#loc350 = loc(callsite(#loc238 at #loc233)) +#loc351 = loc(callsite(#loc239 at #loc233)) +#loc352 = loc(callsite(#loc240 at #loc233)) +#loc353 = loc(callsite(#loc241 at #loc233)) +#loc354 = loc(callsite(#loc242 at #loc233)) +#loc355 = loc(callsite(#loc243 at #loc233)) +#loc356 = loc(callsite(#loc244 at #loc233)) +#loc357 = loc(callsite(#loc245 at #loc233)) +#loc358 = loc(callsite(#loc246 at #loc233)) +#loc359 = loc(callsite(#loc247 at #loc233)) +#loc360 = loc(callsite(#loc248 at #loc233)) +#loc361 = loc(callsite(#loc249 at #loc233)) +#loc362 = loc(callsite(#loc250 at #loc233)) +#loc363 = loc(callsite(#loc251 at #loc233)) +#loc364 = loc(callsite(#loc252 at #loc233)) +#loc365 = loc(callsite(#loc253 at #loc233)) +#loc366 = loc(callsite(#loc254 at #loc233)) +#loc367 = loc(callsite(#loc255 at #loc233)) +#loc368 = loc(callsite(#loc256 at #loc233)) +#loc369 = loc(callsite(#loc257 at #loc233)) +#loc370 = loc(callsite(#loc258 at #loc233)) +#loc371 = loc(callsite(#loc259 at #loc233)) +#loc372 = loc(callsite(#loc260 at #loc233)) +#loc373 = loc(callsite(#loc261 at #loc233)) +#loc374 = loc(callsite(#loc262 at #loc233)) +#loc375 = loc(callsite(#loc263 at #loc233)) +#loc376 = loc(callsite(#loc264 at #loc233)) +#loc377 = loc(callsite(#loc265 at #loc233)) +#loc378 = loc(callsite(#loc266 at #loc233)) +#loc379 = loc(callsite(#loc267 at #loc233)) +#loc380 = loc(callsite(#loc268 at #loc233)) +#loc381 = loc(callsite(#loc269 at #loc233)) +#loc382 = loc(callsite(#loc270 at #loc233)) +#loc383 = loc(callsite(#loc271 at #loc233)) +#loc384 = loc(callsite(#loc272 at #loc233)) +#loc385 = loc(callsite(#loc273 at #loc233)) +#loc386 = loc(callsite(#loc274 at #loc233)) +#loc388 = loc(callsite(#loc276 at #loc233)) +#loc389 = loc(callsite(#loc277 at #loc233)) +#loc390 = loc(callsite(#loc278 at #loc233)) +#loc391 = loc(callsite(#loc279 at #loc233)) +#loc392 = loc(callsite(#loc280 at #loc233)) +#loc393 = loc(callsite(#loc281 at #loc233)) +#loc394 = loc(callsite(#loc282 at #loc233)) +#loc395 = loc(callsite(#loc283 at #loc233)) +#loc396 = loc(callsite(#loc284 at #loc233)) +#loc398 = loc(callsite(#loc286 at #loc233)) +#loc399 = loc(callsite(#loc287 at #loc233)) +#loc400 = loc(callsite(#loc288 at #loc233)) +#loc401 = loc(callsite(#loc289 at #loc233)) +#loc402 = loc(callsite(#loc290 at #loc233)) +#loc403 = loc(callsite(#loc291 at #loc233)) +#loc404 = loc(callsite(#loc293 at #loc2)) +#loc405 = loc(callsite(#loc310 at #loc2)) +#loc406 = loc(callsite(#loc311 at #loc2)) +#loc407 = loc(callsite(#loc232 at #loc321)) +#loc408 = loc(callsite(#loc234 at #loc321)) +#loc409 = loc(callsite(#loc235 at #loc321)) +#loc410 = loc(callsite(#loc236 at #loc321)) +#loc411 = loc(callsite(#loc237 at #loc321)) +#loc412 = loc(callsite(#loc238 at #loc321)) +#loc413 = loc(callsite(#loc241 at #loc321)) +#loc414 = loc(callsite(#loc242 at #loc321)) +#loc415 = loc(callsite(#loc274 at #loc321)) +#loc417 = loc(callsite(#loc276 at #loc321)) +#loc418 = loc(callsite(#loc277 at #loc321)) +#loc419 = loc(callsite(#loc278 at #loc321)) +#loc420 = loc(callsite(#loc279 at #loc321)) +#loc421 = loc(callsite(#loc280 at #loc321)) +#loc422 = loc(callsite(#loc281 at #loc321)) +#loc423 = loc(callsite(#loc282 at #loc321)) +#loc424 = loc(callsite(#loc283 at #loc321)) +#loc425 = loc(callsite(#loc284 at #loc321)) +#loc427 = loc(callsite(#loc286 at #loc321)) +#loc428 = loc(callsite(#loc287 at #loc321)) +#loc429 = loc(callsite(#loc288 at #loc321)) +#loc430 = loc(callsite(#loc289 at #loc321)) +#loc431 = loc(callsite(#loc290 at #loc321)) +#loc432 = loc(callsite(#loc291 at #loc321)) +#loc433 = loc(callsite(#loc293 at #loc153)) +#loc434 = loc(callsite(#loc310 at #loc153)) +#loc435 = loc(callsite(#loc311 at #loc153)) +#loc436 = loc("m_i"(#loc344)) +#loc437 = loc(callsite(#loc215 at #loc347)) +#loc438 = loc(callsite(#loc216 at #loc347)) +#loc439 = loc(callsite(#loc217 at #loc347)) +#loc440 = loc(callsite(#loc219 at #loc347)) +#loc441 = loc(callsite(#loc42 at #loc347)) +#loc442 = loc(callsite(#loc3 at #loc347)) +#loc443 = loc(callsite(#loc65 at #loc351)) +#loc444 = loc(callsite(#loc65 at #loc352)) +#loc445 = loc(callsite(#loc102 at #loc387)) +#loc447 = loc(callsite(#loc114 at #loc397)) +#loc449 = loc(callsite(#loc217 at #loc401)) +#loc450 = loc(callsite(#loc219 at #loc401)) +#loc451 = loc(callsite(#loc3 at #loc401)) +#loc452 = loc(callsite(#loc292 at #loc404)) +#loc453 = loc(callsite(#loc294 at #loc404)) +#loc454 = loc(callsite(#loc295 at #loc404)) +#loc455 = loc(callsite(#loc296 at #loc404)) +#loc456 = loc(callsite(#loc297 at #loc404)) +#loc457 = loc(callsite(#loc298 at #loc404)) +#loc458 = loc(callsite(#loc299 at #loc404)) +#loc459 = loc(callsite(#loc300 at #loc404)) +#loc460 = loc(callsite(#loc301 at #loc404)) +#loc461 = loc(callsite(#loc302 at #loc404)) +#loc462 = loc(callsite(#loc303 at #loc404)) +#loc463 = loc(callsite(#loc304 at #loc404)) +#loc464 = loc(callsite(#loc305 at #loc404)) +#loc465 = loc(callsite(#loc306 at #loc404)) +#loc466 = loc(callsite(#loc307 at #loc404)) +#loc467 = loc(callsite(#loc308 at #loc404)) +#loc468 = loc(callsite(#loc309 at #loc404)) +#loc469 = loc(callsite(#loc215 at #loc409)) +#loc470 = loc(callsite(#loc216 at #loc409)) +#loc471 = loc(callsite(#loc217 at #loc409)) +#loc472 = loc(callsite(#loc219 at #loc409)) +#loc473 = loc(callsite(#loc42 at #loc409)) +#loc474 = loc(callsite(#loc3 at #loc409)) +#loc475 = loc(callsite(#loc102 at #loc416)) +#loc477 = loc(callsite(#loc114 at #loc426)) +#loc479 = loc(callsite(#loc217 at #loc430)) +#loc480 = loc(callsite(#loc219 at #loc430)) +#loc481 = loc(callsite(#loc3 at #loc430)) +#loc482 = loc(callsite(#loc292 at #loc433)) +#loc483 = loc(callsite(#loc294 at #loc433)) +#loc484 = loc(callsite(#loc295 at #loc433)) +#loc485 = loc(callsite(#loc296 at #loc433)) +#loc486 = loc(callsite(#loc297 at #loc433)) +#loc487 = loc(callsite(#loc298 at #loc433)) +#loc488 = loc(callsite(#loc299 at #loc433)) +#loc489 = loc(callsite(#loc300 at #loc433)) +#loc490 = loc(callsite(#loc301 at #loc433)) +#loc491 = loc(callsite(#loc302 at #loc433)) +#loc492 = loc(callsite(#loc303 at #loc433)) +#loc493 = loc(callsite(#loc304 at #loc433)) +#loc494 = loc(callsite(#loc305 at #loc433)) +#loc495 = loc(callsite(#loc306 at #loc433)) +#loc496 = loc(callsite(#loc307 at #loc433)) +#loc497 = loc(callsite(#loc308 at #loc433)) +#loc498 = loc(callsite(#loc309 at #loc433)) +#loc499 = loc("offs_n"(#loc436)) +#loc500 = loc(callsite(#loc104 at #loc445)) +#loc501 = loc(callsite(#loc116 at #loc447)) +#loc502 = loc(callsite(#loc104 at #loc475)) +#loc503 = loc(callsite(#loc116 at #loc477)) +#loc504 = loc("kv_offset"(#loc499)) +#loc505 = loc(callsite(#loc504 at #loc2)) +#loc506 = loc(callsite(#loc504 at #loc153)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8eb0d49f88b682c65f0af3605dffad874cdb1760 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/__grp__triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin", "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..a18962661c0023c8621c0f38e901bb13a3634265 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ab03638d07a952ff8852b9b1be399d2ac1547539 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.json @@ -0,0 +1 @@ +{"hash": "96e87f726ec4dbcfed4004e2c32b3d92e1cf13384d5f1f4d0bfc1acc083b6498", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..4b249f51e9be8bf4a693cac3c9b8eb3708125823 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.llir @@ -0,0 +1,162 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py\00" +@assertMessage_0 = internal constant [90 x i8] c"index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i64 %5, i64 %6, i64 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 7, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 127, !dbg !12 + %16 = or disjoint i32 %13, %15, !dbg !13 + %17 = icmp slt i32 %16, %8, !dbg !14 + %18 = sext i32 %16 to i64, !dbg !15 + %.frozen = freeze i64 %4, !dbg !16 + %19 = sdiv i64 %18, %.frozen, !dbg !16 + %20 = mul i64 %19, %.frozen, !dbg !15 + %.decomposed = sub i64 %18, %20, !dbg !15 + %21 = srem i64 %19, %5, !dbg !17 + %22 = sdiv i64 %18, %6, !dbg !18 + %23 = icmp slt i64 %4, 2, !dbg !19 + %24 = icmp sgt i64 %4, 1, !dbg !20 + %25 = select i1 %24, i64 %4, i64 0, !dbg !21 + %26 = zext i1 %23 to i64, !dbg !22 + %27 = add i64 %25, %26, !dbg !23 + %28 = icmp slt i64 %5, 2, !dbg !24 + %29 = icmp sgt i64 %5, 1, !dbg !25 + %30 = select i1 %29, i64 %5, i64 0, !dbg !26 + %31 = zext i1 %28 to i64, !dbg !22 + %32 = add i64 %30, %31, !dbg !27 + %33 = mul i64 %22, %32, !dbg !28 + %reass.add = add i64 %33, %21, !dbg !29 + %reass.mul = mul i64 %reass.add, %27, !dbg !29 + %34 = add i64 %reass.mul, %.decomposed, !dbg !29 + %35 = getelementptr i64, ptr addrspace(1) %0, i64 %34, !dbg !30 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !31 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %35, i64 %36, i1 %17) #4, !dbg !31 + %38 = getelementptr i32, ptr addrspace(1) %1, i64 %19, !dbg !32 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !33 + %40 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %38, i64 %39, i1 %17) #4, !dbg !33 + %41 = sext i32 %40 to i64, !dbg !34 + %42 = icmp slt i64 %.decomposed, %41, !dbg !34 + %sext = shl i64 %37, 32, !dbg !35 + %43 = ashr exact i64 %sext, 32, !dbg !35 + %44 = select i1 %42, i64 %43, i64 %4, !dbg !35 + %45 = add i64 %4, 1, !dbg !36 + %46 = icmp slt i64 %44, 0, !dbg !37 + %47 = select i1 %46, i64 %45, i64 0, !dbg !38 + %48 = add i64 %47, %44, !dbg !38 + %49 = icmp slt i64 %48, 0, !dbg !39 + %50 = add i64 %7, 127, !dbg !40 + %51 = sdiv i64 %50, 128, !dbg !41 + %52 = and i64 %50, 127, !dbg !45 + %.not = icmp ne i64 %52, 0, !dbg !45 + %53 = icmp slt i64 %50, 0, !dbg !46 + %narrow = and i1 %53, %.not, !dbg !47 + %54 = sext i1 %narrow to i64, !dbg !47 + %55 = add nsw i64 %51, %54, !dbg !47 + %56 = icmp sgt i64 %48, %55, !dbg !48 + %.not5 = or i1 %49, %56, !dbg !49 + %.not2 = and i1 %17, %.not5, !dbg !50 + br i1 %.not2, label %57, label %58, !dbg !50 + +57: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 37, ptr nonnull @assertFunc_0, i64 1), !dbg !50 + unreachable, !dbg !50 + +58: ; preds = %11 + %59 = trunc i64 %37 to i32, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %60 = getelementptr i32, ptr addrspace(1) %2, i64 %34, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %59, ptr addrspace(1) %60, i1 %17) #4, !dbg !53 + %61 = mul i64 %19, %4, !dbg !54 + %62 = getelementptr i32, ptr addrspace(1) %3, i64 %48, !dbg !55 + %63 = getelementptr i32, ptr addrspace(1) %62, i64 %19, !dbg !55 + %64 = getelementptr i32, ptr addrspace(1) %63, i64 %61, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %64, i1 %17) #4, !dbg !56 + ret void, !dbg !57 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3", linkageName: "triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 22, column: 19, scope: !9) +!16 = !DILocation(line: 23, column: 21, scope: !9) +!17 = !DILocation(line: 23, column: 28, scope: !9) +!18 = !DILocation(line: 24, column: 19, scope: !9) +!19 = !DILocation(line: 26, column: 54, scope: !9) +!20 = !DILocation(line: 26, column: 80, scope: !9) +!21 = !DILocation(line: 26, column: 71, scope: !9) +!22 = !DILocation(line: 26, scope: !9) +!23 = !DILocation(line: 26, column: 62, scope: !9) +!24 = !DILocation(line: 26, column: 153, scope: !9) +!25 = !DILocation(line: 26, column: 179, scope: !9) +!26 = !DILocation(line: 26, column: 170, scope: !9) +!27 = !DILocation(line: 26, column: 161, scope: !9) +!28 = !DILocation(line: 26, column: 138, scope: !9) +!29 = !DILocation(line: 26, column: 87, scope: !9) +!30 = !DILocation(line: 26, column: 30, scope: !9) +!31 = !DILocation(line: 26, column: 186, scope: !9) +!32 = !DILocation(line: 27, column: 30, scope: !9) +!33 = !DILocation(line: 27, column: 35, scope: !9) +!34 = !DILocation(line: 30, column: 18, scope: !9) +!35 = !DILocation(line: 32, column: 32, scope: !9) +!36 = !DILocation(line: 33, column: 15, scope: !9) +!37 = !DILocation(line: 35, column: 18, scope: !9) +!38 = !DILocation(line: 36, column: 33, scope: !9) +!39 = !DILocation(line: 37, column: 28, scope: !9) +!40 = !DILocation(line: 37, column: 90, scope: !9) +!41 = !DILocation(line: 72, column: 16, scope: !42, inlinedAt: !44) +!42 = distinct !DILexicalBlockFile(scope: !9, file: !43, discriminator: 0) +!43 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!44 = !DILocation(line: 37, column: 96, scope: !9) +!45 = !DILocation(line: 74, column: 34, scope: !42, inlinedAt: !44) +!46 = !DILocation(line: 75, column: 25, scope: !42, inlinedAt: !44) +!47 = !DILocation(line: 75, column: 47, scope: !42, inlinedAt: !44) +!48 = !DILocation(line: 37, column: 46, scope: !9) +!49 = !DILocation(line: 37, column: 108, scope: !9) +!50 = !DILocation(line: 37, column: 116, scope: !9) +!51 = !DILocation(line: 28, column: 19, scope: !9) +!52 = !DILocation(line: 39, column: 25, scope: !9) +!53 = !DILocation(line: 39, column: 187, scope: !9) +!54 = !DILocation(line: 40, column: 42, scope: !9) +!55 = !DILocation(line: 40, column: 25, scope: !9) +!56 = !DILocation(line: 40, column: 54, scope: !9) +!57 = !DILocation(line: 40, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bf160c94361501aebcd2f2da29efe8ebf72c8150 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ptx @@ -0,0 +1,557 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 // -- Begin function triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 108, 105, 47, 99, 108, 105, 109, 122, 103, 122, 105, 112, 116, 115, 106, 108, 122, 52, 116, 115, 117, 120, 97, 99, 118, 116, 54, 109, 117, 121, 111, 101, 101, 109, 111, 55, 108, 50, 98, 120, 99, 121, 111, 116, 102, 106, 121, 51, 118, 105, 106, 54, 104, 98, 116, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[90] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 48, 32, 60, 32, 49, 32, 43, 32, 40, 116, 114, 105, 116, 111, 110, 95, 104, 101, 108, 112, 101, 114, 115, 46, 100, 105, 118, 95, 102, 108, 111, 111, 114, 95, 105, 110, 116, 101, 103, 101, 114, 40, 49, 50, 55, 32, 43, 32, 107, 115, 51, 44, 32, 32, 49, 50, 56, 41, 41}; + // @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3 +.visible .entry triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3( + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_3, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_4, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_5, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_6, + .param .u64 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_7, + .param .u32 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_8, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_10 +) +.reqntid 128 +{ + .reg .pred %p<21>; + .reg .b32 %r<23>; + .reg .b64 %rd<80>; + .loc 1 18 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:18:0 +$L__func_begin0: + .loc 1 18 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:18:0 + +// %bb.0: + ld.param.b64 %rd21, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_5]; + ld.param.b64 %rd20, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_4]; +$L__tmp0: + .loc 1 19 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:19:33 + shl.b32 %r3, %r2, 7; + .loc 1 20 36 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:20:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 20 23 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:20:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:22:19 + cvt.s64.s32 %rd1, %r6; + .loc 1 23 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:21 + or.b64 %rd25, %rd1, %rd20; + and.b64 %rd26, %rd25, -4294967296; + setp.ne.b64 %p1, %rd26, 0; + cvt.u32.u64 %r22, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd77, %rd1, %rd20; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r7, %rd20; + div.u32 %r9, %r22, %r7; + cvt.u64.u32 %rd77, %r9; +$L__BB0_3: + .loc 1 0 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:21 + ld.param.b64 %rd22, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_6]; + .loc 1 22 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:22:19 + mul.lo.s64 %rd27, %rd77, %rd20; + .loc 1 23 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:23:28 + or.b64 %rd28, %rd77, %rd21; + and.b64 %rd29, %rd28, -4294967296; + setp.ne.b64 %p2, %rd29, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + rem.s64 %rd78, %rd77, %rd21; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r10, %rd21; + cvt.u32.u64 %r11, %rd77; + rem.u32 %r12, %r11, %r10; + cvt.u64.u32 %rd78, %r12; +$L__BB0_6: + .loc 1 0 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:28 + ld.param.b32 %r1, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_8]; + ld.param.b64 %rd23, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_7]; + ld.param.b64 %rd17, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_1]; + ld.param.b64 %rd16, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_0]; + sub.s64 %rd6, %rd1, %rd27; + .loc 1 24 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:24:19 + or.b64 %rd30, %rd1, %rd22; + and.b64 %rd31, %rd30, -4294967296; + setp.ne.b64 %p3, %rd31, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + div.s64 %rd79, %rd1, %rd22; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r13, %rd22; + div.u32 %r15, %r22, %r13; + cvt.u64.u32 %rd79, %r15; +$L__BB0_9: + .loc 1 21 21 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:21:21 + setp.lt.s32 %p5, %r22, %r1; + .loc 1 26 54 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:54 + setp.lt.s64 %p6, %rd20, 2; + .loc 1 26 80 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:80 + setp.gt.s64 %p7, %rd20, 1; + .loc 1 26 71 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:71 + selp.b64 %rd39, %rd20, 0, %p7; + .loc 1 26 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26 + selp.b64 %rd40, 1, 0, %p6; + .loc 1 26 62 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:62 + add.s64 %rd41, %rd39, %rd40; + .loc 1 26 153 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:153 + setp.lt.s64 %p8, %rd21, 2; + .loc 1 26 179 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:179 + setp.gt.s64 %p9, %rd21, 1; + .loc 1 26 170 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:170 + selp.b64 %rd42, %rd21, 0, %p9; + .loc 1 26 0 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26 + selp.b64 %rd43, 1, 0, %p8; + .loc 1 26 161 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:161 + add.s64 %rd44, %rd42, %rd43; + .loc 1 26 87 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:87 + mad.lo.s64 %rd45, %rd79, %rd44, %rd78; + mad.lo.s64 %rd13, %rd45, %rd41, %rd6; + .loc 1 26 30 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:30 + shl.b64 %rd46, %rd13, 3; + add.s64 %rd34, %rd16, %rd46; + .loc 1 26 186 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:26:186 + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd33 }, [ %rd34 + 0 ], %rd32; + // end inline asm + .loc 1 27 30 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:27:30 + shl.b64 %rd47, %rd77, 2; + add.s64 %rd37, %rd17, %rd47; + .loc 1 27 35 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:27:35 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r16, 0x0; + @%p5 ld.global.L1::evict_last.L2::cache_hint.b32 { %r16 }, [ %rd37 + 0 ], %rd36; + // end inline asm + .loc 1 30 18 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:30:18 + cvt.s64.s32 %rd48, %r16; + setp.lt.s64 %p10, %rd6, %rd48; + .loc 1 32 32 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:32:32 + cvt.s64.s32 %rd49, %rd33; + selp.b64 %rd50, %rd49, %rd20, %p10; + .loc 1 33 15 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:33:15 + add.s64 %rd51, %rd20, 1; + .loc 1 36 33 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:36:33 + shr.s64 %rd52, %rd50, 63; + and.b64 %rd53, %rd52, %rd51; + add.s64 %rd15, %rd53, %rd50; + .loc 1 37 28 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:28 + setp.lt.s64 %p11, %rd15, 0; + .loc 1 37 90 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:90 + add.s64 %rd54, %rd23, 127; +$L__tmp1: + .loc 2 72 16 // triton_helpers.py:72:16 @[ climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:96 ] + shr.s64 %rd55, %rd54, 63; + shr.u64 %rd56, %rd55, 57; + add.s64 %rd57, %rd54, %rd56; + shr.s64 %rd58, %rd57, 7; + .loc 2 74 34 // triton_helpers.py:74:34 @[ climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:96 ] + and.b64 %rd59, %rd54, 127; + setp.ne.b64 %p12, %rd59, 0; + .loc 2 75 25 // triton_helpers.py:75:25 @[ climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:96 ] + setp.lt.s64 %p13, %rd54, 0; + .loc 2 75 47 // triton_helpers.py:75:47 @[ climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:96 ] + and.pred %p14, %p13, %p12; + selp.b64 %rd60, -1, 0, %p14; + add.s64 %rd61, %rd58, %rd60; +$L__tmp2: + .loc 1 37 46 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:46 + setp.gt.s64 %p15, %rd15, %rd61; + .loc 1 37 108 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:108 + or.pred %p16, %p11, %p15; + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + and.pred %p17, %p5, %p16; + not.pred %p18, %p17; + @%p18 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + .loc 1 0 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:0:116 + ld.param.b64 %rd19, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_3]; + ld.param.b64 %rd18, [triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3_param_2]; + .loc 1 28 19 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:28:19 + cvt.u32.u64 %r18, %rd33; + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + bar.sync 0; + .loc 1 39 25 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:39:25 + shl.b64 %rd64, %rd13, 2; + add.s64 %rd62, %rd18, %rd64; + .loc 1 39 187 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:39:187 + // begin inline asm + @%p5 st.global.b32 [ %rd62 + 0 ], { %r18 }; + // end inline asm + .loc 1 40 25 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:25 + shl.b64 %rd66, %rd15, 2; + add.s64 %rd67, %rd19, %rd66; + add.s64 %rd69, %rd67, %rd47; + shl.b64 %rd70, %rd27, 2; + add.s64 %rd63, %rd69, %rd70; + mov.b32 %r19, 1; + .loc 1 40 54 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:54 + // begin inline asm + @%p5 st.global.b32 [ %rd63 + 0 ], { %r19 }; + // end inline asm + .loc 1 40 4 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:40:4 + ret; +$L__BB0_10: + .loc 1 37 116 // climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py:37:116 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd71, assertFunc_0; + cvta.global.u64 %rd72, %rd71; + st.param.b64 [param3], %rd72; + mov.b64 %rd73, assertFile_0; + cvta.global.u64 %rd74, %rd73; + st.param.b64 [param1], %rd74; + mov.b64 %rd75, assertMessage_0; + cvta.global.u64 %rd76, %rd75; + st.param.b64 [param0], %rd76; + st.param.b64 [param4], 1; + st.param.b32 [param2], 37; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 277 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10e DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 108 +.b8 105 +.b8 109 +.b8 122 +.b8 103 +.b8 122 +.b8 105 +.b8 112 +.b8 116 +.b8 115 +.b8 106 +.b8 108 +.b8 122 +.b8 52 +.b8 116 +.b8 115 +.b8 117 +.b8 120 +.b8 97 +.b8 99 +.b8 118 +.b8 116 +.b8 54 +.b8 109 +.b8 117 +.b8 121 +.b8 111 +.b8 101 +.b8 101 +.b8 109 +.b8 111 +.b8 55 +.b8 108 +.b8 50 +.b8 98 +.b8 120 +.b8 99 +.b8 121 +.b8 111 +.b8 116 +.b8 102 +.b8 106 +.b8 121 +.b8 51 +.b8 118 +.b8 105 +.b8 106 +.b8 54 +.b8 104 +.b8 98 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 108 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x5f DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 111 +.b8 105 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xea:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xff:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 37 // DW_AT_call_line +.b8 96 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source new file mode 100644 index 0000000000000000000000000000000000000000..f1182ff45b8085df45bf3bca10a20e6e59911a3a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.source @@ -0,0 +1,355 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("out_ptr0"(#loc)) +#loc94 = loc("out_ptr1"(#loc)) +#loc95 = loc("ks0"(#loc)) +#loc96 = loc("ks1"(#loc)) +#loc97 = loc("ks2"(#loc)) +#loc98 = loc("ks3"(#loc)) +#loc99 = loc("xnumel"(#loc)) +#loc142 = loc("a"(#loc80)) +module { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc100) + %xoffset_0 = arith.constant 128 : i32 loc(#loc101) + %xoffset_1 = arith.constant 128 : i32 loc(#loc101) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc101) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc102) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc103) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc103) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc104) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc104) + %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc105) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc105) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc105) + %x1 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc106) + %x1_8 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc106) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<128xi64> loc(#loc106) + %x1_10 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc107) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<128xi64> loc(#loc107) + %x2 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc108) + %x2_12 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc108) + %x2_13 = arith.divsi %x2, %x2_12 : tensor<128xi64> loc(#loc108) + %x3 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc109) + %x3_14 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc109) + %x3_15 = arith.divsi %x3, %x3_14 : tensor<128xi64> loc(#loc109) + %tmp0 = arith.constant 1 : i32 loc(#loc110) + %tmp0_16 = arith.extsi %tmp0 : i32 to i64 loc(#loc110) + %tmp0_17 = arith.cmpi sge, %tmp0_16, %ks0 : i64 loc(#loc110) + %tmp0_18 = arith.constant 1 : i32 loc(#loc111) + %tmp0_19 = arith.constant 1 : i32 loc(#loc111) + %tmp0_20 = arith.extui %tmp0_17 : i1 to i32 loc(#loc111) + %tmp0_21 = arith.muli %tmp0_19, %tmp0_20 : i32 loc(#loc111) + %tmp0_22 = arith.constant 1 : i32 loc(#loc112) + %tmp0_23 = arith.extsi %tmp0_22 : i32 to i64 loc(#loc112) + %tmp0_24 = arith.cmpi sgt, %ks0, %tmp0_23 : i64 loc(#loc112) + %tmp0_25 = arith.extui %tmp0_24 : i1 to i64 loc(#loc113) + %tmp0_26 = arith.muli %ks0, %tmp0_25 : i64 loc(#loc113) + %tmp0_27 = arith.extsi %tmp0_21 : i32 to i64 loc(#loc114) + %tmp0_28 = arith.addi %tmp0_27, %tmp0_26 : i64 loc(#loc114) + %tmp0_29 = tt.splat %tmp0_28 : i64 -> tensor<128xi64> loc(#loc115) + %tmp0_30 = arith.muli %x1_11, %tmp0_29 : tensor<128xi64> loc(#loc115) + %tmp0_31 = arith.addi %x0_7, %tmp0_30 : tensor<128xi64> loc(#loc116) + %tmp0_32 = arith.constant 1 : i32 loc(#loc117) + %tmp0_33 = arith.extsi %tmp0_32 : i32 to i64 loc(#loc117) + %tmp0_34 = arith.cmpi sge, %tmp0_33, %ks0 : i64 loc(#loc117) + %tmp0_35 = arith.constant 1 : i32 loc(#loc118) + %tmp0_36 = arith.constant 1 : i32 loc(#loc118) + %tmp0_37 = arith.extui %tmp0_34 : i1 to i32 loc(#loc118) + %tmp0_38 = arith.muli %tmp0_36, %tmp0_37 : i32 loc(#loc118) + %tmp0_39 = arith.constant 1 : i32 loc(#loc119) + %tmp0_40 = arith.extsi %tmp0_39 : i32 to i64 loc(#loc119) + %tmp0_41 = arith.cmpi sgt, %ks0, %tmp0_40 : i64 loc(#loc119) + %tmp0_42 = arith.extui %tmp0_41 : i1 to i64 loc(#loc120) + %tmp0_43 = arith.muli %ks0, %tmp0_42 : i64 loc(#loc120) + %tmp0_44 = arith.extsi %tmp0_38 : i32 to i64 loc(#loc121) + %tmp0_45 = arith.addi %tmp0_44, %tmp0_43 : i64 loc(#loc121) + %tmp0_46 = tt.splat %tmp0_45 : i64 -> tensor<128xi64> loc(#loc122) + %tmp0_47 = arith.muli %x2_13, %tmp0_46 : tensor<128xi64> loc(#loc122) + %tmp0_48 = arith.constant 1 : i32 loc(#loc123) + %tmp0_49 = arith.extsi %tmp0_48 : i32 to i64 loc(#loc123) + %tmp0_50 = arith.cmpi sge, %tmp0_49, %ks1 : i64 loc(#loc123) + %tmp0_51 = arith.constant 1 : i32 loc(#loc124) + %tmp0_52 = arith.constant 1 : i32 loc(#loc124) + %tmp0_53 = arith.extui %tmp0_50 : i1 to i32 loc(#loc124) + %tmp0_54 = arith.muli %tmp0_52, %tmp0_53 : i32 loc(#loc124) + %tmp0_55 = arith.constant 1 : i32 loc(#loc125) + %tmp0_56 = arith.extsi %tmp0_55 : i32 to i64 loc(#loc125) + %tmp0_57 = arith.cmpi sgt, %ks1, %tmp0_56 : i64 loc(#loc125) + %tmp0_58 = arith.extui %tmp0_57 : i1 to i64 loc(#loc126) + %tmp0_59 = arith.muli %ks1, %tmp0_58 : i64 loc(#loc126) + %tmp0_60 = arith.extsi %tmp0_54 : i32 to i64 loc(#loc127) + %tmp0_61 = arith.addi %tmp0_60, %tmp0_59 : i64 loc(#loc127) + %tmp0_62 = tt.splat %tmp0_61 : i64 -> tensor<128xi64> loc(#loc128) + %tmp0_63 = arith.muli %tmp0_47, %tmp0_62 : tensor<128xi64> loc(#loc128) + %tmp0_64 = arith.addi %tmp0_31, %tmp0_63 : tensor<128xi64> loc(#loc129) + %tmp0_65 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc130) + %tmp0_66 = tt.addptr %tmp0_65, %tmp0_64 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc130) + %tmp0_67 = tt.load %tmp0_66, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc131) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc132) + %tmp2_68 = tt.addptr %tmp2, %x3_15 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc132) + %tmp2_69 = tt.load %tmp2_68, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc133) + %tmp1 = arith.trunci %tmp0_67 : tensor<128xi64> to tensor<128xi32> loc(#loc134) + %tmp4 = arith.extsi %tmp2_69 : tensor<128xi32> to tensor<128xi64> loc(#loc135) + %tmp4_70 = arith.cmpi slt, %x0_7, %tmp4 : tensor<128xi64> loc(#loc135) + %tmp6 = arith.extsi %tmp1 : tensor<128xi32> to tensor<128xi64> loc(#loc136) + %tmp6_71 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc136) + %tmp6_72 = arith.select %tmp4_70, %tmp6, %tmp6_71 : tensor<128xi1>, tensor<128xi64> loc(#loc136) + %tmp7 = arith.constant 1 : i32 loc(#loc137) + %tmp7_73 = arith.constant 1 : i64 loc(#loc137) + %tmp7_74 = arith.addi %tmp7_73, %ks0 : i64 loc(#loc137) + %tmp8 = tt.splat %tmp7_74 : i64 -> tensor<128xi64> loc(#loc138) + %tmp8_75 = arith.addi %tmp6_72, %tmp8 : tensor<128xi64> loc(#loc138) + %tmp9 = arith.constant 0 : i32 loc(#loc139) + %tmp9_76 = arith.extsi %tmp9 : i32 to i64 loc(#loc139) + %tmp9_77 = tt.splat %tmp9_76 : i64 -> tensor<128xi64> loc(#loc139) + %tmp9_78 = arith.cmpi slt, %tmp6_72, %tmp9_77 : tensor<128xi64> loc(#loc139) + %tmp10 = arith.select %tmp9_78, %tmp8_75, %tmp6_72 : tensor<128xi1>, tensor<128xi64> loc(#loc140) + %c0_i32 = arith.constant 0 : i32 loc(#loc42) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc42) + %1 = tt.splat %0 : i64 -> tensor<128xi64> loc(#loc42) + %2 = arith.cmpi sle, %1, %tmp10 : tensor<128xi64> loc(#loc42) + %c127_i32 = arith.constant 127 : i32 loc(#loc43) + %c127_i64 = arith.constant 127 : i64 loc(#loc43) + %3 = arith.addi %c127_i64, %ks3 : i64 loc(#loc43) + %4 = tt.call @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%3) : (i64) -> i64 loc(#loc44) + %c1_i32 = arith.constant 1 : i32 loc(#loc45) + %c1_i64 = arith.constant 1 : i64 loc(#loc45) + %5 = arith.addi %c1_i64, %4 : i64 loc(#loc45) + %6 = tt.splat %5 : i64 -> tensor<128xi64> loc(#loc46) + %7 = arith.cmpi slt, %tmp10, %6 : tensor<128xi64> loc(#loc46) + %8 = arith.andi %2, %7 : tensor<128xi1> loc(#loc47) + %true = arith.constant true loc(#loc48) + %cst = arith.constant dense : tensor<128xi1> loc(#loc48) + %9 = arith.xori %xmask_5, %cst : tensor<128xi1> loc(#loc48) + %10 = arith.ori %8, %9 : tensor<128xi1> loc(#loc49) + tt.assert %10, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<128xi1> loc(#loc50) + %tmp12 = arith.constant 1 : i32 loc(#loc141) + %tmp12_79 = arith.constant dense<1> : tensor<1xi32> loc(#loc141) + %c1_i32_80 = arith.constant 1 : i32 loc(#loc52) + %11 = arith.extsi %c1_i32_80 : i32 to i64 loc(#loc52) + %12 = arith.cmpi sge, %11, %ks0 : i64 loc(#loc52) + %c1_i32_81 = arith.constant 1 : i32 loc(#loc53) + %c1_i32_82 = arith.constant 1 : i32 loc(#loc53) + %13 = arith.extui %12 : i1 to i32 loc(#loc53) + %14 = arith.muli %c1_i32_82, %13 : i32 loc(#loc53) + %c1_i32_83 = arith.constant 1 : i32 loc(#loc54) + %15 = arith.extsi %c1_i32_83 : i32 to i64 loc(#loc54) + %16 = arith.cmpi sgt, %ks0, %15 : i64 loc(#loc54) + %17 = arith.extui %16 : i1 to i64 loc(#loc55) + %18 = arith.muli %ks0, %17 : i64 loc(#loc55) + %19 = arith.extsi %14 : i32 to i64 loc(#loc56) + %20 = arith.addi %19, %18 : i64 loc(#loc56) + %21 = tt.splat %20 : i64 -> tensor<128xi64> loc(#loc57) + %22 = arith.muli %x1_11, %21 : tensor<128xi64> loc(#loc57) + %23 = arith.addi %x0_7, %22 : tensor<128xi64> loc(#loc58) + %c1_i32_84 = arith.constant 1 : i32 loc(#loc59) + %24 = arith.extsi %c1_i32_84 : i32 to i64 loc(#loc59) + %25 = arith.cmpi sge, %24, %ks0 : i64 loc(#loc59) + %c1_i32_85 = arith.constant 1 : i32 loc(#loc60) + %c1_i32_86 = arith.constant 1 : i32 loc(#loc60) + %26 = arith.extui %25 : i1 to i32 loc(#loc60) + %27 = arith.muli %c1_i32_86, %26 : i32 loc(#loc60) + %c1_i32_87 = arith.constant 1 : i32 loc(#loc61) + %28 = arith.extsi %c1_i32_87 : i32 to i64 loc(#loc61) + %29 = arith.cmpi sgt, %ks0, %28 : i64 loc(#loc61) + %30 = arith.extui %29 : i1 to i64 loc(#loc62) + %31 = arith.muli %ks0, %30 : i64 loc(#loc62) + %32 = arith.extsi %27 : i32 to i64 loc(#loc63) + %33 = arith.addi %32, %31 : i64 loc(#loc63) + %34 = tt.splat %33 : i64 -> tensor<128xi64> loc(#loc64) + %35 = arith.muli %x2_13, %34 : tensor<128xi64> loc(#loc64) + %c1_i32_88 = arith.constant 1 : i32 loc(#loc65) + %36 = arith.extsi %c1_i32_88 : i32 to i64 loc(#loc65) + %37 = arith.cmpi sge, %36, %ks1 : i64 loc(#loc65) + %c1_i32_89 = arith.constant 1 : i32 loc(#loc66) + %c1_i32_90 = arith.constant 1 : i32 loc(#loc66) + %38 = arith.extui %37 : i1 to i32 loc(#loc66) + %39 = arith.muli %c1_i32_90, %38 : i32 loc(#loc66) + %c1_i32_91 = arith.constant 1 : i32 loc(#loc67) + %40 = arith.extsi %c1_i32_91 : i32 to i64 loc(#loc67) + %41 = arith.cmpi sgt, %ks1, %40 : i64 loc(#loc67) + %42 = arith.extui %41 : i1 to i64 loc(#loc68) + %43 = arith.muli %ks1, %42 : i64 loc(#loc68) + %44 = arith.extsi %39 : i32 to i64 loc(#loc69) + %45 = arith.addi %44, %43 : i64 loc(#loc69) + %46 = tt.splat %45 : i64 -> tensor<128xi64> loc(#loc70) + %47 = arith.muli %35, %46 : tensor<128xi64> loc(#loc70) + %48 = arith.addi %23, %47 : tensor<128xi64> loc(#loc71) + %49 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc72) + %50 = tt.addptr %49, %48 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc72) + tt.store %50, %tmp1, %xmask_5 : tensor<128x!tt.ptr> loc(#loc73) + %51 = arith.addi %tmp10, %x3_15 : tensor<128xi64> loc(#loc74) + %52 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc75) + %53 = arith.muli %52, %x3_15 : tensor<128xi64> loc(#loc75) + %54 = arith.addi %51, %53 : tensor<128xi64> loc(#loc76) + %55 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc77) + %56 = tt.addptr %55, %54 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc77) + %cst_92 = arith.constant dense<1> : tensor<128xi32> loc(#loc78) + tt.store %56, %cst_92, %xmask_5 : tensor<128x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.div_floor_integer__i64__(1,)cconstexpr_128_"(%a: i64 loc("a"(#loc80))) -> i64 attributes {noinline = false} { + %quot = arith.constant 128 : i32 loc(#loc143) + %quot_0 = arith.constant 128 : i64 loc(#loc143) + %quot_1 = arith.divsi %a, %quot_0 : i64 loc(#loc143) + %remainder = arith.constant 128 : i32 loc(#loc144) + %remainder_2 = arith.constant 128 : i64 loc(#loc144) + %remainder_3 = arith.remsi %a, %remainder_2 : i64 loc(#loc144) + %fixed = arith.constant 0 : i32 loc(#loc145) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc145) + %fixed_5 = arith.cmpi ne, %remainder_3, %fixed_4 : i64 loc(#loc145) + %fixed_6 = arith.constant 1 : i32 loc(#loc146) + %fixed_7 = arith.constant 1 : i64 loc(#loc146) + %fixed_8 = arith.subi %quot_1, %fixed_7 : i64 loc(#loc146) + %fixed_9 = arith.select %fixed_5, %fixed_8, %quot_1 : i64 loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc86) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc86) + %1 = arith.cmpi slt, %a, %0 : i64 loc(#loc86) + %false = arith.constant false loc(#loc87) + %2 = arith.cmpi ne, %1, %false : i1 loc(#loc87) + %3 = arith.select %2, %fixed_9, %quot_1 : i64 loc(#loc88) + tt.return %3 : i64 loc(#loc89) + ^bb1: // no predecessors + %4 = ub.poison : i64 loc(#loc90) + tt.return %4 : i64 loc(#loc90) + } loc(#loc80) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:106) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:98) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:132) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:123) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:114) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":38:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:49) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:41) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:75) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:66) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:57) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:34) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:30) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:101) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:93) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:127) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:118) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:109) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:86) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:148) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:140) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:174) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:165) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:156) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:133) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:82) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc100 = loc("xoffset"(#loc1)) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xindex"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xmask"(#loc5)) +#loc105 = loc("x0"(#loc6)) +#loc106 = loc("x1"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x2"(#loc9)) +#loc109 = loc("x3"(#loc10)) +#loc110 = loc("tmp0"(#loc11)) +#loc111 = loc("tmp0"(#loc12)) +#loc112 = loc("tmp0"(#loc13)) +#loc113 = loc("tmp0"(#loc14)) +#loc114 = loc("tmp0"(#loc15)) +#loc115 = loc("tmp0"(#loc16)) +#loc116 = loc("tmp0"(#loc17)) +#loc117 = loc("tmp0"(#loc18)) +#loc118 = loc("tmp0"(#loc19)) +#loc119 = loc("tmp0"(#loc20)) +#loc120 = loc("tmp0"(#loc21)) +#loc121 = loc("tmp0"(#loc22)) +#loc122 = loc("tmp0"(#loc23)) +#loc123 = loc("tmp0"(#loc24)) +#loc124 = loc("tmp0"(#loc25)) +#loc125 = loc("tmp0"(#loc26)) +#loc126 = loc("tmp0"(#loc27)) +#loc127 = loc("tmp0"(#loc28)) +#loc128 = loc("tmp0"(#loc29)) +#loc129 = loc("tmp0"(#loc30)) +#loc130 = loc("tmp0"(#loc31)) +#loc131 = loc("tmp0"(#loc32)) +#loc132 = loc("tmp2"(#loc33)) +#loc133 = loc("tmp2"(#loc34)) +#loc134 = loc("tmp1"(#loc35)) +#loc135 = loc("tmp4"(#loc36)) +#loc136 = loc("tmp6"(#loc37)) +#loc137 = loc("tmp7"(#loc38)) +#loc138 = loc("tmp8"(#loc39)) +#loc139 = loc("tmp9"(#loc40)) +#loc140 = loc("tmp10"(#loc41)) +#loc141 = loc("tmp12"(#loc51)) +#loc143 = loc("quot"(#loc81)) +#loc144 = loc("remainder"(#loc82)) +#loc145 = loc("fixed"(#loc83)) +#loc146 = loc("fixed"(#loc84)) +#loc147 = loc("fixed"(#loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d0b9474aff8df22282831e4df429817a6022197d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttgir @@ -0,0 +1,208 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("ks0"(#loc)) +#loc66 = loc("ks1"(#loc)) +#loc67 = loc("ks2"(#loc)) +#loc68 = loc("ks3"(#loc)) +#loc69 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c127_i64 = arith.constant 127 : i64 loc(#loc1) + %cst = arith.constant dense : tensor<128xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<128xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<128xi64, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c128_i64 = arith.constant 128 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc70) + %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc71) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc72) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32, #blocked> loc(#loc73) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32, #blocked> loc(#loc73) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32, #blocked> loc(#loc74) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32, #blocked> loc(#loc74) + %x0 = arith.extsi %xindex_4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc75) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64, #blocked> loc(#loc75) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64, #blocked> loc(#loc75) + %x1 = arith.divsi %x0, %x0_6 : tensor<128xi64, #blocked> loc(#loc76) + %x1_8 = tt.splat %ks1 : i64 -> tensor<128xi64, #blocked> loc(#loc77) + %x1_9 = arith.remsi %x1, %x1_8 : tensor<128xi64, #blocked> loc(#loc77) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64, #blocked> loc(#loc78) + %x2_10 = arith.divsi %x0, %x2 : tensor<128xi64, #blocked> loc(#loc78) + %tmp0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc79) + %tmp0_11 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc80) + %tmp0_12 = arith.extui %tmp0_11 : i1 to i64 loc(#loc81) + %tmp0_13 = arith.muli %ks0, %tmp0_12 : i64 loc(#loc81) + %tmp0_14 = arith.extui %tmp0 : i1 to i64 loc(#loc112) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_13 : i64 loc(#loc82) + %tmp0_16 = tt.splat %tmp0_15 : i64 -> tensor<128xi64, #blocked> loc(#loc84) + %tmp0_17 = arith.muli %x1_9, %tmp0_16 : tensor<128xi64, #blocked> loc(#loc84) + %tmp0_18 = arith.addi %x0_7, %tmp0_17 : tensor<128xi64, #blocked> loc(#loc85) + %tmp0_19 = arith.muli %x2_10, %tmp0_16 : tensor<128xi64, #blocked> loc(#loc86) + %tmp0_20 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc87) + %tmp0_21 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc88) + %tmp0_22 = arith.extui %tmp0_21 : i1 to i64 loc(#loc89) + %tmp0_23 = arith.muli %ks1, %tmp0_22 : i64 loc(#loc89) + %tmp0_24 = arith.extui %tmp0_20 : i1 to i64 loc(#loc113) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_23 : i64 loc(#loc90) + %tmp0_26 = tt.splat %tmp0_25 : i64 -> tensor<128xi64, #blocked> loc(#loc92) + %tmp0_27 = arith.muli %tmp0_19, %tmp0_26 : tensor<128xi64, #blocked> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_18, %tmp0_27 : tensor<128xi64, #blocked> loc(#loc93) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc94) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc94) + %tmp0_31 = tt.load %tmp0_30, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc96) + %tmp2_32 = tt.addptr %tmp2, %x1 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc96) + %tmp2_33 = tt.load %tmp2_32, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr, #blocked> loc(#loc97) + %tmp1 = arith.trunci %tmp0_31 : tensor<128xi64, #blocked> to tensor<128xi32, #blocked> loc(#loc98) + %tmp4 = arith.extsi %tmp2_33 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc99) + %tmp4_34 = arith.cmpi slt, %x0_7, %tmp4 : tensor<128xi64, #blocked> loc(#loc99) + %tmp6 = arith.extsi %tmp1 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> loc(#loc100) + %tmp6_35 = arith.select %tmp4_34, %tmp6, %x0_6 : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc100) + %tmp7 = arith.addi %ks0, %c1_i64 : i64 loc(#loc101) + %tmp8 = tt.splat %tmp7 : i64 -> tensor<128xi64, #blocked> loc(#loc102) + %tmp8_36 = arith.addi %tmp6_35, %tmp8 : tensor<128xi64, #blocked> loc(#loc102) + %tmp9 = arith.cmpi slt, %tmp6_35, %cst_1 : tensor<128xi64, #blocked> loc(#loc103) + %tmp10 = arith.select %tmp9, %tmp8_36, %tmp6_35 : tensor<128xi1, #blocked>, tensor<128xi64, #blocked> loc(#loc104) + %0 = arith.cmpi sge, %tmp10, %cst_1 : tensor<128xi64, #blocked> loc(#loc37) + %1 = arith.addi %ks3, %c127_i64 : i64 loc(#loc38) + %quot = arith.divsi %1, %c128_i64 : i64 loc(#loc114) + %remainder = arith.remsi %1, %c128_i64 : i64 loc(#loc115) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc116) + %fixed_37 = arith.subi %quot, %c1_i64 : i64 loc(#loc117) + %fixed_38 = arith.select %fixed, %fixed_37, %quot : i64 loc(#loc118) + %2 = arith.cmpi slt, %1, %c0_i64 : i64 loc(#loc110) + %3 = arith.select %2, %fixed_38, %quot : i64 loc(#loc111) + %4 = arith.addi %3, %c1_i64 : i64 loc(#loc47) + %5 = tt.splat %4 : i64 -> tensor<128xi64, #blocked> loc(#loc48) + %6 = arith.cmpi slt, %tmp10, %5 : tensor<128xi64, #blocked> loc(#loc48) + %7 = arith.andi %0, %6 : tensor<128xi1, #blocked> loc(#loc49) + %8 = arith.xori %xmask_5, %cst : tensor<128xi1, #blocked> loc(#loc50) + %9 = arith.ori %7, %8 : tensor<128xi1, #blocked> loc(#loc51) + tt.assert %9, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<128xi1, #blocked> loc(#loc52) + %10 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc53) + %11 = tt.addptr %10, %tmp0_28 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc53) + tt.store %11, %tmp1, %xmask_5 : tensor<128x!tt.ptr, #blocked> loc(#loc54) + %12 = arith.addi %tmp10, %x1 : tensor<128xi64, #blocked> loc(#loc55) + %13 = arith.muli %x0_6, %x1 : tensor<128xi64, #blocked> loc(#loc56) + %14 = arith.addi %12, %13 : tensor<128xi64, #blocked> loc(#loc57) + %15 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc58) + %16 = tt.addptr %15, %14 : tensor<128x!tt.ptr, #blocked>, tensor<128xi64, #blocked> loc(#loc58) + tt.store %16, %cst_0, %xmask_5 : tensor<128x!tt.ptr, #blocked> loc(#loc59) + tt.return loc(#loc60) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc70 = loc("xoffset"(#loc2)) +#loc71 = loc("xoffset"(#loc3)) +#loc72 = loc("xindex"(#loc4)) +#loc73 = loc("xindex"(#loc5)) +#loc74 = loc("xmask"(#loc6)) +#loc75 = loc("x0"(#loc7)) +#loc76 = loc("x1"(#loc8)) +#loc77 = loc("x1"(#loc9)) +#loc78 = loc("x2"(#loc10)) +#loc79 = loc("tmp0"(#loc11)) +#loc80 = loc("tmp0"(#loc12)) +#loc81 = loc("tmp0"(#loc13)) +#loc82 = loc("tmp0"(#loc14)) +#loc83 = loc("tmp0"(#loc15)) +#loc84 = loc("tmp0"(#loc16)) +#loc85 = loc("tmp0"(#loc17)) +#loc86 = loc("tmp0"(#loc18)) +#loc87 = loc("tmp0"(#loc19)) +#loc88 = loc("tmp0"(#loc20)) +#loc89 = loc("tmp0"(#loc21)) +#loc90 = loc("tmp0"(#loc22)) +#loc91 = loc("tmp0"(#loc23)) +#loc92 = loc("tmp0"(#loc24)) +#loc93 = loc("tmp0"(#loc25)) +#loc94 = loc("tmp0"(#loc26)) +#loc95 = loc("tmp0"(#loc27)) +#loc96 = loc("tmp2"(#loc28)) +#loc97 = loc("tmp2"(#loc29)) +#loc98 = loc("tmp1"(#loc30)) +#loc99 = loc("tmp4"(#loc31)) +#loc100 = loc("tmp6"(#loc32)) +#loc101 = loc("tmp7"(#loc33)) +#loc102 = loc("tmp8"(#loc34)) +#loc103 = loc("tmp9"(#loc35)) +#loc104 = loc("tmp10"(#loc36)) +#loc105 = loc("quot"(#loc39)) +#loc106 = loc("remainder"(#loc41)) +#loc107 = loc("fixed"(#loc42)) +#loc108 = loc("fixed"(#loc43)) +#loc109 = loc("fixed"(#loc44)) +#loc110 = loc(callsite(#loc45 at #loc40)) +#loc111 = loc(callsite(#loc46 at #loc40)) +#loc112 = loc(fused[#loc82, #loc83]) +#loc113 = loc(fused[#loc90, #loc91]) +#loc114 = loc(callsite(#loc105 at #loc40)) +#loc115 = loc(callsite(#loc106 at #loc40)) +#loc116 = loc(callsite(#loc107 at #loc40)) +#loc117 = loc(callsite(#loc108 at #loc40)) +#loc118 = loc(callsite(#loc109 at #loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1d71730baec483fb75a4bce1c103f7b100538b8e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S3UH64TOYTN473KAATRMGKZ5SLQ46EZYJVPR6TIL7QNMYCB3MSMA/triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3.ttir @@ -0,0 +1,208 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":18:0) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("out_ptr0"(#loc)) +#loc64 = loc("out_ptr1"(#loc)) +#loc65 = loc("ks0"(#loc)) +#loc66 = loc("ks1"(#loc)) +#loc67 = loc("ks2"(#loc)) +#loc68 = loc("ks3"(#loc)) +#loc69 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused__to_copy_arange_index_put_lt_new_zeros_scalar_tensor_unsqueeze_view_where_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i64 = arith.constant 128 : i64 loc(#loc70) + %c0_i64 = arith.constant 0 : i64 loc(#loc70) + %cst = arith.constant dense<0> : tensor<128xi64> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<128xi32> loc(#loc3) + %cst_1 = arith.constant dense : tensor<128xi1> loc(#loc4) + %c127_i64 = arith.constant 127 : i64 loc(#loc5) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc71) + %xoffset_2 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc72) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc73) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<128xi32> loc(#loc74) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<128xi32> loc(#loc74) + %xmask = tt.splat %xnumel : i32 -> tensor<128xi32> loc(#loc75) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<128xi32> loc(#loc75) + %x0 = arith.extsi %xindex_4 : tensor<128xi32> to tensor<128xi64> loc(#loc76) + %x0_6 = tt.splat %ks0 : i64 -> tensor<128xi64> loc(#loc76) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<128xi64> loc(#loc76) + %x1 = arith.divsi %x0, %x0_6 : tensor<128xi64> loc(#loc77) + %x1_8 = tt.splat %ks1 : i64 -> tensor<128xi64> loc(#loc78) + %x1_9 = arith.remsi %x1, %x1_8 : tensor<128xi64> loc(#loc78) + %x2 = tt.splat %ks2 : i64 -> tensor<128xi64> loc(#loc79) + %x2_10 = arith.divsi %x0, %x2 : tensor<128xi64> loc(#loc79) + %tmp0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc80) + %tmp0_11 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc81) + %tmp0_12 = arith.extui %tmp0_11 : i1 to i64 loc(#loc82) + %tmp0_13 = arith.muli %ks0, %tmp0_12 : i64 loc(#loc82) + %tmp0_14 = arith.extui %tmp0 : i1 to i64 loc(#loc113) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_13 : i64 loc(#loc83) + %tmp0_16 = tt.splat %tmp0_15 : i64 -> tensor<128xi64> loc(#loc85) + %tmp0_17 = arith.muli %x1_9, %tmp0_16 : tensor<128xi64> loc(#loc85) + %tmp0_18 = arith.addi %x0_7, %tmp0_17 : tensor<128xi64> loc(#loc86) + %tmp0_19 = arith.muli %x2_10, %tmp0_16 : tensor<128xi64> loc(#loc87) + %tmp0_20 = arith.cmpi sle, %ks1, %c1_i64 : i64 loc(#loc88) + %tmp0_21 = arith.cmpi sgt, %ks1, %c1_i64 : i64 loc(#loc89) + %tmp0_22 = arith.extui %tmp0_21 : i1 to i64 loc(#loc90) + %tmp0_23 = arith.muli %ks1, %tmp0_22 : i64 loc(#loc90) + %tmp0_24 = arith.extui %tmp0_20 : i1 to i64 loc(#loc114) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_23 : i64 loc(#loc91) + %tmp0_26 = tt.splat %tmp0_25 : i64 -> tensor<128xi64> loc(#loc93) + %tmp0_27 = arith.muli %tmp0_19, %tmp0_26 : tensor<128xi64> loc(#loc93) + %tmp0_28 = arith.addi %tmp0_18, %tmp0_27 : tensor<128xi64> loc(#loc94) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc95) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc95) + %tmp0_31 = tt.load %tmp0_30, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc96) + %tmp2 = tt.splat %in_ptr1 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc97) + %tmp2_32 = tt.addptr %tmp2, %x1 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc97) + %tmp2_33 = tt.load %tmp2_32, %xmask_5 evictionPolicy = evict_last : tensor<128x!tt.ptr> loc(#loc98) + %tmp1 = arith.trunci %tmp0_31 : tensor<128xi64> to tensor<128xi32> loc(#loc99) + %tmp4 = arith.extsi %tmp2_33 : tensor<128xi32> to tensor<128xi64> loc(#loc100) + %tmp4_34 = arith.cmpi slt, %x0_7, %tmp4 : tensor<128xi64> loc(#loc100) + %tmp6 = arith.extsi %tmp1 : tensor<128xi32> to tensor<128xi64> loc(#loc101) + %tmp6_35 = arith.select %tmp4_34, %tmp6, %x0_6 : tensor<128xi1>, tensor<128xi64> loc(#loc101) + %tmp7 = arith.addi %ks0, %c1_i64 : i64 loc(#loc102) + %tmp8 = tt.splat %tmp7 : i64 -> tensor<128xi64> loc(#loc103) + %tmp8_36 = arith.addi %tmp6_35, %tmp8 : tensor<128xi64> loc(#loc103) + %tmp9 = arith.cmpi slt, %tmp6_35, %cst : tensor<128xi64> loc(#loc104) + %tmp10 = arith.select %tmp9, %tmp8_36, %tmp6_35 : tensor<128xi1>, tensor<128xi64> loc(#loc105) + %0 = arith.cmpi sge, %tmp10, %cst : tensor<128xi64> loc(#loc41) + %1 = arith.addi %ks3, %c127_i64 : i64 loc(#loc5) + %quot = arith.divsi %1, %c128_i64 : i64 loc(#loc115) + %remainder = arith.remsi %1, %c128_i64 : i64 loc(#loc116) + %fixed = arith.cmpi ne, %remainder, %c0_i64 : i64 loc(#loc117) + %fixed_37 = arith.subi %quot, %c1_i64 : i64 loc(#loc118) + %fixed_38 = arith.select %fixed, %fixed_37, %quot : i64 loc(#loc119) + %2 = arith.cmpi slt, %1, %c0_i64 : i64 loc(#loc111) + %3 = arith.select %2, %fixed_38, %quot : i64 loc(#loc112) + %4 = arith.addi %3, %c1_i64 : i64 loc(#loc49) + %5 = tt.splat %4 : i64 -> tensor<128xi64> loc(#loc50) + %6 = arith.cmpi slt, %tmp10, %5 : tensor<128xi64> loc(#loc50) + %7 = arith.andi %0, %6 : tensor<128xi1> loc(#loc51) + %8 = arith.xori %xmask_5, %cst_1 : tensor<128xi1> loc(#loc4) + %9 = arith.ori %7, %8 : tensor<128xi1> loc(#loc52) + tt.assert %9, "index out of bounds: 0 <= tmp10 < 1 + (triton_helpers.div_floor_integer(127 + ks3, 128))" : tensor<128xi1> loc(#loc53) + %10 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc54) + %11 = tt.addptr %10, %tmp0_28 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc54) + tt.store %11, %tmp1, %xmask_5 : tensor<128x!tt.ptr> loc(#loc55) + %12 = arith.addi %tmp10, %x1 : tensor<128xi64> loc(#loc56) + %13 = arith.muli %x0_6, %x1 : tensor<128xi64> loc(#loc57) + %14 = arith.addi %12, %13 : tensor<128xi64> loc(#loc58) + %15 = tt.splat %out_ptr1 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc59) + %16 = tt.addptr %15, %14 : tensor<128x!tt.ptr>, tensor<128xi64> loc(#loc59) + tt.store %16, %cst_0, %xmask_5 : tensor<128x!tt.ptr> loc(#loc3) + tt.return loc(#loc60) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:96) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:108) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:90) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":19:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":20:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":21:21) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":22:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":23:28) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":24:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:80) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:71) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:62) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:46) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:39) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:35) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:91) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:153) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:179) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:170) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:161) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:145) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:138) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:87) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:30) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":26:186) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:30) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":27:35) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":28:19) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":30:18) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":32:32) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":33:15) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":34:18) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":35:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":36:33) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:51) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:38) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":37:116) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":39:187) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:33) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:42) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:38) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:25) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/li/climzgziptsjlz4tsuxacvt6muyoeemo7l2bxcyotfjy3vij6hbt.py":40:4) +#loc70 = loc(callsite(#loc1 at #loc2)) +#loc71 = loc("xoffset"(#loc6)) +#loc72 = loc("xoffset"(#loc7)) +#loc73 = loc("xindex"(#loc8)) +#loc74 = loc("xindex"(#loc9)) +#loc75 = loc("xmask"(#loc10)) +#loc76 = loc("x0"(#loc11)) +#loc77 = loc("x1"(#loc12)) +#loc78 = loc("x1"(#loc13)) +#loc79 = loc("x2"(#loc14)) +#loc80 = loc("tmp0"(#loc15)) +#loc81 = loc("tmp0"(#loc16)) +#loc82 = loc("tmp0"(#loc17)) +#loc83 = loc("tmp0"(#loc18)) +#loc84 = loc("tmp0"(#loc19)) +#loc85 = loc("tmp0"(#loc20)) +#loc86 = loc("tmp0"(#loc21)) +#loc87 = loc("tmp0"(#loc22)) +#loc88 = loc("tmp0"(#loc23)) +#loc89 = loc("tmp0"(#loc24)) +#loc90 = loc("tmp0"(#loc25)) +#loc91 = loc("tmp0"(#loc26)) +#loc92 = loc("tmp0"(#loc27)) +#loc93 = loc("tmp0"(#loc28)) +#loc94 = loc("tmp0"(#loc29)) +#loc95 = loc("tmp0"(#loc30)) +#loc96 = loc("tmp0"(#loc31)) +#loc97 = loc("tmp2"(#loc32)) +#loc98 = loc("tmp2"(#loc33)) +#loc99 = loc("tmp1"(#loc34)) +#loc100 = loc("tmp4"(#loc35)) +#loc101 = loc("tmp6"(#loc36)) +#loc102 = loc("tmp7"(#loc37)) +#loc103 = loc("tmp8"(#loc38)) +#loc104 = loc("tmp9"(#loc39)) +#loc105 = loc("tmp10"(#loc40)) +#loc106 = loc("quot"(#loc42)) +#loc107 = loc("remainder"(#loc43)) +#loc108 = loc("fixed"(#loc44)) +#loc109 = loc("fixed"(#loc45)) +#loc110 = loc("fixed"(#loc46)) +#loc111 = loc(callsite(#loc47 at #loc2)) +#loc112 = loc(callsite(#loc48 at #loc2)) +#loc113 = loc(fused[#loc83, #loc84]) +#loc114 = loc(fused[#loc91, #loc92]) +#loc115 = loc(callsite(#loc106 at #loc2)) +#loc116 = loc(callsite(#loc107 at #loc2)) +#loc117 = loc(callsite(#loc108 at #loc2)) +#loc118 = loc(callsite(#loc109 at #loc2)) +#loc119 = loc(callsite(#loc110 at #loc2)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/__grp__triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/__grp__triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d6fade2a7233da5d5b3942233915e99d804d7210 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/__grp__triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.source", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttir", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttgir", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.llir", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ptx", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.cubin", "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..567122ee43abea29088db5aaf9a7badb33d3c566 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..07f861573ea3d5fb22fed96ab67acb49fb922879 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.json @@ -0,0 +1 @@ +{"hash": "97064bbbc9ad0c35055e6dc50c920ec68fc33a44a31944c46b50dc7a16e16ca9", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..99e1a45931ebef2588b7fa1b6b6dfa8861e675bc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.llir @@ -0,0 +1,418 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i32 %6, i32 %7, ptr addrspace(1) readnone captures(none) %8, ptr addrspace(1) readnone captures(none) %9) local_unnamed_addr #0 !dbg !4 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %12 = icmp slt i32 %11, %6, !dbg !8 + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %14 = and i32 %13, 31, !dbg !9 + %15 = lshr i32 %13, 5, !dbg !9 + %16 = and i32 %13, 511, !dbg !9 + %17 = or i32 %13, 512, !dbg !9 + %18 = or disjoint i32 %16, 1024, !dbg !9 + %19 = or i32 %13, 1536, !dbg !9 + %20 = zext nneg i32 %11 to i64, !dbg !10 + %21 = mul i64 %5, %20, !dbg !10 + %22 = icmp sgt i32 %7, 0, !dbg !11 + br i1 %22, label %.lr.ph.preheader, label %._crit_edge, !dbg !11 + +.lr.ph.preheader: ; preds = %10 + %23 = insertelement <4 x i32> poison, i32 %16, i64 0 + %24 = insertelement <4 x i32> %23, i32 %17, i64 1 + %25 = insertelement <4 x i32> %24, i32 %18, i64 2 + %26 = insertelement <4 x i32> %25, i32 %19, i64 3 + %27 = insertelement <4 x i32> poison, i32 %7, i64 0 + %28 = shufflevector <4 x i32> %27, <4 x i32> poison, <4 x i32> zeroinitializer + %29 = insertelement <4 x i1> poison, i1 %12, i64 0 + %30 = shufflevector <4 x i1> %29, <4 x i1> poison, <4 x i32> zeroinitializer + br label %.lr.ph, !dbg !11 + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %31 = phi i32 [ %116, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %32 = phi <4 x float> [ %115, %.lr.ph ], [ zeroinitializer, %.lr.ph.preheader ] + %33 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !12 + %34 = shufflevector <4 x i32> %33, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !12 + %35 = or disjoint <4 x i32> %34, %26, !dbg !12 + %36 = extractelement <4 x i32> %35, i64 0, !dbg !13 + %37 = sext i32 %36 to i64, !dbg !13 + %38 = extractelement <4 x i32> %35, i64 1, !dbg !13 + %39 = sext i32 %38 to i64, !dbg !13 + %40 = extractelement <4 x i32> %35, i64 2, !dbg !13 + %41 = sext i32 %40 to i64, !dbg !13 + %42 = extractelement <4 x i32> %35, i64 3, !dbg !13 + %43 = sext i32 %42 to i64, !dbg !13 + %44 = add i64 %21, %37, !dbg !13 + %45 = add i64 %21, %39, !dbg !13 + %46 = add i64 %21, %41, !dbg !13 + %47 = add i64 %21, %43, !dbg !13 + %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %44, !dbg !14 + %49 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !14 + %50 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !14 + %51 = getelementptr bfloat, ptr addrspace(1) %0, i64 %47, !dbg !14 + %52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %53 = getelementptr bfloat, ptr addrspace(1) %1, i64 %37, !dbg !16 + %54 = getelementptr bfloat, ptr addrspace(1) %1, i64 %39, !dbg !16 + %55 = getelementptr bfloat, ptr addrspace(1) %1, i64 %41, !dbg !16 + %56 = getelementptr bfloat, ptr addrspace(1) %1, i64 %43, !dbg !16 + %57 = getelementptr bfloat, ptr addrspace(1) %2, i64 %44, !dbg !17 + %58 = getelementptr bfloat, ptr addrspace(1) %2, i64 %45, !dbg !17 + %59 = getelementptr bfloat, ptr addrspace(1) %2, i64 %46, !dbg !17 + %60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %47, !dbg !17 + %61 = icmp slt <4 x i32> %35, %28, !dbg !18 + %62 = and <4 x i1> %30, %61, !dbg !19 + %63 = extractelement <4 x i1> %62, i64 0, !dbg !20 + %64 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %48, i64 %52, i1 %63) #5, !dbg !15 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %66 = extractelement <4 x i1> %62, i64 1, !dbg !20 + %67 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %49, i64 %65, i1 %66) #5, !dbg !15 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %69 = extractelement <4 x i1> %62, i64 2, !dbg !20 + %70 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %50, i64 %68, i1 %69) #5, !dbg !15 + %71 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %72 = extractelement <4 x i1> %62, i64 3, !dbg !20 + %73 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %51, i64 %71, i1 %72) #5, !dbg !15 + %74 = insertelement <4 x i16> poison, i16 %64, i64 0, !dbg !15 + %75 = insertelement <4 x i16> %74, i16 %67, i64 1, !dbg !15 + %76 = insertelement <4 x i16> %75, i16 %70, i64 2, !dbg !15 + %77 = insertelement <4 x i16> %76, i16 %73, i64 3, !dbg !15 + %78 = bitcast <4 x i16> %77 to <4 x bfloat>, !dbg !15 + %79 = fpext <4 x bfloat> %78 to <4 x float>, !dbg !21 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !22 + %81 = extractelement <4 x i1> %61, i64 0, !dbg !22 + %82 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %53, i64 %80, i1 %81) #5, !dbg !22 + %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !22 + %84 = extractelement <4 x i1> %61, i64 1, !dbg !22 + %85 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %54, i64 %83, i1 %84) #5, !dbg !22 + %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !22 + %87 = extractelement <4 x i1> %61, i64 2, !dbg !22 + %88 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %55, i64 %86, i1 %87) #5, !dbg !22 + %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !22 + %90 = extractelement <4 x i1> %61, i64 3, !dbg !22 + %91 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %56, i64 %89, i1 %90) #5, !dbg !22 + %92 = insertelement <4 x i16> poison, i16 %82, i64 0, !dbg !22 + %93 = insertelement <4 x i16> %92, i16 %85, i64 1, !dbg !22 + %94 = insertelement <4 x i16> %93, i16 %88, i64 2, !dbg !22 + %95 = insertelement <4 x i16> %94, i16 %91, i64 3, !dbg !22 + %96 = bitcast <4 x i16> %95 to <4 x bfloat>, !dbg !22 + %97 = fpext <4 x bfloat> %96 to <4 x float>, !dbg !23 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %99 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %57, i64 %98, i1 %63) #5, !dbg !20 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %101 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %58, i64 %100, i1 %66) #5, !dbg !20 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %103 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %59, i64 %102, i1 %69) #5, !dbg !20 + %104 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !20 + %105 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %60, i64 %104, i1 %72) #5, !dbg !20 + %106 = insertelement <4 x i16> poison, i16 %99, i64 0, !dbg !20 + %107 = insertelement <4 x i16> %106, i16 %101, i64 1, !dbg !20 + %108 = insertelement <4 x i16> %107, i16 %103, i64 2, !dbg !20 + %109 = insertelement <4 x i16> %108, i16 %105, i64 3, !dbg !20 + %110 = bitcast <4 x i16> %109 to <4 x bfloat>, !dbg !20 + %111 = fpext <4 x bfloat> %110 to <4 x float>, !dbg !24 + %112 = fmul <4 x float> %79, %97, !dbg !25 + %113 = fmul <4 x float> %112, %111, !dbg !26 + %114 = fadd <4 x float> %32, %113, !dbg !27 + %115 = select <4 x i1> %62, <4 x float> %114, <4 x float> %32, !dbg !28 + %116 = add i32 %31, 2048, !dbg !11 + %117 = icmp slt i32 %116, %7, !dbg !11 + br i1 %117, label %.lr.ph, label %._crit_edge.loopexit, !dbg !11 + +._crit_edge.loopexit: ; preds = %.lr.ph + %shift = shufflevector <4 x float> %115, <4 x float> poison, <4 x i32> , !dbg !29 + %foldExtExtBinop = fadd <4 x float> %115, %shift, !dbg !29 + %shift12 = shufflevector <4 x float> %115, <4 x float> poison, <4 x i32> , !dbg !29 + %foldExtExtBinop13 = fadd <4 x float> %shift12, %foldExtExtBinop, !dbg !29 + %shift15 = shufflevector <4 x float> %115, <4 x float> poison, <4 x i32> , !dbg !29 + %foldExtExtBinop16 = fadd <4 x float> %shift15, %foldExtExtBinop13, !dbg !29 + %118 = extractelement <4 x float> %foldExtExtBinop16, i64 0, !dbg !29 + br label %._crit_edge, !dbg !29 + +._crit_edge: ; preds = %._crit_edge.loopexit, %10 + %119 = phi float [ 0.000000e+00, %10 ], [ %118, %._crit_edge.loopexit ], !dbg !29 + %120 = bitcast float %119 to i32, !dbg !33 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !33 + %122 = bitcast i32 %121 to float, !dbg !33 + %123 = fadd float %119, %122, !dbg !29 + %124 = bitcast float %123 to i32, !dbg !33 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 8, i32 31), !dbg !33 + %126 = bitcast i32 %125 to float, !dbg !33 + %127 = fadd float %123, %126, !dbg !29 + %128 = bitcast float %127 to i32, !dbg !33 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 4, i32 31), !dbg !33 + %130 = bitcast i32 %129 to float, !dbg !33 + %131 = fadd float %127, %130, !dbg !29 + %132 = bitcast float %131 to i32, !dbg !33 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !33 + %134 = bitcast i32 %133 to float, !dbg !33 + %135 = fadd float %131, %134, !dbg !29 + %136 = bitcast float %135 to i32, !dbg !33 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !33 + %138 = bitcast i32 %137 to float, !dbg !33 + %139 = fadd float %135, %138, !dbg !29 + %140 = and i32 %15, 15, !dbg !33 + %141 = icmp eq i32 %14, 0, !dbg !33 + %142 = getelementptr float, ptr addrspace(3) @global_smem, i32 %140, !dbg !33 + %143 = bitcast float %139 to <1 x i32>, !dbg !33 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %142, <1 x i32> %143, i1 %141) #5, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33 + %144 = icmp samesign ult i32 %13, 16, !dbg !33 + %145 = getelementptr float, ptr addrspace(3) @global_smem, i32 %13, !dbg !33 + %146 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %145, i1 %144) #5, !dbg !33 + %147 = bitcast i32 %146 to float, !dbg !33 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %146, i32 8, i32 31), !dbg !33 + %149 = bitcast i32 %148 to float, !dbg !33 + %150 = fadd float %147, %149, !dbg !29 + %151 = bitcast float %150 to i32, !dbg !33 + %152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 4, i32 31), !dbg !33 + %153 = bitcast i32 %152 to float, !dbg !33 + %154 = fadd float %150, %153, !dbg !29 + %155 = bitcast float %154 to i32, !dbg !33 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 2, i32 31), !dbg !33 + %157 = bitcast i32 %156 to float, !dbg !33 + %158 = fadd float %154, %157, !dbg !29 + %159 = bitcast float %158 to i32, !dbg !33 + %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 1, i32 31), !dbg !33 + %161 = bitcast i32 %160 to float, !dbg !33 + %162 = fadd float %158, %161, !dbg !29 + %163 = icmp eq i32 %13, 0, !dbg !33 + %164 = bitcast float %162 to <1 x i32>, !dbg !33 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %145, <1 x i32> %164, i1 %163) #5, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33 + %165 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !33 + %166 = getelementptr float, ptr addrspace(1) %3, i64 %20, !dbg !34 + %167 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !35 + %168 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %166, i64 %167, i1 %12) #5, !dbg !35 + %169 = bitcast i32 %168 to float, !dbg !35 + %170 = fmul float %165, -5.000000e-01, !dbg !36 + %171 = fmul float %169, %169, !dbg !37 + %172 = fmul float %171, %169, !dbg !38 + %173 = fmul float %170, %172, !dbg !39 + %174 = sitofp i64 %5 to float, !dbg !40 + %175 = tail call float @llvm.nvvm.div.full(float %173, float %174), !dbg !41 + br i1 %22, label %.lr.ph4.preheader, label %._crit_edge5, !dbg !42 + +.lr.ph4.preheader: ; preds = %._crit_edge + %176 = insertelement <2 x float> poison, float %169, i64 0 + %177 = insertelement <2 x float> %176, float %175, i64 1 + br label %.lr.ph4, !dbg !42 + +.lr.ph4: ; preds = %.lr.ph4.preheader, %.lr.ph4 + %178 = phi i32 [ %287, %.lr.ph4 ], [ 0, %.lr.ph4.preheader ] + %179 = or disjoint i32 %178, %16, !dbg !43 + %180 = or disjoint i32 %178, %17, !dbg !43 + %181 = or disjoint i32 %178, %18, !dbg !43 + %182 = or disjoint i32 %178, %19, !dbg !43 + %183 = icmp slt i32 %179, %7, !dbg !44 + %184 = icmp slt i32 %180, %7, !dbg !44 + %185 = icmp slt i32 %181, %7, !dbg !44 + %186 = icmp slt i32 %182, %7, !dbg !44 + %187 = sext i32 %179 to i64, !dbg !45 + %188 = sext i32 %180 to i64, !dbg !45 + %189 = sext i32 %181 to i64, !dbg !45 + %190 = sext i32 %182 to i64, !dbg !45 + %191 = add i64 %21, %187, !dbg !45 + %192 = add i64 %21, %188, !dbg !45 + %193 = add i64 %21, %189, !dbg !45 + %194 = add i64 %21, %190, !dbg !45 + %195 = getelementptr bfloat, ptr addrspace(1) %0, i64 %191, !dbg !46 + %196 = getelementptr bfloat, ptr addrspace(1) %0, i64 %192, !dbg !46 + %197 = getelementptr bfloat, ptr addrspace(1) %0, i64 %193, !dbg !46 + %198 = getelementptr bfloat, ptr addrspace(1) %0, i64 %194, !dbg !46 + %199 = and i1 %12, %183, !dbg !47 + %200 = and i1 %12, %184, !dbg !47 + %201 = and i1 %12, %185, !dbg !47 + %202 = and i1 %12, %186, !dbg !47 + %203 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %204 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %195, i64 %203, i1 %199) #5, !dbg !48 + %205 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %206 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %196, i64 %205, i1 %200) #5, !dbg !48 + %207 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %208 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %197, i64 %207, i1 %201) #5, !dbg !48 + %209 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !48 + %210 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %198, i64 %209, i1 %202) #5, !dbg !48 + %211 = getelementptr bfloat, ptr addrspace(1) %1, i64 %187, !dbg !49 + %212 = getelementptr bfloat, ptr addrspace(1) %1, i64 %188, !dbg !49 + %213 = getelementptr bfloat, ptr addrspace(1) %1, i64 %189, !dbg !49 + %214 = getelementptr bfloat, ptr addrspace(1) %1, i64 %190, !dbg !49 + %215 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !50 + %216 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %211, i64 %215, i1 %183) #5, !dbg !50 + %217 = bitcast i16 %216 to bfloat, !dbg !50 + %218 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !50 + %219 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %212, i64 %218, i1 %184) #5, !dbg !50 + %220 = bitcast i16 %219 to bfloat, !dbg !50 + %221 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !50 + %222 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %213, i64 %221, i1 %185) #5, !dbg !50 + %223 = bitcast i16 %222 to bfloat, !dbg !50 + %224 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !50 + %225 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %214, i64 %224, i1 %186) #5, !dbg !50 + %226 = bitcast i16 %225 to bfloat, !dbg !50 + %227 = fpext bfloat %217 to float, !dbg !51 + %228 = fpext bfloat %220 to float, !dbg !51 + %229 = fpext bfloat %223 to float, !dbg !51 + %230 = fpext bfloat %226 to float, !dbg !51 + %231 = getelementptr bfloat, ptr addrspace(1) %2, i64 %191, !dbg !52 + %232 = getelementptr bfloat, ptr addrspace(1) %2, i64 %192, !dbg !52 + %233 = getelementptr bfloat, ptr addrspace(1) %2, i64 %193, !dbg !52 + %234 = getelementptr bfloat, ptr addrspace(1) %2, i64 %194, !dbg !52 + %235 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !53 + %236 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %231, i64 %235, i1 %199) #5, !dbg !53 + %237 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !53 + %238 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %232, i64 %237, i1 %200) #5, !dbg !53 + %239 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !53 + %240 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %233, i64 %239, i1 %201) #5, !dbg !53 + %241 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !53 + %242 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %234, i64 %241, i1 %202) #5, !dbg !53 + %243 = insertelement <2 x i16> poison, i16 %204, i64 0, !dbg !48 + %244 = insertelement <2 x i16> %243, i16 %236, i64 1, !dbg !48 + %245 = bitcast <2 x i16> %244 to <2 x bfloat>, !dbg !48 + %246 = fpext <2 x bfloat> %245 to <2 x float>, !dbg !54 + %247 = insertelement <2 x float> , float %227, i64 0, !dbg !55 + %248 = fmul <2 x float> %247, %246, !dbg !55 + %249 = fmul <2 x float> %248, %177, !dbg !56 + %250 = insertelement <2 x i16> poison, i16 %206, i64 0, !dbg !48 + %251 = insertelement <2 x i16> %250, i16 %238, i64 1, !dbg !48 + %252 = bitcast <2 x i16> %251 to <2 x bfloat>, !dbg !48 + %253 = fpext <2 x bfloat> %252 to <2 x float>, !dbg !54 + %254 = insertelement <2 x float> , float %228, i64 0, !dbg !55 + %255 = fmul <2 x float> %254, %253, !dbg !55 + %256 = fmul <2 x float> %255, %177, !dbg !56 + %257 = insertelement <2 x i16> poison, i16 %208, i64 0, !dbg !48 + %258 = insertelement <2 x i16> %257, i16 %240, i64 1, !dbg !48 + %259 = bitcast <2 x i16> %258 to <2 x bfloat>, !dbg !48 + %260 = fpext <2 x bfloat> %259 to <2 x float>, !dbg !54 + %261 = insertelement <2 x float> , float %229, i64 0, !dbg !55 + %262 = fmul <2 x float> %261, %260, !dbg !55 + %263 = fmul <2 x float> %262, %177, !dbg !56 + %264 = insertelement <2 x i16> poison, i16 %210, i64 0, !dbg !48 + %265 = insertelement <2 x i16> %264, i16 %242, i64 1, !dbg !48 + %266 = bitcast <2 x i16> %265 to <2 x bfloat>, !dbg !48 + %267 = fpext <2 x bfloat> %266 to <2 x float>, !dbg !54 + %268 = insertelement <2 x float> , float %230, i64 0, !dbg !55 + %269 = fmul <2 x float> %268, %267, !dbg !55 + %270 = fmul <2 x float> %269, %177, !dbg !56 + %shift18 = shufflevector <2 x float> %249, <2 x float> poison, <2 x i32> , !dbg !57 + %foldExtExtBinop19 = fadd <2 x float> %249, %shift18, !dbg !57 + %271 = extractelement <2 x float> %foldExtExtBinop19, i64 0, !dbg !57 + %shift21 = shufflevector <2 x float> %256, <2 x float> poison, <2 x i32> , !dbg !57 + %foldExtExtBinop22 = fadd <2 x float> %256, %shift21, !dbg !57 + %272 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !57 + %shift24 = shufflevector <2 x float> %263, <2 x float> poison, <2 x i32> , !dbg !57 + %foldExtExtBinop25 = fadd <2 x float> %263, %shift24, !dbg !57 + %273 = extractelement <2 x float> %foldExtExtBinop25, i64 0, !dbg !57 + %shift27 = shufflevector <2 x float> %270, <2 x float> poison, <2 x i32> , !dbg !57 + %foldExtExtBinop28 = fadd <2 x float> %270, %shift27, !dbg !57 + %274 = extractelement <2 x float> %foldExtExtBinop28, i64 0, !dbg !57 + %275 = getelementptr bfloat, ptr addrspace(1) %4, i64 %191, !dbg !58 + %276 = getelementptr bfloat, ptr addrspace(1) %4, i64 %192, !dbg !58 + %277 = getelementptr bfloat, ptr addrspace(1) %4, i64 %193, !dbg !58 + %278 = getelementptr bfloat, ptr addrspace(1) %4, i64 %194, !dbg !58 + %279 = fptrunc float %271 to bfloat, !dbg !59 + %280 = fptrunc float %272 to bfloat, !dbg !59 + %281 = fptrunc float %273 to bfloat, !dbg !59 + %282 = fptrunc float %274 to bfloat, !dbg !59 + %283 = bitcast bfloat %279 to i16, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %283, ptr addrspace(1) %275, i1 %199) #5, !dbg !59 + %284 = bitcast bfloat %280 to i16, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %284, ptr addrspace(1) %276, i1 %200) #5, !dbg !59 + %285 = bitcast bfloat %281 to i16, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %285, ptr addrspace(1) %277, i1 %201) #5, !dbg !59 + %286 = bitcast bfloat %282 to i16, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %286, ptr addrspace(1) %278, i1 %202) #5, !dbg !59 + %287 = add i32 %178, 2048, !dbg !42 + %288 = icmp slt i32 %287, %7, !dbg !42 + br i1 %288, label %.lr.ph4, label %._crit_edge5, !dbg !42 + +._crit_edge5: ; preds = %.lr.ph4, %._crit_edge + ret void, !dbg !60 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2", linkageName: "triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 21, scope: !4) +!9 = !DILocation(line: 24, column: 37, scope: !4) +!10 = !DILocation(line: 34, column: 45, scope: !4) +!11 = !DILocation(line: 28, column: 40, scope: !4) +!12 = !DILocation(line: 29, column: 31, scope: !4) +!13 = !DILocation(line: 34, column: 41, scope: !4) +!14 = !DILocation(line: 34, column: 34, scope: !4) +!15 = !DILocation(line: 34, column: 50, scope: !4) +!16 = !DILocation(line: 35, column: 34, scope: !4) +!17 = !DILocation(line: 36, column: 34, scope: !4) +!18 = !DILocation(line: 30, column: 29, scope: !4) +!19 = !DILocation(line: 34, column: 60, scope: !4) +!20 = !DILocation(line: 36, column: 50, scope: !4) +!21 = !DILocation(line: 34, column: 111, scope: !4) +!22 = !DILocation(line: 35, column: 41, scope: !4) +!23 = !DILocation(line: 35, column: 94, scope: !4) +!24 = !DILocation(line: 36, column: 111, scope: !4) +!25 = !DILocation(line: 37, column: 22, scope: !4) +!26 = !DILocation(line: 40, column: 22, scope: !4) +!27 = !DILocation(line: 42, column: 23, scope: !4) +!28 = !DILocation(line: 43, column: 48, scope: !4) +!29 = !DILocation(line: 261, column: 15, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0) +!31 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!32 = !DILocation(line: 44, column: 25, scope: !4) +!33 = !DILocation(line: 291, column: 36, scope: !30, inlinedAt: !32) +!34 = !DILocation(line: 45, column: 31, scope: !4) +!35 = !DILocation(line: 45, column: 36, scope: !4) +!36 = !DILocation(line: 59, column: 23, scope: !4) +!37 = !DILocation(line: 60, column: 24, scope: !4) +!38 = !DILocation(line: 61, column: 24, scope: !4) +!39 = !DILocation(line: 62, column: 24, scope: !4) +!40 = !DILocation(line: 64, column: 25, scope: !4) +!41 = !DILocation(line: 65, column: 25, scope: !4) +!42 = !DILocation(line: 46, column: 40, scope: !4) +!43 = !DILocation(line: 47, column: 31, scope: !4) +!44 = !DILocation(line: 48, column: 29, scope: !4) +!45 = !DILocation(line: 52, column: 42, scope: !4) +!46 = !DILocation(line: 52, column: 35, scope: !4) +!47 = !DILocation(line: 52, column: 61, scope: !4) +!48 = !DILocation(line: 52, column: 51, scope: !4) +!49 = !DILocation(line: 53, column: 35, scope: !4) +!50 = !DILocation(line: 53, column: 42, scope: !4) +!51 = !DILocation(line: 53, column: 95, scope: !4) +!52 = !DILocation(line: 54, column: 35, scope: !4) +!53 = !DILocation(line: 54, column: 51, scope: !4) +!54 = !DILocation(line: 52, column: 113, scope: !4) +!55 = !DILocation(line: 55, column: 24, scope: !4) +!56 = !DILocation(line: 57, column: 24, scope: !4) +!57 = !DILocation(line: 70, column: 24, scope: !4) +!58 = !DILocation(line: 72, column: 29, scope: !4) +!59 = !DILocation(line: 72, column: 52, scope: !4) +!60 = !DILocation(line: 46, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..fa3af3be79008c50b5d1cfac6d4fb7a7605c4ca2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ptx @@ -0,0 +1,865 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2 // -- Begin function triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2 +.visible .entry triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_4, + .param .u64 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_5, + .param .u32 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_6, + .param .u32 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_8, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_9 +) +.reqntid 512 +{ + .reg .pred %p<42>; + .reg .b16 %rs<73>; + .reg .b32 %r<153>; + .reg .b64 %rd<129>; + .loc 1 18 0 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:18:0 + +// %bb.0: + ld.param.b32 %r25, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_7]; + ld.param.b32 %r24, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_6]; + ld.param.b64 %rd13, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_5]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_3]; + ld.param.b64 %rd10, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_2]; + ld.param.b64 %rd9, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_1]; + ld.param.b64 %rd8, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_0]; +$L__tmp0: + .loc 1 21 28 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:21:28 + mov.u32 %r27, %ctaid.x; + .loc 1 24 37 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:24:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shr.u32 %r3, %r1, 5; + and.b32 %r4, %r1, 511; + or.b32 %r5, %r1, 512; + or.b32 %r7, %r1, 1536; + .loc 1 34 45 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:45 + cvt.u64.u32 %rd1, %r27; + mul.lo.s64 %rd2, %rd13, %rd1; + .loc 1 28 40 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:28:40 + setp.lt.s32 %p5, %r25, 1; + mov.b32 %r151, 0f00000000; + cvt.u32.u64 %r149, %rd1; + @%p5 bra $L__BB0_4; +// %bb.1: // %.lr.ph.preheader + .loc 1 0 0 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:0 + or.b32 %r6, %r4, 1024; + .loc 1 23 21 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:23:21 + setp.lt.s32 %p1, %r149, %r24; + mov.b32 %r30, 0f00000000; + mov.b64 %rd127, {%r30, %r30}; + mov.b32 %r150, 0; + mov.b64 %rd128, %rd127; +$L__BB0_2: // %.lr.ph + // =>This Inner Loop Header: Depth=1 + .loc 1 29 31 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:29:31 + or.b32 %r31, %r150, %r7; + or.b32 %r32, %r150, %r6; + or.b32 %r33, %r150, %r5; + or.b32 %r34, %r150, %r4; + .loc 1 34 41 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:41 + cvt.s64.s32 %rd52, %r34; + cvt.s64.s32 %rd53, %r33; + cvt.s64.s32 %rd54, %r32; + cvt.s64.s32 %rd55, %r31; + add.s64 %rd56, %rd2, %rd52; + add.s64 %rd57, %rd2, %rd53; + add.s64 %rd58, %rd2, %rd54; + add.s64 %rd59, %rd2, %rd55; + .loc 1 34 34 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:34 + shl.b64 %rd60, %rd56, 1; + add.s64 %rd17, %rd8, %rd60; + shl.b64 %rd61, %rd57, 1; + add.s64 %rd20, %rd8, %rd61; + shl.b64 %rd62, %rd58, 1; + add.s64 %rd23, %rd8, %rd62; + shl.b64 %rd63, %rd59, 1; + add.s64 %rd26, %rd8, %rd63; + .loc 1 34 50 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:50 + // begin inline asm + mov.u64 %rd16, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd16, 1.0; + // end inline asm + .loc 1 35 34 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:35:34 + mad.wide.s32 %rd29, %r34, 2, %rd9; + mad.wide.s32 %rd32, %r33, 2, %rd9; + mad.wide.s32 %rd35, %r32, 2, %rd9; + mad.wide.s32 %rd38, %r31, 2, %rd9; + .loc 1 36 34 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:36:34 + add.s64 %rd41, %rd10, %rd60; + add.s64 %rd44, %rd10, %rd61; + add.s64 %rd47, %rd10, %rd62; + add.s64 %rd50, %rd10, %rd63; + .loc 1 30 29 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:30:29 + setp.lt.s32 %p10, %r34, %r25; + setp.lt.s32 %p11, %r33, %r25; + setp.lt.s32 %p12, %r32, %r25; + setp.lt.s32 %p13, %r31, %r25; + .loc 1 34 60 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:60 + and.pred %p17, %p1, %p13; + and.pred %p16, %p1, %p12; + and.pred %p15, %p1, %p11; + and.pred %p14, %p1, %p10; + mov.b16 %rs2, 0; + .loc 1 34 50 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:50 + // begin inline asm + mov.u16 %rs1, %rs2; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs1 }, [ %rd17 + 0 ], %rd16; + // end inline asm + // begin inline asm + mov.u64 %rd19, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd19, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs3 }, [ %rd20 + 0 ], %rd19; + // end inline asm + // begin inline asm + mov.u64 %rd22, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd22, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, %rs2; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs5 }, [ %rd23 + 0 ], %rd22; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd25, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs7, %rs2; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs7 }, [ %rd26 + 0 ], %rd25; + // end inline asm + mov.b32 %r35, {%rs1, %rs3}; + mov.b32 %r36, {%rs5, %rs7}; + .loc 1 34 111 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:34:111 + mov.b32 {%rs25, %rs26}, %r36; + cvt.f32.bf16 %r37, %rs25; + cvt.f32.bf16 %r38, %rs26; + mov.b32 {%rs27, %rs28}, %r35; + cvt.f32.bf16 %r39, %rs27; + cvt.f32.bf16 %r40, %rs28; + .loc 1 35 41 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:35:41 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, %rs2; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd29 + 0 ], %rd28; + // end inline asm + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, %rs2; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd32 + 0 ], %rd31; + // end inline asm + // begin inline asm + mov.u64 %rd34, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd34, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs13, %rs2; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd35 + 0 ], %rd34; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd37, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs2; + @%p13 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd38 + 0 ], %rd37; + // end inline asm + mov.b32 %r41, {%rs9, %rs11}; + mov.b32 %r42, {%rs13, %rs15}; + .loc 1 35 94 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:35:94 + mov.b32 {%rs29, %rs30}, %r42; + cvt.f32.bf16 %r43, %rs29; + cvt.f32.bf16 %r44, %rs30; + mov.b32 {%rs31, %rs32}, %r41; + cvt.f32.bf16 %r45, %rs31; + cvt.f32.bf16 %r46, %rs32; + .loc 1 36 50 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:36:50 + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs2; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd41 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd43, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd43, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs2; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd44 + 0 ], %rd43; + // end inline asm + // begin inline asm + mov.u64 %rd46, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd46, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs2; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd47 + 0 ], %rd46; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd49, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs2; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd50 + 0 ], %rd49; + // end inline asm + mov.b32 %r47, {%rs21, %rs23}; + mov.b32 %r48, {%rs17, %rs19}; + .loc 1 36 111 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:36:111 + mov.b32 {%rs33, %rs34}, %r48; + cvt.f32.bf16 %r49, %rs34; + cvt.f32.bf16 %r50, %rs33; + mov.b32 {%rs35, %rs36}, %r47; + cvt.f32.bf16 %r51, %rs36; + cvt.f32.bf16 %r52, %rs35; + .loc 1 37 22 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:37:22 + mul.f32 %r53, %r40, %r46; + mul.f32 %r54, %r39, %r45; + mul.f32 %r55, %r38, %r44; + mul.f32 %r56, %r37, %r43; + .loc 1 42 23 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:42:23 + mov.b64 {%r57, %r58}, %rd128; + fma.rn.f32 %r59, %r56, %r52, %r57; + fma.rn.f32 %r60, %r55, %r51, %r58; + mov.b64 {%r61, %r62}, %rd127; + fma.rn.f32 %r63, %r54, %r50, %r61; + fma.rn.f32 %r64, %r53, %r49, %r62; + .loc 1 43 48 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:43:48 + selp.f32 %r65, %r64, %r62, %p15; + selp.f32 %r66, %r63, %r61, %p14; + mov.b64 %rd127, {%r66, %r65}; + selp.f32 %r67, %r60, %r58, %p17; + selp.f32 %r68, %r59, %r57, %p16; + mov.b64 %rd128, {%r68, %r67}; + .loc 1 28 40 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:28:40 + add.s32 %r150, %r150, 2048; + setp.lt.s32 %p18, %r150, %r25; + @%p18 bra $L__BB0_2; +// %bb.3: // %._crit_edge.loopexit +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + mov.b64 {%r69, %r70}, %rd127; + add.f32 %r71, %r69, %r70; + mov.b64 {%r72, %r73}, %rd128; + add.f32 %r74, %r72, %r71; + add.f32 %r151, %r73, %r74; +$L__tmp2: +$L__BB0_4: // %._crit_edge + .loc 1 23 21 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:23:21 + setp.lt.s32 %p22, %r149, %r24; +$L__tmp3: + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r83, %r151, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r84, %r151, %r83; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r86, %r84, %r85; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r88, %r86, %r87; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r90, %r88, %r89; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r76, %r90, %r91; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + and.b32 %r92, %r3, 15; + setp.eq.b32 %p19, %r2, 0; + shl.b32 %r93, %r92, 2; + mov.b32 %r94, global_smem; + add.s32 %r75, %r94, %r93; + // begin inline asm + @%p19 st.shared.b32 [ %r75 + 0 ], %r76; + // end inline asm + bar.sync 0; + setp.lt.u32 %p20, %r1, 16; + shl.b32 %r95, %r1, 2; + add.s32 %r78, %r94, %r95; + // begin inline asm + @%p20 ld.shared.b32 %r77, [ %r78 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r96, %r77, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r97, %r77, %r96; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r98, %r97, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r99, %r97, %r98; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r100, %r99, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r101, %r99, %r100; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + shfl.sync.bfly.b32 %r102, %r101, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + add.f32 %r80, %r101, %r102; + .loc 2 291 36 // standard.py:291:36 @[ cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:44:25 ] + setp.eq.b32 %p21, %r1, 0; + // begin inline asm + @%p21 st.shared.b32 [ %r78 + 0 ], %r80; + // end inline asm + bar.sync 0; +$L__tmp4: + .loc 1 45 31 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:45:31 + shl.b64 %rd67, %rd1, 2; + add.s64 %rd65, %rd11, %rd67; + .loc 1 45 36 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:45:36 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r81, 0x0; + @%p22 ld.global.L1::evict_last.L2::cache_hint.b32 { %r81 }, [ %rd65 + 0 ], %rd64; + // end inline asm + .loc 1 46 40 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:46:40 + @%p5 bra $L__BB0_7; +// %bb.5: // %.lr.ph4.preheader + .loc 1 0 40 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:0:40 + ld.param.b64 %rd12, [triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2_param_4]; + ld.shared.b32 %r103, [global_smem]; + mul.f32 %r104, %r103, 0fBF000000; + mul.f32 %r105, %r81, %r81; + mul.f32 %r106, %r105, %r81; + mul.f32 %r107, %r104, %r106; + cvt.rn.f32.s64 %r108, %rd13; + div.full.f32 %r21, %r107, %r108; + mov.b64 %rd7, {%r81, %r21}; + mov.b32 %r152, 0; + cvt.u64.u32 %rd117, %r4; +$L__BB0_6: // %.lr.ph4 + // =>This Inner Loop Header: Depth=1 + .loc 1 47 31 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:47:31 + add.s32 %r111, %r4, %r152; + add.s32 %r112, %r5, %r152; + add.s32 %r113, %r111, 1024; + .loc 1 48 29 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:48:29 + add.s32 %r114, %r7, %r152; + setp.lt.s32 %p28, %r111, %r25; + setp.lt.s32 %p29, %r112, %r25; + setp.lt.s32 %p30, %r113, %r25; + setp.lt.s32 %p31, %r114, %r25; + .loc 1 52 42 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:42 + cvt.s64.s32 %rd108, %r111; + cvt.s64.s32 %rd109, %r112; + cvt.s64.s32 %rd110, %r114; + add.s64 %rd111, %rd2, %rd108; + add.s64 %rd112, %rd2, %rd109; + add.s64 %rd113, %rd2, %rd110; + .loc 1 52 35 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:35 + shl.b64 %rd114, %rd111, 1; + add.s64 %rd69, %rd8, %rd114; + shl.b64 %rd115, %rd112, 1; + add.s64 %rd72, %rd8, %rd115; + .loc 1 52 42 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:42 + cvt.s64.s32 %rd116, %r152; + add.s64 %rd118, %rd116, %rd117; + add.s64 %rd119, %rd2, %rd118; + .loc 1 52 35 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:35 + shl.b64 %rd120, %rd119, 1; + add.s64 %rd121, %rd8, %rd120; + add.s64 %rd75, %rd121, 2048; + shl.b64 %rd122, %rd113, 1; + add.s64 %rd78, %rd8, %rd122; + .loc 1 52 61 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:61 + and.pred %p24, %p22, %p28; + and.pred %p25, %p22, %p29; + and.pred %p26, %p22, %p30; + and.pred %p27, %p22, %p31; + .loc 1 52 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:51 + // begin inline asm + mov.u64 %rd68, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd68, 1.0; + // end inline asm + mov.b16 %rs38, 0; + // begin inline asm + mov.u16 %rs37, %rs38; + @%p24 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs37 }, [ %rd69 + 0 ], %rd68; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs39, %rs38; + @%p25 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs39 }, [ %rd72 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs41, %rs38; + @%p26 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs41 }, [ %rd75 + 0 ], %rd74; + // end inline asm + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd77, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs43, %rs38; + @%p27 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs43 }, [ %rd78 + 0 ], %rd77; + // end inline asm + .loc 1 53 35 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:53:35 + mad.wide.s32 %rd81, %r111, 2, %rd9; + mad.wide.s32 %rd84, %r112, 2, %rd9; + shl.b64 %rd123, %rd118, 1; + add.s64 %rd124, %rd9, %rd123; + add.s64 %rd87, %rd124, 2048; + mad.wide.s32 %rd90, %r114, 2, %rd9; + .loc 1 53 42 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:53:42 + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs45, %rs38; + @%p28 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs45 }, [ %rd81 + 0 ], %rd80; + // end inline asm + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs47, %rs38; + @%p29 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs47 }, [ %rd84 + 0 ], %rd83; + // end inline asm + // begin inline asm + mov.u64 %rd86, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd86, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs49, %rs38; + @%p30 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs49 }, [ %rd87 + 0 ], %rd86; + // end inline asm + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs51, %rs38; + @%p31 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs51 }, [ %rd90 + 0 ], %rd89; + // end inline asm + .loc 1 53 95 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:53:95 + cvt.f32.bf16 %r115, %rs45; + cvt.f32.bf16 %r116, %rs47; + cvt.f32.bf16 %r117, %rs49; + cvt.f32.bf16 %r118, %rs51; + .loc 1 54 35 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:54:35 + add.s64 %rd93, %rd10, %rd114; + add.s64 %rd96, %rd10, %rd115; + add.s64 %rd125, %rd10, %rd120; + add.s64 %rd99, %rd125, 2048; + add.s64 %rd102, %rd10, %rd122; + .loc 1 54 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:54:51 + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs53, %rs38; + @%p24 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs53 }, [ %rd93 + 0 ], %rd92; + // end inline asm + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs55, %rs38; + @%p25 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs55 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs57, %rs38; + @%p26 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs57 }, [ %rd99 + 0 ], %rd98; + // end inline asm + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs59, %rs38; + @%p27 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs59 }, [ %rd102 + 0 ], %rd101; + // end inline asm + .loc 1 52 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:51 + mov.b32 %r119, {%rs37, %rs53}; + .loc 1 52 113 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:113 + mov.b32 {%rs65, %rs66}, %r119; + cvt.f32.bf16 %r120, %rs66; + cvt.f32.bf16 %r121, %rs65; + .loc 1 55 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:55:24 + mul.f32 %r122, %r115, %r121; + add.f32 %r123, %r120, %r120; + .loc 1 57 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:57:24 + mov.b64 {%r124, %r125}, %rd7; + mul.f32 %r126, %r123, %r125; + .loc 1 52 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:51 + mov.b32 %r127, {%rs39, %rs55}; + .loc 1 52 113 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:113 + mov.b32 {%rs67, %rs68}, %r127; + cvt.f32.bf16 %r128, %rs68; + cvt.f32.bf16 %r129, %rs67; + .loc 1 55 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:55:24 + mul.f32 %r130, %r116, %r129; + add.f32 %r131, %r128, %r128; + .loc 1 57 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:57:24 + mul.f32 %r132, %r131, %r125; + .loc 1 52 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:51 + mov.b32 %r133, {%rs41, %rs57}; + .loc 1 52 113 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:113 + mov.b32 {%rs69, %rs70}, %r133; + cvt.f32.bf16 %r134, %rs70; + cvt.f32.bf16 %r135, %rs69; + .loc 1 55 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:55:24 + mul.f32 %r136, %r117, %r135; + add.f32 %r137, %r134, %r134; + .loc 1 57 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:57:24 + mul.f32 %r138, %r137, %r125; + .loc 1 52 51 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:51 + mov.b32 %r139, {%rs43, %rs59}; + .loc 1 52 113 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:52:113 + mov.b32 {%rs71, %rs72}, %r139; + cvt.f32.bf16 %r140, %rs72; + cvt.f32.bf16 %r141, %rs71; + .loc 1 55 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:55:24 + mul.f32 %r142, %r118, %r141; + add.f32 %r143, %r140, %r140; + .loc 1 57 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:57:24 + mul.f32 %r144, %r143, %r125; + .loc 1 70 24 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:70:24 + fma.rn.f32 %r145, %r122, %r124, %r126; + fma.rn.f32 %r146, %r130, %r124, %r132; + fma.rn.f32 %r147, %r136, %r124, %r138; + fma.rn.f32 %r148, %r142, %r124, %r144; + .loc 1 72 29 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:72:29 + add.s64 %rd104, %rd12, %rd114; + add.s64 %rd105, %rd12, %rd115; + add.s64 %rd126, %rd12, %rd120; + add.s64 %rd106, %rd126, 2048; + add.s64 %rd107, %rd12, %rd122; + .loc 1 72 52 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:72:52 + cvt.rn.bf16.f32 %rs61, %r145; + cvt.rn.bf16.f32 %rs62, %r146; + cvt.rn.bf16.f32 %rs63, %r147; + cvt.rn.bf16.f32 %rs64, %r148; + // begin inline asm + @%p24 st.global.b16 [ %rd104 + 0 ], { %rs61 }; + // end inline asm + // begin inline asm + @%p25 st.global.b16 [ %rd105 + 0 ], { %rs62 }; + // end inline asm + // begin inline asm + @%p26 st.global.b16 [ %rd106 + 0 ], { %rs63 }; + // end inline asm + // begin inline asm + @%p27 st.global.b16 [ %rd107 + 0 ], { %rs64 }; + // end inline asm + .loc 1 46 40 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:46:40 + add.s32 %r152, %r152, 2048; + setp.lt.s32 %p41, %r152, %r25; + @%p41 bra $L__BB0_6; +$L__BB0_7: // %._crit_edge5 + .loc 1 46 4 // cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py:46:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 239 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe8 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 103 +.b8 110 +.b8 109 +.b8 106 +.b8 120 +.b8 105 +.b8 107 +.b8 118 +.b8 105 +.b8 53 +.b8 117 +.b8 108 +.b8 99 +.b8 121 +.b8 106 +.b8 51 +.b8 117 +.b8 111 +.b8 122 +.b8 105 +.b8 102 +.b8 51 +.b8 108 +.b8 101 +.b8 53 +.b8 104 +.b8 100 +.b8 50 +.b8 54 +.b8 107 +.b8 119 +.b8 50 +.b8 107 +.b8 106 +.b8 104 +.b8 107 +.b8 99 +.b8 98 +.b8 104 +.b8 117 +.b8 112 +.b8 113 +.b8 103 +.b8 117 +.b8 100 +.b8 113 +.b8 105 +.b8 51 +.b8 98 +.b8 119 +.b8 110 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 103 +.b8 110 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x39 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 100 +.b8 100 +.b8 95 +.b8 100 +.b8 105 +.b8 118 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 112 +.b8 111 +.b8 119 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc4:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd9:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..7baddeb0db2d862fbdefd8ce31afbbc7ff745f99 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.source @@ -0,0 +1,338 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":18:0) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc77 = loc(unknown) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc84 = loc("in_ptr0"(#loc)) +#loc85 = loc("in_ptr1"(#loc)) +#loc86 = loc("in_ptr2"(#loc)) +#loc87 = loc("in_ptr3"(#loc)) +#loc88 = loc("out_ptr1"(#loc)) +#loc89 = loc("ks0"(#loc)) +#loc90 = loc("xnumel"(#loc)) +#loc91 = loc("r0_numel"(#loc)) +#loc158 = loc("input"(#loc75)) +#loc159 = loc("a"(#loc80)) +#loc160 = loc("b"(#loc80)) +module { + tt.func public @triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc92) + %xoffset_0 = arith.constant 1 : i32 loc(#loc93) + %xoffset_1 = arith.constant 1 : i32 loc(#loc93) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc93) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc94) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc95) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc96) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc96) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc97) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<1x1xi32> loc(#loc97) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc98) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc99) + %_tmp8 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %_tmp8_8 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc100) + %c0_i32 = arith.constant 0 : i32 loc(#loc10) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc10) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc10) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc10) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc10) + %3 = ub.poison : i32 loc(#loc10) + %_tmp8_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp8_15 = %_tmp8_8) -> (tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc102) + %r0_index_16 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc102) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc103) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x2048xi32> loc(#loc103) + %tmp0 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc104) + %tmp0_18 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc104) + %tmp0_19 = arith.muli %tmp0_18, %tmp0 : tensor<1x1xi64> loc(#loc104) + %tmp0_20 = arith.extsi %r0_index_16 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc105) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc105) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<1x2048xi64> loc(#loc105) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc106) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc106) + %tmp0_25 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc107) + %tmp0_26 = arith.andi %r0_mask_17, %tmp0_25 : tensor<1x2048xi1> loc(#loc107) + %tmp0_27 = arith.constant 0.000000e+00 : f32 loc(#loc108) + %tmp0_28 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc108) + %tmp0_29 = arith.truncf %tmp0_28 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc108) + %tmp0_30 = tt.load %tmp0_24, %tmp0_26, %tmp0_29 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc108) + %tmp0_31 = arith.extf %tmp0_30 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc109) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc110) + %tmp1_32 = tt.addptr %tmp1, %r0_index_16 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc110) + %tmp1_33 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp1_34 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc111) + %tmp1_35 = arith.truncf %tmp1_34 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc111) + %tmp1_36 = tt.load %tmp1_32, %r0_mask_17, %tmp1_35 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc111) + %tmp1_37 = arith.extf %tmp1_36 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc112) + %tmp4 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc113) + %tmp4_38 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc113) + %tmp4_39 = arith.muli %tmp4_38, %tmp4 : tensor<1x1xi64> loc(#loc113) + %tmp4_40 = arith.extsi %r0_index_16 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc114) + %tmp4_41 = tt.broadcast %tmp4_39 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc114) + %tmp4_42 = arith.addi %tmp4_40, %tmp4_41 : tensor<1x2048xi64> loc(#loc114) + %tmp4_43 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc115) + %tmp4_44 = tt.addptr %tmp4_43, %tmp4_42 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc115) + %tmp4_45 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc116) + %tmp4_46 = arith.andi %r0_mask_17, %tmp4_45 : tensor<1x2048xi1> loc(#loc116) + %tmp4_47 = arith.constant 0.000000e+00 : f32 loc(#loc117) + %tmp4_48 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc117) + %tmp4_49 = arith.truncf %tmp4_48 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc117) + %tmp4_50 = tt.load %tmp4_44, %tmp4_46, %tmp4_49 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc117) + %tmp4_51 = arith.extf %tmp4_50 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc118) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<1x2048xf32> loc(#loc119) + %tmp6 = arith.mulf %tmp2, %tmp4_51 : tensor<1x2048xf32> loc(#loc120) + %tmp9 = arith.addf %_tmp8_15, %tmp6 : tensor<1x2048xf32> loc(#loc121) + %_tmp8_52 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc122) + %_tmp8_53 = arith.andi %r0_mask_17, %_tmp8_52 : tensor<1x2048xi1> loc(#loc122) + %_tmp8_54 = arith.select %_tmp8_53, %tmp9, %_tmp8_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc123) + scf.yield %_tmp8_54 : tensor<1x2048xf32> loc(#loc33) + } loc(#loc101) + %tmp8 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp8_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc124) + %tmp8_10 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc125) + %tmp14 = tt.splat %in_ptr3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc126) + %tmp14_11 = tt.addptr %tmp14, %xindex_5 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc126) + %tmp14_12 = tt.load %tmp14_11, %xmask_6 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc127) + %c0_i32_13 = arith.constant 0 : i32 loc(#loc38) + %c2048_i32_14 = arith.constant 2048 : i32 loc(#loc38) + %4 = arith.bitcast %c0_i32_13 : i32 to i32 loc(#loc38) + %5 = arith.bitcast %r0_numel : i32 to i32 loc(#loc38) + %6 = arith.bitcast %c2048_i32_14 : i32 to i32 loc(#loc38) + %7 = ub.poison : i32 loc(#loc38) + scf.for %r0_offset = %4 to %5 step %6 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc128) + %r0_index_15 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc128) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc129) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x2048xi32> loc(#loc129) + %tmp10 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc130) + %tmp10_17 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc130) + %tmp10_18 = arith.muli %tmp10_17, %tmp10 : tensor<1x1xi64> loc(#loc130) + %tmp10_19 = arith.extsi %r0_index_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc131) + %tmp10_20 = tt.broadcast %tmp10_18 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc131) + %tmp10_21 = arith.addi %tmp10_19, %tmp10_20 : tensor<1x2048xi64> loc(#loc131) + %tmp10_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc132) + %tmp10_23 = tt.addptr %tmp10_22, %tmp10_21 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc132) + %tmp10_24 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc133) + %tmp10_25 = arith.andi %r0_mask_16, %tmp10_24 : tensor<1x2048xi1> loc(#loc133) + %tmp10_26 = arith.constant 0.000000e+00 : f32 loc(#loc134) + %tmp10_27 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc134) + %tmp10_28 = arith.truncf %tmp10_27 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc134) + %tmp10_29 = tt.load %tmp10_23, %tmp10_25, %tmp10_28 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc134) + %tmp10_30 = arith.extf %tmp10_29 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc135) + %tmp11 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc136) + %tmp11_31 = tt.addptr %tmp11, %r0_index_15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc136) + %tmp11_32 = arith.constant 0.000000e+00 : f32 loc(#loc137) + %tmp11_33 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc137) + %tmp11_34 = arith.truncf %tmp11_33 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc137) + %tmp11_35 = tt.load %tmp11_31, %r0_mask_16, %tmp11_34 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc137) + %tmp11_36 = arith.extf %tmp11_35 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc138) + %tmp24 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc139) + %tmp24_37 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc139) + %tmp24_38 = arith.muli %tmp24_37, %tmp24 : tensor<1x1xi64> loc(#loc139) + %tmp24_39 = arith.extsi %r0_index_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc140) + %tmp24_40 = tt.broadcast %tmp24_38 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc140) + %tmp24_41 = arith.addi %tmp24_39, %tmp24_40 : tensor<1x2048xi64> loc(#loc140) + %tmp24_42 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc141) + %tmp24_43 = tt.addptr %tmp24_42, %tmp24_41 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc141) + %tmp24_44 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc142) + %tmp24_45 = arith.andi %r0_mask_16, %tmp24_44 : tensor<1x2048xi1> loc(#loc142) + %tmp24_46 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp24_47 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc143) + %tmp24_48 = arith.truncf %tmp24_47 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc143) + %tmp24_49 = tt.load %tmp24_43, %tmp24_45, %tmp24_48 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc143) + %tmp24_50 = arith.extf %tmp24_49 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc144) + %tmp12 = arith.mulf %tmp10_30, %tmp11_36 : tensor<1x2048xf32> loc(#loc145) + %tmp15 = tt.broadcast %tmp14_12 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc146) + %tmp15_51 = arith.mulf %tmp12, %tmp15 : tensor<1x2048xf32> loc(#loc146) + %tmp16 = arith.constant -5.000000e-01 : f32 loc(#loc147) + %tmp17 = arith.constant dense<-5.000000e-01> : tensor<1x1xf32> loc(#loc148) + %tmp17_52 = arith.mulf %tmp8_10, %tmp17 : tensor<1x1xf32> loc(#loc148) + %tmp18 = arith.mulf %tmp14_12, %tmp14_12 : tensor<1x1xf32> loc(#loc149) + %tmp19 = arith.mulf %tmp18, %tmp14_12 : tensor<1x1xf32> loc(#loc150) + %tmp20 = arith.mulf %tmp17_52, %tmp19 : tensor<1x1xf32> loc(#loc151) + %tmp22 = arith.sitofp %ks0 : i64 to f32 loc(#loc152) + %tmp23 = tt.splat %tmp22 : f32 -> tensor<1x1xf32> loc(#loc153) + %tmp23_53 = arith.divf %tmp20, %tmp23 : tensor<1x1xf32> loc(#loc153) + %tmp26 = arith.constant 2.000000e+00 : f32 loc(#loc154) + %tmp27 = arith.constant dense<2.000000e+00> : tensor<1x2048xf32> loc(#loc155) + %tmp27_54 = arith.mulf %tmp24_50, %tmp27 : tensor<1x2048xf32> loc(#loc155) + %tmp28 = tt.broadcast %tmp23_53 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc156) + %tmp28_55 = arith.mulf %tmp28, %tmp27_54 : tensor<1x2048xf32> loc(#loc156) + %tmp29 = arith.addf %tmp15_51, %tmp28_55 : tensor<1x2048xf32> loc(#loc157) + %8 = arith.extsi %xindex_5 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc69) + %9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc69) + %10 = arith.muli %9, %8 : tensor<1x1xi64> loc(#loc69) + %11 = arith.extsi %r0_index_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc70) + %12 = tt.broadcast %10 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc70) + %13 = arith.addi %11, %12 : tensor<1x2048xi64> loc(#loc70) + %14 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc71) + %15 = tt.addptr %14, %13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc71) + %16 = tt.broadcast %xmask_6 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc72) + %17 = arith.andi %r0_mask_16, %16 : tensor<1x2048xi1> loc(#loc72) + %18 = arith.truncf %tmp29 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc73) + tt.store %15, %18, %17 : tensor<1x2048x!tt.ptr> loc(#loc73) + } loc(#loc38) + tt.return loc(#loc74) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc75))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc76) + tt.reduce.return %2 : f32 loc(#loc76) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc76) + tt.return %0 : tensor<1xf32> loc(#loc78) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc79) + tt.return %1 : tensor<1xf32> loc(#loc79) + } loc(#loc75) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc80)), %b: f32 loc("b"(#loc80))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc81) + tt.return %0 : f32 loc(#loc82) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc83) + tt.return %1 : f32 loc(#loc83) + } loc(#loc80) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":21:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":21:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":22:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":24:37) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":27:43) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":28:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":29:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":30:29) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:41) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:111) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:94) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:45) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:41) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:34) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:111) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":37:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":40:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":42:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:48) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:8) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:31) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:40) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":47:31) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":48:29) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:46) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:42) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:35) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:61) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:113) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:35) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:42) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:95) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:42) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:61) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:51) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:113) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":55:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":57:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":58:16) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":59:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":60:24) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":61:24) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":62:24) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":64:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":65:25) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":67:16) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":68:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":69:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":70:24) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:40) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:36) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:62) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:52) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:4) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc92 = loc("xoffset"(#loc1)) +#loc93 = loc("xoffset"(#loc2)) +#loc94 = loc("xindex"(#loc3)) +#loc95 = loc("xindex"(#loc4)) +#loc96 = loc("xindex"(#loc5)) +#loc97 = loc("xmask"(#loc6)) +#loc98 = loc("r0_base"(#loc7)) +#loc99 = loc("r0_base"(#loc8)) +#loc100 = loc("_tmp8"(#loc9)) +#loc101 = loc("_tmp8"(#loc10)) +#loc102 = loc("r0_index"(#loc11)) +#loc103 = loc("r0_mask"(#loc12)) +#loc104 = loc("tmp0"(#loc13)) +#loc105 = loc("tmp0"(#loc14)) +#loc106 = loc("tmp0"(#loc15)) +#loc107 = loc("tmp0"(#loc16)) +#loc108 = loc("tmp0"(#loc17)) +#loc109 = loc("tmp0"(#loc18)) +#loc110 = loc("tmp1"(#loc19)) +#loc111 = loc("tmp1"(#loc20)) +#loc112 = loc("tmp1"(#loc21)) +#loc113 = loc("tmp4"(#loc22)) +#loc114 = loc("tmp4"(#loc23)) +#loc115 = loc("tmp4"(#loc24)) +#loc116 = loc("tmp4"(#loc25)) +#loc117 = loc("tmp4"(#loc26)) +#loc118 = loc("tmp4"(#loc27)) +#loc119 = loc("tmp2"(#loc28)) +#loc120 = loc("tmp6"(#loc29)) +#loc121 = loc("tmp9"(#loc30)) +#loc122 = loc("_tmp8"(#loc31)) +#loc123 = loc("_tmp8"(#loc32)) +#loc124 = loc("tmp8"(#loc34)) +#loc125 = loc("tmp8"(#loc35)) +#loc126 = loc("tmp14"(#loc36)) +#loc127 = loc("tmp14"(#loc37)) +#loc128 = loc("r0_index"(#loc39)) +#loc129 = loc("r0_mask"(#loc40)) +#loc130 = loc("tmp10"(#loc41)) +#loc131 = loc("tmp10"(#loc42)) +#loc132 = loc("tmp10"(#loc43)) +#loc133 = loc("tmp10"(#loc44)) +#loc134 = loc("tmp10"(#loc45)) +#loc135 = loc("tmp10"(#loc46)) +#loc136 = loc("tmp11"(#loc47)) +#loc137 = loc("tmp11"(#loc48)) +#loc138 = loc("tmp11"(#loc49)) +#loc139 = loc("tmp24"(#loc50)) +#loc140 = loc("tmp24"(#loc51)) +#loc141 = loc("tmp24"(#loc52)) +#loc142 = loc("tmp24"(#loc53)) +#loc143 = loc("tmp24"(#loc54)) +#loc144 = loc("tmp24"(#loc55)) +#loc145 = loc("tmp12"(#loc56)) +#loc146 = loc("tmp15"(#loc57)) +#loc147 = loc("tmp16"(#loc58)) +#loc148 = loc("tmp17"(#loc59)) +#loc149 = loc("tmp18"(#loc60)) +#loc150 = loc("tmp19"(#loc61)) +#loc151 = loc("tmp20"(#loc62)) +#loc152 = loc("tmp22"(#loc63)) +#loc153 = loc("tmp23"(#loc64)) +#loc154 = loc("tmp26"(#loc65)) +#loc155 = loc("tmp27"(#loc66)) +#loc156 = loc("tmp28"(#loc67)) +#loc157 = loc("tmp29"(#loc68)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fdf0a49afc2b02938adf353726d15834c453f3e3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttgir @@ -0,0 +1,213 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:25) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("in_ptr1"(#loc)) +#loc61 = loc("in_ptr2"(#loc)) +#loc62 = loc("in_ptr3"(#loc)) +#loc63 = loc("out_ptr1"(#loc)) +#loc64 = loc("ks0"(#loc)) +#loc65 = loc("xnumel"(#loc)) +#loc66 = loc("r0_numel"(#loc)) +#loc89 = loc("tmp8"(#loc26)) +#loc120 = loc(callsite(#loc1 at #loc89)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<-5.000000e-01> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc68) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc69) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc69) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32, #blocked> loc(#loc70) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc71) + %tmp0_4 = arith.muli %ks0, %tmp0 : i64 loc(#loc71) + %tmp0_5 = tt.splat %tmp0_4 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc117) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc73) + %tmp0_7 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc118) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc75) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc76) + %_tmp8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp8_13 = %cst_2) -> (tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc78) + %r0_index_14 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32, #blocked> loc(#loc78) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc70) + %tmp0_16 = arith.extsi %r0_index_14 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc72) + %tmp0_17 = arith.addi %tmp0_16, %tmp0_5 : tensor<1x2048xi64, #blocked> loc(#loc72) + %tmp0_18 = tt.addptr %tmp0_6, %tmp0_17 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc73) + %tmp0_19 = arith.andi %r0_mask_15, %tmp0_7 : tensor<1x2048xi1, #blocked> loc(#loc74) + %tmp0_20 = tt.load %tmp0_18, %tmp0_19, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc79) + %tmp0_21 = arith.extf %tmp0_20 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc80) + %tmp1_22 = tt.addptr %tmp1, %r0_index_14 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc75) + %tmp1_23 = tt.load %tmp1_22, %r0_mask_15, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc81) + %tmp1_24 = arith.extf %tmp1_23 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc82) + %tmp4_25 = tt.addptr %tmp4, %tmp0_17 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc76) + %tmp4_26 = tt.load %tmp4_25, %tmp0_19, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc83) + %tmp4_27 = arith.extf %tmp4_26 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc84) + %tmp2 = arith.mulf %tmp0_21, %tmp1_24 : tensor<1x2048xf32, #blocked> loc(#loc85) + %tmp6 = arith.mulf %tmp2, %tmp4_27 : tensor<1x2048xf32, #blocked> loc(#loc86) + %tmp9 = arith.addf %_tmp8_13, %tmp6 : tensor<1x2048xf32, #blocked> loc(#loc87) + %_tmp8_28 = arith.select %tmp0_19, %tmp9, %_tmp8_13 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc88) + scf.yield %_tmp8_28 : tensor<1x2048xf32, #blocked> loc(#loc24) + } loc(#loc77) + %tmp8 = "tt.reduce"(%_tmp8) <{axis = 1 : i32}> ({ + ^bb0(%tmp8_13: f32 loc(callsite(#loc1 at #loc89)), %tmp8_14: f32 loc(callsite(#loc1 at #loc89))): + %tmp8_15 = arith.addf %tmp8_13, %tmp8_14 : f32 loc(#loc121) + tt.reduce.return %tmp8_15 : f32 loc(#loc119) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc119) + %tmp8_8 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc90) + %tmp14 = tt.addptr %in_ptr3, %xoffset : !tt.ptr, i32 loc(#loc91) + %tmp14_9 = tt.splat %tmp14 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc92) + %tmp14_10 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc92) + %tmp14_11 = tt.load %tmp14_9, %tmp14_10 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc92) + %tmp15 = tt.broadcast %tmp14_11 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc93) + %tmp17 = arith.mulf %tmp8_8, %cst : tensor<1x1xf32, #blocked> loc(#loc94) + %tmp18 = arith.mulf %tmp14_11, %tmp14_11 : tensor<1x1xf32, #blocked> loc(#loc95) + %tmp19 = arith.mulf %tmp18, %tmp14_11 : tensor<1x1xf32, #blocked> loc(#loc96) + %tmp20 = arith.mulf %tmp17, %tmp19 : tensor<1x1xf32, #blocked> loc(#loc97) + %tmp22 = arith.sitofp %ks0 : i64 to f32 loc(#loc98) + %tmp23 = tt.splat %tmp22 : f32 -> tensor<1x1xf32, #blocked> loc(#loc99) + %tmp23_12 = arith.divf %tmp20, %tmp23 : tensor<1x1xf32, #blocked> loc(#loc99) + %tmp28 = tt.broadcast %tmp23_12 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc100) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc39) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc101) + %r0_index_13 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32, #blocked> loc(#loc101) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc102) + %tmp10 = arith.extsi %r0_index_13 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc103) + %tmp10_15 = arith.addi %tmp10, %tmp0_5 : tensor<1x2048xi64, #blocked> loc(#loc103) + %tmp10_16 = tt.addptr %tmp0_6, %tmp10_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc104) + %tmp10_17 = arith.andi %r0_mask_14, %tmp0_7 : tensor<1x2048xi1, #blocked> loc(#loc105) + %tmp10_18 = tt.load %tmp10_16, %tmp10_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc106) + %tmp10_19 = arith.extf %tmp10_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc107) + %tmp11 = tt.addptr %tmp1, %r0_index_13 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc108) + %tmp11_20 = tt.load %tmp11, %r0_mask_14, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc109) + %tmp11_21 = arith.extf %tmp11_20 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc110) + %tmp24 = tt.addptr %tmp4, %tmp10_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc111) + %tmp24_22 = tt.load %tmp24, %tmp10_17, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc112) + %tmp24_23 = arith.extf %tmp24_22 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc113) + %tmp12 = arith.mulf %tmp10_19, %tmp11_21 : tensor<1x2048xf32, #blocked> loc(#loc114) + %tmp15_24 = arith.mulf %tmp12, %tmp15 : tensor<1x2048xf32, #blocked> loc(#loc93) + %tmp27 = arith.mulf %tmp24_23, %cst_0 : tensor<1x2048xf32, #blocked> loc(#loc115) + %tmp28_25 = arith.mulf %tmp28, %tmp27 : tensor<1x2048xf32, #blocked> loc(#loc100) + %tmp29 = arith.addf %tmp15_24, %tmp28_25 : tensor<1x2048xf32, #blocked> loc(#loc116) + %1 = tt.addptr %0, %tmp10_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc39) + %2 = arith.truncf %tmp29 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked> loc(#loc57) + tt.store %1, %2, %tmp10_17 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc57) + } loc(#loc40) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":24:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":30:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:45) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:41) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:60) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":28:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":29:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:111) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:94) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:111) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":37:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":40:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":42:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:48) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:8) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":57:24) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":59:23) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":60:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":61:24) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":62:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":64:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":65:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":69:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:29) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:40) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":47:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":48:29) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:42) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:61) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:51) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:113) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:42) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:95) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:51) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:113) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":55:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":70:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:52) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:4) +#loc67 = loc("xoffset"(#loc2)) +#loc68 = loc("xmask"(#loc3)) +#loc69 = loc("r0_base"(#loc4)) +#loc70 = loc("r0_mask"(#loc5)) +#loc71 = loc("tmp0"(#loc6)) +#loc72 = loc("tmp0"(#loc7)) +#loc73 = loc("tmp0"(#loc8)) +#loc74 = loc("tmp0"(#loc9)) +#loc75 = loc("tmp1"(#loc10)) +#loc76 = loc("tmp4"(#loc11)) +#loc77 = loc("_tmp8"(#loc12)) +#loc78 = loc("r0_index"(#loc13)) +#loc79 = loc("tmp0"(#loc14)) +#loc80 = loc("tmp0"(#loc15)) +#loc81 = loc("tmp1"(#loc16)) +#loc82 = loc("tmp1"(#loc17)) +#loc83 = loc("tmp4"(#loc18)) +#loc84 = loc("tmp4"(#loc19)) +#loc85 = loc("tmp2"(#loc20)) +#loc86 = loc("tmp6"(#loc21)) +#loc87 = loc("tmp9"(#loc22)) +#loc88 = loc("_tmp8"(#loc23)) +#loc90 = loc("tmp8"(#loc28)) +#loc91 = loc("tmp14"(#loc29)) +#loc92 = loc("tmp14"(#loc30)) +#loc93 = loc("tmp15"(#loc31)) +#loc94 = loc("tmp17"(#loc32)) +#loc95 = loc("tmp18"(#loc33)) +#loc96 = loc("tmp19"(#loc34)) +#loc97 = loc("tmp20"(#loc35)) +#loc98 = loc("tmp22"(#loc36)) +#loc99 = loc("tmp23"(#loc37)) +#loc100 = loc("tmp28"(#loc38)) +#loc101 = loc("r0_index"(#loc41)) +#loc102 = loc("r0_mask"(#loc42)) +#loc103 = loc("tmp10"(#loc43)) +#loc104 = loc("tmp10"(#loc44)) +#loc105 = loc("tmp10"(#loc45)) +#loc106 = loc("tmp10"(#loc46)) +#loc107 = loc("tmp10"(#loc47)) +#loc108 = loc("tmp11"(#loc48)) +#loc109 = loc("tmp11"(#loc49)) +#loc110 = loc("tmp11"(#loc50)) +#loc111 = loc("tmp24"(#loc51)) +#loc112 = loc("tmp24"(#loc52)) +#loc113 = loc("tmp24"(#loc53)) +#loc114 = loc("tmp12"(#loc54)) +#loc115 = loc("tmp27"(#loc55)) +#loc116 = loc("tmp29"(#loc56)) +#loc117 = loc(fused[#loc72, #loc71]) +#loc118 = loc(fused[#loc74, #loc68]) +#loc119 = loc(callsite(#loc25 at #loc89)) +#loc121 = loc(callsite(#loc27 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..819885c07fcdaf03591629e6fec9b88539b30a70 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/S4DEXO6JVUGDKBK6NXCQZEQOY2H4GOSEUMMUJRDLKDOHUFXBNSUQ/triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2.ttir @@ -0,0 +1,226 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":18:0) +#loc1 = loc(unknown) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:25) +#loc61 = loc("in_ptr0"(#loc)) +#loc62 = loc("in_ptr1"(#loc)) +#loc63 = loc("in_ptr2"(#loc)) +#loc64 = loc("in_ptr3"(#loc)) +#loc65 = loc("out_ptr1"(#loc)) +#loc66 = loc("ks0"(#loc)) +#loc67 = loc("xnumel"(#loc)) +#loc68 = loc("r0_numel"(#loc)) +#loc92 = loc("tmp8"(#loc27)) +#loc124 = loc(callsite(#loc1 at #loc92)) +module { + tt.func public @triton_red_fused__to_copy_add_div_expand_mul_pow_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<2.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<-5.000000e-01> : tensor<1x1xf32> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc69) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc70) + %xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc70) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc71) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc72) + %_tmp8 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp8_8 = %cst_2) -> (tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc74) + %r0_index_9 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc74) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc75) + %r0_mask_10 = arith.cmpi slt, %r0_index_9, %r0_mask : tensor<1x2048xi32> loc(#loc75) + %tmp0 = arith.extsi %xoffset : i32 to i64 loc(#loc76) + %tmp0_11 = arith.muli %ks0, %tmp0 : i64 loc(#loc76) + %tmp0_12 = arith.extsi %r0_index_9 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc77) + %tmp0_13 = tt.splat %tmp0_11 : i64 -> tensor<1x2048xi64> loc(#loc121) + %tmp0_14 = arith.addi %tmp0_12, %tmp0_13 : tensor<1x2048xi64> loc(#loc77) + %tmp0_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc78) + %tmp0_16 = tt.addptr %tmp0_15, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc78) + %tmp0_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc122) + %tmp0_18 = arith.andi %r0_mask_10, %tmp0_17 : tensor<1x2048xi1> loc(#loc79) + %tmp0_19 = tt.load %tmp0_16, %tmp0_18, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc80) + %tmp0_20 = arith.extf %tmp0_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc81) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc82) + %tmp1_21 = tt.addptr %tmp1, %r0_index_9 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc82) + %tmp1_22 = tt.load %tmp1_21, %r0_mask_10, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc83) + %tmp1_23 = arith.extf %tmp1_22 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc84) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc85) + %tmp4_24 = tt.addptr %tmp4, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc85) + %tmp4_25 = tt.load %tmp4_24, %tmp0_18, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc86) + %tmp4_26 = arith.extf %tmp4_25 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc87) + %tmp2 = arith.mulf %tmp0_20, %tmp1_23 : tensor<1x2048xf32> loc(#loc88) + %tmp6 = arith.mulf %tmp2, %tmp4_26 : tensor<1x2048xf32> loc(#loc89) + %tmp9 = arith.addf %_tmp8_8, %tmp6 : tensor<1x2048xf32> loc(#loc90) + %_tmp8_27 = arith.select %tmp0_18, %tmp9, %_tmp8_8 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc91) + scf.yield %_tmp8_27 : tensor<1x2048xf32> loc(#loc25) + } loc(#loc73) + %tmp8 = "tt.reduce"(%_tmp8) <{axis = 1 : i32}> ({ + ^bb0(%tmp8_8: f32 loc(callsite(#loc1 at #loc92)), %tmp8_9: f32 loc(callsite(#loc1 at #loc92))): + %tmp8_10 = arith.addf %tmp8_8, %tmp8_9 : f32 loc(#loc127) + tt.reduce.return %tmp8_10 : f32 loc(#loc123) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc123) + %tmp8_5 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc93) + %tmp14 = tt.addptr %in_ptr3, %xoffset : !tt.ptr, i32 loc(#loc94) + %tmp14_6 = tt.splat %tmp14 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc94) + %tmp14_7 = tt.load %tmp14_6, %xmask_3 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc95) + scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc96) + %r0_index_8 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc96) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc97) + %r0_mask_9 = arith.cmpi slt, %r0_index_8, %r0_mask : tensor<1x2048xi32> loc(#loc97) + %tmp10 = arith.extsi %xoffset : i32 to i64 loc(#loc98) + %tmp10_10 = arith.muli %ks0, %tmp10 : i64 loc(#loc98) + %tmp10_11 = arith.extsi %r0_index_8 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc99) + %tmp10_12 = tt.splat %tmp10_10 : i64 -> tensor<1x2048xi64> loc(#loc125) + %tmp10_13 = arith.addi %tmp10_11, %tmp10_12 : tensor<1x2048xi64> loc(#loc99) + %tmp10_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc100) + %tmp10_15 = tt.addptr %tmp10_14, %tmp10_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc100) + %tmp10_16 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc126) + %tmp10_17 = arith.andi %r0_mask_9, %tmp10_16 : tensor<1x2048xi1> loc(#loc101) + %tmp10_18 = tt.load %tmp10_15, %tmp10_17, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc102) + %tmp10_19 = arith.extf %tmp10_18 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc103) + %tmp11 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc104) + %tmp11_20 = tt.addptr %tmp11, %r0_index_8 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc104) + %tmp11_21 = tt.load %tmp11_20, %r0_mask_9, %cst evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc105) + %tmp11_22 = arith.extf %tmp11_21 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc106) + %tmp24 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc107) + %tmp24_23 = tt.addptr %tmp24, %tmp10_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc107) + %tmp24_24 = tt.load %tmp24_23, %tmp10_17, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc108) + %tmp24_25 = arith.extf %tmp24_24 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc109) + %tmp12 = arith.mulf %tmp10_19, %tmp11_22 : tensor<1x2048xf32> loc(#loc110) + %tmp15 = tt.broadcast %tmp14_7 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc111) + %tmp15_26 = arith.mulf %tmp12, %tmp15 : tensor<1x2048xf32> loc(#loc111) + %tmp17 = arith.mulf %tmp8_5, %cst_1 : tensor<1x1xf32> loc(#loc112) + %tmp18 = arith.mulf %tmp14_7, %tmp14_7 : tensor<1x1xf32> loc(#loc113) + %tmp19 = arith.mulf %tmp18, %tmp14_7 : tensor<1x1xf32> loc(#loc114) + %tmp20 = arith.mulf %tmp17, %tmp19 : tensor<1x1xf32> loc(#loc115) + %tmp22 = arith.sitofp %ks0 : i64 to f32 loc(#loc116) + %tmp23 = tt.splat %tmp22 : f32 -> tensor<1x1xf32> loc(#loc117) + %tmp23_27 = arith.divf %tmp20, %tmp23 : tensor<1x1xf32> loc(#loc117) + %tmp27 = arith.mulf %tmp24_25, %cst_0 : tensor<1x2048xf32> loc(#loc118) + %tmp28 = tt.broadcast %tmp23_27 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc119) + %tmp28_28 = arith.mulf %tmp28, %tmp27 : tensor<1x2048xf32> loc(#loc119) + %tmp29 = arith.addf %tmp15_26, %tmp28_28 : tensor<1x2048xf32> loc(#loc120) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc58) + %1 = tt.addptr %0, %tmp10_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc58) + %2 = arith.truncf %tmp29 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc59) + tt.store %1, %2, %tmp10_17 : tensor<1x2048x!tt.ptr> loc(#loc59) + } loc(#loc32) + tt.return loc(#loc60) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":23:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":24:27) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":24:37) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":28:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":29:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":30:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:60) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:50) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":34:111) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:34) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":35:94) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:50) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":36:111) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":37:22) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":40:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":42:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:48) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":43:8) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":44:28) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:31) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":45:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:40) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":47:31) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":48:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:46) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:35) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:61) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:51) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":52:113) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:35) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:42) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":53:95) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":54:113) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":55:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":57:24) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":59:23) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":60:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":61:24) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":62:24) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":64:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":65:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":68:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":69:24) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":70:24) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:29) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":72:52) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/gn/cgnmjxikvi5ulcyj3uozif3le5hd26kw2kjhkcbhupqgudqi3bwn.py":46:4) +#loc69 = loc("xoffset"(#loc2)) +#loc70 = loc("xmask"(#loc3)) +#loc71 = loc("r0_base"(#loc4)) +#loc72 = loc("r0_base"(#loc5)) +#loc73 = loc("_tmp8"(#loc6)) +#loc74 = loc("r0_index"(#loc7)) +#loc75 = loc("r0_mask"(#loc8)) +#loc76 = loc("tmp0"(#loc9)) +#loc77 = loc("tmp0"(#loc10)) +#loc78 = loc("tmp0"(#loc11)) +#loc79 = loc("tmp0"(#loc12)) +#loc80 = loc("tmp0"(#loc13)) +#loc81 = loc("tmp0"(#loc14)) +#loc82 = loc("tmp1"(#loc15)) +#loc83 = loc("tmp1"(#loc16)) +#loc84 = loc("tmp1"(#loc17)) +#loc85 = loc("tmp4"(#loc18)) +#loc86 = loc("tmp4"(#loc19)) +#loc87 = loc("tmp4"(#loc20)) +#loc88 = loc("tmp2"(#loc21)) +#loc89 = loc("tmp6"(#loc22)) +#loc90 = loc("tmp9"(#loc23)) +#loc91 = loc("_tmp8"(#loc24)) +#loc93 = loc("tmp8"(#loc29)) +#loc94 = loc("tmp14"(#loc30)) +#loc95 = loc("tmp14"(#loc31)) +#loc96 = loc("r0_index"(#loc33)) +#loc97 = loc("r0_mask"(#loc34)) +#loc98 = loc("tmp10"(#loc35)) +#loc99 = loc("tmp10"(#loc36)) +#loc100 = loc("tmp10"(#loc37)) +#loc101 = loc("tmp10"(#loc38)) +#loc102 = loc("tmp10"(#loc39)) +#loc103 = loc("tmp10"(#loc40)) +#loc104 = loc("tmp11"(#loc41)) +#loc105 = loc("tmp11"(#loc42)) +#loc106 = loc("tmp11"(#loc43)) +#loc107 = loc("tmp24"(#loc44)) +#loc108 = loc("tmp24"(#loc45)) +#loc109 = loc("tmp24"(#loc46)) +#loc110 = loc("tmp12"(#loc47)) +#loc111 = loc("tmp15"(#loc48)) +#loc112 = loc("tmp17"(#loc49)) +#loc113 = loc("tmp18"(#loc50)) +#loc114 = loc("tmp19"(#loc51)) +#loc115 = loc("tmp20"(#loc52)) +#loc116 = loc("tmp22"(#loc53)) +#loc117 = loc("tmp23"(#loc54)) +#loc118 = loc("tmp27"(#loc55)) +#loc119 = loc("tmp28"(#loc56)) +#loc120 = loc("tmp29"(#loc57)) +#loc121 = loc(fused[#loc77, #loc76]) +#loc122 = loc(fused[#loc79, #loc70]) +#loc123 = loc(callsite(#loc26 at #loc92)) +#loc125 = loc(fused[#loc99, #loc98]) +#loc126 = loc(fused[#loc101, #loc70]) +#loc127 = loc(callsite(#loc28 at #loc123)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a238e9212efb98cf2d45b18e8d1a73eab710f541 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..74cd9c8205fe91651b0a525fb5be16786a0e2d43 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9bcdeb6ee3cec8031b1c86b2afb544278660ef41 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "94ff31a7ae71fb6793b7d83905d081fceae6f14b3b7d2b1c70eabf7683ae97dd", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 2048, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..98024aa19ce3e01c52bc5d5ca1d84339650fa58a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,1854 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 5, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = lshr i32 %11, 5, !dbg !9 + %13 = and i32 %11, 31, !dbg !9 + %14 = lshr i32 %11, 2, !dbg !9 + %15 = and i32 %14, 31, !dbg !9 + %16 = or disjoint i32 %10, %13, !dbg !10 + %17 = or disjoint i32 %15, %10, !dbg !10 + %18 = icmp slt i32 %16, %4, !dbg !11 + %19 = icmp slt i32 %17, %4, !dbg !11 + %20 = and i32 %11, 96, !dbg !12 + %21 = lshr exact i32 %20, 5, !dbg !12 + %22 = or disjoint i32 %21, 4, !dbg !12 + %23 = or disjoint i32 %21, 8, !dbg !12 + %24 = or disjoint i32 %21, 12, !dbg !12 + %25 = shl nuw nsw i32 %11, 2, !dbg !12 + %26 = and i32 %25, 12, !dbg !12 + %27 = sext i32 %16 to i64, !dbg !13 + %28 = sext i32 %17 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %29 = sdiv i64 %27, %.frozen, !dbg !14 + %30 = mul i64 %29, %.frozen, !dbg !13 + %.decomposed = sub i64 %27, %30, !dbg !13 + %.frozen81 = freeze i64 %3, !dbg !14 + %31 = sdiv i64 %28, %.frozen81, !dbg !14 + %32 = mul i64 %31, %.frozen81, !dbg !13 + %.decomposed82 = sub i64 %28, %32, !dbg !13 + %33 = zext nneg i32 %21 to i64, !dbg !15 + %34 = zext nneg i32 %22 to i64, !dbg !15 + %35 = zext nneg i32 %23 to i64, !dbg !15 + %36 = zext nneg i32 %24 to i64, !dbg !15 + %37 = zext nneg i32 %26 to i64, !dbg !15 + %38 = shl nsw i64 %29, 4, !dbg !16 + %39 = mul i64 %3, %33, !dbg !17 + %40 = mul i64 %3, %34, !dbg !17 + %41 = mul i64 %3, %35, !dbg !17 + %42 = mul i64 %3, %36, !dbg !17 + %43 = shl i64 %3, 4, !dbg !18 + %44 = mul i64 %29, %43, !dbg !19 + %45 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !20 + %46 = getelementptr i32, ptr addrspace(1) %45, i64 %33, !dbg !20 + %47 = getelementptr i32, ptr addrspace(1) %46, i64 %38, !dbg !20 + %48 = getelementptr i32, ptr addrspace(1) %47, i64 %39, !dbg !20 + %49 = getelementptr i32, ptr addrspace(1) %48, i64 %44, !dbg !20 + %50 = getelementptr i32, ptr addrspace(1) %45, i64 %34, !dbg !20 + %51 = getelementptr i32, ptr addrspace(1) %50, i64 %38, !dbg !20 + %52 = getelementptr i32, ptr addrspace(1) %51, i64 %40, !dbg !20 + %53 = getelementptr i32, ptr addrspace(1) %52, i64 %44, !dbg !20 + %54 = getelementptr i32, ptr addrspace(1) %45, i64 %35, !dbg !20 + %55 = getelementptr i32, ptr addrspace(1) %54, i64 %38, !dbg !20 + %56 = getelementptr i32, ptr addrspace(1) %55, i64 %41, !dbg !20 + %57 = getelementptr i32, ptr addrspace(1) %56, i64 %44, !dbg !20 + %58 = getelementptr i32, ptr addrspace(1) %45, i64 %36, !dbg !20 + %59 = getelementptr i32, ptr addrspace(1) %58, i64 %38, !dbg !20 + %60 = getelementptr i32, ptr addrspace(1) %59, i64 %42, !dbg !20 + %61 = getelementptr i32, ptr addrspace(1) %60, i64 %44, !dbg !20 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !21 + %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %49, i64 %62, i1 %18) #5, !dbg !21 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !21 + %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %53, i64 %64, i1 %18) #5, !dbg !21 + %66 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !21 + %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %57, i64 %66, i1 %18) #5, !dbg !21 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !21 + %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %61, i64 %68, i1 %18) #5, !dbg !21 + %70 = lshr i32 %11, 6, !dbg !22 + %.lobit = and i32 %70, 1, !dbg !22 + %.lobit1 = and i32 %12, 1, !dbg !22 + %71 = xor i32 %.lobit1, 1, !dbg !26 + %72 = xor i32 %.lobit, 1, !dbg !26 + %73 = mul nuw nsw i32 %63, %71, !dbg !27 + %74 = mul nuw nsw i32 %65, %71, !dbg !27 + %75 = mul nuw nsw i32 %67, %71, !dbg !27 + %76 = mul nuw nsw i32 %69, %71, !dbg !27 + %.idx67 = shl nuw nsw i32 %.lobit, 3, !dbg !28 + %77 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx67, !dbg !28 + %.idx68 = shl nuw nsw i32 %13, 6, !dbg !28 + %78 = getelementptr i8, ptr addrspace(3) %77, i32 %.idx68, !dbg !28 + %79 = getelementptr i32, ptr addrspace(3) %78, i32 %.lobit1, !dbg !28 + %80 = insertelement <1 x i32> poison, i32 %73, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %80, i1 true) #5, !dbg !28 + %81 = getelementptr i8, ptr addrspace(3) %78, i32 16, !dbg !28 + %82 = getelementptr i32, ptr addrspace(3) %81, i32 %.lobit1, !dbg !28 + %83 = insertelement <1 x i32> poison, i32 %74, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %83, i1 true) #5, !dbg !28 + %84 = getelementptr i8, ptr addrspace(3) %78, i32 32, !dbg !28 + %85 = getelementptr i32, ptr addrspace(3) %84, i32 %.lobit1, !dbg !28 + %86 = insertelement <1 x i32> poison, i32 %75, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %86, i1 true) #5, !dbg !28 + %87 = getelementptr i8, ptr addrspace(3) %78, i32 48, !dbg !28 + %88 = getelementptr i32, ptr addrspace(3) %87, i32 %.lobit1, !dbg !28 + %89 = insertelement <1 x i32> poison, i32 %76, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %89, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %90 = icmp samesign ult i32 %11, 512, !dbg !28 + %91 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %11, !dbg !28 + %92 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !28 + %94 = add i32 %93, %92, !dbg !31 + %95 = and i32 %11, 513, !dbg !28 + %96 = icmp eq i32 %95, 0, !dbg !28 + %97 = insertelement <1 x i32> poison, i32 %94, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %97, i1 %96) #5, !dbg !28 + %98 = getelementptr i8, ptr addrspace(3) %91, i32 512, !dbg !28 + %99 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 1, i32 31), !dbg !28 + %101 = add i32 %100, %99, !dbg !31 + %102 = insertelement <1 x i32> poison, i32 %101, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %102, i1 %96) #5, !dbg !28 + %103 = getelementptr i8, ptr addrspace(3) %91, i32 1024, !dbg !28 + %104 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 1, i32 31), !dbg !28 + %106 = add i32 %105, %104, !dbg !31 + %107 = insertelement <1 x i32> poison, i32 %106, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %107, i1 %96) #5, !dbg !28 + %108 = getelementptr i8, ptr addrspace(3) %91, i32 1536, !dbg !28 + %109 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 1, i32 31), !dbg !28 + %111 = add i32 %110, %109, !dbg !31 + %112 = insertelement <1 x i32> poison, i32 %111, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %112, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %113 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %114 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %115 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %116 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %117 = mul nuw nsw i32 %63, %.lobit1, !dbg !32 + %118 = mul nuw nsw i32 %65, %.lobit1, !dbg !32 + %119 = mul nuw nsw i32 %67, %.lobit1, !dbg !32 + %120 = mul nuw nsw i32 %69, %.lobit1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %121 = insertelement <1 x i32> poison, i32 %117, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %121, i1 true) #5, !dbg !28 + %122 = insertelement <1 x i32> poison, i32 %118, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %122, i1 true) #5, !dbg !28 + %123 = insertelement <1 x i32> poison, i32 %119, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %123, i1 true) #5, !dbg !28 + %124 = insertelement <1 x i32> poison, i32 %120, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %124, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %125 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !28 + %127 = add i32 %126, %125, !dbg !31 + %128 = insertelement <1 x i32> poison, i32 %127, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %128, i1 %96) #5, !dbg !28 + %129 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 1, i32 31), !dbg !28 + %131 = add i32 %130, %129, !dbg !31 + %132 = insertelement <1 x i32> poison, i32 %131, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %132, i1 %96) #5, !dbg !28 + %133 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !28 + %135 = add i32 %134, %133, !dbg !31 + %136 = insertelement <1 x i32> poison, i32 %135, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %136, i1 %96) #5, !dbg !28 + %137 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !28 + %139 = add i32 %138, %137, !dbg !31 + %140 = insertelement <1 x i32> poison, i32 %139, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %140, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %141 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %142 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %143 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %144 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %145 = mul nuw nsw i32 %71, %21, !dbg !33 + %146 = mul nuw nsw i32 %22, %71, !dbg !33 + %147 = mul nuw nsw i32 %23, %71, !dbg !33 + %148 = mul nuw nsw i32 %24, %71, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %149 = insertelement <1 x i32> poison, i32 %145, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %149, i1 true) #5, !dbg !28 + %150 = insertelement <1 x i32> poison, i32 %146, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %150, i1 true) #5, !dbg !28 + %151 = insertelement <1 x i32> poison, i32 %147, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %151, i1 true) #5, !dbg !28 + %152 = insertelement <1 x i32> poison, i32 %148, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %152, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %153 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !28 + %155 = add i32 %154, %153, !dbg !31 + %156 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %156, i1 %96) #5, !dbg !28 + %157 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !28 + %159 = add i32 %158, %157, !dbg !31 + %160 = insertelement <1 x i32> poison, i32 %159, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %160, i1 %96) #5, !dbg !28 + %161 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 1, i32 31), !dbg !28 + %163 = add i32 %162, %161, !dbg !31 + %164 = insertelement <1 x i32> poison, i32 %163, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %164, i1 %96) #5, !dbg !28 + %165 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !28 + %167 = add i32 %166, %165, !dbg !31 + %168 = insertelement <1 x i32> poison, i32 %167, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %168, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %169 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %170 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %171 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %172 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %173 = mul nuw nsw i32 %21, %.lobit1, !dbg !34 + %174 = mul nuw nsw i32 %22, %.lobit1, !dbg !34 + %175 = mul nuw nsw i32 %23, %.lobit1, !dbg !34 + %176 = mul nuw nsw i32 %24, %.lobit1, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %177 = insertelement <1 x i32> poison, i32 %173, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %177, i1 true) #5, !dbg !28 + %178 = insertelement <1 x i32> poison, i32 %174, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %178, i1 true) #5, !dbg !28 + %179 = insertelement <1 x i32> poison, i32 %175, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %179, i1 true) #5, !dbg !28 + %180 = insertelement <1 x i32> poison, i32 %176, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %180, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %181 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 1, i32 31), !dbg !28 + %183 = add i32 %182, %181, !dbg !31 + %184 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %184, i1 %96) #5, !dbg !28 + %185 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 1, i32 31), !dbg !28 + %187 = add i32 %186, %185, !dbg !31 + %188 = insertelement <1 x i32> poison, i32 %187, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %188, i1 %96) #5, !dbg !28 + %189 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !28 + %191 = add i32 %190, %189, !dbg !31 + %192 = insertelement <1 x i32> poison, i32 %191, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %192, i1 %96) #5, !dbg !28 + %193 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 1, i32 31), !dbg !28 + %195 = add i32 %194, %193, !dbg !31 + %196 = insertelement <1 x i32> poison, i32 %195, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %196, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %197 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %198 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %199 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %200 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %201 = trunc i32 %70 to i1, !dbg !35 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %202 = shl nuw nsw i32 %13, 2, !dbg !28 + %203 = or disjoint i32 %202, 1, !dbg !28 + %204 = or disjoint i32 %202, 2, !dbg !28 + %205 = or disjoint i32 %202, 3, !dbg !28 + %206 = shl nuw nsw i32 %.lobit1, 7, !dbg !28 + %207 = or disjoint i32 %206, %202, !dbg !28 + %.idx63 = shl nuw nsw i32 %207, 3, !dbg !28 + %208 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx63, !dbg !28 + %209 = getelementptr i32, ptr addrspace(3) %208, i32 %.lobit, !dbg !28 + %210 = or disjoint i32 %206, %203, !dbg !28 + %.idx64 = shl nuw nsw i32 %210, 3, !dbg !28 + %211 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx64, !dbg !28 + %212 = getelementptr i32, ptr addrspace(3) %211, i32 %.lobit, !dbg !28 + %213 = or disjoint i32 %206, %204, !dbg !28 + %.idx65 = shl nuw nsw i32 %213, 3, !dbg !28 + %214 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx65, !dbg !28 + %215 = getelementptr i32, ptr addrspace(3) %214, i32 %.lobit, !dbg !28 + %216 = or disjoint i32 %206, %205, !dbg !28 + %.idx66 = shl nuw nsw i32 %216, 3, !dbg !28 + %217 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx66, !dbg !28 + %218 = getelementptr i32, ptr addrspace(3) %217, i32 %.lobit, !dbg !28 + %219 = insertelement <2 x i32> poison, i32 %113, i64 0, !dbg !35 + %220 = insertelement <2 x i32> %219, i32 %114, i64 1, !dbg !35 + %221 = insertelement <2 x i32> poison, i32 %141, i64 0, !dbg !35 + %222 = insertelement <2 x i32> %221, i32 %142, i64 1, !dbg !35 + %223 = icmp sge <2 x i32> %220, %222, !dbg !35 + %224 = icmp ne <2 x i32> %220, %222, !dbg !35 + %225 = insertelement <2 x i32> poison, i32 %169, i64 0, !dbg !35 + %226 = insertelement <2 x i32> %225, i32 %170, i64 1, !dbg !35 + %227 = insertelement <2 x i32> poison, i32 %197, i64 0, !dbg !35 + %228 = insertelement <2 x i32> %227, i32 %198, i64 1, !dbg !35 + %229 = icmp sle <2 x i32> %226, %228, !dbg !35 + %230 = or <2 x i1> %224, %229, !dbg !35 + %231 = and <2 x i1> %223, %230, !dbg !35 + %232 = insertelement <2 x i1> poison, i1 %201, i64 0, !dbg !35 + %233 = shufflevector <2 x i1> %232, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !35 + %234 = xor <2 x i1> %231, %233, !dbg !35 + %235 = insertelement <2 x i32> %227, i32 %142, i64 1, !dbg !36 + %236 = insertelement <2 x i32> %225, i32 %114, i64 1, !dbg !36 + %237 = xor <2 x i32> %235, %236, !dbg !36 + %238 = insertelement <2 x i32> poison, i32 %198, i64 0, !dbg !36 + %239 = insertelement <2 x i32> %238, i32 %141, i64 1, !dbg !36 + %240 = insertelement <2 x i32> poison, i32 %170, i64 0, !dbg !36 + %241 = insertelement <2 x i32> %240, i32 %113, i64 1, !dbg !36 + %242 = xor <2 x i32> %239, %241, !dbg !36 + %243 = select <2 x i1> %234, <2 x i32> zeroinitializer, <2 x i32> %237, !dbg !37 + %244 = shufflevector <2 x i1> %234, <2 x i1> poison, <2 x i32> , !dbg !37 + %245 = select <2 x i1> %244, <2 x i32> zeroinitializer, <2 x i32> %242, !dbg !37 + %246 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !38 + %247 = insertelement <2 x i32> %246, i32 %65, i64 1, !dbg !38 + %248 = xor <2 x i32> %243, %247, !dbg !38 + %249 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !38 + %250 = insertelement <2 x i32> %249, i32 %63, i64 1, !dbg !38 + %251 = xor <2 x i32> %245, %250, !dbg !38 + %252 = extractelement <2 x i32> %251, i64 1, !dbg !32 + %253 = mul nuw nsw i32 %252, %72, !dbg !27 + %254 = extractelement <2 x i32> %248, i64 1, !dbg !32 + %255 = mul nuw nsw i32 %254, %72, !dbg !27 + %256 = insertelement <1 x i32> poison, i32 %253, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %256, i1 true) #5, !dbg !28 + %257 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %257, i1 true) #5, !dbg !28 + %258 = mul nuw nsw i32 %252, %.lobit, !dbg !32 + %259 = mul nuw nsw i32 %254, %.lobit, !dbg !32 + %260 = insertelement <1 x i32> poison, i32 %258, i64 0, !dbg !28 + %261 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !28 + %262 = extractelement <2 x i32> %248, i64 0, !dbg !34 + %263 = mul nuw nsw i32 %262, %72, !dbg !33 + %264 = extractelement <2 x i32> %251, i64 0, !dbg !34 + %265 = mul nuw nsw i32 %264, %72, !dbg !33 + %266 = insertelement <1 x i32> poison, i32 %263, i64 0, !dbg !28 + %267 = insertelement <1 x i32> poison, i32 %265, i64 0, !dbg !28 + %268 = mul nuw nsw i32 %262, %.lobit, !dbg !34 + %269 = mul nuw nsw i32 %264, %.lobit, !dbg !34 + %270 = insertelement <1 x i32> poison, i32 %268, i64 0, !dbg !28 + %271 = insertelement <1 x i32> poison, i32 %269, i64 0, !dbg !28 + %272 = insertelement <2 x i32> poison, i32 %115, i64 0, !dbg !35 + %273 = insertelement <2 x i32> %272, i32 %116, i64 1, !dbg !35 + %274 = insertelement <2 x i32> poison, i32 %143, i64 0, !dbg !35 + %275 = insertelement <2 x i32> %274, i32 %144, i64 1, !dbg !35 + %276 = icmp sge <2 x i32> %273, %275, !dbg !35 + %277 = icmp ne <2 x i32> %273, %275, !dbg !35 + %278 = insertelement <2 x i32> poison, i32 %171, i64 0, !dbg !35 + %279 = insertelement <2 x i32> %278, i32 %172, i64 1, !dbg !35 + %280 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !35 + %281 = insertelement <2 x i32> %280, i32 %200, i64 1, !dbg !35 + %282 = icmp sle <2 x i32> %279, %281, !dbg !35 + %283 = or <2 x i1> %277, %282, !dbg !35 + %284 = and <2 x i1> %276, %283, !dbg !35 + %285 = xor <2 x i1> %284, %233, !dbg !35 + %286 = insertelement <2 x i32> %280, i32 %144, i64 1, !dbg !36 + %287 = insertelement <2 x i32> %278, i32 %116, i64 1, !dbg !36 + %288 = xor <2 x i32> %286, %287, !dbg !36 + %289 = insertelement <2 x i32> poison, i32 %200, i64 0, !dbg !36 + %290 = insertelement <2 x i32> %289, i32 %143, i64 1, !dbg !36 + %291 = insertelement <2 x i32> poison, i32 %172, i64 0, !dbg !36 + %292 = insertelement <2 x i32> %291, i32 %115, i64 1, !dbg !36 + %293 = xor <2 x i32> %290, %292, !dbg !36 + %294 = select <2 x i1> %285, <2 x i32> zeroinitializer, <2 x i32> %288, !dbg !37 + %295 = shufflevector <2 x i1> %285, <2 x i1> poison, <2 x i32> , !dbg !37 + %296 = select <2 x i1> %295, <2 x i32> zeroinitializer, <2 x i32> %293, !dbg !37 + %297 = insertelement <2 x i32> poison, i32 %23, i64 0, !dbg !38 + %298 = insertelement <2 x i32> %297, i32 %69, i64 1, !dbg !38 + %299 = xor <2 x i32> %294, %298, !dbg !38 + %300 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !38 + %301 = insertelement <2 x i32> %300, i32 %67, i64 1, !dbg !38 + %302 = xor <2 x i32> %296, %301, !dbg !38 + %303 = extractelement <2 x i32> %302, i64 1, !dbg !32 + %304 = mul nuw nsw i32 %303, %72, !dbg !27 + %305 = extractelement <2 x i32> %299, i64 1, !dbg !32 + %306 = mul nuw nsw i32 %305, %72, !dbg !27 + %307 = insertelement <1 x i32> poison, i32 %304, i64 0, !dbg !28 + %308 = insertelement <1 x i32> poison, i32 %306, i64 0, !dbg !28 + %309 = mul nuw nsw i32 %303, %.lobit, !dbg !32 + %310 = mul nuw nsw i32 %305, %.lobit, !dbg !32 + %311 = insertelement <1 x i32> poison, i32 %309, i64 0, !dbg !28 + %312 = insertelement <1 x i32> poison, i32 %310, i64 0, !dbg !28 + %313 = extractelement <2 x i32> %299, i64 0, !dbg !34 + %314 = mul nuw nsw i32 %313, %72, !dbg !33 + %315 = extractelement <2 x i32> %302, i64 0, !dbg !34 + %316 = mul nuw nsw i32 %315, %72, !dbg !33 + %317 = insertelement <1 x i32> poison, i32 %314, i64 0, !dbg !28 + %318 = insertelement <1 x i32> poison, i32 %316, i64 0, !dbg !28 + %319 = mul nuw nsw i32 %313, %.lobit, !dbg !34 + %320 = mul nuw nsw i32 %315, %.lobit, !dbg !34 + %321 = insertelement <1 x i32> poison, i32 %319, i64 0, !dbg !28 + %322 = insertelement <1 x i32> poison, i32 %320, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %307, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %308, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %323 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %324 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 1, i32 31), !dbg !28 + %325 = add i32 %324, %323, !dbg !31 + %326 = insertelement <1 x i32> poison, i32 %325, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %326, i1 %96) #5, !dbg !28 + %327 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 1, i32 31), !dbg !28 + %329 = add i32 %328, %327, !dbg !31 + %330 = insertelement <1 x i32> poison, i32 %329, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %330, i1 %96) #5, !dbg !28 + %331 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 1, i32 31), !dbg !28 + %333 = add i32 %332, %331, !dbg !31 + %334 = insertelement <1 x i32> poison, i32 %333, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %334, i1 %96) #5, !dbg !28 + %335 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %336 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 1, i32 31), !dbg !28 + %337 = add i32 %336, %335, !dbg !31 + %338 = insertelement <1 x i32> poison, i32 %337, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %338, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %339 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %340 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %341 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %342 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %260, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %261, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %311, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %312, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %343 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 1, i32 31), !dbg !28 + %345 = add i32 %344, %343, !dbg !31 + %346 = insertelement <1 x i32> poison, i32 %345, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %346, i1 %96) #5, !dbg !28 + %347 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %347, i32 1, i32 31), !dbg !28 + %349 = add i32 %348, %347, !dbg !31 + %350 = insertelement <1 x i32> poison, i32 %349, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %350, i1 %96) #5, !dbg !28 + %351 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 1, i32 31), !dbg !28 + %353 = add i32 %352, %351, !dbg !31 + %354 = insertelement <1 x i32> poison, i32 %353, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %354, i1 %96) #5, !dbg !28 + %355 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %355, i32 1, i32 31), !dbg !28 + %357 = add i32 %356, %355, !dbg !31 + %358 = insertelement <1 x i32> poison, i32 %357, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %358, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %359 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %360 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %361 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %362 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %266, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %267, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %317, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %318, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %363 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 1, i32 31), !dbg !28 + %365 = add i32 %364, %363, !dbg !31 + %366 = insertelement <1 x i32> poison, i32 %365, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %366, i1 %96) #5, !dbg !28 + %367 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %368 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %367, i32 1, i32 31), !dbg !28 + %369 = add i32 %368, %367, !dbg !31 + %370 = insertelement <1 x i32> poison, i32 %369, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %370, i1 %96) #5, !dbg !28 + %371 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %372 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %371, i32 1, i32 31), !dbg !28 + %373 = add i32 %372, %371, !dbg !31 + %374 = insertelement <1 x i32> poison, i32 %373, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %374, i1 %96) #5, !dbg !28 + %375 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %376 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %375, i32 1, i32 31), !dbg !28 + %377 = add i32 %376, %375, !dbg !31 + %378 = insertelement <1 x i32> poison, i32 %377, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %378, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %379 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %380 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %381 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %382 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %270, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %271, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %321, i1 true) #5, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %322, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %383 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %384 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !28 + %385 = add i32 %384, %383, !dbg !31 + %386 = insertelement <1 x i32> poison, i32 %385, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %386, i1 %96) #5, !dbg !28 + %387 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 1, i32 31), !dbg !28 + %389 = add i32 %388, %387, !dbg !31 + %390 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %390, i1 %96) #5, !dbg !28 + %391 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %392 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %391, i32 1, i32 31), !dbg !28 + %393 = add i32 %392, %391, !dbg !31 + %394 = insertelement <1 x i32> poison, i32 %393, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %394, i1 %96) #5, !dbg !28 + %395 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %396 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 1, i32 31), !dbg !28 + %397 = add i32 %396, %395, !dbg !31 + %398 = insertelement <1 x i32> poison, i32 %397, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %398, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %399 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %400 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %401 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %402 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %403 = icmp slt i32 %339, %359, !dbg !39 + %404 = icmp sge i32 %340, %360, !dbg !39 + %405 = icmp slt i32 %341, %361, !dbg !39 + %406 = icmp sge i32 %342, %362, !dbg !39 + %407 = icmp eq i32 %339, %359, !dbg !40 + %408 = icmp ne i32 %340, %360, !dbg !40 + %409 = icmp eq i32 %341, %361, !dbg !40 + %410 = icmp ne i32 %342, %362, !dbg !40 + %411 = icmp sgt i32 %379, %399, !dbg !41 + %412 = icmp sle i32 %380, %400, !dbg !41 + %413 = icmp sgt i32 %381, %401, !dbg !41 + %414 = icmp sle i32 %382, %402, !dbg !41 + %415 = insertelement <2 x i1> poison, i1 %407, i64 0, !dbg !42 + %416 = insertelement <2 x i1> %415, i1 %408, i64 1, !dbg !42 + %417 = insertelement <2 x i1> poison, i1 %411, i64 0, !dbg !42 + %418 = insertelement <2 x i1> %417, i1 %412, i64 1, !dbg !42 + %419 = or <2 x i1> %416, %418, !dbg !42 + %420 = and <2 x i1> %416, %418, !dbg !42 + %421 = shufflevector <2 x i1> %420, <2 x i1> %419, <2 x i32> , !dbg !42 + %422 = insertelement <2 x i1> poison, i1 %409, i64 0, !dbg !43 + %423 = insertelement <2 x i1> %422, i1 %410, i64 1, !dbg !43 + %424 = insertelement <2 x i1> poison, i1 %413, i64 0, !dbg !43 + %425 = insertelement <2 x i1> %424, i1 %414, i64 1, !dbg !43 + %426 = or <2 x i1> %423, %425, !dbg !43 + %427 = and <2 x i1> %423, %425, !dbg !43 + %428 = shufflevector <2 x i1> %427, <2 x i1> %426, <2 x i32> , !dbg !43 + %429 = insertelement <2 x i1> poison, i1 %403, i64 0, !dbg !43 + %430 = insertelement <2 x i1> %429, i1 %404, i64 1, !dbg !43 + %431 = and <2 x i1> %430, %421, !dbg !43 + %432 = or <2 x i1> %430, %421, !dbg !43 + %433 = shufflevector <2 x i1> %432, <2 x i1> %431, <2 x i32> , !dbg !43 + %434 = insertelement <2 x i1> poison, i1 %405, i64 0, !dbg !44 + %435 = insertelement <2 x i1> %434, i1 %406, i64 1, !dbg !44 + %436 = and <2 x i1> %435, %428, !dbg !44 + %437 = or <2 x i1> %435, %428, !dbg !44 + %438 = shufflevector <2 x i1> %437, <2 x i1> %436, <2 x i32> , !dbg !44 + %439 = insertelement <2 x i32> poison, i32 %399, i64 0, !dbg !36 + %440 = insertelement <2 x i32> %439, i32 %360, i64 1, !dbg !36 + %441 = insertelement <2 x i32> poison, i32 %379, i64 0, !dbg !36 + %442 = insertelement <2 x i32> %441, i32 %340, i64 1, !dbg !36 + %443 = xor <2 x i32> %440, %442, !dbg !36 + %444 = insertelement <2 x i32> poison, i32 %400, i64 0, !dbg !36 + %445 = insertelement <2 x i32> %444, i32 %359, i64 1, !dbg !36 + %446 = insertelement <2 x i32> poison, i32 %380, i64 0, !dbg !36 + %447 = insertelement <2 x i32> %446, i32 %339, i64 1, !dbg !36 + %448 = xor <2 x i32> %445, %447, !dbg !36 + %449 = insertelement <2 x i32> poison, i32 %401, i64 0, !dbg !36 + %450 = insertelement <2 x i32> %449, i32 %362, i64 1, !dbg !36 + %451 = insertelement <2 x i32> poison, i32 %381, i64 0, !dbg !36 + %452 = insertelement <2 x i32> %451, i32 %342, i64 1, !dbg !36 + %453 = xor <2 x i32> %450, %452, !dbg !36 + %454 = insertelement <2 x i32> poison, i32 %402, i64 0, !dbg !36 + %455 = insertelement <2 x i32> %454, i32 %361, i64 1, !dbg !36 + %456 = insertelement <2 x i32> poison, i32 %382, i64 0, !dbg !36 + %457 = insertelement <2 x i32> %456, i32 %341, i64 1, !dbg !36 + %458 = xor <2 x i32> %455, %457, !dbg !36 + %459 = select <2 x i1> %433, <2 x i32> %443, <2 x i32> zeroinitializer, !dbg !37 + %460 = shufflevector <2 x i1> %431, <2 x i1> %432, <2 x i32> , !dbg !37 + %461 = select <2 x i1> %460, <2 x i32> %448, <2 x i32> zeroinitializer, !dbg !37 + %462 = select <2 x i1> %438, <2 x i32> %453, <2 x i32> zeroinitializer, !dbg !37 + %463 = shufflevector <2 x i1> %436, <2 x i1> %437, <2 x i32> , !dbg !37 + %464 = select <2 x i1> %463, <2 x i32> %458, <2 x i32> zeroinitializer, !dbg !37 + %465 = xor <2 x i32> %459, %248, !dbg !38 + %466 = xor <2 x i32> %461, %251, !dbg !38 + %467 = xor <2 x i32> %462, %299, !dbg !38 + %468 = xor <2 x i32> %464, %302, !dbg !38 + %469 = extractelement <2 x i32> %466, i64 1, !dbg !32 + %470 = mul nuw nsw i32 %469, %71, !dbg !27 + %471 = extractelement <2 x i32> %465, i64 1, !dbg !32 + %472 = mul nuw nsw i32 %471, %71, !dbg !27 + %473 = extractelement <2 x i32> %468, i64 1, !dbg !32 + %474 = mul nuw nsw i32 %473, %71, !dbg !27 + %475 = extractelement <2 x i32> %467, i64 1, !dbg !32 + %476 = mul nuw nsw i32 %475, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %477 = insertelement <1 x i32> poison, i32 %470, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %477, i1 true) #5, !dbg !28 + %478 = insertelement <1 x i32> poison, i32 %472, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %478, i1 true) #5, !dbg !28 + %479 = insertelement <1 x i32> poison, i32 %474, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %479, i1 true) #5, !dbg !28 + %480 = insertelement <1 x i32> poison, i32 %476, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %480, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %481 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %482 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %481, i32 1, i32 31), !dbg !28 + %483 = add i32 %482, %481, !dbg !31 + %484 = insertelement <1 x i32> poison, i32 %483, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %484, i1 %96) #5, !dbg !28 + %485 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %486 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %485, i32 1, i32 31), !dbg !28 + %487 = add i32 %486, %485, !dbg !31 + %488 = insertelement <1 x i32> poison, i32 %487, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %488, i1 %96) #5, !dbg !28 + %489 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %490 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %489, i32 1, i32 31), !dbg !28 + %491 = add i32 %490, %489, !dbg !31 + %492 = insertelement <1 x i32> poison, i32 %491, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %492, i1 %96) #5, !dbg !28 + %493 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %494 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %493, i32 1, i32 31), !dbg !28 + %495 = add i32 %494, %493, !dbg !31 + %496 = insertelement <1 x i32> poison, i32 %495, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %496, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %497 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %498 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %499 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %500 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %501 = mul nuw nsw i32 %469, %.lobit1, !dbg !32 + %502 = mul nuw nsw i32 %471, %.lobit1, !dbg !32 + %503 = mul nuw nsw i32 %473, %.lobit1, !dbg !32 + %504 = mul nuw nsw i32 %475, %.lobit1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %505 = insertelement <1 x i32> poison, i32 %501, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %505, i1 true) #5, !dbg !28 + %506 = insertelement <1 x i32> poison, i32 %502, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %506, i1 true) #5, !dbg !28 + %507 = insertelement <1 x i32> poison, i32 %503, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %507, i1 true) #5, !dbg !28 + %508 = insertelement <1 x i32> poison, i32 %504, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %508, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %509 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !28 + %511 = add i32 %510, %509, !dbg !31 + %512 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %512, i1 %96) #5, !dbg !28 + %513 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !28 + %515 = add i32 %514, %513, !dbg !31 + %516 = insertelement <1 x i32> poison, i32 %515, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %516, i1 %96) #5, !dbg !28 + %517 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 1, i32 31), !dbg !28 + %519 = add i32 %518, %517, !dbg !31 + %520 = insertelement <1 x i32> poison, i32 %519, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %520, i1 %96) #5, !dbg !28 + %521 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 1, i32 31), !dbg !28 + %523 = add i32 %522, %521, !dbg !31 + %524 = insertelement <1 x i32> poison, i32 %523, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %524, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %525 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %526 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %527 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %528 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %529 = extractelement <2 x i32> %465, i64 0, !dbg !34 + %530 = mul nuw nsw i32 %529, %71, !dbg !33 + %531 = extractelement <2 x i32> %466, i64 0, !dbg !34 + %532 = mul nuw nsw i32 %531, %71, !dbg !33 + %533 = extractelement <2 x i32> %467, i64 0, !dbg !34 + %534 = mul nuw nsw i32 %533, %71, !dbg !33 + %535 = extractelement <2 x i32> %468, i64 0, !dbg !34 + %536 = mul nuw nsw i32 %535, %71, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %537 = insertelement <1 x i32> poison, i32 %530, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %537, i1 true) #5, !dbg !28 + %538 = insertelement <1 x i32> poison, i32 %532, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %538, i1 true) #5, !dbg !28 + %539 = insertelement <1 x i32> poison, i32 %534, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %539, i1 true) #5, !dbg !28 + %540 = insertelement <1 x i32> poison, i32 %536, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %540, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %541 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %542 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %541, i32 1, i32 31), !dbg !28 + %543 = add i32 %542, %541, !dbg !31 + %544 = insertelement <1 x i32> poison, i32 %543, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %544, i1 %96) #5, !dbg !28 + %545 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %546 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 1, i32 31), !dbg !28 + %547 = add i32 %546, %545, !dbg !31 + %548 = insertelement <1 x i32> poison, i32 %547, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %548, i1 %96) #5, !dbg !28 + %549 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %550 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %549, i32 1, i32 31), !dbg !28 + %551 = add i32 %550, %549, !dbg !31 + %552 = insertelement <1 x i32> poison, i32 %551, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %552, i1 %96) #5, !dbg !28 + %553 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %554 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %553, i32 1, i32 31), !dbg !28 + %555 = add i32 %554, %553, !dbg !31 + %556 = insertelement <1 x i32> poison, i32 %555, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %556, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %557 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %558 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %559 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %560 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %561 = mul nuw nsw i32 %529, %.lobit1, !dbg !34 + %562 = mul nuw nsw i32 %531, %.lobit1, !dbg !34 + %563 = mul nuw nsw i32 %533, %.lobit1, !dbg !34 + %564 = mul nuw nsw i32 %535, %.lobit1, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %565 = insertelement <1 x i32> poison, i32 %561, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %565, i1 true) #5, !dbg !28 + %566 = insertelement <1 x i32> poison, i32 %562, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %566, i1 true) #5, !dbg !28 + %567 = insertelement <1 x i32> poison, i32 %563, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %567, i1 true) #5, !dbg !28 + %568 = insertelement <1 x i32> poison, i32 %564, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %568, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %569 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %569, i32 1, i32 31), !dbg !28 + %571 = add i32 %570, %569, !dbg !31 + %572 = insertelement <1 x i32> poison, i32 %571, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %572, i1 %96) #5, !dbg !28 + %573 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %574 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %573, i32 1, i32 31), !dbg !28 + %575 = add i32 %574, %573, !dbg !31 + %576 = insertelement <1 x i32> poison, i32 %575, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %576, i1 %96) #5, !dbg !28 + %577 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %578 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %577, i32 1, i32 31), !dbg !28 + %579 = add i32 %578, %577, !dbg !31 + %580 = insertelement <1 x i32> poison, i32 %579, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %580, i1 %96) #5, !dbg !28 + %581 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %582 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %581, i32 1, i32 31), !dbg !28 + %583 = add i32 %582, %581, !dbg !31 + %584 = insertelement <1 x i32> poison, i32 %583, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %584, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %585 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %586 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %587 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %588 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %589 = icmp slt i32 %497, %525, !dbg !39 + %590 = icmp sge i32 %498, %526, !dbg !39 + %591 = icmp slt i32 %499, %527, !dbg !39 + %592 = icmp sge i32 %500, %528, !dbg !39 + %593 = icmp eq i32 %497, %525, !dbg !40 + %594 = icmp ne i32 %498, %526, !dbg !40 + %595 = icmp eq i32 %499, %527, !dbg !40 + %596 = icmp ne i32 %500, %528, !dbg !40 + %597 = icmp sgt i32 %557, %585, !dbg !41 + %598 = icmp sle i32 %558, %586, !dbg !41 + %599 = icmp sgt i32 %559, %587, !dbg !41 + %600 = icmp sle i32 %560, %588, !dbg !41 + %601 = insertelement <2 x i1> poison, i1 %593, i64 0, !dbg !42 + %602 = insertelement <2 x i1> %601, i1 %594, i64 1, !dbg !42 + %603 = insertelement <2 x i1> poison, i1 %597, i64 0, !dbg !42 + %604 = insertelement <2 x i1> %603, i1 %598, i64 1, !dbg !42 + %605 = or <2 x i1> %602, %604, !dbg !42 + %606 = and <2 x i1> %602, %604, !dbg !42 + %607 = shufflevector <2 x i1> %606, <2 x i1> %605, <2 x i32> , !dbg !42 + %608 = insertelement <2 x i1> poison, i1 %595, i64 0, !dbg !43 + %609 = insertelement <2 x i1> %608, i1 %596, i64 1, !dbg !43 + %610 = insertelement <2 x i1> poison, i1 %599, i64 0, !dbg !43 + %611 = insertelement <2 x i1> %610, i1 %600, i64 1, !dbg !43 + %612 = or <2 x i1> %609, %611, !dbg !43 + %613 = and <2 x i1> %609, %611, !dbg !43 + %614 = shufflevector <2 x i1> %613, <2 x i1> %612, <2 x i32> , !dbg !43 + %615 = insertelement <2 x i1> poison, i1 %589, i64 0, !dbg !43 + %616 = insertelement <2 x i1> %615, i1 %590, i64 1, !dbg !43 + %617 = and <2 x i1> %616, %607, !dbg !43 + %618 = or <2 x i1> %616, %607, !dbg !43 + %619 = shufflevector <2 x i1> %618, <2 x i1> %617, <2 x i32> , !dbg !43 + %620 = insertelement <2 x i1> poison, i1 %591, i64 0, !dbg !44 + %621 = insertelement <2 x i1> %620, i1 %592, i64 1, !dbg !44 + %622 = and <2 x i1> %621, %614, !dbg !44 + %623 = or <2 x i1> %621, %614, !dbg !44 + %624 = shufflevector <2 x i1> %623, <2 x i1> %622, <2 x i32> , !dbg !44 + %625 = insertelement <2 x i32> poison, i32 %585, i64 0, !dbg !36 + %626 = insertelement <2 x i32> %625, i32 %526, i64 1, !dbg !36 + %627 = insertelement <2 x i32> poison, i32 %557, i64 0, !dbg !36 + %628 = insertelement <2 x i32> %627, i32 %498, i64 1, !dbg !36 + %629 = xor <2 x i32> %626, %628, !dbg !36 + %630 = insertelement <2 x i32> poison, i32 %586, i64 0, !dbg !36 + %631 = insertelement <2 x i32> %630, i32 %525, i64 1, !dbg !36 + %632 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !36 + %633 = insertelement <2 x i32> %632, i32 %497, i64 1, !dbg !36 + %634 = xor <2 x i32> %631, %633, !dbg !36 + %635 = insertelement <2 x i32> poison, i32 %587, i64 0, !dbg !36 + %636 = insertelement <2 x i32> %635, i32 %528, i64 1, !dbg !36 + %637 = insertelement <2 x i32> poison, i32 %559, i64 0, !dbg !36 + %638 = insertelement <2 x i32> %637, i32 %500, i64 1, !dbg !36 + %639 = xor <2 x i32> %636, %638, !dbg !36 + %640 = insertelement <2 x i32> poison, i32 %588, i64 0, !dbg !36 + %641 = insertelement <2 x i32> %640, i32 %527, i64 1, !dbg !36 + %642 = insertelement <2 x i32> poison, i32 %560, i64 0, !dbg !36 + %643 = insertelement <2 x i32> %642, i32 %499, i64 1, !dbg !36 + %644 = xor <2 x i32> %641, %643, !dbg !36 + %645 = select <2 x i1> %619, <2 x i32> %629, <2 x i32> zeroinitializer, !dbg !37 + %646 = shufflevector <2 x i1> %617, <2 x i1> %618, <2 x i32> , !dbg !37 + %647 = select <2 x i1> %646, <2 x i32> %634, <2 x i32> zeroinitializer, !dbg !37 + %648 = select <2 x i1> %624, <2 x i32> %639, <2 x i32> zeroinitializer, !dbg !37 + %649 = shufflevector <2 x i1> %622, <2 x i1> %623, <2 x i32> , !dbg !37 + %650 = select <2 x i1> %649, <2 x i32> %644, <2 x i32> zeroinitializer, !dbg !37 + %651 = xor <2 x i32> %645, %465, !dbg !38 + %652 = xor <2 x i32> %647, %466, !dbg !38 + %653 = xor <2 x i32> %648, %467, !dbg !38 + %654 = xor <2 x i32> %650, %468, !dbg !38 + %655 = extractelement <2 x i32> %651, i64 1, !dbg !39 + %656 = extractelement <2 x i32> %652, i64 1, !dbg !39 + %657 = icmp slt i32 %656, %655, !dbg !39 + %658 = extractelement <2 x i32> %653, i64 1, !dbg !39 + %659 = extractelement <2 x i32> %654, i64 1, !dbg !39 + %660 = icmp sge i32 %659, %658, !dbg !39 + %661 = icmp eq <2 x i32> %651, %652, !dbg !40 + %662 = icmp sgt <2 x i32> %651, %652, !dbg !40 + %663 = icmp ne <2 x i32> %653, %654, !dbg !40 + %664 = icmp sle <2 x i32> %653, %654, !dbg !40 + %shift = shufflevector <2 x i1> %661, <2 x i1> poison, <2 x i32> , !dbg !42 + %foldExtExtBinop = and <2 x i1> %shift, %662, !dbg !42 + %665 = extractelement <2 x i1> %foldExtExtBinop, i64 0, !dbg !42 + %shift70 = shufflevector <2 x i1> %663, <2 x i1> poison, <2 x i32> , !dbg !43 + %foldExtExtBinop71 = or <2 x i1> %shift70, %664, !dbg !43 + %.not34 = extractelement <2 x i1> %foldExtExtBinop71, i64 0, !dbg !43 + %666 = or i1 %657, %665, !dbg !43 + %.not31 = and i1 %660, %.not34, !dbg !44 + %667 = xor i32 %655, %656, !dbg !36 + %668 = xor i32 %658, %659, !dbg !36 + %669 = select i1 %666, i32 %667, i32 0, !dbg !37 + %670 = select i1 %.not31, i32 %668, i32 0, !dbg !37 + %foldExtExtBinop73 = xor <2 x i32> %652, %651, !dbg !45 + %671 = extractelement <2 x i32> %foldExtExtBinop73, i64 0, !dbg !45 + %foldExtExtBinop75 = xor <2 x i32> %654, %653, !dbg !45 + %672 = extractelement <2 x i32> %foldExtExtBinop75, i64 0, !dbg !45 + %673 = select i1 %666, i32 %671, i32 0, !dbg !46 + %674 = select i1 %.not31, i32 %672, i32 0, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %675 = insertelement <2 x i32> poison, i32 %669, i64 0, !dbg !38 + %676 = shufflevector <2 x i32> %675, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !38 + %677 = shufflevector <2 x i32> %651, <2 x i32> %652, <2 x i32> , !dbg !38 + %678 = xor <2 x i32> %676, %677, !dbg !38 + %679 = insertelement <2 x i32> poison, i32 %670, i64 0, !dbg !38 + %680 = shufflevector <2 x i32> %679, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !38 + %681 = shufflevector <2 x i32> %653, <2 x i32> %654, <2 x i32> , !dbg !38 + %682 = xor <2 x i32> %680, %681, !dbg !38 + %683 = insertelement <2 x i32> poison, i32 %673, i64 0, !dbg !47 + %684 = shufflevector <2 x i32> %683, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !47 + %685 = shufflevector <2 x i32> %652, <2 x i32> %651, <2 x i32> , !dbg !47 + %686 = xor <2 x i32> %684, %685, !dbg !47 + %687 = insertelement <2 x i32> poison, i32 %674, i64 0, !dbg !47 + %688 = shufflevector <2 x i32> %687, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !47 + %689 = shufflevector <2 x i32> %654, <2 x i32> %653, <2 x i32> , !dbg !47 + %690 = xor <2 x i32> %688, %689, !dbg !47 + %691 = extractelement <2 x i32> %678, i64 1, !dbg !32 + %692 = mul nuw nsw i32 %691, %72, !dbg !27 + %693 = extractelement <2 x i32> %678, i64 0, !dbg !32 + %694 = mul nuw nsw i32 %693, %72, !dbg !27 + %695 = extractelement <2 x i32> %682, i64 1, !dbg !32 + %696 = mul nuw nsw i32 %695, %72, !dbg !27 + %697 = extractelement <2 x i32> %682, i64 0, !dbg !32 + %698 = mul nuw nsw i32 %697, %72, !dbg !27 + %699 = insertelement <1 x i32> poison, i32 %692, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %699, i1 true) #5, !dbg !28 + %700 = insertelement <1 x i32> poison, i32 %694, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %700, i1 true) #5, !dbg !28 + %701 = insertelement <1 x i32> poison, i32 %696, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %701, i1 true) #5, !dbg !28 + %702 = insertelement <1 x i32> poison, i32 %698, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %702, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %703 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 1, i32 31), !dbg !28 + %705 = add i32 %704, %703, !dbg !31 + %706 = insertelement <1 x i32> poison, i32 %705, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %706, i1 %96) #5, !dbg !28 + %707 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %708 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 1, i32 31), !dbg !28 + %709 = add i32 %708, %707, !dbg !31 + %710 = insertelement <1 x i32> poison, i32 %709, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %710, i1 %96) #5, !dbg !28 + %711 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %712 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %711, i32 1, i32 31), !dbg !28 + %713 = add i32 %712, %711, !dbg !31 + %714 = insertelement <1 x i32> poison, i32 %713, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %714, i1 %96) #5, !dbg !28 + %715 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %716 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %715, i32 1, i32 31), !dbg !28 + %717 = add i32 %716, %715, !dbg !31 + %718 = insertelement <1 x i32> poison, i32 %717, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %718, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %719 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %720 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %721 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %722 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %723 = mul nuw nsw i32 %691, %.lobit, !dbg !32 + %724 = mul nuw nsw i32 %693, %.lobit, !dbg !32 + %725 = mul nuw nsw i32 %695, %.lobit, !dbg !32 + %726 = mul nuw nsw i32 %697, %.lobit, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %727 = insertelement <1 x i32> poison, i32 %723, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %727, i1 true) #5, !dbg !28 + %728 = insertelement <1 x i32> poison, i32 %724, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %728, i1 true) #5, !dbg !28 + %729 = insertelement <1 x i32> poison, i32 %725, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %729, i1 true) #5, !dbg !28 + %730 = insertelement <1 x i32> poison, i32 %726, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %730, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %731 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %732 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %731, i32 1, i32 31), !dbg !28 + %733 = add i32 %732, %731, !dbg !31 + %734 = insertelement <1 x i32> poison, i32 %733, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %734, i1 %96) #5, !dbg !28 + %735 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %736 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %735, i32 1, i32 31), !dbg !28 + %737 = add i32 %736, %735, !dbg !31 + %738 = insertelement <1 x i32> poison, i32 %737, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %738, i1 %96) #5, !dbg !28 + %739 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %740 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %739, i32 1, i32 31), !dbg !28 + %741 = add i32 %740, %739, !dbg !31 + %742 = insertelement <1 x i32> poison, i32 %741, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %742, i1 %96) #5, !dbg !28 + %743 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %744 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %743, i32 1, i32 31), !dbg !28 + %745 = add i32 %744, %743, !dbg !31 + %746 = insertelement <1 x i32> poison, i32 %745, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %746, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %747 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %748 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %749 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %750 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %751 = extractelement <2 x i32> %686, i64 1, !dbg !34 + %752 = mul nuw nsw i32 %751, %72, !dbg !33 + %753 = extractelement <2 x i32> %686, i64 0, !dbg !34 + %754 = mul nuw nsw i32 %753, %72, !dbg !33 + %755 = extractelement <2 x i32> %690, i64 1, !dbg !34 + %756 = mul nuw nsw i32 %755, %72, !dbg !33 + %757 = extractelement <2 x i32> %690, i64 0, !dbg !34 + %758 = mul nuw nsw i32 %757, %72, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %759 = insertelement <1 x i32> poison, i32 %752, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %759, i1 true) #5, !dbg !28 + %760 = insertelement <1 x i32> poison, i32 %754, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %760, i1 true) #5, !dbg !28 + %761 = insertelement <1 x i32> poison, i32 %756, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %761, i1 true) #5, !dbg !28 + %762 = insertelement <1 x i32> poison, i32 %758, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %762, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %763 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %764 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %763, i32 1, i32 31), !dbg !28 + %765 = add i32 %764, %763, !dbg !31 + %766 = insertelement <1 x i32> poison, i32 %765, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %766, i1 %96) #5, !dbg !28 + %767 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 1, i32 31), !dbg !28 + %769 = add i32 %768, %767, !dbg !31 + %770 = insertelement <1 x i32> poison, i32 %769, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %770, i1 %96) #5, !dbg !28 + %771 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %772 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %771, i32 1, i32 31), !dbg !28 + %773 = add i32 %772, %771, !dbg !31 + %774 = insertelement <1 x i32> poison, i32 %773, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %774, i1 %96) #5, !dbg !28 + %775 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %776 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %775, i32 1, i32 31), !dbg !28 + %777 = add i32 %776, %775, !dbg !31 + %778 = insertelement <1 x i32> poison, i32 %777, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %778, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %779 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %780 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %781 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %782 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %783 = mul nuw nsw i32 %751, %.lobit, !dbg !34 + %784 = mul nuw nsw i32 %753, %.lobit, !dbg !34 + %785 = mul nuw nsw i32 %755, %.lobit, !dbg !34 + %786 = mul nuw nsw i32 %757, %.lobit, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %787 = insertelement <1 x i32> poison, i32 %783, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %787, i1 true) #5, !dbg !28 + %788 = insertelement <1 x i32> poison, i32 %784, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %788, i1 true) #5, !dbg !28 + %789 = insertelement <1 x i32> poison, i32 %785, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %789, i1 true) #5, !dbg !28 + %790 = insertelement <1 x i32> poison, i32 %786, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %790, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %791 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %791, i32 1, i32 31), !dbg !28 + %793 = add i32 %792, %791, !dbg !31 + %794 = insertelement <1 x i32> poison, i32 %793, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %794, i1 %96) #5, !dbg !28 + %795 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %796 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %795, i32 1, i32 31), !dbg !28 + %797 = add i32 %796, %795, !dbg !31 + %798 = insertelement <1 x i32> poison, i32 %797, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %798, i1 %96) #5, !dbg !28 + %799 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %800 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %799, i32 1, i32 31), !dbg !28 + %801 = add i32 %800, %799, !dbg !31 + %802 = insertelement <1 x i32> poison, i32 %801, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %802, i1 %96) #5, !dbg !28 + %803 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %804 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %803, i32 1, i32 31), !dbg !28 + %805 = add i32 %804, %803, !dbg !31 + %806 = insertelement <1 x i32> poison, i32 %805, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %806, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %807 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %808 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %809 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %810 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %811 = insertelement <2 x i32> poison, i32 %720, i64 0, !dbg !39 + %812 = insertelement <2 x i32> %811, i32 %719, i64 1, !dbg !39 + %813 = insertelement <2 x i32> poison, i32 %748, i64 0, !dbg !39 + %814 = insertelement <2 x i32> %813, i32 %747, i64 1, !dbg !39 + %815 = icmp slt <2 x i32> %812, %814, !dbg !39 + %816 = insertelement <2 x i32> poison, i32 %722, i64 0, !dbg !39 + %817 = insertelement <2 x i32> %816, i32 %721, i64 1, !dbg !39 + %818 = insertelement <2 x i32> poison, i32 %750, i64 0, !dbg !39 + %819 = insertelement <2 x i32> %818, i32 %749, i64 1, !dbg !39 + %820 = icmp sge <2 x i32> %817, %819, !dbg !39 + %821 = icmp eq <2 x i32> %812, %814, !dbg !40 + %822 = icmp ne <2 x i32> %817, %819, !dbg !40 + %823 = insertelement <2 x i32> poison, i32 %780, i64 0, !dbg !41 + %824 = insertelement <2 x i32> %823, i32 %779, i64 1, !dbg !41 + %825 = insertelement <2 x i32> poison, i32 %808, i64 0, !dbg !41 + %826 = insertelement <2 x i32> %825, i32 %807, i64 1, !dbg !41 + %827 = icmp sgt <2 x i32> %824, %826, !dbg !41 + %828 = insertelement <2 x i32> poison, i32 %782, i64 0, !dbg !41 + %829 = insertelement <2 x i32> %828, i32 %781, i64 1, !dbg !41 + %830 = insertelement <2 x i32> poison, i32 %810, i64 0, !dbg !41 + %831 = insertelement <2 x i32> %830, i32 %809, i64 1, !dbg !41 + %832 = icmp sle <2 x i32> %829, %831, !dbg !41 + %833 = and <2 x i1> %821, %827, !dbg !42 + %834 = or <2 x i1> %822, %832, !dbg !43 + %835 = or <2 x i1> %815, %833, !dbg !43 + %836 = and <2 x i1> %820, %834, !dbg !44 + %837 = xor <2 x i32> %814, %812, !dbg !36 + %838 = xor <2 x i32> %819, %817, !dbg !36 + %839 = select <2 x i1> %835, <2 x i32> %837, <2 x i32> zeroinitializer, !dbg !37 + %840 = select <2 x i1> %836, <2 x i32> %838, <2 x i32> zeroinitializer, !dbg !37 + %841 = xor <2 x i32> %839, %678, !dbg !38 + %842 = xor <2 x i32> %840, %682, !dbg !38 + %843 = xor <2 x i32> %826, %824, !dbg !45 + %844 = xor <2 x i32> %831, %829, !dbg !45 + %845 = select <2 x i1> %835, <2 x i32> %843, <2 x i32> zeroinitializer, !dbg !46 + %846 = select <2 x i1> %836, <2 x i32> %844, <2 x i32> zeroinitializer, !dbg !46 + %847 = xor <2 x i32> %845, %686, !dbg !47 + %848 = xor <2 x i32> %846, %690, !dbg !47 + %849 = extractelement <2 x i32> %841, i64 1, !dbg !32 + %850 = mul nuw nsw i32 %849, %71, !dbg !27 + %851 = extractelement <2 x i32> %841, i64 0, !dbg !32 + %852 = mul nuw nsw i32 %851, %71, !dbg !27 + %853 = extractelement <2 x i32> %842, i64 1, !dbg !32 + %854 = mul nuw nsw i32 %853, %71, !dbg !27 + %855 = extractelement <2 x i32> %842, i64 0, !dbg !32 + %856 = mul nuw nsw i32 %855, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %857 = insertelement <1 x i32> poison, i32 %850, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %857, i1 true) #5, !dbg !28 + %858 = insertelement <1 x i32> poison, i32 %852, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %858, i1 true) #5, !dbg !28 + %859 = insertelement <1 x i32> poison, i32 %854, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %859, i1 true) #5, !dbg !28 + %860 = insertelement <1 x i32> poison, i32 %856, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %860, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %861 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %862 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %861, i32 1, i32 31), !dbg !28 + %863 = add i32 %862, %861, !dbg !31 + %864 = insertelement <1 x i32> poison, i32 %863, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %864, i1 %96) #5, !dbg !28 + %865 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %866 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %865, i32 1, i32 31), !dbg !28 + %867 = add i32 %866, %865, !dbg !31 + %868 = insertelement <1 x i32> poison, i32 %867, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %868, i1 %96) #5, !dbg !28 + %869 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %870 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %869, i32 1, i32 31), !dbg !28 + %871 = add i32 %870, %869, !dbg !31 + %872 = insertelement <1 x i32> poison, i32 %871, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %872, i1 %96) #5, !dbg !28 + %873 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %874 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %873, i32 1, i32 31), !dbg !28 + %875 = add i32 %874, %873, !dbg !31 + %876 = insertelement <1 x i32> poison, i32 %875, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %876, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %877 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %878 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %879 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %880 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %881 = mul nuw nsw i32 %849, %.lobit1, !dbg !32 + %882 = mul nuw nsw i32 %851, %.lobit1, !dbg !32 + %883 = mul nuw nsw i32 %853, %.lobit1, !dbg !32 + %884 = mul nuw nsw i32 %855, %.lobit1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %885 = insertelement <1 x i32> poison, i32 %881, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %885, i1 true) #5, !dbg !28 + %886 = insertelement <1 x i32> poison, i32 %882, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %886, i1 true) #5, !dbg !28 + %887 = insertelement <1 x i32> poison, i32 %883, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %887, i1 true) #5, !dbg !28 + %888 = insertelement <1 x i32> poison, i32 %884, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %888, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %889 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %890 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %889, i32 1, i32 31), !dbg !28 + %891 = add i32 %890, %889, !dbg !31 + %892 = insertelement <1 x i32> poison, i32 %891, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %892, i1 %96) #5, !dbg !28 + %893 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %894 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %893, i32 1, i32 31), !dbg !28 + %895 = add i32 %894, %893, !dbg !31 + %896 = insertelement <1 x i32> poison, i32 %895, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %896, i1 %96) #5, !dbg !28 + %897 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %898 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %897, i32 1, i32 31), !dbg !28 + %899 = add i32 %898, %897, !dbg !31 + %900 = insertelement <1 x i32> poison, i32 %899, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %900, i1 %96) #5, !dbg !28 + %901 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %902 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %901, i32 1, i32 31), !dbg !28 + %903 = add i32 %902, %901, !dbg !31 + %904 = insertelement <1 x i32> poison, i32 %903, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %904, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %905 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %906 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %907 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %908 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %909 = extractelement <2 x i32> %847, i64 1, !dbg !34 + %910 = mul nuw nsw i32 %909, %71, !dbg !33 + %911 = extractelement <2 x i32> %847, i64 0, !dbg !34 + %912 = mul nuw nsw i32 %911, %71, !dbg !33 + %913 = extractelement <2 x i32> %848, i64 1, !dbg !34 + %914 = mul nuw nsw i32 %913, %71, !dbg !33 + %915 = extractelement <2 x i32> %848, i64 0, !dbg !34 + %916 = mul nuw nsw i32 %915, %71, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %917 = insertelement <1 x i32> poison, i32 %910, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %917, i1 true) #5, !dbg !28 + %918 = insertelement <1 x i32> poison, i32 %912, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %918, i1 true) #5, !dbg !28 + %919 = insertelement <1 x i32> poison, i32 %914, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %919, i1 true) #5, !dbg !28 + %920 = insertelement <1 x i32> poison, i32 %916, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %920, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %921 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %922 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %921, i32 1, i32 31), !dbg !28 + %923 = add i32 %922, %921, !dbg !31 + %924 = insertelement <1 x i32> poison, i32 %923, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %924, i1 %96) #5, !dbg !28 + %925 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %926 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %925, i32 1, i32 31), !dbg !28 + %927 = add i32 %926, %925, !dbg !31 + %928 = insertelement <1 x i32> poison, i32 %927, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %928, i1 %96) #5, !dbg !28 + %929 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %930 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %929, i32 1, i32 31), !dbg !28 + %931 = add i32 %930, %929, !dbg !31 + %932 = insertelement <1 x i32> poison, i32 %931, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %932, i1 %96) #5, !dbg !28 + %933 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %934 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %933, i32 1, i32 31), !dbg !28 + %935 = add i32 %934, %933, !dbg !31 + %936 = insertelement <1 x i32> poison, i32 %935, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %936, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %937 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %938 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %939 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %940 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %941 = mul nuw nsw i32 %909, %.lobit1, !dbg !34 + %942 = mul nuw nsw i32 %911, %.lobit1, !dbg !34 + %943 = mul nuw nsw i32 %913, %.lobit1, !dbg !34 + %944 = mul nuw nsw i32 %915, %.lobit1, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %945 = insertelement <1 x i32> poison, i32 %941, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %945, i1 true) #5, !dbg !28 + %946 = insertelement <1 x i32> poison, i32 %942, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %946, i1 true) #5, !dbg !28 + %947 = insertelement <1 x i32> poison, i32 %943, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %947, i1 true) #5, !dbg !28 + %948 = insertelement <1 x i32> poison, i32 %944, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %948, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %949 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %949, i32 1, i32 31), !dbg !28 + %951 = add i32 %950, %949, !dbg !31 + %952 = insertelement <1 x i32> poison, i32 %951, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %952, i1 %96) #5, !dbg !28 + %953 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %954 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %953, i32 1, i32 31), !dbg !28 + %955 = add i32 %954, %953, !dbg !31 + %956 = insertelement <1 x i32> poison, i32 %955, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %956, i1 %96) #5, !dbg !28 + %957 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %958 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %957, i32 1, i32 31), !dbg !28 + %959 = add i32 %958, %957, !dbg !31 + %960 = insertelement <1 x i32> poison, i32 %959, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %960, i1 %96) #5, !dbg !28 + %961 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %962 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !28 + %963 = add i32 %962, %961, !dbg !31 + %964 = insertelement <1 x i32> poison, i32 %963, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %964, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %965 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %966 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %967 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %968 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %969 = insertelement <2 x i32> poison, i32 %878, i64 0, !dbg !39 + %970 = insertelement <2 x i32> %969, i32 %877, i64 1, !dbg !39 + %971 = insertelement <2 x i32> poison, i32 %906, i64 0, !dbg !39 + %972 = insertelement <2 x i32> %971, i32 %905, i64 1, !dbg !39 + %973 = icmp slt <2 x i32> %970, %972, !dbg !39 + %974 = insertelement <2 x i32> poison, i32 %880, i64 0, !dbg !39 + %975 = insertelement <2 x i32> %974, i32 %879, i64 1, !dbg !39 + %976 = insertelement <2 x i32> poison, i32 %908, i64 0, !dbg !39 + %977 = insertelement <2 x i32> %976, i32 %907, i64 1, !dbg !39 + %978 = icmp sge <2 x i32> %975, %977, !dbg !39 + %979 = icmp eq <2 x i32> %970, %972, !dbg !40 + %980 = icmp ne <2 x i32> %975, %977, !dbg !40 + %981 = insertelement <2 x i32> poison, i32 %938, i64 0, !dbg !41 + %982 = insertelement <2 x i32> %981, i32 %937, i64 1, !dbg !41 + %983 = insertelement <2 x i32> poison, i32 %966, i64 0, !dbg !41 + %984 = insertelement <2 x i32> %983, i32 %965, i64 1, !dbg !41 + %985 = icmp sgt <2 x i32> %982, %984, !dbg !41 + %986 = insertelement <2 x i32> poison, i32 %940, i64 0, !dbg !41 + %987 = insertelement <2 x i32> %986, i32 %939, i64 1, !dbg !41 + %988 = insertelement <2 x i32> poison, i32 %968, i64 0, !dbg !41 + %989 = insertelement <2 x i32> %988, i32 %967, i64 1, !dbg !41 + %990 = icmp sle <2 x i32> %987, %989, !dbg !41 + %991 = and <2 x i1> %979, %985, !dbg !42 + %992 = or <2 x i1> %980, %990, !dbg !43 + %993 = or <2 x i1> %973, %991, !dbg !43 + %994 = and <2 x i1> %978, %992, !dbg !44 + %995 = xor <2 x i32> %972, %970, !dbg !36 + %996 = xor <2 x i32> %977, %975, !dbg !36 + %997 = select <2 x i1> %993, <2 x i32> %995, <2 x i32> zeroinitializer, !dbg !37 + %998 = select <2 x i1> %994, <2 x i32> %996, <2 x i32> zeroinitializer, !dbg !37 + %999 = xor <2 x i32> %997, %841, !dbg !38 + %1000 = xor <2 x i32> %998, %842, !dbg !38 + %1001 = xor <2 x i32> %984, %982, !dbg !45 + %1002 = xor <2 x i32> %989, %987, !dbg !45 + %1003 = select <2 x i1> %993, <2 x i32> %1001, <2 x i32> zeroinitializer, !dbg !46 + %1004 = select <2 x i1> %994, <2 x i32> %1002, <2 x i32> zeroinitializer, !dbg !46 + %1005 = xor <2 x i32> %1003, %847, !dbg !47 + %1006 = xor <2 x i32> %1004, %848, !dbg !47 + %1007 = icmp slt <2 x i32> %999, %1000, !dbg !39 + %1008 = icmp eq <2 x i32> %999, %1000, !dbg !40 + %1009 = icmp sgt <2 x i32> %1005, %1006, !dbg !41 + %1010 = and <2 x i1> %1008, %1009, !dbg !42 + %1011 = or <2 x i1> %1007, %1010, !dbg !43 + %1012 = xor <2 x i32> %1000, %999, !dbg !36 + %1013 = select <2 x i1> %1011, <2 x i32> %1012, <2 x i32> zeroinitializer, !dbg !37 + %foldExtExtBinop77 = xor <2 x i32> %1013, %999, !dbg !38 + %1014 = extractelement <2 x i32> %foldExtExtBinop77, i64 1, !dbg !38 + %foldExtExtBinop79 = xor <2 x i32> %1013, %999, !dbg !38 + %1015 = extractelement <2 x i32> %foldExtExtBinop79, i64 0, !dbg !38 + %1016 = xor <2 x i32> %1013, %1000, !dbg !38 + %1017 = extractelement <2 x i32> %1016, i64 0, !dbg !39 + %1018 = extractelement <2 x i32> %1016, i64 1, !dbg !39 + %1019 = xor i32 %1015, %1014, !dbg !36 + %1020 = xor i32 %1017, %1018, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1021 = xor <2 x i32> %1006, %1005, !dbg !45 + %1022 = select <2 x i1> %1011, <2 x i32> %1021, <2 x i32> zeroinitializer, !dbg !46 + %1023 = shufflevector <2 x i32> %1022, <2 x i32> poison, <4 x i32> , !dbg !47 + %1024 = shufflevector <2 x i32> %1005, <2 x i32> %1006, <4 x i32> , !dbg !47 + %1025 = xor <4 x i32> %1023, %1024, !dbg !47 + %1026 = shufflevector <2 x i32> %1022, <2 x i32> poison, <2 x i32> , !dbg !47 + %1027 = shufflevector <2 x i32> %1005, <2 x i32> %1006, <2 x i32> , !dbg !47 + %1028 = xor <2 x i32> %1026, %1027, !dbg !47 + %1029 = shufflevector <2 x i32> %1006, <2 x i32> %1005, <2 x i32> , !dbg !47 + %1030 = shufflevector <2 x i32> %1022, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !47 + %1031 = shufflevector <2 x i32> %1006, <2 x i32> %1005, <2 x i32> , !dbg !47 + %1032 = xor <2 x i32> %1030, %1031, !dbg !47 + %1033 = shufflevector <2 x i32> %1022, <2 x i32> poison, <2 x i32> , !dbg !47 + %1034 = shufflevector <2 x i32> %1006, <2 x i32> %1005, <2 x i32> , !dbg !47 + %1035 = xor <2 x i32> %1033, %1034, !dbg !47 + %1036 = shufflevector <2 x i32> %1016, <2 x i32> %foldExtExtBinop77, <2 x i32> , !dbg !39 + %1037 = shufflevector <2 x i32> %1016, <2 x i32> %foldExtExtBinop79, <2 x i32> , !dbg !39 + %1038 = icmp slt <2 x i32> %1036, %1037, !dbg !39 + %1039 = icmp eq <2 x i32> %1036, %1037, !dbg !40 + %1040 = icmp sgt <2 x i32> %1035, %1032, !dbg !41 + %1041 = and <2 x i1> %1039, %1040, !dbg !42 + %1042 = or <2 x i1> %1038, %1041, !dbg !43 + %1043 = extractelement <2 x i1> %1042, i64 1, !dbg !37 + %1044 = select i1 %1043, i32 %1019, i32 0, !dbg !37 + %1045 = extractelement <2 x i1> %1042, i64 0, !dbg !37 + %1046 = select i1 %1045, i32 %1020, i32 0, !dbg !37 + %1047 = xor i32 %1044, %1014, !dbg !38 + %1048 = xor i32 %1044, %1015, !dbg !38 + %1049 = xor i32 %1046, %1018, !dbg !38 + %1050 = xor i32 %1046, %1017, !dbg !38 + %1051 = xor <2 x i32> %1029, %1028, !dbg !45 + %1052 = xor <2 x i32> %1051, %1022, !dbg !45 + %1053 = select <2 x i1> %1042, <2 x i32> %1052, <2 x i32> zeroinitializer, !dbg !46 + %1054 = shufflevector <2 x i32> %1053, <2 x i32> poison, <4 x i32> , !dbg !46 + %1055 = xor <4 x i32> %1054, %1025, !dbg !47 + %1056 = mul nuw nsw i32 %1047, %72, !dbg !27 + %1057 = mul nuw nsw i32 %1048, %72, !dbg !27 + %1058 = mul nuw nsw i32 %1049, %72, !dbg !27 + %1059 = mul nuw nsw i32 %1050, %72, !dbg !27 + %1060 = insertelement <1 x i32> poison, i32 %1056, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %1060, i1 true) #5, !dbg !28 + %1061 = insertelement <1 x i32> poison, i32 %1057, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %1061, i1 true) #5, !dbg !28 + %1062 = insertelement <1 x i32> poison, i32 %1058, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %1062, i1 true) #5, !dbg !28 + %1063 = insertelement <1 x i32> poison, i32 %1059, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %1063, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1064 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1065 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1064, i32 1, i32 31), !dbg !28 + %1066 = add i32 %1065, %1064, !dbg !31 + %1067 = insertelement <1 x i32> poison, i32 %1066, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1067, i1 %96) #5, !dbg !28 + %1068 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1069 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1068, i32 1, i32 31), !dbg !28 + %1070 = add i32 %1069, %1068, !dbg !31 + %1071 = insertelement <1 x i32> poison, i32 %1070, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1071, i1 %96) #5, !dbg !28 + %1072 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1073 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1072, i32 1, i32 31), !dbg !28 + %1074 = add i32 %1073, %1072, !dbg !31 + %1075 = insertelement <1 x i32> poison, i32 %1074, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1075, i1 %96) #5, !dbg !28 + %1076 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1077 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1076, i32 1, i32 31), !dbg !28 + %1078 = add i32 %1077, %1076, !dbg !31 + %1079 = insertelement <1 x i32> poison, i32 %1078, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1079, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1080 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %1081 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %1082 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %1083 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %1084 = mul nuw nsw i32 %1047, %.lobit, !dbg !32 + %1085 = mul nuw nsw i32 %1048, %.lobit, !dbg !32 + %1086 = mul nuw nsw i32 %1049, %.lobit, !dbg !32 + %1087 = mul nuw nsw i32 %1050, %.lobit, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1088 = insertelement <1 x i32> poison, i32 %1084, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %1088, i1 true) #5, !dbg !28 + %1089 = insertelement <1 x i32> poison, i32 %1085, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %1089, i1 true) #5, !dbg !28 + %1090 = insertelement <1 x i32> poison, i32 %1086, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %1090, i1 true) #5, !dbg !28 + %1091 = insertelement <1 x i32> poison, i32 %1087, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %1091, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1092 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1093 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1092, i32 1, i32 31), !dbg !28 + %1094 = add i32 %1093, %1092, !dbg !31 + %1095 = insertelement <1 x i32> poison, i32 %1094, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1095, i1 %96) #5, !dbg !28 + %1096 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1097 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1096, i32 1, i32 31), !dbg !28 + %1098 = add i32 %1097, %1096, !dbg !31 + %1099 = insertelement <1 x i32> poison, i32 %1098, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1099, i1 %96) #5, !dbg !28 + %1100 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1100, i32 1, i32 31), !dbg !28 + %1102 = add i32 %1101, %1100, !dbg !31 + %1103 = insertelement <1 x i32> poison, i32 %1102, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1103, i1 %96) #5, !dbg !28 + %1104 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1104, i32 1, i32 31), !dbg !28 + %1106 = add i32 %1105, %1104, !dbg !31 + %1107 = insertelement <1 x i32> poison, i32 %1106, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1107, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1108 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %1109 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %1110 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %1111 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %1112 = extractelement <4 x i32> %1055, i64 0, !dbg !34 + %1113 = mul nuw nsw i32 %1112, %72, !dbg !33 + %1114 = extractelement <4 x i32> %1055, i64 1, !dbg !34 + %1115 = mul nuw nsw i32 %1114, %72, !dbg !33 + %1116 = extractelement <4 x i32> %1055, i64 2, !dbg !34 + %1117 = mul nuw nsw i32 %1116, %72, !dbg !33 + %1118 = extractelement <4 x i32> %1055, i64 3, !dbg !34 + %1119 = mul nuw nsw i32 %1118, %72, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1120 = insertelement <1 x i32> poison, i32 %1113, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %1120, i1 true) #5, !dbg !28 + %1121 = insertelement <1 x i32> poison, i32 %1115, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %1121, i1 true) #5, !dbg !28 + %1122 = insertelement <1 x i32> poison, i32 %1117, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %1122, i1 true) #5, !dbg !28 + %1123 = insertelement <1 x i32> poison, i32 %1119, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %1123, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1124 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1124, i32 1, i32 31), !dbg !28 + %1126 = add i32 %1125, %1124, !dbg !31 + %1127 = insertelement <1 x i32> poison, i32 %1126, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1127, i1 %96) #5, !dbg !28 + %1128 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1128, i32 1, i32 31), !dbg !28 + %1130 = add i32 %1129, %1128, !dbg !31 + %1131 = insertelement <1 x i32> poison, i32 %1130, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1131, i1 %96) #5, !dbg !28 + %1132 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1132, i32 1, i32 31), !dbg !28 + %1134 = add i32 %1133, %1132, !dbg !31 + %1135 = insertelement <1 x i32> poison, i32 %1134, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1135, i1 %96) #5, !dbg !28 + %1136 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1136, i32 1, i32 31), !dbg !28 + %1138 = add i32 %1137, %1136, !dbg !31 + %1139 = insertelement <1 x i32> poison, i32 %1138, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1139, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1140 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %1141 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %1142 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %1143 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %1144 = mul nuw nsw i32 %1112, %.lobit, !dbg !34 + %1145 = mul nuw nsw i32 %1114, %.lobit, !dbg !34 + %1146 = mul nuw nsw i32 %1116, %.lobit, !dbg !34 + %1147 = mul nuw nsw i32 %1118, %.lobit, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1148 = insertelement <1 x i32> poison, i32 %1144, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %1148, i1 true) #5, !dbg !28 + %1149 = insertelement <1 x i32> poison, i32 %1145, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %1149, i1 true) #5, !dbg !28 + %1150 = insertelement <1 x i32> poison, i32 %1146, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, <1 x i32> %1150, i1 true) #5, !dbg !28 + %1151 = insertelement <1 x i32> poison, i32 %1147, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, <1 x i32> %1151, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1152 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1152, i32 1, i32 31), !dbg !28 + %1154 = add i32 %1153, %1152, !dbg !31 + %1155 = insertelement <1 x i32> poison, i32 %1154, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1155, i1 %96) #5, !dbg !28 + %1156 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1156, i32 1, i32 31), !dbg !28 + %1158 = add i32 %1157, %1156, !dbg !31 + %1159 = insertelement <1 x i32> poison, i32 %1158, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1159, i1 %96) #5, !dbg !28 + %1160 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1160, i32 1, i32 31), !dbg !28 + %1162 = add i32 %1161, %1160, !dbg !31 + %1163 = insertelement <1 x i32> poison, i32 %1162, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1163, i1 %96) #5, !dbg !28 + %1164 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1164, i32 1, i32 31), !dbg !28 + %1166 = add i32 %1165, %1164, !dbg !31 + %1167 = insertelement <1 x i32> poison, i32 %1166, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1167, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1168 = load i32, ptr addrspace(3) %208, align 16, !dbg !28 + %1169 = load i32, ptr addrspace(3) %211, align 8, !dbg !28 + %1170 = load i32, ptr addrspace(3) %214, align 16, !dbg !28 + %1171 = load i32, ptr addrspace(3) %217, align 8, !dbg !28 + %1172 = insertelement <4 x i32> poison, i32 %1080, i64 0, !dbg !39 + %1173 = insertelement <4 x i32> %1172, i32 %1081, i64 1, !dbg !39 + %1174 = insertelement <4 x i32> %1173, i32 %1082, i64 2, !dbg !39 + %1175 = insertelement <4 x i32> %1174, i32 %1083, i64 3, !dbg !39 + %1176 = insertelement <4 x i32> poison, i32 %1108, i64 0, !dbg !39 + %1177 = insertelement <4 x i32> %1176, i32 %1109, i64 1, !dbg !39 + %1178 = insertelement <4 x i32> %1177, i32 %1110, i64 2, !dbg !39 + %1179 = insertelement <4 x i32> %1178, i32 %1111, i64 3, !dbg !39 + %1180 = icmp slt <4 x i32> %1175, %1179, !dbg !39 + %1181 = icmp eq <4 x i32> %1175, %1179, !dbg !40 + %1182 = insertelement <4 x i32> poison, i32 %1140, i64 0, !dbg !41 + %1183 = insertelement <4 x i32> %1182, i32 %1141, i64 1, !dbg !41 + %1184 = insertelement <4 x i32> %1183, i32 %1142, i64 2, !dbg !41 + %1185 = insertelement <4 x i32> %1184, i32 %1143, i64 3, !dbg !41 + %1186 = insertelement <4 x i32> poison, i32 %1168, i64 0, !dbg !41 + %1187 = insertelement <4 x i32> %1186, i32 %1169, i64 1, !dbg !41 + %1188 = insertelement <4 x i32> %1187, i32 %1170, i64 2, !dbg !41 + %1189 = insertelement <4 x i32> %1188, i32 %1171, i64 3, !dbg !41 + %1190 = icmp sgt <4 x i32> %1185, %1189, !dbg !41 + %1191 = and <4 x i1> %1181, %1190, !dbg !42 + %1192 = or <4 x i1> %1180, %1191, !dbg !43 + %1193 = xor i32 %1108, %1080, !dbg !36 + %1194 = xor i32 %1109, %1081, !dbg !36 + %1195 = xor i32 %1110, %1082, !dbg !36 + %1196 = xor i32 %1111, %1083, !dbg !36 + %1197 = extractelement <4 x i1> %1192, i64 0, !dbg !37 + %1198 = select i1 %1197, i32 %1193, i32 0, !dbg !37 + %1199 = extractelement <4 x i1> %1192, i64 1, !dbg !37 + %1200 = select i1 %1199, i32 %1194, i32 0, !dbg !37 + %1201 = extractelement <4 x i1> %1192, i64 2, !dbg !37 + %1202 = select i1 %1201, i32 %1195, i32 0, !dbg !37 + %1203 = extractelement <4 x i1> %1192, i64 3, !dbg !37 + %1204 = select i1 %1203, i32 %1196, i32 0, !dbg !37 + %1205 = xor i32 %1198, %1047, !dbg !38 + %1206 = xor i32 %1200, %1048, !dbg !38 + %1207 = xor i32 %1202, %1049, !dbg !38 + %1208 = xor i32 %1204, %1050, !dbg !38 + %1209 = xor <4 x i32> %1189, %1185, !dbg !45 + %1210 = select <4 x i1> %1192, <4 x i32> %1209, <4 x i32> zeroinitializer, !dbg !46 + %1211 = xor <4 x i32> %1210, %1055, !dbg !47 + %1212 = mul nuw nsw i32 %1205, %71, !dbg !27 + %1213 = mul nuw nsw i32 %1206, %71, !dbg !27 + %1214 = mul nuw nsw i32 %1207, %71, !dbg !27 + %1215 = mul nuw nsw i32 %1208, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1216 = insertelement <1 x i32> poison, i32 %1212, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %1216, i1 true) #5, !dbg !28 + %1217 = insertelement <1 x i32> poison, i32 %1213, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1217, i1 true) #5, !dbg !28 + %1218 = insertelement <1 x i32> poison, i32 %1214, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1218, i1 true) #5, !dbg !28 + %1219 = insertelement <1 x i32> poison, i32 %1215, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %1219, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1220 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1220, i32 1, i32 31), !dbg !28 + %1222 = add i32 %1221, %1220, !dbg !31 + %1223 = insertelement <1 x i32> poison, i32 %1222, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1223, i1 %96) #5, !dbg !28 + %1224 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1224, i32 1, i32 31), !dbg !28 + %1226 = add i32 %1225, %1224, !dbg !31 + %1227 = insertelement <1 x i32> poison, i32 %1226, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1227, i1 %96) #5, !dbg !28 + %1228 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1228, i32 1, i32 31), !dbg !28 + %1230 = add i32 %1229, %1228, !dbg !31 + %1231 = insertelement <1 x i32> poison, i32 %1230, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1231, i1 %96) #5, !dbg !28 + %1232 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1232, i32 1, i32 31), !dbg !28 + %1234 = add i32 %1233, %1232, !dbg !31 + %1235 = insertelement <1 x i32> poison, i32 %1234, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1235, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1236 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %1237 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %1238 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %1239 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %1240 = mul nuw nsw i32 %1205, %.lobit1, !dbg !32 + %1241 = mul nuw nsw i32 %1206, %.lobit1, !dbg !32 + %1242 = mul nuw nsw i32 %1207, %.lobit1, !dbg !32 + %1243 = mul nuw nsw i32 %1208, %.lobit1, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1244 = insertelement <1 x i32> poison, i32 %1240, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %1244, i1 true) #5, !dbg !28 + %1245 = insertelement <1 x i32> poison, i32 %1241, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1245, i1 true) #5, !dbg !28 + %1246 = insertelement <1 x i32> poison, i32 %1242, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1246, i1 true) #5, !dbg !28 + %1247 = insertelement <1 x i32> poison, i32 %1243, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %1247, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1248 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1248, i32 1, i32 31), !dbg !28 + %1250 = add i32 %1249, %1248, !dbg !31 + %1251 = insertelement <1 x i32> poison, i32 %1250, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1251, i1 %96) #5, !dbg !28 + %1252 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1252, i32 1, i32 31), !dbg !28 + %1254 = add i32 %1253, %1252, !dbg !31 + %1255 = insertelement <1 x i32> poison, i32 %1254, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1255, i1 %96) #5, !dbg !28 + %1256 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1256, i32 1, i32 31), !dbg !28 + %1258 = add i32 %1257, %1256, !dbg !31 + %1259 = insertelement <1 x i32> poison, i32 %1258, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1259, i1 %96) #5, !dbg !28 + %1260 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1260, i32 1, i32 31), !dbg !28 + %1262 = add i32 %1261, %1260, !dbg !31 + %1263 = insertelement <1 x i32> poison, i32 %1262, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1263, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1264 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %1265 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %1266 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %1267 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %1268 = extractelement <4 x i32> %1211, i64 0, !dbg !34 + %1269 = mul nuw nsw i32 %1268, %71, !dbg !33 + %1270 = extractelement <4 x i32> %1211, i64 1, !dbg !34 + %1271 = mul nuw nsw i32 %1270, %71, !dbg !33 + %1272 = extractelement <4 x i32> %1211, i64 2, !dbg !34 + %1273 = mul nuw nsw i32 %1272, %71, !dbg !33 + %1274 = extractelement <4 x i32> %1211, i64 3, !dbg !34 + %1275 = mul nuw nsw i32 %1274, %71, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1276 = insertelement <1 x i32> poison, i32 %1269, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %1276, i1 true) #5, !dbg !28 + %1277 = insertelement <1 x i32> poison, i32 %1271, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1277, i1 true) #5, !dbg !28 + %1278 = insertelement <1 x i32> poison, i32 %1273, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1278, i1 true) #5, !dbg !28 + %1279 = insertelement <1 x i32> poison, i32 %1275, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %1279, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1280 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1280, i32 1, i32 31), !dbg !28 + %1282 = add i32 %1281, %1280, !dbg !31 + %1283 = insertelement <1 x i32> poison, i32 %1282, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1283, i1 %96) #5, !dbg !28 + %1284 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1285 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1284, i32 1, i32 31), !dbg !28 + %1286 = add i32 %1285, %1284, !dbg !31 + %1287 = insertelement <1 x i32> poison, i32 %1286, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1287, i1 %96) #5, !dbg !28 + %1288 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1288, i32 1, i32 31), !dbg !28 + %1290 = add i32 %1289, %1288, !dbg !31 + %1291 = insertelement <1 x i32> poison, i32 %1290, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1291, i1 %96) #5, !dbg !28 + %1292 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1292, i32 1, i32 31), !dbg !28 + %1294 = add i32 %1293, %1292, !dbg !31 + %1295 = insertelement <1 x i32> poison, i32 %1294, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1295, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1296 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %1297 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %1298 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %1299 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %1300 = mul nuw nsw i32 %1268, %.lobit1, !dbg !34 + %1301 = mul nuw nsw i32 %1270, %.lobit1, !dbg !34 + %1302 = mul nuw nsw i32 %1272, %.lobit1, !dbg !34 + %1303 = mul nuw nsw i32 %1274, %.lobit1, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1304 = insertelement <1 x i32> poison, i32 %1300, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %79, <1 x i32> %1304, i1 true) #5, !dbg !28 + %1305 = insertelement <1 x i32> poison, i32 %1301, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1305, i1 true) #5, !dbg !28 + %1306 = insertelement <1 x i32> poison, i32 %1302, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1306, i1 true) #5, !dbg !28 + %1307 = insertelement <1 x i32> poison, i32 %1303, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %88, <1 x i32> %1307, i1 true) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1308 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %91, i1 %90) #5, !dbg !28 + %1309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1308, i32 1, i32 31), !dbg !28 + %1310 = add i32 %1309, %1308, !dbg !31 + %1311 = insertelement <1 x i32> poison, i32 %1310, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %91, <1 x i32> %1311, i1 %96) #5, !dbg !28 + %1312 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %98, i1 %90) #5, !dbg !28 + %1313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1312, i32 1, i32 31), !dbg !28 + %1314 = add i32 %1313, %1312, !dbg !31 + %1315 = insertelement <1 x i32> poison, i32 %1314, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, <1 x i32> %1315, i1 %96) #5, !dbg !28 + %1316 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %90) #5, !dbg !28 + %1317 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1316, i32 1, i32 31), !dbg !28 + %1318 = add i32 %1317, %1316, !dbg !31 + %1319 = insertelement <1 x i32> poison, i32 %1318, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, <1 x i32> %1319, i1 %96) #5, !dbg !28 + %1320 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %108, i1 %90) #5, !dbg !28 + %1321 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1320, i32 1, i32 31), !dbg !28 + %1322 = add i32 %1321, %1320, !dbg !31 + %1323 = insertelement <1 x i32> poison, i32 %1322, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %108, <1 x i32> %1323, i1 %96) #5, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %1324 = load i32, ptr addrspace(3) %78, align 8, !dbg !28 + %1325 = load i32, ptr addrspace(3) %81, align 8, !dbg !28 + %1326 = load i32, ptr addrspace(3) %84, align 8, !dbg !28 + %1327 = load i32, ptr addrspace(3) %87, align 8, !dbg !28 + %1328 = insertelement <4 x i32> poison, i32 %1236, i64 0, !dbg !39 + %1329 = insertelement <4 x i32> %1328, i32 %1237, i64 1, !dbg !39 + %1330 = insertelement <4 x i32> %1329, i32 %1238, i64 2, !dbg !39 + %1331 = insertelement <4 x i32> %1330, i32 %1239, i64 3, !dbg !39 + %1332 = insertelement <4 x i32> poison, i32 %1264, i64 0, !dbg !39 + %1333 = insertelement <4 x i32> %1332, i32 %1265, i64 1, !dbg !39 + %1334 = insertelement <4 x i32> %1333, i32 %1266, i64 2, !dbg !39 + %1335 = insertelement <4 x i32> %1334, i32 %1267, i64 3, !dbg !39 + %1336 = icmp slt <4 x i32> %1331, %1335, !dbg !39 + %1337 = icmp eq <4 x i32> %1331, %1335, !dbg !40 + %1338 = insertelement <4 x i32> poison, i32 %1296, i64 0, !dbg !41 + %1339 = insertelement <4 x i32> %1338, i32 %1297, i64 1, !dbg !41 + %1340 = insertelement <4 x i32> %1339, i32 %1298, i64 2, !dbg !41 + %1341 = insertelement <4 x i32> %1340, i32 %1299, i64 3, !dbg !41 + %1342 = insertelement <4 x i32> poison, i32 %1324, i64 0, !dbg !41 + %1343 = insertelement <4 x i32> %1342, i32 %1325, i64 1, !dbg !41 + %1344 = insertelement <4 x i32> %1343, i32 %1326, i64 2, !dbg !41 + %1345 = insertelement <4 x i32> %1344, i32 %1327, i64 3, !dbg !41 + %1346 = icmp sgt <4 x i32> %1341, %1345, !dbg !41 + %1347 = and <4 x i1> %1337, %1346, !dbg !42 + %1348 = or <4 x i1> %1336, %1347, !dbg !43 + %1349 = xor <4 x i32> %1345, %1341, !dbg !45 + %1350 = select <4 x i1> %1348, <4 x i32> %1349, <4 x i32> zeroinitializer, !dbg !46 + %1351 = xor <4 x i32> %1350, %1211, !dbg !47 + %1352 = insertelement <4 x i1> poison, i1 %18, i64 0, !dbg !48 + %1353 = shufflevector <4 x i1> %1352, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !48 + %1354 = insertelement <4 x i32> poison, i32 %65, i64 0, !dbg !48 + %1355 = insertelement <4 x i32> %1354, i32 %63, i64 1, !dbg !48 + %1356 = insertelement <4 x i32> %1355, i32 %67, i64 2, !dbg !48 + %1357 = insertelement <4 x i32> %1356, i32 %69, i64 3, !dbg !48 + %1358 = sext <4 x i32> %1357 to <4 x i64>, !dbg !48 + %1359 = select <4 x i1> %1353, <4 x i64> %1358, <4 x i64> zeroinitializer, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %1360 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1359), !dbg !51 + %1361 = and i32 %12, 3, !dbg !49 + %1362 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %202, !dbg !49 + %1363 = getelementptr i64, ptr addrspace(3) %1362, i32 %1361, !dbg !49 + %1364 = insertelement <1 x i64> poison, i64 %1360, i64 0, !dbg !49 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1363, <1 x i64> %1364, i1 true) #5, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %1365 = icmp samesign ult i32 %11, 128, !dbg !49 + %1366 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !49 + %1367 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %1366, i1 %1365) #5, !dbg !49 + %extelt.offset = lshr i64 %1367, 32, !dbg !49 + %1368 = trunc nuw i64 %extelt.offset to i32, !dbg !49 + %1369 = trunc i64 %1367 to i32, !dbg !49 + %1370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1369, i32 2, i32 31), !dbg !49 + %1371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1368, i32 2, i32 31), !dbg !49 + %1372 = insertelement <2 x i32> poison, i32 %1370, i64 0, !dbg !49 + %1373 = insertelement <2 x i32> %1372, i32 %1371, i64 1, !dbg !49 + %1374 = bitcast <2 x i32> %1373 to i64, !dbg !49 + %1375 = add i64 %1367, %1374, !dbg !51 + %extelt.offset62 = lshr i64 %1375, 32, !dbg !49 + %1376 = trunc nuw i64 %extelt.offset62 to i32, !dbg !49 + %1377 = trunc i64 %1375 to i32, !dbg !49 + %1378 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1377, i32 1, i32 31), !dbg !49 + %1379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1376, i32 1, i32 31), !dbg !49 + %1380 = insertelement <2 x i32> poison, i32 %1378, i64 0, !dbg !49 + %1381 = insertelement <2 x i32> %1380, i32 %1379, i64 1, !dbg !49 + %1382 = bitcast <2 x i32> %1381 to i64, !dbg !49 + %1383 = add i64 %1375, %1382, !dbg !51 + %1384 = and i32 %11, 899, !dbg !49 + %1385 = icmp eq i32 %1384, 0, !dbg !49 + %1386 = insertelement <1 x i64> poison, i64 %1383, i64 0, !dbg !49 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1366, <1 x i64> %1386, i1 %1385) #5, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %1387 = load i64, ptr addrspace(3) %1362, align 16, !dbg !49 + %1388 = trunc i64 %1387 to i32, !dbg !52 + %1389 = icmp slt i64 %3, 2, !dbg !53 + %1390 = icmp sgt i64 %3, 1, !dbg !54 + %1391 = select i1 %1390, i64 %3, i64 0, !dbg !55 + %1392 = zext i1 %1389 to i64, !dbg !56 + %1393 = add i64 %1391, %1392, !dbg !57 + %1394 = shl i64 %1393, 4, !dbg !16 + %1395 = mul i64 %1394, %31, !dbg !58 + %.idx = shl nsw i64 %.decomposed82, 6, !dbg !59 + %1396 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx, !dbg !59 + %1397 = getelementptr i32, ptr addrspace(1) %1396, i64 %37, !dbg !59 + %1398 = getelementptr i32, ptr addrspace(1) %1397, i64 %1395, !dbg !59 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %1399 = shl nuw nsw i32 %11, 4, !dbg !60 + %1400 = and i32 %1399, 2032, !dbg !60 + %1401 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1400, !dbg !60 + store <4 x i32> %1351, ptr addrspace(3) %1401, align 16, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %1402 = shl nuw nsw i32 %11, 6, !dbg !60 + %1403 = and i32 %1402, 1536, !dbg !60 + %1404 = and i32 %1399, 112, !dbg !60 + %1405 = shl nuw nsw i32 %20, 2, !dbg !60 + %1406 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1403, !dbg !60 + %1407 = getelementptr inbounds nuw i8, ptr addrspace(3) %1406, i32 %1404, !dbg !60 + %1408 = getelementptr inbounds nuw i8, ptr addrspace(3) %1407, i32 %1405, !dbg !60 + %1409 = ptrtoint ptr addrspace(3) %1408 to i32, !dbg !60 + %1410 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1409) #5, !dbg !60 + %1411 = extractvalue { i32, i32, i32, i32 } %1410, 0, !dbg !60 + %1412 = extractvalue { i32, i32, i32, i32 } %1410, 1, !dbg !60 + %1413 = extractvalue { i32, i32, i32, i32 } %1410, 2, !dbg !60 + %1414 = extractvalue { i32, i32, i32, i32 } %1410, 3, !dbg !60 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1411, i32 %1412, i32 %1413, i32 %1414, ptr addrspace(1) %1398, i1 %19) #5, !dbg !60 + %1415 = mul i64 %29, %1393, !dbg !61 + %1416 = getelementptr i32, ptr addrspace(1) %2, i64 %.decomposed, !dbg !62 + %1417 = getelementptr i32, ptr addrspace(1) %1416, i64 %1415, !dbg !62 + %1418 = icmp eq i32 %20, 0, !dbg !63 + %1419 = and i1 %1418, %18, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1388, ptr addrspace(1) %1417, i1 %1419) #5, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 25, column: 21, scope: !4) +!12 = !DILocation(line: 26, column: 38, scope: !4) +!13 = !DILocation(line: 32, column: 19, scope: !4) +!14 = !DILocation(line: 33, column: 19, scope: !4) +!15 = !DILocation(line: 35, column: 37, scope: !4) +!16 = !DILocation(line: 35, column: 45, scope: !4) +!17 = !DILocation(line: 35, column: 54, scope: !4) +!18 = !DILocation(line: 35, column: 64, scope: !4) +!19 = !DILocation(line: 35, column: 68, scope: !4) +!20 = !DILocation(line: 35, column: 30, scope: !4) +!21 = !DILocation(line: 35, column: 73, scope: !4) +!22 = !DILocation(line: 627, column: 44, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 40, column: 67, scope: !4) +!26 = !DILocation(line: 537, column: 21, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !25) +!28 = !DILocation(line: 291, column: 36, scope: !29, inlinedAt: !25) +!29 = distinct !DILexicalBlockFile(scope: !4, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!31 = !DILocation(line: 261, column: 15, scope: !29, inlinedAt: !25) +!32 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !25) +!33 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !25) +!34 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !25) +!35 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !25) +!36 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !25) +!37 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !25) +!38 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !25) +!39 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !25) +!40 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !25) +!41 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !25) +!42 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !25) +!43 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !25) +!44 = !DILocation(line: 599, column: 19, scope: !23, inlinedAt: !25) +!45 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !25) +!46 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !25) +!47 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !25) +!48 = !DILocation(line: 43, column: 34, scope: !4) +!49 = !DILocation(line: 291, column: 36, scope: !29, inlinedAt: !50) +!50 = !DILocation(line: 44, column: 26, scope: !4) +!51 = !DILocation(line: 261, column: 15, scope: !29, inlinedAt: !50) +!52 = !DILocation(line: 47, column: 21, scope: !4) +!53 = !DILocation(line: 48, column: 62, scope: !4) +!54 = !DILocation(line: 48, column: 88, scope: !4) +!55 = !DILocation(line: 48, column: 79, scope: !4) +!56 = !DILocation(line: 48, scope: !4) +!57 = !DILocation(line: 48, column: 70, scope: !4) +!58 = !DILocation(line: 48, column: 47, scope: !4) +!59 = !DILocation(line: 48, column: 25, scope: !4) +!60 = !DILocation(line: 48, column: 102, scope: !4) +!61 = !DILocation(line: 49, column: 34, scope: !4) +!62 = !DILocation(line: 49, column: 25, scope: !4) +!63 = !DILocation(line: 49, column: 89, scope: !4) +!64 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..179c8c14af66e943e035f443ff43ec99d5afbaae --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,2905 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u64 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_7 +) +.reqntid 128 +{ + .reg .pred %p<518>; + .reg .b32 %r<1199>; + .reg .b64 %rd<98>; + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3]; +$L__tmp0: + .loc 1 23 28 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:23:28 + mov.u32 %r11, %ctaid.x; + .loc 1 23 33 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:23:33 + shl.b32 %r12, %r11, 5; + .loc 1 24 44 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:24:44 + mov.u32 %r1, %tid.x; + and.b32 %r3, %r1, 31; + bfe.u32 %r13, %r1, 2, 5; + .loc 1 24 23 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:24:23 + or.b32 %r14, %r12, %r3; + or.b32 %r15, %r13, %r12; + .loc 1 26 38 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:26:38 + bfe.u32 %r5, %r1, 5, 2; + shl.b32 %r16, %r1, 2; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + cvt.s64.s32 %rd1, %r14; + cvt.s64.s32 %rd2, %r15; + .loc 1 33 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:33:19 + or.b64 %rd16, %rd1, %rd14; + and.b64 %rd17, %rd16, -4294967296; + setp.ne.b64 %p1, %rd17, 0; + cvt.u32.u64 %r1198, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd96, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r17, %rd14; + div.u32 %r19, %r1198, %r17; + cvt.u64.u32 %rd96, %r19; +$L__BB0_3: + .loc 1 0 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:0:19 + ld.param.b32 %r10, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4]; + ld.param.b64 %rd13, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + ld.param.b64 %rd12, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + shr.u32 %r2, %r1, 5; + and.b32 %r4, %r1, 96; + or.b32 %r6, %r5, 4; + or.b32 %r7, %r5, 8; + or.b32 %r8, %r5, 12; + and.b32 %r9, %r16, 12; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + mul.lo.s64 %rd18, %rd96, %rd14; + sub.s64 %rd7, %rd1, %rd18; + .loc 1 33 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:33:19 + or.b64 %rd19, %rd2, %rd14; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p2, %rd20, 0; + cvt.u32.u64 %r1197, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd97, %rd2, %rd14; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r20, %rd14; + div.u32 %r22, %r1197, %r20; + cvt.u64.u32 %rd97, %r22; +$L__BB0_6: + .loc 1 25 21 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:25:21 + setp.lt.s32 %p346, %r1197, %r10; + setp.lt.s32 %p3, %r1198, %r10; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + mul.lo.s64 %rd38, %rd97, %rd14; + sub.s64 %rd39, %rd2, %rd38; + .loc 1 35 37 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:37 + cvt.u64.u32 %rd40, %r5; + cvt.u64.u32 %rd41, %r6; + cvt.u64.u32 %rd42, %r7; + cvt.u64.u32 %rd43, %r8; + .loc 1 35 54 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:54 + mul.lo.s64 %rd44, %rd14, %rd40; + mul.lo.s64 %rd45, %rd14, %rd41; + mul.lo.s64 %rd46, %rd14, %rd42; + mul.lo.s64 %rd47, %rd14, %rd43; + .loc 1 35 68 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:68 + mul.lo.s64 %rd48, %rd14, %rd96; + .loc 1 35 30 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:30 + shl.b64 %rd49, %rd7, 2; + add.s64 %rd50, %rd11, %rd49; + mad.wide.u32 %rd51, %r5, 4, %rd50; + shl.b64 %rd52, %rd96, 6; + add.s64 %rd53, %rd51, %rd52; + shl.b64 %rd54, %rd44, 2; + add.s64 %rd55, %rd53, %rd54; + shl.b64 %rd56, %rd48, 6; + add.s64 %rd22, %rd55, %rd56; + shl.b64 %rd57, %rd45, 2; + add.s64 %rd58, %rd53, %rd57; + add.s64 %rd59, %rd58, %rd56; + add.s64 %rd25, %rd59, 16; + shl.b64 %rd60, %rd46, 2; + add.s64 %rd61, %rd53, %rd60; + add.s64 %rd62, %rd61, %rd56; + add.s64 %rd28, %rd62, 32; + shl.b64 %rd63, %rd47, 2; + add.s64 %rd64, %rd53, %rd63; + add.s64 %rd65, %rd64, %rd56; + add.s64 %rd31, %rd65, 48; + .loc 1 35 73 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:73 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r23 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd26, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r24, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r24 }, [ %rd25 + 0 ], %rd26; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd29, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd28 + 0 ], %rd29; + // end inline asm + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r26, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r26 }, [ %rd31 + 0 ], %rd32; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shr.u32 %r714, %r1, 6; + bfe.u32 %r715, %r1, 6, 1; + and.b32 %r716, %r2, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r717, %r716, 1; + xor.b32 %r718, %r715, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r28, %r23, %r717; + mul.lo.s32 %r30, %r24, %r717; + mul.lo.s32 %r32, %r25, %r717; + mul.lo.s32 %r34, %r26, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shl.b32 %r719, %r715, 3; + mov.b32 %r720, global_smem; + add.s32 %r721, %r720, %r719; + shl.b32 %r722, %r3, 6; + add.s32 %r723, %r721, %r722; + shl.b32 %r724, %r716, 2; + add.s32 %r27, %r723, %r724; + mov.pred %p7, -1; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r28; + // end inline asm + add.s32 %r29, %r27, 16; + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r30; + // end inline asm + add.s32 %r31, %r27, 32; + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r32; + // end inline asm + add.s32 %r33, %r27, 48; + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r34; + // end inline asm + bar.sync 0; + setp.lt.u32 %p11, %r1, 512; + add.s32 %r36, %r720, %r16; + // begin inline asm + @%p11 ld.shared.b32 %r35, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r726, %r35, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r38, %r726, %r35; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r727, %r1, 513; + setp.eq.b32 %p12, %r727, 0; + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r38; + // end inline asm + add.s32 %r40, %r36, 512; + // begin inline asm + @%p11 ld.shared.b32 %r39, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r728, %r39, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r42, %r728, %r39; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r42; + // end inline asm + add.s32 %r44, %r36, 1024; + // begin inline asm + @%p11 ld.shared.b32 %r43, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r729, %r43, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r46, %r729, %r43; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r46; + // end inline asm + add.s32 %r48, %r36, 1536; + // begin inline asm + @%p11 ld.shared.b32 %r47, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r730, %r47, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r50, %r730, %r47; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r50; + // end inline asm + bar.sync 0; + ld.shared.b32 %r731, [%r723]; + ld.shared.b32 %r732, [%r723+16]; + ld.shared.b32 %r733, [%r723+32]; + ld.shared.b32 %r734, [%r723+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r52, %r23, %r716; + mul.lo.s32 %r54, %r24, %r716; + mul.lo.s32 %r56, %r25, %r716; + mul.lo.s32 %r58, %r26, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r52; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r54; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r56; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r58; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r59, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r735, %r59, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r62, %r735, %r59; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r62; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r63, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r736, %r63, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r66, %r736, %r63; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r66; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r67, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r737, %r67, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r70, %r737, %r67; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r70; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r71, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r738, %r71, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r74, %r738, %r71; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r74; + // end inline asm + bar.sync 0; + ld.shared.b32 %r739, [%r723]; + ld.shared.b32 %r740, [%r723+16]; + ld.shared.b32 %r741, [%r723+32]; + ld.shared.b32 %r742, [%r723+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r76, %r717, %r5; + shl.b32 %r743, %r717, 2; + or.b32 %r78, %r76, %r743; + add.s32 %r80, %r78, %r743; + add.s32 %r82, %r80, %r743; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r76; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r78; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r80; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r82; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r83, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r744, %r83, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r86, %r744, %r83; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r86; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r87, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r745, %r87, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r90, %r745, %r87; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r90; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r91, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r746, %r91, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r94, %r746, %r91; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r94; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r95, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r747, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r98, %r747, %r95; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r98; + // end inline asm + bar.sync 0; + ld.shared.b32 %r748, [%r723]; + ld.shared.b32 %r749, [%r723+16]; + ld.shared.b32 %r750, [%r723+32]; + ld.shared.b32 %r751, [%r723+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r100, %r5, %r716; + or.b32 %r102, %r100, %r724; + add.s32 %r104, %r102, %r724; + add.s32 %r106, %r104, %r724; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r100; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r102; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r104; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r106; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r107, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r752, %r107, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r110, %r752, %r107; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r110; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r111, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r753, %r111, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r114, %r753, %r111; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r114; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r115, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r754, %r115, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r118, %r754, %r115; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r118; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r119, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r755, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r122, %r755, %r119; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r122; + // end inline asm + bar.sync 0; + ld.shared.b32 %r756, [%r723]; + ld.shared.b32 %r757, [%r723+16]; + ld.shared.b32 %r758, [%r723+32]; + ld.shared.b32 %r759, [%r723+48]; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r760, %r714, 1; + setp.ne.b32 %p348, %r760, 0; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + shl.b32 %r761, %r716, 10; + shl.b32 %r762, %r3, 5; + or.b32 %r763, %r761, %r762; + add.s32 %r764, %r720, %r763; + shl.b32 %r765, %r715, 2; + add.s32 %r123, %r764, %r765; + add.s32 %r766, %r764, 8; + add.s32 %r125, %r766, %r765; + add.s32 %r767, %r764, 16; + add.s32 %r127, %r767, %r765; + add.s32 %r768, %r764, 24; + add.s32 %r129, %r768, %r765; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.ge.s32 %p349, %r731, %r739; + setp.ge.s32 %p350, %r732, %r740; + setp.ne.b32 %p351, %r732, %r740; + setp.ne.b32 %p352, %r731, %r739; + setp.le.s32 %p353, %r749, %r757; + setp.le.s32 %p354, %r748, %r756; + or.pred %p355, %p352, %p354; + or.pred %p356, %p351, %p353; + and.pred %p357, %p350, %p356; + and.pred %p358, %p349, %p355; + xor.pred %p359, %p358, %p348; + xor.pred %p360, %p357, %p348; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r769, %r756, %r748; + xor.b32 %r770, %r740, %r732; + xor.b32 %r771, %r757, %r749; + xor.b32 %r772, %r739, %r731; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r773, 0, %r770, %p360; + selp.b32 %r774, 0, %r769, %p359; + selp.b32 %r775, 0, %r772, %p359; + selp.b32 %r776, 0, %r771, %p360; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r777, %r774, %r5; + xor.b32 %r778, %r773, %r24; + xor.b32 %r779, %r776, %r6; + xor.b32 %r780, %r775, %r23; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r124, %r780, %r718; + mul.lo.s32 %r126, %r778, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r124; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r126; + // end inline asm + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r148, %r780, %r715; + mul.lo.s32 %r150, %r778, %r715; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r172, %r777, %r718; + mul.lo.s32 %r174, %r779, %r718; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r196, %r777, %r715; + mul.lo.s32 %r198, %r779, %r715; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.ge.s32 %p361, %r733, %r741; + setp.ge.s32 %p362, %r734, %r742; + setp.ne.b32 %p363, %r734, %r742; + setp.ne.b32 %p364, %r733, %r741; + setp.le.s32 %p365, %r751, %r759; + setp.le.s32 %p366, %r750, %r758; + or.pred %p367, %p364, %p366; + or.pred %p368, %p363, %p365; + and.pred %p369, %p362, %p368; + and.pred %p370, %p361, %p367; + xor.pred %p371, %p370, %p348; + xor.pred %p372, %p369, %p348; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r781, %r758, %r750; + xor.b32 %r782, %r742, %r734; + xor.b32 %r783, %r759, %r751; + xor.b32 %r784, %r741, %r733; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r785, 0, %r782, %p372; + selp.b32 %r786, 0, %r781, %p371; + selp.b32 %r787, 0, %r784, %p371; + selp.b32 %r788, 0, %r783, %p372; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r789, %r786, %r7; + xor.b32 %r790, %r785, %r26; + xor.b32 %r791, %r788, %r8; + xor.b32 %r792, %r787, %r25; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r128, %r792, %r718; + mul.lo.s32 %r130, %r790, %r718; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r152, %r792, %r715; + mul.lo.s32 %r154, %r790, %r715; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r176, %r789, %r718; + mul.lo.s32 %r178, %r791, %r718; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r200, %r789, %r715; + mul.lo.s32 %r202, %r791, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r128; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r130; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r131, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r793, %r131, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r134, %r793, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r134; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r135, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r794, %r135, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r138, %r794, %r135; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r138; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r139, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r795, %r139, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r142, %r795, %r139; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r142; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r143, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r796, %r143, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r146, %r796, %r143; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r146; + // end inline asm + bar.sync 0; + ld.shared.b32 %r797, [%r764]; + ld.shared.b32 %r798, [%r764+8]; + ld.shared.b32 %r799, [%r764+16]; + ld.shared.b32 %r800, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r148; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r150; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r152; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r154; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r155, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r801, %r155, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r158, %r801, %r155; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r158; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r159, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r802, %r159, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r162, %r802, %r159; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r162; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r163, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r803, %r163, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r166, %r803, %r163; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r166; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r167, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r804, %r167, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r170, %r804, %r167; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r170; + // end inline asm + bar.sync 0; + ld.shared.b32 %r805, [%r764]; + ld.shared.b32 %r806, [%r764+8]; + ld.shared.b32 %r807, [%r764+16]; + ld.shared.b32 %r808, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r172; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r174; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r176; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r178; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r179, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r809, %r179, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r182, %r809, %r179; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r182; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r183, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r810, %r183, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r186, %r810, %r183; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r186; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r187, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r811, %r187, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r190, %r811, %r187; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r190; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r191, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r812, %r191, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r194, %r812, %r191; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r194; + // end inline asm + bar.sync 0; + ld.shared.b32 %r813, [%r764]; + ld.shared.b32 %r814, [%r764+8]; + ld.shared.b32 %r815, [%r764+16]; + ld.shared.b32 %r816, [%r764+24]; + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r196; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r198; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r200; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r202; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r203, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r817, %r203, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r206, %r817, %r203; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r206; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r207, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r818, %r207, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r210, %r818, %r207; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r210; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r211, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r819, %r211, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r214, %r819, %r211; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r214; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r215, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r820, %r215, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r218, %r820, %r215; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r218; + // end inline asm + bar.sync 0; + ld.shared.b32 %r821, [%r764]; + ld.shared.b32 %r822, [%r764+8]; + ld.shared.b32 %r823, [%r764+16]; + ld.shared.b32 %r824, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p373, %r797, %r805; + setp.ge.s32 %p374, %r798, %r806; + setp.lt.s32 %p375, %r799, %r807; + setp.ge.s32 %p376, %r800, %r808; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p377, %r797, %r805; + setp.ne.b32 %p378, %r798, %r806; + setp.eq.b32 %p379, %r799, %r807; + setp.ne.b32 %p380, %r800, %r808; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p381, %r813, %r821; + setp.le.s32 %p382, %r814, %r822; + setp.gt.s32 %p383, %r815, %r823; + setp.le.s32 %p384, %r816, %r824; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p385, %p378, %p382; + and.pred %p386, %p377, %p381; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p387, %p380, %p384; + and.pred %p388, %p379, %p383; + and.pred %p389, %p374, %p385; + or.pred %p390, %p373, %p386; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p391, %p376, %p387; + or.pred %p392, %p375, %p388; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r825, %r821, %r813; + xor.b32 %r826, %r806, %r798; + xor.b32 %r827, %r822, %r814; + xor.b32 %r828, %r805, %r797; + xor.b32 %r829, %r823, %r815; + xor.b32 %r830, %r808, %r800; + xor.b32 %r831, %r824, %r816; + xor.b32 %r832, %r807, %r799; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r833, %r826, 0, %p389; + selp.b32 %r834, %r825, 0, %p390; + selp.b32 %r835, %r828, 0, %p390; + selp.b32 %r836, %r827, 0, %p389; + selp.b32 %r837, %r830, 0, %p391; + selp.b32 %r838, %r829, 0, %p392; + selp.b32 %r839, %r832, 0, %p392; + selp.b32 %r840, %r831, 0, %p391; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r841, %r834, %r777; + xor.b32 %r842, %r833, %r778; + xor.b32 %r843, %r836, %r779; + xor.b32 %r844, %r835, %r780; + xor.b32 %r845, %r838, %r789; + xor.b32 %r846, %r837, %r790; + xor.b32 %r847, %r840, %r791; + xor.b32 %r848, %r839, %r792; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r220, %r844, %r717; + mul.lo.s32 %r222, %r842, %r717; + mul.lo.s32 %r224, %r848, %r717; + mul.lo.s32 %r226, %r846, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r220; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r222; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r224; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r226; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r227, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r849, %r227, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r230, %r849, %r227; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r230; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r231, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r850, %r231, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r234, %r850, %r231; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r234; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r235, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r851, %r235, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r238, %r851, %r235; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r238; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r239, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r852, %r239, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r242, %r852, %r239; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r242; + // end inline asm + bar.sync 0; + ld.shared.b32 %r853, [%r723]; + ld.shared.b32 %r854, [%r723+16]; + ld.shared.b32 %r855, [%r723+32]; + ld.shared.b32 %r856, [%r723+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r244, %r844, %r716; + mul.lo.s32 %r246, %r842, %r716; + mul.lo.s32 %r248, %r848, %r716; + mul.lo.s32 %r250, %r846, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r244; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r246; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r248; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r250; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r251, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r857, %r251, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r254, %r857, %r251; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r254; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r255, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r858, %r255, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r258, %r858, %r255; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r258; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r259, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r859, %r259, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r262, %r859, %r259; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r262; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r263, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r860, %r263, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r266, %r860, %r263; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r266; + // end inline asm + bar.sync 0; + ld.shared.b32 %r861, [%r723]; + ld.shared.b32 %r862, [%r723+16]; + ld.shared.b32 %r863, [%r723+32]; + ld.shared.b32 %r864, [%r723+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r268, %r841, %r717; + mul.lo.s32 %r270, %r843, %r717; + mul.lo.s32 %r272, %r845, %r717; + mul.lo.s32 %r274, %r847, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r268; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r270; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r272; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r274; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r275, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r865, %r275, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r278, %r865, %r275; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r278; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r279, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r866, %r279, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r282, %r866, %r279; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r282; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r283, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r867, %r283, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r286, %r867, %r283; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r286; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r287, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r868, %r287, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r290, %r868, %r287; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r290; + // end inline asm + bar.sync 0; + ld.shared.b32 %r869, [%r723]; + ld.shared.b32 %r870, [%r723+16]; + ld.shared.b32 %r871, [%r723+32]; + ld.shared.b32 %r872, [%r723+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r292, %r841, %r716; + mul.lo.s32 %r294, %r843, %r716; + mul.lo.s32 %r296, %r845, %r716; + mul.lo.s32 %r298, %r847, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r292; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r294; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r296; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r298; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r299, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r873, %r299, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r302, %r873, %r299; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r302; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r303, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r874, %r303, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r306, %r874, %r303; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r306; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r307, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r875, %r307, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r310, %r875, %r307; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r310; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r311, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r876, %r311, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r314, %r876, %r311; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r314; + // end inline asm + bar.sync 0; + ld.shared.b32 %r877, [%r723]; + ld.shared.b32 %r878, [%r723+16]; + ld.shared.b32 %r879, [%r723+32]; + ld.shared.b32 %r880, [%r723+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p393, %r853, %r861; + setp.ge.s32 %p394, %r854, %r862; + setp.lt.s32 %p395, %r855, %r863; + setp.ge.s32 %p396, %r856, %r864; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p397, %r853, %r861; + setp.ne.b32 %p398, %r854, %r862; + setp.eq.b32 %p399, %r855, %r863; + setp.ne.b32 %p400, %r856, %r864; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p401, %r869, %r877; + setp.le.s32 %p402, %r870, %r878; + setp.gt.s32 %p403, %r871, %r879; + setp.le.s32 %p404, %r872, %r880; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p405, %p398, %p402; + and.pred %p406, %p397, %p401; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p407, %p400, %p404; + and.pred %p408, %p399, %p403; + and.pred %p409, %p394, %p405; + or.pred %p410, %p393, %p406; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p411, %p396, %p407; + or.pred %p412, %p395, %p408; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r881, %r877, %r869; + xor.b32 %r882, %r862, %r854; + xor.b32 %r883, %r878, %r870; + xor.b32 %r884, %r861, %r853; + xor.b32 %r885, %r879, %r871; + xor.b32 %r886, %r864, %r856; + xor.b32 %r887, %r880, %r872; + xor.b32 %r888, %r863, %r855; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r889, %r882, 0, %p409; + selp.b32 %r890, %r881, 0, %p410; + selp.b32 %r891, %r884, 0, %p410; + selp.b32 %r892, %r883, 0, %p409; + selp.b32 %r893, %r886, 0, %p411; + selp.b32 %r894, %r885, 0, %p412; + selp.b32 %r895, %r888, 0, %p412; + selp.b32 %r896, %r887, 0, %p411; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r897, %r890, %r841; + xor.b32 %r898, %r889, %r842; + xor.b32 %r899, %r892, %r843; + xor.b32 %r900, %r891, %r844; + xor.b32 %r901, %r894, %r845; + xor.b32 %r902, %r893, %r846; + xor.b32 %r903, %r896, %r847; + xor.b32 %r904, %r895, %r848; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p413, %r900, %r898; + setp.ge.s32 %p414, %r904, %r902; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p415, %r898, %r900; + setp.gt.s32 %p416, %r897, %r899; + setp.ne.b32 %p417, %r902, %r904; + setp.le.s32 %p418, %r901, %r903; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p419, %p415, %p416; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p420, %p417, %p418; + or.pred %p421, %p413, %p419; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p422, %p414, %p420; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r905, %r898, %r900; + xor.b32 %r906, %r902, %r904; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r907, %r905, 0, %p421; + selp.b32 %r908, %r906, 0, %p422; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r909, %r899, %r897; + xor.b32 %r910, %r903, %r901; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r911, %r909, 0, %p421; + selp.b32 %r912, %r910, 0, %p422; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r913, %r907, %r898; + xor.b32 %r914, %r907, %r900; + xor.b32 %r915, %r908, %r902; + xor.b32 %r916, %r908, %r904; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r917, %r911, %r899; + xor.b32 %r918, %r911, %r897; + xor.b32 %r919, %r912, %r903; + xor.b32 %r920, %r912, %r901; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r316, %r914, %r718; + mul.lo.s32 %r318, %r913, %r718; + mul.lo.s32 %r320, %r916, %r718; + mul.lo.s32 %r322, %r915, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r316; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r318; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r320; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r322; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r323, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r921, %r323, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r326, %r921, %r323; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r326; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r327, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r922, %r327, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r330, %r922, %r327; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r330; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r331, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r923, %r331, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r334, %r923, %r331; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r334; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r335, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r924, %r335, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r338, %r924, %r335; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r338; + // end inline asm + bar.sync 0; + ld.shared.b32 %r925, [%r764]; + ld.shared.b32 %r926, [%r764+8]; + ld.shared.b32 %r927, [%r764+16]; + ld.shared.b32 %r928, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r340, %r914, %r715; + mul.lo.s32 %r342, %r913, %r715; + mul.lo.s32 %r344, %r916, %r715; + mul.lo.s32 %r346, %r915, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r340; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r342; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r344; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r346; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r347, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r929, %r347, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r350, %r929, %r347; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r350; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r351, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r930, %r351, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r354, %r930, %r351; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r354; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r355, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r931, %r355, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r358, %r931, %r355; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r358; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r359, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r932, %r359, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r362, %r932, %r359; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r362; + // end inline asm + bar.sync 0; + ld.shared.b32 %r933, [%r764]; + ld.shared.b32 %r934, [%r764+8]; + ld.shared.b32 %r935, [%r764+16]; + ld.shared.b32 %r936, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r364, %r918, %r718; + mul.lo.s32 %r366, %r917, %r718; + mul.lo.s32 %r368, %r920, %r718; + mul.lo.s32 %r370, %r919, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r364; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r366; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r368; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r370; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r371, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r937, %r371, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r374, %r937, %r371; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r374; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r375, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r938, %r375, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r378, %r938, %r375; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r378; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r379, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r939, %r379, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r382, %r939, %r379; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r382; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r383, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r940, %r383, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r386, %r940, %r383; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r386; + // end inline asm + bar.sync 0; + ld.shared.b32 %r941, [%r764]; + ld.shared.b32 %r942, [%r764+8]; + ld.shared.b32 %r943, [%r764+16]; + ld.shared.b32 %r944, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r388, %r918, %r715; + mul.lo.s32 %r390, %r917, %r715; + mul.lo.s32 %r392, %r920, %r715; + mul.lo.s32 %r394, %r919, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r388; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r390; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r392; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r394; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r395, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r945, %r395, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r398, %r945, %r395; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r398; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r399, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r946, %r399, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r402, %r946, %r399; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r402; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r403, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r947, %r403, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r406, %r947, %r403; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r406; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r407, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r948, %r407, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r410, %r948, %r407; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r410; + // end inline asm + bar.sync 0; + ld.shared.b32 %r949, [%r764]; + ld.shared.b32 %r950, [%r764+8]; + ld.shared.b32 %r951, [%r764+16]; + ld.shared.b32 %r952, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p423, %r925, %r933; + setp.lt.s32 %p424, %r926, %r934; + setp.ge.s32 %p425, %r927, %r935; + setp.ge.s32 %p426, %r928, %r936; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p427, %r926, %r934; + setp.eq.b32 %p428, %r925, %r933; + setp.ne.b32 %p429, %r928, %r936; + setp.ne.b32 %p430, %r927, %r935; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p431, %r942, %r950; + setp.gt.s32 %p432, %r941, %r949; + setp.le.s32 %p433, %r944, %r952; + setp.le.s32 %p434, %r943, %r951; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p435, %p428, %p432; + and.pred %p436, %p427, %p431; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p437, %p430, %p434; + or.pred %p438, %p429, %p433; + or.pred %p439, %p424, %p436; + or.pred %p440, %p423, %p435; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p441, %p426, %p438; + and.pred %p442, %p425, %p437; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r953, %r934, %r926; + xor.b32 %r954, %r933, %r925; + xor.b32 %r955, %r936, %r928; + xor.b32 %r956, %r935, %r927; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r957, %r954, 0, %p440; + selp.b32 %r958, %r953, 0, %p439; + selp.b32 %r959, %r956, 0, %p442; + selp.b32 %r960, %r955, 0, %p441; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r961, %r958, %r913; + xor.b32 %r962, %r957, %r914; + xor.b32 %r963, %r960, %r915; + xor.b32 %r964, %r959, %r916; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r965, %r950, %r942; + xor.b32 %r966, %r949, %r941; + xor.b32 %r967, %r952, %r944; + xor.b32 %r968, %r951, %r943; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r969, %r966, 0, %p440; + selp.b32 %r970, %r965, 0, %p439; + selp.b32 %r971, %r968, 0, %p442; + selp.b32 %r972, %r967, 0, %p441; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r973, %r970, %r917; + xor.b32 %r974, %r969, %r918; + xor.b32 %r975, %r972, %r919; + xor.b32 %r976, %r971, %r920; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r412, %r962, %r717; + mul.lo.s32 %r414, %r961, %r717; + mul.lo.s32 %r416, %r964, %r717; + mul.lo.s32 %r418, %r963, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r412; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r414; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r416; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r418; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r419, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r977, %r419, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r422, %r977, %r419; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r422; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r423, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r978, %r423, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r426, %r978, %r423; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r426; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r427, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r979, %r427, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r430, %r979, %r427; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r430; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r431, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r980, %r431, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r434, %r980, %r431; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r434; + // end inline asm + bar.sync 0; + ld.shared.b32 %r981, [%r723]; + ld.shared.b32 %r982, [%r723+16]; + ld.shared.b32 %r983, [%r723+32]; + ld.shared.b32 %r984, [%r723+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r436, %r962, %r716; + mul.lo.s32 %r438, %r961, %r716; + mul.lo.s32 %r440, %r964, %r716; + mul.lo.s32 %r442, %r963, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r436; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r438; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r440; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r442; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r443, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r985, %r443, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r446, %r985, %r443; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r446; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r447, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r986, %r447, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r450, %r986, %r447; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r450; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r451, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r987, %r451, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r454, %r987, %r451; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r454; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r455, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r988, %r455, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r458, %r988, %r455; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r458; + // end inline asm + bar.sync 0; + ld.shared.b32 %r989, [%r723]; + ld.shared.b32 %r990, [%r723+16]; + ld.shared.b32 %r991, [%r723+32]; + ld.shared.b32 %r992, [%r723+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r460, %r974, %r717; + mul.lo.s32 %r462, %r973, %r717; + mul.lo.s32 %r464, %r976, %r717; + mul.lo.s32 %r466, %r975, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r460; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r462; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r464; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r466; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r467, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r993, %r467, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r470, %r993, %r467; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r470; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r471, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r994, %r471, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r474, %r994, %r471; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r474; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r475, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r995, %r475, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r478, %r995, %r475; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r478; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r479, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r996, %r479, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r482, %r996, %r479; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r482; + // end inline asm + bar.sync 0; + ld.shared.b32 %r997, [%r723]; + ld.shared.b32 %r998, [%r723+16]; + ld.shared.b32 %r999, [%r723+32]; + ld.shared.b32 %r1000, [%r723+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r484, %r974, %r716; + mul.lo.s32 %r486, %r973, %r716; + mul.lo.s32 %r488, %r976, %r716; + mul.lo.s32 %r490, %r975, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r484; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r486; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r488; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r490; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r491, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1001, %r491, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r494, %r1001, %r491; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r494; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r495, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1002, %r495, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r498, %r1002, %r495; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r498; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r499, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1003, %r499, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r502, %r1003, %r499; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r502; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r503, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1004, %r503, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r506, %r1004, %r503; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r506; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1005, [%r723]; + ld.shared.b32 %r1006, [%r723+16]; + ld.shared.b32 %r1007, [%r723+32]; + ld.shared.b32 %r1008, [%r723+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p443, %r982, %r990; + setp.lt.s32 %p444, %r981, %r989; + setp.ge.s32 %p445, %r984, %r992; + setp.ge.s32 %p446, %r983, %r991; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p447, %r981, %r989; + setp.eq.b32 %p448, %r982, %r990; + setp.ne.b32 %p449, %r983, %r991; + setp.ne.b32 %p450, %r984, %r992; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p451, %r997, %r1005; + setp.gt.s32 %p452, %r998, %r1006; + setp.le.s32 %p453, %r999, %r1007; + setp.le.s32 %p454, %r1000, %r1008; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p455, %p448, %p452; + and.pred %p456, %p447, %p451; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p457, %p450, %p454; + or.pred %p458, %p449, %p453; + or.pred %p459, %p444, %p456; + or.pred %p460, %p443, %p455; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p461, %p446, %p458; + and.pred %p462, %p445, %p457; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1009, %r989, %r981; + xor.b32 %r1010, %r990, %r982; + xor.b32 %r1011, %r991, %r983; + xor.b32 %r1012, %r992, %r984; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1013, %r1010, 0, %p460; + selp.b32 %r1014, %r1009, 0, %p459; + selp.b32 %r1015, %r1012, 0, %p462; + selp.b32 %r1016, %r1011, 0, %p461; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1017, %r1014, %r962; + xor.b32 %r1018, %r1013, %r961; + xor.b32 %r1019, %r1016, %r964; + xor.b32 %r1020, %r1015, %r963; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1021, %r1006, %r998; + xor.b32 %r1022, %r1005, %r997; + xor.b32 %r1023, %r1008, %r1000; + xor.b32 %r1024, %r1007, %r999; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1025, %r1022, 0, %p459; + selp.b32 %r1026, %r1021, 0, %p460; + selp.b32 %r1027, %r1024, 0, %p461; + selp.b32 %r1028, %r1023, 0, %p462; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1029, %r1026, %r973; + xor.b32 %r1030, %r1025, %r974; + xor.b32 %r1031, %r1028, %r975; + xor.b32 %r1032, %r1027, %r976; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p463, %r1018, %r1020; + setp.lt.s32 %p464, %r1017, %r1019; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p465, %r1017, %r1019; + setp.eq.b32 %p466, %r1018, %r1020; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p467, %r1030, %r1032; + setp.gt.s32 %p468, %r1029, %r1031; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p469, %p466, %p468; + and.pred %p470, %p465, %p467; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p471, %p464, %p470; + or.pred %p472, %p463, %p469; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1033, %r1019, %r1017; + xor.b32 %r1034, %r1020, %r1018; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1035, %r1034, 0, %p472; + selp.b32 %r1036, %r1033, 0, %p471; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1037, %r1036, %r1017; + xor.b32 %r1038, %r1035, %r1018; + xor.b32 %r1039, %r1036, %r1019; + xor.b32 %r1040, %r1035, %r1020; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1041, %r1038, %r1037; + xor.b32 %r1042, %r1040, %r1039; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1043, %r1031, %r1029; + xor.b32 %r1044, %r1032, %r1030; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1045, %r1044, 0, %p471; + selp.b32 %r1046, %r1043, 0, %p472; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1047, %r1046, %r1029; + xor.b32 %r1048, %r1045, %r1032; + xor.b32 %r1049, %r1046, %r1031; + xor.b32 %r1050, %r1045, %r1030; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p473, %r1037, %r1038; + setp.lt.s32 %p474, %r1039, %r1040; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p475, %r1039, %r1040; + setp.eq.b32 %p476, %r1037, %r1038; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p477, %r1048, %r1049; + setp.gt.s32 %p478, %r1050, %r1047; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p479, %p476, %p478; + and.pred %p480, %p475, %p477; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p481, %p474, %p480; + or.pred %p482, %p473, %p479; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1051, %r1041, 0, %p482; + selp.b32 %r1052, %r1042, 0, %p481; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1053, %r1051, %r1037; + xor.b32 %r1054, %r1051, %r1038; + xor.b32 %r1055, %r1052, %r1039; + xor.b32 %r1056, %r1052, %r1040; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1057, %r1030, %r1047; + xor.b32 %r1058, %r1031, %r1048; + xor.b32 %r1059, %r1058, %r1046; + xor.b32 %r1060, %r1057, %r1045; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1061, %r1060, 0, %p482; + selp.b32 %r1062, %r1059, 0, %p481; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1063, %r1062, %r1049; + xor.b32 %r1064, %r1062, %r1048; + xor.b32 %r1065, %r1061, %r1047; + xor.b32 %r1066, %r1061, %r1050; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r508, %r1053, %r718; + mul.lo.s32 %r510, %r1054, %r718; + mul.lo.s32 %r512, %r1055, %r718; + mul.lo.s32 %r514, %r1056, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r508; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r510; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r512; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r514; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r515, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1067, %r515, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r518, %r1067, %r515; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r518; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r519, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1068, %r519, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r522, %r1068, %r519; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r522; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r523, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1069, %r523, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r526, %r1069, %r523; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r526; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r527, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1070, %r527, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r530, %r1070, %r527; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r530; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1071, [%r764]; + ld.shared.b32 %r1072, [%r764+8]; + ld.shared.b32 %r1073, [%r764+16]; + ld.shared.b32 %r1074, [%r764+24]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r532, %r1053, %r715; + mul.lo.s32 %r534, %r1054, %r715; + mul.lo.s32 %r536, %r1055, %r715; + mul.lo.s32 %r538, %r1056, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r532; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r534; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r536; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r538; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r539, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1075, %r539, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r542, %r1075, %r539; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r542; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r543, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1076, %r543, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r546, %r1076, %r543; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r546; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r547, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1077, %r547, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r550, %r1077, %r547; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r550; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r551, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1078, %r551, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r554, %r1078, %r551; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r554; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1079, [%r764]; + ld.shared.b32 %r1080, [%r764+8]; + ld.shared.b32 %r1081, [%r764+16]; + ld.shared.b32 %r1082, [%r764+24]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r556, %r1066, %r718; + mul.lo.s32 %r558, %r1065, %r718; + mul.lo.s32 %r560, %r1064, %r718; + mul.lo.s32 %r562, %r1063, %r718; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r556; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r558; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r560; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r562; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r563, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1083, %r563, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r566, %r1083, %r563; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r566; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r567, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1084, %r567, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r570, %r1084, %r567; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r570; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r571, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1085, %r571, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r574, %r1085, %r571; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r574; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r575, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1086, %r575, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r578, %r1086, %r575; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r578; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1087, [%r764]; + ld.shared.b32 %r1088, [%r764+8]; + ld.shared.b32 %r1089, [%r764+16]; + ld.shared.b32 %r1090, [%r764+24]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r580, %r1066, %r715; + mul.lo.s32 %r582, %r1065, %r715; + mul.lo.s32 %r584, %r1064, %r715; + mul.lo.s32 %r586, %r1063, %r715; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r123 + 0 ], %r580; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r125 + 0 ], %r582; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r127 + 0 ], %r584; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r129 + 0 ], %r586; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r587, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1091, %r587, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r590, %r1091, %r587; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r590; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r591, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1092, %r591, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r594, %r1092, %r591; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r594; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r595, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1093, %r595, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r598, %r1093, %r595; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r598; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r599, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1094, %r599, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r602, %r1094, %r599; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r602; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1095, [%r764]; + ld.shared.b32 %r1096, [%r764+8]; + ld.shared.b32 %r1097, [%r764+16]; + ld.shared.b32 %r1098, [%r764+24]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p483, %r1071, %r1079; + setp.lt.s32 %p484, %r1072, %r1080; + setp.lt.s32 %p485, %r1073, %r1081; + setp.lt.s32 %p486, %r1074, %r1082; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p487, %r1074, %r1082; + setp.eq.b32 %p488, %r1073, %r1081; + setp.eq.b32 %p489, %r1072, %r1080; + setp.eq.b32 %p490, %r1071, %r1079; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p491, %r1090, %r1098; + setp.gt.s32 %p492, %r1089, %r1097; + setp.gt.s32 %p493, %r1088, %r1096; + setp.gt.s32 %p494, %r1087, %r1095; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p495, %p490, %p494; + and.pred %p496, %p489, %p493; + and.pred %p497, %p488, %p492; + and.pred %p498, %p487, %p491; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p499, %p486, %p498; + or.pred %p500, %p485, %p497; + or.pred %p501, %p484, %p496; + or.pred %p502, %p483, %p495; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1099, %r1079, %r1071; + xor.b32 %r1100, %r1080, %r1072; + xor.b32 %r1101, %r1081, %r1073; + xor.b32 %r1102, %r1082, %r1074; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1103, %r1099, 0, %p502; + selp.b32 %r1104, %r1100, 0, %p501; + selp.b32 %r1105, %r1101, 0, %p500; + selp.b32 %r1106, %r1102, 0, %p499; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1107, %r1103, %r1053; + xor.b32 %r1108, %r1104, %r1054; + xor.b32 %r1109, %r1105, %r1055; + xor.b32 %r1110, %r1106, %r1056; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1111, %r1098, %r1090; + xor.b32 %r1112, %r1097, %r1089; + xor.b32 %r1113, %r1096, %r1088; + xor.b32 %r1114, %r1095, %r1087; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1115, %r1114, 0, %p502; + selp.b32 %r1116, %r1113, 0, %p501; + selp.b32 %r1117, %r1112, 0, %p500; + selp.b32 %r1118, %r1111, 0, %p499; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1119, %r1118, %r1063; + xor.b32 %r1120, %r1117, %r1064; + xor.b32 %r1121, %r1116, %r1065; + xor.b32 %r1122, %r1115, %r1066; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r604, %r1107, %r717; + mul.lo.s32 %r606, %r1108, %r717; + mul.lo.s32 %r608, %r1109, %r717; + mul.lo.s32 %r610, %r1110, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r604; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r606; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r608; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r610; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r611, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1123, %r611, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r614, %r1123, %r611; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r614; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r615, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1124, %r615, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r618, %r1124, %r615; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r618; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r619, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1125, %r619, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r622, %r1125, %r619; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r622; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r623, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1126, %r623, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r626, %r1126, %r623; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r626; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1127, [%r723]; + ld.shared.b32 %r1128, [%r723+16]; + ld.shared.b32 %r1129, [%r723+32]; + ld.shared.b32 %r1130, [%r723+48]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r628, %r1107, %r716; + mul.lo.s32 %r630, %r1108, %r716; + mul.lo.s32 %r632, %r1109, %r716; + mul.lo.s32 %r634, %r1110, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r628; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r630; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r632; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r634; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r635, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1131, %r635, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r638, %r1131, %r635; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r638; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r639, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1132, %r639, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r642, %r1132, %r639; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r642; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r643, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1133, %r643, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r646, %r1133, %r643; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r646; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r647, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1134, %r647, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r650, %r1134, %r647; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r650; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1135, [%r723]; + ld.shared.b32 %r1136, [%r723+16]; + ld.shared.b32 %r1137, [%r723+32]; + ld.shared.b32 %r1138, [%r723+48]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r652, %r1122, %r717; + mul.lo.s32 %r654, %r1121, %r717; + mul.lo.s32 %r656, %r1120, %r717; + mul.lo.s32 %r658, %r1119, %r717; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r652; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r654; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r656; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r658; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r659, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1139, %r659, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r662, %r1139, %r659; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r662; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r663, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1140, %r663, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r666, %r1140, %r663; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r666; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r667, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1141, %r667, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r670, %r1141, %r667; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r670; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r671, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1142, %r671, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r674, %r1142, %r671; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r674; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1143, [%r723]; + ld.shared.b32 %r1144, [%r723+16]; + ld.shared.b32 %r1145, [%r723+32]; + ld.shared.b32 %r1146, [%r723+48]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r676, %r1122, %r716; + mul.lo.s32 %r678, %r1121, %r716; + mul.lo.s32 %r680, %r1120, %r716; + mul.lo.s32 %r682, %r1119, %r716; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p7 st.shared.b32 [ %r27 + 0 ], %r676; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r29 + 0 ], %r678; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r31 + 0 ], %r680; + // end inline asm + // begin inline asm + @%p7 st.shared.b32 [ %r33 + 0 ], %r682; + // end inline asm + bar.sync 0; + // begin inline asm + @%p11 ld.shared.b32 %r683, [ %r36 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1147, %r683, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r686, %r1147, %r683; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r36 + 0 ], %r686; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r687, [ %r40 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1148, %r687, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r690, %r1148, %r687; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r40 + 0 ], %r690; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r691, [ %r44 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1149, %r691, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r694, %r1149, %r691; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r44 + 0 ], %r694; + // end inline asm + // begin inline asm + @%p11 ld.shared.b32 %r695, [ %r48 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1150, %r695, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r698, %r1150, %r695; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p12 st.shared.b32 [ %r48 + 0 ], %r698; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1151, [%r723]; + ld.shared.b32 %r1152, [%r723+16]; + ld.shared.b32 %r1153, [%r723+32]; + ld.shared.b32 %r1154, [%r723+48]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p503, %r1130, %r1138; + setp.lt.s32 %p504, %r1129, %r1137; + setp.lt.s32 %p505, %r1128, %r1136; + setp.lt.s32 %p506, %r1127, %r1135; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p507, %r1130, %r1138; + setp.eq.b32 %p508, %r1129, %r1137; + setp.eq.b32 %p509, %r1128, %r1136; + setp.eq.b32 %p510, %r1127, %r1135; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p511, %r1146, %r1154; + setp.gt.s32 %p512, %r1145, %r1153; + setp.gt.s32 %p513, %r1144, %r1152; + setp.gt.s32 %p514, %r1143, %r1151; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1155, %r1154, %r1146; + xor.b32 %r1156, %r1153, %r1145; + xor.b32 %r1157, %r1152, %r1144; + xor.b32 %r1158, %r1151, %r1143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r1159, %r1158, 0, %p514; + selp.b32 %r1160, %r1159, 0, %p510; + selp.b32 %r1161, %r1158, %r1160, %p506; + selp.b32 %r1162, %r1157, 0, %p513; + selp.b32 %r1163, %r1162, 0, %p509; + selp.b32 %r1164, %r1157, %r1163, %p505; + selp.b32 %r1165, %r1156, 0, %p512; + selp.b32 %r1166, %r1165, 0, %p508; + selp.b32 %r1167, %r1156, %r1166, %p504; + selp.b32 %r1168, %r1155, 0, %p511; + selp.b32 %r1169, %r1168, 0, %p507; + selp.b32 %r1170, %r1155, %r1169, %p503; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r1171, %r1170, %r1119; + xor.b32 %r1172, %r1167, %r1120; + xor.b32 %r1173, %r1164, %r1121; + xor.b32 %r1174, %r1161, %r1122; +$L__tmp2: + .loc 1 43 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:43:34 + cvt.s64.s32 %rd66, %r25; + cvt.s64.s32 %rd67, %r24; + cvt.s64.s32 %rd68, %r26; + cvt.s64.s32 %rd69, %r23; + selp.b64 %rd70, %rd69, 0, %p3; + selp.b64 %rd71, %rd68, 0, %p3; + selp.b64 %rd72, %rd67, 0, %p3; + selp.b64 %rd73, %rd66, 0, %p3; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd74, %rd72, %rd73; + add.s64 %rd75, %rd70, %rd71; + add.s64 %rd33, %rd74, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + and.b32 %r1175, %r2, 3; + add.s32 %r1176, %r720, %r762; + shl.b32 %r1177, %r1175, 3; + add.s32 %r699, %r1176, %r1177; + // begin inline asm + @%p7 st.shared.b64 [ %r699 + 0 ], %rd33; + // end inline asm + bar.sync 0; + setp.lt.u32 %p344, %r1, 128; + shl.b32 %r1178, %r1, 3; + add.s32 %r700, %r720, %r1178; + // begin inline asm + @%p344 ld.shared.b64 %rd34, [ %r700 + 0 ]; + // end inline asm + mov.b64 {_, %r1179}, %rd34; + cvt.u32.u64 %r1180, %rd34; + shfl.sync.bfly.b32 %r1181, %r1180, 2, 31, -1; + shfl.sync.bfly.b32 %r1182, %r1179, 2, 31, -1; + cvt.u64.u32 %rd76, %r1181; + cvt.u64.u32 %rd77, %r1182; + shl.b64 %rd78, %rd77, 32; + or.b64 %rd79, %rd76, %rd78; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd80, %rd34, %rd79; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r1183}, %rd80; + cvt.u32.u64 %r1184, %rd80; + shfl.sync.bfly.b32 %r1185, %r1184, 1, 31, -1; + shfl.sync.bfly.b32 %r1186, %r1183, 1, 31, -1; + cvt.u64.u32 %rd81, %r1185; + cvt.u64.u32 %rd82, %r1186; + shl.b64 %rd83, %rd82, 32; + or.b64 %rd84, %rd81, %rd83; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd35, %rd80, %rd84; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + and.b32 %r1187, %r1, 899; + setp.eq.b32 %p345, %r1187, 0; + // begin inline asm + @%p345 st.shared.b64 [ %r700 + 0 ], %rd35; + // end inline asm + bar.sync 0; + ld.shared.b32 %r711, [%r1176]; +$L__tmp4: + .loc 1 48 62 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:62 + setp.lt.s64 %p515, %rd14, 2; + .loc 1 48 88 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:88 + setp.gt.s64 %p516, %rd14, 1; + .loc 1 48 79 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:79 + selp.b64 %rd85, %rd14, 0, %p516; + .loc 1 48 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48 + selp.b64 %rd86, 1, 0, %p515; + .loc 1 48 70 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:70 + add.s64 %rd87, %rd85, %rd86; + .loc 1 48 47 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:47 + mul.lo.s64 %rd88, %rd87, %rd97; + .loc 1 48 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:25 + shl.b64 %rd89, %rd39, 6; + add.s64 %rd90, %rd12, %rd89; + mad.wide.u32 %rd91, %r9, 4, %rd90; + shl.b64 %rd92, %rd88, 6; + add.s64 %rd36, %rd91, %rd92; + .loc 1 48 102 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:102 + bar.sync 0; + shl.b32 %r1188, %r1, 4; + and.b32 %r1189, %r1188, 2032; + add.s32 %r1190, %r720, %r1189; + st.shared.v4.b32 [%r1190], {%r1174, %r1173, %r1172, %r1171}; + bar.sync 0; + shl.b32 %r1191, %r1, 6; + and.b32 %r1192, %r1191, 1536; + and.b32 %r1193, %r1188, 112; + shl.b32 %r1194, %r4, 2; + add.s32 %r1195, %r720, %r1192; + add.s32 %r1196, %r1195, %r1193; + add.s32 %r706, %r1196, %r1194; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r707, %r708, %r709, %r710}, [%r706]; + // end inline asm + // begin inline asm + @%p346 st.global.v4.b32 [ %rd36 + 0 ], { %r707, %r708, %r709, %r710 }; + // end inline asm + .loc 1 49 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:34 + mul.lo.s64 %rd93, %rd96, %rd87; + .loc 1 49 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:25 + add.s64 %rd94, %rd13, %rd49; + shl.b64 %rd95, %rd93, 2; + add.s64 %rd37, %rd94, %rd95; + .loc 1 49 89 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:89 + setp.eq.b32 %p517, %r4, 0; + and.pred %p347, %p517, %p3; + // begin inline asm + @%p347 st.global.b32 [ %rd37 + 0 ], { %r711 }; + // end inline asm + .loc 1 49 4 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 101 +.b8 53 +.b8 52 +.b8 113 +.b8 100 +.b8 122 +.b8 117 +.b8 100 +.b8 54 +.b8 120 +.b8 115 +.b8 107 +.b8 104 +.b8 118 +.b8 106 +.b8 115 +.b8 117 +.b8 98 +.b8 122 +.b8 113 +.b8 98 +.b8 118 +.b8 105 +.b8 111 +.b8 98 +.b8 116 +.b8 111 +.b8 102 +.b8 105 +.b8 55 +.b8 114 +.b8 116 +.b8 108 +.b8 97 +.b8 51 +.b8 99 +.b8 122 +.b8 51 +.b8 102 +.b8 118 +.b8 117 +.b8 102 +.b8 109 +.b8 101 +.b8 107 +.b8 102 +.b8 121 +.b8 52 +.b8 113 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..b317cf463717d77fdee0fd189d4b3f511dd8a62a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1289 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc106 = loc(unknown) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc158 = loc("in_ptr0"(#loc)) +#loc159 = loc("out_ptr2"(#loc)) +#loc160 = loc("out_ptr3"(#loc)) +#loc161 = loc("ks0"(#loc)) +#loc162 = loc("xnumel"(#loc)) +#loc163 = loc("r0_numel"(#loc)) +#loc196 = loc("x"(#loc56)) +#loc197 = loc("idxs"(#loc56)) +#loc198 = loc("x"(#loc60)) +#loc199 = loc("idxs"(#loc60)) +#loc204 = loc("x"(#loc68)) +#loc205 = loc("idxs"(#loc68)) +#loc206 = loc("flip"(#loc68)) +#loc262 = loc("input"(#loc131)) +#loc263 = loc("a"(#loc135)) +#loc264 = loc("b"(#loc135)) +#loc266 = loc("x"(#loc140)) +#loc267 = loc("x"(#loc144)) +#loc268 = loc("input"(#loc153)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16 : i32 loc(#loc164) + %xoffset = tt.get_program_id x : i32 loc(#loc165) + %xoffset_1 = arith.constant 32 : i32 loc(#loc166) + %xoffset_2 = arith.constant 32 : i32 loc(#loc166) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc166) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc167) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc168) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<32x1xi32> loc(#loc169) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<32x1xi32> loc(#loc169) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc170) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<32x1xi32> loc(#loc170) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc171) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc172) + %r0_offset = arith.constant 0 : i32 loc(#loc173) + %r0_mask = arith.constant true loc(#loc174) + %r0_mask_9 = arith.constant dense : tensor<32x16xi1> loc(#loc174) + %x0 = arith.extsi %xindex_6 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc175) + %x0_10 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc175) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<32x1xi64> loc(#loc175) + %x1 = arith.extsi %xindex_6 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc176) + %x1_12 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc176) + %x1_13 = arith.divsi %x1, %x1_12 : tensor<32x1xi64> loc(#loc176) + %tmp0 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc177) + %tmp0_14 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc177) + %tmp0_15 = tt.broadcast %x0_11 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc177) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<32x16xi64> loc(#loc177) + %tmp0_17 = arith.constant 16 : i32 loc(#loc178) + %tmp0_18 = arith.constant 16 : i64 loc(#loc178) + %tmp0_19 = arith.constant dense<16> : tensor<32x1xi64> loc(#loc178) + %tmp0_20 = arith.muli %tmp0_19, %x1_13 : tensor<32x1xi64> loc(#loc178) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc179) + %tmp0_22 = arith.addi %tmp0_16, %tmp0_21 : tensor<32x16xi64> loc(#loc179) + %tmp0_23 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc180) + %tmp0_24 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc180) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_23 : tensor<1x16xi64> loc(#loc180) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc181) + %tmp0_27 = arith.addi %tmp0_22, %tmp0_26 : tensor<32x16xi64> loc(#loc181) + %tmp0_28 = arith.constant 16 : i32 loc(#loc182) + %tmp0_29 = arith.constant 16 : i64 loc(#loc182) + %tmp0_30 = arith.muli %tmp0_29, %ks0 : i64 loc(#loc182) + %tmp0_31 = tt.splat %tmp0_30 : i64 -> tensor<32x1xi64> loc(#loc183) + %tmp0_32 = arith.muli %tmp0_31, %x1_13 : tensor<32x1xi64> loc(#loc183) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc184) + %tmp0_34 = arith.addi %tmp0_27, %tmp0_33 : tensor<32x16xi64> loc(#loc184) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc185) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<32x16x!tt.ptr>, tensor<32x16xi64> loc(#loc185) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc186) + %tmp0_38 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc186) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc186) + %tmp0_40 = arith.fptosi %tmp0_39 : tensor<32x16xf32> to tensor<32x16xi32> loc(#loc186) + %tmp0_41 = tt.load %tmp0_36, %tmp0_38, %tmp0_40 evictionPolicy = evict_last : tensor<32x16x!tt.ptr> loc(#loc186) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc187) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc188) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_41, %tmp4) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc26) + %tmp7 = arith.extsi %tmp0_41 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc189) + %tmp10 = arith.constant 0 : i32 loc(#loc190) + %tmp10_42 = arith.constant 0 : i64 loc(#loc190) + %tmp10_43 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc190) + %tmp10_44 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc190) + %tmp10_45 = arith.select %tmp10_44, %tmp7, %tmp10_43 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc190) + %tmp11 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_45) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc191) + %tmp11_46 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc192) + %tmp12 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc193) + %tmp13 = arith.trunci %tmp12 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc194) + %tmp14 = arith.trunci %tmp11_46 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc195) + %c16_i32 = arith.constant 16 : i32 loc(#loc34) + %c16_i64 = arith.constant 16 : i64 loc(#loc34) + %cst = arith.constant dense<16> : tensor<32x1xi64> loc(#loc34) + %1 = arith.muli %cst, %x0_11 : tensor<32x1xi64> loc(#loc34) + %2 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc35) + %3 = tt.broadcast %2 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc35) + %4 = tt.broadcast %1 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc35) + %5 = arith.addi %3, %4 : tensor<32x16xi64> loc(#loc35) + %c16_i32_47 = arith.constant 16 : i32 loc(#loc36) + %c16_i64_48 = arith.constant 16 : i64 loc(#loc36) + %cst_49 = arith.constant dense<16> : tensor<32x1xi64> loc(#loc36) + %6 = arith.muli %cst_49, %x1_13 : tensor<32x1xi64> loc(#loc36) + %c1_i32 = arith.constant 1 : i32 loc(#loc37) + %7 = arith.extsi %c1_i32 : i32 to i64 loc(#loc37) + %8 = arith.cmpi sge, %7, %ks0 : i64 loc(#loc37) + %c1_i32_50 = arith.constant 1 : i32 loc(#loc38) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc38) + %9 = arith.extui %8 : i1 to i32 loc(#loc38) + %10 = arith.muli %c1_i32_51, %9 : i32 loc(#loc38) + %c1_i32_52 = arith.constant 1 : i32 loc(#loc39) + %11 = arith.extsi %c1_i32_52 : i32 to i64 loc(#loc39) + %12 = arith.cmpi sgt, %ks0, %11 : i64 loc(#loc39) + %13 = arith.extui %12 : i1 to i64 loc(#loc40) + %14 = arith.muli %ks0, %13 : i64 loc(#loc40) + %15 = arith.extsi %10 : i32 to i64 loc(#loc41) + %16 = arith.addi %15, %14 : i64 loc(#loc41) + %17 = tt.splat %16 : i64 -> tensor<32x1xi64> loc(#loc42) + %18 = arith.muli %6, %17 : tensor<32x1xi64> loc(#loc42) + %19 = tt.broadcast %18 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc43) + %20 = arith.addi %5, %19 : tensor<32x16xi64> loc(#loc43) + %21 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc44) + %22 = tt.addptr %21, %20 : tensor<32x16x!tt.ptr>, tensor<32x16xi64> loc(#loc44) + %23 = tt.broadcast %xmask_7 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc45) + tt.store %22, %tmp13, %23 : tensor<32x16x!tt.ptr> loc(#loc45) + %c1_i32_53 = arith.constant 1 : i32 loc(#loc46) + %24 = arith.extsi %c1_i32_53 : i32 to i64 loc(#loc46) + %25 = arith.cmpi sge, %24, %ks0 : i64 loc(#loc46) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc47) + %26 = arith.extui %25 : i1 to i32 loc(#loc47) + %27 = arith.muli %c1_i32_55, %26 : i32 loc(#loc47) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc48) + %28 = arith.extsi %c1_i32_56 : i32 to i64 loc(#loc48) + %29 = arith.cmpi sgt, %ks0, %28 : i64 loc(#loc48) + %30 = arith.extui %29 : i1 to i64 loc(#loc49) + %31 = arith.muli %ks0, %30 : i64 loc(#loc49) + %32 = arith.extsi %27 : i32 to i64 loc(#loc50) + %33 = arith.addi %32, %31 : i64 loc(#loc50) + %34 = tt.splat %33 : i64 -> tensor<32x1xi64> loc(#loc51) + %35 = arith.muli %x1_13, %34 : tensor<32x1xi64> loc(#loc51) + %36 = arith.addi %x0_11, %35 : tensor<32x1xi64> loc(#loc52) + %37 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc53) + %38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> loc(#loc53) + tt.store %38, %tmp14, %xmask_7 : tensor<32x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc56)), %idxs: tensor<32x16xi16> loc("idxs"(#loc56))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc57) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc57) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc57) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc57) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc58) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc59) + %5 = ub.poison : tensor<32x16xi32> loc(#loc59) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc59) + } loc(#loc56) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc60)), %idxs: tensor<32x16xi16> loc("idxs"(#loc60))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc66) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc67) + %2 = ub.poison : tensor<32x16xi32> loc(#loc67) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi16> loc("idxs"(#loc68)), %flip: tensor<32x16xi32> loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc221) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc222) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc223) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc223) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc224) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc225) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc226) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc227) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc228) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc228) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc229) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc230) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc231) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc239) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc239) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc240) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc245) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc246) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc247) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc248) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc249) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc250) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc251) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc252) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc252) + %cond_38 = arith.constant 0 : i32 loc(#loc253) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc253) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc258) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc259) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc260) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc261) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc131))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc132) + tt.return %0 : tensor<256x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc134) + tt.return %1 : tensor<256x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc135)), %b: i32 loc("b"(#loc135))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc136) + tt.return %0 : i32 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc138) + tt.return %1 : i32 loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc131))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc265) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc132) + tt.return %0 : tensor<256x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc134) + tt.return %1 : tensor<256x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc140))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc141) + %false = arith.constant false loc(#loc142) + tt.return %false : i1 loc(#loc142) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc143) + tt.return %1 : i1 loc(#loc143) + } loc(#loc140) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc144))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc145) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc146) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc146) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc146) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc146) + tt.return %4 : tensor<32x16xi32> loc(#loc147) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc148) + tt.return %5 : tensor<32x16xi32> loc(#loc148) + } loc(#loc144) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc150) + %cst = arith.constant dense : tensor<1xi1> loc(#loc150) + tt.return %cst : tensor<1xi1> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc152) + tt.return %0 : tensor<1xi1> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc153))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc154) + tt.return %0 : tensor<32x16xi32> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc156) + tt.return %1 : tensor<32x16xi32> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc150) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc150) + tt.return %cst : tensor<32x16xi32> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc152) + tt.return %0 : tensor<32x16xi32> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc153))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc154) + tt.return %0 : tensor<32x16xi16> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc156) + tt.return %1 : tensor<32x16xi16> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc150) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc150) + tt.return %cst : tensor<32x16xi16> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc152) + tt.return %0 : tensor<32x16xi16> loc(#loc152) + } loc(#loc149) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc60)), %idxs: tensor<32x16xi32> loc("idxs"(#loc60))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc66) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc67) + %3 = ub.poison : tensor<32x16xi32> loc(#loc67) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: tensor<32x16xi32> loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc131))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc132) + tt.return %0 : tensor<128x2xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc134) + tt.return %1 : tensor<128x2xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: tensor<32x16xi32> loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc60)), %idxs: tensor<32x16xi32> loc("idxs"(#loc60))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc66) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc67) + %4 = ub.poison : tensor<32x16xi32> loc(#loc67) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: tensor<32x16xi32> loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc131))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc132) + tt.return %0 : tensor<64x4xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc134) + tt.return %1 : tensor<64x4xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc60)), %idxs: tensor<32x16xi32> loc("idxs"(#loc60))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc269) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc65) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc66) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc67) + %5 = ub.poison : tensor<32x16xi32> loc(#loc67) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc131))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc132) + tt.return %0 : tensor<32x8xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc134) + tt.return %1 : tensor<32x8xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc68)), %idxs: tensor<32x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc130) + %5 = ub.poison : tensor<32x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc131))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc132) + tt.reduce.return %2 : i64 loc(#loc132) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc132) + tt.return %0 : tensor<32xi64> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc134) + tt.return %1 : tensor<32xi64> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc135)), %b: i64 loc("b"(#loc135))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc136) + tt.return %0 : i64 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc138) + tt.return %1 : i64 loc(#loc138) + } loc(#loc135) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":27:16) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":28:48) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":45:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":46:21) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:43) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:49) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:41) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:75) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:66) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:57) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc164 = loc("r0_numel"(#loc1)) +#loc165 = loc("xoffset"(#loc2)) +#loc166 = loc("xoffset"(#loc3)) +#loc167 = loc("xindex"(#loc4)) +#loc168 = loc("xindex"(#loc5)) +#loc169 = loc("xindex"(#loc6)) +#loc170 = loc("xmask"(#loc7)) +#loc171 = loc("r0_index"(#loc8)) +#loc172 = loc("r0_index"(#loc9)) +#loc173 = loc("r0_offset"(#loc10)) +#loc174 = loc("r0_mask"(#loc11)) +#loc175 = loc("x0"(#loc12)) +#loc176 = loc("x1"(#loc13)) +#loc177 = loc("tmp0"(#loc14)) +#loc178 = loc("tmp0"(#loc15)) +#loc179 = loc("tmp0"(#loc16)) +#loc180 = loc("tmp0"(#loc17)) +#loc181 = loc("tmp0"(#loc18)) +#loc182 = loc("tmp0"(#loc19)) +#loc183 = loc("tmp0"(#loc20)) +#loc184 = loc("tmp0"(#loc21)) +#loc185 = loc("tmp0"(#loc22)) +#loc186 = loc("tmp0"(#loc23)) +#loc187 = loc("tmp2"(#loc24)) +#loc188 = loc("tmp4"(#loc25)) +#loc189 = loc("tmp7"(#loc27)) +#loc190 = loc("tmp10"(#loc28)) +#loc191 = loc("tmp11"(#loc29)) +#loc192 = loc("tmp11"(#loc30)) +#loc193 = loc("tmp12"(#loc31)) +#loc194 = loc("tmp13"(#loc32)) +#loc195 = loc("tmp14"(#loc33)) +#loc200 = loc("flip"(#loc61)) +#loc201 = loc("flip"(#loc62)) +#loc202 = loc("flip"(#loc63)) +#loc203 = loc("flip"(#loc64)) +#loc207 = loc("y"(#loc69)) +#loc208 = loc("right_mask"(#loc70)) +#loc209 = loc("right_mask"(#loc71)) +#loc210 = loc("left_mask"(#loc72)) +#loc211 = loc("ileft"(#loc73)) +#loc212 = loc("ileft"(#loc74)) +#loc213 = loc("ileft"(#loc75)) +#loc214 = loc("ileft"(#loc76)) +#loc215 = loc("iright"(#loc77)) +#loc216 = loc("iright"(#loc78)) +#loc217 = loc("iright"(#loc79)) +#loc218 = loc("iright"(#loc80)) +#loc219 = loc("ileft"(#loc81)) +#loc220 = loc("iright"(#loc82)) +#loc221 = loc("y_idx"(#loc83)) +#loc222 = loc("left_idx"(#loc84)) +#loc223 = loc("left_idx"(#loc85)) +#loc224 = loc("left_idx"(#loc86)) +#loc225 = loc("left_idx"(#loc87)) +#loc226 = loc("left_idx"(#loc88)) +#loc227 = loc("right_idx"(#loc89)) +#loc228 = loc("right_idx"(#loc90)) +#loc229 = loc("right_idx"(#loc91)) +#loc230 = loc("right_idx"(#loc92)) +#loc231 = loc("right_idx"(#loc93)) +#loc232 = loc("left_idx"(#loc94)) +#loc233 = loc("right_idx"(#loc95)) +#loc234 = loc("left_valid_mask"(#loc96)) +#loc235 = loc("right_valid_mask"(#loc97)) +#loc236 = loc("left_isnan"(#loc98)) +#loc237 = loc("right_isnan"(#loc99)) +#loc238 = loc("cond"(#loc100)) +#loc239 = loc("cond"(#loc103)) +#loc240 = loc("cond"(#loc104)) +#loc241 = loc("cond"(#loc105)) +#loc242 = loc("eq"(#loc107)) +#loc243 = loc("eq"(#loc110)) +#loc244 = loc("eq"(#loc111)) +#loc245 = loc("cond"(#loc112)) +#loc246 = loc("cond"(#loc113)) +#loc247 = loc("cond"(#loc114)) +#loc248 = loc("cond"(#loc115)) +#loc249 = loc("cond"(#loc116)) +#loc250 = loc("cond"(#loc117)) +#loc251 = loc("cond"(#loc118)) +#loc252 = loc("cond"(#loc119)) +#loc253 = loc("cond"(#loc120)) +#loc254 = loc("ret"(#loc121)) +#loc255 = loc("ret"(#loc122)) +#loc256 = loc("ret"(#loc123)) +#loc257 = loc("ret"(#loc124)) +#loc258 = loc("new_idxs"(#loc125)) +#loc259 = loc("new_idxs"(#loc126)) +#loc260 = loc("new_idxs"(#loc127)) +#loc261 = loc("new_idxs"(#loc128)) +#loc265 = loc("input"(#loc139)) +#loc269 = loc("flip"(#loc157)) +#loc270 = loc("cond"(#loc238)) +#loc271 = loc("cond"(#loc241)) +#loc272 = loc("eq"(#loc242)) +#loc273 = loc("eq"(#loc244)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1e3901e7f9330a7bd9002a8180e2ab5d9a7356c6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,890 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 1], [0, 2]], block = []}> +#linear1 = #ttg.linear<{register = [[2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[0, 1, 0], [1, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[0, 0, 1], [0, 1, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[0, 0, 1], [0, 0, 2]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc90 = loc("in_ptr0"(#loc)) +#loc91 = loc("out_ptr2"(#loc)) +#loc92 = loc("out_ptr3"(#loc)) +#loc93 = loc("ks0"(#loc)) +#loc94 = loc("xnumel"(#loc)) +#loc95 = loc("r0_numel"(#loc)) +#loc117 = loc(callsite(#loc23 at #loc24)) +#loc123 = loc("ileft"(#loc32)) +#loc127 = loc("iright"(#loc37)) +#loc136 = loc("left_idx"(#loc46)) +#loc141 = loc("right_idx"(#loc51)) +#loc161 = loc("tmp11"(#loc71)) +#loc168 = loc(callsite(#loc28 at #loc117)) +#loc172 = loc(callsite(#loc1 at #loc161)) +#loc176 = loc(callsite(#loc123 at #loc168)) +#loc180 = loc(callsite(#loc127 at #loc168)) +#loc188 = loc(callsite(#loc136 at #loc168)) +#loc193 = loc(callsite(#loc141 at #loc168)) +#loc213 = loc(callsite(#loc1 at #loc176)) +#loc215 = loc(callsite(#loc1 at #loc180)) +#loc218 = loc(callsite(#loc1 at #loc188)) +#loc221 = loc(callsite(#loc1 at #loc193)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %cst_1 = arith.constant dense<16> : tensor<32x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<16> : tensor<32x1xi64, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_6 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc96) + %xoffset_8 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc97) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc98) + %xindex_9 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc98) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc98) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> loc(#loc98) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<32x1xi32, #blocked> loc(#loc99) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<32x1xi32, #blocked1> loc(#loc99) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<32x1xi32, #blocked> loc(#loc99) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<32x1xi32, #blocked1> loc(#loc99) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked> loc(#loc100) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<32x1xi32, #blocked1> loc(#loc100) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<32x1xi32, #blocked> loc(#loc100) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<32x1xi32, #blocked1> loc(#loc100) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc101) + %r0_index_19 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc101) + %r0_index_20 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc101) + %r0_index_21 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc101) + %r0_index_22 = tt.expand_dims %r0_index_19 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc101) + %r0_index_23 = tt.expand_dims %r0_index_20 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc101) + %x0 = arith.extsi %xindex_14 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> loc(#loc102) + %x0_24 = arith.extsi %xindex_15 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> loc(#loc102) + %x0_25 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked> loc(#loc102) + %x0_26 = tt.splat %ks0 : i64 -> tensor<32x1xi64, #blocked1> loc(#loc102) + %x0_27 = arith.remsi %x0, %x0_25 : tensor<32x1xi64, #blocked> loc(#loc102) + %x0_28 = arith.remsi %x0_24, %x0_26 : tensor<32x1xi64, #blocked1> loc(#loc102) + %x1 = arith.divsi %x0, %x0_25 : tensor<32x1xi64, #blocked> loc(#loc103) + %x1_29 = arith.divsi %x0_24, %x0_26 : tensor<32x1xi64, #blocked1> loc(#loc103) + %tmp0 = arith.extsi %r0_index_21 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc104) + %tmp0_30 = arith.extsi %r0_index_23 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1> loc(#loc104) + %tmp0_31 = tt.broadcast %tmp0 : tensor<1x16xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc104) + %tmp0_32 = tt.broadcast %tmp0_30 : tensor<1x16xi64, #blocked1> -> tensor<32x16xi64, #blocked1> loc(#loc104) + %tmp0_33 = tt.broadcast %x0_27 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc104) + %tmp0_34 = arith.addi %tmp0_31, %tmp0_33 : tensor<32x16xi64, #blocked> loc(#loc104) + %tmp0_35 = arith.muli %x1, %cst_1 : tensor<32x1xi64, #blocked> loc(#loc105) + %tmp0_36 = arith.muli %x1_29, %cst_2 : tensor<32x1xi64, #blocked1> loc(#loc105) + %tmp0_37 = tt.broadcast %tmp0_35 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc106) + %tmp0_38 = arith.addi %tmp0_34, %tmp0_37 : tensor<32x16xi64, #blocked> loc(#loc106) + %tmp0_39 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_40 = arith.muli %tmp0_39, %tmp0 : tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<1x16xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc108) + %tmp0_42 = arith.addi %tmp0_38, %tmp0_41 : tensor<32x16xi64, #blocked> loc(#loc108) + %tmp0_43 = arith.muli %ks0, %c16_i64 : i64 loc(#loc109) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<32x1xi64, #blocked> loc(#loc110) + %tmp0_45 = arith.muli %tmp0_44, %x1 : tensor<32x1xi64, #blocked> loc(#loc110) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<32x1xi64, #blocked> -> tensor<32x16xi64, #blocked> loc(#loc111) + %tmp0_47 = arith.addi %tmp0_42, %tmp0_46 : tensor<32x16xi64, #blocked> loc(#loc111) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc112) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi64, #blocked> loc(#loc112) + %tmp0_50 = tt.broadcast %xmask_17 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc113) + %tmp0_51 = tt.broadcast %xmask_18 : tensor<32x1xi1, #blocked1> -> tensor<32x16xi1, #blocked1> loc(#loc113) + %tmp0_52 = tt.load %tmp0_49, %tmp0_50, %cst_7 evictionPolicy = evict_last : tensor<32x16x!tt.ptr, #blocked> loc(#loc113) + %tmp2 = arith.trunci %r0_index_22 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc114) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<32x16xi16, #linear> loc(#loc115) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc165) + %flip_53 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc165) + %flip_54 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc165) + %flip_55 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc165) + %flip_56 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc165) + %flip_57 = tt.expand_dims %flip_53 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc165) + %flip_58 = tt.expand_dims %flip_54 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc165) + %flip_59 = tt.expand_dims %flip_55 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc165) + %flip_60 = tt.expand_dims %flip_56 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc165) + %flip_61 = tt.expand_dims %flip_57 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc165) + %flip_62 = tt.expand_dims %flip_58 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc165) + %flip_63 = tt.expand_dims %flip_59 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc165) + %flip_64 = tt.broadcast %flip_60 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc166) + %flip_65 = tt.reshape %flip_64 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #blocked> loc(#loc167) + %flip_66 = tt.reshape %flip_64 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc167) + %y = tt.reshape %tmp0_52 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #linear1> loc(#loc173) + %left_mask = arith.subi %cst_3, %flip_61 : tensor<1x2x1xi32, #linear1> loc(#loc174) + %left_mask_67 = arith.subi %cst_4, %flip_60 : tensor<1x2x1xi32, #linear2> loc(#loc174) + %left_mask_68 = arith.subi %cst_5, %flip_62 : tensor<1x2x1xi32, #linear3> loc(#loc174) + %left_mask_69 = arith.subi %cst_6, %flip_63 : tensor<1x2x1xi32, #linear4> loc(#loc174) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc175) + %ileft_70 = arith.muli %y, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc175) + %ileft_71 = "tt.reduce"(%ileft_70) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_72 = tt.expand_dims %ileft_71 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc177) + %ileft_73 = tt.broadcast %ileft_72 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc178) + %iright = tt.broadcast %flip_61 : tensor<1x2x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc179) + %iright_74 = arith.muli %y, %iright : tensor<256x2x1xi32, #linear1> loc(#loc179) + %iright_75 = "tt.reduce"(%iright_74) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_76 = tt.expand_dims %iright_75 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc181) + %iright_77 = tt.broadcast %iright_76 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc182) + %ileft_78 = tt.reshape %ileft_73 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc183) + %ileft_79 = tt.reshape %ileft_73 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_80 = tt.reshape %iright_77 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc184) + %iright_81 = tt.reshape %iright_77 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16, #linear> -> tensor<256x2x1xi16, #linear1> loc(#loc185) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc186) + %left_idx_82 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc187) + %left_idx_83 = arith.muli %y_idx, %left_idx_82 : tensor<256x2x1xi16, #linear1> loc(#loc187) + %input = arith.extsi %left_idx_83 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc216) + %left_idx_84 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc189) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc190) + %right_idx = arith.trunci %flip_61 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc191) + %right_idx_87 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<256x2x1xi16, #linear1> loc(#loc192) + %right_idx_88 = arith.muli %y_idx, %right_idx_87 : tensor<256x2x1xi16, #linear1> loc(#loc192) + %input_89 = arith.extsi %right_idx_88 : tensor<256x2x1xi16, #linear1> to tensor<256x2x1xi32, #linear1> loc(#loc219) + %right_idx_90 = "tt.reduce"(%input_89) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_91 = tt.expand_dims %right_idx_90 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc194) + %right_idx_92 = tt.broadcast %right_idx_91 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc195) + %left_idx_93 = tt.reshape %left_idx_86 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc196) + %left_idx_94 = tt.reshape %left_idx_86 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_95 = tt.reshape %right_idx_92 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #blocked> loc(#loc197) + %right_idx_96 = tt.reshape %right_idx_92 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond = arith.cmpi slt, %ileft_78, %iright_80 : tensor<32x16xi32, #blocked> loc(#loc198) + %cond_97 = arith.cmpi slt, %ileft_79, %iright_81 : tensor<32x16xi32, #linear> loc(#loc198) + %eq = arith.cmpi eq, %ileft_78, %iright_80 : tensor<32x16xi32, #blocked> loc(#loc199) + %eq_98 = arith.cmpi eq, %ileft_79, %iright_81 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_99 = arith.cmpi sgt, %left_idx_93, %right_idx_95 : tensor<32x16xi32, #blocked> loc(#loc200) + %cond_100 = arith.cmpi sgt, %left_idx_94, %right_idx_96 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_101 = arith.andi %eq, %cond_99 : tensor<32x16xi1, #blocked> loc(#loc201) + %cond_102 = arith.andi %eq_98, %cond_100 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_103 = arith.ori %cond, %cond_101 : tensor<32x16xi1, #blocked> loc(#loc202) + %cond_104 = arith.ori %cond_97, %cond_102 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_105 = arith.extui %cond_103 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc203) + %cond_106 = arith.extui %cond_104 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_107 = arith.xori %cond_105, %flip_65 : tensor<32x16xi32, #blocked> loc(#loc203) + %cond_108 = arith.xori %cond_106, %flip_66 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_109 = arith.cmpi ne, %cond_107, %cst_7 : tensor<32x16xi32, #blocked> loc(#loc204) + %cond_110 = arith.cmpi ne, %cond_108, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret = arith.xori %ileft_78, %iright_80 : tensor<32x16xi32, #blocked> loc(#loc205) + %ret_111 = arith.select %cond_109, %ret, %cst_7 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc206) + %ret_112 = arith.xori %tmp0_52, %ret_111 : tensor<32x16xi32, #blocked> loc(#loc207) + %ret_113 = ttg.convert_layout %ret_112 : tensor<32x16xi32, #blocked> -> tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs = arith.xori %left_idx_94, %right_idx_96 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_114 = arith.select %cond_110, %new_idxs, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_115 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc210) + %new_idxs_116 = tt.broadcast %new_idxs_115 : tensor<1x16xi32, #linear> -> tensor<32x16xi32, #linear> loc(#loc210) + %new_idxs_117 = arith.xori %new_idxs_116, %new_idxs_114 : tensor<32x16xi32, #linear> loc(#loc210) + %flip_118 = tt.broadcast %flip_62 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc166) + %flip_119 = tt.reshape %flip_118 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc167) + %y_120 = tt.reshape %ret_112 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #linear2> loc(#loc173) + %ileft_121 = tt.broadcast %left_mask_67 : tensor<1x2x1xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc175) + %ileft_122 = arith.muli %y_120, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc175) + %ileft_123 = "tt.reduce"(%ileft_122) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_124 = tt.expand_dims %ileft_123 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc177) + %ileft_125 = tt.broadcast %ileft_124 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc178) + %iright_126 = arith.muli %y_120, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc179) + %iright_127 = "tt.reduce"(%iright_126) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_128 = tt.expand_dims %iright_127 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc181) + %iright_129 = tt.broadcast %iright_128 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc182) + %ileft_130 = tt.reshape %ileft_125 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_131 = tt.reshape %iright_129 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_132 = tt.reshape %new_idxs_117 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc185) + %left_idx_133 = arith.muli %y_idx_132, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc187) + %left_idx_134 = "tt.reduce"(%left_idx_133) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_135 = tt.expand_dims %left_idx_134 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc189) + %left_idx_136 = tt.broadcast %left_idx_135 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc190) + %right_idx_137 = arith.muli %y_idx_132, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc192) + %right_idx_138 = "tt.reduce"(%right_idx_137) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_139 = tt.expand_dims %right_idx_138 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc194) + %right_idx_140 = tt.broadcast %right_idx_139 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc195) + %left_idx_141 = tt.reshape %left_idx_136 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_142 = tt.reshape %right_idx_140 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_143 = arith.cmpi slt, %ileft_130, %iright_131 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_144 = arith.cmpi eq, %ileft_130, %iright_131 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_145 = arith.cmpi sgt, %left_idx_141, %right_idx_142 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_146 = arith.andi %eq_144, %cond_145 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_147 = arith.ori %cond_143, %cond_146 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_148 = arith.extui %cond_147 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_149 = arith.xori %cond_148, %flip_119 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_150 = arith.cmpi ne, %cond_149, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret_151 = arith.xori %ileft_130, %iright_131 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_152 = arith.select %cond_150, %ret_151, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_153 = arith.xori %ret_113, %ret_152 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_154 = arith.xori %left_idx_141, %right_idx_142 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_155 = arith.select %cond_150, %new_idxs_154, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_156 = arith.xori %new_idxs_117, %new_idxs_155 : tensor<32x16xi32, #linear> loc(#loc210) + %y_157 = tt.reshape %ret_153 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc173) + %ileft_158 = arith.muli %y_157, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc175) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc177) + %ileft_161 = tt.broadcast %ileft_160 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc178) + %iright_162 = arith.muli %y_157, %iright : tensor<256x2x1xi32, #linear1> loc(#loc179) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc181) + %iright_165 = tt.broadcast %iright_164 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc182) + %ileft_166 = tt.reshape %ileft_161 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_167 = tt.reshape %iright_165 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_168 = tt.reshape %new_idxs_156 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc185) + %left_idx_169 = arith.muli %y_idx_168, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc187) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc189) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc190) + %right_idx_173 = arith.muli %y_idx_168, %iright : tensor<256x2x1xi32, #linear1> loc(#loc192) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc194) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc195) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_184 = arith.extui %cond_183 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_185 = arith.xori %cond_184, %flip_119 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_186 = arith.cmpi ne, %cond_185, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_188 = arith.select %cond_186, %ret_187, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_189 = arith.xori %ret_153, %ret_188 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_192 = arith.xori %new_idxs_156, %new_idxs_191 : tensor<32x16xi32, #linear> loc(#loc210) + %flip_193 = tt.broadcast %flip_63 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc166) + %flip_194 = tt.reshape %flip_193 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc167) + %y_195 = tt.reshape %ret_189 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc173) + %ileft_196 = tt.broadcast %left_mask_68 : tensor<1x2x1xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc175) + %ileft_197 = arith.muli %y_195, %ileft_196 : tensor<64x2x4xi32, #linear3> loc(#loc175) + %ileft_198 = "tt.reduce"(%ileft_197) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_199 = tt.expand_dims %ileft_198 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc177) + %ileft_200 = tt.broadcast %ileft_199 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc178) + %iright_201 = arith.muli %y_195, %flip_118 : tensor<64x2x4xi32, #linear3> loc(#loc179) + %iright_202 = "tt.reduce"(%iright_201) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_203 = tt.expand_dims %iright_202 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc181) + %iright_204 = tt.broadcast %iright_203 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc182) + %ileft_205 = tt.reshape %ileft_200 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_206 = tt.reshape %iright_204 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_207 = tt.reshape %new_idxs_192 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc185) + %left_idx_208 = arith.muli %y_idx_207, %ileft_196 : tensor<64x2x4xi32, #linear3> loc(#loc187) + %left_idx_209 = "tt.reduce"(%left_idx_208) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_210 = tt.expand_dims %left_idx_209 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc189) + %left_idx_211 = tt.broadcast %left_idx_210 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc190) + %right_idx_212 = arith.muli %y_idx_207, %flip_118 : tensor<64x2x4xi32, #linear3> loc(#loc192) + %right_idx_213 = "tt.reduce"(%right_idx_212) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_214 = tt.expand_dims %right_idx_213 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc194) + %right_idx_215 = tt.broadcast %right_idx_214 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc195) + %left_idx_216 = tt.reshape %left_idx_211 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_217 = tt.reshape %right_idx_215 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_218 = arith.cmpi slt, %ileft_205, %iright_206 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_219 = arith.cmpi eq, %ileft_205, %iright_206 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_220 = arith.cmpi sgt, %left_idx_216, %right_idx_217 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_221 = arith.andi %eq_219, %cond_220 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_222 = arith.ori %cond_218, %cond_221 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_223 = arith.extui %cond_222 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_224 = arith.xori %cond_223, %flip_194 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_225 = arith.cmpi ne, %cond_224, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret_226 = arith.xori %ileft_205, %iright_206 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_227 = arith.select %cond_225, %ret_226, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_228 = arith.xori %ret_189, %ret_227 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_229 = arith.xori %left_idx_216, %right_idx_217 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_230 = arith.select %cond_225, %new_idxs_229, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_231 = arith.xori %new_idxs_192, %new_idxs_230 : tensor<32x16xi32, #linear> loc(#loc210) + %y_232 = tt.reshape %ret_228 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc173) + %ileft_233 = arith.muli %y_232, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc175) + %ileft_234 = "tt.reduce"(%ileft_233) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_235 = tt.expand_dims %ileft_234 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc177) + %ileft_236 = tt.broadcast %ileft_235 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc178) + %iright_237 = arith.muli %y_232, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc179) + %iright_238 = "tt.reduce"(%iright_237) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_239 = tt.expand_dims %iright_238 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc181) + %iright_240 = tt.broadcast %iright_239 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc182) + %ileft_241 = tt.reshape %ileft_236 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_242 = tt.reshape %iright_240 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_243 = tt.reshape %new_idxs_231 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc185) + %left_idx_244 = arith.muli %y_idx_243, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc187) + %left_idx_245 = "tt.reduce"(%left_idx_244) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_246 = tt.expand_dims %left_idx_245 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc189) + %left_idx_247 = tt.broadcast %left_idx_246 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc190) + %right_idx_248 = arith.muli %y_idx_243, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc192) + %right_idx_249 = "tt.reduce"(%right_idx_248) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_250 = tt.expand_dims %right_idx_249 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc194) + %right_idx_251 = tt.broadcast %right_idx_250 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc195) + %left_idx_252 = tt.reshape %left_idx_247 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_253 = tt.reshape %right_idx_251 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_254 = arith.cmpi slt, %ileft_241, %iright_242 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_255 = arith.cmpi eq, %ileft_241, %iright_242 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_256 = arith.cmpi sgt, %left_idx_252, %right_idx_253 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_257 = arith.andi %eq_255, %cond_256 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_258 = arith.ori %cond_254, %cond_257 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_259 = arith.extui %cond_258 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_260 = arith.xori %cond_259, %flip_194 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_261 = arith.cmpi ne, %cond_260, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret_262 = arith.xori %ileft_241, %iright_242 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_263 = arith.select %cond_261, %ret_262, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_264 = arith.xori %ret_228, %ret_263 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_265 = arith.xori %left_idx_252, %right_idx_253 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_266 = arith.select %cond_261, %new_idxs_265, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_267 = arith.xori %new_idxs_231, %new_idxs_266 : tensor<32x16xi32, #linear> loc(#loc210) + %y_268 = tt.reshape %ret_264 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc173) + %ileft_269 = arith.muli %y_268, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc175) + %ileft_270 = "tt.reduce"(%ileft_269) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_271 = tt.expand_dims %ileft_270 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc177) + %ileft_272 = tt.broadcast %ileft_271 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc178) + %iright_273 = arith.muli %y_268, %iright : tensor<256x2x1xi32, #linear1> loc(#loc179) + %iright_274 = "tt.reduce"(%iright_273) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_275 = tt.expand_dims %iright_274 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc181) + %iright_276 = tt.broadcast %iright_275 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc182) + %ileft_277 = tt.reshape %ileft_272 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_278 = tt.reshape %iright_276 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_279 = tt.reshape %new_idxs_267 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc185) + %left_idx_280 = arith.muli %y_idx_279, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc187) + %left_idx_281 = "tt.reduce"(%left_idx_280) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_282 = tt.expand_dims %left_idx_281 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc189) + %left_idx_283 = tt.broadcast %left_idx_282 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc190) + %right_idx_284 = arith.muli %y_idx_279, %iright : tensor<256x2x1xi32, #linear1> loc(#loc192) + %right_idx_285 = "tt.reduce"(%right_idx_284) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_286 = tt.expand_dims %right_idx_285 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc194) + %right_idx_287 = tt.broadcast %right_idx_286 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc195) + %left_idx_288 = tt.reshape %left_idx_283 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_289 = tt.reshape %right_idx_287 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_290 = arith.cmpi slt, %ileft_277, %iright_278 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_291 = arith.cmpi eq, %ileft_277, %iright_278 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_292 = arith.cmpi sgt, %left_idx_288, %right_idx_289 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_293 = arith.andi %eq_291, %cond_292 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_294 = arith.ori %cond_290, %cond_293 : tensor<32x16xi1, #linear> loc(#loc202) + %cond_295 = arith.extui %cond_294 : tensor<32x16xi1, #linear> to tensor<32x16xi32, #linear> loc(#loc203) + %cond_296 = arith.xori %cond_295, %flip_194 : tensor<32x16xi32, #linear> loc(#loc203) + %cond_297 = arith.cmpi ne, %cond_296, %cst : tensor<32x16xi32, #linear> loc(#loc204) + %ret_298 = arith.xori %ileft_277, %iright_278 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_299 = arith.select %cond_297, %ret_298, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_300 = arith.xori %ret_264, %ret_299 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_301 = arith.xori %left_idx_288, %right_idx_289 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_302 = arith.select %cond_297, %new_idxs_301, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_303 = arith.xori %new_idxs_267, %new_idxs_302 : tensor<32x16xi32, #linear> loc(#loc210) + %y_304 = tt.reshape %ret_300 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc173) + %ileft_305 = tt.broadcast %left_mask_69 : tensor<1x2x1xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc175) + %ileft_306 = arith.muli %y_304, %ileft_305 : tensor<32x2x8xi32, #linear4> loc(#loc175) + %ileft_307 = "tt.reduce"(%ileft_306) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc212) + %ileft_308 = tt.expand_dims %ileft_307 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc177) + %ileft_309 = tt.broadcast %ileft_308 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc178) + %iright_310 = arith.muli %y_304, %flip_193 : tensor<32x2x8xi32, #linear4> loc(#loc179) + %iright_311 = "tt.reduce"(%iright_310) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc214) + %iright_312 = tt.expand_dims %iright_311 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc181) + %iright_313 = tt.broadcast %iright_312 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc182) + %ileft_314 = tt.reshape %ileft_309 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_315 = tt.reshape %iright_313 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_316 = tt.reshape %new_idxs_303 : tensor<32x16xi32, #linear> -> tensor<32x2x8xi32, #linear4> loc(#loc185) + %left_idx_317 = arith.muli %y_idx_316, %ileft_305 : tensor<32x2x8xi32, #linear4> loc(#loc187) + %left_idx_318 = "tt.reduce"(%left_idx_317) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc217) + %left_idx_319 = tt.expand_dims %left_idx_318 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc189) + %left_idx_320 = tt.broadcast %left_idx_319 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc190) + %right_idx_321 = arith.muli %y_idx_316, %flip_193 : tensor<32x2x8xi32, #linear4> loc(#loc192) + %right_idx_322 = "tt.reduce"(%right_idx_321) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<32x2x8xi32, #linear4>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc220) + %right_idx_323 = tt.expand_dims %right_idx_322 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<32x1x8xi32, #linear4> loc(#loc194) + %right_idx_324 = tt.broadcast %right_idx_323 : tensor<32x1x8xi32, #linear4> -> tensor<32x2x8xi32, #linear4> loc(#loc195) + %left_idx_325 = tt.reshape %left_idx_320 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_326 = tt.reshape %right_idx_324 : tensor<32x2x8xi32, #linear4> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_327 = arith.cmpi slt, %ileft_314, %iright_315 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_328 = arith.cmpi eq, %ileft_314, %iright_315 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_329 = arith.cmpi sgt, %left_idx_325, %right_idx_326 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_330 = arith.andi %eq_328, %cond_329 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_331 = arith.ori %cond_327, %cond_330 : tensor<32x16xi1, #linear> loc(#loc202) + %ret_332 = arith.xori %ileft_314, %iright_315 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_333 = arith.select %cond_331, %ret_332, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_334 = arith.xori %ret_300, %ret_333 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_335 = arith.xori %left_idx_325, %right_idx_326 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_336 = arith.select %cond_331, %new_idxs_335, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_337 = arith.xori %new_idxs_303, %new_idxs_336 : tensor<32x16xi32, #linear> loc(#loc210) + %y_338 = tt.reshape %ret_334 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc173) + %ileft_339 = arith.muli %y_338, %ileft_196 : tensor<64x2x4xi32, #linear3> loc(#loc175) + %ileft_340 = "tt.reduce"(%ileft_339) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_341 = tt.expand_dims %ileft_340 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc177) + %ileft_342 = tt.broadcast %ileft_341 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc178) + %iright_343 = arith.muli %y_338, %flip_118 : tensor<64x2x4xi32, #linear3> loc(#loc179) + %iright_344 = "tt.reduce"(%iright_343) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_345 = tt.expand_dims %iright_344 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc181) + %iright_346 = tt.broadcast %iright_345 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc182) + %ileft_347 = tt.reshape %ileft_342 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_348 = tt.reshape %iright_346 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_349 = tt.reshape %new_idxs_337 : tensor<32x16xi32, #linear> -> tensor<64x2x4xi32, #linear3> loc(#loc185) + %left_idx_350 = arith.muli %y_idx_349, %ileft_196 : tensor<64x2x4xi32, #linear3> loc(#loc187) + %left_idx_351 = "tt.reduce"(%left_idx_350) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_352 = tt.expand_dims %left_idx_351 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc189) + %left_idx_353 = tt.broadcast %left_idx_352 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc190) + %right_idx_354 = arith.muli %y_idx_349, %flip_118 : tensor<64x2x4xi32, #linear3> loc(#loc192) + %right_idx_355 = "tt.reduce"(%right_idx_354) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x4xi32, #linear3>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_356 = tt.expand_dims %right_idx_355 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<64x1x4xi32, #linear3> loc(#loc194) + %right_idx_357 = tt.broadcast %right_idx_356 : tensor<64x1x4xi32, #linear3> -> tensor<64x2x4xi32, #linear3> loc(#loc195) + %left_idx_358 = tt.reshape %left_idx_353 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_359 = tt.reshape %right_idx_357 : tensor<64x2x4xi32, #linear3> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_360 = arith.cmpi slt, %ileft_347, %iright_348 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_361 = arith.cmpi eq, %ileft_347, %iright_348 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_362 = arith.cmpi sgt, %left_idx_358, %right_idx_359 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_363 = arith.andi %eq_361, %cond_362 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_364 = arith.ori %cond_360, %cond_363 : tensor<32x16xi1, #linear> loc(#loc202) + %ret_365 = arith.xori %ileft_347, %iright_348 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_366 = arith.select %cond_364, %ret_365, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_367 = arith.xori %ret_334, %ret_366 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_368 = arith.xori %left_idx_358, %right_idx_359 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_369 = arith.select %cond_364, %new_idxs_368, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_370 = arith.xori %new_idxs_337, %new_idxs_369 : tensor<32x16xi32, #linear> loc(#loc210) + %y_371 = tt.reshape %ret_367 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc173) + %ileft_372 = arith.muli %y_371, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc175) + %ileft_373 = "tt.reduce"(%ileft_372) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_374 = tt.expand_dims %ileft_373 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc177) + %ileft_375 = tt.broadcast %ileft_374 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc178) + %iright_376 = arith.muli %y_371, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc179) + %iright_377 = "tt.reduce"(%iright_376) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_378 = tt.expand_dims %iright_377 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc181) + %iright_379 = tt.broadcast %iright_378 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc182) + %ileft_380 = tt.reshape %ileft_375 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_381 = tt.reshape %iright_379 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_382 = tt.reshape %new_idxs_370 : tensor<32x16xi32, #linear> -> tensor<128x2x2xi32, #linear2> loc(#loc185) + %left_idx_383 = arith.muli %y_idx_382, %ileft_121 : tensor<128x2x2xi32, #linear2> loc(#loc187) + %left_idx_384 = "tt.reduce"(%left_idx_383) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_385 = tt.expand_dims %left_idx_384 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc189) + %left_idx_386 = tt.broadcast %left_idx_385 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc190) + %right_idx_387 = arith.muli %y_idx_382, %flip_64 : tensor<128x2x2xi32, #linear2> loc(#loc192) + %right_idx_388 = "tt.reduce"(%right_idx_387) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32, #linear2>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_389 = tt.expand_dims %right_idx_388 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<128x1x2xi32, #linear2> loc(#loc194) + %right_idx_390 = tt.broadcast %right_idx_389 : tensor<128x1x2xi32, #linear2> -> tensor<128x2x2xi32, #linear2> loc(#loc195) + %left_idx_391 = tt.reshape %left_idx_386 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_392 = tt.reshape %right_idx_390 : tensor<128x2x2xi32, #linear2> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_393 = arith.cmpi slt, %ileft_380, %iright_381 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_394 = arith.cmpi eq, %ileft_380, %iright_381 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_395 = arith.cmpi sgt, %left_idx_391, %right_idx_392 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_396 = arith.andi %eq_394, %cond_395 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_397 = arith.ori %cond_393, %cond_396 : tensor<32x16xi1, #linear> loc(#loc202) + %ret_398 = arith.xori %ileft_380, %iright_381 : tensor<32x16xi32, #linear> loc(#loc205) + %ret_399 = arith.select %cond_397, %ret_398, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc206) + %ret_400 = arith.xori %ret_367, %ret_399 : tensor<32x16xi32, #linear> loc(#loc207) + %new_idxs_401 = arith.xori %left_idx_391, %right_idx_392 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_402 = arith.select %cond_397, %new_idxs_401, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_403 = arith.xori %new_idxs_370, %new_idxs_402 : tensor<32x16xi32, #linear> loc(#loc210) + %y_404 = tt.reshape %ret_400 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc173) + %ileft_405 = arith.muli %y_404, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc175) + %ileft_406 = "tt.reduce"(%ileft_405) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_407 = tt.expand_dims %ileft_406 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc177) + %ileft_408 = tt.broadcast %ileft_407 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc178) + %iright_409 = arith.muli %y_404, %iright : tensor<256x2x1xi32, #linear1> loc(#loc179) + %iright_410 = "tt.reduce"(%iright_409) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_411 = tt.expand_dims %iright_410 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc181) + %iright_412 = tt.broadcast %iright_411 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc182) + %ileft_413 = tt.reshape %ileft_408 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc183) + %iright_414 = tt.reshape %iright_412 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc184) + %y_idx_415 = tt.reshape %new_idxs_403 : tensor<32x16xi32, #linear> -> tensor<256x2x1xi32, #linear1> loc(#loc185) + %left_idx_416 = arith.muli %y_idx_415, %ileft : tensor<256x2x1xi32, #linear1> loc(#loc187) + %left_idx_417 = "tt.reduce"(%left_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_418 = tt.expand_dims %left_idx_417 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc189) + %left_idx_419 = tt.broadcast %left_idx_418 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc190) + %right_idx_420 = arith.muli %y_idx_415, %iright : tensor<256x2x1xi32, #linear1> loc(#loc192) + %right_idx_421 = "tt.reduce"(%right_idx_420) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32, #linear1>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_422 = tt.expand_dims %right_idx_421 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<256x1x1xi32, #linear1> loc(#loc194) + %right_idx_423 = tt.broadcast %right_idx_422 : tensor<256x1x1xi32, #linear1> -> tensor<256x2x1xi32, #linear1> loc(#loc195) + %left_idx_424 = tt.reshape %left_idx_419 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc196) + %right_idx_425 = tt.reshape %right_idx_423 : tensor<256x2x1xi32, #linear1> -> tensor<32x16xi32, #linear> loc(#loc197) + %cond_426 = arith.cmpi slt, %ileft_413, %iright_414 : tensor<32x16xi32, #linear> loc(#loc198) + %eq_427 = arith.cmpi eq, %ileft_413, %iright_414 : tensor<32x16xi32, #linear> loc(#loc199) + %cond_428 = arith.cmpi sgt, %left_idx_424, %right_idx_425 : tensor<32x16xi32, #linear> loc(#loc200) + %cond_429 = arith.andi %eq_427, %cond_428 : tensor<32x16xi1, #linear> loc(#loc201) + %cond_430 = arith.ori %cond_426, %cond_429 : tensor<32x16xi1, #linear> loc(#loc202) + %new_idxs_431 = arith.xori %left_idx_424, %right_idx_425 : tensor<32x16xi32, #linear> loc(#loc208) + %new_idxs_432 = arith.select %cond_430, %new_idxs_431, %cst : tensor<32x16xi1, #linear>, tensor<32x16xi32, #linear> loc(#loc209) + %new_idxs_433 = arith.xori %new_idxs_403, %new_idxs_432 : tensor<32x16xi32, #linear> loc(#loc210) + %tmp7 = arith.extsi %tmp0_52 : tensor<32x16xi32, #blocked> to tensor<32x16xi64, #blocked> loc(#loc159) + %tmp10 = arith.select %tmp0_50, %tmp7, %cst_0 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc160) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_435: i64 loc(callsite(#loc1 at #loc161)), %tmp11_436: i64 loc(callsite(#loc1 at #loc161))): + %tmp11_437 = arith.addi %tmp11_435, %tmp11_436 : i64 loc(#loc211) + tt.reduce.return %tmp11_437 : i64 loc(#loc171) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc171) + %tmp11_434 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc162) + %tmp14 = arith.trunci %tmp11_434 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc163) + %0 = arith.muli %x0_28, %cst_2 : tensor<32x1xi64, #blocked1> loc(#loc74) + %1 = tt.broadcast %0 : tensor<32x1xi64, #blocked1> -> tensor<32x16xi64, #blocked1> loc(#loc75) + %2 = arith.addi %tmp0_32, %1 : tensor<32x16xi64, #blocked1> loc(#loc75) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc76) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc77) + %5 = arith.extui %4 : i1 to i64 loc(#loc78) + %6 = arith.muli %ks0, %5 : i64 loc(#loc78) + %7 = arith.extui %3 : i1 to i64 loc(#loc164) + %8 = arith.addi %7, %6 : i64 loc(#loc79) + %9 = tt.splat %8 : i64 -> tensor<32x1xi64, #blocked1> loc(#loc81) + %10 = tt.splat %8 : i64 -> tensor<32x1xi64, #blocked> loc(#loc81) + %11 = arith.muli %tmp0_36, %9 : tensor<32x1xi64, #blocked1> loc(#loc81) + %12 = tt.broadcast %11 : tensor<32x1xi64, #blocked1> -> tensor<32x16xi64, #blocked1> loc(#loc82) + %13 = arith.addi %2, %12 : tensor<32x16xi64, #blocked1> loc(#loc82) + %14 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked1> loc(#loc83) + %15 = tt.addptr %14, %13 : tensor<32x16x!tt.ptr, #blocked1>, tensor<32x16xi64, #blocked1> loc(#loc83) + %16 = ttg.convert_layout %new_idxs_433 : tensor<32x16xi32, #linear> -> tensor<32x16xi32, #blocked1> loc(#loc84) + tt.store %15, %16, %tmp0_51 : tensor<32x16x!tt.ptr, #blocked1> loc(#loc84) + %17 = arith.muli %x1, %10 : tensor<32x1xi64, #blocked> loc(#loc85) + %18 = arith.addi %x0_27, %17 : tensor<32x1xi64, #blocked> loc(#loc86) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> loc(#loc87) + %20 = tt.addptr %19, %18 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi64, #blocked> loc(#loc87) + tt.store %20, %tmp14, %xmask_17 : tensor<32x1x!tt.ptr, #blocked> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc96 = loc("xoffset"(#loc2)) +#loc97 = loc("xoffset"(#loc3)) +#loc98 = loc("xindex"(#loc4)) +#loc99 = loc("xindex"(#loc5)) +#loc100 = loc("xmask"(#loc6)) +#loc101 = loc("r0_index"(#loc7)) +#loc102 = loc("x0"(#loc8)) +#loc103 = loc("x1"(#loc9)) +#loc104 = loc("tmp0"(#loc10)) +#loc105 = loc("tmp0"(#loc11)) +#loc106 = loc("tmp0"(#loc12)) +#loc107 = loc("tmp0"(#loc13)) +#loc108 = loc("tmp0"(#loc14)) +#loc109 = loc("tmp0"(#loc15)) +#loc110 = loc("tmp0"(#loc16)) +#loc111 = loc("tmp0"(#loc17)) +#loc112 = loc("tmp0"(#loc18)) +#loc113 = loc("tmp0"(#loc19)) +#loc114 = loc("tmp2"(#loc20)) +#loc115 = loc("tmp4"(#loc21)) +#loc116 = loc("flip"(#loc22)) +#loc118 = loc("flip"(#loc25)) +#loc119 = loc("flip"(#loc26)) +#loc120 = loc("y"(#loc27)) +#loc121 = loc("left_mask"(#loc29)) +#loc122 = loc("ileft"(#loc30)) +#loc124 = loc("ileft"(#loc34)) +#loc125 = loc("ileft"(#loc35)) +#loc126 = loc("iright"(#loc36)) +#loc128 = loc("iright"(#loc38)) +#loc129 = loc("iright"(#loc39)) +#loc130 = loc("ileft"(#loc40)) +#loc131 = loc("iright"(#loc41)) +#loc132 = loc("y_idx"(#loc42)) +#loc133 = loc("left_idx"(#loc43)) +#loc134 = loc("left_idx"(#loc44)) +#loc135 = loc("input"(#loc45)) +#loc137 = loc("left_idx"(#loc47)) +#loc138 = loc("left_idx"(#loc48)) +#loc139 = loc("right_idx"(#loc49)) +#loc140 = loc("right_idx"(#loc50)) +#loc142 = loc("right_idx"(#loc52)) +#loc143 = loc("right_idx"(#loc53)) +#loc144 = loc("left_idx"(#loc54)) +#loc145 = loc("right_idx"(#loc55)) +#loc146 = loc("cond"(#loc56)) +#loc147 = loc("eq"(#loc57)) +#loc148 = loc("cond"(#loc58)) +#loc149 = loc("cond"(#loc59)) +#loc150 = loc("cond"(#loc60)) +#loc151 = loc("cond"(#loc61)) +#loc152 = loc("cond"(#loc62)) +#loc153 = loc("ret"(#loc63)) +#loc154 = loc("ret"(#loc64)) +#loc155 = loc("ret"(#loc65)) +#loc156 = loc("new_idxs"(#loc66)) +#loc157 = loc("new_idxs"(#loc67)) +#loc158 = loc("new_idxs"(#loc68)) +#loc159 = loc("tmp7"(#loc69)) +#loc160 = loc("tmp10"(#loc70)) +#loc162 = loc("tmp11"(#loc72)) +#loc163 = loc("tmp14"(#loc73)) +#loc164 = loc(fused[#loc79, #loc80]) +#loc165 = loc(callsite(#loc116 at #loc117)) +#loc166 = loc(callsite(#loc118 at #loc117)) +#loc167 = loc(callsite(#loc119 at #loc117)) +#loc169 = loc("cond"(#loc146)) +#loc170 = loc("eq"(#loc147)) +#loc171 = loc(callsite(#loc31 at #loc161)) +#loc173 = loc(callsite(#loc120 at #loc168)) +#loc174 = loc(callsite(#loc121 at #loc168)) +#loc175 = loc(callsite(#loc122 at #loc168)) +#loc177 = loc(callsite(#loc124 at #loc168)) +#loc178 = loc(callsite(#loc125 at #loc168)) +#loc179 = loc(callsite(#loc126 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc168)) +#loc182 = loc(callsite(#loc129 at #loc168)) +#loc183 = loc(callsite(#loc130 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc168)) +#loc185 = loc(callsite(#loc132 at #loc168)) +#loc186 = loc(callsite(#loc133 at #loc168)) +#loc187 = loc(callsite(#loc134 at #loc168)) +#loc189 = loc(callsite(#loc137 at #loc168)) +#loc190 = loc(callsite(#loc138 at #loc168)) +#loc191 = loc(callsite(#loc139 at #loc168)) +#loc192 = loc(callsite(#loc140 at #loc168)) +#loc194 = loc(callsite(#loc142 at #loc168)) +#loc195 = loc(callsite(#loc143 at #loc168)) +#loc196 = loc(callsite(#loc144 at #loc168)) +#loc197 = loc(callsite(#loc145 at #loc168)) +#loc198 = loc(callsite(#loc169 at #loc168)) +#loc199 = loc(callsite(#loc170 at #loc168)) +#loc200 = loc(callsite(#loc148 at #loc168)) +#loc201 = loc(callsite(#loc149 at #loc168)) +#loc202 = loc(callsite(#loc150 at #loc168)) +#loc203 = loc(callsite(#loc151 at #loc168)) +#loc204 = loc(callsite(#loc152 at #loc168)) +#loc205 = loc(callsite(#loc153 at #loc168)) +#loc206 = loc(callsite(#loc154 at #loc168)) +#loc207 = loc(callsite(#loc155 at #loc168)) +#loc208 = loc(callsite(#loc156 at #loc168)) +#loc209 = loc(callsite(#loc157 at #loc168)) +#loc210 = loc(callsite(#loc158 at #loc168)) +#loc211 = loc(callsite(#loc33 at #loc171)) +#loc212 = loc(callsite(#loc31 at #loc176)) +#loc214 = loc(callsite(#loc31 at #loc180)) +#loc216 = loc(callsite(#loc135 at #loc188)) +#loc217 = loc(callsite(#loc31 at #loc188)) +#loc219 = loc(callsite(#loc135 at #loc193)) +#loc220 = loc(callsite(#loc31 at #loc193)) +#loc222 = loc(callsite(#loc33 at #loc212)) +#loc223 = loc(callsite(#loc33 at #loc214)) +#loc224 = loc(callsite(#loc33 at #loc217)) +#loc225 = loc(callsite(#loc33 at #loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..bcbab2b7ec251dba2a180c29d06efe34248c2cc5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ST7TDJ5OOH5WPE5X3A4QLUEB7TVON4KLHN6SWHDQ5K7XNA5OS7OQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,840 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc93 = loc("in_ptr0"(#loc)) +#loc94 = loc("out_ptr2"(#loc)) +#loc95 = loc("out_ptr3"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("xnumel"(#loc)) +#loc98 = loc("r0_numel"(#loc)) +#loc124 = loc(callsite(#loc27 at #loc2)) +#loc131 = loc("ileft"(#loc36)) +#loc135 = loc("iright"(#loc41)) +#loc144 = loc("left_idx"(#loc50)) +#loc149 = loc("right_idx"(#loc55)) +#loc168 = loc("tmp11"(#loc74)) +#loc176 = loc(callsite(#loc32 at #loc124)) +#loc180 = loc(callsite(#loc1 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc176)) +#loc188 = loc(callsite(#loc135 at #loc176)) +#loc196 = loc(callsite(#loc144 at #loc176)) +#loc201 = loc(callsite(#loc149 at #loc176)) +#loc221 = loc(callsite(#loc1 at #loc184)) +#loc223 = loc(callsite(#loc1 at #loc188)) +#loc226 = loc(callsite(#loc1 at #loc196)) +#loc229 = loc(callsite(#loc1 at #loc201)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc99) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc100) + %cst_1 = arith.constant dense<16> : tensor<32x1xi64> loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_2 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc103) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc104) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<32x1xi32> loc(#loc105) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<32x1xi32> loc(#loc105) + %xmask = tt.splat %xnumel : i32 -> tensor<32x1xi32> loc(#loc106) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<32x1xi32> loc(#loc106) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc107) + %r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc108) + %x0 = arith.extsi %xindex_5 : tensor<32x1xi32> to tensor<32x1xi64> loc(#loc109) + %x0_8 = tt.splat %ks0 : i64 -> tensor<32x1xi64> loc(#loc109) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc109) + %x1 = arith.divsi %x0, %x0_8 : tensor<32x1xi64> loc(#loc110) + %tmp0 = arith.extsi %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc111) + %tmp0_10 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc111) + %tmp0_11 = tt.broadcast %x0_9 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc111) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<32x16xi64> loc(#loc111) + %tmp0_13 = arith.muli %x1, %cst_1 : tensor<32x1xi64> loc(#loc112) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc113) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<32x16xi64> loc(#loc113) + %tmp0_16 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc114) + %tmp0_17 = arith.muli %tmp0_16, %tmp0 : tensor<1x16xi64> loc(#loc114) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<1x16xi64> -> tensor<32x16xi64> loc(#loc115) + %tmp0_19 = arith.addi %tmp0_15, %tmp0_18 : tensor<32x16xi64> loc(#loc115) + %tmp0_20 = arith.muli %ks0, %c16_i64 : i64 loc(#loc116) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<32x1xi64> loc(#loc117) + %tmp0_22 = arith.muli %tmp0_21, %x1 : tensor<32x1xi64> loc(#loc117) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc118) + %tmp0_24 = arith.addi %tmp0_19, %tmp0_23 : tensor<32x16xi64> loc(#loc118) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc119) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<32x16x!tt.ptr>, tensor<32x16xi64> loc(#loc119) + %tmp0_27 = tt.broadcast %xmask_6 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc120) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<32x16x!tt.ptr> loc(#loc120) + %tmp2 = arith.trunci %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc121) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc122) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc172) + %flip_29 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc173) + %flip_30 = tt.expand_dims %flip_29 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc173) + %flip_31 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc174) + %flip_32 = tt.reshape %flip_31 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc175) + %y = tt.reshape %tmp0_28 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc181) + %left_mask = arith.subi %cst, %flip_30 : tensor<1x2x1xi32> loc(#loc182) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc183) + %ileft_33 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc183) + %ileft_34 = "tt.reduce"(%ileft_33) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc220) + %ileft_35 = tt.expand_dims %ileft_34 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc185) + %ileft_36 = tt.broadcast %ileft_35 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc186) + %iright = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc187) + %iright_37 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc187) + %iright_38 = "tt.reduce"(%iright_37) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc222) + %iright_39 = tt.expand_dims %iright_38 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc189) + %iright_40 = tt.broadcast %iright_39 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc190) + %ileft_41 = tt.reshape %ileft_36 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_42 = tt.reshape %iright_40 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx = tt.reshape %tmp4 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc193) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc194) + %left_idx_43 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc195) + %left_idx_44 = arith.muli %y_idx, %left_idx_43 : tensor<256x2x1xi16> loc(#loc195) + %input = arith.extsi %left_idx_44 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc224) + %left_idx_45 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc225) + %left_idx_46 = tt.expand_dims %left_idx_45 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc197) + %left_idx_47 = tt.broadcast %left_idx_46 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc198) + %right_idx = arith.trunci %flip_30 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc199) + %right_idx_48 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc200) + %right_idx_49 = arith.muli %y_idx, %right_idx_48 : tensor<256x2x1xi16> loc(#loc200) + %input_50 = arith.extsi %right_idx_49 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc227) + %right_idx_51 = "tt.reduce"(%input_50) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc228) + %right_idx_52 = tt.expand_dims %right_idx_51 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc202) + %right_idx_53 = tt.broadcast %right_idx_52 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc203) + %left_idx_54 = tt.reshape %left_idx_47 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_55 = tt.reshape %right_idx_53 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc205) + %cond = arith.cmpi slt, %ileft_41, %iright_42 : tensor<32x16xi32> loc(#loc206) + %eq = arith.cmpi eq, %ileft_41, %iright_42 : tensor<32x16xi32> loc(#loc207) + %cond_56 = arith.cmpi sgt, %left_idx_54, %right_idx_55 : tensor<32x16xi32> loc(#loc208) + %cond_57 = arith.andi %eq, %cond_56 : tensor<32x16xi1> loc(#loc209) + %cond_58 = arith.ori %cond, %cond_57 : tensor<32x16xi1> loc(#loc210) + %cond_59 = arith.extui %cond_58 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_60 = arith.xori %cond_59, %flip_32 : tensor<32x16xi32> loc(#loc211) + %cond_61 = arith.cmpi ne, %cond_60, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret = arith.xori %ileft_41, %iright_42 : tensor<32x16xi32> loc(#loc213) + %ret_62 = arith.select %cond_61, %ret, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_63 = arith.xori %tmp0_28, %ret_62 : tensor<32x16xi32> loc(#loc215) + %new_idxs = arith.xori %left_idx_54, %right_idx_55 : tensor<32x16xi32> loc(#loc216) + %new_idxs_64 = arith.select %cond_61, %new_idxs, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_65 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc218) + %new_idxs_66 = tt.broadcast %new_idxs_65 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc218) + %new_idxs_67 = arith.xori %new_idxs_66, %new_idxs_64 : tensor<32x16xi32> loc(#loc218) + %flip_68 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc174) + %flip_69 = tt.reshape %flip_68 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc175) + %y_70 = tt.reshape %ret_63 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc181) + %ileft_71 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc183) + %ileft_72 = arith.muli %y_70, %ileft_71 : tensor<128x2x2xi32> loc(#loc183) + %ileft_73 = "tt.reduce"(%ileft_72) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc220) + %ileft_74 = tt.expand_dims %ileft_73 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc185) + %ileft_75 = tt.broadcast %ileft_74 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc186) + %iright_76 = arith.muli %y_70, %flip_31 : tensor<128x2x2xi32> loc(#loc187) + %iright_77 = "tt.reduce"(%iright_76) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc222) + %iright_78 = tt.expand_dims %iright_77 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc189) + %iright_79 = tt.broadcast %iright_78 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc190) + %ileft_80 = tt.reshape %ileft_75 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_81 = tt.reshape %iright_79 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_82 = tt.reshape %new_idxs_67 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc193) + %left_idx_83 = arith.muli %y_idx_82, %ileft_71 : tensor<128x2x2xi32> loc(#loc195) + %left_idx_84 = "tt.reduce"(%left_idx_83) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc225) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc197) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc198) + %right_idx_87 = arith.muli %y_idx_82, %flip_31 : tensor<128x2x2xi32> loc(#loc200) + %right_idx_88 = "tt.reduce"(%right_idx_87) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc228) + %right_idx_89 = tt.expand_dims %right_idx_88 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc202) + %right_idx_90 = tt.broadcast %right_idx_89 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc203) + %left_idx_91 = tt.reshape %left_idx_86 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_92 = tt.reshape %right_idx_90 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_93 = arith.cmpi slt, %ileft_80, %iright_81 : tensor<32x16xi32> loc(#loc206) + %eq_94 = arith.cmpi eq, %ileft_80, %iright_81 : tensor<32x16xi32> loc(#loc207) + %cond_95 = arith.cmpi sgt, %left_idx_91, %right_idx_92 : tensor<32x16xi32> loc(#loc208) + %cond_96 = arith.andi %eq_94, %cond_95 : tensor<32x16xi1> loc(#loc209) + %cond_97 = arith.ori %cond_93, %cond_96 : tensor<32x16xi1> loc(#loc210) + %cond_98 = arith.extui %cond_97 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_99 = arith.xori %cond_98, %flip_69 : tensor<32x16xi32> loc(#loc211) + %cond_100 = arith.cmpi ne, %cond_99, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret_101 = arith.xori %ileft_80, %iright_81 : tensor<32x16xi32> loc(#loc213) + %ret_102 = arith.select %cond_100, %ret_101, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_103 = arith.xori %ret_63, %ret_102 : tensor<32x16xi32> loc(#loc215) + %new_idxs_104 = arith.xori %left_idx_91, %right_idx_92 : tensor<32x16xi32> loc(#loc216) + %new_idxs_105 = arith.select %cond_100, %new_idxs_104, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_106 = arith.xori %new_idxs_67, %new_idxs_105 : tensor<32x16xi32> loc(#loc218) + %y_107 = tt.reshape %ret_103 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc181) + %ileft_108 = arith.muli %y_107, %ileft : tensor<256x2x1xi32> loc(#loc183) + %ileft_109 = "tt.reduce"(%ileft_108) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc220) + %ileft_110 = tt.expand_dims %ileft_109 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc185) + %ileft_111 = tt.broadcast %ileft_110 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc186) + %iright_112 = arith.muli %y_107, %iright : tensor<256x2x1xi32> loc(#loc187) + %iright_113 = "tt.reduce"(%iright_112) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc222) + %iright_114 = tt.expand_dims %iright_113 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc189) + %iright_115 = tt.broadcast %iright_114 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc190) + %ileft_116 = tt.reshape %ileft_111 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_117 = tt.reshape %iright_115 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_118 = tt.reshape %new_idxs_106 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc193) + %left_idx_119 = arith.muli %y_idx_118, %ileft : tensor<256x2x1xi32> loc(#loc195) + %left_idx_120 = "tt.reduce"(%left_idx_119) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc225) + %left_idx_121 = tt.expand_dims %left_idx_120 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc197) + %left_idx_122 = tt.broadcast %left_idx_121 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc198) + %right_idx_123 = arith.muli %y_idx_118, %iright : tensor<256x2x1xi32> loc(#loc200) + %right_idx_124 = "tt.reduce"(%right_idx_123) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc228) + %right_idx_125 = tt.expand_dims %right_idx_124 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc202) + %right_idx_126 = tt.broadcast %right_idx_125 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc203) + %left_idx_127 = tt.reshape %left_idx_122 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_128 = tt.reshape %right_idx_126 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_129 = arith.cmpi slt, %ileft_116, %iright_117 : tensor<32x16xi32> loc(#loc206) + %eq_130 = arith.cmpi eq, %ileft_116, %iright_117 : tensor<32x16xi32> loc(#loc207) + %cond_131 = arith.cmpi sgt, %left_idx_127, %right_idx_128 : tensor<32x16xi32> loc(#loc208) + %cond_132 = arith.andi %eq_130, %cond_131 : tensor<32x16xi1> loc(#loc209) + %cond_133 = arith.ori %cond_129, %cond_132 : tensor<32x16xi1> loc(#loc210) + %cond_134 = arith.extui %cond_133 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_135 = arith.xori %cond_134, %flip_69 : tensor<32x16xi32> loc(#loc211) + %cond_136 = arith.cmpi ne, %cond_135, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret_137 = arith.xori %ileft_116, %iright_117 : tensor<32x16xi32> loc(#loc213) + %ret_138 = arith.select %cond_136, %ret_137, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_139 = arith.xori %ret_103, %ret_138 : tensor<32x16xi32> loc(#loc215) + %new_idxs_140 = arith.xori %left_idx_127, %right_idx_128 : tensor<32x16xi32> loc(#loc216) + %new_idxs_141 = arith.select %cond_136, %new_idxs_140, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_142 = arith.xori %new_idxs_106, %new_idxs_141 : tensor<32x16xi32> loc(#loc218) + %flip_143 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc174) + %flip_144 = tt.reshape %flip_143 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc175) + %y_145 = tt.reshape %ret_139 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc181) + %ileft_146 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc183) + %ileft_147 = arith.muli %y_145, %ileft_146 : tensor<64x2x4xi32> loc(#loc183) + %ileft_148 = "tt.reduce"(%ileft_147) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc220) + %ileft_149 = tt.expand_dims %ileft_148 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc185) + %ileft_150 = tt.broadcast %ileft_149 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc186) + %iright_151 = arith.muli %y_145, %flip_68 : tensor<64x2x4xi32> loc(#loc187) + %iright_152 = "tt.reduce"(%iright_151) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc222) + %iright_153 = tt.expand_dims %iright_152 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc189) + %iright_154 = tt.broadcast %iright_153 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc190) + %ileft_155 = tt.reshape %ileft_150 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_156 = tt.reshape %iright_154 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_157 = tt.reshape %new_idxs_142 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc193) + %left_idx_158 = arith.muli %y_idx_157, %ileft_146 : tensor<64x2x4xi32> loc(#loc195) + %left_idx_159 = "tt.reduce"(%left_idx_158) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc225) + %left_idx_160 = tt.expand_dims %left_idx_159 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc197) + %left_idx_161 = tt.broadcast %left_idx_160 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc198) + %right_idx_162 = arith.muli %y_idx_157, %flip_68 : tensor<64x2x4xi32> loc(#loc200) + %right_idx_163 = "tt.reduce"(%right_idx_162) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc228) + %right_idx_164 = tt.expand_dims %right_idx_163 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc202) + %right_idx_165 = tt.broadcast %right_idx_164 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc203) + %left_idx_166 = tt.reshape %left_idx_161 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_167 = tt.reshape %right_idx_165 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_168 = arith.cmpi slt, %ileft_155, %iright_156 : tensor<32x16xi32> loc(#loc206) + %eq_169 = arith.cmpi eq, %ileft_155, %iright_156 : tensor<32x16xi32> loc(#loc207) + %cond_170 = arith.cmpi sgt, %left_idx_166, %right_idx_167 : tensor<32x16xi32> loc(#loc208) + %cond_171 = arith.andi %eq_169, %cond_170 : tensor<32x16xi1> loc(#loc209) + %cond_172 = arith.ori %cond_168, %cond_171 : tensor<32x16xi1> loc(#loc210) + %cond_173 = arith.extui %cond_172 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_174 = arith.xori %cond_173, %flip_144 : tensor<32x16xi32> loc(#loc211) + %cond_175 = arith.cmpi ne, %cond_174, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret_176 = arith.xori %ileft_155, %iright_156 : tensor<32x16xi32> loc(#loc213) + %ret_177 = arith.select %cond_175, %ret_176, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_178 = arith.xori %ret_139, %ret_177 : tensor<32x16xi32> loc(#loc215) + %new_idxs_179 = arith.xori %left_idx_166, %right_idx_167 : tensor<32x16xi32> loc(#loc216) + %new_idxs_180 = arith.select %cond_175, %new_idxs_179, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_181 = arith.xori %new_idxs_142, %new_idxs_180 : tensor<32x16xi32> loc(#loc218) + %y_182 = tt.reshape %ret_178 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc181) + %ileft_183 = arith.muli %y_182, %ileft_71 : tensor<128x2x2xi32> loc(#loc183) + %ileft_184 = "tt.reduce"(%ileft_183) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc220) + %ileft_185 = tt.expand_dims %ileft_184 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc185) + %ileft_186 = tt.broadcast %ileft_185 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc186) + %iright_187 = arith.muli %y_182, %flip_31 : tensor<128x2x2xi32> loc(#loc187) + %iright_188 = "tt.reduce"(%iright_187) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc222) + %iright_189 = tt.expand_dims %iright_188 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc189) + %iright_190 = tt.broadcast %iright_189 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc190) + %ileft_191 = tt.reshape %ileft_186 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_192 = tt.reshape %iright_190 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_193 = tt.reshape %new_idxs_181 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc193) + %left_idx_194 = arith.muli %y_idx_193, %ileft_71 : tensor<128x2x2xi32> loc(#loc195) + %left_idx_195 = "tt.reduce"(%left_idx_194) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc225) + %left_idx_196 = tt.expand_dims %left_idx_195 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc197) + %left_idx_197 = tt.broadcast %left_idx_196 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc198) + %right_idx_198 = arith.muli %y_idx_193, %flip_31 : tensor<128x2x2xi32> loc(#loc200) + %right_idx_199 = "tt.reduce"(%right_idx_198) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc228) + %right_idx_200 = tt.expand_dims %right_idx_199 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc202) + %right_idx_201 = tt.broadcast %right_idx_200 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc203) + %left_idx_202 = tt.reshape %left_idx_197 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_203 = tt.reshape %right_idx_201 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_204 = arith.cmpi slt, %ileft_191, %iright_192 : tensor<32x16xi32> loc(#loc206) + %eq_205 = arith.cmpi eq, %ileft_191, %iright_192 : tensor<32x16xi32> loc(#loc207) + %cond_206 = arith.cmpi sgt, %left_idx_202, %right_idx_203 : tensor<32x16xi32> loc(#loc208) + %cond_207 = arith.andi %eq_205, %cond_206 : tensor<32x16xi1> loc(#loc209) + %cond_208 = arith.ori %cond_204, %cond_207 : tensor<32x16xi1> loc(#loc210) + %cond_209 = arith.extui %cond_208 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_210 = arith.xori %cond_209, %flip_144 : tensor<32x16xi32> loc(#loc211) + %cond_211 = arith.cmpi ne, %cond_210, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret_212 = arith.xori %ileft_191, %iright_192 : tensor<32x16xi32> loc(#loc213) + %ret_213 = arith.select %cond_211, %ret_212, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_214 = arith.xori %ret_178, %ret_213 : tensor<32x16xi32> loc(#loc215) + %new_idxs_215 = arith.xori %left_idx_202, %right_idx_203 : tensor<32x16xi32> loc(#loc216) + %new_idxs_216 = arith.select %cond_211, %new_idxs_215, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_217 = arith.xori %new_idxs_181, %new_idxs_216 : tensor<32x16xi32> loc(#loc218) + %y_218 = tt.reshape %ret_214 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc181) + %ileft_219 = arith.muli %y_218, %ileft : tensor<256x2x1xi32> loc(#loc183) + %ileft_220 = "tt.reduce"(%ileft_219) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc220) + %ileft_221 = tt.expand_dims %ileft_220 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc185) + %ileft_222 = tt.broadcast %ileft_221 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc186) + %iright_223 = arith.muli %y_218, %iright : tensor<256x2x1xi32> loc(#loc187) + %iright_224 = "tt.reduce"(%iright_223) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc222) + %iright_225 = tt.expand_dims %iright_224 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc189) + %iright_226 = tt.broadcast %iright_225 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc190) + %ileft_227 = tt.reshape %ileft_222 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_228 = tt.reshape %iright_226 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_229 = tt.reshape %new_idxs_217 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc193) + %left_idx_230 = arith.muli %y_idx_229, %ileft : tensor<256x2x1xi32> loc(#loc195) + %left_idx_231 = "tt.reduce"(%left_idx_230) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc225) + %left_idx_232 = tt.expand_dims %left_idx_231 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc197) + %left_idx_233 = tt.broadcast %left_idx_232 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc198) + %right_idx_234 = arith.muli %y_idx_229, %iright : tensor<256x2x1xi32> loc(#loc200) + %right_idx_235 = "tt.reduce"(%right_idx_234) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc228) + %right_idx_236 = tt.expand_dims %right_idx_235 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc202) + %right_idx_237 = tt.broadcast %right_idx_236 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc203) + %left_idx_238 = tt.reshape %left_idx_233 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_239 = tt.reshape %right_idx_237 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_240 = arith.cmpi slt, %ileft_227, %iright_228 : tensor<32x16xi32> loc(#loc206) + %eq_241 = arith.cmpi eq, %ileft_227, %iright_228 : tensor<32x16xi32> loc(#loc207) + %cond_242 = arith.cmpi sgt, %left_idx_238, %right_idx_239 : tensor<32x16xi32> loc(#loc208) + %cond_243 = arith.andi %eq_241, %cond_242 : tensor<32x16xi1> loc(#loc209) + %cond_244 = arith.ori %cond_240, %cond_243 : tensor<32x16xi1> loc(#loc210) + %cond_245 = arith.extui %cond_244 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc211) + %cond_246 = arith.xori %cond_245, %flip_144 : tensor<32x16xi32> loc(#loc211) + %cond_247 = arith.cmpi ne, %cond_246, %cst_0 : tensor<32x16xi32> loc(#loc212) + %ret_248 = arith.xori %ileft_227, %iright_228 : tensor<32x16xi32> loc(#loc213) + %ret_249 = arith.select %cond_247, %ret_248, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_250 = arith.xori %ret_214, %ret_249 : tensor<32x16xi32> loc(#loc215) + %new_idxs_251 = arith.xori %left_idx_238, %right_idx_239 : tensor<32x16xi32> loc(#loc216) + %new_idxs_252 = arith.select %cond_247, %new_idxs_251, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_253 = arith.xori %new_idxs_217, %new_idxs_252 : tensor<32x16xi32> loc(#loc218) + %y_254 = tt.reshape %ret_250 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc181) + %ileft_255 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc183) + %ileft_256 = arith.muli %y_254, %ileft_255 : tensor<32x2x8xi32> loc(#loc183) + %ileft_257 = "tt.reduce"(%ileft_256) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc220) + %ileft_258 = tt.expand_dims %ileft_257 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc185) + %ileft_259 = tt.broadcast %ileft_258 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc186) + %iright_260 = arith.muli %y_254, %flip_143 : tensor<32x2x8xi32> loc(#loc187) + %iright_261 = "tt.reduce"(%iright_260) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc222) + %iright_262 = tt.expand_dims %iright_261 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc189) + %iright_263 = tt.broadcast %iright_262 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc190) + %ileft_264 = tt.reshape %ileft_259 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_265 = tt.reshape %iright_263 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_266 = tt.reshape %new_idxs_253 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc193) + %left_idx_267 = arith.muli %y_idx_266, %ileft_255 : tensor<32x2x8xi32> loc(#loc195) + %left_idx_268 = "tt.reduce"(%left_idx_267) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc225) + %left_idx_269 = tt.expand_dims %left_idx_268 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc197) + %left_idx_270 = tt.broadcast %left_idx_269 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc198) + %right_idx_271 = arith.muli %y_idx_266, %flip_143 : tensor<32x2x8xi32> loc(#loc200) + %right_idx_272 = "tt.reduce"(%right_idx_271) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc228) + %right_idx_273 = tt.expand_dims %right_idx_272 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc202) + %right_idx_274 = tt.broadcast %right_idx_273 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc203) + %left_idx_275 = tt.reshape %left_idx_270 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_276 = tt.reshape %right_idx_274 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_277 = arith.cmpi slt, %ileft_264, %iright_265 : tensor<32x16xi32> loc(#loc206) + %eq_278 = arith.cmpi eq, %ileft_264, %iright_265 : tensor<32x16xi32> loc(#loc207) + %cond_279 = arith.cmpi sgt, %left_idx_275, %right_idx_276 : tensor<32x16xi32> loc(#loc208) + %cond_280 = arith.andi %eq_278, %cond_279 : tensor<32x16xi1> loc(#loc209) + %cond_281 = arith.ori %cond_277, %cond_280 : tensor<32x16xi1> loc(#loc210) + %ret_282 = arith.xori %ileft_264, %iright_265 : tensor<32x16xi32> loc(#loc213) + %ret_283 = arith.select %cond_281, %ret_282, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_284 = arith.xori %ret_250, %ret_283 : tensor<32x16xi32> loc(#loc215) + %new_idxs_285 = arith.xori %left_idx_275, %right_idx_276 : tensor<32x16xi32> loc(#loc216) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_287 = arith.xori %new_idxs_253, %new_idxs_286 : tensor<32x16xi32> loc(#loc218) + %y_288 = tt.reshape %ret_284 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc181) + %ileft_289 = arith.muli %y_288, %ileft_146 : tensor<64x2x4xi32> loc(#loc183) + %ileft_290 = "tt.reduce"(%ileft_289) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc220) + %ileft_291 = tt.expand_dims %ileft_290 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc185) + %ileft_292 = tt.broadcast %ileft_291 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc186) + %iright_293 = arith.muli %y_288, %flip_68 : tensor<64x2x4xi32> loc(#loc187) + %iright_294 = "tt.reduce"(%iright_293) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc222) + %iright_295 = tt.expand_dims %iright_294 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc189) + %iright_296 = tt.broadcast %iright_295 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc190) + %ileft_297 = tt.reshape %ileft_292 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_298 = tt.reshape %iright_296 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_299 = tt.reshape %new_idxs_287 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc193) + %left_idx_300 = arith.muli %y_idx_299, %ileft_146 : tensor<64x2x4xi32> loc(#loc195) + %left_idx_301 = "tt.reduce"(%left_idx_300) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc225) + %left_idx_302 = tt.expand_dims %left_idx_301 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc197) + %left_idx_303 = tt.broadcast %left_idx_302 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc198) + %right_idx_304 = arith.muli %y_idx_299, %flip_68 : tensor<64x2x4xi32> loc(#loc200) + %right_idx_305 = "tt.reduce"(%right_idx_304) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc228) + %right_idx_306 = tt.expand_dims %right_idx_305 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc202) + %right_idx_307 = tt.broadcast %right_idx_306 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc203) + %left_idx_308 = tt.reshape %left_idx_303 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_309 = tt.reshape %right_idx_307 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_310 = arith.cmpi slt, %ileft_297, %iright_298 : tensor<32x16xi32> loc(#loc206) + %eq_311 = arith.cmpi eq, %ileft_297, %iright_298 : tensor<32x16xi32> loc(#loc207) + %cond_312 = arith.cmpi sgt, %left_idx_308, %right_idx_309 : tensor<32x16xi32> loc(#loc208) + %cond_313 = arith.andi %eq_311, %cond_312 : tensor<32x16xi1> loc(#loc209) + %cond_314 = arith.ori %cond_310, %cond_313 : tensor<32x16xi1> loc(#loc210) + %ret_315 = arith.xori %ileft_297, %iright_298 : tensor<32x16xi32> loc(#loc213) + %ret_316 = arith.select %cond_314, %ret_315, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_317 = arith.xori %ret_284, %ret_316 : tensor<32x16xi32> loc(#loc215) + %new_idxs_318 = arith.xori %left_idx_308, %right_idx_309 : tensor<32x16xi32> loc(#loc216) + %new_idxs_319 = arith.select %cond_314, %new_idxs_318, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_320 = arith.xori %new_idxs_287, %new_idxs_319 : tensor<32x16xi32> loc(#loc218) + %y_321 = tt.reshape %ret_317 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc181) + %ileft_322 = arith.muli %y_321, %ileft_71 : tensor<128x2x2xi32> loc(#loc183) + %ileft_323 = "tt.reduce"(%ileft_322) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc220) + %ileft_324 = tt.expand_dims %ileft_323 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc185) + %ileft_325 = tt.broadcast %ileft_324 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc186) + %iright_326 = arith.muli %y_321, %flip_31 : tensor<128x2x2xi32> loc(#loc187) + %iright_327 = "tt.reduce"(%iright_326) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc222) + %iright_328 = tt.expand_dims %iright_327 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc189) + %iright_329 = tt.broadcast %iright_328 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc190) + %ileft_330 = tt.reshape %ileft_325 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_331 = tt.reshape %iright_329 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_332 = tt.reshape %new_idxs_320 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc193) + %left_idx_333 = arith.muli %y_idx_332, %ileft_71 : tensor<128x2x2xi32> loc(#loc195) + %left_idx_334 = "tt.reduce"(%left_idx_333) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc225) + %left_idx_335 = tt.expand_dims %left_idx_334 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc197) + %left_idx_336 = tt.broadcast %left_idx_335 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc198) + %right_idx_337 = arith.muli %y_idx_332, %flip_31 : tensor<128x2x2xi32> loc(#loc200) + %right_idx_338 = "tt.reduce"(%right_idx_337) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc228) + %right_idx_339 = tt.expand_dims %right_idx_338 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc202) + %right_idx_340 = tt.broadcast %right_idx_339 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc203) + %left_idx_341 = tt.reshape %left_idx_336 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_342 = tt.reshape %right_idx_340 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_343 = arith.cmpi slt, %ileft_330, %iright_331 : tensor<32x16xi32> loc(#loc206) + %eq_344 = arith.cmpi eq, %ileft_330, %iright_331 : tensor<32x16xi32> loc(#loc207) + %cond_345 = arith.cmpi sgt, %left_idx_341, %right_idx_342 : tensor<32x16xi32> loc(#loc208) + %cond_346 = arith.andi %eq_344, %cond_345 : tensor<32x16xi1> loc(#loc209) + %cond_347 = arith.ori %cond_343, %cond_346 : tensor<32x16xi1> loc(#loc210) + %ret_348 = arith.xori %ileft_330, %iright_331 : tensor<32x16xi32> loc(#loc213) + %ret_349 = arith.select %cond_347, %ret_348, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc214) + %ret_350 = arith.xori %ret_317, %ret_349 : tensor<32x16xi32> loc(#loc215) + %new_idxs_351 = arith.xori %left_idx_341, %right_idx_342 : tensor<32x16xi32> loc(#loc216) + %new_idxs_352 = arith.select %cond_347, %new_idxs_351, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_353 = arith.xori %new_idxs_320, %new_idxs_352 : tensor<32x16xi32> loc(#loc218) + %y_354 = tt.reshape %ret_350 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc181) + %ileft_355 = arith.muli %y_354, %ileft : tensor<256x2x1xi32> loc(#loc183) + %ileft_356 = "tt.reduce"(%ileft_355) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc220) + %ileft_357 = tt.expand_dims %ileft_356 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc185) + %ileft_358 = tt.broadcast %ileft_357 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc186) + %iright_359 = arith.muli %y_354, %iright : tensor<256x2x1xi32> loc(#loc187) + %iright_360 = "tt.reduce"(%iright_359) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc222) + %iright_361 = tt.expand_dims %iright_360 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc189) + %iright_362 = tt.broadcast %iright_361 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc190) + %ileft_363 = tt.reshape %ileft_358 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc191) + %iright_364 = tt.reshape %iright_362 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc192) + %y_idx_365 = tt.reshape %new_idxs_353 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc193) + %left_idx_366 = arith.muli %y_idx_365, %ileft : tensor<256x2x1xi32> loc(#loc195) + %left_idx_367 = "tt.reduce"(%left_idx_366) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc225) + %left_idx_368 = tt.expand_dims %left_idx_367 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc197) + %left_idx_369 = tt.broadcast %left_idx_368 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc198) + %right_idx_370 = arith.muli %y_idx_365, %iright : tensor<256x2x1xi32> loc(#loc200) + %right_idx_371 = "tt.reduce"(%right_idx_370) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc228) + %right_idx_372 = tt.expand_dims %right_idx_371 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc202) + %right_idx_373 = tt.broadcast %right_idx_372 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc203) + %left_idx_374 = tt.reshape %left_idx_369 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc204) + %right_idx_375 = tt.reshape %right_idx_373 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc205) + %cond_376 = arith.cmpi slt, %ileft_363, %iright_364 : tensor<32x16xi32> loc(#loc206) + %eq_377 = arith.cmpi eq, %ileft_363, %iright_364 : tensor<32x16xi32> loc(#loc207) + %cond_378 = arith.cmpi sgt, %left_idx_374, %right_idx_375 : tensor<32x16xi32> loc(#loc208) + %cond_379 = arith.andi %eq_377, %cond_378 : tensor<32x16xi1> loc(#loc209) + %cond_380 = arith.ori %cond_376, %cond_379 : tensor<32x16xi1> loc(#loc210) + %new_idxs_381 = arith.xori %left_idx_374, %right_idx_375 : tensor<32x16xi32> loc(#loc216) + %new_idxs_382 = arith.select %cond_380, %new_idxs_381, %cst_0 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc217) + %new_idxs_383 = arith.xori %new_idxs_353, %new_idxs_382 : tensor<32x16xi32> loc(#loc218) + %tmp7 = arith.extsi %tmp0_28 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc167) + %tmp10_384 = arith.select %tmp0_27, %tmp7, %tmp10 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc100) + %tmp11 = "tt.reduce"(%tmp10_384) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_386: i64 loc(callsite(#loc1 at #loc168)), %tmp11_387: i64 loc(callsite(#loc1 at #loc168))): + %tmp11_388 = arith.addi %tmp11_386, %tmp11_387 : i64 loc(#loc219) + tt.reduce.return %tmp11_388 : i64 loc(#loc179) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc179) + %tmp11_385 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc169) + %tmp14 = arith.trunci %tmp11_385 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc170) + %0 = arith.muli %x0_9, %cst_1 : tensor<32x1xi64> loc(#loc77) + %1 = tt.broadcast %0 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc78) + %2 = arith.addi %tmp0_10, %1 : tensor<32x16xi64> loc(#loc78) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc79) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc80) + %5 = arith.extui %4 : i1 to i64 loc(#loc81) + %6 = arith.muli %ks0, %5 : i64 loc(#loc81) + %7 = arith.extui %3 : i1 to i64 loc(#loc171) + %8 = arith.addi %7, %6 : i64 loc(#loc82) + %9 = tt.splat %8 : i64 -> tensor<32x1xi64> loc(#loc84) + %10 = arith.muli %tmp0_13, %9 : tensor<32x1xi64> loc(#loc84) + %11 = tt.broadcast %10 : tensor<32x1xi64> -> tensor<32x16xi64> loc(#loc85) + %12 = arith.addi %2, %11 : tensor<32x16xi64> loc(#loc85) + %13 = tt.splat %out_ptr2 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc86) + %14 = tt.addptr %13, %12 : tensor<32x16x!tt.ptr>, tensor<32x16xi64> loc(#loc86) + tt.store %14, %new_idxs_383, %tmp0_27 : tensor<32x16x!tt.ptr> loc(#loc87) + %15 = arith.muli %x1, %9 : tensor<32x1xi64> loc(#loc88) + %16 = arith.addi %x0_9, %15 : tensor<32x1xi64> loc(#loc89) + %17 = tt.splat %out_ptr3 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc90) + %18 = tt.addptr %17, %16 : tensor<32x1x!tt.ptr>, tensor<32x1xi64> loc(#loc90) + tt.store %18, %tmp14, %xmask_6 : tensor<32x1x!tt.ptr> loc(#loc91) + tt.return loc(#loc92) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc99 = loc(callsite(#loc1 at #loc2)) +#loc100 = loc("tmp10"(#loc3)) +#loc101 = loc("xoffset"(#loc4)) +#loc102 = loc("xoffset"(#loc5)) +#loc103 = loc("xindex"(#loc6)) +#loc104 = loc("xindex"(#loc7)) +#loc105 = loc("xindex"(#loc8)) +#loc106 = loc("xmask"(#loc9)) +#loc107 = loc("r0_index"(#loc10)) +#loc108 = loc("r0_index"(#loc11)) +#loc109 = loc("x0"(#loc12)) +#loc110 = loc("x1"(#loc13)) +#loc111 = loc("tmp0"(#loc14)) +#loc112 = loc("tmp0"(#loc15)) +#loc113 = loc("tmp0"(#loc16)) +#loc114 = loc("tmp0"(#loc17)) +#loc115 = loc("tmp0"(#loc18)) +#loc116 = loc("tmp0"(#loc19)) +#loc117 = loc("tmp0"(#loc20)) +#loc118 = loc("tmp0"(#loc21)) +#loc119 = loc("tmp0"(#loc22)) +#loc120 = loc("tmp0"(#loc23)) +#loc121 = loc("tmp2"(#loc24)) +#loc122 = loc("tmp4"(#loc25)) +#loc123 = loc("flip"(#loc26)) +#loc125 = loc("flip"(#loc28)) +#loc126 = loc("flip"(#loc29)) +#loc127 = loc("flip"(#loc30)) +#loc128 = loc("y"(#loc31)) +#loc129 = loc("left_mask"(#loc33)) +#loc130 = loc("ileft"(#loc34)) +#loc132 = loc("ileft"(#loc38)) +#loc133 = loc("ileft"(#loc39)) +#loc134 = loc("iright"(#loc40)) +#loc136 = loc("iright"(#loc42)) +#loc137 = loc("iright"(#loc43)) +#loc138 = loc("ileft"(#loc44)) +#loc139 = loc("iright"(#loc45)) +#loc140 = loc("y_idx"(#loc46)) +#loc141 = loc("left_idx"(#loc47)) +#loc142 = loc("left_idx"(#loc48)) +#loc143 = loc("input"(#loc49)) +#loc145 = loc("left_idx"(#loc51)) +#loc146 = loc("left_idx"(#loc52)) +#loc147 = loc("right_idx"(#loc53)) +#loc148 = loc("right_idx"(#loc54)) +#loc150 = loc("right_idx"(#loc56)) +#loc151 = loc("right_idx"(#loc57)) +#loc152 = loc("left_idx"(#loc58)) +#loc153 = loc("right_idx"(#loc59)) +#loc154 = loc("cond"(#loc60)) +#loc155 = loc("eq"(#loc61)) +#loc156 = loc("cond"(#loc62)) +#loc157 = loc("cond"(#loc63)) +#loc158 = loc("cond"(#loc64)) +#loc159 = loc("cond"(#loc65)) +#loc160 = loc("cond"(#loc66)) +#loc161 = loc("ret"(#loc67)) +#loc162 = loc("ret"(#loc68)) +#loc163 = loc("ret"(#loc69)) +#loc164 = loc("new_idxs"(#loc70)) +#loc165 = loc("new_idxs"(#loc71)) +#loc166 = loc("new_idxs"(#loc72)) +#loc167 = loc("tmp7"(#loc73)) +#loc169 = loc("tmp11"(#loc75)) +#loc170 = loc("tmp14"(#loc76)) +#loc171 = loc(fused[#loc82, #loc83]) +#loc172 = loc(callsite(#loc123 at #loc124)) +#loc173 = loc(callsite(#loc125 at #loc124)) +#loc174 = loc(callsite(#loc126 at #loc124)) +#loc175 = loc(callsite(#loc127 at #loc124)) +#loc177 = loc("cond"(#loc154)) +#loc178 = loc("eq"(#loc155)) +#loc179 = loc(callsite(#loc35 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc176)) +#loc182 = loc(callsite(#loc129 at #loc176)) +#loc183 = loc(callsite(#loc130 at #loc176)) +#loc185 = loc(callsite(#loc132 at #loc176)) +#loc186 = loc(callsite(#loc133 at #loc176)) +#loc187 = loc(callsite(#loc134 at #loc176)) +#loc189 = loc(callsite(#loc136 at #loc176)) +#loc190 = loc(callsite(#loc137 at #loc176)) +#loc191 = loc(callsite(#loc138 at #loc176)) +#loc192 = loc(callsite(#loc139 at #loc176)) +#loc193 = loc(callsite(#loc140 at #loc176)) +#loc194 = loc(callsite(#loc141 at #loc176)) +#loc195 = loc(callsite(#loc142 at #loc176)) +#loc197 = loc(callsite(#loc145 at #loc176)) +#loc198 = loc(callsite(#loc146 at #loc176)) +#loc199 = loc(callsite(#loc147 at #loc176)) +#loc200 = loc(callsite(#loc148 at #loc176)) +#loc202 = loc(callsite(#loc150 at #loc176)) +#loc203 = loc(callsite(#loc151 at #loc176)) +#loc204 = loc(callsite(#loc152 at #loc176)) +#loc205 = loc(callsite(#loc153 at #loc176)) +#loc206 = loc(callsite(#loc177 at #loc176)) +#loc207 = loc(callsite(#loc178 at #loc176)) +#loc208 = loc(callsite(#loc156 at #loc176)) +#loc209 = loc(callsite(#loc157 at #loc176)) +#loc210 = loc(callsite(#loc158 at #loc176)) +#loc211 = loc(callsite(#loc159 at #loc176)) +#loc212 = loc(callsite(#loc160 at #loc176)) +#loc213 = loc(callsite(#loc161 at #loc176)) +#loc214 = loc(callsite(#loc162 at #loc176)) +#loc215 = loc(callsite(#loc163 at #loc176)) +#loc216 = loc(callsite(#loc164 at #loc176)) +#loc217 = loc(callsite(#loc165 at #loc176)) +#loc218 = loc(callsite(#loc166 at #loc176)) +#loc219 = loc(callsite(#loc37 at #loc179)) +#loc220 = loc(callsite(#loc35 at #loc184)) +#loc222 = loc(callsite(#loc35 at #loc188)) +#loc224 = loc(callsite(#loc143 at #loc196)) +#loc225 = loc(callsite(#loc35 at #loc196)) +#loc227 = loc(callsite(#loc143 at #loc201)) +#loc228 = loc(callsite(#loc35 at #loc201)) +#loc230 = loc(callsite(#loc37 at #loc220)) +#loc231 = loc(callsite(#loc37 at #loc222)) +#loc232 = loc(callsite(#loc37 at #loc225)) +#loc233 = loc(callsite(#loc37 at #loc228)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fa6c211348425b43ae264057dbd6d4c44ccbb26b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d7ae44b40d72b0635a580fd88c023cb4dba0f6b2 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a52fa1b7f6a03f3f61a785c411b5abe5c0e5b07c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "9875b0a37e993d2badcafd1c96e7998131064889a77c38a5caf0f4bf6d53c3b6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..d704d37650e72468041f907efb55c5f061797412 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,940 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = icmp slt i32 %7, %2, !dbg !9 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %10 = shl nuw nsw i32 %9, 2, !dbg !10 + %11 = and i32 %10, 2044, !dbg !10 + %12 = mul i32 %7, 32000, !dbg !11 + %13 = zext nneg i32 %11 to i64, !dbg !12 + br label %14, !dbg !12 + +14: ; preds = %6, %14 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %14 ] + %15 = phi <2 x float> [ zeroinitializer, %6 ], [ %273, %14 ] + %16 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %271, %14 ] + %17 = phi <2 x float> [ zeroinitializer, %6 ], [ %272, %14 ] + %18 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %270, %14 ] + %19 = or disjoint i64 %indvars.iv, %13, !dbg !13 + %20 = icmp samesign ult i64 %19, 32000, !dbg !14 + %21 = trunc nuw nsw i64 %19 to i32, !dbg !15 + %22 = add i32 %12, %21, !dbg !15 + %23 = sext i32 %22 to i64, !dbg !16 + %24 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !16 + %25 = and i1 %8, %20, !dbg !17 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !18 + %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %24, i64 %26, i1 %25) #6, !dbg !18 + %28 = extractvalue { i32, i32 } %27, 0, !dbg !18 + %29 = bitcast i32 %28 to <2 x bfloat>, !dbg !18 + %30 = extractvalue { i32, i32 } %27, 1, !dbg !18 + %31 = bitcast i32 %30 to <2 x bfloat>, !dbg !18 + %32 = fcmp uno <2 x float> %18, zeroinitializer, !dbg !19 + %33 = fcmp uno <2 x float> %16, zeroinitializer, !dbg !19 + %34 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i72 = icmp eq i32 %34, 0, !dbg !23 + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i74 = icmp eq i32 %35, 0, !dbg !23 + %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i78 = icmp eq i32 %37, 0, !dbg !23 + %38 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i80 = icmp eq i32 %38, 0, !dbg !23 + %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i82 = icmp eq i32 %39, 0, !dbg !23 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i84 = icmp eq i32 %40, 0, !dbg !23 + %41 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i88 = icmp eq i32 %42, 0, !dbg !23 + %43 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i90 = icmp eq i32 %43, 0, !dbg !23 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i92 = icmp eq i32 %44, 0, !dbg !23 + %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i94 = icmp eq i32 %45, 0, !dbg !23 + %46 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %47 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i98 = icmp eq i32 %47, 0, !dbg !23 + %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i100 = icmp eq i32 %48, 0, !dbg !23 + %49 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i102 = icmp eq i32 %49, 0, !dbg !23 + %50 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i104 = icmp eq i32 %50, 0, !dbg !23 + %51 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %52 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i108 = icmp eq i32 %52, 0, !dbg !23 + %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i110 = icmp eq i32 %53, 0, !dbg !23 + %54 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i112 = icmp eq i32 %54, 0, !dbg !23 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i114 = icmp eq i32 %55, 0, !dbg !23 + %56 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %57 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i118 = icmp eq i32 %57, 0, !dbg !23 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i120 = icmp eq i32 %58, 0, !dbg !23 + %59 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i122 = icmp eq i32 %59, 0, !dbg !23 + %60 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i124 = icmp eq i32 %60, 0, !dbg !23 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %62 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i128 = icmp eq i32 %62, 0, !dbg !23 + %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i130 = icmp eq i32 %63, 0, !dbg !23 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i132 = icmp eq i32 %64, 0, !dbg !23 + %65 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i134 = icmp eq i32 %65, 0, !dbg !23 + %66 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i138 = icmp eq i32 %67, 0, !dbg !23 + %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i140 = icmp eq i32 %68, 0, !dbg !23 + %69 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not.i142 = icmp eq i32 %69, 0, !dbg !23 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not1.i144 = icmp eq i32 %70, 0, !dbg !23 + %71 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %72 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not3.i148 = icmp eq i32 %72, 0, !dbg !23 + %73 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !23 + %.not4.i150 = icmp eq i32 %73, 0, !dbg !23 + %74 = fpext <2 x bfloat> %29 to <2 x float>, !dbg !24 + %75 = fcmp ogt <2 x float> %18, %74, !dbg !25 + %76 = or <2 x i1> %32, %75, !dbg !26 + %77 = select <2 x i1> %76, <2 x float> %18, <2 x float> %74, !dbg !27 + %78 = fcmp oeq <2 x float> %77, splat (float 0xFFF0000000000000), !dbg !28 + %foldExtExtBinop = fsub <2 x float> %18, %77, !dbg !29 + %79 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !29 + %foldExtExtBinop177 = fsub <2 x float> %18, %77, !dbg !29 + %80 = extractelement <2 x float> %foldExtExtBinop177, i64 1, !dbg !29 + %81 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %79, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %82 = tail call float @llvm.nvvm.fma.rn.f(float %79, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i73 = select i1 %.not.i72, float %82, float %81, !dbg !23 + %83 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i73) #6, !dbg !23 + %84 = tail call float @llvm.nvvm.saturate.f(float %.02.i73) #6, !dbg !23 + %.03.i75 = select i1 %.not1.i74, float %84, float %83, !dbg !23 + %85 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %86 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i75, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %87 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %80, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %88 = tail call float @llvm.nvvm.fma.rn.f(float %80, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i83 = select i1 %.not.i82, float %88, float %87, !dbg !23 + %89 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i83) #6, !dbg !23 + %90 = tail call float @llvm.nvvm.saturate.f(float %.02.i83) #6, !dbg !23 + %.03.i85 = select i1 %.not1.i84, float %90, float %89, !dbg !23 + %91 = insertelement <2 x i32> poison, i32 %36, i64 0, !dbg !23 + %92 = insertelement <2 x i32> %91, i32 %41, i64 1, !dbg !23 + %93 = icmp eq <2 x i32> %92, zeroinitializer, !dbg !23 + %94 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %95 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i85, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %96 = insertelement <2 x float> poison, float %86, i64 0, !dbg !23 + %97 = insertelement <2 x float> %96, float %95, i64 1, !dbg !23 + %98 = insertelement <2 x float> poison, float %85, i64 0, !dbg !23 + %99 = insertelement <2 x float> %98, float %94, i64 1, !dbg !23 + %100 = select <2 x i1> %93, <2 x float> %97, <2 x float> %99, !dbg !23 + %101 = extractelement <2 x float> %100, i64 0, !dbg !23 + %102 = fadd float %101, 0xC168000FE0000000, !dbg !23 + %103 = fneg float %102, !dbg !23 + %104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %79, float 0x3FF7154760000000, float %103) #6, !dbg !23 + %105 = tail call float @llvm.nvvm.fma.rn.f(float %79, float 0x3FF7154760000000, float %103) #6, !dbg !23 + %.0.i79 = select i1 %.not3.i78, float %105, float %104, !dbg !23 + %106 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %79, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !23 + %107 = tail call float @llvm.nvvm.fma.rn.f(float %79, float 0x3E54AE0C00000000, float %.0.i79) #6, !dbg !23 + %.01.i81 = select i1 %.not4.i80, float %107, float %106, !dbg !23 + %108 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i81) #6, !dbg !23 + %109 = extractelement <2 x float> %100, i64 1, !dbg !23 + %110 = fadd float %109, 0xC168000FE0000000, !dbg !23 + %111 = fneg float %110, !dbg !23 + %112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %80, float 0x3FF7154760000000, float %111) #6, !dbg !23 + %113 = tail call float @llvm.nvvm.fma.rn.f(float %80, float 0x3FF7154760000000, float %111) #6, !dbg !23 + %.0.i89 = select i1 %.not3.i88, float %113, float %112, !dbg !23 + %114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %80, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !23 + %115 = tail call float @llvm.nvvm.fma.rn.f(float %80, float 0x3E54AE0C00000000, float %.0.i89) #6, !dbg !23 + %.01.i91 = select i1 %.not4.i90, float %115, float %114, !dbg !23 + %116 = bitcast <2 x float> %100 to <2 x i32>, !dbg !23 + %117 = shl <2 x i32> %116, splat (i32 23), !dbg !23 + %118 = bitcast <2 x i32> %117 to <2 x float>, !dbg !23 + %119 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i91) #6, !dbg !23 + %120 = insertelement <2 x float> poison, float %108, i64 0, !dbg !23 + %121 = insertelement <2 x float> %120, float %119, i64 1, !dbg !23 + %122 = fmul <2 x float> %121, %118, !dbg !23 + %123 = select <2 x i1> %78, <2 x float> splat (float 1.000000e+00), <2 x float> %122, !dbg !30 + %foldExtExtBinop179 = fsub <2 x float> %74, %77, !dbg !31 + %124 = extractelement <2 x float> %foldExtExtBinop179, i64 0, !dbg !31 + %foldExtExtBinop181 = fsub <2 x float> %74, %77, !dbg !31 + %125 = extractelement <2 x float> %foldExtExtBinop181, i64 1, !dbg !31 + %126 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %124, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %127 = tail call float @llvm.nvvm.fma.rn.f(float %124, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i113 = select i1 %.not.i112, float %127, float %126, !dbg !23 + %128 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i113) #6, !dbg !23 + %129 = tail call float @llvm.nvvm.saturate.f(float %.02.i113) #6, !dbg !23 + %.03.i115 = select i1 %.not1.i114, float %129, float %128, !dbg !23 + %130 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %131 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i115, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %132 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %125, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %133 = tail call float @llvm.nvvm.fma.rn.f(float %125, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i123 = select i1 %.not.i122, float %133, float %132, !dbg !23 + %134 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i123) #6, !dbg !23 + %135 = tail call float @llvm.nvvm.saturate.f(float %.02.i123) #6, !dbg !23 + %.03.i125 = select i1 %.not1.i124, float %135, float %134, !dbg !23 + %136 = insertelement <2 x i32> poison, i32 %56, i64 0, !dbg !23 + %137 = insertelement <2 x i32> %136, i32 %61, i64 1, !dbg !23 + %138 = icmp eq <2 x i32> %137, zeroinitializer, !dbg !23 + %139 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %140 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i125, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %141 = insertelement <2 x float> poison, float %131, i64 0, !dbg !23 + %142 = insertelement <2 x float> %141, float %140, i64 1, !dbg !23 + %143 = insertelement <2 x float> poison, float %130, i64 0, !dbg !23 + %144 = insertelement <2 x float> %143, float %139, i64 1, !dbg !23 + %145 = select <2 x i1> %138, <2 x float> %142, <2 x float> %144, !dbg !23 + %146 = extractelement <2 x float> %145, i64 0, !dbg !23 + %147 = fadd float %146, 0xC168000FE0000000, !dbg !23 + %148 = fneg float %147, !dbg !23 + %149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %124, float 0x3FF7154760000000, float %148) #6, !dbg !23 + %150 = tail call float @llvm.nvvm.fma.rn.f(float %124, float 0x3FF7154760000000, float %148) #6, !dbg !23 + %.0.i119 = select i1 %.not3.i118, float %150, float %149, !dbg !23 + %151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %124, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !23 + %152 = tail call float @llvm.nvvm.fma.rn.f(float %124, float 0x3E54AE0C00000000, float %.0.i119) #6, !dbg !23 + %.01.i121 = select i1 %.not4.i120, float %152, float %151, !dbg !23 + %153 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i121) #6, !dbg !23 + %154 = extractelement <2 x float> %145, i64 1, !dbg !23 + %155 = fadd float %154, 0xC168000FE0000000, !dbg !23 + %156 = fneg float %155, !dbg !23 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %125, float 0x3FF7154760000000, float %156) #6, !dbg !23 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %125, float 0x3FF7154760000000, float %156) #6, !dbg !23 + %.0.i129 = select i1 %.not3.i128, float %158, float %157, !dbg !23 + %159 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %125, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !23 + %160 = tail call float @llvm.nvvm.fma.rn.f(float %125, float 0x3E54AE0C00000000, float %.0.i129) #6, !dbg !23 + %.01.i131 = select i1 %.not4.i130, float %160, float %159, !dbg !23 + %161 = bitcast <2 x float> %145 to <2 x i32>, !dbg !23 + %162 = shl <2 x i32> %161, splat (i32 23), !dbg !23 + %163 = bitcast <2 x i32> %162 to <2 x float>, !dbg !23 + %164 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i131) #6, !dbg !23 + %165 = insertelement <2 x float> poison, float %153, i64 0, !dbg !23 + %166 = insertelement <2 x float> %165, float %164, i64 1, !dbg !23 + %167 = fmul <2 x float> %166, %163, !dbg !23 + %168 = select <2 x i1> %78, <2 x float> splat (float 1.000000e+00), <2 x float> %167, !dbg !32 + %169 = fmul <2 x float> %17, %123, !dbg !33 + %170 = fadd <2 x float> %169, %168, !dbg !34 + %171 = fpext <2 x bfloat> %31 to <2 x float>, !dbg !24 + %172 = fcmp ogt <2 x float> %16, %171, !dbg !25 + %173 = or <2 x i1> %33, %172, !dbg !26 + %174 = select <2 x i1> %173, <2 x float> %16, <2 x float> %171, !dbg !27 + %175 = fcmp oeq <2 x float> %174, splat (float 0xFFF0000000000000), !dbg !28 + %foldExtExtBinop183 = fsub <2 x float> %16, %174, !dbg !29 + %176 = extractelement <2 x float> %foldExtExtBinop183, i64 0, !dbg !29 + %foldExtExtBinop185 = fsub <2 x float> %16, %174, !dbg !29 + %177 = extractelement <2 x float> %foldExtExtBinop185, i64 1, !dbg !29 + %178 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %176, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %179 = tail call float @llvm.nvvm.fma.rn.f(float %176, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i93 = select i1 %.not.i92, float %179, float %178, !dbg !23 + %180 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i93) #6, !dbg !23 + %181 = tail call float @llvm.nvvm.saturate.f(float %.02.i93) #6, !dbg !23 + %.03.i95 = select i1 %.not1.i94, float %181, float %180, !dbg !23 + %182 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %183 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i95, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %184 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %177, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %185 = tail call float @llvm.nvvm.fma.rn.f(float %177, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i103 = select i1 %.not.i102, float %185, float %184, !dbg !23 + %186 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i103) #6, !dbg !23 + %187 = tail call float @llvm.nvvm.saturate.f(float %.02.i103) #6, !dbg !23 + %.03.i105 = select i1 %.not1.i104, float %187, float %186, !dbg !23 + %188 = insertelement <2 x i32> poison, i32 %46, i64 0, !dbg !23 + %189 = insertelement <2 x i32> %188, i32 %51, i64 1, !dbg !23 + %190 = icmp eq <2 x i32> %189, zeroinitializer, !dbg !23 + %191 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %192 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i105, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %193 = insertelement <2 x float> poison, float %183, i64 0, !dbg !23 + %194 = insertelement <2 x float> %193, float %192, i64 1, !dbg !23 + %195 = insertelement <2 x float> poison, float %182, i64 0, !dbg !23 + %196 = insertelement <2 x float> %195, float %191, i64 1, !dbg !23 + %197 = select <2 x i1> %190, <2 x float> %194, <2 x float> %196, !dbg !23 + %198 = extractelement <2 x float> %197, i64 0, !dbg !23 + %199 = fadd float %198, 0xC168000FE0000000, !dbg !23 + %200 = fneg float %199, !dbg !23 + %201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %176, float 0x3FF7154760000000, float %200) #6, !dbg !23 + %202 = tail call float @llvm.nvvm.fma.rn.f(float %176, float 0x3FF7154760000000, float %200) #6, !dbg !23 + %.0.i99 = select i1 %.not3.i98, float %202, float %201, !dbg !23 + %203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %176, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !23 + %204 = tail call float @llvm.nvvm.fma.rn.f(float %176, float 0x3E54AE0C00000000, float %.0.i99) #6, !dbg !23 + %.01.i101 = select i1 %.not4.i100, float %204, float %203, !dbg !23 + %205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i101) #6, !dbg !23 + %206 = extractelement <2 x float> %197, i64 1, !dbg !23 + %207 = fadd float %206, 0xC168000FE0000000, !dbg !23 + %208 = fneg float %207, !dbg !23 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %177, float 0x3FF7154760000000, float %208) #6, !dbg !23 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %177, float 0x3FF7154760000000, float %208) #6, !dbg !23 + %.0.i109 = select i1 %.not3.i108, float %210, float %209, !dbg !23 + %211 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %177, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !23 + %212 = tail call float @llvm.nvvm.fma.rn.f(float %177, float 0x3E54AE0C00000000, float %.0.i109) #6, !dbg !23 + %.01.i111 = select i1 %.not4.i110, float %212, float %211, !dbg !23 + %213 = bitcast <2 x float> %197 to <2 x i32>, !dbg !23 + %214 = shl <2 x i32> %213, splat (i32 23), !dbg !23 + %215 = bitcast <2 x i32> %214 to <2 x float>, !dbg !23 + %216 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i111) #6, !dbg !23 + %217 = insertelement <2 x float> poison, float %205, i64 0, !dbg !23 + %218 = insertelement <2 x float> %217, float %216, i64 1, !dbg !23 + %219 = fmul <2 x float> %218, %215, !dbg !23 + %220 = select <2 x i1> %175, <2 x float> splat (float 1.000000e+00), <2 x float> %219, !dbg !30 + %foldExtExtBinop187 = fsub <2 x float> %171, %174, !dbg !31 + %221 = extractelement <2 x float> %foldExtExtBinop187, i64 0, !dbg !31 + %foldExtExtBinop189 = fsub <2 x float> %171, %174, !dbg !31 + %222 = extractelement <2 x float> %foldExtExtBinop189, i64 1, !dbg !31 + %223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %221, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %224 = tail call float @llvm.nvvm.fma.rn.f(float %221, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i133 = select i1 %.not.i132, float %224, float %223, !dbg !23 + %225 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i133) #6, !dbg !23 + %226 = tail call float @llvm.nvvm.saturate.f(float %.02.i133) #6, !dbg !23 + %.03.i135 = select i1 %.not1.i134, float %226, float %225, !dbg !23 + %227 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %228 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i135, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %222, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %230 = tail call float @llvm.nvvm.fma.rn.f(float %222, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !23 + %.02.i143 = select i1 %.not.i142, float %230, float %229, !dbg !23 + %231 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i143) #6, !dbg !23 + %232 = tail call float @llvm.nvvm.saturate.f(float %.02.i143) #6, !dbg !23 + %.03.i145 = select i1 %.not1.i144, float %232, float %231, !dbg !23 + %233 = insertelement <2 x i32> poison, i32 %66, i64 0, !dbg !23 + %234 = insertelement <2 x i32> %233, i32 %71, i64 1, !dbg !23 + %235 = icmp eq <2 x i32> %234, zeroinitializer, !dbg !23 + %236 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %237 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i145, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !23 + %238 = insertelement <2 x float> poison, float %228, i64 0, !dbg !23 + %239 = insertelement <2 x float> %238, float %237, i64 1, !dbg !23 + %240 = insertelement <2 x float> poison, float %227, i64 0, !dbg !23 + %241 = insertelement <2 x float> %240, float %236, i64 1, !dbg !23 + %242 = select <2 x i1> %235, <2 x float> %239, <2 x float> %241, !dbg !23 + %243 = extractelement <2 x float> %242, i64 0, !dbg !23 + %244 = fadd float %243, 0xC168000FE0000000, !dbg !23 + %245 = fneg float %244, !dbg !23 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %221, float 0x3FF7154760000000, float %245) #6, !dbg !23 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %221, float 0x3FF7154760000000, float %245) #6, !dbg !23 + %.0.i139 = select i1 %.not3.i138, float %247, float %246, !dbg !23 + %248 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %221, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !23 + %249 = tail call float @llvm.nvvm.fma.rn.f(float %221, float 0x3E54AE0C00000000, float %.0.i139) #6, !dbg !23 + %.01.i141 = select i1 %.not4.i140, float %249, float %248, !dbg !23 + %250 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i141) #6, !dbg !23 + %251 = extractelement <2 x float> %242, i64 1, !dbg !23 + %252 = fadd float %251, 0xC168000FE0000000, !dbg !23 + %253 = fneg float %252, !dbg !23 + %254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %222, float 0x3FF7154760000000, float %253) #6, !dbg !23 + %255 = tail call float @llvm.nvvm.fma.rn.f(float %222, float 0x3FF7154760000000, float %253) #6, !dbg !23 + %.0.i149 = select i1 %.not3.i148, float %255, float %254, !dbg !23 + %256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %222, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !23 + %257 = tail call float @llvm.nvvm.fma.rn.f(float %222, float 0x3E54AE0C00000000, float %.0.i149) #6, !dbg !23 + %.01.i151 = select i1 %.not4.i150, float %257, float %256, !dbg !23 + %258 = bitcast <2 x float> %242 to <2 x i32>, !dbg !23 + %259 = shl <2 x i32> %258, splat (i32 23), !dbg !23 + %260 = bitcast <2 x i32> %259 to <2 x float>, !dbg !23 + %261 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i151) #6, !dbg !23 + %262 = insertelement <2 x float> poison, float %250, i64 0, !dbg !23 + %263 = insertelement <2 x float> %262, float %261, i64 1, !dbg !23 + %264 = fmul <2 x float> %263, %260, !dbg !23 + %265 = select <2 x i1> %175, <2 x float> splat (float 1.000000e+00), <2 x float> %264, !dbg !32 + %266 = fmul <2 x float> %15, %220, !dbg !33 + %267 = fadd <2 x float> %266, %265, !dbg !34 + %268 = insertelement <2 x i1> poison, i1 %25, i64 0, !dbg !35 + %269 = shufflevector <2 x i1> %268, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !35 + %270 = select <2 x i1> %269, <2 x float> %77, <2 x float> %18, !dbg !35 + %271 = select <2 x i1> %269, <2 x float> %174, <2 x float> %16, !dbg !35 + %272 = select <2 x i1> %269, <2 x float> %170, <2 x float> %17, !dbg !36 + %273 = select <2 x i1> %269, <2 x float> %267, <2 x float> %15, !dbg !36 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !12 + %274 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !12 + br i1 %274, label %14, label %275, !dbg !12 + +275: ; preds = %14 + %276 = and i32 %9, 31, !dbg !10 + %277 = lshr i32 %9, 5, !dbg !10 + %278 = extractelement <2 x float> %270, i64 0, !dbg !37 + %279 = extractelement <2 x float> %270, i64 1, !dbg !37 + %280 = fcmp ogt float %278, %279, !dbg !37 + %281 = fcmp uno float %278, 0.000000e+00, !dbg !39 + %282 = or i1 %280, %281, !dbg !40 + %283 = select i1 %282, float %278, float %279, !dbg !41 + %284 = extractelement <2 x float> %271, i64 0, !dbg !37 + %285 = fcmp ogt float %283, %284, !dbg !37 + %286 = fcmp uno float %283, 0.000000e+00, !dbg !39 + %287 = or i1 %285, %286, !dbg !40 + %288 = select i1 %287, float %283, float %284, !dbg !41 + %289 = extractelement <2 x float> %271, i64 1, !dbg !37 + %290 = fcmp ogt float %288, %289, !dbg !37 + %291 = fcmp uno float %288, 0.000000e+00, !dbg !39 + %292 = or i1 %290, %291, !dbg !40 + %293 = select i1 %292, float %288, float %289, !dbg !41 + %294 = bitcast float %293 to i32, !dbg !42 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 16, i32 31), !dbg !42 + %296 = bitcast i32 %295 to float, !dbg !42 + %297 = fcmp ogt float %293, %296, !dbg !37 + %298 = fcmp uno float %293, 0.000000e+00, !dbg !39 + %299 = or i1 %298, %297, !dbg !40 + %300 = select i1 %299, float %293, float %296, !dbg !41 + %301 = bitcast float %300 to i32, !dbg !42 + %302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 8, i32 31), !dbg !42 + %303 = bitcast i32 %302 to float, !dbg !42 + %304 = fcmp ogt float %300, %303, !dbg !37 + %305 = fcmp uno float %300, 0.000000e+00, !dbg !39 + %306 = or i1 %304, %305, !dbg !40 + %307 = select i1 %306, float %300, float %303, !dbg !41 + %308 = bitcast float %307 to i32, !dbg !42 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 4, i32 31), !dbg !42 + %310 = bitcast i32 %309 to float, !dbg !42 + %311 = fcmp ogt float %307, %310, !dbg !37 + %312 = fcmp uno float %307, 0.000000e+00, !dbg !39 + %313 = or i1 %311, %312, !dbg !40 + %314 = select i1 %313, float %307, float %310, !dbg !41 + %315 = bitcast float %314 to i32, !dbg !42 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 2, i32 31), !dbg !42 + %317 = bitcast i32 %316 to float, !dbg !42 + %318 = fcmp ogt float %314, %317, !dbg !37 + %319 = fcmp uno float %314, 0.000000e+00, !dbg !39 + %320 = or i1 %318, %319, !dbg !40 + %321 = select i1 %320, float %314, float %317, !dbg !41 + %322 = bitcast float %321 to i32, !dbg !42 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !42 + %324 = bitcast i32 %323 to float, !dbg !42 + %325 = fcmp ogt float %321, %324, !dbg !37 + %326 = fcmp uno float %321, 0.000000e+00, !dbg !39 + %327 = or i1 %325, %326, !dbg !40 + %328 = and i32 %277, 15, !dbg !42 + %329 = icmp eq i32 %276, 0, !dbg !42 + %330 = getelementptr float, ptr addrspace(3) @global_smem, i32 %328, !dbg !42 + %331 = select i1 %327, i32 %322, i32 %323, !dbg !41 + %332 = insertelement <1 x i32> poison, i32 %331, i64 0, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, <1 x i32> %332, i1 %329) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %333 = icmp samesign ult i32 %9, 16, !dbg !42 + %334 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !42 + %335 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %334, i1 %333) #6, !dbg !42 + %336 = bitcast i32 %335 to float, !dbg !42 + %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %335, i32 8, i32 31), !dbg !42 + %338 = bitcast i32 %337 to float, !dbg !42 + %339 = fcmp ogt float %336, %338, !dbg !37 + %340 = fcmp uno float %336, 0.000000e+00, !dbg !39 + %341 = or i1 %340, %339, !dbg !40 + %342 = select i1 %341, float %336, float %338, !dbg !41 + %343 = bitcast float %342 to i32, !dbg !42 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %343, i32 4, i32 31), !dbg !42 + %345 = bitcast i32 %344 to float, !dbg !42 + %346 = fcmp ogt float %342, %345, !dbg !37 + %347 = fcmp uno float %342, 0.000000e+00, !dbg !39 + %348 = or i1 %346, %347, !dbg !40 + %349 = select i1 %348, float %342, float %345, !dbg !41 + %350 = bitcast float %349 to i32, !dbg !42 + %351 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 2, i32 31), !dbg !42 + %352 = bitcast i32 %351 to float, !dbg !42 + %353 = fcmp ogt float %349, %352, !dbg !37 + %354 = fcmp uno float %349, 0.000000e+00, !dbg !39 + %355 = or i1 %353, %354, !dbg !40 + %356 = select i1 %355, float %349, float %352, !dbg !41 + %357 = bitcast float %356 to i32, !dbg !42 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 1, i32 31), !dbg !42 + %359 = bitcast i32 %358 to float, !dbg !42 + %360 = fcmp ogt float %356, %359, !dbg !37 + %361 = fcmp uno float %356, 0.000000e+00, !dbg !39 + %362 = or i1 %360, %361, !dbg !40 + %363 = icmp eq i32 %9, 0, !dbg !42 + %364 = select i1 %362, i32 %357, i32 %358, !dbg !41 + %365 = insertelement <1 x i32> poison, i32 %364, i64 0, !dbg !42 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %334, <1 x i32> %365, i1 %363) #6, !dbg !42 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !42 + %366 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !42 + %367 = fcmp oeq float %366, 0xFFF0000000000000, !dbg !43 + %368 = fsub float %278, %366, !dbg !44 + %369 = fsub float %279, %366, !dbg !44 + %370 = fsub float %284, %366, !dbg !44 + %371 = fsub float %289, %366, !dbg !44 + %372 = select i1 %367, float 0.000000e+00, float %368, !dbg !45 + %373 = select i1 %367, float 0.000000e+00, float %369, !dbg !45 + %374 = select i1 %367, float 0.000000e+00, float %370, !dbg !45 + %375 = select i1 %367, float 0.000000e+00, float %371, !dbg !45 + %376 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i = icmp eq i32 %376, 0, !dbg !46 + %377 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %378 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %.02.i = select i1 %.not.i, float %378, float %377, !dbg !46 + %379 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not1.i = icmp eq i32 %379, 0, !dbg !46 + %380 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !46 + %381 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !46 + %.03.i = select i1 %.not1.i, float %381, float %380, !dbg !46 + %382 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %383 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %384 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not3.i = icmp eq i32 %385, 0, !dbg !46 + %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not4.i = icmp eq i32 %386, 0, !dbg !46 + %387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i2 = icmp eq i32 %387, 0, !dbg !46 + %388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %389 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %.02.i3 = select i1 %.not.i2, float %389, float %388, !dbg !46 + %390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not1.i4 = icmp eq i32 %390, 0, !dbg !46 + %391 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i3) #6, !dbg !46 + %392 = tail call float @llvm.nvvm.saturate.f(float %.02.i3) #6, !dbg !46 + %.03.i5 = select i1 %.not1.i4, float %392, float %391, !dbg !46 + %393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %394 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %395 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i5, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not3.i8 = icmp eq i32 %396, 0, !dbg !46 + %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not4.i10 = icmp eq i32 %397, 0, !dbg !46 + %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i12 = icmp eq i32 %398, 0, !dbg !46 + %399 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %400 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %.02.i13 = select i1 %.not.i12, float %400, float %399, !dbg !46 + %401 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not1.i14 = icmp eq i32 %401, 0, !dbg !46 + %402 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i13) #6, !dbg !46 + %403 = tail call float @llvm.nvvm.saturate.f(float %.02.i13) #6, !dbg !46 + %.03.i15 = select i1 %.not1.i14, float %403, float %402, !dbg !46 + %404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %405 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %406 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i15, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not3.i18 = icmp eq i32 %407, 0, !dbg !46 + %408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not4.i20 = icmp eq i32 %408, 0, !dbg !46 + %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i22 = icmp eq i32 %409, 0, !dbg !46 + %410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %411 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !46 + %.02.i23 = select i1 %.not.i22, float %411, float %410, !dbg !46 + %412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not1.i24 = icmp eq i32 %412, 0, !dbg !46 + %413 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i23) #6, !dbg !46 + %414 = tail call float @llvm.nvvm.saturate.f(float %.02.i23) #6, !dbg !46 + %.03.i25 = select i1 %.not1.i24, float %414, float %413, !dbg !46 + %415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %416 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %417 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i25, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !46 + %418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not3.i28 = icmp eq i32 %418, 0, !dbg !46 + %419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not4.i30 = icmp eq i32 %419, 0, !dbg !46 + %420 = insertelement <2 x i32> poison, i32 %382, i64 0, !dbg !46 + %421 = insertelement <2 x i32> %420, i32 %393, i64 1, !dbg !46 + %422 = icmp eq <2 x i32> %421, zeroinitializer, !dbg !46 + %423 = insertelement <2 x float> poison, float %384, i64 0, !dbg !46 + %424 = insertelement <2 x float> %423, float %395, i64 1, !dbg !46 + %425 = insertelement <2 x float> poison, float %383, i64 0, !dbg !46 + %426 = insertelement <2 x float> %425, float %394, i64 1, !dbg !46 + %427 = select <2 x i1> %422, <2 x float> %424, <2 x float> %426, !dbg !46 + %428 = extractelement <2 x float> %427, i64 0, !dbg !46 + %429 = fadd float %428, 0xC168000FE0000000, !dbg !46 + %430 = fneg float %429, !dbg !46 + %431 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %430) #6, !dbg !46 + %432 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %430) #6, !dbg !46 + %.0.i = select i1 %.not3.i, float %432, float %431, !dbg !46 + %433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !46 + %434 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !46 + %.01.i = select i1 %.not4.i, float %434, float %433, !dbg !46 + %435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !46 + %436 = extractelement <2 x float> %427, i64 1, !dbg !46 + %437 = fadd float %436, 0xC168000FE0000000, !dbg !46 + %438 = fneg float %437, !dbg !46 + %439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %438) #6, !dbg !46 + %440 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %438) #6, !dbg !46 + %.0.i9 = select i1 %.not3.i8, float %440, float %439, !dbg !46 + %441 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !46 + %442 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i9) #6, !dbg !46 + %.01.i11 = select i1 %.not4.i10, float %442, float %441, !dbg !46 + %443 = bitcast <2 x float> %427 to <2 x i32>, !dbg !46 + %444 = shl <2 x i32> %443, splat (i32 23), !dbg !46 + %445 = bitcast <2 x i32> %444 to <2 x float>, !dbg !46 + %446 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i11) #6, !dbg !46 + %447 = insertelement <2 x float> poison, float %435, i64 0, !dbg !46 + %448 = insertelement <2 x float> %447, float %446, i64 1, !dbg !46 + %449 = fmul <2 x float> %448, %445, !dbg !46 + %450 = fmul <2 x float> %272, %449, !dbg !47 + %451 = insertelement <2 x i32> poison, i32 %404, i64 0, !dbg !46 + %452 = insertelement <2 x i32> %451, i32 %415, i64 1, !dbg !46 + %453 = icmp eq <2 x i32> %452, zeroinitializer, !dbg !46 + %454 = insertelement <2 x float> poison, float %406, i64 0, !dbg !46 + %455 = insertelement <2 x float> %454, float %417, i64 1, !dbg !46 + %456 = insertelement <2 x float> poison, float %405, i64 0, !dbg !46 + %457 = insertelement <2 x float> %456, float %416, i64 1, !dbg !46 + %458 = select <2 x i1> %453, <2 x float> %455, <2 x float> %457, !dbg !46 + %459 = extractelement <2 x float> %458, i64 0, !dbg !46 + %460 = fadd float %459, 0xC168000FE0000000, !dbg !46 + %461 = fneg float %460, !dbg !46 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3FF7154760000000, float %461) #6, !dbg !46 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3FF7154760000000, float %461) #6, !dbg !46 + %.0.i19 = select i1 %.not3.i18, float %463, float %462, !dbg !46 + %464 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !46 + %465 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3E54AE0C00000000, float %.0.i19) #6, !dbg !46 + %.01.i21 = select i1 %.not4.i20, float %465, float %464, !dbg !46 + %466 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i21) #6, !dbg !46 + %467 = extractelement <2 x float> %458, i64 1, !dbg !46 + %468 = fadd float %467, 0xC168000FE0000000, !dbg !46 + %469 = fneg float %468, !dbg !46 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3FF7154760000000, float %469) #6, !dbg !46 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3FF7154760000000, float %469) #6, !dbg !46 + %.0.i29 = select i1 %.not3.i28, float %471, float %470, !dbg !46 + %472 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !46 + %473 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3E54AE0C00000000, float %.0.i29) #6, !dbg !46 + %.01.i31 = select i1 %.not4.i30, float %473, float %472, !dbg !46 + %474 = bitcast <2 x float> %458 to <2 x i32>, !dbg !46 + %475 = shl <2 x i32> %474, splat (i32 23), !dbg !46 + %476 = bitcast <2 x i32> %475 to <2 x float>, !dbg !46 + %477 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i31) #6, !dbg !46 + %478 = insertelement <2 x float> poison, float %466, i64 0, !dbg !46 + %479 = insertelement <2 x float> %478, float %477, i64 1, !dbg !46 + %480 = fmul <2 x float> %479, %476, !dbg !46 + %481 = fmul <2 x float> %273, %480, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %shift = shufflevector <2 x float> %450, <2 x float> poison, <2 x i32> , !dbg !51 + %foldExtExtBinop191 = fadd <2 x float> %450, %shift, !dbg !51 + %foldExtExtBinop193 = fadd <2 x float> %foldExtExtBinop191, %481, !dbg !51 + %shift195 = shufflevector <2 x float> %481, <2 x float> poison, <2 x i32> , !dbg !51 + %foldExtExtBinop196 = fadd <2 x float> %foldExtExtBinop193, %shift195, !dbg !51 + %482 = extractelement <2 x float> %foldExtExtBinop196, i64 0, !dbg !51 + %483 = bitcast float %482 to i32, !dbg !48 + %484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 16, i32 31), !dbg !48 + %485 = bitcast i32 %484 to float, !dbg !48 + %486 = fadd float %482, %485, !dbg !51 + %487 = bitcast float %486 to i32, !dbg !48 + %488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 8, i32 31), !dbg !48 + %489 = bitcast i32 %488 to float, !dbg !48 + %490 = fadd float %486, %489, !dbg !51 + %491 = bitcast float %490 to i32, !dbg !48 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 4, i32 31), !dbg !48 + %493 = bitcast i32 %492 to float, !dbg !48 + %494 = fadd float %490, %493, !dbg !51 + %495 = bitcast float %494 to i32, !dbg !48 + %496 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %495, i32 2, i32 31), !dbg !48 + %497 = bitcast i32 %496 to float, !dbg !48 + %498 = fadd float %494, %497, !dbg !51 + %499 = bitcast float %498 to i32, !dbg !48 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 1, i32 31), !dbg !48 + %501 = bitcast i32 %500 to float, !dbg !48 + %502 = fadd float %498, %501, !dbg !51 + %503 = bitcast float %502 to <1 x i32>, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, <1 x i32> %503, i1 %329) #6, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %504 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %334, i1 %333) #6, !dbg !48 + %505 = bitcast i32 %504 to float, !dbg !48 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 8, i32 31), !dbg !48 + %507 = bitcast i32 %506 to float, !dbg !48 + %508 = fadd float %505, %507, !dbg !51 + %509 = bitcast float %508 to i32, !dbg !48 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 4, i32 31), !dbg !48 + %511 = bitcast i32 %510 to float, !dbg !48 + %512 = fadd float %508, %511, !dbg !51 + %513 = bitcast float %512 to i32, !dbg !48 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 2, i32 31), !dbg !48 + %515 = bitcast i32 %514 to float, !dbg !48 + %516 = fadd float %512, %515, !dbg !51 + %517 = bitcast float %516 to i32, !dbg !48 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 1, i32 31), !dbg !48 + %519 = bitcast i32 %518 to float, !dbg !48 + %520 = fadd float %516, %519, !dbg !51 + %521 = bitcast float %520 to <1 x i32>, !dbg !48 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %334, <1 x i32> %521, i1 %363) #6, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %522 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !48 + br label %523, !dbg !52 + +523: ; preds = %275, %523 + %indvars.iv160 = phi i64 [ 0, %275 ], [ %indvars.iv.next161, %523 ] + %524 = or disjoint i64 %indvars.iv160, %13, !dbg !53 + %525 = icmp samesign ult i64 %524, 32000, !dbg !54 + %526 = trunc nuw nsw i64 %524 to i32, !dbg !55 + %527 = add i32 %12, %526, !dbg !55 + %528 = sext i32 %527 to i64, !dbg !56 + %529 = getelementptr bfloat, ptr addrspace(1) %0, i64 %528, !dbg !56 + %530 = and i1 %8, %525, !dbg !57 + %531 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !58 + %532 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %529, i64 %531, i1 %530) #6, !dbg !58 + %533 = extractvalue { i32, i32 } %532, 0, !dbg !58 + %534 = bitcast i32 %533 to <2 x bfloat>, !dbg !58 + %535 = extractvalue { i32, i32 } %532, 1, !dbg !58 + %536 = bitcast i32 %535 to <2 x bfloat>, !dbg !58 + %537 = extractelement <2 x bfloat> %534, i64 0, !dbg !58 + %538 = extractelement <2 x bfloat> %534, i64 1, !dbg !58 + %539 = extractelement <2 x bfloat> %536, i64 0, !dbg !58 + %540 = extractelement <2 x bfloat> %536, i64 1, !dbg !58 + %541 = fpext bfloat %537 to float, !dbg !59 + %542 = fpext bfloat %538 to float, !dbg !59 + %543 = fpext bfloat %539 to float, !dbg !59 + %544 = fpext bfloat %540 to float, !dbg !59 + %545 = fsub float %541, %366, !dbg !60 + %546 = fsub float %542, %366, !dbg !60 + %547 = fsub float %543, %366, !dbg !60 + %548 = fsub float %544, %366, !dbg !60 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i32 = icmp eq i32 %549, 0, !dbg !61 + %550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %551 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %.02.i33 = select i1 %.not.i32, float %551, float %550, !dbg !61 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not1.i34 = icmp eq i32 %552, 0, !dbg !61 + %553 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i33) #6, !dbg !61 + %554 = tail call float @llvm.nvvm.saturate.f(float %.02.i33) #6, !dbg !61 + %.03.i35 = select i1 %.not1.i34, float %554, float %553, !dbg !61 + %555 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not2.i36 = icmp eq i32 %555, 0, !dbg !61 + %556 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %557 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i35, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %.04.i37 = select i1 %.not2.i36, float %557, float %556, !dbg !61 + %558 = fadd float %.04.i37, 0xC168000FE0000000, !dbg !61 + %559 = fneg float %558, !dbg !61 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not3.i38 = icmp eq i32 %560, 0, !dbg !61 + %561 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3FF7154760000000, float %559) #6, !dbg !61 + %562 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3FF7154760000000, float %559) #6, !dbg !61 + %.0.i39 = select i1 %.not3.i38, float %562, float %561, !dbg !61 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not4.i40 = icmp eq i32 %563, 0, !dbg !61 + %564 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %545, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !61 + %565 = tail call float @llvm.nvvm.fma.rn.f(float %545, float 0x3E54AE0C00000000, float %.0.i39) #6, !dbg !61 + %.01.i41 = select i1 %.not4.i40, float %565, float %564, !dbg !61 + %566 = bitcast float %.04.i37 to i32, !dbg !61 + %567 = shl i32 %566, 23, !dbg !61 + %568 = bitcast i32 %567 to float, !dbg !61 + %569 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i41) #6, !dbg !61 + %570 = fmul float %569, %568, !dbg !61 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i42 = icmp eq i32 %571, 0, !dbg !61 + %572 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %546, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %573 = tail call float @llvm.nvvm.fma.rn.f(float %546, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %.02.i43 = select i1 %.not.i42, float %573, float %572, !dbg !61 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not1.i44 = icmp eq i32 %574, 0, !dbg !61 + %575 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i43) #6, !dbg !61 + %576 = tail call float @llvm.nvvm.saturate.f(float %.02.i43) #6, !dbg !61 + %.03.i45 = select i1 %.not1.i44, float %576, float %575, !dbg !61 + %577 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not2.i46 = icmp eq i32 %577, 0, !dbg !61 + %578 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %579 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i45, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %.04.i47 = select i1 %.not2.i46, float %579, float %578, !dbg !61 + %580 = fadd float %.04.i47, 0xC168000FE0000000, !dbg !61 + %581 = fneg float %580, !dbg !61 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not3.i48 = icmp eq i32 %582, 0, !dbg !61 + %583 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %546, float 0x3FF7154760000000, float %581) #6, !dbg !61 + %584 = tail call float @llvm.nvvm.fma.rn.f(float %546, float 0x3FF7154760000000, float %581) #6, !dbg !61 + %.0.i49 = select i1 %.not3.i48, float %584, float %583, !dbg !61 + %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not4.i50 = icmp eq i32 %585, 0, !dbg !61 + %586 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %546, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !61 + %587 = tail call float @llvm.nvvm.fma.rn.f(float %546, float 0x3E54AE0C00000000, float %.0.i49) #6, !dbg !61 + %.01.i51 = select i1 %.not4.i50, float %587, float %586, !dbg !61 + %588 = bitcast float %.04.i47 to i32, !dbg !61 + %589 = shl i32 %588, 23, !dbg !61 + %590 = bitcast i32 %589 to float, !dbg !61 + %591 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i51) #6, !dbg !61 + %592 = fmul float %591, %590, !dbg !61 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i52 = icmp eq i32 %593, 0, !dbg !61 + %594 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %547, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %595 = tail call float @llvm.nvvm.fma.rn.f(float %547, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %.02.i53 = select i1 %.not.i52, float %595, float %594, !dbg !61 + %596 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not1.i54 = icmp eq i32 %596, 0, !dbg !61 + %597 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i53) #6, !dbg !61 + %598 = tail call float @llvm.nvvm.saturate.f(float %.02.i53) #6, !dbg !61 + %.03.i55 = select i1 %.not1.i54, float %598, float %597, !dbg !61 + %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not2.i56 = icmp eq i32 %599, 0, !dbg !61 + %600 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %601 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i55, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %.04.i57 = select i1 %.not2.i56, float %601, float %600, !dbg !61 + %602 = fadd float %.04.i57, 0xC168000FE0000000, !dbg !61 + %603 = fneg float %602, !dbg !61 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not3.i58 = icmp eq i32 %604, 0, !dbg !61 + %605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %547, float 0x3FF7154760000000, float %603) #6, !dbg !61 + %606 = tail call float @llvm.nvvm.fma.rn.f(float %547, float 0x3FF7154760000000, float %603) #6, !dbg !61 + %.0.i59 = select i1 %.not3.i58, float %606, float %605, !dbg !61 + %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not4.i60 = icmp eq i32 %607, 0, !dbg !61 + %608 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %547, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !61 + %609 = tail call float @llvm.nvvm.fma.rn.f(float %547, float 0x3E54AE0C00000000, float %.0.i59) #6, !dbg !61 + %.01.i61 = select i1 %.not4.i60, float %609, float %608, !dbg !61 + %610 = bitcast float %.04.i57 to i32, !dbg !61 + %611 = shl i32 %610, 23, !dbg !61 + %612 = bitcast i32 %611 to float, !dbg !61 + %613 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i61) #6, !dbg !61 + %614 = fmul float %613, %612, !dbg !61 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not.i62 = icmp eq i32 %615, 0, !dbg !61 + %616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %548, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %617 = tail call float @llvm.nvvm.fma.rn.f(float %548, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !61 + %.02.i63 = select i1 %.not.i62, float %617, float %616, !dbg !61 + %618 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not1.i64 = icmp eq i32 %618, 0, !dbg !61 + %619 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i63) #6, !dbg !61 + %620 = tail call float @llvm.nvvm.saturate.f(float %.02.i63) #6, !dbg !61 + %.03.i65 = select i1 %.not1.i64, float %620, float %619, !dbg !61 + %621 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not2.i66 = icmp eq i32 %621, 0, !dbg !61 + %622 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %623 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i65, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !61 + %.04.i67 = select i1 %.not2.i66, float %623, float %622, !dbg !61 + %624 = fadd float %.04.i67, 0xC168000FE0000000, !dbg !61 + %625 = fneg float %624, !dbg !61 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not3.i68 = icmp eq i32 %626, 0, !dbg !61 + %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %548, float 0x3FF7154760000000, float %625) #6, !dbg !61 + %628 = tail call float @llvm.nvvm.fma.rn.f(float %548, float 0x3FF7154760000000, float %625) #6, !dbg !61 + %.0.i69 = select i1 %.not3.i68, float %628, float %627, !dbg !61 + %629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !61 + %.not4.i70 = icmp eq i32 %629, 0, !dbg !61 + %630 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %548, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !61 + %631 = tail call float @llvm.nvvm.fma.rn.f(float %548, float 0x3E54AE0C00000000, float %.0.i69) #6, !dbg !61 + %.01.i71 = select i1 %.not4.i70, float %631, float %630, !dbg !61 + %632 = bitcast float %.04.i67 to i32, !dbg !61 + %633 = shl i32 %632, 23, !dbg !61 + %634 = bitcast i32 %633 to float, !dbg !61 + %635 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i71) #6, !dbg !61 + %636 = fmul float %635, %634, !dbg !61 + %637 = tail call float @llvm.nvvm.div.full(float %570, float %522), !dbg !62 + %638 = tail call float @llvm.nvvm.div.full(float %592, float %522), !dbg !62 + %639 = tail call float @llvm.nvvm.div.full(float %614, float %522), !dbg !62 + %640 = tail call float @llvm.nvvm.div.full(float %636, float %522), !dbg !62 + %641 = getelementptr float, ptr addrspace(1) %1, i64 %528, !dbg !63 + %642 = bitcast float %637 to i32, !dbg !64 + %643 = bitcast float %638 to i32, !dbg !64 + %644 = bitcast float %639 to i32, !dbg !64 + %645 = bitcast float %640 to i32, !dbg !64 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %642, i32 %643, i32 %644, i32 %645, ptr addrspace(1) %641, i1 %530) #6, !dbg !64 + %indvars.iv.next161 = add nuw nsw i64 %indvars.iv160, 2048, !dbg !52 + %646 = icmp samesign ult i64 %indvars.iv160, 29952, !dbg !52 + br i1 %646, label %523, label %647, !dbg !52 + +647: ; preds = %523 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 28, scope: !5) +!9 = !DILocation(line: 24, column: 21, scope: !5) +!10 = !DILocation(line: 25, column: 37, scope: !5) +!11 = !DILocation(line: 36, column: 47, scope: !5) +!12 = !DILocation(line: 30, column: 40, scope: !5) +!13 = !DILocation(line: 31, column: 31, scope: !5) +!14 = !DILocation(line: 32, column: 29, scope: !5) +!15 = !DILocation(line: 36, column: 41, scope: !5) +!16 = !DILocation(line: 36, column: 34, scope: !5) +!17 = !DILocation(line: 36, column: 62, scope: !5) +!18 = !DILocation(line: 36, column: 52, scope: !5) +!19 = !DILocation(line: 112, column: 21, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !5, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 41, column: 40, scope: !5) +!23 = !DILocation(line: 173, column: 29, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 36, column: 113, scope: !5) +!25 = !DILocation(line: 110, column: 15, scope: !20, inlinedAt: !22) +!26 = !DILocation(line: 112, column: 16, scope: !20, inlinedAt: !22) +!27 = !DILocation(line: 113, column: 29, scope: !20, inlinedAt: !22) +!28 = !DILocation(line: 196, column: 19, scope: !20, inlinedAt: !22) +!29 = !DILocation(line: 196, column: 53, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 196, column: 39, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 199, column: 53, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 199, column: 39, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 205, column: 24, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 205, column: 36, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 44, column: 62, scope: !5) +!36 = !DILocation(line: 45, column: 62, scope: !5) +!37 = !DILocation(line: 110, column: 15, scope: !20, inlinedAt: !38) +!38 = !DILocation(line: 48, column: 33, scope: !5) +!39 = !DILocation(line: 112, column: 21, scope: !20, inlinedAt: !38) +!40 = !DILocation(line: 112, column: 16, scope: !20, inlinedAt: !38) +!41 = !DILocation(line: 113, column: 29, scope: !20, inlinedAt: !38) +!42 = !DILocation(line: 123, column: 29, scope: !20, inlinedAt: !38) +!43 = !DILocation(line: 180, column: 40, scope: !20, inlinedAt: !38) +!44 = !DILocation(line: 180, column: 68, scope: !20, inlinedAt: !38) +!45 = !DILocation(line: 180, column: 58, scope: !20, inlinedAt: !38) +!46 = !DILocation(line: 173, column: 29, scope: !20, inlinedAt: !38) +!47 = !DILocation(line: 181, column: 31, scope: !20, inlinedAt: !38) +!48 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !38) +!49 = distinct !DILexicalBlockFile(scope: !5, file: !50, discriminator: 0) +!50 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!51 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !38) +!52 = !DILocation(line: 51, column: 40, scope: !5) +!53 = !DILocation(line: 52, column: 31, scope: !5) +!54 = !DILocation(line: 53, column: 29, scope: !5) +!55 = !DILocation(line: 57, column: 41, scope: !5) +!56 = !DILocation(line: 57, column: 34, scope: !5) +!57 = !DILocation(line: 57, column: 62, scope: !5) +!58 = !DILocation(line: 57, column: 52, scope: !5) +!59 = !DILocation(line: 57, column: 114, scope: !5) +!60 = !DILocation(line: 59, column: 22, scope: !5) +!61 = !DILocation(line: 60, column: 29, scope: !5) +!62 = !DILocation(line: 61, column: 23, scope: !5) +!63 = !DILocation(line: 62, column: 29, scope: !5) +!64 = !DILocation(line: 62, column: 53, scope: !5) +!65 = !DILocation(line: 51, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..34c174b8c4686449044beaecdcdd5ec93677a2af --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,927 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<53>; + .reg .b16 %rs<9>; + .reg .b32 %r<355>; + .reg .b64 %rd<68>; + .loc 1 18 0 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:18:0 + +// %bb.0: + ld.param.b32 %r5, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2]; + ld.param.b64 %rd16, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd15, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 22 28 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:22:28 + mov.u32 %r1, %ctaid.x; + .loc 1 25 37 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:25:37 + mov.u32 %r2, %tid.x; + shl.b32 %r6, %r2, 2; + and.b32 %r7, %r6, 2044; + .loc 1 30 40 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:30:40 + cvt.u64.u32 %rd1, %r7; + mad.lo.s32 %r8, %r1, 32000, %r7; + cvt.u64.u32 %rd2, %r8; + mov.b32 %r9, 0fFF800000; + mov.b64 %rd64, {%r9, %r9}; + mov.b32 %r10, 0f00000000; + mov.b64 %rd63, {%r10, %r10}; + mov.b64 %rd62, 0; + setp.lt.s32 %p2, %r1, %r5; + mov.b64 %rd65, %rd63; + mov.b64 %rd66, %rd64; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 32 29 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:32:29 + add.s64 %rd23, %rd1, %rd62; + setp.lt.u64 %p3, %rd23, 32000; + .loc 1 36 34 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:36:34 + add.s64 %rd24, %rd2, %rd62; + cvt.u32.u64 %r15, %rd24; + mad.wide.s32 %rd21, %r15, 2, %rd15; + .loc 1 36 62 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:36:62 + and.pred %p1, %p2, %p3; + .loc 1 36 52 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:36:52 + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd20, 1.0; + // end inline asm + mov.b32 %r13, 0; + // begin inline asm + mov.u32 %r11, %r13; + mov.u32 %r12, %r13; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v2.b32 { %r11, %r12 }, [ %rd21 + 0 ], %rd20; + // end inline asm +$L__tmp1: + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + mov.b64 {%r16, %r17}, %rd66; + setp.nan.f32 %p4, %r16, %r16; + setp.nan.f32 %p5, %r17, %r17; + mov.b64 {%r18, %r19}, %rd64; + setp.nan.f32 %p6, %r18, %r18; + setp.nan.f32 %p7, %r19, %r19; +$L__tmp2: + .loc 1 36 113 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:36:113 + mov.b32 {%rs1, %rs2}, %r11; + cvt.f32.bf16 %r20, %rs2; + cvt.f32.bf16 %r21, %rs1; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + setp.gt.f32 %p8, %r16, %r21; + setp.gt.f32 %p9, %r17, %r20; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r22, %r17, %r20, %p9; + selp.f32 %r23, %r17, %r22, %p5; + selp.f32 %r24, %r16, %r21, %p8; + selp.f32 %r25, %r16, %r24, %p4; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + setp.eq.f32 %p10, %r25, 0fFF800000; + setp.eq.f32 %p11, %r23, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + sub.f32 %r26, %r17, %r23; + sub.f32 %r27, %r16, %r25; + mov.b32 %r28, 0f3F000000; + mov.b32 %r29, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + fma.rn.ftz.f32 %r30, %r27, %r29, %r28; + cvt.ftz.sat.f32.f32 %r31, %r30; + mov.b32 %r32, 0f4B400001; + mov.b32 %r33, 0f437C0000; + fma.rm.ftz.f32 %r34, %r31, %r33, %r32; + fma.rn.ftz.f32 %r35, %r26, %r29, %r28; + cvt.ftz.sat.f32.f32 %r36, %r35; + fma.rm.ftz.f32 %r37, %r36, %r33, %r32; + mov.b64 %rd25, {%r34, %r37}; + cvt.u32.u64 %r38, %rd25; + add.f32 %r39, %r34, 0fCB40007F; + neg.f32 %r40, %r39; + mov.b32 %r41, 0f3FB8AA3B; + fma.rn.ftz.f32 %r42, %r27, %r41, %r40; + mov.b32 %r43, 0f32A57060; + fma.rn.ftz.f32 %r44, %r27, %r43, %r42; + ex2.approx.ftz.f32 %r45, %r44; + add.f32 %r46, %r37, 0fCB40007F; + neg.f32 %r47, %r46; + fma.rn.ftz.f32 %r48, %r26, %r41, %r47; + fma.rn.ftz.f32 %r49, %r26, %r43, %r48; + shl.b64 %rd26, %rd25, 23; + and.b64 %rd27, %rd26, -36028797018963968; + shl.b32 %r50, %r38, 23; + cvt.u64.u32 %rd28, %r50; + or.b64 %rd29, %rd28, %rd27; + ex2.approx.ftz.f32 %r51, %r49; + mov.b64 {%r52, %r53}, %rd29; + mul.f32 %r54, %r45, %r52; + mul.f32 %r55, %r51, %r53; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r56, 0f3F800000, %r55, %p11; + selp.f32 %r57, 0f3F800000, %r54, %p10; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + sub.f32 %r58, %r20, %r23; + sub.f32 %r59, %r21, %r25; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + fma.rn.ftz.f32 %r60, %r59, %r29, %r28; + cvt.ftz.sat.f32.f32 %r61, %r60; + fma.rm.ftz.f32 %r62, %r61, %r33, %r32; + fma.rn.ftz.f32 %r63, %r58, %r29, %r28; + cvt.ftz.sat.f32.f32 %r64, %r63; + fma.rm.ftz.f32 %r65, %r64, %r33, %r32; + mov.b64 %rd30, {%r62, %r65}; + cvt.u32.u64 %r66, %rd30; + add.f32 %r67, %r62, 0fCB40007F; + neg.f32 %r68, %r67; + fma.rn.ftz.f32 %r69, %r59, %r41, %r68; + fma.rn.ftz.f32 %r70, %r59, %r43, %r69; + ex2.approx.ftz.f32 %r71, %r70; + add.f32 %r72, %r65, 0fCB40007F; + neg.f32 %r73, %r72; + fma.rn.ftz.f32 %r74, %r58, %r41, %r73; + fma.rn.ftz.f32 %r75, %r58, %r43, %r74; + shl.b64 %rd31, %rd30, 23; + and.b64 %rd32, %rd31, -36028797018963968; + shl.b32 %r76, %r66, 23; + cvt.u64.u32 %rd33, %r76; + or.b64 %rd34, %rd33, %rd32; + ex2.approx.ftz.f32 %r77, %r75; + mov.b64 {%r78, %r79}, %rd34; + mul.f32 %r80, %r71, %r78; + mul.f32 %r81, %r77, %r79; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r82, 0f3F800000, %r81, %p11; + selp.f32 %r83, 0f3F800000, %r80, %p10; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + mov.b64 {%r84, %r85}, %rd65; + fma.rn.f32 %r86, %r84, %r57, %r83; + fma.rn.f32 %r87, %r85, %r56, %r82; +$L__tmp4: + .loc 1 36 113 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:36:113 + mov.b32 {%rs3, %rs4}, %r12; + cvt.f32.bf16 %r88, %rs4; + cvt.f32.bf16 %r89, %rs3; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + setp.gt.f32 %p12, %r18, %r89; + setp.gt.f32 %p13, %r19, %r88; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r90, %r19, %r88, %p13; + selp.f32 %r91, %r19, %r90, %p7; + selp.f32 %r92, %r18, %r89, %p12; + selp.f32 %r93, %r18, %r92, %p6; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + setp.eq.f32 %p14, %r93, 0fFF800000; + setp.eq.f32 %p15, %r91, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + sub.f32 %r94, %r19, %r91; + sub.f32 %r95, %r18, %r93; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + fma.rn.ftz.f32 %r96, %r95, %r29, %r28; + cvt.ftz.sat.f32.f32 %r97, %r96; + fma.rm.ftz.f32 %r98, %r97, %r33, %r32; + fma.rn.ftz.f32 %r99, %r94, %r29, %r28; + cvt.ftz.sat.f32.f32 %r100, %r99; + fma.rm.ftz.f32 %r101, %r100, %r33, %r32; + mov.b64 %rd35, {%r98, %r101}; + cvt.u32.u64 %r102, %rd35; + add.f32 %r103, %r98, 0fCB40007F; + neg.f32 %r104, %r103; + fma.rn.ftz.f32 %r105, %r95, %r41, %r104; + fma.rn.ftz.f32 %r106, %r95, %r43, %r105; + ex2.approx.ftz.f32 %r107, %r106; + add.f32 %r108, %r101, 0fCB40007F; + neg.f32 %r109, %r108; + fma.rn.ftz.f32 %r110, %r94, %r41, %r109; + fma.rn.ftz.f32 %r111, %r94, %r43, %r110; + shl.b64 %rd36, %rd35, 23; + and.b64 %rd37, %rd36, -36028797018963968; + shl.b32 %r112, %r102, 23; + cvt.u64.u32 %rd38, %r112; + or.b64 %rd39, %rd38, %rd37; + ex2.approx.ftz.f32 %r113, %r111; + mov.b64 {%r114, %r115}, %rd39; + mul.f32 %r116, %r107, %r114; + mul.f32 %r117, %r113, %r115; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r118, 0f3F800000, %r117, %p15; + selp.f32 %r119, 0f3F800000, %r116, %p14; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + sub.f32 %r120, %r88, %r91; + sub.f32 %r121, %r89, %r93; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + fma.rn.ftz.f32 %r122, %r121, %r29, %r28; + cvt.ftz.sat.f32.f32 %r123, %r122; + fma.rm.ftz.f32 %r124, %r123, %r33, %r32; + fma.rn.ftz.f32 %r125, %r120, %r29, %r28; + cvt.ftz.sat.f32.f32 %r126, %r125; + fma.rm.ftz.f32 %r127, %r126, %r33, %r32; + mov.b64 %rd40, {%r124, %r127}; + cvt.u32.u64 %r128, %rd40; + add.f32 %r129, %r124, 0fCB40007F; + neg.f32 %r130, %r129; + fma.rn.ftz.f32 %r131, %r121, %r41, %r130; + fma.rn.ftz.f32 %r132, %r121, %r43, %r131; + ex2.approx.ftz.f32 %r133, %r132; + add.f32 %r134, %r127, 0fCB40007F; + neg.f32 %r135, %r134; + fma.rn.ftz.f32 %r136, %r120, %r41, %r135; + fma.rn.ftz.f32 %r137, %r120, %r43, %r136; + shl.b64 %rd41, %rd40, 23; + and.b64 %rd42, %rd41, -36028797018963968; + shl.b32 %r138, %r128, 23; + cvt.u64.u32 %rd43, %r138; + or.b64 %rd44, %rd43, %rd42; + ex2.approx.ftz.f32 %r139, %r137; + mov.b64 {%r140, %r141}, %rd44; + mul.f32 %r142, %r133, %r140; + mul.f32 %r143, %r139, %r141; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + selp.f32 %r144, 0f3F800000, %r143, %p15; + selp.f32 %r145, 0f3F800000, %r142, %p14; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:41:40 ] + mov.b64 {%r146, %r147}, %rd63; + fma.rn.f32 %r148, %r146, %r119, %r145; + fma.rn.f32 %r149, %r147, %r118, %r144; +$L__tmp6: + .loc 1 44 62 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:44:62 + selp.f32 %r150, %r23, %r17, %p1; + selp.f32 %r151, %r25, %r16, %p1; + mov.b64 %rd66, {%r151, %r150}; + selp.f32 %r152, %r91, %r19, %p1; + selp.f32 %r153, %r93, %r18, %p1; + mov.b64 %rd64, {%r153, %r152}; + .loc 1 45 62 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:45:62 + selp.f32 %r154, %r87, %r85, %p1; + selp.f32 %r155, %r86, %r84, %p1; + mov.b64 %rd65, {%r155, %r154}; + selp.f32 %r156, %r149, %r147, %p1; + selp.f32 %r157, %r148, %r146, %p1; + mov.b64 %rd63, {%r157, %r156}; + .loc 1 30 40 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:30:40 + add.s64 %rd12, %rd62, 2048; + setp.lt.u64 %p16, %rd62, 29952; + mov.b64 %rd62, %rd12; + @%p16 bra $L__BB0_1; +// %bb.2: + .loc 1 25 37 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:25:37 + and.b32 %r170, %r2, 31; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + mov.b64 {%r171, %r172}, %rd66; + setp.gt.f32 %p23, %r171, %r172; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p24, %r171, %r171; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r173, %r171, %r172, %p24; + selp.f32 %r174, %r171, %r173, %p23; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + mov.b64 {%r175, %r176}, %rd64; + setp.gt.f32 %p25, %r174, %r175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p26, %r174, %r174; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r177, %r174, %r175, %p26; + selp.f32 %r178, %r174, %r177, %p25; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p27, %r178, %r176; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p28, %r178, %r178; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r179, %r178, %r176, %p28; + selp.f32 %r180, %r178, %r179, %p27; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r181, %r180, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p29, %r180, %r181; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p30, %r180, %r180; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r182, %r180, %r181, %p29; + selp.f32 %r183, %r180, %r182, %p30; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r184, %r183, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p31, %r183, %r184; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p32, %r183, %r183; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r185, %r183, %r184, %p32; + selp.f32 %r186, %r183, %r185, %p31; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r187, %r186, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p33, %r186, %r187; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p34, %r186, %r186; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r188, %r186, %r187, %p34; + selp.f32 %r189, %r186, %r188, %p33; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r190, %r189, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p35, %r189, %r190; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p36, %r189, %r189; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r191, %r189, %r190, %p36; + selp.f32 %r192, %r189, %r191, %p35; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r193, %r192, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p37, %r192, %r193; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p38, %r192, %r192; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.eq.b32 %p17, %r170, 0; + shr.u32 %r194, %r2, 3; + and.b32 %r195, %r194, 60; + mov.b32 %r196, global_smem; + add.s32 %r158, %r196, %r195; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.b32 %r197, %r192, %r193, %p38; + selp.b32 %r159, %r192, %r197, %p37; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r158 + 0 ], %r159; + // end inline asm + bar.sync 0; + setp.lt.u32 %p18, %r2, 16; + add.s32 %r161, %r196, %r6; + // begin inline asm + @%p18 ld.shared.b32 %r160, [ %r161 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r199, %r160, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p39, %r160, %r199; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p40, %r160, %r160; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r200, %r160, %r199, %p39; + selp.f32 %r201, %r160, %r200, %p40; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r202, %r201, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p41, %r201, %r202; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p42, %r201, %r201; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r203, %r201, %r202, %p42; + selp.f32 %r204, %r201, %r203, %p41; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r205, %r204, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p43, %r204, %r205; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p44, %r204, %r204; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r206, %r204, %r205, %p44; + selp.f32 %r207, %r204, %r206, %p43; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r208, %r207, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.gt.f32 %p45, %r207, %r208; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.nan.f32 %p46, %r207, %r207; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.eq.b32 %p19, %r2, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.b32 %r209, %r207, %r208, %p46; + selp.b32 %r163, %r207, %r209, %p45; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + // begin inline asm + @%p19 st.shared.b32 [ %r161 + 0 ], %r163; + // end inline asm + bar.sync 0; + ld.shared.b32 %r3, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + setp.eq.f32 %p47, %r3, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + sub.f32 %r210, %r171, %r3; + sub.f32 %r211, %r172, %r3; + sub.f32 %r212, %r175, %r3; + sub.f32 %r213, %r176, %r3; + .loc 2 180 58 // triton_helpers.py:180:58 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + selp.f32 %r214, 0f00000000, %r210, %p47; + selp.f32 %r215, 0f00000000, %r211, %p47; + selp.f32 %r216, 0f00000000, %r212, %p47; + selp.f32 %r217, 0f00000000, %r213, %p47; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + fma.rn.ftz.f32 %r220, %r214, %r29, %r28; + cvt.ftz.sat.f32.f32 %r221, %r220; + fma.rm.ftz.f32 %r224, %r221, %r33, %r32; + fma.rn.ftz.f32 %r225, %r215, %r29, %r28; + cvt.ftz.sat.f32.f32 %r226, %r225; + fma.rm.ftz.f32 %r227, %r226, %r33, %r32; + fma.rn.ftz.f32 %r228, %r216, %r29, %r28; + cvt.ftz.sat.f32.f32 %r229, %r228; + fma.rm.ftz.f32 %r230, %r229, %r33, %r32; + fma.rn.ftz.f32 %r231, %r217, %r29, %r28; + cvt.ftz.sat.f32.f32 %r232, %r231; + fma.rm.ftz.f32 %r233, %r232, %r33, %r32; + mov.b64 %rd46, {%r224, %r227}; + cvt.u32.u64 %r234, %rd46; + add.f32 %r235, %r224, 0fCB40007F; + neg.f32 %r236, %r235; + fma.rn.ftz.f32 %r238, %r214, %r41, %r236; + fma.rn.ftz.f32 %r240, %r214, %r43, %r238; + ex2.approx.ftz.f32 %r241, %r240; + add.f32 %r242, %r227, 0fCB40007F; + neg.f32 %r243, %r242; + fma.rn.ftz.f32 %r244, %r215, %r41, %r243; + fma.rn.ftz.f32 %r245, %r215, %r43, %r244; + shl.b64 %rd47, %rd46, 23; + and.b64 %rd48, %rd47, -36028797018963968; + shl.b32 %r246, %r234, 23; + cvt.u64.u32 %rd49, %r246; + or.b64 %rd50, %rd49, %rd48; + ex2.approx.ftz.f32 %r247, %r245; + mov.b64 {%r248, %r249}, %rd50; + mul.f32 %r250, %r241, %r248; + mul.f32 %r251, %r247, %r249; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + mov.b64 {%r252, %r253}, %rd65; + mul.f32 %r254, %r253, %r251; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + mov.b64 %rd51, {%r230, %r233}; + cvt.u32.u64 %r255, %rd51; + add.f32 %r256, %r230, 0fCB40007F; + neg.f32 %r257, %r256; + fma.rn.ftz.f32 %r258, %r216, %r41, %r257; + fma.rn.ftz.f32 %r259, %r216, %r43, %r258; + ex2.approx.ftz.f32 %r260, %r259; + add.f32 %r261, %r233, 0fCB40007F; + neg.f32 %r262, %r261; + fma.rn.ftz.f32 %r263, %r217, %r41, %r262; + fma.rn.ftz.f32 %r264, %r217, %r43, %r263; + shl.b64 %rd52, %rd51, 23; + and.b64 %rd53, %rd52, -36028797018963968; + shl.b32 %r265, %r255, 23; + cvt.u64.u32 %rd54, %r265; + or.b64 %rd55, %rd54, %rd53; + ex2.approx.ftz.f32 %r266, %r264; + mov.b64 {%r267, %r268}, %rd55; + mul.f32 %r269, %r266, %r268; + mul.f32 %r270, %r260, %r267; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + mov.b64 {%r271, %r272}, %rd63; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + fma.rn.f32 %r273, %r252, %r250, %r254; + fma.rn.f32 %r274, %r271, %r270, %r273; + fma.rn.f32 %r275, %r272, %r269, %r274; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r276, %r275, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r277, %r275, %r276; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r278, %r277, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r279, %r277, %r278; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r280, %r279, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r281, %r279, %r280; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r282, %r281, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r283, %r281, %r282; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r284, %r283, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r165, %r283, %r284; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + // begin inline asm + @%p17 st.shared.b32 [ %r158 + 0 ], %r165; + // end inline asm + bar.sync 0; + // begin inline asm + @%p18 ld.shared.b32 %r166, [ %r161 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r285, %r166, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r286, %r166, %r285; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r287, %r286, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r288, %r286, %r287; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r289, %r288, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r290, %r288, %r289; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + shfl.sync.bfly.b32 %r291, %r290, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + add.f32 %r169, %r290, %r291; + .loc 3 291 36 // standard.py:291:36 @[ c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:48:33 ] + // begin inline asm + @%p19 st.shared.b32 [ %r161 + 0 ], %r169; + // end inline asm + bar.sync 0; + mov.b64 %rd67, 0; + ld.shared.b32 %r4, [global_smem]; +$L__tmp8: +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 53 29 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:53:29 + add.s64 %rd60, %rd1, %rd67; + setp.lt.u64 %p51, %rd60, 32000; + .loc 1 57 34 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:57:34 + add.s64 %rd61, %rd2, %rd67; + cvt.u32.u64 %r300, %rd61; + mad.wide.s32 %rd57, %r300, 2, %rd15; + .loc 1 57 62 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:57:62 + and.pred %p48, %p2, %p51; + .loc 1 57 52 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:57:52 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd56, 1.0; + // end inline asm + mov.b32 %r294, 0; + // begin inline asm + mov.u32 %r292, %r294; + mov.u32 %r293, %r294; + @%p48 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r292, %r293 }, [ %rd57 + 0 ], %rd56; + // end inline asm + mov.b32 {%rs5, %rs6}, %r292; + mov.b32 {%rs7, %rs8}, %r293; + .loc 1 57 114 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:57:114 + cvt.f32.bf16 %r301, %rs5; + cvt.f32.bf16 %r302, %rs6; + cvt.f32.bf16 %r303, %rs7; + cvt.f32.bf16 %r304, %rs8; + .loc 1 59 22 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:59:22 + sub.f32 %r305, %r301, %r3; + sub.f32 %r306, %r302, %r3; + sub.f32 %r307, %r303, %r3; + sub.f32 %r308, %r304, %r3; + mov.b32 %r309, 0f3F000000; + mov.b32 %r310, 0f3BBB989D; + .loc 1 60 29 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:60:29 + fma.rn.ftz.f32 %r311, %r305, %r310, %r309; + cvt.ftz.sat.f32.f32 %r312, %r311; + mov.b32 %r313, 0f4B400001; + mov.b32 %r314, 0f437C0000; + fma.rm.ftz.f32 %r315, %r312, %r314, %r313; + add.f32 %r316, %r315, 0fCB40007F; + neg.f32 %r317, %r316; + mov.b32 %r318, 0f3FB8AA3B; + fma.rn.ftz.f32 %r319, %r305, %r318, %r317; + mov.b32 %r320, 0f32A57060; + fma.rn.ftz.f32 %r321, %r305, %r320, %r319; + shl.b32 %r322, %r315, 23; + ex2.approx.ftz.f32 %r323, %r321; + mul.f32 %r324, %r323, %r322; + fma.rn.ftz.f32 %r325, %r306, %r310, %r309; + cvt.ftz.sat.f32.f32 %r326, %r325; + fma.rm.ftz.f32 %r327, %r326, %r314, %r313; + add.f32 %r328, %r327, 0fCB40007F; + neg.f32 %r329, %r328; + fma.rn.ftz.f32 %r330, %r306, %r318, %r329; + fma.rn.ftz.f32 %r331, %r306, %r320, %r330; + shl.b32 %r332, %r327, 23; + ex2.approx.ftz.f32 %r333, %r331; + mul.f32 %r334, %r333, %r332; + fma.rn.ftz.f32 %r335, %r307, %r310, %r309; + cvt.ftz.sat.f32.f32 %r336, %r335; + fma.rm.ftz.f32 %r337, %r336, %r314, %r313; + add.f32 %r338, %r337, 0fCB40007F; + neg.f32 %r339, %r338; + fma.rn.ftz.f32 %r340, %r307, %r318, %r339; + fma.rn.ftz.f32 %r341, %r307, %r320, %r340; + shl.b32 %r342, %r337, 23; + ex2.approx.ftz.f32 %r343, %r341; + mul.f32 %r344, %r343, %r342; + fma.rn.ftz.f32 %r345, %r308, %r310, %r309; + cvt.ftz.sat.f32.f32 %r346, %r345; + fma.rm.ftz.f32 %r347, %r346, %r314, %r313; + add.f32 %r348, %r347, 0fCB40007F; + neg.f32 %r349, %r348; + fma.rn.ftz.f32 %r350, %r308, %r318, %r349; + fma.rn.ftz.f32 %r351, %r308, %r320, %r350; + shl.b32 %r352, %r347, 23; + ex2.approx.ftz.f32 %r353, %r351; + mul.f32 %r354, %r353, %r352; + .loc 1 61 23 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:61:23 + div.full.f32 %r296, %r324, %r4; + div.full.f32 %r297, %r334, %r4; + div.full.f32 %r298, %r344, %r4; + div.full.f32 %r299, %r354, %r4; + .loc 1 62 29 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:62:29 + mad.wide.s32 %rd59, %r300, 4, %rd16; + .loc 1 62 53 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:62:53 + // begin inline asm + @%p48 st.global.v4.b32 [ %rd59 + 0 ], { %r296, %r297, %r298, %r299 }; + // end inline asm + .loc 1 51 40 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:51:40 + add.s64 %rd14, %rd67, 2048; + setp.lt.u64 %p52, %rd67, 29952; + mov.b64 %rd67, %rd14; + @%p52 bra $L__BB0_3; +// %bb.4: + .loc 1 51 4 // c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py:51:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 114 +.b8 98 +.b8 118 +.b8 103 +.b8 109 +.b8 53 +.b8 51 +.b8 106 +.b8 114 +.b8 51 +.b8 110 +.b8 117 +.b8 120 +.b8 54 +.b8 54 +.b8 100 +.b8 117 +.b8 114 +.b8 113 +.b8 104 +.b8 105 +.b8 115 +.b8 97 +.b8 110 +.b8 99 +.b8 99 +.b8 103 +.b8 97 +.b8 101 +.b8 98 +.b8 122 +.b8 120 +.b8 99 +.b8 100 +.b8 106 +.b8 100 +.b8 104 +.b8 100 +.b8 114 +.b8 112 +.b8 104 +.b8 113 +.b8 106 +.b8 112 +.b8 121 +.b8 117 +.b8 50 +.b8 116 +.b8 53 +.b8 114 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 114 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 48 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..0f0ec1fc49992ce2cffff291d6686018f306156b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,465 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":18:0) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc72 = loc(unknown) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc112 = loc("in_ptr0"(#loc)) +#loc113 = loc("out_ptr2"(#loc)) +#loc114 = loc("xnumel"(#loc)) +#loc115 = loc("r0_numel"(#loc)) +#loc153 = loc("lhs_max"(#loc52)) +#loc154 = loc("lhs_sum"(#loc52)) +#loc155 = loc("rhs_max"(#loc52)) +#loc167 = loc("a"(#loc66)) +#loc168 = loc("b"(#loc66)) +#loc172 = loc("x"(#loc76)) +#loc173 = loc("x"(#loc80)) +#loc174 = loc("x"(#loc85)) +#loc175 = loc("lhs_max"(#loc89)) +#loc176 = loc("lhs_sum"(#loc89)) +#loc185 = loc("a"(#loc100)) +#loc186 = loc("input"(#loc104)) +#loc187 = loc("a"(#loc108)) +#loc188 = loc("b"(#loc108)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc116) + %xoffset = tt.get_program_id x : i32 loc(#loc117) + %xoffset_1 = arith.constant 1 : i32 loc(#loc118) + %xoffset_2 = arith.constant 1 : i32 loc(#loc118) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc118) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc119) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc120) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc121) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc121) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc122) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc122) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc123) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc124) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc125) + %_tmp3_max_9 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc125) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() : () -> tensor<1x2048xf32> loc(#loc126) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp3_sum_10:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_13 = %_tmp3_max_9, %_tmp3_sum_14 = %_tmp3_sum) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc128) + %r0_index_15 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc128) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc129) + %r0_mask_16 = arith.cmpi slt, %r0_index_15, %r0_mask : tensor<1x2048xi32> loc(#loc129) + %tmp0 = arith.constant 32000 : i32 loc(#loc130) + %tmp0_17 = arith.constant 32000 : i32 loc(#loc130) + %tmp0_18 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc130) + %tmp0_19 = arith.muli %tmp0_18, %xindex_6 : tensor<1x1xi32> loc(#loc130) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc131) + %tmp0_21 = arith.addi %r0_index_15, %tmp0_20 : tensor<1x2048xi32> loc(#loc131) + %tmp0_22 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc132) + %tmp0_23 = tt.addptr %tmp0_22, %tmp0_21 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc132) + %tmp0_24 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc133) + %tmp0_25 = arith.andi %r0_mask_16, %tmp0_24 : tensor<1x2048xi1> loc(#loc133) + %tmp0_26 = arith.constant 0.000000e+00 : f32 loc(#loc134) + %tmp0_27 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc134) + %tmp0_28 = arith.truncf %tmp0_27 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc134) + %tmp0_29 = tt.load %tmp0_23, %tmp0_25, %tmp0_28 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc134) + %tmp0_30 = arith.extf %tmp0_29 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc135) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%_tmp3_max_13, %_tmp3_sum_14, %tmp0_30) : (tensor<1x2048xf32>, tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) loc(#loc21) + %_tmp3_max_31 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc136) + %_tmp3_max_32 = arith.andi %r0_mask_16, %_tmp3_max_31 : tensor<1x2048xi1> loc(#loc136) + %_tmp3_max_33 = arith.select %_tmp3_max_32, %9#0, %_tmp3_max_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc137) + %_tmp3_sum_34 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc138) + %_tmp3_sum_35 = arith.andi %r0_mask_16, %_tmp3_sum_34 : tensor<1x2048xi1> loc(#loc138) + %_tmp3_sum_36 = arith.select %_tmp3_sum_35, %9#1, %_tmp3_sum_14 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc139) + scf.yield %_tmp3_max_33, %_tmp3_sum_36 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc26) + } loc(#loc189) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_10#0, %_tmp3_sum_10#1) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc27) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc140) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc141) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc30) + %c2048_i32_12 = arith.constant 2048 : i32 loc(#loc30) + %5 = arith.bitcast %c0_i32_11 : i32 to i32 loc(#loc30) + %6 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc30) + %7 = arith.bitcast %c2048_i32_12 : i32 to i32 loc(#loc30) + %8 = ub.poison : i32 loc(#loc30) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc142) + %r0_index_13 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc142) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc143) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc143) + %tmp5 = arith.constant 32000 : i32 loc(#loc144) + %tmp5_15 = arith.constant 32000 : i32 loc(#loc144) + %tmp5_16 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc144) + %tmp5_17 = arith.muli %tmp5_16, %xindex_6 : tensor<1x1xi32> loc(#loc144) + %tmp5_18 = tt.broadcast %tmp5_17 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc145) + %tmp5_19 = arith.addi %r0_index_13, %tmp5_18 : tensor<1x2048xi32> loc(#loc145) + %tmp5_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc146) + %tmp5_21 = tt.addptr %tmp5_20, %tmp5_19 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc146) + %tmp5_22 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc147) + %tmp5_23 = arith.andi %r0_mask_14, %tmp5_22 : tensor<1x2048xi1> loc(#loc147) + %tmp5_24 = arith.constant 0.000000e+00 : f32 loc(#loc148) + %tmp5_25 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc148) + %tmp5_26 = arith.truncf %tmp5_25 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc148) + %tmp5_27 = tt.load %tmp5_21, %tmp5_23, %tmp5_26 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc148) + %tmp5_28 = arith.extf %tmp5_27 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc149) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc150) + %tmp7_29 = arith.subf %tmp5_28, %tmp7 : tensor<1x2048xf32> loc(#loc150) + %tmp8 = tt.extern_elementwise %tmp7_29 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc151) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc152) + %tmp9_30 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc152) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc42) + %c32000_i32_31 = arith.constant 32000 : i32 loc(#loc42) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc42) + %9 = arith.muli %cst, %xindex_6 : tensor<1x1xi32> loc(#loc42) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc43) + %11 = arith.addi %r0_index_13, %10 : tensor<1x2048xi32> loc(#loc43) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc44) + %13 = tt.addptr %12, %11 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc44) + %14 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc45) + %15 = arith.andi %r0_mask_14, %14 : tensor<1x2048xi1> loc(#loc45) + tt.store %13, %tmp9_30, %15 : tensor<1x2048x!tt.ptr> loc(#loc46) + } loc(#loc30) + tt.return loc(#loc47) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_2048__(1,)cconstexpr_fp32_"() -> tensor<1x2048xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc49) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc49) + tt.return %cst_0 : tensor<1x2048xf32> loc(#loc50) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc51) + tt.return %0 : tensor<1x2048xf32> loc(#loc51) + } loc(#loc48) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_2048S_fp32S1_2048S_fp32S1_2048S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc52)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc52)), %rhs_max: tensor<1x2048xf32> loc("rhs_max"(#loc52))) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%lhs_max, %rhs_max) : (tensor<1x2048xf32>, tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc156) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc157) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc157) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x2048xf32> loc(#loc157) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x2048xf32> loc(#loc158) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc159) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc160) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc160) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc160) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc161) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc161) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x2048xf32> loc(#loc161) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x2048xf32> loc(#loc162) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc163) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc164) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc164) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc164) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc164) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x2048xf32> loc(#loc165) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x2048xf32> loc(#loc166) + tt.return %out_max, %out_sum_16 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x2048xf32> loc(#loc65) + %1 = ub.poison : tensor<1x2048xf32> loc(#loc65) + tt.return %0, %1 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc65) + } loc(#loc52) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_2048S_fp32S1_2048S__(%a: tensor<1x2048xf32> loc("a"(#loc66)), %b: tensor<1x2048xf32> loc("b"(#loc66))) -> tensor<1x2048xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x2048xf32> loc(#loc190) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a) : (tensor<1x2048xf32>) -> i1 loc(#loc68) + %1 = scf.if %0 -> (tensor<1x2048xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x2048xf32> loc(#loc170) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x2048xi1> loc(#loc191) + scf.yield %mask_1 : tensor<1x2048xi1> loc(#loc191) + } else { + scf.yield %mask : tensor<1x2048xi1> loc(#loc72) + } loc(#loc69) + %2 = arith.select %1, %a, %b : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc73) + tt.return %2 : tensor<1x2048xf32> loc(#loc74) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x2048xf32> loc(#loc75) + tt.return %3 : tensor<1x2048xf32> loc(#loc75) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc76))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc77) + %true = arith.constant true loc(#loc78) + tt.return %true : i1 loc(#loc78) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc79) + tt.return %1 : i1 loc(#loc79) + } loc(#loc76) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc80))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc81) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc82) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc82) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc82) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc82) + tt.return %4 : tensor<1x2048xf32> loc(#loc83) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc84) + tt.return %5 : tensor<1x2048xf32> loc(#loc84) + } loc(#loc80) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc49) + %cst = arith.constant dense : tensor<1xi1> loc(#loc49) + tt.return %cst : tensor<1xi1> loc(#loc50) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc51) + tt.return %0 : tensor<1xi1> loc(#loc51) + } loc(#loc48) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%x: tensor<1x2048xf32> loc("x"(#loc85))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc86) + tt.return %0 : tensor<1x2048xf32> loc(#loc87) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x2048xf32> loc(#loc88) + tt.return %1 : tensor<1x2048xf32> loc(#loc88) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_2048S_fp32S1_2048S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x2048xf32> loc("lhs_max"(#loc89)), %lhs_sum: tensor<1x2048xf32> loc("lhs_sum"(#loc89))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc177) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc178) + %delta = arith.constant 0xFF800000 : f32 loc(#loc179) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc179) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc179) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc180) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x2048xf32> loc(#loc180) + %delta_4 = arith.constant 0 : i32 loc(#loc181) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc181) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc181) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc181) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc181) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_2048S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc182) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x2048xf32> loc(#loc183) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc184) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc99) + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc99) + } loc(#loc89) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_2048S__(1,)cconstexpr_1_"(%a: tensor<1x2048xf32> loc("a"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc66)), %b: f32 loc("b"(#loc66))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc190) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc68) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc170) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc191) + scf.yield %mask_1 : i1 loc(#loc191) + } else { + scf.yield %mask : i1 loc(#loc72) + } loc(#loc69) + %2 = arith.select %1, %a, %b : f32 loc(#loc73) + tt.return %2 : f32 loc(#loc74) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc75) + tt.return %3 : f32 loc(#loc75) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc76))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc77) + %true = arith.constant true loc(#loc78) + tt.return %true : i1 loc(#loc78) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc79) + tt.return %1 : i1 loc(#loc79) + } loc(#loc76) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc80))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc81) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc82) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc82) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc82) + tt.return %3 : tensor<1xf32> loc(#loc83) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc84) + tt.return %4 : tensor<1xf32> loc(#loc84) + } loc(#loc80) + tt.func private @"triton.language.standard.sum__fp32S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xf32> loc("input"(#loc104))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc105) + tt.reduce.return %2 : f32 loc(#loc105) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc105) + tt.return %0 : tensor<1xf32> loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc107) + tt.return %1 : tensor<1xf32> loc(#loc107) + } loc(#loc104) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc108)), %b: f32 loc("b"(#loc108))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc109) + tt.return %0 : f32 loc(#loc110) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc111) + tt.return %1 : f32 loc(#loc111) + } loc(#loc108) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":28:59) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":29:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:47) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:62) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:113) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":41:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":44:39) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":44:62) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:39) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:62) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:8) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":48:33) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":49:16) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":50:16) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":52:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":53:29) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:47) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:62) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:52) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:114) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":59:22) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":60:29) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":61:23) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:42) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:36) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:29) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:63) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:53) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:4) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc116 = loc("r0_numel"(#loc1)) +#loc117 = loc("xoffset"(#loc2)) +#loc118 = loc("xoffset"(#loc3)) +#loc119 = loc("xindex"(#loc4)) +#loc120 = loc("xindex"(#loc5)) +#loc121 = loc("xindex"(#loc6)) +#loc122 = loc("xmask"(#loc7)) +#loc123 = loc("r0_base"(#loc8)) +#loc124 = loc("r0_base"(#loc9)) +#loc125 = loc("_tmp3_max"(#loc10)) +#loc126 = loc("_tmp3_sum"(#loc11)) +#loc127 = loc("_tmp3_max"(#loc12)) +#loc128 = loc("r0_index"(#loc13)) +#loc129 = loc("r0_mask"(#loc14)) +#loc130 = loc("tmp0"(#loc15)) +#loc131 = loc("tmp0"(#loc16)) +#loc132 = loc("tmp0"(#loc17)) +#loc133 = loc("tmp0"(#loc18)) +#loc134 = loc("tmp0"(#loc19)) +#loc135 = loc("tmp0"(#loc20)) +#loc136 = loc("_tmp3_max"(#loc22)) +#loc137 = loc("_tmp3_max"(#loc23)) +#loc138 = loc("_tmp3_sum"(#loc24)) +#loc139 = loc("_tmp3_sum"(#loc25)) +#loc140 = loc("tmp3"(#loc28)) +#loc141 = loc("tmp4"(#loc29)) +#loc142 = loc("r0_index"(#loc31)) +#loc143 = loc("r0_mask"(#loc32)) +#loc144 = loc("tmp5"(#loc33)) +#loc145 = loc("tmp5"(#loc34)) +#loc146 = loc("tmp5"(#loc35)) +#loc147 = loc("tmp5"(#loc36)) +#loc148 = loc("tmp5"(#loc37)) +#loc149 = loc("tmp5"(#loc38)) +#loc150 = loc("tmp7"(#loc39)) +#loc151 = loc("tmp8"(#loc40)) +#loc152 = loc("tmp9"(#loc41)) +#loc156 = loc("out_max"(#loc53)) +#loc157 = loc("lhs_scale"(#loc54)) +#loc158 = loc("lhs_scale"(#loc55)) +#loc159 = loc("lhs_scale"(#loc56)) +#loc160 = loc("lhs_scale"(#loc57)) +#loc161 = loc("rhs_scale"(#loc58)) +#loc162 = loc("rhs_scale"(#loc59)) +#loc163 = loc("rhs_scale"(#loc60)) +#loc164 = loc("rhs_scale"(#loc61)) +#loc165 = loc("out_sum"(#loc62)) +#loc166 = loc("out_sum"(#loc63)) +#loc169 = loc("mask"(#loc67)) +#loc170 = loc("mask"(#loc70)) +#loc171 = loc("mask"(#loc71)) +#loc177 = loc("out_max"(#loc90)) +#loc178 = loc("out_max_keepdim"(#loc91)) +#loc179 = loc("delta"(#loc92)) +#loc180 = loc("delta"(#loc93)) +#loc181 = loc("delta"(#loc94)) +#loc182 = loc("out_sum"(#loc95)) +#loc183 = loc("out_sum"(#loc96)) +#loc184 = loc("out_sum"(#loc97)) +#loc189 = loc("_tmp3_sum"(#loc127)) +#loc190 = loc("mask"(#loc169)) +#loc191 = loc("mask"(#loc171)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..e54471095c8f218b1da4a5b8c7bb310eeb6bc64e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,237 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":48:33) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc60 = loc("in_ptr0"(#loc)) +#loc61 = loc("out_ptr2"(#loc)) +#loc62 = loc("xnumel"(#loc)) +#loc63 = loc("r0_numel"(#loc)) +#loc91 = loc("out_max"(#loc34)) +#loc98 = loc("out_sum"(#loc43)) +#loc125 = loc(callsite(#loc91 at #loc35)) +#loc132 = loc(callsite(#loc98 at #loc35)) +#loc140 = loc(callsite(#loc1 at #loc125)) +#loc143 = loc(callsite(#loc1 at #loc132)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_1 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc65) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc66) + %r0_base_5 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc66) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc67) + %tmp0_6 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc110) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc69) + %tmp0_8 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc111) + %_tmp3_sum:2 = scf.for %_tmp3_sum_15 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%arg5 = %cst_4, %arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_sum_15 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc72) + %r0_index_16 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc72) + %r0_mask = arith.cmpi slt, %r0_index_16, %cst : tensor<1x2048xi32, #blocked> loc(#loc73) + %tmp0_17 = arith.addi %r0_index_16, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp0_18 = tt.addptr %tmp0_7, %tmp0_17 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc69) + %tmp0_19 = arith.andi %r0_mask, %tmp0_8 : tensor<1x2048xi1, #blocked> loc(#loc70) + %tmp0_20 = tt.load %tmp0_18, %tmp0_19, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc74) + %tmp0_21 = arith.extf %tmp0_20 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc75) + %mask = arith.cmpf ogt, %arg5, %tmp0_21 : tensor<1x2048xf32, #blocked> loc(#loc133) + %mask_22 = arith.cmpf une, %arg5, %arg5 : tensor<1x2048xf32, #blocked> loc(#loc134) + %mask_23 = arith.ori %mask, %mask_22 : tensor<1x2048xi1, #blocked> loc(#loc135) + %out_max_24 = arith.select %mask_23, %arg5, %tmp0_21 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc136) + %lhs_scale = arith.cmpf oeq, %out_max_24, %cst_4 : tensor<1x2048xf32, #blocked> loc(#loc116) + %lhs_scale_25 = arith.subf %arg5, %out_max_24 : tensor<1x2048xf32, #blocked> loc(#loc117) + %lhs_scale_26 = tt.extern_elementwise %lhs_scale_25 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc137) + %lhs_scale_27 = arith.select %lhs_scale, %cst_2, %lhs_scale_26 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc119) + %rhs_scale = arith.subf %tmp0_21, %out_max_24 : tensor<1x2048xf32, #blocked> loc(#loc120) + %rhs_scale_28 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc138) + %rhs_scale_29 = arith.select %lhs_scale, %cst_2, %rhs_scale_28 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc122) + %out_sum_30 = arith.mulf %arg6, %lhs_scale_27 : tensor<1x2048xf32, #blocked> loc(#loc123) + %out_sum_31 = arith.addf %out_sum_30, %rhs_scale_29 : tensor<1x2048xf32, #blocked> loc(#loc124) + %_tmp3_max = arith.select %tmp0_19, %out_max_24, %arg5 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc89) + %_tmp3_sum_32 = arith.select %tmp0_19, %out_sum_31, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc90) + scf.yield %_tmp3_max, %_tmp3_sum_32 : tensor<1x2048xf32, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc32) + } loc(#loc112) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_15: f32 loc(callsite(#loc1 at #loc125)), %out_max_16: f32 loc(callsite(#loc1 at #loc125))): + %mask = arith.cmpf ogt, %out_max_15, %out_max_16 : f32 loc(#loc144) + %mask_17 = arith.cmpf une, %out_max_15, %out_max_15 : f32 loc(#loc145) + %mask_18 = arith.ori %mask, %mask_17 : i1 loc(#loc146) + %out_max_19 = arith.select %mask_18, %out_max_15, %out_max_16 : f32 loc(#loc147) + tt.reduce.return %out_max_19 : f32 loc(#loc139) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc139) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc126) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_1 : tensor<1x1xf32, #blocked> loc(#loc127) + %delta_9 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc128) + %delta_10 = arith.subf %_tmp3_sum#0, %delta_9 : tensor<1x2048xf32, #blocked> loc(#loc128) + %delta_11 = tt.broadcast %delta : tensor<1x1xi1, #blocked> -> tensor<1x2048xi1, #blocked> loc(#loc129) + %delta_12 = arith.select %delta_11, %cst_3, %delta_10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc129) + %out_sum = tt.extern_elementwise %delta_12 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc141) + %out_sum_13 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32, #blocked> loc(#loc131) + %out_sum_14 = "tt.reduce"(%out_sum_13) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_15: f32 loc(callsite(#loc1 at #loc132)), %out_sum_16: f32 loc(callsite(#loc1 at #loc132))): + %out_sum_17 = arith.addf %out_sum_15, %out_sum_16 : f32 loc(#loc148) + tt.reduce.return %out_sum_17 : f32 loc(#loc142) + }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc142) + %tmp4 = tt.expand_dims %out_sum_14 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xf32, #blocked> loc(#loc99) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked> -> tensor<1x2048xf32, #blocked> loc(#loc100) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc47) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc101) + %r0_index_15 = arith.addi %r0_index, %r0_base_5 : tensor<1x2048xi32, #blocked> loc(#loc101) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst : tensor<1x2048xi32, #blocked> loc(#loc102) + %tmp5 = arith.addi %r0_index_15, %tmp0_6 : tensor<1x2048xi32, #blocked> loc(#loc103) + %tmp5_16 = tt.addptr %tmp0_7, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc104) + %tmp5_17 = arith.andi %r0_mask, %tmp0_8 : tensor<1x2048xi1, #blocked> loc(#loc105) + %tmp5_18 = tt.load %tmp5_16, %tmp5_17, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc106) + %tmp5_19 = arith.extf %tmp5_18 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc107) + %tmp7 = arith.subf %tmp5_19, %delta_9 : tensor<1x2048xf32, #blocked> loc(#loc108) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked> loc(#loc109) + %tmp9_20 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32, #blocked> loc(#loc100) + %1 = tt.addptr %0, %tmp5 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc47) + tt.store %1, %tmp9_20, %tmp5_17 : tensor<1x2048x!tt.ptr, #blocked> loc(#loc58) + } loc(#loc48) + tt.return loc(#loc59) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:47) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:62) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":30:40) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:113) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":41:40) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":44:62) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:62) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":50:16) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":61:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":52:31) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":53:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:41) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:34) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:52) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:114) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":59:22) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":60:29) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:53) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:4) +#loc64 = loc("xoffset"(#loc2)) +#loc65 = loc("xmask"(#loc3)) +#loc66 = loc("r0_base"(#loc4)) +#loc67 = loc("tmp0"(#loc5)) +#loc68 = loc("tmp0"(#loc6)) +#loc69 = loc("tmp0"(#loc7)) +#loc70 = loc("tmp0"(#loc8)) +#loc71 = loc("_tmp3_max"(#loc9)) +#loc72 = loc("r0_index"(#loc10)) +#loc73 = loc("r0_mask"(#loc11)) +#loc74 = loc("tmp0"(#loc12)) +#loc75 = loc("tmp0"(#loc13)) +#loc76 = loc("mask"(#loc14)) +#loc77 = loc("out_max"(#loc15)) +#loc78 = loc("mask"(#loc17)) +#loc79 = loc("mask"(#loc18)) +#loc80 = loc("lhs_scale"(#loc20)) +#loc81 = loc("lhs_scale"(#loc21)) +#loc82 = loc("lhs_scale"(#loc23)) +#loc83 = loc("lhs_scale"(#loc24)) +#loc84 = loc("rhs_scale"(#loc25)) +#loc85 = loc("rhs_scale"(#loc26)) +#loc86 = loc("rhs_scale"(#loc27)) +#loc87 = loc("out_sum"(#loc28)) +#loc88 = loc("out_sum"(#loc29)) +#loc89 = loc("_tmp3_max"(#loc30)) +#loc90 = loc("_tmp3_sum"(#loc31)) +#loc92 = loc("out_max_keepdim"(#loc36)) +#loc93 = loc("delta"(#loc37)) +#loc94 = loc("delta"(#loc38)) +#loc95 = loc("delta"(#loc39)) +#loc96 = loc("out_sum"(#loc40)) +#loc97 = loc("out_sum"(#loc41)) +#loc99 = loc("tmp4"(#loc45)) +#loc100 = loc("tmp9"(#loc46)) +#loc101 = loc("r0_index"(#loc49)) +#loc102 = loc("r0_mask"(#loc50)) +#loc103 = loc("tmp5"(#loc51)) +#loc104 = loc("tmp5"(#loc52)) +#loc105 = loc("tmp5"(#loc53)) +#loc106 = loc("tmp5"(#loc54)) +#loc107 = loc("tmp5"(#loc55)) +#loc108 = loc("tmp7"(#loc56)) +#loc109 = loc("tmp8"(#loc57)) +#loc110 = loc(fused[#loc68, #loc67]) +#loc111 = loc(fused[#loc70, #loc65]) +#loc112 = loc("_tmp3_sum"(#loc71)) +#loc113 = loc("mask"(#loc76)) +#loc114 = loc(callsite(#loc77 at #loc16)) +#loc115 = loc("mask"(#loc79)) +#loc116 = loc(callsite(#loc80 at #loc16)) +#loc117 = loc(callsite(#loc81 at #loc16)) +#loc118 = loc(callsite(#loc82 at #loc16)) +#loc119 = loc(callsite(#loc83 at #loc16)) +#loc120 = loc(callsite(#loc84 at #loc16)) +#loc121 = loc(callsite(#loc85 at #loc16)) +#loc122 = loc(callsite(#loc86 at #loc16)) +#loc123 = loc(callsite(#loc87 at #loc16)) +#loc124 = loc(callsite(#loc88 at #loc16)) +#loc126 = loc(callsite(#loc92 at #loc35)) +#loc127 = loc(callsite(#loc93 at #loc35)) +#loc128 = loc(callsite(#loc94 at #loc35)) +#loc129 = loc(callsite(#loc95 at #loc35)) +#loc130 = loc(callsite(#loc96 at #loc35)) +#loc131 = loc(callsite(#loc97 at #loc35)) +#loc133 = loc(callsite(#loc113 at #loc114)) +#loc134 = loc(callsite(#loc78 at #loc114)) +#loc135 = loc(callsite(#loc115 at #loc114)) +#loc136 = loc(callsite(#loc19 at #loc114)) +#loc137 = loc(callsite(#loc22 at #loc118)) +#loc138 = loc(callsite(#loc22 at #loc121)) +#loc139 = loc(callsite(#loc33 at #loc125)) +#loc141 = loc(callsite(#loc22 at #loc130)) +#loc142 = loc(callsite(#loc42 at #loc132)) +#loc144 = loc(callsite(#loc113 at #loc139)) +#loc145 = loc(callsite(#loc78 at #loc139)) +#loc146 = loc(callsite(#loc115 at #loc139)) +#loc147 = loc(callsite(#loc19 at #loc139)) +#loc148 = loc(callsite(#loc44 at #loc142)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d9bf9f53e421262898f6b31fa4459b7fc22e07e0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TB23BI36TE6SXLOK7UOJNZ4ZQEYQMSEJU56DRJOK6D2L63KTYO3A/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,246 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":18:0) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":48:33) +#loc3 = loc(unknown) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc62 = loc("in_ptr0"(#loc)) +#loc63 = loc("out_ptr2"(#loc)) +#loc64 = loc("xnumel"(#loc)) +#loc65 = loc("r0_numel"(#loc)) +#loc95 = loc("out_max"(#loc37)) +#loc101 = loc("out_sum"(#loc44)) +#loc130 = loc(callsite(#loc95 at #loc2)) +#loc136 = loc(callsite(#loc101 at #loc2)) +#loc146 = loc(callsite(#loc3 at #loc130)) +#loc149 = loc(callsite(#loc3 at #loc136)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc114) + %cst = arith.constant dense<1.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc3) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc3) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_2 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc3) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc67) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc68) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc69) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc70) + %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x2048xf32>, tensor<1x2048xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc72) + %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc72) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x2048xi32> loc(#loc73) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc74) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc116) + %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x2048xi32> loc(#loc75) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc76) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc76) + %tmp0_18 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc117) + %tmp0_19 = arith.andi %r0_mask, %tmp0_18 : tensor<1x2048xi1> loc(#loc77) + %tmp0_20 = tt.load %tmp0_17, %tmp0_19, %cst_1 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc78) + %tmp0_21 = arith.extf %tmp0_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc79) + %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_21 : tensor<1x2048xf32> loc(#loc139) + %mask_22 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x2048xf32> loc(#loc140) + %mask_23 = arith.ori %mask, %mask_22 : tensor<1x2048xi1> loc(#loc141) + %out_max_24 = arith.select %mask_23, %_tmp3_max, %tmp0_21 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc142) + %lhs_scale = arith.cmpf oeq, %out_max_24, %cst_3 : tensor<1x2048xf32> loc(#loc121) + %lhs_scale_25 = arith.subf %_tmp3_max, %out_max_24 : tensor<1x2048xf32> loc(#loc122) + %lhs_scale_26 = tt.extern_elementwise %lhs_scale_25 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc143) + %lhs_scale_27 = arith.select %lhs_scale, %cst, %lhs_scale_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc124) + %rhs_scale = arith.subf %tmp0_21, %out_max_24 : tensor<1x2048xf32> loc(#loc125) + %rhs_scale_28 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc144) + %rhs_scale_29 = arith.select %lhs_scale, %cst, %rhs_scale_28 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc127) + %out_sum_30 = arith.mulf %_tmp3_sum_12, %lhs_scale_27 : tensor<1x2048xf32> loc(#loc128) + %out_sum_31 = arith.addf %out_sum_30, %rhs_scale_29 : tensor<1x2048xf32> loc(#loc129) + %_tmp3_max_32 = arith.select %tmp0_19, %out_max_24, %_tmp3_max : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc93) + %_tmp3_sum_33 = arith.select %tmp0_19, %out_sum_31, %_tmp3_sum_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc94) + scf.yield %_tmp3_max_32, %_tmp3_sum_33 : tensor<1x2048xf32>, tensor<1x2048xf32> loc(#loc35) + } loc(#loc115) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc130)), %out_max_13: f32 loc(callsite(#loc3 at #loc130))): + %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc150) + %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc151) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc152) + %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc153) + tt.reduce.return %out_max_16 : f32 loc(#loc145) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc145) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc131) + %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc114) + %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc132) + %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x2048xf32> loc(#loc132) + %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc133) + %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc133) + %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc147) + %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x2048xf32> loc(#loc135) + %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc136)), %out_sum_13: f32 loc(callsite(#loc3 at #loc136))): + %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc154) + tt.reduce.return %out_sum_14 : f32 loc(#loc148) + }) : (tensor<1x2048xf32>) -> tensor<1xf32> loc(#loc148) + %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc102) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc103) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc103) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x2048xi32> loc(#loc104) + %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc105) + %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x2048xi32> loc(#loc137) + %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x2048xi32> loc(#loc106) + %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc107) + %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc107) + %tmp5_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc138) + %tmp5_18 = arith.andi %r0_mask, %tmp5_17 : tensor<1x2048xi1> loc(#loc108) + %tmp5_19 = tt.load %tmp5_16, %tmp5_18, %cst_1 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp5_20 = arith.extf %tmp5_19 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc110) + %tmp7 = arith.subf %tmp5_20, %delta_6 : tensor<1x2048xf32> loc(#loc111) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc112) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc113) + %tmp9_21 = arith.divf %tmp8, %tmp9 : tensor<1x2048xf32> loc(#loc113) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc59) + %1 = tt.addptr %0, %tmp5_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc59) + tt.store %1, %tmp9_21, %tmp5_18 : tensor<1x2048x!tt.ptr> loc(#loc60) + } loc(#loc47) + tt.return loc(#loc61) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":24:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":25:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":30:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":31:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:47) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:62) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:52) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":36:113) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":41:40) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":44:62) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:62) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":50:16) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:40) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":52:31) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":53:29) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:47) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:41) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:34) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:62) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:52) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":57:114) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":59:22) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":60:29) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":61:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:29) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":62:53) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6r/c6rbvgm53jr3nux66durqhisanccgaebzxcdjdhdrphqjpyu2t5r.py":51:4) +#loc66 = loc("delta"(#loc1)) +#loc67 = loc("xoffset"(#loc4)) +#loc68 = loc("xmask"(#loc5)) +#loc69 = loc("r0_base"(#loc6)) +#loc70 = loc("r0_base"(#loc7)) +#loc71 = loc("_tmp3_max"(#loc8)) +#loc72 = loc("r0_index"(#loc9)) +#loc73 = loc("r0_mask"(#loc10)) +#loc74 = loc("tmp0"(#loc11)) +#loc75 = loc("tmp0"(#loc12)) +#loc76 = loc("tmp0"(#loc13)) +#loc77 = loc("tmp0"(#loc14)) +#loc78 = loc("tmp0"(#loc15)) +#loc79 = loc("tmp0"(#loc16)) +#loc80 = loc("mask"(#loc17)) +#loc81 = loc("out_max"(#loc18)) +#loc82 = loc("mask"(#loc20)) +#loc83 = loc("mask"(#loc21)) +#loc84 = loc("lhs_scale"(#loc23)) +#loc85 = loc("lhs_scale"(#loc24)) +#loc86 = loc("lhs_scale"(#loc26)) +#loc87 = loc("lhs_scale"(#loc27)) +#loc88 = loc("rhs_scale"(#loc28)) +#loc89 = loc("rhs_scale"(#loc29)) +#loc90 = loc("rhs_scale"(#loc30)) +#loc91 = loc("out_sum"(#loc31)) +#loc92 = loc("out_sum"(#loc32)) +#loc93 = loc("_tmp3_max"(#loc33)) +#loc94 = loc("_tmp3_sum"(#loc34)) +#loc96 = loc("out_max_keepdim"(#loc38)) +#loc97 = loc("delta"(#loc39)) +#loc98 = loc("delta"(#loc40)) +#loc99 = loc("out_sum"(#loc41)) +#loc100 = loc("out_sum"(#loc42)) +#loc102 = loc("tmp4"(#loc46)) +#loc103 = loc("r0_index"(#loc48)) +#loc104 = loc("r0_mask"(#loc49)) +#loc105 = loc("tmp5"(#loc50)) +#loc106 = loc("tmp5"(#loc51)) +#loc107 = loc("tmp5"(#loc52)) +#loc108 = loc("tmp5"(#loc53)) +#loc109 = loc("tmp5"(#loc54)) +#loc110 = loc("tmp5"(#loc55)) +#loc111 = loc("tmp7"(#loc56)) +#loc112 = loc("tmp8"(#loc57)) +#loc113 = loc("tmp9"(#loc58)) +#loc114 = loc(callsite(#loc66 at #loc2)) +#loc115 = loc("_tmp3_sum"(#loc71)) +#loc116 = loc(fused[#loc75, #loc74]) +#loc117 = loc(fused[#loc77, #loc68]) +#loc118 = loc("mask"(#loc80)) +#loc119 = loc(callsite(#loc81 at #loc19)) +#loc120 = loc("mask"(#loc83)) +#loc121 = loc(callsite(#loc84 at #loc19)) +#loc122 = loc(callsite(#loc85 at #loc19)) +#loc123 = loc(callsite(#loc86 at #loc19)) +#loc124 = loc(callsite(#loc87 at #loc19)) +#loc125 = loc(callsite(#loc88 at #loc19)) +#loc126 = loc(callsite(#loc89 at #loc19)) +#loc127 = loc(callsite(#loc90 at #loc19)) +#loc128 = loc(callsite(#loc91 at #loc19)) +#loc129 = loc(callsite(#loc92 at #loc19)) +#loc131 = loc(callsite(#loc96 at #loc2)) +#loc132 = loc(callsite(#loc97 at #loc2)) +#loc133 = loc(callsite(#loc98 at #loc2)) +#loc134 = loc(callsite(#loc99 at #loc2)) +#loc135 = loc(callsite(#loc100 at #loc2)) +#loc137 = loc(fused[#loc106, #loc105]) +#loc138 = loc(fused[#loc108, #loc68]) +#loc139 = loc(callsite(#loc118 at #loc119)) +#loc140 = loc(callsite(#loc82 at #loc119)) +#loc141 = loc(callsite(#loc120 at #loc119)) +#loc142 = loc(callsite(#loc22 at #loc119)) +#loc143 = loc(callsite(#loc25 at #loc123)) +#loc144 = loc(callsite(#loc25 at #loc126)) +#loc145 = loc(callsite(#loc36 at #loc130)) +#loc147 = loc(callsite(#loc25 at #loc134)) +#loc148 = loc(callsite(#loc43 at #loc136)) +#loc150 = loc(callsite(#loc118 at #loc145)) +#loc151 = loc(callsite(#loc82 at #loc145)) +#loc152 = loc(callsite(#loc120 at #loc145)) +#loc153 = loc(callsite(#loc22 at #loc145)) +#loc154 = loc(callsite(#loc45 at #loc148)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47ba2fbc408d4c189c17df8290c146a7ad0d8174 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1068d9566a76b04b9bd3d5bbb9f62a393a43c5e8 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4f2ad8266a6586d25342eed8330467f7e16e5b71 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "9bed8cacf3cc5bcaa5c46b105313202944201bdf1d2d71451b3a200b3ef7312c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 64, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..3840a2c4b5d5e14453357dc12c12020c65d115b8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.llir @@ -0,0 +1,308 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = shl i32 %7, 4, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = and i32 %9, 120, !dbg !9 + %11 = lshr exact i32 %10, 3, !dbg !9 + %12 = or disjoint i32 %11, %8, !dbg !10 + %13 = shl nuw nsw i32 %9, 2, !dbg !11 + %14 = and i32 %13, 28, !dbg !11 + %15 = sdiv i32 %12, 2048, !dbg !12 + %16 = mul i32 %12, 32000 + %17 = mul i32 %15, 224000 + %18 = add i32 %17, %16 + %19 = zext nneg i32 %14 to i64, !dbg !13 + br label %20, !dbg !13 + +20: ; preds = %6, %20 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %20 ] + %21 = phi i32 [ 2147483647, %6 ], [ %97, %20 ] + %22 = phi i32 [ 2147483647, %6 ], [ %99, %20 ] + %23 = phi float [ 0xFFF0000000000000, %6 ], [ %89, %20 ] + %24 = phi float [ 0xFFF0000000000000, %6 ], [ %90, %20 ] + %25 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %88, %20 ] + %26 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %95, %20 ] + %27 = or disjoint i64 %indvars.iv, %19, !dbg !14 + %28 = or disjoint i64 %27, 2, !dbg !14 + %29 = or disjoint i64 %27, 3, !dbg !14 + %30 = trunc nuw nsw i64 %27 to i32, !dbg !15 + %31 = add i32 %18, %30, !dbg !15 + %32 = sext i32 %31 to i64, !dbg !16 + %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !16 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %35 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %33, i64 %34, i1 true) #4, !dbg !17 + %36 = extractvalue { i32, i32, i32, i32 } %35, 0, !dbg !17 + %37 = extractvalue { i32, i32, i32, i32 } %35, 1, !dbg !17 + %38 = extractvalue { i32, i32, i32, i32 } %35, 2, !dbg !17 + %39 = extractvalue { i32, i32, i32, i32 } %35, 3, !dbg !17 + %40 = bitcast i32 %38 to float, !dbg !17 + %41 = bitcast i32 %39 to float, !dbg !17 + %42 = fcmp ogt float %23, %40, !dbg !18 + %43 = fcmp ogt float %24, %41, !dbg !18 + %44 = fcmp oeq float %23, %40, !dbg !22 + %45 = fcmp oeq float %24, %41, !dbg !22 + %46 = fcmp uno <2 x float> %25, zeroinitializer, !dbg !23 + %47 = fcmp uno float %23, 0.000000e+00, !dbg !23 + %48 = fcmp uno float %24, 0.000000e+00, !dbg !23 + %49 = fcmp uno float %40, 0.000000e+00, !dbg !24 + %50 = fcmp uno float %41, 0.000000e+00, !dbg !24 + %51 = xor i1 %49, true, !dbg !25 + %52 = xor i1 %50, true, !dbg !25 + %53 = and i1 %47, %51, !dbg !26 + %54 = and i1 %48, %52, !dbg !26 + %55 = or i1 %42, %53, !dbg !27 + %56 = or i1 %43, %54, !dbg !27 + %57 = and i1 %47, %49, !dbg !28 + %58 = and i1 %48, %50, !dbg !28 + %59 = or i1 %44, %57, !dbg !29 + %60 = or i1 %45, %58, !dbg !29 + %61 = sext <2 x i32> %26 to <2 x i64>, !dbg !30 + %62 = sext i32 %21 to i64, !dbg !30 + %63 = icmp sgt i64 %28, %62, !dbg !30 + %64 = sext i32 %22 to i64, !dbg !30 + %65 = icmp sgt i64 %29, %64, !dbg !30 + %66 = and i1 %63, %59, !dbg !31 + %67 = and i1 %65, %60, !dbg !31 + %68 = insertelement <2 x i32> poison, i32 %36, i64 0, !dbg !17 + %69 = insertelement <2 x i32> %68, i32 %37, i64 1, !dbg !17 + %70 = bitcast <2 x i32> %69 to <2 x float>, !dbg !17 + %71 = fcmp ogt <2 x float> %25, %70, !dbg !18 + %72 = fcmp oeq <2 x float> %25, %70, !dbg !22 + %73 = fcmp uno <2 x float> %70, zeroinitializer, !dbg !24 + %74 = xor <2 x i1> %73, splat (i1 true), !dbg !25 + %75 = and <2 x i1> %46, %74, !dbg !26 + %76 = or <2 x i1> %71, %75, !dbg !27 + %77 = and <2 x i1> %46, %73, !dbg !28 + %78 = or <2 x i1> %72, %77, !dbg !29 + %79 = insertelement <2 x i64> poison, i64 %27, i64 0, !dbg !30 + %80 = shufflevector <2 x i64> %79, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !30 + %81 = icmp sgt <2 x i64> %80, %61, !dbg !30 + %82 = icmp sge <2 x i64> %80, %61, !dbg !30 + %83 = shufflevector <2 x i1> %81, <2 x i1> %82, <2 x i32> , !dbg !30 + %84 = and <2 x i1> %83, %78, !dbg !31 + %85 = or <2 x i1> %76, %84, !dbg !32 + %86 = or i1 %55, %66, !dbg !32 + %87 = or i1 %56, %67, !dbg !32 + %88 = select <2 x i1> %85, <2 x float> %25, <2 x float> %70, !dbg !33 + %89 = select i1 %86, float %23, float %40, !dbg !33 + %90 = select i1 %87, float %24, float %41, !dbg !33 + %91 = trunc nuw nsw i64 %27 to i32, !dbg !34 + %92 = or disjoint i32 %91, 1, !dbg !34 + %93 = insertelement <2 x i32> poison, i32 %30, i64 0, !dbg !34 + %94 = insertelement <2 x i32> %93, i32 %92, i64 1, !dbg !34 + %95 = select <2 x i1> %85, <2 x i32> %26, <2 x i32> %94, !dbg !34 + %96 = trunc nuw nsw i64 %28 to i32, !dbg !34 + %97 = select i1 %86, i32 %21, i32 %96, !dbg !34 + %98 = trunc nuw nsw i64 %29 to i32, !dbg !34 + %99 = select i1 %87, i32 %22, i32 %98, !dbg !34 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32, !dbg !13 + %100 = icmp samesign ult i64 %indvars.iv, 31968, !dbg !13 + br i1 %100, label %20, label %101, !dbg !13 + +101: ; preds = %20 + %102 = and i32 %9, 15, !dbg !9 + %103 = or disjoint i32 %8, %102, !dbg !10 + %104 = shufflevector <2 x float> %88, <2 x float> poison, <2 x i32> , !dbg !35 + %105 = fcmp ogt <2 x float> %88, %104, !dbg !35 + %106 = fcmp oeq <2 x float> %88, %104, !dbg !35 + %107 = shufflevector <2 x i1> %105, <2 x i1> %106, <2 x i32> , !dbg !35 + %108 = extractelement <2 x float> %88, i64 0, !dbg !37 + %109 = fcmp uno float %108, 0.000000e+00, !dbg !37 + %110 = extractelement <2 x float> %88, i64 1, !dbg !38 + %111 = fcmp uno float %110, 0.000000e+00, !dbg !38 + %112 = xor i1 %111, true, !dbg !39 + %113 = insertelement <2 x i1> poison, i1 %109, i64 0, !dbg !40 + %114 = shufflevector <2 x i1> %113, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !40 + %115 = insertelement <2 x i1> poison, i1 %112, i64 0, !dbg !40 + %116 = insertelement <2 x i1> %115, i1 %111, i64 1, !dbg !40 + %117 = and <2 x i1> %114, %116, !dbg !40 + %118 = or <2 x i1> %107, %117, !dbg !41 + %119 = extractelement <2 x i32> %95, i64 0, !dbg !42 + %120 = extractelement <2 x i32> %95, i64 1, !dbg !42 + %121 = icmp slt i32 %119, %120, !dbg !42 + %122 = extractelement <2 x i1> %118, i64 1, !dbg !43 + %123 = and i1 %121, %122, !dbg !43 + %124 = extractelement <2 x i1> %118, i64 0, !dbg !44 + %125 = or i1 %124, %123, !dbg !44 + %126 = select i1 %125, float %108, float %110, !dbg !45 + %127 = select i1 %125, i32 %119, i32 %120, !dbg !46 + %128 = fcmp ogt float %126, %89, !dbg !35 + %129 = fcmp oeq float %126, %89, !dbg !47 + %130 = fcmp uno float %126, 0.000000e+00, !dbg !37 + %131 = fcmp uno float %89, 0.000000e+00, !dbg !38 + %132 = xor i1 %131, true, !dbg !39 + %133 = and i1 %130, %132, !dbg !40 + %134 = or i1 %128, %133, !dbg !41 + %135 = and i1 %131, %130, !dbg !48 + %136 = or i1 %129, %135, !dbg !49 + %137 = icmp slt i32 %127, %97, !dbg !42 + %138 = and i1 %137, %136, !dbg !43 + %139 = or i1 %134, %138, !dbg !44 + %140 = select i1 %139, float %126, float %89, !dbg !45 + %141 = select i1 %139, i32 %127, i32 %97, !dbg !46 + %142 = fcmp ogt float %140, %90, !dbg !35 + %143 = fcmp oeq float %140, %90, !dbg !47 + %144 = fcmp uno float %140, 0.000000e+00, !dbg !37 + %145 = fcmp uno float %90, 0.000000e+00, !dbg !38 + %146 = xor i1 %145, true, !dbg !39 + %147 = and i1 %144, %146, !dbg !40 + %148 = or i1 %142, %147, !dbg !41 + %149 = and i1 %145, %144, !dbg !48 + %150 = or i1 %143, %149, !dbg !49 + %151 = icmp slt i32 %141, %99, !dbg !42 + %152 = and i1 %151, %150, !dbg !43 + %153 = or i1 %148, %152, !dbg !44 + %154 = select i1 %153, float %140, float %90, !dbg !45 + %155 = select i1 %153, i32 %141, i32 %99, !dbg !46 + %156 = bitcast float %154 to i32, !dbg !50 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 4, i32 31), !dbg !50 + %158 = bitcast i32 %157 to float, !dbg !50 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 4, i32 31), !dbg !50 + %160 = fcmp ogt float %154, %158, !dbg !35 + %161 = fcmp oeq float %154, %158, !dbg !47 + %162 = fcmp uno float %154, 0.000000e+00, !dbg !37 + %163 = fcmp uno float %158, 0.000000e+00, !dbg !38 + %164 = xor i1 %163, true, !dbg !39 + %165 = and i1 %162, %164, !dbg !40 + %166 = or i1 %160, %165, !dbg !41 + %167 = and i1 %162, %163, !dbg !48 + %168 = or i1 %161, %167, !dbg !49 + %169 = icmp slt i32 %155, %159, !dbg !42 + %170 = and i1 %169, %168, !dbg !43 + %171 = or i1 %166, %170, !dbg !44 + %172 = select i1 %171, float %154, float %158, !dbg !45 + %173 = select i1 %171, i32 %155, i32 %159, !dbg !46 + %174 = bitcast float %172 to i32, !dbg !50 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 2, i32 31), !dbg !50 + %176 = bitcast i32 %175 to float, !dbg !50 + %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 2, i32 31), !dbg !50 + %178 = fcmp ogt float %172, %176, !dbg !35 + %179 = fcmp oeq float %172, %176, !dbg !47 + %180 = fcmp uno float %172, 0.000000e+00, !dbg !37 + %181 = fcmp uno float %176, 0.000000e+00, !dbg !38 + %182 = xor i1 %181, true, !dbg !39 + %183 = and i1 %180, %182, !dbg !40 + %184 = or i1 %178, %183, !dbg !41 + %185 = and i1 %181, %180, !dbg !48 + %186 = or i1 %179, %185, !dbg !49 + %187 = icmp slt i32 %173, %177, !dbg !42 + %188 = and i1 %187, %186, !dbg !43 + %189 = or i1 %184, %188, !dbg !44 + %190 = select i1 %189, float %172, float %176, !dbg !45 + %191 = select i1 %189, i32 %173, i32 %177, !dbg !46 + %192 = bitcast float %190 to i32, !dbg !50 + %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !50 + %194 = bitcast i32 %193 to float, !dbg !50 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 1, i32 31), !dbg !50 + %196 = fcmp ogt float %190, %194, !dbg !35 + %197 = fcmp oeq float %190, %194, !dbg !47 + %198 = fcmp uno float %190, 0.000000e+00, !dbg !37 + %199 = fcmp uno float %194, 0.000000e+00, !dbg !38 + %200 = xor i1 %199, true, !dbg !39 + %201 = and i1 %198, %200, !dbg !40 + %202 = or i1 %196, %201, !dbg !41 + %203 = and i1 %199, %198, !dbg !48 + %204 = or i1 %197, %203, !dbg !49 + %205 = icmp slt i32 %191, %195, !dbg !42 + %206 = and i1 %205, %204, !dbg !43 + %207 = or i1 %202, %206, !dbg !44 + %208 = select i1 %207, i32 %191, i32 %195, !dbg !46 + %209 = sext i32 %103 to i64, !dbg !51 + %210 = getelementptr i64, ptr addrspace(1) %1, i64 %209, !dbg !51 + %211 = lshr exact i32 %10, 1, !dbg !52 + %212 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %211, !dbg !52 + %213 = insertelement <1 x i32> poison, i32 %208, i64 0, !dbg !52 + store <1 x i32> %213, ptr addrspace(3) %212, align 4, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %214 = shl nuw nsw i32 %102, 2, !dbg !52 + %215 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %214, !dbg !52 + %216 = load i32, ptr addrspace(3) %215, align 4, !dbg !52 + %217 = sext i32 %216 to i64, !dbg !52 + %218 = and i32 %9, 112, !dbg !52 + %219 = icmp eq i32 %218, 0, !dbg !52 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %217, ptr addrspace(1) %210, i1 %219) #4, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 19, scope: !4) +!13 = !DILocation(line: 33, column: 40, scope: !4) +!14 = !DILocation(line: 34, column: 31, scope: !4) +!15 = !DILocation(line: 39, column: 52, scope: !4) +!16 = !DILocation(line: 39, column: 34, scope: !4) +!17 = !DILocation(line: 39, column: 66, scope: !4) +!18 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !4, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 38, scope: !4) +!22 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !21) +!34 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !21) +!35 = !DILocation(line: 144, column: 21, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 46, column: 75, scope: !4) +!37 = !DILocation(line: 147, column: 29, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 148, column: 29, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 149, column: 31, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 149, column: 27, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 149, column: 16, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 154, column: 31, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 154, column: 21, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 154, column: 12, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 155, column: 35, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 155, column: 69, scope: !19, inlinedAt: !36) +!47 = !DILocation(line: 145, column: 23, scope: !19, inlinedAt: !36) +!48 = !DILocation(line: 151, column: 27, scope: !19, inlinedAt: !36) +!49 = !DILocation(line: 151, column: 17, scope: !19, inlinedAt: !36) +!50 = !DILocation(line: 165, column: 42, scope: !19, inlinedAt: !36) +!51 = !DILocation(line: 48, column: 25, scope: !4) +!52 = !DILocation(line: 48, column: 36, scope: !4) +!53 = !DILocation(line: 48, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bdf808b9baf9fbfb7d622e7c7aaca7d132ce4e96 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,624 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u32 triton_red_fused_argmax_1_param_2, + .param .u32 triton_red_fused_argmax_1_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_5 +) +.reqntid 128 +{ + .reg .pred %p<117>; + .reg .b32 %r<93>; + .reg .b64 %rd<31>; + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_argmax_1_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 23 28 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:28 + mov.u32 %r20, %ctaid.x; + .loc 1 23 33 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:23:33 + shl.b32 %r1, %r20, 4; + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 120; + bfe.u32 %r21, %r2, 3, 4; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r22, %r21, %r1; + .loc 1 26 37 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:26:37 + shl.b32 %r23, %r2, 2; + and.b32 %r24, %r23, 28; + .loc 1 29 19 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:29:19 + bfe.s32 %r25, %r20, 27, 1; + shr.u32 %r26, %r25, 21; + add.s32 %r27, %r22, %r26; + shr.s32 %r28, %r27, 11; + mul.lo.s32 %r29, %r28, 224000; + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + cvt.u64.u32 %rd1, %r24; + mad.lo.s32 %r30, %r20, 512000, %r29; + mad.lo.s32 %r31, %r21, 32000, %r30; + or.b32 %r32, %r31, %r24; + cvt.u64.u32 %rd2, %r32; + mov.b32 %r89, 0fFF800000; + mov.b64 %rd30, {%r89, %r89}; + mov.b32 %r87, 2147483647; + mov.b64 %rd29, -32; + mov.b32 %r88, %r87; + mov.b32 %r90, %r89; + mov.b32 %r91, %r87; + mov.b32 %r92, %r87; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 34 31 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:34:31 + add.s64 %rd14, %rd1, %rd29; + add.s64 %rd15, %rd14, 32; + add.s64 %rd16, %rd14, 34; + .loc 1 39 52 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:52 + add.s64 %rd17, %rd14, 35; + .loc 1 39 34 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:34 + add.s64 %rd18, %rd2, %rd29; + cvt.u32.u64 %r41, %rd18; + add.s32 %r42, %r41, 32; + mad.wide.s32 %rd12, %r42, 4, %rd7; + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r37, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r33, %r37; + mov.u32 %r34, %r37; + mov.u32 %r35, %r37; + mov.u32 %r36, %r37; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd12 + 0 ], %rd11; + // end inline asm +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p2, %r89, %r35; + setp.gt.f32 %p3, %r90, %r36; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p4, %r89, %r35; + setp.eq.f32 %p5, %r90, %r36; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r43, %r44}, %rd30; + setp.nan.f32 %p6, %r44, %r44; + setp.nan.f32 %p7, %r43, %r43; + setp.nan.f32 %p8, %r89, %r89; + setp.nan.f32 %p9, %r90, %r90; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p10, %r35, %r35; + setp.num.f32 %p11, %r35, %r35; + setp.nan.f32 %p12, %r36, %r36; + setp.num.f32 %p13, %r36, %r36; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + cvt.s64.s32 %rd19, %r92; + cvt.s64.s32 %rd20, %r91; + cvt.s64.s32 %rd21, %r87; + setp.gt.s64 %p22, %rd16, %rd21; + cvt.s64.s32 %rd22, %r88; + setp.gt.s64 %p23, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; +$L__tmp2: + .loc 1 39 66 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:39:66 + cvt.u64.u32 %rd23, %r33; + cvt.u64.u32 %rd24, %r34; + shl.b64 %rd25, %rd24, 32; + or.b64 %rd26, %rd23, %rd25; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + mov.b64 {%r45, %r46}, %rd26; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.f32 %p26, %r43, %r45; + setp.gt.f32 %p27, %r44, %r46; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.eq.f32 %p28, %r44, %r46; + setp.eq.f32 %p29, %r43, %r45; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.nan.f32 %p30, %r45, %r45; + setp.nan.f32 %p31, %r46, %r46; + setp.num.f32 %p32, %r46, %r46; + setp.num.f32 %p33, %r45, %r45; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p34, %p7, %p33; + and.pred %p35, %p6, %p32; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p36, %p27, %p35; + or.pred %p37, %p26, %p34; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p38, %p6, %p31; + and.pred %p39, %p7, %p30; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p40, %p29, %p39; + or.pred %p41, %p28, %p38; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + setp.gt.s64 %p42, %rd15, %rd20; + setp.ge.s64 %p43, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + and.pred %p44, %p43, %p41; + and.pred %p45, %p42, %p40; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + or.pred %p46, %p37, %p45; + or.pred %p47, %p36, %p44; + or.pred %p48, %p16, %p24; + or.pred %p49, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.f32 %r47, %r44, %r46, %p47; + selp.f32 %r48, %r43, %r45, %p46; + mov.b64 %rd30, {%r48, %r47}; + selp.f32 %r89, %r89, %r35, %p48; + selp.f32 %r90, %r90, %r36, %p49; + cvt.u32.u64 %r49, %rd15; + cvt.u32.u64 %r50, %rd14; + add.s32 %r51, %r50, 33; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:42:38 ] + selp.b32 %r92, %r92, %r51, %p47; + selp.b32 %r91, %r91, %r49, %p46; + cvt.u32.u64 %r52, %rd16; + selp.b32 %r87, %r87, %r52, %p48; + cvt.u32.u64 %r53, %rd17; + selp.b32 %r88, %r88, %r53, %p49; +$L__tmp4: + .loc 1 33 40 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:33:40 + add.s64 %rd29, %rd29, 32; + setp.lt.u64 %p50, %rd29, 31968; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:44 + and.b32 %r54, %r2, 15; + .loc 1 24 23 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:24:23 + or.b32 %r55, %r1, %r54; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + mov.b64 {%r56, %r57}, %rd30; + setp.gt.f32 %p52, %r56, %r57; + setp.eq.f32 %p53, %r57, %r56; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p54, %r56, %r56; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.num.f32 %p55, %r57, %r57; + setp.nan.f32 %p56, %r57, %r57; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p57, %p54, %p56; + and.pred %p58, %p54, %p55; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p59, %p52, %p58; + or.pred %p60, %p53, %p57; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p61, %r91, %r92; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p62, %p61, %p60; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p63, %p59, %p62; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r58, %r56, %r57, %p63; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r59, %r91, %r92, %p63; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p64, %r58, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p65, %r58, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p66, %r58, %r58; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p67, %r89, %r89; + setp.num.f32 %p68, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p69, %p66, %p68; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p70, %p64, %p69; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p71, %p67, %p66; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p72, %p65, %p71; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p73, %r59, %r87; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p74, %p73, %p72; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p75, %p70, %p74; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r60, %r58, %r89, %p75; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r61, %r59, %r87, %p75; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p76, %r60, %r90; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p77, %r60, %r90; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p78, %r60, %r60; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p79, %r90, %r90; + setp.num.f32 %p80, %r90, %r90; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p81, %p78, %p80; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p82, %p76, %p81; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p83, %p79, %p78; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p84, %p77, %p83; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p85, %r61, %r88; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p86, %p85, %p84; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p87, %p82, %p86; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r62, %r60, %r90, %p87; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r63, %r61, %r88, %p87; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r64, %r62, 4, 31, -1; + shfl.sync.bfly.b32 %r65, %r63, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p88, %r62, %r64; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p89, %r62, %r64; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p90, %r62, %r62; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p91, %r64, %r64; + setp.num.f32 %p92, %r64, %r64; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p93, %p90, %p92; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p94, %p88, %p93; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p95, %p90, %p91; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p96, %p89, %p95; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p97, %r63, %r65; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p98, %p97, %p96; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p99, %p94, %p98; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r66, %r62, %r64, %p99; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r67, %r63, %r65, %p99; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r68, %r66, 2, 31, -1; + shfl.sync.bfly.b32 %r69, %r67, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p100, %r66, %r68; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p101, %r66, %r68; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p102, %r66, %r66; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p103, %r68, %r68; + setp.num.f32 %p104, %r68, %r68; + .loc 2 149 27 // triton_helpers.py:149:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p105, %p102, %p104; + .loc 2 149 16 // triton_helpers.py:149:16 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p106, %p100, %p105; + .loc 2 151 27 // triton_helpers.py:151:27 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p107, %p103, %p102; + .loc 2 151 17 // triton_helpers.py:151:17 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p108, %p101, %p107; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p109, %r67, %r69; + .loc 2 154 21 // triton_helpers.py:154:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + and.pred %p110, %p109, %p108; + .loc 2 154 12 // triton_helpers.py:154:12 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + or.pred %p111, %p106, %p110; + .loc 2 155 35 // triton_helpers.py:155:35 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.f32 %r70, %r66, %r68, %p111; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r71, %r67, %r69, %p111; + .loc 2 165 42 // triton_helpers.py:165:42 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + shfl.sync.bfly.b32 %r72, %r70, 1, 31, -1; + shfl.sync.bfly.b32 %r73, %r71, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.gt.f32 %p112, %r70, %r72; + .loc 2 145 23 // triton_helpers.py:145:23 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.eq.f32 %p113, %r70, %r72; + .loc 2 147 29 // triton_helpers.py:147:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p114, %r70, %r70; + .loc 2 148 29 // triton_helpers.py:148:29 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.nan.f32 %p115, %r72, %r72; + .loc 2 154 31 // triton_helpers.py:154:31 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + setp.lt.s32 %p116, %r71, %r73; + .loc 2 155 69 // triton_helpers.py:155:69 @[ ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:46:75 ] + selp.b32 %r74, %r71, %r73, %p114; + selp.b32 %r75, %r74, %r73, %p115; + selp.b32 %r76, %r71, %r75, %p113; + selp.b32 %r77, %r76, %r73, %p116; + selp.b32 %r78, %r77, %r71, %p115; + selp.b32 %r79, %r78, %r77, %p114; + selp.b32 %r80, %r71, %r79, %p112; +$L__tmp6: + .loc 1 48 25 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:25 + mad.wide.s32 %rd28, %r55, 8, %rd8; + .loc 1 48 36 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:36 + shr.u32 %r81, %r3, 1; + mov.b32 %r82, global_smem; + add.s32 %r83, %r82, %r81; + st.shared.b32 [%r83], %r80; + bar.sync 0; + shl.b32 %r84, %r54, 2; + add.s32 %r85, %r82, %r84; + ld.shared.s32 %rd27, [%r85]; + and.b32 %r86, %r2, 112; + setp.eq.b32 %p51, %r86, 0; + // begin inline asm + @%p51 st.global.b64 [ %rd28 + 0 ], { %rd27 }; + // end inline asm + .loc 1 48 4 // ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py:48:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 101 +.b8 53 +.b8 114 +.b8 111 +.b8 97 +.b8 116 +.b8 115 +.b8 101 +.b8 122 +.b8 111 +.b8 100 +.b8 113 +.b8 102 +.b8 53 +.b8 113 +.b8 112 +.b8 100 +.b8 51 +.b8 50 +.b8 104 +.b8 107 +.b8 115 +.b8 112 +.b8 116 +.b8 52 +.b8 102 +.b8 99 +.b8 102 +.b8 105 +.b8 113 +.b8 111 +.b8 55 +.b8 106 +.b8 111 +.b8 98 +.b8 111 +.b8 109 +.b8 55 +.b8 112 +.b8 122 +.b8 104 +.b8 115 +.b8 102 +.b8 116 +.b8 117 +.b8 107 +.b8 50 +.b8 55 +.b8 103 +.b8 106 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 101 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..5fe60123df601854a2096e1a402553606ea38a53 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.source @@ -0,0 +1,315 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 16 : i32 loc(#loc77) + %xoffset_3 = arith.constant 16 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<16x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<16x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<16x32xi1> loc(#loc81) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<16x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<16x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<16x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<16x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<16x32xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<16x32xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c32_i32 = arith.constant 32 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c32_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<16x32xf32>, tensor<16x32xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x32xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x32xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<16x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<16x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %r0_index_21 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc92) + %tmp0_27 = tt.broadcast %tmp0_25 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc92) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<16x32xi32> loc(#loc92) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_31 = arith.constant dense<65760000> : tensor<16x1xi32> loc(#loc93) + %tmp0_32 = arith.muli %tmp0_31, %x1_15 : tensor<16x1xi32> loc(#loc93) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_28, %tmp0_33 : tensor<16x32xi32> loc(#loc94) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr> loc(#loc95) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc95) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_38 = tt.broadcast %r0_mask_22 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc96) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc96) + %tmp0_40 = tt.load %tmp0_36, %tmp0_38, %tmp0_39 evictionPolicy = evict_first : tensor<16x32x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S16_32S_i32S16_32S_fp32S16_32S_i32S1_32S__(%_tmp2_19, %_tmp2_index_20, %tmp0_40, %r0_index_21) : (tensor<16x32xf32>, tensor<16x32xi32>, tensor<16x32xf32>, tensor<1x32xi32>) -> (tensor<16x32xf32>, tensor<16x32xi32>) loc(#loc24) + %_tmp2_41 = tt.broadcast %r0_mask_22 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc97) + %_tmp2_42 = arith.select %_tmp2_41, %8#0, %_tmp2_19 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc97) + %_tmp2_index_43 = tt.broadcast %r0_mask_22 : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc98) + %_tmp2_index_44 = arith.select %_tmp2_index_43, %8#1, %_tmp2_index_20 : tensor<16x32xi1>, tensor<16x32xi32> loc(#loc98) + scf.yield %_tmp2_42, %_tmp2_index_44 : tensor<16x32xf32>, tensor<16x32xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S16_32S_i32S16_32S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<16x32xf32>, tensor<16x32xi32>) -> (tensor<16xf32>, tensor<16xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<16x1xi32> to tensor<16x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<16x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S16_32S_i32S16_32S_fp32S16_32S_i32S1_32S__(%a_value: tensor<16x32xf32> loc("a_value"(#loc33)), %a_index: tensor<16x32xi32> loc("a_index"(#loc33)), %b_value: tensor<16x32xf32> loc("b_value"(#loc33)), %b_index: tensor<1x32xi32> loc("b_index"(#loc33))) -> (tensor<16x32xf32>, tensor<16x32xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<16x32xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<16x32xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S16_32S__(%a_value) : (tensor<16x32xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<16x32xi1>, tensor<16x32xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<16x32xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<16x32xf32> loc(#loc107) + %mask_4 = arith.constant true loc(#loc108) + %mask_5 = arith.constant dense : tensor<16x32xi1> loc(#loc108) + %mask_6 = arith.xori %b_isnan, %mask_5 : tensor<16x32xi1> loc(#loc108) + %mask_7 = arith.andi %a_isnan, %mask_6 : tensor<16x32xi1> loc(#loc109) + %mask_8 = arith.ori %mask, %mask_7 : tensor<16x32xi1> loc(#loc123) + %equal_9 = arith.andi %a_isnan, %b_isnan : tensor<16x32xi1> loc(#loc111) + %equal_10 = arith.ori %equal, %equal_9 : tensor<16x32xi1> loc(#loc124) + scf.yield %mask_8, %equal_10 : tensor<16x32xi1>, tensor<16x32xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<16x32xi1>, tensor<16x32xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = tt.broadcast %b_index : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc113) + %mask_1 = arith.cmpi slt, %a_index, %mask_0 : tensor<16x32xi32> loc(#loc113) + %mask_2 = arith.andi %1#1, %mask_1 : tensor<16x32xi1> loc(#loc114) + %mask_3 = arith.ori %1#0, %mask_2 : tensor<16x32xi1> loc(#loc115) + %2 = arith.select %mask_3, %a_value, %b_value : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc49) + %3 = tt.broadcast %b_index : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc50) + %4 = arith.select %mask_3, %a_index, %3 : tensor<16x32xi1>, tensor<16x32xi32> loc(#loc50) + tt.return %2, %4 : tensor<16x32xf32>, tensor<16x32xi32> loc(#loc51) + ^bb1: // no predecessors + %5 = ub.poison : tensor<16x32xf32> loc(#loc52) + %6 = ub.poison : tensor<16x32xi32> loc(#loc52) + tt.return %5, %6 : tensor<16x32xf32>, tensor<16x32xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S16_32S__(%x: tensor<16x32xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S16_32S__(%x) : (tensor<16x32xf32>) -> tensor<16x32xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S16_32S__(%x: tensor<16x32xf32> loc("x"(#loc57))) -> tensor<16x32xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<16x32xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<16x32xf32> loc(#loc59) + tt.return %4 : tensor<16x32xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<16x32xf32> loc(#loc61) + tt.return %5 : tensor<16x32xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S16_32S_i32S16_32S__(2,)cconstexpr_1_"(%value: tensor<16x32xf32> loc("value"(#loc66)), %index: tensor<16x32xi32> loc("index"(#loc66))) -> (tensor<16xf32>, tensor<16xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<16x32xf32>, tensor<16x32xi32>) -> (tensor<16xf32>, tensor<16xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<16xf32>, tensor<16xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16xf32> loc(#loc69) + %2 = ub.poison : tensor<16xi32> loc(#loc69) + tt.return %1, %2 : tensor<16xf32>, tensor<16xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fbda09378bda34a9daae148284b4bcafe2000e85 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,203 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc42 = loc("in_ptr0"(#loc)) +#loc43 = loc("out_ptr0"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc37)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<65760000> : tensor<16x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<16x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x32xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<16x1xi32, #blocked> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<16x32xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_4 = arith.constant dense : tensor<16x32xi1, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2147483647> : tensor<16x32xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0xFF800000> : tensor<16x32xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_7 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_8 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %xindex_9 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc48) + %xindex_10 = tt.expand_dims %xindex_8 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<16x1xi32, #blocked1> loc(#loc48) + %xindex_11 = tt.splat %xoffset_7 : i32 -> tensor<16x1xi32, #blocked> loc(#loc49) + %xindex_12 = tt.splat %xoffset_7 : i32 -> tensor<16x1xi32, #blocked1> loc(#loc49) + %xindex_13 = arith.addi %xindex_11, %xindex_9 : tensor<16x1xi32, #blocked> loc(#loc49) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<16x1xi32, #blocked1> loc(#loc49) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc50) + %r0_base_15 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> loc(#loc50) + %x0 = arith.remsi %xindex_13, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc51) + %x1 = arith.divsi %xindex_13, %cst_2 : tensor<16x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x0, %cst_0 : tensor<16x1xi32, #blocked> loc(#loc53) + %tmp0_16 = tt.broadcast %tmp0 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc54) + %tmp0_17 = arith.muli %x1, %cst : tensor<16x1xi32, #blocked> loc(#loc55) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<16x1xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc56) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr, #blocked> loc(#loc57) + %_tmp2_index:2 = scf.for %_tmp2_index_20 = %c0_i32 to %c32000_i32 step %c32_i32 iter_args(%_tmp2 = %cst_6, %_tmp2_index_21 = %cst_5) -> (tensor<16x32xf32, #blocked>, tensor<16x32xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_20 : i32 -> tensor<1x32xi32, #blocked> loc(#loc59) + %r0_index_22 = arith.addi %r0_index, %r0_base_15 : tensor<1x32xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_22, %cst_1 : tensor<1x32xi32, #blocked> loc(#loc60) + %tmp0_23 = tt.broadcast %r0_index_22 : tensor<1x32xi32, #blocked> -> tensor<16x32xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.addi %tmp0_23, %tmp0_16 : tensor<16x32xi32, #blocked> loc(#loc54) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_18 : tensor<16x32xi32, #blocked> loc(#loc56) + %tmp0_26 = tt.addptr %tmp0_19, %tmp0_25 : tensor<16x32x!tt.ptr, #blocked>, tensor<16x32xi32, #blocked> loc(#loc57) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x32xi1, #blocked> -> tensor<16x32xi1, #blocked> loc(#loc61) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_3 evictionPolicy = evict_first : tensor<16x32x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_28 : tensor<16x32xf32, #blocked> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_28 : tensor<16x32xf32, #blocked> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<16x32xf32, #blocked> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_28, %tmp0_28 : tensor<16x32xf32, #blocked> loc(#loc85) + %mask_29 = arith.xori %b_isnan, %cst_4 : tensor<16x32xi1, #blocked> loc(#loc86) + %mask_30 = arith.andi %a_isnan, %mask_29 : tensor<16x32xi1, #blocked> loc(#loc87) + %mask_31 = arith.ori %mask, %mask_30 : tensor<16x32xi1, #blocked> loc(#loc106) + %equal_32 = arith.andi %a_isnan, %b_isnan : tensor<16x32xi1, #blocked> loc(#loc89) + %equal_33 = arith.ori %equal, %equal_32 : tensor<16x32xi1, #blocked> loc(#loc107) + %mask_34 = arith.cmpi slt, %_tmp2_index_21, %tmp0_23 : tensor<16x32xi32, #blocked> loc(#loc91) + %mask_35 = arith.andi %equal_33, %mask_34 : tensor<16x32xi1, #blocked> loc(#loc92) + %mask_36 = arith.ori %mask_31, %mask_35 : tensor<16x32xi1, #blocked> loc(#loc93) + %5 = arith.select %mask_36, %_tmp2, %tmp0_28 : tensor<16x32xi1, #blocked>, tensor<16x32xf32, #blocked> loc(#loc74) + %6 = arith.select %mask_36, %_tmp2_index_21, %tmp0_23 : tensor<16x32xi1, #blocked>, tensor<16x32xi32, #blocked> loc(#loc75) + %_tmp2_37 = arith.select %tmp0_27, %5, %_tmp2 : tensor<16x32xi1, #blocked>, tensor<16x32xf32, #blocked> loc(#loc76) + %_tmp2_index_38 = arith.select %tmp0_27, %6, %_tmp2_index_21 : tensor<16x32xi1, #blocked>, tensor<16x32xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_37, %_tmp2_index_38 : tensor<16x32xf32, #blocked>, tensor<16x32xi32, #blocked> loc(#loc35) + } loc(#loc81) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc37)), %arg5: i32 loc(callsite(#loc1 at #loc37)), %arg6: f32 loc(callsite(#loc1 at #loc37)), %arg7: i32 loc(callsite(#loc1 at #loc37))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_20 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_21 = arith.andi %a_isnan, %mask_20 : i1 loc(#loc97) + %mask_22 = arith.ori %mask, %mask_21 : i1 loc(#loc110) + %equal_23 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : i1 loc(#loc111) + %mask_25 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_26 = arith.andi %equal_24, %mask_25 : i1 loc(#loc100) + %mask_27 = arith.ori %mask_22, %mask_26 : i1 loc(#loc101) + %5 = arith.select %mask_27, %arg4, %arg6 : f32 loc(#loc102) + %6 = arith.select %mask_27, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %5, %6 : f32, i32 loc(#loc78) + }) : (tensor<16x32xf32, #blocked>, tensor<16x32xi32, #blocked>) -> (tensor<16xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<16xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> loc(#loc80) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked1> loc(#loc39) + %2 = tt.addptr %1, %xindex_14 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> loc(#loc39) + %3 = ttg.convert_layout %tmp2 : tensor<16x1xi32, #blocked> -> tensor<16x1xi32, #blocked1> loc(#loc40) + %4 = arith.extsi %3 : tensor<16x1xi32, #blocked1> to tensor<16x1xi64, #blocked1> loc(#loc40) + tt.store %2, %4 : tensor<16x1x!tt.ptr, #blocked1> loc(#loc40) + tt.return loc(#loc41) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("r0_base"(#loc6)) +#loc51 = loc("x0"(#loc7)) +#loc52 = loc("x1"(#loc8)) +#loc53 = loc("tmp0"(#loc9)) +#loc54 = loc("tmp0"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("_tmp2"(#loc14)) +#loc59 = loc("r0_index"(#loc15)) +#loc60 = loc("r0_mask"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("mask"(#loc18)) +#loc63 = loc("equal"(#loc20)) +#loc64 = loc("a_isnan"(#loc21)) +#loc65 = loc("b_isnan"(#loc22)) +#loc66 = loc("mask"(#loc23)) +#loc67 = loc("mask"(#loc24)) +#loc68 = loc("mask"(#loc25)) +#loc69 = loc("equal"(#loc26)) +#loc70 = loc("equal"(#loc27)) +#loc71 = loc("mask"(#loc28)) +#loc72 = loc("mask"(#loc29)) +#loc73 = loc("mask"(#loc30)) +#loc74 = loc(callsite(#loc31 at #loc19)) +#loc75 = loc(callsite(#loc32 at #loc19)) +#loc76 = loc("_tmp2"(#loc33)) +#loc77 = loc("_tmp2_index"(#loc34)) +#loc78 = loc(callsite(#loc36 at #loc37)) +#loc80 = loc("tmp2"(#loc38)) +#loc81 = loc("_tmp2_index"(#loc58)) +#loc82 = loc("mask"(#loc62)) +#loc83 = loc("equal"(#loc63)) +#loc84 = loc(callsite(#loc64 at #loc19)) +#loc85 = loc(callsite(#loc65 at #loc19)) +#loc86 = loc(callsite(#loc66 at #loc19)) +#loc87 = loc(callsite(#loc67 at #loc19)) +#loc88 = loc("mask"(#loc68)) +#loc89 = loc(callsite(#loc69 at #loc19)) +#loc90 = loc("equal"(#loc70)) +#loc91 = loc(callsite(#loc71 at #loc19)) +#loc92 = loc(callsite(#loc72 at #loc19)) +#loc93 = loc(callsite(#loc73 at #loc19)) +#loc94 = loc(callsite(#loc64 at #loc78)) +#loc95 = loc(callsite(#loc65 at #loc78)) +#loc96 = loc(callsite(#loc66 at #loc78)) +#loc97 = loc(callsite(#loc67 at #loc78)) +#loc98 = loc(callsite(#loc69 at #loc78)) +#loc99 = loc(callsite(#loc71 at #loc78)) +#loc100 = loc(callsite(#loc72 at #loc78)) +#loc101 = loc(callsite(#loc73 at #loc78)) +#loc102 = loc(callsite(#loc31 at #loc78)) +#loc103 = loc(callsite(#loc32 at #loc78)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc78)) +#loc109 = loc(callsite(#loc83 at #loc78)) +#loc110 = loc(callsite(#loc88 at #loc78)) +#loc111 = loc(callsite(#loc90 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..925160e8e75fffc07342f8bca6205a9f9e7725c9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/TPWYZLHTZRN4VJOENMIFGEZAFFCCAG67DUWXCRI3HIQAWPXXGEWA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,204 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc46 = loc("in_ptr0"(#loc)) +#loc47 = loc("out_ptr0"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<16x32xi1> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x32xf32> loc(#loc1) + %cst_1 = arith.constant dense<65760000> : tensor<16x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<16x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<32000> : tensor<1x32xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<16x32xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<16x32xf32> loc(#loc52) + %cst_4 = arith.constant dense<2048> : tensor<16x1xi32> loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xoffset_5 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc54) + %xindex = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc55) + %xindex_6 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc56) + %xindex_7 = tt.splat %xoffset_5 : i32 -> tensor<16x1xi32> loc(#loc57) + %xindex_8 = arith.addi %xindex_7, %xindex_6 : tensor<16x1xi32> loc(#loc57) + %r0_base = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc58) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32> loc(#loc59) + %x0 = arith.remsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc60) + %x1 = arith.divsi %xindex_8, %cst_4 : tensor<16x1xi32> loc(#loc61) + %_tmp2_index_10:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c32_i32 iter_args(%_tmp2_11 = %_tmp2, %_tmp2_index_12 = %_tmp2_index) -> (tensor<16x32xf32>, tensor<16x32xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32xi32> loc(#loc63) + %r0_index_13 = arith.addi %r0_index, %r0_base_9 : tensor<1x32xi32> loc(#loc63) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_3 : tensor<1x32xi32> loc(#loc64) + %tmp0 = arith.muli %x0, %cst_2 : tensor<16x1xi32> loc(#loc65) + %tmp0_14 = tt.broadcast %r0_index_13 : tensor<1x32xi32> -> tensor<16x32xi32> loc(#loc66) + %tmp0_15 = tt.broadcast %tmp0 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc66) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<16x32xi32> loc(#loc66) + %tmp0_17 = arith.muli %x1, %cst_1 : tensor<16x1xi32> loc(#loc67) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<16x1xi32> -> tensor<16x32xi32> loc(#loc68) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<16x32xi32> loc(#loc68) + %tmp0_20 = tt.splat %in_ptr0 : !tt.ptr -> tensor<16x32x!tt.ptr> loc(#loc69) + %tmp0_21 = tt.addptr %tmp0_20, %tmp0_19 : tensor<16x32x!tt.ptr>, tensor<16x32xi32> loc(#loc69) + %tmp0_22 = tt.broadcast %r0_mask : tensor<1x32xi1> -> tensor<16x32xi1> loc(#loc70) + %tmp0_23 = tt.load %tmp0_21, %tmp0_22, %cst_0 evictionPolicy = evict_first : tensor<16x32x!tt.ptr> loc(#loc70) + %mask = arith.cmpf ogt, %_tmp2_11, %tmp0_23 : tensor<16x32xf32> loc(#loc112) + %equal = arith.cmpf oeq, %_tmp2_11, %tmp0_23 : tensor<16x32xf32> loc(#loc113) + %a_isnan = arith.cmpf une, %_tmp2_11, %_tmp2_11 : tensor<16x32xf32> loc(#loc92) + %b_isnan = arith.cmpf une, %tmp0_23, %tmp0_23 : tensor<16x32xf32> loc(#loc93) + %mask_24 = arith.xori %b_isnan, %cst : tensor<16x32xi1> loc(#loc94) + %mask_25 = arith.andi %a_isnan, %mask_24 : tensor<16x32xi1> loc(#loc95) + %mask_26 = arith.ori %mask, %mask_25 : tensor<16x32xi1> loc(#loc114) + %equal_27 = arith.andi %a_isnan, %b_isnan : tensor<16x32xi1> loc(#loc97) + %equal_28 = arith.ori %equal, %equal_27 : tensor<16x32xi1> loc(#loc115) + %mask_29 = arith.cmpi slt, %_tmp2_index_12, %tmp0_14 : tensor<16x32xi32> loc(#loc99) + %mask_30 = arith.andi %equal_28, %mask_29 : tensor<16x32xi1> loc(#loc100) + %mask_31 = arith.ori %mask_26, %mask_30 : tensor<16x32xi1> loc(#loc101) + %4 = arith.select %mask_31, %_tmp2_11, %tmp0_23 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc83) + %5 = arith.select %mask_31, %_tmp2_index_12, %tmp0_14 : tensor<16x32xi1>, tensor<16x32xi32> loc(#loc84) + %_tmp2_32 = arith.select %tmp0_22, %4, %_tmp2_11 : tensor<16x32xi1>, tensor<16x32xf32> loc(#loc85) + %_tmp2_index_33 = arith.select %tmp0_22, %5, %_tmp2_index_12 : tensor<16x32xi1>, tensor<16x32xi32> loc(#loc86) + scf.yield %_tmp2_32, %_tmp2_index_33 : tensor<16x32xf32>, tensor<16x32xi32> loc(#loc40) + } loc(#loc89) + %0:2 = "tt.reduce"(%_tmp2_index_10#0, %_tmp2_index_10#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc116) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc117) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc102) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc104) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc105) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc118) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc106) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc119) + %mask_16 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc107) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc108) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc109) + %4 = arith.select %mask_18, %arg4, %arg6 : f32 loc(#loc110) + %5 = arith.select %mask_18, %arg5, %arg7 : i32 loc(#loc111) + tt.reduce.return %4, %5 : f32, i32 loc(#loc87) + }) : (tensor<16x32xf32>, tensor<16x32xi32>) -> (tensor<16xf32>, tensor<16xi32>) loc(#loc87) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<16xi32> -> tensor<16x1xi32> loc(#loc88) + %1 = tt.splat %out_ptr0 : !tt.ptr -> tensor<16x1x!tt.ptr> loc(#loc43) + %2 = tt.addptr %1, %xindex_8 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> loc(#loc43) + %3 = arith.extsi %tmp2 : tensor<16x1xi32> to tensor<16x1xi64> loc(#loc44) + tt.store %2, %3 : tensor<16x1x!tt.ptr> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xoffset"(#loc7)) +#loc55 = loc("xindex"(#loc8)) +#loc56 = loc("xindex"(#loc9)) +#loc57 = loc("xindex"(#loc10)) +#loc58 = loc("r0_base"(#loc11)) +#loc59 = loc("r0_base"(#loc12)) +#loc60 = loc("x0"(#loc13)) +#loc61 = loc("x1"(#loc14)) +#loc62 = loc("_tmp2"(#loc3)) +#loc63 = loc("r0_index"(#loc15)) +#loc64 = loc("r0_mask"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("tmp0"(#loc21)) +#loc70 = loc("tmp0"(#loc22)) +#loc71 = loc("mask"(#loc23)) +#loc72 = loc("equal"(#loc25)) +#loc73 = loc("a_isnan"(#loc26)) +#loc74 = loc("b_isnan"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("mask"(#loc29)) +#loc77 = loc("mask"(#loc30)) +#loc78 = loc("equal"(#loc31)) +#loc79 = loc("equal"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc("mask"(#loc34)) +#loc82 = loc("mask"(#loc35)) +#loc83 = loc(callsite(#loc36 at #loc24)) +#loc84 = loc(callsite(#loc37 at #loc24)) +#loc85 = loc("_tmp2"(#loc38)) +#loc86 = loc("_tmp2_index"(#loc39)) +#loc87 = loc(callsite(#loc41 at #loc2)) +#loc88 = loc("tmp2"(#loc42)) +#loc89 = loc("_tmp2_index"(#loc62)) +#loc90 = loc("mask"(#loc71)) +#loc91 = loc("equal"(#loc72)) +#loc92 = loc(callsite(#loc73 at #loc24)) +#loc93 = loc(callsite(#loc74 at #loc24)) +#loc94 = loc(callsite(#loc75 at #loc24)) +#loc95 = loc(callsite(#loc76 at #loc24)) +#loc96 = loc("mask"(#loc77)) +#loc97 = loc(callsite(#loc78 at #loc24)) +#loc98 = loc("equal"(#loc79)) +#loc99 = loc(callsite(#loc80 at #loc24)) +#loc100 = loc(callsite(#loc81 at #loc24)) +#loc101 = loc(callsite(#loc82 at #loc24)) +#loc102 = loc(callsite(#loc73 at #loc87)) +#loc103 = loc(callsite(#loc74 at #loc87)) +#loc104 = loc(callsite(#loc75 at #loc87)) +#loc105 = loc(callsite(#loc76 at #loc87)) +#loc106 = loc(callsite(#loc78 at #loc87)) +#loc107 = loc(callsite(#loc80 at #loc87)) +#loc108 = loc(callsite(#loc81 at #loc87)) +#loc109 = loc(callsite(#loc82 at #loc87)) +#loc110 = loc(callsite(#loc36 at #loc87)) +#loc111 = loc(callsite(#loc37 at #loc87)) +#loc112 = loc(callsite(#loc90 at #loc24)) +#loc113 = loc(callsite(#loc91 at #loc24)) +#loc114 = loc(callsite(#loc96 at #loc24)) +#loc115 = loc(callsite(#loc98 at #loc24)) +#loc116 = loc(callsite(#loc90 at #loc87)) +#loc117 = loc(callsite(#loc91 at #loc87)) +#loc118 = loc(callsite(#loc96 at #loc87)) +#loc119 = loc(callsite(#loc98 at #loc87)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c7477eab83f4716b7bc66dc0f0416c85787565a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/__grp__triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin", "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d1998b94401088fcd577a463027102b05f788f53 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3d3ee6e91e8eccb84cb19b06b03381347aea04c7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.json @@ -0,0 +1 @@ +{"hash": "a019c20703a0e1ee031d76756df2ddcbc15ab877a7ab6924dc3e024258b322c6", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..22c73025db71c89e5b25dc155730ea8d721e5e56 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.llir @@ -0,0 +1,424 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = icmp samesign ult i32 %7, 2048, !dbg !8 + %9 = lshr i32 %7, 8, !dbg !9 + %10 = zext nneg i32 %9 to i64, !dbg !10 + %11 = getelementptr i64, ptr addrspace(1) %0, i64 %10, !dbg !10 + %12 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !11 + %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %11, i64 %12, i1 %8) #5, !dbg !11 + %14 = insertelement <4 x i64> poison, i64 %13, i64 0, !dbg !12 + %15 = shufflevector <4 x i64> %14, <4 x i64> poison, <4 x i32> zeroinitializer, !dbg !12 + %16 = shl i32 %7, 7, !dbg !13 + %17 = and i32 %16, 1920, !dbg !13 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !14 + %19 = and i32 %18, 127 + %20 = or disjoint i32 %17, %19 + %21 = zext nneg i32 %20 to i64 + %22 = icmp sgt i64 %13, %21 + %23 = insertelement <4 x i1> poison, i1 %22, i64 0, !dbg !15 + %24 = shufflevector <4 x i1> %23, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !15 + %25 = insertelement <4 x i1> poison, i1 %8, i64 0, !dbg !16 + %26 = shufflevector <4 x i1> %25, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !16 + %27 = zext nneg i32 %20 to i64, !dbg !17 + %28 = shl i32 %7, 3, !dbg !18 + %29 = and i32 %28, 1920, !dbg !18 + %30 = zext nneg i32 %29 to i64, !dbg !17 + %31 = and i32 %18, 511, !dbg !14 + %32 = zext nneg i32 %31 to i64, !dbg !17 + %33 = lshr i64 %32, 7, !dbg !19 + %34 = lshr i32 %18, 7, !dbg !19 + %35 = or disjoint i64 %33, %30, !dbg !20 + %36 = or disjoint i32 %29, %34, !dbg !20 + %37 = or i32 %36, 4, !dbg !20 + %38 = or disjoint i64 %35, 8, !dbg !20 + %39 = or i32 %36, 12, !dbg !20 + %40 = icmp samesign uge i64 %35, %27, !dbg !21 + %41 = icmp samesign uge i32 %37, %20, !dbg !21 + %42 = icmp samesign uge i64 %38, %27, !dbg !21 + %43 = icmp samesign uge i32 %39, %20, !dbg !21 + %44 = zext nneg i32 %37 to i64, !dbg !12 + %45 = zext nneg i32 %39 to i64, !dbg !12 + %46 = insertelement <4 x i64> poison, i64 %35, i64 0, !dbg !12 + %47 = insertelement <4 x i64> %46, i64 %44, i64 1, !dbg !12 + %48 = insertelement <4 x i64> %47, i64 %38, i64 2, !dbg !12 + %49 = insertelement <4 x i64> %48, i64 %45, i64 3, !dbg !12 + %50 = icmp sgt <4 x i64> %15, %49, !dbg !12 + %51 = insertelement <4 x i1> poison, i1 %40, i64 0, !dbg !15 + %52 = insertelement <4 x i1> %51, i1 %41, i64 1, !dbg !15 + %53 = insertelement <4 x i1> %52, i1 %42, i64 2, !dbg !15 + %54 = insertelement <4 x i1> %53, i1 %43, i64 3, !dbg !15 + %55 = and <4 x i1> %54, %50, !dbg !15 + %56 = and <4 x i1> %24, %55, !dbg !15 + %57 = select <4 x i1> %26, <4 x i1> %56, <4 x i1> zeroinitializer, !dbg !16 + %58 = zext <4 x i1> %57 to <4 x i64>, !dbg !16 + %59 = lshr i64 %32, 7, !dbg !19 + %60 = lshr i32 %18, 7, !dbg !19 + %61 = or disjoint i32 %60, 16, !dbg !19 + %62 = or disjoint i64 %59, %30, !dbg !20 + %63 = or disjoint i64 %62, 16, !dbg !20 + %64 = or disjoint i32 %29, %61, !dbg !20 + %65 = or i32 %64, 4, !dbg !20 + %66 = or disjoint i64 %62, 24, !dbg !20 + %67 = or i32 %64, 12, !dbg !20 + %68 = icmp samesign uge i64 %63, %27, !dbg !21 + %69 = icmp samesign uge i32 %65, %20, !dbg !21 + %70 = icmp samesign uge i64 %66, %27, !dbg !21 + %71 = icmp samesign uge i32 %67, %20, !dbg !21 + %72 = zext nneg i32 %65 to i64, !dbg !12 + %73 = zext nneg i32 %67 to i64, !dbg !12 + %74 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !12 + %75 = insertelement <4 x i64> %74, i64 %72, i64 1, !dbg !12 + %76 = insertelement <4 x i64> %75, i64 %66, i64 2, !dbg !12 + %77 = insertelement <4 x i64> %76, i64 %73, i64 3, !dbg !12 + %78 = icmp sgt <4 x i64> %15, %77, !dbg !12 + %79 = insertelement <4 x i1> poison, i1 %68, i64 0, !dbg !15 + %80 = insertelement <4 x i1> %79, i1 %69, i64 1, !dbg !15 + %81 = insertelement <4 x i1> %80, i1 %70, i64 2, !dbg !15 + %82 = insertelement <4 x i1> %81, i1 %71, i64 3, !dbg !15 + %83 = and <4 x i1> %82, %78, !dbg !15 + %84 = and <4 x i1> %24, %83, !dbg !15 + %85 = select <4 x i1> %26, <4 x i1> %84, <4 x i1> zeroinitializer, !dbg !16 + %86 = zext <4 x i1> %85 to <4 x i64>, !dbg !16 + %87 = add nuw nsw <4 x i64> %58, %86, !dbg !16 + %88 = lshr i64 %32, 7, !dbg !19 + %89 = lshr i32 %18, 7, !dbg !19 + %90 = or disjoint i32 %89, 32, !dbg !19 + %91 = or disjoint i64 %88, %30, !dbg !20 + %92 = or disjoint i64 %91, 32, !dbg !20 + %93 = or disjoint i32 %29, %90, !dbg !20 + %94 = or i32 %93, 4, !dbg !20 + %95 = or disjoint i64 %91, 40, !dbg !20 + %96 = or i32 %93, 12, !dbg !20 + %97 = icmp samesign uge i64 %92, %27, !dbg !21 + %98 = icmp samesign uge i32 %94, %20, !dbg !21 + %99 = icmp samesign uge i64 %95, %27, !dbg !21 + %100 = icmp samesign uge i32 %96, %20, !dbg !21 + %101 = zext nneg i32 %94 to i64, !dbg !12 + %102 = zext nneg i32 %96 to i64, !dbg !12 + %103 = insertelement <4 x i64> poison, i64 %92, i64 0, !dbg !12 + %104 = insertelement <4 x i64> %103, i64 %101, i64 1, !dbg !12 + %105 = insertelement <4 x i64> %104, i64 %95, i64 2, !dbg !12 + %106 = insertelement <4 x i64> %105, i64 %102, i64 3, !dbg !12 + %107 = icmp sgt <4 x i64> %15, %106, !dbg !12 + %108 = insertelement <4 x i1> poison, i1 %97, i64 0, !dbg !15 + %109 = insertelement <4 x i1> %108, i1 %98, i64 1, !dbg !15 + %110 = insertelement <4 x i1> %109, i1 %99, i64 2, !dbg !15 + %111 = insertelement <4 x i1> %110, i1 %100, i64 3, !dbg !15 + %112 = and <4 x i1> %111, %107, !dbg !15 + %113 = and <4 x i1> %24, %112, !dbg !15 + %114 = select <4 x i1> %26, <4 x i1> %113, <4 x i1> zeroinitializer, !dbg !16 + %115 = zext <4 x i1> %114 to <4 x i64>, !dbg !16 + %116 = add nuw nsw <4 x i64> %87, %115, !dbg !16 + %117 = lshr i64 %32, 7, !dbg !19 + %118 = lshr i32 %18, 7, !dbg !19 + %119 = or disjoint i32 %118, 48, !dbg !19 + %120 = or disjoint i64 %117, %30, !dbg !20 + %121 = or disjoint i64 %120, 48, !dbg !20 + %122 = or disjoint i32 %29, %119, !dbg !20 + %123 = or i32 %122, 4, !dbg !20 + %124 = or disjoint i64 %120, 56, !dbg !20 + %125 = or i32 %122, 12, !dbg !20 + %126 = icmp samesign uge i64 %121, %27, !dbg !21 + %127 = icmp samesign uge i32 %123, %20, !dbg !21 + %128 = icmp samesign uge i64 %124, %27, !dbg !21 + %129 = icmp samesign uge i32 %125, %20, !dbg !21 + %130 = zext nneg i32 %123 to i64, !dbg !12 + %131 = zext nneg i32 %125 to i64, !dbg !12 + %132 = insertelement <4 x i64> poison, i64 %121, i64 0, !dbg !12 + %133 = insertelement <4 x i64> %132, i64 %130, i64 1, !dbg !12 + %134 = insertelement <4 x i64> %133, i64 %124, i64 2, !dbg !12 + %135 = insertelement <4 x i64> %134, i64 %131, i64 3, !dbg !12 + %136 = icmp sgt <4 x i64> %15, %135, !dbg !12 + %137 = insertelement <4 x i1> poison, i1 %126, i64 0, !dbg !15 + %138 = insertelement <4 x i1> %137, i1 %127, i64 1, !dbg !15 + %139 = insertelement <4 x i1> %138, i1 %128, i64 2, !dbg !15 + %140 = insertelement <4 x i1> %139, i1 %129, i64 3, !dbg !15 + %141 = and <4 x i1> %140, %136, !dbg !15 + %142 = and <4 x i1> %24, %141, !dbg !15 + %143 = select <4 x i1> %26, <4 x i1> %142, <4 x i1> zeroinitializer, !dbg !16 + %144 = zext <4 x i1> %143 to <4 x i64>, !dbg !16 + %145 = add nuw nsw <4 x i64> %116, %144, !dbg !16 + %146 = lshr i64 %32, 7, !dbg !19 + %147 = lshr i32 %18, 7, !dbg !19 + %148 = or disjoint i32 %147, 64, !dbg !19 + %149 = or disjoint i64 %146, %30, !dbg !20 + %150 = or disjoint i64 %149, 64, !dbg !20 + %151 = or disjoint i32 %29, %148, !dbg !20 + %152 = or i32 %151, 4, !dbg !20 + %153 = or disjoint i64 %149, 72, !dbg !20 + %154 = or i32 %151, 12, !dbg !20 + %155 = icmp samesign uge i64 %150, %27, !dbg !21 + %156 = icmp samesign uge i32 %152, %20, !dbg !21 + %157 = icmp samesign uge i64 %153, %27, !dbg !21 + %158 = icmp samesign uge i32 %154, %20, !dbg !21 + %159 = zext nneg i32 %152 to i64, !dbg !12 + %160 = zext nneg i32 %154 to i64, !dbg !12 + %161 = insertelement <4 x i64> poison, i64 %150, i64 0, !dbg !12 + %162 = insertelement <4 x i64> %161, i64 %159, i64 1, !dbg !12 + %163 = insertelement <4 x i64> %162, i64 %153, i64 2, !dbg !12 + %164 = insertelement <4 x i64> %163, i64 %160, i64 3, !dbg !12 + %165 = icmp sgt <4 x i64> %15, %164, !dbg !12 + %166 = insertelement <4 x i1> poison, i1 %155, i64 0, !dbg !15 + %167 = insertelement <4 x i1> %166, i1 %156, i64 1, !dbg !15 + %168 = insertelement <4 x i1> %167, i1 %157, i64 2, !dbg !15 + %169 = insertelement <4 x i1> %168, i1 %158, i64 3, !dbg !15 + %170 = and <4 x i1> %169, %165, !dbg !15 + %171 = and <4 x i1> %24, %170, !dbg !15 + %172 = select <4 x i1> %26, <4 x i1> %171, <4 x i1> zeroinitializer, !dbg !16 + %173 = zext <4 x i1> %172 to <4 x i64>, !dbg !16 + %174 = add nuw nsw <4 x i64> %145, %173, !dbg !16 + %175 = lshr i64 %32, 7, !dbg !19 + %176 = lshr i32 %18, 7, !dbg !19 + %177 = or disjoint i32 %176, 80, !dbg !19 + %178 = or disjoint i64 %175, %30, !dbg !20 + %179 = or disjoint i64 %178, 80, !dbg !20 + %180 = or disjoint i32 %29, %177, !dbg !20 + %181 = or i32 %180, 4, !dbg !20 + %182 = or disjoint i64 %178, 88, !dbg !20 + %183 = or i32 %180, 12, !dbg !20 + %184 = icmp samesign uge i64 %179, %27, !dbg !21 + %185 = icmp samesign uge i32 %181, %20, !dbg !21 + %186 = icmp samesign uge i64 %182, %27, !dbg !21 + %187 = icmp samesign uge i32 %183, %20, !dbg !21 + %188 = zext nneg i32 %181 to i64, !dbg !12 + %189 = zext nneg i32 %183 to i64, !dbg !12 + %190 = insertelement <4 x i64> poison, i64 %179, i64 0, !dbg !12 + %191 = insertelement <4 x i64> %190, i64 %188, i64 1, !dbg !12 + %192 = insertelement <4 x i64> %191, i64 %182, i64 2, !dbg !12 + %193 = insertelement <4 x i64> %192, i64 %189, i64 3, !dbg !12 + %194 = icmp sgt <4 x i64> %15, %193, !dbg !12 + %195 = insertelement <4 x i1> poison, i1 %184, i64 0, !dbg !15 + %196 = insertelement <4 x i1> %195, i1 %185, i64 1, !dbg !15 + %197 = insertelement <4 x i1> %196, i1 %186, i64 2, !dbg !15 + %198 = insertelement <4 x i1> %197, i1 %187, i64 3, !dbg !15 + %199 = and <4 x i1> %198, %194, !dbg !15 + %200 = and <4 x i1> %24, %199, !dbg !15 + %201 = select <4 x i1> %26, <4 x i1> %200, <4 x i1> zeroinitializer, !dbg !16 + %202 = zext <4 x i1> %201 to <4 x i64>, !dbg !16 + %203 = add nuw nsw <4 x i64> %174, %202, !dbg !16 + %204 = lshr i64 %32, 7, !dbg !19 + %205 = lshr i32 %18, 7, !dbg !19 + %206 = or disjoint i32 %205, 96, !dbg !19 + %207 = or disjoint i64 %204, %30, !dbg !20 + %208 = or disjoint i64 %207, 96, !dbg !20 + %209 = or disjoint i32 %29, %206, !dbg !20 + %210 = or i32 %209, 4, !dbg !20 + %211 = or disjoint i64 %207, 104, !dbg !20 + %212 = or i32 %209, 12, !dbg !20 + %213 = icmp samesign uge i64 %208, %27, !dbg !21 + %214 = icmp samesign uge i32 %210, %20, !dbg !21 + %215 = icmp samesign uge i64 %211, %27, !dbg !21 + %216 = icmp samesign uge i32 %212, %20, !dbg !21 + %217 = zext nneg i32 %210 to i64, !dbg !12 + %218 = zext nneg i32 %212 to i64, !dbg !12 + %219 = insertelement <4 x i64> poison, i64 %208, i64 0, !dbg !12 + %220 = insertelement <4 x i64> %219, i64 %217, i64 1, !dbg !12 + %221 = insertelement <4 x i64> %220, i64 %211, i64 2, !dbg !12 + %222 = insertelement <4 x i64> %221, i64 %218, i64 3, !dbg !12 + %223 = icmp sgt <4 x i64> %15, %222, !dbg !12 + %224 = insertelement <4 x i1> poison, i1 %213, i64 0, !dbg !15 + %225 = insertelement <4 x i1> %224, i1 %214, i64 1, !dbg !15 + %226 = insertelement <4 x i1> %225, i1 %215, i64 2, !dbg !15 + %227 = insertelement <4 x i1> %226, i1 %216, i64 3, !dbg !15 + %228 = and <4 x i1> %227, %223, !dbg !15 + %229 = and <4 x i1> %24, %228, !dbg !15 + %230 = select <4 x i1> %26, <4 x i1> %229, <4 x i1> zeroinitializer, !dbg !16 + %231 = zext <4 x i1> %230 to <4 x i64>, !dbg !16 + %232 = add nuw nsw <4 x i64> %203, %231, !dbg !16 + %233 = lshr i64 %32, 7, !dbg !19 + %234 = lshr i32 %18, 7, !dbg !19 + %235 = or disjoint i32 %234, 112, !dbg !19 + %236 = or disjoint i64 %233, %30, !dbg !20 + %237 = or disjoint i64 %236, 112, !dbg !20 + %238 = or disjoint i32 %29, %235, !dbg !20 + %239 = or i32 %238, 4, !dbg !20 + %240 = or disjoint i64 %236, 120, !dbg !20 + %241 = or i32 %238, 12, !dbg !20 + %242 = icmp samesign uge i64 %237, %27, !dbg !21 + %243 = icmp samesign uge i32 %239, %20, !dbg !21 + %244 = icmp samesign uge i64 %240, %27, !dbg !21 + %245 = icmp samesign uge i32 %241, %20, !dbg !21 + %246 = zext nneg i32 %239 to i64, !dbg !12 + %247 = zext nneg i32 %241 to i64, !dbg !12 + %248 = insertelement <4 x i64> poison, i64 %237, i64 0, !dbg !12 + %249 = insertelement <4 x i64> %248, i64 %246, i64 1, !dbg !12 + %250 = insertelement <4 x i64> %249, i64 %240, i64 2, !dbg !12 + %251 = insertelement <4 x i64> %250, i64 %247, i64 3, !dbg !12 + %252 = icmp sgt <4 x i64> %15, %251, !dbg !12 + %253 = insertelement <4 x i1> poison, i1 %242, i64 0, !dbg !15 + %254 = insertelement <4 x i1> %253, i1 %243, i64 1, !dbg !15 + %255 = insertelement <4 x i1> %254, i1 %244, i64 2, !dbg !15 + %256 = insertelement <4 x i1> %255, i1 %245, i64 3, !dbg !15 + %257 = and <4 x i1> %256, %252, !dbg !15 + %258 = and <4 x i1> %24, %257, !dbg !15 + %259 = select <4 x i1> %26, <4 x i1> %258, <4 x i1> zeroinitializer, !dbg !16 + %260 = zext <4 x i1> %259 to <4 x i64>, !dbg !16 + %261 = add <4 x i64> %232, %260, !dbg !16 + %262 = and i32 %18, 31, !dbg !14 + %263 = lshr i32 %18, 5, !dbg !14 + %264 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %261), !dbg !22 + %extelt.offset = lshr i64 %264, 32, !dbg !26 + %265 = trunc nuw i64 %extelt.offset to i32, !dbg !26 + %266 = trunc i64 %264 to i32, !dbg !26 + %267 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %266, i32 16, i32 31), !dbg !26 + %268 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 16, i32 31), !dbg !26 + %269 = insertelement <2 x i32> poison, i32 %267, i64 0, !dbg !26 + %270 = insertelement <2 x i32> %269, i32 %268, i64 1, !dbg !26 + %271 = bitcast <2 x i32> %270 to i64, !dbg !26 + %272 = add i64 %264, %271, !dbg !22 + %extelt.offset1 = lshr i64 %272, 32, !dbg !26 + %273 = trunc nuw i64 %extelt.offset1 to i32, !dbg !26 + %274 = trunc i64 %272 to i32, !dbg !26 + %275 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %274, i32 8, i32 31), !dbg !26 + %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 8, i32 31), !dbg !26 + %277 = insertelement <2 x i32> poison, i32 %275, i64 0, !dbg !26 + %278 = insertelement <2 x i32> %277, i32 %276, i64 1, !dbg !26 + %279 = bitcast <2 x i32> %278 to i64, !dbg !26 + %280 = add i64 %272, %279, !dbg !22 + %extelt.offset2 = lshr i64 %280, 32, !dbg !26 + %281 = trunc nuw i64 %extelt.offset2 to i32, !dbg !26 + %282 = trunc i64 %280 to i32, !dbg !26 + %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 4, i32 31), !dbg !26 + %284 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 4, i32 31), !dbg !26 + %285 = insertelement <2 x i32> poison, i32 %283, i64 0, !dbg !26 + %286 = insertelement <2 x i32> %285, i32 %284, i64 1, !dbg !26 + %287 = bitcast <2 x i32> %286 to i64, !dbg !26 + %288 = add i64 %280, %287, !dbg !22 + %extelt.offset3 = lshr i64 %288, 32, !dbg !26 + %289 = trunc nuw i64 %extelt.offset3 to i32, !dbg !26 + %290 = trunc i64 %288 to i32, !dbg !26 + %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 2, i32 31), !dbg !26 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 2, i32 31), !dbg !26 + %293 = insertelement <2 x i32> poison, i32 %291, i64 0, !dbg !26 + %294 = insertelement <2 x i32> %293, i32 %292, i64 1, !dbg !26 + %295 = bitcast <2 x i32> %294 to i64, !dbg !26 + %296 = add i64 %288, %295, !dbg !22 + %extelt.offset4 = lshr i64 %296, 32, !dbg !26 + %297 = trunc nuw i64 %extelt.offset4 to i32, !dbg !26 + %298 = trunc i64 %296 to i32, !dbg !26 + %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 1, i32 31), !dbg !26 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 1, i32 31), !dbg !26 + %301 = insertelement <2 x i32> poison, i32 %299, i64 0, !dbg !26 + %302 = insertelement <2 x i32> %301, i32 %300, i64 1, !dbg !26 + %303 = bitcast <2 x i32> %302 to i64, !dbg !26 + %304 = add i64 %296, %303, !dbg !22 + %305 = and i32 %263, 15, !dbg !26 + %306 = icmp eq i32 %262, 0, !dbg !26 + %307 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %305, !dbg !26 + %308 = insertelement <1 x i64> poison, i64 %304, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %307, <1 x i64> %308, i1 %306) #5, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %309 = icmp samesign ult i32 %18, 16, !dbg !26 + %310 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %18, !dbg !26 + %311 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %310, i1 %309) #5, !dbg !26 + %extelt.offset5 = lshr i64 %311, 32, !dbg !26 + %312 = trunc nuw i64 %extelt.offset5 to i32, !dbg !26 + %313 = trunc i64 %311 to i32, !dbg !26 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 8, i32 31), !dbg !26 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 8, i32 31), !dbg !26 + %316 = insertelement <2 x i32> poison, i32 %314, i64 0, !dbg !26 + %317 = insertelement <2 x i32> %316, i32 %315, i64 1, !dbg !26 + %318 = bitcast <2 x i32> %317 to i64, !dbg !26 + %319 = add i64 %311, %318, !dbg !22 + %extelt.offset6 = lshr i64 %319, 32, !dbg !26 + %320 = trunc nuw i64 %extelt.offset6 to i32, !dbg !26 + %321 = trunc i64 %319 to i32, !dbg !26 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 4, i32 31), !dbg !26 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %320, i32 4, i32 31), !dbg !26 + %324 = insertelement <2 x i32> poison, i32 %322, i64 0, !dbg !26 + %325 = insertelement <2 x i32> %324, i32 %323, i64 1, !dbg !26 + %326 = bitcast <2 x i32> %325 to i64, !dbg !26 + %327 = add i64 %319, %326, !dbg !22 + %extelt.offset7 = lshr i64 %327, 32, !dbg !26 + %328 = trunc nuw i64 %extelt.offset7 to i32, !dbg !26 + %329 = trunc i64 %327 to i32, !dbg !26 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 2, i32 31), !dbg !26 + %331 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 2, i32 31), !dbg !26 + %332 = insertelement <2 x i32> poison, i32 %330, i64 0, !dbg !26 + %333 = insertelement <2 x i32> %332, i32 %331, i64 1, !dbg !26 + %334 = bitcast <2 x i32> %333 to i64, !dbg !26 + %335 = add i64 %327, %334, !dbg !22 + %extelt.offset8 = lshr i64 %335, 32, !dbg !26 + %336 = trunc nuw i64 %extelt.offset8 to i32, !dbg !26 + %337 = trunc i64 %335 to i32, !dbg !26 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 1, i32 31), !dbg !26 + %339 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 1, i32 31), !dbg !26 + %340 = insertelement <2 x i32> poison, i32 %338, i64 0, !dbg !26 + %341 = insertelement <2 x i32> %340, i32 %339, i64 1, !dbg !26 + %342 = bitcast <2 x i32> %341 to i64, !dbg !26 + %343 = add i64 %335, %342, !dbg !22 + %344 = icmp eq i32 %18, 0, !dbg !26 + %345 = insertelement <1 x i64> poison, i64 %343, i64 0, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %310, <1 x i64> %345, i1 %344) #5, !dbg !26 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !26 + %346 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !26 + %347 = zext nneg i32 %7 to i64, !dbg !27 + %348 = getelementptr i64, ptr addrspace(1) %1, i64 %347, !dbg !27 + %349 = icmp eq i32 %31, 0, !dbg !28 + %350 = and i1 %8, %349, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %346, ptr addrspace(1) %348, i1 %350) #5, !dbg !28 + ret void, !dbg !29 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0", linkageName: "triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 25, column: 21, scope: !4) +!9 = !DILocation(line: 30, column: 19, scope: !4) +!10 = !DILocation(line: 31, column: 30, scope: !4) +!11 = !DILocation(line: 31, column: 35, scope: !4) +!12 = !DILocation(line: 45, column: 22, scope: !4) +!13 = !DILocation(line: 42, column: 26, scope: !4) +!14 = !DILocation(line: 26, column: 37, scope: !4) +!15 = !DILocation(line: 47, column: 22, scope: !4) +!16 = !DILocation(line: 70, column: 50, scope: !4) +!17 = !DILocation(line: 34, column: 40, scope: !4) +!18 = !DILocation(line: 41, column: 26, scope: !4) +!19 = !DILocation(line: 39, column: 27, scope: !4) +!20 = !DILocation(line: 41, column: 22, scope: !4) +!21 = !DILocation(line: 43, column: 23, scope: !4) +!22 = !DILocation(line: 261, column: 15, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!25 = !DILocation(line: 71, column: 27, scope: !4) +!26 = !DILocation(line: 291, column: 36, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 72, column: 25, scope: !4) +!28 = !DILocation(line: 72, column: 37, scope: !4) +!29 = !DILocation(line: 72, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cd32221360bdd60856b6eda3a3c372568c677512 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ptx @@ -0,0 +1,824 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 // -- Begin function triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0 +.visible .entry triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0( + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_1, + .param .u32 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_2, + .param .u32 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<200>; + .reg .b32 %r<73>; + .reg .b64 %rd<154>; + .loc 1 18 0 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:18:0 + +// %bb.0: + ld.param.b64 %rd10, [triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_0]; + ld.param.b64 %rd11, [triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:23:28 + mov.u32 %r4, %ctaid.x; + .loc 1 25 21 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:25:21 + setp.lt.u32 %p1, %r4, 2048; + .loc 1 30 19 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:30:19 + shr.u32 %r5, %r4, 8; + .loc 1 31 30 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:31:30 + mad.wide.u32 %rd3, %r5, 8, %rd10; + .loc 1 31 35 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:31:35 + // begin inline asm + mov.u64 %rd4, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd4, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd2, 0x0; + @%p1 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd2 }, [ %rd3 + 0 ], %rd4; + // end inline asm + .loc 1 42 26 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:42:26 + shl.b32 %r6, %r4, 7; + and.b32 %r7, %r6, 1920; + .loc 1 26 37 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:26:37 + mov.u32 %r8, %tid.x; + and.b32 %r9, %r8, 127; + or.b32 %r10, %r7, %r9; + cvt.u64.u32 %rd12, %r10; + setp.gt.s64 %p6, %rd2, %rd12; + .loc 1 41 26 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:26 + shl.b32 %r11, %r4, 3; + and.b32 %r12, %r11, 1920; + .loc 1 34 40 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:34:40 + cvt.u64.u32 %rd13, %r12; + .loc 1 26 37 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:26:37 + and.b32 %r13, %r8, 511; + .loc 1 34 40 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:34:40 + cvt.u64.u32 %rd14, %r13; + .loc 1 39 27 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:39:27 + shr.u64 %rd15, %rd14, 7; + shr.u32 %r14, %r8, 7; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd16, %rd15, %rd13; + or.b32 %r15, %r12, %r14; + or.b32 %r16, %r15, 4; + or.b64 %rd17, %rd16, 8; + or.b32 %r17, %r15, 12; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p7, %rd16, %rd12; + setp.ge.u32 %p8, %r16, %r10; + setp.ge.u64 %p9, %rd17, %rd12; + setp.ge.u32 %p10, %r17, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd18, %r16; + cvt.u64.u32 %rd19, %r17; + setp.gt.s64 %p11, %rd2, %rd18; + setp.gt.s64 %p12, %rd2, %rd19; + setp.gt.s64 %p13, %rd2, %rd16; + setp.gt.s64 %p14, %rd2, %rd17; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p15, %p14, %p6; + and.pred %p16, %p13, %p6; + and.pred %p17, %p12, %p6; + and.pred %p18, %p11, %p6; + and.pred %p19, %p18, %p8; + and.pred %p21, %p17, %p10; + and.pred %p23, %p16, %p7; + and.pred %p25, %p15, %p9; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p27, %p1, %p25; + and.pred %p28, %p1, %p23; + and.pred %p29, %p1, %p21; + and.pred %p30, %p1, %p19; + selp.b64 %rd20, 1, 0, %p30; + selp.b64 %rd21, 1, 0, %p29; + selp.b64 %rd22, 1, 0, %p28; + selp.b64 %rd23, 1, 0, %p27; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd24, %rd16, 16; + or.b32 %r18, %r15, 20; + or.b64 %rd25, %rd16, 24; + or.b32 %r19, %r15, 28; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p31, %rd24, %rd12; + setp.ge.u32 %p32, %r18, %r10; + setp.ge.u64 %p33, %rd25, %rd12; + setp.ge.u32 %p34, %r19, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd26, %r18; + cvt.u64.u32 %rd27, %r19; + setp.gt.s64 %p35, %rd2, %rd26; + setp.gt.s64 %p36, %rd2, %rd27; + setp.gt.s64 %p37, %rd2, %rd24; + setp.gt.s64 %p38, %rd2, %rd25; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p39, %p38, %p6; + and.pred %p40, %p37, %p6; + and.pred %p41, %p36, %p6; + and.pred %p42, %p35, %p6; + and.pred %p43, %p42, %p32; + and.pred %p45, %p41, %p34; + and.pred %p47, %p40, %p31; + and.pred %p49, %p39, %p33; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p51, %p1, %p49; + and.pred %p52, %p1, %p47; + and.pred %p53, %p1, %p45; + and.pred %p54, %p1, %p43; + selp.b64 %rd28, 1, 0, %p54; + selp.b64 %rd29, 1, 0, %p53; + selp.b64 %rd30, 1, 0, %p52; + selp.b64 %rd31, 1, 0, %p51; + add.s64 %rd32, %rd23, %rd31; + add.s64 %rd33, %rd22, %rd30; + add.s64 %rd34, %rd21, %rd29; + add.s64 %rd35, %rd20, %rd28; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd36, %rd16, 32; + or.b32 %r20, %r15, 36; + or.b64 %rd37, %rd16, 40; + or.b32 %r21, %r15, 44; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p55, %rd36, %rd12; + setp.ge.u32 %p56, %r20, %r10; + setp.ge.u64 %p57, %rd37, %rd12; + setp.ge.u32 %p58, %r21, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd38, %r20; + cvt.u64.u32 %rd39, %r21; + setp.gt.s64 %p59, %rd2, %rd37; + setp.gt.s64 %p60, %rd2, %rd36; + setp.gt.s64 %p61, %rd2, %rd39; + setp.gt.s64 %p62, %rd2, %rd38; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p63, %p62, %p6; + and.pred %p64, %p61, %p6; + and.pred %p65, %p60, %p6; + and.pred %p66, %p59, %p6; + and.pred %p67, %p66, %p57; + and.pred %p69, %p65, %p55; + and.pred %p71, %p64, %p58; + and.pred %p73, %p63, %p56; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p75, %p1, %p73; + and.pred %p76, %p1, %p71; + and.pred %p77, %p1, %p69; + and.pred %p78, %p1, %p67; + selp.b64 %rd40, 1, 0, %p78; + selp.b64 %rd41, 1, 0, %p77; + selp.b64 %rd42, 1, 0, %p76; + selp.b64 %rd43, 1, 0, %p75; + add.s64 %rd44, %rd35, %rd43; + add.s64 %rd45, %rd34, %rd42; + add.s64 %rd46, %rd33, %rd41; + add.s64 %rd47, %rd32, %rd40; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd48, %rd16, 48; + or.b32 %r22, %r15, 52; + or.b64 %rd49, %rd16, 56; + or.b32 %r23, %r15, 60; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p79, %rd48, %rd12; + setp.ge.u32 %p80, %r22, %r10; + setp.ge.u64 %p81, %rd49, %rd12; + setp.ge.u32 %p82, %r23, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd50, %r22; + cvt.u64.u32 %rd51, %r23; + setp.gt.s64 %p83, %rd2, %rd50; + setp.gt.s64 %p84, %rd2, %rd51; + setp.gt.s64 %p85, %rd2, %rd48; + setp.gt.s64 %p86, %rd2, %rd49; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p87, %p86, %p6; + and.pred %p88, %p85, %p6; + and.pred %p89, %p84, %p6; + and.pred %p90, %p83, %p6; + and.pred %p91, %p90, %p80; + and.pred %p93, %p89, %p82; + and.pred %p95, %p88, %p79; + and.pred %p97, %p87, %p81; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p99, %p1, %p97; + and.pred %p100, %p1, %p95; + and.pred %p101, %p1, %p93; + and.pred %p102, %p1, %p91; + selp.b64 %rd52, 1, 0, %p102; + selp.b64 %rd53, 1, 0, %p101; + selp.b64 %rd54, 1, 0, %p100; + selp.b64 %rd55, 1, 0, %p99; + add.s64 %rd56, %rd47, %rd55; + add.s64 %rd57, %rd46, %rd54; + add.s64 %rd58, %rd45, %rd53; + add.s64 %rd59, %rd44, %rd52; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd60, %rd16, 64; + or.b32 %r24, %r15, 68; + or.b64 %rd61, %rd16, 72; + or.b32 %r25, %r15, 76; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p103, %rd60, %rd12; + setp.ge.u32 %p104, %r24, %r10; + setp.ge.u64 %p105, %rd61, %rd12; + setp.ge.u32 %p106, %r25, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd62, %r24; + cvt.u64.u32 %rd63, %r25; + setp.gt.s64 %p107, %rd2, %rd61; + setp.gt.s64 %p108, %rd2, %rd60; + setp.gt.s64 %p109, %rd2, %rd63; + setp.gt.s64 %p110, %rd2, %rd62; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p111, %p110, %p6; + and.pred %p112, %p109, %p6; + and.pred %p113, %p108, %p6; + and.pred %p114, %p107, %p6; + and.pred %p115, %p114, %p105; + and.pred %p117, %p113, %p103; + and.pred %p119, %p112, %p106; + and.pred %p121, %p111, %p104; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p123, %p1, %p121; + and.pred %p124, %p1, %p119; + and.pred %p125, %p1, %p117; + and.pred %p126, %p1, %p115; + selp.b64 %rd64, 1, 0, %p126; + selp.b64 %rd65, 1, 0, %p125; + selp.b64 %rd66, 1, 0, %p124; + selp.b64 %rd67, 1, 0, %p123; + add.s64 %rd68, %rd59, %rd67; + add.s64 %rd69, %rd58, %rd66; + add.s64 %rd70, %rd57, %rd65; + add.s64 %rd71, %rd56, %rd64; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd72, %rd16, 80; + or.b32 %r26, %r15, 84; + or.b64 %rd73, %rd16, 88; + or.b32 %r27, %r15, 92; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p127, %rd72, %rd12; + setp.ge.u32 %p128, %r26, %r10; + setp.ge.u64 %p129, %rd73, %rd12; + setp.ge.u32 %p130, %r27, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd74, %r26; + cvt.u64.u32 %rd75, %r27; + setp.gt.s64 %p131, %rd2, %rd74; + setp.gt.s64 %p132, %rd2, %rd75; + setp.gt.s64 %p133, %rd2, %rd72; + setp.gt.s64 %p134, %rd2, %rd73; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p135, %p134, %p6; + and.pred %p136, %p133, %p6; + and.pred %p137, %p132, %p6; + and.pred %p138, %p131, %p6; + and.pred %p139, %p138, %p128; + and.pred %p141, %p137, %p130; + and.pred %p143, %p136, %p127; + and.pred %p145, %p135, %p129; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p147, %p1, %p145; + and.pred %p148, %p1, %p143; + and.pred %p149, %p1, %p141; + and.pred %p150, %p1, %p139; + selp.b64 %rd76, 1, 0, %p150; + selp.b64 %rd77, 1, 0, %p149; + selp.b64 %rd78, 1, 0, %p148; + selp.b64 %rd79, 1, 0, %p147; + add.s64 %rd80, %rd71, %rd79; + add.s64 %rd81, %rd70, %rd78; + add.s64 %rd82, %rd69, %rd77; + add.s64 %rd83, %rd68, %rd76; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd84, %rd16, 96; + or.b32 %r28, %r15, 100; + or.b64 %rd85, %rd16, 104; + or.b32 %r29, %r15, 108; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p151, %rd84, %rd12; + setp.ge.u32 %p152, %r28, %r10; + setp.ge.u64 %p153, %rd85, %rd12; + setp.ge.u32 %p154, %r29, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd86, %r28; + cvt.u64.u32 %rd87, %r29; + setp.gt.s64 %p155, %rd2, %rd85; + setp.gt.s64 %p156, %rd2, %rd84; + setp.gt.s64 %p157, %rd2, %rd87; + setp.gt.s64 %p158, %rd2, %rd86; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p159, %p158, %p6; + and.pred %p160, %p157, %p6; + and.pred %p161, %p156, %p6; + and.pred %p162, %p155, %p6; + and.pred %p163, %p162, %p153; + and.pred %p165, %p161, %p151; + and.pred %p167, %p160, %p154; + and.pred %p169, %p159, %p152; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p171, %p1, %p169; + and.pred %p172, %p1, %p167; + and.pred %p173, %p1, %p165; + and.pred %p174, %p1, %p163; + selp.b64 %rd88, 1, 0, %p174; + selp.b64 %rd89, 1, 0, %p173; + selp.b64 %rd90, 1, 0, %p172; + selp.b64 %rd91, 1, 0, %p171; + add.s64 %rd92, %rd83, %rd91; + add.s64 %rd93, %rd82, %rd90; + add.s64 %rd94, %rd81, %rd89; + add.s64 %rd95, %rd80, %rd88; + .loc 1 41 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:41:22 + or.b64 %rd96, %rd16, 112; + or.b32 %r30, %r15, 116; + or.b64 %rd97, %rd16, 120; + or.b32 %r31, %r15, 124; + .loc 1 43 23 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:43:23 + setp.ge.u64 %p175, %rd96, %rd12; + setp.ge.u32 %p176, %r30, %r10; + setp.ge.u64 %p177, %rd97, %rd12; + setp.ge.u32 %p178, %r31, %r10; + .loc 1 45 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:45:22 + cvt.u64.u32 %rd98, %r30; + cvt.u64.u32 %rd99, %r31; + setp.gt.s64 %p179, %rd2, %rd98; + setp.gt.s64 %p180, %rd2, %rd99; + setp.gt.s64 %p181, %rd2, %rd96; + setp.gt.s64 %p182, %rd2, %rd97; + .loc 1 47 22 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:47:22 + and.pred %p183, %p182, %p6; + and.pred %p184, %p181, %p6; + and.pred %p185, %p180, %p6; + and.pred %p186, %p179, %p6; + and.pred %p187, %p186, %p176; + and.pred %p189, %p185, %p178; + and.pred %p191, %p184, %p175; + and.pred %p193, %p183, %p177; + .loc 1 70 50 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:70:50 + and.pred %p195, %p1, %p193; + and.pred %p196, %p1, %p191; + and.pred %p197, %p1, %p189; + and.pred %p198, %p1, %p187; + selp.b64 %rd100, 1, 0, %p198; + selp.b64 %rd101, 1, 0, %p197; + selp.b64 %rd102, 1, 0, %p196; + selp.b64 %rd103, 1, 0, %p195; + add.s64 %rd104, %rd95, %rd103; + add.s64 %rd105, %rd94, %rd102; + add.s64 %rd106, %rd93, %rd101; + add.s64 %rd107, %rd92, %rd100; + .loc 1 26 37 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:26:37 + and.b32 %r32, %r8, 31; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd108, %rd107, %rd106; + add.s64 %rd109, %rd105, %rd104; + add.s64 %rd110, %rd109, %rd108; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r33}, %rd110; + cvt.u32.u64 %r34, %rd110; + shfl.sync.bfly.b32 %r35, %r34, 16, 31, -1; + shfl.sync.bfly.b32 %r36, %r33, 16, 31, -1; + cvt.u64.u32 %rd111, %r35; + cvt.u64.u32 %rd112, %r36; + shl.b64 %rd113, %rd112, 32; + or.b64 %rd114, %rd111, %rd113; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd115, %rd110, %rd114; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r37}, %rd115; + cvt.u32.u64 %r38, %rd115; + shfl.sync.bfly.b32 %r39, %r38, 8, 31, -1; + shfl.sync.bfly.b32 %r40, %r37, 8, 31, -1; + cvt.u64.u32 %rd116, %r39; + cvt.u64.u32 %rd117, %r40; + shl.b64 %rd118, %rd117, 32; + or.b64 %rd119, %rd116, %rd118; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd120, %rd115, %rd119; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r41}, %rd120; + cvt.u32.u64 %r42, %rd120; + shfl.sync.bfly.b32 %r43, %r42, 4, 31, -1; + shfl.sync.bfly.b32 %r44, %r41, 4, 31, -1; + cvt.u64.u32 %rd121, %r43; + cvt.u64.u32 %rd122, %r44; + shl.b64 %rd123, %rd122, 32; + or.b64 %rd124, %rd121, %rd123; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd125, %rd120, %rd124; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r45}, %rd125; + cvt.u32.u64 %r46, %rd125; + shfl.sync.bfly.b32 %r47, %r46, 2, 31, -1; + shfl.sync.bfly.b32 %r48, %r45, 2, 31, -1; + cvt.u64.u32 %rd126, %r47; + cvt.u64.u32 %rd127, %r48; + shl.b64 %rd128, %rd127, 32; + or.b64 %rd129, %rd126, %rd128; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd130, %rd125, %rd129; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r49}, %rd130; + cvt.u32.u64 %r50, %rd130; + shfl.sync.bfly.b32 %r51, %r50, 1, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 1, 31, -1; + cvt.u64.u32 %rd131, %r51; + cvt.u64.u32 %rd132, %r52; + shl.b64 %rd133, %rd132, 32; + or.b64 %rd134, %rd131, %rd133; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd5, %rd130, %rd134; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + setp.eq.b32 %p2, %r32, 0; + shr.u32 %r53, %r8, 2; + and.b32 %r54, %r53, 120; + mov.b32 %r55, global_smem; + add.s32 %r1, %r55, %r54; + // begin inline asm + @%p2 st.shared.b64 [ %r1 + 0 ], %rd5; + // end inline asm + bar.sync 0; + setp.lt.u32 %p3, %r8, 16; + shl.b32 %r56, %r8, 3; + add.s32 %r2, %r55, %r56; + // begin inline asm + @%p3 ld.shared.b64 %rd6, [ %r2 + 0 ]; + // end inline asm + mov.b64 {_, %r57}, %rd6; + cvt.u32.u64 %r58, %rd6; + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; + shfl.sync.bfly.b32 %r60, %r57, 8, 31, -1; + cvt.u64.u32 %rd135, %r59; + cvt.u64.u32 %rd136, %r60; + shl.b64 %rd137, %rd136, 32; + or.b64 %rd138, %rd135, %rd137; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd139, %rd6, %rd138; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r61}, %rd139; + cvt.u32.u64 %r62, %rd139; + shfl.sync.bfly.b32 %r63, %r62, 4, 31, -1; + shfl.sync.bfly.b32 %r64, %r61, 4, 31, -1; + cvt.u64.u32 %rd140, %r63; + cvt.u64.u32 %rd141, %r64; + shl.b64 %rd142, %rd141, 32; + or.b64 %rd143, %rd140, %rd142; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd144, %rd139, %rd143; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r65}, %rd144; + cvt.u32.u64 %r66, %rd144; + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; + shfl.sync.bfly.b32 %r68, %r65, 2, 31, -1; + cvt.u64.u32 %rd145, %r67; + cvt.u64.u32 %rd146, %r68; + shl.b64 %rd147, %rd146, 32; + or.b64 %rd148, %rd145, %rd147; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd149, %rd144, %rd148; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + mov.b64 {_, %r69}, %rd149; + cvt.u32.u64 %r70, %rd149; + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + shfl.sync.bfly.b32 %r72, %r69, 1, 31, -1; + cvt.u64.u32 %rd150, %r71; + cvt.u64.u32 %rd151, %r72; + shl.b64 %rd152, %rd151, 32; + or.b64 %rd153, %rd150, %rd152; + .loc 2 261 15 // standard.py:261:15 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + add.s64 %rd7, %rd149, %rd153; + .loc 2 291 36 // standard.py:291:36 @[ cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:71:27 ] + setp.eq.b32 %p4, %r8, 0; + // begin inline asm + @%p4 st.shared.b64 [ %r2 + 0 ], %rd7; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd8, [global_smem]; +$L__tmp2: + .loc 1 72 25 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:72:25 + mad.wide.u32 %rd9, %r4, 8, %rd11; + .loc 1 72 37 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:72:37 + setp.eq.b32 %p199, %r13, 0; + and.pred %p5, %p1, %p199; + // begin inline asm + @%p5 st.global.b64 [ %rd9 + 0 ], { %rd8 }; + // end inline asm + .loc 1 72 4 // cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py:72:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 279 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x110 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 114 +.b8 116 +.b8 51 +.b8 99 +.b8 100 +.b8 102 +.b8 105 +.b8 114 +.b8 105 +.b8 50 +.b8 122 +.b8 52 +.b8 106 +.b8 115 +.b8 111 +.b8 52 +.b8 97 +.b8 102 +.b8 121 +.b8 112 +.b8 101 +.b8 100 +.b8 116 +.b8 114 +.b8 117 +.b8 52 +.b8 99 +.b8 100 +.b8 101 +.b8 98 +.b8 112 +.b8 111 +.b8 53 +.b8 53 +.b8 54 +.b8 121 +.b8 122 +.b8 103 +.b8 114 +.b8 113 +.b8 97 +.b8 119 +.b8 108 +.b8 117 +.b8 102 +.b8 115 +.b8 119 +.b8 107 +.b8 50 +.b8 54 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 114 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x61 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 111 +.b8 114 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 101 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 109 +.b8 117 +.b8 116 +.b8 101 +.b8 95 +.b8 114 +.b8 101 +.b8 109 +.b8 97 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 114 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xec:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x101:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 71 // DW_AT_call_line +.b8 27 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source new file mode 100644 index 0000000000000000000000000000000000000000..edea76ce8757ca4ed9d82e01234e3a0e99098637 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.source @@ -0,0 +1,308 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":18:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc68 = loc(unknown) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc75 = loc("in_ptr0"(#loc)) +#loc76 = loc("out_ptr0"(#loc)) +#loc77 = loc("xnumel"(#loc)) +#loc78 = loc("r0_numel"(#loc)) +#loc140 = loc("input"(#loc66)) +#loc141 = loc("a"(#loc71)) +#loc142 = loc("b"(#loc71)) +module { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2048 : i32 loc(#loc79) + %r0_numel_1 = arith.constant 16384 : i32 loc(#loc80) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_2 = arith.constant 1 : i32 loc(#loc82) + %xoffset_3 = arith.constant 1 : i32 loc(#loc82) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc82) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc83) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc84) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc85) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc85) + %xmask = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc86) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc86) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc87) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc88) + %x1 = arith.constant 16 : i32 loc(#loc89) + %x1_10 = arith.constant 16 : i32 loc(#loc89) + %x1_11 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc89) + %x1_12 = arith.divsi %xindex_7, %x1_11 : tensor<1x1xi32> loc(#loc89) + %x1_13 = arith.constant 16 : i32 loc(#loc90) + %x1_14 = arith.constant 16 : i32 loc(#loc90) + %x1_15 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc90) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<1x1xi32> loc(#loc90) + %x0 = arith.constant 16 : i32 loc(#loc91) + %x0_17 = arith.constant 16 : i32 loc(#loc91) + %x0_18 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc91) + %x0_19 = arith.remsi %xindex_7, %x0_18 : tensor<1x1xi32> loc(#loc91) + %x2 = arith.constant 256 : i32 loc(#loc92) + %x2_20 = arith.constant 256 : i32 loc(#loc92) + %x2_21 = arith.constant dense<256> : tensor<1x1xi32> loc(#loc92) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<1x1xi32> loc(#loc92) + %tmp3 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc93) + %tmp3_23 = tt.addptr %tmp3, %x2_22 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc93) + %tmp3_24 = tt.load %tmp3_23, %xmask_8 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc94) + %_tmp29 = arith.constant 0 : i64 loc(#loc95) + %_tmp29_25 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc95) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc18) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc18) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc18) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc18) + %3 = ub.poison : i32 loc(#loc18) + %_tmp29_26 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp29_28 = %_tmp29_25) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc97) + %r0_index_29 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc97) + %r0_mask = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc98) + %r0_mask_30 = arith.cmpi slt, %r0_index_29, %r0_mask : tensor<1x2048xi32> loc(#loc98) + %r0_4 = arith.constant 128 : i32 loc(#loc99) + %r0_4_31 = arith.constant 128 : i32 loc(#loc99) + %r0_4_32 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc99) + %r0_4_33 = arith.divsi %r0_index_29, %r0_4_32 : tensor<1x2048xi32> loc(#loc99) + %r0_3 = arith.constant 128 : i32 loc(#loc100) + %r0_3_34 = arith.constant 128 : i32 loc(#loc100) + %r0_3_35 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc100) + %r0_3_36 = arith.remsi %r0_index_29, %r0_3_35 : tensor<1x2048xi32> loc(#loc100) + %tmp0 = arith.constant 128 : i32 loc(#loc101) + %tmp0_37 = arith.constant 128 : i32 loc(#loc101) + %tmp0_38 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc101) + %tmp0_39 = arith.muli %tmp0_38, %x1_16 : tensor<1x1xi32> loc(#loc101) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc102) + %tmp0_41 = arith.addi %r0_4_33, %tmp0_40 : tensor<1x2048xi32> loc(#loc102) + %tmp1 = arith.constant 128 : i32 loc(#loc103) + %tmp1_42 = arith.constant 128 : i32 loc(#loc103) + %tmp1_43 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc103) + %tmp1_44 = arith.muli %tmp1_43, %x0_19 : tensor<1x1xi32> loc(#loc103) + %tmp1_45 = tt.broadcast %tmp1_44 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc104) + %tmp1_46 = arith.addi %r0_3_36, %tmp1_45 : tensor<1x2048xi32> loc(#loc104) + %tmp2 = arith.cmpi sge, %tmp0_41, %tmp1_46 : tensor<1x2048xi32> loc(#loc105) + %tmp4 = arith.extsi %tmp1_46 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc106) + %tmp4_47 = tt.broadcast %tmp3_24 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc106) + %tmp4_48 = arith.cmpi slt, %tmp4, %tmp4_47 : tensor<1x2048xi64> loc(#loc106) + %tmp5 = arith.extsi %tmp0_41 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc107) + %tmp5_49 = tt.broadcast %tmp3_24 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc107) + %tmp5_50 = arith.cmpi slt, %tmp5, %tmp5_49 : tensor<1x2048xi64> loc(#loc107) + %tmp6 = arith.andi %tmp4_48, %tmp5_50 : tensor<1x2048xi1> loc(#loc108) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1> loc(#loc109) + %tmp8 = arith.constant false loc(#loc110) + %tmp8_51 = arith.constant dense : tensor<1x1xi1> loc(#loc110) + %tmp9 = arith.constant dense : tensor<1x2048xi1> loc(#loc111) + %tmp9_52 = arith.ori %tmp9, %tmp7 : tensor<1x2048xi1> loc(#loc111) + %tmp10 = arith.constant 2048 : i64 loc(#loc112) + %tmp10_53 = arith.constant dense<2048> : tensor<1x1xi64> loc(#loc112) + %tmp11 = arith.extsi %tmp1_46 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc113) + %tmp11_54 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc113) + %tmp11_55 = arith.cmpi sge, %tmp11, %tmp11_54 : tensor<1x2048xi64> loc(#loc113) + %tmp12 = arith.andi %tmp11_55, %tmp4_48 : tensor<1x2048xi1> loc(#loc114) + %tmp13 = arith.constant -1 : i32 loc(#loc115) + %tmp13_56 = arith.constant -1 : i32 loc(#loc115) + %tmp13_57 = arith.constant dense<-1> : tensor<1x2048xi32> loc(#loc115) + %tmp13_58 = arith.muli %tmp13_57, %r0_4_33 : tensor<1x2048xi32> loc(#loc115) + %tmp13_59 = arith.addi %r0_3_36, %tmp13_58 : tensor<1x2048xi32> loc(#loc116) + %tmp13_60 = arith.constant -128 : i32 loc(#loc117) + %tmp13_61 = arith.constant -128 : i32 loc(#loc117) + %tmp13_62 = arith.constant dense<-128> : tensor<1x1xi32> loc(#loc117) + %tmp13_63 = arith.muli %tmp13_62, %x1_16 : tensor<1x1xi32> loc(#loc117) + %tmp13_64 = tt.broadcast %tmp13_63 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc118) + %tmp13_65 = arith.addi %tmp13_59, %tmp13_64 : tensor<1x2048xi32> loc(#loc118) + %tmp13_66 = arith.constant 128 : i32 loc(#loc119) + %tmp13_67 = arith.constant 128 : i32 loc(#loc119) + %tmp13_68 = arith.constant dense<128> : tensor<1x1xi32> loc(#loc119) + %tmp13_69 = arith.muli %tmp13_68, %x0_19 : tensor<1x1xi32> loc(#loc119) + %tmp13_70 = tt.broadcast %tmp13_69 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc120) + %tmp13_71 = arith.addi %tmp13_65, %tmp13_70 : tensor<1x2048xi32> loc(#loc120) + %tmp14 = arith.extsi %tmp13_71 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc121) + %tmp14_72 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc121) + %tmp14_73 = arith.remsi %tmp14, %tmp14_72 : tensor<1x2048xi64> loc(#loc121) + %tmp15 = arith.constant 0 : i32 loc(#loc122) + %tmp15_74 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc122) + %tmp16 = arith.extsi %tmp15_74 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc123) + %tmp16_75 = tt.broadcast %tmp16 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc123) + %tmp16_76 = arith.cmpi ne, %tmp14_73, %tmp16_75 : tensor<1x2048xi64> loc(#loc123) + %tmp17 = arith.constant 0 : i32 loc(#loc124) + %tmp17_77 = arith.extsi %tmp17 : i32 to i64 loc(#loc124) + %tmp17_78 = tt.splat %tmp17_77 : i64 -> tensor<1x2048xi64> loc(#loc124) + %tmp17_79 = arith.cmpi slt, %tmp14_73, %tmp17_78 : tensor<1x2048xi64> loc(#loc124) + %tmp18 = arith.constant 0 : i32 loc(#loc125) + %tmp18_80 = arith.extsi %tmp18 : i32 to i64 loc(#loc125) + %tmp18_81 = tt.splat %tmp18_80 : i64 -> tensor<1x1xi64> loc(#loc125) + %tmp18_82 = arith.cmpi slt, %tmp10_53, %tmp18_81 : tensor<1x1xi64> loc(#loc125) + %tmp19 = tt.broadcast %tmp18_82 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc126) + %tmp19_83 = arith.cmpi ne, %tmp17_79, %tmp19 : tensor<1x2048xi1> loc(#loc126) + %tmp20 = arith.andi %tmp16_76, %tmp19_83 : tensor<1x2048xi1> loc(#loc127) + %tmp21 = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc128) + %tmp21_84 = arith.addi %tmp14_73, %tmp21 : tensor<1x2048xi64> loc(#loc128) + %tmp22 = arith.select %tmp20, %tmp21_84, %tmp14_73 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc129) + %tmp23 = arith.constant 0 : i64 loc(#loc130) + %tmp23_85 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc130) + %tmp24 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc131) + %tmp24_86 = arith.cmpi eq, %tmp22, %tmp24 : tensor<1x2048xi64> loc(#loc131) + %tmp25 = arith.andi %tmp12, %tmp24_86 : tensor<1x2048xi1> loc(#loc132) + %tmp26 = arith.ori %tmp9_52, %tmp25 : tensor<1x2048xi1> loc(#loc133) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc134) + %tmp30 = arith.addi %_tmp29_28, %tmp27 : tensor<1x2048xi64> loc(#loc135) + %_tmp29_87 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc136) + %_tmp29_88 = arith.andi %r0_mask_30, %_tmp29_87 : tensor<1x2048xi1> loc(#loc136) + %_tmp29_89 = arith.select %_tmp29_88, %tmp30, %_tmp29_28 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc137) + scf.yield %_tmp29_89 : tensor<1x2048xi64> loc(#loc60) + } loc(#loc96) + %tmp29 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp29_26) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc138) + %tmp29_27 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc139) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc63) + %5 = tt.addptr %4, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc63) + tt.store %5, %tmp29_27, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc64) + tt.return loc(#loc65) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc66))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc67) + tt.reduce.return %2 : i64 loc(#loc67) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc67) + tt.return %0 : tensor<1xi64> loc(#loc69) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc70) + tt.return %1 : tensor<1xi64> loc(#loc70) + } loc(#loc66) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc71)), %b: i64 loc("b"(#loc71))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc72) + tt.return %0 : i64 loc(#loc73) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc74) + tt.return %1 : i64 loc(#loc74) + } loc(#loc71) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":25:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:27) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":32:44) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":34:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":35:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":36:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":39:27) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":40:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:26) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:26) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":44:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":45:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":46:22) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":47:22) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":48:38) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":49:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":50:38) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":51:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":52:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:24) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:45) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:38) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:55) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:51) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":54:25) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":55:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":56:25) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":57:92) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":58:92) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":59:25) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":60:24) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":61:24) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":62:39) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":63:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":64:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":65:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":66:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":67:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":69:25) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:36) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:50) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:8) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:27) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:30) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:25) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:37) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc79 = loc("xnumel"(#loc1)) +#loc80 = loc("r0_numel"(#loc2)) +#loc81 = loc("xoffset"(#loc3)) +#loc82 = loc("xoffset"(#loc4)) +#loc83 = loc("xindex"(#loc5)) +#loc84 = loc("xindex"(#loc6)) +#loc85 = loc("xindex"(#loc7)) +#loc86 = loc("xmask"(#loc8)) +#loc87 = loc("r0_base"(#loc9)) +#loc88 = loc("r0_base"(#loc10)) +#loc89 = loc("x1"(#loc11)) +#loc90 = loc("x1"(#loc12)) +#loc91 = loc("x0"(#loc13)) +#loc92 = loc("x2"(#loc14)) +#loc93 = loc("tmp3"(#loc15)) +#loc94 = loc("tmp3"(#loc16)) +#loc95 = loc("_tmp29"(#loc17)) +#loc96 = loc("_tmp29"(#loc18)) +#loc97 = loc("r0_index"(#loc19)) +#loc98 = loc("r0_mask"(#loc20)) +#loc99 = loc("r0_4"(#loc21)) +#loc100 = loc("r0_3"(#loc22)) +#loc101 = loc("tmp0"(#loc23)) +#loc102 = loc("tmp0"(#loc24)) +#loc103 = loc("tmp1"(#loc25)) +#loc104 = loc("tmp1"(#loc26)) +#loc105 = loc("tmp2"(#loc27)) +#loc106 = loc("tmp4"(#loc28)) +#loc107 = loc("tmp5"(#loc29)) +#loc108 = loc("tmp6"(#loc30)) +#loc109 = loc("tmp7"(#loc31)) +#loc110 = loc("tmp8"(#loc32)) +#loc111 = loc("tmp9"(#loc33)) +#loc112 = loc("tmp10"(#loc34)) +#loc113 = loc("tmp11"(#loc35)) +#loc114 = loc("tmp12"(#loc36)) +#loc115 = loc("tmp13"(#loc37)) +#loc116 = loc("tmp13"(#loc38)) +#loc117 = loc("tmp13"(#loc39)) +#loc118 = loc("tmp13"(#loc40)) +#loc119 = loc("tmp13"(#loc41)) +#loc120 = loc("tmp13"(#loc42)) +#loc121 = loc("tmp14"(#loc43)) +#loc122 = loc("tmp15"(#loc44)) +#loc123 = loc("tmp16"(#loc45)) +#loc124 = loc("tmp17"(#loc46)) +#loc125 = loc("tmp18"(#loc47)) +#loc126 = loc("tmp19"(#loc48)) +#loc127 = loc("tmp20"(#loc49)) +#loc128 = loc("tmp21"(#loc50)) +#loc129 = loc("tmp22"(#loc51)) +#loc130 = loc("tmp23"(#loc52)) +#loc131 = loc("tmp24"(#loc53)) +#loc132 = loc("tmp25"(#loc54)) +#loc133 = loc("tmp26"(#loc55)) +#loc134 = loc("tmp27"(#loc56)) +#loc135 = loc("tmp30"(#loc57)) +#loc136 = loc("_tmp29"(#loc58)) +#loc137 = loc("_tmp29"(#loc59)) +#loc138 = loc("tmp29"(#loc61)) +#loc139 = loc("tmp29"(#loc62)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..34c89bd6495c558dd481b6376bcefca8eda9c1df --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttgir @@ -0,0 +1,192 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":18:0) +#loc1 = loc(unknown) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:27) +#loc52 = loc("in_ptr0"(#loc)) +#loc53 = loc("out_ptr0"(#loc)) +#loc54 = loc("xnumel"(#loc)) +#loc55 = loc("r0_numel"(#loc)) +#loc98 = loc("tmp29"(#loc46)) +#loc105 = loc(callsite(#loc1 at #loc98)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2048> : tensor<1x2048xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c-128_i32 = arith.constant -128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc57) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc58) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc58) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc59) + %x1_4 = arith.remsi %x1, %c16_i32 : i32 loc(#loc60) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc61) + %x2 = arith.divsi %xoffset, %c256_i32 : i32 loc(#loc62) + %tmp3 = tt.addptr %in_ptr0, %x2 : !tt.ptr, i32 loc(#loc63) + %tmp3_5 = tt.splat %tmp3 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %tmp3_6 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc64) + %tmp3_7 = tt.load %tmp3_5, %tmp3_6 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc64) + %tmp0 = arith.muli %x1_4, %c128_i32 : i32 loc(#loc65) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc100) + %tmp1 = arith.muli %x0, %c128_i32 : i32 loc(#loc67) + %tmp1_9 = tt.splat %tmp1 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc101) + %tmp4 = tt.broadcast %tmp3_7 : tensor<1x1xi64, #blocked> -> tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp13 = arith.muli %x1_4, %c-128_i32 : i32 loc(#loc70) + %tmp13_10 = tt.splat %tmp13 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc102) + %_tmp29 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc103) + %_tmp29_11 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp29_13 = %cst) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc74) + %r0_index_14 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32, #blocked> loc(#loc74) + %r0_mask = arith.cmpi slt, %r0_index_14, %cst_0 : tensor<1x2048xi32, #blocked> loc(#loc75) + %r0_4 = arith.divsi %r0_index_14, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc76) + %r0_3 = arith.remsi %r0_index_14, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc77) + %tmp0_15 = arith.addi %r0_4, %tmp0_8 : tensor<1x2048xi32, #blocked> loc(#loc66) + %tmp1_16 = arith.addi %r0_3, %tmp1_9 : tensor<1x2048xi32, #blocked> loc(#loc68) + %tmp2 = arith.cmpi sge, %tmp0_15, %tmp1_16 : tensor<1x2048xi32, #blocked> loc(#loc78) + %tmp4_17 = arith.extsi %tmp1_16 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp4_18 = arith.cmpi slt, %tmp4_17, %tmp4 : tensor<1x2048xi64, #blocked> loc(#loc69) + %tmp5 = arith.extsi %tmp0_15 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc79) + %tmp5_19 = arith.cmpi slt, %tmp5, %tmp4 : tensor<1x2048xi64, #blocked> loc(#loc79) + %tmp6 = arith.andi %tmp4_18, %tmp5_19 : tensor<1x2048xi1, #blocked> loc(#loc80) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1, #blocked> loc(#loc81) + %tmp11 = arith.cmpi sge, %tmp4_17, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp12 = arith.andi %tmp11, %tmp4_18 : tensor<1x2048xi1, #blocked> loc(#loc83) + %tmp13_20 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32, #blocked> loc(#loc84) + %tmp13_21 = arith.addi %tmp13_20, %tmp13_10 : tensor<1x2048xi32, #blocked> loc(#loc71) + %tmp13_22 = arith.addi %tmp13_21, %tmp1_9 : tensor<1x2048xi32, #blocked> loc(#loc85) + %tmp14 = arith.extsi %tmp13_22 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc86) + %tmp14_23 = arith.remsi %tmp14, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc86) + %tmp16 = arith.cmpi ne, %tmp14_23, %cst : tensor<1x2048xi64, #blocked> loc(#loc87) + %tmp17 = arith.cmpi slt, %tmp14_23, %cst : tensor<1x2048xi64, #blocked> loc(#loc88) + %tmp20 = arith.andi %tmp16, %tmp17 : tensor<1x2048xi1, #blocked> loc(#loc89) + %tmp21 = arith.addi %tmp14_23, %cst_2 : tensor<1x2048xi64, #blocked> loc(#loc90) + %tmp22 = arith.select %tmp20, %tmp21, %tmp14_23 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc91) + %tmp24 = arith.cmpi eq, %tmp22, %cst : tensor<1x2048xi64, #blocked> loc(#loc92) + %tmp25 = arith.andi %tmp12, %tmp24 : tensor<1x2048xi1, #blocked> loc(#loc93) + %tmp26 = arith.ori %tmp7, %tmp25 : tensor<1x2048xi1, #blocked> loc(#loc94) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc95) + %tmp30 = arith.addi %_tmp29_13, %tmp27 : tensor<1x2048xi64, #blocked> loc(#loc96) + %_tmp29_24 = arith.andi %r0_mask, %_tmp29 : tensor<1x2048xi1, #blocked> loc(#loc72) + %_tmp29_25 = arith.select %_tmp29_24, %tmp30, %_tmp29_13 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc97) + scf.yield %_tmp29_25 : tensor<1x2048xi64, #blocked> loc(#loc44) + } loc(#loc73) + %tmp29 = "tt.reduce"(%_tmp29_11) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_13: i64 loc(callsite(#loc1 at #loc98)), %tmp29_14: i64 loc(callsite(#loc1 at #loc98))): + %tmp29_15 = arith.addi %tmp29_13, %tmp29_14 : i64 loc(#loc106) + tt.reduce.return %tmp29_15 : i64 loc(#loc104) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc104) + %0 = ttg.convert_layout %tmp29 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc48) + %tmp29_12 = tt.expand_dims %0 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc99) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc50) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc48) + %3 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc48) + tt.store %2, %tmp29_12, %3 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc48) + tt.return loc(#loc51) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":25:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":26:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":29:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":30:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:30) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:35) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:26) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:22) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:26) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":44:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:38) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:36) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":34:40) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":35:31) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":36:29) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":39:27) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":40:27) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":43:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":45:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":46:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":47:22) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":51:24) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":52:24) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:24) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:51) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":54:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":56:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":57:92) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":60:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":61:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":62:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":64:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":65:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":66:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":67:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":69:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:50) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:8) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:37) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:30) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:4) +#loc56 = loc("xoffset"(#loc2)) +#loc57 = loc("xmask"(#loc3)) +#loc58 = loc("r0_base"(#loc4)) +#loc59 = loc("x1"(#loc5)) +#loc60 = loc("x1"(#loc6)) +#loc61 = loc("x0"(#loc7)) +#loc62 = loc("x2"(#loc8)) +#loc63 = loc("tmp3"(#loc9)) +#loc64 = loc("tmp3"(#loc10)) +#loc65 = loc("tmp0"(#loc11)) +#loc66 = loc("tmp0"(#loc12)) +#loc67 = loc("tmp1"(#loc13)) +#loc68 = loc("tmp1"(#loc14)) +#loc69 = loc("tmp4"(#loc15)) +#loc70 = loc("tmp13"(#loc16)) +#loc71 = loc("tmp13"(#loc17)) +#loc72 = loc("_tmp29"(#loc18)) +#loc73 = loc("_tmp29"(#loc19)) +#loc74 = loc("r0_index"(#loc20)) +#loc75 = loc("r0_mask"(#loc21)) +#loc76 = loc("r0_4"(#loc22)) +#loc77 = loc("r0_3"(#loc23)) +#loc78 = loc("tmp2"(#loc24)) +#loc79 = loc("tmp5"(#loc25)) +#loc80 = loc("tmp6"(#loc26)) +#loc81 = loc("tmp7"(#loc27)) +#loc82 = loc("tmp11"(#loc28)) +#loc83 = loc("tmp12"(#loc29)) +#loc84 = loc("tmp13"(#loc30)) +#loc85 = loc("tmp13"(#loc31)) +#loc86 = loc("tmp14"(#loc32)) +#loc87 = loc("tmp16"(#loc33)) +#loc88 = loc("tmp17"(#loc34)) +#loc89 = loc("tmp20"(#loc35)) +#loc90 = loc("tmp21"(#loc36)) +#loc91 = loc("tmp22"(#loc37)) +#loc92 = loc("tmp24"(#loc38)) +#loc93 = loc("tmp25"(#loc39)) +#loc94 = loc("tmp26"(#loc40)) +#loc95 = loc("tmp27"(#loc41)) +#loc96 = loc("tmp30"(#loc42)) +#loc97 = loc("_tmp29"(#loc43)) +#loc99 = loc("tmp29"(#loc49)) +#loc100 = loc(fused[#loc66, #loc65]) +#loc101 = loc(fused[#loc68, #loc67]) +#loc102 = loc(fused[#loc71, #loc70]) +#loc103 = loc(fused[#loc72, #loc57]) +#loc104 = loc(callsite(#loc45 at #loc98)) +#loc106 = loc(callsite(#loc47 at #loc104)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..74d3d70984099e109b64f9319c422714f4121c8d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UAM4EBYDUDQ64AY5OZ2W34W5ZPAVVODXU6VWSJG4HYBEEWFTELDA/triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0.ttir @@ -0,0 +1,190 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":18:0) +#loc1 = loc(unknown) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:27) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("out_ptr0"(#loc)) +#loc55 = loc("xnumel"(#loc)) +#loc56 = loc("r0_numel"(#loc)) +#loc100 = loc("tmp29"(#loc47)) +#loc107 = loc(callsite(#loc1 at #loc100)) +module { + tt.func public @triton_red_fused_arange_bitwise_and_bitwise_or_eq_ge_index_lt_permute_remainder_sub_sum_view_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c-128_i32 = arith.constant -128 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %x2 = arith.constant 256 : i32 loc(#loc57) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst = arith.constant dense<2048> : tensor<1x2048xi64> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<1x2048xi32> loc(#loc1) + %cst_1 = arith.constant dense<16384> : tensor<1x2048xi32> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc58) + %xmask = arith.cmpi slt, %xoffset, %c2048_i32 : i32 loc(#loc59) + %xmask_3 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc59) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc60) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc61) + %x1 = arith.divsi %xoffset, %c16_i32 : i32 loc(#loc62) + %x1_5 = arith.remsi %x1, %c16_i32 : i32 loc(#loc63) + %x0 = arith.remsi %xoffset, %c16_i32 : i32 loc(#loc64) + %x2_6 = arith.divsi %xoffset, %x2 : i32 loc(#loc57) + %tmp3 = tt.addptr %in_ptr0, %x2_6 : !tt.ptr, i32 loc(#loc65) + %tmp3_7 = tt.splat %tmp3 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc65) + %tmp3_8 = tt.load %tmp3_7, %xmask_3 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc66) + %_tmp29 = scf.for %r0_offset = %c0_i32 to %c16384_i32 step %c2048_i32 iter_args(%_tmp29_10 = %cst_2) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68) + %r0_index_11 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_11, %cst_1 : tensor<1x2048xi32> loc(#loc69) + %r0_4 = arith.divsi %r0_index_11, %cst_0 : tensor<1x2048xi32> loc(#loc70) + %r0_3 = arith.remsi %r0_index_11, %cst_0 : tensor<1x2048xi32> loc(#loc71) + %tmp0 = arith.muli %x1_5, %c128_i32 : i32 loc(#loc72) + %tmp0_12 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc102) + %tmp0_13 = arith.addi %r0_4, %tmp0_12 : tensor<1x2048xi32> loc(#loc73) + %tmp1 = arith.muli %x0, %c128_i32 : i32 loc(#loc74) + %tmp1_14 = tt.splat %tmp1 : i32 -> tensor<1x2048xi32> loc(#loc103) + %tmp1_15 = arith.addi %r0_3, %tmp1_14 : tensor<1x2048xi32> loc(#loc75) + %tmp2 = arith.cmpi sge, %tmp0_13, %tmp1_15 : tensor<1x2048xi32> loc(#loc76) + %tmp4 = arith.extsi %tmp1_15 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc77) + %tmp4_16 = tt.broadcast %tmp3_8 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc77) + %tmp4_17 = arith.cmpi slt, %tmp4, %tmp4_16 : tensor<1x2048xi64> loc(#loc77) + %tmp5 = arith.extsi %tmp0_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc78) + %tmp5_18 = arith.cmpi slt, %tmp5, %tmp4_16 : tensor<1x2048xi64> loc(#loc78) + %tmp6 = arith.andi %tmp4_17, %tmp5_18 : tensor<1x2048xi1> loc(#loc79) + %tmp7 = arith.andi %tmp2, %tmp6 : tensor<1x2048xi1> loc(#loc80) + %tmp11 = arith.cmpi sge, %tmp4, %cst : tensor<1x2048xi64> loc(#loc81) + %tmp12 = arith.andi %tmp11, %tmp4_17 : tensor<1x2048xi1> loc(#loc82) + %tmp13 = arith.subi %r0_3, %r0_4 : tensor<1x2048xi32> loc(#loc83) + %tmp13_19 = arith.muli %x1_5, %c-128_i32 : i32 loc(#loc84) + %tmp13_20 = tt.splat %tmp13_19 : i32 -> tensor<1x2048xi32> loc(#loc104) + %tmp13_21 = arith.addi %tmp13, %tmp13_20 : tensor<1x2048xi32> loc(#loc85) + %tmp13_22 = arith.addi %tmp13_21, %tmp1_14 : tensor<1x2048xi32> loc(#loc86) + %tmp14 = arith.extsi %tmp13_22 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc87) + %tmp14_23 = arith.remsi %tmp14, %cst : tensor<1x2048xi64> loc(#loc87) + %tmp16 = arith.cmpi ne, %tmp14_23, %cst_2 : tensor<1x2048xi64> loc(#loc88) + %tmp17 = arith.cmpi slt, %tmp14_23, %cst_2 : tensor<1x2048xi64> loc(#loc89) + %tmp20 = arith.andi %tmp16, %tmp17 : tensor<1x2048xi1> loc(#loc90) + %tmp21 = arith.addi %tmp14_23, %cst : tensor<1x2048xi64> loc(#loc91) + %tmp22 = arith.select %tmp20, %tmp21, %tmp14_23 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc92) + %tmp24 = arith.cmpi eq, %tmp22, %cst_2 : tensor<1x2048xi64> loc(#loc93) + %tmp25 = arith.andi %tmp12, %tmp24 : tensor<1x2048xi1> loc(#loc94) + %tmp26 = arith.ori %tmp7, %tmp25 : tensor<1x2048xi1> loc(#loc95) + %tmp27 = arith.extui %tmp26 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc96) + %tmp30 = arith.addi %_tmp29_10, %tmp27 : tensor<1x2048xi64> loc(#loc97) + %_tmp29_24 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc105) + %_tmp29_25 = arith.andi %r0_mask, %_tmp29_24 : tensor<1x2048xi1> loc(#loc98) + %_tmp29_26 = arith.select %_tmp29_25, %tmp30, %_tmp29_10 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc99) + scf.yield %_tmp29_26 : tensor<1x2048xi64> loc(#loc45) + } loc(#loc67) + %tmp29 = "tt.reduce"(%_tmp29) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_10: i64 loc(callsite(#loc1 at #loc100)), %tmp29_11: i64 loc(callsite(#loc1 at #loc100))): + %tmp29_12 = arith.addi %tmp29_10, %tmp29_11 : i64 loc(#loc108) + tt.reduce.return %tmp29_12 : i64 loc(#loc106) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc106) + %tmp29_9 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc101) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc50) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc50) + tt.store %1, %tmp29_9, %xmask_3 : tensor<1x1x!tt.ptr> loc(#loc51) + tt.return loc(#loc52) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":34:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":25:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":26:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":26:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":28:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":29:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":31:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":35:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":36:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":39:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":40:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:26) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":41:22) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":42:22) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":43:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":44:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":45:22) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":46:22) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":47:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":51:24) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":52:24) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:24) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:38) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":53:51) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":54:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":56:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":57:92) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":60:24) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":61:24) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":62:39) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":64:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":65:24) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":66:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":67:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":69:25) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:36) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:50) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":70:8) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":71:30) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:25) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:37) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/wr/cwrt3cdfiri2z4jso4afypedtru4cdebpo556yzgrqawlufswk26.py":72:4) +#loc57 = loc("x2"(#loc2)) +#loc58 = loc("xoffset"(#loc4)) +#loc59 = loc("xmask"(#loc5)) +#loc60 = loc("r0_base"(#loc6)) +#loc61 = loc("r0_base"(#loc7)) +#loc62 = loc("x1"(#loc8)) +#loc63 = loc("x1"(#loc9)) +#loc64 = loc("x0"(#loc10)) +#loc65 = loc("tmp3"(#loc11)) +#loc66 = loc("tmp3"(#loc12)) +#loc67 = loc("_tmp29"(#loc3)) +#loc68 = loc("r0_index"(#loc13)) +#loc69 = loc("r0_mask"(#loc14)) +#loc70 = loc("r0_4"(#loc15)) +#loc71 = loc("r0_3"(#loc16)) +#loc72 = loc("tmp0"(#loc17)) +#loc73 = loc("tmp0"(#loc18)) +#loc74 = loc("tmp1"(#loc19)) +#loc75 = loc("tmp1"(#loc20)) +#loc76 = loc("tmp2"(#loc21)) +#loc77 = loc("tmp4"(#loc22)) +#loc78 = loc("tmp5"(#loc23)) +#loc79 = loc("tmp6"(#loc24)) +#loc80 = loc("tmp7"(#loc25)) +#loc81 = loc("tmp11"(#loc26)) +#loc82 = loc("tmp12"(#loc27)) +#loc83 = loc("tmp13"(#loc28)) +#loc84 = loc("tmp13"(#loc29)) +#loc85 = loc("tmp13"(#loc30)) +#loc86 = loc("tmp13"(#loc31)) +#loc87 = loc("tmp14"(#loc32)) +#loc88 = loc("tmp16"(#loc33)) +#loc89 = loc("tmp17"(#loc34)) +#loc90 = loc("tmp20"(#loc35)) +#loc91 = loc("tmp21"(#loc36)) +#loc92 = loc("tmp22"(#loc37)) +#loc93 = loc("tmp24"(#loc38)) +#loc94 = loc("tmp25"(#loc39)) +#loc95 = loc("tmp26"(#loc40)) +#loc96 = loc("tmp27"(#loc41)) +#loc97 = loc("tmp30"(#loc42)) +#loc98 = loc("_tmp29"(#loc43)) +#loc99 = loc("_tmp29"(#loc44)) +#loc101 = loc("tmp29"(#loc49)) +#loc102 = loc(fused[#loc73, #loc72]) +#loc103 = loc(fused[#loc75, #loc74]) +#loc104 = loc(fused[#loc85, #loc84]) +#loc105 = loc(fused[#loc98, #loc59]) +#loc106 = loc(callsite(#loc46 at #loc100)) +#loc108 = loc(callsite(#loc48 at #loc106)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a5eaaebb9bb6ea7d9c1bff0ab0daf4b11d97f862 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..96440c4159a75f0de4ada728e511011f80c6d564 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..723f49252e057bd7d914bf8ede71ae14b58c3431 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "a35ffaa034bc57b4427c333ac65258b17402dc19c02242be6885d56888e186b5", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 32, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..015dd771e2496cd751b54a3122113a1a1b596540 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.llir @@ -0,0 +1,190 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = shl i32 %10, 3, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 224, !dbg !9 + %14 = lshr exact i32 %13, 5, !dbg !9 + %15 = and i32 %12, 7, !dbg !9 + %16 = or disjoint i32 %14, %11, !dbg !10 + %17 = or disjoint i32 %11, %15, !dbg !10 + %18 = icmp slt i32 %16, %5, !dbg !11 + %19 = icmp slt i32 %17, %5, !dbg !11 + %20 = shl nuw nsw i32 %12, 2, !dbg !12 + %21 = and i32 %20, 124, !dbg !12 + %22 = sext i32 %16 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %23 = sdiv i64 %22, %.frozen, !dbg !14 + %24 = mul i64 %23, %.frozen, !dbg !13 + %.decomposed = sub i64 %22, %24, !dbg !13 + %25 = srem i64 %23, 32, !dbg !15 + %26 = sdiv i64 %22, %4, !dbg !16 + %.not = icmp ne i64 %.decomposed, 0, !dbg !17 + %27 = icmp slt i32 %11, 0, !dbg !21 + %28 = icmp slt i64 %3, 0, !dbg !22 + %29 = xor i1 %27, %28, !dbg !23 + %narrow = select i1 %29, i1 %.not, i1 false, !dbg !24 + %30 = sext i1 %narrow to i64, !dbg !24 + %31 = add nsw i64 %23, %30, !dbg !24 + %32 = zext nneg i32 %21 to i64, !dbg !25 + %33 = shl i64 %3, 12, !dbg !26 + %34 = mul i64 %33, %26, !dbg !27 + %.idx = shl nsw i64 %25, 8, !dbg !28 + %35 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx, !dbg !28 + %36 = getelementptr bfloat, ptr addrspace(1) %35, i64 %32, !dbg !28 + %.idx1 = shl nsw i64 %.decomposed, 13, !dbg !28 + %37 = getelementptr i8, ptr addrspace(1) %36, i64 %.idx1, !dbg !28 + %38 = getelementptr bfloat, ptr addrspace(1) %37, i64 %34, !dbg !28 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !29 + %40 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %38, i64 %39, i1 %18) #4, !dbg !29 + %41 = extractvalue { i32, i32 } %40, 0, !dbg !29 + %42 = bitcast i32 %41 to <2 x bfloat>, !dbg !29 + %43 = extractvalue { i32, i32 } %40, 1, !dbg !29 + %44 = bitcast i32 %43 to <2 x bfloat>, !dbg !29 + %45 = icmp slt i64 %3, 2, !dbg !30 + %46 = icmp sgt i64 %3, 1, !dbg !31 + %47 = select i1 %46, i64 %3, i64 0, !dbg !32 + %48 = zext i1 %45 to i64, !dbg !33 + %49 = add i64 %47, %48, !dbg !34 + %50 = shl i64 %49, 7, !dbg !35 + %51 = mul i64 %50, %31, !dbg !36 + %.idx2 = shl nsw i64 %.decomposed, 8, !dbg !37 + %52 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx2, !dbg !37 + %53 = getelementptr bfloat, ptr addrspace(1) %52, i64 %32, !dbg !37 + %54 = getelementptr bfloat, ptr addrspace(1) %53, i64 %51, !dbg !37 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !38 + %56 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %54, i64 %55, i1 %18) #4, !dbg !38 + %57 = extractvalue { i32, i32 } %56, 0, !dbg !38 + %58 = bitcast i32 %57 to <2 x bfloat>, !dbg !38 + %59 = extractvalue { i32, i32 } %56, 1, !dbg !38 + %60 = bitcast i32 %59 to <2 x bfloat>, !dbg !38 + %61 = fpext <2 x bfloat> %42 to <2 x float>, !dbg !39 + %62 = fpext <2 x bfloat> %58 to <2 x float>, !dbg !40 + %63 = fmul <2 x float> %61, %62, !dbg !41 + %64 = fadd <2 x float> %63, zeroinitializer, !dbg !42 + %65 = fpext <2 x bfloat> %44 to <2 x float>, !dbg !39 + %66 = fpext <2 x bfloat> %60 to <2 x float>, !dbg !40 + %67 = fmul <2 x float> %65, %66, !dbg !41 + %68 = fadd <2 x float> %67, zeroinitializer, !dbg !42 + %shift = shufflevector <2 x float> %64, <2 x float> poison, <2 x i32> , !dbg !43 + %foldExtExtBinop = fadd <2 x float> %64, %shift, !dbg !43 + %foldExtExtBinop4 = fadd <2 x float> %68, %foldExtExtBinop, !dbg !43 + %shift6 = shufflevector <2 x float> %68, <2 x float> poison, <2 x i32> , !dbg !43 + %foldExtExtBinop7 = fadd <2 x float> %shift6, %foldExtExtBinop4, !dbg !43 + %69 = extractelement <2 x float> %foldExtExtBinop7, i64 0, !dbg !43 + %70 = select i1 %18, float %69, float 0.000000e+00, !dbg !43 + %71 = bitcast float %70 to i32, !dbg !47 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 16, i32 31), !dbg !47 + %73 = bitcast i32 %72 to float, !dbg !47 + %74 = fadd float %70, %73, !dbg !43 + %75 = bitcast float %74 to i32, !dbg !47 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 8, i32 31), !dbg !47 + %77 = bitcast i32 %76 to float, !dbg !47 + %78 = fadd float %74, %77, !dbg !43 + %79 = bitcast float %78 to i32, !dbg !47 + %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 4, i32 31), !dbg !47 + %81 = bitcast i32 %80 to float, !dbg !47 + %82 = fadd float %78, %81, !dbg !43 + %83 = bitcast float %82 to i32, !dbg !47 + %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 2, i32 31), !dbg !47 + %85 = bitcast i32 %84 to float, !dbg !47 + %86 = fadd float %82, %85, !dbg !43 + %87 = bitcast float %86 to i32, !dbg !47 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !47 + %89 = bitcast i32 %88 to float, !dbg !47 + %90 = fadd float %86, %89, !dbg !43 + %91 = lshr exact i32 %13, 3, !dbg !48 + %92 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %91, !dbg !48 + store float %90, ptr addrspace(3) %92, align 4, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %93 = shl nuw nsw i32 %15, 2, !dbg !48 + %94 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %93, !dbg !48 + %95 = load i32, ptr addrspace(3) %94, align 4, !dbg !48 + %96 = sext i32 %17 to i64, !dbg !49 + %97 = getelementptr float, ptr addrspace(1) %2, i64 %96, !dbg !49 + %98 = and i32 %12, 248, !dbg !50 + %99 = icmp eq i32 %98, 0, !dbg !50 + %100 = and i1 %99, %19, !dbg !50 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %95, ptr addrspace(1) %97, i1 %100) #4, !dbg !50 + ret void, !dbg !51 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 22, column: 33, scope: !4) +!9 = !DILocation(line: 23, column: 44, scope: !4) +!10 = !DILocation(line: 23, column: 23, scope: !4) +!11 = !DILocation(line: 24, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 37, scope: !4) +!13 = !DILocation(line: 27, column: 19, scope: !4) +!14 = !DILocation(line: 28, column: 21, scope: !4) +!15 = !DILocation(line: 28, column: 28, scope: !4) +!16 = !DILocation(line: 29, column: 19, scope: !4) +!17 = !DILocation(line: 74, column: 34, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !4, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 30, column: 51, scope: !4) +!21 = !DILocation(line: 75, column: 25, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 75, column: 36, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 75, column: 32, scope: !18, inlinedAt: !20) +!24 = !DILocation(line: 75, column: 47, scope: !18, inlinedAt: !20) +!25 = !DILocation(line: 39, column: 41, scope: !4) +!26 = !DILocation(line: 39, column: 65, scope: !4) +!27 = !DILocation(line: 39, column: 69, scope: !4) +!28 = !DILocation(line: 39, column: 34, scope: !4) +!29 = !DILocation(line: 39, column: 74, scope: !4) +!30 = !DILocation(line: 40, column: 73, scope: !4) +!31 = !DILocation(line: 40, column: 99, scope: !4) +!32 = !DILocation(line: 40, column: 90, scope: !4) +!33 = !DILocation(line: 40, scope: !4) +!34 = !DILocation(line: 40, column: 81, scope: !4) +!35 = !DILocation(line: 40, column: 54, scope: !4) +!36 = !DILocation(line: 40, column: 58, scope: !4) +!37 = !DILocation(line: 40, column: 34, scope: !4) +!38 = !DILocation(line: 40, column: 106, scope: !4) +!39 = !DILocation(line: 39, column: 136, scope: !4) +!40 = !DILocation(line: 40, column: 168, scope: !4) +!41 = !DILocation(line: 41, column: 22, scope: !4) +!42 = !DILocation(line: 43, column: 23, scope: !4) +!43 = !DILocation(line: 261, column: 15, scope: !44, inlinedAt: !46) +!44 = distinct !DILexicalBlockFile(scope: !4, file: !45, discriminator: 0) +!45 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!46 = !DILocation(line: 45, column: 25, scope: !4) +!47 = !DILocation(line: 291, column: 36, scope: !44, inlinedAt: !46) +!48 = !DILocation(line: 45, column: 28, scope: !4) +!49 = !DILocation(line: 49, column: 25, scope: !4) +!50 = !DILocation(line: 49, column: 36, scope: !4) +!51 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..29427c0a1abb69e8962b6c6c5d57de491572a594 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ptx @@ -0,0 +1,476 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u64 triton_red_fused_zeros_0_param_3, + .param .u64 triton_red_fused_zeros_0_param_4, + .param .u32 triton_red_fused_zeros_0_param_5, + .param .u32 triton_red_fused_zeros_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_8 +) +.reqntid 256 +{ + .reg .pred %p<15>; + .reg .b16 %rs<9>; + .reg .b32 %r<62>; + .reg .b64 %rd<53>; + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_4]; + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_3]; +$L__tmp0: + .loc 1 22 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:28 + mov.u32 %r8, %ctaid.x; + .loc 1 22 33 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:22:33 + shl.b32 %r1, %r8, 3; + .loc 1 23 44 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:44 + mov.u32 %r2, %tid.x; + bfe.u32 %r9, %r2, 5, 3; + and.b32 %r4, %r2, 7; + .loc 1 23 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:23:23 + or.b32 %r10, %r9, %r1; + .loc 1 25 37 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:25:37 + shl.b32 %r11, %r2, 2; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + cvt.s64.s32 %rd1, %r10; + .loc 1 28 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:21 + or.b64 %rd17, %rd1, %rd14; + and.b64 %rd18, %rd17, -4294967296; + setp.ne.b64 %p1, %rd18, 0; + cvt.u32.u64 %r61, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd51, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r12, %rd14; + div.u32 %r14, %r61, %r12; + cvt.u64.u32 %rd51, %r14; +$L__BB0_3: + .loc 1 0 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:0:21 + ld.param.b32 %r7, [triton_red_fused_zeros_0_param_5]; + ld.param.b64 %rd13, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd12, [triton_red_fused_zeros_0_param_1]; + ld.param.b64 %rd11, [triton_red_fused_zeros_0_param_0]; + and.b32 %r3, %r2, 224; + or.b32 %r5, %r1, %r4; + and.b32 %r6, %r11, 124; + .loc 1 27 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:27:19 + mul.lo.s64 %rd19, %rd51, %rd14; + sub.s64 %rd6, %rd1, %rd19; + .loc 1 28 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:28:28 + shr.s64 %rd20, %rd51, 63; + shr.u64 %rd21, %rd20, 59; + add.s64 %rd22, %rd51, %rd21; + and.b64 %rd23, %rd22, -32; + sub.s64 %rd7, %rd51, %rd23; + .loc 1 29 19 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:29:19 + or.b64 %rd24, %rd1, %rd15; + and.b64 %rd25, %rd24, -4294967296; + setp.ne.b64 %p2, %rd25, 0; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd52, %rd1, %rd15; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r15, %rd15; + div.u32 %r17, %r61, %r15; + cvt.u64.u32 %rd52, %r17; +$L__BB0_6: + .loc 1 24 21 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:24:21 + setp.lt.s32 %p6, %r5, %r7; + setp.lt.s32 %p3, %r61, %r7; +$L__tmp1: + .loc 2 74 34 // triton_helpers.py:74:34 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.ne.b64 %p7, %rd6, 0; + .loc 2 75 25 // triton_helpers.py:75:25 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s32 %p8, %r1, 0; + .loc 2 75 36 // triton_helpers.py:75:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + setp.lt.s64 %p9, %rd14, 0; + .loc 2 75 32 // triton_helpers.py:75:32 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + xor.pred %p10, %p8, %p9; + .loc 2 75 47 // triton_helpers.py:75:47 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:30:51 ] + and.pred %p11, %p10, %p7; + selp.b64 %rd33, -1, 0, %p11; + add.s64 %rd34, %rd51, %rd33; +$L__tmp2: + .loc 1 39 69 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:69 + mul.lo.s64 %rd35, %rd14, %rd52; + .loc 1 39 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:34 + shl.b64 %rd36, %rd7, 8; + add.s64 %rd37, %rd11, %rd36; + mul.wide.u32 %rd38, %r6, 2; + add.s64 %rd39, %rd37, %rd38; + shl.b64 %rd40, %rd6, 13; + add.s64 %rd41, %rd39, %rd40; + shl.b64 %rd42, %rd35, 13; + add.s64 %rd27, %rd41, %rd42; + .loc 1 39 74 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:74 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd28, 1.0; + // end inline asm + mov.b32 %r20, 0; + // begin inline asm + mov.u32 %r18, %r20; + mov.u32 %r19, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r18, %r19 }, [ %rd27 + 0 ], %rd28; + // end inline asm + .loc 1 40 73 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:73 + setp.lt.s64 %p12, %rd14, 2; + .loc 1 40 99 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:99 + setp.gt.s64 %p13, %rd14, 1; + .loc 1 40 90 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:90 + selp.b64 %rd43, %rd14, 0, %p13; + .loc 1 40 0 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40 + selp.b64 %rd44, 1, 0, %p12; + .loc 1 40 81 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:81 + add.s64 %rd45, %rd43, %rd44; + .loc 1 40 58 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:58 + mul.lo.s64 %rd46, %rd45, %rd34; + .loc 1 40 34 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:34 + shl.b64 %rd47, %rd6, 8; + add.s64 %rd48, %rd12, %rd47; + add.s64 %rd49, %rd48, %rd38; + shl.b64 %rd50, %rd46, 8; + add.s64 %rd30, %rd49, %rd50; + .loc 1 40 106 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:106 + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r22, %r20; + mov.u32 %r23, %r20; + @%p3 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r22, %r23 }, [ %rd30 + 0 ], %rd31; + // end inline asm + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs1, %rs2}, %r18; + cvt.f32.bf16 %r28, %rs1; + cvt.f32.bf16 %r29, %rs2; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs3, %rs4}, %r22; + cvt.f32.bf16 %r30, %rs3; + cvt.f32.bf16 %r31, %rs4; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r32, %r29, %r31, 0f00000000; + fma.rn.f32 %r33, %r28, %r30, 0f00000000; + .loc 1 39 136 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:39:136 + mov.b32 {%rs5, %rs6}, %r19; + cvt.f32.bf16 %r34, %rs5; + cvt.f32.bf16 %r35, %rs6; + .loc 1 40 168 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:40:168 + mov.b32 {%rs7, %rs8}, %r23; + cvt.f32.bf16 %r36, %rs7; + cvt.f32.bf16 %r37, %rs8; + .loc 1 43 23 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:43:23 + fma.rn.f32 %r38, %r35, %r37, 0f00000000; + fma.rn.f32 %r39, %r34, %r36, 0f00000000; +$L__tmp3: + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r40, %r33, %r32; + add.f32 %r41, %r39, %r40; + add.f32 %r42, %r38, %r41; + selp.f32 %r43, %r42, 0f00000000, %p3; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r45, %r43, %r44; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r47, %r45, %r46; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r49, %r47, %r48; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r51, %r49, %r50; + .loc 3 291 36 // standard.py:291:36 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:25 ] + add.f32 %r53, %r51, %r52; +$L__tmp4: + .loc 1 45 28 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:45:28 + shr.u32 %r54, %r3, 3; + mov.b32 %r55, global_smem; + add.s32 %r56, %r55, %r54; + st.shared.b32 [%r56], %r53; + bar.sync 0; + shl.b32 %r57, %r4, 2; + add.s32 %r58, %r55, %r57; + ld.shared.b32 %r26, [%r58]; + .loc 1 49 25 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:25 + mad.wide.s32 %rd32, %r5, 4, %rd13; + .loc 1 49 36 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:36 + and.b32 %r59, %r2, 248; + setp.eq.b32 %p14, %r59, 0; + and.pred %p5, %p14, %p6; + // begin inline asm + @%p5 st.global.b32 [ %rd32 + 0 ], { %r26 }; + // end inline asm + .loc 1 49 4 // c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 233 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe2 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 52 +.b8 100 +.b8 110 +.b8 112 +.b8 110 +.b8 99 +.b8 50 +.b8 52 +.b8 117 +.b8 106 +.b8 115 +.b8 52 +.b8 101 +.b8 108 +.b8 51 +.b8 111 +.b8 122 +.b8 101 +.b8 100 +.b8 97 +.b8 55 +.b8 112 +.b8 110 +.b8 115 +.b8 118 +.b8 116 +.b8 101 +.b8 119 +.b8 104 +.b8 55 +.b8 120 +.b8 54 +.b8 55 +.b8 103 +.b8 122 +.b8 119 +.b8 116 +.b8 102 +.b8 111 +.b8 55 +.b8 122 +.b8 120 +.b8 118 +.b8 114 +.b8 102 +.b8 115 +.b8 113 +.b8 122 +.b8 115 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 52 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 30 // DW_AT_call_line +.b8 51 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd3:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..aa35e3a4d8e5e0222ec37e3cbb33b748a1ccfd8d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.source @@ -0,0 +1,325 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":69:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc70 = loc(unknown) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("in_ptr1"(#loc)) +#loc79 = loc("out_ptr1"(#loc)) +#loc80 = loc("ks0"(#loc)) +#loc81 = loc("ks1"(#loc)) +#loc82 = loc("xnumel"(#loc)) +#loc83 = loc("r0_numel"(#loc)) +#loc135 = loc("a"(#loc56)) +#loc136 = loc("b"(#loc56)) +#loc142 = loc("input"(#loc68)) +#loc143 = loc("a"(#loc73)) +#loc144 = loc("b"(#loc73)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 128 : i32 loc(#loc84) + %xoffset = tt.get_program_id x : i32 loc(#loc85) + %xoffset_1 = arith.constant 8 : i32 loc(#loc86) + %xoffset_2 = arith.constant 8 : i32 loc(#loc86) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc86) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc87) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc88) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc89) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc89) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc90) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc90) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc91) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc92) + %x0 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc93) + %x0_9 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc93) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<8x1xi64> loc(#loc93) + %x1 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc94) + %x1_11 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc94) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<8x1xi64> loc(#loc94) + %x1_13 = arith.constant 32 : i32 loc(#loc95) + %x1_14 = arith.constant 32 : i64 loc(#loc95) + %x1_15 = arith.constant dense<32> : tensor<8x1xi64> loc(#loc95) + %x1_16 = arith.remsi %x1_12, %x1_15 : tensor<8x1xi64> loc(#loc95) + %x2 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc96) + %x2_17 = tt.splat %ks1 : i64 -> tensor<8x1xi64> loc(#loc96) + %x2_18 = arith.divsi %x2, %x2_17 : tensor<8x1xi64> loc(#loc96) + %x5 = tt.call @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S8_1S_i64__(%xindex_6, %ks0) : (tensor<8x1xi32>, i64) -> tensor<8x1xi64> loc(#loc97) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %_tmp4_19 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc98) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c128_i32 = arith.constant 128 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c128_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_20 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_23 = %_tmp4_19) -> (tensor<8x128xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x128xi32> loc(#loc100) + %r0_index_24 = arith.addi %r0_index, %r0_base_8 : tensor<1x128xi32> loc(#loc100) + %r0_mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc101) + %r0_mask_25 = arith.cmpi slt, %r0_index_24, %r0_mask : tensor<1x128xi32> loc(#loc101) + %tmp0 = arith.constant 128 : i32 loc(#loc102) + %tmp0_26 = arith.constant 128 : i64 loc(#loc102) + %tmp0_27 = arith.constant dense<128> : tensor<8x1xi64> loc(#loc102) + %tmp0_28 = arith.muli %tmp0_27, %x1_16 : tensor<8x1xi64> loc(#loc102) + %tmp0_29 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc103) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<1x128xi64> -> tensor<8x128xi64> loc(#loc103) + %tmp0_31 = tt.broadcast %tmp0_28 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc103) + %tmp0_32 = arith.addi %tmp0_30, %tmp0_31 : tensor<8x128xi64> loc(#loc103) + %tmp0_33 = arith.constant 4096 : i32 loc(#loc104) + %tmp0_34 = arith.constant 4096 : i64 loc(#loc104) + %tmp0_35 = arith.constant dense<4096> : tensor<8x1xi64> loc(#loc104) + %tmp0_36 = arith.muli %tmp0_35, %x0_10 : tensor<8x1xi64> loc(#loc104) + %tmp0_37 = tt.broadcast %tmp0_36 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc105) + %tmp0_38 = arith.addi %tmp0_32, %tmp0_37 : tensor<8x128xi64> loc(#loc105) + %tmp0_39 = arith.constant 4096 : i32 loc(#loc106) + %tmp0_40 = arith.constant 4096 : i64 loc(#loc106) + %tmp0_41 = arith.muli %tmp0_40, %ks0 : i64 loc(#loc106) + %tmp0_42 = tt.splat %tmp0_41 : i64 -> tensor<8x1xi64> loc(#loc107) + %tmp0_43 = arith.muli %tmp0_42, %x2_18 : tensor<8x1xi64> loc(#loc107) + %tmp0_44 = tt.broadcast %tmp0_43 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc108) + %tmp0_45 = arith.addi %tmp0_38, %tmp0_44 : tensor<8x128xi64> loc(#loc108) + %tmp0_46 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc109) + %tmp0_47 = tt.addptr %tmp0_46, %tmp0_45 : tensor<8x128x!tt.ptr>, tensor<8x128xi64> loc(#loc109) + %tmp0_48 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc110) + %tmp0_49 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc110) + %tmp0_50 = arith.andi %tmp0_48, %tmp0_49 : tensor<8x128xi1> loc(#loc110) + %tmp0_51 = arith.constant 0.000000e+00 : f32 loc(#loc111) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc111) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc111) + %tmp0_54 = tt.load %tmp0_47, %tmp0_50, %tmp0_53 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc111) + %tmp0_55 = arith.extf %tmp0_54 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc112) + %tmp1 = arith.constant 128 : i32 loc(#loc113) + %tmp1_56 = arith.constant 128 : i64 loc(#loc113) + %tmp1_57 = arith.constant dense<128> : tensor<8x1xi64> loc(#loc113) + %tmp1_58 = arith.muli %tmp1_57, %x0_10 : tensor<8x1xi64> loc(#loc113) + %tmp1_59 = arith.extsi %r0_index_24 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc114) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<1x128xi64> -> tensor<8x128xi64> loc(#loc114) + %tmp1_61 = tt.broadcast %tmp1_58 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc114) + %tmp1_62 = arith.addi %tmp1_60, %tmp1_61 : tensor<8x128xi64> loc(#loc114) + %tmp1_63 = arith.constant 128 : i32 loc(#loc115) + %tmp1_64 = arith.constant 128 : i64 loc(#loc115) + %tmp1_65 = arith.constant dense<128> : tensor<8x1xi64> loc(#loc115) + %tmp1_66 = arith.muli %tmp1_65, %x5 : tensor<8x1xi64> loc(#loc115) + %tmp1_67 = arith.constant 1 : i32 loc(#loc116) + %tmp1_68 = arith.extsi %tmp1_67 : i32 to i64 loc(#loc116) + %tmp1_69 = arith.cmpi sge, %tmp1_68, %ks0 : i64 loc(#loc116) + %tmp1_70 = arith.constant 1 : i32 loc(#loc117) + %tmp1_71 = arith.constant 1 : i32 loc(#loc117) + %tmp1_72 = arith.extui %tmp1_69 : i1 to i32 loc(#loc117) + %tmp1_73 = arith.muli %tmp1_71, %tmp1_72 : i32 loc(#loc117) + %tmp1_74 = arith.constant 1 : i32 loc(#loc118) + %tmp1_75 = arith.extsi %tmp1_74 : i32 to i64 loc(#loc118) + %tmp1_76 = arith.cmpi sgt, %ks0, %tmp1_75 : i64 loc(#loc118) + %tmp1_77 = arith.extui %tmp1_76 : i1 to i64 loc(#loc119) + %tmp1_78 = arith.muli %ks0, %tmp1_77 : i64 loc(#loc119) + %tmp1_79 = arith.extsi %tmp1_73 : i32 to i64 loc(#loc120) + %tmp1_80 = arith.addi %tmp1_79, %tmp1_78 : i64 loc(#loc120) + %tmp1_81 = tt.splat %tmp1_80 : i64 -> tensor<8x1xi64> loc(#loc121) + %tmp1_82 = arith.muli %tmp1_66, %tmp1_81 : tensor<8x1xi64> loc(#loc121) + %tmp1_83 = tt.broadcast %tmp1_82 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc122) + %tmp1_84 = arith.addi %tmp1_62, %tmp1_83 : tensor<8x128xi64> loc(#loc122) + %tmp1_85 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc123) + %tmp1_86 = tt.addptr %tmp1_85, %tmp1_84 : tensor<8x128x!tt.ptr>, tensor<8x128xi64> loc(#loc123) + %tmp1_87 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc124) + %tmp1_88 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc124) + %tmp1_89 = arith.andi %tmp1_87, %tmp1_88 : tensor<8x128xi1> loc(#loc124) + %tmp1_90 = arith.constant 0.000000e+00 : f32 loc(#loc125) + %tmp1_91 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc125) + %tmp1_92 = arith.truncf %tmp1_91 : tensor<8x128xf32> to tensor<8x128xbf16> loc(#loc125) + %tmp1_93 = tt.load %tmp1_86, %tmp1_89, %tmp1_92 evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc125) + %tmp1_94 = arith.extf %tmp1_93 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc126) + %tmp2 = arith.mulf %tmp0_55, %tmp1_94 : tensor<8x128xf32> loc(#loc127) + %tmp5 = arith.addf %_tmp4_23, %tmp2 : tensor<8x128xf32> loc(#loc128) + %_tmp4_95 = tt.broadcast %r0_mask_25 : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc129) + %_tmp4_96 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc129) + %_tmp4_97 = arith.andi %_tmp4_95, %_tmp4_96 : tensor<8x128xi1> loc(#loc129) + %_tmp4_98 = arith.select %_tmp4_97, %tmp5, %_tmp4_23 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc130) + scf.yield %_tmp4_98 : tensor<8x128xf32> loc(#loc48) + } loc(#loc99) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_20) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc131) + %tmp4_21 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc132) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc133) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<8x1xf32> loc(#loc134) + %tmp8_22 = arith.subf %tmp4_21, %tmp8 : tensor<8x1xf32> loc(#loc134) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc53) + %5 = tt.addptr %4, %xindex_6 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc53) + tt.store %5, %tmp8_22, %xmask_7 : tensor<8x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.div_floor_integer__i32S8_1S_i64__(%a: tensor<8x1xi32> loc("a"(#loc56)), %b: i64 loc("b"(#loc56))) -> tensor<8x1xi64> attributes {noinline = false} { + %quot = arith.extsi %a : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc137) + %quot_0 = tt.splat %b : i64 -> tensor<8x1xi64> loc(#loc137) + %quot_1 = arith.divsi %quot, %quot_0 : tensor<8x1xi64> loc(#loc137) + %remainder = arith.extsi %a : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc138) + %remainder_2 = tt.splat %b : i64 -> tensor<8x1xi64> loc(#loc138) + %remainder_3 = arith.remsi %remainder, %remainder_2 : tensor<8x1xi64> loc(#loc138) + %fixed = arith.constant 0 : i32 loc(#loc139) + %fixed_4 = arith.extsi %fixed : i32 to i64 loc(#loc139) + %fixed_5 = tt.splat %fixed_4 : i64 -> tensor<8x1xi64> loc(#loc139) + %fixed_6 = arith.cmpi ne, %remainder_3, %fixed_5 : tensor<8x1xi64> loc(#loc139) + %fixed_7 = arith.constant 1 : i32 loc(#loc140) + %fixed_8 = arith.constant 1 : i64 loc(#loc140) + %fixed_9 = arith.constant dense<1> : tensor<8x1xi64> loc(#loc140) + %fixed_10 = arith.subi %quot_1, %fixed_9 : tensor<8x1xi64> loc(#loc140) + %fixed_11 = arith.select %fixed_6, %fixed_10, %quot_1 : tensor<8x1xi1>, tensor<8x1xi64> loc(#loc141) + %c0_i32 = arith.constant 0 : i32 loc(#loc62) + %cst = arith.constant dense<0> : tensor<8x1xi32> loc(#loc62) + %0 = arith.cmpi slt, %a, %cst : tensor<8x1xi32> loc(#loc62) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc63) + %1 = arith.extsi %c0_i32_12 : i32 to i64 loc(#loc63) + %2 = arith.cmpi slt, %b, %1 : i64 loc(#loc63) + %3 = tt.splat %2 : i1 -> tensor<8x1xi1> loc(#loc64) + %4 = arith.cmpi ne, %0, %3 : tensor<8x1xi1> loc(#loc64) + %5 = arith.select %4, %fixed_11, %quot_1 : tensor<8x1xi1>, tensor<8x1xi64> loc(#loc65) + tt.return %5 : tensor<8x1xi64> loc(#loc66) + ^bb1: // no predecessors + %6 = ub.poison : tensor<8x1xi64> loc(#loc67) + tt.return %6 : tensor<8x1xi64> loc(#loc67) + } loc(#loc56) + tt.func private @"triton.language.standard.sum__fp32S8_128S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x128xf32> loc("input"(#loc68))) -> tensor<8xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc69) + tt.reduce.return %2 : f32 loc(#loc69) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc69) + tt.return %0 : tensor<8xf32> loc(#loc71) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xf32> loc(#loc72) + tt.return %1 : tensor<8xf32> loc(#loc72) + } loc(#loc68) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc73)), %b: f32 loc("b"(#loc73))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc74) + tt.return %0 : f32 loc(#loc75) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc76) + tt.return %1 : f32 loc(#loc76) + } loc(#loc73) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":31:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:116) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:8) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":47:11) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":48:18) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":72:16) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":73:20) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc84 = loc("r0_numel"(#loc1)) +#loc85 = loc("xoffset"(#loc2)) +#loc86 = loc("xoffset"(#loc3)) +#loc87 = loc("xindex"(#loc4)) +#loc88 = loc("xindex"(#loc5)) +#loc89 = loc("xindex"(#loc6)) +#loc90 = loc("xmask"(#loc7)) +#loc91 = loc("r0_base"(#loc8)) +#loc92 = loc("r0_base"(#loc9)) +#loc93 = loc("x0"(#loc10)) +#loc94 = loc("x1"(#loc11)) +#loc95 = loc("x1"(#loc12)) +#loc96 = loc("x2"(#loc13)) +#loc97 = loc("x5"(#loc14)) +#loc98 = loc("_tmp4"(#loc15)) +#loc99 = loc("_tmp4"(#loc16)) +#loc100 = loc("r0_index"(#loc17)) +#loc101 = loc("r0_mask"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp0"(#loc20)) +#loc104 = loc("tmp0"(#loc21)) +#loc105 = loc("tmp0"(#loc22)) +#loc106 = loc("tmp0"(#loc23)) +#loc107 = loc("tmp0"(#loc24)) +#loc108 = loc("tmp0"(#loc25)) +#loc109 = loc("tmp0"(#loc26)) +#loc110 = loc("tmp0"(#loc27)) +#loc111 = loc("tmp0"(#loc28)) +#loc112 = loc("tmp0"(#loc29)) +#loc113 = loc("tmp1"(#loc30)) +#loc114 = loc("tmp1"(#loc31)) +#loc115 = loc("tmp1"(#loc32)) +#loc116 = loc("tmp1"(#loc33)) +#loc117 = loc("tmp1"(#loc34)) +#loc118 = loc("tmp1"(#loc35)) +#loc119 = loc("tmp1"(#loc36)) +#loc120 = loc("tmp1"(#loc37)) +#loc121 = loc("tmp1"(#loc38)) +#loc122 = loc("tmp1"(#loc39)) +#loc123 = loc("tmp1"(#loc40)) +#loc124 = loc("tmp1"(#loc41)) +#loc125 = loc("tmp1"(#loc42)) +#loc126 = loc("tmp1"(#loc43)) +#loc127 = loc("tmp2"(#loc44)) +#loc128 = loc("tmp5"(#loc45)) +#loc129 = loc("_tmp4"(#loc46)) +#loc130 = loc("_tmp4"(#loc47)) +#loc131 = loc("tmp4"(#loc49)) +#loc132 = loc("tmp4"(#loc50)) +#loc133 = loc("tmp7"(#loc51)) +#loc134 = loc("tmp8"(#loc52)) +#loc137 = loc("quot"(#loc57)) +#loc138 = loc("remainder"(#loc58)) +#loc139 = loc("fixed"(#loc59)) +#loc140 = loc("fixed"(#loc60)) +#loc141 = loc("fixed"(#loc61)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..3135f3cd4900141d199e78be60d7126342e4dac6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,220 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc1 = loc(unknown) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc55 = loc("in_ptr0"(#loc)) +#loc56 = loc("in_ptr1"(#loc)) +#loc57 = loc("out_ptr1"(#loc)) +#loc58 = loc("ks0"(#loc)) +#loc59 = loc("ks1"(#loc)) +#loc60 = loc("xnumel"(#loc)) +#loc61 = loc("r0_numel"(#loc)) +#loc104 = loc("tmp4"(#loc49)) +#loc115 = loc(callsite(#loc1 at #loc104)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<32> : tensor<8x1xi64, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<8x128xbf16, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<8x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %xoffset_8 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc63) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc64) + %xindex_9 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc64) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc64) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc64) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked> loc(#loc65) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc65) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked> loc(#loc65) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<8x1xi32, #blocked1> loc(#loc65) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked> loc(#loc66) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked1> loc(#loc66) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<8x1xi32, #blocked> loc(#loc66) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<8x1xi32, #blocked1> loc(#loc66) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc67) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc67) + %x0 = arith.extsi %xindex_14 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc68) + %x0_20 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked> loc(#loc68) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<8x1xi64, #blocked> loc(#loc68) + %x1 = arith.divsi %x0, %x0_20 : tensor<8x1xi64, #blocked> loc(#loc69) + %x1_22 = arith.remsi %x1, %cst_5 : tensor<8x1xi64, #blocked> loc(#loc70) + %x2 = tt.splat %ks1 : i64 -> tensor<8x1xi64, #blocked> loc(#loc71) + %x2_23 = arith.divsi %x0, %x2 : tensor<8x1xi64, #blocked> loc(#loc71) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<8x1xi64, #blocked> loc(#loc106) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<8x1xi64, #blocked> loc(#loc107) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<8x1xi1, #blocked>, tensor<8x1xi64, #blocked> loc(#loc108) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<8x1xi32, #blocked> loc(#loc109) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc110) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<8x1xi1, #blocked> loc(#loc111) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<8x1xi1, #blocked> loc(#loc111) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<8x1xi1, #blocked>, tensor<8x1xi64, #blocked> loc(#loc112) + %r0_mask = arith.cmpi slt, %r0_base_19, %cst : tensor<1x128xi32, #blocked> loc(#loc76) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<8x1xi64, #blocked> loc(#loc77) + %tmp0_30 = arith.extsi %r0_base_19 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> loc(#loc78) + %tmp0_31 = tt.broadcast %tmp0_30 : tensor<1x128xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc78) + %tmp0_32 = tt.broadcast %tmp0 : tensor<8x1xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc78) + %tmp0_33 = arith.addi %tmp0_31, %tmp0_32 : tensor<8x128xi64, #blocked> loc(#loc78) + %tmp0_34 = arith.muli %x0_21, %cst_1 : tensor<8x1xi64, #blocked> loc(#loc79) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<8x1xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc80) + %tmp0_36 = arith.addi %tmp0_33, %tmp0_35 : tensor<8x128xi64, #blocked> loc(#loc80) + %tmp0_37 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc81) + %tmp0_38 = tt.splat %tmp0_37 : i64 -> tensor<8x1xi64, #blocked> loc(#loc82) + %tmp0_39 = arith.muli %tmp0_38, %x2_23 : tensor<8x1xi64, #blocked> loc(#loc82) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<8x1xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc83) + %tmp0_41 = arith.addi %tmp0_36, %tmp0_40 : tensor<8x128xi64, #blocked> loc(#loc83) + %tmp0_42 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc84) + %tmp0_43 = tt.addptr %tmp0_42, %tmp0_41 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi64, #blocked> loc(#loc84) + %tmp0_44 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc85) + %tmp0_45 = tt.broadcast %xmask_17 : tensor<8x1xi1, #blocked> -> tensor<8x128xi1, #blocked> loc(#loc85) + %tmp0_46 = arith.andi %tmp0_44, %tmp0_45 : tensor<8x128xi1, #blocked> loc(#loc85) + %tmp0_47 = tt.load %tmp0_43, %tmp0_46, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc86) + %tmp0_48 = arith.extf %tmp0_47 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<8x1xi64, #blocked> loc(#loc88) + %tmp1_49 = tt.broadcast %tmp1 : tensor<8x1xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc89) + %tmp1_50 = arith.addi %tmp0_31, %tmp1_49 : tensor<8x128xi64, #blocked> loc(#loc89) + %tmp1_51 = arith.muli %x5_29, %cst_0 : tensor<8x1xi64, #blocked> loc(#loc90) + %tmp1_52 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_53 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_54 = arith.extui %tmp1_53 : i1 to i64 loc(#loc93) + %tmp1_55 = arith.muli %ks0, %tmp1_54 : i64 loc(#loc93) + %tmp1_56 = arith.extui %tmp1_52 : i1 to i64 loc(#loc113) + %tmp1_57 = arith.addi %tmp1_56, %tmp1_55 : i64 loc(#loc94) + %tmp1_58 = tt.splat %tmp1_57 : i64 -> tensor<8x1xi64, #blocked> loc(#loc96) + %tmp1_59 = arith.muli %tmp1_51, %tmp1_58 : tensor<8x1xi64, #blocked> loc(#loc96) + %tmp1_60 = tt.broadcast %tmp1_59 : tensor<8x1xi64, #blocked> -> tensor<8x128xi64, #blocked> loc(#loc97) + %tmp1_61 = arith.addi %tmp1_50, %tmp1_60 : tensor<8x128xi64, #blocked> loc(#loc97) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr, #blocked> loc(#loc98) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<8x128x!tt.ptr, #blocked>, tensor<8x128xi64, #blocked> loc(#loc98) + %tmp1_64 = tt.load %tmp1_63, %tmp0_46, %cst_6 evictionPolicy = evict_first : tensor<8x128x!tt.ptr, #blocked> loc(#loc99) + %tmp1_65 = arith.extf %tmp1_64 : tensor<8x128xbf16, #blocked> to tensor<8x128xf32, #blocked> loc(#loc100) + %tmp2 = arith.mulf %tmp0_48, %tmp1_65 : tensor<8x128xf32, #blocked> loc(#loc101) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<8x128xf32, #blocked> loc(#loc102) + %_tmp4 = arith.select %tmp0_46, %tmp5, %cst_7 : tensor<8x128xi1, #blocked>, tensor<8x128xf32, #blocked> loc(#loc103) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_68: f32 loc(callsite(#loc1 at #loc104)), %tmp4_69: f32 loc(callsite(#loc1 at #loc104))): + %tmp4_70 = arith.addf %tmp4_68, %tmp4_69 : f32 loc(#loc116) + tt.reduce.return %tmp4_70 : f32 loc(#loc114) + }) : (tensor<8x128xf32, #blocked>) -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc114) + %tmp4_66 = ttg.convert_layout %tmp4 : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc105) + %tmp4_67 = tt.expand_dims %tmp4_66 {axis = 1 : i32} : tensor<8xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xf32, #blocked1> loc(#loc105) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked1> loc(#loc52) + %1 = tt.addptr %0, %xindex_15 : tensor<8x1x!tt.ptr, #blocked1>, tensor<8x1xi32, #blocked1> loc(#loc52) + tt.store %1, %tmp4_67, %xmask_18 : tensor<8x1x!tt.ptr, #blocked1> loc(#loc53) + tt.return loc(#loc54) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc62 = loc("xoffset"(#loc2)) +#loc63 = loc("xoffset"(#loc3)) +#loc64 = loc("xindex"(#loc4)) +#loc65 = loc("xindex"(#loc5)) +#loc66 = loc("xmask"(#loc6)) +#loc67 = loc("r0_base"(#loc7)) +#loc68 = loc("x0"(#loc8)) +#loc69 = loc("x1"(#loc9)) +#loc70 = loc("x1"(#loc10)) +#loc71 = loc("x2"(#loc11)) +#loc72 = loc("fixed"(#loc12)) +#loc73 = loc("x5"(#loc13)) +#loc74 = loc("fixed"(#loc14)) +#loc75 = loc("fixed"(#loc15)) +#loc76 = loc("r0_mask"(#loc20)) +#loc77 = loc("tmp0"(#loc21)) +#loc78 = loc("tmp0"(#loc22)) +#loc79 = loc("tmp0"(#loc23)) +#loc80 = loc("tmp0"(#loc24)) +#loc81 = loc("tmp0"(#loc25)) +#loc82 = loc("tmp0"(#loc26)) +#loc83 = loc("tmp0"(#loc27)) +#loc84 = loc("tmp0"(#loc28)) +#loc85 = loc("tmp0"(#loc29)) +#loc86 = loc("tmp0"(#loc30)) +#loc87 = loc("tmp0"(#loc31)) +#loc88 = loc("tmp1"(#loc32)) +#loc89 = loc("tmp1"(#loc33)) +#loc90 = loc("tmp1"(#loc34)) +#loc91 = loc("tmp1"(#loc35)) +#loc92 = loc("tmp1"(#loc36)) +#loc93 = loc("tmp1"(#loc37)) +#loc94 = loc("tmp1"(#loc38)) +#loc95 = loc("tmp1"(#loc39)) +#loc96 = loc("tmp1"(#loc40)) +#loc97 = loc("tmp1"(#loc41)) +#loc98 = loc("tmp1"(#loc42)) +#loc99 = loc("tmp1"(#loc43)) +#loc100 = loc("tmp1"(#loc44)) +#loc101 = loc("tmp2"(#loc45)) +#loc102 = loc("tmp5"(#loc46)) +#loc103 = loc("_tmp4"(#loc47)) +#loc105 = loc("tmp4"(#loc51)) +#loc106 = loc(callsite(#loc72 at #loc73)) +#loc107 = loc(callsite(#loc74 at #loc73)) +#loc108 = loc(callsite(#loc75 at #loc73)) +#loc109 = loc(callsite(#loc16 at #loc73)) +#loc110 = loc(callsite(#loc17 at #loc73)) +#loc111 = loc(callsite(#loc18 at #loc73)) +#loc112 = loc(callsite(#loc19 at #loc73)) +#loc113 = loc(fused[#loc94, #loc95]) +#loc114 = loc(callsite(#loc48 at #loc104)) +#loc116 = loc(callsite(#loc50 at #loc114)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..20235653b20b96664031d99ddbea061e7584c443 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UNP7VIBUXRL3IQT4GM5MMUSYWF2AFXAZYAREFPTIQXKWRCHBQ22Q/triton_red_fused_zeros_0.ttir @@ -0,0 +1,215 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":18:0) +#loc6 = loc(unknown) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:25) +#loc57 = loc("in_ptr0"(#loc)) +#loc58 = loc("in_ptr1"(#loc)) +#loc59 = loc("out_ptr1"(#loc)) +#loc60 = loc("ks0"(#loc)) +#loc61 = loc("ks1"(#loc)) +#loc62 = loc("xnumel"(#loc)) +#loc63 = loc("r0_numel"(#loc)) +#loc108 = loc("tmp4"(#loc51)) +#loc119 = loc(callsite(#loc6 at #loc108)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %fixed = arith.constant dense<1> : tensor<8x1xi64> loc(#loc110) + %x5 = arith.constant dense<0> : tensor<8x1xi32> loc(#loc111) + %fixed_0 = arith.constant dense<0> : tensor<8x1xi64> loc(#loc112) + %x5_1 = arith.constant 0 : i64 loc(#loc113) + %c1_i64 = arith.constant 1 : i64 loc(#loc6) + %cst = arith.constant dense<0.000000e+00> : tensor<8x128xbf16> loc(#loc6) + %cst_2 = arith.constant dense<4096> : tensor<8x1xi64> loc(#loc6) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc6) + %cst_3 = arith.constant dense<128> : tensor<8x1xi64> loc(#loc6) + %cst_4 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc6) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<8x128xf32> loc(#loc6) + %x1 = arith.constant dense<32> : tensor<8x1xi64> loc(#loc67) + %c8_i32 = arith.constant 8 : i32 loc(#loc6) + %xoffset = tt.get_program_id x : i32 loc(#loc68) + %xoffset_6 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc69) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc70) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc71) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<8x1xi32> loc(#loc72) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<8x1xi32> loc(#loc72) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc73) + %xmask_10 = arith.cmpi slt, %xindex_9, %xmask : tensor<8x1xi32> loc(#loc73) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc74) + %r0_base_11 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc75) + %x0 = arith.extsi %xindex_9 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc76) + %x0_12 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc76) + %x0_13 = arith.remsi %x0, %x0_12 : tensor<8x1xi64> loc(#loc76) + %x1_14 = arith.divsi %x0, %x0_12 : tensor<8x1xi64> loc(#loc77) + %x1_15 = arith.remsi %x1_14, %x1 : tensor<8x1xi64> loc(#loc67) + %x2 = tt.splat %ks1 : i64 -> tensor<8x1xi64> loc(#loc78) + %x2_16 = arith.divsi %x0, %x2 : tensor<8x1xi64> loc(#loc78) + %fixed_17 = arith.cmpi ne, %x0_13, %fixed_0 : tensor<8x1xi64> loc(#loc112) + %fixed_18 = arith.subi %x1_14, %fixed : tensor<8x1xi64> loc(#loc110) + %fixed_19 = arith.select %fixed_17, %fixed_18, %x1_14 : tensor<8x1xi1>, tensor<8x1xi64> loc(#loc114) + %x5_20 = arith.cmpi slt, %xindex_9, %x5 : tensor<8x1xi32> loc(#loc111) + %x5_21 = arith.cmpi slt, %ks0, %x5_1 : i64 loc(#loc113) + %x5_22 = tt.splat %x5_21 : i1 -> tensor<8x1xi1> loc(#loc115) + %x5_23 = arith.cmpi ne, %x5_20, %x5_22 : tensor<8x1xi1> loc(#loc115) + %x5_24 = arith.select %x5_23, %fixed_19, %x1_14 : tensor<8x1xi1>, tensor<8x1xi64> loc(#loc116) + %r0_mask = arith.cmpi slt, %r0_base_11, %cst_4 : tensor<1x128xi32> loc(#loc80) + %tmp0 = arith.muli %x1_15, %cst_3 : tensor<8x1xi64> loc(#loc81) + %tmp0_25 = arith.extsi %r0_base_11 : tensor<1x128xi32> to tensor<1x128xi64> loc(#loc82) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x128xi64> -> tensor<8x128xi64> loc(#loc82) + %tmp0_27 = tt.broadcast %tmp0 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc82) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<8x128xi64> loc(#loc82) + %tmp0_29 = arith.muli %x0_13, %cst_2 : tensor<8x1xi64> loc(#loc83) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc84) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<8x128xi64> loc(#loc84) + %tmp0_32 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc85) + %tmp0_33 = tt.splat %tmp0_32 : i64 -> tensor<8x1xi64> loc(#loc86) + %tmp0_34 = arith.muli %tmp0_33, %x2_16 : tensor<8x1xi64> loc(#loc86) + %tmp0_35 = tt.broadcast %tmp0_34 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc87) + %tmp0_36 = arith.addi %tmp0_31, %tmp0_35 : tensor<8x128xi64> loc(#loc87) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc88) + %tmp0_38 = tt.addptr %tmp0_37, %tmp0_36 : tensor<8x128x!tt.ptr>, tensor<8x128xi64> loc(#loc88) + %tmp0_39 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<8x128xi1> loc(#loc89) + %tmp0_40 = tt.broadcast %xmask_10 : tensor<8x1xi1> -> tensor<8x128xi1> loc(#loc89) + %tmp0_41 = arith.andi %tmp0_39, %tmp0_40 : tensor<8x128xi1> loc(#loc89) + %tmp0_42 = tt.load %tmp0_38, %tmp0_41, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc90) + %tmp0_43 = arith.extf %tmp0_42 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc91) + %tmp1 = arith.muli %x0_13, %cst_3 : tensor<8x1xi64> loc(#loc92) + %tmp1_44 = tt.broadcast %tmp1 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc93) + %tmp1_45 = arith.addi %tmp0_26, %tmp1_44 : tensor<8x128xi64> loc(#loc93) + %tmp1_46 = arith.muli %x5_24, %cst_3 : tensor<8x1xi64> loc(#loc94) + %tmp1_47 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc95) + %tmp1_48 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc96) + %tmp1_49 = arith.extui %tmp1_48 : i1 to i64 loc(#loc97) + %tmp1_50 = arith.muli %ks0, %tmp1_49 : i64 loc(#loc97) + %tmp1_51 = arith.extui %tmp1_47 : i1 to i64 loc(#loc117) + %tmp1_52 = arith.addi %tmp1_51, %tmp1_50 : i64 loc(#loc98) + %tmp1_53 = tt.splat %tmp1_52 : i64 -> tensor<8x1xi64> loc(#loc100) + %tmp1_54 = arith.muli %tmp1_46, %tmp1_53 : tensor<8x1xi64> loc(#loc100) + %tmp1_55 = tt.broadcast %tmp1_54 : tensor<8x1xi64> -> tensor<8x128xi64> loc(#loc101) + %tmp1_56 = arith.addi %tmp1_45, %tmp1_55 : tensor<8x128xi64> loc(#loc101) + %tmp1_57 = tt.splat %in_ptr1 : !tt.ptr -> tensor<8x128x!tt.ptr> loc(#loc102) + %tmp1_58 = tt.addptr %tmp1_57, %tmp1_56 : tensor<8x128x!tt.ptr>, tensor<8x128xi64> loc(#loc102) + %tmp1_59 = tt.load %tmp1_58, %tmp0_41, %cst evictionPolicy = evict_first : tensor<8x128x!tt.ptr> loc(#loc103) + %tmp1_60 = arith.extf %tmp1_59 : tensor<8x128xbf16> to tensor<8x128xf32> loc(#loc104) + %tmp2 = arith.mulf %tmp0_43, %tmp1_60 : tensor<8x128xf32> loc(#loc105) + %tmp5 = arith.addf %tmp2, %cst_5 : tensor<8x128xf32> loc(#loc106) + %_tmp4 = arith.select %tmp0_41, %tmp5, %cst_5 : tensor<8x128xi1>, tensor<8x128xf32> loc(#loc107) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_62: f32 loc(callsite(#loc6 at #loc108)), %tmp4_63: f32 loc(callsite(#loc6 at #loc108))): + %tmp4_64 = arith.addf %tmp4_62, %tmp4_63 : f32 loc(#loc120) + tt.reduce.return %tmp4_64 : f32 loc(#loc118) + }) : (tensor<8x128xf32>) -> tensor<8xf32> loc(#loc118) + %tmp4_61 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<8xf32> -> tensor<8x1xf32> loc(#loc109) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc54) + %1 = tt.addptr %0, %xindex_9 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc54) + tt.store %1, %tmp4_61, %xmask_10 : tensor<8x1x!tt.ptr> loc(#loc55) + tt.return loc(#loc56) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":30:51) +#loc3 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc4 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":22:33) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:44) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":23:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":24:21) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":25:37) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":27:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":28:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":29:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":35:29) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:55) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:65) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:69) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:60) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:84) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:74) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":39:136) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:45) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:73) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:99) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:90) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:81) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:65) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:58) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:50) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:34) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":44:48) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":45:28) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:36) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/64/c64dnpnc24ujs4el3ozeda7pnsvtewh7x67gzwtfo7zxvrfsqzsg.py":49:4) +#loc64 = loc("fixed"(#loc1)) +#loc65 = loc("x5"(#loc2)) +#loc66 = loc("fixed"(#loc4)) +#loc67 = loc("x1"(#loc7)) +#loc68 = loc("xoffset"(#loc8)) +#loc69 = loc("xoffset"(#loc9)) +#loc70 = loc("xindex"(#loc10)) +#loc71 = loc("xindex"(#loc11)) +#loc72 = loc("xindex"(#loc12)) +#loc73 = loc("xmask"(#loc13)) +#loc74 = loc("r0_base"(#loc14)) +#loc75 = loc("r0_base"(#loc15)) +#loc76 = loc("x0"(#loc16)) +#loc77 = loc("x1"(#loc17)) +#loc78 = loc("x2"(#loc18)) +#loc79 = loc("fixed"(#loc19)) +#loc80 = loc("r0_mask"(#loc22)) +#loc81 = loc("tmp0"(#loc23)) +#loc82 = loc("tmp0"(#loc24)) +#loc83 = loc("tmp0"(#loc25)) +#loc84 = loc("tmp0"(#loc26)) +#loc85 = loc("tmp0"(#loc27)) +#loc86 = loc("tmp0"(#loc28)) +#loc87 = loc("tmp0"(#loc29)) +#loc88 = loc("tmp0"(#loc30)) +#loc89 = loc("tmp0"(#loc31)) +#loc90 = loc("tmp0"(#loc32)) +#loc91 = loc("tmp0"(#loc33)) +#loc92 = loc("tmp1"(#loc34)) +#loc93 = loc("tmp1"(#loc35)) +#loc94 = loc("tmp1"(#loc36)) +#loc95 = loc("tmp1"(#loc37)) +#loc96 = loc("tmp1"(#loc38)) +#loc97 = loc("tmp1"(#loc39)) +#loc98 = loc("tmp1"(#loc40)) +#loc99 = loc("tmp1"(#loc41)) +#loc100 = loc("tmp1"(#loc42)) +#loc101 = loc("tmp1"(#loc43)) +#loc102 = loc("tmp1"(#loc44)) +#loc103 = loc("tmp1"(#loc45)) +#loc104 = loc("tmp1"(#loc46)) +#loc105 = loc("tmp2"(#loc47)) +#loc106 = loc("tmp5"(#loc48)) +#loc107 = loc("_tmp4"(#loc49)) +#loc109 = loc("tmp4"(#loc53)) +#loc110 = loc(callsite(#loc64 at #loc65)) +#loc111 = loc(callsite(#loc3 at #loc65)) +#loc112 = loc(callsite(#loc66 at #loc65)) +#loc113 = loc(callsite(#loc5 at #loc65)) +#loc114 = loc(callsite(#loc79 at #loc65)) +#loc115 = loc(callsite(#loc20 at #loc65)) +#loc116 = loc(callsite(#loc21 at #loc65)) +#loc117 = loc(fused[#loc98, #loc99]) +#loc118 = loc(callsite(#loc50 at #loc108)) +#loc120 = loc(callsite(#loc52 at #loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5223bb8b1082c35def788eee4bcbcafab46b038a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..54a278d9375e99ac5a082f44ef6676037c5bf03c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d6dd0267ef186b38e101eb9a7398684ea3662636 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"hash": "a4245c2045f08b0b65c1388d9fa73cefcc7b55de8b1ede802db38132b148f1a0", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..469985c2ac74b38c81fd33e9435a281c8bf6a666 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.llir @@ -0,0 +1,346 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_2 = internal constant [8 x i8] c"unknown\00" +@assertFile_2 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py\00" +@assertMessage_2 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp36 < ks3\00" +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py\00" +@assertMessage_1 = internal constant [65 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py\00" +@assertMessage_0 = internal constant [64 x i8] c"index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i32 %9, ptr addrspace(1) readnone captures(none) %10, ptr addrspace(1) readnone captures(none) %11) local_unnamed_addr #1 !dbg !9 { + %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %14 = shl i32 %13, 9, !dbg !11 + %15 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %16 = shl nuw nsw i32 %15, 1, !dbg !12 + %17 = and i32 %16, 510, !dbg !12 + %18 = or disjoint i32 %17, %14, !dbg !13 + %19 = or disjoint i32 %18, 1, !dbg !13 + %20 = icmp slt i32 %18, %9, !dbg !14 + %21 = icmp slt i32 %19, %9, !dbg !14 + %22 = sext i32 %18 to i64, !dbg !15 + %23 = sext i32 %19 to i64, !dbg !15 + %.frozen = freeze i64 %5, !dbg !16 + %24 = sdiv i64 %22, %.frozen, !dbg !16 + %25 = mul i64 %24, %.frozen, !dbg !15 + %.decomposed = sub i64 %22, %25, !dbg !15 + %.frozen25 = freeze i64 %5, !dbg !16 + %26 = sdiv i64 %23, %.frozen25, !dbg !16 + %27 = mul i64 %26, %.frozen25, !dbg !15 + %.decomposed26 = sub i64 %23, %27, !dbg !15 + %28 = srem i64 %24, %6, !dbg !17 + %29 = srem i64 %26, %6, !dbg !17 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %22, !dbg !18 + %31 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %33 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %30, i64 %32, i1 %20) #4, !dbg !19 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %35 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %31, i64 %34, i1 %21) #4, !dbg !19 + %36 = getelementptr i64, ptr addrspace(1) %1, i64 %28, !dbg !20 + %37 = getelementptr i64, ptr addrspace(1) %1, i64 %29, !dbg !20 + %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %38, i1 %20) #4, !dbg !21 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %40, i1 %21) #4, !dbg !21 + %42 = sdiv i64 %5, 2, !dbg !22 + %43 = icmp sge i64 %.decomposed, %42, !dbg !23 + %44 = icmp sge i64 %.decomposed26, %42, !dbg !23 + %45 = sub nsw i64 %22, %42, !dbg !24 + %46 = sub nsw i64 %23, %42, !dbg !24 + %47 = getelementptr bfloat, ptr addrspace(1) %0, i64 %45, !dbg !25 + %48 = getelementptr bfloat, ptr addrspace(1) %0, i64 %46, !dbg !25 + %49 = and i1 %20, %43, !dbg !26 + %50 = and i1 %21, %44, !dbg !26 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %52 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %47, i64 %51, i1 %49) #4, !dbg !27 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %54 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %48, i64 %53, i1 %50) #4, !dbg !27 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %55, i1 %49) #4, !dbg !28 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !28 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %57, i1 %50) #4, !dbg !28 + %59 = icmp slt i64 %56, 0, !dbg !29 + %60 = icmp slt i64 %58, 0, !dbg !29 + %61 = select i1 %59, i64 %7, i64 0, !dbg !30 + %62 = add i64 %61, %56, !dbg !30 + %63 = select i1 %60, i64 %7, i64 0, !dbg !30 + %64 = add i64 %63, %58, !dbg !30 + %65 = icmp slt i64 %62, 0, !dbg !31 + %66 = icmp slt i64 %64, 0, !dbg !31 + %67 = icmp sge i64 %62, %7, !dbg !32 + %68 = icmp sge i64 %64, %7, !dbg !32 + %.not4 = or i1 %65, %67, !dbg !33 + %.not8 = or i1 %66, %68, !dbg !34 + %.not1 = and i1 %49, %.not4, !dbg !35 + %.not5 = and i1 %50, %.not8, !dbg !36 + %69 = or i1 %.not1, %.not5, !dbg !36 + br i1 %69, label %70, label %71, !dbg !36 + +70: ; preds = %12 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 36, ptr nonnull @assertFunc_0, i64 1), !dbg !36 + unreachable, !dbg !36 + +71: ; preds = %12 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %72 = sub nsw i64 %.decomposed, %42, !dbg !37 + %73 = sub nsw i64 %.decomposed26, %42, !dbg !37 + %74 = mul i64 %62, %5, !dbg !38 + %75 = mul i64 %64, %5, !dbg !38 + %76 = getelementptr bfloat, ptr addrspace(1) %2, i64 %72, !dbg !39 + %77 = getelementptr bfloat, ptr addrspace(1) %76, i64 %74, !dbg !39 + %78 = getelementptr bfloat, ptr addrspace(1) %2, i64 %73, !dbg !39 + %79 = getelementptr bfloat, ptr addrspace(1) %78, i64 %75, !dbg !39 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %77, i64 %80, i1 %49) #4, !dbg !40 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !40 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %79, i64 %82, i1 %50) #4, !dbg !40 + %84 = icmp slt i64 %.decomposed, %42, !dbg !41 + %85 = icmp slt i64 %.decomposed26, %42, !dbg !41 + %86 = add i64 %5, %22, !dbg !42 + %87 = add i64 %5, %23, !dbg !42 + %88 = sub i64 %86, %42, !dbg !43 + %89 = sub i64 %87, %42, !dbg !43 + %90 = getelementptr bfloat, ptr addrspace(1) %0, i64 %88, !dbg !44 + %91 = getelementptr bfloat, ptr addrspace(1) %0, i64 %89, !dbg !44 + %92 = and i1 %20, %84, !dbg !45 + %93 = and i1 %21, %85, !dbg !45 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %90, i64 %94, i1 %92) #4, !dbg !46 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !46 + %97 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %96, i1 %93) #4, !dbg !46 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %36, i64 %98, i1 %92) #4, !dbg !47 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !47 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %100, i1 %93) #4, !dbg !47 + %102 = icmp slt i64 %99, 0, !dbg !48 + %103 = icmp slt i64 %101, 0, !dbg !48 + %104 = select i1 %102, i64 %7, i64 0, !dbg !49 + %105 = add i64 %104, %99, !dbg !49 + %106 = select i1 %103, i64 %7, i64 0, !dbg !49 + %107 = add i64 %106, %101, !dbg !49 + %108 = icmp slt i64 %105, 0, !dbg !50 + %109 = icmp slt i64 %107, 0, !dbg !50 + %110 = icmp sge i64 %105, %7, !dbg !51 + %111 = icmp sge i64 %107, %7, !dbg !51 + %.not12 = or i1 %108, %110, !dbg !52 + %.not16 = or i1 %109, %111, !dbg !53 + %.not9 = and i1 %92, %.not12, !dbg !54 + %.not13 = and i1 %93, %.not16, !dbg !55 + %112 = or i1 %.not9, %.not13, !dbg !55 + br i1 %112, label %113, label %114, !dbg !55 + +113: ; preds = %71 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 51, ptr nonnull @assertFunc_1, i64 1), !dbg !55 + unreachable, !dbg !55 + +114: ; preds = %71 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !55 + %115 = sub i64 %5, %42, !dbg !56 + %116 = mul i64 %105, %5, !dbg !57 + %117 = mul i64 %107, %5, !dbg !57 + %118 = getelementptr bfloat, ptr addrspace(1) %2, i64 %115, !dbg !58 + %119 = getelementptr bfloat, ptr addrspace(1) %118, i64 %.decomposed, !dbg !58 + %120 = getelementptr bfloat, ptr addrspace(1) %119, i64 %116, !dbg !58 + %121 = getelementptr bfloat, ptr addrspace(1) %118, i64 %.decomposed26, !dbg !58 + %122 = getelementptr bfloat, ptr addrspace(1) %121, i64 %117, !dbg !58 + %123 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %124 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %120, i64 %123, i1 %92) #4, !dbg !59 + %125 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !59 + %126 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %122, i64 %125, i1 %93) #4, !dbg !59 + %127 = icmp slt i64 %39, 0, !dbg !60 + %128 = icmp slt i64 %41, 0, !dbg !60 + %129 = select i1 %127, i64 %8, i64 0, !dbg !61 + %130 = add i64 %129, %39, !dbg !61 + %131 = select i1 %128, i64 %8, i64 0, !dbg !61 + %132 = add i64 %131, %41, !dbg !61 + %133 = icmp slt i64 %130, 0, !dbg !62 + %134 = icmp slt i64 %132, 0, !dbg !62 + %135 = icmp sge i64 %130, %8, !dbg !63 + %136 = icmp sge i64 %132, %8, !dbg !63 + %.not20 = or i1 %133, %135, !dbg !64 + %.not24 = or i1 %134, %136, !dbg !65 + %.not17 = and i1 %20, %.not20, !dbg !66 + %.not21 = and i1 %21, %.not24, !dbg !67 + %137 = or i1 %.not17, %.not21, !dbg !67 + br i1 %137, label %138, label %139, !dbg !67 + +138: ; preds = %114 + tail call void @__assertfail(ptr nonnull @assertMessage_2, ptr nonnull @assertFile_2, i32 62, ptr nonnull @assertFunc_2, i64 1), !dbg !67 + unreachable, !dbg !67 + +139: ; preds = %114 + %140 = bitcast i16 %54 to bfloat, !dbg !27 + %141 = fpext bfloat %140 to float, !dbg !68 + %142 = bitcast i16 %83 to bfloat, !dbg !40 + %143 = fpext bfloat %142 to float, !dbg !69 + %144 = fmul float %141, %143, !dbg !70 + %145 = fsub float 0.000000e+00, %144, !dbg !71 + %146 = select i1 %44, float %145, float 0.000000e+00, !dbg !72 + %147 = bitcast i16 %97 to bfloat, !dbg !46 + %148 = fpext bfloat %147 to float, !dbg !73 + %149 = bitcast i16 %126 to bfloat, !dbg !59 + %150 = fpext bfloat %149 to float, !dbg !74 + %151 = fmul float %148, %150, !dbg !75 + %152 = select i1 %85, float %151, float 0.000000e+00, !dbg !72 + %153 = fadd float %146, %152, !dbg !76 + %154 = bitcast i16 %52 to bfloat, !dbg !27 + %155 = fpext bfloat %154 to float, !dbg !68 + %156 = bitcast i16 %81 to bfloat, !dbg !40 + %157 = fpext bfloat %156 to float, !dbg !69 + %158 = fmul float %155, %157, !dbg !70 + %159 = fsub float 0.000000e+00, %158, !dbg !71 + %160 = select i1 %43, float %159, float 0.000000e+00, !dbg !72 + %161 = bitcast i16 %95 to bfloat, !dbg !46 + %162 = fpext bfloat %161 to float, !dbg !73 + %163 = bitcast i16 %124 to bfloat, !dbg !59 + %164 = fpext bfloat %163 to float, !dbg !74 + %165 = fmul float %162, %164, !dbg !75 + %166 = select i1 %84, float %165, float 0.000000e+00, !dbg !72 + %167 = fadd float %160, %166, !dbg !76 + %168 = bitcast i16 %35 to bfloat, !dbg !19 + %169 = fpext bfloat %168 to float, !dbg !77 + %170 = bitcast i16 %33 to bfloat, !dbg !19 + %171 = fpext bfloat %170 to float, !dbg !77 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !67 + %172 = mul i64 %130, %5, !dbg !78 + %173 = mul i64 %132, %5, !dbg !78 + %174 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed, !dbg !79 + %175 = getelementptr bfloat, ptr addrspace(1) %174, i64 %172, !dbg !79 + %176 = getelementptr bfloat, ptr addrspace(1) %3, i64 %.decomposed26, !dbg !79 + %177 = getelementptr bfloat, ptr addrspace(1) %176, i64 %173, !dbg !79 + %178 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %179 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %175, i64 %178, i1 %20) #4, !dbg !80 + %180 = bitcast i16 %179 to bfloat, !dbg !80 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !80 + %182 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %177, i64 %181, i1 %21) #4, !dbg !80 + %183 = bitcast i16 %182 to bfloat, !dbg !80 + %184 = fpext bfloat %180 to float, !dbg !81 + %185 = fpext bfloat %183 to float, !dbg !81 + %186 = fmul float %171, %184, !dbg !82 + %187 = fmul float %169, %185, !dbg !82 + %188 = fadd float %167, %186, !dbg !83 + %189 = fadd float %153, %187, !dbg !83 + %190 = getelementptr bfloat, ptr addrspace(1) %4, i64 %22, !dbg !84 + %191 = getelementptr bfloat, ptr addrspace(1) %4, i64 %23, !dbg !84 + %192 = fptrunc float %188 to bfloat, !dbg !85 + %193 = fptrunc float %189 to bfloat, !dbg !85 + %194 = bitcast bfloat %192 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %194, ptr addrspace(1) %190, i1 %20) #4, !dbg !85 + %195 = bitcast bfloat %193 to i16, !dbg !85 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %195, ptr addrspace(1) %191, i1 %21) #4, !dbg !85 + ret void, !dbg !86 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1", linkageName: "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 22, column: 19, scope: !9) +!16 = !DILocation(line: 24, column: 21, scope: !9) +!17 = !DILocation(line: 24, column: 28, scope: !9) +!18 = !DILocation(line: 25, column: 31, scope: !9) +!19 = !DILocation(line: 25, column: 36, scope: !9) +!20 = !DILocation(line: 26, column: 31, scope: !9) +!21 = !DILocation(line: 26, column: 36, scope: !9) +!22 = !DILocation(line: 28, column: 18, scope: !9) +!23 = !DILocation(line: 29, column: 19, scope: !9) +!24 = !DILocation(line: 30, column: 35, scope: !9) +!25 = !DILocation(line: 30, column: 30, scope: !9) +!26 = !DILocation(line: 30, column: 60, scope: !9) +!27 = !DILocation(line: 30, column: 53, scope: !9) +!28 = !DILocation(line: 31, column: 35, scope: !9) +!29 = !DILocation(line: 34, column: 18, scope: !9) +!30 = !DILocation(line: 35, column: 32, scope: !9) +!31 = !DILocation(line: 36, column: 28, scope: !9) +!32 = !DILocation(line: 36, column: 98, scope: !9) +!33 = !DILocation(line: 36, column: 64, scope: !9) +!34 = !DILocation(line: 36, column: 108, scope: !9) +!35 = !DILocation(line: 36, column: 106, scope: !9) +!36 = !DILocation(line: 36, column: 123, scope: !9) +!37 = !DILocation(line: 37, column: 36, scope: !9) +!38 = !DILocation(line: 37, column: 58, scope: !9) +!39 = !DILocation(line: 37, column: 31, scope: !9) +!40 = !DILocation(line: 37, column: 65, scope: !9) +!41 = !DILocation(line: 44, column: 19, scope: !9) +!42 = !DILocation(line: 45, column: 37, scope: !9) +!43 = !DILocation(line: 45, column: 42, scope: !9) +!44 = !DILocation(line: 45, column: 31, scope: !9) +!45 = !DILocation(line: 45, column: 68, scope: !9) +!46 = !DILocation(line: 45, column: 60, scope: !9) +!47 = !DILocation(line: 46, column: 36, scope: !9) +!48 = !DILocation(line: 49, column: 20, scope: !9) +!49 = !DILocation(line: 50, column: 35, scope: !9) +!50 = !DILocation(line: 51, column: 28, scope: !9) +!51 = !DILocation(line: 51, column: 100, scope: !9) +!52 = !DILocation(line: 51, column: 65, scope: !9) +!53 = !DILocation(line: 51, column: 110, scope: !9) +!54 = !DILocation(line: 51, column: 108, scope: !9) +!55 = !DILocation(line: 51, column: 126, scope: !9) +!56 = !DILocation(line: 52, column: 37, scope: !9) +!57 = !DILocation(line: 52, column: 64, scope: !9) +!58 = !DILocation(line: 52, column: 31, scope: !9) +!59 = !DILocation(line: 52, column: 72, scope: !9) +!60 = !DILocation(line: 60, column: 20, scope: !9) +!61 = !DILocation(line: 61, column: 35, scope: !9) +!62 = !DILocation(line: 62, column: 28, scope: !9) +!63 = !DILocation(line: 62, column: 46, scope: !9) +!64 = !DILocation(line: 62, column: 38, scope: !9) +!65 = !DILocation(line: 62, column: 56, scope: !9) +!66 = !DILocation(line: 62, column: 54, scope: !9) +!67 = !DILocation(line: 62, column: 64, scope: !9) +!68 = !DILocation(line: 30, column: 111, scope: !9) +!69 = !DILocation(line: 37, column: 123, scope: !9) +!70 = !DILocation(line: 38, column: 19, scope: !9) +!71 = !DILocation(line: 39, column: 13, scope: !9) +!72 = !DILocation(line: 0, scope: !9) +!73 = !DILocation(line: 45, column: 119, scope: !9) +!74 = !DILocation(line: 52, column: 131, scope: !9) +!75 = !DILocation(line: 53, column: 20, scope: !9) +!76 = !DILocation(line: 57, column: 20, scope: !9) +!77 = !DILocation(line: 25, column: 76, scope: !9) +!78 = !DILocation(line: 63, column: 40, scope: !9) +!79 = !DILocation(line: 63, column: 31, scope: !9) +!80 = !DILocation(line: 63, column: 48, scope: !9) +!81 = !DILocation(line: 63, column: 88, scope: !9) +!82 = !DILocation(line: 64, column: 20, scope: !9) +!83 = !DILocation(line: 65, column: 20, scope: !9) +!84 = !DILocation(line: 66, column: 25, scope: !9) +!85 = !DILocation(line: 66, column: 37, scope: !9) +!86 = !DILocation(line: 66, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1e15cfe739b390e984a47c0df428d424ef29c506 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ptx @@ -0,0 +1,759 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 116, 52, 47, 99, 116, 52, 102, 103, 105, 55, 122, 52, 109, 55, 121, 110, 50, 113, 51, 102, 122, 105, 98, 107, 105, 55, 104, 99, 99, 110, 97, 97, 115, 99, 99, 113, 108, 114, 53, 98, 122, 102, 53, 113, 97, 108, 104, 116, 100, 114, 108, 107, 114, 122, 110, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 116, 52, 47, 99, 116, 52, 102, 103, 105, 55, 122, 52, 109, 55, 121, 110, 50, 113, 51, 102, 122, 105, 98, 107, 105, 55, 104, 99, 99, 110, 97, 97, 115, 99, 99, 113, 108, 114, 53, 98, 122, 102, 53, 113, 97, 108, 104, 116, 100, 114, 108, 107, 114, 122, 110, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 116, 52, 47, 99, 116, 52, 102, 103, 105, 55, 122, 52, 109, 55, 121, 110, 50, 113, 51, 102, 122, 105, 98, 107, 105, 55, 104, 99, 99, 110, 97, 97, 115, 99, 99, 113, 108, 114, 53, 98, 122, 102, 53, 113, 97, 108, 104, 116, 100, 114, 108, 107, 114, 122, 110, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_11 +) +.reqntid 256 +{ + .reg .pred %p<71>; + .reg .b16 %rs<33>; + .reg .b32 %r<57>; + .reg .b64 %rd<189>; + .loc 1 18 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:18:0 +$L__func_begin0: + .loc 1 18 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:18:0 + +// %bb.0: + ld.param.b64 %rd40, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_5]; +$L__tmp0: + .loc 1 19 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 22 19 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:22:19 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + .loc 1 24 21 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:24:21 + or.b64 %rd41, %rd1, %rd40; + and.b64 %rd42, %rd41, -4294967296; + setp.ne.b64 %p5, %rd42, 0; + cvt.u32.u64 %r56, %rd1; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd185, %rd1, %rd40; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd40; + div.u32 %r11, %r56, %r9; + cvt.u64.u32 %rd185, %r11; +$L__BB0_3: + .loc 1 0 21 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0:21 + ld.param.b64 %rd37, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_6]; + .loc 1 24 21 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:24:21 + or.b64 %rd44, %rd2, %rd40; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p6, %rd45, 0; + cvt.u32.u64 %r55, %rd2; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd186, %rd2, %rd40; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd40; + div.u32 %r14, %r55, %r12; + cvt.u64.u32 %rd186, %r14; +$L__BB0_6: + .loc 1 22 19 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:22:19 + mul.lo.s64 %rd43, %rd185, %rd40; + mul.lo.s64 %rd11, %rd186, %rd40; + .loc 1 24 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:24:28 + or.b64 %rd46, %rd185, %rd37; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p7, %rd47, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd187, %rd185, %rd37; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd37; + cvt.u32.u64 %r16, %rd185; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd187, %r17; +$L__BB0_9: + .loc 1 0 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0:28 + ld.param.b32 %r1, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_9]; + ld.param.b64 %rd38, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_7]; + ld.param.b64 %rd32, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_1]; + ld.param.b64 %rd31, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_0]; + sub.s64 %rd7, %rd1, %rd43; + sub.s64 %rd12, %rd2, %rd11; + .loc 1 24 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:24:28 + or.b64 %rd48, %rd186, %rd37; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p8, %rd49, 0; + @%p8 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd188, %rd186, %rd37; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd37; + cvt.u32.u64 %r19, %rd186; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd188, %r20; +$L__BB0_12: + .loc 1 21 21 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:21:21 + setp.lt.s32 %p12, %r55, %r1; + setp.lt.s32 %p11, %r56, %r1; + .loc 1 25 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:25:31 + shl.b64 %rd78, %rd1, 1; + add.s64 %rd51, %rd31, %rd78; + add.s64 %rd54, %rd51, 2; + .loc 1 25 36 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:25:36 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 26 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:26:31 + shl.b64 %rd79, %rd187, 3; + add.s64 %rd72, %rd32, %rd79; + shl.b64 %rd80, %rd188, 3; + add.s64 %rd76, %rd32, %rd80; + .loc 1 26 36 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:26:36 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd72 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd76 + 0 ], %rd60; + // end inline asm + .loc 1 28 18 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:28:18 + shr.u64 %rd81, %rd40, 63; + add.s64 %rd82, %rd40, %rd81; + shr.s64 %rd23, %rd82, 1; + .loc 1 29 19 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:29:19 + setp.ge.s64 %p17, %rd7, %rd23; + setp.ge.s64 %p18, %rd12, %rd23; + .loc 1 30 35 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:35 + sub.s64 %rd83, %rd1, %rd23; + .loc 1 30 30 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:30 + shl.b64 %rd84, %rd83, 1; + add.s64 %rd65, %rd31, %rd84; + add.s64 %rd68, %rd65, 2; + .loc 1 30 60 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:60 + and.pred %p15, %p11, %p17; + and.pred %p16, %p12, %p18; + .loc 1 30 53 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:53 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + mov.b16 %rs16, 0; + // begin inline asm + mov.u16 %rs13, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd68 + 0 ], %rd67; + // end inline asm + .loc 1 31 35 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:31:35 + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd71 }, [ %rd72 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd75 }, [ %rd76 + 0 ], %rd74; + // end inline asm + .loc 1 35 32 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:35:32 + shr.s64 %rd85, %rd71, 63; + and.b64 %rd86, %rd85, %rd38; + add.s64 %rd24, %rd86, %rd71; + shr.s64 %rd87, %rd75, 63; + and.b64 %rd88, %rd87, %rd38; + add.s64 %rd25, %rd88, %rd75; + .loc 1 36 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:28 + setp.lt.s64 %p19, %rd24, 0; + setp.lt.s64 %p20, %rd25, 0; + .loc 1 36 98 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:98 + setp.ge.s64 %p21, %rd24, %rd38; + setp.ge.s64 %p22, %rd25, %rd38; + .loc 1 36 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:64 + or.pred %p23, %p19, %p21; + .loc 1 36 108 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:108 + or.pred %p24, %p20, %p22; + .loc 1 36 106 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:106 + and.pred %p25, %p15, %p23; + .loc 1 36 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:123 + and.pred %p26, %p16, %p24; + or.pred %p27, %p25, %p26; + not.pred %p28, %p27; + @%p28 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 0 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0:123 + ld.param.b64 %rd33, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_2]; + .loc 1 36 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:123 + bar.sync 0; + .loc 1 37 36 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:36 + sub.s64 %rd109, %rd7, %rd23; + .loc 1 37 58 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:58 + mul.lo.s64 %rd110, %rd24, %rd40; + mul.lo.s64 %rd111, %rd25, %rd40; + .loc 1 37 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:31 + shl.b64 %rd112, %rd109, 1; + add.s64 %rd113, %rd33, %rd112; + shl.b64 %rd114, %rd110, 1; + add.s64 %rd90, %rd113, %rd114; + sub.s64 %rd26, %rd1, %rd11; + sub.s64 %rd115, %rd26, %rd23; + shl.b64 %rd116, %rd115, 1; + add.s64 %rd117, %rd33, %rd116; + shl.b64 %rd118, %rd111, 1; + add.s64 %rd119, %rd117, %rd118; + add.s64 %rd93, %rd119, 2; + .loc 1 37 65 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:65 + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd90 + 0 ], %rd89; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd93 + 0 ], %rd92; + // end inline asm + .loc 1 44 19 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:44:19 + setp.lt.s64 %p37, %rd7, %rd23; + setp.lt.s64 %p38, %rd12, %rd23; + .loc 1 45 37 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:37 + add.s64 %rd120, %rd40, %rd1; + .loc 1 45 42 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:42 + sub.s64 %rd121, %rd120, %rd23; + .loc 1 45 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:31 + shl.b64 %rd122, %rd121, 1; + add.s64 %rd96, %rd31, %rd122; + add.s64 %rd99, %rd96, 2; + .loc 1 45 68 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:68 + and.pred %p33, %p11, %p37; + and.pred %p34, %p12, %p38; + .loc 1 45 60 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:60 + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs16; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs16; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd99 + 0 ], %rd98; + // end inline asm + .loc 1 46 36 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:46:36 + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd102 }, [ %rd72 + 0 ], %rd101; + // end inline asm + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd106, 0x0; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd106 }, [ %rd76 + 0 ], %rd105; + // end inline asm + .loc 1 50 35 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:50:35 + shr.s64 %rd123, %rd102, 63; + and.b64 %rd124, %rd123, %rd38; + add.s64 %rd27, %rd124, %rd102; + shr.s64 %rd125, %rd106, 63; + and.b64 %rd126, %rd125, %rd38; + add.s64 %rd28, %rd126, %rd106; + .loc 1 51 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:28 + setp.lt.s64 %p39, %rd27, 0; + setp.lt.s64 %p40, %rd28, 0; + .loc 1 51 100 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:100 + setp.ge.s64 %p41, %rd27, %rd38; + setp.ge.s64 %p42, %rd28, %rd38; + .loc 1 51 65 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:65 + or.pred %p43, %p39, %p41; + .loc 1 51 110 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:110 + or.pred %p44, %p40, %p42; + .loc 1 51 108 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:108 + and.pred %p45, %p33, %p43; + .loc 1 51 126 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:126 + and.pred %p46, %p34, %p44; + or.pred %p47, %p45, %p46; + not.pred %p48, %p47; + @%p48 bra $L__BB0_16; + bra.uni $L__BB0_15; +$L__BB0_16: + .loc 1 0 126 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0:126 + ld.param.b64 %rd39, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_8]; + .loc 1 51 126 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:126 + bar.sync 0; + .loc 1 52 37 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:37 + sub.s64 %rd133, %rd40, %rd23; + .loc 1 52 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:64 + mul.lo.s64 %rd134, %rd27, %rd40; + mul.lo.s64 %rd135, %rd28, %rd40; + .loc 1 52 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:31 + shl.b64 %rd136, %rd133, 1; + add.s64 %rd137, %rd33, %rd136; + shl.b64 %rd138, %rd7, 1; + add.s64 %rd139, %rd137, %rd138; + shl.b64 %rd140, %rd134, 1; + add.s64 %rd128, %rd139, %rd140; + shl.b64 %rd141, %rd26, 1; + add.s64 %rd142, %rd137, %rd141; + shl.b64 %rd143, %rd135, 1; + add.s64 %rd144, %rd142, %rd143; + add.s64 %rd131, %rd144, 2; + .loc 1 52 72 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:72 + // begin inline asm + mov.u64 %rd127, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd127, 1.0; + // end inline asm + mov.b16 %rs26, 0; + // begin inline asm + mov.u16 %rs25, %rs26; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd128 + 0 ], %rd127; + // end inline asm + // begin inline asm + mov.u64 %rd130, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs26; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd131 + 0 ], %rd130; + // end inline asm + .loc 1 61 35 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:61:35 + shr.s64 %rd145, %rd57, 63; + and.b64 %rd146, %rd145, %rd39; + add.s64 %rd29, %rd146, %rd57; + shr.s64 %rd147, %rd61, 63; + and.b64 %rd148, %rd147, %rd39; + add.s64 %rd30, %rd148, %rd61; + .loc 1 62 28 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:28 + setp.lt.s64 %p53, %rd29, 0; + setp.lt.s64 %p54, %rd30, 0; + .loc 1 62 46 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:46 + setp.ge.s64 %p55, %rd29, %rd39; + setp.ge.s64 %p56, %rd30, %rd39; + .loc 1 62 38 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:38 + or.pred %p57, %p53, %p55; + .loc 1 62 56 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:56 + or.pred %p58, %p54, %p56; + .loc 1 62 54 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:54 + and.pred %p59, %p11, %p57; + .loc 1 62 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:64 + and.pred %p60, %p12, %p58; + or.pred %p61, %p59, %p60; + not.pred %p62, %p61; + @%p62 bra $L__BB0_18; + bra.uni $L__BB0_17; +$L__BB0_18: + .loc 1 0 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0:64 + ld.param.b64 %rd35, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_4]; + ld.param.b64 %rd34, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1_param_3]; + .loc 1 30 111 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:111 + cvt.f32.bf16 %r29, %rs15; + .loc 1 37 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:123 + cvt.f32.bf16 %r30, %rs19; + .loc 1 39 13 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:39:13 + neg.f32 %r31, %r29; + fma.rn.f32 %r32, %r31, %r30, 0f00000000; + .loc 1 0 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0 + selp.f32 %r33, %r32, 0f00000000, %p18; + .loc 1 45 119 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:119 + cvt.f32.bf16 %r34, %rs23; + .loc 1 52 131 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:131 + cvt.f32.bf16 %r35, %rs27; + .loc 1 53 20 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:53:20 + mul.f32 %r36, %r34, %r35; + .loc 1 0 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0 + selp.f32 %r37, %r36, 0f00000000, %p38; + .loc 1 57 20 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:57:20 + add.f32 %r38, %r33, %r37; + .loc 1 30 111 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:30:111 + cvt.f32.bf16 %r39, %rs13; + .loc 1 37 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:37:123 + cvt.f32.bf16 %r40, %rs17; + .loc 1 39 13 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:39:13 + neg.f32 %r41, %r39; + fma.rn.f32 %r42, %r41, %r40, 0f00000000; + .loc 1 0 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0 + selp.f32 %r43, %r42, 0f00000000, %p17; + .loc 1 45 119 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:45:119 + cvt.f32.bf16 %r44, %rs21; + .loc 1 52 131 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:52:131 + cvt.f32.bf16 %r45, %rs25; + .loc 1 53 20 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:53:20 + mul.f32 %r46, %r44, %r45; + .loc 1 0 0 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:0 + selp.f32 %r47, %r46, 0f00000000, %p37; + .loc 1 57 20 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:57:20 + add.f32 %r48, %r43, %r47; + .loc 1 25 76 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:25:76 + cvt.f32.bf16 %r49, %rs12; + cvt.f32.bf16 %r50, %rs11; + .loc 1 62 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:64 + bar.sync 0; + .loc 1 63 40 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:63:40 + mul.lo.s64 %rd157, %rd29, %rd40; + mul.lo.s64 %rd158, %rd30, %rd40; + .loc 1 63 31 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:63:31 + add.s64 %rd160, %rd34, %rd138; + shl.b64 %rd161, %rd157, 1; + add.s64 %rd150, %rd160, %rd161; + add.s64 %rd163, %rd34, %rd141; + shl.b64 %rd164, %rd158, 1; + add.s64 %rd165, %rd163, %rd164; + add.s64 %rd153, %rd165, 2; + .loc 1 63 48 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:63:48 + // begin inline asm + mov.u64 %rd151, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd151, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd150 + 0 ], %rd151; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd153 + 0 ], %rd154; + // end inline asm + .loc 1 63 88 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:63:88 + cvt.f32.bf16 %r51, %rs29; + cvt.f32.bf16 %r52, %rs30; + .loc 1 65 20 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:65:20 + fma.rn.f32 %r53, %r50, %r51, %r48; + fma.rn.f32 %r54, %r49, %r52, %r38; + .loc 1 66 25 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:66:25 + add.s64 %rd155, %rd35, %rd78; + add.s64 %rd156, %rd155, 2; + .loc 1 66 37 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:66:37 + cvt.rn.bf16.f32 %rs31, %r53; + cvt.rn.bf16.f32 %rs32, %r54; + // begin inline asm + @%p11 st.global.b16 [ %rd155 + 0 ], { %rs31 }; + // end inline asm + // begin inline asm + @%p12 st.global.b16 [ %rd156 + 0 ], { %rs32 }; + // end inline asm + .loc 1 66 4 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:66:4 + ret; +$L__BB0_13: + .loc 1 36 123 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:36:123 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd179, assertFunc_0; + cvta.global.u64 %rd180, %rd179; + st.param.b64 [param3], %rd180; + mov.b64 %rd181, assertFile_0; + cvta.global.u64 %rd182, %rd181; + st.param.b64 [param1], %rd182; + mov.b64 %rd183, assertMessage_0; + cvta.global.u64 %rd184, %rd183; + st.param.b64 [param0], %rd184; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__BB0_15: + .loc 1 51 126 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd173, assertFunc_1; + cvta.global.u64 %rd174, %rd173; + st.param.b64 [param3], %rd174; + mov.b64 %rd175, assertFile_1; + cvta.global.u64 %rd176, %rd175; + st.param.b64 [param1], %rd176; + mov.b64 %rd177, assertMessage_1; + cvta.global.u64 %rd178, %rd177; + st.param.b64 [param0], %rd178; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_17: + .loc 1 62 64 // ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py:62:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd167, assertFunc_2; + cvta.global.u64 %rd168, %rd167; + st.param.b64 [param3], %rd168; + mov.b64 %rd169, assertFile_2; + cvta.global.u64 %rd170, %rd169; + st.param.b64 [param1], %rd170; + mov.b64 %rd171, assertMessage_2; + cvta.global.u64 %rd172, %rd171; + st.param.b64 [param0], %rd172; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 116 +.b8 52 +.b8 102 +.b8 103 +.b8 105 +.b8 55 +.b8 122 +.b8 52 +.b8 109 +.b8 55 +.b8 121 +.b8 110 +.b8 50 +.b8 113 +.b8 51 +.b8 102 +.b8 122 +.b8 105 +.b8 98 +.b8 107 +.b8 105 +.b8 55 +.b8 104 +.b8 99 +.b8 99 +.b8 110 +.b8 97 +.b8 97 +.b8 115 +.b8 99 +.b8 99 +.b8 113 +.b8 108 +.b8 114 +.b8 53 +.b8 98 +.b8 122 +.b8 102 +.b8 53 +.b8 113 +.b8 97 +.b8 108 +.b8 104 +.b8 116 +.b8 100 +.b8 114 +.b8 108 +.b8 107 +.b8 114 +.b8 122 +.b8 110 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 116 +.b8 52 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source new file mode 100644 index 0000000000000000000000000000000000000000..87c1327f132d6cc85fa38d113e02da61255c4d62 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.source @@ -0,0 +1,419 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":18:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("in_ptr3"(#loc)) +#loc113 = loc("out_ptr0"(#loc)) +#loc114 = loc("ks0"(#loc)) +#loc115 = loc("ks1"(#loc)) +#loc116 = loc("ks2"(#loc)) +#loc117 = loc("ks3"(#loc)) +#loc118 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc119) + %xoffset_0 = arith.constant 512 : i32 loc(#loc120) + %xoffset_1 = arith.constant 512 : i32 loc(#loc120) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc121) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc122) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc123) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc123) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc124) + %x0_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc124) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc124) + %x1 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc125) + %x1_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc125) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<512xi64> loc(#loc125) + %x1_10 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc126) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<512xi64> loc(#loc126) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc127) + %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc127) + %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc128) + %tmp31_14 = arith.extf %tmp31_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc129) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp1 = arith.constant 2 : i32 loc(#loc132) + %tmp1_17 = arith.constant 2 : i64 loc(#loc132) + %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132) + %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc133) + %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<512xi64> loc(#loc133) + %tmp3 = arith.constant 2 : i32 loc(#loc134) + %tmp3_20 = arith.constant 2 : i64 loc(#loc134) + %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134) + %tmp3_22 = arith.constant -1 : i32 loc(#loc135) + %tmp3_23 = arith.constant -1 : i64 loc(#loc135) + %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135) + %tmp3_25 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc136) + %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<512xi64> loc(#loc136) + %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<512xi64> loc(#loc136) + %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp3_29 = tt.addptr %tmp3_28, %tmp3_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp3_33 = arith.truncf %tmp3_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp3_35 = arith.extf %tmp3_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc141) + %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc141) + %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc142) + %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc143) + %tmp4_40 = arith.fptosi %tmp4_39 : tensor<512xf32> to tensor<512xi64> loc(#loc143) + %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc143) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc144) + %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp7 = arith.constant 0 : i32 loc(#loc146) + %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146) + %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<512xi64> loc(#loc146) + %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<512xi64> loc(#loc146) + %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc30) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc30) + %2 = arith.cmpi sle, %1, %tmp8 : tensor<512xi64> loc(#loc30) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc31) + %4 = arith.cmpi slt, %tmp8, %3 : tensor<512xi64> loc(#loc31) + %5 = arith.andi %2, %4 : tensor<512xi1> loc(#loc32) + %6 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc33) + %true = arith.constant true loc(#loc34) + %cst = arith.constant dense : tensor<512xi1> loc(#loc34) + %7 = arith.xori %6, %cst : tensor<512xi1> loc(#loc34) + %8 = arith.ori %5, %7 : tensor<512xi1> loc(#loc35) + tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc36) + %tmp10 = arith.constant 2 : i32 loc(#loc148) + %tmp10_45 = arith.constant 2 : i64 loc(#loc148) + %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148) + %tmp10_47 = arith.constant -1 : i32 loc(#loc149) + %tmp10_48 = arith.constant -1 : i64 loc(#loc149) + %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149) + %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<512xi64> loc(#loc150) + %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<512xi64> loc(#loc150) + %tmp10_52 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc151) + %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<512xi64> loc(#loc151) + %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<512xi64> loc(#loc152) + %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc153) + %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc153) + %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc154) + %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc155) + %tmp10_60 = arith.truncf %tmp10_59 : tensor<512xf32> to tensor<512xbf16> loc(#loc155) + %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc155) + %tmp10_62 = arith.extf %tmp10_61 : tensor<512xbf16> to tensor<512xf32> loc(#loc156) + %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<512xf32> loc(#loc157) + %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158) + %tmp12_64 = arith.subf %tmp12_63, %tmp11 : tensor<512xf32> loc(#loc158) + %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159) + %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc159) + %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<512xi1>, tensor<512xf32> loc(#loc160) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp16 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc162) + %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<512xi1>, tensor<512xf32> loc(#loc162) + %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc163) + %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<512xi64> loc(#loc163) + %tmp18 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc164) + %tmp18_68 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc164) + %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<512xi64> loc(#loc164) + %tmp18_70 = arith.constant 2 : i32 loc(#loc165) + %tmp18_71 = arith.constant 2 : i64 loc(#loc165) + %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165) + %tmp18_73 = arith.constant -1 : i32 loc(#loc166) + %tmp18_74 = arith.constant -1 : i64 loc(#loc166) + %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166) + %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<512xi64> loc(#loc167) + %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<512xi64> loc(#loc167) + %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc168) + %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc168) + %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc169) + %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170) + %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc170) + %tmp18_83 = arith.truncf %tmp18_82 : tensor<512xf32> to tensor<512xbf16> loc(#loc170) + %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc170) + %tmp18_85 = arith.extf %tmp18_84 : tensor<512xbf16> to tensor<512xf32> loc(#loc171) + %tmp19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc172) + %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc172) + %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc173) + %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc174) + %tmp19_90 = arith.fptosi %tmp19_89 : tensor<512xf32> to tensor<512xi64> loc(#loc174) + %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc174) + %tmp20 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc175) + %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<512xi64> loc(#loc176) + %tmp22 = arith.constant 0 : i32 loc(#loc177) + %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177) + %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<512xi64> loc(#loc177) + %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<512xi64> loc(#loc177) + %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<512xi1>, tensor<512xi64> loc(#loc178) + %c0_i32_95 = arith.constant 0 : i32 loc(#loc68) + %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68) + %10 = tt.splat %9 : i64 -> tensor<512xi64> loc(#loc68) + %11 = arith.cmpi sle, %10, %tmp23 : tensor<512xi64> loc(#loc68) + %12 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc69) + %13 = arith.cmpi slt, %tmp23, %12 : tensor<512xi64> loc(#loc69) + %14 = arith.andi %11, %13 : tensor<512xi1> loc(#loc70) + %15 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc71) + %true_96 = arith.constant true loc(#loc72) + %cst_97 = arith.constant dense : tensor<512xi1> loc(#loc72) + %16 = arith.xori %15, %cst_97 : tensor<512xi1> loc(#loc72) + %17 = arith.ori %14, %16 : tensor<512xi1> loc(#loc73) + tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc74) + %tmp25 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc179) + %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<512xi64> loc(#loc179) + %tmp25_99 = arith.constant 2 : i32 loc(#loc180) + %tmp25_100 = arith.constant 2 : i64 loc(#loc180) + %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180) + %tmp25_102 = arith.constant -1 : i32 loc(#loc181) + %tmp25_103 = arith.constant -1 : i64 loc(#loc181) + %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181) + %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<512xi64> loc(#loc182) + %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<512xi64> loc(#loc182) + %tmp25_107 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc183) + %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<512xi64> loc(#loc183) + %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<512xi64> loc(#loc184) + %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc185) + %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc185) + %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc186) + %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187) + %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc187) + %tmp25_115 = arith.truncf %tmp25_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc187) + %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc187) + %tmp25_117 = arith.extf %tmp25_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc188) + %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<512xf32> loc(#loc189) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc190) + %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<512xi1>, tensor<512xf32> loc(#loc191) + %tmp29 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc192) + %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<512xi1>, tensor<512xf32> loc(#loc192) + %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<512xf32> loc(#loc193) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc194) + %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<512xi64> loc(#loc194) + %tmp35 = arith.constant 0 : i32 loc(#loc195) + %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195) + %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<512xi64> loc(#loc195) + %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<512xi64> loc(#loc195) + %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<512xi1>, tensor<512xi64> loc(#loc196) + %c0_i32_124 = arith.constant 0 : i32 loc(#loc93) + %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93) + %19 = tt.splat %18 : i64 -> tensor<512xi64> loc(#loc93) + %20 = arith.cmpi sle, %19, %tmp36 : tensor<512xi64> loc(#loc93) + %21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc94) + %22 = arith.cmpi slt, %tmp36, %21 : tensor<512xi64> loc(#loc94) + %23 = arith.andi %20, %22 : tensor<512xi1> loc(#loc95) + %true_125 = arith.constant true loc(#loc96) + %cst_126 = arith.constant dense : tensor<512xi1> loc(#loc96) + %24 = arith.xori %xmask_5, %cst_126 : tensor<512xi1> loc(#loc96) + %25 = arith.ori %23, %24 : tensor<512xi1> loc(#loc97) + tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc98) + %tmp38 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc197) + %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<512xi64> loc(#loc197) + %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<512xi64> loc(#loc198) + %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc199) + %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc199) + %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc200) + %tmp38_132 = arith.extf %tmp38_131 : tensor<512xbf16> to tensor<512xf32> loc(#loc201) + %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<512xf32> loc(#loc202) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc203) + %26 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc106) + %27 = tt.addptr %26, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc106) + %28 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc107) + tt.store %27, %28, %xmask_5 : tensor<512x!tt.ptr> loc(#loc107) + tt.return loc(#loc108) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:76) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:36) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":28:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":29:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:42) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":32:32) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":33:18) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":34:18) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":35:32) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:98) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:115) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:108) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:123) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:49) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:54) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:72) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:123) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":38:19) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":39:13) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":40:38) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":41:34) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":42:12) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":43:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":44:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:37) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:55) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:31) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:68) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:60) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:119) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:31) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:44) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:36) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":47:33) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":48:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":49:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":50:35) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:28) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:100) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:65) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:118) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:110) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:108) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:126) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:55) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:48) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:42) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:64) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:60) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:31) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:80) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:72) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:131) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":53:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":54:38) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":55:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":56:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":57:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":59:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":60:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":61:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:28) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:46) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:56) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:54) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:64) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:36) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:31) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:48) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:88) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":64:20) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":65:20) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:4) +#loc119 = loc("xoffset"(#loc1)) +#loc120 = loc("xoffset"(#loc2)) +#loc121 = loc("xindex"(#loc3)) +#loc122 = loc("xindex"(#loc4)) +#loc123 = loc("xmask"(#loc5)) +#loc124 = loc("x0"(#loc6)) +#loc125 = loc("x1"(#loc7)) +#loc126 = loc("x1"(#loc8)) +#loc127 = loc("tmp31"(#loc9)) +#loc128 = loc("tmp31"(#loc10)) +#loc129 = loc("tmp31"(#loc11)) +#loc130 = loc("tmp32"(#loc12)) +#loc131 = loc("tmp32"(#loc13)) +#loc132 = loc("tmp1"(#loc14)) +#loc133 = loc("tmp2"(#loc15)) +#loc134 = loc("tmp3"(#loc16)) +#loc135 = loc("tmp3"(#loc17)) +#loc136 = loc("tmp3"(#loc18)) +#loc137 = loc("tmp3"(#loc19)) +#loc138 = loc("tmp3"(#loc20)) +#loc139 = loc("tmp3"(#loc21)) +#loc140 = loc("tmp3"(#loc22)) +#loc141 = loc("tmp4"(#loc23)) +#loc142 = loc("tmp4"(#loc24)) +#loc143 = loc("tmp4"(#loc25)) +#loc144 = loc("tmp5"(#loc26)) +#loc145 = loc("tmp6"(#loc27)) +#loc146 = loc("tmp7"(#loc28)) +#loc147 = loc("tmp8"(#loc29)) +#loc148 = loc("tmp10"(#loc37)) +#loc149 = loc("tmp10"(#loc38)) +#loc150 = loc("tmp10"(#loc39)) +#loc151 = loc("tmp10"(#loc40)) +#loc152 = loc("tmp10"(#loc41)) +#loc153 = loc("tmp10"(#loc42)) +#loc154 = loc("tmp10"(#loc43)) +#loc155 = loc("tmp10"(#loc44)) +#loc156 = loc("tmp10"(#loc45)) +#loc157 = loc("tmp11"(#loc46)) +#loc158 = loc("tmp12"(#loc47)) +#loc159 = loc("tmp13"(#loc48)) +#loc160 = loc("tmp14"(#loc49)) +#loc161 = loc("tmp15"(#loc50)) +#loc162 = loc("tmp16"(#loc51)) +#loc163 = loc("tmp17"(#loc52)) +#loc164 = loc("tmp18"(#loc53)) +#loc165 = loc("tmp18"(#loc54)) +#loc166 = loc("tmp18"(#loc55)) +#loc167 = loc("tmp18"(#loc56)) +#loc168 = loc("tmp18"(#loc57)) +#loc169 = loc("tmp18"(#loc58)) +#loc170 = loc("tmp18"(#loc59)) +#loc171 = loc("tmp18"(#loc60)) +#loc172 = loc("tmp19"(#loc61)) +#loc173 = loc("tmp19"(#loc62)) +#loc174 = loc("tmp19"(#loc63)) +#loc175 = loc("tmp20"(#loc64)) +#loc176 = loc("tmp21"(#loc65)) +#loc177 = loc("tmp22"(#loc66)) +#loc178 = loc("tmp23"(#loc67)) +#loc179 = loc("tmp25"(#loc75)) +#loc180 = loc("tmp25"(#loc76)) +#loc181 = loc("tmp25"(#loc77)) +#loc182 = loc("tmp25"(#loc78)) +#loc183 = loc("tmp25"(#loc79)) +#loc184 = loc("tmp25"(#loc80)) +#loc185 = loc("tmp25"(#loc81)) +#loc186 = loc("tmp25"(#loc82)) +#loc187 = loc("tmp25"(#loc83)) +#loc188 = loc("tmp25"(#loc84)) +#loc189 = loc("tmp26"(#loc85)) +#loc190 = loc("tmp27"(#loc86)) +#loc191 = loc("tmp28"(#loc87)) +#loc192 = loc("tmp29"(#loc88)) +#loc193 = loc("tmp30"(#loc89)) +#loc194 = loc("tmp34"(#loc90)) +#loc195 = loc("tmp35"(#loc91)) +#loc196 = loc("tmp36"(#loc92)) +#loc197 = loc("tmp38"(#loc99)) +#loc198 = loc("tmp38"(#loc100)) +#loc199 = loc("tmp38"(#loc101)) +#loc200 = loc("tmp38"(#loc102)) +#loc201 = loc("tmp38"(#loc103)) +#loc202 = loc("tmp39"(#loc104)) +#loc203 = loc("tmp40"(#loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0a50f5bbc00ad73f1a617589f53bc87992b541f1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<512xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<512xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<512xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<512xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<512xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<512xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c0ffcdf99bf71f1ee107de9a0dc88513b3cd7078 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/UQSFYICF6CFQWZOBHCGZ7JZ457GHWVO6RMPN5ABNWOATFMKI6GQA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<512xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<512xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1>, tensor<512xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<512xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<512xi1> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<512xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<512xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<512xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1>, tensor<512xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<512xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<512xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/t4/ct4fgi7z4m7yn2q3fzibki7hccnaasccqlr5bzf5qalhtdrlkrzn.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4372b6ffe0b6d54eede282586f90825ec9a8311e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2123cb5aea05af4a0c8d98e4c446b77c3dfd054b Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e61b1473136112cdb2c008c73e5ff893eb6238e4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"hash": "a86b11120b5411cbf0d8b8f8917a84c667a2fd685341aa47866d51603d9e328a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..17e5482f6812c289f543bafa79b2eab4f69f1cc2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.llir @@ -0,0 +1,450 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_1(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = icmp slt i32 %9, %4, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 511, !dbg !9 + %13 = zext nneg i32 %9 to i64, !dbg !10 + %.frozen = freeze i64 %2, !dbg !11 + %14 = sdiv i64 %13, %.frozen, !dbg !11 + %15 = mul i64 %14, %.frozen, !dbg !10 + %.decomposed = sub i64 %13, %15, !dbg !10 + %16 = mul i64 %14, %3, !dbg !12 + %.idx = mul nuw nsw i64 %.decomposed, 128000 + %17 = getelementptr i8, ptr addrspace(1) %0, i64 %.idx + %invariant.gep = getelementptr float, ptr addrspace(1) %17, i64 %16, !dbg !13 + %18 = zext nneg i32 %12 to i64, !dbg !13 + %19 = insertelement <2 x i1> poison, i1 %10, i64 0, !dbg !14 + %20 = shufflevector <2 x i1> %19, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !15 + br label %21, !dbg !13 + +21: ; preds = %8, %21 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %21 ] + %22 = phi i32 [ 2147483647, %8 ], [ %106, %21 ] + %23 = phi i32 [ 2147483647, %8 ], [ %107, %21 ] + %24 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %104, %21 ] + %25 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %108, %21 ] + %26 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %105, %21 ] + %27 = or disjoint i64 %indvars.iv, %18, !dbg !16 + %28 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !16 + %29 = or disjoint i32 %11, %28, !dbg !16 + %30 = or i32 %29, 512, !dbg !16 + %31 = or disjoint i64 %27, 1024, !dbg !16 + %32 = or i32 %29, 1536, !dbg !16 + %33 = icmp samesign ult i32 %30, 32000, !dbg !17 + %34 = icmp samesign ult i64 %31, 32000, !dbg !17 + %35 = icmp samesign ult i32 %32, 32000, !dbg !17 + %36 = zext nneg i32 %30 to i64, !dbg !18 + %37 = zext nneg i32 %32 to i64, !dbg !18 + %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %27, !dbg !19 + %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19 + %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !19 + %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19 + %38 = and i1 %10, %33, !dbg !15 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %40 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep, i64 %39, i1 %10) #4, !dbg !20 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %42 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep3, i64 %41, i1 %38) #4, !dbg !20 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %44 = fcmp uno <2 x float> %24, zeroinitializer, !dbg !21 + %45 = fcmp uno <2 x float> %26, zeroinitializer, !dbg !21 + %46 = sext i32 %22 to i64, !dbg !25 + %47 = icmp sgt i64 %27, %46, !dbg !25 + %48 = icmp slt i32 %23, %30, !dbg !25 + %49 = extractelement <2 x i32> %25, i64 0, !dbg !25 + %50 = sext i32 %49 to i64, !dbg !25 + %51 = icmp sgt i64 %31, %50, !dbg !25 + %52 = extractelement <2 x i32> %25, i64 1, !dbg !25 + %53 = icmp slt i32 %52, %32, !dbg !25 + %54 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !20 + %55 = insertelement <2 x i32> %54, i32 %42, i64 1, !dbg !20 + %56 = bitcast <2 x i32> %55 to <2 x float>, !dbg !20 + %57 = fcmp ogt <2 x float> %24, %56, !dbg !26 + %58 = fcmp oeq <2 x float> %24, %56, !dbg !27 + %59 = fcmp uno <2 x float> %56, zeroinitializer, !dbg !28 + %60 = xor <2 x i1> %59, splat (i1 true), !dbg !29 + %61 = and <2 x i1> %44, %60, !dbg !30 + %62 = or <2 x i1> %57, %61, !dbg !31 + %63 = and <2 x i1> %44, %59, !dbg !32 + %64 = or <2 x i1> %58, %63, !dbg !33 + %65 = insertelement <2 x i1> poison, i1 %47, i64 0, !dbg !34 + %66 = insertelement <2 x i1> %65, i1 %48, i64 1, !dbg !34 + %67 = and <2 x i1> %66, %64, !dbg !34 + %68 = or <2 x i1> %62, %67, !dbg !35 + %69 = select <2 x i1> %68, <2 x float> %24, <2 x float> %56, !dbg !36 + %70 = trunc nuw nsw i64 %27 to i32, !dbg !37 + %71 = extractelement <2 x i1> %68, i64 0, !dbg !37 + %72 = select i1 %71, i32 %22, i32 %70, !dbg !37 + %73 = extractelement <2 x i1> %68, i64 1, !dbg !37 + %74 = select i1 %73, i32 %23, i32 %30, !dbg !37 + %75 = trunc nuw nsw i64 %31 to i32, !dbg !37 + %76 = insertelement <2 x i1> poison, i1 %34, i64 0, !dbg !15 + %77 = insertelement <2 x i1> %76, i1 %35, i64 1, !dbg !15 + %78 = and <2 x i1> %20, %77, !dbg !15 + %79 = extractelement <2 x i1> %78, i64 0, !dbg !20 + %80 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep5, i64 %43, i1 %79) #4, !dbg !20 + %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !20 + %82 = extractelement <2 x i1> %78, i64 1, !dbg !20 + %83 = tail call i32 asm sideeffect "mov.u32 $0, $1;\0A\09@$4 ld.global.L1::evict_first.L2::cache_hint.b32 { $0 }, [ $2 + 0 ], $3;", "=r,r,l,l,b"(i32 0, ptr addrspace(1) %gep7, i64 %81, i1 %82) #4, !dbg !20 + %84 = insertelement <2 x i32> poison, i32 %80, i64 0, !dbg !20 + %85 = insertelement <2 x i32> %84, i32 %83, i64 1, !dbg !20 + %86 = bitcast <2 x i32> %85 to <2 x float>, !dbg !20 + %87 = fcmp ogt <2 x float> %26, %86, !dbg !26 + %88 = fcmp oeq <2 x float> %26, %86, !dbg !27 + %89 = fcmp uno <2 x float> %86, zeroinitializer, !dbg !28 + %90 = xor <2 x i1> %89, splat (i1 true), !dbg !29 + %91 = and <2 x i1> %45, %90, !dbg !30 + %92 = or <2 x i1> %87, %91, !dbg !31 + %93 = and <2 x i1> %45, %89, !dbg !32 + %94 = or <2 x i1> %88, %93, !dbg !33 + %95 = insertelement <2 x i1> poison, i1 %51, i64 0, !dbg !34 + %96 = insertelement <2 x i1> %95, i1 %53, i64 1, !dbg !34 + %97 = and <2 x i1> %96, %94, !dbg !34 + %98 = or <2 x i1> %92, %97, !dbg !35 + %99 = select <2 x i1> %98, <2 x float> %26, <2 x float> %86, !dbg !36 + %100 = insertelement <2 x i32> poison, i32 %75, i64 0, !dbg !37 + %101 = insertelement <2 x i32> %100, i32 %32, i64 1, !dbg !37 + %102 = select <2 x i1> %98, <2 x i32> %25, <2 x i32> %101, !dbg !37 + %103 = insertelement <2 x i1> %19, i1 %38, i64 1, !dbg !14 + %104 = select <2 x i1> %103, <2 x float> %69, <2 x float> %24, !dbg !14 + %105 = select <2 x i1> %78, <2 x float> %99, <2 x float> %26, !dbg !14 + %106 = select i1 %10, i32 %72, i32 %22, !dbg !38 + %107 = select i1 %38, i32 %74, i32 %23, !dbg !38 + %108 = select <2 x i1> %78, <2 x i32> %102, <2 x i32> %25, !dbg !38 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %109 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !13 + br i1 %109, label %21, label %110, !dbg !13 + +110: ; preds = %21 + %111 = and i32 %11, 31, !dbg !9 + %112 = lshr i32 %11, 5, !dbg !9 + %113 = shufflevector <2 x float> %104, <2 x float> poison, <2 x i32> , !dbg !39 + %114 = fcmp ogt <2 x float> %104, %113, !dbg !39 + %115 = fcmp oeq <2 x float> %104, %113, !dbg !39 + %116 = shufflevector <2 x i1> %114, <2 x i1> %115, <2 x i32> , !dbg !39 + %117 = extractelement <2 x float> %104, i64 0, !dbg !41 + %118 = fcmp uno float %117, 0.000000e+00, !dbg !41 + %119 = extractelement <2 x float> %104, i64 1, !dbg !42 + %120 = fcmp uno float %119, 0.000000e+00, !dbg !42 + %121 = xor i1 %120, true, !dbg !43 + %122 = insertelement <2 x i1> poison, i1 %118, i64 0, !dbg !44 + %123 = shufflevector <2 x i1> %122, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %124 = insertelement <2 x i1> poison, i1 %121, i64 0, !dbg !44 + %125 = insertelement <2 x i1> %124, i1 %120, i64 1, !dbg !44 + %126 = and <2 x i1> %123, %125, !dbg !44 + %127 = or <2 x i1> %116, %126, !dbg !45 + %128 = icmp slt i32 %106, %107, !dbg !46 + %129 = extractelement <2 x i1> %127, i64 1, !dbg !47 + %130 = and i1 %128, %129, !dbg !47 + %131 = extractelement <2 x i1> %127, i64 0, !dbg !48 + %132 = or i1 %131, %130, !dbg !48 + %133 = select i1 %132, float %117, float %119, !dbg !49 + %134 = select i1 %132, i32 %106, i32 %107, !dbg !50 + %135 = extractelement <2 x float> %105, i64 0, !dbg !39 + %136 = fcmp ogt float %133, %135, !dbg !39 + %137 = fcmp oeq float %133, %135, !dbg !51 + %138 = fcmp uno float %133, 0.000000e+00, !dbg !41 + %139 = fcmp uno float %135, 0.000000e+00, !dbg !42 + %140 = xor i1 %139, true, !dbg !43 + %141 = and i1 %138, %140, !dbg !44 + %142 = or i1 %136, %141, !dbg !45 + %143 = and i1 %139, %138, !dbg !52 + %144 = or i1 %137, %143, !dbg !53 + %145 = extractelement <2 x i32> %108, i64 0, !dbg !46 + %146 = icmp slt i32 %134, %145, !dbg !46 + %147 = and i1 %146, %144, !dbg !47 + %148 = or i1 %142, %147, !dbg !48 + %149 = select i1 %148, float %133, float %135, !dbg !49 + %150 = select i1 %148, i32 %134, i32 %145, !dbg !50 + %151 = extractelement <2 x float> %105, i64 1, !dbg !39 + %152 = fcmp ogt float %149, %151, !dbg !39 + %153 = fcmp oeq float %149, %151, !dbg !51 + %154 = fcmp uno float %149, 0.000000e+00, !dbg !41 + %155 = fcmp uno float %151, 0.000000e+00, !dbg !42 + %156 = xor i1 %155, true, !dbg !43 + %157 = and i1 %154, %156, !dbg !44 + %158 = or i1 %152, %157, !dbg !45 + %159 = and i1 %155, %154, !dbg !52 + %160 = or i1 %153, %159, !dbg !53 + %161 = extractelement <2 x i32> %108, i64 1, !dbg !46 + %162 = icmp slt i32 %150, %161, !dbg !46 + %163 = and i1 %162, %160, !dbg !47 + %164 = or i1 %158, %163, !dbg !48 + %165 = select i1 %164, float %149, float %151, !dbg !49 + %166 = select i1 %164, i32 %150, i32 %161, !dbg !50 + %167 = bitcast float %165 to i32, !dbg !54 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 16, i32 31), !dbg !54 + %169 = bitcast i32 %168 to float, !dbg !54 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %166, i32 16, i32 31), !dbg !54 + %171 = fcmp ogt float %165, %169, !dbg !39 + %172 = fcmp oeq float %165, %169, !dbg !51 + %173 = fcmp uno float %165, 0.000000e+00, !dbg !41 + %174 = fcmp uno float %169, 0.000000e+00, !dbg !42 + %175 = xor i1 %174, true, !dbg !43 + %176 = and i1 %173, %175, !dbg !44 + %177 = or i1 %171, %176, !dbg !45 + %178 = and i1 %173, %174, !dbg !52 + %179 = or i1 %172, %178, !dbg !53 + %180 = icmp slt i32 %166, %170, !dbg !46 + %181 = and i1 %180, %179, !dbg !47 + %182 = or i1 %177, %181, !dbg !48 + %183 = select i1 %182, float %165, float %169, !dbg !49 + %184 = select i1 %182, i32 %166, i32 %170, !dbg !50 + %185 = bitcast float %183 to i32, !dbg !54 + %186 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !54 + %187 = bitcast i32 %186 to float, !dbg !54 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 8, i32 31), !dbg !54 + %189 = fcmp ogt float %183, %187, !dbg !39 + %190 = fcmp oeq float %183, %187, !dbg !51 + %191 = fcmp uno float %183, 0.000000e+00, !dbg !41 + %192 = fcmp uno float %187, 0.000000e+00, !dbg !42 + %193 = xor i1 %192, true, !dbg !43 + %194 = and i1 %191, %193, !dbg !44 + %195 = or i1 %189, %194, !dbg !45 + %196 = and i1 %192, %191, !dbg !52 + %197 = or i1 %190, %196, !dbg !53 + %198 = icmp slt i32 %184, %188, !dbg !46 + %199 = and i1 %198, %197, !dbg !47 + %200 = or i1 %195, %199, !dbg !48 + %201 = select i1 %200, float %183, float %187, !dbg !49 + %202 = select i1 %200, i32 %184, i32 %188, !dbg !50 + %203 = bitcast float %201 to i32, !dbg !54 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !54 + %205 = bitcast i32 %204 to float, !dbg !54 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 4, i32 31), !dbg !54 + %207 = fcmp ogt float %201, %205, !dbg !39 + %208 = fcmp oeq float %201, %205, !dbg !51 + %209 = fcmp uno float %201, 0.000000e+00, !dbg !41 + %210 = fcmp uno float %205, 0.000000e+00, !dbg !42 + %211 = xor i1 %210, true, !dbg !43 + %212 = and i1 %209, %211, !dbg !44 + %213 = or i1 %207, %212, !dbg !45 + %214 = and i1 %210, %209, !dbg !52 + %215 = or i1 %208, %214, !dbg !53 + %216 = icmp slt i32 %202, %206, !dbg !46 + %217 = and i1 %216, %215, !dbg !47 + %218 = or i1 %213, %217, !dbg !48 + %219 = select i1 %218, float %201, float %205, !dbg !49 + %220 = select i1 %218, i32 %202, i32 %206, !dbg !50 + %221 = bitcast float %219 to i32, !dbg !54 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !54 + %223 = bitcast i32 %222 to float, !dbg !54 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 2, i32 31), !dbg !54 + %225 = fcmp ogt float %219, %223, !dbg !39 + %226 = fcmp oeq float %219, %223, !dbg !51 + %227 = fcmp uno float %219, 0.000000e+00, !dbg !41 + %228 = fcmp uno float %223, 0.000000e+00, !dbg !42 + %229 = xor i1 %228, true, !dbg !43 + %230 = and i1 %227, %229, !dbg !44 + %231 = or i1 %225, %230, !dbg !45 + %232 = and i1 %228, %227, !dbg !52 + %233 = or i1 %226, %232, !dbg !53 + %234 = icmp slt i32 %220, %224, !dbg !46 + %235 = and i1 %234, %233, !dbg !47 + %236 = or i1 %231, %235, !dbg !48 + %237 = select i1 %236, float %219, float %223, !dbg !49 + %238 = select i1 %236, i32 %220, i32 %224, !dbg !50 + %239 = bitcast float %237 to i32, !dbg !54 + %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !54 + %241 = bitcast i32 %240 to float, !dbg !54 + %242 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %238, i32 1, i32 31), !dbg !54 + %243 = fcmp ogt float %237, %241, !dbg !39 + %244 = fcmp oeq float %237, %241, !dbg !51 + %245 = fcmp uno float %237, 0.000000e+00, !dbg !41 + %246 = fcmp uno float %241, 0.000000e+00, !dbg !42 + %247 = xor i1 %246, true, !dbg !43 + %248 = and i1 %245, %247, !dbg !44 + %249 = or i1 %243, %248, !dbg !45 + %250 = and i1 %246, %245, !dbg !52 + %251 = or i1 %244, %250, !dbg !53 + %252 = icmp slt i32 %238, %242, !dbg !46 + %253 = and i1 %252, %251, !dbg !47 + %254 = or i1 %249, %253, !dbg !48 + %255 = select i1 %254, i32 %238, i32 %242, !dbg !50 + %256 = and i32 %112, 15, !dbg !54 + %257 = icmp eq i32 %111, 0, !dbg !54 + %258 = getelementptr float, ptr addrspace(3) @global_smem, i32 %256, !dbg !54 + %259 = select i1 %254, i32 %239, i32 %240, !dbg !49 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %258, <1 x i32> %260, i1 %257) #4, !dbg !54 + %261 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %256, !dbg !54 + %262 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, <1 x i32> %262, i1 %257) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %263 = icmp samesign ult i32 %11, 16, !dbg !54 + %264 = getelementptr float, ptr addrspace(3) @global_smem, i32 %11, !dbg !54 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %264, i1 %263) #4, !dbg !54 + %266 = bitcast i32 %265 to float, !dbg !54 + %267 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %11, !dbg !54 + %268 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %267, i1 %263) #4, !dbg !54 + %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !54 + %270 = bitcast i32 %269 to float, !dbg !54 + %271 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 8, i32 31), !dbg !54 + %272 = fcmp ogt float %266, %270, !dbg !39 + %273 = fcmp oeq float %266, %270, !dbg !51 + %274 = fcmp uno float %266, 0.000000e+00, !dbg !41 + %275 = fcmp uno float %270, 0.000000e+00, !dbg !42 + %276 = xor i1 %275, true, !dbg !43 + %277 = and i1 %274, %276, !dbg !44 + %278 = or i1 %272, %277, !dbg !45 + %279 = and i1 %274, %275, !dbg !52 + %280 = or i1 %273, %279, !dbg !53 + %281 = icmp slt i32 %268, %271, !dbg !46 + %282 = and i1 %281, %280, !dbg !47 + %283 = or i1 %278, %282, !dbg !48 + %284 = select i1 %283, float %266, float %270, !dbg !49 + %285 = select i1 %283, i32 %268, i32 %271, !dbg !50 + %286 = bitcast float %284 to i32, !dbg !54 + %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 4, i32 31), !dbg !54 + %288 = bitcast i32 %287 to float, !dbg !54 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !54 + %290 = fcmp ogt float %284, %288, !dbg !39 + %291 = fcmp oeq float %284, %288, !dbg !51 + %292 = fcmp uno float %284, 0.000000e+00, !dbg !41 + %293 = fcmp uno float %288, 0.000000e+00, !dbg !42 + %294 = xor i1 %293, true, !dbg !43 + %295 = and i1 %292, %294, !dbg !44 + %296 = or i1 %290, %295, !dbg !45 + %297 = and i1 %293, %292, !dbg !52 + %298 = or i1 %291, %297, !dbg !53 + %299 = icmp slt i32 %285, %289, !dbg !46 + %300 = and i1 %299, %298, !dbg !47 + %301 = or i1 %296, %300, !dbg !48 + %302 = select i1 %301, float %284, float %288, !dbg !49 + %303 = select i1 %301, i32 %285, i32 %289, !dbg !50 + %304 = bitcast float %302 to i32, !dbg !54 + %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !54 + %306 = bitcast i32 %305 to float, !dbg !54 + %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !54 + %308 = fcmp ogt float %302, %306, !dbg !39 + %309 = fcmp oeq float %302, %306, !dbg !51 + %310 = fcmp uno float %302, 0.000000e+00, !dbg !41 + %311 = fcmp uno float %306, 0.000000e+00, !dbg !42 + %312 = xor i1 %311, true, !dbg !43 + %313 = and i1 %310, %312, !dbg !44 + %314 = or i1 %308, %313, !dbg !45 + %315 = and i1 %311, %310, !dbg !52 + %316 = or i1 %309, %315, !dbg !53 + %317 = icmp slt i32 %303, %307, !dbg !46 + %318 = and i1 %317, %316, !dbg !47 + %319 = or i1 %314, %318, !dbg !48 + %320 = select i1 %319, float %302, float %306, !dbg !49 + %321 = select i1 %319, i32 %303, i32 %307, !dbg !50 + %322 = bitcast float %320 to i32, !dbg !54 + %323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %322, i32 1, i32 31), !dbg !54 + %324 = bitcast i32 %323 to float, !dbg !54 + %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 1, i32 31), !dbg !54 + %326 = fcmp ogt float %320, %324, !dbg !39 + %327 = fcmp oeq float %320, %324, !dbg !51 + %328 = fcmp uno float %320, 0.000000e+00, !dbg !41 + %329 = fcmp uno float %324, 0.000000e+00, !dbg !42 + %330 = xor i1 %329, true, !dbg !43 + %331 = and i1 %328, %330, !dbg !44 + %332 = or i1 %326, %331, !dbg !45 + %333 = and i1 %329, %328, !dbg !52 + %334 = or i1 %327, %333, !dbg !53 + %335 = icmp slt i32 %321, %325, !dbg !46 + %336 = and i1 %335, %334, !dbg !47 + %337 = or i1 %332, %336, !dbg !48 + %338 = select i1 %337, i32 %321, i32 %325, !dbg !50 + %339 = icmp eq i32 %11, 0, !dbg !54 + %340 = select i1 %337, i32 %322, i32 %323, !dbg !49 + %341 = insertelement <1 x i32> poison, i32 %340, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %264, <1 x i32> %341, i1 %339) #4, !dbg !54 + %342 = insertelement <1 x i32> poison, i32 %338, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %267, <1 x i32> %342, i1 %339) #4, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %343 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !54 + %344 = getelementptr i64, ptr addrspace(1) %1, i64 %13, !dbg !55 + %345 = sext i32 %343 to i64, !dbg !56 + %346 = icmp eq i32 %12, 0, !dbg !56 + %347 = and i1 %346, %10, !dbg !56 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %345, ptr addrspace(1) %344, i1 %347) #4, !dbg !56 + ret void, !dbg !57 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_1", linkageName: "triton_red_fused_argmax_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 27, column: 19, scope: !4) +!11 = !DILocation(line: 28, column: 19, scope: !4) +!12 = !DILocation(line: 38, column: 56, scope: !4) +!13 = !DILocation(line: 32, column: 40, scope: !4) +!14 = !DILocation(line: 43, column: 54, scope: !4) +!15 = !DILocation(line: 38, column: 71, scope: !4) +!16 = !DILocation(line: 33, column: 31, scope: !4) +!17 = !DILocation(line: 34, column: 29, scope: !4) +!18 = !DILocation(line: 38, column: 41, scope: !4) +!19 = !DILocation(line: 38, column: 34, scope: !4) +!20 = !DILocation(line: 38, column: 61, scope: !4) +!21 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !24) +!22 = distinct !DILexicalBlockFile(scope: !4, file: !23, discriminator: 0) +!23 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!24 = !DILocation(line: 41, column: 38, scope: !4) +!25 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !24) +!26 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !24) +!27 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !24) +!28 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !24) +!29 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !24) +!30 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !24) +!31 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !24) +!32 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !24) +!33 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !24) +!34 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !24) +!35 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !24) +!36 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !24) +!37 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !24) +!38 = !DILocation(line: 44, column: 66, scope: !4) +!39 = !DILocation(line: 144, column: 21, scope: !22, inlinedAt: !40) +!40 = !DILocation(line: 45, column: 75, scope: !4) +!41 = !DILocation(line: 147, column: 29, scope: !22, inlinedAt: !40) +!42 = !DILocation(line: 148, column: 29, scope: !22, inlinedAt: !40) +!43 = !DILocation(line: 149, column: 31, scope: !22, inlinedAt: !40) +!44 = !DILocation(line: 149, column: 27, scope: !22, inlinedAt: !40) +!45 = !DILocation(line: 149, column: 16, scope: !22, inlinedAt: !40) +!46 = !DILocation(line: 154, column: 31, scope: !22, inlinedAt: !40) +!47 = !DILocation(line: 154, column: 21, scope: !22, inlinedAt: !40) +!48 = !DILocation(line: 154, column: 12, scope: !22, inlinedAt: !40) +!49 = !DILocation(line: 155, column: 35, scope: !22, inlinedAt: !40) +!50 = !DILocation(line: 155, column: 69, scope: !22, inlinedAt: !40) +!51 = !DILocation(line: 145, column: 23, scope: !22, inlinedAt: !40) +!52 = !DILocation(line: 151, column: 27, scope: !22, inlinedAt: !40) +!53 = !DILocation(line: 151, column: 17, scope: !22, inlinedAt: !40) +!54 = !DILocation(line: 165, column: 42, scope: !22, inlinedAt: !40) +!55 = !DILocation(line: 47, column: 25, scope: !4) +!56 = !DILocation(line: 47, column: 36, scope: !4) +!57 = !DILocation(line: 47, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..7c72bdde81959f79a1724cca9cc82707a177dfc4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ptx @@ -0,0 +1,915 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_1 // -- Begin function triton_red_fused_argmax_1 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_1 +.visible .entry triton_red_fused_argmax_1( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_1, + .param .u64 triton_red_fused_argmax_1_param_2, + .param .u64 triton_red_fused_argmax_1_param_3, + .param .u32 triton_red_fused_argmax_1_param_4, + .param .u32 triton_red_fused_argmax_1_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_1_param_7 +) +.reqntid 512 +{ + .reg .pred %p<215>; + .reg .b32 %r<123>; + .reg .b64 %rd<70>; + .loc 1 18 0 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused_argmax_1_param_4]; + ld.param.b64 %rd20, [triton_red_fused_argmax_1_param_3]; + ld.param.b64 %rd18, [triton_red_fused_argmax_1_param_0]; +$L__tmp0: + .loc 1 22 28 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:22:28 + mov.u32 %r11, %ctaid.x; + ld.param.b64 %rd21, [triton_red_fused_argmax_1_param_2]; + .loc 1 25 37 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:25:37 + mov.u32 %r12, %tid.x; + and.b32 %r1, %r12, 511; + .loc 1 27 19 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:27:19 + cvt.u64.u32 %rd2, %r11; + .loc 1 28 19 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:28:19 + and.b64 %rd22, %rd21, -4294967296; + setp.ne.b64 %p5, %rd22, 0; + cvt.u32.u64 %r118, %rd2; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd65, %rd2, %rd21; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd21; + div.u32 %r15, %r118, %r13; + cvt.u64.u32 %rd65, %r15; +$L__BB0_3: + .loc 1 0 19 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:0:19 + ld.param.b64 %rd19, [triton_red_fused_argmax_1_param_1]; + cvt.u64.u32 %rd1, %r12; + .loc 1 24 21 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:24:21 + setp.lt.s32 %p3, %r118, %r10; + .loc 1 27 19 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:27:19 + mul.lo.s64 %rd25, %rd65, %rd21; + sub.s64 %rd26, %rd2, %rd25; + .loc 1 38 56 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:56 + mul.lo.s64 %rd27, %rd65, %rd20; + mad.lo.s64 %rd28, %rd26, 128000, %rd18; + .loc 1 32 40 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:32:40 + shl.b64 %rd29, %rd27, 2; + add.s64 %rd7, %rd28, %rd29; + cvt.u64.u32 %rd8, %r1; + shl.b64 %rd30, %rd20, 2; + mul.lo.s64 %rd31, %rd21, 128000; + sub.s64 %rd32, %rd30, %rd31; + mul.lo.s64 %rd33, %rd65, %rd32; + mad.lo.s64 %rd34, %rd2, 128000, %rd33; + mad.wide.u32 %rd35, %r1, 4, %rd34; + add.s64 %rd66, %rd18, %rd35; + mov.b32 %r20, 0fFF800000; + mov.b64 %rd68, {%r20, %r20}; + mov.b32 %r119, 2147483647; + mov.b64 %rd67, -2048; + mov.b32 %r120, %r119; + mov.b32 %r121, %r119; + mov.b32 %r122, %r119; + mov.b64 %rd69, %rd68; +$L__BB0_4: // =>This Inner Loop Header: Depth=1 + .loc 1 33 31 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:33:31 + add.s64 %rd48, %rd8, %rd67; + add.s64 %rd49, %rd48, 2048; + add.s64 %rd50, %rd1, %rd67; + cvt.u32.u64 %r30, %rd50; + add.s32 %r31, %r30, 2048; + or.b32 %r32, %r31, 512; + add.s64 %rd51, %rd48, 3072; + or.b32 %r33, %r31, 1536; + .loc 1 34 29 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:34:29 + setp.lt.u32 %p10, %r32, 32000; + setp.lt.u64 %p11, %rd51, 32000; + setp.lt.u32 %p12, %r33, 32000; + .loc 1 38 34 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:34 + mad.wide.u32 %rd40, %r32, 4, %rd7; + add.s64 %rd43, %rd66, 4096; + mad.wide.u32 %rd46, %r33, 4, %rd7; + .loc 1 38 71 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:71 + and.pred %p7, %p3, %p10; + .loc 1 38 61 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:61 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + mov.b32 %r22, 0; + // begin inline asm + mov.u32 %r21, %r22; + @%p3 ld.global.L1::evict_first.L2::cache_hint.b32 { %r21 }, [ %rd66 + 0 ], %rd36; + // end inline asm + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r23, %r22; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b32 { %r23 }, [ %rd40 + 0 ], %rd39; + // end inline asm + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm +$L__tmp1: + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + mov.b64 {%r34, %r35}, %rd68; + setp.nan.f32 %p13, %r34, %r34; + setp.nan.f32 %p14, %r35, %r35; + mov.b64 {%r36, %r37}, %rd69; + setp.nan.f32 %p15, %r36, %r36; + setp.nan.f32 %p16, %r37, %r37; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + cvt.s64.s32 %rd52, %r119; + setp.gt.s64 %p17, %rd49, %rd52; + setp.lt.s32 %p18, %r120, %r32; + cvt.s64.s32 %rd53, %r121; + setp.gt.s64 %p19, %rd51, %rd53; + setp.lt.s32 %p20, %r122, %r33; +$L__tmp2: + .loc 1 38 61 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:61 + cvt.u64.u32 %rd54, %r23; + shl.b64 %rd55, %rd54, 32; + cvt.u64.u32 %rd56, %r21; + or.b64 %rd57, %rd56, %rd55; +$L__tmp3: + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + mov.b64 {%r38, %r39}, %rd57; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.gt.f32 %p21, %r35, %r39; + setp.gt.f32 %p22, %r34, %r38; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.eq.f32 %p23, %r34, %r38; + setp.eq.f32 %p24, %r35, %r39; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.nan.f32 %p25, %r39, %r39; + setp.nan.f32 %p26, %r38, %r38; + setp.num.f32 %p27, %r38, %r38; + setp.num.f32 %p28, %r39, %r39; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p29, %p14, %p28; + and.pred %p30, %p13, %p27; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p31, %p22, %p30; + or.pred %p32, %p21, %p29; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p33, %p13, %p26; + and.pred %p34, %p14, %p25; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p35, %p24, %p34; + or.pred %p36, %p23, %p33; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p37, %p17, %p36; + and.pred %p38, %p18, %p35; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p39, %p32, %p38; + or.pred %p40, %p31, %p37; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + selp.f32 %r40, %r34, %r38, %p40; + selp.f32 %r41, %r35, %r39, %p39; + cvt.u32.u64 %r42, %rd49; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + selp.b32 %r43, %r119, %r42, %p40; + selp.b32 %r44, %r120, %r32, %p39; +$L__tmp4: + .loc 1 38 71 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:71 + and.pred %p9, %p3, %p12; + and.pred %p8, %p3, %p11; + .loc 1 38 61 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:38:61 + // begin inline asm + mov.u32 %r25, %r22; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b32 { %r25 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r27, %r22; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b32 { %r27 }, [ %rd46 + 0 ], %rd45; + // end inline asm + cvt.u64.u32 %rd58, %r27; + shl.b64 %rd59, %rd58, 32; + cvt.u64.u32 %rd60, %r25; + or.b64 %rd61, %rd60, %rd59; +$L__tmp5: + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + mov.b64 {%r45, %r46}, %rd61; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.gt.f32 %p41, %r37, %r46; + setp.gt.f32 %p42, %r36, %r45; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.eq.f32 %p43, %r36, %r45; + setp.eq.f32 %p44, %r37, %r46; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + setp.nan.f32 %p45, %r46, %r46; + setp.nan.f32 %p46, %r45, %r45; + setp.num.f32 %p47, %r45, %r45; + setp.num.f32 %p48, %r46, %r46; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p49, %p16, %p48; + and.pred %p50, %p15, %p47; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p51, %p42, %p50; + or.pred %p52, %p41, %p49; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p53, %p15, %p46; + and.pred %p54, %p16, %p45; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p55, %p44, %p54; + or.pred %p56, %p43, %p53; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + and.pred %p57, %p19, %p56; + and.pred %p58, %p20, %p55; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + or.pred %p59, %p52, %p58; + or.pred %p60, %p51, %p57; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + selp.f32 %r47, %r36, %r45, %p60; + selp.f32 %r48, %r37, %r46, %p59; + cvt.u32.u64 %r49, %rd51; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:41:38 ] + selp.b32 %r50, %r121, %r49, %p60; + selp.b32 %r51, %r122, %r33, %p59; +$L__tmp6: + .loc 1 43 54 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:43:54 + selp.f32 %r52, %r41, %r35, %p7; + selp.f32 %r53, %r40, %r34, %p3; + mov.b64 %rd68, {%r53, %r52}; + selp.f32 %r54, %r48, %r37, %p9; + selp.f32 %r55, %r47, %r36, %p8; + mov.b64 %rd69, {%r55, %r54}; + .loc 1 44 66 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:44:66 + selp.b32 %r119, %r43, %r119, %p3; + selp.b32 %r120, %r44, %r120, %p7; + selp.b32 %r122, %r51, %r122, %p9; + selp.b32 %r121, %r50, %r121, %p8; + .loc 1 32 40 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:32:40 + add.s64 %rd67, %rd67, 2048; + add.s64 %rd66, %rd66, 8192; + setp.lt.u64 %p61, %rd67, 29952; + @%p61 bra $L__BB0_4; +// %bb.5: + .loc 1 0 40 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:0:40 + cvt.u32.u64 %r68, %rd1; + .loc 1 25 37 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:25:37 + and.b32 %r70, %r68, 31; +$L__tmp7: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + mov.b64 {%r71, %r72}, %rd68; + setp.gt.f32 %p70, %r71, %r72; + setp.eq.f32 %p71, %r72, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p72, %r71, %r71; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.num.f32 %p73, %r72, %r72; + setp.nan.f32 %p74, %r72, %r72; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p72, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p71, %p75; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p79, %r119, %r120; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p80, %p79, %p78; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p81, %p77, %p80; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r73, %r71, %r72, %p81; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r74, %r119, %r120, %p81; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + mov.b64 {%r75, %r76}, %rd69; + setp.gt.f32 %p82, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p83, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p84, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p85, %r75, %r75; + setp.num.f32 %p86, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p87, %p84, %p86; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p88, %p82, %p87; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p89, %p85, %p84; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p90, %p83, %p89; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p91, %r74, %r121; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p92, %p91, %p90; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p93, %p88, %p92; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r77, %r73, %r75, %p93; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r78, %r74, %r121, %p93; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p94, %r77, %r76; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p95, %r77, %r76; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p96, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p97, %r76, %r76; + setp.num.f32 %p98, %r76, %r76; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p99, %p96, %p98; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p100, %p94, %p99; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p101, %p97, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p102, %p95, %p101; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p103, %r78, %r122; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p104, %p103, %p102; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p105, %p100, %p104; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r79, %r77, %r76, %p105; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r80, %r78, %r122, %p105; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r81, %r79, 16, 31, -1; + shfl.sync.bfly.b32 %r82, %r80, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p106, %r79, %r81; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p107, %r79, %r81; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p108, %r79, %r79; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p109, %r81, %r81; + setp.num.f32 %p110, %r81, %r81; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p111, %p108, %p110; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p112, %p106, %p111; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p113, %p108, %p109; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p114, %p107, %p113; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p115, %r80, %r82; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p116, %p115, %p114; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p117, %p112, %p116; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r83, %r79, %r81, %p117; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r84, %r80, %r82, %p117; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r85, %r83, 8, 31, -1; + shfl.sync.bfly.b32 %r86, %r84, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p118, %r83, %r85; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p119, %r83, %r85; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p120, %r83, %r83; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p121, %r85, %r85; + setp.num.f32 %p122, %r85, %r85; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p123, %p120, %p122; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p124, %p118, %p123; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p125, %p121, %p120; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p126, %p119, %p125; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p127, %r84, %r86; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p128, %p127, %p126; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p129, %p124, %p128; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r87, %r83, %r85, %p129; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r88, %r84, %r86, %p129; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r89, %r87, 4, 31, -1; + shfl.sync.bfly.b32 %r90, %r88, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p130, %r87, %r89; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p131, %r87, %r89; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p132, %r87, %r87; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p133, %r89, %r89; + setp.num.f32 %p134, %r89, %r89; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p135, %p132, %p134; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p136, %p130, %p135; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p137, %p133, %p132; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p138, %p131, %p137; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p139, %r88, %r90; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p140, %p139, %p138; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p141, %p136, %p140; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r91, %r87, %r89, %p141; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r92, %r88, %r90, %p141; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r93, %r91, 2, 31, -1; + shfl.sync.bfly.b32 %r94, %r92, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p142, %r91, %r93; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p143, %r91, %r93; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p144, %r91, %r91; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p145, %r93, %r93; + setp.num.f32 %p146, %r93, %r93; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p147, %p144, %p146; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p148, %p142, %p147; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p149, %p145, %p144; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p150, %p143, %p149; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p151, %r92, %r94; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p152, %p151, %p150; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p153, %p148, %p152; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r95, %r91, %r93, %p153; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r96, %r92, %r94, %p153; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r97, %r95, 1, 31, -1; + shfl.sync.bfly.b32 %r98, %r96, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p154, %r95, %r97; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p155, %r95, %r97; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p156, %r95, %r95; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p157, %r97, %r97; + setp.num.f32 %p158, %r97, %r97; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p159, %p156, %p158; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p160, %p154, %p159; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p161, %p157, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p162, %p155, %p161; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p163, %r96, %r98; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p164, %p163, %p162; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p165, %p160, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r59, %r96, %r98, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.b32 %p62, %r70, 0; + shr.u32 %r99, %r68, 3; + and.b32 %r100, %r99, 60; + mov.b32 %r101, global_smem; + add.s32 %r56, %r101, %r100; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r57, %r95, %r97, %p165; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + // begin inline asm + @%p62 st.shared.b32 [ %r56 + 0 ], %r57; + // end inline asm + add.s32 %r102, %r101, 64; + add.s32 %r58, %r102, %r100; + // begin inline asm + @%p62 st.shared.b32 [ %r58 + 0 ], %r59; + // end inline asm + bar.sync 0; + setp.lt.u32 %p64, %r68, 16; + shl.b32 %r103, %r68, 2; + add.s32 %r61, %r101, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r60, [ %r61 + 0 ]; + // end inline asm + add.s32 %r63, %r102, %r103; + // begin inline asm + @%p64 ld.shared.b32 %r62, [ %r63 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r104, %r60, 8, 31, -1; + shfl.sync.bfly.b32 %r105, %r62, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p166, %r60, %r104; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p167, %r60, %r104; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p168, %r60, %r60; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p169, %r104, %r104; + setp.num.f32 %p170, %r104, %r104; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p171, %p168, %p170; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p172, %p166, %p171; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p173, %p168, %p169; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p174, %p167, %p173; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p175, %r62, %r105; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p176, %p175, %p174; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p177, %p172, %p176; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r106, %r60, %r104, %p177; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r107, %r62, %r105, %p177; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r108, %r106, 4, 31, -1; + shfl.sync.bfly.b32 %r109, %r107, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p178, %r106, %r108; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p179, %r106, %r108; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p180, %r106, %r106; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p181, %r108, %r108; + setp.num.f32 %p182, %r108, %r108; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p183, %p180, %p182; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p184, %p178, %p183; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p185, %p181, %p180; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p186, %p179, %p185; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p187, %r107, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p188, %p187, %p186; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p189, %p184, %p188; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r110, %r106, %r108, %p189; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r111, %r107, %r109, %p189; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r112, %r110, 2, 31, -1; + shfl.sync.bfly.b32 %r113, %r111, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p190, %r110, %r112; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p191, %r110, %r112; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p192, %r110, %r110; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p193, %r112, %r112; + setp.num.f32 %p194, %r112, %r112; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p195, %p192, %p194; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p196, %p190, %p195; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p197, %p193, %p192; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p198, %p191, %p197; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p199, %r111, %r113; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p200, %p199, %p198; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p201, %p196, %p200; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.f32 %r114, %r110, %r112, %p201; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r115, %r111, %r113, %p201; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + shfl.sync.bfly.b32 %r116, %r114, 1, 31, -1; + shfl.sync.bfly.b32 %r117, %r115, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.gt.f32 %p202, %r114, %r116; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.f32 %p203, %r114, %r116; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p204, %r114, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.nan.f32 %p205, %r116, %r116; + setp.num.f32 %p206, %r116, %r116; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p207, %p204, %p206; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p208, %p202, %p207; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p209, %p205, %p204; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p210, %p203, %p209; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.lt.s32 %p211, %r115, %r117; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + and.pred %p212, %p211, %p210; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + or.pred %p213, %p208, %p212; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r67, %r115, %r117, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + setp.eq.b32 %p66, %r68, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + selp.b32 %r65, %r114, %r116, %p213; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:45:75 ] + // begin inline asm + @%p66 st.shared.b32 [ %r61 + 0 ], %r65; + // end inline asm + // begin inline asm + @%p66 st.shared.b32 [ %r63 + 0 ], %r67; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd62, [global_smem+64]; +$L__tmp8: + .loc 1 47 25 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:47:25 + shl.b64 %rd64, %rd2, 3; + add.s64 %rd63, %rd19, %rd64; + .loc 1 47 36 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:47:36 + setp.eq.b32 %p214, %r1, 0; + and.pred %p68, %p214, %p3; + // begin inline asm + @%p68 st.global.b64 [ %rd63 + 0 ], { %rd62 }; + // end inline asm + .loc 1 47 4 // c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py:47:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 55 +.b8 115 +.b8 118 +.b8 97 +.b8 101 +.b8 111 +.b8 55 +.b8 52 +.b8 97 +.b8 101 +.b8 108 +.b8 52 +.b8 111 +.b8 120 +.b8 113 +.b8 118 +.b8 101 +.b8 117 +.b8 100 +.b8 102 +.b8 118 +.b8 104 +.b8 120 +.b8 52 +.b8 120 +.b8 102 +.b8 109 +.b8 117 +.b8 51 +.b8 105 +.b8 107 +.b8 114 +.b8 109 +.b8 118 +.b8 108 +.b8 106 +.b8 99 +.b8 110 +.b8 105 +.b8 120 +.b8 98 +.b8 52 +.b8 107 +.b8 105 +.b8 113 +.b8 97 +.b8 103 +.b8 114 +.b8 122 +.b8 110 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 55 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 49 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..742b77ac93e1ccc460f9a207247ed16d77c8b0bb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.source @@ -0,0 +1,317 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":18:0) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc47 = loc(unknown) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc72 = loc("in_ptr0"(#loc)) +#loc73 = loc("out_ptr0"(#loc)) +#loc74 = loc("ks0"(#loc)) +#loc75 = loc("ks1"(#loc)) +#loc76 = loc("xnumel"(#loc)) +#loc77 = loc("r0_numel"(#loc)) +#loc106 = loc("a_value"(#loc35)) +#loc107 = loc("a_index"(#loc35)) +#loc108 = loc("b_value"(#loc35)) +#loc109 = loc("b_index"(#loc35)) +#loc122 = loc("x"(#loc55)) +#loc123 = loc("x"(#loc59)) +#loc124 = loc("value"(#loc68)) +#loc125 = loc("index"(#loc68)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc78) + %xoffset = tt.get_program_id x : i32 loc(#loc79) + %xoffset_1 = arith.constant 1 : i32 loc(#loc80) + %xoffset_2 = arith.constant 1 : i32 loc(#loc80) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc80) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc81) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc82) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc83) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc83) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc84) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc84) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc85) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc86) + %x0 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc87) + %x0_9 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc87) + %x0_10 = arith.remsi %x0, %x0_9 : tensor<1x1xi64> loc(#loc87) + %x1 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc88) + %x1_11 = tt.splat %ks0 : i64 -> tensor<1x1xi64> loc(#loc88) + %x1_12 = arith.divsi %x1, %x1_11 : tensor<1x1xi64> loc(#loc88) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc89) + %_tmp2_13 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc89) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc90) + %_tmp2_index_14 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc90) + %c0_i32 = arith.constant 0 : i32 loc(#loc14) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc14) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc14) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc14) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc14) + %3 = ub.poison : i32 loc(#loc14) + %_tmp2_index_15:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_16 = %_tmp2_13, %_tmp2_index_17 = %_tmp2_index_14) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc92) + %r0_index_18 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc92) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc93) + %r0_mask_19 = arith.cmpi slt, %r0_index_18, %r0_mask : tensor<1x2048xi32> loc(#loc93) + %tmp0 = arith.constant 32000 : i32 loc(#loc94) + %tmp0_20 = arith.constant 32000 : i64 loc(#loc94) + %tmp0_21 = arith.constant dense<32000> : tensor<1x1xi64> loc(#loc94) + %tmp0_22 = arith.muli %tmp0_21, %x0_10 : tensor<1x1xi64> loc(#loc94) + %tmp0_23 = arith.extsi %r0_index_18 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc95) + %tmp0_24 = tt.broadcast %tmp0_22 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc95) + %tmp0_25 = arith.addi %tmp0_23, %tmp0_24 : tensor<1x2048xi64> loc(#loc95) + %tmp0_26 = tt.splat %ks1 : i64 -> tensor<1x1xi64> loc(#loc96) + %tmp0_27 = arith.muli %tmp0_26, %x1_12 : tensor<1x1xi64> loc(#loc96) + %tmp0_28 = tt.broadcast %tmp0_27 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc97) + %tmp0_29 = arith.addi %tmp0_25, %tmp0_28 : tensor<1x2048xi64> loc(#loc97) + %tmp0_30 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp0_31 = tt.addptr %tmp0_30, %tmp0_29 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc98) + %tmp0_32 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc99) + %tmp0_33 = arith.andi %r0_mask_19, %tmp0_32 : tensor<1x2048xi1> loc(#loc99) + %tmp0_34 = arith.constant 0.000000e+00 : f32 loc(#loc100) + %tmp0_35 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc100) + %tmp0_36 = tt.load %tmp0_31, %tmp0_33, %tmp0_35 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc100) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_16, %_tmp2_index_17, %tmp0_36, %r0_index_18) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc101) + %_tmp2_38 = arith.andi %r0_mask_19, %_tmp2_37 : tensor<1x2048xi1> loc(#loc101) + %_tmp2_39 = arith.select %_tmp2_38, %8#0, %_tmp2_16 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc102) + %_tmp2_index_40 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_41 = arith.andi %r0_mask_19, %_tmp2_index_40 : tensor<1x2048xi1> loc(#loc103) + %_tmp2_index_42 = arith.select %_tmp2_index_41, %8#1, %_tmp2_index_17 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc104) + scf.yield %_tmp2_39, %_tmp2_index_42 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc29) + } loc(#loc126) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_15#0, %_tmp2_index_15#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc30) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc105) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc32) + %6 = tt.addptr %5, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc32) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc33) + tt.store %6, %7, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc33) + tt.return loc(#loc34) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc35)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc35)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc35)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc35))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc114) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc114) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc115) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc129) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc117) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc130) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc130) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc52) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc54) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc54) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc59))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc61) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc61) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc61) + tt.return %4 : tensor<1x2048xf32> loc(#loc62) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc63) + tt.return %5 : tensor<1x2048xf32> loc(#loc63) + } loc(#loc59) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc65) + %cst = arith.constant dense : tensor<1xi1> loc(#loc65) + tt.return %cst : tensor<1xi1> loc(#loc66) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc67) + tt.return %0 : tensor<1xi1> loc(#loc67) + } loc(#loc64) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc68)), %index: tensor<1x2048xi32> loc("index"(#loc68))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc69) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc69) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc69) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc70) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc71) + %2 = ub.poison : tensor<1xi32> loc(#loc71) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc71) + } loc(#loc68) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc35)), %a_index: i32 loc("a_index"(#loc35)), %b_value: f32 loc("b_value"(#loc35)), %b_index: i32 loc("b_index"(#loc35))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc127) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc128) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc38) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc112) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc113) + %mask_3 = arith.constant true loc(#loc114) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc114) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc115) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc129) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc117) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc130) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc130) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc47) + } loc(#loc39) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc119) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc120) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc121) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc51) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc52) + tt.return %2, %3 : f32, i32 loc(#loc53) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc54) + %5 = ub.poison : i32 loc(#loc54) + tt.return %4, %5 : f32, i32 loc(#loc54) + } loc(#loc35) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc55))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc56) + %true = arith.constant true loc(#loc57) + tt.return %true : i1 loc(#loc57) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc58) + tt.return %1 : i1 loc(#loc58) + } loc(#loc55) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc59))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc60) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc61) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc61) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc61) + tt.return %3 : tensor<1xf32> loc(#loc62) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc63) + tt.return %4 : tensor<1xf32> loc(#loc63) + } loc(#loc59) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":29:55) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":30:58) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":32:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":33:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":34:29) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:47) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:41) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:56) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:52) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:71) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:61) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":41:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":43:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":43:54) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:41) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:66) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:8) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":45:75) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":46:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:25) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:36) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:4) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc78 = loc("r0_numel"(#loc1)) +#loc79 = loc("xoffset"(#loc2)) +#loc80 = loc("xoffset"(#loc3)) +#loc81 = loc("xindex"(#loc4)) +#loc82 = loc("xindex"(#loc5)) +#loc83 = loc("xindex"(#loc6)) +#loc84 = loc("xmask"(#loc7)) +#loc85 = loc("r0_base"(#loc8)) +#loc86 = loc("r0_base"(#loc9)) +#loc87 = loc("x0"(#loc10)) +#loc88 = loc("x1"(#loc11)) +#loc89 = loc("_tmp2"(#loc12)) +#loc90 = loc("_tmp2_index"(#loc13)) +#loc91 = loc("_tmp2"(#loc14)) +#loc92 = loc("r0_index"(#loc15)) +#loc93 = loc("r0_mask"(#loc16)) +#loc94 = loc("tmp0"(#loc17)) +#loc95 = loc("tmp0"(#loc18)) +#loc96 = loc("tmp0"(#loc19)) +#loc97 = loc("tmp0"(#loc20)) +#loc98 = loc("tmp0"(#loc21)) +#loc99 = loc("tmp0"(#loc22)) +#loc100 = loc("tmp0"(#loc23)) +#loc101 = loc("_tmp2"(#loc25)) +#loc102 = loc("_tmp2"(#loc26)) +#loc103 = loc("_tmp2_index"(#loc27)) +#loc104 = loc("_tmp2_index"(#loc28)) +#loc105 = loc("tmp2"(#loc31)) +#loc110 = loc("mask"(#loc36)) +#loc111 = loc("equal"(#loc37)) +#loc112 = loc("a_isnan"(#loc40)) +#loc113 = loc("b_isnan"(#loc41)) +#loc114 = loc("mask"(#loc42)) +#loc115 = loc("mask"(#loc43)) +#loc116 = loc("mask"(#loc44)) +#loc117 = loc("equal"(#loc45)) +#loc118 = loc("equal"(#loc46)) +#loc119 = loc("mask"(#loc48)) +#loc120 = loc("mask"(#loc49)) +#loc121 = loc("mask"(#loc50)) +#loc126 = loc("_tmp2_index"(#loc91)) +#loc127 = loc("mask"(#loc110)) +#loc128 = loc("equal"(#loc111)) +#loc129 = loc("mask"(#loc116)) +#loc130 = loc("equal"(#loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ded9f0857734af6bd9a7b7d278b539ed21d7e6b7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttgir @@ -0,0 +1,198 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":18:0) +#loc1 = loc(unknown) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":45:75) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("out_ptr0"(#loc)) +#loc43 = loc("ks0"(#loc)) +#loc44 = loc("ks1"(#loc)) +#loc45 = loc("xnumel"(#loc)) +#loc46 = loc("r0_numel"(#loc)) +#loc79 = loc(callsite(#loc1 at #loc36)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %cst_3 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %true = arith.constant true loc(#loc1) + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc48) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc49) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc49) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc50) + %x0_5 = arith.remsi %x0, %ks0 : i64 loc(#loc50) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc51) + %tmp0 = arith.muli %x0_5, %c32000_i64 : i64 loc(#loc52) + %tmp0_6 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc81) + %tmp0_7 = arith.muli %ks1, %x1 : i64 loc(#loc54) + %tmp0_8 = tt.splat %tmp0_7 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc82) + %tmp0_9 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc56) + %tmp0_10 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc83) + %_tmp2_index:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst, %_tmp2_index_11 = %cst_0) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc59) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_1 : tensor<1x2048xi32, #blocked> loc(#loc60) + %tmp0_13 = arith.extsi %r0_index_12 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_14 = arith.addi %tmp0_13, %tmp0_6 : tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp0_15 = arith.addi %tmp0_14, %tmp0_8 : tensor<1x2048xi64, #blocked> loc(#loc55) + %tmp0_16 = tt.addptr %tmp0_9, %tmp0_15 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc56) + %tmp0_17 = arith.andi %r0_mask, %tmp0_10 : tensor<1x2048xi1, #blocked> loc(#loc57) + %tmp0_18 = tt.load %tmp0_16, %tmp0_17, %cst_2 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc61) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc107) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc108) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc87) + %b_isnan = arith.cmpf une, %tmp0_18, %tmp0_18 : tensor<1x2048xf32, #blocked> loc(#loc88) + %mask_19 = arith.xori %b_isnan, %cst_3 : tensor<1x2048xi1, #blocked> loc(#loc89) + %mask_20 = arith.andi %a_isnan, %mask_19 : tensor<1x2048xi1, #blocked> loc(#loc90) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x2048xi1, #blocked> loc(#loc109) + %equal_22 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc92) + %equal_23 = arith.ori %equal, %equal_22 : tensor<1x2048xi1, #blocked> loc(#loc110) + %mask_24 = arith.cmpi slt, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi32, #blocked> loc(#loc94) + %mask_25 = arith.andi %equal_23, %mask_24 : tensor<1x2048xi1, #blocked> loc(#loc95) + %mask_26 = arith.ori %mask_21, %mask_25 : tensor<1x2048xi1, #blocked> loc(#loc96) + %6 = arith.select %mask_26, %_tmp2, %tmp0_18 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc74) + %7 = arith.select %mask_26, %_tmp2_index_11, %r0_index_12 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc75) + %_tmp2_27 = arith.select %tmp0_17, %6, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc76) + %_tmp2_index_28 = arith.select %tmp0_17, %7, %_tmp2_index_11 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc77) + scf.yield %_tmp2_27, %_tmp2_index_28 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc34) + } loc(#loc84) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc36)), %arg7: i32 loc(callsite(#loc1 at #loc36)), %arg8: f32 loc(callsite(#loc1 at #loc36)), %arg9: i32 loc(callsite(#loc1 at #loc36))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc111) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc112) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc97) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc98) + %mask_11 = arith.xori %b_isnan, %true : i1 loc(#loc99) + %mask_12 = arith.andi %a_isnan, %mask_11 : i1 loc(#loc100) + %mask_13 = arith.ori %mask, %mask_12 : i1 loc(#loc113) + %equal_14 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc101) + %equal_15 = arith.ori %equal, %equal_14 : i1 loc(#loc114) + %mask_16 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc102) + %mask_17 = arith.andi %equal_15, %mask_16 : i1 loc(#loc103) + %mask_18 = arith.ori %mask_13, %mask_17 : i1 loc(#loc104) + %6 = arith.select %mask_18, %arg6, %arg8 : f32 loc(#loc105) + %7 = arith.select %mask_18, %arg7, %arg9 : i32 loc(#loc106) + tt.reduce.return %6, %7 : f32, i32 loc(#loc78) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc78) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc80) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc38) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc39) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc39) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + %5 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc39) + tt.store %4, %3, %5 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":27:19) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":28:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:47) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:56) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:71) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":32:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":33:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":34:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:61) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":41:38) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":43:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:66) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:8) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":46:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:4) +#loc47 = loc("xoffset"(#loc2)) +#loc48 = loc("xmask"(#loc3)) +#loc49 = loc("r0_base"(#loc4)) +#loc50 = loc("x0"(#loc5)) +#loc51 = loc("x1"(#loc6)) +#loc52 = loc("tmp0"(#loc7)) +#loc53 = loc("tmp0"(#loc8)) +#loc54 = loc("tmp0"(#loc9)) +#loc55 = loc("tmp0"(#loc10)) +#loc56 = loc("tmp0"(#loc11)) +#loc57 = loc("tmp0"(#loc12)) +#loc58 = loc("_tmp2"(#loc13)) +#loc59 = loc("r0_index"(#loc14)) +#loc60 = loc("r0_mask"(#loc15)) +#loc61 = loc("tmp0"(#loc16)) +#loc62 = loc("mask"(#loc17)) +#loc63 = loc("equal"(#loc19)) +#loc64 = loc("a_isnan"(#loc20)) +#loc65 = loc("b_isnan"(#loc21)) +#loc66 = loc("mask"(#loc22)) +#loc67 = loc("mask"(#loc23)) +#loc68 = loc("mask"(#loc24)) +#loc69 = loc("equal"(#loc25)) +#loc70 = loc("equal"(#loc26)) +#loc71 = loc("mask"(#loc27)) +#loc72 = loc("mask"(#loc28)) +#loc73 = loc("mask"(#loc29)) +#loc74 = loc(callsite(#loc30 at #loc18)) +#loc75 = loc(callsite(#loc31 at #loc18)) +#loc76 = loc("_tmp2"(#loc32)) +#loc77 = loc("_tmp2_index"(#loc33)) +#loc78 = loc(callsite(#loc35 at #loc36)) +#loc80 = loc("tmp2"(#loc37)) +#loc81 = loc(fused[#loc53, #loc52]) +#loc82 = loc(fused[#loc55, #loc54]) +#loc83 = loc(fused[#loc57, #loc48]) +#loc84 = loc("_tmp2_index"(#loc58)) +#loc85 = loc("mask"(#loc62)) +#loc86 = loc("equal"(#loc63)) +#loc87 = loc(callsite(#loc64 at #loc18)) +#loc88 = loc(callsite(#loc65 at #loc18)) +#loc89 = loc(callsite(#loc66 at #loc18)) +#loc90 = loc(callsite(#loc67 at #loc18)) +#loc91 = loc("mask"(#loc68)) +#loc92 = loc(callsite(#loc69 at #loc18)) +#loc93 = loc("equal"(#loc70)) +#loc94 = loc(callsite(#loc71 at #loc18)) +#loc95 = loc(callsite(#loc72 at #loc18)) +#loc96 = loc(callsite(#loc73 at #loc18)) +#loc97 = loc(callsite(#loc64 at #loc78)) +#loc98 = loc(callsite(#loc65 at #loc78)) +#loc99 = loc(callsite(#loc66 at #loc78)) +#loc100 = loc(callsite(#loc67 at #loc78)) +#loc101 = loc(callsite(#loc69 at #loc78)) +#loc102 = loc(callsite(#loc71 at #loc78)) +#loc103 = loc(callsite(#loc72 at #loc78)) +#loc104 = loc(callsite(#loc73 at #loc78)) +#loc105 = loc(callsite(#loc30 at #loc78)) +#loc106 = loc(callsite(#loc31 at #loc78)) +#loc107 = loc(callsite(#loc85 at #loc18)) +#loc108 = loc(callsite(#loc86 at #loc18)) +#loc109 = loc(callsite(#loc91 at #loc18)) +#loc110 = loc(callsite(#loc93 at #loc18)) +#loc111 = loc(callsite(#loc85 at #loc78)) +#loc112 = loc(callsite(#loc86 at #loc78)) +#loc113 = loc(callsite(#loc91 at #loc78)) +#loc114 = loc(callsite(#loc93 at #loc78)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..2319a6166390da214dba234b39256d010e851f48 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/VBVRCEQLKQI4X4GYXD4JC6UEYZT2F7LIKNA2UR4GNVIWAPM6GKFA/triton_red_fused_argmax_1.ttir @@ -0,0 +1,201 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":45:75) +#loc44 = loc("in_ptr0"(#loc)) +#loc45 = loc("out_ptr0"(#loc)) +#loc46 = loc("ks0"(#loc)) +#loc47 = loc("ks1"(#loc)) +#loc48 = loc("xnumel"(#loc)) +#loc49 = loc("r0_numel"(#loc)) +#loc50 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i64 = arith.constant 32000 : i64 loc(#loc1) + %true = arith.constant true loc(#loc50) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc1) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc51) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc52) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc54) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc54) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc55) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc56) + %x0 = arith.extsi %xoffset : i32 to i64 loc(#loc57) + %x0_4 = arith.remsi %x0, %ks0 : i64 loc(#loc57) + %x1 = arith.divsi %x0, %ks0 : i64 loc(#loc58) + %_tmp2_index_5:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_6 = %_tmp2, %_tmp2_index_7 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc60) + %r0_index_8 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc60) + %r0_mask = arith.cmpi slt, %r0_index_8, %cst_1 : tensor<1x2048xi32> loc(#loc61) + %tmp0 = arith.muli %x0_4, %c32000_i64 : i64 loc(#loc62) + %tmp0_9 = arith.extsi %r0_index_8 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc63) + %tmp0_10 = tt.splat %tmp0 : i64 -> tensor<1x2048xi64> loc(#loc88) + %tmp0_11 = arith.addi %tmp0_9, %tmp0_10 : tensor<1x2048xi64> loc(#loc63) + %tmp0_12 = arith.muli %ks1, %x1 : i64 loc(#loc64) + %tmp0_13 = tt.splat %tmp0_12 : i64 -> tensor<1x2048xi64> loc(#loc89) + %tmp0_14 = arith.addi %tmp0_11, %tmp0_13 : tensor<1x2048xi64> loc(#loc65) + %tmp0_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc66) + %tmp0_16 = tt.addptr %tmp0_15, %tmp0_14 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc66) + %tmp0_17 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc90) + %tmp0_18 = arith.andi %r0_mask, %tmp0_17 : tensor<1x2048xi1> loc(#loc67) + %tmp0_19 = tt.load %tmp0_16, %tmp0_18, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc68) + %mask = arith.cmpf ogt, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc113) + %equal = arith.cmpf oeq, %_tmp2_6, %tmp0_19 : tensor<1x2048xf32> loc(#loc114) + %a_isnan = arith.cmpf une, %_tmp2_6, %_tmp2_6 : tensor<1x2048xf32> loc(#loc93) + %b_isnan = arith.cmpf une, %tmp0_19, %tmp0_19 : tensor<1x2048xf32> loc(#loc94) + %mask_20 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc95) + %mask_21 = arith.andi %a_isnan, %mask_20 : tensor<1x2048xi1> loc(#loc96) + %mask_22 = arith.ori %mask, %mask_21 : tensor<1x2048xi1> loc(#loc115) + %equal_23 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc98) + %equal_24 = arith.ori %equal, %equal_23 : tensor<1x2048xi1> loc(#loc116) + %mask_25 = arith.cmpi slt, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi32> loc(#loc100) + %mask_26 = arith.andi %equal_24, %mask_25 : tensor<1x2048xi1> loc(#loc101) + %mask_27 = arith.ori %mask_22, %mask_26 : tensor<1x2048xi1> loc(#loc102) + %4 = arith.select %mask_27, %_tmp2_6, %tmp0_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc81) + %5 = arith.select %mask_27, %_tmp2_index_7, %r0_index_8 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc82) + %_tmp2_28 = arith.select %tmp0_18, %4, %_tmp2_6 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc83) + %_tmp2_index_29 = arith.select %tmp0_18, %5, %_tmp2_index_7 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc84) + scf.yield %_tmp2_28, %_tmp2_index_29 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc38) + } loc(#loc87) + %0:2 = "tt.reduce"(%_tmp2_index_5#0, %_tmp2_index_5#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc117) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc118) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc103) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc104) + %mask_6 = arith.xori %b_isnan, %true : i1 loc(#loc105) + %mask_7 = arith.andi %a_isnan, %mask_6 : i1 loc(#loc106) + %mask_8 = arith.ori %mask, %mask_7 : i1 loc(#loc119) + %equal_9 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc107) + %equal_10 = arith.ori %equal, %equal_9 : i1 loc(#loc120) + %mask_11 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc108) + %mask_12 = arith.andi %equal_10, %mask_11 : i1 loc(#loc109) + %mask_13 = arith.ori %mask_8, %mask_12 : i1 loc(#loc110) + %4 = arith.select %mask_13, %arg6, %arg8 : f32 loc(#loc111) + %5 = arith.select %mask_13, %arg7, %arg9 : i32 loc(#loc112) + tt.reduce.return %4, %5 : f32, i32 loc(#loc85) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc85) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc86) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc41) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc41) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc42) + tt.store %2, %3, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":32:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":30:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":29:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":27:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":33:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":34:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:56) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:52) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:71) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":38:61) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":41:38) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":43:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:66) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":44:8) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":46:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/57/c57svaeo74ael4oxqveudfvhx4xfmu3ikrmvljcnixb4kiqagrzn.py":47:4) +#loc51 = loc("_tmp2_index"(#loc4)) +#loc52 = loc("_tmp2"(#loc5)) +#loc53 = loc("xoffset"(#loc6)) +#loc54 = loc("xmask"(#loc7)) +#loc55 = loc("r0_base"(#loc8)) +#loc56 = loc("r0_base"(#loc9)) +#loc57 = loc("x0"(#loc10)) +#loc58 = loc("x1"(#loc11)) +#loc59 = loc("_tmp2"(#loc3)) +#loc60 = loc("r0_index"(#loc12)) +#loc61 = loc("r0_mask"(#loc13)) +#loc62 = loc("tmp0"(#loc14)) +#loc63 = loc("tmp0"(#loc15)) +#loc64 = loc("tmp0"(#loc16)) +#loc65 = loc("tmp0"(#loc17)) +#loc66 = loc("tmp0"(#loc18)) +#loc67 = loc("tmp0"(#loc19)) +#loc68 = loc("tmp0"(#loc20)) +#loc69 = loc("mask"(#loc21)) +#loc70 = loc("equal"(#loc23)) +#loc71 = loc("a_isnan"(#loc24)) +#loc72 = loc("b_isnan"(#loc25)) +#loc73 = loc("mask"(#loc26)) +#loc74 = loc("mask"(#loc27)) +#loc75 = loc("mask"(#loc28)) +#loc76 = loc("equal"(#loc29)) +#loc77 = loc("equal"(#loc30)) +#loc78 = loc("mask"(#loc31)) +#loc79 = loc("mask"(#loc32)) +#loc80 = loc("mask"(#loc33)) +#loc81 = loc(callsite(#loc34 at #loc22)) +#loc82 = loc(callsite(#loc35 at #loc22)) +#loc83 = loc("_tmp2"(#loc36)) +#loc84 = loc("_tmp2_index"(#loc37)) +#loc85 = loc(callsite(#loc39 at #loc2)) +#loc86 = loc("tmp2"(#loc40)) +#loc87 = loc("_tmp2_index"(#loc59)) +#loc88 = loc(fused[#loc63, #loc62]) +#loc89 = loc(fused[#loc65, #loc64]) +#loc90 = loc(fused[#loc67, #loc54]) +#loc91 = loc("mask"(#loc69)) +#loc92 = loc("equal"(#loc70)) +#loc93 = loc(callsite(#loc71 at #loc22)) +#loc94 = loc(callsite(#loc72 at #loc22)) +#loc95 = loc(callsite(#loc73 at #loc22)) +#loc96 = loc(callsite(#loc74 at #loc22)) +#loc97 = loc("mask"(#loc75)) +#loc98 = loc(callsite(#loc76 at #loc22)) +#loc99 = loc("equal"(#loc77)) +#loc100 = loc(callsite(#loc78 at #loc22)) +#loc101 = loc(callsite(#loc79 at #loc22)) +#loc102 = loc(callsite(#loc80 at #loc22)) +#loc103 = loc(callsite(#loc71 at #loc85)) +#loc104 = loc(callsite(#loc72 at #loc85)) +#loc105 = loc(callsite(#loc73 at #loc85)) +#loc106 = loc(callsite(#loc74 at #loc85)) +#loc107 = loc(callsite(#loc76 at #loc85)) +#loc108 = loc(callsite(#loc78 at #loc85)) +#loc109 = loc(callsite(#loc79 at #loc85)) +#loc110 = loc(callsite(#loc80 at #loc85)) +#loc111 = loc(callsite(#loc34 at #loc85)) +#loc112 = loc(callsite(#loc35 at #loc85)) +#loc113 = loc(callsite(#loc91 at #loc22)) +#loc114 = loc(callsite(#loc92 at #loc22)) +#loc115 = loc(callsite(#loc97 at #loc22)) +#loc116 = loc(callsite(#loc99 at #loc22)) +#loc117 = loc(callsite(#loc91 at #loc85)) +#loc118 = loc(callsite(#loc92 at #loc85)) +#loc119 = loc(callsite(#loc97 at #loc85)) +#loc120 = loc(callsite(#loc99 at #loc85)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/VXP5IRMC7IFU5CJ6XMHME5S7WHSQUMTBQ3HYORXV76EIEXNVJXCQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so b/SpecForge-ext/cache/compiled_kernels/triton/6/VXP5IRMC7IFU5CJ6XMHME5S7WHSQUMTBQ3HYORXV76EIEXNVJXCQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..b3f97b307eda2f69baeedb4d215897f7b17ceaf7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/VXP5IRMC7IFU5CJ6XMHME5S7WHSQUMTBQ3HYORXV76EIEXNVJXCQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WKIJ5MF6FWKHWKI7JEZEJXBLBMIVQTOQQ662UO54ZTETPY26LZPQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so b/SpecForge-ext/cache/compiled_kernels/triton/6/WKIJ5MF6FWKHWKI7JEZEJXBLBMIVQTOQQ662UO54ZTETPY26LZPQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..b8fb2616ede5107df20265ce5b37fe268ff70214 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/WKIJ5MF6FWKHWKI7JEZEJXBLBMIVQTOQQ662UO54ZTETPY26LZPQ/__triton_launcher.cpython-311-x86_64-linux-gnu.so differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0fef063149d80d331ccda7ad2d5f9437bb710989 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_eq_mul_squeeze_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.source", "triton_red_fused_eq_mul_squeeze_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttir", "triton_red_fused_eq_mul_squeeze_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttgir", "triton_red_fused_eq_mul_squeeze_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.llir", "triton_red_fused_eq_mul_squeeze_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ptx", "triton_red_fused_eq_mul_squeeze_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.cubin", "triton_red_fused_eq_mul_squeeze_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..bb894b3d68338279fe6866113fbb1d554c1837ba Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..289a59a8959f5bc74357a64e826c9e3af12ffd14 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"hash": "b368dc8137f4fc865009c58f12b875db3671aebf683589e7ef2f8d4854a3d43b", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_eq_mul_squeeze_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..d5a9d92e99d2a09c956c5b3761601e935bdc69b6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.llir @@ -0,0 +1,253 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_eq_mul_squeeze_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !7 + %9 = shl nuw nsw i32 %8, 1, !dbg !7 + %10 = and i32 %9, 1022, !dbg !7 + %11 = icmp sgt i32 %4, 0, !dbg !8 + br i1 %11, label %.lr.ph.preheader, label %._crit_edge, !dbg !8 + +.lr.ph.preheader: ; preds = %7 + %12 = insertelement <4 x i32> poison, i32 %4, i64 0 + %13 = shufflevector <4 x i32> %12, <4 x i32> poison, <4 x i32> zeroinitializer + br label %.lr.ph, !dbg !8 + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %14 = phi i32 [ %86, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %15 = phi <4 x i64> [ %85, %.lr.ph ], [ zeroinitializer, %.lr.ph.preheader ] + %16 = or disjoint i32 %9, %14, !dbg !9 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !10 + %18 = or disjoint i32 %14, %10, !dbg !9 + %19 = or disjoint i32 %18, 1, !dbg !9 + %20 = or i32 %16, 1024, !dbg !9 + %21 = or i32 %16, 1025, !dbg !9 + %22 = insertelement <4 x i32> poison, i32 %18, i64 0, !dbg !11 + %23 = insertelement <4 x i32> %22, i32 %19, i64 1, !dbg !11 + %24 = insertelement <4 x i32> %23, i32 %20, i64 2, !dbg !11 + %25 = insertelement <4 x i32> %24, i32 %21, i64 3, !dbg !11 + %26 = icmp slt <4 x i32> %25, %13, !dbg !11 + %27 = sext i32 %18 to i64, !dbg !12 + %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !12 + %29 = sext i32 %19 to i64, !dbg !12 + %30 = getelementptr i64, ptr addrspace(1) %0, i64 %29, !dbg !12 + %31 = sext i32 %20 to i64, !dbg !12 + %32 = getelementptr i64, ptr addrspace(1) %0, i64 %31, !dbg !12 + %33 = sext i32 %21 to i64, !dbg !12 + %34 = getelementptr i64, ptr addrspace(1) %0, i64 %33, !dbg !12 + %35 = extractelement <4 x i1> %26, i64 0, !dbg !13 + %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %28, i64 %17, i1 %35) #5, !dbg !10 + %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !10 + %38 = extractelement <4 x i1> %26, i64 1, !dbg !13 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %30, i64 %37, i1 %38) #5, !dbg !10 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !10 + %41 = extractelement <4 x i1> %26, i64 2, !dbg !13 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %32, i64 %40, i1 %41) #5, !dbg !10 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !10 + %44 = extractelement <4 x i1> %26, i64 3, !dbg !13 + %45 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %34, i64 %43, i1 %44) #5, !dbg !10 + %46 = getelementptr i64, ptr addrspace(1) %1, i64 %27, !dbg !14 + %47 = getelementptr i64, ptr addrspace(1) %1, i64 %29, !dbg !14 + %48 = getelementptr i64, ptr addrspace(1) %1, i64 %31, !dbg !14 + %49 = getelementptr i64, ptr addrspace(1) %1, i64 %33, !dbg !14 + %50 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %46, i64 %50, i1 %35) #5, !dbg !15 + %52 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %47, i64 %52, i1 %38) #5, !dbg !15 + %54 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %55 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %48, i64 %54, i1 %41) #5, !dbg !15 + %56 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !15 + %57 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %49, i64 %56, i1 %44) #5, !dbg !15 + %58 = getelementptr i64, ptr addrspace(1) %2, i64 %27, !dbg !16 + %59 = getelementptr i64, ptr addrspace(1) %2, i64 %29, !dbg !16 + %60 = getelementptr i64, ptr addrspace(1) %2, i64 %31, !dbg !16 + %61 = getelementptr i64, ptr addrspace(1) %2, i64 %33, !dbg !16 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !13 + %63 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %58, i64 %62, i1 %35) #5, !dbg !13 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !13 + %65 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %59, i64 %64, i1 %38) #5, !dbg !13 + %66 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !13 + %67 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %60, i64 %66, i1 %41) #5, !dbg !13 + %68 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !13 + %69 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_first.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %61, i64 %68, i1 %44) #5, !dbg !13 + %70 = insertelement <4 x i64> poison, i64 %36, i64 0, !dbg !17 + %71 = insertelement <4 x i64> %70, i64 %39, i64 1, !dbg !17 + %72 = insertelement <4 x i64> %71, i64 %42, i64 2, !dbg !17 + %73 = insertelement <4 x i64> %72, i64 %45, i64 3, !dbg !17 + %74 = insertelement <4 x i64> poison, i64 %51, i64 0, !dbg !17 + %75 = insertelement <4 x i64> %74, i64 %53, i64 1, !dbg !17 + %76 = insertelement <4 x i64> %75, i64 %55, i64 2, !dbg !17 + %77 = insertelement <4 x i64> %76, i64 %57, i64 3, !dbg !17 + %78 = icmp eq <4 x i64> %73, %77, !dbg !17 + %79 = select <4 x i1> %26, <4 x i1> %78, <4 x i1> zeroinitializer, !dbg !18 + %80 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !18 + %81 = insertelement <4 x i64> %80, i64 %65, i64 1, !dbg !18 + %82 = insertelement <4 x i64> %81, i64 %67, i64 2, !dbg !18 + %83 = insertelement <4 x i64> %82, i64 %69, i64 3, !dbg !18 + %84 = select <4 x i1> %79, <4 x i64> %83, <4 x i64> zeroinitializer, !dbg !18 + %85 = add <4 x i64> %84, %15, !dbg !18 + %86 = add i32 %14, 2048, !dbg !8 + %87 = icmp slt i32 %86, %4, !dbg !8 + br i1 %87, label %.lr.ph, label %._crit_edge.loopexit, !dbg !8 + +._crit_edge.loopexit: ; preds = %.lr.ph + %88 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %85), !dbg !7 + br label %._crit_edge, !dbg !7 + +._crit_edge: ; preds = %._crit_edge.loopexit, %7 + %89 = phi i64 [ 0, %7 ], [ %88, %._crit_edge.loopexit ], !dbg !19 + %90 = and i32 %8, 31, !dbg !7 + %91 = lshr i32 %8, 5, !dbg !7 + %extelt.offset = lshr i64 %89, 32, !dbg !23 + %92 = trunc nuw i64 %extelt.offset to i32, !dbg !23 + %93 = trunc i64 %89 to i32, !dbg !23 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 16, i32 31), !dbg !23 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 16, i32 31), !dbg !23 + %96 = insertelement <2 x i32> poison, i32 %94, i64 0, !dbg !23 + %97 = insertelement <2 x i32> %96, i32 %95, i64 1, !dbg !23 + %98 = bitcast <2 x i32> %97 to i64, !dbg !23 + %99 = add i64 %89, %98, !dbg !19 + %extelt.offset2 = lshr i64 %99, 32, !dbg !23 + %100 = trunc nuw i64 %extelt.offset2 to i32, !dbg !23 + %101 = trunc i64 %99 to i32, !dbg !23 + %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 8, i32 31), !dbg !23 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !23 + %104 = insertelement <2 x i32> poison, i32 %102, i64 0, !dbg !23 + %105 = insertelement <2 x i32> %104, i32 %103, i64 1, !dbg !23 + %106 = bitcast <2 x i32> %105 to i64, !dbg !23 + %107 = add i64 %99, %106, !dbg !19 + %extelt.offset3 = lshr i64 %107, 32, !dbg !23 + %108 = trunc nuw i64 %extelt.offset3 to i32, !dbg !23 + %109 = trunc i64 %107 to i32, !dbg !23 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 4, i32 31), !dbg !23 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 4, i32 31), !dbg !23 + %112 = insertelement <2 x i32> poison, i32 %110, i64 0, !dbg !23 + %113 = insertelement <2 x i32> %112, i32 %111, i64 1, !dbg !23 + %114 = bitcast <2 x i32> %113 to i64, !dbg !23 + %115 = add i64 %107, %114, !dbg !19 + %extelt.offset4 = lshr i64 %115, 32, !dbg !23 + %116 = trunc nuw i64 %extelt.offset4 to i32, !dbg !23 + %117 = trunc i64 %115 to i32, !dbg !23 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !23 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 2, i32 31), !dbg !23 + %120 = insertelement <2 x i32> poison, i32 %118, i64 0, !dbg !23 + %121 = insertelement <2 x i32> %120, i32 %119, i64 1, !dbg !23 + %122 = bitcast <2 x i32> %121 to i64, !dbg !23 + %123 = add i64 %115, %122, !dbg !19 + %extelt.offset5 = lshr i64 %123, 32, !dbg !23 + %124 = trunc nuw i64 %extelt.offset5 to i32, !dbg !23 + %125 = trunc i64 %123 to i32, !dbg !23 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !23 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !23 + %128 = insertelement <2 x i32> poison, i32 %126, i64 0, !dbg !23 + %129 = insertelement <2 x i32> %128, i32 %127, i64 1, !dbg !23 + %130 = bitcast <2 x i32> %129 to i64, !dbg !23 + %131 = add i64 %123, %130, !dbg !19 + %132 = and i32 %91, 15, !dbg !23 + %133 = icmp eq i32 %90, 0, !dbg !23 + %134 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %132, !dbg !23 + %135 = insertelement <1 x i64> poison, i64 %131, i64 0, !dbg !23 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %134, <1 x i64> %135, i1 %133) #5, !dbg !23 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !23 + %136 = icmp samesign ult i32 %8, 16, !dbg !23 + %137 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %8, !dbg !23 + %138 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %137, i1 %136) #5, !dbg !23 + %extelt.offset6 = lshr i64 %138, 32, !dbg !23 + %139 = trunc nuw i64 %extelt.offset6 to i32, !dbg !23 + %140 = trunc i64 %138 to i32, !dbg !23 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !23 + %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 8, i32 31), !dbg !23 + %143 = insertelement <2 x i32> poison, i32 %141, i64 0, !dbg !23 + %144 = insertelement <2 x i32> %143, i32 %142, i64 1, !dbg !23 + %145 = bitcast <2 x i32> %144 to i64, !dbg !23 + %146 = add i64 %138, %145, !dbg !19 + %extelt.offset7 = lshr i64 %146, 32, !dbg !23 + %147 = trunc nuw i64 %extelt.offset7 to i32, !dbg !23 + %148 = trunc i64 %146 to i32, !dbg !23 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 4, i32 31), !dbg !23 + %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 4, i32 31), !dbg !23 + %151 = insertelement <2 x i32> poison, i32 %149, i64 0, !dbg !23 + %152 = insertelement <2 x i32> %151, i32 %150, i64 1, !dbg !23 + %153 = bitcast <2 x i32> %152 to i64, !dbg !23 + %154 = add i64 %146, %153, !dbg !19 + %extelt.offset8 = lshr i64 %154, 32, !dbg !23 + %155 = trunc nuw i64 %extelt.offset8 to i32, !dbg !23 + %156 = trunc i64 %154 to i32, !dbg !23 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 2, i32 31), !dbg !23 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 2, i32 31), !dbg !23 + %159 = insertelement <2 x i32> poison, i32 %157, i64 0, !dbg !23 + %160 = insertelement <2 x i32> %159, i32 %158, i64 1, !dbg !23 + %161 = bitcast <2 x i32> %160 to i64, !dbg !23 + %162 = add i64 %154, %161, !dbg !19 + %extelt.offset9 = lshr i64 %162, 32, !dbg !23 + %163 = trunc nuw i64 %extelt.offset9 to i32, !dbg !23 + %164 = trunc i64 %162 to i32, !dbg !23 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !23 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !23 + %167 = insertelement <2 x i32> poison, i32 %165, i64 0, !dbg !23 + %168 = insertelement <2 x i32> %167, i32 %166, i64 1, !dbg !23 + %169 = bitcast <2 x i32> %168 to i64, !dbg !23 + %170 = add i64 %162, %169, !dbg !19 + %171 = icmp eq i32 %8, 0, !dbg !23 + %172 = insertelement <1 x i64> poison, i64 %170, i64 0, !dbg !23 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %137, <1 x i64> %172, i1 %171) #5, !dbg !23 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !23 + %173 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !23 + %174 = and i32 %8, 511, !dbg !24 + %175 = icmp eq i32 %174, 0, !dbg !24 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %173, ptr addrspace(1) %3, i1 %175) #5, !dbg !24 + ret void, !dbg !25 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_eq_mul_squeeze_sum_2", linkageName: "triton_red_fused_eq_mul_squeeze_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 25, column: 37, scope: !4) +!8 = !DILocation(line: 28, column: 40, scope: !4) +!9 = !DILocation(line: 29, column: 31, scope: !4) +!10 = !DILocation(line: 34, column: 41, scope: !4) +!11 = !DILocation(line: 30, column: 29, scope: !4) +!12 = !DILocation(line: 34, column: 34, scope: !4) +!13 = !DILocation(line: 36, column: 41, scope: !4) +!14 = !DILocation(line: 35, column: 34, scope: !4) +!15 = !DILocation(line: 35, column: 41, scope: !4) +!16 = !DILocation(line: 36, column: 34, scope: !4) +!17 = !DILocation(line: 37, column: 23, scope: !4) +!18 = !DILocation(line: 42, column: 40, scope: !4) +!19 = !DILocation(line: 261, column: 15, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!22 = !DILocation(line: 43, column: 25, scope: !4) +!23 = !DILocation(line: 291, column: 36, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 44, column: 67, scope: !4) +!25 = !DILocation(line: 44, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..52ca33a8a424095b6e34c0d22d400a8fc5a65c6e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ptx @@ -0,0 +1,592 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_eq_mul_squeeze_sum_2 // -- Begin function triton_red_fused_eq_mul_squeeze_sum_2 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_eq_mul_squeeze_sum_2 +.visible .entry triton_red_fused_eq_mul_squeeze_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_3, + .param .u32 triton_red_fused_eq_mul_squeeze_sum_2_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_6 +) +.reqntid 512 +{ + .reg .pred %p<23>; + .reg .b32 %r<63>; + .reg .b64 %rd<134>; + .loc 1 18 0 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:18:0 + +// %bb.0: + ld.param.b32 %r10, [triton_red_fused_eq_mul_squeeze_sum_2_param_4]; + ld.param.b64 %rd85, [triton_red_fused_eq_mul_squeeze_sum_2_param_3]; +$L__tmp0: + .loc 1 25 37 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:25:37 + mov.u32 %r1, %tid.x; + .loc 1 28 40 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:28:40 + setp.lt.s32 %p1, %r10, 1; + mov.b64 %rd133, 0; + @%p1 bra $L__BB0_4; +// %bb.1: // %.lr.ph.preheader + .loc 1 0 40 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:0:40 + ld.param.b64 %rd13, [triton_red_fused_eq_mul_squeeze_sum_2_param_2]; + ld.param.b64 %rd12, [triton_red_fused_eq_mul_squeeze_sum_2_param_1]; + ld.param.b64 %rd11, [triton_red_fused_eq_mul_squeeze_sum_2_param_0]; + shl.b32 %r2, %r1, 1; + and.b32 %r3, %r2, 1022; + mov.b64 %rd129, 0; + mov.b32 %r62, 0; + mov.b64 %rd130, %rd129; + mov.b64 %rd131, %rd129; + mov.b64 %rd132, %rd129; +$L__BB0_2: // %.lr.ph + // =>This Inner Loop Header: Depth=1 + .loc 1 34 41 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:34:41 + add.s32 %r12, %r2, %r62; + // begin inline asm + mov.u64 %rd20, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd20, 1.0; + // end inline asm + .loc 1 29 31 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:29:31 + add.s32 %r13, %r3, %r62; + add.s32 %r14, %r13, 1; + or.b32 %r15, %r12, 1024; + or.b32 %r16, %r12, 1025; + .loc 1 30 29 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:30:29 + setp.lt.s32 %p9, %r16, %r10; + setp.lt.s32 %p8, %r15, %r10; + setp.lt.s32 %p7, %r14, %r10; + setp.lt.s32 %p6, %r13, %r10; + .loc 1 34 34 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:34:34 + mul.wide.s32 %rd68, %r13, 8; + add.s64 %rd22, %rd11, %rd68; + add.s64 %rd26, %rd22, 8; + mul.wide.s32 %rd69, %r15, 8; + add.s64 %rd30, %rd11, %rd69; + mul.wide.s32 %rd70, %r16, 8; + add.s64 %rd34, %rd11, %rd70; + .loc 1 34 41 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:34:41 + // begin inline asm + mov.u64 %rd21, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd21 }, [ %rd22 + 0 ], %rd20; + // end inline asm + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd25, 0x0; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd25 }, [ %rd26 + 0 ], %rd24; + // end inline asm + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd28, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd29, 0x0; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd29 }, [ %rd30 + 0 ], %rd28; + // end inline asm + // begin inline asm + mov.u64 %rd32, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd32, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd33, 0x0; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd33 }, [ %rd34 + 0 ], %rd32; + // end inline asm + .loc 1 35 34 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:35:34 + add.s64 %rd38, %rd12, %rd68; + add.s64 %rd42, %rd38, 8; + add.s64 %rd46, %rd12, %rd69; + add.s64 %rd50, %rd12, %rd70; + .loc 1 35 41 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:35:41 + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd37, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd37 }, [ %rd38 + 0 ], %rd36; + // end inline asm + // begin inline asm + mov.u64 %rd40, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd40, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd41, 0x0; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd41 }, [ %rd42 + 0 ], %rd40; + // end inline asm + // begin inline asm + mov.u64 %rd44, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd44, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd45 }, [ %rd46 + 0 ], %rd44; + // end inline asm + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd49, 0x0; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd49 }, [ %rd50 + 0 ], %rd48; + // end inline asm + .loc 1 36 34 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:36:34 + add.s64 %rd54, %rd13, %rd68; + add.s64 %rd58, %rd54, 8; + add.s64 %rd62, %rd13, %rd69; + add.s64 %rd66, %rd13, %rd70; + .loc 1 36 41 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:36:41 + // begin inline asm + mov.u64 %rd52, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd52, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + @%p6 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd53 }, [ %rd54 + 0 ], %rd52; + // end inline asm + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p7 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd57 }, [ %rd58 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p8 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd61 }, [ %rd62 + 0 ], %rd60; + // end inline asm + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd64, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd65, 0x0; + @%p9 ld.global.L1::evict_first.L2::cache_hint.b64 { %rd65 }, [ %rd66 + 0 ], %rd64; + // end inline asm + .loc 1 37 23 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:37:23 + setp.eq.b64 %p14, %rd21, %rd37; + setp.eq.b64 %p15, %rd25, %rd41; + setp.eq.b64 %p16, %rd29, %rd45; + setp.eq.b64 %p17, %rd33, %rd49; + .loc 1 42 40 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:42:40 + selp.b64 %rd71, %rd65, 0, %p17; + selp.b64 %rd72, %rd71, 0, %p9; + selp.b64 %rd73, %rd61, 0, %p16; + selp.b64 %rd74, %rd73, 0, %p8; + selp.b64 %rd75, %rd57, 0, %p15; + selp.b64 %rd76, %rd75, 0, %p7; + selp.b64 %rd77, %rd53, 0, %p14; + selp.b64 %rd78, %rd77, 0, %p6; + add.s64 %rd129, %rd78, %rd129; + add.s64 %rd130, %rd76, %rd130; + add.s64 %rd131, %rd74, %rd131; + add.s64 %rd132, %rd72, %rd132; + .loc 1 28 40 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:28:40 + add.s32 %r62, %r62, 2048; + setp.lt.s32 %p18, %r62, %r10; + @%p18 bra $L__BB0_2; +// %bb.3: // %._crit_edge.loopexit + .loc 1 25 37 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:25:37 + add.s64 %rd79, %rd129, %rd131; + add.s64 %rd80, %rd130, %rd132; + add.s64 %rd133, %rd79, %rd80; +$L__BB0_4: // %._crit_edge + .loc 1 25 37 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:25:37 + and.b32 %r20, %r1, 31; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r21}, %rd133; + cvt.u32.u64 %r22, %rd133; + shfl.sync.bfly.b32 %r23, %r22, 16, 31, -1; + shfl.sync.bfly.b32 %r24, %r21, 16, 31, -1; + cvt.u64.u32 %rd86, %r23; + cvt.u64.u32 %rd87, %r24; + shl.b64 %rd88, %rd87, 32; + or.b64 %rd89, %rd86, %rd88; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd90, %rd133, %rd89; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r25}, %rd90; + cvt.u32.u64 %r26, %rd90; + shfl.sync.bfly.b32 %r27, %r26, 8, 31, -1; + shfl.sync.bfly.b32 %r28, %r25, 8, 31, -1; + cvt.u64.u32 %rd91, %r27; + cvt.u64.u32 %rd92, %r28; + shl.b64 %rd93, %rd92, 32; + or.b64 %rd94, %rd91, %rd93; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd95, %rd90, %rd94; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r29}, %rd95; + cvt.u32.u64 %r30, %rd95; + shfl.sync.bfly.b32 %r31, %r30, 4, 31, -1; + shfl.sync.bfly.b32 %r32, %r29, 4, 31, -1; + cvt.u64.u32 %rd96, %r31; + cvt.u64.u32 %rd97, %r32; + shl.b64 %rd98, %rd97, 32; + or.b64 %rd99, %rd96, %rd98; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd100, %rd95, %rd99; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r33}, %rd100; + cvt.u32.u64 %r34, %rd100; + shfl.sync.bfly.b32 %r35, %r34, 2, 31, -1; + shfl.sync.bfly.b32 %r36, %r33, 2, 31, -1; + cvt.u64.u32 %rd101, %r35; + cvt.u64.u32 %rd102, %r36; + shl.b64 %rd103, %rd102, 32; + or.b64 %rd104, %rd101, %rd103; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd105, %rd100, %rd104; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r37}, %rd105; + cvt.u32.u64 %r38, %rd105; + shfl.sync.bfly.b32 %r39, %r38, 1, 31, -1; + shfl.sync.bfly.b32 %r40, %r37, 1, 31, -1; + cvt.u64.u32 %rd106, %r39; + cvt.u64.u32 %rd107, %r40; + shl.b64 %rd108, %rd107, 32; + or.b64 %rd109, %rd106, %rd108; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd81, %rd105, %rd109; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + setp.eq.b32 %p19, %r20, 0; + shr.u32 %r41, %r1, 2; + and.b32 %r42, %r41, 120; + mov.b32 %r43, global_smem; + add.s32 %r17, %r43, %r42; + // begin inline asm + @%p19 st.shared.b64 [ %r17 + 0 ], %rd81; + // end inline asm + bar.sync 0; + setp.lt.u32 %p20, %r1, 16; + shl.b32 %r44, %r1, 3; + add.s32 %r18, %r43, %r44; + // begin inline asm + @%p20 ld.shared.b64 %rd82, [ %r18 + 0 ]; + // end inline asm + mov.b64 {_, %r45}, %rd82; + cvt.u32.u64 %r46, %rd82; + shfl.sync.bfly.b32 %r47, %r46, 8, 31, -1; + shfl.sync.bfly.b32 %r48, %r45, 8, 31, -1; + cvt.u64.u32 %rd110, %r47; + cvt.u64.u32 %rd111, %r48; + shl.b64 %rd112, %rd111, 32; + or.b64 %rd113, %rd110, %rd112; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd114, %rd82, %rd113; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r49}, %rd114; + cvt.u32.u64 %r50, %rd114; + shfl.sync.bfly.b32 %r51, %r50, 4, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 4, 31, -1; + cvt.u64.u32 %rd115, %r51; + cvt.u64.u32 %rd116, %r52; + shl.b64 %rd117, %rd116, 32; + or.b64 %rd118, %rd115, %rd117; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd119, %rd114, %rd118; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r53}, %rd119; + cvt.u32.u64 %r54, %rd119; + shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1; + shfl.sync.bfly.b32 %r56, %r53, 2, 31, -1; + cvt.u64.u32 %rd120, %r55; + cvt.u64.u32 %rd121, %r56; + shl.b64 %rd122, %rd121, 32; + or.b64 %rd123, %rd120, %rd122; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd124, %rd119, %rd123; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + mov.b64 {_, %r57}, %rd124; + cvt.u32.u64 %r58, %rd124; + shfl.sync.bfly.b32 %r59, %r58, 1, 31, -1; + shfl.sync.bfly.b32 %r60, %r57, 1, 31, -1; + cvt.u64.u32 %rd125, %r59; + cvt.u64.u32 %rd126, %r60; + shl.b64 %rd127, %rd126, 32; + or.b64 %rd128, %rd125, %rd127; + .loc 2 261 15 // standard.py:261:15 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + add.s64 %rd83, %rd124, %rd128; + .loc 2 291 36 // standard.py:291:36 @[ c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:43:25 ] + setp.eq.b32 %p21, %r1, 0; + // begin inline asm + @%p21 st.shared.b64 [ %r18 + 0 ], %rd83; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd84, [global_smem]; +$L__tmp2: + .loc 1 44 67 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:44:67 + and.b32 %r61, %r1, 511; + setp.eq.b32 %p22, %r61, 0; + // begin inline asm + @%p22 st.global.b64 [ %rd85 + 0 ], { %rd84 }; + // end inline asm + .loc 1 44 4 // c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py:44:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 222 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 50 +.b8 118 +.b8 97 +.b8 98 +.b8 98 +.b8 108 +.b8 114 +.b8 106 +.b8 122 +.b8 121 +.b8 114 +.b8 121 +.b8 97 +.b8 117 +.b8 99 +.b8 50 +.b8 106 +.b8 114 +.b8 97 +.b8 109 +.b8 53 +.b8 107 +.b8 119 +.b8 103 +.b8 119 +.b8 118 +.b8 106 +.b8 101 +.b8 120 +.b8 113 +.b8 53 +.b8 51 +.b8 98 +.b8 100 +.b8 119 +.b8 120 +.b8 117 +.b8 103 +.b8 97 +.b8 103 +.b8 106 +.b8 101 +.b8 103 +.b8 99 +.b8 50 +.b8 120 +.b8 118 +.b8 117 +.b8 102 +.b8 117 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 50 +.b8 118 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x28 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xb3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 43 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..23b1ced34561f9bdac239b1e13e428d540e33aaf --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.source @@ -0,0 +1,156 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":18:0) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc34 = loc(unknown) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("in_ptr1"(#loc)) +#loc43 = loc("in_ptr2"(#loc)) +#loc44 = loc("out_ptr0"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc72 = loc("input"(#loc32)) +#loc73 = loc("a"(#loc37)) +#loc74 = loc("b"(#loc37)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel = arith.constant 1 : i32 loc(#loc46) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xoffset_0 = arith.constant 1 : i32 loc(#loc48) + %xoffset_1 = arith.constant 1 : i32 loc(#loc48) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc48) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc49) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc50) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<1x1xi32> loc(#loc51) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<1x1xi32> loc(#loc51) + %xmask = arith.constant true loc(#loc52) + %xmask_6 = arith.constant dense : tensor<1x2048xi1> loc(#loc52) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc53) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc54) + %_tmp7 = arith.constant 0 : i64 loc(#loc55) + %_tmp7_8 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc55) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp7_9 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp7_12 = %_tmp7_8) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc57) + %r0_index_13 = arith.addi %r0_index, %r0_base_7 : tensor<1x2048xi32> loc(#loc57) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc58) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc58) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc59) + %tmp0_15 = tt.addptr %tmp0, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc59) + %tmp0_16 = arith.constant 0.000000e+00 : f32 loc(#loc60) + %tmp0_17 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc60) + %tmp0_18 = arith.fptosi %tmp0_17 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc60) + %tmp0_19 = tt.load %tmp0_15, %r0_mask_14, %tmp0_18 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc60) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc61) + %tmp1_20 = tt.addptr %tmp1, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc61) + %tmp1_21 = arith.constant 0.000000e+00 : f32 loc(#loc62) + %tmp1_22 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc62) + %tmp1_23 = arith.fptosi %tmp1_22 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc62) + %tmp1_24 = tt.load %tmp1_20, %r0_mask_14, %tmp1_23 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc62) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc63) + %tmp4_25 = tt.addptr %tmp4, %r0_index_13 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc63) + %tmp4_26 = arith.constant 0.000000e+00 : f32 loc(#loc64) + %tmp4_27 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc64) + %tmp4_28 = arith.fptosi %tmp4_27 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc64) + %tmp4_29 = tt.load %tmp4_25, %r0_mask_14, %tmp4_28 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc64) + %tmp2 = arith.cmpi eq, %tmp0_19, %tmp1_24 : tensor<1x2048xi64> loc(#loc65) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc66) + %tmp5 = arith.muli %tmp3, %tmp4_29 : tensor<1x2048xi64> loc(#loc67) + %tmp8 = arith.addi %_tmp7_12, %tmp5 : tensor<1x2048xi64> loc(#loc68) + %_tmp7_30 = arith.select %r0_mask_14, %tmp8, %_tmp7_12 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc69) + scf.yield %_tmp7_30 : tensor<1x2048xi64> loc(#loc25) + } loc(#loc56) + %tmp7 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp7_9) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc70) + %tmp7_10 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc71) + %c0_i32_11 = arith.constant 0 : i32 loc(#loc28) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc28) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc29) + %5 = tt.addptr %4, %cst : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc29) + tt.store %5, %tmp7_10 : tensor<1x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc32))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc33) + tt.reduce.return %2 : i64 loc(#loc33) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc33) + tt.return %0 : tensor<1xi64> loc(#loc35) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc36) + tt.return %1 : tensor<1xi64> loc(#loc36) + } loc(#loc32) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc37)), %b: i64 loc("b"(#loc37))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc38) + tt.return %0 : i64 loc(#loc39) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc40) + tt.return %1 : i64 loc(#loc40) + } loc(#loc37) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":24:46) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":27:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":28:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":29:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":30:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":37:23) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":38:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":39:22) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":41:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:40) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:8) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:25) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:49) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:25) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:67) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:4) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc46 = loc("xnumel"(#loc1)) +#loc47 = loc("xoffset"(#loc2)) +#loc48 = loc("xoffset"(#loc3)) +#loc49 = loc("xindex"(#loc4)) +#loc50 = loc("xindex"(#loc5)) +#loc51 = loc("xindex"(#loc6)) +#loc52 = loc("xmask"(#loc7)) +#loc53 = loc("r0_base"(#loc8)) +#loc54 = loc("r0_base"(#loc9)) +#loc55 = loc("_tmp7"(#loc10)) +#loc56 = loc("_tmp7"(#loc11)) +#loc57 = loc("r0_index"(#loc12)) +#loc58 = loc("r0_mask"(#loc13)) +#loc59 = loc("tmp0"(#loc14)) +#loc60 = loc("tmp0"(#loc15)) +#loc61 = loc("tmp1"(#loc16)) +#loc62 = loc("tmp1"(#loc17)) +#loc63 = loc("tmp4"(#loc18)) +#loc64 = loc("tmp4"(#loc19)) +#loc65 = loc("tmp2"(#loc20)) +#loc66 = loc("tmp3"(#loc21)) +#loc67 = loc("tmp5"(#loc22)) +#loc68 = loc("tmp8"(#loc23)) +#loc69 = loc("_tmp7"(#loc24)) +#loc70 = loc("tmp7"(#loc26)) +#loc71 = loc("tmp7"(#loc27)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..bf0ae5ccfc0f86ff7640d1dfdb788504cccb5d12 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttgir @@ -0,0 +1,91 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:25) +#loc24 = loc("in_ptr0"(#loc)) +#loc25 = loc("in_ptr1"(#loc)) +#loc26 = loc("in_ptr2"(#loc)) +#loc27 = loc("out_ptr0"(#loc)) +#loc28 = loc("r0_numel"(#loc)) +#loc44 = loc("tmp7"(#loc19)) +#loc47 = loc(callsite(#loc1 at #loc44)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc29) + %r0_base_0 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc29) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32, #blocked> loc(#loc30) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc31) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc32) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc33) + %_tmp7 = scf.for %_tmp7_3 = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%arg6 = %cst) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp7_3 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc35) + %r0_index_4 = arith.addi %r0_index, %r0_base_0 : tensor<1x2048xi32, #blocked> loc(#loc35) + %r0_mask_5 = arith.cmpi slt, %r0_index_4, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc30) + %tmp0_6 = tt.addptr %tmp0, %r0_index_4 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc31) + %tmp0_7 = tt.load %tmp0_6, %r0_mask_5, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc36) + %tmp1_8 = tt.addptr %tmp1, %r0_index_4 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc32) + %tmp1_9 = tt.load %tmp1_8, %r0_mask_5, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc37) + %tmp4_10 = tt.addptr %tmp4, %r0_index_4 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc33) + %tmp4_11 = tt.load %tmp4_10, %r0_mask_5, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc38) + %tmp2 = arith.cmpi eq, %tmp0_7, %tmp1_9 : tensor<1x2048xi64, #blocked> loc(#loc39) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc40) + %tmp5 = arith.muli %tmp3, %tmp4_11 : tensor<1x2048xi64, #blocked> loc(#loc41) + %tmp8 = arith.addi %arg6, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc42) + %_tmp7_12 = arith.select %r0_mask_5, %tmp8, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc43) + scf.yield %_tmp7_12 : tensor<1x2048xi64, #blocked> loc(#loc17) + } loc(#loc34) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_3: i64 loc(callsite(#loc1 at #loc44)), %tmp7_4: i64 loc(callsite(#loc1 at #loc44))): + %tmp7_5 = arith.addi %tmp7_3, %tmp7_4 : i64 loc(#loc48) + tt.reduce.return %tmp7_5 : i64 loc(#loc46) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %tmp7_1 = ttg.convert_layout %tmp7 : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc45) + %tmp7_2 = tt.expand_dims %tmp7_1 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi64, #blocked1> loc(#loc45) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc22) + tt.store %0, %tmp7_2 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc22) + tt.return loc(#loc23) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":25:37) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":30:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:34) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:34) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":28:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":29:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:41) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":37:23) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":38:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":39:22) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":41:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:8) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:28) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:67) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:4) +#loc29 = loc("r0_base"(#loc2)) +#loc30 = loc("r0_mask"(#loc3)) +#loc31 = loc("tmp0"(#loc4)) +#loc32 = loc("tmp1"(#loc5)) +#loc33 = loc("tmp4"(#loc6)) +#loc34 = loc("_tmp7"(#loc7)) +#loc35 = loc("r0_index"(#loc8)) +#loc36 = loc("tmp0"(#loc9)) +#loc37 = loc("tmp1"(#loc10)) +#loc38 = loc("tmp4"(#loc11)) +#loc39 = loc("tmp2"(#loc12)) +#loc40 = loc("tmp3"(#loc13)) +#loc41 = loc("tmp5"(#loc14)) +#loc42 = loc("tmp8"(#loc15)) +#loc43 = loc("_tmp7"(#loc16)) +#loc45 = loc("tmp7"(#loc21)) +#loc46 = loc(callsite(#loc18 at #loc44)) +#loc48 = loc(callsite(#loc20 at #loc46)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..11eca43006211ad931b1036e7b1253c99f4a596a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WNUNZAJX6T6IMUAJYWHRFODV3M3HDLV7NA2YTZ7PF6GUQVFD2Q5Q/triton_red_fused_eq_mul_squeeze_sum_2.ttir @@ -0,0 +1,91 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":18:0) +#loc2 = loc(unknown) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:25) +#loc26 = loc("in_ptr0"(#loc)) +#loc27 = loc("in_ptr1"(#loc)) +#loc28 = loc("in_ptr2"(#loc)) +#loc29 = loc("out_ptr0"(#loc)) +#loc30 = loc("r0_numel"(#loc)) +#loc47 = loc("tmp7"(#loc20)) +#loc50 = loc(callsite(#loc2 at #loc47)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc2) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc31) + %r0_base_0 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc32) + %_tmp7 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp7_2 = %cst) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc34) + %r0_index_3 = arith.addi %r0_index, %r0_base_0 : tensor<1x2048xi32> loc(#loc34) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc35) + %r0_mask_4 = arith.cmpi slt, %r0_index_3, %r0_mask : tensor<1x2048xi32> loc(#loc35) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc36) + %tmp0_5 = tt.addptr %tmp0, %r0_index_3 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc36) + %tmp0_6 = tt.load %tmp0_5, %r0_mask_4, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc37) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc38) + %tmp1_7 = tt.addptr %tmp1, %r0_index_3 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc38) + %tmp1_8 = tt.load %tmp1_7, %r0_mask_4, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc39) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc40) + %tmp4_9 = tt.addptr %tmp4, %r0_index_3 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc40) + %tmp4_10 = tt.load %tmp4_9, %r0_mask_4, %cst evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc41) + %tmp2 = arith.cmpi eq, %tmp0_6, %tmp1_8 : tensor<1x2048xi64> loc(#loc42) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc43) + %tmp5 = arith.muli %tmp3, %tmp4_10 : tensor<1x2048xi64> loc(#loc44) + %tmp8 = arith.addi %_tmp7_2, %tmp5 : tensor<1x2048xi64> loc(#loc45) + %_tmp7_11 = arith.select %r0_mask_4, %tmp8, %_tmp7_2 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc46) + scf.yield %_tmp7_11 : tensor<1x2048xi64> loc(#loc18) + } loc(#loc33) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_2: i64 loc(callsite(#loc2 at #loc47)), %tmp7_3: i64 loc(callsite(#loc2 at #loc47))): + %tmp7_4 = arith.addi %tmp7_2, %tmp7_3 : i64 loc(#loc51) + tt.reduce.return %tmp7_4 : i64 loc(#loc49) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc49) + %tmp7_1 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc48) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc23) + tt.store %0, %tmp7_1 : tensor<1x1x!tt.ptr> loc(#loc24) + tt.return loc(#loc25) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":28:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":25:27) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":29:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":30:29) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":34:41) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":35:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":36:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":37:23) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":38:23) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":39:22) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":41:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:40) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":42:8) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":43:28) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:25) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:67) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/2v/c2vabblrjzyryauc2jram5kwgwvjexq53bdwxugagjegc2xvufuy.py":44:4) +#loc31 = loc("r0_base"(#loc3)) +#loc32 = loc("r0_base"(#loc4)) +#loc33 = loc("_tmp7"(#loc1)) +#loc34 = loc("r0_index"(#loc5)) +#loc35 = loc("r0_mask"(#loc6)) +#loc36 = loc("tmp0"(#loc7)) +#loc37 = loc("tmp0"(#loc8)) +#loc38 = loc("tmp1"(#loc9)) +#loc39 = loc("tmp1"(#loc10)) +#loc40 = loc("tmp4"(#loc11)) +#loc41 = loc("tmp4"(#loc12)) +#loc42 = loc("tmp2"(#loc13)) +#loc43 = loc("tmp3"(#loc14)) +#loc44 = loc("tmp5"(#loc15)) +#loc45 = loc("tmp8"(#loc16)) +#loc46 = loc("_tmp7"(#loc17)) +#loc48 = loc("tmp7"(#loc22)) +#loc49 = loc(callsite(#loc19 at #loc47)) +#loc51 = loc(callsite(#loc21 at #loc49)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..18bcd06724ea03635a4afb898a3f67a6b753f0bd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/__grp__triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin", "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..873827af0e288a123f0271182f432f7048c57729 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5e73b9c4b4cc44cb4d0481c866b5419c80a9f7e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "b49d98ec8578f21ee084d87ee226398e84a084427899998d2ea45ff1e1765227", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..470f0dcef1c4551709475213af78deec55bb6ecc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.llir @@ -0,0 +1,475 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py\00" +@assertMessage_0 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp6 < 151936\00" +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #1 !dbg !9 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !11 + %11 = shl nuw nsw i32 %10, 2, !dbg !11 + %12 = and i32 %11, 2044, !dbg !11 + %13 = mul i32 %9, 151936, !dbg !12 + %14 = zext nneg i32 %12 to i64, !dbg !13 + br label %15, !dbg !13 + +15: ; preds = %8, %15 + %indvars.iv = phi i64 [ 0, %8 ], [ %indvars.iv.next, %15 ] + %16 = phi i32 [ 2147483647, %8 ], [ %102, %15 ] + %17 = phi i32 [ 2147483647, %8 ], [ %103, %15 ] + %18 = phi float [ 0xFFF0000000000000, %8 ], [ %99, %15 ] + %19 = phi float [ 0xFFF0000000000000, %8 ], [ %100, %15 ] + %20 = phi <2 x float> [ splat (float 0xFFF0000000000000), %8 ], [ %98, %15 ] + %21 = phi <2 x i32> [ splat (i32 2147483647), %8 ], [ %101, %15 ] + %22 = or disjoint i64 %indvars.iv, %14, !dbg !14 + %23 = or disjoint i64 %22, 2, !dbg !14 + %24 = or disjoint i64 %22, 3, !dbg !14 + %25 = icmp samesign ult i64 %22, 151936, !dbg !15 + %26 = trunc nuw nsw i64 %22 to i32, !dbg !16 + %27 = add i32 %13, %26, !dbg !16 + %28 = sext i32 %27 to i64, !dbg !17 + %29 = getelementptr bfloat, ptr addrspace(1) %1, i64 %28, !dbg !17 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #5, !dbg !18 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 %25) #5, !dbg !18 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !18 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !18 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !18 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !18 + %36 = extractelement <2 x bfloat> %35, i64 0, !dbg !18 + %37 = extractelement <2 x bfloat> %35, i64 1, !dbg !18 + %38 = fpext bfloat %36 to float, !dbg !19 + %39 = fpext bfloat %37 to float, !dbg !19 + %40 = fcmp ogt float %18, %38, !dbg !20 + %41 = fcmp ogt float %19, %39, !dbg !20 + %42 = fcmp oeq float %18, %38, !dbg !24 + %43 = fcmp oeq float %19, %39, !dbg !24 + %44 = fcmp uno <2 x float> %20, zeroinitializer, !dbg !25 + %45 = fcmp uno float %18, 0.000000e+00, !dbg !25 + %46 = fcmp uno float %19, 0.000000e+00, !dbg !25 + %47 = fcmp uno bfloat %36, 0xR0000, !dbg !26 + %48 = fcmp uno bfloat %37, 0xR0000, !dbg !26 + %49 = xor i1 %47, true, !dbg !27 + %50 = xor i1 %48, true, !dbg !27 + %51 = and i1 %45, %49, !dbg !28 + %52 = and i1 %46, %50, !dbg !28 + %53 = or i1 %40, %51, !dbg !29 + %54 = or i1 %41, %52, !dbg !29 + %55 = and i1 %45, %47, !dbg !30 + %56 = and i1 %46, %48, !dbg !30 + %57 = or i1 %42, %55, !dbg !31 + %58 = or i1 %43, %56, !dbg !31 + %59 = sext <2 x i32> %21 to <2 x i64>, !dbg !32 + %60 = sext i32 %16 to i64, !dbg !32 + %61 = icmp sgt i64 %23, %60, !dbg !32 + %62 = sext i32 %17 to i64, !dbg !32 + %63 = icmp sgt i64 %24, %62, !dbg !32 + %64 = and i1 %61, %57, !dbg !33 + %65 = and i1 %63, %58, !dbg !33 + %66 = or i1 %53, %64, !dbg !34 + %67 = or i1 %54, %65, !dbg !34 + %68 = select i1 %66, float %18, float %38, !dbg !35 + %69 = select i1 %67, float %19, float %39, !dbg !35 + %70 = trunc nuw nsw i64 %22 to i32, !dbg !36 + %71 = or disjoint i32 %70, 1, !dbg !36 + %72 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !19 + %73 = fcmp ogt <2 x float> %20, %72, !dbg !20 + %74 = fcmp oeq <2 x float> %20, %72, !dbg !24 + %75 = fcmp uno <2 x bfloat> %33, zeroinitializer, !dbg !26 + %76 = xor <2 x i1> %75, splat (i1 true), !dbg !27 + %77 = and <2 x i1> %44, %76, !dbg !28 + %78 = or <2 x i1> %73, %77, !dbg !29 + %79 = and <2 x i1> %44, %75, !dbg !30 + %80 = or <2 x i1> %74, %79, !dbg !31 + %81 = insertelement <2 x i64> poison, i64 %22, i64 0, !dbg !32 + %82 = shufflevector <2 x i64> %81, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !32 + %83 = icmp sgt <2 x i64> %82, %59, !dbg !32 + %84 = icmp sge <2 x i64> %82, %59, !dbg !32 + %85 = shufflevector <2 x i1> %83, <2 x i1> %84, <2 x i32> , !dbg !32 + %86 = and <2 x i1> %85, %80, !dbg !33 + %87 = or <2 x i1> %78, %86, !dbg !34 + %88 = select <2 x i1> %87, <2 x float> %20, <2 x float> %72, !dbg !35 + %89 = insertelement <2 x i32> poison, i32 %26, i64 0, !dbg !36 + %90 = insertelement <2 x i32> %89, i32 %71, i64 1, !dbg !36 + %91 = select <2 x i1> %87, <2 x i32> %21, <2 x i32> %90, !dbg !36 + %92 = trunc nuw nsw i64 %23 to i32, !dbg !36 + %93 = select i1 %66, i32 %16, i32 %92, !dbg !36 + %94 = trunc nuw nsw i64 %24 to i32, !dbg !36 + %95 = select i1 %67, i32 %17, i32 %94, !dbg !36 + %96 = insertelement <2 x i1> poison, i1 %25, i64 0, !dbg !37 + %97 = shufflevector <2 x i1> %96, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !37 + %98 = select <2 x i1> %97, <2 x float> %88, <2 x float> %20, !dbg !37 + %99 = select i1 %25, float %68, float %18, !dbg !37 + %100 = select i1 %25, float %69, float %19, !dbg !37 + %101 = select <2 x i1> %97, <2 x i32> %91, <2 x i32> %21, !dbg !38 + %102 = select i1 %25, i32 %93, i32 %16, !dbg !38 + %103 = select i1 %25, i32 %95, i32 %17, !dbg !38 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !13 + %104 = icmp samesign ult i64 %indvars.iv, 149888, !dbg !13 + br i1 %104, label %15, label %105, !dbg !13 + +105: ; preds = %15 + %106 = and i32 %10, 31, !dbg !11 + %107 = lshr i32 %10, 5, !dbg !11 + %108 = shufflevector <2 x float> %98, <2 x float> poison, <2 x i32> , !dbg !39 + %109 = fcmp ogt <2 x float> %98, %108, !dbg !39 + %110 = fcmp oeq <2 x float> %98, %108, !dbg !39 + %111 = shufflevector <2 x i1> %109, <2 x i1> %110, <2 x i32> , !dbg !39 + %112 = extractelement <2 x float> %98, i64 0, !dbg !41 + %113 = fcmp uno float %112, 0.000000e+00, !dbg !41 + %114 = extractelement <2 x float> %98, i64 1, !dbg !42 + %115 = fcmp uno float %114, 0.000000e+00, !dbg !42 + %116 = xor i1 %115, true, !dbg !43 + %117 = insertelement <2 x i1> poison, i1 %113, i64 0, !dbg !44 + %118 = shufflevector <2 x i1> %117, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !44 + %119 = insertelement <2 x i1> poison, i1 %116, i64 0, !dbg !44 + %120 = insertelement <2 x i1> %119, i1 %115, i64 1, !dbg !44 + %121 = and <2 x i1> %118, %120, !dbg !44 + %122 = or <2 x i1> %111, %121, !dbg !45 + %123 = extractelement <2 x i32> %101, i64 0, !dbg !46 + %124 = extractelement <2 x i32> %101, i64 1, !dbg !46 + %125 = icmp slt i32 %123, %124, !dbg !46 + %126 = extractelement <2 x i1> %122, i64 1, !dbg !47 + %127 = and i1 %125, %126, !dbg !47 + %128 = extractelement <2 x i1> %122, i64 0, !dbg !48 + %129 = or i1 %128, %127, !dbg !48 + %130 = select i1 %129, float %112, float %114, !dbg !49 + %131 = select i1 %129, i32 %123, i32 %124, !dbg !50 + %132 = fcmp ogt float %130, %99, !dbg !39 + %133 = fcmp oeq float %130, %99, !dbg !51 + %134 = fcmp uno float %130, 0.000000e+00, !dbg !41 + %135 = fcmp uno float %99, 0.000000e+00, !dbg !42 + %136 = xor i1 %135, true, !dbg !43 + %137 = and i1 %134, %136, !dbg !44 + %138 = or i1 %132, %137, !dbg !45 + %139 = and i1 %135, %134, !dbg !52 + %140 = or i1 %133, %139, !dbg !53 + %141 = icmp slt i32 %131, %102, !dbg !46 + %142 = and i1 %141, %140, !dbg !47 + %143 = or i1 %138, %142, !dbg !48 + %144 = select i1 %143, float %130, float %99, !dbg !49 + %145 = select i1 %143, i32 %131, i32 %102, !dbg !50 + %146 = fcmp ogt float %144, %100, !dbg !39 + %147 = fcmp oeq float %144, %100, !dbg !51 + %148 = fcmp uno float %144, 0.000000e+00, !dbg !41 + %149 = fcmp uno float %100, 0.000000e+00, !dbg !42 + %150 = xor i1 %149, true, !dbg !43 + %151 = and i1 %148, %150, !dbg !44 + %152 = or i1 %146, %151, !dbg !45 + %153 = and i1 %149, %148, !dbg !52 + %154 = or i1 %147, %153, !dbg !53 + %155 = icmp slt i32 %145, %103, !dbg !46 + %156 = and i1 %155, %154, !dbg !47 + %157 = or i1 %152, %156, !dbg !48 + %158 = select i1 %157, float %144, float %100, !dbg !49 + %159 = select i1 %157, i32 %145, i32 %103, !dbg !50 + %160 = bitcast float %158 to i32, !dbg !54 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 16, i32 31), !dbg !54 + %162 = bitcast i32 %161 to float, !dbg !54 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 16, i32 31), !dbg !54 + %164 = fcmp ogt float %158, %162, !dbg !39 + %165 = fcmp oeq float %158, %162, !dbg !51 + %166 = fcmp uno float %158, 0.000000e+00, !dbg !41 + %167 = fcmp uno float %162, 0.000000e+00, !dbg !42 + %168 = xor i1 %167, true, !dbg !43 + %169 = and i1 %166, %168, !dbg !44 + %170 = or i1 %164, %169, !dbg !45 + %171 = and i1 %166, %167, !dbg !52 + %172 = or i1 %165, %171, !dbg !53 + %173 = icmp slt i32 %159, %163, !dbg !46 + %174 = and i1 %173, %172, !dbg !47 + %175 = or i1 %170, %174, !dbg !48 + %176 = select i1 %175, float %158, float %162, !dbg !49 + %177 = select i1 %175, i32 %159, i32 %163, !dbg !50 + %178 = bitcast float %176 to i32, !dbg !54 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 8, i32 31), !dbg !54 + %180 = bitcast i32 %179 to float, !dbg !54 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !54 + %182 = fcmp ogt float %176, %180, !dbg !39 + %183 = fcmp oeq float %176, %180, !dbg !51 + %184 = fcmp uno float %176, 0.000000e+00, !dbg !41 + %185 = fcmp uno float %180, 0.000000e+00, !dbg !42 + %186 = xor i1 %185, true, !dbg !43 + %187 = and i1 %184, %186, !dbg !44 + %188 = or i1 %182, %187, !dbg !45 + %189 = and i1 %185, %184, !dbg !52 + %190 = or i1 %183, %189, !dbg !53 + %191 = icmp slt i32 %177, %181, !dbg !46 + %192 = and i1 %191, %190, !dbg !47 + %193 = or i1 %188, %192, !dbg !48 + %194 = select i1 %193, float %176, float %180, !dbg !49 + %195 = select i1 %193, i32 %177, i32 %181, !dbg !50 + %196 = bitcast float %194 to i32, !dbg !54 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 4, i32 31), !dbg !54 + %198 = bitcast i32 %197 to float, !dbg !54 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 4, i32 31), !dbg !54 + %200 = fcmp ogt float %194, %198, !dbg !39 + %201 = fcmp oeq float %194, %198, !dbg !51 + %202 = fcmp uno float %194, 0.000000e+00, !dbg !41 + %203 = fcmp uno float %198, 0.000000e+00, !dbg !42 + %204 = xor i1 %203, true, !dbg !43 + %205 = and i1 %202, %204, !dbg !44 + %206 = or i1 %200, %205, !dbg !45 + %207 = and i1 %203, %202, !dbg !52 + %208 = or i1 %201, %207, !dbg !53 + %209 = icmp slt i32 %195, %199, !dbg !46 + %210 = and i1 %209, %208, !dbg !47 + %211 = or i1 %206, %210, !dbg !48 + %212 = select i1 %211, float %194, float %198, !dbg !49 + %213 = select i1 %211, i32 %195, i32 %199, !dbg !50 + %214 = bitcast float %212 to i32, !dbg !54 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 2, i32 31), !dbg !54 + %216 = bitcast i32 %215 to float, !dbg !54 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 2, i32 31), !dbg !54 + %218 = fcmp ogt float %212, %216, !dbg !39 + %219 = fcmp oeq float %212, %216, !dbg !51 + %220 = fcmp uno float %212, 0.000000e+00, !dbg !41 + %221 = fcmp uno float %216, 0.000000e+00, !dbg !42 + %222 = xor i1 %221, true, !dbg !43 + %223 = and i1 %220, %222, !dbg !44 + %224 = or i1 %218, %223, !dbg !45 + %225 = and i1 %221, %220, !dbg !52 + %226 = or i1 %219, %225, !dbg !53 + %227 = icmp slt i32 %213, %217, !dbg !46 + %228 = and i1 %227, %226, !dbg !47 + %229 = or i1 %224, %228, !dbg !48 + %230 = select i1 %229, float %212, float %216, !dbg !49 + %231 = select i1 %229, i32 %213, i32 %217, !dbg !50 + %232 = bitcast float %230 to i32, !dbg !54 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !54 + %234 = bitcast i32 %233 to float, !dbg !54 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 1, i32 31), !dbg !54 + %236 = fcmp ogt float %230, %234, !dbg !39 + %237 = fcmp oeq float %230, %234, !dbg !51 + %238 = fcmp uno float %230, 0.000000e+00, !dbg !41 + %239 = fcmp uno float %234, 0.000000e+00, !dbg !42 + %240 = xor i1 %239, true, !dbg !43 + %241 = and i1 %238, %240, !dbg !44 + %242 = or i1 %236, %241, !dbg !45 + %243 = and i1 %239, %238, !dbg !52 + %244 = or i1 %237, %243, !dbg !53 + %245 = icmp slt i32 %231, %235, !dbg !46 + %246 = and i1 %245, %244, !dbg !47 + %247 = or i1 %242, %246, !dbg !48 + %248 = select i1 %247, i32 %231, i32 %235, !dbg !50 + %249 = and i32 %107, 15, !dbg !54 + %250 = icmp eq i32 %106, 0, !dbg !54 + %251 = getelementptr float, ptr addrspace(3) @global_smem, i32 %249, !dbg !54 + %252 = select i1 %247, i32 %232, i32 %233, !dbg !49 + %253 = insertelement <1 x i32> poison, i32 %252, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %253, i1 %250) #5, !dbg !54 + %254 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %249, !dbg !54 + %255 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %254, <1 x i32> %255, i1 %250) #5, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %256 = icmp samesign ult i32 %10, 16, !dbg !54 + %257 = getelementptr float, ptr addrspace(3) @global_smem, i32 %10, !dbg !54 + %258 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %257, i1 %256) #5, !dbg !54 + %259 = bitcast i32 %258 to float, !dbg !54 + %260 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %10, !dbg !54 + %261 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %260, i1 %256) #5, !dbg !54 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 8, i32 31), !dbg !54 + %263 = bitcast i32 %262 to float, !dbg !54 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !54 + %265 = fcmp ogt float %259, %263, !dbg !39 + %266 = fcmp oeq float %259, %263, !dbg !51 + %267 = fcmp uno float %259, 0.000000e+00, !dbg !41 + %268 = fcmp uno float %263, 0.000000e+00, !dbg !42 + %269 = xor i1 %268, true, !dbg !43 + %270 = and i1 %267, %269, !dbg !44 + %271 = or i1 %265, %270, !dbg !45 + %272 = and i1 %267, %268, !dbg !52 + %273 = or i1 %266, %272, !dbg !53 + %274 = icmp slt i32 %261, %264, !dbg !46 + %275 = and i1 %274, %273, !dbg !47 + %276 = or i1 %271, %275, !dbg !48 + %277 = select i1 %276, float %259, float %263, !dbg !49 + %278 = select i1 %276, i32 %261, i32 %264, !dbg !50 + %279 = bitcast float %277 to i32, !dbg !54 + %280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 4, i32 31), !dbg !54 + %281 = bitcast i32 %280 to float, !dbg !54 + %282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !54 + %283 = fcmp ogt float %277, %281, !dbg !39 + %284 = fcmp oeq float %277, %281, !dbg !51 + %285 = fcmp uno float %277, 0.000000e+00, !dbg !41 + %286 = fcmp uno float %281, 0.000000e+00, !dbg !42 + %287 = xor i1 %286, true, !dbg !43 + %288 = and i1 %285, %287, !dbg !44 + %289 = or i1 %283, %288, !dbg !45 + %290 = and i1 %286, %285, !dbg !52 + %291 = or i1 %284, %290, !dbg !53 + %292 = icmp slt i32 %278, %282, !dbg !46 + %293 = and i1 %292, %291, !dbg !47 + %294 = or i1 %289, %293, !dbg !48 + %295 = select i1 %294, float %277, float %281, !dbg !49 + %296 = select i1 %294, i32 %278, i32 %282, !dbg !50 + %297 = bitcast float %295 to i32, !dbg !54 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 2, i32 31), !dbg !54 + %299 = bitcast i32 %298 to float, !dbg !54 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !54 + %301 = fcmp ogt float %295, %299, !dbg !39 + %302 = fcmp oeq float %295, %299, !dbg !51 + %303 = fcmp uno float %295, 0.000000e+00, !dbg !41 + %304 = fcmp uno float %299, 0.000000e+00, !dbg !42 + %305 = xor i1 %304, true, !dbg !43 + %306 = and i1 %303, %305, !dbg !44 + %307 = or i1 %301, %306, !dbg !45 + %308 = and i1 %304, %303, !dbg !52 + %309 = or i1 %302, %308, !dbg !53 + %310 = icmp slt i32 %296, %300, !dbg !46 + %311 = and i1 %310, %309, !dbg !47 + %312 = or i1 %307, %311, !dbg !48 + %313 = select i1 %312, float %295, float %299, !dbg !49 + %314 = select i1 %312, i32 %296, i32 %300, !dbg !50 + %315 = bitcast float %313 to i32, !dbg !54 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 1, i32 31), !dbg !54 + %317 = bitcast i32 %316 to float, !dbg !54 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !54 + %319 = fcmp ogt float %313, %317, !dbg !39 + %320 = fcmp oeq float %313, %317, !dbg !51 + %321 = fcmp uno float %313, 0.000000e+00, !dbg !41 + %322 = fcmp uno float %317, 0.000000e+00, !dbg !42 + %323 = xor i1 %322, true, !dbg !43 + %324 = and i1 %321, %323, !dbg !44 + %325 = or i1 %319, %324, !dbg !45 + %326 = and i1 %322, %321, !dbg !52 + %327 = or i1 %320, %326, !dbg !53 + %328 = icmp slt i32 %314, %318, !dbg !46 + %329 = and i1 %328, %327, !dbg !47 + %330 = or i1 %325, %329, !dbg !48 + %331 = select i1 %330, i32 %314, i32 %318, !dbg !50 + %332 = icmp eq i32 %10, 0, !dbg !54 + %333 = select i1 %330, i32 %315, i32 %316, !dbg !49 + %334 = insertelement <1 x i32> poison, i32 %333, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %257, <1 x i32> %334, i1 %332) #5, !dbg !54 + %335 = insertelement <1 x i32> poison, i32 %331, i64 0, !dbg !54 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %260, <1 x i32> %335, i1 %332) #5, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %336 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !54 + %337 = zext nneg i32 %9 to i64, !dbg !55 + %338 = getelementptr i64, ptr addrspace(1) %3, i64 %337, !dbg !55 + %339 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !56 + %340 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l"(ptr addrspace(1) %338, i64 %339) #5, !dbg !56 + %341 = add i32 %336, 151936, !dbg !57 + %342 = icmp slt i32 %336, 0, !dbg !58 + %343 = select i1 %342, i32 %341, i32 %336, !dbg !59 + %344 = icmp ugt i32 %343, 151935, !dbg !60 + br i1 %344, label %345, label %346, !dbg !61 + +345: ; preds = %105 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 51, ptr nonnull @assertFunc_0, i64 1), !dbg !61 + unreachable, !dbg !61 + +346: ; preds = %105 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %347 = zext nneg i32 %343 to i64, !dbg !62 + %348 = getelementptr i1, ptr addrspace(1) %2, i64 %347, !dbg !62 + %349 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !63 + %350 = tail call i8 asm sideeffect "mov.u16 $0, 0x0;\0A\09ld.global.L1::evict_last.L2::cache_hint.b8 { $0 }, [ $1 + 0 ], $2;", "=c,l,l"(ptr addrspace(1) %348, i64 %349) #5, !dbg !63 + %.not = icmp eq i8 %350, 0, !dbg !63 + %351 = select i1 %.not, i64 0, i64 %340, !dbg !64 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !65 + %352 = getelementptr i64, ptr addrspace(1) %0, i64 %337, !dbg !66 + %353 = and i32 %10, 511, !dbg !67 + %354 = icmp eq i32 %353, 0, !dbg !67 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %351, ptr addrspace(1) %352, i1 %354) #5, !dbg !67 + ret void, !dbg !68 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="512" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", linkageName: "triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 23, column: 28, scope: !9) +!11 = !DILocation(line: 26, column: 37, scope: !9) +!12 = !DILocation(line: 37, column: 48, scope: !9) +!13 = !DILocation(line: 31, column: 40, scope: !9) +!14 = !DILocation(line: 32, column: 31, scope: !9) +!15 = !DILocation(line: 33, column: 29, scope: !9) +!16 = !DILocation(line: 37, column: 41, scope: !9) +!17 = !DILocation(line: 37, column: 34, scope: !9) +!18 = !DILocation(line: 37, column: 53, scope: !9) +!19 = !DILocation(line: 37, column: 107, scope: !9) +!20 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !9, file: !22, discriminator: 0) +!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!23 = !DILocation(line: 40, column: 38, scope: !9) +!24 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !23) +!25 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !23) +!26 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !23) +!27 = !DILocation(line: 149, column: 31, scope: !21, inlinedAt: !23) +!28 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !23) +!29 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !23) +!30 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !23) +!31 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !23) +!32 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !23) +!33 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !23) +!34 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !23) +!35 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !23) +!36 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !23) +!37 = !DILocation(line: 42, column: 46, scope: !9) +!38 = !DILocation(line: 43, column: 58, scope: !9) +!39 = !DILocation(line: 144, column: 21, scope: !21, inlinedAt: !40) +!40 = !DILocation(line: 44, column: 75, scope: !9) +!41 = !DILocation(line: 147, column: 29, scope: !21, inlinedAt: !40) +!42 = !DILocation(line: 148, column: 29, scope: !21, inlinedAt: !40) +!43 = !DILocation(line: 149, column: 31, scope: !21, inlinedAt: !40) +!44 = !DILocation(line: 149, column: 27, scope: !21, inlinedAt: !40) +!45 = !DILocation(line: 149, column: 16, scope: !21, inlinedAt: !40) +!46 = !DILocation(line: 154, column: 31, scope: !21, inlinedAt: !40) +!47 = !DILocation(line: 154, column: 21, scope: !21, inlinedAt: !40) +!48 = !DILocation(line: 154, column: 12, scope: !21, inlinedAt: !40) +!49 = !DILocation(line: 155, column: 35, scope: !21, inlinedAt: !40) +!50 = !DILocation(line: 155, column: 69, scope: !21, inlinedAt: !40) +!51 = !DILocation(line: 145, column: 23, scope: !21, inlinedAt: !40) +!52 = !DILocation(line: 151, column: 27, scope: !21, inlinedAt: !40) +!53 = !DILocation(line: 151, column: 17, scope: !21, inlinedAt: !40) +!54 = !DILocation(line: 165, column: 42, scope: !21, inlinedAt: !40) +!55 = !DILocation(line: 46, column: 31, scope: !9) +!56 = !DILocation(line: 46, column: 36, scope: !9) +!57 = !DILocation(line: 48, column: 18, scope: !9) +!58 = !DILocation(line: 49, column: 18, scope: !9) +!59 = !DILocation(line: 50, column: 32, scope: !9) +!60 = !DILocation(line: 51, column: 36, scope: !9) +!61 = !DILocation(line: 51, column: 52, scope: !9) +!62 = !DILocation(line: 52, column: 30, scope: !9) +!63 = !DILocation(line: 52, column: 37, scope: !9) +!64 = !DILocation(line: 55, column: 20, scope: !9) +!65 = !DILocation(line: 56, column: 4, scope: !9) +!66 = !DILocation(line: 57, column: 28, scope: !9) +!67 = !DILocation(line: 57, column: 40, scope: !9) +!68 = !DILocation(line: 57, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3d9049cf05df254dcc1daa4cf543855389182f96 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ptx @@ -0,0 +1,946 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 // -- Begin function triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 108, 47, 99, 109, 108, 103, 114, 102, 114, 112, 121, 118, 108, 53, 103, 113, 108, 112, 104, 100, 102, 105, 113, 120, 112, 113, 97, 119, 108, 111, 54, 119, 121, 106, 116, 112, 119, 113, 107, 107, 121, 54, 122, 121, 119, 107, 107, 100, 107, 52, 104, 52, 104, 108, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[40] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 54, 32, 60, 32, 49, 53, 49, 57, 51, 54}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0 +.visible .entry triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3, + .param .u32 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_4, + .param .u32 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_7 +) +.reqntid 512 +{ + .reg .pred %p<205>; + .reg .b16 %rs<7>; + .reg .b32 %r<117>; + .reg .b64 %rd<46>; + .loc 1 18 0 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:18:0 + +// %bb.0: + ld.param.b64 %rd12, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_3]; + ld.param.b64 %rd11, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_2]; + ld.param.b64 %rd10, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_1]; + ld.param.b64 %rd9, [triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0_param_0]; +$L__tmp0: + .loc 1 23 28 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:23:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 37 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:26:37 + mov.u32 %r2, %tid.x; + shl.b32 %r20, %r2, 2; + and.b32 %r21, %r20, 2044; + .loc 1 31 40 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:31:40 + cvt.u64.u32 %rd1, %r21; + mad.lo.s32 %r22, %r1, 151936, %r21; + cvt.u64.u32 %rd2, %r22; + mov.b32 %r113, 0fFF800000; + mov.b64 %rd45, {%r113, %r113}; + mov.b32 %r111, 2147483647; + mov.b64 %rd44, -2048; + mov.b32 %r112, %r111; + mov.b32 %r114, %r113; + mov.b32 %r115, %r111; + mov.b32 %r116, %r111; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 32 31 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:32:31 + add.s64 %rd18, %rd1, %rd44; + add.s64 %rd19, %rd18, 2048; + add.s64 %rd20, %rd18, 2050; + .loc 1 33 29 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:33:29 + add.s64 %rd21, %rd18, 2051; + setp.lt.u64 %p1, %rd19, 151936; + .loc 1 37 34 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:37:34 + add.s64 %rd22, %rd2, %rd44; + cvt.u32.u64 %r27, %rd22; + add.s32 %r28, %r27, 2048; + mad.wide.s32 %rd16, %r28, 2, %rd10; + .loc 1 37 53 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:37:53 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + mov.b32 %r25, 0; + // begin inline asm + mov.u32 %r23, %r25; + mov.u32 %r24, %r25; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd16 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs1, %rs2}, %r24; + .loc 1 37 107 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:37:107 + cvt.f32.bf16 %r29, %rs1; + cvt.f32.bf16 %r30, %rs2; +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.gt.f32 %p2, %r113, %r29; + setp.gt.f32 %p3, %r114, %r30; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.eq.f32 %p4, %r113, %r29; + setp.eq.f32 %p5, %r114, %r30; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + mov.b64 {%r31, %r32}, %rd45; + setp.nan.f32 %p6, %r31, %r31; + setp.nan.f32 %p7, %r32, %r32; + setp.nan.f32 %p8, %r113, %r113; + setp.nan.f32 %p9, %r114, %r114; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.nan.bf16 %p10, %rs1, %rs1; + setp.num.bf16 %p11, %rs1, %rs1; + setp.nan.bf16 %p12, %rs2, %rs2; + setp.num.bf16 %p13, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p14, %p8, %p11; + and.pred %p15, %p9, %p13; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p16, %p2, %p14; + or.pred %p17, %p3, %p15; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p18, %p8, %p10; + and.pred %p19, %p9, %p12; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p20, %p4, %p18; + or.pred %p21, %p5, %p19; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + cvt.s64.s32 %rd23, %r116; + cvt.s64.s32 %rd24, %r115; + cvt.s64.s32 %rd25, %r111; + setp.gt.s64 %p22, %rd20, %rd25; + cvt.s64.s32 %rd26, %r112; + setp.gt.s64 %p23, %rd21, %rd26; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p24, %p22, %p20; + and.pred %p25, %p23, %p21; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p26, %p16, %p24; + or.pred %p27, %p17, %p25; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + selp.f32 %r33, %r113, %r29, %p26; + selp.f32 %r34, %r114, %r30, %p27; +$L__tmp2: + .loc 1 37 107 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:37:107 + mov.b32 {%rs3, %rs4}, %r23; + cvt.f32.bf16 %r35, %rs3; + cvt.f32.bf16 %r36, %rs4; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.gt.f32 %p28, %r32, %r36; + setp.gt.f32 %p29, %r31, %r35; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.eq.f32 %p30, %r31, %r35; + setp.eq.f32 %p31, %r32, %r36; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.nan.bf16x2 %p32|%p33, %r23, %r25; + setp.num.bf16x2 %p34|%p35, %r23, %r25; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p36, %p7, %p35; + and.pred %p37, %p6, %p34; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p38, %p29, %p37; + or.pred %p39, %p28, %p36; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p40, %p6, %p32; + and.pred %p41, %p7, %p33; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p42, %p31, %p41; + or.pred %p43, %p30, %p40; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + setp.gt.s64 %p44, %rd19, %rd24; + setp.ge.s64 %p45, %rd19, %rd23; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + and.pred %p46, %p44, %p43; + and.pred %p47, %p45, %p42; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + or.pred %p48, %p39, %p47; + or.pred %p49, %p38, %p46; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + selp.f32 %r37, %r31, %r35, %p49; + selp.f32 %r38, %r32, %r36, %p48; + cvt.u32.u64 %r39, %rd19; + cvt.u32.u64 %r40, %rd18; + add.s32 %r41, %r40, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:40:38 ] + selp.b32 %r42, %r115, %r39, %p49; + selp.b32 %r43, %r116, %r41, %p48; + cvt.u32.u64 %r44, %rd20; + selp.b32 %r45, %r111, %r44, %p26; + cvt.u32.u64 %r46, %rd21; + selp.b32 %r47, %r112, %r46, %p27; +$L__tmp4: + .loc 1 42 46 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:42:46 + selp.f32 %r48, %r38, %r32, %p1; + selp.f32 %r49, %r37, %r31, %p1; + mov.b64 %rd45, {%r49, %r48}; + selp.f32 %r113, %r33, %r113, %p1; + selp.f32 %r114, %r34, %r114, %p1; + .loc 1 43 58 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:43:58 + selp.b32 %r116, %r43, %r116, %p1; + selp.b32 %r115, %r42, %r115, %p1; + selp.b32 %r111, %r45, %r111, %p1; + selp.b32 %r112, %r47, %r112, %p1; + .loc 1 31 40 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:31:40 + add.s64 %rd44, %rd44, 2048; + setp.lt.u64 %p50, %rd44, 149888; + @%p50 bra $L__BB0_1; +// %bb.2: + .loc 1 26 37 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:26:37 + and.b32 %r62, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + mov.b64 {%r63, %r64}, %rd45; + setp.gt.f32 %p57, %r63, %r64; + setp.eq.f32 %p58, %r64, %r63; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p59, %r63, %r63; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.num.f32 %p60, %r64, %r64; + setp.nan.f32 %p61, %r64, %r64; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p62, %p59, %p61; + and.pred %p63, %p59, %p60; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p64, %p57, %p63; + or.pred %p65, %p58, %p62; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p66, %r115, %r116; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p67, %p66, %p65; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p68, %p64, %p67; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r65, %r63, %r64, %p68; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r66, %r115, %r116, %p68; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p69, %r65, %r113; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p70, %r65, %r113; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p71, %r65, %r65; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p72, %r113, %r113; + setp.num.f32 %p73, %r113, %r113; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p74, %p71, %p73; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p75, %p69, %p74; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p76, %p72, %p71; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p77, %p70, %p76; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p78, %r66, %r111; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p79, %p78, %p77; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p80, %p75, %p79; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r67, %r65, %r113, %p80; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r68, %r66, %r111, %p80; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p81, %r67, %r114; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p82, %r67, %r114; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p83, %r67, %r67; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p84, %r114, %r114; + setp.num.f32 %p85, %r114, %r114; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p86, %p83, %p85; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p87, %p81, %p86; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p88, %p84, %p83; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p89, %p82, %p88; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p90, %r68, %r112; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p91, %p90, %p89; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p92, %p87, %p91; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r69, %r67, %r114, %p92; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r70, %r68, %r112, %p92; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r71, %r69, 16, 31, -1; + shfl.sync.bfly.b32 %r72, %r70, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p93, %r69, %r71; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p94, %r69, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p95, %r69, %r69; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p96, %r71, %r71; + setp.num.f32 %p97, %r71, %r71; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p98, %p95, %p97; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p99, %p93, %p98; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p100, %p95, %p96; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p101, %p94, %p100; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p102, %r70, %r72; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p103, %p102, %p101; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p104, %p99, %p103; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r73, %r69, %r71, %p104; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r74, %r70, %r72, %p104; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r75, %r73, 8, 31, -1; + shfl.sync.bfly.b32 %r76, %r74, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p105, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p106, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p107, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p108, %r75, %r75; + setp.num.f32 %p109, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p110, %p107, %p109; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p111, %p105, %p110; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p112, %p108, %p107; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p113, %p106, %p112; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p114, %r74, %r76; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p115, %p114, %p113; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p116, %p111, %p115; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r77, %r73, %r75, %p116; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r78, %r74, %r76, %p116; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r79, %r77, 4, 31, -1; + shfl.sync.bfly.b32 %r80, %r78, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p117, %r77, %r79; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p118, %r77, %r79; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p119, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p120, %r79, %r79; + setp.num.f32 %p121, %r79, %r79; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p122, %p119, %p121; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p123, %p117, %p122; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p124, %p120, %p119; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p125, %p118, %p124; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p126, %r78, %r80; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p127, %p126, %p125; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p128, %p123, %p127; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r81, %r77, %r79, %p128; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r82, %r78, %r80, %p128; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r83, %r81, 2, 31, -1; + shfl.sync.bfly.b32 %r84, %r82, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p129, %r81, %r83; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p130, %r81, %r83; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p131, %r81, %r81; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p132, %r83, %r83; + setp.num.f32 %p133, %r83, %r83; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p134, %p131, %p133; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p135, %p129, %p134; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p136, %p132, %p131; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p137, %p130, %p136; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p138, %r82, %r84; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p139, %p138, %p137; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p140, %p135, %p139; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r85, %r81, %r83, %p140; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r86, %r82, %r84, %p140; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r87, %r85, 1, 31, -1; + shfl.sync.bfly.b32 %r88, %r86, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p141, %r85, %r87; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p142, %r85, %r87; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p143, %r85, %r85; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p144, %r87, %r87; + setp.num.f32 %p145, %r87, %r87; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p146, %p143, %p145; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p147, %p141, %p146; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p148, %p144, %p143; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p149, %p142, %p148; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p150, %r86, %r88; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p151, %p150, %p149; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p152, %p147, %p151; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r53, %r86, %r88, %p152; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.b32 %p51, %r62, 0; + shr.u32 %r89, %r2, 3; + and.b32 %r90, %r89, 60; + mov.b32 %r91, global_smem; + add.s32 %r50, %r91, %r90; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r51, %r85, %r87, %p152; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + // begin inline asm + @%p51 st.shared.b32 [ %r50 + 0 ], %r51; + // end inline asm + add.s32 %r92, %r91, 64; + add.s32 %r52, %r92, %r90; + // begin inline asm + @%p51 st.shared.b32 [ %r52 + 0 ], %r53; + // end inline asm + bar.sync 0; + setp.lt.u32 %p53, %r2, 16; + add.s32 %r55, %r91, %r20; + // begin inline asm + @%p53 ld.shared.b32 %r54, [ %r55 + 0 ]; + // end inline asm + add.s32 %r57, %r92, %r20; + // begin inline asm + @%p53 ld.shared.b32 %r56, [ %r57 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r94, %r54, 8, 31, -1; + shfl.sync.bfly.b32 %r95, %r56, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p153, %r54, %r94; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p154, %r54, %r94; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p155, %r54, %r54; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p156, %r94, %r94; + setp.num.f32 %p157, %r94, %r94; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p158, %p155, %p157; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p159, %p153, %p158; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p160, %p155, %p156; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p161, %p154, %p160; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p162, %r56, %r95; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p163, %p162, %p161; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p164, %p159, %p163; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r96, %r54, %r94, %p164; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r97, %r56, %r95, %p164; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r98, %r96, 4, 31, -1; + shfl.sync.bfly.b32 %r99, %r97, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p165, %r96, %r98; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p166, %r96, %r98; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p167, %r96, %r96; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p168, %r98, %r98; + setp.num.f32 %p169, %r98, %r98; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p170, %p167, %p169; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p171, %p165, %p170; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p172, %p168, %p167; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p173, %p166, %p172; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p174, %r97, %r99; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p175, %p174, %p173; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p176, %p171, %p175; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r100, %r96, %r98, %p176; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r101, %r97, %r99, %p176; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r102, %r100, 2, 31, -1; + shfl.sync.bfly.b32 %r103, %r101, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p177, %r100, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p178, %r100, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p179, %r100, %r100; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p180, %r102, %r102; + setp.num.f32 %p181, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p182, %p179, %p181; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p183, %p177, %p182; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p184, %p180, %p179; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p185, %p178, %p184; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p186, %r101, %r103; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p187, %p186, %p185; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p188, %p183, %p187; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.f32 %r104, %r100, %r102, %p188; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r105, %r101, %r103, %p188; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + shfl.sync.bfly.b32 %r106, %r104, 1, 31, -1; + shfl.sync.bfly.b32 %r107, %r105, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.gt.f32 %p189, %r104, %r106; + .loc 2 145 23 // triton_helpers.py:145:23 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.f32 %p190, %r104, %r106; + .loc 2 147 29 // triton_helpers.py:147:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p191, %r104, %r104; + .loc 2 148 29 // triton_helpers.py:148:29 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.nan.f32 %p192, %r106, %r106; + setp.num.f32 %p193, %r106, %r106; + .loc 2 149 27 // triton_helpers.py:149:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p194, %p191, %p193; + .loc 2 149 16 // triton_helpers.py:149:16 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p195, %p189, %p194; + .loc 2 151 27 // triton_helpers.py:151:27 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p196, %p192, %p191; + .loc 2 151 17 // triton_helpers.py:151:17 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p197, %p190, %p196; + .loc 2 154 31 // triton_helpers.py:154:31 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.lt.s32 %p198, %r105, %r107; + .loc 2 154 21 // triton_helpers.py:154:21 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + and.pred %p199, %p198, %p197; + .loc 2 154 12 // triton_helpers.py:154:12 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + or.pred %p200, %p195, %p199; + .loc 2 155 69 // triton_helpers.py:155:69 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r61, %r105, %r107, %p200; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + setp.eq.b32 %p55, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + selp.b32 %r59, %r104, %r106, %p200; + .loc 2 165 42 // triton_helpers.py:165:42 @[ cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:44:75 ] + // begin inline asm + @%p55 st.shared.b32 [ %r55 + 0 ], %r59; + // end inline asm + // begin inline asm + @%p55 st.shared.b32 [ %r57 + 0 ], %r61; + // end inline asm + bar.sync 0; + ld.shared.b32 %r108, [global_smem+64]; +$L__tmp6: + .loc 1 46 31 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:46:31 + mad.wide.u32 %rd29, %r1, 8, %rd12; + .loc 1 46 36 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:46:36 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd28, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b64 { %rd28 }, [ %rd29 + 0 ], %rd27; + // end inline asm + .loc 1 48 18 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:48:18 + add.s32 %r109, %r108, 151936; + .loc 1 49 18 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:49:18 + setp.lt.s32 %p201, %r108, 0; + .loc 1 50 32 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:50:32 + selp.b32 %r15, %r109, %r108, %p201; + .loc 1 51 36 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:51:36 + setp.lt.u32 %p202, %r15, 151936; + .loc 1 51 52 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:51:52 + @%p202 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 0 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:0 + cvt.u64.u32 %rd7, %r1; + .loc 1 51 52 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:51:52 + bar.sync 0; + .loc 1 52 30 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:52:30 + cvt.u64.u32 %rd36, %r15; + add.s64 %rd32, %rd11, %rd36; + .loc 1 52 37 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:52:37 + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs5, 0x0; + ld.global.L1::evict_last.L2::cache_hint.b8 { %rs5 }, [ %rd32 + 0 ], %rd33; + // end inline asm + and.b16 %rs6, %rs5, 255; + setp.eq.b16 %p204, %rs6, 0; + .loc 1 55 20 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:55:20 + selp.b64 %rd34, 0, %rd28, %p204; + .loc 1 56 4 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:56:4 + bar.sync 0; + .loc 1 57 28 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:57:28 + shl.b64 %rd37, %rd7, 3; + add.s64 %rd35, %rd9, %rd37; + .loc 1 57 40 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:57:40 + and.b32 %r110, %r2, 511; + setp.eq.b32 %p203, %r110, 0; + // begin inline asm + @%p203 st.global.b64 [ %rd35 + 0 ], { %rd34 }; + // end inline asm + .loc 1 57 4 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:57:4 + ret; +$L__BB0_3: + .loc 1 51 52 // cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py:51:52 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd38, assertFunc_0; + cvta.global.u64 %rd39, %rd38; + st.param.b64 [param3], %rd39; + mov.b64 %rd40, assertFile_0; + cvta.global.u64 %rd41, %rd40; + st.param.b64 [param1], %rd41; + mov.b64 %rd42, assertMessage_0; + cvta.global.u64 %rd43, %rd42; + st.param.b64 [param0], %rd43; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 263 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x100 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 109 +.b8 108 +.b8 103 +.b8 114 +.b8 102 +.b8 114 +.b8 112 +.b8 121 +.b8 118 +.b8 108 +.b8 53 +.b8 103 +.b8 113 +.b8 108 +.b8 112 +.b8 104 +.b8 100 +.b8 102 +.b8 105 +.b8 113 +.b8 120 +.b8 112 +.b8 113 +.b8 97 +.b8 119 +.b8 108 +.b8 111 +.b8 54 +.b8 119 +.b8 121 +.b8 106 +.b8 116 +.b8 112 +.b8 119 +.b8 113 +.b8 107 +.b8 107 +.b8 121 +.b8 54 +.b8 122 +.b8 121 +.b8 119 +.b8 107 +.b8 107 +.b8 100 +.b8 107 +.b8 52 +.b8 104 +.b8 52 +.b8 104 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 109 +.b8 108 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x39 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc4:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd9:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf1:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..6fbf15e2ebe5f0ad4c8612d3b5eee3d7f27d4a48 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.source @@ -0,0 +1,348 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":18:0) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc58 = loc(unknown) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc83 = loc("in_out_ptr0"(#loc)) +#loc84 = loc("in_ptr0"(#loc)) +#loc85 = loc("in_ptr1"(#loc)) +#loc86 = loc("in_ptr2"(#loc)) +#loc87 = loc("xnumel"(#loc)) +#loc88 = loc("r0_numel"(#loc)) +#loc123 = loc("a_value"(#loc46)) +#loc124 = loc("a_index"(#loc46)) +#loc125 = loc("b_value"(#loc46)) +#loc126 = loc("b_index"(#loc46)) +#loc139 = loc("x"(#loc66)) +#loc140 = loc("x"(#loc70)) +#loc141 = loc("value"(#loc79)) +#loc142 = loc("index"(#loc79)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc89) + %r0_numel_1 = arith.constant 151936 : i32 loc(#loc90) + %xoffset = tt.get_program_id x : i32 loc(#loc91) + %xoffset_2 = arith.constant 1 : i32 loc(#loc92) + %xoffset_3 = arith.constant 1 : i32 loc(#loc92) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc92) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc93) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc94) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc95) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc95) + %xmask = arith.constant true loc(#loc96) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc96) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc97) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc98) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc99) + %_tmp2_10 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc99) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc100) + %_tmp2_index_11 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc100) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp2_index_12:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_26 = %_tmp2_10, %_tmp2_index_27 = %_tmp2_index_11) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc102) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc102) + %r0_mask = arith.constant dense<151936> : tensor<1x2048xi32> loc(#loc103) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x2048xi32> loc(#loc103) + %tmp0 = arith.constant 151936 : i32 loc(#loc104) + %tmp0_30 = arith.constant 151936 : i32 loc(#loc104) + %tmp0_31 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc104) + %tmp0_32 = arith.muli %tmp0_31, %xindex_7 : tensor<1x1xi32> loc(#loc104) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc105) + %tmp0_34 = arith.addi %r0_index_28, %tmp0_33 : tensor<1x2048xi32> loc(#loc105) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc106) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc106) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc107) + %tmp0_38 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc107) + %tmp0_39 = arith.truncf %tmp0_38 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc107) + %tmp0_40 = tt.load %tmp0_36, %r0_mask_29, %tmp0_39 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc107) + %tmp0_41 = arith.extf %tmp0_40 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc108) + %10:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_26, %_tmp2_index_27, %tmp0_41, %r0_index_28) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc21) + %_tmp2_42 = arith.select %r0_mask_29, %10#0, %_tmp2_26 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc109) + %_tmp2_index_43 = arith.select %r0_mask_29, %10#1, %_tmp2_index_27 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc110) + scf.yield %_tmp2_42, %_tmp2_index_43 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc24) + } loc(#loc143) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_12#0, %_tmp2_index_12#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc25) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc111) + %tmp11 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc112) + %tmp11_13 = tt.addptr %tmp11, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc112) + %tmp11_14 = tt.load %tmp11_13 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc113) + %tmp3 = arith.constant 151936 : i32 loc(#loc114) + %tmp3_15 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc114) + %tmp4 = arith.addi %tmp2, %tmp3_15 : tensor<1x1xi32> loc(#loc115) + %tmp5 = arith.constant 0 : i32 loc(#loc116) + %tmp5_16 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc116) + %tmp5_17 = arith.cmpi slt, %tmp2, %tmp5_16 : tensor<1x1xi32> loc(#loc116) + %tmp6 = arith.select %tmp5_17, %tmp4, %tmp2 : tensor<1x1xi1>, tensor<1x1xi32> loc(#loc117) + %c0_i32_18 = arith.constant 0 : i32 loc(#loc33) + %cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc33) + %5 = arith.cmpi sle, %cst, %tmp6 : tensor<1x1xi32> loc(#loc33) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc34) + %cst_19 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc34) + %6 = arith.cmpi slt, %tmp6, %cst_19 : tensor<1x1xi32> loc(#loc34) + %7 = arith.andi %5, %6 : tensor<1x1xi1> loc(#loc35) + tt.assert %7, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc36) + %tmp8 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc118) + %tmp8_20 = tt.addptr %tmp8, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc118) + %tmp8_21 = tt.bitcast %tmp8_20 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc119) + %tmp8_22 = tt.load %tmp8_21 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc119) + %tmp8_23 = arith.constant 0 : i8 loc(#loc119) + %tmp8_24 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc119) + %tmp8_25 = arith.cmpi ne, %tmp8_22, %tmp8_24 : tensor<1x1xi8> loc(#loc119) + %tmp9 = arith.extui %tmp8_25 : tensor<1x1xi1> to tensor<1x1xi32> loc(#loc120) + %tmp10 = arith.extsi %tmp9 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc121) + %tmp12 = arith.muli %tmp10, %tmp11_14 : tensor<1x1xi64> loc(#loc122) + gpu.barrier loc(#loc42) + %8 = tt.splat %in_out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc43) + %9 = tt.addptr %8, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc43) + tt.store %9, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc44) + tt.return loc(#loc45) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc46)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc46)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc46)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc46))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc144) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc145) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc49) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc129) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc130) + %mask_3 = arith.constant true loc(#loc131) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc131) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc131) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc132) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc146) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc134) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc147) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc147) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc58) + } loc(#loc50) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc136) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc137) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc138) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc62) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc63) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc64) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc65) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc65) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc65) + } loc(#loc46) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc66))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc67) + %true = arith.constant true loc(#loc68) + tt.return %true : i1 loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc69) + tt.return %1 : i1 loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc70))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc71) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc72) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc72) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc72) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc72) + tt.return %4 : tensor<1x2048xf32> loc(#loc73) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc74) + tt.return %5 : tensor<1x2048xf32> loc(#loc74) + } loc(#loc70) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc76) + %cst = arith.constant dense : tensor<1xi1> loc(#loc76) + tt.return %cst : tensor<1xi1> loc(#loc77) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc78) + tt.return %0 : tensor<1xi1> loc(#loc78) + } loc(#loc75) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc79)), %index: tensor<1x2048xi32> loc("index"(#loc79))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc80) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc80) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc80) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc81) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc82) + %2 = ub.poison : tensor<1xi32> loc(#loc82) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc82) + } loc(#loc79) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc46)), %a_index: i32 loc("a_index"(#loc46)), %b_value: f32 loc("b_value"(#loc46)), %b_index: i32 loc("b_index"(#loc46))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc144) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc145) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc49) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc129) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc130) + %mask_3 = arith.constant true loc(#loc131) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc131) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc132) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc146) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc134) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc147) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc147) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc58) + } loc(#loc50) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc136) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc137) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc138) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc62) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc63) + tt.return %2, %3 : f32, i32 loc(#loc64) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc65) + %5 = ub.poison : i32 loc(#loc65) + tt.return %4, %5 : f32, i32 loc(#loc65) + } loc(#loc46) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc66))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc67) + %true = arith.constant true loc(#loc68) + tt.return %true : i1 loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc69) + tt.return %1 : i1 loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc70))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc71) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc72) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc72) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc72) + tt.return %3 : tensor<1xf32> loc(#loc73) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc74) + tt.return %4 : tensor<1xf32> loc(#loc74) + } loc(#loc70) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":29:55) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":30:58) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:53) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:107) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":40:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":42:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:58) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":44:75) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":45:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:31) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:36) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":47:40) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":48:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":49:18) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":50:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:27) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:43) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:36) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:52) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:30) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:37) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":53:19) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":54:20) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":55:20) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":56:4) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:28) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:40) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:4) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc89 = loc("xnumel"(#loc1)) +#loc90 = loc("r0_numel"(#loc2)) +#loc91 = loc("xoffset"(#loc3)) +#loc92 = loc("xoffset"(#loc4)) +#loc93 = loc("xindex"(#loc5)) +#loc94 = loc("xindex"(#loc6)) +#loc95 = loc("xindex"(#loc7)) +#loc96 = loc("xmask"(#loc8)) +#loc97 = loc("r0_base"(#loc9)) +#loc98 = loc("r0_base"(#loc10)) +#loc99 = loc("_tmp2"(#loc11)) +#loc100 = loc("_tmp2_index"(#loc12)) +#loc101 = loc("_tmp2"(#loc13)) +#loc102 = loc("r0_index"(#loc14)) +#loc103 = loc("r0_mask"(#loc15)) +#loc104 = loc("tmp0"(#loc16)) +#loc105 = loc("tmp0"(#loc17)) +#loc106 = loc("tmp0"(#loc18)) +#loc107 = loc("tmp0"(#loc19)) +#loc108 = loc("tmp0"(#loc20)) +#loc109 = loc("_tmp2"(#loc22)) +#loc110 = loc("_tmp2_index"(#loc23)) +#loc111 = loc("tmp2"(#loc26)) +#loc112 = loc("tmp11"(#loc27)) +#loc113 = loc("tmp11"(#loc28)) +#loc114 = loc("tmp3"(#loc29)) +#loc115 = loc("tmp4"(#loc30)) +#loc116 = loc("tmp5"(#loc31)) +#loc117 = loc("tmp6"(#loc32)) +#loc118 = loc("tmp8"(#loc37)) +#loc119 = loc("tmp8"(#loc38)) +#loc120 = loc("tmp9"(#loc39)) +#loc121 = loc("tmp10"(#loc40)) +#loc122 = loc("tmp12"(#loc41)) +#loc127 = loc("mask"(#loc47)) +#loc128 = loc("equal"(#loc48)) +#loc129 = loc("a_isnan"(#loc51)) +#loc130 = loc("b_isnan"(#loc52)) +#loc131 = loc("mask"(#loc53)) +#loc132 = loc("mask"(#loc54)) +#loc133 = loc("mask"(#loc55)) +#loc134 = loc("equal"(#loc56)) +#loc135 = loc("equal"(#loc57)) +#loc136 = loc("mask"(#loc59)) +#loc137 = loc("mask"(#loc60)) +#loc138 = loc("mask"(#loc61)) +#loc143 = loc("_tmp2_index"(#loc101)) +#loc144 = loc("mask"(#loc127)) +#loc145 = loc("equal"(#loc128)) +#loc146 = loc("mask"(#loc133)) +#loc147 = loc("equal"(#loc135)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..59b38b9c39b585ae77f756ee0f3ceb8426bf4052 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttgir @@ -0,0 +1,227 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":18:0) +#loc1 = loc(unknown) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":44:75) +#loc51 = loc("in_out_ptr0"(#loc)) +#loc52 = loc("in_ptr0"(#loc)) +#loc53 = loc("in_ptr1"(#loc)) +#loc54 = loc("in_ptr2"(#loc)) +#loc55 = loc("xnumel"(#loc)) +#loc56 = loc("r0_numel"(#loc)) +#loc84 = loc(callsite(#loc1 at #loc31)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<151936> : tensor<1x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<151936> : tensor<1x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0> : tensor<1x1xi8, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<151936> : tensor<1x2048xi32, #blocked1> loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_7 = arith.constant dense : tensor<1x2048xi1, #blocked1> loc(#loc1) + %cst_8 = arith.constant dense<0> : tensor<1x1xi32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc57) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc58) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x2048xi32, #blocked1> loc(#loc58) + %tmp0 = arith.muli %xoffset, %c151936_i32 : i32 loc(#loc59) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked1> loc(#loc96) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked1> loc(#loc61) + %_tmp2_index:2 = scf.for %_tmp2_index_23 = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_24 = %cst_2) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp2_index_23 : i32 -> tensor<1x2048xi32, #blocked1> loc(#loc63) + %r0_index_25 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32, #blocked1> loc(#loc63) + %r0_mask = arith.cmpi slt, %r0_index_25, %cst_5 : tensor<1x2048xi32, #blocked1> loc(#loc64) + %tmp0_26 = arith.addi %r0_index_25, %tmp0_10 : tensor<1x2048xi32, #blocked1> loc(#loc60) + %tmp0_27 = tt.addptr %tmp0_11, %tmp0_26 : tensor<1x2048x!tt.ptr, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc61) + %tmp0_28 = tt.load %tmp0_27, %r0_mask, %cst_6 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked1> loc(#loc65) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1> loc(#loc66) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_29 : tensor<1x2048xf32, #blocked1> loc(#loc121) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_29 : tensor<1x2048xf32, #blocked1> loc(#loc122) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked1> loc(#loc100) + %b_isnan = arith.cmpf une, %tmp0_29, %tmp0_29 : tensor<1x2048xf32, #blocked1> loc(#loc101) + %mask_30 = arith.xori %b_isnan, %cst_7 : tensor<1x2048xi1, #blocked1> loc(#loc102) + %mask_31 = arith.andi %a_isnan, %mask_30 : tensor<1x2048xi1, #blocked1> loc(#loc103) + %mask_32 = arith.ori %mask, %mask_31 : tensor<1x2048xi1, #blocked1> loc(#loc123) + %equal_33 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked1> loc(#loc105) + %equal_34 = arith.ori %equal, %equal_33 : tensor<1x2048xi1, #blocked1> loc(#loc124) + %mask_35 = arith.cmpi slt, %_tmp2_index_24, %r0_index_25 : tensor<1x2048xi32, #blocked1> loc(#loc107) + %mask_36 = arith.andi %equal_34, %mask_35 : tensor<1x2048xi1, #blocked1> loc(#loc108) + %mask_37 = arith.ori %mask_32, %mask_36 : tensor<1x2048xi1, #blocked1> loc(#loc109) + %6 = arith.select %mask_37, %_tmp2, %tmp0_29 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc79) + %7 = arith.select %mask_37, %_tmp2_index_24, %r0_index_25 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc80) + %_tmp2_38 = arith.select %r0_mask, %6, %_tmp2 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1> loc(#loc81) + %_tmp2_index_39 = arith.select %r0_mask, %7, %_tmp2_index_24 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc82) + scf.yield %_tmp2_38, %_tmp2_index_39 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1> loc(#loc29) + } loc(#loc97) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc31)), %arg7: i32 loc(callsite(#loc1 at #loc31)), %arg8: f32 loc(callsite(#loc1 at #loc31)), %arg9: i32 loc(callsite(#loc1 at #loc31))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc125) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc126) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc110) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc111) + %mask_23 = arith.xori %b_isnan, %true : i1 loc(#loc112) + %mask_24 = arith.andi %a_isnan, %mask_23 : i1 loc(#loc113) + %mask_25 = arith.ori %mask, %mask_24 : i1 loc(#loc127) + %equal_26 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc114) + %equal_27 = arith.ori %equal, %equal_26 : i1 loc(#loc128) + %mask_28 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc115) + %mask_29 = arith.andi %equal_27, %mask_28 : i1 loc(#loc116) + %mask_30 = arith.ori %mask_25, %mask_29 : i1 loc(#loc117) + %6 = arith.select %mask_30, %arg6, %arg8 : f32 loc(#loc118) + %7 = arith.select %mask_30, %arg7, %arg9 : i32 loc(#loc119) + tt.reduce.return %6, %7 : f32, i32 loc(#loc83) + }) : (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi32, #blocked1>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>) loc(#loc83) + %tmp8 = ttg.convert_layout %0#1 : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc85) + %tmp2 = tt.expand_dims %tmp8 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc86) + %tmp2_12 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xi32, #blocked1> loc(#loc86) + %tmp11 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc87) + %tmp11_13 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc88) + %tmp11_14 = tt.load %tmp11_13 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc88) + %tmp4 = arith.addi %tmp2, %cst_0 : tensor<1x1xi32, #blocked> loc(#loc89) + %tmp4_15 = arith.addi %tmp2_12, %cst_1 : tensor<1x1xi32, #blocked1> loc(#loc89) + %tmp5 = arith.cmpi slt, %tmp2, %cst : tensor<1x1xi32, #blocked> loc(#loc90) + %tmp5_16 = arith.cmpi slt, %tmp2_12, %cst_8 : tensor<1x1xi32, #blocked1> loc(#loc90) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1, #blocked>, tensor<1x1xi32, #blocked> loc(#loc91) + %tmp6_17 = arith.select %tmp5_16, %tmp4_15, %tmp2_12 : tensor<1x1xi1, #blocked1>, tensor<1x1xi32, #blocked1> loc(#loc91) + %1 = arith.cmpi sge, %tmp6_17, %cst_8 : tensor<1x1xi32, #blocked1> loc(#loc39) + %2 = arith.cmpi slt, %tmp6_17, %cst_1 : tensor<1x1xi32, #blocked1> loc(#loc40) + %3 = arith.andi %1, %2 : tensor<1x1xi1, #blocked1> loc(#loc41) + tt.assert %3, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1, #blocked1> loc(#loc42) + %tmp8_18 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc92) + %tmp8_19 = tt.addptr %tmp8_18, %tmp6 : tensor<1x1x!tt.ptr, #blocked>, tensor<1x1xi32, #blocked> loc(#loc92) + %tmp8_20 = tt.bitcast %tmp8_19 : tensor<1x1x!tt.ptr, #blocked> -> tensor<1x1x!tt.ptr, #blocked> loc(#loc85) + %tmp8_21 = tt.load %tmp8_20 evictionPolicy = evict_last : tensor<1x1x!tt.ptr, #blocked> loc(#loc85) + %tmp8_22 = arith.cmpi ne, %tmp8_21, %cst_4 : tensor<1x1xi8, #blocked> loc(#loc85) + %tmp10 = arith.extui %tmp8_22 : tensor<1x1xi1, #blocked> to tensor<1x1xi64, #blocked> loc(#loc120) + %tmp12 = arith.muli %tmp10, %tmp11_14 : tensor<1x1xi64, #blocked> loc(#loc95) + gpu.barrier loc(#loc47) + %4 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc48) + %5 = tt.splat %4 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc49) + tt.store %5, %tmp12 : tensor<1x1x!tt.ptr, #blocked> loc(#loc49) + tt.return loc(#loc50) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:48) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:53) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:107) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":40:38) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":42:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:58) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:8) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:37) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":45:20) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:31) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:36) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":48:18) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":49:18) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":50:32) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:27) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:43) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:36) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:52) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:30) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":54:20) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":53:19) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":55:20) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":56:4) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:28) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:40) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:4) +#loc57 = loc("xoffset"(#loc2)) +#loc58 = loc("r0_base"(#loc3)) +#loc59 = loc("tmp0"(#loc4)) +#loc60 = loc("tmp0"(#loc5)) +#loc61 = loc("tmp0"(#loc6)) +#loc62 = loc("_tmp2"(#loc7)) +#loc63 = loc("r0_index"(#loc8)) +#loc64 = loc("r0_mask"(#loc9)) +#loc65 = loc("tmp0"(#loc10)) +#loc66 = loc("tmp0"(#loc11)) +#loc67 = loc("mask"(#loc12)) +#loc68 = loc("equal"(#loc14)) +#loc69 = loc("a_isnan"(#loc15)) +#loc70 = loc("b_isnan"(#loc16)) +#loc71 = loc("mask"(#loc17)) +#loc72 = loc("mask"(#loc18)) +#loc73 = loc("mask"(#loc19)) +#loc74 = loc("equal"(#loc20)) +#loc75 = loc("equal"(#loc21)) +#loc76 = loc("mask"(#loc22)) +#loc77 = loc("mask"(#loc23)) +#loc78 = loc("mask"(#loc24)) +#loc79 = loc(callsite(#loc25 at #loc13)) +#loc80 = loc(callsite(#loc26 at #loc13)) +#loc81 = loc("_tmp2"(#loc27)) +#loc82 = loc("_tmp2_index"(#loc28)) +#loc83 = loc(callsite(#loc30 at #loc31)) +#loc85 = loc("tmp8"(#loc32)) +#loc86 = loc("tmp2"(#loc33)) +#loc87 = loc("tmp11"(#loc34)) +#loc88 = loc("tmp11"(#loc35)) +#loc89 = loc("tmp4"(#loc36)) +#loc90 = loc("tmp5"(#loc37)) +#loc91 = loc("tmp6"(#loc38)) +#loc92 = loc("tmp8"(#loc43)) +#loc93 = loc("tmp10"(#loc44)) +#loc94 = loc("tmp9"(#loc45)) +#loc95 = loc("tmp12"(#loc46)) +#loc96 = loc(fused[#loc60, #loc59]) +#loc97 = loc("_tmp2_index"(#loc62)) +#loc98 = loc("mask"(#loc67)) +#loc99 = loc("equal"(#loc68)) +#loc100 = loc(callsite(#loc69 at #loc13)) +#loc101 = loc(callsite(#loc70 at #loc13)) +#loc102 = loc(callsite(#loc71 at #loc13)) +#loc103 = loc(callsite(#loc72 at #loc13)) +#loc104 = loc("mask"(#loc73)) +#loc105 = loc(callsite(#loc74 at #loc13)) +#loc106 = loc("equal"(#loc75)) +#loc107 = loc(callsite(#loc76 at #loc13)) +#loc108 = loc(callsite(#loc77 at #loc13)) +#loc109 = loc(callsite(#loc78 at #loc13)) +#loc110 = loc(callsite(#loc69 at #loc83)) +#loc111 = loc(callsite(#loc70 at #loc83)) +#loc112 = loc(callsite(#loc71 at #loc83)) +#loc113 = loc(callsite(#loc72 at #loc83)) +#loc114 = loc(callsite(#loc74 at #loc83)) +#loc115 = loc(callsite(#loc76 at #loc83)) +#loc116 = loc(callsite(#loc77 at #loc83)) +#loc117 = loc(callsite(#loc78 at #loc83)) +#loc118 = loc(callsite(#loc25 at #loc83)) +#loc119 = loc(callsite(#loc26 at #loc83)) +#loc120 = loc(fused[#loc93, #loc94]) +#loc121 = loc(callsite(#loc98 at #loc13)) +#loc122 = loc(callsite(#loc99 at #loc13)) +#loc123 = loc(callsite(#loc104 at #loc13)) +#loc124 = loc(callsite(#loc106 at #loc13)) +#loc125 = loc(callsite(#loc98 at #loc83)) +#loc126 = loc(callsite(#loc99 at #loc83)) +#loc127 = loc(callsite(#loc104 at #loc83)) +#loc128 = loc(callsite(#loc106 at #loc83)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c13a28d096d0df1dcd38b1440735fcd1da296671 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WSOZR3EFPDZB5YEE3B7OEJRZR2CKBBCCPCMZTDJOURP7DYLWKITQ/triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0.ttir @@ -0,0 +1,224 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":44:75) +#loc54 = loc("in_out_ptr0"(#loc)) +#loc55 = loc("in_ptr0"(#loc)) +#loc56 = loc("in_ptr1"(#loc)) +#loc57 = loc("in_ptr2"(#loc)) +#loc58 = loc("xnumel"(#loc)) +#loc59 = loc("r0_numel"(#loc)) +#loc60 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused__to_copy_argmax_index_mul_unsqueeze_0(%in_out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_out_ptr0"(#loc)), %in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc60) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c151936_i32 = arith.constant 151936 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %tmp8 = arith.constant dense<0> : tensor<1x1xi8> loc(#loc61) + %cst_1 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<151936> : tensor<1x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<151936> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc62) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc63) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc65) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc66) + %_tmp2_index_5:2 = scf.for %r0_offset = %c0_i32 to %c151936_i32 step %c2048_i32 iter_args(%_tmp2_13 = %_tmp2, %_tmp2_index_14 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc68) + %r0_index_15 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst_3 : tensor<1x2048xi32> loc(#loc69) + %tmp0 = arith.muli %xoffset, %c151936_i32 : i32 loc(#loc70) + %tmp0_16 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc103) + %tmp0_17 = arith.addi %r0_index_15, %tmp0_16 : tensor<1x2048xi32> loc(#loc71) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc72) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc72) + %tmp0_20 = tt.load %tmp0_19, %r0_mask, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc73) + %tmp0_21 = arith.extf %tmp0_20 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp2_13, %tmp0_21 : tensor<1x2048xf32> loc(#loc127) + %equal = arith.cmpf oeq, %_tmp2_13, %tmp0_21 : tensor<1x2048xf32> loc(#loc128) + %a_isnan = arith.cmpf une, %_tmp2_13, %_tmp2_13 : tensor<1x2048xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %tmp0_21, %tmp0_21 : tensor<1x2048xf32> loc(#loc107) + %mask_22 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc108) + %mask_23 = arith.andi %a_isnan, %mask_22 : tensor<1x2048xi1> loc(#loc109) + %mask_24 = arith.ori %mask, %mask_23 : tensor<1x2048xi1> loc(#loc129) + %equal_25 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc111) + %equal_26 = arith.ori %equal, %equal_25 : tensor<1x2048xi1> loc(#loc130) + %mask_27 = arith.cmpi slt, %_tmp2_index_14, %r0_index_15 : tensor<1x2048xi32> loc(#loc113) + %mask_28 = arith.andi %equal_26, %mask_27 : tensor<1x2048xi1> loc(#loc114) + %mask_29 = arith.ori %mask_24, %mask_28 : tensor<1x2048xi1> loc(#loc115) + %6 = arith.select %mask_29, %_tmp2_13, %tmp0_21 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc87) + %7 = arith.select %mask_29, %_tmp2_index_14, %r0_index_15 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc88) + %_tmp2_30 = arith.select %r0_mask, %6, %_tmp2_13 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc89) + %_tmp2_index_31 = arith.select %r0_mask, %7, %_tmp2_index_14 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc90) + scf.yield %_tmp2_30, %_tmp2_index_31 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc34) + } loc(#loc102) + %0:2 = "tt.reduce"(%_tmp2_index_5#0, %_tmp2_index_5#1) <{axis = 1 : i32}> ({ + ^bb0(%arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2)), %arg8: f32 loc(callsite(#loc1 at #loc2)), %arg9: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg6, %arg8 : f32 loc(#loc131) + %equal = arith.cmpf oeq, %arg6, %arg8 : f32 loc(#loc132) + %a_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc116) + %b_isnan = arith.cmpf une, %arg8, %arg8 : f32 loc(#loc117) + %mask_13 = arith.xori %b_isnan, %true : i1 loc(#loc118) + %mask_14 = arith.andi %a_isnan, %mask_13 : i1 loc(#loc119) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc133) + %equal_16 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc120) + %equal_17 = arith.ori %equal, %equal_16 : i1 loc(#loc134) + %mask_18 = arith.cmpi slt, %arg7, %arg9 : i32 loc(#loc121) + %mask_19 = arith.andi %equal_17, %mask_18 : i1 loc(#loc122) + %mask_20 = arith.ori %mask_15, %mask_19 : i1 loc(#loc123) + %6 = arith.select %mask_20, %arg6, %arg8 : f32 loc(#loc124) + %7 = arith.select %mask_20, %arg7, %arg9 : i32 loc(#loc125) + tt.reduce.return %6, %7 : f32, i32 loc(#loc91) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc91) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc92) + %tmp11 = tt.addptr %in_ptr2, %xoffset : !tt.ptr, i32 loc(#loc93) + %tmp11_6 = tt.splat %tmp11 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc93) + %tmp11_7 = tt.load %tmp11_6 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc94) + %tmp4 = arith.addi %tmp2, %cst_2 : tensor<1x1xi32> loc(#loc95) + %tmp5 = arith.cmpi slt, %tmp2, %cst_1 : tensor<1x1xi32> loc(#loc96) + %tmp6 = arith.select %tmp5, %tmp4, %tmp2 : tensor<1x1xi1>, tensor<1x1xi32> loc(#loc97) + %1 = arith.cmpi sge, %tmp6, %cst_1 : tensor<1x1xi32> loc(#loc42) + %2 = arith.cmpi slt, %tmp6, %cst_2 : tensor<1x1xi32> loc(#loc43) + %3 = arith.andi %1, %2 : tensor<1x1xi1> loc(#loc44) + tt.assert %3, "index out of bounds: 0 <= tmp6 < 151936" : tensor<1x1xi1> loc(#loc45) + %tmp8_8 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc98) + %tmp8_9 = tt.addptr %tmp8_8, %tmp6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc98) + %tmp8_10 = tt.bitcast %tmp8_9 : tensor<1x1x!tt.ptr> -> tensor<1x1x!tt.ptr> loc(#loc61) + %tmp8_11 = tt.load %tmp8_10 evictionPolicy = evict_last : tensor<1x1x!tt.ptr> loc(#loc61) + %tmp8_12 = arith.cmpi ne, %tmp8_11, %tmp8 : tensor<1x1xi8> loc(#loc61) + %tmp10 = arith.extui %tmp8_12 : tensor<1x1xi1> to tensor<1x1xi64> loc(#loc126) + %tmp12 = arith.muli %tmp10, %tmp11_7 : tensor<1x1xi64> loc(#loc101) + gpu.barrier loc(#loc50) + %4 = tt.addptr %in_out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc51) + %5 = tt.splat %4 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc51) + tt.store %5, %tmp12 : tensor<1x1x!tt.ptr> loc(#loc52) + tt.return loc(#loc53) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":31:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":30:58) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":29:55) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":26:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":26:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":32:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":33:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:53) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":37:107) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":40:38) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":42:46) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:58) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":43:8) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":45:20) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":46:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":48:18) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":49:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":50:32) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:27) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:43) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:36) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":51:52) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":52:30) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":54:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":53:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":55:20) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":56:4) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:40) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ml/cmlgrfrpyvl5gqlphdfiqxpqawlo6wyjtpwqkky6zywkkdk4h4hl.py":57:4) +#loc61 = loc("tmp8"(#loc4)) +#loc62 = loc("_tmp2_index"(#loc5)) +#loc63 = loc("_tmp2"(#loc6)) +#loc64 = loc("xoffset"(#loc7)) +#loc65 = loc("r0_base"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("_tmp2"(#loc3)) +#loc68 = loc("r0_index"(#loc10)) +#loc69 = loc("r0_mask"(#loc11)) +#loc70 = loc("tmp0"(#loc12)) +#loc71 = loc("tmp0"(#loc13)) +#loc72 = loc("tmp0"(#loc14)) +#loc73 = loc("tmp0"(#loc15)) +#loc74 = loc("tmp0"(#loc16)) +#loc75 = loc("mask"(#loc17)) +#loc76 = loc("equal"(#loc19)) +#loc77 = loc("a_isnan"(#loc20)) +#loc78 = loc("b_isnan"(#loc21)) +#loc79 = loc("mask"(#loc22)) +#loc80 = loc("mask"(#loc23)) +#loc81 = loc("mask"(#loc24)) +#loc82 = loc("equal"(#loc25)) +#loc83 = loc("equal"(#loc26)) +#loc84 = loc("mask"(#loc27)) +#loc85 = loc("mask"(#loc28)) +#loc86 = loc("mask"(#loc29)) +#loc87 = loc(callsite(#loc30 at #loc18)) +#loc88 = loc(callsite(#loc31 at #loc18)) +#loc89 = loc("_tmp2"(#loc32)) +#loc90 = loc("_tmp2_index"(#loc33)) +#loc91 = loc(callsite(#loc35 at #loc2)) +#loc92 = loc("tmp2"(#loc36)) +#loc93 = loc("tmp11"(#loc37)) +#loc94 = loc("tmp11"(#loc38)) +#loc95 = loc("tmp4"(#loc39)) +#loc96 = loc("tmp5"(#loc40)) +#loc97 = loc("tmp6"(#loc41)) +#loc98 = loc("tmp8"(#loc46)) +#loc99 = loc("tmp10"(#loc47)) +#loc100 = loc("tmp9"(#loc48)) +#loc101 = loc("tmp12"(#loc49)) +#loc102 = loc("_tmp2_index"(#loc67)) +#loc103 = loc(fused[#loc71, #loc70]) +#loc104 = loc("mask"(#loc75)) +#loc105 = loc("equal"(#loc76)) +#loc106 = loc(callsite(#loc77 at #loc18)) +#loc107 = loc(callsite(#loc78 at #loc18)) +#loc108 = loc(callsite(#loc79 at #loc18)) +#loc109 = loc(callsite(#loc80 at #loc18)) +#loc110 = loc("mask"(#loc81)) +#loc111 = loc(callsite(#loc82 at #loc18)) +#loc112 = loc("equal"(#loc83)) +#loc113 = loc(callsite(#loc84 at #loc18)) +#loc114 = loc(callsite(#loc85 at #loc18)) +#loc115 = loc(callsite(#loc86 at #loc18)) +#loc116 = loc(callsite(#loc77 at #loc91)) +#loc117 = loc(callsite(#loc78 at #loc91)) +#loc118 = loc(callsite(#loc79 at #loc91)) +#loc119 = loc(callsite(#loc80 at #loc91)) +#loc120 = loc(callsite(#loc82 at #loc91)) +#loc121 = loc(callsite(#loc84 at #loc91)) +#loc122 = loc(callsite(#loc85 at #loc91)) +#loc123 = loc(callsite(#loc86 at #loc91)) +#loc124 = loc(callsite(#loc30 at #loc91)) +#loc125 = loc(callsite(#loc31 at #loc91)) +#loc126 = loc(fused[#loc99, #loc100]) +#loc127 = loc(callsite(#loc104 at #loc18)) +#loc128 = loc(callsite(#loc105 at #loc18)) +#loc129 = loc(callsite(#loc110 at #loc18)) +#loc130 = loc(callsite(#loc112 at #loc18)) +#loc131 = loc(callsite(#loc104 at #loc91)) +#loc132 = loc(callsite(#loc105 at #loc91)) +#loc133 = loc(callsite(#loc110 at #loc91)) +#loc134 = loc(callsite(#loc112 at #loc91)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/__grp__triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/__grp__triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..76dbe08ea2904477a37a9b353d11f5084a0ad84f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/__grp__triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.source", "triton_red_fused_argmax_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttir", "triton_red_fused_argmax_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttgir", "triton_red_fused_argmax_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.llir", "triton_red_fused_argmax_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ptx", "triton_red_fused_argmax_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.cubin", "triton_red_fused_argmax_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c24f61e8a6f6c2299d51f7b586ea4a0c3d2c7eb1 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bb3e238ed53a340bb282f34cea29127d6c1f5923 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.json @@ -0,0 +1 @@ +{"hash": "b4d1ae81e5cde4116a26b8ef72d2fec8a2e50c8a870ee6673a39801781911254", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_argmax_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..f94763335f1b816e92cdb090186161db8bd41a39 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.llir @@ -0,0 +1,438 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_argmax_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !4 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %8 = icmp slt i32 %7, %2, !dbg !8 + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %10 = shl nuw nsw i32 %9, 2, !dbg !9 + %11 = and i32 %10, 2044, !dbg !9 + %12 = mul i32 %7, 32000, !dbg !10 + %13 = zext nneg i32 %11 to i64, !dbg !11 + br label %14, !dbg !11 + +14: ; preds = %6, %14 + %indvars.iv = phi i64 [ 0, %6 ], [ %indvars.iv.next, %14 ] + %15 = phi i32 [ 2147483647, %6 ], [ %102, %14 ] + %16 = phi i32 [ 2147483647, %6 ], [ %103, %14 ] + %17 = phi float [ 0xFFF0000000000000, %6 ], [ %99, %14 ] + %18 = phi float [ 0xFFF0000000000000, %6 ], [ %100, %14 ] + %19 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %98, %14 ] + %20 = phi <2 x i32> [ splat (i32 2147483647), %6 ], [ %101, %14 ] + %21 = or disjoint i64 %indvars.iv, %13, !dbg !12 + %22 = or disjoint i64 %21, 2, !dbg !12 + %23 = or disjoint i64 %21, 3, !dbg !12 + %24 = icmp samesign ult i64 %21, 32000, !dbg !13 + %25 = trunc nuw nsw i64 %21 to i32, !dbg !14 + %26 = add i32 %12, %25, !dbg !14 + %27 = sext i32 %26 to i64, !dbg !15 + %28 = getelementptr bfloat, ptr addrspace(1) %0, i64 %27, !dbg !15 + %29 = and i1 %8, %24, !dbg !16 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !17 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %28, i64 %30, i1 %29) #4, !dbg !17 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !17 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !17 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !17 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !17 + %36 = extractelement <2 x bfloat> %35, i64 0, !dbg !17 + %37 = extractelement <2 x bfloat> %35, i64 1, !dbg !17 + %38 = fpext bfloat %36 to float, !dbg !18 + %39 = fpext bfloat %37 to float, !dbg !18 + %40 = fcmp ogt float %17, %38, !dbg !19 + %41 = fcmp ogt float %18, %39, !dbg !19 + %42 = fcmp oeq float %17, %38, !dbg !23 + %43 = fcmp oeq float %18, %39, !dbg !23 + %44 = fcmp uno <2 x float> %19, zeroinitializer, !dbg !24 + %45 = fcmp uno float %17, 0.000000e+00, !dbg !24 + %46 = fcmp uno float %18, 0.000000e+00, !dbg !24 + %47 = fcmp uno bfloat %36, 0xR0000, !dbg !25 + %48 = fcmp uno bfloat %37, 0xR0000, !dbg !25 + %49 = xor i1 %47, true, !dbg !26 + %50 = xor i1 %48, true, !dbg !26 + %51 = and i1 %45, %49, !dbg !27 + %52 = and i1 %46, %50, !dbg !27 + %53 = or i1 %40, %51, !dbg !28 + %54 = or i1 %41, %52, !dbg !28 + %55 = and i1 %45, %47, !dbg !29 + %56 = and i1 %46, %48, !dbg !29 + %57 = or i1 %42, %55, !dbg !30 + %58 = or i1 %43, %56, !dbg !30 + %59 = sext <2 x i32> %20 to <2 x i64>, !dbg !31 + %60 = sext i32 %15 to i64, !dbg !31 + %61 = icmp sgt i64 %22, %60, !dbg !31 + %62 = sext i32 %16 to i64, !dbg !31 + %63 = icmp sgt i64 %23, %62, !dbg !31 + %64 = and i1 %61, %57, !dbg !32 + %65 = and i1 %63, %58, !dbg !32 + %66 = or i1 %53, %64, !dbg !33 + %67 = or i1 %54, %65, !dbg !33 + %68 = select i1 %66, float %17, float %38, !dbg !34 + %69 = select i1 %67, float %18, float %39, !dbg !34 + %70 = trunc nuw nsw i64 %21 to i32, !dbg !35 + %71 = or disjoint i32 %70, 1, !dbg !35 + %72 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !18 + %73 = fcmp ogt <2 x float> %19, %72, !dbg !19 + %74 = fcmp oeq <2 x float> %19, %72, !dbg !23 + %75 = fcmp uno <2 x bfloat> %33, zeroinitializer, !dbg !25 + %76 = xor <2 x i1> %75, splat (i1 true), !dbg !26 + %77 = and <2 x i1> %44, %76, !dbg !27 + %78 = or <2 x i1> %73, %77, !dbg !28 + %79 = and <2 x i1> %44, %75, !dbg !29 + %80 = or <2 x i1> %74, %79, !dbg !30 + %81 = insertelement <2 x i64> poison, i64 %21, i64 0, !dbg !31 + %82 = shufflevector <2 x i64> %81, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !31 + %83 = icmp sgt <2 x i64> %82, %59, !dbg !31 + %84 = icmp sge <2 x i64> %82, %59, !dbg !31 + %85 = shufflevector <2 x i1> %83, <2 x i1> %84, <2 x i32> , !dbg !31 + %86 = and <2 x i1> %85, %80, !dbg !32 + %87 = or <2 x i1> %78, %86, !dbg !33 + %88 = select <2 x i1> %87, <2 x float> %19, <2 x float> %72, !dbg !34 + %89 = insertelement <2 x i32> poison, i32 %25, i64 0, !dbg !35 + %90 = insertelement <2 x i32> %89, i32 %71, i64 1, !dbg !35 + %91 = select <2 x i1> %87, <2 x i32> %20, <2 x i32> %90, !dbg !35 + %92 = trunc nuw nsw i64 %22 to i32, !dbg !35 + %93 = select i1 %66, i32 %15, i32 %92, !dbg !35 + %94 = trunc nuw nsw i64 %23 to i32, !dbg !35 + %95 = select i1 %67, i32 %16, i32 %94, !dbg !35 + %96 = insertelement <2 x i1> poison, i1 %29, i64 0, !dbg !36 + %97 = shufflevector <2 x i1> %96, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !36 + %98 = select <2 x i1> %97, <2 x float> %88, <2 x float> %19, !dbg !36 + %99 = select i1 %29, float %68, float %17, !dbg !36 + %100 = select i1 %29, float %69, float %18, !dbg !36 + %101 = select <2 x i1> %97, <2 x i32> %91, <2 x i32> %20, !dbg !37 + %102 = select i1 %29, i32 %93, i32 %15, !dbg !37 + %103 = select i1 %29, i32 %95, i32 %16, !dbg !37 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2048, !dbg !11 + %104 = icmp samesign ult i64 %indvars.iv, 29952, !dbg !11 + br i1 %104, label %14, label %105, !dbg !11 + +105: ; preds = %14 + %106 = and i32 %9, 31, !dbg !9 + %107 = lshr i32 %9, 5, !dbg !9 + %108 = shufflevector <2 x float> %98, <2 x float> poison, <2 x i32> , !dbg !38 + %109 = fcmp ogt <2 x float> %98, %108, !dbg !38 + %110 = fcmp oeq <2 x float> %98, %108, !dbg !38 + %111 = shufflevector <2 x i1> %109, <2 x i1> %110, <2 x i32> , !dbg !38 + %112 = extractelement <2 x float> %98, i64 0, !dbg !40 + %113 = fcmp uno float %112, 0.000000e+00, !dbg !40 + %114 = extractelement <2 x float> %98, i64 1, !dbg !41 + %115 = fcmp uno float %114, 0.000000e+00, !dbg !41 + %116 = xor i1 %115, true, !dbg !42 + %117 = insertelement <2 x i1> poison, i1 %113, i64 0, !dbg !43 + %118 = shufflevector <2 x i1> %117, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !43 + %119 = insertelement <2 x i1> poison, i1 %116, i64 0, !dbg !43 + %120 = insertelement <2 x i1> %119, i1 %115, i64 1, !dbg !43 + %121 = and <2 x i1> %118, %120, !dbg !43 + %122 = or <2 x i1> %111, %121, !dbg !44 + %123 = extractelement <2 x i32> %101, i64 0, !dbg !45 + %124 = extractelement <2 x i32> %101, i64 1, !dbg !45 + %125 = icmp slt i32 %123, %124, !dbg !45 + %126 = extractelement <2 x i1> %122, i64 1, !dbg !46 + %127 = and i1 %125, %126, !dbg !46 + %128 = extractelement <2 x i1> %122, i64 0, !dbg !47 + %129 = or i1 %128, %127, !dbg !47 + %130 = select i1 %129, float %112, float %114, !dbg !48 + %131 = select i1 %129, i32 %123, i32 %124, !dbg !49 + %132 = fcmp ogt float %130, %99, !dbg !38 + %133 = fcmp oeq float %130, %99, !dbg !50 + %134 = fcmp uno float %130, 0.000000e+00, !dbg !40 + %135 = fcmp uno float %99, 0.000000e+00, !dbg !41 + %136 = xor i1 %135, true, !dbg !42 + %137 = and i1 %134, %136, !dbg !43 + %138 = or i1 %132, %137, !dbg !44 + %139 = and i1 %135, %134, !dbg !51 + %140 = or i1 %133, %139, !dbg !52 + %141 = icmp slt i32 %131, %102, !dbg !45 + %142 = and i1 %141, %140, !dbg !46 + %143 = or i1 %138, %142, !dbg !47 + %144 = select i1 %143, float %130, float %99, !dbg !48 + %145 = select i1 %143, i32 %131, i32 %102, !dbg !49 + %146 = fcmp ogt float %144, %100, !dbg !38 + %147 = fcmp oeq float %144, %100, !dbg !50 + %148 = fcmp uno float %144, 0.000000e+00, !dbg !40 + %149 = fcmp uno float %100, 0.000000e+00, !dbg !41 + %150 = xor i1 %149, true, !dbg !42 + %151 = and i1 %148, %150, !dbg !43 + %152 = or i1 %146, %151, !dbg !44 + %153 = and i1 %149, %148, !dbg !51 + %154 = or i1 %147, %153, !dbg !52 + %155 = icmp slt i32 %145, %103, !dbg !45 + %156 = and i1 %155, %154, !dbg !46 + %157 = or i1 %152, %156, !dbg !47 + %158 = select i1 %157, float %144, float %100, !dbg !48 + %159 = select i1 %157, i32 %145, i32 %103, !dbg !49 + %160 = bitcast float %158 to i32, !dbg !53 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 16, i32 31), !dbg !53 + %162 = bitcast i32 %161 to float, !dbg !53 + %163 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 16, i32 31), !dbg !53 + %164 = fcmp ogt float %158, %162, !dbg !38 + %165 = fcmp oeq float %158, %162, !dbg !50 + %166 = fcmp uno float %158, 0.000000e+00, !dbg !40 + %167 = fcmp uno float %162, 0.000000e+00, !dbg !41 + %168 = xor i1 %167, true, !dbg !42 + %169 = and i1 %166, %168, !dbg !43 + %170 = or i1 %164, %169, !dbg !44 + %171 = and i1 %166, %167, !dbg !51 + %172 = or i1 %165, %171, !dbg !52 + %173 = icmp slt i32 %159, %163, !dbg !45 + %174 = and i1 %173, %172, !dbg !46 + %175 = or i1 %170, %174, !dbg !47 + %176 = select i1 %175, float %158, float %162, !dbg !48 + %177 = select i1 %175, i32 %159, i32 %163, !dbg !49 + %178 = bitcast float %176 to i32, !dbg !53 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 8, i32 31), !dbg !53 + %180 = bitcast i32 %179 to float, !dbg !53 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !53 + %182 = fcmp ogt float %176, %180, !dbg !38 + %183 = fcmp oeq float %176, %180, !dbg !50 + %184 = fcmp uno float %176, 0.000000e+00, !dbg !40 + %185 = fcmp uno float %180, 0.000000e+00, !dbg !41 + %186 = xor i1 %185, true, !dbg !42 + %187 = and i1 %184, %186, !dbg !43 + %188 = or i1 %182, %187, !dbg !44 + %189 = and i1 %185, %184, !dbg !51 + %190 = or i1 %183, %189, !dbg !52 + %191 = icmp slt i32 %177, %181, !dbg !45 + %192 = and i1 %191, %190, !dbg !46 + %193 = or i1 %188, %192, !dbg !47 + %194 = select i1 %193, float %176, float %180, !dbg !48 + %195 = select i1 %193, i32 %177, i32 %181, !dbg !49 + %196 = bitcast float %194 to i32, !dbg !53 + %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 4, i32 31), !dbg !53 + %198 = bitcast i32 %197 to float, !dbg !53 + %199 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 4, i32 31), !dbg !53 + %200 = fcmp ogt float %194, %198, !dbg !38 + %201 = fcmp oeq float %194, %198, !dbg !50 + %202 = fcmp uno float %194, 0.000000e+00, !dbg !40 + %203 = fcmp uno float %198, 0.000000e+00, !dbg !41 + %204 = xor i1 %203, true, !dbg !42 + %205 = and i1 %202, %204, !dbg !43 + %206 = or i1 %200, %205, !dbg !44 + %207 = and i1 %203, %202, !dbg !51 + %208 = or i1 %201, %207, !dbg !52 + %209 = icmp slt i32 %195, %199, !dbg !45 + %210 = and i1 %209, %208, !dbg !46 + %211 = or i1 %206, %210, !dbg !47 + %212 = select i1 %211, float %194, float %198, !dbg !48 + %213 = select i1 %211, i32 %195, i32 %199, !dbg !49 + %214 = bitcast float %212 to i32, !dbg !53 + %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 2, i32 31), !dbg !53 + %216 = bitcast i32 %215 to float, !dbg !53 + %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 2, i32 31), !dbg !53 + %218 = fcmp ogt float %212, %216, !dbg !38 + %219 = fcmp oeq float %212, %216, !dbg !50 + %220 = fcmp uno float %212, 0.000000e+00, !dbg !40 + %221 = fcmp uno float %216, 0.000000e+00, !dbg !41 + %222 = xor i1 %221, true, !dbg !42 + %223 = and i1 %220, %222, !dbg !43 + %224 = or i1 %218, %223, !dbg !44 + %225 = and i1 %221, %220, !dbg !51 + %226 = or i1 %219, %225, !dbg !52 + %227 = icmp slt i32 %213, %217, !dbg !45 + %228 = and i1 %227, %226, !dbg !46 + %229 = or i1 %224, %228, !dbg !47 + %230 = select i1 %229, float %212, float %216, !dbg !48 + %231 = select i1 %229, i32 %213, i32 %217, !dbg !49 + %232 = bitcast float %230 to i32, !dbg !53 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !53 + %234 = bitcast i32 %233 to float, !dbg !53 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 1, i32 31), !dbg !53 + %236 = fcmp ogt float %230, %234, !dbg !38 + %237 = fcmp oeq float %230, %234, !dbg !50 + %238 = fcmp uno float %230, 0.000000e+00, !dbg !40 + %239 = fcmp uno float %234, 0.000000e+00, !dbg !41 + %240 = xor i1 %239, true, !dbg !42 + %241 = and i1 %238, %240, !dbg !43 + %242 = or i1 %236, %241, !dbg !44 + %243 = and i1 %239, %238, !dbg !51 + %244 = or i1 %237, %243, !dbg !52 + %245 = icmp slt i32 %231, %235, !dbg !45 + %246 = and i1 %245, %244, !dbg !46 + %247 = or i1 %242, %246, !dbg !47 + %248 = select i1 %247, i32 %231, i32 %235, !dbg !49 + %249 = and i32 %107, 15, !dbg !53 + %250 = icmp eq i32 %106, 0, !dbg !53 + %251 = getelementptr float, ptr addrspace(3) @global_smem, i32 %249, !dbg !53 + %252 = select i1 %247, i32 %232, i32 %233, !dbg !48 + %253 = insertelement <1 x i32> poison, i32 %252, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %251, <1 x i32> %253, i1 %250) #4, !dbg !53 + %254 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %249, !dbg !53 + %255 = insertelement <1 x i32> poison, i32 %248, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %254, <1 x i32> %255, i1 %250) #4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %256 = icmp samesign ult i32 %9, 16, !dbg !53 + %257 = getelementptr float, ptr addrspace(3) @global_smem, i32 %9, !dbg !53 + %258 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %257, i1 %256) #4, !dbg !53 + %259 = bitcast i32 %258 to float, !dbg !53 + %260 = getelementptr i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), i32 %9, !dbg !53 + %261 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %260, i1 %256) #4, !dbg !53 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 8, i32 31), !dbg !53 + %263 = bitcast i32 %262 to float, !dbg !53 + %264 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 8, i32 31), !dbg !53 + %265 = fcmp ogt float %259, %263, !dbg !38 + %266 = fcmp oeq float %259, %263, !dbg !50 + %267 = fcmp uno float %259, 0.000000e+00, !dbg !40 + %268 = fcmp uno float %263, 0.000000e+00, !dbg !41 + %269 = xor i1 %268, true, !dbg !42 + %270 = and i1 %267, %269, !dbg !43 + %271 = or i1 %265, %270, !dbg !44 + %272 = and i1 %267, %268, !dbg !51 + %273 = or i1 %266, %272, !dbg !52 + %274 = icmp slt i32 %261, %264, !dbg !45 + %275 = and i1 %274, %273, !dbg !46 + %276 = or i1 %271, %275, !dbg !47 + %277 = select i1 %276, float %259, float %263, !dbg !48 + %278 = select i1 %276, i32 %261, i32 %264, !dbg !49 + %279 = bitcast float %277 to i32, !dbg !53 + %280 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %279, i32 4, i32 31), !dbg !53 + %281 = bitcast i32 %280 to float, !dbg !53 + %282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 4, i32 31), !dbg !53 + %283 = fcmp ogt float %277, %281, !dbg !38 + %284 = fcmp oeq float %277, %281, !dbg !50 + %285 = fcmp uno float %277, 0.000000e+00, !dbg !40 + %286 = fcmp uno float %281, 0.000000e+00, !dbg !41 + %287 = xor i1 %286, true, !dbg !42 + %288 = and i1 %285, %287, !dbg !43 + %289 = or i1 %283, %288, !dbg !44 + %290 = and i1 %286, %285, !dbg !51 + %291 = or i1 %284, %290, !dbg !52 + %292 = icmp slt i32 %278, %282, !dbg !45 + %293 = and i1 %292, %291, !dbg !46 + %294 = or i1 %289, %293, !dbg !47 + %295 = select i1 %294, float %277, float %281, !dbg !48 + %296 = select i1 %294, i32 %278, i32 %282, !dbg !49 + %297 = bitcast float %295 to i32, !dbg !53 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 2, i32 31), !dbg !53 + %299 = bitcast i32 %298 to float, !dbg !53 + %300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 2, i32 31), !dbg !53 + %301 = fcmp ogt float %295, %299, !dbg !38 + %302 = fcmp oeq float %295, %299, !dbg !50 + %303 = fcmp uno float %295, 0.000000e+00, !dbg !40 + %304 = fcmp uno float %299, 0.000000e+00, !dbg !41 + %305 = xor i1 %304, true, !dbg !42 + %306 = and i1 %303, %305, !dbg !43 + %307 = or i1 %301, %306, !dbg !44 + %308 = and i1 %304, %303, !dbg !51 + %309 = or i1 %302, %308, !dbg !52 + %310 = icmp slt i32 %296, %300, !dbg !45 + %311 = and i1 %310, %309, !dbg !46 + %312 = or i1 %307, %311, !dbg !47 + %313 = select i1 %312, float %295, float %299, !dbg !48 + %314 = select i1 %312, i32 %296, i32 %300, !dbg !49 + %315 = bitcast float %313 to i32, !dbg !53 + %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 1, i32 31), !dbg !53 + %317 = bitcast i32 %316 to float, !dbg !53 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !53 + %319 = fcmp ogt float %313, %317, !dbg !38 + %320 = fcmp oeq float %313, %317, !dbg !50 + %321 = fcmp uno float %313, 0.000000e+00, !dbg !40 + %322 = fcmp uno float %317, 0.000000e+00, !dbg !41 + %323 = xor i1 %322, true, !dbg !42 + %324 = and i1 %321, %323, !dbg !43 + %325 = or i1 %319, %324, !dbg !44 + %326 = and i1 %322, %321, !dbg !51 + %327 = or i1 %320, %326, !dbg !52 + %328 = icmp slt i32 %314, %318, !dbg !45 + %329 = and i1 %328, %327, !dbg !46 + %330 = or i1 %325, %329, !dbg !47 + %331 = select i1 %330, i32 %314, i32 %318, !dbg !49 + %332 = icmp eq i32 %9, 0, !dbg !53 + %333 = select i1 %330, i32 %315, i32 %316, !dbg !48 + %334 = insertelement <1 x i32> poison, i32 %333, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %257, <1 x i32> %334, i1 %332) #4, !dbg !53 + %335 = insertelement <1 x i32> poison, i32 %331, i64 0, !dbg !53 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %260, <1 x i32> %335, i1 %332) #4, !dbg !53 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !53 + %336 = load i32, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 64), align 16, !dbg !53 + %337 = zext nneg i32 %7 to i64, !dbg !54 + %338 = getelementptr i64, ptr addrspace(1) %1, i64 %337, !dbg !54 + %339 = sext i32 %336 to i64, !dbg !55 + %340 = and i32 %9, 511, !dbg !55 + %341 = icmp eq i32 %340, 0, !dbg !55 + %342 = and i1 %341, %8, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %339, ptr addrspace(1) %338, i1 %342) #4, !dbg !55 + ret void, !dbg !56 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_argmax_0", linkageName: "triton_red_fused_argmax_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 36, column: 47, scope: !4) +!11 = !DILocation(line: 30, column: 40, scope: !4) +!12 = !DILocation(line: 31, column: 31, scope: !4) +!13 = !DILocation(line: 32, column: 29, scope: !4) +!14 = !DILocation(line: 36, column: 41, scope: !4) +!15 = !DILocation(line: 36, column: 34, scope: !4) +!16 = !DILocation(line: 36, column: 62, scope: !4) +!17 = !DILocation(line: 36, column: 52, scope: !4) +!18 = !DILocation(line: 36, column: 114, scope: !4) +!19 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !4, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 39, column: 38, scope: !4) +!23 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !22) +!26 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !22) +!27 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !22) +!28 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !22) +!29 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 41, column: 54, scope: !4) +!37 = !DILocation(line: 42, column: 66, scope: !4) +!38 = !DILocation(line: 144, column: 21, scope: !20, inlinedAt: !39) +!39 = !DILocation(line: 43, column: 75, scope: !4) +!40 = !DILocation(line: 147, column: 29, scope: !20, inlinedAt: !39) +!41 = !DILocation(line: 148, column: 29, scope: !20, inlinedAt: !39) +!42 = !DILocation(line: 149, column: 31, scope: !20, inlinedAt: !39) +!43 = !DILocation(line: 149, column: 27, scope: !20, inlinedAt: !39) +!44 = !DILocation(line: 149, column: 16, scope: !20, inlinedAt: !39) +!45 = !DILocation(line: 154, column: 31, scope: !20, inlinedAt: !39) +!46 = !DILocation(line: 154, column: 21, scope: !20, inlinedAt: !39) +!47 = !DILocation(line: 154, column: 12, scope: !20, inlinedAt: !39) +!48 = !DILocation(line: 155, column: 35, scope: !20, inlinedAt: !39) +!49 = !DILocation(line: 155, column: 69, scope: !20, inlinedAt: !39) +!50 = !DILocation(line: 145, column: 23, scope: !20, inlinedAt: !39) +!51 = !DILocation(line: 151, column: 27, scope: !20, inlinedAt: !39) +!52 = !DILocation(line: 151, column: 17, scope: !20, inlinedAt: !39) +!53 = !DILocation(line: 165, column: 42, scope: !20, inlinedAt: !39) +!54 = !DILocation(line: 45, column: 25, scope: !4) +!55 = !DILocation(line: 45, column: 36, scope: !4) +!56 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e6a2fe27246f91fc5ebb89be76bc0591873ba4d5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ptx @@ -0,0 +1,838 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_argmax_0 // -- Begin function triton_red_fused_argmax_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_argmax_0 +.visible .entry triton_red_fused_argmax_0( + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_1, + .param .u32 triton_red_fused_argmax_0_param_2, + .param .u32 triton_red_fused_argmax_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_argmax_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<206>; + .reg .b16 %rs<5>; + .reg .b32 %r<115>; + .reg .b64 %rd<27>; + .loc 1 18 0 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:18:0 + +// %bb.0: + ld.param.b32 %r15, [triton_red_fused_argmax_0_param_2]; + ld.param.b64 %rd8, [triton_red_fused_argmax_0_param_1]; + ld.param.b64 %rd7, [triton_red_fused_argmax_0_param_0]; +$L__tmp0: + .loc 1 22 28 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:22:28 + mov.u32 %r1, %ctaid.x; + .loc 1 25 37 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:25:37 + mov.u32 %r2, %tid.x; + shl.b32 %r20, %r2, 2; + and.b32 %r21, %r20, 2044; + .loc 1 30 40 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:30:40 + cvt.u64.u32 %rd1, %r21; + mad.lo.s32 %r22, %r1, 32000, %r21; + cvt.u64.u32 %rd2, %r22; + mov.b32 %r111, 0fFF800000; + mov.b64 %rd26, {%r111, %r111}; + mov.b32 %r109, 2147483647; + mov.b64 %rd25, -2048; + setp.lt.s32 %p2, %r1, %r15; + mov.b32 %r110, %r109; + mov.b32 %r112, %r111; + mov.b32 %r113, %r109; + mov.b32 %r114, %r109; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:31:31 + add.s64 %rd14, %rd1, %rd25; + add.s64 %rd15, %rd14, 2048; + add.s64 %rd16, %rd14, 2050; + .loc 1 32 29 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:32:29 + add.s64 %rd17, %rd14, 2051; + setp.lt.u64 %p3, %rd15, 32000; + .loc 1 36 34 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:36:34 + add.s64 %rd18, %rd2, %rd25; + cvt.u32.u64 %r27, %rd18; + add.s32 %r28, %r27, 2048; + mad.wide.s32 %rd12, %r28, 2, %rd7; + .loc 1 36 62 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:36:62 + and.pred %p1, %p2, %p3; + .loc 1 36 52 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:36:52 + // begin inline asm + mov.u64 %rd11, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd11, 1.0; + // end inline asm + mov.b32 %r25, 0; + // begin inline asm + mov.u32 %r23, %r25; + mov.u32 %r24, %r25; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r23, %r24 }, [ %rd12 + 0 ], %rd11; + // end inline asm + mov.b32 {%rs1, %rs2}, %r24; + .loc 1 36 114 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:36:114 + cvt.f32.bf16 %r29, %rs1; + cvt.f32.bf16 %r30, %rs2; +$L__tmp1: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.gt.f32 %p4, %r111, %r29; + setp.gt.f32 %p5, %r112, %r30; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.eq.f32 %p6, %r111, %r29; + setp.eq.f32 %p7, %r112, %r30; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + mov.b64 {%r31, %r32}, %rd26; + setp.nan.f32 %p8, %r31, %r31; + setp.nan.f32 %p9, %r32, %r32; + setp.nan.f32 %p10, %r111, %r111; + setp.nan.f32 %p11, %r112, %r112; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.nan.bf16 %p12, %rs1, %rs1; + setp.num.bf16 %p13, %rs1, %rs1; + setp.nan.bf16 %p14, %rs2, %rs2; + setp.num.bf16 %p15, %rs2, %rs2; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p16, %p10, %p13; + and.pred %p17, %p11, %p15; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p18, %p4, %p16; + or.pred %p19, %p5, %p17; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p20, %p10, %p12; + and.pred %p21, %p11, %p14; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p22, %p6, %p20; + or.pred %p23, %p7, %p21; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + cvt.s64.s32 %rd19, %r114; + cvt.s64.s32 %rd20, %r113; + cvt.s64.s32 %rd21, %r109; + setp.gt.s64 %p24, %rd16, %rd21; + cvt.s64.s32 %rd22, %r110; + setp.gt.s64 %p25, %rd17, %rd22; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p26, %p24, %p22; + and.pred %p27, %p25, %p23; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p28, %p18, %p26; + or.pred %p29, %p19, %p27; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + selp.f32 %r33, %r111, %r29, %p28; + selp.f32 %r34, %r112, %r30, %p29; +$L__tmp2: + .loc 1 36 114 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:36:114 + mov.b32 {%rs3, %rs4}, %r23; + cvt.f32.bf16 %r35, %rs3; + cvt.f32.bf16 %r36, %rs4; +$L__tmp3: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.gt.f32 %p30, %r32, %r36; + setp.gt.f32 %p31, %r31, %r35; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.eq.f32 %p32, %r31, %r35; + setp.eq.f32 %p33, %r32, %r36; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.nan.bf16x2 %p34|%p35, %r23, %r25; + setp.num.bf16x2 %p36|%p37, %r23, %r25; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p38, %p9, %p37; + and.pred %p39, %p8, %p36; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p40, %p31, %p39; + or.pred %p41, %p30, %p38; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p42, %p8, %p34; + and.pred %p43, %p9, %p35; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p44, %p33, %p43; + or.pred %p45, %p32, %p42; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + setp.gt.s64 %p46, %rd15, %rd20; + setp.ge.s64 %p47, %rd15, %rd19; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + and.pred %p48, %p46, %p45; + and.pred %p49, %p47, %p44; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + or.pred %p50, %p41, %p49; + or.pred %p51, %p40, %p48; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + selp.f32 %r37, %r31, %r35, %p51; + selp.f32 %r38, %r32, %r36, %p50; + cvt.u32.u64 %r39, %rd15; + cvt.u32.u64 %r40, %rd14; + add.s32 %r41, %r40, 2049; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:39:38 ] + selp.b32 %r42, %r113, %r39, %p51; + selp.b32 %r43, %r114, %r41, %p50; + cvt.u32.u64 %r44, %rd16; + selp.b32 %r45, %r109, %r44, %p28; + cvt.u32.u64 %r46, %rd17; + selp.b32 %r47, %r110, %r46, %p29; +$L__tmp4: + .loc 1 41 54 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:41:54 + selp.f32 %r48, %r38, %r32, %p1; + selp.f32 %r49, %r37, %r31, %p1; + mov.b64 %rd26, {%r49, %r48}; + selp.f32 %r111, %r33, %r111, %p1; + selp.f32 %r112, %r34, %r112, %p1; + .loc 1 42 66 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:42:66 + selp.b32 %r114, %r43, %r114, %p1; + selp.b32 %r113, %r42, %r113, %p1; + selp.b32 %r109, %r45, %r109, %p1; + selp.b32 %r110, %r47, %r110, %p1; + .loc 1 30 40 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:30:40 + add.s64 %rd25, %rd25, 2048; + setp.lt.u64 %p52, %rd25, 29952; + @%p52 bra $L__BB0_1; +// %bb.2: + .loc 1 25 37 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:25:37 + and.b32 %r62, %r2, 31; +$L__tmp5: + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + mov.b64 {%r63, %r64}, %rd26; + setp.gt.f32 %p61, %r63, %r64; + setp.eq.f32 %p62, %r64, %r63; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p63, %r63, %r63; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.num.f32 %p64, %r64, %r64; + setp.nan.f32 %p65, %r64, %r64; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p66, %p63, %p65; + and.pred %p67, %p63, %p64; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p68, %p61, %p67; + or.pred %p69, %p62, %p66; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p70, %r113, %r114; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p71, %p70, %p69; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p72, %p68, %p71; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r65, %r63, %r64, %p72; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r66, %r113, %r114, %p72; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p73, %r65, %r111; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p74, %r65, %r111; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p75, %r65, %r65; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p76, %r111, %r111; + setp.num.f32 %p77, %r111, %r111; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p78, %p75, %p77; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p79, %p73, %p78; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p80, %p76, %p75; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p81, %p74, %p80; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p82, %r66, %r109; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p83, %p82, %p81; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p84, %p79, %p83; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r67, %r65, %r111, %p84; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r68, %r66, %r109, %p84; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p85, %r67, %r112; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p86, %r67, %r112; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p87, %r67, %r67; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p88, %r112, %r112; + setp.num.f32 %p89, %r112, %r112; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p90, %p87, %p89; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p91, %p85, %p90; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p92, %p88, %p87; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p93, %p86, %p92; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p94, %r68, %r110; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p95, %p94, %p93; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p96, %p91, %p95; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r69, %r67, %r112, %p96; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r70, %r68, %r110, %p96; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r71, %r69, 16, 31, -1; + shfl.sync.bfly.b32 %r72, %r70, 16, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p97, %r69, %r71; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p98, %r69, %r71; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p99, %r69, %r69; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p100, %r71, %r71; + setp.num.f32 %p101, %r71, %r71; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p102, %p99, %p101; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p103, %p97, %p102; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p104, %p99, %p100; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p105, %p98, %p104; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p106, %r70, %r72; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p107, %p106, %p105; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p108, %p103, %p107; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r73, %r69, %r71, %p108; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r74, %r70, %r72, %p108; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r75, %r73, 8, 31, -1; + shfl.sync.bfly.b32 %r76, %r74, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p109, %r73, %r75; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p110, %r73, %r75; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p111, %r73, %r73; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p112, %r75, %r75; + setp.num.f32 %p113, %r75, %r75; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p114, %p111, %p113; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p115, %p109, %p114; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p116, %p112, %p111; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p117, %p110, %p116; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p118, %r74, %r76; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p119, %p118, %p117; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p120, %p115, %p119; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r77, %r73, %r75, %p120; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r78, %r74, %r76, %p120; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r79, %r77, 4, 31, -1; + shfl.sync.bfly.b32 %r80, %r78, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p121, %r77, %r79; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p122, %r77, %r79; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p123, %r77, %r77; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p124, %r79, %r79; + setp.num.f32 %p125, %r79, %r79; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p126, %p123, %p125; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p127, %p121, %p126; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p128, %p124, %p123; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p129, %p122, %p128; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p130, %r78, %r80; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p131, %p130, %p129; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p132, %p127, %p131; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r81, %r77, %r79, %p132; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r82, %r78, %r80, %p132; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r83, %r81, 2, 31, -1; + shfl.sync.bfly.b32 %r84, %r82, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p133, %r81, %r83; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p134, %r81, %r83; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p135, %r81, %r81; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p136, %r83, %r83; + setp.num.f32 %p137, %r83, %r83; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p138, %p135, %p137; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p139, %p133, %p138; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p140, %p136, %p135; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p141, %p134, %p140; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p142, %r82, %r84; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p143, %p142, %p141; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p144, %p139, %p143; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r85, %r81, %r83, %p144; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r86, %r82, %r84, %p144; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r87, %r85, 1, 31, -1; + shfl.sync.bfly.b32 %r88, %r86, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p145, %r85, %r87; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p146, %r85, %r87; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p147, %r85, %r85; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p148, %r87, %r87; + setp.num.f32 %p149, %r87, %r87; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p150, %p147, %p149; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p151, %p145, %p150; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p152, %p148, %p147; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p153, %p146, %p152; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p154, %r86, %r88; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p155, %p154, %p153; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p156, %p151, %p155; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r53, %r86, %r88, %p156; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.b32 %p53, %r62, 0; + shr.u32 %r89, %r2, 3; + and.b32 %r90, %r89, 60; + mov.b32 %r91, global_smem; + add.s32 %r50, %r91, %r90; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r51, %r85, %r87, %p156; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + // begin inline asm + @%p53 st.shared.b32 [ %r50 + 0 ], %r51; + // end inline asm + add.s32 %r92, %r91, 64; + add.s32 %r52, %r92, %r90; + // begin inline asm + @%p53 st.shared.b32 [ %r52 + 0 ], %r53; + // end inline asm + bar.sync 0; + setp.lt.u32 %p55, %r2, 16; + add.s32 %r55, %r91, %r20; + // begin inline asm + @%p55 ld.shared.b32 %r54, [ %r55 + 0 ]; + // end inline asm + add.s32 %r57, %r92, %r20; + // begin inline asm + @%p55 ld.shared.b32 %r56, [ %r57 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r94, %r54, 8, 31, -1; + shfl.sync.bfly.b32 %r95, %r56, 8, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p157, %r54, %r94; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p158, %r54, %r94; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p159, %r54, %r54; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p160, %r94, %r94; + setp.num.f32 %p161, %r94, %r94; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p162, %p159, %p161; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p163, %p157, %p162; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p164, %p159, %p160; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p165, %p158, %p164; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p166, %r56, %r95; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p167, %p166, %p165; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p168, %p163, %p167; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r96, %r54, %r94, %p168; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r97, %r56, %r95, %p168; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r98, %r96, 4, 31, -1; + shfl.sync.bfly.b32 %r99, %r97, 4, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p169, %r96, %r98; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p170, %r96, %r98; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p171, %r96, %r96; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p172, %r98, %r98; + setp.num.f32 %p173, %r98, %r98; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p174, %p171, %p173; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p175, %p169, %p174; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p176, %p172, %p171; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p177, %p170, %p176; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p178, %r97, %r99; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p179, %p178, %p177; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p180, %p175, %p179; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r100, %r96, %r98, %p180; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r101, %r97, %r99, %p180; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r102, %r100, 2, 31, -1; + shfl.sync.bfly.b32 %r103, %r101, 2, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p181, %r100, %r102; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p182, %r100, %r102; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p183, %r100, %r100; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p184, %r102, %r102; + setp.num.f32 %p185, %r102, %r102; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p186, %p183, %p185; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p187, %p181, %p186; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p188, %p184, %p183; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p189, %p182, %p188; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p190, %r101, %r103; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p191, %p190, %p189; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p192, %p187, %p191; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.f32 %r104, %r100, %r102, %p192; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r105, %r101, %r103, %p192; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + shfl.sync.bfly.b32 %r106, %r104, 1, 31, -1; + shfl.sync.bfly.b32 %r107, %r105, 1, 31, -1; + .loc 2 144 21 // triton_helpers.py:144:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.gt.f32 %p193, %r104, %r106; + .loc 2 145 23 // triton_helpers.py:145:23 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.f32 %p194, %r104, %r106; + .loc 2 147 29 // triton_helpers.py:147:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p195, %r104, %r104; + .loc 2 148 29 // triton_helpers.py:148:29 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.nan.f32 %p196, %r106, %r106; + setp.num.f32 %p197, %r106, %r106; + .loc 2 149 27 // triton_helpers.py:149:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p198, %p195, %p197; + .loc 2 149 16 // triton_helpers.py:149:16 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p199, %p193, %p198; + .loc 2 151 27 // triton_helpers.py:151:27 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p200, %p196, %p195; + .loc 2 151 17 // triton_helpers.py:151:17 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p201, %p194, %p200; + .loc 2 154 31 // triton_helpers.py:154:31 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.lt.s32 %p202, %r105, %r107; + .loc 2 154 21 // triton_helpers.py:154:21 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + and.pred %p203, %p202, %p201; + .loc 2 154 12 // triton_helpers.py:154:12 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + or.pred %p204, %p199, %p203; + .loc 2 155 69 // triton_helpers.py:155:69 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r61, %r105, %r107, %p204; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + setp.eq.b32 %p57, %r2, 0; + .loc 2 155 35 // triton_helpers.py:155:35 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + selp.b32 %r59, %r104, %r106, %p204; + .loc 2 165 42 // triton_helpers.py:165:42 @[ c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:43:75 ] + // begin inline asm + @%p57 st.shared.b32 [ %r55 + 0 ], %r59; + // end inline asm + // begin inline asm + @%p57 st.shared.b32 [ %r57 + 0 ], %r61; + // end inline asm + bar.sync 0; + ld.shared.s32 %rd23, [global_smem+64]; +$L__tmp6: + .loc 1 45 25 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:45:25 + mad.wide.u32 %rd24, %r1, 8, %rd8; + .loc 1 45 36 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:45:36 + and.b32 %r108, %r2, 511; + setp.eq.b32 %p205, %r108, 0; + and.pred %p59, %p205, %p2; + // begin inline asm + @%p59 st.global.b64 [ %rd24 + 0 ], { %rd23 }; + // end inline asm + .loc 1 45 4 // c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py:45:4 + ret; +$L__tmp7: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 234 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe3 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 50 +.b8 104 +.b8 55 +.b8 118 +.b8 105 +.b8 115 +.b8 110 +.b8 52 +.b8 103 +.b8 117 +.b8 115 +.b8 115 +.b8 55 +.b8 115 +.b8 119 +.b8 120 +.b8 106 +.b8 52 +.b8 117 +.b8 112 +.b8 50 +.b8 101 +.b8 114 +.b8 52 +.b8 105 +.b8 106 +.b8 101 +.b8 52 +.b8 104 +.b8 121 +.b8 110 +.b8 111 +.b8 55 +.b8 121 +.b8 114 +.b8 117 +.b8 103 +.b8 104 +.b8 117 +.b8 118 +.b8 117 +.b8 114 +.b8 110 +.b8 101 +.b8 110 +.b8 104 +.b8 50 +.b8 112 +.b8 118 +.b8 100 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 50 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1c DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 97 +.b8 114 +.b8 103 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa7:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbc:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 39 // DW_AT_call_line +.b8 38 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xd4:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 43 // DW_AT_call_line +.b8 75 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.source new file mode 100644 index 0000000000000000000000000000000000000000..7284f284212ef88d06e82ae078943c766833fffd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.source @@ -0,0 +1,300 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":18:0) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc44 = loc(unknown) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc69 = loc("in_ptr0"(#loc)) +#loc70 = loc("out_ptr0"(#loc)) +#loc71 = loc("xnumel"(#loc)) +#loc72 = loc("r0_numel"(#loc)) +#loc98 = loc("a_value"(#loc32)) +#loc99 = loc("a_index"(#loc32)) +#loc100 = loc("b_value"(#loc32)) +#loc101 = loc("b_index"(#loc32)) +#loc114 = loc("x"(#loc52)) +#loc115 = loc("x"(#loc56)) +#loc116 = loc("value"(#loc65)) +#loc117 = loc("index"(#loc65)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 32000 : i32 loc(#loc73) + %xoffset = tt.get_program_id x : i32 loc(#loc74) + %xoffset_1 = arith.constant 1 : i32 loc(#loc75) + %xoffset_2 = arith.constant 1 : i32 loc(#loc75) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc75) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc76) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc77) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc78) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc78) + %xmask = tt.splat %xnumel : i32 -> tensor<1x1xi32> loc(#loc79) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc79) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc80) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc81) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc82) + %_tmp2_9 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc82) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc83) + %_tmp2_index_10 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc83) + %c0_i32 = arith.constant 0 : i32 loc(#loc12) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc12) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc12) + %1 = arith.bitcast %r0_numel_0 : i32 to i32 loc(#loc12) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc12) + %3 = ub.poison : i32 loc(#loc12) + %_tmp2_index_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_12 = %_tmp2_9, %_tmp2_index_13 = %_tmp2_index_10) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc85) + %r0_index_14 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc85) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc86) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x2048xi32> loc(#loc86) + %tmp0 = arith.constant 32000 : i32 loc(#loc87) + %tmp0_16 = arith.constant 32000 : i32 loc(#loc87) + %tmp0_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc87) + %tmp0_18 = arith.muli %tmp0_17, %xindex_6 : tensor<1x1xi32> loc(#loc87) + %tmp0_19 = tt.broadcast %tmp0_18 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc88) + %tmp0_20 = arith.addi %r0_index_14, %tmp0_19 : tensor<1x2048xi32> loc(#loc88) + %tmp0_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc89) + %tmp0_22 = tt.addptr %tmp0_21, %tmp0_20 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc89) + %tmp0_23 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc90) + %tmp0_24 = arith.andi %r0_mask_15, %tmp0_23 : tensor<1x2048xi1> loc(#loc90) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc91) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc91) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x2048xf32> to tensor<1x2048xbf16> loc(#loc91) + %tmp0_28 = tt.load %tmp0_22, %tmp0_24, %tmp0_27 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc91) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc92) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_12, %_tmp2_index_13, %tmp0_29, %r0_index_14) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc21) + %_tmp2_30 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc93) + %_tmp2_31 = arith.andi %r0_mask_15, %_tmp2_30 : tensor<1x2048xi1> loc(#loc93) + %_tmp2_32 = arith.select %_tmp2_31, %8#0, %_tmp2_12 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc94) + %_tmp2_index_33 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc95) + %_tmp2_index_34 = arith.andi %r0_mask_15, %_tmp2_index_33 : tensor<1x2048xi1> loc(#loc95) + %_tmp2_index_35 = arith.select %_tmp2_index_34, %8#1, %_tmp2_index_13 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc96) + scf.yield %_tmp2_32, %_tmp2_index_35 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc26) + } loc(#loc118) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_11#0, %_tmp2_index_11#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc27) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc97) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc29) + %6 = tt.addptr %5, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc29) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc30) + tt.store %6, %7, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc30) + tt.return loc(#loc31) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc32)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc32)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc32)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc32))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc119) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc120) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc35) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc104) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc105) + %mask_3 = arith.constant true loc(#loc106) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc106) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc106) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc107) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc121) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc109) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc122) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc122) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc44) + } loc(#loc36) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc111) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc112) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc113) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc48) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc49) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc50) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc51) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc51) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc51) + } loc(#loc32) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc52))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc53) + %true = arith.constant true loc(#loc54) + tt.return %true : i1 loc(#loc54) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc55) + tt.return %1 : i1 loc(#loc55) + } loc(#loc52) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc56))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc57) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc58) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc58) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc58) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc58) + tt.return %4 : tensor<1x2048xf32> loc(#loc59) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc60) + tt.return %5 : tensor<1x2048xf32> loc(#loc60) + } loc(#loc56) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc62) + %cst = arith.constant dense : tensor<1xi1> loc(#loc62) + tt.return %cst : tensor<1xi1> loc(#loc63) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc64) + tt.return %0 : tensor<1xi1> loc(#loc64) + } loc(#loc61) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc65)), %index: tensor<1x2048xi32> loc("index"(#loc65))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc66) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc66) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc66) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc67) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc68) + %2 = ub.poison : tensor<1xi32> loc(#loc68) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc68) + } loc(#loc65) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc32)), %a_index: i32 loc("a_index"(#loc32)), %b_value: f32 loc("b_value"(#loc32)), %b_index: i32 loc("b_index"(#loc32))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc119) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc120) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc35) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc104) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc105) + %mask_3 = arith.constant true loc(#loc106) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc106) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc107) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc121) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc109) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc122) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc122) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc44) + } loc(#loc36) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc111) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc112) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc113) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc48) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc49) + tt.return %2, %3 : f32, i32 loc(#loc50) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc51) + %5 = ub.poison : i32 loc(#loc51) + tt.return %4, %5 : f32, i32 loc(#loc51) + } loc(#loc32) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc52))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc53) + %true = arith.constant true loc(#loc54) + tt.return %true : i1 loc(#loc54) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc55) + tt.return %1 : i1 loc(#loc55) + } loc(#loc52) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc56))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc57) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc58) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc58) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc58) + tt.return %3 : tensor<1xf32> loc(#loc59) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc60) + tt.return %4 : tensor<1xf32> loc(#loc60) + } loc(#loc56) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":28:55) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":29:58) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":30:40) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":31:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":32:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:47) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:62) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:114) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":39:38) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":41:35) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":41:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:66) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:8) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":43:75) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":44:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:25) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:4) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc73 = loc("r0_numel"(#loc1)) +#loc74 = loc("xoffset"(#loc2)) +#loc75 = loc("xoffset"(#loc3)) +#loc76 = loc("xindex"(#loc4)) +#loc77 = loc("xindex"(#loc5)) +#loc78 = loc("xindex"(#loc6)) +#loc79 = loc("xmask"(#loc7)) +#loc80 = loc("r0_base"(#loc8)) +#loc81 = loc("r0_base"(#loc9)) +#loc82 = loc("_tmp2"(#loc10)) +#loc83 = loc("_tmp2_index"(#loc11)) +#loc84 = loc("_tmp2"(#loc12)) +#loc85 = loc("r0_index"(#loc13)) +#loc86 = loc("r0_mask"(#loc14)) +#loc87 = loc("tmp0"(#loc15)) +#loc88 = loc("tmp0"(#loc16)) +#loc89 = loc("tmp0"(#loc17)) +#loc90 = loc("tmp0"(#loc18)) +#loc91 = loc("tmp0"(#loc19)) +#loc92 = loc("tmp0"(#loc20)) +#loc93 = loc("_tmp2"(#loc22)) +#loc94 = loc("_tmp2"(#loc23)) +#loc95 = loc("_tmp2_index"(#loc24)) +#loc96 = loc("_tmp2_index"(#loc25)) +#loc97 = loc("tmp2"(#loc28)) +#loc102 = loc("mask"(#loc33)) +#loc103 = loc("equal"(#loc34)) +#loc104 = loc("a_isnan"(#loc37)) +#loc105 = loc("b_isnan"(#loc38)) +#loc106 = loc("mask"(#loc39)) +#loc107 = loc("mask"(#loc40)) +#loc108 = loc("mask"(#loc41)) +#loc109 = loc("equal"(#loc42)) +#loc110 = loc("equal"(#loc43)) +#loc111 = loc("mask"(#loc45)) +#loc112 = loc("mask"(#loc46)) +#loc113 = loc("mask"(#loc47)) +#loc118 = loc("_tmp2_index"(#loc84)) +#loc119 = loc("mask"(#loc102)) +#loc120 = loc("equal"(#loc103)) +#loc121 = loc("mask"(#loc108)) +#loc122 = loc("equal"(#loc110)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5d85b19e326b0f301724d51e0c9c9f322acf75b9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttgir @@ -0,0 +1,182 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":43:75) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("out_ptr0"(#loc)) +#loc40 = loc("xnumel"(#loc)) +#loc41 = loc("r0_numel"(#loc)) +#loc71 = loc(callsite(#loc1 at #loc33)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %cst_1 = arith.constant dense : tensor<1x2048xi1, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<2147483647> : tensor<1x2048xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc42) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc43) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc44) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc44) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc45) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc73) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc47) + %tmp0_7 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc74) + %_tmp2_index:2 = scf.for %_tmp2_index_8 = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2 = %cst_3, %_tmp2_index_9 = %cst_2) -> (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp2_index_8 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc50) + %r0_index_10 = arith.addi %r0_index, %r0_base_4 : tensor<1x2048xi32, #blocked> loc(#loc50) + %r0_mask = arith.cmpi slt, %r0_index_10, %cst : tensor<1x2048xi32, #blocked> loc(#loc51) + %tmp0_11 = arith.addi %r0_index_10, %tmp0_5 : tensor<1x2048xi32, #blocked> loc(#loc46) + %tmp0_12 = tt.addptr %tmp0_6, %tmp0_11 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc47) + %tmp0_13 = arith.andi %r0_mask, %tmp0_7 : tensor<1x2048xi1, #blocked> loc(#loc48) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr, #blocked> loc(#loc52) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked> loc(#loc53) + %mask = arith.cmpf ogt, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc98) + %equal = arith.cmpf oeq, %_tmp2, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc99) + %a_isnan = arith.cmpf une, %_tmp2, %_tmp2 : tensor<1x2048xf32, #blocked> loc(#loc78) + %b_isnan = arith.cmpf une, %tmp0_15, %tmp0_15 : tensor<1x2048xf32, #blocked> loc(#loc79) + %mask_16 = arith.xori %b_isnan, %cst_1 : tensor<1x2048xi1, #blocked> loc(#loc80) + %mask_17 = arith.andi %a_isnan, %mask_16 : tensor<1x2048xi1, #blocked> loc(#loc81) + %mask_18 = arith.ori %mask, %mask_17 : tensor<1x2048xi1, #blocked> loc(#loc100) + %equal_19 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1, #blocked> loc(#loc83) + %equal_20 = arith.ori %equal, %equal_19 : tensor<1x2048xi1, #blocked> loc(#loc101) + %mask_21 = arith.cmpi slt, %_tmp2_index_9, %r0_index_10 : tensor<1x2048xi32, #blocked> loc(#loc85) + %mask_22 = arith.andi %equal_20, %mask_21 : tensor<1x2048xi1, #blocked> loc(#loc86) + %mask_23 = arith.ori %mask_18, %mask_22 : tensor<1x2048xi1, #blocked> loc(#loc87) + %6 = arith.select %mask_23, %_tmp2, %tmp0_15 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc66) + %7 = arith.select %mask_23, %_tmp2_index_9, %r0_index_10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc67) + %_tmp2_24 = arith.select %tmp0_13, %6, %_tmp2 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked> loc(#loc68) + %_tmp2_index_25 = arith.select %tmp0_13, %7, %_tmp2_index_9 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc69) + scf.yield %_tmp2_24, %_tmp2_index_25 : tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked> loc(#loc31) + } loc(#loc75) + %0:2 = "tt.reduce"(%_tmp2_index#0, %_tmp2_index#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc33)), %arg5: i32 loc(callsite(#loc1 at #loc33)), %arg6: f32 loc(callsite(#loc1 at #loc33)), %arg7: i32 loc(callsite(#loc1 at #loc33))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc102) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc103) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc88) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc89) + %mask_8 = arith.xori %b_isnan, %true : i1 loc(#loc90) + %mask_9 = arith.andi %a_isnan, %mask_8 : i1 loc(#loc91) + %mask_10 = arith.ori %mask, %mask_9 : i1 loc(#loc104) + %equal_11 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc92) + %equal_12 = arith.ori %equal, %equal_11 : i1 loc(#loc105) + %mask_13 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc93) + %mask_14 = arith.andi %equal_12, %mask_13 : i1 loc(#loc94) + %mask_15 = arith.ori %mask_10, %mask_14 : i1 loc(#loc95) + %6 = arith.select %mask_15, %arg4, %arg6 : f32 loc(#loc96) + %7 = arith.select %mask_15, %arg5, %arg7 : i32 loc(#loc97) + tt.reduce.return %6, %7 : f32, i32 loc(#loc70) + }) : (tensor<1x2048xf32, #blocked>, tensor<1x2048xi32, #blocked>) -> (tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>>) loc(#loc70) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi32, #blocked> loc(#loc72) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc35) + %2 = ttg.convert_layout %tmp2 : tensor<1x1xi32, #blocked> -> tensor<1x1xi32, #blocked1> loc(#loc36) + %3 = arith.extsi %2 : tensor<1x1xi32, #blocked1> to tensor<1x1xi64, #blocked1> loc(#loc36) + %4 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked1> loc(#loc36) + %5 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked1> loc(#loc36) + tt.store %4, %3, %5 : tensor<1x1x!tt.ptr, #blocked1> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:47) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:62) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":30:40) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:114) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":39:38) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":41:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:66) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":44:20) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:4) +#loc42 = loc("xoffset"(#loc2)) +#loc43 = loc("xmask"(#loc3)) +#loc44 = loc("r0_base"(#loc4)) +#loc45 = loc("tmp0"(#loc5)) +#loc46 = loc("tmp0"(#loc6)) +#loc47 = loc("tmp0"(#loc7)) +#loc48 = loc("tmp0"(#loc8)) +#loc49 = loc("_tmp2"(#loc9)) +#loc50 = loc("r0_index"(#loc10)) +#loc51 = loc("r0_mask"(#loc11)) +#loc52 = loc("tmp0"(#loc12)) +#loc53 = loc("tmp0"(#loc13)) +#loc54 = loc("mask"(#loc14)) +#loc55 = loc("equal"(#loc16)) +#loc56 = loc("a_isnan"(#loc17)) +#loc57 = loc("b_isnan"(#loc18)) +#loc58 = loc("mask"(#loc19)) +#loc59 = loc("mask"(#loc20)) +#loc60 = loc("mask"(#loc21)) +#loc61 = loc("equal"(#loc22)) +#loc62 = loc("equal"(#loc23)) +#loc63 = loc("mask"(#loc24)) +#loc64 = loc("mask"(#loc25)) +#loc65 = loc("mask"(#loc26)) +#loc66 = loc(callsite(#loc27 at #loc15)) +#loc67 = loc(callsite(#loc28 at #loc15)) +#loc68 = loc("_tmp2"(#loc29)) +#loc69 = loc("_tmp2_index"(#loc30)) +#loc70 = loc(callsite(#loc32 at #loc33)) +#loc72 = loc("tmp2"(#loc34)) +#loc73 = loc(fused[#loc46, #loc45]) +#loc74 = loc(fused[#loc48, #loc43]) +#loc75 = loc("_tmp2_index"(#loc49)) +#loc76 = loc("mask"(#loc54)) +#loc77 = loc("equal"(#loc55)) +#loc78 = loc(callsite(#loc56 at #loc15)) +#loc79 = loc(callsite(#loc57 at #loc15)) +#loc80 = loc(callsite(#loc58 at #loc15)) +#loc81 = loc(callsite(#loc59 at #loc15)) +#loc82 = loc("mask"(#loc60)) +#loc83 = loc(callsite(#loc61 at #loc15)) +#loc84 = loc("equal"(#loc62)) +#loc85 = loc(callsite(#loc63 at #loc15)) +#loc86 = loc(callsite(#loc64 at #loc15)) +#loc87 = loc(callsite(#loc65 at #loc15)) +#loc88 = loc(callsite(#loc56 at #loc70)) +#loc89 = loc(callsite(#loc57 at #loc70)) +#loc90 = loc(callsite(#loc58 at #loc70)) +#loc91 = loc(callsite(#loc59 at #loc70)) +#loc92 = loc(callsite(#loc61 at #loc70)) +#loc93 = loc(callsite(#loc63 at #loc70)) +#loc94 = loc(callsite(#loc64 at #loc70)) +#loc95 = loc(callsite(#loc65 at #loc70)) +#loc96 = loc(callsite(#loc27 at #loc70)) +#loc97 = loc(callsite(#loc28 at #loc70)) +#loc98 = loc(callsite(#loc76 at #loc15)) +#loc99 = loc(callsite(#loc77 at #loc15)) +#loc100 = loc(callsite(#loc82 at #loc15)) +#loc101 = loc(callsite(#loc84 at #loc15)) +#loc102 = loc(callsite(#loc76 at #loc70)) +#loc103 = loc(callsite(#loc77 at #loc70)) +#loc104 = loc(callsite(#loc82 at #loc70)) +#loc105 = loc(callsite(#loc84 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..26159d3d264a55ab6baacdb6c05342252c2ec560 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WTI25APFZXSBC2RGXDXXFUX6ZCROKDEKQ4HOMZZ2HGABPAMRCJKA/triton_red_fused_argmax_0.ttir @@ -0,0 +1,185 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":43:75) +#loc41 = loc("in_ptr0"(#loc)) +#loc42 = loc("out_ptr0"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc45 = loc(callsite(#loc1 at #loc2)) +module { + tt.func public @triton_red_fused_argmax_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %true = arith.constant true loc(#loc45) + %cst = arith.constant dense : tensor<1x2048xi1> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16> loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_1 = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc1) + %_tmp2_index = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc46) + %_tmp2 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc47) + %xoffset = tt.get_program_id x : i32 loc(#loc48) + %xmask = arith.cmpi slt, %xoffset, %xnumel : i32 loc(#loc49) + %xmask_2 = tt.splat %xmask : i1 -> tensor<1x1xi1> loc(#loc49) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc50) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc51) + %_tmp2_index_4:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c2048_i32 iter_args(%_tmp2_5 = %_tmp2, %_tmp2_index_6 = %_tmp2_index) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc53) + %r0_index_7 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc53) + %r0_mask = arith.cmpi slt, %r0_index_7, %cst_1 : tensor<1x2048xi32> loc(#loc54) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc55) + %tmp0_8 = tt.splat %tmp0 : i32 -> tensor<1x2048xi32> loc(#loc80) + %tmp0_9 = arith.addi %r0_index_7, %tmp0_8 : tensor<1x2048xi32> loc(#loc56) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc57) + %tmp0_11 = tt.addptr %tmp0_10, %tmp0_9 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc57) + %tmp0_12 = tt.splat %xmask : i1 -> tensor<1x2048xi1> loc(#loc81) + %tmp0_13 = arith.andi %r0_mask, %tmp0_12 : tensor<1x2048xi1> loc(#loc58) + %tmp0_14 = tt.load %tmp0_11, %tmp0_13, %cst_0 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc59) + %tmp0_15 = arith.extf %tmp0_14 : tensor<1x2048xbf16> to tensor<1x2048xf32> loc(#loc60) + %mask = arith.cmpf ogt, %_tmp2_5, %tmp0_15 : tensor<1x2048xf32> loc(#loc104) + %equal = arith.cmpf oeq, %_tmp2_5, %tmp0_15 : tensor<1x2048xf32> loc(#loc105) + %a_isnan = arith.cmpf une, %_tmp2_5, %_tmp2_5 : tensor<1x2048xf32> loc(#loc84) + %b_isnan = arith.cmpf une, %tmp0_15, %tmp0_15 : tensor<1x2048xf32> loc(#loc85) + %mask_16 = arith.xori %b_isnan, %cst : tensor<1x2048xi1> loc(#loc86) + %mask_17 = arith.andi %a_isnan, %mask_16 : tensor<1x2048xi1> loc(#loc87) + %mask_18 = arith.ori %mask, %mask_17 : tensor<1x2048xi1> loc(#loc106) + %equal_19 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc89) + %equal_20 = arith.ori %equal, %equal_19 : tensor<1x2048xi1> loc(#loc107) + %mask_21 = arith.cmpi slt, %_tmp2_index_6, %r0_index_7 : tensor<1x2048xi32> loc(#loc91) + %mask_22 = arith.andi %equal_20, %mask_21 : tensor<1x2048xi1> loc(#loc92) + %mask_23 = arith.ori %mask_18, %mask_22 : tensor<1x2048xi1> loc(#loc93) + %4 = arith.select %mask_23, %_tmp2_5, %tmp0_15 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc73) + %5 = arith.select %mask_23, %_tmp2_index_6, %r0_index_7 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc74) + %_tmp2_24 = arith.select %tmp0_13, %4, %_tmp2_5 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc75) + %_tmp2_index_25 = arith.select %tmp0_13, %5, %_tmp2_index_6 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc76) + scf.yield %_tmp2_24, %_tmp2_index_25 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc35) + } loc(#loc79) + %0:2 = "tt.reduce"(%_tmp2_index_4#0, %_tmp2_index_4#1) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32 loc(callsite(#loc1 at #loc2)), %arg5: i32 loc(callsite(#loc1 at #loc2)), %arg6: f32 loc(callsite(#loc1 at #loc2)), %arg7: i32 loc(callsite(#loc1 at #loc2))): + %mask = arith.cmpf ogt, %arg4, %arg6 : f32 loc(#loc108) + %equal = arith.cmpf oeq, %arg4, %arg6 : f32 loc(#loc109) + %a_isnan = arith.cmpf une, %arg4, %arg4 : f32 loc(#loc94) + %b_isnan = arith.cmpf une, %arg6, %arg6 : f32 loc(#loc95) + %mask_5 = arith.xori %b_isnan, %true : i1 loc(#loc96) + %mask_6 = arith.andi %a_isnan, %mask_5 : i1 loc(#loc97) + %mask_7 = arith.ori %mask, %mask_6 : i1 loc(#loc110) + %equal_8 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc98) + %equal_9 = arith.ori %equal, %equal_8 : i1 loc(#loc111) + %mask_10 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc99) + %mask_11 = arith.andi %equal_9, %mask_10 : i1 loc(#loc100) + %mask_12 = arith.ori %mask_7, %mask_11 : i1 loc(#loc101) + %4 = arith.select %mask_12, %arg4, %arg6 : f32 loc(#loc102) + %5 = arith.select %mask_12, %arg5, %arg7 : i32 loc(#loc103) + tt.reduce.return %4, %5 : f32, i32 loc(#loc77) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc77) + %tmp2 = tt.expand_dims %0#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc78) + %1 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc38) + %2 = tt.splat %1 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc38) + %3 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc39) + tt.store %2, %3, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc39) + tt.return loc(#loc40) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":30:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":29:58) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":28:55) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":22:28) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":31:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":32:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:47) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:34) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:62) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:52) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":36:114) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":39:38) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":41:54) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:66) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":42:8) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":44:20) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:25) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/42/c42h7visn4guss7swxj4up2er4ije4hyno7yrughuvurnenh2pvd.py":45:4) +#loc46 = loc("_tmp2_index"(#loc4)) +#loc47 = loc("_tmp2"(#loc5)) +#loc48 = loc("xoffset"(#loc6)) +#loc49 = loc("xmask"(#loc7)) +#loc50 = loc("r0_base"(#loc8)) +#loc51 = loc("r0_base"(#loc9)) +#loc52 = loc("_tmp2"(#loc3)) +#loc53 = loc("r0_index"(#loc10)) +#loc54 = loc("r0_mask"(#loc11)) +#loc55 = loc("tmp0"(#loc12)) +#loc56 = loc("tmp0"(#loc13)) +#loc57 = loc("tmp0"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("mask"(#loc18)) +#loc62 = loc("equal"(#loc20)) +#loc63 = loc("a_isnan"(#loc21)) +#loc64 = loc("b_isnan"(#loc22)) +#loc65 = loc("mask"(#loc23)) +#loc66 = loc("mask"(#loc24)) +#loc67 = loc("mask"(#loc25)) +#loc68 = loc("equal"(#loc26)) +#loc69 = loc("equal"(#loc27)) +#loc70 = loc("mask"(#loc28)) +#loc71 = loc("mask"(#loc29)) +#loc72 = loc("mask"(#loc30)) +#loc73 = loc(callsite(#loc31 at #loc19)) +#loc74 = loc(callsite(#loc32 at #loc19)) +#loc75 = loc("_tmp2"(#loc33)) +#loc76 = loc("_tmp2_index"(#loc34)) +#loc77 = loc(callsite(#loc36 at #loc2)) +#loc78 = loc("tmp2"(#loc37)) +#loc79 = loc("_tmp2_index"(#loc52)) +#loc80 = loc(fused[#loc56, #loc55]) +#loc81 = loc(fused[#loc58, #loc49]) +#loc82 = loc("mask"(#loc61)) +#loc83 = loc("equal"(#loc62)) +#loc84 = loc(callsite(#loc63 at #loc19)) +#loc85 = loc(callsite(#loc64 at #loc19)) +#loc86 = loc(callsite(#loc65 at #loc19)) +#loc87 = loc(callsite(#loc66 at #loc19)) +#loc88 = loc("mask"(#loc67)) +#loc89 = loc(callsite(#loc68 at #loc19)) +#loc90 = loc("equal"(#loc69)) +#loc91 = loc(callsite(#loc70 at #loc19)) +#loc92 = loc(callsite(#loc71 at #loc19)) +#loc93 = loc(callsite(#loc72 at #loc19)) +#loc94 = loc(callsite(#loc63 at #loc77)) +#loc95 = loc(callsite(#loc64 at #loc77)) +#loc96 = loc(callsite(#loc65 at #loc77)) +#loc97 = loc(callsite(#loc66 at #loc77)) +#loc98 = loc(callsite(#loc68 at #loc77)) +#loc99 = loc(callsite(#loc70 at #loc77)) +#loc100 = loc(callsite(#loc71 at #loc77)) +#loc101 = loc(callsite(#loc72 at #loc77)) +#loc102 = loc(callsite(#loc31 at #loc77)) +#loc103 = loc(callsite(#loc32 at #loc77)) +#loc104 = loc(callsite(#loc82 at #loc19)) +#loc105 = loc(callsite(#loc83 at #loc19)) +#loc106 = loc(callsite(#loc88 at #loc19)) +#loc107 = loc(callsite(#loc90 at #loc19)) +#loc108 = loc(callsite(#loc82 at #loc77)) +#loc109 = loc(callsite(#loc83 at #loc77)) +#loc110 = loc(callsite(#loc88 at #loc77)) +#loc111 = loc(callsite(#loc90 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6cfc06547306ce98372a98131031ec7c7091b3f8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8f5af812b3d6895e5ff369ccfd081bdd38c3adee --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "b641e82492d6804d5c82db421f12f0b74dfdbf3f943ecf5c7166117cf9d597a1", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 8192, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..6e5e68d34c770943d520b4c9d397720593dcf8e8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,2154 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 7, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 127, !dbg !9 + %12 = lshr i32 %10, 2, !dbg !9 + %13 = and i32 %12, 63, !dbg !9 + %14 = or disjoint i32 %9, %11, !dbg !10 + %15 = or disjoint i32 %13, %9, !dbg !10 + %16 = or disjoint i32 %15, 64, !dbg !10 + %17 = icmp slt i32 %14, 128, !dbg !11 + %18 = icmp slt i32 %15, 128, !dbg !11 + %19 = icmp slt i32 %16, 128, !dbg !11 + %20 = lshr i32 %10, 7, !dbg !12 + %21 = shl nuw nsw i32 %10, 2, !dbg !12 + %22 = and i32 %21, 12, !dbg !12 + %23 = sdiv i32 %14, 16, !dbg !13 + %24 = shl i32 %23, 8, !dbg !14 + %25 = add i32 %24, %14, !dbg !14 + %26 = shl nuw nsw i32 %11, 4, !dbg !15 + %27 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %26, !dbg !15 + %28 = getelementptr i8, ptr addrspace(3) %27, i32 8, !dbg !15 + %29 = getelementptr i8, ptr addrspace(3) %27, i32 16, !dbg !15 + %30 = getelementptr i8, ptr addrspace(3) %27, i32 24, !dbg !15 + %31 = getelementptr i8, ptr addrspace(3) %27, i32 32, !dbg !15 + %32 = getelementptr i8, ptr addrspace(3) %27, i32 40, !dbg !15 + %33 = getelementptr i8, ptr addrspace(3) %27, i32 48, !dbg !15 + %34 = getelementptr i8, ptr addrspace(3) %27, i32 56, !dbg !15 + %35 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %10, !dbg !15 + %36 = and i32 %10, 1, !dbg !15 + %37 = icmp eq i32 %36, 0, !dbg !15 + %38 = getelementptr i8, ptr addrspace(3) %35, i32 1024, !dbg !15 + %39 = getelementptr i8, ptr addrspace(3) %35, i32 2048, !dbg !15 + %40 = getelementptr i8, ptr addrspace(3) %35, i32 3072, !dbg !15 + %41 = getelementptr i8, ptr addrspace(3) %35, i32 4096, !dbg !15 + %42 = getelementptr i8, ptr addrspace(3) %35, i32 5120, !dbg !15 + %43 = getelementptr i8, ptr addrspace(3) %35, i32 6144, !dbg !15 + %44 = getelementptr i8, ptr addrspace(3) %35, i32 7168, !dbg !15 + %.lobit = and i32 %20, 1, !dbg !12 + %45 = or disjoint i32 %.lobit, 4, !dbg !12 + %46 = insertelement <2 x i32> poison, i32 %20, i64 0, !dbg !12 + %47 = insertelement <2 x i32> %46, i32 %.lobit, i64 1, !dbg !12 + %48 = or <2 x i32> %47, , !dbg !12 + %49 = mul nuw nsw i32 %.lobit, 17, !dbg !19 + %50 = extractelement <2 x i32> %48, i64 1, !dbg !20 + %51 = mul nuw nsw i32 %50, 17, !dbg !19 + %52 = mul nuw nsw i32 %45, 17, !dbg !19 + %53 = extractelement <2 x i32> %48, i64 0, !dbg !20 + %54 = mul nuw nsw i32 %53, 17, !dbg !19 + %55 = add i32 %25, %49, !dbg !23 + %56 = add i32 %25, %51, !dbg !23 + %57 = add i32 %25, %52, !dbg !23 + %58 = add i32 %25, %54, !dbg !23 + %59 = sext i32 %55 to i64, !dbg !24 + %60 = getelementptr i32, ptr addrspace(1) %0, i64 %59, !dbg !24 + %61 = sext i32 %56 to i64, !dbg !24 + %62 = getelementptr i32, ptr addrspace(1) %0, i64 %61, !dbg !24 + %63 = sext i32 %57 to i64, !dbg !24 + %64 = getelementptr i32, ptr addrspace(1) %0, i64 %63, !dbg !24 + %65 = sext i32 %58 to i64, !dbg !24 + %66 = getelementptr i32, ptr addrspace(1) %0, i64 %65, !dbg !24 + %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %60, i1 %17) #5, !dbg !25 + %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 %17) #5, !dbg !25 + %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %64, i1 %17) #5, !dbg !25 + %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 %17) #5, !dbg !25 + %71 = xor i32 %.lobit, 1, !dbg !26 + %72 = mul nuw nsw i32 %67, %71, !dbg !27 + %73 = mul nuw nsw i32 %68, %71, !dbg !27 + %74 = mul nuw nsw i32 %69, %71, !dbg !27 + %75 = mul nuw nsw i32 %70, %71, !dbg !27 + %76 = getelementptr i32, ptr addrspace(3) %27, i32 %.lobit, !dbg !15 + %77 = insertelement <1 x i32> poison, i32 %72, i64 0, !dbg !15 + %78 = getelementptr i32, ptr addrspace(3) %28, i32 %.lobit, !dbg !15 + %79 = insertelement <1 x i32> poison, i32 %73, i64 0, !dbg !15 + %80 = getelementptr i32, ptr addrspace(3) %29, i32 %.lobit, !dbg !15 + %81 = insertelement <1 x i32> poison, i32 %74, i64 0, !dbg !15 + %82 = getelementptr i32, ptr addrspace(3) %30, i32 %.lobit, !dbg !15 + %83 = insertelement <1 x i32> poison, i32 %75, i64 0, !dbg !15 + %84 = getelementptr i32, ptr addrspace(3) %31, i32 %.lobit, !dbg !15 + %85 = getelementptr i32, ptr addrspace(3) %32, i32 %.lobit, !dbg !15 + %86 = getelementptr i32, ptr addrspace(3) %33, i32 %.lobit, !dbg !15 + %87 = getelementptr i32, ptr addrspace(3) %34, i32 %.lobit, !dbg !15 + %88 = mul nuw nsw i32 %67, %.lobit, !dbg !28 + %89 = mul nuw nsw i32 %68, %.lobit, !dbg !28 + %90 = mul nuw nsw i32 %69, %.lobit, !dbg !28 + %91 = mul nuw nsw i32 %70, %.lobit, !dbg !28 + %92 = insertelement <1 x i32> poison, i32 %88, i64 0, !dbg !15 + %93 = insertelement <1 x i32> poison, i32 %89, i64 0, !dbg !15 + %94 = insertelement <1 x i32> poison, i32 %90, i64 0, !dbg !15 + %95 = insertelement <1 x i32> poison, i32 %91, i64 0, !dbg !15 + %96 = mul nuw nsw i32 %71, %.lobit, !dbg !29 + %97 = mul nuw nsw i32 %50, %71, !dbg !29 + %98 = mul nuw nsw i32 %45, %71, !dbg !29 + %99 = mul nuw nsw i32 %71, %53, !dbg !29 + %100 = insertelement <1 x i32> poison, i32 %96, i64 0, !dbg !15 + %101 = insertelement <1 x i32> poison, i32 %97, i64 0, !dbg !15 + %102 = insertelement <1 x i32> poison, i32 %98, i64 0, !dbg !15 + %103 = insertelement <1 x i32> poison, i32 %99, i64 0, !dbg !15 + %104 = mul nuw nsw i32 %50, %.lobit, !dbg !20 + %105 = mul nuw nsw i32 %45, %.lobit, !dbg !20 + %106 = mul nuw nsw i32 %53, %.lobit, !dbg !20 + %107 = insertelement <1 x i32> poison, i32 %.lobit, i64 0, !dbg !15 + %108 = insertelement <1 x i32> poison, i32 %104, i64 0, !dbg !15 + %109 = insertelement <1 x i32> poison, i32 %105, i64 0, !dbg !15 + %110 = insertelement <1 x i32> poison, i32 %106, i64 0, !dbg !15 + %111 = insertelement <2 x i32> poison, i32 %69, i64 0, !dbg !30 + %112 = insertelement <2 x i32> %111, i32 %67, i64 1, !dbg !30 + %113 = insertelement <2 x i32> poison, i32 %70, i64 0, !dbg !30 + %114 = insertelement <2 x i32> %113, i32 %68, i64 1, !dbg !30 + %115 = insertelement <2 x i32> poison, i32 %45, i64 0, !dbg !31 + %116 = insertelement <2 x i32> %115, i32 %.lobit, i64 1, !dbg !31 + %117 = or <2 x i32> %47, , !dbg !12 + %118 = extractelement <2 x i32> %117, i64 0, !dbg !20 + %119 = mul nuw nsw i32 %118, 17, !dbg !19 + %120 = add i32 %25, %119, !dbg !23 + %121 = sext i32 %120 to i64, !dbg !24 + %122 = getelementptr i32, ptr addrspace(1) %0, i64 %121, !dbg !24 + %123 = insertelement <2 x i32> poison, i32 %.lobit, i64 0, !dbg !12 + %124 = shufflevector <2 x i32> %123, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !12 + %125 = or disjoint <2 x i32> %124, , !dbg !12 + %126 = extractelement <2 x i32> %125, i64 1, !dbg !20 + %127 = mul nuw nsw i32 %126, 17, !dbg !19 + %128 = extractelement <2 x i32> %117, i64 1, !dbg !20 + %129 = mul nuw nsw i32 %128, 17, !dbg !19 + %130 = extractelement <2 x i32> %125, i64 0, !dbg !20 + %131 = mul nuw nsw i32 %130, 17, !dbg !19 + %132 = add i32 %25, %127, !dbg !23 + %133 = add i32 %25, %129, !dbg !23 + %134 = add i32 %25, %131, !dbg !23 + %135 = sext i32 %132 to i64, !dbg !24 + %136 = getelementptr i32, ptr addrspace(1) %0, i64 %135, !dbg !24 + %137 = sext i32 %133 to i64, !dbg !24 + %138 = getelementptr i32, ptr addrspace(1) %0, i64 %137, !dbg !24 + %139 = sext i32 %134 to i64, !dbg !24 + %140 = getelementptr i32, ptr addrspace(1) %0, i64 %139, !dbg !24 + %141 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %136, i1 %17) #5, !dbg !25 + %142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %138, i1 %17) #5, !dbg !25 + %143 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %140, i1 %17) #5, !dbg !25 + %144 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %122, i1 %17) #5, !dbg !25 + %145 = mul nuw nsw i32 %141, %71, !dbg !27 + %146 = mul nuw nsw i32 %142, %71, !dbg !27 + %147 = mul nuw nsw i32 %143, %71, !dbg !27 + %148 = mul nuw nsw i32 %144, %71, !dbg !27 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %77, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %79, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %81, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %83, i1 true) #5, !dbg !15 + %149 = insertelement <1 x i32> poison, i32 %145, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %149, i1 true) #5, !dbg !15 + %150 = insertelement <1 x i32> poison, i32 %146, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %150, i1 true) #5, !dbg !15 + %151 = insertelement <1 x i32> poison, i32 %147, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %151, i1 true) #5, !dbg !15 + %152 = insertelement <1 x i32> poison, i32 %148, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %152, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %153 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 1, i32 31), !dbg !15 + %155 = add i32 %154, %153, !dbg !32 + %156 = insertelement <1 x i32> poison, i32 %155, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %156, i1 %37) #5, !dbg !15 + %157 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !15 + %159 = add i32 %158, %157, !dbg !32 + %160 = insertelement <1 x i32> poison, i32 %159, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %160, i1 %37) #5, !dbg !15 + %161 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 1, i32 31), !dbg !15 + %163 = add i32 %162, %161, !dbg !32 + %164 = insertelement <1 x i32> poison, i32 %163, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %164, i1 %37) #5, !dbg !15 + %165 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !15 + %167 = add i32 %166, %165, !dbg !32 + %168 = insertelement <1 x i32> poison, i32 %167, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %168, i1 %37) #5, !dbg !15 + %169 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !15 + %171 = add i32 %170, %169, !dbg !32 + %172 = insertelement <1 x i32> poison, i32 %171, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %172, i1 %37) #5, !dbg !15 + %173 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !15 + %175 = add i32 %174, %173, !dbg !32 + %176 = insertelement <1 x i32> poison, i32 %175, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %176, i1 %37) #5, !dbg !15 + %177 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !15 + %179 = add i32 %178, %177, !dbg !32 + %180 = insertelement <1 x i32> poison, i32 %179, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %180, i1 %37) #5, !dbg !15 + %181 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %181, i32 1, i32 31), !dbg !15 + %183 = add i32 %182, %181, !dbg !32 + %184 = insertelement <1 x i32> poison, i32 %183, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %184, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %185 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %186 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %187 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %188 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %189 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %190 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %191 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %192 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %193 = mul nuw nsw i32 %141, %.lobit, !dbg !28 + %194 = mul nuw nsw i32 %142, %.lobit, !dbg !28 + %195 = mul nuw nsw i32 %143, %.lobit, !dbg !28 + %196 = mul nuw nsw i32 %144, %.lobit, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %92, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %93, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %94, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %95, i1 true) #5, !dbg !15 + %197 = insertelement <1 x i32> poison, i32 %193, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %197, i1 true) #5, !dbg !15 + %198 = insertelement <1 x i32> poison, i32 %194, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %198, i1 true) #5, !dbg !15 + %199 = insertelement <1 x i32> poison, i32 %195, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %199, i1 true) #5, !dbg !15 + %200 = insertelement <1 x i32> poison, i32 %196, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %200, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %201 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 1, i32 31), !dbg !15 + %203 = add i32 %202, %201, !dbg !32 + %204 = insertelement <1 x i32> poison, i32 %203, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %204, i1 %37) #5, !dbg !15 + %205 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %205, i32 1, i32 31), !dbg !15 + %207 = add i32 %206, %205, !dbg !32 + %208 = insertelement <1 x i32> poison, i32 %207, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %208, i1 %37) #5, !dbg !15 + %209 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !15 + %211 = add i32 %210, %209, !dbg !32 + %212 = insertelement <1 x i32> poison, i32 %211, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %212, i1 %37) #5, !dbg !15 + %213 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !15 + %215 = add i32 %214, %213, !dbg !32 + %216 = insertelement <1 x i32> poison, i32 %215, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %216, i1 %37) #5, !dbg !15 + %217 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %218 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %217, i32 1, i32 31), !dbg !15 + %219 = add i32 %218, %217, !dbg !32 + %220 = insertelement <1 x i32> poison, i32 %219, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %220, i1 %37) #5, !dbg !15 + %221 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 1, i32 31), !dbg !15 + %223 = add i32 %222, %221, !dbg !32 + %224 = insertelement <1 x i32> poison, i32 %223, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %224, i1 %37) #5, !dbg !15 + %225 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %225, i32 1, i32 31), !dbg !15 + %227 = add i32 %226, %225, !dbg !32 + %228 = insertelement <1 x i32> poison, i32 %227, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %228, i1 %37) #5, !dbg !15 + %229 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 1, i32 31), !dbg !15 + %231 = add i32 %230, %229, !dbg !32 + %232 = insertelement <1 x i32> poison, i32 %231, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %232, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %233 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %234 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %235 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %236 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %237 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %238 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %239 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %240 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %241 = mul nuw nsw i32 %126, %71, !dbg !29 + %242 = mul nuw nsw i32 %128, %71, !dbg !29 + %243 = mul nuw nsw i32 %130, %71, !dbg !29 + %244 = mul nuw nsw i32 %71, %118, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %100, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %101, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %102, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %103, i1 true) #5, !dbg !15 + %245 = insertelement <1 x i32> poison, i32 %241, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %245, i1 true) #5, !dbg !15 + %246 = insertelement <1 x i32> poison, i32 %242, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %246, i1 true) #5, !dbg !15 + %247 = insertelement <1 x i32> poison, i32 %243, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %247, i1 true) #5, !dbg !15 + %248 = insertelement <1 x i32> poison, i32 %244, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %248, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %249 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !15 + %251 = add i32 %250, %249, !dbg !32 + %252 = insertelement <1 x i32> poison, i32 %251, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %252, i1 %37) #5, !dbg !15 + %253 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %254 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %253, i32 1, i32 31), !dbg !15 + %255 = add i32 %254, %253, !dbg !32 + %256 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %256, i1 %37) #5, !dbg !15 + %257 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %258 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %257, i32 1, i32 31), !dbg !15 + %259 = add i32 %258, %257, !dbg !32 + %260 = insertelement <1 x i32> poison, i32 %259, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %260, i1 %37) #5, !dbg !15 + %261 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 1, i32 31), !dbg !15 + %263 = add i32 %262, %261, !dbg !32 + %264 = insertelement <1 x i32> poison, i32 %263, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %264, i1 %37) #5, !dbg !15 + %265 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 1, i32 31), !dbg !15 + %267 = add i32 %266, %265, !dbg !32 + %268 = insertelement <1 x i32> poison, i32 %267, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %268, i1 %37) #5, !dbg !15 + %269 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !15 + %271 = add i32 %270, %269, !dbg !32 + %272 = insertelement <1 x i32> poison, i32 %271, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %272, i1 %37) #5, !dbg !15 + %273 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 1, i32 31), !dbg !15 + %275 = add i32 %274, %273, !dbg !32 + %276 = insertelement <1 x i32> poison, i32 %275, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %276, i1 %37) #5, !dbg !15 + %277 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 1, i32 31), !dbg !15 + %279 = add i32 %278, %277, !dbg !32 + %280 = insertelement <1 x i32> poison, i32 %279, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %280, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %281 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %282 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %283 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %284 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %285 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %286 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %287 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %288 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %289 = mul nuw nsw i32 %126, %.lobit, !dbg !20 + %290 = mul nuw nsw i32 %128, %.lobit, !dbg !20 + %291 = mul nuw nsw i32 %130, %.lobit, !dbg !20 + %292 = mul nuw nsw i32 %118, %.lobit, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %107, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %108, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %109, i1 true) #5, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %110, i1 true) #5, !dbg !15 + %293 = insertelement <1 x i32> poison, i32 %289, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %293, i1 true) #5, !dbg !15 + %294 = insertelement <1 x i32> poison, i32 %290, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %294, i1 true) #5, !dbg !15 + %295 = insertelement <1 x i32> poison, i32 %291, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %295, i1 true) #5, !dbg !15 + %296 = insertelement <1 x i32> poison, i32 %292, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %296, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %297 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %297, i32 1, i32 31), !dbg !15 + %299 = add i32 %298, %297, !dbg !32 + %300 = insertelement <1 x i32> poison, i32 %299, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %300, i1 %37) #5, !dbg !15 + %301 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 1, i32 31), !dbg !15 + %303 = add i32 %302, %301, !dbg !32 + %304 = insertelement <1 x i32> poison, i32 %303, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %304, i1 %37) #5, !dbg !15 + %305 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %306 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %305, i32 1, i32 31), !dbg !15 + %307 = add i32 %306, %305, !dbg !32 + %308 = insertelement <1 x i32> poison, i32 %307, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %308, i1 %37) #5, !dbg !15 + %309 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 1, i32 31), !dbg !15 + %311 = add i32 %310, %309, !dbg !32 + %312 = insertelement <1 x i32> poison, i32 %311, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %312, i1 %37) #5, !dbg !15 + %313 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %314 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %313, i32 1, i32 31), !dbg !15 + %315 = add i32 %314, %313, !dbg !32 + %316 = insertelement <1 x i32> poison, i32 %315, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %316, i1 %37) #5, !dbg !15 + %317 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !15 + %319 = add i32 %318, %317, !dbg !32 + %320 = insertelement <1 x i32> poison, i32 %319, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %320, i1 %37) #5, !dbg !15 + %321 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 1, i32 31), !dbg !15 + %323 = add i32 %322, %321, !dbg !32 + %324 = insertelement <1 x i32> poison, i32 %323, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %324, i1 %37) #5, !dbg !15 + %325 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 1, i32 31), !dbg !15 + %327 = add i32 %326, %325, !dbg !32 + %328 = insertelement <1 x i32> poison, i32 %327, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %328, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %329 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %330 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %331 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %332 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %333 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %334 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %335 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %336 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %337 = insertelement <2 x i32> poison, i32 %187, i64 0, !dbg !33 + %338 = insertelement <2 x i32> %337, i32 %185, i64 1, !dbg !33 + %339 = insertelement <2 x i32> poison, i32 %235, i64 0, !dbg !33 + %340 = insertelement <2 x i32> %339, i32 %233, i64 1, !dbg !33 + %341 = icmp slt <2 x i32> %338, %340, !dbg !33 + %342 = insertelement <2 x i32> poison, i32 %188, i64 0, !dbg !33 + %343 = insertelement <2 x i32> %342, i32 %186, i64 1, !dbg !33 + %344 = insertelement <2 x i32> poison, i32 %236, i64 0, !dbg !33 + %345 = insertelement <2 x i32> %344, i32 %234, i64 1, !dbg !33 + %346 = icmp sge <2 x i32> %343, %345, !dbg !33 + %347 = insertelement <2 x i32> poison, i32 %191, i64 0, !dbg !33 + %348 = insertelement <2 x i32> %347, i32 %189, i64 1, !dbg !33 + %349 = insertelement <2 x i32> poison, i32 %239, i64 0, !dbg !33 + %350 = insertelement <2 x i32> %349, i32 %237, i64 1, !dbg !33 + %351 = icmp slt <2 x i32> %348, %350, !dbg !33 + %352 = insertelement <2 x i32> poison, i32 %192, i64 0, !dbg !33 + %353 = insertelement <2 x i32> %352, i32 %190, i64 1, !dbg !33 + %354 = insertelement <2 x i32> poison, i32 %240, i64 0, !dbg !33 + %355 = insertelement <2 x i32> %354, i32 %238, i64 1, !dbg !33 + %356 = icmp sge <2 x i32> %353, %355, !dbg !33 + %357 = icmp eq <2 x i32> %338, %340, !dbg !34 + %358 = icmp ne <2 x i32> %343, %345, !dbg !34 + %359 = icmp eq <2 x i32> %348, %350, !dbg !34 + %360 = icmp ne <2 x i32> %353, %355, !dbg !34 + %361 = insertelement <2 x i32> poison, i32 %283, i64 0, !dbg !35 + %362 = insertelement <2 x i32> %361, i32 %281, i64 1, !dbg !35 + %363 = insertelement <2 x i32> poison, i32 %331, i64 0, !dbg !35 + %364 = insertelement <2 x i32> %363, i32 %329, i64 1, !dbg !35 + %365 = icmp sgt <2 x i32> %362, %364, !dbg !35 + %366 = insertelement <2 x i32> poison, i32 %284, i64 0, !dbg !35 + %367 = insertelement <2 x i32> %366, i32 %282, i64 1, !dbg !35 + %368 = insertelement <2 x i32> poison, i32 %332, i64 0, !dbg !35 + %369 = insertelement <2 x i32> %368, i32 %330, i64 1, !dbg !35 + %370 = icmp sle <2 x i32> %367, %369, !dbg !35 + %371 = insertelement <2 x i32> poison, i32 %287, i64 0, !dbg !35 + %372 = insertelement <2 x i32> %371, i32 %285, i64 1, !dbg !35 + %373 = insertelement <2 x i32> poison, i32 %335, i64 0, !dbg !35 + %374 = insertelement <2 x i32> %373, i32 %333, i64 1, !dbg !35 + %375 = icmp sgt <2 x i32> %372, %374, !dbg !35 + %376 = insertelement <2 x i32> poison, i32 %288, i64 0, !dbg !35 + %377 = insertelement <2 x i32> %376, i32 %286, i64 1, !dbg !35 + %378 = insertelement <2 x i32> poison, i32 %336, i64 0, !dbg !35 + %379 = insertelement <2 x i32> %378, i32 %334, i64 1, !dbg !35 + %380 = icmp sle <2 x i32> %377, %379, !dbg !35 + %381 = and <2 x i1> %357, %365, !dbg !36 + %382 = or <2 x i1> %358, %370, !dbg !36 + %383 = and <2 x i1> %359, %375, !dbg !36 + %384 = or <2 x i1> %360, %380, !dbg !37 + %385 = or <2 x i1> %341, %381, !dbg !37 + %386 = and <2 x i1> %346, %382, !dbg !37 + %387 = or <2 x i1> %351, %383, !dbg !37 + %388 = and <2 x i1> %356, %384, !dbg !38 + %389 = xor <2 x i32> %340, %338, !dbg !39 + %390 = xor <2 x i32> %345, %343, !dbg !39 + %391 = xor <2 x i32> %350, %348, !dbg !39 + %392 = xor <2 x i32> %355, %353, !dbg !39 + %393 = select <2 x i1> %385, <2 x i32> %389, <2 x i32> zeroinitializer, !dbg !40 + %394 = select <2 x i1> %386, <2 x i32> %390, <2 x i32> zeroinitializer, !dbg !40 + %395 = select <2 x i1> %387, <2 x i32> %391, <2 x i32> zeroinitializer, !dbg !40 + %396 = select <2 x i1> %388, <2 x i32> %392, <2 x i32> zeroinitializer, !dbg !40 + %397 = xor <2 x i32> %393, %112, !dbg !30 + %398 = xor <2 x i32> %394, %114, !dbg !30 + %399 = insertelement <2 x i32> poison, i32 %143, i64 0, !dbg !30 + %400 = insertelement <2 x i32> %399, i32 %141, i64 1, !dbg !30 + %401 = xor <2 x i32> %395, %400, !dbg !30 + %402 = insertelement <2 x i32> poison, i32 %144, i64 0, !dbg !30 + %403 = insertelement <2 x i32> %402, i32 %142, i64 1, !dbg !30 + %404 = xor <2 x i32> %396, %403, !dbg !30 + %405 = xor <2 x i32> %364, %362, !dbg !41 + %406 = xor <2 x i32> %369, %367, !dbg !41 + %407 = xor <2 x i32> %374, %372, !dbg !41 + %408 = xor <2 x i32> %379, %377, !dbg !41 + %409 = select <2 x i1> %385, <2 x i32> %405, <2 x i32> zeroinitializer, !dbg !42 + %410 = select <2 x i1> %386, <2 x i32> %406, <2 x i32> zeroinitializer, !dbg !42 + %411 = select <2 x i1> %387, <2 x i32> %407, <2 x i32> zeroinitializer, !dbg !42 + %412 = select <2 x i1> %388, <2 x i32> %408, <2 x i32> zeroinitializer, !dbg !42 + %413 = xor <2 x i32> %409, %116, !dbg !31 + %414 = xor <2 x i32> %410, %48, !dbg !31 + %415 = xor <2 x i32> %411, %125, !dbg !31 + %416 = xor <2 x i32> %412, %117, !dbg !31 + %417 = icmp sge <2 x i32> %397, %398, !dbg !33 + %418 = icmp slt <2 x i32> %397, %398, !dbg !33 + %419 = shufflevector <2 x i1> %417, <2 x i1> %418, <2 x i32> , !dbg !33 + %420 = icmp sge <2 x i32> %401, %404, !dbg !33 + %421 = icmp slt <2 x i32> %401, %404, !dbg !33 + %422 = shufflevector <2 x i1> %420, <2 x i1> %421, <2 x i32> , !dbg !33 + %423 = icmp ne <2 x i32> %397, %398, !dbg !34 + %424 = icmp eq <2 x i32> %397, %398, !dbg !34 + %425 = shufflevector <2 x i1> %423, <2 x i1> %424, <2 x i32> , !dbg !34 + %426 = icmp ne <2 x i32> %401, %404, !dbg !34 + %427 = icmp eq <2 x i32> %401, %404, !dbg !34 + %428 = shufflevector <2 x i1> %426, <2 x i1> %427, <2 x i32> , !dbg !34 + %429 = icmp sle <2 x i32> %413, %414, !dbg !35 + %430 = icmp sgt <2 x i32> %413, %414, !dbg !35 + %431 = shufflevector <2 x i1> %429, <2 x i1> %430, <2 x i32> , !dbg !35 + %432 = icmp sle <2 x i32> %415, %416, !dbg !35 + %433 = icmp sgt <2 x i32> %415, %416, !dbg !35 + %434 = shufflevector <2 x i1> %432, <2 x i1> %433, <2 x i32> , !dbg !35 + %435 = or <2 x i1> %425, %431, !dbg !36 + %436 = and <2 x i1> %425, %431, !dbg !36 + %437 = shufflevector <2 x i1> %435, <2 x i1> %436, <2 x i32> , !dbg !36 + %438 = or <2 x i1> %428, %434, !dbg !37 + %439 = and <2 x i1> %428, %434, !dbg !37 + %440 = shufflevector <2 x i1> %438, <2 x i1> %439, <2 x i32> , !dbg !37 + %441 = and <2 x i1> %419, %437, !dbg !37 + %442 = or <2 x i1> %419, %437, !dbg !37 + %443 = shufflevector <2 x i1> %441, <2 x i1> %442, <2 x i32> , !dbg !37 + %444 = and <2 x i1> %422, %440, !dbg !38 + %445 = or <2 x i1> %422, %440, !dbg !38 + %446 = shufflevector <2 x i1> %444, <2 x i1> %445, <2 x i32> , !dbg !38 + %447 = xor <2 x i32> %398, %397, !dbg !39 + %448 = xor <2 x i32> %404, %401, !dbg !39 + %449 = select <2 x i1> %443, <2 x i32> %447, <2 x i32> zeroinitializer, !dbg !40 + %450 = select <2 x i1> %446, <2 x i32> %448, <2 x i32> zeroinitializer, !dbg !40 + %foldExtExtBinop = xor <2 x i32> %449, %397, !dbg !30 + %451 = extractelement <2 x i32> %foldExtExtBinop, i64 1, !dbg !30 + %foldExtExtBinop81 = xor <2 x i32> %449, %397, !dbg !30 + %452 = extractelement <2 x i32> %foldExtExtBinop81, i64 0, !dbg !30 + %453 = xor <2 x i32> %449, %398, !dbg !30 + %foldExtExtBinop83 = xor <2 x i32> %450, %401, !dbg !30 + %454 = extractelement <2 x i32> %foldExtExtBinop83, i64 1, !dbg !30 + %foldExtExtBinop85 = xor <2 x i32> %450, %401, !dbg !30 + %455 = extractelement <2 x i32> %foldExtExtBinop85, i64 0, !dbg !30 + %456 = xor <2 x i32> %450, %404, !dbg !30 + %457 = extractelement <2 x i32> %413, i64 1, !dbg !41 + %458 = extractelement <2 x i32> %414, i64 1, !dbg !41 + %459 = xor i32 %458, %457, !dbg !41 + %460 = extractelement <2 x i32> %413, i64 0, !dbg !41 + %461 = extractelement <2 x i32> %414, i64 0, !dbg !41 + %462 = xor i32 %461, %460, !dbg !41 + %463 = extractelement <2 x i32> %415, i64 1, !dbg !41 + %464 = extractelement <2 x i32> %416, i64 1, !dbg !41 + %465 = xor i32 %464, %463, !dbg !41 + %466 = extractelement <2 x i32> %415, i64 0, !dbg !41 + %467 = extractelement <2 x i32> %416, i64 0, !dbg !41 + %468 = xor i32 %467, %466, !dbg !41 + %469 = extractelement <2 x i1> %442, i64 1, !dbg !42 + %470 = select i1 %469, i32 %459, i32 0, !dbg !42 + %471 = extractelement <2 x i1> %441, i64 0, !dbg !42 + %472 = select i1 %471, i32 %462, i32 0, !dbg !42 + %473 = extractelement <2 x i1> %445, i64 1, !dbg !42 + %474 = select i1 %473, i32 %465, i32 0, !dbg !42 + %475 = extractelement <2 x i1> %444, i64 0, !dbg !42 + %476 = select i1 %475, i32 %468, i32 0, !dbg !42 + %477 = xor i32 %470, %457, !dbg !31 + %478 = xor i32 %470, %458, !dbg !31 + %479 = xor i32 %472, %460, !dbg !31 + %480 = xor i32 %472, %461, !dbg !31 + %481 = xor i32 %474, %463, !dbg !31 + %482 = xor i32 %474, %464, !dbg !31 + %483 = xor i32 %476, %466, !dbg !31 + %484 = xor i32 %476, %467, !dbg !31 + %485 = mul nuw nsw i32 %451, %71, !dbg !27 + %486 = extractelement <2 x i32> %453, i64 1, !dbg !28 + %487 = mul nuw nsw i32 %486, %71, !dbg !27 + %488 = mul nuw nsw i32 %452, %71, !dbg !27 + %489 = extractelement <2 x i32> %453, i64 0, !dbg !28 + %490 = mul nuw nsw i32 %489, %71, !dbg !27 + %491 = mul nuw nsw i32 %454, %71, !dbg !27 + %492 = extractelement <2 x i32> %456, i64 1, !dbg !28 + %493 = mul nuw nsw i32 %492, %71, !dbg !27 + %494 = mul nuw nsw i32 %455, %71, !dbg !27 + %495 = extractelement <2 x i32> %456, i64 0, !dbg !28 + %496 = mul nuw nsw i32 %495, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %497 = insertelement <1 x i32> poison, i32 %485, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %497, i1 true) #5, !dbg !15 + %498 = insertelement <1 x i32> poison, i32 %487, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %498, i1 true) #5, !dbg !15 + %499 = insertelement <1 x i32> poison, i32 %488, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %499, i1 true) #5, !dbg !15 + %500 = insertelement <1 x i32> poison, i32 %490, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %500, i1 true) #5, !dbg !15 + %501 = insertelement <1 x i32> poison, i32 %491, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %501, i1 true) #5, !dbg !15 + %502 = insertelement <1 x i32> poison, i32 %493, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %502, i1 true) #5, !dbg !15 + %503 = insertelement <1 x i32> poison, i32 %494, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %503, i1 true) #5, !dbg !15 + %504 = insertelement <1 x i32> poison, i32 %496, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %504, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %505 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %506 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %505, i32 1, i32 31), !dbg !15 + %507 = add i32 %506, %505, !dbg !32 + %508 = insertelement <1 x i32> poison, i32 %507, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %508, i1 %37) #5, !dbg !15 + %509 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %510 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %509, i32 1, i32 31), !dbg !15 + %511 = add i32 %510, %509, !dbg !32 + %512 = insertelement <1 x i32> poison, i32 %511, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %512, i1 %37) #5, !dbg !15 + %513 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 1, i32 31), !dbg !15 + %515 = add i32 %514, %513, !dbg !32 + %516 = insertelement <1 x i32> poison, i32 %515, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %516, i1 %37) #5, !dbg !15 + %517 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 1, i32 31), !dbg !15 + %519 = add i32 %518, %517, !dbg !32 + %520 = insertelement <1 x i32> poison, i32 %519, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %520, i1 %37) #5, !dbg !15 + %521 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 1, i32 31), !dbg !15 + %523 = add i32 %522, %521, !dbg !32 + %524 = insertelement <1 x i32> poison, i32 %523, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %524, i1 %37) #5, !dbg !15 + %525 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %525, i32 1, i32 31), !dbg !15 + %527 = add i32 %526, %525, !dbg !32 + %528 = insertelement <1 x i32> poison, i32 %527, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %528, i1 %37) #5, !dbg !15 + %529 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 1, i32 31), !dbg !15 + %531 = add i32 %530, %529, !dbg !32 + %532 = insertelement <1 x i32> poison, i32 %531, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %532, i1 %37) #5, !dbg !15 + %533 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %534 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %533, i32 1, i32 31), !dbg !15 + %535 = add i32 %534, %533, !dbg !32 + %536 = insertelement <1 x i32> poison, i32 %535, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %536, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %537 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %538 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %539 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %540 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %541 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %542 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %543 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %544 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %545 = mul nuw nsw i32 %451, %.lobit, !dbg !28 + %546 = mul nuw nsw i32 %486, %.lobit, !dbg !28 + %547 = mul nuw nsw i32 %452, %.lobit, !dbg !28 + %548 = mul nuw nsw i32 %489, %.lobit, !dbg !28 + %549 = mul nuw nsw i32 %454, %.lobit, !dbg !28 + %550 = mul nuw nsw i32 %492, %.lobit, !dbg !28 + %551 = mul nuw nsw i32 %455, %.lobit, !dbg !28 + %552 = mul nuw nsw i32 %495, %.lobit, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %553 = insertelement <1 x i32> poison, i32 %545, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %553, i1 true) #5, !dbg !15 + %554 = insertelement <1 x i32> poison, i32 %546, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %554, i1 true) #5, !dbg !15 + %555 = insertelement <1 x i32> poison, i32 %547, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %555, i1 true) #5, !dbg !15 + %556 = insertelement <1 x i32> poison, i32 %548, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %556, i1 true) #5, !dbg !15 + %557 = insertelement <1 x i32> poison, i32 %549, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %557, i1 true) #5, !dbg !15 + %558 = insertelement <1 x i32> poison, i32 %550, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %558, i1 true) #5, !dbg !15 + %559 = insertelement <1 x i32> poison, i32 %551, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %559, i1 true) #5, !dbg !15 + %560 = insertelement <1 x i32> poison, i32 %552, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %560, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %561 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %562 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %561, i32 1, i32 31), !dbg !15 + %563 = add i32 %562, %561, !dbg !32 + %564 = insertelement <1 x i32> poison, i32 %563, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %564, i1 %37) #5, !dbg !15 + %565 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 1, i32 31), !dbg !15 + %567 = add i32 %566, %565, !dbg !32 + %568 = insertelement <1 x i32> poison, i32 %567, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %568, i1 %37) #5, !dbg !15 + %569 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %569, i32 1, i32 31), !dbg !15 + %571 = add i32 %570, %569, !dbg !32 + %572 = insertelement <1 x i32> poison, i32 %571, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %572, i1 %37) #5, !dbg !15 + %573 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %574 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %573, i32 1, i32 31), !dbg !15 + %575 = add i32 %574, %573, !dbg !32 + %576 = insertelement <1 x i32> poison, i32 %575, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %576, i1 %37) #5, !dbg !15 + %577 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %578 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %577, i32 1, i32 31), !dbg !15 + %579 = add i32 %578, %577, !dbg !32 + %580 = insertelement <1 x i32> poison, i32 %579, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %580, i1 %37) #5, !dbg !15 + %581 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %582 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %581, i32 1, i32 31), !dbg !15 + %583 = add i32 %582, %581, !dbg !32 + %584 = insertelement <1 x i32> poison, i32 %583, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %584, i1 %37) #5, !dbg !15 + %585 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %586 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 1, i32 31), !dbg !15 + %587 = add i32 %586, %585, !dbg !32 + %588 = insertelement <1 x i32> poison, i32 %587, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %588, i1 %37) #5, !dbg !15 + %589 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %590 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %589, i32 1, i32 31), !dbg !15 + %591 = add i32 %590, %589, !dbg !32 + %592 = insertelement <1 x i32> poison, i32 %591, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %592, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %593 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %594 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %595 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %596 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %597 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %598 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %599 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %600 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %601 = mul nuw nsw i32 %477, %71, !dbg !29 + %602 = mul nuw nsw i32 %478, %71, !dbg !29 + %603 = mul nuw nsw i32 %479, %71, !dbg !29 + %604 = mul nuw nsw i32 %480, %71, !dbg !29 + %605 = mul nuw nsw i32 %481, %71, !dbg !29 + %606 = mul nuw nsw i32 %482, %71, !dbg !29 + %607 = mul nuw nsw i32 %483, %71, !dbg !29 + %608 = mul nuw nsw i32 %484, %71, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %609 = insertelement <1 x i32> poison, i32 %601, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %609, i1 true) #5, !dbg !15 + %610 = insertelement <1 x i32> poison, i32 %602, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %610, i1 true) #5, !dbg !15 + %611 = insertelement <1 x i32> poison, i32 %603, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %611, i1 true) #5, !dbg !15 + %612 = insertelement <1 x i32> poison, i32 %604, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %612, i1 true) #5, !dbg !15 + %613 = insertelement <1 x i32> poison, i32 %605, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %613, i1 true) #5, !dbg !15 + %614 = insertelement <1 x i32> poison, i32 %606, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %614, i1 true) #5, !dbg !15 + %615 = insertelement <1 x i32> poison, i32 %607, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %615, i1 true) #5, !dbg !15 + %616 = insertelement <1 x i32> poison, i32 %608, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %616, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %617 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %618 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %617, i32 1, i32 31), !dbg !15 + %619 = add i32 %618, %617, !dbg !32 + %620 = insertelement <1 x i32> poison, i32 %619, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %620, i1 %37) #5, !dbg !15 + %621 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %622 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %621, i32 1, i32 31), !dbg !15 + %623 = add i32 %622, %621, !dbg !32 + %624 = insertelement <1 x i32> poison, i32 %623, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %624, i1 %37) #5, !dbg !15 + %625 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %626 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 1, i32 31), !dbg !15 + %627 = add i32 %626, %625, !dbg !32 + %628 = insertelement <1 x i32> poison, i32 %627, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %628, i1 %37) #5, !dbg !15 + %629 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %630 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %629, i32 1, i32 31), !dbg !15 + %631 = add i32 %630, %629, !dbg !32 + %632 = insertelement <1 x i32> poison, i32 %631, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %632, i1 %37) #5, !dbg !15 + %633 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %634 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %633, i32 1, i32 31), !dbg !15 + %635 = add i32 %634, %633, !dbg !32 + %636 = insertelement <1 x i32> poison, i32 %635, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %636, i1 %37) #5, !dbg !15 + %637 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 1, i32 31), !dbg !15 + %639 = add i32 %638, %637, !dbg !32 + %640 = insertelement <1 x i32> poison, i32 %639, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %640, i1 %37) #5, !dbg !15 + %641 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %642 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %641, i32 1, i32 31), !dbg !15 + %643 = add i32 %642, %641, !dbg !32 + %644 = insertelement <1 x i32> poison, i32 %643, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %644, i1 %37) #5, !dbg !15 + %645 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %646 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %645, i32 1, i32 31), !dbg !15 + %647 = add i32 %646, %645, !dbg !32 + %648 = insertelement <1 x i32> poison, i32 %647, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %648, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %649 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %650 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %651 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %652 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %653 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %654 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %655 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %656 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %657 = mul nuw nsw i32 %477, %.lobit, !dbg !20 + %658 = mul nuw nsw i32 %478, %.lobit, !dbg !20 + %659 = mul nuw nsw i32 %479, %.lobit, !dbg !20 + %660 = mul nuw nsw i32 %480, %.lobit, !dbg !20 + %661 = mul nuw nsw i32 %481, %.lobit, !dbg !20 + %662 = mul nuw nsw i32 %482, %.lobit, !dbg !20 + %663 = mul nuw nsw i32 %483, %.lobit, !dbg !20 + %664 = mul nuw nsw i32 %484, %.lobit, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %665 = insertelement <1 x i32> poison, i32 %657, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %665, i1 true) #5, !dbg !15 + %666 = insertelement <1 x i32> poison, i32 %658, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %666, i1 true) #5, !dbg !15 + %667 = insertelement <1 x i32> poison, i32 %659, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %667, i1 true) #5, !dbg !15 + %668 = insertelement <1 x i32> poison, i32 %660, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %668, i1 true) #5, !dbg !15 + %669 = insertelement <1 x i32> poison, i32 %661, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %669, i1 true) #5, !dbg !15 + %670 = insertelement <1 x i32> poison, i32 %662, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %670, i1 true) #5, !dbg !15 + %671 = insertelement <1 x i32> poison, i32 %663, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %671, i1 true) #5, !dbg !15 + %672 = insertelement <1 x i32> poison, i32 %664, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %672, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %673 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %674 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %673, i32 1, i32 31), !dbg !15 + %675 = add i32 %674, %673, !dbg !32 + %676 = insertelement <1 x i32> poison, i32 %675, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %676, i1 %37) #5, !dbg !15 + %677 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %678 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %677, i32 1, i32 31), !dbg !15 + %679 = add i32 %678, %677, !dbg !32 + %680 = insertelement <1 x i32> poison, i32 %679, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %680, i1 %37) #5, !dbg !15 + %681 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %682 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %681, i32 1, i32 31), !dbg !15 + %683 = add i32 %682, %681, !dbg !32 + %684 = insertelement <1 x i32> poison, i32 %683, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %684, i1 %37) #5, !dbg !15 + %685 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %686 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %685, i32 1, i32 31), !dbg !15 + %687 = add i32 %686, %685, !dbg !32 + %688 = insertelement <1 x i32> poison, i32 %687, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %688, i1 %37) #5, !dbg !15 + %689 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %690 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %689, i32 1, i32 31), !dbg !15 + %691 = add i32 %690, %689, !dbg !32 + %692 = insertelement <1 x i32> poison, i32 %691, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %692, i1 %37) #5, !dbg !15 + %693 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %694 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %693, i32 1, i32 31), !dbg !15 + %695 = add i32 %694, %693, !dbg !32 + %696 = insertelement <1 x i32> poison, i32 %695, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %696, i1 %37) #5, !dbg !15 + %697 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 1, i32 31), !dbg !15 + %699 = add i32 %698, %697, !dbg !32 + %700 = insertelement <1 x i32> poison, i32 %699, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %700, i1 %37) #5, !dbg !15 + %701 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %702 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %701, i32 1, i32 31), !dbg !15 + %703 = add i32 %702, %701, !dbg !32 + %704 = insertelement <1 x i32> poison, i32 %703, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %704, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %705 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %706 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %707 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %708 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %709 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %710 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %711 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %712 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %713 = icmp slt i32 %537, %593, !dbg !33 + %714 = icmp slt i32 %538, %594, !dbg !33 + %715 = icmp sge i32 %539, %595, !dbg !33 + %716 = icmp sge i32 %540, %596, !dbg !33 + %717 = icmp slt i32 %541, %597, !dbg !33 + %718 = icmp slt i32 %542, %598, !dbg !33 + %719 = icmp sge i32 %543, %599, !dbg !33 + %720 = icmp sge i32 %544, %600, !dbg !33 + %721 = icmp eq i32 %537, %593, !dbg !34 + %722 = icmp eq i32 %538, %594, !dbg !34 + %723 = icmp ne i32 %539, %595, !dbg !34 + %724 = icmp ne i32 %540, %596, !dbg !34 + %725 = icmp eq i32 %541, %597, !dbg !34 + %726 = icmp eq i32 %542, %598, !dbg !34 + %727 = icmp ne i32 %543, %599, !dbg !34 + %728 = icmp ne i32 %544, %600, !dbg !34 + %729 = icmp sgt i32 %649, %705, !dbg !35 + %730 = icmp sgt i32 %650, %706, !dbg !35 + %731 = icmp sle i32 %651, %707, !dbg !35 + %732 = icmp sle i32 %652, %708, !dbg !35 + %733 = icmp sgt i32 %653, %709, !dbg !35 + %734 = icmp sgt i32 %654, %710, !dbg !35 + %735 = icmp sle i32 %655, %711, !dbg !35 + %736 = icmp sle i32 %656, %712, !dbg !35 + %737 = and i1 %721, %729, !dbg !36 + %.not28 = or i1 %723, %731, !dbg !36 + %738 = insertelement <2 x i1> poison, i1 %724, i64 0, !dbg !36 + %739 = insertelement <2 x i1> %738, i1 %722, i64 1, !dbg !36 + %740 = insertelement <2 x i1> poison, i1 %732, i64 0, !dbg !36 + %741 = insertelement <2 x i1> %740, i1 %730, i64 1, !dbg !36 + %742 = or <2 x i1> %739, %741, !dbg !36 + %743 = and <2 x i1> %739, %741, !dbg !36 + %744 = shufflevector <2 x i1> %742, <2 x i1> %743, <2 x i32> , !dbg !36 + %745 = and i1 %725, %733, !dbg !36 + %.not36 = or i1 %727, %735, !dbg !36 + %746 = insertelement <2 x i1> poison, i1 %728, i64 0, !dbg !37 + %747 = insertelement <2 x i1> %746, i1 %726, i64 1, !dbg !37 + %748 = insertelement <2 x i1> poison, i1 %736, i64 0, !dbg !37 + %749 = insertelement <2 x i1> %748, i1 %734, i64 1, !dbg !37 + %750 = or <2 x i1> %747, %749, !dbg !37 + %751 = and <2 x i1> %747, %749, !dbg !37 + %752 = shufflevector <2 x i1> %750, <2 x i1> %751, <2 x i32> , !dbg !37 + %753 = or i1 %713, %737, !dbg !37 + %.not26 = and i1 %715, %.not28, !dbg !37 + %754 = insertelement <2 x i1> poison, i1 %716, i64 0, !dbg !37 + %755 = insertelement <2 x i1> %754, i1 %714, i64 1, !dbg !37 + %756 = and <2 x i1> %755, %744, !dbg !37 + %757 = or <2 x i1> %755, %744, !dbg !37 + %758 = shufflevector <2 x i1> %756, <2 x i1> %757, <2 x i32> , !dbg !37 + %759 = or i1 %717, %745, !dbg !37 + %.not34 = and i1 %719, %.not36, !dbg !37 + %760 = insertelement <2 x i1> poison, i1 %720, i64 0, !dbg !38 + %761 = insertelement <2 x i1> %760, i1 %718, i64 1, !dbg !38 + %762 = and <2 x i1> %761, %752, !dbg !38 + %763 = or <2 x i1> %761, %752, !dbg !38 + %764 = shufflevector <2 x i1> %762, <2 x i1> %763, <2 x i32> , !dbg !38 + %765 = xor i32 %593, %537, !dbg !39 + %766 = xor i32 %595, %539, !dbg !39 + %767 = insertelement <2 x i32> poison, i32 %596, i64 0, !dbg !39 + %768 = insertelement <2 x i32> %767, i32 %594, i64 1, !dbg !39 + %769 = insertelement <2 x i32> poison, i32 %540, i64 0, !dbg !39 + %770 = insertelement <2 x i32> %769, i32 %538, i64 1, !dbg !39 + %771 = xor <2 x i32> %768, %770, !dbg !39 + %772 = xor i32 %597, %541, !dbg !39 + %773 = xor i32 %599, %543, !dbg !39 + %774 = insertelement <2 x i32> poison, i32 %600, i64 0, !dbg !39 + %775 = insertelement <2 x i32> %774, i32 %598, i64 1, !dbg !39 + %776 = insertelement <2 x i32> poison, i32 %544, i64 0, !dbg !39 + %777 = insertelement <2 x i32> %776, i32 %542, i64 1, !dbg !39 + %778 = xor <2 x i32> %775, %777, !dbg !39 + %779 = select i1 %753, i32 %765, i32 0, !dbg !40 + %780 = select i1 %.not26, i32 %766, i32 0, !dbg !40 + %781 = select <2 x i1> %758, <2 x i32> %771, <2 x i32> zeroinitializer, !dbg !40 + %782 = select i1 %759, i32 %772, i32 0, !dbg !40 + %783 = select i1 %.not34, i32 %773, i32 0, !dbg !40 + %784 = select <2 x i1> %764, <2 x i32> %778, <2 x i32> zeroinitializer, !dbg !40 + %785 = xor i32 %779, %451, !dbg !30 + %786 = xor i32 %780, %452, !dbg !30 + %787 = xor <2 x i32> %781, %453, !dbg !30 + %788 = xor i32 %782, %454, !dbg !30 + %789 = xor i32 %783, %455, !dbg !30 + %790 = xor <2 x i32> %784, %456, !dbg !30 + %791 = xor i32 %705, %649, !dbg !41 + %792 = xor i32 %706, %650, !dbg !41 + %793 = xor i32 %707, %651, !dbg !41 + %794 = xor i32 %708, %652, !dbg !41 + %795 = xor i32 %709, %653, !dbg !41 + %796 = xor i32 %710, %654, !dbg !41 + %797 = xor i32 %711, %655, !dbg !41 + %798 = xor i32 %712, %656, !dbg !41 + %799 = select i1 %753, i32 %791, i32 0, !dbg !42 + %800 = extractelement <2 x i1> %757, i64 1, !dbg !42 + %801 = select i1 %800, i32 %792, i32 0, !dbg !42 + %802 = select i1 %.not26, i32 %793, i32 0, !dbg !42 + %803 = extractelement <2 x i1> %756, i64 0, !dbg !42 + %804 = select i1 %803, i32 %794, i32 0, !dbg !42 + %805 = select i1 %759, i32 %795, i32 0, !dbg !42 + %806 = extractelement <2 x i1> %763, i64 1, !dbg !42 + %807 = select i1 %806, i32 %796, i32 0, !dbg !42 + %808 = select i1 %.not34, i32 %797, i32 0, !dbg !42 + %809 = extractelement <2 x i1> %762, i64 0, !dbg !42 + %810 = select i1 %809, i32 %798, i32 0, !dbg !42 + %811 = xor i32 %799, %477, !dbg !31 + %812 = xor i32 %801, %478, !dbg !31 + %813 = xor i32 %802, %479, !dbg !31 + %814 = xor i32 %804, %480, !dbg !31 + %815 = xor i32 %805, %481, !dbg !31 + %816 = xor i32 %807, %482, !dbg !31 + %817 = xor i32 %808, %483, !dbg !31 + %818 = xor i32 %810, %484, !dbg !31 + %819 = icmp slt i32 %785, %786, !dbg !33 + %820 = extractelement <2 x i32> %787, i64 0, !dbg !33 + %821 = extractelement <2 x i32> %787, i64 1, !dbg !33 + %822 = icmp slt i32 %821, %820, !dbg !33 + %823 = icmp sge i32 %788, %789, !dbg !33 + %824 = extractelement <2 x i32> %790, i64 0, !dbg !33 + %825 = extractelement <2 x i32> %790, i64 1, !dbg !33 + %826 = icmp sge i32 %825, %824, !dbg !33 + %827 = icmp eq i32 %785, %786, !dbg !34 + %828 = icmp eq i32 %821, %820, !dbg !34 + %829 = icmp ne i32 %788, %789, !dbg !34 + %830 = icmp ne i32 %825, %824, !dbg !34 + %831 = icmp sgt i32 %811, %813, !dbg !35 + %832 = icmp sgt i32 %812, %814, !dbg !35 + %833 = icmp sle i32 %815, %817, !dbg !35 + %834 = icmp sle i32 %816, %818, !dbg !35 + %835 = and i1 %827, %831, !dbg !36 + %836 = and i1 %828, %832, !dbg !36 + %.not44 = or i1 %829, %833, !dbg !36 + %.not48 = or i1 %830, %834, !dbg !37 + %837 = or i1 %819, %835, !dbg !37 + %838 = or i1 %822, %836, !dbg !37 + %.not42 = and i1 %823, %.not44, !dbg !37 + %.not46 = and i1 %826, %.not48, !dbg !38 + %839 = xor i32 %786, %785, !dbg !39 + %840 = xor i32 %820, %821, !dbg !39 + %841 = xor i32 %789, %788, !dbg !39 + %842 = xor i32 %824, %825, !dbg !39 + %843 = select i1 %837, i32 %839, i32 0, !dbg !40 + %844 = select i1 %838, i32 %840, i32 0, !dbg !40 + %845 = select i1 %.not42, i32 %841, i32 0, !dbg !40 + %846 = select i1 %.not46, i32 %842, i32 0, !dbg !40 + %847 = xor i32 %843, %785, !dbg !30 + %848 = xor i32 %844, %821, !dbg !30 + %849 = xor i32 %843, %786, !dbg !30 + %850 = xor i32 %844, %820, !dbg !30 + %851 = xor i32 %845, %788, !dbg !30 + %852 = xor i32 %846, %825, !dbg !30 + %853 = xor i32 %845, %789, !dbg !30 + %854 = xor i32 %846, %824, !dbg !30 + %855 = xor i32 %813, %811, !dbg !41 + %856 = xor i32 %814, %812, !dbg !41 + %857 = xor i32 %817, %815, !dbg !41 + %858 = xor i32 %818, %816, !dbg !41 + %859 = select i1 %837, i32 %855, i32 0, !dbg !42 + %860 = select i1 %838, i32 %856, i32 0, !dbg !42 + %861 = select i1 %.not42, i32 %857, i32 0, !dbg !42 + %862 = select i1 %.not46, i32 %858, i32 0, !dbg !42 + %863 = xor i32 %859, %811, !dbg !31 + %864 = xor i32 %860, %812, !dbg !31 + %865 = xor i32 %859, %813, !dbg !31 + %866 = xor i32 %860, %814, !dbg !31 + %867 = xor i32 %861, %815, !dbg !31 + %868 = xor i32 %862, %816, !dbg !31 + %869 = xor i32 %861, %817, !dbg !31 + %870 = xor i32 %862, %818, !dbg !31 + %871 = icmp slt i32 %847, %848, !dbg !33 + %872 = icmp slt i32 %849, %850, !dbg !33 + %873 = icmp sge i32 %851, %852, !dbg !33 + %874 = icmp sge i32 %853, %854, !dbg !33 + %875 = icmp eq i32 %847, %848, !dbg !34 + %876 = icmp eq i32 %849, %850, !dbg !34 + %877 = icmp ne i32 %851, %852, !dbg !34 + %878 = icmp ne i32 %853, %854, !dbg !34 + %879 = icmp sgt i32 %863, %864, !dbg !35 + %880 = icmp sgt i32 %865, %866, !dbg !35 + %881 = icmp sle i32 %867, %868, !dbg !35 + %882 = icmp sle i32 %869, %870, !dbg !35 + %883 = and i1 %875, %879, !dbg !36 + %884 = and i1 %876, %880, !dbg !36 + %.not52 = or i1 %877, %881, !dbg !36 + %.not56 = or i1 %878, %882, !dbg !37 + %885 = or i1 %871, %883, !dbg !37 + %886 = or i1 %872, %884, !dbg !37 + %.not50 = and i1 %873, %.not52, !dbg !37 + %.not54 = and i1 %874, %.not56, !dbg !38 + %887 = xor i32 %848, %847, !dbg !39 + %888 = xor i32 %850, %849, !dbg !39 + %889 = xor i32 %852, %851, !dbg !39 + %890 = xor i32 %854, %853, !dbg !39 + %891 = select i1 %885, i32 %887, i32 0, !dbg !40 + %892 = select i1 %886, i32 %888, i32 0, !dbg !40 + %893 = select i1 %.not50, i32 %889, i32 0, !dbg !40 + %894 = select i1 %.not54, i32 %890, i32 0, !dbg !40 + %895 = xor i32 %891, %847, !dbg !30 + %896 = xor i32 %891, %848, !dbg !30 + %897 = xor i32 %892, %849, !dbg !30 + %898 = xor i32 %892, %850, !dbg !30 + %899 = xor i32 %893, %851, !dbg !30 + %900 = xor i32 %893, %852, !dbg !30 + %901 = xor i32 %894, %853, !dbg !30 + %902 = xor i32 %894, %854, !dbg !30 + %903 = xor i32 %864, %863, !dbg !41 + %904 = xor i32 %866, %865, !dbg !41 + %905 = xor i32 %868, %867, !dbg !41 + %906 = xor i32 %870, %869, !dbg !41 + %907 = select i1 %885, i32 %903, i32 0, !dbg !42 + %908 = select i1 %886, i32 %904, i32 0, !dbg !42 + %909 = select i1 %.not50, i32 %905, i32 0, !dbg !42 + %910 = select i1 %.not54, i32 %906, i32 0, !dbg !42 + %911 = xor i32 %907, %863, !dbg !31 + %912 = xor i32 %907, %864, !dbg !31 + %913 = xor i32 %908, %865, !dbg !31 + %914 = xor i32 %908, %866, !dbg !31 + %915 = xor i32 %909, %867, !dbg !31 + %916 = xor i32 %909, %868, !dbg !31 + %917 = xor i32 %910, %869, !dbg !31 + %918 = xor i32 %910, %870, !dbg !31 + %919 = mul nuw nsw i32 %895, %71, !dbg !27 + %920 = mul nuw nsw i32 %896, %71, !dbg !27 + %921 = mul nuw nsw i32 %897, %71, !dbg !27 + %922 = mul nuw nsw i32 %898, %71, !dbg !27 + %923 = mul nuw nsw i32 %899, %71, !dbg !27 + %924 = mul nuw nsw i32 %900, %71, !dbg !27 + %925 = mul nuw nsw i32 %901, %71, !dbg !27 + %926 = mul nuw nsw i32 %902, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %927 = insertelement <1 x i32> poison, i32 %919, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %927, i1 true) #5, !dbg !15 + %928 = insertelement <1 x i32> poison, i32 %920, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %928, i1 true) #5, !dbg !15 + %929 = insertelement <1 x i32> poison, i32 %921, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %929, i1 true) #5, !dbg !15 + %930 = insertelement <1 x i32> poison, i32 %922, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %930, i1 true) #5, !dbg !15 + %931 = insertelement <1 x i32> poison, i32 %923, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %931, i1 true) #5, !dbg !15 + %932 = insertelement <1 x i32> poison, i32 %924, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %932, i1 true) #5, !dbg !15 + %933 = insertelement <1 x i32> poison, i32 %925, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %933, i1 true) #5, !dbg !15 + %934 = insertelement <1 x i32> poison, i32 %926, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %934, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %935 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !15 + %937 = add i32 %936, %935, !dbg !32 + %938 = insertelement <1 x i32> poison, i32 %937, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %938, i1 %37) #5, !dbg !15 + %939 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %940 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %939, i32 1, i32 31), !dbg !15 + %941 = add i32 %940, %939, !dbg !32 + %942 = insertelement <1 x i32> poison, i32 %941, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %942, i1 %37) #5, !dbg !15 + %943 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !15 + %945 = add i32 %944, %943, !dbg !32 + %946 = insertelement <1 x i32> poison, i32 %945, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %946, i1 %37) #5, !dbg !15 + %947 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %947, i32 1, i32 31), !dbg !15 + %949 = add i32 %948, %947, !dbg !32 + %950 = insertelement <1 x i32> poison, i32 %949, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %950, i1 %37) #5, !dbg !15 + %951 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %952 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %951, i32 1, i32 31), !dbg !15 + %953 = add i32 %952, %951, !dbg !32 + %954 = insertelement <1 x i32> poison, i32 %953, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %954, i1 %37) #5, !dbg !15 + %955 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %956 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %955, i32 1, i32 31), !dbg !15 + %957 = add i32 %956, %955, !dbg !32 + %958 = insertelement <1 x i32> poison, i32 %957, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %958, i1 %37) #5, !dbg !15 + %959 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %960 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !15 + %961 = add i32 %960, %959, !dbg !32 + %962 = insertelement <1 x i32> poison, i32 %961, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %962, i1 %37) #5, !dbg !15 + %963 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %964 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !15 + %965 = add i32 %964, %963, !dbg !32 + %966 = insertelement <1 x i32> poison, i32 %965, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %966, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %967 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %968 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %969 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %970 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %971 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %972 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %973 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %974 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %975 = mul nuw nsw i32 %895, %.lobit, !dbg !28 + %976 = mul nuw nsw i32 %896, %.lobit, !dbg !28 + %977 = mul nuw nsw i32 %897, %.lobit, !dbg !28 + %978 = mul nuw nsw i32 %898, %.lobit, !dbg !28 + %979 = mul nuw nsw i32 %899, %.lobit, !dbg !28 + %980 = mul nuw nsw i32 %900, %.lobit, !dbg !28 + %981 = mul nuw nsw i32 %901, %.lobit, !dbg !28 + %982 = mul nuw nsw i32 %902, %.lobit, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %983 = insertelement <1 x i32> poison, i32 %975, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %983, i1 true) #5, !dbg !15 + %984 = insertelement <1 x i32> poison, i32 %976, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %984, i1 true) #5, !dbg !15 + %985 = insertelement <1 x i32> poison, i32 %977, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %985, i1 true) #5, !dbg !15 + %986 = insertelement <1 x i32> poison, i32 %978, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %986, i1 true) #5, !dbg !15 + %987 = insertelement <1 x i32> poison, i32 %979, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %987, i1 true) #5, !dbg !15 + %988 = insertelement <1 x i32> poison, i32 %980, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %988, i1 true) #5, !dbg !15 + %989 = insertelement <1 x i32> poison, i32 %981, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %989, i1 true) #5, !dbg !15 + %990 = insertelement <1 x i32> poison, i32 %982, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %990, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %991 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %991, i32 1, i32 31), !dbg !15 + %993 = add i32 %992, %991, !dbg !32 + %994 = insertelement <1 x i32> poison, i32 %993, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %994, i1 %37) #5, !dbg !15 + %995 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %996 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %995, i32 1, i32 31), !dbg !15 + %997 = add i32 %996, %995, !dbg !32 + %998 = insertelement <1 x i32> poison, i32 %997, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %998, i1 %37) #5, !dbg !15 + %999 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1000 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %999, i32 1, i32 31), !dbg !15 + %1001 = add i32 %1000, %999, !dbg !32 + %1002 = insertelement <1 x i32> poison, i32 %1001, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1002, i1 %37) #5, !dbg !15 + %1003 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1004 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1003, i32 1, i32 31), !dbg !15 + %1005 = add i32 %1004, %1003, !dbg !32 + %1006 = insertelement <1 x i32> poison, i32 %1005, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1006, i1 %37) #5, !dbg !15 + %1007 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1008 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1007, i32 1, i32 31), !dbg !15 + %1009 = add i32 %1008, %1007, !dbg !32 + %1010 = insertelement <1 x i32> poison, i32 %1009, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1010, i1 %37) #5, !dbg !15 + %1011 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1012 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1011, i32 1, i32 31), !dbg !15 + %1013 = add i32 %1012, %1011, !dbg !32 + %1014 = insertelement <1 x i32> poison, i32 %1013, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1014, i1 %37) #5, !dbg !15 + %1015 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1016 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1015, i32 1, i32 31), !dbg !15 + %1017 = add i32 %1016, %1015, !dbg !32 + %1018 = insertelement <1 x i32> poison, i32 %1017, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1018, i1 %37) #5, !dbg !15 + %1019 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1020 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1019, i32 1, i32 31), !dbg !15 + %1021 = add i32 %1020, %1019, !dbg !32 + %1022 = insertelement <1 x i32> poison, i32 %1021, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1022, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1023 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1024 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1025 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1026 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1027 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1028 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1029 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1030 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1031 = mul nuw nsw i32 %911, %71, !dbg !29 + %1032 = mul nuw nsw i32 %912, %71, !dbg !29 + %1033 = mul nuw nsw i32 %913, %71, !dbg !29 + %1034 = mul nuw nsw i32 %914, %71, !dbg !29 + %1035 = mul nuw nsw i32 %915, %71, !dbg !29 + %1036 = mul nuw nsw i32 %916, %71, !dbg !29 + %1037 = mul nuw nsw i32 %917, %71, !dbg !29 + %1038 = mul nuw nsw i32 %918, %71, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1039 = insertelement <1 x i32> poison, i32 %1031, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1039, i1 true) #5, !dbg !15 + %1040 = insertelement <1 x i32> poison, i32 %1032, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1040, i1 true) #5, !dbg !15 + %1041 = insertelement <1 x i32> poison, i32 %1033, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1041, i1 true) #5, !dbg !15 + %1042 = insertelement <1 x i32> poison, i32 %1034, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1042, i1 true) #5, !dbg !15 + %1043 = insertelement <1 x i32> poison, i32 %1035, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1043, i1 true) #5, !dbg !15 + %1044 = insertelement <1 x i32> poison, i32 %1036, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1044, i1 true) #5, !dbg !15 + %1045 = insertelement <1 x i32> poison, i32 %1037, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1045, i1 true) #5, !dbg !15 + %1046 = insertelement <1 x i32> poison, i32 %1038, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1046, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1047 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1048 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1047, i32 1, i32 31), !dbg !15 + %1049 = add i32 %1048, %1047, !dbg !32 + %1050 = insertelement <1 x i32> poison, i32 %1049, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1050, i1 %37) #5, !dbg !15 + %1051 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1052 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1051, i32 1, i32 31), !dbg !15 + %1053 = add i32 %1052, %1051, !dbg !32 + %1054 = insertelement <1 x i32> poison, i32 %1053, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1054, i1 %37) #5, !dbg !15 + %1055 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1056 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1055, i32 1, i32 31), !dbg !15 + %1057 = add i32 %1056, %1055, !dbg !32 + %1058 = insertelement <1 x i32> poison, i32 %1057, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1058, i1 %37) #5, !dbg !15 + %1059 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1060 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1059, i32 1, i32 31), !dbg !15 + %1061 = add i32 %1060, %1059, !dbg !32 + %1062 = insertelement <1 x i32> poison, i32 %1061, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1062, i1 %37) #5, !dbg !15 + %1063 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1064 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1063, i32 1, i32 31), !dbg !15 + %1065 = add i32 %1064, %1063, !dbg !32 + %1066 = insertelement <1 x i32> poison, i32 %1065, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1066, i1 %37) #5, !dbg !15 + %1067 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1068 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1067, i32 1, i32 31), !dbg !15 + %1069 = add i32 %1068, %1067, !dbg !32 + %1070 = insertelement <1 x i32> poison, i32 %1069, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1070, i1 %37) #5, !dbg !15 + %1071 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1072 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1071, i32 1, i32 31), !dbg !15 + %1073 = add i32 %1072, %1071, !dbg !32 + %1074 = insertelement <1 x i32> poison, i32 %1073, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1074, i1 %37) #5, !dbg !15 + %1075 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1076 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1075, i32 1, i32 31), !dbg !15 + %1077 = add i32 %1076, %1075, !dbg !32 + %1078 = insertelement <1 x i32> poison, i32 %1077, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1078, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1079 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1080 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1081 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1082 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1083 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1084 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1085 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1086 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1087 = mul nuw nsw i32 %911, %.lobit, !dbg !20 + %1088 = mul nuw nsw i32 %912, %.lobit, !dbg !20 + %1089 = mul nuw nsw i32 %913, %.lobit, !dbg !20 + %1090 = mul nuw nsw i32 %914, %.lobit, !dbg !20 + %1091 = mul nuw nsw i32 %915, %.lobit, !dbg !20 + %1092 = mul nuw nsw i32 %916, %.lobit, !dbg !20 + %1093 = mul nuw nsw i32 %917, %.lobit, !dbg !20 + %1094 = mul nuw nsw i32 %918, %.lobit, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1095 = insertelement <1 x i32> poison, i32 %1087, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1095, i1 true) #5, !dbg !15 + %1096 = insertelement <1 x i32> poison, i32 %1088, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1096, i1 true) #5, !dbg !15 + %1097 = insertelement <1 x i32> poison, i32 %1089, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1097, i1 true) #5, !dbg !15 + %1098 = insertelement <1 x i32> poison, i32 %1090, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1098, i1 true) #5, !dbg !15 + %1099 = insertelement <1 x i32> poison, i32 %1091, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1099, i1 true) #5, !dbg !15 + %1100 = insertelement <1 x i32> poison, i32 %1092, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1100, i1 true) #5, !dbg !15 + %1101 = insertelement <1 x i32> poison, i32 %1093, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1101, i1 true) #5, !dbg !15 + %1102 = insertelement <1 x i32> poison, i32 %1094, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1102, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1103 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1103, i32 1, i32 31), !dbg !15 + %1105 = add i32 %1104, %1103, !dbg !32 + %1106 = insertelement <1 x i32> poison, i32 %1105, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1106, i1 %37) #5, !dbg !15 + %1107 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1107, i32 1, i32 31), !dbg !15 + %1109 = add i32 %1108, %1107, !dbg !32 + %1110 = insertelement <1 x i32> poison, i32 %1109, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1110, i1 %37) #5, !dbg !15 + %1111 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1111, i32 1, i32 31), !dbg !15 + %1113 = add i32 %1112, %1111, !dbg !32 + %1114 = insertelement <1 x i32> poison, i32 %1113, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1114, i1 %37) #5, !dbg !15 + %1115 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1115, i32 1, i32 31), !dbg !15 + %1117 = add i32 %1116, %1115, !dbg !32 + %1118 = insertelement <1 x i32> poison, i32 %1117, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1118, i1 %37) #5, !dbg !15 + %1119 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1119, i32 1, i32 31), !dbg !15 + %1121 = add i32 %1120, %1119, !dbg !32 + %1122 = insertelement <1 x i32> poison, i32 %1121, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1122, i1 %37) #5, !dbg !15 + %1123 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1123, i32 1, i32 31), !dbg !15 + %1125 = add i32 %1124, %1123, !dbg !32 + %1126 = insertelement <1 x i32> poison, i32 %1125, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1126, i1 %37) #5, !dbg !15 + %1127 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1128 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1127, i32 1, i32 31), !dbg !15 + %1129 = add i32 %1128, %1127, !dbg !32 + %1130 = insertelement <1 x i32> poison, i32 %1129, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1130, i1 %37) #5, !dbg !15 + %1131 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1131, i32 1, i32 31), !dbg !15 + %1133 = add i32 %1132, %1131, !dbg !32 + %1134 = insertelement <1 x i32> poison, i32 %1133, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1134, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1135 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1136 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1137 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1138 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1139 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1140 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1141 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1142 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1143 = icmp slt i32 %967, %1023, !dbg !33 + %1144 = icmp slt i32 %968, %1024, !dbg !33 + %1145 = icmp slt i32 %969, %1025, !dbg !33 + %1146 = icmp slt i32 %970, %1026, !dbg !33 + %1147 = icmp sge i32 %971, %1027, !dbg !33 + %1148 = icmp sge i32 %972, %1028, !dbg !33 + %1149 = icmp sge i32 %973, %1029, !dbg !33 + %1150 = icmp sge i32 %974, %1030, !dbg !33 + %1151 = icmp eq i32 %967, %1023, !dbg !34 + %1152 = icmp eq i32 %968, %1024, !dbg !34 + %1153 = icmp eq i32 %969, %1025, !dbg !34 + %1154 = icmp eq i32 %970, %1026, !dbg !34 + %1155 = icmp ne i32 %971, %1027, !dbg !34 + %1156 = icmp ne i32 %972, %1028, !dbg !34 + %1157 = icmp ne i32 %973, %1029, !dbg !34 + %1158 = icmp ne i32 %974, %1030, !dbg !34 + %1159 = icmp sgt i32 %1079, %1135, !dbg !35 + %1160 = icmp sgt i32 %1080, %1136, !dbg !35 + %1161 = icmp sgt i32 %1081, %1137, !dbg !35 + %1162 = icmp sgt i32 %1082, %1138, !dbg !35 + %1163 = icmp sle i32 %1083, %1139, !dbg !35 + %1164 = icmp sle i32 %1084, %1140, !dbg !35 + %1165 = icmp sle i32 %1085, %1141, !dbg !35 + %1166 = icmp sle i32 %1086, %1142, !dbg !35 + %1167 = and i1 %1151, %1159, !dbg !36 + %1168 = and i1 %1152, %1160, !dbg !36 + %1169 = and i1 %1153, %1161, !dbg !36 + %1170 = and i1 %1154, %1162, !dbg !36 + %.not60 = or i1 %1155, %1163, !dbg !36 + %.not64 = or i1 %1156, %1164, !dbg !36 + %.not68 = or i1 %1157, %1165, !dbg !36 + %.not72 = or i1 %1158, %1166, !dbg !37 + %1171 = or i1 %1143, %1167, !dbg !37 + %1172 = or i1 %1144, %1168, !dbg !37 + %1173 = or i1 %1145, %1169, !dbg !37 + %1174 = or i1 %1146, %1170, !dbg !37 + %.not58 = and i1 %1147, %.not60, !dbg !37 + %.not62 = and i1 %1148, %.not64, !dbg !37 + %.not66 = and i1 %1149, %.not68, !dbg !37 + %.not70 = and i1 %1150, %.not72, !dbg !38 + %1175 = xor i32 %1023, %967, !dbg !39 + %1176 = xor i32 %1024, %968, !dbg !39 + %1177 = xor i32 %1025, %969, !dbg !39 + %1178 = xor i32 %1026, %970, !dbg !39 + %1179 = xor i32 %1027, %971, !dbg !39 + %1180 = xor i32 %1028, %972, !dbg !39 + %1181 = xor i32 %1029, %973, !dbg !39 + %1182 = xor i32 %1030, %974, !dbg !39 + %1183 = select i1 %1171, i32 %1175, i32 0, !dbg !40 + %1184 = select i1 %1172, i32 %1176, i32 0, !dbg !40 + %1185 = select i1 %1173, i32 %1177, i32 0, !dbg !40 + %1186 = select i1 %1174, i32 %1178, i32 0, !dbg !40 + %1187 = select i1 %.not58, i32 %1179, i32 0, !dbg !40 + %1188 = select i1 %.not62, i32 %1180, i32 0, !dbg !40 + %1189 = select i1 %.not66, i32 %1181, i32 0, !dbg !40 + %1190 = select i1 %.not70, i32 %1182, i32 0, !dbg !40 + %1191 = xor i32 %1183, %895, !dbg !30 + %1192 = xor i32 %1184, %896, !dbg !30 + %1193 = xor i32 %1185, %897, !dbg !30 + %1194 = xor i32 %1186, %898, !dbg !30 + %1195 = xor i32 %1187, %899, !dbg !30 + %1196 = xor i32 %1188, %900, !dbg !30 + %1197 = xor i32 %1189, %901, !dbg !30 + %1198 = xor i32 %1190, %902, !dbg !30 + %1199 = xor i32 %1135, %1079, !dbg !41 + %1200 = xor i32 %1136, %1080, !dbg !41 + %1201 = xor i32 %1137, %1081, !dbg !41 + %1202 = xor i32 %1138, %1082, !dbg !41 + %1203 = xor i32 %1139, %1083, !dbg !41 + %1204 = xor i32 %1140, %1084, !dbg !41 + %1205 = xor i32 %1141, %1085, !dbg !41 + %1206 = xor i32 %1142, %1086, !dbg !41 + %1207 = select i1 %1171, i32 %1199, i32 0, !dbg !42 + %1208 = select i1 %1172, i32 %1200, i32 0, !dbg !42 + %1209 = select i1 %1173, i32 %1201, i32 0, !dbg !42 + %1210 = select i1 %1174, i32 %1202, i32 0, !dbg !42 + %1211 = select i1 %.not58, i32 %1203, i32 0, !dbg !42 + %1212 = select i1 %.not62, i32 %1204, i32 0, !dbg !42 + %1213 = select i1 %.not66, i32 %1205, i32 0, !dbg !42 + %1214 = select i1 %.not70, i32 %1206, i32 0, !dbg !42 + %1215 = xor i32 %1207, %911, !dbg !31 + %1216 = xor i32 %1208, %912, !dbg !31 + %1217 = xor i32 %1209, %913, !dbg !31 + %1218 = xor i32 %1210, %914, !dbg !31 + %1219 = xor i32 %1211, %915, !dbg !31 + %1220 = xor i32 %1212, %916, !dbg !31 + %1221 = xor i32 %1213, %917, !dbg !31 + %1222 = xor i32 %1214, %918, !dbg !31 + %1223 = icmp slt i32 %1191, %1195, !dbg !33 + %1224 = icmp slt i32 %1192, %1196, !dbg !33 + %1225 = icmp slt i32 %1193, %1197, !dbg !33 + %1226 = icmp slt i32 %1194, %1198, !dbg !33 + %1227 = icmp eq i32 %1191, %1195, !dbg !34 + %1228 = icmp eq i32 %1192, %1196, !dbg !34 + %1229 = icmp eq i32 %1193, %1197, !dbg !34 + %1230 = icmp eq i32 %1194, %1198, !dbg !34 + %1231 = icmp sgt i32 %1215, %1219, !dbg !35 + %1232 = icmp sgt i32 %1216, %1220, !dbg !35 + %1233 = icmp sgt i32 %1217, %1221, !dbg !35 + %1234 = icmp sgt i32 %1218, %1222, !dbg !35 + %1235 = and i1 %1227, %1231, !dbg !36 + %1236 = and i1 %1228, %1232, !dbg !36 + %1237 = and i1 %1229, %1233, !dbg !36 + %1238 = and i1 %1230, %1234, !dbg !36 + %1239 = or i1 %1223, %1235, !dbg !37 + %1240 = or i1 %1224, %1236, !dbg !37 + %1241 = or i1 %1225, %1237, !dbg !37 + %1242 = or i1 %1226, %1238, !dbg !37 + %1243 = xor i32 %1195, %1191, !dbg !39 + %1244 = xor i32 %1196, %1192, !dbg !39 + %1245 = xor i32 %1197, %1193, !dbg !39 + %1246 = xor i32 %1198, %1194, !dbg !39 + %1247 = select i1 %1239, i32 %1243, i32 0, !dbg !40 + %1248 = select i1 %1240, i32 %1244, i32 0, !dbg !40 + %1249 = select i1 %1241, i32 %1245, i32 0, !dbg !40 + %1250 = select i1 %1242, i32 %1246, i32 0, !dbg !40 + %1251 = xor i32 %1247, %1191, !dbg !30 + %1252 = xor i32 %1248, %1192, !dbg !30 + %1253 = xor i32 %1249, %1193, !dbg !30 + %1254 = xor i32 %1250, %1194, !dbg !30 + %1255 = xor i32 %1247, %1195, !dbg !30 + %1256 = xor i32 %1248, %1196, !dbg !30 + %1257 = xor i32 %1249, %1197, !dbg !30 + %1258 = xor i32 %1250, %1198, !dbg !30 + %1259 = xor i32 %1219, %1215, !dbg !41 + %1260 = xor i32 %1220, %1216, !dbg !41 + %1261 = xor i32 %1221, %1217, !dbg !41 + %1262 = xor i32 %1222, %1218, !dbg !41 + %1263 = select i1 %1239, i32 %1259, i32 0, !dbg !42 + %1264 = select i1 %1240, i32 %1260, i32 0, !dbg !42 + %1265 = select i1 %1241, i32 %1261, i32 0, !dbg !42 + %1266 = select i1 %1242, i32 %1262, i32 0, !dbg !42 + %1267 = xor i32 %1263, %1215, !dbg !31 + %1268 = xor i32 %1264, %1216, !dbg !31 + %1269 = xor i32 %1265, %1217, !dbg !31 + %1270 = xor i32 %1266, %1218, !dbg !31 + %1271 = xor i32 %1263, %1219, !dbg !31 + %1272 = xor i32 %1264, %1220, !dbg !31 + %1273 = xor i32 %1265, %1221, !dbg !31 + %1274 = xor i32 %1266, %1222, !dbg !31 + %1275 = icmp slt i32 %1251, %1253, !dbg !33 + %1276 = icmp slt i32 %1252, %1254, !dbg !33 + %1277 = icmp slt i32 %1255, %1257, !dbg !33 + %1278 = icmp slt i32 %1256, %1258, !dbg !33 + %1279 = icmp eq i32 %1251, %1253, !dbg !34 + %1280 = icmp eq i32 %1252, %1254, !dbg !34 + %1281 = icmp eq i32 %1255, %1257, !dbg !34 + %1282 = icmp eq i32 %1256, %1258, !dbg !34 + %1283 = icmp sgt i32 %1267, %1269, !dbg !35 + %1284 = icmp sgt i32 %1268, %1270, !dbg !35 + %1285 = icmp sgt i32 %1271, %1273, !dbg !35 + %1286 = icmp sgt i32 %1272, %1274, !dbg !35 + %1287 = and i1 %1279, %1283, !dbg !36 + %1288 = and i1 %1280, %1284, !dbg !36 + %1289 = and i1 %1281, %1285, !dbg !36 + %1290 = and i1 %1282, %1286, !dbg !36 + %1291 = or i1 %1275, %1287, !dbg !37 + %1292 = or i1 %1276, %1288, !dbg !37 + %1293 = or i1 %1277, %1289, !dbg !37 + %1294 = or i1 %1278, %1290, !dbg !37 + %1295 = xor i32 %1253, %1251, !dbg !39 + %1296 = xor i32 %1254, %1252, !dbg !39 + %1297 = xor i32 %1257, %1255, !dbg !39 + %1298 = xor i32 %1258, %1256, !dbg !39 + %1299 = select i1 %1291, i32 %1295, i32 0, !dbg !40 + %1300 = select i1 %1292, i32 %1296, i32 0, !dbg !40 + %1301 = select i1 %1293, i32 %1297, i32 0, !dbg !40 + %1302 = select i1 %1294, i32 %1298, i32 0, !dbg !40 + %1303 = xor i32 %1299, %1251, !dbg !30 + %1304 = xor i32 %1300, %1252, !dbg !30 + %1305 = xor i32 %1299, %1253, !dbg !30 + %1306 = xor i32 %1300, %1254, !dbg !30 + %1307 = xor i32 %1301, %1255, !dbg !30 + %1308 = xor i32 %1302, %1256, !dbg !30 + %1309 = xor i32 %1301, %1257, !dbg !30 + %1310 = xor i32 %1302, %1258, !dbg !30 + %1311 = xor i32 %1269, %1267, !dbg !41 + %1312 = xor i32 %1270, %1268, !dbg !41 + %1313 = xor i32 %1273, %1271, !dbg !41 + %1314 = xor i32 %1274, %1272, !dbg !41 + %1315 = select i1 %1291, i32 %1311, i32 0, !dbg !42 + %1316 = select i1 %1292, i32 %1312, i32 0, !dbg !42 + %1317 = select i1 %1293, i32 %1313, i32 0, !dbg !42 + %1318 = select i1 %1294, i32 %1314, i32 0, !dbg !42 + %1319 = xor i32 %1315, %1267, !dbg !31 + %1320 = xor i32 %1316, %1268, !dbg !31 + %1321 = xor i32 %1315, %1269, !dbg !31 + %1322 = xor i32 %1316, %1270, !dbg !31 + %1323 = xor i32 %1317, %1271, !dbg !31 + %1324 = xor i32 %1318, %1272, !dbg !31 + %1325 = xor i32 %1317, %1273, !dbg !31 + %1326 = xor i32 %1318, %1274, !dbg !31 + %1327 = icmp slt i32 %1303, %1304, !dbg !33 + %1328 = icmp slt i32 %1305, %1306, !dbg !33 + %1329 = icmp slt i32 %1307, %1308, !dbg !33 + %1330 = icmp slt i32 %1309, %1310, !dbg !33 + %1331 = icmp eq i32 %1303, %1304, !dbg !34 + %1332 = icmp eq i32 %1305, %1306, !dbg !34 + %1333 = icmp eq i32 %1307, %1308, !dbg !34 + %1334 = icmp eq i32 %1309, %1310, !dbg !34 + %1335 = icmp sgt i32 %1319, %1320, !dbg !35 + %1336 = icmp sgt i32 %1321, %1322, !dbg !35 + %1337 = icmp sgt i32 %1323, %1324, !dbg !35 + %1338 = icmp sgt i32 %1325, %1326, !dbg !35 + %1339 = and i1 %1331, %1335, !dbg !36 + %1340 = and i1 %1332, %1336, !dbg !36 + %1341 = and i1 %1333, %1337, !dbg !36 + %1342 = and i1 %1334, %1338, !dbg !36 + %1343 = or i1 %1327, %1339, !dbg !37 + %1344 = or i1 %1328, %1340, !dbg !37 + %1345 = or i1 %1329, %1341, !dbg !37 + %1346 = or i1 %1330, %1342, !dbg !37 + %1347 = xor i32 %1304, %1303, !dbg !39 + %1348 = xor i32 %1306, %1305, !dbg !39 + %1349 = xor i32 %1308, %1307, !dbg !39 + %1350 = xor i32 %1310, %1309, !dbg !39 + %1351 = select i1 %1343, i32 %1347, i32 0, !dbg !40 + %1352 = select i1 %1344, i32 %1348, i32 0, !dbg !40 + %1353 = select i1 %1345, i32 %1349, i32 0, !dbg !40 + %1354 = select i1 %1346, i32 %1350, i32 0, !dbg !40 + %1355 = xor i32 %1351, %1303, !dbg !30 + %1356 = xor i32 %1351, %1304, !dbg !30 + %1357 = xor i32 %1352, %1305, !dbg !30 + %1358 = xor i32 %1352, %1306, !dbg !30 + %1359 = xor i32 %1353, %1307, !dbg !30 + %1360 = xor i32 %1353, %1308, !dbg !30 + %1361 = xor i32 %1354, %1309, !dbg !30 + %1362 = xor i32 %1354, %1310, !dbg !30 + %1363 = xor i32 %1320, %1319, !dbg !41 + %1364 = xor i32 %1322, %1321, !dbg !41 + %1365 = xor i32 %1324, %1323, !dbg !41 + %1366 = xor i32 %1326, %1325, !dbg !41 + %1367 = select i1 %1343, i32 %1363, i32 0, !dbg !42 + %1368 = select i1 %1344, i32 %1364, i32 0, !dbg !42 + %1369 = select i1 %1345, i32 %1365, i32 0, !dbg !42 + %1370 = select i1 %1346, i32 %1366, i32 0, !dbg !42 + %1371 = xor i32 %1367, %1319, !dbg !31 + %1372 = xor i32 %1367, %1320, !dbg !31 + %1373 = xor i32 %1368, %1321, !dbg !31 + %1374 = xor i32 %1368, %1322, !dbg !31 + %1375 = xor i32 %1369, %1323, !dbg !31 + %1376 = xor i32 %1369, %1324, !dbg !31 + %1377 = xor i32 %1370, %1325, !dbg !31 + %1378 = xor i32 %1370, %1326, !dbg !31 + %1379 = mul nuw nsw i32 %1355, %71, !dbg !27 + %1380 = mul nuw nsw i32 %1356, %71, !dbg !27 + %1381 = mul nuw nsw i32 %1357, %71, !dbg !27 + %1382 = mul nuw nsw i32 %1358, %71, !dbg !27 + %1383 = mul nuw nsw i32 %1359, %71, !dbg !27 + %1384 = mul nuw nsw i32 %1360, %71, !dbg !27 + %1385 = mul nuw nsw i32 %1361, %71, !dbg !27 + %1386 = mul nuw nsw i32 %1362, %71, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1387 = insertelement <1 x i32> poison, i32 %1379, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1387, i1 true) #5, !dbg !15 + %1388 = insertelement <1 x i32> poison, i32 %1380, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1388, i1 true) #5, !dbg !15 + %1389 = insertelement <1 x i32> poison, i32 %1381, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1389, i1 true) #5, !dbg !15 + %1390 = insertelement <1 x i32> poison, i32 %1382, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1390, i1 true) #5, !dbg !15 + %1391 = insertelement <1 x i32> poison, i32 %1383, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1391, i1 true) #5, !dbg !15 + %1392 = insertelement <1 x i32> poison, i32 %1384, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1392, i1 true) #5, !dbg !15 + %1393 = insertelement <1 x i32> poison, i32 %1385, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1393, i1 true) #5, !dbg !15 + %1394 = insertelement <1 x i32> poison, i32 %1386, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1394, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1395 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1396 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1395, i32 1, i32 31), !dbg !15 + %1397 = add i32 %1396, %1395, !dbg !32 + %1398 = insertelement <1 x i32> poison, i32 %1397, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1398, i1 %37) #5, !dbg !15 + %1399 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1400 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1399, i32 1, i32 31), !dbg !15 + %1401 = add i32 %1400, %1399, !dbg !32 + %1402 = insertelement <1 x i32> poison, i32 %1401, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1402, i1 %37) #5, !dbg !15 + %1403 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1403, i32 1, i32 31), !dbg !15 + %1405 = add i32 %1404, %1403, !dbg !32 + %1406 = insertelement <1 x i32> poison, i32 %1405, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1406, i1 %37) #5, !dbg !15 + %1407 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1408 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1407, i32 1, i32 31), !dbg !15 + %1409 = add i32 %1408, %1407, !dbg !32 + %1410 = insertelement <1 x i32> poison, i32 %1409, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1410, i1 %37) #5, !dbg !15 + %1411 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1411, i32 1, i32 31), !dbg !15 + %1413 = add i32 %1412, %1411, !dbg !32 + %1414 = insertelement <1 x i32> poison, i32 %1413, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1414, i1 %37) #5, !dbg !15 + %1415 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1416 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1415, i32 1, i32 31), !dbg !15 + %1417 = add i32 %1416, %1415, !dbg !32 + %1418 = insertelement <1 x i32> poison, i32 %1417, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1418, i1 %37) #5, !dbg !15 + %1419 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1419, i32 1, i32 31), !dbg !15 + %1421 = add i32 %1420, %1419, !dbg !32 + %1422 = insertelement <1 x i32> poison, i32 %1421, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1422, i1 %37) #5, !dbg !15 + %1423 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1424 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1423, i32 1, i32 31), !dbg !15 + %1425 = add i32 %1424, %1423, !dbg !32 + %1426 = insertelement <1 x i32> poison, i32 %1425, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1426, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1427 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1428 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1429 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1430 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1431 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1432 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1433 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1434 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1435 = mul nuw nsw i32 %1355, %.lobit, !dbg !28 + %1436 = mul nuw nsw i32 %1356, %.lobit, !dbg !28 + %1437 = mul nuw nsw i32 %1357, %.lobit, !dbg !28 + %1438 = mul nuw nsw i32 %1358, %.lobit, !dbg !28 + %1439 = mul nuw nsw i32 %1359, %.lobit, !dbg !28 + %1440 = mul nuw nsw i32 %1360, %.lobit, !dbg !28 + %1441 = mul nuw nsw i32 %1361, %.lobit, !dbg !28 + %1442 = mul nuw nsw i32 %1362, %.lobit, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1443 = insertelement <1 x i32> poison, i32 %1435, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1443, i1 true) #5, !dbg !15 + %1444 = insertelement <1 x i32> poison, i32 %1436, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1444, i1 true) #5, !dbg !15 + %1445 = insertelement <1 x i32> poison, i32 %1437, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1445, i1 true) #5, !dbg !15 + %1446 = insertelement <1 x i32> poison, i32 %1438, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1446, i1 true) #5, !dbg !15 + %1447 = insertelement <1 x i32> poison, i32 %1439, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1447, i1 true) #5, !dbg !15 + %1448 = insertelement <1 x i32> poison, i32 %1440, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1448, i1 true) #5, !dbg !15 + %1449 = insertelement <1 x i32> poison, i32 %1441, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1449, i1 true) #5, !dbg !15 + %1450 = insertelement <1 x i32> poison, i32 %1442, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1450, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1451 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1452 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1451, i32 1, i32 31), !dbg !15 + %1453 = add i32 %1452, %1451, !dbg !32 + %1454 = insertelement <1 x i32> poison, i32 %1453, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1454, i1 %37) #5, !dbg !15 + %1455 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1455, i32 1, i32 31), !dbg !15 + %1457 = add i32 %1456, %1455, !dbg !32 + %1458 = insertelement <1 x i32> poison, i32 %1457, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1458, i1 %37) #5, !dbg !15 + %1459 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1460 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1459, i32 1, i32 31), !dbg !15 + %1461 = add i32 %1460, %1459, !dbg !32 + %1462 = insertelement <1 x i32> poison, i32 %1461, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1462, i1 %37) #5, !dbg !15 + %1463 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1464 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1463, i32 1, i32 31), !dbg !15 + %1465 = add i32 %1464, %1463, !dbg !32 + %1466 = insertelement <1 x i32> poison, i32 %1465, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1466, i1 %37) #5, !dbg !15 + %1467 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1468 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1467, i32 1, i32 31), !dbg !15 + %1469 = add i32 %1468, %1467, !dbg !32 + %1470 = insertelement <1 x i32> poison, i32 %1469, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1470, i1 %37) #5, !dbg !15 + %1471 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1471, i32 1, i32 31), !dbg !15 + %1473 = add i32 %1472, %1471, !dbg !32 + %1474 = insertelement <1 x i32> poison, i32 %1473, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1474, i1 %37) #5, !dbg !15 + %1475 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1475, i32 1, i32 31), !dbg !15 + %1477 = add i32 %1476, %1475, !dbg !32 + %1478 = insertelement <1 x i32> poison, i32 %1477, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1478, i1 %37) #5, !dbg !15 + %1479 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1479, i32 1, i32 31), !dbg !15 + %1481 = add i32 %1480, %1479, !dbg !32 + %1482 = insertelement <1 x i32> poison, i32 %1481, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1482, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1483 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1484 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1485 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1486 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1487 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1488 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1489 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1490 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1491 = mul nuw nsw i32 %1371, %71, !dbg !29 + %1492 = mul nuw nsw i32 %1372, %71, !dbg !29 + %1493 = mul nuw nsw i32 %1373, %71, !dbg !29 + %1494 = mul nuw nsw i32 %1374, %71, !dbg !29 + %1495 = mul nuw nsw i32 %1375, %71, !dbg !29 + %1496 = mul nuw nsw i32 %1376, %71, !dbg !29 + %1497 = mul nuw nsw i32 %1377, %71, !dbg !29 + %1498 = mul nuw nsw i32 %1378, %71, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1499 = insertelement <1 x i32> poison, i32 %1491, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1499, i1 true) #5, !dbg !15 + %1500 = insertelement <1 x i32> poison, i32 %1492, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1500, i1 true) #5, !dbg !15 + %1501 = insertelement <1 x i32> poison, i32 %1493, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1501, i1 true) #5, !dbg !15 + %1502 = insertelement <1 x i32> poison, i32 %1494, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1502, i1 true) #5, !dbg !15 + %1503 = insertelement <1 x i32> poison, i32 %1495, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1503, i1 true) #5, !dbg !15 + %1504 = insertelement <1 x i32> poison, i32 %1496, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1504, i1 true) #5, !dbg !15 + %1505 = insertelement <1 x i32> poison, i32 %1497, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1505, i1 true) #5, !dbg !15 + %1506 = insertelement <1 x i32> poison, i32 %1498, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1506, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1507 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1507, i32 1, i32 31), !dbg !15 + %1509 = add i32 %1508, %1507, !dbg !32 + %1510 = insertelement <1 x i32> poison, i32 %1509, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1510, i1 %37) #5, !dbg !15 + %1511 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1512 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1511, i32 1, i32 31), !dbg !15 + %1513 = add i32 %1512, %1511, !dbg !32 + %1514 = insertelement <1 x i32> poison, i32 %1513, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1514, i1 %37) #5, !dbg !15 + %1515 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1515, i32 1, i32 31), !dbg !15 + %1517 = add i32 %1516, %1515, !dbg !32 + %1518 = insertelement <1 x i32> poison, i32 %1517, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1518, i1 %37) #5, !dbg !15 + %1519 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1520 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1519, i32 1, i32 31), !dbg !15 + %1521 = add i32 %1520, %1519, !dbg !32 + %1522 = insertelement <1 x i32> poison, i32 %1521, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1522, i1 %37) #5, !dbg !15 + %1523 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1523, i32 1, i32 31), !dbg !15 + %1525 = add i32 %1524, %1523, !dbg !32 + %1526 = insertelement <1 x i32> poison, i32 %1525, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1526, i1 %37) #5, !dbg !15 + %1527 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1528 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1527, i32 1, i32 31), !dbg !15 + %1529 = add i32 %1528, %1527, !dbg !32 + %1530 = insertelement <1 x i32> poison, i32 %1529, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1530, i1 %37) #5, !dbg !15 + %1531 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1531, i32 1, i32 31), !dbg !15 + %1533 = add i32 %1532, %1531, !dbg !32 + %1534 = insertelement <1 x i32> poison, i32 %1533, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1534, i1 %37) #5, !dbg !15 + %1535 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1536 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1535, i32 1, i32 31), !dbg !15 + %1537 = add i32 %1536, %1535, !dbg !32 + %1538 = insertelement <1 x i32> poison, i32 %1537, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1538, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1539 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1540 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1541 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1542 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1543 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1544 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1545 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1546 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1547 = mul nuw nsw i32 %1371, %.lobit, !dbg !20 + %1548 = mul nuw nsw i32 %1372, %.lobit, !dbg !20 + %1549 = mul nuw nsw i32 %1373, %.lobit, !dbg !20 + %1550 = mul nuw nsw i32 %1374, %.lobit, !dbg !20 + %1551 = mul nuw nsw i32 %1375, %.lobit, !dbg !20 + %1552 = mul nuw nsw i32 %1376, %.lobit, !dbg !20 + %1553 = mul nuw nsw i32 %1377, %.lobit, !dbg !20 + %1554 = mul nuw nsw i32 %1378, %.lobit, !dbg !20 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1555 = insertelement <1 x i32> poison, i32 %1547, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %76, <1 x i32> %1555, i1 true) #5, !dbg !15 + %1556 = insertelement <1 x i32> poison, i32 %1548, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %78, <1 x i32> %1556, i1 true) #5, !dbg !15 + %1557 = insertelement <1 x i32> poison, i32 %1549, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %80, <1 x i32> %1557, i1 true) #5, !dbg !15 + %1558 = insertelement <1 x i32> poison, i32 %1550, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %82, <1 x i32> %1558, i1 true) #5, !dbg !15 + %1559 = insertelement <1 x i32> poison, i32 %1551, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %84, <1 x i32> %1559, i1 true) #5, !dbg !15 + %1560 = insertelement <1 x i32> poison, i32 %1552, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, <1 x i32> %1560, i1 true) #5, !dbg !15 + %1561 = insertelement <1 x i32> poison, i32 %1553, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %86, <1 x i32> %1561, i1 true) #5, !dbg !15 + %1562 = insertelement <1 x i32> poison, i32 %1554, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %87, <1 x i32> %1562, i1 true) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1563 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %35, i1 true) #5, !dbg !15 + %1564 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1563, i32 1, i32 31), !dbg !15 + %1565 = add i32 %1564, %1563, !dbg !32 + %1566 = insertelement <1 x i32> poison, i32 %1565, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %35, <1 x i32> %1566, i1 %37) #5, !dbg !15 + %1567 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %38, i1 true) #5, !dbg !15 + %1568 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1567, i32 1, i32 31), !dbg !15 + %1569 = add i32 %1568, %1567, !dbg !32 + %1570 = insertelement <1 x i32> poison, i32 %1569, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %38, <1 x i32> %1570, i1 %37) #5, !dbg !15 + %1571 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %39, i1 true) #5, !dbg !15 + %1572 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1571, i32 1, i32 31), !dbg !15 + %1573 = add i32 %1572, %1571, !dbg !32 + %1574 = insertelement <1 x i32> poison, i32 %1573, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %39, <1 x i32> %1574, i1 %37) #5, !dbg !15 + %1575 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %40, i1 true) #5, !dbg !15 + %1576 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1575, i32 1, i32 31), !dbg !15 + %1577 = add i32 %1576, %1575, !dbg !32 + %1578 = insertelement <1 x i32> poison, i32 %1577, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %40, <1 x i32> %1578, i1 %37) #5, !dbg !15 + %1579 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %41, i1 true) #5, !dbg !15 + %1580 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1579, i32 1, i32 31), !dbg !15 + %1581 = add i32 %1580, %1579, !dbg !32 + %1582 = insertelement <1 x i32> poison, i32 %1581, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %41, <1 x i32> %1582, i1 %37) #5, !dbg !15 + %1583 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %42, i1 true) #5, !dbg !15 + %1584 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1583, i32 1, i32 31), !dbg !15 + %1585 = add i32 %1584, %1583, !dbg !32 + %1586 = insertelement <1 x i32> poison, i32 %1585, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %42, <1 x i32> %1586, i1 %37) #5, !dbg !15 + %1587 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %43, i1 true) #5, !dbg !15 + %1588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1587, i32 1, i32 31), !dbg !15 + %1589 = add i32 %1588, %1587, !dbg !32 + %1590 = insertelement <1 x i32> poison, i32 %1589, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %43, <1 x i32> %1590, i1 %37) #5, !dbg !15 + %1591 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %44, i1 true) #5, !dbg !15 + %1592 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1591, i32 1, i32 31), !dbg !15 + %1593 = add i32 %1592, %1591, !dbg !32 + %1594 = insertelement <1 x i32> poison, i32 %1593, i64 0, !dbg !15 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %44, <1 x i32> %1594, i1 %37) #5, !dbg !15 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !15 + %1595 = load i32, ptr addrspace(3) %27, align 16, !dbg !15 + %1596 = load i32, ptr addrspace(3) %28, align 8, !dbg !15 + %1597 = load i32, ptr addrspace(3) %29, align 16, !dbg !15 + %1598 = load i32, ptr addrspace(3) %30, align 8, !dbg !15 + %1599 = load i32, ptr addrspace(3) %31, align 16, !dbg !15 + %1600 = load i32, ptr addrspace(3) %32, align 8, !dbg !15 + %1601 = load i32, ptr addrspace(3) %33, align 16, !dbg !15 + %1602 = load i32, ptr addrspace(3) %34, align 8, !dbg !15 + %1603 = insertelement <4 x i32> poison, i32 %1427, i64 0, !dbg !33 + %1604 = insertelement <4 x i32> %1603, i32 %1429, i64 1, !dbg !33 + %1605 = insertelement <4 x i32> %1604, i32 %1431, i64 2, !dbg !33 + %1606 = insertelement <4 x i32> %1605, i32 %1433, i64 3, !dbg !33 + %1607 = insertelement <4 x i32> poison, i32 %1483, i64 0, !dbg !33 + %1608 = insertelement <4 x i32> %1607, i32 %1485, i64 1, !dbg !33 + %1609 = insertelement <4 x i32> %1608, i32 %1487, i64 2, !dbg !33 + %1610 = insertelement <4 x i32> %1609, i32 %1489, i64 3, !dbg !33 + %1611 = icmp slt <4 x i32> %1606, %1610, !dbg !33 + %1612 = insertelement <4 x i32> poison, i32 %1428, i64 0, !dbg !33 + %1613 = insertelement <4 x i32> %1612, i32 %1430, i64 1, !dbg !33 + %1614 = insertelement <4 x i32> %1613, i32 %1432, i64 2, !dbg !33 + %1615 = insertelement <4 x i32> %1614, i32 %1434, i64 3, !dbg !33 + %1616 = insertelement <4 x i32> poison, i32 %1484, i64 0, !dbg !33 + %1617 = insertelement <4 x i32> %1616, i32 %1486, i64 1, !dbg !33 + %1618 = insertelement <4 x i32> %1617, i32 %1488, i64 2, !dbg !33 + %1619 = insertelement <4 x i32> %1618, i32 %1490, i64 3, !dbg !33 + %1620 = icmp slt <4 x i32> %1615, %1619, !dbg !33 + %1621 = icmp eq <4 x i32> %1606, %1610, !dbg !34 + %1622 = icmp eq <4 x i32> %1615, %1619, !dbg !34 + %1623 = insertelement <4 x i32> poison, i32 %1539, i64 0, !dbg !35 + %1624 = insertelement <4 x i32> %1623, i32 %1541, i64 1, !dbg !35 + %1625 = insertelement <4 x i32> %1624, i32 %1543, i64 2, !dbg !35 + %1626 = insertelement <4 x i32> %1625, i32 %1545, i64 3, !dbg !35 + %1627 = insertelement <4 x i32> poison, i32 %1595, i64 0, !dbg !35 + %1628 = insertelement <4 x i32> %1627, i32 %1597, i64 1, !dbg !35 + %1629 = insertelement <4 x i32> %1628, i32 %1599, i64 2, !dbg !35 + %1630 = insertelement <4 x i32> %1629, i32 %1601, i64 3, !dbg !35 + %1631 = icmp sgt <4 x i32> %1626, %1630, !dbg !35 + %1632 = insertelement <4 x i32> poison, i32 %1540, i64 0, !dbg !35 + %1633 = insertelement <4 x i32> %1632, i32 %1542, i64 1, !dbg !35 + %1634 = insertelement <4 x i32> %1633, i32 %1544, i64 2, !dbg !35 + %1635 = insertelement <4 x i32> %1634, i32 %1546, i64 3, !dbg !35 + %1636 = insertelement <4 x i32> poison, i32 %1596, i64 0, !dbg !35 + %1637 = insertelement <4 x i32> %1636, i32 %1598, i64 1, !dbg !35 + %1638 = insertelement <4 x i32> %1637, i32 %1600, i64 2, !dbg !35 + %1639 = insertelement <4 x i32> %1638, i32 %1602, i64 3, !dbg !35 + %1640 = icmp sgt <4 x i32> %1635, %1639, !dbg !35 + %1641 = and <4 x i1> %1621, %1631, !dbg !36 + %1642 = and <4 x i1> %1622, %1640, !dbg !36 + %1643 = or <4 x i1> %1611, %1641, !dbg !37 + %1644 = or <4 x i1> %1620, %1642, !dbg !37 + %1645 = xor <4 x i32> %1630, %1626, !dbg !41 + %1646 = xor <4 x i32> %1639, %1635, !dbg !41 + %1647 = select <4 x i1> %1643, <4 x i32> %1645, <4 x i32> zeroinitializer, !dbg !42 + %1648 = insertelement <4 x i32> poison, i32 %1371, i64 0, !dbg !31 + %1649 = insertelement <4 x i32> %1648, i32 %1373, i64 1, !dbg !31 + %1650 = insertelement <4 x i32> %1649, i32 %1375, i64 2, !dbg !31 + %1651 = insertelement <4 x i32> %1650, i32 %1377, i64 3, !dbg !31 + %1652 = xor <4 x i32> %1647, %1651, !dbg !31 + %1653 = select <4 x i1> %1644, <4 x i32> %1646, <4 x i32> zeroinitializer, !dbg !42 + %1654 = insertelement <4 x i32> poison, i32 %1372, i64 0, !dbg !31 + %1655 = insertelement <4 x i32> %1654, i32 %1374, i64 1, !dbg !31 + %1656 = insertelement <4 x i32> %1655, i32 %1376, i64 2, !dbg !31 + %1657 = insertelement <4 x i32> %1656, i32 %1378, i64 3, !dbg !31 + %1658 = xor <4 x i32> %1653, %1657, !dbg !31 + %1659 = insertelement <8 x i1> poison, i1 %17, i64 0, !dbg !43 + %1660 = shufflevector <8 x i1> %1659, <8 x i1> poison, <8 x i32> zeroinitializer, !dbg !43 + %1661 = insertelement <8 x i32> poison, i32 %68, i64 0, !dbg !43 + %1662 = insertelement <8 x i32> %1661, i32 %67, i64 1, !dbg !43 + %1663 = insertelement <8 x i32> %1662, i32 %69, i64 2, !dbg !43 + %1664 = insertelement <8 x i32> %1663, i32 %70, i64 3, !dbg !43 + %1665 = insertelement <8 x i32> %1664, i32 %141, i64 4, !dbg !43 + %1666 = insertelement <8 x i32> %1665, i32 %142, i64 5, !dbg !43 + %1667 = insertelement <8 x i32> %1666, i32 %143, i64 6, !dbg !43 + %1668 = insertelement <8 x i32> %1667, i32 %144, i64 7, !dbg !43 + %1669 = sext <8 x i32> %1668 to <8 x i64>, !dbg !43 + %1670 = select <8 x i1> %1660, <8 x i64> %1669, <8 x i64> zeroinitializer, !dbg !43 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %1671 = tail call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1670), !dbg !46 + %1672 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %26, !dbg !44 + %1673 = getelementptr i64, ptr addrspace(3) %1672, i32 %.lobit, !dbg !44 + %1674 = insertelement <1 x i64> poison, i64 %1671, i64 0, !dbg !44 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1673, <1 x i64> %1674, i1 true) #5, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %1675 = icmp samesign ult i32 %10, 256, !dbg !44 + %1676 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %10, !dbg !44 + %1677 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %1676, i1 %1675) #5, !dbg !44 + %extelt.offset = lshr i64 %1677, 32, !dbg !44 + %1678 = trunc nuw i64 %extelt.offset to i32, !dbg !44 + %1679 = trunc i64 %1677 to i32, !dbg !44 + %1680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1679, i32 1, i32 31), !dbg !44 + %1681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1678, i32 1, i32 31), !dbg !44 + %1682 = insertelement <2 x i32> poison, i32 %1680, i64 0, !dbg !44 + %1683 = insertelement <2 x i32> %1682, i32 %1681, i64 1, !dbg !44 + %1684 = bitcast <2 x i32> %1683 to i64, !dbg !44 + %1685 = add i64 %1677, %1684, !dbg !46 + %1686 = and i32 %10, 769, !dbg !44 + %1687 = icmp eq i32 %1686, 0, !dbg !44 + %1688 = insertelement <1 x i64> poison, i64 %1685, i64 0, !dbg !44 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %1676, <1 x i64> %1688, i1 %1687) #5, !dbg !44 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !44 + %1689 = load i64, ptr addrspace(3) %1672, align 16, !dbg !44 + %1690 = trunc i64 %1689 to i32, !dbg !47 + %1691 = shl i32 %15, 4, !dbg !48 + %1692 = shl i32 %16, 4, !dbg !48 + %1693 = or disjoint i32 %1691, %22, !dbg !49 + %1694 = or disjoint i32 %1692, %22, !dbg !49 + %1695 = sext i32 %1693 to i64, !dbg !50 + %1696 = getelementptr i32, ptr addrspace(1) %1, i64 %1695, !dbg !50 + %1697 = sext i32 %1694 to i64, !dbg !50 + %1698 = getelementptr i32, ptr addrspace(1) %1, i64 %1697, !dbg !50 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %1699 = shl nuw nsw i32 %10, 4, !dbg !51 + %1700 = and i32 %1699, 4080, !dbg !51 + %1701 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1700, !dbg !51 + store <4 x i32> %1652, ptr addrspace(3) %1701, align 16, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %1702 = and i32 %1699, 112, !dbg !51 + %1703 = and i32 %21, 896, !dbg !51 + %1704 = shl nuw nsw i32 %10, 8, !dbg !51 + %1705 = and i32 %1704, 2048, !dbg !51 + %1706 = shl nuw nsw i32 %10, 6, !dbg !51 + %1707 = and i32 %1706, 1024, !dbg !51 + %1708 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1702, !dbg !51 + %1709 = getelementptr inbounds nuw i8, ptr addrspace(3) %1708, i32 %1705, !dbg !51 + %1710 = getelementptr inbounds nuw i8, ptr addrspace(3) %1709, i32 %1707, !dbg !51 + %1711 = getelementptr inbounds nuw i8, ptr addrspace(3) %1710, i32 %1703, !dbg !51 + %1712 = ptrtoint ptr addrspace(3) %1711 to i32, !dbg !51 + %1713 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1712) #5, !dbg !51 + %1714 = extractvalue { i32, i32, i32, i32 } %1713, 0, !dbg !51 + %1715 = extractvalue { i32, i32, i32, i32 } %1713, 1, !dbg !51 + %1716 = extractvalue { i32, i32, i32, i32 } %1713, 2, !dbg !51 + %1717 = extractvalue { i32, i32, i32, i32 } %1713, 3, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + store <4 x i32> %1658, ptr addrspace(3) %1701, align 16, !dbg !51 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !51 + %1718 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %1712) #5, !dbg !51 + %1719 = extractvalue { i32, i32, i32, i32 } %1718, 0, !dbg !51 + %1720 = extractvalue { i32, i32, i32, i32 } %1718, 1, !dbg !51 + %1721 = extractvalue { i32, i32, i32, i32 } %1718, 2, !dbg !51 + %1722 = extractvalue { i32, i32, i32, i32 } %1718, 3, !dbg !51 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1714, i32 %1715, i32 %1719, i32 %1720, ptr addrspace(1) %1696, i1 %18) #5, !dbg !51 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1716, i32 %1717, i32 %1721, i32 %1722, ptr addrspace(1) %1698, i1 %19) #5, !dbg !51 + %1723 = sext i32 %14 to i64, !dbg !52 + %1724 = getelementptr i32, ptr addrspace(1) %2, i64 %1723, !dbg !52 + %1725 = and i32 %10, 128, !dbg !53 + %1726 = icmp eq i32 %1725, 0, !dbg !53 + %1727 = and i1 %1726, %17, !dbg !53 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1690, ptr addrspace(1) %1724, i1 %1727) #5, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 24, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 33, scope: !4) +!9 = !DILocation(line: 25, column: 44, scope: !4) +!10 = !DILocation(line: 25, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 21, scope: !4) +!12 = !DILocation(line: 27, column: 38, scope: !4) +!13 = !DILocation(line: 34, column: 19, scope: !4) +!14 = !DILocation(line: 36, column: 35, scope: !4) +!15 = !DILocation(line: 291, column: 36, scope: !16, inlinedAt: !18) +!16 = distinct !DILexicalBlockFile(scope: !4, file: !17, discriminator: 0) +!17 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!18 = !DILocation(line: 41, column: 67, scope: !4) +!19 = !DILocation(line: 36, column: 38, scope: !4) +!20 = !DILocation(line: 551, column: 23, scope: !21, inlinedAt: !18) +!21 = distinct !DILexicalBlockFile(scope: !4, file: !22, discriminator: 0) +!22 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!23 = !DILocation(line: 36, column: 45, scope: !4) +!24 = !DILocation(line: 36, column: 30, scope: !4) +!25 = !DILocation(line: 36, column: 54, scope: !4) +!26 = !DILocation(line: 537, column: 21, scope: !21, inlinedAt: !18) +!27 = !DILocation(line: 538, column: 40, scope: !21, inlinedAt: !18) +!28 = !DILocation(line: 539, column: 41, scope: !21, inlinedAt: !18) +!29 = !DILocation(line: 548, column: 23, scope: !21, inlinedAt: !18) +!30 = !DILocation(line: 600, column: 15, scope: !21, inlinedAt: !18) +!31 = !DILocation(line: 601, column: 22, scope: !21, inlinedAt: !18) +!32 = !DILocation(line: 261, column: 15, scope: !16, inlinedAt: !18) +!33 = !DILocation(line: 574, column: 22, scope: !21, inlinedAt: !18) +!34 = !DILocation(line: 591, column: 21, scope: !21, inlinedAt: !18) +!35 = !DILocation(line: 594, column: 40, scope: !21, inlinedAt: !18) +!36 = !DILocation(line: 594, column: 29, scope: !21, inlinedAt: !18) +!37 = !DILocation(line: 594, column: 23, scope: !21, inlinedAt: !18) +!38 = !DILocation(line: 599, column: 19, scope: !21, inlinedAt: !18) +!39 = !DILocation(line: 600, column: 38, scope: !21, inlinedAt: !18) +!40 = !DILocation(line: 600, column: 46, scope: !21, inlinedAt: !18) +!41 = !DILocation(line: 601, column: 48, scope: !21, inlinedAt: !18) +!42 = !DILocation(line: 601, column: 59, scope: !21, inlinedAt: !18) +!43 = !DILocation(line: 44, column: 34, scope: !4) +!44 = !DILocation(line: 291, column: 36, scope: !16, inlinedAt: !45) +!45 = !DILocation(line: 45, column: 26, scope: !4) +!46 = !DILocation(line: 261, column: 15, scope: !16, inlinedAt: !45) +!47 = !DILocation(line: 48, column: 21, scope: !4) +!48 = !DILocation(line: 49, column: 35, scope: !4) +!49 = !DILocation(line: 49, column: 32, scope: !4) +!50 = !DILocation(line: 49, column: 25, scope: !4) +!51 = !DILocation(line: 49, column: 47, scope: !4) +!52 = !DILocation(line: 50, column: 25, scope: !4) +!53 = !DILocation(line: 50, column: 37, scope: !4) +!54 = !DILocation(line: 50, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..3666c1edc17354ca7b5f826c1dc047c0f7e1c251 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,3311 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6 +) +.reqntid 256 +{ + .reg .pred %p<665>; + .reg .b32 %r<1496>; + .reg .b64 %rd<44>; + .loc 1 18 0 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + ld.param.b64 %rd16, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; +$L__tmp0: + .loc 1 24 28 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:24:28 + mov.u32 %r799, %ctaid.x; + .loc 1 24 33 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:24:33 + shl.b32 %r800, %r799, 7; + ld.param.b64 %rd17, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + .loc 1 25 44 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:25:44 + mov.u32 %r801, %tid.x; + and.b32 %r802, %r801, 127; + bfe.u32 %r803, %r801, 2, 6; + .loc 1 25 23 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:25:23 + or.b32 %r804, %r800, %r802; + or.b32 %r805, %r803, %r800; + or.b32 %r806, %r805, 64; + .loc 1 26 21 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:26:21 + setp.lt.s32 %p1, %r804, 128; + setp.lt.s32 %p396, %r805, 128; + setp.lt.s32 %p397, %r806, 128; + .loc 1 27 38 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:27:38 + shr.u32 %r807, %r801, 7; + shl.b32 %r808, %r801, 2; + and.b32 %r809, %r808, 12; + .loc 1 34 19 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:34:19 + bfe.s32 %r810, %r799, 24, 1; + shr.u32 %r811, %r810, 28; + add.s32 %r812, %r804, %r811; + .loc 1 36 35 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:35 + shl.b32 %r813, %r812, 4; + and.b32 %r814, %r813, -256; + add.s32 %r815, %r814, %r804; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shl.b32 %r816, %r802, 4; + shl.b32 %r817, %r802, 6; + mov.b32 %r818, global_smem; + add.s32 %r819, %r818, %r817; + add.s32 %r26, %r818, %r808; + and.b32 %r820, %r801, 1; + setp.ne.b32 %p399, %r820, 0; + not.pred %p18, %p399; + add.s32 %r30, %r26, 1024; + add.s32 %r34, %r26, 2048; + add.s32 %r38, %r26, 3072; + add.s32 %r42, %r26, 4096; + add.s32 %r46, %r26, 5120; + add.s32 %r50, %r26, 6144; + add.s32 %r54, %r26, 7168; +$L__tmp2: + .loc 1 27 38 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:27:38 + bfe.u32 %r154, %r801, 7, 1; + or.b32 %r821, %r154, 4; + or.b32 %r822, %r807, 6; + or.b32 %r823, %r154, 2; + .loc 1 36 45 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:45 + mad.lo.s32 %r824, %r154, 17, %r815; + mad.lo.s32 %r825, %r823, 17, %r815; + add.s32 %r826, %r824, 68; + mad.lo.s32 %r827, %r822, 17, %r815; + .loc 1 36 30 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:30 + mad.wide.s32 %rd1, %r824, 4, %rd15; + mad.wide.s32 %rd2, %r825, 4, %rd15; + mad.wide.s32 %rd3, %r826, 4, %rd15; + mad.wide.s32 %rd4, %r827, 4, %rd15; + .loc 1 36 54 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:54 + // begin inline asm + mov.u32 %r1, 0x0; + @%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd2 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r3, 0x0; + @%p1 ld.global.b32 { %r3 }, [ %rd3 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r4, 0x0; + @%p1 ld.global.b32 { %r4 }, [ %rd4 + 0 ]; + // end inline asm +$L__tmp3: + .loc 3 537 21 // triton_helpers.py:537:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r828, %r154, 1; + .loc 3 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r10, %r1, %r828; + mul.lo.s32 %r12, %r2, %r828; + mul.lo.s32 %r14, %r3, %r828; + mul.lo.s32 %r16, %r4, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + shl.b32 %r829, %r154, 2; + add.s32 %r9, %r819, %r829; + add.s32 %r11, %r9, 8; + add.s32 %r13, %r9, 16; + add.s32 %r15, %r9, 24; + add.s32 %r17, %r9, 32; + add.s32 %r19, %r9, 40; + add.s32 %r21, %r9, 48; + add.s32 %r23, %r9, 56; + .loc 3 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r58, %r1, %r154; + mul.lo.s32 %r60, %r2, %r154; + mul.lo.s32 %r62, %r3, %r154; + mul.lo.s32 %r64, %r4, %r154; + .loc 3 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r106, %r828, %r154; + mul.lo.s32 %r108, %r823, %r828; + shl.b32 %r830, %r828, 2; + or.b32 %r110, %r106, %r830; + mul.lo.s32 %r112, %r828, %r822; + .loc 3 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r156, %r823, %r154; + mul.lo.s32 %r158, %r821, %r154; + mul.lo.s32 %r160, %r822, %r154; +$L__tmp4: + .loc 1 27 38 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:27:38 + or.b32 %r831, %r154, 10; + or.b32 %r832, %r807, 14; + .loc 1 36 45 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:45 + mad.lo.s32 %r833, %r832, 17, %r815; + .loc 1 36 30 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:30 + mad.wide.s32 %rd8, %r833, 4, %rd15; + .loc 1 27 38 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:27:38 + or.b32 %r834, %r154, 12; + or.b32 %r835, %r154, 8; + .loc 1 36 45 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:45 + mad.lo.s32 %r836, %r835, 17, %r815; + mad.lo.s32 %r837, %r831, 17, %r815; + mad.lo.s32 %r838, %r834, 17, %r815; + .loc 1 36 30 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:30 + mad.wide.s32 %rd5, %r836, 4, %rd15; + mad.wide.s32 %rd6, %r837, 4, %rd15; + mad.wide.s32 %rd7, %r838, 4, %rd15; + .loc 1 36 54 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:36:54 + // begin inline asm + mov.u32 %r5, 0x0; + @%p1 ld.global.b32 { %r5 }, [ %rd5 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r6, 0x0; + @%p1 ld.global.b32 { %r6 }, [ %rd6 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r7, 0x0; + @%p1 ld.global.b32 { %r7 }, [ %rd7 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r8, 0x0; + @%p1 ld.global.b32 { %r8 }, [ %rd8 + 0 ]; + // end inline asm +$L__tmp5: + .loc 3 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r18, %r5, %r828; + mul.lo.s32 %r20, %r6, %r828; + mul.lo.s32 %r22, %r7, %r828; + mul.lo.s32 %r24, %r8, %r828; + mov.pred %p9, -1; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r10; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r12; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r14; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r16; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r18; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r20; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r22; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r24; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r25, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r839, %r25, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r28, %r839, %r25; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r28; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r29, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r840, %r29, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r32, %r840, %r29; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r32; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r33, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r841, %r33, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r36, %r841, %r33; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r36; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r37, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r842, %r37, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r40, %r842, %r37; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r41, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r843, %r41, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r44, %r843, %r41; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r44; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r45, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r844, %r45, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r48, %r844, %r45; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r48; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r49, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r845, %r49, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r52, %r845, %r49; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r52; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r53, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r846, %r53, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r56, %r846, %r53; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r56; + // end inline asm + bar.sync 0; + ld.shared.b32 %r847, [%r819]; + ld.shared.b32 %r848, [%r819+8]; + ld.shared.b32 %r849, [%r819+16]; + ld.shared.b32 %r850, [%r819+24]; + ld.shared.b32 %r851, [%r819+32]; + ld.shared.b32 %r852, [%r819+40]; + ld.shared.b32 %r853, [%r819+48]; + ld.shared.b32 %r854, [%r819+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r66, %r5, %r154; + mul.lo.s32 %r68, %r6, %r154; + mul.lo.s32 %r70, %r7, %r154; + mul.lo.s32 %r72, %r8, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r60; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r62; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r66; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r68; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r70; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r72; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r73, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r855, %r73, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r76, %r855, %r73; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r76; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r77, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r856, %r77, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r80, %r856, %r77; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r80; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r81, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r857, %r81, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r84, %r857, %r81; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r84; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r85, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r858, %r85, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r88, %r858, %r85; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r89, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r859, %r89, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r92, %r859, %r89; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r92; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r93, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r860, %r93, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r96, %r860, %r93; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r96; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r97, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r861, %r97, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r100, %r861, %r97; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r100; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r101, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r862, %r101, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r104, %r862, %r101; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r104; + // end inline asm + bar.sync 0; + ld.shared.b32 %r863, [%r819]; + ld.shared.b32 %r864, [%r819+8]; + ld.shared.b32 %r865, [%r819+16]; + ld.shared.b32 %r866, [%r819+24]; + ld.shared.b32 %r867, [%r819+32]; + ld.shared.b32 %r868, [%r819+40]; + ld.shared.b32 %r869, [%r819+48]; + ld.shared.b32 %r870, [%r819+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r114, %r835, %r828; + mul.lo.s32 %r116, %r831, %r828; + mul.lo.s32 %r118, %r834, %r828; + mul.lo.s32 %r120, %r828, %r832; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r106; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r108; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r110; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r112; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r114; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r116; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r118; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r120; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r121, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r871, %r121, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r124, %r871, %r121; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r124; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r125, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r872, %r125, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r128, %r872, %r125; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r128; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r129, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r873, %r129, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r132, %r873, %r129; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r132; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r133, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r874, %r133, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r136, %r874, %r133; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r136; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r137, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r875, %r137, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r140, %r875, %r137; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r140; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r141, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r876, %r141, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r144, %r876, %r141; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r144; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r145, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r877, %r145, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r148, %r877, %r145; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r148; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r149, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r878, %r149, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r152, %r878, %r149; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r152; + // end inline asm + bar.sync 0; + ld.shared.b32 %r879, [%r819]; + ld.shared.b32 %r880, [%r819+8]; + ld.shared.b32 %r881, [%r819+16]; + ld.shared.b32 %r882, [%r819+24]; + ld.shared.b32 %r883, [%r819+32]; + ld.shared.b32 %r884, [%r819+40]; + ld.shared.b32 %r885, [%r819+48]; + ld.shared.b32 %r886, [%r819+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r162, %r835, %r154; + mul.lo.s32 %r164, %r831, %r154; + mul.lo.s32 %r166, %r834, %r154; + mul.lo.s32 %r168, %r832, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r154; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r156; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r158; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r160; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r162; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r164; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r166; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r168; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r169, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r887, %r169, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r172, %r887, %r169; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r172; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r173, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r888, %r173, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r176, %r888, %r173; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r176; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r177, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r889, %r177, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r180, %r889, %r177; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r180; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r181, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r890, %r181, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r184, %r890, %r181; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r184; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r185, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r891, %r185, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r188, %r891, %r185; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r188; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r189, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r892, %r189, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r192, %r892, %r189; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r192; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r193, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r893, %r193, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r196, %r893, %r193; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r196; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r197, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r894, %r197, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r200, %r894, %r197; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r200; + // end inline asm + bar.sync 0; + ld.shared.b32 %r895, [%r819]; + ld.shared.b32 %r896, [%r819+8]; + ld.shared.b32 %r897, [%r819+16]; + ld.shared.b32 %r898, [%r819+24]; + ld.shared.b32 %r899, [%r819+32]; + ld.shared.b32 %r900, [%r819+40]; + ld.shared.b32 %r901, [%r819+48]; + ld.shared.b32 %r902, [%r819+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p400, %r849, %r865; + setp.lt.s32 %p401, %r847, %r863; + setp.ge.s32 %p402, %r850, %r866; + setp.ge.s32 %p403, %r848, %r864; + setp.lt.s32 %p404, %r853, %r869; + setp.lt.s32 %p405, %r851, %r867; + setp.ge.s32 %p406, %r854, %r870; + setp.ge.s32 %p407, %r852, %r868; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p408, %r847, %r863; + setp.eq.b32 %p409, %r849, %r865; + setp.ne.b32 %p410, %r848, %r864; + setp.ne.b32 %p411, %r850, %r866; + setp.eq.b32 %p412, %r851, %r867; + setp.eq.b32 %p413, %r853, %r869; + setp.ne.b32 %p414, %r852, %r868; + setp.ne.b32 %p415, %r854, %r870; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p416, %r879, %r895; + setp.gt.s32 %p417, %r881, %r897; + setp.le.s32 %p418, %r880, %r896; + setp.le.s32 %p419, %r882, %r898; + setp.gt.s32 %p420, %r883, %r899; + setp.gt.s32 %p421, %r885, %r901; + setp.le.s32 %p422, %r884, %r900; + setp.le.s32 %p423, %r886, %r902; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p424, %p409, %p417; + and.pred %p425, %p408, %p416; + or.pred %p426, %p411, %p419; + or.pred %p427, %p410, %p418; + and.pred %p428, %p413, %p421; + and.pred %p429, %p412, %p420; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p430, %p415, %p423; + or.pred %p431, %p414, %p422; + or.pred %p432, %p401, %p425; + or.pred %p433, %p400, %p424; + and.pred %p434, %p403, %p427; + and.pred %p435, %p402, %p426; + or.pred %p436, %p405, %p429; + or.pred %p437, %p404, %p428; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p438, %p407, %p431; + and.pred %p439, %p406, %p430; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r903, %r863, %r847; + xor.b32 %r904, %r865, %r849; + xor.b32 %r905, %r864, %r848; + xor.b32 %r906, %r866, %r850; + xor.b32 %r907, %r867, %r851; + xor.b32 %r908, %r869, %r853; + xor.b32 %r909, %r868, %r852; + xor.b32 %r910, %r870, %r854; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r911, %r904, 0, %p433; + selp.b32 %r912, %r903, 0, %p432; + selp.b32 %r913, %r906, 0, %p435; + selp.b32 %r914, %r905, 0, %p434; + selp.b32 %r915, %r908, 0, %p437; + selp.b32 %r916, %r907, 0, %p436; + selp.b32 %r917, %r910, 0, %p439; + selp.b32 %r918, %r909, 0, %p438; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r919, %r912, %r1; + xor.b32 %r920, %r911, %r3; + xor.b32 %r921, %r914, %r2; + xor.b32 %r922, %r913, %r4; + xor.b32 %r923, %r916, %r5; + xor.b32 %r924, %r915, %r7; + xor.b32 %r925, %r918, %r6; + xor.b32 %r926, %r917, %r8; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r927, %r895, %r879; + xor.b32 %r928, %r897, %r881; + xor.b32 %r929, %r896, %r880; + xor.b32 %r930, %r898, %r882; + xor.b32 %r931, %r899, %r883; + xor.b32 %r932, %r901, %r885; + xor.b32 %r933, %r900, %r884; + xor.b32 %r934, %r902, %r886; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r935, %r928, 0, %p433; + selp.b32 %r936, %r927, 0, %p432; + selp.b32 %r937, %r930, 0, %p435; + selp.b32 %r938, %r929, 0, %p434; + selp.b32 %r939, %r932, 0, %p437; + selp.b32 %r940, %r931, 0, %p436; + selp.b32 %r941, %r934, 0, %p439; + selp.b32 %r942, %r933, 0, %p438; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r943, %r936, %r154; + xor.b32 %r944, %r935, %r821; + xor.b32 %r945, %r938, %r823; + xor.b32 %r946, %r937, %r822; + xor.b32 %r947, %r940, %r835; + xor.b32 %r948, %r939, %r834; + xor.b32 %r949, %r942, %r831; + xor.b32 %r950, %r941, %r832; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.ge.s32 %p440, %r920, %r922; + setp.lt.s32 %p441, %r919, %r921; + setp.ge.s32 %p442, %r924, %r926; + setp.lt.s32 %p443, %r923, %r925; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.ne.b32 %p444, %r920, %r922; + setp.eq.b32 %p445, %r919, %r921; + setp.ne.b32 %p446, %r924, %r926; + setp.eq.b32 %p447, %r923, %r925; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.le.s32 %p448, %r944, %r946; + setp.gt.s32 %p449, %r943, %r945; + setp.le.s32 %p450, %r948, %r950; + setp.gt.s32 %p451, %r947, %r949; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p452, %p444, %p448; + and.pred %p453, %p445, %p449; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p454, %p446, %p450; + and.pred %p455, %p447, %p451; + and.pred %p456, %p440, %p452; + or.pred %p457, %p441, %p453; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p458, %p442, %p454; + or.pred %p459, %p443, %p455; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r951, %r922, %r920; + xor.b32 %r952, %r921, %r919; + xor.b32 %r953, %r926, %r924; + xor.b32 %r954, %r925, %r923; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r955, %r952, 0, %p457; + selp.b32 %r956, %r951, 0, %p456; + selp.b32 %r957, %r954, 0, %p459; + selp.b32 %r958, %r953, 0, %p458; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r959, %r956, %r920; + xor.b32 %r960, %r955, %r919; + xor.b32 %r961, %r956, %r922; + xor.b32 %r962, %r955, %r921; + xor.b32 %r963, %r958, %r924; + xor.b32 %r964, %r957, %r923; + xor.b32 %r965, %r958, %r926; + xor.b32 %r966, %r957, %r925; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r967, %r945, %r943; + xor.b32 %r968, %r946, %r944; + xor.b32 %r969, %r949, %r947; + xor.b32 %r970, %r950, %r948; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r971, %r967, 0, %p457; + selp.b32 %r972, %r968, 0, %p456; + selp.b32 %r973, %r969, 0, %p459; + selp.b32 %r974, %r970, 0, %p458; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r975, %r971, %r943; + xor.b32 %r976, %r971, %r945; + xor.b32 %r977, %r972, %r944; + xor.b32 %r978, %r972, %r946; + xor.b32 %r979, %r973, %r947; + xor.b32 %r980, %r973, %r949; + xor.b32 %r981, %r974, %r948; + xor.b32 %r982, %r974, %r950; + .loc 3 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r202, %r960, %r828; + mul.lo.s32 %r204, %r962, %r828; + mul.lo.s32 %r206, %r959, %r828; + mul.lo.s32 %r208, %r961, %r828; + mul.lo.s32 %r210, %r964, %r828; + mul.lo.s32 %r212, %r966, %r828; + mul.lo.s32 %r214, %r963, %r828; + mul.lo.s32 %r216, %r965, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r202; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r204; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r206; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r208; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r210; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r212; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r214; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r216; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r217, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r983, %r217, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r220, %r983, %r217; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r220; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r221, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r984, %r221, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r224, %r984, %r221; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r224; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r225, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r985, %r225, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r228, %r985, %r225; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r228; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r229, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r986, %r229, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r232, %r986, %r229; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r232; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r233, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r987, %r233, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r236, %r987, %r233; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r236; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r237, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r988, %r237, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r240, %r988, %r237; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r240; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r241, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r989, %r241, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r244, %r989, %r241; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r244; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r245, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r990, %r245, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r248, %r990, %r245; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r248; + // end inline asm + bar.sync 0; + ld.shared.b32 %r991, [%r819]; + ld.shared.b32 %r992, [%r819+8]; + ld.shared.b32 %r993, [%r819+16]; + ld.shared.b32 %r994, [%r819+24]; + ld.shared.b32 %r995, [%r819+32]; + ld.shared.b32 %r996, [%r819+40]; + ld.shared.b32 %r997, [%r819+48]; + ld.shared.b32 %r998, [%r819+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r250, %r960, %r154; + mul.lo.s32 %r252, %r962, %r154; + mul.lo.s32 %r254, %r959, %r154; + mul.lo.s32 %r256, %r961, %r154; + mul.lo.s32 %r258, %r964, %r154; + mul.lo.s32 %r260, %r966, %r154; + mul.lo.s32 %r262, %r963, %r154; + mul.lo.s32 %r264, %r965, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r250; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r252; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r254; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r256; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r258; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r260; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r262; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r264; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r265, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r999, %r265, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r268, %r999, %r265; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r268; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r269, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1000, %r269, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r272, %r1000, %r269; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r272; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r273, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1001, %r273, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r276, %r1001, %r273; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r276; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r277, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1002, %r277, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r280, %r1002, %r277; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r280; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r281, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1003, %r281, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r284, %r1003, %r281; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r284; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r285, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1004, %r285, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r288, %r1004, %r285; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r288; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r289, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1005, %r289, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r292, %r1005, %r289; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r292; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r293, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1006, %r293, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r296, %r1006, %r293; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r296; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1007, [%r819]; + ld.shared.b32 %r1008, [%r819+8]; + ld.shared.b32 %r1009, [%r819+16]; + ld.shared.b32 %r1010, [%r819+24]; + ld.shared.b32 %r1011, [%r819+32]; + ld.shared.b32 %r1012, [%r819+40]; + ld.shared.b32 %r1013, [%r819+48]; + ld.shared.b32 %r1014, [%r819+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r298, %r975, %r828; + mul.lo.s32 %r300, %r976, %r828; + mul.lo.s32 %r302, %r977, %r828; + mul.lo.s32 %r304, %r978, %r828; + mul.lo.s32 %r306, %r979, %r828; + mul.lo.s32 %r308, %r980, %r828; + mul.lo.s32 %r310, %r981, %r828; + mul.lo.s32 %r312, %r982, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r298; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r300; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r302; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r304; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r306; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r308; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r310; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r312; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r313, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1015, %r313, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r316, %r1015, %r313; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r316; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r317, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1016, %r317, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r320, %r1016, %r317; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r320; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r321, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1017, %r321, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r324, %r1017, %r321; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r324; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r325, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1018, %r325, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r328, %r1018, %r325; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r328; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r329, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1019, %r329, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r332, %r1019, %r329; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r332; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r333, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1020, %r333, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r336, %r1020, %r333; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r336; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r337, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1021, %r337, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r340, %r1021, %r337; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r340; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r341, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1022, %r341, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r344, %r1022, %r341; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r344; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1023, [%r819]; + ld.shared.b32 %r1024, [%r819+8]; + ld.shared.b32 %r1025, [%r819+16]; + ld.shared.b32 %r1026, [%r819+24]; + ld.shared.b32 %r1027, [%r819+32]; + ld.shared.b32 %r1028, [%r819+40]; + ld.shared.b32 %r1029, [%r819+48]; + ld.shared.b32 %r1030, [%r819+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r346, %r975, %r154; + mul.lo.s32 %r348, %r976, %r154; + mul.lo.s32 %r350, %r977, %r154; + mul.lo.s32 %r352, %r978, %r154; + mul.lo.s32 %r354, %r979, %r154; + mul.lo.s32 %r356, %r980, %r154; + mul.lo.s32 %r358, %r981, %r154; + mul.lo.s32 %r360, %r982, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r346; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r348; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r350; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r352; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r354; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r356; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r358; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r360; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r361, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1031, %r361, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r364, %r1031, %r361; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r364; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r365, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1032, %r365, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r368, %r1032, %r365; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r368; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r369, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1033, %r369, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r372, %r1033, %r369; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r372; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r373, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1034, %r373, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r376, %r1034, %r373; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r376; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r377, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1035, %r377, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r380, %r1035, %r377; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r380; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r381, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1036, %r381, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r384, %r1036, %r381; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r384; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r385, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1037, %r385, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r388, %r1037, %r385; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r388; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r389, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1038, %r389, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r392, %r1038, %r389; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r392; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1039, [%r819]; + ld.shared.b32 %r1040, [%r819+8]; + ld.shared.b32 %r1041, [%r819+16]; + ld.shared.b32 %r1042, [%r819+24]; + ld.shared.b32 %r1043, [%r819+32]; + ld.shared.b32 %r1044, [%r819+40]; + ld.shared.b32 %r1045, [%r819+48]; + ld.shared.b32 %r1046, [%r819+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p460, %r991, %r1007; + setp.lt.s32 %p461, %r992, %r1008; + setp.ge.s32 %p462, %r993, %r1009; + setp.ge.s32 %p463, %r994, %r1010; + setp.lt.s32 %p464, %r995, %r1011; + setp.lt.s32 %p465, %r996, %r1012; + setp.ge.s32 %p466, %r997, %r1013; + setp.ge.s32 %p467, %r998, %r1014; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p468, %r991, %r1007; + setp.eq.b32 %p469, %r992, %r1008; + setp.ne.b32 %p470, %r993, %r1009; + setp.ne.b32 %p471, %r994, %r1010; + setp.eq.b32 %p472, %r995, %r1011; + setp.eq.b32 %p473, %r996, %r1012; + setp.ne.b32 %p474, %r997, %r1013; + setp.ne.b32 %p475, %r998, %r1014; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p476, %r1023, %r1039; + setp.gt.s32 %p477, %r1024, %r1040; + setp.le.s32 %p478, %r1025, %r1041; + setp.le.s32 %p479, %r1026, %r1042; + setp.gt.s32 %p480, %r1027, %r1043; + setp.gt.s32 %p481, %r1028, %r1044; + setp.le.s32 %p482, %r1029, %r1045; + setp.le.s32 %p483, %r1030, %r1046; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p484, %p468, %p476; + or.pred %p485, %p470, %p478; + or.pred %p486, %p471, %p479; + and.pred %p487, %p469, %p477; + and.pred %p488, %p472, %p480; + or.pred %p489, %p474, %p482; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p490, %p475, %p483; + and.pred %p491, %p473, %p481; + or.pred %p492, %p460, %p484; + and.pred %p493, %p462, %p485; + and.pred %p494, %p463, %p486; + or.pred %p495, %p461, %p487; + or.pred %p496, %p464, %p488; + and.pred %p497, %p466, %p489; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p498, %p467, %p490; + or.pred %p499, %p465, %p491; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1047, %r1007, %r991; + xor.b32 %r1048, %r1009, %r993; + xor.b32 %r1049, %r1010, %r994; + xor.b32 %r1050, %r1008, %r992; + xor.b32 %r1051, %r1011, %r995; + xor.b32 %r1052, %r1013, %r997; + xor.b32 %r1053, %r1014, %r998; + xor.b32 %r1054, %r1012, %r996; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1055, %r1047, 0, %p492; + selp.b32 %r1056, %r1048, 0, %p493; + selp.b32 %r1057, %r1050, 0, %p495; + selp.b32 %r1058, %r1049, 0, %p494; + selp.b32 %r1059, %r1051, 0, %p496; + selp.b32 %r1060, %r1052, 0, %p497; + selp.b32 %r1061, %r1054, 0, %p499; + selp.b32 %r1062, %r1053, 0, %p498; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1063, %r1055, %r960; + xor.b32 %r1064, %r1056, %r959; + xor.b32 %r1065, %r1058, %r961; + xor.b32 %r1066, %r1057, %r962; + xor.b32 %r1067, %r1059, %r964; + xor.b32 %r1068, %r1060, %r963; + xor.b32 %r1069, %r1062, %r965; + xor.b32 %r1070, %r1061, %r966; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1071, %r1039, %r1023; + xor.b32 %r1072, %r1040, %r1024; + xor.b32 %r1073, %r1041, %r1025; + xor.b32 %r1074, %r1042, %r1026; + xor.b32 %r1075, %r1043, %r1027; + xor.b32 %r1076, %r1044, %r1028; + xor.b32 %r1077, %r1045, %r1029; + xor.b32 %r1078, %r1046, %r1030; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1079, %r1071, 0, %p492; + selp.b32 %r1080, %r1072, 0, %p495; + selp.b32 %r1081, %r1073, 0, %p493; + selp.b32 %r1082, %r1074, 0, %p494; + selp.b32 %r1083, %r1075, 0, %p496; + selp.b32 %r1084, %r1076, 0, %p499; + selp.b32 %r1085, %r1077, 0, %p497; + selp.b32 %r1086, %r1078, 0, %p498; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1087, %r1079, %r975; + xor.b32 %r1088, %r1080, %r976; + xor.b32 %r1089, %r1081, %r977; + xor.b32 %r1090, %r1082, %r978; + xor.b32 %r1091, %r1083, %r979; + xor.b32 %r1092, %r1084, %r980; + xor.b32 %r1093, %r1085, %r981; + xor.b32 %r1094, %r1086, %r982; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p500, %r1063, %r1064; + setp.lt.s32 %p501, %r1066, %r1065; + setp.ge.s32 %p502, %r1067, %r1068; + setp.ge.s32 %p503, %r1070, %r1069; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p504, %r1063, %r1064; + setp.eq.b32 %p505, %r1066, %r1065; + setp.ne.b32 %p506, %r1067, %r1068; + setp.ne.b32 %p507, %r1070, %r1069; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p508, %r1087, %r1089; + setp.gt.s32 %p509, %r1088, %r1090; + setp.le.s32 %p510, %r1091, %r1093; + setp.le.s32 %p511, %r1092, %r1094; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p512, %p504, %p508; + and.pred %p513, %p505, %p509; + or.pred %p514, %p506, %p510; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p515, %p507, %p511; + or.pred %p516, %p500, %p512; + or.pred %p517, %p501, %p513; + and.pred %p518, %p502, %p514; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p519, %p503, %p515; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1095, %r1064, %r1063; + xor.b32 %r1096, %r1065, %r1066; + xor.b32 %r1097, %r1068, %r1067; + xor.b32 %r1098, %r1069, %r1070; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1099, %r1095, 0, %p516; + selp.b32 %r1100, %r1096, 0, %p517; + selp.b32 %r1101, %r1097, 0, %p518; + selp.b32 %r1102, %r1098, 0, %p519; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1103, %r1099, %r1063; + xor.b32 %r1104, %r1100, %r1066; + xor.b32 %r1105, %r1099, %r1064; + xor.b32 %r1106, %r1100, %r1065; + xor.b32 %r1107, %r1101, %r1067; + xor.b32 %r1108, %r1102, %r1070; + xor.b32 %r1109, %r1101, %r1068; + xor.b32 %r1110, %r1102, %r1069; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1111, %r1089, %r1087; + xor.b32 %r1112, %r1090, %r1088; + xor.b32 %r1113, %r1093, %r1091; + xor.b32 %r1114, %r1094, %r1092; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1115, %r1111, 0, %p516; + selp.b32 %r1116, %r1112, 0, %p517; + selp.b32 %r1117, %r1113, 0, %p518; + selp.b32 %r1118, %r1114, 0, %p519; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1119, %r1115, %r1087; + xor.b32 %r1120, %r1116, %r1088; + xor.b32 %r1121, %r1115, %r1089; + xor.b32 %r1122, %r1116, %r1090; + xor.b32 %r1123, %r1117, %r1091; + xor.b32 %r1124, %r1118, %r1092; + xor.b32 %r1125, %r1117, %r1093; + xor.b32 %r1126, %r1118, %r1094; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p520, %r1103, %r1104; + setp.lt.s32 %p521, %r1105, %r1106; + setp.ge.s32 %p522, %r1107, %r1108; + setp.ge.s32 %p523, %r1109, %r1110; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p524, %r1103, %r1104; + setp.eq.b32 %p525, %r1105, %r1106; + setp.ne.b32 %p526, %r1107, %r1108; + setp.ne.b32 %p527, %r1109, %r1110; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p528, %r1119, %r1120; + setp.gt.s32 %p529, %r1121, %r1122; + setp.le.s32 %p530, %r1123, %r1124; + setp.le.s32 %p531, %r1125, %r1126; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p532, %p524, %p528; + and.pred %p533, %p525, %p529; + or.pred %p534, %p526, %p530; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p535, %p527, %p531; + or.pred %p536, %p520, %p532; + or.pred %p537, %p521, %p533; + and.pred %p538, %p522, %p534; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p539, %p523, %p535; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1127, %r1104, %r1103; + xor.b32 %r1128, %r1106, %r1105; + xor.b32 %r1129, %r1108, %r1107; + xor.b32 %r1130, %r1110, %r1109; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1131, %r1127, 0, %p536; + selp.b32 %r1132, %r1128, 0, %p537; + selp.b32 %r1133, %r1129, 0, %p538; + selp.b32 %r1134, %r1130, 0, %p539; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1135, %r1131, %r1103; + xor.b32 %r1136, %r1131, %r1104; + xor.b32 %r1137, %r1132, %r1105; + xor.b32 %r1138, %r1132, %r1106; + xor.b32 %r1139, %r1133, %r1107; + xor.b32 %r1140, %r1133, %r1108; + xor.b32 %r1141, %r1134, %r1109; + xor.b32 %r1142, %r1134, %r1110; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1143, %r1120, %r1119; + xor.b32 %r1144, %r1122, %r1121; + xor.b32 %r1145, %r1124, %r1123; + xor.b32 %r1146, %r1126, %r1125; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1147, %r1143, 0, %p536; + selp.b32 %r1148, %r1144, 0, %p537; + selp.b32 %r1149, %r1145, 0, %p538; + selp.b32 %r1150, %r1146, 0, %p539; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1151, %r1147, %r1119; + xor.b32 %r1152, %r1147, %r1120; + xor.b32 %r1153, %r1148, %r1121; + xor.b32 %r1154, %r1148, %r1122; + xor.b32 %r1155, %r1149, %r1123; + xor.b32 %r1156, %r1149, %r1124; + xor.b32 %r1157, %r1150, %r1125; + xor.b32 %r1158, %r1150, %r1126; + .loc 3 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r394, %r1135, %r828; + mul.lo.s32 %r396, %r1136, %r828; + mul.lo.s32 %r398, %r1137, %r828; + mul.lo.s32 %r400, %r1138, %r828; + mul.lo.s32 %r402, %r1139, %r828; + mul.lo.s32 %r404, %r1140, %r828; + mul.lo.s32 %r406, %r1141, %r828; + mul.lo.s32 %r408, %r1142, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r394; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r396; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r398; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r400; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r402; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r404; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r406; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r408; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r409, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1159, %r409, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r412, %r1159, %r409; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r412; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r413, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1160, %r413, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r416, %r1160, %r413; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r416; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r417, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1161, %r417, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r420, %r1161, %r417; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r420; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r421, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1162, %r421, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r424, %r1162, %r421; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r424; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r425, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1163, %r425, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r428, %r1163, %r425; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r428; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r429, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1164, %r429, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r432, %r1164, %r429; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r432; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r433, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1165, %r433, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r436, %r1165, %r433; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r436; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r437, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1166, %r437, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r440, %r1166, %r437; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r440; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1167, [%r819]; + ld.shared.b32 %r1168, [%r819+8]; + ld.shared.b32 %r1169, [%r819+16]; + ld.shared.b32 %r1170, [%r819+24]; + ld.shared.b32 %r1171, [%r819+32]; + ld.shared.b32 %r1172, [%r819+40]; + ld.shared.b32 %r1173, [%r819+48]; + ld.shared.b32 %r1174, [%r819+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r442, %r1135, %r154; + mul.lo.s32 %r444, %r1136, %r154; + mul.lo.s32 %r446, %r1137, %r154; + mul.lo.s32 %r448, %r1138, %r154; + mul.lo.s32 %r450, %r1139, %r154; + mul.lo.s32 %r452, %r1140, %r154; + mul.lo.s32 %r454, %r1141, %r154; + mul.lo.s32 %r456, %r1142, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r442; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r444; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r446; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r448; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r450; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r452; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r454; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r456; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r457, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1175, %r457, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r460, %r1175, %r457; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r460; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r461, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1176, %r461, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r464, %r1176, %r461; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r464; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r465, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1177, %r465, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r468, %r1177, %r465; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r468; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r469, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1178, %r469, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r472, %r1178, %r469; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r472; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r473, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1179, %r473, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r476, %r1179, %r473; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r476; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r477, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1180, %r477, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r480, %r1180, %r477; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r480; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r481, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1181, %r481, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r484, %r1181, %r481; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r484; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r485, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1182, %r485, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r488, %r1182, %r485; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r488; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1183, [%r819]; + ld.shared.b32 %r1184, [%r819+8]; + ld.shared.b32 %r1185, [%r819+16]; + ld.shared.b32 %r1186, [%r819+24]; + ld.shared.b32 %r1187, [%r819+32]; + ld.shared.b32 %r1188, [%r819+40]; + ld.shared.b32 %r1189, [%r819+48]; + ld.shared.b32 %r1190, [%r819+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r490, %r1151, %r828; + mul.lo.s32 %r492, %r1152, %r828; + mul.lo.s32 %r494, %r1153, %r828; + mul.lo.s32 %r496, %r1154, %r828; + mul.lo.s32 %r498, %r1155, %r828; + mul.lo.s32 %r500, %r1156, %r828; + mul.lo.s32 %r502, %r1157, %r828; + mul.lo.s32 %r504, %r1158, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r490; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r492; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r494; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r496; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r498; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r500; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r502; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r504; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r505, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1191, %r505, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r508, %r1191, %r505; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r508; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r509, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1192, %r509, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r512, %r1192, %r509; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r512; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r513, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1193, %r513, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r516, %r1193, %r513; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r516; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r517, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1194, %r517, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r520, %r1194, %r517; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r520; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r521, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1195, %r521, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r524, %r1195, %r521; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r524; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r525, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1196, %r525, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r528, %r1196, %r525; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r528; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r529, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1197, %r529, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r532, %r1197, %r529; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r532; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r533, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1198, %r533, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r536, %r1198, %r533; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r536; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1199, [%r819]; + ld.shared.b32 %r1200, [%r819+8]; + ld.shared.b32 %r1201, [%r819+16]; + ld.shared.b32 %r1202, [%r819+24]; + ld.shared.b32 %r1203, [%r819+32]; + ld.shared.b32 %r1204, [%r819+40]; + ld.shared.b32 %r1205, [%r819+48]; + ld.shared.b32 %r1206, [%r819+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r538, %r1151, %r154; + mul.lo.s32 %r540, %r1152, %r154; + mul.lo.s32 %r542, %r1153, %r154; + mul.lo.s32 %r544, %r1154, %r154; + mul.lo.s32 %r546, %r1155, %r154; + mul.lo.s32 %r548, %r1156, %r154; + mul.lo.s32 %r550, %r1157, %r154; + mul.lo.s32 %r552, %r1158, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r538; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r540; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r542; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r544; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r546; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r548; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r550; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r552; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r553, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1207, %r553, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r556, %r1207, %r553; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r556; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r557, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1208, %r557, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r560, %r1208, %r557; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r560; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r561, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1209, %r561, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r564, %r1209, %r561; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r564; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r565, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1210, %r565, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r568, %r1210, %r565; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r568; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r569, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1211, %r569, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r572, %r1211, %r569; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r572; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r573, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1212, %r573, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r576, %r1212, %r573; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r576; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r577, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1213, %r577, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r580, %r1213, %r577; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r580; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r581, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1214, %r581, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r584, %r1214, %r581; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r584; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1215, [%r819]; + ld.shared.b32 %r1216, [%r819+8]; + ld.shared.b32 %r1217, [%r819+16]; + ld.shared.b32 %r1218, [%r819+24]; + ld.shared.b32 %r1219, [%r819+32]; + ld.shared.b32 %r1220, [%r819+40]; + ld.shared.b32 %r1221, [%r819+48]; + ld.shared.b32 %r1222, [%r819+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p540, %r1167, %r1183; + setp.lt.s32 %p541, %r1168, %r1184; + setp.lt.s32 %p542, %r1169, %r1185; + setp.lt.s32 %p543, %r1170, %r1186; + setp.ge.s32 %p544, %r1171, %r1187; + setp.ge.s32 %p545, %r1172, %r1188; + setp.ge.s32 %p546, %r1173, %r1189; + setp.ge.s32 %p547, %r1174, %r1190; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p548, %r1167, %r1183; + setp.eq.b32 %p549, %r1168, %r1184; + setp.eq.b32 %p550, %r1169, %r1185; + setp.eq.b32 %p551, %r1170, %r1186; + setp.ne.b32 %p552, %r1171, %r1187; + setp.ne.b32 %p553, %r1172, %r1188; + setp.ne.b32 %p554, %r1173, %r1189; + setp.ne.b32 %p555, %r1174, %r1190; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p556, %r1199, %r1215; + setp.gt.s32 %p557, %r1200, %r1216; + setp.gt.s32 %p558, %r1201, %r1217; + setp.gt.s32 %p559, %r1202, %r1218; + setp.le.s32 %p560, %r1203, %r1219; + setp.le.s32 %p561, %r1204, %r1220; + setp.le.s32 %p562, %r1205, %r1221; + setp.le.s32 %p563, %r1206, %r1222; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p564, %p548, %p556; + and.pred %p565, %p549, %p557; + and.pred %p566, %p550, %p558; + and.pred %p567, %p551, %p559; + or.pred %p568, %p552, %p560; + or.pred %p569, %p553, %p561; + or.pred %p570, %p554, %p562; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p571, %p555, %p563; + or.pred %p572, %p540, %p564; + or.pred %p573, %p541, %p565; + or.pred %p574, %p542, %p566; + or.pred %p575, %p543, %p567; + and.pred %p576, %p544, %p568; + and.pred %p577, %p545, %p569; + and.pred %p578, %p546, %p570; + .loc 3 599 19 // triton_helpers.py:599:19 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p579, %p547, %p571; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1223, %r1183, %r1167; + xor.b32 %r1224, %r1184, %r1168; + xor.b32 %r1225, %r1185, %r1169; + xor.b32 %r1226, %r1186, %r1170; + xor.b32 %r1227, %r1187, %r1171; + xor.b32 %r1228, %r1188, %r1172; + xor.b32 %r1229, %r1189, %r1173; + xor.b32 %r1230, %r1190, %r1174; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1231, %r1223, 0, %p572; + selp.b32 %r1232, %r1224, 0, %p573; + selp.b32 %r1233, %r1225, 0, %p574; + selp.b32 %r1234, %r1226, 0, %p575; + selp.b32 %r1235, %r1227, 0, %p576; + selp.b32 %r1236, %r1228, 0, %p577; + selp.b32 %r1237, %r1229, 0, %p578; + selp.b32 %r1238, %r1230, 0, %p579; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1239, %r1231, %r1135; + xor.b32 %r1240, %r1232, %r1136; + xor.b32 %r1241, %r1233, %r1137; + xor.b32 %r1242, %r1234, %r1138; + xor.b32 %r1243, %r1235, %r1139; + xor.b32 %r1244, %r1236, %r1140; + xor.b32 %r1245, %r1237, %r1141; + xor.b32 %r1246, %r1238, %r1142; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1247, %r1215, %r1199; + xor.b32 %r1248, %r1216, %r1200; + xor.b32 %r1249, %r1217, %r1201; + xor.b32 %r1250, %r1218, %r1202; + xor.b32 %r1251, %r1219, %r1203; + xor.b32 %r1252, %r1220, %r1204; + xor.b32 %r1253, %r1221, %r1205; + xor.b32 %r1254, %r1222, %r1206; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1255, %r1247, 0, %p572; + selp.b32 %r1256, %r1248, 0, %p573; + selp.b32 %r1257, %r1249, 0, %p574; + selp.b32 %r1258, %r1250, 0, %p575; + selp.b32 %r1259, %r1251, 0, %p576; + selp.b32 %r1260, %r1252, 0, %p577; + selp.b32 %r1261, %r1253, 0, %p578; + selp.b32 %r1262, %r1254, 0, %p579; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1263, %r1255, %r1151; + xor.b32 %r1264, %r1256, %r1152; + xor.b32 %r1265, %r1257, %r1153; + xor.b32 %r1266, %r1258, %r1154; + xor.b32 %r1267, %r1259, %r1155; + xor.b32 %r1268, %r1260, %r1156; + xor.b32 %r1269, %r1261, %r1157; + xor.b32 %r1270, %r1262, %r1158; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p580, %r1239, %r1243; + setp.lt.s32 %p581, %r1240, %r1244; + setp.lt.s32 %p582, %r1241, %r1245; + setp.lt.s32 %p583, %r1242, %r1246; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p584, %r1239, %r1243; + setp.eq.b32 %p585, %r1240, %r1244; + setp.eq.b32 %p586, %r1241, %r1245; + setp.eq.b32 %p587, %r1242, %r1246; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p588, %r1263, %r1267; + setp.gt.s32 %p589, %r1264, %r1268; + setp.gt.s32 %p590, %r1265, %r1269; + setp.gt.s32 %p591, %r1266, %r1270; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p592, %p584, %p588; + and.pred %p593, %p585, %p589; + and.pred %p594, %p586, %p590; + and.pred %p595, %p587, %p591; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p596, %p580, %p592; + or.pred %p597, %p581, %p593; + or.pred %p598, %p582, %p594; + or.pred %p599, %p583, %p595; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1271, %r1243, %r1239; + xor.b32 %r1272, %r1244, %r1240; + xor.b32 %r1273, %r1245, %r1241; + xor.b32 %r1274, %r1246, %r1242; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1275, %r1271, 0, %p596; + selp.b32 %r1276, %r1272, 0, %p597; + selp.b32 %r1277, %r1273, 0, %p598; + selp.b32 %r1278, %r1274, 0, %p599; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1279, %r1275, %r1239; + xor.b32 %r1280, %r1276, %r1240; + xor.b32 %r1281, %r1277, %r1241; + xor.b32 %r1282, %r1278, %r1242; + xor.b32 %r1283, %r1275, %r1243; + xor.b32 %r1284, %r1276, %r1244; + xor.b32 %r1285, %r1277, %r1245; + xor.b32 %r1286, %r1278, %r1246; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1287, %r1267, %r1263; + xor.b32 %r1288, %r1268, %r1264; + xor.b32 %r1289, %r1269, %r1265; + xor.b32 %r1290, %r1270, %r1266; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1291, %r1287, 0, %p596; + selp.b32 %r1292, %r1288, 0, %p597; + selp.b32 %r1293, %r1289, 0, %p598; + selp.b32 %r1294, %r1290, 0, %p599; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1295, %r1291, %r1263; + xor.b32 %r1296, %r1292, %r1264; + xor.b32 %r1297, %r1293, %r1265; + xor.b32 %r1298, %r1294, %r1266; + xor.b32 %r1299, %r1291, %r1267; + xor.b32 %r1300, %r1292, %r1268; + xor.b32 %r1301, %r1293, %r1269; + xor.b32 %r1302, %r1294, %r1270; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p600, %r1279, %r1281; + setp.lt.s32 %p601, %r1280, %r1282; + setp.lt.s32 %p602, %r1283, %r1285; + setp.lt.s32 %p603, %r1284, %r1286; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p604, %r1279, %r1281; + setp.eq.b32 %p605, %r1280, %r1282; + setp.eq.b32 %p606, %r1283, %r1285; + setp.eq.b32 %p607, %r1284, %r1286; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p608, %r1295, %r1297; + setp.gt.s32 %p609, %r1296, %r1298; + setp.gt.s32 %p610, %r1299, %r1301; + setp.gt.s32 %p611, %r1300, %r1302; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p612, %p604, %p608; + and.pred %p613, %p605, %p609; + and.pred %p614, %p606, %p610; + and.pred %p615, %p607, %p611; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p616, %p600, %p612; + or.pred %p617, %p601, %p613; + or.pred %p618, %p602, %p614; + or.pred %p619, %p603, %p615; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1303, %r1281, %r1279; + xor.b32 %r1304, %r1282, %r1280; + xor.b32 %r1305, %r1285, %r1283; + xor.b32 %r1306, %r1286, %r1284; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1307, %r1303, 0, %p616; + selp.b32 %r1308, %r1304, 0, %p617; + selp.b32 %r1309, %r1305, 0, %p618; + selp.b32 %r1310, %r1306, 0, %p619; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1311, %r1307, %r1279; + xor.b32 %r1312, %r1308, %r1280; + xor.b32 %r1313, %r1307, %r1281; + xor.b32 %r1314, %r1308, %r1282; + xor.b32 %r1315, %r1309, %r1283; + xor.b32 %r1316, %r1310, %r1284; + xor.b32 %r1317, %r1309, %r1285; + xor.b32 %r1318, %r1310, %r1286; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1319, %r1297, %r1295; + xor.b32 %r1320, %r1298, %r1296; + xor.b32 %r1321, %r1301, %r1299; + xor.b32 %r1322, %r1302, %r1300; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1323, %r1319, 0, %p616; + selp.b32 %r1324, %r1320, 0, %p617; + selp.b32 %r1325, %r1321, 0, %p618; + selp.b32 %r1326, %r1322, 0, %p619; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1327, %r1323, %r1295; + xor.b32 %r1328, %r1324, %r1296; + xor.b32 %r1329, %r1323, %r1297; + xor.b32 %r1330, %r1324, %r1298; + xor.b32 %r1331, %r1325, %r1299; + xor.b32 %r1332, %r1326, %r1300; + xor.b32 %r1333, %r1325, %r1301; + xor.b32 %r1334, %r1326, %r1302; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p620, %r1311, %r1312; + setp.lt.s32 %p621, %r1313, %r1314; + setp.lt.s32 %p622, %r1315, %r1316; + setp.lt.s32 %p623, %r1317, %r1318; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p624, %r1311, %r1312; + setp.eq.b32 %p625, %r1313, %r1314; + setp.eq.b32 %p626, %r1315, %r1316; + setp.eq.b32 %p627, %r1317, %r1318; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p628, %r1327, %r1328; + setp.gt.s32 %p629, %r1329, %r1330; + setp.gt.s32 %p630, %r1331, %r1332; + setp.gt.s32 %p631, %r1333, %r1334; + .loc 3 594 29 // triton_helpers.py:594:29 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + and.pred %p632, %p624, %p628; + and.pred %p633, %p625, %p629; + and.pred %p634, %p626, %p630; + and.pred %p635, %p627, %p631; + .loc 3 594 23 // triton_helpers.py:594:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + or.pred %p636, %p620, %p632; + or.pred %p637, %p621, %p633; + or.pred %p638, %p622, %p634; + or.pred %p639, %p623, %p635; + .loc 3 600 38 // triton_helpers.py:600:38 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1335, %r1312, %r1311; + xor.b32 %r1336, %r1314, %r1313; + xor.b32 %r1337, %r1316, %r1315; + xor.b32 %r1338, %r1318, %r1317; + .loc 3 600 46 // triton_helpers.py:600:46 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1339, %r1335, 0, %p636; + selp.b32 %r1340, %r1336, 0, %p637; + selp.b32 %r1341, %r1337, 0, %p638; + selp.b32 %r1342, %r1338, 0, %p639; + .loc 3 600 15 // triton_helpers.py:600:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1343, %r1339, %r1311; + xor.b32 %r1344, %r1339, %r1312; + xor.b32 %r1345, %r1340, %r1313; + xor.b32 %r1346, %r1340, %r1314; + xor.b32 %r1347, %r1341, %r1315; + xor.b32 %r1348, %r1341, %r1316; + xor.b32 %r1349, %r1342, %r1317; + xor.b32 %r1350, %r1342, %r1318; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1351, %r1328, %r1327; + xor.b32 %r1352, %r1330, %r1329; + xor.b32 %r1353, %r1332, %r1331; + xor.b32 %r1354, %r1334, %r1333; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1355, %r1351, 0, %p636; + selp.b32 %r1356, %r1352, 0, %p637; + selp.b32 %r1357, %r1353, 0, %p638; + selp.b32 %r1358, %r1354, 0, %p639; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1359, %r1355, %r1327; + xor.b32 %r1360, %r1355, %r1328; + xor.b32 %r1361, %r1356, %r1329; + xor.b32 %r1362, %r1356, %r1330; + xor.b32 %r1363, %r1357, %r1331; + xor.b32 %r1364, %r1357, %r1332; + xor.b32 %r1365, %r1358, %r1333; + xor.b32 %r1366, %r1358, %r1334; + .loc 3 538 40 // triton_helpers.py:538:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r586, %r1343, %r828; + mul.lo.s32 %r588, %r1344, %r828; + mul.lo.s32 %r590, %r1345, %r828; + mul.lo.s32 %r592, %r1346, %r828; + mul.lo.s32 %r594, %r1347, %r828; + mul.lo.s32 %r596, %r1348, %r828; + mul.lo.s32 %r598, %r1349, %r828; + mul.lo.s32 %r600, %r1350, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r586; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r588; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r590; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r592; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r594; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r596; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r598; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r600; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r601, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1367, %r601, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r604, %r1367, %r601; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r604; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r605, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1368, %r605, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r608, %r1368, %r605; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r608; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r609, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1369, %r609, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r612, %r1369, %r609; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r612; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r613, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1370, %r613, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r616, %r1370, %r613; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r616; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r617, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1371, %r617, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r620, %r1371, %r617; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r620; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r621, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1372, %r621, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r624, %r1372, %r621; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r624; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r625, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1373, %r625, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r628, %r1373, %r625; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r628; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r629, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1374, %r629, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r632, %r1374, %r629; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r632; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1375, [%r819]; + ld.shared.b32 %r1376, [%r819+8]; + ld.shared.b32 %r1377, [%r819+16]; + ld.shared.b32 %r1378, [%r819+24]; + ld.shared.b32 %r1379, [%r819+32]; + ld.shared.b32 %r1380, [%r819+40]; + ld.shared.b32 %r1381, [%r819+48]; + ld.shared.b32 %r1382, [%r819+56]; + .loc 3 539 41 // triton_helpers.py:539:41 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r634, %r1343, %r154; + mul.lo.s32 %r636, %r1344, %r154; + mul.lo.s32 %r638, %r1345, %r154; + mul.lo.s32 %r640, %r1346, %r154; + mul.lo.s32 %r642, %r1347, %r154; + mul.lo.s32 %r644, %r1348, %r154; + mul.lo.s32 %r646, %r1349, %r154; + mul.lo.s32 %r648, %r1350, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r634; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r636; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r638; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r640; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r642; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r644; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r646; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r648; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r649, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1383, %r649, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r652, %r1383, %r649; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r652; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r653, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1384, %r653, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r656, %r1384, %r653; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r656; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r657, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1385, %r657, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r660, %r1385, %r657; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r660; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r661, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1386, %r661, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r664, %r1386, %r661; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r664; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r665, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1387, %r665, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r668, %r1387, %r665; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r668; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r669, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1388, %r669, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r672, %r1388, %r669; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r672; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r673, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1389, %r673, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r676, %r1389, %r673; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r676; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r677, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1390, %r677, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r680, %r1390, %r677; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r680; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1391, [%r819]; + ld.shared.b32 %r1392, [%r819+8]; + ld.shared.b32 %r1393, [%r819+16]; + ld.shared.b32 %r1394, [%r819+24]; + ld.shared.b32 %r1395, [%r819+32]; + ld.shared.b32 %r1396, [%r819+40]; + ld.shared.b32 %r1397, [%r819+48]; + ld.shared.b32 %r1398, [%r819+56]; + .loc 3 548 23 // triton_helpers.py:548:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r682, %r1359, %r828; + mul.lo.s32 %r684, %r1360, %r828; + mul.lo.s32 %r686, %r1361, %r828; + mul.lo.s32 %r688, %r1362, %r828; + mul.lo.s32 %r690, %r1363, %r828; + mul.lo.s32 %r692, %r1364, %r828; + mul.lo.s32 %r694, %r1365, %r828; + mul.lo.s32 %r696, %r1366, %r828; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r682; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r684; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r686; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r688; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r690; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r692; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r694; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r696; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r697, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1399, %r697, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r700, %r1399, %r697; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r700; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r701, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1400, %r701, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r704, %r1400, %r701; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r704; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r705, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1401, %r705, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r708, %r1401, %r705; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r708; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r709, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1402, %r709, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r712, %r1402, %r709; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r712; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r713, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1403, %r713, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r716, %r1403, %r713; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r716; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r717, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1404, %r717, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r720, %r1404, %r717; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r720; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r721, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1405, %r721, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r724, %r1405, %r721; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r724; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r725, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1406, %r725, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r728, %r1406, %r725; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r728; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1407, [%r819]; + ld.shared.b32 %r1408, [%r819+8]; + ld.shared.b32 %r1409, [%r819+16]; + ld.shared.b32 %r1410, [%r819+24]; + ld.shared.b32 %r1411, [%r819+32]; + ld.shared.b32 %r1412, [%r819+40]; + ld.shared.b32 %r1413, [%r819+48]; + ld.shared.b32 %r1414, [%r819+56]; + .loc 3 551 23 // triton_helpers.py:551:23 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + mul.lo.s32 %r730, %r1359, %r154; + mul.lo.s32 %r732, %r1360, %r154; + mul.lo.s32 %r734, %r1361, %r154; + mul.lo.s32 %r736, %r1362, %r154; + mul.lo.s32 %r738, %r1363, %r154; + mul.lo.s32 %r740, %r1364, %r154; + mul.lo.s32 %r742, %r1365, %r154; + mul.lo.s32 %r744, %r1366, %r154; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + bar.sync 0; + // begin inline asm + @%p9 st.shared.b32 [ %r9 + 0 ], %r730; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r11 + 0 ], %r732; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r13 + 0 ], %r734; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r15 + 0 ], %r736; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r17 + 0 ], %r738; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r19 + 0 ], %r740; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r21 + 0 ], %r742; + // end inline asm + // begin inline asm + @%p9 st.shared.b32 [ %r23 + 0 ], %r744; + // end inline asm + bar.sync 0; + // begin inline asm + @%p9 ld.shared.b32 %r745, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1415, %r745, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r748, %r1415, %r745; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r26 + 0 ], %r748; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r749, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1416, %r749, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r752, %r1416, %r749; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r30 + 0 ], %r752; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r753, [ %r34 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1417, %r753, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r756, %r1417, %r753; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r34 + 0 ], %r756; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r757, [ %r38 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1418, %r757, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r760, %r1418, %r757; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r38 + 0 ], %r760; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r761, [ %r42 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1419, %r761, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r764, %r1419, %r761; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r42 + 0 ], %r764; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r765, [ %r46 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1420, %r765, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r768, %r1420, %r765; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r46 + 0 ], %r768; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r769, [ %r50 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1421, %r769, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r772, %r1421, %r769; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r50 + 0 ], %r772; + // end inline asm + // begin inline asm + @%p9 ld.shared.b32 %r773, [ %r54 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1422, %r773, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + add.s32 %r776, %r1422, %r773; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + // begin inline asm + @%p18 st.shared.b32 [ %r54 + 0 ], %r776; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1423, [%r819]; + ld.shared.b32 %r1424, [%r819+8]; + ld.shared.b32 %r1425, [%r819+16]; + ld.shared.b32 %r1426, [%r819+24]; + ld.shared.b32 %r1427, [%r819+32]; + ld.shared.b32 %r1428, [%r819+40]; + ld.shared.b32 %r1429, [%r819+48]; + ld.shared.b32 %r1430, [%r819+56]; + .loc 3 574 22 // triton_helpers.py:574:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.lt.s32 %p640, %r1381, %r1397; + setp.lt.s32 %p641, %r1379, %r1395; + setp.lt.s32 %p642, %r1377, %r1393; + setp.lt.s32 %p643, %r1375, %r1391; + setp.lt.s32 %p644, %r1382, %r1398; + setp.lt.s32 %p645, %r1380, %r1396; + setp.lt.s32 %p646, %r1378, %r1394; + setp.lt.s32 %p647, %r1376, %r1392; + .loc 3 591 21 // triton_helpers.py:591:21 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.eq.b32 %p648, %r1381, %r1397; + setp.eq.b32 %p649, %r1379, %r1395; + setp.eq.b32 %p650, %r1377, %r1393; + setp.eq.b32 %p651, %r1375, %r1391; + setp.eq.b32 %p652, %r1382, %r1398; + setp.eq.b32 %p653, %r1380, %r1396; + setp.eq.b32 %p654, %r1378, %r1394; + setp.eq.b32 %p655, %r1376, %r1392; + .loc 3 594 40 // triton_helpers.py:594:40 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + setp.gt.s32 %p656, %r1413, %r1429; + setp.gt.s32 %p657, %r1411, %r1427; + setp.gt.s32 %p658, %r1409, %r1425; + setp.gt.s32 %p659, %r1407, %r1423; + setp.gt.s32 %p660, %r1414, %r1430; + setp.gt.s32 %p661, %r1412, %r1428; + setp.gt.s32 %p662, %r1410, %r1426; + setp.gt.s32 %p663, %r1408, %r1424; + .loc 3 601 48 // triton_helpers.py:601:48 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1431, %r1429, %r1413; + xor.b32 %r1432, %r1427, %r1411; + xor.b32 %r1433, %r1425, %r1409; + xor.b32 %r1434, %r1423, %r1407; + xor.b32 %r1435, %r1430, %r1414; + xor.b32 %r1436, %r1428, %r1412; + xor.b32 %r1437, %r1426, %r1410; + xor.b32 %r1438, %r1424, %r1408; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1439, %r1434, 0, %p659; + selp.b32 %r1440, %r1439, 0, %p651; + selp.b32 %r1441, %r1434, %r1440, %p643; + selp.b32 %r1442, %r1433, 0, %p658; + selp.b32 %r1443, %r1442, 0, %p650; + selp.b32 %r1444, %r1433, %r1443, %p642; + selp.b32 %r1445, %r1432, 0, %p657; + selp.b32 %r1446, %r1445, 0, %p649; + selp.b32 %r1447, %r1432, %r1446, %p641; + selp.b32 %r1448, %r1431, 0, %p656; + selp.b32 %r1449, %r1448, 0, %p648; + selp.b32 %r1450, %r1431, %r1449, %p640; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1451, %r1450, %r1365; + xor.b32 %r1452, %r1447, %r1363; + xor.b32 %r1453, %r1444, %r1361; + xor.b32 %r1454, %r1441, %r1359; + .loc 3 601 59 // triton_helpers.py:601:59 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + selp.b32 %r1455, %r1438, 0, %p663; + selp.b32 %r1456, %r1455, 0, %p655; + selp.b32 %r1457, %r1438, %r1456, %p647; + selp.b32 %r1458, %r1437, 0, %p662; + selp.b32 %r1459, %r1458, 0, %p654; + selp.b32 %r1460, %r1437, %r1459, %p646; + selp.b32 %r1461, %r1436, 0, %p661; + selp.b32 %r1462, %r1461, 0, %p653; + selp.b32 %r1463, %r1436, %r1462, %p645; + selp.b32 %r1464, %r1435, 0, %p660; + selp.b32 %r1465, %r1464, 0, %p652; + selp.b32 %r1466, %r1435, %r1465, %p644; + .loc 3 601 22 // triton_helpers.py:601:22 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:41:67 ] + xor.b32 %r1467, %r1466, %r1366; + xor.b32 %r1468, %r1463, %r1364; + xor.b32 %r1469, %r1460, %r1362; + xor.b32 %r1470, %r1457, %r1360; +$L__tmp6: + .loc 1 44 34 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:44:34 + cvt.s64.s32 %rd18, %r5; + cvt.s64.s32 %rd19, %r2; + cvt.s64.s32 %rd20, %r7; + cvt.s64.s32 %rd21, %r3; + cvt.s64.s32 %rd22, %r6; + cvt.s64.s32 %rd23, %r1; + cvt.s64.s32 %rd24, %r8; + cvt.s64.s32 %rd25, %r4; + selp.b64 %rd26, %rd25, 0, %p1; + selp.b64 %rd27, %rd24, 0, %p1; + selp.b64 %rd28, %rd23, 0, %p1; + selp.b64 %rd29, %rd22, 0, %p1; + selp.b64 %rd30, %rd21, 0, %p1; + selp.b64 %rd31, %rd20, 0, %p1; + selp.b64 %rd32, %rd19, 0, %p1; + selp.b64 %rd33, %rd18, 0, %p1; +$L__tmp7: + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + bar.sync 0; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd34, %rd32, %rd33; + add.s64 %rd35, %rd30, %rd31; + add.s64 %rd36, %rd34, %rd35; + add.s64 %rd37, %rd28, %rd29; + add.s64 %rd38, %rd26, %rd27; + add.s64 %rd39, %rd37, %rd38; + add.s64 %rd9, %rd36, %rd39; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s32 %r1471, %r818, %r816; + shl.b32 %r1472, %r154, 3; + add.s32 %r777, %r1471, %r1472; + // begin inline asm + @%p9 st.shared.b64 [ %r777 + 0 ], %rd9; + // end inline asm + bar.sync 0; + setp.lt.u32 %p394, %r801, 256; + shl.b32 %r1473, %r801, 3; + add.s32 %r778, %r818, %r1473; + // begin inline asm + @%p394 ld.shared.b64 %rd10, [ %r778 + 0 ]; + // end inline asm + mov.b64 {_, %r1474}, %rd10; + cvt.u32.u64 %r1475, %rd10; + shfl.sync.bfly.b32 %r1476, %r1475, 1, 31, -1; + shfl.sync.bfly.b32 %r1477, %r1474, 1, 31, -1; + cvt.u64.u32 %rd40, %r1476; + cvt.u64.u32 %rd41, %r1477; + shl.b64 %rd42, %rd41, 32; + or.b64 %rd43, %rd40, %rd42; + .loc 2 261 15 // standard.py:261:15 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + add.s64 %rd11, %rd10, %rd43; + .loc 2 291 36 // standard.py:291:36 @[ c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:45:26 ] + and.b32 %r1478, %r801, 769; + setp.eq.b32 %p395, %r1478, 0; + // begin inline asm + @%p395 st.shared.b64 [ %r778 + 0 ], %rd11; + // end inline asm + bar.sync 0; + ld.shared.b32 %r798, [%r1471]; +$L__tmp8: + .loc 1 49 35 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:35 + shl.b32 %r1479, %r805, 4; + shl.b32 %r1480, %r806, 4; + .loc 1 49 32 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:32 + or.b32 %r1481, %r1479, %r809; + or.b32 %r1482, %r1480, %r809; + .loc 1 49 25 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:25 + mad.wide.s32 %rd12, %r1481, 4, %rd16; + mad.wide.s32 %rd13, %r1482, 4, %rd16; + .loc 1 49 47 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:49:47 + bar.sync 0; + shl.b32 %r1483, %r801, 4; + and.b32 %r1484, %r1483, 4080; + add.s32 %r1485, %r818, %r1484; + st.shared.v4.b32 [%r1485], {%r1454, %r1453, %r1452, %r1451}; + bar.sync 0; + and.b32 %r1486, %r1483, 112; + and.b32 %r1487, %r808, 896; + shl.b32 %r1488, %r801, 8; + and.b32 %r1489, %r1488, 2048; + shl.b32 %r1490, %r801, 6; + and.b32 %r1491, %r1490, 1024; + add.s32 %r1492, %r818, %r1486; + add.s32 %r1493, %r1492, %r1489; + add.s32 %r1494, %r1493, %r1491; + add.s32 %r784, %r1494, %r1487; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r790, %r791, %r794, %r795}, [%r784]; + // end inline asm + bar.sync 0; + st.shared.v4.b32 [%r1485], {%r1470, %r1469, %r1468, %r1467}; + bar.sync 0; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r792, %r793, %r796, %r797}, [%r784]; + // end inline asm + // begin inline asm + @%p396 st.global.v4.b32 [ %rd12 + 0 ], { %r790, %r791, %r792, %r793 }; + // end inline asm + // begin inline asm + @%p397 st.global.v4.b32 [ %rd13 + 0 ], { %r794, %r795, %r796, %r797 }; + // end inline asm + .loc 1 50 25 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:25 + mad.wide.s32 %rd14, %r804, 4, %rd17; + .loc 1 50 37 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:37 + and.b32 %r1495, %r801, 128; + setp.eq.b32 %p664, %r1495, 0; + and.pred %p398, %p664, %p1; + // begin inline asm + @%p398 st.global.b32 [ %rd14 + 0 ], { %r798 }; + // end inline asm + .loc 1 50 4 // c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py:50:4 + ret; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 121 +.b8 120 +.b8 103 +.b8 111 +.b8 116 +.b8 105 +.b8 104 +.b8 111 +.b8 120 +.b8 112 +.b8 110 +.b8 54 +.b8 111 +.b8 53 +.b8 120 +.b8 97 +.b8 52 +.b8 106 +.b8 118 +.b8 107 +.b8 99 +.b8 121 +.b8 55 +.b8 115 +.b8 104 +.b8 108 +.b8 103 +.b8 110 +.b8 121 +.b8 118 +.b8 52 +.b8 52 +.b8 117 +.b8 54 +.b8 100 +.b8 112 +.b8 109 +.b8 53 +.b8 101 +.b8 55 +.b8 52 +.b8 54 +.b8 102 +.b8 54 +.b8 100 +.b8 119 +.b8 103 +.b8 55 +.b8 111 +.b8 101 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 121 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..10c934a2d57358044c1f97b6d5e4f0855278e68c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1221 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc88 = loc(unknown) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc140 = loc("in_ptr0"(#loc)) +#loc141 = loc("out_ptr2"(#loc)) +#loc142 = loc("out_ptr3"(#loc)) +#loc143 = loc("xnumel"(#loc)) +#loc144 = loc("r0_numel"(#loc)) +#loc174 = loc("x"(#loc38)) +#loc175 = loc("idxs"(#loc38)) +#loc176 = loc("x"(#loc42)) +#loc177 = loc("idxs"(#loc42)) +#loc182 = loc("x"(#loc50)) +#loc183 = loc("idxs"(#loc50)) +#loc184 = loc("flip"(#loc50)) +#loc240 = loc("input"(#loc113)) +#loc241 = loc("a"(#loc117)) +#loc242 = loc("b"(#loc117)) +#loc244 = loc("x"(#loc122)) +#loc245 = loc("x"(#loc126)) +#loc246 = loc("input"(#loc135)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc145) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc146) + %xoffset = tt.get_program_id x : i32 loc(#loc147) + %xoffset_2 = arith.constant 128 : i32 loc(#loc148) + %xoffset_3 = arith.constant 128 : i32 loc(#loc148) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc148) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc149) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc150) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<128x1xi32> loc(#loc151) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<128x1xi32> loc(#loc151) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc152) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<128x1xi32> loc(#loc152) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc153) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc154) + %r0_offset = arith.constant 0 : i32 loc(#loc155) + %r0_mask = arith.constant true loc(#loc156) + %r0_mask_10 = arith.constant dense : tensor<128x16xi1> loc(#loc156) + %x0 = arith.constant 16 : i32 loc(#loc157) + %x0_11 = arith.constant 16 : i32 loc(#loc157) + %x0_12 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc157) + %x0_13 = arith.remsi %xindex_7, %x0_12 : tensor<128x1xi32> loc(#loc157) + %x1 = arith.constant 16 : i32 loc(#loc158) + %x1_14 = arith.constant 16 : i32 loc(#loc158) + %x1_15 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc158) + %x1_16 = arith.divsi %xindex_7, %x1_15 : tensor<128x1xi32> loc(#loc158) + %tmp0 = arith.constant 17 : i32 loc(#loc159) + %tmp0_17 = arith.constant 17 : i32 loc(#loc159) + %tmp0_18 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc159) + %tmp0_19 = arith.muli %tmp0_18, %r0_index_9 : tensor<1x16xi32> loc(#loc159) + %tmp0_20 = tt.broadcast %x0_13 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc160) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc160) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<128x16xi32> loc(#loc160) + %tmp0_23 = arith.constant 272 : i32 loc(#loc161) + %tmp0_24 = arith.constant 272 : i32 loc(#loc161) + %tmp0_25 = arith.constant dense<272> : tensor<128x1xi32> loc(#loc161) + %tmp0_26 = arith.muli %tmp0_25, %x1_16 : tensor<128x1xi32> loc(#loc161) + %tmp0_27 = tt.broadcast %tmp0_26 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc162) + %tmp0_28 = arith.addi %tmp0_22, %tmp0_27 : tensor<128x16xi32> loc(#loc162) + %tmp0_29 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc163) + %tmp0_30 = tt.addptr %tmp0_29, %tmp0_28 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc163) + %tmp0_31 = arith.constant 0.000000e+00 : f32 loc(#loc164) + %tmp0_32 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc164) + %tmp0_33 = arith.constant dense<0.000000e+00> : tensor<128x16xf32> loc(#loc164) + %tmp0_34 = arith.fptosi %tmp0_33 : tensor<128x16xf32> to tensor<128x16xi32> loc(#loc164) + %tmp0_35 = tt.load %tmp0_30, %tmp0_32, %tmp0_34 : tensor<128x16x!tt.ptr> loc(#loc164) + %tmp2 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc165) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc166) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_35, %tmp4) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc23) + %tmp7 = arith.extsi %tmp0_35 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc167) + %tmp10 = arith.constant 0 : i32 loc(#loc168) + %tmp10_36 = arith.constant 0 : i64 loc(#loc168) + %tmp10_37 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc168) + %tmp10_38 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc168) + %tmp10_39 = arith.select %tmp10_38, %tmp7, %tmp10_37 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc168) + %tmp11 = tt.call @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_39) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc169) + %tmp11_40 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc170) + %tmp12 = arith.extsi %0#1 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc171) + %tmp13 = arith.trunci %tmp12 : tensor<128x16xi64> to tensor<128x16xi32> loc(#loc172) + %tmp14 = arith.trunci %tmp11_40 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc173) + %c16_i32 = arith.constant 16 : i32 loc(#loc31) + %c16_i32_41 = arith.constant 16 : i32 loc(#loc31) + %cst = arith.constant dense<16> : tensor<128x1xi32> loc(#loc31) + %1 = arith.muli %cst, %xindex_7 : tensor<128x1xi32> loc(#loc31) + %2 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc32) + %3 = tt.broadcast %1 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc32) + %4 = arith.addi %2, %3 : tensor<128x16xi32> loc(#loc32) + %5 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc33) + %6 = tt.addptr %5, %4 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc33) + %7 = tt.broadcast %xmask_8 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc34) + tt.store %6, %tmp13, %7 : tensor<128x16x!tt.ptr> loc(#loc34) + %8 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc35) + %9 = tt.addptr %8, %xindex_7 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc35) + tt.store %9, %tmp14, %xmask_8 : tensor<128x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc38)), %idxs: tensor<128x16xi16> loc("idxs"(#loc38))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<128x16xi32>, tensor<128x16xi16>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc39) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc39) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc39) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc39) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc40) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc41) + %5 = ub.poison : tensor<128x16xi32> loc(#loc41) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc41) + } loc(#loc38) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i16S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc42)), %idxs: tensor<128x16xi16> loc("idxs"(#loc42))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi16>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + tt.return %0#0, %0#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc48) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc49) + %2 = ub.poison : tensor<128x16xi32> loc(#loc49) + tt.return %1, %2 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i16S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi16> loc("idxs"(#loc50)), %flip: tensor<128x16xi32> loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc199) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc200) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc201) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<1024x2x1xi16> loc(#loc201) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc202) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc203) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc204) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc205) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc206) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<1024x2x1xi16> loc(#loc206) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<1024x2x1xi16>) -> tensor<1024x1xi32> loc(#loc207) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc208) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc209) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_27 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_28 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_49 = arith.constant true loc(#loc217) + %cond_50 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<128x16xi1> loc(#loc217) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<128x16xi1> loc(#loc218) + %cond_53 = arith.ori %cond, %cond_52 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_53 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_50 = arith.ori %eq, %eq_49 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_50 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc223) + %cond_30 = arith.andi %3, %cond_29 : tensor<128x16xi1> loc(#loc224) + %cond_31 = arith.ori %1, %cond_30 : tensor<128x16xi1> loc(#loc225) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc226) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<128x16xi1> loc(#loc227) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<128x16xi1> loc(#loc228) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<128x16xi1> loc(#loc229) + %cond_36 = arith.extui %cond_35 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc230) + %cond_37 = arith.xori %cond_36, %flip : tensor<128x16xi32> loc(#loc230) + %cond_38 = arith.constant 0 : i32 loc(#loc231) + %cond_39 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc231) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<128x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_43 = arith.xori %x, %ret_42 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<128x16xi32> loc(#loc236) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S128_16S__(%idxs) : (tensor<128x16xi16>) -> tensor<128x16xi16> loc(#loc237) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc238) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_47 = arith.extsi %idxs : tensor<128x16xi16> to tensor<128x16xi32> loc(#loc239) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_43, %new_idxs_48 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi32> loc("input"(#loc113))) -> tensor<1024x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc114) + tt.return %0 : tensor<1024x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc116) + tt.return %1 : tensor<1024x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc117)), %b: i32 loc("b"(#loc117))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc118) + tt.return %0 : i32 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc120) + tt.return %1 : i32 loc(#loc120) + } loc(#loc117) + tt.func private @"triton.language.standard.sum__i16S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1024x2x1xi16> loc("input"(#loc113))) -> tensor<1024x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc243) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc114) + tt.return %0 : tensor<1024x1xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1024x1xi32> loc(#loc116) + tt.return %1 : tensor<1024x1xi32> loc(#loc116) + } loc(#loc113) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc122))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc123) + %false = arith.constant false loc(#loc124) + tt.return %false : i1 loc(#loc124) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc125) + tt.return %1 : i1 loc(#loc125) + } loc(#loc122) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S128_16S__(%x: tensor<128x16xi32> loc("x"(#loc126))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc127) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc128) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc128) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<128x16xi32> loc(#loc128) + %4 = arith.addi %x, %3 : tensor<128x16xi32> loc(#loc128) + tt.return %4 : tensor<128x16xi32> loc(#loc129) + ^bb1: // no predecessors + %5 = ub.poison : tensor<128x16xi32> loc(#loc130) + tt.return %5 : tensor<128x16xi32> loc(#loc130) + } loc(#loc126) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc132) + %cst = arith.constant dense : tensor<1xi1> loc(#loc132) + tt.return %cst : tensor<1xi1> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc134) + tt.return %0 : tensor<1xi1> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i32S128_16S__(%input: tensor<128x16xi32> loc("input"(#loc135))) -> tensor<128x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<128x16xi32> loc(#loc136) + tt.return %0 : tensor<128x16xi32> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi32> loc(#loc138) + tt.return %1 : tensor<128x16xi32> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<128x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc132) + %cst = arith.constant dense<0> : tensor<128x16xi32> loc(#loc132) + tt.return %cst : tensor<128x16xi32> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi32> loc(#loc134) + tt.return %0 : tensor<128x16xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard.zeros_like__i16S128_16S__(%input: tensor<128x16xi16> loc("input"(#loc135))) -> tensor<128x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<128x16xi16> loc(#loc136) + tt.return %0 : tensor<128x16xi16> loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x16xi16> loc(#loc138) + tt.return %1 : tensor<128x16xi16> loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<128x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc132) + %cst = arith.constant dense<0> : tensor<128x16xi16> loc(#loc132) + tt.return %cst : tensor<128x16xi16> loc(#loc133) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x16xi16> loc(#loc134) + tt.return %0 : tensor<128x16xi16> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc42)), %idxs: tensor<128x16xi32> loc("idxs"(#loc42))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + tt.return %1#0, %1#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc48) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x16xi32> loc(#loc49) + %3 = ub.poison : tensor<128x16xi32> loc(#loc49) + tt.return %2, %3 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: tensor<128x16xi32> loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<512x2x2xi32> loc("input"(#loc113))) -> tensor<512x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc114) + tt.return %0 : tensor<512x2xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<512x2xi32> loc(#loc116) + tt.return %1 : tensor<512x2xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: tensor<128x16xi32> loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc42)), %idxs: tensor<128x16xi32> loc("idxs"(#loc42))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc178) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc179) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc179) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc180) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc181) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<128x16xi32>, tensor<128x16xi32>, tensor<128x16xi32>) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + tt.return %2#0, %2#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc48) + ^bb1: // no predecessors + %3 = ub.poison : tensor<128x16xi32> loc(#loc49) + %4 = ub.poison : tensor<128x16xi32> loc(#loc49) + tt.return %3, %4 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: tensor<128x16xi32> loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_45 = arith.constant true loc(#loc217) + %cond_46 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<128x16xi1> loc(#loc217) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<128x16xi1> loc(#loc218) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_49 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_46 = arith.ori %eq, %eq_45 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_46 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = arith.extui %cond_33 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc230) + %cond_35 = arith.xori %cond_34, %flip : tensor<128x16xi32> loc(#loc230) + %cond_36 = arith.constant 0 : i32 loc(#loc231) + %cond_37 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc231) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<128x16xi32> loc(#loc231) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_41 = arith.xori %x, %ret_40 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_41, %new_idxs_44 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x4xi32> loc("input"(#loc113))) -> tensor<256x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc114) + tt.return %0 : tensor<256x4xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x4xi32> loc(#loc116) + tt.return %1 : tensor<256x4xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S128_16S_i32S128_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc42)), %idxs: tensor<128x16xi32> loc("idxs"(#loc42))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc247) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<128x16xi32>, tensor<128x16xi32>, i1) -> (tensor<128x16xi32>, tensor<128x16xi32>) loc(#loc47) + tt.return %3#0, %3#1 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc48) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc49) + %5 = ub.poison : tensor<128x16xi32> loc(#loc49) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc49) + } loc(#loc42) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x8xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<128x2x8xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x8xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x8xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i32S128_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x8xi32> loc("input"(#loc113))) -> tensor<128x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc114) + tt.reduce.return %2 : i32 loc(#loc114) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc114) + tt.return %0 : tensor<128x8xi32> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x8xi32> loc(#loc116) + tt.return %1 : tensor<128x8xi32> loc(#loc116) + } loc(#loc113) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x4xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<256x2x4xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x4xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x4xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<512x2x2xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<512x2x2xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<512x2x2xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<512x2x2xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S512_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S128_16S_i32S128_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<128x16xi32> loc("x"(#loc50)), %idxs: tensor<128x16xi32> loc("idxs"(#loc50)), %flip: i1 loc("flip"(#loc50))) -> (tensor<128x16xi32>, tensor<128x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc185) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc186) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc187) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc187) + %left_mask = arith.constant 1 : i32 loc(#loc188) + %left_mask_2 = arith.constant 1 : i32 loc(#loc188) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc188) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc188) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc189) + %ileft_5 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc189) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc190) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc191) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc192) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc193) + %iright_9 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc193) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc194) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc195) + %iright_12 = tt.broadcast %iright_11 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc196) + %ileft_13 = tt.reshape %ileft_8 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc197) + %iright_14 = tt.reshape %iright_12 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc198) + %y_idx = tt.reshape %idxs : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc199) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc201) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1024x2x1xi32> loc(#loc201) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc202) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc203) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc204) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc206) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1024x2x1xi32> loc(#loc206) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1024_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc207) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc208) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc209) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc210) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc211) + %left_valid_mask = arith.constant true loc(#loc212) + %left_valid_mask_25 = arith.constant dense : tensor<128x16xi1> loc(#loc212) + %right_valid_mask = arith.constant true loc(#loc213) + %right_valid_mask_26 = arith.constant dense : tensor<128x16xi1> loc(#loc213) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<128x16xi32> loc(#loc214) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<128x16xi32> loc(#loc215) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc248) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc83) + %1 = scf.if %0 -> (tensor<128x16xi1>) { + %cond_42 = arith.constant true loc(#loc217) + %cond_43 = arith.constant dense : tensor<128x16xi1> loc(#loc217) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<128x16xi1> loc(#loc217) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<128x16xi1> loc(#loc218) + %cond_46 = arith.ori %cond, %cond_45 : tensor<128x16xi1> loc(#loc249) + scf.yield %cond_46 : tensor<128x16xi1> loc(#loc249) + } else { + scf.yield %cond : tensor<128x16xi1> loc(#loc88) + } loc(#loc84) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc250) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S128_16S__(%ileft_13) : (tensor<128x16xi32>) -> i1 loc(#loc90) + %3 = scf.if %2 -> (tensor<128x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<128x16xi1> loc(#loc221) + %eq_43 = arith.ori %eq, %eq_42 : tensor<128x16xi1> loc(#loc251) + scf.yield %eq_43 : tensor<128x16xi1> loc(#loc251) + } else { + scf.yield %eq : tensor<128x16xi1> loc(#loc88) + } loc(#loc91) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc223) + %cond_28 = arith.andi %3, %cond_27 : tensor<128x16xi1> loc(#loc224) + %cond_29 = arith.ori %1, %cond_28 : tensor<128x16xi1> loc(#loc225) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc226) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<128x16xi1> loc(#loc227) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<128x16xi1> loc(#loc228) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<128x16xi1> loc(#loc229) + %cond_34 = tt.splat %flip : i1 -> tensor<128x16xi1> loc(#loc230) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<128x16xi1> loc(#loc230) + %ret = arith.xori %ileft_13, %iright_14 : tensor<128x16xi32> loc(#loc232) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%x) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc233) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc234) + %ret_38 = arith.xori %x, %ret_37 : tensor<128x16xi32> loc(#loc235) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<128x16xi32> loc(#loc236) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S128_16S__(%idxs) : (tensor<128x16xi32>) -> tensor<128x16xi32> loc(#loc237) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc238) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<128x16xi32> loc(#loc239) + tt.return %ret_38, %new_idxs_41 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc111) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x16xi32> loc(#loc112) + %5 = ub.poison : tensor<128x16xi32> loc(#loc112) + tt.return %4, %5 : tensor<128x16xi32>, tensor<128x16xi32> loc(#loc112) + } loc(#loc50) + tt.func private @"triton.language.standard.sum__i64S128_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x16xi64> loc("input"(#loc113))) -> tensor<128xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc114) + tt.reduce.return %2 : i64 loc(#loc114) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc114) + tt.return %0 : tensor<128xi64> loc(#loc115) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xi64> loc(#loc116) + tt.return %1 : tensor<128xi64> loc(#loc116) + } loc(#loc113) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc117)), %b: i64 loc("b"(#loc117))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc118) + tt.return %0 : i64 loc(#loc119) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc120) + tt.return %1 : i64 loc(#loc120) + } loc(#loc117) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":40:33) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":46:20) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":47:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc145 = loc("xnumel"(#loc1)) +#loc146 = loc("r0_numel"(#loc2)) +#loc147 = loc("xoffset"(#loc3)) +#loc148 = loc("xoffset"(#loc4)) +#loc149 = loc("xindex"(#loc5)) +#loc150 = loc("xindex"(#loc6)) +#loc151 = loc("xindex"(#loc7)) +#loc152 = loc("xmask"(#loc8)) +#loc153 = loc("r0_index"(#loc9)) +#loc154 = loc("r0_index"(#loc10)) +#loc155 = loc("r0_offset"(#loc11)) +#loc156 = loc("r0_mask"(#loc12)) +#loc157 = loc("x0"(#loc13)) +#loc158 = loc("x1"(#loc14)) +#loc159 = loc("tmp0"(#loc15)) +#loc160 = loc("tmp0"(#loc16)) +#loc161 = loc("tmp0"(#loc17)) +#loc162 = loc("tmp0"(#loc18)) +#loc163 = loc("tmp0"(#loc19)) +#loc164 = loc("tmp0"(#loc20)) +#loc165 = loc("tmp2"(#loc21)) +#loc166 = loc("tmp4"(#loc22)) +#loc167 = loc("tmp7"(#loc24)) +#loc168 = loc("tmp10"(#loc25)) +#loc169 = loc("tmp11"(#loc26)) +#loc170 = loc("tmp11"(#loc27)) +#loc171 = loc("tmp12"(#loc28)) +#loc172 = loc("tmp13"(#loc29)) +#loc173 = loc("tmp14"(#loc30)) +#loc178 = loc("flip"(#loc43)) +#loc179 = loc("flip"(#loc44)) +#loc180 = loc("flip"(#loc45)) +#loc181 = loc("flip"(#loc46)) +#loc185 = loc("y"(#loc51)) +#loc186 = loc("right_mask"(#loc52)) +#loc187 = loc("right_mask"(#loc53)) +#loc188 = loc("left_mask"(#loc54)) +#loc189 = loc("ileft"(#loc55)) +#loc190 = loc("ileft"(#loc56)) +#loc191 = loc("ileft"(#loc57)) +#loc192 = loc("ileft"(#loc58)) +#loc193 = loc("iright"(#loc59)) +#loc194 = loc("iright"(#loc60)) +#loc195 = loc("iright"(#loc61)) +#loc196 = loc("iright"(#loc62)) +#loc197 = loc("ileft"(#loc63)) +#loc198 = loc("iright"(#loc64)) +#loc199 = loc("y_idx"(#loc65)) +#loc200 = loc("left_idx"(#loc66)) +#loc201 = loc("left_idx"(#loc67)) +#loc202 = loc("left_idx"(#loc68)) +#loc203 = loc("left_idx"(#loc69)) +#loc204 = loc("left_idx"(#loc70)) +#loc205 = loc("right_idx"(#loc71)) +#loc206 = loc("right_idx"(#loc72)) +#loc207 = loc("right_idx"(#loc73)) +#loc208 = loc("right_idx"(#loc74)) +#loc209 = loc("right_idx"(#loc75)) +#loc210 = loc("left_idx"(#loc76)) +#loc211 = loc("right_idx"(#loc77)) +#loc212 = loc("left_valid_mask"(#loc78)) +#loc213 = loc("right_valid_mask"(#loc79)) +#loc214 = loc("left_isnan"(#loc80)) +#loc215 = loc("right_isnan"(#loc81)) +#loc216 = loc("cond"(#loc82)) +#loc217 = loc("cond"(#loc85)) +#loc218 = loc("cond"(#loc86)) +#loc219 = loc("cond"(#loc87)) +#loc220 = loc("eq"(#loc89)) +#loc221 = loc("eq"(#loc92)) +#loc222 = loc("eq"(#loc93)) +#loc223 = loc("cond"(#loc94)) +#loc224 = loc("cond"(#loc95)) +#loc225 = loc("cond"(#loc96)) +#loc226 = loc("cond"(#loc97)) +#loc227 = loc("cond"(#loc98)) +#loc228 = loc("cond"(#loc99)) +#loc229 = loc("cond"(#loc100)) +#loc230 = loc("cond"(#loc101)) +#loc231 = loc("cond"(#loc102)) +#loc232 = loc("ret"(#loc103)) +#loc233 = loc("ret"(#loc104)) +#loc234 = loc("ret"(#loc105)) +#loc235 = loc("ret"(#loc106)) +#loc236 = loc("new_idxs"(#loc107)) +#loc237 = loc("new_idxs"(#loc108)) +#loc238 = loc("new_idxs"(#loc109)) +#loc239 = loc("new_idxs"(#loc110)) +#loc243 = loc("input"(#loc121)) +#loc247 = loc("flip"(#loc139)) +#loc248 = loc("cond"(#loc216)) +#loc249 = loc("cond"(#loc219)) +#loc250 = loc("eq"(#loc220)) +#loc251 = loc("eq"(#loc222)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..23b99bb24a142094d95efd59ee4c6d39ff5f3797 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,841 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 2], [0, 4], [0, 8]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [0, 1]], block = []}> +#linear1 = #ttg.linear<{register = [[1, 0, 0], [2, 0, 0], [4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0], [128, 0, 0]], warp = [[256, 0, 0], [512, 0, 0], [0, 1, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[0, 1, 0], [1, 0, 0], [2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0], [64, 0, 0]], warp = [[128, 0, 0], [256, 0, 0], [0, 0, 1]], block = []}> +#linear3 = #ttg.linear<{register = [[0, 0, 2], [0, 1, 0], [1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0], [32, 0, 0]], warp = [[64, 0, 0], [128, 0, 0], [0, 0, 1]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 0, 2], [0, 0, 4], [0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [8, 0, 0], [16, 0, 0]], warp = [[32, 0, 0], [64, 0, 0], [0, 0, 1]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc1 = loc(unknown) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc77 = loc("in_ptr0"(#loc)) +#loc78 = loc("out_ptr2"(#loc)) +#loc79 = loc("out_ptr3"(#loc)) +#loc80 = loc("xnumel"(#loc)) +#loc81 = loc("r0_numel"(#loc)) +#loc99 = loc(callsite(#loc19 at #loc20)) +#loc105 = loc("ileft"(#loc28)) +#loc109 = loc("iright"(#loc33)) +#loc118 = loc("left_idx"(#loc42)) +#loc123 = loc("right_idx"(#loc47)) +#loc143 = loc("tmp11"(#loc67)) +#loc149 = loc(callsite(#loc24 at #loc99)) +#loc153 = loc(callsite(#loc1 at #loc143)) +#loc157 = loc(callsite(#loc105 at #loc149)) +#loc161 = loc(callsite(#loc109 at #loc149)) +#loc169 = loc(callsite(#loc118 at #loc149)) +#loc174 = loc(callsite(#loc123 at #loc149)) +#loc194 = loc(callsite(#loc1 at #loc157)) +#loc196 = loc(callsite(#loc1 at #loc161)) +#loc199 = loc(callsite(#loc1 at #loc169)) +#loc202 = loc(callsite(#loc1 at #loc174)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x16xi64, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst_1 = arith.constant dense<128> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<128x1xi32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<16> : tensor<128x1xi32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<17> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<272> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_9 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_10 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %cst_11 = arith.constant dense<0> : tensor<128x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc82) + %xoffset_12 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc83) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc84) + %xindex_13 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc84) + %xindex_14 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc84) + %xindex_15 = tt.expand_dims %xindex_13 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<128x1xi32, #blocked1> loc(#loc84) + %xindex_16 = tt.splat %xoffset_12 : i32 -> tensor<128x1xi32, #blocked> loc(#loc85) + %xindex_17 = tt.splat %xoffset_12 : i32 -> tensor<128x1xi32, #blocked1> loc(#loc85) + %xindex_18 = arith.addi %xindex_16, %xindex_14 : tensor<128x1xi32, #blocked> loc(#loc85) + %xindex_19 = arith.addi %xindex_17, %xindex_15 : tensor<128x1xi32, #blocked1> loc(#loc85) + %xmask = arith.cmpi slt, %xindex_18, %cst_1 : tensor<128x1xi32, #blocked> loc(#loc86) + %xmask_20 = arith.cmpi slt, %xindex_19, %cst_2 : tensor<128x1xi32, #blocked1> loc(#loc86) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc87) + %r0_index_21 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc87) + %r0_index_22 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc87) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc87) + %r0_index_24 = tt.expand_dims %r0_index_21 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc87) + %r0_index_25 = tt.expand_dims %r0_index_22 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc87) + %x0 = arith.remsi %xindex_18, %cst_3 : tensor<128x1xi32, #blocked> loc(#loc88) + %x1 = arith.divsi %xindex_18, %cst_3 : tensor<128x1xi32, #blocked> loc(#loc89) + %tmp0 = arith.muli %r0_index_23, %cst_5 : tensor<1x16xi32, #blocked> loc(#loc90) + %tmp0_26 = tt.broadcast %x0 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc91) + %tmp0_27 = tt.broadcast %tmp0 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc91) + %tmp0_28 = arith.addi %tmp0_26, %tmp0_27 : tensor<128x16xi32, #blocked> loc(#loc91) + %tmp0_29 = arith.muli %x1, %cst_6 : tensor<128x1xi32, #blocked> loc(#loc92) + %tmp0_30 = tt.broadcast %tmp0_29 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked> loc(#loc93) + %tmp0_31 = arith.addi %tmp0_28, %tmp0_30 : tensor<128x16xi32, #blocked> loc(#loc93) + %tmp0_32 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked> loc(#loc94) + %tmp0_33 = tt.addptr %tmp0_32, %tmp0_31 : tensor<128x16x!tt.ptr, #blocked>, tensor<128x16xi32, #blocked> loc(#loc94) + %tmp0_34 = tt.broadcast %xmask : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked> loc(#loc95) + %tmp0_35 = tt.broadcast %xmask_20 : tensor<128x1xi1, #blocked1> -> tensor<128x16xi1, #blocked1> loc(#loc95) + %tmp0_36 = tt.load %tmp0_33, %tmp0_34, %cst_11 : tensor<128x16x!tt.ptr, #blocked> loc(#loc95) + %tmp2 = arith.trunci %r0_index_24 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc96) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<128x16xi16, #linear> loc(#loc97) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc146) + %flip_37 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc146) + %flip_38 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc146) + %flip_39 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc146) + %flip_40 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc146) + %flip_41 = tt.expand_dims %flip_37 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc146) + %flip_42 = tt.expand_dims %flip_38 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc146) + %flip_43 = tt.expand_dims %flip_39 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc146) + %flip_44 = tt.expand_dims %flip_40 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc146) + %flip_45 = tt.expand_dims %flip_41 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc146) + %flip_46 = tt.expand_dims %flip_42 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc146) + %flip_47 = tt.expand_dims %flip_43 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc146) + %flip_48 = tt.broadcast %flip_44 : tensor<1x2x1xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc147) + %flip_49 = tt.reshape %flip_48 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #blocked> loc(#loc148) + %flip_50 = tt.reshape %flip_48 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc148) + %y = tt.reshape %tmp0_36 : tensor<128x16xi32, #blocked> -> tensor<1024x2x1xi32, #linear1> loc(#loc154) + %left_mask = arith.subi %cst_7, %flip_45 : tensor<1x2x1xi32, #linear1> loc(#loc155) + %left_mask_51 = arith.subi %cst_8, %flip_44 : tensor<1x2x1xi32, #linear2> loc(#loc155) + %left_mask_52 = arith.subi %cst_9, %flip_46 : tensor<1x2x1xi32, #linear3> loc(#loc155) + %left_mask_53 = arith.subi %cst_10, %flip_47 : tensor<1x2x1xi32, #linear4> loc(#loc155) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc156) + %ileft_54 = arith.muli %y, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc156) + %ileft_55 = "tt.reduce"(%ileft_54) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_56 = tt.expand_dims %ileft_55 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc158) + %ileft_57 = tt.broadcast %ileft_56 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc159) + %iright = tt.broadcast %flip_45 : tensor<1x2x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc160) + %iright_58 = arith.muli %y, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc160) + %iright_59 = "tt.reduce"(%iright_58) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_60 = tt.expand_dims %iright_59 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc162) + %iright_61 = tt.broadcast %iright_60 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc163) + %ileft_62 = tt.reshape %ileft_57 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc164) + %ileft_63 = tt.reshape %ileft_57 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_64 = tt.reshape %iright_61 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc165) + %iright_65 = tt.reshape %iright_61 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx = tt.reshape %tmp4 : tensor<128x16xi16, #linear> -> tensor<1024x2x1xi16, #linear1> loc(#loc166) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc167) + %left_idx_66 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<1024x2x1xi16, #linear1> loc(#loc168) + %left_idx_67 = arith.muli %y_idx, %left_idx_66 : tensor<1024x2x1xi16, #linear1> loc(#loc168) + %input = arith.extsi %left_idx_67 : tensor<1024x2x1xi16, #linear1> to tensor<1024x2x1xi32, #linear1> loc(#loc197) + %left_idx_68 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_69 = tt.expand_dims %left_idx_68 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc170) + %left_idx_70 = tt.broadcast %left_idx_69 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc171) + %right_idx = arith.trunci %flip_45 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc172) + %right_idx_71 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<1024x2x1xi16, #linear1> loc(#loc173) + %right_idx_72 = arith.muli %y_idx, %right_idx_71 : tensor<1024x2x1xi16, #linear1> loc(#loc173) + %input_73 = arith.extsi %right_idx_72 : tensor<1024x2x1xi16, #linear1> to tensor<1024x2x1xi32, #linear1> loc(#loc200) + %right_idx_74 = "tt.reduce"(%input_73) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_75 = tt.expand_dims %right_idx_74 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc175) + %right_idx_76 = tt.broadcast %right_idx_75 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc176) + %left_idx_77 = tt.reshape %left_idx_70 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc177) + %left_idx_78 = tt.reshape %left_idx_70 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_79 = tt.reshape %right_idx_76 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #blocked> loc(#loc178) + %right_idx_80 = tt.reshape %right_idx_76 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond = arith.cmpi slt, %ileft_62, %iright_64 : tensor<128x16xi32, #blocked> loc(#loc179) + %cond_81 = arith.cmpi slt, %ileft_63, %iright_65 : tensor<128x16xi32, #linear> loc(#loc179) + %eq = arith.cmpi eq, %ileft_62, %iright_64 : tensor<128x16xi32, #blocked> loc(#loc180) + %eq_82 = arith.cmpi eq, %ileft_63, %iright_65 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_83 = arith.cmpi sgt, %left_idx_77, %right_idx_79 : tensor<128x16xi32, #blocked> loc(#loc181) + %cond_84 = arith.cmpi sgt, %left_idx_78, %right_idx_80 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_85 = arith.andi %eq, %cond_83 : tensor<128x16xi1, #blocked> loc(#loc182) + %cond_86 = arith.andi %eq_82, %cond_84 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_87 = arith.ori %cond, %cond_85 : tensor<128x16xi1, #blocked> loc(#loc183) + %cond_88 = arith.ori %cond_81, %cond_86 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_89 = arith.extui %cond_87 : tensor<128x16xi1, #blocked> to tensor<128x16xi32, #blocked> loc(#loc184) + %cond_90 = arith.extui %cond_88 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_91 = arith.xori %cond_89, %flip_49 : tensor<128x16xi32, #blocked> loc(#loc184) + %cond_92 = arith.xori %cond_90, %flip_50 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_93 = arith.cmpi ne, %cond_91, %cst_11 : tensor<128x16xi32, #blocked> loc(#loc185) + %cond_94 = arith.cmpi ne, %cond_92, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret = arith.xori %ileft_62, %iright_64 : tensor<128x16xi32, #blocked> loc(#loc186) + %ret_95 = arith.select %cond_93, %ret, %cst_11 : tensor<128x16xi1, #blocked>, tensor<128x16xi32, #blocked> loc(#loc187) + %ret_96 = arith.xori %tmp0_36, %ret_95 : tensor<128x16xi32, #blocked> loc(#loc188) + %ret_97 = ttg.convert_layout %ret_96 : tensor<128x16xi32, #blocked> -> tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs = arith.xori %left_idx_78, %right_idx_80 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_98 = arith.select %cond_94, %new_idxs, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_99 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc191) + %new_idxs_100 = tt.broadcast %new_idxs_99 : tensor<1x16xi32, #linear> -> tensor<128x16xi32, #linear> loc(#loc191) + %new_idxs_101 = arith.xori %new_idxs_100, %new_idxs_98 : tensor<128x16xi32, #linear> loc(#loc191) + %flip_102 = tt.broadcast %flip_46 : tensor<1x2x1xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc147) + %flip_103 = tt.reshape %flip_102 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc148) + %y_104 = tt.reshape %ret_96 : tensor<128x16xi32, #blocked> -> tensor<512x2x2xi32, #linear2> loc(#loc154) + %ileft_105 = tt.broadcast %left_mask_51 : tensor<1x2x1xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc156) + %ileft_106 = arith.muli %y_104, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc156) + %ileft_107 = "tt.reduce"(%ileft_106) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_108 = tt.expand_dims %ileft_107 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc158) + %ileft_109 = tt.broadcast %ileft_108 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc159) + %iright_110 = arith.muli %y_104, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc160) + %iright_111 = "tt.reduce"(%iright_110) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_112 = tt.expand_dims %iright_111 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc162) + %iright_113 = tt.broadcast %iright_112 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc163) + %ileft_114 = tt.reshape %ileft_109 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_115 = tt.reshape %iright_113 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_116 = tt.reshape %new_idxs_101 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc166) + %left_idx_117 = arith.muli %y_idx_116, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc168) + %left_idx_118 = "tt.reduce"(%left_idx_117) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_119 = tt.expand_dims %left_idx_118 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc170) + %left_idx_120 = tt.broadcast %left_idx_119 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc171) + %right_idx_121 = arith.muli %y_idx_116, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc173) + %right_idx_122 = "tt.reduce"(%right_idx_121) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_123 = tt.expand_dims %right_idx_122 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc175) + %right_idx_124 = tt.broadcast %right_idx_123 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc176) + %left_idx_125 = tt.reshape %left_idx_120 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_126 = tt.reshape %right_idx_124 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_127 = arith.cmpi slt, %ileft_114, %iright_115 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_128 = arith.cmpi eq, %ileft_114, %iright_115 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_129 = arith.cmpi sgt, %left_idx_125, %right_idx_126 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_130 = arith.andi %eq_128, %cond_129 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_131 = arith.ori %cond_127, %cond_130 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_132 = arith.extui %cond_131 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_133 = arith.xori %cond_132, %flip_103 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_134 = arith.cmpi ne, %cond_133, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret_135 = arith.xori %ileft_114, %iright_115 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_136 = arith.select %cond_134, %ret_135, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_137 = arith.xori %ret_97, %ret_136 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_138 = arith.xori %left_idx_125, %right_idx_126 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_139 = arith.select %cond_134, %new_idxs_138, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_140 = arith.xori %new_idxs_101, %new_idxs_139 : tensor<128x16xi32, #linear> loc(#loc191) + %y_141 = tt.reshape %ret_137 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc154) + %ileft_142 = arith.muli %y_141, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc156) + %ileft_143 = "tt.reduce"(%ileft_142) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_144 = tt.expand_dims %ileft_143 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc158) + %ileft_145 = tt.broadcast %ileft_144 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc159) + %iright_146 = arith.muli %y_141, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc160) + %iright_147 = "tt.reduce"(%iright_146) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_148 = tt.expand_dims %iright_147 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc162) + %iright_149 = tt.broadcast %iright_148 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc163) + %ileft_150 = tt.reshape %ileft_145 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_151 = tt.reshape %iright_149 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_152 = tt.reshape %new_idxs_140 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc166) + %left_idx_153 = arith.muli %y_idx_152, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc168) + %left_idx_154 = "tt.reduce"(%left_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_155 = tt.expand_dims %left_idx_154 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc170) + %left_idx_156 = tt.broadcast %left_idx_155 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc171) + %right_idx_157 = arith.muli %y_idx_152, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc173) + %right_idx_158 = "tt.reduce"(%right_idx_157) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_159 = tt.expand_dims %right_idx_158 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc175) + %right_idx_160 = tt.broadcast %right_idx_159 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc176) + %left_idx_161 = tt.reshape %left_idx_156 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_162 = tt.reshape %right_idx_160 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_163 = arith.cmpi slt, %ileft_150, %iright_151 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_164 = arith.cmpi eq, %ileft_150, %iright_151 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_165 = arith.cmpi sgt, %left_idx_161, %right_idx_162 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_166 = arith.andi %eq_164, %cond_165 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_167 = arith.ori %cond_163, %cond_166 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_168 = arith.extui %cond_167 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_169 = arith.xori %cond_168, %flip_103 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_170 = arith.cmpi ne, %cond_169, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret_171 = arith.xori %ileft_150, %iright_151 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_172 = arith.select %cond_170, %ret_171, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_173 = arith.xori %ret_137, %ret_172 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_174 = arith.xori %left_idx_161, %right_idx_162 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_175 = arith.select %cond_170, %new_idxs_174, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_176 = arith.xori %new_idxs_140, %new_idxs_175 : tensor<128x16xi32, #linear> loc(#loc191) + %flip_177 = tt.broadcast %flip_47 : tensor<1x2x1xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc147) + %flip_178 = tt.reshape %flip_177 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc148) + %y_179 = tt.reshape %ret_173 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc154) + %ileft_180 = tt.broadcast %left_mask_52 : tensor<1x2x1xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc156) + %ileft_181 = arith.muli %y_179, %ileft_180 : tensor<256x2x4xi32, #linear3> loc(#loc156) + %ileft_182 = "tt.reduce"(%ileft_181) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_183 = tt.expand_dims %ileft_182 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc158) + %ileft_184 = tt.broadcast %ileft_183 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc159) + %iright_185 = arith.muli %y_179, %flip_102 : tensor<256x2x4xi32, #linear3> loc(#loc160) + %iright_186 = "tt.reduce"(%iright_185) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_187 = tt.expand_dims %iright_186 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc162) + %iright_188 = tt.broadcast %iright_187 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc163) + %ileft_189 = tt.reshape %ileft_184 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_190 = tt.reshape %iright_188 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_191 = tt.reshape %new_idxs_176 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc166) + %left_idx_192 = arith.muli %y_idx_191, %ileft_180 : tensor<256x2x4xi32, #linear3> loc(#loc168) + %left_idx_193 = "tt.reduce"(%left_idx_192) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_194 = tt.expand_dims %left_idx_193 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc170) + %left_idx_195 = tt.broadcast %left_idx_194 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc171) + %right_idx_196 = arith.muli %y_idx_191, %flip_102 : tensor<256x2x4xi32, #linear3> loc(#loc173) + %right_idx_197 = "tt.reduce"(%right_idx_196) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_198 = tt.expand_dims %right_idx_197 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc175) + %right_idx_199 = tt.broadcast %right_idx_198 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc176) + %left_idx_200 = tt.reshape %left_idx_195 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_201 = tt.reshape %right_idx_199 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_202 = arith.cmpi slt, %ileft_189, %iright_190 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_203 = arith.cmpi eq, %ileft_189, %iright_190 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_204 = arith.cmpi sgt, %left_idx_200, %right_idx_201 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_205 = arith.andi %eq_203, %cond_204 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_206 = arith.ori %cond_202, %cond_205 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_207 = arith.extui %cond_206 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_208 = arith.xori %cond_207, %flip_178 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_209 = arith.cmpi ne, %cond_208, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret_210 = arith.xori %ileft_189, %iright_190 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_211 = arith.select %cond_209, %ret_210, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_212 = arith.xori %ret_173, %ret_211 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_213 = arith.xori %left_idx_200, %right_idx_201 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_214 = arith.select %cond_209, %new_idxs_213, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_215 = arith.xori %new_idxs_176, %new_idxs_214 : tensor<128x16xi32, #linear> loc(#loc191) + %y_216 = tt.reshape %ret_212 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc154) + %ileft_217 = arith.muli %y_216, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc156) + %ileft_218 = "tt.reduce"(%ileft_217) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_219 = tt.expand_dims %ileft_218 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc158) + %ileft_220 = tt.broadcast %ileft_219 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc159) + %iright_221 = arith.muli %y_216, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc160) + %iright_222 = "tt.reduce"(%iright_221) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_223 = tt.expand_dims %iright_222 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc162) + %iright_224 = tt.broadcast %iright_223 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc163) + %ileft_225 = tt.reshape %ileft_220 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_226 = tt.reshape %iright_224 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_227 = tt.reshape %new_idxs_215 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc166) + %left_idx_228 = arith.muli %y_idx_227, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc168) + %left_idx_229 = "tt.reduce"(%left_idx_228) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_230 = tt.expand_dims %left_idx_229 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc170) + %left_idx_231 = tt.broadcast %left_idx_230 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc171) + %right_idx_232 = arith.muli %y_idx_227, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc173) + %right_idx_233 = "tt.reduce"(%right_idx_232) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_234 = tt.expand_dims %right_idx_233 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc175) + %right_idx_235 = tt.broadcast %right_idx_234 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc176) + %left_idx_236 = tt.reshape %left_idx_231 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_237 = tt.reshape %right_idx_235 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_238 = arith.cmpi slt, %ileft_225, %iright_226 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_239 = arith.cmpi eq, %ileft_225, %iright_226 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_240 = arith.cmpi sgt, %left_idx_236, %right_idx_237 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_241 = arith.andi %eq_239, %cond_240 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_242 = arith.ori %cond_238, %cond_241 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_243 = arith.extui %cond_242 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_244 = arith.xori %cond_243, %flip_178 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_245 = arith.cmpi ne, %cond_244, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret_246 = arith.xori %ileft_225, %iright_226 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_247 = arith.select %cond_245, %ret_246, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_248 = arith.xori %ret_212, %ret_247 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_249 = arith.xori %left_idx_236, %right_idx_237 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_250 = arith.select %cond_245, %new_idxs_249, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_251 = arith.xori %new_idxs_215, %new_idxs_250 : tensor<128x16xi32, #linear> loc(#loc191) + %y_252 = tt.reshape %ret_248 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc154) + %ileft_253 = arith.muli %y_252, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc156) + %ileft_254 = "tt.reduce"(%ileft_253) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_255 = tt.expand_dims %ileft_254 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc158) + %ileft_256 = tt.broadcast %ileft_255 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc159) + %iright_257 = arith.muli %y_252, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc160) + %iright_258 = "tt.reduce"(%iright_257) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_259 = tt.expand_dims %iright_258 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc162) + %iright_260 = tt.broadcast %iright_259 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc163) + %ileft_261 = tt.reshape %ileft_256 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_262 = tt.reshape %iright_260 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_263 = tt.reshape %new_idxs_251 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc166) + %left_idx_264 = arith.muli %y_idx_263, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc168) + %left_idx_265 = "tt.reduce"(%left_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_266 = tt.expand_dims %left_idx_265 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc170) + %left_idx_267 = tt.broadcast %left_idx_266 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc171) + %right_idx_268 = arith.muli %y_idx_263, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc173) + %right_idx_269 = "tt.reduce"(%right_idx_268) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_270 = tt.expand_dims %right_idx_269 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc175) + %right_idx_271 = tt.broadcast %right_idx_270 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc176) + %left_idx_272 = tt.reshape %left_idx_267 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_273 = tt.reshape %right_idx_271 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_274 = arith.cmpi slt, %ileft_261, %iright_262 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_275 = arith.cmpi eq, %ileft_261, %iright_262 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_276 = arith.cmpi sgt, %left_idx_272, %right_idx_273 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_277 = arith.andi %eq_275, %cond_276 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_278 = arith.ori %cond_274, %cond_277 : tensor<128x16xi1, #linear> loc(#loc183) + %cond_279 = arith.extui %cond_278 : tensor<128x16xi1, #linear> to tensor<128x16xi32, #linear> loc(#loc184) + %cond_280 = arith.xori %cond_279, %flip_178 : tensor<128x16xi32, #linear> loc(#loc184) + %cond_281 = arith.cmpi ne, %cond_280, %cst : tensor<128x16xi32, #linear> loc(#loc185) + %ret_282 = arith.xori %ileft_261, %iright_262 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_283 = arith.select %cond_281, %ret_282, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_284 = arith.xori %ret_248, %ret_283 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_285 = arith.xori %left_idx_272, %right_idx_273 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_287 = arith.xori %new_idxs_251, %new_idxs_286 : tensor<128x16xi32, #linear> loc(#loc191) + %y_288 = tt.reshape %ret_284 : tensor<128x16xi32, #linear> -> tensor<128x2x8xi32, #linear4> loc(#loc154) + %ileft_289 = tt.broadcast %left_mask_53 : tensor<1x2x1xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc156) + %ileft_290 = arith.muli %y_288, %ileft_289 : tensor<128x2x8xi32, #linear4> loc(#loc156) + %ileft_291 = "tt.reduce"(%ileft_290) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc193) + %ileft_292 = tt.expand_dims %ileft_291 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc158) + %ileft_293 = tt.broadcast %ileft_292 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc159) + %iright_294 = arith.muli %y_288, %flip_177 : tensor<128x2x8xi32, #linear4> loc(#loc160) + %iright_295 = "tt.reduce"(%iright_294) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc195) + %iright_296 = tt.expand_dims %iright_295 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc162) + %iright_297 = tt.broadcast %iright_296 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc163) + %ileft_298 = tt.reshape %ileft_293 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_299 = tt.reshape %iright_297 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_300 = tt.reshape %new_idxs_287 : tensor<128x16xi32, #linear> -> tensor<128x2x8xi32, #linear4> loc(#loc166) + %left_idx_301 = arith.muli %y_idx_300, %ileft_289 : tensor<128x2x8xi32, #linear4> loc(#loc168) + %left_idx_302 = "tt.reduce"(%left_idx_301) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc198) + %left_idx_303 = tt.expand_dims %left_idx_302 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc170) + %left_idx_304 = tt.broadcast %left_idx_303 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc171) + %right_idx_305 = arith.muli %y_idx_300, %flip_177 : tensor<128x2x8xi32, #linear4> loc(#loc173) + %right_idx_306 = "tt.reduce"(%right_idx_305) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<128x2x8xi32, #linear4>) -> tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc201) + %right_idx_307 = tt.expand_dims %right_idx_306 {axis = 1 : i32} : tensor<128x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<128x1x8xi32, #linear4> loc(#loc175) + %right_idx_308 = tt.broadcast %right_idx_307 : tensor<128x1x8xi32, #linear4> -> tensor<128x2x8xi32, #linear4> loc(#loc176) + %left_idx_309 = tt.reshape %left_idx_304 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_310 = tt.reshape %right_idx_308 : tensor<128x2x8xi32, #linear4> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_311 = arith.cmpi slt, %ileft_298, %iright_299 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_312 = arith.cmpi eq, %ileft_298, %iright_299 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_313 = arith.cmpi sgt, %left_idx_309, %right_idx_310 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_314 = arith.andi %eq_312, %cond_313 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_315 = arith.ori %cond_311, %cond_314 : tensor<128x16xi1, #linear> loc(#loc183) + %ret_316 = arith.xori %ileft_298, %iright_299 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_317 = arith.select %cond_315, %ret_316, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_318 = arith.xori %ret_284, %ret_317 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_319 = arith.xori %left_idx_309, %right_idx_310 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_320 = arith.select %cond_315, %new_idxs_319, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_321 = arith.xori %new_idxs_287, %new_idxs_320 : tensor<128x16xi32, #linear> loc(#loc191) + %y_322 = tt.reshape %ret_318 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc154) + %ileft_323 = arith.muli %y_322, %ileft_180 : tensor<256x2x4xi32, #linear3> loc(#loc156) + %ileft_324 = "tt.reduce"(%ileft_323) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc193) + %ileft_325 = tt.expand_dims %ileft_324 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc158) + %ileft_326 = tt.broadcast %ileft_325 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc159) + %iright_327 = arith.muli %y_322, %flip_102 : tensor<256x2x4xi32, #linear3> loc(#loc160) + %iright_328 = "tt.reduce"(%iright_327) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc195) + %iright_329 = tt.expand_dims %iright_328 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc162) + %iright_330 = tt.broadcast %iright_329 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc163) + %ileft_331 = tt.reshape %ileft_326 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_332 = tt.reshape %iright_330 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_333 = tt.reshape %new_idxs_321 : tensor<128x16xi32, #linear> -> tensor<256x2x4xi32, #linear3> loc(#loc166) + %left_idx_334 = arith.muli %y_idx_333, %ileft_180 : tensor<256x2x4xi32, #linear3> loc(#loc168) + %left_idx_335 = "tt.reduce"(%left_idx_334) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc198) + %left_idx_336 = tt.expand_dims %left_idx_335 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc170) + %left_idx_337 = tt.broadcast %left_idx_336 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc171) + %right_idx_338 = arith.muli %y_idx_333, %flip_102 : tensor<256x2x4xi32, #linear3> loc(#loc173) + %right_idx_339 = "tt.reduce"(%right_idx_338) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<256x2x4xi32, #linear3>) -> tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc201) + %right_idx_340 = tt.expand_dims %right_idx_339 {axis = 1 : i32} : tensor<256x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<256x1x4xi32, #linear3> loc(#loc175) + %right_idx_341 = tt.broadcast %right_idx_340 : tensor<256x1x4xi32, #linear3> -> tensor<256x2x4xi32, #linear3> loc(#loc176) + %left_idx_342 = tt.reshape %left_idx_337 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_343 = tt.reshape %right_idx_341 : tensor<256x2x4xi32, #linear3> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_344 = arith.cmpi slt, %ileft_331, %iright_332 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_345 = arith.cmpi eq, %ileft_331, %iright_332 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_346 = arith.cmpi sgt, %left_idx_342, %right_idx_343 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_347 = arith.andi %eq_345, %cond_346 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_348 = arith.ori %cond_344, %cond_347 : tensor<128x16xi1, #linear> loc(#loc183) + %ret_349 = arith.xori %ileft_331, %iright_332 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_350 = arith.select %cond_348, %ret_349, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_351 = arith.xori %ret_318, %ret_350 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_352 = arith.xori %left_idx_342, %right_idx_343 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_353 = arith.select %cond_348, %new_idxs_352, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_354 = arith.xori %new_idxs_321, %new_idxs_353 : tensor<128x16xi32, #linear> loc(#loc191) + %y_355 = tt.reshape %ret_351 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc154) + %ileft_356 = arith.muli %y_355, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc156) + %ileft_357 = "tt.reduce"(%ileft_356) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc193) + %ileft_358 = tt.expand_dims %ileft_357 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc158) + %ileft_359 = tt.broadcast %ileft_358 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc159) + %iright_360 = arith.muli %y_355, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc160) + %iright_361 = "tt.reduce"(%iright_360) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc195) + %iright_362 = tt.expand_dims %iright_361 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc162) + %iright_363 = tt.broadcast %iright_362 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc163) + %ileft_364 = tt.reshape %ileft_359 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_365 = tt.reshape %iright_363 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_366 = tt.reshape %new_idxs_354 : tensor<128x16xi32, #linear> -> tensor<512x2x2xi32, #linear2> loc(#loc166) + %left_idx_367 = arith.muli %y_idx_366, %ileft_105 : tensor<512x2x2xi32, #linear2> loc(#loc168) + %left_idx_368 = "tt.reduce"(%left_idx_367) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc198) + %left_idx_369 = tt.expand_dims %left_idx_368 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc170) + %left_idx_370 = tt.broadcast %left_idx_369 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc171) + %right_idx_371 = arith.muli %y_idx_366, %flip_48 : tensor<512x2x2xi32, #linear2> loc(#loc173) + %right_idx_372 = "tt.reduce"(%right_idx_371) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32, #linear2>) -> tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc201) + %right_idx_373 = tt.expand_dims %right_idx_372 {axis = 1 : i32} : tensor<512x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<512x1x2xi32, #linear2> loc(#loc175) + %right_idx_374 = tt.broadcast %right_idx_373 : tensor<512x1x2xi32, #linear2> -> tensor<512x2x2xi32, #linear2> loc(#loc176) + %left_idx_375 = tt.reshape %left_idx_370 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_376 = tt.reshape %right_idx_374 : tensor<512x2x2xi32, #linear2> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_377 = arith.cmpi slt, %ileft_364, %iright_365 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_378 = arith.cmpi eq, %ileft_364, %iright_365 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_379 = arith.cmpi sgt, %left_idx_375, %right_idx_376 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_380 = arith.andi %eq_378, %cond_379 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_381 = arith.ori %cond_377, %cond_380 : tensor<128x16xi1, #linear> loc(#loc183) + %ret_382 = arith.xori %ileft_364, %iright_365 : tensor<128x16xi32, #linear> loc(#loc186) + %ret_383 = arith.select %cond_381, %ret_382, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc187) + %ret_384 = arith.xori %ret_351, %ret_383 : tensor<128x16xi32, #linear> loc(#loc188) + %new_idxs_385 = arith.xori %left_idx_375, %right_idx_376 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_386 = arith.select %cond_381, %new_idxs_385, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_387 = arith.xori %new_idxs_354, %new_idxs_386 : tensor<128x16xi32, #linear> loc(#loc191) + %y_388 = tt.reshape %ret_384 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc154) + %ileft_389 = arith.muli %y_388, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc156) + %ileft_390 = "tt.reduce"(%ileft_389) <{axis = 1 : i32}> ({ + ^bb0(%ileft_419: i32 loc(callsite(#loc1 at #loc157)), %ileft_420: i32 loc(callsite(#loc1 at #loc157))): + %ileft_421 = arith.addi %ileft_419, %ileft_420 : i32 loc(#loc203) + tt.reduce.return %ileft_421 : i32 loc(#loc193) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc193) + %ileft_391 = tt.expand_dims %ileft_390 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc158) + %ileft_392 = tt.broadcast %ileft_391 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc159) + %iright_393 = arith.muli %y_388, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc160) + %iright_394 = "tt.reduce"(%iright_393) <{axis = 1 : i32}> ({ + ^bb0(%iright_419: i32 loc(callsite(#loc1 at #loc161)), %iright_420: i32 loc(callsite(#loc1 at #loc161))): + %iright_421 = arith.addi %iright_419, %iright_420 : i32 loc(#loc204) + tt.reduce.return %iright_421 : i32 loc(#loc195) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc195) + %iright_395 = tt.expand_dims %iright_394 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc162) + %iright_396 = tt.broadcast %iright_395 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc163) + %ileft_397 = tt.reshape %ileft_392 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc164) + %iright_398 = tt.reshape %iright_396 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc165) + %y_idx_399 = tt.reshape %new_idxs_387 : tensor<128x16xi32, #linear> -> tensor<1024x2x1xi32, #linear1> loc(#loc166) + %left_idx_400 = arith.muli %y_idx_399, %ileft : tensor<1024x2x1xi32, #linear1> loc(#loc168) + %left_idx_401 = "tt.reduce"(%left_idx_400) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_419: i32 loc(callsite(#loc1 at #loc169)), %left_idx_420: i32 loc(callsite(#loc1 at #loc169))): + %left_idx_421 = arith.addi %left_idx_419, %left_idx_420 : i32 loc(#loc205) + tt.reduce.return %left_idx_421 : i32 loc(#loc198) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc198) + %left_idx_402 = tt.expand_dims %left_idx_401 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc170) + %left_idx_403 = tt.broadcast %left_idx_402 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc171) + %right_idx_404 = arith.muli %y_idx_399, %iright : tensor<1024x2x1xi32, #linear1> loc(#loc173) + %right_idx_405 = "tt.reduce"(%right_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_419: i32 loc(callsite(#loc1 at #loc174)), %right_idx_420: i32 loc(callsite(#loc1 at #loc174))): + %right_idx_421 = arith.addi %right_idx_419, %right_idx_420 : i32 loc(#loc206) + tt.reduce.return %right_idx_421 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32, #linear1>) -> tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc201) + %right_idx_406 = tt.expand_dims %right_idx_405 {axis = 1 : i32} : tensor<1024x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<1024x1x1xi32, #linear1> loc(#loc175) + %right_idx_407 = tt.broadcast %right_idx_406 : tensor<1024x1x1xi32, #linear1> -> tensor<1024x2x1xi32, #linear1> loc(#loc176) + %left_idx_408 = tt.reshape %left_idx_403 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc177) + %right_idx_409 = tt.reshape %right_idx_407 : tensor<1024x2x1xi32, #linear1> -> tensor<128x16xi32, #linear> loc(#loc178) + %cond_410 = arith.cmpi slt, %ileft_397, %iright_398 : tensor<128x16xi32, #linear> loc(#loc179) + %eq_411 = arith.cmpi eq, %ileft_397, %iright_398 : tensor<128x16xi32, #linear> loc(#loc180) + %cond_412 = arith.cmpi sgt, %left_idx_408, %right_idx_409 : tensor<128x16xi32, #linear> loc(#loc181) + %cond_413 = arith.andi %eq_411, %cond_412 : tensor<128x16xi1, #linear> loc(#loc182) + %cond_414 = arith.ori %cond_410, %cond_413 : tensor<128x16xi1, #linear> loc(#loc183) + %new_idxs_415 = arith.xori %left_idx_408, %right_idx_409 : tensor<128x16xi32, #linear> loc(#loc189) + %new_idxs_416 = arith.select %cond_414, %new_idxs_415, %cst : tensor<128x16xi1, #linear>, tensor<128x16xi32, #linear> loc(#loc190) + %new_idxs_417 = arith.xori %new_idxs_387, %new_idxs_416 : tensor<128x16xi32, #linear> loc(#loc191) + %tmp7 = arith.extsi %tmp0_36 : tensor<128x16xi32, #blocked> to tensor<128x16xi64, #blocked> loc(#loc141) + %tmp10 = arith.select %tmp0_34, %tmp7, %cst_0 : tensor<128x16xi1, #blocked>, tensor<128x16xi64, #blocked> loc(#loc142) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_419: i64 loc(callsite(#loc1 at #loc143)), %tmp11_420: i64 loc(callsite(#loc1 at #loc143))): + %tmp11_421 = arith.addi %tmp11_419, %tmp11_420 : i64 loc(#loc192) + tt.reduce.return %tmp11_421 : i64 loc(#loc152) + }) : (tensor<128x16xi64, #blocked>) -> tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc152) + %tmp11_418 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi64, #blocked> loc(#loc144) + %tmp14 = arith.trunci %tmp11_418 : tensor<128x1xi64, #blocked> to tensor<128x1xi32, #blocked> loc(#loc145) + %0 = arith.muli %xindex_19, %cst_4 : tensor<128x1xi32, #blocked1> loc(#loc70) + %1 = tt.broadcast %r0_index_25 : tensor<1x16xi32, #blocked1> -> tensor<128x16xi32, #blocked1> loc(#loc71) + %2 = tt.broadcast %0 : tensor<128x1xi32, #blocked1> -> tensor<128x16xi32, #blocked1> loc(#loc71) + %3 = arith.addi %1, %2 : tensor<128x16xi32, #blocked1> loc(#loc71) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr, #blocked1> loc(#loc72) + %5 = tt.addptr %4, %3 : tensor<128x16x!tt.ptr, #blocked1>, tensor<128x16xi32, #blocked1> loc(#loc72) + %6 = ttg.convert_layout %new_idxs_417 : tensor<128x16xi32, #linear> -> tensor<128x16xi32, #blocked1> loc(#loc73) + tt.store %5, %6, %tmp0_35 : tensor<128x16x!tt.ptr, #blocked1> loc(#loc73) + %7 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc74) + %8 = tt.addptr %7, %xindex_18 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc74) + tt.store %8, %tmp14, %xmask : tensor<128x1x!tt.ptr, #blocked> loc(#loc75) + tt.return loc(#loc76) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":40:33) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc82 = loc("xoffset"(#loc2)) +#loc83 = loc("xoffset"(#loc3)) +#loc84 = loc("xindex"(#loc4)) +#loc85 = loc("xindex"(#loc5)) +#loc86 = loc("xmask"(#loc6)) +#loc87 = loc("r0_index"(#loc7)) +#loc88 = loc("x0"(#loc8)) +#loc89 = loc("x1"(#loc9)) +#loc90 = loc("tmp0"(#loc10)) +#loc91 = loc("tmp0"(#loc11)) +#loc92 = loc("tmp0"(#loc12)) +#loc93 = loc("tmp0"(#loc13)) +#loc94 = loc("tmp0"(#loc14)) +#loc95 = loc("tmp0"(#loc15)) +#loc96 = loc("tmp2"(#loc16)) +#loc97 = loc("tmp4"(#loc17)) +#loc98 = loc("flip"(#loc18)) +#loc100 = loc("flip"(#loc21)) +#loc101 = loc("flip"(#loc22)) +#loc102 = loc("y"(#loc23)) +#loc103 = loc("left_mask"(#loc25)) +#loc104 = loc("ileft"(#loc26)) +#loc106 = loc("ileft"(#loc30)) +#loc107 = loc("ileft"(#loc31)) +#loc108 = loc("iright"(#loc32)) +#loc110 = loc("iright"(#loc34)) +#loc111 = loc("iright"(#loc35)) +#loc112 = loc("ileft"(#loc36)) +#loc113 = loc("iright"(#loc37)) +#loc114 = loc("y_idx"(#loc38)) +#loc115 = loc("left_idx"(#loc39)) +#loc116 = loc("left_idx"(#loc40)) +#loc117 = loc("input"(#loc41)) +#loc119 = loc("left_idx"(#loc43)) +#loc120 = loc("left_idx"(#loc44)) +#loc121 = loc("right_idx"(#loc45)) +#loc122 = loc("right_idx"(#loc46)) +#loc124 = loc("right_idx"(#loc48)) +#loc125 = loc("right_idx"(#loc49)) +#loc126 = loc("left_idx"(#loc50)) +#loc127 = loc("right_idx"(#loc51)) +#loc128 = loc("cond"(#loc52)) +#loc129 = loc("eq"(#loc53)) +#loc130 = loc("cond"(#loc54)) +#loc131 = loc("cond"(#loc55)) +#loc132 = loc("cond"(#loc56)) +#loc133 = loc("cond"(#loc57)) +#loc134 = loc("cond"(#loc58)) +#loc135 = loc("ret"(#loc59)) +#loc136 = loc("ret"(#loc60)) +#loc137 = loc("ret"(#loc61)) +#loc138 = loc("new_idxs"(#loc62)) +#loc139 = loc("new_idxs"(#loc63)) +#loc140 = loc("new_idxs"(#loc64)) +#loc141 = loc("tmp7"(#loc65)) +#loc142 = loc("tmp10"(#loc66)) +#loc144 = loc("tmp11"(#loc68)) +#loc145 = loc("tmp14"(#loc69)) +#loc146 = loc(callsite(#loc98 at #loc99)) +#loc147 = loc(callsite(#loc100 at #loc99)) +#loc148 = loc(callsite(#loc101 at #loc99)) +#loc150 = loc("cond"(#loc128)) +#loc151 = loc("eq"(#loc129)) +#loc152 = loc(callsite(#loc27 at #loc143)) +#loc154 = loc(callsite(#loc102 at #loc149)) +#loc155 = loc(callsite(#loc103 at #loc149)) +#loc156 = loc(callsite(#loc104 at #loc149)) +#loc158 = loc(callsite(#loc106 at #loc149)) +#loc159 = loc(callsite(#loc107 at #loc149)) +#loc160 = loc(callsite(#loc108 at #loc149)) +#loc162 = loc(callsite(#loc110 at #loc149)) +#loc163 = loc(callsite(#loc111 at #loc149)) +#loc164 = loc(callsite(#loc112 at #loc149)) +#loc165 = loc(callsite(#loc113 at #loc149)) +#loc166 = loc(callsite(#loc114 at #loc149)) +#loc167 = loc(callsite(#loc115 at #loc149)) +#loc168 = loc(callsite(#loc116 at #loc149)) +#loc170 = loc(callsite(#loc119 at #loc149)) +#loc171 = loc(callsite(#loc120 at #loc149)) +#loc172 = loc(callsite(#loc121 at #loc149)) +#loc173 = loc(callsite(#loc122 at #loc149)) +#loc175 = loc(callsite(#loc124 at #loc149)) +#loc176 = loc(callsite(#loc125 at #loc149)) +#loc177 = loc(callsite(#loc126 at #loc149)) +#loc178 = loc(callsite(#loc127 at #loc149)) +#loc179 = loc(callsite(#loc150 at #loc149)) +#loc180 = loc(callsite(#loc151 at #loc149)) +#loc181 = loc(callsite(#loc130 at #loc149)) +#loc182 = loc(callsite(#loc131 at #loc149)) +#loc183 = loc(callsite(#loc132 at #loc149)) +#loc184 = loc(callsite(#loc133 at #loc149)) +#loc185 = loc(callsite(#loc134 at #loc149)) +#loc186 = loc(callsite(#loc135 at #loc149)) +#loc187 = loc(callsite(#loc136 at #loc149)) +#loc188 = loc(callsite(#loc137 at #loc149)) +#loc189 = loc(callsite(#loc138 at #loc149)) +#loc190 = loc(callsite(#loc139 at #loc149)) +#loc191 = loc(callsite(#loc140 at #loc149)) +#loc192 = loc(callsite(#loc29 at #loc152)) +#loc193 = loc(callsite(#loc27 at #loc157)) +#loc195 = loc(callsite(#loc27 at #loc161)) +#loc197 = loc(callsite(#loc117 at #loc169)) +#loc198 = loc(callsite(#loc27 at #loc169)) +#loc200 = loc(callsite(#loc117 at #loc174)) +#loc201 = loc(callsite(#loc27 at #loc174)) +#loc203 = loc(callsite(#loc29 at #loc193)) +#loc204 = loc(callsite(#loc29 at #loc195)) +#loc205 = loc(callsite(#loc29 at #loc198)) +#loc206 = loc(callsite(#loc29 at #loc201)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5ed652b50daef36fff34ce0cd41ad971e461103f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/WZA6QJES22AE2XEC3NBB6EXQW5G73PZ7SQ7M6XDRMYIXZ6OVS6QQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,799 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":41:67) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:26) +#loc80 = loc("in_ptr0"(#loc)) +#loc81 = loc("out_ptr2"(#loc)) +#loc82 = loc("out_ptr3"(#loc)) +#loc83 = loc("xnumel"(#loc)) +#loc84 = loc("r0_numel"(#loc)) +#loc106 = loc(callsite(#loc23 at #loc2)) +#loc113 = loc("ileft"(#loc32)) +#loc117 = loc("iright"(#loc37)) +#loc126 = loc("left_idx"(#loc46)) +#loc131 = loc("right_idx"(#loc51)) +#loc150 = loc("tmp11"(#loc70)) +#loc157 = loc(callsite(#loc28 at #loc106)) +#loc161 = loc(callsite(#loc1 at #loc150)) +#loc165 = loc(callsite(#loc113 at #loc157)) +#loc169 = loc(callsite(#loc117 at #loc157)) +#loc177 = loc(callsite(#loc126 at #loc157)) +#loc182 = loc(callsite(#loc131 at #loc157)) +#loc202 = loc(callsite(#loc1 at #loc165)) +#loc204 = loc(callsite(#loc1 at #loc169)) +#loc207 = loc(callsite(#loc1 at #loc177)) +#loc210 = loc(callsite(#loc1 at #loc182)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc85) + %cst_0 = arith.constant dense<0> : tensor<128x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<128x16xi64> loc(#loc86) + %tmp0 = arith.constant dense<272> : tensor<128x1xi32> loc(#loc87) + %tmp0_1 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc88) + %cst_2 = arith.constant dense<16> : tensor<128x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<128x1xi32> loc(#loc89) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc90) + %xoffset_3 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc91) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc92) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc93) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<128x1xi32> loc(#loc94) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<128x1xi32> loc(#loc94) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<128x1xi32> loc(#loc89) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc95) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc96) + %x0 = arith.remsi %xindex_6, %cst_2 : tensor<128x1xi32> loc(#loc97) + %x1 = arith.divsi %xindex_6, %cst_2 : tensor<128x1xi32> loc(#loc98) + %tmp0_9 = arith.muli %r0_index_8, %tmp0_1 : tensor<1x16xi32> loc(#loc88) + %tmp0_10 = tt.broadcast %x0 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc99) + %tmp0_11 = tt.broadcast %tmp0_9 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc99) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<128x16xi32> loc(#loc99) + %tmp0_13 = arith.muli %x1, %tmp0 : tensor<128x1xi32> loc(#loc87) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc100) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<128x16xi32> loc(#loc100) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc101) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc101) + %tmp0_18 = tt.broadcast %xmask_7 : tensor<128x1xi1> -> tensor<128x16xi1> loc(#loc102) + %tmp0_19 = tt.load %tmp0_17, %tmp0_18, %cst_0 : tensor<128x16x!tt.ptr> loc(#loc102) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc103) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<128x16xi16> loc(#loc104) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc153) + %flip_20 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc154) + %flip_21 = tt.expand_dims %flip_20 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc154) + %flip_22 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc155) + %flip_23 = tt.reshape %flip_22 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc156) + %y = tt.reshape %tmp0_19 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc162) + %left_mask = arith.subi %cst, %flip_21 : tensor<1x2x1xi32> loc(#loc163) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc164) + %ileft_24 = arith.muli %y, %ileft : tensor<1024x2x1xi32> loc(#loc164) + %ileft_25 = "tt.reduce"(%ileft_24) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc201) + %ileft_26 = tt.expand_dims %ileft_25 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc166) + %ileft_27 = tt.broadcast %ileft_26 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc167) + %iright = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<1024x2x1xi32> loc(#loc168) + %iright_28 = arith.muli %y, %iright : tensor<1024x2x1xi32> loc(#loc168) + %iright_29 = "tt.reduce"(%iright_28) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc203) + %iright_30 = tt.expand_dims %iright_29 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc170) + %iright_31 = tt.broadcast %iright_30 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc171) + %ileft_32 = tt.reshape %ileft_27 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_33 = tt.reshape %iright_31 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx = tt.reshape %tmp4 : tensor<128x16xi16> -> tensor<1024x2x1xi16> loc(#loc174) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc175) + %left_idx_34 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc176) + %left_idx_35 = arith.muli %y_idx, %left_idx_34 : tensor<1024x2x1xi16> loc(#loc176) + %input = arith.extsi %left_idx_35 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc205) + %left_idx_36 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc206) + %left_idx_37 = tt.expand_dims %left_idx_36 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc178) + %left_idx_38 = tt.broadcast %left_idx_37 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc179) + %right_idx = arith.trunci %flip_21 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc180) + %right_idx_39 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<1024x2x1xi16> loc(#loc181) + %right_idx_40 = arith.muli %y_idx, %right_idx_39 : tensor<1024x2x1xi16> loc(#loc181) + %input_41 = arith.extsi %right_idx_40 : tensor<1024x2x1xi16> to tensor<1024x2x1xi32> loc(#loc208) + %right_idx_42 = "tt.reduce"(%input_41) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc209) + %right_idx_43 = tt.expand_dims %right_idx_42 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc183) + %right_idx_44 = tt.broadcast %right_idx_43 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc184) + %left_idx_45 = tt.reshape %left_idx_38 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_46 = tt.reshape %right_idx_44 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc186) + %cond = arith.cmpi slt, %ileft_32, %iright_33 : tensor<128x16xi32> loc(#loc187) + %eq = arith.cmpi eq, %ileft_32, %iright_33 : tensor<128x16xi32> loc(#loc188) + %cond_47 = arith.cmpi sgt, %left_idx_45, %right_idx_46 : tensor<128x16xi32> loc(#loc189) + %cond_48 = arith.andi %eq, %cond_47 : tensor<128x16xi1> loc(#loc190) + %cond_49 = arith.ori %cond, %cond_48 : tensor<128x16xi1> loc(#loc191) + %cond_50 = arith.extui %cond_49 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_51 = arith.xori %cond_50, %flip_23 : tensor<128x16xi32> loc(#loc192) + %cond_52 = arith.cmpi ne, %cond_51, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret = arith.xori %ileft_32, %iright_33 : tensor<128x16xi32> loc(#loc194) + %ret_53 = arith.select %cond_52, %ret, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_54 = arith.xori %tmp0_19, %ret_53 : tensor<128x16xi32> loc(#loc196) + %new_idxs = arith.xori %left_idx_45, %right_idx_46 : tensor<128x16xi32> loc(#loc197) + %new_idxs_55 = arith.select %cond_52, %new_idxs, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_56 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc199) + %new_idxs_57 = tt.broadcast %new_idxs_56 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc199) + %new_idxs_58 = arith.xori %new_idxs_57, %new_idxs_55 : tensor<128x16xi32> loc(#loc199) + %flip_59 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc155) + %flip_60 = tt.reshape %flip_59 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc156) + %y_61 = tt.reshape %ret_54 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc162) + %ileft_62 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<512x2x2xi32> loc(#loc164) + %ileft_63 = arith.muli %y_61, %ileft_62 : tensor<512x2x2xi32> loc(#loc164) + %ileft_64 = "tt.reduce"(%ileft_63) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc201) + %ileft_65 = tt.expand_dims %ileft_64 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc166) + %ileft_66 = tt.broadcast %ileft_65 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc167) + %iright_67 = arith.muli %y_61, %flip_22 : tensor<512x2x2xi32> loc(#loc168) + %iright_68 = "tt.reduce"(%iright_67) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc203) + %iright_69 = tt.expand_dims %iright_68 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc170) + %iright_70 = tt.broadcast %iright_69 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc171) + %ileft_71 = tt.reshape %ileft_66 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_72 = tt.reshape %iright_70 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_73 = tt.reshape %new_idxs_58 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc174) + %left_idx_74 = arith.muli %y_idx_73, %ileft_62 : tensor<512x2x2xi32> loc(#loc176) + %left_idx_75 = "tt.reduce"(%left_idx_74) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc206) + %left_idx_76 = tt.expand_dims %left_idx_75 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc178) + %left_idx_77 = tt.broadcast %left_idx_76 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc179) + %right_idx_78 = arith.muli %y_idx_73, %flip_22 : tensor<512x2x2xi32> loc(#loc181) + %right_idx_79 = "tt.reduce"(%right_idx_78) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc209) + %right_idx_80 = tt.expand_dims %right_idx_79 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc183) + %right_idx_81 = tt.broadcast %right_idx_80 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc184) + %left_idx_82 = tt.reshape %left_idx_77 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_83 = tt.reshape %right_idx_81 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_84 = arith.cmpi slt, %ileft_71, %iright_72 : tensor<128x16xi32> loc(#loc187) + %eq_85 = arith.cmpi eq, %ileft_71, %iright_72 : tensor<128x16xi32> loc(#loc188) + %cond_86 = arith.cmpi sgt, %left_idx_82, %right_idx_83 : tensor<128x16xi32> loc(#loc189) + %cond_87 = arith.andi %eq_85, %cond_86 : tensor<128x16xi1> loc(#loc190) + %cond_88 = arith.ori %cond_84, %cond_87 : tensor<128x16xi1> loc(#loc191) + %cond_89 = arith.extui %cond_88 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_90 = arith.xori %cond_89, %flip_60 : tensor<128x16xi32> loc(#loc192) + %cond_91 = arith.cmpi ne, %cond_90, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret_92 = arith.xori %ileft_71, %iright_72 : tensor<128x16xi32> loc(#loc194) + %ret_93 = arith.select %cond_91, %ret_92, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_94 = arith.xori %ret_54, %ret_93 : tensor<128x16xi32> loc(#loc196) + %new_idxs_95 = arith.xori %left_idx_82, %right_idx_83 : tensor<128x16xi32> loc(#loc197) + %new_idxs_96 = arith.select %cond_91, %new_idxs_95, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_97 = arith.xori %new_idxs_58, %new_idxs_96 : tensor<128x16xi32> loc(#loc199) + %y_98 = tt.reshape %ret_94 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc162) + %ileft_99 = arith.muli %y_98, %ileft : tensor<1024x2x1xi32> loc(#loc164) + %ileft_100 = "tt.reduce"(%ileft_99) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc201) + %ileft_101 = tt.expand_dims %ileft_100 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc166) + %ileft_102 = tt.broadcast %ileft_101 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc167) + %iright_103 = arith.muli %y_98, %iright : tensor<1024x2x1xi32> loc(#loc168) + %iright_104 = "tt.reduce"(%iright_103) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc203) + %iright_105 = tt.expand_dims %iright_104 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc170) + %iright_106 = tt.broadcast %iright_105 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc171) + %ileft_107 = tt.reshape %ileft_102 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_108 = tt.reshape %iright_106 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_109 = tt.reshape %new_idxs_97 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc174) + %left_idx_110 = arith.muli %y_idx_109, %ileft : tensor<1024x2x1xi32> loc(#loc176) + %left_idx_111 = "tt.reduce"(%left_idx_110) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc206) + %left_idx_112 = tt.expand_dims %left_idx_111 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc178) + %left_idx_113 = tt.broadcast %left_idx_112 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc179) + %right_idx_114 = arith.muli %y_idx_109, %iright : tensor<1024x2x1xi32> loc(#loc181) + %right_idx_115 = "tt.reduce"(%right_idx_114) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc209) + %right_idx_116 = tt.expand_dims %right_idx_115 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc183) + %right_idx_117 = tt.broadcast %right_idx_116 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc184) + %left_idx_118 = tt.reshape %left_idx_113 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_119 = tt.reshape %right_idx_117 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_120 = arith.cmpi slt, %ileft_107, %iright_108 : tensor<128x16xi32> loc(#loc187) + %eq_121 = arith.cmpi eq, %ileft_107, %iright_108 : tensor<128x16xi32> loc(#loc188) + %cond_122 = arith.cmpi sgt, %left_idx_118, %right_idx_119 : tensor<128x16xi32> loc(#loc189) + %cond_123 = arith.andi %eq_121, %cond_122 : tensor<128x16xi1> loc(#loc190) + %cond_124 = arith.ori %cond_120, %cond_123 : tensor<128x16xi1> loc(#loc191) + %cond_125 = arith.extui %cond_124 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_126 = arith.xori %cond_125, %flip_60 : tensor<128x16xi32> loc(#loc192) + %cond_127 = arith.cmpi ne, %cond_126, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret_128 = arith.xori %ileft_107, %iright_108 : tensor<128x16xi32> loc(#loc194) + %ret_129 = arith.select %cond_127, %ret_128, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_130 = arith.xori %ret_94, %ret_129 : tensor<128x16xi32> loc(#loc196) + %new_idxs_131 = arith.xori %left_idx_118, %right_idx_119 : tensor<128x16xi32> loc(#loc197) + %new_idxs_132 = arith.select %cond_127, %new_idxs_131, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_133 = arith.xori %new_idxs_97, %new_idxs_132 : tensor<128x16xi32> loc(#loc199) + %flip_134 = tt.broadcast %flip_21 : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc155) + %flip_135 = tt.reshape %flip_134 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc156) + %y_136 = tt.reshape %ret_130 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc162) + %ileft_137 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x4xi32> loc(#loc164) + %ileft_138 = arith.muli %y_136, %ileft_137 : tensor<256x2x4xi32> loc(#loc164) + %ileft_139 = "tt.reduce"(%ileft_138) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc201) + %ileft_140 = tt.expand_dims %ileft_139 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc166) + %ileft_141 = tt.broadcast %ileft_140 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc167) + %iright_142 = arith.muli %y_136, %flip_59 : tensor<256x2x4xi32> loc(#loc168) + %iright_143 = "tt.reduce"(%iright_142) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc203) + %iright_144 = tt.expand_dims %iright_143 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc170) + %iright_145 = tt.broadcast %iright_144 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc171) + %ileft_146 = tt.reshape %ileft_141 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_147 = tt.reshape %iright_145 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_148 = tt.reshape %new_idxs_133 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc174) + %left_idx_149 = arith.muli %y_idx_148, %ileft_137 : tensor<256x2x4xi32> loc(#loc176) + %left_idx_150 = "tt.reduce"(%left_idx_149) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc206) + %left_idx_151 = tt.expand_dims %left_idx_150 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc178) + %left_idx_152 = tt.broadcast %left_idx_151 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc179) + %right_idx_153 = arith.muli %y_idx_148, %flip_59 : tensor<256x2x4xi32> loc(#loc181) + %right_idx_154 = "tt.reduce"(%right_idx_153) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc209) + %right_idx_155 = tt.expand_dims %right_idx_154 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc183) + %right_idx_156 = tt.broadcast %right_idx_155 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc184) + %left_idx_157 = tt.reshape %left_idx_152 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_158 = tt.reshape %right_idx_156 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_159 = arith.cmpi slt, %ileft_146, %iright_147 : tensor<128x16xi32> loc(#loc187) + %eq_160 = arith.cmpi eq, %ileft_146, %iright_147 : tensor<128x16xi32> loc(#loc188) + %cond_161 = arith.cmpi sgt, %left_idx_157, %right_idx_158 : tensor<128x16xi32> loc(#loc189) + %cond_162 = arith.andi %eq_160, %cond_161 : tensor<128x16xi1> loc(#loc190) + %cond_163 = arith.ori %cond_159, %cond_162 : tensor<128x16xi1> loc(#loc191) + %cond_164 = arith.extui %cond_163 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_165 = arith.xori %cond_164, %flip_135 : tensor<128x16xi32> loc(#loc192) + %cond_166 = arith.cmpi ne, %cond_165, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret_167 = arith.xori %ileft_146, %iright_147 : tensor<128x16xi32> loc(#loc194) + %ret_168 = arith.select %cond_166, %ret_167, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_169 = arith.xori %ret_130, %ret_168 : tensor<128x16xi32> loc(#loc196) + %new_idxs_170 = arith.xori %left_idx_157, %right_idx_158 : tensor<128x16xi32> loc(#loc197) + %new_idxs_171 = arith.select %cond_166, %new_idxs_170, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_172 = arith.xori %new_idxs_133, %new_idxs_171 : tensor<128x16xi32> loc(#loc199) + %y_173 = tt.reshape %ret_169 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc162) + %ileft_174 = arith.muli %y_173, %ileft_62 : tensor<512x2x2xi32> loc(#loc164) + %ileft_175 = "tt.reduce"(%ileft_174) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc201) + %ileft_176 = tt.expand_dims %ileft_175 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc166) + %ileft_177 = tt.broadcast %ileft_176 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc167) + %iright_178 = arith.muli %y_173, %flip_22 : tensor<512x2x2xi32> loc(#loc168) + %iright_179 = "tt.reduce"(%iright_178) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc203) + %iright_180 = tt.expand_dims %iright_179 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc170) + %iright_181 = tt.broadcast %iright_180 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc171) + %ileft_182 = tt.reshape %ileft_177 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_183 = tt.reshape %iright_181 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_184 = tt.reshape %new_idxs_172 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc174) + %left_idx_185 = arith.muli %y_idx_184, %ileft_62 : tensor<512x2x2xi32> loc(#loc176) + %left_idx_186 = "tt.reduce"(%left_idx_185) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc206) + %left_idx_187 = tt.expand_dims %left_idx_186 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc178) + %left_idx_188 = tt.broadcast %left_idx_187 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc179) + %right_idx_189 = arith.muli %y_idx_184, %flip_22 : tensor<512x2x2xi32> loc(#loc181) + %right_idx_190 = "tt.reduce"(%right_idx_189) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc209) + %right_idx_191 = tt.expand_dims %right_idx_190 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc183) + %right_idx_192 = tt.broadcast %right_idx_191 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc184) + %left_idx_193 = tt.reshape %left_idx_188 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_194 = tt.reshape %right_idx_192 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_195 = arith.cmpi slt, %ileft_182, %iright_183 : tensor<128x16xi32> loc(#loc187) + %eq_196 = arith.cmpi eq, %ileft_182, %iright_183 : tensor<128x16xi32> loc(#loc188) + %cond_197 = arith.cmpi sgt, %left_idx_193, %right_idx_194 : tensor<128x16xi32> loc(#loc189) + %cond_198 = arith.andi %eq_196, %cond_197 : tensor<128x16xi1> loc(#loc190) + %cond_199 = arith.ori %cond_195, %cond_198 : tensor<128x16xi1> loc(#loc191) + %cond_200 = arith.extui %cond_199 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_201 = arith.xori %cond_200, %flip_135 : tensor<128x16xi32> loc(#loc192) + %cond_202 = arith.cmpi ne, %cond_201, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret_203 = arith.xori %ileft_182, %iright_183 : tensor<128x16xi32> loc(#loc194) + %ret_204 = arith.select %cond_202, %ret_203, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_205 = arith.xori %ret_169, %ret_204 : tensor<128x16xi32> loc(#loc196) + %new_idxs_206 = arith.xori %left_idx_193, %right_idx_194 : tensor<128x16xi32> loc(#loc197) + %new_idxs_207 = arith.select %cond_202, %new_idxs_206, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_208 = arith.xori %new_idxs_172, %new_idxs_207 : tensor<128x16xi32> loc(#loc199) + %y_209 = tt.reshape %ret_205 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc162) + %ileft_210 = arith.muli %y_209, %ileft : tensor<1024x2x1xi32> loc(#loc164) + %ileft_211 = "tt.reduce"(%ileft_210) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc201) + %ileft_212 = tt.expand_dims %ileft_211 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc166) + %ileft_213 = tt.broadcast %ileft_212 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc167) + %iright_214 = arith.muli %y_209, %iright : tensor<1024x2x1xi32> loc(#loc168) + %iright_215 = "tt.reduce"(%iright_214) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc203) + %iright_216 = tt.expand_dims %iright_215 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc170) + %iright_217 = tt.broadcast %iright_216 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc171) + %ileft_218 = tt.reshape %ileft_213 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_219 = tt.reshape %iright_217 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_220 = tt.reshape %new_idxs_208 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc174) + %left_idx_221 = arith.muli %y_idx_220, %ileft : tensor<1024x2x1xi32> loc(#loc176) + %left_idx_222 = "tt.reduce"(%left_idx_221) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc206) + %left_idx_223 = tt.expand_dims %left_idx_222 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc178) + %left_idx_224 = tt.broadcast %left_idx_223 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc179) + %right_idx_225 = arith.muli %y_idx_220, %iright : tensor<1024x2x1xi32> loc(#loc181) + %right_idx_226 = "tt.reduce"(%right_idx_225) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc209) + %right_idx_227 = tt.expand_dims %right_idx_226 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc183) + %right_idx_228 = tt.broadcast %right_idx_227 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc184) + %left_idx_229 = tt.reshape %left_idx_224 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_230 = tt.reshape %right_idx_228 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_231 = arith.cmpi slt, %ileft_218, %iright_219 : tensor<128x16xi32> loc(#loc187) + %eq_232 = arith.cmpi eq, %ileft_218, %iright_219 : tensor<128x16xi32> loc(#loc188) + %cond_233 = arith.cmpi sgt, %left_idx_229, %right_idx_230 : tensor<128x16xi32> loc(#loc189) + %cond_234 = arith.andi %eq_232, %cond_233 : tensor<128x16xi1> loc(#loc190) + %cond_235 = arith.ori %cond_231, %cond_234 : tensor<128x16xi1> loc(#loc191) + %cond_236 = arith.extui %cond_235 : tensor<128x16xi1> to tensor<128x16xi32> loc(#loc192) + %cond_237 = arith.xori %cond_236, %flip_135 : tensor<128x16xi32> loc(#loc192) + %cond_238 = arith.cmpi ne, %cond_237, %cst_0 : tensor<128x16xi32> loc(#loc193) + %ret_239 = arith.xori %ileft_218, %iright_219 : tensor<128x16xi32> loc(#loc194) + %ret_240 = arith.select %cond_238, %ret_239, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_241 = arith.xori %ret_205, %ret_240 : tensor<128x16xi32> loc(#loc196) + %new_idxs_242 = arith.xori %left_idx_229, %right_idx_230 : tensor<128x16xi32> loc(#loc197) + %new_idxs_243 = arith.select %cond_238, %new_idxs_242, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_244 = arith.xori %new_idxs_208, %new_idxs_243 : tensor<128x16xi32> loc(#loc199) + %y_245 = tt.reshape %ret_241 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc162) + %ileft_246 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x8xi32> loc(#loc164) + %ileft_247 = arith.muli %y_245, %ileft_246 : tensor<128x2x8xi32> loc(#loc164) + %ileft_248 = "tt.reduce"(%ileft_247) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc201) + %ileft_249 = tt.expand_dims %ileft_248 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc166) + %ileft_250 = tt.broadcast %ileft_249 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc167) + %iright_251 = arith.muli %y_245, %flip_134 : tensor<128x2x8xi32> loc(#loc168) + %iright_252 = "tt.reduce"(%iright_251) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc203) + %iright_253 = tt.expand_dims %iright_252 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc170) + %iright_254 = tt.broadcast %iright_253 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc171) + %ileft_255 = tt.reshape %ileft_250 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_256 = tt.reshape %iright_254 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_257 = tt.reshape %new_idxs_244 : tensor<128x16xi32> -> tensor<128x2x8xi32> loc(#loc174) + %left_idx_258 = arith.muli %y_idx_257, %ileft_246 : tensor<128x2x8xi32> loc(#loc176) + %left_idx_259 = "tt.reduce"(%left_idx_258) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc206) + %left_idx_260 = tt.expand_dims %left_idx_259 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc178) + %left_idx_261 = tt.broadcast %left_idx_260 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc179) + %right_idx_262 = arith.muli %y_idx_257, %flip_134 : tensor<128x2x8xi32> loc(#loc181) + %right_idx_263 = "tt.reduce"(%right_idx_262) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<128x2x8xi32>) -> tensor<128x8xi32> loc(#loc209) + %right_idx_264 = tt.expand_dims %right_idx_263 {axis = 1 : i32} : tensor<128x8xi32> -> tensor<128x1x8xi32> loc(#loc183) + %right_idx_265 = tt.broadcast %right_idx_264 : tensor<128x1x8xi32> -> tensor<128x2x8xi32> loc(#loc184) + %left_idx_266 = tt.reshape %left_idx_261 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_267 = tt.reshape %right_idx_265 : tensor<128x2x8xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_268 = arith.cmpi slt, %ileft_255, %iright_256 : tensor<128x16xi32> loc(#loc187) + %eq_269 = arith.cmpi eq, %ileft_255, %iright_256 : tensor<128x16xi32> loc(#loc188) + %cond_270 = arith.cmpi sgt, %left_idx_266, %right_idx_267 : tensor<128x16xi32> loc(#loc189) + %cond_271 = arith.andi %eq_269, %cond_270 : tensor<128x16xi1> loc(#loc190) + %cond_272 = arith.ori %cond_268, %cond_271 : tensor<128x16xi1> loc(#loc191) + %ret_273 = arith.xori %ileft_255, %iright_256 : tensor<128x16xi32> loc(#loc194) + %ret_274 = arith.select %cond_272, %ret_273, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_275 = arith.xori %ret_241, %ret_274 : tensor<128x16xi32> loc(#loc196) + %new_idxs_276 = arith.xori %left_idx_266, %right_idx_267 : tensor<128x16xi32> loc(#loc197) + %new_idxs_277 = arith.select %cond_272, %new_idxs_276, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_278 = arith.xori %new_idxs_244, %new_idxs_277 : tensor<128x16xi32> loc(#loc199) + %y_279 = tt.reshape %ret_275 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc162) + %ileft_280 = arith.muli %y_279, %ileft_137 : tensor<256x2x4xi32> loc(#loc164) + %ileft_281 = "tt.reduce"(%ileft_280) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc201) + %ileft_282 = tt.expand_dims %ileft_281 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc166) + %ileft_283 = tt.broadcast %ileft_282 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc167) + %iright_284 = arith.muli %y_279, %flip_59 : tensor<256x2x4xi32> loc(#loc168) + %iright_285 = "tt.reduce"(%iright_284) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc203) + %iright_286 = tt.expand_dims %iright_285 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc170) + %iright_287 = tt.broadcast %iright_286 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc171) + %ileft_288 = tt.reshape %ileft_283 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_289 = tt.reshape %iright_287 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_290 = tt.reshape %new_idxs_278 : tensor<128x16xi32> -> tensor<256x2x4xi32> loc(#loc174) + %left_idx_291 = arith.muli %y_idx_290, %ileft_137 : tensor<256x2x4xi32> loc(#loc176) + %left_idx_292 = "tt.reduce"(%left_idx_291) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc206) + %left_idx_293 = tt.expand_dims %left_idx_292 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc178) + %left_idx_294 = tt.broadcast %left_idx_293 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc179) + %right_idx_295 = arith.muli %y_idx_290, %flip_59 : tensor<256x2x4xi32> loc(#loc181) + %right_idx_296 = "tt.reduce"(%right_idx_295) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<256x2x4xi32>) -> tensor<256x4xi32> loc(#loc209) + %right_idx_297 = tt.expand_dims %right_idx_296 {axis = 1 : i32} : tensor<256x4xi32> -> tensor<256x1x4xi32> loc(#loc183) + %right_idx_298 = tt.broadcast %right_idx_297 : tensor<256x1x4xi32> -> tensor<256x2x4xi32> loc(#loc184) + %left_idx_299 = tt.reshape %left_idx_294 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_300 = tt.reshape %right_idx_298 : tensor<256x2x4xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_301 = arith.cmpi slt, %ileft_288, %iright_289 : tensor<128x16xi32> loc(#loc187) + %eq_302 = arith.cmpi eq, %ileft_288, %iright_289 : tensor<128x16xi32> loc(#loc188) + %cond_303 = arith.cmpi sgt, %left_idx_299, %right_idx_300 : tensor<128x16xi32> loc(#loc189) + %cond_304 = arith.andi %eq_302, %cond_303 : tensor<128x16xi1> loc(#loc190) + %cond_305 = arith.ori %cond_301, %cond_304 : tensor<128x16xi1> loc(#loc191) + %ret_306 = arith.xori %ileft_288, %iright_289 : tensor<128x16xi32> loc(#loc194) + %ret_307 = arith.select %cond_305, %ret_306, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_308 = arith.xori %ret_275, %ret_307 : tensor<128x16xi32> loc(#loc196) + %new_idxs_309 = arith.xori %left_idx_299, %right_idx_300 : tensor<128x16xi32> loc(#loc197) + %new_idxs_310 = arith.select %cond_305, %new_idxs_309, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_311 = arith.xori %new_idxs_278, %new_idxs_310 : tensor<128x16xi32> loc(#loc199) + %y_312 = tt.reshape %ret_308 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc162) + %ileft_313 = arith.muli %y_312, %ileft_62 : tensor<512x2x2xi32> loc(#loc164) + %ileft_314 = "tt.reduce"(%ileft_313) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc201) + %ileft_315 = tt.expand_dims %ileft_314 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc166) + %ileft_316 = tt.broadcast %ileft_315 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc167) + %iright_317 = arith.muli %y_312, %flip_22 : tensor<512x2x2xi32> loc(#loc168) + %iright_318 = "tt.reduce"(%iright_317) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc203) + %iright_319 = tt.expand_dims %iright_318 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc170) + %iright_320 = tt.broadcast %iright_319 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc171) + %ileft_321 = tt.reshape %ileft_316 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_322 = tt.reshape %iright_320 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_323 = tt.reshape %new_idxs_311 : tensor<128x16xi32> -> tensor<512x2x2xi32> loc(#loc174) + %left_idx_324 = arith.muli %y_idx_323, %ileft_62 : tensor<512x2x2xi32> loc(#loc176) + %left_idx_325 = "tt.reduce"(%left_idx_324) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc206) + %left_idx_326 = tt.expand_dims %left_idx_325 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc178) + %left_idx_327 = tt.broadcast %left_idx_326 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc179) + %right_idx_328 = arith.muli %y_idx_323, %flip_22 : tensor<512x2x2xi32> loc(#loc181) + %right_idx_329 = "tt.reduce"(%right_idx_328) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<512x2x2xi32>) -> tensor<512x2xi32> loc(#loc209) + %right_idx_330 = tt.expand_dims %right_idx_329 {axis = 1 : i32} : tensor<512x2xi32> -> tensor<512x1x2xi32> loc(#loc183) + %right_idx_331 = tt.broadcast %right_idx_330 : tensor<512x1x2xi32> -> tensor<512x2x2xi32> loc(#loc184) + %left_idx_332 = tt.reshape %left_idx_327 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_333 = tt.reshape %right_idx_331 : tensor<512x2x2xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_334 = arith.cmpi slt, %ileft_321, %iright_322 : tensor<128x16xi32> loc(#loc187) + %eq_335 = arith.cmpi eq, %ileft_321, %iright_322 : tensor<128x16xi32> loc(#loc188) + %cond_336 = arith.cmpi sgt, %left_idx_332, %right_idx_333 : tensor<128x16xi32> loc(#loc189) + %cond_337 = arith.andi %eq_335, %cond_336 : tensor<128x16xi1> loc(#loc190) + %cond_338 = arith.ori %cond_334, %cond_337 : tensor<128x16xi1> loc(#loc191) + %ret_339 = arith.xori %ileft_321, %iright_322 : tensor<128x16xi32> loc(#loc194) + %ret_340 = arith.select %cond_338, %ret_339, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc195) + %ret_341 = arith.xori %ret_308, %ret_340 : tensor<128x16xi32> loc(#loc196) + %new_idxs_342 = arith.xori %left_idx_332, %right_idx_333 : tensor<128x16xi32> loc(#loc197) + %new_idxs_343 = arith.select %cond_338, %new_idxs_342, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_344 = arith.xori %new_idxs_311, %new_idxs_343 : tensor<128x16xi32> loc(#loc199) + %y_345 = tt.reshape %ret_341 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc162) + %ileft_346 = arith.muli %y_345, %ileft : tensor<1024x2x1xi32> loc(#loc164) + %ileft_347 = "tt.reduce"(%ileft_346) <{axis = 1 : i32}> ({ + ^bb0(%ileft_377: i32 loc(callsite(#loc1 at #loc165)), %ileft_378: i32 loc(callsite(#loc1 at #loc165))): + %ileft_379 = arith.addi %ileft_377, %ileft_378 : i32 loc(#loc211) + tt.reduce.return %ileft_379 : i32 loc(#loc201) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc201) + %ileft_348 = tt.expand_dims %ileft_347 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc166) + %ileft_349 = tt.broadcast %ileft_348 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc167) + %iright_350 = arith.muli %y_345, %iright : tensor<1024x2x1xi32> loc(#loc168) + %iright_351 = "tt.reduce"(%iright_350) <{axis = 1 : i32}> ({ + ^bb0(%iright_377: i32 loc(callsite(#loc1 at #loc169)), %iright_378: i32 loc(callsite(#loc1 at #loc169))): + %iright_379 = arith.addi %iright_377, %iright_378 : i32 loc(#loc212) + tt.reduce.return %iright_379 : i32 loc(#loc203) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc203) + %iright_352 = tt.expand_dims %iright_351 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc170) + %iright_353 = tt.broadcast %iright_352 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc171) + %ileft_354 = tt.reshape %ileft_349 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc172) + %iright_355 = tt.reshape %iright_353 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc173) + %y_idx_356 = tt.reshape %new_idxs_344 : tensor<128x16xi32> -> tensor<1024x2x1xi32> loc(#loc174) + %left_idx_357 = arith.muli %y_idx_356, %ileft : tensor<1024x2x1xi32> loc(#loc176) + %left_idx_358 = "tt.reduce"(%left_idx_357) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_377: i32 loc(callsite(#loc1 at #loc177)), %left_idx_378: i32 loc(callsite(#loc1 at #loc177))): + %left_idx_379 = arith.addi %left_idx_377, %left_idx_378 : i32 loc(#loc213) + tt.reduce.return %left_idx_379 : i32 loc(#loc206) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc206) + %left_idx_359 = tt.expand_dims %left_idx_358 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc178) + %left_idx_360 = tt.broadcast %left_idx_359 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc179) + %right_idx_361 = arith.muli %y_idx_356, %iright : tensor<1024x2x1xi32> loc(#loc181) + %right_idx_362 = "tt.reduce"(%right_idx_361) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_377: i32 loc(callsite(#loc1 at #loc182)), %right_idx_378: i32 loc(callsite(#loc1 at #loc182))): + %right_idx_379 = arith.addi %right_idx_377, %right_idx_378 : i32 loc(#loc214) + tt.reduce.return %right_idx_379 : i32 loc(#loc209) + }) : (tensor<1024x2x1xi32>) -> tensor<1024x1xi32> loc(#loc209) + %right_idx_363 = tt.expand_dims %right_idx_362 {axis = 1 : i32} : tensor<1024x1xi32> -> tensor<1024x1x1xi32> loc(#loc183) + %right_idx_364 = tt.broadcast %right_idx_363 : tensor<1024x1x1xi32> -> tensor<1024x2x1xi32> loc(#loc184) + %left_idx_365 = tt.reshape %left_idx_360 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc185) + %right_idx_366 = tt.reshape %right_idx_364 : tensor<1024x2x1xi32> -> tensor<128x16xi32> loc(#loc186) + %cond_367 = arith.cmpi slt, %ileft_354, %iright_355 : tensor<128x16xi32> loc(#loc187) + %eq_368 = arith.cmpi eq, %ileft_354, %iright_355 : tensor<128x16xi32> loc(#loc188) + %cond_369 = arith.cmpi sgt, %left_idx_365, %right_idx_366 : tensor<128x16xi32> loc(#loc189) + %cond_370 = arith.andi %eq_368, %cond_369 : tensor<128x16xi1> loc(#loc190) + %cond_371 = arith.ori %cond_367, %cond_370 : tensor<128x16xi1> loc(#loc191) + %new_idxs_372 = arith.xori %left_idx_365, %right_idx_366 : tensor<128x16xi32> loc(#loc197) + %new_idxs_373 = arith.select %cond_371, %new_idxs_372, %cst_0 : tensor<128x16xi1>, tensor<128x16xi32> loc(#loc198) + %new_idxs_374 = arith.xori %new_idxs_344, %new_idxs_373 : tensor<128x16xi32> loc(#loc199) + %tmp7 = arith.extsi %tmp0_19 : tensor<128x16xi32> to tensor<128x16xi64> loc(#loc149) + %tmp10_375 = arith.select %tmp0_18, %tmp7, %tmp10 : tensor<128x16xi1>, tensor<128x16xi64> loc(#loc86) + %tmp11 = "tt.reduce"(%tmp10_375) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_377: i64 loc(callsite(#loc1 at #loc150)), %tmp11_378: i64 loc(callsite(#loc1 at #loc150))): + %tmp11_379 = arith.addi %tmp11_377, %tmp11_378 : i64 loc(#loc200) + tt.reduce.return %tmp11_379 : i64 loc(#loc160) + }) : (tensor<128x16xi64>) -> tensor<128xi64> loc(#loc160) + %tmp11_376 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<128xi64> -> tensor<128x1xi64> loc(#loc151) + %tmp14 = arith.trunci %tmp11_376 : tensor<128x1xi64> to tensor<128x1xi32> loc(#loc152) + %0 = arith.muli %xindex_6, %cst_2 : tensor<128x1xi32> loc(#loc73) + %1 = tt.broadcast %r0_index_8 : tensor<1x16xi32> -> tensor<128x16xi32> loc(#loc74) + %2 = tt.broadcast %0 : tensor<128x1xi32> -> tensor<128x16xi32> loc(#loc74) + %3 = arith.addi %1, %2 : tensor<128x16xi32> loc(#loc74) + %4 = tt.splat %out_ptr2 : !tt.ptr -> tensor<128x16x!tt.ptr> loc(#loc75) + %5 = tt.addptr %4, %3 : tensor<128x16x!tt.ptr>, tensor<128x16xi32> loc(#loc75) + tt.store %5, %new_idxs_374, %tmp0_18 : tensor<128x16x!tt.ptr> loc(#loc76) + %6 = tt.splat %out_ptr3 : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc77) + %7 = tt.addptr %6, %xindex_6 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc77) + tt.store %7, %tmp14, %xmask_7 : tensor<128x1x!tt.ptr> loc(#loc78) + tt.return loc(#loc79) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":44:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":24:33) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:44) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":25:23) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":27:38) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":33:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":34:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:35) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:45) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":36:54) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":38:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":40:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":42:19) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":45:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":49:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:25) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:37) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4y/c4yxgotihoxpn6o5xa4jvkcy7shlgnyv44u6dpm5e746f6dwg7oe.py":50:4) +#loc85 = loc(callsite(#loc1 at #loc2)) +#loc86 = loc("tmp10"(#loc3)) +#loc87 = loc("tmp0"(#loc4)) +#loc88 = loc("tmp0"(#loc5)) +#loc89 = loc("xmask"(#loc6)) +#loc90 = loc("xoffset"(#loc7)) +#loc91 = loc("xoffset"(#loc8)) +#loc92 = loc("xindex"(#loc9)) +#loc93 = loc("xindex"(#loc10)) +#loc94 = loc("xindex"(#loc11)) +#loc95 = loc("r0_index"(#loc12)) +#loc96 = loc("r0_index"(#loc13)) +#loc97 = loc("x0"(#loc14)) +#loc98 = loc("x1"(#loc15)) +#loc99 = loc("tmp0"(#loc16)) +#loc100 = loc("tmp0"(#loc17)) +#loc101 = loc("tmp0"(#loc18)) +#loc102 = loc("tmp0"(#loc19)) +#loc103 = loc("tmp2"(#loc20)) +#loc104 = loc("tmp4"(#loc21)) +#loc105 = loc("flip"(#loc22)) +#loc107 = loc("flip"(#loc24)) +#loc108 = loc("flip"(#loc25)) +#loc109 = loc("flip"(#loc26)) +#loc110 = loc("y"(#loc27)) +#loc111 = loc("left_mask"(#loc29)) +#loc112 = loc("ileft"(#loc30)) +#loc114 = loc("ileft"(#loc34)) +#loc115 = loc("ileft"(#loc35)) +#loc116 = loc("iright"(#loc36)) +#loc118 = loc("iright"(#loc38)) +#loc119 = loc("iright"(#loc39)) +#loc120 = loc("ileft"(#loc40)) +#loc121 = loc("iright"(#loc41)) +#loc122 = loc("y_idx"(#loc42)) +#loc123 = loc("left_idx"(#loc43)) +#loc124 = loc("left_idx"(#loc44)) +#loc125 = loc("input"(#loc45)) +#loc127 = loc("left_idx"(#loc47)) +#loc128 = loc("left_idx"(#loc48)) +#loc129 = loc("right_idx"(#loc49)) +#loc130 = loc("right_idx"(#loc50)) +#loc132 = loc("right_idx"(#loc52)) +#loc133 = loc("right_idx"(#loc53)) +#loc134 = loc("left_idx"(#loc54)) +#loc135 = loc("right_idx"(#loc55)) +#loc136 = loc("cond"(#loc56)) +#loc137 = loc("eq"(#loc57)) +#loc138 = loc("cond"(#loc58)) +#loc139 = loc("cond"(#loc59)) +#loc140 = loc("cond"(#loc60)) +#loc141 = loc("cond"(#loc61)) +#loc142 = loc("cond"(#loc62)) +#loc143 = loc("ret"(#loc63)) +#loc144 = loc("ret"(#loc64)) +#loc145 = loc("ret"(#loc65)) +#loc146 = loc("new_idxs"(#loc66)) +#loc147 = loc("new_idxs"(#loc67)) +#loc148 = loc("new_idxs"(#loc68)) +#loc149 = loc("tmp7"(#loc69)) +#loc151 = loc("tmp11"(#loc71)) +#loc152 = loc("tmp14"(#loc72)) +#loc153 = loc(callsite(#loc105 at #loc106)) +#loc154 = loc(callsite(#loc107 at #loc106)) +#loc155 = loc(callsite(#loc108 at #loc106)) +#loc156 = loc(callsite(#loc109 at #loc106)) +#loc158 = loc("cond"(#loc136)) +#loc159 = loc("eq"(#loc137)) +#loc160 = loc(callsite(#loc31 at #loc150)) +#loc162 = loc(callsite(#loc110 at #loc157)) +#loc163 = loc(callsite(#loc111 at #loc157)) +#loc164 = loc(callsite(#loc112 at #loc157)) +#loc166 = loc(callsite(#loc114 at #loc157)) +#loc167 = loc(callsite(#loc115 at #loc157)) +#loc168 = loc(callsite(#loc116 at #loc157)) +#loc170 = loc(callsite(#loc118 at #loc157)) +#loc171 = loc(callsite(#loc119 at #loc157)) +#loc172 = loc(callsite(#loc120 at #loc157)) +#loc173 = loc(callsite(#loc121 at #loc157)) +#loc174 = loc(callsite(#loc122 at #loc157)) +#loc175 = loc(callsite(#loc123 at #loc157)) +#loc176 = loc(callsite(#loc124 at #loc157)) +#loc178 = loc(callsite(#loc127 at #loc157)) +#loc179 = loc(callsite(#loc128 at #loc157)) +#loc180 = loc(callsite(#loc129 at #loc157)) +#loc181 = loc(callsite(#loc130 at #loc157)) +#loc183 = loc(callsite(#loc132 at #loc157)) +#loc184 = loc(callsite(#loc133 at #loc157)) +#loc185 = loc(callsite(#loc134 at #loc157)) +#loc186 = loc(callsite(#loc135 at #loc157)) +#loc187 = loc(callsite(#loc158 at #loc157)) +#loc188 = loc(callsite(#loc159 at #loc157)) +#loc189 = loc(callsite(#loc138 at #loc157)) +#loc190 = loc(callsite(#loc139 at #loc157)) +#loc191 = loc(callsite(#loc140 at #loc157)) +#loc192 = loc(callsite(#loc141 at #loc157)) +#loc193 = loc(callsite(#loc142 at #loc157)) +#loc194 = loc(callsite(#loc143 at #loc157)) +#loc195 = loc(callsite(#loc144 at #loc157)) +#loc196 = loc(callsite(#loc145 at #loc157)) +#loc197 = loc(callsite(#loc146 at #loc157)) +#loc198 = loc(callsite(#loc147 at #loc157)) +#loc199 = loc(callsite(#loc148 at #loc157)) +#loc200 = loc(callsite(#loc33 at #loc160)) +#loc201 = loc(callsite(#loc31 at #loc165)) +#loc203 = loc(callsite(#loc31 at #loc169)) +#loc205 = loc(callsite(#loc125 at #loc177)) +#loc206 = loc(callsite(#loc31 at #loc177)) +#loc208 = loc(callsite(#loc125 at #loc182)) +#loc209 = loc(callsite(#loc31 at #loc182)) +#loc211 = loc(callsite(#loc33 at #loc201)) +#loc212 = loc(callsite(#loc33 at #loc203)) +#loc213 = loc(callsite(#loc33 at #loc206)) +#loc214 = loc(callsite(#loc33 at #loc209)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/__grp__triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/__grp__triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9c1215239947449e1b1a5cfda70c385540030f4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/__grp__triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.source", "triton_poi_fused_new_zeros_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttir", "triton_poi_fused_new_zeros_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttgir", "triton_poi_fused_new_zeros_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.llir", "triton_poi_fused_new_zeros_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ptx", "triton_poi_fused_new_zeros_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.cubin", "triton_poi_fused_new_zeros_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..975d6c5c1c4aadf502b60c4629d25583e2116aa9 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1763cdc7954c2932273615e3329fb5dda994f1a4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.json @@ -0,0 +1 @@ +{"hash": "bf245151a76d55e2c7c292294e25034deaf6226cb38e732a25dc0548371f67ff", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..39e573acec5cffdb8d52b7c4f2edc3a1e7786b70 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.llir @@ -0,0 +1,46 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_1(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 7, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = and i32 %7, 127, !dbg !9 + %9 = or disjoint i32 %6, %8, !dbg !10 + %10 = icmp slt i32 %9, 544, !dbg !11 + %11 = sext i32 %9 to i64, !dbg !12 + %12 = getelementptr i32, ptr addrspace(1) %0, i64 %11, !dbg !12 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 0, ptr addrspace(1) %12, i1 %10) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_1", linkageName: "triton_poi_fused_new_zeros_1", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 20, column: 28, scope: !4) +!8 = !DILocation(line: 20, column: 33, scope: !4) +!9 = !DILocation(line: 21, column: 36, scope: !4) +!10 = !DILocation(line: 21, column: 23, scope: !4) +!11 = !DILocation(line: 22, column: 21, scope: !4) +!12 = !DILocation(line: 25, column: 25, scope: !4) +!13 = !DILocation(line: 25, column: 36, scope: !4) +!14 = !DILocation(line: 25, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..8f2846766e952882e7cbdd140372c6bd464033b7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ptx @@ -0,0 +1,206 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_1 // -- Begin function triton_poi_fused_new_zeros_1 + // @triton_poi_fused_new_zeros_1 +.visible .entry triton_poi_fused_new_zeros_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_0, + .param .u32 triton_poi_fused_new_zeros_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_1_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<7>; + .reg .b64 %rd<3>; + .loc 1 18 0 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:18:0 +$L__func_begin0: + .loc 1 18 0 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_1_param_0]; +$L__tmp0: + .loc 1 20 28 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:20:28 + mov.u32 %r2, %ctaid.x; + .loc 1 20 33 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:20:33 + shl.b32 %r3, %r2, 7; + .loc 1 21 36 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:21:36 + mov.u32 %r4, %tid.x; + and.b32 %r5, %r4, 127; + .loc 1 21 23 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:21:23 + or.b32 %r6, %r3, %r5; + .loc 1 22 21 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:22:21 + setp.lt.s32 %p1, %r6, 544; + .loc 1 25 25 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:25 + mad.wide.s32 %rd1, %r6, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 25 36 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:36 + // begin inline asm + @%p1 st.global.b32 [ %rd1 + 0 ], { %r1 }; + // end inline asm + .loc 1 25 4 // czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py:25:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 122 +.b8 103 +.b8 118 +.b8 98 +.b8 118 +.b8 55 +.b8 109 +.b8 114 +.b8 111 +.b8 115 +.b8 111 +.b8 52 +.b8 118 +.b8 104 +.b8 101 +.b8 97 +.b8 51 +.b8 105 +.b8 98 +.b8 120 +.b8 120 +.b8 111 +.b8 115 +.b8 97 +.b8 53 +.b8 115 +.b8 121 +.b8 117 +.b8 115 +.b8 117 +.b8 99 +.b8 103 +.b8 115 +.b8 55 +.b8 117 +.b8 52 +.b8 103 +.b8 105 +.b8 50 +.b8 99 +.b8 108 +.b8 51 +.b8 98 +.b8 106 +.b8 50 +.b8 111 +.b8 105 +.b8 52 +.b8 117 +.b8 53 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 122 +.b8 103 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.source new file mode 100644 index 0000000000000000000000000000000000000000..f20a4ff226b7a48fff277dec4e9988d5c55d80d3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.source @@ -0,0 +1,41 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc11 = loc("out_ptr0"(#loc)) +#loc12 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 544 : i32 loc(#loc13) + %xoffset = tt.get_program_id x : i32 loc(#loc14) + %xoffset_1 = arith.constant 128 : i32 loc(#loc15) + %xoffset_2 = arith.constant 128 : i32 loc(#loc15) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc15) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc16) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<128xi32> loc(#loc17) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<128xi32> loc(#loc17) + %xmask = arith.constant dense<544> : tensor<128xi32> loc(#loc18) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<128xi32> loc(#loc18) + %tmp0 = arith.constant 0 : i32 loc(#loc19) + %tmp0_7 = arith.constant dense<0> : tensor<1xi32> loc(#loc19) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_5 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc9) + tt.store %1, %cst, %xmask_6 : tensor<128x!tt.ptr> loc(#loc9) + tt.return loc(#loc10) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":24:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc13 = loc("xnumel"(#loc1)) +#loc14 = loc("xoffset"(#loc2)) +#loc15 = loc("xoffset"(#loc3)) +#loc16 = loc("xindex"(#loc4)) +#loc17 = loc("xindex"(#loc5)) +#loc18 = loc("xmask"(#loc6)) +#loc19 = loc("tmp0"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..6b39aaff379161d745639ffeda7a55917de68fcd --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %cst = arith.constant dense<544> : tensor<128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_1 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> loc(#loc14) + %xindex_2 = tt.splat %xoffset_1 : i32 -> tensor<128xi32, #blocked> loc(#loc15) + %xindex_3 = arith.addi %xindex_2, %xindex : tensor<128xi32, #blocked> loc(#loc15) + %xmask = arith.cmpi slt, %xindex_3, %cst : tensor<128xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_3 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> loc(#loc7) + tt.store %1, %cst_0, %xmask : tensor<128x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..1825a30e57ea4d5cabab4279f507af74a611db94 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/X4SFCUNHNVK6FR6CSIUU4JIDJXVPMITMWOHHGKRF3QCUQNY7M77Q/triton_poi_fused_new_zeros_1.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_1(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<128xi32> loc(#loc1) + %xmask = arith.constant dense<544> : tensor<128xi32> loc(#loc12) + %c128_i32 = arith.constant 128 : i32 loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc13) + %xoffset_0 = arith.muli %xoffset, %c128_i32 : i32 loc(#loc14) + %xindex = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc15) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<128xi32> loc(#loc16) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<128xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<128xi32> loc(#loc12) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<128x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:36) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":22:21) +#loc3 = loc(unknown) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":20:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":21:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/zg/czgvbv7mroso4vhea3ibxxosa5syusucgs7u4gi2cl3bj2oi4u5l.py":25:4) +#loc12 = loc("xmask"(#loc2)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xoffset"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xindex"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/__grp__triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/__grp__triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..907af56c62cde13947b63bbefe8e6abf5b8811e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/__grp__triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_new_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.source", "triton_poi_fused_new_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttir", "triton_poi_fused_new_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttgir", "triton_poi_fused_new_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.llir", "triton_poi_fused_new_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ptx", "triton_poi_fused_new_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.cubin", "triton_poi_fused_new_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..07d7639a29bcbda6eca05e7eda53281cdb587c90 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..400dcb7a30de2557068d0b750a287abe2aa3ed7a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.json @@ -0,0 +1 @@ +{"hash": "b8115d1ad7ed32bfb5ae129b61691e0026a47bdee3fc59c1f1bcb4ce023cb26c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_new_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..c829de73d2d2b2f0fd274ea2f40e12ee9551720e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.llir @@ -0,0 +1,47 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +; Function Attrs: nounwind +define ptx_kernel void @triton_poi_fused_new_zeros_0(ptr addrspace(1) %0, i32 %1, ptr addrspace(1) readnone captures(none) %2, ptr addrspace(1) readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %6 = shl i32 %5, 8, !dbg !8 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %8 = shl nuw nsw i32 %7, 1, !dbg !9 + %9 = and i32 %8, 254, !dbg !9 + %10 = or disjoint i32 %9, %6, !dbg !10 + %11 = icmp slt i32 %10, %1, !dbg !11 + %12 = sext i32 %10 to i64, !dbg !12 + %13 = getelementptr i32, ptr addrspace(1) %0, i64 %12, !dbg !12 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %13, i1 %11) #2, !dbg !13 + ret void, !dbg !14 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_poi_fused_new_zeros_0", linkageName: "triton_poi_fused_new_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 19, column: 28, scope: !4) +!8 = !DILocation(line: 19, column: 33, scope: !4) +!9 = !DILocation(line: 20, column: 36, scope: !4) +!10 = !DILocation(line: 20, column: 23, scope: !4) +!11 = !DILocation(line: 21, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 25, scope: !4) +!13 = !DILocation(line: 24, column: 36, scope: !4) +!14 = !DILocation(line: 24, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..12fae01a39d19fc11b37c8c305ea1317ce496a73 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ptx @@ -0,0 +1,208 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_new_zeros_0 // -- Begin function triton_poi_fused_new_zeros_0 + // @triton_poi_fused_new_zeros_0 +.visible .entry triton_poi_fused_new_zeros_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_0, + .param .u32 triton_poi_fused_new_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_new_zeros_0_param_3 +) +.reqntid 128 +{ + .reg .pred %p<2>; + .reg .b32 %r<10>; + .reg .b64 %rd<3>; + .loc 1 18 0 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:18:0 + +// %bb.0: + ld.param.b64 %rd2, [triton_poi_fused_new_zeros_0_param_0]; + ld.param.b32 %r3, [triton_poi_fused_new_zeros_0_param_1]; +$L__tmp0: + .loc 1 19 28 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:19:28 + mov.u32 %r4, %ctaid.x; + .loc 1 19 33 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:19:33 + shl.b32 %r5, %r4, 8; + .loc 1 20 36 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:20:36 + mov.u32 %r6, %tid.x; + shl.b32 %r7, %r6, 1; + and.b32 %r8, %r7, 254; + .loc 1 20 23 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:20:23 + or.b32 %r9, %r8, %r5; + .loc 1 21 21 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:21:21 + setp.lt.s32 %p1, %r9, %r3; + .loc 1 24 25 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:25 + mad.wide.s32 %rd1, %r9, 4, %rd2; + mov.b32 %r1, 0; + .loc 1 24 36 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:36 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r1, %r1 }; + // end inline asm + .loc 1 24 4 // cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py:24:4 + ret; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 117 +.b8 115 +.b8 115 +.b8 53 +.b8 112 +.b8 101 +.b8 114 +.b8 101 +.b8 107 +.b8 114 +.b8 105 +.b8 118 +.b8 53 +.b8 122 +.b8 108 +.b8 117 +.b8 98 +.b8 110 +.b8 107 +.b8 53 +.b8 50 +.b8 102 +.b8 51 +.b8 112 +.b8 101 +.b8 106 +.b8 53 +.b8 113 +.b8 99 +.b8 108 +.b8 122 +.b8 114 +.b8 109 +.b8 116 +.b8 97 +.b8 55 +.b8 99 +.b8 118 +.b8 105 +.b8 100 +.b8 104 +.b8 107 +.b8 112 +.b8 122 +.b8 104 +.b8 117 +.b8 112 +.b8 109 +.b8 118 +.b8 116 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 117 +.b8 115 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..fa59df81bde0165c0e3efd25869c4d69c41667f9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.source @@ -0,0 +1,38 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.constant 256 : i32 loc(#loc13) + %xoffset_1 = arith.constant 256 : i32 loc(#loc13) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<256xi32> loc(#loc15) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<256xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<256xi32> loc(#loc16) + %tmp0 = arith.constant 0 : i32 loc(#loc17) + %tmp0_6 = arith.constant dense<0> : tensor<1xi32> loc(#loc17) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc7) + %1 = tt.addptr %0, %xindex_4 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc7) + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_5 : tensor<256x!tt.ptr> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":23:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc1)) +#loc13 = loc("xoffset"(#loc2)) +#loc14 = loc("xindex"(#loc3)) +#loc15 = loc("xindex"(#loc4)) +#loc16 = loc("xmask"(#loc5)) +#loc17 = loc("tmp0"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..fd0f66b9d88d91cc758ddf24f1aff0608599ed14 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttgir @@ -0,0 +1,35 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 loc(#loc1) + %cst = arith.constant dense<0> : tensor<256xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32, #blocked> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32, #blocked> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32, #blocked> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32, #blocked> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked> loc(#loc7) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> loc(#loc7) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr, #blocked> loc(#loc8) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc2)) +#loc13 = loc("xoffset"(#loc3)) +#loc14 = loc("xindex"(#loc4)) +#loc15 = loc("xindex"(#loc5)) +#loc16 = loc("xmask"(#loc6)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..20685715caa71f9efb40259898569e649872ee10 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XAIV2GWX5UZL7NNOCKNWC2I6AATKI6664P6FTQPRXS2M4AR4WJWA/triton_poi_fused_new_zeros_0.ttir @@ -0,0 +1,34 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":18:0) +#loc10 = loc("out_ptr0"(#loc)) +#loc11 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_new_zeros_0(%out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<256xi32> loc(#loc1) + %c256_i32 = arith.constant 256 : i32 loc(#loc2) + %xoffset = tt.get_program_id x : i32 loc(#loc12) + %xoffset_0 = arith.muli %xoffset, %c256_i32 : i32 loc(#loc13) + %xindex = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> loc(#loc14) + %xindex_1 = tt.splat %xoffset_0 : i32 -> tensor<256xi32> loc(#loc15) + %xindex_2 = arith.addi %xindex_1, %xindex : tensor<256xi32> loc(#loc15) + %xmask = tt.splat %xnumel : i32 -> tensor<256xi32> loc(#loc16) + %xmask_3 = arith.cmpi slt, %xindex_2, %xmask : tensor<256xi32> loc(#loc16) + %0 = tt.splat %out_ptr0 : !tt.ptr -> tensor<256x!tt.ptr> loc(#loc8) + %1 = tt.addptr %0, %xindex_2 : tensor<256x!tt.ptr>, tensor<256xi32> loc(#loc8) + tt.store %1, %cst, %xmask_3 : tensor<256x!tt.ptr> loc(#loc1) + tt.return loc(#loc9) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:36) +#loc2 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":19:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":20:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":21:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:25) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/us/cuss5perekriv5zlubnk52f3pej5qclzrmta7cvidhkpzhupmvt5.py":24:4) +#loc12 = loc("xoffset"(#loc3)) +#loc13 = loc("xoffset"(#loc4)) +#loc14 = loc("xindex"(#loc5)) +#loc15 = loc("xindex"(#loc6)) +#loc16 = loc("xmask"(#loc7)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/__grp__triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/__grp__triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..e74939c542262932599ababf5bb1130ddd988462 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/__grp__triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_zeros_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.source", "triton_red_fused_zeros_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttir", "triton_red_fused_zeros_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttgir", "triton_red_fused_zeros_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.llir", "triton_red_fused_zeros_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ptx", "triton_red_fused_zeros_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.cubin", "triton_red_fused_zeros_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..76956e562174ea85f0ab0ba19d8eb48a81f2ce23 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..74d7d5029fde4ea9b6f4d7bb857d7c71a1ff8e2e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "bbdfee9d88c27f9c7b92f1d8860fe7aac979e86c36a8e255b15fd66df4a213c8", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..75a87ed5ec086f77478624b28462b6f97116e6a6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.llir @@ -0,0 +1,215 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 6, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 504, !dbg !9 + %12 = lshr exact i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, %9, !dbg !10 + %14 = shl nuw nsw i32 %10, 3, !dbg !11 + %15 = and i32 %14, 56, !dbg !11 + %16 = sdiv i32 %13, 2048, !dbg !12 + %17 = mul i32 %16, 2048, !dbg !13 + %.decomposed = sub i32 %13, %17, !dbg !13 + %18 = srem i32 %16, 32, !dbg !14 + %19 = sdiv i32 %13, 65536, !dbg !15 + %20 = shl nsw i32 %18, 7, !dbg !16 + %21 = shl nsw i32 %.decomposed, 12, !dbg !17 + %22 = shl i32 %19, 23, !dbg !18 + %23 = shl i32 %13, 7, !dbg !19 + %24 = add i32 %22, %21 + %25 = add i32 %24, %20 + %26 = zext nneg i32 %15 to i64, !dbg !20 + %27 = sext i32 %23 to i64, !dbg !20 + %28 = or disjoint i32 %25, %15, !dbg !21 + %29 = sext i32 %28 to i64, !dbg !22 + %30 = getelementptr bfloat, ptr addrspace(1) %0, i64 %29, !dbg !22 + %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %30, i64 %31, i1 true) #4, !dbg !23 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !23 + %34 = bitcast i32 %33 to <2 x bfloat>, !dbg !23 + %35 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !23 + %36 = bitcast i32 %35 to <2 x bfloat>, !dbg !23 + %37 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !23 + %38 = bitcast i32 %37 to <2 x bfloat>, !dbg !23 + %39 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !23 + %40 = bitcast i32 %39 to <2 x bfloat>, !dbg !23 + %41 = getelementptr bfloat, ptr addrspace(1) %1, i64 %26, !dbg !24 + %42 = getelementptr bfloat, ptr addrspace(1) %41, i64 %27, !dbg !24 + %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %42, i64 %43, i1 true) #4, !dbg !25 + %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !25 + %46 = bitcast i32 %45 to <2 x bfloat>, !dbg !25 + %47 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !25 + %48 = bitcast i32 %47 to <2 x bfloat>, !dbg !25 + %49 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !25 + %50 = bitcast i32 %49 to <2 x bfloat>, !dbg !25 + %51 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !25 + %52 = bitcast i32 %51 to <2 x bfloat>, !dbg !25 + %53 = or disjoint i64 %26, 64, !dbg !26 + %54 = trunc nuw nsw i64 %53 to i32, !dbg !21 + %55 = or disjoint i32 %25, %54, !dbg !21 + %56 = sext i32 %55 to i64, !dbg !22 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !22 + %58 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %59 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %58, i1 true) #4, !dbg !23 + %60 = extractvalue { i32, i32, i32, i32 } %59, 0, !dbg !23 + %61 = bitcast i32 %60 to <2 x bfloat>, !dbg !23 + %62 = extractvalue { i32, i32, i32, i32 } %59, 1, !dbg !23 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !23 + %64 = extractvalue { i32, i32, i32, i32 } %59, 2, !dbg !23 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !23 + %66 = extractvalue { i32, i32, i32, i32 } %59, 3, !dbg !23 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !23 + %68 = getelementptr bfloat, ptr addrspace(1) %1, i64 %53, !dbg !24 + %69 = getelementptr bfloat, ptr addrspace(1) %68, i64 %27, !dbg !24 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !25 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %69, i64 %70, i1 true) #4, !dbg !25 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !25 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !25 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !25 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !25 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !25 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !25 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !25 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !25 + %80 = fpext <2 x bfloat> %34 to <2 x float>, !dbg !27 + %81 = fpext <2 x bfloat> %46 to <2 x float>, !dbg !28 + %82 = fmul <2 x float> %80, %81, !dbg !29 + %83 = fadd <2 x float> %82, zeroinitializer, !dbg !30 + %84 = fpext <2 x bfloat> %61 to <2 x float>, !dbg !27 + %85 = fpext <2 x bfloat> %73 to <2 x float>, !dbg !28 + %86 = fmul <2 x float> %84, %85, !dbg !29 + %87 = fadd <2 x float> %83, %86, !dbg !30 + %88 = fpext <2 x bfloat> %36 to <2 x float>, !dbg !27 + %89 = fpext <2 x bfloat> %48 to <2 x float>, !dbg !28 + %90 = fmul <2 x float> %88, %89, !dbg !29 + %91 = fadd <2 x float> %90, zeroinitializer, !dbg !30 + %92 = fpext <2 x bfloat> %63 to <2 x float>, !dbg !27 + %93 = fpext <2 x bfloat> %75 to <2 x float>, !dbg !28 + %94 = fmul <2 x float> %92, %93, !dbg !29 + %95 = fadd <2 x float> %91, %94, !dbg !30 + %96 = fpext <2 x bfloat> %38 to <2 x float>, !dbg !27 + %97 = fpext <2 x bfloat> %50 to <2 x float>, !dbg !28 + %98 = fmul <2 x float> %96, %97, !dbg !29 + %99 = fadd <2 x float> %98, zeroinitializer, !dbg !30 + %100 = fpext <2 x bfloat> %65 to <2 x float>, !dbg !27 + %101 = fpext <2 x bfloat> %77 to <2 x float>, !dbg !28 + %102 = fmul <2 x float> %100, %101, !dbg !29 + %103 = fadd <2 x float> %99, %102, !dbg !30 + %104 = fpext <2 x bfloat> %40 to <2 x float>, !dbg !27 + %105 = fpext <2 x bfloat> %52 to <2 x float>, !dbg !28 + %106 = fmul <2 x float> %104, %105, !dbg !29 + %107 = fadd <2 x float> %106, zeroinitializer, !dbg !30 + %108 = fpext <2 x bfloat> %67 to <2 x float>, !dbg !27 + %109 = fpext <2 x bfloat> %79 to <2 x float>, !dbg !28 + %110 = fmul <2 x float> %108, %109, !dbg !29 + %111 = fadd <2 x float> %107, %110, !dbg !30 + %112 = and i32 %10, 63, !dbg !9 + %113 = or disjoint i32 %9, %112, !dbg !10 + %shift = shufflevector <2 x float> %87, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop = fadd <2 x float> %87, %shift, !dbg !31 + %foldExtExtBinop9 = fadd <2 x float> %95, %foldExtExtBinop, !dbg !31 + %shift11 = shufflevector <2 x float> %95, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop12 = fadd <2 x float> %shift11, %foldExtExtBinop9, !dbg !31 + %foldExtExtBinop14 = fadd <2 x float> %103, %foldExtExtBinop12, !dbg !31 + %shift16 = shufflevector <2 x float> %103, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop17 = fadd <2 x float> %shift16, %foldExtExtBinop14, !dbg !31 + %foldExtExtBinop19 = fadd <2 x float> %111, %foldExtExtBinop17, !dbg !31 + %shift21 = shufflevector <2 x float> %111, <2 x float> poison, <2 x i32> , !dbg !31 + %foldExtExtBinop22 = fadd <2 x float> %shift21, %foldExtExtBinop19, !dbg !31 + %114 = extractelement <2 x float> %foldExtExtBinop22, i64 0, !dbg !31 + %115 = bitcast float %114 to i32, !dbg !35 + %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 4, i32 31), !dbg !35 + %117 = bitcast i32 %116 to float, !dbg !35 + %118 = fadd float %114, %117, !dbg !31 + %119 = bitcast float %118 to i32, !dbg !35 + %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 2, i32 31), !dbg !35 + %121 = bitcast i32 %120 to float, !dbg !35 + %122 = fadd float %118, %121, !dbg !31 + %123 = bitcast float %122 to i32, !dbg !35 + %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 1, i32 31), !dbg !35 + %125 = bitcast i32 %124 to float, !dbg !35 + %126 = fadd float %122, %125, !dbg !31 + %127 = lshr exact i32 %11, 1, !dbg !36 + %128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %127, !dbg !36 + store float %126, ptr addrspace(3) %128, align 4, !dbg !36 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !36 + %129 = shl nuw nsw i32 %112, 2, !dbg !36 + %130 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %129, !dbg !36 + %131 = load i32, ptr addrspace(3) %130, align 4, !dbg !36 + %132 = sext i32 %113 to i64, !dbg !37 + %133 = getelementptr float, ptr addrspace(1) %2, i64 %132, !dbg !37 + %134 = and i32 %10, 448, !dbg !38 + %135 = icmp eq i32 %134, 0, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %131, ptr addrspace(1) %133, i1 %135) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 40, column: 45, scope: !4) +!20 = !DILocation(line: 33, column: 40, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 34, scope: !4) +!25 = !DILocation(line: 40, column: 50, scope: !4) +!26 = !DILocation(line: 34, column: 31, scope: !4) +!27 = !DILocation(line: 39, column: 127, scope: !4) +!28 = !DILocation(line: 40, column: 104, scope: !4) +!29 = !DILocation(line: 41, column: 22, scope: !4) +!30 = !DILocation(line: 43, column: 23, scope: !4) +!31 = !DILocation(line: 261, column: 15, scope: !32, inlinedAt: !34) +!32 = distinct !DILexicalBlockFile(scope: !4, file: !33, discriminator: 0) +!33 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 291, column: 36, scope: !32, inlinedAt: !34) +!36 = !DILocation(line: 45, column: 28, scope: !4) +!37 = !DILocation(line: 49, column: 25, scope: !4) +!38 = !DILocation(line: 49, column: 36, scope: !4) +!39 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..08f81c63261a07109152769e35277e8a440c5ac4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ptx @@ -0,0 +1,511 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 512 +{ + .reg .pred %p<6>; + .reg .b16 %rs<33>; + .reg .b32 %r<131>; + .reg .b64 %rd<23>; + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd15, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:28 + mov.u32 %r34, %ctaid.x; + .loc 1 23 33 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:23:33 + shl.b32 %r35, %r34, 6; + ld.param.b64 %rd16, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + mov.u32 %r36, %tid.x; + and.b32 %r37, %r36, 504; + bfe.u32 %r38, %r36, 3, 6; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r39, %r38, %r35; + .loc 1 26 37 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:26:37 + shl.b32 %r40, %r36, 3; + and.b32 %r41, %r40, 56; + .loc 1 29 21 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:21 + bfe.s32 %r42, %r34, 25, 1; + shr.u32 %r43, %r42, 21; + add.s32 %r44, %r39, %r43; + shr.s32 %r45, %r44, 11; + .loc 1 28 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:28:19 + and.b32 %r46, %r44, 1046528; + sub.s32 %r47, %r39, %r46; + .loc 1 29 29 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:29:29 + shr.u32 %r48, %r45, 27; + add.s32 %r49, %r45, %r48; + and.b32 %r50, %r49, 33554400; + sub.s32 %r51, %r45, %r50; + .loc 1 30 19 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:30:19 + shr.u32 %r52, %r42, 16; + add.s32 %r53, %r39, %r52; + .loc 1 39 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:45 + shl.b32 %r54, %r51, 7; + .loc 1 39 55 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:55 + shl.b32 %r55, %r47, 12; + .loc 1 39 68 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:68 + shl.b32 %r56, %r53, 7; + and.b32 %r57, %r56, -8388608; + .loc 1 40 45 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:45 + shl.b32 %r58, %r39, 7; + add.s32 %r59, %r57, %r55; + add.s32 %r60, %r59, %r54; + .loc 1 33 40 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:33:40 + cvt.u64.u32 %rd17, %r41; + .loc 1 39 60 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:60 + or.b32 %r61, %r60, %r41; + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + mad.wide.s32 %rd2, %r61, 2, %rd14; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:34 + mad.wide.u32 %rd18, %r41, 2, %rd15; + mad.wide.s32 %rd5, %r58, 2, %rd18; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:34 + cvt.s64.s32 %rd19, %r60; + or.b64 %rd20, %rd19, %rd17; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd14, %rd21; + add.s64 %rd8, %rd22, 128; + .loc 1 39 73 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:73 + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + .loc 1 40 34 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:34 + add.s64 %rd11, %rd5, 128; + .loc 1 40 50 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:50 + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r62, %rs2; + cvt.f32.bf16 %r63, %rs1; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs3, %rs4}, %r9; + cvt.f32.bf16 %r64, %rs4; + cvt.f32.bf16 %r65, %rs3; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r66, %r63, %r65, 0f00000000; + fma.rn.f32 %r67, %r62, %r64, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs5, %rs6}, %r17; + cvt.f32.bf16 %r68, %rs5; + cvt.f32.bf16 %r69, %rs6; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs7, %rs8}, %r25; + cvt.f32.bf16 %r70, %rs7; + cvt.f32.bf16 %r71, %rs8; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r72, %r69, %r71, %r67; + fma.rn.f32 %r73, %r68, %r70, %r66; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs9, %rs10}, %r2; + cvt.f32.bf16 %r74, %rs10; + cvt.f32.bf16 %r75, %rs9; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs11, %rs12}, %r10; + cvt.f32.bf16 %r76, %rs12; + cvt.f32.bf16 %r77, %rs11; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r78, %r75, %r77, 0f00000000; + fma.rn.f32 %r79, %r74, %r76, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs13, %rs14}, %r18; + cvt.f32.bf16 %r80, %rs13; + cvt.f32.bf16 %r81, %rs14; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs15, %rs16}, %r26; + cvt.f32.bf16 %r82, %rs15; + cvt.f32.bf16 %r83, %rs16; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r84, %r81, %r83, %r79; + fma.rn.f32 %r85, %r80, %r82, %r78; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs17, %rs18}, %r3; + cvt.f32.bf16 %r86, %rs18; + cvt.f32.bf16 %r87, %rs17; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs19, %rs20}, %r11; + cvt.f32.bf16 %r88, %rs20; + cvt.f32.bf16 %r89, %rs19; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r90, %r87, %r89, 0f00000000; + fma.rn.f32 %r91, %r86, %r88, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs21, %rs22}, %r19; + cvt.f32.bf16 %r92, %rs21; + cvt.f32.bf16 %r93, %rs22; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs23, %rs24}, %r27; + cvt.f32.bf16 %r94, %rs23; + cvt.f32.bf16 %r95, %rs24; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r96, %r93, %r95, %r91; + fma.rn.f32 %r97, %r92, %r94, %r90; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs25, %rs26}, %r4; + cvt.f32.bf16 %r98, %rs26; + cvt.f32.bf16 %r99, %rs25; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs27, %rs28}, %r12; + cvt.f32.bf16 %r100, %rs28; + cvt.f32.bf16 %r101, %rs27; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r102, %r99, %r101, 0f00000000; + fma.rn.f32 %r103, %r98, %r100, 0f00000000; + .loc 1 39 127 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:39:127 + mov.b32 {%rs29, %rs30}, %r20; + cvt.f32.bf16 %r104, %rs29; + cvt.f32.bf16 %r105, %rs30; + .loc 1 40 104 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:40:104 + mov.b32 {%rs31, %rs32}, %r28; + cvt.f32.bf16 %r106, %rs31; + cvt.f32.bf16 %r107, %rs32; + .loc 1 43 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:43:23 + fma.rn.f32 %r108, %r105, %r107, %r103; + fma.rn.f32 %r109, %r104, %r106, %r102; + .loc 1 24 44 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:44 + and.b32 %r110, %r36, 63; + .loc 1 24 23 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:24:23 + or.b32 %r111, %r35, %r110; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r112, %r73, %r72; + add.f32 %r113, %r85, %r112; + add.f32 %r114, %r84, %r113; + add.f32 %r115, %r97, %r114; + add.f32 %r116, %r96, %r115; + add.f32 %r117, %r109, %r116; + add.f32 %r118, %r108, %r117; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r119, %r118, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r120, %r118, %r119; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r121, %r120, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r122, %r120, %r121; + .loc 2 291 36 // standard.py:291:36 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + shfl.sync.bfly.b32 %r123, %r122, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:25 ] + add.f32 %r124, %r122, %r123; +$L__tmp2: + .loc 1 45 28 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:45:28 + shr.u32 %r125, %r37, 1; + mov.b32 %r126, global_smem; + add.s32 %r127, %r126, %r125; + st.shared.b32 [%r127], %r124; + bar.sync 0; + shl.b32 %r128, %r110, 2; + add.s32 %r129, %r126, %r128; + ld.shared.b32 %r33, [%r129]; + .loc 1 49 25 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:25 + mad.wide.s32 %rd13, %r111, 4, %rd16; + .loc 1 49 36 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:36 + and.b32 %r130, %r36, 448; + setp.eq.b32 %p5, %r130, 0; + // begin inline asm + @%p5 st.global.b32 [ %rd13 + 0 ], { %r33 }; + // end inline asm + .loc 1 49 4 // cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 113 +.b8 120 +.b8 120 +.b8 101 +.b8 118 +.b8 100 +.b8 121 +.b8 115 +.b8 115 +.b8 111 +.b8 121 +.b8 117 +.b8 116 +.b8 50 +.b8 101 +.b8 117 +.b8 119 +.b8 53 +.b8 53 +.b8 121 +.b8 50 +.b8 55 +.b8 99 +.b8 97 +.b8 104 +.b8 113 +.b8 113 +.b8 99 +.b8 103 +.b8 109 +.b8 118 +.b8 121 +.b8 117 +.b8 104 +.b8 100 +.b8 105 +.b8 104 +.b8 98 +.b8 52 +.b8 116 +.b8 109 +.b8 110 +.b8 101 +.b8 114 +.b8 55 +.b8 99 +.b8 102 +.b8 99 +.b8 55 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..e15763a515e118199e0efa74585e414deae8973e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x64xi1> loc(#loc65) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c64_i32 = arith.constant 64 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c64_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x64xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x64xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x64xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x64xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x64xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x64xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x64xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x64xf32> to tensor<64x64xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x64xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x64xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x64xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x64xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..246724146d502e24c51646bcdde04c312202ce14 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,154 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [16, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 8], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x64xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %_tmp4_28 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg6 = %cst_7) -> (tensor<64x64xf32, #blocked>) : i32 { + %r0_index = tt.splat %_tmp4_28 : i32 -> tensor<1x64xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x64xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst : tensor<1x64xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x64xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x64xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x64xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x64xi1, #blocked> -> tensor<64x64xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x64xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_6 evictionPolicy = evict_first : tensor<64x64x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x64xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %arg6, %tmp2 : tensor<64x64xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %arg6 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x64xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0bd392c6009d94d70728b87dd539c1f93fd6dec4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XPP65HMIYJ7ZY64S6HMIMD7HVLEXT2DMG2UOEVNRL7LG35FCCPEA/triton_red_fused_zeros_0.ttir @@ -0,0 +1,148 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":18:0) +#loc1 = loc(unknown) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("in_ptr1"(#loc)) +#loc42 = loc("out_ptr1"(#loc)) +#loc43 = loc("xnumel"(#loc)) +#loc44 = loc("r0_numel"(#loc)) +#loc75 = loc("tmp4"(#loc34)) +#loc78 = loc(callsite(#loc1 at #loc75)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16> loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst_0 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x64xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc45) + %x1 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc46) + %cst_5 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc47) + %xoffset_6 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc48) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc49) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc50) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<64x1xi32> loc(#loc51) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<64x1xi32> loc(#loc51) + %r0_base = tt.expand_dims %xindex {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc52) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc53) + %x1_10 = arith.divsi %xindex_9, %cst_5 : tensor<64x1xi32> loc(#loc54) + %x1_11 = arith.remsi %x1_10, %x1 : tensor<64x1xi32> loc(#loc46) + %x2_12 = arith.divsi %xindex_9, %x2 : tensor<64x1xi32> loc(#loc45) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%_tmp4_14 = %cst_4) -> (tensor<64x64xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x64xi32> loc(#loc56) + %r0_index_15 = arith.addi %r0_index, %r0_base : tensor<1x64xi32> loc(#loc56) + %r0_mask = arith.cmpi slt, %r0_index_15, %cst_3 : tensor<1x64xi32> loc(#loc57) + %tmp0 = arith.muli %x1_11, %cst_2 : tensor<64x1xi32> loc(#loc58) + %tmp0_16 = tt.broadcast %r0_index_15 : tensor<1x64xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_17 = tt.broadcast %tmp0 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc59) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<64x64xi32> loc(#loc59) + %tmp0_19 = arith.muli %x0, %cst_1 : tensor<64x1xi32> loc(#loc60) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc61) + %tmp0_21 = arith.addi %tmp0_18, %tmp0_20 : tensor<64x64xi32> loc(#loc61) + %tmp0_22 = arith.muli %x2_12, %cst_0 : tensor<64x1xi32> loc(#loc62) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc63) + %tmp0_24 = arith.addi %tmp0_21, %tmp0_23 : tensor<64x64xi32> loc(#loc63) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc64) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc64) + %tmp0_27 = tt.broadcast %r0_mask : tensor<1x64xi1> -> tensor<64x64xi1> loc(#loc65) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc65) + %tmp0_29 = arith.extf %tmp0_28 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc66) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<64x1xi32> loc(#loc67) + %tmp1_30 = tt.broadcast %tmp1 : tensor<64x1xi32> -> tensor<64x64xi32> loc(#loc68) + %tmp1_31 = arith.addi %tmp0_16, %tmp1_30 : tensor<64x64xi32> loc(#loc68) + %tmp1_32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x64x!tt.ptr> loc(#loc69) + %tmp1_33 = tt.addptr %tmp1_32, %tmp1_31 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> loc(#loc69) + %tmp1_34 = tt.load %tmp1_33, %tmp0_27, %cst evictionPolicy = evict_first : tensor<64x64x!tt.ptr> loc(#loc70) + %tmp1_35 = arith.extf %tmp1_34 : tensor<64x64xbf16> to tensor<64x64xf32> loc(#loc71) + %tmp2 = arith.mulf %tmp0_29, %tmp1_35 : tensor<64x64xf32> loc(#loc72) + %tmp5 = arith.addf %_tmp4_14, %tmp2 : tensor<64x64xf32> loc(#loc73) + %_tmp4_36 = arith.select %tmp0_27, %tmp5, %_tmp4_14 : tensor<64x64xi1>, tensor<64x64xf32> loc(#loc74) + scf.yield %_tmp4_36 : tensor<64x64xf32> loc(#loc32) + } loc(#loc55) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_14: f32 loc(callsite(#loc1 at #loc75)), %tmp4_15: f32 loc(callsite(#loc1 at #loc75))): + %tmp4_16 = arith.addf %tmp4_14, %tmp4_15 : f32 loc(#loc79) + tt.reduce.return %tmp4_16 : f32 loc(#loc77) + }) : (tensor<64x64xf32>) -> tensor<64xf32> loc(#loc77) + %tmp4_13 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc76) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc37) + %1 = tt.addptr %0, %xindex_9 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc37) + tt.store %1, %tmp4_13 : tensor<64x1x!tt.ptr> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":33:40) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":30:19) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":23:33) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:36) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:44) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":24:23) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":34:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":35:29) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:41) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:55) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:68) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:34) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:73) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":39:127) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:45) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:41) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:34) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:50) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":40:104) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":41:22) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":43:23) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:40) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":44:8) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":45:28) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:36) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/dq/cdqxxevdyssoyut2euw55y27cahqqcgmvyuhdihb4tmner7cfc7f.py":49:4) +#loc45 = loc("x2"(#loc3)) +#loc46 = loc("x1"(#loc4)) +#loc47 = loc("xoffset"(#loc5)) +#loc48 = loc("xoffset"(#loc6)) +#loc49 = loc("xindex"(#loc7)) +#loc50 = loc("xindex"(#loc8)) +#loc51 = loc("xindex"(#loc9)) +#loc52 = loc("r0_base"(#loc10)) +#loc53 = loc("x0"(#loc11)) +#loc54 = loc("x1"(#loc12)) +#loc55 = loc("_tmp4"(#loc2)) +#loc56 = loc("r0_index"(#loc13)) +#loc57 = loc("r0_mask"(#loc14)) +#loc58 = loc("tmp0"(#loc15)) +#loc59 = loc("tmp0"(#loc16)) +#loc60 = loc("tmp0"(#loc17)) +#loc61 = loc("tmp0"(#loc18)) +#loc62 = loc("tmp0"(#loc19)) +#loc63 = loc("tmp0"(#loc20)) +#loc64 = loc("tmp0"(#loc21)) +#loc65 = loc("tmp0"(#loc22)) +#loc66 = loc("tmp0"(#loc23)) +#loc67 = loc("tmp1"(#loc24)) +#loc68 = loc("tmp1"(#loc25)) +#loc69 = loc("tmp1"(#loc26)) +#loc70 = loc("tmp1"(#loc27)) +#loc71 = loc("tmp1"(#loc28)) +#loc72 = loc("tmp2"(#loc29)) +#loc73 = loc("tmp5"(#loc30)) +#loc74 = loc("_tmp4"(#loc31)) +#loc76 = loc("tmp4"(#loc36)) +#loc77 = loc(callsite(#loc33 at #loc75)) +#loc79 = loc(callsite(#loc35 at #loc77)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..475a743a210a8473b5b6af138ef1b7f8ffb8f8e2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/__grp__triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.llir", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin", "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..205071b9100094ab3e27cd69433e810e3c0be980 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b8f23c2417ceaa38d431ce5344048336932539b5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.json @@ -0,0 +1 @@ +{"hash": "bc63a85e7982b83906634d26dbbbc5216e8d21115485d36241b2e6befa4a472c", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..573cd55f70212da91c194fd75110088176353dff --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ptx @@ -0,0 +1,759 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 // -- Begin function triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_2[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_2[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121}; +.global .align 1 .b8 assertMessage_2[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 54, 32, 60, 32, 107, 115, 51}; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[65] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 50, 51, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 109, 98, 47, 99, 109, 98, 53, 122, 108, 108, 100, 107, 114, 102, 108, 101, 121, 104, 99, 107, 104, 105, 111, 109, 104, 112, 102, 115, 118, 107, 102, 107, 108, 108, 111, 116, 98, 111, 112, 97, 108, 100, 100, 119, 107, 100, 102, 104, 108, 105, 122, 118, 101, 117, 120, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[64] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 108, 46, 98, 114, 111, 97, 100, 99, 97, 115, 116, 95, 116, 111, 40, 116, 109, 112, 56, 44, 32, 91, 88, 66, 76, 79, 67, 75, 93, 41, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0 +.visible .entry triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7, + .param .u64 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8, + .param .u32 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_11 +) +.reqntid 256 +{ + .reg .pred %p<71>; + .reg .b16 %rs<33>; + .reg .b32 %r<57>; + .reg .b64 %rd<189>; + .loc 1 18 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:18:0 + +// %bb.0: + ld.param.b64 %rd40, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_5]; +$L__tmp0: + .loc 1 19 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 22 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:22:19 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21 + or.b64 %rd41, %rd1, %rd40; + and.b64 %rd42, %rd41, -4294967296; + setp.ne.b64 %p5, %rd42, 0; + cvt.u32.u64 %r56, %rd1; + @%p5 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd185, %rd1, %rd40; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd40; + div.u32 %r11, %r56, %r9; + cvt.u64.u32 %rd185, %r11; +$L__BB0_3: + .loc 1 0 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:21 + ld.param.b64 %rd37, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_6]; + .loc 1 24 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:21 + or.b64 %rd44, %rd2, %rd40; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p6, %rd45, 0; + cvt.u32.u64 %r55, %rd2; + @%p6 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd186, %rd2, %rd40; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd40; + div.u32 %r14, %r55, %r12; + cvt.u64.u32 %rd186, %r14; +$L__BB0_6: + .loc 1 22 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:22:19 + mul.lo.s64 %rd43, %rd185, %rd40; + mul.lo.s64 %rd11, %rd186, %rd40; + .loc 1 24 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:28 + or.b64 %rd46, %rd185, %rd37; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p7, %rd47, 0; + @%p7 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd187, %rd185, %rd37; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd37; + cvt.u32.u64 %r16, %rd185; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd187, %r17; +$L__BB0_9: + .loc 1 0 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:28 + ld.param.b32 %r1, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_9]; + ld.param.b64 %rd38, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_7]; + ld.param.b64 %rd32, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_1]; + ld.param.b64 %rd31, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_0]; + sub.s64 %rd7, %rd1, %rd43; + sub.s64 %rd12, %rd2, %rd11; + .loc 1 24 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:24:28 + or.b64 %rd48, %rd186, %rd37; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p8, %rd49, 0; + @%p8 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd188, %rd186, %rd37; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd37; + cvt.u32.u64 %r19, %rd186; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd188, %r20; +$L__BB0_12: + .loc 1 21 21 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:21:21 + setp.lt.s32 %p12, %r55, %r1; + setp.lt.s32 %p11, %r56, %r1; + .loc 1 25 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:31 + shl.b64 %rd78, %rd1, 1; + add.s64 %rd51, %rd31, %rd78; + add.s64 %rd54, %rd51, 2; + .loc 1 25 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:36 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 26 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:26:31 + shl.b64 %rd79, %rd187, 3; + add.s64 %rd72, %rd32, %rd79; + shl.b64 %rd80, %rd188, 3; + add.s64 %rd76, %rd32, %rd80; + .loc 1 26 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:26:36 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd72 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd76 + 0 ], %rd60; + // end inline asm + .loc 1 28 18 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:28:18 + shr.u64 %rd81, %rd40, 63; + add.s64 %rd82, %rd40, %rd81; + shr.s64 %rd23, %rd82, 1; + .loc 1 29 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:29:19 + setp.ge.s64 %p17, %rd7, %rd23; + setp.ge.s64 %p18, %rd12, %rd23; + .loc 1 30 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:35 + sub.s64 %rd83, %rd1, %rd23; + .loc 1 30 30 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:30 + shl.b64 %rd84, %rd83, 1; + add.s64 %rd65, %rd31, %rd84; + add.s64 %rd68, %rd65, 2; + .loc 1 30 60 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:60 + and.pred %p15, %p11, %p17; + and.pred %p16, %p12, %p18; + .loc 1 30 53 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:53 + // begin inline asm + mov.u64 %rd64, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd64, 1.0; + // end inline asm + mov.b16 %rs16, 0; + // begin inline asm + mov.u16 %rs13, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd65 + 0 ], %rd64; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd68 + 0 ], %rd67; + // end inline asm + .loc 1 31 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:31:35 + // begin inline asm + mov.u64 %rd70, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd70, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd71, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd71 }, [ %rd72 + 0 ], %rd70; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd75 }, [ %rd76 + 0 ], %rd74; + // end inline asm + .loc 1 35 32 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:35:32 + shr.s64 %rd85, %rd71, 63; + and.b64 %rd86, %rd85, %rd38; + add.s64 %rd24, %rd86, %rd71; + shr.s64 %rd87, %rd75, 63; + and.b64 %rd88, %rd87, %rd38; + add.s64 %rd25, %rd88, %rd75; + .loc 1 36 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:28 + setp.lt.s64 %p19, %rd24, 0; + setp.lt.s64 %p20, %rd25, 0; + .loc 1 36 98 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:98 + setp.ge.s64 %p21, %rd24, %rd38; + setp.ge.s64 %p22, %rd25, %rd38; + .loc 1 36 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:64 + or.pred %p23, %p19, %p21; + .loc 1 36 108 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:108 + or.pred %p24, %p20, %p22; + .loc 1 36 106 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:106 + and.pred %p25, %p15, %p23; + .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123 + and.pred %p26, %p16, %p24; + or.pred %p27, %p25, %p26; + not.pred %p28, %p27; + @%p28 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + .loc 1 0 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:123 + ld.param.b64 %rd33, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_2]; + .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123 + bar.sync 0; + .loc 1 37 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:36 + sub.s64 %rd109, %rd7, %rd23; + .loc 1 37 58 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:58 + mul.lo.s64 %rd110, %rd24, %rd40; + mul.lo.s64 %rd111, %rd25, %rd40; + .loc 1 37 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:31 + shl.b64 %rd112, %rd109, 1; + add.s64 %rd113, %rd33, %rd112; + shl.b64 %rd114, %rd110, 1; + add.s64 %rd90, %rd113, %rd114; + sub.s64 %rd26, %rd1, %rd11; + sub.s64 %rd115, %rd26, %rd23; + shl.b64 %rd116, %rd115, 1; + add.s64 %rd117, %rd33, %rd116; + shl.b64 %rd118, %rd111, 1; + add.s64 %rd119, %rd117, %rd118; + add.s64 %rd93, %rd119, 2; + .loc 1 37 65 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:65 + // begin inline asm + mov.u64 %rd89, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd89, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs16; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd90 + 0 ], %rd89; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd92, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs16; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd93 + 0 ], %rd92; + // end inline asm + .loc 1 44 19 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:44:19 + setp.lt.s64 %p37, %rd7, %rd23; + setp.lt.s64 %p38, %rd12, %rd23; + .loc 1 45 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:37 + add.s64 %rd120, %rd40, %rd1; + .loc 1 45 42 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:42 + sub.s64 %rd121, %rd120, %rd23; + .loc 1 45 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:31 + shl.b64 %rd122, %rd121, 1; + add.s64 %rd96, %rd31, %rd122; + add.s64 %rd99, %rd96, 2; + .loc 1 45 68 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:68 + and.pred %p33, %p11, %p37; + and.pred %p34, %p12, %p38; + .loc 1 45 60 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:60 + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, %rs16; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd96 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd98, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd98, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs23, %rs16; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs23 }, [ %rd99 + 0 ], %rd98; + // end inline asm + .loc 1 46 36 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:46:36 + // begin inline asm + mov.u64 %rd101, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd101, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd102, 0x0; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd102 }, [ %rd72 + 0 ], %rd101; + // end inline asm + // begin inline asm + mov.u64 %rd105, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd105, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd106, 0x0; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd106 }, [ %rd76 + 0 ], %rd105; + // end inline asm + .loc 1 50 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:50:35 + shr.s64 %rd123, %rd102, 63; + and.b64 %rd124, %rd123, %rd38; + add.s64 %rd27, %rd124, %rd102; + shr.s64 %rd125, %rd106, 63; + and.b64 %rd126, %rd125, %rd38; + add.s64 %rd28, %rd126, %rd106; + .loc 1 51 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:28 + setp.lt.s64 %p39, %rd27, 0; + setp.lt.s64 %p40, %rd28, 0; + .loc 1 51 100 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:100 + setp.ge.s64 %p41, %rd27, %rd38; + setp.ge.s64 %p42, %rd28, %rd38; + .loc 1 51 65 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:65 + or.pred %p43, %p39, %p41; + .loc 1 51 110 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:110 + or.pred %p44, %p40, %p42; + .loc 1 51 108 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:108 + and.pred %p45, %p33, %p43; + .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126 + and.pred %p46, %p34, %p44; + or.pred %p47, %p45, %p46; + not.pred %p48, %p47; + @%p48 bra $L__BB0_16; + bra.uni $L__BB0_15; +$L__BB0_16: + .loc 1 0 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:126 + ld.param.b64 %rd39, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_8]; + .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126 + bar.sync 0; + .loc 1 52 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:37 + sub.s64 %rd133, %rd40, %rd23; + .loc 1 52 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:64 + mul.lo.s64 %rd134, %rd27, %rd40; + mul.lo.s64 %rd135, %rd28, %rd40; + .loc 1 52 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:31 + shl.b64 %rd136, %rd133, 1; + add.s64 %rd137, %rd33, %rd136; + shl.b64 %rd138, %rd7, 1; + add.s64 %rd139, %rd137, %rd138; + shl.b64 %rd140, %rd134, 1; + add.s64 %rd128, %rd139, %rd140; + shl.b64 %rd141, %rd26, 1; + add.s64 %rd142, %rd137, %rd141; + shl.b64 %rd143, %rd135, 1; + add.s64 %rd144, %rd142, %rd143; + add.s64 %rd131, %rd144, 2; + .loc 1 52 72 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:72 + // begin inline asm + mov.u64 %rd127, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd127, 1.0; + // end inline asm + mov.b16 %rs26, 0; + // begin inline asm + mov.u16 %rs25, %rs26; + @%p33 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs25 }, [ %rd128 + 0 ], %rd127; + // end inline asm + // begin inline asm + mov.u64 %rd130, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd130, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs27, %rs26; + @%p34 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs27 }, [ %rd131 + 0 ], %rd130; + // end inline asm + .loc 1 61 35 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:61:35 + shr.s64 %rd145, %rd57, 63; + and.b64 %rd146, %rd145, %rd39; + add.s64 %rd29, %rd146, %rd57; + shr.s64 %rd147, %rd61, 63; + and.b64 %rd148, %rd147, %rd39; + add.s64 %rd30, %rd148, %rd61; + .loc 1 62 28 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:28 + setp.lt.s64 %p53, %rd29, 0; + setp.lt.s64 %p54, %rd30, 0; + .loc 1 62 46 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:46 + setp.ge.s64 %p55, %rd29, %rd39; + setp.ge.s64 %p56, %rd30, %rd39; + .loc 1 62 38 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:38 + or.pred %p57, %p53, %p55; + .loc 1 62 56 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:56 + or.pred %p58, %p54, %p56; + .loc 1 62 54 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:54 + and.pred %p59, %p11, %p57; + .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64 + and.pred %p60, %p12, %p58; + or.pred %p61, %p59, %p60; + not.pred %p62, %p61; + @%p62 bra $L__BB0_18; + bra.uni $L__BB0_17; +$L__BB0_18: + .loc 1 0 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0:64 + ld.param.b64 %rd35, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_4]; + ld.param.b64 %rd34, [triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0_param_3]; + .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111 + cvt.f32.bf16 %r29, %rs15; + .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123 + cvt.f32.bf16 %r30, %rs19; + .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13 + neg.f32 %r31, %r29; + fma.rn.f32 %r32, %r31, %r30, 0f00000000; + .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0 + selp.f32 %r33, %r32, 0f00000000, %p18; + .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119 + cvt.f32.bf16 %r34, %rs23; + .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131 + cvt.f32.bf16 %r35, %rs27; + .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20 + mul.f32 %r36, %r34, %r35; + .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0 + selp.f32 %r37, %r36, 0f00000000, %p38; + .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20 + add.f32 %r38, %r33, %r37; + .loc 1 30 111 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:30:111 + cvt.f32.bf16 %r39, %rs13; + .loc 1 37 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:37:123 + cvt.f32.bf16 %r40, %rs17; + .loc 1 39 13 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:39:13 + neg.f32 %r41, %r39; + fma.rn.f32 %r42, %r41, %r40, 0f00000000; + .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0 + selp.f32 %r43, %r42, 0f00000000, %p17; + .loc 1 45 119 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:45:119 + cvt.f32.bf16 %r44, %rs21; + .loc 1 52 131 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:52:131 + cvt.f32.bf16 %r45, %rs25; + .loc 1 53 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:53:20 + mul.f32 %r46, %r44, %r45; + .loc 1 0 0 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:0 + selp.f32 %r47, %r46, 0f00000000, %p37; + .loc 1 57 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:57:20 + add.f32 %r48, %r43, %r47; + .loc 1 25 76 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:25:76 + cvt.f32.bf16 %r49, %rs12; + cvt.f32.bf16 %r50, %rs11; + .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64 + bar.sync 0; + .loc 1 63 40 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:40 + mul.lo.s64 %rd157, %rd29, %rd40; + mul.lo.s64 %rd158, %rd30, %rd40; + .loc 1 63 31 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:31 + add.s64 %rd160, %rd34, %rd138; + shl.b64 %rd161, %rd157, 1; + add.s64 %rd150, %rd160, %rd161; + add.s64 %rd163, %rd34, %rd141; + shl.b64 %rd164, %rd158, 1; + add.s64 %rd165, %rd163, %rd164; + add.s64 %rd153, %rd165, 2; + .loc 1 63 48 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:48 + // begin inline asm + mov.u64 %rd151, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd151, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs29, 0x0; + @%p11 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs29 }, [ %rd150 + 0 ], %rd151; + // end inline asm + // begin inline asm + mov.u64 %rd154, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd154, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs30, 0x0; + @%p12 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs30 }, [ %rd153 + 0 ], %rd154; + // end inline asm + .loc 1 63 88 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:63:88 + cvt.f32.bf16 %r51, %rs29; + cvt.f32.bf16 %r52, %rs30; + .loc 1 65 20 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:65:20 + fma.rn.f32 %r53, %r50, %r51, %r48; + fma.rn.f32 %r54, %r49, %r52, %r38; + .loc 1 66 25 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:25 + add.s64 %rd155, %rd35, %rd78; + add.s64 %rd156, %rd155, 2; + .loc 1 66 37 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:37 + cvt.rn.bf16.f32 %rs31, %r53; + cvt.rn.bf16.f32 %rs32, %r54; + // begin inline asm + @%p11 st.global.b16 [ %rd155 + 0 ], { %rs31 }; + // end inline asm + // begin inline asm + @%p12 st.global.b16 [ %rd156 + 0 ], { %rs32 }; + // end inline asm + .loc 1 66 4 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:66:4 + ret; +$L__BB0_13: + .loc 1 36 123 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:36:123 + { // callseq 2, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd179, assertFunc_0; + cvta.global.u64 %rd180, %rd179; + st.param.b64 [param3], %rd180; + mov.b64 %rd181, assertFile_0; + cvta.global.u64 %rd182, %rd181; + st.param.b64 [param1], %rd182; + mov.b64 %rd183, assertMessage_0; + cvta.global.u64 %rd184, %rd183; + st.param.b64 [param0], %rd184; + st.param.b64 [param4], 1; + st.param.b32 [param2], 36; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 2 + trap; +$L__BB0_15: + .loc 1 51 126 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:51:126 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd173, assertFunc_1; + cvta.global.u64 %rd174, %rd173; + st.param.b64 [param3], %rd174; + mov.b64 %rd175, assertFile_1; + cvta.global.u64 %rd176, %rd175; + st.param.b64 [param1], %rd176; + mov.b64 %rd177, assertMessage_1; + cvta.global.u64 %rd178, %rd177; + st.param.b64 [param0], %rd178; + st.param.b64 [param4], 1; + st.param.b32 [param2], 51; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_17: + .loc 1 62 64 // cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py:62:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd167, assertFunc_2; + cvta.global.u64 %rd168, %rd167; + st.param.b64 [param3], %rd168; + mov.b64 %rd169, assertFile_2; + cvta.global.u64 %rd170, %rd169; + st.param.b64 [param1], %rd170; + mov.b64 %rd171, assertMessage_2; + cvta.global.u64 %rd172, %rd171; + st.param.b64 [param0], %rd172; + st.param.b64 [param4], 1; + st.param.b32 [param2], 62; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 109 +.b8 98 +.b8 53 +.b8 122 +.b8 108 +.b8 108 +.b8 100 +.b8 107 +.b8 114 +.b8 102 +.b8 108 +.b8 101 +.b8 121 +.b8 104 +.b8 99 +.b8 107 +.b8 104 +.b8 105 +.b8 111 +.b8 109 +.b8 104 +.b8 112 +.b8 102 +.b8 115 +.b8 118 +.b8 107 +.b8 102 +.b8 107 +.b8 108 +.b8 108 +.b8 111 +.b8 116 +.b8 98 +.b8 111 +.b8 112 +.b8 97 +.b8 108 +.b8 100 +.b8 100 +.b8 119 +.b8 107 +.b8 100 +.b8 102 +.b8 104 +.b8 108 +.b8 105 +.b8 122 +.b8 118 +.b8 101 +.b8 117 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 109 +.b8 98 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source new file mode 100644 index 0000000000000000000000000000000000000000..f8d42709fd265aa0f799b464ce5022cade857641 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.source @@ -0,0 +1,419 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0) +#loc109 = loc("in_ptr0"(#loc)) +#loc110 = loc("in_ptr1"(#loc)) +#loc111 = loc("in_ptr2"(#loc)) +#loc112 = loc("in_ptr3"(#loc)) +#loc113 = loc("out_ptr0"(#loc)) +#loc114 = loc("ks0"(#loc)) +#loc115 = loc("ks1"(#loc)) +#loc116 = loc("ks2"(#loc)) +#loc117 = loc("ks3"(#loc)) +#loc118 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc119) + %xoffset_0 = arith.constant 512 : i32 loc(#loc120) + %xoffset_1 = arith.constant 512 : i32 loc(#loc120) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc120) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc121) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc122) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc122) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc123) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc123) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc124) + %x0_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc124) + %x0_7 = arith.remsi %x0, %x0_6 : tensor<512xi64> loc(#loc124) + %x1 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc125) + %x1_8 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc125) + %x1_9 = arith.divsi %x1, %x1_8 : tensor<512xi64> loc(#loc125) + %x1_10 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc126) + %x1_11 = arith.remsi %x1_9, %x1_10 : tensor<512xi64> loc(#loc126) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc127) + %tmp31_12 = tt.addptr %tmp31, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc127) + %tmp31_13 = tt.load %tmp31_12, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc128) + %tmp31_14 = arith.extf %tmp31_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc129) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp32_15 = tt.addptr %tmp32, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp32_16 = tt.load %tmp32_15, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp1 = arith.constant 2 : i32 loc(#loc132) + %tmp1_17 = arith.constant 2 : i64 loc(#loc132) + %tmp1_18 = arith.divsi %ks0, %tmp1_17 : i64 loc(#loc132) + %tmp2 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc133) + %tmp2_19 = arith.cmpi sge, %x0_7, %tmp2 : tensor<512xi64> loc(#loc133) + %tmp3 = arith.constant 2 : i32 loc(#loc134) + %tmp3_20 = arith.constant 2 : i64 loc(#loc134) + %tmp3_21 = arith.divsi %ks0, %tmp3_20 : i64 loc(#loc134) + %tmp3_22 = arith.constant -1 : i32 loc(#loc135) + %tmp3_23 = arith.constant -1 : i64 loc(#loc135) + %tmp3_24 = arith.muli %tmp3_23, %tmp3_21 : i64 loc(#loc135) + %tmp3_25 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc136) + %tmp3_26 = tt.splat %tmp3_24 : i64 -> tensor<512xi64> loc(#loc136) + %tmp3_27 = arith.addi %tmp3_25, %tmp3_26 : tensor<512xi64> loc(#loc136) + %tmp3_28 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp3_29 = tt.addptr %tmp3_28, %tmp3_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp3_30 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp3_31 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp3_32 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp3_33 = arith.truncf %tmp3_32 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp3_34 = tt.load %tmp3_29, %tmp3_30, %tmp3_33 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp3_35 = arith.extf %tmp3_34 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp4 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc141) + %tmp4_36 = tt.addptr %tmp4, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc141) + %tmp4_37 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc142) + %tmp4_38 = arith.constant 0.000000e+00 : f32 loc(#loc143) + %tmp4_39 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc143) + %tmp4_40 = arith.fptosi %tmp4_39 : tensor<512xf32> to tensor<512xi64> loc(#loc143) + %tmp4_41 = tt.load %tmp4_36, %tmp4_37, %tmp4_40 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc143) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc144) + %tmp6 = arith.addi %tmp4_41, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp7 = arith.constant 0 : i32 loc(#loc146) + %tmp7_42 = arith.extsi %tmp7 : i32 to i64 loc(#loc146) + %tmp7_43 = tt.splat %tmp7_42 : i64 -> tensor<512xi64> loc(#loc146) + %tmp7_44 = arith.cmpi slt, %tmp4_41, %tmp7_43 : tensor<512xi64> loc(#loc146) + %tmp8 = arith.select %tmp7_44, %tmp6, %tmp4_41 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %c0_i32 = arith.constant 0 : i32 loc(#loc30) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc30) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc30) + %2 = arith.cmpi sle, %1, %tmp8 : tensor<512xi64> loc(#loc30) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc31) + %4 = arith.cmpi slt, %tmp8, %3 : tensor<512xi64> loc(#loc31) + %5 = arith.andi %2, %4 : tensor<512xi1> loc(#loc32) + %6 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc33) + %true = arith.constant true loc(#loc34) + %cst = arith.constant dense : tensor<512xi1> loc(#loc34) + %7 = arith.xori %6, %cst : tensor<512xi1> loc(#loc34) + %8 = arith.ori %5, %7 : tensor<512xi1> loc(#loc35) + tt.assert %8, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc36) + %tmp10 = arith.constant 2 : i32 loc(#loc148) + %tmp10_45 = arith.constant 2 : i64 loc(#loc148) + %tmp10_46 = arith.divsi %ks0, %tmp10_45 : i64 loc(#loc148) + %tmp10_47 = arith.constant -1 : i32 loc(#loc149) + %tmp10_48 = arith.constant -1 : i64 loc(#loc149) + %tmp10_49 = arith.muli %tmp10_48, %tmp10_46 : i64 loc(#loc149) + %tmp10_50 = tt.splat %tmp10_49 : i64 -> tensor<512xi64> loc(#loc150) + %tmp10_51 = arith.addi %x0_7, %tmp10_50 : tensor<512xi64> loc(#loc150) + %tmp10_52 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc151) + %tmp10_53 = arith.muli %tmp10_52, %tmp8 : tensor<512xi64> loc(#loc151) + %tmp10_54 = arith.addi %tmp10_51, %tmp10_53 : tensor<512xi64> loc(#loc152) + %tmp10_55 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc153) + %tmp10_56 = tt.addptr %tmp10_55, %tmp10_54 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc153) + %tmp10_57 = arith.andi %tmp2_19, %xmask_5 : tensor<512xi1> loc(#loc154) + %tmp10_58 = arith.constant 0.000000e+00 : f32 loc(#loc155) + %tmp10_59 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc155) + %tmp10_60 = arith.truncf %tmp10_59 : tensor<512xf32> to tensor<512xbf16> loc(#loc155) + %tmp10_61 = tt.load %tmp10_56, %tmp10_57, %tmp10_60 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc155) + %tmp10_62 = arith.extf %tmp10_61 : tensor<512xbf16> to tensor<512xf32> loc(#loc156) + %tmp11 = arith.mulf %tmp3_35, %tmp10_62 : tensor<512xf32> loc(#loc157) + %tmp12 = arith.constant 0.000000e+00 : f32 loc(#loc158) + %tmp12_63 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc158) + %tmp12_64 = arith.subf %tmp12_63, %tmp11 : tensor<512xf32> loc(#loc158) + %tmp13 = arith.constant 0.000000e+00 : f32 loc(#loc159) + %tmp13_65 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc159) + %tmp14 = arith.select %tmp2_19, %tmp12_64, %tmp13_65 : tensor<512xi1>, tensor<512xf32> loc(#loc160) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc161) + %tmp16 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc162) + %tmp16_66 = arith.select %tmp2_19, %tmp14, %tmp16 : tensor<512xi1>, tensor<512xf32> loc(#loc162) + %tmp17 = tt.splat %tmp1_18 : i64 -> tensor<512xi64> loc(#loc163) + %tmp17_67 = arith.cmpi slt, %x0_7, %tmp17 : tensor<512xi64> loc(#loc163) + %tmp18 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc164) + %tmp18_68 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc164) + %tmp18_69 = arith.addi %tmp18_68, %tmp18 : tensor<512xi64> loc(#loc164) + %tmp18_70 = arith.constant 2 : i32 loc(#loc165) + %tmp18_71 = arith.constant 2 : i64 loc(#loc165) + %tmp18_72 = arith.divsi %ks0, %tmp18_71 : i64 loc(#loc165) + %tmp18_73 = arith.constant -1 : i32 loc(#loc166) + %tmp18_74 = arith.constant -1 : i64 loc(#loc166) + %tmp18_75 = arith.muli %tmp18_74, %tmp18_72 : i64 loc(#loc166) + %tmp18_76 = tt.splat %tmp18_75 : i64 -> tensor<512xi64> loc(#loc167) + %tmp18_77 = arith.addi %tmp18_69, %tmp18_76 : tensor<512xi64> loc(#loc167) + %tmp18_78 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc168) + %tmp18_79 = tt.addptr %tmp18_78, %tmp18_77 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc168) + %tmp18_80 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc169) + %tmp18_81 = arith.constant 0.000000e+00 : f32 loc(#loc170) + %tmp18_82 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc170) + %tmp18_83 = arith.truncf %tmp18_82 : tensor<512xf32> to tensor<512xbf16> loc(#loc170) + %tmp18_84 = tt.load %tmp18_79, %tmp18_80, %tmp18_83 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc170) + %tmp18_85 = arith.extf %tmp18_84 : tensor<512xbf16> to tensor<512xf32> loc(#loc171) + %tmp19 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc172) + %tmp19_86 = tt.addptr %tmp19, %x1_11 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc172) + %tmp19_87 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc173) + %tmp19_88 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %tmp19_89 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc174) + %tmp19_90 = arith.fptosi %tmp19_89 : tensor<512xf32> to tensor<512xi64> loc(#loc174) + %tmp19_91 = tt.load %tmp19_86, %tmp19_87, %tmp19_90 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc174) + %tmp20 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc175) + %tmp21 = arith.addi %tmp19_91, %tmp20 : tensor<512xi64> loc(#loc176) + %tmp22 = arith.constant 0 : i32 loc(#loc177) + %tmp22_92 = arith.extsi %tmp22 : i32 to i64 loc(#loc177) + %tmp22_93 = tt.splat %tmp22_92 : i64 -> tensor<512xi64> loc(#loc177) + %tmp22_94 = arith.cmpi slt, %tmp19_91, %tmp22_93 : tensor<512xi64> loc(#loc177) + %tmp23 = arith.select %tmp22_94, %tmp21, %tmp19_91 : tensor<512xi1>, tensor<512xi64> loc(#loc178) + %c0_i32_95 = arith.constant 0 : i32 loc(#loc68) + %9 = arith.extsi %c0_i32_95 : i32 to i64 loc(#loc68) + %10 = tt.splat %9 : i64 -> tensor<512xi64> loc(#loc68) + %11 = arith.cmpi sle, %10, %tmp23 : tensor<512xi64> loc(#loc68) + %12 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc69) + %13 = arith.cmpi slt, %tmp23, %12 : tensor<512xi64> loc(#loc69) + %14 = arith.andi %11, %13 : tensor<512xi1> loc(#loc70) + %15 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc71) + %true_96 = arith.constant true loc(#loc72) + %cst_97 = arith.constant dense : tensor<512xi1> loc(#loc72) + %16 = arith.xori %15, %cst_97 : tensor<512xi1> loc(#loc72) + %17 = arith.ori %14, %16 : tensor<512xi1> loc(#loc73) + tt.assert %17, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc74) + %tmp25 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc179) + %tmp25_98 = arith.addi %tmp25, %x0_7 : tensor<512xi64> loc(#loc179) + %tmp25_99 = arith.constant 2 : i32 loc(#loc180) + %tmp25_100 = arith.constant 2 : i64 loc(#loc180) + %tmp25_101 = arith.divsi %ks0, %tmp25_100 : i64 loc(#loc180) + %tmp25_102 = arith.constant -1 : i32 loc(#loc181) + %tmp25_103 = arith.constant -1 : i64 loc(#loc181) + %tmp25_104 = arith.muli %tmp25_103, %tmp25_101 : i64 loc(#loc181) + %tmp25_105 = tt.splat %tmp25_104 : i64 -> tensor<512xi64> loc(#loc182) + %tmp25_106 = arith.addi %tmp25_98, %tmp25_105 : tensor<512xi64> loc(#loc182) + %tmp25_107 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc183) + %tmp25_108 = arith.muli %tmp25_107, %tmp23 : tensor<512xi64> loc(#loc183) + %tmp25_109 = arith.addi %tmp25_106, %tmp25_108 : tensor<512xi64> loc(#loc184) + %tmp25_110 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc185) + %tmp25_111 = tt.addptr %tmp25_110, %tmp25_109 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc185) + %tmp25_112 = arith.andi %tmp17_67, %xmask_5 : tensor<512xi1> loc(#loc186) + %tmp25_113 = arith.constant 0.000000e+00 : f32 loc(#loc187) + %tmp25_114 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc187) + %tmp25_115 = arith.truncf %tmp25_114 : tensor<512xf32> to tensor<512xbf16> loc(#loc187) + %tmp25_116 = tt.load %tmp25_111, %tmp25_112, %tmp25_115 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc187) + %tmp25_117 = arith.extf %tmp25_116 : tensor<512xbf16> to tensor<512xf32> loc(#loc188) + %tmp26 = arith.mulf %tmp18_85, %tmp25_117 : tensor<512xf32> loc(#loc189) + %tmp27 = arith.constant 0.000000e+00 : f32 loc(#loc190) + %tmp27_118 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc190) + %tmp28 = arith.select %tmp17_67, %tmp26, %tmp27_118 : tensor<512xi1>, tensor<512xf32> loc(#loc191) + %tmp29 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc192) + %tmp29_119 = arith.select %tmp17_67, %tmp28, %tmp29 : tensor<512xi1>, tensor<512xf32> loc(#loc192) + %tmp30 = arith.addf %tmp16_66, %tmp29_119 : tensor<512xf32> loc(#loc193) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc194) + %tmp34_120 = arith.addi %tmp32_16, %tmp34 : tensor<512xi64> loc(#loc194) + %tmp35 = arith.constant 0 : i32 loc(#loc195) + %tmp35_121 = arith.extsi %tmp35 : i32 to i64 loc(#loc195) + %tmp35_122 = tt.splat %tmp35_121 : i64 -> tensor<512xi64> loc(#loc195) + %tmp35_123 = arith.cmpi slt, %tmp32_16, %tmp35_122 : tensor<512xi64> loc(#loc195) + %tmp36 = arith.select %tmp35_123, %tmp34_120, %tmp32_16 : tensor<512xi1>, tensor<512xi64> loc(#loc196) + %c0_i32_124 = arith.constant 0 : i32 loc(#loc93) + %18 = arith.extsi %c0_i32_124 : i32 to i64 loc(#loc93) + %19 = tt.splat %18 : i64 -> tensor<512xi64> loc(#loc93) + %20 = arith.cmpi sle, %19, %tmp36 : tensor<512xi64> loc(#loc93) + %21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc94) + %22 = arith.cmpi slt, %tmp36, %21 : tensor<512xi64> loc(#loc94) + %23 = arith.andi %20, %22 : tensor<512xi1> loc(#loc95) + %true_125 = arith.constant true loc(#loc96) + %cst_126 = arith.constant dense : tensor<512xi1> loc(#loc96) + %24 = arith.xori %xmask_5, %cst_126 : tensor<512xi1> loc(#loc96) + %25 = arith.ori %23, %24 : tensor<512xi1> loc(#loc97) + tt.assert %25, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc98) + %tmp38 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc197) + %tmp38_127 = arith.muli %tmp38, %tmp36 : tensor<512xi64> loc(#loc197) + %tmp38_128 = arith.addi %x0_7, %tmp38_127 : tensor<512xi64> loc(#loc198) + %tmp38_129 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc199) + %tmp38_130 = tt.addptr %tmp38_129, %tmp38_128 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc199) + %tmp38_131 = tt.load %tmp38_130, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc200) + %tmp38_132 = arith.extf %tmp38_131 : tensor<512xbf16> to tensor<512xf32> loc(#loc201) + %tmp39 = arith.mulf %tmp31_14, %tmp38_132 : tensor<512xf32> loc(#loc202) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc203) + %26 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc106) + %27 = tt.addptr %26, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc106) + %28 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc107) + tt.store %27, %28, %xmask_5 : tensor<512x!tt.ptr> loc(#loc107) + tt.return loc(#loc108) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:48) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:30) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:42) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:115) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:49) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:72) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":40:38) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":42:12) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:55) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:31) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:44) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":47:33) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:118) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:55) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:48) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:80) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":54:38) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4) +#loc119 = loc("xoffset"(#loc1)) +#loc120 = loc("xoffset"(#loc2)) +#loc121 = loc("xindex"(#loc3)) +#loc122 = loc("xindex"(#loc4)) +#loc123 = loc("xmask"(#loc5)) +#loc124 = loc("x0"(#loc6)) +#loc125 = loc("x1"(#loc7)) +#loc126 = loc("x1"(#loc8)) +#loc127 = loc("tmp31"(#loc9)) +#loc128 = loc("tmp31"(#loc10)) +#loc129 = loc("tmp31"(#loc11)) +#loc130 = loc("tmp32"(#loc12)) +#loc131 = loc("tmp32"(#loc13)) +#loc132 = loc("tmp1"(#loc14)) +#loc133 = loc("tmp2"(#loc15)) +#loc134 = loc("tmp3"(#loc16)) +#loc135 = loc("tmp3"(#loc17)) +#loc136 = loc("tmp3"(#loc18)) +#loc137 = loc("tmp3"(#loc19)) +#loc138 = loc("tmp3"(#loc20)) +#loc139 = loc("tmp3"(#loc21)) +#loc140 = loc("tmp3"(#loc22)) +#loc141 = loc("tmp4"(#loc23)) +#loc142 = loc("tmp4"(#loc24)) +#loc143 = loc("tmp4"(#loc25)) +#loc144 = loc("tmp5"(#loc26)) +#loc145 = loc("tmp6"(#loc27)) +#loc146 = loc("tmp7"(#loc28)) +#loc147 = loc("tmp8"(#loc29)) +#loc148 = loc("tmp10"(#loc37)) +#loc149 = loc("tmp10"(#loc38)) +#loc150 = loc("tmp10"(#loc39)) +#loc151 = loc("tmp10"(#loc40)) +#loc152 = loc("tmp10"(#loc41)) +#loc153 = loc("tmp10"(#loc42)) +#loc154 = loc("tmp10"(#loc43)) +#loc155 = loc("tmp10"(#loc44)) +#loc156 = loc("tmp10"(#loc45)) +#loc157 = loc("tmp11"(#loc46)) +#loc158 = loc("tmp12"(#loc47)) +#loc159 = loc("tmp13"(#loc48)) +#loc160 = loc("tmp14"(#loc49)) +#loc161 = loc("tmp15"(#loc50)) +#loc162 = loc("tmp16"(#loc51)) +#loc163 = loc("tmp17"(#loc52)) +#loc164 = loc("tmp18"(#loc53)) +#loc165 = loc("tmp18"(#loc54)) +#loc166 = loc("tmp18"(#loc55)) +#loc167 = loc("tmp18"(#loc56)) +#loc168 = loc("tmp18"(#loc57)) +#loc169 = loc("tmp18"(#loc58)) +#loc170 = loc("tmp18"(#loc59)) +#loc171 = loc("tmp18"(#loc60)) +#loc172 = loc("tmp19"(#loc61)) +#loc173 = loc("tmp19"(#loc62)) +#loc174 = loc("tmp19"(#loc63)) +#loc175 = loc("tmp20"(#loc64)) +#loc176 = loc("tmp21"(#loc65)) +#loc177 = loc("tmp22"(#loc66)) +#loc178 = loc("tmp23"(#loc67)) +#loc179 = loc("tmp25"(#loc75)) +#loc180 = loc("tmp25"(#loc76)) +#loc181 = loc("tmp25"(#loc77)) +#loc182 = loc("tmp25"(#loc78)) +#loc183 = loc("tmp25"(#loc79)) +#loc184 = loc("tmp25"(#loc80)) +#loc185 = loc("tmp25"(#loc81)) +#loc186 = loc("tmp25"(#loc82)) +#loc187 = loc("tmp25"(#loc83)) +#loc188 = loc("tmp25"(#loc84)) +#loc189 = loc("tmp26"(#loc85)) +#loc190 = loc("tmp27"(#loc86)) +#loc191 = loc("tmp28"(#loc87)) +#loc192 = loc("tmp29"(#loc88)) +#loc193 = loc("tmp30"(#loc89)) +#loc194 = loc("tmp34"(#loc90)) +#loc195 = loc("tmp35"(#loc91)) +#loc196 = loc("tmp36"(#loc92)) +#loc197 = loc("tmp38"(#loc99)) +#loc198 = loc("tmp38"(#loc100)) +#loc199 = loc("tmp38"(#loc101)) +#loc200 = loc("tmp38"(#loc102)) +#loc201 = loc("tmp38"(#loc103)) +#loc202 = loc("tmp39"(#loc104)) +#loc203 = loc("tmp40"(#loc105)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..ff73da74dad014304de207b12d354ad6526dd9fb --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttgir @@ -0,0 +1,284 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64, #blocked> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64, #blocked> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64, #blocked> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1, #blocked> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64, #blocked> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst_1 : tensor<512xi64, #blocked> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst_1 : tensor<512xi64, #blocked> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64, #blocked> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst : tensor<512xi1, #blocked> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64, #blocked> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64, #blocked> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32, #blocked> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32, #blocked> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64, #blocked> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64, #blocked> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1, #blocked> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst_1 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64, #blocked> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst_1 : tensor<512xi64, #blocked> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst : tensor<512xi1, #blocked> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1, #blocked> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1, #blocked> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64, #blocked> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64, #blocked> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64, #blocked> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64, #blocked> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32, #blocked> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32, #blocked> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64, #blocked> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst_1 : tensor<512xi64, #blocked> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst_1 : tensor<512xi64, #blocked> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64, #blocked> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1, #blocked> loc(#loc77) + %13 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1, #blocked> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1, #blocked> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64, #blocked> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64, #blocked> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32, #blocked> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32, #blocked> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c014f757a26df38ed2b7ebc0ef00208e69c8e3d8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XRR2QXTZQK4DSBTDJUTNXO6FEFXI2IIRKSC5GYSBWLTL56SKI4WA/triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0.ttir @@ -0,0 +1,283 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":18:0) +#loc91 = loc("in_ptr0"(#loc)) +#loc92 = loc("in_ptr1"(#loc)) +#loc93 = loc("in_ptr2"(#loc)) +#loc94 = loc("in_ptr3"(#loc)) +#loc95 = loc("out_ptr0"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("ks1"(#loc)) +#loc98 = loc("ks2"(#loc)) +#loc99 = loc("ks3"(#loc)) +#loc100 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_index_mul_neg_slice_slice_backward_squeeze_unsqueeze_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_1 = arith.constant dense : tensor<512xi1> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc103) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc104) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc104) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc105) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc105) + %x0 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc106) + %x0_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc106) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<512xi64> loc(#loc106) + %x1 = arith.divsi %x0, %x0_7 : tensor<512xi64> loc(#loc107) + %x1_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc108) + %x1_10 = arith.remsi %x1, %x1_9 : tensor<512xi64> loc(#loc108) + %tmp31 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc109) + %tmp31_11 = tt.addptr %tmp31, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc109) + %tmp31_12 = tt.load %tmp31_11, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc110) + %tmp31_13 = arith.extf %tmp31_12 : tensor<512xbf16> to tensor<512xf32> loc(#loc111) + %tmp32 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc112) + %tmp32_14 = tt.addptr %tmp32, %x1_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc112) + %tmp32_15 = tt.load %tmp32_14, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc113) + %tmp1 = arith.divsi %ks0, %c2_i64 : i64 loc(#loc114) + %tmp2 = tt.splat %tmp1 : i64 -> tensor<512xi64> loc(#loc115) + %tmp2_16 = arith.cmpi sge, %x0_8, %tmp2 : tensor<512xi64> loc(#loc115) + %tmp3 = arith.muli %tmp1, %c-1_i64 : i64 loc(#loc116) + %tmp3_17 = tt.splat %tmp3 : i64 -> tensor<512xi64> loc(#loc117) + %tmp3_18 = arith.addi %x0, %tmp3_17 : tensor<512xi64> loc(#loc117) + %tmp3_19 = tt.addptr %tmp31, %tmp3_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc118) + %tmp3_20 = arith.andi %tmp2_16, %xmask_6 : tensor<512xi1> loc(#loc119) + %tmp3_21 = tt.load %tmp3_19, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc120) + %tmp3_22 = arith.extf %tmp3_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc121) + %tmp4 = tt.load %tmp32_14, %tmp3_20, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp5 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc123) + %tmp6 = arith.addi %tmp4, %tmp5 : tensor<512xi64> loc(#loc124) + %tmp7 = arith.cmpi slt, %tmp4, %cst : tensor<512xi64> loc(#loc125) + %tmp8 = arith.select %tmp7, %tmp6, %tmp4 : tensor<512xi1>, tensor<512xi64> loc(#loc126) + %0 = arith.cmpi sge, %tmp8, %cst : tensor<512xi64> loc(#loc28) + %1 = arith.cmpi slt, %tmp8, %tmp5 : tensor<512xi64> loc(#loc29) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc30) + %3 = arith.xori %tmp3_20, %cst_1 : tensor<512xi1> loc(#loc31) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc32) + tt.assert %4, "index out of bounds: 0 <= tl.broadcast_to(tmp8, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc33) + %tmp10 = arith.addi %x0_8, %tmp3_17 : tensor<512xi64> loc(#loc127) + %tmp10_23 = arith.muli %x0_7, %tmp8 : tensor<512xi64> loc(#loc128) + %tmp10_24 = arith.addi %tmp10, %tmp10_23 : tensor<512xi64> loc(#loc129) + %tmp10_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp10_26 = tt.addptr %tmp10_25, %tmp10_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp10_27 = tt.load %tmp10_26, %tmp3_20, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp10_28 = arith.extf %tmp10_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp11 = arith.mulf %tmp3_22, %tmp10_28 : tensor<512xf32> loc(#loc133) + %tmp12 = arith.subf %cst_2, %tmp11 : tensor<512xf32> loc(#loc134) + %tmp16 = arith.select %tmp2_16, %tmp12, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc169) + %tmp17 = arith.cmpi slt, %x0_8, %tmp2 : tensor<512xi64> loc(#loc137) + %tmp18 = arith.addi %x0_7, %x0 : tensor<512xi64> loc(#loc138) + %tmp18_29 = arith.addi %tmp18, %tmp3_17 : tensor<512xi64> loc(#loc139) + %tmp18_30 = tt.addptr %tmp31, %tmp18_29 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc140) + %tmp18_31 = arith.andi %tmp17, %xmask_6 : tensor<512xi1> loc(#loc141) + %tmp18_32 = tt.load %tmp18_30, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc142) + %tmp18_33 = arith.extf %tmp18_32 : tensor<512xbf16> to tensor<512xf32> loc(#loc143) + %tmp19 = tt.load %tmp32_14, %tmp18_31, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc144) + %tmp21 = arith.addi %tmp19, %tmp5 : tensor<512xi64> loc(#loc145) + %tmp22 = arith.cmpi slt, %tmp19, %cst : tensor<512xi64> loc(#loc146) + %tmp23 = arith.select %tmp22, %tmp21, %tmp19 : tensor<512xi1>, tensor<512xi64> loc(#loc147) + %5 = arith.cmpi sge, %tmp23, %cst : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp23, %tmp5 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.xori %tmp18_31, %cst_1 : tensor<512xi1> loc(#loc58) + %9 = arith.ori %7, %8 : tensor<512xi1> loc(#loc59) + tt.assert %9, "index out of bounds: 0 <= tl.broadcast_to(tmp23, [XBLOCK]) < ks2" : tensor<512xi1> loc(#loc60) + %tmp25 = arith.addi %x0_7, %x0_8 : tensor<512xi64> loc(#loc148) + %tmp25_34 = arith.addi %tmp25, %tmp3_17 : tensor<512xi64> loc(#loc149) + %tmp25_35 = arith.muli %x0_7, %tmp23 : tensor<512xi64> loc(#loc150) + %tmp25_36 = arith.addi %tmp25_34, %tmp25_35 : tensor<512xi64> loc(#loc151) + %tmp25_37 = tt.addptr %tmp10_25, %tmp25_36 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc152) + %tmp25_38 = tt.load %tmp25_37, %tmp18_31, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc153) + %tmp25_39 = arith.extf %tmp25_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc154) + %tmp26 = arith.mulf %tmp18_33, %tmp25_39 : tensor<512xf32> loc(#loc155) + %tmp29 = arith.select %tmp17, %tmp26, %cst_2 : tensor<512xi1>, tensor<512xf32> loc(#loc170) + %tmp30 = arith.addf %tmp16, %tmp29 : tensor<512xf32> loc(#loc158) + %tmp34 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc159) + %tmp34_40 = arith.addi %tmp32_15, %tmp34 : tensor<512xi64> loc(#loc159) + %tmp35 = arith.cmpi slt, %tmp32_15, %cst : tensor<512xi64> loc(#loc160) + %tmp36 = arith.select %tmp35, %tmp34_40, %tmp32_15 : tensor<512xi1>, tensor<512xi64> loc(#loc161) + %10 = arith.cmpi sge, %tmp36, %cst : tensor<512xi64> loc(#loc75) + %11 = arith.cmpi slt, %tmp36, %tmp34 : tensor<512xi64> loc(#loc76) + %12 = arith.andi %10, %11 : tensor<512xi1> loc(#loc77) + %13 = arith.xori %xmask_6, %cst_1 : tensor<512xi1> loc(#loc78) + %14 = arith.ori %12, %13 : tensor<512xi1> loc(#loc79) + tt.assert %14, "index out of bounds: 0 <= tmp36 < ks3" : tensor<512xi1> loc(#loc80) + %tmp38 = arith.muli %x0_7, %tmp36 : tensor<512xi64> loc(#loc162) + %tmp38_41 = arith.addi %x0_8, %tmp38 : tensor<512xi64> loc(#loc163) + %tmp38_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc164) + %tmp38_43 = tt.addptr %tmp38_42, %tmp38_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc164) + %tmp38_44 = tt.load %tmp38_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc165) + %tmp38_45 = arith.extf %tmp38_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc166) + %tmp39 = arith.mulf %tmp31_13, %tmp38_45 : tensor<512xf32> loc(#loc167) + %tmp40 = arith.addf %tmp30, %tmp39 : tensor<512xf32> loc(#loc168) + %15 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc88) + %16 = tt.addptr %15, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc88) + %17 = arith.truncf %tmp40 : tensor<512xf32> to tensor<512xbf16> loc(#loc89) + tt.store %16, %17, %xmask_6 : tensor<512x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":22:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":24:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:31) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":25:76) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":26:36) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":28:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":29:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:35) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:30) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:60) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:53) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":30:111) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":31:35) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":32:32) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":33:18) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":34:18) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":35:32) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:98) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:64) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:108) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:106) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":36:123) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:58) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:65) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":37:123) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":38:19) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":39:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":43:34) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":41:34) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":44:19) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:68) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:60) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":45:119) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":46:36) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":48:20) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":49:20) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":50:35) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:100) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:65) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:110) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:108) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":51:126) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:37) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:42) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:64) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:60) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:31) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:72) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":52:131) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":53:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":56:35) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":55:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":57:20) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":59:20) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":60:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":61:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:28) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:46) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:38) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:56) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:54) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":62:64) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:40) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:36) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:31) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:48) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":63:88) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":64:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":65:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:37) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/mb/cmb5zlldkrfleyhckhiomhpfsvkfkllotbopalddwkdfhlizveux.py":66:4) +#loc101 = loc("xoffset"(#loc2)) +#loc102 = loc("xoffset"(#loc3)) +#loc103 = loc("xindex"(#loc4)) +#loc104 = loc("xindex"(#loc5)) +#loc105 = loc("xmask"(#loc6)) +#loc106 = loc("x0"(#loc7)) +#loc107 = loc("x1"(#loc8)) +#loc108 = loc("x1"(#loc9)) +#loc109 = loc("tmp31"(#loc10)) +#loc110 = loc("tmp31"(#loc11)) +#loc111 = loc("tmp31"(#loc12)) +#loc112 = loc("tmp32"(#loc13)) +#loc113 = loc("tmp32"(#loc14)) +#loc114 = loc("tmp1"(#loc15)) +#loc115 = loc("tmp2"(#loc16)) +#loc116 = loc("tmp3"(#loc17)) +#loc117 = loc("tmp3"(#loc18)) +#loc118 = loc("tmp3"(#loc19)) +#loc119 = loc("tmp3"(#loc20)) +#loc120 = loc("tmp3"(#loc21)) +#loc121 = loc("tmp3"(#loc22)) +#loc122 = loc("tmp4"(#loc23)) +#loc123 = loc("tmp5"(#loc24)) +#loc124 = loc("tmp6"(#loc25)) +#loc125 = loc("tmp7"(#loc26)) +#loc126 = loc("tmp8"(#loc27)) +#loc127 = loc("tmp10"(#loc34)) +#loc128 = loc("tmp10"(#loc35)) +#loc129 = loc("tmp10"(#loc36)) +#loc130 = loc("tmp10"(#loc37)) +#loc131 = loc("tmp10"(#loc38)) +#loc132 = loc("tmp10"(#loc39)) +#loc133 = loc("tmp11"(#loc40)) +#loc134 = loc("tmp12"(#loc41)) +#loc135 = loc("tmp16"(#loc42)) +#loc136 = loc("tmp14"(#loc43)) +#loc137 = loc("tmp17"(#loc44)) +#loc138 = loc("tmp18"(#loc45)) +#loc139 = loc("tmp18"(#loc46)) +#loc140 = loc("tmp18"(#loc47)) +#loc141 = loc("tmp18"(#loc48)) +#loc142 = loc("tmp18"(#loc49)) +#loc143 = loc("tmp18"(#loc50)) +#loc144 = loc("tmp19"(#loc51)) +#loc145 = loc("tmp21"(#loc52)) +#loc146 = loc("tmp22"(#loc53)) +#loc147 = loc("tmp23"(#loc54)) +#loc148 = loc("tmp25"(#loc61)) +#loc149 = loc("tmp25"(#loc62)) +#loc150 = loc("tmp25"(#loc63)) +#loc151 = loc("tmp25"(#loc64)) +#loc152 = loc("tmp25"(#loc65)) +#loc153 = loc("tmp25"(#loc66)) +#loc154 = loc("tmp25"(#loc67)) +#loc155 = loc("tmp26"(#loc68)) +#loc156 = loc("tmp29"(#loc69)) +#loc157 = loc("tmp28"(#loc70)) +#loc158 = loc("tmp30"(#loc71)) +#loc159 = loc("tmp34"(#loc72)) +#loc160 = loc("tmp35"(#loc73)) +#loc161 = loc("tmp36"(#loc74)) +#loc162 = loc("tmp38"(#loc81)) +#loc163 = loc("tmp38"(#loc82)) +#loc164 = loc("tmp38"(#loc83)) +#loc165 = loc("tmp38"(#loc84)) +#loc166 = loc("tmp38"(#loc85)) +#loc167 = loc("tmp39"(#loc86)) +#loc168 = loc("tmp40"(#loc87)) +#loc169 = loc(fused[#loc135, #loc136]) +#loc170 = loc(fused[#loc156, #loc157]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/__grp__triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/__grp__triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cd98dc24dbe88601e3ab9642a74acabb6b0181d0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/__grp__triton_tem_fused_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3612369944434bba900b339f6186188e4cf4e8fc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.json @@ -0,0 +1 @@ +{"hash": "bce683064766017804850e391c72d2ce258dad59ae686e0a22dcfd1b2738e0ec", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..b0e36771fadd8f25cf8dbc33286eabfeec3b0918 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.llir @@ -0,0 +1,5502 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, ptr addrspace(1) readnone captures(none) %17, ptr addrspace(1) readnone captures(none) %18) local_unnamed_addr #0 !dbg !5 { + %20 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %21 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %22 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %23 = and i32 %21, 1, !dbg !11 + %24 = shl nuw nsw i32 %21, 12, !dbg !12 + %25 = mul i32 %24, %11, !dbg !13 + %26 = shl nuw nsw i32 %22, 7, !dbg !14 + %27 = add i32 %25, %26, !dbg !15 + %28 = shl nuw nsw i32 %23, 10, !dbg !16 + %29 = shl nuw nsw i32 %22, 5, !dbg !17 + %30 = and i32 %29, 2097024, !dbg !17 + %reass.add = add nuw nsw i32 %28, %30, !dbg !18 + %reass.mul = mul i32 %reass.add, %12, !dbg !18 + %31 = sext i32 %27 to i64, !dbg !19 + %32 = getelementptr bfloat, ptr addrspace(1) %0, i64 %31, !dbg !19 + %33 = sext i32 %reass.mul to i64, !dbg !20 + %34 = getelementptr bfloat, ptr addrspace(1) %1, i64 %33, !dbg !20 + %35 = getelementptr bfloat, ptr addrspace(1) %2, i64 %33, !dbg !21 + %36 = mul nuw nsw i32 %13, %23, !dbg !22 + %37 = add i32 %36, %20, !dbg !23 + %38 = mul nuw nsw i32 %14, %23, !dbg !24 + %reass.add388 = add i32 %38, %20, !dbg !25 + %reass.mul389 = mul i32 %reass.add388, %15, !dbg !25 + %39 = shl i32 %20, 7, !dbg !26 + %40 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !27 + %41 = lshr i32 %40, 5, !dbg !27 + %42 = and i32 %40, 240, !dbg !27 + %43 = lshr exact i32 %42, 4, !dbg !27 + %44 = or disjoint i32 %43, 16, !dbg !27 + %45 = or disjoint i32 %43, 32, !dbg !27 + %46 = or disjoint i32 %43, 48, !dbg !27 + %47 = lshr i32 %40, 1, !dbg !27 + %48 = and i32 %47, 112, !dbg !27 + %49 = lshr i32 %40, 2, !dbg !27 + %50 = and i32 %49, 7, !dbg !27 + %51 = or disjoint i32 %48, %50, !dbg !27 + %52 = or disjoint i32 %43, %39, !dbg !28 + %53 = or disjoint i32 %44, %39, !dbg !28 + %54 = or disjoint i32 %45, %39, !dbg !28 + %55 = or disjoint i32 %46, %39, !dbg !28 + %56 = or disjoint i32 %52, 64, !dbg !28 + %57 = or disjoint i32 %52, 80, !dbg !28 + %58 = or disjoint i32 %52, 96, !dbg !28 + %59 = or disjoint i32 %52, 112, !dbg !28 + %60 = shl i32 %52, 12, !dbg !29 + %61 = shl i32 %53, 12, !dbg !29 + %62 = shl i32 %54, 12, !dbg !29 + %63 = shl i32 %55, 12, !dbg !29 + %64 = shl i32 %56, 12, !dbg !29 + %65 = shl i32 %57, 12, !dbg !29 + %66 = shl i32 %58, 12, !dbg !29 + %67 = shl i32 %59, 12, !dbg !29 + %68 = sext i32 %60 to i64, !dbg !32 + %69 = getelementptr bfloat, ptr addrspace(1) %32, i64 %68, !dbg !32 + %70 = sext i32 %61 to i64, !dbg !32 + %71 = getelementptr bfloat, ptr addrspace(1) %32, i64 %70, !dbg !32 + %72 = sext i32 %62 to i64, !dbg !32 + %73 = getelementptr bfloat, ptr addrspace(1) %32, i64 %72, !dbg !32 + %74 = sext i32 %63 to i64, !dbg !32 + %75 = getelementptr bfloat, ptr addrspace(1) %32, i64 %74, !dbg !32 + %76 = sext i32 %64 to i64, !dbg !32 + %77 = getelementptr bfloat, ptr addrspace(1) %32, i64 %76, !dbg !32 + %78 = sext i32 %65 to i64, !dbg !32 + %79 = getelementptr bfloat, ptr addrspace(1) %32, i64 %78, !dbg !32 + %80 = sext i32 %66 to i64, !dbg !32 + %81 = getelementptr bfloat, ptr addrspace(1) %32, i64 %80, !dbg !32 + %82 = sext i32 %67 to i64, !dbg !32 + %83 = getelementptr bfloat, ptr addrspace(1) %32, i64 %82, !dbg !32 + %84 = shl nuw nsw i32 %40, 3, !dbg !33 + %85 = and i32 %84, 120, !dbg !33 + %86 = zext nneg i32 %85 to i64, !dbg !34 + %87 = getelementptr bfloat, ptr addrspace(1) %69, i64 %86, !dbg !34 + %88 = getelementptr bfloat, ptr addrspace(1) %71, i64 %86, !dbg !34 + %89 = getelementptr bfloat, ptr addrspace(1) %73, i64 %86, !dbg !34 + %90 = getelementptr bfloat, ptr addrspace(1) %75, i64 %86, !dbg !34 + %91 = getelementptr bfloat, ptr addrspace(1) %77, i64 %86, !dbg !34 + %92 = getelementptr bfloat, ptr addrspace(1) %79, i64 %86, !dbg !34 + %93 = getelementptr bfloat, ptr addrspace(1) %81, i64 %86, !dbg !34 + %94 = getelementptr bfloat, ptr addrspace(1) %83, i64 %86, !dbg !34 + %95 = icmp slt i32 %52, %11, !dbg !35 + %96 = icmp slt i32 %53, %11, !dbg !35 + %97 = icmp slt i32 %54, %11, !dbg !35 + %98 = icmp slt i32 %55, %11, !dbg !35 + %99 = icmp slt i32 %56, %11, !dbg !35 + %100 = icmp slt i32 %57, %11, !dbg !35 + %101 = icmp slt i32 %58, %11, !dbg !35 + %102 = icmp slt i32 %59, %11, !dbg !35 + %103 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %87, i1 %95) #2, !dbg !36 + %104 = extractvalue { i32, i32, i32, i32 } %103, 0, !dbg !36 + %105 = extractvalue { i32, i32, i32, i32 } %103, 1, !dbg !36 + %106 = extractvalue { i32, i32, i32, i32 } %103, 2, !dbg !36 + %107 = extractvalue { i32, i32, i32, i32 } %103, 3, !dbg !36 + %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %88, i1 %96) #2, !dbg !36 + %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !36 + %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !36 + %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !36 + %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !36 + %113 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %89, i1 %97) #2, !dbg !36 + %114 = extractvalue { i32, i32, i32, i32 } %113, 0, !dbg !36 + %115 = extractvalue { i32, i32, i32, i32 } %113, 1, !dbg !36 + %116 = extractvalue { i32, i32, i32, i32 } %113, 2, !dbg !36 + %117 = extractvalue { i32, i32, i32, i32 } %113, 3, !dbg !36 + %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %90, i1 %98) #2, !dbg !36 + %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !36 + %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !36 + %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !36 + %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !36 + %123 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %91, i1 %99) #2, !dbg !36 + %124 = extractvalue { i32, i32, i32, i32 } %123, 0, !dbg !36 + %125 = extractvalue { i32, i32, i32, i32 } %123, 1, !dbg !36 + %126 = extractvalue { i32, i32, i32, i32 } %123, 2, !dbg !36 + %127 = extractvalue { i32, i32, i32, i32 } %123, 3, !dbg !36 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %92, i1 %100) #2, !dbg !36 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !36 + %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !36 + %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !36 + %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !36 + %133 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %93, i1 %101) #2, !dbg !36 + %134 = extractvalue { i32, i32, i32, i32 } %133, 0, !dbg !36 + %135 = extractvalue { i32, i32, i32, i32 } %133, 1, !dbg !36 + %136 = extractvalue { i32, i32, i32, i32 } %133, 2, !dbg !36 + %137 = extractvalue { i32, i32, i32, i32 } %133, 3, !dbg !36 + %138 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %94, i1 %102) #2, !dbg !36 + %139 = extractvalue { i32, i32, i32, i32 } %138, 0, !dbg !36 + %140 = extractvalue { i32, i32, i32, i32 } %138, 1, !dbg !36 + %141 = extractvalue { i32, i32, i32, i32 } %138, 2, !dbg !36 + %142 = extractvalue { i32, i32, i32, i32 } %138, 3, !dbg !36 + %143 = and i32 %40, 7, !dbg !36 + %144 = shl nuw nsw i32 %143, 4, !dbg !36 + %145 = shl nuw nsw i32 %42, 3, !dbg !36 + %146 = and i32 %40, 112, !dbg !36 + %147 = and i32 %40, 8, !dbg !36 + %148 = shl nuw nsw i32 %147, 11, !dbg !36 + %149 = or disjoint i32 %144, %145, !dbg !36 + %150 = xor i32 %149, %146, !dbg !36 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %148, !dbg !36 + %152 = getelementptr inbounds nuw i8, ptr addrspace(3) %151, i32 %150, !dbg !36 + %153 = insertelement <4 x i32> poison, i32 %104, i64 0, !dbg !36 + %154 = insertelement <4 x i32> %153, i32 %105, i64 1, !dbg !36 + %155 = insertelement <4 x i32> %154, i32 %106, i64 2, !dbg !36 + %156 = insertelement <4 x i32> %155, i32 %107, i64 3, !dbg !36 + store <4 x i32> %156, ptr addrspace(3) %152, align 16, !dbg !36 + %157 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 2048, !dbg !36 + %158 = insertelement <4 x i32> poison, i32 %109, i64 0, !dbg !36 + %159 = insertelement <4 x i32> %158, i32 %110, i64 1, !dbg !36 + %160 = insertelement <4 x i32> %159, i32 %111, i64 2, !dbg !36 + %161 = insertelement <4 x i32> %160, i32 %112, i64 3, !dbg !36 + store <4 x i32> %161, ptr addrspace(3) %157, align 16, !dbg !36 + %162 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 4096, !dbg !36 + %163 = insertelement <4 x i32> poison, i32 %114, i64 0, !dbg !36 + %164 = insertelement <4 x i32> %163, i32 %115, i64 1, !dbg !36 + %165 = insertelement <4 x i32> %164, i32 %116, i64 2, !dbg !36 + %166 = insertelement <4 x i32> %165, i32 %117, i64 3, !dbg !36 + store <4 x i32> %166, ptr addrspace(3) %162, align 16, !dbg !36 + %167 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 6144, !dbg !36 + %168 = insertelement <4 x i32> poison, i32 %119, i64 0, !dbg !36 + %169 = insertelement <4 x i32> %168, i32 %120, i64 1, !dbg !36 + %170 = insertelement <4 x i32> %169, i32 %121, i64 2, !dbg !36 + %171 = insertelement <4 x i32> %170, i32 %122, i64 3, !dbg !36 + store <4 x i32> %171, ptr addrspace(3) %167, align 16, !dbg !36 + %172 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 8192, !dbg !36 + %173 = insertelement <4 x i32> poison, i32 %124, i64 0, !dbg !36 + %174 = insertelement <4 x i32> %173, i32 %125, i64 1, !dbg !36 + %175 = insertelement <4 x i32> %174, i32 %126, i64 2, !dbg !36 + %176 = insertelement <4 x i32> %175, i32 %127, i64 3, !dbg !36 + store <4 x i32> %176, ptr addrspace(3) %172, align 16, !dbg !36 + %177 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 10240, !dbg !36 + %178 = insertelement <4 x i32> poison, i32 %129, i64 0, !dbg !36 + %179 = insertelement <4 x i32> %178, i32 %130, i64 1, !dbg !36 + %180 = insertelement <4 x i32> %179, i32 %131, i64 2, !dbg !36 + %181 = insertelement <4 x i32> %180, i32 %132, i64 3, !dbg !36 + store <4 x i32> %181, ptr addrspace(3) %177, align 16, !dbg !36 + %182 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 12288, !dbg !36 + %183 = insertelement <4 x i32> poison, i32 %134, i64 0, !dbg !36 + %184 = insertelement <4 x i32> %183, i32 %135, i64 1, !dbg !36 + %185 = insertelement <4 x i32> %184, i32 %136, i64 2, !dbg !36 + %186 = insertelement <4 x i32> %185, i32 %137, i64 3, !dbg !36 + store <4 x i32> %186, ptr addrspace(3) %182, align 16, !dbg !36 + %187 = getelementptr inbounds nuw i8, ptr addrspace(3) %152, i32 14336, !dbg !36 + %188 = insertelement <4 x i32> poison, i32 %139, i64 0, !dbg !36 + %189 = insertelement <4 x i32> %188, i32 %140, i64 1, !dbg !36 + %190 = insertelement <4 x i32> %189, i32 %141, i64 2, !dbg !36 + %191 = insertelement <4 x i32> %190, i32 %142, i64 3, !dbg !36 + store <4 x i32> %191, ptr addrspace(3) %187, align 16, !dbg !36 + %192 = sext i32 %reass.mul389 to i64, !dbg !37 + %193 = getelementptr i32, ptr addrspace(1) %6, i64 %192, !dbg !37 + %194 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %193) #2, !dbg !38 + %195 = shl i32 %194, 7, !dbg !39 + %196 = sext i32 %37 to i64, !dbg !40 + %197 = getelementptr i32, ptr addrspace(1) %5, i64 %196, !dbg !40 + %198 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %197) #2, !dbg !41 + %199 = shl i32 %198, 1, !dbg !42 + %200 = add i32 %12, 63, !dbg !43 + %201 = sdiv i32 %200, 64, !dbg !47 + %202 = tail call i32 @llvm.smax.i32(i32 %201, i32 1), !dbg !48 + %203 = tail call i32 @llvm.smin.i32(i32 %199, i32 %202), !dbg !49 + %204 = and i32 %40, 3, !dbg !50 + %205 = shl nuw nsw i32 %204, 1, !dbg !50 + %206 = or disjoint i32 %205, 1, !dbg !50 + %207 = insertelement <2 x i32> poison, i32 %205, i64 0, !dbg !50 + %208 = shufflevector <2 x i32> %207, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !50 + %209 = or disjoint <2 x i32> %208, , !dbg !50 + %210 = insertelement <4 x i32> poison, i32 %205, i64 0, !dbg !50 + %211 = shufflevector <4 x i32> %210, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !50 + %212 = or disjoint <4 x i32> %211, , !dbg !50 + %213 = insertelement <8 x i32> poison, i32 %205, i64 0, !dbg !50 + %214 = shufflevector <8 x i32> %213, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !50 + %215 = or disjoint <8 x i32> %214, , !dbg !50 + %216 = or disjoint i32 %51, %39, !dbg !28 + %217 = or disjoint i32 %216, 8, !dbg !28 + %218 = insertelement <2 x i32> poison, i32 %217, i64 0, !dbg !51 + %219 = insertelement <2 x i32> %218, i32 %216, i64 1, !dbg !51 + %220 = insertelement <2 x i32> poison, i32 %11, i64 0, !dbg !51 + %221 = shufflevector <2 x i32> %219, <2 x i32> poison, <32 x i32> , !dbg !51 + %222 = shufflevector <2 x i32> %220, <2 x i32> poison, <32 x i32> zeroinitializer, !dbg !51 + %223 = srem <32 x i32> %221, %222, !dbg !51 + %224 = zext nneg i32 %21 to i64, !dbg !53 + %225 = getelementptr i64, ptr addrspace(1) %9, i64 %224, !dbg !53 + %226 = icmp sgt i32 %199, 0, !dbg !54 + %227 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %225, i1 %226) #2, !dbg !55 + %228 = extractelement <32 x i32> %223, i64 2, !dbg !56 + %229 = sext i32 %228 to i64, !dbg !56 + %230 = extractelement <32 x i32> %223, i64 0, !dbg !56 + %231 = sext i32 %230 to i64, !dbg !56 + %232 = icmp sgt i64 %227, %229, !dbg !57 + %233 = icmp sgt i64 %227, %231, !dbg !57 + %234 = or disjoint i32 %195, %43, !dbg !58 + %235 = or disjoint i32 %195, %44, !dbg !58 + %236 = or disjoint i32 %195, %45, !dbg !58 + %237 = or disjoint i32 %195, %46, !dbg !58 + %238 = shl i32 %234, 7, !dbg !59 + %239 = shl i32 %235, 7, !dbg !59 + %240 = shl i32 %236, 7, !dbg !59 + %241 = shl i32 %237, 7, !dbg !59 + %242 = sext i32 %238 to i64, !dbg !60 + %243 = getelementptr bfloat, ptr addrspace(1) %34, i64 %242, !dbg !60 + %244 = sext i32 %239 to i64, !dbg !60 + %245 = getelementptr bfloat, ptr addrspace(1) %34, i64 %244, !dbg !60 + %246 = sext i32 %240 to i64, !dbg !60 + %247 = getelementptr bfloat, ptr addrspace(1) %34, i64 %246, !dbg !60 + %248 = sext i32 %241 to i64, !dbg !60 + %249 = getelementptr bfloat, ptr addrspace(1) %34, i64 %248, !dbg !60 + %250 = getelementptr bfloat, ptr addrspace(1) %243, i64 %86, !dbg !61 + %251 = getelementptr bfloat, ptr addrspace(1) %245, i64 %86, !dbg !61 + %252 = getelementptr bfloat, ptr addrspace(1) %247, i64 %86, !dbg !61 + %253 = getelementptr bfloat, ptr addrspace(1) %249, i64 %86, !dbg !61 + %254 = icmp slt i32 %234, %12, !dbg !62 + %255 = icmp slt i32 %235, %12, !dbg !62 + %256 = icmp slt i32 %236, %12, !dbg !62 + %257 = icmp slt i32 %237, %12, !dbg !62 + %258 = and i1 %226, %254, !dbg !54 + %259 = and i1 %226, %255, !dbg !54 + %260 = and i1 %226, %256, !dbg !54 + %261 = and i1 %226, %257, !dbg !54 + %262 = shl nuw nsw i32 %147, 10, !dbg !63 + %263 = or disjoint i32 %150, %262, !dbg !63 + %264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %263, !dbg !63 + %265 = select i1 %258, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %264, ptr addrspace(1) %250, i32 %265) #2, !dbg !63 + %266 = or disjoint i32 %263, 2048, !dbg !63 + %267 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %266, !dbg !63 + %268 = select i1 %259, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %251, i32 %268) #2, !dbg !63 + %269 = or disjoint i32 %263, 4096, !dbg !63 + %270 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %269, !dbg !63 + %271 = select i1 %260, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %270, ptr addrspace(1) %252, i32 %271) #2, !dbg !63 + %272 = or disjoint i32 %263, 6144, !dbg !63 + %273 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %272, !dbg !63 + %274 = select i1 %261, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %273, ptr addrspace(1) %253, i32 %274) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + %275 = getelementptr bfloat, ptr addrspace(1) %35, i64 %242, !dbg !60 + %276 = getelementptr bfloat, ptr addrspace(1) %35, i64 %244, !dbg !60 + %277 = getelementptr bfloat, ptr addrspace(1) %35, i64 %246, !dbg !60 + %278 = getelementptr bfloat, ptr addrspace(1) %35, i64 %248, !dbg !60 + %279 = getelementptr bfloat, ptr addrspace(1) %275, i64 %86, !dbg !61 + %280 = getelementptr bfloat, ptr addrspace(1) %276, i64 %86, !dbg !61 + %281 = getelementptr bfloat, ptr addrspace(1) %277, i64 %86, !dbg !61 + %282 = getelementptr bfloat, ptr addrspace(1) %278, i64 %86, !dbg !61 + %283 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %263, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %283, ptr addrspace(1) %279, i32 %265) #2, !dbg !63 + %284 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %266, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %284, ptr addrspace(1) %280, i32 %268) #2, !dbg !63 + %285 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %269, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %285, ptr addrspace(1) %281, i32 %271) #2, !dbg !63 + %286 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %272, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %286, ptr addrspace(1) %282, i32 %274) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + %287 = icmp sgt i32 %203, 1, !dbg !54 + %288 = or disjoint i32 %195, 64, !dbg !64 + %289 = or disjoint i32 %288, %43, !dbg !58 + %290 = or disjoint i32 %288, %44, !dbg !58 + %291 = or disjoint i32 %288, %45, !dbg !58 + %292 = or disjoint i32 %288, %46, !dbg !58 + %293 = shl i32 %289, 7, !dbg !59 + %294 = shl i32 %290, 7, !dbg !59 + %295 = shl i32 %291, 7, !dbg !59 + %296 = shl i32 %292, 7, !dbg !59 + %297 = sext i32 %293 to i64, !dbg !60 + %298 = getelementptr bfloat, ptr addrspace(1) %34, i64 %297, !dbg !60 + %299 = sext i32 %294 to i64, !dbg !60 + %300 = getelementptr bfloat, ptr addrspace(1) %34, i64 %299, !dbg !60 + %301 = sext i32 %295 to i64, !dbg !60 + %302 = getelementptr bfloat, ptr addrspace(1) %34, i64 %301, !dbg !60 + %303 = sext i32 %296 to i64, !dbg !60 + %304 = getelementptr bfloat, ptr addrspace(1) %34, i64 %303, !dbg !60 + %305 = getelementptr bfloat, ptr addrspace(1) %298, i64 %86, !dbg !61 + %306 = getelementptr bfloat, ptr addrspace(1) %300, i64 %86, !dbg !61 + %307 = getelementptr bfloat, ptr addrspace(1) %302, i64 %86, !dbg !61 + %308 = getelementptr bfloat, ptr addrspace(1) %304, i64 %86, !dbg !61 + %309 = icmp slt i32 %289, %12, !dbg !62 + %310 = icmp slt i32 %290, %12, !dbg !62 + %311 = icmp slt i32 %291, %12, !dbg !62 + %312 = icmp slt i32 %292, %12, !dbg !62 + %313 = and i1 %287, %309, !dbg !54 + %314 = and i1 %287, %310, !dbg !54 + %315 = and i1 %287, %311, !dbg !54 + %316 = and i1 %287, %312, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !63 + %317 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %263, !dbg !63 + %318 = select i1 %313, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %317, ptr addrspace(1) %305, i32 %318) #2, !dbg !63 + %319 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %266, !dbg !63 + %320 = select i1 %314, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %319, ptr addrspace(1) %306, i32 %320) #2, !dbg !63 + %321 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %269, !dbg !63 + %322 = select i1 %315, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %321, ptr addrspace(1) %307, i32 %322) #2, !dbg !63 + %323 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %272, !dbg !63 + %324 = select i1 %316, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %323, ptr addrspace(1) %308, i32 %324) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + %325 = getelementptr bfloat, ptr addrspace(1) %35, i64 %297, !dbg !60 + %326 = getelementptr bfloat, ptr addrspace(1) %35, i64 %299, !dbg !60 + %327 = getelementptr bfloat, ptr addrspace(1) %35, i64 %301, !dbg !60 + %328 = getelementptr bfloat, ptr addrspace(1) %35, i64 %303, !dbg !60 + %329 = getelementptr bfloat, ptr addrspace(1) %325, i64 %86, !dbg !61 + %330 = getelementptr bfloat, ptr addrspace(1) %326, i64 %86, !dbg !61 + %331 = getelementptr bfloat, ptr addrspace(1) %327, i64 %86, !dbg !61 + %332 = getelementptr bfloat, ptr addrspace(1) %328, i64 %86, !dbg !61 + %333 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %263, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %333, ptr addrspace(1) %329, i32 %318) #2, !dbg !63 + %334 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %266, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %334, ptr addrspace(1) %330, i32 %320) #2, !dbg !63 + %335 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %269, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %335, ptr addrspace(1) %331, i32 %322) #2, !dbg !63 + %336 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %272, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %336, ptr addrspace(1) %332, i32 %324) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !65 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %34, i64 %86, !dbg !54 + %invariant.gep401 = getelementptr bfloat, ptr addrspace(1) %35, i64 %86, !dbg !54 + br i1 %226, label %.lr.ph, label %._crit_edge, !dbg !54 + +.lr.ph: ; preds = %19 + %337 = insertelement <16 x i32> poison, i32 %195, i64 0, !dbg !66 + %338 = shufflevector <16 x i32> %337, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !66 + %339 = insertelement <16 x i32> poison, i32 %206, i64 14, !dbg !66 + %340 = insertelement <16 x i32> %339, i32 %205, i64 15, !dbg !66 + %341 = shufflevector <8 x i32> %215, <8 x i32> poison, <16 x i32> , !dbg !66 + %342 = shufflevector <16 x i32> %341, <16 x i32> %340, <16 x i32> , !dbg !66 + %343 = shufflevector <4 x i32> %212, <4 x i32> poison, <16 x i32> , !dbg !66 + %344 = shufflevector <16 x i32> %342, <16 x i32> %343, <16 x i32> , !dbg !66 + %345 = shufflevector <2 x i32> %209, <2 x i32> poison, <16 x i32> , !dbg !66 + %346 = shufflevector <16 x i32> %344, <16 x i32> %345, <16 x i32> , !dbg !66 + %347 = or disjoint <16 x i32> %338, %346, !dbg !66 + %348 = add nsw i32 %203, -2 + %349 = add nsw i32 %203, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %203, i32 1), !dbg !54 + %350 = insertelement <16 x i32> poison, i32 %12, i64 0, !dbg !51 + %351 = shufflevector <16 x i32> %350, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !51 + %352 = insertelement <32 x i32> poison, i32 %16, i64 0, !dbg !67 + %353 = shufflevector <32 x i32> %352, <32 x i32> poison, <32 x i32> zeroinitializer, !dbg !67 + %354 = insertelement <16 x i32> poison, i32 %16, i64 0, !dbg !68 + %355 = shufflevector <16 x i32> %354, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !68 + %356 = insertelement <16 x i64> poison, i64 %227, i64 0, !dbg !69 + %357 = shufflevector <16 x i64> %356, <16 x i64> poison, <16 x i32> zeroinitializer, !dbg !69 + br label %358, !dbg !54 + +358: ; preds = %.lr.ph, %__nv_exp2f.exit355 + %359 = phi i32 [ 64, %.lr.ph ], [ %2029, %__nv_exp2f.exit355 ] + %360 = phi i32 [ -1, %.lr.ph ], [ %436, %__nv_exp2f.exit355 ] + %361 = phi i32 [ 1, %.lr.ph ], [ %2033, %__nv_exp2f.exit355 ] + %362 = phi i32 [ 64, %.lr.ph ], [ %2030, %__nv_exp2f.exit355 ] + %363 = phi float [ 0.000000e+00, %.lr.ph ], [ %1596, %__nv_exp2f.exit355 ] + %364 = phi float [ 0.000000e+00, %.lr.ph ], [ %1597, %__nv_exp2f.exit355 ] + %365 = phi float [ 0.000000e+00, %.lr.ph ], [ %1943, %__nv_exp2f.exit355 ] + %366 = phi float [ 0.000000e+00, %.lr.ph ], [ %1944, %__nv_exp2f.exit355 ] + %367 = phi float [ 0.000000e+00, %.lr.ph ], [ %1945, %__nv_exp2f.exit355 ] + %368 = phi float [ 0.000000e+00, %.lr.ph ], [ %1946, %__nv_exp2f.exit355 ] + %369 = phi float [ 0.000000e+00, %.lr.ph ], [ %1947, %__nv_exp2f.exit355 ] + %370 = phi float [ 0.000000e+00, %.lr.ph ], [ %1948, %__nv_exp2f.exit355 ] + %371 = phi float [ 0.000000e+00, %.lr.ph ], [ %1949, %__nv_exp2f.exit355 ] + %372 = phi float [ 0.000000e+00, %.lr.ph ], [ %1950, %__nv_exp2f.exit355 ] + %373 = phi float [ 0.000000e+00, %.lr.ph ], [ %1951, %__nv_exp2f.exit355 ] + %374 = phi float [ 0.000000e+00, %.lr.ph ], [ %1952, %__nv_exp2f.exit355 ] + %375 = phi float [ 0.000000e+00, %.lr.ph ], [ %1953, %__nv_exp2f.exit355 ] + %376 = phi float [ 0.000000e+00, %.lr.ph ], [ %1954, %__nv_exp2f.exit355 ] + %377 = phi float [ 0.000000e+00, %.lr.ph ], [ %1955, %__nv_exp2f.exit355 ] + %378 = phi float [ 0.000000e+00, %.lr.ph ], [ %1956, %__nv_exp2f.exit355 ] + %379 = phi float [ 0.000000e+00, %.lr.ph ], [ %1957, %__nv_exp2f.exit355 ] + %380 = phi float [ 0.000000e+00, %.lr.ph ], [ %1958, %__nv_exp2f.exit355 ] + %381 = phi float [ 0.000000e+00, %.lr.ph ], [ %1959, %__nv_exp2f.exit355 ] + %382 = phi float [ 0.000000e+00, %.lr.ph ], [ %1960, %__nv_exp2f.exit355 ] + %383 = phi float [ 0.000000e+00, %.lr.ph ], [ %1961, %__nv_exp2f.exit355 ] + %384 = phi float [ 0.000000e+00, %.lr.ph ], [ %1962, %__nv_exp2f.exit355 ] + %385 = phi float [ 0.000000e+00, %.lr.ph ], [ %1963, %__nv_exp2f.exit355 ] + %386 = phi float [ 0.000000e+00, %.lr.ph ], [ %1964, %__nv_exp2f.exit355 ] + %387 = phi float [ 0.000000e+00, %.lr.ph ], [ %1965, %__nv_exp2f.exit355 ] + %388 = phi float [ 0.000000e+00, %.lr.ph ], [ %1966, %__nv_exp2f.exit355 ] + %389 = phi float [ 0.000000e+00, %.lr.ph ], [ %1967, %__nv_exp2f.exit355 ] + %390 = phi float [ 0.000000e+00, %.lr.ph ], [ %1968, %__nv_exp2f.exit355 ] + %391 = phi float [ 0.000000e+00, %.lr.ph ], [ %1969, %__nv_exp2f.exit355 ] + %392 = phi float [ 0.000000e+00, %.lr.ph ], [ %1970, %__nv_exp2f.exit355 ] + %393 = phi float [ 0.000000e+00, %.lr.ph ], [ %1971, %__nv_exp2f.exit355 ] + %394 = phi float [ 0.000000e+00, %.lr.ph ], [ %1972, %__nv_exp2f.exit355 ] + %395 = phi float [ 0.000000e+00, %.lr.ph ], [ %1973, %__nv_exp2f.exit355 ] + %396 = phi float [ 0.000000e+00, %.lr.ph ], [ %1974, %__nv_exp2f.exit355 ] + %397 = phi float [ 0.000000e+00, %.lr.ph ], [ %1975, %__nv_exp2f.exit355 ] + %398 = phi float [ 0.000000e+00, %.lr.ph ], [ %1976, %__nv_exp2f.exit355 ] + %399 = phi float [ 0.000000e+00, %.lr.ph ], [ %1977, %__nv_exp2f.exit355 ] + %400 = phi float [ 0.000000e+00, %.lr.ph ], [ %1978, %__nv_exp2f.exit355 ] + %401 = phi float [ 0.000000e+00, %.lr.ph ], [ %1979, %__nv_exp2f.exit355 ] + %402 = phi float [ 0.000000e+00, %.lr.ph ], [ %1980, %__nv_exp2f.exit355 ] + %403 = phi float [ 0.000000e+00, %.lr.ph ], [ %1981, %__nv_exp2f.exit355 ] + %404 = phi float [ 0.000000e+00, %.lr.ph ], [ %1982, %__nv_exp2f.exit355 ] + %405 = phi float [ 0.000000e+00, %.lr.ph ], [ %1983, %__nv_exp2f.exit355 ] + %406 = phi float [ 0.000000e+00, %.lr.ph ], [ %1984, %__nv_exp2f.exit355 ] + %407 = phi float [ 0.000000e+00, %.lr.ph ], [ %1985, %__nv_exp2f.exit355 ] + %408 = phi float [ 0.000000e+00, %.lr.ph ], [ %1986, %__nv_exp2f.exit355 ] + %409 = phi float [ 0.000000e+00, %.lr.ph ], [ %1987, %__nv_exp2f.exit355 ] + %410 = phi float [ 0.000000e+00, %.lr.ph ], [ %1988, %__nv_exp2f.exit355 ] + %411 = phi float [ 0.000000e+00, %.lr.ph ], [ %1989, %__nv_exp2f.exit355 ] + %412 = phi float [ 0.000000e+00, %.lr.ph ], [ %1990, %__nv_exp2f.exit355 ] + %413 = phi float [ 0.000000e+00, %.lr.ph ], [ %1991, %__nv_exp2f.exit355 ] + %414 = phi float [ 0.000000e+00, %.lr.ph ], [ %1992, %__nv_exp2f.exit355 ] + %415 = phi float [ 0.000000e+00, %.lr.ph ], [ %1993, %__nv_exp2f.exit355 ] + %416 = phi float [ 0.000000e+00, %.lr.ph ], [ %1994, %__nv_exp2f.exit355 ] + %417 = phi float [ 0.000000e+00, %.lr.ph ], [ %1995, %__nv_exp2f.exit355 ] + %418 = phi float [ 0.000000e+00, %.lr.ph ], [ %1996, %__nv_exp2f.exit355 ] + %419 = phi float [ 0.000000e+00, %.lr.ph ], [ %1997, %__nv_exp2f.exit355 ] + %420 = phi float [ 0.000000e+00, %.lr.ph ], [ %1998, %__nv_exp2f.exit355 ] + %421 = phi float [ 0.000000e+00, %.lr.ph ], [ %1999, %__nv_exp2f.exit355 ] + %422 = phi float [ 0.000000e+00, %.lr.ph ], [ %2000, %__nv_exp2f.exit355 ] + %423 = phi float [ 0.000000e+00, %.lr.ph ], [ %2001, %__nv_exp2f.exit355 ] + %424 = phi float [ 0.000000e+00, %.lr.ph ], [ %2002, %__nv_exp2f.exit355 ] + %425 = phi float [ 0.000000e+00, %.lr.ph ], [ %2003, %__nv_exp2f.exit355 ] + %426 = phi float [ 0.000000e+00, %.lr.ph ], [ %2004, %__nv_exp2f.exit355 ] + %427 = phi float [ 0.000000e+00, %.lr.ph ], [ %2005, %__nv_exp2f.exit355 ] + %428 = phi float [ 0.000000e+00, %.lr.ph ], [ %2006, %__nv_exp2f.exit355 ] + %429 = phi i32 [ 0, %.lr.ph ], [ %2010, %__nv_exp2f.exit355 ] + %430 = phi <2 x float> [ splat (float 0xFFF0000000000000), %.lr.ph ], [ %1335, %__nv_exp2f.exit355 ] + %431 = phi <16 x i32> [ %347, %.lr.ph ], [ %2009, %__nv_exp2f.exit355 ] + %432 = icmp slt i32 %429, %348, !dbg !54 + %433 = icmp slt i32 %429, %349, !dbg !54 + %434 = add i32 %360, 1, !dbg !54 + %435 = icmp sgt i32 %434, 2, !dbg !54 + %436 = select i1 %435, i32 0, i32 %434, !dbg !54 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !63 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !63 + %437 = shl i32 %436, 13, !dbg !63 + %438 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %437, !dbg !63 + %439 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %41, i32 0, i32 31), !dbg !65 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !65 + %440 = shl i32 %439, 11, !dbg !65 + %441 = and i32 %440, 8192, !dbg !65 + %442 = add i32 %441, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !65 + %443 = lshr exact i32 %442, 4, !dbg !65 + %444 = and i32 %443, 16383, !dbg !65 + %445 = zext nneg i32 %444 to i64, !dbg !65 + %446 = or disjoint i64 %445, 4611686293372403712, !dbg !65 + %447 = ptrtoint ptr addrspace(3) %438 to i32, !dbg !65 + %448 = lshr exact i32 %447, 4, !dbg !65 + %449 = and i32 %448, 16383, !dbg !65 + %450 = zext nneg i32 %449 to i64, !dbg !65 + %451 = or disjoint i64 %450, 4611686293338849280, !dbg !65 + %452 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %446, i64 %451) #2, !dbg !65 + %453 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !65 + %454 = lshr exact i32 %453, 4, !dbg !65 + %455 = and i32 %454, 16383, !dbg !65 + %456 = zext nneg i32 %455 to i64, !dbg !65 + %457 = or disjoint i64 %456, 4611686293372403712, !dbg !65 + %458 = add i32 %447, 32, !dbg !65 + %459 = lshr exact i32 %458, 4, !dbg !65 + %460 = and i32 %459, 16383, !dbg !65 + %461 = zext nneg i32 %460 to i64, !dbg !65 + %462 = or disjoint i64 %461, 4611686293338849280, !dbg !65 + %463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 0, !dbg !65 + %464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 1, !dbg !65 + %465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 2, !dbg !65 + %466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 3, !dbg !65 + %467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 4, !dbg !65 + %468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 5, !dbg !65 + %469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 6, !dbg !65 + %470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 7, !dbg !65 + %471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 8, !dbg !65 + %472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 9, !dbg !65 + %473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 10, !dbg !65 + %474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 11, !dbg !65 + %475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 12, !dbg !65 + %476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 13, !dbg !65 + %477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 14, !dbg !65 + %478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 15, !dbg !65 + %479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 16, !dbg !65 + %480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 17, !dbg !65 + %481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 18, !dbg !65 + %482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 19, !dbg !65 + %483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 20, !dbg !65 + %484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 21, !dbg !65 + %485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 22, !dbg !65 + %486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 23, !dbg !65 + %487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 24, !dbg !65 + %488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 25, !dbg !65 + %489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 26, !dbg !65 + %490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 27, !dbg !65 + %491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 28, !dbg !65 + %492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 29, !dbg !65 + %493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 30, !dbg !65 + %494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %452, 31, !dbg !65 + %495 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, float %482, float %483, float %484, float %485, float %486, float %487, float %488, float %489, float %490, float %491, float %492, float %493, float %494, i64 %457, i64 %462, i1 true) #2, !dbg !65 + %496 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !65 + %497 = lshr exact i32 %496, 4, !dbg !65 + %498 = and i32 %497, 16383, !dbg !65 + %499 = zext nneg i32 %498 to i64, !dbg !65 + %500 = or disjoint i64 %499, 4611686293372403712, !dbg !65 + %501 = add i32 %447, 64, !dbg !65 + %502 = lshr exact i32 %501, 4, !dbg !65 + %503 = and i32 %502, 16383, !dbg !65 + %504 = zext nneg i32 %503 to i64, !dbg !65 + %505 = or disjoint i64 %504, 4611686293338849280, !dbg !65 + %506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 0, !dbg !65 + %507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 1, !dbg !65 + %508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 2, !dbg !65 + %509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 3, !dbg !65 + %510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 4, !dbg !65 + %511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 5, !dbg !65 + %512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 6, !dbg !65 + %513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 7, !dbg !65 + %514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 8, !dbg !65 + %515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 9, !dbg !65 + %516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 10, !dbg !65 + %517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 11, !dbg !65 + %518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 12, !dbg !65 + %519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 13, !dbg !65 + %520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 14, !dbg !65 + %521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 15, !dbg !65 + %522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 16, !dbg !65 + %523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 17, !dbg !65 + %524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 18, !dbg !65 + %525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 19, !dbg !65 + %526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 20, !dbg !65 + %527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 21, !dbg !65 + %528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 22, !dbg !65 + %529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 23, !dbg !65 + %530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 24, !dbg !65 + %531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 25, !dbg !65 + %532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 26, !dbg !65 + %533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 27, !dbg !65 + %534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 28, !dbg !65 + %535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 29, !dbg !65 + %536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 30, !dbg !65 + %537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %495, 31, !dbg !65 + %538 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, float %525, float %526, float %527, float %528, float %529, float %530, float %531, float %532, float %533, float %534, float %535, float %536, float %537, i64 %500, i64 %505, i1 true) #2, !dbg !65 + %539 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !65 + %540 = lshr exact i32 %539, 4, !dbg !65 + %541 = and i32 %540, 16383, !dbg !65 + %542 = zext nneg i32 %541 to i64, !dbg !65 + %543 = or disjoint i64 %542, 4611686293372403712, !dbg !65 + %544 = add i32 %447, 96, !dbg !65 + %545 = lshr exact i32 %544, 4, !dbg !65 + %546 = and i32 %545, 16383, !dbg !65 + %547 = zext nneg i32 %546 to i64, !dbg !65 + %548 = or disjoint i64 %547, 4611686293338849280, !dbg !65 + %549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 0, !dbg !65 + %550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 1, !dbg !65 + %551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 2, !dbg !65 + %552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 3, !dbg !65 + %553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 4, !dbg !65 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 5, !dbg !65 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 6, !dbg !65 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 7, !dbg !65 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 8, !dbg !65 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 9, !dbg !65 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 10, !dbg !65 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 11, !dbg !65 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 12, !dbg !65 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 13, !dbg !65 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 14, !dbg !65 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 15, !dbg !65 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 16, !dbg !65 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 17, !dbg !65 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 18, !dbg !65 + %568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 19, !dbg !65 + %569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 20, !dbg !65 + %570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 21, !dbg !65 + %571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 22, !dbg !65 + %572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 23, !dbg !65 + %573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 24, !dbg !65 + %574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 25, !dbg !65 + %575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 26, !dbg !65 + %576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 27, !dbg !65 + %577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 28, !dbg !65 + %578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 29, !dbg !65 + %579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 30, !dbg !65 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %538, 31, !dbg !65 + %581 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, float %569, float %570, float %571, float %572, float %573, float %574, float %575, float %576, float %577, float %578, float %579, float %580, i64 %543, i64 %548, i1 true) #2, !dbg !65 + %582 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !65 + %583 = lshr exact i32 %582, 4, !dbg !65 + %584 = and i32 %583, 16383, !dbg !65 + %585 = zext nneg i32 %584 to i64, !dbg !65 + %586 = or disjoint i64 %585, 4611686293372403712, !dbg !65 + %587 = add i32 %447, 8192, !dbg !65 + %588 = lshr exact i32 %587, 4, !dbg !65 + %589 = and i32 %588, 16383, !dbg !65 + %590 = zext nneg i32 %589 to i64, !dbg !65 + %591 = or disjoint i64 %590, 4611686293338849280, !dbg !65 + %592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 0, !dbg !65 + %593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 1, !dbg !65 + %594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 2, !dbg !65 + %595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 3, !dbg !65 + %596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 4, !dbg !65 + %597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 5, !dbg !65 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 6, !dbg !65 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 7, !dbg !65 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 8, !dbg !65 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 9, !dbg !65 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 10, !dbg !65 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 11, !dbg !65 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 12, !dbg !65 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 13, !dbg !65 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 14, !dbg !65 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 15, !dbg !65 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 16, !dbg !65 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 17, !dbg !65 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 18, !dbg !65 + %611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 19, !dbg !65 + %612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 20, !dbg !65 + %613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 21, !dbg !65 + %614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 22, !dbg !65 + %615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 23, !dbg !65 + %616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 24, !dbg !65 + %617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 25, !dbg !65 + %618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 26, !dbg !65 + %619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 27, !dbg !65 + %620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 28, !dbg !65 + %621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 29, !dbg !65 + %622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 30, !dbg !65 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %581, 31, !dbg !65 + %624 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, float %611, float %612, float %613, float %614, float %615, float %616, float %617, float %618, float %619, float %620, float %621, float %622, float %623, i64 %586, i64 %591, i1 true) #2, !dbg !65 + %625 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !65 + %626 = lshr exact i32 %625, 4, !dbg !65 + %627 = and i32 %626, 16383, !dbg !65 + %628 = zext nneg i32 %627 to i64, !dbg !65 + %629 = or disjoint i64 %628, 4611686293372403712, !dbg !65 + %630 = add i32 %447, 8224, !dbg !65 + %631 = lshr exact i32 %630, 4, !dbg !65 + %632 = and i32 %631, 16383, !dbg !65 + %633 = zext nneg i32 %632 to i64, !dbg !65 + %634 = or disjoint i64 %633, 4611686293338849280, !dbg !65 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 0, !dbg !65 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 1, !dbg !65 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 2, !dbg !65 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 3, !dbg !65 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 4, !dbg !65 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 5, !dbg !65 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 6, !dbg !65 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 7, !dbg !65 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 8, !dbg !65 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 9, !dbg !65 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 10, !dbg !65 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 11, !dbg !65 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 12, !dbg !65 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 13, !dbg !65 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 14, !dbg !65 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 15, !dbg !65 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 16, !dbg !65 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 17, !dbg !65 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 18, !dbg !65 + %654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 19, !dbg !65 + %655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 20, !dbg !65 + %656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 21, !dbg !65 + %657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 22, !dbg !65 + %658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 23, !dbg !65 + %659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 24, !dbg !65 + %660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 25, !dbg !65 + %661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 26, !dbg !65 + %662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 27, !dbg !65 + %663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 28, !dbg !65 + %664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 29, !dbg !65 + %665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 30, !dbg !65 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %624, 31, !dbg !65 + %667 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, float %654, float %655, float %656, float %657, float %658, float %659, float %660, float %661, float %662, float %663, float %664, float %665, float %666, i64 %629, i64 %634, i1 true) #2, !dbg !65 + %668 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !65 + %669 = lshr exact i32 %668, 4, !dbg !65 + %670 = and i32 %669, 16383, !dbg !65 + %671 = zext nneg i32 %670 to i64, !dbg !65 + %672 = or disjoint i64 %671, 4611686293372403712, !dbg !65 + %673 = add i32 %447, 8256, !dbg !65 + %674 = lshr exact i32 %673, 4, !dbg !65 + %675 = and i32 %674, 16383, !dbg !65 + %676 = zext nneg i32 %675 to i64, !dbg !65 + %677 = or disjoint i64 %676, 4611686293338849280, !dbg !65 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 0, !dbg !65 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 1, !dbg !65 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 2, !dbg !65 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 3, !dbg !65 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 4, !dbg !65 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 5, !dbg !65 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 6, !dbg !65 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 7, !dbg !65 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 8, !dbg !65 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 9, !dbg !65 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 10, !dbg !65 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 11, !dbg !65 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 12, !dbg !65 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 13, !dbg !65 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 14, !dbg !65 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 15, !dbg !65 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 16, !dbg !65 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 17, !dbg !65 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 18, !dbg !65 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 19, !dbg !65 + %698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 20, !dbg !65 + %699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 21, !dbg !65 + %700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 22, !dbg !65 + %701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 23, !dbg !65 + %702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 24, !dbg !65 + %703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 25, !dbg !65 + %704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 26, !dbg !65 + %705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 27, !dbg !65 + %706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 28, !dbg !65 + %707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 29, !dbg !65 + %708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 30, !dbg !65 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %667, 31, !dbg !65 + %710 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, float %698, float %699, float %700, float %701, float %702, float %703, float %704, float %705, float %706, float %707, float %708, float %709, i64 %672, i64 %677, i1 true) #2, !dbg !65 + %711 = add i32 %441, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !65 + %712 = lshr exact i32 %711, 4, !dbg !65 + %713 = and i32 %712, 16383, !dbg !65 + %714 = zext nneg i32 %713 to i64, !dbg !65 + %715 = or disjoint i64 %714, 4611686293372403712, !dbg !65 + %716 = add i32 %447, 8288, !dbg !65 + %717 = lshr exact i32 %716, 4, !dbg !65 + %718 = and i32 %717, 16383, !dbg !65 + %719 = zext nneg i32 %718 to i64, !dbg !65 + %720 = or disjoint i64 %719, 4611686293338849280, !dbg !65 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 0, !dbg !65 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 1, !dbg !65 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 2, !dbg !65 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 3, !dbg !65 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 4, !dbg !65 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 5, !dbg !65 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 6, !dbg !65 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 7, !dbg !65 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 8, !dbg !65 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 9, !dbg !65 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 10, !dbg !65 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 11, !dbg !65 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 12, !dbg !65 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 13, !dbg !65 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 14, !dbg !65 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 15, !dbg !65 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 16, !dbg !65 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 17, !dbg !65 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 18, !dbg !65 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 19, !dbg !65 + %741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 20, !dbg !65 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 21, !dbg !65 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 22, !dbg !65 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 23, !dbg !65 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 24, !dbg !65 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 25, !dbg !65 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 26, !dbg !65 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 27, !dbg !65 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 28, !dbg !65 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 29, !dbg !65 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 30, !dbg !65 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %710, 31, !dbg !65 + %753 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, float %741, float %742, float %743, float %744, float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, i64 %715, i64 %720, i1 true) #2, !dbg !65 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 0, !dbg !65 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 1, !dbg !65 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 2, !dbg !65 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 3, !dbg !65 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 4, !dbg !65 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 5, !dbg !65 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 6, !dbg !65 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 7, !dbg !65 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 8, !dbg !65 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 9, !dbg !65 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 10, !dbg !65 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 11, !dbg !65 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 12, !dbg !65 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 13, !dbg !65 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 14, !dbg !65 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 15, !dbg !65 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 16, !dbg !65 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 17, !dbg !65 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 18, !dbg !65 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 19, !dbg !65 + %774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 20, !dbg !65 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 21, !dbg !65 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 22, !dbg !65 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 23, !dbg !65 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 24, !dbg !65 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 25, !dbg !65 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 26, !dbg !65 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 27, !dbg !65 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 28, !dbg !65 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 29, !dbg !65 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 30, !dbg !65 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %753, 31, !dbg !65 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !65 + %786 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, float %774, float %775, float %776, float %777, float %778, float %779, float %780, float %781, float %782, float %783, float %784, float %785, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %438, i32 0, i32 0, float %365, float %366, float %367, float %368, float %369, float %370, float %371, float %372, float %373, float %374, float %375, float %376, float %377, float %378, float %379, float %380, float %381, float %382, float %383, float %384, float %385, float %386, float %387, float %388, float %389, float %390, float %391, float %392, float %393, float %394, float %395, float %396, float %397, float %398, float %399, float %400, float %401, float %402, float %403, float %404, float %405, float %406, float %407, float %408, float %409, float %410, float %411, float %412, float %413, float %414, float %415, float %416, float %417, float %418, float %419, float %420, float %421, float %422, float %423, float %424, float %425, float %426, float %427, float %428) #2, !dbg !65 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 0, !dbg !65 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 1, !dbg !65 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 2, !dbg !65 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 3, !dbg !65 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 4, !dbg !65 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 5, !dbg !65 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 6, !dbg !65 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 7, !dbg !65 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 8, !dbg !65 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 9, !dbg !65 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 10, !dbg !65 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 11, !dbg !65 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 12, !dbg !65 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 13, !dbg !65 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 14, !dbg !65 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 15, !dbg !65 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 16, !dbg !65 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 17, !dbg !65 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 18, !dbg !65 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 19, !dbg !65 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 20, !dbg !65 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 21, !dbg !65 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 22, !dbg !65 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 23, !dbg !65 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 24, !dbg !65 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 25, !dbg !65 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 26, !dbg !65 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 27, !dbg !65 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 28, !dbg !65 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 29, !dbg !65 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 30, !dbg !65 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 31, !dbg !65 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 38, !dbg !65 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 39, !dbg !65 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 40, !dbg !65 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 41, !dbg !65 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 42, !dbg !65 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 43, !dbg !65 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 44, !dbg !65 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 45, !dbg !65 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 46, !dbg !65 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 47, !dbg !65 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 48, !dbg !65 + %830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 49, !dbg !65 + %831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 50, !dbg !65 + %832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 51, !dbg !65 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 52, !dbg !65 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 53, !dbg !65 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 54, !dbg !65 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 55, !dbg !65 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 56, !dbg !65 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 57, !dbg !65 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 58, !dbg !65 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 59, !dbg !65 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 60, !dbg !65 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 61, !dbg !65 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 62, !dbg !65 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 63, !dbg !65 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 64, !dbg !65 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 65, !dbg !65 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 66, !dbg !65 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 67, !dbg !65 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 68, !dbg !65 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 69, !dbg !65 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 70, !dbg !65 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 71, !dbg !65 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 72, !dbg !65 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 73, !dbg !65 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 74, !dbg !65 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 75, !dbg !65 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 76, !dbg !65 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 77, !dbg !65 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 78, !dbg !65 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 79, !dbg !65 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 80, !dbg !65 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 81, !dbg !65 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 82, !dbg !65 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 83, !dbg !65 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 84, !dbg !65 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 85, !dbg !65 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 86, !dbg !65 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 87, !dbg !65 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 88, !dbg !65 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 89, !dbg !65 + %871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 90, !dbg !65 + %872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 91, !dbg !65 + %873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 92, !dbg !65 + %874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 93, !dbg !65 + %875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 94, !dbg !65 + %876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 95, !dbg !65 + %877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 96, !dbg !65 + %878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 97, !dbg !65 + %879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 98, !dbg !65 + %880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 99, !dbg !65 + %881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 100, !dbg !65 + %882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %786, 101, !dbg !65 + %883 = fmul float %787, 0x3FB6A09E60000000, !dbg !70 + %884 = fmul float %788, 0x3FB6A09E60000000, !dbg !70 + %885 = fmul float %789, 0x3FB6A09E60000000, !dbg !70 + %886 = fmul float %790, 0x3FB6A09E60000000, !dbg !70 + %887 = fmul float %791, 0x3FB6A09E60000000, !dbg !70 + %888 = fmul float %792, 0x3FB6A09E60000000, !dbg !70 + %889 = fmul float %793, 0x3FB6A09E60000000, !dbg !70 + %890 = fmul float %794, 0x3FB6A09E60000000, !dbg !70 + %891 = fmul float %795, 0x3FB6A09E60000000, !dbg !70 + %892 = fmul float %796, 0x3FB6A09E60000000, !dbg !70 + %893 = fmul float %797, 0x3FB6A09E60000000, !dbg !70 + %894 = fmul float %798, 0x3FB6A09E60000000, !dbg !70 + %895 = fmul float %799, 0x3FB6A09E60000000, !dbg !70 + %896 = fmul float %800, 0x3FB6A09E60000000, !dbg !70 + %897 = fmul float %801, 0x3FB6A09E60000000, !dbg !70 + %898 = fmul float %802, 0x3FB6A09E60000000, !dbg !70 + %899 = fmul float %803, 0x3FB6A09E60000000, !dbg !70 + %900 = fmul float %804, 0x3FB6A09E60000000, !dbg !70 + %901 = fmul float %805, 0x3FB6A09E60000000, !dbg !70 + %902 = fmul float %806, 0x3FB6A09E60000000, !dbg !70 + %903 = fmul float %807, 0x3FB6A09E60000000, !dbg !70 + %904 = fmul float %808, 0x3FB6A09E60000000, !dbg !70 + %905 = fmul float %809, 0x3FB6A09E60000000, !dbg !70 + %906 = fmul float %810, 0x3FB6A09E60000000, !dbg !70 + %907 = fmul float %811, 0x3FB6A09E60000000, !dbg !70 + %908 = fmul float %812, 0x3FB6A09E60000000, !dbg !70 + %909 = fmul float %813, 0x3FB6A09E60000000, !dbg !70 + %910 = fmul float %814, 0x3FB6A09E60000000, !dbg !70 + %911 = fmul float %815, 0x3FB6A09E60000000, !dbg !70 + %912 = fmul float %816, 0x3FB6A09E60000000, !dbg !70 + %913 = fmul float %817, 0x3FB6A09E60000000, !dbg !70 + %914 = fmul float %818, 0x3FB6A09E60000000, !dbg !70 + %915 = extractelement <16 x i32> %431, i64 15, !dbg !71 + %916 = icmp slt i32 %915, %12, !dbg !71 + %917 = extractelement <16 x i32> %431, i64 14, !dbg !71 + %918 = icmp slt i32 %917, %12, !dbg !71 + %919 = extractelement <16 x i32> %431, i64 13, !dbg !71 + %920 = icmp slt i32 %919, %12, !dbg !71 + %921 = extractelement <16 x i32> %431, i64 12, !dbg !71 + %922 = icmp slt i32 %921, %12, !dbg !71 + %923 = extractelement <16 x i32> %431, i64 11, !dbg !71 + %924 = icmp slt i32 %923, %12, !dbg !71 + %925 = extractelement <16 x i32> %431, i64 10, !dbg !71 + %926 = icmp slt i32 %925, %12, !dbg !71 + %927 = extractelement <16 x i32> %431, i64 9, !dbg !71 + %928 = icmp slt i32 %927, %12, !dbg !71 + %929 = extractelement <16 x i32> %431, i64 8, !dbg !71 + %930 = icmp slt i32 %929, %12, !dbg !71 + %931 = extractelement <16 x i32> %431, i64 7, !dbg !71 + %932 = icmp slt i32 %931, %12, !dbg !71 + %933 = extractelement <16 x i32> %431, i64 6, !dbg !71 + %934 = icmp slt i32 %933, %12, !dbg !71 + %935 = extractelement <16 x i32> %431, i64 5, !dbg !71 + %936 = icmp slt i32 %935, %12, !dbg !71 + %937 = extractelement <16 x i32> %431, i64 4, !dbg !71 + %938 = icmp slt i32 %937, %12, !dbg !71 + %939 = extractelement <16 x i32> %431, i64 3, !dbg !71 + %940 = icmp slt i32 %939, %12, !dbg !71 + %941 = extractelement <16 x i32> %431, i64 2, !dbg !71 + %942 = icmp slt i32 %941, %12, !dbg !71 + %943 = extractelement <16 x i32> %431, i64 1, !dbg !71 + %944 = icmp slt i32 %943, %12, !dbg !71 + %945 = extractelement <16 x i32> %431, i64 0, !dbg !71 + %946 = icmp slt i32 %945, %12, !dbg !71 + %947 = srem <16 x i32> %431, %351, !dbg !51 + %948 = shufflevector <16 x i32> %947, <16 x i32> poison, <32 x i32> , !dbg !51 + %949 = extractelement <16 x i32> %947, i64 15, !dbg !68 + %950 = icmp sge i32 %228, %949, !dbg !72 + %951 = extractelement <16 x i32> %947, i64 14, !dbg !68 + %952 = icmp sge i32 %228, %951, !dbg !72 + %953 = icmp sge i32 %230, %949, !dbg !72 + %954 = icmp sge i32 %230, %951, !dbg !72 + %955 = extractelement <16 x i32> %947, i64 13, !dbg !68 + %956 = icmp sge i32 %228, %955, !dbg !72 + %957 = extractelement <16 x i32> %947, i64 12, !dbg !68 + %958 = icmp sge i32 %228, %957, !dbg !72 + %959 = icmp sge i32 %230, %955, !dbg !72 + %960 = icmp sge i32 %230, %957, !dbg !72 + %961 = extractelement <16 x i32> %947, i64 11, !dbg !68 + %962 = icmp sge i32 %228, %961, !dbg !72 + %963 = extractelement <16 x i32> %947, i64 10, !dbg !68 + %964 = icmp sge i32 %228, %963, !dbg !72 + %965 = icmp sge i32 %230, %961, !dbg !72 + %966 = icmp sge i32 %230, %963, !dbg !72 + %967 = extractelement <16 x i32> %947, i64 9, !dbg !68 + %968 = icmp sge i32 %228, %967, !dbg !72 + %969 = extractelement <16 x i32> %947, i64 8, !dbg !68 + %970 = icmp sge i32 %228, %969, !dbg !72 + %971 = icmp sge i32 %230, %967, !dbg !72 + %972 = icmp sge i32 %230, %969, !dbg !72 + %973 = extractelement <16 x i32> %947, i64 7, !dbg !68 + %974 = icmp sge i32 %228, %973, !dbg !72 + %975 = extractelement <16 x i32> %947, i64 6, !dbg !68 + %976 = icmp sge i32 %228, %975, !dbg !72 + %977 = icmp sge i32 %230, %973, !dbg !72 + %978 = icmp sge i32 %230, %975, !dbg !72 + %979 = extractelement <16 x i32> %947, i64 5, !dbg !68 + %980 = icmp sge i32 %228, %979, !dbg !72 + %981 = extractelement <16 x i32> %947, i64 4, !dbg !68 + %982 = icmp sge i32 %228, %981, !dbg !72 + %983 = icmp sge i32 %230, %979, !dbg !72 + %984 = icmp sge i32 %230, %981, !dbg !72 + %985 = extractelement <16 x i32> %947, i64 3, !dbg !68 + %986 = icmp sge i32 %228, %985, !dbg !72 + %987 = extractelement <16 x i32> %947, i64 2, !dbg !68 + %988 = icmp sge i32 %228, %987, !dbg !72 + %989 = icmp sge i32 %230, %985, !dbg !72 + %990 = icmp sge i32 %230, %987, !dbg !72 + %991 = extractelement <16 x i32> %947, i64 1, !dbg !68 + %992 = icmp sge i32 %228, %991, !dbg !72 + %993 = extractelement <16 x i32> %947, i64 0, !dbg !68 + %994 = icmp sge i32 %228, %993, !dbg !72 + %995 = icmp sge i32 %230, %991, !dbg !72 + %996 = icmp sge i32 %230, %993, !dbg !72 + %997 = and i1 %232, %950, !dbg !73 + %998 = and i1 %232, %952, !dbg !73 + %999 = and i1 %233, %953, !dbg !73 + %1000 = and i1 %233, %954, !dbg !73 + %1001 = and i1 %232, %956, !dbg !73 + %1002 = and i1 %232, %958, !dbg !73 + %1003 = and i1 %233, %959, !dbg !73 + %1004 = and i1 %233, %960, !dbg !73 + %1005 = and i1 %232, %962, !dbg !73 + %1006 = and i1 %232, %964, !dbg !73 + %1007 = and i1 %233, %965, !dbg !73 + %1008 = and i1 %233, %966, !dbg !73 + %1009 = and i1 %232, %968, !dbg !73 + %1010 = and i1 %232, %970, !dbg !73 + %1011 = and i1 %233, %971, !dbg !73 + %1012 = and i1 %233, %972, !dbg !73 + %1013 = and i1 %232, %974, !dbg !73 + %1014 = and i1 %232, %976, !dbg !73 + %1015 = and i1 %233, %977, !dbg !73 + %1016 = and i1 %233, %978, !dbg !73 + %1017 = and i1 %232, %980, !dbg !73 + %1018 = and i1 %232, %982, !dbg !73 + %1019 = and i1 %233, %983, !dbg !73 + %1020 = and i1 %233, %984, !dbg !73 + %1021 = and i1 %232, %986, !dbg !73 + %1022 = and i1 %232, %988, !dbg !73 + %1023 = and i1 %233, %989, !dbg !73 + %1024 = and i1 %233, %990, !dbg !73 + %1025 = and i1 %232, %992, !dbg !73 + %1026 = and i1 %232, %994, !dbg !73 + %1027 = and i1 %233, %995, !dbg !73 + %1028 = and i1 %233, %996, !dbg !73 + %1029 = icmp sge i32 %949, %16, !dbg !74 + %1030 = icmp sge i32 %951, %16, !dbg !74 + %1031 = icmp sge i32 %955, %16, !dbg !74 + %1032 = icmp sge i32 %957, %16, !dbg !74 + %1033 = icmp sge i32 %961, %16, !dbg !74 + %1034 = icmp sge i32 %963, %16, !dbg !74 + %1035 = icmp sge i32 %967, %16, !dbg !74 + %1036 = icmp sge i32 %969, %16, !dbg !74 + %1037 = icmp sge i32 %973, %16, !dbg !74 + %1038 = icmp sge i32 %975, %16, !dbg !74 + %1039 = icmp sge i32 %979, %16, !dbg !74 + %1040 = icmp sge i32 %981, %16, !dbg !74 + %1041 = icmp sge i32 %985, %16, !dbg !74 + %1042 = icmp sge i32 %987, %16, !dbg !74 + %1043 = icmp sge i32 %991, %16, !dbg !74 + %1044 = icmp sge i32 %993, %16, !dbg !74 + %1045 = srem <16 x i32> %947, %355, !dbg !68 + %1046 = icmp ne <16 x i32> %1045, zeroinitializer, !dbg !75 + %1047 = xor <16 x i32> %1045, %355, !dbg !76 + %1048 = icmp slt <16 x i32> %1047, zeroinitializer, !dbg !76 + %1049 = and <16 x i1> %1046, %1048, !dbg !77 + %1050 = select <16 x i1> %1049, <16 x i32> %355, <16 x i32> zeroinitializer, !dbg !78 + %1051 = add <16 x i32> %1050, %1045, !dbg !78 + %1052 = sext <16 x i32> %1051 to <16 x i64>, !dbg !69 + %1053 = icmp sgt <16 x i64> %357, %1052, !dbg !69 + %1054 = extractelement <16 x i1> %1053, i64 15, !dbg !79 + %1055 = and i1 %1029, %1054, !dbg !79 + %1056 = extractelement <16 x i1> %1053, i64 14, !dbg !79 + %1057 = and i1 %1030, %1056, !dbg !79 + %1058 = extractelement <16 x i1> %1053, i64 13, !dbg !79 + %1059 = and i1 %1031, %1058, !dbg !79 + %1060 = extractelement <16 x i1> %1053, i64 12, !dbg !79 + %1061 = and i1 %1032, %1060, !dbg !79 + %1062 = extractelement <16 x i1> %1053, i64 11, !dbg !79 + %1063 = and i1 %1033, %1062, !dbg !79 + %1064 = extractelement <16 x i1> %1053, i64 10, !dbg !79 + %1065 = and i1 %1034, %1064, !dbg !79 + %1066 = extractelement <16 x i1> %1053, i64 9, !dbg !79 + %1067 = and i1 %1035, %1066, !dbg !79 + %1068 = extractelement <16 x i1> %1053, i64 8, !dbg !79 + %1069 = and i1 %1036, %1068, !dbg !79 + %1070 = extractelement <16 x i1> %1053, i64 7, !dbg !79 + %1071 = and i1 %1037, %1070, !dbg !79 + %1072 = extractelement <16 x i1> %1053, i64 6, !dbg !79 + %1073 = and i1 %1038, %1072, !dbg !79 + %1074 = extractelement <16 x i1> %1053, i64 5, !dbg !79 + %1075 = and i1 %1039, %1074, !dbg !79 + %1076 = extractelement <16 x i1> %1053, i64 4, !dbg !79 + %1077 = and i1 %1040, %1076, !dbg !79 + %1078 = extractelement <16 x i1> %1053, i64 3, !dbg !79 + %1079 = and i1 %1041, %1078, !dbg !79 + %1080 = extractelement <16 x i1> %1053, i64 2, !dbg !79 + %1081 = and i1 %1042, %1080, !dbg !79 + %1082 = extractelement <16 x i1> %1053, i64 1, !dbg !79 + %1083 = and i1 %1043, %1082, !dbg !79 + %1084 = extractelement <16 x i1> %1053, i64 0, !dbg !79 + %1085 = and i1 %1044, %1084, !dbg !79 + %1086 = sub <32 x i32> %948, %223, !dbg !80 + %1087 = srem <32 x i32> %1086, %353, !dbg !67 + %1088 = icmp ne <32 x i32> %1087, zeroinitializer, !dbg !81 + %1089 = xor <32 x i32> %1087, %353, !dbg !82 + %1090 = icmp slt <32 x i32> %1089, zeroinitializer, !dbg !82 + %1091 = and <32 x i1> %1088, %1090, !dbg !83 + %1092 = select <32 x i1> %1091, <32 x i32> %353, <32 x i32> zeroinitializer, !dbg !84 + %1093 = sub <32 x i32> zeroinitializer, %1092, !dbg !85 + %1094 = icmp eq <32 x i32> %1087, %1093, !dbg !85 + %1095 = extractelement <32 x i1> %1094, i64 31, !dbg !86 + %1096 = and i1 %1055, %1095, !dbg !86 + %1097 = extractelement <32 x i1> %1094, i64 30, !dbg !86 + %1098 = and i1 %1057, %1097, !dbg !86 + %1099 = extractelement <32 x i1> %1094, i64 29, !dbg !86 + %1100 = and i1 %1055, %1099, !dbg !86 + %1101 = extractelement <32 x i1> %1094, i64 28, !dbg !86 + %1102 = and i1 %1057, %1101, !dbg !86 + %1103 = extractelement <32 x i1> %1094, i64 27, !dbg !86 + %1104 = and i1 %1059, %1103, !dbg !86 + %1105 = extractelement <32 x i1> %1094, i64 26, !dbg !86 + %1106 = and i1 %1061, %1105, !dbg !86 + %1107 = extractelement <32 x i1> %1094, i64 25, !dbg !86 + %1108 = and i1 %1059, %1107, !dbg !86 + %1109 = extractelement <32 x i1> %1094, i64 24, !dbg !86 + %1110 = and i1 %1061, %1109, !dbg !86 + %1111 = extractelement <32 x i1> %1094, i64 23, !dbg !86 + %1112 = and i1 %1063, %1111, !dbg !86 + %1113 = extractelement <32 x i1> %1094, i64 22, !dbg !86 + %1114 = and i1 %1065, %1113, !dbg !86 + %1115 = extractelement <32 x i1> %1094, i64 21, !dbg !86 + %1116 = and i1 %1063, %1115, !dbg !86 + %1117 = extractelement <32 x i1> %1094, i64 20, !dbg !86 + %1118 = and i1 %1065, %1117, !dbg !86 + %1119 = extractelement <32 x i1> %1094, i64 19, !dbg !86 + %1120 = and i1 %1067, %1119, !dbg !86 + %1121 = extractelement <32 x i1> %1094, i64 18, !dbg !86 + %1122 = and i1 %1069, %1121, !dbg !86 + %1123 = extractelement <32 x i1> %1094, i64 17, !dbg !86 + %1124 = and i1 %1067, %1123, !dbg !86 + %1125 = extractelement <32 x i1> %1094, i64 16, !dbg !86 + %1126 = and i1 %1069, %1125, !dbg !86 + %1127 = extractelement <32 x i1> %1094, i64 15, !dbg !86 + %1128 = and i1 %1071, %1127, !dbg !86 + %1129 = extractelement <32 x i1> %1094, i64 14, !dbg !86 + %1130 = and i1 %1073, %1129, !dbg !86 + %1131 = extractelement <32 x i1> %1094, i64 13, !dbg !86 + %1132 = and i1 %1071, %1131, !dbg !86 + %1133 = extractelement <32 x i1> %1094, i64 12, !dbg !86 + %1134 = and i1 %1073, %1133, !dbg !86 + %1135 = extractelement <32 x i1> %1094, i64 11, !dbg !86 + %1136 = and i1 %1075, %1135, !dbg !86 + %1137 = extractelement <32 x i1> %1094, i64 10, !dbg !86 + %1138 = and i1 %1077, %1137, !dbg !86 + %1139 = extractelement <32 x i1> %1094, i64 9, !dbg !86 + %1140 = and i1 %1075, %1139, !dbg !86 + %1141 = extractelement <32 x i1> %1094, i64 8, !dbg !86 + %1142 = and i1 %1077, %1141, !dbg !86 + %1143 = extractelement <32 x i1> %1094, i64 7, !dbg !86 + %1144 = and i1 %1079, %1143, !dbg !86 + %1145 = extractelement <32 x i1> %1094, i64 6, !dbg !86 + %1146 = and i1 %1081, %1145, !dbg !86 + %1147 = extractelement <32 x i1> %1094, i64 5, !dbg !86 + %1148 = and i1 %1079, %1147, !dbg !86 + %1149 = extractelement <32 x i1> %1094, i64 4, !dbg !86 + %1150 = and i1 %1081, %1149, !dbg !86 + %1151 = extractelement <32 x i1> %1094, i64 3, !dbg !86 + %1152 = and i1 %1083, %1151, !dbg !86 + %1153 = extractelement <32 x i1> %1094, i64 2, !dbg !86 + %1154 = and i1 %1085, %1153, !dbg !86 + %1155 = extractelement <32 x i1> %1094, i64 1, !dbg !86 + %1156 = and i1 %1083, %1155, !dbg !86 + %1157 = extractelement <32 x i1> %1094, i64 0, !dbg !86 + %1158 = and i1 %1085, %1157, !dbg !86 + %1159 = or i1 %997, %1096, !dbg !87 + %1160 = or i1 %998, %1098, !dbg !87 + %1161 = or i1 %999, %1100, !dbg !87 + %1162 = or i1 %1000, %1102, !dbg !87 + %1163 = or i1 %1001, %1104, !dbg !87 + %1164 = or i1 %1002, %1106, !dbg !87 + %1165 = or i1 %1003, %1108, !dbg !87 + %1166 = or i1 %1004, %1110, !dbg !87 + %1167 = or i1 %1005, %1112, !dbg !87 + %1168 = or i1 %1006, %1114, !dbg !87 + %1169 = or i1 %1007, %1116, !dbg !87 + %1170 = or i1 %1008, %1118, !dbg !87 + %1171 = or i1 %1009, %1120, !dbg !87 + %1172 = or i1 %1010, %1122, !dbg !87 + %1173 = or i1 %1011, %1124, !dbg !87 + %1174 = or i1 %1012, %1126, !dbg !87 + %1175 = or i1 %1013, %1128, !dbg !87 + %1176 = or i1 %1014, %1130, !dbg !87 + %1177 = or i1 %1015, %1132, !dbg !87 + %1178 = or i1 %1016, %1134, !dbg !87 + %1179 = or i1 %1017, %1136, !dbg !87 + %1180 = or i1 %1018, %1138, !dbg !87 + %1181 = or i1 %1019, %1140, !dbg !87 + %1182 = or i1 %1020, %1142, !dbg !87 + %1183 = or i1 %1021, %1144, !dbg !87 + %1184 = or i1 %1022, %1146, !dbg !87 + %1185 = or i1 %1023, %1148, !dbg !87 + %1186 = or i1 %1024, %1150, !dbg !87 + %1187 = or i1 %1025, %1152, !dbg !87 + %1188 = or i1 %1026, %1154, !dbg !87 + %1189 = or i1 %1027, %1156, !dbg !87 + %1190 = or i1 %1028, %1158, !dbg !87 + %1191 = select i1 %916, i1 %1159, i1 false, !dbg !88 + %1192 = select i1 %918, i1 %1160, i1 false, !dbg !88 + %1193 = select i1 %916, i1 %1161, i1 false, !dbg !88 + %1194 = select i1 %918, i1 %1162, i1 false, !dbg !88 + %1195 = select i1 %920, i1 %1163, i1 false, !dbg !88 + %1196 = select i1 %922, i1 %1164, i1 false, !dbg !88 + %1197 = select i1 %920, i1 %1165, i1 false, !dbg !88 + %1198 = select i1 %922, i1 %1166, i1 false, !dbg !88 + %1199 = select i1 %924, i1 %1167, i1 false, !dbg !88 + %1200 = select i1 %926, i1 %1168, i1 false, !dbg !88 + %1201 = select i1 %924, i1 %1169, i1 false, !dbg !88 + %1202 = select i1 %926, i1 %1170, i1 false, !dbg !88 + %1203 = select i1 %928, i1 %1171, i1 false, !dbg !88 + %1204 = select i1 %930, i1 %1172, i1 false, !dbg !88 + %1205 = select i1 %928, i1 %1173, i1 false, !dbg !88 + %1206 = select i1 %930, i1 %1174, i1 false, !dbg !88 + %1207 = select i1 %932, i1 %1175, i1 false, !dbg !88 + %1208 = select i1 %934, i1 %1176, i1 false, !dbg !88 + %1209 = select i1 %932, i1 %1177, i1 false, !dbg !88 + %1210 = select i1 %934, i1 %1178, i1 false, !dbg !88 + %1211 = select i1 %936, i1 %1179, i1 false, !dbg !88 + %1212 = select i1 %938, i1 %1180, i1 false, !dbg !88 + %1213 = select i1 %936, i1 %1181, i1 false, !dbg !88 + %1214 = select i1 %938, i1 %1182, i1 false, !dbg !88 + %1215 = select i1 %940, i1 %1183, i1 false, !dbg !88 + %1216 = select i1 %942, i1 %1184, i1 false, !dbg !88 + %1217 = select i1 %940, i1 %1185, i1 false, !dbg !88 + %1218 = select i1 %942, i1 %1186, i1 false, !dbg !88 + %1219 = select i1 %944, i1 %1187, i1 false, !dbg !88 + %1220 = select i1 %946, i1 %1188, i1 false, !dbg !88 + %1221 = select i1 %944, i1 %1189, i1 false, !dbg !88 + %1222 = select i1 %946, i1 %1190, i1 false, !dbg !88 + %1223 = fmul float %883, 0x3FF7154760000000, !dbg !89 + %1224 = select i1 %1191, float %1223, float 0xFFF0000000000000, !dbg !90 + %1225 = fmul float %884, 0x3FF7154760000000, !dbg !89 + %1226 = select i1 %1192, float %1225, float 0xFFF0000000000000, !dbg !90 + %1227 = fmul float %885, 0x3FF7154760000000, !dbg !89 + %1228 = select i1 %1193, float %1227, float 0xFFF0000000000000, !dbg !90 + %1229 = fmul float %886, 0x3FF7154760000000, !dbg !89 + %1230 = select i1 %1194, float %1229, float 0xFFF0000000000000, !dbg !90 + %1231 = fmul float %887, 0x3FF7154760000000, !dbg !89 + %1232 = select i1 %1195, float %1231, float 0xFFF0000000000000, !dbg !90 + %1233 = fmul float %888, 0x3FF7154760000000, !dbg !89 + %1234 = select i1 %1196, float %1233, float 0xFFF0000000000000, !dbg !90 + %1235 = fmul float %889, 0x3FF7154760000000, !dbg !89 + %1236 = select i1 %1197, float %1235, float 0xFFF0000000000000, !dbg !90 + %1237 = fmul float %890, 0x3FF7154760000000, !dbg !89 + %1238 = select i1 %1198, float %1237, float 0xFFF0000000000000, !dbg !90 + %1239 = fmul float %891, 0x3FF7154760000000, !dbg !89 + %1240 = select i1 %1199, float %1239, float 0xFFF0000000000000, !dbg !90 + %1241 = fmul float %892, 0x3FF7154760000000, !dbg !89 + %1242 = select i1 %1200, float %1241, float 0xFFF0000000000000, !dbg !90 + %1243 = fmul float %893, 0x3FF7154760000000, !dbg !89 + %1244 = select i1 %1201, float %1243, float 0xFFF0000000000000, !dbg !90 + %1245 = fmul float %894, 0x3FF7154760000000, !dbg !89 + %1246 = select i1 %1202, float %1245, float 0xFFF0000000000000, !dbg !90 + %1247 = fmul float %895, 0x3FF7154760000000, !dbg !89 + %1248 = select i1 %1203, float %1247, float 0xFFF0000000000000, !dbg !90 + %1249 = fmul float %896, 0x3FF7154760000000, !dbg !89 + %1250 = select i1 %1204, float %1249, float 0xFFF0000000000000, !dbg !90 + %1251 = fmul float %897, 0x3FF7154760000000, !dbg !89 + %1252 = select i1 %1205, float %1251, float 0xFFF0000000000000, !dbg !90 + %1253 = fmul float %898, 0x3FF7154760000000, !dbg !89 + %1254 = select i1 %1206, float %1253, float 0xFFF0000000000000, !dbg !90 + %1255 = fmul float %899, 0x3FF7154760000000, !dbg !89 + %1256 = select i1 %1207, float %1255, float 0xFFF0000000000000, !dbg !90 + %1257 = fmul float %900, 0x3FF7154760000000, !dbg !89 + %1258 = select i1 %1208, float %1257, float 0xFFF0000000000000, !dbg !90 + %1259 = fmul float %901, 0x3FF7154760000000, !dbg !89 + %1260 = select i1 %1209, float %1259, float 0xFFF0000000000000, !dbg !90 + %1261 = fmul float %902, 0x3FF7154760000000, !dbg !89 + %1262 = select i1 %1210, float %1261, float 0xFFF0000000000000, !dbg !90 + %1263 = fmul float %903, 0x3FF7154760000000, !dbg !89 + %1264 = select i1 %1211, float %1263, float 0xFFF0000000000000, !dbg !90 + %1265 = fmul float %904, 0x3FF7154760000000, !dbg !89 + %1266 = select i1 %1212, float %1265, float 0xFFF0000000000000, !dbg !90 + %1267 = fmul float %905, 0x3FF7154760000000, !dbg !89 + %1268 = select i1 %1213, float %1267, float 0xFFF0000000000000, !dbg !90 + %1269 = fmul float %906, 0x3FF7154760000000, !dbg !89 + %1270 = select i1 %1214, float %1269, float 0xFFF0000000000000, !dbg !90 + %1271 = fmul float %907, 0x3FF7154760000000, !dbg !89 + %1272 = select i1 %1215, float %1271, float 0xFFF0000000000000, !dbg !90 + %1273 = fmul float %908, 0x3FF7154760000000, !dbg !89 + %1274 = select i1 %1216, float %1273, float 0xFFF0000000000000, !dbg !90 + %1275 = fmul float %909, 0x3FF7154760000000, !dbg !89 + %1276 = select i1 %1217, float %1275, float 0xFFF0000000000000, !dbg !90 + %1277 = fmul float %910, 0x3FF7154760000000, !dbg !89 + %1278 = select i1 %1218, float %1277, float 0xFFF0000000000000, !dbg !90 + %1279 = fmul float %911, 0x3FF7154760000000, !dbg !89 + %1280 = select i1 %1219, float %1279, float 0xFFF0000000000000, !dbg !90 + %1281 = fmul float %912, 0x3FF7154760000000, !dbg !89 + %1282 = select i1 %1220, float %1281, float 0xFFF0000000000000, !dbg !90 + %1283 = fmul float %913, 0x3FF7154760000000, !dbg !89 + %1284 = select i1 %1221, float %1283, float 0xFFF0000000000000, !dbg !90 + %1285 = fmul float %914, 0x3FF7154760000000, !dbg !89 + %1286 = select i1 %1222, float %1285, float 0xFFF0000000000000, !dbg !90 + %1287 = tail call float @llvm.maxnum.f32(float %1224, float %1226), !dbg !91 + %1288 = tail call float @llvm.maxnum.f32(float %1228, float %1230), !dbg !91 + %1289 = tail call float @llvm.maxnum.f32(float %1287, float %1232), !dbg !91 + %1290 = tail call float @llvm.maxnum.f32(float %1289, float %1234), !dbg !91 + %1291 = tail call float @llvm.maxnum.f32(float %1288, float %1236), !dbg !91 + %1292 = tail call float @llvm.maxnum.f32(float %1291, float %1238), !dbg !91 + %1293 = tail call float @llvm.maxnum.f32(float %1290, float %1240), !dbg !91 + %1294 = tail call float @llvm.maxnum.f32(float %1293, float %1242), !dbg !91 + %1295 = tail call float @llvm.maxnum.f32(float %1292, float %1244), !dbg !91 + %1296 = tail call float @llvm.maxnum.f32(float %1295, float %1246), !dbg !91 + %1297 = tail call float @llvm.maxnum.f32(float %1294, float %1248), !dbg !91 + %1298 = tail call float @llvm.maxnum.f32(float %1297, float %1250), !dbg !91 + %1299 = tail call float @llvm.maxnum.f32(float %1296, float %1252), !dbg !91 + %1300 = tail call float @llvm.maxnum.f32(float %1299, float %1254), !dbg !91 + %1301 = tail call float @llvm.maxnum.f32(float %1298, float %1256), !dbg !91 + %1302 = tail call float @llvm.maxnum.f32(float %1301, float %1258), !dbg !91 + %1303 = tail call float @llvm.maxnum.f32(float %1300, float %1260), !dbg !91 + %1304 = tail call float @llvm.maxnum.f32(float %1303, float %1262), !dbg !91 + %1305 = tail call float @llvm.maxnum.f32(float %1302, float %1264), !dbg !91 + %1306 = tail call float @llvm.maxnum.f32(float %1305, float %1266), !dbg !91 + %1307 = tail call float @llvm.maxnum.f32(float %1304, float %1268), !dbg !91 + %1308 = tail call float @llvm.maxnum.f32(float %1307, float %1270), !dbg !91 + %1309 = tail call float @llvm.maxnum.f32(float %1306, float %1272), !dbg !91 + %1310 = tail call float @llvm.maxnum.f32(float %1309, float %1274), !dbg !91 + %1311 = tail call float @llvm.maxnum.f32(float %1308, float %1276), !dbg !91 + %1312 = tail call float @llvm.maxnum.f32(float %1311, float %1278), !dbg !91 + %1313 = tail call float @llvm.maxnum.f32(float %1310, float %1280), !dbg !91 + %1314 = tail call float @llvm.maxnum.f32(float %1313, float %1282), !dbg !91 + %1315 = tail call float @llvm.maxnum.f32(float %1312, float %1284), !dbg !91 + %1316 = tail call float @llvm.maxnum.f32(float %1315, float %1286), !dbg !91 + %1317 = bitcast float %1314 to i32, !dbg !92 + %1318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1317, i32 2, i32 31), !dbg !92 + %1319 = bitcast i32 %1318 to float, !dbg !92 + %1320 = bitcast float %1316 to i32, !dbg !92 + %1321 = tail call float @llvm.maxnum.f32(float %1314, float %1319), !dbg !91 + %1322 = bitcast float %1321 to i32, !dbg !92 + %1323 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1322, i32 1, i32 31), !dbg !92 + %1324 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1320, i32 2, i32 31), !dbg !92 + %1325 = bitcast i32 %1324 to float, !dbg !92 + %1326 = tail call float @llvm.maxnum.f32(float %1316, float %1325), !dbg !91 + %1327 = bitcast float %1326 to i32, !dbg !92 + %1328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1327, i32 1, i32 31), !dbg !92 + %1329 = insertelement <2 x i32> poison, i32 %1323, i64 0, !dbg !92 + %1330 = insertelement <2 x i32> %1329, i32 %1328, i64 1, !dbg !92 + %1331 = bitcast <2 x i32> %1330 to <2 x float>, !dbg !92 + %1332 = insertelement <2 x float> poison, float %1321, i64 0, !dbg !91 + %1333 = insertelement <2 x float> %1332, float %1326, i64 1, !dbg !91 + %1334 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %1333, <2 x float> %1331), !dbg !91 + %1335 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %430, <2 x float> %1334), !dbg !93 + %1336 = extractelement <2 x float> %1335, i64 0, !dbg !94 + %1337 = fcmp oeq float %1336, 0xFFF0000000000000, !dbg !95 + %1338 = extractelement <2 x float> %1335, i64 1, !dbg !94 + %1339 = fcmp oeq float %1338, 0xFFF0000000000000, !dbg !95 + %1340 = select i1 %1337, float 0.000000e+00, float %1336, !dbg !94 + %1341 = select i1 %1339, float 0.000000e+00, float %1338, !dbg !94 + %1342 = extractelement <2 x float> %430, i64 0, !dbg !96 + %1343 = fsub float %1342, %1340, !dbg !96 + %1344 = extractelement <2 x float> %430, i64 1, !dbg !96 + %1345 = fsub float %1344, %1341, !dbg !96 + %1346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !97 + %.not.i254 = icmp eq i32 %1346, 0, !dbg !97 + br i1 %.not.i254, label %1349, label %1347, !dbg !97 + +1347: ; preds = %358 + %1348 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1343) #2, !dbg !97 + br label %__nv_exp2f.exit256, !dbg !97 + +1349: ; preds = %358 + %1350 = tail call float @llvm.nvvm.ex2.approx.f(float %1343) #2, !dbg !97 + br label %__nv_exp2f.exit256, !dbg !97 + +__nv_exp2f.exit256: ; preds = %1347, %1349 + %.0.i255 = phi float [ %1348, %1347 ], [ %1350, %1349 ], !dbg !97 + %1351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !97 + %.not.i257 = icmp eq i32 %1351, 0, !dbg !97 + br i1 %.not.i257, label %1354, label %1352, !dbg !97 + +1352: ; preds = %__nv_exp2f.exit256 + %1353 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1345) #2, !dbg !97 + br label %__nv_exp2f.exit259, !dbg !97 + +1354: ; preds = %__nv_exp2f.exit256 + %1355 = tail call float @llvm.nvvm.ex2.approx.f(float %1345) #2, !dbg !97 + br label %__nv_exp2f.exit259, !dbg !97 + +__nv_exp2f.exit259: ; preds = %1352, %1354 + %.0.i258 = phi float [ %1353, %1352 ], [ %1355, %1354 ], !dbg !97 + %1356 = fsub float %1224, %1340, !dbg !98 + %1357 = fsub float %1226, %1340, !dbg !98 + %1358 = fsub float %1228, %1341, !dbg !98 + %1359 = fsub float %1230, %1341, !dbg !98 + %1360 = fsub float %1232, %1340, !dbg !98 + %1361 = fsub float %1234, %1340, !dbg !98 + %1362 = fsub float %1236, %1341, !dbg !98 + %1363 = fsub float %1238, %1341, !dbg !98 + %1364 = fsub float %1240, %1340, !dbg !98 + %1365 = fsub float %1242, %1340, !dbg !98 + %1366 = fsub float %1244, %1341, !dbg !98 + %1367 = fsub float %1246, %1341, !dbg !98 + %1368 = fsub float %1248, %1340, !dbg !98 + %1369 = fsub float %1250, %1340, !dbg !98 + %1370 = fsub float %1252, %1341, !dbg !98 + %1371 = fsub float %1254, %1341, !dbg !98 + %1372 = fsub float %1256, %1340, !dbg !98 + %1373 = fsub float %1258, %1340, !dbg !98 + %1374 = fsub float %1260, %1341, !dbg !98 + %1375 = fsub float %1262, %1341, !dbg !98 + %1376 = fsub float %1264, %1340, !dbg !98 + %1377 = fsub float %1266, %1340, !dbg !98 + %1378 = fsub float %1268, %1341, !dbg !98 + %1379 = fsub float %1270, %1341, !dbg !98 + %1380 = fsub float %1272, %1340, !dbg !98 + %1381 = fsub float %1274, %1340, !dbg !98 + %1382 = fsub float %1276, %1341, !dbg !98 + %1383 = fsub float %1278, %1341, !dbg !98 + %1384 = fsub float %1280, %1340, !dbg !98 + %1385 = fsub float %1282, %1340, !dbg !98 + %1386 = fsub float %1284, %1341, !dbg !98 + %1387 = fsub float %1286, %1341, !dbg !98 + %1388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i260 = icmp eq i32 %1388, 0, !dbg !99 + br i1 %.not.i260, label %1391, label %1389, !dbg !99 + +1389: ; preds = %__nv_exp2f.exit259 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1356) #2, !dbg !99 + br label %__nv_exp2f.exit262, !dbg !99 + +1391: ; preds = %__nv_exp2f.exit259 + %1392 = tail call float @llvm.nvvm.ex2.approx.f(float %1356) #2, !dbg !99 + br label %__nv_exp2f.exit262, !dbg !99 + +__nv_exp2f.exit262: ; preds = %1389, %1391 + %.0.i261 = phi float [ %1390, %1389 ], [ %1392, %1391 ], !dbg !99 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i263 = icmp eq i32 %1393, 0, !dbg !99 + br i1 %.not.i263, label %1396, label %1394, !dbg !99 + +1394: ; preds = %__nv_exp2f.exit262 + %1395 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1357) #2, !dbg !99 + br label %__nv_exp2f.exit265, !dbg !99 + +1396: ; preds = %__nv_exp2f.exit262 + %1397 = tail call float @llvm.nvvm.ex2.approx.f(float %1357) #2, !dbg !99 + br label %__nv_exp2f.exit265, !dbg !99 + +__nv_exp2f.exit265: ; preds = %1394, %1396 + %.0.i264 = phi float [ %1395, %1394 ], [ %1397, %1396 ], !dbg !99 + %1398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i266 = icmp eq i32 %1398, 0, !dbg !99 + br i1 %.not.i266, label %1401, label %1399, !dbg !99 + +1399: ; preds = %__nv_exp2f.exit265 + %1400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1358) #2, !dbg !99 + br label %__nv_exp2f.exit268, !dbg !99 + +1401: ; preds = %__nv_exp2f.exit265 + %1402 = tail call float @llvm.nvvm.ex2.approx.f(float %1358) #2, !dbg !99 + br label %__nv_exp2f.exit268, !dbg !99 + +__nv_exp2f.exit268: ; preds = %1399, %1401 + %.0.i267 = phi float [ %1400, %1399 ], [ %1402, %1401 ], !dbg !99 + %1403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i269 = icmp eq i32 %1403, 0, !dbg !99 + br i1 %.not.i269, label %1406, label %1404, !dbg !99 + +1404: ; preds = %__nv_exp2f.exit268 + %1405 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1359) #2, !dbg !99 + br label %__nv_exp2f.exit271, !dbg !99 + +1406: ; preds = %__nv_exp2f.exit268 + %1407 = tail call float @llvm.nvvm.ex2.approx.f(float %1359) #2, !dbg !99 + br label %__nv_exp2f.exit271, !dbg !99 + +__nv_exp2f.exit271: ; preds = %1404, %1406 + %.0.i270 = phi float [ %1405, %1404 ], [ %1407, %1406 ], !dbg !99 + %1408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i272 = icmp eq i32 %1408, 0, !dbg !99 + br i1 %.not.i272, label %1411, label %1409, !dbg !99 + +1409: ; preds = %__nv_exp2f.exit271 + %1410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1360) #2, !dbg !99 + br label %__nv_exp2f.exit274, !dbg !99 + +1411: ; preds = %__nv_exp2f.exit271 + %1412 = tail call float @llvm.nvvm.ex2.approx.f(float %1360) #2, !dbg !99 + br label %__nv_exp2f.exit274, !dbg !99 + +__nv_exp2f.exit274: ; preds = %1409, %1411 + %.0.i273 = phi float [ %1410, %1409 ], [ %1412, %1411 ], !dbg !99 + %1413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i275 = icmp eq i32 %1413, 0, !dbg !99 + br i1 %.not.i275, label %1416, label %1414, !dbg !99 + +1414: ; preds = %__nv_exp2f.exit274 + %1415 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1361) #2, !dbg !99 + br label %__nv_exp2f.exit277, !dbg !99 + +1416: ; preds = %__nv_exp2f.exit274 + %1417 = tail call float @llvm.nvvm.ex2.approx.f(float %1361) #2, !dbg !99 + br label %__nv_exp2f.exit277, !dbg !99 + +__nv_exp2f.exit277: ; preds = %1414, %1416 + %.0.i276 = phi float [ %1415, %1414 ], [ %1417, %1416 ], !dbg !99 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i278 = icmp eq i32 %1418, 0, !dbg !99 + br i1 %.not.i278, label %1421, label %1419, !dbg !99 + +1419: ; preds = %__nv_exp2f.exit277 + %1420 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1362) #2, !dbg !99 + br label %__nv_exp2f.exit280, !dbg !99 + +1421: ; preds = %__nv_exp2f.exit277 + %1422 = tail call float @llvm.nvvm.ex2.approx.f(float %1362) #2, !dbg !99 + br label %__nv_exp2f.exit280, !dbg !99 + +__nv_exp2f.exit280: ; preds = %1419, %1421 + %.0.i279 = phi float [ %1420, %1419 ], [ %1422, %1421 ], !dbg !99 + %1423 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i281 = icmp eq i32 %1423, 0, !dbg !99 + br i1 %.not.i281, label %1426, label %1424, !dbg !99 + +1424: ; preds = %__nv_exp2f.exit280 + %1425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1363) #2, !dbg !99 + br label %__nv_exp2f.exit283, !dbg !99 + +1426: ; preds = %__nv_exp2f.exit280 + %1427 = tail call float @llvm.nvvm.ex2.approx.f(float %1363) #2, !dbg !99 + br label %__nv_exp2f.exit283, !dbg !99 + +__nv_exp2f.exit283: ; preds = %1424, %1426 + %.0.i282 = phi float [ %1425, %1424 ], [ %1427, %1426 ], !dbg !99 + %1428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i284 = icmp eq i32 %1428, 0, !dbg !99 + br i1 %.not.i284, label %1431, label %1429, !dbg !99 + +1429: ; preds = %__nv_exp2f.exit283 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1364) #2, !dbg !99 + br label %__nv_exp2f.exit286, !dbg !99 + +1431: ; preds = %__nv_exp2f.exit283 + %1432 = tail call float @llvm.nvvm.ex2.approx.f(float %1364) #2, !dbg !99 + br label %__nv_exp2f.exit286, !dbg !99 + +__nv_exp2f.exit286: ; preds = %1429, %1431 + %.0.i285 = phi float [ %1430, %1429 ], [ %1432, %1431 ], !dbg !99 + %1433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i287 = icmp eq i32 %1433, 0, !dbg !99 + br i1 %.not.i287, label %1436, label %1434, !dbg !99 + +1434: ; preds = %__nv_exp2f.exit286 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1365) #2, !dbg !99 + br label %__nv_exp2f.exit289, !dbg !99 + +1436: ; preds = %__nv_exp2f.exit286 + %1437 = tail call float @llvm.nvvm.ex2.approx.f(float %1365) #2, !dbg !99 + br label %__nv_exp2f.exit289, !dbg !99 + +__nv_exp2f.exit289: ; preds = %1434, %1436 + %.0.i288 = phi float [ %1435, %1434 ], [ %1437, %1436 ], !dbg !99 + %1438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i290 = icmp eq i32 %1438, 0, !dbg !99 + br i1 %.not.i290, label %1441, label %1439, !dbg !99 + +1439: ; preds = %__nv_exp2f.exit289 + %1440 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1366) #2, !dbg !99 + br label %__nv_exp2f.exit292, !dbg !99 + +1441: ; preds = %__nv_exp2f.exit289 + %1442 = tail call float @llvm.nvvm.ex2.approx.f(float %1366) #2, !dbg !99 + br label %__nv_exp2f.exit292, !dbg !99 + +__nv_exp2f.exit292: ; preds = %1439, %1441 + %.0.i291 = phi float [ %1440, %1439 ], [ %1442, %1441 ], !dbg !99 + %1443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i293 = icmp eq i32 %1443, 0, !dbg !99 + br i1 %.not.i293, label %1446, label %1444, !dbg !99 + +1444: ; preds = %__nv_exp2f.exit292 + %1445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1367) #2, !dbg !99 + br label %__nv_exp2f.exit295, !dbg !99 + +1446: ; preds = %__nv_exp2f.exit292 + %1447 = tail call float @llvm.nvvm.ex2.approx.f(float %1367) #2, !dbg !99 + br label %__nv_exp2f.exit295, !dbg !99 + +__nv_exp2f.exit295: ; preds = %1444, %1446 + %.0.i294 = phi float [ %1445, %1444 ], [ %1447, %1446 ], !dbg !99 + %1448 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i296 = icmp eq i32 %1448, 0, !dbg !99 + br i1 %.not.i296, label %1451, label %1449, !dbg !99 + +1449: ; preds = %__nv_exp2f.exit295 + %1450 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1368) #2, !dbg !99 + br label %__nv_exp2f.exit298, !dbg !99 + +1451: ; preds = %__nv_exp2f.exit295 + %1452 = tail call float @llvm.nvvm.ex2.approx.f(float %1368) #2, !dbg !99 + br label %__nv_exp2f.exit298, !dbg !99 + +__nv_exp2f.exit298: ; preds = %1449, %1451 + %.0.i297 = phi float [ %1450, %1449 ], [ %1452, %1451 ], !dbg !99 + %1453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i299 = icmp eq i32 %1453, 0, !dbg !99 + br i1 %.not.i299, label %1456, label %1454, !dbg !99 + +1454: ; preds = %__nv_exp2f.exit298 + %1455 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1369) #2, !dbg !99 + br label %__nv_exp2f.exit301, !dbg !99 + +1456: ; preds = %__nv_exp2f.exit298 + %1457 = tail call float @llvm.nvvm.ex2.approx.f(float %1369) #2, !dbg !99 + br label %__nv_exp2f.exit301, !dbg !99 + +__nv_exp2f.exit301: ; preds = %1454, %1456 + %.0.i300 = phi float [ %1455, %1454 ], [ %1457, %1456 ], !dbg !99 + %1458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i302 = icmp eq i32 %1458, 0, !dbg !99 + br i1 %.not.i302, label %1461, label %1459, !dbg !99 + +1459: ; preds = %__nv_exp2f.exit301 + %1460 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1370) #2, !dbg !99 + br label %__nv_exp2f.exit304, !dbg !99 + +1461: ; preds = %__nv_exp2f.exit301 + %1462 = tail call float @llvm.nvvm.ex2.approx.f(float %1370) #2, !dbg !99 + br label %__nv_exp2f.exit304, !dbg !99 + +__nv_exp2f.exit304: ; preds = %1459, %1461 + %.0.i303 = phi float [ %1460, %1459 ], [ %1462, %1461 ], !dbg !99 + %1463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i305 = icmp eq i32 %1463, 0, !dbg !99 + br i1 %.not.i305, label %1466, label %1464, !dbg !99 + +1464: ; preds = %__nv_exp2f.exit304 + %1465 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1371) #2, !dbg !99 + br label %__nv_exp2f.exit307, !dbg !99 + +1466: ; preds = %__nv_exp2f.exit304 + %1467 = tail call float @llvm.nvvm.ex2.approx.f(float %1371) #2, !dbg !99 + br label %__nv_exp2f.exit307, !dbg !99 + +__nv_exp2f.exit307: ; preds = %1464, %1466 + %.0.i306 = phi float [ %1465, %1464 ], [ %1467, %1466 ], !dbg !99 + %1468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i308 = icmp eq i32 %1468, 0, !dbg !99 + br i1 %.not.i308, label %1471, label %1469, !dbg !99 + +1469: ; preds = %__nv_exp2f.exit307 + %1470 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1372) #2, !dbg !99 + br label %__nv_exp2f.exit310, !dbg !99 + +1471: ; preds = %__nv_exp2f.exit307 + %1472 = tail call float @llvm.nvvm.ex2.approx.f(float %1372) #2, !dbg !99 + br label %__nv_exp2f.exit310, !dbg !99 + +__nv_exp2f.exit310: ; preds = %1469, %1471 + %.0.i309 = phi float [ %1470, %1469 ], [ %1472, %1471 ], !dbg !99 + %1473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i311 = icmp eq i32 %1473, 0, !dbg !99 + br i1 %.not.i311, label %1476, label %1474, !dbg !99 + +1474: ; preds = %__nv_exp2f.exit310 + %1475 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1373) #2, !dbg !99 + br label %__nv_exp2f.exit313, !dbg !99 + +1476: ; preds = %__nv_exp2f.exit310 + %1477 = tail call float @llvm.nvvm.ex2.approx.f(float %1373) #2, !dbg !99 + br label %__nv_exp2f.exit313, !dbg !99 + +__nv_exp2f.exit313: ; preds = %1474, %1476 + %.0.i312 = phi float [ %1475, %1474 ], [ %1477, %1476 ], !dbg !99 + %1478 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i314 = icmp eq i32 %1478, 0, !dbg !99 + br i1 %.not.i314, label %1481, label %1479, !dbg !99 + +1479: ; preds = %__nv_exp2f.exit313 + %1480 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1374) #2, !dbg !99 + br label %__nv_exp2f.exit316, !dbg !99 + +1481: ; preds = %__nv_exp2f.exit313 + %1482 = tail call float @llvm.nvvm.ex2.approx.f(float %1374) #2, !dbg !99 + br label %__nv_exp2f.exit316, !dbg !99 + +__nv_exp2f.exit316: ; preds = %1479, %1481 + %.0.i315 = phi float [ %1480, %1479 ], [ %1482, %1481 ], !dbg !99 + %1483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i317 = icmp eq i32 %1483, 0, !dbg !99 + br i1 %.not.i317, label %1486, label %1484, !dbg !99 + +1484: ; preds = %__nv_exp2f.exit316 + %1485 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1375) #2, !dbg !99 + br label %__nv_exp2f.exit319, !dbg !99 + +1486: ; preds = %__nv_exp2f.exit316 + %1487 = tail call float @llvm.nvvm.ex2.approx.f(float %1375) #2, !dbg !99 + br label %__nv_exp2f.exit319, !dbg !99 + +__nv_exp2f.exit319: ; preds = %1484, %1486 + %.0.i318 = phi float [ %1485, %1484 ], [ %1487, %1486 ], !dbg !99 + %1488 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i320 = icmp eq i32 %1488, 0, !dbg !99 + br i1 %.not.i320, label %1491, label %1489, !dbg !99 + +1489: ; preds = %__nv_exp2f.exit319 + %1490 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1376) #2, !dbg !99 + br label %__nv_exp2f.exit322, !dbg !99 + +1491: ; preds = %__nv_exp2f.exit319 + %1492 = tail call float @llvm.nvvm.ex2.approx.f(float %1376) #2, !dbg !99 + br label %__nv_exp2f.exit322, !dbg !99 + +__nv_exp2f.exit322: ; preds = %1489, %1491 + %.0.i321 = phi float [ %1490, %1489 ], [ %1492, %1491 ], !dbg !99 + %1493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i323 = icmp eq i32 %1493, 0, !dbg !99 + br i1 %.not.i323, label %1496, label %1494, !dbg !99 + +1494: ; preds = %__nv_exp2f.exit322 + %1495 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1377) #2, !dbg !99 + br label %__nv_exp2f.exit325, !dbg !99 + +1496: ; preds = %__nv_exp2f.exit322 + %1497 = tail call float @llvm.nvvm.ex2.approx.f(float %1377) #2, !dbg !99 + br label %__nv_exp2f.exit325, !dbg !99 + +__nv_exp2f.exit325: ; preds = %1494, %1496 + %.0.i324 = phi float [ %1495, %1494 ], [ %1497, %1496 ], !dbg !99 + %1498 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i326 = icmp eq i32 %1498, 0, !dbg !99 + br i1 %.not.i326, label %1501, label %1499, !dbg !99 + +1499: ; preds = %__nv_exp2f.exit325 + %1500 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1378) #2, !dbg !99 + br label %__nv_exp2f.exit328, !dbg !99 + +1501: ; preds = %__nv_exp2f.exit325 + %1502 = tail call float @llvm.nvvm.ex2.approx.f(float %1378) #2, !dbg !99 + br label %__nv_exp2f.exit328, !dbg !99 + +__nv_exp2f.exit328: ; preds = %1499, %1501 + %.0.i327 = phi float [ %1500, %1499 ], [ %1502, %1501 ], !dbg !99 + %1503 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i329 = icmp eq i32 %1503, 0, !dbg !99 + br i1 %.not.i329, label %1506, label %1504, !dbg !99 + +1504: ; preds = %__nv_exp2f.exit328 + %1505 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1379) #2, !dbg !99 + br label %__nv_exp2f.exit331, !dbg !99 + +1506: ; preds = %__nv_exp2f.exit328 + %1507 = tail call float @llvm.nvvm.ex2.approx.f(float %1379) #2, !dbg !99 + br label %__nv_exp2f.exit331, !dbg !99 + +__nv_exp2f.exit331: ; preds = %1504, %1506 + %.0.i330 = phi float [ %1505, %1504 ], [ %1507, %1506 ], !dbg !99 + %1508 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i332 = icmp eq i32 %1508, 0, !dbg !99 + br i1 %.not.i332, label %1511, label %1509, !dbg !99 + +1509: ; preds = %__nv_exp2f.exit331 + %1510 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1380) #2, !dbg !99 + br label %__nv_exp2f.exit334, !dbg !99 + +1511: ; preds = %__nv_exp2f.exit331 + %1512 = tail call float @llvm.nvvm.ex2.approx.f(float %1380) #2, !dbg !99 + br label %__nv_exp2f.exit334, !dbg !99 + +__nv_exp2f.exit334: ; preds = %1509, %1511 + %.0.i333 = phi float [ %1510, %1509 ], [ %1512, %1511 ], !dbg !99 + %1513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i335 = icmp eq i32 %1513, 0, !dbg !99 + br i1 %.not.i335, label %1516, label %1514, !dbg !99 + +1514: ; preds = %__nv_exp2f.exit334 + %1515 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1381) #2, !dbg !99 + br label %__nv_exp2f.exit337, !dbg !99 + +1516: ; preds = %__nv_exp2f.exit334 + %1517 = tail call float @llvm.nvvm.ex2.approx.f(float %1381) #2, !dbg !99 + br label %__nv_exp2f.exit337, !dbg !99 + +__nv_exp2f.exit337: ; preds = %1514, %1516 + %.0.i336 = phi float [ %1515, %1514 ], [ %1517, %1516 ], !dbg !99 + %1518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i338 = icmp eq i32 %1518, 0, !dbg !99 + br i1 %.not.i338, label %1521, label %1519, !dbg !99 + +1519: ; preds = %__nv_exp2f.exit337 + %1520 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1382) #2, !dbg !99 + br label %__nv_exp2f.exit340, !dbg !99 + +1521: ; preds = %__nv_exp2f.exit337 + %1522 = tail call float @llvm.nvvm.ex2.approx.f(float %1382) #2, !dbg !99 + br label %__nv_exp2f.exit340, !dbg !99 + +__nv_exp2f.exit340: ; preds = %1519, %1521 + %.0.i339 = phi float [ %1520, %1519 ], [ %1522, %1521 ], !dbg !99 + %1523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i341 = icmp eq i32 %1523, 0, !dbg !99 + br i1 %.not.i341, label %1526, label %1524, !dbg !99 + +1524: ; preds = %__nv_exp2f.exit340 + %1525 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1383) #2, !dbg !99 + br label %__nv_exp2f.exit343, !dbg !99 + +1526: ; preds = %__nv_exp2f.exit340 + %1527 = tail call float @llvm.nvvm.ex2.approx.f(float %1383) #2, !dbg !99 + br label %__nv_exp2f.exit343, !dbg !99 + +__nv_exp2f.exit343: ; preds = %1524, %1526 + %.0.i342 = phi float [ %1525, %1524 ], [ %1527, %1526 ], !dbg !99 + %1528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i344 = icmp eq i32 %1528, 0, !dbg !99 + br i1 %.not.i344, label %1531, label %1529, !dbg !99 + +1529: ; preds = %__nv_exp2f.exit343 + %1530 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1384) #2, !dbg !99 + br label %__nv_exp2f.exit346, !dbg !99 + +1531: ; preds = %__nv_exp2f.exit343 + %1532 = tail call float @llvm.nvvm.ex2.approx.f(float %1384) #2, !dbg !99 + br label %__nv_exp2f.exit346, !dbg !99 + +__nv_exp2f.exit346: ; preds = %1529, %1531 + %.0.i345 = phi float [ %1530, %1529 ], [ %1532, %1531 ], !dbg !99 + %1533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i347 = icmp eq i32 %1533, 0, !dbg !99 + br i1 %.not.i347, label %1536, label %1534, !dbg !99 + +1534: ; preds = %__nv_exp2f.exit346 + %1535 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1385) #2, !dbg !99 + br label %__nv_exp2f.exit349, !dbg !99 + +1536: ; preds = %__nv_exp2f.exit346 + %1537 = tail call float @llvm.nvvm.ex2.approx.f(float %1385) #2, !dbg !99 + br label %__nv_exp2f.exit349, !dbg !99 + +__nv_exp2f.exit349: ; preds = %1534, %1536 + %.0.i348 = phi float [ %1535, %1534 ], [ %1537, %1536 ], !dbg !99 + %1538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i350 = icmp eq i32 %1538, 0, !dbg !99 + br i1 %.not.i350, label %1541, label %1539, !dbg !99 + +1539: ; preds = %__nv_exp2f.exit349 + %1540 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1386) #2, !dbg !99 + br label %__nv_exp2f.exit352, !dbg !99 + +1541: ; preds = %__nv_exp2f.exit349 + %1542 = tail call float @llvm.nvvm.ex2.approx.f(float %1386) #2, !dbg !99 + br label %__nv_exp2f.exit352, !dbg !99 + +__nv_exp2f.exit352: ; preds = %1539, %1541 + %.0.i351 = phi float [ %1540, %1539 ], [ %1542, %1541 ], !dbg !99 + %1543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !99 + %.not.i353 = icmp eq i32 %1543, 0, !dbg !99 + br i1 %.not.i353, label %1546, label %1544, !dbg !99 + +1544: ; preds = %__nv_exp2f.exit352 + %1545 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1387) #2, !dbg !99 + br label %__nv_exp2f.exit355, !dbg !99 + +1546: ; preds = %__nv_exp2f.exit352 + %1547 = tail call float @llvm.nvvm.ex2.approx.f(float %1387) #2, !dbg !99 + br label %__nv_exp2f.exit355, !dbg !99 + +__nv_exp2f.exit355: ; preds = %1544, %1546 + %.0.i354 = phi float [ %1545, %1544 ], [ %1547, %1546 ], !dbg !99 + %1548 = fmul float %363, %.0.i255, !dbg !100 + %1549 = fmul float %364, %.0.i258, !dbg !100 + %1550 = fadd float %.0.i261, %.0.i264, !dbg !101 + %1551 = fadd float %.0.i267, %.0.i270, !dbg !101 + %1552 = fadd float %1550, %.0.i273, !dbg !101 + %1553 = fadd float %1552, %.0.i276, !dbg !101 + %1554 = fadd float %1551, %.0.i279, !dbg !101 + %1555 = fadd float %1554, %.0.i282, !dbg !101 + %1556 = fadd float %1553, %.0.i285, !dbg !101 + %1557 = fadd float %1556, %.0.i288, !dbg !101 + %1558 = fadd float %1555, %.0.i291, !dbg !101 + %1559 = fadd float %1558, %.0.i294, !dbg !101 + %1560 = fadd float %1557, %.0.i297, !dbg !101 + %1561 = fadd float %1560, %.0.i300, !dbg !101 + %1562 = fadd float %1559, %.0.i303, !dbg !101 + %1563 = fadd float %1562, %.0.i306, !dbg !101 + %1564 = fadd float %1561, %.0.i309, !dbg !101 + %1565 = fadd float %1564, %.0.i312, !dbg !101 + %1566 = fadd float %1563, %.0.i315, !dbg !101 + %1567 = fadd float %1566, %.0.i318, !dbg !101 + %1568 = fadd float %1565, %.0.i321, !dbg !101 + %1569 = fadd float %1568, %.0.i324, !dbg !101 + %1570 = fadd float %1567, %.0.i327, !dbg !101 + %1571 = fadd float %1570, %.0.i330, !dbg !101 + %1572 = fadd float %1569, %.0.i333, !dbg !101 + %1573 = fadd float %1572, %.0.i336, !dbg !101 + %1574 = fadd float %1571, %.0.i339, !dbg !101 + %1575 = fadd float %1574, %.0.i342, !dbg !101 + %1576 = fadd float %1573, %.0.i345, !dbg !101 + %1577 = fadd float %1576, %.0.i348, !dbg !101 + %1578 = fadd float %1575, %.0.i351, !dbg !101 + %1579 = fadd float %1578, %.0.i354, !dbg !101 + %1580 = bitcast float %1577 to i32, !dbg !102 + %1581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1580, i32 2, i32 31), !dbg !102 + %1582 = bitcast i32 %1581 to float, !dbg !102 + %1583 = fadd float %1577, %1582, !dbg !101 + %1584 = bitcast float %1583 to i32, !dbg !102 + %1585 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1584, i32 1, i32 31), !dbg !102 + %1586 = bitcast i32 %1585 to float, !dbg !102 + %1587 = fadd float %1583, %1586, !dbg !101 + %1588 = bitcast float %1579 to i32, !dbg !102 + %1589 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1588, i32 2, i32 31), !dbg !102 + %1590 = bitcast i32 %1589 to float, !dbg !102 + %1591 = fadd float %1579, %1590, !dbg !101 + %1592 = bitcast float %1591 to i32, !dbg !102 + %1593 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1592, i32 1, i32 31), !dbg !102 + %1594 = bitcast i32 %1593 to float, !dbg !102 + %1595 = fadd float %1591, %1594, !dbg !101 + %1596 = fadd float %1548, %1587, !dbg !103 + %1597 = fadd float %1549, %1595, !dbg !103 + %1598 = fmul float %819, %.0.i255, !dbg !104 + %1599 = fmul float %820, %.0.i255, !dbg !104 + %1600 = fmul float %821, %.0.i258, !dbg !104 + %1601 = fmul float %822, %.0.i258, !dbg !104 + %1602 = fmul float %823, %.0.i255, !dbg !104 + %1603 = fmul float %824, %.0.i255, !dbg !104 + %1604 = fmul float %825, %.0.i258, !dbg !104 + %1605 = fmul float %826, %.0.i258, !dbg !104 + %1606 = fmul float %827, %.0.i255, !dbg !104 + %1607 = fmul float %828, %.0.i255, !dbg !104 + %1608 = fmul float %829, %.0.i258, !dbg !104 + %1609 = fmul float %830, %.0.i258, !dbg !104 + %1610 = fmul float %831, %.0.i255, !dbg !104 + %1611 = fmul float %832, %.0.i255, !dbg !104 + %1612 = fmul float %833, %.0.i258, !dbg !104 + %1613 = fmul float %834, %.0.i258, !dbg !104 + %1614 = fmul float %835, %.0.i255, !dbg !104 + %1615 = fmul float %836, %.0.i255, !dbg !104 + %1616 = fmul float %837, %.0.i258, !dbg !104 + %1617 = fmul float %838, %.0.i258, !dbg !104 + %1618 = fmul float %839, %.0.i255, !dbg !104 + %1619 = fmul float %840, %.0.i255, !dbg !104 + %1620 = fmul float %841, %.0.i258, !dbg !104 + %1621 = fmul float %842, %.0.i258, !dbg !104 + %1622 = fmul float %843, %.0.i255, !dbg !104 + %1623 = fmul float %844, %.0.i255, !dbg !104 + %1624 = fmul float %845, %.0.i258, !dbg !104 + %1625 = fmul float %846, %.0.i258, !dbg !104 + %1626 = fmul float %847, %.0.i255, !dbg !104 + %1627 = fmul float %848, %.0.i255, !dbg !104 + %1628 = fmul float %849, %.0.i258, !dbg !104 + %1629 = fmul float %850, %.0.i258, !dbg !104 + %1630 = fmul float %851, %.0.i255, !dbg !104 + %1631 = fmul float %852, %.0.i255, !dbg !104 + %1632 = fmul float %853, %.0.i258, !dbg !104 + %1633 = fmul float %854, %.0.i258, !dbg !104 + %1634 = fmul float %855, %.0.i255, !dbg !104 + %1635 = fmul float %856, %.0.i255, !dbg !104 + %1636 = fmul float %857, %.0.i258, !dbg !104 + %1637 = fmul float %858, %.0.i258, !dbg !104 + %1638 = fmul float %859, %.0.i255, !dbg !104 + %1639 = fmul float %860, %.0.i255, !dbg !104 + %1640 = fmul float %861, %.0.i258, !dbg !104 + %1641 = fmul float %862, %.0.i258, !dbg !104 + %1642 = fmul float %863, %.0.i255, !dbg !104 + %1643 = fmul float %864, %.0.i255, !dbg !104 + %1644 = fmul float %865, %.0.i258, !dbg !104 + %1645 = fmul float %866, %.0.i258, !dbg !104 + %1646 = fmul float %867, %.0.i255, !dbg !104 + %1647 = fmul float %868, %.0.i255, !dbg !104 + %1648 = fmul float %869, %.0.i258, !dbg !104 + %1649 = fmul float %870, %.0.i258, !dbg !104 + %1650 = fmul float %871, %.0.i255, !dbg !104 + %1651 = fmul float %872, %.0.i255, !dbg !104 + %1652 = fmul float %873, %.0.i258, !dbg !104 + %1653 = fmul float %874, %.0.i258, !dbg !104 + %1654 = fmul float %875, %.0.i255, !dbg !104 + %1655 = fmul float %876, %.0.i255, !dbg !104 + %1656 = fmul float %877, %.0.i258, !dbg !104 + %1657 = fmul float %878, %.0.i258, !dbg !104 + %1658 = fmul float %879, %.0.i255, !dbg !104 + %1659 = fmul float %880, %.0.i255, !dbg !104 + %1660 = fmul float %881, %.0.i258, !dbg !104 + %1661 = fmul float %882, %.0.i258, !dbg !104 + %1662 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %437, !dbg !63 + %1663 = insertelement <2 x float> poison, float %.0.i261, i64 0, !dbg !105 + %1664 = insertelement <2 x float> %1663, float %.0.i264, i64 1, !dbg !105 + %1665 = fptrunc <2 x float> %1664 to <2 x bfloat>, !dbg !105 + %1666 = insertelement <2 x float> poison, float %.0.i267, i64 0, !dbg !105 + %1667 = insertelement <2 x float> %1666, float %.0.i270, i64 1, !dbg !105 + %1668 = fptrunc <2 x float> %1667 to <2 x bfloat>, !dbg !105 + %1669 = insertelement <2 x float> poison, float %.0.i273, i64 0, !dbg !105 + %1670 = insertelement <2 x float> %1669, float %.0.i276, i64 1, !dbg !105 + %1671 = fptrunc <2 x float> %1670 to <2 x bfloat>, !dbg !105 + %1672 = insertelement <2 x float> poison, float %.0.i279, i64 0, !dbg !105 + %1673 = insertelement <2 x float> %1672, float %.0.i282, i64 1, !dbg !105 + %1674 = fptrunc <2 x float> %1673 to <2 x bfloat>, !dbg !105 + %1675 = insertelement <2 x float> poison, float %.0.i285, i64 0, !dbg !105 + %1676 = insertelement <2 x float> %1675, float %.0.i288, i64 1, !dbg !105 + %1677 = fptrunc <2 x float> %1676 to <2 x bfloat>, !dbg !105 + %1678 = insertelement <2 x float> poison, float %.0.i291, i64 0, !dbg !105 + %1679 = insertelement <2 x float> %1678, float %.0.i294, i64 1, !dbg !105 + %1680 = fptrunc <2 x float> %1679 to <2 x bfloat>, !dbg !105 + %1681 = insertelement <2 x float> poison, float %.0.i297, i64 0, !dbg !105 + %1682 = insertelement <2 x float> %1681, float %.0.i300, i64 1, !dbg !105 + %1683 = fptrunc <2 x float> %1682 to <2 x bfloat>, !dbg !105 + %1684 = insertelement <2 x float> poison, float %.0.i303, i64 0, !dbg !105 + %1685 = insertelement <2 x float> %1684, float %.0.i306, i64 1, !dbg !105 + %1686 = fptrunc <2 x float> %1685 to <2 x bfloat>, !dbg !105 + %1687 = insertelement <2 x float> poison, float %.0.i309, i64 0, !dbg !105 + %1688 = insertelement <2 x float> %1687, float %.0.i312, i64 1, !dbg !105 + %1689 = fptrunc <2 x float> %1688 to <2 x bfloat>, !dbg !105 + %1690 = insertelement <2 x float> poison, float %.0.i315, i64 0, !dbg !105 + %1691 = insertelement <2 x float> %1690, float %.0.i318, i64 1, !dbg !105 + %1692 = fptrunc <2 x float> %1691 to <2 x bfloat>, !dbg !105 + %1693 = insertelement <2 x float> poison, float %.0.i321, i64 0, !dbg !105 + %1694 = insertelement <2 x float> %1693, float %.0.i324, i64 1, !dbg !105 + %1695 = fptrunc <2 x float> %1694 to <2 x bfloat>, !dbg !105 + %1696 = insertelement <2 x float> poison, float %.0.i327, i64 0, !dbg !105 + %1697 = insertelement <2 x float> %1696, float %.0.i330, i64 1, !dbg !105 + %1698 = fptrunc <2 x float> %1697 to <2 x bfloat>, !dbg !105 + %1699 = insertelement <2 x float> poison, float %.0.i333, i64 0, !dbg !105 + %1700 = insertelement <2 x float> %1699, float %.0.i336, i64 1, !dbg !105 + %1701 = fptrunc <2 x float> %1700 to <2 x bfloat>, !dbg !105 + %1702 = insertelement <2 x float> poison, float %.0.i339, i64 0, !dbg !105 + %1703 = insertelement <2 x float> %1702, float %.0.i342, i64 1, !dbg !105 + %1704 = fptrunc <2 x float> %1703 to <2 x bfloat>, !dbg !105 + %1705 = insertelement <2 x float> poison, float %.0.i345, i64 0, !dbg !105 + %1706 = insertelement <2 x float> %1705, float %.0.i348, i64 1, !dbg !105 + %1707 = fptrunc <2 x float> %1706 to <2 x bfloat>, !dbg !105 + %1708 = insertelement <2 x float> poison, float %.0.i351, i64 0, !dbg !105 + %1709 = insertelement <2 x float> %1708, float %.0.i354, i64 1, !dbg !105 + %1710 = fptrunc <2 x float> %1709 to <2 x bfloat>, !dbg !105 + %1711 = bitcast <2 x bfloat> %1665 to i32, !dbg !106 + %1712 = bitcast <2 x bfloat> %1668 to i32, !dbg !106 + %1713 = bitcast <2 x bfloat> %1671 to i32, !dbg !106 + %1714 = bitcast <2 x bfloat> %1674 to i32, !dbg !106 + %1715 = bitcast <2 x bfloat> %1677 to i32, !dbg !106 + %1716 = bitcast <2 x bfloat> %1680 to i32, !dbg !106 + %1717 = bitcast <2 x bfloat> %1683 to i32, !dbg !106 + %1718 = bitcast <2 x bfloat> %1686 to i32, !dbg !106 + %1719 = bitcast <2 x bfloat> %1689 to i32, !dbg !106 + %1720 = bitcast <2 x bfloat> %1692 to i32, !dbg !106 + %1721 = bitcast <2 x bfloat> %1695 to i32, !dbg !106 + %1722 = bitcast <2 x bfloat> %1698 to i32, !dbg !106 + %1723 = bitcast <2 x bfloat> %1701 to i32, !dbg !106 + %1724 = bitcast <2 x bfloat> %1704 to i32, !dbg !106 + %1725 = bitcast <2 x bfloat> %1707 to i32, !dbg !106 + %1726 = bitcast <2 x bfloat> %1710 to i32, !dbg !106 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !106 + %1727 = ptrtoint ptr addrspace(3) %1662 to i32, !dbg !106 + %1728 = lshr exact i32 %1727, 4, !dbg !106 + %1729 = and i32 %1728, 16383, !dbg !106 + %1730 = zext nneg i32 %1729 to i64, !dbg !106 + %1731 = or disjoint i64 %1730, 4611686293338849280, !dbg !106 + %1732 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, float %1628, float %1629, float %1630, float %1631, float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, float %1639, float %1640, float %1641, float %1642, float %1643, float %1644, float %1645, float %1646, float %1647, float %1648, float %1649, float %1650, float %1651, float %1652, float %1653, float %1654, float %1655, float %1656, float %1657, float %1658, float %1659, float %1660, float %1661, i32 %1711, i32 %1712, i32 %1713, i32 %1714, i64 %1731, i1 true) #2, !dbg !106 + %1733 = add i32 %1727, 2048, !dbg !106 + %1734 = lshr exact i32 %1733, 4, !dbg !106 + %1735 = and i32 %1734, 16383, !dbg !106 + %1736 = zext nneg i32 %1735 to i64, !dbg !106 + %1737 = or disjoint i64 %1736, 4611686293338849280, !dbg !106 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 0, !dbg !106 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 1, !dbg !106 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 2, !dbg !106 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 3, !dbg !106 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 4, !dbg !106 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 5, !dbg !106 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 6, !dbg !106 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 7, !dbg !106 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 8, !dbg !106 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 9, !dbg !106 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 10, !dbg !106 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 11, !dbg !106 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 12, !dbg !106 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 13, !dbg !106 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 14, !dbg !106 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 15, !dbg !106 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 16, !dbg !106 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 17, !dbg !106 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 18, !dbg !106 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 19, !dbg !106 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 20, !dbg !106 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 21, !dbg !106 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 22, !dbg !106 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 23, !dbg !106 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 24, !dbg !106 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 25, !dbg !106 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 26, !dbg !106 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 27, !dbg !106 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 28, !dbg !106 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 29, !dbg !106 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 30, !dbg !106 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 31, !dbg !106 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 32, !dbg !106 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 33, !dbg !106 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 34, !dbg !106 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 35, !dbg !106 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 36, !dbg !106 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 37, !dbg !106 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 38, !dbg !106 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 39, !dbg !106 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 40, !dbg !106 + %1779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 41, !dbg !106 + %1780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 42, !dbg !106 + %1781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 43, !dbg !106 + %1782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 44, !dbg !106 + %1783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 45, !dbg !106 + %1784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 46, !dbg !106 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 47, !dbg !106 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 48, !dbg !106 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 49, !dbg !106 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 50, !dbg !106 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 51, !dbg !106 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 52, !dbg !106 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 53, !dbg !106 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 54, !dbg !106 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 55, !dbg !106 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 56, !dbg !106 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 57, !dbg !106 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 58, !dbg !106 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 59, !dbg !106 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 60, !dbg !106 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 61, !dbg !106 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 62, !dbg !106 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1732, 63, !dbg !106 + %1802 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, float %1779, float %1780, float %1781, float %1782, float %1783, float %1784, float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, float %1800, float %1801, i32 %1715, i32 %1716, i32 %1717, i32 %1718, i64 %1737, i1 true) #2, !dbg !106 + %1803 = add i32 %1727, 4096, !dbg !106 + %1804 = lshr exact i32 %1803, 4, !dbg !106 + %1805 = and i32 %1804, 16383, !dbg !106 + %1806 = zext nneg i32 %1805 to i64, !dbg !106 + %1807 = or disjoint i64 %1806, 4611686293338849280, !dbg !106 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 0, !dbg !106 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 1, !dbg !106 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 2, !dbg !106 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 3, !dbg !106 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 4, !dbg !106 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 5, !dbg !106 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 6, !dbg !106 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 7, !dbg !106 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 8, !dbg !106 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 9, !dbg !106 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 10, !dbg !106 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 11, !dbg !106 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 12, !dbg !106 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 13, !dbg !106 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 14, !dbg !106 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 15, !dbg !106 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 16, !dbg !106 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 17, !dbg !106 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 18, !dbg !106 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 19, !dbg !106 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 20, !dbg !106 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 21, !dbg !106 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 22, !dbg !106 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 23, !dbg !106 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 24, !dbg !106 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 25, !dbg !106 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 26, !dbg !106 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 27, !dbg !106 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 28, !dbg !106 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 29, !dbg !106 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 30, !dbg !106 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 31, !dbg !106 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 32, !dbg !106 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 33, !dbg !106 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 34, !dbg !106 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 35, !dbg !106 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 36, !dbg !106 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 37, !dbg !106 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 38, !dbg !106 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 39, !dbg !106 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 40, !dbg !106 + %1849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 41, !dbg !106 + %1850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 42, !dbg !106 + %1851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 43, !dbg !106 + %1852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 44, !dbg !106 + %1853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 45, !dbg !106 + %1854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 46, !dbg !106 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 47, !dbg !106 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 48, !dbg !106 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 49, !dbg !106 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 50, !dbg !106 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 51, !dbg !106 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 52, !dbg !106 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 53, !dbg !106 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 54, !dbg !106 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 55, !dbg !106 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 56, !dbg !106 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 57, !dbg !106 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 58, !dbg !106 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 59, !dbg !106 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 60, !dbg !106 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 61, !dbg !106 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 62, !dbg !106 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1802, 63, !dbg !106 + %1872 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float %1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, float %1833, float %1834, float %1835, float %1836, float %1837, float %1838, float %1839, float %1840, float %1841, float %1842, float %1843, float %1844, float %1845, float %1846, float %1847, float %1848, float %1849, float %1850, float %1851, float %1852, float %1853, float %1854, float %1855, float %1856, float %1857, float %1858, float %1859, float %1860, float %1861, float %1862, float %1863, float %1864, float %1865, float %1866, float %1867, float %1868, float %1869, float %1870, float %1871, i32 %1719, i32 %1720, i32 %1721, i32 %1722, i64 %1807, i1 true) #2, !dbg !106 + %1873 = add i32 %1727, 6144, !dbg !106 + %1874 = lshr exact i32 %1873, 4, !dbg !106 + %1875 = and i32 %1874, 16383, !dbg !106 + %1876 = zext nneg i32 %1875 to i64, !dbg !106 + %1877 = or disjoint i64 %1876, 4611686293338849280, !dbg !106 + %1878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 0, !dbg !106 + %1879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 1, !dbg !106 + %1880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 2, !dbg !106 + %1881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 3, !dbg !106 + %1882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 4, !dbg !106 + %1883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 5, !dbg !106 + %1884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 6, !dbg !106 + %1885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 7, !dbg !106 + %1886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 8, !dbg !106 + %1887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 9, !dbg !106 + %1888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 10, !dbg !106 + %1889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 11, !dbg !106 + %1890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 12, !dbg !106 + %1891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 13, !dbg !106 + %1892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 14, !dbg !106 + %1893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 15, !dbg !106 + %1894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 16, !dbg !106 + %1895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 17, !dbg !106 + %1896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 18, !dbg !106 + %1897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 19, !dbg !106 + %1898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 20, !dbg !106 + %1899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 21, !dbg !106 + %1900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 22, !dbg !106 + %1901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 23, !dbg !106 + %1902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 24, !dbg !106 + %1903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 25, !dbg !106 + %1904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 26, !dbg !106 + %1905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 27, !dbg !106 + %1906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 28, !dbg !106 + %1907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 29, !dbg !106 + %1908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 30, !dbg !106 + %1909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 31, !dbg !106 + %1910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 32, !dbg !106 + %1911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 33, !dbg !106 + %1912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 34, !dbg !106 + %1913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 35, !dbg !106 + %1914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 36, !dbg !106 + %1915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 37, !dbg !106 + %1916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 38, !dbg !106 + %1917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 39, !dbg !106 + %1918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 40, !dbg !106 + %1919 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 41, !dbg !106 + %1920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 42, !dbg !106 + %1921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 43, !dbg !106 + %1922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 44, !dbg !106 + %1923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 45, !dbg !106 + %1924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 46, !dbg !106 + %1925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 47, !dbg !106 + %1926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 48, !dbg !106 + %1927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 49, !dbg !106 + %1928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 50, !dbg !106 + %1929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 51, !dbg !106 + %1930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 52, !dbg !106 + %1931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 53, !dbg !106 + %1932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 54, !dbg !106 + %1933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 55, !dbg !106 + %1934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 56, !dbg !106 + %1935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 57, !dbg !106 + %1936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 58, !dbg !106 + %1937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 59, !dbg !106 + %1938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 60, !dbg !106 + %1939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 61, !dbg !106 + %1940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 62, !dbg !106 + %1941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1872, 63, !dbg !106 + %1942 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1878, float %1879, float %1880, float %1881, float %1882, float %1883, float %1884, float %1885, float %1886, float %1887, float %1888, float %1889, float %1890, float %1891, float %1892, float %1893, float %1894, float %1895, float %1896, float %1897, float %1898, float %1899, float %1900, float %1901, float %1902, float %1903, float %1904, float %1905, float %1906, float %1907, float %1908, float %1909, float %1910, float %1911, float %1912, float %1913, float %1914, float %1915, float %1916, float %1917, float %1918, float %1919, float %1920, float %1921, float %1922, float %1923, float %1924, float %1925, float %1926, float %1927, float %1928, float %1929, float %1930, float %1931, float %1932, float %1933, float %1934, float %1935, float %1936, float %1937, float %1938, float %1939, float %1940, float %1941, i32 %1723, i32 %1724, i32 %1725, i32 %1726, i64 %1877, i1 true) #2, !dbg !106 + %1943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 0, !dbg !106 + %1944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 1, !dbg !106 + %1945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 2, !dbg !106 + %1946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 3, !dbg !106 + %1947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 4, !dbg !106 + %1948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 5, !dbg !106 + %1949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 6, !dbg !106 + %1950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 7, !dbg !106 + %1951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 8, !dbg !106 + %1952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 9, !dbg !106 + %1953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 10, !dbg !106 + %1954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 11, !dbg !106 + %1955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 12, !dbg !106 + %1956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 13, !dbg !106 + %1957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 14, !dbg !106 + %1958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 15, !dbg !106 + %1959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 16, !dbg !106 + %1960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 17, !dbg !106 + %1961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 18, !dbg !106 + %1962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 19, !dbg !106 + %1963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 20, !dbg !106 + %1964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 21, !dbg !106 + %1965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 22, !dbg !106 + %1966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 23, !dbg !106 + %1967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 24, !dbg !106 + %1968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 25, !dbg !106 + %1969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 26, !dbg !106 + %1970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 27, !dbg !106 + %1971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 28, !dbg !106 + %1972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 29, !dbg !106 + %1973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 30, !dbg !106 + %1974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 31, !dbg !106 + %1975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 32, !dbg !106 + %1976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 33, !dbg !106 + %1977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 34, !dbg !106 + %1978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 35, !dbg !106 + %1979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 36, !dbg !106 + %1980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 37, !dbg !106 + %1981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 38, !dbg !106 + %1982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 39, !dbg !106 + %1983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 40, !dbg !106 + %1984 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 41, !dbg !106 + %1985 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 42, !dbg !106 + %1986 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 43, !dbg !106 + %1987 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 44, !dbg !106 + %1988 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 45, !dbg !106 + %1989 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 46, !dbg !106 + %1990 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 47, !dbg !106 + %1991 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 48, !dbg !106 + %1992 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 49, !dbg !106 + %1993 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 50, !dbg !106 + %1994 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 51, !dbg !106 + %1995 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 52, !dbg !106 + %1996 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 53, !dbg !106 + %1997 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 54, !dbg !106 + %1998 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 55, !dbg !106 + %1999 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 56, !dbg !106 + %2000 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 57, !dbg !106 + %2001 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 58, !dbg !106 + %2002 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 59, !dbg !106 + %2003 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 60, !dbg !106 + %2004 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 61, !dbg !106 + %2005 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 62, !dbg !106 + %2006 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1942, 63, !dbg !106 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !106 + %2007 = insertelement <16 x i32> poison, i32 %359, i64 0, !dbg !107 + %2008 = shufflevector <16 x i32> %2007, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !107 + %2009 = add <16 x i32> %2008, %431, !dbg !107 + %2010 = add nuw nsw i32 %429, 1, !dbg !54 + %2011 = lshr i32 %2010, 1, !dbg !108 + %2012 = zext nneg i32 %2011 to i64, !dbg !109 + %2013 = getelementptr i32, ptr addrspace(1) %193, i64 %2012, !dbg !109 + %2014 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !110 + %2015 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2013, i64 %2014, i1 %433) #2, !dbg !110 + %2016 = add nuw nsw i32 %2011, 1, !dbg !111 + %2017 = icmp slt i32 %2016, %198, !dbg !112 + %2018 = getelementptr i8, ptr addrspace(1) %2013, i64 4, !dbg !113 + %2019 = and i1 %433, %2017, !dbg !54 + %2020 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !114 + %2021 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %2018, i64 %2020, i1 %2019) #2, !dbg !114 + %2022 = and i32 %429, 1, !dbg !115 + %2023 = sub i32 %2021, %2015, !dbg !116 + %2024 = shl i32 %2023, 7, !dbg !117 + %2025 = add i32 %2024, -64, !dbg !118 + %2026 = xor i32 %2022, 1, !dbg !119 + %2027 = mul nuw nsw i32 %2025, %2026, !dbg !119 + %2028 = shl nuw nsw i32 %2022, 6, !dbg !120 + %2029 = add i32 %2027, %2028, !dbg !121 + %2030 = add i32 %2029, %362, !dbg !122 + %2031 = add i32 %361, 1, !dbg !54 + %2032 = icmp sgt i32 %2031, 2, !dbg !54 + %2033 = select i1 %2032, i32 0, i32 %2031, !dbg !54 + %2034 = add i32 %2030, %195, !dbg !64 + %2035 = add i32 %2034, %43, !dbg !58 + %2036 = add i32 %2034, %44, !dbg !58 + %2037 = add i32 %2034, %45, !dbg !58 + %2038 = add i32 %2034, %46, !dbg !58 + %2039 = shl i32 %2035, 7, !dbg !59 + %2040 = shl i32 %2036, 7, !dbg !59 + %2041 = shl i32 %2037, 7, !dbg !59 + %2042 = shl i32 %2038, 7, !dbg !59 + %2043 = sext i32 %2039 to i64, !dbg !60 + %2044 = sext i32 %2040 to i64, !dbg !60 + %2045 = sext i32 %2041 to i64, !dbg !60 + %2046 = sext i32 %2042 to i64, !dbg !60 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2043, !dbg !61 + %gep396 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2044, !dbg !61 + %gep398 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2045, !dbg !61 + %gep400 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2046, !dbg !61 + %2047 = icmp slt i32 %2035, %12, !dbg !62 + %2048 = icmp slt i32 %2036, %12, !dbg !62 + %2049 = icmp slt i32 %2037, %12, !dbg !62 + %2050 = icmp slt i32 %2038, %12, !dbg !62 + %2051 = shl i32 %2033, 13, !dbg !63 + %2052 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2051, !dbg !63 + %2053 = and i1 %432, %2047, !dbg !54 + %2054 = and i1 %432, %2048, !dbg !54 + %2055 = and i1 %432, %2049, !dbg !54 + %2056 = and i1 %432, %2050, !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !63 + %2057 = getelementptr inbounds nuw i8, ptr addrspace(3) %2052, i32 %263, !dbg !63 + %2058 = select i1 %2053, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2057, ptr addrspace(1) %gep, i32 %2058) #2, !dbg !63 + %2059 = getelementptr inbounds nuw i8, ptr addrspace(3) %2052, i32 %266, !dbg !63 + %2060 = select i1 %2054, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2059, ptr addrspace(1) %gep396, i32 %2060) #2, !dbg !63 + %2061 = getelementptr inbounds nuw i8, ptr addrspace(3) %2052, i32 %269, !dbg !63 + %2062 = select i1 %2055, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2061, ptr addrspace(1) %gep398, i32 %2062) #2, !dbg !63 + %2063 = getelementptr inbounds nuw i8, ptr addrspace(3) %2052, i32 %272, !dbg !63 + %2064 = select i1 %2056, i32 16, i32 0, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2063, ptr addrspace(1) %gep400, i32 %2064) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + %gep402 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2043, !dbg !61 + %gep404 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2044, !dbg !61 + %gep406 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2045, !dbg !61 + %gep408 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2046, !dbg !61 + %2065 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2051, !dbg !63 + %2066 = getelementptr inbounds nuw i8, ptr addrspace(3) %2065, i32 %263, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2066, ptr addrspace(1) %gep402, i32 %2058) #2, !dbg !63 + %2067 = getelementptr inbounds nuw i8, ptr addrspace(3) %2065, i32 %266, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2067, ptr addrspace(1) %gep404, i32 %2060) #2, !dbg !63 + %2068 = getelementptr inbounds nuw i8, ptr addrspace(3) %2065, i32 %269, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2068, ptr addrspace(1) %gep406, i32 %2062) #2, !dbg !63 + %2069 = getelementptr inbounds nuw i8, ptr addrspace(3) %2065, i32 %272, !dbg !63 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2069, ptr addrspace(1) %gep408, i32 %2064) #2, !dbg !63 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !63 + %exitcond.not = icmp eq i32 %2010, %smax, !dbg !54 + br i1 %exitcond.not, label %._crit_edge, label %358, !dbg !54 + +._crit_edge: ; preds = %__nv_exp2f.exit355, %19 + %2070 = phi float [ 0.000000e+00, %19 ], [ %1943, %__nv_exp2f.exit355 ], !dbg !123 + %2071 = phi float [ 0.000000e+00, %19 ], [ %1944, %__nv_exp2f.exit355 ], !dbg !123 + %2072 = phi float [ 0.000000e+00, %19 ], [ %1945, %__nv_exp2f.exit355 ], !dbg !123 + %2073 = phi float [ 0.000000e+00, %19 ], [ %1946, %__nv_exp2f.exit355 ], !dbg !123 + %2074 = phi float [ 0.000000e+00, %19 ], [ %1947, %__nv_exp2f.exit355 ], !dbg !123 + %2075 = phi float [ 0.000000e+00, %19 ], [ %1948, %__nv_exp2f.exit355 ], !dbg !123 + %2076 = phi float [ 0.000000e+00, %19 ], [ %1949, %__nv_exp2f.exit355 ], !dbg !123 + %2077 = phi float [ 0.000000e+00, %19 ], [ %1950, %__nv_exp2f.exit355 ], !dbg !123 + %2078 = phi float [ 0.000000e+00, %19 ], [ %1951, %__nv_exp2f.exit355 ], !dbg !123 + %2079 = phi float [ 0.000000e+00, %19 ], [ %1952, %__nv_exp2f.exit355 ], !dbg !123 + %2080 = phi float [ 0.000000e+00, %19 ], [ %1953, %__nv_exp2f.exit355 ], !dbg !123 + %2081 = phi float [ 0.000000e+00, %19 ], [ %1954, %__nv_exp2f.exit355 ], !dbg !123 + %2082 = phi float [ 0.000000e+00, %19 ], [ %1955, %__nv_exp2f.exit355 ], !dbg !123 + %2083 = phi float [ 0.000000e+00, %19 ], [ %1956, %__nv_exp2f.exit355 ], !dbg !123 + %2084 = phi float [ 0.000000e+00, %19 ], [ %1957, %__nv_exp2f.exit355 ], !dbg !123 + %2085 = phi float [ 0.000000e+00, %19 ], [ %1958, %__nv_exp2f.exit355 ], !dbg !123 + %2086 = phi float [ 0.000000e+00, %19 ], [ %1959, %__nv_exp2f.exit355 ], !dbg !123 + %2087 = phi float [ 0.000000e+00, %19 ], [ %1960, %__nv_exp2f.exit355 ], !dbg !123 + %2088 = phi float [ 0.000000e+00, %19 ], [ %1961, %__nv_exp2f.exit355 ], !dbg !123 + %2089 = phi float [ 0.000000e+00, %19 ], [ %1962, %__nv_exp2f.exit355 ], !dbg !123 + %2090 = phi float [ 0.000000e+00, %19 ], [ %1963, %__nv_exp2f.exit355 ], !dbg !123 + %2091 = phi float [ 0.000000e+00, %19 ], [ %1964, %__nv_exp2f.exit355 ], !dbg !123 + %2092 = phi float [ 0.000000e+00, %19 ], [ %1965, %__nv_exp2f.exit355 ], !dbg !123 + %2093 = phi float [ 0.000000e+00, %19 ], [ %1966, %__nv_exp2f.exit355 ], !dbg !123 + %2094 = phi float [ 0.000000e+00, %19 ], [ %1967, %__nv_exp2f.exit355 ], !dbg !123 + %2095 = phi float [ 0.000000e+00, %19 ], [ %1968, %__nv_exp2f.exit355 ], !dbg !123 + %2096 = phi float [ 0.000000e+00, %19 ], [ %1969, %__nv_exp2f.exit355 ], !dbg !123 + %2097 = phi float [ 0.000000e+00, %19 ], [ %1970, %__nv_exp2f.exit355 ], !dbg !123 + %2098 = phi float [ 0.000000e+00, %19 ], [ %1971, %__nv_exp2f.exit355 ], !dbg !123 + %2099 = phi float [ 0.000000e+00, %19 ], [ %1972, %__nv_exp2f.exit355 ], !dbg !123 + %2100 = phi float [ 0.000000e+00, %19 ], [ %1973, %__nv_exp2f.exit355 ], !dbg !123 + %2101 = phi float [ 0.000000e+00, %19 ], [ %1974, %__nv_exp2f.exit355 ], !dbg !123 + %2102 = phi float [ 0.000000e+00, %19 ], [ %1975, %__nv_exp2f.exit355 ], !dbg !123 + %2103 = phi float [ 0.000000e+00, %19 ], [ %1976, %__nv_exp2f.exit355 ], !dbg !123 + %2104 = phi float [ 0.000000e+00, %19 ], [ %1977, %__nv_exp2f.exit355 ], !dbg !123 + %2105 = phi float [ 0.000000e+00, %19 ], [ %1978, %__nv_exp2f.exit355 ], !dbg !123 + %2106 = phi float [ 0.000000e+00, %19 ], [ %1979, %__nv_exp2f.exit355 ], !dbg !123 + %2107 = phi float [ 0.000000e+00, %19 ], [ %1980, %__nv_exp2f.exit355 ], !dbg !123 + %2108 = phi float [ 0.000000e+00, %19 ], [ %1981, %__nv_exp2f.exit355 ], !dbg !123 + %2109 = phi float [ 0.000000e+00, %19 ], [ %1982, %__nv_exp2f.exit355 ], !dbg !123 + %2110 = phi float [ 0.000000e+00, %19 ], [ %1983, %__nv_exp2f.exit355 ], !dbg !123 + %2111 = phi float [ 0.000000e+00, %19 ], [ %1984, %__nv_exp2f.exit355 ], !dbg !123 + %2112 = phi float [ 0.000000e+00, %19 ], [ %1985, %__nv_exp2f.exit355 ], !dbg !123 + %2113 = phi float [ 0.000000e+00, %19 ], [ %1986, %__nv_exp2f.exit355 ], !dbg !123 + %2114 = phi float [ 0.000000e+00, %19 ], [ %1987, %__nv_exp2f.exit355 ], !dbg !123 + %2115 = phi float [ 0.000000e+00, %19 ], [ %1988, %__nv_exp2f.exit355 ], !dbg !123 + %2116 = phi float [ 0.000000e+00, %19 ], [ %1989, %__nv_exp2f.exit355 ], !dbg !123 + %2117 = phi float [ 0.000000e+00, %19 ], [ %1990, %__nv_exp2f.exit355 ], !dbg !123 + %2118 = phi float [ 0.000000e+00, %19 ], [ %1991, %__nv_exp2f.exit355 ], !dbg !123 + %2119 = phi float [ 0.000000e+00, %19 ], [ %1992, %__nv_exp2f.exit355 ], !dbg !123 + %2120 = phi float [ 0.000000e+00, %19 ], [ %1993, %__nv_exp2f.exit355 ], !dbg !123 + %2121 = phi float [ 0.000000e+00, %19 ], [ %1994, %__nv_exp2f.exit355 ], !dbg !123 + %2122 = phi float [ 0.000000e+00, %19 ], [ %1995, %__nv_exp2f.exit355 ], !dbg !123 + %2123 = phi float [ 0.000000e+00, %19 ], [ %1996, %__nv_exp2f.exit355 ], !dbg !123 + %2124 = phi float [ 0.000000e+00, %19 ], [ %1997, %__nv_exp2f.exit355 ], !dbg !123 + %2125 = phi float [ 0.000000e+00, %19 ], [ %1998, %__nv_exp2f.exit355 ], !dbg !123 + %2126 = phi float [ 0.000000e+00, %19 ], [ %1999, %__nv_exp2f.exit355 ], !dbg !123 + %2127 = phi float [ 0.000000e+00, %19 ], [ %2000, %__nv_exp2f.exit355 ], !dbg !123 + %2128 = phi float [ 0.000000e+00, %19 ], [ %2001, %__nv_exp2f.exit355 ], !dbg !123 + %2129 = phi float [ 0.000000e+00, %19 ], [ %2002, %__nv_exp2f.exit355 ], !dbg !123 + %2130 = phi float [ 0.000000e+00, %19 ], [ %2003, %__nv_exp2f.exit355 ], !dbg !123 + %2131 = phi float [ 0.000000e+00, %19 ], [ %2004, %__nv_exp2f.exit355 ], !dbg !123 + %2132 = phi float [ 0.000000e+00, %19 ], [ %2005, %__nv_exp2f.exit355 ], !dbg !123 + %2133 = phi float [ 0.000000e+00, %19 ], [ %2006, %__nv_exp2f.exit355 ], !dbg !123 + %2134 = phi float [ 0.000000e+00, %19 ], [ %1596, %__nv_exp2f.exit355 ] + %2135 = phi float [ 0.000000e+00, %19 ], [ %1597, %__nv_exp2f.exit355 ] + %2136 = phi <2 x float> [ splat (float 0xFFF0000000000000), %19 ], [ %1335, %__nv_exp2f.exit355 ] + %2137 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, float %2080, float %2081, float %2082, float %2083, float %2084, float %2085, float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110, float %2111, float %2112, float %2113, float %2114, float %2115, float %2116, float %2117, float %2118, float %2119, float %2120, float %2121, float %2122, float %2123, float %2124, float %2125, float %2126, float %2127, float %2128, float %2129, float %2130, float %2131, float %2132, float %2133) #2, !dbg !54 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !54 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !54 + %2138 = getelementptr i32, ptr addrspace(1) %8, i64 %192, !dbg !124 + %2139 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2138) #2, !dbg !125 + %2140 = shl i32 %2139, 7, !dbg !126 + %2141 = getelementptr i32, ptr addrspace(1) %7, i64 %196, !dbg !127 + %2142 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2141) #2, !dbg !128 + %2143 = shl i32 %2142, 1, !dbg !129 + %2144 = tail call i32 @llvm.smin.i32(i32 %2143, i32 %202), !dbg !130 + %2145 = icmp sgt i32 %2143, 0, !dbg !131 + %2146 = or disjoint i32 %2140, %43, !dbg !133 + %2147 = or disjoint i32 %2140, %44, !dbg !133 + %2148 = or disjoint i32 %2140, %45, !dbg !133 + %2149 = or disjoint i32 %2140, %46, !dbg !133 + %2150 = shl i32 %2146, 7, !dbg !134 + %2151 = shl i32 %2147, 7, !dbg !134 + %2152 = shl i32 %2148, 7, !dbg !134 + %2153 = shl i32 %2149, 7, !dbg !134 + %2154 = sext i32 %2150 to i64, !dbg !135 + %2155 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2154, !dbg !135 + %2156 = sext i32 %2151 to i64, !dbg !135 + %2157 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2156, !dbg !135 + %2158 = sext i32 %2152 to i64, !dbg !135 + %2159 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2158, !dbg !135 + %2160 = sext i32 %2153 to i64, !dbg !135 + %2161 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2160, !dbg !135 + %2162 = getelementptr bfloat, ptr addrspace(1) %2155, i64 %86, !dbg !136 + %2163 = getelementptr bfloat, ptr addrspace(1) %2157, i64 %86, !dbg !136 + %2164 = getelementptr bfloat, ptr addrspace(1) %2159, i64 %86, !dbg !136 + %2165 = getelementptr bfloat, ptr addrspace(1) %2161, i64 %86, !dbg !136 + %2166 = icmp slt i32 %2146, %12, !dbg !137 + %2167 = icmp slt i32 %2147, %12, !dbg !137 + %2168 = icmp slt i32 %2148, %12, !dbg !137 + %2169 = icmp slt i32 %2149, %12, !dbg !137 + %2170 = and i1 %2145, %2166, !dbg !131 + %2171 = and i1 %2145, %2167, !dbg !131 + %2172 = and i1 %2145, %2168, !dbg !131 + %2173 = and i1 %2145, %2169, !dbg !131 + %2174 = select i1 %2170, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %264, ptr addrspace(1) %2162, i32 %2174) #2, !dbg !138 + %2175 = select i1 %2171, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %267, ptr addrspace(1) %2163, i32 %2175) #2, !dbg !138 + %2176 = select i1 %2172, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %270, ptr addrspace(1) %2164, i32 %2176) #2, !dbg !138 + %2177 = select i1 %2173, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %273, ptr addrspace(1) %2165, i32 %2177) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + %2178 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2154, !dbg !135 + %2179 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2156, !dbg !135 + %2180 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2158, !dbg !135 + %2181 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2160, !dbg !135 + %2182 = getelementptr bfloat, ptr addrspace(1) %2178, i64 %86, !dbg !136 + %2183 = getelementptr bfloat, ptr addrspace(1) %2179, i64 %86, !dbg !136 + %2184 = getelementptr bfloat, ptr addrspace(1) %2180, i64 %86, !dbg !136 + %2185 = getelementptr bfloat, ptr addrspace(1) %2181, i64 %86, !dbg !136 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %283, ptr addrspace(1) %2182, i32 %2174) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %284, ptr addrspace(1) %2183, i32 %2175) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %285, ptr addrspace(1) %2184, i32 %2176) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %286, ptr addrspace(1) %2185, i32 %2177) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + %2186 = icmp sgt i32 %2144, 1, !dbg !131 + %2187 = or disjoint i32 %2140, 64, !dbg !139 + %2188 = or disjoint i32 %2187, %43, !dbg !133 + %2189 = or disjoint i32 %2187, %44, !dbg !133 + %2190 = or disjoint i32 %2187, %45, !dbg !133 + %2191 = or disjoint i32 %2187, %46, !dbg !133 + %2192 = shl i32 %2188, 7, !dbg !134 + %2193 = shl i32 %2189, 7, !dbg !134 + %2194 = shl i32 %2190, 7, !dbg !134 + %2195 = shl i32 %2191, 7, !dbg !134 + %2196 = sext i32 %2192 to i64, !dbg !135 + %2197 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2196, !dbg !135 + %2198 = sext i32 %2193 to i64, !dbg !135 + %2199 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2198, !dbg !135 + %2200 = sext i32 %2194 to i64, !dbg !135 + %2201 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2200, !dbg !135 + %2202 = sext i32 %2195 to i64, !dbg !135 + %2203 = getelementptr bfloat, ptr addrspace(1) %34, i64 %2202, !dbg !135 + %2204 = getelementptr bfloat, ptr addrspace(1) %2197, i64 %86, !dbg !136 + %2205 = getelementptr bfloat, ptr addrspace(1) %2199, i64 %86, !dbg !136 + %2206 = getelementptr bfloat, ptr addrspace(1) %2201, i64 %86, !dbg !136 + %2207 = getelementptr bfloat, ptr addrspace(1) %2203, i64 %86, !dbg !136 + %2208 = icmp slt i32 %2188, %12, !dbg !137 + %2209 = icmp slt i32 %2189, %12, !dbg !137 + %2210 = icmp slt i32 %2190, %12, !dbg !137 + %2211 = icmp slt i32 %2191, %12, !dbg !137 + %2212 = and i1 %2186, %2208, !dbg !131 + %2213 = and i1 %2186, %2209, !dbg !131 + %2214 = and i1 %2186, %2210, !dbg !131 + %2215 = and i1 %2186, %2211, !dbg !131 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !138 + %2216 = select i1 %2212, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %317, ptr addrspace(1) %2204, i32 %2216) #2, !dbg !138 + %2217 = select i1 %2213, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %319, ptr addrspace(1) %2205, i32 %2217) #2, !dbg !138 + %2218 = select i1 %2214, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %321, ptr addrspace(1) %2206, i32 %2218) #2, !dbg !138 + %2219 = select i1 %2215, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %323, ptr addrspace(1) %2207, i32 %2219) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + %2220 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2196, !dbg !135 + %2221 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2198, !dbg !135 + %2222 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2200, !dbg !135 + %2223 = getelementptr bfloat, ptr addrspace(1) %35, i64 %2202, !dbg !135 + %2224 = getelementptr bfloat, ptr addrspace(1) %2220, i64 %86, !dbg !136 + %2225 = getelementptr bfloat, ptr addrspace(1) %2221, i64 %86, !dbg !136 + %2226 = getelementptr bfloat, ptr addrspace(1) %2222, i64 %86, !dbg !136 + %2227 = getelementptr bfloat, ptr addrspace(1) %2223, i64 %86, !dbg !136 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %333, ptr addrspace(1) %2224, i32 %2216) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %334, ptr addrspace(1) %2225, i32 %2217) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %335, ptr addrspace(1) %2226, i32 %2218) #2, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %336, ptr addrspace(1) %2227, i32 %2219) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !140 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 0, !dbg !131 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 1, !dbg !131 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 2, !dbg !131 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 3, !dbg !131 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 4, !dbg !131 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 5, !dbg !131 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 6, !dbg !131 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 7, !dbg !131 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 8, !dbg !131 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 9, !dbg !131 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 10, !dbg !131 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 11, !dbg !131 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 12, !dbg !131 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 13, !dbg !131 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 14, !dbg !131 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 15, !dbg !131 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 16, !dbg !131 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 17, !dbg !131 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 18, !dbg !131 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 19, !dbg !131 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 20, !dbg !131 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 21, !dbg !131 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 22, !dbg !131 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 23, !dbg !131 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 24, !dbg !131 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 25, !dbg !131 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 26, !dbg !131 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 27, !dbg !131 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 28, !dbg !131 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 29, !dbg !131 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 30, !dbg !131 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 31, !dbg !131 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 32, !dbg !131 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 33, !dbg !131 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 34, !dbg !131 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 35, !dbg !131 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 36, !dbg !131 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 37, !dbg !131 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 38, !dbg !131 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 39, !dbg !131 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 40, !dbg !131 + %2269 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 41, !dbg !131 + %2270 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 42, !dbg !131 + %2271 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 43, !dbg !131 + %2272 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 44, !dbg !131 + %2273 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 45, !dbg !131 + %2274 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 46, !dbg !131 + %2275 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 47, !dbg !131 + %2276 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 48, !dbg !131 + %2277 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 49, !dbg !131 + %2278 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 50, !dbg !131 + %2279 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 51, !dbg !131 + %2280 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 52, !dbg !131 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 53, !dbg !131 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 54, !dbg !131 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 55, !dbg !131 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 56, !dbg !131 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 57, !dbg !131 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 58, !dbg !131 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 59, !dbg !131 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 60, !dbg !131 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 61, !dbg !131 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 62, !dbg !131 + %2291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 63, !dbg !131 + br i1 %2145, label %.lr.ph460, label %._crit_edge461, !dbg !131 + +.lr.ph460: ; preds = %._crit_edge + %2292 = insertelement <16 x i32> poison, i32 %2140, i64 0, !dbg !141 + %2293 = shufflevector <16 x i32> %2292, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !141 + %2294 = shufflevector <2 x i32> %209, <2 x i32> poison, <16 x i32> , !dbg !141 + %2295 = insertelement <16 x i32> %2294, i32 %206, i64 14, !dbg !141 + %2296 = insertelement <16 x i32> %2295, i32 %205, i64 15, !dbg !141 + %2297 = shufflevector <8 x i32> %215, <8 x i32> poison, <16 x i32> , !dbg !141 + %2298 = shufflevector <16 x i32> %2297, <16 x i32> %2296, <16 x i32> , !dbg !141 + %2299 = shufflevector <4 x i32> %212, <4 x i32> poison, <16 x i32> , !dbg !141 + %2300 = shufflevector <16 x i32> %2298, <16 x i32> %2299, <16 x i32> , !dbg !141 + %2301 = or disjoint <16 x i32> %2293, %2300, !dbg !141 + %2302 = add nsw i32 %2144, -2 + %2303 = add nsw i32 %2144, -1 + %smax471 = tail call i32 @llvm.smax.i32(i32 %2144, i32 1), !dbg !131 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 0, !dbg !131 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 1, !dbg !131 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 2, !dbg !131 + %2307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 3, !dbg !131 + %2308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 4, !dbg !131 + %2309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 5, !dbg !131 + %2310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 6, !dbg !131 + %2311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 7, !dbg !131 + %2312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 8, !dbg !131 + %2313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 9, !dbg !131 + %2314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 10, !dbg !131 + %2315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 11, !dbg !131 + %2316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 12, !dbg !131 + %2317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 13, !dbg !131 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 14, !dbg !131 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 15, !dbg !131 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 16, !dbg !131 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 17, !dbg !131 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 18, !dbg !131 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 19, !dbg !131 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 20, !dbg !131 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 21, !dbg !131 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 22, !dbg !131 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 23, !dbg !131 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 24, !dbg !131 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 25, !dbg !131 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 26, !dbg !131 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 27, !dbg !131 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 28, !dbg !131 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 29, !dbg !131 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 30, !dbg !131 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 31, !dbg !131 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 32, !dbg !131 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 33, !dbg !131 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 34, !dbg !131 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 35, !dbg !131 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 36, !dbg !131 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 37, !dbg !131 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 38, !dbg !131 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 39, !dbg !131 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 40, !dbg !131 + %2345 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 41, !dbg !131 + %2346 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 42, !dbg !131 + %2347 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 43, !dbg !131 + %2348 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 44, !dbg !131 + %2349 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 45, !dbg !131 + %2350 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 46, !dbg !131 + %2351 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 47, !dbg !131 + %2352 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 48, !dbg !131 + %2353 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 49, !dbg !131 + %2354 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 50, !dbg !131 + %2355 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 51, !dbg !131 + %2356 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 52, !dbg !131 + %2357 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 53, !dbg !131 + %2358 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 54, !dbg !131 + %2359 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 55, !dbg !131 + %2360 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 56, !dbg !131 + %2361 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 57, !dbg !131 + %2362 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 58, !dbg !131 + %2363 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 59, !dbg !131 + %2364 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 60, !dbg !131 + %2365 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 61, !dbg !131 + %2366 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 62, !dbg !131 + %2367 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2137, 63, !dbg !131 + br label %2368, !dbg !131 + +2368: ; preds = %.lr.ph460, %__nv_exp2f.exit253 + %2369 = phi i32 [ 64, %.lr.ph460 ], [ %3697, %__nv_exp2f.exit253 ] + %2370 = phi i32 [ -1, %.lr.ph460 ], [ %2380, %__nv_exp2f.exit253 ] + %2371 = phi i32 [ 1, %.lr.ph460 ], [ %3701, %__nv_exp2f.exit253 ] + %2372 = phi i32 [ 64, %.lr.ph460 ], [ %3698, %__nv_exp2f.exit253 ] + %.pn = phi float [ %2304, %.lr.ph460 ], [ %3611, %__nv_exp2f.exit253 ] + %.pn479 = phi float [ %2305, %.lr.ph460 ], [ %3612, %__nv_exp2f.exit253 ] + %.pn480 = phi float [ %2306, %.lr.ph460 ], [ %3613, %__nv_exp2f.exit253 ] + %.pn481 = phi float [ %2307, %.lr.ph460 ], [ %3614, %__nv_exp2f.exit253 ] + %.pn482 = phi float [ %2308, %.lr.ph460 ], [ %3615, %__nv_exp2f.exit253 ] + %.pn483 = phi float [ %2309, %.lr.ph460 ], [ %3616, %__nv_exp2f.exit253 ] + %.pn484 = phi float [ %2310, %.lr.ph460 ], [ %3617, %__nv_exp2f.exit253 ] + %.pn485 = phi float [ %2311, %.lr.ph460 ], [ %3618, %__nv_exp2f.exit253 ] + %.pn486 = phi float [ %2312, %.lr.ph460 ], [ %3619, %__nv_exp2f.exit253 ] + %.pn487 = phi float [ %2313, %.lr.ph460 ], [ %3620, %__nv_exp2f.exit253 ] + %.pn488 = phi float [ %2314, %.lr.ph460 ], [ %3621, %__nv_exp2f.exit253 ] + %.pn489 = phi float [ %2315, %.lr.ph460 ], [ %3622, %__nv_exp2f.exit253 ] + %.pn490 = phi float [ %2316, %.lr.ph460 ], [ %3623, %__nv_exp2f.exit253 ] + %.pn491 = phi float [ %2317, %.lr.ph460 ], [ %3624, %__nv_exp2f.exit253 ] + %.pn492 = phi float [ %2318, %.lr.ph460 ], [ %3625, %__nv_exp2f.exit253 ] + %.pn493 = phi float [ %2319, %.lr.ph460 ], [ %3626, %__nv_exp2f.exit253 ] + %.pn494 = phi float [ %2320, %.lr.ph460 ], [ %3627, %__nv_exp2f.exit253 ] + %.pn495 = phi float [ %2321, %.lr.ph460 ], [ %3628, %__nv_exp2f.exit253 ] + %.pn496 = phi float [ %2322, %.lr.ph460 ], [ %3629, %__nv_exp2f.exit253 ] + %.pn497 = phi float [ %2323, %.lr.ph460 ], [ %3630, %__nv_exp2f.exit253 ] + %.pn498 = phi float [ %2324, %.lr.ph460 ], [ %3631, %__nv_exp2f.exit253 ] + %.pn499 = phi float [ %2325, %.lr.ph460 ], [ %3632, %__nv_exp2f.exit253 ] + %.pn500 = phi float [ %2326, %.lr.ph460 ], [ %3633, %__nv_exp2f.exit253 ] + %.pn501 = phi float [ %2327, %.lr.ph460 ], [ %3634, %__nv_exp2f.exit253 ] + %.pn502 = phi float [ %2328, %.lr.ph460 ], [ %3635, %__nv_exp2f.exit253 ] + %.pn503 = phi float [ %2329, %.lr.ph460 ], [ %3636, %__nv_exp2f.exit253 ] + %.pn504 = phi float [ %2330, %.lr.ph460 ], [ %3637, %__nv_exp2f.exit253 ] + %.pn505 = phi float [ %2331, %.lr.ph460 ], [ %3638, %__nv_exp2f.exit253 ] + %.pn506 = phi float [ %2332, %.lr.ph460 ], [ %3639, %__nv_exp2f.exit253 ] + %.pn507 = phi float [ %2333, %.lr.ph460 ], [ %3640, %__nv_exp2f.exit253 ] + %.pn508 = phi float [ %2334, %.lr.ph460 ], [ %3641, %__nv_exp2f.exit253 ] + %.pn509 = phi float [ %2335, %.lr.ph460 ], [ %3642, %__nv_exp2f.exit253 ] + %.pn510 = phi float [ %2336, %.lr.ph460 ], [ %3643, %__nv_exp2f.exit253 ] + %.pn511 = phi float [ %2337, %.lr.ph460 ], [ %3644, %__nv_exp2f.exit253 ] + %.pn512 = phi float [ %2338, %.lr.ph460 ], [ %3645, %__nv_exp2f.exit253 ] + %.pn513 = phi float [ %2339, %.lr.ph460 ], [ %3646, %__nv_exp2f.exit253 ] + %.pn514 = phi float [ %2340, %.lr.ph460 ], [ %3647, %__nv_exp2f.exit253 ] + %.pn515 = phi float [ %2341, %.lr.ph460 ], [ %3648, %__nv_exp2f.exit253 ] + %.pn516 = phi float [ %2342, %.lr.ph460 ], [ %3649, %__nv_exp2f.exit253 ] + %.pn517 = phi float [ %2343, %.lr.ph460 ], [ %3650, %__nv_exp2f.exit253 ] + %.pn518 = phi float [ %2344, %.lr.ph460 ], [ %3651, %__nv_exp2f.exit253 ] + %.pn519 = phi float [ %2345, %.lr.ph460 ], [ %3652, %__nv_exp2f.exit253 ] + %.pn520 = phi float [ %2346, %.lr.ph460 ], [ %3653, %__nv_exp2f.exit253 ] + %.pn521 = phi float [ %2347, %.lr.ph460 ], [ %3654, %__nv_exp2f.exit253 ] + %.pn522 = phi float [ %2348, %.lr.ph460 ], [ %3655, %__nv_exp2f.exit253 ] + %.pn523 = phi float [ %2349, %.lr.ph460 ], [ %3656, %__nv_exp2f.exit253 ] + %.pn524 = phi float [ %2350, %.lr.ph460 ], [ %3657, %__nv_exp2f.exit253 ] + %.pn525 = phi float [ %2351, %.lr.ph460 ], [ %3658, %__nv_exp2f.exit253 ] + %.pn526 = phi float [ %2352, %.lr.ph460 ], [ %3659, %__nv_exp2f.exit253 ] + %.pn527 = phi float [ %2353, %.lr.ph460 ], [ %3660, %__nv_exp2f.exit253 ] + %.pn528 = phi float [ %2354, %.lr.ph460 ], [ %3661, %__nv_exp2f.exit253 ] + %.pn529 = phi float [ %2355, %.lr.ph460 ], [ %3662, %__nv_exp2f.exit253 ] + %.pn530 = phi float [ %2356, %.lr.ph460 ], [ %3663, %__nv_exp2f.exit253 ] + %.pn531 = phi float [ %2357, %.lr.ph460 ], [ %3664, %__nv_exp2f.exit253 ] + %.pn532 = phi float [ %2358, %.lr.ph460 ], [ %3665, %__nv_exp2f.exit253 ] + %.pn533 = phi float [ %2359, %.lr.ph460 ], [ %3666, %__nv_exp2f.exit253 ] + %.pn534 = phi float [ %2360, %.lr.ph460 ], [ %3667, %__nv_exp2f.exit253 ] + %.pn535 = phi float [ %2361, %.lr.ph460 ], [ %3668, %__nv_exp2f.exit253 ] + %.pn536 = phi float [ %2362, %.lr.ph460 ], [ %3669, %__nv_exp2f.exit253 ] + %.pn537 = phi float [ %2363, %.lr.ph460 ], [ %3670, %__nv_exp2f.exit253 ] + %.pn538 = phi float [ %2364, %.lr.ph460 ], [ %3671, %__nv_exp2f.exit253 ] + %.pn539 = phi float [ %2365, %.lr.ph460 ], [ %3672, %__nv_exp2f.exit253 ] + %.pn540 = phi float [ %2366, %.lr.ph460 ], [ %3673, %__nv_exp2f.exit253 ] + %.pn541 = phi float [ %2367, %.lr.ph460 ], [ %3674, %__nv_exp2f.exit253 ] + %2373 = phi i32 [ 0, %.lr.ph460 ], [ %3678, %__nv_exp2f.exit253 ] + %.pn546 = phi float [ %2134, %.lr.ph460 ], [ %3264, %__nv_exp2f.exit253 ] + %.pn544 = phi float [ %2135, %.lr.ph460 ], [ %3265, %__nv_exp2f.exit253 ] + %2374 = phi <2 x float> [ %2136, %.lr.ph460 ], [ %3003, %__nv_exp2f.exit253 ] + %2375 = phi <16 x i32> [ %2301, %.lr.ph460 ], [ %3677, %__nv_exp2f.exit253 ] + %2376 = icmp slt i32 %2373, %2302, !dbg !131 + %2377 = icmp slt i32 %2373, %2303, !dbg !131 + %2378 = add i32 %2370, 1, !dbg !131 + %2379 = icmp sgt i32 %2378, 2, !dbg !131 + %2380 = select i1 %2379, i32 0, i32 %2378, !dbg !131 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !138 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !138 + %2381 = shl i32 %2380, 13, !dbg !138 + %2382 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2381, !dbg !138 + %2383 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %41, i32 0, i32 31), !dbg !140 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !140 + %2384 = shl i32 %2383, 11, !dbg !140 + %2385 = and i32 %2384, 8192, !dbg !140 + %2386 = add i32 %2385, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !140 + %2387 = lshr exact i32 %2386, 4, !dbg !140 + %2388 = and i32 %2387, 16383, !dbg !140 + %2389 = zext nneg i32 %2388 to i64, !dbg !140 + %2390 = or disjoint i64 %2389, 4611686293372403712, !dbg !140 + %2391 = ptrtoint ptr addrspace(3) %2382 to i32, !dbg !140 + %2392 = lshr exact i32 %2391, 4, !dbg !140 + %2393 = and i32 %2392, 16383, !dbg !140 + %2394 = zext nneg i32 %2393 to i64, !dbg !140 + %2395 = or disjoint i64 %2394, 4611686293338849280, !dbg !140 + %2396 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2390, i64 %2395) #2, !dbg !140 + %2397 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !140 + %2398 = lshr exact i32 %2397, 4, !dbg !140 + %2399 = and i32 %2398, 16383, !dbg !140 + %2400 = zext nneg i32 %2399 to i64, !dbg !140 + %2401 = or disjoint i64 %2400, 4611686293372403712, !dbg !140 + %2402 = add i32 %2391, 32, !dbg !140 + %2403 = lshr exact i32 %2402, 4, !dbg !140 + %2404 = and i32 %2403, 16383, !dbg !140 + %2405 = zext nneg i32 %2404 to i64, !dbg !140 + %2406 = or disjoint i64 %2405, 4611686293338849280, !dbg !140 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 0, !dbg !140 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 1, !dbg !140 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 2, !dbg !140 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 3, !dbg !140 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 4, !dbg !140 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 5, !dbg !140 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 6, !dbg !140 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 7, !dbg !140 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 8, !dbg !140 + %2416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 9, !dbg !140 + %2417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 10, !dbg !140 + %2418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 11, !dbg !140 + %2419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 12, !dbg !140 + %2420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 13, !dbg !140 + %2421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 14, !dbg !140 + %2422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 15, !dbg !140 + %2423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 16, !dbg !140 + %2424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 17, !dbg !140 + %2425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 18, !dbg !140 + %2426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 19, !dbg !140 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 20, !dbg !140 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 21, !dbg !140 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 22, !dbg !140 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 23, !dbg !140 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 24, !dbg !140 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 25, !dbg !140 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 26, !dbg !140 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 27, !dbg !140 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 28, !dbg !140 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 29, !dbg !140 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 30, !dbg !140 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2396, 31, !dbg !140 + %2439 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, float %2416, float %2417, float %2418, float %2419, float %2420, float %2421, float %2422, float %2423, float %2424, float %2425, float %2426, float %2427, float %2428, float %2429, float %2430, float %2431, float %2432, float %2433, float %2434, float %2435, float %2436, float %2437, float %2438, i64 %2401, i64 %2406, i1 true) #2, !dbg !140 + %2440 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !140 + %2441 = lshr exact i32 %2440, 4, !dbg !140 + %2442 = and i32 %2441, 16383, !dbg !140 + %2443 = zext nneg i32 %2442 to i64, !dbg !140 + %2444 = or disjoint i64 %2443, 4611686293372403712, !dbg !140 + %2445 = add i32 %2391, 64, !dbg !140 + %2446 = lshr exact i32 %2445, 4, !dbg !140 + %2447 = and i32 %2446, 16383, !dbg !140 + %2448 = zext nneg i32 %2447 to i64, !dbg !140 + %2449 = or disjoint i64 %2448, 4611686293338849280, !dbg !140 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 0, !dbg !140 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 1, !dbg !140 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 2, !dbg !140 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 3, !dbg !140 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 4, !dbg !140 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 5, !dbg !140 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 6, !dbg !140 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 7, !dbg !140 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 8, !dbg !140 + %2459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 9, !dbg !140 + %2460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 10, !dbg !140 + %2461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 11, !dbg !140 + %2462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 12, !dbg !140 + %2463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 13, !dbg !140 + %2464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 14, !dbg !140 + %2465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 15, !dbg !140 + %2466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 16, !dbg !140 + %2467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 17, !dbg !140 + %2468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 18, !dbg !140 + %2469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 19, !dbg !140 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 20, !dbg !140 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 21, !dbg !140 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 22, !dbg !140 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 23, !dbg !140 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 24, !dbg !140 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 25, !dbg !140 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 26, !dbg !140 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 27, !dbg !140 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 28, !dbg !140 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 29, !dbg !140 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 30, !dbg !140 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2439, 31, !dbg !140 + %2482 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2450, float %2451, float %2452, float %2453, float %2454, float %2455, float %2456, float %2457, float %2458, float %2459, float %2460, float %2461, float %2462, float %2463, float %2464, float %2465, float %2466, float %2467, float %2468, float %2469, float %2470, float %2471, float %2472, float %2473, float %2474, float %2475, float %2476, float %2477, float %2478, float %2479, float %2480, float %2481, i64 %2444, i64 %2449, i1 true) #2, !dbg !140 + %2483 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !140 + %2484 = lshr exact i32 %2483, 4, !dbg !140 + %2485 = and i32 %2484, 16383, !dbg !140 + %2486 = zext nneg i32 %2485 to i64, !dbg !140 + %2487 = or disjoint i64 %2486, 4611686293372403712, !dbg !140 + %2488 = add i32 %2391, 96, !dbg !140 + %2489 = lshr exact i32 %2488, 4, !dbg !140 + %2490 = and i32 %2489, 16383, !dbg !140 + %2491 = zext nneg i32 %2490 to i64, !dbg !140 + %2492 = or disjoint i64 %2491, 4611686293338849280, !dbg !140 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 0, !dbg !140 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 1, !dbg !140 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 2, !dbg !140 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 3, !dbg !140 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 4, !dbg !140 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 5, !dbg !140 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 6, !dbg !140 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 7, !dbg !140 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 8, !dbg !140 + %2502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 9, !dbg !140 + %2503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 10, !dbg !140 + %2504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 11, !dbg !140 + %2505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 12, !dbg !140 + %2506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 13, !dbg !140 + %2507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 14, !dbg !140 + %2508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 15, !dbg !140 + %2509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 16, !dbg !140 + %2510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 17, !dbg !140 + %2511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 18, !dbg !140 + %2512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 19, !dbg !140 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 20, !dbg !140 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 21, !dbg !140 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 22, !dbg !140 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 23, !dbg !140 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 24, !dbg !140 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 25, !dbg !140 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 26, !dbg !140 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 27, !dbg !140 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 28, !dbg !140 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 29, !dbg !140 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 30, !dbg !140 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2482, 31, !dbg !140 + %2525 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2493, float %2494, float %2495, float %2496, float %2497, float %2498, float %2499, float %2500, float %2501, float %2502, float %2503, float %2504, float %2505, float %2506, float %2507, float %2508, float %2509, float %2510, float %2511, float %2512, float %2513, float %2514, float %2515, float %2516, float %2517, float %2518, float %2519, float %2520, float %2521, float %2522, float %2523, float %2524, i64 %2487, i64 %2492, i1 true) #2, !dbg !140 + %2526 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !140 + %2527 = lshr exact i32 %2526, 4, !dbg !140 + %2528 = and i32 %2527, 16383, !dbg !140 + %2529 = zext nneg i32 %2528 to i64, !dbg !140 + %2530 = or disjoint i64 %2529, 4611686293372403712, !dbg !140 + %2531 = add i32 %2391, 8192, !dbg !140 + %2532 = lshr exact i32 %2531, 4, !dbg !140 + %2533 = and i32 %2532, 16383, !dbg !140 + %2534 = zext nneg i32 %2533 to i64, !dbg !140 + %2535 = or disjoint i64 %2534, 4611686293338849280, !dbg !140 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 0, !dbg !140 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 1, !dbg !140 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 2, !dbg !140 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 3, !dbg !140 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 4, !dbg !140 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 5, !dbg !140 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 6, !dbg !140 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 7, !dbg !140 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 8, !dbg !140 + %2545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 9, !dbg !140 + %2546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 10, !dbg !140 + %2547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 11, !dbg !140 + %2548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 12, !dbg !140 + %2549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 13, !dbg !140 + %2550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 14, !dbg !140 + %2551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 15, !dbg !140 + %2552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 16, !dbg !140 + %2553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 17, !dbg !140 + %2554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 18, !dbg !140 + %2555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 19, !dbg !140 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 20, !dbg !140 + %2557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 21, !dbg !140 + %2558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 22, !dbg !140 + %2559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 23, !dbg !140 + %2560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 24, !dbg !140 + %2561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 25, !dbg !140 + %2562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 26, !dbg !140 + %2563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 27, !dbg !140 + %2564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 28, !dbg !140 + %2565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 29, !dbg !140 + %2566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 30, !dbg !140 + %2567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2525, 31, !dbg !140 + %2568 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2536, float %2537, float %2538, float %2539, float %2540, float %2541, float %2542, float %2543, float %2544, float %2545, float %2546, float %2547, float %2548, float %2549, float %2550, float %2551, float %2552, float %2553, float %2554, float %2555, float %2556, float %2557, float %2558, float %2559, float %2560, float %2561, float %2562, float %2563, float %2564, float %2565, float %2566, float %2567, i64 %2530, i64 %2535, i1 true) #2, !dbg !140 + %2569 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !140 + %2570 = lshr exact i32 %2569, 4, !dbg !140 + %2571 = and i32 %2570, 16383, !dbg !140 + %2572 = zext nneg i32 %2571 to i64, !dbg !140 + %2573 = or disjoint i64 %2572, 4611686293372403712, !dbg !140 + %2574 = add i32 %2391, 8224, !dbg !140 + %2575 = lshr exact i32 %2574, 4, !dbg !140 + %2576 = and i32 %2575, 16383, !dbg !140 + %2577 = zext nneg i32 %2576 to i64, !dbg !140 + %2578 = or disjoint i64 %2577, 4611686293338849280, !dbg !140 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 0, !dbg !140 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 1, !dbg !140 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 2, !dbg !140 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 3, !dbg !140 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 4, !dbg !140 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 5, !dbg !140 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 6, !dbg !140 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 7, !dbg !140 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 8, !dbg !140 + %2588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 9, !dbg !140 + %2589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 10, !dbg !140 + %2590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 11, !dbg !140 + %2591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 12, !dbg !140 + %2592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 13, !dbg !140 + %2593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 14, !dbg !140 + %2594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 15, !dbg !140 + %2595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 16, !dbg !140 + %2596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 17, !dbg !140 + %2597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 18, !dbg !140 + %2598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 19, !dbg !140 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 20, !dbg !140 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 21, !dbg !140 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 22, !dbg !140 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 23, !dbg !140 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 24, !dbg !140 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 25, !dbg !140 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 26, !dbg !140 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 27, !dbg !140 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 28, !dbg !140 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 29, !dbg !140 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 30, !dbg !140 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2568, 31, !dbg !140 + %2611 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2579, float %2580, float %2581, float %2582, float %2583, float %2584, float %2585, float %2586, float %2587, float %2588, float %2589, float %2590, float %2591, float %2592, float %2593, float %2594, float %2595, float %2596, float %2597, float %2598, float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, i64 %2573, i64 %2578, i1 true) #2, !dbg !140 + %2612 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !140 + %2613 = lshr exact i32 %2612, 4, !dbg !140 + %2614 = and i32 %2613, 16383, !dbg !140 + %2615 = zext nneg i32 %2614 to i64, !dbg !140 + %2616 = or disjoint i64 %2615, 4611686293372403712, !dbg !140 + %2617 = add i32 %2391, 8256, !dbg !140 + %2618 = lshr exact i32 %2617, 4, !dbg !140 + %2619 = and i32 %2618, 16383, !dbg !140 + %2620 = zext nneg i32 %2619 to i64, !dbg !140 + %2621 = or disjoint i64 %2620, 4611686293338849280, !dbg !140 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 0, !dbg !140 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 1, !dbg !140 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 2, !dbg !140 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 3, !dbg !140 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 4, !dbg !140 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 5, !dbg !140 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 6, !dbg !140 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 7, !dbg !140 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 8, !dbg !140 + %2631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 9, !dbg !140 + %2632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 10, !dbg !140 + %2633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 11, !dbg !140 + %2634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 12, !dbg !140 + %2635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 13, !dbg !140 + %2636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 14, !dbg !140 + %2637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 15, !dbg !140 + %2638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 16, !dbg !140 + %2639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 17, !dbg !140 + %2640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 18, !dbg !140 + %2641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 19, !dbg !140 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 20, !dbg !140 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 21, !dbg !140 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 22, !dbg !140 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 23, !dbg !140 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 24, !dbg !140 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 25, !dbg !140 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 26, !dbg !140 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 27, !dbg !140 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 28, !dbg !140 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 29, !dbg !140 + %2652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 30, !dbg !140 + %2653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2611, 31, !dbg !140 + %2654 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, float %2629, float %2630, float %2631, float %2632, float %2633, float %2634, float %2635, float %2636, float %2637, float %2638, float %2639, float %2640, float %2641, float %2642, float %2643, float %2644, float %2645, float %2646, float %2647, float %2648, float %2649, float %2650, float %2651, float %2652, float %2653, i64 %2616, i64 %2621, i1 true) #2, !dbg !140 + %2655 = add i32 %2385, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !140 + %2656 = lshr exact i32 %2655, 4, !dbg !140 + %2657 = and i32 %2656, 16383, !dbg !140 + %2658 = zext nneg i32 %2657 to i64, !dbg !140 + %2659 = or disjoint i64 %2658, 4611686293372403712, !dbg !140 + %2660 = add i32 %2391, 8288, !dbg !140 + %2661 = lshr exact i32 %2660, 4, !dbg !140 + %2662 = and i32 %2661, 16383, !dbg !140 + %2663 = zext nneg i32 %2662 to i64, !dbg !140 + %2664 = or disjoint i64 %2663, 4611686293338849280, !dbg !140 + %2665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 0, !dbg !140 + %2666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 1, !dbg !140 + %2667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 2, !dbg !140 + %2668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 3, !dbg !140 + %2669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 4, !dbg !140 + %2670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 5, !dbg !140 + %2671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 6, !dbg !140 + %2672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 7, !dbg !140 + %2673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 8, !dbg !140 + %2674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 9, !dbg !140 + %2675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 10, !dbg !140 + %2676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 11, !dbg !140 + %2677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 12, !dbg !140 + %2678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 13, !dbg !140 + %2679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 14, !dbg !140 + %2680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 15, !dbg !140 + %2681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 16, !dbg !140 + %2682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 17, !dbg !140 + %2683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 18, !dbg !140 + %2684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 19, !dbg !140 + %2685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 20, !dbg !140 + %2686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 21, !dbg !140 + %2687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 22, !dbg !140 + %2688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 23, !dbg !140 + %2689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 24, !dbg !140 + %2690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 25, !dbg !140 + %2691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 26, !dbg !140 + %2692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 27, !dbg !140 + %2693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 28, !dbg !140 + %2694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 29, !dbg !140 + %2695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 30, !dbg !140 + %2696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2654, 31, !dbg !140 + %2697 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2665, float %2666, float %2667, float %2668, float %2669, float %2670, float %2671, float %2672, float %2673, float %2674, float %2675, float %2676, float %2677, float %2678, float %2679, float %2680, float %2681, float %2682, float %2683, float %2684, float %2685, float %2686, float %2687, float %2688, float %2689, float %2690, float %2691, float %2692, float %2693, float %2694, float %2695, float %2696, i64 %2659, i64 %2664, i1 true) #2, !dbg !140 + %2698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 0, !dbg !140 + %2699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 1, !dbg !140 + %2700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 2, !dbg !140 + %2701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 3, !dbg !140 + %2702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 4, !dbg !140 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 5, !dbg !140 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 6, !dbg !140 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 7, !dbg !140 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 8, !dbg !140 + %2707 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 9, !dbg !140 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 10, !dbg !140 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 11, !dbg !140 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 12, !dbg !140 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 13, !dbg !140 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 14, !dbg !140 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 15, !dbg !140 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 16, !dbg !140 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 17, !dbg !140 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 18, !dbg !140 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 19, !dbg !140 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 20, !dbg !140 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 21, !dbg !140 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 22, !dbg !140 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 23, !dbg !140 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 24, !dbg !140 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 25, !dbg !140 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 26, !dbg !140 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 27, !dbg !140 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 28, !dbg !140 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 29, !dbg !140 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 30, !dbg !140 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2697, 31, !dbg !140 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !140 + %2730 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %2698, float %2699, float %2700, float %2701, float %2702, float %2703, float %2704, float %2705, float %2706, float %2707, float %2708, float %2709, float %2710, float %2711, float %2712, float %2713, float %2714, float %2715, float %2716, float %2717, float %2718, float %2719, float %2720, float %2721, float %2722, float %2723, float %2724, float %2725, float %2726, float %2727, float %2728, float %2729, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2382, i32 0, i32 0, float %.pn, float %.pn479, float %.pn480, float %.pn481, float %.pn482, float %.pn483, float %.pn484, float %.pn485, float %.pn486, float %.pn487, float %.pn488, float %.pn489, float %.pn490, float %.pn491, float %.pn492, float %.pn493, float %.pn494, float %.pn495, float %.pn496, float %.pn497, float %.pn498, float %.pn499, float %.pn500, float %.pn501, float %.pn502, float %.pn503, float %.pn504, float %.pn505, float %.pn506, float %.pn507, float %.pn508, float %.pn509, float %.pn510, float %.pn511, float %.pn512, float %.pn513, float %.pn514, float %.pn515, float %.pn516, float %.pn517, float %.pn518, float %.pn519, float %.pn520, float %.pn521, float %.pn522, float %.pn523, float %.pn524, float %.pn525, float %.pn526, float %.pn527, float %.pn528, float %.pn529, float %.pn530, float %.pn531, float %.pn532, float %.pn533, float %.pn534, float %.pn535, float %.pn536, float %.pn537, float %.pn538, float %.pn539, float %.pn540, float %.pn541) #2, !dbg !140 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 0, !dbg !140 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 1, !dbg !140 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 2, !dbg !140 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 3, !dbg !140 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 4, !dbg !140 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 5, !dbg !140 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 6, !dbg !140 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 7, !dbg !140 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 8, !dbg !140 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 9, !dbg !140 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 10, !dbg !140 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 11, !dbg !140 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 12, !dbg !140 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 13, !dbg !140 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 14, !dbg !140 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 15, !dbg !140 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 16, !dbg !140 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 17, !dbg !140 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 18, !dbg !140 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 19, !dbg !140 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 20, !dbg !140 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 21, !dbg !140 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 22, !dbg !140 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 23, !dbg !140 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 24, !dbg !140 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 25, !dbg !140 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 26, !dbg !140 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 27, !dbg !140 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 28, !dbg !140 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 29, !dbg !140 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 30, !dbg !140 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 31, !dbg !140 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 38, !dbg !140 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 39, !dbg !140 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 40, !dbg !140 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 41, !dbg !140 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 42, !dbg !140 + %2768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 43, !dbg !140 + %2769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 44, !dbg !140 + %2770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 45, !dbg !140 + %2771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 46, !dbg !140 + %2772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 47, !dbg !140 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 48, !dbg !140 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 49, !dbg !140 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 50, !dbg !140 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 51, !dbg !140 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 52, !dbg !140 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 53, !dbg !140 + %2779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 54, !dbg !140 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 55, !dbg !140 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 56, !dbg !140 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 57, !dbg !140 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 58, !dbg !140 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 59, !dbg !140 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 60, !dbg !140 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 61, !dbg !140 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 62, !dbg !140 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 63, !dbg !140 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 64, !dbg !140 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 65, !dbg !140 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 66, !dbg !140 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 67, !dbg !140 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 68, !dbg !140 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 69, !dbg !140 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 70, !dbg !140 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 71, !dbg !140 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 72, !dbg !140 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 73, !dbg !140 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 74, !dbg !140 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 75, !dbg !140 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 76, !dbg !140 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 77, !dbg !140 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 78, !dbg !140 + %2804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 79, !dbg !140 + %2805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 80, !dbg !140 + %2806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 81, !dbg !140 + %2807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 82, !dbg !140 + %2808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 83, !dbg !140 + %2809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 84, !dbg !140 + %2810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 85, !dbg !140 + %2811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 86, !dbg !140 + %2812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 87, !dbg !140 + %2813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 88, !dbg !140 + %2814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 89, !dbg !140 + %2815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 90, !dbg !140 + %2816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 91, !dbg !140 + %2817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 92, !dbg !140 + %2818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 93, !dbg !140 + %2819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 94, !dbg !140 + %2820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 95, !dbg !140 + %2821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 96, !dbg !140 + %2822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 97, !dbg !140 + %2823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 98, !dbg !140 + %2824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 99, !dbg !140 + %2825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 100, !dbg !140 + %2826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2730, 101, !dbg !140 + %2827 = fmul float %2731, 0x3FB6A09E60000000, !dbg !142 + %2828 = fmul float %2732, 0x3FB6A09E60000000, !dbg !142 + %2829 = fmul float %2733, 0x3FB6A09E60000000, !dbg !142 + %2830 = fmul float %2734, 0x3FB6A09E60000000, !dbg !142 + %2831 = fmul float %2735, 0x3FB6A09E60000000, !dbg !142 + %2832 = fmul float %2736, 0x3FB6A09E60000000, !dbg !142 + %2833 = fmul float %2737, 0x3FB6A09E60000000, !dbg !142 + %2834 = fmul float %2738, 0x3FB6A09E60000000, !dbg !142 + %2835 = fmul float %2739, 0x3FB6A09E60000000, !dbg !142 + %2836 = fmul float %2740, 0x3FB6A09E60000000, !dbg !142 + %2837 = fmul float %2741, 0x3FB6A09E60000000, !dbg !142 + %2838 = fmul float %2742, 0x3FB6A09E60000000, !dbg !142 + %2839 = fmul float %2743, 0x3FB6A09E60000000, !dbg !142 + %2840 = fmul float %2744, 0x3FB6A09E60000000, !dbg !142 + %2841 = fmul float %2745, 0x3FB6A09E60000000, !dbg !142 + %2842 = fmul float %2746, 0x3FB6A09E60000000, !dbg !142 + %2843 = fmul float %2747, 0x3FB6A09E60000000, !dbg !142 + %2844 = fmul float %2748, 0x3FB6A09E60000000, !dbg !142 + %2845 = fmul float %2749, 0x3FB6A09E60000000, !dbg !142 + %2846 = fmul float %2750, 0x3FB6A09E60000000, !dbg !142 + %2847 = fmul float %2751, 0x3FB6A09E60000000, !dbg !142 + %2848 = fmul float %2752, 0x3FB6A09E60000000, !dbg !142 + %2849 = fmul float %2753, 0x3FB6A09E60000000, !dbg !142 + %2850 = fmul float %2754, 0x3FB6A09E60000000, !dbg !142 + %2851 = fmul float %2755, 0x3FB6A09E60000000, !dbg !142 + %2852 = fmul float %2756, 0x3FB6A09E60000000, !dbg !142 + %2853 = fmul float %2757, 0x3FB6A09E60000000, !dbg !142 + %2854 = fmul float %2758, 0x3FB6A09E60000000, !dbg !142 + %2855 = fmul float %2759, 0x3FB6A09E60000000, !dbg !142 + %2856 = fmul float %2760, 0x3FB6A09E60000000, !dbg !142 + %2857 = fmul float %2761, 0x3FB6A09E60000000, !dbg !142 + %2858 = fmul float %2762, 0x3FB6A09E60000000, !dbg !142 + %2859 = extractelement <16 x i32> %2375, i64 15, !dbg !143 + %2860 = icmp slt i32 %2859, %12, !dbg !143 + %2861 = extractelement <16 x i32> %2375, i64 14, !dbg !143 + %2862 = icmp slt i32 %2861, %12, !dbg !143 + %2863 = extractelement <16 x i32> %2375, i64 13, !dbg !143 + %2864 = icmp slt i32 %2863, %12, !dbg !143 + %2865 = extractelement <16 x i32> %2375, i64 12, !dbg !143 + %2866 = icmp slt i32 %2865, %12, !dbg !143 + %2867 = extractelement <16 x i32> %2375, i64 11, !dbg !143 + %2868 = icmp slt i32 %2867, %12, !dbg !143 + %2869 = extractelement <16 x i32> %2375, i64 10, !dbg !143 + %2870 = icmp slt i32 %2869, %12, !dbg !143 + %2871 = extractelement <16 x i32> %2375, i64 9, !dbg !143 + %2872 = icmp slt i32 %2871, %12, !dbg !143 + %2873 = extractelement <16 x i32> %2375, i64 8, !dbg !143 + %2874 = icmp slt i32 %2873, %12, !dbg !143 + %2875 = extractelement <16 x i32> %2375, i64 7, !dbg !143 + %2876 = icmp slt i32 %2875, %12, !dbg !143 + %2877 = extractelement <16 x i32> %2375, i64 6, !dbg !143 + %2878 = icmp slt i32 %2877, %12, !dbg !143 + %2879 = extractelement <16 x i32> %2375, i64 5, !dbg !143 + %2880 = icmp slt i32 %2879, %12, !dbg !143 + %2881 = extractelement <16 x i32> %2375, i64 4, !dbg !143 + %2882 = icmp slt i32 %2881, %12, !dbg !143 + %2883 = extractelement <16 x i32> %2375, i64 3, !dbg !143 + %2884 = icmp slt i32 %2883, %12, !dbg !143 + %2885 = extractelement <16 x i32> %2375, i64 2, !dbg !143 + %2886 = icmp slt i32 %2885, %12, !dbg !143 + %2887 = extractelement <16 x i32> %2375, i64 1, !dbg !143 + %2888 = icmp slt i32 %2887, %12, !dbg !143 + %2889 = extractelement <16 x i32> %2375, i64 0, !dbg !143 + %2890 = icmp slt i32 %2889, %12, !dbg !143 + %2891 = fmul float %2827, 0x3FF7154760000000, !dbg !144 + %2892 = select i1 %2860, float %2891, float 0xFFF0000000000000, !dbg !145 + %2893 = fmul float %2828, 0x3FF7154760000000, !dbg !144 + %2894 = select i1 %2862, float %2893, float 0xFFF0000000000000, !dbg !145 + %2895 = fmul float %2829, 0x3FF7154760000000, !dbg !144 + %2896 = select i1 %2860, float %2895, float 0xFFF0000000000000, !dbg !145 + %2897 = fmul float %2830, 0x3FF7154760000000, !dbg !144 + %2898 = select i1 %2862, float %2897, float 0xFFF0000000000000, !dbg !145 + %2899 = fmul float %2831, 0x3FF7154760000000, !dbg !144 + %2900 = select i1 %2864, float %2899, float 0xFFF0000000000000, !dbg !145 + %2901 = fmul float %2832, 0x3FF7154760000000, !dbg !144 + %2902 = select i1 %2866, float %2901, float 0xFFF0000000000000, !dbg !145 + %2903 = fmul float %2833, 0x3FF7154760000000, !dbg !144 + %2904 = select i1 %2864, float %2903, float 0xFFF0000000000000, !dbg !145 + %2905 = fmul float %2834, 0x3FF7154760000000, !dbg !144 + %2906 = select i1 %2866, float %2905, float 0xFFF0000000000000, !dbg !145 + %2907 = fmul float %2835, 0x3FF7154760000000, !dbg !144 + %2908 = select i1 %2868, float %2907, float 0xFFF0000000000000, !dbg !145 + %2909 = fmul float %2836, 0x3FF7154760000000, !dbg !144 + %2910 = select i1 %2870, float %2909, float 0xFFF0000000000000, !dbg !145 + %2911 = fmul float %2837, 0x3FF7154760000000, !dbg !144 + %2912 = select i1 %2868, float %2911, float 0xFFF0000000000000, !dbg !145 + %2913 = fmul float %2838, 0x3FF7154760000000, !dbg !144 + %2914 = select i1 %2870, float %2913, float 0xFFF0000000000000, !dbg !145 + %2915 = fmul float %2839, 0x3FF7154760000000, !dbg !144 + %2916 = select i1 %2872, float %2915, float 0xFFF0000000000000, !dbg !145 + %2917 = fmul float %2840, 0x3FF7154760000000, !dbg !144 + %2918 = select i1 %2874, float %2917, float 0xFFF0000000000000, !dbg !145 + %2919 = fmul float %2841, 0x3FF7154760000000, !dbg !144 + %2920 = select i1 %2872, float %2919, float 0xFFF0000000000000, !dbg !145 + %2921 = fmul float %2842, 0x3FF7154760000000, !dbg !144 + %2922 = select i1 %2874, float %2921, float 0xFFF0000000000000, !dbg !145 + %2923 = fmul float %2843, 0x3FF7154760000000, !dbg !144 + %2924 = select i1 %2876, float %2923, float 0xFFF0000000000000, !dbg !145 + %2925 = fmul float %2844, 0x3FF7154760000000, !dbg !144 + %2926 = select i1 %2878, float %2925, float 0xFFF0000000000000, !dbg !145 + %2927 = fmul float %2845, 0x3FF7154760000000, !dbg !144 + %2928 = select i1 %2876, float %2927, float 0xFFF0000000000000, !dbg !145 + %2929 = fmul float %2846, 0x3FF7154760000000, !dbg !144 + %2930 = select i1 %2878, float %2929, float 0xFFF0000000000000, !dbg !145 + %2931 = fmul float %2847, 0x3FF7154760000000, !dbg !144 + %2932 = select i1 %2880, float %2931, float 0xFFF0000000000000, !dbg !145 + %2933 = fmul float %2848, 0x3FF7154760000000, !dbg !144 + %2934 = select i1 %2882, float %2933, float 0xFFF0000000000000, !dbg !145 + %2935 = fmul float %2849, 0x3FF7154760000000, !dbg !144 + %2936 = select i1 %2880, float %2935, float 0xFFF0000000000000, !dbg !145 + %2937 = fmul float %2850, 0x3FF7154760000000, !dbg !144 + %2938 = select i1 %2882, float %2937, float 0xFFF0000000000000, !dbg !145 + %2939 = fmul float %2851, 0x3FF7154760000000, !dbg !144 + %2940 = select i1 %2884, float %2939, float 0xFFF0000000000000, !dbg !145 + %2941 = fmul float %2852, 0x3FF7154760000000, !dbg !144 + %2942 = select i1 %2886, float %2941, float 0xFFF0000000000000, !dbg !145 + %2943 = fmul float %2853, 0x3FF7154760000000, !dbg !144 + %2944 = select i1 %2884, float %2943, float 0xFFF0000000000000, !dbg !145 + %2945 = fmul float %2854, 0x3FF7154760000000, !dbg !144 + %2946 = select i1 %2886, float %2945, float 0xFFF0000000000000, !dbg !145 + %2947 = fmul float %2855, 0x3FF7154760000000, !dbg !144 + %2948 = select i1 %2888, float %2947, float 0xFFF0000000000000, !dbg !145 + %2949 = fmul float %2856, 0x3FF7154760000000, !dbg !144 + %2950 = select i1 %2890, float %2949, float 0xFFF0000000000000, !dbg !145 + %2951 = fmul float %2857, 0x3FF7154760000000, !dbg !144 + %2952 = select i1 %2888, float %2951, float 0xFFF0000000000000, !dbg !145 + %2953 = fmul float %2858, 0x3FF7154760000000, !dbg !144 + %2954 = select i1 %2890, float %2953, float 0xFFF0000000000000, !dbg !145 + %2955 = tail call float @llvm.maxnum.f32(float %2892, float %2894), !dbg !146 + %2956 = tail call float @llvm.maxnum.f32(float %2896, float %2898), !dbg !146 + %2957 = tail call float @llvm.maxnum.f32(float %2955, float %2900), !dbg !146 + %2958 = tail call float @llvm.maxnum.f32(float %2957, float %2902), !dbg !146 + %2959 = tail call float @llvm.maxnum.f32(float %2956, float %2904), !dbg !146 + %2960 = tail call float @llvm.maxnum.f32(float %2959, float %2906), !dbg !146 + %2961 = tail call float @llvm.maxnum.f32(float %2958, float %2908), !dbg !146 + %2962 = tail call float @llvm.maxnum.f32(float %2961, float %2910), !dbg !146 + %2963 = tail call float @llvm.maxnum.f32(float %2960, float %2912), !dbg !146 + %2964 = tail call float @llvm.maxnum.f32(float %2963, float %2914), !dbg !146 + %2965 = tail call float @llvm.maxnum.f32(float %2962, float %2916), !dbg !146 + %2966 = tail call float @llvm.maxnum.f32(float %2965, float %2918), !dbg !146 + %2967 = tail call float @llvm.maxnum.f32(float %2964, float %2920), !dbg !146 + %2968 = tail call float @llvm.maxnum.f32(float %2967, float %2922), !dbg !146 + %2969 = tail call float @llvm.maxnum.f32(float %2966, float %2924), !dbg !146 + %2970 = tail call float @llvm.maxnum.f32(float %2969, float %2926), !dbg !146 + %2971 = tail call float @llvm.maxnum.f32(float %2968, float %2928), !dbg !146 + %2972 = tail call float @llvm.maxnum.f32(float %2971, float %2930), !dbg !146 + %2973 = tail call float @llvm.maxnum.f32(float %2970, float %2932), !dbg !146 + %2974 = tail call float @llvm.maxnum.f32(float %2973, float %2934), !dbg !146 + %2975 = tail call float @llvm.maxnum.f32(float %2972, float %2936), !dbg !146 + %2976 = tail call float @llvm.maxnum.f32(float %2975, float %2938), !dbg !146 + %2977 = tail call float @llvm.maxnum.f32(float %2974, float %2940), !dbg !146 + %2978 = tail call float @llvm.maxnum.f32(float %2977, float %2942), !dbg !146 + %2979 = tail call float @llvm.maxnum.f32(float %2976, float %2944), !dbg !146 + %2980 = tail call float @llvm.maxnum.f32(float %2979, float %2946), !dbg !146 + %2981 = tail call float @llvm.maxnum.f32(float %2978, float %2948), !dbg !146 + %2982 = tail call float @llvm.maxnum.f32(float %2981, float %2950), !dbg !146 + %2983 = tail call float @llvm.maxnum.f32(float %2980, float %2952), !dbg !146 + %2984 = tail call float @llvm.maxnum.f32(float %2983, float %2954), !dbg !146 + %2985 = bitcast float %2982 to i32, !dbg !147 + %2986 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2985, i32 2, i32 31), !dbg !147 + %2987 = bitcast i32 %2986 to float, !dbg !147 + %2988 = bitcast float %2984 to i32, !dbg !147 + %2989 = tail call float @llvm.maxnum.f32(float %2982, float %2987), !dbg !146 + %2990 = bitcast float %2989 to i32, !dbg !147 + %2991 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2990, i32 1, i32 31), !dbg !147 + %2992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2988, i32 2, i32 31), !dbg !147 + %2993 = bitcast i32 %2992 to float, !dbg !147 + %2994 = tail call float @llvm.maxnum.f32(float %2984, float %2993), !dbg !146 + %2995 = bitcast float %2994 to i32, !dbg !147 + %2996 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2995, i32 1, i32 31), !dbg !147 + %2997 = insertelement <2 x i32> poison, i32 %2991, i64 0, !dbg !147 + %2998 = insertelement <2 x i32> %2997, i32 %2996, i64 1, !dbg !147 + %2999 = bitcast <2 x i32> %2998 to <2 x float>, !dbg !147 + %3000 = insertelement <2 x float> poison, float %2989, i64 0, !dbg !146 + %3001 = insertelement <2 x float> %3000, float %2994, i64 1, !dbg !146 + %3002 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %3001, <2 x float> %2999), !dbg !146 + %3003 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2374, <2 x float> %3002), !dbg !148 + %3004 = extractelement <2 x float> %3003, i64 0, !dbg !149 + %3005 = fcmp oeq float %3004, 0xFFF0000000000000, !dbg !150 + %3006 = extractelement <2 x float> %3003, i64 1, !dbg !149 + %3007 = fcmp oeq float %3006, 0xFFF0000000000000, !dbg !150 + %3008 = select i1 %3005, float 0.000000e+00, float %3004, !dbg !149 + %3009 = select i1 %3007, float 0.000000e+00, float %3006, !dbg !149 + %3010 = extractelement <2 x float> %2374, i64 0, !dbg !151 + %3011 = fsub float %3010, %3008, !dbg !151 + %3012 = extractelement <2 x float> %2374, i64 1, !dbg !151 + %3013 = fsub float %3012, %3009, !dbg !151 + %3014 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !152 + %.not.i = icmp eq i32 %3014, 0, !dbg !152 + br i1 %.not.i, label %3017, label %3015, !dbg !152 + +3015: ; preds = %2368 + %3016 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3011) #2, !dbg !152 + br label %__nv_exp2f.exit, !dbg !152 + +3017: ; preds = %2368 + %3018 = tail call float @llvm.nvvm.ex2.approx.f(float %3011) #2, !dbg !152 + br label %__nv_exp2f.exit, !dbg !152 + +__nv_exp2f.exit: ; preds = %3015, %3017 + %.0.i = phi float [ %3016, %3015 ], [ %3018, %3017 ], !dbg !152 + %3019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !152 + %.not.i155 = icmp eq i32 %3019, 0, !dbg !152 + br i1 %.not.i155, label %3022, label %3020, !dbg !152 + +3020: ; preds = %__nv_exp2f.exit + %3021 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3013) #2, !dbg !152 + br label %__nv_exp2f.exit157, !dbg !152 + +3022: ; preds = %__nv_exp2f.exit + %3023 = tail call float @llvm.nvvm.ex2.approx.f(float %3013) #2, !dbg !152 + br label %__nv_exp2f.exit157, !dbg !152 + +__nv_exp2f.exit157: ; preds = %3020, %3022 + %.0.i156 = phi float [ %3021, %3020 ], [ %3023, %3022 ], !dbg !152 + %3024 = fsub float %2892, %3008, !dbg !153 + %3025 = fsub float %2894, %3008, !dbg !153 + %3026 = fsub float %2896, %3009, !dbg !153 + %3027 = fsub float %2898, %3009, !dbg !153 + %3028 = fsub float %2900, %3008, !dbg !153 + %3029 = fsub float %2902, %3008, !dbg !153 + %3030 = fsub float %2904, %3009, !dbg !153 + %3031 = fsub float %2906, %3009, !dbg !153 + %3032 = fsub float %2908, %3008, !dbg !153 + %3033 = fsub float %2910, %3008, !dbg !153 + %3034 = fsub float %2912, %3009, !dbg !153 + %3035 = fsub float %2914, %3009, !dbg !153 + %3036 = fsub float %2916, %3008, !dbg !153 + %3037 = fsub float %2918, %3008, !dbg !153 + %3038 = fsub float %2920, %3009, !dbg !153 + %3039 = fsub float %2922, %3009, !dbg !153 + %3040 = fsub float %2924, %3008, !dbg !153 + %3041 = fsub float %2926, %3008, !dbg !153 + %3042 = fsub float %2928, %3009, !dbg !153 + %3043 = fsub float %2930, %3009, !dbg !153 + %3044 = fsub float %2932, %3008, !dbg !153 + %3045 = fsub float %2934, %3008, !dbg !153 + %3046 = fsub float %2936, %3009, !dbg !153 + %3047 = fsub float %2938, %3009, !dbg !153 + %3048 = fsub float %2940, %3008, !dbg !153 + %3049 = fsub float %2942, %3008, !dbg !153 + %3050 = fsub float %2944, %3009, !dbg !153 + %3051 = fsub float %2946, %3009, !dbg !153 + %3052 = fsub float %2948, %3008, !dbg !153 + %3053 = fsub float %2950, %3008, !dbg !153 + %3054 = fsub float %2952, %3009, !dbg !153 + %3055 = fsub float %2954, %3009, !dbg !153 + %3056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i158 = icmp eq i32 %3056, 0, !dbg !154 + br i1 %.not.i158, label %3059, label %3057, !dbg !154 + +3057: ; preds = %__nv_exp2f.exit157 + %3058 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3024) #2, !dbg !154 + br label %__nv_exp2f.exit160, !dbg !154 + +3059: ; preds = %__nv_exp2f.exit157 + %3060 = tail call float @llvm.nvvm.ex2.approx.f(float %3024) #2, !dbg !154 + br label %__nv_exp2f.exit160, !dbg !154 + +__nv_exp2f.exit160: ; preds = %3057, %3059 + %.0.i159 = phi float [ %3058, %3057 ], [ %3060, %3059 ], !dbg !154 + %3061 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i161 = icmp eq i32 %3061, 0, !dbg !154 + br i1 %.not.i161, label %3064, label %3062, !dbg !154 + +3062: ; preds = %__nv_exp2f.exit160 + %3063 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3025) #2, !dbg !154 + br label %__nv_exp2f.exit163, !dbg !154 + +3064: ; preds = %__nv_exp2f.exit160 + %3065 = tail call float @llvm.nvvm.ex2.approx.f(float %3025) #2, !dbg !154 + br label %__nv_exp2f.exit163, !dbg !154 + +__nv_exp2f.exit163: ; preds = %3062, %3064 + %.0.i162 = phi float [ %3063, %3062 ], [ %3065, %3064 ], !dbg !154 + %3066 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i164 = icmp eq i32 %3066, 0, !dbg !154 + br i1 %.not.i164, label %3069, label %3067, !dbg !154 + +3067: ; preds = %__nv_exp2f.exit163 + %3068 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3026) #2, !dbg !154 + br label %__nv_exp2f.exit166, !dbg !154 + +3069: ; preds = %__nv_exp2f.exit163 + %3070 = tail call float @llvm.nvvm.ex2.approx.f(float %3026) #2, !dbg !154 + br label %__nv_exp2f.exit166, !dbg !154 + +__nv_exp2f.exit166: ; preds = %3067, %3069 + %.0.i165 = phi float [ %3068, %3067 ], [ %3070, %3069 ], !dbg !154 + %3071 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i167 = icmp eq i32 %3071, 0, !dbg !154 + br i1 %.not.i167, label %3074, label %3072, !dbg !154 + +3072: ; preds = %__nv_exp2f.exit166 + %3073 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3027) #2, !dbg !154 + br label %__nv_exp2f.exit169, !dbg !154 + +3074: ; preds = %__nv_exp2f.exit166 + %3075 = tail call float @llvm.nvvm.ex2.approx.f(float %3027) #2, !dbg !154 + br label %__nv_exp2f.exit169, !dbg !154 + +__nv_exp2f.exit169: ; preds = %3072, %3074 + %.0.i168 = phi float [ %3073, %3072 ], [ %3075, %3074 ], !dbg !154 + %3076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i170 = icmp eq i32 %3076, 0, !dbg !154 + br i1 %.not.i170, label %3079, label %3077, !dbg !154 + +3077: ; preds = %__nv_exp2f.exit169 + %3078 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3028) #2, !dbg !154 + br label %__nv_exp2f.exit172, !dbg !154 + +3079: ; preds = %__nv_exp2f.exit169 + %3080 = tail call float @llvm.nvvm.ex2.approx.f(float %3028) #2, !dbg !154 + br label %__nv_exp2f.exit172, !dbg !154 + +__nv_exp2f.exit172: ; preds = %3077, %3079 + %.0.i171 = phi float [ %3078, %3077 ], [ %3080, %3079 ], !dbg !154 + %3081 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i173 = icmp eq i32 %3081, 0, !dbg !154 + br i1 %.not.i173, label %3084, label %3082, !dbg !154 + +3082: ; preds = %__nv_exp2f.exit172 + %3083 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3029) #2, !dbg !154 + br label %__nv_exp2f.exit175, !dbg !154 + +3084: ; preds = %__nv_exp2f.exit172 + %3085 = tail call float @llvm.nvvm.ex2.approx.f(float %3029) #2, !dbg !154 + br label %__nv_exp2f.exit175, !dbg !154 + +__nv_exp2f.exit175: ; preds = %3082, %3084 + %.0.i174 = phi float [ %3083, %3082 ], [ %3085, %3084 ], !dbg !154 + %3086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i176 = icmp eq i32 %3086, 0, !dbg !154 + br i1 %.not.i176, label %3089, label %3087, !dbg !154 + +3087: ; preds = %__nv_exp2f.exit175 + %3088 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3030) #2, !dbg !154 + br label %__nv_exp2f.exit178, !dbg !154 + +3089: ; preds = %__nv_exp2f.exit175 + %3090 = tail call float @llvm.nvvm.ex2.approx.f(float %3030) #2, !dbg !154 + br label %__nv_exp2f.exit178, !dbg !154 + +__nv_exp2f.exit178: ; preds = %3087, %3089 + %.0.i177 = phi float [ %3088, %3087 ], [ %3090, %3089 ], !dbg !154 + %3091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i179 = icmp eq i32 %3091, 0, !dbg !154 + br i1 %.not.i179, label %3094, label %3092, !dbg !154 + +3092: ; preds = %__nv_exp2f.exit178 + %3093 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3031) #2, !dbg !154 + br label %__nv_exp2f.exit181, !dbg !154 + +3094: ; preds = %__nv_exp2f.exit178 + %3095 = tail call float @llvm.nvvm.ex2.approx.f(float %3031) #2, !dbg !154 + br label %__nv_exp2f.exit181, !dbg !154 + +__nv_exp2f.exit181: ; preds = %3092, %3094 + %.0.i180 = phi float [ %3093, %3092 ], [ %3095, %3094 ], !dbg !154 + %3096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i182 = icmp eq i32 %3096, 0, !dbg !154 + br i1 %.not.i182, label %3099, label %3097, !dbg !154 + +3097: ; preds = %__nv_exp2f.exit181 + %3098 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3032) #2, !dbg !154 + br label %__nv_exp2f.exit184, !dbg !154 + +3099: ; preds = %__nv_exp2f.exit181 + %3100 = tail call float @llvm.nvvm.ex2.approx.f(float %3032) #2, !dbg !154 + br label %__nv_exp2f.exit184, !dbg !154 + +__nv_exp2f.exit184: ; preds = %3097, %3099 + %.0.i183 = phi float [ %3098, %3097 ], [ %3100, %3099 ], !dbg !154 + %3101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i185 = icmp eq i32 %3101, 0, !dbg !154 + br i1 %.not.i185, label %3104, label %3102, !dbg !154 + +3102: ; preds = %__nv_exp2f.exit184 + %3103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3033) #2, !dbg !154 + br label %__nv_exp2f.exit187, !dbg !154 + +3104: ; preds = %__nv_exp2f.exit184 + %3105 = tail call float @llvm.nvvm.ex2.approx.f(float %3033) #2, !dbg !154 + br label %__nv_exp2f.exit187, !dbg !154 + +__nv_exp2f.exit187: ; preds = %3102, %3104 + %.0.i186 = phi float [ %3103, %3102 ], [ %3105, %3104 ], !dbg !154 + %3106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i188 = icmp eq i32 %3106, 0, !dbg !154 + br i1 %.not.i188, label %3109, label %3107, !dbg !154 + +3107: ; preds = %__nv_exp2f.exit187 + %3108 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3034) #2, !dbg !154 + br label %__nv_exp2f.exit190, !dbg !154 + +3109: ; preds = %__nv_exp2f.exit187 + %3110 = tail call float @llvm.nvvm.ex2.approx.f(float %3034) #2, !dbg !154 + br label %__nv_exp2f.exit190, !dbg !154 + +__nv_exp2f.exit190: ; preds = %3107, %3109 + %.0.i189 = phi float [ %3108, %3107 ], [ %3110, %3109 ], !dbg !154 + %3111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i191 = icmp eq i32 %3111, 0, !dbg !154 + br i1 %.not.i191, label %3114, label %3112, !dbg !154 + +3112: ; preds = %__nv_exp2f.exit190 + %3113 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3035) #2, !dbg !154 + br label %__nv_exp2f.exit193, !dbg !154 + +3114: ; preds = %__nv_exp2f.exit190 + %3115 = tail call float @llvm.nvvm.ex2.approx.f(float %3035) #2, !dbg !154 + br label %__nv_exp2f.exit193, !dbg !154 + +__nv_exp2f.exit193: ; preds = %3112, %3114 + %.0.i192 = phi float [ %3113, %3112 ], [ %3115, %3114 ], !dbg !154 + %3116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i194 = icmp eq i32 %3116, 0, !dbg !154 + br i1 %.not.i194, label %3119, label %3117, !dbg !154 + +3117: ; preds = %__nv_exp2f.exit193 + %3118 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3036) #2, !dbg !154 + br label %__nv_exp2f.exit196, !dbg !154 + +3119: ; preds = %__nv_exp2f.exit193 + %3120 = tail call float @llvm.nvvm.ex2.approx.f(float %3036) #2, !dbg !154 + br label %__nv_exp2f.exit196, !dbg !154 + +__nv_exp2f.exit196: ; preds = %3117, %3119 + %.0.i195 = phi float [ %3118, %3117 ], [ %3120, %3119 ], !dbg !154 + %3121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i197 = icmp eq i32 %3121, 0, !dbg !154 + br i1 %.not.i197, label %3124, label %3122, !dbg !154 + +3122: ; preds = %__nv_exp2f.exit196 + %3123 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3037) #2, !dbg !154 + br label %__nv_exp2f.exit199, !dbg !154 + +3124: ; preds = %__nv_exp2f.exit196 + %3125 = tail call float @llvm.nvvm.ex2.approx.f(float %3037) #2, !dbg !154 + br label %__nv_exp2f.exit199, !dbg !154 + +__nv_exp2f.exit199: ; preds = %3122, %3124 + %.0.i198 = phi float [ %3123, %3122 ], [ %3125, %3124 ], !dbg !154 + %3126 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i200 = icmp eq i32 %3126, 0, !dbg !154 + br i1 %.not.i200, label %3129, label %3127, !dbg !154 + +3127: ; preds = %__nv_exp2f.exit199 + %3128 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3038) #2, !dbg !154 + br label %__nv_exp2f.exit202, !dbg !154 + +3129: ; preds = %__nv_exp2f.exit199 + %3130 = tail call float @llvm.nvvm.ex2.approx.f(float %3038) #2, !dbg !154 + br label %__nv_exp2f.exit202, !dbg !154 + +__nv_exp2f.exit202: ; preds = %3127, %3129 + %.0.i201 = phi float [ %3128, %3127 ], [ %3130, %3129 ], !dbg !154 + %3131 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i203 = icmp eq i32 %3131, 0, !dbg !154 + br i1 %.not.i203, label %3134, label %3132, !dbg !154 + +3132: ; preds = %__nv_exp2f.exit202 + %3133 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3039) #2, !dbg !154 + br label %__nv_exp2f.exit205, !dbg !154 + +3134: ; preds = %__nv_exp2f.exit202 + %3135 = tail call float @llvm.nvvm.ex2.approx.f(float %3039) #2, !dbg !154 + br label %__nv_exp2f.exit205, !dbg !154 + +__nv_exp2f.exit205: ; preds = %3132, %3134 + %.0.i204 = phi float [ %3133, %3132 ], [ %3135, %3134 ], !dbg !154 + %3136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i206 = icmp eq i32 %3136, 0, !dbg !154 + br i1 %.not.i206, label %3139, label %3137, !dbg !154 + +3137: ; preds = %__nv_exp2f.exit205 + %3138 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3040) #2, !dbg !154 + br label %__nv_exp2f.exit208, !dbg !154 + +3139: ; preds = %__nv_exp2f.exit205 + %3140 = tail call float @llvm.nvvm.ex2.approx.f(float %3040) #2, !dbg !154 + br label %__nv_exp2f.exit208, !dbg !154 + +__nv_exp2f.exit208: ; preds = %3137, %3139 + %.0.i207 = phi float [ %3138, %3137 ], [ %3140, %3139 ], !dbg !154 + %3141 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i209 = icmp eq i32 %3141, 0, !dbg !154 + br i1 %.not.i209, label %3144, label %3142, !dbg !154 + +3142: ; preds = %__nv_exp2f.exit208 + %3143 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3041) #2, !dbg !154 + br label %__nv_exp2f.exit211, !dbg !154 + +3144: ; preds = %__nv_exp2f.exit208 + %3145 = tail call float @llvm.nvvm.ex2.approx.f(float %3041) #2, !dbg !154 + br label %__nv_exp2f.exit211, !dbg !154 + +__nv_exp2f.exit211: ; preds = %3142, %3144 + %.0.i210 = phi float [ %3143, %3142 ], [ %3145, %3144 ], !dbg !154 + %3146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i212 = icmp eq i32 %3146, 0, !dbg !154 + br i1 %.not.i212, label %3149, label %3147, !dbg !154 + +3147: ; preds = %__nv_exp2f.exit211 + %3148 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3042) #2, !dbg !154 + br label %__nv_exp2f.exit214, !dbg !154 + +3149: ; preds = %__nv_exp2f.exit211 + %3150 = tail call float @llvm.nvvm.ex2.approx.f(float %3042) #2, !dbg !154 + br label %__nv_exp2f.exit214, !dbg !154 + +__nv_exp2f.exit214: ; preds = %3147, %3149 + %.0.i213 = phi float [ %3148, %3147 ], [ %3150, %3149 ], !dbg !154 + %3151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i215 = icmp eq i32 %3151, 0, !dbg !154 + br i1 %.not.i215, label %3154, label %3152, !dbg !154 + +3152: ; preds = %__nv_exp2f.exit214 + %3153 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3043) #2, !dbg !154 + br label %__nv_exp2f.exit217, !dbg !154 + +3154: ; preds = %__nv_exp2f.exit214 + %3155 = tail call float @llvm.nvvm.ex2.approx.f(float %3043) #2, !dbg !154 + br label %__nv_exp2f.exit217, !dbg !154 + +__nv_exp2f.exit217: ; preds = %3152, %3154 + %.0.i216 = phi float [ %3153, %3152 ], [ %3155, %3154 ], !dbg !154 + %3156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i218 = icmp eq i32 %3156, 0, !dbg !154 + br i1 %.not.i218, label %3159, label %3157, !dbg !154 + +3157: ; preds = %__nv_exp2f.exit217 + %3158 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3044) #2, !dbg !154 + br label %__nv_exp2f.exit220, !dbg !154 + +3159: ; preds = %__nv_exp2f.exit217 + %3160 = tail call float @llvm.nvvm.ex2.approx.f(float %3044) #2, !dbg !154 + br label %__nv_exp2f.exit220, !dbg !154 + +__nv_exp2f.exit220: ; preds = %3157, %3159 + %.0.i219 = phi float [ %3158, %3157 ], [ %3160, %3159 ], !dbg !154 + %3161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i221 = icmp eq i32 %3161, 0, !dbg !154 + br i1 %.not.i221, label %3164, label %3162, !dbg !154 + +3162: ; preds = %__nv_exp2f.exit220 + %3163 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3045) #2, !dbg !154 + br label %__nv_exp2f.exit223, !dbg !154 + +3164: ; preds = %__nv_exp2f.exit220 + %3165 = tail call float @llvm.nvvm.ex2.approx.f(float %3045) #2, !dbg !154 + br label %__nv_exp2f.exit223, !dbg !154 + +__nv_exp2f.exit223: ; preds = %3162, %3164 + %.0.i222 = phi float [ %3163, %3162 ], [ %3165, %3164 ], !dbg !154 + %3166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i224 = icmp eq i32 %3166, 0, !dbg !154 + br i1 %.not.i224, label %3169, label %3167, !dbg !154 + +3167: ; preds = %__nv_exp2f.exit223 + %3168 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3046) #2, !dbg !154 + br label %__nv_exp2f.exit226, !dbg !154 + +3169: ; preds = %__nv_exp2f.exit223 + %3170 = tail call float @llvm.nvvm.ex2.approx.f(float %3046) #2, !dbg !154 + br label %__nv_exp2f.exit226, !dbg !154 + +__nv_exp2f.exit226: ; preds = %3167, %3169 + %.0.i225 = phi float [ %3168, %3167 ], [ %3170, %3169 ], !dbg !154 + %3171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i227 = icmp eq i32 %3171, 0, !dbg !154 + br i1 %.not.i227, label %3174, label %3172, !dbg !154 + +3172: ; preds = %__nv_exp2f.exit226 + %3173 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3047) #2, !dbg !154 + br label %__nv_exp2f.exit229, !dbg !154 + +3174: ; preds = %__nv_exp2f.exit226 + %3175 = tail call float @llvm.nvvm.ex2.approx.f(float %3047) #2, !dbg !154 + br label %__nv_exp2f.exit229, !dbg !154 + +__nv_exp2f.exit229: ; preds = %3172, %3174 + %.0.i228 = phi float [ %3173, %3172 ], [ %3175, %3174 ], !dbg !154 + %3176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i230 = icmp eq i32 %3176, 0, !dbg !154 + br i1 %.not.i230, label %3179, label %3177, !dbg !154 + +3177: ; preds = %__nv_exp2f.exit229 + %3178 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3048) #2, !dbg !154 + br label %__nv_exp2f.exit232, !dbg !154 + +3179: ; preds = %__nv_exp2f.exit229 + %3180 = tail call float @llvm.nvvm.ex2.approx.f(float %3048) #2, !dbg !154 + br label %__nv_exp2f.exit232, !dbg !154 + +__nv_exp2f.exit232: ; preds = %3177, %3179 + %.0.i231 = phi float [ %3178, %3177 ], [ %3180, %3179 ], !dbg !154 + %3181 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i233 = icmp eq i32 %3181, 0, !dbg !154 + br i1 %.not.i233, label %3184, label %3182, !dbg !154 + +3182: ; preds = %__nv_exp2f.exit232 + %3183 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3049) #2, !dbg !154 + br label %__nv_exp2f.exit235, !dbg !154 + +3184: ; preds = %__nv_exp2f.exit232 + %3185 = tail call float @llvm.nvvm.ex2.approx.f(float %3049) #2, !dbg !154 + br label %__nv_exp2f.exit235, !dbg !154 + +__nv_exp2f.exit235: ; preds = %3182, %3184 + %.0.i234 = phi float [ %3183, %3182 ], [ %3185, %3184 ], !dbg !154 + %3186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i236 = icmp eq i32 %3186, 0, !dbg !154 + br i1 %.not.i236, label %3189, label %3187, !dbg !154 + +3187: ; preds = %__nv_exp2f.exit235 + %3188 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3050) #2, !dbg !154 + br label %__nv_exp2f.exit238, !dbg !154 + +3189: ; preds = %__nv_exp2f.exit235 + %3190 = tail call float @llvm.nvvm.ex2.approx.f(float %3050) #2, !dbg !154 + br label %__nv_exp2f.exit238, !dbg !154 + +__nv_exp2f.exit238: ; preds = %3187, %3189 + %.0.i237 = phi float [ %3188, %3187 ], [ %3190, %3189 ], !dbg !154 + %3191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i239 = icmp eq i32 %3191, 0, !dbg !154 + br i1 %.not.i239, label %3194, label %3192, !dbg !154 + +3192: ; preds = %__nv_exp2f.exit238 + %3193 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3051) #2, !dbg !154 + br label %__nv_exp2f.exit241, !dbg !154 + +3194: ; preds = %__nv_exp2f.exit238 + %3195 = tail call float @llvm.nvvm.ex2.approx.f(float %3051) #2, !dbg !154 + br label %__nv_exp2f.exit241, !dbg !154 + +__nv_exp2f.exit241: ; preds = %3192, %3194 + %.0.i240 = phi float [ %3193, %3192 ], [ %3195, %3194 ], !dbg !154 + %3196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i242 = icmp eq i32 %3196, 0, !dbg !154 + br i1 %.not.i242, label %3199, label %3197, !dbg !154 + +3197: ; preds = %__nv_exp2f.exit241 + %3198 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3052) #2, !dbg !154 + br label %__nv_exp2f.exit244, !dbg !154 + +3199: ; preds = %__nv_exp2f.exit241 + %3200 = tail call float @llvm.nvvm.ex2.approx.f(float %3052) #2, !dbg !154 + br label %__nv_exp2f.exit244, !dbg !154 + +__nv_exp2f.exit244: ; preds = %3197, %3199 + %.0.i243 = phi float [ %3198, %3197 ], [ %3200, %3199 ], !dbg !154 + %3201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i245 = icmp eq i32 %3201, 0, !dbg !154 + br i1 %.not.i245, label %3204, label %3202, !dbg !154 + +3202: ; preds = %__nv_exp2f.exit244 + %3203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3053) #2, !dbg !154 + br label %__nv_exp2f.exit247, !dbg !154 + +3204: ; preds = %__nv_exp2f.exit244 + %3205 = tail call float @llvm.nvvm.ex2.approx.f(float %3053) #2, !dbg !154 + br label %__nv_exp2f.exit247, !dbg !154 + +__nv_exp2f.exit247: ; preds = %3202, %3204 + %.0.i246 = phi float [ %3203, %3202 ], [ %3205, %3204 ], !dbg !154 + %3206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i248 = icmp eq i32 %3206, 0, !dbg !154 + br i1 %.not.i248, label %3209, label %3207, !dbg !154 + +3207: ; preds = %__nv_exp2f.exit247 + %3208 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3054) #2, !dbg !154 + br label %__nv_exp2f.exit250, !dbg !154 + +3209: ; preds = %__nv_exp2f.exit247 + %3210 = tail call float @llvm.nvvm.ex2.approx.f(float %3054) #2, !dbg !154 + br label %__nv_exp2f.exit250, !dbg !154 + +__nv_exp2f.exit250: ; preds = %3207, %3209 + %.0.i249 = phi float [ %3208, %3207 ], [ %3210, %3209 ], !dbg !154 + %3211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !154 + %.not.i251 = icmp eq i32 %3211, 0, !dbg !154 + br i1 %.not.i251, label %3214, label %3212, !dbg !154 + +3212: ; preds = %__nv_exp2f.exit250 + %3213 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3055) #2, !dbg !154 + br label %__nv_exp2f.exit253, !dbg !154 + +3214: ; preds = %__nv_exp2f.exit250 + %3215 = tail call float @llvm.nvvm.ex2.approx.f(float %3055) #2, !dbg !154 + br label %__nv_exp2f.exit253, !dbg !154 + +__nv_exp2f.exit253: ; preds = %3212, %3214 + %.0.i252 = phi float [ %3213, %3212 ], [ %3215, %3214 ], !dbg !154 + %3216 = fmul float %.pn546, %.0.i, !dbg !155 + %3217 = fmul float %.pn544, %.0.i156, !dbg !155 + %3218 = fadd float %.0.i159, %.0.i162, !dbg !156 + %3219 = fadd float %.0.i165, %.0.i168, !dbg !156 + %3220 = fadd float %3218, %.0.i171, !dbg !156 + %3221 = fadd float %3220, %.0.i174, !dbg !156 + %3222 = fadd float %3219, %.0.i177, !dbg !156 + %3223 = fadd float %3222, %.0.i180, !dbg !156 + %3224 = fadd float %3221, %.0.i183, !dbg !156 + %3225 = fadd float %3224, %.0.i186, !dbg !156 + %3226 = fadd float %3223, %.0.i189, !dbg !156 + %3227 = fadd float %3226, %.0.i192, !dbg !156 + %3228 = fadd float %3225, %.0.i195, !dbg !156 + %3229 = fadd float %3228, %.0.i198, !dbg !156 + %3230 = fadd float %3227, %.0.i201, !dbg !156 + %3231 = fadd float %3230, %.0.i204, !dbg !156 + %3232 = fadd float %3229, %.0.i207, !dbg !156 + %3233 = fadd float %3232, %.0.i210, !dbg !156 + %3234 = fadd float %3231, %.0.i213, !dbg !156 + %3235 = fadd float %3234, %.0.i216, !dbg !156 + %3236 = fadd float %3233, %.0.i219, !dbg !156 + %3237 = fadd float %3236, %.0.i222, !dbg !156 + %3238 = fadd float %3235, %.0.i225, !dbg !156 + %3239 = fadd float %3238, %.0.i228, !dbg !156 + %3240 = fadd float %3237, %.0.i231, !dbg !156 + %3241 = fadd float %3240, %.0.i234, !dbg !156 + %3242 = fadd float %3239, %.0.i237, !dbg !156 + %3243 = fadd float %3242, %.0.i240, !dbg !156 + %3244 = fadd float %3241, %.0.i243, !dbg !156 + %3245 = fadd float %3244, %.0.i246, !dbg !156 + %3246 = fadd float %3243, %.0.i249, !dbg !156 + %3247 = fadd float %3246, %.0.i252, !dbg !156 + %3248 = bitcast float %3245 to i32, !dbg !157 + %3249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3248, i32 2, i32 31), !dbg !157 + %3250 = bitcast i32 %3249 to float, !dbg !157 + %3251 = fadd float %3245, %3250, !dbg !156 + %3252 = bitcast float %3251 to i32, !dbg !157 + %3253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3252, i32 1, i32 31), !dbg !157 + %3254 = bitcast i32 %3253 to float, !dbg !157 + %3255 = fadd float %3251, %3254, !dbg !156 + %3256 = bitcast float %3247 to i32, !dbg !157 + %3257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3256, i32 2, i32 31), !dbg !157 + %3258 = bitcast i32 %3257 to float, !dbg !157 + %3259 = fadd float %3247, %3258, !dbg !156 + %3260 = bitcast float %3259 to i32, !dbg !157 + %3261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3260, i32 1, i32 31), !dbg !157 + %3262 = bitcast i32 %3261 to float, !dbg !157 + %3263 = fadd float %3259, %3262, !dbg !156 + %3264 = fadd float %3216, %3255, !dbg !158 + %3265 = fadd float %3217, %3263, !dbg !158 + %3266 = fmul float %2763, %.0.i, !dbg !159 + %3267 = fmul float %2764, %.0.i, !dbg !159 + %3268 = fmul float %2765, %.0.i156, !dbg !159 + %3269 = fmul float %2766, %.0.i156, !dbg !159 + %3270 = fmul float %2767, %.0.i, !dbg !159 + %3271 = fmul float %2768, %.0.i, !dbg !159 + %3272 = fmul float %2769, %.0.i156, !dbg !159 + %3273 = fmul float %2770, %.0.i156, !dbg !159 + %3274 = fmul float %2771, %.0.i, !dbg !159 + %3275 = fmul float %2772, %.0.i, !dbg !159 + %3276 = fmul float %2773, %.0.i156, !dbg !159 + %3277 = fmul float %2774, %.0.i156, !dbg !159 + %3278 = fmul float %2775, %.0.i, !dbg !159 + %3279 = fmul float %2776, %.0.i, !dbg !159 + %3280 = fmul float %2777, %.0.i156, !dbg !159 + %3281 = fmul float %2778, %.0.i156, !dbg !159 + %3282 = fmul float %2779, %.0.i, !dbg !159 + %3283 = fmul float %2780, %.0.i, !dbg !159 + %3284 = fmul float %2781, %.0.i156, !dbg !159 + %3285 = fmul float %2782, %.0.i156, !dbg !159 + %3286 = fmul float %2783, %.0.i, !dbg !159 + %3287 = fmul float %2784, %.0.i, !dbg !159 + %3288 = fmul float %2785, %.0.i156, !dbg !159 + %3289 = fmul float %2786, %.0.i156, !dbg !159 + %3290 = fmul float %2787, %.0.i, !dbg !159 + %3291 = fmul float %2788, %.0.i, !dbg !159 + %3292 = fmul float %2789, %.0.i156, !dbg !159 + %3293 = fmul float %2790, %.0.i156, !dbg !159 + %3294 = fmul float %2791, %.0.i, !dbg !159 + %3295 = fmul float %2792, %.0.i, !dbg !159 + %3296 = fmul float %2793, %.0.i156, !dbg !159 + %3297 = fmul float %2794, %.0.i156, !dbg !159 + %3298 = fmul float %2795, %.0.i, !dbg !159 + %3299 = fmul float %2796, %.0.i, !dbg !159 + %3300 = fmul float %2797, %.0.i156, !dbg !159 + %3301 = fmul float %2798, %.0.i156, !dbg !159 + %3302 = fmul float %2799, %.0.i, !dbg !159 + %3303 = fmul float %2800, %.0.i, !dbg !159 + %3304 = fmul float %2801, %.0.i156, !dbg !159 + %3305 = fmul float %2802, %.0.i156, !dbg !159 + %3306 = fmul float %2803, %.0.i, !dbg !159 + %3307 = fmul float %2804, %.0.i, !dbg !159 + %3308 = fmul float %2805, %.0.i156, !dbg !159 + %3309 = fmul float %2806, %.0.i156, !dbg !159 + %3310 = fmul float %2807, %.0.i, !dbg !159 + %3311 = fmul float %2808, %.0.i, !dbg !159 + %3312 = fmul float %2809, %.0.i156, !dbg !159 + %3313 = fmul float %2810, %.0.i156, !dbg !159 + %3314 = fmul float %2811, %.0.i, !dbg !159 + %3315 = fmul float %2812, %.0.i, !dbg !159 + %3316 = fmul float %2813, %.0.i156, !dbg !159 + %3317 = fmul float %2814, %.0.i156, !dbg !159 + %3318 = fmul float %2815, %.0.i, !dbg !159 + %3319 = fmul float %2816, %.0.i, !dbg !159 + %3320 = fmul float %2817, %.0.i156, !dbg !159 + %3321 = fmul float %2818, %.0.i156, !dbg !159 + %3322 = fmul float %2819, %.0.i, !dbg !159 + %3323 = fmul float %2820, %.0.i, !dbg !159 + %3324 = fmul float %2821, %.0.i156, !dbg !159 + %3325 = fmul float %2822, %.0.i156, !dbg !159 + %3326 = fmul float %2823, %.0.i, !dbg !159 + %3327 = fmul float %2824, %.0.i, !dbg !159 + %3328 = fmul float %2825, %.0.i156, !dbg !159 + %3329 = fmul float %2826, %.0.i156, !dbg !159 + %3330 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2381, !dbg !138 + %3331 = insertelement <2 x float> poison, float %.0.i159, i64 0, !dbg !160 + %3332 = insertelement <2 x float> %3331, float %.0.i162, i64 1, !dbg !160 + %3333 = fptrunc <2 x float> %3332 to <2 x bfloat>, !dbg !160 + %3334 = insertelement <2 x float> poison, float %.0.i165, i64 0, !dbg !160 + %3335 = insertelement <2 x float> %3334, float %.0.i168, i64 1, !dbg !160 + %3336 = fptrunc <2 x float> %3335 to <2 x bfloat>, !dbg !160 + %3337 = insertelement <2 x float> poison, float %.0.i171, i64 0, !dbg !160 + %3338 = insertelement <2 x float> %3337, float %.0.i174, i64 1, !dbg !160 + %3339 = fptrunc <2 x float> %3338 to <2 x bfloat>, !dbg !160 + %3340 = insertelement <2 x float> poison, float %.0.i177, i64 0, !dbg !160 + %3341 = insertelement <2 x float> %3340, float %.0.i180, i64 1, !dbg !160 + %3342 = fptrunc <2 x float> %3341 to <2 x bfloat>, !dbg !160 + %3343 = insertelement <2 x float> poison, float %.0.i183, i64 0, !dbg !160 + %3344 = insertelement <2 x float> %3343, float %.0.i186, i64 1, !dbg !160 + %3345 = fptrunc <2 x float> %3344 to <2 x bfloat>, !dbg !160 + %3346 = insertelement <2 x float> poison, float %.0.i189, i64 0, !dbg !160 + %3347 = insertelement <2 x float> %3346, float %.0.i192, i64 1, !dbg !160 + %3348 = fptrunc <2 x float> %3347 to <2 x bfloat>, !dbg !160 + %3349 = insertelement <2 x float> poison, float %.0.i195, i64 0, !dbg !160 + %3350 = insertelement <2 x float> %3349, float %.0.i198, i64 1, !dbg !160 + %3351 = fptrunc <2 x float> %3350 to <2 x bfloat>, !dbg !160 + %3352 = insertelement <2 x float> poison, float %.0.i201, i64 0, !dbg !160 + %3353 = insertelement <2 x float> %3352, float %.0.i204, i64 1, !dbg !160 + %3354 = fptrunc <2 x float> %3353 to <2 x bfloat>, !dbg !160 + %3355 = insertelement <2 x float> poison, float %.0.i207, i64 0, !dbg !160 + %3356 = insertelement <2 x float> %3355, float %.0.i210, i64 1, !dbg !160 + %3357 = fptrunc <2 x float> %3356 to <2 x bfloat>, !dbg !160 + %3358 = insertelement <2 x float> poison, float %.0.i213, i64 0, !dbg !160 + %3359 = insertelement <2 x float> %3358, float %.0.i216, i64 1, !dbg !160 + %3360 = fptrunc <2 x float> %3359 to <2 x bfloat>, !dbg !160 + %3361 = insertelement <2 x float> poison, float %.0.i219, i64 0, !dbg !160 + %3362 = insertelement <2 x float> %3361, float %.0.i222, i64 1, !dbg !160 + %3363 = fptrunc <2 x float> %3362 to <2 x bfloat>, !dbg !160 + %3364 = insertelement <2 x float> poison, float %.0.i225, i64 0, !dbg !160 + %3365 = insertelement <2 x float> %3364, float %.0.i228, i64 1, !dbg !160 + %3366 = fptrunc <2 x float> %3365 to <2 x bfloat>, !dbg !160 + %3367 = insertelement <2 x float> poison, float %.0.i231, i64 0, !dbg !160 + %3368 = insertelement <2 x float> %3367, float %.0.i234, i64 1, !dbg !160 + %3369 = fptrunc <2 x float> %3368 to <2 x bfloat>, !dbg !160 + %3370 = insertelement <2 x float> poison, float %.0.i237, i64 0, !dbg !160 + %3371 = insertelement <2 x float> %3370, float %.0.i240, i64 1, !dbg !160 + %3372 = fptrunc <2 x float> %3371 to <2 x bfloat>, !dbg !160 + %3373 = insertelement <2 x float> poison, float %.0.i243, i64 0, !dbg !160 + %3374 = insertelement <2 x float> %3373, float %.0.i246, i64 1, !dbg !160 + %3375 = fptrunc <2 x float> %3374 to <2 x bfloat>, !dbg !160 + %3376 = insertelement <2 x float> poison, float %.0.i249, i64 0, !dbg !160 + %3377 = insertelement <2 x float> %3376, float %.0.i252, i64 1, !dbg !160 + %3378 = fptrunc <2 x float> %3377 to <2 x bfloat>, !dbg !160 + %3379 = bitcast <2 x bfloat> %3333 to i32, !dbg !161 + %3380 = bitcast <2 x bfloat> %3336 to i32, !dbg !161 + %3381 = bitcast <2 x bfloat> %3339 to i32, !dbg !161 + %3382 = bitcast <2 x bfloat> %3342 to i32, !dbg !161 + %3383 = bitcast <2 x bfloat> %3345 to i32, !dbg !161 + %3384 = bitcast <2 x bfloat> %3348 to i32, !dbg !161 + %3385 = bitcast <2 x bfloat> %3351 to i32, !dbg !161 + %3386 = bitcast <2 x bfloat> %3354 to i32, !dbg !161 + %3387 = bitcast <2 x bfloat> %3357 to i32, !dbg !161 + %3388 = bitcast <2 x bfloat> %3360 to i32, !dbg !161 + %3389 = bitcast <2 x bfloat> %3363 to i32, !dbg !161 + %3390 = bitcast <2 x bfloat> %3366 to i32, !dbg !161 + %3391 = bitcast <2 x bfloat> %3369 to i32, !dbg !161 + %3392 = bitcast <2 x bfloat> %3372 to i32, !dbg !161 + %3393 = bitcast <2 x bfloat> %3375 to i32, !dbg !161 + %3394 = bitcast <2 x bfloat> %3378 to i32, !dbg !161 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !161 + %3395 = ptrtoint ptr addrspace(3) %3330 to i32, !dbg !161 + %3396 = lshr exact i32 %3395, 4, !dbg !161 + %3397 = and i32 %3396, 16383, !dbg !161 + %3398 = zext nneg i32 %3397 to i64, !dbg !161 + %3399 = or disjoint i64 %3398, 4611686293338849280, !dbg !161 + %3400 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3266, float %3267, float %3268, float %3269, float %3270, float %3271, float %3272, float %3273, float %3274, float %3275, float %3276, float %3277, float %3278, float %3279, float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, float %3301, float %3302, float %3303, float %3304, float %3305, float %3306, float %3307, float %3308, float %3309, float %3310, float %3311, float %3312, float %3313, float %3314, float %3315, float %3316, float %3317, float %3318, float %3319, float %3320, float %3321, float %3322, float %3323, float %3324, float %3325, float %3326, float %3327, float %3328, float %3329, i32 %3379, i32 %3380, i32 %3381, i32 %3382, i64 %3399, i1 true) #2, !dbg !161 + %3401 = add i32 %3395, 2048, !dbg !161 + %3402 = lshr exact i32 %3401, 4, !dbg !161 + %3403 = and i32 %3402, 16383, !dbg !161 + %3404 = zext nneg i32 %3403 to i64, !dbg !161 + %3405 = or disjoint i64 %3404, 4611686293338849280, !dbg !161 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 0, !dbg !161 + %3407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 1, !dbg !161 + %3408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 2, !dbg !161 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 3, !dbg !161 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 4, !dbg !161 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 5, !dbg !161 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 6, !dbg !161 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 7, !dbg !161 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 8, !dbg !161 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 9, !dbg !161 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 10, !dbg !161 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 11, !dbg !161 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 12, !dbg !161 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 13, !dbg !161 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 14, !dbg !161 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 15, !dbg !161 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 16, !dbg !161 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 17, !dbg !161 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 18, !dbg !161 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 19, !dbg !161 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 20, !dbg !161 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 21, !dbg !161 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 22, !dbg !161 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 23, !dbg !161 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 24, !dbg !161 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 25, !dbg !161 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 26, !dbg !161 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 27, !dbg !161 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 28, !dbg !161 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 29, !dbg !161 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 30, !dbg !161 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 31, !dbg !161 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 32, !dbg !161 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 33, !dbg !161 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 34, !dbg !161 + %3441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 35, !dbg !161 + %3442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 36, !dbg !161 + %3443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 37, !dbg !161 + %3444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 38, !dbg !161 + %3445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 39, !dbg !161 + %3446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 40, !dbg !161 + %3447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 41, !dbg !161 + %3448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 42, !dbg !161 + %3449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 43, !dbg !161 + %3450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 44, !dbg !161 + %3451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 45, !dbg !161 + %3452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 46, !dbg !161 + %3453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 47, !dbg !161 + %3454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 48, !dbg !161 + %3455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 49, !dbg !161 + %3456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 50, !dbg !161 + %3457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 51, !dbg !161 + %3458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 52, !dbg !161 + %3459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 53, !dbg !161 + %3460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 54, !dbg !161 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 55, !dbg !161 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 56, !dbg !161 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 57, !dbg !161 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 58, !dbg !161 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 59, !dbg !161 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 60, !dbg !161 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 61, !dbg !161 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 62, !dbg !161 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3400, 63, !dbg !161 + %3470 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3406, float %3407, float %3408, float %3409, float %3410, float %3411, float %3412, float %3413, float %3414, float %3415, float %3416, float %3417, float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, float %3441, float %3442, float %3443, float %3444, float %3445, float %3446, float %3447, float %3448, float %3449, float %3450, float %3451, float %3452, float %3453, float %3454, float %3455, float %3456, float %3457, float %3458, float %3459, float %3460, float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, i32 %3383, i32 %3384, i32 %3385, i32 %3386, i64 %3405, i1 true) #2, !dbg !161 + %3471 = add i32 %3395, 4096, !dbg !161 + %3472 = lshr exact i32 %3471, 4, !dbg !161 + %3473 = and i32 %3472, 16383, !dbg !161 + %3474 = zext nneg i32 %3473 to i64, !dbg !161 + %3475 = or disjoint i64 %3474, 4611686293338849280, !dbg !161 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 0, !dbg !161 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 1, !dbg !161 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 2, !dbg !161 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 3, !dbg !161 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 4, !dbg !161 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 5, !dbg !161 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 6, !dbg !161 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 7, !dbg !161 + %3484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 8, !dbg !161 + %3485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 9, !dbg !161 + %3486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 10, !dbg !161 + %3487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 11, !dbg !161 + %3488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 12, !dbg !161 + %3489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 13, !dbg !161 + %3490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 14, !dbg !161 + %3491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 15, !dbg !161 + %3492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 16, !dbg !161 + %3493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 17, !dbg !161 + %3494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 18, !dbg !161 + %3495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 19, !dbg !161 + %3496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 20, !dbg !161 + %3497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 21, !dbg !161 + %3498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 22, !dbg !161 + %3499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 23, !dbg !161 + %3500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 24, !dbg !161 + %3501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 25, !dbg !161 + %3502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 26, !dbg !161 + %3503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 27, !dbg !161 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 28, !dbg !161 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 29, !dbg !161 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 30, !dbg !161 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 31, !dbg !161 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 32, !dbg !161 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 33, !dbg !161 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 34, !dbg !161 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 35, !dbg !161 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 36, !dbg !161 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 37, !dbg !161 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 38, !dbg !161 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 39, !dbg !161 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 40, !dbg !161 + %3517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 41, !dbg !161 + %3518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 42, !dbg !161 + %3519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 43, !dbg !161 + %3520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 44, !dbg !161 + %3521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 45, !dbg !161 + %3522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 46, !dbg !161 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 47, !dbg !161 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 48, !dbg !161 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 49, !dbg !161 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 50, !dbg !161 + %3527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 51, !dbg !161 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 52, !dbg !161 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 53, !dbg !161 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 54, !dbg !161 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 55, !dbg !161 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 56, !dbg !161 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 57, !dbg !161 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 58, !dbg !161 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 59, !dbg !161 + %3536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 60, !dbg !161 + %3537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 61, !dbg !161 + %3538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 62, !dbg !161 + %3539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3470, 63, !dbg !161 + %3540 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, float %3484, float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, float %3493, float %3494, float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, float %3517, float %3518, float %3519, float %3520, float %3521, float %3522, float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, i32 %3387, i32 %3388, i32 %3389, i32 %3390, i64 %3475, i1 true) #2, !dbg !161 + %3541 = add i32 %3395, 6144, !dbg !161 + %3542 = lshr exact i32 %3541, 4, !dbg !161 + %3543 = and i32 %3542, 16383, !dbg !161 + %3544 = zext nneg i32 %3543 to i64, !dbg !161 + %3545 = or disjoint i64 %3544, 4611686293338849280, !dbg !161 + %3546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 0, !dbg !161 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 1, !dbg !161 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 2, !dbg !161 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 3, !dbg !161 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 4, !dbg !161 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 5, !dbg !161 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 6, !dbg !161 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 7, !dbg !161 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 8, !dbg !161 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 9, !dbg !161 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 10, !dbg !161 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 11, !dbg !161 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 12, !dbg !161 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 13, !dbg !161 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 14, !dbg !161 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 15, !dbg !161 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 16, !dbg !161 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 17, !dbg !161 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 18, !dbg !161 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 19, !dbg !161 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 20, !dbg !161 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 21, !dbg !161 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 22, !dbg !161 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 23, !dbg !161 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 24, !dbg !161 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 25, !dbg !161 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 26, !dbg !161 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 27, !dbg !161 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 28, !dbg !161 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 29, !dbg !161 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 30, !dbg !161 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 31, !dbg !161 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 32, !dbg !161 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 33, !dbg !161 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 34, !dbg !161 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 35, !dbg !161 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 36, !dbg !161 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 37, !dbg !161 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 38, !dbg !161 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 39, !dbg !161 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 40, !dbg !161 + %3587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 41, !dbg !161 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 42, !dbg !161 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 43, !dbg !161 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 44, !dbg !161 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 45, !dbg !161 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 46, !dbg !161 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 47, !dbg !161 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 48, !dbg !161 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 49, !dbg !161 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 50, !dbg !161 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 51, !dbg !161 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 52, !dbg !161 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 53, !dbg !161 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 54, !dbg !161 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 55, !dbg !161 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 56, !dbg !161 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 57, !dbg !161 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 58, !dbg !161 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 59, !dbg !161 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 60, !dbg !161 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 61, !dbg !161 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 62, !dbg !161 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3540, 63, !dbg !161 + %3610 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3546, float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, float %3560, float %3561, float %3562, float %3563, float %3564, float %3565, float %3566, float %3567, float %3568, float %3569, float %3570, float %3571, float %3572, float %3573, float %3574, float %3575, float %3576, float %3577, float %3578, float %3579, float %3580, float %3581, float %3582, float %3583, float %3584, float %3585, float %3586, float %3587, float %3588, float %3589, float %3590, float %3591, float %3592, float %3593, float %3594, float %3595, float %3596, float %3597, float %3598, float %3599, float %3600, float %3601, float %3602, float %3603, float %3604, float %3605, float %3606, float %3607, float %3608, float %3609, i32 %3391, i32 %3392, i32 %3393, i32 %3394, i64 %3545, i1 true) #2, !dbg !161 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 0, !dbg !161 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 1, !dbg !161 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 2, !dbg !161 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 3, !dbg !161 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 4, !dbg !161 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 5, !dbg !161 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 6, !dbg !161 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 7, !dbg !161 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 8, !dbg !161 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 9, !dbg !161 + %3621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 10, !dbg !161 + %3622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 11, !dbg !161 + %3623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 12, !dbg !161 + %3624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 13, !dbg !161 + %3625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 14, !dbg !161 + %3626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 15, !dbg !161 + %3627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 16, !dbg !161 + %3628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 17, !dbg !161 + %3629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 18, !dbg !161 + %3630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 19, !dbg !161 + %3631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 20, !dbg !161 + %3632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 21, !dbg !161 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 22, !dbg !161 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 23, !dbg !161 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 24, !dbg !161 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 25, !dbg !161 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 26, !dbg !161 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 27, !dbg !161 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 28, !dbg !161 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 29, !dbg !161 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 30, !dbg !161 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 31, !dbg !161 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 32, !dbg !161 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 33, !dbg !161 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 34, !dbg !161 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 35, !dbg !161 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 36, !dbg !161 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 37, !dbg !161 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 38, !dbg !161 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 39, !dbg !161 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 40, !dbg !161 + %3652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 41, !dbg !161 + %3653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 42, !dbg !161 + %3654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 43, !dbg !161 + %3655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 44, !dbg !161 + %3656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 45, !dbg !161 + %3657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 46, !dbg !161 + %3658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 47, !dbg !161 + %3659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 48, !dbg !161 + %3660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 49, !dbg !161 + %3661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 50, !dbg !161 + %3662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 51, !dbg !161 + %3663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 52, !dbg !161 + %3664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 53, !dbg !161 + %3665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 54, !dbg !161 + %3666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 55, !dbg !161 + %3667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 56, !dbg !161 + %3668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 57, !dbg !161 + %3669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 58, !dbg !161 + %3670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 59, !dbg !161 + %3671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 60, !dbg !161 + %3672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 61, !dbg !161 + %3673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 62, !dbg !161 + %3674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3610, 63, !dbg !161 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !161 + %3675 = insertelement <16 x i32> poison, i32 %2369, i64 0, !dbg !162 + %3676 = shufflevector <16 x i32> %3675, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !162 + %3677 = add <16 x i32> %3676, %2375, !dbg !162 + %3678 = add nuw nsw i32 %2373, 1, !dbg !131 + %3679 = lshr i32 %3678, 1, !dbg !163 + %3680 = zext nneg i32 %3679 to i64, !dbg !164 + %3681 = getelementptr i32, ptr addrspace(1) %2138, i64 %3680, !dbg !164 + %3682 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !165 + %3683 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3681, i64 %3682, i1 %2377) #2, !dbg !165 + %3684 = add nuw nsw i32 %3679, 1, !dbg !166 + %3685 = icmp slt i32 %3684, %2142, !dbg !167 + %3686 = getelementptr i8, ptr addrspace(1) %3681, i64 4, !dbg !168 + %3687 = and i1 %2377, %3685, !dbg !131 + %3688 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !169 + %3689 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3686, i64 %3688, i1 %3687) #2, !dbg !169 + %3690 = and i32 %2373, 1, !dbg !170 + %3691 = sub i32 %3689, %3683, !dbg !171 + %3692 = shl i32 %3691, 7, !dbg !172 + %3693 = add i32 %3692, -64, !dbg !173 + %3694 = xor i32 %3690, 1, !dbg !174 + %3695 = mul nuw nsw i32 %3693, %3694, !dbg !174 + %3696 = shl nuw nsw i32 %3690, 6, !dbg !175 + %3697 = add i32 %3695, %3696, !dbg !176 + %3698 = add i32 %3697, %2372, !dbg !177 + %3699 = add i32 %2371, 1, !dbg !131 + %3700 = icmp sgt i32 %3699, 2, !dbg !131 + %3701 = select i1 %3700, i32 0, i32 %3699, !dbg !131 + %3702 = add i32 %3698, %2140, !dbg !139 + %3703 = add i32 %3702, %43, !dbg !133 + %3704 = add i32 %3702, %44, !dbg !133 + %3705 = add i32 %3702, %45, !dbg !133 + %3706 = add i32 %3702, %46, !dbg !133 + %3707 = shl i32 %3703, 7, !dbg !134 + %3708 = shl i32 %3704, 7, !dbg !134 + %3709 = shl i32 %3705, 7, !dbg !134 + %3710 = shl i32 %3706, 7, !dbg !134 + %3711 = sext i32 %3707 to i64, !dbg !135 + %3712 = sext i32 %3708 to i64, !dbg !135 + %3713 = sext i32 %3709 to i64, !dbg !135 + %3714 = sext i32 %3710 to i64, !dbg !135 + %gep428 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3711, !dbg !136 + %gep430 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3712, !dbg !136 + %gep432 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3713, !dbg !136 + %gep434 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3714, !dbg !136 + %3715 = icmp slt i32 %3703, %12, !dbg !137 + %3716 = icmp slt i32 %3704, %12, !dbg !137 + %3717 = icmp slt i32 %3705, %12, !dbg !137 + %3718 = icmp slt i32 %3706, %12, !dbg !137 + %3719 = shl i32 %3701, 13, !dbg !138 + %3720 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %3719, !dbg !138 + %3721 = and i1 %2376, %3715, !dbg !131 + %3722 = and i1 %2376, %3716, !dbg !131 + %3723 = and i1 %2376, %3717, !dbg !131 + %3724 = and i1 %2376, %3718, !dbg !131 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !138 + %3725 = getelementptr inbounds nuw i8, ptr addrspace(3) %3720, i32 %263, !dbg !138 + %3726 = select i1 %3721, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3725, ptr addrspace(1) %gep428, i32 %3726) #2, !dbg !138 + %3727 = getelementptr inbounds nuw i8, ptr addrspace(3) %3720, i32 %266, !dbg !138 + %3728 = select i1 %3722, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3727, ptr addrspace(1) %gep430, i32 %3728) #2, !dbg !138 + %3729 = getelementptr inbounds nuw i8, ptr addrspace(3) %3720, i32 %269, !dbg !138 + %3730 = select i1 %3723, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3729, ptr addrspace(1) %gep432, i32 %3730) #2, !dbg !138 + %3731 = getelementptr inbounds nuw i8, ptr addrspace(3) %3720, i32 %272, !dbg !138 + %3732 = select i1 %3724, i32 16, i32 0, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3731, ptr addrspace(1) %gep434, i32 %3732) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + %gep436 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3711, !dbg !136 + %gep438 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3712, !dbg !136 + %gep440 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3713, !dbg !136 + %gep442 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3714, !dbg !136 + %3733 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %3719, !dbg !138 + %3734 = getelementptr inbounds nuw i8, ptr addrspace(3) %3733, i32 %263, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3734, ptr addrspace(1) %gep436, i32 %3726) #2, !dbg !138 + %3735 = getelementptr inbounds nuw i8, ptr addrspace(3) %3733, i32 %266, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3735, ptr addrspace(1) %gep438, i32 %3728) #2, !dbg !138 + %3736 = getelementptr inbounds nuw i8, ptr addrspace(3) %3733, i32 %269, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3736, ptr addrspace(1) %gep440, i32 %3730) #2, !dbg !138 + %3737 = getelementptr inbounds nuw i8, ptr addrspace(3) %3733, i32 %272, !dbg !138 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3737, ptr addrspace(1) %gep442, i32 %3732) #2, !dbg !138 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !138 + %exitcond472.not = icmp eq i32 %3678, %smax471, !dbg !131 + br i1 %exitcond472.not, label %._crit_edge461, label %2368, !dbg !131 + +._crit_edge461: ; preds = %__nv_exp2f.exit253, %._crit_edge + %3738 = phi float [ %2228, %._crit_edge ], [ %3611, %__nv_exp2f.exit253 ], !dbg !52 + %3739 = phi float [ %2229, %._crit_edge ], [ %3612, %__nv_exp2f.exit253 ], !dbg !52 + %3740 = phi float [ %2230, %._crit_edge ], [ %3613, %__nv_exp2f.exit253 ], !dbg !52 + %3741 = phi float [ %2231, %._crit_edge ], [ %3614, %__nv_exp2f.exit253 ], !dbg !52 + %3742 = phi float [ %2232, %._crit_edge ], [ %3615, %__nv_exp2f.exit253 ], !dbg !52 + %3743 = phi float [ %2233, %._crit_edge ], [ %3616, %__nv_exp2f.exit253 ], !dbg !52 + %3744 = phi float [ %2234, %._crit_edge ], [ %3617, %__nv_exp2f.exit253 ], !dbg !52 + %3745 = phi float [ %2235, %._crit_edge ], [ %3618, %__nv_exp2f.exit253 ], !dbg !52 + %3746 = phi float [ %2236, %._crit_edge ], [ %3619, %__nv_exp2f.exit253 ], !dbg !52 + %3747 = phi float [ %2237, %._crit_edge ], [ %3620, %__nv_exp2f.exit253 ], !dbg !52 + %3748 = phi float [ %2238, %._crit_edge ], [ %3621, %__nv_exp2f.exit253 ], !dbg !52 + %3749 = phi float [ %2239, %._crit_edge ], [ %3622, %__nv_exp2f.exit253 ], !dbg !52 + %3750 = phi float [ %2240, %._crit_edge ], [ %3623, %__nv_exp2f.exit253 ], !dbg !52 + %3751 = phi float [ %2241, %._crit_edge ], [ %3624, %__nv_exp2f.exit253 ], !dbg !52 + %3752 = phi float [ %2242, %._crit_edge ], [ %3625, %__nv_exp2f.exit253 ], !dbg !52 + %3753 = phi float [ %2243, %._crit_edge ], [ %3626, %__nv_exp2f.exit253 ], !dbg !52 + %3754 = phi float [ %2244, %._crit_edge ], [ %3627, %__nv_exp2f.exit253 ], !dbg !52 + %3755 = phi float [ %2245, %._crit_edge ], [ %3628, %__nv_exp2f.exit253 ], !dbg !52 + %3756 = phi float [ %2246, %._crit_edge ], [ %3629, %__nv_exp2f.exit253 ], !dbg !52 + %3757 = phi float [ %2247, %._crit_edge ], [ %3630, %__nv_exp2f.exit253 ], !dbg !52 + %3758 = phi float [ %2248, %._crit_edge ], [ %3631, %__nv_exp2f.exit253 ], !dbg !52 + %3759 = phi float [ %2249, %._crit_edge ], [ %3632, %__nv_exp2f.exit253 ], !dbg !52 + %3760 = phi float [ %2250, %._crit_edge ], [ %3633, %__nv_exp2f.exit253 ], !dbg !52 + %3761 = phi float [ %2251, %._crit_edge ], [ %3634, %__nv_exp2f.exit253 ], !dbg !52 + %3762 = phi float [ %2252, %._crit_edge ], [ %3635, %__nv_exp2f.exit253 ], !dbg !52 + %3763 = phi float [ %2253, %._crit_edge ], [ %3636, %__nv_exp2f.exit253 ], !dbg !52 + %3764 = phi float [ %2254, %._crit_edge ], [ %3637, %__nv_exp2f.exit253 ], !dbg !52 + %3765 = phi float [ %2255, %._crit_edge ], [ %3638, %__nv_exp2f.exit253 ], !dbg !52 + %3766 = phi float [ %2256, %._crit_edge ], [ %3639, %__nv_exp2f.exit253 ], !dbg !52 + %3767 = phi float [ %2257, %._crit_edge ], [ %3640, %__nv_exp2f.exit253 ], !dbg !52 + %3768 = phi float [ %2258, %._crit_edge ], [ %3641, %__nv_exp2f.exit253 ], !dbg !52 + %3769 = phi float [ %2259, %._crit_edge ], [ %3642, %__nv_exp2f.exit253 ], !dbg !52 + %3770 = phi float [ %2260, %._crit_edge ], [ %3643, %__nv_exp2f.exit253 ], !dbg !52 + %3771 = phi float [ %2261, %._crit_edge ], [ %3644, %__nv_exp2f.exit253 ], !dbg !52 + %3772 = phi float [ %2262, %._crit_edge ], [ %3645, %__nv_exp2f.exit253 ], !dbg !52 + %3773 = phi float [ %2263, %._crit_edge ], [ %3646, %__nv_exp2f.exit253 ], !dbg !52 + %3774 = phi float [ %2264, %._crit_edge ], [ %3647, %__nv_exp2f.exit253 ], !dbg !52 + %3775 = phi float [ %2265, %._crit_edge ], [ %3648, %__nv_exp2f.exit253 ], !dbg !52 + %3776 = phi float [ %2266, %._crit_edge ], [ %3649, %__nv_exp2f.exit253 ], !dbg !52 + %3777 = phi float [ %2267, %._crit_edge ], [ %3650, %__nv_exp2f.exit253 ], !dbg !52 + %3778 = phi float [ %2268, %._crit_edge ], [ %3651, %__nv_exp2f.exit253 ], !dbg !52 + %3779 = phi float [ %2269, %._crit_edge ], [ %3652, %__nv_exp2f.exit253 ], !dbg !52 + %3780 = phi float [ %2270, %._crit_edge ], [ %3653, %__nv_exp2f.exit253 ], !dbg !52 + %3781 = phi float [ %2271, %._crit_edge ], [ %3654, %__nv_exp2f.exit253 ], !dbg !52 + %3782 = phi float [ %2272, %._crit_edge ], [ %3655, %__nv_exp2f.exit253 ], !dbg !52 + %3783 = phi float [ %2273, %._crit_edge ], [ %3656, %__nv_exp2f.exit253 ], !dbg !52 + %3784 = phi float [ %2274, %._crit_edge ], [ %3657, %__nv_exp2f.exit253 ], !dbg !52 + %3785 = phi float [ %2275, %._crit_edge ], [ %3658, %__nv_exp2f.exit253 ], !dbg !52 + %3786 = phi float [ %2276, %._crit_edge ], [ %3659, %__nv_exp2f.exit253 ], !dbg !52 + %3787 = phi float [ %2277, %._crit_edge ], [ %3660, %__nv_exp2f.exit253 ], !dbg !52 + %3788 = phi float [ %2278, %._crit_edge ], [ %3661, %__nv_exp2f.exit253 ], !dbg !52 + %3789 = phi float [ %2279, %._crit_edge ], [ %3662, %__nv_exp2f.exit253 ], !dbg !52 + %3790 = phi float [ %2280, %._crit_edge ], [ %3663, %__nv_exp2f.exit253 ], !dbg !52 + %3791 = phi float [ %2281, %._crit_edge ], [ %3664, %__nv_exp2f.exit253 ], !dbg !52 + %3792 = phi float [ %2282, %._crit_edge ], [ %3665, %__nv_exp2f.exit253 ], !dbg !52 + %3793 = phi float [ %2283, %._crit_edge ], [ %3666, %__nv_exp2f.exit253 ], !dbg !52 + %3794 = phi float [ %2284, %._crit_edge ], [ %3667, %__nv_exp2f.exit253 ], !dbg !52 + %3795 = phi float [ %2285, %._crit_edge ], [ %3668, %__nv_exp2f.exit253 ], !dbg !52 + %3796 = phi float [ %2286, %._crit_edge ], [ %3669, %__nv_exp2f.exit253 ], !dbg !52 + %3797 = phi float [ %2287, %._crit_edge ], [ %3670, %__nv_exp2f.exit253 ], !dbg !52 + %3798 = phi float [ %2288, %._crit_edge ], [ %3671, %__nv_exp2f.exit253 ], !dbg !52 + %3799 = phi float [ %2289, %._crit_edge ], [ %3672, %__nv_exp2f.exit253 ], !dbg !52 + %3800 = phi float [ %2290, %._crit_edge ], [ %3673, %__nv_exp2f.exit253 ], !dbg !52 + %3801 = phi float [ %2291, %._crit_edge ], [ %3674, %__nv_exp2f.exit253 ], !dbg !52 + %3802 = phi float [ %2134, %._crit_edge ], [ %3264, %__nv_exp2f.exit253 ], !dbg !52 + %3803 = phi float [ %2135, %._crit_edge ], [ %3265, %__nv_exp2f.exit253 ], !dbg !52 + %3804 = phi <2 x float> [ %2136, %._crit_edge ], [ %3003, %__nv_exp2f.exit253 ], !dbg !52 + %3805 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, float %3747, float %3748, float %3749, float %3750, float %3751, float %3752, float %3753, float %3754, float %3755, float %3756, float %3757, float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778, float %3779, float %3780, float %3781, float %3782, float %3783, float %3784, float %3785, float %3786, float %3787, float %3788, float %3789, float %3790, float %3791, float %3792, float %3793, float %3794, float %3795, float %3796, float %3797, float %3798, float %3799, float %3800, float %3801) #2, !dbg !131 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !131 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !131 + %3806 = fcmp oeq float %3802, 0.000000e+00, !dbg !178 + %3807 = fcmp oeq float %3803, 0.000000e+00, !dbg !178 + %3808 = select i1 %3806, float 1.000000e+00, float %3802, !dbg !179 + %3809 = select i1 %3807, float 1.000000e+00, float %3803, !dbg !179 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 0, !dbg !180 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 1, !dbg !180 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 2, !dbg !180 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 3, !dbg !180 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 4, !dbg !180 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 5, !dbg !180 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 6, !dbg !180 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 7, !dbg !180 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 8, !dbg !180 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 9, !dbg !180 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 10, !dbg !180 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 11, !dbg !180 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 12, !dbg !180 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 13, !dbg !180 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 14, !dbg !180 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 15, !dbg !180 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 16, !dbg !180 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 17, !dbg !180 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 18, !dbg !180 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 19, !dbg !180 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 20, !dbg !180 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 21, !dbg !180 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 22, !dbg !180 + %3833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 23, !dbg !180 + %3834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 24, !dbg !180 + %3835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 25, !dbg !180 + %3836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 26, !dbg !180 + %3837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 27, !dbg !180 + %3838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 28, !dbg !180 + %3839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 29, !dbg !180 + %3840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 30, !dbg !180 + %3841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 31, !dbg !180 + %3842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 32, !dbg !180 + %3843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 33, !dbg !180 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 34, !dbg !180 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 35, !dbg !180 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 36, !dbg !180 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 37, !dbg !180 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 38, !dbg !180 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 39, !dbg !180 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 40, !dbg !180 + %3851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 41, !dbg !180 + %3852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 42, !dbg !180 + %3853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 43, !dbg !180 + %3854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 44, !dbg !180 + %3855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 45, !dbg !180 + %3856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 46, !dbg !180 + %3857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 47, !dbg !180 + %3858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 48, !dbg !180 + %3859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 49, !dbg !180 + %3860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 50, !dbg !180 + %3861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 51, !dbg !180 + %3862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 52, !dbg !180 + %3863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 53, !dbg !180 + %3864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 54, !dbg !180 + %3865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 55, !dbg !180 + %3866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 56, !dbg !180 + %3867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 57, !dbg !180 + %3868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 58, !dbg !180 + %3869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 59, !dbg !180 + %3870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 60, !dbg !180 + %3871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 61, !dbg !180 + %3872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 62, !dbg !180 + %3873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3805, 63, !dbg !180 + %3874 = tail call float @llvm.nvvm.div.full(float %3810, float %3808), !dbg !180 + %3875 = tail call float @llvm.nvvm.div.full(float %3811, float %3808), !dbg !180 + %3876 = tail call float @llvm.nvvm.div.full(float %3812, float %3809), !dbg !180 + %3877 = tail call float @llvm.nvvm.div.full(float %3813, float %3809), !dbg !180 + %3878 = tail call float @llvm.nvvm.div.full(float %3814, float %3808), !dbg !180 + %3879 = tail call float @llvm.nvvm.div.full(float %3815, float %3808), !dbg !180 + %3880 = tail call float @llvm.nvvm.div.full(float %3816, float %3809), !dbg !180 + %3881 = tail call float @llvm.nvvm.div.full(float %3817, float %3809), !dbg !180 + %3882 = tail call float @llvm.nvvm.div.full(float %3818, float %3808), !dbg !180 + %3883 = tail call float @llvm.nvvm.div.full(float %3819, float %3808), !dbg !180 + %3884 = tail call float @llvm.nvvm.div.full(float %3820, float %3809), !dbg !180 + %3885 = tail call float @llvm.nvvm.div.full(float %3821, float %3809), !dbg !180 + %3886 = tail call float @llvm.nvvm.div.full(float %3822, float %3808), !dbg !180 + %3887 = tail call float @llvm.nvvm.div.full(float %3823, float %3808), !dbg !180 + %3888 = tail call float @llvm.nvvm.div.full(float %3824, float %3809), !dbg !180 + %3889 = tail call float @llvm.nvvm.div.full(float %3825, float %3809), !dbg !180 + %3890 = tail call float @llvm.nvvm.div.full(float %3826, float %3808), !dbg !180 + %3891 = tail call float @llvm.nvvm.div.full(float %3827, float %3808), !dbg !180 + %3892 = tail call float @llvm.nvvm.div.full(float %3828, float %3809), !dbg !180 + %3893 = tail call float @llvm.nvvm.div.full(float %3829, float %3809), !dbg !180 + %3894 = tail call float @llvm.nvvm.div.full(float %3830, float %3808), !dbg !180 + %3895 = tail call float @llvm.nvvm.div.full(float %3831, float %3808), !dbg !180 + %3896 = tail call float @llvm.nvvm.div.full(float %3832, float %3809), !dbg !180 + %3897 = tail call float @llvm.nvvm.div.full(float %3833, float %3809), !dbg !180 + %3898 = tail call float @llvm.nvvm.div.full(float %3834, float %3808), !dbg !180 + %3899 = tail call float @llvm.nvvm.div.full(float %3835, float %3808), !dbg !180 + %3900 = tail call float @llvm.nvvm.div.full(float %3836, float %3809), !dbg !180 + %3901 = tail call float @llvm.nvvm.div.full(float %3837, float %3809), !dbg !180 + %3902 = tail call float @llvm.nvvm.div.full(float %3838, float %3808), !dbg !180 + %3903 = tail call float @llvm.nvvm.div.full(float %3839, float %3808), !dbg !180 + %3904 = tail call float @llvm.nvvm.div.full(float %3840, float %3809), !dbg !180 + %3905 = tail call float @llvm.nvvm.div.full(float %3841, float %3809), !dbg !180 + %3906 = tail call float @llvm.nvvm.div.full(float %3842, float %3808), !dbg !180 + %3907 = tail call float @llvm.nvvm.div.full(float %3843, float %3808), !dbg !180 + %3908 = tail call float @llvm.nvvm.div.full(float %3844, float %3809), !dbg !180 + %3909 = tail call float @llvm.nvvm.div.full(float %3845, float %3809), !dbg !180 + %3910 = tail call float @llvm.nvvm.div.full(float %3846, float %3808), !dbg !180 + %3911 = tail call float @llvm.nvvm.div.full(float %3847, float %3808), !dbg !180 + %3912 = tail call float @llvm.nvvm.div.full(float %3848, float %3809), !dbg !180 + %3913 = tail call float @llvm.nvvm.div.full(float %3849, float %3809), !dbg !180 + %3914 = tail call float @llvm.nvvm.div.full(float %3850, float %3808), !dbg !180 + %3915 = tail call float @llvm.nvvm.div.full(float %3851, float %3808), !dbg !180 + %3916 = tail call float @llvm.nvvm.div.full(float %3852, float %3809), !dbg !180 + %3917 = tail call float @llvm.nvvm.div.full(float %3853, float %3809), !dbg !180 + %3918 = tail call float @llvm.nvvm.div.full(float %3854, float %3808), !dbg !180 + %3919 = tail call float @llvm.nvvm.div.full(float %3855, float %3808), !dbg !180 + %3920 = tail call float @llvm.nvvm.div.full(float %3856, float %3809), !dbg !180 + %3921 = tail call float @llvm.nvvm.div.full(float %3857, float %3809), !dbg !180 + %3922 = tail call float @llvm.nvvm.div.full(float %3858, float %3808), !dbg !180 + %3923 = tail call float @llvm.nvvm.div.full(float %3859, float %3808), !dbg !180 + %3924 = tail call float @llvm.nvvm.div.full(float %3860, float %3809), !dbg !180 + %3925 = tail call float @llvm.nvvm.div.full(float %3861, float %3809), !dbg !180 + %3926 = tail call float @llvm.nvvm.div.full(float %3862, float %3808), !dbg !180 + %3927 = tail call float @llvm.nvvm.div.full(float %3863, float %3808), !dbg !180 + %3928 = tail call float @llvm.nvvm.div.full(float %3864, float %3809), !dbg !180 + %3929 = tail call float @llvm.nvvm.div.full(float %3865, float %3809), !dbg !180 + %3930 = tail call float @llvm.nvvm.div.full(float %3866, float %3808), !dbg !180 + %3931 = tail call float @llvm.nvvm.div.full(float %3867, float %3808), !dbg !180 + %3932 = tail call float @llvm.nvvm.div.full(float %3868, float %3809), !dbg !180 + %3933 = tail call float @llvm.nvvm.div.full(float %3869, float %3809), !dbg !180 + %3934 = tail call float @llvm.nvvm.div.full(float %3870, float %3808), !dbg !180 + %3935 = tail call float @llvm.nvvm.div.full(float %3871, float %3808), !dbg !180 + %3936 = tail call float @llvm.nvvm.div.full(float %3872, float %3809), !dbg !180 + %3937 = tail call float @llvm.nvvm.div.full(float %3873, float %3809), !dbg !180 + %3938 = or disjoint i32 %85, %26, !dbg !181 + %3939 = add i32 %60, %3938, !dbg !182 + %3940 = add i32 %61, %3938, !dbg !182 + %3941 = add i32 %62, %3938, !dbg !182 + %3942 = add i32 %63, %3938, !dbg !182 + %3943 = add i32 %64, %3938, !dbg !182 + %3944 = add i32 %65, %3938, !dbg !182 + %3945 = add i32 %66, %3938, !dbg !182 + %3946 = add i32 %67, %3938, !dbg !182 + %3947 = add i32 %3939, %25, !dbg !183 + %3948 = add i32 %3940, %25, !dbg !183 + %3949 = add i32 %3941, %25, !dbg !183 + %3950 = add i32 %3942, %25, !dbg !183 + %3951 = add i32 %3943, %25, !dbg !183 + %3952 = add i32 %3944, %25, !dbg !183 + %3953 = add i32 %3945, %25, !dbg !183 + %3954 = add i32 %3946, %25, !dbg !183 + %3955 = sext i32 %3947 to i64, !dbg !184 + %3956 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3955, !dbg !184 + %3957 = sext i32 %3948 to i64, !dbg !184 + %3958 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3957, !dbg !184 + %3959 = sext i32 %3949 to i64, !dbg !184 + %3960 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3959, !dbg !184 + %3961 = sext i32 %3950 to i64, !dbg !184 + %3962 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3961, !dbg !184 + %3963 = sext i32 %3951 to i64, !dbg !184 + %3964 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3963, !dbg !184 + %3965 = sext i32 %3952 to i64, !dbg !184 + %3966 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3965, !dbg !184 + %3967 = sext i32 %3953 to i64, !dbg !184 + %3968 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3967, !dbg !184 + %3969 = sext i32 %3954 to i64, !dbg !184 + %3970 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3969, !dbg !184 + %3971 = insertelement <2 x float> poison, float %3874, i64 0, !dbg !185 + %3972 = insertelement <2 x float> %3971, float %3875, i64 1, !dbg !185 + %3973 = fptrunc <2 x float> %3972 to <2 x bfloat>, !dbg !185 + %3974 = insertelement <2 x float> poison, float %3876, i64 0, !dbg !185 + %3975 = insertelement <2 x float> %3974, float %3877, i64 1, !dbg !185 + %3976 = fptrunc <2 x float> %3975 to <2 x bfloat>, !dbg !185 + %3977 = insertelement <2 x float> poison, float %3878, i64 0, !dbg !185 + %3978 = insertelement <2 x float> %3977, float %3879, i64 1, !dbg !185 + %3979 = fptrunc <2 x float> %3978 to <2 x bfloat>, !dbg !185 + %3980 = insertelement <2 x float> poison, float %3880, i64 0, !dbg !185 + %3981 = insertelement <2 x float> %3980, float %3881, i64 1, !dbg !185 + %3982 = fptrunc <2 x float> %3981 to <2 x bfloat>, !dbg !185 + %3983 = insertelement <2 x float> poison, float %3882, i64 0, !dbg !185 + %3984 = insertelement <2 x float> %3983, float %3883, i64 1, !dbg !185 + %3985 = fptrunc <2 x float> %3984 to <2 x bfloat>, !dbg !185 + %3986 = insertelement <2 x float> poison, float %3884, i64 0, !dbg !185 + %3987 = insertelement <2 x float> %3986, float %3885, i64 1, !dbg !185 + %3988 = fptrunc <2 x float> %3987 to <2 x bfloat>, !dbg !185 + %3989 = insertelement <2 x float> poison, float %3886, i64 0, !dbg !185 + %3990 = insertelement <2 x float> %3989, float %3887, i64 1, !dbg !185 + %3991 = fptrunc <2 x float> %3990 to <2 x bfloat>, !dbg !185 + %3992 = insertelement <2 x float> poison, float %3888, i64 0, !dbg !185 + %3993 = insertelement <2 x float> %3992, float %3889, i64 1, !dbg !185 + %3994 = fptrunc <2 x float> %3993 to <2 x bfloat>, !dbg !185 + %3995 = insertelement <2 x float> poison, float %3890, i64 0, !dbg !185 + %3996 = insertelement <2 x float> %3995, float %3891, i64 1, !dbg !185 + %3997 = fptrunc <2 x float> %3996 to <2 x bfloat>, !dbg !185 + %3998 = insertelement <2 x float> poison, float %3892, i64 0, !dbg !185 + %3999 = insertelement <2 x float> %3998, float %3893, i64 1, !dbg !185 + %4000 = fptrunc <2 x float> %3999 to <2 x bfloat>, !dbg !185 + %4001 = insertelement <2 x float> poison, float %3894, i64 0, !dbg !185 + %4002 = insertelement <2 x float> %4001, float %3895, i64 1, !dbg !185 + %4003 = fptrunc <2 x float> %4002 to <2 x bfloat>, !dbg !185 + %4004 = insertelement <2 x float> poison, float %3896, i64 0, !dbg !185 + %4005 = insertelement <2 x float> %4004, float %3897, i64 1, !dbg !185 + %4006 = fptrunc <2 x float> %4005 to <2 x bfloat>, !dbg !185 + %4007 = insertelement <2 x float> poison, float %3898, i64 0, !dbg !185 + %4008 = insertelement <2 x float> %4007, float %3899, i64 1, !dbg !185 + %4009 = fptrunc <2 x float> %4008 to <2 x bfloat>, !dbg !185 + %4010 = insertelement <2 x float> poison, float %3900, i64 0, !dbg !185 + %4011 = insertelement <2 x float> %4010, float %3901, i64 1, !dbg !185 + %4012 = fptrunc <2 x float> %4011 to <2 x bfloat>, !dbg !185 + %4013 = insertelement <2 x float> poison, float %3902, i64 0, !dbg !185 + %4014 = insertelement <2 x float> %4013, float %3903, i64 1, !dbg !185 + %4015 = fptrunc <2 x float> %4014 to <2 x bfloat>, !dbg !185 + %4016 = insertelement <2 x float> poison, float %3904, i64 0, !dbg !185 + %4017 = insertelement <2 x float> %4016, float %3905, i64 1, !dbg !185 + %4018 = fptrunc <2 x float> %4017 to <2 x bfloat>, !dbg !185 + %4019 = insertelement <2 x float> poison, float %3906, i64 0, !dbg !185 + %4020 = insertelement <2 x float> %4019, float %3907, i64 1, !dbg !185 + %4021 = fptrunc <2 x float> %4020 to <2 x bfloat>, !dbg !185 + %4022 = insertelement <2 x float> poison, float %3908, i64 0, !dbg !185 + %4023 = insertelement <2 x float> %4022, float %3909, i64 1, !dbg !185 + %4024 = fptrunc <2 x float> %4023 to <2 x bfloat>, !dbg !185 + %4025 = insertelement <2 x float> poison, float %3910, i64 0, !dbg !185 + %4026 = insertelement <2 x float> %4025, float %3911, i64 1, !dbg !185 + %4027 = fptrunc <2 x float> %4026 to <2 x bfloat>, !dbg !185 + %4028 = insertelement <2 x float> poison, float %3912, i64 0, !dbg !185 + %4029 = insertelement <2 x float> %4028, float %3913, i64 1, !dbg !185 + %4030 = fptrunc <2 x float> %4029 to <2 x bfloat>, !dbg !185 + %4031 = insertelement <2 x float> poison, float %3914, i64 0, !dbg !185 + %4032 = insertelement <2 x float> %4031, float %3915, i64 1, !dbg !185 + %4033 = fptrunc <2 x float> %4032 to <2 x bfloat>, !dbg !185 + %4034 = insertelement <2 x float> poison, float %3916, i64 0, !dbg !185 + %4035 = insertelement <2 x float> %4034, float %3917, i64 1, !dbg !185 + %4036 = fptrunc <2 x float> %4035 to <2 x bfloat>, !dbg !185 + %4037 = insertelement <2 x float> poison, float %3918, i64 0, !dbg !185 + %4038 = insertelement <2 x float> %4037, float %3919, i64 1, !dbg !185 + %4039 = fptrunc <2 x float> %4038 to <2 x bfloat>, !dbg !185 + %4040 = insertelement <2 x float> poison, float %3920, i64 0, !dbg !185 + %4041 = insertelement <2 x float> %4040, float %3921, i64 1, !dbg !185 + %4042 = fptrunc <2 x float> %4041 to <2 x bfloat>, !dbg !185 + %4043 = insertelement <2 x float> poison, float %3922, i64 0, !dbg !185 + %4044 = insertelement <2 x float> %4043, float %3923, i64 1, !dbg !185 + %4045 = fptrunc <2 x float> %4044 to <2 x bfloat>, !dbg !185 + %4046 = insertelement <2 x float> poison, float %3924, i64 0, !dbg !185 + %4047 = insertelement <2 x float> %4046, float %3925, i64 1, !dbg !185 + %4048 = fptrunc <2 x float> %4047 to <2 x bfloat>, !dbg !185 + %4049 = insertelement <2 x float> poison, float %3926, i64 0, !dbg !185 + %4050 = insertelement <2 x float> %4049, float %3927, i64 1, !dbg !185 + %4051 = fptrunc <2 x float> %4050 to <2 x bfloat>, !dbg !185 + %4052 = insertelement <2 x float> poison, float %3928, i64 0, !dbg !185 + %4053 = insertelement <2 x float> %4052, float %3929, i64 1, !dbg !185 + %4054 = fptrunc <2 x float> %4053 to <2 x bfloat>, !dbg !185 + %4055 = insertelement <2 x float> poison, float %3930, i64 0, !dbg !185 + %4056 = insertelement <2 x float> %4055, float %3931, i64 1, !dbg !185 + %4057 = fptrunc <2 x float> %4056 to <2 x bfloat>, !dbg !185 + %4058 = insertelement <2 x float> poison, float %3932, i64 0, !dbg !185 + %4059 = insertelement <2 x float> %4058, float %3933, i64 1, !dbg !185 + %4060 = fptrunc <2 x float> %4059 to <2 x bfloat>, !dbg !185 + %4061 = insertelement <2 x float> poison, float %3934, i64 0, !dbg !185 + %4062 = insertelement <2 x float> %4061, float %3935, i64 1, !dbg !185 + %4063 = fptrunc <2 x float> %4062 to <2 x bfloat>, !dbg !185 + %4064 = insertelement <2 x float> poison, float %3936, i64 0, !dbg !185 + %4065 = insertelement <2 x float> %4064, float %3937, i64 1, !dbg !185 + %4066 = fptrunc <2 x float> %4065 to <2 x bfloat>, !dbg !185 + %4067 = shl nuw nsw i32 %204, 13, !dbg !185 + %4068 = shl nuw nsw i32 %40, 5, !dbg !185 + %4069 = and i32 %4068, 7264, !dbg !185 + %4070 = and i32 %40, 24, !dbg !185 + %4071 = shl nuw nsw i32 %4070, 4, !dbg !185 + %4072 = shl nuw nsw i32 %40, 2, !dbg !185 + %4073 = and i32 %4072, 16, !dbg !185 + %4074 = or disjoint i32 %4067, %4073, !dbg !185 + %4075 = or disjoint i32 %4069, %4071, !dbg !185 + %4076 = or disjoint i32 %4074, %4075, !dbg !185 + %4077 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4076, !dbg !185 + %4078 = bitcast <2 x bfloat> %3973 to i32, !dbg !185 + %4079 = bitcast <2 x bfloat> %3979 to i32, !dbg !185 + %4080 = bitcast <2 x bfloat> %3985 to i32, !dbg !185 + %4081 = bitcast <2 x bfloat> %3991 to i32, !dbg !185 + %4082 = insertelement <4 x i32> poison, i32 %4078, i64 0, !dbg !185 + %4083 = insertelement <4 x i32> %4082, i32 %4079, i64 1, !dbg !185 + %4084 = insertelement <4 x i32> %4083, i32 %4080, i64 2, !dbg !185 + %4085 = insertelement <4 x i32> %4084, i32 %4081, i64 3, !dbg !185 + store <4 x i32> %4085, ptr addrspace(3) %4077, align 16, !dbg !185 + %4086 = getelementptr inbounds nuw i8, ptr addrspace(3) %4077, i32 512, !dbg !185 + %4087 = bitcast <2 x bfloat> %3976 to i32, !dbg !185 + %4088 = bitcast <2 x bfloat> %3982 to i32, !dbg !185 + %4089 = bitcast <2 x bfloat> %3988 to i32, !dbg !185 + %4090 = bitcast <2 x bfloat> %3994 to i32, !dbg !185 + %4091 = insertelement <4 x i32> poison, i32 %4087, i64 0, !dbg !185 + %4092 = insertelement <4 x i32> %4091, i32 %4088, i64 1, !dbg !185 + %4093 = insertelement <4 x i32> %4092, i32 %4089, i64 2, !dbg !185 + %4094 = insertelement <4 x i32> %4093, i32 %4090, i64 3, !dbg !185 + store <4 x i32> %4094, ptr addrspace(3) %4086, align 16, !dbg !185 + %4095 = xor i32 %4076, 32, !dbg !185 + %4096 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4095, !dbg !185 + %4097 = bitcast <2 x bfloat> %3997 to i32, !dbg !185 + %4098 = bitcast <2 x bfloat> %4003 to i32, !dbg !185 + %4099 = bitcast <2 x bfloat> %4009 to i32, !dbg !185 + %4100 = bitcast <2 x bfloat> %4015 to i32, !dbg !185 + %4101 = insertelement <4 x i32> poison, i32 %4097, i64 0, !dbg !185 + %4102 = insertelement <4 x i32> %4101, i32 %4098, i64 1, !dbg !185 + %4103 = insertelement <4 x i32> %4102, i32 %4099, i64 2, !dbg !185 + %4104 = insertelement <4 x i32> %4103, i32 %4100, i64 3, !dbg !185 + store <4 x i32> %4104, ptr addrspace(3) %4096, align 16, !dbg !185 + %4105 = getelementptr inbounds nuw i8, ptr addrspace(3) %4096, i32 512, !dbg !185 + %4106 = bitcast <2 x bfloat> %4000 to i32, !dbg !185 + %4107 = bitcast <2 x bfloat> %4006 to i32, !dbg !185 + %4108 = bitcast <2 x bfloat> %4012 to i32, !dbg !185 + %4109 = bitcast <2 x bfloat> %4018 to i32, !dbg !185 + %4110 = insertelement <4 x i32> poison, i32 %4106, i64 0, !dbg !185 + %4111 = insertelement <4 x i32> %4110, i32 %4107, i64 1, !dbg !185 + %4112 = insertelement <4 x i32> %4111, i32 %4108, i64 2, !dbg !185 + %4113 = insertelement <4 x i32> %4112, i32 %4109, i64 3, !dbg !185 + store <4 x i32> %4113, ptr addrspace(3) %4105, align 16, !dbg !185 + %4114 = xor i32 %4076, 64, !dbg !185 + %4115 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4114, !dbg !185 + %4116 = bitcast <2 x bfloat> %4021 to i32, !dbg !185 + %4117 = bitcast <2 x bfloat> %4027 to i32, !dbg !185 + %4118 = bitcast <2 x bfloat> %4033 to i32, !dbg !185 + %4119 = bitcast <2 x bfloat> %4039 to i32, !dbg !185 + %4120 = insertelement <4 x i32> poison, i32 %4116, i64 0, !dbg !185 + %4121 = insertelement <4 x i32> %4120, i32 %4117, i64 1, !dbg !185 + %4122 = insertelement <4 x i32> %4121, i32 %4118, i64 2, !dbg !185 + %4123 = insertelement <4 x i32> %4122, i32 %4119, i64 3, !dbg !185 + store <4 x i32> %4123, ptr addrspace(3) %4115, align 16, !dbg !185 + %4124 = getelementptr inbounds nuw i8, ptr addrspace(3) %4115, i32 512, !dbg !185 + %4125 = bitcast <2 x bfloat> %4024 to i32, !dbg !185 + %4126 = bitcast <2 x bfloat> %4030 to i32, !dbg !185 + %4127 = bitcast <2 x bfloat> %4036 to i32, !dbg !185 + %4128 = bitcast <2 x bfloat> %4042 to i32, !dbg !185 + %4129 = insertelement <4 x i32> poison, i32 %4125, i64 0, !dbg !185 + %4130 = insertelement <4 x i32> %4129, i32 %4126, i64 1, !dbg !185 + %4131 = insertelement <4 x i32> %4130, i32 %4127, i64 2, !dbg !185 + %4132 = insertelement <4 x i32> %4131, i32 %4128, i64 3, !dbg !185 + store <4 x i32> %4132, ptr addrspace(3) %4124, align 16, !dbg !185 + %4133 = xor i32 %4076, 96, !dbg !185 + %4134 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4133, !dbg !185 + %4135 = bitcast <2 x bfloat> %4045 to i32, !dbg !185 + %4136 = bitcast <2 x bfloat> %4051 to i32, !dbg !185 + %4137 = bitcast <2 x bfloat> %4057 to i32, !dbg !185 + %4138 = bitcast <2 x bfloat> %4063 to i32, !dbg !185 + %4139 = insertelement <4 x i32> poison, i32 %4135, i64 0, !dbg !185 + %4140 = insertelement <4 x i32> %4139, i32 %4136, i64 1, !dbg !185 + %4141 = insertelement <4 x i32> %4140, i32 %4137, i64 2, !dbg !185 + %4142 = insertelement <4 x i32> %4141, i32 %4138, i64 3, !dbg !185 + store <4 x i32> %4142, ptr addrspace(3) %4134, align 16, !dbg !185 + %4143 = getelementptr inbounds nuw i8, ptr addrspace(3) %4134, i32 512, !dbg !185 + %4144 = bitcast <2 x bfloat> %4048 to i32, !dbg !185 + %4145 = bitcast <2 x bfloat> %4054 to i32, !dbg !185 + %4146 = bitcast <2 x bfloat> %4060 to i32, !dbg !185 + %4147 = bitcast <2 x bfloat> %4066 to i32, !dbg !185 + %4148 = insertelement <4 x i32> poison, i32 %4144, i64 0, !dbg !185 + %4149 = insertelement <4 x i32> %4148, i32 %4145, i64 1, !dbg !185 + %4150 = insertelement <4 x i32> %4149, i32 %4146, i64 2, !dbg !185 + %4151 = insertelement <4 x i32> %4150, i32 %4147, i64 3, !dbg !185 + store <4 x i32> %4151, ptr addrspace(3) %4143, align 16, !dbg !185 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !185 + %4152 = shl nuw nsw i32 %4070, 10, !dbg !185 + %4153 = shl nuw nsw i32 %204, 5, !dbg !185 + %4154 = and i32 %40, 252, !dbg !185 + %4155 = shl nuw nsw i32 %4154, 2, !dbg !185 + %4156 = or disjoint i32 %4152, %4153, !dbg !185 + %4157 = xor i32 %4156, %4155, !dbg !185 + %4158 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4157, !dbg !185 + %4159 = ptrtoint ptr addrspace(3) %4158 to i32, !dbg !185 + %4160 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4159) #2, !dbg !185 + %4161 = extractvalue { i32, i32, i32, i32 } %4160, 0, !dbg !185 + %4162 = extractvalue { i32, i32, i32, i32 } %4160, 1, !dbg !185 + %4163 = extractvalue { i32, i32, i32, i32 } %4160, 2, !dbg !185 + %4164 = extractvalue { i32, i32, i32, i32 } %4160, 3, !dbg !185 + %4165 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 1024, !dbg !185 + %4166 = ptrtoint ptr addrspace(3) %4165 to i32, !dbg !185 + %4167 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4166) #2, !dbg !185 + %4168 = extractvalue { i32, i32, i32, i32 } %4167, 0, !dbg !185 + %4169 = extractvalue { i32, i32, i32, i32 } %4167, 1, !dbg !185 + %4170 = extractvalue { i32, i32, i32, i32 } %4167, 2, !dbg !185 + %4171 = extractvalue { i32, i32, i32, i32 } %4167, 3, !dbg !185 + %4172 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 2048, !dbg !185 + %4173 = ptrtoint ptr addrspace(3) %4172 to i32, !dbg !185 + %4174 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4173) #2, !dbg !185 + %4175 = extractvalue { i32, i32, i32, i32 } %4174, 0, !dbg !185 + %4176 = extractvalue { i32, i32, i32, i32 } %4174, 1, !dbg !185 + %4177 = extractvalue { i32, i32, i32, i32 } %4174, 2, !dbg !185 + %4178 = extractvalue { i32, i32, i32, i32 } %4174, 3, !dbg !185 + %4179 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 3072, !dbg !185 + %4180 = ptrtoint ptr addrspace(3) %4179 to i32, !dbg !185 + %4181 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4180) #2, !dbg !185 + %4182 = extractvalue { i32, i32, i32, i32 } %4181, 0, !dbg !185 + %4183 = extractvalue { i32, i32, i32, i32 } %4181, 1, !dbg !185 + %4184 = extractvalue { i32, i32, i32, i32 } %4181, 2, !dbg !185 + %4185 = extractvalue { i32, i32, i32, i32 } %4181, 3, !dbg !185 + %4186 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 4096, !dbg !185 + %4187 = ptrtoint ptr addrspace(3) %4186 to i32, !dbg !185 + %4188 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4187) #2, !dbg !185 + %4189 = extractvalue { i32, i32, i32, i32 } %4188, 0, !dbg !185 + %4190 = extractvalue { i32, i32, i32, i32 } %4188, 1, !dbg !185 + %4191 = extractvalue { i32, i32, i32, i32 } %4188, 2, !dbg !185 + %4192 = extractvalue { i32, i32, i32, i32 } %4188, 3, !dbg !185 + %4193 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 5120, !dbg !185 + %4194 = ptrtoint ptr addrspace(3) %4193 to i32, !dbg !185 + %4195 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4194) #2, !dbg !185 + %4196 = extractvalue { i32, i32, i32, i32 } %4195, 0, !dbg !185 + %4197 = extractvalue { i32, i32, i32, i32 } %4195, 1, !dbg !185 + %4198 = extractvalue { i32, i32, i32, i32 } %4195, 2, !dbg !185 + %4199 = extractvalue { i32, i32, i32, i32 } %4195, 3, !dbg !185 + %4200 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 6144, !dbg !185 + %4201 = ptrtoint ptr addrspace(3) %4200 to i32, !dbg !185 + %4202 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4201) #2, !dbg !185 + %4203 = extractvalue { i32, i32, i32, i32 } %4202, 0, !dbg !185 + %4204 = extractvalue { i32, i32, i32, i32 } %4202, 1, !dbg !185 + %4205 = extractvalue { i32, i32, i32, i32 } %4202, 2, !dbg !185 + %4206 = extractvalue { i32, i32, i32, i32 } %4202, 3, !dbg !185 + %4207 = getelementptr inbounds nuw i8, ptr addrspace(3) %4158, i32 7168, !dbg !185 + %4208 = ptrtoint ptr addrspace(3) %4207 to i32, !dbg !185 + %4209 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4208) #2, !dbg !185 + %4210 = extractvalue { i32, i32, i32, i32 } %4209, 0, !dbg !185 + %4211 = extractvalue { i32, i32, i32, i32 } %4209, 1, !dbg !185 + %4212 = extractvalue { i32, i32, i32, i32 } %4209, 2, !dbg !185 + %4213 = extractvalue { i32, i32, i32, i32 } %4209, 3, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4161, i32 %4162, i32 %4163, i32 %4164, ptr addrspace(1) %3956, i1 %95) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4168, i32 %4169, i32 %4170, i32 %4171, ptr addrspace(1) %3958, i1 %96) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4175, i32 %4176, i32 %4177, i32 %4178, ptr addrspace(1) %3960, i1 %97) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4182, i32 %4183, i32 %4184, i32 %4185, ptr addrspace(1) %3962, i1 %98) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4189, i32 %4190, i32 %4191, i32 %4192, ptr addrspace(1) %3964, i1 %99) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4196, i32 %4197, i32 %4198, i32 %4199, ptr addrspace(1) %3966, i1 %100) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4203, i32 %4204, i32 %4205, i32 %4206, ptr addrspace(1) %3968, i1 %101) #2, !dbg !185 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4210, i32 %4211, i32 %4212, i32 %4213, ptr addrspace(1) %3970, i1 %102) #2, !dbg !185 + %4214 = fcmp olt float %3808, 0x3810000000000000, !dbg !186 + %4215 = fmul float %3808, 0x4160000000000000, !dbg !186 + %.02.i = select i1 %4214, float %4215, float %3808, !dbg !186 + %i.i.0.i = select i1 %4214, float -2.300000e+01, float 0.000000e+00, !dbg !186 + %4216 = bitcast float %.02.i to i32, !dbg !186 + %4217 = add i32 %4216, -1060439283, !dbg !186 + %4218 = and i32 %4217, -8388608, !dbg !186 + %4219 = sub i32 %4216, %4218, !dbg !186 + %4220 = bitcast i32 %4219 to float, !dbg !186 + %4221 = sitofp i32 %4218 to float, !dbg !186 + %4222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not.i356 = icmp eq i32 %4222, 0, !dbg !186 + %4223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4221, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !186 + %4224 = tail call float @llvm.nvvm.fma.rn.f(float %4221, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !186 + %.08.i = select i1 %.not.i356, float %4224, float %4223, !dbg !186 + %4225 = fadd float %4220, -1.000000e+00, !dbg !186 + %4226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not1.i = icmp eq i32 %4226, 0, !dbg !186 + %4227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4225, float 0xBFC58FE600000000) #2, !dbg !186 + %4228 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4225, float 0xBFC58FE600000000) #2, !dbg !186 + %.010.i = select i1 %.not1.i, float %4228, float %4227, !dbg !186 + %4229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not2.i = icmp eq i32 %4229, 0, !dbg !186 + %4230 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %4225, float 0x3FC5F9E540000000) #2, !dbg !186 + %4231 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %4225, float 0x3FC5F9E540000000) #2, !dbg !186 + %.011.i = select i1 %.not2.i, float %4231, float %4230, !dbg !186 + %4232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not3.i = icmp eq i32 %4232, 0, !dbg !186 + %4233 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %4225, float 0xBFC6E9C860000000) #2, !dbg !186 + %4234 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %4225, float 0xBFC6E9C860000000) #2, !dbg !186 + %.012.i = select i1 %.not3.i, float %4234, float %4233, !dbg !186 + %4235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not4.i = icmp eq i32 %4235, 0, !dbg !186 + %4236 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %4225, float 0x3FCA417E80000000) #2, !dbg !186 + %4237 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %4225, float 0x3FCA417E80000000) #2, !dbg !186 + %.09.i = select i1 %.not4.i, float %4237, float %4236, !dbg !186 + %4238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not5.i = icmp eq i32 %4238, 0, !dbg !186 + %4239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %4225, float 0xBFCEC79160000000) #2, !dbg !186 + %4240 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %4225, float 0xBFCEC79160000000) #2, !dbg !186 + %.05.i = select i1 %.not5.i, float %4240, float %4239, !dbg !186 + %4241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not6.i = icmp eq i32 %4241, 0, !dbg !186 + %4242 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %4225, float 0x3FD277F320000000) #2, !dbg !186 + %4243 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %4225, float 0x3FD277F320000000) #2, !dbg !186 + %.01.i = select i1 %.not6.i, float %4243, float %4242, !dbg !186 + %4244 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not7.i = icmp eq i32 %4244, 0, !dbg !186 + %4245 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %4225, float 0xBFD7154920000000) #2, !dbg !186 + %4246 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %4225, float 0xBFD7154920000000) #2, !dbg !186 + %.0.i357 = select i1 %.not7.i, float %4246, float %4245, !dbg !186 + %4247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not8.i = icmp eq i32 %4247, 0, !dbg !186 + %4248 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i357, float %4225, float 0x3FDEC70940000000) #2, !dbg !186 + %4249 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i357, float %4225, float 0x3FDEC70940000000) #2, !dbg !186 + %.07.i = select i1 %.not8.i, float %4249, float %4248, !dbg !186 + %4250 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not9.i = icmp eq i32 %4250, 0, !dbg !186 + %4251 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %4225, float 0xBFE7154760000000) #2, !dbg !186 + %4252 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %4225, float 0xBFE7154760000000) #2, !dbg !186 + %.06.i = select i1 %.not9.i, float %4252, float %4251, !dbg !186 + %4253 = fmul float %4225, %.06.i, !dbg !186 + %4254 = fmul float %4225, %4253, !dbg !186 + %4255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not10.i = icmp eq i32 %4255, 0, !dbg !186 + %4256 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4225, float 0x3FF7154760000000, float %4254) #2, !dbg !186 + %4257 = tail call float @llvm.nvvm.fma.rn.f(float %4225, float 0x3FF7154760000000, float %4254) #2, !dbg !186 + %.04.i = select i1 %.not10.i, float %4257, float %4256, !dbg !186 + %4258 = fadd float %.08.i, %.04.i, !dbg !186 + %4259 = icmp ugt i32 %4216, 2139095039, !dbg !186 + br i1 %4259, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !186 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge461 + %4260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not11.i = icmp eq i32 %4260, 0, !dbg !186 + %4261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !186 + %4262 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !186 + %.03.i = select i1 %.not11.i, float %4262, float %4261, !dbg !186 + br label %__nv_log2f.exit, !dbg !186 + +__nv_log2f.exit: ; preds = %._crit_edge461, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %4258, %._crit_edge461 ], !dbg !186 + %4263 = fcmp olt float %3809, 0x3810000000000000, !dbg !186 + %4264 = fmul float %3809, 0x4160000000000000, !dbg !186 + %.02.i358 = select i1 %4263, float %4264, float %3809, !dbg !186 + %i.i.0.i359 = select i1 %4263, float -2.300000e+01, float 0.000000e+00, !dbg !186 + %4265 = bitcast float %.02.i358 to i32, !dbg !186 + %4266 = add i32 %4265, -1060439283, !dbg !186 + %4267 = and i32 %4266, -8388608, !dbg !186 + %4268 = sub i32 %4265, %4267, !dbg !186 + %4269 = bitcast i32 %4268 to float, !dbg !186 + %4270 = sitofp i32 %4267 to float, !dbg !186 + %4271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not.i360 = icmp eq i32 %4271, 0, !dbg !186 + %4272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4270, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !186 + %4273 = tail call float @llvm.nvvm.fma.rn.f(float %4270, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !186 + %.08.i361 = select i1 %.not.i360, float %4273, float %4272, !dbg !186 + %4274 = fadd float %4269, -1.000000e+00, !dbg !186 + %4275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not1.i362 = icmp eq i32 %4275, 0, !dbg !186 + %4276 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4274, float 0xBFC58FE600000000) #2, !dbg !186 + %4277 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4274, float 0xBFC58FE600000000) #2, !dbg !186 + %.010.i363 = select i1 %.not1.i362, float %4277, float %4276, !dbg !186 + %4278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not2.i364 = icmp eq i32 %4278, 0, !dbg !186 + %4279 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i363, float %4274, float 0x3FC5F9E540000000) #2, !dbg !186 + %4280 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i363, float %4274, float 0x3FC5F9E540000000) #2, !dbg !186 + %.011.i365 = select i1 %.not2.i364, float %4280, float %4279, !dbg !186 + %4281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not3.i366 = icmp eq i32 %4281, 0, !dbg !186 + %4282 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i365, float %4274, float 0xBFC6E9C860000000) #2, !dbg !186 + %4283 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i365, float %4274, float 0xBFC6E9C860000000) #2, !dbg !186 + %.012.i367 = select i1 %.not3.i366, float %4283, float %4282, !dbg !186 + %4284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not4.i368 = icmp eq i32 %4284, 0, !dbg !186 + %4285 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i367, float %4274, float 0x3FCA417E80000000) #2, !dbg !186 + %4286 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i367, float %4274, float 0x3FCA417E80000000) #2, !dbg !186 + %.09.i369 = select i1 %.not4.i368, float %4286, float %4285, !dbg !186 + %4287 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not5.i370 = icmp eq i32 %4287, 0, !dbg !186 + %4288 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i369, float %4274, float 0xBFCEC79160000000) #2, !dbg !186 + %4289 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i369, float %4274, float 0xBFCEC79160000000) #2, !dbg !186 + %.05.i371 = select i1 %.not5.i370, float %4289, float %4288, !dbg !186 + %4290 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not6.i372 = icmp eq i32 %4290, 0, !dbg !186 + %4291 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i371, float %4274, float 0x3FD277F320000000) #2, !dbg !186 + %4292 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i371, float %4274, float 0x3FD277F320000000) #2, !dbg !186 + %.01.i373 = select i1 %.not6.i372, float %4292, float %4291, !dbg !186 + %4293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not7.i374 = icmp eq i32 %4293, 0, !dbg !186 + %4294 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i373, float %4274, float 0xBFD7154920000000) #2, !dbg !186 + %4295 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i373, float %4274, float 0xBFD7154920000000) #2, !dbg !186 + %.0.i375 = select i1 %.not7.i374, float %4295, float %4294, !dbg !186 + %4296 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not8.i376 = icmp eq i32 %4296, 0, !dbg !186 + %4297 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i375, float %4274, float 0x3FDEC70940000000) #2, !dbg !186 + %4298 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i375, float %4274, float 0x3FDEC70940000000) #2, !dbg !186 + %.07.i377 = select i1 %.not8.i376, float %4298, float %4297, !dbg !186 + %4299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not9.i378 = icmp eq i32 %4299, 0, !dbg !186 + %4300 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i377, float %4274, float 0xBFE7154760000000) #2, !dbg !186 + %4301 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i377, float %4274, float 0xBFE7154760000000) #2, !dbg !186 + %.06.i379 = select i1 %.not9.i378, float %4301, float %4300, !dbg !186 + %4302 = fmul float %4274, %.06.i379, !dbg !186 + %4303 = fmul float %4274, %4302, !dbg !186 + %4304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not10.i380 = icmp eq i32 %4304, 0, !dbg !186 + %4305 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4274, float 0x3FF7154760000000, float %4303) #2, !dbg !186 + %4306 = tail call float @llvm.nvvm.fma.rn.f(float %4274, float 0x3FF7154760000000, float %4303) #2, !dbg !186 + %.04.i381 = select i1 %.not10.i380, float %4306, float %4305, !dbg !186 + %4307 = fadd float %.08.i361, %.04.i381, !dbg !186 + %4308 = icmp ugt i32 %4265, 2139095039, !dbg !186 + br i1 %4308, label %__nv_fmaf_rn.exit.i.i384, label %__nv_log2f.exit387, !dbg !186 + +__nv_fmaf_rn.exit.i.i384: ; preds = %__nv_log2f.exit + %4309 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !186 + %.not11.i385 = icmp eq i32 %4309, 0, !dbg !186 + %4310 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !186 + %4311 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !186 + %.03.i386 = select i1 %.not11.i385, float %4311, float %4310, !dbg !186 + br label %__nv_log2f.exit387, !dbg !186 + +__nv_log2f.exit387: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i384 + %r.i.0.i382 = phi float [ %.03.i386, %__nv_fmaf_rn.exit.i.i384 ], [ %4307, %__nv_log2f.exit ], !dbg !186 + %4312 = insertelement <2 x float> poison, float %.02.i, i64 0, !dbg !186 + %4313 = insertelement <2 x float> %4312, float %.02.i358, i64 1, !dbg !186 + %4314 = fcmp oeq <2 x float> %4313, zeroinitializer, !dbg !186 + %4315 = shl nuw nsw i32 %21, 5, !dbg !187 + %4316 = add nuw nsw i32 %4315, %22, !dbg !188 + %4317 = mul i32 %11, %4316, !dbg !189 + %4318 = sext i32 %4317 to i64, !dbg !190 + %4319 = getelementptr float, ptr addrspace(1) %3, i64 %4318, !dbg !190 + %4320 = and i32 %40, 127, !dbg !27 + %4321 = or disjoint i32 %39, %4320, !dbg !28 + %4322 = sext i32 %4321 to i64, !dbg !191 + %4323 = getelementptr float, ptr addrspace(1) %4319, i64 %4322, !dbg !191 + %4324 = insertelement <2 x float> poison, float %r.i.0.i, i64 0, !dbg !186 + %4325 = insertelement <2 x float> %4324, float %r.i.0.i382, i64 1, !dbg !186 + %4326 = select <2 x i1> %4314, <2 x float> splat (float 0xFFF0000000000000), <2 x float> %4325, !dbg !186 + %4327 = fadd <2 x float> %3804, %4326, !dbg !192 + %4328 = icmp slt i32 %4321, %11, !dbg !193 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !194 + %4329 = shl nuw nsw i32 %4154, 1, !dbg !194 + %4330 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4329, !dbg !194 + store <2 x float> %4327, ptr addrspace(3) %4330, align 8, !dbg !194 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !194 + %4331 = shl nuw nsw i32 %143, 3, !dbg !194 + %4332 = shl nuw nsw i32 %146, 2, !dbg !194 + %4333 = lshr exact i32 %147, 1, !dbg !194 + %4334 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4331, !dbg !194 + %4335 = getelementptr inbounds nuw i8, ptr addrspace(3) %4334, i32 %4332, !dbg !194 + %4336 = getelementptr inbounds nuw i8, ptr addrspace(3) %4335, i32 %4333, !dbg !194 + %4337 = load i32, ptr addrspace(3) %4336, align 4, !dbg !194 + %4338 = and i32 %40, 128, !dbg !194 + %4339 = icmp eq i32 %4338, 0, !dbg !194 + %4340 = and i1 %4339, %4328, !dbg !194 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %4337, ptr addrspace(1) %4323, i1 %4340) #2, !dbg !194 + ret void, !dbg !195 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #6 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #6 = { convergent nounwind } +attributes #7 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_0", linkageName: "triton_tem_fused_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 97, column: 28, scope: !5) +!9 = !DILocation(line: 98, column: 27, scope: !5) +!10 = !DILocation(line: 99, column: 27, scope: !5) +!11 = !DILocation(line: 103, column: 23, scope: !5) +!12 = !DILocation(line: 85, column: 54, scope: !5) +!13 = !DILocation(line: 107, column: 24, scope: !5) +!14 = !DILocation(line: 107, column: 45, scope: !5) +!15 = !DILocation(line: 107, column: 36, scope: !5) +!16 = !DILocation(line: 108, column: 25, scope: !5) +!17 = !DILocation(line: 108, column: 47, scope: !5) +!18 = !DILocation(line: 108, column: 37, scope: !5) +!19 = !DILocation(line: 111, column: 12, scope: !5) +!20 = !DILocation(line: 112, column: 12, scope: !5) +!21 = !DILocation(line: 113, column: 12, scope: !5) +!22 = !DILocation(line: 142, column: 51, scope: !5) +!23 = !DILocation(line: 142, column: 74, scope: !5) +!24 = !DILocation(line: 143, column: 46, scope: !5) +!25 = !DILocation(line: 143, column: 64, scope: !5) +!26 = !DILocation(line: 144, column: 23, scope: !5) +!27 = !DILocation(line: 144, column: 46, scope: !5) +!28 = !DILocation(line: 144, column: 33, scope: !5) +!29 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !31) +!30 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!31 = !DILocation(line: 146, column: 101, scope: !5) +!32 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !31) +!33 = !DILocation(line: 284, column: 56, scope: !30, inlinedAt: !31) +!34 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !31) +!35 = !DILocation(line: 292, column: 52, scope: !30, inlinedAt: !31) +!36 = !DILocation(line: 292, column: 23, scope: !30, inlinedAt: !31) +!37 = !DILocation(line: 151, column: 26, scope: !5) +!38 = !DILocation(line: 152, column: 23, scope: !5) +!39 = !DILocation(line: 152, column: 37, scope: !5) +!40 = !DILocation(line: 153, column: 42, scope: !5) +!41 = !DILocation(line: 153, column: 28, scope: !5) +!42 = !DILocation(line: 154, column: 45, scope: !5) +!43 = !DILocation(line: 41, column: 22, scope: !44, inlinedAt: !46) +!44 = distinct !DILexicalBlockFile(scope: !5, file: !45, discriminator: 0) +!45 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!46 = !DILocation(line: 154, column: 92, scope: !5) +!47 = !DILocation(line: 41, column: 28, scope: !44, inlinedAt: !46) +!48 = !DILocation(line: 154, column: 102, scope: !5) +!49 = !DILocation(line: 154, column: 65, scope: !5) +!50 = !DILocation(line: 159, column: 37, scope: !5) +!51 = !DILocation(line: 257, column: 21, scope: !30, inlinedAt: !52) +!52 = !DILocation(line: 172, column: 41, scope: !5) +!53 = !DILocation(line: 376, column: 33, scope: !30, inlinedAt: !52) +!54 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !52) +!55 = !DILocation(line: 376, column: 23, scope: !30, inlinedAt: !52) +!56 = !DILocation(line: 378, column: 23, scope: !30, inlinedAt: !52) +!57 = !DILocation(line: 379, column: 23, scope: !30, inlinedAt: !52) +!58 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !52) +!59 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !52) +!60 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !52) +!61 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !52) +!62 = !DILocation(line: 292, column: 52, scope: !30, inlinedAt: !52) +!63 = !DILocation(line: 292, column: 23, scope: !30, inlinedAt: !52) +!64 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !52) +!65 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !52) +!66 = !DILocation(line: 159, column: 24, scope: !5) +!67 = !DILocation(line: 398, column: 25, scope: !30, inlinedAt: !52) +!68 = !DILocation(line: 385, column: 24, scope: !30, inlinedAt: !52) +!69 = !DILocation(line: 395, column: 24, scope: !30, inlinedAt: !52) +!70 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !52) +!71 = !DILocation(line: 367, column: 44, scope: !30, inlinedAt: !52) +!72 = !DILocation(line: 373, column: 23, scope: !30, inlinedAt: !52) +!73 = !DILocation(line: 381, column: 23, scope: !30, inlinedAt: !52) +!74 = !DILocation(line: 384, column: 24, scope: !30, inlinedAt: !52) +!75 = !DILocation(line: 387, column: 25, scope: !30, inlinedAt: !52) +!76 = !DILocation(line: 390, column: 25, scope: !30, inlinedAt: !52) +!77 = !DILocation(line: 391, column: 24, scope: !30, inlinedAt: !52) +!78 = !DILocation(line: 393, column: 39, scope: !30, inlinedAt: !52) +!79 = !DILocation(line: 396, column: 24, scope: !30, inlinedAt: !52) +!80 = !DILocation(line: 397, column: 23, scope: !30, inlinedAt: !52) +!81 = !DILocation(line: 399, column: 25, scope: !30, inlinedAt: !52) +!82 = !DILocation(line: 401, column: 25, scope: !30, inlinedAt: !52) +!83 = !DILocation(line: 402, column: 24, scope: !30, inlinedAt: !52) +!84 = !DILocation(line: 404, column: 39, scope: !30, inlinedAt: !52) +!85 = !DILocation(line: 405, column: 25, scope: !30, inlinedAt: !52) +!86 = !DILocation(line: 406, column: 24, scope: !30, inlinedAt: !52) +!87 = !DILocation(line: 407, column: 24, scope: !30, inlinedAt: !52) +!88 = !DILocation(line: 412, column: 73, scope: !30, inlinedAt: !52) +!89 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !52) +!90 = !DILocation(line: 414, column: 69, scope: !30, inlinedAt: !52) +!91 = !DILocation(line: 168, column: 27, scope: !44, inlinedAt: !52) +!92 = !DILocation(line: 189, column: 40, scope: !44, inlinedAt: !52) +!93 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !52) +!94 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !52) +!95 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !52) +!96 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !52) +!97 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !52) +!98 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !52) +!99 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !52) +!100 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !52) +!101 = !DILocation(line: 261, column: 15, scope: !44, inlinedAt: !52) +!102 = !DILocation(line: 291, column: 36, scope: !44, inlinedAt: !52) +!103 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !52) +!104 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !52) +!105 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !52) +!106 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !52) +!107 = !DILocation(line: 548, column: 26, scope: !30, inlinedAt: !52) +!108 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !52) +!109 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !52) +!110 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !52) +!111 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !52) +!112 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !52) +!113 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !52) +!114 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !52) +!115 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !52) +!116 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !52) +!117 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !52) +!118 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !52) +!119 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !52) +!120 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !52) +!121 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !52) +!122 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !52) +!123 = !DILocation(line: 136, column: 19, scope: !5) +!124 = !DILocation(line: 181, column: 35, scope: !5) +!125 = !DILocation(line: 182, column: 27, scope: !5) +!126 = !DILocation(line: 182, column: 41, scope: !5) +!127 = !DILocation(line: 183, column: 51, scope: !5) +!128 = !DILocation(line: 183, column: 32, scope: !5) +!129 = !DILocation(line: 184, column: 49, scope: !5) +!130 = !DILocation(line: 184, column: 69, scope: !5) +!131 = !DILocation(line: 502, column: 40, scope: !30, inlinedAt: !132) +!132 = !DILocation(line: 198, column: 45, scope: !5) +!133 = !DILocation(line: 346, column: 35, scope: !30, inlinedAt: !132) +!134 = !DILocation(line: 284, column: 38, scope: !30, inlinedAt: !132) +!135 = !DILocation(line: 284, column: 20, scope: !30, inlinedAt: !132) +!136 = !DILocation(line: 284, column: 49, scope: !30, inlinedAt: !132) +!137 = !DILocation(line: 292, column: 52, scope: !30, inlinedAt: !132) +!138 = !DILocation(line: 292, column: 23, scope: !30, inlinedAt: !132) +!139 = !DILocation(line: 342, column: 32, scope: !30, inlinedAt: !132) +!140 = !DILocation(line: 351, column: 19, scope: !30, inlinedAt: !132) +!141 = !DILocation(line: 186, column: 28, scope: !5) +!142 = !DILocation(line: 353, column: 14, scope: !30, inlinedAt: !132) +!143 = !DILocation(line: 367, column: 44, scope: !30, inlinedAt: !132) +!144 = !DILocation(line: 417, column: 27, scope: !30, inlinedAt: !132) +!145 = !DILocation(line: 367, column: 69, scope: !30, inlinedAt: !132) +!146 = !DILocation(line: 168, column: 27, scope: !44, inlinedAt: !132) +!147 = !DILocation(line: 189, column: 40, scope: !44, inlinedAt: !132) +!148 = !DILocation(line: 421, column: 27, scope: !30, inlinedAt: !132) +!149 = !DILocation(line: 424, column: 51, scope: !30, inlinedAt: !132) +!150 = !DILocation(line: 423, column: 35, scope: !30, inlinedAt: !132) +!151 = !DILocation(line: 428, column: 31, scope: !30, inlinedAt: !132) +!152 = !DILocation(line: 428, column: 25, scope: !30, inlinedAt: !132) +!153 = !DILocation(line: 429, column: 39, scope: !30, inlinedAt: !132) +!154 = !DILocation(line: 429, column: 21, scope: !30, inlinedAt: !132) +!155 = !DILocation(line: 434, column: 16, scope: !30, inlinedAt: !132) +!156 = !DILocation(line: 261, column: 15, scope: !44, inlinedAt: !132) +!157 = !DILocation(line: 291, column: 36, scope: !44, inlinedAt: !132) +!158 = !DILocation(line: 434, column: 24, scope: !30, inlinedAt: !132) +!159 = !DILocation(line: 436, column: 16, scope: !30, inlinedAt: !132) +!160 = !DILocation(line: 440, column: 22, scope: !30, inlinedAt: !132) +!161 = !DILocation(line: 440, column: 44, scope: !30, inlinedAt: !132) +!162 = !DILocation(line: 548, column: 26, scope: !30, inlinedAt: !132) +!163 = !DILocation(line: 247, column: 33, scope: !30, inlinedAt: !132) +!164 = !DILocation(line: 248, column: 38, scope: !30, inlinedAt: !132) +!165 = !DILocation(line: 248, column: 24, scope: !30, inlinedAt: !132) +!166 = !DILocation(line: 249, column: 109, scope: !30, inlinedAt: !132) +!167 = !DILocation(line: 249, column: 113, scope: !30, inlinedAt: !132) +!168 = !DILocation(line: 249, column: 55, scope: !30, inlinedAt: !132) +!169 = !DILocation(line: 249, column: 25, scope: !30, inlinedAt: !132) +!170 = !DILocation(line: 250, column: 35, scope: !30, inlinedAt: !132) +!171 = !DILocation(line: 251, column: 34, scope: !30, inlinedAt: !132) +!172 = !DILocation(line: 251, column: 48, scope: !30, inlinedAt: !132) +!173 = !DILocation(line: 251, column: 63, scope: !30, inlinedAt: !132) +!174 = !DILocation(line: 252, column: 29, scope: !30, inlinedAt: !132) +!175 = !DILocation(line: 252, column: 61, scope: !30, inlinedAt: !132) +!176 = !DILocation(line: 252, column: 42, scope: !30, inlinedAt: !132) +!177 = !DILocation(line: 549, column: 21, scope: !30, inlinedAt: !132) +!178 = !DILocation(line: 206, column: 26, scope: !5) +!179 = !DILocation(line: 206, column: 34, scope: !5) +!180 = !DILocation(line: 208, column: 16, scope: !5) +!181 = !DILocation(line: 218, column: 49, scope: !5) +!182 = !DILocation(line: 218, column: 62, scope: !5) +!183 = !DILocation(line: 218, column: 75, scope: !5) +!184 = !DILocation(line: 218, column: 25, scope: !5) +!185 = !DILocation(line: 218, column: 110, scope: !5) +!186 = !DILocation(line: 223, column: 33, scope: !5) +!187 = !DILocation(line: 221, column: 26, scope: !5) +!188 = !DILocation(line: 221, column: 31, scope: !5) +!189 = !DILocation(line: 222, column: 32, scope: !5) +!190 = !DILocation(line: 222, column: 23, scope: !5) +!191 = !DILocation(line: 222, column: 40, scope: !5) +!192 = !DILocation(line: 223, column: 20, scope: !5) +!193 = !DILocation(line: 227, column: 48, scope: !5) +!194 = !DILocation(line: 227, column: 29, scope: !5) +!195 = !DILocation(line: 229, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c759b5452281040fbf30601257a0a4a85d156b40 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ptx @@ -0,0 +1,3717 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_0 // -- Begin function triton_tem_fused_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_0 +.visible .entry triton_tem_fused_0( + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_10, + .param .u32 triton_tem_fused_0_param_11, + .param .u32 triton_tem_fused_0_param_12, + .param .u32 triton_tem_fused_0_param_13, + .param .u32 triton_tem_fused_0_param_14, + .param .u32 triton_tem_fused_0_param_15, + .param .u32 triton_tem_fused_0_param_16, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_17, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_18 +) +.reqntid 256 +{ + .reg .pred %p<435>; + .reg .b16 %rs<54>; + .reg .b32 %r<5358>; + .reg .b64 %rd<293>; + .loc 1 18 0 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:18:0 + +// %bb.0: + ld.param.b32 %r761, [triton_tem_fused_0_param_12]; + ld.param.b32 %r760, [triton_tem_fused_0_param_11]; + ld.param.b64 %rd37, [triton_tem_fused_0_param_8]; + ld.param.b64 %rd36, [triton_tem_fused_0_param_7]; + ld.param.b64 %rd68, [triton_tem_fused_0_param_0]; +$L__tmp0: + .loc 1 97 28 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:97:28 + mov.u32 %r862, %ctaid.x; + ld.param.b64 %rd69, [triton_tem_fused_0_param_1]; + .loc 1 98 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:98:27 + mov.u32 %r1, %ctaid.y; + ld.param.b64 %rd70, [triton_tem_fused_0_param_2]; + .loc 1 99 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:99:27 + mov.u32 %r2, %ctaid.z; + .loc 1 103 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:103:23 + and.b32 %r863, %r1, 1; + ld.param.b64 %rd71, [triton_tem_fused_0_param_5]; + .loc 1 107 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:107:24 + mul.lo.s32 %r864, %r1, %r760; + shl.b32 %r3, %r864, 12; + ld.param.b64 %rd72, [triton_tem_fused_0_param_6]; + .loc 1 107 45 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:107:45 + shl.b32 %r4, %r2, 7; + .loc 1 107 36 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:107:36 + add.s32 %r865, %r3, %r4; + .loc 1 108 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:108:25 + shl.b32 %r866, %r863, 10; + ld.param.b64 %rd73, [triton_tem_fused_0_param_9]; + .loc 1 108 47 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:108:47 + shl.b32 %r867, %r2, 5; + and.b32 %r868, %r867, 2097024; + .loc 1 108 37 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:108:37 + add.s32 %r869, %r866, %r868; + mul.lo.s32 %r870, %r869, %r761; + ld.param.b32 %r871, [triton_tem_fused_0_param_13]; + ld.param.b32 %r872, [triton_tem_fused_0_param_14]; + .loc 1 111 12 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:111:12 + mad.wide.s32 %rd74, %r865, 2, %rd68; + ld.param.b32 %r873, [triton_tem_fused_0_param_15]; + .loc 1 112 12 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:112:12 + mul.wide.s32 %rd75, %r870, 2; + add.s64 %rd1, %rd69, %rd75; + .loc 1 113 12 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:113:12 + add.s64 %rd2, %rd70, %rd75; + .loc 1 142 74 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:142:74 + mad.lo.s32 %r874, %r871, %r863, %r862; + .loc 1 143 64 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:143:64 + mad.lo.s32 %r875, %r872, %r863, %r862; + mul.lo.s32 %r876, %r875, %r873; + .loc 1 144 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:144:23 + shl.b32 %r5, %r862, 7; + .loc 1 144 46 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:144:46 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r877, %r6, 240; + bfe.u32 %r8, %r6, 4, 4; + or.b32 %r9, %r8, 16; + or.b32 %r10, %r8, 32; + or.b32 %r11, %r8, 48; + .loc 1 144 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:144:33 + or.b32 %r12, %r8, %r5; + or.b32 %r13, %r9, %r5; + or.b32 %r14, %r10, %r5; + or.b32 %r15, %r11, %r5; + or.b32 %r16, %r12, 64; + or.b32 %r17, %r12, 80; + or.b32 %r18, %r12, 96; + or.b32 %r19, %r12, 112; +$L__tmp1: + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + shl.b32 %r20, %r12, 12; + shl.b32 %r21, %r13, 12; + shl.b32 %r22, %r14, 12; + shl.b32 %r23, %r15, 12; + shl.b32 %r24, %r16, 12; + shl.b32 %r25, %r17, 12; + shl.b32 %r26, %r18, 12; + shl.b32 %r27, %r19, 12; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + mad.wide.s32 %rd76, %r20, 2, %rd74; + mad.wide.s32 %rd77, %r21, 2, %rd74; + mad.wide.s32 %rd78, %r22, 2, %rd74; + mad.wide.s32 %rd79, %r23, 2, %rd74; + mad.wide.s32 %rd80, %r24, 2, %rd74; + mad.wide.s32 %rd81, %r25, 2, %rd74; + mad.wide.s32 %rd82, %r26, 2, %rd74; + mad.wide.s32 %rd83, %r27, 2, %rd74; + .loc 1 284 56 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:56 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + shl.b32 %r882, %r6, 3; + and.b32 %r883, %r882, 120; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + cvt.u64.u32 %rd3, %r883; + mul.wide.u32 %rd84, %r883, 2; + add.s64 %rd39, %rd76, %rd84; + add.s64 %rd40, %rd77, %rd84; + add.s64 %rd41, %rd78, %rd84; + add.s64 %rd42, %rd79, %rd84; + add.s64 %rd43, %rd80, %rd84; + add.s64 %rd44, %rd81, %rd84; + add.s64 %rd45, %rd82, %rd84; + add.s64 %rd46, %rd83, %rd84; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + setp.lt.s32 %p1, %r12, %r760; + setp.lt.s32 %p2, %r13, %r760; + setp.lt.s32 %p3, %r14, %r760; + setp.lt.s32 %p4, %r15, %r760; + setp.lt.s32 %p5, %r16, %r760; + setp.lt.s32 %p6, %r17, %r760; + setp.lt.s32 %p7, %r18, %r760; + setp.lt.s32 %p8, %r19, %r760; + mov.b32 %r5120, 0; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:146:101 ] + // begin inline asm + mov.u32 %r763, %r5120; + mov.u32 %r764, %r5120; + mov.u32 %r765, %r5120; + mov.u32 %r766, %r5120; + @%p1 ld.global.v4.b32 { %r763, %r764, %r765, %r766 }, [ %rd39 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r771, %r5120; + mov.u32 %r772, %r5120; + mov.u32 %r773, %r5120; + mov.u32 %r774, %r5120; + @%p2 ld.global.v4.b32 { %r771, %r772, %r773, %r774 }, [ %rd40 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r779, %r5120; + mov.u32 %r780, %r5120; + mov.u32 %r781, %r5120; + mov.u32 %r782, %r5120; + @%p3 ld.global.v4.b32 { %r779, %r780, %r781, %r782 }, [ %rd41 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r787, %r5120; + mov.u32 %r788, %r5120; + mov.u32 %r789, %r5120; + mov.u32 %r790, %r5120; + @%p4 ld.global.v4.b32 { %r787, %r788, %r789, %r790 }, [ %rd42 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r795, %r5120; + mov.u32 %r796, %r5120; + mov.u32 %r797, %r5120; + mov.u32 %r798, %r5120; + @%p5 ld.global.v4.b32 { %r795, %r796, %r797, %r798 }, [ %rd43 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r803, %r5120; + mov.u32 %r804, %r5120; + mov.u32 %r805, %r5120; + mov.u32 %r806, %r5120; + @%p6 ld.global.v4.b32 { %r803, %r804, %r805, %r806 }, [ %rd44 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r811, %r5120; + mov.u32 %r812, %r5120; + mov.u32 %r813, %r5120; + mov.u32 %r814, %r5120; + @%p7 ld.global.v4.b32 { %r811, %r812, %r813, %r814 }, [ %rd45 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r819, %r5120; + mov.u32 %r820, %r5120; + mov.u32 %r821, %r5120; + mov.u32 %r822, %r5120; + @%p8 ld.global.v4.b32 { %r819, %r820, %r821, %r822 }, [ %rd46 + 0 ]; + // end inline asm + and.b32 %r28, %r6, 7; + shl.b32 %r884, %r28, 4; + shl.b32 %r885, %r877, 3; + and.b32 %r29, %r6, 112; + and.b32 %r30, %r6, 8; + shl.b32 %r886, %r30, 11; + or.b32 %r887, %r884, %r885; + xor.b32 %r888, %r887, %r29; + mov.b32 %r889, global_smem; + add.s32 %r890, %r889, %r886; + add.s32 %r891, %r890, %r888; + st.shared.v4.b32 [%r891+98304], {%r763, %r764, %r765, %r766}; + st.shared.v4.b32 [%r891+100352], {%r771, %r772, %r773, %r774}; + st.shared.v4.b32 [%r891+102400], {%r779, %r780, %r781, %r782}; + st.shared.v4.b32 [%r891+104448], {%r787, %r788, %r789, %r790}; + st.shared.v4.b32 [%r891+106496], {%r795, %r796, %r797, %r798}; + st.shared.v4.b32 [%r891+108544], {%r803, %r804, %r805, %r806}; + st.shared.v4.b32 [%r891+110592], {%r811, %r812, %r813, %r814}; + st.shared.v4.b32 [%r891+112640], {%r819, %r820, %r821, %r822}; +$L__tmp2: + .loc 1 151 26 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:151:26 + cvt.s64.s32 %rd4, %r876; + mad.wide.s32 %rd47, %r876, 4, %rd72; + .loc 1 152 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:152:23 + // begin inline asm + mov.u32 %r827, 0x0; + ld.global.b32 { %r827 }, [ %rd47 + 0 ]; + // end inline asm + .loc 1 152 37 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:152:37 + shl.b32 %r31, %r827, 7; + .loc 1 153 42 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:153:42 + cvt.s64.s32 %rd6, %r874; + mad.wide.s32 %rd48, %r874, 4, %rd71; + .loc 1 153 28 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:153:28 + // begin inline asm + mov.u32 %r828, 0x0; + ld.global.b32 { %r828 }, [ %rd48 + 0 ]; + // end inline asm + .loc 1 154 45 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:154:45 + shl.b32 %r892, %r828, 1; +$L__tmp3: + .loc 2 41 22 // standard.py:41:22 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:154:92 ] + add.s32 %r893, %r761, 63; + .loc 2 41 28 // standard.py:41:28 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:154:92 ] + shr.s32 %r894, %r893, 31; + shr.u32 %r895, %r894, 26; + add.s32 %r896, %r893, %r895; + shr.s32 %r897, %r896, 6; +$L__tmp4: + .loc 1 154 102 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:154:102 + max.s32 %r33, %r897, 1; + .loc 1 154 65 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:154:65 + min.s32 %r34, %r892, %r33; + .loc 1 159 37 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:159:37 + and.b32 %r35, %r6, 3; + shl.b32 %r36, %r35, 1; + or.b32 %r37, %r36, 1; + or.b32 %r39, %r36, 8; + or.b32 %r38, %r36, 9; + or.b32 %r43, %r36, 16; + or.b32 %r42, %r36, 17; + or.b32 %r41, %r36, 24; + or.b32 %r40, %r36, 25; + or.b32 %r51, %r36, 32; + or.b32 %r50, %r36, 33; + or.b32 %r49, %r36, 40; + or.b32 %r48, %r36, 41; + or.b32 %r47, %r36, 48; + or.b32 %r46, %r36, 49; + or.b32 %r45, %r36, 56; + or.b32 %r44, %r36, 57; +$L__tmp5: + .loc 1 376 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:376:33 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mad.wide.u32 %rd50, %r1, 8, %rd73; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p10, %r892, 1; + setp.gt.s32 %p9, %r892, 0; + .loc 1 376 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:376:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + // begin inline asm + mov.u64 %rd7, 0x0; + @%p9 ld.global.b64 { %rd7 }, [ %rd50 + 0 ]; + // end inline asm + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + or.b32 %r900, %r31, %r8; + or.b32 %r901, %r31, %r9; + or.b32 %r902, %r31, %r10; + or.b32 %r903, %r31, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r904, %r900, 7; + shl.b32 %r905, %r901, 7; + shl.b32 %r906, %r902, 7; + shl.b32 %r907, %r903, 7; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.wide.s32 %rd85, %r904, 2; + mul.wide.s32 %rd86, %r905, 2; + mul.wide.s32 %rd87, %r906, 2; + mul.wide.s32 %rd88, %r907, 2; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd10, %rd1, %rd84; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd51, %rd10, %rd85; + add.s64 %rd52, %rd10, %rd86; + add.s64 %rd53, %rd10, %rd87; + add.s64 %rd54, %rd10, %rd88; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p11, %r900, %r761; + setp.lt.s32 %p12, %r901, %r761; + setp.lt.s32 %p13, %r902, %r761; + setp.lt.s32 %p14, %r903, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r908, %r30, 10; + or.b32 %r84, %r888, %r908; + add.s32 %r3013, %r889, %r84; + selp.b32 %r909, 16, 0, %p11; + selp.b32 %r838, %r909, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r3013 + 0 ], [ %rd51 + 0 ], 0x10, %r838; + // end inline asm + add.s32 %r3015, %r3013, 2048; + selp.b32 %r910, 16, 0, %p12; + selp.b32 %r840, %r910, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r3015 + 0 ], [ %rd52 + 0 ], 0x10, %r840; + // end inline asm + add.s32 %r3017, %r3013, 4096; + selp.b32 %r911, 16, 0, %p13; + selp.b32 %r842, %r911, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r3017 + 0 ], [ %rd53 + 0 ], 0x10, %r842; + // end inline asm + add.s32 %r3019, %r3013, 6144; + selp.b32 %r912, 16, 0, %p14; + selp.b32 %r844, %r912, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r3019 + 0 ], [ %rd54 + 0 ], 0x10, %r844; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd11, %rd2, %rd84; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd55, %rd11, %rd85; + add.s64 %rd56, %rd11, %rd86; + add.s64 %rd57, %rd11, %rd87; + add.s64 %rd58, %rd11, %rd88; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r837, %r3013, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r837 + 0 ], [ %rd55 + 0 ], 0x10, %r838; + // end inline asm + add.s32 %r839, %r3013, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r839 + 0 ], [ %rd56 + 0 ], 0x10, %r840; + // end inline asm + add.s32 %r841, %r3013, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r841 + 0 ], [ %rd57 + 0 ], 0x10, %r842; + // end inline asm + add.s32 %r843, %r3013, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r843 + 0 ], [ %rd58 + 0 ], 0x10, %r844; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.gt.s32 %p15, %r34, 1; + .loc 1 342 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:342:32 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + or.b32 %r913, %r31, 64; + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + or.b32 %r914, %r913, %r8; + or.b32 %r915, %r913, %r9; + or.b32 %r916, %r913, %r10; + or.b32 %r917, %r913, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r918, %r914, 7; + shl.b32 %r919, %r915, 7; + shl.b32 %r920, %r916, 7; + shl.b32 %r921, %r917, 7; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.wide.s32 %rd89, %r918, 2; + mul.wide.s32 %rd90, %r919, 2; + mul.wide.s32 %rd91, %r920, 2; + mul.wide.s32 %rd92, %r921, 2; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd59, %rd10, %rd89; + add.s64 %rd60, %rd10, %rd90; + add.s64 %rd61, %rd10, %rd91; + add.s64 %rd62, %rd10, %rd92; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p16, %r914, %r761; + setp.lt.s32 %p17, %r915, %r761; + setp.lt.s32 %p18, %r916, %r761; + setp.lt.s32 %p19, %r917, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + bar.sync 0; + add.s32 %r845, %r3013, 16384; + selp.b32 %r922, 16, 0, %p16; + selp.b32 %r854, %r922, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r845 + 0 ], [ %rd59 + 0 ], 0x10, %r854; + // end inline asm + add.s32 %r847, %r3013, 18432; + selp.b32 %r923, 16, 0, %p17; + selp.b32 %r856, %r923, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r847 + 0 ], [ %rd60 + 0 ], 0x10, %r856; + // end inline asm + add.s32 %r849, %r3013, 20480; + selp.b32 %r924, 16, 0, %p18; + selp.b32 %r858, %r924, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r849 + 0 ], [ %rd61 + 0 ], 0x10, %r858; + // end inline asm + add.s32 %r851, %r3013, 22528; + selp.b32 %r925, 16, 0, %p19; + selp.b32 %r860, %r925, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r851 + 0 ], [ %rd62 + 0 ], 0x10, %r860; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd63, %rd11, %rd89; + add.s64 %rd64, %rd11, %rd90; + add.s64 %rd65, %rd11, %rd91; + add.s64 %rd66, %rd11, %rd92; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r853, %r3013, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r853 + 0 ], [ %rd63 + 0 ], 0x10, %r854; + // end inline asm + add.s32 %r855, %r3013, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r855 + 0 ], [ %rd64 + 0 ], 0x10, %r856; + // end inline asm + add.s32 %r857, %r3013, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r857 + 0 ], [ %rd65 + 0 ], 0x10, %r858; + // end inline asm + add.s32 %r859, %r3013, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r859 + 0 ], [ %rd66 + 0 ], 0x10, %r860; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:351:19 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r926, 0fFF800000; + mov.b64 %rd292, {%r926, %r926}; + mov.b32 %r1749, 0f00000000; + mov.b32 %r1750, %r1749; + mov.b32 %r1751, %r1749; + mov.b32 %r1752, %r1749; + mov.b32 %r1753, %r1749; + mov.b32 %r1754, %r1749; + mov.b32 %r1755, %r1749; + mov.b32 %r1756, %r1749; + mov.b32 %r1757, %r1749; + mov.b32 %r1758, %r1749; + mov.b32 %r1759, %r1749; + mov.b32 %r1760, %r1749; + mov.b32 %r1761, %r1749; + mov.b32 %r1762, %r1749; + mov.b32 %r1763, %r1749; + mov.b32 %r1764, %r1749; + mov.b32 %r1765, %r1749; + mov.b32 %r1766, %r1749; + mov.b32 %r1767, %r1749; + mov.b32 %r1768, %r1749; + mov.b32 %r1769, %r1749; + mov.b32 %r1770, %r1749; + mov.b32 %r1771, %r1749; + mov.b32 %r1772, %r1749; + mov.b32 %r1773, %r1749; + mov.b32 %r1774, %r1749; + mov.b32 %r1775, %r1749; + mov.b32 %r1776, %r1749; + mov.b32 %r1777, %r1749; + mov.b32 %r1778, %r1749; + mov.b32 %r1779, %r1749; + mov.b32 %r1780, %r1749; + mov.b32 %r1781, %r1749; + mov.b32 %r1782, %r1749; + mov.b32 %r1783, %r1749; + mov.b32 %r1784, %r1749; + mov.b32 %r1785, %r1749; + mov.b32 %r1786, %r1749; + mov.b32 %r1787, %r1749; + mov.b32 %r1788, %r1749; + mov.b32 %r1789, %r1749; + mov.b32 %r1790, %r1749; + mov.b32 %r1791, %r1749; + mov.b32 %r1792, %r1749; + mov.b32 %r1793, %r1749; + mov.b32 %r1794, %r1749; + mov.b32 %r1795, %r1749; + mov.b32 %r1796, %r1749; + mov.b32 %r1797, %r1749; + mov.b32 %r1798, %r1749; + mov.b32 %r1799, %r1749; + mov.b32 %r1800, %r1749; + mov.b32 %r1801, %r1749; + mov.b32 %r1802, %r1749; + mov.b32 %r1803, %r1749; + mov.b32 %r1804, %r1749; + mov.b32 %r1805, %r1749; + mov.b32 %r1806, %r1749; + mov.b32 %r1807, %r1749; + mov.b32 %r1808, %r1749; + mov.b32 %r1809, %r1749; + mov.b32 %r1810, %r1749; + mov.b32 %r1811, %r1749; + mov.b32 %r1812, %r1749; + mov.b32 %r5354, %r1749; + mov.b32 %r5355, %r1749; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + @%p10 bra $L__BB0_3; +$L__tmp6: +// %bb.1: // %.lr.ph + .loc 1 0 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:40 + ld.param.b32 %r762, [triton_tem_fused_0_param_16]; + shr.u32 %r878, %r6, 1; + and.b32 %r879, %r878, 112; + bfe.u32 %r880, %r6, 2, 3; + or.b32 %r881, %r879, %r880; + or.b32 %r898, %r881, %r5; + or.b32 %r899, %r898, 8; + rem.s32 %r52, %r899, %r760; + rem.s32 %r54, %r898, %r760; + cvt.s64.s32 %rd8, %r54; + cvt.s64.s32 %rd9, %r52; + .loc 1 159 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:159:24 + or.b32 %r5136, %r31, %r36; + or.b32 %r5135, %r31, %r37; + or.b32 %r5133, %r31, %r38; + or.b32 %r5129, %r31, %r40; + or.b32 %r5121, %r31, %r44; + or.b32 %r5134, %r31, %r39; + or.b32 %r5130, %r31, %r41; + or.b32 %r5122, %r31, %r45; + or.b32 %r5131, %r31, %r42; + or.b32 %r5123, %r31, %r46; + or.b32 %r5132, %r31, %r43; + or.b32 %r5124, %r31, %r47; + or.b32 %r5125, %r31, %r48; + or.b32 %r5126, %r31, %r49; + or.b32 %r5127, %r31, %r50; + or.b32 %r5128, %r31, %r51; + add.s32 %r117, %r34, -2; + add.s32 %r118, %r34, -1; +$L__tmp7: + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + max.s32 %r119, %r34, 1; + mov.b64 %rd292, {%r926, %r926}; + mov.b32 %r5354, 0f00000000; + mov.b32 %r5052, 1; + mov.b32 %r5051, -1; + mov.b32 %r5050, 64; + mov.b32 %r5053, %r5050; + mov.b32 %r5355, %r5354; + mov.b32 %r1749, %r5354; + mov.b32 %r1750, %r5354; + mov.b32 %r1751, %r5354; + mov.b32 %r1752, %r5354; + mov.b32 %r1753, %r5354; + mov.b32 %r1754, %r5354; + mov.b32 %r1755, %r5354; + mov.b32 %r1756, %r5354; + mov.b32 %r1757, %r5354; + mov.b32 %r1758, %r5354; + mov.b32 %r1759, %r5354; + mov.b32 %r1760, %r5354; + mov.b32 %r1761, %r5354; + mov.b32 %r1762, %r5354; + mov.b32 %r1763, %r5354; + mov.b32 %r1764, %r5354; + mov.b32 %r1765, %r5354; + mov.b32 %r1766, %r5354; + mov.b32 %r1767, %r5354; + mov.b32 %r1768, %r5354; + mov.b32 %r1769, %r5354; + mov.b32 %r1770, %r5354; + mov.b32 %r1771, %r5354; + mov.b32 %r1772, %r5354; + mov.b32 %r1773, %r5354; + mov.b32 %r1774, %r5354; + mov.b32 %r1775, %r5354; + mov.b32 %r1776, %r5354; + mov.b32 %r1777, %r5354; + mov.b32 %r1778, %r5354; + mov.b32 %r1779, %r5354; + mov.b32 %r1780, %r5354; + mov.b32 %r1781, %r5354; + mov.b32 %r1782, %r5354; + mov.b32 %r1783, %r5354; + mov.b32 %r1784, %r5354; + mov.b32 %r1785, %r5354; + mov.b32 %r1786, %r5354; + mov.b32 %r1787, %r5354; + mov.b32 %r1788, %r5354; + mov.b32 %r1789, %r5354; + mov.b32 %r1790, %r5354; + mov.b32 %r1791, %r5354; + mov.b32 %r1792, %r5354; + mov.b32 %r1793, %r5354; + mov.b32 %r1794, %r5354; + mov.b32 %r1795, %r5354; + mov.b32 %r1796, %r5354; + mov.b32 %r1797, %r5354; + mov.b32 %r1798, %r5354; + mov.b32 %r1799, %r5354; + mov.b32 %r1800, %r5354; + mov.b32 %r1801, %r5354; + mov.b32 %r1802, %r5354; + mov.b32 %r1803, %r5354; + mov.b32 %r1804, %r5354; + mov.b32 %r1805, %r5354; + mov.b32 %r1806, %r5354; + mov.b32 %r1807, %r5354; + mov.b32 %r1808, %r5354; + mov.b32 %r1809, %r5354; + mov.b32 %r1810, %r5354; + mov.b32 %r1811, %r5354; + mov.b32 %r1812, %r5354; +$L__BB0_2: // %__nv_exp2f.exit256 + // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:40 + cvt.u32.u64 %r2163, %rd9; + cvt.u32.u64 %r2164, %rd8; + .loc 1 379 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:379:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.gt.s64 %p33, %rd7, %rd9; + setp.gt.s64 %p34, %rd7, %rd8; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p35, %r5120, %r117; + setp.lt.s32 %p31, %r5120, %r118; + add.s32 %r2165, %r5051, 1; + setp.gt.s32 %p36, %r2165, 2; + selp.b32 %r5051, 0, %r2165, %p36; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r2166, %r5051, 14; + add.s32 %r1448, %r889, %r2166; + .loc 1 351 19 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:351:19 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.idx.b32 %r2168, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r2169, %r2168, 11; + and.b32 %r2170, %r2169, 8192; + add.s32 %r1445, %r889, 98304; + add.s32 %r2171, %r2170, %r1445; + bfe.u32 %r2172, %r2171, 4, 14; + cvt.u64.u32 %rd128, %r2172; + or.b64 %rd94, %rd128, 4611686293372403712; + bfe.u32 %r2173, %r1448, 4, 14; + cvt.u64.u32 %rd129, %r2173; + or.b64 %rd95, %rd129, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd94, %rd95, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r2174, %r2171, 32; + bfe.u32 %r2175, %r2174, 4, 14; + cvt.u64.u32 %rd130, %r2175; + or.b64 %rd96, %rd130, 4611686293372403712; + add.s32 %r2176, %r1448, 32; + bfe.u32 %r2177, %r2176, 4, 14; + cvt.u64.u32 %rd131, %r2177; + or.b64 %rd97, %rd131, 4611686293338849280; + mov.pred %p20, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd96, %rd97, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2178, %r2171, 64; + bfe.u32 %r2179, %r2178, 4, 14; + cvt.u64.u32 %rd132, %r2179; + or.b64 %rd98, %rd132, 4611686293372403712; + add.s32 %r2180, %r1448, 64; + bfe.u32 %r2181, %r2180, 4, 14; + cvt.u64.u32 %rd133, %r2181; + or.b64 %rd99, %rd133, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd98, %rd99, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2182, %r2171, 96; + bfe.u32 %r2183, %r2182, 4, 14; + cvt.u64.u32 %rd134, %r2183; + or.b64 %rd100, %rd134, 4611686293372403712; + add.s32 %r2184, %r1448, 96; + bfe.u32 %r2185, %r2184, 4, 14; + cvt.u64.u32 %rd135, %r2185; + or.b64 %rd101, %rd135, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd100, %rd101, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2186, %r2171, 16384; + bfe.u32 %r2187, %r2186, 4, 14; + cvt.u64.u32 %rd136, %r2187; + or.b64 %rd102, %rd136, 4611686293372403712; + add.s32 %r2188, %r1448, 8192; + bfe.u32 %r2189, %r2188, 4, 14; + cvt.u64.u32 %rd137, %r2189; + or.b64 %rd103, %rd137, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd102, %rd103, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2190, %r2171, 16416; + bfe.u32 %r2191, %r2190, 4, 14; + cvt.u64.u32 %rd138, %r2191; + or.b64 %rd104, %rd138, 4611686293372403712; + add.s32 %r2192, %r1448, 8224; + bfe.u32 %r2193, %r2192, 4, 14; + cvt.u64.u32 %rd139, %r2193; + or.b64 %rd105, %rd139, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd104, %rd105, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2194, %r2171, 16448; + bfe.u32 %r2195, %r2194, 4, 14; + cvt.u64.u32 %rd140, %r2195; + or.b64 %rd106, %rd140, 4611686293372403712; + add.s32 %r2196, %r1448, 8256; + bfe.u32 %r2197, %r2196, 4, 14; + cvt.u64.u32 %rd141, %r2197; + or.b64 %rd107, %rd141, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd106, %rd107, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2198, %r2171, 16480; + bfe.u32 %r2199, %r2198, 4, 14; + cvt.u64.u32 %rd142, %r2199; + or.b64 %rd108, %rd142, 4611686293372403712; + add.s32 %r2200, %r1448, 8288; + bfe.u32 %r2201, %r2200, 4, 14; + cvt.u64.u32 %rd143, %r2201; + or.b64 %rd109, %rd143, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060}, %rd108, %rd109, %p20, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r1446, 0; + mov.b32 %r1447, %r1446; + mov.b32 %r1449, %r1446; + mov.b32 %r1450, %r1446; + // begin inline asm + // wait for regs: %r1029,%r1030,%r1031,%r1032,%r1033,%r1034,%r1035,%r1036,%r1037,%r1038,%r1039,%r1040,%r1041,%r1042,%r1043,%r1044,%r1045,%r1046,%r1047,%r1048,%r1049,%r1050,%r1051,%r1052,%r1053,%r1054,%r1055,%r1056,%r1057,%r1058,%r1059,%r1060,%r1445,%r1446,%r1447,%r1448,%r1449,%r1450,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:353:14 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2202, %r1029, 0f3DB504F3; + mul.f32 %r2203, %r1030, 0f3DB504F3; + mul.f32 %r2204, %r1031, 0f3DB504F3; + mul.f32 %r2205, %r1032, 0f3DB504F3; + mul.f32 %r2206, %r1033, 0f3DB504F3; + mul.f32 %r2207, %r1034, 0f3DB504F3; + mul.f32 %r2208, %r1035, 0f3DB504F3; + mul.f32 %r2209, %r1036, 0f3DB504F3; + mul.f32 %r2210, %r1037, 0f3DB504F3; + mul.f32 %r2211, %r1038, 0f3DB504F3; + mul.f32 %r2212, %r1039, 0f3DB504F3; + mul.f32 %r2213, %r1040, 0f3DB504F3; + mul.f32 %r2214, %r1041, 0f3DB504F3; + mul.f32 %r2215, %r1042, 0f3DB504F3; + mul.f32 %r2216, %r1043, 0f3DB504F3; + mul.f32 %r2217, %r1044, 0f3DB504F3; + mul.f32 %r2218, %r1045, 0f3DB504F3; + mul.f32 %r2219, %r1046, 0f3DB504F3; + mul.f32 %r2220, %r1047, 0f3DB504F3; + mul.f32 %r2221, %r1048, 0f3DB504F3; + mul.f32 %r2222, %r1049, 0f3DB504F3; + mul.f32 %r2223, %r1050, 0f3DB504F3; + mul.f32 %r2224, %r1051, 0f3DB504F3; + mul.f32 %r2225, %r1052, 0f3DB504F3; + mul.f32 %r2226, %r1053, 0f3DB504F3; + mul.f32 %r2227, %r1054, 0f3DB504F3; + mul.f32 %r2228, %r1055, 0f3DB504F3; + mul.f32 %r2229, %r1056, 0f3DB504F3; + mul.f32 %r2230, %r1057, 0f3DB504F3; + mul.f32 %r2231, %r1058, 0f3DB504F3; + mul.f32 %r2232, %r1059, 0f3DB504F3; + mul.f32 %r2233, %r1060, 0f3DB504F3; + .loc 1 367 44 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:44 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p37, %r5136, %r761; + setp.lt.s32 %p38, %r5135, %r761; + setp.lt.s32 %p39, %r5134, %r761; + setp.lt.s32 %p40, %r5133, %r761; + setp.lt.s32 %p41, %r5132, %r761; + setp.lt.s32 %p42, %r5131, %r761; + setp.lt.s32 %p43, %r5130, %r761; + setp.lt.s32 %p44, %r5129, %r761; + setp.lt.s32 %p45, %r5128, %r761; + setp.lt.s32 %p46, %r5127, %r761; + setp.lt.s32 %p47, %r5126, %r761; + setp.lt.s32 %p48, %r5125, %r761; + setp.lt.s32 %p49, %r5124, %r761; + setp.lt.s32 %p50, %r5123, %r761; + setp.lt.s32 %p51, %r5122, %r761; + setp.lt.s32 %p52, %r5121, %r761; + .loc 1 257 21 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:257:21 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + rem.s32 %r2234, %r5121, %r761; + rem.s32 %r2235, %r5122, %r761; + rem.s32 %r2236, %r5123, %r761; + rem.s32 %r2237, %r5124, %r761; + rem.s32 %r2238, %r5125, %r761; + rem.s32 %r2239, %r5126, %r761; + rem.s32 %r2240, %r5127, %r761; + rem.s32 %r2241, %r5128, %r761; + rem.s32 %r2242, %r5129, %r761; + rem.s32 %r2243, %r5130, %r761; + rem.s32 %r2244, %r5131, %r761; + rem.s32 %r2245, %r5132, %r761; + rem.s32 %r2246, %r5133, %r761; + rem.s32 %r2247, %r5134, %r761; + rem.s32 %r2248, %r5135, %r761; + rem.s32 %r2249, %r5136, %r761; + .loc 1 373 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:373:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.ge.s32 %p53, %r2164, %r2249; + setp.ge.s32 %p54, %r2164, %r2248; + setp.ge.s32 %p55, %r2163, %r2249; + setp.ge.s32 %p56, %r2163, %r2248; + setp.ge.s32 %p57, %r2164, %r2247; + setp.ge.s32 %p58, %r2164, %r2246; + setp.ge.s32 %p59, %r2163, %r2247; + setp.ge.s32 %p60, %r2163, %r2246; + setp.ge.s32 %p61, %r2164, %r2245; + setp.ge.s32 %p62, %r2164, %r2244; + setp.ge.s32 %p63, %r2163, %r2245; + setp.ge.s32 %p64, %r2163, %r2244; + setp.ge.s32 %p65, %r2164, %r2243; + setp.ge.s32 %p66, %r2164, %r2242; + setp.ge.s32 %p67, %r2163, %r2243; + setp.ge.s32 %p68, %r2163, %r2242; + setp.ge.s32 %p69, %r2164, %r2241; + setp.ge.s32 %p70, %r2164, %r2240; + setp.ge.s32 %p71, %r2163, %r2241; + setp.ge.s32 %p72, %r2163, %r2240; + setp.ge.s32 %p73, %r2164, %r2239; + setp.ge.s32 %p74, %r2164, %r2238; + setp.ge.s32 %p75, %r2163, %r2239; + setp.ge.s32 %p76, %r2163, %r2238; + setp.ge.s32 %p77, %r2164, %r2237; + setp.ge.s32 %p78, %r2164, %r2236; + setp.ge.s32 %p79, %r2163, %r2237; + setp.ge.s32 %p80, %r2163, %r2236; + setp.ge.s32 %p81, %r2164, %r2235; + setp.ge.s32 %p82, %r2164, %r2234; + setp.ge.s32 %p83, %r2163, %r2235; + setp.ge.s32 %p84, %r2163, %r2234; + .loc 1 381 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:381:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + and.pred %p85, %p34, %p53; + and.pred %p86, %p34, %p54; + and.pred %p87, %p33, %p55; + and.pred %p88, %p33, %p56; + and.pred %p89, %p34, %p57; + and.pred %p90, %p34, %p58; + and.pred %p91, %p33, %p59; + and.pred %p92, %p33, %p60; + and.pred %p93, %p34, %p61; + and.pred %p94, %p34, %p62; + and.pred %p95, %p33, %p63; + and.pred %p96, %p33, %p64; + and.pred %p97, %p34, %p65; + and.pred %p98, %p34, %p66; + and.pred %p99, %p33, %p67; + and.pred %p100, %p33, %p68; + and.pred %p101, %p34, %p69; + and.pred %p102, %p34, %p70; + and.pred %p103, %p33, %p71; + and.pred %p104, %p33, %p72; + and.pred %p105, %p34, %p73; + and.pred %p106, %p34, %p74; + and.pred %p107, %p33, %p75; + and.pred %p108, %p33, %p76; + and.pred %p109, %p34, %p77; + and.pred %p110, %p34, %p78; + and.pred %p111, %p33, %p79; + and.pred %p112, %p33, %p80; + and.pred %p113, %p34, %p81; + and.pred %p114, %p34, %p82; + and.pred %p115, %p33, %p83; + and.pred %p116, %p33, %p84; + .loc 1 384 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:384:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.ge.s32 %p117, %r2249, %r762; + setp.ge.s32 %p118, %r2248, %r762; + setp.ge.s32 %p119, %r2247, %r762; + setp.ge.s32 %p120, %r2246, %r762; + setp.ge.s32 %p121, %r2245, %r762; + setp.ge.s32 %p122, %r2244, %r762; + setp.ge.s32 %p123, %r2243, %r762; + setp.ge.s32 %p124, %r2242, %r762; + setp.ge.s32 %p125, %r2241, %r762; + setp.ge.s32 %p126, %r2240, %r762; + setp.ge.s32 %p127, %r2239, %r762; + setp.ge.s32 %p128, %r2238, %r762; + setp.ge.s32 %p129, %r2237, %r762; + setp.ge.s32 %p130, %r2236, %r762; + setp.ge.s32 %p131, %r2235, %r762; + setp.ge.s32 %p132, %r2234, %r762; + .loc 1 385 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:385:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + rem.s32 %r2250, %r2249, %r762; + rem.s32 %r2251, %r2248, %r762; + rem.s32 %r2252, %r2247, %r762; + rem.s32 %r2253, %r2246, %r762; + rem.s32 %r2254, %r2245, %r762; + rem.s32 %r2255, %r2244, %r762; + rem.s32 %r2256, %r2243, %r762; + rem.s32 %r2257, %r2242, %r762; + rem.s32 %r2258, %r2241, %r762; + rem.s32 %r2259, %r2240, %r762; + rem.s32 %r2260, %r2239, %r762; + rem.s32 %r2261, %r2238, %r762; + rem.s32 %r2262, %r2237, %r762; + rem.s32 %r2263, %r2236, %r762; + rem.s32 %r2264, %r2235, %r762; + rem.s32 %r2265, %r2234, %r762; + .loc 1 387 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:387:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.ne.b32 %p133, %r2265, 0; + setp.ne.b32 %p134, %r2264, 0; + setp.ne.b32 %p135, %r2263, 0; + setp.ne.b32 %p136, %r2262, 0; + setp.ne.b32 %p137, %r2261, 0; + setp.ne.b32 %p138, %r2260, 0; + setp.ne.b32 %p139, %r2259, 0; + setp.ne.b32 %p140, %r2258, 0; + setp.ne.b32 %p141, %r2257, 0; + setp.ne.b32 %p142, %r2256, 0; + setp.ne.b32 %p143, %r2255, 0; + setp.ne.b32 %p144, %r2254, 0; + setp.ne.b32 %p145, %r2253, 0; + setp.ne.b32 %p146, %r2252, 0; + setp.ne.b32 %p147, %r2251, 0; + setp.ne.b32 %p148, %r2250, 0; + .loc 1 390 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:390:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + xor.b32 %r2266, %r2265, %r762; + xor.b32 %r2267, %r2264, %r762; + xor.b32 %r2268, %r2263, %r762; + xor.b32 %r2269, %r2262, %r762; + xor.b32 %r2270, %r2261, %r762; + xor.b32 %r2271, %r2260, %r762; + xor.b32 %r2272, %r2259, %r762; + xor.b32 %r2273, %r2258, %r762; + xor.b32 %r2274, %r2257, %r762; + xor.b32 %r2275, %r2256, %r762; + xor.b32 %r2276, %r2255, %r762; + xor.b32 %r2277, %r2254, %r762; + xor.b32 %r2278, %r2253, %r762; + xor.b32 %r2279, %r2252, %r762; + xor.b32 %r2280, %r2251, %r762; + xor.b32 %r2281, %r2250, %r762; + .loc 1 393 39 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:393:39 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shr.s32 %r2282, %r2281, 31; + and.b32 %r2283, %r2282, %r762; + selp.b32 %r2284, %r2283, 0, %p148; + shr.s32 %r2285, %r2280, 31; + and.b32 %r2286, %r2285, %r762; + selp.b32 %r2287, %r2286, 0, %p147; + shr.s32 %r2288, %r2279, 31; + and.b32 %r2289, %r2288, %r762; + selp.b32 %r2290, %r2289, 0, %p146; + shr.s32 %r2291, %r2278, 31; + and.b32 %r2292, %r2291, %r762; + selp.b32 %r2293, %r2292, 0, %p145; + shr.s32 %r2294, %r2277, 31; + and.b32 %r2295, %r2294, %r762; + selp.b32 %r2296, %r2295, 0, %p144; + shr.s32 %r2297, %r2276, 31; + and.b32 %r2298, %r2297, %r762; + selp.b32 %r2299, %r2298, 0, %p143; + shr.s32 %r2300, %r2275, 31; + and.b32 %r2301, %r2300, %r762; + selp.b32 %r2302, %r2301, 0, %p142; + shr.s32 %r2303, %r2274, 31; + and.b32 %r2304, %r2303, %r762; + selp.b32 %r2305, %r2304, 0, %p141; + shr.s32 %r2306, %r2273, 31; + and.b32 %r2307, %r2306, %r762; + selp.b32 %r2308, %r2307, 0, %p140; + shr.s32 %r2309, %r2272, 31; + and.b32 %r2310, %r2309, %r762; + selp.b32 %r2311, %r2310, 0, %p139; + shr.s32 %r2312, %r2271, 31; + and.b32 %r2313, %r2312, %r762; + selp.b32 %r2314, %r2313, 0, %p138; + shr.s32 %r2315, %r2270, 31; + and.b32 %r2316, %r2315, %r762; + selp.b32 %r2317, %r2316, 0, %p137; + shr.s32 %r2318, %r2269, 31; + and.b32 %r2319, %r2318, %r762; + selp.b32 %r2320, %r2319, 0, %p136; + shr.s32 %r2321, %r2268, 31; + and.b32 %r2322, %r2321, %r762; + selp.b32 %r2323, %r2322, 0, %p135; + shr.s32 %r2324, %r2267, 31; + and.b32 %r2325, %r2324, %r762; + selp.b32 %r2326, %r2325, 0, %p134; + shr.s32 %r2327, %r2266, 31; + and.b32 %r2328, %r2327, %r762; + selp.b32 %r2329, %r2328, 0, %p133; + add.s32 %r2330, %r2329, %r2265; + add.s32 %r2331, %r2326, %r2264; + add.s32 %r2332, %r2323, %r2263; + add.s32 %r2333, %r2320, %r2262; + add.s32 %r2334, %r2317, %r2261; + add.s32 %r2335, %r2314, %r2260; + add.s32 %r2336, %r2311, %r2259; + add.s32 %r2337, %r2308, %r2258; + add.s32 %r2338, %r2305, %r2257; + add.s32 %r2339, %r2302, %r2256; + add.s32 %r2340, %r2299, %r2255; + add.s32 %r2341, %r2296, %r2254; + add.s32 %r2342, %r2293, %r2253; + add.s32 %r2343, %r2290, %r2252; + add.s32 %r2344, %r2287, %r2251; + add.s32 %r2345, %r2284, %r2250; + .loc 1 395 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:395:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + cvt.s64.s32 %rd144, %r2345; + cvt.s64.s32 %rd145, %r2344; + cvt.s64.s32 %rd146, %r2343; + cvt.s64.s32 %rd147, %r2342; + cvt.s64.s32 %rd148, %r2341; + cvt.s64.s32 %rd149, %r2340; + cvt.s64.s32 %rd150, %r2339; + cvt.s64.s32 %rd151, %r2338; + cvt.s64.s32 %rd152, %r2337; + cvt.s64.s32 %rd153, %r2336; + cvt.s64.s32 %rd154, %r2335; + cvt.s64.s32 %rd155, %r2334; + cvt.s64.s32 %rd156, %r2333; + cvt.s64.s32 %rd157, %r2332; + cvt.s64.s32 %rd158, %r2331; + cvt.s64.s32 %rd159, %r2330; + setp.gt.s64 %p149, %rd7, %rd159; + setp.gt.s64 %p150, %rd7, %rd158; + setp.gt.s64 %p151, %rd7, %rd157; + setp.gt.s64 %p152, %rd7, %rd156; + setp.gt.s64 %p153, %rd7, %rd155; + setp.gt.s64 %p154, %rd7, %rd154; + setp.gt.s64 %p155, %rd7, %rd153; + setp.gt.s64 %p156, %rd7, %rd152; + setp.gt.s64 %p157, %rd7, %rd151; + setp.gt.s64 %p158, %rd7, %rd150; + setp.gt.s64 %p159, %rd7, %rd149; + setp.gt.s64 %p160, %rd7, %rd148; + setp.gt.s64 %p161, %rd7, %rd147; + setp.gt.s64 %p162, %rd7, %rd146; + setp.gt.s64 %p163, %rd7, %rd145; + setp.gt.s64 %p164, %rd7, %rd144; + .loc 1 396 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:396:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + and.pred %p165, %p117, %p164; + and.pred %p166, %p118, %p163; + and.pred %p167, %p119, %p162; + and.pred %p168, %p120, %p161; + and.pred %p169, %p121, %p160; + and.pred %p170, %p122, %p159; + and.pred %p171, %p123, %p158; + and.pred %p172, %p124, %p157; + and.pred %p173, %p125, %p156; + and.pred %p174, %p126, %p155; + and.pred %p175, %p127, %p154; + and.pred %p176, %p128, %p153; + and.pred %p177, %p129, %p152; + and.pred %p178, %p130, %p151; + and.pred %p179, %p131, %p150; + and.pred %p180, %p132, %p149; + .loc 1 397 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:397:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + sub.s32 %r2346, %r2249, %r54; + sub.s32 %r2347, %r2248, %r54; + sub.s32 %r2348, %r2249, %r52; + sub.s32 %r2349, %r2248, %r52; + sub.s32 %r2350, %r2247, %r54; + sub.s32 %r2351, %r2246, %r54; + sub.s32 %r2352, %r2247, %r52; + sub.s32 %r2353, %r2246, %r52; + sub.s32 %r2354, %r2245, %r54; + sub.s32 %r2355, %r2244, %r54; + sub.s32 %r2356, %r2245, %r52; + sub.s32 %r2357, %r2244, %r52; + sub.s32 %r2358, %r2243, %r54; + sub.s32 %r2359, %r2242, %r54; + sub.s32 %r2360, %r2243, %r52; + sub.s32 %r2361, %r2242, %r52; + sub.s32 %r2362, %r2237, %r52; + sub.s32 %r2363, %r2236, %r52; + sub.s32 %r2364, %r2237, %r54; + sub.s32 %r2365, %r2236, %r54; + sub.s32 %r2366, %r2235, %r52; + sub.s32 %r2367, %r2234, %r52; + sub.s32 %r2368, %r2235, %r54; + sub.s32 %r2369, %r2234, %r54; + sub.s32 %r2370, %r2241, %r52; + sub.s32 %r2371, %r2240, %r52; + sub.s32 %r2372, %r2241, %r54; + sub.s32 %r2373, %r2240, %r54; + sub.s32 %r2374, %r2239, %r52; + sub.s32 %r2375, %r2238, %r52; + sub.s32 %r2376, %r2239, %r54; + sub.s32 %r2377, %r2238, %r54; + .loc 1 398 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:398:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + rem.s32 %r2378, %r2377, %r762; + rem.s32 %r2379, %r2376, %r762; + rem.s32 %r2380, %r2375, %r762; + rem.s32 %r2381, %r2374, %r762; + rem.s32 %r2382, %r2373, %r762; + rem.s32 %r2383, %r2372, %r762; + rem.s32 %r2384, %r2371, %r762; + rem.s32 %r2385, %r2370, %r762; + rem.s32 %r2386, %r2369, %r762; + rem.s32 %r2387, %r2368, %r762; + rem.s32 %r2388, %r2367, %r762; + rem.s32 %r2389, %r2366, %r762; + rem.s32 %r2390, %r2365, %r762; + rem.s32 %r2391, %r2364, %r762; + rem.s32 %r2392, %r2363, %r762; + rem.s32 %r2393, %r2362, %r762; + rem.s32 %r2394, %r2361, %r762; + rem.s32 %r2395, %r2360, %r762; + rem.s32 %r2396, %r2359, %r762; + rem.s32 %r2397, %r2358, %r762; + rem.s32 %r2398, %r2357, %r762; + rem.s32 %r2399, %r2356, %r762; + rem.s32 %r2400, %r2355, %r762; + rem.s32 %r2401, %r2354, %r762; + rem.s32 %r2402, %r2353, %r762; + rem.s32 %r2403, %r2352, %r762; + rem.s32 %r2404, %r2351, %r762; + rem.s32 %r2405, %r2350, %r762; + rem.s32 %r2406, %r2349, %r762; + rem.s32 %r2407, %r2348, %r762; + rem.s32 %r2408, %r2347, %r762; + rem.s32 %r2409, %r2346, %r762; + .loc 1 399 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:399:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.ne.b32 %p181, %r2409, 0; + setp.ne.b32 %p182, %r2408, 0; + setp.ne.b32 %p183, %r2407, 0; + setp.ne.b32 %p184, %r2406, 0; + setp.ne.b32 %p185, %r2405, 0; + setp.ne.b32 %p186, %r2404, 0; + setp.ne.b32 %p187, %r2403, 0; + setp.ne.b32 %p188, %r2402, 0; + setp.ne.b32 %p189, %r2401, 0; + setp.ne.b32 %p190, %r2400, 0; + setp.ne.b32 %p191, %r2399, 0; + setp.ne.b32 %p192, %r2398, 0; + setp.ne.b32 %p193, %r2397, 0; + setp.ne.b32 %p194, %r2396, 0; + setp.ne.b32 %p195, %r2395, 0; + setp.ne.b32 %p196, %r2394, 0; + setp.ne.b32 %p197, %r2393, 0; + setp.ne.b32 %p198, %r2392, 0; + setp.ne.b32 %p199, %r2391, 0; + setp.ne.b32 %p200, %r2390, 0; + setp.ne.b32 %p201, %r2389, 0; + setp.ne.b32 %p202, %r2388, 0; + setp.ne.b32 %p203, %r2387, 0; + setp.ne.b32 %p204, %r2386, 0; + setp.ne.b32 %p205, %r2385, 0; + setp.ne.b32 %p206, %r2384, 0; + setp.ne.b32 %p207, %r2383, 0; + setp.ne.b32 %p208, %r2382, 0; + setp.ne.b32 %p209, %r2381, 0; + setp.ne.b32 %p210, %r2380, 0; + setp.ne.b32 %p211, %r2379, 0; + setp.ne.b32 %p212, %r2378, 0; + .loc 1 401 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:401:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + xor.b32 %r2410, %r2409, %r762; + xor.b32 %r2411, %r2408, %r762; + xor.b32 %r2412, %r2407, %r762; + xor.b32 %r2413, %r2406, %r762; + xor.b32 %r2414, %r2405, %r762; + xor.b32 %r2415, %r2404, %r762; + xor.b32 %r2416, %r2403, %r762; + xor.b32 %r2417, %r2402, %r762; + xor.b32 %r2418, %r2401, %r762; + xor.b32 %r2419, %r2400, %r762; + xor.b32 %r2420, %r2399, %r762; + xor.b32 %r2421, %r2398, %r762; + xor.b32 %r2422, %r2397, %r762; + xor.b32 %r2423, %r2396, %r762; + xor.b32 %r2424, %r2395, %r762; + xor.b32 %r2425, %r2394, %r762; + xor.b32 %r2426, %r2393, %r762; + xor.b32 %r2427, %r2392, %r762; + xor.b32 %r2428, %r2391, %r762; + xor.b32 %r2429, %r2390, %r762; + xor.b32 %r2430, %r2389, %r762; + xor.b32 %r2431, %r2388, %r762; + xor.b32 %r2432, %r2387, %r762; + xor.b32 %r2433, %r2386, %r762; + xor.b32 %r2434, %r2385, %r762; + xor.b32 %r2435, %r2384, %r762; + xor.b32 %r2436, %r2383, %r762; + xor.b32 %r2437, %r2382, %r762; + xor.b32 %r2438, %r2381, %r762; + xor.b32 %r2439, %r2380, %r762; + xor.b32 %r2440, %r2379, %r762; + xor.b32 %r2441, %r2378, %r762; + .loc 1 404 39 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:404:39 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shr.s32 %r2442, %r2441, 31; + and.b32 %r2443, %r2442, %r762; + selp.b32 %r2444, %r2443, 0, %p212; + shr.s32 %r2445, %r2440, 31; + and.b32 %r2446, %r2445, %r762; + selp.b32 %r2447, %r2446, 0, %p211; + shr.s32 %r2448, %r2439, 31; + and.b32 %r2449, %r2448, %r762; + selp.b32 %r2450, %r2449, 0, %p210; + shr.s32 %r2451, %r2438, 31; + and.b32 %r2452, %r2451, %r762; + selp.b32 %r2453, %r2452, 0, %p209; + shr.s32 %r2454, %r2437, 31; + and.b32 %r2455, %r2454, %r762; + selp.b32 %r2456, %r2455, 0, %p208; + shr.s32 %r2457, %r2436, 31; + and.b32 %r2458, %r2457, %r762; + selp.b32 %r2459, %r2458, 0, %p207; + shr.s32 %r2460, %r2435, 31; + and.b32 %r2461, %r2460, %r762; + selp.b32 %r2462, %r2461, 0, %p206; + shr.s32 %r2463, %r2434, 31; + and.b32 %r2464, %r2463, %r762; + selp.b32 %r2465, %r2464, 0, %p205; + shr.s32 %r2466, %r2433, 31; + and.b32 %r2467, %r2466, %r762; + selp.b32 %r2468, %r2467, 0, %p204; + shr.s32 %r2469, %r2432, 31; + and.b32 %r2470, %r2469, %r762; + selp.b32 %r2471, %r2470, 0, %p203; + shr.s32 %r2472, %r2431, 31; + and.b32 %r2473, %r2472, %r762; + selp.b32 %r2474, %r2473, 0, %p202; + shr.s32 %r2475, %r2430, 31; + and.b32 %r2476, %r2475, %r762; + selp.b32 %r2477, %r2476, 0, %p201; + shr.s32 %r2478, %r2429, 31; + and.b32 %r2479, %r2478, %r762; + selp.b32 %r2480, %r2479, 0, %p200; + shr.s32 %r2481, %r2428, 31; + and.b32 %r2482, %r2481, %r762; + selp.b32 %r2483, %r2482, 0, %p199; + shr.s32 %r2484, %r2427, 31; + and.b32 %r2485, %r2484, %r762; + selp.b32 %r2486, %r2485, 0, %p198; + shr.s32 %r2487, %r2426, 31; + and.b32 %r2488, %r2487, %r762; + selp.b32 %r2489, %r2488, 0, %p197; + shr.s32 %r2490, %r2425, 31; + and.b32 %r2491, %r2490, %r762; + selp.b32 %r2492, %r2491, 0, %p196; + shr.s32 %r2493, %r2424, 31; + and.b32 %r2494, %r2493, %r762; + selp.b32 %r2495, %r2494, 0, %p195; + shr.s32 %r2496, %r2423, 31; + and.b32 %r2497, %r2496, %r762; + selp.b32 %r2498, %r2497, 0, %p194; + shr.s32 %r2499, %r2422, 31; + and.b32 %r2500, %r2499, %r762; + selp.b32 %r2501, %r2500, 0, %p193; + shr.s32 %r2502, %r2421, 31; + and.b32 %r2503, %r2502, %r762; + selp.b32 %r2504, %r2503, 0, %p192; + shr.s32 %r2505, %r2420, 31; + and.b32 %r2506, %r2505, %r762; + selp.b32 %r2507, %r2506, 0, %p191; + shr.s32 %r2508, %r2419, 31; + and.b32 %r2509, %r2508, %r762; + selp.b32 %r2510, %r2509, 0, %p190; + shr.s32 %r2511, %r2418, 31; + and.b32 %r2512, %r2511, %r762; + selp.b32 %r2513, %r2512, 0, %p189; + shr.s32 %r2514, %r2417, 31; + and.b32 %r2515, %r2514, %r762; + selp.b32 %r2516, %r2515, 0, %p188; + shr.s32 %r2517, %r2416, 31; + and.b32 %r2518, %r2517, %r762; + selp.b32 %r2519, %r2518, 0, %p187; + shr.s32 %r2520, %r2415, 31; + and.b32 %r2521, %r2520, %r762; + selp.b32 %r2522, %r2521, 0, %p186; + shr.s32 %r2523, %r2414, 31; + and.b32 %r2524, %r2523, %r762; + selp.b32 %r2525, %r2524, 0, %p185; + shr.s32 %r2526, %r2413, 31; + and.b32 %r2527, %r2526, %r762; + selp.b32 %r2528, %r2527, 0, %p184; + shr.s32 %r2529, %r2412, 31; + and.b32 %r2530, %r2529, %r762; + selp.b32 %r2531, %r2530, 0, %p183; + shr.s32 %r2532, %r2411, 31; + and.b32 %r2533, %r2532, %r762; + selp.b32 %r2534, %r2533, 0, %p182; + shr.s32 %r2535, %r2410, 31; + and.b32 %r2536, %r2535, %r762; + selp.b32 %r2537, %r2536, 0, %p181; + .loc 1 405 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:405:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + neg.s32 %r2538, %r2537; + neg.s32 %r2539, %r2534; + neg.s32 %r2540, %r2531; + neg.s32 %r2541, %r2528; + neg.s32 %r2542, %r2525; + neg.s32 %r2543, %r2522; + neg.s32 %r2544, %r2519; + neg.s32 %r2545, %r2516; + neg.s32 %r2546, %r2513; + neg.s32 %r2547, %r2510; + neg.s32 %r2548, %r2507; + neg.s32 %r2549, %r2504; + neg.s32 %r2550, %r2501; + neg.s32 %r2551, %r2498; + neg.s32 %r2552, %r2495; + neg.s32 %r2553, %r2492; + neg.s32 %r2554, %r2489; + neg.s32 %r2555, %r2486; + neg.s32 %r2556, %r2483; + neg.s32 %r2557, %r2480; + neg.s32 %r2558, %r2477; + neg.s32 %r2559, %r2474; + neg.s32 %r2560, %r2471; + neg.s32 %r2561, %r2468; + neg.s32 %r2562, %r2465; + neg.s32 %r2563, %r2462; + neg.s32 %r2564, %r2459; + neg.s32 %r2565, %r2456; + neg.s32 %r2566, %r2453; + neg.s32 %r2567, %r2450; + neg.s32 %r2568, %r2447; + neg.s32 %r2569, %r2444; + setp.eq.b32 %p213, %r2378, %r2569; + selp.b16 %rs1, 1, 0, %p213; + shl.b16 %rs2, %rs1, 2; + setp.eq.b32 %p214, %r2379, %r2568; + selp.b16 %rs3, -1, 0, %p214; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + setp.eq.b32 %p215, %r2380, %r2567; + selp.b16 %rs6, 1, 0, %p215; + setp.eq.b32 %p216, %r2381, %r2566; + selp.b16 %rs7, -1, 0, %p216; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; + and.b16 %rs12, %rs11, 15; + shl.b16 %rs13, %rs12, 8; + setp.eq.b32 %p217, %r2382, %r2565; + selp.b16 %rs14, 1, 0, %p217; + shl.b16 %rs15, %rs14, 2; + setp.eq.b32 %p218, %r2383, %r2564; + selp.b16 %rs16, -1, 0, %p218; + shl.b16 %rs17, %rs16, 3; + or.b16 %rs18, %rs17, %rs15; + setp.eq.b32 %p219, %r2384, %r2563; + selp.b16 %rs19, 1, 0, %p219; + setp.eq.b32 %p220, %r2385, %r2562; + selp.b16 %rs20, -1, 0, %p220; + shl.b16 %rs21, %rs20, 1; + or.b16 %rs22, %rs19, %rs21; + and.b16 %rs23, %rs22, 3; + or.b16 %rs24, %rs23, %rs18; + shl.b16 %rs25, %rs24, 12; + or.b16 %rs26, %rs25, %rs13; + setp.eq.b32 %p221, %r2386, %r2561; + selp.b16 %rs27, 1, 0, %p221; + shl.b16 %rs28, %rs27, 2; + setp.eq.b32 %p222, %r2387, %r2560; + selp.b16 %rs29, -1, 0, %p222; + shl.b16 %rs30, %rs29, 3; + or.b16 %rs31, %rs30, %rs28; + setp.eq.b32 %p223, %r2388, %r2559; + selp.b16 %rs32, 1, 0, %p223; + setp.eq.b32 %p224, %r2389, %r2558; + selp.b16 %rs33, -1, 0, %p224; + shl.b16 %rs34, %rs33, 1; + or.b16 %rs35, %rs32, %rs34; + and.b16 %rs36, %rs35, 3; + or.b16 %rs37, %rs36, %rs31; + and.b16 %rs38, %rs37, 15; + setp.eq.b32 %p225, %r2390, %r2557; + selp.b16 %rs39, 1, 0, %p225; + shl.b16 %rs40, %rs39, 2; + setp.eq.b32 %p226, %r2391, %r2556; + selp.b16 %rs41, -1, 0, %p226; + shl.b16 %rs42, %rs41, 3; + or.b16 %rs43, %rs42, %rs40; + setp.eq.b32 %p227, %r2392, %r2555; + selp.b16 %rs44, 1, 0, %p227; + setp.eq.b32 %p228, %r2393, %r2554; + selp.b16 %rs45, -1, 0, %p228; + shl.b16 %rs46, %rs45, 1; + or.b16 %rs47, %rs44, %rs46; + and.b16 %rs48, %rs47, 3; + or.b16 %rs49, %rs48, %rs43; + shl.b16 %rs50, %rs49, 4; + or.b16 %rs51, %rs38, %rs50; + and.b16 %rs52, %rs51, 255; + or.b16 %rs53, %rs52, %rs26; + cvt.u32.u16 %r2570, %rs53; + setp.eq.b32 %p229, %r2394, %r2553; + setp.eq.b32 %p230, %r2395, %r2552; + setp.eq.b32 %p231, %r2396, %r2551; + setp.eq.b32 %p232, %r2397, %r2550; + setp.eq.b32 %p233, %r2398, %r2549; + setp.eq.b32 %p234, %r2399, %r2548; + setp.eq.b32 %p235, %r2400, %r2547; + setp.eq.b32 %p236, %r2401, %r2546; + setp.eq.b32 %p237, %r2402, %r2545; + setp.eq.b32 %p238, %r2403, %r2544; + setp.eq.b32 %p239, %r2404, %r2543; + setp.eq.b32 %p240, %r2405, %r2542; + setp.eq.b32 %p241, %r2406, %r2541; + setp.eq.b32 %p242, %r2407, %r2540; + setp.eq.b32 %p243, %r2408, %r2539; + setp.eq.b32 %p244, %r2409, %r2538; + .loc 1 406 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:406:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + and.pred %p245, %p165, %p244; + and.pred %p246, %p166, %p243; + and.pred %p247, %p165, %p242; + and.pred %p248, %p166, %p241; + and.pred %p249, %p167, %p240; + and.pred %p250, %p168, %p239; + and.pred %p251, %p167, %p238; + and.pred %p252, %p168, %p237; + and.pred %p253, %p169, %p236; + and.pred %p254, %p170, %p235; + and.pred %p255, %p169, %p234; + and.pred %p256, %p170, %p233; + and.pred %p257, %p171, %p232; + and.pred %p258, %p172, %p231; + and.pred %p259, %p171, %p230; + and.pred %p260, %p172, %p229; + shr.u32 %r2571, %r2570, 15; + and.b32 %r2572, %r2571, 1; + setp.ne.b32 %p261, %r2572, 0; + and.pred %p262, %p173, %p261; + shr.u32 %r2573, %r2570, 14; + and.b32 %r2574, %r2573, 1; + setp.ne.b32 %p263, %r2574, 0; + and.pred %p264, %p174, %p263; + shr.u32 %r2575, %r2570, 13; + and.b32 %r2576, %r2575, 1; + setp.ne.b32 %p265, %r2576, 0; + and.pred %p266, %p173, %p265; + shr.u32 %r2577, %r2570, 12; + and.b32 %r2578, %r2577, 1; + setp.ne.b32 %p267, %r2578, 0; + and.pred %p268, %p174, %p267; + shr.u32 %r2579, %r2570, 11; + and.b32 %r2580, %r2579, 1; + setp.ne.b32 %p269, %r2580, 0; + and.pred %p270, %p175, %p269; + shr.u32 %r2581, %r2570, 10; + and.b32 %r2582, %r2581, 1; + setp.ne.b32 %p271, %r2582, 0; + and.pred %p272, %p176, %p271; + shr.u32 %r2583, %r2570, 9; + and.b32 %r2584, %r2583, 1; + setp.ne.b32 %p273, %r2584, 0; + and.pred %p274, %p175, %p273; + shr.u32 %r2585, %r2570, 8; + and.b32 %r2586, %r2585, 1; + setp.ne.b32 %p275, %r2586, 0; + and.pred %p276, %p176, %p275; + shr.u32 %r2587, %r2570, 7; + and.b32 %r2588, %r2587, 1; + setp.ne.b32 %p277, %r2588, 0; + and.pred %p278, %p177, %p277; + shr.u32 %r2589, %r2570, 6; + and.b32 %r2590, %r2589, 1; + setp.ne.b32 %p279, %r2590, 0; + and.pred %p280, %p178, %p279; + shr.u32 %r2591, %r2570, 5; + and.b32 %r2592, %r2591, 1; + setp.ne.b32 %p281, %r2592, 0; + and.pred %p282, %p177, %p281; + shr.u32 %r2593, %r2570, 4; + and.b32 %r2594, %r2593, 1; + setp.ne.b32 %p283, %r2594, 0; + and.pred %p284, %p178, %p283; + shr.u32 %r2595, %r2570, 3; + and.b32 %r2596, %r2595, 1; + setp.ne.b32 %p285, %r2596, 0; + and.pred %p286, %p179, %p285; + shr.u32 %r2597, %r2570, 2; + and.b32 %r2598, %r2597, 1; + setp.ne.b32 %p287, %r2598, 0; + and.pred %p288, %p180, %p287; + shr.u32 %r2599, %r2570, 1; + and.b32 %r2600, %r2599, 1; + setp.ne.b32 %p289, %r2600, 0; + and.pred %p290, %p179, %p289; + and.pred %p291, %p180, %p223; + .loc 1 407 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:407:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + or.pred %p292, %p85, %p245; + or.pred %p294, %p86, %p246; + or.pred %p296, %p87, %p247; + or.pred %p298, %p88, %p248; + or.pred %p300, %p89, %p249; + or.pred %p302, %p90, %p250; + or.pred %p304, %p91, %p251; + or.pred %p306, %p92, %p252; + or.pred %p308, %p93, %p253; + or.pred %p310, %p94, %p254; + or.pred %p312, %p95, %p255; + or.pred %p314, %p96, %p256; + or.pred %p316, %p97, %p257; + or.pred %p318, %p98, %p258; + or.pred %p320, %p99, %p259; + or.pred %p322, %p100, %p260; + or.pred %p324, %p101, %p262; + or.pred %p326, %p102, %p264; + or.pred %p328, %p103, %p266; + or.pred %p330, %p104, %p268; + or.pred %p332, %p105, %p270; + or.pred %p334, %p106, %p272; + or.pred %p336, %p107, %p274; + or.pred %p338, %p108, %p276; + or.pred %p340, %p109, %p278; + or.pred %p342, %p110, %p280; + or.pred %p344, %p111, %p282; + or.pred %p346, %p112, %p284; + or.pred %p348, %p113, %p286; + or.pred %p350, %p114, %p288; + or.pred %p352, %p115, %p290; + or.pred %p354, %p116, %p291; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2601, %r2202, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2602, %r2601, 0fFF800000, %p292; + selp.f32 %r2603, %r2602, 0fFF800000, %p37; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2604, %r2203, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2605, %r2604, 0fFF800000, %p294; + selp.f32 %r2606, %r2605, 0fFF800000, %p38; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2607, %r2204, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2608, %r2607, 0fFF800000, %p296; + selp.f32 %r2609, %r2608, 0fFF800000, %p37; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2610, %r2205, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2611, %r2610, 0fFF800000, %p298; + selp.f32 %r2612, %r2611, 0fFF800000, %p38; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2613, %r2206, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2614, %r2613, 0fFF800000, %p300; + selp.f32 %r2615, %r2614, 0fFF800000, %p39; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2616, %r2207, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2617, %r2616, 0fFF800000, %p302; + selp.f32 %r2618, %r2617, 0fFF800000, %p40; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2619, %r2208, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2620, %r2619, 0fFF800000, %p304; + selp.f32 %r2621, %r2620, 0fFF800000, %p39; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2622, %r2209, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2623, %r2622, 0fFF800000, %p306; + selp.f32 %r2624, %r2623, 0fFF800000, %p40; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2625, %r2210, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2626, %r2625, 0fFF800000, %p308; + selp.f32 %r2627, %r2626, 0fFF800000, %p41; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2628, %r2211, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2629, %r2628, 0fFF800000, %p310; + selp.f32 %r2630, %r2629, 0fFF800000, %p42; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2631, %r2212, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2632, %r2631, 0fFF800000, %p312; + selp.f32 %r2633, %r2632, 0fFF800000, %p41; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2634, %r2213, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2635, %r2634, 0fFF800000, %p314; + selp.f32 %r2636, %r2635, 0fFF800000, %p42; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2637, %r2214, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2638, %r2637, 0fFF800000, %p316; + selp.f32 %r2639, %r2638, 0fFF800000, %p43; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2640, %r2215, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2641, %r2640, 0fFF800000, %p318; + selp.f32 %r2642, %r2641, 0fFF800000, %p44; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2643, %r2216, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2644, %r2643, 0fFF800000, %p320; + selp.f32 %r2645, %r2644, 0fFF800000, %p43; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2646, %r2217, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2647, %r2646, 0fFF800000, %p322; + selp.f32 %r2648, %r2647, 0fFF800000, %p44; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2649, %r2218, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2650, %r2649, 0fFF800000, %p324; + selp.f32 %r2651, %r2650, 0fFF800000, %p45; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2652, %r2219, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2653, %r2652, 0fFF800000, %p326; + selp.f32 %r2654, %r2653, 0fFF800000, %p46; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2655, %r2220, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2656, %r2655, 0fFF800000, %p328; + selp.f32 %r2657, %r2656, 0fFF800000, %p45; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2658, %r2221, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2659, %r2658, 0fFF800000, %p330; + selp.f32 %r2660, %r2659, 0fFF800000, %p46; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2661, %r2222, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2662, %r2661, 0fFF800000, %p332; + selp.f32 %r2663, %r2662, 0fFF800000, %p47; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2664, %r2223, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2665, %r2664, 0fFF800000, %p334; + selp.f32 %r2666, %r2665, 0fFF800000, %p48; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2667, %r2224, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2668, %r2667, 0fFF800000, %p336; + selp.f32 %r2669, %r2668, 0fFF800000, %p47; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2670, %r2225, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2671, %r2670, 0fFF800000, %p338; + selp.f32 %r2672, %r2671, 0fFF800000, %p48; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2673, %r2226, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2674, %r2673, 0fFF800000, %p340; + selp.f32 %r2675, %r2674, 0fFF800000, %p49; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2676, %r2227, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2677, %r2676, 0fFF800000, %p342; + selp.f32 %r2678, %r2677, 0fFF800000, %p50; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2679, %r2228, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2680, %r2679, 0fFF800000, %p344; + selp.f32 %r2681, %r2680, 0fFF800000, %p49; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2682, %r2229, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2683, %r2682, 0fFF800000, %p346; + selp.f32 %r2684, %r2683, 0fFF800000, %p50; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2685, %r2230, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2686, %r2685, 0fFF800000, %p348; + selp.f32 %r2687, %r2686, 0fFF800000, %p51; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2688, %r2231, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2689, %r2688, 0fFF800000, %p350; + selp.f32 %r2690, %r2689, 0fFF800000, %p52; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2691, %r2232, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2692, %r2691, 0fFF800000, %p352; + selp.f32 %r2693, %r2692, 0fFF800000, %p51; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r2694, %r2233, 0f3FB8AA3B; + .loc 1 414 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:414:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2695, %r2694, 0fFF800000, %p354; + selp.f32 %r2696, %r2695, 0fFF800000, %p52; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + max.f32 %r2697, %r2603, %r2606; + max.f32 %r2698, %r2609, %r2612; + max.f32 %r2699, %r2697, %r2615; + max.f32 %r2700, %r2699, %r2618; + max.f32 %r2701, %r2698, %r2621; + max.f32 %r2702, %r2701, %r2624; + max.f32 %r2703, %r2700, %r2627; + max.f32 %r2704, %r2703, %r2630; + max.f32 %r2705, %r2702, %r2633; + max.f32 %r2706, %r2705, %r2636; + max.f32 %r2707, %r2704, %r2639; + max.f32 %r2708, %r2707, %r2642; + max.f32 %r2709, %r2706, %r2645; + max.f32 %r2710, %r2709, %r2648; + max.f32 %r2711, %r2708, %r2651; + max.f32 %r2712, %r2711, %r2654; + max.f32 %r2713, %r2710, %r2657; + max.f32 %r2714, %r2713, %r2660; + max.f32 %r2715, %r2712, %r2663; + max.f32 %r2716, %r2715, %r2666; + max.f32 %r2717, %r2714, %r2669; + max.f32 %r2718, %r2717, %r2672; + max.f32 %r2719, %r2716, %r2675; + max.f32 %r2720, %r2719, %r2678; + max.f32 %r2721, %r2718, %r2681; + max.f32 %r2722, %r2721, %r2684; + max.f32 %r2723, %r2720, %r2687; + max.f32 %r2724, %r2723, %r2690; + max.f32 %r2725, %r2722, %r2693; + max.f32 %r2726, %r2725, %r2696; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2727, %r2724, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + max.f32 %r2728, %r2724, %r2727; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2729, %r2728, 1, 31, -1; + shfl.sync.bfly.b32 %r2730, %r2726, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + max.f32 %r2731, %r2726, %r2730; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2732, %r2731, 1, 31, -1; + cvt.u64.u32 %rd160, %r2729; + cvt.u64.u32 %rd161, %r2732; + shl.b64 %rd162, %rd161, 32; + or.b64 %rd163, %rd160, %rd162; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mov.b64 {%r2733, %r2734}, %rd163; + max.f32 %r2735, %r2728, %r2733; + max.f32 %r2736, %r2731, %r2734; + .loc 1 421 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:421:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mov.b64 {%r2737, %r2738}, %rd292; + max.f32 %r2739, %r2738, %r2736; + max.f32 %r2740, %r2737, %r2735; + mov.b64 %rd292, {%r2740, %r2739}; + .loc 1 423 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:423:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.eq.f32 %p356, %r2740, 0fFF800000; + setp.eq.f32 %p357, %r2739, 0fFF800000; + .loc 1 424 51 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:424:51 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + selp.f32 %r2741, 0f00000000, %r2740, %p356; + selp.f32 %r2742, 0f00000000, %r2739, %p357; + .loc 1 428 31 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:428:31 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + sub.f32 %r2743, %r2737, %r2741; + sub.f32 %r2744, %r2738, %r2742; + .loc 1 428 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:428:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + ex2.approx.ftz.f32 %r2745, %r2743; + ex2.approx.ftz.f32 %r2746, %r2744; + .loc 1 429 39 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:429:39 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + sub.f32 %r2747, %r2603, %r2741; + sub.f32 %r2748, %r2606, %r2741; + sub.f32 %r2749, %r2609, %r2742; + sub.f32 %r2750, %r2612, %r2742; + sub.f32 %r2751, %r2615, %r2741; + sub.f32 %r2752, %r2618, %r2741; + sub.f32 %r2753, %r2621, %r2742; + sub.f32 %r2754, %r2624, %r2742; + sub.f32 %r2755, %r2627, %r2741; + sub.f32 %r2756, %r2630, %r2741; + sub.f32 %r2757, %r2633, %r2742; + sub.f32 %r2758, %r2636, %r2742; + sub.f32 %r2759, %r2639, %r2741; + sub.f32 %r2760, %r2642, %r2741; + sub.f32 %r2761, %r2645, %r2742; + sub.f32 %r2762, %r2648, %r2742; + sub.f32 %r2763, %r2651, %r2741; + sub.f32 %r2764, %r2654, %r2741; + sub.f32 %r2765, %r2657, %r2742; + sub.f32 %r2766, %r2660, %r2742; + sub.f32 %r2767, %r2663, %r2741; + sub.f32 %r2768, %r2666, %r2741; + sub.f32 %r2769, %r2669, %r2742; + sub.f32 %r2770, %r2672, %r2742; + sub.f32 %r2771, %r2675, %r2741; + sub.f32 %r2772, %r2678, %r2741; + sub.f32 %r2773, %r2681, %r2742; + sub.f32 %r2774, %r2684, %r2742; + sub.f32 %r2775, %r2687, %r2741; + sub.f32 %r2776, %r2690, %r2741; + sub.f32 %r2777, %r2693, %r2742; + sub.f32 %r2778, %r2696, %r2742; + .loc 1 429 21 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:429:21 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + ex2.approx.ftz.f32 %r2779, %r2747; + ex2.approx.ftz.f32 %r2780, %r2748; + ex2.approx.ftz.f32 %r2781, %r2749; + ex2.approx.ftz.f32 %r2782, %r2750; + ex2.approx.ftz.f32 %r2783, %r2751; + ex2.approx.ftz.f32 %r2784, %r2752; + ex2.approx.ftz.f32 %r2785, %r2753; + ex2.approx.ftz.f32 %r2786, %r2754; + ex2.approx.ftz.f32 %r2787, %r2755; + ex2.approx.ftz.f32 %r2788, %r2756; + ex2.approx.ftz.f32 %r2789, %r2757; + ex2.approx.ftz.f32 %r2790, %r2758; + ex2.approx.ftz.f32 %r2791, %r2759; + ex2.approx.ftz.f32 %r2792, %r2760; + ex2.approx.ftz.f32 %r2793, %r2761; + ex2.approx.ftz.f32 %r2794, %r2762; + ex2.approx.ftz.f32 %r2795, %r2763; + ex2.approx.ftz.f32 %r2796, %r2764; + ex2.approx.ftz.f32 %r2797, %r2765; + ex2.approx.ftz.f32 %r2798, %r2766; + ex2.approx.ftz.f32 %r2799, %r2767; + ex2.approx.ftz.f32 %r2800, %r2768; + ex2.approx.ftz.f32 %r2801, %r2769; + ex2.approx.ftz.f32 %r2802, %r2770; + ex2.approx.ftz.f32 %r2803, %r2771; + ex2.approx.ftz.f32 %r2804, %r2772; + ex2.approx.ftz.f32 %r2805, %r2773; + ex2.approx.ftz.f32 %r2806, %r2774; + ex2.approx.ftz.f32 %r2807, %r2775; + ex2.approx.ftz.f32 %r2808, %r2776; + ex2.approx.ftz.f32 %r2809, %r2777; + ex2.approx.ftz.f32 %r2810, %r2778; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.f32 %r2811, %r2779, %r2780; + add.f32 %r2812, %r2781, %r2782; + add.f32 %r2813, %r2811, %r2783; + add.f32 %r2814, %r2813, %r2784; + add.f32 %r2815, %r2812, %r2785; + add.f32 %r2816, %r2815, %r2786; + add.f32 %r2817, %r2814, %r2787; + add.f32 %r2818, %r2817, %r2788; + add.f32 %r2819, %r2816, %r2789; + add.f32 %r2820, %r2819, %r2790; + add.f32 %r2821, %r2818, %r2791; + add.f32 %r2822, %r2821, %r2792; + add.f32 %r2823, %r2820, %r2793; + add.f32 %r2824, %r2823, %r2794; + add.f32 %r2825, %r2822, %r2795; + add.f32 %r2826, %r2825, %r2796; + add.f32 %r2827, %r2824, %r2797; + add.f32 %r2828, %r2827, %r2798; + add.f32 %r2829, %r2826, %r2799; + add.f32 %r2830, %r2829, %r2800; + add.f32 %r2831, %r2828, %r2801; + add.f32 %r2832, %r2831, %r2802; + add.f32 %r2833, %r2830, %r2803; + add.f32 %r2834, %r2833, %r2804; + add.f32 %r2835, %r2832, %r2805; + add.f32 %r2836, %r2835, %r2806; + add.f32 %r2837, %r2834, %r2807; + add.f32 %r2838, %r2837, %r2808; + add.f32 %r2839, %r2836, %r2809; + add.f32 %r2840, %r2839, %r2810; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2841, %r2838, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.f32 %r2842, %r2838, %r2841; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2843, %r2842, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.f32 %r2844, %r2842, %r2843; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2845, %r2840, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.f32 %r2846, %r2840, %r2845; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shfl.sync.bfly.b32 %r2847, %r2846, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.f32 %r2848, %r2846, %r2847; + .loc 1 434 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:434:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + fma.rn.f32 %r5354, %r5354, %r2745, %r2844; + fma.rn.f32 %r5355, %r5355, %r2746, %r2848; + .loc 1 436 16 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:436:16 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.f32 %r1749, %r1749, %r2745; + mul.f32 %r1750, %r1750, %r2745; + mul.f32 %r1751, %r1751, %r2746; + mul.f32 %r1752, %r1752, %r2746; + mul.f32 %r1753, %r1753, %r2745; + mul.f32 %r1754, %r1754, %r2745; + mul.f32 %r1755, %r1755, %r2746; + mul.f32 %r1756, %r1756, %r2746; + mul.f32 %r1757, %r1757, %r2745; + mul.f32 %r1758, %r1758, %r2745; + mul.f32 %r1759, %r1759, %r2746; + mul.f32 %r1760, %r1760, %r2746; + mul.f32 %r1761, %r1761, %r2745; + mul.f32 %r1762, %r1762, %r2745; + mul.f32 %r1763, %r1763, %r2746; + mul.f32 %r1764, %r1764, %r2746; + mul.f32 %r1765, %r1765, %r2745; + mul.f32 %r1766, %r1766, %r2745; + mul.f32 %r1767, %r1767, %r2746; + mul.f32 %r1768, %r1768, %r2746; + mul.f32 %r1769, %r1769, %r2745; + mul.f32 %r1770, %r1770, %r2745; + mul.f32 %r1771, %r1771, %r2746; + mul.f32 %r1772, %r1772, %r2746; + mul.f32 %r1773, %r1773, %r2745; + mul.f32 %r1774, %r1774, %r2745; + mul.f32 %r1775, %r1775, %r2746; + mul.f32 %r1776, %r1776, %r2746; + mul.f32 %r1777, %r1777, %r2745; + mul.f32 %r1778, %r1778, %r2745; + mul.f32 %r1779, %r1779, %r2746; + mul.f32 %r1780, %r1780, %r2746; + mul.f32 %r1781, %r1781, %r2745; + mul.f32 %r1782, %r1782, %r2745; + mul.f32 %r1783, %r1783, %r2746; + mul.f32 %r1784, %r1784, %r2746; + mul.f32 %r1785, %r1785, %r2745; + mul.f32 %r1786, %r1786, %r2745; + mul.f32 %r1787, %r1787, %r2746; + mul.f32 %r1788, %r1788, %r2746; + mul.f32 %r1789, %r1789, %r2745; + mul.f32 %r1790, %r1790, %r2745; + mul.f32 %r1791, %r1791, %r2746; + mul.f32 %r1792, %r1792, %r2746; + mul.f32 %r1793, %r1793, %r2745; + mul.f32 %r1794, %r1794, %r2745; + mul.f32 %r1795, %r1795, %r2746; + mul.f32 %r1796, %r1796, %r2746; + mul.f32 %r1797, %r1797, %r2745; + mul.f32 %r1798, %r1798, %r2745; + mul.f32 %r1799, %r1799, %r2746; + mul.f32 %r1800, %r1800, %r2746; + mul.f32 %r1801, %r1801, %r2745; + mul.f32 %r1802, %r1802, %r2745; + mul.f32 %r1803, %r1803, %r2746; + mul.f32 %r1804, %r1804, %r2746; + mul.f32 %r1805, %r1805, %r2745; + mul.f32 %r1806, %r1806, %r2745; + mul.f32 %r1807, %r1807, %r2746; + mul.f32 %r1808, %r1808, %r2746; + mul.f32 %r1809, %r1809, %r2745; + mul.f32 %r1810, %r1810, %r2745; + mul.f32 %r1811, %r1811, %r2746; + mul.f32 %r1812, %r1812, %r2746; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2849, %r889, 49152; + add.s32 %r2850, %r2849, %r2166; + .loc 1 440 22 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:440:22 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + cvt.rn.bf16x2.f32 %r1745, %r2780, %r2779; + cvt.rn.bf16x2.f32 %r1746, %r2782, %r2781; + cvt.rn.bf16x2.f32 %r1747, %r2784, %r2783; + cvt.rn.bf16x2.f32 %r1748, %r2786, %r2785; + cvt.rn.bf16x2.f32 %r1877, %r2788, %r2787; + cvt.rn.bf16x2.f32 %r1878, %r2790, %r2789; + cvt.rn.bf16x2.f32 %r1879, %r2792, %r2791; + cvt.rn.bf16x2.f32 %r1880, %r2794, %r2793; + cvt.rn.bf16x2.f32 %r2009, %r2796, %r2795; + cvt.rn.bf16x2.f32 %r2010, %r2798, %r2797; + cvt.rn.bf16x2.f32 %r2011, %r2800, %r2799; + cvt.rn.bf16x2.f32 %r2012, %r2802, %r2801; + cvt.rn.bf16x2.f32 %r2141, %r2804, %r2803; + cvt.rn.bf16x2.f32 %r2142, %r2806, %r2805; + cvt.rn.bf16x2.f32 %r2143, %r2808, %r2807; + cvt.rn.bf16x2.f32 %r2144, %r2810, %r2809; + .loc 1 440 44 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:440:44 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + wgmma.fence.sync.aligned; + bfe.u32 %r2851, %r2850, 4, 14; + cvt.u64.u32 %rd164, %r2851; + or.b64 %rd110, %rd164, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r1745,%r1746,%r1747,%r1748}, %rd110, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2852, %r2850, 2048; + bfe.u32 %r2853, %r2852, 4, 14; + cvt.u64.u32 %rd165, %r2853; + or.b64 %rd111, %rd165, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r1877,%r1878,%r1879,%r1880}, %rd111, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2854, %r2850, 4096; + bfe.u32 %r2855, %r2854, 4, 14; + cvt.u64.u32 %rd166, %r2855; + or.b64 %rd112, %rd166, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r2009,%r2010,%r2011,%r2012}, %rd112, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2856, %r2850, 6144; + bfe.u32 %r2857, %r2856, 4, 14; + cvt.u64.u32 %rd167, %r2857; + or.b64 %rd113, %rd167, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r2141,%r2142,%r2143,%r2144}, %rd113, %p20, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:548:26 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r5121, %r5050, %r5121; + add.s32 %r5122, %r5050, %r5122; + add.s32 %r5123, %r5050, %r5123; + add.s32 %r5124, %r5050, %r5124; + add.s32 %r5125, %r5050, %r5125; + add.s32 %r5126, %r5050, %r5126; + add.s32 %r5127, %r5050, %r5127; + add.s32 %r5128, %r5050, %r5128; + add.s32 %r5129, %r5050, %r5129; + add.s32 %r5130, %r5050, %r5130; + add.s32 %r5131, %r5050, %r5131; + add.s32 %r5132, %r5050, %r5132; + add.s32 %r5133, %r5050, %r5133; + add.s32 %r5134, %r5050, %r5134; + add.s32 %r5135, %r5050, %r5135; + add.s32 %r5136, %r5050, %r5136; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r354, %r5120, 1; + .loc 1 247 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:247:33 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shr.u32 %r2858, %r354, 1; + .loc 1 248 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:248:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mad.wide.u32 %rd115, %r2858, 4, %rd47; + .loc 1 248 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:248:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + // begin inline asm + mov.u64 %rd114, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd114, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2145, 0x0; + @%p31 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2145 }, [ %rd115 + 0 ], %rd114; + // end inline asm + .loc 1 249 109 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:109 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2859, %r2858, 1; + .loc 1 249 113 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:113 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p358, %r2859, %r828; + .loc 1 249 55 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:55 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd118, %rd115, 4; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + and.pred %p32, %p31, %p358; + .loc 1 249 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + // begin inline asm + mov.u64 %rd117, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd117, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2146, 0x0; + @%p32 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2146 }, [ %rd118 + 0 ], %rd117; + // end inline asm + .loc 1 250 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:250:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + and.b32 %r2860, %r5120, 1; + .loc 1 251 34 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:34 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + sub.s32 %r2861, %r2146, %r2145; + .loc 1 251 48 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:48 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r2862, %r2861, 7; + .loc 1 251 63 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:63 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2863, %r2862, -64; + .loc 1 252 29 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:29 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + xor.b32 %r2864, %r2860, 1; + .loc 1 252 61 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:61 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r2865, %r2860, 6; + .loc 1 252 42 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:42 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mad.lo.s32 %r5050, %r2863, %r2864, %r2865; + .loc 1 549 21 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:549:21 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r5053, %r5050, %r5053; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2866, %r5052, 1; + setp.gt.s32 %p359, %r2866, 2; + selp.b32 %r5052, 0, %r2866, %p359; + .loc 1 342 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:342:32 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2867, %r5053, %r31; + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2868, %r2867, %r8; + add.s32 %r2869, %r2867, %r9; + add.s32 %r2870, %r2867, %r10; + add.s32 %r2871, %r2867, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r2872, %r2868, 7; + shl.b32 %r2873, %r2869, 7; + shl.b32 %r2874, %r2870, 7; + shl.b32 %r2875, %r2871, 7; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + mul.wide.s32 %rd168, %r2872, 2; + add.s64 %rd120, %rd10, %rd168; + mul.wide.s32 %rd169, %r2873, 2; + add.s64 %rd121, %rd10, %rd169; + mul.wide.s32 %rd170, %r2874, 2; + add.s64 %rd122, %rd10, %rd170; + mul.wide.s32 %rd171, %r2875, 2; + add.s64 %rd123, %rd10, %rd171; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.lt.s32 %p360, %r2868, %r761; + setp.lt.s32 %p361, %r2869, %r761; + setp.lt.s32 %p362, %r2870, %r761; + setp.lt.s32 %p363, %r2871, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + shl.b32 %r2876, %r5052, 14; + add.s32 %r2877, %r889, %r2876; + bar.sync 0; + add.s32 %r2147, %r2877, %r84; + selp.b32 %r2878, 16, 0, %p360; + selp.b32 %r2156, %r2878, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2147 + 0 ], [ %rd120 + 0 ], 0x10, %r2156; + // end inline asm + add.s32 %r2149, %r2147, 2048; + selp.b32 %r2879, 16, 0, %p361; + selp.b32 %r2158, %r2879, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2149 + 0 ], [ %rd121 + 0 ], 0x10, %r2158; + // end inline asm + add.s32 %r2151, %r2147, 4096; + selp.b32 %r2880, 16, 0, %p362; + selp.b32 %r2160, %r2880, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2151 + 0 ], [ %rd122 + 0 ], 0x10, %r2160; + // end inline asm + add.s32 %r2153, %r2147, 6144; + selp.b32 %r2881, 16, 0, %p363; + selp.b32 %r2162, %r2881, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2153 + 0 ], [ %rd123 + 0 ], 0x10, %r2162; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s64 %rd124, %rd11, %rd168; + add.s64 %rd125, %rd11, %rd169; + add.s64 %rd126, %rd11, %rd170; + add.s64 %rd127, %rd11, %rd171; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + add.s32 %r2882, %r2849, %r2876; + add.s32 %r2155, %r2882, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r2155 + 0 ], [ %rd124 + 0 ], 0x10, %r2156; + // end inline asm + add.s32 %r2157, %r2155, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r2157 + 0 ], [ %rd125 + 0 ], 0x10, %r2158; + // end inline asm + add.s32 %r2159, %r2155, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r2159 + 0 ], [ %rd126 + 0 ], 0x10, %r2160; + // end inline asm + add.s32 %r2161, %r2155, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r2161 + 0 ], [ %rd127 + 0 ], 0x10, %r2162; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + setp.ne.b32 %p364, %r119, %r354; + mov.b32 %r5120, %r354; + @%p364 bra $L__BB0_2; +$L__BB0_3: // %._crit_edge + .loc 1 0 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:40 + ld.param.b64 %rd38, [triton_tem_fused_0_param_10]; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:172:41 ] + // begin inline asm + // wait for regs: %r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp8: + .loc 1 181 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:181:35 + shl.b64 %rd190, %rd4, 2; + add.s64 %rd172, %rd37, %rd190; + .loc 1 182 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:182:27 + // begin inline asm + mov.u32 %r3011, 0x0; + ld.global.b32 { %r3011 }, [ %rd172 + 0 ]; + // end inline asm + .loc 1 182 41 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:182:41 + shl.b32 %r424, %r3011, 7; + .loc 1 183 51 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:183:51 + shl.b64 %rd191, %rd6, 2; + add.s64 %rd173, %rd36, %rd191; + .loc 1 183 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:183:32 + // begin inline asm + mov.u32 %r3012, 0x0; + ld.global.b32 { %r3012 }, [ %rd173 + 0 ]; + // end inline asm + .loc 1 184 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:184:49 + shl.b32 %r3045, %r3012, 1; + .loc 1 184 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:184:69 + min.s32 %r426, %r3045, %r33; +$L__tmp9: + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p365, %r3045, 1; + setp.gt.s32 %p366, %r3045, 0; + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + or.b32 %r3046, %r424, %r8; + or.b32 %r3047, %r424, %r9; + or.b32 %r3048, %r424, %r10; + or.b32 %r3049, %r424, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r3050, %r3046, 7; + shl.b32 %r3051, %r3047, 7; + shl.b32 %r3052, %r3048, 7; + shl.b32 %r3053, %r3049, 7; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.wide.s32 %rd192, %r3050, 2; + add.s64 %rd193, %rd1, %rd192; + mul.wide.s32 %rd194, %r3051, 2; + add.s64 %rd195, %rd1, %rd194; + mul.wide.s32 %rd196, %r3052, 2; + add.s64 %rd197, %rd1, %rd196; + mul.wide.s32 %rd198, %r3053, 2; + add.s64 %rd199, %rd1, %rd198; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b64 %rd200, %rd3, 1; + add.s64 %rd174, %rd193, %rd200; + add.s64 %rd175, %rd195, %rd200; + add.s64 %rd176, %rd197, %rd200; + add.s64 %rd177, %rd199, %rd200; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p367, %r3046, %r761; + setp.lt.s32 %p368, %r3047, %r761; + setp.lt.s32 %p369, %r3048, %r761; + setp.lt.s32 %p370, %r3049, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.b32 %r3054, 16, 0, %p367; + selp.b32 %r3022, %r3054, 0, %p366; + // begin inline asm + cp.async.cg.shared.global [ %r3013 + 0 ], [ %rd174 + 0 ], 0x10, %r3022; + // end inline asm + selp.b32 %r3055, 16, 0, %p368; + selp.b32 %r3024, %r3055, 0, %p366; + // begin inline asm + cp.async.cg.shared.global [ %r3015 + 0 ], [ %rd175 + 0 ], 0x10, %r3024; + // end inline asm + selp.b32 %r3056, 16, 0, %p369; + selp.b32 %r3026, %r3056, 0, %p366; + // begin inline asm + cp.async.cg.shared.global [ %r3017 + 0 ], [ %rd176 + 0 ], 0x10, %r3026; + // end inline asm + selp.b32 %r3057, 16, 0, %p370; + selp.b32 %r3028, %r3057, 0, %p366; + // begin inline asm + cp.async.cg.shared.global [ %r3019 + 0 ], [ %rd177 + 0 ], 0x10, %r3028; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd201, %rd2, %rd192; + add.s64 %rd202, %rd2, %rd194; + add.s64 %rd203, %rd2, %rd196; + add.s64 %rd204, %rd2, %rd198; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd178, %rd201, %rd200; + add.s64 %rd179, %rd202, %rd200; + add.s64 %rd180, %rd203, %rd200; + add.s64 %rd181, %rd204, %rd200; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r837 + 0 ], [ %rd178 + 0 ], 0x10, %r3022; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r839 + 0 ], [ %rd179 + 0 ], 0x10, %r3024; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r841 + 0 ], [ %rd180 + 0 ], 0x10, %r3026; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r843 + 0 ], [ %rd181 + 0 ], 0x10, %r3028; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.gt.s32 %p371, %r426, 1; + .loc 1 342 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:342:32 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + or.b32 %r3058, %r424, 64; + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + or.b32 %r3059, %r3058, %r8; + or.b32 %r3060, %r3058, %r9; + or.b32 %r3061, %r3058, %r10; + or.b32 %r3062, %r3058, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r3063, %r3059, 7; + shl.b32 %r3064, %r3060, 7; + shl.b32 %r3065, %r3061, 7; + shl.b32 %r3066, %r3062, 7; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.wide.s32 %rd205, %r3063, 2; + add.s64 %rd206, %rd1, %rd205; + mul.wide.s32 %rd207, %r3064, 2; + add.s64 %rd208, %rd1, %rd207; + mul.wide.s32 %rd209, %r3065, 2; + add.s64 %rd210, %rd1, %rd209; + mul.wide.s32 %rd211, %r3066, 2; + add.s64 %rd212, %rd1, %rd211; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd182, %rd206, %rd200; + add.s64 %rd183, %rd208, %rd200; + add.s64 %rd184, %rd210, %rd200; + add.s64 %rd185, %rd212, %rd200; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p372, %r3059, %r761; + setp.lt.s32 %p373, %r3060, %r761; + setp.lt.s32 %p374, %r3061, %r761; + setp.lt.s32 %p375, %r3062, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + bar.sync 0; + selp.b32 %r3067, 16, 0, %p372; + selp.b32 %r3038, %r3067, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r845 + 0 ], [ %rd182 + 0 ], 0x10, %r3038; + // end inline asm + selp.b32 %r3068, 16, 0, %p373; + selp.b32 %r3040, %r3068, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r847 + 0 ], [ %rd183 + 0 ], 0x10, %r3040; + // end inline asm + selp.b32 %r3069, 16, 0, %p374; + selp.b32 %r3042, %r3069, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r849 + 0 ], [ %rd184 + 0 ], 0x10, %r3042; + // end inline asm + selp.b32 %r3070, 16, 0, %p375; + selp.b32 %r3044, %r3070, 0, %p371; + // begin inline asm + cp.async.cg.shared.global [ %r851 + 0 ], [ %rd185 + 0 ], 0x10, %r3044; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:20 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd213, %rd2, %rd205; + add.s64 %rd214, %rd2, %rd207; + add.s64 %rd215, %rd2, %rd209; + add.s64 %rd216, %rd2, %rd211; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd186, %rd213, %rd200; + add.s64 %rd187, %rd214, %rd200; + add.s64 %rd188, %rd215, %rd200; + add.s64 %rd189, %rd216, %rd200; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r853 + 0 ], [ %rd186 + 0 ], 0x10, %r3038; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r855 + 0 ], [ %rd187 + 0 ], 0x10, %r3040; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r857 + 0 ], [ %rd188 + 0 ], 0x10, %r3042; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r859 + 0 ], [ %rd189 + 0 ], 0x10, %r3044; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:351:19 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + @%p365 bra $L__BB0_6; +$L__tmp10: +// %bb.4: // %.lr.ph460 + .loc 1 186 28 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:186:28 + or.b32 %r5289, %r424, %r36; + or.b32 %r5288, %r424, %r37; + or.b32 %r5286, %r424, %r38; + or.b32 %r5282, %r424, %r40; + or.b32 %r5274, %r424, %r44; + or.b32 %r5287, %r424, %r39; + or.b32 %r5283, %r424, %r41; + or.b32 %r5275, %r424, %r45; + or.b32 %r5284, %r424, %r42; + or.b32 %r5276, %r424, %r46; + or.b32 %r5285, %r424, %r43; + or.b32 %r5277, %r424, %r47; + or.b32 %r5278, %r424, %r48; + or.b32 %r5279, %r424, %r49; + or.b32 %r5280, %r424, %r50; + or.b32 %r5281, %r424, %r51; + add.s32 %r507, %r426, -2; + add.s32 %r508, %r426, -1; +$L__tmp11: + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + max.s32 %r509, %r426, 1; + mov.b32 %r3690, 0; + mov.b32 %r5205, 1; + mov.b32 %r5204, -1; + mov.b32 %r5203, 64; + mov.b32 %r5206, %r5203; + mov.b32 %r5271, %r3690; +$L__BB0_5: // %__nv_exp2f.exit + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p389, %r5271, %r507; + setp.lt.s32 %p387, %r5271, %r508; + add.s32 %r4305, %r5204, 1; + setp.gt.s32 %p390, %r4305, 2; + selp.b32 %r5204, 0, %r4305, %p390; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4306, %r5204, 14; + add.s32 %r3590, %r889, %r4306; + .loc 1 351 19 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:351:19 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.idx.b32 %r4308, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4309, %r4308, 11; + and.b32 %r4310, %r4309, 8192; + add.s32 %r3587, %r889, 98304; + add.s32 %r4311, %r4310, %r3587; + bfe.u32 %r4312, %r4311, 4, 14; + cvt.u64.u32 %rd251, %r4312; + or.b64 %rd217, %rd251, 4611686293372403712; + bfe.u32 %r4313, %r3590, 4, 14; + cvt.u64.u32 %rd252, %r4313; + or.b64 %rd218, %rd252, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd217, %rd218, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4314, %r4311, 32; + bfe.u32 %r4315, %r4314, 4, 14; + cvt.u64.u32 %rd253, %r4315; + or.b64 %rd219, %rd253, 4611686293372403712; + add.s32 %r4316, %r3590, 32; + bfe.u32 %r4317, %r4316, 4, 14; + cvt.u64.u32 %rd254, %r4317; + or.b64 %rd220, %rd254, 4611686293338849280; + mov.pred %p376, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd219, %rd220, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4318, %r4311, 64; + bfe.u32 %r4319, %r4318, 4, 14; + cvt.u64.u32 %rd255, %r4319; + or.b64 %rd221, %rd255, 4611686293372403712; + add.s32 %r4320, %r3590, 64; + bfe.u32 %r4321, %r4320, 4, 14; + cvt.u64.u32 %rd256, %r4321; + or.b64 %rd222, %rd256, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd221, %rd222, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4322, %r4311, 96; + bfe.u32 %r4323, %r4322, 4, 14; + cvt.u64.u32 %rd257, %r4323; + or.b64 %rd223, %rd257, 4611686293372403712; + add.s32 %r4324, %r3590, 96; + bfe.u32 %r4325, %r4324, 4, 14; + cvt.u64.u32 %rd258, %r4325; + or.b64 %rd224, %rd258, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd223, %rd224, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4326, %r4311, 16384; + bfe.u32 %r4327, %r4326, 4, 14; + cvt.u64.u32 %rd259, %r4327; + or.b64 %rd225, %rd259, 4611686293372403712; + add.s32 %r4328, %r3590, 8192; + bfe.u32 %r4329, %r4328, 4, 14; + cvt.u64.u32 %rd260, %r4329; + or.b64 %rd226, %rd260, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd225, %rd226, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4330, %r4311, 16416; + bfe.u32 %r4331, %r4330, 4, 14; + cvt.u64.u32 %rd261, %r4331; + or.b64 %rd227, %rd261, 4611686293372403712; + add.s32 %r4332, %r3590, 8224; + bfe.u32 %r4333, %r4332, 4, 14; + cvt.u64.u32 %rd262, %r4333; + or.b64 %rd228, %rd262, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd227, %rd228, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4334, %r4311, 16448; + bfe.u32 %r4335, %r4334, 4, 14; + cvt.u64.u32 %rd263, %r4335; + or.b64 %rd229, %rd263, 4611686293372403712; + add.s32 %r4336, %r3590, 8256; + bfe.u32 %r4337, %r4336, 4, 14; + cvt.u64.u32 %rd264, %r4337; + or.b64 %rd230, %rd264, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd229, %rd230, %p376, 1, 1, 0, 0; + // end inline asm + add.s32 %r4338, %r4311, 16480; + bfe.u32 %r4339, %r4338, 4, 14; + cvt.u64.u32 %rd265, %r4339; + or.b64 %rd231, %rd265, 4611686293372403712; + add.s32 %r4340, %r3590, 8288; + bfe.u32 %r4341, %r4340, 4, 14; + cvt.u64.u32 %rd266, %r4341; + or.b64 %rd232, %rd266, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202}, %rd231, %rd232, %p376, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3592, %r3690; + mov.b32 %r3588, %r3690; + mov.b32 %r3589, %r3690; + mov.b32 %r3591, %r3690; + // begin inline asm + // wait for regs: %r3171,%r3172,%r3173,%r3174,%r3175,%r3176,%r3177,%r3178,%r3179,%r3180,%r3181,%r3182,%r3183,%r3184,%r3185,%r3186,%r3187,%r3188,%r3189,%r3190,%r3191,%r3192,%r3193,%r3194,%r3195,%r3196,%r3197,%r3198,%r3199,%r3200,%r3201,%r3202,%r3587,%r3588,%r3589,%r3590,%r3591,%r3592,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:353:14 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4342, %r3171, 0f3DB504F3; + mul.f32 %r4343, %r3172, 0f3DB504F3; + mul.f32 %r4344, %r3173, 0f3DB504F3; + mul.f32 %r4345, %r3174, 0f3DB504F3; + mul.f32 %r4346, %r3175, 0f3DB504F3; + mul.f32 %r4347, %r3176, 0f3DB504F3; + mul.f32 %r4348, %r3177, 0f3DB504F3; + mul.f32 %r4349, %r3178, 0f3DB504F3; + mul.f32 %r4350, %r3179, 0f3DB504F3; + mul.f32 %r4351, %r3180, 0f3DB504F3; + mul.f32 %r4352, %r3181, 0f3DB504F3; + mul.f32 %r4353, %r3182, 0f3DB504F3; + mul.f32 %r4354, %r3183, 0f3DB504F3; + mul.f32 %r4355, %r3184, 0f3DB504F3; + mul.f32 %r4356, %r3185, 0f3DB504F3; + mul.f32 %r4357, %r3186, 0f3DB504F3; + mul.f32 %r4358, %r3187, 0f3DB504F3; + mul.f32 %r4359, %r3188, 0f3DB504F3; + mul.f32 %r4360, %r3189, 0f3DB504F3; + mul.f32 %r4361, %r3190, 0f3DB504F3; + mul.f32 %r4362, %r3191, 0f3DB504F3; + mul.f32 %r4363, %r3192, 0f3DB504F3; + mul.f32 %r4364, %r3193, 0f3DB504F3; + mul.f32 %r4365, %r3194, 0f3DB504F3; + mul.f32 %r4366, %r3195, 0f3DB504F3; + mul.f32 %r4367, %r3196, 0f3DB504F3; + mul.f32 %r4368, %r3197, 0f3DB504F3; + mul.f32 %r4369, %r3198, 0f3DB504F3; + mul.f32 %r4370, %r3199, 0f3DB504F3; + mul.f32 %r4371, %r3200, 0f3DB504F3; + mul.f32 %r4372, %r3201, 0f3DB504F3; + mul.f32 %r4373, %r3202, 0f3DB504F3; + .loc 1 367 44 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:44 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p391, %r5289, %r761; + setp.lt.s32 %p392, %r5288, %r761; + setp.lt.s32 %p393, %r5287, %r761; + setp.lt.s32 %p394, %r5286, %r761; + setp.lt.s32 %p395, %r5285, %r761; + setp.lt.s32 %p396, %r5284, %r761; + setp.lt.s32 %p397, %r5283, %r761; + setp.lt.s32 %p398, %r5282, %r761; + setp.lt.s32 %p399, %r5281, %r761; + setp.lt.s32 %p400, %r5280, %r761; + setp.lt.s32 %p401, %r5279, %r761; + setp.lt.s32 %p402, %r5278, %r761; + setp.lt.s32 %p403, %r5277, %r761; + setp.lt.s32 %p404, %r5276, %r761; + setp.lt.s32 %p405, %r5275, %r761; + setp.lt.s32 %p406, %r5274, %r761; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4374, %r4342, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4375, %r4374, 0fFF800000, %p391; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4376, %r4343, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4377, %r4376, 0fFF800000, %p392; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4378, %r4344, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4379, %r4378, 0fFF800000, %p391; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4380, %r4345, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4381, %r4380, 0fFF800000, %p392; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4382, %r4346, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4383, %r4382, 0fFF800000, %p393; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4384, %r4347, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4385, %r4384, 0fFF800000, %p394; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4386, %r4348, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4387, %r4386, 0fFF800000, %p393; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4388, %r4349, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4389, %r4388, 0fFF800000, %p394; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4390, %r4350, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4391, %r4390, 0fFF800000, %p395; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4392, %r4351, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4393, %r4392, 0fFF800000, %p396; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4394, %r4352, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4395, %r4394, 0fFF800000, %p395; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4396, %r4353, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4397, %r4396, 0fFF800000, %p396; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4398, %r4354, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4399, %r4398, 0fFF800000, %p397; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4400, %r4355, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4401, %r4400, 0fFF800000, %p398; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4402, %r4356, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4403, %r4402, 0fFF800000, %p397; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4404, %r4357, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4405, %r4404, 0fFF800000, %p398; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4406, %r4358, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4407, %r4406, 0fFF800000, %p399; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4408, %r4359, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4409, %r4408, 0fFF800000, %p400; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4410, %r4360, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4411, %r4410, 0fFF800000, %p399; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4412, %r4361, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4413, %r4412, 0fFF800000, %p400; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4414, %r4362, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4415, %r4414, 0fFF800000, %p401; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4416, %r4363, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4417, %r4416, 0fFF800000, %p402; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4418, %r4364, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4419, %r4418, 0fFF800000, %p401; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4420, %r4365, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4421, %r4420, 0fFF800000, %p402; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4422, %r4366, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4423, %r4422, 0fFF800000, %p403; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4424, %r4367, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4425, %r4424, 0fFF800000, %p404; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4426, %r4368, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4427, %r4426, 0fFF800000, %p403; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4428, %r4369, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4429, %r4428, 0fFF800000, %p404; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4430, %r4370, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4431, %r4430, 0fFF800000, %p405; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4432, %r4371, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4433, %r4432, 0fFF800000, %p406; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4434, %r4372, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4435, %r4434, 0fFF800000, %p405; + .loc 1 417 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:417:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r4436, %r4373, 0f3FB8AA3B; + .loc 1 367 69 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:367:69 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4437, %r4436, 0fFF800000, %p406; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + max.f32 %r4438, %r4375, %r4377; + max.f32 %r4439, %r4379, %r4381; + max.f32 %r4440, %r4438, %r4383; + max.f32 %r4441, %r4440, %r4385; + max.f32 %r4442, %r4439, %r4387; + max.f32 %r4443, %r4442, %r4389; + max.f32 %r4444, %r4441, %r4391; + max.f32 %r4445, %r4444, %r4393; + max.f32 %r4446, %r4443, %r4395; + max.f32 %r4447, %r4446, %r4397; + max.f32 %r4448, %r4445, %r4399; + max.f32 %r4449, %r4448, %r4401; + max.f32 %r4450, %r4447, %r4403; + max.f32 %r4451, %r4450, %r4405; + max.f32 %r4452, %r4449, %r4407; + max.f32 %r4453, %r4452, %r4409; + max.f32 %r4454, %r4451, %r4411; + max.f32 %r4455, %r4454, %r4413; + max.f32 %r4456, %r4453, %r4415; + max.f32 %r4457, %r4456, %r4417; + max.f32 %r4458, %r4455, %r4419; + max.f32 %r4459, %r4458, %r4421; + max.f32 %r4460, %r4457, %r4423; + max.f32 %r4461, %r4460, %r4425; + max.f32 %r4462, %r4459, %r4427; + max.f32 %r4463, %r4462, %r4429; + max.f32 %r4464, %r4461, %r4431; + max.f32 %r4465, %r4464, %r4433; + max.f32 %r4466, %r4463, %r4435; + max.f32 %r4467, %r4466, %r4437; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4468, %r4465, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + max.f32 %r4469, %r4465, %r4468; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4470, %r4469, 1, 31, -1; + shfl.sync.bfly.b32 %r4471, %r4467, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + max.f32 %r4472, %r4467, %r4471; + .loc 2 189 40 // standard.py:189:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4473, %r4472, 1, 31, -1; + cvt.u64.u32 %rd267, %r4470; + cvt.u64.u32 %rd268, %r4473; + shl.b64 %rd269, %rd268, 32; + or.b64 %rd270, %rd267, %rd269; + .loc 2 168 27 // standard.py:168:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mov.b64 {%r4474, %r4475}, %rd270; + max.f32 %r4476, %r4469, %r4474; + max.f32 %r4477, %r4472, %r4475; + .loc 1 421 27 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:421:27 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mov.b64 {%r4478, %r4479}, %rd292; + max.f32 %r4480, %r4479, %r4477; + max.f32 %r4481, %r4478, %r4476; + mov.b64 %rd292, {%r4481, %r4480}; + .loc 1 423 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:423:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.eq.f32 %p407, %r4481, 0fFF800000; + setp.eq.f32 %p408, %r4480, 0fFF800000; + .loc 1 424 51 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:424:51 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + selp.f32 %r4482, 0f00000000, %r4481, %p407; + selp.f32 %r4483, 0f00000000, %r4480, %p408; + .loc 1 428 31 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:428:31 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + sub.f32 %r4484, %r4478, %r4482; + sub.f32 %r4485, %r4479, %r4483; + .loc 1 428 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:428:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + ex2.approx.ftz.f32 %r4486, %r4484; + ex2.approx.ftz.f32 %r4487, %r4485; + .loc 1 429 39 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:429:39 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + sub.f32 %r4488, %r4375, %r4482; + sub.f32 %r4489, %r4377, %r4482; + sub.f32 %r4490, %r4379, %r4483; + sub.f32 %r4491, %r4381, %r4483; + sub.f32 %r4492, %r4383, %r4482; + sub.f32 %r4493, %r4385, %r4482; + sub.f32 %r4494, %r4387, %r4483; + sub.f32 %r4495, %r4389, %r4483; + sub.f32 %r4496, %r4391, %r4482; + sub.f32 %r4497, %r4393, %r4482; + sub.f32 %r4498, %r4395, %r4483; + sub.f32 %r4499, %r4397, %r4483; + sub.f32 %r4500, %r4399, %r4482; + sub.f32 %r4501, %r4401, %r4482; + sub.f32 %r4502, %r4403, %r4483; + sub.f32 %r4503, %r4405, %r4483; + sub.f32 %r4504, %r4407, %r4482; + sub.f32 %r4505, %r4409, %r4482; + sub.f32 %r4506, %r4411, %r4483; + sub.f32 %r4507, %r4413, %r4483; + sub.f32 %r4508, %r4415, %r4482; + sub.f32 %r4509, %r4417, %r4482; + sub.f32 %r4510, %r4419, %r4483; + sub.f32 %r4511, %r4421, %r4483; + sub.f32 %r4512, %r4423, %r4482; + sub.f32 %r4513, %r4425, %r4482; + sub.f32 %r4514, %r4427, %r4483; + sub.f32 %r4515, %r4429, %r4483; + sub.f32 %r4516, %r4431, %r4482; + sub.f32 %r4517, %r4433, %r4482; + sub.f32 %r4518, %r4435, %r4483; + sub.f32 %r4519, %r4437, %r4483; + .loc 1 429 21 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:429:21 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + ex2.approx.ftz.f32 %r4520, %r4488; + ex2.approx.ftz.f32 %r4521, %r4489; + ex2.approx.ftz.f32 %r4522, %r4490; + ex2.approx.ftz.f32 %r4523, %r4491; + ex2.approx.ftz.f32 %r4524, %r4492; + ex2.approx.ftz.f32 %r4525, %r4493; + ex2.approx.ftz.f32 %r4526, %r4494; + ex2.approx.ftz.f32 %r4527, %r4495; + ex2.approx.ftz.f32 %r4528, %r4496; + ex2.approx.ftz.f32 %r4529, %r4497; + ex2.approx.ftz.f32 %r4530, %r4498; + ex2.approx.ftz.f32 %r4531, %r4499; + ex2.approx.ftz.f32 %r4532, %r4500; + ex2.approx.ftz.f32 %r4533, %r4501; + ex2.approx.ftz.f32 %r4534, %r4502; + ex2.approx.ftz.f32 %r4535, %r4503; + ex2.approx.ftz.f32 %r4536, %r4504; + ex2.approx.ftz.f32 %r4537, %r4505; + ex2.approx.ftz.f32 %r4538, %r4506; + ex2.approx.ftz.f32 %r4539, %r4507; + ex2.approx.ftz.f32 %r4540, %r4508; + ex2.approx.ftz.f32 %r4541, %r4509; + ex2.approx.ftz.f32 %r4542, %r4510; + ex2.approx.ftz.f32 %r4543, %r4511; + ex2.approx.ftz.f32 %r4544, %r4512; + ex2.approx.ftz.f32 %r4545, %r4513; + ex2.approx.ftz.f32 %r4546, %r4514; + ex2.approx.ftz.f32 %r4547, %r4515; + ex2.approx.ftz.f32 %r4548, %r4516; + ex2.approx.ftz.f32 %r4549, %r4517; + ex2.approx.ftz.f32 %r4550, %r4518; + ex2.approx.ftz.f32 %r4551, %r4519; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.f32 %r4552, %r4520, %r4521; + add.f32 %r4553, %r4522, %r4523; + add.f32 %r4554, %r4552, %r4524; + add.f32 %r4555, %r4554, %r4525; + add.f32 %r4556, %r4553, %r4526; + add.f32 %r4557, %r4556, %r4527; + add.f32 %r4558, %r4555, %r4528; + add.f32 %r4559, %r4558, %r4529; + add.f32 %r4560, %r4557, %r4530; + add.f32 %r4561, %r4560, %r4531; + add.f32 %r4562, %r4559, %r4532; + add.f32 %r4563, %r4562, %r4533; + add.f32 %r4564, %r4561, %r4534; + add.f32 %r4565, %r4564, %r4535; + add.f32 %r4566, %r4563, %r4536; + add.f32 %r4567, %r4566, %r4537; + add.f32 %r4568, %r4565, %r4538; + add.f32 %r4569, %r4568, %r4539; + add.f32 %r4570, %r4567, %r4540; + add.f32 %r4571, %r4570, %r4541; + add.f32 %r4572, %r4569, %r4542; + add.f32 %r4573, %r4572, %r4543; + add.f32 %r4574, %r4571, %r4544; + add.f32 %r4575, %r4574, %r4545; + add.f32 %r4576, %r4573, %r4546; + add.f32 %r4577, %r4576, %r4547; + add.f32 %r4578, %r4575, %r4548; + add.f32 %r4579, %r4578, %r4549; + add.f32 %r4580, %r4577, %r4550; + add.f32 %r4581, %r4580, %r4551; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4582, %r4579, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.f32 %r4583, %r4579, %r4582; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4584, %r4583, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.f32 %r4585, %r4583, %r4584; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4586, %r4581, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.f32 %r4587, %r4581, %r4586; + .loc 2 291 36 // standard.py:291:36 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shfl.sync.bfly.b32 %r4588, %r4587, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.f32 %r4589, %r4587, %r4588; + .loc 1 434 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:434:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + fma.rn.f32 %r5354, %r5354, %r4486, %r4585; + fma.rn.f32 %r5355, %r5355, %r4487, %r4589; + .loc 1 436 16 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:436:16 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.f32 %r1749, %r1749, %r4486; + mul.f32 %r1750, %r1750, %r4486; + mul.f32 %r1751, %r1751, %r4487; + mul.f32 %r1752, %r1752, %r4487; + mul.f32 %r1753, %r1753, %r4486; + mul.f32 %r1754, %r1754, %r4486; + mul.f32 %r1755, %r1755, %r4487; + mul.f32 %r1756, %r1756, %r4487; + mul.f32 %r1757, %r1757, %r4486; + mul.f32 %r1758, %r1758, %r4486; + mul.f32 %r1759, %r1759, %r4487; + mul.f32 %r1760, %r1760, %r4487; + mul.f32 %r1761, %r1761, %r4486; + mul.f32 %r1762, %r1762, %r4486; + mul.f32 %r1763, %r1763, %r4487; + mul.f32 %r1764, %r1764, %r4487; + mul.f32 %r1765, %r1765, %r4486; + mul.f32 %r1766, %r1766, %r4486; + mul.f32 %r1767, %r1767, %r4487; + mul.f32 %r1768, %r1768, %r4487; + mul.f32 %r1769, %r1769, %r4486; + mul.f32 %r1770, %r1770, %r4486; + mul.f32 %r1771, %r1771, %r4487; + mul.f32 %r1772, %r1772, %r4487; + mul.f32 %r1773, %r1773, %r4486; + mul.f32 %r1774, %r1774, %r4486; + mul.f32 %r1775, %r1775, %r4487; + mul.f32 %r1776, %r1776, %r4487; + mul.f32 %r1777, %r1777, %r4486; + mul.f32 %r1778, %r1778, %r4486; + mul.f32 %r1779, %r1779, %r4487; + mul.f32 %r1780, %r1780, %r4487; + mul.f32 %r1781, %r1781, %r4486; + mul.f32 %r1782, %r1782, %r4486; + mul.f32 %r1783, %r1783, %r4487; + mul.f32 %r1784, %r1784, %r4487; + mul.f32 %r1785, %r1785, %r4486; + mul.f32 %r1786, %r1786, %r4486; + mul.f32 %r1787, %r1787, %r4487; + mul.f32 %r1788, %r1788, %r4487; + mul.f32 %r1789, %r1789, %r4486; + mul.f32 %r1790, %r1790, %r4486; + mul.f32 %r1791, %r1791, %r4487; + mul.f32 %r1792, %r1792, %r4487; + mul.f32 %r1793, %r1793, %r4486; + mul.f32 %r1794, %r1794, %r4486; + mul.f32 %r1795, %r1795, %r4487; + mul.f32 %r1796, %r1796, %r4487; + mul.f32 %r1797, %r1797, %r4486; + mul.f32 %r1798, %r1798, %r4486; + mul.f32 %r1799, %r1799, %r4487; + mul.f32 %r1800, %r1800, %r4487; + mul.f32 %r1801, %r1801, %r4486; + mul.f32 %r1802, %r1802, %r4486; + mul.f32 %r1803, %r1803, %r4487; + mul.f32 %r1804, %r1804, %r4487; + mul.f32 %r1805, %r1805, %r4486; + mul.f32 %r1806, %r1806, %r4486; + mul.f32 %r1807, %r1807, %r4487; + mul.f32 %r1808, %r1808, %r4487; + mul.f32 %r1809, %r1809, %r4486; + mul.f32 %r1810, %r1810, %r4486; + mul.f32 %r1811, %r1811, %r4487; + mul.f32 %r1812, %r1812, %r4487; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4590, %r889, 49152; + add.s32 %r4591, %r4590, %r4306; + .loc 1 440 22 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:440:22 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + cvt.rn.bf16x2.f32 %r3887, %r4521, %r4520; + cvt.rn.bf16x2.f32 %r3888, %r4523, %r4522; + cvt.rn.bf16x2.f32 %r3889, %r4525, %r4524; + cvt.rn.bf16x2.f32 %r3890, %r4527, %r4526; + cvt.rn.bf16x2.f32 %r4019, %r4529, %r4528; + cvt.rn.bf16x2.f32 %r4020, %r4531, %r4530; + cvt.rn.bf16x2.f32 %r4021, %r4533, %r4532; + cvt.rn.bf16x2.f32 %r4022, %r4535, %r4534; + cvt.rn.bf16x2.f32 %r4151, %r4537, %r4536; + cvt.rn.bf16x2.f32 %r4152, %r4539, %r4538; + cvt.rn.bf16x2.f32 %r4153, %r4541, %r4540; + cvt.rn.bf16x2.f32 %r4154, %r4543, %r4542; + cvt.rn.bf16x2.f32 %r4283, %r4545, %r4544; + cvt.rn.bf16x2.f32 %r4284, %r4547, %r4546; + cvt.rn.bf16x2.f32 %r4285, %r4549, %r4548; + cvt.rn.bf16x2.f32 %r4286, %r4551, %r4550; + .loc 1 440 44 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:440:44 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4592, %r4591, 4, 14; + cvt.u64.u32 %rd271, %r4592; + or.b64 %rd233, %rd271, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r3887,%r3888,%r3889,%r3890}, %rd233, %p376, 1, 1, 1; + // end inline asm + add.s32 %r4593, %r4591, 2048; + bfe.u32 %r4594, %r4593, 4, 14; + cvt.u64.u32 %rd272, %r4594; + or.b64 %rd234, %rd272, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r4019,%r4020,%r4021,%r4022}, %rd234, %p376, 1, 1, 1; + // end inline asm + add.s32 %r4595, %r4591, 4096; + bfe.u32 %r4596, %r4595, 4, 14; + cvt.u64.u32 %rd273, %r4596; + or.b64 %rd235, %rd273, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r4151,%r4152,%r4153,%r4154}, %rd235, %p376, 1, 1, 1; + // end inline asm + add.s32 %r4597, %r4591, 6144; + bfe.u32 %r4598, %r4597, 4, 14; + cvt.u64.u32 %rd274, %r4598; + or.b64 %rd236, %rd274, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812}, {%r4283,%r4284,%r4285,%r4286}, %rd236, %p376, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:548:26 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r5274, %r5203, %r5274; + add.s32 %r5275, %r5203, %r5275; + add.s32 %r5276, %r5203, %r5276; + add.s32 %r5277, %r5203, %r5277; + add.s32 %r5278, %r5203, %r5278; + add.s32 %r5279, %r5203, %r5279; + add.s32 %r5280, %r5203, %r5280; + add.s32 %r5281, %r5203, %r5281; + add.s32 %r5282, %r5203, %r5282; + add.s32 %r5283, %r5203, %r5283; + add.s32 %r5284, %r5203, %r5284; + add.s32 %r5285, %r5203, %r5285; + add.s32 %r5286, %r5203, %r5286; + add.s32 %r5287, %r5203, %r5287; + add.s32 %r5288, %r5203, %r5288; + add.s32 %r5289, %r5203, %r5289; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r680, %r5271, 1; + .loc 1 247 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:247:33 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shr.u32 %r4599, %r680, 1; + .loc 1 248 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:248:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mad.wide.u32 %rd238, %r4599, 4, %rd172; + .loc 1 248 24 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:248:24 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + mov.u64 %rd237, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd237, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4287, 0x0; + @%p387 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4287 }, [ %rd238 + 0 ], %rd237; + // end inline asm + .loc 1 249 109 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:109 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4600, %r4599, 1; + .loc 1 249 113 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:113 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p409, %r4600, %r3012; + .loc 1 249 55 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:55 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd241, %rd238, 4; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + and.pred %p388, %p387, %p409; + .loc 1 249 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:249:25 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + mov.u64 %rd240, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd240, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4288, 0x0; + @%p388 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4288 }, [ %rd241 + 0 ], %rd240; + // end inline asm + .loc 1 250 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:250:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + and.b32 %r4601, %r5271, 1; + .loc 1 251 34 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:34 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + sub.s32 %r4602, %r4288, %r4287; + .loc 1 251 48 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:48 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r4603, %r4602, 7; + .loc 1 251 63 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:251:63 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4604, %r4603, -64; + .loc 1 252 29 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:29 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + xor.b32 %r4605, %r4601, 1; + .loc 1 252 61 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:61 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r4606, %r4601, 6; + .loc 1 252 42 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:252:42 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mad.lo.s32 %r5203, %r4604, %r4605, %r4606; + .loc 1 549 21 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:549:21 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r5206, %r5203, %r5206; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4607, %r5205, 1; + setp.gt.s32 %p410, %r4607, 2; + selp.b32 %r5205, 0, %r4607, %p410; + .loc 1 342 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:342:32 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4608, %r5206, %r424; + .loc 1 346 35 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:346:35 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4609, %r4608, %r8; + add.s32 %r4610, %r4608, %r9; + add.s32 %r4611, %r4608, %r10; + add.s32 %r4612, %r4608, %r11; + .loc 1 284 38 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:38 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r4613, %r4609, 7; + shl.b32 %r4614, %r4610, 7; + shl.b32 %r4615, %r4611, 7; + shl.b32 %r4616, %r4612, 7; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + mul.wide.s32 %rd275, %r4613, 2; + add.s64 %rd243, %rd10, %rd275; + mul.wide.s32 %rd276, %r4614, 2; + add.s64 %rd244, %rd10, %rd276; + mul.wide.s32 %rd277, %r4615, 2; + add.s64 %rd245, %rd10, %rd277; + mul.wide.s32 %rd278, %r4616, 2; + add.s64 %rd246, %rd10, %rd278; + .loc 1 292 52 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:52 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.lt.s32 %p411, %r4609, %r761; + setp.lt.s32 %p412, %r4610, %r761; + setp.lt.s32 %p413, %r4611, %r761; + setp.lt.s32 %p414, %r4612, %r761; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + shl.b32 %r4617, %r5205, 14; + add.s32 %r4618, %r889, %r4617; + bar.sync 0; + add.s32 %r4289, %r4618, %r84; + selp.b32 %r4619, 16, 0, %p411; + selp.b32 %r4298, %r4619, 0, %p389; + // begin inline asm + cp.async.cg.shared.global [ %r4289 + 0 ], [ %rd243 + 0 ], 0x10, %r4298; + // end inline asm + add.s32 %r4291, %r4289, 2048; + selp.b32 %r4620, 16, 0, %p412; + selp.b32 %r4300, %r4620, 0, %p389; + // begin inline asm + cp.async.cg.shared.global [ %r4291 + 0 ], [ %rd244 + 0 ], 0x10, %r4300; + // end inline asm + add.s32 %r4293, %r4289, 4096; + selp.b32 %r4621, 16, 0, %p413; + selp.b32 %r4302, %r4621, 0, %p389; + // begin inline asm + cp.async.cg.shared.global [ %r4293 + 0 ], [ %rd245 + 0 ], 0x10, %r4302; + // end inline asm + add.s32 %r4295, %r4289, 6144; + selp.b32 %r4622, 16, 0, %p414; + selp.b32 %r4304, %r4622, 0, %p389; + // begin inline asm + cp.async.cg.shared.global [ %r4295 + 0 ], [ %rd246 + 0 ], 0x10, %r4304; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:284:49 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s64 %rd247, %rd11, %rd275; + add.s64 %rd248, %rd11, %rd276; + add.s64 %rd249, %rd11, %rd277; + add.s64 %rd250, %rd11, %rd278; + .loc 1 292 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:292:23 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + add.s32 %r4623, %r4590, %r4617; + add.s32 %r4297, %r4623, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r4297 + 0 ], [ %rd247 + 0 ], 0x10, %r4298; + // end inline asm + add.s32 %r4299, %r4297, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4299 + 0 ], [ %rd248 + 0 ], 0x10, %r4300; + // end inline asm + add.s32 %r4301, %r4297, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4301 + 0 ], [ %rd249 + 0 ], 0x10, %r4302; + // end inline asm + add.s32 %r4303, %r4297, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4303 + 0 ], [ %rd250 + 0 ], 0x10, %r4304; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + setp.ne.b32 %p415, %r509, %r680; + mov.b32 %r5271, %r680; + @%p415 bra $L__BB0_5; +$L__BB0_6: // %._crit_edge461 + .loc 1 0 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:40 + cvt.u32.u64 %r4824, %rd3; + .loc 1 502 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:502:40 @[ c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:198:45 ] + // begin inline asm + // wait for regs: %r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765,%r1766,%r1767,%r1768,%r1769,%r1770,%r1771,%r1772,%r1773,%r1774,%r1775,%r1776,%r1777,%r1778,%r1779,%r1780,%r1781,%r1782,%r1783,%r1784,%r1785,%r1786,%r1787,%r1788,%r1789,%r1790,%r1791,%r1792,%r1793,%r1794,%r1795,%r1796,%r1797,%r1798,%r1799,%r1800,%r1801,%r1802,%r1803,%r1804,%r1805,%r1806,%r1807,%r1808,%r1809,%r1810,%r1811,%r1812 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp12: + .loc 1 206 26 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:206:26 + setp.eq.f32 %p424, %r5354, 0f00000000; + setp.eq.f32 %p425, %r5355, 0f00000000; + .loc 1 206 34 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:206:34 + selp.f32 %r4825, 0f3F800000, %r5354, %p424; + selp.f32 %r750, 0f3F800000, %r5355, %p425; + .loc 1 208 16 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:208:16 + div.full.f32 %r4826, %r1749, %r4825; + div.full.f32 %r4827, %r1750, %r4825; + div.full.f32 %r4828, %r1751, %r750; + div.full.f32 %r4829, %r1752, %r750; + div.full.f32 %r4830, %r1753, %r4825; + div.full.f32 %r4831, %r1754, %r4825; + div.full.f32 %r4832, %r1755, %r750; + div.full.f32 %r4833, %r1756, %r750; + div.full.f32 %r4834, %r1757, %r4825; + div.full.f32 %r4835, %r1758, %r4825; + div.full.f32 %r4836, %r1759, %r750; + div.full.f32 %r4837, %r1760, %r750; + div.full.f32 %r4838, %r1761, %r4825; + div.full.f32 %r4839, %r1762, %r4825; + div.full.f32 %r4840, %r1763, %r750; + div.full.f32 %r4841, %r1764, %r750; + div.full.f32 %r4842, %r1765, %r4825; + div.full.f32 %r4843, %r1766, %r4825; + div.full.f32 %r4844, %r1767, %r750; + div.full.f32 %r4845, %r1768, %r750; + div.full.f32 %r4846, %r1769, %r4825; + div.full.f32 %r4847, %r1770, %r4825; + div.full.f32 %r4848, %r1771, %r750; + div.full.f32 %r4849, %r1772, %r750; + div.full.f32 %r4850, %r1773, %r4825; + div.full.f32 %r4851, %r1774, %r4825; + div.full.f32 %r4852, %r1775, %r750; + div.full.f32 %r4853, %r1776, %r750; + div.full.f32 %r4854, %r1777, %r4825; + div.full.f32 %r4855, %r1778, %r4825; + div.full.f32 %r4856, %r1779, %r750; + div.full.f32 %r4857, %r1780, %r750; + div.full.f32 %r4858, %r1781, %r4825; + div.full.f32 %r4859, %r1782, %r4825; + div.full.f32 %r4860, %r1783, %r750; + div.full.f32 %r4861, %r1784, %r750; + div.full.f32 %r4862, %r1785, %r4825; + div.full.f32 %r4863, %r1786, %r4825; + div.full.f32 %r4864, %r1787, %r750; + div.full.f32 %r4865, %r1788, %r750; + div.full.f32 %r4866, %r1789, %r4825; + div.full.f32 %r4867, %r1790, %r4825; + div.full.f32 %r4868, %r1791, %r750; + div.full.f32 %r4869, %r1792, %r750; + div.full.f32 %r4870, %r1793, %r4825; + div.full.f32 %r4871, %r1794, %r4825; + div.full.f32 %r4872, %r1795, %r750; + div.full.f32 %r4873, %r1796, %r750; + div.full.f32 %r4874, %r1797, %r4825; + div.full.f32 %r4875, %r1798, %r4825; + div.full.f32 %r4876, %r1799, %r750; + div.full.f32 %r4877, %r1800, %r750; + div.full.f32 %r4878, %r1801, %r4825; + div.full.f32 %r4879, %r1802, %r4825; + div.full.f32 %r4880, %r1803, %r750; + div.full.f32 %r4881, %r1804, %r750; + div.full.f32 %r4882, %r1805, %r4825; + div.full.f32 %r4883, %r1806, %r4825; + div.full.f32 %r4884, %r1807, %r750; + div.full.f32 %r4885, %r1808, %r750; + div.full.f32 %r4886, %r1809, %r4825; + div.full.f32 %r4887, %r1810, %r4825; + div.full.f32 %r4888, %r1811, %r750; + div.full.f32 %r4889, %r1812, %r750; + .loc 1 218 49 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:218:49 + or.b32 %r4890, %r4824, %r4; + .loc 1 218 62 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:218:62 + add.s32 %r4891, %r20, %r4890; + add.s32 %r4892, %r21, %r4890; + add.s32 %r4893, %r22, %r4890; + add.s32 %r4894, %r23, %r4890; + add.s32 %r4895, %r24, %r4890; + add.s32 %r4896, %r25, %r4890; + add.s32 %r4897, %r26, %r4890; + add.s32 %r4898, %r27, %r4890; + .loc 1 218 75 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:218:75 + add.s32 %r4899, %r4891, %r3; + add.s32 %r4900, %r4892, %r3; + add.s32 %r4901, %r4893, %r3; + add.s32 %r4902, %r4894, %r3; + add.s32 %r4903, %r4895, %r3; + add.s32 %r4904, %r4896, %r3; + add.s32 %r4905, %r4897, %r3; + add.s32 %r4906, %r4898, %r3; + .loc 1 218 25 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:218:25 + mad.wide.s32 %rd279, %r4899, 2, %rd38; + mad.wide.s32 %rd280, %r4900, 2, %rd38; + mad.wide.s32 %rd281, %r4901, 2, %rd38; + mad.wide.s32 %rd282, %r4902, 2, %rd38; + mad.wide.s32 %rd283, %r4903, 2, %rd38; + mad.wide.s32 %rd284, %r4904, 2, %rd38; + mad.wide.s32 %rd285, %r4905, 2, %rd38; + mad.wide.s32 %rd286, %r4906, 2, %rd38; + .loc 1 218 110 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:218:110 + cvt.rn.bf16x2.f32 %r4907, %r4827, %r4826; + cvt.rn.bf16x2.f32 %r4908, %r4829, %r4828; + cvt.rn.bf16x2.f32 %r4909, %r4831, %r4830; + cvt.rn.bf16x2.f32 %r4910, %r4833, %r4832; + cvt.rn.bf16x2.f32 %r4911, %r4835, %r4834; + cvt.rn.bf16x2.f32 %r4912, %r4837, %r4836; + cvt.rn.bf16x2.f32 %r4913, %r4839, %r4838; + cvt.rn.bf16x2.f32 %r4914, %r4841, %r4840; + cvt.rn.bf16x2.f32 %r4915, %r4843, %r4842; + cvt.rn.bf16x2.f32 %r4916, %r4845, %r4844; + cvt.rn.bf16x2.f32 %r4917, %r4847, %r4846; + cvt.rn.bf16x2.f32 %r4918, %r4849, %r4848; + cvt.rn.bf16x2.f32 %r4919, %r4851, %r4850; + cvt.rn.bf16x2.f32 %r4920, %r4853, %r4852; + cvt.rn.bf16x2.f32 %r4921, %r4855, %r4854; + cvt.rn.bf16x2.f32 %r4922, %r4857, %r4856; + cvt.rn.bf16x2.f32 %r4923, %r4859, %r4858; + cvt.rn.bf16x2.f32 %r4924, %r4861, %r4860; + cvt.rn.bf16x2.f32 %r4925, %r4863, %r4862; + cvt.rn.bf16x2.f32 %r4926, %r4865, %r4864; + cvt.rn.bf16x2.f32 %r4927, %r4867, %r4866; + cvt.rn.bf16x2.f32 %r4928, %r4869, %r4868; + cvt.rn.bf16x2.f32 %r4929, %r4871, %r4870; + cvt.rn.bf16x2.f32 %r4930, %r4873, %r4872; + cvt.rn.bf16x2.f32 %r4931, %r4875, %r4874; + cvt.rn.bf16x2.f32 %r4932, %r4877, %r4876; + cvt.rn.bf16x2.f32 %r4933, %r4879, %r4878; + cvt.rn.bf16x2.f32 %r4934, %r4881, %r4880; + cvt.rn.bf16x2.f32 %r4935, %r4883, %r4882; + cvt.rn.bf16x2.f32 %r4936, %r4885, %r4884; + cvt.rn.bf16x2.f32 %r4937, %r4887, %r4886; + cvt.rn.bf16x2.f32 %r4938, %r4889, %r4888; + shl.b32 %r4939, %r35, 13; + shl.b32 %r4940, %r6, 5; + and.b32 %r4941, %r4940, 7264; + and.b32 %r4942, %r6, 24; + shl.b32 %r4943, %r4942, 4; + shl.b32 %r4944, %r6, 2; + and.b32 %r4945, %r4944, 16; + or.b32 %r4946, %r4939, %r4945; + or.b32 %r4947, %r4941, %r4943; + or.b32 %r4948, %r4946, %r4947; + add.s32 %r4950, %r889, %r4948; + st.shared.v4.b32 [%r4950], {%r4907, %r4909, %r4911, %r4913}; + st.shared.v4.b32 [%r4950+512], {%r4908, %r4910, %r4912, %r4914}; + xor.b32 %r4951, %r4948, 32; + add.s32 %r4952, %r889, %r4951; + st.shared.v4.b32 [%r4952], {%r4915, %r4917, %r4919, %r4921}; + st.shared.v4.b32 [%r4952+512], {%r4916, %r4918, %r4920, %r4922}; + xor.b32 %r4953, %r4948, 64; + add.s32 %r4954, %r889, %r4953; + st.shared.v4.b32 [%r4954], {%r4923, %r4925, %r4927, %r4929}; + st.shared.v4.b32 [%r4954+512], {%r4924, %r4926, %r4928, %r4930}; + xor.b32 %r4955, %r4948, 96; + add.s32 %r4956, %r889, %r4955; + st.shared.v4.b32 [%r4956], {%r4931, %r4933, %r4935, %r4937}; + st.shared.v4.b32 [%r4956+512], {%r4932, %r4934, %r4936, %r4938}; + bar.sync 0; + shl.b32 %r4957, %r4942, 10; + shl.b32 %r4958, %r35, 5; + and.b32 %r751, %r6, 252; + shl.b32 %r4959, %r751, 2; + or.b32 %r4960, %r4957, %r4958; + xor.b32 %r4961, %r4960, %r4959; + add.s32 %r4756, %r889, %r4961; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4752, %r4753, %r4754, %r4755}, [%r4756]; + // end inline asm + add.s32 %r4761, %r4756, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4757, %r4758, %r4759, %r4760}, [%r4761]; + // end inline asm + add.s32 %r4766, %r4756, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4762, %r4763, %r4764, %r4765}, [%r4766]; + // end inline asm + add.s32 %r4771, %r4756, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4767, %r4768, %r4769, %r4770}, [%r4771]; + // end inline asm + add.s32 %r4776, %r4756, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4772, %r4773, %r4774, %r4775}, [%r4776]; + // end inline asm + add.s32 %r4781, %r4756, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4777, %r4778, %r4779, %r4780}, [%r4781]; + // end inline asm + add.s32 %r4786, %r4756, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4782, %r4783, %r4784, %r4785}, [%r4786]; + // end inline asm + add.s32 %r4791, %r4756, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4787, %r4788, %r4789, %r4790}, [%r4791]; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd279 + 0 ], { %r4752, %r4753, %r4754, %r4755 }; + // end inline asm + // begin inline asm + @%p2 st.global.v4.b32 [ %rd280 + 0 ], { %r4757, %r4758, %r4759, %r4760 }; + // end inline asm + // begin inline asm + @%p3 st.global.v4.b32 [ %rd281 + 0 ], { %r4762, %r4763, %r4764, %r4765 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd282 + 0 ], { %r4767, %r4768, %r4769, %r4770 }; + // end inline asm + // begin inline asm + @%p5 st.global.v4.b32 [ %rd283 + 0 ], { %r4772, %r4773, %r4774, %r4775 }; + // end inline asm + // begin inline asm + @%p6 st.global.v4.b32 [ %rd284 + 0 ], { %r4777, %r4778, %r4779, %r4780 }; + // end inline asm + // begin inline asm + @%p7 st.global.v4.b32 [ %rd285 + 0 ], { %r4782, %r4783, %r4784, %r4785 }; + // end inline asm + // begin inline asm + @%p8 st.global.v4.b32 [ %rd286 + 0 ], { %r4787, %r4788, %r4789, %r4790 }; + // end inline asm + .loc 1 223 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:223:33 + setp.lt.f32 %p426, %r4825, 0f00800000; + mul.f32 %r4962, %r4825, 0f4B000000; + selp.f32 %r752, %r4962, %r4825, %p426; + selp.f32 %r4963, 0fC1B80000, 0f00000000, %p426; + add.s32 %r4964, %r752, -1060439283; + and.b32 %r4965, %r4964, -8388608; + sub.s32 %r4966, %r752, %r4965; + cvt.rn.f32.s32 %r4967, %r4965; + mov.b32 %r4968, 0f34000000; + fma.rn.ftz.f32 %r4969, %r4967, %r4968, %r4963; + add.f32 %r4970, %r4966, 0fBF800000; + mov.b32 %r4971, 0fBE2C7F30; + mov.b32 %r4972, 0f3DC6B27F; + fma.rn.ftz.f32 %r4973, %r4972, %r4970, %r4971; + mov.b32 %r4974, 0f3E2FCF2A; + fma.rn.ftz.f32 %r4975, %r4973, %r4970, %r4974; + mov.b32 %r4976, 0fBE374E43; + fma.rn.ftz.f32 %r4977, %r4975, %r4970, %r4976; + mov.b32 %r4978, 0f3E520BF4; + fma.rn.ftz.f32 %r4979, %r4977, %r4970, %r4978; + mov.b32 %r4980, 0fBE763C8B; + fma.rn.ftz.f32 %r4981, %r4979, %r4970, %r4980; + mov.b32 %r4982, 0f3E93BF99; + fma.rn.ftz.f32 %r4983, %r4981, %r4970, %r4982; + mov.b32 %r4984, 0fBEB8AA49; + fma.rn.ftz.f32 %r4985, %r4983, %r4970, %r4984; + mov.b32 %r4986, 0f3EF6384A; + fma.rn.ftz.f32 %r4987, %r4985, %r4970, %r4986; + mov.b32 %r4988, 0fBF38AA3B; + fma.rn.ftz.f32 %r4989, %r4987, %r4970, %r4988; + mul.f32 %r4990, %r4970, %r4989; + mul.f32 %r4991, %r4970, %r4990; + mov.b32 %r4992, 0f3FB8AA3B; + fma.rn.ftz.f32 %r4993, %r4970, %r4992, %r4991; + add.f32 %r5356, %r4969, %r4993; + setp.lt.u32 %p427, %r752, 2139095040; + mov.b32 %r4994, 0f7F800000; + @%p427 bra $L__BB0_8; +// %bb.7: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:33 + fma.rn.ftz.f32 %r5356, %r752, %r4994, %r4994; +$L__BB0_8: // %__nv_log2f.exit + ld.param.b64 %rd35, [triton_tem_fused_0_param_3]; + .loc 1 223 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:223:33 + setp.lt.f32 %p428, %r750, 0f00800000; + mul.f32 %r4995, %r750, 0f4B000000; + selp.f32 %r756, %r4995, %r750, %p428; + selp.f32 %r4996, 0fC1B80000, 0f00000000, %p428; + add.s32 %r4997, %r756, -1060439283; + and.b32 %r4998, %r4997, -8388608; + sub.s32 %r4999, %r756, %r4998; + cvt.rn.f32.s32 %r5000, %r4998; + fma.rn.ftz.f32 %r5002, %r5000, %r4968, %r4996; + add.f32 %r5003, %r4999, 0fBF800000; + fma.rn.ftz.f32 %r5006, %r4972, %r5003, %r4971; + fma.rn.ftz.f32 %r5008, %r5006, %r5003, %r4974; + fma.rn.ftz.f32 %r5010, %r5008, %r5003, %r4976; + fma.rn.ftz.f32 %r5012, %r5010, %r5003, %r4978; + fma.rn.ftz.f32 %r5014, %r5012, %r5003, %r4980; + fma.rn.ftz.f32 %r5016, %r5014, %r5003, %r4982; + fma.rn.ftz.f32 %r5018, %r5016, %r5003, %r4984; + fma.rn.ftz.f32 %r5020, %r5018, %r5003, %r4986; + fma.rn.ftz.f32 %r5022, %r5020, %r5003, %r4988; + mul.f32 %r5023, %r5003, %r5022; + mul.f32 %r5024, %r5003, %r5023; + fma.rn.ftz.f32 %r5026, %r5003, %r4992, %r5024; + add.f32 %r5357, %r5002, %r5026; + setp.lt.u32 %p429, %r756, 2139095040; + @%p429 bra $L__BB0_10; +// %bb.9: // %__nv_fmaf_rn.exit.i.i384 + .loc 1 0 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:0:33 + fma.rn.ftz.f32 %r5357, %r756, %r4994, %r4994; +$L__BB0_10: // %__nv_log2f.exit387 + .loc 1 223 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:223:33 + setp.eq.f32 %p431, %r756, 0f00000000; + setp.eq.f32 %p432, %r752, 0f00000000; + .loc 1 221 26 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:221:26 + shl.b32 %r5029, %r1, 5; + .loc 1 221 31 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:221:31 + add.s32 %r5030, %r5029, %r2; + .loc 1 222 32 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:222:32 + mul.lo.s32 %r5031, %r760, %r5030; + .loc 1 222 23 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:222:23 + mad.wide.s32 %rd288, %r5031, 4, %rd35; + .loc 1 144 46 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:144:46 + and.b32 %r5032, %r6, 127; + .loc 1 144 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:144:33 + or.b32 %r5033, %r5, %r5032; + .loc 1 222 40 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:222:40 + mad.wide.s32 %rd287, %r5033, 4, %rd288; + .loc 1 223 33 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:223:33 + selp.f32 %r5034, 0fFF800000, %r5356, %p432; + selp.f32 %r5035, 0fFF800000, %r5357, %p431; + .loc 1 223 20 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:223:20 + mov.b64 {%r5036, %r5037}, %rd292; + add.f32 %r5038, %r5037, %r5035; + add.f32 %r5039, %r5036, %r5034; + .loc 1 227 48 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:227:48 + setp.lt.s32 %p433, %r5033, %r760; + .loc 1 227 29 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:227:29 + bar.sync 0; + shl.b32 %r5040, %r751, 1; + add.s32 %r5042, %r889, %r5040; + st.shared.v2.b32 [%r5042], {%r5039, %r5038}; + bar.sync 0; + shl.b32 %r5043, %r28, 3; + shl.b32 %r5044, %r29, 2; + shr.u32 %r5045, %r30, 1; + add.s32 %r5046, %r889, %r5043; + add.s32 %r5047, %r5046, %r5044; + add.s32 %r5048, %r5047, %r5045; + ld.shared.b32 %r5028, [%r5048]; + and.b32 %r5049, %r6, 128; + setp.eq.b32 %p434, %r5049, 0; + and.pred %p430, %p434, %p433; + // begin inline asm + @%p430 st.global.b32 [ %rd287 + 0 ], { %r5028 }; + // end inline asm + .loc 1 229 4 // c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py:229:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 275 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 53 +.b8 53 +.b8 114 +.b8 102 +.b8 117 +.b8 99 +.b8 118 +.b8 120 +.b8 119 +.b8 50 +.b8 113 +.b8 106 +.b8 111 +.b8 111 +.b8 98 +.b8 105 +.b8 122 +.b8 109 +.b8 100 +.b8 116 +.b8 55 +.b8 107 +.b8 121 +.b8 111 +.b8 112 +.b8 98 +.b8 99 +.b8 118 +.b8 102 +.b8 105 +.b8 122 +.b8 111 +.b8 117 +.b8 114 +.b8 98 +.b8 107 +.b8 118 +.b8 51 +.b8 51 +.b8 106 +.b8 118 +.b8 118 +.b8 101 +.b8 50 +.b8 103 +.b8 102 +.b8 113 +.b8 52 +.b8 98 +.b8 112 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 53 +.b8 53 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x15 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa0:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 146 // DW_AT_call_line +.b8 101 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xcd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 154 // DW_AT_call_line +.b8 92 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xe5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 172 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 198 // DW_AT_call_line +.b8 45 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.source new file mode 100644 index 0000000000000000000000000000000000000000..a0482735102ebc8eba8c0315dab4b8aa3dc787f3 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.source @@ -0,0 +1,1203 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":18:0) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":271:0) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":448:0) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":299:0) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":256:0) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc233 = loc(unknown) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc240 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc244 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":240:0) +#loc269 = loc("arg_Q"(#loc)) +#loc270 = loc("arg_K"(#loc)) +#loc271 = loc("arg_V"(#loc)) +#loc272 = loc("arg_LSE"(#loc)) +#loc273 = loc("arg_MAX"(#loc)) +#loc274 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc275 = loc("arg_KV_IDX"(#loc)) +#loc276 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc277 = loc("arg_FULL_KV_IDX"(#loc)) +#loc278 = loc("in_ptr9"(#loc)) +#loc279 = loc("out_ptr0"(#loc)) +#loc280 = loc("ks0"(#loc)) +#loc281 = loc("ks1"(#loc)) +#loc282 = loc("ks2"(#loc)) +#loc283 = loc("ks3"(#loc)) +#loc284 = loc("ks4"(#loc)) +#loc285 = loc("ks5"(#loc)) +#loc382 = loc("ptr"(#loc127)) +#loc383 = loc("offs_m"(#loc127)) +#loc384 = loc("offs_n"(#loc127)) +#loc385 = loc("stride_m"(#loc127)) +#loc386 = loc("stride_n"(#loc127)) +#loc387 = loc("M_LEN"(#loc127)) +#loc394 = loc("x"(#loc139)) +#loc395 = loc("arg_Q"(#loc145)) +#loc396 = loc("arg_K"(#loc145)) +#loc397 = loc("arg_V"(#loc145)) +#loc398 = loc("arg_LSE"(#loc145)) +#loc399 = loc("arg_MAX"(#loc145)) +#loc400 = loc("arg_KV_NUM_BLKS"(#loc145)) +#loc401 = loc("arg_KV_IDX"(#loc145)) +#loc402 = loc("arg_FULL_KV_NUM_BLKS"(#loc145)) +#loc403 = loc("arg_FULL_KV_IDX"(#loc145)) +#loc404 = loc("in_ptr9"(#loc145)) +#loc405 = loc("out_ptr0"(#loc145)) +#loc406 = loc("ks0"(#loc145)) +#loc407 = loc("ks1"(#loc145)) +#loc408 = loc("ks2"(#loc145)) +#loc409 = loc("ks3"(#loc145)) +#loc410 = loc("ks4"(#loc145)) +#loc411 = loc("ks5"(#loc145)) +#loc412 = loc("q"(#loc145)) +#loc413 = loc("K"(#loc145)) +#loc414 = loc("V"(#loc145)) +#loc415 = loc("Q_LEN"(#loc145)) +#loc416 = loc("KV_LEN"(#loc145)) +#loc417 = loc("acc"(#loc145)) +#loc418 = loc("l_i"(#loc145)) +#loc419 = loc("m_i"(#loc145)) +#loc420 = loc("off_z"(#loc145)) +#loc421 = loc("off_h"(#loc145)) +#loc422 = loc("offs_m"(#loc145)) +#loc423 = loc("offs_n"(#loc145)) +#loc424 = loc("kv_start"(#loc145)) +#loc425 = loc("kv_indices"(#loc145)) +#loc426 = loc("kv_num_blocks"(#loc145)) +#loc427 = loc("block_n_end"(#loc145)) +#loc428 = loc("stride_kk"(#loc145)) +#loc429 = loc("stride_kn"(#loc145)) +#loc430 = loc("stride_vn"(#loc145)) +#loc431 = loc("stride_vk"(#loc145)) +#loc437 = loc("arg_Q"(#loc155)) +#loc438 = loc("arg_K"(#loc155)) +#loc439 = loc("arg_V"(#loc155)) +#loc440 = loc("arg_LSE"(#loc155)) +#loc441 = loc("arg_MAX"(#loc155)) +#loc442 = loc("arg_KV_NUM_BLKS"(#loc155)) +#loc443 = loc("arg_KV_IDX"(#loc155)) +#loc444 = loc("arg_FULL_KV_NUM_BLKS"(#loc155)) +#loc445 = loc("arg_FULL_KV_IDX"(#loc155)) +#loc446 = loc("in_ptr9"(#loc155)) +#loc447 = loc("out_ptr0"(#loc155)) +#loc448 = loc("ks0"(#loc155)) +#loc449 = loc("ks1"(#loc155)) +#loc450 = loc("ks2"(#loc155)) +#loc451 = loc("ks3"(#loc155)) +#loc452 = loc("ks4"(#loc155)) +#loc453 = loc("ks5"(#loc155)) +#loc454 = loc("q"(#loc155)) +#loc455 = loc("K"(#loc155)) +#loc456 = loc("V"(#loc155)) +#loc457 = loc("Q_LEN"(#loc155)) +#loc458 = loc("KV_LEN"(#loc155)) +#loc459 = loc("acc"(#loc155)) +#loc460 = loc("l_i"(#loc155)) +#loc461 = loc("m_i"(#loc155)) +#loc462 = loc("off_z"(#loc155)) +#loc463 = loc("off_h"(#loc155)) +#loc464 = loc("offs_m"(#loc155)) +#loc465 = loc("offs_n"(#loc155)) +#loc466 = loc("kv_start"(#loc155)) +#loc467 = loc("kv_offset"(#loc155)) +#loc468 = loc("stride_kk"(#loc155)) +#loc469 = loc("stride_kn"(#loc155)) +#loc470 = loc("stride_vn"(#loc155)) +#loc471 = loc("stride_vk"(#loc155)) +#loc541 = loc("indices"(#loc227)) +#loc542 = loc("max_len"(#loc227)) +#loc543 = loc("input"(#loc231)) +#loc544 = loc("a"(#loc236)) +#loc545 = loc("b"(#loc236)) +#loc546 = loc("input"(#loc240)) +#loc547 = loc("a"(#loc244)) +#loc548 = loc("b"(#loc244)) +#loc549 = loc("loop_iter"(#loc248)) +#loc550 = loc("col_indices"(#loc248)) +#loc551 = loc("total_blocks"(#loc248)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc))) attributes {noinline = false} { + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c4096_i32_0 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %c4096_i32_0, %ks0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc2) + %c4096_i32_1 = arith.constant 4096 : i32 loc(#loc2) + %c1_i32 = arith.constant 1 : i32 loc(#loc2) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc3) + %c1024_i32_2 = arith.constant 1024 : i32 loc(#loc3) + %1 = arith.muli %c1024_i32_2, %ks1 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c128_i32_4 = arith.constant 128 : i32 loc(#loc4) + %2 = arith.muli %c128_i32_4, %ks1 : i32 loc(#loc4) + %c128_i32_5 = arith.constant 128 : i32 loc(#loc5) + %c1_i32_6 = arith.constant 1 : i32 loc(#loc5) + %c1024_i32_7 = arith.constant 1024 : i32 loc(#loc6) + %c1024_i32_8 = arith.constant 1024 : i32 loc(#loc6) + %3 = arith.muli %c1024_i32_8, %ks1 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c128_i32_10 = arith.constant 128 : i32 loc(#loc7) + %4 = arith.muli %c128_i32_10, %ks1 : i32 loc(#loc7) + %c128_i32_11 = arith.constant 128 : i32 loc(#loc8) + %c1_i32_12 = arith.constant 1 : i32 loc(#loc8) + %ZQ = arith.constant 2 : i32 loc(#loc286) + %HQ = arith.constant 32 : i32 loc(#loc287) + %ZKV = arith.constant 2 : i32 loc(#loc288) + %q_start = tt.get_program_id x : i32 loc(#loc289) + %off_zq = tt.get_program_id y : i32 loc(#loc290) + %off_hq = tt.get_program_id z : i32 loc(#loc291) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc292) + %off_hkv = arith.constant 4 : i32 loc(#loc293) + %off_hkv_13 = arith.constant 4 : i32 loc(#loc293) + %off_hkv_14 = arith.divsi %off_hq, %off_hkv_13 : i32 loc(#loc293) + %off_g = arith.constant 4 : i32 loc(#loc294) + %off_g_15 = arith.constant 4 : i32 loc(#loc294) + %off_g_16 = arith.remsi %off_hq, %off_g_15 : i32 loc(#loc294) + %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc295) + %q_offset_17 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc296) + %q_offset_18 = arith.addi %q_offset, %q_offset_17 : i32 loc(#loc297) + %k_offset = arith.muli %off_zkv, %1 : i32 loc(#loc298) + %k_offset_19 = arith.muli %off_hkv_14, %2 : i32 loc(#loc299) + %k_offset_20 = arith.addi %k_offset, %k_offset_19 : i32 loc(#loc300) + %v_offset = arith.muli %off_zkv, %3 : i32 loc(#loc301) + %v_offset_21 = arith.muli %off_hkv_14, %4 : i32 loc(#loc302) + %v_offset_22 = arith.addi %v_offset, %v_offset_21 : i32 loc(#loc303) + %Q = tt.addptr %arg_Q, %q_offset_18 : !tt.ptr, i32 loc(#loc304) + %K = tt.addptr %arg_K, %k_offset_20 : !tt.ptr, i32 loc(#loc305) + %V = tt.addptr %arg_V, %v_offset_22 : !tt.ptr, i32 loc(#loc306) + %SPARSE_Z = arith.constant 2 : i32 loc(#loc307) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc308) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc309) + %sparse_idx_hq = arith.remsi %off_hq, %SPARSE_HQ : i32 loc(#loc310) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc311) + %m_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc312) + %m_i_23 = arith.constant 0x7F800000 : f32 loc(#loc313) + %m_i_24 = arith.constant 0x7F800000 : f32 loc(#loc313) + %m_i_25 = arith.constant dense<0x7F800000> : tensor<128xf32> loc(#loc313) + %m_i_26 = arith.subf %m_i, %m_i_25 : tensor<128xf32> loc(#loc313) + %l_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc314) + %acc = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc315) + %offs_m = arith.constant 128 : i32 loc(#loc316) + %offs_m_27 = arith.constant 128 : i32 loc(#loc316) + %offs_m_28 = arith.muli %q_start, %offs_m_27 : i32 loc(#loc316) + %offs_m_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc317) + %offs_m_30 = tt.splat %offs_m_28 : i32 -> tensor<128xi32> loc(#loc318) + %offs_m_31 = arith.addi %offs_m_30, %offs_m_29 : tensor<128xi32> loc(#loc318) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc319) + %sparse_hz_offset_32 = arith.addi %sparse_hz_offset, %sparse_idx_hq : i32 loc(#loc320) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_32, %ks2 : i32 loc(#loc321) + %sparse_kv_num_blks_offset_33 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_num_blks_offset_34 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_num_blks_offset_35 = arith.divsi %q_start, %sparse_kv_num_blks_offset_34 : i32 loc(#loc322) + %sparse_kv_num_blks_offset_36 = arith.addi %sparse_kv_num_blks_offset, %sparse_kv_num_blks_offset_35 : i32 loc(#loc323) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_32, %stride_kv_idx_h : i32 loc(#loc324) + %sparse_kv_idx_offset_37 = arith.constant 1 : i32 loc(#loc325) + %sparse_kv_idx_offset_38 = arith.constant 1 : i32 loc(#loc325) + %sparse_kv_idx_offset_39 = arith.divsi %q_start, %sparse_kv_idx_offset_38 : i32 loc(#loc325) + %sparse_kv_idx_offset_40 = arith.muli %sparse_kv_idx_offset_39, %ks4 : i32 loc(#loc326) + %sparse_kv_idx_offset_41 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_40 : i32 loc(#loc327) + %offs_m_42 = arith.constant 128 : i32 loc(#loc328) + %offs_m_43 = arith.constant 128 : i32 loc(#loc328) + %offs_m_44 = arith.muli %q_start, %offs_m_43 : i32 loc(#loc328) + %offs_m_45 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc329) + %offs_m_46 = tt.splat %offs_m_44 : i32 -> tensor<128xi32> loc(#loc330) + %offs_m_47 = arith.addi %offs_m_46, %offs_m_45 : tensor<128xi32> loc(#loc330) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc331) + %q = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q, %offs_m_47, %offs_k, %c4096_i32_1, %c1_i32, %ks0) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc332) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc333) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc334) + %kv_start_48 = arith.constant 128 : i32 loc(#loc335) + %kv_start_49 = arith.constant 128 : i32 loc(#loc335) + %kv_start_50 = arith.muli %kv_start, %kv_start_49 : i32 loc(#loc335) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc336) + %kv_num_blocks_51 = tt.load %kv_num_blocks : !tt.ptr loc(#loc337) + %block_n_end = arith.constant 2 : i32 loc(#loc338) + %block_n_end_52 = arith.constant 2 : i32 loc(#loc338) + %block_n_end_53 = arith.muli %kv_num_blocks_51, %block_n_end_52 : i32 loc(#loc338) + %block_n_end_54 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc339) + %block_n_end_55 = arith.constant 1 : i32 loc(#loc340) + %block_n_end_56 = arith.maxsi %block_n_end_54, %block_n_end_55 : i32 loc(#loc340) + %block_n_end_57 = arith.minsi %block_n_end_53, %block_n_end_56 : i32 loc(#loc341) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc342) + %offs_n_58 = tt.splat %kv_start_50 : i32 -> tensor<64xi32> loc(#loc343) + %offs_n_59 = arith.addi %offs_n_58, %offs_n : tensor<64xi32> loc(#loc343) + %5 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc67) + %6 = tt.expand_dims %offs_n_59 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc68) + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(20,)cNone_(21,)cNone_(34,)cconstexpr_0__(36,)cconstexpr_bf16__(41,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %q, %K, %V, %ks0, %ks1, %acc, %l_i, %m_i_26, %off_zq, %off_hq, %5, %6, %kv_start_50, %kv_indices, %kv_num_blocks_51, %block_n_end_57, %c1_i32_6, %c128_i32_5, %c128_i32_11, %c1_i32_12) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc69) + %kv_indices_60 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc344) + %kv_start_61 = tt.load %kv_indices_60 : !tt.ptr loc(#loc345) + %kv_start_62 = arith.constant 128 : i32 loc(#loc346) + %kv_start_63 = arith.constant 128 : i32 loc(#loc346) + %kv_start_64 = arith.muli %kv_start_61, %kv_start_63 : i32 loc(#loc346) + %kv_num_blocks_65 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc347) + %kv_num_blocks_66 = tt.load %kv_num_blocks_65 : !tt.ptr loc(#loc348) + %block_n_end_67 = arith.constant 2 : i32 loc(#loc349) + %block_n_end_68 = arith.constant 2 : i32 loc(#loc349) + %block_n_end_69 = arith.muli %kv_num_blocks_66, %block_n_end_68 : i32 loc(#loc349) + %block_n_end_70 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks1) : (i32) -> i32 loc(#loc350) + %block_n_end_71 = arith.constant 1 : i32 loc(#loc351) + %block_n_end_72 = arith.maxsi %block_n_end_70, %block_n_end_71 : i32 loc(#loc351) + %block_n_end_73 = arith.minsi %block_n_end_69, %block_n_end_72 : i32 loc(#loc352) + %offs_n_74 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc353) + %offs_n_75 = tt.splat %kv_start_64 : i32 -> tensor<64xi32> loc(#loc354) + %offs_n_76 = arith.addi %offs_n_75, %offs_n_74 : tensor<64xi32> loc(#loc354) + %8 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc81) + %9 = tt.expand_dims %offs_n_76 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc82) + %10:3 = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(20,)cNone_(21,)cNone_(34,)cconstexpr_0__(36,)cconstexpr_bf16__(41,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %q, %K, %V, %ks0, %ks1, %7#0, %7#1, %7#2, %off_zq, %off_hq, %8, %9, %kv_start_64, %kv_indices_60, %kv_num_blocks_66, %block_n_end_73, %c1_i32_6, %c128_i32_5, %c128_i32_11, %c1_i32_12) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc83) + %l_i_77 = arith.constant 0.000000e+00 : f32 loc(#loc355) + %l_i_78 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc355) + %l_i_79 = arith.cmpf oeq, %10#1, %l_i_78 : tensor<128xf32> loc(#loc355) + %l_i_80 = arith.constant 1 : i32 loc(#loc356) + %l_i_81 = arith.constant 1.000000e+00 : f32 loc(#loc356) + %l_i_82 = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc356) + %l_i_83 = arith.select %l_i_79, %l_i_82, %10#1 : tensor<128xi1>, tensor<128xf32> loc(#loc356) + %acc_84 = tt.expand_dims %l_i_83 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc357) + %acc_85 = tt.broadcast %acc_84 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc358) + %acc_86 = arith.divf %10#0, %acc_85 : tensor<128x128xf32> loc(#loc358) + %idx_zq = tt.get_program_id y : i32 loc(#loc359) + %idx_hq = tt.get_program_id z : i32 loc(#loc360) + %idx_m = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc361) + %idx_d = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc362) + %idx_d_87 = tt.expand_dims %idx_d {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc363) + %mask = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc364) + %mask_88 = arith.cmpi slt, %idx_m, %mask : tensor<128x1xi32> loc(#loc364) + %mask_89 = arith.constant 128 : i32 loc(#loc365) + %mask_90 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc365) + %mask_91 = arith.cmpi slt, %idx_d_87, %mask_90 : tensor<1x128xi32> loc(#loc365) + %mask_92 = tt.broadcast %mask_88 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc366) + %mask_93 = tt.broadcast %mask_91 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc366) + %mask_94 = arith.andi %mask_92, %mask_93 : tensor<128x128xi1> loc(#loc366) + %xindex = arith.constant 128 : i32 loc(#loc367) + %xindex_95 = arith.constant 128 : i32 loc(#loc367) + %xindex_96 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc367) + %xindex_97 = arith.muli %xindex_96, %idx_m : tensor<128x1xi32> loc(#loc367) + %xindex_98 = tt.broadcast %idx_d_87 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc368) + %xindex_99 = tt.broadcast %xindex_97 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc368) + %xindex_100 = arith.addi %xindex_98, %xindex_99 : tensor<128x128xi32> loc(#loc368) + %xindex_101 = arith.constant 128 : i32 loc(#loc369) + %xindex_102 = arith.constant 128 : i32 loc(#loc369) + %xindex_103 = arith.muli %xindex_102, %idx_hq : i32 loc(#loc369) + %xindex_104 = arith.muli %xindex_103, %ks0 : i32 loc(#loc370) + %xindex_105 = tt.splat %xindex_104 : i32 -> tensor<128x128xi32> loc(#loc371) + %xindex_106 = arith.addi %xindex_100, %xindex_105 : tensor<128x128xi32> loc(#loc371) + %xindex_107 = arith.constant 4096 : i32 loc(#loc372) + %xindex_108 = arith.constant 4096 : i32 loc(#loc372) + %xindex_109 = arith.muli %xindex_108, %idx_zq : i32 loc(#loc372) + %xindex_110 = arith.muli %xindex_109, %ks0 : i32 loc(#loc373) + %xindex_111 = tt.splat %xindex_110 : i32 -> tensor<128x128xi32> loc(#loc374) + %xindex_112 = arith.addi %xindex_106, %xindex_111 : tensor<128x128xi32> loc(#loc374) + %c128_i32_113 = arith.constant 128 : i32 loc(#loc104) + %c128_i32_114 = arith.constant 128 : i32 loc(#loc104) + %11 = arith.muli %c128_i32_114, %idx_hq : i32 loc(#loc104) + %12 = tt.splat %11 : i32 -> tensor<1x128xi32> loc(#loc105) + %13 = arith.addi %idx_d_87, %12 : tensor<1x128xi32> loc(#loc105) + %c4096_i32_115 = arith.constant 4096 : i32 loc(#loc106) + %c4096_i32_116 = arith.constant 4096 : i32 loc(#loc106) + %cst = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc106) + %14 = arith.muli %cst, %idx_m : tensor<128x1xi32> loc(#loc106) + %15 = tt.broadcast %13 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc107) + %16 = tt.broadcast %14 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc107) + %17 = arith.addi %15, %16 : tensor<128x128xi32> loc(#loc107) + %c4096_i32_117 = arith.constant 4096 : i32 loc(#loc108) + %c4096_i32_118 = arith.constant 4096 : i32 loc(#loc108) + %18 = arith.muli %c4096_i32_118, %idx_zq : i32 loc(#loc108) + %19 = arith.muli %18, %ks0 : i32 loc(#loc109) + %20 = tt.splat %19 : i32 -> tensor<128x128xi32> loc(#loc110) + %21 = arith.addi %17, %20 : tensor<128x128xi32> loc(#loc110) + %22 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc111) + %23 = tt.addptr %22, %21 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc111) + %24 = arith.truncf %acc_86 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc112) + tt.store %23, %24, %mask_94 : tensor<128x128x!tt.ptr> loc(#loc112) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc375) + %off_hz_119 = arith.addi %off_hz, %off_hq : i32 loc(#loc376) + %l_ptrs = arith.muli %off_hz_119, %ks0 : i32 loc(#loc377) + %l_ptrs_120 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc378) + %l_ptrs_121 = tt.splat %l_ptrs_120 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc379) + %l_ptrs_122 = tt.addptr %l_ptrs_121, %offs_m_47 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc379) + %lse = math.log2 %l_i_83 : tensor<128xf32> loc(#loc380) + %lse_123 = arith.addf %10#2, %lse : tensor<128xf32> loc(#loc381) + %25 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc120) + %26 = arith.cmpi slt, %offs_m_47, %25 : tensor<128xi32> loc(#loc120) + tt.store %l_ptrs_122, %lse_123, %26 : tensor<128x!tt.ptr> loc(#loc121) + tt.return loc(#loc122) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc124) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc124) + tt.return %cst_0 : tensor<128xf32> loc(#loc125) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128xf32> loc(#loc126) + tt.return %0 : tensor<128xf32> loc(#loc126) + } loc(#loc123) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc124) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc124) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc125) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc126) + tt.return %0 : tensor<128x128xf32> loc(#loc126) + } loc(#loc123) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc127)), %offs_m: tensor<128xi32> loc("offs_m"(#loc127)), %offs_n: tensor<128xi32> loc("offs_n"(#loc127)), %stride_m: i32 loc("stride_m"(#loc127)), %stride_n: i32 loc("stride_n"(#loc127)), %M_LEN: i32 loc("M_LEN"(#loc127))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc388) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc389) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc389) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc390) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc390) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc391) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc392) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc392) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc393) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc393) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc393) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc134) + %1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc135) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc135) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc136) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc136) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc136) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc136) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc136) + tt.return %5 : tensor<128x128xbf16> loc(#loc137) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc138) + tt.return %6 : tensor<128x128xbf16> loc(#loc138) + } loc(#loc127) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc139))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc140) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc140) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc140) + %c1_i32 = arith.constant 1 : i32 loc(#loc141) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc141) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc141) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc142) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc142) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc142) + tt.return %2 : i32 loc(#loc143) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc144) + tt.return %3 : i32 loc(#loc144) + } loc(#loc139) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(20,)cNone_(21,)cNone_(34,)cconstexpr_0__(36,)cconstexpr_bf16__(41,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc145)), %arg_K: !tt.ptr loc("arg_K"(#loc145)), %arg_V: !tt.ptr loc("arg_V"(#loc145)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc145)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc145)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc145)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc145)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc145)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc145)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc145)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc145)), %ks0: i32 loc("ks0"(#loc145)), %ks1: i32 loc("ks1"(#loc145)), %ks2: i32 loc("ks2"(#loc145)), %ks3: i32 loc("ks3"(#loc145)), %ks4: i32 loc("ks4"(#loc145)), %ks5: i32 loc("ks5"(#loc145)), %q: tensor<128x128xbf16> loc("q"(#loc145)), %K: !tt.ptr loc("K"(#loc145)), %V: !tt.ptr loc("V"(#loc145)), %Q_LEN: i32 loc("Q_LEN"(#loc145)), %KV_LEN: i32 loc("KV_LEN"(#loc145)), %acc: tensor<128x128xf32> loc("acc"(#loc145)), %l_i: tensor<128xf32> loc("l_i"(#loc145)), %m_i: tensor<128xf32> loc("m_i"(#loc145)), %off_z: i32 loc("off_z"(#loc145)), %off_h: i32 loc("off_h"(#loc145)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc145)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc145)), %kv_start: i32 loc("kv_start"(#loc145)), %kv_indices: !tt.ptr loc("kv_indices"(#loc145)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc145)), %block_n_end: i32 loc("block_n_end"(#loc145)), %stride_kk: i32 loc("stride_kk"(#loc145)), %stride_kn: i32 loc("stride_kn"(#loc145)), %stride_vn: i32 loc("stride_vn"(#loc145)), %stride_vk: i32 loc("stride_vk"(#loc145))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc432) + %c0_i32 = arith.constant 0 : i32 loc(#loc147) + %c1_i32 = arith.constant 1 : i32 loc(#loc147) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc147) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc147) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc147) + %3 = ub.poison : i32 loc(#loc147) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(20,)cconstexpr_None__(21,)cconstexpr_None__(33,)cconstexpr_bf16__(34,)cconstexpr_1_d_44269504__(39,)cconstexpr_False__(40,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc148) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc434) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc435) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc435) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc436) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc152) + } loc(#loc573) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc153) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc154) + %5 = ub.poison : tensor<128xf32> loc(#loc154) + %6 = ub.poison : tensor<128xf32> loc(#loc154) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc154) + } loc(#loc145) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(20,)cconstexpr_None__(21,)cconstexpr_None__(33,)cconstexpr_bf16__(34,)cconstexpr_1_d_44269504__(39,)cconstexpr_False__(40,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc155)), %arg_K: !tt.ptr loc("arg_K"(#loc155)), %arg_V: !tt.ptr loc("arg_V"(#loc155)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc155)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc155)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc155)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc155)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc155)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc155)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc155)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc155)), %ks0: i32 loc("ks0"(#loc155)), %ks1: i32 loc("ks1"(#loc155)), %ks2: i32 loc("ks2"(#loc155)), %ks3: i32 loc("ks3"(#loc155)), %ks4: i32 loc("ks4"(#loc155)), %ks5: i32 loc("ks5"(#loc155)), %q: tensor<128x128xbf16> loc("q"(#loc155)), %K: !tt.ptr loc("K"(#loc155)), %V: !tt.ptr loc("V"(#loc155)), %Q_LEN: i32 loc("Q_LEN"(#loc155)), %KV_LEN: i32 loc("KV_LEN"(#loc155)), %acc: tensor<128x128xf32> loc("acc"(#loc155)), %l_i: tensor<128xf32> loc("l_i"(#loc155)), %m_i: tensor<128xf32> loc("m_i"(#loc155)), %off_z: i32 loc("off_z"(#loc155)), %off_h: i32 loc("off_h"(#loc155)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc155)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc155)), %kv_start: i32 loc("kv_start"(#loc155)), %kv_offset: i32 loc("kv_offset"(#loc155)), %stride_kk: i32 loc("stride_kk"(#loc155)), %stride_kn: i32 loc("stride_kn"(#loc155)), %stride_vn: i32 loc("stride_vn"(#loc155)), %stride_vk: i32 loc("stride_vk"(#loc155))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc472) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc473) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc474) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc475) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc475) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc476) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc477) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc478) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc478) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc478) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc479) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc479) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc479) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc479) + %m = tt.call @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc480) + %n = tt.call @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc481) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc482) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc482) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc483) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc483) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc483) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc483) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc483) + %tmp1 = arith.constant false loc(#loc484) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc484) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc485) + %tmp4_16 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc485) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc485) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc486) + %tmp7 = tt.addptr %in_ptr9, %off_z : !tt.ptr, i32 loc(#loc487) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc488) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc489) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc489) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc490) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> loc(#loc491) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc491) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc492) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc492) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc492) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc493) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc494) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc494) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc494) + %tmp15 = tt.splat %ks5 : i32 -> tensor<1x64xi32> loc(#loc495) + %tmp15_25 = arith.cmpi sge, %n, %tmp15 : tensor<1x64xi32> loc(#loc495) + %tmp16 = tt.splat %ks5 : i32 -> tensor<1x64xi32> loc(#loc496) + %tmp16_26 = arith.remsi %n, %tmp16 : tensor<1x64xi32> loc(#loc496) + %tmp17 = arith.constant 0 : i32 loc(#loc497) + %tmp17_27 = arith.constant dense<0> : tensor<1xi32> loc(#loc497) + %tmp18 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc498) + %tmp18_28 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc498) + %tmp18_29 = arith.cmpi ne, %tmp16_26, %tmp18_28 : tensor<1x64xi32> loc(#loc498) + %tmp19 = arith.constant 0 : i32 loc(#loc499) + %tmp19_30 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc499) + %tmp19_31 = arith.cmpi slt, %tmp16_26, %tmp19_30 : tensor<1x64xi32> loc(#loc499) + %tmp20 = arith.constant 0 : i32 loc(#loc500) + %tmp20_32 = arith.cmpi slt, %ks5, %tmp20 : i32 loc(#loc500) + %tmp21 = tt.splat %tmp20_32 : i1 -> tensor<1x64xi1> loc(#loc501) + %tmp21_33 = arith.cmpi ne, %tmp19_31, %tmp21 : tensor<1x64xi1> loc(#loc501) + %tmp22 = arith.andi %tmp18_29, %tmp21_33 : tensor<1x64xi1> loc(#loc502) + %tmp23 = tt.splat %ks5 : i32 -> tensor<1x64xi32> loc(#loc503) + %tmp23_34 = arith.addi %tmp16_26, %tmp23 : tensor<1x64xi32> loc(#loc503) + %tmp24 = arith.select %tmp22, %tmp23_34, %tmp16_26 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc504) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc505) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc506) + %tmp26_35 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc506) + %tmp27 = arith.andi %tmp15_25, %tmp26_35 : tensor<1x64xi1> loc(#loc507) + %tmp28 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc508) + %tmp28_36 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc508) + %tmp28_37 = arith.subi %tmp28, %tmp28_36 : tensor<128x64xi32> loc(#loc508) + %tmp29 = tt.splat %ks5 : i32 -> tensor<128x64xi32> loc(#loc509) + %tmp29_38 = arith.remsi %tmp28_37, %tmp29 : tensor<128x64xi32> loc(#loc509) + %tmp30 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc510) + %tmp30_39 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc510) + %tmp30_40 = arith.cmpi ne, %tmp29_38, %tmp30_39 : tensor<128x64xi32> loc(#loc510) + %tmp31 = arith.constant 0 : i32 loc(#loc511) + %tmp31_41 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc511) + %tmp31_42 = arith.cmpi slt, %tmp29_38, %tmp31_41 : tensor<128x64xi32> loc(#loc511) + %tmp32 = tt.splat %tmp20_32 : i1 -> tensor<128x64xi1> loc(#loc512) + %tmp32_43 = arith.cmpi ne, %tmp31_42, %tmp32 : tensor<128x64xi1> loc(#loc512) + %tmp33 = arith.andi %tmp30_40, %tmp32_43 : tensor<128x64xi1> loc(#loc513) + %tmp34 = tt.splat %ks5 : i32 -> tensor<128x64xi32> loc(#loc514) + %tmp34_44 = arith.addi %tmp29_38, %tmp34 : tensor<128x64xi32> loc(#loc514) + %tmp35 = arith.select %tmp33, %tmp34_44, %tmp29_38 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc515) + %tmp36 = tt.expand_dims %tmp17_27 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc516) + %tmp36_45 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc516) + %tmp36_46 = arith.cmpi eq, %tmp35, %tmp36_45 : tensor<128x64xi32> loc(#loc516) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc517) + %tmp37_47 = arith.andi %tmp37, %tmp36_46 : tensor<128x64xi1> loc(#loc517) + %tmp38 = arith.ori %tmp13_24, %tmp37_47 : tensor<128x64xi1> loc(#loc518) + %mask_mod_output = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc519) + %mask_mod_output_48 = arith.cmpi slt, %offs_n, %mask_mod_output : tensor<1x64xi32> loc(#loc519) + %mask_mod_output_49 = arith.constant false loc(#loc520) + %mask_mod_output_50 = arith.constant false loc(#loc520) + %mask_mod_output_51 = arith.constant dense : tensor<128x64xi1> loc(#loc520) + %mask_mod_output_52 = tt.broadcast %mask_mod_output_48 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc520) + %mask_mod_output_53 = arith.select %mask_mod_output_52, %tmp38, %mask_mod_output_51 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc520) + %post_mod_scores_54 = arith.constant 0xFF800000 : f32 loc(#loc521) + %post_mod_scores_55 = arith.constant 0xFF800000 : f32 loc(#loc521) + %post_mod_scores_56 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc521) + %post_mod_scores_57 = arith.select %mask_mod_output_53, %post_mod_scores_14, %post_mod_scores_56 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc521) + %post_mod_scores_58 = arith.constant 1.44269502 : f32 loc(#loc522) + %post_mod_scores_59 = arith.constant 1.44269502 : f32 loc(#loc522) + %post_mod_scores_60 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc522) + %post_mod_scores_61 = arith.mulf %post_mod_scores_57, %post_mod_scores_60 : tensor<128x64xf32> loc(#loc522) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_61) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc523) + %m_ij_62 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc524) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc525) + %masked_out_rows_63 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc525) + %masked_out_rows_64 = arith.cmpf oeq, %m_ij_62, %masked_out_rows_63 : tensor<128xf32> loc(#loc525) + %m_ij_masked = arith.constant 0 : i32 loc(#loc526) + %m_ij_masked_65 = arith.constant 0.000000e+00 : f32 loc(#loc526) + %m_ij_masked_66 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc526) + %m_ij_masked_67 = arith.select %masked_out_rows_64, %m_ij_masked_66, %m_ij_62 : tensor<128xi1>, tensor<128xf32> loc(#loc526) + %alpha = arith.subf %m_i, %m_ij_masked_67 : tensor<128xf32> loc(#loc527) + %alpha_68 = math.exp2 %alpha : tensor<128xf32> loc(#loc528) + %p = tt.expand_dims %m_ij_masked_67 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc529) + %p_69 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc530) + %p_70 = arith.subf %post_mod_scores_61, %p_69 : tensor<128x64xf32> loc(#loc530) + %p_71 = math.exp2 %p_70 : tensor<128x64xf32> loc(#loc531) + %l_i_72 = arith.mulf %l_i, %alpha_68 : tensor<128xf32> loc(#loc532) + %l_i_73 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_71) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc533) + %l_i_74 = arith.addf %l_i_72, %l_i_73 : tensor<128xf32> loc(#loc534) + %acc_75 = tt.expand_dims %alpha_68 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc535) + %acc_76 = tt.broadcast %acc_75 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc536) + %acc_77 = arith.mulf %acc, %acc_76 : tensor<128x128xf32> loc(#loc536) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc537) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc538) + %acc_78 = arith.truncf %p_71 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc539) + %acc_79 = arith.constant 0.000000e+00 : f32 loc(#loc540) + %acc_80 = tt.dot %acc_78, %v, %acc_77, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc540) + tt.return %acc_80, %l_i_74, %m_ij_62 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc226) + %1 = ub.poison : tensor<128xf32> loc(#loc226) + %2 = ub.poison : tensor<128xf32> loc(#loc226) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc226) + } loc(#loc155) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc127)), %offs_m: tensor<64xi32> loc("offs_m"(#loc127)), %offs_n: tensor<128xi32> loc("offs_n"(#loc127)), %stride_m: i32 loc("stride_m"(#loc127)), %stride_n: i32 loc("stride_n"(#loc127)), %M_LEN: i32 loc("M_LEN"(#loc127))) -> tensor<64x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc388) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<64x1xi32> loc(#loc389) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<64x1xi32> loc(#loc389) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc390) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc390) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc391) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc392) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc392) + %ptr_8 = tt.broadcast %ptr_4 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc393) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc393) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc393) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc134) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc135) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc135) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc136) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc136) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc136) + %4 = arith.truncf %cst_11 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc136) + %5 = tt.load %ptr_10, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc136) + tt.return %5 : tensor<64x128xbf16> loc(#loc137) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc138) + tt.return %6 : tensor<64x128xbf16> loc(#loc138) + } loc(#loc127) + tt.func private @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc227)), %max_len: i32 loc("max_len"(#loc227))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc228) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc228) + tt.return %1 : tensor<128x1xi32> loc(#loc229) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc230) + tt.return %2 : tensor<128x1xi32> loc(#loc230) + } loc(#loc227) + tt.func private @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc227)), %max_len: i32 loc("max_len"(#loc227))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc228) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc228) + tt.return %1 : tensor<1x64xi32> loc(#loc229) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc230) + tt.return %2 : tensor<1x64xi32> loc(#loc230) + } loc(#loc227) + tt.func private @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<128x64xf32> loc("input"(#loc231))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc232) + tt.reduce.return %2 : f32 loc(#loc232) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc232) + tt.return %0 : tensor<128xf32> loc(#loc234) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc235) + tt.return %1 : tensor<128xf32> loc(#loc235) + } loc(#loc231) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc236)), %b: f32 loc("b"(#loc236))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc237) + tt.return %0 : f32 loc(#loc238) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc239) + tt.return %1 : f32 loc(#loc239) + } loc(#loc236) + tt.func private @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x64xf32> loc("input"(#loc240))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc241) + tt.reduce.return %2 : f32 loc(#loc241) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc241) + tt.return %0 : tensor<128xf32> loc(#loc242) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc243) + tt.return %1 : tensor<128xf32> loc(#loc243) + } loc(#loc240) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc244)), %b: f32 loc("b"(#loc244))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc245) + tt.return %0 : f32 loc(#loc246) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc247) + tt.return %1 : f32 loc(#loc247) + } loc(#loc244) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc248)), %col_indices: !tt.ptr loc("col_indices"(#loc248)), %total_blocks: i32 loc("total_blocks"(#loc248))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc552) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc552) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc552) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc553) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc554) + %next_block = arith.constant 1 : i32 loc(#loc555) + %next_block_3 = arith.constant 1 : i32 loc(#loc555) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc555) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc556) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc557) + %next_block_7 = arith.constant 1 : i32 loc(#loc558) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc558) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc559) + %needs_jump = arith.constant 1 : i32 loc(#loc560) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc560) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc560) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc561) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc561) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc561) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc562) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc562) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc563) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc564) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc564) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc564) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc565) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc565) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc565) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc566) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc566) + %offset_24 = arith.constant 1 : i32 loc(#loc567) + %offset_25 = arith.constant 1 : i32 loc(#loc567) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc567) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc567) + %offset_28 = arith.constant 64 : i32 loc(#loc568) + %offset_29 = arith.constant 64 : i32 loc(#loc568) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc568) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc569) + tt.return %offset_31 : i32 loc(#loc267) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc268) + tt.return %0 : i32 loc(#loc268) + } loc(#loc248) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(20,)cNone_(21,)cNone_(34,)cconstexpr_0__(36,)cconstexpr_bf16__(41,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc145)), %arg_K: !tt.ptr loc("arg_K"(#loc145)), %arg_V: !tt.ptr loc("arg_V"(#loc145)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc145)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc145)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc145)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc145)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc145)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc145)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc145)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc145)), %ks0: i32 loc("ks0"(#loc145)), %ks1: i32 loc("ks1"(#loc145)), %ks2: i32 loc("ks2"(#loc145)), %ks3: i32 loc("ks3"(#loc145)), %ks4: i32 loc("ks4"(#loc145)), %ks5: i32 loc("ks5"(#loc145)), %q: tensor<128x128xbf16> loc("q"(#loc145)), %K: !tt.ptr loc("K"(#loc145)), %V: !tt.ptr loc("V"(#loc145)), %Q_LEN: i32 loc("Q_LEN"(#loc145)), %KV_LEN: i32 loc("KV_LEN"(#loc145)), %acc: tensor<128x128xf32> loc("acc"(#loc145)), %l_i: tensor<128xf32> loc("l_i"(#loc145)), %m_i: tensor<128xf32> loc("m_i"(#loc145)), %off_z: i32 loc("off_z"(#loc145)), %off_h: i32 loc("off_h"(#loc145)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc145)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc145)), %kv_start: i32 loc("kv_start"(#loc145)), %kv_indices: !tt.ptr loc("kv_indices"(#loc145)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc145)), %block_n_end: i32 loc("block_n_end"(#loc145)), %stride_kk: i32 loc("stride_kk"(#loc145)), %stride_kn: i32 loc("stride_kn"(#loc145)), %stride_vn: i32 loc("stride_vn"(#loc145)), %stride_vk: i32 loc("stride_vk"(#loc145))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc432) + %c0_i32 = arith.constant 0 : i32 loc(#loc147) + %c1_i32 = arith.constant 1 : i32 loc(#loc147) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc147) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc147) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc147) + %3 = ub.poison : i32 loc(#loc147) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(20,)cconstexpr_None__(21,)cconstexpr_None__(33,)cconstexpr_bf16__(34,)cconstexpr_1_d_44269504__(39,)cconstexpr_True__(40,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %ks2, %ks3, %ks4, %ks5, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, i32, i32, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc148) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc434) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc435) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc435) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc436) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc152) + } loc(#loc573) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc153) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc154) + %5 = ub.poison : tensor<128xf32> loc(#loc154) + %6 = ub.poison : tensor<128xf32> loc(#loc154) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc154) + } loc(#loc145) + tt.func private @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_i32_i32_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(20,)cconstexpr_None__(21,)cconstexpr_None__(33,)cconstexpr_bf16__(34,)cconstexpr_1_d_44269504__(39,)cconstexpr_True__(40,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc155)), %arg_K: !tt.ptr loc("arg_K"(#loc155)), %arg_V: !tt.ptr loc("arg_V"(#loc155)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc155)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc155)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc155)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc155)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc155)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc155)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc155)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc155)), %ks0: i32 loc("ks0"(#loc155)), %ks1: i32 loc("ks1"(#loc155)), %ks2: i32 loc("ks2"(#loc155)), %ks3: i32 loc("ks3"(#loc155)), %ks4: i32 loc("ks4"(#loc155)), %ks5: i32 loc("ks5"(#loc155)), %q: tensor<128x128xbf16> loc("q"(#loc155)), %K: !tt.ptr loc("K"(#loc155)), %V: !tt.ptr loc("V"(#loc155)), %Q_LEN: i32 loc("Q_LEN"(#loc155)), %KV_LEN: i32 loc("KV_LEN"(#loc155)), %acc: tensor<128x128xf32> loc("acc"(#loc155)), %l_i: tensor<128xf32> loc("l_i"(#loc155)), %m_i: tensor<128xf32> loc("m_i"(#loc155)), %off_z: i32 loc("off_z"(#loc155)), %off_h: i32 loc("off_h"(#loc155)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc155)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc155)), %kv_start: i32 loc("kv_start"(#loc155)), %kv_offset: i32 loc("kv_offset"(#loc155)), %stride_kk: i32 loc("stride_kk"(#loc155)), %stride_kn: i32 loc("stride_kn"(#loc155)), %stride_vn: i32 loc("stride_vn"(#loc155)), %stride_vk: i32 loc("stride_vk"(#loc155))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc472) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc473) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc474) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc475) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc475) + %k = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc476) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc477) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc478) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc478) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc478) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc479) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc479) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc479) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc479) + %m = tt.call @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc480) + %n = tt.call @torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc481) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc482) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc482) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc483) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc483) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc483) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc483) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc483) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc522) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc522) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc522) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc522) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_18) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc523) + %m_ij_19 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc524) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc525) + %masked_out_rows_20 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc525) + %masked_out_rows_21 = arith.cmpf oeq, %m_ij_19, %masked_out_rows_20 : tensor<128xf32> loc(#loc525) + %m_ij_masked = arith.constant 0 : i32 loc(#loc526) + %m_ij_masked_22 = arith.constant 0.000000e+00 : f32 loc(#loc526) + %m_ij_masked_23 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc526) + %m_ij_masked_24 = arith.select %masked_out_rows_21, %m_ij_masked_23, %m_ij_19 : tensor<128xi1>, tensor<128xf32> loc(#loc526) + %alpha = arith.subf %m_i, %m_ij_masked_24 : tensor<128xf32> loc(#loc527) + %alpha_25 = math.exp2 %alpha : tensor<128xf32> loc(#loc528) + %p = tt.expand_dims %m_ij_masked_24 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc529) + %p_26 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc530) + %p_27 = arith.subf %post_mod_scores_18, %p_26 : tensor<128x64xf32> loc(#loc530) + %p_28 = math.exp2 %p_27 : tensor<128x64xf32> loc(#loc531) + %l_i_29 = arith.mulf %l_i, %alpha_25 : tensor<128xf32> loc(#loc532) + %l_i_30 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_28) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc533) + %l_i_31 = arith.addf %l_i_29, %l_i_30 : tensor<128xf32> loc(#loc534) + %acc_32 = tt.expand_dims %alpha_25 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc535) + %acc_33 = tt.broadcast %acc_32 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc536) + %acc_34 = arith.mulf %acc, %acc_33 : tensor<128x128xf32> loc(#loc536) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc537) + %v = tt.call @"torch._inductor.runtime.compile_tasks.c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc538) + %acc_35 = arith.truncf %p_28 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc539) + %acc_36 = arith.constant 0.000000e+00 : f32 loc(#loc540) + %acc_37 = tt.dot %acc_35, %v, %acc_34, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc540) + tt.return %acc_37, %l_i_31, %m_ij_19 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc226) + %1 = ub.poison : tensor<128xf32> loc(#loc226) + %2 = ub.poison : tensor<128xf32> loc(#loc226) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc226) + } loc(#loc155) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":85:54) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":85:49) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:49) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":87:54) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":87:63) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":87:49) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":89:9) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":90:9) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":92:10) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":97:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":98:27) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":99:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":103:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":104:24) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":105:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:36) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":109:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":109:47) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":109:37) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":111:12) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":112:12) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":113:12) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":120:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":121:16) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":123:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":124:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":130:26) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":134:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":134:50) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":135:19) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":136:19) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":138:23) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":138:46) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":138:33) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":141:38) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":141:50) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:51) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:85) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:74) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:46) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:76) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:97) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:64) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:23) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:46) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:33) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":145:26) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":146:101) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":151:26) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:23) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:37) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:42) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:28) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:45) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:92) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:102) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:65) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:37) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:24) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":167:31) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":167:48) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":172:41) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":181:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:27) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:41) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:51) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:49) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:96) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:106) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:69) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":186:41) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":186:28) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":193:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":193:52) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":198:45) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:26) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:34) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:16) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":209:27) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":210:27) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":211:19) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":212:25) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":212:45) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:38) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:30) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:25) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:21) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:37) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:44) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:33) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:55) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:62) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":217:50) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:53) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:49) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:67) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:62) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:80) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:87) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:75) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:110) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:26) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:31) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:32) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:23) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:40) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:33) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:20) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:48) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:29) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":229:4) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:27) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:38) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:20) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:56) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:67) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:49) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:41) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:52) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:23) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:15) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":287:4) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":499:16) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":502:40) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":538:16) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":545:63) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":548:26) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":549:21) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":549:8) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":552:11) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":552:4) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":342:32) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":345:26) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":346:48) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":346:35) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":347:107) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":349:17) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":351:19) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":353:14) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":358:36) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":359:36) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:44) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:69) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":370:35) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":373:23) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":374:23) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:33) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:23) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":377:22) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":378:23) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":379:23) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":380:23) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":381:23) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":382:23) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":384:24) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":385:24) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":386:32) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":387:25) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":388:92) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":389:92) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":390:25) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":391:24) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":392:24) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":393:39) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":394:25) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":395:24) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":396:24) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":397:23) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":398:25) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":399:25) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":400:92) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":401:25) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":402:24) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":403:24) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":404:39) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":405:25) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":406:24) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":407:24) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":412:48) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":412:73) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":414:69) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":417:27) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:51) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:27) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":423:35) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":424:51) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:31) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:25) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:51) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:39) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:21) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:16) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:34) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:24) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:22) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:16) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":438:26) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":439:107) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:22) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:44) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":445:11) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":445:4) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":257:21) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":257:11) +#loc230 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":257:4) +#loc232 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc237 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc238 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc239 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc241 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc242 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc243 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc245 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc246 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc247 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":247:33) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:38) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:24) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:109) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:113) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:39) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:55) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:25) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:30) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:35) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:60) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:34) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:48) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:63) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:29) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:47) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:61) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:42) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":253:11) +#loc268 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":253:4) +#loc286 = loc("ZQ"(#loc9)) +#loc287 = loc("HQ"(#loc10)) +#loc288 = loc("ZKV"(#loc11)) +#loc289 = loc("q_start"(#loc12)) +#loc290 = loc("off_zq"(#loc13)) +#loc291 = loc("off_hq"(#loc14)) +#loc292 = loc("off_zkv"(#loc15)) +#loc293 = loc("off_hkv"(#loc16)) +#loc294 = loc("off_g"(#loc17)) +#loc295 = loc("q_offset"(#loc18)) +#loc296 = loc("q_offset"(#loc19)) +#loc297 = loc("q_offset"(#loc20)) +#loc298 = loc("k_offset"(#loc21)) +#loc299 = loc("k_offset"(#loc22)) +#loc300 = loc("k_offset"(#loc23)) +#loc301 = loc("v_offset"(#loc24)) +#loc302 = loc("v_offset"(#loc25)) +#loc303 = loc("v_offset"(#loc26)) +#loc304 = loc("Q"(#loc27)) +#loc305 = loc("K"(#loc28)) +#loc306 = loc("V"(#loc29)) +#loc307 = loc("SPARSE_Z"(#loc30)) +#loc308 = loc("SPARSE_HQ"(#loc31)) +#loc309 = loc("sparse_idx_z"(#loc32)) +#loc310 = loc("sparse_idx_hq"(#loc33)) +#loc311 = loc("stride_kv_idx_h"(#loc34)) +#loc312 = loc("m_i"(#loc35)) +#loc313 = loc("m_i"(#loc36)) +#loc314 = loc("l_i"(#loc37)) +#loc315 = loc("acc"(#loc38)) +#loc316 = loc("offs_m"(#loc39)) +#loc317 = loc("offs_m"(#loc40)) +#loc318 = loc("offs_m"(#loc41)) +#loc319 = loc("sparse_hz_offset"(#loc42)) +#loc320 = loc("sparse_hz_offset"(#loc43)) +#loc321 = loc("sparse_kv_num_blks_offset"(#loc44)) +#loc322 = loc("sparse_kv_num_blks_offset"(#loc45)) +#loc323 = loc("sparse_kv_num_blks_offset"(#loc46)) +#loc324 = loc("sparse_kv_idx_offset"(#loc47)) +#loc325 = loc("sparse_kv_idx_offset"(#loc48)) +#loc326 = loc("sparse_kv_idx_offset"(#loc49)) +#loc327 = loc("sparse_kv_idx_offset"(#loc50)) +#loc328 = loc("offs_m"(#loc51)) +#loc329 = loc("offs_m"(#loc52)) +#loc330 = loc("offs_m"(#loc53)) +#loc331 = loc("offs_k"(#loc54)) +#loc332 = loc("q"(#loc55)) +#loc333 = loc("kv_indices"(#loc56)) +#loc334 = loc("kv_start"(#loc57)) +#loc335 = loc("kv_start"(#loc58)) +#loc336 = loc("kv_num_blocks"(#loc59)) +#loc337 = loc("kv_num_blocks"(#loc60)) +#loc338 = loc("block_n_end"(#loc61)) +#loc339 = loc("block_n_end"(#loc62)) +#loc340 = loc("block_n_end"(#loc63)) +#loc341 = loc("block_n_end"(#loc64)) +#loc342 = loc("offs_n"(#loc65)) +#loc343 = loc("offs_n"(#loc66)) +#loc344 = loc("kv_indices"(#loc70)) +#loc345 = loc("kv_start"(#loc71)) +#loc346 = loc("kv_start"(#loc72)) +#loc347 = loc("kv_num_blocks"(#loc73)) +#loc348 = loc("kv_num_blocks"(#loc74)) +#loc349 = loc("block_n_end"(#loc75)) +#loc350 = loc("block_n_end"(#loc76)) +#loc351 = loc("block_n_end"(#loc77)) +#loc352 = loc("block_n_end"(#loc78)) +#loc353 = loc("offs_n"(#loc79)) +#loc354 = loc("offs_n"(#loc80)) +#loc355 = loc("l_i"(#loc84)) +#loc356 = loc("l_i"(#loc85)) +#loc357 = loc("acc"(#loc86)) +#loc358 = loc("acc"(#loc87)) +#loc359 = loc("idx_zq"(#loc88)) +#loc360 = loc("idx_hq"(#loc89)) +#loc361 = loc("idx_m"(#loc90)) +#loc362 = loc("idx_d"(#loc91)) +#loc363 = loc("idx_d"(#loc92)) +#loc364 = loc("mask"(#loc93)) +#loc365 = loc("mask"(#loc94)) +#loc366 = loc("mask"(#loc95)) +#loc367 = loc("xindex"(#loc96)) +#loc368 = loc("xindex"(#loc97)) +#loc369 = loc("xindex"(#loc98)) +#loc370 = loc("xindex"(#loc99)) +#loc371 = loc("xindex"(#loc100)) +#loc372 = loc("xindex"(#loc101)) +#loc373 = loc("xindex"(#loc102)) +#loc374 = loc("xindex"(#loc103)) +#loc375 = loc("off_hz"(#loc113)) +#loc376 = loc("off_hz"(#loc114)) +#loc377 = loc("l_ptrs"(#loc115)) +#loc378 = loc("l_ptrs"(#loc116)) +#loc379 = loc("l_ptrs"(#loc117)) +#loc380 = loc("lse"(#loc118)) +#loc381 = loc("lse"(#loc119)) +#loc388 = loc("ptr"(#loc128)) +#loc389 = loc("ptr"(#loc129)) +#loc390 = loc("ptr"(#loc130)) +#loc391 = loc("ptr"(#loc131)) +#loc392 = loc("ptr"(#loc132)) +#loc393 = loc("ptr"(#loc133)) +#loc432 = loc("kv_offset"(#loc146)) +#loc433 = loc("acc"(#loc147)) +#loc434 = loc("offset"(#loc149)) +#loc435 = loc("offs_n"(#loc150)) +#loc436 = loc("kv_offset"(#loc151)) +#loc472 = loc("kv_base_offset"(#loc156)) +#loc473 = loc("offs_k"(#loc157)) +#loc474 = loc("offs_n_load"(#loc158)) +#loc475 = loc("offs_n_load"(#loc159)) +#loc476 = loc("k"(#loc160)) +#loc477 = loc("k"(#loc161)) +#loc478 = loc("qk"(#loc162)) +#loc479 = loc("qk"(#loc163)) +#loc480 = loc("m"(#loc164)) +#loc481 = loc("n"(#loc165)) +#loc482 = loc("post_mod_scores"(#loc166)) +#loc483 = loc("post_mod_scores"(#loc167)) +#loc484 = loc("tmp1"(#loc168)) +#loc485 = loc("tmp4"(#loc169)) +#loc486 = loc("tmp5"(#loc170)) +#loc487 = loc("tmp7"(#loc171)) +#loc488 = loc("tmp7"(#loc172)) +#loc489 = loc("tmp8"(#loc173)) +#loc490 = loc("tmp9"(#loc174)) +#loc491 = loc("tmp10"(#loc175)) +#loc492 = loc("tmp11"(#loc176)) +#loc493 = loc("tmp12"(#loc177)) +#loc494 = loc("tmp13"(#loc178)) +#loc495 = loc("tmp15"(#loc179)) +#loc496 = loc("tmp16"(#loc180)) +#loc497 = loc("tmp17"(#loc181)) +#loc498 = loc("tmp18"(#loc182)) +#loc499 = loc("tmp19"(#loc183)) +#loc500 = loc("tmp20"(#loc184)) +#loc501 = loc("tmp21"(#loc185)) +#loc502 = loc("tmp22"(#loc186)) +#loc503 = loc("tmp23"(#loc187)) +#loc504 = loc("tmp24"(#loc188)) +#loc505 = loc("tmp25"(#loc189)) +#loc506 = loc("tmp26"(#loc190)) +#loc507 = loc("tmp27"(#loc191)) +#loc508 = loc("tmp28"(#loc192)) +#loc509 = loc("tmp29"(#loc193)) +#loc510 = loc("tmp30"(#loc194)) +#loc511 = loc("tmp31"(#loc195)) +#loc512 = loc("tmp32"(#loc196)) +#loc513 = loc("tmp33"(#loc197)) +#loc514 = loc("tmp34"(#loc198)) +#loc515 = loc("tmp35"(#loc199)) +#loc516 = loc("tmp36"(#loc200)) +#loc517 = loc("tmp37"(#loc201)) +#loc518 = loc("tmp38"(#loc202)) +#loc519 = loc("mask_mod_output"(#loc203)) +#loc520 = loc("mask_mod_output"(#loc204)) +#loc521 = loc("post_mod_scores"(#loc205)) +#loc522 = loc("post_mod_scores"(#loc206)) +#loc523 = loc("m_ij"(#loc207)) +#loc524 = loc("m_ij"(#loc208)) +#loc525 = loc("masked_out_rows"(#loc209)) +#loc526 = loc("m_ij_masked"(#loc210)) +#loc527 = loc("alpha"(#loc211)) +#loc528 = loc("alpha"(#loc212)) +#loc529 = loc("p"(#loc213)) +#loc530 = loc("p"(#loc214)) +#loc531 = loc("p"(#loc215)) +#loc532 = loc("l_i"(#loc216)) +#loc533 = loc("l_i"(#loc217)) +#loc534 = loc("l_i"(#loc218)) +#loc535 = loc("acc"(#loc219)) +#loc536 = loc("acc"(#loc220)) +#loc537 = loc("offs_v"(#loc221)) +#loc538 = loc("v"(#loc222)) +#loc539 = loc("acc"(#loc223)) +#loc540 = loc("acc"(#loc224)) +#loc552 = loc("cur_block_idx"(#loc249)) +#loc553 = loc("cur_block"(#loc250)) +#loc554 = loc("cur_block"(#loc251)) +#loc555 = loc("next_block"(#loc252)) +#loc556 = loc("next_block"(#loc253)) +#loc557 = loc("next_block"(#loc254)) +#loc558 = loc("next_block"(#loc255)) +#loc559 = loc("next_block"(#loc256)) +#loc560 = loc("needs_jump"(#loc257)) +#loc561 = loc("needs_jump"(#loc258)) +#loc562 = loc("needs_jump"(#loc259)) +#loc563 = loc("jump_to_block"(#loc260)) +#loc564 = loc("jump_to_block"(#loc261)) +#loc565 = loc("jump_to_block"(#loc262)) +#loc566 = loc("offset"(#loc263)) +#loc567 = loc("offset"(#loc264)) +#loc568 = loc("offset"(#loc265)) +#loc569 = loc("offset"(#loc266)) +#loc570 = loc("l_i"(#loc433)) +#loc571 = loc("m_i"(#loc570)) +#loc572 = loc("offs_n"(#loc571)) +#loc573 = loc("kv_offset"(#loc572)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7647d945e03b6d1cc8c35083242a478d74de95ba --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttgir @@ -0,0 +1,1004 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":18:0) +#loc1 = loc(unknown) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":538:16) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":172:41) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:51) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:34) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":198:45) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#smem = #ttg.shared_memory +#loc173 = loc("arg_Q"(#loc)) +#loc174 = loc("arg_K"(#loc)) +#loc175 = loc("arg_V"(#loc)) +#loc176 = loc("arg_LSE"(#loc)) +#loc177 = loc("arg_MAX"(#loc)) +#loc178 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc179 = loc("arg_KV_IDX"(#loc)) +#loc180 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc181 = loc("arg_FULL_KV_IDX"(#loc)) +#loc182 = loc("in_ptr9"(#loc)) +#loc183 = loc("out_ptr0"(#loc)) +#loc184 = loc("ks0"(#loc)) +#loc185 = loc("ks1"(#loc)) +#loc186 = loc("ks2"(#loc)) +#loc187 = loc("ks3"(#loc)) +#loc188 = loc("ks4"(#loc)) +#loc189 = loc("ks5"(#loc)) +#loc231 = loc(callsite(#loc51 at #loc52)) +#loc278 = loc("m_ij"(#loc101)) +#loc288 = loc("l_i"(#loc113)) +#loc322 = loc(callsite(#loc51 at #loc149)) +#loc392 = loc(callsite(#loc278 at #loc231)) +#loc402 = loc(callsite(#loc288 at #loc231)) +#loc421 = loc(callsite(#loc278 at #loc322)) +#loc431 = loc(callsite(#loc288 at #loc322)) +#loc453 = loc(callsite(#loc1 at #loc392)) +#loc455 = loc(callsite(#loc1 at #loc402)) +#loc483 = loc(callsite(#loc1 at #loc421)) +#loc485 = loc(callsite(#loc1 at #loc431)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense : tensor<128x64xi1, #mma> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_4 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_9 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_11 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_13 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_14 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc2) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc3) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc4) + %q_start = tt.get_program_id x : i32 loc(#loc190) + %off_zq = tt.get_program_id y : i32 loc(#loc191) + %off_hq = tt.get_program_id z : i32 loc(#loc192) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc193) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc194) + %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc195) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc196) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc197) + %k_offset = arith.muli %off_zkv, %1 : i32 loc(#loc198) + %k_offset_17 = arith.muli %off_hkv, %2 : i32 loc(#loc199) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc200) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc201) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc202) + %V = tt.addptr %arg_V, %k_offset_18 : !tt.ptr, i32 loc(#loc203) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc204) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc205) + %sparse_kv_num_blks_offset_19 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc206) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc207) + %sparse_kv_idx_offset_20 = arith.muli %q_start, %ks4 : i32 loc(#loc208) + %sparse_kv_idx_offset_21 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_20 : i32 loc(#loc209) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc210) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc211) + %offs_m_23 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc211) + %offs_m_24 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc211) + %offs_m_25 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc212) + %offs_m_26 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc212) + %offs_m_27 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc212) + %offs_m_28 = arith.addi %offs_m_25, %offs_m_22 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc212) + %offs_m_29 = arith.addi %offs_m_26, %offs_m_23 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc212) + %offs_m_30 = arith.addi %offs_m_27, %offs_m_24 : tensor<128xi32, #blocked1> loc(#loc212) + %ptr = tt.expand_dims %offs_m_28 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc336) + %ptr_31 = tt.expand_dims %offs_m_29 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc336) + %ptr_32 = arith.muli %ptr, %cst_4 : tensor<128x1xi32, #blocked> loc(#loc337) + %ptr_33 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc338) + %ptr_34 = tt.addptr %ptr_33, %ptr_32 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc338) + %ptr_35 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc339) + %ptr_36 = tt.expand_dims %ptr_35 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc339) + %ptr_37 = tt.broadcast %ptr_34 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc340) + %ptr_38 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc340) + %ptr_39 = tt.addptr %ptr_37, %ptr_38 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc340) + %q = tt.splat %ks0 : i32 -> tensor<128x1xi32, #blocked> loc(#loc341) + %q_40 = tt.splat %ks0 : i32 -> tensor<128x1xi32, #mma> loc(#loc341) + %q_41 = arith.cmpi slt, %ptr, %q : tensor<128x1xi32, #blocked> loc(#loc341) + %q_42 = tt.broadcast %q_41 : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc342) + %q_43 = tt.load %ptr_39, %q_42, %cst_6 : tensor<128x128x!tt.ptr, #blocked> loc(#loc342) + %q_44 = ttg.local_alloc %q_43 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc342) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc219) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc220) + %kv_start_45 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc221) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc222) + %kv_num_blocks_46 = tt.load %kv_num_blocks : !tt.ptr loc(#loc223) + %block_n_end = arith.muli %kv_num_blocks_46, %c2_i32 : i32 loc(#loc224) + %block_n_end_47 = arith.addi %ks1, %c63_i32 : i32 loc(#loc343) + %block_n_end_48 = arith.divsi %block_n_end_47, %c64_i32 : i32 loc(#loc344) + %block_n_end_49 = arith.maxsi %block_n_end_48, %c1_i32 : i32 loc(#loc226) + %block_n_end_50 = arith.minsi %block_n_end, %block_n_end_49 : i32 loc(#loc227) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc228) + %offs_n_51 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc228) + %offs_n_52 = tt.splat %kv_start_45 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc229) + %offs_n_53 = arith.addi %offs_n_52, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc229) + %3 = tt.expand_dims %offs_n_53 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc49) + %ptr_54 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc440) + %ptr_55 = tt.broadcast %ptr_36 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc441) + %k = tt.splat %ks1 : i32 -> tensor<64x1xi32, #blocked> loc(#loc442) + %m = arith.remsi %ptr_31, %q_40 : tensor<128x1xi32, #mma> loc(#loc443) + %n = tt.splat %ks1 : i32 -> tensor<1x64xi32, #mma> loc(#loc444) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc348) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc349) + %kv_offset = arith.cmpi sgt, %block_n_end_50, %c0_i32 : i32 loc(#loc509) + %tmp7_56 = tt.load %tmp7, %kv_offset : !tt.ptr loc(#loc351) + %tmp8 = tt.splat %tmp7_56 : i64 -> tensor<1x64xi64, #mma> loc(#loc352) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc353) + %tmp10 = tt.splat %tmp7_56 : i64 -> tensor<128x1xi64, #mma> loc(#loc354) + %tmp10_57 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc354) + %tmp11 = tt.broadcast %tmp10_57 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc355) + %tmp15 = tt.splat %ks5 : i32 -> tensor<1x64xi32, #mma> loc(#loc356) + %tmp20 = arith.cmpi slt, %ks5, %c0_i32 : i32 loc(#loc357) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1, #mma> loc(#loc358) + %tmp29 = tt.splat %ks5 : i32 -> tensor<128x64xi32, #mma> loc(#loc359) + %tmp32 = tt.splat %tmp20 : i1 -> tensor<128x64xi1, #mma> loc(#loc360) + %ptr_58 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc446) + %k_59 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc447) + %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc448) + %offs_n_load = tt.splat %kv_start_45 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %offs_n_load_60 = arith.addi %offs_n_load, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %ptr_61 = tt.expand_dims %offs_n_load_60 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc449) + %ptr_62 = arith.muli %ptr_61, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc450) + %ptr_63 = tt.addptr %ptr_54, %ptr_62 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc440) + %ptr_64 = tt.broadcast %ptr_63 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc441) + %ptr_65 = tt.addptr %ptr_64, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc441) + %k_66 = arith.cmpi slt, %ptr_61, %k : tensor<64x1xi32, #blocked> loc(#loc442) + %k_67 = tt.broadcast %k_66 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc447) + %k_68 = ttg.memdesc_index %k_59[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %kv_offset_69 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc509) + %kv_offset_70 = arith.andi %kv_offset_69, %k_67 : tensor<64x128xi1, #blocked> loc(#loc509) + %k_71 = ttg.async_copy_global_to_local %ptr_65, %k_68 mask %kv_offset_70 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %k_72 = ttg.async_commit_group tokens %k_71 loc(#loc447) + %ptr_73 = tt.addptr %ptr_58, %ptr_62 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc446) + %ptr_74 = tt.broadcast %ptr_73 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc451) + %ptr_75 = tt.addptr %ptr_74, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc451) + %v_76 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_77 = ttg.async_copy_global_to_local %ptr_75, %v_76 mask %kv_offset_70 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_78 = ttg.async_commit_group tokens %v_77 loc(#loc448) + %kv_offset_79 = arith.cmpi sgt, %block_n_end_50, %c1_i32 : i32 loc(#loc509) + %kv_base_offset = arith.addi %kv_start_45, %c64_i32 : i32 loc(#loc363) + %offs_n_load_80 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %offs_n_load_81 = arith.addi %offs_n_load_80, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %ptr_82 = tt.expand_dims %offs_n_load_81 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc449) + %ptr_83 = arith.muli %ptr_82, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc450) + %ptr_84 = tt.addptr %ptr_54, %ptr_83 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc440) + %ptr_85 = tt.broadcast %ptr_84 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc441) + %ptr_86 = tt.addptr %ptr_85, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc441) + %k_87 = arith.cmpi slt, %ptr_82, %k : tensor<64x1xi32, #blocked> loc(#loc442) + %k_88 = tt.broadcast %k_87 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc447) + %k_89 = ttg.memdesc_index %k_59[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %kv_offset_90 = tt.splat %kv_offset_79 : i1 -> tensor<64x128xi1, #blocked> loc(#loc509) + %kv_offset_91 = arith.andi %kv_offset_90, %k_88 : tensor<64x128xi1, #blocked> loc(#loc509) + %k_92 = ttg.async_copy_global_to_local %ptr_86, %k_89 mask %kv_offset_91 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %k_93 = ttg.async_commit_group tokens %k_92 loc(#loc447) + %ptr_94 = tt.addptr %ptr_58, %ptr_83 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc446) + %ptr_95 = tt.broadcast %ptr_94 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc451) + %ptr_96 = tt.addptr %ptr_95, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc451) + %v_97 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_98 = ttg.async_copy_global_to_local %ptr_96, %v_97 mask %kv_offset_91 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_99 = ttg.async_commit_group tokens %v_98 loc(#loc448) + ttng.fence_async_shared {bCluster = false} loc(#loc364) + %kv_offset_100:12 = scf.for %kv_offset_171 = %c0_i32 to %block_n_end_50 step %c1_i32 iter_args(%acc_172 = %cst_10, %arg19 = %cst_8, %arg20 = %cst_14, %arg21 = %c64_i32, %arg22 = %3, %arg23 = %c1_i32, %arg24 = %c-1_i32, %k_173 = %k_72, %k_174 = %k_93, %v_175 = %v_78, %v_176 = %v_99, %arg29 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_177 = arith.subi %block_n_end_50, %c2_i32 : i32 loc(#loc509) + %kv_offset_178 = arith.cmpi slt, %kv_offset_171, %kv_offset_177 : i32 loc(#loc509) + %kv_offset_179 = arith.subi %block_n_end_50, %c1_i32 : i32 loc(#loc509) + %kv_offset_180 = arith.cmpi slt, %kv_offset_171, %kv_offset_179 : i32 loc(#loc509) + %kv_offset_181 = arith.addi %arg24, %c1_i32 : i32 loc(#loc509) + %kv_offset_182 = arith.cmpi sge, %kv_offset_181, %c3_i32 : i32 loc(#loc509) + %kv_offset_183 = arith.select %kv_offset_182, %c0_i32, %kv_offset_181 : i32 loc(#loc509) + %k_184 = ttg.async_wait %k_173, %v_175 {num = 2 : i32} loc(#loc447) + %k_185 = ttg.memdesc_index %k_59[%kv_offset_183] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %k_186 = ttg.memdesc_trans %k_185 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc365) + %qk = ttng.warp_group_dot %q_44, %k_186, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc364) + %qk_187:4 = ttng.warp_group_dot_wait %qk, %q_44, %k_186, %acc_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc364) + %qk_188 = arith.mulf %qk_187#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc366) + %n_189 = arith.remsi %arg22, %n : tensor<1x64xi32, #mma> loc(#loc444) + %post_mod_scores = arith.cmpi slt, %arg22, %n : tensor<1x64xi32, #mma> loc(#loc367) + %post_mod_scores_190 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc368) + %post_mod_scores_191 = arith.select %post_mod_scores_190, %qk_188, %cst_12 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc368) + %tmp4_192 = tt.broadcast %n_189 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc348) + %tmp4_193 = arith.cmpi sge, %tmp4, %tmp4_192 : tensor<128x64xi32, #mma> loc(#loc348) + %tmp5 = arith.extsi %n_189 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc369) + %tmp8_194 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc352) + %tmp11_195 = tt.broadcast %tmp8_194 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc355) + %tmp11_196 = arith.andi %tmp11_195, %tmp11 : tensor<128x64xi1, #mma> loc(#loc355) + %tmp12 = arith.andi %tmp4_193, %tmp11_196 : tensor<128x64xi1, #mma> loc(#loc370) + %tmp15_197 = arith.cmpi sge, %n_189, %tmp15 : tensor<1x64xi32, #mma> loc(#loc356) + %tmp16 = arith.remsi %n_189, %tmp15 : tensor<1x64xi32, #mma> loc(#loc371) + %tmp18 = arith.cmpi ne, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc372) + %tmp19 = arith.cmpi slt, %tmp16, %cst : tensor<1x64xi32, #mma> loc(#loc373) + %tmp21_198 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1, #mma> loc(#loc358) + %tmp22 = arith.andi %tmp18, %tmp21_198 : tensor<1x64xi1, #mma> loc(#loc374) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32, #mma> loc(#loc375) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc376) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc377) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc378) + %tmp27 = arith.andi %tmp15_197, %tmp26 : tensor<1x64xi1, #mma> loc(#loc379) + %tmp28 = arith.subi %tmp4_192, %tmp4 : tensor<128x64xi32, #mma> loc(#loc380) + %tmp29_199 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32, #mma> loc(#loc359) + %tmp30 = arith.cmpi ne, %tmp29_199, %cst_0 : tensor<128x64xi32, #mma> loc(#loc381) + %tmp31 = arith.cmpi slt, %tmp29_199, %cst_0 : tensor<128x64xi32, #mma> loc(#loc382) + %tmp32_200 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1, #mma> loc(#loc360) + %tmp33 = arith.andi %tmp30, %tmp32_200 : tensor<128x64xi1, #mma> loc(#loc383) + %tmp34 = arith.addi %tmp29_199, %tmp29 : tensor<128x64xi32, #mma> loc(#loc384) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_199 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc385) + %tmp36 = arith.cmpi eq, %tmp35, %cst_0 : tensor<128x64xi32, #mma> loc(#loc386) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc387) + %tmp37_201 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc387) + %tmp38 = arith.ori %tmp12, %tmp37_201 : tensor<128x64xi1, #mma> loc(#loc388) + %mask_mod_output = arith.select %post_mod_scores_190, %tmp38, %cst_1 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc389) + %post_mod_scores_202 = arith.select %mask_mod_output, %post_mod_scores_191, %cst_12 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc390) + %post_mod_scores_203 = arith.mulf %post_mod_scores_202, %cst_13 : tensor<128x64xf32, #mma> loc(#loc391) + %m_ij = "tt.reduce"(%post_mod_scores_203) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_261: f32 loc(callsite(#loc1 at #loc392)), %m_ij_262: f32 loc(callsite(#loc1 at #loc392))): + %m_ij_263 = arith.maxnumf %m_ij_261, %m_ij_262 : f32 loc(#loc504) + tt.reduce.return %m_ij_263 : f32 loc(#loc452) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc452) + %m_ij_204 = arith.maxnumf %arg20, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc393) + %masked_out_rows = arith.cmpf oeq, %m_ij_204, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc394) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_204 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc395) + %alpha = arith.subf %arg20, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc396) + %alpha_205 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc397) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc398) + %p_206 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc399) + %p_207 = arith.subf %post_mod_scores_203, %p_206 : tensor<128x64xf32, #mma> loc(#loc399) + %p_208 = math.exp2 %p_207 : tensor<128x64xf32, #mma> loc(#loc400) + %l_i_209 = arith.mulf %arg19, %alpha_205 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc401) + %l_i_210 = "tt.reduce"(%p_208) <{axis = 1 : i32}> ({ + ^bb0(%l_i_261: f32 loc(callsite(#loc1 at #loc402)), %l_i_262: f32 loc(callsite(#loc1 at #loc402))): + %l_i_263 = arith.addf %l_i_261, %l_i_262 : f32 loc(#loc505) + tt.reduce.return %l_i_263 : f32 loc(#loc454) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc454) + %l_i_211 = arith.addf %l_i_209, %l_i_210 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc403) + %acc_212 = tt.expand_dims %alpha_205 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc404) + %acc_213 = ttg.convert_layout %acc_212 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc405) + %acc_214 = tt.broadcast %acc_213 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc405) + %acc_215 = arith.mulf %qk_187#3, %acc_214 : tensor<128x128xf32, #mma1> loc(#loc405) + %v_216 = ttg.memdesc_index %v[%kv_offset_183] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %acc_217 = arith.truncf %p_208 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc406) + %acc_218 = ttg.convert_layout %acc_217 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc406) + %acc_219 = ttng.warp_group_dot %acc_218, %v_216, %acc_215 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc407) + %offs_n_220 = tt.splat %arg29 : i32 -> tensor<1x64xi32, #mma> loc(#loc408) + %offs_n_221 = arith.addi %arg22, %offs_n_220 : tensor<1x64xi32, #mma> loc(#loc408) + %kv_offset_222 = arith.addi %kv_offset_171, %c1_i32 : i32 loc(#loc509) + %cur_block_idx = arith.divsi %kv_offset_222, %c2_i32 : i32 loc(#loc456) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc457) + %cur_block_223 = tt.load %cur_block, %kv_offset_180 evictionPolicy = evict_last : !tt.ptr loc(#loc458) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc459) + %next_block_224 = arith.cmpi slt, %next_block, %kv_num_blocks_46 : i32 loc(#loc460) + %next_block_225 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc461) + %kv_offset_226 = arith.andi %kv_offset_180, %next_block_224 : i1 loc(#loc509) + %next_block_227 = tt.load %next_block_225, %kv_offset_226 evictionPolicy = evict_last : !tt.ptr loc(#loc462) + %needs_jump = arith.addi %kv_offset_171, %c2_i32 : i32 loc(#loc463) + %needs_jump_228 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc464) + %needs_jump_229 = arith.cmpi eq, %needs_jump_228, %c0_i32 : i32 loc(#loc465) + %jump_to_block = arith.subi %next_block_227, %cur_block_223 : i32 loc(#loc466) + %jump_to_block_230 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc467) + %jump_to_block_231 = arith.subi %jump_to_block_230, %c64_i32 : i32 loc(#loc468) + %offset = arith.extui %needs_jump_229 : i1 to i32 loc(#loc469) + %offset_232 = arith.muli %jump_to_block_231, %offset : i32 loc(#loc469) + %offset_233 = arith.subi %c1_i32, %offset : i32 loc(#loc470) + %offset_234 = arith.muli %offset_233, %c64_i32 : i32 loc(#loc471) + %offset_235 = arith.addi %offset_232, %offset_234 : i32 loc(#loc472) + %kv_offset_236 = arith.addi %arg21, %offset_235 : i32 loc(#loc410) + %kv_offset_237 = arith.addi %arg23, %c1_i32 : i32 loc(#loc509) + %kv_offset_238 = arith.cmpi sge, %kv_offset_237, %c3_i32 : i32 loc(#loc509) + %kv_offset_239 = arith.select %kv_offset_238, %c0_i32, %kv_offset_237 : i32 loc(#loc509) + %kv_base_offset_240 = arith.addi %kv_start_45, %kv_offset_236 : i32 loc(#loc363) + %offs_n_load_241 = tt.splat %kv_base_offset_240 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %offs_n_load_242 = arith.addi %offs_n_load_241, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc362) + %ptr_243 = tt.expand_dims %offs_n_load_242 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc449) + %ptr_244 = arith.muli %ptr_243, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc450) + %ptr_245 = tt.addptr %ptr_54, %ptr_244 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc440) + %ptr_246 = tt.broadcast %ptr_245 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc441) + %ptr_247 = tt.addptr %ptr_246, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc441) + %k_248 = arith.cmpi slt, %ptr_243, %k : tensor<64x1xi32, #blocked> loc(#loc442) + %k_249 = tt.broadcast %k_248 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc447) + %k_250 = ttg.memdesc_index %k_59[%kv_offset_239] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %kv_offset_251 = tt.splat %kv_offset_178 : i1 -> tensor<64x128xi1, #blocked> loc(#loc509) + %kv_offset_252 = arith.andi %kv_offset_251, %k_249 : tensor<64x128xi1, #blocked> loc(#loc509) + %k_253 = ttg.async_copy_global_to_local %ptr_247, %k_250 mask %kv_offset_252 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc447) + %k_254 = ttg.async_commit_group tokens %k_253 loc(#loc447) + %ptr_255 = tt.addptr %ptr_58, %ptr_244 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc446) + %ptr_256 = tt.broadcast %ptr_255 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc451) + %ptr_257 = tt.addptr %ptr_256, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc451) + %v_258 = ttg.memdesc_index %v[%kv_offset_239] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_259 = ttg.async_copy_global_to_local %ptr_257, %v_258 mask %kv_offset_252 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc448) + %v_260 = ttg.async_commit_group tokens %v_259 loc(#loc448) + scf.yield %acc_219, %l_i_211, %m_ij_204, %kv_offset_236, %offs_n_221, %kv_offset_239, %kv_offset_183, %k_174, %k_254, %v_176, %v_260, %offset_235 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc509) + } loc(#loc509) + %kv_offset_101 = ttng.warp_group_dot_wait %kv_offset_100#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc509) + %kv_offset_102 = ttg.async_wait {num = 0 : i32} loc(#loc509) + ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc509) + ttg.local_dealloc %k_59 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc509) + %kv_indices_103 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc314) + %kv_start_104 = tt.load %kv_indices_103 : !tt.ptr loc(#loc315) + %kv_start_105 = arith.muli %kv_start_104, %c128_i32 : i32 loc(#loc316) + %kv_num_blocks_106 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc317) + %kv_num_blocks_107 = tt.load %kv_num_blocks_106 : !tt.ptr loc(#loc318) + %block_n_end_108 = arith.muli %kv_num_blocks_107, %c2_i32 : i32 loc(#loc319) + %block_n_end_109 = arith.minsi %block_n_end_108, %block_n_end_49 : i32 loc(#loc320) + %offs_n_110 = tt.splat %kv_start_105 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc321) + %offs_n_111 = arith.addi %offs_n_110, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc321) + %4 = tt.expand_dims %offs_n_111 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc148) + %k_112 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc473) + %v_113 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc474) + %kv_offset_114 = arith.cmpi sgt, %block_n_end_109, %c0_i32 : i32 loc(#loc510) + %offs_n_load_115 = tt.splat %kv_start_105 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %offs_n_load_116 = arith.addi %offs_n_load_115, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %ptr_117 = tt.expand_dims %offs_n_load_116 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc475) + %ptr_118 = arith.muli %ptr_117, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc476) + %ptr_119 = tt.addptr %ptr_54, %ptr_118 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc477) + %ptr_120 = tt.broadcast %ptr_119 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc478) + %ptr_121 = tt.addptr %ptr_120, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc478) + %k_122 = arith.cmpi slt, %ptr_117, %k : tensor<64x1xi32, #blocked> loc(#loc479) + %k_123 = tt.broadcast %k_122 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc473) + %k_124 = ttg.memdesc_index %k_112[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %kv_offset_125 = tt.splat %kv_offset_114 : i1 -> tensor<64x128xi1, #blocked> loc(#loc510) + %kv_offset_126 = arith.andi %kv_offset_125, %k_123 : tensor<64x128xi1, #blocked> loc(#loc510) + %k_127 = ttg.async_copy_global_to_local %ptr_121, %k_124 mask %kv_offset_126 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %k_128 = ttg.async_commit_group tokens %k_127 loc(#loc473) + %ptr_129 = tt.addptr %ptr_58, %ptr_118 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc480) + %ptr_130 = tt.broadcast %ptr_129 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc481) + %ptr_131 = tt.addptr %ptr_130, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc481) + %v_132 = ttg.memdesc_index %v_113[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_133 = ttg.async_copy_global_to_local %ptr_131, %v_132 mask %kv_offset_126 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_134 = ttg.async_commit_group tokens %v_133 loc(#loc474) + %kv_offset_135 = arith.cmpi sgt, %block_n_end_109, %c1_i32 : i32 loc(#loc510) + %kv_base_offset_136 = arith.addi %kv_start_105, %c64_i32 : i32 loc(#loc414) + %offs_n_load_137 = tt.splat %kv_base_offset_136 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %offs_n_load_138 = arith.addi %offs_n_load_137, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %ptr_139 = tt.expand_dims %offs_n_load_138 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc475) + %ptr_140 = arith.muli %ptr_139, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc476) + %ptr_141 = tt.addptr %ptr_54, %ptr_140 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc477) + %ptr_142 = tt.broadcast %ptr_141 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc478) + %ptr_143 = tt.addptr %ptr_142, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc478) + %k_144 = arith.cmpi slt, %ptr_139, %k : tensor<64x1xi32, #blocked> loc(#loc479) + %k_145 = tt.broadcast %k_144 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc473) + %k_146 = ttg.memdesc_index %k_112[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %kv_offset_147 = tt.splat %kv_offset_135 : i1 -> tensor<64x128xi1, #blocked> loc(#loc510) + %kv_offset_148 = arith.andi %kv_offset_147, %k_145 : tensor<64x128xi1, #blocked> loc(#loc510) + %k_149 = ttg.async_copy_global_to_local %ptr_143, %k_146 mask %kv_offset_148 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %k_150 = ttg.async_commit_group tokens %k_149 loc(#loc473) + %ptr_151 = tt.addptr %ptr_58, %ptr_140 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc480) + %ptr_152 = tt.broadcast %ptr_151 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc481) + %ptr_153 = tt.addptr %ptr_152, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc481) + %v_154 = ttg.memdesc_index %v_113[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_155 = ttg.async_copy_global_to_local %ptr_153, %v_154 mask %kv_offset_148 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_156 = ttg.async_commit_group tokens %v_155 loc(#loc474) + ttng.fence_async_shared {bCluster = false} loc(#loc415) + %kv_offset_157:12 = scf.for %kv_offset_171 = %c0_i32 to %block_n_end_109 step %c1_i32 iter_args(%kv_offset_172 = %kv_offset_101, %kv_offset_173 = %kv_offset_100#1, %kv_offset_174 = %kv_offset_100#2, %arg21 = %c64_i32, %arg22 = %4, %arg23 = %c1_i32, %arg24 = %c-1_i32, %k_175 = %k_128, %k_176 = %k_150, %v_177 = %v_134, %v_178 = %v_156, %arg29 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_179 = arith.subi %block_n_end_109, %c2_i32 : i32 loc(#loc510) + %kv_offset_180 = arith.cmpi slt, %kv_offset_171, %kv_offset_179 : i32 loc(#loc510) + %kv_offset_181 = arith.subi %block_n_end_109, %c1_i32 : i32 loc(#loc510) + %kv_offset_182 = arith.cmpi slt, %kv_offset_171, %kv_offset_181 : i32 loc(#loc510) + %kv_offset_183 = arith.addi %arg24, %c1_i32 : i32 loc(#loc510) + %kv_offset_184 = arith.cmpi sge, %kv_offset_183, %c3_i32 : i32 loc(#loc510) + %kv_offset_185 = arith.select %kv_offset_184, %c0_i32, %kv_offset_183 : i32 loc(#loc510) + %k_186 = ttg.async_wait %k_175, %v_177 {num = 2 : i32} loc(#loc473) + %k_187 = ttg.memdesc_index %k_112[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %k_188 = ttg.memdesc_trans %k_187 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc416) + %qk = ttng.warp_group_dot %q_44, %k_188, %cst_3 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc415) + %qk_189:4 = ttng.warp_group_dot_wait %qk, %q_44, %k_188, %kv_offset_172 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc415) + %qk_190 = arith.mulf %qk_189#0, %cst_11 : tensor<128x64xf32, #mma> loc(#loc417) + %post_mod_scores = arith.cmpi slt, %arg22, %n : tensor<1x64xi32, #mma> loc(#loc418) + %post_mod_scores_191 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc419) + %post_mod_scores_192 = arith.select %post_mod_scores_191, %qk_190, %cst_12 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc419) + %post_mod_scores_193 = arith.mulf %post_mod_scores_192, %cst_13 : tensor<128x64xf32, #mma> loc(#loc420) + %m_ij = "tt.reduce"(%post_mod_scores_193) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_251: f32 loc(callsite(#loc1 at #loc421)), %m_ij_252: f32 loc(callsite(#loc1 at #loc421))): + %m_ij_253 = arith.maxnumf %m_ij_251, %m_ij_252 : f32 loc(#loc506) + tt.reduce.return %m_ij_253 : f32 loc(#loc482) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc482) + %m_ij_194 = arith.maxnumf %kv_offset_174, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc422) + %masked_out_rows = arith.cmpf oeq, %m_ij_194, %cst_14 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc423) + %m_ij_masked = arith.select %masked_out_rows, %cst_8, %m_ij_194 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc424) + %alpha = arith.subf %kv_offset_174, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc425) + %alpha_195 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc426) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc427) + %p_196 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc428) + %p_197 = arith.subf %post_mod_scores_193, %p_196 : tensor<128x64xf32, #mma> loc(#loc428) + %p_198 = math.exp2 %p_197 : tensor<128x64xf32, #mma> loc(#loc429) + %l_i_199 = arith.mulf %kv_offset_173, %alpha_195 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc430) + %l_i_200 = "tt.reduce"(%p_198) <{axis = 1 : i32}> ({ + ^bb0(%l_i_251: f32 loc(callsite(#loc1 at #loc431)), %l_i_252: f32 loc(callsite(#loc1 at #loc431))): + %l_i_253 = arith.addf %l_i_251, %l_i_252 : f32 loc(#loc507) + tt.reduce.return %l_i_253 : f32 loc(#loc484) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc484) + %l_i_201 = arith.addf %l_i_199, %l_i_200 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc432) + %acc_202 = tt.expand_dims %alpha_195 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc433) + %acc_203 = ttg.convert_layout %acc_202 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc434) + %acc_204 = tt.broadcast %acc_203 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc434) + %acc_205 = arith.mulf %qk_189#3, %acc_204 : tensor<128x128xf32, #mma1> loc(#loc434) + %v_206 = ttg.memdesc_index %v_113[%kv_offset_185] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %acc_207 = arith.truncf %p_198 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc435) + %acc_208 = ttg.convert_layout %acc_207 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc435) + %acc_209 = ttng.warp_group_dot %acc_208, %v_206, %acc_205 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc436) + %offs_n_210 = tt.splat %arg29 : i32 -> tensor<1x64xi32, #mma> loc(#loc437) + %offs_n_211 = arith.addi %arg22, %offs_n_210 : tensor<1x64xi32, #mma> loc(#loc437) + %kv_offset_212 = arith.addi %kv_offset_171, %c1_i32 : i32 loc(#loc510) + %cur_block_idx = arith.divsi %kv_offset_212, %c2_i32 : i32 loc(#loc486) + %cur_block = tt.addptr %kv_indices_103, %cur_block_idx : !tt.ptr, i32 loc(#loc487) + %cur_block_213 = tt.load %cur_block, %kv_offset_182 evictionPolicy = evict_last : !tt.ptr loc(#loc488) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc489) + %next_block_214 = arith.cmpi slt, %next_block, %kv_num_blocks_107 : i32 loc(#loc490) + %next_block_215 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc491) + %kv_offset_216 = arith.andi %kv_offset_182, %next_block_214 : i1 loc(#loc510) + %next_block_217 = tt.load %next_block_215, %kv_offset_216 evictionPolicy = evict_last : !tt.ptr loc(#loc492) + %needs_jump = arith.addi %kv_offset_171, %c2_i32 : i32 loc(#loc493) + %needs_jump_218 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc494) + %needs_jump_219 = arith.cmpi eq, %needs_jump_218, %c0_i32 : i32 loc(#loc495) + %jump_to_block = arith.subi %next_block_217, %cur_block_213 : i32 loc(#loc496) + %jump_to_block_220 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc497) + %jump_to_block_221 = arith.subi %jump_to_block_220, %c64_i32 : i32 loc(#loc498) + %offset = arith.extui %needs_jump_219 : i1 to i32 loc(#loc499) + %offset_222 = arith.muli %jump_to_block_221, %offset : i32 loc(#loc499) + %offset_223 = arith.subi %c1_i32, %offset : i32 loc(#loc500) + %offset_224 = arith.muli %offset_223, %c64_i32 : i32 loc(#loc501) + %offset_225 = arith.addi %offset_222, %offset_224 : i32 loc(#loc502) + %kv_offset_226 = arith.addi %arg21, %offset_225 : i32 loc(#loc439) + %kv_offset_227 = arith.addi %arg23, %c1_i32 : i32 loc(#loc510) + %kv_offset_228 = arith.cmpi sge, %kv_offset_227, %c3_i32 : i32 loc(#loc510) + %kv_offset_229 = arith.select %kv_offset_228, %c0_i32, %kv_offset_227 : i32 loc(#loc510) + %kv_base_offset_230 = arith.addi %kv_start_105, %kv_offset_226 : i32 loc(#loc414) + %offs_n_load_231 = tt.splat %kv_base_offset_230 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %offs_n_load_232 = arith.addi %offs_n_load_231, %offs_n_51 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc413) + %ptr_233 = tt.expand_dims %offs_n_load_232 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc475) + %ptr_234 = arith.muli %ptr_233, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc476) + %ptr_235 = tt.addptr %ptr_54, %ptr_234 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc477) + %ptr_236 = tt.broadcast %ptr_235 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc478) + %ptr_237 = tt.addptr %ptr_236, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc478) + %k_238 = arith.cmpi slt, %ptr_233, %k : tensor<64x1xi32, #blocked> loc(#loc479) + %k_239 = tt.broadcast %k_238 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc473) + %k_240 = ttg.memdesc_index %k_112[%kv_offset_229] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %kv_offset_241 = tt.splat %kv_offset_180 : i1 -> tensor<64x128xi1, #blocked> loc(#loc510) + %kv_offset_242 = arith.andi %kv_offset_241, %k_239 : tensor<64x128xi1, #blocked> loc(#loc510) + %k_243 = ttg.async_copy_global_to_local %ptr_237, %k_240 mask %kv_offset_242 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc473) + %k_244 = ttg.async_commit_group tokens %k_243 loc(#loc473) + %ptr_245 = tt.addptr %ptr_58, %ptr_234 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc480) + %ptr_246 = tt.broadcast %ptr_245 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc481) + %ptr_247 = tt.addptr %ptr_246, %ptr_55 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc481) + %v_248 = ttg.memdesc_index %v_113[%kv_offset_229] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_249 = ttg.async_copy_global_to_local %ptr_247, %v_248 mask %kv_offset_242 other %cst_7 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc474) + %v_250 = ttg.async_commit_group tokens %v_249 loc(#loc474) + scf.yield %acc_209, %l_i_201, %m_ij_194, %kv_offset_226, %offs_n_211, %kv_offset_229, %kv_offset_185, %k_176, %k_244, %v_178, %v_250, %offset_225 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc510) + } loc(#loc510) + %kv_offset_158 = ttng.warp_group_dot_wait %kv_offset_157#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc510) + %kv_offset_159 = ttg.async_wait {num = 0 : i32} loc(#loc510) + ttg.local_dealloc %v_113 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc510) + ttg.local_dealloc %k_112 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc510) + %l_i = arith.cmpf oeq, %kv_offset_157#1, %cst_8 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc323) + %l_i_160 = arith.select %l_i, %cst_9, %kv_offset_157#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc324) + %acc = tt.expand_dims %l_i_160 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc325) + %acc_161 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc326) + %acc_162 = tt.broadcast %acc_161 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc326) + %acc_163 = arith.divf %kv_offset_158, %acc_162 : tensor<128x128xf32, #mma1> loc(#loc326) + %mask = arith.cmpi slt, %ptr_36, %cst_5 : tensor<1x128xi32, #blocked> loc(#loc327) + %mask_164 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc328) + %mask_165 = arith.andi %q_42, %mask_164 : tensor<128x128xi1, #blocked> loc(#loc328) + %5 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32, #blocked> loc(#loc156) + %6 = arith.addi %ptr_36, %5 : tensor<1x128xi32, #blocked> loc(#loc156) + %7 = tt.broadcast %6 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc157) + %8 = tt.broadcast %ptr_32 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc157) + %9 = arith.addi %7, %8 : tensor<128x128xi32, #blocked> loc(#loc157) + %10 = arith.muli %off_zq, %c4096_i32 : i32 loc(#loc158) + %11 = arith.muli %10, %ks0 : i32 loc(#loc159) + %12 = tt.splat %11 : i32 -> tensor<128x128xi32, #blocked> loc(#loc160) + %13 = arith.addi %9, %12 : tensor<128x128xi32, #blocked> loc(#loc160) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc161) + %15 = tt.addptr %14, %13 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc161) + %16 = arith.truncf %acc_163 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc162) + %17 = ttg.convert_layout %16 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc162) + tt.store %15, %17, %mask_165 : tensor<128x128x!tt.ptr, #blocked> loc(#loc162) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc329) + %off_hz_166 = arith.addi %off_hz, %off_hq : i32 loc(#loc330) + %l_ptrs = arith.muli %off_hz_166, %ks0 : i32 loc(#loc331) + %l_ptrs_167 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc332) + %l_ptrs_168 = tt.splat %l_ptrs_167 : !tt.ptr -> tensor<128x!tt.ptr, #blocked1> loc(#loc333) + %l_ptrs_169 = tt.addptr %l_ptrs_168, %offs_m_30 : tensor<128x!tt.ptr, #blocked1>, tensor<128xi32, #blocked1> loc(#loc333) + %lse = math.log2 %l_i_160 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc334) + %lse_170 = arith.addf %kv_offset_157#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc335) + %18 = tt.splat %ks0 : i32 -> tensor<128xi32, #blocked1> loc(#loc170) + %19 = arith.cmpi slt, %offs_m_30, %18 : tensor<128xi32, #blocked1> loc(#loc170) + %20 = ttg.convert_layout %lse_170 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc171) + tt.store %l_ptrs_169, %20, %19 : tensor<128x!tt.ptr, #blocked1> loc(#loc171) + tt.return loc(#loc172) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":85:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:54) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:63) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":97:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":98:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":99:27) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":103:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":104:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:24) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:36) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:25) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:47) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:37) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":111:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":112:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":113:12) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":130:26) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:51) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:74) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:46) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:97) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:64) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:46) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:33) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:27) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":146:101) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:38) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:56) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:49) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:23) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":151:26) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:23) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:37) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:28) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:45) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:92) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:102) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:65) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:37) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:24) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":167:48) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":347:107) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":257:21) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":358:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":359:36) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":373:23) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:33) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":502:40) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:23) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":377:22) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":378:23) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":379:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":380:23) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":384:24) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":389:92) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":390:25) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":398:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":401:25) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":439:107) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":346:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":342:32) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":351:19) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":349:17) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":353:14) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:44) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:69) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":374:23) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":381:23) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":385:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":387:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":388:92) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":391:24) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":392:24) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":393:39) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":394:25) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":395:24) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":396:24) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":397:23) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":399:25) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":400:92) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":402:24) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":403:24) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":404:39) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":405:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":406:24) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":407:24) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":412:73) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":414:69) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":417:27) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:27) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":423:35) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":424:51) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:31) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:51) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:39) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:21) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:16) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:24) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:22) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:16) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:22) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:44) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":548:26) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":247:33) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":545:63) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:38) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:24) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:109) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:113) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:55) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:25) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:30) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:35) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:60) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:34) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:48) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:63) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:29) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:47) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:61) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:42) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":549:21) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":181:35) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:27) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:41) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:51) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:32) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:49) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:69) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":186:28) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":193:52) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:26) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:34) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:20) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:16) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:38) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:30) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:49) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:62) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:80) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:87) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:75) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:25) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:110) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:26) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:31) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:32) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:23) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:40) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:33) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:20) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:48) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:29) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":229:4) +#loc190 = loc("q_start"(#loc5)) +#loc191 = loc("off_zq"(#loc6)) +#loc192 = loc("off_hq"(#loc7)) +#loc193 = loc("off_zkv"(#loc8)) +#loc194 = loc("off_hkv"(#loc9)) +#loc195 = loc("q_offset"(#loc10)) +#loc196 = loc("q_offset"(#loc11)) +#loc197 = loc("q_offset"(#loc12)) +#loc198 = loc("k_offset"(#loc13)) +#loc199 = loc("k_offset"(#loc14)) +#loc200 = loc("k_offset"(#loc15)) +#loc201 = loc("Q"(#loc16)) +#loc202 = loc("K"(#loc17)) +#loc203 = loc("V"(#loc18)) +#loc204 = loc("stride_kv_idx_h"(#loc19)) +#loc205 = loc("sparse_kv_num_blks_offset"(#loc20)) +#loc206 = loc("sparse_kv_num_blks_offset"(#loc21)) +#loc207 = loc("sparse_kv_idx_offset"(#loc22)) +#loc208 = loc("sparse_kv_idx_offset"(#loc23)) +#loc209 = loc("sparse_kv_idx_offset"(#loc24)) +#loc210 = loc("offs_m"(#loc25)) +#loc211 = loc("offs_m"(#loc26)) +#loc212 = loc("offs_m"(#loc27)) +#loc213 = loc("ptr"(#loc28)) +#loc214 = loc("q"(#loc29)) +#loc215 = loc("ptr"(#loc30)) +#loc216 = loc("ptr"(#loc31)) +#loc217 = loc("ptr"(#loc32)) +#loc218 = loc("ptr"(#loc33)) +#loc219 = loc("kv_indices"(#loc36)) +#loc220 = loc("kv_start"(#loc37)) +#loc221 = loc("kv_start"(#loc38)) +#loc222 = loc("kv_num_blocks"(#loc39)) +#loc223 = loc("kv_num_blocks"(#loc40)) +#loc224 = loc("block_n_end"(#loc41)) +#loc225 = loc("block_n_end"(#loc43)) +#loc226 = loc("block_n_end"(#loc45)) +#loc227 = loc("block_n_end"(#loc46)) +#loc228 = loc("offs_n"(#loc47)) +#loc229 = loc("offs_n"(#loc48)) +#loc230 = loc("k"(#loc50)) +#loc232 = loc("m"(#loc54)) +#loc233 = loc("n"(#loc55)) +#loc234 = loc("tmp4"(#loc56)) +#loc235 = loc("tmp7"(#loc57)) +#loc236 = loc("acc"(#loc58)) +#loc237 = loc("tmp7"(#loc59)) +#loc238 = loc("tmp8"(#loc60)) +#loc239 = loc("tmp9"(#loc61)) +#loc240 = loc("tmp10"(#loc62)) +#loc241 = loc("tmp11"(#loc63)) +#loc242 = loc("tmp15"(#loc64)) +#loc243 = loc("tmp20"(#loc65)) +#loc244 = loc("tmp21"(#loc66)) +#loc245 = loc("tmp29"(#loc67)) +#loc246 = loc("tmp32"(#loc68)) +#loc247 = loc("v"(#loc69)) +#loc248 = loc("offs_n_load"(#loc70)) +#loc249 = loc("kv_base_offset"(#loc71)) +#loc250 = loc("qk"(#loc72)) +#loc251 = loc("k"(#loc73)) +#loc252 = loc("qk"(#loc74)) +#loc253 = loc("post_mod_scores"(#loc75)) +#loc254 = loc("post_mod_scores"(#loc76)) +#loc255 = loc("tmp5"(#loc77)) +#loc256 = loc("tmp12"(#loc78)) +#loc257 = loc("tmp16"(#loc79)) +#loc258 = loc("tmp18"(#loc80)) +#loc259 = loc("tmp19"(#loc81)) +#loc260 = loc("tmp22"(#loc82)) +#loc261 = loc("tmp23"(#loc83)) +#loc262 = loc("tmp24"(#loc84)) +#loc263 = loc("tmp25"(#loc85)) +#loc264 = loc("tmp26"(#loc86)) +#loc265 = loc("tmp27"(#loc87)) +#loc266 = loc("tmp28"(#loc88)) +#loc267 = loc("tmp30"(#loc89)) +#loc268 = loc("tmp31"(#loc90)) +#loc269 = loc("tmp33"(#loc91)) +#loc270 = loc("tmp34"(#loc92)) +#loc271 = loc("tmp35"(#loc93)) +#loc272 = loc("tmp36"(#loc94)) +#loc273 = loc("tmp37"(#loc95)) +#loc274 = loc("tmp38"(#loc96)) +#loc275 = loc("mask_mod_output"(#loc97)) +#loc276 = loc("post_mod_scores"(#loc98)) +#loc277 = loc("post_mod_scores"(#loc99)) +#loc279 = loc("m_ij"(#loc103)) +#loc280 = loc("masked_out_rows"(#loc104)) +#loc281 = loc("m_ij_masked"(#loc105)) +#loc282 = loc("alpha"(#loc106)) +#loc283 = loc("alpha"(#loc107)) +#loc284 = loc("p"(#loc108)) +#loc285 = loc("p"(#loc109)) +#loc286 = loc("p"(#loc110)) +#loc287 = loc("l_i"(#loc111)) +#loc289 = loc("l_i"(#loc115)) +#loc290 = loc("acc"(#loc116)) +#loc291 = loc("acc"(#loc117)) +#loc292 = loc("acc"(#loc118)) +#loc293 = loc("acc"(#loc119)) +#loc294 = loc("offs_n"(#loc120)) +#loc295 = loc("cur_block_idx"(#loc121)) +#loc296 = loc("offset"(#loc122)) +#loc297 = loc("cur_block"(#loc123)) +#loc298 = loc("cur_block"(#loc124)) +#loc299 = loc("next_block"(#loc125)) +#loc300 = loc("next_block"(#loc126)) +#loc301 = loc("next_block"(#loc127)) +#loc302 = loc("next_block"(#loc128)) +#loc303 = loc("needs_jump"(#loc129)) +#loc304 = loc("needs_jump"(#loc130)) +#loc305 = loc("needs_jump"(#loc131)) +#loc306 = loc("jump_to_block"(#loc132)) +#loc307 = loc("jump_to_block"(#loc133)) +#loc308 = loc("jump_to_block"(#loc134)) +#loc309 = loc("offset"(#loc135)) +#loc310 = loc("offset"(#loc136)) +#loc311 = loc("offset"(#loc137)) +#loc312 = loc("offset"(#loc138)) +#loc313 = loc("kv_offset"(#loc139)) +#loc314 = loc("kv_indices"(#loc140)) +#loc315 = loc("kv_start"(#loc141)) +#loc316 = loc("kv_start"(#loc142)) +#loc317 = loc("kv_num_blocks"(#loc143)) +#loc318 = loc("kv_num_blocks"(#loc144)) +#loc319 = loc("block_n_end"(#loc145)) +#loc320 = loc("block_n_end"(#loc146)) +#loc321 = loc("offs_n"(#loc147)) +#loc323 = loc("l_i"(#loc150)) +#loc324 = loc("l_i"(#loc151)) +#loc325 = loc("acc"(#loc152)) +#loc326 = loc("acc"(#loc153)) +#loc327 = loc("mask"(#loc154)) +#loc328 = loc("mask"(#loc155)) +#loc329 = loc("off_hz"(#loc163)) +#loc330 = loc("off_hz"(#loc164)) +#loc331 = loc("l_ptrs"(#loc165)) +#loc332 = loc("l_ptrs"(#loc166)) +#loc333 = loc("l_ptrs"(#loc167)) +#loc334 = loc("lse"(#loc168)) +#loc335 = loc("lse"(#loc169)) +#loc336 = loc(callsite(#loc213 at #loc214)) +#loc337 = loc(callsite(#loc215 at #loc214)) +#loc338 = loc(callsite(#loc216 at #loc214)) +#loc339 = loc(callsite(#loc217 at #loc214)) +#loc340 = loc(callsite(#loc218 at #loc214)) +#loc341 = loc(callsite(#loc34 at #loc214)) +#loc342 = loc(callsite(#loc35 at #loc214)) +#loc343 = loc(callsite(#loc42 at #loc225)) +#loc344 = loc(callsite(#loc44 at #loc225)) +#loc345 = loc(callsite(#loc230 at #loc231)) +#loc346 = loc(callsite(#loc232 at #loc231)) +#loc347 = loc(callsite(#loc233 at #loc231)) +#loc348 = loc(callsite(#loc234 at #loc231)) +#loc349 = loc(callsite(#loc235 at #loc231)) +#loc350 = loc("l_i"(#loc236)) +#loc351 = loc(callsite(#loc237 at #loc231)) +#loc352 = loc(callsite(#loc238 at #loc231)) +#loc353 = loc(callsite(#loc239 at #loc231)) +#loc354 = loc(callsite(#loc240 at #loc231)) +#loc355 = loc(callsite(#loc241 at #loc231)) +#loc356 = loc(callsite(#loc242 at #loc231)) +#loc357 = loc(callsite(#loc243 at #loc231)) +#loc358 = loc(callsite(#loc244 at #loc231)) +#loc359 = loc(callsite(#loc245 at #loc231)) +#loc360 = loc(callsite(#loc246 at #loc231)) +#loc361 = loc(callsite(#loc247 at #loc231)) +#loc362 = loc(callsite(#loc248 at #loc231)) +#loc363 = loc(callsite(#loc249 at #loc231)) +#loc364 = loc(callsite(#loc250 at #loc231)) +#loc365 = loc(callsite(#loc251 at #loc231)) +#loc366 = loc(callsite(#loc252 at #loc231)) +#loc367 = loc(callsite(#loc253 at #loc231)) +#loc368 = loc(callsite(#loc254 at #loc231)) +#loc369 = loc(callsite(#loc255 at #loc231)) +#loc370 = loc(callsite(#loc256 at #loc231)) +#loc371 = loc(callsite(#loc257 at #loc231)) +#loc372 = loc(callsite(#loc258 at #loc231)) +#loc373 = loc(callsite(#loc259 at #loc231)) +#loc374 = loc(callsite(#loc260 at #loc231)) +#loc375 = loc(callsite(#loc261 at #loc231)) +#loc376 = loc(callsite(#loc262 at #loc231)) +#loc377 = loc(callsite(#loc263 at #loc231)) +#loc378 = loc(callsite(#loc264 at #loc231)) +#loc379 = loc(callsite(#loc265 at #loc231)) +#loc380 = loc(callsite(#loc266 at #loc231)) +#loc381 = loc(callsite(#loc267 at #loc231)) +#loc382 = loc(callsite(#loc268 at #loc231)) +#loc383 = loc(callsite(#loc269 at #loc231)) +#loc384 = loc(callsite(#loc270 at #loc231)) +#loc385 = loc(callsite(#loc271 at #loc231)) +#loc386 = loc(callsite(#loc272 at #loc231)) +#loc387 = loc(callsite(#loc273 at #loc231)) +#loc388 = loc(callsite(#loc274 at #loc231)) +#loc389 = loc(callsite(#loc275 at #loc231)) +#loc390 = loc(callsite(#loc276 at #loc231)) +#loc391 = loc(callsite(#loc277 at #loc231)) +#loc393 = loc(callsite(#loc279 at #loc231)) +#loc394 = loc(callsite(#loc280 at #loc231)) +#loc395 = loc(callsite(#loc281 at #loc231)) +#loc396 = loc(callsite(#loc282 at #loc231)) +#loc397 = loc(callsite(#loc283 at #loc231)) +#loc398 = loc(callsite(#loc284 at #loc231)) +#loc399 = loc(callsite(#loc285 at #loc231)) +#loc400 = loc(callsite(#loc286 at #loc231)) +#loc401 = loc(callsite(#loc287 at #loc231)) +#loc403 = loc(callsite(#loc289 at #loc231)) +#loc404 = loc(callsite(#loc290 at #loc231)) +#loc405 = loc(callsite(#loc291 at #loc231)) +#loc406 = loc(callsite(#loc292 at #loc231)) +#loc407 = loc(callsite(#loc293 at #loc231)) +#loc408 = loc(callsite(#loc294 at #loc52)) +#loc409 = loc(callsite(#loc296 at #loc52)) +#loc410 = loc(callsite(#loc313 at #loc52)) +#loc411 = loc(callsite(#loc230 at #loc322)) +#loc412 = loc(callsite(#loc247 at #loc322)) +#loc413 = loc(callsite(#loc248 at #loc322)) +#loc414 = loc(callsite(#loc249 at #loc322)) +#loc415 = loc(callsite(#loc250 at #loc322)) +#loc416 = loc(callsite(#loc251 at #loc322)) +#loc417 = loc(callsite(#loc252 at #loc322)) +#loc418 = loc(callsite(#loc253 at #loc322)) +#loc419 = loc(callsite(#loc254 at #loc322)) +#loc420 = loc(callsite(#loc277 at #loc322)) +#loc422 = loc(callsite(#loc279 at #loc322)) +#loc423 = loc(callsite(#loc280 at #loc322)) +#loc424 = loc(callsite(#loc281 at #loc322)) +#loc425 = loc(callsite(#loc282 at #loc322)) +#loc426 = loc(callsite(#loc283 at #loc322)) +#loc427 = loc(callsite(#loc284 at #loc322)) +#loc428 = loc(callsite(#loc285 at #loc322)) +#loc429 = loc(callsite(#loc286 at #loc322)) +#loc430 = loc(callsite(#loc287 at #loc322)) +#loc432 = loc(callsite(#loc289 at #loc322)) +#loc433 = loc(callsite(#loc290 at #loc322)) +#loc434 = loc(callsite(#loc291 at #loc322)) +#loc435 = loc(callsite(#loc292 at #loc322)) +#loc436 = loc(callsite(#loc293 at #loc322)) +#loc437 = loc(callsite(#loc294 at #loc149)) +#loc438 = loc(callsite(#loc296 at #loc149)) +#loc439 = loc(callsite(#loc313 at #loc149)) +#loc440 = loc(callsite(#loc216 at #loc345)) +#loc441 = loc(callsite(#loc218 at #loc345)) +#loc442 = loc(callsite(#loc34 at #loc345)) +#loc443 = loc(callsite(#loc53 at #loc346)) +#loc444 = loc(callsite(#loc53 at #loc347)) +#loc445 = loc("m_i"(#loc350)) +#loc446 = loc(callsite(#loc216 at #loc361)) +#loc447 = loc(callsite(#loc35 at #loc345)) +#loc448 = loc(callsite(#loc35 at #loc361)) +#loc449 = loc(callsite(#loc213 at #loc345)) +#loc450 = loc(callsite(#loc215 at #loc345)) +#loc451 = loc(callsite(#loc218 at #loc361)) +#loc452 = loc(callsite(#loc100 at #loc392)) +#loc454 = loc(callsite(#loc112 at #loc402)) +#loc456 = loc(callsite(#loc295 at #loc409)) +#loc457 = loc(callsite(#loc297 at #loc409)) +#loc458 = loc(callsite(#loc298 at #loc409)) +#loc459 = loc(callsite(#loc299 at #loc409)) +#loc460 = loc(callsite(#loc300 at #loc409)) +#loc461 = loc(callsite(#loc301 at #loc409)) +#loc462 = loc(callsite(#loc302 at #loc409)) +#loc463 = loc(callsite(#loc303 at #loc409)) +#loc464 = loc(callsite(#loc304 at #loc409)) +#loc465 = loc(callsite(#loc305 at #loc409)) +#loc466 = loc(callsite(#loc306 at #loc409)) +#loc467 = loc(callsite(#loc307 at #loc409)) +#loc468 = loc(callsite(#loc308 at #loc409)) +#loc469 = loc(callsite(#loc309 at #loc409)) +#loc470 = loc(callsite(#loc310 at #loc409)) +#loc471 = loc(callsite(#loc311 at #loc409)) +#loc472 = loc(callsite(#loc312 at #loc409)) +#loc473 = loc(callsite(#loc35 at #loc411)) +#loc474 = loc(callsite(#loc35 at #loc412)) +#loc475 = loc(callsite(#loc213 at #loc411)) +#loc476 = loc(callsite(#loc215 at #loc411)) +#loc477 = loc(callsite(#loc216 at #loc411)) +#loc478 = loc(callsite(#loc218 at #loc411)) +#loc479 = loc(callsite(#loc34 at #loc411)) +#loc480 = loc(callsite(#loc216 at #loc412)) +#loc481 = loc(callsite(#loc218 at #loc412)) +#loc482 = loc(callsite(#loc100 at #loc421)) +#loc484 = loc(callsite(#loc112 at #loc431)) +#loc486 = loc(callsite(#loc295 at #loc438)) +#loc487 = loc(callsite(#loc297 at #loc438)) +#loc488 = loc(callsite(#loc298 at #loc438)) +#loc489 = loc(callsite(#loc299 at #loc438)) +#loc490 = loc(callsite(#loc300 at #loc438)) +#loc491 = loc(callsite(#loc301 at #loc438)) +#loc492 = loc(callsite(#loc302 at #loc438)) +#loc493 = loc(callsite(#loc303 at #loc438)) +#loc494 = loc(callsite(#loc304 at #loc438)) +#loc495 = loc(callsite(#loc305 at #loc438)) +#loc496 = loc(callsite(#loc306 at #loc438)) +#loc497 = loc(callsite(#loc307 at #loc438)) +#loc498 = loc(callsite(#loc308 at #loc438)) +#loc499 = loc(callsite(#loc309 at #loc438)) +#loc500 = loc(callsite(#loc310 at #loc438)) +#loc501 = loc(callsite(#loc311 at #loc438)) +#loc502 = loc(callsite(#loc312 at #loc438)) +#loc503 = loc("offs_n"(#loc445)) +#loc504 = loc(callsite(#loc102 at #loc452)) +#loc505 = loc(callsite(#loc114 at #loc454)) +#loc506 = loc(callsite(#loc102 at #loc482)) +#loc507 = loc(callsite(#loc114 at #loc484)) +#loc508 = loc("kv_offset"(#loc503)) +#loc509 = loc(callsite(#loc508 at #loc52)) +#loc510 = loc(callsite(#loc508 at #loc149)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f93e3264f58f7f3292e5d0f605d1514d7dc0dbef --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XTTIGBSHMYAXQBEFBY4RY4WSZYSY3LKZVZUG4CRC3T6RWJZY4DWA/triton_tem_fused_0.ttir @@ -0,0 +1,848 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":172:41) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":538:16) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:51) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:34) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":198:45) +#loc177 = loc("arg_Q"(#loc)) +#loc178 = loc("arg_K"(#loc)) +#loc179 = loc("arg_V"(#loc)) +#loc180 = loc("arg_LSE"(#loc)) +#loc181 = loc("arg_MAX"(#loc)) +#loc182 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc183 = loc("arg_KV_IDX"(#loc)) +#loc184 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc185 = loc("arg_FULL_KV_IDX"(#loc)) +#loc186 = loc("in_ptr9"(#loc)) +#loc187 = loc("out_ptr0"(#loc)) +#loc188 = loc("ks0"(#loc)) +#loc189 = loc("ks1"(#loc)) +#loc190 = loc("ks2"(#loc)) +#loc191 = loc("ks3"(#loc)) +#loc192 = loc("ks4"(#loc)) +#loc193 = loc("ks5"(#loc)) +#loc241 = loc(callsite(#loc58 at #loc2)) +#loc286 = loc("m_ij"(#loc105)) +#loc296 = loc("l_i"(#loc117)) +#loc332 = loc(callsite(#loc58 at #loc155)) +#loc401 = loc(callsite(#loc286 at #loc241)) +#loc411 = loc(callsite(#loc296 at #loc241)) +#loc430 = loc(callsite(#loc286 at #loc332)) +#loc440 = loc(callsite(#loc296 at #loc332)) +#loc460 = loc(callsite(#loc1 at #loc401)) +#loc462 = loc(callsite(#loc1 at #loc411)) +#loc490 = loc(callsite(#loc1 at #loc430)) +#loc492 = loc(callsite(#loc1 at #loc440)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc)), %ks2: i32 loc("ks2"(#loc)), %ks3: i32 loc("ks3"(#loc)), %ks4: i32 loc("ks4"(#loc)), %ks5: i32 loc("ks5"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_2 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_3 = arith.constant dense : tensor<128x64xi1> loc(#loc194) + %cst_4 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc194) + %cst_5 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc194) + %cst_6 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_8 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc345) + %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc346) + %cst_9 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc197) + %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc198) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %HQ = arith.constant 32 : i32 loc(#loc199) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c4096_i32 : i32 loc(#loc10) + %1 = arith.muli %ks1, %c1024_i32 : i32 loc(#loc11) + %2 = arith.muli %ks1, %c128_i32 : i32 loc(#loc12) + %q_start = tt.get_program_id x : i32 loc(#loc200) + %off_zq = tt.get_program_id y : i32 loc(#loc201) + %off_hq = tt.get_program_id z : i32 loc(#loc202) + %off_zkv = arith.remsi %off_zq, %c2_i32 : i32 loc(#loc203) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc204) + %q_offset = arith.muli %off_zq, %0 : i32 loc(#loc205) + %q_offset_11 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc206) + %q_offset_12 = arith.addi %q_offset, %q_offset_11 : i32 loc(#loc207) + %k_offset = arith.muli %off_zkv, %1 : i32 loc(#loc208) + %k_offset_13 = arith.muli %off_hkv, %2 : i32 loc(#loc209) + %k_offset_14 = arith.addi %k_offset, %k_offset_13 : i32 loc(#loc210) + %Q = tt.addptr %arg_Q, %q_offset_12 : !tt.ptr, i32 loc(#loc211) + %K = tt.addptr %arg_K, %k_offset_14 : !tt.ptr, i32 loc(#loc212) + %V = tt.addptr %arg_V, %k_offset_14 : !tt.ptr, i32 loc(#loc213) + %stride_kv_idx_h = arith.muli %ks3, %ks4 : i32 loc(#loc214) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %ks2 : i32 loc(#loc215) + %sparse_kv_num_blks_offset_15 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc216) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc217) + %sparse_kv_idx_offset_16 = arith.muli %q_start, %ks4 : i32 loc(#loc218) + %sparse_kv_idx_offset_17 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_16 : i32 loc(#loc219) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc220) + %offs_m_18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc221) + %offs_m_19 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc222) + %offs_m_20 = arith.addi %offs_m_19, %offs_m_18 : tensor<128xi32> loc(#loc222) + %ptr = tt.expand_dims %offs_m_20 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc347) + %ptr_21 = arith.muli %ptr, %cst_9 : tensor<128x1xi32> loc(#loc348) + %ptr_22 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc349) + %ptr_23 = tt.addptr %ptr_22, %ptr_21 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc349) + %ptr_24 = tt.expand_dims %offs_m_18 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc350) + %ptr_25 = tt.broadcast %ptr_23 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc351) + %ptr_26 = tt.broadcast %ptr_24 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc351) + %ptr_27 = tt.addptr %ptr_25, %ptr_26 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc351) + %q_28 = tt.splat %ks0 : i32 -> tensor<128x1xi32> loc(#loc352) + %q_29 = arith.cmpi slt, %ptr, %q_28 : tensor<128x1xi32> loc(#loc352) + %q_30 = tt.broadcast %q_29 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc345) + %q_31 = tt.load %ptr_27, %q_30, %q : tensor<128x128x!tt.ptr> loc(#loc345) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_17 : !tt.ptr, i32 loc(#loc228) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc229) + %kv_start_32 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc230) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_15 : !tt.ptr, i32 loc(#loc231) + %kv_num_blocks_33 = tt.load %kv_num_blocks : !tt.ptr loc(#loc232) + %block_n_end = arith.muli %kv_num_blocks_33, %c2_i32 : i32 loc(#loc233) + %block_n_end_34 = arith.addi %ks1, %c63_i32 : i32 loc(#loc353) + %block_n_end_35 = arith.divsi %block_n_end_34, %c64_i32 : i32 loc(#loc354) + %block_n_end_36 = arith.maxsi %block_n_end_35, %c1_i32 : i32 loc(#loc235) + %block_n_end_37 = arith.minsi %block_n_end, %block_n_end_36 : i32 loc(#loc236) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc237) + %offs_n_38 = tt.splat %kv_start_32 : i32 -> tensor<64xi32> loc(#loc238) + %offs_n_39 = arith.addi %offs_n_38, %offs_n : tensor<64xi32> loc(#loc238) + %3 = tt.expand_dims %offs_n_39 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc55) + %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_37 step %c1_i32 iter_args(%acc_63 = %acc, %l_i_64 = %cst_10, %m_i = %cst_1, %offs_n_65 = %3, %kv_offset_66 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_32, %kv_offset_66 : i32 loc(#loc356) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc357) + %offs_n_load_67 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc357) + %ptr_68 = tt.expand_dims %offs_n_load_67 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc451) + %ptr_69 = arith.muli %ptr_68, %cst : tensor<64x1xi32> loc(#loc452) + %ptr_70 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc453) + %ptr_71 = tt.addptr %ptr_70, %ptr_69 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc453) + %ptr_72 = tt.broadcast %ptr_71 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc454) + %ptr_73 = tt.broadcast %ptr_24 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc454) + %ptr_74 = tt.addptr %ptr_72, %ptr_73 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc454) + %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc455) + %k_75 = arith.cmpi slt, %ptr_68, %k : tensor<64x1xi32> loc(#loc455) + %k_76 = tt.broadcast %k_75 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc456) + %k_77 = tt.load %ptr_74, %k_76, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc456) + %k_78 = tt.trans %k_77 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc359) + %qk = tt.dot %q_31, %k_78, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc360) + %qk_79 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc361) + %m = arith.remsi %ptr, %q_28 : tensor<128x1xi32> loc(#loc457) + %n = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc458) + %n_80 = arith.remsi %offs_n_65, %n : tensor<1x64xi32> loc(#loc458) + %post_mod_scores = arith.cmpi slt, %offs_n_65, %n : tensor<1x64xi32> loc(#loc364) + %post_mod_scores_81 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc365) + %post_mod_scores_82 = arith.select %post_mod_scores_81, %qk_79, %cst_6 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc365) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc366) + %tmp4_83 = tt.broadcast %n_80 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc366) + %tmp4_84 = arith.cmpi sge, %tmp4, %tmp4_83 : tensor<128x64xi32> loc(#loc366) + %tmp5 = arith.extsi %n_80 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc367) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc368) + %tmp7_85 = tt.load %tmp7 : !tt.ptr loc(#loc369) + %tmp8 = tt.splat %tmp7_85 : i64 -> tensor<1x64xi64> loc(#loc370) + %tmp8_86 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc370) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc371) + %tmp10 = tt.splat %tmp7_85 : i64 -> tensor<128x1xi64> loc(#loc372) + %tmp10_87 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc372) + %tmp11 = tt.broadcast %tmp8_86 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc373) + %tmp11_88 = tt.broadcast %tmp10_87 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc373) + %tmp11_89 = arith.andi %tmp11, %tmp11_88 : tensor<128x64xi1> loc(#loc373) + %tmp12 = arith.andi %tmp4_84, %tmp11_89 : tensor<128x64xi1> loc(#loc374) + %tmp15 = tt.splat %ks5 : i32 -> tensor<1x64xi32> loc(#loc375) + %tmp15_90 = arith.cmpi sge, %n_80, %tmp15 : tensor<1x64xi32> loc(#loc375) + %tmp16 = arith.remsi %n_80, %tmp15 : tensor<1x64xi32> loc(#loc376) + %tmp18 = arith.cmpi ne, %tmp16, %cst_5 : tensor<1x64xi32> loc(#loc377) + %tmp19 = arith.cmpi slt, %tmp16, %cst_5 : tensor<1x64xi32> loc(#loc378) + %tmp20 = arith.cmpi slt, %ks5, %c0_i32 : i32 loc(#loc379) + %tmp21 = tt.splat %tmp20 : i1 -> tensor<1x64xi1> loc(#loc380) + %tmp21_91 = arith.cmpi ne, %tmp19, %tmp21 : tensor<1x64xi1> loc(#loc380) + %tmp22 = arith.andi %tmp18, %tmp21_91 : tensor<1x64xi1> loc(#loc381) + %tmp23 = arith.addi %tmp16, %tmp15 : tensor<1x64xi32> loc(#loc382) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc383) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc384) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc385) + %tmp27 = arith.andi %tmp15_90, %tmp26 : tensor<1x64xi1> loc(#loc386) + %tmp28 = arith.subi %tmp4_83, %tmp4 : tensor<128x64xi32> loc(#loc387) + %tmp29 = tt.splat %ks5 : i32 -> tensor<128x64xi32> loc(#loc388) + %tmp29_92 = arith.remsi %tmp28, %tmp29 : tensor<128x64xi32> loc(#loc388) + %tmp30 = arith.cmpi ne, %tmp29_92, %cst_4 : tensor<128x64xi32> loc(#loc389) + %tmp31 = arith.cmpi slt, %tmp29_92, %cst_4 : tensor<128x64xi32> loc(#loc390) + %tmp32 = tt.splat %tmp20 : i1 -> tensor<128x64xi1> loc(#loc391) + %tmp32_93 = arith.cmpi ne, %tmp31, %tmp32 : tensor<128x64xi1> loc(#loc391) + %tmp33 = arith.andi %tmp30, %tmp32_93 : tensor<128x64xi1> loc(#loc392) + %tmp34 = arith.addi %tmp29_92, %tmp29 : tensor<128x64xi32> loc(#loc393) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29_92 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc394) + %tmp36 = arith.cmpi eq, %tmp35, %cst_4 : tensor<128x64xi32> loc(#loc395) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc396) + %tmp37_94 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc396) + %tmp38 = arith.ori %tmp12, %tmp37_94 : tensor<128x64xi1> loc(#loc397) + %mask_mod_output = arith.select %post_mod_scores_81, %tmp38, %cst_3 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc398) + %post_mod_scores_95 = arith.select %mask_mod_output, %post_mod_scores_82, %cst_6 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc399) + %post_mod_scores_96 = arith.mulf %post_mod_scores_95, %cst_2 : tensor<128x64xf32> loc(#loc400) + %m_ij = "tt.reduce"(%post_mod_scores_96) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_129: f32 loc(callsite(#loc1 at #loc401)), %m_ij_130: f32 loc(callsite(#loc1 at #loc401))): + %m_ij_131 = arith.maxnumf %m_ij_129, %m_ij_130 : f32 loc(#loc514) + tt.reduce.return %m_ij_131 : f32 loc(#loc459) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc459) + %m_ij_97 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc402) + %masked_out_rows = arith.cmpf oeq, %m_ij_97, %cst_1 : tensor<128xf32> loc(#loc403) + %m_ij_masked = arith.select %masked_out_rows, %cst_10, %m_ij_97 : tensor<128xi1>, tensor<128xf32> loc(#loc404) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc405) + %alpha_98 = math.exp2 %alpha : tensor<128xf32> loc(#loc406) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc407) + %p_99 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc408) + %p_100 = arith.subf %post_mod_scores_96, %p_99 : tensor<128x64xf32> loc(#loc408) + %p_101 = math.exp2 %p_100 : tensor<128x64xf32> loc(#loc409) + %l_i_102 = arith.mulf %l_i_64, %alpha_98 : tensor<128xf32> loc(#loc410) + %l_i_103 = "tt.reduce"(%p_101) <{axis = 1 : i32}> ({ + ^bb0(%l_i_129: f32 loc(callsite(#loc1 at #loc411)), %l_i_130: f32 loc(callsite(#loc1 at #loc411))): + %l_i_131 = arith.addf %l_i_129, %l_i_130 : f32 loc(#loc515) + tt.reduce.return %l_i_131 : f32 loc(#loc461) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc461) + %l_i_104 = arith.addf %l_i_102, %l_i_103 : tensor<128xf32> loc(#loc412) + %acc_105 = tt.expand_dims %alpha_98 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc413) + %acc_106 = tt.broadcast %acc_105 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc414) + %acc_107 = arith.mulf %acc_63, %acc_106 : tensor<128x128xf32> loc(#loc414) + %ptr_108 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc463) + %ptr_109 = tt.addptr %ptr_108, %ptr_69 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc463) + %ptr_110 = tt.broadcast %ptr_109 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc464) + %ptr_111 = tt.addptr %ptr_110, %ptr_73 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc464) + %v = tt.load %ptr_111, %k_76, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc465) + %acc_112 = arith.truncf %p_101 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc416) + %acc_113 = tt.dot %acc_112, %v, %acc_107, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc417) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc466) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc467) + %cur_block_114 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc468) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc469) + %next_block_115 = arith.cmpi slt, %next_block, %kv_num_blocks_33 : i32 loc(#loc470) + %next_block_116 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc471) + %next_block_117 = tt.load %next_block_116, %next_block_115 evictionPolicy = evict_last : !tt.ptr loc(#loc472) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc473) + %needs_jump_118 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc474) + %needs_jump_119 = arith.cmpi eq, %needs_jump_118, %c0_i32 : i32 loc(#loc475) + %jump_to_block = arith.subi %next_block_117, %cur_block_114 : i32 loc(#loc476) + %jump_to_block_120 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc477) + %jump_to_block_121 = arith.subi %jump_to_block_120, %c64_i32 : i32 loc(#loc478) + %offset = arith.extui %needs_jump_119 : i1 to i32 loc(#loc479) + %offset_122 = arith.muli %jump_to_block_121, %offset : i32 loc(#loc479) + %offset_123 = arith.subi %c1_i32, %offset : i32 loc(#loc480) + %offset_124 = arith.muli %offset_123, %c64_i32 : i32 loc(#loc481) + %offset_125 = arith.addi %offset_122, %offset_124 : i32 loc(#loc482) + %offs_n_126 = tt.splat %offset_125 : i32 -> tensor<1x64xi32> loc(#loc419) + %offs_n_127 = arith.addi %offs_n_65, %offs_n_126 : tensor<1x64xi32> loc(#loc419) + %kv_offset_128 = arith.addi %kv_offset_66, %offset_125 : i32 loc(#loc420) + scf.yield %acc_113, %l_i_104, %m_ij_97, %offs_n_127, %kv_offset_128 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc323) + } loc(#loc519) + %kv_indices_40 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_17 : !tt.ptr, i32 loc(#loc324) + %kv_start_41 = tt.load %kv_indices_40 : !tt.ptr loc(#loc325) + %kv_start_42 = arith.muli %kv_start_41, %c128_i32 : i32 loc(#loc326) + %kv_num_blocks_43 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_15 : !tt.ptr, i32 loc(#loc327) + %kv_num_blocks_44 = tt.load %kv_num_blocks_43 : !tt.ptr loc(#loc328) + %block_n_end_45 = arith.muli %kv_num_blocks_44, %c2_i32 : i32 loc(#loc329) + %block_n_end_46 = arith.minsi %block_n_end_45, %block_n_end_36 : i32 loc(#loc330) + %offs_n_47 = tt.splat %kv_start_42 : i32 -> tensor<64xi32> loc(#loc331) + %offs_n_48 = arith.addi %offs_n_47, %offs_n : tensor<64xi32> loc(#loc331) + %4 = tt.expand_dims %offs_n_48 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc154) + %kv_offset_49:5 = scf.for %start_n = %c0_i32 to %block_n_end_46 step %c1_i32 iter_args(%acc_63 = %kv_offset#0, %l_i_64 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_65 = %4, %kv_offset_66 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_42, %kv_offset_66 : i32 loc(#loc421) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc422) + %offs_n_load_67 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc422) + %ptr_68 = tt.expand_dims %offs_n_load_67 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc483) + %ptr_69 = arith.muli %ptr_68, %cst : tensor<64x1xi32> loc(#loc484) + %ptr_70 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc485) + %ptr_71 = tt.addptr %ptr_70, %ptr_69 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc485) + %ptr_72 = tt.broadcast %ptr_71 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc486) + %ptr_73 = tt.broadcast %ptr_24 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc486) + %ptr_74 = tt.addptr %ptr_72, %ptr_73 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc486) + %k = tt.splat %ks1 : i32 -> tensor<64x1xi32> loc(#loc487) + %k_75 = arith.cmpi slt, %ptr_68, %k : tensor<64x1xi32> loc(#loc487) + %k_76 = tt.broadcast %k_75 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc488) + %k_77 = tt.load %ptr_74, %k_76, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc488) + %k_78 = tt.trans %k_77 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc424) + %qk = tt.dot %q_31, %k_78, %cst_8, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc425) + %qk_79 = arith.mulf %qk, %cst_7 : tensor<128x64xf32> loc(#loc426) + %post_mod_scores = tt.splat %ks1 : i32 -> tensor<1x64xi32> loc(#loc427) + %post_mod_scores_80 = arith.cmpi slt, %offs_n_65, %post_mod_scores : tensor<1x64xi32> loc(#loc427) + %post_mod_scores_81 = tt.broadcast %post_mod_scores_80 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc428) + %post_mod_scores_82 = arith.select %post_mod_scores_81, %qk_79, %cst_6 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc428) + %post_mod_scores_83 = arith.mulf %post_mod_scores_82, %cst_2 : tensor<128x64xf32> loc(#loc429) + %m_ij = "tt.reduce"(%post_mod_scores_83) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_116: f32 loc(callsite(#loc1 at #loc430)), %m_ij_117: f32 loc(callsite(#loc1 at #loc430))): + %m_ij_118 = arith.maxnumf %m_ij_116, %m_ij_117 : f32 loc(#loc516) + tt.reduce.return %m_ij_118 : f32 loc(#loc489) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc489) + %m_ij_84 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc431) + %masked_out_rows = arith.cmpf oeq, %m_ij_84, %cst_1 : tensor<128xf32> loc(#loc432) + %m_ij_masked = arith.select %masked_out_rows, %cst_10, %m_ij_84 : tensor<128xi1>, tensor<128xf32> loc(#loc433) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc434) + %alpha_85 = math.exp2 %alpha : tensor<128xf32> loc(#loc435) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc436) + %p_86 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc437) + %p_87 = arith.subf %post_mod_scores_83, %p_86 : tensor<128x64xf32> loc(#loc437) + %p_88 = math.exp2 %p_87 : tensor<128x64xf32> loc(#loc438) + %l_i_89 = arith.mulf %l_i_64, %alpha_85 : tensor<128xf32> loc(#loc439) + %l_i_90 = "tt.reduce"(%p_88) <{axis = 1 : i32}> ({ + ^bb0(%l_i_116: f32 loc(callsite(#loc1 at #loc440)), %l_i_117: f32 loc(callsite(#loc1 at #loc440))): + %l_i_118 = arith.addf %l_i_116, %l_i_117 : f32 loc(#loc517) + tt.reduce.return %l_i_118 : f32 loc(#loc491) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc491) + %l_i_91 = arith.addf %l_i_89, %l_i_90 : tensor<128xf32> loc(#loc441) + %acc_92 = tt.expand_dims %alpha_85 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc442) + %acc_93 = tt.broadcast %acc_92 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc443) + %acc_94 = arith.mulf %acc_63, %acc_93 : tensor<128x128xf32> loc(#loc443) + %ptr_95 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc493) + %ptr_96 = tt.addptr %ptr_95, %ptr_69 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc493) + %ptr_97 = tt.broadcast %ptr_96 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc494) + %ptr_98 = tt.addptr %ptr_97, %ptr_73 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc494) + %v = tt.load %ptr_98, %k_76, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc495) + %acc_99 = arith.truncf %p_88 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc445) + %acc_100 = tt.dot %acc_99, %v, %acc_94, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc446) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc496) + %cur_block = tt.addptr %kv_indices_40, %cur_block_idx : !tt.ptr, i32 loc(#loc497) + %cur_block_101 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc498) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc499) + %next_block_102 = arith.cmpi slt, %next_block, %kv_num_blocks_44 : i32 loc(#loc500) + %next_block_103 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc501) + %next_block_104 = tt.load %next_block_103, %next_block_102 evictionPolicy = evict_last : !tt.ptr loc(#loc502) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc503) + %needs_jump_105 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc504) + %needs_jump_106 = arith.cmpi eq, %needs_jump_105, %c0_i32 : i32 loc(#loc505) + %jump_to_block = arith.subi %next_block_104, %cur_block_101 : i32 loc(#loc506) + %jump_to_block_107 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc507) + %jump_to_block_108 = arith.subi %jump_to_block_107, %c64_i32 : i32 loc(#loc508) + %offset = arith.extui %needs_jump_106 : i1 to i32 loc(#loc509) + %offset_109 = arith.muli %jump_to_block_108, %offset : i32 loc(#loc509) + %offset_110 = arith.subi %c1_i32, %offset : i32 loc(#loc510) + %offset_111 = arith.muli %offset_110, %c64_i32 : i32 loc(#loc511) + %offset_112 = arith.addi %offset_109, %offset_111 : i32 loc(#loc512) + %offs_n_113 = tt.splat %offset_112 : i32 -> tensor<1x64xi32> loc(#loc448) + %offs_n_114 = arith.addi %offs_n_65, %offs_n_113 : tensor<1x64xi32> loc(#loc448) + %kv_offset_115 = arith.addi %kv_offset_66, %offset_112 : i32 loc(#loc449) + scf.yield %acc_100, %l_i_91, %m_ij_84, %offs_n_114, %kv_offset_115 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc333) + } loc(#loc520) + %l_i_50 = arith.cmpf oeq, %kv_offset_49#1, %cst_10 : tensor<128xf32> loc(#loc334) + %l_i_51 = arith.select %l_i_50, %l_i, %kv_offset_49#1 : tensor<128xi1>, tensor<128xf32> loc(#loc198) + %acc_52 = tt.expand_dims %l_i_51 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc335) + %acc_53 = tt.broadcast %acc_52 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc336) + %acc_54 = arith.divf %kv_offset_49#0, %acc_53 : tensor<128x128xf32> loc(#loc336) + %mask_55 = arith.cmpi slt, %ptr_24, %mask : tensor<1x128xi32> loc(#loc197) + %mask_56 = tt.broadcast %mask_55 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc337) + %mask_57 = arith.andi %q_30, %mask_56 : tensor<128x128xi1> loc(#loc337) + %5 = tt.splat %q_offset_11 : i32 -> tensor<1x128xi32> loc(#loc160) + %6 = arith.addi %ptr_24, %5 : tensor<1x128xi32> loc(#loc160) + %7 = tt.broadcast %6 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc161) + %8 = tt.broadcast %ptr_21 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc161) + %9 = arith.addi %7, %8 : tensor<128x128xi32> loc(#loc161) + %10 = arith.muli %off_zq, %c4096_i32 : i32 loc(#loc162) + %11 = arith.muli %10, %ks0 : i32 loc(#loc163) + %12 = tt.splat %11 : i32 -> tensor<128x128xi32> loc(#loc164) + %13 = arith.addi %9, %12 : tensor<128x128xi32> loc(#loc164) + %14 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc165) + %15 = tt.addptr %14, %13 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc165) + %16 = arith.truncf %acc_54 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc166) + tt.store %15, %16, %mask_57 : tensor<128x128x!tt.ptr> loc(#loc166) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc338) + %off_hz_58 = arith.addi %off_hz, %off_hq : i32 loc(#loc339) + %l_ptrs = arith.muli %off_hz_58, %ks0 : i32 loc(#loc340) + %l_ptrs_59 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc341) + %l_ptrs_60 = tt.splat %l_ptrs_59 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc342) + %l_ptrs_61 = tt.addptr %l_ptrs_60, %offs_m_20 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc342) + %lse = math.log2 %l_i_51 : tensor<128xf32> loc(#loc343) + %lse_62 = arith.addf %kv_offset_49#2, %lse : tensor<128xf32> loc(#loc344) + %17 = tt.splat %ks0 : i32 -> tensor<128xi32> loc(#loc174) + %18 = arith.cmpi slt, %offs_m_20, %17 : tensor<128xi32> loc(#loc174) + tt.store %l_ptrs_61, %lse_62, %18 : tensor<128x!tt.ptr> loc(#loc175) + tt.return loc(#loc176) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:23) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":146:101) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":136:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:34) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":90:9) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":85:54) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:54) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":86:63) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":97:28) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":98:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":99:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":103:23) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":104:24) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":107:36) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":108:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":111:12) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":112:12) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":113:12) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":130:26) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:51) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":142:74) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:46) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:97) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":143:64) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:23) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:46) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":144:33) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:27) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:38) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:20) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:56) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":284:49) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":292:52) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":151:26) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:23) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":152:37) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:42) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":153:28) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:45) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:92) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:102) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":154:65) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:37) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":159:24) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":167:48) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":502:40) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":342:32) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":346:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":347:107) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":349:17) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":351:19) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":353:14) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":257:21) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":358:36) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":359:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:44) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":367:69) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":373:23) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":374:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:33) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":376:23) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":377:22) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":378:23) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":379:23) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":380:23) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":381:23) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":384:24) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":385:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":387:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":388:92) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":389:92) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":390:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":391:24) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":392:24) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":393:39) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":394:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":395:24) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":396:24) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":397:23) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":398:25) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":399:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":400:92) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":401:25) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":402:24) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":403:24) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":404:39) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":405:25) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":406:24) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":407:24) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":412:73) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":414:69) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":417:27) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":421:27) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":423:35) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":424:51) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:31) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":428:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:51) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:39) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":429:21) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:16) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":434:24) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:22) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":436:16) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":439:107) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:22) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":440:44) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":247:33) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":545:63) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:38) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":248:24) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:109) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:113) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:55) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":249:25) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:30) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:35) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":250:60) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:34) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:48) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":251:63) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:29) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:47) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:61) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":252:42) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":548:26) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":549:21) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":549:8) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":181:35) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:27) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":182:41) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:51) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":183:32) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:49) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":184:69) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":186:28) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":193:52) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":206:26) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:20) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":208:16) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":214:30) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:49) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:62) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:80) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:87) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:75) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:25) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":218:110) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:26) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":221:31) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:32) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:23) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":222:40) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:33) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":223:20) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:48) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":227:29) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/55/c55rfucvxw2qjoobizmdt7kyopbcvfizourbkv33jvve2gfq4bpl.py":229:4) +#loc194 = loc(callsite(#loc1 at #loc2)) +#loc195 = loc("q"(#loc4)) +#loc196 = loc("acc"(#loc6)) +#loc197 = loc("mask"(#loc7)) +#loc198 = loc("l_i"(#loc8)) +#loc199 = loc("HQ"(#loc9)) +#loc200 = loc("q_start"(#loc13)) +#loc201 = loc("off_zq"(#loc14)) +#loc202 = loc("off_hq"(#loc15)) +#loc203 = loc("off_zkv"(#loc16)) +#loc204 = loc("off_hkv"(#loc17)) +#loc205 = loc("q_offset"(#loc18)) +#loc206 = loc("q_offset"(#loc19)) +#loc207 = loc("q_offset"(#loc20)) +#loc208 = loc("k_offset"(#loc21)) +#loc209 = loc("k_offset"(#loc22)) +#loc210 = loc("k_offset"(#loc23)) +#loc211 = loc("Q"(#loc24)) +#loc212 = loc("K"(#loc25)) +#loc213 = loc("V"(#loc26)) +#loc214 = loc("stride_kv_idx_h"(#loc27)) +#loc215 = loc("sparse_kv_num_blks_offset"(#loc28)) +#loc216 = loc("sparse_kv_num_blks_offset"(#loc29)) +#loc217 = loc("sparse_kv_idx_offset"(#loc30)) +#loc218 = loc("sparse_kv_idx_offset"(#loc31)) +#loc219 = loc("sparse_kv_idx_offset"(#loc32)) +#loc220 = loc("offs_m"(#loc33)) +#loc221 = loc("offs_m"(#loc34)) +#loc222 = loc("offs_m"(#loc35)) +#loc223 = loc("ptr"(#loc36)) +#loc224 = loc("ptr"(#loc37)) +#loc225 = loc("ptr"(#loc38)) +#loc226 = loc("ptr"(#loc39)) +#loc227 = loc("ptr"(#loc40)) +#loc228 = loc("kv_indices"(#loc42)) +#loc229 = loc("kv_start"(#loc43)) +#loc230 = loc("kv_start"(#loc44)) +#loc231 = loc("kv_num_blocks"(#loc45)) +#loc232 = loc("kv_num_blocks"(#loc46)) +#loc233 = loc("block_n_end"(#loc47)) +#loc234 = loc("block_n_end"(#loc49)) +#loc235 = loc("block_n_end"(#loc51)) +#loc236 = loc("block_n_end"(#loc52)) +#loc237 = loc("offs_n"(#loc53)) +#loc238 = loc("offs_n"(#loc54)) +#loc239 = loc("acc"(#loc56)) +#loc240 = loc("kv_base_offset"(#loc57)) +#loc242 = loc("offs_n_load"(#loc59)) +#loc243 = loc("k"(#loc60)) +#loc244 = loc("k"(#loc61)) +#loc245 = loc("qk"(#loc62)) +#loc246 = loc("qk"(#loc63)) +#loc247 = loc("m"(#loc65)) +#loc248 = loc("n"(#loc66)) +#loc249 = loc("post_mod_scores"(#loc67)) +#loc250 = loc("post_mod_scores"(#loc68)) +#loc251 = loc("tmp4"(#loc69)) +#loc252 = loc("tmp5"(#loc70)) +#loc253 = loc("tmp7"(#loc71)) +#loc254 = loc("tmp7"(#loc72)) +#loc255 = loc("tmp8"(#loc73)) +#loc256 = loc("tmp9"(#loc74)) +#loc257 = loc("tmp10"(#loc75)) +#loc258 = loc("tmp11"(#loc76)) +#loc259 = loc("tmp12"(#loc77)) +#loc260 = loc("tmp15"(#loc78)) +#loc261 = loc("tmp16"(#loc79)) +#loc262 = loc("tmp18"(#loc80)) +#loc263 = loc("tmp19"(#loc81)) +#loc264 = loc("tmp20"(#loc82)) +#loc265 = loc("tmp21"(#loc83)) +#loc266 = loc("tmp22"(#loc84)) +#loc267 = loc("tmp23"(#loc85)) +#loc268 = loc("tmp24"(#loc86)) +#loc269 = loc("tmp25"(#loc87)) +#loc270 = loc("tmp26"(#loc88)) +#loc271 = loc("tmp27"(#loc89)) +#loc272 = loc("tmp28"(#loc90)) +#loc273 = loc("tmp29"(#loc91)) +#loc274 = loc("tmp30"(#loc92)) +#loc275 = loc("tmp31"(#loc93)) +#loc276 = loc("tmp32"(#loc94)) +#loc277 = loc("tmp33"(#loc95)) +#loc278 = loc("tmp34"(#loc96)) +#loc279 = loc("tmp35"(#loc97)) +#loc280 = loc("tmp36"(#loc98)) +#loc281 = loc("tmp37"(#loc99)) +#loc282 = loc("tmp38"(#loc100)) +#loc283 = loc("mask_mod_output"(#loc101)) +#loc284 = loc("post_mod_scores"(#loc102)) +#loc285 = loc("post_mod_scores"(#loc103)) +#loc287 = loc("m_ij"(#loc107)) +#loc288 = loc("masked_out_rows"(#loc108)) +#loc289 = loc("m_ij_masked"(#loc109)) +#loc290 = loc("alpha"(#loc110)) +#loc291 = loc("alpha"(#loc111)) +#loc292 = loc("p"(#loc112)) +#loc293 = loc("p"(#loc113)) +#loc294 = loc("p"(#loc114)) +#loc295 = loc("l_i"(#loc115)) +#loc297 = loc("l_i"(#loc119)) +#loc298 = loc("acc"(#loc120)) +#loc299 = loc("acc"(#loc121)) +#loc300 = loc("v"(#loc122)) +#loc301 = loc("acc"(#loc123)) +#loc302 = loc("acc"(#loc124)) +#loc303 = loc("cur_block_idx"(#loc125)) +#loc304 = loc("offset"(#loc126)) +#loc305 = loc("cur_block"(#loc127)) +#loc306 = loc("cur_block"(#loc128)) +#loc307 = loc("next_block"(#loc129)) +#loc308 = loc("next_block"(#loc130)) +#loc309 = loc("next_block"(#loc131)) +#loc310 = loc("next_block"(#loc132)) +#loc311 = loc("needs_jump"(#loc133)) +#loc312 = loc("needs_jump"(#loc134)) +#loc313 = loc("needs_jump"(#loc135)) +#loc314 = loc("jump_to_block"(#loc136)) +#loc315 = loc("jump_to_block"(#loc137)) +#loc316 = loc("jump_to_block"(#loc138)) +#loc317 = loc("offset"(#loc139)) +#loc318 = loc("offset"(#loc140)) +#loc319 = loc("offset"(#loc141)) +#loc320 = loc("offset"(#loc142)) +#loc321 = loc("offs_n"(#loc143)) +#loc322 = loc("kv_offset"(#loc144)) +#loc323 = loc(callsite(#loc145 at #loc2)) +#loc324 = loc("kv_indices"(#loc146)) +#loc325 = loc("kv_start"(#loc147)) +#loc326 = loc("kv_start"(#loc148)) +#loc327 = loc("kv_num_blocks"(#loc149)) +#loc328 = loc("kv_num_blocks"(#loc150)) +#loc329 = loc("block_n_end"(#loc151)) +#loc330 = loc("block_n_end"(#loc152)) +#loc331 = loc("offs_n"(#loc153)) +#loc333 = loc(callsite(#loc145 at #loc155)) +#loc334 = loc("l_i"(#loc156)) +#loc335 = loc("acc"(#loc157)) +#loc336 = loc("acc"(#loc158)) +#loc337 = loc("mask"(#loc159)) +#loc338 = loc("off_hz"(#loc167)) +#loc339 = loc("off_hz"(#loc168)) +#loc340 = loc("l_ptrs"(#loc169)) +#loc341 = loc("l_ptrs"(#loc170)) +#loc342 = loc("l_ptrs"(#loc171)) +#loc343 = loc("lse"(#loc172)) +#loc344 = loc("lse"(#loc173)) +#loc345 = loc(callsite(#loc3 at #loc195)) +#loc346 = loc(callsite(#loc5 at #loc196)) +#loc347 = loc(callsite(#loc223 at #loc195)) +#loc348 = loc(callsite(#loc224 at #loc195)) +#loc349 = loc(callsite(#loc225 at #loc195)) +#loc350 = loc(callsite(#loc226 at #loc195)) +#loc351 = loc(callsite(#loc227 at #loc195)) +#loc352 = loc(callsite(#loc41 at #loc195)) +#loc353 = loc(callsite(#loc48 at #loc234)) +#loc354 = loc(callsite(#loc50 at #loc234)) +#loc355 = loc("l_i"(#loc239)) +#loc356 = loc(callsite(#loc240 at #loc241)) +#loc357 = loc(callsite(#loc242 at #loc241)) +#loc358 = loc(callsite(#loc243 at #loc241)) +#loc359 = loc(callsite(#loc244 at #loc241)) +#loc360 = loc(callsite(#loc245 at #loc241)) +#loc361 = loc(callsite(#loc246 at #loc241)) +#loc362 = loc(callsite(#loc247 at #loc241)) +#loc363 = loc(callsite(#loc248 at #loc241)) +#loc364 = loc(callsite(#loc249 at #loc241)) +#loc365 = loc(callsite(#loc250 at #loc241)) +#loc366 = loc(callsite(#loc251 at #loc241)) +#loc367 = loc(callsite(#loc252 at #loc241)) +#loc368 = loc(callsite(#loc253 at #loc241)) +#loc369 = loc(callsite(#loc254 at #loc241)) +#loc370 = loc(callsite(#loc255 at #loc241)) +#loc371 = loc(callsite(#loc256 at #loc241)) +#loc372 = loc(callsite(#loc257 at #loc241)) +#loc373 = loc(callsite(#loc258 at #loc241)) +#loc374 = loc(callsite(#loc259 at #loc241)) +#loc375 = loc(callsite(#loc260 at #loc241)) +#loc376 = loc(callsite(#loc261 at #loc241)) +#loc377 = loc(callsite(#loc262 at #loc241)) +#loc378 = loc(callsite(#loc263 at #loc241)) +#loc379 = loc(callsite(#loc264 at #loc241)) +#loc380 = loc(callsite(#loc265 at #loc241)) +#loc381 = loc(callsite(#loc266 at #loc241)) +#loc382 = loc(callsite(#loc267 at #loc241)) +#loc383 = loc(callsite(#loc268 at #loc241)) +#loc384 = loc(callsite(#loc269 at #loc241)) +#loc385 = loc(callsite(#loc270 at #loc241)) +#loc386 = loc(callsite(#loc271 at #loc241)) +#loc387 = loc(callsite(#loc272 at #loc241)) +#loc388 = loc(callsite(#loc273 at #loc241)) +#loc389 = loc(callsite(#loc274 at #loc241)) +#loc390 = loc(callsite(#loc275 at #loc241)) +#loc391 = loc(callsite(#loc276 at #loc241)) +#loc392 = loc(callsite(#loc277 at #loc241)) +#loc393 = loc(callsite(#loc278 at #loc241)) +#loc394 = loc(callsite(#loc279 at #loc241)) +#loc395 = loc(callsite(#loc280 at #loc241)) +#loc396 = loc(callsite(#loc281 at #loc241)) +#loc397 = loc(callsite(#loc282 at #loc241)) +#loc398 = loc(callsite(#loc283 at #loc241)) +#loc399 = loc(callsite(#loc284 at #loc241)) +#loc400 = loc(callsite(#loc285 at #loc241)) +#loc402 = loc(callsite(#loc287 at #loc241)) +#loc403 = loc(callsite(#loc288 at #loc241)) +#loc404 = loc(callsite(#loc289 at #loc241)) +#loc405 = loc(callsite(#loc290 at #loc241)) +#loc406 = loc(callsite(#loc291 at #loc241)) +#loc407 = loc(callsite(#loc292 at #loc241)) +#loc408 = loc(callsite(#loc293 at #loc241)) +#loc409 = loc(callsite(#loc294 at #loc241)) +#loc410 = loc(callsite(#loc295 at #loc241)) +#loc412 = loc(callsite(#loc297 at #loc241)) +#loc413 = loc(callsite(#loc298 at #loc241)) +#loc414 = loc(callsite(#loc299 at #loc241)) +#loc415 = loc(callsite(#loc300 at #loc241)) +#loc416 = loc(callsite(#loc301 at #loc241)) +#loc417 = loc(callsite(#loc302 at #loc241)) +#loc418 = loc(callsite(#loc304 at #loc2)) +#loc419 = loc(callsite(#loc321 at #loc2)) +#loc420 = loc(callsite(#loc322 at #loc2)) +#loc421 = loc(callsite(#loc240 at #loc332)) +#loc422 = loc(callsite(#loc242 at #loc332)) +#loc423 = loc(callsite(#loc243 at #loc332)) +#loc424 = loc(callsite(#loc244 at #loc332)) +#loc425 = loc(callsite(#loc245 at #loc332)) +#loc426 = loc(callsite(#loc246 at #loc332)) +#loc427 = loc(callsite(#loc249 at #loc332)) +#loc428 = loc(callsite(#loc250 at #loc332)) +#loc429 = loc(callsite(#loc285 at #loc332)) +#loc431 = loc(callsite(#loc287 at #loc332)) +#loc432 = loc(callsite(#loc288 at #loc332)) +#loc433 = loc(callsite(#loc289 at #loc332)) +#loc434 = loc(callsite(#loc290 at #loc332)) +#loc435 = loc(callsite(#loc291 at #loc332)) +#loc436 = loc(callsite(#loc292 at #loc332)) +#loc437 = loc(callsite(#loc293 at #loc332)) +#loc438 = loc(callsite(#loc294 at #loc332)) +#loc439 = loc(callsite(#loc295 at #loc332)) +#loc441 = loc(callsite(#loc297 at #loc332)) +#loc442 = loc(callsite(#loc298 at #loc332)) +#loc443 = loc(callsite(#loc299 at #loc332)) +#loc444 = loc(callsite(#loc300 at #loc332)) +#loc445 = loc(callsite(#loc301 at #loc332)) +#loc446 = loc(callsite(#loc302 at #loc332)) +#loc447 = loc(callsite(#loc304 at #loc155)) +#loc448 = loc(callsite(#loc321 at #loc155)) +#loc449 = loc(callsite(#loc322 at #loc155)) +#loc450 = loc("m_i"(#loc355)) +#loc451 = loc(callsite(#loc223 at #loc358)) +#loc452 = loc(callsite(#loc224 at #loc358)) +#loc453 = loc(callsite(#loc225 at #loc358)) +#loc454 = loc(callsite(#loc227 at #loc358)) +#loc455 = loc(callsite(#loc41 at #loc358)) +#loc456 = loc(callsite(#loc3 at #loc358)) +#loc457 = loc(callsite(#loc64 at #loc362)) +#loc458 = loc(callsite(#loc64 at #loc363)) +#loc459 = loc(callsite(#loc104 at #loc401)) +#loc461 = loc(callsite(#loc116 at #loc411)) +#loc463 = loc(callsite(#loc225 at #loc415)) +#loc464 = loc(callsite(#loc227 at #loc415)) +#loc465 = loc(callsite(#loc3 at #loc415)) +#loc466 = loc(callsite(#loc303 at #loc418)) +#loc467 = loc(callsite(#loc305 at #loc418)) +#loc468 = loc(callsite(#loc306 at #loc418)) +#loc469 = loc(callsite(#loc307 at #loc418)) +#loc470 = loc(callsite(#loc308 at #loc418)) +#loc471 = loc(callsite(#loc309 at #loc418)) +#loc472 = loc(callsite(#loc310 at #loc418)) +#loc473 = loc(callsite(#loc311 at #loc418)) +#loc474 = loc(callsite(#loc312 at #loc418)) +#loc475 = loc(callsite(#loc313 at #loc418)) +#loc476 = loc(callsite(#loc314 at #loc418)) +#loc477 = loc(callsite(#loc315 at #loc418)) +#loc478 = loc(callsite(#loc316 at #loc418)) +#loc479 = loc(callsite(#loc317 at #loc418)) +#loc480 = loc(callsite(#loc318 at #loc418)) +#loc481 = loc(callsite(#loc319 at #loc418)) +#loc482 = loc(callsite(#loc320 at #loc418)) +#loc483 = loc(callsite(#loc223 at #loc423)) +#loc484 = loc(callsite(#loc224 at #loc423)) +#loc485 = loc(callsite(#loc225 at #loc423)) +#loc486 = loc(callsite(#loc227 at #loc423)) +#loc487 = loc(callsite(#loc41 at #loc423)) +#loc488 = loc(callsite(#loc3 at #loc423)) +#loc489 = loc(callsite(#loc104 at #loc430)) +#loc491 = loc(callsite(#loc116 at #loc440)) +#loc493 = loc(callsite(#loc225 at #loc444)) +#loc494 = loc(callsite(#loc227 at #loc444)) +#loc495 = loc(callsite(#loc3 at #loc444)) +#loc496 = loc(callsite(#loc303 at #loc447)) +#loc497 = loc(callsite(#loc305 at #loc447)) +#loc498 = loc(callsite(#loc306 at #loc447)) +#loc499 = loc(callsite(#loc307 at #loc447)) +#loc500 = loc(callsite(#loc308 at #loc447)) +#loc501 = loc(callsite(#loc309 at #loc447)) +#loc502 = loc(callsite(#loc310 at #loc447)) +#loc503 = loc(callsite(#loc311 at #loc447)) +#loc504 = loc(callsite(#loc312 at #loc447)) +#loc505 = loc(callsite(#loc313 at #loc447)) +#loc506 = loc(callsite(#loc314 at #loc447)) +#loc507 = loc(callsite(#loc315 at #loc447)) +#loc508 = loc(callsite(#loc316 at #loc447)) +#loc509 = loc(callsite(#loc317 at #loc447)) +#loc510 = loc(callsite(#loc318 at #loc447)) +#loc511 = loc(callsite(#loc319 at #loc447)) +#loc512 = loc(callsite(#loc320 at #loc447)) +#loc513 = loc("offs_n"(#loc450)) +#loc514 = loc(callsite(#loc106 at #loc459)) +#loc515 = loc(callsite(#loc118 at #loc461)) +#loc516 = loc(callsite(#loc106 at #loc489)) +#loc517 = loc(callsite(#loc118 at #loc491)) +#loc518 = loc("kv_offset"(#loc513)) +#loc519 = loc(callsite(#loc518 at #loc2)) +#loc520 = loc(callsite(#loc518 at #loc155)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7ec8561193f1d92e72704052a0092c746c0913 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/__grp__triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin", "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin new file mode 100644 index 0000000000000000000000000000000000000000..8904e8c2bce71d156390a5607922aef93f1fbed7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3a688eb158ff7a6321c6878e01f44f6746467374 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.json @@ -0,0 +1 @@ +{"hash": "be6bebcc369cb0e47047747f9251da084c7bf038c9fe7f179c070929b2fde9a9", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 512, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir new file mode 100644 index 0000000000000000000000000000000000000000..529515528cf752fc63d080fdc9f7d9110d2f4820 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.llir @@ -0,0 +1,815 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 3, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 24, !dbg !9 + %13 = lshr i32 %11, 5, !dbg !9 + %14 = and i32 %11, 7, !dbg !9 + %15 = lshr i32 %11, 3, !dbg !9 + %16 = and i32 %15, 7, !dbg !9 + %17 = or disjoint i32 %10, %14, !dbg !10 + %18 = or disjoint i32 %16, %10, !dbg !10 + %19 = icmp slt i32 %17, %4, !dbg !11 + %20 = icmp slt i32 %18, %4, !dbg !11 + %21 = or disjoint i32 %16, 8, !dbg !12 + %22 = shl nuw nsw i32 %14, 1, !dbg !12 + %23 = sext i32 %17 to i64, !dbg !13 + %24 = sext i32 %18 to i64, !dbg !13 + %.frozen = freeze i64 %3, !dbg !14 + %25 = sdiv i64 %23, %.frozen, !dbg !14 + %26 = mul i64 %25, %.frozen, !dbg !13 + %.decomposed = sub i64 %23, %26, !dbg !13 + %.frozen33 = freeze i64 %3, !dbg !14 + %27 = sdiv i64 %24, %.frozen33, !dbg !14 + %28 = mul i64 %27, %.frozen33, !dbg !13 + %.decomposed34 = sub i64 %24, %28, !dbg !13 + %29 = zext nneg i32 %16 to i64, !dbg !15 + %30 = zext nneg i32 %21 to i64, !dbg !15 + %31 = zext nneg i32 %22 to i64, !dbg !15 + %32 = shl nsw i64 %25, 4, !dbg !16 + %33 = mul i64 %3, %29, !dbg !17 + %34 = mul i64 %3, %30, !dbg !17 + %35 = shl i64 %3, 4, !dbg !18 + %36 = mul i64 %25, %35, !dbg !19 + %37 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed, !dbg !20 + %38 = getelementptr i32, ptr addrspace(1) %37, i64 %29, !dbg !20 + %39 = getelementptr i32, ptr addrspace(1) %38, i64 %32, !dbg !20 + %40 = getelementptr i32, ptr addrspace(1) %39, i64 %33, !dbg !20 + %41 = getelementptr i32, ptr addrspace(1) %40, i64 %36, !dbg !20 + %42 = getelementptr i32, ptr addrspace(1) %37, i64 %30, !dbg !20 + %43 = getelementptr i32, ptr addrspace(1) %42, i64 %32, !dbg !20 + %44 = getelementptr i32, ptr addrspace(1) %43, i64 %34, !dbg !20 + %45 = getelementptr i32, ptr addrspace(1) %44, i64 %36, !dbg !20 + %46 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %47 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %41, i64 %46, i1 %19) #4, !dbg !21 + %48 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %49 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %45, i64 %48, i1 %19) #4, !dbg !21 + %50 = lshr i32 %11, 4, !dbg !22 + %.lobit = and i32 %50, 1, !dbg !22 + %51 = and i32 %11, 8, !dbg !22 + %.not = icmp eq i32 %51, 0, !dbg !22 + %.lobit1 = lshr exact i32 %51, 3, !dbg !22 + %52 = and i32 %11, 32, !dbg !22 + %.not3 = icmp eq i32 %52, 0, !dbg !22 + %.lobit2 = lshr exact i32 %52, 5, !dbg !22 + %53 = xor i32 %.lobit1, 1, !dbg !26 + %54 = xor i32 %.lobit, 1, !dbg !26 + %55 = xor i32 %.lobit2, 1, !dbg !26 + %56 = mul nuw nsw i32 %47, %53, !dbg !27 + %57 = mul nuw nsw i32 %49, %53, !dbg !27 + %58 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 8, i32 31), !dbg !28 + %59 = add i32 %58, %56, !dbg !31 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %57, i32 8, i32 31), !dbg !28 + %61 = add i32 %60, %57, !dbg !31 + %62 = mul nuw nsw i32 %47, %.lobit1, !dbg !32 + %63 = mul nuw nsw i32 %49, %.lobit1, !dbg !32 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 8, i32 31), !dbg !28 + %65 = add i32 %64, %62, !dbg !31 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 8, i32 31), !dbg !28 + %67 = add i32 %66, %63, !dbg !31 + %68 = mul nuw nsw i32 %53, %16, !dbg !33 + %69 = mul nuw nsw i32 %21, %53, !dbg !33 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 8, i32 31), !dbg !28 + %71 = add i32 %70, %68, !dbg !31 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 8, i32 31), !dbg !28 + %73 = add i32 %72, %69, !dbg !31 + %74 = mul nuw nsw i32 %16, %.lobit1, !dbg !34 + %75 = mul nuw nsw i32 %21, %.lobit1, !dbg !34 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 8, i32 31), !dbg !28 + %77 = add i32 %76, %74, !dbg !31 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 8, i32 31), !dbg !28 + %79 = add i32 %78, %75, !dbg !31 + %80 = trunc i32 %50 to i1, !dbg !35 + %81 = icmp sge i32 %59, %65, !dbg !35 + %82 = icmp ne i32 %59, %65, !dbg !35 + %83 = icmp sle i32 %71, %77, !dbg !35 + %84 = or i1 %82, %83, !dbg !35 + %85 = and i1 %81, %84, !dbg !35 + %.not4 = xor i1 %85, %80, !dbg !35 + %86 = icmp sge i32 %61, %67, !dbg !35 + %87 = icmp ne i32 %61, %67, !dbg !35 + %88 = icmp sle i32 %73, %79, !dbg !35 + %89 = or i1 %87, %88, !dbg !35 + %90 = and i1 %86, %89, !dbg !35 + %.not5 = xor i1 %90, %80, !dbg !35 + %91 = xor i32 %65, %59, !dbg !36 + %92 = xor i32 %67, %61, !dbg !36 + %93 = select i1 %.not4, i32 0, i32 %91, !dbg !37 + %94 = select i1 %.not5, i32 0, i32 %92, !dbg !37 + %95 = xor i32 %93, %47, !dbg !38 + %96 = xor i32 %94, %49, !dbg !38 + %97 = xor i32 %77, %71, !dbg !39 + %98 = xor i32 %79, %73, !dbg !39 + %99 = select i1 %.not4, i32 0, i32 %97, !dbg !40 + %100 = select i1 %.not5, i32 0, i32 %98, !dbg !40 + %101 = xor i32 %99, %16, !dbg !41 + %102 = xor i32 %100, %21, !dbg !41 + %103 = mul nuw nsw i32 %95, %54, !dbg !27 + %104 = mul nuw nsw i32 %96, %54, !dbg !27 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !28 + %106 = add i32 %103, %105, !dbg !31 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 16, i32 31), !dbg !28 + %108 = add i32 %104, %107, !dbg !31 + %109 = mul nuw nsw i32 %95, %.lobit, !dbg !32 + %110 = mul nuw nsw i32 %96, %.lobit, !dbg !32 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 16, i32 31), !dbg !28 + %112 = add i32 %109, %111, !dbg !31 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 16, i32 31), !dbg !28 + %114 = add i32 %110, %113, !dbg !31 + %115 = mul nuw nsw i32 %101, %54, !dbg !33 + %116 = mul nuw nsw i32 %102, %54, !dbg !33 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 16, i32 31), !dbg !28 + %118 = add i32 %115, %117, !dbg !31 + %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !28 + %120 = add i32 %116, %119, !dbg !31 + %121 = mul nuw nsw i32 %101, %.lobit, !dbg !34 + %122 = mul nuw nsw i32 %102, %.lobit, !dbg !34 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !28 + %124 = add i32 %121, %123, !dbg !31 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 16, i32 31), !dbg !28 + %126 = add i32 %122, %125, !dbg !31 + %127 = icmp slt i32 %106, %112, !dbg !42 + %128 = icmp slt i32 %108, %114, !dbg !42 + %129 = icmp eq i32 %106, %112, !dbg !43 + %130 = icmp eq i32 %108, %114, !dbg !43 + %131 = icmp sgt i32 %118, %124, !dbg !44 + %132 = icmp sgt i32 %120, %126, !dbg !44 + %133 = and i1 %129, %131, !dbg !45 + %134 = and i1 %130, %132, !dbg !45 + %135 = or i1 %127, %133, !dbg !46 + %136 = or i1 %128, %134, !dbg !46 + %137 = zext i1 %135 to i32, !dbg !47 + %138 = zext i1 %136 to i32, !dbg !47 + %.not6 = icmp eq i32 %.lobit2, %137, !dbg !35 + %.not7 = icmp eq i32 %.lobit2, %138, !dbg !35 + %139 = xor i32 %106, %112, !dbg !36 + %140 = xor i32 %108, %114, !dbg !36 + %141 = select i1 %.not6, i32 0, i32 %139, !dbg !37 + %142 = select i1 %.not7, i32 0, i32 %140, !dbg !37 + %143 = xor i32 %141, %95, !dbg !38 + %144 = xor i32 %142, %96, !dbg !38 + %145 = xor i32 %118, %124, !dbg !39 + %146 = xor i32 %120, %126, !dbg !39 + %147 = select i1 %.not6, i32 0, i32 %145, !dbg !40 + %148 = select i1 %.not7, i32 0, i32 %146, !dbg !40 + %149 = xor i32 %147, %101, !dbg !41 + %150 = xor i32 %148, %102, !dbg !41 + %151 = mul nuw nsw i32 %143, %53, !dbg !27 + %152 = mul nuw nsw i32 %144, %53, !dbg !27 + %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %151, i32 8, i32 31), !dbg !28 + %154 = add i32 %151, %153, !dbg !31 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 8, i32 31), !dbg !28 + %156 = add i32 %152, %155, !dbg !31 + %157 = mul nuw nsw i32 %143, %.lobit1, !dbg !32 + %158 = mul nuw nsw i32 %144, %.lobit1, !dbg !32 + %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 8, i32 31), !dbg !28 + %160 = add i32 %157, %159, !dbg !31 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 8, i32 31), !dbg !28 + %162 = add i32 %158, %161, !dbg !31 + %163 = mul nuw nsw i32 %149, %53, !dbg !33 + %164 = mul nuw nsw i32 %150, %53, !dbg !33 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 8, i32 31), !dbg !28 + %166 = add i32 %163, %165, !dbg !31 + %167 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 8, i32 31), !dbg !28 + %168 = add i32 %164, %167, !dbg !31 + %169 = mul nuw nsw i32 %149, %.lobit1, !dbg !34 + %170 = mul nuw nsw i32 %150, %.lobit1, !dbg !34 + %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 8, i32 31), !dbg !28 + %172 = add i32 %169, %171, !dbg !31 + %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 8, i32 31), !dbg !28 + %174 = add i32 %170, %173, !dbg !31 + %175 = icmp slt i32 %154, %160, !dbg !42 + %176 = icmp slt i32 %156, %162, !dbg !42 + %177 = icmp eq i32 %154, %160, !dbg !43 + %178 = icmp eq i32 %156, %162, !dbg !43 + %179 = icmp sgt i32 %166, %172, !dbg !44 + %180 = icmp sgt i32 %168, %174, !dbg !44 + %181 = and i1 %177, %179, !dbg !45 + %182 = and i1 %178, %180, !dbg !45 + %183 = or i1 %175, %181, !dbg !46 + %184 = or i1 %176, %182, !dbg !46 + %185 = zext i1 %183 to i32, !dbg !47 + %186 = zext i1 %184 to i32, !dbg !47 + %.not8 = icmp eq i32 %.lobit2, %185, !dbg !35 + %.not9 = icmp eq i32 %.lobit2, %186, !dbg !35 + %187 = xor i32 %154, %160, !dbg !36 + %188 = xor i32 %156, %162, !dbg !36 + %189 = select i1 %.not8, i32 0, i32 %187, !dbg !37 + %190 = select i1 %.not9, i32 0, i32 %188, !dbg !37 + %191 = xor i32 %189, %143, !dbg !38 + %192 = xor i32 %190, %144, !dbg !38 + %193 = xor i32 %166, %172, !dbg !39 + %194 = xor i32 %168, %174, !dbg !39 + %195 = select i1 %.not8, i32 0, i32 %193, !dbg !40 + %196 = select i1 %.not9, i32 0, i32 %194, !dbg !40 + %197 = xor i32 %195, %149, !dbg !41 + %198 = xor i32 %196, %150, !dbg !41 + %199 = mul nuw nsw i32 %191, %55, !dbg !27 + %200 = mul nuw nsw i32 %192, %55, !dbg !27 + %201 = and i32 %13, 1, !dbg !28 + %202 = shl nuw nsw i32 %11, 1, !dbg !28 + %203 = and i32 %202, 48, !dbg !28 + %204 = or disjoint i32 %203, %22, !dbg !28 + %.idx31 = shl nuw nsw i32 %204, 3, !dbg !28 + %205 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx31, !dbg !28 + %206 = getelementptr i32, ptr addrspace(3) %205, i32 %201, !dbg !28 + %207 = insertelement <1 x i32> poison, i32 %199, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %207, i1 true) #4, !dbg !28 + %208 = getelementptr i8, ptr addrspace(3) %205, i32 8, !dbg !28 + %209 = getelementptr i32, ptr addrspace(3) %208, i32 %201, !dbg !28 + %210 = insertelement <1 x i32> poison, i32 %200, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %210, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %211 = icmp samesign ult i32 %11, 128, !dbg !28 + %212 = getelementptr i32, ptr addrspace(3) @global_smem, i32 %11, !dbg !28 + %213 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %214 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %213, i32 1, i32 31), !dbg !28 + %215 = add i32 %214, %213, !dbg !31 + %216 = and i32 %11, 897, !dbg !28 + %217 = icmp eq i32 %216, 0, !dbg !28 + %218 = insertelement <1 x i32> poison, i32 %215, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %218, i1 %217) #4, !dbg !28 + %219 = getelementptr i8, ptr addrspace(3) %212, i32 256, !dbg !28 + %220 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 1, i32 31), !dbg !28 + %222 = add i32 %221, %220, !dbg !31 + %223 = insertelement <1 x i32> poison, i32 %222, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %223, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %224 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %225 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %226 = mul nuw nsw i32 %191, %.lobit2, !dbg !32 + %227 = mul nuw nsw i32 %192, %.lobit2, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %228 = insertelement <1 x i32> poison, i32 %226, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %228, i1 true) #4, !dbg !28 + %229 = insertelement <1 x i32> poison, i32 %227, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %229, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %230 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %231 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %230, i32 1, i32 31), !dbg !28 + %232 = add i32 %231, %230, !dbg !31 + %233 = insertelement <1 x i32> poison, i32 %232, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %233, i1 %217) #4, !dbg !28 + %234 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %235 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %234, i32 1, i32 31), !dbg !28 + %236 = add i32 %235, %234, !dbg !31 + %237 = insertelement <1 x i32> poison, i32 %236, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %237, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %238 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %239 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %240 = mul nuw nsw i32 %197, %55, !dbg !33 + %241 = mul nuw nsw i32 %198, %55, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %242 = insertelement <1 x i32> poison, i32 %240, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %242, i1 true) #4, !dbg !28 + %243 = insertelement <1 x i32> poison, i32 %241, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %243, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %244 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 1, i32 31), !dbg !28 + %246 = add i32 %245, %244, !dbg !31 + %247 = insertelement <1 x i32> poison, i32 %246, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %247, i1 %217) #4, !dbg !28 + %248 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 1, i32 31), !dbg !28 + %250 = add i32 %249, %248, !dbg !31 + %251 = insertelement <1 x i32> poison, i32 %250, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %251, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %252 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %253 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %254 = mul nuw nsw i32 %197, %.lobit2, !dbg !34 + %255 = mul nuw nsw i32 %198, %.lobit2, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %256 = insertelement <1 x i32> poison, i32 %254, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %256, i1 true) #4, !dbg !28 + %257 = insertelement <1 x i32> poison, i32 %255, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %257, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %258 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %259 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %258, i32 1, i32 31), !dbg !28 + %260 = add i32 %259, %258, !dbg !31 + %261 = insertelement <1 x i32> poison, i32 %260, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %261, i1 %217) #4, !dbg !28 + %262 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %263 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %262, i32 1, i32 31), !dbg !28 + %264 = add i32 %263, %262, !dbg !31 + %265 = insertelement <1 x i32> poison, i32 %264, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %265, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %266 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %267 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %268 = icmp slt i32 %224, %238, !dbg !42 + %269 = icmp sge i32 %225, %239, !dbg !42 + %270 = icmp eq i32 %224, %238, !dbg !43 + %271 = icmp ne i32 %225, %239, !dbg !43 + %272 = icmp sgt i32 %252, %266, !dbg !44 + %273 = icmp sle i32 %253, %267, !dbg !44 + %274 = and i1 %270, %272, !dbg !45 + %.not15 = or i1 %271, %273, !dbg !46 + %275 = or i1 %268, %274, !dbg !46 + %.not12 = and i1 %269, %.not15, !dbg !47 + %276 = xor i32 %238, %224, !dbg !36 + %277 = xor i32 %239, %225, !dbg !36 + %278 = select i1 %275, i32 %276, i32 0, !dbg !37 + %279 = select i1 %.not12, i32 %277, i32 0, !dbg !37 + %280 = xor i32 %278, %191, !dbg !38 + %281 = xor i32 %279, %192, !dbg !38 + %282 = xor i32 %266, %252, !dbg !39 + %283 = xor i32 %267, %253, !dbg !39 + %284 = select i1 %275, i32 %282, i32 0, !dbg !40 + %285 = select i1 %.not12, i32 %283, i32 0, !dbg !40 + %286 = xor i32 %284, %197, !dbg !41 + %287 = xor i32 %285, %198, !dbg !41 + %288 = mul nuw nsw i32 %280, %54, !dbg !27 + %289 = mul nuw nsw i32 %281, %54, !dbg !27 + %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 16, i32 31), !dbg !28 + %291 = add i32 %288, %290, !dbg !31 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 16, i32 31), !dbg !28 + %293 = add i32 %289, %292, !dbg !31 + %294 = mul nuw nsw i32 %280, %.lobit, !dbg !32 + %295 = mul nuw nsw i32 %281, %.lobit, !dbg !32 + %296 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 16, i32 31), !dbg !28 + %297 = add i32 %294, %296, !dbg !31 + %298 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %295, i32 16, i32 31), !dbg !28 + %299 = add i32 %295, %298, !dbg !31 + %300 = mul nuw nsw i32 %286, %54, !dbg !33 + %301 = mul nuw nsw i32 %287, %54, !dbg !33 + %302 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %300, i32 16, i32 31), !dbg !28 + %303 = add i32 %300, %302, !dbg !31 + %304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %301, i32 16, i32 31), !dbg !28 + %305 = add i32 %301, %304, !dbg !31 + %306 = mul nuw nsw i32 %286, %.lobit, !dbg !34 + %307 = mul nuw nsw i32 %287, %.lobit, !dbg !34 + %308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 16, i32 31), !dbg !28 + %309 = add i32 %308, %306, !dbg !31 + %310 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 16, i32 31), !dbg !28 + %311 = add i32 %310, %307, !dbg !31 + %312 = icmp slt i32 %291, %297, !dbg !42 + %313 = icmp sge i32 %293, %299, !dbg !42 + %314 = icmp eq i32 %291, %297, !dbg !43 + %315 = icmp ne i32 %293, %299, !dbg !43 + %316 = icmp sgt i32 %303, %309, !dbg !44 + %317 = icmp sle i32 %305, %311, !dbg !44 + %318 = and i1 %314, %316, !dbg !45 + %.not21 = or i1 %315, %317, !dbg !46 + %319 = or i1 %312, %318, !dbg !46 + %.not18 = and i1 %313, %.not21, !dbg !47 + %320 = xor i32 %291, %297, !dbg !36 + %321 = xor i32 %293, %299, !dbg !36 + %322 = select i1 %319, i32 %320, i32 0, !dbg !37 + %323 = select i1 %.not18, i32 %321, i32 0, !dbg !37 + %324 = xor i32 %322, %280, !dbg !38 + %325 = xor i32 %323, %281, !dbg !38 + %326 = xor i32 %309, %303, !dbg !39 + %327 = xor i32 %311, %305, !dbg !39 + %328 = select i1 %319, i32 %326, i32 0, !dbg !40 + %329 = select i1 %.not18, i32 %327, i32 0, !dbg !40 + %330 = xor i32 %328, %286, !dbg !41 + %331 = xor i32 %329, %287, !dbg !41 + %332 = mul nuw nsw i32 %324, %53, !dbg !27 + %333 = mul nuw nsw i32 %325, %53, !dbg !27 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %332, i32 8, i32 31), !dbg !28 + %335 = add i32 %332, %334, !dbg !31 + %336 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 8, i32 31), !dbg !28 + %337 = add i32 %333, %336, !dbg !31 + %338 = mul nuw nsw i32 %324, %.lobit1, !dbg !32 + %339 = mul nuw nsw i32 %325, %.lobit1, !dbg !32 + %340 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %338, i32 8, i32 31), !dbg !28 + %341 = add i32 %338, %340, !dbg !31 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %339, i32 8, i32 31), !dbg !28 + %343 = add i32 %339, %342, !dbg !31 + %344 = mul nuw nsw i32 %330, %53, !dbg !33 + %345 = mul nuw nsw i32 %331, %53, !dbg !33 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 8, i32 31), !dbg !28 + %347 = add i32 %344, %346, !dbg !31 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !28 + %349 = add i32 %345, %348, !dbg !31 + %350 = mul nuw nsw i32 %330, %.lobit1, !dbg !34 + %351 = mul nuw nsw i32 %331, %.lobit1, !dbg !34 + %352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %350, i32 8, i32 31), !dbg !28 + %353 = add i32 %352, %350, !dbg !31 + %354 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 8, i32 31), !dbg !28 + %355 = add i32 %354, %351, !dbg !31 + %356 = icmp slt i32 %335, %341, !dbg !42 + %357 = icmp sge i32 %337, %343, !dbg !42 + %358 = icmp eq i32 %335, %341, !dbg !43 + %359 = icmp ne i32 %337, %343, !dbg !43 + %360 = icmp sgt i32 %347, %353, !dbg !44 + %361 = icmp sle i32 %349, %355, !dbg !44 + %362 = and i1 %358, %360, !dbg !45 + %.not27 = or i1 %359, %361, !dbg !46 + %363 = or i1 %356, %362, !dbg !46 + %.not24 = and i1 %357, %.not27, !dbg !47 + %364 = xor i32 %335, %341, !dbg !36 + %365 = xor i32 %337, %343, !dbg !36 + %366 = select i1 %363, i32 %364, i32 0, !dbg !37 + %367 = select i1 %.not24, i32 %365, i32 0, !dbg !37 + %368 = xor i32 %366, %324, !dbg !38 + %369 = xor i32 %367, %325, !dbg !38 + %370 = xor i32 %353, %347, !dbg !39 + %371 = xor i32 %355, %349, !dbg !39 + %372 = select i1 %363, i32 %370, i32 0, !dbg !40 + %373 = select i1 %.not24, i32 %371, i32 0, !dbg !40 + %374 = xor i32 %372, %330, !dbg !41 + %375 = xor i32 %373, %331, !dbg !41 + %376 = icmp slt i32 %368, %369, !dbg !42 + %377 = icmp eq i32 %368, %369, !dbg !43 + %378 = icmp sgt i32 %374, %375, !dbg !44 + %379 = and i1 %377, %378, !dbg !45 + %380 = or i1 %376, %379, !dbg !46 + %381 = xor i32 %369, %368, !dbg !36 + %382 = select i1 %380, i32 %381, i32 0, !dbg !37 + %383 = xor i32 %382, %368, !dbg !38 + %384 = xor i32 %382, %369, !dbg !38 + %385 = xor i32 %375, %374, !dbg !39 + %386 = select i1 %380, i32 %385, i32 0, !dbg !40 + %387 = xor i32 %386, %374, !dbg !41 + %388 = xor i32 %386, %375, !dbg !41 + %389 = mul nuw nsw i32 %383, %55, !dbg !27 + %390 = mul nuw nsw i32 %384, %55, !dbg !27 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %391 = insertelement <1 x i32> poison, i32 %389, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %391, i1 true) #4, !dbg !28 + %392 = insertelement <1 x i32> poison, i32 %390, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %392, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %393 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %394 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %393, i32 1, i32 31), !dbg !28 + %395 = add i32 %394, %393, !dbg !31 + %396 = insertelement <1 x i32> poison, i32 %395, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %396, i1 %217) #4, !dbg !28 + %397 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %398 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %397, i32 1, i32 31), !dbg !28 + %399 = add i32 %398, %397, !dbg !31 + %400 = insertelement <1 x i32> poison, i32 %399, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %400, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %401 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %402 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %403 = mul nuw nsw i32 %383, %.lobit2, !dbg !32 + %404 = mul nuw nsw i32 %384, %.lobit2, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %405 = insertelement <1 x i32> poison, i32 %403, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %405, i1 true) #4, !dbg !28 + %406 = insertelement <1 x i32> poison, i32 %404, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %406, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %407 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %408 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %407, i32 1, i32 31), !dbg !28 + %409 = add i32 %408, %407, !dbg !31 + %410 = insertelement <1 x i32> poison, i32 %409, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %410, i1 %217) #4, !dbg !28 + %411 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %411, i32 1, i32 31), !dbg !28 + %413 = add i32 %412, %411, !dbg !31 + %414 = insertelement <1 x i32> poison, i32 %413, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %414, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %415 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %416 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %417 = mul nuw nsw i32 %387, %55, !dbg !33 + %418 = mul nuw nsw i32 %388, %55, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %419 = insertelement <1 x i32> poison, i32 %417, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %419, i1 true) #4, !dbg !28 + %420 = insertelement <1 x i32> poison, i32 %418, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %420, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %421 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 1, i32 31), !dbg !28 + %423 = add i32 %422, %421, !dbg !31 + %424 = insertelement <1 x i32> poison, i32 %423, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %424, i1 %217) #4, !dbg !28 + %425 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %426 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %425, i32 1, i32 31), !dbg !28 + %427 = add i32 %426, %425, !dbg !31 + %428 = insertelement <1 x i32> poison, i32 %427, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %428, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %429 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %430 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %431 = mul nuw nsw i32 %387, %.lobit2, !dbg !34 + %432 = mul nuw nsw i32 %388, %.lobit2, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %433 = insertelement <1 x i32> poison, i32 %431, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %206, <1 x i32> %433, i1 true) #4, !dbg !28 + %434 = insertelement <1 x i32> poison, i32 %432, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, <1 x i32> %434, i1 true) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %435 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %212, i1 %211) #4, !dbg !28 + %436 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %435, i32 1, i32 31), !dbg !28 + %437 = add i32 %436, %435, !dbg !31 + %438 = insertelement <1 x i32> poison, i32 %437, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %212, <1 x i32> %438, i1 %217) #4, !dbg !28 + %439 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %219, i1 %211) #4, !dbg !28 + %440 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %439, i32 1, i32 31), !dbg !28 + %441 = add i32 %440, %439, !dbg !31 + %442 = insertelement <1 x i32> poison, i32 %441, i64 0, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, <1 x i32> %442, i1 %217) #4, !dbg !28 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !28 + %443 = load i32, ptr addrspace(3) %205, align 16, !dbg !28 + %444 = load i32, ptr addrspace(3) %208, align 8, !dbg !28 + %445 = icmp slt i32 %401, %415, !dbg !42 + %446 = icmp slt i32 %402, %416, !dbg !42 + %447 = icmp eq i32 %401, %415, !dbg !43 + %448 = icmp eq i32 %402, %416, !dbg !43 + %449 = icmp sgt i32 %429, %443, !dbg !44 + %450 = icmp sgt i32 %430, %444, !dbg !44 + %451 = and i1 %447, %449, !dbg !45 + %452 = and i1 %448, %450, !dbg !45 + %453 = or i1 %445, %451, !dbg !46 + %454 = or i1 %446, %452, !dbg !46 + %455 = xor i32 %415, %401, !dbg !36 + %456 = xor i32 %416, %402, !dbg !36 + %457 = select i1 %453, i32 %455, i32 0, !dbg !37 + %458 = select i1 %454, i32 %456, i32 0, !dbg !37 + %459 = xor i32 %457, %383, !dbg !38 + %460 = xor i32 %458, %384, !dbg !38 + %461 = xor i32 %443, %429, !dbg !39 + %462 = xor i32 %444, %430, !dbg !39 + %463 = select i1 %453, i32 %461, i32 0, !dbg !40 + %464 = select i1 %454, i32 %462, i32 0, !dbg !40 + %465 = xor i32 %463, %387, !dbg !41 + %466 = xor i32 %464, %388, !dbg !41 + %467 = mul nuw nsw i32 %459, %54, !dbg !27 + %468 = mul nuw nsw i32 %460, %54, !dbg !27 + %469 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %467, i32 16, i32 31), !dbg !28 + %470 = add i32 %467, %469, !dbg !31 + %471 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %468, i32 16, i32 31), !dbg !28 + %472 = add i32 %468, %471, !dbg !31 + %473 = mul nuw nsw i32 %459, %.lobit, !dbg !32 + %474 = mul nuw nsw i32 %460, %.lobit, !dbg !32 + %475 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 16, i32 31), !dbg !28 + %476 = add i32 %473, %475, !dbg !31 + %477 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %474, i32 16, i32 31), !dbg !28 + %478 = add i32 %474, %477, !dbg !31 + %479 = mul nuw nsw i32 %465, %54, !dbg !33 + %480 = mul nuw nsw i32 %466, %54, !dbg !33 + %481 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %479, i32 16, i32 31), !dbg !28 + %482 = add i32 %479, %481, !dbg !31 + %483 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %480, i32 16, i32 31), !dbg !28 + %484 = add i32 %480, %483, !dbg !31 + %485 = mul nuw nsw i32 %465, %.lobit, !dbg !34 + %486 = mul nuw nsw i32 %466, %.lobit, !dbg !34 + %487 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %485, i32 16, i32 31), !dbg !28 + %488 = add i32 %487, %485, !dbg !31 + %489 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %486, i32 16, i32 31), !dbg !28 + %490 = add i32 %489, %486, !dbg !31 + %491 = icmp slt i32 %470, %476, !dbg !42 + %492 = icmp slt i32 %472, %478, !dbg !42 + %493 = icmp eq i32 %470, %476, !dbg !43 + %494 = icmp eq i32 %472, %478, !dbg !43 + %495 = icmp sgt i32 %482, %488, !dbg !44 + %496 = icmp sgt i32 %484, %490, !dbg !44 + %497 = and i1 %493, %495, !dbg !45 + %498 = and i1 %494, %496, !dbg !45 + %499 = or i1 %491, %497, !dbg !46 + %500 = or i1 %492, %498, !dbg !46 + %501 = xor i32 %470, %476, !dbg !36 + %502 = xor i32 %472, %478, !dbg !36 + %503 = select i1 %499, i32 %501, i32 0, !dbg !37 + %504 = select i1 %500, i32 %502, i32 0, !dbg !37 + %505 = xor i32 %503, %459, !dbg !38 + %506 = xor i32 %504, %460, !dbg !38 + %507 = xor i32 %488, %482, !dbg !39 + %508 = xor i32 %490, %484, !dbg !39 + %509 = select i1 %499, i32 %507, i32 0, !dbg !40 + %510 = select i1 %500, i32 %508, i32 0, !dbg !40 + %511 = xor i32 %509, %465, !dbg !41 + %512 = xor i32 %510, %466, !dbg !41 + %513 = mul nuw nsw i32 %505, %53, !dbg !27 + %514 = mul nuw nsw i32 %506, %53, !dbg !27 + %515 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 8, i32 31), !dbg !28 + %516 = add i32 %513, %515, !dbg !31 + %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %514, i32 8, i32 31), !dbg !28 + %518 = add i32 %514, %517, !dbg !31 + %519 = mul nuw nsw i32 %505, %.lobit1, !dbg !32 + %520 = mul nuw nsw i32 %506, %.lobit1, !dbg !32 + %521 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %519, i32 8, i32 31), !dbg !28 + %522 = add i32 %519, %521, !dbg !31 + %523 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %520, i32 8, i32 31), !dbg !28 + %524 = add i32 %520, %523, !dbg !31 + %525 = mul nuw nsw i32 %511, %53, !dbg !33 + %526 = mul nuw nsw i32 %512, %53, !dbg !33 + %527 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %525, i32 8, i32 31), !dbg !28 + %528 = add i32 %525, %527, !dbg !31 + %529 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %526, i32 8, i32 31), !dbg !28 + %530 = add i32 %526, %529, !dbg !31 + %531 = mul nuw nsw i32 %511, %.lobit1, !dbg !34 + %532 = mul nuw nsw i32 %512, %.lobit1, !dbg !34 + %533 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %531, i32 8, i32 31), !dbg !28 + %534 = add i32 %533, %531, !dbg !31 + %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %532, i32 8, i32 31), !dbg !28 + %536 = add i32 %535, %532, !dbg !31 + %537 = icmp slt i32 %516, %522, !dbg !42 + %538 = icmp slt i32 %518, %524, !dbg !42 + %539 = icmp eq i32 %516, %522, !dbg !43 + %540 = icmp eq i32 %518, %524, !dbg !43 + %541 = icmp sgt i32 %528, %534, !dbg !44 + %542 = icmp sgt i32 %530, %536, !dbg !44 + %543 = and i1 %539, %541, !dbg !45 + %544 = and i1 %540, %542, !dbg !45 + %545 = or i1 %537, %543, !dbg !46 + %546 = or i1 %538, %544, !dbg !46 + %547 = xor i32 %534, %528, !dbg !39 + %548 = xor i32 %536, %530, !dbg !39 + %549 = select i1 %545, i32 %547, i32 0, !dbg !40 + %550 = select i1 %546, i32 %548, i32 0, !dbg !40 + %551 = xor i32 %549, %511, !dbg !41 + %552 = xor i32 %550, %512, !dbg !41 + %narrow = select i1 %19, i32 %47, i32 0, !dbg !48 + %553 = sext i32 %narrow to i64, !dbg !48 + %narrow28 = select i1 %19, i32 %49, i32 0, !dbg !48 + %554 = sext i32 %narrow28 to i64, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %555 = add nsw i64 %554, %553, !dbg !51 + %extelt.offset = lshr i64 %555, 32, !dbg !49 + %556 = trunc nuw i64 %extelt.offset to i32, !dbg !49 + %557 = trunc i64 %555 to i32, !dbg !49 + %558 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %557, i32 16, i32 31), !dbg !49 + %559 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %556, i32 16, i32 31), !dbg !49 + %560 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !49 + %561 = insertelement <2 x i32> %560, i32 %559, i64 1, !dbg !49 + %562 = bitcast <2 x i32> %561 to i64, !dbg !49 + %563 = add i64 %555, %562, !dbg !51 + %extelt.offset29 = lshr i64 %563, 32, !dbg !49 + %564 = trunc nuw i64 %extelt.offset29 to i32, !dbg !49 + %565 = trunc i64 %563 to i32, !dbg !49 + %566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %565, i32 8, i32 31), !dbg !49 + %567 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %564, i32 8, i32 31), !dbg !49 + %568 = insertelement <2 x i32> poison, i32 %566, i64 0, !dbg !49 + %569 = insertelement <2 x i32> %568, i32 %567, i64 1, !dbg !49 + %570 = bitcast <2 x i32> %569 to i64, !dbg !49 + %571 = add i64 %563, %570, !dbg !51 + %572 = icmp eq i32 %12, 0, !dbg !49 + %573 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %22, !dbg !49 + %574 = getelementptr i64, ptr addrspace(3) %573, i32 %201, !dbg !49 + %575 = insertelement <1 x i64> poison, i64 %571, i64 0, !dbg !49 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %574, <1 x i64> %575, i1 %572) #4, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %576 = icmp samesign ult i32 %11, 16, !dbg !49 + %577 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !49 + %578 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %577, i1 %576) #4, !dbg !49 + %extelt.offset30 = lshr i64 %578, 32, !dbg !49 + %579 = trunc nuw i64 %extelt.offset30 to i32, !dbg !49 + %580 = trunc i64 %578 to i32, !dbg !49 + %581 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %580, i32 1, i32 31), !dbg !49 + %582 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %579, i32 1, i32 31), !dbg !49 + %583 = insertelement <2 x i32> poison, i32 %581, i64 0, !dbg !49 + %584 = insertelement <2 x i32> %583, i32 %582, i64 1, !dbg !49 + %585 = bitcast <2 x i32> %584 to i64, !dbg !49 + %586 = add i64 %578, %585, !dbg !51 + %587 = and i32 %11, 1009, !dbg !49 + %588 = icmp eq i32 %587, 0, !dbg !49 + %589 = insertelement <1 x i64> poison, i64 %586, i64 0, !dbg !49 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %577, <1 x i64> %589, i1 %588) #4, !dbg !49 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !49 + %590 = load i64, ptr addrspace(3) %573, align 16, !dbg !49 + %591 = trunc i64 %590 to i32, !dbg !52 + %592 = icmp slt i64 %3, 2, !dbg !53 + %593 = icmp sgt i64 %3, 1, !dbg !54 + %594 = select i1 %593, i64 %3, i64 0, !dbg !55 + %595 = zext i1 %592 to i64, !dbg !56 + %596 = add i64 %594, %595, !dbg !57 + %597 = shl i64 %596, 4, !dbg !16 + %598 = mul i64 %597, %27, !dbg !58 + %.idx = shl nsw i64 %.decomposed34, 6, !dbg !59 + %599 = getelementptr i8, ptr addrspace(1) %1, i64 %.idx, !dbg !59 + %600 = getelementptr i32, ptr addrspace(1) %599, i64 %31, !dbg !59 + %601 = getelementptr i32, ptr addrspace(1) %600, i64 %598, !dbg !59 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %602 = and i32 %11, 3, !dbg !60 + %603 = shl nuw nsw i32 %602, 3, !dbg !60 + %604 = and i32 %202, 96, !dbg !60 + %605 = and i32 %11, 4, !dbg !60 + %606 = icmp eq i32 %605, 0, !dbg !60 + %607 = select i1 %606, i32 0, i32 192, !dbg !60 + %608 = select i1 %.not, i32 0, i32 260, !dbg !60 + %609 = or disjoint i32 %603, %604, !dbg !60 + %610 = xor i32 %609, %607, !dbg !60 + %611 = or disjoint i32 %610, %608, !dbg !60 + %612 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %611, !dbg !60 + %613 = insertelement <1 x i32> poison, i32 %551, i64 0, !dbg !60 + store <1 x i32> %613, ptr addrspace(3) %612, align 4, !dbg !60 + %614 = xor i32 %611, 4, !dbg !60 + %615 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %614, !dbg !60 + %616 = insertelement <1 x i32> poison, i32 %552, i64 0, !dbg !60 + store <1 x i32> %616, ptr addrspace(3) %615, align 4, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !60 + %617 = shl nuw nsw i32 %602, 5, !dbg !60 + %618 = and i32 %11, 28, !dbg !60 + %619 = select i1 %.not3, i32 0, i32 192, !dbg !60 + %620 = or disjoint i32 %617, %618, !dbg !60 + %621 = xor i32 %620, %619, !dbg !60 + %622 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %621, !dbg !60 + %623 = load i32, ptr addrspace(3) %622, align 4, !dbg !60 + %624 = xor i32 %621, 260, !dbg !60 + %625 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %624, !dbg !60 + %626 = load i32, ptr addrspace(3) %625, align 4, !dbg !60 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %623, i32 %626, ptr addrspace(1) %601, i1 %20) #4, !dbg !60 + %627 = mul i64 %25, %596, !dbg !61 + %628 = getelementptr i32, ptr addrspace(1) %2, i64 %.decomposed, !dbg !62 + %629 = getelementptr i32, ptr addrspace(1) %628, i64 %627, !dbg !62 + %630 = and i32 %11, 56, !dbg !63 + %631 = icmp eq i32 %630, 0, !dbg !63 + %632 = and i1 %631, %19, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %591, ptr addrspace(1) %629, i1 %632) #4, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", linkageName: "triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 25, column: 21, scope: !4) +!12 = !DILocation(line: 26, column: 38, scope: !4) +!13 = !DILocation(line: 32, column: 19, scope: !4) +!14 = !DILocation(line: 33, column: 19, scope: !4) +!15 = !DILocation(line: 35, column: 37, scope: !4) +!16 = !DILocation(line: 35, column: 45, scope: !4) +!17 = !DILocation(line: 35, column: 54, scope: !4) +!18 = !DILocation(line: 35, column: 64, scope: !4) +!19 = !DILocation(line: 35, column: 68, scope: !4) +!20 = !DILocation(line: 35, column: 30, scope: !4) +!21 = !DILocation(line: 35, column: 73, scope: !4) +!22 = !DILocation(line: 627, column: 44, scope: !23, inlinedAt: !25) +!23 = distinct !DILexicalBlockFile(scope: !4, file: !24, discriminator: 0) +!24 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!25 = !DILocation(line: 40, column: 67, scope: !4) +!26 = !DILocation(line: 537, column: 21, scope: !23, inlinedAt: !25) +!27 = !DILocation(line: 538, column: 40, scope: !23, inlinedAt: !25) +!28 = !DILocation(line: 291, column: 36, scope: !29, inlinedAt: !25) +!29 = distinct !DILexicalBlockFile(scope: !4, file: !30, discriminator: 0) +!30 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!31 = !DILocation(line: 261, column: 15, scope: !29, inlinedAt: !25) +!32 = !DILocation(line: 539, column: 41, scope: !23, inlinedAt: !25) +!33 = !DILocation(line: 548, column: 23, scope: !23, inlinedAt: !25) +!34 = !DILocation(line: 551, column: 23, scope: !23, inlinedAt: !25) +!35 = !DILocation(line: 599, column: 28, scope: !23, inlinedAt: !25) +!36 = !DILocation(line: 600, column: 38, scope: !23, inlinedAt: !25) +!37 = !DILocation(line: 600, column: 46, scope: !23, inlinedAt: !25) +!38 = !DILocation(line: 600, column: 15, scope: !23, inlinedAt: !25) +!39 = !DILocation(line: 601, column: 48, scope: !23, inlinedAt: !25) +!40 = !DILocation(line: 601, column: 59, scope: !23, inlinedAt: !25) +!41 = !DILocation(line: 601, column: 22, scope: !23, inlinedAt: !25) +!42 = !DILocation(line: 574, column: 22, scope: !23, inlinedAt: !25) +!43 = !DILocation(line: 591, column: 21, scope: !23, inlinedAt: !25) +!44 = !DILocation(line: 594, column: 40, scope: !23, inlinedAt: !25) +!45 = !DILocation(line: 594, column: 29, scope: !23, inlinedAt: !25) +!46 = !DILocation(line: 594, column: 23, scope: !23, inlinedAt: !25) +!47 = !DILocation(line: 599, column: 19, scope: !23, inlinedAt: !25) +!48 = !DILocation(line: 43, column: 34, scope: !4) +!49 = !DILocation(line: 291, column: 36, scope: !29, inlinedAt: !50) +!50 = !DILocation(line: 44, column: 26, scope: !4) +!51 = !DILocation(line: 261, column: 15, scope: !29, inlinedAt: !50) +!52 = !DILocation(line: 47, column: 21, scope: !4) +!53 = !DILocation(line: 48, column: 62, scope: !4) +!54 = !DILocation(line: 48, column: 88, scope: !4) +!55 = !DILocation(line: 48, column: 79, scope: !4) +!56 = !DILocation(line: 48, scope: !4) +!57 = !DILocation(line: 48, column: 70, scope: !4) +!58 = !DILocation(line: 48, column: 47, scope: !4) +!59 = !DILocation(line: 48, column: 25, scope: !4) +!60 = !DILocation(line: 48, column: 102, scope: !4) +!61 = !DILocation(line: 49, column: 34, scope: !4) +!62 = !DILocation(line: 49, column: 25, scope: !4) +!63 = !DILocation(line: 49, column: 89, scope: !4) +!64 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c1daf89d48fdf7546d46e8df47473ff239c2c4dc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ptx @@ -0,0 +1,1486 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 // -- Begin function triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3 +.visible .entry triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2, + .param .u64 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4, + .param .u32 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_6, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_7 +) +.reqntid 64 +{ + .reg .pred %p<159>; + .reg .b32 %r<507>; + .reg .b64 %rd<79>; + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_3]; +$L__tmp0: + .loc 1 23 28 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:23:28 + mov.u32 %r8, %ctaid.x; + .loc 1 23 33 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:23:33 + shl.b32 %r9, %r8, 3; + .loc 1 24 44 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:24:44 + mov.u32 %r1, %tid.x; + and.b32 %r10, %r1, 7; + bfe.u32 %r4, %r1, 3, 3; + .loc 1 24 23 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:24:23 + or.b32 %r11, %r9, %r10; + or.b32 %r12, %r4, %r9; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + cvt.s64.s32 %rd1, %r11; + cvt.s64.s32 %rd2, %r12; + .loc 1 33 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:33:19 + or.b64 %rd16, %rd1, %rd14; + and.b64 %rd17, %rd16, -4294967296; + setp.ne.b64 %p1, %rd17, 0; + cvt.u32.u64 %r506, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd77, %rd1, %rd14; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r13, %rd14; + div.u32 %r15, %r506, %r13; + cvt.u64.u32 %rd77, %r15; +$L__BB0_3: + .loc 1 0 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:0:19 + ld.param.b32 %r7, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_4]; + ld.param.b64 %rd13, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_2]; + ld.param.b64 %rd12, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_1]; + ld.param.b64 %rd11, [triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3_param_0]; + and.b32 %r2, %r1, 24; + shr.u32 %r3, %r1, 5; + or.b32 %r5, %r4, 8; + shl.b32 %r6, %r10, 1; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + mul.lo.s64 %rd18, %rd77, %rd14; + sub.s64 %rd7, %rd1, %rd18; + .loc 1 33 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:33:19 + or.b64 %rd19, %rd2, %rd14; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p2, %rd20, 0; + cvt.u32.u64 %r505, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd78, %rd2, %rd14; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r16, %rd14; + div.u32 %r18, %r505, %r16; + cvt.u64.u32 %rd78, %r18; +$L__BB0_6: + .loc 1 25 21 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:25:21 + setp.lt.s32 %p56, %r505, %r7; + setp.lt.s32 %p3, %r506, %r7; + .loc 1 32 19 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:32:19 + mul.lo.s64 %rd32, %rd78, %rd14; + sub.s64 %rd33, %rd2, %rd32; + .loc 1 35 37 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:37 + cvt.u64.u32 %rd34, %r4; + cvt.u64.u32 %rd35, %r5; + .loc 1 35 54 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:54 + mul.lo.s64 %rd36, %rd14, %rd34; + mul.lo.s64 %rd37, %rd14, %rd35; + .loc 1 35 68 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:68 + mul.lo.s64 %rd38, %rd14, %rd77; + .loc 1 35 30 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:30 + shl.b64 %rd39, %rd7, 2; + add.s64 %rd40, %rd11, %rd39; + mad.wide.u32 %rd41, %r4, 4, %rd40; + shl.b64 %rd42, %rd77, 6; + add.s64 %rd43, %rd41, %rd42; + shl.b64 %rd44, %rd36, 2; + add.s64 %rd45, %rd43, %rd44; + shl.b64 %rd46, %rd38, 6; + add.s64 %rd22, %rd45, %rd46; + shl.b64 %rd47, %rd37, 2; + add.s64 %rd48, %rd43, %rd47; + add.s64 %rd49, %rd48, %rd46; + add.s64 %rd25, %rd49, 32; + .loc 1 35 73 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:35:73 + // begin inline asm + mov.u64 %rd23, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd23, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r19, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r19 }, [ %rd22 + 0 ], %rd23; + // end inline asm + // begin inline asm + mov.u64 %rd26, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd26, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r20, 0x0; + @%p3 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd25 + 0 ], %rd26; + // end inline asm +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shr.u32 %r125, %r1, 4; + bfe.u32 %r126, %r1, 4, 1; + bfe.s32 %r127, %r1, 3, 1; + and.b32 %r128, %r1, 8; + bfe.u32 %r129, %r1, 3, 1; + bfe.s32 %r130, %r1, 5, 1; + bfe.u32 %r131, %r1, 5, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r132, %r129, 1; + xor.b32 %r133, %r126, 1; + xor.b32 %r134, %r131, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r135, %r19, %r132; + mul.lo.s32 %r136, %r20, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r137, %r135, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r138, %r137, %r135; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r139, %r136, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r140, %r139, %r136; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r141, %r19, %r129; + mul.lo.s32 %r142, %r20, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r143, %r141, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r144, %r143, %r141; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r145, %r142, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r146, %r145, %r142; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r147, %r132, %r4; + shl.b32 %r148, %r132, 3; + or.b32 %r149, %r147, %r148; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r150, %r147, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r151, %r150, %r147; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r152, %r149, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r153, %r152, %r149; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r154, %r4, %r129; + or.b32 %r155, %r154, %r128; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r156, %r154, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r157, %r156, %r154; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r158, %r155, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r159, %r158, %r155; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r160, %r125, 1; + setp.ne.b32 %p58, %r160, 0; + setp.ge.s32 %p59, %r138, %r144; + setp.ne.b32 %p60, %r138, %r144; + setp.le.s32 %p61, %r151, %r157; + or.pred %p62, %p60, %p61; + and.pred %p63, %p59, %p62; + xor.pred %p64, %p63, %p58; + setp.ge.s32 %p65, %r140, %r146; + setp.ne.b32 %p66, %r140, %r146; + setp.le.s32 %p67, %r153, %r159; + or.pred %p68, %p66, %p67; + and.pred %p69, %p65, %p68; + xor.pred %p70, %p69, %p58; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r161, %r144, %r138; + xor.b32 %r162, %r146, %r140; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r163, 0, %r161, %p64; + selp.b32 %r164, 0, %r162, %p70; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r165, %r163, %r19; + xor.b32 %r166, %r164, %r20; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r167, %r157, %r151; + xor.b32 %r168, %r159, %r153; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r169, 0, %r167, %p64; + selp.b32 %r170, 0, %r168, %p70; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r171, %r169, %r4; + xor.b32 %r172, %r170, %r5; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r173, %r165, %r133; + mul.lo.s32 %r174, %r166, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r175, %r173, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r176, %r173, %r175; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r177, %r174, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r178, %r174, %r177; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r179, %r165, %r126; + mul.lo.s32 %r180, %r166, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r181, %r179, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r182, %r179, %r181; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r183, %r180, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r184, %r180, %r183; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r185, %r171, %r133; + mul.lo.s32 %r186, %r172, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r187, %r185, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r188, %r185, %r187; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r189, %r186, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r190, %r186, %r189; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r191, %r171, %r126; + mul.lo.s32 %r192, %r172, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r193, %r191, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r194, %r191, %r193; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r195, %r192, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r196, %r192, %r195; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p71, %r176, %r182; + setp.lt.s32 %p72, %r178, %r184; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p73, %r176, %r182; + setp.eq.b32 %p74, %r178, %r184; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p75, %r188, %r194; + setp.gt.s32 %p76, %r190, %r196; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p77, %p73, %p75; + and.pred %p78, %p74, %p76; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p79, %p71, %p77; + or.pred %p80, %p72, %p78; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r197, 1, 0, %p79; + selp.b32 %r198, 1, 0, %p80; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p81, %r131, %r197; + setp.eq.b32 %p82, %r131, %r198; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r199, %r176, %r182; + xor.b32 %r200, %r178, %r184; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r201, 0, %r199, %p81; + selp.b32 %r202, 0, %r200, %p82; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r203, %r201, %r165; + xor.b32 %r204, %r202, %r166; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r205, %r188, %r194; + xor.b32 %r206, %r190, %r196; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r207, 0, %r205, %p81; + selp.b32 %r208, 0, %r206, %p82; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r209, %r207, %r171; + xor.b32 %r210, %r208, %r172; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r211, %r203, %r132; + mul.lo.s32 %r212, %r204, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r213, %r211, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r214, %r211, %r213; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r215, %r212, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r216, %r212, %r215; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r217, %r203, %r129; + mul.lo.s32 %r218, %r204, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r219, %r217, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r220, %r217, %r219; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r221, %r218, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r222, %r218, %r221; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r223, %r209, %r132; + mul.lo.s32 %r224, %r210, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r225, %r223, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r226, %r223, %r225; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r227, %r224, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r228, %r224, %r227; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r229, %r209, %r129; + mul.lo.s32 %r230, %r210, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r231, %r229, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r232, %r229, %r231; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r233, %r230, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r234, %r230, %r233; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p83, %r214, %r220; + setp.lt.s32 %p84, %r216, %r222; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p85, %r214, %r220; + setp.eq.b32 %p86, %r216, %r222; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p87, %r226, %r232; + setp.gt.s32 %p88, %r228, %r234; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p89, %p85, %p87; + and.pred %p90, %p86, %p88; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p91, %p83, %p89; + or.pred %p92, %p84, %p90; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r235, 1, 0, %p91; + selp.b32 %r236, 1, 0, %p92; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p93, %r131, %r235; + setp.eq.b32 %p94, %r131, %r236; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r237, %r214, %r220; + xor.b32 %r238, %r216, %r222; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r239, 0, %r237, %p93; + selp.b32 %r240, 0, %r238, %p94; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r241, %r239, %r203; + xor.b32 %r242, %r240, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r243, %r226, %r232; + xor.b32 %r244, %r228, %r234; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r245, 0, %r243, %p93; + selp.b32 %r246, 0, %r244, %p94; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r247, %r245, %r209; + xor.b32 %r248, %r246, %r210; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r22, %r241, %r134; + mul.lo.s32 %r24, %r242, %r134; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r249, %r3, 1; + shl.b32 %r250, %r1, 1; + and.b32 %r251, %r250, 48; + or.b32 %r252, %r251, %r6; + shl.b32 %r253, %r252, 3; + mov.b32 %r254, global_smem; + add.s32 %r255, %r254, %r253; + shl.b32 %r256, %r249, 2; + add.s32 %r21, %r255, %r256; + mov.pred %p5, -1; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r22; + // end inline asm + add.s32 %r23, %r21, 8; + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r24; + // end inline asm + bar.sync 0; + setp.lt.u32 %p7, %r1, 128; + shl.b32 %r257, %r1, 2; + add.s32 %r26, %r254, %r257; + // begin inline asm + @%p7 ld.shared.b32 %r25, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r258, %r25, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r28, %r258, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.b32 %r259, %r1, 897; + setp.eq.b32 %p8, %r259, 0; + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r28; + // end inline asm + add.s32 %r30, %r26, 256; + // begin inline asm + @%p7 ld.shared.b32 %r29, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r260, %r29, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r32, %r260, %r29; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r32; + // end inline asm + bar.sync 0; + ld.shared.b32 %r261, [%r255]; + ld.shared.b32 %r262, [%r255+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r34, %r241, %r131; + mul.lo.s32 %r36, %r242, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r34; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r36; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r37, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r263, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r40, %r263, %r37; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r40; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r41, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r264, %r41, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r44, %r264, %r41; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r44; + // end inline asm + bar.sync 0; + ld.shared.b32 %r265, [%r255]; + ld.shared.b32 %r266, [%r255+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r46, %r247, %r134; + mul.lo.s32 %r48, %r248, %r134; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r46; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r48; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r49, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r267, %r49, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r52, %r267, %r49; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r52; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r53, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r268, %r53, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r56, %r268, %r53; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r56; + // end inline asm + bar.sync 0; + ld.shared.b32 %r269, [%r255]; + ld.shared.b32 %r270, [%r255+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r58, %r247, %r131; + mul.lo.s32 %r60, %r248, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r58; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r60; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r61, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r271, %r61, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r64, %r271, %r61; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r64; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r65, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r272, %r65, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r68, %r272, %r65; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r68; + // end inline asm + bar.sync 0; + ld.shared.b32 %r273, [%r255]; + ld.shared.b32 %r274, [%r255+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p95, %r261, %r265; + setp.ge.s32 %p96, %r262, %r266; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p97, %r261, %r265; + setp.ne.b32 %p98, %r262, %r266; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p99, %r269, %r273; + setp.le.s32 %p100, %r270, %r274; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p101, %p97, %p99; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p102, %p98, %p100; + or.pred %p103, %p95, %p101; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p104, %p96, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r275, %r265, %r261; + xor.b32 %r276, %r266, %r262; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r277, %r275, 0, %p103; + selp.b32 %r278, %r276, 0, %p104; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r279, %r277, %r241; + xor.b32 %r280, %r278, %r242; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r281, %r273, %r269; + xor.b32 %r282, %r274, %r270; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r283, %r281, 0, %p103; + selp.b32 %r284, %r282, 0, %p104; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r285, %r283, %r247; + xor.b32 %r286, %r284, %r248; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r287, %r279, %r133; + mul.lo.s32 %r288, %r280, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r289, %r287, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r290, %r287, %r289; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r291, %r288, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r292, %r288, %r291; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r293, %r279, %r126; + mul.lo.s32 %r294, %r280, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r295, %r293, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r296, %r293, %r295; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r297, %r294, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r298, %r294, %r297; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r299, %r285, %r133; + mul.lo.s32 %r300, %r286, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r301, %r299, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r302, %r299, %r301; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r303, %r300, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r304, %r300, %r303; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r305, %r285, %r126; + mul.lo.s32 %r306, %r286, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r307, %r305, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r308, %r307, %r305; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r309, %r306, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r310, %r309, %r306; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p105, %r290, %r296; + setp.ge.s32 %p106, %r292, %r298; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p107, %r290, %r296; + setp.ne.b32 %p108, %r292, %r298; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p109, %r302, %r308; + setp.le.s32 %p110, %r304, %r310; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p111, %p107, %p109; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p112, %p108, %p110; + or.pred %p113, %p105, %p111; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p114, %p106, %p112; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r311, %r290, %r296; + xor.b32 %r312, %r292, %r298; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r313, %r311, 0, %p113; + selp.b32 %r314, %r312, 0, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r315, %r313, %r279; + xor.b32 %r316, %r314, %r280; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r317, %r308, %r302; + xor.b32 %r318, %r310, %r304; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r319, %r317, 0, %p113; + selp.b32 %r320, %r318, 0, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r321, %r319, %r285; + xor.b32 %r322, %r320, %r286; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r323, %r315, %r132; + mul.lo.s32 %r324, %r316, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r325, %r323, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r326, %r323, %r325; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r327, %r324, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r328, %r324, %r327; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r329, %r315, %r129; + mul.lo.s32 %r330, %r316, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r331, %r329, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r332, %r329, %r331; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r333, %r330, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r334, %r330, %r333; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r335, %r321, %r132; + mul.lo.s32 %r336, %r322, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r337, %r335, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r338, %r335, %r337; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r339, %r336, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r340, %r336, %r339; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r341, %r321, %r129; + mul.lo.s32 %r342, %r322, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r343, %r341, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r344, %r343, %r341; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r345, %r342, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r346, %r345, %r342; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p115, %r326, %r332; + setp.ge.s32 %p116, %r328, %r334; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p117, %r326, %r332; + setp.ne.b32 %p118, %r328, %r334; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p119, %r338, %r344; + setp.le.s32 %p120, %r340, %r346; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p121, %p117, %p119; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p122, %p118, %p120; + or.pred %p123, %p115, %p121; + .loc 2 599 19 // triton_helpers.py:599:19 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p124, %p116, %p122; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r347, %r326, %r332; + xor.b32 %r348, %r328, %r334; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r349, %r347, 0, %p123; + selp.b32 %r350, %r348, 0, %p124; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r351, %r349, %r315; + xor.b32 %r352, %r350, %r316; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r353, %r344, %r338; + xor.b32 %r354, %r346, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r355, %r353, 0, %p123; + selp.b32 %r356, %r354, 0, %p124; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r357, %r355, %r321; + xor.b32 %r358, %r356, %r322; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p125, %r351, %r352; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p126, %r351, %r352; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p127, %r357, %r358; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p128, %p126, %p127; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p129, %p125, %p128; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r359, %r352, %r351; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r360, %r359, 0, %p129; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r361, %r360, %r351; + xor.b32 %r362, %r360, %r352; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r363, %r358, %r357; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r364, %r363, 0, %p129; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r365, %r364, %r357; + xor.b32 %r366, %r364, %r358; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r70, %r361, %r134; + mul.lo.s32 %r72, %r362, %r134; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r70; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r72; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r73, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r367, %r73, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r76, %r367, %r73; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r76; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r77, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r368, %r77, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r80, %r368, %r77; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r80; + // end inline asm + bar.sync 0; + ld.shared.b32 %r369, [%r255]; + ld.shared.b32 %r370, [%r255+8]; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r82, %r361, %r131; + mul.lo.s32 %r84, %r362, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r82; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r84; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r85, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r371, %r85, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r88, %r371, %r85; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r88; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r89, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r372, %r89, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r92, %r372, %r89; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r92; + // end inline asm + bar.sync 0; + ld.shared.b32 %r373, [%r255]; + ld.shared.b32 %r374, [%r255+8]; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r94, %r365, %r134; + mul.lo.s32 %r96, %r366, %r134; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r94; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r96; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r97, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r375, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r100, %r375, %r97; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r100; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r101, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r376, %r101, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r104, %r376, %r101; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r104; + // end inline asm + bar.sync 0; + ld.shared.b32 %r377, [%r255]; + ld.shared.b32 %r378, [%r255+8]; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r106, %r365, %r131; + mul.lo.s32 %r108, %r366, %r131; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + bar.sync 0; + // begin inline asm + @%p5 st.shared.b32 [ %r21 + 0 ], %r106; + // end inline asm + // begin inline asm + @%p5 st.shared.b32 [ %r23 + 0 ], %r108; + // end inline asm + bar.sync 0; + // begin inline asm + @%p7 ld.shared.b32 %r109, [ %r26 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r379, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r112, %r379, %r109; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r26 + 0 ], %r112; + // end inline asm + // begin inline asm + @%p7 ld.shared.b32 %r113, [ %r30 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r380, %r113, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r116, %r380, %r113; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + // begin inline asm + @%p8 st.shared.b32 [ %r30 + 0 ], %r116; + // end inline asm + bar.sync 0; + ld.shared.b32 %r381, [%r255]; + ld.shared.b32 %r382, [%r255+8]; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p130, %r369, %r373; + setp.lt.s32 %p131, %r370, %r374; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p132, %r369, %r373; + setp.eq.b32 %p133, %r370, %r374; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p134, %r377, %r381; + setp.gt.s32 %p135, %r378, %r382; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p136, %p132, %p134; + and.pred %p137, %p133, %p135; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p138, %p130, %p136; + or.pred %p139, %p131, %p137; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r383, %r373, %r369; + xor.b32 %r384, %r374, %r370; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r385, %r383, 0, %p138; + selp.b32 %r386, %r384, 0, %p139; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r387, %r385, %r361; + xor.b32 %r388, %r386, %r362; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r389, %r381, %r377; + xor.b32 %r390, %r382, %r378; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r391, %r389, 0, %p138; + selp.b32 %r392, %r390, 0, %p139; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r393, %r391, %r365; + xor.b32 %r394, %r392, %r366; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r395, %r387, %r133; + mul.lo.s32 %r396, %r388, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r397, %r395, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r398, %r395, %r397; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r399, %r396, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r400, %r396, %r399; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r401, %r387, %r126; + mul.lo.s32 %r402, %r388, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r403, %r401, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r404, %r401, %r403; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r405, %r402, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r406, %r402, %r405; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r407, %r393, %r133; + mul.lo.s32 %r408, %r394, %r133; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r409, %r407, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r410, %r407, %r409; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r411, %r408, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r412, %r408, %r411; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r413, %r393, %r126; + mul.lo.s32 %r414, %r394, %r126; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r415, %r413, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r416, %r415, %r413; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r417, %r414, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r418, %r417, %r414; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p140, %r398, %r404; + setp.lt.s32 %p141, %r400, %r406; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p142, %r398, %r404; + setp.eq.b32 %p143, %r400, %r406; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p144, %r410, %r416; + setp.gt.s32 %p145, %r412, %r418; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + and.pred %p146, %p142, %p144; + and.pred %p147, %p143, %p145; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + or.pred %p148, %p140, %p146; + or.pred %p149, %p141, %p147; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r419, %r398, %r404; + xor.b32 %r420, %r400, %r406; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r421, %r419, 0, %p148; + selp.b32 %r422, %r420, 0, %p149; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r423, %r421, %r387; + xor.b32 %r424, %r422, %r388; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r425, %r416, %r410; + xor.b32 %r426, %r418, %r412; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r427, %r425, 0, %p148; + selp.b32 %r428, %r426, 0, %p149; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r429, %r427, %r393; + xor.b32 %r430, %r428, %r394; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r431, %r423, %r132; + mul.lo.s32 %r432, %r424, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r433, %r431, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r434, %r431, %r433; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r435, %r432, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r436, %r432, %r435; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r437, %r423, %r129; + mul.lo.s32 %r438, %r424, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r439, %r437, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r440, %r437, %r439; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r441, %r438, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r442, %r438, %r441; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r443, %r429, %r132; + mul.lo.s32 %r444, %r430, %r132; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r445, %r443, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r446, %r443, %r445; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r447, %r444, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r448, %r444, %r447; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + mul.lo.s32 %r449, %r429, %r129; + mul.lo.s32 %r450, %r430, %r129; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r451, %r449, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r452, %r451, %r449; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + shfl.sync.bfly.b32 %r453, %r450, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + add.s32 %r454, %r453, %r450; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.lt.s32 %p150, %r434, %r440; + setp.lt.s32 %p151, %r436, %r442; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.eq.b32 %p152, %r434, %r440; + setp.eq.b32 %p153, %r436, %r442; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + setp.gt.s32 %p154, %r446, %r452; + setp.gt.s32 %p155, %r448, %r454; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r455, %r452, %r446; + xor.b32 %r456, %r454, %r448; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + selp.b32 %r457, %r455, 0, %p154; + selp.b32 %r458, %r457, 0, %p152; + selp.b32 %r459, %r455, %r458, %p150; + selp.b32 %r460, %r456, 0, %p155; + selp.b32 %r461, %r460, 0, %p153; + selp.b32 %r462, %r456, %r461, %p151; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:40:67 ] + xor.b32 %r463, %r459, %r429; + xor.b32 %r464, %r462, %r430; +$L__tmp2: + .loc 1 43 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:43:34 + selp.b32 %r465, %r19, 0, %p3; + cvt.s64.s32 %rd50, %r465; + selp.b32 %r466, %r20, 0, %p3; + cvt.s64.s32 %rd51, %r466; +$L__tmp3: + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd52, %rd51, %rd50; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r467}, %rd52; + cvt.u32.u64 %r468, %rd52; + shfl.sync.bfly.b32 %r469, %r468, 16, 31, -1; + shfl.sync.bfly.b32 %r470, %r467, 16, 31, -1; + cvt.u64.u32 %rd53, %r469; + cvt.u64.u32 %rd54, %r470; + shl.b64 %rd55, %rd54, 32; + or.b64 %rd56, %rd53, %rd55; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd57, %rd52, %rd56; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + mov.b64 {_, %r471}, %rd57; + cvt.u32.u64 %r472, %rd57; + shfl.sync.bfly.b32 %r473, %r472, 8, 31, -1; + shfl.sync.bfly.b32 %r474, %r471, 8, 31, -1; + cvt.u64.u32 %rd58, %r473; + cvt.u64.u32 %rd59, %r474; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd27, %rd57, %rd61; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + setp.eq.b32 %p53, %r2, 0; + shl.b32 %r475, %r6, 3; + add.s32 %r476, %r254, %r475; + shl.b32 %r477, %r249, 3; + add.s32 %r117, %r476, %r477; + // begin inline asm + @%p53 st.shared.b64 [ %r117 + 0 ], %rd27; + // end inline asm + bar.sync 0; + setp.lt.u32 %p54, %r1, 16; + shl.b32 %r478, %r1, 3; + add.s32 %r118, %r254, %r478; + // begin inline asm + @%p54 ld.shared.b64 %rd28, [ %r118 + 0 ]; + // end inline asm + mov.b64 {_, %r479}, %rd28; + cvt.u32.u64 %r480, %rd28; + shfl.sync.bfly.b32 %r481, %r480, 1, 31, -1; + shfl.sync.bfly.b32 %r482, %r479, 1, 31, -1; + cvt.u64.u32 %rd62, %r481; + cvt.u64.u32 %rd63, %r482; + shl.b64 %rd64, %rd63, 32; + or.b64 %rd65, %rd62, %rd64; + .loc 3 261 15 // standard.py:261:15 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + add.s64 %rd29, %rd28, %rd65; + .loc 3 291 36 // standard.py:291:36 @[ cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:44:26 ] + and.b32 %r483, %r1, 1009; + setp.eq.b32 %p55, %r483, 0; + // begin inline asm + @%p55 st.shared.b64 [ %r118 + 0 ], %rd29; + // end inline asm + bar.sync 0; + ld.shared.b32 %r122, [%r476]; +$L__tmp4: + .loc 1 48 62 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:62 + setp.lt.s64 %p156, %rd14, 2; + .loc 1 48 88 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:88 + setp.gt.s64 %p157, %rd14, 1; + .loc 1 48 79 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:79 + selp.b64 %rd66, %rd14, 0, %p157; + .loc 1 48 0 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48 + selp.b64 %rd67, 1, 0, %p156; + .loc 1 48 70 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:70 + add.s64 %rd68, %rd66, %rd67; + .loc 1 48 47 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:47 + mul.lo.s64 %rd69, %rd68, %rd78; + .loc 1 48 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:25 + shl.b64 %rd70, %rd33, 6; + add.s64 %rd71, %rd12, %rd70; + mad.wide.u32 %rd72, %r6, 4, %rd71; + shl.b64 %rd73, %rd69, 6; + add.s64 %rd30, %rd72, %rd73; + .loc 1 48 102 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:48:102 + bar.sync 0; + and.b32 %r484, %r1, 3; + shl.b32 %r485, %r484, 3; + and.b32 %r486, %r250, 96; + bfe.s32 %r487, %r1, 2, 1; + and.b32 %r488, %r487, 192; + and.b32 %r489, %r127, 260; + or.b32 %r490, %r485, %r486; + xor.b32 %r491, %r490, %r488; + or.b32 %r492, %r491, %r489; + add.s32 %r493, %r254, %r492; + st.shared.b32 [%r493], %r463; + xor.b32 %r494, %r492, 4; + add.s32 %r495, %r254, %r494; + st.shared.b32 [%r495], %r464; + bar.sync 0; + shl.b32 %r496, %r484, 5; + and.b32 %r497, %r1, 28; + and.b32 %r498, %r130, 192; + or.b32 %r499, %r496, %r497; + xor.b32 %r500, %r499, %r498; + add.s32 %r501, %r254, %r500; + ld.shared.b32 %r120, [%r501]; + xor.b32 %r502, %r500, 4; + add.s32 %r503, %r254, %r502; + ld.shared.b32 %r121, [%r503+256]; + // begin inline asm + @%p56 st.global.v2.b32 [ %rd30 + 0 ], { %r120, %r121 }; + // end inline asm + .loc 1 49 34 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:34 + mul.lo.s64 %rd74, %rd77, %rd68; + .loc 1 49 25 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:25 + add.s64 %rd75, %rd13, %rd39; + shl.b64 %rd76, %rd74, 2; + add.s64 %rd31, %rd75, %rd76; + .loc 1 49 89 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:89 + and.b32 %r504, %r1, 56; + setp.eq.b32 %p158, %r504, 0; + and.pred %p57, %p158, %p3; + // begin inline asm + @%p57 st.global.b32 [ %rd31 + 0 ], { %r122 }; + // end inline asm + .loc 1 49 4 // cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py:49:4 + ret; +$L__tmp5: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 267 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x104 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 119 +.b8 101 +.b8 53 +.b8 52 +.b8 113 +.b8 100 +.b8 122 +.b8 117 +.b8 100 +.b8 54 +.b8 120 +.b8 115 +.b8 107 +.b8 104 +.b8 118 +.b8 106 +.b8 115 +.b8 117 +.b8 98 +.b8 122 +.b8 113 +.b8 98 +.b8 118 +.b8 105 +.b8 111 +.b8 98 +.b8 116 +.b8 111 +.b8 102 +.b8 105 +.b8 55 +.b8 114 +.b8 116 +.b8 108 +.b8 97 +.b8 51 +.b8 99 +.b8 122 +.b8 51 +.b8 102 +.b8 118 +.b8 117 +.b8 102 +.b8 109 +.b8 101 +.b8 107 +.b8 102 +.b8 121 +.b8 52 +.b8 113 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 119 +.b8 101 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x3d DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 51 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc8:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xdd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 40 // DW_AT_call_line +.b8 67 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xf5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source new file mode 100644 index 0000000000000000000000000000000000000000..7a3d005fab448bfb3bc3a0f10082c9ed8ed99d51 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.source @@ -0,0 +1,1289 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc106 = loc(unknown) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc158 = loc("in_ptr0"(#loc)) +#loc159 = loc("out_ptr2"(#loc)) +#loc160 = loc("out_ptr3"(#loc)) +#loc161 = loc("ks0"(#loc)) +#loc162 = loc("xnumel"(#loc)) +#loc163 = loc("r0_numel"(#loc)) +#loc196 = loc("x"(#loc56)) +#loc197 = loc("idxs"(#loc56)) +#loc198 = loc("x"(#loc60)) +#loc199 = loc("idxs"(#loc60)) +#loc204 = loc("x"(#loc68)) +#loc205 = loc("idxs"(#loc68)) +#loc206 = loc("flip"(#loc68)) +#loc262 = loc("input"(#loc131)) +#loc263 = loc("a"(#loc135)) +#loc264 = loc("b"(#loc135)) +#loc266 = loc("x"(#loc140)) +#loc267 = loc("x"(#loc144)) +#loc268 = loc("input"(#loc153)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %r0_numel_0 = arith.constant 16 : i32 loc(#loc164) + %xoffset = tt.get_program_id x : i32 loc(#loc165) + %xoffset_1 = arith.constant 8 : i32 loc(#loc166) + %xoffset_2 = arith.constant 8 : i32 loc(#loc166) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc166) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc167) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc168) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<8x1xi32> loc(#loc169) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<8x1xi32> loc(#loc169) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc170) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<8x1xi32> loc(#loc170) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc171) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc172) + %r0_offset = arith.constant 0 : i32 loc(#loc173) + %r0_mask = arith.constant true loc(#loc174) + %r0_mask_9 = arith.constant dense : tensor<8x16xi1> loc(#loc174) + %x0 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc175) + %x0_10 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc175) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<8x1xi64> loc(#loc175) + %x1 = arith.extsi %xindex_6 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc176) + %x1_12 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc176) + %x1_13 = arith.divsi %x1, %x1_12 : tensor<8x1xi64> loc(#loc176) + %tmp0 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc177) + %tmp0_14 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc177) + %tmp0_15 = tt.broadcast %x0_11 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc177) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x16xi64> loc(#loc177) + %tmp0_17 = arith.constant 16 : i32 loc(#loc178) + %tmp0_18 = arith.constant 16 : i64 loc(#loc178) + %tmp0_19 = arith.constant dense<16> : tensor<8x1xi64> loc(#loc178) + %tmp0_20 = arith.muli %tmp0_19, %x1_13 : tensor<8x1xi64> loc(#loc178) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc179) + %tmp0_22 = arith.addi %tmp0_16, %tmp0_21 : tensor<8x16xi64> loc(#loc179) + %tmp0_23 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc180) + %tmp0_24 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc180) + %tmp0_25 = arith.muli %tmp0_24, %tmp0_23 : tensor<1x16xi64> loc(#loc180) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc181) + %tmp0_27 = arith.addi %tmp0_22, %tmp0_26 : tensor<8x16xi64> loc(#loc181) + %tmp0_28 = arith.constant 16 : i32 loc(#loc182) + %tmp0_29 = arith.constant 16 : i64 loc(#loc182) + %tmp0_30 = arith.muli %tmp0_29, %ks0 : i64 loc(#loc182) + %tmp0_31 = tt.splat %tmp0_30 : i64 -> tensor<8x1xi64> loc(#loc183) + %tmp0_32 = arith.muli %tmp0_31, %x1_13 : tensor<8x1xi64> loc(#loc183) + %tmp0_33 = tt.broadcast %tmp0_32 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc184) + %tmp0_34 = arith.addi %tmp0_27, %tmp0_33 : tensor<8x16xi64> loc(#loc184) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc185) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc185) + %tmp0_37 = arith.constant 0.000000e+00 : f32 loc(#loc186) + %tmp0_38 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc186) + %tmp0_39 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc186) + %tmp0_40 = arith.fptosi %tmp0_39 : tensor<8x16xf32> to tensor<8x16xi32> loc(#loc186) + %tmp0_41 = tt.load %tmp0_36, %tmp0_38, %tmp0_40 evictionPolicy = evict_last : tensor<8x16x!tt.ptr> loc(#loc186) + %tmp2 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc187) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc188) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp0_41, %tmp4) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc26) + %tmp7 = arith.extsi %tmp0_41 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc189) + %tmp10 = arith.constant 0 : i32 loc(#loc190) + %tmp10_42 = arith.constant 0 : i64 loc(#loc190) + %tmp10_43 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc190) + %tmp10_44 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc190) + %tmp10_45 = arith.select %tmp10_44, %tmp7, %tmp10_43 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc190) + %tmp11 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp10_45) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc191) + %tmp11_46 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc192) + %tmp12 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc193) + %tmp13 = arith.trunci %tmp12 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc194) + %tmp14 = arith.trunci %tmp11_46 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc195) + %c16_i32 = arith.constant 16 : i32 loc(#loc34) + %c16_i64 = arith.constant 16 : i64 loc(#loc34) + %cst = arith.constant dense<16> : tensor<8x1xi64> loc(#loc34) + %1 = arith.muli %cst, %x0_11 : tensor<8x1xi64> loc(#loc34) + %2 = arith.extsi %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc35) + %3 = tt.broadcast %2 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc35) + %4 = tt.broadcast %1 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc35) + %5 = arith.addi %3, %4 : tensor<8x16xi64> loc(#loc35) + %c16_i32_47 = arith.constant 16 : i32 loc(#loc36) + %c16_i64_48 = arith.constant 16 : i64 loc(#loc36) + %cst_49 = arith.constant dense<16> : tensor<8x1xi64> loc(#loc36) + %6 = arith.muli %cst_49, %x1_13 : tensor<8x1xi64> loc(#loc36) + %c1_i32 = arith.constant 1 : i32 loc(#loc37) + %7 = arith.extsi %c1_i32 : i32 to i64 loc(#loc37) + %8 = arith.cmpi sge, %7, %ks0 : i64 loc(#loc37) + %c1_i32_50 = arith.constant 1 : i32 loc(#loc38) + %c1_i32_51 = arith.constant 1 : i32 loc(#loc38) + %9 = arith.extui %8 : i1 to i32 loc(#loc38) + %10 = arith.muli %c1_i32_51, %9 : i32 loc(#loc38) + %c1_i32_52 = arith.constant 1 : i32 loc(#loc39) + %11 = arith.extsi %c1_i32_52 : i32 to i64 loc(#loc39) + %12 = arith.cmpi sgt, %ks0, %11 : i64 loc(#loc39) + %13 = arith.extui %12 : i1 to i64 loc(#loc40) + %14 = arith.muli %ks0, %13 : i64 loc(#loc40) + %15 = arith.extsi %10 : i32 to i64 loc(#loc41) + %16 = arith.addi %15, %14 : i64 loc(#loc41) + %17 = tt.splat %16 : i64 -> tensor<8x1xi64> loc(#loc42) + %18 = arith.muli %6, %17 : tensor<8x1xi64> loc(#loc42) + %19 = tt.broadcast %18 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc43) + %20 = arith.addi %5, %19 : tensor<8x16xi64> loc(#loc43) + %21 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc44) + %22 = tt.addptr %21, %20 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc44) + %23 = tt.broadcast %xmask_7 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc45) + tt.store %22, %tmp13, %23 : tensor<8x16x!tt.ptr> loc(#loc45) + %c1_i32_53 = arith.constant 1 : i32 loc(#loc46) + %24 = arith.extsi %c1_i32_53 : i32 to i64 loc(#loc46) + %25 = arith.cmpi sge, %24, %ks0 : i64 loc(#loc46) + %c1_i32_54 = arith.constant 1 : i32 loc(#loc47) + %c1_i32_55 = arith.constant 1 : i32 loc(#loc47) + %26 = arith.extui %25 : i1 to i32 loc(#loc47) + %27 = arith.muli %c1_i32_55, %26 : i32 loc(#loc47) + %c1_i32_56 = arith.constant 1 : i32 loc(#loc48) + %28 = arith.extsi %c1_i32_56 : i32 to i64 loc(#loc48) + %29 = arith.cmpi sgt, %ks0, %28 : i64 loc(#loc48) + %30 = arith.extui %29 : i1 to i64 loc(#loc49) + %31 = arith.muli %ks0, %30 : i64 loc(#loc49) + %32 = arith.extsi %27 : i32 to i64 loc(#loc50) + %33 = arith.addi %32, %31 : i64 loc(#loc50) + %34 = tt.splat %33 : i64 -> tensor<8x1xi64> loc(#loc51) + %35 = arith.muli %x1_13, %34 : tensor<8x1xi64> loc(#loc51) + %36 = arith.addi %x0_11, %35 : tensor<8x1xi64> loc(#loc52) + %37 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc53) + %38 = tt.addptr %37, %36 : tensor<8x1x!tt.ptr>, tensor<8x1xi64> loc(#loc53) + tt.store %38, %tmp14, %xmask_7 : tensor<8x1x!tt.ptr> loc(#loc54) + tt.return loc(#loc55) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc56)), %idxs: tensor<8x16xi16> loc("idxs"(#loc56))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc57) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc57) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc57) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc57) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc58) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc59) + %5 = ub.poison : tensor<8x16xi32> loc(#loc59) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc59) + } loc(#loc56) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc60)), %idxs: tensor<8x16xi16> loc("idxs"(#loc60))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc66) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc67) + %2 = ub.poison : tensor<8x16xi32> loc(#loc67) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi16> loc("idxs"(#loc68)), %flip: tensor<8x16xi32> loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc221) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc222) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc223) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc223) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc224) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc225) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc226) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc227) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc228) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc228) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc229) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc230) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc231) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc239) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc239) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc240) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc245) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc246) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc247) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc248) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc249) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc250) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc251) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc252) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc252) + %cond_38 = arith.constant 0 : i32 loc(#loc253) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc253) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc258) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc259) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc260) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc261) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc131))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc132) + tt.return %0 : tensor<64x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc134) + tt.return %1 : tensor<64x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc135)), %b: i32 loc("b"(#loc135))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc136) + tt.return %0 : i32 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc138) + tt.return %1 : i32 loc(#loc138) + } loc(#loc135) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc131))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc265) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc132) + tt.return %0 : tensor<64x1xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc134) + tt.return %1 : tensor<64x1xi32> loc(#loc134) + } loc(#loc131) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc140))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc141) + %false = arith.constant false loc(#loc142) + tt.return %false : i1 loc(#loc142) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc143) + tt.return %1 : i1 loc(#loc143) + } loc(#loc140) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc144))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc145) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc146) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc146) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc146) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc146) + tt.return %4 : tensor<8x16xi32> loc(#loc147) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc148) + tt.return %5 : tensor<8x16xi32> loc(#loc148) + } loc(#loc144) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc150) + %cst = arith.constant dense : tensor<1xi1> loc(#loc150) + tt.return %cst : tensor<1xi1> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc152) + tt.return %0 : tensor<1xi1> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc153))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc154) + tt.return %0 : tensor<8x16xi32> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc156) + tt.return %1 : tensor<8x16xi32> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc150) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc150) + tt.return %cst : tensor<8x16xi32> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc152) + tt.return %0 : tensor<8x16xi32> loc(#loc152) + } loc(#loc149) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc153))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc154) + tt.return %0 : tensor<8x16xi16> loc(#loc155) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc156) + tt.return %1 : tensor<8x16xi16> loc(#loc156) + } loc(#loc153) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc150) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc150) + tt.return %cst : tensor<8x16xi16> loc(#loc151) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc152) + tt.return %0 : tensor<8x16xi16> loc(#loc152) + } loc(#loc149) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc60)), %idxs: tensor<8x16xi32> loc("idxs"(#loc60))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc66) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc67) + %3 = ub.poison : tensor<8x16xi32> loc(#loc67) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: tensor<8x16xi32> loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc131))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc132) + tt.return %0 : tensor<32x2xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc134) + tt.return %1 : tensor<32x2xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: tensor<8x16xi32> loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc60)), %idxs: tensor<8x16xi32> loc("idxs"(#loc60))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc200) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc201) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc201) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc202) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc203) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc66) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc67) + %4 = ub.poison : tensor<8x16xi32> loc(#loc67) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: tensor<8x16xi32> loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc239) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc239) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc240) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc252) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc252) + %cond_36 = arith.constant 0 : i32 loc(#loc253) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc253) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc253) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc131))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc132) + tt.return %0 : tensor<16x4xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc134) + tt.return %1 : tensor<16x4xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc60)), %idxs: tensor<8x16xi32> loc("idxs"(#loc60))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc269) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc65) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc66) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc67) + %5 = ub.poison : tensor<8x16xi32> loc(#loc67) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc67) + } loc(#loc60) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc131))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc132) + tt.reduce.return %2 : i32 loc(#loc132) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc132) + tt.return %0 : tensor<8x8xi32> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc134) + tt.return %1 : tensor<8x8xi32> loc(#loc134) + } loc(#loc131) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc68)), %idxs: tensor<8x16xi32> loc("idxs"(#loc68)), %flip: i1 loc("flip"(#loc68))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc207) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc208) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc209) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc209) + %left_mask = arith.constant 1 : i32 loc(#loc210) + %left_mask_2 = arith.constant 1 : i32 loc(#loc210) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc210) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc210) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc211) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc211) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc212) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc213) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc214) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc215) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc215) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc216) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc217) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc218) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc219) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc220) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc221) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc223) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc223) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc224) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc225) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc226) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc228) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc228) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc229) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc230) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc231) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc232) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc233) + %left_valid_mask = arith.constant true loc(#loc234) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc234) + %right_valid_mask = arith.constant true loc(#loc235) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc235) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc236) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc237) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc270) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc101) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc239) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc239) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc239) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc240) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc271) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc271) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc106) + } loc(#loc102) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc272) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc108) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc243) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc273) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc273) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc106) + } loc(#loc109) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc245) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc246) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc247) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc248) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc249) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc250) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc251) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc252) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc252) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc254) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc255) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc257) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc258) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc259) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc260) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc261) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc129) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc130) + %5 = ub.poison : tensor<8x16xi32> loc(#loc130) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc130) + } loc(#loc68) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc131))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc132) + tt.reduce.return %2 : i64 loc(#loc132) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc132) + tt.return %0 : tensor<8xi64> loc(#loc133) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc134) + tt.return %1 : tensor<8xi64> loc(#loc134) + } loc(#loc131) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc135)), %b: i64 loc("b"(#loc135))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc136) + tt.return %0 : i64 loc(#loc137) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc138) + tt.return %1 : i64 loc(#loc138) + } loc(#loc135) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":19:15) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":27:16) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":28:48) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":45:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":46:21) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:43) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:49) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:41) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:75) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:66) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:57) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc164 = loc("r0_numel"(#loc1)) +#loc165 = loc("xoffset"(#loc2)) +#loc166 = loc("xoffset"(#loc3)) +#loc167 = loc("xindex"(#loc4)) +#loc168 = loc("xindex"(#loc5)) +#loc169 = loc("xindex"(#loc6)) +#loc170 = loc("xmask"(#loc7)) +#loc171 = loc("r0_index"(#loc8)) +#loc172 = loc("r0_index"(#loc9)) +#loc173 = loc("r0_offset"(#loc10)) +#loc174 = loc("r0_mask"(#loc11)) +#loc175 = loc("x0"(#loc12)) +#loc176 = loc("x1"(#loc13)) +#loc177 = loc("tmp0"(#loc14)) +#loc178 = loc("tmp0"(#loc15)) +#loc179 = loc("tmp0"(#loc16)) +#loc180 = loc("tmp0"(#loc17)) +#loc181 = loc("tmp0"(#loc18)) +#loc182 = loc("tmp0"(#loc19)) +#loc183 = loc("tmp0"(#loc20)) +#loc184 = loc("tmp0"(#loc21)) +#loc185 = loc("tmp0"(#loc22)) +#loc186 = loc("tmp0"(#loc23)) +#loc187 = loc("tmp2"(#loc24)) +#loc188 = loc("tmp4"(#loc25)) +#loc189 = loc("tmp7"(#loc27)) +#loc190 = loc("tmp10"(#loc28)) +#loc191 = loc("tmp11"(#loc29)) +#loc192 = loc("tmp11"(#loc30)) +#loc193 = loc("tmp12"(#loc31)) +#loc194 = loc("tmp13"(#loc32)) +#loc195 = loc("tmp14"(#loc33)) +#loc200 = loc("flip"(#loc61)) +#loc201 = loc("flip"(#loc62)) +#loc202 = loc("flip"(#loc63)) +#loc203 = loc("flip"(#loc64)) +#loc207 = loc("y"(#loc69)) +#loc208 = loc("right_mask"(#loc70)) +#loc209 = loc("right_mask"(#loc71)) +#loc210 = loc("left_mask"(#loc72)) +#loc211 = loc("ileft"(#loc73)) +#loc212 = loc("ileft"(#loc74)) +#loc213 = loc("ileft"(#loc75)) +#loc214 = loc("ileft"(#loc76)) +#loc215 = loc("iright"(#loc77)) +#loc216 = loc("iright"(#loc78)) +#loc217 = loc("iright"(#loc79)) +#loc218 = loc("iright"(#loc80)) +#loc219 = loc("ileft"(#loc81)) +#loc220 = loc("iright"(#loc82)) +#loc221 = loc("y_idx"(#loc83)) +#loc222 = loc("left_idx"(#loc84)) +#loc223 = loc("left_idx"(#loc85)) +#loc224 = loc("left_idx"(#loc86)) +#loc225 = loc("left_idx"(#loc87)) +#loc226 = loc("left_idx"(#loc88)) +#loc227 = loc("right_idx"(#loc89)) +#loc228 = loc("right_idx"(#loc90)) +#loc229 = loc("right_idx"(#loc91)) +#loc230 = loc("right_idx"(#loc92)) +#loc231 = loc("right_idx"(#loc93)) +#loc232 = loc("left_idx"(#loc94)) +#loc233 = loc("right_idx"(#loc95)) +#loc234 = loc("left_valid_mask"(#loc96)) +#loc235 = loc("right_valid_mask"(#loc97)) +#loc236 = loc("left_isnan"(#loc98)) +#loc237 = loc("right_isnan"(#loc99)) +#loc238 = loc("cond"(#loc100)) +#loc239 = loc("cond"(#loc103)) +#loc240 = loc("cond"(#loc104)) +#loc241 = loc("cond"(#loc105)) +#loc242 = loc("eq"(#loc107)) +#loc243 = loc("eq"(#loc110)) +#loc244 = loc("eq"(#loc111)) +#loc245 = loc("cond"(#loc112)) +#loc246 = loc("cond"(#loc113)) +#loc247 = loc("cond"(#loc114)) +#loc248 = loc("cond"(#loc115)) +#loc249 = loc("cond"(#loc116)) +#loc250 = loc("cond"(#loc117)) +#loc251 = loc("cond"(#loc118)) +#loc252 = loc("cond"(#loc119)) +#loc253 = loc("cond"(#loc120)) +#loc254 = loc("ret"(#loc121)) +#loc255 = loc("ret"(#loc122)) +#loc256 = loc("ret"(#loc123)) +#loc257 = loc("ret"(#loc124)) +#loc258 = loc("new_idxs"(#loc125)) +#loc259 = loc("new_idxs"(#loc126)) +#loc260 = loc("new_idxs"(#loc127)) +#loc261 = loc("new_idxs"(#loc128)) +#loc265 = loc("input"(#loc139)) +#loc269 = loc("flip"(#loc157)) +#loc270 = loc("cond"(#loc238)) +#loc271 = loc("cond"(#loc241)) +#loc272 = loc("eq"(#loc242)) +#loc273 = loc("eq"(#loc244)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5c700908e9f4440dfd27b1852d2325e065542312 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttgir @@ -0,0 +1,890 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}> +#linear = #ttg.linear<{register = [[0, 8]], lane = [[1, 0], [2, 0], [4, 0], [0, 1], [0, 2]], warp = [[0, 4]], block = []}> +#linear1 = #ttg.linear<{register = [[4, 0, 0]], lane = [[8, 0, 0], [16, 0, 0], [32, 0, 0], [0, 1, 0], [1, 0, 0]], warp = [[2, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[2, 0, 0]], lane = [[4, 0, 0], [8, 0, 0], [16, 0, 0], [0, 0, 1], [0, 1, 0]], warp = [[1, 0, 0]], block = []}> +#linear3 = #ttg.linear<{register = [[1, 0, 0]], lane = [[2, 0, 0], [4, 0, 0], [8, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 1, 0]], block = []}> +#linear4 = #ttg.linear<{register = [[0, 1, 0]], lane = [[1, 0, 0], [2, 0, 0], [4, 0, 0], [0, 0, 1], [0, 0, 2]], warp = [[0, 0, 4]], block = []}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc90 = loc("in_ptr0"(#loc)) +#loc91 = loc("out_ptr2"(#loc)) +#loc92 = loc("out_ptr3"(#loc)) +#loc93 = loc("ks0"(#loc)) +#loc94 = loc("xnumel"(#loc)) +#loc95 = loc("r0_numel"(#loc)) +#loc117 = loc(callsite(#loc23 at #loc24)) +#loc123 = loc("ileft"(#loc32)) +#loc127 = loc("iright"(#loc37)) +#loc136 = loc("left_idx"(#loc46)) +#loc141 = loc("right_idx"(#loc51)) +#loc161 = loc("tmp11"(#loc71)) +#loc168 = loc(callsite(#loc28 at #loc117)) +#loc172 = loc(callsite(#loc1 at #loc161)) +#loc176 = loc(callsite(#loc123 at #loc168)) +#loc180 = loc(callsite(#loc127 at #loc168)) +#loc188 = loc(callsite(#loc136 at #loc168)) +#loc193 = loc(callsite(#loc141 at #loc168)) +#loc213 = loc(callsite(#loc1 at #loc176)) +#loc215 = loc(callsite(#loc1 at #loc180)) +#loc218 = loc(callsite(#loc1 at #loc188)) +#loc221 = loc(callsite(#loc1 at #loc193)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<8x16xi32, #linear> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %cst_1 = arith.constant dense<16> : tensor<8x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<16> : tensor<8x1xi64, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #linear1> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #linear2> loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x2x1xi32, #linear3> loc(#loc1) + %cst_6 = arith.constant dense<1> : tensor<1x2x1xi32, #linear4> loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc96) + %xoffset_8 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc97) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc98) + %xindex_9 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc98) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc98) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1xi32, #blocked1> loc(#loc98) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked> loc(#loc99) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<8x1xi32, #blocked1> loc(#loc99) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<8x1xi32, #blocked> loc(#loc99) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<8x1xi32, #blocked1> loc(#loc99) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked> loc(#loc100) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked1> loc(#loc100) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<8x1xi32, #blocked> loc(#loc100) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<8x1xi32, #blocked1> loc(#loc100) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc101) + %r0_index_19 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> loc(#loc101) + %r0_index_20 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc101) + %r0_index_21 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc101) + %r0_index_22 = tt.expand_dims %r0_index_19 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #linear}>> -> tensor<1x16xi32, #linear> loc(#loc101) + %r0_index_23 = tt.expand_dims %r0_index_20 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16xi32, #blocked1> loc(#loc101) + %x0 = arith.extsi %xindex_14 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc102) + %x0_24 = arith.extsi %xindex_15 : tensor<8x1xi32, #blocked1> to tensor<8x1xi64, #blocked1> loc(#loc102) + %x0_25 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked> loc(#loc102) + %x0_26 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked1> loc(#loc102) + %x0_27 = arith.remsi %x0, %x0_25 : tensor<8x1xi64, #blocked> loc(#loc102) + %x0_28 = arith.remsi %x0_24, %x0_26 : tensor<8x1xi64, #blocked1> loc(#loc102) + %x1 = arith.divsi %x0, %x0_25 : tensor<8x1xi64, #blocked> loc(#loc103) + %x1_29 = arith.divsi %x0_24, %x0_26 : tensor<8x1xi64, #blocked1> loc(#loc103) + %tmp0 = arith.extsi %r0_index_21 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc104) + %tmp0_30 = arith.extsi %r0_index_23 : tensor<1x16xi32, #blocked1> to tensor<1x16xi64, #blocked1> loc(#loc104) + %tmp0_31 = tt.broadcast %tmp0 : tensor<1x16xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc104) + %tmp0_32 = tt.broadcast %tmp0_30 : tensor<1x16xi64, #blocked1> -> tensor<8x16xi64, #blocked1> loc(#loc104) + %tmp0_33 = tt.broadcast %x0_27 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc104) + %tmp0_34 = arith.addi %tmp0_31, %tmp0_33 : tensor<8x16xi64, #blocked> loc(#loc104) + %tmp0_35 = arith.muli %x1, %cst_1 : tensor<8x1xi64, #blocked> loc(#loc105) + %tmp0_36 = arith.muli %x1_29, %cst_2 : tensor<8x1xi64, #blocked1> loc(#loc105) + %tmp0_37 = tt.broadcast %tmp0_35 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc106) + %tmp0_38 = arith.addi %tmp0_34, %tmp0_37 : tensor<8x16xi64, #blocked> loc(#loc106) + %tmp0_39 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_40 = arith.muli %tmp0_39, %tmp0 : tensor<1x16xi64, #blocked> loc(#loc107) + %tmp0_41 = tt.broadcast %tmp0_40 : tensor<1x16xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc108) + %tmp0_42 = arith.addi %tmp0_38, %tmp0_41 : tensor<8x16xi64, #blocked> loc(#loc108) + %tmp0_43 = arith.muli %ks0, %c16_i64 : i64 loc(#loc109) + %tmp0_44 = tt.splat %tmp0_43 : i64 -> tensor<8x1xi64, #blocked> loc(#loc110) + %tmp0_45 = arith.muli %tmp0_44, %x1 : tensor<8x1xi64, #blocked> loc(#loc110) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc111) + %tmp0_47 = arith.addi %tmp0_42, %tmp0_46 : tensor<8x16xi64, #blocked> loc(#loc111) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc112) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi64, #blocked> loc(#loc112) + %tmp0_50 = tt.broadcast %xmask_17 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc113) + %tmp0_51 = tt.broadcast %xmask_18 : tensor<8x1xi1, #blocked1> -> tensor<8x16xi1, #blocked1> loc(#loc113) + %tmp0_52 = tt.load %tmp0_49, %tmp0_50, %cst_7 evictionPolicy = evict_last : tensor<8x16x!tt.ptr, #blocked> loc(#loc113) + %tmp2 = arith.trunci %r0_index_22 : tensor<1x16xi32, #linear> to tensor<1x16xi16, #linear> loc(#loc114) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16, #linear> -> tensor<8x16xi16, #linear> loc(#loc115) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> loc(#loc165) + %flip_53 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> loc(#loc165) + %flip_54 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> loc(#loc165) + %flip_55 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> loc(#loc165) + %flip_56 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> loc(#loc165) + %flip_57 = tt.expand_dims %flip_53 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> loc(#loc165) + %flip_58 = tt.expand_dims %flip_54 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> loc(#loc165) + %flip_59 = tt.expand_dims %flip_55 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #linear4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> loc(#loc165) + %flip_60 = tt.expand_dims %flip_56 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear2}>> -> tensor<1x2x1xi32, #linear2> loc(#loc165) + %flip_61 = tt.expand_dims %flip_57 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear1}>> -> tensor<1x2x1xi32, #linear1> loc(#loc165) + %flip_62 = tt.expand_dims %flip_58 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear3}>> -> tensor<1x2x1xi32, #linear3> loc(#loc165) + %flip_63 = tt.expand_dims %flip_59 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #linear4}>> -> tensor<1x2x1xi32, #linear4> loc(#loc165) + %flip_64 = tt.broadcast %flip_60 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc166) + %flip_65 = tt.reshape %flip_64 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #blocked> loc(#loc167) + %flip_66 = tt.reshape %flip_64 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc167) + %y = tt.reshape %tmp0_52 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #linear1> loc(#loc173) + %left_mask = arith.subi %cst_3, %flip_61 : tensor<1x2x1xi32, #linear1> loc(#loc174) + %left_mask_67 = arith.subi %cst_4, %flip_60 : tensor<1x2x1xi32, #linear2> loc(#loc174) + %left_mask_68 = arith.subi %cst_5, %flip_62 : tensor<1x2x1xi32, #linear3> loc(#loc174) + %left_mask_69 = arith.subi %cst_6, %flip_63 : tensor<1x2x1xi32, #linear4> loc(#loc174) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc175) + %ileft_70 = arith.muli %y, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc175) + %ileft_71 = "tt.reduce"(%ileft_70) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_72 = tt.expand_dims %ileft_71 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc177) + %ileft_73 = tt.broadcast %ileft_72 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc178) + %iright = tt.broadcast %flip_61 : tensor<1x2x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc179) + %iright_74 = arith.muli %y, %iright : tensor<64x2x1xi32, #linear1> loc(#loc179) + %iright_75 = "tt.reduce"(%iright_74) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_76 = tt.expand_dims %iright_75 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc181) + %iright_77 = tt.broadcast %iright_76 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc182) + %ileft_78 = tt.reshape %ileft_73 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc183) + %ileft_79 = tt.reshape %ileft_73 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_80 = tt.reshape %iright_77 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc184) + %iright_81 = tt.reshape %iright_77 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx = tt.reshape %tmp4 : tensor<8x16xi16, #linear> -> tensor<64x2x1xi16, #linear1> loc(#loc185) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc186) + %left_idx_82 = tt.broadcast %left_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc187) + %left_idx_83 = arith.muli %y_idx, %left_idx_82 : tensor<64x2x1xi16, #linear1> loc(#loc187) + %input = arith.extsi %left_idx_83 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc216) + %left_idx_84 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc189) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc190) + %right_idx = arith.trunci %flip_61 : tensor<1x2x1xi32, #linear1> to tensor<1x2x1xi16, #linear1> loc(#loc191) + %right_idx_87 = tt.broadcast %right_idx : tensor<1x2x1xi16, #linear1> -> tensor<64x2x1xi16, #linear1> loc(#loc192) + %right_idx_88 = arith.muli %y_idx, %right_idx_87 : tensor<64x2x1xi16, #linear1> loc(#loc192) + %input_89 = arith.extsi %right_idx_88 : tensor<64x2x1xi16, #linear1> to tensor<64x2x1xi32, #linear1> loc(#loc219) + %right_idx_90 = "tt.reduce"(%input_89) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_91 = tt.expand_dims %right_idx_90 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc194) + %right_idx_92 = tt.broadcast %right_idx_91 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc195) + %left_idx_93 = tt.reshape %left_idx_86 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc196) + %left_idx_94 = tt.reshape %left_idx_86 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_95 = tt.reshape %right_idx_92 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #blocked> loc(#loc197) + %right_idx_96 = tt.reshape %right_idx_92 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond = arith.cmpi slt, %ileft_78, %iright_80 : tensor<8x16xi32, #blocked> loc(#loc198) + %cond_97 = arith.cmpi slt, %ileft_79, %iright_81 : tensor<8x16xi32, #linear> loc(#loc198) + %eq = arith.cmpi eq, %ileft_78, %iright_80 : tensor<8x16xi32, #blocked> loc(#loc199) + %eq_98 = arith.cmpi eq, %ileft_79, %iright_81 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_99 = arith.cmpi sgt, %left_idx_93, %right_idx_95 : tensor<8x16xi32, #blocked> loc(#loc200) + %cond_100 = arith.cmpi sgt, %left_idx_94, %right_idx_96 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_101 = arith.andi %eq, %cond_99 : tensor<8x16xi1, #blocked> loc(#loc201) + %cond_102 = arith.andi %eq_98, %cond_100 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_103 = arith.ori %cond, %cond_101 : tensor<8x16xi1, #blocked> loc(#loc202) + %cond_104 = arith.ori %cond_97, %cond_102 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_105 = arith.extui %cond_103 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc203) + %cond_106 = arith.extui %cond_104 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_107 = arith.xori %cond_105, %flip_65 : tensor<8x16xi32, #blocked> loc(#loc203) + %cond_108 = arith.xori %cond_106, %flip_66 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_109 = arith.cmpi ne, %cond_107, %cst_7 : tensor<8x16xi32, #blocked> loc(#loc204) + %cond_110 = arith.cmpi ne, %cond_108, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret = arith.xori %ileft_78, %iright_80 : tensor<8x16xi32, #blocked> loc(#loc205) + %ret_111 = arith.select %cond_109, %ret, %cst_7 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc206) + %ret_112 = arith.xori %tmp0_52, %ret_111 : tensor<8x16xi32, #blocked> loc(#loc207) + %ret_113 = ttg.convert_layout %ret_112 : tensor<8x16xi32, #blocked> -> tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs = arith.xori %left_idx_94, %right_idx_96 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_114 = arith.select %cond_110, %new_idxs, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_115 = arith.extsi %tmp2 : tensor<1x16xi16, #linear> to tensor<1x16xi32, #linear> loc(#loc210) + %new_idxs_116 = tt.broadcast %new_idxs_115 : tensor<1x16xi32, #linear> -> tensor<8x16xi32, #linear> loc(#loc210) + %new_idxs_117 = arith.xori %new_idxs_116, %new_idxs_114 : tensor<8x16xi32, #linear> loc(#loc210) + %flip_118 = tt.broadcast %flip_62 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc166) + %flip_119 = tt.reshape %flip_118 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc167) + %y_120 = tt.reshape %ret_112 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #linear2> loc(#loc173) + %ileft_121 = tt.broadcast %left_mask_67 : tensor<1x2x1xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc175) + %ileft_122 = arith.muli %y_120, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc175) + %ileft_123 = "tt.reduce"(%ileft_122) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_124 = tt.expand_dims %ileft_123 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc177) + %ileft_125 = tt.broadcast %ileft_124 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc178) + %iright_126 = arith.muli %y_120, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc179) + %iright_127 = "tt.reduce"(%iright_126) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_128 = tt.expand_dims %iright_127 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc181) + %iright_129 = tt.broadcast %iright_128 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc182) + %ileft_130 = tt.reshape %ileft_125 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_131 = tt.reshape %iright_129 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_132 = tt.reshape %new_idxs_117 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc185) + %left_idx_133 = arith.muli %y_idx_132, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc187) + %left_idx_134 = "tt.reduce"(%left_idx_133) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_135 = tt.expand_dims %left_idx_134 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc189) + %left_idx_136 = tt.broadcast %left_idx_135 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc190) + %right_idx_137 = arith.muli %y_idx_132, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc192) + %right_idx_138 = "tt.reduce"(%right_idx_137) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_139 = tt.expand_dims %right_idx_138 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc194) + %right_idx_140 = tt.broadcast %right_idx_139 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc195) + %left_idx_141 = tt.reshape %left_idx_136 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_142 = tt.reshape %right_idx_140 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_143 = arith.cmpi slt, %ileft_130, %iright_131 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_144 = arith.cmpi eq, %ileft_130, %iright_131 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_145 = arith.cmpi sgt, %left_idx_141, %right_idx_142 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_146 = arith.andi %eq_144, %cond_145 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_147 = arith.ori %cond_143, %cond_146 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_148 = arith.extui %cond_147 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_149 = arith.xori %cond_148, %flip_119 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_150 = arith.cmpi ne, %cond_149, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret_151 = arith.xori %ileft_130, %iright_131 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_152 = arith.select %cond_150, %ret_151, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_153 = arith.xori %ret_113, %ret_152 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_154 = arith.xori %left_idx_141, %right_idx_142 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_155 = arith.select %cond_150, %new_idxs_154, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_156 = arith.xori %new_idxs_117, %new_idxs_155 : tensor<8x16xi32, #linear> loc(#loc210) + %y_157 = tt.reshape %ret_153 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc173) + %ileft_158 = arith.muli %y_157, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc175) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc177) + %ileft_161 = tt.broadcast %ileft_160 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc178) + %iright_162 = arith.muli %y_157, %iright : tensor<64x2x1xi32, #linear1> loc(#loc179) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc181) + %iright_165 = tt.broadcast %iright_164 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc182) + %ileft_166 = tt.reshape %ileft_161 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_167 = tt.reshape %iright_165 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_168 = tt.reshape %new_idxs_156 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc185) + %left_idx_169 = arith.muli %y_idx_168, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc187) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc189) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc190) + %right_idx_173 = arith.muli %y_idx_168, %iright : tensor<64x2x1xi32, #linear1> loc(#loc192) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc194) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc195) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_184 = arith.extui %cond_183 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_185 = arith.xori %cond_184, %flip_119 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_186 = arith.cmpi ne, %cond_185, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_188 = arith.select %cond_186, %ret_187, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_189 = arith.xori %ret_153, %ret_188 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_192 = arith.xori %new_idxs_156, %new_idxs_191 : tensor<8x16xi32, #linear> loc(#loc210) + %flip_193 = tt.broadcast %flip_63 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc166) + %flip_194 = tt.reshape %flip_193 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc167) + %y_195 = tt.reshape %ret_189 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc173) + %ileft_196 = tt.broadcast %left_mask_68 : tensor<1x2x1xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc175) + %ileft_197 = arith.muli %y_195, %ileft_196 : tensor<16x2x4xi32, #linear3> loc(#loc175) + %ileft_198 = "tt.reduce"(%ileft_197) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_199 = tt.expand_dims %ileft_198 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc177) + %ileft_200 = tt.broadcast %ileft_199 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc178) + %iright_201 = arith.muli %y_195, %flip_118 : tensor<16x2x4xi32, #linear3> loc(#loc179) + %iright_202 = "tt.reduce"(%iright_201) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_203 = tt.expand_dims %iright_202 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc181) + %iright_204 = tt.broadcast %iright_203 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc182) + %ileft_205 = tt.reshape %ileft_200 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_206 = tt.reshape %iright_204 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_207 = tt.reshape %new_idxs_192 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc185) + %left_idx_208 = arith.muli %y_idx_207, %ileft_196 : tensor<16x2x4xi32, #linear3> loc(#loc187) + %left_idx_209 = "tt.reduce"(%left_idx_208) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_210 = tt.expand_dims %left_idx_209 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc189) + %left_idx_211 = tt.broadcast %left_idx_210 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc190) + %right_idx_212 = arith.muli %y_idx_207, %flip_118 : tensor<16x2x4xi32, #linear3> loc(#loc192) + %right_idx_213 = "tt.reduce"(%right_idx_212) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_214 = tt.expand_dims %right_idx_213 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc194) + %right_idx_215 = tt.broadcast %right_idx_214 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc195) + %left_idx_216 = tt.reshape %left_idx_211 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_217 = tt.reshape %right_idx_215 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_218 = arith.cmpi slt, %ileft_205, %iright_206 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_219 = arith.cmpi eq, %ileft_205, %iright_206 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_220 = arith.cmpi sgt, %left_idx_216, %right_idx_217 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_221 = arith.andi %eq_219, %cond_220 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_222 = arith.ori %cond_218, %cond_221 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_223 = arith.extui %cond_222 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_224 = arith.xori %cond_223, %flip_194 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_225 = arith.cmpi ne, %cond_224, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret_226 = arith.xori %ileft_205, %iright_206 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_227 = arith.select %cond_225, %ret_226, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_228 = arith.xori %ret_189, %ret_227 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_229 = arith.xori %left_idx_216, %right_idx_217 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_230 = arith.select %cond_225, %new_idxs_229, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_231 = arith.xori %new_idxs_192, %new_idxs_230 : tensor<8x16xi32, #linear> loc(#loc210) + %y_232 = tt.reshape %ret_228 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc173) + %ileft_233 = arith.muli %y_232, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc175) + %ileft_234 = "tt.reduce"(%ileft_233) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_235 = tt.expand_dims %ileft_234 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc177) + %ileft_236 = tt.broadcast %ileft_235 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc178) + %iright_237 = arith.muli %y_232, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc179) + %iright_238 = "tt.reduce"(%iright_237) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_239 = tt.expand_dims %iright_238 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc181) + %iright_240 = tt.broadcast %iright_239 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc182) + %ileft_241 = tt.reshape %ileft_236 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_242 = tt.reshape %iright_240 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_243 = tt.reshape %new_idxs_231 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc185) + %left_idx_244 = arith.muli %y_idx_243, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc187) + %left_idx_245 = "tt.reduce"(%left_idx_244) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_246 = tt.expand_dims %left_idx_245 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc189) + %left_idx_247 = tt.broadcast %left_idx_246 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc190) + %right_idx_248 = arith.muli %y_idx_243, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc192) + %right_idx_249 = "tt.reduce"(%right_idx_248) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_250 = tt.expand_dims %right_idx_249 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc194) + %right_idx_251 = tt.broadcast %right_idx_250 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc195) + %left_idx_252 = tt.reshape %left_idx_247 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_253 = tt.reshape %right_idx_251 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_254 = arith.cmpi slt, %ileft_241, %iright_242 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_255 = arith.cmpi eq, %ileft_241, %iright_242 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_256 = arith.cmpi sgt, %left_idx_252, %right_idx_253 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_257 = arith.andi %eq_255, %cond_256 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_258 = arith.ori %cond_254, %cond_257 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_259 = arith.extui %cond_258 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_260 = arith.xori %cond_259, %flip_194 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_261 = arith.cmpi ne, %cond_260, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret_262 = arith.xori %ileft_241, %iright_242 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_263 = arith.select %cond_261, %ret_262, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_264 = arith.xori %ret_228, %ret_263 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_265 = arith.xori %left_idx_252, %right_idx_253 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_266 = arith.select %cond_261, %new_idxs_265, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_267 = arith.xori %new_idxs_231, %new_idxs_266 : tensor<8x16xi32, #linear> loc(#loc210) + %y_268 = tt.reshape %ret_264 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc173) + %ileft_269 = arith.muli %y_268, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc175) + %ileft_270 = "tt.reduce"(%ileft_269) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_271 = tt.expand_dims %ileft_270 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc177) + %ileft_272 = tt.broadcast %ileft_271 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc178) + %iright_273 = arith.muli %y_268, %iright : tensor<64x2x1xi32, #linear1> loc(#loc179) + %iright_274 = "tt.reduce"(%iright_273) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_275 = tt.expand_dims %iright_274 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc181) + %iright_276 = tt.broadcast %iright_275 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc182) + %ileft_277 = tt.reshape %ileft_272 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_278 = tt.reshape %iright_276 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_279 = tt.reshape %new_idxs_267 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc185) + %left_idx_280 = arith.muli %y_idx_279, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc187) + %left_idx_281 = "tt.reduce"(%left_idx_280) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_282 = tt.expand_dims %left_idx_281 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc189) + %left_idx_283 = tt.broadcast %left_idx_282 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc190) + %right_idx_284 = arith.muli %y_idx_279, %iright : tensor<64x2x1xi32, #linear1> loc(#loc192) + %right_idx_285 = "tt.reduce"(%right_idx_284) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_286 = tt.expand_dims %right_idx_285 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc194) + %right_idx_287 = tt.broadcast %right_idx_286 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc195) + %left_idx_288 = tt.reshape %left_idx_283 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_289 = tt.reshape %right_idx_287 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_290 = arith.cmpi slt, %ileft_277, %iright_278 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_291 = arith.cmpi eq, %ileft_277, %iright_278 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_292 = arith.cmpi sgt, %left_idx_288, %right_idx_289 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_293 = arith.andi %eq_291, %cond_292 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_294 = arith.ori %cond_290, %cond_293 : tensor<8x16xi1, #linear> loc(#loc202) + %cond_295 = arith.extui %cond_294 : tensor<8x16xi1, #linear> to tensor<8x16xi32, #linear> loc(#loc203) + %cond_296 = arith.xori %cond_295, %flip_194 : tensor<8x16xi32, #linear> loc(#loc203) + %cond_297 = arith.cmpi ne, %cond_296, %cst : tensor<8x16xi32, #linear> loc(#loc204) + %ret_298 = arith.xori %ileft_277, %iright_278 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_299 = arith.select %cond_297, %ret_298, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_300 = arith.xori %ret_264, %ret_299 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_301 = arith.xori %left_idx_288, %right_idx_289 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_302 = arith.select %cond_297, %new_idxs_301, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_303 = arith.xori %new_idxs_267, %new_idxs_302 : tensor<8x16xi32, #linear> loc(#loc210) + %y_304 = tt.reshape %ret_300 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc173) + %ileft_305 = tt.broadcast %left_mask_69 : tensor<1x2x1xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc175) + %ileft_306 = arith.muli %y_304, %ileft_305 : tensor<8x2x8xi32, #linear4> loc(#loc175) + %ileft_307 = "tt.reduce"(%ileft_306) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc212) + %ileft_308 = tt.expand_dims %ileft_307 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc177) + %ileft_309 = tt.broadcast %ileft_308 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc178) + %iright_310 = arith.muli %y_304, %flip_193 : tensor<8x2x8xi32, #linear4> loc(#loc179) + %iright_311 = "tt.reduce"(%iright_310) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc214) + %iright_312 = tt.expand_dims %iright_311 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc181) + %iright_313 = tt.broadcast %iright_312 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc182) + %ileft_314 = tt.reshape %ileft_309 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_315 = tt.reshape %iright_313 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_316 = tt.reshape %new_idxs_303 : tensor<8x16xi32, #linear> -> tensor<8x2x8xi32, #linear4> loc(#loc185) + %left_idx_317 = arith.muli %y_idx_316, %ileft_305 : tensor<8x2x8xi32, #linear4> loc(#loc187) + %left_idx_318 = "tt.reduce"(%left_idx_317) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc217) + %left_idx_319 = tt.expand_dims %left_idx_318 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc189) + %left_idx_320 = tt.broadcast %left_idx_319 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc190) + %right_idx_321 = arith.muli %y_idx_316, %flip_193 : tensor<8x2x8xi32, #linear4> loc(#loc192) + %right_idx_322 = "tt.reduce"(%right_idx_321) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<8x2x8xi32, #linear4>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> loc(#loc220) + %right_idx_323 = tt.expand_dims %right_idx_322 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #linear4}>> -> tensor<8x1x8xi32, #linear4> loc(#loc194) + %right_idx_324 = tt.broadcast %right_idx_323 : tensor<8x1x8xi32, #linear4> -> tensor<8x2x8xi32, #linear4> loc(#loc195) + %left_idx_325 = tt.reshape %left_idx_320 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_326 = tt.reshape %right_idx_324 : tensor<8x2x8xi32, #linear4> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_327 = arith.cmpi slt, %ileft_314, %iright_315 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_328 = arith.cmpi eq, %ileft_314, %iright_315 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_329 = arith.cmpi sgt, %left_idx_325, %right_idx_326 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_330 = arith.andi %eq_328, %cond_329 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_331 = arith.ori %cond_327, %cond_330 : tensor<8x16xi1, #linear> loc(#loc202) + %ret_332 = arith.xori %ileft_314, %iright_315 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_333 = arith.select %cond_331, %ret_332, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_334 = arith.xori %ret_300, %ret_333 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_335 = arith.xori %left_idx_325, %right_idx_326 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_336 = arith.select %cond_331, %new_idxs_335, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_337 = arith.xori %new_idxs_303, %new_idxs_336 : tensor<8x16xi32, #linear> loc(#loc210) + %y_338 = tt.reshape %ret_334 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc173) + %ileft_339 = arith.muli %y_338, %ileft_196 : tensor<16x2x4xi32, #linear3> loc(#loc175) + %ileft_340 = "tt.reduce"(%ileft_339) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc212) + %ileft_341 = tt.expand_dims %ileft_340 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc177) + %ileft_342 = tt.broadcast %ileft_341 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc178) + %iright_343 = arith.muli %y_338, %flip_118 : tensor<16x2x4xi32, #linear3> loc(#loc179) + %iright_344 = "tt.reduce"(%iright_343) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc214) + %iright_345 = tt.expand_dims %iright_344 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc181) + %iright_346 = tt.broadcast %iright_345 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc182) + %ileft_347 = tt.reshape %ileft_342 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_348 = tt.reshape %iright_346 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_349 = tt.reshape %new_idxs_337 : tensor<8x16xi32, #linear> -> tensor<16x2x4xi32, #linear3> loc(#loc185) + %left_idx_350 = arith.muli %y_idx_349, %ileft_196 : tensor<16x2x4xi32, #linear3> loc(#loc187) + %left_idx_351 = "tt.reduce"(%left_idx_350) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc217) + %left_idx_352 = tt.expand_dims %left_idx_351 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc189) + %left_idx_353 = tt.broadcast %left_idx_352 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc190) + %right_idx_354 = arith.muli %y_idx_349, %flip_118 : tensor<16x2x4xi32, #linear3> loc(#loc192) + %right_idx_355 = "tt.reduce"(%right_idx_354) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<16x2x4xi32, #linear3>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> loc(#loc220) + %right_idx_356 = tt.expand_dims %right_idx_355 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #linear3}>> -> tensor<16x1x4xi32, #linear3> loc(#loc194) + %right_idx_357 = tt.broadcast %right_idx_356 : tensor<16x1x4xi32, #linear3> -> tensor<16x2x4xi32, #linear3> loc(#loc195) + %left_idx_358 = tt.reshape %left_idx_353 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_359 = tt.reshape %right_idx_357 : tensor<16x2x4xi32, #linear3> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_360 = arith.cmpi slt, %ileft_347, %iright_348 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_361 = arith.cmpi eq, %ileft_347, %iright_348 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_362 = arith.cmpi sgt, %left_idx_358, %right_idx_359 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_363 = arith.andi %eq_361, %cond_362 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_364 = arith.ori %cond_360, %cond_363 : tensor<8x16xi1, #linear> loc(#loc202) + %ret_365 = arith.xori %ileft_347, %iright_348 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_366 = arith.select %cond_364, %ret_365, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_367 = arith.xori %ret_334, %ret_366 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_368 = arith.xori %left_idx_358, %right_idx_359 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_369 = arith.select %cond_364, %new_idxs_368, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_370 = arith.xori %new_idxs_337, %new_idxs_369 : tensor<8x16xi32, #linear> loc(#loc210) + %y_371 = tt.reshape %ret_367 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc173) + %ileft_372 = arith.muli %y_371, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc175) + %ileft_373 = "tt.reduce"(%ileft_372) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc212) + %ileft_374 = tt.expand_dims %ileft_373 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc177) + %ileft_375 = tt.broadcast %ileft_374 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc178) + %iright_376 = arith.muli %y_371, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc179) + %iright_377 = "tt.reduce"(%iright_376) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc214) + %iright_378 = tt.expand_dims %iright_377 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc181) + %iright_379 = tt.broadcast %iright_378 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc182) + %ileft_380 = tt.reshape %ileft_375 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_381 = tt.reshape %iright_379 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_382 = tt.reshape %new_idxs_370 : tensor<8x16xi32, #linear> -> tensor<32x2x2xi32, #linear2> loc(#loc185) + %left_idx_383 = arith.muli %y_idx_382, %ileft_121 : tensor<32x2x2xi32, #linear2> loc(#loc187) + %left_idx_384 = "tt.reduce"(%left_idx_383) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc217) + %left_idx_385 = tt.expand_dims %left_idx_384 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc189) + %left_idx_386 = tt.broadcast %left_idx_385 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc190) + %right_idx_387 = arith.muli %y_idx_382, %flip_64 : tensor<32x2x2xi32, #linear2> loc(#loc192) + %right_idx_388 = "tt.reduce"(%right_idx_387) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32, #linear2>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> loc(#loc220) + %right_idx_389 = tt.expand_dims %right_idx_388 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #linear2}>> -> tensor<32x1x2xi32, #linear2> loc(#loc194) + %right_idx_390 = tt.broadcast %right_idx_389 : tensor<32x1x2xi32, #linear2> -> tensor<32x2x2xi32, #linear2> loc(#loc195) + %left_idx_391 = tt.reshape %left_idx_386 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_392 = tt.reshape %right_idx_390 : tensor<32x2x2xi32, #linear2> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_393 = arith.cmpi slt, %ileft_380, %iright_381 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_394 = arith.cmpi eq, %ileft_380, %iright_381 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_395 = arith.cmpi sgt, %left_idx_391, %right_idx_392 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_396 = arith.andi %eq_394, %cond_395 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_397 = arith.ori %cond_393, %cond_396 : tensor<8x16xi1, #linear> loc(#loc202) + %ret_398 = arith.xori %ileft_380, %iright_381 : tensor<8x16xi32, #linear> loc(#loc205) + %ret_399 = arith.select %cond_397, %ret_398, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc206) + %ret_400 = arith.xori %ret_367, %ret_399 : tensor<8x16xi32, #linear> loc(#loc207) + %new_idxs_401 = arith.xori %left_idx_391, %right_idx_392 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_402 = arith.select %cond_397, %new_idxs_401, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_403 = arith.xori %new_idxs_370, %new_idxs_402 : tensor<8x16xi32, #linear> loc(#loc210) + %y_404 = tt.reshape %ret_400 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc173) + %ileft_405 = arith.muli %y_404, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc175) + %ileft_406 = "tt.reduce"(%ileft_405) <{axis = 1 : i32}> ({ + ^bb0(%ileft_435: i32 loc(callsite(#loc1 at #loc176)), %ileft_436: i32 loc(callsite(#loc1 at #loc176))): + %ileft_437 = arith.addi %ileft_435, %ileft_436 : i32 loc(#loc222) + tt.reduce.return %ileft_437 : i32 loc(#loc212) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc212) + %ileft_407 = tt.expand_dims %ileft_406 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc177) + %ileft_408 = tt.broadcast %ileft_407 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc178) + %iright_409 = arith.muli %y_404, %iright : tensor<64x2x1xi32, #linear1> loc(#loc179) + %iright_410 = "tt.reduce"(%iright_409) <{axis = 1 : i32}> ({ + ^bb0(%iright_435: i32 loc(callsite(#loc1 at #loc180)), %iright_436: i32 loc(callsite(#loc1 at #loc180))): + %iright_437 = arith.addi %iright_435, %iright_436 : i32 loc(#loc223) + tt.reduce.return %iright_437 : i32 loc(#loc214) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc214) + %iright_411 = tt.expand_dims %iright_410 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc181) + %iright_412 = tt.broadcast %iright_411 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc182) + %ileft_413 = tt.reshape %ileft_408 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc183) + %iright_414 = tt.reshape %iright_412 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc184) + %y_idx_415 = tt.reshape %new_idxs_403 : tensor<8x16xi32, #linear> -> tensor<64x2x1xi32, #linear1> loc(#loc185) + %left_idx_416 = arith.muli %y_idx_415, %ileft : tensor<64x2x1xi32, #linear1> loc(#loc187) + %left_idx_417 = "tt.reduce"(%left_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_435: i32 loc(callsite(#loc1 at #loc188)), %left_idx_436: i32 loc(callsite(#loc1 at #loc188))): + %left_idx_437 = arith.addi %left_idx_435, %left_idx_436 : i32 loc(#loc224) + tt.reduce.return %left_idx_437 : i32 loc(#loc217) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc217) + %left_idx_418 = tt.expand_dims %left_idx_417 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc189) + %left_idx_419 = tt.broadcast %left_idx_418 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc190) + %right_idx_420 = arith.muli %y_idx_415, %iright : tensor<64x2x1xi32, #linear1> loc(#loc192) + %right_idx_421 = "tt.reduce"(%right_idx_420) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_435: i32 loc(callsite(#loc1 at #loc193)), %right_idx_436: i32 loc(callsite(#loc1 at #loc193))): + %right_idx_437 = arith.addi %right_idx_435, %right_idx_436 : i32 loc(#loc225) + tt.reduce.return %right_idx_437 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32, #linear1>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> loc(#loc220) + %right_idx_422 = tt.expand_dims %right_idx_421 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #linear1}>> -> tensor<64x1x1xi32, #linear1> loc(#loc194) + %right_idx_423 = tt.broadcast %right_idx_422 : tensor<64x1x1xi32, #linear1> -> tensor<64x2x1xi32, #linear1> loc(#loc195) + %left_idx_424 = tt.reshape %left_idx_419 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc196) + %right_idx_425 = tt.reshape %right_idx_423 : tensor<64x2x1xi32, #linear1> -> tensor<8x16xi32, #linear> loc(#loc197) + %cond_426 = arith.cmpi slt, %ileft_413, %iright_414 : tensor<8x16xi32, #linear> loc(#loc198) + %eq_427 = arith.cmpi eq, %ileft_413, %iright_414 : tensor<8x16xi32, #linear> loc(#loc199) + %cond_428 = arith.cmpi sgt, %left_idx_424, %right_idx_425 : tensor<8x16xi32, #linear> loc(#loc200) + %cond_429 = arith.andi %eq_427, %cond_428 : tensor<8x16xi1, #linear> loc(#loc201) + %cond_430 = arith.ori %cond_426, %cond_429 : tensor<8x16xi1, #linear> loc(#loc202) + %new_idxs_431 = arith.xori %left_idx_424, %right_idx_425 : tensor<8x16xi32, #linear> loc(#loc208) + %new_idxs_432 = arith.select %cond_430, %new_idxs_431, %cst : tensor<8x16xi1, #linear>, tensor<8x16xi32, #linear> loc(#loc209) + %new_idxs_433 = arith.xori %new_idxs_403, %new_idxs_432 : tensor<8x16xi32, #linear> loc(#loc210) + %tmp7 = arith.extsi %tmp0_52 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc159) + %tmp10 = arith.select %tmp0_50, %tmp7, %cst_0 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc160) + %tmp11 = "tt.reduce"(%tmp10) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_435: i64 loc(callsite(#loc1 at #loc161)), %tmp11_436: i64 loc(callsite(#loc1 at #loc161))): + %tmp11_437 = arith.addi %tmp11_435, %tmp11_436 : i64 loc(#loc211) + tt.reduce.return %tmp11_437 : i64 loc(#loc171) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc171) + %tmp11_434 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc162) + %tmp14 = arith.trunci %tmp11_434 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc163) + %0 = arith.muli %x0_28, %cst_2 : tensor<8x1xi64, #blocked1> loc(#loc74) + %1 = tt.broadcast %0 : tensor<8x1xi64, #blocked1> -> tensor<8x16xi64, #blocked1> loc(#loc75) + %2 = arith.addi %tmp0_32, %1 : tensor<8x16xi64, #blocked1> loc(#loc75) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc76) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc77) + %5 = arith.extui %4 : i1 to i64 loc(#loc78) + %6 = arith.muli %ks0, %5 : i64 loc(#loc78) + %7 = arith.extui %3 : i1 to i64 loc(#loc164) + %8 = arith.addi %7, %6 : i64 loc(#loc79) + %9 = tt.splat %8 : i64 -> tensor<8x1xi64, #blocked1> loc(#loc81) + %10 = tt.splat %8 : i64 -> tensor<8x1xi64, #blocked> loc(#loc81) + %11 = arith.muli %tmp0_36, %9 : tensor<8x1xi64, #blocked1> loc(#loc81) + %12 = tt.broadcast %11 : tensor<8x1xi64, #blocked1> -> tensor<8x16xi64, #blocked1> loc(#loc82) + %13 = arith.addi %2, %12 : tensor<8x16xi64, #blocked1> loc(#loc82) + %14 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked1> loc(#loc83) + %15 = tt.addptr %14, %13 : tensor<8x16x!tt.ptr, #blocked1>, tensor<8x16xi64, #blocked1> loc(#loc83) + %16 = ttg.convert_layout %new_idxs_433 : tensor<8x16xi32, #linear> -> tensor<8x16xi32, #blocked1> loc(#loc84) + tt.store %15, %16, %tmp0_51 : tensor<8x16x!tt.ptr, #blocked1> loc(#loc84) + %17 = arith.muli %x1, %10 : tensor<8x1xi64, #blocked> loc(#loc85) + %18 = arith.addi %x0_27, %17 : tensor<8x1xi64, #blocked> loc(#loc86) + %19 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc87) + %20 = tt.addptr %19, %18 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi64, #blocked> loc(#loc87) + tt.store %20, %tmp14, %xmask_17 : tensor<8x1x!tt.ptr, #blocked> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc96 = loc("xoffset"(#loc2)) +#loc97 = loc("xoffset"(#loc3)) +#loc98 = loc("xindex"(#loc4)) +#loc99 = loc("xindex"(#loc5)) +#loc100 = loc("xmask"(#loc6)) +#loc101 = loc("r0_index"(#loc7)) +#loc102 = loc("x0"(#loc8)) +#loc103 = loc("x1"(#loc9)) +#loc104 = loc("tmp0"(#loc10)) +#loc105 = loc("tmp0"(#loc11)) +#loc106 = loc("tmp0"(#loc12)) +#loc107 = loc("tmp0"(#loc13)) +#loc108 = loc("tmp0"(#loc14)) +#loc109 = loc("tmp0"(#loc15)) +#loc110 = loc("tmp0"(#loc16)) +#loc111 = loc("tmp0"(#loc17)) +#loc112 = loc("tmp0"(#loc18)) +#loc113 = loc("tmp0"(#loc19)) +#loc114 = loc("tmp2"(#loc20)) +#loc115 = loc("tmp4"(#loc21)) +#loc116 = loc("flip"(#loc22)) +#loc118 = loc("flip"(#loc25)) +#loc119 = loc("flip"(#loc26)) +#loc120 = loc("y"(#loc27)) +#loc121 = loc("left_mask"(#loc29)) +#loc122 = loc("ileft"(#loc30)) +#loc124 = loc("ileft"(#loc34)) +#loc125 = loc("ileft"(#loc35)) +#loc126 = loc("iright"(#loc36)) +#loc128 = loc("iright"(#loc38)) +#loc129 = loc("iright"(#loc39)) +#loc130 = loc("ileft"(#loc40)) +#loc131 = loc("iright"(#loc41)) +#loc132 = loc("y_idx"(#loc42)) +#loc133 = loc("left_idx"(#loc43)) +#loc134 = loc("left_idx"(#loc44)) +#loc135 = loc("input"(#loc45)) +#loc137 = loc("left_idx"(#loc47)) +#loc138 = loc("left_idx"(#loc48)) +#loc139 = loc("right_idx"(#loc49)) +#loc140 = loc("right_idx"(#loc50)) +#loc142 = loc("right_idx"(#loc52)) +#loc143 = loc("right_idx"(#loc53)) +#loc144 = loc("left_idx"(#loc54)) +#loc145 = loc("right_idx"(#loc55)) +#loc146 = loc("cond"(#loc56)) +#loc147 = loc("eq"(#loc57)) +#loc148 = loc("cond"(#loc58)) +#loc149 = loc("cond"(#loc59)) +#loc150 = loc("cond"(#loc60)) +#loc151 = loc("cond"(#loc61)) +#loc152 = loc("cond"(#loc62)) +#loc153 = loc("ret"(#loc63)) +#loc154 = loc("ret"(#loc64)) +#loc155 = loc("ret"(#loc65)) +#loc156 = loc("new_idxs"(#loc66)) +#loc157 = loc("new_idxs"(#loc67)) +#loc158 = loc("new_idxs"(#loc68)) +#loc159 = loc("tmp7"(#loc69)) +#loc160 = loc("tmp10"(#loc70)) +#loc162 = loc("tmp11"(#loc72)) +#loc163 = loc("tmp14"(#loc73)) +#loc164 = loc(fused[#loc79, #loc80]) +#loc165 = loc(callsite(#loc116 at #loc117)) +#loc166 = loc(callsite(#loc118 at #loc117)) +#loc167 = loc(callsite(#loc119 at #loc117)) +#loc169 = loc("cond"(#loc146)) +#loc170 = loc("eq"(#loc147)) +#loc171 = loc(callsite(#loc31 at #loc161)) +#loc173 = loc(callsite(#loc120 at #loc168)) +#loc174 = loc(callsite(#loc121 at #loc168)) +#loc175 = loc(callsite(#loc122 at #loc168)) +#loc177 = loc(callsite(#loc124 at #loc168)) +#loc178 = loc(callsite(#loc125 at #loc168)) +#loc179 = loc(callsite(#loc126 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc168)) +#loc182 = loc(callsite(#loc129 at #loc168)) +#loc183 = loc(callsite(#loc130 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc168)) +#loc185 = loc(callsite(#loc132 at #loc168)) +#loc186 = loc(callsite(#loc133 at #loc168)) +#loc187 = loc(callsite(#loc134 at #loc168)) +#loc189 = loc(callsite(#loc137 at #loc168)) +#loc190 = loc(callsite(#loc138 at #loc168)) +#loc191 = loc(callsite(#loc139 at #loc168)) +#loc192 = loc(callsite(#loc140 at #loc168)) +#loc194 = loc(callsite(#loc142 at #loc168)) +#loc195 = loc(callsite(#loc143 at #loc168)) +#loc196 = loc(callsite(#loc144 at #loc168)) +#loc197 = loc(callsite(#loc145 at #loc168)) +#loc198 = loc(callsite(#loc169 at #loc168)) +#loc199 = loc(callsite(#loc170 at #loc168)) +#loc200 = loc(callsite(#loc148 at #loc168)) +#loc201 = loc(callsite(#loc149 at #loc168)) +#loc202 = loc(callsite(#loc150 at #loc168)) +#loc203 = loc(callsite(#loc151 at #loc168)) +#loc204 = loc(callsite(#loc152 at #loc168)) +#loc205 = loc(callsite(#loc153 at #loc168)) +#loc206 = loc(callsite(#loc154 at #loc168)) +#loc207 = loc(callsite(#loc155 at #loc168)) +#loc208 = loc(callsite(#loc156 at #loc168)) +#loc209 = loc(callsite(#loc157 at #loc168)) +#loc210 = loc(callsite(#loc158 at #loc168)) +#loc211 = loc(callsite(#loc33 at #loc171)) +#loc212 = loc(callsite(#loc31 at #loc176)) +#loc214 = loc(callsite(#loc31 at #loc180)) +#loc216 = loc(callsite(#loc135 at #loc188)) +#loc217 = loc(callsite(#loc31 at #loc188)) +#loc219 = loc(callsite(#loc135 at #loc193)) +#loc220 = loc(callsite(#loc31 at #loc193)) +#loc222 = loc(callsite(#loc33 at #loc212)) +#loc223 = loc(callsite(#loc33 at #loc214)) +#loc224 = loc(callsite(#loc33 at #loc217)) +#loc225 = loc(callsite(#loc33 at #loc220)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d44ea86eeb94069eaf8865b40581718a213489d1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/XZV6XTBWTSYOI4CHOR7ZEUO2BBGHX4BYZH7H6F44A4ESTMX55GUQ/triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3.ttir @@ -0,0 +1,840 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":40:67) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:26) +#loc93 = loc("in_ptr0"(#loc)) +#loc94 = loc("out_ptr2"(#loc)) +#loc95 = loc("out_ptr3"(#loc)) +#loc96 = loc("ks0"(#loc)) +#loc97 = loc("xnumel"(#loc)) +#loc98 = loc("r0_numel"(#loc)) +#loc124 = loc(callsite(#loc27 at #loc2)) +#loc131 = loc("ileft"(#loc36)) +#loc135 = loc("iright"(#loc41)) +#loc144 = loc("left_idx"(#loc50)) +#loc149 = loc("right_idx"(#loc55)) +#loc168 = loc("tmp11"(#loc74)) +#loc176 = loc(callsite(#loc32 at #loc124)) +#loc180 = loc(callsite(#loc1 at #loc168)) +#loc184 = loc(callsite(#loc131 at #loc176)) +#loc188 = loc(callsite(#loc135 at #loc176)) +#loc196 = loc(callsite(#loc144 at #loc176)) +#loc201 = loc(callsite(#loc149 at #loc176)) +#loc221 = loc(callsite(#loc1 at #loc184)) +#loc223 = loc(callsite(#loc1 at #loc188)) +#loc226 = loc(callsite(#loc1 at #loc196)) +#loc229 = loc(callsite(#loc1 at #loc201)) +module { + tt.func public @triton_per_fused__to_copy_clone_slice_sort_sum_transpose_3(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %out_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr3"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc99) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %tmp10 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc100) + %cst_1 = arith.constant dense<16> : tensor<8x1xi64> loc(#loc1) + %c16_i64 = arith.constant 16 : i64 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc101) + %xoffset_2 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc102) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc103) + %xindex_3 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc104) + %xindex_4 = tt.splat %xoffset_2 : i32 -> tensor<8x1xi32> loc(#loc105) + %xindex_5 = arith.addi %xindex_4, %xindex_3 : tensor<8x1xi32> loc(#loc105) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32> loc(#loc106) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<8x1xi32> loc(#loc106) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc107) + %r0_index_7 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc108) + %x0 = arith.extsi %xindex_5 : tensor<8x1xi32> to tensor<8x1xi64> loc(#loc109) + %x0_8 = tt.splat %ks0 : i64 -> tensor<8x1xi64> loc(#loc109) + %x0_9 = arith.remsi %x0, %x0_8 : tensor<8x1xi64> loc(#loc109) + %x1 = arith.divsi %x0, %x0_8 : tensor<8x1xi64> loc(#loc110) + %tmp0 = arith.extsi %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc111) + %tmp0_10 = tt.broadcast %tmp0 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc111) + %tmp0_11 = tt.broadcast %x0_9 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc111) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<8x16xi64> loc(#loc111) + %tmp0_13 = arith.muli %x1, %cst_1 : tensor<8x1xi64> loc(#loc112) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc113) + %tmp0_15 = arith.addi %tmp0_12, %tmp0_14 : tensor<8x16xi64> loc(#loc113) + %tmp0_16 = tt.splat %ks0 : i64 -> tensor<1x16xi64> loc(#loc114) + %tmp0_17 = arith.muli %tmp0_16, %tmp0 : tensor<1x16xi64> loc(#loc114) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<1x16xi64> -> tensor<8x16xi64> loc(#loc115) + %tmp0_19 = arith.addi %tmp0_15, %tmp0_18 : tensor<8x16xi64> loc(#loc115) + %tmp0_20 = arith.muli %ks0, %c16_i64 : i64 loc(#loc116) + %tmp0_21 = tt.splat %tmp0_20 : i64 -> tensor<8x1xi64> loc(#loc117) + %tmp0_22 = arith.muli %tmp0_21, %x1 : tensor<8x1xi64> loc(#loc117) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc118) + %tmp0_24 = arith.addi %tmp0_19, %tmp0_23 : tensor<8x16xi64> loc(#loc118) + %tmp0_25 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc119) + %tmp0_26 = tt.addptr %tmp0_25, %tmp0_24 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc119) + %tmp0_27 = tt.broadcast %xmask_6 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc120) + %tmp0_28 = tt.load %tmp0_26, %tmp0_27, %cst_0 evictionPolicy = evict_last : tensor<8x16x!tt.ptr> loc(#loc120) + %tmp2 = arith.trunci %r0_index_7 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc121) + %tmp4 = tt.broadcast %tmp2 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc122) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc172) + %flip_29 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc173) + %flip_30 = tt.expand_dims %flip_29 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc173) + %flip_31 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc174) + %flip_32 = tt.reshape %flip_31 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc175) + %y = tt.reshape %tmp0_28 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc181) + %left_mask = arith.subi %cst, %flip_30 : tensor<1x2x1xi32> loc(#loc182) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc183) + %ileft_33 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc183) + %ileft_34 = "tt.reduce"(%ileft_33) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc220) + %ileft_35 = tt.expand_dims %ileft_34 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc185) + %ileft_36 = tt.broadcast %ileft_35 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc186) + %iright = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc187) + %iright_37 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc187) + %iright_38 = "tt.reduce"(%iright_37) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc222) + %iright_39 = tt.expand_dims %iright_38 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc189) + %iright_40 = tt.broadcast %iright_39 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc190) + %ileft_41 = tt.reshape %ileft_36 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_42 = tt.reshape %iright_40 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx = tt.reshape %tmp4 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc193) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc194) + %left_idx_43 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc195) + %left_idx_44 = arith.muli %y_idx, %left_idx_43 : tensor<64x2x1xi16> loc(#loc195) + %input = arith.extsi %left_idx_44 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc224) + %left_idx_45 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc225) + %left_idx_46 = tt.expand_dims %left_idx_45 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc197) + %left_idx_47 = tt.broadcast %left_idx_46 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc198) + %right_idx = arith.trunci %flip_30 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc199) + %right_idx_48 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc200) + %right_idx_49 = arith.muli %y_idx, %right_idx_48 : tensor<64x2x1xi16> loc(#loc200) + %input_50 = arith.extsi %right_idx_49 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc227) + %right_idx_51 = "tt.reduce"(%input_50) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc228) + %right_idx_52 = tt.expand_dims %right_idx_51 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc202) + %right_idx_53 = tt.broadcast %right_idx_52 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc203) + %left_idx_54 = tt.reshape %left_idx_47 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_55 = tt.reshape %right_idx_53 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc205) + %cond = arith.cmpi slt, %ileft_41, %iright_42 : tensor<8x16xi32> loc(#loc206) + %eq = arith.cmpi eq, %ileft_41, %iright_42 : tensor<8x16xi32> loc(#loc207) + %cond_56 = arith.cmpi sgt, %left_idx_54, %right_idx_55 : tensor<8x16xi32> loc(#loc208) + %cond_57 = arith.andi %eq, %cond_56 : tensor<8x16xi1> loc(#loc209) + %cond_58 = arith.ori %cond, %cond_57 : tensor<8x16xi1> loc(#loc210) + %cond_59 = arith.extui %cond_58 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_60 = arith.xori %cond_59, %flip_32 : tensor<8x16xi32> loc(#loc211) + %cond_61 = arith.cmpi ne, %cond_60, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret = arith.xori %ileft_41, %iright_42 : tensor<8x16xi32> loc(#loc213) + %ret_62 = arith.select %cond_61, %ret, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_63 = arith.xori %tmp0_28, %ret_62 : tensor<8x16xi32> loc(#loc215) + %new_idxs = arith.xori %left_idx_54, %right_idx_55 : tensor<8x16xi32> loc(#loc216) + %new_idxs_64 = arith.select %cond_61, %new_idxs, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_65 = arith.extsi %tmp2 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc218) + %new_idxs_66 = tt.broadcast %new_idxs_65 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc218) + %new_idxs_67 = arith.xori %new_idxs_66, %new_idxs_64 : tensor<8x16xi32> loc(#loc218) + %flip_68 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc174) + %flip_69 = tt.reshape %flip_68 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc175) + %y_70 = tt.reshape %ret_63 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc181) + %ileft_71 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc183) + %ileft_72 = arith.muli %y_70, %ileft_71 : tensor<32x2x2xi32> loc(#loc183) + %ileft_73 = "tt.reduce"(%ileft_72) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc220) + %ileft_74 = tt.expand_dims %ileft_73 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc185) + %ileft_75 = tt.broadcast %ileft_74 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc186) + %iright_76 = arith.muli %y_70, %flip_31 : tensor<32x2x2xi32> loc(#loc187) + %iright_77 = "tt.reduce"(%iright_76) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc222) + %iright_78 = tt.expand_dims %iright_77 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc189) + %iright_79 = tt.broadcast %iright_78 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc190) + %ileft_80 = tt.reshape %ileft_75 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_81 = tt.reshape %iright_79 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_82 = tt.reshape %new_idxs_67 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc193) + %left_idx_83 = arith.muli %y_idx_82, %ileft_71 : tensor<32x2x2xi32> loc(#loc195) + %left_idx_84 = "tt.reduce"(%left_idx_83) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc225) + %left_idx_85 = tt.expand_dims %left_idx_84 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc197) + %left_idx_86 = tt.broadcast %left_idx_85 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc198) + %right_idx_87 = arith.muli %y_idx_82, %flip_31 : tensor<32x2x2xi32> loc(#loc200) + %right_idx_88 = "tt.reduce"(%right_idx_87) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc228) + %right_idx_89 = tt.expand_dims %right_idx_88 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc202) + %right_idx_90 = tt.broadcast %right_idx_89 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc203) + %left_idx_91 = tt.reshape %left_idx_86 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_92 = tt.reshape %right_idx_90 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_93 = arith.cmpi slt, %ileft_80, %iright_81 : tensor<8x16xi32> loc(#loc206) + %eq_94 = arith.cmpi eq, %ileft_80, %iright_81 : tensor<8x16xi32> loc(#loc207) + %cond_95 = arith.cmpi sgt, %left_idx_91, %right_idx_92 : tensor<8x16xi32> loc(#loc208) + %cond_96 = arith.andi %eq_94, %cond_95 : tensor<8x16xi1> loc(#loc209) + %cond_97 = arith.ori %cond_93, %cond_96 : tensor<8x16xi1> loc(#loc210) + %cond_98 = arith.extui %cond_97 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_99 = arith.xori %cond_98, %flip_69 : tensor<8x16xi32> loc(#loc211) + %cond_100 = arith.cmpi ne, %cond_99, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret_101 = arith.xori %ileft_80, %iright_81 : tensor<8x16xi32> loc(#loc213) + %ret_102 = arith.select %cond_100, %ret_101, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_103 = arith.xori %ret_63, %ret_102 : tensor<8x16xi32> loc(#loc215) + %new_idxs_104 = arith.xori %left_idx_91, %right_idx_92 : tensor<8x16xi32> loc(#loc216) + %new_idxs_105 = arith.select %cond_100, %new_idxs_104, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_106 = arith.xori %new_idxs_67, %new_idxs_105 : tensor<8x16xi32> loc(#loc218) + %y_107 = tt.reshape %ret_103 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc181) + %ileft_108 = arith.muli %y_107, %ileft : tensor<64x2x1xi32> loc(#loc183) + %ileft_109 = "tt.reduce"(%ileft_108) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc220) + %ileft_110 = tt.expand_dims %ileft_109 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc185) + %ileft_111 = tt.broadcast %ileft_110 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc186) + %iright_112 = arith.muli %y_107, %iright : tensor<64x2x1xi32> loc(#loc187) + %iright_113 = "tt.reduce"(%iright_112) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc222) + %iright_114 = tt.expand_dims %iright_113 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc189) + %iright_115 = tt.broadcast %iright_114 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc190) + %ileft_116 = tt.reshape %ileft_111 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_117 = tt.reshape %iright_115 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_118 = tt.reshape %new_idxs_106 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc193) + %left_idx_119 = arith.muli %y_idx_118, %ileft : tensor<64x2x1xi32> loc(#loc195) + %left_idx_120 = "tt.reduce"(%left_idx_119) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc225) + %left_idx_121 = tt.expand_dims %left_idx_120 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc197) + %left_idx_122 = tt.broadcast %left_idx_121 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc198) + %right_idx_123 = arith.muli %y_idx_118, %iright : tensor<64x2x1xi32> loc(#loc200) + %right_idx_124 = "tt.reduce"(%right_idx_123) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc228) + %right_idx_125 = tt.expand_dims %right_idx_124 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc202) + %right_idx_126 = tt.broadcast %right_idx_125 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc203) + %left_idx_127 = tt.reshape %left_idx_122 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_128 = tt.reshape %right_idx_126 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_129 = arith.cmpi slt, %ileft_116, %iright_117 : tensor<8x16xi32> loc(#loc206) + %eq_130 = arith.cmpi eq, %ileft_116, %iright_117 : tensor<8x16xi32> loc(#loc207) + %cond_131 = arith.cmpi sgt, %left_idx_127, %right_idx_128 : tensor<8x16xi32> loc(#loc208) + %cond_132 = arith.andi %eq_130, %cond_131 : tensor<8x16xi1> loc(#loc209) + %cond_133 = arith.ori %cond_129, %cond_132 : tensor<8x16xi1> loc(#loc210) + %cond_134 = arith.extui %cond_133 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_135 = arith.xori %cond_134, %flip_69 : tensor<8x16xi32> loc(#loc211) + %cond_136 = arith.cmpi ne, %cond_135, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret_137 = arith.xori %ileft_116, %iright_117 : tensor<8x16xi32> loc(#loc213) + %ret_138 = arith.select %cond_136, %ret_137, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_139 = arith.xori %ret_103, %ret_138 : tensor<8x16xi32> loc(#loc215) + %new_idxs_140 = arith.xori %left_idx_127, %right_idx_128 : tensor<8x16xi32> loc(#loc216) + %new_idxs_141 = arith.select %cond_136, %new_idxs_140, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_142 = arith.xori %new_idxs_106, %new_idxs_141 : tensor<8x16xi32> loc(#loc218) + %flip_143 = tt.broadcast %flip_30 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc174) + %flip_144 = tt.reshape %flip_143 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc175) + %y_145 = tt.reshape %ret_139 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc181) + %ileft_146 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc183) + %ileft_147 = arith.muli %y_145, %ileft_146 : tensor<16x2x4xi32> loc(#loc183) + %ileft_148 = "tt.reduce"(%ileft_147) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc220) + %ileft_149 = tt.expand_dims %ileft_148 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc185) + %ileft_150 = tt.broadcast %ileft_149 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc186) + %iright_151 = arith.muli %y_145, %flip_68 : tensor<16x2x4xi32> loc(#loc187) + %iright_152 = "tt.reduce"(%iright_151) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc222) + %iright_153 = tt.expand_dims %iright_152 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc189) + %iright_154 = tt.broadcast %iright_153 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc190) + %ileft_155 = tt.reshape %ileft_150 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_156 = tt.reshape %iright_154 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_157 = tt.reshape %new_idxs_142 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc193) + %left_idx_158 = arith.muli %y_idx_157, %ileft_146 : tensor<16x2x4xi32> loc(#loc195) + %left_idx_159 = "tt.reduce"(%left_idx_158) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc225) + %left_idx_160 = tt.expand_dims %left_idx_159 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc197) + %left_idx_161 = tt.broadcast %left_idx_160 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc198) + %right_idx_162 = arith.muli %y_idx_157, %flip_68 : tensor<16x2x4xi32> loc(#loc200) + %right_idx_163 = "tt.reduce"(%right_idx_162) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc228) + %right_idx_164 = tt.expand_dims %right_idx_163 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc202) + %right_idx_165 = tt.broadcast %right_idx_164 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc203) + %left_idx_166 = tt.reshape %left_idx_161 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_167 = tt.reshape %right_idx_165 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_168 = arith.cmpi slt, %ileft_155, %iright_156 : tensor<8x16xi32> loc(#loc206) + %eq_169 = arith.cmpi eq, %ileft_155, %iright_156 : tensor<8x16xi32> loc(#loc207) + %cond_170 = arith.cmpi sgt, %left_idx_166, %right_idx_167 : tensor<8x16xi32> loc(#loc208) + %cond_171 = arith.andi %eq_169, %cond_170 : tensor<8x16xi1> loc(#loc209) + %cond_172 = arith.ori %cond_168, %cond_171 : tensor<8x16xi1> loc(#loc210) + %cond_173 = arith.extui %cond_172 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_174 = arith.xori %cond_173, %flip_144 : tensor<8x16xi32> loc(#loc211) + %cond_175 = arith.cmpi ne, %cond_174, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret_176 = arith.xori %ileft_155, %iright_156 : tensor<8x16xi32> loc(#loc213) + %ret_177 = arith.select %cond_175, %ret_176, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_178 = arith.xori %ret_139, %ret_177 : tensor<8x16xi32> loc(#loc215) + %new_idxs_179 = arith.xori %left_idx_166, %right_idx_167 : tensor<8x16xi32> loc(#loc216) + %new_idxs_180 = arith.select %cond_175, %new_idxs_179, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_181 = arith.xori %new_idxs_142, %new_idxs_180 : tensor<8x16xi32> loc(#loc218) + %y_182 = tt.reshape %ret_178 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc181) + %ileft_183 = arith.muli %y_182, %ileft_71 : tensor<32x2x2xi32> loc(#loc183) + %ileft_184 = "tt.reduce"(%ileft_183) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc220) + %ileft_185 = tt.expand_dims %ileft_184 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc185) + %ileft_186 = tt.broadcast %ileft_185 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc186) + %iright_187 = arith.muli %y_182, %flip_31 : tensor<32x2x2xi32> loc(#loc187) + %iright_188 = "tt.reduce"(%iright_187) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc222) + %iright_189 = tt.expand_dims %iright_188 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc189) + %iright_190 = tt.broadcast %iright_189 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc190) + %ileft_191 = tt.reshape %ileft_186 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_192 = tt.reshape %iright_190 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_193 = tt.reshape %new_idxs_181 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc193) + %left_idx_194 = arith.muli %y_idx_193, %ileft_71 : tensor<32x2x2xi32> loc(#loc195) + %left_idx_195 = "tt.reduce"(%left_idx_194) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc225) + %left_idx_196 = tt.expand_dims %left_idx_195 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc197) + %left_idx_197 = tt.broadcast %left_idx_196 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc198) + %right_idx_198 = arith.muli %y_idx_193, %flip_31 : tensor<32x2x2xi32> loc(#loc200) + %right_idx_199 = "tt.reduce"(%right_idx_198) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc228) + %right_idx_200 = tt.expand_dims %right_idx_199 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc202) + %right_idx_201 = tt.broadcast %right_idx_200 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc203) + %left_idx_202 = tt.reshape %left_idx_197 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_203 = tt.reshape %right_idx_201 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_204 = arith.cmpi slt, %ileft_191, %iright_192 : tensor<8x16xi32> loc(#loc206) + %eq_205 = arith.cmpi eq, %ileft_191, %iright_192 : tensor<8x16xi32> loc(#loc207) + %cond_206 = arith.cmpi sgt, %left_idx_202, %right_idx_203 : tensor<8x16xi32> loc(#loc208) + %cond_207 = arith.andi %eq_205, %cond_206 : tensor<8x16xi1> loc(#loc209) + %cond_208 = arith.ori %cond_204, %cond_207 : tensor<8x16xi1> loc(#loc210) + %cond_209 = arith.extui %cond_208 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_210 = arith.xori %cond_209, %flip_144 : tensor<8x16xi32> loc(#loc211) + %cond_211 = arith.cmpi ne, %cond_210, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret_212 = arith.xori %ileft_191, %iright_192 : tensor<8x16xi32> loc(#loc213) + %ret_213 = arith.select %cond_211, %ret_212, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_214 = arith.xori %ret_178, %ret_213 : tensor<8x16xi32> loc(#loc215) + %new_idxs_215 = arith.xori %left_idx_202, %right_idx_203 : tensor<8x16xi32> loc(#loc216) + %new_idxs_216 = arith.select %cond_211, %new_idxs_215, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_217 = arith.xori %new_idxs_181, %new_idxs_216 : tensor<8x16xi32> loc(#loc218) + %y_218 = tt.reshape %ret_214 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc181) + %ileft_219 = arith.muli %y_218, %ileft : tensor<64x2x1xi32> loc(#loc183) + %ileft_220 = "tt.reduce"(%ileft_219) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc220) + %ileft_221 = tt.expand_dims %ileft_220 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc185) + %ileft_222 = tt.broadcast %ileft_221 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc186) + %iright_223 = arith.muli %y_218, %iright : tensor<64x2x1xi32> loc(#loc187) + %iright_224 = "tt.reduce"(%iright_223) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc222) + %iright_225 = tt.expand_dims %iright_224 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc189) + %iright_226 = tt.broadcast %iright_225 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc190) + %ileft_227 = tt.reshape %ileft_222 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_228 = tt.reshape %iright_226 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_229 = tt.reshape %new_idxs_217 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc193) + %left_idx_230 = arith.muli %y_idx_229, %ileft : tensor<64x2x1xi32> loc(#loc195) + %left_idx_231 = "tt.reduce"(%left_idx_230) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc225) + %left_idx_232 = tt.expand_dims %left_idx_231 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc197) + %left_idx_233 = tt.broadcast %left_idx_232 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc198) + %right_idx_234 = arith.muli %y_idx_229, %iright : tensor<64x2x1xi32> loc(#loc200) + %right_idx_235 = "tt.reduce"(%right_idx_234) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc228) + %right_idx_236 = tt.expand_dims %right_idx_235 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc202) + %right_idx_237 = tt.broadcast %right_idx_236 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc203) + %left_idx_238 = tt.reshape %left_idx_233 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_239 = tt.reshape %right_idx_237 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_240 = arith.cmpi slt, %ileft_227, %iright_228 : tensor<8x16xi32> loc(#loc206) + %eq_241 = arith.cmpi eq, %ileft_227, %iright_228 : tensor<8x16xi32> loc(#loc207) + %cond_242 = arith.cmpi sgt, %left_idx_238, %right_idx_239 : tensor<8x16xi32> loc(#loc208) + %cond_243 = arith.andi %eq_241, %cond_242 : tensor<8x16xi1> loc(#loc209) + %cond_244 = arith.ori %cond_240, %cond_243 : tensor<8x16xi1> loc(#loc210) + %cond_245 = arith.extui %cond_244 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc211) + %cond_246 = arith.xori %cond_245, %flip_144 : tensor<8x16xi32> loc(#loc211) + %cond_247 = arith.cmpi ne, %cond_246, %cst_0 : tensor<8x16xi32> loc(#loc212) + %ret_248 = arith.xori %ileft_227, %iright_228 : tensor<8x16xi32> loc(#loc213) + %ret_249 = arith.select %cond_247, %ret_248, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_250 = arith.xori %ret_214, %ret_249 : tensor<8x16xi32> loc(#loc215) + %new_idxs_251 = arith.xori %left_idx_238, %right_idx_239 : tensor<8x16xi32> loc(#loc216) + %new_idxs_252 = arith.select %cond_247, %new_idxs_251, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_253 = arith.xori %new_idxs_217, %new_idxs_252 : tensor<8x16xi32> loc(#loc218) + %y_254 = tt.reshape %ret_250 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc181) + %ileft_255 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc183) + %ileft_256 = arith.muli %y_254, %ileft_255 : tensor<8x2x8xi32> loc(#loc183) + %ileft_257 = "tt.reduce"(%ileft_256) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc220) + %ileft_258 = tt.expand_dims %ileft_257 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc185) + %ileft_259 = tt.broadcast %ileft_258 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc186) + %iright_260 = arith.muli %y_254, %flip_143 : tensor<8x2x8xi32> loc(#loc187) + %iright_261 = "tt.reduce"(%iright_260) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc222) + %iright_262 = tt.expand_dims %iright_261 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc189) + %iright_263 = tt.broadcast %iright_262 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc190) + %ileft_264 = tt.reshape %ileft_259 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_265 = tt.reshape %iright_263 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_266 = tt.reshape %new_idxs_253 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc193) + %left_idx_267 = arith.muli %y_idx_266, %ileft_255 : tensor<8x2x8xi32> loc(#loc195) + %left_idx_268 = "tt.reduce"(%left_idx_267) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc225) + %left_idx_269 = tt.expand_dims %left_idx_268 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc197) + %left_idx_270 = tt.broadcast %left_idx_269 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc198) + %right_idx_271 = arith.muli %y_idx_266, %flip_143 : tensor<8x2x8xi32> loc(#loc200) + %right_idx_272 = "tt.reduce"(%right_idx_271) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc228) + %right_idx_273 = tt.expand_dims %right_idx_272 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc202) + %right_idx_274 = tt.broadcast %right_idx_273 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc203) + %left_idx_275 = tt.reshape %left_idx_270 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_276 = tt.reshape %right_idx_274 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_277 = arith.cmpi slt, %ileft_264, %iright_265 : tensor<8x16xi32> loc(#loc206) + %eq_278 = arith.cmpi eq, %ileft_264, %iright_265 : tensor<8x16xi32> loc(#loc207) + %cond_279 = arith.cmpi sgt, %left_idx_275, %right_idx_276 : tensor<8x16xi32> loc(#loc208) + %cond_280 = arith.andi %eq_278, %cond_279 : tensor<8x16xi1> loc(#loc209) + %cond_281 = arith.ori %cond_277, %cond_280 : tensor<8x16xi1> loc(#loc210) + %ret_282 = arith.xori %ileft_264, %iright_265 : tensor<8x16xi32> loc(#loc213) + %ret_283 = arith.select %cond_281, %ret_282, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_284 = arith.xori %ret_250, %ret_283 : tensor<8x16xi32> loc(#loc215) + %new_idxs_285 = arith.xori %left_idx_275, %right_idx_276 : tensor<8x16xi32> loc(#loc216) + %new_idxs_286 = arith.select %cond_281, %new_idxs_285, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_287 = arith.xori %new_idxs_253, %new_idxs_286 : tensor<8x16xi32> loc(#loc218) + %y_288 = tt.reshape %ret_284 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc181) + %ileft_289 = arith.muli %y_288, %ileft_146 : tensor<16x2x4xi32> loc(#loc183) + %ileft_290 = "tt.reduce"(%ileft_289) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc220) + %ileft_291 = tt.expand_dims %ileft_290 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc185) + %ileft_292 = tt.broadcast %ileft_291 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc186) + %iright_293 = arith.muli %y_288, %flip_68 : tensor<16x2x4xi32> loc(#loc187) + %iright_294 = "tt.reduce"(%iright_293) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc222) + %iright_295 = tt.expand_dims %iright_294 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc189) + %iright_296 = tt.broadcast %iright_295 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc190) + %ileft_297 = tt.reshape %ileft_292 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_298 = tt.reshape %iright_296 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_299 = tt.reshape %new_idxs_287 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc193) + %left_idx_300 = arith.muli %y_idx_299, %ileft_146 : tensor<16x2x4xi32> loc(#loc195) + %left_idx_301 = "tt.reduce"(%left_idx_300) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc225) + %left_idx_302 = tt.expand_dims %left_idx_301 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc197) + %left_idx_303 = tt.broadcast %left_idx_302 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc198) + %right_idx_304 = arith.muli %y_idx_299, %flip_68 : tensor<16x2x4xi32> loc(#loc200) + %right_idx_305 = "tt.reduce"(%right_idx_304) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc228) + %right_idx_306 = tt.expand_dims %right_idx_305 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc202) + %right_idx_307 = tt.broadcast %right_idx_306 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc203) + %left_idx_308 = tt.reshape %left_idx_303 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_309 = tt.reshape %right_idx_307 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_310 = arith.cmpi slt, %ileft_297, %iright_298 : tensor<8x16xi32> loc(#loc206) + %eq_311 = arith.cmpi eq, %ileft_297, %iright_298 : tensor<8x16xi32> loc(#loc207) + %cond_312 = arith.cmpi sgt, %left_idx_308, %right_idx_309 : tensor<8x16xi32> loc(#loc208) + %cond_313 = arith.andi %eq_311, %cond_312 : tensor<8x16xi1> loc(#loc209) + %cond_314 = arith.ori %cond_310, %cond_313 : tensor<8x16xi1> loc(#loc210) + %ret_315 = arith.xori %ileft_297, %iright_298 : tensor<8x16xi32> loc(#loc213) + %ret_316 = arith.select %cond_314, %ret_315, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_317 = arith.xori %ret_284, %ret_316 : tensor<8x16xi32> loc(#loc215) + %new_idxs_318 = arith.xori %left_idx_308, %right_idx_309 : tensor<8x16xi32> loc(#loc216) + %new_idxs_319 = arith.select %cond_314, %new_idxs_318, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_320 = arith.xori %new_idxs_287, %new_idxs_319 : tensor<8x16xi32> loc(#loc218) + %y_321 = tt.reshape %ret_317 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc181) + %ileft_322 = arith.muli %y_321, %ileft_71 : tensor<32x2x2xi32> loc(#loc183) + %ileft_323 = "tt.reduce"(%ileft_322) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc220) + %ileft_324 = tt.expand_dims %ileft_323 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc185) + %ileft_325 = tt.broadcast %ileft_324 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc186) + %iright_326 = arith.muli %y_321, %flip_31 : tensor<32x2x2xi32> loc(#loc187) + %iright_327 = "tt.reduce"(%iright_326) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc222) + %iright_328 = tt.expand_dims %iright_327 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc189) + %iright_329 = tt.broadcast %iright_328 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc190) + %ileft_330 = tt.reshape %ileft_325 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_331 = tt.reshape %iright_329 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_332 = tt.reshape %new_idxs_320 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc193) + %left_idx_333 = arith.muli %y_idx_332, %ileft_71 : tensor<32x2x2xi32> loc(#loc195) + %left_idx_334 = "tt.reduce"(%left_idx_333) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc225) + %left_idx_335 = tt.expand_dims %left_idx_334 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc197) + %left_idx_336 = tt.broadcast %left_idx_335 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc198) + %right_idx_337 = arith.muli %y_idx_332, %flip_31 : tensor<32x2x2xi32> loc(#loc200) + %right_idx_338 = "tt.reduce"(%right_idx_337) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc228) + %right_idx_339 = tt.expand_dims %right_idx_338 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc202) + %right_idx_340 = tt.broadcast %right_idx_339 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc203) + %left_idx_341 = tt.reshape %left_idx_336 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_342 = tt.reshape %right_idx_340 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_343 = arith.cmpi slt, %ileft_330, %iright_331 : tensor<8x16xi32> loc(#loc206) + %eq_344 = arith.cmpi eq, %ileft_330, %iright_331 : tensor<8x16xi32> loc(#loc207) + %cond_345 = arith.cmpi sgt, %left_idx_341, %right_idx_342 : tensor<8x16xi32> loc(#loc208) + %cond_346 = arith.andi %eq_344, %cond_345 : tensor<8x16xi1> loc(#loc209) + %cond_347 = arith.ori %cond_343, %cond_346 : tensor<8x16xi1> loc(#loc210) + %ret_348 = arith.xori %ileft_330, %iright_331 : tensor<8x16xi32> loc(#loc213) + %ret_349 = arith.select %cond_347, %ret_348, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc214) + %ret_350 = arith.xori %ret_317, %ret_349 : tensor<8x16xi32> loc(#loc215) + %new_idxs_351 = arith.xori %left_idx_341, %right_idx_342 : tensor<8x16xi32> loc(#loc216) + %new_idxs_352 = arith.select %cond_347, %new_idxs_351, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_353 = arith.xori %new_idxs_320, %new_idxs_352 : tensor<8x16xi32> loc(#loc218) + %y_354 = tt.reshape %ret_350 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc181) + %ileft_355 = arith.muli %y_354, %ileft : tensor<64x2x1xi32> loc(#loc183) + %ileft_356 = "tt.reduce"(%ileft_355) <{axis = 1 : i32}> ({ + ^bb0(%ileft_386: i32 loc(callsite(#loc1 at #loc184)), %ileft_387: i32 loc(callsite(#loc1 at #loc184))): + %ileft_388 = arith.addi %ileft_386, %ileft_387 : i32 loc(#loc230) + tt.reduce.return %ileft_388 : i32 loc(#loc220) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc220) + %ileft_357 = tt.expand_dims %ileft_356 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc185) + %ileft_358 = tt.broadcast %ileft_357 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc186) + %iright_359 = arith.muli %y_354, %iright : tensor<64x2x1xi32> loc(#loc187) + %iright_360 = "tt.reduce"(%iright_359) <{axis = 1 : i32}> ({ + ^bb0(%iright_386: i32 loc(callsite(#loc1 at #loc188)), %iright_387: i32 loc(callsite(#loc1 at #loc188))): + %iright_388 = arith.addi %iright_386, %iright_387 : i32 loc(#loc231) + tt.reduce.return %iright_388 : i32 loc(#loc222) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc222) + %iright_361 = tt.expand_dims %iright_360 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc189) + %iright_362 = tt.broadcast %iright_361 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc190) + %ileft_363 = tt.reshape %ileft_358 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc191) + %iright_364 = tt.reshape %iright_362 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc192) + %y_idx_365 = tt.reshape %new_idxs_353 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc193) + %left_idx_366 = arith.muli %y_idx_365, %ileft : tensor<64x2x1xi32> loc(#loc195) + %left_idx_367 = "tt.reduce"(%left_idx_366) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_386: i32 loc(callsite(#loc1 at #loc196)), %left_idx_387: i32 loc(callsite(#loc1 at #loc196))): + %left_idx_388 = arith.addi %left_idx_386, %left_idx_387 : i32 loc(#loc232) + tt.reduce.return %left_idx_388 : i32 loc(#loc225) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc225) + %left_idx_368 = tt.expand_dims %left_idx_367 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc197) + %left_idx_369 = tt.broadcast %left_idx_368 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc198) + %right_idx_370 = arith.muli %y_idx_365, %iright : tensor<64x2x1xi32> loc(#loc200) + %right_idx_371 = "tt.reduce"(%right_idx_370) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_386: i32 loc(callsite(#loc1 at #loc201)), %right_idx_387: i32 loc(callsite(#loc1 at #loc201))): + %right_idx_388 = arith.addi %right_idx_386, %right_idx_387 : i32 loc(#loc233) + tt.reduce.return %right_idx_388 : i32 loc(#loc228) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc228) + %right_idx_372 = tt.expand_dims %right_idx_371 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc202) + %right_idx_373 = tt.broadcast %right_idx_372 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc203) + %left_idx_374 = tt.reshape %left_idx_369 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc204) + %right_idx_375 = tt.reshape %right_idx_373 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc205) + %cond_376 = arith.cmpi slt, %ileft_363, %iright_364 : tensor<8x16xi32> loc(#loc206) + %eq_377 = arith.cmpi eq, %ileft_363, %iright_364 : tensor<8x16xi32> loc(#loc207) + %cond_378 = arith.cmpi sgt, %left_idx_374, %right_idx_375 : tensor<8x16xi32> loc(#loc208) + %cond_379 = arith.andi %eq_377, %cond_378 : tensor<8x16xi1> loc(#loc209) + %cond_380 = arith.ori %cond_376, %cond_379 : tensor<8x16xi1> loc(#loc210) + %new_idxs_381 = arith.xori %left_idx_374, %right_idx_375 : tensor<8x16xi32> loc(#loc216) + %new_idxs_382 = arith.select %cond_380, %new_idxs_381, %cst_0 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc217) + %new_idxs_383 = arith.xori %new_idxs_353, %new_idxs_382 : tensor<8x16xi32> loc(#loc218) + %tmp7 = arith.extsi %tmp0_28 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc167) + %tmp10_384 = arith.select %tmp0_27, %tmp7, %tmp10 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc100) + %tmp11 = "tt.reduce"(%tmp10_384) <{axis = 1 : i32}> ({ + ^bb0(%tmp11_386: i64 loc(callsite(#loc1 at #loc168)), %tmp11_387: i64 loc(callsite(#loc1 at #loc168))): + %tmp11_388 = arith.addi %tmp11_386, %tmp11_387 : i64 loc(#loc219) + tt.reduce.return %tmp11_388 : i64 loc(#loc179) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc179) + %tmp11_385 = tt.expand_dims %tmp11 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc169) + %tmp14 = arith.trunci %tmp11_385 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc170) + %0 = arith.muli %x0_9, %cst_1 : tensor<8x1xi64> loc(#loc77) + %1 = tt.broadcast %0 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc78) + %2 = arith.addi %tmp0_10, %1 : tensor<8x16xi64> loc(#loc78) + %3 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc79) + %4 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc80) + %5 = arith.extui %4 : i1 to i64 loc(#loc81) + %6 = arith.muli %ks0, %5 : i64 loc(#loc81) + %7 = arith.extui %3 : i1 to i64 loc(#loc171) + %8 = arith.addi %7, %6 : i64 loc(#loc82) + %9 = tt.splat %8 : i64 -> tensor<8x1xi64> loc(#loc84) + %10 = arith.muli %tmp0_13, %9 : tensor<8x1xi64> loc(#loc84) + %11 = tt.broadcast %10 : tensor<8x1xi64> -> tensor<8x16xi64> loc(#loc85) + %12 = arith.addi %2, %11 : tensor<8x16xi64> loc(#loc85) + %13 = tt.splat %out_ptr2 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc86) + %14 = tt.addptr %13, %12 : tensor<8x16x!tt.ptr>, tensor<8x16xi64> loc(#loc86) + tt.store %14, %new_idxs_383, %tmp0_27 : tensor<8x16x!tt.ptr> loc(#loc87) + %15 = arith.muli %x1, %9 : tensor<8x1xi64> loc(#loc88) + %16 = arith.addi %x0_9, %15 : tensor<8x1xi64> loc(#loc89) + %17 = tt.splat %out_ptr3 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc90) + %18 = tt.addptr %17, %16 : tensor<8x1x!tt.ptr>, tensor<8x1xi64> loc(#loc90) + tt.store %18, %tmp14, %xmask_6 : tensor<8x1x!tt.ptr> loc(#loc91) + tt.return loc(#loc92) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":43:34) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":25:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":26:38) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":32:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":33:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:45) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:42) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:54) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:50) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:64) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:68) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:61) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:30) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":35:73) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":37:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":39:33) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":41:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":44:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":47:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:32) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:62) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:88) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:79) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:70) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:54) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:40) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":48:102) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:34) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:30) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:89) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/we/cwe54qdzud6xskhvjsubzqbviobtofi7rtla3cz3fvufmekfy4qf.py":49:4) +#loc99 = loc(callsite(#loc1 at #loc2)) +#loc100 = loc("tmp10"(#loc3)) +#loc101 = loc("xoffset"(#loc4)) +#loc102 = loc("xoffset"(#loc5)) +#loc103 = loc("xindex"(#loc6)) +#loc104 = loc("xindex"(#loc7)) +#loc105 = loc("xindex"(#loc8)) +#loc106 = loc("xmask"(#loc9)) +#loc107 = loc("r0_index"(#loc10)) +#loc108 = loc("r0_index"(#loc11)) +#loc109 = loc("x0"(#loc12)) +#loc110 = loc("x1"(#loc13)) +#loc111 = loc("tmp0"(#loc14)) +#loc112 = loc("tmp0"(#loc15)) +#loc113 = loc("tmp0"(#loc16)) +#loc114 = loc("tmp0"(#loc17)) +#loc115 = loc("tmp0"(#loc18)) +#loc116 = loc("tmp0"(#loc19)) +#loc117 = loc("tmp0"(#loc20)) +#loc118 = loc("tmp0"(#loc21)) +#loc119 = loc("tmp0"(#loc22)) +#loc120 = loc("tmp0"(#loc23)) +#loc121 = loc("tmp2"(#loc24)) +#loc122 = loc("tmp4"(#loc25)) +#loc123 = loc("flip"(#loc26)) +#loc125 = loc("flip"(#loc28)) +#loc126 = loc("flip"(#loc29)) +#loc127 = loc("flip"(#loc30)) +#loc128 = loc("y"(#loc31)) +#loc129 = loc("left_mask"(#loc33)) +#loc130 = loc("ileft"(#loc34)) +#loc132 = loc("ileft"(#loc38)) +#loc133 = loc("ileft"(#loc39)) +#loc134 = loc("iright"(#loc40)) +#loc136 = loc("iright"(#loc42)) +#loc137 = loc("iright"(#loc43)) +#loc138 = loc("ileft"(#loc44)) +#loc139 = loc("iright"(#loc45)) +#loc140 = loc("y_idx"(#loc46)) +#loc141 = loc("left_idx"(#loc47)) +#loc142 = loc("left_idx"(#loc48)) +#loc143 = loc("input"(#loc49)) +#loc145 = loc("left_idx"(#loc51)) +#loc146 = loc("left_idx"(#loc52)) +#loc147 = loc("right_idx"(#loc53)) +#loc148 = loc("right_idx"(#loc54)) +#loc150 = loc("right_idx"(#loc56)) +#loc151 = loc("right_idx"(#loc57)) +#loc152 = loc("left_idx"(#loc58)) +#loc153 = loc("right_idx"(#loc59)) +#loc154 = loc("cond"(#loc60)) +#loc155 = loc("eq"(#loc61)) +#loc156 = loc("cond"(#loc62)) +#loc157 = loc("cond"(#loc63)) +#loc158 = loc("cond"(#loc64)) +#loc159 = loc("cond"(#loc65)) +#loc160 = loc("cond"(#loc66)) +#loc161 = loc("ret"(#loc67)) +#loc162 = loc("ret"(#loc68)) +#loc163 = loc("ret"(#loc69)) +#loc164 = loc("new_idxs"(#loc70)) +#loc165 = loc("new_idxs"(#loc71)) +#loc166 = loc("new_idxs"(#loc72)) +#loc167 = loc("tmp7"(#loc73)) +#loc169 = loc("tmp11"(#loc75)) +#loc170 = loc("tmp14"(#loc76)) +#loc171 = loc(fused[#loc82, #loc83]) +#loc172 = loc(callsite(#loc123 at #loc124)) +#loc173 = loc(callsite(#loc125 at #loc124)) +#loc174 = loc(callsite(#loc126 at #loc124)) +#loc175 = loc(callsite(#loc127 at #loc124)) +#loc177 = loc("cond"(#loc154)) +#loc178 = loc("eq"(#loc155)) +#loc179 = loc(callsite(#loc35 at #loc168)) +#loc181 = loc(callsite(#loc128 at #loc176)) +#loc182 = loc(callsite(#loc129 at #loc176)) +#loc183 = loc(callsite(#loc130 at #loc176)) +#loc185 = loc(callsite(#loc132 at #loc176)) +#loc186 = loc(callsite(#loc133 at #loc176)) +#loc187 = loc(callsite(#loc134 at #loc176)) +#loc189 = loc(callsite(#loc136 at #loc176)) +#loc190 = loc(callsite(#loc137 at #loc176)) +#loc191 = loc(callsite(#loc138 at #loc176)) +#loc192 = loc(callsite(#loc139 at #loc176)) +#loc193 = loc(callsite(#loc140 at #loc176)) +#loc194 = loc(callsite(#loc141 at #loc176)) +#loc195 = loc(callsite(#loc142 at #loc176)) +#loc197 = loc(callsite(#loc145 at #loc176)) +#loc198 = loc(callsite(#loc146 at #loc176)) +#loc199 = loc(callsite(#loc147 at #loc176)) +#loc200 = loc(callsite(#loc148 at #loc176)) +#loc202 = loc(callsite(#loc150 at #loc176)) +#loc203 = loc(callsite(#loc151 at #loc176)) +#loc204 = loc(callsite(#loc152 at #loc176)) +#loc205 = loc(callsite(#loc153 at #loc176)) +#loc206 = loc(callsite(#loc177 at #loc176)) +#loc207 = loc(callsite(#loc178 at #loc176)) +#loc208 = loc(callsite(#loc156 at #loc176)) +#loc209 = loc(callsite(#loc157 at #loc176)) +#loc210 = loc(callsite(#loc158 at #loc176)) +#loc211 = loc(callsite(#loc159 at #loc176)) +#loc212 = loc(callsite(#loc160 at #loc176)) +#loc213 = loc(callsite(#loc161 at #loc176)) +#loc214 = loc(callsite(#loc162 at #loc176)) +#loc215 = loc(callsite(#loc163 at #loc176)) +#loc216 = loc(callsite(#loc164 at #loc176)) +#loc217 = loc(callsite(#loc165 at #loc176)) +#loc218 = loc(callsite(#loc166 at #loc176)) +#loc219 = loc(callsite(#loc37 at #loc179)) +#loc220 = loc(callsite(#loc35 at #loc184)) +#loc222 = loc(callsite(#loc35 at #loc188)) +#loc224 = loc(callsite(#loc143 at #loc196)) +#loc225 = loc(callsite(#loc35 at #loc196)) +#loc227 = loc(callsite(#loc143 at #loc201)) +#loc228 = loc(callsite(#loc35 at #loc201)) +#loc230 = loc(callsite(#loc37 at #loc220)) +#loc231 = loc(callsite(#loc37 at #loc222)) +#loc232 = loc(callsite(#loc37 at #loc225)) +#loc233 = loc(callsite(#loc37 at #loc228)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json new file mode 100644 index 0000000000000000000000000000000000000000..6556f0779fdf9e8ee918d709d0820b06ef34a240 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/__grp__triton_red_fused_argmax_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_argmax_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source", "triton_red_fused_argmax_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttir", "triton_red_fused_argmax_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ttgir", "triton_red_fused_argmax_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.llir", "triton_red_fused_argmax_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.ptx", "triton_red_fused_argmax_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin", "triton_red_fused_argmax_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..7d7d6fdc7deab4901e1a5d5752b8b526432dc1c6 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source new file mode 100644 index 0000000000000000000000000000000000000000..d0ca41f7e0e35a3336ac1194c51ef55693d5b1b8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/YHAVDQXMEVV7S4RZ3RZ2CHWHFBN2O3IAF5U3VLSP72AQHADF3BWQ/triton_red_fused_argmax_1.source @@ -0,0 +1,309 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":18:0) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":143:0) +#loc45 = loc(unknown) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":164:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("xnumel"(#loc)) +#loc73 = loc("r0_numel"(#loc)) +#loc100 = loc("a_value"(#loc33)) +#loc101 = loc("a_index"(#loc33)) +#loc102 = loc("b_value"(#loc33)) +#loc103 = loc("b_index"(#loc33)) +#loc116 = loc("x"(#loc53)) +#loc117 = loc("x"(#loc57)) +#loc118 = loc("value"(#loc66)) +#loc119 = loc("index"(#loc66)) +module { + tt.func public @triton_red_fused_argmax_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc74) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_2 = arith.constant 1 : i32 loc(#loc77) + %xoffset_3 = arith.constant 1 : i32 loc(#loc77) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc77) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc78) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc79) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc80) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc80) + %xmask = arith.constant true loc(#loc81) + %xmask_8 = arith.constant dense : tensor<1x2048xi1> loc(#loc81) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc82) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc83) + %x0 = arith.constant 2048 : i32 loc(#loc84) + %x0_10 = arith.constant 2048 : i32 loc(#loc84) + %x0_11 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc84) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<1x1xi32> loc(#loc84) + %x1 = arith.constant 2048 : i32 loc(#loc85) + %x1_13 = arith.constant 2048 : i32 loc(#loc85) + %x1_14 = arith.constant dense<2048> : tensor<1x1xi32> loc(#loc85) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<1x1xi32> loc(#loc85) + %_tmp2 = arith.constant 0xFF800000 : f32 loc(#loc86) + %_tmp2_16 = arith.constant dense<0xFF800000> : tensor<1x2048xf32> loc(#loc86) + %_tmp2_index = arith.constant 2147483647 : i32 loc(#loc87) + %_tmp2_index_17 = arith.constant dense<2147483647> : tensor<1x2048xi32> loc(#loc87) + %c0_i32 = arith.constant 0 : i32 loc(#loc15) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc15) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc15) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc15) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc15) + %3 = ub.poison : i32 loc(#loc15) + %_tmp2_index_18:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp2_19 = %_tmp2_16, %_tmp2_index_20 = %_tmp2_index_17) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc89) + %r0_index_21 = arith.addi %r0_index, %r0_base_9 : tensor<1x2048xi32> loc(#loc89) + %r0_mask = arith.constant dense<32000> : tensor<1x2048xi32> loc(#loc90) + %r0_mask_22 = arith.cmpi slt, %r0_index_21, %r0_mask : tensor<1x2048xi32> loc(#loc90) + %tmp0 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_23 = arith.constant 32000 : i32 loc(#loc91) + %tmp0_24 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc91) + %tmp0_25 = arith.muli %tmp0_24, %x0_12 : tensor<1x1xi32> loc(#loc91) + %tmp0_26 = tt.broadcast %tmp0_25 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc92) + %tmp0_27 = arith.addi %r0_index_21, %tmp0_26 : tensor<1x2048xi32> loc(#loc92) + %tmp0_28 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_29 = arith.constant 65760000 : i32 loc(#loc93) + %tmp0_30 = arith.constant dense<65760000> : tensor<1x1xi32> loc(#loc93) + %tmp0_31 = arith.muli %tmp0_30, %x1_15 : tensor<1x1xi32> loc(#loc93) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<1x1xi32> -> tensor<1x2048xi32> loc(#loc94) + %tmp0_33 = arith.addi %tmp0_27, %tmp0_32 : tensor<1x2048xi32> loc(#loc94) + %tmp0_34 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc95) + %tmp0_35 = tt.addptr %tmp0_34, %tmp0_33 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi32> loc(#loc95) + %tmp0_36 = arith.constant 0.000000e+00 : f32 loc(#loc96) + %tmp0_37 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc96) + %tmp0_38 = tt.load %tmp0_35, %r0_mask_22, %tmp0_37 evictionPolicy = evict_first : tensor<1x2048x!tt.ptr> loc(#loc96) + %8:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%_tmp2_19, %_tmp2_index_20, %tmp0_38, %r0_index_21) : (tensor<1x2048xf32>, tensor<1x2048xi32>, tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) loc(#loc24) + %_tmp2_39 = arith.select %r0_mask_22, %8#0, %_tmp2_19 : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc97) + %_tmp2_index_40 = arith.select %r0_mask_22, %8#1, %_tmp2_index_20 : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc98) + scf.yield %_tmp2_39, %_tmp2_index_40 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc27) + } loc(#loc120) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%_tmp2_index_18#0, %_tmp2_index_18#1) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc28) + %tmp2 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc99) + %5 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc30) + %6 = tt.addptr %5, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc30) + %7 = arith.extsi %tmp2 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc31) + tt.store %6, %7 : tensor<1x1x!tt.ptr> loc(#loc31) + tt.return loc(#loc32) + } loc(#loc) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32S1_2048S_i32S1_2048S_fp32S1_2048S_i32S1_2048S__(%a_value: tensor<1x2048xf32> loc("a_value"(#loc33)), %a_index: tensor<1x2048xi32> loc("a_index"(#loc33)), %b_value: tensor<1x2048xf32> loc("b_value"(#loc33)), %b_index: tensor<1x2048xi32> loc("b_index"(#loc33))) -> (tensor<1x2048xf32>, tensor<1x2048xi32>) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : tensor<1x2048xf32> loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : tensor<1x2048xf32> loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%a_value) : (tensor<1x2048xf32>) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (tensor<1x2048xi1>, tensor<1x2048xi1>) { + %a_isnan = arith.cmpf une, %a_value, %a_value : tensor<1x2048xf32> loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : tensor<1x2048xf32> loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.constant dense : tensor<1x2048xi1> loc(#loc108) + %mask_5 = arith.xori %b_isnan, %mask_4 : tensor<1x2048xi1> loc(#loc108) + %mask_6 = arith.andi %a_isnan, %mask_5 : tensor<1x2048xi1> loc(#loc109) + %mask_7 = arith.ori %mask, %mask_6 : tensor<1x2048xi1> loc(#loc123) + %equal_8 = arith.andi %a_isnan, %b_isnan : tensor<1x2048xi1> loc(#loc111) + %equal_9 = arith.ori %equal, %equal_8 : tensor<1x2048xi1> loc(#loc124) + scf.yield %mask_7, %equal_9 : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc124) + } else { + scf.yield %mask, %equal : tensor<1x2048xi1>, tensor<1x2048xi1> loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : tensor<1x2048xi32> loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : tensor<1x2048xi1> loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : tensor<1x2048xi1> loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : tensor<1x2048xi1>, tensor<1x2048xf32> loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : tensor<1x2048xi1>, tensor<1x2048xi32> loc(#loc50) + tt.return %2, %3 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x2048xf32> loc(#loc52) + %5 = ub.poison : tensor<1x2048xi32> loc(#loc52) + tt.return %4, %5 : tensor<1x2048xf32>, tensor<1x2048xi32> loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x) : (tensor<1x2048xf32>) -> tensor<1x2048xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_2048S__(%x: tensor<1x2048xf32> loc("x"(#loc57))) -> tensor<1x2048xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc59) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x2048xf32> loc(#loc59) + %4 = arith.addf %x, %3 : tensor<1x2048xf32> loc(#loc59) + tt.return %4 : tensor<1x2048xf32> loc(#loc60) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x2048xf32> loc(#loc61) + tt.return %5 : tensor<1x2048xf32> loc(#loc61) + } loc(#loc57) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc63) + %cst = arith.constant dense : tensor<1xi1> loc(#loc63) + tt.return %cst : tensor<1xi1> loc(#loc64) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc65) + tt.return %0 : tensor<1xi1> loc(#loc65) + } loc(#loc62) + tt.func private @"torch._inductor.runtime.triton_helpers.max_with_index__fp32S1_2048S_i32S1_2048S__(2,)cconstexpr_1_"(%value: tensor<1x2048xf32> loc("value"(#loc66)), %index: tensor<1x2048xi32> loc("index"(#loc66))) -> (tensor<1xf32>, tensor<1xi32>) attributes {noinline = false} { + %0:2 = "tt.reduce"(%value, %index) <{axis = 1 : i32}> ({ + ^bb0(%arg2: f32 loc(unknown), %arg3: i32 loc(unknown), %arg4: f32 loc(unknown), %arg5: i32 loc(unknown)): + %3:2 = tt.call @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%arg2, %arg3, %arg4, %arg5) : (f32, i32, f32, i32) -> (f32, i32) loc(#loc67) + tt.reduce.return %3#0, %3#1 : f32, i32 loc(#loc67) + }) : (tensor<1x2048xf32>, tensor<1x2048xi32>) -> (tensor<1xf32>, tensor<1xi32>) loc(#loc67) + tt.return %0#0, %0#1 : tensor<1xf32>, tensor<1xi32> loc(#loc68) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc69) + %2 = ub.poison : tensor<1xi32> loc(#loc69) + tt.return %1, %2 : tensor<1xf32>, tensor<1xi32> loc(#loc69) + } loc(#loc66) + tt.func private @torch._inductor.runtime.triton_helpers.maximum_with_index__fp32_i32_fp32_i32__(%a_value: f32 loc("a_value"(#loc33)), %a_index: i32 loc("a_index"(#loc33)), %b_value: f32 loc("b_value"(#loc33)), %b_index: i32 loc("b_index"(#loc33))) -> (f32, i32) attributes {noinline = false} { + %mask = arith.cmpf ogt, %a_value, %b_value : f32 loc(#loc121) + %equal = arith.cmpf oeq, %a_value, %b_value : f32 loc(#loc122) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a_value) : (f32) -> i1 loc(#loc36) + %1:2 = scf.if %0 -> (i1, i1) { + %a_isnan = arith.cmpf une, %a_value, %a_value : f32 loc(#loc106) + %b_isnan = arith.cmpf une, %b_value, %b_value : f32 loc(#loc107) + %mask_3 = arith.constant true loc(#loc108) + %mask_4 = arith.xori %b_isnan, %mask_3 : i1 loc(#loc108) + %mask_5 = arith.andi %a_isnan, %mask_4 : i1 loc(#loc109) + %mask_6 = arith.ori %mask, %mask_5 : i1 loc(#loc123) + %equal_7 = arith.andi %a_isnan, %b_isnan : i1 loc(#loc111) + %equal_8 = arith.ori %equal, %equal_7 : i1 loc(#loc124) + scf.yield %mask_6, %equal_8 : i1, i1 loc(#loc124) + } else { + scf.yield %mask, %equal : i1, i1 loc(#loc45) + } loc(#loc37) + %mask_0 = arith.cmpi slt, %a_index, %b_index : i32 loc(#loc113) + %mask_1 = arith.andi %1#1, %mask_0 : i1 loc(#loc114) + %mask_2 = arith.ori %1#0, %mask_1 : i1 loc(#loc115) + %2 = arith.select %mask_2, %a_value, %b_value : f32 loc(#loc49) + %3 = arith.select %mask_2, %a_index, %b_index : i32 loc(#loc50) + tt.return %2, %3 : f32, i32 loc(#loc51) + ^bb1: // no predecessors + %4 = ub.poison : f32 loc(#loc52) + %5 = ub.poison : i32 loc(#loc52) + tt.return %4, %5 : f32, i32 loc(#loc52) + } loc(#loc33) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc53))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc54) + %true = arith.constant true loc(#loc55) + tt.return %true : i1 loc(#loc55) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc56) + tt.return %1 : i1 loc(#loc56) + } loc(#loc53) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc57))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc58) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc59) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc59) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc59) + tt.return %3 : tensor<1xf32> loc(#loc60) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc61) + tt.return %4 : tensor<1xf32> loc(#loc61) + } loc(#loc57) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":29:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":30:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":31:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":33:40) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":34:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":35:29) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:47) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:61) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:52) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":39:66) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":42:38) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":44:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:58) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":45:8) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":46:75) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":47:20) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:25) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:36) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/e5/ce5roatsezodqf5qpd32hkspt4fcfiqo7jobom7pzhsftuk27gji.py":48:4) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":144:21) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":145:23) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:19) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":146:7) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":147:29) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":148:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:27) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":149:16) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:27) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":151:17) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:31) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:21) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":154:12) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:35) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:69) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":155:4) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:42) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:11) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":165:4) +#loc74 = loc("xnumel"(#loc1)) +#loc75 = loc("r0_numel"(#loc2)) +#loc76 = loc("xoffset"(#loc3)) +#loc77 = loc("xoffset"(#loc4)) +#loc78 = loc("xindex"(#loc5)) +#loc79 = loc("xindex"(#loc6)) +#loc80 = loc("xindex"(#loc7)) +#loc81 = loc("xmask"(#loc8)) +#loc82 = loc("r0_base"(#loc9)) +#loc83 = loc("r0_base"(#loc10)) +#loc84 = loc("x0"(#loc11)) +#loc85 = loc("x1"(#loc12)) +#loc86 = loc("_tmp2"(#loc13)) +#loc87 = loc("_tmp2_index"(#loc14)) +#loc88 = loc("_tmp2"(#loc15)) +#loc89 = loc("r0_index"(#loc16)) +#loc90 = loc("r0_mask"(#loc17)) +#loc91 = loc("tmp0"(#loc18)) +#loc92 = loc("tmp0"(#loc19)) +#loc93 = loc("tmp0"(#loc20)) +#loc94 = loc("tmp0"(#loc21)) +#loc95 = loc("tmp0"(#loc22)) +#loc96 = loc("tmp0"(#loc23)) +#loc97 = loc("_tmp2"(#loc25)) +#loc98 = loc("_tmp2_index"(#loc26)) +#loc99 = loc("tmp2"(#loc29)) +#loc104 = loc("mask"(#loc34)) +#loc105 = loc("equal"(#loc35)) +#loc106 = loc("a_isnan"(#loc38)) +#loc107 = loc("b_isnan"(#loc39)) +#loc108 = loc("mask"(#loc40)) +#loc109 = loc("mask"(#loc41)) +#loc110 = loc("mask"(#loc42)) +#loc111 = loc("equal"(#loc43)) +#loc112 = loc("equal"(#loc44)) +#loc113 = loc("mask"(#loc46)) +#loc114 = loc("mask"(#loc47)) +#loc115 = loc("mask"(#loc48)) +#loc120 = loc("_tmp2_index"(#loc88)) +#loc121 = loc("mask"(#loc104)) +#loc122 = loc("equal"(#loc105)) +#loc123 = loc("mask"(#loc110)) +#loc124 = loc("equal"(#loc112)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a7e36e651db94731629d905a95f8b0c1c8a110b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/__grp__triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"child_paths": {"triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.source", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttgir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.llir", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ptx", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.cubin", "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.cubin b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c3e90dfe6ba8ea590682da3df80cba2287cf0576 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b53332c8fd1fc7412a4906ab086b3205dfaa22d2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.json @@ -0,0 +1 @@ +{"hash": "cea3601d8efd69682908883a0e7938aea44b9ecb5449067e6e7faa6a58770601", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 0, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.llir new file mode 100644 index 0000000000000000000000000000000000000000..03c4aa45a7c732d639de9a688383d9ed88e0a583 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.llir @@ -0,0 +1,271 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py\00" +@assertMessage_1 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp25 < ks4\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp5 < ks2\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9, i32 %10, ptr addrspace(1) readnone captures(none) %11, ptr addrspace(1) readnone captures(none) %12) local_unnamed_addr #1 !dbg !9 { + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %15 = shl i32 %14, 9, !dbg !11 + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %17 = shl nuw nsw i32 %16, 1, !dbg !12 + %18 = and i32 %17, 510, !dbg !12 + %19 = or disjoint i32 %18, %15, !dbg !13 + %20 = or disjoint i32 %19, 1, !dbg !13 + %21 = icmp slt i32 %19, %10, !dbg !14 + %22 = icmp slt i32 %20, %10, !dbg !14 + %23 = sext i32 %19 to i64, !dbg !15 + %24 = sext i32 %20 to i64, !dbg !15 + %25 = sdiv i64 %23, %5, !dbg !15 + %26 = sdiv i64 %24, %5, !dbg !15 + %27 = srem i64 %25, %6, !dbg !16 + %28 = srem i64 %26, %6, !dbg !16 + %29 = srem i64 %23, %8, !dbg !17 + %30 = srem i64 %24, %8, !dbg !17 + %31 = getelementptr bfloat, ptr addrspace(1) %0, i64 %23, !dbg !18 + %32 = getelementptr bfloat, ptr addrspace(1) %0, i64 %24, !dbg !18 + %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %34 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %31, i64 %33, i1 %21) #4, !dbg !19 + %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !19 + %36 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %32, i64 %35, i1 %22) #4, !dbg !19 + %37 = getelementptr i64, ptr addrspace(1) %1, i64 %27, !dbg !20 + %38 = getelementptr i64, ptr addrspace(1) %1, i64 %28, !dbg !20 + %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %37, i64 %39, i1 %21) #4, !dbg !21 + %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %38, i64 %41, i1 %22) #4, !dbg !21 + %43 = icmp slt i64 %40, 0, !dbg !22 + %44 = icmp slt i64 %42, 0, !dbg !22 + %45 = select i1 %43, i64 %7, i64 0, !dbg !23 + %46 = add i64 %45, %40, !dbg !23 + %47 = select i1 %44, i64 %7, i64 0, !dbg !23 + %48 = add i64 %47, %42, !dbg !23 + %49 = icmp slt i64 %46, 0, !dbg !24 + %50 = icmp slt i64 %48, 0, !dbg !24 + %51 = icmp sge i64 %46, %7, !dbg !25 + %52 = icmp sge i64 %48, %7, !dbg !25 + %.not4 = or i1 %49, %51, !dbg !26 + %.not8 = or i1 %50, %52, !dbg !27 + %.not1 = and i1 %21, %.not4, !dbg !28 + %.not5 = and i1 %22, %.not8, !dbg !29 + %53 = or i1 %.not1, %.not5, !dbg !29 + br i1 %53, label %54, label %55, !dbg !29 + +54: ; preds = %13 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 32, ptr nonnull @assertFunc_0, i64 1), !dbg !29 + unreachable, !dbg !29 + +55: ; preds = %13 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %56 = mul i64 %46, %8, !dbg !30 + %57 = mul i64 %48, %8, !dbg !30 + %58 = getelementptr bfloat, ptr addrspace(1) %2, i64 %29, !dbg !31 + %59 = getelementptr bfloat, ptr addrspace(1) %58, i64 %56, !dbg !31 + %60 = getelementptr bfloat, ptr addrspace(1) %2, i64 %30, !dbg !31 + %61 = getelementptr bfloat, ptr addrspace(1) %60, i64 %57, !dbg !31 + %62 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %63 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %59, i64 %62, i1 %21) #4, !dbg !32 + %64 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !32 + %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %61, i64 %64, i1 %22) #4, !dbg !32 + %66 = sdiv i64 %8, 2, !dbg !33 + %67 = sub i64 %8, %66, !dbg !34 + %68 = icmp slt i64 %29, %67, !dbg !35 + %69 = icmp slt i64 %30, %67, !dbg !35 + %70 = sub nsw i64 %23, %29, !dbg !36 + %71 = sub nsw i64 %24, %30, !dbg !36 + %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %70, !dbg !37 + %73 = getelementptr bfloat, ptr addrspace(1) %72, i64 %66, !dbg !37 + %74 = getelementptr bfloat, ptr addrspace(1) %73, i64 %29, !dbg !37 + %75 = getelementptr bfloat, ptr addrspace(1) %0, i64 %71, !dbg !37 + %76 = getelementptr bfloat, ptr addrspace(1) %75, i64 %66, !dbg !37 + %77 = getelementptr bfloat, ptr addrspace(1) %76, i64 %30, !dbg !37 + %78 = and i1 %21, %68, !dbg !38 + %79 = and i1 %22, %69, !dbg !38 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %81 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %74, i64 %80, i1 %78) #4, !dbg !39 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !39 + %83 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %77, i64 %82, i1 %79) #4, !dbg !39 + %84 = icmp sge i64 %29, %67, !dbg !40 + %85 = icmp sge i64 %30, %67, !dbg !40 + %86 = sub i64 %29, %8, !dbg !41 + %87 = sub i64 %30, %8, !dbg !41 + %88 = getelementptr bfloat, ptr addrspace(1) %72, i64 %86, !dbg !42 + %89 = getelementptr bfloat, ptr addrspace(1) %88, i64 %66, !dbg !42 + %90 = getelementptr bfloat, ptr addrspace(1) %75, i64 %87, !dbg !42 + %91 = getelementptr bfloat, ptr addrspace(1) %90, i64 %66, !dbg !42 + %92 = and i1 %21, %84, !dbg !43 + %93 = and i1 %22, %85, !dbg !43 + %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %95 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %89, i64 %94, i1 %92) #4, !dbg !44 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !44 + %97 = tail call i16 asm sideeffect "mov.u16 $0, $1;\0A\09@$4 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $2 + 0 ], $3;", "=c,c,l,l,b"(i16 0, ptr addrspace(1) %91, i64 %96, i1 %93) #4, !dbg !44 + %98 = select i1 %43, i64 %9, i64 0, !dbg !45 + %99 = add i64 %98, %40, !dbg !45 + %100 = select i1 %44, i64 %9, i64 0, !dbg !45 + %101 = add i64 %100, %42, !dbg !45 + %102 = icmp slt i64 %99, 0, !dbg !46 + %103 = icmp slt i64 %101, 0, !dbg !46 + %104 = icmp sge i64 %99, %9, !dbg !47 + %105 = icmp sge i64 %101, %9, !dbg !47 + %.not12 = or i1 %102, %104, !dbg !48 + %.not16 = or i1 %103, %105, !dbg !49 + %.not9 = and i1 %21, %.not12, !dbg !49 + %.not13 = and i1 %22, %.not16, !dbg !50 + %106 = or i1 %.not9, %.not13, !dbg !50 + br i1 %106, label %107, label %108, !dbg !50 + +107: ; preds = %55 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 52, ptr nonnull @assertFunc_1, i64 1), !dbg !50 + unreachable, !dbg !50 + +108: ; preds = %55 + %109 = bitcast i16 %83 to bfloat, !dbg !39 + %110 = fpext bfloat %109 to float, !dbg !51 + %111 = fsub float 0.000000e+00, %110, !dbg !52 + %112 = bitcast i16 %97 to bfloat, !dbg !44 + %113 = fpext bfloat %112 to float, !dbg !53 + %114 = select i1 %69, float %111, float %113, !dbg !54 + %115 = bitcast i16 %81 to bfloat, !dbg !39 + %116 = fpext bfloat %115 to float, !dbg !51 + %117 = fsub float 0.000000e+00, %116, !dbg !52 + %118 = bitcast i16 %95 to bfloat, !dbg !44 + %119 = fpext bfloat %118 to float, !dbg !53 + %120 = select i1 %68, float %117, float %119, !dbg !54 + %121 = bitcast i16 %36 to bfloat, !dbg !19 + %122 = fpext bfloat %121 to float, !dbg !55 + %123 = bitcast i16 %34 to bfloat, !dbg !19 + %124 = fpext bfloat %123 to float, !dbg !55 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !50 + %125 = mul i64 %99, %8, !dbg !56 + %126 = mul i64 %101, %8, !dbg !56 + %127 = getelementptr bfloat, ptr addrspace(1) %3, i64 %29, !dbg !57 + %128 = getelementptr bfloat, ptr addrspace(1) %127, i64 %125, !dbg !57 + %129 = getelementptr bfloat, ptr addrspace(1) %3, i64 %30, !dbg !57 + %130 = getelementptr bfloat, ptr addrspace(1) %129, i64 %126, !dbg !57 + %131 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !58 + %132 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %128, i64 %131, i1 %21) #4, !dbg !58 + %133 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !58 + %134 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b16 { $0 }, [ $1 + 0 ], $2;", "=c,l,l,b"(ptr addrspace(1) %130, i64 %133, i1 %22) #4, !dbg !58 + %135 = insertelement <2 x i16> poison, i16 %63, i64 0, !dbg !32 + %136 = insertelement <2 x i16> %135, i16 %132, i64 1, !dbg !32 + %137 = bitcast <2 x i16> %136 to <2 x bfloat>, !dbg !32 + %138 = fpext <2 x bfloat> %137 to <2 x float>, !dbg !59 + %139 = insertelement <2 x float> poison, float %124, i64 0, !dbg !60 + %140 = insertelement <2 x float> %139, float %120, i64 1, !dbg !60 + %141 = fmul <2 x float> %140, %138, !dbg !60 + %142 = insertelement <2 x i16> poison, i16 %65, i64 0, !dbg !32 + %143 = insertelement <2 x i16> %142, i16 %134, i64 1, !dbg !32 + %144 = bitcast <2 x i16> %143 to <2 x bfloat>, !dbg !32 + %145 = fpext <2 x bfloat> %144 to <2 x float>, !dbg !59 + %146 = insertelement <2 x float> poison, float %122, i64 0, !dbg !60 + %147 = insertelement <2 x float> %146, float %114, i64 1, !dbg !60 + %148 = fmul <2 x float> %147, %145, !dbg !60 + %shift = shufflevector <2 x float> %141, <2 x float> poison, <2 x i32> , !dbg !61 + %foldExtExtBinop = fadd <2 x float> %141, %shift, !dbg !61 + %149 = extractelement <2 x float> %foldExtExtBinop, i64 0, !dbg !61 + %shift18 = shufflevector <2 x float> %148, <2 x float> poison, <2 x i32> , !dbg !61 + %foldExtExtBinop19 = fadd <2 x float> %148, %shift18, !dbg !61 + %150 = extractelement <2 x float> %foldExtExtBinop19, i64 0, !dbg !61 + %151 = getelementptr bfloat, ptr addrspace(1) %4, i64 %23, !dbg !62 + %152 = getelementptr bfloat, ptr addrspace(1) %4, i64 %24, !dbg !62 + %153 = fptrunc float %149 to bfloat, !dbg !63 + %154 = fptrunc float %150 to bfloat, !dbg !63 + %155 = bitcast bfloat %153 to i16, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %155, ptr addrspace(1) %151, i1 %21) #4, !dbg !63 + %156 = bitcast bfloat %154 to i16, !dbg !63 + tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %156, ptr addrspace(1) %152, i1 %22) #4, !dbg !63 + ret void, !dbg !64 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="256" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1", linkageName: "triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 19, column: 28, scope: !9) +!11 = !DILocation(line: 19, column: 33, scope: !9) +!12 = !DILocation(line: 20, column: 36, scope: !9) +!13 = !DILocation(line: 20, column: 23, scope: !9) +!14 = !DILocation(line: 21, column: 21, scope: !9) +!15 = !DILocation(line: 23, column: 21, scope: !9) +!16 = !DILocation(line: 23, column: 28, scope: !9) +!17 = !DILocation(line: 24, column: 19, scope: !9) +!18 = !DILocation(line: 26, column: 30, scope: !9) +!19 = !DILocation(line: 26, column: 35, scope: !9) +!20 = !DILocation(line: 27, column: 30, scope: !9) +!21 = !DILocation(line: 27, column: 35, scope: !9) +!22 = !DILocation(line: 30, column: 18, scope: !9) +!23 = !DILocation(line: 31, column: 32, scope: !9) +!24 = !DILocation(line: 32, column: 28, scope: !9) +!25 = !DILocation(line: 32, column: 44, scope: !9) +!26 = !DILocation(line: 32, column: 37, scope: !9) +!27 = !DILocation(line: 32, column: 54, scope: !9) +!28 = !DILocation(line: 32, column: 52, scope: !9) +!29 = !DILocation(line: 32, column: 62, scope: !9) +!30 = !DILocation(line: 33, column: 39, scope: !9) +!31 = !DILocation(line: 33, column: 30, scope: !9) +!32 = !DILocation(line: 33, column: 46, scope: !9) +!33 = !DILocation(line: 38, column: 31, scope: !9) +!34 = !DILocation(line: 38, column: 18, scope: !9) +!35 = !DILocation(line: 39, column: 19, scope: !9) +!36 = !DILocation(line: 40, column: 35, scope: !9) +!37 = !DILocation(line: 40, column: 31, scope: !9) +!38 = !DILocation(line: 40, column: 68, scope: !9) +!39 = !DILocation(line: 40, column: 60, scope: !9) +!40 = !DILocation(line: 44, column: 20, scope: !9) +!41 = !DILocation(line: 47, column: 47, scope: !9) +!42 = !DILocation(line: 47, column: 31, scope: !9) +!43 = !DILocation(line: 47, column: 81, scope: !9) +!44 = !DILocation(line: 47, column: 73, scope: !9) +!45 = !DILocation(line: 51, column: 34, scope: !9) +!46 = !DILocation(line: 52, column: 28, scope: !9) +!47 = !DILocation(line: 52, column: 46, scope: !9) +!48 = !DILocation(line: 52, column: 38, scope: !9) +!49 = !DILocation(line: 52, column: 54, scope: !9) +!50 = !DILocation(line: 52, column: 64, scope: !9) +!51 = !DILocation(line: 40, column: 119, scope: !9) +!52 = !DILocation(line: 41, column: 13, scope: !9) +!53 = !DILocation(line: 47, column: 132, scope: !9) +!54 = !DILocation(line: 0, scope: !9) +!55 = !DILocation(line: 26, column: 75, scope: !9) +!56 = !DILocation(line: 53, column: 40, scope: !9) +!57 = !DILocation(line: 53, column: 31, scope: !9) +!58 = !DILocation(line: 53, column: 48, scope: !9) +!59 = !DILocation(line: 33, column: 86, scope: !9) +!60 = !DILocation(line: 34, column: 18, scope: !9) +!61 = !DILocation(line: 55, column: 19, scope: !9) +!62 = !DILocation(line: 56, column: 25, scope: !9) +!63 = !DILocation(line: 56, column: 37, scope: !9) +!64 = !DILocation(line: 56, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ptx new file mode 100644 index 0000000000000000000000000000000000000000..30c7a9c8fca3ad5eb9ae2884f2d73766daf8f11e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ptx @@ -0,0 +1,652 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1 // -- Begin function triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 121, 109, 47, 99, 121, 109, 114, 111, 113, 113, 103, 97, 116, 54, 55, 112, 120, 99, 52, 114, 110, 121, 116, 53, 100, 122, 110, 55, 116, 104, 121, 53, 117, 52, 102, 98, 99, 51, 106, 108, 53, 99, 98, 104, 117, 104, 53, 114, 120, 51, 104, 122, 51, 98, 98, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 50, 53, 32, 60, 32, 107, 115, 52}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 121, 109, 47, 99, 121, 109, 114, 111, 113, 113, 103, 97, 116, 54, 55, 112, 120, 99, 52, 114, 110, 121, 116, 53, 100, 122, 110, 55, 116, 104, 121, 53, 117, 52, 102, 98, 99, 51, 106, 108, 53, 99, 98, 104, 117, 104, 53, 114, 120, 51, 104, 122, 51, 98, 98, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 53, 32, 60, 32, 107, 115, 50}; + // @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1 +.visible .entry triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1( + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_0, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_1, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_2, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_3, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_4, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_5, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_6, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_7, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_8, + .param .u64 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_9, + .param .u32 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_10, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_11, + .param .u64 .ptr .global .align 1 triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_12 +) +.reqntid 256 +{ + .reg .pred %p<47>; + .reg .b16 %rs<29>; + .reg .b32 %r<58>; + .reg .b64 %rd<151>; + .loc 1 18 0 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:18:0 + +// %bb.0: + ld.param.b64 %rd33, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_5]; +$L__tmp0: + .loc 1 19 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:19:28 + mov.u32 %r2, %ctaid.x; + .loc 1 19 33 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:19:33 + shl.b32 %r3, %r2, 9; + .loc 1 20 36 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:20:36 + mov.u32 %r4, %tid.x; + shl.b32 %r5, %r4, 1; + and.b32 %r6, %r5, 510; + .loc 1 20 23 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:20:23 + or.b32 %r7, %r6, %r3; + or.b32 %r8, %r7, 1; + .loc 1 23 21 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:23:21 + cvt.s64.s32 %rd1, %r7; + cvt.s64.s32 %rd2, %r8; + or.b64 %rd38, %rd1, %rd33; + and.b64 %rd39, %rd38, -4294967296; + setp.ne.b64 %p1, %rd39, 0; + cvt.u32.u64 %r57, %rd1; + @%p1 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd145, %rd1, %rd33; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r9, %rd33; + div.u32 %r11, %r57, %r9; + cvt.u64.u32 %rd145, %r11; +$L__BB0_3: + .loc 1 0 21 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0:21 + ld.param.b64 %rd34, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_6]; + .loc 1 23 21 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:23:21 + or.b64 %rd40, %rd2, %rd33; + and.b64 %rd41, %rd40, -4294967296; + setp.ne.b64 %p2, %rd41, 0; + cvt.u32.u64 %r56, %rd2; + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_4; +$L__BB0_5: + div.s64 %rd146, %rd2, %rd33; + bra.uni $L__BB0_6; +$L__BB0_4: + cvt.u32.u64 %r12, %rd33; + div.u32 %r14, %r56, %r12; + cvt.u64.u32 %rd146, %r14; +$L__BB0_6: + .loc 1 23 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:23:28 + or.b64 %rd42, %rd145, %rd34; + and.b64 %rd43, %rd42, -4294967296; + setp.ne.b64 %p3, %rd43, 0; + @%p3 bra $L__BB0_8; + bra.uni $L__BB0_7; +$L__BB0_8: + rem.s64 %rd147, %rd145, %rd34; + bra.uni $L__BB0_9; +$L__BB0_7: + cvt.u32.u64 %r15, %rd34; + cvt.u32.u64 %r16, %rd145; + rem.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd147, %r17; +$L__BB0_9: + .loc 1 0 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0:28 + ld.param.b64 %rd36, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_8]; + .loc 1 23 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:23:28 + or.b64 %rd44, %rd146, %rd34; + and.b64 %rd45, %rd44, -4294967296; + setp.ne.b64 %p4, %rd45, 0; + @%p4 bra $L__BB0_11; + bra.uni $L__BB0_10; +$L__BB0_11: + rem.s64 %rd148, %rd146, %rd34; + bra.uni $L__BB0_12; +$L__BB0_10: + cvt.u32.u64 %r18, %rd34; + cvt.u32.u64 %r19, %rd146; + rem.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd148, %r20; +$L__BB0_12: + .loc 1 24 19 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:24:19 + or.b64 %rd46, %rd1, %rd36; + and.b64 %rd47, %rd46, -4294967296; + setp.ne.b64 %p5, %rd47, 0; + @%p5 bra $L__BB0_14; + bra.uni $L__BB0_13; +$L__BB0_14: + rem.s64 %rd149, %rd1, %rd36; + bra.uni $L__BB0_15; +$L__BB0_13: + cvt.u32.u64 %r21, %rd36; + rem.u32 %r23, %r57, %r21; + cvt.u64.u32 %rd149, %r23; +$L__BB0_15: + .loc 1 0 19 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0:19 + ld.param.b32 %r1, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_10]; + ld.param.b64 %rd35, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_7]; + ld.param.b64 %rd29, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_1]; + ld.param.b64 %rd28, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_0]; + .loc 1 24 19 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:24:19 + or.b64 %rd48, %rd2, %rd36; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p6, %rd49, 0; + @%p6 bra $L__BB0_17; + bra.uni $L__BB0_16; +$L__BB0_17: + rem.s64 %rd150, %rd2, %rd36; + bra.uni $L__BB0_18; +$L__BB0_16: + cvt.u32.u64 %r24, %rd36; + rem.u32 %r26, %r56, %r24; + cvt.u64.u32 %rd150, %r26; +$L__BB0_18: + .loc 1 21 21 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:21:21 + setp.lt.s32 %p10, %r56, %r1; + setp.lt.s32 %p9, %r57, %r1; + .loc 1 26 30 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:26:30 + shl.b64 %rd64, %rd1, 1; + add.s64 %rd51, %rd28, %rd64; + add.s64 %rd54, %rd51, 2; + .loc 1 26 35 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:26:35 + // begin inline asm + mov.u64 %rd50, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd50, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs9, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs9 }, [ %rd51 + 0 ], %rd50; + // end inline asm + // begin inline asm + mov.u64 %rd53, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd53, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs10, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs10 }, [ %rd54 + 0 ], %rd53; + // end inline asm + .loc 1 27 30 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:27:30 + shl.b64 %rd65, %rd147, 3; + add.s64 %rd58, %rd29, %rd65; + shl.b64 %rd66, %rd148, 3; + add.s64 %rd62, %rd29, %rd66; + .loc 1 27 35 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:27:35 + // begin inline asm + mov.u64 %rd56, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd56, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd57, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd57 }, [ %rd58 + 0 ], %rd56; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd60, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd61, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd61 }, [ %rd62 + 0 ], %rd60; + // end inline asm + .loc 1 31 32 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:31:32 + shr.s64 %rd67, %rd57, 63; + and.b64 %rd68, %rd67, %rd35; + add.s64 %rd23, %rd68, %rd57; + shr.s64 %rd69, %rd61, 63; + and.b64 %rd70, %rd69, %rd35; + add.s64 %rd24, %rd70, %rd61; + .loc 1 32 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:28 + setp.lt.s64 %p11, %rd23, 0; + setp.lt.s64 %p12, %rd24, 0; + .loc 1 32 44 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:44 + setp.ge.s64 %p13, %rd23, %rd35; + setp.ge.s64 %p14, %rd24, %rd35; + .loc 1 32 37 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:37 + or.pred %p15, %p11, %p13; + .loc 1 32 54 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:54 + or.pred %p16, %p12, %p14; + .loc 1 32 52 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:52 + and.pred %p17, %p9, %p15; + .loc 1 32 62 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:62 + and.pred %p18, %p10, %p16; + or.pred %p19, %p17, %p18; + not.pred %p20, %p19; + @%p20 bra $L__BB0_20; + bra.uni $L__BB0_19; +$L__BB0_20: + .loc 1 0 62 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0:62 + ld.param.b64 %rd37, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_9]; + ld.param.b64 %rd30, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_2]; + .loc 1 32 62 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:62 + bar.sync 0; + .loc 1 33 39 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:39 + mul.lo.s64 %rd89, %rd23, %rd36; + mul.lo.s64 %rd90, %rd24, %rd36; + .loc 1 33 30 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:30 + shl.b64 %rd91, %rd149, 1; + add.s64 %rd92, %rd30, %rd91; + shl.b64 %rd93, %rd89, 1; + add.s64 %rd72, %rd92, %rd93; + shl.b64 %rd94, %rd150, 1; + add.s64 %rd95, %rd30, %rd94; + shl.b64 %rd96, %rd90, 1; + add.s64 %rd75, %rd95, %rd96; + .loc 1 33 46 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:46 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs11, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs11 }, [ %rd72 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd74, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd74, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs12, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs12 }, [ %rd75 + 0 ], %rd74; + // end inline asm + .loc 1 38 31 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:38:31 + shr.u64 %rd97, %rd36, 63; + add.s64 %rd98, %rd36, %rd97; + shr.s64 %rd99, %rd98, 1; + .loc 1 38 18 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:38:18 + sub.s64 %rd25, %rd36, %rd99; + .loc 1 39 19 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:39:19 + setp.lt.s64 %p27, %rd149, %rd25; + setp.lt.s64 %p28, %rd150, %rd25; + .loc 1 40 35 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:35 + sub.s64 %rd100, %rd1, %rd149; + .loc 1 40 31 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:31 + shl.b64 %rd101, %rd100, 1; + add.s64 %rd102, %rd28, %rd101; + and.b64 %rd103, %rd98, -2; + add.s64 %rd104, %rd102, %rd103; + add.s64 %rd78, %rd104, %rd91; + sub.s64 %rd105, %rd1, %rd150; + shl.b64 %rd106, %rd105, 1; + add.s64 %rd107, %rd28, %rd106; + add.s64 %rd108, %rd107, %rd103; + add.s64 %rd109, %rd108, %rd94; + add.s64 %rd81, %rd109, 2; + .loc 1 40 68 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:68 + and.pred %p23, %p9, %p27; + and.pred %p24, %p10, %p28; + .loc 1 40 60 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:60 + // begin inline asm + mov.u64 %rd77, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd77, 1.0; + // end inline asm + mov.b16 %rs14, 0; + // begin inline asm + mov.u16 %rs13, %rs14; + @%p23 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs13 }, [ %rd78 + 0 ], %rd77; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd80, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs15, %rs14; + @%p24 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs15 }, [ %rd81 + 0 ], %rd80; + // end inline asm + .loc 1 44 20 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:44:20 + setp.ge.s64 %p29, %rd149, %rd25; + setp.ge.s64 %p30, %rd150, %rd25; + .loc 1 47 47 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:47 + sub.s64 %rd110, %rd149, %rd36; + .loc 1 47 31 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:31 + shl.b64 %rd111, %rd110, 1; + add.s64 %rd84, %rd104, %rd111; + add.s64 %rd87, %rd84, 2; + .loc 1 47 81 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:81 + and.pred %p25, %p9, %p29; + and.pred %p26, %p10, %p30; + .loc 1 47 73 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:73 + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs17, %rs14; + @%p25 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs17 }, [ %rd84 + 0 ], %rd83; + // end inline asm + // begin inline asm + mov.u64 %rd86, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd86, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs19, %rs14; + @%p26 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs19 }, [ %rd87 + 0 ], %rd86; + // end inline asm + .loc 1 51 34 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:51:34 + and.b64 %rd113, %rd67, %rd37; + add.s64 %rd26, %rd113, %rd57; + and.b64 %rd115, %rd69, %rd37; + add.s64 %rd27, %rd115, %rd61; + .loc 1 52 28 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:28 + setp.lt.s64 %p31, %rd26, 0; + setp.lt.s64 %p32, %rd27, 0; + .loc 1 52 46 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:46 + setp.ge.s64 %p33, %rd26, %rd37; + setp.ge.s64 %p34, %rd27, %rd37; + .loc 1 52 38 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:38 + or.pred %p35, %p31, %p33; + .loc 1 52 54 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:54 + or.pred %p36, %p32, %p34; + and.pred %p37, %p9, %p35; + .loc 1 52 64 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:64 + and.pred %p38, %p10, %p36; + or.pred %p39, %p37, %p38; + not.pred %p40, %p39; + @%p40 bra $L__BB0_22; + bra.uni $L__BB0_21; +$L__BB0_22: + .loc 1 0 64 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0:64 + ld.param.b64 %rd32, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_4]; + ld.param.b64 %rd31, [triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1_param_3]; + .loc 1 40 119 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:119 + cvt.f32.bf16 %r33, %rs15; + mov.b32 %r34, 0f00000000; + .loc 1 41 13 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:41:13 + sub.f32 %r35, %r34, %r33; + .loc 1 47 132 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:132 + cvt.f32.bf16 %r36, %rs19; + .loc 1 0 0 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0 + selp.f32 %r37, %r35, %r36, %p28; + .loc 1 40 119 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:40:119 + cvt.f32.bf16 %r38, %rs13; + .loc 1 41 13 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:41:13 + sub.f32 %r39, %r34, %r38; + .loc 1 47 132 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:47:132 + cvt.f32.bf16 %r40, %rs17; + .loc 1 0 0 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:0 + selp.f32 %r41, %r39, %r40, %p27; + .loc 1 26 75 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:26:75 + cvt.f32.bf16 %r42, %rs10; + cvt.f32.bf16 %r43, %rs9; + .loc 1 52 64 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:64 + bar.sync 0; + .loc 1 53 40 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:53:40 + mul.lo.s64 %rd124, %rd26, %rd36; + mul.lo.s64 %rd125, %rd27, %rd36; + .loc 1 53 31 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:53:31 + add.s64 %rd127, %rd31, %rd91; + shl.b64 %rd128, %rd124, 1; + add.s64 %rd117, %rd127, %rd128; + add.s64 %rd130, %rd31, %rd94; + shl.b64 %rd131, %rd125, 1; + add.s64 %rd120, %rd130, %rd131; + .loc 1 53 48 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:53:48 + // begin inline asm + mov.u64 %rd118, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd118, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs21, 0x0; + @%p9 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs21 }, [ %rd117 + 0 ], %rd118; + // end inline asm + // begin inline asm + mov.u64 %rd121, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd121, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs22, 0x0; + @%p10 ld.global.L1::evict_last.L2::cache_hint.b16 { %rs22 }, [ %rd120 + 0 ], %rd121; + // end inline asm + .loc 1 33 46 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:46 + mov.b32 %r44, {%rs11, %rs21}; + .loc 1 33 86 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:86 + mov.b32 {%rs25, %rs26}, %r44; + cvt.f32.bf16 %r45, %rs25; + cvt.f32.bf16 %r46, %rs26; + .loc 1 34 18 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:34:18 + mul.f32 %r47, %r41, %r46; + .loc 1 33 46 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:46 + mov.b32 %r48, {%rs12, %rs22}; + .loc 1 33 86 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:33:86 + mov.b32 {%rs27, %rs28}, %r48; + cvt.f32.bf16 %r49, %rs27; + cvt.f32.bf16 %r50, %rs28; + .loc 1 34 18 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:34:18 + mul.f32 %r51, %r37, %r50; + .loc 1 55 19 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:55:19 + fma.rn.f32 %r52, %r43, %r45, %r47; + fma.rn.f32 %r53, %r42, %r49, %r51; + .loc 1 56 25 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:56:25 + add.s64 %rd122, %rd32, %rd64; + add.s64 %rd123, %rd122, 2; + .loc 1 56 37 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:56:37 + cvt.rn.bf16.f32 %rs23, %r52; + cvt.rn.bf16.f32 %rs24, %r53; + // begin inline asm + @%p9 st.global.b16 [ %rd122 + 0 ], { %rs23 }; + // end inline asm + // begin inline asm + @%p10 st.global.b16 [ %rd123 + 0 ], { %rs24 }; + // end inline asm + .loc 1 56 4 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:56:4 + ret; +$L__BB0_19: + .loc 1 32 62 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:32:62 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd139, assertFunc_0; + cvta.global.u64 %rd140, %rd139; + st.param.b64 [param3], %rd140; + mov.b64 %rd141, assertFile_0; + cvta.global.u64 %rd142, %rd141; + st.param.b64 [param1], %rd142; + mov.b64 %rd143, assertMessage_0; + cvta.global.u64 %rd144, %rd143; + st.param.b64 [param0], %rd144; + st.param.b64 [param4], 1; + st.param.b32 [param2], 32; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_21: + .loc 1 52 64 // cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py:52:64 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd133, assertFunc_1; + cvta.global.u64 %rd134, %rd133; + st.param.b64 [param3], %rd134; + mov.b64 %rd135, assertFile_1; + cvta.global.u64 %rd136, %rd135; + st.param.b64 [param1], %rd136; + mov.b64 %rd137, assertMessage_1; + cvta.global.u64 %rd138, %rd137; + st.param.b64 [param0], %rd138; + st.param.b64 [param4], 1; + st.param.b32 [param2], 52; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp1: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 0 // DW_CHILDREN_no +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 135 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 121 +.b8 109 +.b8 114 +.b8 111 +.b8 113 +.b8 113 +.b8 103 +.b8 97 +.b8 116 +.b8 54 +.b8 55 +.b8 112 +.b8 120 +.b8 99 +.b8 52 +.b8 114 +.b8 110 +.b8 121 +.b8 116 +.b8 53 +.b8 100 +.b8 122 +.b8 110 +.b8 55 +.b8 116 +.b8 104 +.b8 121 +.b8 53 +.b8 117 +.b8 52 +.b8 102 +.b8 98 +.b8 99 +.b8 51 +.b8 106 +.b8 108 +.b8 53 +.b8 99 +.b8 98 +.b8 104 +.b8 117 +.b8 104 +.b8 53 +.b8 114 +.b8 120 +.b8 51 +.b8 104 +.b8 122 +.b8 51 +.b8 98 +.b8 98 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 121 +.b8 109 +.b8 0 + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.source b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.source new file mode 100644 index 0000000000000000000000000000000000000000..6ef0c45a0187551cc4c184bb40d2f765600ecdb4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.source @@ -0,0 +1,299 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":18:0) +#loc78 = loc("in_ptr0"(#loc)) +#loc79 = loc("in_ptr1"(#loc)) +#loc80 = loc("in_ptr2"(#loc)) +#loc81 = loc("in_ptr3"(#loc)) +#loc82 = loc("out_ptr0"(#loc)) +#loc83 = loc("ks0"(#loc)) +#loc84 = loc("ks1"(#loc)) +#loc85 = loc("ks2"(#loc)) +#loc86 = loc("ks3"(#loc)) +#loc87 = loc("ks4"(#loc)) +#loc88 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %xoffset = tt.get_program_id x : i32 loc(#loc89) + %xoffset_0 = arith.constant 512 : i32 loc(#loc90) + %xoffset_1 = arith.constant 512 : i32 loc(#loc90) + %xoffset_2 = arith.muli %xoffset, %xoffset_1 : i32 loc(#loc90) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc91) + %xindex_3 = tt.splat %xoffset_2 : i32 -> tensor<512xi32> loc(#loc92) + %xindex_4 = arith.addi %xindex_3, %xindex : tensor<512xi32> loc(#loc92) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc93) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<512xi32> loc(#loc93) + %x2 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc94) + %x2_6 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc94) + %x2_7 = arith.divsi %x2, %x2_6 : tensor<512xi64> loc(#loc94) + %x2_8 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc95) + %x2_9 = arith.remsi %x2_7, %x2_8 : tensor<512xi64> loc(#loc95) + %x0 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc96) + %x0_10 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc96) + %x0_11 = arith.remsi %x0, %x0_10 : tensor<512xi64> loc(#loc96) + %x5 = arith.extsi %xindex_4 : tensor<512xi32> to tensor<512xi64> loc(#loc97) + %x5_12 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc97) + %x5_13 = arith.divsi %x5, %x5_12 : tensor<512xi64> loc(#loc97) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc98) + %tmp0_14 = tt.addptr %tmp0, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc98) + %tmp0_15 = tt.load %tmp0_14, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc99) + %tmp0_16 = arith.extf %tmp0_15 : tensor<512xbf16> to tensor<512xf32> loc(#loc100) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc101) + %tmp1_17 = tt.addptr %tmp1, %x2_9 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc101) + %tmp1_18 = tt.load %tmp1_17, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc102) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc103) + %tmp3_19 = arith.addi %tmp1_18, %tmp3 : tensor<512xi64> loc(#loc103) + %tmp4 = arith.constant 0 : i32 loc(#loc104) + %tmp4_20 = arith.extsi %tmp4 : i32 to i64 loc(#loc104) + %tmp4_21 = tt.splat %tmp4_20 : i64 -> tensor<512xi64> loc(#loc104) + %tmp4_22 = arith.cmpi slt, %tmp1_18, %tmp4_21 : tensor<512xi64> loc(#loc104) + %tmp5 = arith.select %tmp4_22, %tmp3_19, %tmp1_18 : tensor<512xi1>, tensor<512xi64> loc(#loc105) + %c0_i32 = arith.constant 0 : i32 loc(#loc18) + %0 = arith.extsi %c0_i32 : i32 to i64 loc(#loc18) + %1 = tt.splat %0 : i64 -> tensor<512xi64> loc(#loc18) + %2 = arith.cmpi sle, %1, %tmp5 : tensor<512xi64> loc(#loc18) + %3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc19) + %4 = arith.cmpi slt, %tmp5, %3 : tensor<512xi64> loc(#loc19) + %5 = arith.andi %2, %4 : tensor<512xi1> loc(#loc20) + %true = arith.constant true loc(#loc21) + %cst = arith.constant dense : tensor<512xi1> loc(#loc21) + %6 = arith.xori %xmask_5, %cst : tensor<512xi1> loc(#loc21) + %7 = arith.ori %5, %6 : tensor<512xi1> loc(#loc22) + tt.assert %7, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1> loc(#loc23) + %tmp7 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc106) + %tmp7_23 = arith.muli %tmp7, %tmp5 : tensor<512xi64> loc(#loc106) + %tmp7_24 = arith.addi %x0_11, %tmp7_23 : tensor<512xi64> loc(#loc107) + %tmp7_25 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc108) + %tmp7_26 = tt.addptr %tmp7_25, %tmp7_24 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc108) + %tmp7_27 = tt.load %tmp7_26, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc109) + %tmp7_28 = arith.extf %tmp7_27 : tensor<512xbf16> to tensor<512xf32> loc(#loc110) + %tmp8 = arith.mulf %tmp0_16, %tmp7_28 : tensor<512xf32> loc(#loc111) + %tmp10 = arith.constant 0 : i64 loc(#loc112) + %tmp10_29 = arith.constant dense<0> : tensor<1xi64> loc(#loc112) + %tmp11 = arith.constant dense<0> : tensor<512xi64> loc(#loc113) + %tmp11_30 = arith.cmpi sge, %x0_11, %tmp11 : tensor<512xi64> loc(#loc113) + %tmp12 = arith.constant 2 : i32 loc(#loc114) + %tmp12_31 = arith.constant 2 : i64 loc(#loc114) + %tmp12_32 = arith.divsi %ks3, %tmp12_31 : i64 loc(#loc114) + %tmp12_33 = arith.constant -1 : i32 loc(#loc115) + %tmp12_34 = arith.constant -1 : i64 loc(#loc115) + %tmp12_35 = arith.muli %tmp12_34, %tmp12_32 : i64 loc(#loc115) + %tmp12_36 = arith.addi %ks3, %tmp12_35 : i64 loc(#loc116) + %tmp13 = tt.splat %tmp12_36 : i64 -> tensor<512xi64> loc(#loc117) + %tmp13_37 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64> loc(#loc117) + %tmp14 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc118) + %tmp14_38 = arith.muli %tmp14, %x5_13 : tensor<512xi64> loc(#loc118) + %tmp14_39 = arith.constant 2 : i32 loc(#loc119) + %tmp14_40 = arith.constant 2 : i64 loc(#loc119) + %tmp14_41 = arith.divsi %ks3, %tmp14_40 : i64 loc(#loc119) + %tmp14_42 = tt.splat %tmp14_41 : i64 -> tensor<512xi64> loc(#loc120) + %tmp14_43 = arith.addi %tmp14_38, %tmp14_42 : tensor<512xi64> loc(#loc120) + %tmp14_44 = arith.addi %tmp14_43, %x0_11 : tensor<512xi64> loc(#loc121) + %tmp14_45 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc122) + %tmp14_46 = tt.addptr %tmp14_45, %tmp14_44 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc122) + %tmp14_47 = arith.andi %tmp13_37, %xmask_5 : tensor<512xi1> loc(#loc123) + %tmp14_48 = arith.constant 0.000000e+00 : f32 loc(#loc124) + %tmp14_49 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc124) + %tmp14_50 = arith.truncf %tmp14_49 : tensor<512xf32> to tensor<512xbf16> loc(#loc124) + %tmp14_51 = tt.load %tmp14_46, %tmp14_47, %tmp14_50 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc124) + %tmp14_52 = arith.extf %tmp14_51 : tensor<512xbf16> to tensor<512xf32> loc(#loc125) + %tmp15 = arith.constant 0.000000e+00 : f32 loc(#loc126) + %tmp15_53 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc126) + %tmp15_54 = arith.subf %tmp15_53, %tmp14_52 : tensor<512xf32> loc(#loc126) + %tmp16 = arith.constant 0.000000e+00 : f32 loc(#loc127) + %tmp16_55 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc127) + %tmp17 = arith.select %tmp13_37, %tmp15_54, %tmp16_55 : tensor<512xi1>, tensor<512xf32> loc(#loc128) + %tmp18 = tt.splat %tmp12_36 : i64 -> tensor<512xi64> loc(#loc129) + %tmp18_56 = arith.cmpi sge, %x0_11, %tmp18 : tensor<512xi64> loc(#loc129) + %tmp20 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc130) + %tmp20_57 = arith.cmpi slt, %x0_11, %tmp20 : tensor<512xi64> loc(#loc130) + %tmp21 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc131) + %tmp21_58 = arith.muli %tmp21, %x5_13 : tensor<512xi64> loc(#loc131) + %tmp21_59 = arith.constant -1 : i32 loc(#loc132) + %tmp21_60 = arith.constant -1 : i64 loc(#loc132) + %tmp21_61 = arith.muli %tmp21_60, %ks3 : i64 loc(#loc132) + %tmp21_62 = tt.splat %tmp21_61 : i64 -> tensor<512xi64> loc(#loc133) + %tmp21_63 = arith.addi %x0_11, %tmp21_62 : tensor<512xi64> loc(#loc133) + %tmp21_64 = arith.constant 2 : i32 loc(#loc134) + %tmp21_65 = arith.constant 2 : i64 loc(#loc134) + %tmp21_66 = arith.divsi %ks3, %tmp21_65 : i64 loc(#loc134) + %tmp21_67 = tt.splat %tmp21_66 : i64 -> tensor<512xi64> loc(#loc135) + %tmp21_68 = arith.addi %tmp21_63, %tmp21_67 : tensor<512xi64> loc(#loc135) + %tmp21_69 = arith.addi %tmp21_58, %tmp21_68 : tensor<512xi64> loc(#loc136) + %tmp21_70 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc137) + %tmp21_71 = tt.addptr %tmp21_70, %tmp21_69 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc137) + %tmp21_72 = arith.andi %tmp18_56, %xmask_5 : tensor<512xi1> loc(#loc138) + %tmp21_73 = arith.constant 0.000000e+00 : f32 loc(#loc139) + %tmp21_74 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc139) + %tmp21_75 = arith.truncf %tmp21_74 : tensor<512xf32> to tensor<512xbf16> loc(#loc139) + %tmp21_76 = tt.load %tmp21_71, %tmp21_72, %tmp21_75 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc139) + %tmp21_77 = arith.extf %tmp21_76 : tensor<512xbf16> to tensor<512xf32> loc(#loc140) + %tmp22 = arith.select %tmp13_37, %tmp17, %tmp21_77 : tensor<512xi1>, tensor<512xf32> loc(#loc141) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc142) + %tmp24_78 = arith.addi %tmp1_18, %tmp24 : tensor<512xi64> loc(#loc142) + %tmp25 = arith.select %tmp4_22, %tmp24_78, %tmp1_18 : tensor<512xi1>, tensor<512xi64> loc(#loc143) + %c0_i32_79 = arith.constant 0 : i32 loc(#loc62) + %8 = arith.extsi %c0_i32_79 : i32 to i64 loc(#loc62) + %9 = tt.splat %8 : i64 -> tensor<512xi64> loc(#loc62) + %10 = arith.cmpi sle, %9, %tmp25 : tensor<512xi64> loc(#loc62) + %11 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc63) + %12 = arith.cmpi slt, %tmp25, %11 : tensor<512xi64> loc(#loc63) + %13 = arith.andi %10, %12 : tensor<512xi1> loc(#loc64) + %true_80 = arith.constant true loc(#loc65) + %cst_81 = arith.constant dense : tensor<512xi1> loc(#loc65) + %14 = arith.xori %xmask_5, %cst_81 : tensor<512xi1> loc(#loc65) + %15 = arith.ori %13, %14 : tensor<512xi1> loc(#loc66) + tt.assert %15, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1> loc(#loc67) + %tmp27 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc144) + %tmp27_82 = arith.muli %tmp27, %tmp25 : tensor<512xi64> loc(#loc144) + %tmp27_83 = arith.addi %x0_11, %tmp27_82 : tensor<512xi64> loc(#loc145) + %tmp27_84 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc146) + %tmp27_85 = tt.addptr %tmp27_84, %tmp27_83 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc146) + %tmp27_86 = tt.load %tmp27_85, %xmask_5 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc147) + %tmp27_87 = arith.extf %tmp27_86 : tensor<512xbf16> to tensor<512xf32> loc(#loc148) + %tmp28 = arith.mulf %tmp22, %tmp27_87 : tensor<512xf32> loc(#loc149) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32> loc(#loc150) + %16 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc75) + %17 = tt.addptr %16, %xindex_4 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc75) + %18 = arith.truncf %tmp29 : tensor<512xf32> to tensor<512xbf16> loc(#loc76) + tt.store %17, %18, %xmask_5 : tensor<512x!tt.ptr> loc(#loc76) + tt.return loc(#loc77) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:28) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:33) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:36) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:23) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":21:21) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:28) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":24:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":25:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:35) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:75) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:30) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:35) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":29:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":30:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":31:32) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:28) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:44) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:37) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:54) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:52) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:62) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:39) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:35) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:30) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:46) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:86) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":34:18) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":36:28) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":37:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:31) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:24) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:18) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":39:19) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:48) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:41) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:54) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:31) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:68) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:60) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:119) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":41:13) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":42:38) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":43:35) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":44:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":46:19) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:52) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:47) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:67) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:60) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:41) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:81) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:73) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:132) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":48:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":50:19) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":51:34) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:28) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:46) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:38) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:56) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:54) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:64) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:40) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:36) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:31) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:48) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:88) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":54:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":55:19) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:37) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:4) +#loc89 = loc("xoffset"(#loc1)) +#loc90 = loc("xoffset"(#loc2)) +#loc91 = loc("xindex"(#loc3)) +#loc92 = loc("xindex"(#loc4)) +#loc93 = loc("xmask"(#loc5)) +#loc94 = loc("x2"(#loc6)) +#loc95 = loc("x2"(#loc7)) +#loc96 = loc("x0"(#loc8)) +#loc97 = loc("x5"(#loc9)) +#loc98 = loc("tmp0"(#loc10)) +#loc99 = loc("tmp0"(#loc11)) +#loc100 = loc("tmp0"(#loc12)) +#loc101 = loc("tmp1"(#loc13)) +#loc102 = loc("tmp1"(#loc14)) +#loc103 = loc("tmp3"(#loc15)) +#loc104 = loc("tmp4"(#loc16)) +#loc105 = loc("tmp5"(#loc17)) +#loc106 = loc("tmp7"(#loc24)) +#loc107 = loc("tmp7"(#loc25)) +#loc108 = loc("tmp7"(#loc26)) +#loc109 = loc("tmp7"(#loc27)) +#loc110 = loc("tmp7"(#loc28)) +#loc111 = loc("tmp8"(#loc29)) +#loc112 = loc("tmp10"(#loc30)) +#loc113 = loc("tmp11"(#loc31)) +#loc114 = loc("tmp12"(#loc32)) +#loc115 = loc("tmp12"(#loc33)) +#loc116 = loc("tmp12"(#loc34)) +#loc117 = loc("tmp13"(#loc35)) +#loc118 = loc("tmp14"(#loc36)) +#loc119 = loc("tmp14"(#loc37)) +#loc120 = loc("tmp14"(#loc38)) +#loc121 = loc("tmp14"(#loc39)) +#loc122 = loc("tmp14"(#loc40)) +#loc123 = loc("tmp14"(#loc41)) +#loc124 = loc("tmp14"(#loc42)) +#loc125 = loc("tmp14"(#loc43)) +#loc126 = loc("tmp15"(#loc44)) +#loc127 = loc("tmp16"(#loc45)) +#loc128 = loc("tmp17"(#loc46)) +#loc129 = loc("tmp18"(#loc47)) +#loc130 = loc("tmp20"(#loc48)) +#loc131 = loc("tmp21"(#loc49)) +#loc132 = loc("tmp21"(#loc50)) +#loc133 = loc("tmp21"(#loc51)) +#loc134 = loc("tmp21"(#loc52)) +#loc135 = loc("tmp21"(#loc53)) +#loc136 = loc("tmp21"(#loc54)) +#loc137 = loc("tmp21"(#loc55)) +#loc138 = loc("tmp21"(#loc56)) +#loc139 = loc("tmp21"(#loc57)) +#loc140 = loc("tmp21"(#loc58)) +#loc141 = loc("tmp22"(#loc59)) +#loc142 = loc("tmp24"(#loc60)) +#loc143 = loc("tmp25"(#loc61)) +#loc144 = loc("tmp27"(#loc68)) +#loc145 = loc("tmp27"(#loc69)) +#loc146 = loc("tmp27"(#loc70)) +#loc147 = loc("tmp27"(#loc71)) +#loc148 = loc("tmp27"(#loc72)) +#loc149 = loc("tmp28"(#loc73)) +#loc150 = loc("tmp29"(#loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..581e7c168fdd4e5668db4a3ee0e816fd0867ea75 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttgir @@ -0,0 +1,232 @@ +#blocked = #ttg.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<512xi1, #blocked> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xbf16, #blocked> loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<512xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32, #blocked> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32, #blocked> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32, #blocked> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32, #blocked> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<512xi64, #blocked> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<512xi64, #blocked> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<512xi64, #blocked> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<512xi64, #blocked> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<512xi64, #blocked> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<512xi64, #blocked> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<512xi64, #blocked> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64, #blocked> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<512xi64, #blocked> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_1 : tensor<512xi64, #blocked> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_1 : tensor<512xi64, #blocked> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<512xi64, #blocked> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<512xi1, #blocked> loc(#loc21) + %3 = arith.xori %xmask_6, %cst : tensor<512xi1, #blocked> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<512xi1, #blocked> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1, #blocked> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<512xi64, #blocked> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<512xi64, #blocked> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<512xf32, #blocked> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<512xi64, #blocked> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64, #blocked> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<512xi64, #blocked> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<512xi64, #blocked> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<512xi64, #blocked> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<512xi64, #blocked> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<512xi1, #blocked> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc113) + %tmp15 = arith.subf %cst_2, %tmp14_31 : tensor<512xf32, #blocked> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<512xi64, #blocked> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<512xi64, #blocked> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<512xi64, #blocked> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<512xi64, #blocked> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<512xi64, #blocked> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<512xi1, #blocked> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst_0 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<512xi1, #blocked>, tensor<512xf32, #blocked> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64, #blocked> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<512xi64, #blocked> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_1 : tensor<512xi64, #blocked> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<512xi64, #blocked> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1, #blocked> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<512xi1, #blocked> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1, #blocked> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<512xi64, #blocked> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<512xi64, #blocked> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr, #blocked> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<512xf32, #blocked> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32, #blocked> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr, #blocked> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<512x!tt.ptr, #blocked> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttir new file mode 100644 index 0000000000000000000000000000000000000000..4e4b5c230ffd5bbff0799e3533d1abe941c3068f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/Z2RWAHMO7VUWQKIIRA5A46JYV2SEXHWLKREQM7TOP6VGUWDXAYAQ/triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1.ttir @@ -0,0 +1,231 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":18:0) +#loc70 = loc("in_ptr0"(#loc)) +#loc71 = loc("in_ptr1"(#loc)) +#loc72 = loc("in_ptr2"(#loc)) +#loc73 = loc("in_ptr3"(#loc)) +#loc74 = loc("out_ptr0"(#loc)) +#loc75 = loc("ks0"(#loc)) +#loc76 = loc("ks1"(#loc)) +#loc77 = loc("ks2"(#loc)) +#loc78 = loc("ks3"(#loc)) +#loc79 = loc("ks4"(#loc)) +#loc80 = loc("xnumel"(#loc)) +module { + tt.func public @triton_poi_fused_add_cat_index_mul_neg_slice_squeeze_unsqueeze_1(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %in_ptr3: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr3"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %ks2: i64 loc("ks2"(#loc)), %ks3: i64 loc("ks3"(#loc)), %ks4: i64 loc("ks4"(#loc)), %xnumel: i32 loc("xnumel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<512xbf16> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<512xi64> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<512xf32> loc(#loc1) + %c-1_i64 = arith.constant -1 : i64 loc(#loc1) + %c2_i64 = arith.constant 2 : i64 loc(#loc1) + %cst_2 = arith.constant dense : tensor<512xi1> loc(#loc1) + %c512_i32 = arith.constant 512 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc81) + %xoffset_3 = arith.muli %xoffset, %c512_i32 : i32 loc(#loc82) + %xindex = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> loc(#loc83) + %xindex_4 = tt.splat %xoffset_3 : i32 -> tensor<512xi32> loc(#loc84) + %xindex_5 = arith.addi %xindex_4, %xindex : tensor<512xi32> loc(#loc84) + %xmask = tt.splat %xnumel : i32 -> tensor<512xi32> loc(#loc85) + %xmask_6 = arith.cmpi slt, %xindex_5, %xmask : tensor<512xi32> loc(#loc85) + %x2 = arith.extsi %xindex_5 : tensor<512xi32> to tensor<512xi64> loc(#loc86) + %x2_7 = tt.splat %ks0 : i64 -> tensor<512xi64> loc(#loc86) + %x2_8 = arith.divsi %x2, %x2_7 : tensor<512xi64> loc(#loc86) + %x2_9 = tt.splat %ks1 : i64 -> tensor<512xi64> loc(#loc87) + %x2_10 = arith.remsi %x2_8, %x2_9 : tensor<512xi64> loc(#loc87) + %x0 = tt.splat %ks3 : i64 -> tensor<512xi64> loc(#loc88) + %x0_11 = arith.remsi %x2, %x0 : tensor<512xi64> loc(#loc88) + %x5 = arith.divsi %x2, %x0 : tensor<512xi64> loc(#loc89) + %tmp0 = tt.splat %in_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc90) + %tmp0_12 = tt.addptr %tmp0, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc90) + %tmp0_13 = tt.load %tmp0_12, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc91) + %tmp0_14 = arith.extf %tmp0_13 : tensor<512xbf16> to tensor<512xf32> loc(#loc92) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc93) + %tmp1_15 = tt.addptr %tmp1, %x2_10 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc93) + %tmp1_16 = tt.load %tmp1_15, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc94) + %tmp3 = tt.splat %ks2 : i64 -> tensor<512xi64> loc(#loc95) + %tmp3_17 = arith.addi %tmp1_16, %tmp3 : tensor<512xi64> loc(#loc95) + %tmp4 = arith.cmpi slt, %tmp1_16, %cst_0 : tensor<512xi64> loc(#loc96) + %tmp5 = arith.select %tmp4, %tmp3_17, %tmp1_16 : tensor<512xi1>, tensor<512xi64> loc(#loc97) + %0 = arith.cmpi sge, %tmp5, %cst_0 : tensor<512xi64> loc(#loc19) + %1 = arith.cmpi slt, %tmp5, %tmp3 : tensor<512xi64> loc(#loc20) + %2 = arith.andi %0, %1 : tensor<512xi1> loc(#loc21) + %3 = arith.xori %xmask_6, %cst_2 : tensor<512xi1> loc(#loc22) + %4 = arith.ori %2, %3 : tensor<512xi1> loc(#loc23) + tt.assert %4, "index out of bounds: 0 <= tmp5 < ks2" : tensor<512xi1> loc(#loc24) + %tmp7 = arith.muli %x0, %tmp5 : tensor<512xi64> loc(#loc98) + %tmp7_18 = arith.addi %x0_11, %tmp7 : tensor<512xi64> loc(#loc99) + %tmp7_19 = tt.splat %in_ptr2 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc100) + %tmp7_20 = tt.addptr %tmp7_19, %tmp7_18 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc100) + %tmp7_21 = tt.load %tmp7_20, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc101) + %tmp7_22 = arith.extf %tmp7_21 : tensor<512xbf16> to tensor<512xf32> loc(#loc102) + %tmp8 = arith.mulf %tmp0_14, %tmp7_22 : tensor<512xf32> loc(#loc103) + %tmp12 = arith.divsi %ks3, %c2_i64 : i64 loc(#loc104) + %tmp12_23 = arith.subi %ks3, %tmp12 : i64 loc(#loc105) + %tmp13 = tt.splat %tmp12_23 : i64 -> tensor<512xi64> loc(#loc106) + %tmp13_24 = arith.cmpi slt, %x0_11, %tmp13 : tensor<512xi64> loc(#loc106) + %tmp14 = arith.muli %x0, %x5 : tensor<512xi64> loc(#loc107) + %tmp14_25 = tt.splat %tmp12 : i64 -> tensor<512xi64> loc(#loc108) + %tmp14_26 = arith.addi %tmp14, %tmp14_25 : tensor<512xi64> loc(#loc108) + %tmp14_27 = arith.addi %tmp14_26, %x0_11 : tensor<512xi64> loc(#loc109) + %tmp14_28 = tt.addptr %tmp0, %tmp14_27 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc110) + %tmp14_29 = arith.andi %tmp13_24, %xmask_6 : tensor<512xi1> loc(#loc111) + %tmp14_30 = tt.load %tmp14_28, %tmp14_29, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc112) + %tmp14_31 = arith.extf %tmp14_30 : tensor<512xbf16> to tensor<512xf32> loc(#loc113) + %tmp15 = arith.subf %cst_1, %tmp14_31 : tensor<512xf32> loc(#loc114) + %tmp18 = arith.cmpi sge, %x0_11, %tmp13 : tensor<512xi64> loc(#loc115) + %tmp21 = arith.muli %ks3, %c-1_i64 : i64 loc(#loc116) + %tmp21_32 = tt.splat %tmp21 : i64 -> tensor<512xi64> loc(#loc117) + %tmp21_33 = arith.addi %x0_11, %tmp21_32 : tensor<512xi64> loc(#loc117) + %tmp21_34 = arith.addi %tmp21_33, %tmp14_25 : tensor<512xi64> loc(#loc118) + %tmp21_35 = arith.addi %tmp14, %tmp21_34 : tensor<512xi64> loc(#loc119) + %tmp21_36 = tt.addptr %tmp0, %tmp21_35 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc120) + %tmp21_37 = arith.andi %tmp18, %xmask_6 : tensor<512xi1> loc(#loc121) + %tmp21_38 = tt.load %tmp21_36, %tmp21_37, %cst evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc122) + %tmp21_39 = arith.extf %tmp21_38 : tensor<512xbf16> to tensor<512xf32> loc(#loc123) + %tmp22 = arith.select %tmp13_24, %tmp15, %tmp21_39 : tensor<512xi1>, tensor<512xf32> loc(#loc135) + %tmp24 = tt.splat %ks4 : i64 -> tensor<512xi64> loc(#loc126) + %tmp24_40 = arith.addi %tmp1_16, %tmp24 : tensor<512xi64> loc(#loc126) + %tmp25 = arith.select %tmp4, %tmp24_40, %tmp1_16 : tensor<512xi1>, tensor<512xi64> loc(#loc127) + %5 = arith.cmpi sge, %tmp25, %cst_0 : tensor<512xi64> loc(#loc55) + %6 = arith.cmpi slt, %tmp25, %tmp24 : tensor<512xi64> loc(#loc56) + %7 = arith.andi %5, %6 : tensor<512xi1> loc(#loc57) + %8 = arith.ori %7, %3 : tensor<512xi1> loc(#loc58) + tt.assert %8, "index out of bounds: 0 <= tmp25 < ks4" : tensor<512xi1> loc(#loc59) + %tmp27 = arith.muli %x0, %tmp25 : tensor<512xi64> loc(#loc128) + %tmp27_41 = arith.addi %x0_11, %tmp27 : tensor<512xi64> loc(#loc129) + %tmp27_42 = tt.splat %in_ptr3 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc130) + %tmp27_43 = tt.addptr %tmp27_42, %tmp27_41 : tensor<512x!tt.ptr>, tensor<512xi64> loc(#loc130) + %tmp27_44 = tt.load %tmp27_43, %xmask_6 evictionPolicy = evict_last : tensor<512x!tt.ptr> loc(#loc131) + %tmp27_45 = arith.extf %tmp27_44 : tensor<512xbf16> to tensor<512xf32> loc(#loc132) + %tmp28 = arith.mulf %tmp22, %tmp27_45 : tensor<512xf32> loc(#loc133) + %tmp29 = arith.addf %tmp8, %tmp28 : tensor<512xf32> loc(#loc134) + %9 = tt.splat %out_ptr0 : !tt.ptr -> tensor<512x!tt.ptr> loc(#loc67) + %10 = tt.addptr %9, %xindex_5 : tensor<512x!tt.ptr>, tensor<512xi32> loc(#loc67) + %11 = arith.truncf %tmp29 : tensor<512xf32> to tensor<512xbf16> loc(#loc68) + tt.store %10, %11, %xmask_6 : tensor<512x!tt.ptr> loc(#loc68) + tt.return loc(#loc69) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":19:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":20:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":21:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":23:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":24:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":25:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:30) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:35) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":26:75) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:30) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":27:35) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":29:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":30:18) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":31:32) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:28) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:44) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:37) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:52) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":32:62) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:39) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:35) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:30) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:46) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":33:86) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":34:18) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:31) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":38:18) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":39:19) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:35) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:41) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:54) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:31) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:68) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:60) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":40:119) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":41:13) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":44:20) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:52) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:47) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:60) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:41) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:31) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:81) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:73) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":47:132) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":48:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":43:35) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":50:19) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":51:34) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:46) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:38) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:54) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":52:64) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:40) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:36) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:31) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:48) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":53:88) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":54:20) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":55:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:25) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:37) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/ym/cymroqqgat67pxc4rnyt5dzn7thy5u4fbc3jl5cbhuh5rx3hz3bb.py":56:4) +#loc81 = loc("xoffset"(#loc2)) +#loc82 = loc("xoffset"(#loc3)) +#loc83 = loc("xindex"(#loc4)) +#loc84 = loc("xindex"(#loc5)) +#loc85 = loc("xmask"(#loc6)) +#loc86 = loc("x2"(#loc7)) +#loc87 = loc("x2"(#loc8)) +#loc88 = loc("x0"(#loc9)) +#loc89 = loc("x5"(#loc10)) +#loc90 = loc("tmp0"(#loc11)) +#loc91 = loc("tmp0"(#loc12)) +#loc92 = loc("tmp0"(#loc13)) +#loc93 = loc("tmp1"(#loc14)) +#loc94 = loc("tmp1"(#loc15)) +#loc95 = loc("tmp3"(#loc16)) +#loc96 = loc("tmp4"(#loc17)) +#loc97 = loc("tmp5"(#loc18)) +#loc98 = loc("tmp7"(#loc25)) +#loc99 = loc("tmp7"(#loc26)) +#loc100 = loc("tmp7"(#loc27)) +#loc101 = loc("tmp7"(#loc28)) +#loc102 = loc("tmp7"(#loc29)) +#loc103 = loc("tmp8"(#loc30)) +#loc104 = loc("tmp12"(#loc31)) +#loc105 = loc("tmp12"(#loc32)) +#loc106 = loc("tmp13"(#loc33)) +#loc107 = loc("tmp14"(#loc34)) +#loc108 = loc("tmp14"(#loc35)) +#loc109 = loc("tmp14"(#loc36)) +#loc110 = loc("tmp14"(#loc37)) +#loc111 = loc("tmp14"(#loc38)) +#loc112 = loc("tmp14"(#loc39)) +#loc113 = loc("tmp14"(#loc40)) +#loc114 = loc("tmp15"(#loc41)) +#loc115 = loc("tmp18"(#loc42)) +#loc116 = loc("tmp21"(#loc43)) +#loc117 = loc("tmp21"(#loc44)) +#loc118 = loc("tmp21"(#loc45)) +#loc119 = loc("tmp21"(#loc46)) +#loc120 = loc("tmp21"(#loc47)) +#loc121 = loc("tmp21"(#loc48)) +#loc122 = loc("tmp21"(#loc49)) +#loc123 = loc("tmp21"(#loc50)) +#loc124 = loc("tmp22"(#loc51)) +#loc125 = loc("tmp17"(#loc52)) +#loc126 = loc("tmp24"(#loc53)) +#loc127 = loc("tmp25"(#loc54)) +#loc128 = loc("tmp27"(#loc60)) +#loc129 = loc("tmp27"(#loc61)) +#loc130 = loc("tmp27"(#loc62)) +#loc131 = loc("tmp27"(#loc63)) +#loc132 = loc("tmp27"(#loc64)) +#loc133 = loc("tmp28"(#loc65)) +#loc134 = loc("tmp29"(#loc66)) +#loc135 = loc(fused[#loc124, #loc125]) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f05409c9c09dd9dca87778d26d42b864bd9f1ecf --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6311d7f97a1c6bf091731e7dfb62c487ddbcff01 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "c900bbb1401e9a840fd6265f230b02b4f4aacf4c472e5161655ab20f66141e12", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..5030686743cf8e7bc74bcb827b18329c6c4a6a05 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,10206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = and i32 %8, 31, !dbg !9 + %10 = lshr i32 %8, 5, !dbg !9 + %11 = and i32 %8, 511, !dbg !9 + %12 = shl nuw nsw i32 %11, 3, !dbg !9 + %13 = or disjoint i32 %12, 28672, !dbg !9 + %14 = shl nuw nsw i32 %11, 2, !dbg !9 + %15 = or disjoint i32 %14, 30720, !dbg !9 + %16 = icmp samesign ult i32 %13, 32000, !dbg !10 + %17 = icmp samesign ult i32 %15, 32000, !dbg !10 + %18 = mul i32 %7, 32000, !dbg !11 + %19 = add i32 %12, %18, !dbg !12 + %20 = add i32 %18, 4096, !dbg !9 + %21 = add i32 %20, %12, !dbg !12 + %22 = add i32 %18, 8192, !dbg !9 + %23 = add i32 %22, %12, !dbg !12 + %24 = add i32 %18, 12288, !dbg !9 + %25 = add i32 %24, %12, !dbg !12 + %26 = add i32 %18, 16384, !dbg !9 + %27 = add i32 %26, %12, !dbg !12 + %28 = add i32 %18, 20480, !dbg !9 + %29 = add i32 %28, %12, !dbg !12 + %30 = add i32 %18, 24576, !dbg !9 + %31 = add i32 %30, %12, !dbg !12 + %32 = add i32 %13, %18, !dbg !12 + %33 = add i32 %14, %18, !dbg !12 + %34 = add i32 %18, 2048, !dbg !9 + %35 = add i32 %34, %14, !dbg !12 + %36 = add i32 %20, %14, !dbg !12 + %37 = add i32 %18, 6144, !dbg !9 + %38 = add i32 %37, %14, !dbg !12 + %39 = add i32 %22, %14, !dbg !12 + %40 = add i32 %18, 10240, !dbg !9 + %41 = add i32 %40, %14, !dbg !12 + %42 = add i32 %24, %14, !dbg !12 + %43 = add i32 %18, 14336, !dbg !9 + %44 = add i32 %43, %14, !dbg !12 + %45 = add i32 %26, %14, !dbg !12 + %46 = add i32 %18, 18432, !dbg !9 + %47 = add i32 %46, %14, !dbg !12 + %48 = add i32 %28, %14, !dbg !12 + %49 = add i32 %18, 22528, !dbg !9 + %50 = add i32 %49, %14, !dbg !12 + %51 = add i32 %30, %14, !dbg !12 + %52 = add i32 %18, 26624, !dbg !9 + %53 = add i32 %52, %14, !dbg !12 + %54 = add i32 %18, 28672, !dbg !9 + %55 = add i32 %54, %14, !dbg !12 + %56 = add i32 %15, %18, !dbg !12 + %57 = sext i32 %19 to i64, !dbg !13 + %58 = getelementptr bfloat, ptr addrspace(1) %0, i64 %57, !dbg !13 + %59 = sext i32 %21 to i64, !dbg !13 + %60 = getelementptr bfloat, ptr addrspace(1) %0, i64 %59, !dbg !13 + %61 = sext i32 %23 to i64, !dbg !13 + %62 = getelementptr bfloat, ptr addrspace(1) %0, i64 %61, !dbg !13 + %63 = sext i32 %25 to i64, !dbg !13 + %64 = getelementptr bfloat, ptr addrspace(1) %0, i64 %63, !dbg !13 + %65 = sext i32 %27 to i64, !dbg !13 + %66 = getelementptr bfloat, ptr addrspace(1) %0, i64 %65, !dbg !13 + %67 = sext i32 %29 to i64, !dbg !13 + %68 = getelementptr bfloat, ptr addrspace(1) %0, i64 %67, !dbg !13 + %69 = sext i32 %31 to i64, !dbg !13 + %70 = getelementptr bfloat, ptr addrspace(1) %0, i64 %69, !dbg !13 + %71 = sext i32 %32 to i64, !dbg !13 + %72 = getelementptr bfloat, ptr addrspace(1) %0, i64 %71, !dbg !13 + %73 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %74 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %58, i64 %73, i1 true) #6, !dbg !14 + %75 = extractvalue { i32, i32, i32, i32 } %74, 0, !dbg !14 + %76 = bitcast i32 %75 to <2 x bfloat>, !dbg !14 + %77 = extractvalue { i32, i32, i32, i32 } %74, 1, !dbg !14 + %78 = bitcast i32 %77 to <2 x bfloat>, !dbg !14 + %79 = extractvalue { i32, i32, i32, i32 } %74, 2, !dbg !14 + %80 = bitcast i32 %79 to <2 x bfloat>, !dbg !14 + %81 = extractvalue { i32, i32, i32, i32 } %74, 3, !dbg !14 + %82 = bitcast i32 %81 to <2 x bfloat>, !dbg !14 + %83 = extractelement <2 x bfloat> %76, i64 0, !dbg !14 + %84 = extractelement <2 x bfloat> %76, i64 1, !dbg !14 + %85 = extractelement <2 x bfloat> %78, i64 0, !dbg !14 + %86 = extractelement <2 x bfloat> %78, i64 1, !dbg !14 + %87 = extractelement <2 x bfloat> %80, i64 0, !dbg !14 + %88 = extractelement <2 x bfloat> %80, i64 1, !dbg !14 + %89 = extractelement <2 x bfloat> %82, i64 0, !dbg !14 + %90 = extractelement <2 x bfloat> %82, i64 1, !dbg !14 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %92 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %60, i64 %91, i1 true) #6, !dbg !14 + %93 = extractvalue { i32, i32, i32, i32 } %92, 0, !dbg !14 + %94 = bitcast i32 %93 to <2 x bfloat>, !dbg !14 + %95 = extractvalue { i32, i32, i32, i32 } %92, 1, !dbg !14 + %96 = bitcast i32 %95 to <2 x bfloat>, !dbg !14 + %97 = extractvalue { i32, i32, i32, i32 } %92, 2, !dbg !14 + %98 = bitcast i32 %97 to <2 x bfloat>, !dbg !14 + %99 = extractvalue { i32, i32, i32, i32 } %92, 3, !dbg !14 + %100 = bitcast i32 %99 to <2 x bfloat>, !dbg !14 + %101 = extractelement <2 x bfloat> %94, i64 0, !dbg !14 + %102 = extractelement <2 x bfloat> %94, i64 1, !dbg !14 + %103 = extractelement <2 x bfloat> %96, i64 0, !dbg !14 + %104 = extractelement <2 x bfloat> %96, i64 1, !dbg !14 + %105 = extractelement <2 x bfloat> %98, i64 0, !dbg !14 + %106 = extractelement <2 x bfloat> %98, i64 1, !dbg !14 + %107 = extractelement <2 x bfloat> %100, i64 0, !dbg !14 + %108 = extractelement <2 x bfloat> %100, i64 1, !dbg !14 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %110 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %62, i64 %109, i1 true) #6, !dbg !14 + %111 = extractvalue { i32, i32, i32, i32 } %110, 0, !dbg !14 + %112 = bitcast i32 %111 to <2 x bfloat>, !dbg !14 + %113 = extractvalue { i32, i32, i32, i32 } %110, 1, !dbg !14 + %114 = bitcast i32 %113 to <2 x bfloat>, !dbg !14 + %115 = extractvalue { i32, i32, i32, i32 } %110, 2, !dbg !14 + %116 = bitcast i32 %115 to <2 x bfloat>, !dbg !14 + %117 = extractvalue { i32, i32, i32, i32 } %110, 3, !dbg !14 + %118 = bitcast i32 %117 to <2 x bfloat>, !dbg !14 + %119 = extractelement <2 x bfloat> %112, i64 0, !dbg !14 + %120 = extractelement <2 x bfloat> %112, i64 1, !dbg !14 + %121 = extractelement <2 x bfloat> %114, i64 0, !dbg !14 + %122 = extractelement <2 x bfloat> %114, i64 1, !dbg !14 + %123 = extractelement <2 x bfloat> %116, i64 0, !dbg !14 + %124 = extractelement <2 x bfloat> %116, i64 1, !dbg !14 + %125 = extractelement <2 x bfloat> %118, i64 0, !dbg !14 + %126 = extractelement <2 x bfloat> %118, i64 1, !dbg !14 + %127 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %64, i64 %127, i1 true) #6, !dbg !14 + %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !14 + %130 = bitcast i32 %129 to <2 x bfloat>, !dbg !14 + %131 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !14 + %132 = bitcast i32 %131 to <2 x bfloat>, !dbg !14 + %133 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !14 + %134 = bitcast i32 %133 to <2 x bfloat>, !dbg !14 + %135 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !14 + %136 = bitcast i32 %135 to <2 x bfloat>, !dbg !14 + %137 = extractelement <2 x bfloat> %130, i64 0, !dbg !14 + %138 = extractelement <2 x bfloat> %130, i64 1, !dbg !14 + %139 = extractelement <2 x bfloat> %132, i64 0, !dbg !14 + %140 = extractelement <2 x bfloat> %132, i64 1, !dbg !14 + %141 = extractelement <2 x bfloat> %134, i64 0, !dbg !14 + %142 = extractelement <2 x bfloat> %134, i64 1, !dbg !14 + %143 = extractelement <2 x bfloat> %136, i64 0, !dbg !14 + %144 = extractelement <2 x bfloat> %136, i64 1, !dbg !14 + %145 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %146 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %66, i64 %145, i1 true) #6, !dbg !14 + %147 = extractvalue { i32, i32, i32, i32 } %146, 0, !dbg !14 + %148 = bitcast i32 %147 to <2 x bfloat>, !dbg !14 + %149 = extractvalue { i32, i32, i32, i32 } %146, 1, !dbg !14 + %150 = bitcast i32 %149 to <2 x bfloat>, !dbg !14 + %151 = extractvalue { i32, i32, i32, i32 } %146, 2, !dbg !14 + %152 = bitcast i32 %151 to <2 x bfloat>, !dbg !14 + %153 = extractvalue { i32, i32, i32, i32 } %146, 3, !dbg !14 + %154 = bitcast i32 %153 to <2 x bfloat>, !dbg !14 + %155 = extractelement <2 x bfloat> %148, i64 0, !dbg !14 + %156 = extractelement <2 x bfloat> %148, i64 1, !dbg !14 + %157 = extractelement <2 x bfloat> %150, i64 0, !dbg !14 + %158 = extractelement <2 x bfloat> %150, i64 1, !dbg !14 + %159 = extractelement <2 x bfloat> %152, i64 0, !dbg !14 + %160 = extractelement <2 x bfloat> %152, i64 1, !dbg !14 + %161 = extractelement <2 x bfloat> %154, i64 0, !dbg !14 + %162 = extractelement <2 x bfloat> %154, i64 1, !dbg !14 + %163 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %164 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %68, i64 %163, i1 true) #6, !dbg !14 + %165 = extractvalue { i32, i32, i32, i32 } %164, 0, !dbg !14 + %166 = bitcast i32 %165 to <2 x bfloat>, !dbg !14 + %167 = extractvalue { i32, i32, i32, i32 } %164, 1, !dbg !14 + %168 = bitcast i32 %167 to <2 x bfloat>, !dbg !14 + %169 = extractvalue { i32, i32, i32, i32 } %164, 2, !dbg !14 + %170 = bitcast i32 %169 to <2 x bfloat>, !dbg !14 + %171 = extractvalue { i32, i32, i32, i32 } %164, 3, !dbg !14 + %172 = bitcast i32 %171 to <2 x bfloat>, !dbg !14 + %173 = extractelement <2 x bfloat> %166, i64 0, !dbg !14 + %174 = extractelement <2 x bfloat> %166, i64 1, !dbg !14 + %175 = extractelement <2 x bfloat> %168, i64 0, !dbg !14 + %176 = extractelement <2 x bfloat> %168, i64 1, !dbg !14 + %177 = extractelement <2 x bfloat> %170, i64 0, !dbg !14 + %178 = extractelement <2 x bfloat> %170, i64 1, !dbg !14 + %179 = extractelement <2 x bfloat> %172, i64 0, !dbg !14 + %180 = extractelement <2 x bfloat> %172, i64 1, !dbg !14 + %181 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %182 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %70, i64 %181, i1 true) #6, !dbg !14 + %183 = extractvalue { i32, i32, i32, i32 } %182, 0, !dbg !14 + %184 = bitcast i32 %183 to <2 x bfloat>, !dbg !14 + %185 = extractvalue { i32, i32, i32, i32 } %182, 1, !dbg !14 + %186 = bitcast i32 %185 to <2 x bfloat>, !dbg !14 + %187 = extractvalue { i32, i32, i32, i32 } %182, 2, !dbg !14 + %188 = bitcast i32 %187 to <2 x bfloat>, !dbg !14 + %189 = extractvalue { i32, i32, i32, i32 } %182, 3, !dbg !14 + %190 = bitcast i32 %189 to <2 x bfloat>, !dbg !14 + %191 = extractelement <2 x bfloat> %184, i64 0, !dbg !14 + %192 = extractelement <2 x bfloat> %184, i64 1, !dbg !14 + %193 = extractelement <2 x bfloat> %186, i64 0, !dbg !14 + %194 = extractelement <2 x bfloat> %186, i64 1, !dbg !14 + %195 = extractelement <2 x bfloat> %188, i64 0, !dbg !14 + %196 = extractelement <2 x bfloat> %188, i64 1, !dbg !14 + %197 = extractelement <2 x bfloat> %190, i64 0, !dbg !14 + %198 = extractelement <2 x bfloat> %190, i64 1, !dbg !14 + %199 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !14 + %200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %72, i64 %199, i1 %16) #6, !dbg !14 + %201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !14 + %202 = bitcast i32 %201 to <2 x bfloat>, !dbg !14 + %203 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !14 + %204 = bitcast i32 %203 to <2 x bfloat>, !dbg !14 + %205 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !14 + %206 = bitcast i32 %205 to <2 x bfloat>, !dbg !14 + %207 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !14 + %208 = bitcast i32 %207 to <2 x bfloat>, !dbg !14 + %209 = extractelement <2 x bfloat> %202, i64 0, !dbg !14 + %210 = extractelement <2 x bfloat> %202, i64 1, !dbg !14 + %211 = fpext bfloat %83 to float, !dbg !15 + %212 = fpext bfloat %84 to float, !dbg !15 + %213 = fpext bfloat %85 to float, !dbg !15 + %214 = fpext bfloat %86 to float, !dbg !15 + %215 = fpext bfloat %87 to float, !dbg !15 + %216 = fpext bfloat %88 to float, !dbg !15 + %217 = fpext bfloat %89 to float, !dbg !15 + %218 = fpext bfloat %90 to float, !dbg !15 + %219 = fpext bfloat %101 to float, !dbg !15 + %220 = fpext bfloat %102 to float, !dbg !15 + %221 = fpext bfloat %103 to float, !dbg !15 + %222 = fpext bfloat %104 to float, !dbg !15 + %223 = fpext bfloat %105 to float, !dbg !15 + %224 = fpext bfloat %106 to float, !dbg !15 + %225 = fpext bfloat %107 to float, !dbg !15 + %226 = fpext bfloat %108 to float, !dbg !15 + %227 = fpext bfloat %119 to float, !dbg !15 + %228 = fpext bfloat %120 to float, !dbg !15 + %229 = fpext bfloat %121 to float, !dbg !15 + %230 = fpext bfloat %122 to float, !dbg !15 + %231 = fpext bfloat %123 to float, !dbg !15 + %232 = fpext bfloat %124 to float, !dbg !15 + %233 = fpext bfloat %125 to float, !dbg !15 + %234 = fpext bfloat %126 to float, !dbg !15 + %235 = fpext bfloat %137 to float, !dbg !15 + %236 = fpext bfloat %138 to float, !dbg !15 + %237 = fpext bfloat %139 to float, !dbg !15 + %238 = fpext bfloat %140 to float, !dbg !15 + %239 = fpext bfloat %141 to float, !dbg !15 + %240 = fpext bfloat %142 to float, !dbg !15 + %241 = fpext bfloat %143 to float, !dbg !15 + %242 = fpext bfloat %144 to float, !dbg !15 + %243 = fpext bfloat %155 to float, !dbg !15 + %244 = fpext bfloat %156 to float, !dbg !15 + %245 = fpext bfloat %157 to float, !dbg !15 + %246 = fpext bfloat %158 to float, !dbg !15 + %247 = fpext bfloat %159 to float, !dbg !15 + %248 = fpext bfloat %160 to float, !dbg !15 + %249 = fpext bfloat %161 to float, !dbg !15 + %250 = fpext bfloat %162 to float, !dbg !15 + %251 = fpext bfloat %173 to float, !dbg !15 + %252 = fpext bfloat %174 to float, !dbg !15 + %253 = fpext bfloat %175 to float, !dbg !15 + %254 = fpext bfloat %176 to float, !dbg !15 + %255 = fpext bfloat %177 to float, !dbg !15 + %256 = fpext bfloat %178 to float, !dbg !15 + %257 = fpext bfloat %179 to float, !dbg !15 + %258 = fpext bfloat %180 to float, !dbg !15 + %259 = fpext bfloat %191 to float, !dbg !15 + %260 = fpext bfloat %192 to float, !dbg !15 + %261 = fpext bfloat %193 to float, !dbg !15 + %262 = fpext bfloat %194 to float, !dbg !15 + %263 = fpext bfloat %195 to float, !dbg !15 + %264 = fpext bfloat %196 to float, !dbg !15 + %265 = fpext bfloat %197 to float, !dbg !15 + %266 = fpext bfloat %198 to float, !dbg !15 + %267 = fpext bfloat %209 to float, !dbg !15 + %268 = fpext bfloat %210 to float, !dbg !15 + %269 = extractelement <2 x bfloat> %204, i64 0, !dbg !15 + %270 = fpext bfloat %269 to float, !dbg !15 + %271 = extractelement <2 x bfloat> %204, i64 1, !dbg !15 + %272 = fpext bfloat %271 to float, !dbg !15 + %273 = extractelement <2 x bfloat> %206, i64 0, !dbg !15 + %274 = fpext bfloat %273 to float, !dbg !15 + %275 = extractelement <2 x bfloat> %206, i64 1, !dbg !15 + %276 = fpext bfloat %275 to float, !dbg !15 + %277 = extractelement <2 x bfloat> %208, i64 0, !dbg !15 + %278 = fpext bfloat %277 to float, !dbg !15 + %279 = extractelement <2 x bfloat> %208, i64 1, !dbg !15 + %280 = fpext bfloat %279 to float, !dbg !15 + %281 = fcmp oeq bfloat %83, 0xRFF80, !dbg !16 + %282 = fcmp oeq bfloat %84, 0xRFF80, !dbg !16 + %283 = fcmp oeq bfloat %85, 0xRFF80, !dbg !16 + %284 = fcmp oeq bfloat %86, 0xRFF80, !dbg !16 + %285 = fcmp oeq bfloat %87, 0xRFF80, !dbg !16 + %286 = fcmp oeq bfloat %88, 0xRFF80, !dbg !16 + %287 = fcmp oeq bfloat %89, 0xRFF80, !dbg !16 + %288 = fcmp oeq bfloat %90, 0xRFF80, !dbg !16 + %289 = fcmp oeq bfloat %101, 0xRFF80, !dbg !16 + %290 = fcmp oeq bfloat %102, 0xRFF80, !dbg !16 + %291 = fcmp oeq bfloat %103, 0xRFF80, !dbg !16 + %292 = fcmp oeq bfloat %104, 0xRFF80, !dbg !16 + %293 = fcmp oeq bfloat %105, 0xRFF80, !dbg !16 + %294 = fcmp oeq bfloat %106, 0xRFF80, !dbg !16 + %295 = fcmp oeq bfloat %107, 0xRFF80, !dbg !16 + %296 = fcmp oeq bfloat %108, 0xRFF80, !dbg !16 + %297 = fcmp oeq bfloat %119, 0xRFF80, !dbg !16 + %298 = fcmp oeq bfloat %120, 0xRFF80, !dbg !16 + %299 = fcmp oeq bfloat %121, 0xRFF80, !dbg !16 + %300 = fcmp oeq bfloat %122, 0xRFF80, !dbg !16 + %301 = fcmp oeq bfloat %123, 0xRFF80, !dbg !16 + %302 = fcmp oeq bfloat %124, 0xRFF80, !dbg !16 + %303 = fcmp oeq bfloat %125, 0xRFF80, !dbg !16 + %304 = fcmp oeq bfloat %126, 0xRFF80, !dbg !16 + %305 = fcmp oeq bfloat %137, 0xRFF80, !dbg !16 + %306 = fcmp oeq bfloat %138, 0xRFF80, !dbg !16 + %307 = fcmp oeq bfloat %139, 0xRFF80, !dbg !16 + %308 = fcmp oeq bfloat %140, 0xRFF80, !dbg !16 + %309 = fcmp oeq bfloat %141, 0xRFF80, !dbg !16 + %310 = fcmp oeq bfloat %142, 0xRFF80, !dbg !16 + %311 = fcmp oeq bfloat %143, 0xRFF80, !dbg !16 + %312 = fcmp oeq bfloat %144, 0xRFF80, !dbg !16 + %313 = fcmp oeq bfloat %155, 0xRFF80, !dbg !16 + %314 = fcmp oeq bfloat %156, 0xRFF80, !dbg !16 + %315 = fcmp oeq bfloat %157, 0xRFF80, !dbg !16 + %316 = fcmp oeq bfloat %158, 0xRFF80, !dbg !16 + %317 = fcmp oeq bfloat %159, 0xRFF80, !dbg !16 + %318 = fcmp oeq bfloat %160, 0xRFF80, !dbg !16 + %319 = fcmp oeq bfloat %161, 0xRFF80, !dbg !16 + %320 = fcmp oeq bfloat %162, 0xRFF80, !dbg !16 + %321 = fcmp oeq bfloat %173, 0xRFF80, !dbg !16 + %322 = fcmp oeq bfloat %174, 0xRFF80, !dbg !16 + %323 = fcmp oeq bfloat %175, 0xRFF80, !dbg !16 + %324 = fcmp oeq bfloat %176, 0xRFF80, !dbg !16 + %325 = fcmp oeq bfloat %177, 0xRFF80, !dbg !16 + %326 = fcmp oeq bfloat %178, 0xRFF80, !dbg !16 + %327 = fcmp oeq bfloat %179, 0xRFF80, !dbg !16 + %328 = fcmp oeq bfloat %180, 0xRFF80, !dbg !16 + %329 = fcmp oeq bfloat %191, 0xRFF80, !dbg !16 + %330 = fcmp oeq bfloat %192, 0xRFF80, !dbg !16 + %331 = fcmp oeq bfloat %193, 0xRFF80, !dbg !16 + %332 = fcmp oeq bfloat %194, 0xRFF80, !dbg !16 + %333 = fcmp oeq bfloat %195, 0xRFF80, !dbg !16 + %334 = fcmp oeq bfloat %196, 0xRFF80, !dbg !16 + %335 = fcmp oeq bfloat %197, 0xRFF80, !dbg !16 + %336 = fcmp oeq bfloat %198, 0xRFF80, !dbg !16 + %337 = fcmp oeq bfloat %209, 0xRFF80, !dbg !16 + %338 = fcmp oeq bfloat %210, 0xRFF80, !dbg !16 + %339 = fsub float 0xFFF0000000000000, %211, !dbg !20 + %340 = fsub float 0xFFF0000000000000, %212, !dbg !20 + %341 = fsub float 0xFFF0000000000000, %213, !dbg !20 + %342 = fsub float 0xFFF0000000000000, %214, !dbg !20 + %343 = fsub float 0xFFF0000000000000, %215, !dbg !20 + %344 = fsub float 0xFFF0000000000000, %216, !dbg !20 + %345 = fsub float 0xFFF0000000000000, %217, !dbg !20 + %346 = fsub float 0xFFF0000000000000, %218, !dbg !20 + %347 = fsub float 0xFFF0000000000000, %219, !dbg !20 + %348 = fsub float 0xFFF0000000000000, %220, !dbg !20 + %349 = fsub float 0xFFF0000000000000, %221, !dbg !20 + %350 = fsub float 0xFFF0000000000000, %222, !dbg !20 + %351 = fsub float 0xFFF0000000000000, %223, !dbg !20 + %352 = fsub float 0xFFF0000000000000, %224, !dbg !20 + %353 = fsub float 0xFFF0000000000000, %225, !dbg !20 + %354 = fsub float 0xFFF0000000000000, %226, !dbg !20 + %355 = fsub float 0xFFF0000000000000, %227, !dbg !20 + %356 = fsub float 0xFFF0000000000000, %228, !dbg !20 + %357 = fsub float 0xFFF0000000000000, %229, !dbg !20 + %358 = fsub float 0xFFF0000000000000, %230, !dbg !20 + %359 = fsub float 0xFFF0000000000000, %231, !dbg !20 + %360 = fsub float 0xFFF0000000000000, %232, !dbg !20 + %361 = fsub float 0xFFF0000000000000, %233, !dbg !20 + %362 = fsub float 0xFFF0000000000000, %234, !dbg !20 + %363 = fsub float 0xFFF0000000000000, %235, !dbg !20 + %364 = fsub float 0xFFF0000000000000, %236, !dbg !20 + %365 = fsub float 0xFFF0000000000000, %237, !dbg !20 + %366 = fsub float 0xFFF0000000000000, %238, !dbg !20 + %367 = fsub float 0xFFF0000000000000, %239, !dbg !20 + %368 = fsub float 0xFFF0000000000000, %240, !dbg !20 + %369 = fsub float 0xFFF0000000000000, %241, !dbg !20 + %370 = fsub float 0xFFF0000000000000, %242, !dbg !20 + %371 = fsub float 0xFFF0000000000000, %243, !dbg !20 + %372 = fsub float 0xFFF0000000000000, %244, !dbg !20 + %373 = fsub float 0xFFF0000000000000, %245, !dbg !20 + %374 = fsub float 0xFFF0000000000000, %246, !dbg !20 + %375 = fsub float 0xFFF0000000000000, %247, !dbg !20 + %376 = fsub float 0xFFF0000000000000, %248, !dbg !20 + %377 = fsub float 0xFFF0000000000000, %249, !dbg !20 + %378 = fsub float 0xFFF0000000000000, %250, !dbg !20 + %379 = fsub float 0xFFF0000000000000, %251, !dbg !20 + %380 = fsub float 0xFFF0000000000000, %252, !dbg !20 + %381 = fsub float 0xFFF0000000000000, %253, !dbg !20 + %382 = fsub float 0xFFF0000000000000, %254, !dbg !20 + %383 = fsub float 0xFFF0000000000000, %255, !dbg !20 + %384 = fsub float 0xFFF0000000000000, %256, !dbg !20 + %385 = fsub float 0xFFF0000000000000, %257, !dbg !20 + %386 = fsub float 0xFFF0000000000000, %258, !dbg !20 + %387 = fsub float 0xFFF0000000000000, %259, !dbg !20 + %388 = fsub float 0xFFF0000000000000, %260, !dbg !20 + %389 = fsub float 0xFFF0000000000000, %261, !dbg !20 + %390 = fsub float 0xFFF0000000000000, %262, !dbg !20 + %391 = fsub float 0xFFF0000000000000, %263, !dbg !20 + %392 = fsub float 0xFFF0000000000000, %264, !dbg !20 + %393 = fsub float 0xFFF0000000000000, %265, !dbg !20 + %394 = fsub float 0xFFF0000000000000, %266, !dbg !20 + %395 = fsub float 0xFFF0000000000000, %267, !dbg !20 + %396 = fsub float 0xFFF0000000000000, %268, !dbg !20 + %397 = fsub float 0xFFF0000000000000, %270, !dbg !20 + %398 = fsub float 0xFFF0000000000000, %272, !dbg !20 + %399 = fsub float 0xFFF0000000000000, %274, !dbg !20 + %400 = fsub float 0xFFF0000000000000, %276, !dbg !20 + %401 = fsub float 0xFFF0000000000000, %278, !dbg !20 + %402 = fsub float 0xFFF0000000000000, %280, !dbg !20 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i = icmp eq i32 %403, 0, !dbg !21 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i = select i1 %.not.i, float %405, float %404, !dbg !21 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i = icmp eq i32 %406, 0, !dbg !21 + %407 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !21 + %408 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !21 + %.03.i = select i1 %.not1.i, float %408, float %407, !dbg !21 + %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i = icmp eq i32 %409, 0, !dbg !21 + %410 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %411 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i = select i1 %.not2.i, float %411, float %410, !dbg !21 + %412 = fadd float %.04.i, 0xC168000FE0000000, !dbg !21 + %413 = fneg float %412, !dbg !21 + %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i = icmp eq i32 %414, 0, !dbg !21 + %415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3FF7154760000000, float %413) #6, !dbg !21 + %416 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3FF7154760000000, float %413) #6, !dbg !21 + %.0.i = select i1 %.not3.i, float %416, float %415, !dbg !21 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i = icmp eq i32 %417, 0, !dbg !21 + %418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %339, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !21 + %419 = tail call float @llvm.nvvm.fma.rn.f(float %339, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !21 + %.01.i = select i1 %.not4.i, float %419, float %418, !dbg !21 + %420 = bitcast float %.04.i to i32, !dbg !21 + %421 = shl i32 %420, 23, !dbg !21 + %422 = bitcast i32 %421 to float, !dbg !21 + %423 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !21 + %424 = fmul float %423, %422, !dbg !21 + %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i127 = icmp eq i32 %425, 0, !dbg !21 + %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %427 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i128 = select i1 %.not.i127, float %427, float %426, !dbg !21 + %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i129 = icmp eq i32 %428, 0, !dbg !21 + %429 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i128) #6, !dbg !21 + %430 = tail call float @llvm.nvvm.saturate.f(float %.02.i128) #6, !dbg !21 + %.03.i130 = select i1 %.not1.i129, float %430, float %429, !dbg !21 + %431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i131 = icmp eq i32 %431, 0, !dbg !21 + %432 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %433 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i132 = select i1 %.not2.i131, float %433, float %432, !dbg !21 + %434 = fadd float %.04.i132, 0xC168000FE0000000, !dbg !21 + %435 = fneg float %434, !dbg !21 + %436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i133 = icmp eq i32 %436, 0, !dbg !21 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3FF7154760000000, float %435) #6, !dbg !21 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3FF7154760000000, float %435) #6, !dbg !21 + %.0.i134 = select i1 %.not3.i133, float %438, float %437, !dbg !21 + %439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i135 = icmp eq i32 %439, 0, !dbg !21 + %440 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %340, float 0x3E54AE0C00000000, float %.0.i134) #6, !dbg !21 + %441 = tail call float @llvm.nvvm.fma.rn.f(float %340, float 0x3E54AE0C00000000, float %.0.i134) #6, !dbg !21 + %.01.i136 = select i1 %.not4.i135, float %441, float %440, !dbg !21 + %442 = bitcast float %.04.i132 to i32, !dbg !21 + %443 = shl i32 %442, 23, !dbg !21 + %444 = bitcast i32 %443 to float, !dbg !21 + %445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i136) #6, !dbg !21 + %446 = fmul float %445, %444, !dbg !21 + %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i137 = icmp eq i32 %447, 0, !dbg !21 + %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %449 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i138 = select i1 %.not.i137, float %449, float %448, !dbg !21 + %450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i139 = icmp eq i32 %450, 0, !dbg !21 + %451 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i138) #6, !dbg !21 + %452 = tail call float @llvm.nvvm.saturate.f(float %.02.i138) #6, !dbg !21 + %.03.i140 = select i1 %.not1.i139, float %452, float %451, !dbg !21 + %453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i141 = icmp eq i32 %453, 0, !dbg !21 + %454 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %455 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i142 = select i1 %.not2.i141, float %455, float %454, !dbg !21 + %456 = fadd float %.04.i142, 0xC168000FE0000000, !dbg !21 + %457 = fneg float %456, !dbg !21 + %458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i143 = icmp eq i32 %458, 0, !dbg !21 + %459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3FF7154760000000, float %457) #6, !dbg !21 + %460 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3FF7154760000000, float %457) #6, !dbg !21 + %.0.i144 = select i1 %.not3.i143, float %460, float %459, !dbg !21 + %461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i145 = icmp eq i32 %461, 0, !dbg !21 + %462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %341, float 0x3E54AE0C00000000, float %.0.i144) #6, !dbg !21 + %463 = tail call float @llvm.nvvm.fma.rn.f(float %341, float 0x3E54AE0C00000000, float %.0.i144) #6, !dbg !21 + %.01.i146 = select i1 %.not4.i145, float %463, float %462, !dbg !21 + %464 = bitcast float %.04.i142 to i32, !dbg !21 + %465 = shl i32 %464, 23, !dbg !21 + %466 = bitcast i32 %465 to float, !dbg !21 + %467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i146) #6, !dbg !21 + %468 = fmul float %467, %466, !dbg !21 + %469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i147 = icmp eq i32 %469, 0, !dbg !21 + %470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %471 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i148 = select i1 %.not.i147, float %471, float %470, !dbg !21 + %472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i149 = icmp eq i32 %472, 0, !dbg !21 + %473 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i148) #6, !dbg !21 + %474 = tail call float @llvm.nvvm.saturate.f(float %.02.i148) #6, !dbg !21 + %.03.i150 = select i1 %.not1.i149, float %474, float %473, !dbg !21 + %475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i151 = icmp eq i32 %475, 0, !dbg !21 + %476 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %477 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i152 = select i1 %.not2.i151, float %477, float %476, !dbg !21 + %478 = fadd float %.04.i152, 0xC168000FE0000000, !dbg !21 + %479 = fneg float %478, !dbg !21 + %480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i153 = icmp eq i32 %480, 0, !dbg !21 + %481 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3FF7154760000000, float %479) #6, !dbg !21 + %482 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3FF7154760000000, float %479) #6, !dbg !21 + %.0.i154 = select i1 %.not3.i153, float %482, float %481, !dbg !21 + %483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i155 = icmp eq i32 %483, 0, !dbg !21 + %484 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %342, float 0x3E54AE0C00000000, float %.0.i154) #6, !dbg !21 + %485 = tail call float @llvm.nvvm.fma.rn.f(float %342, float 0x3E54AE0C00000000, float %.0.i154) #6, !dbg !21 + %.01.i156 = select i1 %.not4.i155, float %485, float %484, !dbg !21 + %486 = bitcast float %.04.i152 to i32, !dbg !21 + %487 = shl i32 %486, 23, !dbg !21 + %488 = bitcast i32 %487 to float, !dbg !21 + %489 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i156) #6, !dbg !21 + %490 = fmul float %489, %488, !dbg !21 + %491 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i157 = icmp eq i32 %491, 0, !dbg !21 + %492 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %493 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i158 = select i1 %.not.i157, float %493, float %492, !dbg !21 + %494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i159 = icmp eq i32 %494, 0, !dbg !21 + %495 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i158) #6, !dbg !21 + %496 = tail call float @llvm.nvvm.saturate.f(float %.02.i158) #6, !dbg !21 + %.03.i160 = select i1 %.not1.i159, float %496, float %495, !dbg !21 + %497 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i161 = icmp eq i32 %497, 0, !dbg !21 + %498 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %499 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i162 = select i1 %.not2.i161, float %499, float %498, !dbg !21 + %500 = fadd float %.04.i162, 0xC168000FE0000000, !dbg !21 + %501 = fneg float %500, !dbg !21 + %502 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i163 = icmp eq i32 %502, 0, !dbg !21 + %503 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3FF7154760000000, float %501) #6, !dbg !21 + %504 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3FF7154760000000, float %501) #6, !dbg !21 + %.0.i164 = select i1 %.not3.i163, float %504, float %503, !dbg !21 + %505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i165 = icmp eq i32 %505, 0, !dbg !21 + %506 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %343, float 0x3E54AE0C00000000, float %.0.i164) #6, !dbg !21 + %507 = tail call float @llvm.nvvm.fma.rn.f(float %343, float 0x3E54AE0C00000000, float %.0.i164) #6, !dbg !21 + %.01.i166 = select i1 %.not4.i165, float %507, float %506, !dbg !21 + %508 = bitcast float %.04.i162 to i32, !dbg !21 + %509 = shl i32 %508, 23, !dbg !21 + %510 = bitcast i32 %509 to float, !dbg !21 + %511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i166) #6, !dbg !21 + %512 = fmul float %511, %510, !dbg !21 + %513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i167 = icmp eq i32 %513, 0, !dbg !21 + %514 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %515 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i168 = select i1 %.not.i167, float %515, float %514, !dbg !21 + %516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i169 = icmp eq i32 %516, 0, !dbg !21 + %517 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i168) #6, !dbg !21 + %518 = tail call float @llvm.nvvm.saturate.f(float %.02.i168) #6, !dbg !21 + %.03.i170 = select i1 %.not1.i169, float %518, float %517, !dbg !21 + %519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i171 = icmp eq i32 %519, 0, !dbg !21 + %520 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %521 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i172 = select i1 %.not2.i171, float %521, float %520, !dbg !21 + %522 = fadd float %.04.i172, 0xC168000FE0000000, !dbg !21 + %523 = fneg float %522, !dbg !21 + %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i173 = icmp eq i32 %524, 0, !dbg !21 + %525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3FF7154760000000, float %523) #6, !dbg !21 + %526 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3FF7154760000000, float %523) #6, !dbg !21 + %.0.i174 = select i1 %.not3.i173, float %526, float %525, !dbg !21 + %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i175 = icmp eq i32 %527, 0, !dbg !21 + %528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %344, float 0x3E54AE0C00000000, float %.0.i174) #6, !dbg !21 + %529 = tail call float @llvm.nvvm.fma.rn.f(float %344, float 0x3E54AE0C00000000, float %.0.i174) #6, !dbg !21 + %.01.i176 = select i1 %.not4.i175, float %529, float %528, !dbg !21 + %530 = bitcast float %.04.i172 to i32, !dbg !21 + %531 = shl i32 %530, 23, !dbg !21 + %532 = bitcast i32 %531 to float, !dbg !21 + %533 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i176) #6, !dbg !21 + %534 = fmul float %533, %532, !dbg !21 + %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i177 = icmp eq i32 %535, 0, !dbg !21 + %536 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %537 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i178 = select i1 %.not.i177, float %537, float %536, !dbg !21 + %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i179 = icmp eq i32 %538, 0, !dbg !21 + %539 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i178) #6, !dbg !21 + %540 = tail call float @llvm.nvvm.saturate.f(float %.02.i178) #6, !dbg !21 + %.03.i180 = select i1 %.not1.i179, float %540, float %539, !dbg !21 + %541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i181 = icmp eq i32 %541, 0, !dbg !21 + %542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i182 = select i1 %.not2.i181, float %543, float %542, !dbg !21 + %544 = fadd float %.04.i182, 0xC168000FE0000000, !dbg !21 + %545 = fneg float %544, !dbg !21 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i183 = icmp eq i32 %546, 0, !dbg !21 + %547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3FF7154760000000, float %545) #6, !dbg !21 + %548 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3FF7154760000000, float %545) #6, !dbg !21 + %.0.i184 = select i1 %.not3.i183, float %548, float %547, !dbg !21 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i185 = icmp eq i32 %549, 0, !dbg !21 + %550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %345, float 0x3E54AE0C00000000, float %.0.i184) #6, !dbg !21 + %551 = tail call float @llvm.nvvm.fma.rn.f(float %345, float 0x3E54AE0C00000000, float %.0.i184) #6, !dbg !21 + %.01.i186 = select i1 %.not4.i185, float %551, float %550, !dbg !21 + %552 = bitcast float %.04.i182 to i32, !dbg !21 + %553 = shl i32 %552, 23, !dbg !21 + %554 = bitcast i32 %553 to float, !dbg !21 + %555 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i186) #6, !dbg !21 + %556 = fmul float %555, %554, !dbg !21 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i187 = icmp eq i32 %557, 0, !dbg !21 + %558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %559 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i188 = select i1 %.not.i187, float %559, float %558, !dbg !21 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i189 = icmp eq i32 %560, 0, !dbg !21 + %561 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i188) #6, !dbg !21 + %562 = tail call float @llvm.nvvm.saturate.f(float %.02.i188) #6, !dbg !21 + %.03.i190 = select i1 %.not1.i189, float %562, float %561, !dbg !21 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i191 = icmp eq i32 %563, 0, !dbg !21 + %564 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %565 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i192 = select i1 %.not2.i191, float %565, float %564, !dbg !21 + %566 = fadd float %.04.i192, 0xC168000FE0000000, !dbg !21 + %567 = fneg float %566, !dbg !21 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i193 = icmp eq i32 %568, 0, !dbg !21 + %569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3FF7154760000000, float %567) #6, !dbg !21 + %570 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3FF7154760000000, float %567) #6, !dbg !21 + %.0.i194 = select i1 %.not3.i193, float %570, float %569, !dbg !21 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i195 = icmp eq i32 %571, 0, !dbg !21 + %572 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %346, float 0x3E54AE0C00000000, float %.0.i194) #6, !dbg !21 + %573 = tail call float @llvm.nvvm.fma.rn.f(float %346, float 0x3E54AE0C00000000, float %.0.i194) #6, !dbg !21 + %.01.i196 = select i1 %.not4.i195, float %573, float %572, !dbg !21 + %574 = bitcast float %.04.i192 to i32, !dbg !21 + %575 = shl i32 %574, 23, !dbg !21 + %576 = bitcast i32 %575 to float, !dbg !21 + %577 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i196) #6, !dbg !21 + %578 = fmul float %577, %576, !dbg !21 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i197 = icmp eq i32 %579, 0, !dbg !21 + %580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %581 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i198 = select i1 %.not.i197, float %581, float %580, !dbg !21 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i199 = icmp eq i32 %582, 0, !dbg !21 + %583 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i198) #6, !dbg !21 + %584 = tail call float @llvm.nvvm.saturate.f(float %.02.i198) #6, !dbg !21 + %.03.i200 = select i1 %.not1.i199, float %584, float %583, !dbg !21 + %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i201 = icmp eq i32 %585, 0, !dbg !21 + %586 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %587 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i202 = select i1 %.not2.i201, float %587, float %586, !dbg !21 + %588 = fadd float %.04.i202, 0xC168000FE0000000, !dbg !21 + %589 = fneg float %588, !dbg !21 + %590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i203 = icmp eq i32 %590, 0, !dbg !21 + %591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3FF7154760000000, float %589) #6, !dbg !21 + %592 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3FF7154760000000, float %589) #6, !dbg !21 + %.0.i204 = select i1 %.not3.i203, float %592, float %591, !dbg !21 + %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i205 = icmp eq i32 %593, 0, !dbg !21 + %594 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %347, float 0x3E54AE0C00000000, float %.0.i204) #6, !dbg !21 + %595 = tail call float @llvm.nvvm.fma.rn.f(float %347, float 0x3E54AE0C00000000, float %.0.i204) #6, !dbg !21 + %.01.i206 = select i1 %.not4.i205, float %595, float %594, !dbg !21 + %596 = bitcast float %.04.i202 to i32, !dbg !21 + %597 = shl i32 %596, 23, !dbg !21 + %598 = bitcast i32 %597 to float, !dbg !21 + %599 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i206) #6, !dbg !21 + %600 = fmul float %599, %598, !dbg !21 + %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i207 = icmp eq i32 %601, 0, !dbg !21 + %602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %603 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i208 = select i1 %.not.i207, float %603, float %602, !dbg !21 + %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i209 = icmp eq i32 %604, 0, !dbg !21 + %605 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i208) #6, !dbg !21 + %606 = tail call float @llvm.nvvm.saturate.f(float %.02.i208) #6, !dbg !21 + %.03.i210 = select i1 %.not1.i209, float %606, float %605, !dbg !21 + %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i211 = icmp eq i32 %607, 0, !dbg !21 + %608 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %609 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i212 = select i1 %.not2.i211, float %609, float %608, !dbg !21 + %610 = fadd float %.04.i212, 0xC168000FE0000000, !dbg !21 + %611 = fneg float %610, !dbg !21 + %612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i213 = icmp eq i32 %612, 0, !dbg !21 + %613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3FF7154760000000, float %611) #6, !dbg !21 + %614 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3FF7154760000000, float %611) #6, !dbg !21 + %.0.i214 = select i1 %.not3.i213, float %614, float %613, !dbg !21 + %615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i215 = icmp eq i32 %615, 0, !dbg !21 + %616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %348, float 0x3E54AE0C00000000, float %.0.i214) #6, !dbg !21 + %617 = tail call float @llvm.nvvm.fma.rn.f(float %348, float 0x3E54AE0C00000000, float %.0.i214) #6, !dbg !21 + %.01.i216 = select i1 %.not4.i215, float %617, float %616, !dbg !21 + %618 = bitcast float %.04.i212 to i32, !dbg !21 + %619 = shl i32 %618, 23, !dbg !21 + %620 = bitcast i32 %619 to float, !dbg !21 + %621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i216) #6, !dbg !21 + %622 = fmul float %621, %620, !dbg !21 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i217 = icmp eq i32 %623, 0, !dbg !21 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i218 = select i1 %.not.i217, float %625, float %624, !dbg !21 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i219 = icmp eq i32 %626, 0, !dbg !21 + %627 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i218) #6, !dbg !21 + %628 = tail call float @llvm.nvvm.saturate.f(float %.02.i218) #6, !dbg !21 + %.03.i220 = select i1 %.not1.i219, float %628, float %627, !dbg !21 + %629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i221 = icmp eq i32 %629, 0, !dbg !21 + %630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i222 = select i1 %.not2.i221, float %631, float %630, !dbg !21 + %632 = fadd float %.04.i222, 0xC168000FE0000000, !dbg !21 + %633 = fneg float %632, !dbg !21 + %634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i223 = icmp eq i32 %634, 0, !dbg !21 + %635 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3FF7154760000000, float %633) #6, !dbg !21 + %636 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3FF7154760000000, float %633) #6, !dbg !21 + %.0.i224 = select i1 %.not3.i223, float %636, float %635, !dbg !21 + %637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i225 = icmp eq i32 %637, 0, !dbg !21 + %638 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %349, float 0x3E54AE0C00000000, float %.0.i224) #6, !dbg !21 + %639 = tail call float @llvm.nvvm.fma.rn.f(float %349, float 0x3E54AE0C00000000, float %.0.i224) #6, !dbg !21 + %.01.i226 = select i1 %.not4.i225, float %639, float %638, !dbg !21 + %640 = bitcast float %.04.i222 to i32, !dbg !21 + %641 = shl i32 %640, 23, !dbg !21 + %642 = bitcast i32 %641 to float, !dbg !21 + %643 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i226) #6, !dbg !21 + %644 = fmul float %643, %642, !dbg !21 + %645 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i227 = icmp eq i32 %645, 0, !dbg !21 + %646 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %647 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i228 = select i1 %.not.i227, float %647, float %646, !dbg !21 + %648 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i229 = icmp eq i32 %648, 0, !dbg !21 + %649 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i228) #6, !dbg !21 + %650 = tail call float @llvm.nvvm.saturate.f(float %.02.i228) #6, !dbg !21 + %.03.i230 = select i1 %.not1.i229, float %650, float %649, !dbg !21 + %651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i231 = icmp eq i32 %651, 0, !dbg !21 + %652 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %653 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i232 = select i1 %.not2.i231, float %653, float %652, !dbg !21 + %654 = fadd float %.04.i232, 0xC168000FE0000000, !dbg !21 + %655 = fneg float %654, !dbg !21 + %656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i233 = icmp eq i32 %656, 0, !dbg !21 + %657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3FF7154760000000, float %655) #6, !dbg !21 + %658 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3FF7154760000000, float %655) #6, !dbg !21 + %.0.i234 = select i1 %.not3.i233, float %658, float %657, !dbg !21 + %659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i235 = icmp eq i32 %659, 0, !dbg !21 + %660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %350, float 0x3E54AE0C00000000, float %.0.i234) #6, !dbg !21 + %661 = tail call float @llvm.nvvm.fma.rn.f(float %350, float 0x3E54AE0C00000000, float %.0.i234) #6, !dbg !21 + %.01.i236 = select i1 %.not4.i235, float %661, float %660, !dbg !21 + %662 = bitcast float %.04.i232 to i32, !dbg !21 + %663 = shl i32 %662, 23, !dbg !21 + %664 = bitcast i32 %663 to float, !dbg !21 + %665 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i236) #6, !dbg !21 + %666 = fmul float %665, %664, !dbg !21 + %667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i237 = icmp eq i32 %667, 0, !dbg !21 + %668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %669 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i238 = select i1 %.not.i237, float %669, float %668, !dbg !21 + %670 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i239 = icmp eq i32 %670, 0, !dbg !21 + %671 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i238) #6, !dbg !21 + %672 = tail call float @llvm.nvvm.saturate.f(float %.02.i238) #6, !dbg !21 + %.03.i240 = select i1 %.not1.i239, float %672, float %671, !dbg !21 + %673 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i241 = icmp eq i32 %673, 0, !dbg !21 + %674 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %675 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i242 = select i1 %.not2.i241, float %675, float %674, !dbg !21 + %676 = fadd float %.04.i242, 0xC168000FE0000000, !dbg !21 + %677 = fneg float %676, !dbg !21 + %678 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i243 = icmp eq i32 %678, 0, !dbg !21 + %679 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3FF7154760000000, float %677) #6, !dbg !21 + %680 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3FF7154760000000, float %677) #6, !dbg !21 + %.0.i244 = select i1 %.not3.i243, float %680, float %679, !dbg !21 + %681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i245 = icmp eq i32 %681, 0, !dbg !21 + %682 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %351, float 0x3E54AE0C00000000, float %.0.i244) #6, !dbg !21 + %683 = tail call float @llvm.nvvm.fma.rn.f(float %351, float 0x3E54AE0C00000000, float %.0.i244) #6, !dbg !21 + %.01.i246 = select i1 %.not4.i245, float %683, float %682, !dbg !21 + %684 = bitcast float %.04.i242 to i32, !dbg !21 + %685 = shl i32 %684, 23, !dbg !21 + %686 = bitcast i32 %685 to float, !dbg !21 + %687 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i246) #6, !dbg !21 + %688 = fmul float %687, %686, !dbg !21 + %689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i247 = icmp eq i32 %689, 0, !dbg !21 + %690 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %691 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i248 = select i1 %.not.i247, float %691, float %690, !dbg !21 + %692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i249 = icmp eq i32 %692, 0, !dbg !21 + %693 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i248) #6, !dbg !21 + %694 = tail call float @llvm.nvvm.saturate.f(float %.02.i248) #6, !dbg !21 + %.03.i250 = select i1 %.not1.i249, float %694, float %693, !dbg !21 + %695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i251 = icmp eq i32 %695, 0, !dbg !21 + %696 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %697 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i252 = select i1 %.not2.i251, float %697, float %696, !dbg !21 + %698 = fadd float %.04.i252, 0xC168000FE0000000, !dbg !21 + %699 = fneg float %698, !dbg !21 + %700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i253 = icmp eq i32 %700, 0, !dbg !21 + %701 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3FF7154760000000, float %699) #6, !dbg !21 + %702 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3FF7154760000000, float %699) #6, !dbg !21 + %.0.i254 = select i1 %.not3.i253, float %702, float %701, !dbg !21 + %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i255 = icmp eq i32 %703, 0, !dbg !21 + %704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float 0x3E54AE0C00000000, float %.0.i254) #6, !dbg !21 + %705 = tail call float @llvm.nvvm.fma.rn.f(float %352, float 0x3E54AE0C00000000, float %.0.i254) #6, !dbg !21 + %.01.i256 = select i1 %.not4.i255, float %705, float %704, !dbg !21 + %706 = bitcast float %.04.i252 to i32, !dbg !21 + %707 = shl i32 %706, 23, !dbg !21 + %708 = bitcast i32 %707 to float, !dbg !21 + %709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i256) #6, !dbg !21 + %710 = fmul float %709, %708, !dbg !21 + %711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i257 = icmp eq i32 %711, 0, !dbg !21 + %712 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %713 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i258 = select i1 %.not.i257, float %713, float %712, !dbg !21 + %714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i259 = icmp eq i32 %714, 0, !dbg !21 + %715 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i258) #6, !dbg !21 + %716 = tail call float @llvm.nvvm.saturate.f(float %.02.i258) #6, !dbg !21 + %.03.i260 = select i1 %.not1.i259, float %716, float %715, !dbg !21 + %717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i261 = icmp eq i32 %717, 0, !dbg !21 + %718 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %719 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i262 = select i1 %.not2.i261, float %719, float %718, !dbg !21 + %720 = fadd float %.04.i262, 0xC168000FE0000000, !dbg !21 + %721 = fneg float %720, !dbg !21 + %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i263 = icmp eq i32 %722, 0, !dbg !21 + %723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3FF7154760000000, float %721) #6, !dbg !21 + %724 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3FF7154760000000, float %721) #6, !dbg !21 + %.0.i264 = select i1 %.not3.i263, float %724, float %723, !dbg !21 + %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i265 = icmp eq i32 %725, 0, !dbg !21 + %726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %353, float 0x3E54AE0C00000000, float %.0.i264) #6, !dbg !21 + %727 = tail call float @llvm.nvvm.fma.rn.f(float %353, float 0x3E54AE0C00000000, float %.0.i264) #6, !dbg !21 + %.01.i266 = select i1 %.not4.i265, float %727, float %726, !dbg !21 + %728 = bitcast float %.04.i262 to i32, !dbg !21 + %729 = shl i32 %728, 23, !dbg !21 + %730 = bitcast i32 %729 to float, !dbg !21 + %731 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i266) #6, !dbg !21 + %732 = fmul float %731, %730, !dbg !21 + %733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i267 = icmp eq i32 %733, 0, !dbg !21 + %734 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %735 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i268 = select i1 %.not.i267, float %735, float %734, !dbg !21 + %736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i269 = icmp eq i32 %736, 0, !dbg !21 + %737 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i268) #6, !dbg !21 + %738 = tail call float @llvm.nvvm.saturate.f(float %.02.i268) #6, !dbg !21 + %.03.i270 = select i1 %.not1.i269, float %738, float %737, !dbg !21 + %739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i271 = icmp eq i32 %739, 0, !dbg !21 + %740 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %741 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i272 = select i1 %.not2.i271, float %741, float %740, !dbg !21 + %742 = fadd float %.04.i272, 0xC168000FE0000000, !dbg !21 + %743 = fneg float %742, !dbg !21 + %744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i273 = icmp eq i32 %744, 0, !dbg !21 + %745 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3FF7154760000000, float %743) #6, !dbg !21 + %746 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3FF7154760000000, float %743) #6, !dbg !21 + %.0.i274 = select i1 %.not3.i273, float %746, float %745, !dbg !21 + %747 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i275 = icmp eq i32 %747, 0, !dbg !21 + %748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %354, float 0x3E54AE0C00000000, float %.0.i274) #6, !dbg !21 + %749 = tail call float @llvm.nvvm.fma.rn.f(float %354, float 0x3E54AE0C00000000, float %.0.i274) #6, !dbg !21 + %.01.i276 = select i1 %.not4.i275, float %749, float %748, !dbg !21 + %750 = bitcast float %.04.i272 to i32, !dbg !21 + %751 = shl i32 %750, 23, !dbg !21 + %752 = bitcast i32 %751 to float, !dbg !21 + %753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i276) #6, !dbg !21 + %754 = fmul float %753, %752, !dbg !21 + %755 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i277 = icmp eq i32 %755, 0, !dbg !21 + %756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %757 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i278 = select i1 %.not.i277, float %757, float %756, !dbg !21 + %758 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i279 = icmp eq i32 %758, 0, !dbg !21 + %759 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i278) #6, !dbg !21 + %760 = tail call float @llvm.nvvm.saturate.f(float %.02.i278) #6, !dbg !21 + %.03.i280 = select i1 %.not1.i279, float %760, float %759, !dbg !21 + %761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i281 = icmp eq i32 %761, 0, !dbg !21 + %762 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %763 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i282 = select i1 %.not2.i281, float %763, float %762, !dbg !21 + %764 = fadd float %.04.i282, 0xC168000FE0000000, !dbg !21 + %765 = fneg float %764, !dbg !21 + %766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i283 = icmp eq i32 %766, 0, !dbg !21 + %767 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3FF7154760000000, float %765) #6, !dbg !21 + %768 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3FF7154760000000, float %765) #6, !dbg !21 + %.0.i284 = select i1 %.not3.i283, float %768, float %767, !dbg !21 + %769 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i285 = icmp eq i32 %769, 0, !dbg !21 + %770 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %355, float 0x3E54AE0C00000000, float %.0.i284) #6, !dbg !21 + %771 = tail call float @llvm.nvvm.fma.rn.f(float %355, float 0x3E54AE0C00000000, float %.0.i284) #6, !dbg !21 + %.01.i286 = select i1 %.not4.i285, float %771, float %770, !dbg !21 + %772 = bitcast float %.04.i282 to i32, !dbg !21 + %773 = shl i32 %772, 23, !dbg !21 + %774 = bitcast i32 %773 to float, !dbg !21 + %775 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i286) #6, !dbg !21 + %776 = fmul float %775, %774, !dbg !21 + %777 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i287 = icmp eq i32 %777, 0, !dbg !21 + %778 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %779 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i288 = select i1 %.not.i287, float %779, float %778, !dbg !21 + %780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i289 = icmp eq i32 %780, 0, !dbg !21 + %781 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i288) #6, !dbg !21 + %782 = tail call float @llvm.nvvm.saturate.f(float %.02.i288) #6, !dbg !21 + %.03.i290 = select i1 %.not1.i289, float %782, float %781, !dbg !21 + %783 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i291 = icmp eq i32 %783, 0, !dbg !21 + %784 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %785 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i292 = select i1 %.not2.i291, float %785, float %784, !dbg !21 + %786 = fadd float %.04.i292, 0xC168000FE0000000, !dbg !21 + %787 = fneg float %786, !dbg !21 + %788 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i293 = icmp eq i32 %788, 0, !dbg !21 + %789 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3FF7154760000000, float %787) #6, !dbg !21 + %790 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3FF7154760000000, float %787) #6, !dbg !21 + %.0.i294 = select i1 %.not3.i293, float %790, float %789, !dbg !21 + %791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i295 = icmp eq i32 %791, 0, !dbg !21 + %792 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %356, float 0x3E54AE0C00000000, float %.0.i294) #6, !dbg !21 + %793 = tail call float @llvm.nvvm.fma.rn.f(float %356, float 0x3E54AE0C00000000, float %.0.i294) #6, !dbg !21 + %.01.i296 = select i1 %.not4.i295, float %793, float %792, !dbg !21 + %794 = bitcast float %.04.i292 to i32, !dbg !21 + %795 = shl i32 %794, 23, !dbg !21 + %796 = bitcast i32 %795 to float, !dbg !21 + %797 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i296) #6, !dbg !21 + %798 = fmul float %797, %796, !dbg !21 + %799 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i297 = icmp eq i32 %799, 0, !dbg !21 + %800 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %801 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i298 = select i1 %.not.i297, float %801, float %800, !dbg !21 + %802 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i299 = icmp eq i32 %802, 0, !dbg !21 + %803 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i298) #6, !dbg !21 + %804 = tail call float @llvm.nvvm.saturate.f(float %.02.i298) #6, !dbg !21 + %.03.i300 = select i1 %.not1.i299, float %804, float %803, !dbg !21 + %805 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i301 = icmp eq i32 %805, 0, !dbg !21 + %806 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %807 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i302 = select i1 %.not2.i301, float %807, float %806, !dbg !21 + %808 = fadd float %.04.i302, 0xC168000FE0000000, !dbg !21 + %809 = fneg float %808, !dbg !21 + %810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i303 = icmp eq i32 %810, 0, !dbg !21 + %811 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3FF7154760000000, float %809) #6, !dbg !21 + %812 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3FF7154760000000, float %809) #6, !dbg !21 + %.0.i304 = select i1 %.not3.i303, float %812, float %811, !dbg !21 + %813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i305 = icmp eq i32 %813, 0, !dbg !21 + %814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %357, float 0x3E54AE0C00000000, float %.0.i304) #6, !dbg !21 + %815 = tail call float @llvm.nvvm.fma.rn.f(float %357, float 0x3E54AE0C00000000, float %.0.i304) #6, !dbg !21 + %.01.i306 = select i1 %.not4.i305, float %815, float %814, !dbg !21 + %816 = bitcast float %.04.i302 to i32, !dbg !21 + %817 = shl i32 %816, 23, !dbg !21 + %818 = bitcast i32 %817 to float, !dbg !21 + %819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i306) #6, !dbg !21 + %820 = fmul float %819, %818, !dbg !21 + %821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i307 = icmp eq i32 %821, 0, !dbg !21 + %822 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %823 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i308 = select i1 %.not.i307, float %823, float %822, !dbg !21 + %824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i309 = icmp eq i32 %824, 0, !dbg !21 + %825 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i308) #6, !dbg !21 + %826 = tail call float @llvm.nvvm.saturate.f(float %.02.i308) #6, !dbg !21 + %.03.i310 = select i1 %.not1.i309, float %826, float %825, !dbg !21 + %827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i311 = icmp eq i32 %827, 0, !dbg !21 + %828 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %829 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i312 = select i1 %.not2.i311, float %829, float %828, !dbg !21 + %830 = fadd float %.04.i312, 0xC168000FE0000000, !dbg !21 + %831 = fneg float %830, !dbg !21 + %832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i313 = icmp eq i32 %832, 0, !dbg !21 + %833 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3FF7154760000000, float %831) #6, !dbg !21 + %834 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3FF7154760000000, float %831) #6, !dbg !21 + %.0.i314 = select i1 %.not3.i313, float %834, float %833, !dbg !21 + %835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i315 = icmp eq i32 %835, 0, !dbg !21 + %836 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %358, float 0x3E54AE0C00000000, float %.0.i314) #6, !dbg !21 + %837 = tail call float @llvm.nvvm.fma.rn.f(float %358, float 0x3E54AE0C00000000, float %.0.i314) #6, !dbg !21 + %.01.i316 = select i1 %.not4.i315, float %837, float %836, !dbg !21 + %838 = bitcast float %.04.i312 to i32, !dbg !21 + %839 = shl i32 %838, 23, !dbg !21 + %840 = bitcast i32 %839 to float, !dbg !21 + %841 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i316) #6, !dbg !21 + %842 = fmul float %841, %840, !dbg !21 + %843 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i317 = icmp eq i32 %843, 0, !dbg !21 + %844 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %845 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i318 = select i1 %.not.i317, float %845, float %844, !dbg !21 + %846 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i319 = icmp eq i32 %846, 0, !dbg !21 + %847 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i318) #6, !dbg !21 + %848 = tail call float @llvm.nvvm.saturate.f(float %.02.i318) #6, !dbg !21 + %.03.i320 = select i1 %.not1.i319, float %848, float %847, !dbg !21 + %849 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i321 = icmp eq i32 %849, 0, !dbg !21 + %850 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %851 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i322 = select i1 %.not2.i321, float %851, float %850, !dbg !21 + %852 = fadd float %.04.i322, 0xC168000FE0000000, !dbg !21 + %853 = fneg float %852, !dbg !21 + %854 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i323 = icmp eq i32 %854, 0, !dbg !21 + %855 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3FF7154760000000, float %853) #6, !dbg !21 + %856 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3FF7154760000000, float %853) #6, !dbg !21 + %.0.i324 = select i1 %.not3.i323, float %856, float %855, !dbg !21 + %857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i325 = icmp eq i32 %857, 0, !dbg !21 + %858 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %359, float 0x3E54AE0C00000000, float %.0.i324) #6, !dbg !21 + %859 = tail call float @llvm.nvvm.fma.rn.f(float %359, float 0x3E54AE0C00000000, float %.0.i324) #6, !dbg !21 + %.01.i326 = select i1 %.not4.i325, float %859, float %858, !dbg !21 + %860 = bitcast float %.04.i322 to i32, !dbg !21 + %861 = shl i32 %860, 23, !dbg !21 + %862 = bitcast i32 %861 to float, !dbg !21 + %863 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i326) #6, !dbg !21 + %864 = fmul float %863, %862, !dbg !21 + %865 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i327 = icmp eq i32 %865, 0, !dbg !21 + %866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %867 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i328 = select i1 %.not.i327, float %867, float %866, !dbg !21 + %868 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i329 = icmp eq i32 %868, 0, !dbg !21 + %869 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i328) #6, !dbg !21 + %870 = tail call float @llvm.nvvm.saturate.f(float %.02.i328) #6, !dbg !21 + %.03.i330 = select i1 %.not1.i329, float %870, float %869, !dbg !21 + %871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i331 = icmp eq i32 %871, 0, !dbg !21 + %872 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %873 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i332 = select i1 %.not2.i331, float %873, float %872, !dbg !21 + %874 = fadd float %.04.i332, 0xC168000FE0000000, !dbg !21 + %875 = fneg float %874, !dbg !21 + %876 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i333 = icmp eq i32 %876, 0, !dbg !21 + %877 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3FF7154760000000, float %875) #6, !dbg !21 + %878 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3FF7154760000000, float %875) #6, !dbg !21 + %.0.i334 = select i1 %.not3.i333, float %878, float %877, !dbg !21 + %879 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i335 = icmp eq i32 %879, 0, !dbg !21 + %880 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %360, float 0x3E54AE0C00000000, float %.0.i334) #6, !dbg !21 + %881 = tail call float @llvm.nvvm.fma.rn.f(float %360, float 0x3E54AE0C00000000, float %.0.i334) #6, !dbg !21 + %.01.i336 = select i1 %.not4.i335, float %881, float %880, !dbg !21 + %882 = bitcast float %.04.i332 to i32, !dbg !21 + %883 = shl i32 %882, 23, !dbg !21 + %884 = bitcast i32 %883 to float, !dbg !21 + %885 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i336) #6, !dbg !21 + %886 = fmul float %885, %884, !dbg !21 + %887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i337 = icmp eq i32 %887, 0, !dbg !21 + %888 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %889 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i338 = select i1 %.not.i337, float %889, float %888, !dbg !21 + %890 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i339 = icmp eq i32 %890, 0, !dbg !21 + %891 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i338) #6, !dbg !21 + %892 = tail call float @llvm.nvvm.saturate.f(float %.02.i338) #6, !dbg !21 + %.03.i340 = select i1 %.not1.i339, float %892, float %891, !dbg !21 + %893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i341 = icmp eq i32 %893, 0, !dbg !21 + %894 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %895 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i342 = select i1 %.not2.i341, float %895, float %894, !dbg !21 + %896 = fadd float %.04.i342, 0xC168000FE0000000, !dbg !21 + %897 = fneg float %896, !dbg !21 + %898 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i343 = icmp eq i32 %898, 0, !dbg !21 + %899 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3FF7154760000000, float %897) #6, !dbg !21 + %900 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3FF7154760000000, float %897) #6, !dbg !21 + %.0.i344 = select i1 %.not3.i343, float %900, float %899, !dbg !21 + %901 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i345 = icmp eq i32 %901, 0, !dbg !21 + %902 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %361, float 0x3E54AE0C00000000, float %.0.i344) #6, !dbg !21 + %903 = tail call float @llvm.nvvm.fma.rn.f(float %361, float 0x3E54AE0C00000000, float %.0.i344) #6, !dbg !21 + %.01.i346 = select i1 %.not4.i345, float %903, float %902, !dbg !21 + %904 = bitcast float %.04.i342 to i32, !dbg !21 + %905 = shl i32 %904, 23, !dbg !21 + %906 = bitcast i32 %905 to float, !dbg !21 + %907 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i346) #6, !dbg !21 + %908 = fmul float %907, %906, !dbg !21 + %909 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i347 = icmp eq i32 %909, 0, !dbg !21 + %910 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %911 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i348 = select i1 %.not.i347, float %911, float %910, !dbg !21 + %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i349 = icmp eq i32 %912, 0, !dbg !21 + %913 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i348) #6, !dbg !21 + %914 = tail call float @llvm.nvvm.saturate.f(float %.02.i348) #6, !dbg !21 + %.03.i350 = select i1 %.not1.i349, float %914, float %913, !dbg !21 + %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i351 = icmp eq i32 %915, 0, !dbg !21 + %916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i352 = select i1 %.not2.i351, float %917, float %916, !dbg !21 + %918 = fadd float %.04.i352, 0xC168000FE0000000, !dbg !21 + %919 = fneg float %918, !dbg !21 + %920 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i353 = icmp eq i32 %920, 0, !dbg !21 + %921 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3FF7154760000000, float %919) #6, !dbg !21 + %922 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3FF7154760000000, float %919) #6, !dbg !21 + %.0.i354 = select i1 %.not3.i353, float %922, float %921, !dbg !21 + %923 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i355 = icmp eq i32 %923, 0, !dbg !21 + %924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %362, float 0x3E54AE0C00000000, float %.0.i354) #6, !dbg !21 + %925 = tail call float @llvm.nvvm.fma.rn.f(float %362, float 0x3E54AE0C00000000, float %.0.i354) #6, !dbg !21 + %.01.i356 = select i1 %.not4.i355, float %925, float %924, !dbg !21 + %926 = bitcast float %.04.i352 to i32, !dbg !21 + %927 = shl i32 %926, 23, !dbg !21 + %928 = bitcast i32 %927 to float, !dbg !21 + %929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i356) #6, !dbg !21 + %930 = fmul float %929, %928, !dbg !21 + %931 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i357 = icmp eq i32 %931, 0, !dbg !21 + %932 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %933 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i358 = select i1 %.not.i357, float %933, float %932, !dbg !21 + %934 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i359 = icmp eq i32 %934, 0, !dbg !21 + %935 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i358) #6, !dbg !21 + %936 = tail call float @llvm.nvvm.saturate.f(float %.02.i358) #6, !dbg !21 + %.03.i360 = select i1 %.not1.i359, float %936, float %935, !dbg !21 + %937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i361 = icmp eq i32 %937, 0, !dbg !21 + %938 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %939 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i362 = select i1 %.not2.i361, float %939, float %938, !dbg !21 + %940 = fadd float %.04.i362, 0xC168000FE0000000, !dbg !21 + %941 = fneg float %940, !dbg !21 + %942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i363 = icmp eq i32 %942, 0, !dbg !21 + %943 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3FF7154760000000, float %941) #6, !dbg !21 + %944 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3FF7154760000000, float %941) #6, !dbg !21 + %.0.i364 = select i1 %.not3.i363, float %944, float %943, !dbg !21 + %945 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i365 = icmp eq i32 %945, 0, !dbg !21 + %946 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %363, float 0x3E54AE0C00000000, float %.0.i364) #6, !dbg !21 + %947 = tail call float @llvm.nvvm.fma.rn.f(float %363, float 0x3E54AE0C00000000, float %.0.i364) #6, !dbg !21 + %.01.i366 = select i1 %.not4.i365, float %947, float %946, !dbg !21 + %948 = bitcast float %.04.i362 to i32, !dbg !21 + %949 = shl i32 %948, 23, !dbg !21 + %950 = bitcast i32 %949 to float, !dbg !21 + %951 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i366) #6, !dbg !21 + %952 = fmul float %951, %950, !dbg !21 + %953 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i367 = icmp eq i32 %953, 0, !dbg !21 + %954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %955 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i368 = select i1 %.not.i367, float %955, float %954, !dbg !21 + %956 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i369 = icmp eq i32 %956, 0, !dbg !21 + %957 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i368) #6, !dbg !21 + %958 = tail call float @llvm.nvvm.saturate.f(float %.02.i368) #6, !dbg !21 + %.03.i370 = select i1 %.not1.i369, float %958, float %957, !dbg !21 + %959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i371 = icmp eq i32 %959, 0, !dbg !21 + %960 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %961 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i372 = select i1 %.not2.i371, float %961, float %960, !dbg !21 + %962 = fadd float %.04.i372, 0xC168000FE0000000, !dbg !21 + %963 = fneg float %962, !dbg !21 + %964 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i373 = icmp eq i32 %964, 0, !dbg !21 + %965 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3FF7154760000000, float %963) #6, !dbg !21 + %966 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3FF7154760000000, float %963) #6, !dbg !21 + %.0.i374 = select i1 %.not3.i373, float %966, float %965, !dbg !21 + %967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i375 = icmp eq i32 %967, 0, !dbg !21 + %968 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %364, float 0x3E54AE0C00000000, float %.0.i374) #6, !dbg !21 + %969 = tail call float @llvm.nvvm.fma.rn.f(float %364, float 0x3E54AE0C00000000, float %.0.i374) #6, !dbg !21 + %.01.i376 = select i1 %.not4.i375, float %969, float %968, !dbg !21 + %970 = bitcast float %.04.i372 to i32, !dbg !21 + %971 = shl i32 %970, 23, !dbg !21 + %972 = bitcast i32 %971 to float, !dbg !21 + %973 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i376) #6, !dbg !21 + %974 = fmul float %973, %972, !dbg !21 + %975 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i377 = icmp eq i32 %975, 0, !dbg !21 + %976 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %977 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i378 = select i1 %.not.i377, float %977, float %976, !dbg !21 + %978 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i379 = icmp eq i32 %978, 0, !dbg !21 + %979 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i378) #6, !dbg !21 + %980 = tail call float @llvm.nvvm.saturate.f(float %.02.i378) #6, !dbg !21 + %.03.i380 = select i1 %.not1.i379, float %980, float %979, !dbg !21 + %981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i381 = icmp eq i32 %981, 0, !dbg !21 + %982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i382 = select i1 %.not2.i381, float %983, float %982, !dbg !21 + %984 = fadd float %.04.i382, 0xC168000FE0000000, !dbg !21 + %985 = fneg float %984, !dbg !21 + %986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i383 = icmp eq i32 %986, 0, !dbg !21 + %987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3FF7154760000000, float %985) #6, !dbg !21 + %988 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3FF7154760000000, float %985) #6, !dbg !21 + %.0.i384 = select i1 %.not3.i383, float %988, float %987, !dbg !21 + %989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i385 = icmp eq i32 %989, 0, !dbg !21 + %990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %365, float 0x3E54AE0C00000000, float %.0.i384) #6, !dbg !21 + %991 = tail call float @llvm.nvvm.fma.rn.f(float %365, float 0x3E54AE0C00000000, float %.0.i384) #6, !dbg !21 + %.01.i386 = select i1 %.not4.i385, float %991, float %990, !dbg !21 + %992 = bitcast float %.04.i382 to i32, !dbg !21 + %993 = shl i32 %992, 23, !dbg !21 + %994 = bitcast i32 %993 to float, !dbg !21 + %995 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i386) #6, !dbg !21 + %996 = fmul float %995, %994, !dbg !21 + %997 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i387 = icmp eq i32 %997, 0, !dbg !21 + %998 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %999 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i388 = select i1 %.not.i387, float %999, float %998, !dbg !21 + %1000 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i389 = icmp eq i32 %1000, 0, !dbg !21 + %1001 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i388) #6, !dbg !21 + %1002 = tail call float @llvm.nvvm.saturate.f(float %.02.i388) #6, !dbg !21 + %.03.i390 = select i1 %.not1.i389, float %1002, float %1001, !dbg !21 + %1003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i391 = icmp eq i32 %1003, 0, !dbg !21 + %1004 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1005 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i392 = select i1 %.not2.i391, float %1005, float %1004, !dbg !21 + %1006 = fadd float %.04.i392, 0xC168000FE0000000, !dbg !21 + %1007 = fneg float %1006, !dbg !21 + %1008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i393 = icmp eq i32 %1008, 0, !dbg !21 + %1009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3FF7154760000000, float %1007) #6, !dbg !21 + %1010 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3FF7154760000000, float %1007) #6, !dbg !21 + %.0.i394 = select i1 %.not3.i393, float %1010, float %1009, !dbg !21 + %1011 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i395 = icmp eq i32 %1011, 0, !dbg !21 + %1012 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3E54AE0C00000000, float %.0.i394) #6, !dbg !21 + %1013 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3E54AE0C00000000, float %.0.i394) #6, !dbg !21 + %.01.i396 = select i1 %.not4.i395, float %1013, float %1012, !dbg !21 + %1014 = bitcast float %.04.i392 to i32, !dbg !21 + %1015 = shl i32 %1014, 23, !dbg !21 + %1016 = bitcast i32 %1015 to float, !dbg !21 + %1017 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i396) #6, !dbg !21 + %1018 = fmul float %1017, %1016, !dbg !21 + %1019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i397 = icmp eq i32 %1019, 0, !dbg !21 + %1020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1021 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i398 = select i1 %.not.i397, float %1021, float %1020, !dbg !21 + %1022 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i399 = icmp eq i32 %1022, 0, !dbg !21 + %1023 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i398) #6, !dbg !21 + %1024 = tail call float @llvm.nvvm.saturate.f(float %.02.i398) #6, !dbg !21 + %.03.i400 = select i1 %.not1.i399, float %1024, float %1023, !dbg !21 + %1025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i401 = icmp eq i32 %1025, 0, !dbg !21 + %1026 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1027 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i402 = select i1 %.not2.i401, float %1027, float %1026, !dbg !21 + %1028 = fadd float %.04.i402, 0xC168000FE0000000, !dbg !21 + %1029 = fneg float %1028, !dbg !21 + %1030 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i403 = icmp eq i32 %1030, 0, !dbg !21 + %1031 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3FF7154760000000, float %1029) #6, !dbg !21 + %1032 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3FF7154760000000, float %1029) #6, !dbg !21 + %.0.i404 = select i1 %.not3.i403, float %1032, float %1031, !dbg !21 + %1033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i405 = icmp eq i32 %1033, 0, !dbg !21 + %1034 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %367, float 0x3E54AE0C00000000, float %.0.i404) #6, !dbg !21 + %1035 = tail call float @llvm.nvvm.fma.rn.f(float %367, float 0x3E54AE0C00000000, float %.0.i404) #6, !dbg !21 + %.01.i406 = select i1 %.not4.i405, float %1035, float %1034, !dbg !21 + %1036 = bitcast float %.04.i402 to i32, !dbg !21 + %1037 = shl i32 %1036, 23, !dbg !21 + %1038 = bitcast i32 %1037 to float, !dbg !21 + %1039 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i406) #6, !dbg !21 + %1040 = fmul float %1039, %1038, !dbg !21 + %1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i407 = icmp eq i32 %1041, 0, !dbg !21 + %1042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1043 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i408 = select i1 %.not.i407, float %1043, float %1042, !dbg !21 + %1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i409 = icmp eq i32 %1044, 0, !dbg !21 + %1045 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i408) #6, !dbg !21 + %1046 = tail call float @llvm.nvvm.saturate.f(float %.02.i408) #6, !dbg !21 + %.03.i410 = select i1 %.not1.i409, float %1046, float %1045, !dbg !21 + %1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i411 = icmp eq i32 %1047, 0, !dbg !21 + %1048 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1049 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i412 = select i1 %.not2.i411, float %1049, float %1048, !dbg !21 + %1050 = fadd float %.04.i412, 0xC168000FE0000000, !dbg !21 + %1051 = fneg float %1050, !dbg !21 + %1052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i413 = icmp eq i32 %1052, 0, !dbg !21 + %1053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3FF7154760000000, float %1051) #6, !dbg !21 + %1054 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3FF7154760000000, float %1051) #6, !dbg !21 + %.0.i414 = select i1 %.not3.i413, float %1054, float %1053, !dbg !21 + %1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i415 = icmp eq i32 %1055, 0, !dbg !21 + %1056 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %368, float 0x3E54AE0C00000000, float %.0.i414) #6, !dbg !21 + %1057 = tail call float @llvm.nvvm.fma.rn.f(float %368, float 0x3E54AE0C00000000, float %.0.i414) #6, !dbg !21 + %.01.i416 = select i1 %.not4.i415, float %1057, float %1056, !dbg !21 + %1058 = bitcast float %.04.i412 to i32, !dbg !21 + %1059 = shl i32 %1058, 23, !dbg !21 + %1060 = bitcast i32 %1059 to float, !dbg !21 + %1061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i416) #6, !dbg !21 + %1062 = fmul float %1061, %1060, !dbg !21 + %1063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i417 = icmp eq i32 %1063, 0, !dbg !21 + %1064 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1065 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i418 = select i1 %.not.i417, float %1065, float %1064, !dbg !21 + %1066 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i419 = icmp eq i32 %1066, 0, !dbg !21 + %1067 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i418) #6, !dbg !21 + %1068 = tail call float @llvm.nvvm.saturate.f(float %.02.i418) #6, !dbg !21 + %.03.i420 = select i1 %.not1.i419, float %1068, float %1067, !dbg !21 + %1069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i421 = icmp eq i32 %1069, 0, !dbg !21 + %1070 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1071 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i422 = select i1 %.not2.i421, float %1071, float %1070, !dbg !21 + %1072 = fadd float %.04.i422, 0xC168000FE0000000, !dbg !21 + %1073 = fneg float %1072, !dbg !21 + %1074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i423 = icmp eq i32 %1074, 0, !dbg !21 + %1075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3FF7154760000000, float %1073) #6, !dbg !21 + %1076 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3FF7154760000000, float %1073) #6, !dbg !21 + %.0.i424 = select i1 %.not3.i423, float %1076, float %1075, !dbg !21 + %1077 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i425 = icmp eq i32 %1077, 0, !dbg !21 + %1078 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %369, float 0x3E54AE0C00000000, float %.0.i424) #6, !dbg !21 + %1079 = tail call float @llvm.nvvm.fma.rn.f(float %369, float 0x3E54AE0C00000000, float %.0.i424) #6, !dbg !21 + %.01.i426 = select i1 %.not4.i425, float %1079, float %1078, !dbg !21 + %1080 = bitcast float %.04.i422 to i32, !dbg !21 + %1081 = shl i32 %1080, 23, !dbg !21 + %1082 = bitcast i32 %1081 to float, !dbg !21 + %1083 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i426) #6, !dbg !21 + %1084 = fmul float %1083, %1082, !dbg !21 + %1085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i427 = icmp eq i32 %1085, 0, !dbg !21 + %1086 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1087 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i428 = select i1 %.not.i427, float %1087, float %1086, !dbg !21 + %1088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i429 = icmp eq i32 %1088, 0, !dbg !21 + %1089 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i428) #6, !dbg !21 + %1090 = tail call float @llvm.nvvm.saturate.f(float %.02.i428) #6, !dbg !21 + %.03.i430 = select i1 %.not1.i429, float %1090, float %1089, !dbg !21 + %1091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i431 = icmp eq i32 %1091, 0, !dbg !21 + %1092 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1093 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i432 = select i1 %.not2.i431, float %1093, float %1092, !dbg !21 + %1094 = fadd float %.04.i432, 0xC168000FE0000000, !dbg !21 + %1095 = fneg float %1094, !dbg !21 + %1096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i433 = icmp eq i32 %1096, 0, !dbg !21 + %1097 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3FF7154760000000, float %1095) #6, !dbg !21 + %1098 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3FF7154760000000, float %1095) #6, !dbg !21 + %.0.i434 = select i1 %.not3.i433, float %1098, float %1097, !dbg !21 + %1099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i435 = icmp eq i32 %1099, 0, !dbg !21 + %1100 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %370, float 0x3E54AE0C00000000, float %.0.i434) #6, !dbg !21 + %1101 = tail call float @llvm.nvvm.fma.rn.f(float %370, float 0x3E54AE0C00000000, float %.0.i434) #6, !dbg !21 + %.01.i436 = select i1 %.not4.i435, float %1101, float %1100, !dbg !21 + %1102 = bitcast float %.04.i432 to i32, !dbg !21 + %1103 = shl i32 %1102, 23, !dbg !21 + %1104 = bitcast i32 %1103 to float, !dbg !21 + %1105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i436) #6, !dbg !21 + %1106 = fmul float %1105, %1104, !dbg !21 + %1107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i437 = icmp eq i32 %1107, 0, !dbg !21 + %1108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1109 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i438 = select i1 %.not.i437, float %1109, float %1108, !dbg !21 + %1110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i439 = icmp eq i32 %1110, 0, !dbg !21 + %1111 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i438) #6, !dbg !21 + %1112 = tail call float @llvm.nvvm.saturate.f(float %.02.i438) #6, !dbg !21 + %.03.i440 = select i1 %.not1.i439, float %1112, float %1111, !dbg !21 + %1113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i441 = icmp eq i32 %1113, 0, !dbg !21 + %1114 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1115 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i442 = select i1 %.not2.i441, float %1115, float %1114, !dbg !21 + %1116 = fadd float %.04.i442, 0xC168000FE0000000, !dbg !21 + %1117 = fneg float %1116, !dbg !21 + %1118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i443 = icmp eq i32 %1118, 0, !dbg !21 + %1119 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3FF7154760000000, float %1117) #6, !dbg !21 + %1120 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3FF7154760000000, float %1117) #6, !dbg !21 + %.0.i444 = select i1 %.not3.i443, float %1120, float %1119, !dbg !21 + %1121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i445 = icmp eq i32 %1121, 0, !dbg !21 + %1122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %371, float 0x3E54AE0C00000000, float %.0.i444) #6, !dbg !21 + %1123 = tail call float @llvm.nvvm.fma.rn.f(float %371, float 0x3E54AE0C00000000, float %.0.i444) #6, !dbg !21 + %.01.i446 = select i1 %.not4.i445, float %1123, float %1122, !dbg !21 + %1124 = bitcast float %.04.i442 to i32, !dbg !21 + %1125 = shl i32 %1124, 23, !dbg !21 + %1126 = bitcast i32 %1125 to float, !dbg !21 + %1127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i446) #6, !dbg !21 + %1128 = fmul float %1127, %1126, !dbg !21 + %1129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i447 = icmp eq i32 %1129, 0, !dbg !21 + %1130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1131 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i448 = select i1 %.not.i447, float %1131, float %1130, !dbg !21 + %1132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i449 = icmp eq i32 %1132, 0, !dbg !21 + %1133 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i448) #6, !dbg !21 + %1134 = tail call float @llvm.nvvm.saturate.f(float %.02.i448) #6, !dbg !21 + %.03.i450 = select i1 %.not1.i449, float %1134, float %1133, !dbg !21 + %1135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i451 = icmp eq i32 %1135, 0, !dbg !21 + %1136 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1137 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i452 = select i1 %.not2.i451, float %1137, float %1136, !dbg !21 + %1138 = fadd float %.04.i452, 0xC168000FE0000000, !dbg !21 + %1139 = fneg float %1138, !dbg !21 + %1140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i453 = icmp eq i32 %1140, 0, !dbg !21 + %1141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3FF7154760000000, float %1139) #6, !dbg !21 + %1142 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3FF7154760000000, float %1139) #6, !dbg !21 + %.0.i454 = select i1 %.not3.i453, float %1142, float %1141, !dbg !21 + %1143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i455 = icmp eq i32 %1143, 0, !dbg !21 + %1144 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %372, float 0x3E54AE0C00000000, float %.0.i454) #6, !dbg !21 + %1145 = tail call float @llvm.nvvm.fma.rn.f(float %372, float 0x3E54AE0C00000000, float %.0.i454) #6, !dbg !21 + %.01.i456 = select i1 %.not4.i455, float %1145, float %1144, !dbg !21 + %1146 = bitcast float %.04.i452 to i32, !dbg !21 + %1147 = shl i32 %1146, 23, !dbg !21 + %1148 = bitcast i32 %1147 to float, !dbg !21 + %1149 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i456) #6, !dbg !21 + %1150 = fmul float %1149, %1148, !dbg !21 + %1151 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i457 = icmp eq i32 %1151, 0, !dbg !21 + %1152 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1153 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i458 = select i1 %.not.i457, float %1153, float %1152, !dbg !21 + %1154 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i459 = icmp eq i32 %1154, 0, !dbg !21 + %1155 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i458) #6, !dbg !21 + %1156 = tail call float @llvm.nvvm.saturate.f(float %.02.i458) #6, !dbg !21 + %.03.i460 = select i1 %.not1.i459, float %1156, float %1155, !dbg !21 + %1157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i461 = icmp eq i32 %1157, 0, !dbg !21 + %1158 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1159 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i462 = select i1 %.not2.i461, float %1159, float %1158, !dbg !21 + %1160 = fadd float %.04.i462, 0xC168000FE0000000, !dbg !21 + %1161 = fneg float %1160, !dbg !21 + %1162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i463 = icmp eq i32 %1162, 0, !dbg !21 + %1163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3FF7154760000000, float %1161) #6, !dbg !21 + %1164 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3FF7154760000000, float %1161) #6, !dbg !21 + %.0.i464 = select i1 %.not3.i463, float %1164, float %1163, !dbg !21 + %1165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i465 = icmp eq i32 %1165, 0, !dbg !21 + %1166 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %373, float 0x3E54AE0C00000000, float %.0.i464) #6, !dbg !21 + %1167 = tail call float @llvm.nvvm.fma.rn.f(float %373, float 0x3E54AE0C00000000, float %.0.i464) #6, !dbg !21 + %.01.i466 = select i1 %.not4.i465, float %1167, float %1166, !dbg !21 + %1168 = bitcast float %.04.i462 to i32, !dbg !21 + %1169 = shl i32 %1168, 23, !dbg !21 + %1170 = bitcast i32 %1169 to float, !dbg !21 + %1171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i466) #6, !dbg !21 + %1172 = fmul float %1171, %1170, !dbg !21 + %1173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i467 = icmp eq i32 %1173, 0, !dbg !21 + %1174 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1175 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i468 = select i1 %.not.i467, float %1175, float %1174, !dbg !21 + %1176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i469 = icmp eq i32 %1176, 0, !dbg !21 + %1177 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i468) #6, !dbg !21 + %1178 = tail call float @llvm.nvvm.saturate.f(float %.02.i468) #6, !dbg !21 + %.03.i470 = select i1 %.not1.i469, float %1178, float %1177, !dbg !21 + %1179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i471 = icmp eq i32 %1179, 0, !dbg !21 + %1180 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1181 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i472 = select i1 %.not2.i471, float %1181, float %1180, !dbg !21 + %1182 = fadd float %.04.i472, 0xC168000FE0000000, !dbg !21 + %1183 = fneg float %1182, !dbg !21 + %1184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i473 = icmp eq i32 %1184, 0, !dbg !21 + %1185 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3FF7154760000000, float %1183) #6, !dbg !21 + %1186 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3FF7154760000000, float %1183) #6, !dbg !21 + %.0.i474 = select i1 %.not3.i473, float %1186, float %1185, !dbg !21 + %1187 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i475 = icmp eq i32 %1187, 0, !dbg !21 + %1188 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %374, float 0x3E54AE0C00000000, float %.0.i474) #6, !dbg !21 + %1189 = tail call float @llvm.nvvm.fma.rn.f(float %374, float 0x3E54AE0C00000000, float %.0.i474) #6, !dbg !21 + %.01.i476 = select i1 %.not4.i475, float %1189, float %1188, !dbg !21 + %1190 = bitcast float %.04.i472 to i32, !dbg !21 + %1191 = shl i32 %1190, 23, !dbg !21 + %1192 = bitcast i32 %1191 to float, !dbg !21 + %1193 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i476) #6, !dbg !21 + %1194 = fmul float %1193, %1192, !dbg !21 + %1195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i477 = icmp eq i32 %1195, 0, !dbg !21 + %1196 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1197 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i478 = select i1 %.not.i477, float %1197, float %1196, !dbg !21 + %1198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i479 = icmp eq i32 %1198, 0, !dbg !21 + %1199 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i478) #6, !dbg !21 + %1200 = tail call float @llvm.nvvm.saturate.f(float %.02.i478) #6, !dbg !21 + %.03.i480 = select i1 %.not1.i479, float %1200, float %1199, !dbg !21 + %1201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i481 = icmp eq i32 %1201, 0, !dbg !21 + %1202 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1203 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i482 = select i1 %.not2.i481, float %1203, float %1202, !dbg !21 + %1204 = fadd float %.04.i482, 0xC168000FE0000000, !dbg !21 + %1205 = fneg float %1204, !dbg !21 + %1206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i483 = icmp eq i32 %1206, 0, !dbg !21 + %1207 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3FF7154760000000, float %1205) #6, !dbg !21 + %1208 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3FF7154760000000, float %1205) #6, !dbg !21 + %.0.i484 = select i1 %.not3.i483, float %1208, float %1207, !dbg !21 + %1209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i485 = icmp eq i32 %1209, 0, !dbg !21 + %1210 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float 0x3E54AE0C00000000, float %.0.i484) #6, !dbg !21 + %1211 = tail call float @llvm.nvvm.fma.rn.f(float %375, float 0x3E54AE0C00000000, float %.0.i484) #6, !dbg !21 + %.01.i486 = select i1 %.not4.i485, float %1211, float %1210, !dbg !21 + %1212 = bitcast float %.04.i482 to i32, !dbg !21 + %1213 = shl i32 %1212, 23, !dbg !21 + %1214 = bitcast i32 %1213 to float, !dbg !21 + %1215 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i486) #6, !dbg !21 + %1216 = fmul float %1215, %1214, !dbg !21 + %1217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i487 = icmp eq i32 %1217, 0, !dbg !21 + %1218 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1219 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i488 = select i1 %.not.i487, float %1219, float %1218, !dbg !21 + %1220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i489 = icmp eq i32 %1220, 0, !dbg !21 + %1221 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i488) #6, !dbg !21 + %1222 = tail call float @llvm.nvvm.saturate.f(float %.02.i488) #6, !dbg !21 + %.03.i490 = select i1 %.not1.i489, float %1222, float %1221, !dbg !21 + %1223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i491 = icmp eq i32 %1223, 0, !dbg !21 + %1224 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1225 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i492 = select i1 %.not2.i491, float %1225, float %1224, !dbg !21 + %1226 = fadd float %.04.i492, 0xC168000FE0000000, !dbg !21 + %1227 = fneg float %1226, !dbg !21 + %1228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i493 = icmp eq i32 %1228, 0, !dbg !21 + %1229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3FF7154760000000, float %1227) #6, !dbg !21 + %1230 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3FF7154760000000, float %1227) #6, !dbg !21 + %.0.i494 = select i1 %.not3.i493, float %1230, float %1229, !dbg !21 + %1231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i495 = icmp eq i32 %1231, 0, !dbg !21 + %1232 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %376, float 0x3E54AE0C00000000, float %.0.i494) #6, !dbg !21 + %1233 = tail call float @llvm.nvvm.fma.rn.f(float %376, float 0x3E54AE0C00000000, float %.0.i494) #6, !dbg !21 + %.01.i496 = select i1 %.not4.i495, float %1233, float %1232, !dbg !21 + %1234 = bitcast float %.04.i492 to i32, !dbg !21 + %1235 = shl i32 %1234, 23, !dbg !21 + %1236 = bitcast i32 %1235 to float, !dbg !21 + %1237 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i496) #6, !dbg !21 + %1238 = fmul float %1237, %1236, !dbg !21 + %1239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i497 = icmp eq i32 %1239, 0, !dbg !21 + %1240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1241 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i498 = select i1 %.not.i497, float %1241, float %1240, !dbg !21 + %1242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i499 = icmp eq i32 %1242, 0, !dbg !21 + %1243 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i498) #6, !dbg !21 + %1244 = tail call float @llvm.nvvm.saturate.f(float %.02.i498) #6, !dbg !21 + %.03.i500 = select i1 %.not1.i499, float %1244, float %1243, !dbg !21 + %1245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i501 = icmp eq i32 %1245, 0, !dbg !21 + %1246 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1247 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i502 = select i1 %.not2.i501, float %1247, float %1246, !dbg !21 + %1248 = fadd float %.04.i502, 0xC168000FE0000000, !dbg !21 + %1249 = fneg float %1248, !dbg !21 + %1250 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i503 = icmp eq i32 %1250, 0, !dbg !21 + %1251 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3FF7154760000000, float %1249) #6, !dbg !21 + %1252 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3FF7154760000000, float %1249) #6, !dbg !21 + %.0.i504 = select i1 %.not3.i503, float %1252, float %1251, !dbg !21 + %1253 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i505 = icmp eq i32 %1253, 0, !dbg !21 + %1254 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %377, float 0x3E54AE0C00000000, float %.0.i504) #6, !dbg !21 + %1255 = tail call float @llvm.nvvm.fma.rn.f(float %377, float 0x3E54AE0C00000000, float %.0.i504) #6, !dbg !21 + %.01.i506 = select i1 %.not4.i505, float %1255, float %1254, !dbg !21 + %1256 = bitcast float %.04.i502 to i32, !dbg !21 + %1257 = shl i32 %1256, 23, !dbg !21 + %1258 = bitcast i32 %1257 to float, !dbg !21 + %1259 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i506) #6, !dbg !21 + %1260 = fmul float %1259, %1258, !dbg !21 + %1261 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i507 = icmp eq i32 %1261, 0, !dbg !21 + %1262 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1263 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i508 = select i1 %.not.i507, float %1263, float %1262, !dbg !21 + %1264 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i509 = icmp eq i32 %1264, 0, !dbg !21 + %1265 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i508) #6, !dbg !21 + %1266 = tail call float @llvm.nvvm.saturate.f(float %.02.i508) #6, !dbg !21 + %.03.i510 = select i1 %.not1.i509, float %1266, float %1265, !dbg !21 + %1267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i511 = icmp eq i32 %1267, 0, !dbg !21 + %1268 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1269 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i512 = select i1 %.not2.i511, float %1269, float %1268, !dbg !21 + %1270 = fadd float %.04.i512, 0xC168000FE0000000, !dbg !21 + %1271 = fneg float %1270, !dbg !21 + %1272 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i513 = icmp eq i32 %1272, 0, !dbg !21 + %1273 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3FF7154760000000, float %1271) #6, !dbg !21 + %1274 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3FF7154760000000, float %1271) #6, !dbg !21 + %.0.i514 = select i1 %.not3.i513, float %1274, float %1273, !dbg !21 + %1275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i515 = icmp eq i32 %1275, 0, !dbg !21 + %1276 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %378, float 0x3E54AE0C00000000, float %.0.i514) #6, !dbg !21 + %1277 = tail call float @llvm.nvvm.fma.rn.f(float %378, float 0x3E54AE0C00000000, float %.0.i514) #6, !dbg !21 + %.01.i516 = select i1 %.not4.i515, float %1277, float %1276, !dbg !21 + %1278 = bitcast float %.04.i512 to i32, !dbg !21 + %1279 = shl i32 %1278, 23, !dbg !21 + %1280 = bitcast i32 %1279 to float, !dbg !21 + %1281 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i516) #6, !dbg !21 + %1282 = fmul float %1281, %1280, !dbg !21 + %1283 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i517 = icmp eq i32 %1283, 0, !dbg !21 + %1284 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1285 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i518 = select i1 %.not.i517, float %1285, float %1284, !dbg !21 + %1286 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i519 = icmp eq i32 %1286, 0, !dbg !21 + %1287 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i518) #6, !dbg !21 + %1288 = tail call float @llvm.nvvm.saturate.f(float %.02.i518) #6, !dbg !21 + %.03.i520 = select i1 %.not1.i519, float %1288, float %1287, !dbg !21 + %1289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i521 = icmp eq i32 %1289, 0, !dbg !21 + %1290 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1291 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i522 = select i1 %.not2.i521, float %1291, float %1290, !dbg !21 + %1292 = fadd float %.04.i522, 0xC168000FE0000000, !dbg !21 + %1293 = fneg float %1292, !dbg !21 + %1294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i523 = icmp eq i32 %1294, 0, !dbg !21 + %1295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3FF7154760000000, float %1293) #6, !dbg !21 + %1296 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3FF7154760000000, float %1293) #6, !dbg !21 + %.0.i524 = select i1 %.not3.i523, float %1296, float %1295, !dbg !21 + %1297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i525 = icmp eq i32 %1297, 0, !dbg !21 + %1298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %379, float 0x3E54AE0C00000000, float %.0.i524) #6, !dbg !21 + %1299 = tail call float @llvm.nvvm.fma.rn.f(float %379, float 0x3E54AE0C00000000, float %.0.i524) #6, !dbg !21 + %.01.i526 = select i1 %.not4.i525, float %1299, float %1298, !dbg !21 + %1300 = bitcast float %.04.i522 to i32, !dbg !21 + %1301 = shl i32 %1300, 23, !dbg !21 + %1302 = bitcast i32 %1301 to float, !dbg !21 + %1303 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i526) #6, !dbg !21 + %1304 = fmul float %1303, %1302, !dbg !21 + %1305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i527 = icmp eq i32 %1305, 0, !dbg !21 + %1306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1307 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i528 = select i1 %.not.i527, float %1307, float %1306, !dbg !21 + %1308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i529 = icmp eq i32 %1308, 0, !dbg !21 + %1309 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i528) #6, !dbg !21 + %1310 = tail call float @llvm.nvvm.saturate.f(float %.02.i528) #6, !dbg !21 + %.03.i530 = select i1 %.not1.i529, float %1310, float %1309, !dbg !21 + %1311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i531 = icmp eq i32 %1311, 0, !dbg !21 + %1312 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1313 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i532 = select i1 %.not2.i531, float %1313, float %1312, !dbg !21 + %1314 = fadd float %.04.i532, 0xC168000FE0000000, !dbg !21 + %1315 = fneg float %1314, !dbg !21 + %1316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i533 = icmp eq i32 %1316, 0, !dbg !21 + %1317 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3FF7154760000000, float %1315) #6, !dbg !21 + %1318 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3FF7154760000000, float %1315) #6, !dbg !21 + %.0.i534 = select i1 %.not3.i533, float %1318, float %1317, !dbg !21 + %1319 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i535 = icmp eq i32 %1319, 0, !dbg !21 + %1320 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %380, float 0x3E54AE0C00000000, float %.0.i534) #6, !dbg !21 + %1321 = tail call float @llvm.nvvm.fma.rn.f(float %380, float 0x3E54AE0C00000000, float %.0.i534) #6, !dbg !21 + %.01.i536 = select i1 %.not4.i535, float %1321, float %1320, !dbg !21 + %1322 = bitcast float %.04.i532 to i32, !dbg !21 + %1323 = shl i32 %1322, 23, !dbg !21 + %1324 = bitcast i32 %1323 to float, !dbg !21 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i536) #6, !dbg !21 + %1326 = fmul float %1325, %1324, !dbg !21 + %1327 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i537 = icmp eq i32 %1327, 0, !dbg !21 + %1328 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1329 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i538 = select i1 %.not.i537, float %1329, float %1328, !dbg !21 + %1330 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i539 = icmp eq i32 %1330, 0, !dbg !21 + %1331 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i538) #6, !dbg !21 + %1332 = tail call float @llvm.nvvm.saturate.f(float %.02.i538) #6, !dbg !21 + %.03.i540 = select i1 %.not1.i539, float %1332, float %1331, !dbg !21 + %1333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i541 = icmp eq i32 %1333, 0, !dbg !21 + %1334 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1335 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i542 = select i1 %.not2.i541, float %1335, float %1334, !dbg !21 + %1336 = fadd float %.04.i542, 0xC168000FE0000000, !dbg !21 + %1337 = fneg float %1336, !dbg !21 + %1338 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i543 = icmp eq i32 %1338, 0, !dbg !21 + %1339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3FF7154760000000, float %1337) #6, !dbg !21 + %1340 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3FF7154760000000, float %1337) #6, !dbg !21 + %.0.i544 = select i1 %.not3.i543, float %1340, float %1339, !dbg !21 + %1341 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i545 = icmp eq i32 %1341, 0, !dbg !21 + %1342 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %381, float 0x3E54AE0C00000000, float %.0.i544) #6, !dbg !21 + %1343 = tail call float @llvm.nvvm.fma.rn.f(float %381, float 0x3E54AE0C00000000, float %.0.i544) #6, !dbg !21 + %.01.i546 = select i1 %.not4.i545, float %1343, float %1342, !dbg !21 + %1344 = bitcast float %.04.i542 to i32, !dbg !21 + %1345 = shl i32 %1344, 23, !dbg !21 + %1346 = bitcast i32 %1345 to float, !dbg !21 + %1347 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i546) #6, !dbg !21 + %1348 = fmul float %1347, %1346, !dbg !21 + %1349 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i547 = icmp eq i32 %1349, 0, !dbg !21 + %1350 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1351 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i548 = select i1 %.not.i547, float %1351, float %1350, !dbg !21 + %1352 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i549 = icmp eq i32 %1352, 0, !dbg !21 + %1353 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i548) #6, !dbg !21 + %1354 = tail call float @llvm.nvvm.saturate.f(float %.02.i548) #6, !dbg !21 + %.03.i550 = select i1 %.not1.i549, float %1354, float %1353, !dbg !21 + %1355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i551 = icmp eq i32 %1355, 0, !dbg !21 + %1356 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1357 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i552 = select i1 %.not2.i551, float %1357, float %1356, !dbg !21 + %1358 = fadd float %.04.i552, 0xC168000FE0000000, !dbg !21 + %1359 = fneg float %1358, !dbg !21 + %1360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i553 = icmp eq i32 %1360, 0, !dbg !21 + %1361 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3FF7154760000000, float %1359) #6, !dbg !21 + %1362 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3FF7154760000000, float %1359) #6, !dbg !21 + %.0.i554 = select i1 %.not3.i553, float %1362, float %1361, !dbg !21 + %1363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i555 = icmp eq i32 %1363, 0, !dbg !21 + %1364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %382, float 0x3E54AE0C00000000, float %.0.i554) #6, !dbg !21 + %1365 = tail call float @llvm.nvvm.fma.rn.f(float %382, float 0x3E54AE0C00000000, float %.0.i554) #6, !dbg !21 + %.01.i556 = select i1 %.not4.i555, float %1365, float %1364, !dbg !21 + %1366 = bitcast float %.04.i552 to i32, !dbg !21 + %1367 = shl i32 %1366, 23, !dbg !21 + %1368 = bitcast i32 %1367 to float, !dbg !21 + %1369 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i556) #6, !dbg !21 + %1370 = fmul float %1369, %1368, !dbg !21 + %1371 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i557 = icmp eq i32 %1371, 0, !dbg !21 + %1372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1373 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i558 = select i1 %.not.i557, float %1373, float %1372, !dbg !21 + %1374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i559 = icmp eq i32 %1374, 0, !dbg !21 + %1375 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i558) #6, !dbg !21 + %1376 = tail call float @llvm.nvvm.saturate.f(float %.02.i558) #6, !dbg !21 + %.03.i560 = select i1 %.not1.i559, float %1376, float %1375, !dbg !21 + %1377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i561 = icmp eq i32 %1377, 0, !dbg !21 + %1378 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1379 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i562 = select i1 %.not2.i561, float %1379, float %1378, !dbg !21 + %1380 = fadd float %.04.i562, 0xC168000FE0000000, !dbg !21 + %1381 = fneg float %1380, !dbg !21 + %1382 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i563 = icmp eq i32 %1382, 0, !dbg !21 + %1383 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3FF7154760000000, float %1381) #6, !dbg !21 + %1384 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3FF7154760000000, float %1381) #6, !dbg !21 + %.0.i564 = select i1 %.not3.i563, float %1384, float %1383, !dbg !21 + %1385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i565 = icmp eq i32 %1385, 0, !dbg !21 + %1386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %383, float 0x3E54AE0C00000000, float %.0.i564) #6, !dbg !21 + %1387 = tail call float @llvm.nvvm.fma.rn.f(float %383, float 0x3E54AE0C00000000, float %.0.i564) #6, !dbg !21 + %.01.i566 = select i1 %.not4.i565, float %1387, float %1386, !dbg !21 + %1388 = bitcast float %.04.i562 to i32, !dbg !21 + %1389 = shl i32 %1388, 23, !dbg !21 + %1390 = bitcast i32 %1389 to float, !dbg !21 + %1391 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i566) #6, !dbg !21 + %1392 = fmul float %1391, %1390, !dbg !21 + %1393 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i567 = icmp eq i32 %1393, 0, !dbg !21 + %1394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1395 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i568 = select i1 %.not.i567, float %1395, float %1394, !dbg !21 + %1396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i569 = icmp eq i32 %1396, 0, !dbg !21 + %1397 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i568) #6, !dbg !21 + %1398 = tail call float @llvm.nvvm.saturate.f(float %.02.i568) #6, !dbg !21 + %.03.i570 = select i1 %.not1.i569, float %1398, float %1397, !dbg !21 + %1399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i571 = icmp eq i32 %1399, 0, !dbg !21 + %1400 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1401 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i572 = select i1 %.not2.i571, float %1401, float %1400, !dbg !21 + %1402 = fadd float %.04.i572, 0xC168000FE0000000, !dbg !21 + %1403 = fneg float %1402, !dbg !21 + %1404 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i573 = icmp eq i32 %1404, 0, !dbg !21 + %1405 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3FF7154760000000, float %1403) #6, !dbg !21 + %1406 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3FF7154760000000, float %1403) #6, !dbg !21 + %.0.i574 = select i1 %.not3.i573, float %1406, float %1405, !dbg !21 + %1407 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i575 = icmp eq i32 %1407, 0, !dbg !21 + %1408 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %384, float 0x3E54AE0C00000000, float %.0.i574) #6, !dbg !21 + %1409 = tail call float @llvm.nvvm.fma.rn.f(float %384, float 0x3E54AE0C00000000, float %.0.i574) #6, !dbg !21 + %.01.i576 = select i1 %.not4.i575, float %1409, float %1408, !dbg !21 + %1410 = bitcast float %.04.i572 to i32, !dbg !21 + %1411 = shl i32 %1410, 23, !dbg !21 + %1412 = bitcast i32 %1411 to float, !dbg !21 + %1413 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i576) #6, !dbg !21 + %1414 = fmul float %1413, %1412, !dbg !21 + %1415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i577 = icmp eq i32 %1415, 0, !dbg !21 + %1416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1417 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i578 = select i1 %.not.i577, float %1417, float %1416, !dbg !21 + %1418 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i579 = icmp eq i32 %1418, 0, !dbg !21 + %1419 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i578) #6, !dbg !21 + %1420 = tail call float @llvm.nvvm.saturate.f(float %.02.i578) #6, !dbg !21 + %.03.i580 = select i1 %.not1.i579, float %1420, float %1419, !dbg !21 + %1421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i581 = icmp eq i32 %1421, 0, !dbg !21 + %1422 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1423 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i582 = select i1 %.not2.i581, float %1423, float %1422, !dbg !21 + %1424 = fadd float %.04.i582, 0xC168000FE0000000, !dbg !21 + %1425 = fneg float %1424, !dbg !21 + %1426 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i583 = icmp eq i32 %1426, 0, !dbg !21 + %1427 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3FF7154760000000, float %1425) #6, !dbg !21 + %1428 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3FF7154760000000, float %1425) #6, !dbg !21 + %.0.i584 = select i1 %.not3.i583, float %1428, float %1427, !dbg !21 + %1429 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i585 = icmp eq i32 %1429, 0, !dbg !21 + %1430 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %385, float 0x3E54AE0C00000000, float %.0.i584) #6, !dbg !21 + %1431 = tail call float @llvm.nvvm.fma.rn.f(float %385, float 0x3E54AE0C00000000, float %.0.i584) #6, !dbg !21 + %.01.i586 = select i1 %.not4.i585, float %1431, float %1430, !dbg !21 + %1432 = bitcast float %.04.i582 to i32, !dbg !21 + %1433 = shl i32 %1432, 23, !dbg !21 + %1434 = bitcast i32 %1433 to float, !dbg !21 + %1435 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i586) #6, !dbg !21 + %1436 = fmul float %1435, %1434, !dbg !21 + %1437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i587 = icmp eq i32 %1437, 0, !dbg !21 + %1438 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1439 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i588 = select i1 %.not.i587, float %1439, float %1438, !dbg !21 + %1440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i589 = icmp eq i32 %1440, 0, !dbg !21 + %1441 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i588) #6, !dbg !21 + %1442 = tail call float @llvm.nvvm.saturate.f(float %.02.i588) #6, !dbg !21 + %.03.i590 = select i1 %.not1.i589, float %1442, float %1441, !dbg !21 + %1443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i591 = icmp eq i32 %1443, 0, !dbg !21 + %1444 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1445 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i592 = select i1 %.not2.i591, float %1445, float %1444, !dbg !21 + %1446 = fadd float %.04.i592, 0xC168000FE0000000, !dbg !21 + %1447 = fneg float %1446, !dbg !21 + %1448 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i593 = icmp eq i32 %1448, 0, !dbg !21 + %1449 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3FF7154760000000, float %1447) #6, !dbg !21 + %1450 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3FF7154760000000, float %1447) #6, !dbg !21 + %.0.i594 = select i1 %.not3.i593, float %1450, float %1449, !dbg !21 + %1451 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i595 = icmp eq i32 %1451, 0, !dbg !21 + %1452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %386, float 0x3E54AE0C00000000, float %.0.i594) #6, !dbg !21 + %1453 = tail call float @llvm.nvvm.fma.rn.f(float %386, float 0x3E54AE0C00000000, float %.0.i594) #6, !dbg !21 + %.01.i596 = select i1 %.not4.i595, float %1453, float %1452, !dbg !21 + %1454 = bitcast float %.04.i592 to i32, !dbg !21 + %1455 = shl i32 %1454, 23, !dbg !21 + %1456 = bitcast i32 %1455 to float, !dbg !21 + %1457 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i596) #6, !dbg !21 + %1458 = fmul float %1457, %1456, !dbg !21 + %1459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i597 = icmp eq i32 %1459, 0, !dbg !21 + %1460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1461 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i598 = select i1 %.not.i597, float %1461, float %1460, !dbg !21 + %1462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i599 = icmp eq i32 %1462, 0, !dbg !21 + %1463 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i598) #6, !dbg !21 + %1464 = tail call float @llvm.nvvm.saturate.f(float %.02.i598) #6, !dbg !21 + %.03.i600 = select i1 %.not1.i599, float %1464, float %1463, !dbg !21 + %1465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i601 = icmp eq i32 %1465, 0, !dbg !21 + %1466 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1467 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i602 = select i1 %.not2.i601, float %1467, float %1466, !dbg !21 + %1468 = fadd float %.04.i602, 0xC168000FE0000000, !dbg !21 + %1469 = fneg float %1468, !dbg !21 + %1470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i603 = icmp eq i32 %1470, 0, !dbg !21 + %1471 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3FF7154760000000, float %1469) #6, !dbg !21 + %1472 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3FF7154760000000, float %1469) #6, !dbg !21 + %.0.i604 = select i1 %.not3.i603, float %1472, float %1471, !dbg !21 + %1473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i605 = icmp eq i32 %1473, 0, !dbg !21 + %1474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %387, float 0x3E54AE0C00000000, float %.0.i604) #6, !dbg !21 + %1475 = tail call float @llvm.nvvm.fma.rn.f(float %387, float 0x3E54AE0C00000000, float %.0.i604) #6, !dbg !21 + %.01.i606 = select i1 %.not4.i605, float %1475, float %1474, !dbg !21 + %1476 = bitcast float %.04.i602 to i32, !dbg !21 + %1477 = shl i32 %1476, 23, !dbg !21 + %1478 = bitcast i32 %1477 to float, !dbg !21 + %1479 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i606) #6, !dbg !21 + %1480 = fmul float %1479, %1478, !dbg !21 + %1481 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i607 = icmp eq i32 %1481, 0, !dbg !21 + %1482 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1483 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i608 = select i1 %.not.i607, float %1483, float %1482, !dbg !21 + %1484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i609 = icmp eq i32 %1484, 0, !dbg !21 + %1485 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i608) #6, !dbg !21 + %1486 = tail call float @llvm.nvvm.saturate.f(float %.02.i608) #6, !dbg !21 + %.03.i610 = select i1 %.not1.i609, float %1486, float %1485, !dbg !21 + %1487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i611 = icmp eq i32 %1487, 0, !dbg !21 + %1488 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1489 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i612 = select i1 %.not2.i611, float %1489, float %1488, !dbg !21 + %1490 = fadd float %.04.i612, 0xC168000FE0000000, !dbg !21 + %1491 = fneg float %1490, !dbg !21 + %1492 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i613 = icmp eq i32 %1492, 0, !dbg !21 + %1493 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3FF7154760000000, float %1491) #6, !dbg !21 + %1494 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3FF7154760000000, float %1491) #6, !dbg !21 + %.0.i614 = select i1 %.not3.i613, float %1494, float %1493, !dbg !21 + %1495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i615 = icmp eq i32 %1495, 0, !dbg !21 + %1496 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %388, float 0x3E54AE0C00000000, float %.0.i614) #6, !dbg !21 + %1497 = tail call float @llvm.nvvm.fma.rn.f(float %388, float 0x3E54AE0C00000000, float %.0.i614) #6, !dbg !21 + %.01.i616 = select i1 %.not4.i615, float %1497, float %1496, !dbg !21 + %1498 = bitcast float %.04.i612 to i32, !dbg !21 + %1499 = shl i32 %1498, 23, !dbg !21 + %1500 = bitcast i32 %1499 to float, !dbg !21 + %1501 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i616) #6, !dbg !21 + %1502 = fmul float %1501, %1500, !dbg !21 + %1503 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i617 = icmp eq i32 %1503, 0, !dbg !21 + %1504 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1505 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i618 = select i1 %.not.i617, float %1505, float %1504, !dbg !21 + %1506 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i619 = icmp eq i32 %1506, 0, !dbg !21 + %1507 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i618) #6, !dbg !21 + %1508 = tail call float @llvm.nvvm.saturate.f(float %.02.i618) #6, !dbg !21 + %.03.i620 = select i1 %.not1.i619, float %1508, float %1507, !dbg !21 + %1509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i621 = icmp eq i32 %1509, 0, !dbg !21 + %1510 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1511 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i622 = select i1 %.not2.i621, float %1511, float %1510, !dbg !21 + %1512 = fadd float %.04.i622, 0xC168000FE0000000, !dbg !21 + %1513 = fneg float %1512, !dbg !21 + %1514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i623 = icmp eq i32 %1514, 0, !dbg !21 + %1515 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3FF7154760000000, float %1513) #6, !dbg !21 + %1516 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3FF7154760000000, float %1513) #6, !dbg !21 + %.0.i624 = select i1 %.not3.i623, float %1516, float %1515, !dbg !21 + %1517 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i625 = icmp eq i32 %1517, 0, !dbg !21 + %1518 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %389, float 0x3E54AE0C00000000, float %.0.i624) #6, !dbg !21 + %1519 = tail call float @llvm.nvvm.fma.rn.f(float %389, float 0x3E54AE0C00000000, float %.0.i624) #6, !dbg !21 + %.01.i626 = select i1 %.not4.i625, float %1519, float %1518, !dbg !21 + %1520 = bitcast float %.04.i622 to i32, !dbg !21 + %1521 = shl i32 %1520, 23, !dbg !21 + %1522 = bitcast i32 %1521 to float, !dbg !21 + %1523 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i626) #6, !dbg !21 + %1524 = fmul float %1523, %1522, !dbg !21 + %1525 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i627 = icmp eq i32 %1525, 0, !dbg !21 + %1526 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1527 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i628 = select i1 %.not.i627, float %1527, float %1526, !dbg !21 + %1528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i629 = icmp eq i32 %1528, 0, !dbg !21 + %1529 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i628) #6, !dbg !21 + %1530 = tail call float @llvm.nvvm.saturate.f(float %.02.i628) #6, !dbg !21 + %.03.i630 = select i1 %.not1.i629, float %1530, float %1529, !dbg !21 + %1531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i631 = icmp eq i32 %1531, 0, !dbg !21 + %1532 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1533 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i632 = select i1 %.not2.i631, float %1533, float %1532, !dbg !21 + %1534 = fadd float %.04.i632, 0xC168000FE0000000, !dbg !21 + %1535 = fneg float %1534, !dbg !21 + %1536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i633 = icmp eq i32 %1536, 0, !dbg !21 + %1537 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3FF7154760000000, float %1535) #6, !dbg !21 + %1538 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3FF7154760000000, float %1535) #6, !dbg !21 + %.0.i634 = select i1 %.not3.i633, float %1538, float %1537, !dbg !21 + %1539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i635 = icmp eq i32 %1539, 0, !dbg !21 + %1540 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %390, float 0x3E54AE0C00000000, float %.0.i634) #6, !dbg !21 + %1541 = tail call float @llvm.nvvm.fma.rn.f(float %390, float 0x3E54AE0C00000000, float %.0.i634) #6, !dbg !21 + %.01.i636 = select i1 %.not4.i635, float %1541, float %1540, !dbg !21 + %1542 = bitcast float %.04.i632 to i32, !dbg !21 + %1543 = shl i32 %1542, 23, !dbg !21 + %1544 = bitcast i32 %1543 to float, !dbg !21 + %1545 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i636) #6, !dbg !21 + %1546 = fmul float %1545, %1544, !dbg !21 + %1547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i637 = icmp eq i32 %1547, 0, !dbg !21 + %1548 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1549 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i638 = select i1 %.not.i637, float %1549, float %1548, !dbg !21 + %1550 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i639 = icmp eq i32 %1550, 0, !dbg !21 + %1551 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i638) #6, !dbg !21 + %1552 = tail call float @llvm.nvvm.saturate.f(float %.02.i638) #6, !dbg !21 + %.03.i640 = select i1 %.not1.i639, float %1552, float %1551, !dbg !21 + %1553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i641 = icmp eq i32 %1553, 0, !dbg !21 + %1554 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1555 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i642 = select i1 %.not2.i641, float %1555, float %1554, !dbg !21 + %1556 = fadd float %.04.i642, 0xC168000FE0000000, !dbg !21 + %1557 = fneg float %1556, !dbg !21 + %1558 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i643 = icmp eq i32 %1558, 0, !dbg !21 + %1559 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3FF7154760000000, float %1557) #6, !dbg !21 + %1560 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3FF7154760000000, float %1557) #6, !dbg !21 + %.0.i644 = select i1 %.not3.i643, float %1560, float %1559, !dbg !21 + %1561 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i645 = icmp eq i32 %1561, 0, !dbg !21 + %1562 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %391, float 0x3E54AE0C00000000, float %.0.i644) #6, !dbg !21 + %1563 = tail call float @llvm.nvvm.fma.rn.f(float %391, float 0x3E54AE0C00000000, float %.0.i644) #6, !dbg !21 + %.01.i646 = select i1 %.not4.i645, float %1563, float %1562, !dbg !21 + %1564 = bitcast float %.04.i642 to i32, !dbg !21 + %1565 = shl i32 %1564, 23, !dbg !21 + %1566 = bitcast i32 %1565 to float, !dbg !21 + %1567 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i646) #6, !dbg !21 + %1568 = fmul float %1567, %1566, !dbg !21 + %1569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i647 = icmp eq i32 %1569, 0, !dbg !21 + %1570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1571 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i648 = select i1 %.not.i647, float %1571, float %1570, !dbg !21 + %1572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i649 = icmp eq i32 %1572, 0, !dbg !21 + %1573 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i648) #6, !dbg !21 + %1574 = tail call float @llvm.nvvm.saturate.f(float %.02.i648) #6, !dbg !21 + %.03.i650 = select i1 %.not1.i649, float %1574, float %1573, !dbg !21 + %1575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i651 = icmp eq i32 %1575, 0, !dbg !21 + %1576 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1577 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i652 = select i1 %.not2.i651, float %1577, float %1576, !dbg !21 + %1578 = fadd float %.04.i652, 0xC168000FE0000000, !dbg !21 + %1579 = fneg float %1578, !dbg !21 + %1580 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i653 = icmp eq i32 %1580, 0, !dbg !21 + %1581 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3FF7154760000000, float %1579) #6, !dbg !21 + %1582 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3FF7154760000000, float %1579) #6, !dbg !21 + %.0.i654 = select i1 %.not3.i653, float %1582, float %1581, !dbg !21 + %1583 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i655 = icmp eq i32 %1583, 0, !dbg !21 + %1584 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %392, float 0x3E54AE0C00000000, float %.0.i654) #6, !dbg !21 + %1585 = tail call float @llvm.nvvm.fma.rn.f(float %392, float 0x3E54AE0C00000000, float %.0.i654) #6, !dbg !21 + %.01.i656 = select i1 %.not4.i655, float %1585, float %1584, !dbg !21 + %1586 = bitcast float %.04.i652 to i32, !dbg !21 + %1587 = shl i32 %1586, 23, !dbg !21 + %1588 = bitcast i32 %1587 to float, !dbg !21 + %1589 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i656) #6, !dbg !21 + %1590 = fmul float %1589, %1588, !dbg !21 + %1591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i657 = icmp eq i32 %1591, 0, !dbg !21 + %1592 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1593 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i658 = select i1 %.not.i657, float %1593, float %1592, !dbg !21 + %1594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i659 = icmp eq i32 %1594, 0, !dbg !21 + %1595 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i658) #6, !dbg !21 + %1596 = tail call float @llvm.nvvm.saturate.f(float %.02.i658) #6, !dbg !21 + %.03.i660 = select i1 %.not1.i659, float %1596, float %1595, !dbg !21 + %1597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i661 = icmp eq i32 %1597, 0, !dbg !21 + %1598 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1599 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i662 = select i1 %.not2.i661, float %1599, float %1598, !dbg !21 + %1600 = fadd float %.04.i662, 0xC168000FE0000000, !dbg !21 + %1601 = fneg float %1600, !dbg !21 + %1602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i663 = icmp eq i32 %1602, 0, !dbg !21 + %1603 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3FF7154760000000, float %1601) #6, !dbg !21 + %1604 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3FF7154760000000, float %1601) #6, !dbg !21 + %.0.i664 = select i1 %.not3.i663, float %1604, float %1603, !dbg !21 + %1605 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i665 = icmp eq i32 %1605, 0, !dbg !21 + %1606 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %393, float 0x3E54AE0C00000000, float %.0.i664) #6, !dbg !21 + %1607 = tail call float @llvm.nvvm.fma.rn.f(float %393, float 0x3E54AE0C00000000, float %.0.i664) #6, !dbg !21 + %.01.i666 = select i1 %.not4.i665, float %1607, float %1606, !dbg !21 + %1608 = bitcast float %.04.i662 to i32, !dbg !21 + %1609 = shl i32 %1608, 23, !dbg !21 + %1610 = bitcast i32 %1609 to float, !dbg !21 + %1611 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i666) #6, !dbg !21 + %1612 = fmul float %1611, %1610, !dbg !21 + %1613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i667 = icmp eq i32 %1613, 0, !dbg !21 + %1614 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1615 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i668 = select i1 %.not.i667, float %1615, float %1614, !dbg !21 + %1616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i669 = icmp eq i32 %1616, 0, !dbg !21 + %1617 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i668) #6, !dbg !21 + %1618 = tail call float @llvm.nvvm.saturate.f(float %.02.i668) #6, !dbg !21 + %.03.i670 = select i1 %.not1.i669, float %1618, float %1617, !dbg !21 + %1619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i671 = icmp eq i32 %1619, 0, !dbg !21 + %1620 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1621 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i672 = select i1 %.not2.i671, float %1621, float %1620, !dbg !21 + %1622 = fadd float %.04.i672, 0xC168000FE0000000, !dbg !21 + %1623 = fneg float %1622, !dbg !21 + %1624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i673 = icmp eq i32 %1624, 0, !dbg !21 + %1625 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3FF7154760000000, float %1623) #6, !dbg !21 + %1626 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3FF7154760000000, float %1623) #6, !dbg !21 + %.0.i674 = select i1 %.not3.i673, float %1626, float %1625, !dbg !21 + %1627 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i675 = icmp eq i32 %1627, 0, !dbg !21 + %1628 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %394, float 0x3E54AE0C00000000, float %.0.i674) #6, !dbg !21 + %1629 = tail call float @llvm.nvvm.fma.rn.f(float %394, float 0x3E54AE0C00000000, float %.0.i674) #6, !dbg !21 + %.01.i676 = select i1 %.not4.i675, float %1629, float %1628, !dbg !21 + %1630 = bitcast float %.04.i672 to i32, !dbg !21 + %1631 = shl i32 %1630, 23, !dbg !21 + %1632 = bitcast i32 %1631 to float, !dbg !21 + %1633 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i676) #6, !dbg !21 + %1634 = fmul float %1633, %1632, !dbg !21 + %1635 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i677 = icmp eq i32 %1635, 0, !dbg !21 + %1636 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1637 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i678 = select i1 %.not.i677, float %1637, float %1636, !dbg !21 + %1638 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i679 = icmp eq i32 %1638, 0, !dbg !21 + %1639 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i678) #6, !dbg !21 + %1640 = tail call float @llvm.nvvm.saturate.f(float %.02.i678) #6, !dbg !21 + %.03.i680 = select i1 %.not1.i679, float %1640, float %1639, !dbg !21 + %1641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i681 = icmp eq i32 %1641, 0, !dbg !21 + %1642 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1643 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i682 = select i1 %.not2.i681, float %1643, float %1642, !dbg !21 + %1644 = fadd float %.04.i682, 0xC168000FE0000000, !dbg !21 + %1645 = fneg float %1644, !dbg !21 + %1646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i683 = icmp eq i32 %1646, 0, !dbg !21 + %1647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3FF7154760000000, float %1645) #6, !dbg !21 + %1648 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3FF7154760000000, float %1645) #6, !dbg !21 + %.0.i684 = select i1 %.not3.i683, float %1648, float %1647, !dbg !21 + %1649 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i685 = icmp eq i32 %1649, 0, !dbg !21 + %1650 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float 0x3E54AE0C00000000, float %.0.i684) #6, !dbg !21 + %1651 = tail call float @llvm.nvvm.fma.rn.f(float %395, float 0x3E54AE0C00000000, float %.0.i684) #6, !dbg !21 + %.01.i686 = select i1 %.not4.i685, float %1651, float %1650, !dbg !21 + %1652 = bitcast float %.04.i682 to i32, !dbg !21 + %1653 = shl i32 %1652, 23, !dbg !21 + %1654 = bitcast i32 %1653 to float, !dbg !21 + %1655 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i686) #6, !dbg !21 + %1656 = fmul float %1655, %1654, !dbg !21 + %1657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i687 = icmp eq i32 %1657, 0, !dbg !21 + %1658 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1659 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i688 = select i1 %.not.i687, float %1659, float %1658, !dbg !21 + %1660 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i689 = icmp eq i32 %1660, 0, !dbg !21 + %1661 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i688) #6, !dbg !21 + %1662 = tail call float @llvm.nvvm.saturate.f(float %.02.i688) #6, !dbg !21 + %.03.i690 = select i1 %.not1.i689, float %1662, float %1661, !dbg !21 + %1663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i691 = icmp eq i32 %1663, 0, !dbg !21 + %1664 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1665 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i692 = select i1 %.not2.i691, float %1665, float %1664, !dbg !21 + %1666 = fadd float %.04.i692, 0xC168000FE0000000, !dbg !21 + %1667 = fneg float %1666, !dbg !21 + %1668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i693 = icmp eq i32 %1668, 0, !dbg !21 + %1669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3FF7154760000000, float %1667) #6, !dbg !21 + %1670 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3FF7154760000000, float %1667) #6, !dbg !21 + %.0.i694 = select i1 %.not3.i693, float %1670, float %1669, !dbg !21 + %1671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i695 = icmp eq i32 %1671, 0, !dbg !21 + %1672 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %396, float 0x3E54AE0C00000000, float %.0.i694) #6, !dbg !21 + %1673 = tail call float @llvm.nvvm.fma.rn.f(float %396, float 0x3E54AE0C00000000, float %.0.i694) #6, !dbg !21 + %.01.i696 = select i1 %.not4.i695, float %1673, float %1672, !dbg !21 + %1674 = bitcast float %.04.i692 to i32, !dbg !21 + %1675 = shl i32 %1674, 23, !dbg !21 + %1676 = bitcast i32 %1675 to float, !dbg !21 + %1677 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i696) #6, !dbg !21 + %1678 = fmul float %1677, %1676, !dbg !21 + %1679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i697 = icmp eq i32 %1679, 0, !dbg !21 + %1680 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1681 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i698 = select i1 %.not.i697, float %1681, float %1680, !dbg !21 + %1682 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i699 = icmp eq i32 %1682, 0, !dbg !21 + %1683 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i698) #6, !dbg !21 + %1684 = tail call float @llvm.nvvm.saturate.f(float %.02.i698) #6, !dbg !21 + %.03.i700 = select i1 %.not1.i699, float %1684, float %1683, !dbg !21 + %1685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1686 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1687 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i703 = icmp eq i32 %1688, 0, !dbg !21 + %1689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i705 = icmp eq i32 %1689, 0, !dbg !21 + %1690 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i707 = icmp eq i32 %1690, 0, !dbg !21 + %1691 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1692 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i708 = select i1 %.not.i707, float %1692, float %1691, !dbg !21 + %1693 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i709 = icmp eq i32 %1693, 0, !dbg !21 + %1694 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i708) #6, !dbg !21 + %1695 = tail call float @llvm.nvvm.saturate.f(float %.02.i708) #6, !dbg !21 + %.03.i710 = select i1 %.not1.i709, float %1695, float %1694, !dbg !21 + %1696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1697 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1698 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i713 = icmp eq i32 %1699, 0, !dbg !21 + %1700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i715 = icmp eq i32 %1700, 0, !dbg !21 + %1701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i717 = icmp eq i32 %1701, 0, !dbg !21 + %1702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1703 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i718 = select i1 %.not.i717, float %1703, float %1702, !dbg !21 + %1704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i719 = icmp eq i32 %1704, 0, !dbg !21 + %1705 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i718) #6, !dbg !21 + %1706 = tail call float @llvm.nvvm.saturate.f(float %.02.i718) #6, !dbg !21 + %.03.i720 = select i1 %.not1.i719, float %1706, float %1705, !dbg !21 + %1707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1708 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1709 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i723 = icmp eq i32 %1710, 0, !dbg !21 + %1711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i725 = icmp eq i32 %1711, 0, !dbg !21 + %1712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i727 = icmp eq i32 %1712, 0, !dbg !21 + %1713 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1714 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i728 = select i1 %.not.i727, float %1714, float %1713, !dbg !21 + %1715 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i729 = icmp eq i32 %1715, 0, !dbg !21 + %1716 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i728) #6, !dbg !21 + %1717 = tail call float @llvm.nvvm.saturate.f(float %.02.i728) #6, !dbg !21 + %.03.i730 = select i1 %.not1.i729, float %1717, float %1716, !dbg !21 + %1718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1719 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1720 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i733 = icmp eq i32 %1721, 0, !dbg !21 + %1722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i735 = icmp eq i32 %1722, 0, !dbg !21 + %1723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i737 = icmp eq i32 %1723, 0, !dbg !21 + %1724 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1725 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i738 = select i1 %.not.i737, float %1725, float %1724, !dbg !21 + %1726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i739 = icmp eq i32 %1726, 0, !dbg !21 + %1727 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i738) #6, !dbg !21 + %1728 = tail call float @llvm.nvvm.saturate.f(float %.02.i738) #6, !dbg !21 + %.03.i740 = select i1 %.not1.i739, float %1728, float %1727, !dbg !21 + %1729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1730 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1731 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i743 = icmp eq i32 %1732, 0, !dbg !21 + %1733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i745 = icmp eq i32 %1733, 0, !dbg !21 + %1734 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i747 = icmp eq i32 %1734, 0, !dbg !21 + %1735 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1736 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i748 = select i1 %.not.i747, float %1736, float %1735, !dbg !21 + %1737 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i749 = icmp eq i32 %1737, 0, !dbg !21 + %1738 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i748) #6, !dbg !21 + %1739 = tail call float @llvm.nvvm.saturate.f(float %.02.i748) #6, !dbg !21 + %.03.i750 = select i1 %.not1.i749, float %1739, float %1738, !dbg !21 + %1740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %1741 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1742 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i753 = icmp eq i32 %1743, 0, !dbg !21 + %1744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i755 = icmp eq i32 %1744, 0, !dbg !21 + %1745 = fsub float %211, %211, !dbg !22 + %1746 = fsub float %212, %212, !dbg !22 + %1747 = fsub float %213, %213, !dbg !22 + %1748 = fsub float %214, %214, !dbg !22 + %1749 = fsub float %215, %215, !dbg !22 + %1750 = fsub float %216, %216, !dbg !22 + %1751 = fsub float %217, %217, !dbg !22 + %1752 = fsub float %218, %218, !dbg !22 + %1753 = fsub float %219, %219, !dbg !22 + %1754 = fsub float %220, %220, !dbg !22 + %1755 = fsub float %221, %221, !dbg !22 + %1756 = fsub float %222, %222, !dbg !22 + %1757 = fsub float %223, %223, !dbg !22 + %1758 = fsub float %224, %224, !dbg !22 + %1759 = fsub float %225, %225, !dbg !22 + %1760 = fsub float %226, %226, !dbg !22 + %1761 = fsub float %227, %227, !dbg !22 + %1762 = fsub float %228, %228, !dbg !22 + %1763 = fsub float %229, %229, !dbg !22 + %1764 = fsub float %230, %230, !dbg !22 + %1765 = fsub float %231, %231, !dbg !22 + %1766 = fsub float %232, %232, !dbg !22 + %1767 = fsub float %233, %233, !dbg !22 + %1768 = fsub float %234, %234, !dbg !22 + %1769 = fsub float %235, %235, !dbg !22 + %1770 = fsub float %236, %236, !dbg !22 + %1771 = fsub float %237, %237, !dbg !22 + %1772 = fsub float %238, %238, !dbg !22 + %1773 = fsub float %239, %239, !dbg !22 + %1774 = fsub float %240, %240, !dbg !22 + %1775 = fsub float %241, %241, !dbg !22 + %1776 = fsub float %242, %242, !dbg !22 + %1777 = fsub float %243, %243, !dbg !22 + %1778 = fsub float %244, %244, !dbg !22 + %1779 = fsub float %245, %245, !dbg !22 + %1780 = fsub float %246, %246, !dbg !22 + %1781 = fsub float %247, %247, !dbg !22 + %1782 = fsub float %248, %248, !dbg !22 + %1783 = fsub float %249, %249, !dbg !22 + %1784 = fsub float %250, %250, !dbg !22 + %1785 = fsub float %251, %251, !dbg !22 + %1786 = fsub float %252, %252, !dbg !22 + %1787 = fsub float %253, %253, !dbg !22 + %1788 = fsub float %254, %254, !dbg !22 + %1789 = fsub float %255, %255, !dbg !22 + %1790 = fsub float %256, %256, !dbg !22 + %1791 = fsub float %257, %257, !dbg !22 + %1792 = fsub float %258, %258, !dbg !22 + %1793 = fsub float %259, %259, !dbg !22 + %1794 = fsub float %260, %260, !dbg !22 + %1795 = fsub float %261, %261, !dbg !22 + %1796 = fsub float %262, %262, !dbg !22 + %1797 = fsub float %263, %263, !dbg !22 + %1798 = fsub float %264, %264, !dbg !22 + %1799 = fsub float %265, %265, !dbg !22 + %1800 = fsub float %266, %266, !dbg !22 + %1801 = fsub float %267, %267, !dbg !22 + %1802 = fsub float %268, %268, !dbg !22 + %1803 = fsub float %270, %270, !dbg !22 + %1804 = fsub float %272, %272, !dbg !22 + %1805 = fsub float %274, %274, !dbg !22 + %1806 = fsub float %276, %276, !dbg !22 + %1807 = fsub float %278, %278, !dbg !22 + %1808 = fsub float %280, %280, !dbg !22 + %1809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i757 = icmp eq i32 %1809, 0, !dbg !21 + %1810 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1811 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i758 = select i1 %.not.i757, float %1811, float %1810, !dbg !21 + %1812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i759 = icmp eq i32 %1812, 0, !dbg !21 + %1813 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i758) #6, !dbg !21 + %1814 = tail call float @llvm.nvvm.saturate.f(float %.02.i758) #6, !dbg !21 + %.03.i760 = select i1 %.not1.i759, float %1814, float %1813, !dbg !21 + %1815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i761 = icmp eq i32 %1815, 0, !dbg !21 + %1816 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1817 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i762 = select i1 %.not2.i761, float %1817, float %1816, !dbg !21 + %1818 = fadd float %.04.i762, 0xC168000FE0000000, !dbg !21 + %1819 = fneg float %1818, !dbg !21 + %1820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i763 = icmp eq i32 %1820, 0, !dbg !21 + %1821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3FF7154760000000, float %1819) #6, !dbg !21 + %1822 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3FF7154760000000, float %1819) #6, !dbg !21 + %.0.i764 = select i1 %.not3.i763, float %1822, float %1821, !dbg !21 + %1823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i765 = icmp eq i32 %1823, 0, !dbg !21 + %1824 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1745, float 0x3E54AE0C00000000, float %.0.i764) #6, !dbg !21 + %1825 = tail call float @llvm.nvvm.fma.rn.f(float %1745, float 0x3E54AE0C00000000, float %.0.i764) #6, !dbg !21 + %.01.i766 = select i1 %.not4.i765, float %1825, float %1824, !dbg !21 + %1826 = bitcast float %.04.i762 to i32, !dbg !21 + %1827 = shl i32 %1826, 23, !dbg !21 + %1828 = bitcast i32 %1827 to float, !dbg !21 + %1829 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i766) #6, !dbg !21 + %1830 = fmul float %1829, %1828, !dbg !21 + %1831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i767 = icmp eq i32 %1831, 0, !dbg !21 + %1832 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1833 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i768 = select i1 %.not.i767, float %1833, float %1832, !dbg !21 + %1834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i769 = icmp eq i32 %1834, 0, !dbg !21 + %1835 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i768) #6, !dbg !21 + %1836 = tail call float @llvm.nvvm.saturate.f(float %.02.i768) #6, !dbg !21 + %.03.i770 = select i1 %.not1.i769, float %1836, float %1835, !dbg !21 + %1837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i771 = icmp eq i32 %1837, 0, !dbg !21 + %1838 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1839 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i772 = select i1 %.not2.i771, float %1839, float %1838, !dbg !21 + %1840 = fadd float %.04.i772, 0xC168000FE0000000, !dbg !21 + %1841 = fneg float %1840, !dbg !21 + %1842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i773 = icmp eq i32 %1842, 0, !dbg !21 + %1843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3FF7154760000000, float %1841) #6, !dbg !21 + %1844 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3FF7154760000000, float %1841) #6, !dbg !21 + %.0.i774 = select i1 %.not3.i773, float %1844, float %1843, !dbg !21 + %1845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i775 = icmp eq i32 %1845, 0, !dbg !21 + %1846 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1746, float 0x3E54AE0C00000000, float %.0.i774) #6, !dbg !21 + %1847 = tail call float @llvm.nvvm.fma.rn.f(float %1746, float 0x3E54AE0C00000000, float %.0.i774) #6, !dbg !21 + %.01.i776 = select i1 %.not4.i775, float %1847, float %1846, !dbg !21 + %1848 = bitcast float %.04.i772 to i32, !dbg !21 + %1849 = shl i32 %1848, 23, !dbg !21 + %1850 = bitcast i32 %1849 to float, !dbg !21 + %1851 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i776) #6, !dbg !21 + %1852 = fmul float %1851, %1850, !dbg !21 + %1853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i777 = icmp eq i32 %1853, 0, !dbg !21 + %1854 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1855 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i778 = select i1 %.not.i777, float %1855, float %1854, !dbg !21 + %1856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i779 = icmp eq i32 %1856, 0, !dbg !21 + %1857 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i778) #6, !dbg !21 + %1858 = tail call float @llvm.nvvm.saturate.f(float %.02.i778) #6, !dbg !21 + %.03.i780 = select i1 %.not1.i779, float %1858, float %1857, !dbg !21 + %1859 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i781 = icmp eq i32 %1859, 0, !dbg !21 + %1860 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1861 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i782 = select i1 %.not2.i781, float %1861, float %1860, !dbg !21 + %1862 = fadd float %.04.i782, 0xC168000FE0000000, !dbg !21 + %1863 = fneg float %1862, !dbg !21 + %1864 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i783 = icmp eq i32 %1864, 0, !dbg !21 + %1865 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3FF7154760000000, float %1863) #6, !dbg !21 + %1866 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3FF7154760000000, float %1863) #6, !dbg !21 + %.0.i784 = select i1 %.not3.i783, float %1866, float %1865, !dbg !21 + %1867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i785 = icmp eq i32 %1867, 0, !dbg !21 + %1868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1747, float 0x3E54AE0C00000000, float %.0.i784) #6, !dbg !21 + %1869 = tail call float @llvm.nvvm.fma.rn.f(float %1747, float 0x3E54AE0C00000000, float %.0.i784) #6, !dbg !21 + %.01.i786 = select i1 %.not4.i785, float %1869, float %1868, !dbg !21 + %1870 = bitcast float %.04.i782 to i32, !dbg !21 + %1871 = shl i32 %1870, 23, !dbg !21 + %1872 = bitcast i32 %1871 to float, !dbg !21 + %1873 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i786) #6, !dbg !21 + %1874 = fmul float %1873, %1872, !dbg !21 + %1875 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i787 = icmp eq i32 %1875, 0, !dbg !21 + %1876 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1877 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i788 = select i1 %.not.i787, float %1877, float %1876, !dbg !21 + %1878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i789 = icmp eq i32 %1878, 0, !dbg !21 + %1879 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i788) #6, !dbg !21 + %1880 = tail call float @llvm.nvvm.saturate.f(float %.02.i788) #6, !dbg !21 + %.03.i790 = select i1 %.not1.i789, float %1880, float %1879, !dbg !21 + %1881 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i791 = icmp eq i32 %1881, 0, !dbg !21 + %1882 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1883 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i792 = select i1 %.not2.i791, float %1883, float %1882, !dbg !21 + %1884 = fadd float %.04.i792, 0xC168000FE0000000, !dbg !21 + %1885 = fneg float %1884, !dbg !21 + %1886 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i793 = icmp eq i32 %1886, 0, !dbg !21 + %1887 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3FF7154760000000, float %1885) #6, !dbg !21 + %1888 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3FF7154760000000, float %1885) #6, !dbg !21 + %.0.i794 = select i1 %.not3.i793, float %1888, float %1887, !dbg !21 + %1889 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i795 = icmp eq i32 %1889, 0, !dbg !21 + %1890 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1748, float 0x3E54AE0C00000000, float %.0.i794) #6, !dbg !21 + %1891 = tail call float @llvm.nvvm.fma.rn.f(float %1748, float 0x3E54AE0C00000000, float %.0.i794) #6, !dbg !21 + %.01.i796 = select i1 %.not4.i795, float %1891, float %1890, !dbg !21 + %1892 = bitcast float %.04.i792 to i32, !dbg !21 + %1893 = shl i32 %1892, 23, !dbg !21 + %1894 = bitcast i32 %1893 to float, !dbg !21 + %1895 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i796) #6, !dbg !21 + %1896 = fmul float %1895, %1894, !dbg !21 + %1897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i797 = icmp eq i32 %1897, 0, !dbg !21 + %1898 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1899 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i798 = select i1 %.not.i797, float %1899, float %1898, !dbg !21 + %1900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i799 = icmp eq i32 %1900, 0, !dbg !21 + %1901 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i798) #6, !dbg !21 + %1902 = tail call float @llvm.nvvm.saturate.f(float %.02.i798) #6, !dbg !21 + %.03.i800 = select i1 %.not1.i799, float %1902, float %1901, !dbg !21 + %1903 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i801 = icmp eq i32 %1903, 0, !dbg !21 + %1904 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1905 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i802 = select i1 %.not2.i801, float %1905, float %1904, !dbg !21 + %1906 = fadd float %.04.i802, 0xC168000FE0000000, !dbg !21 + %1907 = fneg float %1906, !dbg !21 + %1908 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i803 = icmp eq i32 %1908, 0, !dbg !21 + %1909 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3FF7154760000000, float %1907) #6, !dbg !21 + %1910 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3FF7154760000000, float %1907) #6, !dbg !21 + %.0.i804 = select i1 %.not3.i803, float %1910, float %1909, !dbg !21 + %1911 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i805 = icmp eq i32 %1911, 0, !dbg !21 + %1912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1749, float 0x3E54AE0C00000000, float %.0.i804) #6, !dbg !21 + %1913 = tail call float @llvm.nvvm.fma.rn.f(float %1749, float 0x3E54AE0C00000000, float %.0.i804) #6, !dbg !21 + %.01.i806 = select i1 %.not4.i805, float %1913, float %1912, !dbg !21 + %1914 = bitcast float %.04.i802 to i32, !dbg !21 + %1915 = shl i32 %1914, 23, !dbg !21 + %1916 = bitcast i32 %1915 to float, !dbg !21 + %1917 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i806) #6, !dbg !21 + %1918 = fmul float %1917, %1916, !dbg !21 + %1919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i807 = icmp eq i32 %1919, 0, !dbg !21 + %1920 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1921 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i808 = select i1 %.not.i807, float %1921, float %1920, !dbg !21 + %1922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i809 = icmp eq i32 %1922, 0, !dbg !21 + %1923 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i808) #6, !dbg !21 + %1924 = tail call float @llvm.nvvm.saturate.f(float %.02.i808) #6, !dbg !21 + %.03.i810 = select i1 %.not1.i809, float %1924, float %1923, !dbg !21 + %1925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i811 = icmp eq i32 %1925, 0, !dbg !21 + %1926 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1927 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i812 = select i1 %.not2.i811, float %1927, float %1926, !dbg !21 + %1928 = fadd float %.04.i812, 0xC168000FE0000000, !dbg !21 + %1929 = fneg float %1928, !dbg !21 + %1930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i813 = icmp eq i32 %1930, 0, !dbg !21 + %1931 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3FF7154760000000, float %1929) #6, !dbg !21 + %1932 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3FF7154760000000, float %1929) #6, !dbg !21 + %.0.i814 = select i1 %.not3.i813, float %1932, float %1931, !dbg !21 + %1933 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i815 = icmp eq i32 %1933, 0, !dbg !21 + %1934 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1750, float 0x3E54AE0C00000000, float %.0.i814) #6, !dbg !21 + %1935 = tail call float @llvm.nvvm.fma.rn.f(float %1750, float 0x3E54AE0C00000000, float %.0.i814) #6, !dbg !21 + %.01.i816 = select i1 %.not4.i815, float %1935, float %1934, !dbg !21 + %1936 = bitcast float %.04.i812 to i32, !dbg !21 + %1937 = shl i32 %1936, 23, !dbg !21 + %1938 = bitcast i32 %1937 to float, !dbg !21 + %1939 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i816) #6, !dbg !21 + %1940 = fmul float %1939, %1938, !dbg !21 + %1941 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i817 = icmp eq i32 %1941, 0, !dbg !21 + %1942 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1943 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i818 = select i1 %.not.i817, float %1943, float %1942, !dbg !21 + %1944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i819 = icmp eq i32 %1944, 0, !dbg !21 + %1945 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i818) #6, !dbg !21 + %1946 = tail call float @llvm.nvvm.saturate.f(float %.02.i818) #6, !dbg !21 + %.03.i820 = select i1 %.not1.i819, float %1946, float %1945, !dbg !21 + %1947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i821 = icmp eq i32 %1947, 0, !dbg !21 + %1948 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1949 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i822 = select i1 %.not2.i821, float %1949, float %1948, !dbg !21 + %1950 = fadd float %.04.i822, 0xC168000FE0000000, !dbg !21 + %1951 = fneg float %1950, !dbg !21 + %1952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i823 = icmp eq i32 %1952, 0, !dbg !21 + %1953 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3FF7154760000000, float %1951) #6, !dbg !21 + %1954 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3FF7154760000000, float %1951) #6, !dbg !21 + %.0.i824 = select i1 %.not3.i823, float %1954, float %1953, !dbg !21 + %1955 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i825 = icmp eq i32 %1955, 0, !dbg !21 + %1956 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1751, float 0x3E54AE0C00000000, float %.0.i824) #6, !dbg !21 + %1957 = tail call float @llvm.nvvm.fma.rn.f(float %1751, float 0x3E54AE0C00000000, float %.0.i824) #6, !dbg !21 + %.01.i826 = select i1 %.not4.i825, float %1957, float %1956, !dbg !21 + %1958 = bitcast float %.04.i822 to i32, !dbg !21 + %1959 = shl i32 %1958, 23, !dbg !21 + %1960 = bitcast i32 %1959 to float, !dbg !21 + %1961 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i826) #6, !dbg !21 + %1962 = fmul float %1961, %1960, !dbg !21 + %1963 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i827 = icmp eq i32 %1963, 0, !dbg !21 + %1964 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1965 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i828 = select i1 %.not.i827, float %1965, float %1964, !dbg !21 + %1966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i829 = icmp eq i32 %1966, 0, !dbg !21 + %1967 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i828) #6, !dbg !21 + %1968 = tail call float @llvm.nvvm.saturate.f(float %.02.i828) #6, !dbg !21 + %.03.i830 = select i1 %.not1.i829, float %1968, float %1967, !dbg !21 + %1969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i831 = icmp eq i32 %1969, 0, !dbg !21 + %1970 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1971 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i832 = select i1 %.not2.i831, float %1971, float %1970, !dbg !21 + %1972 = fadd float %.04.i832, 0xC168000FE0000000, !dbg !21 + %1973 = fneg float %1972, !dbg !21 + %1974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i833 = icmp eq i32 %1974, 0, !dbg !21 + %1975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3FF7154760000000, float %1973) #6, !dbg !21 + %1976 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3FF7154760000000, float %1973) #6, !dbg !21 + %.0.i834 = select i1 %.not3.i833, float %1976, float %1975, !dbg !21 + %1977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i835 = icmp eq i32 %1977, 0, !dbg !21 + %1978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1752, float 0x3E54AE0C00000000, float %.0.i834) #6, !dbg !21 + %1979 = tail call float @llvm.nvvm.fma.rn.f(float %1752, float 0x3E54AE0C00000000, float %.0.i834) #6, !dbg !21 + %.01.i836 = select i1 %.not4.i835, float %1979, float %1978, !dbg !21 + %1980 = bitcast float %.04.i832 to i32, !dbg !21 + %1981 = shl i32 %1980, 23, !dbg !21 + %1982 = bitcast i32 %1981 to float, !dbg !21 + %1983 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i836) #6, !dbg !21 + %1984 = fmul float %1983, %1982, !dbg !21 + %1985 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i837 = icmp eq i32 %1985, 0, !dbg !21 + %1986 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %1987 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i838 = select i1 %.not.i837, float %1987, float %1986, !dbg !21 + %1988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i839 = icmp eq i32 %1988, 0, !dbg !21 + %1989 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i838) #6, !dbg !21 + %1990 = tail call float @llvm.nvvm.saturate.f(float %.02.i838) #6, !dbg !21 + %.03.i840 = select i1 %.not1.i839, float %1990, float %1989, !dbg !21 + %1991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i841 = icmp eq i32 %1991, 0, !dbg !21 + %1992 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %1993 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i842 = select i1 %.not2.i841, float %1993, float %1992, !dbg !21 + %1994 = fadd float %.04.i842, 0xC168000FE0000000, !dbg !21 + %1995 = fneg float %1994, !dbg !21 + %1996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i843 = icmp eq i32 %1996, 0, !dbg !21 + %1997 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3FF7154760000000, float %1995) #6, !dbg !21 + %1998 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3FF7154760000000, float %1995) #6, !dbg !21 + %.0.i844 = select i1 %.not3.i843, float %1998, float %1997, !dbg !21 + %1999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i845 = icmp eq i32 %1999, 0, !dbg !21 + %2000 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1753, float 0x3E54AE0C00000000, float %.0.i844) #6, !dbg !21 + %2001 = tail call float @llvm.nvvm.fma.rn.f(float %1753, float 0x3E54AE0C00000000, float %.0.i844) #6, !dbg !21 + %.01.i846 = select i1 %.not4.i845, float %2001, float %2000, !dbg !21 + %2002 = bitcast float %.04.i842 to i32, !dbg !21 + %2003 = shl i32 %2002, 23, !dbg !21 + %2004 = bitcast i32 %2003 to float, !dbg !21 + %2005 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i846) #6, !dbg !21 + %2006 = fmul float %2005, %2004, !dbg !21 + %2007 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i847 = icmp eq i32 %2007, 0, !dbg !21 + %2008 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2009 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i848 = select i1 %.not.i847, float %2009, float %2008, !dbg !21 + %2010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i849 = icmp eq i32 %2010, 0, !dbg !21 + %2011 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i848) #6, !dbg !21 + %2012 = tail call float @llvm.nvvm.saturate.f(float %.02.i848) #6, !dbg !21 + %.03.i850 = select i1 %.not1.i849, float %2012, float %2011, !dbg !21 + %2013 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i851 = icmp eq i32 %2013, 0, !dbg !21 + %2014 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2015 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i852 = select i1 %.not2.i851, float %2015, float %2014, !dbg !21 + %2016 = fadd float %.04.i852, 0xC168000FE0000000, !dbg !21 + %2017 = fneg float %2016, !dbg !21 + %2018 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i853 = icmp eq i32 %2018, 0, !dbg !21 + %2019 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3FF7154760000000, float %2017) #6, !dbg !21 + %2020 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3FF7154760000000, float %2017) #6, !dbg !21 + %.0.i854 = select i1 %.not3.i853, float %2020, float %2019, !dbg !21 + %2021 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i855 = icmp eq i32 %2021, 0, !dbg !21 + %2022 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1754, float 0x3E54AE0C00000000, float %.0.i854) #6, !dbg !21 + %2023 = tail call float @llvm.nvvm.fma.rn.f(float %1754, float 0x3E54AE0C00000000, float %.0.i854) #6, !dbg !21 + %.01.i856 = select i1 %.not4.i855, float %2023, float %2022, !dbg !21 + %2024 = bitcast float %.04.i852 to i32, !dbg !21 + %2025 = shl i32 %2024, 23, !dbg !21 + %2026 = bitcast i32 %2025 to float, !dbg !21 + %2027 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i856) #6, !dbg !21 + %2028 = fmul float %2027, %2026, !dbg !21 + %2029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i857 = icmp eq i32 %2029, 0, !dbg !21 + %2030 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2031 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i858 = select i1 %.not.i857, float %2031, float %2030, !dbg !21 + %2032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i859 = icmp eq i32 %2032, 0, !dbg !21 + %2033 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i858) #6, !dbg !21 + %2034 = tail call float @llvm.nvvm.saturate.f(float %.02.i858) #6, !dbg !21 + %.03.i860 = select i1 %.not1.i859, float %2034, float %2033, !dbg !21 + %2035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i861 = icmp eq i32 %2035, 0, !dbg !21 + %2036 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2037 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i862 = select i1 %.not2.i861, float %2037, float %2036, !dbg !21 + %2038 = fadd float %.04.i862, 0xC168000FE0000000, !dbg !21 + %2039 = fneg float %2038, !dbg !21 + %2040 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i863 = icmp eq i32 %2040, 0, !dbg !21 + %2041 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3FF7154760000000, float %2039) #6, !dbg !21 + %2042 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3FF7154760000000, float %2039) #6, !dbg !21 + %.0.i864 = select i1 %.not3.i863, float %2042, float %2041, !dbg !21 + %2043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i865 = icmp eq i32 %2043, 0, !dbg !21 + %2044 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1755, float 0x3E54AE0C00000000, float %.0.i864) #6, !dbg !21 + %2045 = tail call float @llvm.nvvm.fma.rn.f(float %1755, float 0x3E54AE0C00000000, float %.0.i864) #6, !dbg !21 + %.01.i866 = select i1 %.not4.i865, float %2045, float %2044, !dbg !21 + %2046 = bitcast float %.04.i862 to i32, !dbg !21 + %2047 = shl i32 %2046, 23, !dbg !21 + %2048 = bitcast i32 %2047 to float, !dbg !21 + %2049 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i866) #6, !dbg !21 + %2050 = fmul float %2049, %2048, !dbg !21 + %2051 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i867 = icmp eq i32 %2051, 0, !dbg !21 + %2052 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2053 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i868 = select i1 %.not.i867, float %2053, float %2052, !dbg !21 + %2054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i869 = icmp eq i32 %2054, 0, !dbg !21 + %2055 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i868) #6, !dbg !21 + %2056 = tail call float @llvm.nvvm.saturate.f(float %.02.i868) #6, !dbg !21 + %.03.i870 = select i1 %.not1.i869, float %2056, float %2055, !dbg !21 + %2057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i871 = icmp eq i32 %2057, 0, !dbg !21 + %2058 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2059 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i872 = select i1 %.not2.i871, float %2059, float %2058, !dbg !21 + %2060 = fadd float %.04.i872, 0xC168000FE0000000, !dbg !21 + %2061 = fneg float %2060, !dbg !21 + %2062 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i873 = icmp eq i32 %2062, 0, !dbg !21 + %2063 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3FF7154760000000, float %2061) #6, !dbg !21 + %2064 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3FF7154760000000, float %2061) #6, !dbg !21 + %.0.i874 = select i1 %.not3.i873, float %2064, float %2063, !dbg !21 + %2065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i875 = icmp eq i32 %2065, 0, !dbg !21 + %2066 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1756, float 0x3E54AE0C00000000, float %.0.i874) #6, !dbg !21 + %2067 = tail call float @llvm.nvvm.fma.rn.f(float %1756, float 0x3E54AE0C00000000, float %.0.i874) #6, !dbg !21 + %.01.i876 = select i1 %.not4.i875, float %2067, float %2066, !dbg !21 + %2068 = bitcast float %.04.i872 to i32, !dbg !21 + %2069 = shl i32 %2068, 23, !dbg !21 + %2070 = bitcast i32 %2069 to float, !dbg !21 + %2071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i876) #6, !dbg !21 + %2072 = fmul float %2071, %2070, !dbg !21 + %2073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i877 = icmp eq i32 %2073, 0, !dbg !21 + %2074 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2075 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i878 = select i1 %.not.i877, float %2075, float %2074, !dbg !21 + %2076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i879 = icmp eq i32 %2076, 0, !dbg !21 + %2077 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i878) #6, !dbg !21 + %2078 = tail call float @llvm.nvvm.saturate.f(float %.02.i878) #6, !dbg !21 + %.03.i880 = select i1 %.not1.i879, float %2078, float %2077, !dbg !21 + %2079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i881 = icmp eq i32 %2079, 0, !dbg !21 + %2080 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2081 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i882 = select i1 %.not2.i881, float %2081, float %2080, !dbg !21 + %2082 = fadd float %.04.i882, 0xC168000FE0000000, !dbg !21 + %2083 = fneg float %2082, !dbg !21 + %2084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i883 = icmp eq i32 %2084, 0, !dbg !21 + %2085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3FF7154760000000, float %2083) #6, !dbg !21 + %2086 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3FF7154760000000, float %2083) #6, !dbg !21 + %.0.i884 = select i1 %.not3.i883, float %2086, float %2085, !dbg !21 + %2087 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i885 = icmp eq i32 %2087, 0, !dbg !21 + %2088 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1757, float 0x3E54AE0C00000000, float %.0.i884) #6, !dbg !21 + %2089 = tail call float @llvm.nvvm.fma.rn.f(float %1757, float 0x3E54AE0C00000000, float %.0.i884) #6, !dbg !21 + %.01.i886 = select i1 %.not4.i885, float %2089, float %2088, !dbg !21 + %2090 = bitcast float %.04.i882 to i32, !dbg !21 + %2091 = shl i32 %2090, 23, !dbg !21 + %2092 = bitcast i32 %2091 to float, !dbg !21 + %2093 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i886) #6, !dbg !21 + %2094 = fmul float %2093, %2092, !dbg !21 + %2095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i887 = icmp eq i32 %2095, 0, !dbg !21 + %2096 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2097 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i888 = select i1 %.not.i887, float %2097, float %2096, !dbg !21 + %2098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i889 = icmp eq i32 %2098, 0, !dbg !21 + %2099 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i888) #6, !dbg !21 + %2100 = tail call float @llvm.nvvm.saturate.f(float %.02.i888) #6, !dbg !21 + %.03.i890 = select i1 %.not1.i889, float %2100, float %2099, !dbg !21 + %2101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i891 = icmp eq i32 %2101, 0, !dbg !21 + %2102 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2103 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i892 = select i1 %.not2.i891, float %2103, float %2102, !dbg !21 + %2104 = fadd float %.04.i892, 0xC168000FE0000000, !dbg !21 + %2105 = fneg float %2104, !dbg !21 + %2106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i893 = icmp eq i32 %2106, 0, !dbg !21 + %2107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3FF7154760000000, float %2105) #6, !dbg !21 + %2108 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3FF7154760000000, float %2105) #6, !dbg !21 + %.0.i894 = select i1 %.not3.i893, float %2108, float %2107, !dbg !21 + %2109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i895 = icmp eq i32 %2109, 0, !dbg !21 + %2110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1758, float 0x3E54AE0C00000000, float %.0.i894) #6, !dbg !21 + %2111 = tail call float @llvm.nvvm.fma.rn.f(float %1758, float 0x3E54AE0C00000000, float %.0.i894) #6, !dbg !21 + %.01.i896 = select i1 %.not4.i895, float %2111, float %2110, !dbg !21 + %2112 = bitcast float %.04.i892 to i32, !dbg !21 + %2113 = shl i32 %2112, 23, !dbg !21 + %2114 = bitcast i32 %2113 to float, !dbg !21 + %2115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i896) #6, !dbg !21 + %2116 = fmul float %2115, %2114, !dbg !21 + %2117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i897 = icmp eq i32 %2117, 0, !dbg !21 + %2118 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2119 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i898 = select i1 %.not.i897, float %2119, float %2118, !dbg !21 + %2120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i899 = icmp eq i32 %2120, 0, !dbg !21 + %2121 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i898) #6, !dbg !21 + %2122 = tail call float @llvm.nvvm.saturate.f(float %.02.i898) #6, !dbg !21 + %.03.i900 = select i1 %.not1.i899, float %2122, float %2121, !dbg !21 + %2123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i901 = icmp eq i32 %2123, 0, !dbg !21 + %2124 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2125 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i902 = select i1 %.not2.i901, float %2125, float %2124, !dbg !21 + %2126 = fadd float %.04.i902, 0xC168000FE0000000, !dbg !21 + %2127 = fneg float %2126, !dbg !21 + %2128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i903 = icmp eq i32 %2128, 0, !dbg !21 + %2129 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3FF7154760000000, float %2127) #6, !dbg !21 + %2130 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3FF7154760000000, float %2127) #6, !dbg !21 + %.0.i904 = select i1 %.not3.i903, float %2130, float %2129, !dbg !21 + %2131 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i905 = icmp eq i32 %2131, 0, !dbg !21 + %2132 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1759, float 0x3E54AE0C00000000, float %.0.i904) #6, !dbg !21 + %2133 = tail call float @llvm.nvvm.fma.rn.f(float %1759, float 0x3E54AE0C00000000, float %.0.i904) #6, !dbg !21 + %.01.i906 = select i1 %.not4.i905, float %2133, float %2132, !dbg !21 + %2134 = bitcast float %.04.i902 to i32, !dbg !21 + %2135 = shl i32 %2134, 23, !dbg !21 + %2136 = bitcast i32 %2135 to float, !dbg !21 + %2137 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i906) #6, !dbg !21 + %2138 = fmul float %2137, %2136, !dbg !21 + %2139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i907 = icmp eq i32 %2139, 0, !dbg !21 + %2140 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2141 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i908 = select i1 %.not.i907, float %2141, float %2140, !dbg !21 + %2142 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i909 = icmp eq i32 %2142, 0, !dbg !21 + %2143 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i908) #6, !dbg !21 + %2144 = tail call float @llvm.nvvm.saturate.f(float %.02.i908) #6, !dbg !21 + %.03.i910 = select i1 %.not1.i909, float %2144, float %2143, !dbg !21 + %2145 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i911 = icmp eq i32 %2145, 0, !dbg !21 + %2146 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2147 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i912 = select i1 %.not2.i911, float %2147, float %2146, !dbg !21 + %2148 = fadd float %.04.i912, 0xC168000FE0000000, !dbg !21 + %2149 = fneg float %2148, !dbg !21 + %2150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i913 = icmp eq i32 %2150, 0, !dbg !21 + %2151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3FF7154760000000, float %2149) #6, !dbg !21 + %2152 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3FF7154760000000, float %2149) #6, !dbg !21 + %.0.i914 = select i1 %.not3.i913, float %2152, float %2151, !dbg !21 + %2153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i915 = icmp eq i32 %2153, 0, !dbg !21 + %2154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1760, float 0x3E54AE0C00000000, float %.0.i914) #6, !dbg !21 + %2155 = tail call float @llvm.nvvm.fma.rn.f(float %1760, float 0x3E54AE0C00000000, float %.0.i914) #6, !dbg !21 + %.01.i916 = select i1 %.not4.i915, float %2155, float %2154, !dbg !21 + %2156 = bitcast float %.04.i912 to i32, !dbg !21 + %2157 = shl i32 %2156, 23, !dbg !21 + %2158 = bitcast i32 %2157 to float, !dbg !21 + %2159 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i916) #6, !dbg !21 + %2160 = fmul float %2159, %2158, !dbg !21 + %2161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i917 = icmp eq i32 %2161, 0, !dbg !21 + %2162 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2163 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i918 = select i1 %.not.i917, float %2163, float %2162, !dbg !21 + %2164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i919 = icmp eq i32 %2164, 0, !dbg !21 + %2165 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i918) #6, !dbg !21 + %2166 = tail call float @llvm.nvvm.saturate.f(float %.02.i918) #6, !dbg !21 + %.03.i920 = select i1 %.not1.i919, float %2166, float %2165, !dbg !21 + %2167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i921 = icmp eq i32 %2167, 0, !dbg !21 + %2168 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2169 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i922 = select i1 %.not2.i921, float %2169, float %2168, !dbg !21 + %2170 = fadd float %.04.i922, 0xC168000FE0000000, !dbg !21 + %2171 = fneg float %2170, !dbg !21 + %2172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i923 = icmp eq i32 %2172, 0, !dbg !21 + %2173 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3FF7154760000000, float %2171) #6, !dbg !21 + %2174 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3FF7154760000000, float %2171) #6, !dbg !21 + %.0.i924 = select i1 %.not3.i923, float %2174, float %2173, !dbg !21 + %2175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i925 = icmp eq i32 %2175, 0, !dbg !21 + %2176 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1761, float 0x3E54AE0C00000000, float %.0.i924) #6, !dbg !21 + %2177 = tail call float @llvm.nvvm.fma.rn.f(float %1761, float 0x3E54AE0C00000000, float %.0.i924) #6, !dbg !21 + %.01.i926 = select i1 %.not4.i925, float %2177, float %2176, !dbg !21 + %2178 = bitcast float %.04.i922 to i32, !dbg !21 + %2179 = shl i32 %2178, 23, !dbg !21 + %2180 = bitcast i32 %2179 to float, !dbg !21 + %2181 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i926) #6, !dbg !21 + %2182 = fmul float %2181, %2180, !dbg !21 + %2183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i927 = icmp eq i32 %2183, 0, !dbg !21 + %2184 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2185 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i928 = select i1 %.not.i927, float %2185, float %2184, !dbg !21 + %2186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i929 = icmp eq i32 %2186, 0, !dbg !21 + %2187 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i928) #6, !dbg !21 + %2188 = tail call float @llvm.nvvm.saturate.f(float %.02.i928) #6, !dbg !21 + %.03.i930 = select i1 %.not1.i929, float %2188, float %2187, !dbg !21 + %2189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i931 = icmp eq i32 %2189, 0, !dbg !21 + %2190 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2191 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i932 = select i1 %.not2.i931, float %2191, float %2190, !dbg !21 + %2192 = fadd float %.04.i932, 0xC168000FE0000000, !dbg !21 + %2193 = fneg float %2192, !dbg !21 + %2194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i933 = icmp eq i32 %2194, 0, !dbg !21 + %2195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3FF7154760000000, float %2193) #6, !dbg !21 + %2196 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3FF7154760000000, float %2193) #6, !dbg !21 + %.0.i934 = select i1 %.not3.i933, float %2196, float %2195, !dbg !21 + %2197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i935 = icmp eq i32 %2197, 0, !dbg !21 + %2198 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1762, float 0x3E54AE0C00000000, float %.0.i934) #6, !dbg !21 + %2199 = tail call float @llvm.nvvm.fma.rn.f(float %1762, float 0x3E54AE0C00000000, float %.0.i934) #6, !dbg !21 + %.01.i936 = select i1 %.not4.i935, float %2199, float %2198, !dbg !21 + %2200 = bitcast float %.04.i932 to i32, !dbg !21 + %2201 = shl i32 %2200, 23, !dbg !21 + %2202 = bitcast i32 %2201 to float, !dbg !21 + %2203 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i936) #6, !dbg !21 + %2204 = fmul float %2203, %2202, !dbg !21 + %2205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i937 = icmp eq i32 %2205, 0, !dbg !21 + %2206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2207 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i938 = select i1 %.not.i937, float %2207, float %2206, !dbg !21 + %2208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i939 = icmp eq i32 %2208, 0, !dbg !21 + %2209 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i938) #6, !dbg !21 + %2210 = tail call float @llvm.nvvm.saturate.f(float %.02.i938) #6, !dbg !21 + %.03.i940 = select i1 %.not1.i939, float %2210, float %2209, !dbg !21 + %2211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i941 = icmp eq i32 %2211, 0, !dbg !21 + %2212 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2213 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i942 = select i1 %.not2.i941, float %2213, float %2212, !dbg !21 + %2214 = fadd float %.04.i942, 0xC168000FE0000000, !dbg !21 + %2215 = fneg float %2214, !dbg !21 + %2216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i943 = icmp eq i32 %2216, 0, !dbg !21 + %2217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3FF7154760000000, float %2215) #6, !dbg !21 + %2218 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3FF7154760000000, float %2215) #6, !dbg !21 + %.0.i944 = select i1 %.not3.i943, float %2218, float %2217, !dbg !21 + %2219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i945 = icmp eq i32 %2219, 0, !dbg !21 + %2220 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1763, float 0x3E54AE0C00000000, float %.0.i944) #6, !dbg !21 + %2221 = tail call float @llvm.nvvm.fma.rn.f(float %1763, float 0x3E54AE0C00000000, float %.0.i944) #6, !dbg !21 + %.01.i946 = select i1 %.not4.i945, float %2221, float %2220, !dbg !21 + %2222 = bitcast float %.04.i942 to i32, !dbg !21 + %2223 = shl i32 %2222, 23, !dbg !21 + %2224 = bitcast i32 %2223 to float, !dbg !21 + %2225 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i946) #6, !dbg !21 + %2226 = fmul float %2225, %2224, !dbg !21 + %2227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i947 = icmp eq i32 %2227, 0, !dbg !21 + %2228 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2229 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i948 = select i1 %.not.i947, float %2229, float %2228, !dbg !21 + %2230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i949 = icmp eq i32 %2230, 0, !dbg !21 + %2231 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i948) #6, !dbg !21 + %2232 = tail call float @llvm.nvvm.saturate.f(float %.02.i948) #6, !dbg !21 + %.03.i950 = select i1 %.not1.i949, float %2232, float %2231, !dbg !21 + %2233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i951 = icmp eq i32 %2233, 0, !dbg !21 + %2234 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2235 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i952 = select i1 %.not2.i951, float %2235, float %2234, !dbg !21 + %2236 = fadd float %.04.i952, 0xC168000FE0000000, !dbg !21 + %2237 = fneg float %2236, !dbg !21 + %2238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i953 = icmp eq i32 %2238, 0, !dbg !21 + %2239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3FF7154760000000, float %2237) #6, !dbg !21 + %2240 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3FF7154760000000, float %2237) #6, !dbg !21 + %.0.i954 = select i1 %.not3.i953, float %2240, float %2239, !dbg !21 + %2241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i955 = icmp eq i32 %2241, 0, !dbg !21 + %2242 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1764, float 0x3E54AE0C00000000, float %.0.i954) #6, !dbg !21 + %2243 = tail call float @llvm.nvvm.fma.rn.f(float %1764, float 0x3E54AE0C00000000, float %.0.i954) #6, !dbg !21 + %.01.i956 = select i1 %.not4.i955, float %2243, float %2242, !dbg !21 + %2244 = bitcast float %.04.i952 to i32, !dbg !21 + %2245 = shl i32 %2244, 23, !dbg !21 + %2246 = bitcast i32 %2245 to float, !dbg !21 + %2247 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i956) #6, !dbg !21 + %2248 = fmul float %2247, %2246, !dbg !21 + %2249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i957 = icmp eq i32 %2249, 0, !dbg !21 + %2250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2251 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i958 = select i1 %.not.i957, float %2251, float %2250, !dbg !21 + %2252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i959 = icmp eq i32 %2252, 0, !dbg !21 + %2253 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i958) #6, !dbg !21 + %2254 = tail call float @llvm.nvvm.saturate.f(float %.02.i958) #6, !dbg !21 + %.03.i960 = select i1 %.not1.i959, float %2254, float %2253, !dbg !21 + %2255 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i961 = icmp eq i32 %2255, 0, !dbg !21 + %2256 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2257 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i962 = select i1 %.not2.i961, float %2257, float %2256, !dbg !21 + %2258 = fadd float %.04.i962, 0xC168000FE0000000, !dbg !21 + %2259 = fneg float %2258, !dbg !21 + %2260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i963 = icmp eq i32 %2260, 0, !dbg !21 + %2261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3FF7154760000000, float %2259) #6, !dbg !21 + %2262 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3FF7154760000000, float %2259) #6, !dbg !21 + %.0.i964 = select i1 %.not3.i963, float %2262, float %2261, !dbg !21 + %2263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i965 = icmp eq i32 %2263, 0, !dbg !21 + %2264 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1765, float 0x3E54AE0C00000000, float %.0.i964) #6, !dbg !21 + %2265 = tail call float @llvm.nvvm.fma.rn.f(float %1765, float 0x3E54AE0C00000000, float %.0.i964) #6, !dbg !21 + %.01.i966 = select i1 %.not4.i965, float %2265, float %2264, !dbg !21 + %2266 = bitcast float %.04.i962 to i32, !dbg !21 + %2267 = shl i32 %2266, 23, !dbg !21 + %2268 = bitcast i32 %2267 to float, !dbg !21 + %2269 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i966) #6, !dbg !21 + %2270 = fmul float %2269, %2268, !dbg !21 + %2271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i967 = icmp eq i32 %2271, 0, !dbg !21 + %2272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2273 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i968 = select i1 %.not.i967, float %2273, float %2272, !dbg !21 + %2274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i969 = icmp eq i32 %2274, 0, !dbg !21 + %2275 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i968) #6, !dbg !21 + %2276 = tail call float @llvm.nvvm.saturate.f(float %.02.i968) #6, !dbg !21 + %.03.i970 = select i1 %.not1.i969, float %2276, float %2275, !dbg !21 + %2277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i971 = icmp eq i32 %2277, 0, !dbg !21 + %2278 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2279 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i972 = select i1 %.not2.i971, float %2279, float %2278, !dbg !21 + %2280 = fadd float %.04.i972, 0xC168000FE0000000, !dbg !21 + %2281 = fneg float %2280, !dbg !21 + %2282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i973 = icmp eq i32 %2282, 0, !dbg !21 + %2283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3FF7154760000000, float %2281) #6, !dbg !21 + %2284 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3FF7154760000000, float %2281) #6, !dbg !21 + %.0.i974 = select i1 %.not3.i973, float %2284, float %2283, !dbg !21 + %2285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i975 = icmp eq i32 %2285, 0, !dbg !21 + %2286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3E54AE0C00000000, float %.0.i974) #6, !dbg !21 + %2287 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3E54AE0C00000000, float %.0.i974) #6, !dbg !21 + %.01.i976 = select i1 %.not4.i975, float %2287, float %2286, !dbg !21 + %2288 = bitcast float %.04.i972 to i32, !dbg !21 + %2289 = shl i32 %2288, 23, !dbg !21 + %2290 = bitcast i32 %2289 to float, !dbg !21 + %2291 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i976) #6, !dbg !21 + %2292 = fmul float %2291, %2290, !dbg !21 + %2293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i977 = icmp eq i32 %2293, 0, !dbg !21 + %2294 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2295 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i978 = select i1 %.not.i977, float %2295, float %2294, !dbg !21 + %2296 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i979 = icmp eq i32 %2296, 0, !dbg !21 + %2297 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i978) #6, !dbg !21 + %2298 = tail call float @llvm.nvvm.saturate.f(float %.02.i978) #6, !dbg !21 + %.03.i980 = select i1 %.not1.i979, float %2298, float %2297, !dbg !21 + %2299 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i981 = icmp eq i32 %2299, 0, !dbg !21 + %2300 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2301 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i982 = select i1 %.not2.i981, float %2301, float %2300, !dbg !21 + %2302 = fadd float %.04.i982, 0xC168000FE0000000, !dbg !21 + %2303 = fneg float %2302, !dbg !21 + %2304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i983 = icmp eq i32 %2304, 0, !dbg !21 + %2305 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3FF7154760000000, float %2303) #6, !dbg !21 + %2306 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3FF7154760000000, float %2303) #6, !dbg !21 + %.0.i984 = select i1 %.not3.i983, float %2306, float %2305, !dbg !21 + %2307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i985 = icmp eq i32 %2307, 0, !dbg !21 + %2308 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3E54AE0C00000000, float %.0.i984) #6, !dbg !21 + %2309 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3E54AE0C00000000, float %.0.i984) #6, !dbg !21 + %.01.i986 = select i1 %.not4.i985, float %2309, float %2308, !dbg !21 + %2310 = bitcast float %.04.i982 to i32, !dbg !21 + %2311 = shl i32 %2310, 23, !dbg !21 + %2312 = bitcast i32 %2311 to float, !dbg !21 + %2313 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i986) #6, !dbg !21 + %2314 = fmul float %2313, %2312, !dbg !21 + %2315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i987 = icmp eq i32 %2315, 0, !dbg !21 + %2316 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2317 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i988 = select i1 %.not.i987, float %2317, float %2316, !dbg !21 + %2318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i989 = icmp eq i32 %2318, 0, !dbg !21 + %2319 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i988) #6, !dbg !21 + %2320 = tail call float @llvm.nvvm.saturate.f(float %.02.i988) #6, !dbg !21 + %.03.i990 = select i1 %.not1.i989, float %2320, float %2319, !dbg !21 + %2321 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i991 = icmp eq i32 %2321, 0, !dbg !21 + %2322 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2323 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i992 = select i1 %.not2.i991, float %2323, float %2322, !dbg !21 + %2324 = fadd float %.04.i992, 0xC168000FE0000000, !dbg !21 + %2325 = fneg float %2324, !dbg !21 + %2326 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i993 = icmp eq i32 %2326, 0, !dbg !21 + %2327 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3FF7154760000000, float %2325) #6, !dbg !21 + %2328 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3FF7154760000000, float %2325) #6, !dbg !21 + %.0.i994 = select i1 %.not3.i993, float %2328, float %2327, !dbg !21 + %2329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i995 = icmp eq i32 %2329, 0, !dbg !21 + %2330 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1768, float 0x3E54AE0C00000000, float %.0.i994) #6, !dbg !21 + %2331 = tail call float @llvm.nvvm.fma.rn.f(float %1768, float 0x3E54AE0C00000000, float %.0.i994) #6, !dbg !21 + %.01.i996 = select i1 %.not4.i995, float %2331, float %2330, !dbg !21 + %2332 = bitcast float %.04.i992 to i32, !dbg !21 + %2333 = shl i32 %2332, 23, !dbg !21 + %2334 = bitcast i32 %2333 to float, !dbg !21 + %2335 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i996) #6, !dbg !21 + %2336 = fmul float %2335, %2334, !dbg !21 + %2337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i997 = icmp eq i32 %2337, 0, !dbg !21 + %2338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2339 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i998 = select i1 %.not.i997, float %2339, float %2338, !dbg !21 + %2340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i999 = icmp eq i32 %2340, 0, !dbg !21 + %2341 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i998) #6, !dbg !21 + %2342 = tail call float @llvm.nvvm.saturate.f(float %.02.i998) #6, !dbg !21 + %.03.i1000 = select i1 %.not1.i999, float %2342, float %2341, !dbg !21 + %2343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1001 = icmp eq i32 %2343, 0, !dbg !21 + %2344 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2345 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1002 = select i1 %.not2.i1001, float %2345, float %2344, !dbg !21 + %2346 = fadd float %.04.i1002, 0xC168000FE0000000, !dbg !21 + %2347 = fneg float %2346, !dbg !21 + %2348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1003 = icmp eq i32 %2348, 0, !dbg !21 + %2349 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3FF7154760000000, float %2347) #6, !dbg !21 + %2350 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3FF7154760000000, float %2347) #6, !dbg !21 + %.0.i1004 = select i1 %.not3.i1003, float %2350, float %2349, !dbg !21 + %2351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1005 = icmp eq i32 %2351, 0, !dbg !21 + %2352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1769, float 0x3E54AE0C00000000, float %.0.i1004) #6, !dbg !21 + %2353 = tail call float @llvm.nvvm.fma.rn.f(float %1769, float 0x3E54AE0C00000000, float %.0.i1004) #6, !dbg !21 + %.01.i1006 = select i1 %.not4.i1005, float %2353, float %2352, !dbg !21 + %2354 = bitcast float %.04.i1002 to i32, !dbg !21 + %2355 = shl i32 %2354, 23, !dbg !21 + %2356 = bitcast i32 %2355 to float, !dbg !21 + %2357 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1006) #6, !dbg !21 + %2358 = fmul float %2357, %2356, !dbg !21 + %2359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1007 = icmp eq i32 %2359, 0, !dbg !21 + %2360 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2361 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1008 = select i1 %.not.i1007, float %2361, float %2360, !dbg !21 + %2362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1009 = icmp eq i32 %2362, 0, !dbg !21 + %2363 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1008) #6, !dbg !21 + %2364 = tail call float @llvm.nvvm.saturate.f(float %.02.i1008) #6, !dbg !21 + %.03.i1010 = select i1 %.not1.i1009, float %2364, float %2363, !dbg !21 + %2365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1011 = icmp eq i32 %2365, 0, !dbg !21 + %2366 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2367 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1012 = select i1 %.not2.i1011, float %2367, float %2366, !dbg !21 + %2368 = fadd float %.04.i1012, 0xC168000FE0000000, !dbg !21 + %2369 = fneg float %2368, !dbg !21 + %2370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1013 = icmp eq i32 %2370, 0, !dbg !21 + %2371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3FF7154760000000, float %2369) #6, !dbg !21 + %2372 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3FF7154760000000, float %2369) #6, !dbg !21 + %.0.i1014 = select i1 %.not3.i1013, float %2372, float %2371, !dbg !21 + %2373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1015 = icmp eq i32 %2373, 0, !dbg !21 + %2374 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1770, float 0x3E54AE0C00000000, float %.0.i1014) #6, !dbg !21 + %2375 = tail call float @llvm.nvvm.fma.rn.f(float %1770, float 0x3E54AE0C00000000, float %.0.i1014) #6, !dbg !21 + %.01.i1016 = select i1 %.not4.i1015, float %2375, float %2374, !dbg !21 + %2376 = bitcast float %.04.i1012 to i32, !dbg !21 + %2377 = shl i32 %2376, 23, !dbg !21 + %2378 = bitcast i32 %2377 to float, !dbg !21 + %2379 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1016) #6, !dbg !21 + %2380 = fmul float %2379, %2378, !dbg !21 + %2381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1017 = icmp eq i32 %2381, 0, !dbg !21 + %2382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2383 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1018 = select i1 %.not.i1017, float %2383, float %2382, !dbg !21 + %2384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1019 = icmp eq i32 %2384, 0, !dbg !21 + %2385 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1018) #6, !dbg !21 + %2386 = tail call float @llvm.nvvm.saturate.f(float %.02.i1018) #6, !dbg !21 + %.03.i1020 = select i1 %.not1.i1019, float %2386, float %2385, !dbg !21 + %2387 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1021 = icmp eq i32 %2387, 0, !dbg !21 + %2388 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2389 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1022 = select i1 %.not2.i1021, float %2389, float %2388, !dbg !21 + %2390 = fadd float %.04.i1022, 0xC168000FE0000000, !dbg !21 + %2391 = fneg float %2390, !dbg !21 + %2392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1023 = icmp eq i32 %2392, 0, !dbg !21 + %2393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3FF7154760000000, float %2391) #6, !dbg !21 + %2394 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3FF7154760000000, float %2391) #6, !dbg !21 + %.0.i1024 = select i1 %.not3.i1023, float %2394, float %2393, !dbg !21 + %2395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1025 = icmp eq i32 %2395, 0, !dbg !21 + %2396 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1771, float 0x3E54AE0C00000000, float %.0.i1024) #6, !dbg !21 + %2397 = tail call float @llvm.nvvm.fma.rn.f(float %1771, float 0x3E54AE0C00000000, float %.0.i1024) #6, !dbg !21 + %.01.i1026 = select i1 %.not4.i1025, float %2397, float %2396, !dbg !21 + %2398 = bitcast float %.04.i1022 to i32, !dbg !21 + %2399 = shl i32 %2398, 23, !dbg !21 + %2400 = bitcast i32 %2399 to float, !dbg !21 + %2401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1026) #6, !dbg !21 + %2402 = fmul float %2401, %2400, !dbg !21 + %2403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1027 = icmp eq i32 %2403, 0, !dbg !21 + %2404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2405 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1028 = select i1 %.not.i1027, float %2405, float %2404, !dbg !21 + %2406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1029 = icmp eq i32 %2406, 0, !dbg !21 + %2407 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1028) #6, !dbg !21 + %2408 = tail call float @llvm.nvvm.saturate.f(float %.02.i1028) #6, !dbg !21 + %.03.i1030 = select i1 %.not1.i1029, float %2408, float %2407, !dbg !21 + %2409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1031 = icmp eq i32 %2409, 0, !dbg !21 + %2410 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2411 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1032 = select i1 %.not2.i1031, float %2411, float %2410, !dbg !21 + %2412 = fadd float %.04.i1032, 0xC168000FE0000000, !dbg !21 + %2413 = fneg float %2412, !dbg !21 + %2414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1033 = icmp eq i32 %2414, 0, !dbg !21 + %2415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3FF7154760000000, float %2413) #6, !dbg !21 + %2416 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3FF7154760000000, float %2413) #6, !dbg !21 + %.0.i1034 = select i1 %.not3.i1033, float %2416, float %2415, !dbg !21 + %2417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1035 = icmp eq i32 %2417, 0, !dbg !21 + %2418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1772, float 0x3E54AE0C00000000, float %.0.i1034) #6, !dbg !21 + %2419 = tail call float @llvm.nvvm.fma.rn.f(float %1772, float 0x3E54AE0C00000000, float %.0.i1034) #6, !dbg !21 + %.01.i1036 = select i1 %.not4.i1035, float %2419, float %2418, !dbg !21 + %2420 = bitcast float %.04.i1032 to i32, !dbg !21 + %2421 = shl i32 %2420, 23, !dbg !21 + %2422 = bitcast i32 %2421 to float, !dbg !21 + %2423 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1036) #6, !dbg !21 + %2424 = fmul float %2423, %2422, !dbg !21 + %2425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1037 = icmp eq i32 %2425, 0, !dbg !21 + %2426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2427 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1038 = select i1 %.not.i1037, float %2427, float %2426, !dbg !21 + %2428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1039 = icmp eq i32 %2428, 0, !dbg !21 + %2429 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1038) #6, !dbg !21 + %2430 = tail call float @llvm.nvvm.saturate.f(float %.02.i1038) #6, !dbg !21 + %.03.i1040 = select i1 %.not1.i1039, float %2430, float %2429, !dbg !21 + %2431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1041 = icmp eq i32 %2431, 0, !dbg !21 + %2432 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2433 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1042 = select i1 %.not2.i1041, float %2433, float %2432, !dbg !21 + %2434 = fadd float %.04.i1042, 0xC168000FE0000000, !dbg !21 + %2435 = fneg float %2434, !dbg !21 + %2436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1043 = icmp eq i32 %2436, 0, !dbg !21 + %2437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3FF7154760000000, float %2435) #6, !dbg !21 + %2438 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3FF7154760000000, float %2435) #6, !dbg !21 + %.0.i1044 = select i1 %.not3.i1043, float %2438, float %2437, !dbg !21 + %2439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1045 = icmp eq i32 %2439, 0, !dbg !21 + %2440 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1773, float 0x3E54AE0C00000000, float %.0.i1044) #6, !dbg !21 + %2441 = tail call float @llvm.nvvm.fma.rn.f(float %1773, float 0x3E54AE0C00000000, float %.0.i1044) #6, !dbg !21 + %.01.i1046 = select i1 %.not4.i1045, float %2441, float %2440, !dbg !21 + %2442 = bitcast float %.04.i1042 to i32, !dbg !21 + %2443 = shl i32 %2442, 23, !dbg !21 + %2444 = bitcast i32 %2443 to float, !dbg !21 + %2445 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1046) #6, !dbg !21 + %2446 = fmul float %2445, %2444, !dbg !21 + %2447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1047 = icmp eq i32 %2447, 0, !dbg !21 + %2448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2449 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1048 = select i1 %.not.i1047, float %2449, float %2448, !dbg !21 + %2450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1049 = icmp eq i32 %2450, 0, !dbg !21 + %2451 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1048) #6, !dbg !21 + %2452 = tail call float @llvm.nvvm.saturate.f(float %.02.i1048) #6, !dbg !21 + %.03.i1050 = select i1 %.not1.i1049, float %2452, float %2451, !dbg !21 + %2453 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1051 = icmp eq i32 %2453, 0, !dbg !21 + %2454 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2455 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1052 = select i1 %.not2.i1051, float %2455, float %2454, !dbg !21 + %2456 = fadd float %.04.i1052, 0xC168000FE0000000, !dbg !21 + %2457 = fneg float %2456, !dbg !21 + %2458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1053 = icmp eq i32 %2458, 0, !dbg !21 + %2459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3FF7154760000000, float %2457) #6, !dbg !21 + %2460 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3FF7154760000000, float %2457) #6, !dbg !21 + %.0.i1054 = select i1 %.not3.i1053, float %2460, float %2459, !dbg !21 + %2461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1055 = icmp eq i32 %2461, 0, !dbg !21 + %2462 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1774, float 0x3E54AE0C00000000, float %.0.i1054) #6, !dbg !21 + %2463 = tail call float @llvm.nvvm.fma.rn.f(float %1774, float 0x3E54AE0C00000000, float %.0.i1054) #6, !dbg !21 + %.01.i1056 = select i1 %.not4.i1055, float %2463, float %2462, !dbg !21 + %2464 = bitcast float %.04.i1052 to i32, !dbg !21 + %2465 = shl i32 %2464, 23, !dbg !21 + %2466 = bitcast i32 %2465 to float, !dbg !21 + %2467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1056) #6, !dbg !21 + %2468 = fmul float %2467, %2466, !dbg !21 + %2469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1057 = icmp eq i32 %2469, 0, !dbg !21 + %2470 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2471 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1058 = select i1 %.not.i1057, float %2471, float %2470, !dbg !21 + %2472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1059 = icmp eq i32 %2472, 0, !dbg !21 + %2473 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1058) #6, !dbg !21 + %2474 = tail call float @llvm.nvvm.saturate.f(float %.02.i1058) #6, !dbg !21 + %.03.i1060 = select i1 %.not1.i1059, float %2474, float %2473, !dbg !21 + %2475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1061 = icmp eq i32 %2475, 0, !dbg !21 + %2476 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2477 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1062 = select i1 %.not2.i1061, float %2477, float %2476, !dbg !21 + %2478 = fadd float %.04.i1062, 0xC168000FE0000000, !dbg !21 + %2479 = fneg float %2478, !dbg !21 + %2480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1063 = icmp eq i32 %2480, 0, !dbg !21 + %2481 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3FF7154760000000, float %2479) #6, !dbg !21 + %2482 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3FF7154760000000, float %2479) #6, !dbg !21 + %.0.i1064 = select i1 %.not3.i1063, float %2482, float %2481, !dbg !21 + %2483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1065 = icmp eq i32 %2483, 0, !dbg !21 + %2484 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1775, float 0x3E54AE0C00000000, float %.0.i1064) #6, !dbg !21 + %2485 = tail call float @llvm.nvvm.fma.rn.f(float %1775, float 0x3E54AE0C00000000, float %.0.i1064) #6, !dbg !21 + %.01.i1066 = select i1 %.not4.i1065, float %2485, float %2484, !dbg !21 + %2486 = bitcast float %.04.i1062 to i32, !dbg !21 + %2487 = shl i32 %2486, 23, !dbg !21 + %2488 = bitcast i32 %2487 to float, !dbg !21 + %2489 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1066) #6, !dbg !21 + %2490 = fmul float %2489, %2488, !dbg !21 + %2491 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1067 = icmp eq i32 %2491, 0, !dbg !21 + %2492 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2493 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1068 = select i1 %.not.i1067, float %2493, float %2492, !dbg !21 + %2494 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1069 = icmp eq i32 %2494, 0, !dbg !21 + %2495 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1068) #6, !dbg !21 + %2496 = tail call float @llvm.nvvm.saturate.f(float %.02.i1068) #6, !dbg !21 + %.03.i1070 = select i1 %.not1.i1069, float %2496, float %2495, !dbg !21 + %2497 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1071 = icmp eq i32 %2497, 0, !dbg !21 + %2498 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2499 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1072 = select i1 %.not2.i1071, float %2499, float %2498, !dbg !21 + %2500 = fadd float %.04.i1072, 0xC168000FE0000000, !dbg !21 + %2501 = fneg float %2500, !dbg !21 + %2502 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1073 = icmp eq i32 %2502, 0, !dbg !21 + %2503 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3FF7154760000000, float %2501) #6, !dbg !21 + %2504 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3FF7154760000000, float %2501) #6, !dbg !21 + %.0.i1074 = select i1 %.not3.i1073, float %2504, float %2503, !dbg !21 + %2505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1075 = icmp eq i32 %2505, 0, !dbg !21 + %2506 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1776, float 0x3E54AE0C00000000, float %.0.i1074) #6, !dbg !21 + %2507 = tail call float @llvm.nvvm.fma.rn.f(float %1776, float 0x3E54AE0C00000000, float %.0.i1074) #6, !dbg !21 + %.01.i1076 = select i1 %.not4.i1075, float %2507, float %2506, !dbg !21 + %2508 = bitcast float %.04.i1072 to i32, !dbg !21 + %2509 = shl i32 %2508, 23, !dbg !21 + %2510 = bitcast i32 %2509 to float, !dbg !21 + %2511 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1076) #6, !dbg !21 + %2512 = fmul float %2511, %2510, !dbg !21 + %2513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1077 = icmp eq i32 %2513, 0, !dbg !21 + %2514 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2515 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1078 = select i1 %.not.i1077, float %2515, float %2514, !dbg !21 + %2516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1079 = icmp eq i32 %2516, 0, !dbg !21 + %2517 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1078) #6, !dbg !21 + %2518 = tail call float @llvm.nvvm.saturate.f(float %.02.i1078) #6, !dbg !21 + %.03.i1080 = select i1 %.not1.i1079, float %2518, float %2517, !dbg !21 + %2519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1081 = icmp eq i32 %2519, 0, !dbg !21 + %2520 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2521 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1082 = select i1 %.not2.i1081, float %2521, float %2520, !dbg !21 + %2522 = fadd float %.04.i1082, 0xC168000FE0000000, !dbg !21 + %2523 = fneg float %2522, !dbg !21 + %2524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1083 = icmp eq i32 %2524, 0, !dbg !21 + %2525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3FF7154760000000, float %2523) #6, !dbg !21 + %2526 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3FF7154760000000, float %2523) #6, !dbg !21 + %.0.i1084 = select i1 %.not3.i1083, float %2526, float %2525, !dbg !21 + %2527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1085 = icmp eq i32 %2527, 0, !dbg !21 + %2528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1777, float 0x3E54AE0C00000000, float %.0.i1084) #6, !dbg !21 + %2529 = tail call float @llvm.nvvm.fma.rn.f(float %1777, float 0x3E54AE0C00000000, float %.0.i1084) #6, !dbg !21 + %.01.i1086 = select i1 %.not4.i1085, float %2529, float %2528, !dbg !21 + %2530 = bitcast float %.04.i1082 to i32, !dbg !21 + %2531 = shl i32 %2530, 23, !dbg !21 + %2532 = bitcast i32 %2531 to float, !dbg !21 + %2533 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1086) #6, !dbg !21 + %2534 = fmul float %2533, %2532, !dbg !21 + %2535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1087 = icmp eq i32 %2535, 0, !dbg !21 + %2536 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2537 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1088 = select i1 %.not.i1087, float %2537, float %2536, !dbg !21 + %2538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1089 = icmp eq i32 %2538, 0, !dbg !21 + %2539 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1088) #6, !dbg !21 + %2540 = tail call float @llvm.nvvm.saturate.f(float %.02.i1088) #6, !dbg !21 + %.03.i1090 = select i1 %.not1.i1089, float %2540, float %2539, !dbg !21 + %2541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1091 = icmp eq i32 %2541, 0, !dbg !21 + %2542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1092 = select i1 %.not2.i1091, float %2543, float %2542, !dbg !21 + %2544 = fadd float %.04.i1092, 0xC168000FE0000000, !dbg !21 + %2545 = fneg float %2544, !dbg !21 + %2546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1093 = icmp eq i32 %2546, 0, !dbg !21 + %2547 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3FF7154760000000, float %2545) #6, !dbg !21 + %2548 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3FF7154760000000, float %2545) #6, !dbg !21 + %.0.i1094 = select i1 %.not3.i1093, float %2548, float %2547, !dbg !21 + %2549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1095 = icmp eq i32 %2549, 0, !dbg !21 + %2550 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1778, float 0x3E54AE0C00000000, float %.0.i1094) #6, !dbg !21 + %2551 = tail call float @llvm.nvvm.fma.rn.f(float %1778, float 0x3E54AE0C00000000, float %.0.i1094) #6, !dbg !21 + %.01.i1096 = select i1 %.not4.i1095, float %2551, float %2550, !dbg !21 + %2552 = bitcast float %.04.i1092 to i32, !dbg !21 + %2553 = shl i32 %2552, 23, !dbg !21 + %2554 = bitcast i32 %2553 to float, !dbg !21 + %2555 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1096) #6, !dbg !21 + %2556 = fmul float %2555, %2554, !dbg !21 + %2557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1097 = icmp eq i32 %2557, 0, !dbg !21 + %2558 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2559 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1098 = select i1 %.not.i1097, float %2559, float %2558, !dbg !21 + %2560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1099 = icmp eq i32 %2560, 0, !dbg !21 + %2561 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1098) #6, !dbg !21 + %2562 = tail call float @llvm.nvvm.saturate.f(float %.02.i1098) #6, !dbg !21 + %.03.i1100 = select i1 %.not1.i1099, float %2562, float %2561, !dbg !21 + %2563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1101 = icmp eq i32 %2563, 0, !dbg !21 + %2564 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2565 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1102 = select i1 %.not2.i1101, float %2565, float %2564, !dbg !21 + %2566 = fadd float %.04.i1102, 0xC168000FE0000000, !dbg !21 + %2567 = fneg float %2566, !dbg !21 + %2568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1103 = icmp eq i32 %2568, 0, !dbg !21 + %2569 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3FF7154760000000, float %2567) #6, !dbg !21 + %2570 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3FF7154760000000, float %2567) #6, !dbg !21 + %.0.i1104 = select i1 %.not3.i1103, float %2570, float %2569, !dbg !21 + %2571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1105 = icmp eq i32 %2571, 0, !dbg !21 + %2572 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1779, float 0x3E54AE0C00000000, float %.0.i1104) #6, !dbg !21 + %2573 = tail call float @llvm.nvvm.fma.rn.f(float %1779, float 0x3E54AE0C00000000, float %.0.i1104) #6, !dbg !21 + %.01.i1106 = select i1 %.not4.i1105, float %2573, float %2572, !dbg !21 + %2574 = bitcast float %.04.i1102 to i32, !dbg !21 + %2575 = shl i32 %2574, 23, !dbg !21 + %2576 = bitcast i32 %2575 to float, !dbg !21 + %2577 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1106) #6, !dbg !21 + %2578 = fmul float %2577, %2576, !dbg !21 + %2579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1107 = icmp eq i32 %2579, 0, !dbg !21 + %2580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2581 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1108 = select i1 %.not.i1107, float %2581, float %2580, !dbg !21 + %2582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1109 = icmp eq i32 %2582, 0, !dbg !21 + %2583 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1108) #6, !dbg !21 + %2584 = tail call float @llvm.nvvm.saturate.f(float %.02.i1108) #6, !dbg !21 + %.03.i1110 = select i1 %.not1.i1109, float %2584, float %2583, !dbg !21 + %2585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1111 = icmp eq i32 %2585, 0, !dbg !21 + %2586 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2587 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1112 = select i1 %.not2.i1111, float %2587, float %2586, !dbg !21 + %2588 = fadd float %.04.i1112, 0xC168000FE0000000, !dbg !21 + %2589 = fneg float %2588, !dbg !21 + %2590 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1113 = icmp eq i32 %2590, 0, !dbg !21 + %2591 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3FF7154760000000, float %2589) #6, !dbg !21 + %2592 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3FF7154760000000, float %2589) #6, !dbg !21 + %.0.i1114 = select i1 %.not3.i1113, float %2592, float %2591, !dbg !21 + %2593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1115 = icmp eq i32 %2593, 0, !dbg !21 + %2594 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1780, float 0x3E54AE0C00000000, float %.0.i1114) #6, !dbg !21 + %2595 = tail call float @llvm.nvvm.fma.rn.f(float %1780, float 0x3E54AE0C00000000, float %.0.i1114) #6, !dbg !21 + %.01.i1116 = select i1 %.not4.i1115, float %2595, float %2594, !dbg !21 + %2596 = bitcast float %.04.i1112 to i32, !dbg !21 + %2597 = shl i32 %2596, 23, !dbg !21 + %2598 = bitcast i32 %2597 to float, !dbg !21 + %2599 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1116) #6, !dbg !21 + %2600 = fmul float %2599, %2598, !dbg !21 + %2601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1117 = icmp eq i32 %2601, 0, !dbg !21 + %2602 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2603 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1118 = select i1 %.not.i1117, float %2603, float %2602, !dbg !21 + %2604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1119 = icmp eq i32 %2604, 0, !dbg !21 + %2605 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1118) #6, !dbg !21 + %2606 = tail call float @llvm.nvvm.saturate.f(float %.02.i1118) #6, !dbg !21 + %.03.i1120 = select i1 %.not1.i1119, float %2606, float %2605, !dbg !21 + %2607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1121 = icmp eq i32 %2607, 0, !dbg !21 + %2608 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2609 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1122 = select i1 %.not2.i1121, float %2609, float %2608, !dbg !21 + %2610 = fadd float %.04.i1122, 0xC168000FE0000000, !dbg !21 + %2611 = fneg float %2610, !dbg !21 + %2612 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1123 = icmp eq i32 %2612, 0, !dbg !21 + %2613 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3FF7154760000000, float %2611) #6, !dbg !21 + %2614 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3FF7154760000000, float %2611) #6, !dbg !21 + %.0.i1124 = select i1 %.not3.i1123, float %2614, float %2613, !dbg !21 + %2615 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1125 = icmp eq i32 %2615, 0, !dbg !21 + %2616 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1781, float 0x3E54AE0C00000000, float %.0.i1124) #6, !dbg !21 + %2617 = tail call float @llvm.nvvm.fma.rn.f(float %1781, float 0x3E54AE0C00000000, float %.0.i1124) #6, !dbg !21 + %.01.i1126 = select i1 %.not4.i1125, float %2617, float %2616, !dbg !21 + %2618 = bitcast float %.04.i1122 to i32, !dbg !21 + %2619 = shl i32 %2618, 23, !dbg !21 + %2620 = bitcast i32 %2619 to float, !dbg !21 + %2621 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1126) #6, !dbg !21 + %2622 = fmul float %2621, %2620, !dbg !21 + %2623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1127 = icmp eq i32 %2623, 0, !dbg !21 + %2624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2625 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1128 = select i1 %.not.i1127, float %2625, float %2624, !dbg !21 + %2626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1129 = icmp eq i32 %2626, 0, !dbg !21 + %2627 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1128) #6, !dbg !21 + %2628 = tail call float @llvm.nvvm.saturate.f(float %.02.i1128) #6, !dbg !21 + %.03.i1130 = select i1 %.not1.i1129, float %2628, float %2627, !dbg !21 + %2629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1131 = icmp eq i32 %2629, 0, !dbg !21 + %2630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1132 = select i1 %.not2.i1131, float %2631, float %2630, !dbg !21 + %2632 = fadd float %.04.i1132, 0xC168000FE0000000, !dbg !21 + %2633 = fneg float %2632, !dbg !21 + %2634 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1133 = icmp eq i32 %2634, 0, !dbg !21 + %2635 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3FF7154760000000, float %2633) #6, !dbg !21 + %2636 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3FF7154760000000, float %2633) #6, !dbg !21 + %.0.i1134 = select i1 %.not3.i1133, float %2636, float %2635, !dbg !21 + %2637 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1135 = icmp eq i32 %2637, 0, !dbg !21 + %2638 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1782, float 0x3E54AE0C00000000, float %.0.i1134) #6, !dbg !21 + %2639 = tail call float @llvm.nvvm.fma.rn.f(float %1782, float 0x3E54AE0C00000000, float %.0.i1134) #6, !dbg !21 + %.01.i1136 = select i1 %.not4.i1135, float %2639, float %2638, !dbg !21 + %2640 = bitcast float %.04.i1132 to i32, !dbg !21 + %2641 = shl i32 %2640, 23, !dbg !21 + %2642 = bitcast i32 %2641 to float, !dbg !21 + %2643 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1136) #6, !dbg !21 + %2644 = fmul float %2643, %2642, !dbg !21 + %2645 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1137 = icmp eq i32 %2645, 0, !dbg !21 + %2646 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2647 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1138 = select i1 %.not.i1137, float %2647, float %2646, !dbg !21 + %2648 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1139 = icmp eq i32 %2648, 0, !dbg !21 + %2649 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1138) #6, !dbg !21 + %2650 = tail call float @llvm.nvvm.saturate.f(float %.02.i1138) #6, !dbg !21 + %.03.i1140 = select i1 %.not1.i1139, float %2650, float %2649, !dbg !21 + %2651 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1141 = icmp eq i32 %2651, 0, !dbg !21 + %2652 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2653 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1142 = select i1 %.not2.i1141, float %2653, float %2652, !dbg !21 + %2654 = fadd float %.04.i1142, 0xC168000FE0000000, !dbg !21 + %2655 = fneg float %2654, !dbg !21 + %2656 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1143 = icmp eq i32 %2656, 0, !dbg !21 + %2657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3FF7154760000000, float %2655) #6, !dbg !21 + %2658 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3FF7154760000000, float %2655) #6, !dbg !21 + %.0.i1144 = select i1 %.not3.i1143, float %2658, float %2657, !dbg !21 + %2659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1145 = icmp eq i32 %2659, 0, !dbg !21 + %2660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1783, float 0x3E54AE0C00000000, float %.0.i1144) #6, !dbg !21 + %2661 = tail call float @llvm.nvvm.fma.rn.f(float %1783, float 0x3E54AE0C00000000, float %.0.i1144) #6, !dbg !21 + %.01.i1146 = select i1 %.not4.i1145, float %2661, float %2660, !dbg !21 + %2662 = bitcast float %.04.i1142 to i32, !dbg !21 + %2663 = shl i32 %2662, 23, !dbg !21 + %2664 = bitcast i32 %2663 to float, !dbg !21 + %2665 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1146) #6, !dbg !21 + %2666 = fmul float %2665, %2664, !dbg !21 + %2667 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1147 = icmp eq i32 %2667, 0, !dbg !21 + %2668 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2669 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1148 = select i1 %.not.i1147, float %2669, float %2668, !dbg !21 + %2670 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1149 = icmp eq i32 %2670, 0, !dbg !21 + %2671 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1148) #6, !dbg !21 + %2672 = tail call float @llvm.nvvm.saturate.f(float %.02.i1148) #6, !dbg !21 + %.03.i1150 = select i1 %.not1.i1149, float %2672, float %2671, !dbg !21 + %2673 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1151 = icmp eq i32 %2673, 0, !dbg !21 + %2674 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2675 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1152 = select i1 %.not2.i1151, float %2675, float %2674, !dbg !21 + %2676 = fadd float %.04.i1152, 0xC168000FE0000000, !dbg !21 + %2677 = fneg float %2676, !dbg !21 + %2678 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1153 = icmp eq i32 %2678, 0, !dbg !21 + %2679 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3FF7154760000000, float %2677) #6, !dbg !21 + %2680 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3FF7154760000000, float %2677) #6, !dbg !21 + %.0.i1154 = select i1 %.not3.i1153, float %2680, float %2679, !dbg !21 + %2681 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1155 = icmp eq i32 %2681, 0, !dbg !21 + %2682 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1784, float 0x3E54AE0C00000000, float %.0.i1154) #6, !dbg !21 + %2683 = tail call float @llvm.nvvm.fma.rn.f(float %1784, float 0x3E54AE0C00000000, float %.0.i1154) #6, !dbg !21 + %.01.i1156 = select i1 %.not4.i1155, float %2683, float %2682, !dbg !21 + %2684 = bitcast float %.04.i1152 to i32, !dbg !21 + %2685 = shl i32 %2684, 23, !dbg !21 + %2686 = bitcast i32 %2685 to float, !dbg !21 + %2687 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1156) #6, !dbg !21 + %2688 = fmul float %2687, %2686, !dbg !21 + %2689 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1157 = icmp eq i32 %2689, 0, !dbg !21 + %2690 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2691 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1158 = select i1 %.not.i1157, float %2691, float %2690, !dbg !21 + %2692 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1159 = icmp eq i32 %2692, 0, !dbg !21 + %2693 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1158) #6, !dbg !21 + %2694 = tail call float @llvm.nvvm.saturate.f(float %.02.i1158) #6, !dbg !21 + %.03.i1160 = select i1 %.not1.i1159, float %2694, float %2693, !dbg !21 + %2695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1161 = icmp eq i32 %2695, 0, !dbg !21 + %2696 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2697 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1162 = select i1 %.not2.i1161, float %2697, float %2696, !dbg !21 + %2698 = fadd float %.04.i1162, 0xC168000FE0000000, !dbg !21 + %2699 = fneg float %2698, !dbg !21 + %2700 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1163 = icmp eq i32 %2700, 0, !dbg !21 + %2701 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3FF7154760000000, float %2699) #6, !dbg !21 + %2702 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3FF7154760000000, float %2699) #6, !dbg !21 + %.0.i1164 = select i1 %.not3.i1163, float %2702, float %2701, !dbg !21 + %2703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1165 = icmp eq i32 %2703, 0, !dbg !21 + %2704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1785, float 0x3E54AE0C00000000, float %.0.i1164) #6, !dbg !21 + %2705 = tail call float @llvm.nvvm.fma.rn.f(float %1785, float 0x3E54AE0C00000000, float %.0.i1164) #6, !dbg !21 + %.01.i1166 = select i1 %.not4.i1165, float %2705, float %2704, !dbg !21 + %2706 = bitcast float %.04.i1162 to i32, !dbg !21 + %2707 = shl i32 %2706, 23, !dbg !21 + %2708 = bitcast i32 %2707 to float, !dbg !21 + %2709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1166) #6, !dbg !21 + %2710 = fmul float %2709, %2708, !dbg !21 + %2711 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1167 = icmp eq i32 %2711, 0, !dbg !21 + %2712 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2713 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1168 = select i1 %.not.i1167, float %2713, float %2712, !dbg !21 + %2714 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1169 = icmp eq i32 %2714, 0, !dbg !21 + %2715 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1168) #6, !dbg !21 + %2716 = tail call float @llvm.nvvm.saturate.f(float %.02.i1168) #6, !dbg !21 + %.03.i1170 = select i1 %.not1.i1169, float %2716, float %2715, !dbg !21 + %2717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1171 = icmp eq i32 %2717, 0, !dbg !21 + %2718 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2719 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1172 = select i1 %.not2.i1171, float %2719, float %2718, !dbg !21 + %2720 = fadd float %.04.i1172, 0xC168000FE0000000, !dbg !21 + %2721 = fneg float %2720, !dbg !21 + %2722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1173 = icmp eq i32 %2722, 0, !dbg !21 + %2723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3FF7154760000000, float %2721) #6, !dbg !21 + %2724 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3FF7154760000000, float %2721) #6, !dbg !21 + %.0.i1174 = select i1 %.not3.i1173, float %2724, float %2723, !dbg !21 + %2725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1175 = icmp eq i32 %2725, 0, !dbg !21 + %2726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1786, float 0x3E54AE0C00000000, float %.0.i1174) #6, !dbg !21 + %2727 = tail call float @llvm.nvvm.fma.rn.f(float %1786, float 0x3E54AE0C00000000, float %.0.i1174) #6, !dbg !21 + %.01.i1176 = select i1 %.not4.i1175, float %2727, float %2726, !dbg !21 + %2728 = bitcast float %.04.i1172 to i32, !dbg !21 + %2729 = shl i32 %2728, 23, !dbg !21 + %2730 = bitcast i32 %2729 to float, !dbg !21 + %2731 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1176) #6, !dbg !21 + %2732 = fmul float %2731, %2730, !dbg !21 + %2733 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1177 = icmp eq i32 %2733, 0, !dbg !21 + %2734 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2735 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1178 = select i1 %.not.i1177, float %2735, float %2734, !dbg !21 + %2736 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1179 = icmp eq i32 %2736, 0, !dbg !21 + %2737 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1178) #6, !dbg !21 + %2738 = tail call float @llvm.nvvm.saturate.f(float %.02.i1178) #6, !dbg !21 + %.03.i1180 = select i1 %.not1.i1179, float %2738, float %2737, !dbg !21 + %2739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1181 = icmp eq i32 %2739, 0, !dbg !21 + %2740 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2741 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1182 = select i1 %.not2.i1181, float %2741, float %2740, !dbg !21 + %2742 = fadd float %.04.i1182, 0xC168000FE0000000, !dbg !21 + %2743 = fneg float %2742, !dbg !21 + %2744 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1183 = icmp eq i32 %2744, 0, !dbg !21 + %2745 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3FF7154760000000, float %2743) #6, !dbg !21 + %2746 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3FF7154760000000, float %2743) #6, !dbg !21 + %.0.i1184 = select i1 %.not3.i1183, float %2746, float %2745, !dbg !21 + %2747 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1185 = icmp eq i32 %2747, 0, !dbg !21 + %2748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1787, float 0x3E54AE0C00000000, float %.0.i1184) #6, !dbg !21 + %2749 = tail call float @llvm.nvvm.fma.rn.f(float %1787, float 0x3E54AE0C00000000, float %.0.i1184) #6, !dbg !21 + %.01.i1186 = select i1 %.not4.i1185, float %2749, float %2748, !dbg !21 + %2750 = bitcast float %.04.i1182 to i32, !dbg !21 + %2751 = shl i32 %2750, 23, !dbg !21 + %2752 = bitcast i32 %2751 to float, !dbg !21 + %2753 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1186) #6, !dbg !21 + %2754 = fmul float %2753, %2752, !dbg !21 + %2755 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1187 = icmp eq i32 %2755, 0, !dbg !21 + %2756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2757 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1188 = select i1 %.not.i1187, float %2757, float %2756, !dbg !21 + %2758 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1189 = icmp eq i32 %2758, 0, !dbg !21 + %2759 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1188) #6, !dbg !21 + %2760 = tail call float @llvm.nvvm.saturate.f(float %.02.i1188) #6, !dbg !21 + %.03.i1190 = select i1 %.not1.i1189, float %2760, float %2759, !dbg !21 + %2761 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1191 = icmp eq i32 %2761, 0, !dbg !21 + %2762 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2763 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1192 = select i1 %.not2.i1191, float %2763, float %2762, !dbg !21 + %2764 = fadd float %.04.i1192, 0xC168000FE0000000, !dbg !21 + %2765 = fneg float %2764, !dbg !21 + %2766 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1193 = icmp eq i32 %2766, 0, !dbg !21 + %2767 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3FF7154760000000, float %2765) #6, !dbg !21 + %2768 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3FF7154760000000, float %2765) #6, !dbg !21 + %.0.i1194 = select i1 %.not3.i1193, float %2768, float %2767, !dbg !21 + %2769 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1195 = icmp eq i32 %2769, 0, !dbg !21 + %2770 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1788, float 0x3E54AE0C00000000, float %.0.i1194) #6, !dbg !21 + %2771 = tail call float @llvm.nvvm.fma.rn.f(float %1788, float 0x3E54AE0C00000000, float %.0.i1194) #6, !dbg !21 + %.01.i1196 = select i1 %.not4.i1195, float %2771, float %2770, !dbg !21 + %2772 = bitcast float %.04.i1192 to i32, !dbg !21 + %2773 = shl i32 %2772, 23, !dbg !21 + %2774 = bitcast i32 %2773 to float, !dbg !21 + %2775 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1196) #6, !dbg !21 + %2776 = fmul float %2775, %2774, !dbg !21 + %2777 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1197 = icmp eq i32 %2777, 0, !dbg !21 + %2778 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2779 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1198 = select i1 %.not.i1197, float %2779, float %2778, !dbg !21 + %2780 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1199 = icmp eq i32 %2780, 0, !dbg !21 + %2781 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1198) #6, !dbg !21 + %2782 = tail call float @llvm.nvvm.saturate.f(float %.02.i1198) #6, !dbg !21 + %.03.i1200 = select i1 %.not1.i1199, float %2782, float %2781, !dbg !21 + %2783 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1201 = icmp eq i32 %2783, 0, !dbg !21 + %2784 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2785 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1202 = select i1 %.not2.i1201, float %2785, float %2784, !dbg !21 + %2786 = fadd float %.04.i1202, 0xC168000FE0000000, !dbg !21 + %2787 = fneg float %2786, !dbg !21 + %2788 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1203 = icmp eq i32 %2788, 0, !dbg !21 + %2789 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3FF7154760000000, float %2787) #6, !dbg !21 + %2790 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3FF7154760000000, float %2787) #6, !dbg !21 + %.0.i1204 = select i1 %.not3.i1203, float %2790, float %2789, !dbg !21 + %2791 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1205 = icmp eq i32 %2791, 0, !dbg !21 + %2792 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1789, float 0x3E54AE0C00000000, float %.0.i1204) #6, !dbg !21 + %2793 = tail call float @llvm.nvvm.fma.rn.f(float %1789, float 0x3E54AE0C00000000, float %.0.i1204) #6, !dbg !21 + %.01.i1206 = select i1 %.not4.i1205, float %2793, float %2792, !dbg !21 + %2794 = bitcast float %.04.i1202 to i32, !dbg !21 + %2795 = shl i32 %2794, 23, !dbg !21 + %2796 = bitcast i32 %2795 to float, !dbg !21 + %2797 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1206) #6, !dbg !21 + %2798 = fmul float %2797, %2796, !dbg !21 + %2799 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1207 = icmp eq i32 %2799, 0, !dbg !21 + %2800 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2801 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1208 = select i1 %.not.i1207, float %2801, float %2800, !dbg !21 + %2802 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1209 = icmp eq i32 %2802, 0, !dbg !21 + %2803 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1208) #6, !dbg !21 + %2804 = tail call float @llvm.nvvm.saturate.f(float %.02.i1208) #6, !dbg !21 + %.03.i1210 = select i1 %.not1.i1209, float %2804, float %2803, !dbg !21 + %2805 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1211 = icmp eq i32 %2805, 0, !dbg !21 + %2806 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2807 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1212 = select i1 %.not2.i1211, float %2807, float %2806, !dbg !21 + %2808 = fadd float %.04.i1212, 0xC168000FE0000000, !dbg !21 + %2809 = fneg float %2808, !dbg !21 + %2810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1213 = icmp eq i32 %2810, 0, !dbg !21 + %2811 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3FF7154760000000, float %2809) #6, !dbg !21 + %2812 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3FF7154760000000, float %2809) #6, !dbg !21 + %.0.i1214 = select i1 %.not3.i1213, float %2812, float %2811, !dbg !21 + %2813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1215 = icmp eq i32 %2813, 0, !dbg !21 + %2814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1790, float 0x3E54AE0C00000000, float %.0.i1214) #6, !dbg !21 + %2815 = tail call float @llvm.nvvm.fma.rn.f(float %1790, float 0x3E54AE0C00000000, float %.0.i1214) #6, !dbg !21 + %.01.i1216 = select i1 %.not4.i1215, float %2815, float %2814, !dbg !21 + %2816 = bitcast float %.04.i1212 to i32, !dbg !21 + %2817 = shl i32 %2816, 23, !dbg !21 + %2818 = bitcast i32 %2817 to float, !dbg !21 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1216) #6, !dbg !21 + %2820 = fmul float %2819, %2818, !dbg !21 + %2821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1217 = icmp eq i32 %2821, 0, !dbg !21 + %2822 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2823 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1218 = select i1 %.not.i1217, float %2823, float %2822, !dbg !21 + %2824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1219 = icmp eq i32 %2824, 0, !dbg !21 + %2825 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1218) #6, !dbg !21 + %2826 = tail call float @llvm.nvvm.saturate.f(float %.02.i1218) #6, !dbg !21 + %.03.i1220 = select i1 %.not1.i1219, float %2826, float %2825, !dbg !21 + %2827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1221 = icmp eq i32 %2827, 0, !dbg !21 + %2828 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2829 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1222 = select i1 %.not2.i1221, float %2829, float %2828, !dbg !21 + %2830 = fadd float %.04.i1222, 0xC168000FE0000000, !dbg !21 + %2831 = fneg float %2830, !dbg !21 + %2832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1223 = icmp eq i32 %2832, 0, !dbg !21 + %2833 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3FF7154760000000, float %2831) #6, !dbg !21 + %2834 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3FF7154760000000, float %2831) #6, !dbg !21 + %.0.i1224 = select i1 %.not3.i1223, float %2834, float %2833, !dbg !21 + %2835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1225 = icmp eq i32 %2835, 0, !dbg !21 + %2836 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1791, float 0x3E54AE0C00000000, float %.0.i1224) #6, !dbg !21 + %2837 = tail call float @llvm.nvvm.fma.rn.f(float %1791, float 0x3E54AE0C00000000, float %.0.i1224) #6, !dbg !21 + %.01.i1226 = select i1 %.not4.i1225, float %2837, float %2836, !dbg !21 + %2838 = bitcast float %.04.i1222 to i32, !dbg !21 + %2839 = shl i32 %2838, 23, !dbg !21 + %2840 = bitcast i32 %2839 to float, !dbg !21 + %2841 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1226) #6, !dbg !21 + %2842 = fmul float %2841, %2840, !dbg !21 + %2843 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1227 = icmp eq i32 %2843, 0, !dbg !21 + %2844 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2845 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1228 = select i1 %.not.i1227, float %2845, float %2844, !dbg !21 + %2846 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1229 = icmp eq i32 %2846, 0, !dbg !21 + %2847 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1228) #6, !dbg !21 + %2848 = tail call float @llvm.nvvm.saturate.f(float %.02.i1228) #6, !dbg !21 + %.03.i1230 = select i1 %.not1.i1229, float %2848, float %2847, !dbg !21 + %2849 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1231 = icmp eq i32 %2849, 0, !dbg !21 + %2850 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2851 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1232 = select i1 %.not2.i1231, float %2851, float %2850, !dbg !21 + %2852 = fadd float %.04.i1232, 0xC168000FE0000000, !dbg !21 + %2853 = fneg float %2852, !dbg !21 + %2854 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1233 = icmp eq i32 %2854, 0, !dbg !21 + %2855 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3FF7154760000000, float %2853) #6, !dbg !21 + %2856 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3FF7154760000000, float %2853) #6, !dbg !21 + %.0.i1234 = select i1 %.not3.i1233, float %2856, float %2855, !dbg !21 + %2857 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1235 = icmp eq i32 %2857, 0, !dbg !21 + %2858 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1792, float 0x3E54AE0C00000000, float %.0.i1234) #6, !dbg !21 + %2859 = tail call float @llvm.nvvm.fma.rn.f(float %1792, float 0x3E54AE0C00000000, float %.0.i1234) #6, !dbg !21 + %.01.i1236 = select i1 %.not4.i1235, float %2859, float %2858, !dbg !21 + %2860 = bitcast float %.04.i1232 to i32, !dbg !21 + %2861 = shl i32 %2860, 23, !dbg !21 + %2862 = bitcast i32 %2861 to float, !dbg !21 + %2863 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1236) #6, !dbg !21 + %2864 = fmul float %2863, %2862, !dbg !21 + %2865 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1237 = icmp eq i32 %2865, 0, !dbg !21 + %2866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2867 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1238 = select i1 %.not.i1237, float %2867, float %2866, !dbg !21 + %2868 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1239 = icmp eq i32 %2868, 0, !dbg !21 + %2869 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1238) #6, !dbg !21 + %2870 = tail call float @llvm.nvvm.saturate.f(float %.02.i1238) #6, !dbg !21 + %.03.i1240 = select i1 %.not1.i1239, float %2870, float %2869, !dbg !21 + %2871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1241 = icmp eq i32 %2871, 0, !dbg !21 + %2872 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2873 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1242 = select i1 %.not2.i1241, float %2873, float %2872, !dbg !21 + %2874 = fadd float %.04.i1242, 0xC168000FE0000000, !dbg !21 + %2875 = fneg float %2874, !dbg !21 + %2876 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1243 = icmp eq i32 %2876, 0, !dbg !21 + %2877 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3FF7154760000000, float %2875) #6, !dbg !21 + %2878 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3FF7154760000000, float %2875) #6, !dbg !21 + %.0.i1244 = select i1 %.not3.i1243, float %2878, float %2877, !dbg !21 + %2879 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1245 = icmp eq i32 %2879, 0, !dbg !21 + %2880 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1793, float 0x3E54AE0C00000000, float %.0.i1244) #6, !dbg !21 + %2881 = tail call float @llvm.nvvm.fma.rn.f(float %1793, float 0x3E54AE0C00000000, float %.0.i1244) #6, !dbg !21 + %.01.i1246 = select i1 %.not4.i1245, float %2881, float %2880, !dbg !21 + %2882 = bitcast float %.04.i1242 to i32, !dbg !21 + %2883 = shl i32 %2882, 23, !dbg !21 + %2884 = bitcast i32 %2883 to float, !dbg !21 + %2885 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1246) #6, !dbg !21 + %2886 = fmul float %2885, %2884, !dbg !21 + %2887 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1247 = icmp eq i32 %2887, 0, !dbg !21 + %2888 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2889 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1248 = select i1 %.not.i1247, float %2889, float %2888, !dbg !21 + %2890 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1249 = icmp eq i32 %2890, 0, !dbg !21 + %2891 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1248) #6, !dbg !21 + %2892 = tail call float @llvm.nvvm.saturate.f(float %.02.i1248) #6, !dbg !21 + %.03.i1250 = select i1 %.not1.i1249, float %2892, float %2891, !dbg !21 + %2893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1251 = icmp eq i32 %2893, 0, !dbg !21 + %2894 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2895 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1252 = select i1 %.not2.i1251, float %2895, float %2894, !dbg !21 + %2896 = fadd float %.04.i1252, 0xC168000FE0000000, !dbg !21 + %2897 = fneg float %2896, !dbg !21 + %2898 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1253 = icmp eq i32 %2898, 0, !dbg !21 + %2899 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3FF7154760000000, float %2897) #6, !dbg !21 + %2900 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3FF7154760000000, float %2897) #6, !dbg !21 + %.0.i1254 = select i1 %.not3.i1253, float %2900, float %2899, !dbg !21 + %2901 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1255 = icmp eq i32 %2901, 0, !dbg !21 + %2902 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1794, float 0x3E54AE0C00000000, float %.0.i1254) #6, !dbg !21 + %2903 = tail call float @llvm.nvvm.fma.rn.f(float %1794, float 0x3E54AE0C00000000, float %.0.i1254) #6, !dbg !21 + %.01.i1256 = select i1 %.not4.i1255, float %2903, float %2902, !dbg !21 + %2904 = bitcast float %.04.i1252 to i32, !dbg !21 + %2905 = shl i32 %2904, 23, !dbg !21 + %2906 = bitcast i32 %2905 to float, !dbg !21 + %2907 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1256) #6, !dbg !21 + %2908 = fmul float %2907, %2906, !dbg !21 + %2909 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1257 = icmp eq i32 %2909, 0, !dbg !21 + %2910 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2911 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1258 = select i1 %.not.i1257, float %2911, float %2910, !dbg !21 + %2912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1259 = icmp eq i32 %2912, 0, !dbg !21 + %2913 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1258) #6, !dbg !21 + %2914 = tail call float @llvm.nvvm.saturate.f(float %.02.i1258) #6, !dbg !21 + %.03.i1260 = select i1 %.not1.i1259, float %2914, float %2913, !dbg !21 + %2915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1261 = icmp eq i32 %2915, 0, !dbg !21 + %2916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1262 = select i1 %.not2.i1261, float %2917, float %2916, !dbg !21 + %2918 = fadd float %.04.i1262, 0xC168000FE0000000, !dbg !21 + %2919 = fneg float %2918, !dbg !21 + %2920 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1263 = icmp eq i32 %2920, 0, !dbg !21 + %2921 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3FF7154760000000, float %2919) #6, !dbg !21 + %2922 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3FF7154760000000, float %2919) #6, !dbg !21 + %.0.i1264 = select i1 %.not3.i1263, float %2922, float %2921, !dbg !21 + %2923 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1265 = icmp eq i32 %2923, 0, !dbg !21 + %2924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1795, float 0x3E54AE0C00000000, float %.0.i1264) #6, !dbg !21 + %2925 = tail call float @llvm.nvvm.fma.rn.f(float %1795, float 0x3E54AE0C00000000, float %.0.i1264) #6, !dbg !21 + %.01.i1266 = select i1 %.not4.i1265, float %2925, float %2924, !dbg !21 + %2926 = bitcast float %.04.i1262 to i32, !dbg !21 + %2927 = shl i32 %2926, 23, !dbg !21 + %2928 = bitcast i32 %2927 to float, !dbg !21 + %2929 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1266) #6, !dbg !21 + %2930 = fmul float %2929, %2928, !dbg !21 + %2931 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1267 = icmp eq i32 %2931, 0, !dbg !21 + %2932 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2933 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1268 = select i1 %.not.i1267, float %2933, float %2932, !dbg !21 + %2934 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1269 = icmp eq i32 %2934, 0, !dbg !21 + %2935 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1268) #6, !dbg !21 + %2936 = tail call float @llvm.nvvm.saturate.f(float %.02.i1268) #6, !dbg !21 + %.03.i1270 = select i1 %.not1.i1269, float %2936, float %2935, !dbg !21 + %2937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1271 = icmp eq i32 %2937, 0, !dbg !21 + %2938 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2939 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1272 = select i1 %.not2.i1271, float %2939, float %2938, !dbg !21 + %2940 = fadd float %.04.i1272, 0xC168000FE0000000, !dbg !21 + %2941 = fneg float %2940, !dbg !21 + %2942 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1273 = icmp eq i32 %2942, 0, !dbg !21 + %2943 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3FF7154760000000, float %2941) #6, !dbg !21 + %2944 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3FF7154760000000, float %2941) #6, !dbg !21 + %.0.i1274 = select i1 %.not3.i1273, float %2944, float %2943, !dbg !21 + %2945 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1275 = icmp eq i32 %2945, 0, !dbg !21 + %2946 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1796, float 0x3E54AE0C00000000, float %.0.i1274) #6, !dbg !21 + %2947 = tail call float @llvm.nvvm.fma.rn.f(float %1796, float 0x3E54AE0C00000000, float %.0.i1274) #6, !dbg !21 + %.01.i1276 = select i1 %.not4.i1275, float %2947, float %2946, !dbg !21 + %2948 = bitcast float %.04.i1272 to i32, !dbg !21 + %2949 = shl i32 %2948, 23, !dbg !21 + %2950 = bitcast i32 %2949 to float, !dbg !21 + %2951 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1276) #6, !dbg !21 + %2952 = fmul float %2951, %2950, !dbg !21 + %2953 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1277 = icmp eq i32 %2953, 0, !dbg !21 + %2954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2955 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1278 = select i1 %.not.i1277, float %2955, float %2954, !dbg !21 + %2956 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1279 = icmp eq i32 %2956, 0, !dbg !21 + %2957 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1278) #6, !dbg !21 + %2958 = tail call float @llvm.nvvm.saturate.f(float %.02.i1278) #6, !dbg !21 + %.03.i1280 = select i1 %.not1.i1279, float %2958, float %2957, !dbg !21 + %2959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1281 = icmp eq i32 %2959, 0, !dbg !21 + %2960 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2961 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1282 = select i1 %.not2.i1281, float %2961, float %2960, !dbg !21 + %2962 = fadd float %.04.i1282, 0xC168000FE0000000, !dbg !21 + %2963 = fneg float %2962, !dbg !21 + %2964 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1283 = icmp eq i32 %2964, 0, !dbg !21 + %2965 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3FF7154760000000, float %2963) #6, !dbg !21 + %2966 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3FF7154760000000, float %2963) #6, !dbg !21 + %.0.i1284 = select i1 %.not3.i1283, float %2966, float %2965, !dbg !21 + %2967 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1285 = icmp eq i32 %2967, 0, !dbg !21 + %2968 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1797, float 0x3E54AE0C00000000, float %.0.i1284) #6, !dbg !21 + %2969 = tail call float @llvm.nvvm.fma.rn.f(float %1797, float 0x3E54AE0C00000000, float %.0.i1284) #6, !dbg !21 + %.01.i1286 = select i1 %.not4.i1285, float %2969, float %2968, !dbg !21 + %2970 = bitcast float %.04.i1282 to i32, !dbg !21 + %2971 = shl i32 %2970, 23, !dbg !21 + %2972 = bitcast i32 %2971 to float, !dbg !21 + %2973 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1286) #6, !dbg !21 + %2974 = fmul float %2973, %2972, !dbg !21 + %2975 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1287 = icmp eq i32 %2975, 0, !dbg !21 + %2976 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2977 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1288 = select i1 %.not.i1287, float %2977, float %2976, !dbg !21 + %2978 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1289 = icmp eq i32 %2978, 0, !dbg !21 + %2979 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1288) #6, !dbg !21 + %2980 = tail call float @llvm.nvvm.saturate.f(float %.02.i1288) #6, !dbg !21 + %.03.i1290 = select i1 %.not1.i1289, float %2980, float %2979, !dbg !21 + %2981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1291 = icmp eq i32 %2981, 0, !dbg !21 + %2982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %2983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1292 = select i1 %.not2.i1291, float %2983, float %2982, !dbg !21 + %2984 = fadd float %.04.i1292, 0xC168000FE0000000, !dbg !21 + %2985 = fneg float %2984, !dbg !21 + %2986 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1293 = icmp eq i32 %2986, 0, !dbg !21 + %2987 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3FF7154760000000, float %2985) #6, !dbg !21 + %2988 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3FF7154760000000, float %2985) #6, !dbg !21 + %.0.i1294 = select i1 %.not3.i1293, float %2988, float %2987, !dbg !21 + %2989 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1295 = icmp eq i32 %2989, 0, !dbg !21 + %2990 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1798, float 0x3E54AE0C00000000, float %.0.i1294) #6, !dbg !21 + %2991 = tail call float @llvm.nvvm.fma.rn.f(float %1798, float 0x3E54AE0C00000000, float %.0.i1294) #6, !dbg !21 + %.01.i1296 = select i1 %.not4.i1295, float %2991, float %2990, !dbg !21 + %2992 = bitcast float %.04.i1292 to i32, !dbg !21 + %2993 = shl i32 %2992, 23, !dbg !21 + %2994 = bitcast i32 %2993 to float, !dbg !21 + %2995 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1296) #6, !dbg !21 + %2996 = fmul float %2995, %2994, !dbg !21 + %2997 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1297 = icmp eq i32 %2997, 0, !dbg !21 + %2998 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %2999 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1298 = select i1 %.not.i1297, float %2999, float %2998, !dbg !21 + %3000 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1299 = icmp eq i32 %3000, 0, !dbg !21 + %3001 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1298) #6, !dbg !21 + %3002 = tail call float @llvm.nvvm.saturate.f(float %.02.i1298) #6, !dbg !21 + %.03.i1300 = select i1 %.not1.i1299, float %3002, float %3001, !dbg !21 + %3003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1301 = icmp eq i32 %3003, 0, !dbg !21 + %3004 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3005 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1302 = select i1 %.not2.i1301, float %3005, float %3004, !dbg !21 + %3006 = fadd float %.04.i1302, 0xC168000FE0000000, !dbg !21 + %3007 = fneg float %3006, !dbg !21 + %3008 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1303 = icmp eq i32 %3008, 0, !dbg !21 + %3009 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3FF7154760000000, float %3007) #6, !dbg !21 + %3010 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3FF7154760000000, float %3007) #6, !dbg !21 + %.0.i1304 = select i1 %.not3.i1303, float %3010, float %3009, !dbg !21 + %3011 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1305 = icmp eq i32 %3011, 0, !dbg !21 + %3012 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1799, float 0x3E54AE0C00000000, float %.0.i1304) #6, !dbg !21 + %3013 = tail call float @llvm.nvvm.fma.rn.f(float %1799, float 0x3E54AE0C00000000, float %.0.i1304) #6, !dbg !21 + %.01.i1306 = select i1 %.not4.i1305, float %3013, float %3012, !dbg !21 + %3014 = bitcast float %.04.i1302 to i32, !dbg !21 + %3015 = shl i32 %3014, 23, !dbg !21 + %3016 = bitcast i32 %3015 to float, !dbg !21 + %3017 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1306) #6, !dbg !21 + %3018 = fmul float %3017, %3016, !dbg !21 + %3019 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1307 = icmp eq i32 %3019, 0, !dbg !21 + %3020 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3021 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1308 = select i1 %.not.i1307, float %3021, float %3020, !dbg !21 + %3022 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1309 = icmp eq i32 %3022, 0, !dbg !21 + %3023 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1308) #6, !dbg !21 + %3024 = tail call float @llvm.nvvm.saturate.f(float %.02.i1308) #6, !dbg !21 + %.03.i1310 = select i1 %.not1.i1309, float %3024, float %3023, !dbg !21 + %3025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1311 = icmp eq i32 %3025, 0, !dbg !21 + %3026 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3027 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1312 = select i1 %.not2.i1311, float %3027, float %3026, !dbg !21 + %3028 = fadd float %.04.i1312, 0xC168000FE0000000, !dbg !21 + %3029 = fneg float %3028, !dbg !21 + %3030 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1313 = icmp eq i32 %3030, 0, !dbg !21 + %3031 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3FF7154760000000, float %3029) #6, !dbg !21 + %3032 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3FF7154760000000, float %3029) #6, !dbg !21 + %.0.i1314 = select i1 %.not3.i1313, float %3032, float %3031, !dbg !21 + %3033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1315 = icmp eq i32 %3033, 0, !dbg !21 + %3034 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1800, float 0x3E54AE0C00000000, float %.0.i1314) #6, !dbg !21 + %3035 = tail call float @llvm.nvvm.fma.rn.f(float %1800, float 0x3E54AE0C00000000, float %.0.i1314) #6, !dbg !21 + %.01.i1316 = select i1 %.not4.i1315, float %3035, float %3034, !dbg !21 + %3036 = bitcast float %.04.i1312 to i32, !dbg !21 + %3037 = shl i32 %3036, 23, !dbg !21 + %3038 = bitcast i32 %3037 to float, !dbg !21 + %3039 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1316) #6, !dbg !21 + %3040 = fmul float %3039, %3038, !dbg !21 + %3041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1317 = icmp eq i32 %3041, 0, !dbg !21 + %3042 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3043 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1318 = select i1 %.not.i1317, float %3043, float %3042, !dbg !21 + %3044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1319 = icmp eq i32 %3044, 0, !dbg !21 + %3045 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1318) #6, !dbg !21 + %3046 = tail call float @llvm.nvvm.saturate.f(float %.02.i1318) #6, !dbg !21 + %.03.i1320 = select i1 %.not1.i1319, float %3046, float %3045, !dbg !21 + %3047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1321 = icmp eq i32 %3047, 0, !dbg !21 + %3048 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3049 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1322 = select i1 %.not2.i1321, float %3049, float %3048, !dbg !21 + %3050 = fadd float %.04.i1322, 0xC168000FE0000000, !dbg !21 + %3051 = fneg float %3050, !dbg !21 + %3052 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1323 = icmp eq i32 %3052, 0, !dbg !21 + %3053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3FF7154760000000, float %3051) #6, !dbg !21 + %3054 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3FF7154760000000, float %3051) #6, !dbg !21 + %.0.i1324 = select i1 %.not3.i1323, float %3054, float %3053, !dbg !21 + %3055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1325 = icmp eq i32 %3055, 0, !dbg !21 + %3056 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1801, float 0x3E54AE0C00000000, float %.0.i1324) #6, !dbg !21 + %3057 = tail call float @llvm.nvvm.fma.rn.f(float %1801, float 0x3E54AE0C00000000, float %.0.i1324) #6, !dbg !21 + %.01.i1326 = select i1 %.not4.i1325, float %3057, float %3056, !dbg !21 + %3058 = bitcast float %.04.i1322 to i32, !dbg !21 + %3059 = shl i32 %3058, 23, !dbg !21 + %3060 = bitcast i32 %3059 to float, !dbg !21 + %3061 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1326) #6, !dbg !21 + %3062 = fmul float %3061, %3060, !dbg !21 + %3063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1327 = icmp eq i32 %3063, 0, !dbg !21 + %3064 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3065 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1328 = select i1 %.not.i1327, float %3065, float %3064, !dbg !21 + %3066 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1329 = icmp eq i32 %3066, 0, !dbg !21 + %3067 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1328) #6, !dbg !21 + %3068 = tail call float @llvm.nvvm.saturate.f(float %.02.i1328) #6, !dbg !21 + %.03.i1330 = select i1 %.not1.i1329, float %3068, float %3067, !dbg !21 + %3069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not2.i1331 = icmp eq i32 %3069, 0, !dbg !21 + %3070 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3071 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %.04.i1332 = select i1 %.not2.i1331, float %3071, float %3070, !dbg !21 + %3072 = fadd float %.04.i1332, 0xC168000FE0000000, !dbg !21 + %3073 = fneg float %3072, !dbg !21 + %3074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1333 = icmp eq i32 %3074, 0, !dbg !21 + %3075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3FF7154760000000, float %3073) #6, !dbg !21 + %3076 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3FF7154760000000, float %3073) #6, !dbg !21 + %.0.i1334 = select i1 %.not3.i1333, float %3076, float %3075, !dbg !21 + %3077 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1335 = icmp eq i32 %3077, 0, !dbg !21 + %3078 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1802, float 0x3E54AE0C00000000, float %.0.i1334) #6, !dbg !21 + %3079 = tail call float @llvm.nvvm.fma.rn.f(float %1802, float 0x3E54AE0C00000000, float %.0.i1334) #6, !dbg !21 + %.01.i1336 = select i1 %.not4.i1335, float %3079, float %3078, !dbg !21 + %3080 = bitcast float %.04.i1332 to i32, !dbg !21 + %3081 = shl i32 %3080, 23, !dbg !21 + %3082 = bitcast i32 %3081 to float, !dbg !21 + %3083 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1336) #6, !dbg !21 + %3084 = fmul float %3083, %3082, !dbg !21 + %3085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1337 = icmp eq i32 %3085, 0, !dbg !21 + %3086 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3087 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1338 = select i1 %.not.i1337, float %3087, float %3086, !dbg !21 + %3088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1339 = icmp eq i32 %3088, 0, !dbg !21 + %3089 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1338) #6, !dbg !21 + %3090 = tail call float @llvm.nvvm.saturate.f(float %.02.i1338) #6, !dbg !21 + %.03.i1340 = select i1 %.not1.i1339, float %3090, float %3089, !dbg !21 + %3091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3092 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3093 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1343 = icmp eq i32 %3094, 0, !dbg !21 + %3095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1345 = icmp eq i32 %3095, 0, !dbg !21 + %3096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1347 = icmp eq i32 %3096, 0, !dbg !21 + %3097 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3098 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1348 = select i1 %.not.i1347, float %3098, float %3097, !dbg !21 + %3099 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1349 = icmp eq i32 %3099, 0, !dbg !21 + %3100 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1348) #6, !dbg !21 + %3101 = tail call float @llvm.nvvm.saturate.f(float %.02.i1348) #6, !dbg !21 + %.03.i1350 = select i1 %.not1.i1349, float %3101, float %3100, !dbg !21 + %3102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3103 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3104 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1353 = icmp eq i32 %3105, 0, !dbg !21 + %3106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1355 = icmp eq i32 %3106, 0, !dbg !21 + %3107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1357 = icmp eq i32 %3107, 0, !dbg !21 + %3108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3109 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1358 = select i1 %.not.i1357, float %3109, float %3108, !dbg !21 + %3110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1359 = icmp eq i32 %3110, 0, !dbg !21 + %3111 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1358) #6, !dbg !21 + %3112 = tail call float @llvm.nvvm.saturate.f(float %.02.i1358) #6, !dbg !21 + %.03.i1360 = select i1 %.not1.i1359, float %3112, float %3111, !dbg !21 + %3113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3114 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3115 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1363 = icmp eq i32 %3116, 0, !dbg !21 + %3117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1365 = icmp eq i32 %3117, 0, !dbg !21 + %3118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1367 = icmp eq i32 %3118, 0, !dbg !21 + %3119 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3120 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1368 = select i1 %.not.i1367, float %3120, float %3119, !dbg !21 + %3121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1369 = icmp eq i32 %3121, 0, !dbg !21 + %3122 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1368) #6, !dbg !21 + %3123 = tail call float @llvm.nvvm.saturate.f(float %.02.i1368) #6, !dbg !21 + %.03.i1370 = select i1 %.not1.i1369, float %3123, float %3122, !dbg !21 + %3124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3125 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3126 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1373 = icmp eq i32 %3127, 0, !dbg !21 + %3128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1375 = icmp eq i32 %3128, 0, !dbg !21 + %3129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1377 = icmp eq i32 %3129, 0, !dbg !21 + %3130 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3131 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1378 = select i1 %.not.i1377, float %3131, float %3130, !dbg !21 + %3132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1379 = icmp eq i32 %3132, 0, !dbg !21 + %3133 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1378) #6, !dbg !21 + %3134 = tail call float @llvm.nvvm.saturate.f(float %.02.i1378) #6, !dbg !21 + %.03.i1380 = select i1 %.not1.i1379, float %3134, float %3133, !dbg !21 + %3135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3136 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3137 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1383 = icmp eq i32 %3138, 0, !dbg !21 + %3139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1385 = icmp eq i32 %3139, 0, !dbg !21 + %3140 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not.i1387 = icmp eq i32 %3140, 0, !dbg !21 + %3141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %3142 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !21 + %.02.i1388 = select i1 %.not.i1387, float %3142, float %3141, !dbg !21 + %3143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not1.i1389 = icmp eq i32 %3143, 0, !dbg !21 + %3144 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1388) #6, !dbg !21 + %3145 = tail call float @llvm.nvvm.saturate.f(float %.02.i1388) #6, !dbg !21 + %.03.i1390 = select i1 %.not1.i1389, float %3145, float %3144, !dbg !21 + %3146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %3147 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3148 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !21 + %3149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not3.i1393 = icmp eq i32 %3149, 0, !dbg !21 + %3150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !21 + %.not4.i1395 = icmp eq i32 %3150, 0, !dbg !21 + %3151 = fmul float %424, 0.000000e+00, !dbg !23 + %3152 = fmul float %446, 0.000000e+00, !dbg !23 + %3153 = fmul float %468, 0.000000e+00, !dbg !23 + %3154 = fmul float %490, 0.000000e+00, !dbg !23 + %3155 = fmul float %512, 0.000000e+00, !dbg !23 + %3156 = fmul float %534, 0.000000e+00, !dbg !23 + %3157 = fmul float %556, 0.000000e+00, !dbg !23 + %3158 = fmul float %578, 0.000000e+00, !dbg !23 + %3159 = fmul float %600, 0.000000e+00, !dbg !23 + %3160 = fmul float %622, 0.000000e+00, !dbg !23 + %3161 = fmul float %644, 0.000000e+00, !dbg !23 + %3162 = fmul float %666, 0.000000e+00, !dbg !23 + %3163 = fmul float %688, 0.000000e+00, !dbg !23 + %3164 = fmul float %710, 0.000000e+00, !dbg !23 + %3165 = fmul float %732, 0.000000e+00, !dbg !23 + %3166 = fmul float %754, 0.000000e+00, !dbg !23 + %3167 = fmul float %776, 0.000000e+00, !dbg !23 + %3168 = fmul float %798, 0.000000e+00, !dbg !23 + %3169 = fmul float %820, 0.000000e+00, !dbg !23 + %3170 = fmul float %842, 0.000000e+00, !dbg !23 + %3171 = fmul float %864, 0.000000e+00, !dbg !23 + %3172 = fmul float %886, 0.000000e+00, !dbg !23 + %3173 = fmul float %908, 0.000000e+00, !dbg !23 + %3174 = fmul float %930, 0.000000e+00, !dbg !23 + %3175 = fmul float %952, 0.000000e+00, !dbg !23 + %3176 = fmul float %974, 0.000000e+00, !dbg !23 + %3177 = fmul float %996, 0.000000e+00, !dbg !23 + %3178 = fmul float %1018, 0.000000e+00, !dbg !23 + %3179 = fmul float %1040, 0.000000e+00, !dbg !23 + %3180 = fmul float %1062, 0.000000e+00, !dbg !23 + %3181 = fmul float %1084, 0.000000e+00, !dbg !23 + %3182 = fmul float %1106, 0.000000e+00, !dbg !23 + %3183 = fmul float %1128, 0.000000e+00, !dbg !23 + %3184 = fmul float %1150, 0.000000e+00, !dbg !23 + %3185 = fmul float %1172, 0.000000e+00, !dbg !23 + %3186 = fmul float %1194, 0.000000e+00, !dbg !23 + %3187 = fmul float %1216, 0.000000e+00, !dbg !23 + %3188 = fmul float %1238, 0.000000e+00, !dbg !23 + %3189 = fmul float %1260, 0.000000e+00, !dbg !23 + %3190 = fmul float %1282, 0.000000e+00, !dbg !23 + %3191 = fmul float %1304, 0.000000e+00, !dbg !23 + %3192 = fmul float %1326, 0.000000e+00, !dbg !23 + %3193 = fmul float %1348, 0.000000e+00, !dbg !23 + %3194 = fmul float %1370, 0.000000e+00, !dbg !23 + %3195 = fmul float %1392, 0.000000e+00, !dbg !23 + %3196 = fmul float %1414, 0.000000e+00, !dbg !23 + %3197 = fmul float %1436, 0.000000e+00, !dbg !23 + %3198 = fmul float %1458, 0.000000e+00, !dbg !23 + %3199 = fmul float %1480, 0.000000e+00, !dbg !23 + %3200 = fmul float %1502, 0.000000e+00, !dbg !23 + %3201 = fmul float %1524, 0.000000e+00, !dbg !23 + %3202 = fmul float %1546, 0.000000e+00, !dbg !23 + %3203 = fmul float %1568, 0.000000e+00, !dbg !23 + %3204 = fmul float %1590, 0.000000e+00, !dbg !23 + %3205 = fmul float %1612, 0.000000e+00, !dbg !23 + %3206 = fmul float %1634, 0.000000e+00, !dbg !23 + %3207 = fmul float %1656, 0.000000e+00, !dbg !23 + %3208 = fmul float %1678, 0.000000e+00, !dbg !23 + %3209 = fadd float %3151, %1830, !dbg !24 + %3210 = select i1 %281, float 1.000000e+00, float %3209, !dbg !24 + %3211 = fadd float %3152, %1852, !dbg !24 + %3212 = select i1 %282, float 1.000000e+00, float %3211, !dbg !24 + %3213 = fadd float %3153, %1874, !dbg !24 + %3214 = select i1 %283, float 1.000000e+00, float %3213, !dbg !24 + %3215 = fadd float %3154, %1896, !dbg !24 + %3216 = select i1 %284, float 1.000000e+00, float %3215, !dbg !24 + %3217 = fadd float %3155, %1918, !dbg !24 + %3218 = select i1 %285, float 1.000000e+00, float %3217, !dbg !24 + %3219 = fadd float %3156, %1940, !dbg !24 + %3220 = select i1 %286, float 1.000000e+00, float %3219, !dbg !24 + %3221 = fadd float %3157, %1962, !dbg !24 + %3222 = select i1 %287, float 1.000000e+00, float %3221, !dbg !24 + %3223 = fadd float %3158, %1984, !dbg !24 + %3224 = select i1 %288, float 1.000000e+00, float %3223, !dbg !24 + %3225 = fadd float %3159, %2006, !dbg !24 + %3226 = select i1 %289, float 1.000000e+00, float %3225, !dbg !24 + %3227 = fadd float %3160, %2028, !dbg !24 + %3228 = select i1 %290, float 1.000000e+00, float %3227, !dbg !24 + %3229 = fadd float %3161, %2050, !dbg !24 + %3230 = select i1 %291, float 1.000000e+00, float %3229, !dbg !24 + %3231 = fadd float %3162, %2072, !dbg !24 + %3232 = select i1 %292, float 1.000000e+00, float %3231, !dbg !24 + %3233 = fadd float %3163, %2094, !dbg !24 + %3234 = select i1 %293, float 1.000000e+00, float %3233, !dbg !24 + %3235 = fadd float %3164, %2116, !dbg !24 + %3236 = select i1 %294, float 1.000000e+00, float %3235, !dbg !24 + %3237 = fadd float %3165, %2138, !dbg !24 + %3238 = select i1 %295, float 1.000000e+00, float %3237, !dbg !24 + %3239 = fadd float %3166, %2160, !dbg !24 + %3240 = select i1 %296, float 1.000000e+00, float %3239, !dbg !24 + %3241 = fadd float %3167, %2182, !dbg !24 + %3242 = select i1 %297, float 1.000000e+00, float %3241, !dbg !24 + %3243 = fadd float %3168, %2204, !dbg !24 + %3244 = select i1 %298, float 1.000000e+00, float %3243, !dbg !24 + %3245 = fadd float %3169, %2226, !dbg !24 + %3246 = select i1 %299, float 1.000000e+00, float %3245, !dbg !24 + %3247 = fadd float %3170, %2248, !dbg !24 + %3248 = select i1 %300, float 1.000000e+00, float %3247, !dbg !24 + %3249 = fadd float %3171, %2270, !dbg !24 + %3250 = select i1 %301, float 1.000000e+00, float %3249, !dbg !24 + %3251 = fadd float %3172, %2292, !dbg !24 + %3252 = select i1 %302, float 1.000000e+00, float %3251, !dbg !24 + %3253 = fadd float %3173, %2314, !dbg !24 + %3254 = select i1 %303, float 1.000000e+00, float %3253, !dbg !24 + %3255 = fadd float %3174, %2336, !dbg !24 + %3256 = select i1 %304, float 1.000000e+00, float %3255, !dbg !24 + %3257 = fadd float %3175, %2358, !dbg !24 + %3258 = select i1 %305, float 1.000000e+00, float %3257, !dbg !24 + %3259 = fadd float %3176, %2380, !dbg !24 + %3260 = select i1 %306, float 1.000000e+00, float %3259, !dbg !24 + %3261 = fadd float %3177, %2402, !dbg !24 + %3262 = select i1 %307, float 1.000000e+00, float %3261, !dbg !24 + %3263 = fadd float %3178, %2424, !dbg !24 + %3264 = select i1 %308, float 1.000000e+00, float %3263, !dbg !24 + %3265 = fadd float %3179, %2446, !dbg !24 + %3266 = select i1 %309, float 1.000000e+00, float %3265, !dbg !24 + %3267 = fadd float %3180, %2468, !dbg !24 + %3268 = select i1 %310, float 1.000000e+00, float %3267, !dbg !24 + %3269 = fadd float %3181, %2490, !dbg !24 + %3270 = select i1 %311, float 1.000000e+00, float %3269, !dbg !24 + %3271 = fadd float %3182, %2512, !dbg !24 + %3272 = select i1 %312, float 1.000000e+00, float %3271, !dbg !24 + %3273 = fadd float %3183, %2534, !dbg !24 + %3274 = select i1 %313, float 1.000000e+00, float %3273, !dbg !24 + %3275 = fadd float %3184, %2556, !dbg !24 + %3276 = select i1 %314, float 1.000000e+00, float %3275, !dbg !24 + %3277 = fadd float %3185, %2578, !dbg !24 + %3278 = select i1 %315, float 1.000000e+00, float %3277, !dbg !24 + %3279 = fadd float %3186, %2600, !dbg !24 + %3280 = select i1 %316, float 1.000000e+00, float %3279, !dbg !24 + %3281 = fadd float %3187, %2622, !dbg !24 + %3282 = select i1 %317, float 1.000000e+00, float %3281, !dbg !24 + %3283 = fadd float %3188, %2644, !dbg !24 + %3284 = select i1 %318, float 1.000000e+00, float %3283, !dbg !24 + %3285 = fadd float %3189, %2666, !dbg !24 + %3286 = select i1 %319, float 1.000000e+00, float %3285, !dbg !24 + %3287 = fadd float %3190, %2688, !dbg !24 + %3288 = select i1 %320, float 1.000000e+00, float %3287, !dbg !24 + %3289 = fadd float %3191, %2710, !dbg !24 + %3290 = select i1 %321, float 1.000000e+00, float %3289, !dbg !24 + %3291 = fadd float %3192, %2732, !dbg !24 + %3292 = select i1 %322, float 1.000000e+00, float %3291, !dbg !24 + %3293 = fadd float %3193, %2754, !dbg !24 + %3294 = select i1 %323, float 1.000000e+00, float %3293, !dbg !24 + %3295 = fadd float %3194, %2776, !dbg !24 + %3296 = select i1 %324, float 1.000000e+00, float %3295, !dbg !24 + %3297 = fadd float %3195, %2798, !dbg !24 + %3298 = select i1 %325, float 1.000000e+00, float %3297, !dbg !24 + %3299 = fadd float %3196, %2820, !dbg !24 + %3300 = select i1 %326, float 1.000000e+00, float %3299, !dbg !24 + %3301 = fadd float %3197, %2842, !dbg !24 + %3302 = select i1 %327, float 1.000000e+00, float %3301, !dbg !24 + %3303 = fadd float %3198, %2864, !dbg !24 + %3304 = select i1 %328, float 1.000000e+00, float %3303, !dbg !24 + %3305 = fadd float %3199, %2886, !dbg !24 + %3306 = select i1 %329, float 1.000000e+00, float %3305, !dbg !24 + %3307 = fadd float %3200, %2908, !dbg !24 + %3308 = select i1 %330, float 1.000000e+00, float %3307, !dbg !24 + %3309 = fadd float %3201, %2930, !dbg !24 + %3310 = select i1 %331, float 1.000000e+00, float %3309, !dbg !24 + %3311 = fadd float %3202, %2952, !dbg !24 + %3312 = select i1 %332, float 1.000000e+00, float %3311, !dbg !24 + %3313 = fadd float %3203, %2974, !dbg !24 + %3314 = select i1 %333, float 1.000000e+00, float %3313, !dbg !24 + %3315 = fadd float %3204, %2996, !dbg !24 + %3316 = select i1 %334, float 1.000000e+00, float %3315, !dbg !24 + %3317 = fadd float %3205, %3018, !dbg !24 + %3318 = select i1 %335, float 1.000000e+00, float %3317, !dbg !24 + %3319 = fadd float %3206, %3040, !dbg !24 + %3320 = select i1 %336, float 1.000000e+00, float %3319, !dbg !24 + %3321 = fadd float %3207, %3062, !dbg !24 + %3322 = select i1 %337, float 1.000000e+00, float %3321, !dbg !24 + %3323 = fadd float %3208, %3084, !dbg !24 + %3324 = select i1 %338, float 1.000000e+00, float %3323, !dbg !24 + %3325 = select i1 %16, float %267, float 0xFFF0000000000000, !dbg !25 + %3326 = select i1 %16, float %268, float 0xFFF0000000000000, !dbg !25 + %3327 = select i1 %16, float %270, float 0xFFF0000000000000, !dbg !25 + %3328 = select i1 %16, float %272, float 0xFFF0000000000000, !dbg !25 + %3329 = select i1 %16, float %274, float 0xFFF0000000000000, !dbg !25 + %3330 = select i1 %16, float %276, float 0xFFF0000000000000, !dbg !25 + %3331 = select i1 %16, float %278, float 0xFFF0000000000000, !dbg !25 + %3332 = select i1 %16, float %280, float 0xFFF0000000000000, !dbg !25 + %3333 = select i1 %16, float %3322, float 0.000000e+00, !dbg !26 + %3334 = select i1 %16, float %3324, float 0.000000e+00, !dbg !26 + %3335 = fcmp ogt bfloat %83, %84, !dbg !27 + %3336 = fcmp uno bfloat %83, 0xR0000, !dbg !29 + %3337 = or i1 %3335, %3336, !dbg !30 + %3338 = select i1 %3337, float %211, float %212, !dbg !31 + %3339 = fcmp ogt float %3338, %213, !dbg !27 + %3340 = fcmp uno float %3338, 0.000000e+00, !dbg !29 + %3341 = or i1 %3339, %3340, !dbg !30 + %3342 = select i1 %3341, float %3338, float %213, !dbg !31 + %3343 = fcmp ogt float %3342, %214, !dbg !27 + %3344 = fcmp uno float %3342, 0.000000e+00, !dbg !29 + %3345 = or i1 %3343, %3344, !dbg !30 + %3346 = select i1 %3345, float %3342, float %214, !dbg !31 + %3347 = fcmp ogt float %3346, %215, !dbg !27 + %3348 = fcmp uno float %3346, 0.000000e+00, !dbg !29 + %3349 = or i1 %3347, %3348, !dbg !30 + %3350 = select i1 %3349, float %3346, float %215, !dbg !31 + %3351 = fcmp ogt float %3350, %216, !dbg !27 + %3352 = fcmp uno float %3350, 0.000000e+00, !dbg !29 + %3353 = or i1 %3351, %3352, !dbg !30 + %3354 = select i1 %3353, float %3350, float %216, !dbg !31 + %3355 = fcmp ogt float %3354, %217, !dbg !27 + %3356 = fcmp uno float %3354, 0.000000e+00, !dbg !29 + %3357 = or i1 %3355, %3356, !dbg !30 + %3358 = select i1 %3357, float %3354, float %217, !dbg !31 + %3359 = fcmp ogt float %3358, %218, !dbg !27 + %3360 = fcmp uno float %3358, 0.000000e+00, !dbg !29 + %3361 = or i1 %3359, %3360, !dbg !30 + %3362 = select i1 %3361, float %3358, float %218, !dbg !31 + %3363 = fcmp ogt float %3362, %219, !dbg !27 + %3364 = fcmp uno float %3362, 0.000000e+00, !dbg !29 + %3365 = or i1 %3363, %3364, !dbg !30 + %3366 = select i1 %3365, float %3362, float %219, !dbg !31 + %3367 = fcmp ogt float %3366, %220, !dbg !27 + %3368 = fcmp uno float %3366, 0.000000e+00, !dbg !29 + %3369 = or i1 %3367, %3368, !dbg !30 + %3370 = select i1 %3369, float %3366, float %220, !dbg !31 + %3371 = fcmp ogt float %3370, %221, !dbg !27 + %3372 = fcmp uno float %3370, 0.000000e+00, !dbg !29 + %3373 = or i1 %3371, %3372, !dbg !30 + %3374 = select i1 %3373, float %3370, float %221, !dbg !31 + %3375 = fcmp ogt float %3374, %222, !dbg !27 + %3376 = fcmp uno float %3374, 0.000000e+00, !dbg !29 + %3377 = or i1 %3375, %3376, !dbg !30 + %3378 = select i1 %3377, float %3374, float %222, !dbg !31 + %3379 = fcmp ogt float %3378, %223, !dbg !27 + %3380 = fcmp uno float %3378, 0.000000e+00, !dbg !29 + %3381 = or i1 %3379, %3380, !dbg !30 + %3382 = select i1 %3381, float %3378, float %223, !dbg !31 + %3383 = fcmp ogt float %3382, %224, !dbg !27 + %3384 = fcmp uno float %3382, 0.000000e+00, !dbg !29 + %3385 = or i1 %3383, %3384, !dbg !30 + %3386 = select i1 %3385, float %3382, float %224, !dbg !31 + %3387 = fcmp ogt float %3386, %225, !dbg !27 + %3388 = fcmp uno float %3386, 0.000000e+00, !dbg !29 + %3389 = or i1 %3387, %3388, !dbg !30 + %3390 = select i1 %3389, float %3386, float %225, !dbg !31 + %3391 = fcmp ogt float %3390, %226, !dbg !27 + %3392 = fcmp uno float %3390, 0.000000e+00, !dbg !29 + %3393 = or i1 %3391, %3392, !dbg !30 + %3394 = select i1 %3393, float %3390, float %226, !dbg !31 + %3395 = fcmp ogt float %3394, %227, !dbg !27 + %3396 = fcmp uno float %3394, 0.000000e+00, !dbg !29 + %3397 = or i1 %3395, %3396, !dbg !30 + %3398 = select i1 %3397, float %3394, float %227, !dbg !31 + %3399 = fcmp ogt float %3398, %228, !dbg !27 + %3400 = fcmp uno float %3398, 0.000000e+00, !dbg !29 + %3401 = or i1 %3399, %3400, !dbg !30 + %3402 = select i1 %3401, float %3398, float %228, !dbg !31 + %3403 = fcmp ogt float %3402, %229, !dbg !27 + %3404 = fcmp uno float %3402, 0.000000e+00, !dbg !29 + %3405 = or i1 %3403, %3404, !dbg !30 + %3406 = select i1 %3405, float %3402, float %229, !dbg !31 + %3407 = fcmp ogt float %3406, %230, !dbg !27 + %3408 = fcmp uno float %3406, 0.000000e+00, !dbg !29 + %3409 = or i1 %3407, %3408, !dbg !30 + %3410 = select i1 %3409, float %3406, float %230, !dbg !31 + %3411 = fcmp ogt float %3410, %231, !dbg !27 + %3412 = fcmp uno float %3410, 0.000000e+00, !dbg !29 + %3413 = or i1 %3411, %3412, !dbg !30 + %3414 = select i1 %3413, float %3410, float %231, !dbg !31 + %3415 = fcmp ogt float %3414, %232, !dbg !27 + %3416 = fcmp uno float %3414, 0.000000e+00, !dbg !29 + %3417 = or i1 %3415, %3416, !dbg !30 + %3418 = select i1 %3417, float %3414, float %232, !dbg !31 + %3419 = fcmp ogt float %3418, %233, !dbg !27 + %3420 = fcmp uno float %3418, 0.000000e+00, !dbg !29 + %3421 = or i1 %3419, %3420, !dbg !30 + %3422 = select i1 %3421, float %3418, float %233, !dbg !31 + %3423 = fcmp ogt float %3422, %234, !dbg !27 + %3424 = fcmp uno float %3422, 0.000000e+00, !dbg !29 + %3425 = or i1 %3423, %3424, !dbg !30 + %3426 = select i1 %3425, float %3422, float %234, !dbg !31 + %3427 = fcmp ogt float %3426, %235, !dbg !27 + %3428 = fcmp uno float %3426, 0.000000e+00, !dbg !29 + %3429 = or i1 %3427, %3428, !dbg !30 + %3430 = select i1 %3429, float %3426, float %235, !dbg !31 + %3431 = fcmp ogt float %3430, %236, !dbg !27 + %3432 = fcmp uno float %3430, 0.000000e+00, !dbg !29 + %3433 = or i1 %3431, %3432, !dbg !30 + %3434 = select i1 %3433, float %3430, float %236, !dbg !31 + %3435 = fcmp ogt float %3434, %237, !dbg !27 + %3436 = fcmp uno float %3434, 0.000000e+00, !dbg !29 + %3437 = or i1 %3435, %3436, !dbg !30 + %3438 = select i1 %3437, float %3434, float %237, !dbg !31 + %3439 = fcmp ogt float %3438, %238, !dbg !27 + %3440 = fcmp uno float %3438, 0.000000e+00, !dbg !29 + %3441 = or i1 %3439, %3440, !dbg !30 + %3442 = select i1 %3441, float %3438, float %238, !dbg !31 + %3443 = fcmp ogt float %3442, %239, !dbg !27 + %3444 = fcmp uno float %3442, 0.000000e+00, !dbg !29 + %3445 = or i1 %3443, %3444, !dbg !30 + %3446 = select i1 %3445, float %3442, float %239, !dbg !31 + %3447 = fcmp ogt float %3446, %240, !dbg !27 + %3448 = fcmp uno float %3446, 0.000000e+00, !dbg !29 + %3449 = or i1 %3447, %3448, !dbg !30 + %3450 = select i1 %3449, float %3446, float %240, !dbg !31 + %3451 = fcmp ogt float %3450, %241, !dbg !27 + %3452 = fcmp uno float %3450, 0.000000e+00, !dbg !29 + %3453 = or i1 %3451, %3452, !dbg !30 + %3454 = select i1 %3453, float %3450, float %241, !dbg !31 + %3455 = fcmp ogt float %3454, %242, !dbg !27 + %3456 = fcmp uno float %3454, 0.000000e+00, !dbg !29 + %3457 = or i1 %3455, %3456, !dbg !30 + %3458 = select i1 %3457, float %3454, float %242, !dbg !31 + %3459 = fcmp ogt float %3458, %243, !dbg !27 + %3460 = fcmp uno float %3458, 0.000000e+00, !dbg !29 + %3461 = or i1 %3459, %3460, !dbg !30 + %3462 = select i1 %3461, float %3458, float %243, !dbg !31 + %3463 = fcmp ogt float %3462, %244, !dbg !27 + %3464 = fcmp uno float %3462, 0.000000e+00, !dbg !29 + %3465 = or i1 %3463, %3464, !dbg !30 + %3466 = select i1 %3465, float %3462, float %244, !dbg !31 + %3467 = fcmp ogt float %3466, %245, !dbg !27 + %3468 = fcmp uno float %3466, 0.000000e+00, !dbg !29 + %3469 = or i1 %3467, %3468, !dbg !30 + %3470 = select i1 %3469, float %3466, float %245, !dbg !31 + %3471 = fcmp ogt float %3470, %246, !dbg !27 + %3472 = fcmp uno float %3470, 0.000000e+00, !dbg !29 + %3473 = or i1 %3471, %3472, !dbg !30 + %3474 = select i1 %3473, float %3470, float %246, !dbg !31 + %3475 = fcmp ogt float %3474, %247, !dbg !27 + %3476 = fcmp uno float %3474, 0.000000e+00, !dbg !29 + %3477 = or i1 %3475, %3476, !dbg !30 + %3478 = select i1 %3477, float %3474, float %247, !dbg !31 + %3479 = fcmp ogt float %3478, %248, !dbg !27 + %3480 = fcmp uno float %3478, 0.000000e+00, !dbg !29 + %3481 = or i1 %3479, %3480, !dbg !30 + %3482 = select i1 %3481, float %3478, float %248, !dbg !31 + %3483 = fcmp ogt float %3482, %249, !dbg !27 + %3484 = fcmp uno float %3482, 0.000000e+00, !dbg !29 + %3485 = or i1 %3483, %3484, !dbg !30 + %3486 = select i1 %3485, float %3482, float %249, !dbg !31 + %3487 = fcmp ogt float %3486, %250, !dbg !27 + %3488 = fcmp uno float %3486, 0.000000e+00, !dbg !29 + %3489 = or i1 %3487, %3488, !dbg !30 + %3490 = select i1 %3489, float %3486, float %250, !dbg !31 + %3491 = fcmp ogt float %3490, %251, !dbg !27 + %3492 = fcmp uno float %3490, 0.000000e+00, !dbg !29 + %3493 = or i1 %3491, %3492, !dbg !30 + %3494 = select i1 %3493, float %3490, float %251, !dbg !31 + %3495 = fcmp ogt float %3494, %252, !dbg !27 + %3496 = fcmp uno float %3494, 0.000000e+00, !dbg !29 + %3497 = or i1 %3495, %3496, !dbg !30 + %3498 = select i1 %3497, float %3494, float %252, !dbg !31 + %3499 = fcmp ogt float %3498, %253, !dbg !27 + %3500 = fcmp uno float %3498, 0.000000e+00, !dbg !29 + %3501 = or i1 %3499, %3500, !dbg !30 + %3502 = select i1 %3501, float %3498, float %253, !dbg !31 + %3503 = fcmp ogt float %3502, %254, !dbg !27 + %3504 = fcmp uno float %3502, 0.000000e+00, !dbg !29 + %3505 = or i1 %3503, %3504, !dbg !30 + %3506 = select i1 %3505, float %3502, float %254, !dbg !31 + %3507 = fcmp ogt float %3506, %255, !dbg !27 + %3508 = fcmp uno float %3506, 0.000000e+00, !dbg !29 + %3509 = or i1 %3507, %3508, !dbg !30 + %3510 = select i1 %3509, float %3506, float %255, !dbg !31 + %3511 = fcmp ogt float %3510, %256, !dbg !27 + %3512 = fcmp uno float %3510, 0.000000e+00, !dbg !29 + %3513 = or i1 %3511, %3512, !dbg !30 + %3514 = select i1 %3513, float %3510, float %256, !dbg !31 + %3515 = fcmp ogt float %3514, %257, !dbg !27 + %3516 = fcmp uno float %3514, 0.000000e+00, !dbg !29 + %3517 = or i1 %3515, %3516, !dbg !30 + %3518 = select i1 %3517, float %3514, float %257, !dbg !31 + %3519 = fcmp ogt float %3518, %258, !dbg !27 + %3520 = fcmp uno float %3518, 0.000000e+00, !dbg !29 + %3521 = or i1 %3519, %3520, !dbg !30 + %3522 = select i1 %3521, float %3518, float %258, !dbg !31 + %3523 = fcmp ogt float %3522, %259, !dbg !27 + %3524 = fcmp uno float %3522, 0.000000e+00, !dbg !29 + %3525 = or i1 %3523, %3524, !dbg !30 + %3526 = select i1 %3525, float %3522, float %259, !dbg !31 + %3527 = fcmp ogt float %3526, %260, !dbg !27 + %3528 = fcmp uno float %3526, 0.000000e+00, !dbg !29 + %3529 = or i1 %3527, %3528, !dbg !30 + %3530 = select i1 %3529, float %3526, float %260, !dbg !31 + %3531 = fcmp ogt float %3530, %261, !dbg !27 + %3532 = fcmp uno float %3530, 0.000000e+00, !dbg !29 + %3533 = or i1 %3531, %3532, !dbg !30 + %3534 = select i1 %3533, float %3530, float %261, !dbg !31 + %3535 = fcmp ogt float %3534, %262, !dbg !27 + %3536 = fcmp uno float %3534, 0.000000e+00, !dbg !29 + %3537 = or i1 %3535, %3536, !dbg !30 + %3538 = select i1 %3537, float %3534, float %262, !dbg !31 + %3539 = fcmp ogt float %3538, %263, !dbg !27 + %3540 = fcmp uno float %3538, 0.000000e+00, !dbg !29 + %3541 = or i1 %3539, %3540, !dbg !30 + %3542 = select i1 %3541, float %3538, float %263, !dbg !31 + %3543 = fcmp ogt float %3542, %264, !dbg !27 + %3544 = fcmp uno float %3542, 0.000000e+00, !dbg !29 + %3545 = or i1 %3543, %3544, !dbg !30 + %3546 = select i1 %3545, float %3542, float %264, !dbg !31 + %3547 = fcmp ogt float %3546, %265, !dbg !27 + %3548 = fcmp uno float %3546, 0.000000e+00, !dbg !29 + %3549 = or i1 %3547, %3548, !dbg !30 + %3550 = select i1 %3549, float %3546, float %265, !dbg !31 + %3551 = fcmp ogt float %3550, %266, !dbg !27 + %3552 = fcmp uno float %3550, 0.000000e+00, !dbg !29 + %3553 = or i1 %3551, %3552, !dbg !30 + %3554 = select i1 %3553, float %3550, float %266, !dbg !31 + %3555 = fcmp ogt float %3554, %3325, !dbg !27 + %3556 = fcmp uno float %3554, 0.000000e+00, !dbg !29 + %3557 = or i1 %3555, %3556, !dbg !30 + %3558 = select i1 %3557, float %3554, float %3325, !dbg !31 + %3559 = fcmp ogt float %3558, %3326, !dbg !27 + %3560 = fcmp uno float %3558, 0.000000e+00, !dbg !29 + %3561 = or i1 %3559, %3560, !dbg !30 + %3562 = select i1 %3561, float %3558, float %3326, !dbg !31 + %3563 = fcmp ogt float %3562, %3327, !dbg !27 + %3564 = fcmp uno float %3562, 0.000000e+00, !dbg !29 + %3565 = or i1 %3563, %3564, !dbg !30 + %3566 = select i1 %3565, float %3562, float %3327, !dbg !31 + %3567 = fcmp ogt float %3566, %3328, !dbg !27 + %3568 = fcmp uno float %3566, 0.000000e+00, !dbg !29 + %3569 = or i1 %3567, %3568, !dbg !30 + %3570 = select i1 %3569, float %3566, float %3328, !dbg !31 + %3571 = fcmp ogt float %3570, %3329, !dbg !27 + %3572 = fcmp uno float %3570, 0.000000e+00, !dbg !29 + %3573 = or i1 %3571, %3572, !dbg !30 + %3574 = select i1 %3573, float %3570, float %3329, !dbg !31 + %3575 = fcmp ogt float %3574, %3330, !dbg !27 + %3576 = fcmp uno float %3574, 0.000000e+00, !dbg !29 + %3577 = or i1 %3575, %3576, !dbg !30 + %3578 = select i1 %3577, float %3574, float %3330, !dbg !31 + %3579 = fcmp ogt float %3578, %3331, !dbg !27 + %3580 = fcmp uno float %3578, 0.000000e+00, !dbg !29 + %3581 = or i1 %3579, %3580, !dbg !30 + %3582 = select i1 %3581, float %3578, float %3331, !dbg !31 + %3583 = fcmp ogt float %3582, %3332, !dbg !27 + %3584 = fcmp uno float %3582, 0.000000e+00, !dbg !29 + %3585 = or i1 %3583, %3584, !dbg !30 + %3586 = select i1 %3585, float %3582, float %3332, !dbg !31 + %3587 = bitcast float %3586 to i32, !dbg !32 + %3588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3587, i32 16, i32 31), !dbg !32 + %3589 = bitcast i32 %3588 to float, !dbg !32 + %3590 = fcmp ogt float %3586, %3589, !dbg !27 + %3591 = fcmp uno float %3586, 0.000000e+00, !dbg !29 + %3592 = or i1 %3591, %3590, !dbg !30 + %3593 = select i1 %3592, float %3586, float %3589, !dbg !31 + %3594 = bitcast float %3593 to i32, !dbg !32 + %3595 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3594, i32 8, i32 31), !dbg !32 + %3596 = bitcast i32 %3595 to float, !dbg !32 + %3597 = fcmp ogt float %3593, %3596, !dbg !27 + %3598 = fcmp uno float %3593, 0.000000e+00, !dbg !29 + %3599 = or i1 %3597, %3598, !dbg !30 + %3600 = select i1 %3599, float %3593, float %3596, !dbg !31 + %3601 = bitcast float %3600 to i32, !dbg !32 + %3602 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3601, i32 4, i32 31), !dbg !32 + %3603 = bitcast i32 %3602 to float, !dbg !32 + %3604 = fcmp ogt float %3600, %3603, !dbg !27 + %3605 = fcmp uno float %3600, 0.000000e+00, !dbg !29 + %3606 = or i1 %3604, %3605, !dbg !30 + %3607 = select i1 %3606, float %3600, float %3603, !dbg !31 + %3608 = bitcast float %3607 to i32, !dbg !32 + %3609 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3608, i32 2, i32 31), !dbg !32 + %3610 = bitcast i32 %3609 to float, !dbg !32 + %3611 = fcmp ogt float %3607, %3610, !dbg !27 + %3612 = fcmp uno float %3607, 0.000000e+00, !dbg !29 + %3613 = or i1 %3611, %3612, !dbg !30 + %3614 = select i1 %3613, float %3607, float %3610, !dbg !31 + %3615 = bitcast float %3614 to i32, !dbg !32 + %3616 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3615, i32 1, i32 31), !dbg !32 + %3617 = bitcast i32 %3616 to float, !dbg !32 + %3618 = fcmp ogt float %3614, %3617, !dbg !27 + %3619 = fcmp uno float %3614, 0.000000e+00, !dbg !29 + %3620 = or i1 %3618, %3619, !dbg !30 + %3621 = and i32 %10, 15, !dbg !32 + %3622 = icmp eq i32 %9, 0, !dbg !32 + %3623 = getelementptr float, ptr addrspace(3) @global_smem, i32 %3621, !dbg !32 + %3624 = select i1 %3620, i32 %3615, i32 %3616, !dbg !31 + %3625 = insertelement <1 x i32> poison, i32 %3624, i64 0, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3623, <1 x i32> %3625, i1 %3622) #6, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %3626 = icmp samesign ult i32 %8, 16, !dbg !32 + %3627 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !32 + %3628 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %3627, i1 %3626) #6, !dbg !32 + %3629 = bitcast i32 %3628 to float, !dbg !32 + %3630 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3628, i32 8, i32 31), !dbg !32 + %3631 = bitcast i32 %3630 to float, !dbg !32 + %3632 = fcmp ogt float %3629, %3631, !dbg !27 + %3633 = fcmp uno float %3629, 0.000000e+00, !dbg !29 + %3634 = or i1 %3633, %3632, !dbg !30 + %3635 = select i1 %3634, float %3629, float %3631, !dbg !31 + %3636 = bitcast float %3635 to i32, !dbg !32 + %3637 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3636, i32 4, i32 31), !dbg !32 + %3638 = bitcast i32 %3637 to float, !dbg !32 + %3639 = fcmp ogt float %3635, %3638, !dbg !27 + %3640 = fcmp uno float %3635, 0.000000e+00, !dbg !29 + %3641 = or i1 %3639, %3640, !dbg !30 + %3642 = select i1 %3641, float %3635, float %3638, !dbg !31 + %3643 = bitcast float %3642 to i32, !dbg !32 + %3644 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3643, i32 2, i32 31), !dbg !32 + %3645 = bitcast i32 %3644 to float, !dbg !32 + %3646 = fcmp ogt float %3642, %3645, !dbg !27 + %3647 = fcmp uno float %3642, 0.000000e+00, !dbg !29 + %3648 = or i1 %3646, %3647, !dbg !30 + %3649 = select i1 %3648, float %3642, float %3645, !dbg !31 + %3650 = bitcast float %3649 to i32, !dbg !32 + %3651 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3650, i32 1, i32 31), !dbg !32 + %3652 = bitcast i32 %3651 to float, !dbg !32 + %3653 = fcmp ogt float %3649, %3652, !dbg !27 + %3654 = fcmp uno float %3649, 0.000000e+00, !dbg !29 + %3655 = or i1 %3653, %3654, !dbg !30 + %3656 = icmp eq i32 %8, 0, !dbg !32 + %3657 = select i1 %3655, i32 %3650, i32 %3651, !dbg !31 + %3658 = insertelement <1 x i32> poison, i32 %3657, i64 0, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3627, <1 x i32> %3658, i1 %3656) #6, !dbg !32 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !32 + %3659 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !32 + %3660 = fcmp oeq float %3659, 0xFFF0000000000000, !dbg !33 + %3661 = fsub float %211, %3659, !dbg !34 + %3662 = fsub float %212, %3659, !dbg !34 + %3663 = fsub float %213, %3659, !dbg !34 + %3664 = fsub float %214, %3659, !dbg !34 + %3665 = fsub float %215, %3659, !dbg !34 + %3666 = fsub float %216, %3659, !dbg !34 + %3667 = fsub float %217, %3659, !dbg !34 + %3668 = fsub float %218, %3659, !dbg !34 + %3669 = fsub float %219, %3659, !dbg !34 + %3670 = fsub float %220, %3659, !dbg !34 + %3671 = fsub float %221, %3659, !dbg !34 + %3672 = fsub float %222, %3659, !dbg !34 + %3673 = fsub float %223, %3659, !dbg !34 + %3674 = fsub float %224, %3659, !dbg !34 + %3675 = fsub float %225, %3659, !dbg !34 + %3676 = fsub float %226, %3659, !dbg !34 + %3677 = fsub float %227, %3659, !dbg !34 + %3678 = fsub float %228, %3659, !dbg !34 + %3679 = fsub float %229, %3659, !dbg !34 + %3680 = fsub float %230, %3659, !dbg !34 + %3681 = fsub float %231, %3659, !dbg !34 + %3682 = fsub float %232, %3659, !dbg !34 + %3683 = fsub float %233, %3659, !dbg !34 + %3684 = fsub float %234, %3659, !dbg !34 + %3685 = fsub float %235, %3659, !dbg !34 + %3686 = fsub float %236, %3659, !dbg !34 + %3687 = fsub float %237, %3659, !dbg !34 + %3688 = fsub float %238, %3659, !dbg !34 + %3689 = fsub float %239, %3659, !dbg !34 + %3690 = fsub float %240, %3659, !dbg !34 + %3691 = fsub float %241, %3659, !dbg !34 + %3692 = fsub float %242, %3659, !dbg !34 + %3693 = fsub float %243, %3659, !dbg !34 + %3694 = fsub float %244, %3659, !dbg !34 + %3695 = fsub float %245, %3659, !dbg !34 + %3696 = fsub float %246, %3659, !dbg !34 + %3697 = fsub float %247, %3659, !dbg !34 + %3698 = fsub float %248, %3659, !dbg !34 + %3699 = fsub float %249, %3659, !dbg !34 + %3700 = fsub float %250, %3659, !dbg !34 + %3701 = fsub float %251, %3659, !dbg !34 + %3702 = fsub float %252, %3659, !dbg !34 + %3703 = fsub float %253, %3659, !dbg !34 + %3704 = fsub float %254, %3659, !dbg !34 + %3705 = fsub float %255, %3659, !dbg !34 + %3706 = fsub float %256, %3659, !dbg !34 + %3707 = fsub float %257, %3659, !dbg !34 + %3708 = fsub float %258, %3659, !dbg !34 + %3709 = fsub float %259, %3659, !dbg !34 + %3710 = fsub float %260, %3659, !dbg !34 + %3711 = fsub float %261, %3659, !dbg !34 + %3712 = fsub float %262, %3659, !dbg !34 + %3713 = fsub float %263, %3659, !dbg !34 + %3714 = fsub float %264, %3659, !dbg !34 + %3715 = fsub float %265, %3659, !dbg !34 + %3716 = fsub float %266, %3659, !dbg !34 + %3717 = fsub float %3325, %3659, !dbg !34 + %3718 = fsub float %3326, %3659, !dbg !34 + %3719 = fsub float %3327, %3659, !dbg !34 + %3720 = fsub float %3328, %3659, !dbg !34 + %3721 = fsub float %3329, %3659, !dbg !34 + %3722 = fsub float %3330, %3659, !dbg !34 + %3723 = fsub float %3331, %3659, !dbg !34 + %3724 = fsub float %3332, %3659, !dbg !34 + %3725 = select i1 %3660, float 0.000000e+00, float %3661, !dbg !35 + %3726 = select i1 %3660, float 0.000000e+00, float %3662, !dbg !35 + %3727 = select i1 %3660, float 0.000000e+00, float %3663, !dbg !35 + %3728 = select i1 %3660, float 0.000000e+00, float %3664, !dbg !35 + %3729 = select i1 %3660, float 0.000000e+00, float %3665, !dbg !35 + %3730 = select i1 %3660, float 0.000000e+00, float %3666, !dbg !35 + %3731 = select i1 %3660, float 0.000000e+00, float %3667, !dbg !35 + %3732 = select i1 %3660, float 0.000000e+00, float %3668, !dbg !35 + %3733 = select i1 %3660, float 0.000000e+00, float %3669, !dbg !35 + %3734 = select i1 %3660, float 0.000000e+00, float %3670, !dbg !35 + %3735 = select i1 %3660, float 0.000000e+00, float %3671, !dbg !35 + %3736 = select i1 %3660, float 0.000000e+00, float %3672, !dbg !35 + %3737 = select i1 %3660, float 0.000000e+00, float %3673, !dbg !35 + %3738 = select i1 %3660, float 0.000000e+00, float %3674, !dbg !35 + %3739 = select i1 %3660, float 0.000000e+00, float %3675, !dbg !35 + %3740 = select i1 %3660, float 0.000000e+00, float %3676, !dbg !35 + %3741 = select i1 %3660, float 0.000000e+00, float %3677, !dbg !35 + %3742 = select i1 %3660, float 0.000000e+00, float %3678, !dbg !35 + %3743 = select i1 %3660, float 0.000000e+00, float %3679, !dbg !35 + %3744 = select i1 %3660, float 0.000000e+00, float %3680, !dbg !35 + %3745 = select i1 %3660, float 0.000000e+00, float %3681, !dbg !35 + %3746 = select i1 %3660, float 0.000000e+00, float %3682, !dbg !35 + %3747 = select i1 %3660, float 0.000000e+00, float %3683, !dbg !35 + %3748 = select i1 %3660, float 0.000000e+00, float %3684, !dbg !35 + %3749 = select i1 %3660, float 0.000000e+00, float %3685, !dbg !35 + %3750 = select i1 %3660, float 0.000000e+00, float %3686, !dbg !35 + %3751 = select i1 %3660, float 0.000000e+00, float %3687, !dbg !35 + %3752 = select i1 %3660, float 0.000000e+00, float %3688, !dbg !35 + %3753 = select i1 %3660, float 0.000000e+00, float %3689, !dbg !35 + %3754 = select i1 %3660, float 0.000000e+00, float %3690, !dbg !35 + %3755 = select i1 %3660, float 0.000000e+00, float %3691, !dbg !35 + %3756 = select i1 %3660, float 0.000000e+00, float %3692, !dbg !35 + %3757 = select i1 %3660, float 0.000000e+00, float %3693, !dbg !35 + %3758 = select i1 %3660, float 0.000000e+00, float %3694, !dbg !35 + %3759 = select i1 %3660, float 0.000000e+00, float %3695, !dbg !35 + %3760 = select i1 %3660, float 0.000000e+00, float %3696, !dbg !35 + %3761 = select i1 %3660, float 0.000000e+00, float %3697, !dbg !35 + %3762 = select i1 %3660, float 0.000000e+00, float %3698, !dbg !35 + %3763 = select i1 %3660, float 0.000000e+00, float %3699, !dbg !35 + %3764 = select i1 %3660, float 0.000000e+00, float %3700, !dbg !35 + %3765 = select i1 %3660, float 0.000000e+00, float %3701, !dbg !35 + %3766 = select i1 %3660, float 0.000000e+00, float %3702, !dbg !35 + %3767 = select i1 %3660, float 0.000000e+00, float %3703, !dbg !35 + %3768 = select i1 %3660, float 0.000000e+00, float %3704, !dbg !35 + %3769 = select i1 %3660, float 0.000000e+00, float %3705, !dbg !35 + %3770 = select i1 %3660, float 0.000000e+00, float %3706, !dbg !35 + %3771 = select i1 %3660, float 0.000000e+00, float %3707, !dbg !35 + %3772 = select i1 %3660, float 0.000000e+00, float %3708, !dbg !35 + %3773 = select i1 %3660, float 0.000000e+00, float %3709, !dbg !35 + %3774 = select i1 %3660, float 0.000000e+00, float %3710, !dbg !35 + %3775 = select i1 %3660, float 0.000000e+00, float %3711, !dbg !35 + %3776 = select i1 %3660, float 0.000000e+00, float %3712, !dbg !35 + %3777 = select i1 %3660, float 0.000000e+00, float %3713, !dbg !35 + %3778 = select i1 %3660, float 0.000000e+00, float %3714, !dbg !35 + %3779 = select i1 %3660, float 0.000000e+00, float %3715, !dbg !35 + %3780 = select i1 %3660, float 0.000000e+00, float %3716, !dbg !35 + %3781 = select i1 %3660, float 0.000000e+00, float %3717, !dbg !35 + %3782 = select i1 %3660, float 0.000000e+00, float %3718, !dbg !35 + %3783 = select i1 %3660, float 0.000000e+00, float %3719, !dbg !35 + %3784 = select i1 %3660, float 0.000000e+00, float %3720, !dbg !35 + %3785 = select i1 %3660, float 0.000000e+00, float %3721, !dbg !35 + %3786 = select i1 %3660, float 0.000000e+00, float %3722, !dbg !35 + %3787 = select i1 %3660, float 0.000000e+00, float %3723, !dbg !35 + %3788 = select i1 %3660, float 0.000000e+00, float %3724, !dbg !35 + %3789 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1397 = icmp eq i32 %3789, 0, !dbg !36 + %3790 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3791 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1398 = select i1 %.not.i1397, float %3791, float %3790, !dbg !36 + %3792 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1399 = icmp eq i32 %3792, 0, !dbg !36 + %3793 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1398) #6, !dbg !36 + %3794 = tail call float @llvm.nvvm.saturate.f(float %.02.i1398) #6, !dbg !36 + %.03.i1400 = select i1 %.not1.i1399, float %3794, float %3793, !dbg !36 + %3795 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1401 = icmp eq i32 %3795, 0, !dbg !36 + %3796 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3797 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1402 = select i1 %.not2.i1401, float %3797, float %3796, !dbg !36 + %3798 = fadd float %.04.i1402, 0xC168000FE0000000, !dbg !36 + %3799 = fneg float %3798, !dbg !36 + %3800 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1403 = icmp eq i32 %3800, 0, !dbg !36 + %3801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3FF7154760000000, float %3799) #6, !dbg !36 + %3802 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3FF7154760000000, float %3799) #6, !dbg !36 + %.0.i1404 = select i1 %.not3.i1403, float %3802, float %3801, !dbg !36 + %3803 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1405 = icmp eq i32 %3803, 0, !dbg !36 + %3804 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3725, float 0x3E54AE0C00000000, float %.0.i1404) #6, !dbg !36 + %3805 = tail call float @llvm.nvvm.fma.rn.f(float %3725, float 0x3E54AE0C00000000, float %.0.i1404) #6, !dbg !36 + %.01.i1406 = select i1 %.not4.i1405, float %3805, float %3804, !dbg !36 + %3806 = bitcast float %.04.i1402 to i32, !dbg !36 + %3807 = shl i32 %3806, 23, !dbg !36 + %3808 = bitcast i32 %3807 to float, !dbg !36 + %3809 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1406) #6, !dbg !36 + %3810 = fmul float %3809, %3808, !dbg !36 + %3811 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1407 = icmp eq i32 %3811, 0, !dbg !36 + %3812 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3813 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1408 = select i1 %.not.i1407, float %3813, float %3812, !dbg !36 + %3814 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1409 = icmp eq i32 %3814, 0, !dbg !36 + %3815 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1408) #6, !dbg !36 + %3816 = tail call float @llvm.nvvm.saturate.f(float %.02.i1408) #6, !dbg !36 + %.03.i1410 = select i1 %.not1.i1409, float %3816, float %3815, !dbg !36 + %3817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1411 = icmp eq i32 %3817, 0, !dbg !36 + %3818 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3819 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1412 = select i1 %.not2.i1411, float %3819, float %3818, !dbg !36 + %3820 = fadd float %.04.i1412, 0xC168000FE0000000, !dbg !36 + %3821 = fneg float %3820, !dbg !36 + %3822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1413 = icmp eq i32 %3822, 0, !dbg !36 + %3823 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3FF7154760000000, float %3821) #6, !dbg !36 + %3824 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3FF7154760000000, float %3821) #6, !dbg !36 + %.0.i1414 = select i1 %.not3.i1413, float %3824, float %3823, !dbg !36 + %3825 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1415 = icmp eq i32 %3825, 0, !dbg !36 + %3826 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3726, float 0x3E54AE0C00000000, float %.0.i1414) #6, !dbg !36 + %3827 = tail call float @llvm.nvvm.fma.rn.f(float %3726, float 0x3E54AE0C00000000, float %.0.i1414) #6, !dbg !36 + %.01.i1416 = select i1 %.not4.i1415, float %3827, float %3826, !dbg !36 + %3828 = bitcast float %.04.i1412 to i32, !dbg !36 + %3829 = shl i32 %3828, 23, !dbg !36 + %3830 = bitcast i32 %3829 to float, !dbg !36 + %3831 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1416) #6, !dbg !36 + %3832 = fmul float %3831, %3830, !dbg !36 + %3833 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1417 = icmp eq i32 %3833, 0, !dbg !36 + %3834 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3835 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1418 = select i1 %.not.i1417, float %3835, float %3834, !dbg !36 + %3836 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1419 = icmp eq i32 %3836, 0, !dbg !36 + %3837 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1418) #6, !dbg !36 + %3838 = tail call float @llvm.nvvm.saturate.f(float %.02.i1418) #6, !dbg !36 + %.03.i1420 = select i1 %.not1.i1419, float %3838, float %3837, !dbg !36 + %3839 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1421 = icmp eq i32 %3839, 0, !dbg !36 + %3840 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3841 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1422 = select i1 %.not2.i1421, float %3841, float %3840, !dbg !36 + %3842 = fadd float %.04.i1422, 0xC168000FE0000000, !dbg !36 + %3843 = fneg float %3842, !dbg !36 + %3844 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1423 = icmp eq i32 %3844, 0, !dbg !36 + %3845 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3FF7154760000000, float %3843) #6, !dbg !36 + %3846 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3FF7154760000000, float %3843) #6, !dbg !36 + %.0.i1424 = select i1 %.not3.i1423, float %3846, float %3845, !dbg !36 + %3847 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1425 = icmp eq i32 %3847, 0, !dbg !36 + %3848 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3727, float 0x3E54AE0C00000000, float %.0.i1424) #6, !dbg !36 + %3849 = tail call float @llvm.nvvm.fma.rn.f(float %3727, float 0x3E54AE0C00000000, float %.0.i1424) #6, !dbg !36 + %.01.i1426 = select i1 %.not4.i1425, float %3849, float %3848, !dbg !36 + %3850 = bitcast float %.04.i1422 to i32, !dbg !36 + %3851 = shl i32 %3850, 23, !dbg !36 + %3852 = bitcast i32 %3851 to float, !dbg !36 + %3853 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1426) #6, !dbg !36 + %3854 = fmul float %3853, %3852, !dbg !36 + %3855 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1427 = icmp eq i32 %3855, 0, !dbg !36 + %3856 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3857 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1428 = select i1 %.not.i1427, float %3857, float %3856, !dbg !36 + %3858 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1429 = icmp eq i32 %3858, 0, !dbg !36 + %3859 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1428) #6, !dbg !36 + %3860 = tail call float @llvm.nvvm.saturate.f(float %.02.i1428) #6, !dbg !36 + %.03.i1430 = select i1 %.not1.i1429, float %3860, float %3859, !dbg !36 + %3861 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1431 = icmp eq i32 %3861, 0, !dbg !36 + %3862 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3863 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1432 = select i1 %.not2.i1431, float %3863, float %3862, !dbg !36 + %3864 = fadd float %.04.i1432, 0xC168000FE0000000, !dbg !36 + %3865 = fneg float %3864, !dbg !36 + %3866 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1433 = icmp eq i32 %3866, 0, !dbg !36 + %3867 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3FF7154760000000, float %3865) #6, !dbg !36 + %3868 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3FF7154760000000, float %3865) #6, !dbg !36 + %.0.i1434 = select i1 %.not3.i1433, float %3868, float %3867, !dbg !36 + %3869 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1435 = icmp eq i32 %3869, 0, !dbg !36 + %3870 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3728, float 0x3E54AE0C00000000, float %.0.i1434) #6, !dbg !36 + %3871 = tail call float @llvm.nvvm.fma.rn.f(float %3728, float 0x3E54AE0C00000000, float %.0.i1434) #6, !dbg !36 + %.01.i1436 = select i1 %.not4.i1435, float %3871, float %3870, !dbg !36 + %3872 = bitcast float %.04.i1432 to i32, !dbg !36 + %3873 = shl i32 %3872, 23, !dbg !36 + %3874 = bitcast i32 %3873 to float, !dbg !36 + %3875 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1436) #6, !dbg !36 + %3876 = fmul float %3875, %3874, !dbg !36 + %3877 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1437 = icmp eq i32 %3877, 0, !dbg !36 + %3878 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3879 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1438 = select i1 %.not.i1437, float %3879, float %3878, !dbg !36 + %3880 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1439 = icmp eq i32 %3880, 0, !dbg !36 + %3881 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1438) #6, !dbg !36 + %3882 = tail call float @llvm.nvvm.saturate.f(float %.02.i1438) #6, !dbg !36 + %.03.i1440 = select i1 %.not1.i1439, float %3882, float %3881, !dbg !36 + %3883 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1441 = icmp eq i32 %3883, 0, !dbg !36 + %3884 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3885 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1442 = select i1 %.not2.i1441, float %3885, float %3884, !dbg !36 + %3886 = fadd float %.04.i1442, 0xC168000FE0000000, !dbg !36 + %3887 = fneg float %3886, !dbg !36 + %3888 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1443 = icmp eq i32 %3888, 0, !dbg !36 + %3889 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3FF7154760000000, float %3887) #6, !dbg !36 + %3890 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3FF7154760000000, float %3887) #6, !dbg !36 + %.0.i1444 = select i1 %.not3.i1443, float %3890, float %3889, !dbg !36 + %3891 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1445 = icmp eq i32 %3891, 0, !dbg !36 + %3892 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3729, float 0x3E54AE0C00000000, float %.0.i1444) #6, !dbg !36 + %3893 = tail call float @llvm.nvvm.fma.rn.f(float %3729, float 0x3E54AE0C00000000, float %.0.i1444) #6, !dbg !36 + %.01.i1446 = select i1 %.not4.i1445, float %3893, float %3892, !dbg !36 + %3894 = bitcast float %.04.i1442 to i32, !dbg !36 + %3895 = shl i32 %3894, 23, !dbg !36 + %3896 = bitcast i32 %3895 to float, !dbg !36 + %3897 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1446) #6, !dbg !36 + %3898 = fmul float %3897, %3896, !dbg !36 + %3899 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1447 = icmp eq i32 %3899, 0, !dbg !36 + %3900 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3901 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1448 = select i1 %.not.i1447, float %3901, float %3900, !dbg !36 + %3902 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1449 = icmp eq i32 %3902, 0, !dbg !36 + %3903 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1448) #6, !dbg !36 + %3904 = tail call float @llvm.nvvm.saturate.f(float %.02.i1448) #6, !dbg !36 + %.03.i1450 = select i1 %.not1.i1449, float %3904, float %3903, !dbg !36 + %3905 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1451 = icmp eq i32 %3905, 0, !dbg !36 + %3906 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3907 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1452 = select i1 %.not2.i1451, float %3907, float %3906, !dbg !36 + %3908 = fadd float %.04.i1452, 0xC168000FE0000000, !dbg !36 + %3909 = fneg float %3908, !dbg !36 + %3910 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1453 = icmp eq i32 %3910, 0, !dbg !36 + %3911 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3FF7154760000000, float %3909) #6, !dbg !36 + %3912 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3FF7154760000000, float %3909) #6, !dbg !36 + %.0.i1454 = select i1 %.not3.i1453, float %3912, float %3911, !dbg !36 + %3913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1455 = icmp eq i32 %3913, 0, !dbg !36 + %3914 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3730, float 0x3E54AE0C00000000, float %.0.i1454) #6, !dbg !36 + %3915 = tail call float @llvm.nvvm.fma.rn.f(float %3730, float 0x3E54AE0C00000000, float %.0.i1454) #6, !dbg !36 + %.01.i1456 = select i1 %.not4.i1455, float %3915, float %3914, !dbg !36 + %3916 = bitcast float %.04.i1452 to i32, !dbg !36 + %3917 = shl i32 %3916, 23, !dbg !36 + %3918 = bitcast i32 %3917 to float, !dbg !36 + %3919 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1456) #6, !dbg !36 + %3920 = fmul float %3919, %3918, !dbg !36 + %3921 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1457 = icmp eq i32 %3921, 0, !dbg !36 + %3922 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3923 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1458 = select i1 %.not.i1457, float %3923, float %3922, !dbg !36 + %3924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1459 = icmp eq i32 %3924, 0, !dbg !36 + %3925 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1458) #6, !dbg !36 + %3926 = tail call float @llvm.nvvm.saturate.f(float %.02.i1458) #6, !dbg !36 + %.03.i1460 = select i1 %.not1.i1459, float %3926, float %3925, !dbg !36 + %3927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1461 = icmp eq i32 %3927, 0, !dbg !36 + %3928 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3929 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1462 = select i1 %.not2.i1461, float %3929, float %3928, !dbg !36 + %3930 = fadd float %.04.i1462, 0xC168000FE0000000, !dbg !36 + %3931 = fneg float %3930, !dbg !36 + %3932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1463 = icmp eq i32 %3932, 0, !dbg !36 + %3933 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3FF7154760000000, float %3931) #6, !dbg !36 + %3934 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3FF7154760000000, float %3931) #6, !dbg !36 + %.0.i1464 = select i1 %.not3.i1463, float %3934, float %3933, !dbg !36 + %3935 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1465 = icmp eq i32 %3935, 0, !dbg !36 + %3936 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3731, float 0x3E54AE0C00000000, float %.0.i1464) #6, !dbg !36 + %3937 = tail call float @llvm.nvvm.fma.rn.f(float %3731, float 0x3E54AE0C00000000, float %.0.i1464) #6, !dbg !36 + %.01.i1466 = select i1 %.not4.i1465, float %3937, float %3936, !dbg !36 + %3938 = bitcast float %.04.i1462 to i32, !dbg !36 + %3939 = shl i32 %3938, 23, !dbg !36 + %3940 = bitcast i32 %3939 to float, !dbg !36 + %3941 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1466) #6, !dbg !36 + %3942 = fmul float %3941, %3940, !dbg !36 + %3943 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1467 = icmp eq i32 %3943, 0, !dbg !36 + %3944 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3945 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1468 = select i1 %.not.i1467, float %3945, float %3944, !dbg !36 + %3946 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1469 = icmp eq i32 %3946, 0, !dbg !36 + %3947 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1468) #6, !dbg !36 + %3948 = tail call float @llvm.nvvm.saturate.f(float %.02.i1468) #6, !dbg !36 + %.03.i1470 = select i1 %.not1.i1469, float %3948, float %3947, !dbg !36 + %3949 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1471 = icmp eq i32 %3949, 0, !dbg !36 + %3950 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3951 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1472 = select i1 %.not2.i1471, float %3951, float %3950, !dbg !36 + %3952 = fadd float %.04.i1472, 0xC168000FE0000000, !dbg !36 + %3953 = fneg float %3952, !dbg !36 + %3954 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1473 = icmp eq i32 %3954, 0, !dbg !36 + %3955 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3FF7154760000000, float %3953) #6, !dbg !36 + %3956 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3FF7154760000000, float %3953) #6, !dbg !36 + %.0.i1474 = select i1 %.not3.i1473, float %3956, float %3955, !dbg !36 + %3957 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1475 = icmp eq i32 %3957, 0, !dbg !36 + %3958 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3732, float 0x3E54AE0C00000000, float %.0.i1474) #6, !dbg !36 + %3959 = tail call float @llvm.nvvm.fma.rn.f(float %3732, float 0x3E54AE0C00000000, float %.0.i1474) #6, !dbg !36 + %.01.i1476 = select i1 %.not4.i1475, float %3959, float %3958, !dbg !36 + %3960 = bitcast float %.04.i1472 to i32, !dbg !36 + %3961 = shl i32 %3960, 23, !dbg !36 + %3962 = bitcast i32 %3961 to float, !dbg !36 + %3963 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1476) #6, !dbg !36 + %3964 = fmul float %3963, %3962, !dbg !36 + %3965 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1477 = icmp eq i32 %3965, 0, !dbg !36 + %3966 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3967 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1478 = select i1 %.not.i1477, float %3967, float %3966, !dbg !36 + %3968 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1479 = icmp eq i32 %3968, 0, !dbg !36 + %3969 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1478) #6, !dbg !36 + %3970 = tail call float @llvm.nvvm.saturate.f(float %.02.i1478) #6, !dbg !36 + %.03.i1480 = select i1 %.not1.i1479, float %3970, float %3969, !dbg !36 + %3971 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1481 = icmp eq i32 %3971, 0, !dbg !36 + %3972 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3973 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1482 = select i1 %.not2.i1481, float %3973, float %3972, !dbg !36 + %3974 = fadd float %.04.i1482, 0xC168000FE0000000, !dbg !36 + %3975 = fneg float %3974, !dbg !36 + %3976 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1483 = icmp eq i32 %3976, 0, !dbg !36 + %3977 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3FF7154760000000, float %3975) #6, !dbg !36 + %3978 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3FF7154760000000, float %3975) #6, !dbg !36 + %.0.i1484 = select i1 %.not3.i1483, float %3978, float %3977, !dbg !36 + %3979 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1485 = icmp eq i32 %3979, 0, !dbg !36 + %3980 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3733, float 0x3E54AE0C00000000, float %.0.i1484) #6, !dbg !36 + %3981 = tail call float @llvm.nvvm.fma.rn.f(float %3733, float 0x3E54AE0C00000000, float %.0.i1484) #6, !dbg !36 + %.01.i1486 = select i1 %.not4.i1485, float %3981, float %3980, !dbg !36 + %3982 = bitcast float %.04.i1482 to i32, !dbg !36 + %3983 = shl i32 %3982, 23, !dbg !36 + %3984 = bitcast i32 %3983 to float, !dbg !36 + %3985 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1486) #6, !dbg !36 + %3986 = fmul float %3985, %3984, !dbg !36 + %3987 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1487 = icmp eq i32 %3987, 0, !dbg !36 + %3988 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %3989 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1488 = select i1 %.not.i1487, float %3989, float %3988, !dbg !36 + %3990 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1489 = icmp eq i32 %3990, 0, !dbg !36 + %3991 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1488) #6, !dbg !36 + %3992 = tail call float @llvm.nvvm.saturate.f(float %.02.i1488) #6, !dbg !36 + %.03.i1490 = select i1 %.not1.i1489, float %3992, float %3991, !dbg !36 + %3993 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1491 = icmp eq i32 %3993, 0, !dbg !36 + %3994 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %3995 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1492 = select i1 %.not2.i1491, float %3995, float %3994, !dbg !36 + %3996 = fadd float %.04.i1492, 0xC168000FE0000000, !dbg !36 + %3997 = fneg float %3996, !dbg !36 + %3998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1493 = icmp eq i32 %3998, 0, !dbg !36 + %3999 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3FF7154760000000, float %3997) #6, !dbg !36 + %4000 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3FF7154760000000, float %3997) #6, !dbg !36 + %.0.i1494 = select i1 %.not3.i1493, float %4000, float %3999, !dbg !36 + %4001 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1495 = icmp eq i32 %4001, 0, !dbg !36 + %4002 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3734, float 0x3E54AE0C00000000, float %.0.i1494) #6, !dbg !36 + %4003 = tail call float @llvm.nvvm.fma.rn.f(float %3734, float 0x3E54AE0C00000000, float %.0.i1494) #6, !dbg !36 + %.01.i1496 = select i1 %.not4.i1495, float %4003, float %4002, !dbg !36 + %4004 = bitcast float %.04.i1492 to i32, !dbg !36 + %4005 = shl i32 %4004, 23, !dbg !36 + %4006 = bitcast i32 %4005 to float, !dbg !36 + %4007 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1496) #6, !dbg !36 + %4008 = fmul float %4007, %4006, !dbg !36 + %4009 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1497 = icmp eq i32 %4009, 0, !dbg !36 + %4010 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4011 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1498 = select i1 %.not.i1497, float %4011, float %4010, !dbg !36 + %4012 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1499 = icmp eq i32 %4012, 0, !dbg !36 + %4013 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1498) #6, !dbg !36 + %4014 = tail call float @llvm.nvvm.saturate.f(float %.02.i1498) #6, !dbg !36 + %.03.i1500 = select i1 %.not1.i1499, float %4014, float %4013, !dbg !36 + %4015 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1501 = icmp eq i32 %4015, 0, !dbg !36 + %4016 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4017 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1502 = select i1 %.not2.i1501, float %4017, float %4016, !dbg !36 + %4018 = fadd float %.04.i1502, 0xC168000FE0000000, !dbg !36 + %4019 = fneg float %4018, !dbg !36 + %4020 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1503 = icmp eq i32 %4020, 0, !dbg !36 + %4021 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3FF7154760000000, float %4019) #6, !dbg !36 + %4022 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3FF7154760000000, float %4019) #6, !dbg !36 + %.0.i1504 = select i1 %.not3.i1503, float %4022, float %4021, !dbg !36 + %4023 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1505 = icmp eq i32 %4023, 0, !dbg !36 + %4024 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3735, float 0x3E54AE0C00000000, float %.0.i1504) #6, !dbg !36 + %4025 = tail call float @llvm.nvvm.fma.rn.f(float %3735, float 0x3E54AE0C00000000, float %.0.i1504) #6, !dbg !36 + %.01.i1506 = select i1 %.not4.i1505, float %4025, float %4024, !dbg !36 + %4026 = bitcast float %.04.i1502 to i32, !dbg !36 + %4027 = shl i32 %4026, 23, !dbg !36 + %4028 = bitcast i32 %4027 to float, !dbg !36 + %4029 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1506) #6, !dbg !36 + %4030 = fmul float %4029, %4028, !dbg !36 + %4031 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1507 = icmp eq i32 %4031, 0, !dbg !36 + %4032 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4033 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1508 = select i1 %.not.i1507, float %4033, float %4032, !dbg !36 + %4034 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1509 = icmp eq i32 %4034, 0, !dbg !36 + %4035 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1508) #6, !dbg !36 + %4036 = tail call float @llvm.nvvm.saturate.f(float %.02.i1508) #6, !dbg !36 + %.03.i1510 = select i1 %.not1.i1509, float %4036, float %4035, !dbg !36 + %4037 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1511 = icmp eq i32 %4037, 0, !dbg !36 + %4038 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4039 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1512 = select i1 %.not2.i1511, float %4039, float %4038, !dbg !36 + %4040 = fadd float %.04.i1512, 0xC168000FE0000000, !dbg !36 + %4041 = fneg float %4040, !dbg !36 + %4042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1513 = icmp eq i32 %4042, 0, !dbg !36 + %4043 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3FF7154760000000, float %4041) #6, !dbg !36 + %4044 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3FF7154760000000, float %4041) #6, !dbg !36 + %.0.i1514 = select i1 %.not3.i1513, float %4044, float %4043, !dbg !36 + %4045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1515 = icmp eq i32 %4045, 0, !dbg !36 + %4046 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3736, float 0x3E54AE0C00000000, float %.0.i1514) #6, !dbg !36 + %4047 = tail call float @llvm.nvvm.fma.rn.f(float %3736, float 0x3E54AE0C00000000, float %.0.i1514) #6, !dbg !36 + %.01.i1516 = select i1 %.not4.i1515, float %4047, float %4046, !dbg !36 + %4048 = bitcast float %.04.i1512 to i32, !dbg !36 + %4049 = shl i32 %4048, 23, !dbg !36 + %4050 = bitcast i32 %4049 to float, !dbg !36 + %4051 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1516) #6, !dbg !36 + %4052 = fmul float %4051, %4050, !dbg !36 + %4053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1517 = icmp eq i32 %4053, 0, !dbg !36 + %4054 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4055 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1518 = select i1 %.not.i1517, float %4055, float %4054, !dbg !36 + %4056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1519 = icmp eq i32 %4056, 0, !dbg !36 + %4057 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1518) #6, !dbg !36 + %4058 = tail call float @llvm.nvvm.saturate.f(float %.02.i1518) #6, !dbg !36 + %.03.i1520 = select i1 %.not1.i1519, float %4058, float %4057, !dbg !36 + %4059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1521 = icmp eq i32 %4059, 0, !dbg !36 + %4060 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4061 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1522 = select i1 %.not2.i1521, float %4061, float %4060, !dbg !36 + %4062 = fadd float %.04.i1522, 0xC168000FE0000000, !dbg !36 + %4063 = fneg float %4062, !dbg !36 + %4064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1523 = icmp eq i32 %4064, 0, !dbg !36 + %4065 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3FF7154760000000, float %4063) #6, !dbg !36 + %4066 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3FF7154760000000, float %4063) #6, !dbg !36 + %.0.i1524 = select i1 %.not3.i1523, float %4066, float %4065, !dbg !36 + %4067 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1525 = icmp eq i32 %4067, 0, !dbg !36 + %4068 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3737, float 0x3E54AE0C00000000, float %.0.i1524) #6, !dbg !36 + %4069 = tail call float @llvm.nvvm.fma.rn.f(float %3737, float 0x3E54AE0C00000000, float %.0.i1524) #6, !dbg !36 + %.01.i1526 = select i1 %.not4.i1525, float %4069, float %4068, !dbg !36 + %4070 = bitcast float %.04.i1522 to i32, !dbg !36 + %4071 = shl i32 %4070, 23, !dbg !36 + %4072 = bitcast i32 %4071 to float, !dbg !36 + %4073 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1526) #6, !dbg !36 + %4074 = fmul float %4073, %4072, !dbg !36 + %4075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1527 = icmp eq i32 %4075, 0, !dbg !36 + %4076 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4077 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1528 = select i1 %.not.i1527, float %4077, float %4076, !dbg !36 + %4078 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1529 = icmp eq i32 %4078, 0, !dbg !36 + %4079 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1528) #6, !dbg !36 + %4080 = tail call float @llvm.nvvm.saturate.f(float %.02.i1528) #6, !dbg !36 + %.03.i1530 = select i1 %.not1.i1529, float %4080, float %4079, !dbg !36 + %4081 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1531 = icmp eq i32 %4081, 0, !dbg !36 + %4082 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4083 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1532 = select i1 %.not2.i1531, float %4083, float %4082, !dbg !36 + %4084 = fadd float %.04.i1532, 0xC168000FE0000000, !dbg !36 + %4085 = fneg float %4084, !dbg !36 + %4086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1533 = icmp eq i32 %4086, 0, !dbg !36 + %4087 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3FF7154760000000, float %4085) #6, !dbg !36 + %4088 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3FF7154760000000, float %4085) #6, !dbg !36 + %.0.i1534 = select i1 %.not3.i1533, float %4088, float %4087, !dbg !36 + %4089 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1535 = icmp eq i32 %4089, 0, !dbg !36 + %4090 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3738, float 0x3E54AE0C00000000, float %.0.i1534) #6, !dbg !36 + %4091 = tail call float @llvm.nvvm.fma.rn.f(float %3738, float 0x3E54AE0C00000000, float %.0.i1534) #6, !dbg !36 + %.01.i1536 = select i1 %.not4.i1535, float %4091, float %4090, !dbg !36 + %4092 = bitcast float %.04.i1532 to i32, !dbg !36 + %4093 = shl i32 %4092, 23, !dbg !36 + %4094 = bitcast i32 %4093 to float, !dbg !36 + %4095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1536) #6, !dbg !36 + %4096 = fmul float %4095, %4094, !dbg !36 + %4097 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1537 = icmp eq i32 %4097, 0, !dbg !36 + %4098 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4099 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1538 = select i1 %.not.i1537, float %4099, float %4098, !dbg !36 + %4100 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1539 = icmp eq i32 %4100, 0, !dbg !36 + %4101 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1538) #6, !dbg !36 + %4102 = tail call float @llvm.nvvm.saturate.f(float %.02.i1538) #6, !dbg !36 + %.03.i1540 = select i1 %.not1.i1539, float %4102, float %4101, !dbg !36 + %4103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1541 = icmp eq i32 %4103, 0, !dbg !36 + %4104 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4105 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1542 = select i1 %.not2.i1541, float %4105, float %4104, !dbg !36 + %4106 = fadd float %.04.i1542, 0xC168000FE0000000, !dbg !36 + %4107 = fneg float %4106, !dbg !36 + %4108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1543 = icmp eq i32 %4108, 0, !dbg !36 + %4109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3FF7154760000000, float %4107) #6, !dbg !36 + %4110 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3FF7154760000000, float %4107) #6, !dbg !36 + %.0.i1544 = select i1 %.not3.i1543, float %4110, float %4109, !dbg !36 + %4111 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1545 = icmp eq i32 %4111, 0, !dbg !36 + %4112 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3739, float 0x3E54AE0C00000000, float %.0.i1544) #6, !dbg !36 + %4113 = tail call float @llvm.nvvm.fma.rn.f(float %3739, float 0x3E54AE0C00000000, float %.0.i1544) #6, !dbg !36 + %.01.i1546 = select i1 %.not4.i1545, float %4113, float %4112, !dbg !36 + %4114 = bitcast float %.04.i1542 to i32, !dbg !36 + %4115 = shl i32 %4114, 23, !dbg !36 + %4116 = bitcast i32 %4115 to float, !dbg !36 + %4117 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1546) #6, !dbg !36 + %4118 = fmul float %4117, %4116, !dbg !36 + %4119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1547 = icmp eq i32 %4119, 0, !dbg !36 + %4120 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4121 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1548 = select i1 %.not.i1547, float %4121, float %4120, !dbg !36 + %4122 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1549 = icmp eq i32 %4122, 0, !dbg !36 + %4123 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1548) #6, !dbg !36 + %4124 = tail call float @llvm.nvvm.saturate.f(float %.02.i1548) #6, !dbg !36 + %.03.i1550 = select i1 %.not1.i1549, float %4124, float %4123, !dbg !36 + %4125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1551 = icmp eq i32 %4125, 0, !dbg !36 + %4126 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4127 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1552 = select i1 %.not2.i1551, float %4127, float %4126, !dbg !36 + %4128 = fadd float %.04.i1552, 0xC168000FE0000000, !dbg !36 + %4129 = fneg float %4128, !dbg !36 + %4130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1553 = icmp eq i32 %4130, 0, !dbg !36 + %4131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3FF7154760000000, float %4129) #6, !dbg !36 + %4132 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3FF7154760000000, float %4129) #6, !dbg !36 + %.0.i1554 = select i1 %.not3.i1553, float %4132, float %4131, !dbg !36 + %4133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1555 = icmp eq i32 %4133, 0, !dbg !36 + %4134 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3740, float 0x3E54AE0C00000000, float %.0.i1554) #6, !dbg !36 + %4135 = tail call float @llvm.nvvm.fma.rn.f(float %3740, float 0x3E54AE0C00000000, float %.0.i1554) #6, !dbg !36 + %.01.i1556 = select i1 %.not4.i1555, float %4135, float %4134, !dbg !36 + %4136 = bitcast float %.04.i1552 to i32, !dbg !36 + %4137 = shl i32 %4136, 23, !dbg !36 + %4138 = bitcast i32 %4137 to float, !dbg !36 + %4139 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1556) #6, !dbg !36 + %4140 = fmul float %4139, %4138, !dbg !36 + %4141 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1557 = icmp eq i32 %4141, 0, !dbg !36 + %4142 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4143 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1558 = select i1 %.not.i1557, float %4143, float %4142, !dbg !36 + %4144 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1559 = icmp eq i32 %4144, 0, !dbg !36 + %4145 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1558) #6, !dbg !36 + %4146 = tail call float @llvm.nvvm.saturate.f(float %.02.i1558) #6, !dbg !36 + %.03.i1560 = select i1 %.not1.i1559, float %4146, float %4145, !dbg !36 + %4147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1561 = icmp eq i32 %4147, 0, !dbg !36 + %4148 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4149 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1562 = select i1 %.not2.i1561, float %4149, float %4148, !dbg !36 + %4150 = fadd float %.04.i1562, 0xC168000FE0000000, !dbg !36 + %4151 = fneg float %4150, !dbg !36 + %4152 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1563 = icmp eq i32 %4152, 0, !dbg !36 + %4153 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3FF7154760000000, float %4151) #6, !dbg !36 + %4154 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3FF7154760000000, float %4151) #6, !dbg !36 + %.0.i1564 = select i1 %.not3.i1563, float %4154, float %4153, !dbg !36 + %4155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1565 = icmp eq i32 %4155, 0, !dbg !36 + %4156 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3741, float 0x3E54AE0C00000000, float %.0.i1564) #6, !dbg !36 + %4157 = tail call float @llvm.nvvm.fma.rn.f(float %3741, float 0x3E54AE0C00000000, float %.0.i1564) #6, !dbg !36 + %.01.i1566 = select i1 %.not4.i1565, float %4157, float %4156, !dbg !36 + %4158 = bitcast float %.04.i1562 to i32, !dbg !36 + %4159 = shl i32 %4158, 23, !dbg !36 + %4160 = bitcast i32 %4159 to float, !dbg !36 + %4161 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1566) #6, !dbg !36 + %4162 = fmul float %4161, %4160, !dbg !36 + %4163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1567 = icmp eq i32 %4163, 0, !dbg !36 + %4164 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4165 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1568 = select i1 %.not.i1567, float %4165, float %4164, !dbg !36 + %4166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1569 = icmp eq i32 %4166, 0, !dbg !36 + %4167 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1568) #6, !dbg !36 + %4168 = tail call float @llvm.nvvm.saturate.f(float %.02.i1568) #6, !dbg !36 + %.03.i1570 = select i1 %.not1.i1569, float %4168, float %4167, !dbg !36 + %4169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1571 = icmp eq i32 %4169, 0, !dbg !36 + %4170 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4171 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1572 = select i1 %.not2.i1571, float %4171, float %4170, !dbg !36 + %4172 = fadd float %.04.i1572, 0xC168000FE0000000, !dbg !36 + %4173 = fneg float %4172, !dbg !36 + %4174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1573 = icmp eq i32 %4174, 0, !dbg !36 + %4175 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3FF7154760000000, float %4173) #6, !dbg !36 + %4176 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3FF7154760000000, float %4173) #6, !dbg !36 + %.0.i1574 = select i1 %.not3.i1573, float %4176, float %4175, !dbg !36 + %4177 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1575 = icmp eq i32 %4177, 0, !dbg !36 + %4178 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3742, float 0x3E54AE0C00000000, float %.0.i1574) #6, !dbg !36 + %4179 = tail call float @llvm.nvvm.fma.rn.f(float %3742, float 0x3E54AE0C00000000, float %.0.i1574) #6, !dbg !36 + %.01.i1576 = select i1 %.not4.i1575, float %4179, float %4178, !dbg !36 + %4180 = bitcast float %.04.i1572 to i32, !dbg !36 + %4181 = shl i32 %4180, 23, !dbg !36 + %4182 = bitcast i32 %4181 to float, !dbg !36 + %4183 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1576) #6, !dbg !36 + %4184 = fmul float %4183, %4182, !dbg !36 + %4185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1577 = icmp eq i32 %4185, 0, !dbg !36 + %4186 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4187 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1578 = select i1 %.not.i1577, float %4187, float %4186, !dbg !36 + %4188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1579 = icmp eq i32 %4188, 0, !dbg !36 + %4189 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1578) #6, !dbg !36 + %4190 = tail call float @llvm.nvvm.saturate.f(float %.02.i1578) #6, !dbg !36 + %.03.i1580 = select i1 %.not1.i1579, float %4190, float %4189, !dbg !36 + %4191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1581 = icmp eq i32 %4191, 0, !dbg !36 + %4192 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4193 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1582 = select i1 %.not2.i1581, float %4193, float %4192, !dbg !36 + %4194 = fadd float %.04.i1582, 0xC168000FE0000000, !dbg !36 + %4195 = fneg float %4194, !dbg !36 + %4196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1583 = icmp eq i32 %4196, 0, !dbg !36 + %4197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3FF7154760000000, float %4195) #6, !dbg !36 + %4198 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3FF7154760000000, float %4195) #6, !dbg !36 + %.0.i1584 = select i1 %.not3.i1583, float %4198, float %4197, !dbg !36 + %4199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1585 = icmp eq i32 %4199, 0, !dbg !36 + %4200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3743, float 0x3E54AE0C00000000, float %.0.i1584) #6, !dbg !36 + %4201 = tail call float @llvm.nvvm.fma.rn.f(float %3743, float 0x3E54AE0C00000000, float %.0.i1584) #6, !dbg !36 + %.01.i1586 = select i1 %.not4.i1585, float %4201, float %4200, !dbg !36 + %4202 = bitcast float %.04.i1582 to i32, !dbg !36 + %4203 = shl i32 %4202, 23, !dbg !36 + %4204 = bitcast i32 %4203 to float, !dbg !36 + %4205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1586) #6, !dbg !36 + %4206 = fmul float %4205, %4204, !dbg !36 + %4207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1587 = icmp eq i32 %4207, 0, !dbg !36 + %4208 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4209 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1588 = select i1 %.not.i1587, float %4209, float %4208, !dbg !36 + %4210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1589 = icmp eq i32 %4210, 0, !dbg !36 + %4211 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1588) #6, !dbg !36 + %4212 = tail call float @llvm.nvvm.saturate.f(float %.02.i1588) #6, !dbg !36 + %.03.i1590 = select i1 %.not1.i1589, float %4212, float %4211, !dbg !36 + %4213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1591 = icmp eq i32 %4213, 0, !dbg !36 + %4214 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4215 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1592 = select i1 %.not2.i1591, float %4215, float %4214, !dbg !36 + %4216 = fadd float %.04.i1592, 0xC168000FE0000000, !dbg !36 + %4217 = fneg float %4216, !dbg !36 + %4218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1593 = icmp eq i32 %4218, 0, !dbg !36 + %4219 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3FF7154760000000, float %4217) #6, !dbg !36 + %4220 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3FF7154760000000, float %4217) #6, !dbg !36 + %.0.i1594 = select i1 %.not3.i1593, float %4220, float %4219, !dbg !36 + %4221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1595 = icmp eq i32 %4221, 0, !dbg !36 + %4222 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3744, float 0x3E54AE0C00000000, float %.0.i1594) #6, !dbg !36 + %4223 = tail call float @llvm.nvvm.fma.rn.f(float %3744, float 0x3E54AE0C00000000, float %.0.i1594) #6, !dbg !36 + %.01.i1596 = select i1 %.not4.i1595, float %4223, float %4222, !dbg !36 + %4224 = bitcast float %.04.i1592 to i32, !dbg !36 + %4225 = shl i32 %4224, 23, !dbg !36 + %4226 = bitcast i32 %4225 to float, !dbg !36 + %4227 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1596) #6, !dbg !36 + %4228 = fmul float %4227, %4226, !dbg !36 + %4229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1597 = icmp eq i32 %4229, 0, !dbg !36 + %4230 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4231 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1598 = select i1 %.not.i1597, float %4231, float %4230, !dbg !36 + %4232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1599 = icmp eq i32 %4232, 0, !dbg !36 + %4233 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1598) #6, !dbg !36 + %4234 = tail call float @llvm.nvvm.saturate.f(float %.02.i1598) #6, !dbg !36 + %.03.i1600 = select i1 %.not1.i1599, float %4234, float %4233, !dbg !36 + %4235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1601 = icmp eq i32 %4235, 0, !dbg !36 + %4236 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4237 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1602 = select i1 %.not2.i1601, float %4237, float %4236, !dbg !36 + %4238 = fadd float %.04.i1602, 0xC168000FE0000000, !dbg !36 + %4239 = fneg float %4238, !dbg !36 + %4240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1603 = icmp eq i32 %4240, 0, !dbg !36 + %4241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3FF7154760000000, float %4239) #6, !dbg !36 + %4242 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3FF7154760000000, float %4239) #6, !dbg !36 + %.0.i1604 = select i1 %.not3.i1603, float %4242, float %4241, !dbg !36 + %4243 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1605 = icmp eq i32 %4243, 0, !dbg !36 + %4244 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3745, float 0x3E54AE0C00000000, float %.0.i1604) #6, !dbg !36 + %4245 = tail call float @llvm.nvvm.fma.rn.f(float %3745, float 0x3E54AE0C00000000, float %.0.i1604) #6, !dbg !36 + %.01.i1606 = select i1 %.not4.i1605, float %4245, float %4244, !dbg !36 + %4246 = bitcast float %.04.i1602 to i32, !dbg !36 + %4247 = shl i32 %4246, 23, !dbg !36 + %4248 = bitcast i32 %4247 to float, !dbg !36 + %4249 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1606) #6, !dbg !36 + %4250 = fmul float %4249, %4248, !dbg !36 + %4251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1607 = icmp eq i32 %4251, 0, !dbg !36 + %4252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4253 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1608 = select i1 %.not.i1607, float %4253, float %4252, !dbg !36 + %4254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1609 = icmp eq i32 %4254, 0, !dbg !36 + %4255 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1608) #6, !dbg !36 + %4256 = tail call float @llvm.nvvm.saturate.f(float %.02.i1608) #6, !dbg !36 + %.03.i1610 = select i1 %.not1.i1609, float %4256, float %4255, !dbg !36 + %4257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1611 = icmp eq i32 %4257, 0, !dbg !36 + %4258 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4259 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1612 = select i1 %.not2.i1611, float %4259, float %4258, !dbg !36 + %4260 = fadd float %.04.i1612, 0xC168000FE0000000, !dbg !36 + %4261 = fneg float %4260, !dbg !36 + %4262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1613 = icmp eq i32 %4262, 0, !dbg !36 + %4263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3FF7154760000000, float %4261) #6, !dbg !36 + %4264 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3FF7154760000000, float %4261) #6, !dbg !36 + %.0.i1614 = select i1 %.not3.i1613, float %4264, float %4263, !dbg !36 + %4265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1615 = icmp eq i32 %4265, 0, !dbg !36 + %4266 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3746, float 0x3E54AE0C00000000, float %.0.i1614) #6, !dbg !36 + %4267 = tail call float @llvm.nvvm.fma.rn.f(float %3746, float 0x3E54AE0C00000000, float %.0.i1614) #6, !dbg !36 + %.01.i1616 = select i1 %.not4.i1615, float %4267, float %4266, !dbg !36 + %4268 = bitcast float %.04.i1612 to i32, !dbg !36 + %4269 = shl i32 %4268, 23, !dbg !36 + %4270 = bitcast i32 %4269 to float, !dbg !36 + %4271 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1616) #6, !dbg !36 + %4272 = fmul float %4271, %4270, !dbg !36 + %4273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1617 = icmp eq i32 %4273, 0, !dbg !36 + %4274 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4275 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1618 = select i1 %.not.i1617, float %4275, float %4274, !dbg !36 + %4276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1619 = icmp eq i32 %4276, 0, !dbg !36 + %4277 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1618) #6, !dbg !36 + %4278 = tail call float @llvm.nvvm.saturate.f(float %.02.i1618) #6, !dbg !36 + %.03.i1620 = select i1 %.not1.i1619, float %4278, float %4277, !dbg !36 + %4279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1621 = icmp eq i32 %4279, 0, !dbg !36 + %4280 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4281 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1622 = select i1 %.not2.i1621, float %4281, float %4280, !dbg !36 + %4282 = fadd float %.04.i1622, 0xC168000FE0000000, !dbg !36 + %4283 = fneg float %4282, !dbg !36 + %4284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1623 = icmp eq i32 %4284, 0, !dbg !36 + %4285 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3FF7154760000000, float %4283) #6, !dbg !36 + %4286 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3FF7154760000000, float %4283) #6, !dbg !36 + %.0.i1624 = select i1 %.not3.i1623, float %4286, float %4285, !dbg !36 + %4287 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1625 = icmp eq i32 %4287, 0, !dbg !36 + %4288 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3747, float 0x3E54AE0C00000000, float %.0.i1624) #6, !dbg !36 + %4289 = tail call float @llvm.nvvm.fma.rn.f(float %3747, float 0x3E54AE0C00000000, float %.0.i1624) #6, !dbg !36 + %.01.i1626 = select i1 %.not4.i1625, float %4289, float %4288, !dbg !36 + %4290 = bitcast float %.04.i1622 to i32, !dbg !36 + %4291 = shl i32 %4290, 23, !dbg !36 + %4292 = bitcast i32 %4291 to float, !dbg !36 + %4293 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1626) #6, !dbg !36 + %4294 = fmul float %4293, %4292, !dbg !36 + %4295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1627 = icmp eq i32 %4295, 0, !dbg !36 + %4296 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4297 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1628 = select i1 %.not.i1627, float %4297, float %4296, !dbg !36 + %4298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1629 = icmp eq i32 %4298, 0, !dbg !36 + %4299 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1628) #6, !dbg !36 + %4300 = tail call float @llvm.nvvm.saturate.f(float %.02.i1628) #6, !dbg !36 + %.03.i1630 = select i1 %.not1.i1629, float %4300, float %4299, !dbg !36 + %4301 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1631 = icmp eq i32 %4301, 0, !dbg !36 + %4302 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4303 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1632 = select i1 %.not2.i1631, float %4303, float %4302, !dbg !36 + %4304 = fadd float %.04.i1632, 0xC168000FE0000000, !dbg !36 + %4305 = fneg float %4304, !dbg !36 + %4306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1633 = icmp eq i32 %4306, 0, !dbg !36 + %4307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3FF7154760000000, float %4305) #6, !dbg !36 + %4308 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3FF7154760000000, float %4305) #6, !dbg !36 + %.0.i1634 = select i1 %.not3.i1633, float %4308, float %4307, !dbg !36 + %4309 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1635 = icmp eq i32 %4309, 0, !dbg !36 + %4310 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3748, float 0x3E54AE0C00000000, float %.0.i1634) #6, !dbg !36 + %4311 = tail call float @llvm.nvvm.fma.rn.f(float %3748, float 0x3E54AE0C00000000, float %.0.i1634) #6, !dbg !36 + %.01.i1636 = select i1 %.not4.i1635, float %4311, float %4310, !dbg !36 + %4312 = bitcast float %.04.i1632 to i32, !dbg !36 + %4313 = shl i32 %4312, 23, !dbg !36 + %4314 = bitcast i32 %4313 to float, !dbg !36 + %4315 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1636) #6, !dbg !36 + %4316 = fmul float %4315, %4314, !dbg !36 + %4317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1637 = icmp eq i32 %4317, 0, !dbg !36 + %4318 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4319 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1638 = select i1 %.not.i1637, float %4319, float %4318, !dbg !36 + %4320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1639 = icmp eq i32 %4320, 0, !dbg !36 + %4321 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1638) #6, !dbg !36 + %4322 = tail call float @llvm.nvvm.saturate.f(float %.02.i1638) #6, !dbg !36 + %.03.i1640 = select i1 %.not1.i1639, float %4322, float %4321, !dbg !36 + %4323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1641 = icmp eq i32 %4323, 0, !dbg !36 + %4324 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4325 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1642 = select i1 %.not2.i1641, float %4325, float %4324, !dbg !36 + %4326 = fadd float %.04.i1642, 0xC168000FE0000000, !dbg !36 + %4327 = fneg float %4326, !dbg !36 + %4328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1643 = icmp eq i32 %4328, 0, !dbg !36 + %4329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3FF7154760000000, float %4327) #6, !dbg !36 + %4330 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3FF7154760000000, float %4327) #6, !dbg !36 + %.0.i1644 = select i1 %.not3.i1643, float %4330, float %4329, !dbg !36 + %4331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1645 = icmp eq i32 %4331, 0, !dbg !36 + %4332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3749, float 0x3E54AE0C00000000, float %.0.i1644) #6, !dbg !36 + %4333 = tail call float @llvm.nvvm.fma.rn.f(float %3749, float 0x3E54AE0C00000000, float %.0.i1644) #6, !dbg !36 + %.01.i1646 = select i1 %.not4.i1645, float %4333, float %4332, !dbg !36 + %4334 = bitcast float %.04.i1642 to i32, !dbg !36 + %4335 = shl i32 %4334, 23, !dbg !36 + %4336 = bitcast i32 %4335 to float, !dbg !36 + %4337 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1646) #6, !dbg !36 + %4338 = fmul float %4337, %4336, !dbg !36 + %4339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1647 = icmp eq i32 %4339, 0, !dbg !36 + %4340 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4341 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1648 = select i1 %.not.i1647, float %4341, float %4340, !dbg !36 + %4342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1649 = icmp eq i32 %4342, 0, !dbg !36 + %4343 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1648) #6, !dbg !36 + %4344 = tail call float @llvm.nvvm.saturate.f(float %.02.i1648) #6, !dbg !36 + %.03.i1650 = select i1 %.not1.i1649, float %4344, float %4343, !dbg !36 + %4345 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1651 = icmp eq i32 %4345, 0, !dbg !36 + %4346 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4347 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1652 = select i1 %.not2.i1651, float %4347, float %4346, !dbg !36 + %4348 = fadd float %.04.i1652, 0xC168000FE0000000, !dbg !36 + %4349 = fneg float %4348, !dbg !36 + %4350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1653 = icmp eq i32 %4350, 0, !dbg !36 + %4351 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3FF7154760000000, float %4349) #6, !dbg !36 + %4352 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3FF7154760000000, float %4349) #6, !dbg !36 + %.0.i1654 = select i1 %.not3.i1653, float %4352, float %4351, !dbg !36 + %4353 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1655 = icmp eq i32 %4353, 0, !dbg !36 + %4354 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3750, float 0x3E54AE0C00000000, float %.0.i1654) #6, !dbg !36 + %4355 = tail call float @llvm.nvvm.fma.rn.f(float %3750, float 0x3E54AE0C00000000, float %.0.i1654) #6, !dbg !36 + %.01.i1656 = select i1 %.not4.i1655, float %4355, float %4354, !dbg !36 + %4356 = bitcast float %.04.i1652 to i32, !dbg !36 + %4357 = shl i32 %4356, 23, !dbg !36 + %4358 = bitcast i32 %4357 to float, !dbg !36 + %4359 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1656) #6, !dbg !36 + %4360 = fmul float %4359, %4358, !dbg !36 + %4361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1657 = icmp eq i32 %4361, 0, !dbg !36 + %4362 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4363 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1658 = select i1 %.not.i1657, float %4363, float %4362, !dbg !36 + %4364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1659 = icmp eq i32 %4364, 0, !dbg !36 + %4365 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1658) #6, !dbg !36 + %4366 = tail call float @llvm.nvvm.saturate.f(float %.02.i1658) #6, !dbg !36 + %.03.i1660 = select i1 %.not1.i1659, float %4366, float %4365, !dbg !36 + %4367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1661 = icmp eq i32 %4367, 0, !dbg !36 + %4368 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4369 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1662 = select i1 %.not2.i1661, float %4369, float %4368, !dbg !36 + %4370 = fadd float %.04.i1662, 0xC168000FE0000000, !dbg !36 + %4371 = fneg float %4370, !dbg !36 + %4372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1663 = icmp eq i32 %4372, 0, !dbg !36 + %4373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3FF7154760000000, float %4371) #6, !dbg !36 + %4374 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3FF7154760000000, float %4371) #6, !dbg !36 + %.0.i1664 = select i1 %.not3.i1663, float %4374, float %4373, !dbg !36 + %4375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1665 = icmp eq i32 %4375, 0, !dbg !36 + %4376 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3751, float 0x3E54AE0C00000000, float %.0.i1664) #6, !dbg !36 + %4377 = tail call float @llvm.nvvm.fma.rn.f(float %3751, float 0x3E54AE0C00000000, float %.0.i1664) #6, !dbg !36 + %.01.i1666 = select i1 %.not4.i1665, float %4377, float %4376, !dbg !36 + %4378 = bitcast float %.04.i1662 to i32, !dbg !36 + %4379 = shl i32 %4378, 23, !dbg !36 + %4380 = bitcast i32 %4379 to float, !dbg !36 + %4381 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1666) #6, !dbg !36 + %4382 = fmul float %4381, %4380, !dbg !36 + %4383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1667 = icmp eq i32 %4383, 0, !dbg !36 + %4384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4385 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1668 = select i1 %.not.i1667, float %4385, float %4384, !dbg !36 + %4386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1669 = icmp eq i32 %4386, 0, !dbg !36 + %4387 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1668) #6, !dbg !36 + %4388 = tail call float @llvm.nvvm.saturate.f(float %.02.i1668) #6, !dbg !36 + %.03.i1670 = select i1 %.not1.i1669, float %4388, float %4387, !dbg !36 + %4389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1671 = icmp eq i32 %4389, 0, !dbg !36 + %4390 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4391 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1672 = select i1 %.not2.i1671, float %4391, float %4390, !dbg !36 + %4392 = fadd float %.04.i1672, 0xC168000FE0000000, !dbg !36 + %4393 = fneg float %4392, !dbg !36 + %4394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1673 = icmp eq i32 %4394, 0, !dbg !36 + %4395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3FF7154760000000, float %4393) #6, !dbg !36 + %4396 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3FF7154760000000, float %4393) #6, !dbg !36 + %.0.i1674 = select i1 %.not3.i1673, float %4396, float %4395, !dbg !36 + %4397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1675 = icmp eq i32 %4397, 0, !dbg !36 + %4398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3752, float 0x3E54AE0C00000000, float %.0.i1674) #6, !dbg !36 + %4399 = tail call float @llvm.nvvm.fma.rn.f(float %3752, float 0x3E54AE0C00000000, float %.0.i1674) #6, !dbg !36 + %.01.i1676 = select i1 %.not4.i1675, float %4399, float %4398, !dbg !36 + %4400 = bitcast float %.04.i1672 to i32, !dbg !36 + %4401 = shl i32 %4400, 23, !dbg !36 + %4402 = bitcast i32 %4401 to float, !dbg !36 + %4403 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1676) #6, !dbg !36 + %4404 = fmul float %4403, %4402, !dbg !36 + %4405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1677 = icmp eq i32 %4405, 0, !dbg !36 + %4406 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4407 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1678 = select i1 %.not.i1677, float %4407, float %4406, !dbg !36 + %4408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1679 = icmp eq i32 %4408, 0, !dbg !36 + %4409 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1678) #6, !dbg !36 + %4410 = tail call float @llvm.nvvm.saturate.f(float %.02.i1678) #6, !dbg !36 + %.03.i1680 = select i1 %.not1.i1679, float %4410, float %4409, !dbg !36 + %4411 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1681 = icmp eq i32 %4411, 0, !dbg !36 + %4412 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4413 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1680, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1682 = select i1 %.not2.i1681, float %4413, float %4412, !dbg !36 + %4414 = fadd float %.04.i1682, 0xC168000FE0000000, !dbg !36 + %4415 = fneg float %4414, !dbg !36 + %4416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1683 = icmp eq i32 %4416, 0, !dbg !36 + %4417 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3FF7154760000000, float %4415) #6, !dbg !36 + %4418 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3FF7154760000000, float %4415) #6, !dbg !36 + %.0.i1684 = select i1 %.not3.i1683, float %4418, float %4417, !dbg !36 + %4419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1685 = icmp eq i32 %4419, 0, !dbg !36 + %4420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3753, float 0x3E54AE0C00000000, float %.0.i1684) #6, !dbg !36 + %4421 = tail call float @llvm.nvvm.fma.rn.f(float %3753, float 0x3E54AE0C00000000, float %.0.i1684) #6, !dbg !36 + %.01.i1686 = select i1 %.not4.i1685, float %4421, float %4420, !dbg !36 + %4422 = bitcast float %.04.i1682 to i32, !dbg !36 + %4423 = shl i32 %4422, 23, !dbg !36 + %4424 = bitcast i32 %4423 to float, !dbg !36 + %4425 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1686) #6, !dbg !36 + %4426 = fmul float %4425, %4424, !dbg !36 + %4427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1687 = icmp eq i32 %4427, 0, !dbg !36 + %4428 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4429 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1688 = select i1 %.not.i1687, float %4429, float %4428, !dbg !36 + %4430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1689 = icmp eq i32 %4430, 0, !dbg !36 + %4431 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1688) #6, !dbg !36 + %4432 = tail call float @llvm.nvvm.saturate.f(float %.02.i1688) #6, !dbg !36 + %.03.i1690 = select i1 %.not1.i1689, float %4432, float %4431, !dbg !36 + %4433 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1691 = icmp eq i32 %4433, 0, !dbg !36 + %4434 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4435 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1690, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1692 = select i1 %.not2.i1691, float %4435, float %4434, !dbg !36 + %4436 = fadd float %.04.i1692, 0xC168000FE0000000, !dbg !36 + %4437 = fneg float %4436, !dbg !36 + %4438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1693 = icmp eq i32 %4438, 0, !dbg !36 + %4439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3FF7154760000000, float %4437) #6, !dbg !36 + %4440 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3FF7154760000000, float %4437) #6, !dbg !36 + %.0.i1694 = select i1 %.not3.i1693, float %4440, float %4439, !dbg !36 + %4441 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1695 = icmp eq i32 %4441, 0, !dbg !36 + %4442 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3754, float 0x3E54AE0C00000000, float %.0.i1694) #6, !dbg !36 + %4443 = tail call float @llvm.nvvm.fma.rn.f(float %3754, float 0x3E54AE0C00000000, float %.0.i1694) #6, !dbg !36 + %.01.i1696 = select i1 %.not4.i1695, float %4443, float %4442, !dbg !36 + %4444 = bitcast float %.04.i1692 to i32, !dbg !36 + %4445 = shl i32 %4444, 23, !dbg !36 + %4446 = bitcast i32 %4445 to float, !dbg !36 + %4447 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1696) #6, !dbg !36 + %4448 = fmul float %4447, %4446, !dbg !36 + %4449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1697 = icmp eq i32 %4449, 0, !dbg !36 + %4450 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4451 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1698 = select i1 %.not.i1697, float %4451, float %4450, !dbg !36 + %4452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1699 = icmp eq i32 %4452, 0, !dbg !36 + %4453 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1698) #6, !dbg !36 + %4454 = tail call float @llvm.nvvm.saturate.f(float %.02.i1698) #6, !dbg !36 + %.03.i1700 = select i1 %.not1.i1699, float %4454, float %4453, !dbg !36 + %4455 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1701 = icmp eq i32 %4455, 0, !dbg !36 + %4456 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4457 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1700, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1702 = select i1 %.not2.i1701, float %4457, float %4456, !dbg !36 + %4458 = fadd float %.04.i1702, 0xC168000FE0000000, !dbg !36 + %4459 = fneg float %4458, !dbg !36 + %4460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1703 = icmp eq i32 %4460, 0, !dbg !36 + %4461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3FF7154760000000, float %4459) #6, !dbg !36 + %4462 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3FF7154760000000, float %4459) #6, !dbg !36 + %.0.i1704 = select i1 %.not3.i1703, float %4462, float %4461, !dbg !36 + %4463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1705 = icmp eq i32 %4463, 0, !dbg !36 + %4464 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3755, float 0x3E54AE0C00000000, float %.0.i1704) #6, !dbg !36 + %4465 = tail call float @llvm.nvvm.fma.rn.f(float %3755, float 0x3E54AE0C00000000, float %.0.i1704) #6, !dbg !36 + %.01.i1706 = select i1 %.not4.i1705, float %4465, float %4464, !dbg !36 + %4466 = bitcast float %.04.i1702 to i32, !dbg !36 + %4467 = shl i32 %4466, 23, !dbg !36 + %4468 = bitcast i32 %4467 to float, !dbg !36 + %4469 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1706) #6, !dbg !36 + %4470 = fmul float %4469, %4468, !dbg !36 + %4471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1707 = icmp eq i32 %4471, 0, !dbg !36 + %4472 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4473 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1708 = select i1 %.not.i1707, float %4473, float %4472, !dbg !36 + %4474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1709 = icmp eq i32 %4474, 0, !dbg !36 + %4475 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1708) #6, !dbg !36 + %4476 = tail call float @llvm.nvvm.saturate.f(float %.02.i1708) #6, !dbg !36 + %.03.i1710 = select i1 %.not1.i1709, float %4476, float %4475, !dbg !36 + %4477 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1711 = icmp eq i32 %4477, 0, !dbg !36 + %4478 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4479 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1710, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1712 = select i1 %.not2.i1711, float %4479, float %4478, !dbg !36 + %4480 = fadd float %.04.i1712, 0xC168000FE0000000, !dbg !36 + %4481 = fneg float %4480, !dbg !36 + %4482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1713 = icmp eq i32 %4482, 0, !dbg !36 + %4483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3FF7154760000000, float %4481) #6, !dbg !36 + %4484 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3FF7154760000000, float %4481) #6, !dbg !36 + %.0.i1714 = select i1 %.not3.i1713, float %4484, float %4483, !dbg !36 + %4485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1715 = icmp eq i32 %4485, 0, !dbg !36 + %4486 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3756, float 0x3E54AE0C00000000, float %.0.i1714) #6, !dbg !36 + %4487 = tail call float @llvm.nvvm.fma.rn.f(float %3756, float 0x3E54AE0C00000000, float %.0.i1714) #6, !dbg !36 + %.01.i1716 = select i1 %.not4.i1715, float %4487, float %4486, !dbg !36 + %4488 = bitcast float %.04.i1712 to i32, !dbg !36 + %4489 = shl i32 %4488, 23, !dbg !36 + %4490 = bitcast i32 %4489 to float, !dbg !36 + %4491 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1716) #6, !dbg !36 + %4492 = fmul float %4491, %4490, !dbg !36 + %4493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1717 = icmp eq i32 %4493, 0, !dbg !36 + %4494 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4495 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1718 = select i1 %.not.i1717, float %4495, float %4494, !dbg !36 + %4496 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1719 = icmp eq i32 %4496, 0, !dbg !36 + %4497 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1718) #6, !dbg !36 + %4498 = tail call float @llvm.nvvm.saturate.f(float %.02.i1718) #6, !dbg !36 + %.03.i1720 = select i1 %.not1.i1719, float %4498, float %4497, !dbg !36 + %4499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1721 = icmp eq i32 %4499, 0, !dbg !36 + %4500 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4501 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1720, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1722 = select i1 %.not2.i1721, float %4501, float %4500, !dbg !36 + %4502 = fadd float %.04.i1722, 0xC168000FE0000000, !dbg !36 + %4503 = fneg float %4502, !dbg !36 + %4504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1723 = icmp eq i32 %4504, 0, !dbg !36 + %4505 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3FF7154760000000, float %4503) #6, !dbg !36 + %4506 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3FF7154760000000, float %4503) #6, !dbg !36 + %.0.i1724 = select i1 %.not3.i1723, float %4506, float %4505, !dbg !36 + %4507 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1725 = icmp eq i32 %4507, 0, !dbg !36 + %4508 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3757, float 0x3E54AE0C00000000, float %.0.i1724) #6, !dbg !36 + %4509 = tail call float @llvm.nvvm.fma.rn.f(float %3757, float 0x3E54AE0C00000000, float %.0.i1724) #6, !dbg !36 + %.01.i1726 = select i1 %.not4.i1725, float %4509, float %4508, !dbg !36 + %4510 = bitcast float %.04.i1722 to i32, !dbg !36 + %4511 = shl i32 %4510, 23, !dbg !36 + %4512 = bitcast i32 %4511 to float, !dbg !36 + %4513 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1726) #6, !dbg !36 + %4514 = fmul float %4513, %4512, !dbg !36 + %4515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1727 = icmp eq i32 %4515, 0, !dbg !36 + %4516 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4517 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1728 = select i1 %.not.i1727, float %4517, float %4516, !dbg !36 + %4518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1729 = icmp eq i32 %4518, 0, !dbg !36 + %4519 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1728) #6, !dbg !36 + %4520 = tail call float @llvm.nvvm.saturate.f(float %.02.i1728) #6, !dbg !36 + %.03.i1730 = select i1 %.not1.i1729, float %4520, float %4519, !dbg !36 + %4521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1731 = icmp eq i32 %4521, 0, !dbg !36 + %4522 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4523 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1730, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1732 = select i1 %.not2.i1731, float %4523, float %4522, !dbg !36 + %4524 = fadd float %.04.i1732, 0xC168000FE0000000, !dbg !36 + %4525 = fneg float %4524, !dbg !36 + %4526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1733 = icmp eq i32 %4526, 0, !dbg !36 + %4527 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3FF7154760000000, float %4525) #6, !dbg !36 + %4528 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3FF7154760000000, float %4525) #6, !dbg !36 + %.0.i1734 = select i1 %.not3.i1733, float %4528, float %4527, !dbg !36 + %4529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1735 = icmp eq i32 %4529, 0, !dbg !36 + %4530 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3758, float 0x3E54AE0C00000000, float %.0.i1734) #6, !dbg !36 + %4531 = tail call float @llvm.nvvm.fma.rn.f(float %3758, float 0x3E54AE0C00000000, float %.0.i1734) #6, !dbg !36 + %.01.i1736 = select i1 %.not4.i1735, float %4531, float %4530, !dbg !36 + %4532 = bitcast float %.04.i1732 to i32, !dbg !36 + %4533 = shl i32 %4532, 23, !dbg !36 + %4534 = bitcast i32 %4533 to float, !dbg !36 + %4535 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1736) #6, !dbg !36 + %4536 = fmul float %4535, %4534, !dbg !36 + %4537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1737 = icmp eq i32 %4537, 0, !dbg !36 + %4538 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4539 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1738 = select i1 %.not.i1737, float %4539, float %4538, !dbg !36 + %4540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1739 = icmp eq i32 %4540, 0, !dbg !36 + %4541 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1738) #6, !dbg !36 + %4542 = tail call float @llvm.nvvm.saturate.f(float %.02.i1738) #6, !dbg !36 + %.03.i1740 = select i1 %.not1.i1739, float %4542, float %4541, !dbg !36 + %4543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1741 = icmp eq i32 %4543, 0, !dbg !36 + %4544 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4545 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1740, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1742 = select i1 %.not2.i1741, float %4545, float %4544, !dbg !36 + %4546 = fadd float %.04.i1742, 0xC168000FE0000000, !dbg !36 + %4547 = fneg float %4546, !dbg !36 + %4548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1743 = icmp eq i32 %4548, 0, !dbg !36 + %4549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3FF7154760000000, float %4547) #6, !dbg !36 + %4550 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3FF7154760000000, float %4547) #6, !dbg !36 + %.0.i1744 = select i1 %.not3.i1743, float %4550, float %4549, !dbg !36 + %4551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1745 = icmp eq i32 %4551, 0, !dbg !36 + %4552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3759, float 0x3E54AE0C00000000, float %.0.i1744) #6, !dbg !36 + %4553 = tail call float @llvm.nvvm.fma.rn.f(float %3759, float 0x3E54AE0C00000000, float %.0.i1744) #6, !dbg !36 + %.01.i1746 = select i1 %.not4.i1745, float %4553, float %4552, !dbg !36 + %4554 = bitcast float %.04.i1742 to i32, !dbg !36 + %4555 = shl i32 %4554, 23, !dbg !36 + %4556 = bitcast i32 %4555 to float, !dbg !36 + %4557 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1746) #6, !dbg !36 + %4558 = fmul float %4557, %4556, !dbg !36 + %4559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1747 = icmp eq i32 %4559, 0, !dbg !36 + %4560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4561 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1748 = select i1 %.not.i1747, float %4561, float %4560, !dbg !36 + %4562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1749 = icmp eq i32 %4562, 0, !dbg !36 + %4563 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1748) #6, !dbg !36 + %4564 = tail call float @llvm.nvvm.saturate.f(float %.02.i1748) #6, !dbg !36 + %.03.i1750 = select i1 %.not1.i1749, float %4564, float %4563, !dbg !36 + %4565 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1751 = icmp eq i32 %4565, 0, !dbg !36 + %4566 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4567 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1750, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1752 = select i1 %.not2.i1751, float %4567, float %4566, !dbg !36 + %4568 = fadd float %.04.i1752, 0xC168000FE0000000, !dbg !36 + %4569 = fneg float %4568, !dbg !36 + %4570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1753 = icmp eq i32 %4570, 0, !dbg !36 + %4571 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3FF7154760000000, float %4569) #6, !dbg !36 + %4572 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3FF7154760000000, float %4569) #6, !dbg !36 + %.0.i1754 = select i1 %.not3.i1753, float %4572, float %4571, !dbg !36 + %4573 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1755 = icmp eq i32 %4573, 0, !dbg !36 + %4574 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3760, float 0x3E54AE0C00000000, float %.0.i1754) #6, !dbg !36 + %4575 = tail call float @llvm.nvvm.fma.rn.f(float %3760, float 0x3E54AE0C00000000, float %.0.i1754) #6, !dbg !36 + %.01.i1756 = select i1 %.not4.i1755, float %4575, float %4574, !dbg !36 + %4576 = bitcast float %.04.i1752 to i32, !dbg !36 + %4577 = shl i32 %4576, 23, !dbg !36 + %4578 = bitcast i32 %4577 to float, !dbg !36 + %4579 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1756) #6, !dbg !36 + %4580 = fmul float %4579, %4578, !dbg !36 + %4581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1757 = icmp eq i32 %4581, 0, !dbg !36 + %4582 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4583 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1758 = select i1 %.not.i1757, float %4583, float %4582, !dbg !36 + %4584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1759 = icmp eq i32 %4584, 0, !dbg !36 + %4585 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1758) #6, !dbg !36 + %4586 = tail call float @llvm.nvvm.saturate.f(float %.02.i1758) #6, !dbg !36 + %.03.i1760 = select i1 %.not1.i1759, float %4586, float %4585, !dbg !36 + %4587 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1761 = icmp eq i32 %4587, 0, !dbg !36 + %4588 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4589 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1760, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1762 = select i1 %.not2.i1761, float %4589, float %4588, !dbg !36 + %4590 = fadd float %.04.i1762, 0xC168000FE0000000, !dbg !36 + %4591 = fneg float %4590, !dbg !36 + %4592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1763 = icmp eq i32 %4592, 0, !dbg !36 + %4593 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3FF7154760000000, float %4591) #6, !dbg !36 + %4594 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3FF7154760000000, float %4591) #6, !dbg !36 + %.0.i1764 = select i1 %.not3.i1763, float %4594, float %4593, !dbg !36 + %4595 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1765 = icmp eq i32 %4595, 0, !dbg !36 + %4596 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3761, float 0x3E54AE0C00000000, float %.0.i1764) #6, !dbg !36 + %4597 = tail call float @llvm.nvvm.fma.rn.f(float %3761, float 0x3E54AE0C00000000, float %.0.i1764) #6, !dbg !36 + %.01.i1766 = select i1 %.not4.i1765, float %4597, float %4596, !dbg !36 + %4598 = bitcast float %.04.i1762 to i32, !dbg !36 + %4599 = shl i32 %4598, 23, !dbg !36 + %4600 = bitcast i32 %4599 to float, !dbg !36 + %4601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1766) #6, !dbg !36 + %4602 = fmul float %4601, %4600, !dbg !36 + %4603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1767 = icmp eq i32 %4603, 0, !dbg !36 + %4604 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4605 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1768 = select i1 %.not.i1767, float %4605, float %4604, !dbg !36 + %4606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1769 = icmp eq i32 %4606, 0, !dbg !36 + %4607 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1768) #6, !dbg !36 + %4608 = tail call float @llvm.nvvm.saturate.f(float %.02.i1768) #6, !dbg !36 + %.03.i1770 = select i1 %.not1.i1769, float %4608, float %4607, !dbg !36 + %4609 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1771 = icmp eq i32 %4609, 0, !dbg !36 + %4610 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4611 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1770, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1772 = select i1 %.not2.i1771, float %4611, float %4610, !dbg !36 + %4612 = fadd float %.04.i1772, 0xC168000FE0000000, !dbg !36 + %4613 = fneg float %4612, !dbg !36 + %4614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1773 = icmp eq i32 %4614, 0, !dbg !36 + %4615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3FF7154760000000, float %4613) #6, !dbg !36 + %4616 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3FF7154760000000, float %4613) #6, !dbg !36 + %.0.i1774 = select i1 %.not3.i1773, float %4616, float %4615, !dbg !36 + %4617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1775 = icmp eq i32 %4617, 0, !dbg !36 + %4618 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3762, float 0x3E54AE0C00000000, float %.0.i1774) #6, !dbg !36 + %4619 = tail call float @llvm.nvvm.fma.rn.f(float %3762, float 0x3E54AE0C00000000, float %.0.i1774) #6, !dbg !36 + %.01.i1776 = select i1 %.not4.i1775, float %4619, float %4618, !dbg !36 + %4620 = bitcast float %.04.i1772 to i32, !dbg !36 + %4621 = shl i32 %4620, 23, !dbg !36 + %4622 = bitcast i32 %4621 to float, !dbg !36 + %4623 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1776) #6, !dbg !36 + %4624 = fmul float %4623, %4622, !dbg !36 + %4625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1777 = icmp eq i32 %4625, 0, !dbg !36 + %4626 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4627 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1778 = select i1 %.not.i1777, float %4627, float %4626, !dbg !36 + %4628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1779 = icmp eq i32 %4628, 0, !dbg !36 + %4629 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1778) #6, !dbg !36 + %4630 = tail call float @llvm.nvvm.saturate.f(float %.02.i1778) #6, !dbg !36 + %.03.i1780 = select i1 %.not1.i1779, float %4630, float %4629, !dbg !36 + %4631 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1781 = icmp eq i32 %4631, 0, !dbg !36 + %4632 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4633 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1780, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1782 = select i1 %.not2.i1781, float %4633, float %4632, !dbg !36 + %4634 = fadd float %.04.i1782, 0xC168000FE0000000, !dbg !36 + %4635 = fneg float %4634, !dbg !36 + %4636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1783 = icmp eq i32 %4636, 0, !dbg !36 + %4637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3FF7154760000000, float %4635) #6, !dbg !36 + %4638 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3FF7154760000000, float %4635) #6, !dbg !36 + %.0.i1784 = select i1 %.not3.i1783, float %4638, float %4637, !dbg !36 + %4639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1785 = icmp eq i32 %4639, 0, !dbg !36 + %4640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3763, float 0x3E54AE0C00000000, float %.0.i1784) #6, !dbg !36 + %4641 = tail call float @llvm.nvvm.fma.rn.f(float %3763, float 0x3E54AE0C00000000, float %.0.i1784) #6, !dbg !36 + %.01.i1786 = select i1 %.not4.i1785, float %4641, float %4640, !dbg !36 + %4642 = bitcast float %.04.i1782 to i32, !dbg !36 + %4643 = shl i32 %4642, 23, !dbg !36 + %4644 = bitcast i32 %4643 to float, !dbg !36 + %4645 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1786) #6, !dbg !36 + %4646 = fmul float %4645, %4644, !dbg !36 + %4647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1787 = icmp eq i32 %4647, 0, !dbg !36 + %4648 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4649 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1788 = select i1 %.not.i1787, float %4649, float %4648, !dbg !36 + %4650 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1789 = icmp eq i32 %4650, 0, !dbg !36 + %4651 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1788) #6, !dbg !36 + %4652 = tail call float @llvm.nvvm.saturate.f(float %.02.i1788) #6, !dbg !36 + %.03.i1790 = select i1 %.not1.i1789, float %4652, float %4651, !dbg !36 + %4653 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1791 = icmp eq i32 %4653, 0, !dbg !36 + %4654 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4655 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1790, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1792 = select i1 %.not2.i1791, float %4655, float %4654, !dbg !36 + %4656 = fadd float %.04.i1792, 0xC168000FE0000000, !dbg !36 + %4657 = fneg float %4656, !dbg !36 + %4658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1793 = icmp eq i32 %4658, 0, !dbg !36 + %4659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3FF7154760000000, float %4657) #6, !dbg !36 + %4660 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3FF7154760000000, float %4657) #6, !dbg !36 + %.0.i1794 = select i1 %.not3.i1793, float %4660, float %4659, !dbg !36 + %4661 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1795 = icmp eq i32 %4661, 0, !dbg !36 + %4662 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3764, float 0x3E54AE0C00000000, float %.0.i1794) #6, !dbg !36 + %4663 = tail call float @llvm.nvvm.fma.rn.f(float %3764, float 0x3E54AE0C00000000, float %.0.i1794) #6, !dbg !36 + %.01.i1796 = select i1 %.not4.i1795, float %4663, float %4662, !dbg !36 + %4664 = bitcast float %.04.i1792 to i32, !dbg !36 + %4665 = shl i32 %4664, 23, !dbg !36 + %4666 = bitcast i32 %4665 to float, !dbg !36 + %4667 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1796) #6, !dbg !36 + %4668 = fmul float %4667, %4666, !dbg !36 + %4669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1797 = icmp eq i32 %4669, 0, !dbg !36 + %4670 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4671 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1798 = select i1 %.not.i1797, float %4671, float %4670, !dbg !36 + %4672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1799 = icmp eq i32 %4672, 0, !dbg !36 + %4673 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1798) #6, !dbg !36 + %4674 = tail call float @llvm.nvvm.saturate.f(float %.02.i1798) #6, !dbg !36 + %.03.i1800 = select i1 %.not1.i1799, float %4674, float %4673, !dbg !36 + %4675 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1801 = icmp eq i32 %4675, 0, !dbg !36 + %4676 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4677 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1800, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1802 = select i1 %.not2.i1801, float %4677, float %4676, !dbg !36 + %4678 = fadd float %.04.i1802, 0xC168000FE0000000, !dbg !36 + %4679 = fneg float %4678, !dbg !36 + %4680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1803 = icmp eq i32 %4680, 0, !dbg !36 + %4681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3FF7154760000000, float %4679) #6, !dbg !36 + %4682 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3FF7154760000000, float %4679) #6, !dbg !36 + %.0.i1804 = select i1 %.not3.i1803, float %4682, float %4681, !dbg !36 + %4683 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1805 = icmp eq i32 %4683, 0, !dbg !36 + %4684 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3765, float 0x3E54AE0C00000000, float %.0.i1804) #6, !dbg !36 + %4685 = tail call float @llvm.nvvm.fma.rn.f(float %3765, float 0x3E54AE0C00000000, float %.0.i1804) #6, !dbg !36 + %.01.i1806 = select i1 %.not4.i1805, float %4685, float %4684, !dbg !36 + %4686 = bitcast float %.04.i1802 to i32, !dbg !36 + %4687 = shl i32 %4686, 23, !dbg !36 + %4688 = bitcast i32 %4687 to float, !dbg !36 + %4689 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1806) #6, !dbg !36 + %4690 = fmul float %4689, %4688, !dbg !36 + %4691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1807 = icmp eq i32 %4691, 0, !dbg !36 + %4692 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4693 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1808 = select i1 %.not.i1807, float %4693, float %4692, !dbg !36 + %4694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1809 = icmp eq i32 %4694, 0, !dbg !36 + %4695 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1808) #6, !dbg !36 + %4696 = tail call float @llvm.nvvm.saturate.f(float %.02.i1808) #6, !dbg !36 + %.03.i1810 = select i1 %.not1.i1809, float %4696, float %4695, !dbg !36 + %4697 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1811 = icmp eq i32 %4697, 0, !dbg !36 + %4698 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4699 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1810, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1812 = select i1 %.not2.i1811, float %4699, float %4698, !dbg !36 + %4700 = fadd float %.04.i1812, 0xC168000FE0000000, !dbg !36 + %4701 = fneg float %4700, !dbg !36 + %4702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1813 = icmp eq i32 %4702, 0, !dbg !36 + %4703 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3FF7154760000000, float %4701) #6, !dbg !36 + %4704 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3FF7154760000000, float %4701) #6, !dbg !36 + %.0.i1814 = select i1 %.not3.i1813, float %4704, float %4703, !dbg !36 + %4705 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1815 = icmp eq i32 %4705, 0, !dbg !36 + %4706 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3766, float 0x3E54AE0C00000000, float %.0.i1814) #6, !dbg !36 + %4707 = tail call float @llvm.nvvm.fma.rn.f(float %3766, float 0x3E54AE0C00000000, float %.0.i1814) #6, !dbg !36 + %.01.i1816 = select i1 %.not4.i1815, float %4707, float %4706, !dbg !36 + %4708 = bitcast float %.04.i1812 to i32, !dbg !36 + %4709 = shl i32 %4708, 23, !dbg !36 + %4710 = bitcast i32 %4709 to float, !dbg !36 + %4711 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1816) #6, !dbg !36 + %4712 = fmul float %4711, %4710, !dbg !36 + %4713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1817 = icmp eq i32 %4713, 0, !dbg !36 + %4714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4715 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1818 = select i1 %.not.i1817, float %4715, float %4714, !dbg !36 + %4716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1819 = icmp eq i32 %4716, 0, !dbg !36 + %4717 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1818) #6, !dbg !36 + %4718 = tail call float @llvm.nvvm.saturate.f(float %.02.i1818) #6, !dbg !36 + %.03.i1820 = select i1 %.not1.i1819, float %4718, float %4717, !dbg !36 + %4719 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1821 = icmp eq i32 %4719, 0, !dbg !36 + %4720 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4721 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1820, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1822 = select i1 %.not2.i1821, float %4721, float %4720, !dbg !36 + %4722 = fadd float %.04.i1822, 0xC168000FE0000000, !dbg !36 + %4723 = fneg float %4722, !dbg !36 + %4724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1823 = icmp eq i32 %4724, 0, !dbg !36 + %4725 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3FF7154760000000, float %4723) #6, !dbg !36 + %4726 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3FF7154760000000, float %4723) #6, !dbg !36 + %.0.i1824 = select i1 %.not3.i1823, float %4726, float %4725, !dbg !36 + %4727 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1825 = icmp eq i32 %4727, 0, !dbg !36 + %4728 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3767, float 0x3E54AE0C00000000, float %.0.i1824) #6, !dbg !36 + %4729 = tail call float @llvm.nvvm.fma.rn.f(float %3767, float 0x3E54AE0C00000000, float %.0.i1824) #6, !dbg !36 + %.01.i1826 = select i1 %.not4.i1825, float %4729, float %4728, !dbg !36 + %4730 = bitcast float %.04.i1822 to i32, !dbg !36 + %4731 = shl i32 %4730, 23, !dbg !36 + %4732 = bitcast i32 %4731 to float, !dbg !36 + %4733 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1826) #6, !dbg !36 + %4734 = fmul float %4733, %4732, !dbg !36 + %4735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1827 = icmp eq i32 %4735, 0, !dbg !36 + %4736 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4737 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1828 = select i1 %.not.i1827, float %4737, float %4736, !dbg !36 + %4738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1829 = icmp eq i32 %4738, 0, !dbg !36 + %4739 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1828) #6, !dbg !36 + %4740 = tail call float @llvm.nvvm.saturate.f(float %.02.i1828) #6, !dbg !36 + %.03.i1830 = select i1 %.not1.i1829, float %4740, float %4739, !dbg !36 + %4741 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1831 = icmp eq i32 %4741, 0, !dbg !36 + %4742 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4743 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1830, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1832 = select i1 %.not2.i1831, float %4743, float %4742, !dbg !36 + %4744 = fadd float %.04.i1832, 0xC168000FE0000000, !dbg !36 + %4745 = fneg float %4744, !dbg !36 + %4746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1833 = icmp eq i32 %4746, 0, !dbg !36 + %4747 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3FF7154760000000, float %4745) #6, !dbg !36 + %4748 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3FF7154760000000, float %4745) #6, !dbg !36 + %.0.i1834 = select i1 %.not3.i1833, float %4748, float %4747, !dbg !36 + %4749 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1835 = icmp eq i32 %4749, 0, !dbg !36 + %4750 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3768, float 0x3E54AE0C00000000, float %.0.i1834) #6, !dbg !36 + %4751 = tail call float @llvm.nvvm.fma.rn.f(float %3768, float 0x3E54AE0C00000000, float %.0.i1834) #6, !dbg !36 + %.01.i1836 = select i1 %.not4.i1835, float %4751, float %4750, !dbg !36 + %4752 = bitcast float %.04.i1832 to i32, !dbg !36 + %4753 = shl i32 %4752, 23, !dbg !36 + %4754 = bitcast i32 %4753 to float, !dbg !36 + %4755 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1836) #6, !dbg !36 + %4756 = fmul float %4755, %4754, !dbg !36 + %4757 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1837 = icmp eq i32 %4757, 0, !dbg !36 + %4758 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4759 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1838 = select i1 %.not.i1837, float %4759, float %4758, !dbg !36 + %4760 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1839 = icmp eq i32 %4760, 0, !dbg !36 + %4761 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1838) #6, !dbg !36 + %4762 = tail call float @llvm.nvvm.saturate.f(float %.02.i1838) #6, !dbg !36 + %.03.i1840 = select i1 %.not1.i1839, float %4762, float %4761, !dbg !36 + %4763 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1841 = icmp eq i32 %4763, 0, !dbg !36 + %4764 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4765 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1840, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1842 = select i1 %.not2.i1841, float %4765, float %4764, !dbg !36 + %4766 = fadd float %.04.i1842, 0xC168000FE0000000, !dbg !36 + %4767 = fneg float %4766, !dbg !36 + %4768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1843 = icmp eq i32 %4768, 0, !dbg !36 + %4769 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3FF7154760000000, float %4767) #6, !dbg !36 + %4770 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3FF7154760000000, float %4767) #6, !dbg !36 + %.0.i1844 = select i1 %.not3.i1843, float %4770, float %4769, !dbg !36 + %4771 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1845 = icmp eq i32 %4771, 0, !dbg !36 + %4772 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3769, float 0x3E54AE0C00000000, float %.0.i1844) #6, !dbg !36 + %4773 = tail call float @llvm.nvvm.fma.rn.f(float %3769, float 0x3E54AE0C00000000, float %.0.i1844) #6, !dbg !36 + %.01.i1846 = select i1 %.not4.i1845, float %4773, float %4772, !dbg !36 + %4774 = bitcast float %.04.i1842 to i32, !dbg !36 + %4775 = shl i32 %4774, 23, !dbg !36 + %4776 = bitcast i32 %4775 to float, !dbg !36 + %4777 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1846) #6, !dbg !36 + %4778 = fmul float %4777, %4776, !dbg !36 + %4779 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1847 = icmp eq i32 %4779, 0, !dbg !36 + %4780 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4781 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1848 = select i1 %.not.i1847, float %4781, float %4780, !dbg !36 + %4782 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1849 = icmp eq i32 %4782, 0, !dbg !36 + %4783 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1848) #6, !dbg !36 + %4784 = tail call float @llvm.nvvm.saturate.f(float %.02.i1848) #6, !dbg !36 + %.03.i1850 = select i1 %.not1.i1849, float %4784, float %4783, !dbg !36 + %4785 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1851 = icmp eq i32 %4785, 0, !dbg !36 + %4786 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4787 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1850, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1852 = select i1 %.not2.i1851, float %4787, float %4786, !dbg !36 + %4788 = fadd float %.04.i1852, 0xC168000FE0000000, !dbg !36 + %4789 = fneg float %4788, !dbg !36 + %4790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1853 = icmp eq i32 %4790, 0, !dbg !36 + %4791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3FF7154760000000, float %4789) #6, !dbg !36 + %4792 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3FF7154760000000, float %4789) #6, !dbg !36 + %.0.i1854 = select i1 %.not3.i1853, float %4792, float %4791, !dbg !36 + %4793 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1855 = icmp eq i32 %4793, 0, !dbg !36 + %4794 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3770, float 0x3E54AE0C00000000, float %.0.i1854) #6, !dbg !36 + %4795 = tail call float @llvm.nvvm.fma.rn.f(float %3770, float 0x3E54AE0C00000000, float %.0.i1854) #6, !dbg !36 + %.01.i1856 = select i1 %.not4.i1855, float %4795, float %4794, !dbg !36 + %4796 = bitcast float %.04.i1852 to i32, !dbg !36 + %4797 = shl i32 %4796, 23, !dbg !36 + %4798 = bitcast i32 %4797 to float, !dbg !36 + %4799 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1856) #6, !dbg !36 + %4800 = fmul float %4799, %4798, !dbg !36 + %4801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1857 = icmp eq i32 %4801, 0, !dbg !36 + %4802 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4803 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1858 = select i1 %.not.i1857, float %4803, float %4802, !dbg !36 + %4804 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1859 = icmp eq i32 %4804, 0, !dbg !36 + %4805 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1858) #6, !dbg !36 + %4806 = tail call float @llvm.nvvm.saturate.f(float %.02.i1858) #6, !dbg !36 + %.03.i1860 = select i1 %.not1.i1859, float %4806, float %4805, !dbg !36 + %4807 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1861 = icmp eq i32 %4807, 0, !dbg !36 + %4808 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4809 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1860, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1862 = select i1 %.not2.i1861, float %4809, float %4808, !dbg !36 + %4810 = fadd float %.04.i1862, 0xC168000FE0000000, !dbg !36 + %4811 = fneg float %4810, !dbg !36 + %4812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1863 = icmp eq i32 %4812, 0, !dbg !36 + %4813 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3FF7154760000000, float %4811) #6, !dbg !36 + %4814 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3FF7154760000000, float %4811) #6, !dbg !36 + %.0.i1864 = select i1 %.not3.i1863, float %4814, float %4813, !dbg !36 + %4815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1865 = icmp eq i32 %4815, 0, !dbg !36 + %4816 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3771, float 0x3E54AE0C00000000, float %.0.i1864) #6, !dbg !36 + %4817 = tail call float @llvm.nvvm.fma.rn.f(float %3771, float 0x3E54AE0C00000000, float %.0.i1864) #6, !dbg !36 + %.01.i1866 = select i1 %.not4.i1865, float %4817, float %4816, !dbg !36 + %4818 = bitcast float %.04.i1862 to i32, !dbg !36 + %4819 = shl i32 %4818, 23, !dbg !36 + %4820 = bitcast i32 %4819 to float, !dbg !36 + %4821 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1866) #6, !dbg !36 + %4822 = fmul float %4821, %4820, !dbg !36 + %4823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1867 = icmp eq i32 %4823, 0, !dbg !36 + %4824 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4825 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1868 = select i1 %.not.i1867, float %4825, float %4824, !dbg !36 + %4826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1869 = icmp eq i32 %4826, 0, !dbg !36 + %4827 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1868) #6, !dbg !36 + %4828 = tail call float @llvm.nvvm.saturate.f(float %.02.i1868) #6, !dbg !36 + %.03.i1870 = select i1 %.not1.i1869, float %4828, float %4827, !dbg !36 + %4829 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1871 = icmp eq i32 %4829, 0, !dbg !36 + %4830 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4831 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1870, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1872 = select i1 %.not2.i1871, float %4831, float %4830, !dbg !36 + %4832 = fadd float %.04.i1872, 0xC168000FE0000000, !dbg !36 + %4833 = fneg float %4832, !dbg !36 + %4834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1873 = icmp eq i32 %4834, 0, !dbg !36 + %4835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3FF7154760000000, float %4833) #6, !dbg !36 + %4836 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3FF7154760000000, float %4833) #6, !dbg !36 + %.0.i1874 = select i1 %.not3.i1873, float %4836, float %4835, !dbg !36 + %4837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1875 = icmp eq i32 %4837, 0, !dbg !36 + %4838 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3772, float 0x3E54AE0C00000000, float %.0.i1874) #6, !dbg !36 + %4839 = tail call float @llvm.nvvm.fma.rn.f(float %3772, float 0x3E54AE0C00000000, float %.0.i1874) #6, !dbg !36 + %.01.i1876 = select i1 %.not4.i1875, float %4839, float %4838, !dbg !36 + %4840 = bitcast float %.04.i1872 to i32, !dbg !36 + %4841 = shl i32 %4840, 23, !dbg !36 + %4842 = bitcast i32 %4841 to float, !dbg !36 + %4843 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1876) #6, !dbg !36 + %4844 = fmul float %4843, %4842, !dbg !36 + %4845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1877 = icmp eq i32 %4845, 0, !dbg !36 + %4846 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4847 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1878 = select i1 %.not.i1877, float %4847, float %4846, !dbg !36 + %4848 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1879 = icmp eq i32 %4848, 0, !dbg !36 + %4849 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1878) #6, !dbg !36 + %4850 = tail call float @llvm.nvvm.saturate.f(float %.02.i1878) #6, !dbg !36 + %.03.i1880 = select i1 %.not1.i1879, float %4850, float %4849, !dbg !36 + %4851 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1881 = icmp eq i32 %4851, 0, !dbg !36 + %4852 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4853 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1880, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1882 = select i1 %.not2.i1881, float %4853, float %4852, !dbg !36 + %4854 = fadd float %.04.i1882, 0xC168000FE0000000, !dbg !36 + %4855 = fneg float %4854, !dbg !36 + %4856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1883 = icmp eq i32 %4856, 0, !dbg !36 + %4857 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3FF7154760000000, float %4855) #6, !dbg !36 + %4858 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3FF7154760000000, float %4855) #6, !dbg !36 + %.0.i1884 = select i1 %.not3.i1883, float %4858, float %4857, !dbg !36 + %4859 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1885 = icmp eq i32 %4859, 0, !dbg !36 + %4860 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3773, float 0x3E54AE0C00000000, float %.0.i1884) #6, !dbg !36 + %4861 = tail call float @llvm.nvvm.fma.rn.f(float %3773, float 0x3E54AE0C00000000, float %.0.i1884) #6, !dbg !36 + %.01.i1886 = select i1 %.not4.i1885, float %4861, float %4860, !dbg !36 + %4862 = bitcast float %.04.i1882 to i32, !dbg !36 + %4863 = shl i32 %4862, 23, !dbg !36 + %4864 = bitcast i32 %4863 to float, !dbg !36 + %4865 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1886) #6, !dbg !36 + %4866 = fmul float %4865, %4864, !dbg !36 + %4867 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1887 = icmp eq i32 %4867, 0, !dbg !36 + %4868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4869 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1888 = select i1 %.not.i1887, float %4869, float %4868, !dbg !36 + %4870 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1889 = icmp eq i32 %4870, 0, !dbg !36 + %4871 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1888) #6, !dbg !36 + %4872 = tail call float @llvm.nvvm.saturate.f(float %.02.i1888) #6, !dbg !36 + %.03.i1890 = select i1 %.not1.i1889, float %4872, float %4871, !dbg !36 + %4873 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1891 = icmp eq i32 %4873, 0, !dbg !36 + %4874 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4875 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1890, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1892 = select i1 %.not2.i1891, float %4875, float %4874, !dbg !36 + %4876 = fadd float %.04.i1892, 0xC168000FE0000000, !dbg !36 + %4877 = fneg float %4876, !dbg !36 + %4878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1893 = icmp eq i32 %4878, 0, !dbg !36 + %4879 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3FF7154760000000, float %4877) #6, !dbg !36 + %4880 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3FF7154760000000, float %4877) #6, !dbg !36 + %.0.i1894 = select i1 %.not3.i1893, float %4880, float %4879, !dbg !36 + %4881 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1895 = icmp eq i32 %4881, 0, !dbg !36 + %4882 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3774, float 0x3E54AE0C00000000, float %.0.i1894) #6, !dbg !36 + %4883 = tail call float @llvm.nvvm.fma.rn.f(float %3774, float 0x3E54AE0C00000000, float %.0.i1894) #6, !dbg !36 + %.01.i1896 = select i1 %.not4.i1895, float %4883, float %4882, !dbg !36 + %4884 = bitcast float %.04.i1892 to i32, !dbg !36 + %4885 = shl i32 %4884, 23, !dbg !36 + %4886 = bitcast i32 %4885 to float, !dbg !36 + %4887 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1896) #6, !dbg !36 + %4888 = fmul float %4887, %4886, !dbg !36 + %4889 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1897 = icmp eq i32 %4889, 0, !dbg !36 + %4890 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4891 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1898 = select i1 %.not.i1897, float %4891, float %4890, !dbg !36 + %4892 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1899 = icmp eq i32 %4892, 0, !dbg !36 + %4893 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1898) #6, !dbg !36 + %4894 = tail call float @llvm.nvvm.saturate.f(float %.02.i1898) #6, !dbg !36 + %.03.i1900 = select i1 %.not1.i1899, float %4894, float %4893, !dbg !36 + %4895 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1901 = icmp eq i32 %4895, 0, !dbg !36 + %4896 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4897 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1900, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1902 = select i1 %.not2.i1901, float %4897, float %4896, !dbg !36 + %4898 = fadd float %.04.i1902, 0xC168000FE0000000, !dbg !36 + %4899 = fneg float %4898, !dbg !36 + %4900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1903 = icmp eq i32 %4900, 0, !dbg !36 + %4901 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3FF7154760000000, float %4899) #6, !dbg !36 + %4902 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3FF7154760000000, float %4899) #6, !dbg !36 + %.0.i1904 = select i1 %.not3.i1903, float %4902, float %4901, !dbg !36 + %4903 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1905 = icmp eq i32 %4903, 0, !dbg !36 + %4904 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3775, float 0x3E54AE0C00000000, float %.0.i1904) #6, !dbg !36 + %4905 = tail call float @llvm.nvvm.fma.rn.f(float %3775, float 0x3E54AE0C00000000, float %.0.i1904) #6, !dbg !36 + %.01.i1906 = select i1 %.not4.i1905, float %4905, float %4904, !dbg !36 + %4906 = bitcast float %.04.i1902 to i32, !dbg !36 + %4907 = shl i32 %4906, 23, !dbg !36 + %4908 = bitcast i32 %4907 to float, !dbg !36 + %4909 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1906) #6, !dbg !36 + %4910 = fmul float %4909, %4908, !dbg !36 + %4911 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1907 = icmp eq i32 %4911, 0, !dbg !36 + %4912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4913 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1908 = select i1 %.not.i1907, float %4913, float %4912, !dbg !36 + %4914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1909 = icmp eq i32 %4914, 0, !dbg !36 + %4915 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1908) #6, !dbg !36 + %4916 = tail call float @llvm.nvvm.saturate.f(float %.02.i1908) #6, !dbg !36 + %.03.i1910 = select i1 %.not1.i1909, float %4916, float %4915, !dbg !36 + %4917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1911 = icmp eq i32 %4917, 0, !dbg !36 + %4918 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4919 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1910, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1912 = select i1 %.not2.i1911, float %4919, float %4918, !dbg !36 + %4920 = fadd float %.04.i1912, 0xC168000FE0000000, !dbg !36 + %4921 = fneg float %4920, !dbg !36 + %4922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1913 = icmp eq i32 %4922, 0, !dbg !36 + %4923 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3FF7154760000000, float %4921) #6, !dbg !36 + %4924 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3FF7154760000000, float %4921) #6, !dbg !36 + %.0.i1914 = select i1 %.not3.i1913, float %4924, float %4923, !dbg !36 + %4925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1915 = icmp eq i32 %4925, 0, !dbg !36 + %4926 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3776, float 0x3E54AE0C00000000, float %.0.i1914) #6, !dbg !36 + %4927 = tail call float @llvm.nvvm.fma.rn.f(float %3776, float 0x3E54AE0C00000000, float %.0.i1914) #6, !dbg !36 + %.01.i1916 = select i1 %.not4.i1915, float %4927, float %4926, !dbg !36 + %4928 = bitcast float %.04.i1912 to i32, !dbg !36 + %4929 = shl i32 %4928, 23, !dbg !36 + %4930 = bitcast i32 %4929 to float, !dbg !36 + %4931 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1916) #6, !dbg !36 + %4932 = fmul float %4931, %4930, !dbg !36 + %4933 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1917 = icmp eq i32 %4933, 0, !dbg !36 + %4934 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4935 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1918 = select i1 %.not.i1917, float %4935, float %4934, !dbg !36 + %4936 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1919 = icmp eq i32 %4936, 0, !dbg !36 + %4937 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1918) #6, !dbg !36 + %4938 = tail call float @llvm.nvvm.saturate.f(float %.02.i1918) #6, !dbg !36 + %.03.i1920 = select i1 %.not1.i1919, float %4938, float %4937, !dbg !36 + %4939 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1921 = icmp eq i32 %4939, 0, !dbg !36 + %4940 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4941 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1920, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1922 = select i1 %.not2.i1921, float %4941, float %4940, !dbg !36 + %4942 = fadd float %.04.i1922, 0xC168000FE0000000, !dbg !36 + %4943 = fneg float %4942, !dbg !36 + %4944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1923 = icmp eq i32 %4944, 0, !dbg !36 + %4945 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3FF7154760000000, float %4943) #6, !dbg !36 + %4946 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3FF7154760000000, float %4943) #6, !dbg !36 + %.0.i1924 = select i1 %.not3.i1923, float %4946, float %4945, !dbg !36 + %4947 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1925 = icmp eq i32 %4947, 0, !dbg !36 + %4948 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3777, float 0x3E54AE0C00000000, float %.0.i1924) #6, !dbg !36 + %4949 = tail call float @llvm.nvvm.fma.rn.f(float %3777, float 0x3E54AE0C00000000, float %.0.i1924) #6, !dbg !36 + %.01.i1926 = select i1 %.not4.i1925, float %4949, float %4948, !dbg !36 + %4950 = bitcast float %.04.i1922 to i32, !dbg !36 + %4951 = shl i32 %4950, 23, !dbg !36 + %4952 = bitcast i32 %4951 to float, !dbg !36 + %4953 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1926) #6, !dbg !36 + %4954 = fmul float %4953, %4952, !dbg !36 + %4955 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1927 = icmp eq i32 %4955, 0, !dbg !36 + %4956 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4957 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1928 = select i1 %.not.i1927, float %4957, float %4956, !dbg !36 + %4958 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1929 = icmp eq i32 %4958, 0, !dbg !36 + %4959 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1928) #6, !dbg !36 + %4960 = tail call float @llvm.nvvm.saturate.f(float %.02.i1928) #6, !dbg !36 + %.03.i1930 = select i1 %.not1.i1929, float %4960, float %4959, !dbg !36 + %4961 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1931 = icmp eq i32 %4961, 0, !dbg !36 + %4962 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4963 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1930, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1932 = select i1 %.not2.i1931, float %4963, float %4962, !dbg !36 + %4964 = fadd float %.04.i1932, 0xC168000FE0000000, !dbg !36 + %4965 = fneg float %4964, !dbg !36 + %4966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1933 = icmp eq i32 %4966, 0, !dbg !36 + %4967 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3FF7154760000000, float %4965) #6, !dbg !36 + %4968 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3FF7154760000000, float %4965) #6, !dbg !36 + %.0.i1934 = select i1 %.not3.i1933, float %4968, float %4967, !dbg !36 + %4969 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1935 = icmp eq i32 %4969, 0, !dbg !36 + %4970 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3778, float 0x3E54AE0C00000000, float %.0.i1934) #6, !dbg !36 + %4971 = tail call float @llvm.nvvm.fma.rn.f(float %3778, float 0x3E54AE0C00000000, float %.0.i1934) #6, !dbg !36 + %.01.i1936 = select i1 %.not4.i1935, float %4971, float %4970, !dbg !36 + %4972 = bitcast float %.04.i1932 to i32, !dbg !36 + %4973 = shl i32 %4972, 23, !dbg !36 + %4974 = bitcast i32 %4973 to float, !dbg !36 + %4975 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1936) #6, !dbg !36 + %4976 = fmul float %4975, %4974, !dbg !36 + %4977 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1937 = icmp eq i32 %4977, 0, !dbg !36 + %4978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %4979 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1938 = select i1 %.not.i1937, float %4979, float %4978, !dbg !36 + %4980 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1939 = icmp eq i32 %4980, 0, !dbg !36 + %4981 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1938) #6, !dbg !36 + %4982 = tail call float @llvm.nvvm.saturate.f(float %.02.i1938) #6, !dbg !36 + %.03.i1940 = select i1 %.not1.i1939, float %4982, float %4981, !dbg !36 + %4983 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1941 = icmp eq i32 %4983, 0, !dbg !36 + %4984 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %4985 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1940, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1942 = select i1 %.not2.i1941, float %4985, float %4984, !dbg !36 + %4986 = fadd float %.04.i1942, 0xC168000FE0000000, !dbg !36 + %4987 = fneg float %4986, !dbg !36 + %4988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1943 = icmp eq i32 %4988, 0, !dbg !36 + %4989 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3FF7154760000000, float %4987) #6, !dbg !36 + %4990 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3FF7154760000000, float %4987) #6, !dbg !36 + %.0.i1944 = select i1 %.not3.i1943, float %4990, float %4989, !dbg !36 + %4991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1945 = icmp eq i32 %4991, 0, !dbg !36 + %4992 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3779, float 0x3E54AE0C00000000, float %.0.i1944) #6, !dbg !36 + %4993 = tail call float @llvm.nvvm.fma.rn.f(float %3779, float 0x3E54AE0C00000000, float %.0.i1944) #6, !dbg !36 + %.01.i1946 = select i1 %.not4.i1945, float %4993, float %4992, !dbg !36 + %4994 = bitcast float %.04.i1942 to i32, !dbg !36 + %4995 = shl i32 %4994, 23, !dbg !36 + %4996 = bitcast i32 %4995 to float, !dbg !36 + %4997 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1946) #6, !dbg !36 + %4998 = fmul float %4997, %4996, !dbg !36 + %4999 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1947 = icmp eq i32 %4999, 0, !dbg !36 + %5000 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5001 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1948 = select i1 %.not.i1947, float %5001, float %5000, !dbg !36 + %5002 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1949 = icmp eq i32 %5002, 0, !dbg !36 + %5003 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1948) #6, !dbg !36 + %5004 = tail call float @llvm.nvvm.saturate.f(float %.02.i1948) #6, !dbg !36 + %.03.i1950 = select i1 %.not1.i1949, float %5004, float %5003, !dbg !36 + %5005 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1951 = icmp eq i32 %5005, 0, !dbg !36 + %5006 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5007 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1950, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1952 = select i1 %.not2.i1951, float %5007, float %5006, !dbg !36 + %5008 = fadd float %.04.i1952, 0xC168000FE0000000, !dbg !36 + %5009 = fneg float %5008, !dbg !36 + %5010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1953 = icmp eq i32 %5010, 0, !dbg !36 + %5011 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3FF7154760000000, float %5009) #6, !dbg !36 + %5012 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3FF7154760000000, float %5009) #6, !dbg !36 + %.0.i1954 = select i1 %.not3.i1953, float %5012, float %5011, !dbg !36 + %5013 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1955 = icmp eq i32 %5013, 0, !dbg !36 + %5014 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3780, float 0x3E54AE0C00000000, float %.0.i1954) #6, !dbg !36 + %5015 = tail call float @llvm.nvvm.fma.rn.f(float %3780, float 0x3E54AE0C00000000, float %.0.i1954) #6, !dbg !36 + %.01.i1956 = select i1 %.not4.i1955, float %5015, float %5014, !dbg !36 + %5016 = bitcast float %.04.i1952 to i32, !dbg !36 + %5017 = shl i32 %5016, 23, !dbg !36 + %5018 = bitcast i32 %5017 to float, !dbg !36 + %5019 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1956) #6, !dbg !36 + %5020 = fmul float %5019, %5018, !dbg !36 + %5021 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1957 = icmp eq i32 %5021, 0, !dbg !36 + %5022 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5023 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1958 = select i1 %.not.i1957, float %5023, float %5022, !dbg !36 + %5024 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1959 = icmp eq i32 %5024, 0, !dbg !36 + %5025 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1958) #6, !dbg !36 + %5026 = tail call float @llvm.nvvm.saturate.f(float %.02.i1958) #6, !dbg !36 + %.03.i1960 = select i1 %.not1.i1959, float %5026, float %5025, !dbg !36 + %5027 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1961 = icmp eq i32 %5027, 0, !dbg !36 + %5028 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5029 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1960, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1962 = select i1 %.not2.i1961, float %5029, float %5028, !dbg !36 + %5030 = fadd float %.04.i1962, 0xC168000FE0000000, !dbg !36 + %5031 = fneg float %5030, !dbg !36 + %5032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1963 = icmp eq i32 %5032, 0, !dbg !36 + %5033 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3FF7154760000000, float %5031) #6, !dbg !36 + %5034 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3FF7154760000000, float %5031) #6, !dbg !36 + %.0.i1964 = select i1 %.not3.i1963, float %5034, float %5033, !dbg !36 + %5035 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1965 = icmp eq i32 %5035, 0, !dbg !36 + %5036 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3781, float 0x3E54AE0C00000000, float %.0.i1964) #6, !dbg !36 + %5037 = tail call float @llvm.nvvm.fma.rn.f(float %3781, float 0x3E54AE0C00000000, float %.0.i1964) #6, !dbg !36 + %.01.i1966 = select i1 %.not4.i1965, float %5037, float %5036, !dbg !36 + %5038 = bitcast float %.04.i1962 to i32, !dbg !36 + %5039 = shl i32 %5038, 23, !dbg !36 + %5040 = bitcast i32 %5039 to float, !dbg !36 + %5041 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1966) #6, !dbg !36 + %5042 = fmul float %5041, %5040, !dbg !36 + %5043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1967 = icmp eq i32 %5043, 0, !dbg !36 + %5044 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5045 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1968 = select i1 %.not.i1967, float %5045, float %5044, !dbg !36 + %5046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1969 = icmp eq i32 %5046, 0, !dbg !36 + %5047 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1968) #6, !dbg !36 + %5048 = tail call float @llvm.nvvm.saturate.f(float %.02.i1968) #6, !dbg !36 + %.03.i1970 = select i1 %.not1.i1969, float %5048, float %5047, !dbg !36 + %5049 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not2.i1971 = icmp eq i32 %5049, 0, !dbg !36 + %5050 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5051 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1970, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %.04.i1972 = select i1 %.not2.i1971, float %5051, float %5050, !dbg !36 + %5052 = fadd float %.04.i1972, 0xC168000FE0000000, !dbg !36 + %5053 = fneg float %5052, !dbg !36 + %5054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1973 = icmp eq i32 %5054, 0, !dbg !36 + %5055 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3FF7154760000000, float %5053) #6, !dbg !36 + %5056 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3FF7154760000000, float %5053) #6, !dbg !36 + %.0.i1974 = select i1 %.not3.i1973, float %5056, float %5055, !dbg !36 + %5057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1975 = icmp eq i32 %5057, 0, !dbg !36 + %5058 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3782, float 0x3E54AE0C00000000, float %.0.i1974) #6, !dbg !36 + %5059 = tail call float @llvm.nvvm.fma.rn.f(float %3782, float 0x3E54AE0C00000000, float %.0.i1974) #6, !dbg !36 + %.01.i1976 = select i1 %.not4.i1975, float %5059, float %5058, !dbg !36 + %5060 = bitcast float %.04.i1972 to i32, !dbg !36 + %5061 = shl i32 %5060, 23, !dbg !36 + %5062 = bitcast i32 %5061 to float, !dbg !36 + %5063 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1976) #6, !dbg !36 + %5064 = fmul float %5063, %5062, !dbg !36 + %5065 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1977 = icmp eq i32 %5065, 0, !dbg !36 + %5066 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5067 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1978 = select i1 %.not.i1977, float %5067, float %5066, !dbg !36 + %5068 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1979 = icmp eq i32 %5068, 0, !dbg !36 + %5069 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1978) #6, !dbg !36 + %5070 = tail call float @llvm.nvvm.saturate.f(float %.02.i1978) #6, !dbg !36 + %.03.i1980 = select i1 %.not1.i1979, float %5070, float %5069, !dbg !36 + %5071 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5072 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5073 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1980, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5074 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1983 = icmp eq i32 %5074, 0, !dbg !36 + %5075 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1985 = icmp eq i32 %5075, 0, !dbg !36 + %5076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1987 = icmp eq i32 %5076, 0, !dbg !36 + %5077 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5078 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1988 = select i1 %.not.i1987, float %5078, float %5077, !dbg !36 + %5079 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1989 = icmp eq i32 %5079, 0, !dbg !36 + %5080 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1988) #6, !dbg !36 + %5081 = tail call float @llvm.nvvm.saturate.f(float %.02.i1988) #6, !dbg !36 + %.03.i1990 = select i1 %.not1.i1989, float %5081, float %5080, !dbg !36 + %5082 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5083 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5084 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1990, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5085 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i1993 = icmp eq i32 %5085, 0, !dbg !36 + %5086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i1995 = icmp eq i32 %5086, 0, !dbg !36 + %5087 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i1997 = icmp eq i32 %5087, 0, !dbg !36 + %5088 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5089 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i1998 = select i1 %.not.i1997, float %5089, float %5088, !dbg !36 + %5090 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i1999 = icmp eq i32 %5090, 0, !dbg !36 + %5091 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1998) #6, !dbg !36 + %5092 = tail call float @llvm.nvvm.saturate.f(float %.02.i1998) #6, !dbg !36 + %.03.i2000 = select i1 %.not1.i1999, float %5092, float %5091, !dbg !36 + %5093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5094 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5095 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2000, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5096 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2003 = icmp eq i32 %5096, 0, !dbg !36 + %5097 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2005 = icmp eq i32 %5097, 0, !dbg !36 + %5098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2007 = icmp eq i32 %5098, 0, !dbg !36 + %5099 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5100 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2008 = select i1 %.not.i2007, float %5100, float %5099, !dbg !36 + %5101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2009 = icmp eq i32 %5101, 0, !dbg !36 + %5102 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2008) #6, !dbg !36 + %5103 = tail call float @llvm.nvvm.saturate.f(float %.02.i2008) #6, !dbg !36 + %.03.i2010 = select i1 %.not1.i2009, float %5103, float %5102, !dbg !36 + %5104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5105 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5106 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2010, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2013 = icmp eq i32 %5107, 0, !dbg !36 + %5108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2015 = icmp eq i32 %5108, 0, !dbg !36 + %5109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2017 = icmp eq i32 %5109, 0, !dbg !36 + %5110 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5111 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2018 = select i1 %.not.i2017, float %5111, float %5110, !dbg !36 + %5112 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2019 = icmp eq i32 %5112, 0, !dbg !36 + %5113 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2018) #6, !dbg !36 + %5114 = tail call float @llvm.nvvm.saturate.f(float %.02.i2018) #6, !dbg !36 + %.03.i2020 = select i1 %.not1.i2019, float %5114, float %5113, !dbg !36 + %5115 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5116 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5117 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2020, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2023 = icmp eq i32 %5118, 0, !dbg !36 + %5119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2025 = icmp eq i32 %5119, 0, !dbg !36 + %5120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not.i2027 = icmp eq i32 %5120, 0, !dbg !36 + %5121 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %5122 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !36 + %.02.i2028 = select i1 %.not.i2027, float %5122, float %5121, !dbg !36 + %5123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not1.i2029 = icmp eq i32 %5123, 0, !dbg !36 + %5124 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2028) #6, !dbg !36 + %5125 = tail call float @llvm.nvvm.saturate.f(float %.02.i2028) #6, !dbg !36 + %.03.i2030 = select i1 %.not1.i2029, float %5125, float %5124, !dbg !36 + %5126 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %5127 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5128 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2030, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !36 + %5129 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not3.i2033 = icmp eq i32 %5129, 0, !dbg !36 + %5130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !36 + %.not4.i2035 = icmp eq i32 %5130, 0, !dbg !36 + %5131 = fmul float %3210, %3810, !dbg !37 + %5132 = fmul float %3212, %3832, !dbg !37 + %5133 = fmul float %3214, %3854, !dbg !37 + %5134 = fmul float %3216, %3876, !dbg !37 + %5135 = fmul float %3218, %3898, !dbg !37 + %5136 = fmul float %3220, %3920, !dbg !37 + %5137 = fmul float %3222, %3942, !dbg !37 + %5138 = fmul float %3224, %3964, !dbg !37 + %5139 = fmul float %3226, %3986, !dbg !37 + %5140 = fmul float %3228, %4008, !dbg !37 + %5141 = fmul float %3230, %4030, !dbg !37 + %5142 = fmul float %3232, %4052, !dbg !37 + %5143 = fmul float %3234, %4074, !dbg !37 + %5144 = fmul float %3236, %4096, !dbg !37 + %5145 = fmul float %3238, %4118, !dbg !37 + %5146 = fmul float %3240, %4140, !dbg !37 + %5147 = fmul float %3242, %4162, !dbg !37 + %5148 = fmul float %3244, %4184, !dbg !37 + %5149 = fmul float %3246, %4206, !dbg !37 + %5150 = fmul float %3248, %4228, !dbg !37 + %5151 = fmul float %3250, %4250, !dbg !37 + %5152 = fmul float %3252, %4272, !dbg !37 + %5153 = fmul float %3254, %4294, !dbg !37 + %5154 = fmul float %3256, %4316, !dbg !37 + %5155 = fmul float %3258, %4338, !dbg !37 + %5156 = fmul float %3260, %4360, !dbg !37 + %5157 = fmul float %3262, %4382, !dbg !37 + %5158 = fmul float %3264, %4404, !dbg !37 + %5159 = fmul float %3266, %4426, !dbg !37 + %5160 = fmul float %3268, %4448, !dbg !37 + %5161 = fmul float %3270, %4470, !dbg !37 + %5162 = fmul float %3272, %4492, !dbg !37 + %5163 = fmul float %3274, %4514, !dbg !37 + %5164 = fmul float %3276, %4536, !dbg !37 + %5165 = fmul float %3278, %4558, !dbg !37 + %5166 = fmul float %3280, %4580, !dbg !37 + %5167 = fmul float %3282, %4602, !dbg !37 + %5168 = fmul float %3284, %4624, !dbg !37 + %5169 = fmul float %3286, %4646, !dbg !37 + %5170 = fmul float %3288, %4668, !dbg !37 + %5171 = fmul float %3290, %4690, !dbg !37 + %5172 = fmul float %3292, %4712, !dbg !37 + %5173 = fmul float %3294, %4734, !dbg !37 + %5174 = fmul float %3296, %4756, !dbg !37 + %5175 = fmul float %3298, %4778, !dbg !37 + %5176 = fmul float %3300, %4800, !dbg !37 + %5177 = fmul float %3302, %4822, !dbg !37 + %5178 = fmul float %3304, %4844, !dbg !37 + %5179 = fmul float %3306, %4866, !dbg !37 + %5180 = fmul float %3308, %4888, !dbg !37 + %5181 = fmul float %3310, %4910, !dbg !37 + %5182 = fmul float %3312, %4932, !dbg !37 + %5183 = fmul float %3314, %4954, !dbg !37 + %5184 = fmul float %3316, %4976, !dbg !37 + %5185 = fmul float %3318, %4998, !dbg !37 + %5186 = fmul float %3320, %5020, !dbg !37 + %5187 = fmul float %3333, %5042, !dbg !37 + %5188 = fmul float %3334, %5064, !dbg !37 + %5189 = fcmp oeq <2 x bfloat> %204, splat (bfloat 0xRFF80), !dbg !16 + %5190 = insertelement <2 x i32> poison, i32 %1685, i64 0, !dbg !21 + %5191 = insertelement <2 x i32> %5190, i32 %1696, i64 1, !dbg !21 + %5192 = icmp eq <2 x i32> %5191, zeroinitializer, !dbg !21 + %5193 = insertelement <2 x float> poison, float %1687, i64 0, !dbg !21 + %5194 = insertelement <2 x float> %5193, float %1698, i64 1, !dbg !21 + %5195 = insertelement <2 x float> poison, float %1686, i64 0, !dbg !21 + %5196 = insertelement <2 x float> %5195, float %1697, i64 1, !dbg !21 + %5197 = select <2 x i1> %5192, <2 x float> %5194, <2 x float> %5196, !dbg !21 + %5198 = extractelement <2 x float> %5197, i64 0, !dbg !21 + %5199 = fadd float %5198, 0xC168000FE0000000, !dbg !21 + %5200 = fneg float %5199, !dbg !21 + %5201 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3FF7154760000000, float %5200) #6, !dbg !21 + %5202 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3FF7154760000000, float %5200) #6, !dbg !21 + %.0.i704 = select i1 %.not3.i703, float %5202, float %5201, !dbg !21 + %5203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %397, float 0x3E54AE0C00000000, float %.0.i704) #6, !dbg !21 + %5204 = tail call float @llvm.nvvm.fma.rn.f(float %397, float 0x3E54AE0C00000000, float %.0.i704) #6, !dbg !21 + %.01.i706 = select i1 %.not4.i705, float %5204, float %5203, !dbg !21 + %5205 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i706) #6, !dbg !21 + %5206 = extractelement <2 x float> %5197, i64 1, !dbg !21 + %5207 = fadd float %5206, 0xC168000FE0000000, !dbg !21 + %5208 = fneg float %5207, !dbg !21 + %5209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3FF7154760000000, float %5208) #6, !dbg !21 + %5210 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3FF7154760000000, float %5208) #6, !dbg !21 + %.0.i714 = select i1 %.not3.i713, float %5210, float %5209, !dbg !21 + %5211 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float 0x3E54AE0C00000000, float %.0.i714) #6, !dbg !21 + %5212 = tail call float @llvm.nvvm.fma.rn.f(float %398, float 0x3E54AE0C00000000, float %.0.i714) #6, !dbg !21 + %.01.i716 = select i1 %.not4.i715, float %5212, float %5211, !dbg !21 + %5213 = bitcast <2 x float> %5197 to <2 x i32>, !dbg !21 + %5214 = shl <2 x i32> %5213, splat (i32 23), !dbg !21 + %5215 = bitcast <2 x i32> %5214 to <2 x float>, !dbg !21 + %5216 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i716) #6, !dbg !21 + %5217 = insertelement <2 x float> poison, float %5205, i64 0, !dbg !21 + %5218 = insertelement <2 x float> %5217, float %5216, i64 1, !dbg !21 + %5219 = fmul <2 x float> %5218, %5215, !dbg !21 + %5220 = insertelement <2 x i32> poison, i32 %3091, i64 0, !dbg !21 + %5221 = insertelement <2 x i32> %5220, i32 %3102, i64 1, !dbg !21 + %5222 = icmp eq <2 x i32> %5221, zeroinitializer, !dbg !21 + %5223 = insertelement <2 x float> poison, float %3093, i64 0, !dbg !21 + %5224 = insertelement <2 x float> %5223, float %3104, i64 1, !dbg !21 + %5225 = insertelement <2 x float> poison, float %3092, i64 0, !dbg !21 + %5226 = insertelement <2 x float> %5225, float %3103, i64 1, !dbg !21 + %5227 = select <2 x i1> %5222, <2 x float> %5224, <2 x float> %5226, !dbg !21 + %5228 = extractelement <2 x float> %5227, i64 0, !dbg !21 + %5229 = fadd float %5228, 0xC168000FE0000000, !dbg !21 + %5230 = fneg float %5229, !dbg !21 + %5231 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3FF7154760000000, float %5230) #6, !dbg !21 + %5232 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3FF7154760000000, float %5230) #6, !dbg !21 + %.0.i1344 = select i1 %.not3.i1343, float %5232, float %5231, !dbg !21 + %5233 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1803, float 0x3E54AE0C00000000, float %.0.i1344) #6, !dbg !21 + %5234 = tail call float @llvm.nvvm.fma.rn.f(float %1803, float 0x3E54AE0C00000000, float %.0.i1344) #6, !dbg !21 + %.01.i1346 = select i1 %.not4.i1345, float %5234, float %5233, !dbg !21 + %5235 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1346) #6, !dbg !21 + %5236 = extractelement <2 x float> %5227, i64 1, !dbg !21 + %5237 = fadd float %5236, 0xC168000FE0000000, !dbg !21 + %5238 = fneg float %5237, !dbg !21 + %5239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3FF7154760000000, float %5238) #6, !dbg !21 + %5240 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3FF7154760000000, float %5238) #6, !dbg !21 + %.0.i1354 = select i1 %.not3.i1353, float %5240, float %5239, !dbg !21 + %5241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1804, float 0x3E54AE0C00000000, float %.0.i1354) #6, !dbg !21 + %5242 = tail call float @llvm.nvvm.fma.rn.f(float %1804, float 0x3E54AE0C00000000, float %.0.i1354) #6, !dbg !21 + %.01.i1356 = select i1 %.not4.i1355, float %5242, float %5241, !dbg !21 + %5243 = bitcast <2 x float> %5227 to <2 x i32>, !dbg !21 + %5244 = shl <2 x i32> %5243, splat (i32 23), !dbg !21 + %5245 = bitcast <2 x i32> %5244 to <2 x float>, !dbg !21 + %5246 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1356) #6, !dbg !21 + %5247 = insertelement <2 x float> poison, float %5235, i64 0, !dbg !21 + %5248 = insertelement <2 x float> %5247, float %5246, i64 1, !dbg !21 + %5249 = fmul <2 x float> %5248, %5245, !dbg !21 + %5250 = fmul <2 x float> %5219, zeroinitializer, !dbg !23 + %5251 = fadd <2 x float> %5250, %5249, !dbg !24 + %5252 = select <2 x i1> %5189, <2 x float> splat (float 1.000000e+00), <2 x float> %5251, !dbg !24 + %5253 = insertelement <2 x i1> poison, i1 %16, i64 0, !dbg !26 + %5254 = shufflevector <2 x i1> %5253, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !26 + %5255 = select <2 x i1> %5254, <2 x float> %5252, <2 x float> zeroinitializer, !dbg !26 + %5256 = insertelement <2 x i32> poison, i32 %5071, i64 0, !dbg !36 + %5257 = insertelement <2 x i32> %5256, i32 %5082, i64 1, !dbg !36 + %5258 = icmp eq <2 x i32> %5257, zeroinitializer, !dbg !36 + %5259 = insertelement <2 x float> poison, float %5073, i64 0, !dbg !36 + %5260 = insertelement <2 x float> %5259, float %5084, i64 1, !dbg !36 + %5261 = insertelement <2 x float> poison, float %5072, i64 0, !dbg !36 + %5262 = insertelement <2 x float> %5261, float %5083, i64 1, !dbg !36 + %5263 = select <2 x i1> %5258, <2 x float> %5260, <2 x float> %5262, !dbg !36 + %5264 = extractelement <2 x float> %5263, i64 0, !dbg !36 + %5265 = fadd float %5264, 0xC168000FE0000000, !dbg !36 + %5266 = fneg float %5265, !dbg !36 + %5267 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3FF7154760000000, float %5266) #6, !dbg !36 + %5268 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3FF7154760000000, float %5266) #6, !dbg !36 + %.0.i1984 = select i1 %.not3.i1983, float %5268, float %5267, !dbg !36 + %5269 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3783, float 0x3E54AE0C00000000, float %.0.i1984) #6, !dbg !36 + %5270 = tail call float @llvm.nvvm.fma.rn.f(float %3783, float 0x3E54AE0C00000000, float %.0.i1984) #6, !dbg !36 + %.01.i1986 = select i1 %.not4.i1985, float %5270, float %5269, !dbg !36 + %5271 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1986) #6, !dbg !36 + %5272 = extractelement <2 x float> %5263, i64 1, !dbg !36 + %5273 = fadd float %5272, 0xC168000FE0000000, !dbg !36 + %5274 = fneg float %5273, !dbg !36 + %5275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3FF7154760000000, float %5274) #6, !dbg !36 + %5276 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3FF7154760000000, float %5274) #6, !dbg !36 + %.0.i1994 = select i1 %.not3.i1993, float %5276, float %5275, !dbg !36 + %5277 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3784, float 0x3E54AE0C00000000, float %.0.i1994) #6, !dbg !36 + %5278 = tail call float @llvm.nvvm.fma.rn.f(float %3784, float 0x3E54AE0C00000000, float %.0.i1994) #6, !dbg !36 + %.01.i1996 = select i1 %.not4.i1995, float %5278, float %5277, !dbg !36 + %5279 = bitcast <2 x float> %5263 to <2 x i32>, !dbg !36 + %5280 = shl <2 x i32> %5279, splat (i32 23), !dbg !36 + %5281 = bitcast <2 x i32> %5280 to <2 x float>, !dbg !36 + %5282 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1996) #6, !dbg !36 + %5283 = insertelement <2 x float> poison, float %5271, i64 0, !dbg !36 + %5284 = insertelement <2 x float> %5283, float %5282, i64 1, !dbg !36 + %5285 = fmul <2 x float> %5284, %5281, !dbg !36 + %5286 = fmul <2 x float> %5255, %5285, !dbg !37 + %5287 = fcmp oeq <2 x bfloat> %206, splat (bfloat 0xRFF80), !dbg !16 + %5288 = insertelement <2 x i32> poison, i32 %1707, i64 0, !dbg !21 + %5289 = insertelement <2 x i32> %5288, i32 %1718, i64 1, !dbg !21 + %5290 = icmp eq <2 x i32> %5289, zeroinitializer, !dbg !21 + %5291 = insertelement <2 x float> poison, float %1709, i64 0, !dbg !21 + %5292 = insertelement <2 x float> %5291, float %1720, i64 1, !dbg !21 + %5293 = insertelement <2 x float> poison, float %1708, i64 0, !dbg !21 + %5294 = insertelement <2 x float> %5293, float %1719, i64 1, !dbg !21 + %5295 = select <2 x i1> %5290, <2 x float> %5292, <2 x float> %5294, !dbg !21 + %5296 = extractelement <2 x float> %5295, i64 0, !dbg !21 + %5297 = fadd float %5296, 0xC168000FE0000000, !dbg !21 + %5298 = fneg float %5297, !dbg !21 + %5299 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3FF7154760000000, float %5298) #6, !dbg !21 + %5300 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3FF7154760000000, float %5298) #6, !dbg !21 + %.0.i724 = select i1 %.not3.i723, float %5300, float %5299, !dbg !21 + %5301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %399, float 0x3E54AE0C00000000, float %.0.i724) #6, !dbg !21 + %5302 = tail call float @llvm.nvvm.fma.rn.f(float %399, float 0x3E54AE0C00000000, float %.0.i724) #6, !dbg !21 + %.01.i726 = select i1 %.not4.i725, float %5302, float %5301, !dbg !21 + %5303 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i726) #6, !dbg !21 + %5304 = extractelement <2 x float> %5295, i64 1, !dbg !21 + %5305 = fadd float %5304, 0xC168000FE0000000, !dbg !21 + %5306 = fneg float %5305, !dbg !21 + %5307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3FF7154760000000, float %5306) #6, !dbg !21 + %5308 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3FF7154760000000, float %5306) #6, !dbg !21 + %.0.i734 = select i1 %.not3.i733, float %5308, float %5307, !dbg !21 + %5309 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %400, float 0x3E54AE0C00000000, float %.0.i734) #6, !dbg !21 + %5310 = tail call float @llvm.nvvm.fma.rn.f(float %400, float 0x3E54AE0C00000000, float %.0.i734) #6, !dbg !21 + %.01.i736 = select i1 %.not4.i735, float %5310, float %5309, !dbg !21 + %5311 = bitcast <2 x float> %5295 to <2 x i32>, !dbg !21 + %5312 = shl <2 x i32> %5311, splat (i32 23), !dbg !21 + %5313 = bitcast <2 x i32> %5312 to <2 x float>, !dbg !21 + %5314 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i736) #6, !dbg !21 + %5315 = insertelement <2 x float> poison, float %5303, i64 0, !dbg !21 + %5316 = insertelement <2 x float> %5315, float %5314, i64 1, !dbg !21 + %5317 = fmul <2 x float> %5316, %5313, !dbg !21 + %5318 = insertelement <2 x i32> poison, i32 %3113, i64 0, !dbg !21 + %5319 = insertelement <2 x i32> %5318, i32 %3124, i64 1, !dbg !21 + %5320 = icmp eq <2 x i32> %5319, zeroinitializer, !dbg !21 + %5321 = insertelement <2 x float> poison, float %3115, i64 0, !dbg !21 + %5322 = insertelement <2 x float> %5321, float %3126, i64 1, !dbg !21 + %5323 = insertelement <2 x float> poison, float %3114, i64 0, !dbg !21 + %5324 = insertelement <2 x float> %5323, float %3125, i64 1, !dbg !21 + %5325 = select <2 x i1> %5320, <2 x float> %5322, <2 x float> %5324, !dbg !21 + %5326 = extractelement <2 x float> %5325, i64 0, !dbg !21 + %5327 = fadd float %5326, 0xC168000FE0000000, !dbg !21 + %5328 = fneg float %5327, !dbg !21 + %5329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3FF7154760000000, float %5328) #6, !dbg !21 + %5330 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3FF7154760000000, float %5328) #6, !dbg !21 + %.0.i1364 = select i1 %.not3.i1363, float %5330, float %5329, !dbg !21 + %5331 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1805, float 0x3E54AE0C00000000, float %.0.i1364) #6, !dbg !21 + %5332 = tail call float @llvm.nvvm.fma.rn.f(float %1805, float 0x3E54AE0C00000000, float %.0.i1364) #6, !dbg !21 + %.01.i1366 = select i1 %.not4.i1365, float %5332, float %5331, !dbg !21 + %5333 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1366) #6, !dbg !21 + %5334 = extractelement <2 x float> %5325, i64 1, !dbg !21 + %5335 = fadd float %5334, 0xC168000FE0000000, !dbg !21 + %5336 = fneg float %5335, !dbg !21 + %5337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3FF7154760000000, float %5336) #6, !dbg !21 + %5338 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3FF7154760000000, float %5336) #6, !dbg !21 + %.0.i1374 = select i1 %.not3.i1373, float %5338, float %5337, !dbg !21 + %5339 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1806, float 0x3E54AE0C00000000, float %.0.i1374) #6, !dbg !21 + %5340 = tail call float @llvm.nvvm.fma.rn.f(float %1806, float 0x3E54AE0C00000000, float %.0.i1374) #6, !dbg !21 + %.01.i1376 = select i1 %.not4.i1375, float %5340, float %5339, !dbg !21 + %5341 = bitcast <2 x float> %5325 to <2 x i32>, !dbg !21 + %5342 = shl <2 x i32> %5341, splat (i32 23), !dbg !21 + %5343 = bitcast <2 x i32> %5342 to <2 x float>, !dbg !21 + %5344 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1376) #6, !dbg !21 + %5345 = insertelement <2 x float> poison, float %5333, i64 0, !dbg !21 + %5346 = insertelement <2 x float> %5345, float %5344, i64 1, !dbg !21 + %5347 = fmul <2 x float> %5346, %5343, !dbg !21 + %5348 = fmul <2 x float> %5317, zeroinitializer, !dbg !23 + %5349 = fadd <2 x float> %5348, %5347, !dbg !24 + %5350 = select <2 x i1> %5287, <2 x float> splat (float 1.000000e+00), <2 x float> %5349, !dbg !24 + %5351 = select <2 x i1> %5254, <2 x float> %5350, <2 x float> zeroinitializer, !dbg !26 + %5352 = insertelement <2 x i32> poison, i32 %5093, i64 0, !dbg !36 + %5353 = insertelement <2 x i32> %5352, i32 %5104, i64 1, !dbg !36 + %5354 = icmp eq <2 x i32> %5353, zeroinitializer, !dbg !36 + %5355 = insertelement <2 x float> poison, float %5095, i64 0, !dbg !36 + %5356 = insertelement <2 x float> %5355, float %5106, i64 1, !dbg !36 + %5357 = insertelement <2 x float> poison, float %5094, i64 0, !dbg !36 + %5358 = insertelement <2 x float> %5357, float %5105, i64 1, !dbg !36 + %5359 = select <2 x i1> %5354, <2 x float> %5356, <2 x float> %5358, !dbg !36 + %5360 = extractelement <2 x float> %5359, i64 0, !dbg !36 + %5361 = fadd float %5360, 0xC168000FE0000000, !dbg !36 + %5362 = fneg float %5361, !dbg !36 + %5363 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3FF7154760000000, float %5362) #6, !dbg !36 + %5364 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3FF7154760000000, float %5362) #6, !dbg !36 + %.0.i2004 = select i1 %.not3.i2003, float %5364, float %5363, !dbg !36 + %5365 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3785, float 0x3E54AE0C00000000, float %.0.i2004) #6, !dbg !36 + %5366 = tail call float @llvm.nvvm.fma.rn.f(float %3785, float 0x3E54AE0C00000000, float %.0.i2004) #6, !dbg !36 + %.01.i2006 = select i1 %.not4.i2005, float %5366, float %5365, !dbg !36 + %5367 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2006) #6, !dbg !36 + %5368 = extractelement <2 x float> %5359, i64 1, !dbg !36 + %5369 = fadd float %5368, 0xC168000FE0000000, !dbg !36 + %5370 = fneg float %5369, !dbg !36 + %5371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3FF7154760000000, float %5370) #6, !dbg !36 + %5372 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3FF7154760000000, float %5370) #6, !dbg !36 + %.0.i2014 = select i1 %.not3.i2013, float %5372, float %5371, !dbg !36 + %5373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3786, float 0x3E54AE0C00000000, float %.0.i2014) #6, !dbg !36 + %5374 = tail call float @llvm.nvvm.fma.rn.f(float %3786, float 0x3E54AE0C00000000, float %.0.i2014) #6, !dbg !36 + %.01.i2016 = select i1 %.not4.i2015, float %5374, float %5373, !dbg !36 + %5375 = bitcast <2 x float> %5359 to <2 x i32>, !dbg !36 + %5376 = shl <2 x i32> %5375, splat (i32 23), !dbg !36 + %5377 = bitcast <2 x i32> %5376 to <2 x float>, !dbg !36 + %5378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2016) #6, !dbg !36 + %5379 = insertelement <2 x float> poison, float %5367, i64 0, !dbg !36 + %5380 = insertelement <2 x float> %5379, float %5378, i64 1, !dbg !36 + %5381 = fmul <2 x float> %5380, %5377, !dbg !36 + %5382 = fmul <2 x float> %5351, %5381, !dbg !37 + %5383 = fcmp oeq <2 x bfloat> %208, splat (bfloat 0xRFF80), !dbg !16 + %5384 = insertelement <2 x i32> poison, i32 %1729, i64 0, !dbg !21 + %5385 = insertelement <2 x i32> %5384, i32 %1740, i64 1, !dbg !21 + %5386 = icmp eq <2 x i32> %5385, zeroinitializer, !dbg !21 + %5387 = insertelement <2 x float> poison, float %1731, i64 0, !dbg !21 + %5388 = insertelement <2 x float> %5387, float %1742, i64 1, !dbg !21 + %5389 = insertelement <2 x float> poison, float %1730, i64 0, !dbg !21 + %5390 = insertelement <2 x float> %5389, float %1741, i64 1, !dbg !21 + %5391 = select <2 x i1> %5386, <2 x float> %5388, <2 x float> %5390, !dbg !21 + %5392 = extractelement <2 x float> %5391, i64 0, !dbg !21 + %5393 = fadd float %5392, 0xC168000FE0000000, !dbg !21 + %5394 = fneg float %5393, !dbg !21 + %5395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3FF7154760000000, float %5394) #6, !dbg !21 + %5396 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3FF7154760000000, float %5394) #6, !dbg !21 + %.0.i744 = select i1 %.not3.i743, float %5396, float %5395, !dbg !21 + %5397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %401, float 0x3E54AE0C00000000, float %.0.i744) #6, !dbg !21 + %5398 = tail call float @llvm.nvvm.fma.rn.f(float %401, float 0x3E54AE0C00000000, float %.0.i744) #6, !dbg !21 + %.01.i746 = select i1 %.not4.i745, float %5398, float %5397, !dbg !21 + %5399 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i746) #6, !dbg !21 + %5400 = extractelement <2 x float> %5391, i64 1, !dbg !21 + %5401 = fadd float %5400, 0xC168000FE0000000, !dbg !21 + %5402 = fneg float %5401, !dbg !21 + %5403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3FF7154760000000, float %5402) #6, !dbg !21 + %5404 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3FF7154760000000, float %5402) #6, !dbg !21 + %.0.i754 = select i1 %.not3.i753, float %5404, float %5403, !dbg !21 + %5405 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %402, float 0x3E54AE0C00000000, float %.0.i754) #6, !dbg !21 + %5406 = tail call float @llvm.nvvm.fma.rn.f(float %402, float 0x3E54AE0C00000000, float %.0.i754) #6, !dbg !21 + %.01.i756 = select i1 %.not4.i755, float %5406, float %5405, !dbg !21 + %5407 = bitcast <2 x float> %5391 to <2 x i32>, !dbg !21 + %5408 = shl <2 x i32> %5407, splat (i32 23), !dbg !21 + %5409 = bitcast <2 x i32> %5408 to <2 x float>, !dbg !21 + %5410 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i756) #6, !dbg !21 + %5411 = insertelement <2 x float> poison, float %5399, i64 0, !dbg !21 + %5412 = insertelement <2 x float> %5411, float %5410, i64 1, !dbg !21 + %5413 = fmul <2 x float> %5412, %5409, !dbg !21 + %5414 = insertelement <2 x i32> poison, i32 %3135, i64 0, !dbg !21 + %5415 = insertelement <2 x i32> %5414, i32 %3146, i64 1, !dbg !21 + %5416 = icmp eq <2 x i32> %5415, zeroinitializer, !dbg !21 + %5417 = insertelement <2 x float> poison, float %3137, i64 0, !dbg !21 + %5418 = insertelement <2 x float> %5417, float %3148, i64 1, !dbg !21 + %5419 = insertelement <2 x float> poison, float %3136, i64 0, !dbg !21 + %5420 = insertelement <2 x float> %5419, float %3147, i64 1, !dbg !21 + %5421 = select <2 x i1> %5416, <2 x float> %5418, <2 x float> %5420, !dbg !21 + %5422 = extractelement <2 x float> %5421, i64 0, !dbg !21 + %5423 = fadd float %5422, 0xC168000FE0000000, !dbg !21 + %5424 = fneg float %5423, !dbg !21 + %5425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3FF7154760000000, float %5424) #6, !dbg !21 + %5426 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3FF7154760000000, float %5424) #6, !dbg !21 + %.0.i1384 = select i1 %.not3.i1383, float %5426, float %5425, !dbg !21 + %5427 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1807, float 0x3E54AE0C00000000, float %.0.i1384) #6, !dbg !21 + %5428 = tail call float @llvm.nvvm.fma.rn.f(float %1807, float 0x3E54AE0C00000000, float %.0.i1384) #6, !dbg !21 + %.01.i1386 = select i1 %.not4.i1385, float %5428, float %5427, !dbg !21 + %5429 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1386) #6, !dbg !21 + %5430 = extractelement <2 x float> %5421, i64 1, !dbg !21 + %5431 = fadd float %5430, 0xC168000FE0000000, !dbg !21 + %5432 = fneg float %5431, !dbg !21 + %5433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3FF7154760000000, float %5432) #6, !dbg !21 + %5434 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3FF7154760000000, float %5432) #6, !dbg !21 + %.0.i1394 = select i1 %.not3.i1393, float %5434, float %5433, !dbg !21 + %5435 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1808, float 0x3E54AE0C00000000, float %.0.i1394) #6, !dbg !21 + %5436 = tail call float @llvm.nvvm.fma.rn.f(float %1808, float 0x3E54AE0C00000000, float %.0.i1394) #6, !dbg !21 + %.01.i1396 = select i1 %.not4.i1395, float %5436, float %5435, !dbg !21 + %5437 = bitcast <2 x float> %5421 to <2 x i32>, !dbg !21 + %5438 = shl <2 x i32> %5437, splat (i32 23), !dbg !21 + %5439 = bitcast <2 x i32> %5438 to <2 x float>, !dbg !21 + %5440 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1396) #6, !dbg !21 + %5441 = insertelement <2 x float> poison, float %5429, i64 0, !dbg !21 + %5442 = insertelement <2 x float> %5441, float %5440, i64 1, !dbg !21 + %5443 = fmul <2 x float> %5442, %5439, !dbg !21 + %5444 = fmul <2 x float> %5413, zeroinitializer, !dbg !23 + %5445 = fadd <2 x float> %5444, %5443, !dbg !24 + %5446 = select <2 x i1> %5383, <2 x float> splat (float 1.000000e+00), <2 x float> %5445, !dbg !24 + %5447 = select <2 x i1> %5254, <2 x float> %5446, <2 x float> zeroinitializer, !dbg !26 + %5448 = insertelement <2 x i32> poison, i32 %5115, i64 0, !dbg !36 + %5449 = insertelement <2 x i32> %5448, i32 %5126, i64 1, !dbg !36 + %5450 = icmp eq <2 x i32> %5449, zeroinitializer, !dbg !36 + %5451 = insertelement <2 x float> poison, float %5117, i64 0, !dbg !36 + %5452 = insertelement <2 x float> %5451, float %5128, i64 1, !dbg !36 + %5453 = insertelement <2 x float> poison, float %5116, i64 0, !dbg !36 + %5454 = insertelement <2 x float> %5453, float %5127, i64 1, !dbg !36 + %5455 = select <2 x i1> %5450, <2 x float> %5452, <2 x float> %5454, !dbg !36 + %5456 = extractelement <2 x float> %5455, i64 0, !dbg !36 + %5457 = fadd float %5456, 0xC168000FE0000000, !dbg !36 + %5458 = fneg float %5457, !dbg !36 + %5459 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3FF7154760000000, float %5458) #6, !dbg !36 + %5460 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3FF7154760000000, float %5458) #6, !dbg !36 + %.0.i2024 = select i1 %.not3.i2023, float %5460, float %5459, !dbg !36 + %5461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3787, float 0x3E54AE0C00000000, float %.0.i2024) #6, !dbg !36 + %5462 = tail call float @llvm.nvvm.fma.rn.f(float %3787, float 0x3E54AE0C00000000, float %.0.i2024) #6, !dbg !36 + %.01.i2026 = select i1 %.not4.i2025, float %5462, float %5461, !dbg !36 + %5463 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2026) #6, !dbg !36 + %5464 = extractelement <2 x float> %5455, i64 1, !dbg !36 + %5465 = fadd float %5464, 0xC168000FE0000000, !dbg !36 + %5466 = fneg float %5465, !dbg !36 + %5467 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3FF7154760000000, float %5466) #6, !dbg !36 + %5468 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3FF7154760000000, float %5466) #6, !dbg !36 + %.0.i2034 = select i1 %.not3.i2033, float %5468, float %5467, !dbg !36 + %5469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3788, float 0x3E54AE0C00000000, float %.0.i2034) #6, !dbg !36 + %5470 = tail call float @llvm.nvvm.fma.rn.f(float %3788, float 0x3E54AE0C00000000, float %.0.i2034) #6, !dbg !36 + %.01.i2036 = select i1 %.not4.i2035, float %5470, float %5469, !dbg !36 + %5471 = bitcast <2 x float> %5455 to <2 x i32>, !dbg !36 + %5472 = shl <2 x i32> %5471, splat (i32 23), !dbg !36 + %5473 = bitcast <2 x i32> %5472 to <2 x float>, !dbg !36 + %5474 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2036) #6, !dbg !36 + %5475 = insertelement <2 x float> poison, float %5463, i64 0, !dbg !36 + %5476 = insertelement <2 x float> %5475, float %5474, i64 1, !dbg !36 + %5477 = fmul <2 x float> %5476, %5473, !dbg !36 + %5478 = fmul <2 x float> %5447, %5477, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5479 = fadd float %5131, %5132, !dbg !41 + %5480 = fadd float %5479, %5133, !dbg !41 + %5481 = fadd float %5480, %5134, !dbg !41 + %5482 = fadd float %5481, %5135, !dbg !41 + %5483 = fadd float %5482, %5136, !dbg !41 + %5484 = fadd float %5483, %5137, !dbg !41 + %5485 = fadd float %5484, %5138, !dbg !41 + %5486 = fadd float %5485, %5139, !dbg !41 + %5487 = fadd float %5486, %5140, !dbg !41 + %5488 = fadd float %5487, %5141, !dbg !41 + %5489 = fadd float %5488, %5142, !dbg !41 + %5490 = fadd float %5489, %5143, !dbg !41 + %5491 = fadd float %5490, %5144, !dbg !41 + %5492 = fadd float %5491, %5145, !dbg !41 + %5493 = fadd float %5492, %5146, !dbg !41 + %5494 = fadd float %5493, %5147, !dbg !41 + %5495 = fadd float %5494, %5148, !dbg !41 + %5496 = fadd float %5495, %5149, !dbg !41 + %5497 = fadd float %5496, %5150, !dbg !41 + %5498 = fadd float %5497, %5151, !dbg !41 + %5499 = fadd float %5498, %5152, !dbg !41 + %5500 = fadd float %5499, %5153, !dbg !41 + %5501 = fadd float %5500, %5154, !dbg !41 + %5502 = fadd float %5501, %5155, !dbg !41 + %5503 = fadd float %5502, %5156, !dbg !41 + %5504 = fadd float %5503, %5157, !dbg !41 + %5505 = fadd float %5504, %5158, !dbg !41 + %5506 = fadd float %5505, %5159, !dbg !41 + %5507 = fadd float %5506, %5160, !dbg !41 + %5508 = fadd float %5507, %5161, !dbg !41 + %5509 = fadd float %5508, %5162, !dbg !41 + %5510 = fadd float %5509, %5163, !dbg !41 + %5511 = fadd float %5510, %5164, !dbg !41 + %5512 = fadd float %5511, %5165, !dbg !41 + %5513 = fadd float %5512, %5166, !dbg !41 + %5514 = fadd float %5513, %5167, !dbg !41 + %5515 = fadd float %5514, %5168, !dbg !41 + %5516 = fadd float %5515, %5169, !dbg !41 + %5517 = fadd float %5516, %5170, !dbg !41 + %5518 = fadd float %5517, %5171, !dbg !41 + %5519 = fadd float %5518, %5172, !dbg !41 + %5520 = fadd float %5519, %5173, !dbg !41 + %5521 = fadd float %5520, %5174, !dbg !41 + %5522 = fadd float %5521, %5175, !dbg !41 + %5523 = fadd float %5522, %5176, !dbg !41 + %5524 = fadd float %5523, %5177, !dbg !41 + %5525 = fadd float %5524, %5178, !dbg !41 + %5526 = fadd float %5525, %5179, !dbg !41 + %5527 = fadd float %5526, %5180, !dbg !41 + %5528 = fadd float %5527, %5181, !dbg !41 + %5529 = fadd float %5528, %5182, !dbg !41 + %5530 = fadd float %5529, %5183, !dbg !41 + %5531 = fadd float %5530, %5184, !dbg !41 + %5532 = fadd float %5531, %5185, !dbg !41 + %5533 = fadd float %5532, %5186, !dbg !41 + %5534 = fadd float %5533, %5187, !dbg !41 + %5535 = fadd float %5534, %5188, !dbg !41 + %5536 = extractelement <2 x float> %5286, i64 0, !dbg !41 + %5537 = fadd float %5535, %5536, !dbg !41 + %5538 = extractelement <2 x float> %5286, i64 1, !dbg !41 + %5539 = fadd float %5537, %5538, !dbg !41 + %5540 = extractelement <2 x float> %5382, i64 0, !dbg !41 + %5541 = fadd float %5539, %5540, !dbg !41 + %5542 = extractelement <2 x float> %5382, i64 1, !dbg !41 + %5543 = fadd float %5541, %5542, !dbg !41 + %5544 = extractelement <2 x float> %5478, i64 0, !dbg !41 + %5545 = fadd float %5543, %5544, !dbg !41 + %5546 = extractelement <2 x float> %5478, i64 1, !dbg !41 + %5547 = fadd float %5545, %5546, !dbg !41 + %5548 = bitcast float %5547 to i32, !dbg !38 + %5549 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5548, i32 16, i32 31), !dbg !38 + %5550 = bitcast i32 %5549 to float, !dbg !38 + %5551 = fadd float %5547, %5550, !dbg !41 + %5552 = bitcast float %5551 to i32, !dbg !38 + %5553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5552, i32 8, i32 31), !dbg !38 + %5554 = bitcast i32 %5553 to float, !dbg !38 + %5555 = fadd float %5551, %5554, !dbg !41 + %5556 = bitcast float %5555 to i32, !dbg !38 + %5557 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5556, i32 4, i32 31), !dbg !38 + %5558 = bitcast i32 %5557 to float, !dbg !38 + %5559 = fadd float %5555, %5558, !dbg !41 + %5560 = bitcast float %5559 to i32, !dbg !38 + %5561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5560, i32 2, i32 31), !dbg !38 + %5562 = bitcast i32 %5561 to float, !dbg !38 + %5563 = fadd float %5559, %5562, !dbg !41 + %5564 = bitcast float %5563 to i32, !dbg !38 + %5565 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5564, i32 1, i32 31), !dbg !38 + %5566 = bitcast i32 %5565 to float, !dbg !38 + %5567 = fadd float %5563, %5566, !dbg !41 + %5568 = bitcast float %5567 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3623, <1 x i32> %5568, i1 %3622) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5569 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %3627, i1 %3626) #6, !dbg !38 + %5570 = bitcast i32 %5569 to float, !dbg !38 + %5571 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5569, i32 8, i32 31), !dbg !38 + %5572 = bitcast i32 %5571 to float, !dbg !38 + %5573 = fadd float %5570, %5572, !dbg !41 + %5574 = bitcast float %5573 to i32, !dbg !38 + %5575 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5574, i32 4, i32 31), !dbg !38 + %5576 = bitcast i32 %5575 to float, !dbg !38 + %5577 = fadd float %5573, %5576, !dbg !41 + %5578 = bitcast float %5577 to i32, !dbg !38 + %5579 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5578, i32 2, i32 31), !dbg !38 + %5580 = bitcast i32 %5579 to float, !dbg !38 + %5581 = fadd float %5577, %5580, !dbg !41 + %5582 = bitcast float %5581 to i32, !dbg !38 + %5583 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %5582, i32 1, i32 31), !dbg !38 + %5584 = bitcast i32 %5583 to float, !dbg !38 + %5585 = fadd float %5581, %5584, !dbg !41 + %5586 = bitcast float %5585 to <1 x i32>, !dbg !38 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %3627, <1 x i32> %5586, i1 %3656) #6, !dbg !38 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !38 + %5587 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !38 + %5588 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5589 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %58, i64 %5588, i1 true) #6, !dbg !42 + %5590 = extractvalue { i32, i32, i32, i32 } %5589, 0, !dbg !42 + %5591 = bitcast i32 %5590 to <2 x bfloat>, !dbg !42 + %5592 = extractvalue { i32, i32, i32, i32 } %5589, 1, !dbg !42 + %5593 = bitcast i32 %5592 to <2 x bfloat>, !dbg !42 + %5594 = extractvalue { i32, i32, i32, i32 } %5589, 2, !dbg !42 + %5595 = bitcast i32 %5594 to <2 x bfloat>, !dbg !42 + %5596 = extractvalue { i32, i32, i32, i32 } %5589, 3, !dbg !42 + %5597 = bitcast i32 %5596 to <2 x bfloat>, !dbg !42 + %5598 = extractelement <2 x bfloat> %5591, i64 0, !dbg !42 + %5599 = extractelement <2 x bfloat> %5591, i64 1, !dbg !42 + %5600 = extractelement <2 x bfloat> %5593, i64 0, !dbg !42 + %5601 = extractelement <2 x bfloat> %5593, i64 1, !dbg !42 + %5602 = extractelement <2 x bfloat> %5595, i64 0, !dbg !42 + %5603 = extractelement <2 x bfloat> %5595, i64 1, !dbg !42 + %5604 = extractelement <2 x bfloat> %5597, i64 0, !dbg !42 + %5605 = extractelement <2 x bfloat> %5597, i64 1, !dbg !42 + %5606 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5607 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %60, i64 %5606, i1 true) #6, !dbg !42 + %5608 = extractvalue { i32, i32, i32, i32 } %5607, 0, !dbg !42 + %5609 = bitcast i32 %5608 to <2 x bfloat>, !dbg !42 + %5610 = extractvalue { i32, i32, i32, i32 } %5607, 1, !dbg !42 + %5611 = bitcast i32 %5610 to <2 x bfloat>, !dbg !42 + %5612 = extractvalue { i32, i32, i32, i32 } %5607, 2, !dbg !42 + %5613 = bitcast i32 %5612 to <2 x bfloat>, !dbg !42 + %5614 = extractvalue { i32, i32, i32, i32 } %5607, 3, !dbg !42 + %5615 = bitcast i32 %5614 to <2 x bfloat>, !dbg !42 + %5616 = extractelement <2 x bfloat> %5609, i64 0, !dbg !42 + %5617 = extractelement <2 x bfloat> %5609, i64 1, !dbg !42 + %5618 = extractelement <2 x bfloat> %5611, i64 0, !dbg !42 + %5619 = extractelement <2 x bfloat> %5611, i64 1, !dbg !42 + %5620 = extractelement <2 x bfloat> %5613, i64 0, !dbg !42 + %5621 = extractelement <2 x bfloat> %5613, i64 1, !dbg !42 + %5622 = extractelement <2 x bfloat> %5615, i64 0, !dbg !42 + %5623 = extractelement <2 x bfloat> %5615, i64 1, !dbg !42 + %5624 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5625 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %62, i64 %5624, i1 true) #6, !dbg !42 + %5626 = extractvalue { i32, i32, i32, i32 } %5625, 0, !dbg !42 + %5627 = bitcast i32 %5626 to <2 x bfloat>, !dbg !42 + %5628 = extractvalue { i32, i32, i32, i32 } %5625, 1, !dbg !42 + %5629 = bitcast i32 %5628 to <2 x bfloat>, !dbg !42 + %5630 = extractvalue { i32, i32, i32, i32 } %5625, 2, !dbg !42 + %5631 = bitcast i32 %5630 to <2 x bfloat>, !dbg !42 + %5632 = extractvalue { i32, i32, i32, i32 } %5625, 3, !dbg !42 + %5633 = bitcast i32 %5632 to <2 x bfloat>, !dbg !42 + %5634 = extractelement <2 x bfloat> %5627, i64 0, !dbg !42 + %5635 = extractelement <2 x bfloat> %5627, i64 1, !dbg !42 + %5636 = extractelement <2 x bfloat> %5629, i64 0, !dbg !42 + %5637 = extractelement <2 x bfloat> %5629, i64 1, !dbg !42 + %5638 = extractelement <2 x bfloat> %5631, i64 0, !dbg !42 + %5639 = extractelement <2 x bfloat> %5631, i64 1, !dbg !42 + %5640 = extractelement <2 x bfloat> %5633, i64 0, !dbg !42 + %5641 = extractelement <2 x bfloat> %5633, i64 1, !dbg !42 + %5642 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5643 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %64, i64 %5642, i1 true) #6, !dbg !42 + %5644 = extractvalue { i32, i32, i32, i32 } %5643, 0, !dbg !42 + %5645 = bitcast i32 %5644 to <2 x bfloat>, !dbg !42 + %5646 = extractvalue { i32, i32, i32, i32 } %5643, 1, !dbg !42 + %5647 = bitcast i32 %5646 to <2 x bfloat>, !dbg !42 + %5648 = extractvalue { i32, i32, i32, i32 } %5643, 2, !dbg !42 + %5649 = bitcast i32 %5648 to <2 x bfloat>, !dbg !42 + %5650 = extractvalue { i32, i32, i32, i32 } %5643, 3, !dbg !42 + %5651 = bitcast i32 %5650 to <2 x bfloat>, !dbg !42 + %5652 = extractelement <2 x bfloat> %5645, i64 0, !dbg !42 + %5653 = extractelement <2 x bfloat> %5645, i64 1, !dbg !42 + %5654 = extractelement <2 x bfloat> %5647, i64 0, !dbg !42 + %5655 = extractelement <2 x bfloat> %5647, i64 1, !dbg !42 + %5656 = extractelement <2 x bfloat> %5649, i64 0, !dbg !42 + %5657 = extractelement <2 x bfloat> %5649, i64 1, !dbg !42 + %5658 = extractelement <2 x bfloat> %5651, i64 0, !dbg !42 + %5659 = extractelement <2 x bfloat> %5651, i64 1, !dbg !42 + %5660 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5661 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %66, i64 %5660, i1 true) #6, !dbg !42 + %5662 = extractvalue { i32, i32, i32, i32 } %5661, 0, !dbg !42 + %5663 = bitcast i32 %5662 to <2 x bfloat>, !dbg !42 + %5664 = extractvalue { i32, i32, i32, i32 } %5661, 1, !dbg !42 + %5665 = bitcast i32 %5664 to <2 x bfloat>, !dbg !42 + %5666 = extractvalue { i32, i32, i32, i32 } %5661, 2, !dbg !42 + %5667 = bitcast i32 %5666 to <2 x bfloat>, !dbg !42 + %5668 = extractvalue { i32, i32, i32, i32 } %5661, 3, !dbg !42 + %5669 = bitcast i32 %5668 to <2 x bfloat>, !dbg !42 + %5670 = extractelement <2 x bfloat> %5663, i64 0, !dbg !42 + %5671 = extractelement <2 x bfloat> %5663, i64 1, !dbg !42 + %5672 = extractelement <2 x bfloat> %5665, i64 0, !dbg !42 + %5673 = extractelement <2 x bfloat> %5665, i64 1, !dbg !42 + %5674 = extractelement <2 x bfloat> %5667, i64 0, !dbg !42 + %5675 = extractelement <2 x bfloat> %5667, i64 1, !dbg !42 + %5676 = extractelement <2 x bfloat> %5669, i64 0, !dbg !42 + %5677 = extractelement <2 x bfloat> %5669, i64 1, !dbg !42 + %5678 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5679 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %68, i64 %5678, i1 true) #6, !dbg !42 + %5680 = extractvalue { i32, i32, i32, i32 } %5679, 0, !dbg !42 + %5681 = bitcast i32 %5680 to <2 x bfloat>, !dbg !42 + %5682 = extractvalue { i32, i32, i32, i32 } %5679, 1, !dbg !42 + %5683 = bitcast i32 %5682 to <2 x bfloat>, !dbg !42 + %5684 = extractvalue { i32, i32, i32, i32 } %5679, 2, !dbg !42 + %5685 = bitcast i32 %5684 to <2 x bfloat>, !dbg !42 + %5686 = extractvalue { i32, i32, i32, i32 } %5679, 3, !dbg !42 + %5687 = bitcast i32 %5686 to <2 x bfloat>, !dbg !42 + %5688 = extractelement <2 x bfloat> %5681, i64 0, !dbg !42 + %5689 = extractelement <2 x bfloat> %5681, i64 1, !dbg !42 + %5690 = extractelement <2 x bfloat> %5683, i64 0, !dbg !42 + %5691 = extractelement <2 x bfloat> %5683, i64 1, !dbg !42 + %5692 = extractelement <2 x bfloat> %5685, i64 0, !dbg !42 + %5693 = extractelement <2 x bfloat> %5685, i64 1, !dbg !42 + %5694 = extractelement <2 x bfloat> %5687, i64 0, !dbg !42 + %5695 = extractelement <2 x bfloat> %5687, i64 1, !dbg !42 + %5696 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5697 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %70, i64 %5696, i1 true) #6, !dbg !42 + %5698 = extractvalue { i32, i32, i32, i32 } %5697, 0, !dbg !42 + %5699 = bitcast i32 %5698 to <2 x bfloat>, !dbg !42 + %5700 = extractvalue { i32, i32, i32, i32 } %5697, 1, !dbg !42 + %5701 = bitcast i32 %5700 to <2 x bfloat>, !dbg !42 + %5702 = extractvalue { i32, i32, i32, i32 } %5697, 2, !dbg !42 + %5703 = bitcast i32 %5702 to <2 x bfloat>, !dbg !42 + %5704 = extractvalue { i32, i32, i32, i32 } %5697, 3, !dbg !42 + %5705 = bitcast i32 %5704 to <2 x bfloat>, !dbg !42 + %5706 = extractelement <2 x bfloat> %5699, i64 0, !dbg !42 + %5707 = extractelement <2 x bfloat> %5699, i64 1, !dbg !42 + %5708 = extractelement <2 x bfloat> %5701, i64 0, !dbg !42 + %5709 = extractelement <2 x bfloat> %5701, i64 1, !dbg !42 + %5710 = extractelement <2 x bfloat> %5703, i64 0, !dbg !42 + %5711 = extractelement <2 x bfloat> %5703, i64 1, !dbg !42 + %5712 = extractelement <2 x bfloat> %5705, i64 0, !dbg !42 + %5713 = extractelement <2 x bfloat> %5705, i64 1, !dbg !42 + %5714 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !42 + %5715 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %72, i64 %5714, i1 %16) #6, !dbg !42 + %5716 = extractvalue { i32, i32, i32, i32 } %5715, 0, !dbg !42 + %5717 = bitcast i32 %5716 to <2 x bfloat>, !dbg !42 + %5718 = extractvalue { i32, i32, i32, i32 } %5715, 1, !dbg !42 + %5719 = bitcast i32 %5718 to <2 x bfloat>, !dbg !42 + %5720 = extractvalue { i32, i32, i32, i32 } %5715, 2, !dbg !42 + %5721 = bitcast i32 %5720 to <2 x bfloat>, !dbg !42 + %5722 = extractvalue { i32, i32, i32, i32 } %5715, 3, !dbg !42 + %5723 = bitcast i32 %5722 to <2 x bfloat>, !dbg !42 + %5724 = extractelement <2 x bfloat> %5717, i64 0, !dbg !42 + %5725 = extractelement <2 x bfloat> %5717, i64 1, !dbg !42 + %5726 = extractelement <2 x bfloat> %5719, i64 0, !dbg !42 + %5727 = extractelement <2 x bfloat> %5719, i64 1, !dbg !42 + %5728 = extractelement <2 x bfloat> %5721, i64 0, !dbg !42 + %5729 = extractelement <2 x bfloat> %5721, i64 1, !dbg !42 + %5730 = extractelement <2 x bfloat> %5723, i64 0, !dbg !42 + %5731 = extractelement <2 x bfloat> %5723, i64 1, !dbg !42 + %5732 = fpext bfloat %5598 to float, !dbg !43 + %5733 = fpext bfloat %5599 to float, !dbg !43 + %5734 = fpext bfloat %5600 to float, !dbg !43 + %5735 = fpext bfloat %5601 to float, !dbg !43 + %5736 = fpext bfloat %5602 to float, !dbg !43 + %5737 = fpext bfloat %5603 to float, !dbg !43 + %5738 = fpext bfloat %5604 to float, !dbg !43 + %5739 = fpext bfloat %5605 to float, !dbg !43 + %5740 = fpext bfloat %5616 to float, !dbg !43 + %5741 = fpext bfloat %5617 to float, !dbg !43 + %5742 = fpext bfloat %5618 to float, !dbg !43 + %5743 = fpext bfloat %5619 to float, !dbg !43 + %5744 = fpext bfloat %5620 to float, !dbg !43 + %5745 = fpext bfloat %5621 to float, !dbg !43 + %5746 = fpext bfloat %5622 to float, !dbg !43 + %5747 = fpext bfloat %5623 to float, !dbg !43 + %5748 = fpext bfloat %5634 to float, !dbg !43 + %5749 = fpext bfloat %5635 to float, !dbg !43 + %5750 = fpext bfloat %5636 to float, !dbg !43 + %5751 = fpext bfloat %5637 to float, !dbg !43 + %5752 = fpext bfloat %5638 to float, !dbg !43 + %5753 = fpext bfloat %5639 to float, !dbg !43 + %5754 = fpext bfloat %5640 to float, !dbg !43 + %5755 = fpext bfloat %5641 to float, !dbg !43 + %5756 = fpext bfloat %5652 to float, !dbg !43 + %5757 = fpext bfloat %5653 to float, !dbg !43 + %5758 = fpext bfloat %5654 to float, !dbg !43 + %5759 = fpext bfloat %5655 to float, !dbg !43 + %5760 = fpext bfloat %5656 to float, !dbg !43 + %5761 = fpext bfloat %5657 to float, !dbg !43 + %5762 = fpext bfloat %5658 to float, !dbg !43 + %5763 = fpext bfloat %5659 to float, !dbg !43 + %5764 = fpext bfloat %5670 to float, !dbg !43 + %5765 = fpext bfloat %5671 to float, !dbg !43 + %5766 = fpext bfloat %5672 to float, !dbg !43 + %5767 = fpext bfloat %5673 to float, !dbg !43 + %5768 = fpext bfloat %5674 to float, !dbg !43 + %5769 = fpext bfloat %5675 to float, !dbg !43 + %5770 = fpext bfloat %5676 to float, !dbg !43 + %5771 = fpext bfloat %5677 to float, !dbg !43 + %5772 = fpext bfloat %5688 to float, !dbg !43 + %5773 = fpext bfloat %5689 to float, !dbg !43 + %5774 = fpext bfloat %5690 to float, !dbg !43 + %5775 = fpext bfloat %5691 to float, !dbg !43 + %5776 = fpext bfloat %5692 to float, !dbg !43 + %5777 = fpext bfloat %5693 to float, !dbg !43 + %5778 = fpext bfloat %5694 to float, !dbg !43 + %5779 = fpext bfloat %5695 to float, !dbg !43 + %5780 = fpext bfloat %5706 to float, !dbg !43 + %5781 = fpext bfloat %5707 to float, !dbg !43 + %5782 = fpext bfloat %5708 to float, !dbg !43 + %5783 = fpext bfloat %5709 to float, !dbg !43 + %5784 = fpext bfloat %5710 to float, !dbg !43 + %5785 = fpext bfloat %5711 to float, !dbg !43 + %5786 = fpext bfloat %5712 to float, !dbg !43 + %5787 = fpext bfloat %5713 to float, !dbg !43 + %5788 = fpext bfloat %5724 to float, !dbg !43 + %5789 = fpext bfloat %5725 to float, !dbg !43 + %5790 = fpext bfloat %5726 to float, !dbg !43 + %5791 = fpext bfloat %5727 to float, !dbg !43 + %5792 = fpext bfloat %5728 to float, !dbg !43 + %5793 = fpext bfloat %5729 to float, !dbg !43 + %5794 = fpext bfloat %5730 to float, !dbg !43 + %5795 = fpext bfloat %5731 to float, !dbg !43 + %5796 = fsub float %5732, %3659, !dbg !44 + %5797 = fsub float %5733, %3659, !dbg !44 + %5798 = fsub float %5734, %3659, !dbg !44 + %5799 = fsub float %5735, %3659, !dbg !44 + %5800 = fsub float %5736, %3659, !dbg !44 + %5801 = fsub float %5737, %3659, !dbg !44 + %5802 = fsub float %5738, %3659, !dbg !44 + %5803 = fsub float %5739, %3659, !dbg !44 + %5804 = fsub float %5740, %3659, !dbg !44 + %5805 = fsub float %5741, %3659, !dbg !44 + %5806 = fsub float %5742, %3659, !dbg !44 + %5807 = fsub float %5743, %3659, !dbg !44 + %5808 = fsub float %5744, %3659, !dbg !44 + %5809 = fsub float %5745, %3659, !dbg !44 + %5810 = fsub float %5746, %3659, !dbg !44 + %5811 = fsub float %5747, %3659, !dbg !44 + %5812 = fsub float %5748, %3659, !dbg !44 + %5813 = fsub float %5749, %3659, !dbg !44 + %5814 = fsub float %5750, %3659, !dbg !44 + %5815 = fsub float %5751, %3659, !dbg !44 + %5816 = fsub float %5752, %3659, !dbg !44 + %5817 = fsub float %5753, %3659, !dbg !44 + %5818 = fsub float %5754, %3659, !dbg !44 + %5819 = fsub float %5755, %3659, !dbg !44 + %5820 = fsub float %5756, %3659, !dbg !44 + %5821 = fsub float %5757, %3659, !dbg !44 + %5822 = fsub float %5758, %3659, !dbg !44 + %5823 = fsub float %5759, %3659, !dbg !44 + %5824 = fsub float %5760, %3659, !dbg !44 + %5825 = fsub float %5761, %3659, !dbg !44 + %5826 = fsub float %5762, %3659, !dbg !44 + %5827 = fsub float %5763, %3659, !dbg !44 + %5828 = fsub float %5764, %3659, !dbg !44 + %5829 = fsub float %5765, %3659, !dbg !44 + %5830 = fsub float %5766, %3659, !dbg !44 + %5831 = fsub float %5767, %3659, !dbg !44 + %5832 = fsub float %5768, %3659, !dbg !44 + %5833 = fsub float %5769, %3659, !dbg !44 + %5834 = fsub float %5770, %3659, !dbg !44 + %5835 = fsub float %5771, %3659, !dbg !44 + %5836 = fsub float %5772, %3659, !dbg !44 + %5837 = fsub float %5773, %3659, !dbg !44 + %5838 = fsub float %5774, %3659, !dbg !44 + %5839 = fsub float %5775, %3659, !dbg !44 + %5840 = fsub float %5776, %3659, !dbg !44 + %5841 = fsub float %5777, %3659, !dbg !44 + %5842 = fsub float %5778, %3659, !dbg !44 + %5843 = fsub float %5779, %3659, !dbg !44 + %5844 = fsub float %5780, %3659, !dbg !44 + %5845 = fsub float %5781, %3659, !dbg !44 + %5846 = fsub float %5782, %3659, !dbg !44 + %5847 = fsub float %5783, %3659, !dbg !44 + %5848 = fsub float %5784, %3659, !dbg !44 + %5849 = fsub float %5785, %3659, !dbg !44 + %5850 = fsub float %5786, %3659, !dbg !44 + %5851 = fsub float %5787, %3659, !dbg !44 + %5852 = fsub float %5788, %3659, !dbg !44 + %5853 = fsub float %5789, %3659, !dbg !44 + %5854 = fsub float %5790, %3659, !dbg !44 + %5855 = fsub float %5791, %3659, !dbg !44 + %5856 = fsub float %5792, %3659, !dbg !44 + %5857 = fsub float %5793, %3659, !dbg !44 + %5858 = fsub float %5794, %3659, !dbg !44 + %5859 = fsub float %5795, %3659, !dbg !44 + %5860 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2037 = icmp eq i32 %5860, 0, !dbg !45 + %5861 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5862 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2038 = select i1 %.not.i2037, float %5862, float %5861, !dbg !45 + %5863 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2039 = icmp eq i32 %5863, 0, !dbg !45 + %5864 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2038) #6, !dbg !45 + %5865 = tail call float @llvm.nvvm.saturate.f(float %.02.i2038) #6, !dbg !45 + %.03.i2040 = select i1 %.not1.i2039, float %5865, float %5864, !dbg !45 + %5866 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2041 = icmp eq i32 %5866, 0, !dbg !45 + %5867 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5868 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2040, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2042 = select i1 %.not2.i2041, float %5868, float %5867, !dbg !45 + %5869 = fadd float %.04.i2042, 0xC168000FE0000000, !dbg !45 + %5870 = fneg float %5869, !dbg !45 + %5871 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2043 = icmp eq i32 %5871, 0, !dbg !45 + %5872 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3FF7154760000000, float %5870) #6, !dbg !45 + %5873 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3FF7154760000000, float %5870) #6, !dbg !45 + %.0.i2044 = select i1 %.not3.i2043, float %5873, float %5872, !dbg !45 + %5874 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2045 = icmp eq i32 %5874, 0, !dbg !45 + %5875 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5796, float 0x3E54AE0C00000000, float %.0.i2044) #6, !dbg !45 + %5876 = tail call float @llvm.nvvm.fma.rn.f(float %5796, float 0x3E54AE0C00000000, float %.0.i2044) #6, !dbg !45 + %.01.i2046 = select i1 %.not4.i2045, float %5876, float %5875, !dbg !45 + %5877 = bitcast float %.04.i2042 to i32, !dbg !45 + %5878 = shl i32 %5877, 23, !dbg !45 + %5879 = bitcast i32 %5878 to float, !dbg !45 + %5880 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2046) #6, !dbg !45 + %5881 = fmul float %5880, %5879, !dbg !45 + %5882 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2047 = icmp eq i32 %5882, 0, !dbg !45 + %5883 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5884 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2048 = select i1 %.not.i2047, float %5884, float %5883, !dbg !45 + %5885 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2049 = icmp eq i32 %5885, 0, !dbg !45 + %5886 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2048) #6, !dbg !45 + %5887 = tail call float @llvm.nvvm.saturate.f(float %.02.i2048) #6, !dbg !45 + %.03.i2050 = select i1 %.not1.i2049, float %5887, float %5886, !dbg !45 + %5888 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2051 = icmp eq i32 %5888, 0, !dbg !45 + %5889 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5890 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2050, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2052 = select i1 %.not2.i2051, float %5890, float %5889, !dbg !45 + %5891 = fadd float %.04.i2052, 0xC168000FE0000000, !dbg !45 + %5892 = fneg float %5891, !dbg !45 + %5893 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2053 = icmp eq i32 %5893, 0, !dbg !45 + %5894 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3FF7154760000000, float %5892) #6, !dbg !45 + %5895 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3FF7154760000000, float %5892) #6, !dbg !45 + %.0.i2054 = select i1 %.not3.i2053, float %5895, float %5894, !dbg !45 + %5896 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2055 = icmp eq i32 %5896, 0, !dbg !45 + %5897 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5797, float 0x3E54AE0C00000000, float %.0.i2054) #6, !dbg !45 + %5898 = tail call float @llvm.nvvm.fma.rn.f(float %5797, float 0x3E54AE0C00000000, float %.0.i2054) #6, !dbg !45 + %.01.i2056 = select i1 %.not4.i2055, float %5898, float %5897, !dbg !45 + %5899 = bitcast float %.04.i2052 to i32, !dbg !45 + %5900 = shl i32 %5899, 23, !dbg !45 + %5901 = bitcast i32 %5900 to float, !dbg !45 + %5902 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2056) #6, !dbg !45 + %5903 = fmul float %5902, %5901, !dbg !45 + %5904 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2057 = icmp eq i32 %5904, 0, !dbg !45 + %5905 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5906 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2058 = select i1 %.not.i2057, float %5906, float %5905, !dbg !45 + %5907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2059 = icmp eq i32 %5907, 0, !dbg !45 + %5908 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2058) #6, !dbg !45 + %5909 = tail call float @llvm.nvvm.saturate.f(float %.02.i2058) #6, !dbg !45 + %.03.i2060 = select i1 %.not1.i2059, float %5909, float %5908, !dbg !45 + %5910 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2061 = icmp eq i32 %5910, 0, !dbg !45 + %5911 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5912 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2060, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2062 = select i1 %.not2.i2061, float %5912, float %5911, !dbg !45 + %5913 = fadd float %.04.i2062, 0xC168000FE0000000, !dbg !45 + %5914 = fneg float %5913, !dbg !45 + %5915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2063 = icmp eq i32 %5915, 0, !dbg !45 + %5916 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3FF7154760000000, float %5914) #6, !dbg !45 + %5917 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3FF7154760000000, float %5914) #6, !dbg !45 + %.0.i2064 = select i1 %.not3.i2063, float %5917, float %5916, !dbg !45 + %5918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2065 = icmp eq i32 %5918, 0, !dbg !45 + %5919 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5798, float 0x3E54AE0C00000000, float %.0.i2064) #6, !dbg !45 + %5920 = tail call float @llvm.nvvm.fma.rn.f(float %5798, float 0x3E54AE0C00000000, float %.0.i2064) #6, !dbg !45 + %.01.i2066 = select i1 %.not4.i2065, float %5920, float %5919, !dbg !45 + %5921 = bitcast float %.04.i2062 to i32, !dbg !45 + %5922 = shl i32 %5921, 23, !dbg !45 + %5923 = bitcast i32 %5922 to float, !dbg !45 + %5924 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2066) #6, !dbg !45 + %5925 = fmul float %5924, %5923, !dbg !45 + %5926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2067 = icmp eq i32 %5926, 0, !dbg !45 + %5927 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5928 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2068 = select i1 %.not.i2067, float %5928, float %5927, !dbg !45 + %5929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2069 = icmp eq i32 %5929, 0, !dbg !45 + %5930 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2068) #6, !dbg !45 + %5931 = tail call float @llvm.nvvm.saturate.f(float %.02.i2068) #6, !dbg !45 + %.03.i2070 = select i1 %.not1.i2069, float %5931, float %5930, !dbg !45 + %5932 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2071 = icmp eq i32 %5932, 0, !dbg !45 + %5933 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5934 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2070, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2072 = select i1 %.not2.i2071, float %5934, float %5933, !dbg !45 + %5935 = fadd float %.04.i2072, 0xC168000FE0000000, !dbg !45 + %5936 = fneg float %5935, !dbg !45 + %5937 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2073 = icmp eq i32 %5937, 0, !dbg !45 + %5938 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3FF7154760000000, float %5936) #6, !dbg !45 + %5939 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3FF7154760000000, float %5936) #6, !dbg !45 + %.0.i2074 = select i1 %.not3.i2073, float %5939, float %5938, !dbg !45 + %5940 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2075 = icmp eq i32 %5940, 0, !dbg !45 + %5941 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5799, float 0x3E54AE0C00000000, float %.0.i2074) #6, !dbg !45 + %5942 = tail call float @llvm.nvvm.fma.rn.f(float %5799, float 0x3E54AE0C00000000, float %.0.i2074) #6, !dbg !45 + %.01.i2076 = select i1 %.not4.i2075, float %5942, float %5941, !dbg !45 + %5943 = bitcast float %.04.i2072 to i32, !dbg !45 + %5944 = shl i32 %5943, 23, !dbg !45 + %5945 = bitcast i32 %5944 to float, !dbg !45 + %5946 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2076) #6, !dbg !45 + %5947 = fmul float %5946, %5945, !dbg !45 + %5948 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2077 = icmp eq i32 %5948, 0, !dbg !45 + %5949 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5950 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2078 = select i1 %.not.i2077, float %5950, float %5949, !dbg !45 + %5951 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2079 = icmp eq i32 %5951, 0, !dbg !45 + %5952 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2078) #6, !dbg !45 + %5953 = tail call float @llvm.nvvm.saturate.f(float %.02.i2078) #6, !dbg !45 + %.03.i2080 = select i1 %.not1.i2079, float %5953, float %5952, !dbg !45 + %5954 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2081 = icmp eq i32 %5954, 0, !dbg !45 + %5955 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5956 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2080, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2082 = select i1 %.not2.i2081, float %5956, float %5955, !dbg !45 + %5957 = fadd float %.04.i2082, 0xC168000FE0000000, !dbg !45 + %5958 = fneg float %5957, !dbg !45 + %5959 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2083 = icmp eq i32 %5959, 0, !dbg !45 + %5960 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3FF7154760000000, float %5958) #6, !dbg !45 + %5961 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3FF7154760000000, float %5958) #6, !dbg !45 + %.0.i2084 = select i1 %.not3.i2083, float %5961, float %5960, !dbg !45 + %5962 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2085 = icmp eq i32 %5962, 0, !dbg !45 + %5963 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5800, float 0x3E54AE0C00000000, float %.0.i2084) #6, !dbg !45 + %5964 = tail call float @llvm.nvvm.fma.rn.f(float %5800, float 0x3E54AE0C00000000, float %.0.i2084) #6, !dbg !45 + %.01.i2086 = select i1 %.not4.i2085, float %5964, float %5963, !dbg !45 + %5965 = bitcast float %.04.i2082 to i32, !dbg !45 + %5966 = shl i32 %5965, 23, !dbg !45 + %5967 = bitcast i32 %5966 to float, !dbg !45 + %5968 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2086) #6, !dbg !45 + %5969 = fmul float %5968, %5967, !dbg !45 + %5970 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2087 = icmp eq i32 %5970, 0, !dbg !45 + %5971 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5972 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2088 = select i1 %.not.i2087, float %5972, float %5971, !dbg !45 + %5973 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2089 = icmp eq i32 %5973, 0, !dbg !45 + %5974 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2088) #6, !dbg !45 + %5975 = tail call float @llvm.nvvm.saturate.f(float %.02.i2088) #6, !dbg !45 + %.03.i2090 = select i1 %.not1.i2089, float %5975, float %5974, !dbg !45 + %5976 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2091 = icmp eq i32 %5976, 0, !dbg !45 + %5977 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %5978 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2090, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2092 = select i1 %.not2.i2091, float %5978, float %5977, !dbg !45 + %5979 = fadd float %.04.i2092, 0xC168000FE0000000, !dbg !45 + %5980 = fneg float %5979, !dbg !45 + %5981 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2093 = icmp eq i32 %5981, 0, !dbg !45 + %5982 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3FF7154760000000, float %5980) #6, !dbg !45 + %5983 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3FF7154760000000, float %5980) #6, !dbg !45 + %.0.i2094 = select i1 %.not3.i2093, float %5983, float %5982, !dbg !45 + %5984 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2095 = icmp eq i32 %5984, 0, !dbg !45 + %5985 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5801, float 0x3E54AE0C00000000, float %.0.i2094) #6, !dbg !45 + %5986 = tail call float @llvm.nvvm.fma.rn.f(float %5801, float 0x3E54AE0C00000000, float %.0.i2094) #6, !dbg !45 + %.01.i2096 = select i1 %.not4.i2095, float %5986, float %5985, !dbg !45 + %5987 = bitcast float %.04.i2092 to i32, !dbg !45 + %5988 = shl i32 %5987, 23, !dbg !45 + %5989 = bitcast i32 %5988 to float, !dbg !45 + %5990 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2096) #6, !dbg !45 + %5991 = fmul float %5990, %5989, !dbg !45 + %5992 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2097 = icmp eq i32 %5992, 0, !dbg !45 + %5993 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %5994 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2098 = select i1 %.not.i2097, float %5994, float %5993, !dbg !45 + %5995 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2099 = icmp eq i32 %5995, 0, !dbg !45 + %5996 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2098) #6, !dbg !45 + %5997 = tail call float @llvm.nvvm.saturate.f(float %.02.i2098) #6, !dbg !45 + %.03.i2100 = select i1 %.not1.i2099, float %5997, float %5996, !dbg !45 + %5998 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2101 = icmp eq i32 %5998, 0, !dbg !45 + %5999 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6000 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2100, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2102 = select i1 %.not2.i2101, float %6000, float %5999, !dbg !45 + %6001 = fadd float %.04.i2102, 0xC168000FE0000000, !dbg !45 + %6002 = fneg float %6001, !dbg !45 + %6003 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2103 = icmp eq i32 %6003, 0, !dbg !45 + %6004 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3FF7154760000000, float %6002) #6, !dbg !45 + %6005 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3FF7154760000000, float %6002) #6, !dbg !45 + %.0.i2104 = select i1 %.not3.i2103, float %6005, float %6004, !dbg !45 + %6006 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2105 = icmp eq i32 %6006, 0, !dbg !45 + %6007 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5802, float 0x3E54AE0C00000000, float %.0.i2104) #6, !dbg !45 + %6008 = tail call float @llvm.nvvm.fma.rn.f(float %5802, float 0x3E54AE0C00000000, float %.0.i2104) #6, !dbg !45 + %.01.i2106 = select i1 %.not4.i2105, float %6008, float %6007, !dbg !45 + %6009 = bitcast float %.04.i2102 to i32, !dbg !45 + %6010 = shl i32 %6009, 23, !dbg !45 + %6011 = bitcast i32 %6010 to float, !dbg !45 + %6012 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2106) #6, !dbg !45 + %6013 = fmul float %6012, %6011, !dbg !45 + %6014 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2107 = icmp eq i32 %6014, 0, !dbg !45 + %6015 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6016 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2108 = select i1 %.not.i2107, float %6016, float %6015, !dbg !45 + %6017 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2109 = icmp eq i32 %6017, 0, !dbg !45 + %6018 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2108) #6, !dbg !45 + %6019 = tail call float @llvm.nvvm.saturate.f(float %.02.i2108) #6, !dbg !45 + %.03.i2110 = select i1 %.not1.i2109, float %6019, float %6018, !dbg !45 + %6020 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2111 = icmp eq i32 %6020, 0, !dbg !45 + %6021 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6022 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2110, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2112 = select i1 %.not2.i2111, float %6022, float %6021, !dbg !45 + %6023 = fadd float %.04.i2112, 0xC168000FE0000000, !dbg !45 + %6024 = fneg float %6023, !dbg !45 + %6025 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2113 = icmp eq i32 %6025, 0, !dbg !45 + %6026 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3FF7154760000000, float %6024) #6, !dbg !45 + %6027 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3FF7154760000000, float %6024) #6, !dbg !45 + %.0.i2114 = select i1 %.not3.i2113, float %6027, float %6026, !dbg !45 + %6028 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2115 = icmp eq i32 %6028, 0, !dbg !45 + %6029 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5803, float 0x3E54AE0C00000000, float %.0.i2114) #6, !dbg !45 + %6030 = tail call float @llvm.nvvm.fma.rn.f(float %5803, float 0x3E54AE0C00000000, float %.0.i2114) #6, !dbg !45 + %.01.i2116 = select i1 %.not4.i2115, float %6030, float %6029, !dbg !45 + %6031 = bitcast float %.04.i2112 to i32, !dbg !45 + %6032 = shl i32 %6031, 23, !dbg !45 + %6033 = bitcast i32 %6032 to float, !dbg !45 + %6034 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2116) #6, !dbg !45 + %6035 = fmul float %6034, %6033, !dbg !45 + %6036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2117 = icmp eq i32 %6036, 0, !dbg !45 + %6037 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6038 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2118 = select i1 %.not.i2117, float %6038, float %6037, !dbg !45 + %6039 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2119 = icmp eq i32 %6039, 0, !dbg !45 + %6040 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2118) #6, !dbg !45 + %6041 = tail call float @llvm.nvvm.saturate.f(float %.02.i2118) #6, !dbg !45 + %.03.i2120 = select i1 %.not1.i2119, float %6041, float %6040, !dbg !45 + %6042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2121 = icmp eq i32 %6042, 0, !dbg !45 + %6043 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6044 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2120, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2122 = select i1 %.not2.i2121, float %6044, float %6043, !dbg !45 + %6045 = fadd float %.04.i2122, 0xC168000FE0000000, !dbg !45 + %6046 = fneg float %6045, !dbg !45 + %6047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2123 = icmp eq i32 %6047, 0, !dbg !45 + %6048 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3FF7154760000000, float %6046) #6, !dbg !45 + %6049 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3FF7154760000000, float %6046) #6, !dbg !45 + %.0.i2124 = select i1 %.not3.i2123, float %6049, float %6048, !dbg !45 + %6050 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2125 = icmp eq i32 %6050, 0, !dbg !45 + %6051 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5804, float 0x3E54AE0C00000000, float %.0.i2124) #6, !dbg !45 + %6052 = tail call float @llvm.nvvm.fma.rn.f(float %5804, float 0x3E54AE0C00000000, float %.0.i2124) #6, !dbg !45 + %.01.i2126 = select i1 %.not4.i2125, float %6052, float %6051, !dbg !45 + %6053 = bitcast float %.04.i2122 to i32, !dbg !45 + %6054 = shl i32 %6053, 23, !dbg !45 + %6055 = bitcast i32 %6054 to float, !dbg !45 + %6056 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2126) #6, !dbg !45 + %6057 = fmul float %6056, %6055, !dbg !45 + %6058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2127 = icmp eq i32 %6058, 0, !dbg !45 + %6059 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6060 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2128 = select i1 %.not.i2127, float %6060, float %6059, !dbg !45 + %6061 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2129 = icmp eq i32 %6061, 0, !dbg !45 + %6062 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2128) #6, !dbg !45 + %6063 = tail call float @llvm.nvvm.saturate.f(float %.02.i2128) #6, !dbg !45 + %.03.i2130 = select i1 %.not1.i2129, float %6063, float %6062, !dbg !45 + %6064 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2131 = icmp eq i32 %6064, 0, !dbg !45 + %6065 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6066 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2130, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2132 = select i1 %.not2.i2131, float %6066, float %6065, !dbg !45 + %6067 = fadd float %.04.i2132, 0xC168000FE0000000, !dbg !45 + %6068 = fneg float %6067, !dbg !45 + %6069 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2133 = icmp eq i32 %6069, 0, !dbg !45 + %6070 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3FF7154760000000, float %6068) #6, !dbg !45 + %6071 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3FF7154760000000, float %6068) #6, !dbg !45 + %.0.i2134 = select i1 %.not3.i2133, float %6071, float %6070, !dbg !45 + %6072 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2135 = icmp eq i32 %6072, 0, !dbg !45 + %6073 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5805, float 0x3E54AE0C00000000, float %.0.i2134) #6, !dbg !45 + %6074 = tail call float @llvm.nvvm.fma.rn.f(float %5805, float 0x3E54AE0C00000000, float %.0.i2134) #6, !dbg !45 + %.01.i2136 = select i1 %.not4.i2135, float %6074, float %6073, !dbg !45 + %6075 = bitcast float %.04.i2132 to i32, !dbg !45 + %6076 = shl i32 %6075, 23, !dbg !45 + %6077 = bitcast i32 %6076 to float, !dbg !45 + %6078 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2136) #6, !dbg !45 + %6079 = fmul float %6078, %6077, !dbg !45 + %6080 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2137 = icmp eq i32 %6080, 0, !dbg !45 + %6081 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6082 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2138 = select i1 %.not.i2137, float %6082, float %6081, !dbg !45 + %6083 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2139 = icmp eq i32 %6083, 0, !dbg !45 + %6084 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2138) #6, !dbg !45 + %6085 = tail call float @llvm.nvvm.saturate.f(float %.02.i2138) #6, !dbg !45 + %.03.i2140 = select i1 %.not1.i2139, float %6085, float %6084, !dbg !45 + %6086 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2141 = icmp eq i32 %6086, 0, !dbg !45 + %6087 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6088 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2140, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2142 = select i1 %.not2.i2141, float %6088, float %6087, !dbg !45 + %6089 = fadd float %.04.i2142, 0xC168000FE0000000, !dbg !45 + %6090 = fneg float %6089, !dbg !45 + %6091 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2143 = icmp eq i32 %6091, 0, !dbg !45 + %6092 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3FF7154760000000, float %6090) #6, !dbg !45 + %6093 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3FF7154760000000, float %6090) #6, !dbg !45 + %.0.i2144 = select i1 %.not3.i2143, float %6093, float %6092, !dbg !45 + %6094 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2145 = icmp eq i32 %6094, 0, !dbg !45 + %6095 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5806, float 0x3E54AE0C00000000, float %.0.i2144) #6, !dbg !45 + %6096 = tail call float @llvm.nvvm.fma.rn.f(float %5806, float 0x3E54AE0C00000000, float %.0.i2144) #6, !dbg !45 + %.01.i2146 = select i1 %.not4.i2145, float %6096, float %6095, !dbg !45 + %6097 = bitcast float %.04.i2142 to i32, !dbg !45 + %6098 = shl i32 %6097, 23, !dbg !45 + %6099 = bitcast i32 %6098 to float, !dbg !45 + %6100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2146) #6, !dbg !45 + %6101 = fmul float %6100, %6099, !dbg !45 + %6102 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2147 = icmp eq i32 %6102, 0, !dbg !45 + %6103 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6104 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2148 = select i1 %.not.i2147, float %6104, float %6103, !dbg !45 + %6105 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2149 = icmp eq i32 %6105, 0, !dbg !45 + %6106 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2148) #6, !dbg !45 + %6107 = tail call float @llvm.nvvm.saturate.f(float %.02.i2148) #6, !dbg !45 + %.03.i2150 = select i1 %.not1.i2149, float %6107, float %6106, !dbg !45 + %6108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2151 = icmp eq i32 %6108, 0, !dbg !45 + %6109 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6110 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2150, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2152 = select i1 %.not2.i2151, float %6110, float %6109, !dbg !45 + %6111 = fadd float %.04.i2152, 0xC168000FE0000000, !dbg !45 + %6112 = fneg float %6111, !dbg !45 + %6113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2153 = icmp eq i32 %6113, 0, !dbg !45 + %6114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3FF7154760000000, float %6112) #6, !dbg !45 + %6115 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3FF7154760000000, float %6112) #6, !dbg !45 + %.0.i2154 = select i1 %.not3.i2153, float %6115, float %6114, !dbg !45 + %6116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2155 = icmp eq i32 %6116, 0, !dbg !45 + %6117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5807, float 0x3E54AE0C00000000, float %.0.i2154) #6, !dbg !45 + %6118 = tail call float @llvm.nvvm.fma.rn.f(float %5807, float 0x3E54AE0C00000000, float %.0.i2154) #6, !dbg !45 + %.01.i2156 = select i1 %.not4.i2155, float %6118, float %6117, !dbg !45 + %6119 = bitcast float %.04.i2152 to i32, !dbg !45 + %6120 = shl i32 %6119, 23, !dbg !45 + %6121 = bitcast i32 %6120 to float, !dbg !45 + %6122 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2156) #6, !dbg !45 + %6123 = fmul float %6122, %6121, !dbg !45 + %6124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2157 = icmp eq i32 %6124, 0, !dbg !45 + %6125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6126 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2158 = select i1 %.not.i2157, float %6126, float %6125, !dbg !45 + %6127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2159 = icmp eq i32 %6127, 0, !dbg !45 + %6128 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2158) #6, !dbg !45 + %6129 = tail call float @llvm.nvvm.saturate.f(float %.02.i2158) #6, !dbg !45 + %.03.i2160 = select i1 %.not1.i2159, float %6129, float %6128, !dbg !45 + %6130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2161 = icmp eq i32 %6130, 0, !dbg !45 + %6131 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6132 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2160, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2162 = select i1 %.not2.i2161, float %6132, float %6131, !dbg !45 + %6133 = fadd float %.04.i2162, 0xC168000FE0000000, !dbg !45 + %6134 = fneg float %6133, !dbg !45 + %6135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2163 = icmp eq i32 %6135, 0, !dbg !45 + %6136 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3FF7154760000000, float %6134) #6, !dbg !45 + %6137 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3FF7154760000000, float %6134) #6, !dbg !45 + %.0.i2164 = select i1 %.not3.i2163, float %6137, float %6136, !dbg !45 + %6138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2165 = icmp eq i32 %6138, 0, !dbg !45 + %6139 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5808, float 0x3E54AE0C00000000, float %.0.i2164) #6, !dbg !45 + %6140 = tail call float @llvm.nvvm.fma.rn.f(float %5808, float 0x3E54AE0C00000000, float %.0.i2164) #6, !dbg !45 + %.01.i2166 = select i1 %.not4.i2165, float %6140, float %6139, !dbg !45 + %6141 = bitcast float %.04.i2162 to i32, !dbg !45 + %6142 = shl i32 %6141, 23, !dbg !45 + %6143 = bitcast i32 %6142 to float, !dbg !45 + %6144 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2166) #6, !dbg !45 + %6145 = fmul float %6144, %6143, !dbg !45 + %6146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2167 = icmp eq i32 %6146, 0, !dbg !45 + %6147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6148 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2168 = select i1 %.not.i2167, float %6148, float %6147, !dbg !45 + %6149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2169 = icmp eq i32 %6149, 0, !dbg !45 + %6150 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2168) #6, !dbg !45 + %6151 = tail call float @llvm.nvvm.saturate.f(float %.02.i2168) #6, !dbg !45 + %.03.i2170 = select i1 %.not1.i2169, float %6151, float %6150, !dbg !45 + %6152 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2171 = icmp eq i32 %6152, 0, !dbg !45 + %6153 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6154 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2170, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2172 = select i1 %.not2.i2171, float %6154, float %6153, !dbg !45 + %6155 = fadd float %.04.i2172, 0xC168000FE0000000, !dbg !45 + %6156 = fneg float %6155, !dbg !45 + %6157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2173 = icmp eq i32 %6157, 0, !dbg !45 + %6158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3FF7154760000000, float %6156) #6, !dbg !45 + %6159 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3FF7154760000000, float %6156) #6, !dbg !45 + %.0.i2174 = select i1 %.not3.i2173, float %6159, float %6158, !dbg !45 + %6160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2175 = icmp eq i32 %6160, 0, !dbg !45 + %6161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5809, float 0x3E54AE0C00000000, float %.0.i2174) #6, !dbg !45 + %6162 = tail call float @llvm.nvvm.fma.rn.f(float %5809, float 0x3E54AE0C00000000, float %.0.i2174) #6, !dbg !45 + %.01.i2176 = select i1 %.not4.i2175, float %6162, float %6161, !dbg !45 + %6163 = bitcast float %.04.i2172 to i32, !dbg !45 + %6164 = shl i32 %6163, 23, !dbg !45 + %6165 = bitcast i32 %6164 to float, !dbg !45 + %6166 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2176) #6, !dbg !45 + %6167 = fmul float %6166, %6165, !dbg !45 + %6168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2177 = icmp eq i32 %6168, 0, !dbg !45 + %6169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6170 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2178 = select i1 %.not.i2177, float %6170, float %6169, !dbg !45 + %6171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2179 = icmp eq i32 %6171, 0, !dbg !45 + %6172 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2178) #6, !dbg !45 + %6173 = tail call float @llvm.nvvm.saturate.f(float %.02.i2178) #6, !dbg !45 + %.03.i2180 = select i1 %.not1.i2179, float %6173, float %6172, !dbg !45 + %6174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2181 = icmp eq i32 %6174, 0, !dbg !45 + %6175 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6176 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2180, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2182 = select i1 %.not2.i2181, float %6176, float %6175, !dbg !45 + %6177 = fadd float %.04.i2182, 0xC168000FE0000000, !dbg !45 + %6178 = fneg float %6177, !dbg !45 + %6179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2183 = icmp eq i32 %6179, 0, !dbg !45 + %6180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3FF7154760000000, float %6178) #6, !dbg !45 + %6181 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3FF7154760000000, float %6178) #6, !dbg !45 + %.0.i2184 = select i1 %.not3.i2183, float %6181, float %6180, !dbg !45 + %6182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2185 = icmp eq i32 %6182, 0, !dbg !45 + %6183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5810, float 0x3E54AE0C00000000, float %.0.i2184) #6, !dbg !45 + %6184 = tail call float @llvm.nvvm.fma.rn.f(float %5810, float 0x3E54AE0C00000000, float %.0.i2184) #6, !dbg !45 + %.01.i2186 = select i1 %.not4.i2185, float %6184, float %6183, !dbg !45 + %6185 = bitcast float %.04.i2182 to i32, !dbg !45 + %6186 = shl i32 %6185, 23, !dbg !45 + %6187 = bitcast i32 %6186 to float, !dbg !45 + %6188 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2186) #6, !dbg !45 + %6189 = fmul float %6188, %6187, !dbg !45 + %6190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2187 = icmp eq i32 %6190, 0, !dbg !45 + %6191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6192 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2188 = select i1 %.not.i2187, float %6192, float %6191, !dbg !45 + %6193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2189 = icmp eq i32 %6193, 0, !dbg !45 + %6194 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2188) #6, !dbg !45 + %6195 = tail call float @llvm.nvvm.saturate.f(float %.02.i2188) #6, !dbg !45 + %.03.i2190 = select i1 %.not1.i2189, float %6195, float %6194, !dbg !45 + %6196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2191 = icmp eq i32 %6196, 0, !dbg !45 + %6197 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6198 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2190, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2192 = select i1 %.not2.i2191, float %6198, float %6197, !dbg !45 + %6199 = fadd float %.04.i2192, 0xC168000FE0000000, !dbg !45 + %6200 = fneg float %6199, !dbg !45 + %6201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2193 = icmp eq i32 %6201, 0, !dbg !45 + %6202 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3FF7154760000000, float %6200) #6, !dbg !45 + %6203 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3FF7154760000000, float %6200) #6, !dbg !45 + %.0.i2194 = select i1 %.not3.i2193, float %6203, float %6202, !dbg !45 + %6204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2195 = icmp eq i32 %6204, 0, !dbg !45 + %6205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5811, float 0x3E54AE0C00000000, float %.0.i2194) #6, !dbg !45 + %6206 = tail call float @llvm.nvvm.fma.rn.f(float %5811, float 0x3E54AE0C00000000, float %.0.i2194) #6, !dbg !45 + %.01.i2196 = select i1 %.not4.i2195, float %6206, float %6205, !dbg !45 + %6207 = bitcast float %.04.i2192 to i32, !dbg !45 + %6208 = shl i32 %6207, 23, !dbg !45 + %6209 = bitcast i32 %6208 to float, !dbg !45 + %6210 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2196) #6, !dbg !45 + %6211 = fmul float %6210, %6209, !dbg !45 + %6212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2197 = icmp eq i32 %6212, 0, !dbg !45 + %6213 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6214 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2198 = select i1 %.not.i2197, float %6214, float %6213, !dbg !45 + %6215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2199 = icmp eq i32 %6215, 0, !dbg !45 + %6216 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2198) #6, !dbg !45 + %6217 = tail call float @llvm.nvvm.saturate.f(float %.02.i2198) #6, !dbg !45 + %.03.i2200 = select i1 %.not1.i2199, float %6217, float %6216, !dbg !45 + %6218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2201 = icmp eq i32 %6218, 0, !dbg !45 + %6219 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6220 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2200, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2202 = select i1 %.not2.i2201, float %6220, float %6219, !dbg !45 + %6221 = fadd float %.04.i2202, 0xC168000FE0000000, !dbg !45 + %6222 = fneg float %6221, !dbg !45 + %6223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2203 = icmp eq i32 %6223, 0, !dbg !45 + %6224 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3FF7154760000000, float %6222) #6, !dbg !45 + %6225 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3FF7154760000000, float %6222) #6, !dbg !45 + %.0.i2204 = select i1 %.not3.i2203, float %6225, float %6224, !dbg !45 + %6226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2205 = icmp eq i32 %6226, 0, !dbg !45 + %6227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5812, float 0x3E54AE0C00000000, float %.0.i2204) #6, !dbg !45 + %6228 = tail call float @llvm.nvvm.fma.rn.f(float %5812, float 0x3E54AE0C00000000, float %.0.i2204) #6, !dbg !45 + %.01.i2206 = select i1 %.not4.i2205, float %6228, float %6227, !dbg !45 + %6229 = bitcast float %.04.i2202 to i32, !dbg !45 + %6230 = shl i32 %6229, 23, !dbg !45 + %6231 = bitcast i32 %6230 to float, !dbg !45 + %6232 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2206) #6, !dbg !45 + %6233 = fmul float %6232, %6231, !dbg !45 + %6234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2207 = icmp eq i32 %6234, 0, !dbg !45 + %6235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6236 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2208 = select i1 %.not.i2207, float %6236, float %6235, !dbg !45 + %6237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2209 = icmp eq i32 %6237, 0, !dbg !45 + %6238 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2208) #6, !dbg !45 + %6239 = tail call float @llvm.nvvm.saturate.f(float %.02.i2208) #6, !dbg !45 + %.03.i2210 = select i1 %.not1.i2209, float %6239, float %6238, !dbg !45 + %6240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2211 = icmp eq i32 %6240, 0, !dbg !45 + %6241 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6242 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2210, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2212 = select i1 %.not2.i2211, float %6242, float %6241, !dbg !45 + %6243 = fadd float %.04.i2212, 0xC168000FE0000000, !dbg !45 + %6244 = fneg float %6243, !dbg !45 + %6245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2213 = icmp eq i32 %6245, 0, !dbg !45 + %6246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3FF7154760000000, float %6244) #6, !dbg !45 + %6247 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3FF7154760000000, float %6244) #6, !dbg !45 + %.0.i2214 = select i1 %.not3.i2213, float %6247, float %6246, !dbg !45 + %6248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2215 = icmp eq i32 %6248, 0, !dbg !45 + %6249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5813, float 0x3E54AE0C00000000, float %.0.i2214) #6, !dbg !45 + %6250 = tail call float @llvm.nvvm.fma.rn.f(float %5813, float 0x3E54AE0C00000000, float %.0.i2214) #6, !dbg !45 + %.01.i2216 = select i1 %.not4.i2215, float %6250, float %6249, !dbg !45 + %6251 = bitcast float %.04.i2212 to i32, !dbg !45 + %6252 = shl i32 %6251, 23, !dbg !45 + %6253 = bitcast i32 %6252 to float, !dbg !45 + %6254 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2216) #6, !dbg !45 + %6255 = fmul float %6254, %6253, !dbg !45 + %6256 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2217 = icmp eq i32 %6256, 0, !dbg !45 + %6257 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6258 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2218 = select i1 %.not.i2217, float %6258, float %6257, !dbg !45 + %6259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2219 = icmp eq i32 %6259, 0, !dbg !45 + %6260 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2218) #6, !dbg !45 + %6261 = tail call float @llvm.nvvm.saturate.f(float %.02.i2218) #6, !dbg !45 + %.03.i2220 = select i1 %.not1.i2219, float %6261, float %6260, !dbg !45 + %6262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2221 = icmp eq i32 %6262, 0, !dbg !45 + %6263 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6264 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2220, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2222 = select i1 %.not2.i2221, float %6264, float %6263, !dbg !45 + %6265 = fadd float %.04.i2222, 0xC168000FE0000000, !dbg !45 + %6266 = fneg float %6265, !dbg !45 + %6267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2223 = icmp eq i32 %6267, 0, !dbg !45 + %6268 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3FF7154760000000, float %6266) #6, !dbg !45 + %6269 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3FF7154760000000, float %6266) #6, !dbg !45 + %.0.i2224 = select i1 %.not3.i2223, float %6269, float %6268, !dbg !45 + %6270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2225 = icmp eq i32 %6270, 0, !dbg !45 + %6271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5814, float 0x3E54AE0C00000000, float %.0.i2224) #6, !dbg !45 + %6272 = tail call float @llvm.nvvm.fma.rn.f(float %5814, float 0x3E54AE0C00000000, float %.0.i2224) #6, !dbg !45 + %.01.i2226 = select i1 %.not4.i2225, float %6272, float %6271, !dbg !45 + %6273 = bitcast float %.04.i2222 to i32, !dbg !45 + %6274 = shl i32 %6273, 23, !dbg !45 + %6275 = bitcast i32 %6274 to float, !dbg !45 + %6276 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2226) #6, !dbg !45 + %6277 = fmul float %6276, %6275, !dbg !45 + %6278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2227 = icmp eq i32 %6278, 0, !dbg !45 + %6279 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6280 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2228 = select i1 %.not.i2227, float %6280, float %6279, !dbg !45 + %6281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2229 = icmp eq i32 %6281, 0, !dbg !45 + %6282 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2228) #6, !dbg !45 + %6283 = tail call float @llvm.nvvm.saturate.f(float %.02.i2228) #6, !dbg !45 + %.03.i2230 = select i1 %.not1.i2229, float %6283, float %6282, !dbg !45 + %6284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2231 = icmp eq i32 %6284, 0, !dbg !45 + %6285 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6286 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2230, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2232 = select i1 %.not2.i2231, float %6286, float %6285, !dbg !45 + %6287 = fadd float %.04.i2232, 0xC168000FE0000000, !dbg !45 + %6288 = fneg float %6287, !dbg !45 + %6289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2233 = icmp eq i32 %6289, 0, !dbg !45 + %6290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3FF7154760000000, float %6288) #6, !dbg !45 + %6291 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3FF7154760000000, float %6288) #6, !dbg !45 + %.0.i2234 = select i1 %.not3.i2233, float %6291, float %6290, !dbg !45 + %6292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2235 = icmp eq i32 %6292, 0, !dbg !45 + %6293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5815, float 0x3E54AE0C00000000, float %.0.i2234) #6, !dbg !45 + %6294 = tail call float @llvm.nvvm.fma.rn.f(float %5815, float 0x3E54AE0C00000000, float %.0.i2234) #6, !dbg !45 + %.01.i2236 = select i1 %.not4.i2235, float %6294, float %6293, !dbg !45 + %6295 = bitcast float %.04.i2232 to i32, !dbg !45 + %6296 = shl i32 %6295, 23, !dbg !45 + %6297 = bitcast i32 %6296 to float, !dbg !45 + %6298 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2236) #6, !dbg !45 + %6299 = fmul float %6298, %6297, !dbg !45 + %6300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2237 = icmp eq i32 %6300, 0, !dbg !45 + %6301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6302 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2238 = select i1 %.not.i2237, float %6302, float %6301, !dbg !45 + %6303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2239 = icmp eq i32 %6303, 0, !dbg !45 + %6304 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2238) #6, !dbg !45 + %6305 = tail call float @llvm.nvvm.saturate.f(float %.02.i2238) #6, !dbg !45 + %.03.i2240 = select i1 %.not1.i2239, float %6305, float %6304, !dbg !45 + %6306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2241 = icmp eq i32 %6306, 0, !dbg !45 + %6307 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6308 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2240, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2242 = select i1 %.not2.i2241, float %6308, float %6307, !dbg !45 + %6309 = fadd float %.04.i2242, 0xC168000FE0000000, !dbg !45 + %6310 = fneg float %6309, !dbg !45 + %6311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2243 = icmp eq i32 %6311, 0, !dbg !45 + %6312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3FF7154760000000, float %6310) #6, !dbg !45 + %6313 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3FF7154760000000, float %6310) #6, !dbg !45 + %.0.i2244 = select i1 %.not3.i2243, float %6313, float %6312, !dbg !45 + %6314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2245 = icmp eq i32 %6314, 0, !dbg !45 + %6315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5816, float 0x3E54AE0C00000000, float %.0.i2244) #6, !dbg !45 + %6316 = tail call float @llvm.nvvm.fma.rn.f(float %5816, float 0x3E54AE0C00000000, float %.0.i2244) #6, !dbg !45 + %.01.i2246 = select i1 %.not4.i2245, float %6316, float %6315, !dbg !45 + %6317 = bitcast float %.04.i2242 to i32, !dbg !45 + %6318 = shl i32 %6317, 23, !dbg !45 + %6319 = bitcast i32 %6318 to float, !dbg !45 + %6320 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2246) #6, !dbg !45 + %6321 = fmul float %6320, %6319, !dbg !45 + %6322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2247 = icmp eq i32 %6322, 0, !dbg !45 + %6323 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6324 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2248 = select i1 %.not.i2247, float %6324, float %6323, !dbg !45 + %6325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2249 = icmp eq i32 %6325, 0, !dbg !45 + %6326 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2248) #6, !dbg !45 + %6327 = tail call float @llvm.nvvm.saturate.f(float %.02.i2248) #6, !dbg !45 + %.03.i2250 = select i1 %.not1.i2249, float %6327, float %6326, !dbg !45 + %6328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2251 = icmp eq i32 %6328, 0, !dbg !45 + %6329 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6330 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2250, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2252 = select i1 %.not2.i2251, float %6330, float %6329, !dbg !45 + %6331 = fadd float %.04.i2252, 0xC168000FE0000000, !dbg !45 + %6332 = fneg float %6331, !dbg !45 + %6333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2253 = icmp eq i32 %6333, 0, !dbg !45 + %6334 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3FF7154760000000, float %6332) #6, !dbg !45 + %6335 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3FF7154760000000, float %6332) #6, !dbg !45 + %.0.i2254 = select i1 %.not3.i2253, float %6335, float %6334, !dbg !45 + %6336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2255 = icmp eq i32 %6336, 0, !dbg !45 + %6337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5817, float 0x3E54AE0C00000000, float %.0.i2254) #6, !dbg !45 + %6338 = tail call float @llvm.nvvm.fma.rn.f(float %5817, float 0x3E54AE0C00000000, float %.0.i2254) #6, !dbg !45 + %.01.i2256 = select i1 %.not4.i2255, float %6338, float %6337, !dbg !45 + %6339 = bitcast float %.04.i2252 to i32, !dbg !45 + %6340 = shl i32 %6339, 23, !dbg !45 + %6341 = bitcast i32 %6340 to float, !dbg !45 + %6342 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2256) #6, !dbg !45 + %6343 = fmul float %6342, %6341, !dbg !45 + %6344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2257 = icmp eq i32 %6344, 0, !dbg !45 + %6345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6346 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2258 = select i1 %.not.i2257, float %6346, float %6345, !dbg !45 + %6347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2259 = icmp eq i32 %6347, 0, !dbg !45 + %6348 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2258) #6, !dbg !45 + %6349 = tail call float @llvm.nvvm.saturate.f(float %.02.i2258) #6, !dbg !45 + %.03.i2260 = select i1 %.not1.i2259, float %6349, float %6348, !dbg !45 + %6350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2261 = icmp eq i32 %6350, 0, !dbg !45 + %6351 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6352 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2260, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2262 = select i1 %.not2.i2261, float %6352, float %6351, !dbg !45 + %6353 = fadd float %.04.i2262, 0xC168000FE0000000, !dbg !45 + %6354 = fneg float %6353, !dbg !45 + %6355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2263 = icmp eq i32 %6355, 0, !dbg !45 + %6356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3FF7154760000000, float %6354) #6, !dbg !45 + %6357 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3FF7154760000000, float %6354) #6, !dbg !45 + %.0.i2264 = select i1 %.not3.i2263, float %6357, float %6356, !dbg !45 + %6358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2265 = icmp eq i32 %6358, 0, !dbg !45 + %6359 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5818, float 0x3E54AE0C00000000, float %.0.i2264) #6, !dbg !45 + %6360 = tail call float @llvm.nvvm.fma.rn.f(float %5818, float 0x3E54AE0C00000000, float %.0.i2264) #6, !dbg !45 + %.01.i2266 = select i1 %.not4.i2265, float %6360, float %6359, !dbg !45 + %6361 = bitcast float %.04.i2262 to i32, !dbg !45 + %6362 = shl i32 %6361, 23, !dbg !45 + %6363 = bitcast i32 %6362 to float, !dbg !45 + %6364 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2266) #6, !dbg !45 + %6365 = fmul float %6364, %6363, !dbg !45 + %6366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2267 = icmp eq i32 %6366, 0, !dbg !45 + %6367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6368 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2268 = select i1 %.not.i2267, float %6368, float %6367, !dbg !45 + %6369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2269 = icmp eq i32 %6369, 0, !dbg !45 + %6370 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2268) #6, !dbg !45 + %6371 = tail call float @llvm.nvvm.saturate.f(float %.02.i2268) #6, !dbg !45 + %.03.i2270 = select i1 %.not1.i2269, float %6371, float %6370, !dbg !45 + %6372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2271 = icmp eq i32 %6372, 0, !dbg !45 + %6373 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6374 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2270, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2272 = select i1 %.not2.i2271, float %6374, float %6373, !dbg !45 + %6375 = fadd float %.04.i2272, 0xC168000FE0000000, !dbg !45 + %6376 = fneg float %6375, !dbg !45 + %6377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2273 = icmp eq i32 %6377, 0, !dbg !45 + %6378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3FF7154760000000, float %6376) #6, !dbg !45 + %6379 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3FF7154760000000, float %6376) #6, !dbg !45 + %.0.i2274 = select i1 %.not3.i2273, float %6379, float %6378, !dbg !45 + %6380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2275 = icmp eq i32 %6380, 0, !dbg !45 + %6381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5819, float 0x3E54AE0C00000000, float %.0.i2274) #6, !dbg !45 + %6382 = tail call float @llvm.nvvm.fma.rn.f(float %5819, float 0x3E54AE0C00000000, float %.0.i2274) #6, !dbg !45 + %.01.i2276 = select i1 %.not4.i2275, float %6382, float %6381, !dbg !45 + %6383 = bitcast float %.04.i2272 to i32, !dbg !45 + %6384 = shl i32 %6383, 23, !dbg !45 + %6385 = bitcast i32 %6384 to float, !dbg !45 + %6386 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2276) #6, !dbg !45 + %6387 = fmul float %6386, %6385, !dbg !45 + %6388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2277 = icmp eq i32 %6388, 0, !dbg !45 + %6389 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6390 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2278 = select i1 %.not.i2277, float %6390, float %6389, !dbg !45 + %6391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2279 = icmp eq i32 %6391, 0, !dbg !45 + %6392 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2278) #6, !dbg !45 + %6393 = tail call float @llvm.nvvm.saturate.f(float %.02.i2278) #6, !dbg !45 + %.03.i2280 = select i1 %.not1.i2279, float %6393, float %6392, !dbg !45 + %6394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2281 = icmp eq i32 %6394, 0, !dbg !45 + %6395 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6396 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2280, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2282 = select i1 %.not2.i2281, float %6396, float %6395, !dbg !45 + %6397 = fadd float %.04.i2282, 0xC168000FE0000000, !dbg !45 + %6398 = fneg float %6397, !dbg !45 + %6399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2283 = icmp eq i32 %6399, 0, !dbg !45 + %6400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3FF7154760000000, float %6398) #6, !dbg !45 + %6401 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3FF7154760000000, float %6398) #6, !dbg !45 + %.0.i2284 = select i1 %.not3.i2283, float %6401, float %6400, !dbg !45 + %6402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2285 = icmp eq i32 %6402, 0, !dbg !45 + %6403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5820, float 0x3E54AE0C00000000, float %.0.i2284) #6, !dbg !45 + %6404 = tail call float @llvm.nvvm.fma.rn.f(float %5820, float 0x3E54AE0C00000000, float %.0.i2284) #6, !dbg !45 + %.01.i2286 = select i1 %.not4.i2285, float %6404, float %6403, !dbg !45 + %6405 = bitcast float %.04.i2282 to i32, !dbg !45 + %6406 = shl i32 %6405, 23, !dbg !45 + %6407 = bitcast i32 %6406 to float, !dbg !45 + %6408 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2286) #6, !dbg !45 + %6409 = fmul float %6408, %6407, !dbg !45 + %6410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2287 = icmp eq i32 %6410, 0, !dbg !45 + %6411 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6412 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2288 = select i1 %.not.i2287, float %6412, float %6411, !dbg !45 + %6413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2289 = icmp eq i32 %6413, 0, !dbg !45 + %6414 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2288) #6, !dbg !45 + %6415 = tail call float @llvm.nvvm.saturate.f(float %.02.i2288) #6, !dbg !45 + %.03.i2290 = select i1 %.not1.i2289, float %6415, float %6414, !dbg !45 + %6416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2291 = icmp eq i32 %6416, 0, !dbg !45 + %6417 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6418 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2290, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2292 = select i1 %.not2.i2291, float %6418, float %6417, !dbg !45 + %6419 = fadd float %.04.i2292, 0xC168000FE0000000, !dbg !45 + %6420 = fneg float %6419, !dbg !45 + %6421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2293 = icmp eq i32 %6421, 0, !dbg !45 + %6422 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3FF7154760000000, float %6420) #6, !dbg !45 + %6423 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3FF7154760000000, float %6420) #6, !dbg !45 + %.0.i2294 = select i1 %.not3.i2293, float %6423, float %6422, !dbg !45 + %6424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2295 = icmp eq i32 %6424, 0, !dbg !45 + %6425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5821, float 0x3E54AE0C00000000, float %.0.i2294) #6, !dbg !45 + %6426 = tail call float @llvm.nvvm.fma.rn.f(float %5821, float 0x3E54AE0C00000000, float %.0.i2294) #6, !dbg !45 + %.01.i2296 = select i1 %.not4.i2295, float %6426, float %6425, !dbg !45 + %6427 = bitcast float %.04.i2292 to i32, !dbg !45 + %6428 = shl i32 %6427, 23, !dbg !45 + %6429 = bitcast i32 %6428 to float, !dbg !45 + %6430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2296) #6, !dbg !45 + %6431 = fmul float %6430, %6429, !dbg !45 + %6432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2297 = icmp eq i32 %6432, 0, !dbg !45 + %6433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6434 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2298 = select i1 %.not.i2297, float %6434, float %6433, !dbg !45 + %6435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2299 = icmp eq i32 %6435, 0, !dbg !45 + %6436 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2298) #6, !dbg !45 + %6437 = tail call float @llvm.nvvm.saturate.f(float %.02.i2298) #6, !dbg !45 + %.03.i2300 = select i1 %.not1.i2299, float %6437, float %6436, !dbg !45 + %6438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2301 = icmp eq i32 %6438, 0, !dbg !45 + %6439 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6440 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2300, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2302 = select i1 %.not2.i2301, float %6440, float %6439, !dbg !45 + %6441 = fadd float %.04.i2302, 0xC168000FE0000000, !dbg !45 + %6442 = fneg float %6441, !dbg !45 + %6443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2303 = icmp eq i32 %6443, 0, !dbg !45 + %6444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3FF7154760000000, float %6442) #6, !dbg !45 + %6445 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3FF7154760000000, float %6442) #6, !dbg !45 + %.0.i2304 = select i1 %.not3.i2303, float %6445, float %6444, !dbg !45 + %6446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2305 = icmp eq i32 %6446, 0, !dbg !45 + %6447 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5822, float 0x3E54AE0C00000000, float %.0.i2304) #6, !dbg !45 + %6448 = tail call float @llvm.nvvm.fma.rn.f(float %5822, float 0x3E54AE0C00000000, float %.0.i2304) #6, !dbg !45 + %.01.i2306 = select i1 %.not4.i2305, float %6448, float %6447, !dbg !45 + %6449 = bitcast float %.04.i2302 to i32, !dbg !45 + %6450 = shl i32 %6449, 23, !dbg !45 + %6451 = bitcast i32 %6450 to float, !dbg !45 + %6452 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2306) #6, !dbg !45 + %6453 = fmul float %6452, %6451, !dbg !45 + %6454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2307 = icmp eq i32 %6454, 0, !dbg !45 + %6455 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6456 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2308 = select i1 %.not.i2307, float %6456, float %6455, !dbg !45 + %6457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2309 = icmp eq i32 %6457, 0, !dbg !45 + %6458 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2308) #6, !dbg !45 + %6459 = tail call float @llvm.nvvm.saturate.f(float %.02.i2308) #6, !dbg !45 + %.03.i2310 = select i1 %.not1.i2309, float %6459, float %6458, !dbg !45 + %6460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2311 = icmp eq i32 %6460, 0, !dbg !45 + %6461 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6462 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2310, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2312 = select i1 %.not2.i2311, float %6462, float %6461, !dbg !45 + %6463 = fadd float %.04.i2312, 0xC168000FE0000000, !dbg !45 + %6464 = fneg float %6463, !dbg !45 + %6465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2313 = icmp eq i32 %6465, 0, !dbg !45 + %6466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3FF7154760000000, float %6464) #6, !dbg !45 + %6467 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3FF7154760000000, float %6464) #6, !dbg !45 + %.0.i2314 = select i1 %.not3.i2313, float %6467, float %6466, !dbg !45 + %6468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2315 = icmp eq i32 %6468, 0, !dbg !45 + %6469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5823, float 0x3E54AE0C00000000, float %.0.i2314) #6, !dbg !45 + %6470 = tail call float @llvm.nvvm.fma.rn.f(float %5823, float 0x3E54AE0C00000000, float %.0.i2314) #6, !dbg !45 + %.01.i2316 = select i1 %.not4.i2315, float %6470, float %6469, !dbg !45 + %6471 = bitcast float %.04.i2312 to i32, !dbg !45 + %6472 = shl i32 %6471, 23, !dbg !45 + %6473 = bitcast i32 %6472 to float, !dbg !45 + %6474 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2316) #6, !dbg !45 + %6475 = fmul float %6474, %6473, !dbg !45 + %6476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2317 = icmp eq i32 %6476, 0, !dbg !45 + %6477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6478 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2318 = select i1 %.not.i2317, float %6478, float %6477, !dbg !45 + %6479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2319 = icmp eq i32 %6479, 0, !dbg !45 + %6480 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2318) #6, !dbg !45 + %6481 = tail call float @llvm.nvvm.saturate.f(float %.02.i2318) #6, !dbg !45 + %.03.i2320 = select i1 %.not1.i2319, float %6481, float %6480, !dbg !45 + %6482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2321 = icmp eq i32 %6482, 0, !dbg !45 + %6483 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6484 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2320, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2322 = select i1 %.not2.i2321, float %6484, float %6483, !dbg !45 + %6485 = fadd float %.04.i2322, 0xC168000FE0000000, !dbg !45 + %6486 = fneg float %6485, !dbg !45 + %6487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2323 = icmp eq i32 %6487, 0, !dbg !45 + %6488 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3FF7154760000000, float %6486) #6, !dbg !45 + %6489 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3FF7154760000000, float %6486) #6, !dbg !45 + %.0.i2324 = select i1 %.not3.i2323, float %6489, float %6488, !dbg !45 + %6490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2325 = icmp eq i32 %6490, 0, !dbg !45 + %6491 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5824, float 0x3E54AE0C00000000, float %.0.i2324) #6, !dbg !45 + %6492 = tail call float @llvm.nvvm.fma.rn.f(float %5824, float 0x3E54AE0C00000000, float %.0.i2324) #6, !dbg !45 + %.01.i2326 = select i1 %.not4.i2325, float %6492, float %6491, !dbg !45 + %6493 = bitcast float %.04.i2322 to i32, !dbg !45 + %6494 = shl i32 %6493, 23, !dbg !45 + %6495 = bitcast i32 %6494 to float, !dbg !45 + %6496 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2326) #6, !dbg !45 + %6497 = fmul float %6496, %6495, !dbg !45 + %6498 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2327 = icmp eq i32 %6498, 0, !dbg !45 + %6499 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6500 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2328 = select i1 %.not.i2327, float %6500, float %6499, !dbg !45 + %6501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2329 = icmp eq i32 %6501, 0, !dbg !45 + %6502 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2328) #6, !dbg !45 + %6503 = tail call float @llvm.nvvm.saturate.f(float %.02.i2328) #6, !dbg !45 + %.03.i2330 = select i1 %.not1.i2329, float %6503, float %6502, !dbg !45 + %6504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2331 = icmp eq i32 %6504, 0, !dbg !45 + %6505 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6506 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2330, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2332 = select i1 %.not2.i2331, float %6506, float %6505, !dbg !45 + %6507 = fadd float %.04.i2332, 0xC168000FE0000000, !dbg !45 + %6508 = fneg float %6507, !dbg !45 + %6509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2333 = icmp eq i32 %6509, 0, !dbg !45 + %6510 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3FF7154760000000, float %6508) #6, !dbg !45 + %6511 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3FF7154760000000, float %6508) #6, !dbg !45 + %.0.i2334 = select i1 %.not3.i2333, float %6511, float %6510, !dbg !45 + %6512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2335 = icmp eq i32 %6512, 0, !dbg !45 + %6513 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5825, float 0x3E54AE0C00000000, float %.0.i2334) #6, !dbg !45 + %6514 = tail call float @llvm.nvvm.fma.rn.f(float %5825, float 0x3E54AE0C00000000, float %.0.i2334) #6, !dbg !45 + %.01.i2336 = select i1 %.not4.i2335, float %6514, float %6513, !dbg !45 + %6515 = bitcast float %.04.i2332 to i32, !dbg !45 + %6516 = shl i32 %6515, 23, !dbg !45 + %6517 = bitcast i32 %6516 to float, !dbg !45 + %6518 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2336) #6, !dbg !45 + %6519 = fmul float %6518, %6517, !dbg !45 + %6520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2337 = icmp eq i32 %6520, 0, !dbg !45 + %6521 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6522 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2338 = select i1 %.not.i2337, float %6522, float %6521, !dbg !45 + %6523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2339 = icmp eq i32 %6523, 0, !dbg !45 + %6524 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2338) #6, !dbg !45 + %6525 = tail call float @llvm.nvvm.saturate.f(float %.02.i2338) #6, !dbg !45 + %.03.i2340 = select i1 %.not1.i2339, float %6525, float %6524, !dbg !45 + %6526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2341 = icmp eq i32 %6526, 0, !dbg !45 + %6527 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6528 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2340, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2342 = select i1 %.not2.i2341, float %6528, float %6527, !dbg !45 + %6529 = fadd float %.04.i2342, 0xC168000FE0000000, !dbg !45 + %6530 = fneg float %6529, !dbg !45 + %6531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2343 = icmp eq i32 %6531, 0, !dbg !45 + %6532 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3FF7154760000000, float %6530) #6, !dbg !45 + %6533 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3FF7154760000000, float %6530) #6, !dbg !45 + %.0.i2344 = select i1 %.not3.i2343, float %6533, float %6532, !dbg !45 + %6534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2345 = icmp eq i32 %6534, 0, !dbg !45 + %6535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5826, float 0x3E54AE0C00000000, float %.0.i2344) #6, !dbg !45 + %6536 = tail call float @llvm.nvvm.fma.rn.f(float %5826, float 0x3E54AE0C00000000, float %.0.i2344) #6, !dbg !45 + %.01.i2346 = select i1 %.not4.i2345, float %6536, float %6535, !dbg !45 + %6537 = bitcast float %.04.i2342 to i32, !dbg !45 + %6538 = shl i32 %6537, 23, !dbg !45 + %6539 = bitcast i32 %6538 to float, !dbg !45 + %6540 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2346) #6, !dbg !45 + %6541 = fmul float %6540, %6539, !dbg !45 + %6542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2347 = icmp eq i32 %6542, 0, !dbg !45 + %6543 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6544 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2348 = select i1 %.not.i2347, float %6544, float %6543, !dbg !45 + %6545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2349 = icmp eq i32 %6545, 0, !dbg !45 + %6546 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2348) #6, !dbg !45 + %6547 = tail call float @llvm.nvvm.saturate.f(float %.02.i2348) #6, !dbg !45 + %.03.i2350 = select i1 %.not1.i2349, float %6547, float %6546, !dbg !45 + %6548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2351 = icmp eq i32 %6548, 0, !dbg !45 + %6549 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6550 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2350, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2352 = select i1 %.not2.i2351, float %6550, float %6549, !dbg !45 + %6551 = fadd float %.04.i2352, 0xC168000FE0000000, !dbg !45 + %6552 = fneg float %6551, !dbg !45 + %6553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2353 = icmp eq i32 %6553, 0, !dbg !45 + %6554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3FF7154760000000, float %6552) #6, !dbg !45 + %6555 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3FF7154760000000, float %6552) #6, !dbg !45 + %.0.i2354 = select i1 %.not3.i2353, float %6555, float %6554, !dbg !45 + %6556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2355 = icmp eq i32 %6556, 0, !dbg !45 + %6557 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5827, float 0x3E54AE0C00000000, float %.0.i2354) #6, !dbg !45 + %6558 = tail call float @llvm.nvvm.fma.rn.f(float %5827, float 0x3E54AE0C00000000, float %.0.i2354) #6, !dbg !45 + %.01.i2356 = select i1 %.not4.i2355, float %6558, float %6557, !dbg !45 + %6559 = bitcast float %.04.i2352 to i32, !dbg !45 + %6560 = shl i32 %6559, 23, !dbg !45 + %6561 = bitcast i32 %6560 to float, !dbg !45 + %6562 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2356) #6, !dbg !45 + %6563 = fmul float %6562, %6561, !dbg !45 + %6564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2357 = icmp eq i32 %6564, 0, !dbg !45 + %6565 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6566 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2358 = select i1 %.not.i2357, float %6566, float %6565, !dbg !45 + %6567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2359 = icmp eq i32 %6567, 0, !dbg !45 + %6568 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2358) #6, !dbg !45 + %6569 = tail call float @llvm.nvvm.saturate.f(float %.02.i2358) #6, !dbg !45 + %.03.i2360 = select i1 %.not1.i2359, float %6569, float %6568, !dbg !45 + %6570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2361 = icmp eq i32 %6570, 0, !dbg !45 + %6571 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6572 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2360, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2362 = select i1 %.not2.i2361, float %6572, float %6571, !dbg !45 + %6573 = fadd float %.04.i2362, 0xC168000FE0000000, !dbg !45 + %6574 = fneg float %6573, !dbg !45 + %6575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2363 = icmp eq i32 %6575, 0, !dbg !45 + %6576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3FF7154760000000, float %6574) #6, !dbg !45 + %6577 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3FF7154760000000, float %6574) #6, !dbg !45 + %.0.i2364 = select i1 %.not3.i2363, float %6577, float %6576, !dbg !45 + %6578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2365 = icmp eq i32 %6578, 0, !dbg !45 + %6579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5828, float 0x3E54AE0C00000000, float %.0.i2364) #6, !dbg !45 + %6580 = tail call float @llvm.nvvm.fma.rn.f(float %5828, float 0x3E54AE0C00000000, float %.0.i2364) #6, !dbg !45 + %.01.i2366 = select i1 %.not4.i2365, float %6580, float %6579, !dbg !45 + %6581 = bitcast float %.04.i2362 to i32, !dbg !45 + %6582 = shl i32 %6581, 23, !dbg !45 + %6583 = bitcast i32 %6582 to float, !dbg !45 + %6584 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2366) #6, !dbg !45 + %6585 = fmul float %6584, %6583, !dbg !45 + %6586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2367 = icmp eq i32 %6586, 0, !dbg !45 + %6587 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6588 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2368 = select i1 %.not.i2367, float %6588, float %6587, !dbg !45 + %6589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2369 = icmp eq i32 %6589, 0, !dbg !45 + %6590 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2368) #6, !dbg !45 + %6591 = tail call float @llvm.nvvm.saturate.f(float %.02.i2368) #6, !dbg !45 + %.03.i2370 = select i1 %.not1.i2369, float %6591, float %6590, !dbg !45 + %6592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2371 = icmp eq i32 %6592, 0, !dbg !45 + %6593 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6594 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2370, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2372 = select i1 %.not2.i2371, float %6594, float %6593, !dbg !45 + %6595 = fadd float %.04.i2372, 0xC168000FE0000000, !dbg !45 + %6596 = fneg float %6595, !dbg !45 + %6597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2373 = icmp eq i32 %6597, 0, !dbg !45 + %6598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3FF7154760000000, float %6596) #6, !dbg !45 + %6599 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3FF7154760000000, float %6596) #6, !dbg !45 + %.0.i2374 = select i1 %.not3.i2373, float %6599, float %6598, !dbg !45 + %6600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2375 = icmp eq i32 %6600, 0, !dbg !45 + %6601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5829, float 0x3E54AE0C00000000, float %.0.i2374) #6, !dbg !45 + %6602 = tail call float @llvm.nvvm.fma.rn.f(float %5829, float 0x3E54AE0C00000000, float %.0.i2374) #6, !dbg !45 + %.01.i2376 = select i1 %.not4.i2375, float %6602, float %6601, !dbg !45 + %6603 = bitcast float %.04.i2372 to i32, !dbg !45 + %6604 = shl i32 %6603, 23, !dbg !45 + %6605 = bitcast i32 %6604 to float, !dbg !45 + %6606 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2376) #6, !dbg !45 + %6607 = fmul float %6606, %6605, !dbg !45 + %6608 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2377 = icmp eq i32 %6608, 0, !dbg !45 + %6609 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6610 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2378 = select i1 %.not.i2377, float %6610, float %6609, !dbg !45 + %6611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2379 = icmp eq i32 %6611, 0, !dbg !45 + %6612 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2378) #6, !dbg !45 + %6613 = tail call float @llvm.nvvm.saturate.f(float %.02.i2378) #6, !dbg !45 + %.03.i2380 = select i1 %.not1.i2379, float %6613, float %6612, !dbg !45 + %6614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2381 = icmp eq i32 %6614, 0, !dbg !45 + %6615 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6616 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2380, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2382 = select i1 %.not2.i2381, float %6616, float %6615, !dbg !45 + %6617 = fadd float %.04.i2382, 0xC168000FE0000000, !dbg !45 + %6618 = fneg float %6617, !dbg !45 + %6619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2383 = icmp eq i32 %6619, 0, !dbg !45 + %6620 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3FF7154760000000, float %6618) #6, !dbg !45 + %6621 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3FF7154760000000, float %6618) #6, !dbg !45 + %.0.i2384 = select i1 %.not3.i2383, float %6621, float %6620, !dbg !45 + %6622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2385 = icmp eq i32 %6622, 0, !dbg !45 + %6623 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5830, float 0x3E54AE0C00000000, float %.0.i2384) #6, !dbg !45 + %6624 = tail call float @llvm.nvvm.fma.rn.f(float %5830, float 0x3E54AE0C00000000, float %.0.i2384) #6, !dbg !45 + %.01.i2386 = select i1 %.not4.i2385, float %6624, float %6623, !dbg !45 + %6625 = bitcast float %.04.i2382 to i32, !dbg !45 + %6626 = shl i32 %6625, 23, !dbg !45 + %6627 = bitcast i32 %6626 to float, !dbg !45 + %6628 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2386) #6, !dbg !45 + %6629 = fmul float %6628, %6627, !dbg !45 + %6630 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2387 = icmp eq i32 %6630, 0, !dbg !45 + %6631 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6632 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2388 = select i1 %.not.i2387, float %6632, float %6631, !dbg !45 + %6633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2389 = icmp eq i32 %6633, 0, !dbg !45 + %6634 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2388) #6, !dbg !45 + %6635 = tail call float @llvm.nvvm.saturate.f(float %.02.i2388) #6, !dbg !45 + %.03.i2390 = select i1 %.not1.i2389, float %6635, float %6634, !dbg !45 + %6636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2391 = icmp eq i32 %6636, 0, !dbg !45 + %6637 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6638 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2390, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2392 = select i1 %.not2.i2391, float %6638, float %6637, !dbg !45 + %6639 = fadd float %.04.i2392, 0xC168000FE0000000, !dbg !45 + %6640 = fneg float %6639, !dbg !45 + %6641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2393 = icmp eq i32 %6641, 0, !dbg !45 + %6642 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3FF7154760000000, float %6640) #6, !dbg !45 + %6643 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3FF7154760000000, float %6640) #6, !dbg !45 + %.0.i2394 = select i1 %.not3.i2393, float %6643, float %6642, !dbg !45 + %6644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2395 = icmp eq i32 %6644, 0, !dbg !45 + %6645 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5831, float 0x3E54AE0C00000000, float %.0.i2394) #6, !dbg !45 + %6646 = tail call float @llvm.nvvm.fma.rn.f(float %5831, float 0x3E54AE0C00000000, float %.0.i2394) #6, !dbg !45 + %.01.i2396 = select i1 %.not4.i2395, float %6646, float %6645, !dbg !45 + %6647 = bitcast float %.04.i2392 to i32, !dbg !45 + %6648 = shl i32 %6647, 23, !dbg !45 + %6649 = bitcast i32 %6648 to float, !dbg !45 + %6650 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2396) #6, !dbg !45 + %6651 = fmul float %6650, %6649, !dbg !45 + %6652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2397 = icmp eq i32 %6652, 0, !dbg !45 + %6653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6654 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2398 = select i1 %.not.i2397, float %6654, float %6653, !dbg !45 + %6655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2399 = icmp eq i32 %6655, 0, !dbg !45 + %6656 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2398) #6, !dbg !45 + %6657 = tail call float @llvm.nvvm.saturate.f(float %.02.i2398) #6, !dbg !45 + %.03.i2400 = select i1 %.not1.i2399, float %6657, float %6656, !dbg !45 + %6658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2401 = icmp eq i32 %6658, 0, !dbg !45 + %6659 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6660 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2400, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2402 = select i1 %.not2.i2401, float %6660, float %6659, !dbg !45 + %6661 = fadd float %.04.i2402, 0xC168000FE0000000, !dbg !45 + %6662 = fneg float %6661, !dbg !45 + %6663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2403 = icmp eq i32 %6663, 0, !dbg !45 + %6664 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3FF7154760000000, float %6662) #6, !dbg !45 + %6665 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3FF7154760000000, float %6662) #6, !dbg !45 + %.0.i2404 = select i1 %.not3.i2403, float %6665, float %6664, !dbg !45 + %6666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2405 = icmp eq i32 %6666, 0, !dbg !45 + %6667 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5832, float 0x3E54AE0C00000000, float %.0.i2404) #6, !dbg !45 + %6668 = tail call float @llvm.nvvm.fma.rn.f(float %5832, float 0x3E54AE0C00000000, float %.0.i2404) #6, !dbg !45 + %.01.i2406 = select i1 %.not4.i2405, float %6668, float %6667, !dbg !45 + %6669 = bitcast float %.04.i2402 to i32, !dbg !45 + %6670 = shl i32 %6669, 23, !dbg !45 + %6671 = bitcast i32 %6670 to float, !dbg !45 + %6672 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2406) #6, !dbg !45 + %6673 = fmul float %6672, %6671, !dbg !45 + %6674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2407 = icmp eq i32 %6674, 0, !dbg !45 + %6675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6676 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2408 = select i1 %.not.i2407, float %6676, float %6675, !dbg !45 + %6677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2409 = icmp eq i32 %6677, 0, !dbg !45 + %6678 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2408) #6, !dbg !45 + %6679 = tail call float @llvm.nvvm.saturate.f(float %.02.i2408) #6, !dbg !45 + %.03.i2410 = select i1 %.not1.i2409, float %6679, float %6678, !dbg !45 + %6680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2411 = icmp eq i32 %6680, 0, !dbg !45 + %6681 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6682 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2410, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2412 = select i1 %.not2.i2411, float %6682, float %6681, !dbg !45 + %6683 = fadd float %.04.i2412, 0xC168000FE0000000, !dbg !45 + %6684 = fneg float %6683, !dbg !45 + %6685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2413 = icmp eq i32 %6685, 0, !dbg !45 + %6686 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3FF7154760000000, float %6684) #6, !dbg !45 + %6687 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3FF7154760000000, float %6684) #6, !dbg !45 + %.0.i2414 = select i1 %.not3.i2413, float %6687, float %6686, !dbg !45 + %6688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2415 = icmp eq i32 %6688, 0, !dbg !45 + %6689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5833, float 0x3E54AE0C00000000, float %.0.i2414) #6, !dbg !45 + %6690 = tail call float @llvm.nvvm.fma.rn.f(float %5833, float 0x3E54AE0C00000000, float %.0.i2414) #6, !dbg !45 + %.01.i2416 = select i1 %.not4.i2415, float %6690, float %6689, !dbg !45 + %6691 = bitcast float %.04.i2412 to i32, !dbg !45 + %6692 = shl i32 %6691, 23, !dbg !45 + %6693 = bitcast i32 %6692 to float, !dbg !45 + %6694 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2416) #6, !dbg !45 + %6695 = fmul float %6694, %6693, !dbg !45 + %6696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2417 = icmp eq i32 %6696, 0, !dbg !45 + %6697 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6698 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2418 = select i1 %.not.i2417, float %6698, float %6697, !dbg !45 + %6699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2419 = icmp eq i32 %6699, 0, !dbg !45 + %6700 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2418) #6, !dbg !45 + %6701 = tail call float @llvm.nvvm.saturate.f(float %.02.i2418) #6, !dbg !45 + %.03.i2420 = select i1 %.not1.i2419, float %6701, float %6700, !dbg !45 + %6702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2421 = icmp eq i32 %6702, 0, !dbg !45 + %6703 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6704 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2420, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2422 = select i1 %.not2.i2421, float %6704, float %6703, !dbg !45 + %6705 = fadd float %.04.i2422, 0xC168000FE0000000, !dbg !45 + %6706 = fneg float %6705, !dbg !45 + %6707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2423 = icmp eq i32 %6707, 0, !dbg !45 + %6708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3FF7154760000000, float %6706) #6, !dbg !45 + %6709 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3FF7154760000000, float %6706) #6, !dbg !45 + %.0.i2424 = select i1 %.not3.i2423, float %6709, float %6708, !dbg !45 + %6710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2425 = icmp eq i32 %6710, 0, !dbg !45 + %6711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5834, float 0x3E54AE0C00000000, float %.0.i2424) #6, !dbg !45 + %6712 = tail call float @llvm.nvvm.fma.rn.f(float %5834, float 0x3E54AE0C00000000, float %.0.i2424) #6, !dbg !45 + %.01.i2426 = select i1 %.not4.i2425, float %6712, float %6711, !dbg !45 + %6713 = bitcast float %.04.i2422 to i32, !dbg !45 + %6714 = shl i32 %6713, 23, !dbg !45 + %6715 = bitcast i32 %6714 to float, !dbg !45 + %6716 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2426) #6, !dbg !45 + %6717 = fmul float %6716, %6715, !dbg !45 + %6718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2427 = icmp eq i32 %6718, 0, !dbg !45 + %6719 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6720 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2428 = select i1 %.not.i2427, float %6720, float %6719, !dbg !45 + %6721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2429 = icmp eq i32 %6721, 0, !dbg !45 + %6722 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2428) #6, !dbg !45 + %6723 = tail call float @llvm.nvvm.saturate.f(float %.02.i2428) #6, !dbg !45 + %.03.i2430 = select i1 %.not1.i2429, float %6723, float %6722, !dbg !45 + %6724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2431 = icmp eq i32 %6724, 0, !dbg !45 + %6725 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6726 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2430, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2432 = select i1 %.not2.i2431, float %6726, float %6725, !dbg !45 + %6727 = fadd float %.04.i2432, 0xC168000FE0000000, !dbg !45 + %6728 = fneg float %6727, !dbg !45 + %6729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2433 = icmp eq i32 %6729, 0, !dbg !45 + %6730 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3FF7154760000000, float %6728) #6, !dbg !45 + %6731 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3FF7154760000000, float %6728) #6, !dbg !45 + %.0.i2434 = select i1 %.not3.i2433, float %6731, float %6730, !dbg !45 + %6732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2435 = icmp eq i32 %6732, 0, !dbg !45 + %6733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5835, float 0x3E54AE0C00000000, float %.0.i2434) #6, !dbg !45 + %6734 = tail call float @llvm.nvvm.fma.rn.f(float %5835, float 0x3E54AE0C00000000, float %.0.i2434) #6, !dbg !45 + %.01.i2436 = select i1 %.not4.i2435, float %6734, float %6733, !dbg !45 + %6735 = bitcast float %.04.i2432 to i32, !dbg !45 + %6736 = shl i32 %6735, 23, !dbg !45 + %6737 = bitcast i32 %6736 to float, !dbg !45 + %6738 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2436) #6, !dbg !45 + %6739 = fmul float %6738, %6737, !dbg !45 + %6740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2437 = icmp eq i32 %6740, 0, !dbg !45 + %6741 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6742 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2438 = select i1 %.not.i2437, float %6742, float %6741, !dbg !45 + %6743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2439 = icmp eq i32 %6743, 0, !dbg !45 + %6744 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2438) #6, !dbg !45 + %6745 = tail call float @llvm.nvvm.saturate.f(float %.02.i2438) #6, !dbg !45 + %.03.i2440 = select i1 %.not1.i2439, float %6745, float %6744, !dbg !45 + %6746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2441 = icmp eq i32 %6746, 0, !dbg !45 + %6747 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6748 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2440, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2442 = select i1 %.not2.i2441, float %6748, float %6747, !dbg !45 + %6749 = fadd float %.04.i2442, 0xC168000FE0000000, !dbg !45 + %6750 = fneg float %6749, !dbg !45 + %6751 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2443 = icmp eq i32 %6751, 0, !dbg !45 + %6752 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3FF7154760000000, float %6750) #6, !dbg !45 + %6753 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3FF7154760000000, float %6750) #6, !dbg !45 + %.0.i2444 = select i1 %.not3.i2443, float %6753, float %6752, !dbg !45 + %6754 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2445 = icmp eq i32 %6754, 0, !dbg !45 + %6755 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5836, float 0x3E54AE0C00000000, float %.0.i2444) #6, !dbg !45 + %6756 = tail call float @llvm.nvvm.fma.rn.f(float %5836, float 0x3E54AE0C00000000, float %.0.i2444) #6, !dbg !45 + %.01.i2446 = select i1 %.not4.i2445, float %6756, float %6755, !dbg !45 + %6757 = bitcast float %.04.i2442 to i32, !dbg !45 + %6758 = shl i32 %6757, 23, !dbg !45 + %6759 = bitcast i32 %6758 to float, !dbg !45 + %6760 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2446) #6, !dbg !45 + %6761 = fmul float %6760, %6759, !dbg !45 + %6762 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2447 = icmp eq i32 %6762, 0, !dbg !45 + %6763 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6764 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2448 = select i1 %.not.i2447, float %6764, float %6763, !dbg !45 + %6765 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2449 = icmp eq i32 %6765, 0, !dbg !45 + %6766 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2448) #6, !dbg !45 + %6767 = tail call float @llvm.nvvm.saturate.f(float %.02.i2448) #6, !dbg !45 + %.03.i2450 = select i1 %.not1.i2449, float %6767, float %6766, !dbg !45 + %6768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2451 = icmp eq i32 %6768, 0, !dbg !45 + %6769 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6770 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2450, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2452 = select i1 %.not2.i2451, float %6770, float %6769, !dbg !45 + %6771 = fadd float %.04.i2452, 0xC168000FE0000000, !dbg !45 + %6772 = fneg float %6771, !dbg !45 + %6773 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2453 = icmp eq i32 %6773, 0, !dbg !45 + %6774 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3FF7154760000000, float %6772) #6, !dbg !45 + %6775 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3FF7154760000000, float %6772) #6, !dbg !45 + %.0.i2454 = select i1 %.not3.i2453, float %6775, float %6774, !dbg !45 + %6776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2455 = icmp eq i32 %6776, 0, !dbg !45 + %6777 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5837, float 0x3E54AE0C00000000, float %.0.i2454) #6, !dbg !45 + %6778 = tail call float @llvm.nvvm.fma.rn.f(float %5837, float 0x3E54AE0C00000000, float %.0.i2454) #6, !dbg !45 + %.01.i2456 = select i1 %.not4.i2455, float %6778, float %6777, !dbg !45 + %6779 = bitcast float %.04.i2452 to i32, !dbg !45 + %6780 = shl i32 %6779, 23, !dbg !45 + %6781 = bitcast i32 %6780 to float, !dbg !45 + %6782 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2456) #6, !dbg !45 + %6783 = fmul float %6782, %6781, !dbg !45 + %6784 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2457 = icmp eq i32 %6784, 0, !dbg !45 + %6785 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6786 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2458 = select i1 %.not.i2457, float %6786, float %6785, !dbg !45 + %6787 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2459 = icmp eq i32 %6787, 0, !dbg !45 + %6788 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2458) #6, !dbg !45 + %6789 = tail call float @llvm.nvvm.saturate.f(float %.02.i2458) #6, !dbg !45 + %.03.i2460 = select i1 %.not1.i2459, float %6789, float %6788, !dbg !45 + %6790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2461 = icmp eq i32 %6790, 0, !dbg !45 + %6791 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6792 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2460, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2462 = select i1 %.not2.i2461, float %6792, float %6791, !dbg !45 + %6793 = fadd float %.04.i2462, 0xC168000FE0000000, !dbg !45 + %6794 = fneg float %6793, !dbg !45 + %6795 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2463 = icmp eq i32 %6795, 0, !dbg !45 + %6796 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3FF7154760000000, float %6794) #6, !dbg !45 + %6797 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3FF7154760000000, float %6794) #6, !dbg !45 + %.0.i2464 = select i1 %.not3.i2463, float %6797, float %6796, !dbg !45 + %6798 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2465 = icmp eq i32 %6798, 0, !dbg !45 + %6799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5838, float 0x3E54AE0C00000000, float %.0.i2464) #6, !dbg !45 + %6800 = tail call float @llvm.nvvm.fma.rn.f(float %5838, float 0x3E54AE0C00000000, float %.0.i2464) #6, !dbg !45 + %.01.i2466 = select i1 %.not4.i2465, float %6800, float %6799, !dbg !45 + %6801 = bitcast float %.04.i2462 to i32, !dbg !45 + %6802 = shl i32 %6801, 23, !dbg !45 + %6803 = bitcast i32 %6802 to float, !dbg !45 + %6804 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2466) #6, !dbg !45 + %6805 = fmul float %6804, %6803, !dbg !45 + %6806 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2467 = icmp eq i32 %6806, 0, !dbg !45 + %6807 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6808 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2468 = select i1 %.not.i2467, float %6808, float %6807, !dbg !45 + %6809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2469 = icmp eq i32 %6809, 0, !dbg !45 + %6810 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2468) #6, !dbg !45 + %6811 = tail call float @llvm.nvvm.saturate.f(float %.02.i2468) #6, !dbg !45 + %.03.i2470 = select i1 %.not1.i2469, float %6811, float %6810, !dbg !45 + %6812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2471 = icmp eq i32 %6812, 0, !dbg !45 + %6813 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6814 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2470, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2472 = select i1 %.not2.i2471, float %6814, float %6813, !dbg !45 + %6815 = fadd float %.04.i2472, 0xC168000FE0000000, !dbg !45 + %6816 = fneg float %6815, !dbg !45 + %6817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2473 = icmp eq i32 %6817, 0, !dbg !45 + %6818 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3FF7154760000000, float %6816) #6, !dbg !45 + %6819 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3FF7154760000000, float %6816) #6, !dbg !45 + %.0.i2474 = select i1 %.not3.i2473, float %6819, float %6818, !dbg !45 + %6820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2475 = icmp eq i32 %6820, 0, !dbg !45 + %6821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5839, float 0x3E54AE0C00000000, float %.0.i2474) #6, !dbg !45 + %6822 = tail call float @llvm.nvvm.fma.rn.f(float %5839, float 0x3E54AE0C00000000, float %.0.i2474) #6, !dbg !45 + %.01.i2476 = select i1 %.not4.i2475, float %6822, float %6821, !dbg !45 + %6823 = bitcast float %.04.i2472 to i32, !dbg !45 + %6824 = shl i32 %6823, 23, !dbg !45 + %6825 = bitcast i32 %6824 to float, !dbg !45 + %6826 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2476) #6, !dbg !45 + %6827 = fmul float %6826, %6825, !dbg !45 + %6828 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2477 = icmp eq i32 %6828, 0, !dbg !45 + %6829 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6830 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2478 = select i1 %.not.i2477, float %6830, float %6829, !dbg !45 + %6831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2479 = icmp eq i32 %6831, 0, !dbg !45 + %6832 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2478) #6, !dbg !45 + %6833 = tail call float @llvm.nvvm.saturate.f(float %.02.i2478) #6, !dbg !45 + %.03.i2480 = select i1 %.not1.i2479, float %6833, float %6832, !dbg !45 + %6834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2481 = icmp eq i32 %6834, 0, !dbg !45 + %6835 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6836 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2480, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2482 = select i1 %.not2.i2481, float %6836, float %6835, !dbg !45 + %6837 = fadd float %.04.i2482, 0xC168000FE0000000, !dbg !45 + %6838 = fneg float %6837, !dbg !45 + %6839 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2483 = icmp eq i32 %6839, 0, !dbg !45 + %6840 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3FF7154760000000, float %6838) #6, !dbg !45 + %6841 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3FF7154760000000, float %6838) #6, !dbg !45 + %.0.i2484 = select i1 %.not3.i2483, float %6841, float %6840, !dbg !45 + %6842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2485 = icmp eq i32 %6842, 0, !dbg !45 + %6843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5840, float 0x3E54AE0C00000000, float %.0.i2484) #6, !dbg !45 + %6844 = tail call float @llvm.nvvm.fma.rn.f(float %5840, float 0x3E54AE0C00000000, float %.0.i2484) #6, !dbg !45 + %.01.i2486 = select i1 %.not4.i2485, float %6844, float %6843, !dbg !45 + %6845 = bitcast float %.04.i2482 to i32, !dbg !45 + %6846 = shl i32 %6845, 23, !dbg !45 + %6847 = bitcast i32 %6846 to float, !dbg !45 + %6848 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2486) #6, !dbg !45 + %6849 = fmul float %6848, %6847, !dbg !45 + %6850 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2487 = icmp eq i32 %6850, 0, !dbg !45 + %6851 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6852 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2488 = select i1 %.not.i2487, float %6852, float %6851, !dbg !45 + %6853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2489 = icmp eq i32 %6853, 0, !dbg !45 + %6854 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2488) #6, !dbg !45 + %6855 = tail call float @llvm.nvvm.saturate.f(float %.02.i2488) #6, !dbg !45 + %.03.i2490 = select i1 %.not1.i2489, float %6855, float %6854, !dbg !45 + %6856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2491 = icmp eq i32 %6856, 0, !dbg !45 + %6857 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6858 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2490, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2492 = select i1 %.not2.i2491, float %6858, float %6857, !dbg !45 + %6859 = fadd float %.04.i2492, 0xC168000FE0000000, !dbg !45 + %6860 = fneg float %6859, !dbg !45 + %6861 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2493 = icmp eq i32 %6861, 0, !dbg !45 + %6862 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3FF7154760000000, float %6860) #6, !dbg !45 + %6863 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3FF7154760000000, float %6860) #6, !dbg !45 + %.0.i2494 = select i1 %.not3.i2493, float %6863, float %6862, !dbg !45 + %6864 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2495 = icmp eq i32 %6864, 0, !dbg !45 + %6865 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5841, float 0x3E54AE0C00000000, float %.0.i2494) #6, !dbg !45 + %6866 = tail call float @llvm.nvvm.fma.rn.f(float %5841, float 0x3E54AE0C00000000, float %.0.i2494) #6, !dbg !45 + %.01.i2496 = select i1 %.not4.i2495, float %6866, float %6865, !dbg !45 + %6867 = bitcast float %.04.i2492 to i32, !dbg !45 + %6868 = shl i32 %6867, 23, !dbg !45 + %6869 = bitcast i32 %6868 to float, !dbg !45 + %6870 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2496) #6, !dbg !45 + %6871 = fmul float %6870, %6869, !dbg !45 + %6872 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2497 = icmp eq i32 %6872, 0, !dbg !45 + %6873 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6874 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2498 = select i1 %.not.i2497, float %6874, float %6873, !dbg !45 + %6875 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2499 = icmp eq i32 %6875, 0, !dbg !45 + %6876 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2498) #6, !dbg !45 + %6877 = tail call float @llvm.nvvm.saturate.f(float %.02.i2498) #6, !dbg !45 + %.03.i2500 = select i1 %.not1.i2499, float %6877, float %6876, !dbg !45 + %6878 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2501 = icmp eq i32 %6878, 0, !dbg !45 + %6879 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6880 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2500, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2502 = select i1 %.not2.i2501, float %6880, float %6879, !dbg !45 + %6881 = fadd float %.04.i2502, 0xC168000FE0000000, !dbg !45 + %6882 = fneg float %6881, !dbg !45 + %6883 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2503 = icmp eq i32 %6883, 0, !dbg !45 + %6884 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3FF7154760000000, float %6882) #6, !dbg !45 + %6885 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3FF7154760000000, float %6882) #6, !dbg !45 + %.0.i2504 = select i1 %.not3.i2503, float %6885, float %6884, !dbg !45 + %6886 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2505 = icmp eq i32 %6886, 0, !dbg !45 + %6887 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5842, float 0x3E54AE0C00000000, float %.0.i2504) #6, !dbg !45 + %6888 = tail call float @llvm.nvvm.fma.rn.f(float %5842, float 0x3E54AE0C00000000, float %.0.i2504) #6, !dbg !45 + %.01.i2506 = select i1 %.not4.i2505, float %6888, float %6887, !dbg !45 + %6889 = bitcast float %.04.i2502 to i32, !dbg !45 + %6890 = shl i32 %6889, 23, !dbg !45 + %6891 = bitcast i32 %6890 to float, !dbg !45 + %6892 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2506) #6, !dbg !45 + %6893 = fmul float %6892, %6891, !dbg !45 + %6894 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2507 = icmp eq i32 %6894, 0, !dbg !45 + %6895 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6896 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2508 = select i1 %.not.i2507, float %6896, float %6895, !dbg !45 + %6897 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2509 = icmp eq i32 %6897, 0, !dbg !45 + %6898 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2508) #6, !dbg !45 + %6899 = tail call float @llvm.nvvm.saturate.f(float %.02.i2508) #6, !dbg !45 + %.03.i2510 = select i1 %.not1.i2509, float %6899, float %6898, !dbg !45 + %6900 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2511 = icmp eq i32 %6900, 0, !dbg !45 + %6901 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6902 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2510, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2512 = select i1 %.not2.i2511, float %6902, float %6901, !dbg !45 + %6903 = fadd float %.04.i2512, 0xC168000FE0000000, !dbg !45 + %6904 = fneg float %6903, !dbg !45 + %6905 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2513 = icmp eq i32 %6905, 0, !dbg !45 + %6906 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3FF7154760000000, float %6904) #6, !dbg !45 + %6907 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3FF7154760000000, float %6904) #6, !dbg !45 + %.0.i2514 = select i1 %.not3.i2513, float %6907, float %6906, !dbg !45 + %6908 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2515 = icmp eq i32 %6908, 0, !dbg !45 + %6909 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5843, float 0x3E54AE0C00000000, float %.0.i2514) #6, !dbg !45 + %6910 = tail call float @llvm.nvvm.fma.rn.f(float %5843, float 0x3E54AE0C00000000, float %.0.i2514) #6, !dbg !45 + %.01.i2516 = select i1 %.not4.i2515, float %6910, float %6909, !dbg !45 + %6911 = bitcast float %.04.i2512 to i32, !dbg !45 + %6912 = shl i32 %6911, 23, !dbg !45 + %6913 = bitcast i32 %6912 to float, !dbg !45 + %6914 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2516) #6, !dbg !45 + %6915 = fmul float %6914, %6913, !dbg !45 + %6916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2517 = icmp eq i32 %6916, 0, !dbg !45 + %6917 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6918 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2518 = select i1 %.not.i2517, float %6918, float %6917, !dbg !45 + %6919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2519 = icmp eq i32 %6919, 0, !dbg !45 + %6920 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2518) #6, !dbg !45 + %6921 = tail call float @llvm.nvvm.saturate.f(float %.02.i2518) #6, !dbg !45 + %.03.i2520 = select i1 %.not1.i2519, float %6921, float %6920, !dbg !45 + %6922 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2521 = icmp eq i32 %6922, 0, !dbg !45 + %6923 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6924 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2520, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2522 = select i1 %.not2.i2521, float %6924, float %6923, !dbg !45 + %6925 = fadd float %.04.i2522, 0xC168000FE0000000, !dbg !45 + %6926 = fneg float %6925, !dbg !45 + %6927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2523 = icmp eq i32 %6927, 0, !dbg !45 + %6928 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3FF7154760000000, float %6926) #6, !dbg !45 + %6929 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3FF7154760000000, float %6926) #6, !dbg !45 + %.0.i2524 = select i1 %.not3.i2523, float %6929, float %6928, !dbg !45 + %6930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2525 = icmp eq i32 %6930, 0, !dbg !45 + %6931 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5844, float 0x3E54AE0C00000000, float %.0.i2524) #6, !dbg !45 + %6932 = tail call float @llvm.nvvm.fma.rn.f(float %5844, float 0x3E54AE0C00000000, float %.0.i2524) #6, !dbg !45 + %.01.i2526 = select i1 %.not4.i2525, float %6932, float %6931, !dbg !45 + %6933 = bitcast float %.04.i2522 to i32, !dbg !45 + %6934 = shl i32 %6933, 23, !dbg !45 + %6935 = bitcast i32 %6934 to float, !dbg !45 + %6936 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2526) #6, !dbg !45 + %6937 = fmul float %6936, %6935, !dbg !45 + %6938 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2527 = icmp eq i32 %6938, 0, !dbg !45 + %6939 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6940 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2528 = select i1 %.not.i2527, float %6940, float %6939, !dbg !45 + %6941 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2529 = icmp eq i32 %6941, 0, !dbg !45 + %6942 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2528) #6, !dbg !45 + %6943 = tail call float @llvm.nvvm.saturate.f(float %.02.i2528) #6, !dbg !45 + %.03.i2530 = select i1 %.not1.i2529, float %6943, float %6942, !dbg !45 + %6944 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2531 = icmp eq i32 %6944, 0, !dbg !45 + %6945 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6946 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2530, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2532 = select i1 %.not2.i2531, float %6946, float %6945, !dbg !45 + %6947 = fadd float %.04.i2532, 0xC168000FE0000000, !dbg !45 + %6948 = fneg float %6947, !dbg !45 + %6949 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2533 = icmp eq i32 %6949, 0, !dbg !45 + %6950 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3FF7154760000000, float %6948) #6, !dbg !45 + %6951 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3FF7154760000000, float %6948) #6, !dbg !45 + %.0.i2534 = select i1 %.not3.i2533, float %6951, float %6950, !dbg !45 + %6952 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2535 = icmp eq i32 %6952, 0, !dbg !45 + %6953 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5845, float 0x3E54AE0C00000000, float %.0.i2534) #6, !dbg !45 + %6954 = tail call float @llvm.nvvm.fma.rn.f(float %5845, float 0x3E54AE0C00000000, float %.0.i2534) #6, !dbg !45 + %.01.i2536 = select i1 %.not4.i2535, float %6954, float %6953, !dbg !45 + %6955 = bitcast float %.04.i2532 to i32, !dbg !45 + %6956 = shl i32 %6955, 23, !dbg !45 + %6957 = bitcast i32 %6956 to float, !dbg !45 + %6958 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2536) #6, !dbg !45 + %6959 = fmul float %6958, %6957, !dbg !45 + %6960 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2537 = icmp eq i32 %6960, 0, !dbg !45 + %6961 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6962 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2538 = select i1 %.not.i2537, float %6962, float %6961, !dbg !45 + %6963 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2539 = icmp eq i32 %6963, 0, !dbg !45 + %6964 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2538) #6, !dbg !45 + %6965 = tail call float @llvm.nvvm.saturate.f(float %.02.i2538) #6, !dbg !45 + %.03.i2540 = select i1 %.not1.i2539, float %6965, float %6964, !dbg !45 + %6966 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2541 = icmp eq i32 %6966, 0, !dbg !45 + %6967 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6968 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2540, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2542 = select i1 %.not2.i2541, float %6968, float %6967, !dbg !45 + %6969 = fadd float %.04.i2542, 0xC168000FE0000000, !dbg !45 + %6970 = fneg float %6969, !dbg !45 + %6971 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2543 = icmp eq i32 %6971, 0, !dbg !45 + %6972 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3FF7154760000000, float %6970) #6, !dbg !45 + %6973 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3FF7154760000000, float %6970) #6, !dbg !45 + %.0.i2544 = select i1 %.not3.i2543, float %6973, float %6972, !dbg !45 + %6974 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2545 = icmp eq i32 %6974, 0, !dbg !45 + %6975 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5846, float 0x3E54AE0C00000000, float %.0.i2544) #6, !dbg !45 + %6976 = tail call float @llvm.nvvm.fma.rn.f(float %5846, float 0x3E54AE0C00000000, float %.0.i2544) #6, !dbg !45 + %.01.i2546 = select i1 %.not4.i2545, float %6976, float %6975, !dbg !45 + %6977 = bitcast float %.04.i2542 to i32, !dbg !45 + %6978 = shl i32 %6977, 23, !dbg !45 + %6979 = bitcast i32 %6978 to float, !dbg !45 + %6980 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2546) #6, !dbg !45 + %6981 = fmul float %6980, %6979, !dbg !45 + %6982 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2547 = icmp eq i32 %6982, 0, !dbg !45 + %6983 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %6984 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2548 = select i1 %.not.i2547, float %6984, float %6983, !dbg !45 + %6985 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2549 = icmp eq i32 %6985, 0, !dbg !45 + %6986 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2548) #6, !dbg !45 + %6987 = tail call float @llvm.nvvm.saturate.f(float %.02.i2548) #6, !dbg !45 + %.03.i2550 = select i1 %.not1.i2549, float %6987, float %6986, !dbg !45 + %6988 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2551 = icmp eq i32 %6988, 0, !dbg !45 + %6989 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %6990 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2550, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2552 = select i1 %.not2.i2551, float %6990, float %6989, !dbg !45 + %6991 = fadd float %.04.i2552, 0xC168000FE0000000, !dbg !45 + %6992 = fneg float %6991, !dbg !45 + %6993 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2553 = icmp eq i32 %6993, 0, !dbg !45 + %6994 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3FF7154760000000, float %6992) #6, !dbg !45 + %6995 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3FF7154760000000, float %6992) #6, !dbg !45 + %.0.i2554 = select i1 %.not3.i2553, float %6995, float %6994, !dbg !45 + %6996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2555 = icmp eq i32 %6996, 0, !dbg !45 + %6997 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5847, float 0x3E54AE0C00000000, float %.0.i2554) #6, !dbg !45 + %6998 = tail call float @llvm.nvvm.fma.rn.f(float %5847, float 0x3E54AE0C00000000, float %.0.i2554) #6, !dbg !45 + %.01.i2556 = select i1 %.not4.i2555, float %6998, float %6997, !dbg !45 + %6999 = bitcast float %.04.i2552 to i32, !dbg !45 + %7000 = shl i32 %6999, 23, !dbg !45 + %7001 = bitcast i32 %7000 to float, !dbg !45 + %7002 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2556) #6, !dbg !45 + %7003 = fmul float %7002, %7001, !dbg !45 + %7004 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2557 = icmp eq i32 %7004, 0, !dbg !45 + %7005 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7006 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2558 = select i1 %.not.i2557, float %7006, float %7005, !dbg !45 + %7007 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2559 = icmp eq i32 %7007, 0, !dbg !45 + %7008 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2558) #6, !dbg !45 + %7009 = tail call float @llvm.nvvm.saturate.f(float %.02.i2558) #6, !dbg !45 + %.03.i2560 = select i1 %.not1.i2559, float %7009, float %7008, !dbg !45 + %7010 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2561 = icmp eq i32 %7010, 0, !dbg !45 + %7011 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7012 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2560, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2562 = select i1 %.not2.i2561, float %7012, float %7011, !dbg !45 + %7013 = fadd float %.04.i2562, 0xC168000FE0000000, !dbg !45 + %7014 = fneg float %7013, !dbg !45 + %7015 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2563 = icmp eq i32 %7015, 0, !dbg !45 + %7016 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3FF7154760000000, float %7014) #6, !dbg !45 + %7017 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3FF7154760000000, float %7014) #6, !dbg !45 + %.0.i2564 = select i1 %.not3.i2563, float %7017, float %7016, !dbg !45 + %7018 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2565 = icmp eq i32 %7018, 0, !dbg !45 + %7019 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5848, float 0x3E54AE0C00000000, float %.0.i2564) #6, !dbg !45 + %7020 = tail call float @llvm.nvvm.fma.rn.f(float %5848, float 0x3E54AE0C00000000, float %.0.i2564) #6, !dbg !45 + %.01.i2566 = select i1 %.not4.i2565, float %7020, float %7019, !dbg !45 + %7021 = bitcast float %.04.i2562 to i32, !dbg !45 + %7022 = shl i32 %7021, 23, !dbg !45 + %7023 = bitcast i32 %7022 to float, !dbg !45 + %7024 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2566) #6, !dbg !45 + %7025 = fmul float %7024, %7023, !dbg !45 + %7026 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2567 = icmp eq i32 %7026, 0, !dbg !45 + %7027 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7028 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2568 = select i1 %.not.i2567, float %7028, float %7027, !dbg !45 + %7029 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2569 = icmp eq i32 %7029, 0, !dbg !45 + %7030 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2568) #6, !dbg !45 + %7031 = tail call float @llvm.nvvm.saturate.f(float %.02.i2568) #6, !dbg !45 + %.03.i2570 = select i1 %.not1.i2569, float %7031, float %7030, !dbg !45 + %7032 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2571 = icmp eq i32 %7032, 0, !dbg !45 + %7033 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7034 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2570, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2572 = select i1 %.not2.i2571, float %7034, float %7033, !dbg !45 + %7035 = fadd float %.04.i2572, 0xC168000FE0000000, !dbg !45 + %7036 = fneg float %7035, !dbg !45 + %7037 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2573 = icmp eq i32 %7037, 0, !dbg !45 + %7038 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3FF7154760000000, float %7036) #6, !dbg !45 + %7039 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3FF7154760000000, float %7036) #6, !dbg !45 + %.0.i2574 = select i1 %.not3.i2573, float %7039, float %7038, !dbg !45 + %7040 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2575 = icmp eq i32 %7040, 0, !dbg !45 + %7041 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5849, float 0x3E54AE0C00000000, float %.0.i2574) #6, !dbg !45 + %7042 = tail call float @llvm.nvvm.fma.rn.f(float %5849, float 0x3E54AE0C00000000, float %.0.i2574) #6, !dbg !45 + %.01.i2576 = select i1 %.not4.i2575, float %7042, float %7041, !dbg !45 + %7043 = bitcast float %.04.i2572 to i32, !dbg !45 + %7044 = shl i32 %7043, 23, !dbg !45 + %7045 = bitcast i32 %7044 to float, !dbg !45 + %7046 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2576) #6, !dbg !45 + %7047 = fmul float %7046, %7045, !dbg !45 + %7048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2577 = icmp eq i32 %7048, 0, !dbg !45 + %7049 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7050 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2578 = select i1 %.not.i2577, float %7050, float %7049, !dbg !45 + %7051 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2579 = icmp eq i32 %7051, 0, !dbg !45 + %7052 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2578) #6, !dbg !45 + %7053 = tail call float @llvm.nvvm.saturate.f(float %.02.i2578) #6, !dbg !45 + %.03.i2580 = select i1 %.not1.i2579, float %7053, float %7052, !dbg !45 + %7054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2581 = icmp eq i32 %7054, 0, !dbg !45 + %7055 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7056 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2580, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2582 = select i1 %.not2.i2581, float %7056, float %7055, !dbg !45 + %7057 = fadd float %.04.i2582, 0xC168000FE0000000, !dbg !45 + %7058 = fneg float %7057, !dbg !45 + %7059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2583 = icmp eq i32 %7059, 0, !dbg !45 + %7060 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3FF7154760000000, float %7058) #6, !dbg !45 + %7061 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3FF7154760000000, float %7058) #6, !dbg !45 + %.0.i2584 = select i1 %.not3.i2583, float %7061, float %7060, !dbg !45 + %7062 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2585 = icmp eq i32 %7062, 0, !dbg !45 + %7063 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5850, float 0x3E54AE0C00000000, float %.0.i2584) #6, !dbg !45 + %7064 = tail call float @llvm.nvvm.fma.rn.f(float %5850, float 0x3E54AE0C00000000, float %.0.i2584) #6, !dbg !45 + %.01.i2586 = select i1 %.not4.i2585, float %7064, float %7063, !dbg !45 + %7065 = bitcast float %.04.i2582 to i32, !dbg !45 + %7066 = shl i32 %7065, 23, !dbg !45 + %7067 = bitcast i32 %7066 to float, !dbg !45 + %7068 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2586) #6, !dbg !45 + %7069 = fmul float %7068, %7067, !dbg !45 + %7070 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2587 = icmp eq i32 %7070, 0, !dbg !45 + %7071 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7072 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2588 = select i1 %.not.i2587, float %7072, float %7071, !dbg !45 + %7073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2589 = icmp eq i32 %7073, 0, !dbg !45 + %7074 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2588) #6, !dbg !45 + %7075 = tail call float @llvm.nvvm.saturate.f(float %.02.i2588) #6, !dbg !45 + %.03.i2590 = select i1 %.not1.i2589, float %7075, float %7074, !dbg !45 + %7076 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2591 = icmp eq i32 %7076, 0, !dbg !45 + %7077 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7078 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2590, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2592 = select i1 %.not2.i2591, float %7078, float %7077, !dbg !45 + %7079 = fadd float %.04.i2592, 0xC168000FE0000000, !dbg !45 + %7080 = fneg float %7079, !dbg !45 + %7081 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2593 = icmp eq i32 %7081, 0, !dbg !45 + %7082 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3FF7154760000000, float %7080) #6, !dbg !45 + %7083 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3FF7154760000000, float %7080) #6, !dbg !45 + %.0.i2594 = select i1 %.not3.i2593, float %7083, float %7082, !dbg !45 + %7084 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2595 = icmp eq i32 %7084, 0, !dbg !45 + %7085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5851, float 0x3E54AE0C00000000, float %.0.i2594) #6, !dbg !45 + %7086 = tail call float @llvm.nvvm.fma.rn.f(float %5851, float 0x3E54AE0C00000000, float %.0.i2594) #6, !dbg !45 + %.01.i2596 = select i1 %.not4.i2595, float %7086, float %7085, !dbg !45 + %7087 = bitcast float %.04.i2592 to i32, !dbg !45 + %7088 = shl i32 %7087, 23, !dbg !45 + %7089 = bitcast i32 %7088 to float, !dbg !45 + %7090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2596) #6, !dbg !45 + %7091 = fmul float %7090, %7089, !dbg !45 + %7092 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2597 = icmp eq i32 %7092, 0, !dbg !45 + %7093 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7094 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2598 = select i1 %.not.i2597, float %7094, float %7093, !dbg !45 + %7095 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2599 = icmp eq i32 %7095, 0, !dbg !45 + %7096 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2598) #6, !dbg !45 + %7097 = tail call float @llvm.nvvm.saturate.f(float %.02.i2598) #6, !dbg !45 + %.03.i2600 = select i1 %.not1.i2599, float %7097, float %7096, !dbg !45 + %7098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2601 = icmp eq i32 %7098, 0, !dbg !45 + %7099 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7100 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2600, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2602 = select i1 %.not2.i2601, float %7100, float %7099, !dbg !45 + %7101 = fadd float %.04.i2602, 0xC168000FE0000000, !dbg !45 + %7102 = fneg float %7101, !dbg !45 + %7103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2603 = icmp eq i32 %7103, 0, !dbg !45 + %7104 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3FF7154760000000, float %7102) #6, !dbg !45 + %7105 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3FF7154760000000, float %7102) #6, !dbg !45 + %.0.i2604 = select i1 %.not3.i2603, float %7105, float %7104, !dbg !45 + %7106 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2605 = icmp eq i32 %7106, 0, !dbg !45 + %7107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5852, float 0x3E54AE0C00000000, float %.0.i2604) #6, !dbg !45 + %7108 = tail call float @llvm.nvvm.fma.rn.f(float %5852, float 0x3E54AE0C00000000, float %.0.i2604) #6, !dbg !45 + %.01.i2606 = select i1 %.not4.i2605, float %7108, float %7107, !dbg !45 + %7109 = bitcast float %.04.i2602 to i32, !dbg !45 + %7110 = shl i32 %7109, 23, !dbg !45 + %7111 = bitcast i32 %7110 to float, !dbg !45 + %7112 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2606) #6, !dbg !45 + %7113 = fmul float %7112, %7111, !dbg !45 + %7114 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2607 = icmp eq i32 %7114, 0, !dbg !45 + %7115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7116 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2608 = select i1 %.not.i2607, float %7116, float %7115, !dbg !45 + %7117 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2609 = icmp eq i32 %7117, 0, !dbg !45 + %7118 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2608) #6, !dbg !45 + %7119 = tail call float @llvm.nvvm.saturate.f(float %.02.i2608) #6, !dbg !45 + %.03.i2610 = select i1 %.not1.i2609, float %7119, float %7118, !dbg !45 + %7120 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2611 = icmp eq i32 %7120, 0, !dbg !45 + %7121 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7122 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2610, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2612 = select i1 %.not2.i2611, float %7122, float %7121, !dbg !45 + %7123 = fadd float %.04.i2612, 0xC168000FE0000000, !dbg !45 + %7124 = fneg float %7123, !dbg !45 + %7125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2613 = icmp eq i32 %7125, 0, !dbg !45 + %7126 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3FF7154760000000, float %7124) #6, !dbg !45 + %7127 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3FF7154760000000, float %7124) #6, !dbg !45 + %.0.i2614 = select i1 %.not3.i2613, float %7127, float %7126, !dbg !45 + %7128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2615 = icmp eq i32 %7128, 0, !dbg !45 + %7129 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5853, float 0x3E54AE0C00000000, float %.0.i2614) #6, !dbg !45 + %7130 = tail call float @llvm.nvvm.fma.rn.f(float %5853, float 0x3E54AE0C00000000, float %.0.i2614) #6, !dbg !45 + %.01.i2616 = select i1 %.not4.i2615, float %7130, float %7129, !dbg !45 + %7131 = bitcast float %.04.i2612 to i32, !dbg !45 + %7132 = shl i32 %7131, 23, !dbg !45 + %7133 = bitcast i32 %7132 to float, !dbg !45 + %7134 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2616) #6, !dbg !45 + %7135 = fmul float %7134, %7133, !dbg !45 + %7136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2617 = icmp eq i32 %7136, 0, !dbg !45 + %7137 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7138 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2618 = select i1 %.not.i2617, float %7138, float %7137, !dbg !45 + %7139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2619 = icmp eq i32 %7139, 0, !dbg !45 + %7140 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2618) #6, !dbg !45 + %7141 = tail call float @llvm.nvvm.saturate.f(float %.02.i2618) #6, !dbg !45 + %.03.i2620 = select i1 %.not1.i2619, float %7141, float %7140, !dbg !45 + %7142 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2621 = icmp eq i32 %7142, 0, !dbg !45 + %7143 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7144 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2620, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2622 = select i1 %.not2.i2621, float %7144, float %7143, !dbg !45 + %7145 = fadd float %.04.i2622, 0xC168000FE0000000, !dbg !45 + %7146 = fneg float %7145, !dbg !45 + %7147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2623 = icmp eq i32 %7147, 0, !dbg !45 + %7148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3FF7154760000000, float %7146) #6, !dbg !45 + %7149 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3FF7154760000000, float %7146) #6, !dbg !45 + %.0.i2624 = select i1 %.not3.i2623, float %7149, float %7148, !dbg !45 + %7150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2625 = icmp eq i32 %7150, 0, !dbg !45 + %7151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5854, float 0x3E54AE0C00000000, float %.0.i2624) #6, !dbg !45 + %7152 = tail call float @llvm.nvvm.fma.rn.f(float %5854, float 0x3E54AE0C00000000, float %.0.i2624) #6, !dbg !45 + %.01.i2626 = select i1 %.not4.i2625, float %7152, float %7151, !dbg !45 + %7153 = bitcast float %.04.i2622 to i32, !dbg !45 + %7154 = shl i32 %7153, 23, !dbg !45 + %7155 = bitcast i32 %7154 to float, !dbg !45 + %7156 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2626) #6, !dbg !45 + %7157 = fmul float %7156, %7155, !dbg !45 + %7158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2627 = icmp eq i32 %7158, 0, !dbg !45 + %7159 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7160 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2628 = select i1 %.not.i2627, float %7160, float %7159, !dbg !45 + %7161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2629 = icmp eq i32 %7161, 0, !dbg !45 + %7162 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2628) #6, !dbg !45 + %7163 = tail call float @llvm.nvvm.saturate.f(float %.02.i2628) #6, !dbg !45 + %.03.i2630 = select i1 %.not1.i2629, float %7163, float %7162, !dbg !45 + %7164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2631 = icmp eq i32 %7164, 0, !dbg !45 + %7165 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7166 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2630, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2632 = select i1 %.not2.i2631, float %7166, float %7165, !dbg !45 + %7167 = fadd float %.04.i2632, 0xC168000FE0000000, !dbg !45 + %7168 = fneg float %7167, !dbg !45 + %7169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2633 = icmp eq i32 %7169, 0, !dbg !45 + %7170 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3FF7154760000000, float %7168) #6, !dbg !45 + %7171 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3FF7154760000000, float %7168) #6, !dbg !45 + %.0.i2634 = select i1 %.not3.i2633, float %7171, float %7170, !dbg !45 + %7172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2635 = icmp eq i32 %7172, 0, !dbg !45 + %7173 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5855, float 0x3E54AE0C00000000, float %.0.i2634) #6, !dbg !45 + %7174 = tail call float @llvm.nvvm.fma.rn.f(float %5855, float 0x3E54AE0C00000000, float %.0.i2634) #6, !dbg !45 + %.01.i2636 = select i1 %.not4.i2635, float %7174, float %7173, !dbg !45 + %7175 = bitcast float %.04.i2632 to i32, !dbg !45 + %7176 = shl i32 %7175, 23, !dbg !45 + %7177 = bitcast i32 %7176 to float, !dbg !45 + %7178 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2636) #6, !dbg !45 + %7179 = fmul float %7178, %7177, !dbg !45 + %7180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2637 = icmp eq i32 %7180, 0, !dbg !45 + %7181 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7182 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2638 = select i1 %.not.i2637, float %7182, float %7181, !dbg !45 + %7183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2639 = icmp eq i32 %7183, 0, !dbg !45 + %7184 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2638) #6, !dbg !45 + %7185 = tail call float @llvm.nvvm.saturate.f(float %.02.i2638) #6, !dbg !45 + %.03.i2640 = select i1 %.not1.i2639, float %7185, float %7184, !dbg !45 + %7186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2641 = icmp eq i32 %7186, 0, !dbg !45 + %7187 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7188 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2640, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2642 = select i1 %.not2.i2641, float %7188, float %7187, !dbg !45 + %7189 = fadd float %.04.i2642, 0xC168000FE0000000, !dbg !45 + %7190 = fneg float %7189, !dbg !45 + %7191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2643 = icmp eq i32 %7191, 0, !dbg !45 + %7192 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3FF7154760000000, float %7190) #6, !dbg !45 + %7193 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3FF7154760000000, float %7190) #6, !dbg !45 + %.0.i2644 = select i1 %.not3.i2643, float %7193, float %7192, !dbg !45 + %7194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2645 = icmp eq i32 %7194, 0, !dbg !45 + %7195 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5856, float 0x3E54AE0C00000000, float %.0.i2644) #6, !dbg !45 + %7196 = tail call float @llvm.nvvm.fma.rn.f(float %5856, float 0x3E54AE0C00000000, float %.0.i2644) #6, !dbg !45 + %.01.i2646 = select i1 %.not4.i2645, float %7196, float %7195, !dbg !45 + %7197 = bitcast float %.04.i2642 to i32, !dbg !45 + %7198 = shl i32 %7197, 23, !dbg !45 + %7199 = bitcast i32 %7198 to float, !dbg !45 + %7200 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2646) #6, !dbg !45 + %7201 = fmul float %7200, %7199, !dbg !45 + %7202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2647 = icmp eq i32 %7202, 0, !dbg !45 + %7203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7204 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2648 = select i1 %.not.i2647, float %7204, float %7203, !dbg !45 + %7205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2649 = icmp eq i32 %7205, 0, !dbg !45 + %7206 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2648) #6, !dbg !45 + %7207 = tail call float @llvm.nvvm.saturate.f(float %.02.i2648) #6, !dbg !45 + %.03.i2650 = select i1 %.not1.i2649, float %7207, float %7206, !dbg !45 + %7208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2651 = icmp eq i32 %7208, 0, !dbg !45 + %7209 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7210 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2650, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2652 = select i1 %.not2.i2651, float %7210, float %7209, !dbg !45 + %7211 = fadd float %.04.i2652, 0xC168000FE0000000, !dbg !45 + %7212 = fneg float %7211, !dbg !45 + %7213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2653 = icmp eq i32 %7213, 0, !dbg !45 + %7214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3FF7154760000000, float %7212) #6, !dbg !45 + %7215 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3FF7154760000000, float %7212) #6, !dbg !45 + %.0.i2654 = select i1 %.not3.i2653, float %7215, float %7214, !dbg !45 + %7216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2655 = icmp eq i32 %7216, 0, !dbg !45 + %7217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5857, float 0x3E54AE0C00000000, float %.0.i2654) #6, !dbg !45 + %7218 = tail call float @llvm.nvvm.fma.rn.f(float %5857, float 0x3E54AE0C00000000, float %.0.i2654) #6, !dbg !45 + %.01.i2656 = select i1 %.not4.i2655, float %7218, float %7217, !dbg !45 + %7219 = bitcast float %.04.i2652 to i32, !dbg !45 + %7220 = shl i32 %7219, 23, !dbg !45 + %7221 = bitcast i32 %7220 to float, !dbg !45 + %7222 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2656) #6, !dbg !45 + %7223 = fmul float %7222, %7221, !dbg !45 + %7224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2657 = icmp eq i32 %7224, 0, !dbg !45 + %7225 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7226 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2658 = select i1 %.not.i2657, float %7226, float %7225, !dbg !45 + %7227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2659 = icmp eq i32 %7227, 0, !dbg !45 + %7228 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2658) #6, !dbg !45 + %7229 = tail call float @llvm.nvvm.saturate.f(float %.02.i2658) #6, !dbg !45 + %.03.i2660 = select i1 %.not1.i2659, float %7229, float %7228, !dbg !45 + %7230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2661 = icmp eq i32 %7230, 0, !dbg !45 + %7231 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7232 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2660, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2662 = select i1 %.not2.i2661, float %7232, float %7231, !dbg !45 + %7233 = fadd float %.04.i2662, 0xC168000FE0000000, !dbg !45 + %7234 = fneg float %7233, !dbg !45 + %7235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2663 = icmp eq i32 %7235, 0, !dbg !45 + %7236 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3FF7154760000000, float %7234) #6, !dbg !45 + %7237 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3FF7154760000000, float %7234) #6, !dbg !45 + %.0.i2664 = select i1 %.not3.i2663, float %7237, float %7236, !dbg !45 + %7238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2665 = icmp eq i32 %7238, 0, !dbg !45 + %7239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5858, float 0x3E54AE0C00000000, float %.0.i2664) #6, !dbg !45 + %7240 = tail call float @llvm.nvvm.fma.rn.f(float %5858, float 0x3E54AE0C00000000, float %.0.i2664) #6, !dbg !45 + %.01.i2666 = select i1 %.not4.i2665, float %7240, float %7239, !dbg !45 + %7241 = bitcast float %.04.i2662 to i32, !dbg !45 + %7242 = shl i32 %7241, 23, !dbg !45 + %7243 = bitcast i32 %7242 to float, !dbg !45 + %7244 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2666) #6, !dbg !45 + %7245 = fmul float %7244, %7243, !dbg !45 + %7246 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not.i2667 = icmp eq i32 %7246, 0, !dbg !45 + %7247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %7248 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !45 + %.02.i2668 = select i1 %.not.i2667, float %7248, float %7247, !dbg !45 + %7249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not1.i2669 = icmp eq i32 %7249, 0, !dbg !45 + %7250 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i2668) #6, !dbg !45 + %7251 = tail call float @llvm.nvvm.saturate.f(float %.02.i2668) #6, !dbg !45 + %.03.i2670 = select i1 %.not1.i2669, float %7251, float %7250, !dbg !45 + %7252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not2.i2671 = icmp eq i32 %7252, 0, !dbg !45 + %7253 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i2670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %7254 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i2670, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !45 + %.04.i2672 = select i1 %.not2.i2671, float %7254, float %7253, !dbg !45 + %7255 = fadd float %.04.i2672, 0xC168000FE0000000, !dbg !45 + %7256 = fneg float %7255, !dbg !45 + %7257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not3.i2673 = icmp eq i32 %7257, 0, !dbg !45 + %7258 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3FF7154760000000, float %7256) #6, !dbg !45 + %7259 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3FF7154760000000, float %7256) #6, !dbg !45 + %.0.i2674 = select i1 %.not3.i2673, float %7259, float %7258, !dbg !45 + %7260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !45 + %.not4.i2675 = icmp eq i32 %7260, 0, !dbg !45 + %7261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %5859, float 0x3E54AE0C00000000, float %.0.i2674) #6, !dbg !45 + %7262 = tail call float @llvm.nvvm.fma.rn.f(float %5859, float 0x3E54AE0C00000000, float %.0.i2674) #6, !dbg !45 + %.01.i2676 = select i1 %.not4.i2675, float %7262, float %7261, !dbg !45 + %7263 = bitcast float %.04.i2672 to i32, !dbg !45 + %7264 = shl i32 %7263, 23, !dbg !45 + %7265 = bitcast i32 %7264 to float, !dbg !45 + %7266 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i2676) #6, !dbg !45 + %7267 = fmul float %7266, %7265, !dbg !45 + %7268 = tail call float @llvm.nvvm.div.full(float %5881, float %5587), !dbg !46 + %7269 = tail call float @llvm.nvvm.div.full(float %5903, float %5587), !dbg !46 + %7270 = tail call float @llvm.nvvm.div.full(float %5925, float %5587), !dbg !46 + %7271 = tail call float @llvm.nvvm.div.full(float %5947, float %5587), !dbg !46 + %7272 = tail call float @llvm.nvvm.div.full(float %5969, float %5587), !dbg !46 + %7273 = tail call float @llvm.nvvm.div.full(float %5991, float %5587), !dbg !46 + %7274 = tail call float @llvm.nvvm.div.full(float %6013, float %5587), !dbg !46 + %7275 = tail call float @llvm.nvvm.div.full(float %6035, float %5587), !dbg !46 + %7276 = tail call float @llvm.nvvm.div.full(float %6057, float %5587), !dbg !46 + %7277 = tail call float @llvm.nvvm.div.full(float %6079, float %5587), !dbg !46 + %7278 = tail call float @llvm.nvvm.div.full(float %6101, float %5587), !dbg !46 + %7279 = tail call float @llvm.nvvm.div.full(float %6123, float %5587), !dbg !46 + %7280 = tail call float @llvm.nvvm.div.full(float %6145, float %5587), !dbg !46 + %7281 = tail call float @llvm.nvvm.div.full(float %6167, float %5587), !dbg !46 + %7282 = tail call float @llvm.nvvm.div.full(float %6189, float %5587), !dbg !46 + %7283 = tail call float @llvm.nvvm.div.full(float %6211, float %5587), !dbg !46 + %7284 = tail call float @llvm.nvvm.div.full(float %6233, float %5587), !dbg !46 + %7285 = tail call float @llvm.nvvm.div.full(float %6255, float %5587), !dbg !46 + %7286 = tail call float @llvm.nvvm.div.full(float %6277, float %5587), !dbg !46 + %7287 = tail call float @llvm.nvvm.div.full(float %6299, float %5587), !dbg !46 + %7288 = tail call float @llvm.nvvm.div.full(float %6321, float %5587), !dbg !46 + %7289 = tail call float @llvm.nvvm.div.full(float %6343, float %5587), !dbg !46 + %7290 = tail call float @llvm.nvvm.div.full(float %6365, float %5587), !dbg !46 + %7291 = tail call float @llvm.nvvm.div.full(float %6387, float %5587), !dbg !46 + %7292 = tail call float @llvm.nvvm.div.full(float %6409, float %5587), !dbg !46 + %7293 = tail call float @llvm.nvvm.div.full(float %6431, float %5587), !dbg !46 + %7294 = tail call float @llvm.nvvm.div.full(float %6453, float %5587), !dbg !46 + %7295 = tail call float @llvm.nvvm.div.full(float %6475, float %5587), !dbg !46 + %7296 = tail call float @llvm.nvvm.div.full(float %6497, float %5587), !dbg !46 + %7297 = tail call float @llvm.nvvm.div.full(float %6519, float %5587), !dbg !46 + %7298 = tail call float @llvm.nvvm.div.full(float %6541, float %5587), !dbg !46 + %7299 = tail call float @llvm.nvvm.div.full(float %6563, float %5587), !dbg !46 + %7300 = tail call float @llvm.nvvm.div.full(float %6585, float %5587), !dbg !46 + %7301 = tail call float @llvm.nvvm.div.full(float %6607, float %5587), !dbg !46 + %7302 = tail call float @llvm.nvvm.div.full(float %6629, float %5587), !dbg !46 + %7303 = tail call float @llvm.nvvm.div.full(float %6651, float %5587), !dbg !46 + %7304 = tail call float @llvm.nvvm.div.full(float %6673, float %5587), !dbg !46 + %7305 = tail call float @llvm.nvvm.div.full(float %6695, float %5587), !dbg !46 + %7306 = tail call float @llvm.nvvm.div.full(float %6717, float %5587), !dbg !46 + %7307 = tail call float @llvm.nvvm.div.full(float %6739, float %5587), !dbg !46 + %7308 = tail call float @llvm.nvvm.div.full(float %6761, float %5587), !dbg !46 + %7309 = tail call float @llvm.nvvm.div.full(float %6783, float %5587), !dbg !46 + %7310 = tail call float @llvm.nvvm.div.full(float %6805, float %5587), !dbg !46 + %7311 = tail call float @llvm.nvvm.div.full(float %6827, float %5587), !dbg !46 + %7312 = tail call float @llvm.nvvm.div.full(float %6849, float %5587), !dbg !46 + %7313 = tail call float @llvm.nvvm.div.full(float %6871, float %5587), !dbg !46 + %7314 = tail call float @llvm.nvvm.div.full(float %6893, float %5587), !dbg !46 + %7315 = tail call float @llvm.nvvm.div.full(float %6915, float %5587), !dbg !46 + %7316 = tail call float @llvm.nvvm.div.full(float %6937, float %5587), !dbg !46 + %7317 = tail call float @llvm.nvvm.div.full(float %6959, float %5587), !dbg !46 + %7318 = tail call float @llvm.nvvm.div.full(float %6981, float %5587), !dbg !46 + %7319 = tail call float @llvm.nvvm.div.full(float %7003, float %5587), !dbg !46 + %7320 = tail call float @llvm.nvvm.div.full(float %7025, float %5587), !dbg !46 + %7321 = tail call float @llvm.nvvm.div.full(float %7047, float %5587), !dbg !46 + %7322 = tail call float @llvm.nvvm.div.full(float %7069, float %5587), !dbg !46 + %7323 = tail call float @llvm.nvvm.div.full(float %7091, float %5587), !dbg !46 + %7324 = tail call float @llvm.nvvm.div.full(float %7113, float %5587), !dbg !46 + %7325 = tail call float @llvm.nvvm.div.full(float %7135, float %5587), !dbg !46 + %7326 = tail call float @llvm.nvvm.div.full(float %7157, float %5587), !dbg !46 + %7327 = tail call float @llvm.nvvm.div.full(float %7179, float %5587), !dbg !46 + %7328 = tail call float @llvm.nvvm.div.full(float %7201, float %5587), !dbg !46 + %7329 = tail call float @llvm.nvvm.div.full(float %7223, float %5587), !dbg !46 + %7330 = tail call float @llvm.nvvm.div.full(float %7245, float %5587), !dbg !46 + %7331 = tail call float @llvm.nvvm.div.full(float %7267, float %5587), !dbg !46 + %7332 = sext i32 %33 to i64, !dbg !47 + %7333 = getelementptr float, ptr addrspace(1) %1, i64 %7332, !dbg !47 + %7334 = sext i32 %35 to i64, !dbg !47 + %7335 = getelementptr float, ptr addrspace(1) %1, i64 %7334, !dbg !47 + %7336 = sext i32 %36 to i64, !dbg !47 + %7337 = getelementptr float, ptr addrspace(1) %1, i64 %7336, !dbg !47 + %7338 = sext i32 %38 to i64, !dbg !47 + %7339 = getelementptr float, ptr addrspace(1) %1, i64 %7338, !dbg !47 + %7340 = sext i32 %39 to i64, !dbg !47 + %7341 = getelementptr float, ptr addrspace(1) %1, i64 %7340, !dbg !47 + %7342 = sext i32 %41 to i64, !dbg !47 + %7343 = getelementptr float, ptr addrspace(1) %1, i64 %7342, !dbg !47 + %7344 = sext i32 %42 to i64, !dbg !47 + %7345 = getelementptr float, ptr addrspace(1) %1, i64 %7344, !dbg !47 + %7346 = sext i32 %44 to i64, !dbg !47 + %7347 = getelementptr float, ptr addrspace(1) %1, i64 %7346, !dbg !47 + %7348 = sext i32 %45 to i64, !dbg !47 + %7349 = getelementptr float, ptr addrspace(1) %1, i64 %7348, !dbg !47 + %7350 = sext i32 %47 to i64, !dbg !47 + %7351 = getelementptr float, ptr addrspace(1) %1, i64 %7350, !dbg !47 + %7352 = sext i32 %48 to i64, !dbg !47 + %7353 = getelementptr float, ptr addrspace(1) %1, i64 %7352, !dbg !47 + %7354 = sext i32 %50 to i64, !dbg !47 + %7355 = getelementptr float, ptr addrspace(1) %1, i64 %7354, !dbg !47 + %7356 = sext i32 %51 to i64, !dbg !47 + %7357 = getelementptr float, ptr addrspace(1) %1, i64 %7356, !dbg !47 + %7358 = sext i32 %53 to i64, !dbg !47 + %7359 = getelementptr float, ptr addrspace(1) %1, i64 %7358, !dbg !47 + %7360 = sext i32 %55 to i64, !dbg !47 + %7361 = getelementptr float, ptr addrspace(1) %1, i64 %7360, !dbg !47 + %7362 = sext i32 %56 to i64, !dbg !47 + %7363 = getelementptr float, ptr addrspace(1) %1, i64 %7362, !dbg !47 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7364 = shl nuw nsw i32 %11, 4, !dbg !48 + %7365 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7364, !dbg !48 + %7366 = insertelement <4 x float> poison, float %7268, i64 0, !dbg !48 + %7367 = insertelement <4 x float> %7366, float %7269, i64 1, !dbg !48 + %7368 = insertelement <4 x float> %7367, float %7270, i64 2, !dbg !48 + %7369 = insertelement <4 x float> %7368, float %7271, i64 3, !dbg !48 + store <4 x float> %7369, ptr addrspace(3) %7365, align 16, !dbg !48 + %7370 = xor i32 %7364, 8256, !dbg !48 + %7371 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7370, !dbg !48 + %7372 = insertelement <4 x float> poison, float %7272, i64 0, !dbg !48 + %7373 = insertelement <4 x float> %7372, float %7273, i64 1, !dbg !48 + %7374 = insertelement <4 x float> %7373, float %7274, i64 2, !dbg !48 + %7375 = insertelement <4 x float> %7374, float %7275, i64 3, !dbg !48 + store <4 x float> %7375, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7376 = shl nuw nsw i32 %8, 3, !dbg !48 + %7377 = and i32 %7376, 4080, !dbg !48 + %7378 = and i32 %8, 1, !dbg !48 + %7379 = icmp eq i32 %7378, 0, !dbg !48 + %7380 = select i1 %7379, i32 0, i32 8256, !dbg !48 + %7381 = xor i32 %7380, %7377, !dbg !48 + %7382 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %7381, !dbg !48 + %7383 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7384 = getelementptr inbounds nuw i8, ptr addrspace(3) %7382, i32 4096, !dbg !48 + %7385 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7386 = insertelement <4 x float> poison, float %7276, i64 0, !dbg !48 + %7387 = insertelement <4 x float> %7386, float %7277, i64 1, !dbg !48 + %7388 = insertelement <4 x float> %7387, float %7278, i64 2, !dbg !48 + %7389 = insertelement <4 x float> %7388, float %7279, i64 3, !dbg !48 + store <4 x float> %7389, ptr addrspace(3) %7365, align 16, !dbg !48 + %7390 = insertelement <4 x float> poison, float %7280, i64 0, !dbg !48 + %7391 = insertelement <4 x float> %7390, float %7281, i64 1, !dbg !48 + %7392 = insertelement <4 x float> %7391, float %7282, i64 2, !dbg !48 + %7393 = insertelement <4 x float> %7392, float %7283, i64 3, !dbg !48 + store <4 x float> %7393, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7394 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7395 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7396 = insertelement <4 x float> poison, float %7284, i64 0, !dbg !48 + %7397 = insertelement <4 x float> %7396, float %7285, i64 1, !dbg !48 + %7398 = insertelement <4 x float> %7397, float %7286, i64 2, !dbg !48 + %7399 = insertelement <4 x float> %7398, float %7287, i64 3, !dbg !48 + store <4 x float> %7399, ptr addrspace(3) %7365, align 16, !dbg !48 + %7400 = insertelement <4 x float> poison, float %7288, i64 0, !dbg !48 + %7401 = insertelement <4 x float> %7400, float %7289, i64 1, !dbg !48 + %7402 = insertelement <4 x float> %7401, float %7290, i64 2, !dbg !48 + %7403 = insertelement <4 x float> %7402, float %7291, i64 3, !dbg !48 + store <4 x float> %7403, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7404 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7405 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7406 = insertelement <4 x float> poison, float %7292, i64 0, !dbg !48 + %7407 = insertelement <4 x float> %7406, float %7293, i64 1, !dbg !48 + %7408 = insertelement <4 x float> %7407, float %7294, i64 2, !dbg !48 + %7409 = insertelement <4 x float> %7408, float %7295, i64 3, !dbg !48 + store <4 x float> %7409, ptr addrspace(3) %7365, align 16, !dbg !48 + %7410 = insertelement <4 x float> poison, float %7296, i64 0, !dbg !48 + %7411 = insertelement <4 x float> %7410, float %7297, i64 1, !dbg !48 + %7412 = insertelement <4 x float> %7411, float %7298, i64 2, !dbg !48 + %7413 = insertelement <4 x float> %7412, float %7299, i64 3, !dbg !48 + store <4 x float> %7413, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7414 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7415 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7416 = insertelement <4 x float> poison, float %7300, i64 0, !dbg !48 + %7417 = insertelement <4 x float> %7416, float %7301, i64 1, !dbg !48 + %7418 = insertelement <4 x float> %7417, float %7302, i64 2, !dbg !48 + %7419 = insertelement <4 x float> %7418, float %7303, i64 3, !dbg !48 + store <4 x float> %7419, ptr addrspace(3) %7365, align 16, !dbg !48 + %7420 = insertelement <4 x float> poison, float %7304, i64 0, !dbg !48 + %7421 = insertelement <4 x float> %7420, float %7305, i64 1, !dbg !48 + %7422 = insertelement <4 x float> %7421, float %7306, i64 2, !dbg !48 + %7423 = insertelement <4 x float> %7422, float %7307, i64 3, !dbg !48 + store <4 x float> %7423, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7424 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7425 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7426 = insertelement <4 x float> poison, float %7308, i64 0, !dbg !48 + %7427 = insertelement <4 x float> %7426, float %7309, i64 1, !dbg !48 + %7428 = insertelement <4 x float> %7427, float %7310, i64 2, !dbg !48 + %7429 = insertelement <4 x float> %7428, float %7311, i64 3, !dbg !48 + store <4 x float> %7429, ptr addrspace(3) %7365, align 16, !dbg !48 + %7430 = insertelement <4 x float> poison, float %7312, i64 0, !dbg !48 + %7431 = insertelement <4 x float> %7430, float %7313, i64 1, !dbg !48 + %7432 = insertelement <4 x float> %7431, float %7314, i64 2, !dbg !48 + %7433 = insertelement <4 x float> %7432, float %7315, i64 3, !dbg !48 + store <4 x float> %7433, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7434 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7435 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7436 = insertelement <4 x float> poison, float %7316, i64 0, !dbg !48 + %7437 = insertelement <4 x float> %7436, float %7317, i64 1, !dbg !48 + %7438 = insertelement <4 x float> %7437, float %7318, i64 2, !dbg !48 + %7439 = insertelement <4 x float> %7438, float %7319, i64 3, !dbg !48 + store <4 x float> %7439, ptr addrspace(3) %7365, align 16, !dbg !48 + %7440 = insertelement <4 x float> poison, float %7320, i64 0, !dbg !48 + %7441 = insertelement <4 x float> %7440, float %7321, i64 1, !dbg !48 + %7442 = insertelement <4 x float> %7441, float %7322, i64 2, !dbg !48 + %7443 = insertelement <4 x float> %7442, float %7323, i64 3, !dbg !48 + store <4 x float> %7443, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7444 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7445 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7446 = insertelement <4 x float> poison, float %7324, i64 0, !dbg !48 + %7447 = insertelement <4 x float> %7446, float %7325, i64 1, !dbg !48 + %7448 = insertelement <4 x float> %7447, float %7326, i64 2, !dbg !48 + %7449 = insertelement <4 x float> %7448, float %7327, i64 3, !dbg !48 + store <4 x float> %7449, ptr addrspace(3) %7365, align 16, !dbg !48 + %7450 = insertelement <4 x float> poison, float %7328, i64 0, !dbg !48 + %7451 = insertelement <4 x float> %7450, float %7329, i64 1, !dbg !48 + %7452 = insertelement <4 x float> %7451, float %7330, i64 2, !dbg !48 + %7453 = insertelement <4 x float> %7452, float %7331, i64 3, !dbg !48 + store <4 x float> %7453, ptr addrspace(3) %7371, align 16, !dbg !48 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !48 + %7454 = load <4 x i32>, ptr addrspace(3) %7382, align 16, !dbg !48 + %7455 = load <4 x i32>, ptr addrspace(3) %7384, align 16, !dbg !48 + %.extract = extractelement <4 x i32> %7383, i64 0, !dbg !48 + %.extract64 = extractelement <4 x i32> %7383, i64 1, !dbg !48 + %.extract65 = extractelement <4 x i32> %7383, i64 2, !dbg !48 + %.extract66 = extractelement <4 x i32> %7383, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract, i32 %.extract64, i32 %.extract65, i32 %.extract66, ptr addrspace(1) %7333, i1 true) #6, !dbg !48 + %.extract67 = extractelement <4 x i32> %7385, i64 0, !dbg !48 + %.extract68 = extractelement <4 x i32> %7385, i64 1, !dbg !48 + %.extract69 = extractelement <4 x i32> %7385, i64 2, !dbg !48 + %.extract70 = extractelement <4 x i32> %7385, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract67, i32 %.extract68, i32 %.extract69, i32 %.extract70, ptr addrspace(1) %7335, i1 true) #6, !dbg !48 + %.extract71 = extractelement <4 x i32> %7394, i64 0, !dbg !48 + %.extract72 = extractelement <4 x i32> %7394, i64 1, !dbg !48 + %.extract73 = extractelement <4 x i32> %7394, i64 2, !dbg !48 + %.extract74 = extractelement <4 x i32> %7394, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract71, i32 %.extract72, i32 %.extract73, i32 %.extract74, ptr addrspace(1) %7337, i1 true) #6, !dbg !48 + %.extract75 = extractelement <4 x i32> %7395, i64 0, !dbg !48 + %.extract76 = extractelement <4 x i32> %7395, i64 1, !dbg !48 + %.extract77 = extractelement <4 x i32> %7395, i64 2, !dbg !48 + %.extract78 = extractelement <4 x i32> %7395, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract75, i32 %.extract76, i32 %.extract77, i32 %.extract78, ptr addrspace(1) %7339, i1 true) #6, !dbg !48 + %.extract79 = extractelement <4 x i32> %7404, i64 0, !dbg !48 + %.extract80 = extractelement <4 x i32> %7404, i64 1, !dbg !48 + %.extract81 = extractelement <4 x i32> %7404, i64 2, !dbg !48 + %.extract82 = extractelement <4 x i32> %7404, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract79, i32 %.extract80, i32 %.extract81, i32 %.extract82, ptr addrspace(1) %7341, i1 true) #6, !dbg !48 + %.extract83 = extractelement <4 x i32> %7405, i64 0, !dbg !48 + %.extract84 = extractelement <4 x i32> %7405, i64 1, !dbg !48 + %.extract85 = extractelement <4 x i32> %7405, i64 2, !dbg !48 + %.extract86 = extractelement <4 x i32> %7405, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract83, i32 %.extract84, i32 %.extract85, i32 %.extract86, ptr addrspace(1) %7343, i1 true) #6, !dbg !48 + %.extract87 = extractelement <4 x i32> %7414, i64 0, !dbg !48 + %.extract88 = extractelement <4 x i32> %7414, i64 1, !dbg !48 + %.extract89 = extractelement <4 x i32> %7414, i64 2, !dbg !48 + %.extract90 = extractelement <4 x i32> %7414, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract87, i32 %.extract88, i32 %.extract89, i32 %.extract90, ptr addrspace(1) %7345, i1 true) #6, !dbg !48 + %.extract91 = extractelement <4 x i32> %7415, i64 0, !dbg !48 + %.extract92 = extractelement <4 x i32> %7415, i64 1, !dbg !48 + %.extract93 = extractelement <4 x i32> %7415, i64 2, !dbg !48 + %.extract94 = extractelement <4 x i32> %7415, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract91, i32 %.extract92, i32 %.extract93, i32 %.extract94, ptr addrspace(1) %7347, i1 true) #6, !dbg !48 + %.extract95 = extractelement <4 x i32> %7424, i64 0, !dbg !48 + %.extract96 = extractelement <4 x i32> %7424, i64 1, !dbg !48 + %.extract97 = extractelement <4 x i32> %7424, i64 2, !dbg !48 + %.extract98 = extractelement <4 x i32> %7424, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract95, i32 %.extract96, i32 %.extract97, i32 %.extract98, ptr addrspace(1) %7349, i1 true) #6, !dbg !48 + %.extract99 = extractelement <4 x i32> %7425, i64 0, !dbg !48 + %.extract100 = extractelement <4 x i32> %7425, i64 1, !dbg !48 + %.extract101 = extractelement <4 x i32> %7425, i64 2, !dbg !48 + %.extract102 = extractelement <4 x i32> %7425, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract99, i32 %.extract100, i32 %.extract101, i32 %.extract102, ptr addrspace(1) %7351, i1 true) #6, !dbg !48 + %.extract103 = extractelement <4 x i32> %7434, i64 0, !dbg !48 + %.extract104 = extractelement <4 x i32> %7434, i64 1, !dbg !48 + %.extract105 = extractelement <4 x i32> %7434, i64 2, !dbg !48 + %.extract106 = extractelement <4 x i32> %7434, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract103, i32 %.extract104, i32 %.extract105, i32 %.extract106, ptr addrspace(1) %7353, i1 true) #6, !dbg !48 + %.extract107 = extractelement <4 x i32> %7435, i64 0, !dbg !48 + %.extract108 = extractelement <4 x i32> %7435, i64 1, !dbg !48 + %.extract109 = extractelement <4 x i32> %7435, i64 2, !dbg !48 + %.extract110 = extractelement <4 x i32> %7435, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract107, i32 %.extract108, i32 %.extract109, i32 %.extract110, ptr addrspace(1) %7355, i1 true) #6, !dbg !48 + %.extract111 = extractelement <4 x i32> %7444, i64 0, !dbg !48 + %.extract112 = extractelement <4 x i32> %7444, i64 1, !dbg !48 + %.extract113 = extractelement <4 x i32> %7444, i64 2, !dbg !48 + %.extract114 = extractelement <4 x i32> %7444, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract111, i32 %.extract112, i32 %.extract113, i32 %.extract114, ptr addrspace(1) %7357, i1 true) #6, !dbg !48 + %.extract115 = extractelement <4 x i32> %7445, i64 0, !dbg !48 + %.extract116 = extractelement <4 x i32> %7445, i64 1, !dbg !48 + %.extract117 = extractelement <4 x i32> %7445, i64 2, !dbg !48 + %.extract118 = extractelement <4 x i32> %7445, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract115, i32 %.extract116, i32 %.extract117, i32 %.extract118, ptr addrspace(1) %7359, i1 true) #6, !dbg !48 + %.extract119 = extractelement <4 x i32> %7454, i64 0, !dbg !48 + %.extract120 = extractelement <4 x i32> %7454, i64 1, !dbg !48 + %.extract121 = extractelement <4 x i32> %7454, i64 2, !dbg !48 + %.extract122 = extractelement <4 x i32> %7454, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract119, i32 %.extract120, i32 %.extract121, i32 %.extract122, ptr addrspace(1) %7361, i1 true) #6, !dbg !48 + %.extract123 = extractelement <4 x i32> %7455, i64 0, !dbg !48 + %.extract124 = extractelement <4 x i32> %7455, i64 1, !dbg !48 + %.extract125 = extractelement <4 x i32> %7455, i64 2, !dbg !48 + %.extract126 = extractelement <4 x i32> %7455, i64 3, !dbg !48 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract123, i32 %.extract124, i32 %.extract125, i32 %.extract126, ptr addrspace(1) %7363, i1 %17) #6, !dbg !48 + ret void, !dbg !49 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 33, column: 29, scope: !5) +!11 = !DILocation(line: 37, column: 47, scope: !5) +!12 = !DILocation(line: 37, column: 41, scope: !5) +!13 = !DILocation(line: 37, column: 34, scope: !5) +!14 = !DILocation(line: 37, column: 52, scope: !5) +!15 = !DILocation(line: 37, column: 105, scope: !5) +!16 = !DILocation(line: 196, column: 19, scope: !17, inlinedAt: !19) +!17 = distinct !DILexicalBlockFile(scope: !5, file: !18, discriminator: 0) +!18 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!19 = !DILocation(line: 42, column: 40, scope: !5) +!20 = !DILocation(line: 196, column: 53, scope: !17, inlinedAt: !19) +!21 = !DILocation(line: 173, column: 29, scope: !17, inlinedAt: !19) +!22 = !DILocation(line: 199, column: 53, scope: !17, inlinedAt: !19) +!23 = !DILocation(line: 205, column: 24, scope: !17, inlinedAt: !19) +!24 = !DILocation(line: 205, column: 36, scope: !17, inlinedAt: !19) +!25 = !DILocation(line: 45, column: 54, scope: !5) +!26 = !DILocation(line: 46, column: 54, scope: !5) +!27 = !DILocation(line: 110, column: 15, scope: !17, inlinedAt: !28) +!28 = !DILocation(line: 49, column: 33, scope: !5) +!29 = !DILocation(line: 112, column: 21, scope: !17, inlinedAt: !28) +!30 = !DILocation(line: 112, column: 16, scope: !17, inlinedAt: !28) +!31 = !DILocation(line: 113, column: 29, scope: !17, inlinedAt: !28) +!32 = !DILocation(line: 123, column: 29, scope: !17, inlinedAt: !28) +!33 = !DILocation(line: 180, column: 40, scope: !17, inlinedAt: !28) +!34 = !DILocation(line: 180, column: 68, scope: !17, inlinedAt: !28) +!35 = !DILocation(line: 180, column: 58, scope: !17, inlinedAt: !28) +!36 = !DILocation(line: 173, column: 29, scope: !17, inlinedAt: !28) +!37 = !DILocation(line: 181, column: 31, scope: !17, inlinedAt: !28) +!38 = !DILocation(line: 291, column: 36, scope: !39, inlinedAt: !28) +!39 = distinct !DILexicalBlockFile(scope: !5, file: !40, discriminator: 0) +!40 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!41 = !DILocation(line: 261, column: 15, scope: !39, inlinedAt: !28) +!42 = !DILocation(line: 58, column: 52, scope: !5) +!43 = !DILocation(line: 58, column: 106, scope: !5) +!44 = !DILocation(line: 60, column: 22, scope: !5) +!45 = !DILocation(line: 61, column: 29, scope: !5) +!46 = !DILocation(line: 62, column: 23, scope: !5) +!47 = !DILocation(line: 63, column: 29, scope: !5) +!48 = !DILocation(line: 63, column: 53, scope: !5) +!49 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c985bdc3fd36cd0b5a99e0b179f1856369335be0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,4793 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<248>; + .reg .b16 %rs<130>; + .reg .b32 %r<3729>; + .reg .b64 %rd<112>; + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:18:0 + +// %bb.0: + ld.param.b64 %rd65, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; + ld.param.b64 %rd66, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:23:28 + mov.u32 %r205, %ctaid.x; + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + mov.u32 %r206, %tid.x; + and.b32 %r207, %r206, 31; + and.b32 %r208, %r206, 511; + shl.b32 %r209, %r208, 3; + or.b32 %r210, %r209, 28672; + shl.b32 %r211, %r208, 2; + or.b32 %r212, %r211, 30720; + .loc 1 33 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:33:29 + setp.lt.u32 %p8, %r210, 32000; + setp.lt.u32 %p38, %r212, 32000; + .loc 1 37 47 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:47 + mul.lo.s32 %r213, %r205, 32000; + .loc 1 37 41 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:41 + add.s32 %r214, %r209, %r213; + add.s32 %r215, %r214, 4096; + add.s32 %r216, %r214, 8192; + add.s32 %r217, %r214, 12288; + add.s32 %r218, %r214, 16384; + add.s32 %r219, %r214, 20480; + add.s32 %r220, %r214, 24576; + add.s32 %r221, %r210, %r213; + sub.s32 %r222, %r214, %r211; + .loc 1 26 37 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:26:37 + add.s32 %r223, %r213, %r211; + .loc 1 37 41 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:41 + add.s32 %r224, %r223, 2048; + sub.s32 %r225, %r215, %r211; + add.s32 %r226, %r223, 6144; + sub.s32 %r227, %r216, %r211; + add.s32 %r228, %r223, 10240; + sub.s32 %r229, %r217, %r211; + add.s32 %r230, %r223, 14336; + sub.s32 %r231, %r218, %r211; + add.s32 %r232, %r223, 18432; + sub.s32 %r233, %r219, %r211; + add.s32 %r234, %r223, 22528; + sub.s32 %r235, %r220, %r211; + add.s32 %r236, %r223, 26624; + add.s32 %r237, %r223, 28672; + add.s32 %r238, %r212, %r213; + .loc 1 37 34 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:34 + mad.wide.s32 %rd2, %r214, 2, %rd65; + mad.wide.s32 %rd5, %r215, 2, %rd65; + mad.wide.s32 %rd8, %r216, 2, %rd65; + mad.wide.s32 %rd11, %r217, 2, %rd65; + mad.wide.s32 %rd14, %r218, 2, %rd65; + mad.wide.s32 %rd17, %r219, 2, %rd65; + mad.wide.s32 %rd20, %r220, 2, %rd65; + mad.wide.s32 %rd23, %r221, 2, %rd65; + .loc 1 37 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:52 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r5, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r5; + mov.u32 %r2, %r5; + mov.u32 %r3, %r5; + mov.u32 %r4, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r1, %r2, %r3, %r4 }, [ %rd2 + 0 ], %rd3; + // end inline asm + mov.b32 {%rs1, %rs2}, %r1; + mov.b32 {%rs3, %rs4}, %r2; + mov.b32 {%rs5, %rs6}, %r3; + mov.b32 {%rs7, %rs8}, %r4; + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r9, %r5; + mov.u32 %r10, %r5; + mov.u32 %r11, %r5; + mov.u32 %r12, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r9, %r10, %r11, %r12 }, [ %rd5 + 0 ], %rd6; + // end inline asm + mov.b32 {%rs9, %rs10}, %r9; + mov.b32 {%rs11, %rs12}, %r10; + mov.b32 {%rs13, %rs14}, %r11; + mov.b32 {%rs15, %rs16}, %r12; + // begin inline asm + mov.u64 %rd9, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd9, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r17, %r5; + mov.u32 %r18, %r5; + mov.u32 %r19, %r5; + mov.u32 %r20, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r17, %r18, %r19, %r20 }, [ %rd8 + 0 ], %rd9; + // end inline asm + mov.b32 {%rs17, %rs18}, %r17; + mov.b32 {%rs19, %rs20}, %r18; + mov.b32 {%rs21, %rs22}, %r19; + mov.b32 {%rs23, %rs24}, %r20; + // begin inline asm + mov.u64 %rd12, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd12, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, %r5; + mov.u32 %r26, %r5; + mov.u32 %r27, %r5; + mov.u32 %r28, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd11 + 0 ], %rd12; + // end inline asm + mov.b32 {%rs25, %rs26}, %r25; + mov.b32 {%rs27, %rs28}, %r26; + mov.b32 {%rs29, %rs30}, %r27; + mov.b32 {%rs31, %rs32}, %r28; + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd15, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r33, %r5; + mov.u32 %r34, %r5; + mov.u32 %r35, %r5; + mov.u32 %r36, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd14 + 0 ], %rd15; + // end inline asm + mov.b32 {%rs33, %rs34}, %r33; + mov.b32 {%rs35, %rs36}, %r34; + mov.b32 {%rs37, %rs38}, %r35; + mov.b32 {%rs39, %rs40}, %r36; + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r41, %r5; + mov.u32 %r42, %r5; + mov.u32 %r43, %r5; + mov.u32 %r44, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd17 + 0 ], %rd18; + // end inline asm + mov.b32 {%rs41, %rs42}, %r41; + mov.b32 {%rs43, %rs44}, %r42; + mov.b32 {%rs45, %rs46}, %r43; + mov.b32 {%rs47, %rs48}, %r44; + // begin inline asm + mov.u64 %rd21, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd21, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r49, %r5; + mov.u32 %r50, %r5; + mov.u32 %r51, %r5; + mov.u32 %r52, %r5; + @%p1 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd20 + 0 ], %rd21; + // end inline asm + mov.b32 {%rs49, %rs50}, %r49; + mov.b32 {%rs51, %rs52}, %r50; + mov.b32 {%rs53, %rs54}, %r51; + mov.b32 {%rs55, %rs56}, %r52; + // begin inline asm + mov.u64 %rd24, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd24, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r57, %r5; + mov.u32 %r58, %r5; + mov.u32 %r59, %r5; + mov.u32 %r60, %r5; + @%p8 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd23 + 0 ], %rd24; + // end inline asm + mov.b32 {%rs57, %rs58}, %r57; + .loc 1 37 105 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:37:105 + cvt.f32.bf16 %r239, %rs1; + cvt.f32.bf16 %r240, %rs2; + cvt.f32.bf16 %r241, %rs3; + cvt.f32.bf16 %r242, %rs4; + cvt.f32.bf16 %r243, %rs5; + cvt.f32.bf16 %r244, %rs6; + cvt.f32.bf16 %r245, %rs7; + cvt.f32.bf16 %r246, %rs8; + cvt.f32.bf16 %r247, %rs9; + cvt.f32.bf16 %r248, %rs10; + cvt.f32.bf16 %r249, %rs11; + cvt.f32.bf16 %r250, %rs12; + cvt.f32.bf16 %r251, %rs13; + cvt.f32.bf16 %r252, %rs14; + cvt.f32.bf16 %r253, %rs15; + cvt.f32.bf16 %r254, %rs16; + cvt.f32.bf16 %r255, %rs17; + cvt.f32.bf16 %r256, %rs18; + cvt.f32.bf16 %r257, %rs19; + cvt.f32.bf16 %r258, %rs20; + cvt.f32.bf16 %r259, %rs21; + cvt.f32.bf16 %r260, %rs22; + cvt.f32.bf16 %r261, %rs23; + cvt.f32.bf16 %r262, %rs24; + cvt.f32.bf16 %r263, %rs25; + cvt.f32.bf16 %r264, %rs26; + cvt.f32.bf16 %r265, %rs27; + cvt.f32.bf16 %r266, %rs28; + cvt.f32.bf16 %r267, %rs29; + cvt.f32.bf16 %r268, %rs30; + cvt.f32.bf16 %r269, %rs31; + cvt.f32.bf16 %r270, %rs32; + cvt.f32.bf16 %r271, %rs33; + cvt.f32.bf16 %r272, %rs34; + cvt.f32.bf16 %r273, %rs35; + cvt.f32.bf16 %r274, %rs36; + cvt.f32.bf16 %r275, %rs37; + cvt.f32.bf16 %r276, %rs38; + cvt.f32.bf16 %r277, %rs39; + cvt.f32.bf16 %r278, %rs40; + cvt.f32.bf16 %r279, %rs41; + cvt.f32.bf16 %r280, %rs42; + cvt.f32.bf16 %r281, %rs43; + cvt.f32.bf16 %r282, %rs44; + cvt.f32.bf16 %r283, %rs45; + cvt.f32.bf16 %r284, %rs46; + cvt.f32.bf16 %r285, %rs47; + cvt.f32.bf16 %r286, %rs48; + cvt.f32.bf16 %r287, %rs49; + cvt.f32.bf16 %r288, %rs50; + cvt.f32.bf16 %r289, %rs51; + cvt.f32.bf16 %r290, %rs52; + cvt.f32.bf16 %r291, %rs53; + cvt.f32.bf16 %r292, %rs54; + cvt.f32.bf16 %r293, %rs55; + cvt.f32.bf16 %r294, %rs56; + cvt.f32.bf16 %r295, %rs57; + cvt.f32.bf16 %r296, %rs58; + mov.b32 {%rs59, %rs60}, %r58; + cvt.f32.bf16 %r297, %rs59; + cvt.f32.bf16 %r298, %rs60; + mov.b32 {%rs61, %rs62}, %r59; + cvt.f32.bf16 %r299, %rs61; + cvt.f32.bf16 %r300, %rs62; + mov.b32 {%rs63, %rs64}, %r60; + cvt.f32.bf16 %r301, %rs63; + cvt.f32.bf16 %r302, %rs64; + mov.b16 %rs65, 0xFF80; +$L__tmp1: + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.bf16 %p39, %rs1, %rs65; + setp.eq.bf16 %p40, %rs2, %rs65; + setp.eq.bf16 %p41, %rs3, %rs65; + setp.eq.bf16 %p42, %rs4, %rs65; + setp.eq.bf16 %p43, %rs5, %rs65; + setp.eq.bf16 %p44, %rs6, %rs65; + setp.eq.bf16 %p45, %rs7, %rs65; + setp.eq.bf16 %p46, %rs8, %rs65; + setp.eq.bf16 %p47, %rs9, %rs65; + setp.eq.bf16 %p48, %rs10, %rs65; + setp.eq.bf16 %p49, %rs11, %rs65; + setp.eq.bf16 %p50, %rs12, %rs65; + setp.eq.bf16 %p51, %rs13, %rs65; + setp.eq.bf16 %p52, %rs14, %rs65; + setp.eq.bf16 %p53, %rs15, %rs65; + setp.eq.bf16 %p54, %rs16, %rs65; + setp.eq.bf16 %p55, %rs17, %rs65; + setp.eq.bf16 %p56, %rs18, %rs65; + setp.eq.bf16 %p57, %rs19, %rs65; + setp.eq.bf16 %p58, %rs20, %rs65; + setp.eq.bf16 %p59, %rs21, %rs65; + setp.eq.bf16 %p60, %rs22, %rs65; + setp.eq.bf16 %p61, %rs23, %rs65; + setp.eq.bf16 %p62, %rs24, %rs65; + setp.eq.bf16 %p63, %rs25, %rs65; + setp.eq.bf16 %p64, %rs26, %rs65; + setp.eq.bf16 %p65, %rs27, %rs65; + setp.eq.bf16 %p66, %rs28, %rs65; + setp.eq.bf16 %p67, %rs29, %rs65; + setp.eq.bf16 %p68, %rs30, %rs65; + setp.eq.bf16 %p69, %rs31, %rs65; + setp.eq.bf16 %p70, %rs32, %rs65; + setp.eq.bf16 %p71, %rs33, %rs65; + setp.eq.bf16 %p72, %rs34, %rs65; + setp.eq.bf16 %p73, %rs35, %rs65; + setp.eq.bf16 %p74, %rs36, %rs65; + setp.eq.bf16 %p75, %rs37, %rs65; + setp.eq.bf16 %p76, %rs38, %rs65; + setp.eq.bf16 %p77, %rs39, %rs65; + setp.eq.bf16 %p78, %rs40, %rs65; + setp.eq.bf16 %p79, %rs41, %rs65; + setp.eq.bf16 %p80, %rs42, %rs65; + setp.eq.bf16 %p81, %rs43, %rs65; + setp.eq.bf16 %p82, %rs44, %rs65; + setp.eq.bf16 %p83, %rs45, %rs65; + setp.eq.bf16 %p84, %rs46, %rs65; + setp.eq.bf16 %p85, %rs47, %rs65; + setp.eq.bf16 %p86, %rs48, %rs65; + setp.eq.bf16 %p87, %rs49, %rs65; + setp.eq.bf16 %p88, %rs50, %rs65; + setp.eq.bf16 %p89, %rs51, %rs65; + setp.eq.bf16 %p90, %rs52, %rs65; + setp.eq.bf16 %p91, %rs53, %rs65; + setp.eq.bf16 %p92, %rs54, %rs65; + setp.eq.bf16 %p93, %rs55, %rs65; + setp.eq.bf16 %p94, %rs56, %rs65; + setp.eq.bf16 %p95, %rs57, %rs65; + setp.eq.bf16 %p96, %rs58, %rs65; + mov.b32 %r303, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r304, %r303, %r239; + sub.f32 %r305, %r303, %r240; + sub.f32 %r306, %r303, %r241; + sub.f32 %r307, %r303, %r242; + sub.f32 %r308, %r303, %r243; + sub.f32 %r309, %r303, %r244; + sub.f32 %r310, %r303, %r245; + sub.f32 %r311, %r303, %r246; + sub.f32 %r312, %r303, %r247; + sub.f32 %r313, %r303, %r248; + sub.f32 %r314, %r303, %r249; + sub.f32 %r315, %r303, %r250; + sub.f32 %r316, %r303, %r251; + sub.f32 %r317, %r303, %r252; + sub.f32 %r318, %r303, %r253; + sub.f32 %r319, %r303, %r254; + sub.f32 %r320, %r303, %r255; + sub.f32 %r321, %r303, %r256; + sub.f32 %r322, %r303, %r257; + sub.f32 %r323, %r303, %r258; + sub.f32 %r324, %r303, %r259; + sub.f32 %r325, %r303, %r260; + sub.f32 %r326, %r303, %r261; + sub.f32 %r327, %r303, %r262; + sub.f32 %r328, %r303, %r263; + sub.f32 %r329, %r303, %r264; + sub.f32 %r330, %r303, %r265; + sub.f32 %r331, %r303, %r266; + sub.f32 %r332, %r303, %r267; + sub.f32 %r333, %r303, %r268; + sub.f32 %r334, %r303, %r269; + sub.f32 %r335, %r303, %r270; + sub.f32 %r336, %r303, %r271; + sub.f32 %r337, %r303, %r272; + sub.f32 %r338, %r303, %r273; + sub.f32 %r339, %r303, %r274; + sub.f32 %r340, %r303, %r275; + sub.f32 %r341, %r303, %r276; + sub.f32 %r342, %r303, %r277; + sub.f32 %r343, %r303, %r278; + sub.f32 %r344, %r303, %r279; + sub.f32 %r345, %r303, %r280; + sub.f32 %r346, %r303, %r281; + sub.f32 %r347, %r303, %r282; + sub.f32 %r348, %r303, %r283; + sub.f32 %r349, %r303, %r284; + sub.f32 %r350, %r303, %r285; + sub.f32 %r351, %r303, %r286; + sub.f32 %r352, %r303, %r287; + sub.f32 %r353, %r303, %r288; + sub.f32 %r354, %r303, %r289; + sub.f32 %r355, %r303, %r290; + sub.f32 %r356, %r303, %r291; + sub.f32 %r357, %r303, %r292; + sub.f32 %r358, %r303, %r293; + sub.f32 %r359, %r303, %r294; + sub.f32 %r360, %r303, %r295; + sub.f32 %r361, %r303, %r296; + sub.f32 %r362, %r303, %r297; + sub.f32 %r363, %r303, %r298; + sub.f32 %r364, %r303, %r299; + sub.f32 %r365, %r303, %r300; + sub.f32 %r366, %r303, %r301; + sub.f32 %r367, %r303, %r302; + mov.b32 %r368, 0f3F000000; + mov.b32 %r369, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r370, %r304, %r369, %r368; + cvt.ftz.sat.f32.f32 %r371, %r370; + mov.b32 %r372, 0f4B400001; + mov.b32 %r373, 0f437C0000; + fma.rm.ftz.f32 %r374, %r371, %r373, %r372; + add.f32 %r375, %r374, 0fCB40007F; + neg.f32 %r376, %r375; + mov.b32 %r377, 0f3FB8AA3B; + fma.rn.ftz.f32 %r378, %r304, %r377, %r376; + mov.b32 %r379, 0f32A57060; + fma.rn.ftz.f32 %r380, %r304, %r379, %r378; + shl.b32 %r381, %r374, 23; + ex2.approx.ftz.f32 %r382, %r380; + mul.f32 %r383, %r382, %r381; + fma.rn.ftz.f32 %r384, %r305, %r369, %r368; + cvt.ftz.sat.f32.f32 %r385, %r384; + fma.rm.ftz.f32 %r386, %r385, %r373, %r372; + add.f32 %r387, %r386, 0fCB40007F; + neg.f32 %r388, %r387; + fma.rn.ftz.f32 %r389, %r305, %r377, %r388; + fma.rn.ftz.f32 %r390, %r305, %r379, %r389; + shl.b32 %r391, %r386, 23; + ex2.approx.ftz.f32 %r392, %r390; + mul.f32 %r393, %r392, %r391; + fma.rn.ftz.f32 %r394, %r306, %r369, %r368; + cvt.ftz.sat.f32.f32 %r395, %r394; + fma.rm.ftz.f32 %r396, %r395, %r373, %r372; + add.f32 %r397, %r396, 0fCB40007F; + neg.f32 %r398, %r397; + fma.rn.ftz.f32 %r399, %r306, %r377, %r398; + fma.rn.ftz.f32 %r400, %r306, %r379, %r399; + shl.b32 %r401, %r396, 23; + ex2.approx.ftz.f32 %r402, %r400; + mul.f32 %r403, %r402, %r401; + fma.rn.ftz.f32 %r404, %r307, %r369, %r368; + cvt.ftz.sat.f32.f32 %r405, %r404; + fma.rm.ftz.f32 %r406, %r405, %r373, %r372; + add.f32 %r407, %r406, 0fCB40007F; + neg.f32 %r408, %r407; + fma.rn.ftz.f32 %r409, %r307, %r377, %r408; + fma.rn.ftz.f32 %r410, %r307, %r379, %r409; + shl.b32 %r411, %r406, 23; + ex2.approx.ftz.f32 %r412, %r410; + mul.f32 %r413, %r412, %r411; + fma.rn.ftz.f32 %r414, %r308, %r369, %r368; + cvt.ftz.sat.f32.f32 %r415, %r414; + fma.rm.ftz.f32 %r416, %r415, %r373, %r372; + add.f32 %r417, %r416, 0fCB40007F; + neg.f32 %r418, %r417; + fma.rn.ftz.f32 %r419, %r308, %r377, %r418; + fma.rn.ftz.f32 %r420, %r308, %r379, %r419; + shl.b32 %r421, %r416, 23; + ex2.approx.ftz.f32 %r422, %r420; + mul.f32 %r423, %r422, %r421; + fma.rn.ftz.f32 %r424, %r309, %r369, %r368; + cvt.ftz.sat.f32.f32 %r425, %r424; + fma.rm.ftz.f32 %r426, %r425, %r373, %r372; + add.f32 %r427, %r426, 0fCB40007F; + neg.f32 %r428, %r427; + fma.rn.ftz.f32 %r429, %r309, %r377, %r428; + fma.rn.ftz.f32 %r430, %r309, %r379, %r429; + shl.b32 %r431, %r426, 23; + ex2.approx.ftz.f32 %r432, %r430; + mul.f32 %r433, %r432, %r431; + fma.rn.ftz.f32 %r434, %r310, %r369, %r368; + cvt.ftz.sat.f32.f32 %r435, %r434; + fma.rm.ftz.f32 %r436, %r435, %r373, %r372; + add.f32 %r437, %r436, 0fCB40007F; + neg.f32 %r438, %r437; + fma.rn.ftz.f32 %r439, %r310, %r377, %r438; + fma.rn.ftz.f32 %r440, %r310, %r379, %r439; + shl.b32 %r441, %r436, 23; + ex2.approx.ftz.f32 %r442, %r440; + mul.f32 %r443, %r442, %r441; + fma.rn.ftz.f32 %r444, %r311, %r369, %r368; + cvt.ftz.sat.f32.f32 %r445, %r444; + fma.rm.ftz.f32 %r446, %r445, %r373, %r372; + add.f32 %r447, %r446, 0fCB40007F; + neg.f32 %r448, %r447; + fma.rn.ftz.f32 %r449, %r311, %r377, %r448; + fma.rn.ftz.f32 %r450, %r311, %r379, %r449; + shl.b32 %r451, %r446, 23; + ex2.approx.ftz.f32 %r452, %r450; + mul.f32 %r453, %r452, %r451; + fma.rn.ftz.f32 %r454, %r312, %r369, %r368; + cvt.ftz.sat.f32.f32 %r455, %r454; + fma.rm.ftz.f32 %r456, %r455, %r373, %r372; + add.f32 %r457, %r456, 0fCB40007F; + neg.f32 %r458, %r457; + fma.rn.ftz.f32 %r459, %r312, %r377, %r458; + fma.rn.ftz.f32 %r460, %r312, %r379, %r459; + shl.b32 %r461, %r456, 23; + ex2.approx.ftz.f32 %r462, %r460; + mul.f32 %r463, %r462, %r461; + fma.rn.ftz.f32 %r464, %r313, %r369, %r368; + cvt.ftz.sat.f32.f32 %r465, %r464; + fma.rm.ftz.f32 %r466, %r465, %r373, %r372; + add.f32 %r467, %r466, 0fCB40007F; + neg.f32 %r468, %r467; + fma.rn.ftz.f32 %r469, %r313, %r377, %r468; + fma.rn.ftz.f32 %r470, %r313, %r379, %r469; + shl.b32 %r471, %r466, 23; + ex2.approx.ftz.f32 %r472, %r470; + mul.f32 %r473, %r472, %r471; + fma.rn.ftz.f32 %r474, %r314, %r369, %r368; + cvt.ftz.sat.f32.f32 %r475, %r474; + fma.rm.ftz.f32 %r476, %r475, %r373, %r372; + add.f32 %r477, %r476, 0fCB40007F; + neg.f32 %r478, %r477; + fma.rn.ftz.f32 %r479, %r314, %r377, %r478; + fma.rn.ftz.f32 %r480, %r314, %r379, %r479; + shl.b32 %r481, %r476, 23; + ex2.approx.ftz.f32 %r482, %r480; + mul.f32 %r483, %r482, %r481; + fma.rn.ftz.f32 %r484, %r315, %r369, %r368; + cvt.ftz.sat.f32.f32 %r485, %r484; + fma.rm.ftz.f32 %r486, %r485, %r373, %r372; + add.f32 %r487, %r486, 0fCB40007F; + neg.f32 %r488, %r487; + fma.rn.ftz.f32 %r489, %r315, %r377, %r488; + fma.rn.ftz.f32 %r490, %r315, %r379, %r489; + shl.b32 %r491, %r486, 23; + ex2.approx.ftz.f32 %r492, %r490; + mul.f32 %r493, %r492, %r491; + fma.rn.ftz.f32 %r494, %r316, %r369, %r368; + cvt.ftz.sat.f32.f32 %r495, %r494; + fma.rm.ftz.f32 %r496, %r495, %r373, %r372; + add.f32 %r497, %r496, 0fCB40007F; + neg.f32 %r498, %r497; + fma.rn.ftz.f32 %r499, %r316, %r377, %r498; + fma.rn.ftz.f32 %r500, %r316, %r379, %r499; + shl.b32 %r501, %r496, 23; + ex2.approx.ftz.f32 %r502, %r500; + mul.f32 %r503, %r502, %r501; + fma.rn.ftz.f32 %r504, %r317, %r369, %r368; + cvt.ftz.sat.f32.f32 %r505, %r504; + fma.rm.ftz.f32 %r506, %r505, %r373, %r372; + add.f32 %r507, %r506, 0fCB40007F; + neg.f32 %r508, %r507; + fma.rn.ftz.f32 %r509, %r317, %r377, %r508; + fma.rn.ftz.f32 %r510, %r317, %r379, %r509; + shl.b32 %r511, %r506, 23; + ex2.approx.ftz.f32 %r512, %r510; + mul.f32 %r513, %r512, %r511; + fma.rn.ftz.f32 %r514, %r318, %r369, %r368; + cvt.ftz.sat.f32.f32 %r515, %r514; + fma.rm.ftz.f32 %r516, %r515, %r373, %r372; + add.f32 %r517, %r516, 0fCB40007F; + neg.f32 %r518, %r517; + fma.rn.ftz.f32 %r519, %r318, %r377, %r518; + fma.rn.ftz.f32 %r520, %r318, %r379, %r519; + shl.b32 %r521, %r516, 23; + ex2.approx.ftz.f32 %r522, %r520; + mul.f32 %r523, %r522, %r521; + fma.rn.ftz.f32 %r524, %r319, %r369, %r368; + cvt.ftz.sat.f32.f32 %r525, %r524; + fma.rm.ftz.f32 %r526, %r525, %r373, %r372; + add.f32 %r527, %r526, 0fCB40007F; + neg.f32 %r528, %r527; + fma.rn.ftz.f32 %r529, %r319, %r377, %r528; + fma.rn.ftz.f32 %r530, %r319, %r379, %r529; + shl.b32 %r531, %r526, 23; + ex2.approx.ftz.f32 %r532, %r530; + mul.f32 %r533, %r532, %r531; + fma.rn.ftz.f32 %r534, %r320, %r369, %r368; + cvt.ftz.sat.f32.f32 %r535, %r534; + fma.rm.ftz.f32 %r536, %r535, %r373, %r372; + add.f32 %r537, %r536, 0fCB40007F; + neg.f32 %r538, %r537; + fma.rn.ftz.f32 %r539, %r320, %r377, %r538; + fma.rn.ftz.f32 %r540, %r320, %r379, %r539; + shl.b32 %r541, %r536, 23; + ex2.approx.ftz.f32 %r542, %r540; + mul.f32 %r543, %r542, %r541; + fma.rn.ftz.f32 %r544, %r321, %r369, %r368; + cvt.ftz.sat.f32.f32 %r545, %r544; + fma.rm.ftz.f32 %r546, %r545, %r373, %r372; + add.f32 %r547, %r546, 0fCB40007F; + neg.f32 %r548, %r547; + fma.rn.ftz.f32 %r549, %r321, %r377, %r548; + fma.rn.ftz.f32 %r550, %r321, %r379, %r549; + shl.b32 %r551, %r546, 23; + ex2.approx.ftz.f32 %r552, %r550; + mul.f32 %r553, %r552, %r551; + fma.rn.ftz.f32 %r554, %r322, %r369, %r368; + cvt.ftz.sat.f32.f32 %r555, %r554; + fma.rm.ftz.f32 %r556, %r555, %r373, %r372; + add.f32 %r557, %r556, 0fCB40007F; + neg.f32 %r558, %r557; + fma.rn.ftz.f32 %r559, %r322, %r377, %r558; + fma.rn.ftz.f32 %r560, %r322, %r379, %r559; + shl.b32 %r561, %r556, 23; + ex2.approx.ftz.f32 %r562, %r560; + mul.f32 %r563, %r562, %r561; + fma.rn.ftz.f32 %r564, %r323, %r369, %r368; + cvt.ftz.sat.f32.f32 %r565, %r564; + fma.rm.ftz.f32 %r566, %r565, %r373, %r372; + add.f32 %r567, %r566, 0fCB40007F; + neg.f32 %r568, %r567; + fma.rn.ftz.f32 %r569, %r323, %r377, %r568; + fma.rn.ftz.f32 %r570, %r323, %r379, %r569; + shl.b32 %r571, %r566, 23; + ex2.approx.ftz.f32 %r572, %r570; + mul.f32 %r573, %r572, %r571; + fma.rn.ftz.f32 %r574, %r324, %r369, %r368; + cvt.ftz.sat.f32.f32 %r575, %r574; + fma.rm.ftz.f32 %r576, %r575, %r373, %r372; + add.f32 %r577, %r576, 0fCB40007F; + neg.f32 %r578, %r577; + fma.rn.ftz.f32 %r579, %r324, %r377, %r578; + fma.rn.ftz.f32 %r580, %r324, %r379, %r579; + shl.b32 %r581, %r576, 23; + ex2.approx.ftz.f32 %r582, %r580; + mul.f32 %r583, %r582, %r581; + fma.rn.ftz.f32 %r584, %r325, %r369, %r368; + cvt.ftz.sat.f32.f32 %r585, %r584; + fma.rm.ftz.f32 %r586, %r585, %r373, %r372; + add.f32 %r587, %r586, 0fCB40007F; + neg.f32 %r588, %r587; + fma.rn.ftz.f32 %r589, %r325, %r377, %r588; + fma.rn.ftz.f32 %r590, %r325, %r379, %r589; + shl.b32 %r591, %r586, 23; + ex2.approx.ftz.f32 %r592, %r590; + mul.f32 %r593, %r592, %r591; + fma.rn.ftz.f32 %r594, %r326, %r369, %r368; + cvt.ftz.sat.f32.f32 %r595, %r594; + fma.rm.ftz.f32 %r596, %r595, %r373, %r372; + add.f32 %r597, %r596, 0fCB40007F; + neg.f32 %r598, %r597; + fma.rn.ftz.f32 %r599, %r326, %r377, %r598; + fma.rn.ftz.f32 %r600, %r326, %r379, %r599; + shl.b32 %r601, %r596, 23; + ex2.approx.ftz.f32 %r602, %r600; + mul.f32 %r603, %r602, %r601; + fma.rn.ftz.f32 %r604, %r327, %r369, %r368; + cvt.ftz.sat.f32.f32 %r605, %r604; + fma.rm.ftz.f32 %r606, %r605, %r373, %r372; + add.f32 %r607, %r606, 0fCB40007F; + neg.f32 %r608, %r607; + fma.rn.ftz.f32 %r609, %r327, %r377, %r608; + fma.rn.ftz.f32 %r610, %r327, %r379, %r609; + shl.b32 %r611, %r606, 23; + ex2.approx.ftz.f32 %r612, %r610; + mul.f32 %r613, %r612, %r611; + fma.rn.ftz.f32 %r614, %r328, %r369, %r368; + cvt.ftz.sat.f32.f32 %r615, %r614; + fma.rm.ftz.f32 %r616, %r615, %r373, %r372; + add.f32 %r617, %r616, 0fCB40007F; + neg.f32 %r618, %r617; + fma.rn.ftz.f32 %r619, %r328, %r377, %r618; + fma.rn.ftz.f32 %r620, %r328, %r379, %r619; + shl.b32 %r621, %r616, 23; + ex2.approx.ftz.f32 %r622, %r620; + mul.f32 %r623, %r622, %r621; + fma.rn.ftz.f32 %r624, %r329, %r369, %r368; + cvt.ftz.sat.f32.f32 %r625, %r624; + fma.rm.ftz.f32 %r626, %r625, %r373, %r372; + add.f32 %r627, %r626, 0fCB40007F; + neg.f32 %r628, %r627; + fma.rn.ftz.f32 %r629, %r329, %r377, %r628; + fma.rn.ftz.f32 %r630, %r329, %r379, %r629; + shl.b32 %r631, %r626, 23; + ex2.approx.ftz.f32 %r632, %r630; + mul.f32 %r633, %r632, %r631; + fma.rn.ftz.f32 %r634, %r330, %r369, %r368; + cvt.ftz.sat.f32.f32 %r635, %r634; + fma.rm.ftz.f32 %r636, %r635, %r373, %r372; + add.f32 %r637, %r636, 0fCB40007F; + neg.f32 %r638, %r637; + fma.rn.ftz.f32 %r639, %r330, %r377, %r638; + fma.rn.ftz.f32 %r640, %r330, %r379, %r639; + shl.b32 %r641, %r636, 23; + ex2.approx.ftz.f32 %r642, %r640; + mul.f32 %r643, %r642, %r641; + fma.rn.ftz.f32 %r644, %r331, %r369, %r368; + cvt.ftz.sat.f32.f32 %r645, %r644; + fma.rm.ftz.f32 %r646, %r645, %r373, %r372; + add.f32 %r647, %r646, 0fCB40007F; + neg.f32 %r648, %r647; + fma.rn.ftz.f32 %r649, %r331, %r377, %r648; + fma.rn.ftz.f32 %r650, %r331, %r379, %r649; + shl.b32 %r651, %r646, 23; + ex2.approx.ftz.f32 %r652, %r650; + mul.f32 %r653, %r652, %r651; + fma.rn.ftz.f32 %r654, %r332, %r369, %r368; + cvt.ftz.sat.f32.f32 %r655, %r654; + fma.rm.ftz.f32 %r656, %r655, %r373, %r372; + add.f32 %r657, %r656, 0fCB40007F; + neg.f32 %r658, %r657; + fma.rn.ftz.f32 %r659, %r332, %r377, %r658; + fma.rn.ftz.f32 %r660, %r332, %r379, %r659; + shl.b32 %r661, %r656, 23; + ex2.approx.ftz.f32 %r662, %r660; + mul.f32 %r663, %r662, %r661; + fma.rn.ftz.f32 %r664, %r333, %r369, %r368; + cvt.ftz.sat.f32.f32 %r665, %r664; + fma.rm.ftz.f32 %r666, %r665, %r373, %r372; + add.f32 %r667, %r666, 0fCB40007F; + neg.f32 %r668, %r667; + fma.rn.ftz.f32 %r669, %r333, %r377, %r668; + fma.rn.ftz.f32 %r670, %r333, %r379, %r669; + shl.b32 %r671, %r666, 23; + ex2.approx.ftz.f32 %r672, %r670; + mul.f32 %r673, %r672, %r671; + fma.rn.ftz.f32 %r674, %r334, %r369, %r368; + cvt.ftz.sat.f32.f32 %r675, %r674; + fma.rm.ftz.f32 %r676, %r675, %r373, %r372; + add.f32 %r677, %r676, 0fCB40007F; + neg.f32 %r678, %r677; + fma.rn.ftz.f32 %r679, %r334, %r377, %r678; + fma.rn.ftz.f32 %r680, %r334, %r379, %r679; + shl.b32 %r681, %r676, 23; + ex2.approx.ftz.f32 %r682, %r680; + mul.f32 %r683, %r682, %r681; + fma.rn.ftz.f32 %r684, %r335, %r369, %r368; + cvt.ftz.sat.f32.f32 %r685, %r684; + fma.rm.ftz.f32 %r686, %r685, %r373, %r372; + add.f32 %r687, %r686, 0fCB40007F; + neg.f32 %r688, %r687; + fma.rn.ftz.f32 %r689, %r335, %r377, %r688; + fma.rn.ftz.f32 %r690, %r335, %r379, %r689; + shl.b32 %r691, %r686, 23; + ex2.approx.ftz.f32 %r692, %r690; + mul.f32 %r693, %r692, %r691; + fma.rn.ftz.f32 %r694, %r336, %r369, %r368; + cvt.ftz.sat.f32.f32 %r695, %r694; + fma.rm.ftz.f32 %r696, %r695, %r373, %r372; + add.f32 %r697, %r696, 0fCB40007F; + neg.f32 %r698, %r697; + fma.rn.ftz.f32 %r699, %r336, %r377, %r698; + fma.rn.ftz.f32 %r700, %r336, %r379, %r699; + shl.b32 %r701, %r696, 23; + ex2.approx.ftz.f32 %r702, %r700; + mul.f32 %r703, %r702, %r701; + fma.rn.ftz.f32 %r704, %r337, %r369, %r368; + cvt.ftz.sat.f32.f32 %r705, %r704; + fma.rm.ftz.f32 %r706, %r705, %r373, %r372; + add.f32 %r707, %r706, 0fCB40007F; + neg.f32 %r708, %r707; + fma.rn.ftz.f32 %r709, %r337, %r377, %r708; + fma.rn.ftz.f32 %r710, %r337, %r379, %r709; + shl.b32 %r711, %r706, 23; + ex2.approx.ftz.f32 %r712, %r710; + mul.f32 %r713, %r712, %r711; + fma.rn.ftz.f32 %r714, %r338, %r369, %r368; + cvt.ftz.sat.f32.f32 %r715, %r714; + fma.rm.ftz.f32 %r716, %r715, %r373, %r372; + add.f32 %r717, %r716, 0fCB40007F; + neg.f32 %r718, %r717; + fma.rn.ftz.f32 %r719, %r338, %r377, %r718; + fma.rn.ftz.f32 %r720, %r338, %r379, %r719; + shl.b32 %r721, %r716, 23; + ex2.approx.ftz.f32 %r722, %r720; + mul.f32 %r723, %r722, %r721; + fma.rn.ftz.f32 %r724, %r339, %r369, %r368; + cvt.ftz.sat.f32.f32 %r725, %r724; + fma.rm.ftz.f32 %r726, %r725, %r373, %r372; + add.f32 %r727, %r726, 0fCB40007F; + neg.f32 %r728, %r727; + fma.rn.ftz.f32 %r729, %r339, %r377, %r728; + fma.rn.ftz.f32 %r730, %r339, %r379, %r729; + shl.b32 %r731, %r726, 23; + ex2.approx.ftz.f32 %r732, %r730; + mul.f32 %r733, %r732, %r731; + fma.rn.ftz.f32 %r734, %r340, %r369, %r368; + cvt.ftz.sat.f32.f32 %r735, %r734; + fma.rm.ftz.f32 %r736, %r735, %r373, %r372; + add.f32 %r737, %r736, 0fCB40007F; + neg.f32 %r738, %r737; + fma.rn.ftz.f32 %r739, %r340, %r377, %r738; + fma.rn.ftz.f32 %r740, %r340, %r379, %r739; + shl.b32 %r741, %r736, 23; + ex2.approx.ftz.f32 %r742, %r740; + mul.f32 %r743, %r742, %r741; + fma.rn.ftz.f32 %r744, %r341, %r369, %r368; + cvt.ftz.sat.f32.f32 %r745, %r744; + fma.rm.ftz.f32 %r746, %r745, %r373, %r372; + add.f32 %r747, %r746, 0fCB40007F; + neg.f32 %r748, %r747; + fma.rn.ftz.f32 %r749, %r341, %r377, %r748; + fma.rn.ftz.f32 %r750, %r341, %r379, %r749; + shl.b32 %r751, %r746, 23; + ex2.approx.ftz.f32 %r752, %r750; + mul.f32 %r753, %r752, %r751; + fma.rn.ftz.f32 %r754, %r342, %r369, %r368; + cvt.ftz.sat.f32.f32 %r755, %r754; + fma.rm.ftz.f32 %r756, %r755, %r373, %r372; + add.f32 %r757, %r756, 0fCB40007F; + neg.f32 %r758, %r757; + fma.rn.ftz.f32 %r759, %r342, %r377, %r758; + fma.rn.ftz.f32 %r760, %r342, %r379, %r759; + shl.b32 %r761, %r756, 23; + ex2.approx.ftz.f32 %r762, %r760; + mul.f32 %r763, %r762, %r761; + fma.rn.ftz.f32 %r764, %r343, %r369, %r368; + cvt.ftz.sat.f32.f32 %r765, %r764; + fma.rm.ftz.f32 %r766, %r765, %r373, %r372; + add.f32 %r767, %r766, 0fCB40007F; + neg.f32 %r768, %r767; + fma.rn.ftz.f32 %r769, %r343, %r377, %r768; + fma.rn.ftz.f32 %r770, %r343, %r379, %r769; + shl.b32 %r771, %r766, 23; + ex2.approx.ftz.f32 %r772, %r770; + mul.f32 %r773, %r772, %r771; + fma.rn.ftz.f32 %r774, %r344, %r369, %r368; + cvt.ftz.sat.f32.f32 %r775, %r774; + fma.rm.ftz.f32 %r776, %r775, %r373, %r372; + add.f32 %r777, %r776, 0fCB40007F; + neg.f32 %r778, %r777; + fma.rn.ftz.f32 %r779, %r344, %r377, %r778; + fma.rn.ftz.f32 %r780, %r344, %r379, %r779; + shl.b32 %r781, %r776, 23; + ex2.approx.ftz.f32 %r782, %r780; + mul.f32 %r783, %r782, %r781; + fma.rn.ftz.f32 %r784, %r345, %r369, %r368; + cvt.ftz.sat.f32.f32 %r785, %r784; + fma.rm.ftz.f32 %r786, %r785, %r373, %r372; + add.f32 %r787, %r786, 0fCB40007F; + neg.f32 %r788, %r787; + fma.rn.ftz.f32 %r789, %r345, %r377, %r788; + fma.rn.ftz.f32 %r790, %r345, %r379, %r789; + shl.b32 %r791, %r786, 23; + ex2.approx.ftz.f32 %r792, %r790; + mul.f32 %r793, %r792, %r791; + fma.rn.ftz.f32 %r794, %r346, %r369, %r368; + cvt.ftz.sat.f32.f32 %r795, %r794; + fma.rm.ftz.f32 %r796, %r795, %r373, %r372; + add.f32 %r797, %r796, 0fCB40007F; + neg.f32 %r798, %r797; + fma.rn.ftz.f32 %r799, %r346, %r377, %r798; + fma.rn.ftz.f32 %r800, %r346, %r379, %r799; + shl.b32 %r801, %r796, 23; + ex2.approx.ftz.f32 %r802, %r800; + mul.f32 %r803, %r802, %r801; + fma.rn.ftz.f32 %r804, %r347, %r369, %r368; + cvt.ftz.sat.f32.f32 %r805, %r804; + fma.rm.ftz.f32 %r806, %r805, %r373, %r372; + add.f32 %r807, %r806, 0fCB40007F; + neg.f32 %r808, %r807; + fma.rn.ftz.f32 %r809, %r347, %r377, %r808; + fma.rn.ftz.f32 %r810, %r347, %r379, %r809; + shl.b32 %r811, %r806, 23; + ex2.approx.ftz.f32 %r812, %r810; + mul.f32 %r813, %r812, %r811; + fma.rn.ftz.f32 %r814, %r348, %r369, %r368; + cvt.ftz.sat.f32.f32 %r815, %r814; + fma.rm.ftz.f32 %r816, %r815, %r373, %r372; + add.f32 %r817, %r816, 0fCB40007F; + neg.f32 %r818, %r817; + fma.rn.ftz.f32 %r819, %r348, %r377, %r818; + fma.rn.ftz.f32 %r820, %r348, %r379, %r819; + shl.b32 %r821, %r816, 23; + ex2.approx.ftz.f32 %r822, %r820; + mul.f32 %r823, %r822, %r821; + fma.rn.ftz.f32 %r824, %r349, %r369, %r368; + cvt.ftz.sat.f32.f32 %r825, %r824; + fma.rm.ftz.f32 %r826, %r825, %r373, %r372; + add.f32 %r827, %r826, 0fCB40007F; + neg.f32 %r828, %r827; + fma.rn.ftz.f32 %r829, %r349, %r377, %r828; + fma.rn.ftz.f32 %r830, %r349, %r379, %r829; + shl.b32 %r831, %r826, 23; + ex2.approx.ftz.f32 %r832, %r830; + mul.f32 %r833, %r832, %r831; + fma.rn.ftz.f32 %r834, %r350, %r369, %r368; + cvt.ftz.sat.f32.f32 %r835, %r834; + fma.rm.ftz.f32 %r836, %r835, %r373, %r372; + add.f32 %r837, %r836, 0fCB40007F; + neg.f32 %r838, %r837; + fma.rn.ftz.f32 %r839, %r350, %r377, %r838; + fma.rn.ftz.f32 %r840, %r350, %r379, %r839; + shl.b32 %r841, %r836, 23; + ex2.approx.ftz.f32 %r842, %r840; + mul.f32 %r843, %r842, %r841; + fma.rn.ftz.f32 %r844, %r351, %r369, %r368; + cvt.ftz.sat.f32.f32 %r845, %r844; + fma.rm.ftz.f32 %r846, %r845, %r373, %r372; + add.f32 %r847, %r846, 0fCB40007F; + neg.f32 %r848, %r847; + fma.rn.ftz.f32 %r849, %r351, %r377, %r848; + fma.rn.ftz.f32 %r850, %r351, %r379, %r849; + shl.b32 %r851, %r846, 23; + ex2.approx.ftz.f32 %r852, %r850; + mul.f32 %r853, %r852, %r851; + fma.rn.ftz.f32 %r854, %r352, %r369, %r368; + cvt.ftz.sat.f32.f32 %r855, %r854; + fma.rm.ftz.f32 %r856, %r855, %r373, %r372; + add.f32 %r857, %r856, 0fCB40007F; + neg.f32 %r858, %r857; + fma.rn.ftz.f32 %r859, %r352, %r377, %r858; + fma.rn.ftz.f32 %r860, %r352, %r379, %r859; + shl.b32 %r861, %r856, 23; + ex2.approx.ftz.f32 %r862, %r860; + mul.f32 %r863, %r862, %r861; + fma.rn.ftz.f32 %r864, %r353, %r369, %r368; + cvt.ftz.sat.f32.f32 %r865, %r864; + fma.rm.ftz.f32 %r866, %r865, %r373, %r372; + add.f32 %r867, %r866, 0fCB40007F; + neg.f32 %r868, %r867; + fma.rn.ftz.f32 %r869, %r353, %r377, %r868; + fma.rn.ftz.f32 %r870, %r353, %r379, %r869; + shl.b32 %r871, %r866, 23; + ex2.approx.ftz.f32 %r872, %r870; + mul.f32 %r873, %r872, %r871; + fma.rn.ftz.f32 %r874, %r354, %r369, %r368; + cvt.ftz.sat.f32.f32 %r875, %r874; + fma.rm.ftz.f32 %r876, %r875, %r373, %r372; + add.f32 %r877, %r876, 0fCB40007F; + neg.f32 %r878, %r877; + fma.rn.ftz.f32 %r879, %r354, %r377, %r878; + fma.rn.ftz.f32 %r880, %r354, %r379, %r879; + shl.b32 %r881, %r876, 23; + ex2.approx.ftz.f32 %r882, %r880; + mul.f32 %r883, %r882, %r881; + fma.rn.ftz.f32 %r884, %r355, %r369, %r368; + cvt.ftz.sat.f32.f32 %r885, %r884; + fma.rm.ftz.f32 %r886, %r885, %r373, %r372; + add.f32 %r887, %r886, 0fCB40007F; + neg.f32 %r888, %r887; + fma.rn.ftz.f32 %r889, %r355, %r377, %r888; + fma.rn.ftz.f32 %r890, %r355, %r379, %r889; + shl.b32 %r891, %r886, 23; + ex2.approx.ftz.f32 %r892, %r890; + mul.f32 %r893, %r892, %r891; + fma.rn.ftz.f32 %r894, %r356, %r369, %r368; + cvt.ftz.sat.f32.f32 %r895, %r894; + fma.rm.ftz.f32 %r896, %r895, %r373, %r372; + add.f32 %r897, %r896, 0fCB40007F; + neg.f32 %r898, %r897; + fma.rn.ftz.f32 %r899, %r356, %r377, %r898; + fma.rn.ftz.f32 %r900, %r356, %r379, %r899; + shl.b32 %r901, %r896, 23; + ex2.approx.ftz.f32 %r902, %r900; + mul.f32 %r903, %r902, %r901; + fma.rn.ftz.f32 %r904, %r357, %r369, %r368; + cvt.ftz.sat.f32.f32 %r905, %r904; + fma.rm.ftz.f32 %r906, %r905, %r373, %r372; + add.f32 %r907, %r906, 0fCB40007F; + neg.f32 %r908, %r907; + fma.rn.ftz.f32 %r909, %r357, %r377, %r908; + fma.rn.ftz.f32 %r910, %r357, %r379, %r909; + shl.b32 %r911, %r906, 23; + ex2.approx.ftz.f32 %r912, %r910; + mul.f32 %r913, %r912, %r911; + fma.rn.ftz.f32 %r914, %r358, %r369, %r368; + cvt.ftz.sat.f32.f32 %r915, %r914; + fma.rm.ftz.f32 %r916, %r915, %r373, %r372; + add.f32 %r917, %r916, 0fCB40007F; + neg.f32 %r918, %r917; + fma.rn.ftz.f32 %r919, %r358, %r377, %r918; + fma.rn.ftz.f32 %r920, %r358, %r379, %r919; + shl.b32 %r921, %r916, 23; + ex2.approx.ftz.f32 %r922, %r920; + mul.f32 %r923, %r922, %r921; + fma.rn.ftz.f32 %r924, %r359, %r369, %r368; + cvt.ftz.sat.f32.f32 %r925, %r924; + fma.rm.ftz.f32 %r926, %r925, %r373, %r372; + add.f32 %r927, %r926, 0fCB40007F; + neg.f32 %r928, %r927; + fma.rn.ftz.f32 %r929, %r359, %r377, %r928; + fma.rn.ftz.f32 %r930, %r359, %r379, %r929; + shl.b32 %r931, %r926, 23; + ex2.approx.ftz.f32 %r932, %r930; + mul.f32 %r933, %r932, %r931; + fma.rn.ftz.f32 %r934, %r360, %r369, %r368; + cvt.ftz.sat.f32.f32 %r935, %r934; + fma.rm.ftz.f32 %r936, %r935, %r373, %r372; + add.f32 %r937, %r936, 0fCB40007F; + neg.f32 %r938, %r937; + fma.rn.ftz.f32 %r939, %r360, %r377, %r938; + fma.rn.ftz.f32 %r940, %r360, %r379, %r939; + shl.b32 %r941, %r936, 23; + ex2.approx.ftz.f32 %r942, %r940; + mul.f32 %r943, %r942, %r941; + fma.rn.ftz.f32 %r944, %r361, %r369, %r368; + cvt.ftz.sat.f32.f32 %r945, %r944; + fma.rm.ftz.f32 %r946, %r945, %r373, %r372; + add.f32 %r947, %r946, 0fCB40007F; + neg.f32 %r948, %r947; + fma.rn.ftz.f32 %r949, %r361, %r377, %r948; + fma.rn.ftz.f32 %r950, %r361, %r379, %r949; + shl.b32 %r951, %r946, 23; + ex2.approx.ftz.f32 %r952, %r950; + mul.f32 %r953, %r952, %r951; + fma.rn.ftz.f32 %r954, %r362, %r369, %r368; + cvt.ftz.sat.f32.f32 %r955, %r954; + fma.rm.ftz.f32 %r956, %r955, %r373, %r372; + fma.rn.ftz.f32 %r957, %r363, %r369, %r368; + cvt.ftz.sat.f32.f32 %r958, %r957; + fma.rm.ftz.f32 %r959, %r958, %r373, %r372; + fma.rn.ftz.f32 %r960, %r364, %r369, %r368; + cvt.ftz.sat.f32.f32 %r961, %r960; + fma.rm.ftz.f32 %r962, %r961, %r373, %r372; + fma.rn.ftz.f32 %r963, %r365, %r369, %r368; + cvt.ftz.sat.f32.f32 %r964, %r963; + fma.rm.ftz.f32 %r965, %r964, %r373, %r372; + fma.rn.ftz.f32 %r966, %r366, %r369, %r368; + cvt.ftz.sat.f32.f32 %r967, %r966; + fma.rm.ftz.f32 %r968, %r967, %r373, %r372; + fma.rn.ftz.f32 %r969, %r367, %r369, %r368; + cvt.ftz.sat.f32.f32 %r970, %r969; + fma.rm.ftz.f32 %r971, %r970, %r373, %r372; + .loc 2 199 53 // triton_helpers.py:199:53 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + sub.f32 %r972, %r239, %r239; + sub.f32 %r973, %r240, %r240; + sub.f32 %r974, %r241, %r241; + sub.f32 %r975, %r242, %r242; + sub.f32 %r976, %r243, %r243; + sub.f32 %r977, %r244, %r244; + sub.f32 %r978, %r245, %r245; + sub.f32 %r979, %r246, %r246; + sub.f32 %r980, %r247, %r247; + sub.f32 %r981, %r248, %r248; + sub.f32 %r982, %r249, %r249; + sub.f32 %r983, %r250, %r250; + sub.f32 %r984, %r251, %r251; + sub.f32 %r985, %r252, %r252; + sub.f32 %r986, %r253, %r253; + sub.f32 %r987, %r254, %r254; + sub.f32 %r988, %r255, %r255; + sub.f32 %r989, %r256, %r256; + sub.f32 %r990, %r257, %r257; + sub.f32 %r991, %r258, %r258; + sub.f32 %r992, %r259, %r259; + sub.f32 %r993, %r260, %r260; + sub.f32 %r994, %r261, %r261; + sub.f32 %r995, %r262, %r262; + sub.f32 %r996, %r263, %r263; + sub.f32 %r997, %r264, %r264; + sub.f32 %r998, %r265, %r265; + sub.f32 %r999, %r266, %r266; + sub.f32 %r1000, %r267, %r267; + sub.f32 %r1001, %r268, %r268; + sub.f32 %r1002, %r269, %r269; + sub.f32 %r1003, %r270, %r270; + sub.f32 %r1004, %r271, %r271; + sub.f32 %r1005, %r272, %r272; + sub.f32 %r1006, %r273, %r273; + sub.f32 %r1007, %r274, %r274; + sub.f32 %r1008, %r275, %r275; + sub.f32 %r1009, %r276, %r276; + sub.f32 %r1010, %r277, %r277; + sub.f32 %r1011, %r278, %r278; + sub.f32 %r1012, %r279, %r279; + sub.f32 %r1013, %r280, %r280; + sub.f32 %r1014, %r281, %r281; + sub.f32 %r1015, %r282, %r282; + sub.f32 %r1016, %r283, %r283; + sub.f32 %r1017, %r284, %r284; + sub.f32 %r1018, %r285, %r285; + sub.f32 %r1019, %r286, %r286; + sub.f32 %r1020, %r287, %r287; + sub.f32 %r1021, %r288, %r288; + sub.f32 %r1022, %r289, %r289; + sub.f32 %r1023, %r290, %r290; + sub.f32 %r1024, %r291, %r291; + sub.f32 %r1025, %r292, %r292; + sub.f32 %r1026, %r293, %r293; + sub.f32 %r1027, %r294, %r294; + sub.f32 %r1028, %r295, %r295; + sub.f32 %r1029, %r296, %r296; + sub.f32 %r1030, %r297, %r297; + sub.f32 %r1031, %r298, %r298; + sub.f32 %r1032, %r299, %r299; + sub.f32 %r1033, %r300, %r300; + sub.f32 %r1034, %r301, %r301; + sub.f32 %r1035, %r302, %r302; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.ftz.f32 %r1036, %r972, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1037, %r1036; + fma.rm.ftz.f32 %r1038, %r1037, %r373, %r372; + add.f32 %r1039, %r1038, 0fCB40007F; + neg.f32 %r1040, %r1039; + fma.rn.ftz.f32 %r1041, %r972, %r377, %r1040; + fma.rn.ftz.f32 %r1042, %r972, %r379, %r1041; + shl.b32 %r1043, %r1038, 23; + ex2.approx.ftz.f32 %r1044, %r1042; + mul.f32 %r1045, %r1044, %r1043; + fma.rn.ftz.f32 %r1046, %r973, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1047, %r1046; + fma.rm.ftz.f32 %r1048, %r1047, %r373, %r372; + add.f32 %r1049, %r1048, 0fCB40007F; + neg.f32 %r1050, %r1049; + fma.rn.ftz.f32 %r1051, %r973, %r377, %r1050; + fma.rn.ftz.f32 %r1052, %r973, %r379, %r1051; + shl.b32 %r1053, %r1048, 23; + ex2.approx.ftz.f32 %r1054, %r1052; + mul.f32 %r1055, %r1054, %r1053; + fma.rn.ftz.f32 %r1056, %r974, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1057, %r1056; + fma.rm.ftz.f32 %r1058, %r1057, %r373, %r372; + add.f32 %r1059, %r1058, 0fCB40007F; + neg.f32 %r1060, %r1059; + fma.rn.ftz.f32 %r1061, %r974, %r377, %r1060; + fma.rn.ftz.f32 %r1062, %r974, %r379, %r1061; + shl.b32 %r1063, %r1058, 23; + ex2.approx.ftz.f32 %r1064, %r1062; + mul.f32 %r1065, %r1064, %r1063; + fma.rn.ftz.f32 %r1066, %r975, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1067, %r1066; + fma.rm.ftz.f32 %r1068, %r1067, %r373, %r372; + add.f32 %r1069, %r1068, 0fCB40007F; + neg.f32 %r1070, %r1069; + fma.rn.ftz.f32 %r1071, %r975, %r377, %r1070; + fma.rn.ftz.f32 %r1072, %r975, %r379, %r1071; + shl.b32 %r1073, %r1068, 23; + ex2.approx.ftz.f32 %r1074, %r1072; + mul.f32 %r1075, %r1074, %r1073; + fma.rn.ftz.f32 %r1076, %r976, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1077, %r1076; + fma.rm.ftz.f32 %r1078, %r1077, %r373, %r372; + add.f32 %r1079, %r1078, 0fCB40007F; + neg.f32 %r1080, %r1079; + fma.rn.ftz.f32 %r1081, %r976, %r377, %r1080; + fma.rn.ftz.f32 %r1082, %r976, %r379, %r1081; + shl.b32 %r1083, %r1078, 23; + ex2.approx.ftz.f32 %r1084, %r1082; + mul.f32 %r1085, %r1084, %r1083; + fma.rn.ftz.f32 %r1086, %r977, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1087, %r1086; + fma.rm.ftz.f32 %r1088, %r1087, %r373, %r372; + add.f32 %r1089, %r1088, 0fCB40007F; + neg.f32 %r1090, %r1089; + fma.rn.ftz.f32 %r1091, %r977, %r377, %r1090; + fma.rn.ftz.f32 %r1092, %r977, %r379, %r1091; + shl.b32 %r1093, %r1088, 23; + ex2.approx.ftz.f32 %r1094, %r1092; + mul.f32 %r1095, %r1094, %r1093; + fma.rn.ftz.f32 %r1096, %r978, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1097, %r1096; + fma.rm.ftz.f32 %r1098, %r1097, %r373, %r372; + add.f32 %r1099, %r1098, 0fCB40007F; + neg.f32 %r1100, %r1099; + fma.rn.ftz.f32 %r1101, %r978, %r377, %r1100; + fma.rn.ftz.f32 %r1102, %r978, %r379, %r1101; + shl.b32 %r1103, %r1098, 23; + ex2.approx.ftz.f32 %r1104, %r1102; + mul.f32 %r1105, %r1104, %r1103; + fma.rn.ftz.f32 %r1106, %r979, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1107, %r1106; + fma.rm.ftz.f32 %r1108, %r1107, %r373, %r372; + add.f32 %r1109, %r1108, 0fCB40007F; + neg.f32 %r1110, %r1109; + fma.rn.ftz.f32 %r1111, %r979, %r377, %r1110; + fma.rn.ftz.f32 %r1112, %r979, %r379, %r1111; + shl.b32 %r1113, %r1108, 23; + ex2.approx.ftz.f32 %r1114, %r1112; + mul.f32 %r1115, %r1114, %r1113; + fma.rn.ftz.f32 %r1116, %r980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1117, %r1116; + fma.rm.ftz.f32 %r1118, %r1117, %r373, %r372; + add.f32 %r1119, %r1118, 0fCB40007F; + neg.f32 %r1120, %r1119; + fma.rn.ftz.f32 %r1121, %r980, %r377, %r1120; + fma.rn.ftz.f32 %r1122, %r980, %r379, %r1121; + shl.b32 %r1123, %r1118, 23; + ex2.approx.ftz.f32 %r1124, %r1122; + mul.f32 %r1125, %r1124, %r1123; + fma.rn.ftz.f32 %r1126, %r981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1127, %r1126; + fma.rm.ftz.f32 %r1128, %r1127, %r373, %r372; + add.f32 %r1129, %r1128, 0fCB40007F; + neg.f32 %r1130, %r1129; + fma.rn.ftz.f32 %r1131, %r981, %r377, %r1130; + fma.rn.ftz.f32 %r1132, %r981, %r379, %r1131; + shl.b32 %r1133, %r1128, 23; + ex2.approx.ftz.f32 %r1134, %r1132; + mul.f32 %r1135, %r1134, %r1133; + fma.rn.ftz.f32 %r1136, %r982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1137, %r1136; + fma.rm.ftz.f32 %r1138, %r1137, %r373, %r372; + add.f32 %r1139, %r1138, 0fCB40007F; + neg.f32 %r1140, %r1139; + fma.rn.ftz.f32 %r1141, %r982, %r377, %r1140; + fma.rn.ftz.f32 %r1142, %r982, %r379, %r1141; + shl.b32 %r1143, %r1138, 23; + ex2.approx.ftz.f32 %r1144, %r1142; + mul.f32 %r1145, %r1144, %r1143; + fma.rn.ftz.f32 %r1146, %r983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1147, %r1146; + fma.rm.ftz.f32 %r1148, %r1147, %r373, %r372; + add.f32 %r1149, %r1148, 0fCB40007F; + neg.f32 %r1150, %r1149; + fma.rn.ftz.f32 %r1151, %r983, %r377, %r1150; + fma.rn.ftz.f32 %r1152, %r983, %r379, %r1151; + shl.b32 %r1153, %r1148, 23; + ex2.approx.ftz.f32 %r1154, %r1152; + mul.f32 %r1155, %r1154, %r1153; + fma.rn.ftz.f32 %r1156, %r984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1157, %r1156; + fma.rm.ftz.f32 %r1158, %r1157, %r373, %r372; + add.f32 %r1159, %r1158, 0fCB40007F; + neg.f32 %r1160, %r1159; + fma.rn.ftz.f32 %r1161, %r984, %r377, %r1160; + fma.rn.ftz.f32 %r1162, %r984, %r379, %r1161; + shl.b32 %r1163, %r1158, 23; + ex2.approx.ftz.f32 %r1164, %r1162; + mul.f32 %r1165, %r1164, %r1163; + fma.rn.ftz.f32 %r1166, %r985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1167, %r1166; + fma.rm.ftz.f32 %r1168, %r1167, %r373, %r372; + add.f32 %r1169, %r1168, 0fCB40007F; + neg.f32 %r1170, %r1169; + fma.rn.ftz.f32 %r1171, %r985, %r377, %r1170; + fma.rn.ftz.f32 %r1172, %r985, %r379, %r1171; + shl.b32 %r1173, %r1168, 23; + ex2.approx.ftz.f32 %r1174, %r1172; + mul.f32 %r1175, %r1174, %r1173; + fma.rn.ftz.f32 %r1176, %r986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1177, %r1176; + fma.rm.ftz.f32 %r1178, %r1177, %r373, %r372; + add.f32 %r1179, %r1178, 0fCB40007F; + neg.f32 %r1180, %r1179; + fma.rn.ftz.f32 %r1181, %r986, %r377, %r1180; + fma.rn.ftz.f32 %r1182, %r986, %r379, %r1181; + shl.b32 %r1183, %r1178, 23; + ex2.approx.ftz.f32 %r1184, %r1182; + mul.f32 %r1185, %r1184, %r1183; + fma.rn.ftz.f32 %r1186, %r987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1187, %r1186; + fma.rm.ftz.f32 %r1188, %r1187, %r373, %r372; + add.f32 %r1189, %r1188, 0fCB40007F; + neg.f32 %r1190, %r1189; + fma.rn.ftz.f32 %r1191, %r987, %r377, %r1190; + fma.rn.ftz.f32 %r1192, %r987, %r379, %r1191; + shl.b32 %r1193, %r1188, 23; + ex2.approx.ftz.f32 %r1194, %r1192; + mul.f32 %r1195, %r1194, %r1193; + fma.rn.ftz.f32 %r1196, %r988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1197, %r1196; + fma.rm.ftz.f32 %r1198, %r1197, %r373, %r372; + add.f32 %r1199, %r1198, 0fCB40007F; + neg.f32 %r1200, %r1199; + fma.rn.ftz.f32 %r1201, %r988, %r377, %r1200; + fma.rn.ftz.f32 %r1202, %r988, %r379, %r1201; + shl.b32 %r1203, %r1198, 23; + ex2.approx.ftz.f32 %r1204, %r1202; + mul.f32 %r1205, %r1204, %r1203; + fma.rn.ftz.f32 %r1206, %r989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1207, %r1206; + fma.rm.ftz.f32 %r1208, %r1207, %r373, %r372; + add.f32 %r1209, %r1208, 0fCB40007F; + neg.f32 %r1210, %r1209; + fma.rn.ftz.f32 %r1211, %r989, %r377, %r1210; + fma.rn.ftz.f32 %r1212, %r989, %r379, %r1211; + shl.b32 %r1213, %r1208, 23; + ex2.approx.ftz.f32 %r1214, %r1212; + mul.f32 %r1215, %r1214, %r1213; + fma.rn.ftz.f32 %r1216, %r990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1217, %r1216; + fma.rm.ftz.f32 %r1218, %r1217, %r373, %r372; + add.f32 %r1219, %r1218, 0fCB40007F; + neg.f32 %r1220, %r1219; + fma.rn.ftz.f32 %r1221, %r990, %r377, %r1220; + fma.rn.ftz.f32 %r1222, %r990, %r379, %r1221; + shl.b32 %r1223, %r1218, 23; + ex2.approx.ftz.f32 %r1224, %r1222; + mul.f32 %r1225, %r1224, %r1223; + fma.rn.ftz.f32 %r1226, %r991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1227, %r1226; + fma.rm.ftz.f32 %r1228, %r1227, %r373, %r372; + add.f32 %r1229, %r1228, 0fCB40007F; + neg.f32 %r1230, %r1229; + fma.rn.ftz.f32 %r1231, %r991, %r377, %r1230; + fma.rn.ftz.f32 %r1232, %r991, %r379, %r1231; + shl.b32 %r1233, %r1228, 23; + ex2.approx.ftz.f32 %r1234, %r1232; + mul.f32 %r1235, %r1234, %r1233; + fma.rn.ftz.f32 %r1236, %r992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1237, %r1236; + fma.rm.ftz.f32 %r1238, %r1237, %r373, %r372; + add.f32 %r1239, %r1238, 0fCB40007F; + neg.f32 %r1240, %r1239; + fma.rn.ftz.f32 %r1241, %r992, %r377, %r1240; + fma.rn.ftz.f32 %r1242, %r992, %r379, %r1241; + shl.b32 %r1243, %r1238, 23; + ex2.approx.ftz.f32 %r1244, %r1242; + mul.f32 %r1245, %r1244, %r1243; + fma.rn.ftz.f32 %r1246, %r993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1247, %r1246; + fma.rm.ftz.f32 %r1248, %r1247, %r373, %r372; + add.f32 %r1249, %r1248, 0fCB40007F; + neg.f32 %r1250, %r1249; + fma.rn.ftz.f32 %r1251, %r993, %r377, %r1250; + fma.rn.ftz.f32 %r1252, %r993, %r379, %r1251; + shl.b32 %r1253, %r1248, 23; + ex2.approx.ftz.f32 %r1254, %r1252; + mul.f32 %r1255, %r1254, %r1253; + fma.rn.ftz.f32 %r1256, %r994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1257, %r1256; + fma.rm.ftz.f32 %r1258, %r1257, %r373, %r372; + add.f32 %r1259, %r1258, 0fCB40007F; + neg.f32 %r1260, %r1259; + fma.rn.ftz.f32 %r1261, %r994, %r377, %r1260; + fma.rn.ftz.f32 %r1262, %r994, %r379, %r1261; + shl.b32 %r1263, %r1258, 23; + ex2.approx.ftz.f32 %r1264, %r1262; + mul.f32 %r1265, %r1264, %r1263; + fma.rn.ftz.f32 %r1266, %r995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1267, %r1266; + fma.rm.ftz.f32 %r1268, %r1267, %r373, %r372; + add.f32 %r1269, %r1268, 0fCB40007F; + neg.f32 %r1270, %r1269; + fma.rn.ftz.f32 %r1271, %r995, %r377, %r1270; + fma.rn.ftz.f32 %r1272, %r995, %r379, %r1271; + shl.b32 %r1273, %r1268, 23; + ex2.approx.ftz.f32 %r1274, %r1272; + mul.f32 %r1275, %r1274, %r1273; + fma.rn.ftz.f32 %r1276, %r996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1277, %r1276; + fma.rm.ftz.f32 %r1278, %r1277, %r373, %r372; + add.f32 %r1279, %r1278, 0fCB40007F; + neg.f32 %r1280, %r1279; + fma.rn.ftz.f32 %r1281, %r996, %r377, %r1280; + fma.rn.ftz.f32 %r1282, %r996, %r379, %r1281; + shl.b32 %r1283, %r1278, 23; + ex2.approx.ftz.f32 %r1284, %r1282; + mul.f32 %r1285, %r1284, %r1283; + fma.rn.ftz.f32 %r1286, %r997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1287, %r1286; + fma.rm.ftz.f32 %r1288, %r1287, %r373, %r372; + add.f32 %r1289, %r1288, 0fCB40007F; + neg.f32 %r1290, %r1289; + fma.rn.ftz.f32 %r1291, %r997, %r377, %r1290; + fma.rn.ftz.f32 %r1292, %r997, %r379, %r1291; + shl.b32 %r1293, %r1288, 23; + ex2.approx.ftz.f32 %r1294, %r1292; + mul.f32 %r1295, %r1294, %r1293; + fma.rn.ftz.f32 %r1296, %r998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1297, %r1296; + fma.rm.ftz.f32 %r1298, %r1297, %r373, %r372; + add.f32 %r1299, %r1298, 0fCB40007F; + neg.f32 %r1300, %r1299; + fma.rn.ftz.f32 %r1301, %r998, %r377, %r1300; + fma.rn.ftz.f32 %r1302, %r998, %r379, %r1301; + shl.b32 %r1303, %r1298, 23; + ex2.approx.ftz.f32 %r1304, %r1302; + mul.f32 %r1305, %r1304, %r1303; + fma.rn.ftz.f32 %r1306, %r999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1307, %r1306; + fma.rm.ftz.f32 %r1308, %r1307, %r373, %r372; + add.f32 %r1309, %r1308, 0fCB40007F; + neg.f32 %r1310, %r1309; + fma.rn.ftz.f32 %r1311, %r999, %r377, %r1310; + fma.rn.ftz.f32 %r1312, %r999, %r379, %r1311; + shl.b32 %r1313, %r1308, 23; + ex2.approx.ftz.f32 %r1314, %r1312; + mul.f32 %r1315, %r1314, %r1313; + fma.rn.ftz.f32 %r1316, %r1000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1317, %r1316; + fma.rm.ftz.f32 %r1318, %r1317, %r373, %r372; + add.f32 %r1319, %r1318, 0fCB40007F; + neg.f32 %r1320, %r1319; + fma.rn.ftz.f32 %r1321, %r1000, %r377, %r1320; + fma.rn.ftz.f32 %r1322, %r1000, %r379, %r1321; + shl.b32 %r1323, %r1318, 23; + ex2.approx.ftz.f32 %r1324, %r1322; + mul.f32 %r1325, %r1324, %r1323; + fma.rn.ftz.f32 %r1326, %r1001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1327, %r1326; + fma.rm.ftz.f32 %r1328, %r1327, %r373, %r372; + add.f32 %r1329, %r1328, 0fCB40007F; + neg.f32 %r1330, %r1329; + fma.rn.ftz.f32 %r1331, %r1001, %r377, %r1330; + fma.rn.ftz.f32 %r1332, %r1001, %r379, %r1331; + shl.b32 %r1333, %r1328, 23; + ex2.approx.ftz.f32 %r1334, %r1332; + mul.f32 %r1335, %r1334, %r1333; + fma.rn.ftz.f32 %r1336, %r1002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1337, %r1336; + fma.rm.ftz.f32 %r1338, %r1337, %r373, %r372; + add.f32 %r1339, %r1338, 0fCB40007F; + neg.f32 %r1340, %r1339; + fma.rn.ftz.f32 %r1341, %r1002, %r377, %r1340; + fma.rn.ftz.f32 %r1342, %r1002, %r379, %r1341; + shl.b32 %r1343, %r1338, 23; + ex2.approx.ftz.f32 %r1344, %r1342; + mul.f32 %r1345, %r1344, %r1343; + fma.rn.ftz.f32 %r1346, %r1003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1347, %r1346; + fma.rm.ftz.f32 %r1348, %r1347, %r373, %r372; + add.f32 %r1349, %r1348, 0fCB40007F; + neg.f32 %r1350, %r1349; + fma.rn.ftz.f32 %r1351, %r1003, %r377, %r1350; + fma.rn.ftz.f32 %r1352, %r1003, %r379, %r1351; + shl.b32 %r1353, %r1348, 23; + ex2.approx.ftz.f32 %r1354, %r1352; + mul.f32 %r1355, %r1354, %r1353; + fma.rn.ftz.f32 %r1356, %r1004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1357, %r1356; + fma.rm.ftz.f32 %r1358, %r1357, %r373, %r372; + add.f32 %r1359, %r1358, 0fCB40007F; + neg.f32 %r1360, %r1359; + fma.rn.ftz.f32 %r1361, %r1004, %r377, %r1360; + fma.rn.ftz.f32 %r1362, %r1004, %r379, %r1361; + shl.b32 %r1363, %r1358, 23; + ex2.approx.ftz.f32 %r1364, %r1362; + mul.f32 %r1365, %r1364, %r1363; + fma.rn.ftz.f32 %r1366, %r1005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1367, %r1366; + fma.rm.ftz.f32 %r1368, %r1367, %r373, %r372; + add.f32 %r1369, %r1368, 0fCB40007F; + neg.f32 %r1370, %r1369; + fma.rn.ftz.f32 %r1371, %r1005, %r377, %r1370; + fma.rn.ftz.f32 %r1372, %r1005, %r379, %r1371; + shl.b32 %r1373, %r1368, 23; + ex2.approx.ftz.f32 %r1374, %r1372; + mul.f32 %r1375, %r1374, %r1373; + fma.rn.ftz.f32 %r1376, %r1006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1377, %r1376; + fma.rm.ftz.f32 %r1378, %r1377, %r373, %r372; + add.f32 %r1379, %r1378, 0fCB40007F; + neg.f32 %r1380, %r1379; + fma.rn.ftz.f32 %r1381, %r1006, %r377, %r1380; + fma.rn.ftz.f32 %r1382, %r1006, %r379, %r1381; + shl.b32 %r1383, %r1378, 23; + ex2.approx.ftz.f32 %r1384, %r1382; + mul.f32 %r1385, %r1384, %r1383; + fma.rn.ftz.f32 %r1386, %r1007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1387, %r1386; + fma.rm.ftz.f32 %r1388, %r1387, %r373, %r372; + add.f32 %r1389, %r1388, 0fCB40007F; + neg.f32 %r1390, %r1389; + fma.rn.ftz.f32 %r1391, %r1007, %r377, %r1390; + fma.rn.ftz.f32 %r1392, %r1007, %r379, %r1391; + shl.b32 %r1393, %r1388, 23; + ex2.approx.ftz.f32 %r1394, %r1392; + mul.f32 %r1395, %r1394, %r1393; + fma.rn.ftz.f32 %r1396, %r1008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1397, %r1396; + fma.rm.ftz.f32 %r1398, %r1397, %r373, %r372; + add.f32 %r1399, %r1398, 0fCB40007F; + neg.f32 %r1400, %r1399; + fma.rn.ftz.f32 %r1401, %r1008, %r377, %r1400; + fma.rn.ftz.f32 %r1402, %r1008, %r379, %r1401; + shl.b32 %r1403, %r1398, 23; + ex2.approx.ftz.f32 %r1404, %r1402; + mul.f32 %r1405, %r1404, %r1403; + fma.rn.ftz.f32 %r1406, %r1009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1407, %r1406; + fma.rm.ftz.f32 %r1408, %r1407, %r373, %r372; + add.f32 %r1409, %r1408, 0fCB40007F; + neg.f32 %r1410, %r1409; + fma.rn.ftz.f32 %r1411, %r1009, %r377, %r1410; + fma.rn.ftz.f32 %r1412, %r1009, %r379, %r1411; + shl.b32 %r1413, %r1408, 23; + ex2.approx.ftz.f32 %r1414, %r1412; + mul.f32 %r1415, %r1414, %r1413; + fma.rn.ftz.f32 %r1416, %r1010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1417, %r1416; + fma.rm.ftz.f32 %r1418, %r1417, %r373, %r372; + add.f32 %r1419, %r1418, 0fCB40007F; + neg.f32 %r1420, %r1419; + fma.rn.ftz.f32 %r1421, %r1010, %r377, %r1420; + fma.rn.ftz.f32 %r1422, %r1010, %r379, %r1421; + shl.b32 %r1423, %r1418, 23; + ex2.approx.ftz.f32 %r1424, %r1422; + mul.f32 %r1425, %r1424, %r1423; + fma.rn.ftz.f32 %r1426, %r1011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1427, %r1426; + fma.rm.ftz.f32 %r1428, %r1427, %r373, %r372; + add.f32 %r1429, %r1428, 0fCB40007F; + neg.f32 %r1430, %r1429; + fma.rn.ftz.f32 %r1431, %r1011, %r377, %r1430; + fma.rn.ftz.f32 %r1432, %r1011, %r379, %r1431; + shl.b32 %r1433, %r1428, 23; + ex2.approx.ftz.f32 %r1434, %r1432; + mul.f32 %r1435, %r1434, %r1433; + fma.rn.ftz.f32 %r1436, %r1012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1437, %r1436; + fma.rm.ftz.f32 %r1438, %r1437, %r373, %r372; + add.f32 %r1439, %r1438, 0fCB40007F; + neg.f32 %r1440, %r1439; + fma.rn.ftz.f32 %r1441, %r1012, %r377, %r1440; + fma.rn.ftz.f32 %r1442, %r1012, %r379, %r1441; + shl.b32 %r1443, %r1438, 23; + ex2.approx.ftz.f32 %r1444, %r1442; + mul.f32 %r1445, %r1444, %r1443; + fma.rn.ftz.f32 %r1446, %r1013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1447, %r1446; + fma.rm.ftz.f32 %r1448, %r1447, %r373, %r372; + add.f32 %r1449, %r1448, 0fCB40007F; + neg.f32 %r1450, %r1449; + fma.rn.ftz.f32 %r1451, %r1013, %r377, %r1450; + fma.rn.ftz.f32 %r1452, %r1013, %r379, %r1451; + shl.b32 %r1453, %r1448, 23; + ex2.approx.ftz.f32 %r1454, %r1452; + mul.f32 %r1455, %r1454, %r1453; + fma.rn.ftz.f32 %r1456, %r1014, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1457, %r1456; + fma.rm.ftz.f32 %r1458, %r1457, %r373, %r372; + add.f32 %r1459, %r1458, 0fCB40007F; + neg.f32 %r1460, %r1459; + fma.rn.ftz.f32 %r1461, %r1014, %r377, %r1460; + fma.rn.ftz.f32 %r1462, %r1014, %r379, %r1461; + shl.b32 %r1463, %r1458, 23; + ex2.approx.ftz.f32 %r1464, %r1462; + mul.f32 %r1465, %r1464, %r1463; + fma.rn.ftz.f32 %r1466, %r1015, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1467, %r1466; + fma.rm.ftz.f32 %r1468, %r1467, %r373, %r372; + add.f32 %r1469, %r1468, 0fCB40007F; + neg.f32 %r1470, %r1469; + fma.rn.ftz.f32 %r1471, %r1015, %r377, %r1470; + fma.rn.ftz.f32 %r1472, %r1015, %r379, %r1471; + shl.b32 %r1473, %r1468, 23; + ex2.approx.ftz.f32 %r1474, %r1472; + mul.f32 %r1475, %r1474, %r1473; + fma.rn.ftz.f32 %r1476, %r1016, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1477, %r1476; + fma.rm.ftz.f32 %r1478, %r1477, %r373, %r372; + add.f32 %r1479, %r1478, 0fCB40007F; + neg.f32 %r1480, %r1479; + fma.rn.ftz.f32 %r1481, %r1016, %r377, %r1480; + fma.rn.ftz.f32 %r1482, %r1016, %r379, %r1481; + shl.b32 %r1483, %r1478, 23; + ex2.approx.ftz.f32 %r1484, %r1482; + mul.f32 %r1485, %r1484, %r1483; + fma.rn.ftz.f32 %r1486, %r1017, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1487, %r1486; + fma.rm.ftz.f32 %r1488, %r1487, %r373, %r372; + add.f32 %r1489, %r1488, 0fCB40007F; + neg.f32 %r1490, %r1489; + fma.rn.ftz.f32 %r1491, %r1017, %r377, %r1490; + fma.rn.ftz.f32 %r1492, %r1017, %r379, %r1491; + shl.b32 %r1493, %r1488, 23; + ex2.approx.ftz.f32 %r1494, %r1492; + mul.f32 %r1495, %r1494, %r1493; + fma.rn.ftz.f32 %r1496, %r1018, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1497, %r1496; + fma.rm.ftz.f32 %r1498, %r1497, %r373, %r372; + add.f32 %r1499, %r1498, 0fCB40007F; + neg.f32 %r1500, %r1499; + fma.rn.ftz.f32 %r1501, %r1018, %r377, %r1500; + fma.rn.ftz.f32 %r1502, %r1018, %r379, %r1501; + shl.b32 %r1503, %r1498, 23; + ex2.approx.ftz.f32 %r1504, %r1502; + mul.f32 %r1505, %r1504, %r1503; + fma.rn.ftz.f32 %r1506, %r1019, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1507, %r1506; + fma.rm.ftz.f32 %r1508, %r1507, %r373, %r372; + add.f32 %r1509, %r1508, 0fCB40007F; + neg.f32 %r1510, %r1509; + fma.rn.ftz.f32 %r1511, %r1019, %r377, %r1510; + fma.rn.ftz.f32 %r1512, %r1019, %r379, %r1511; + shl.b32 %r1513, %r1508, 23; + ex2.approx.ftz.f32 %r1514, %r1512; + mul.f32 %r1515, %r1514, %r1513; + fma.rn.ftz.f32 %r1516, %r1020, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1517, %r1516; + fma.rm.ftz.f32 %r1518, %r1517, %r373, %r372; + add.f32 %r1519, %r1518, 0fCB40007F; + neg.f32 %r1520, %r1519; + fma.rn.ftz.f32 %r1521, %r1020, %r377, %r1520; + fma.rn.ftz.f32 %r1522, %r1020, %r379, %r1521; + shl.b32 %r1523, %r1518, 23; + ex2.approx.ftz.f32 %r1524, %r1522; + mul.f32 %r1525, %r1524, %r1523; + fma.rn.ftz.f32 %r1526, %r1021, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1527, %r1526; + fma.rm.ftz.f32 %r1528, %r1527, %r373, %r372; + add.f32 %r1529, %r1528, 0fCB40007F; + neg.f32 %r1530, %r1529; + fma.rn.ftz.f32 %r1531, %r1021, %r377, %r1530; + fma.rn.ftz.f32 %r1532, %r1021, %r379, %r1531; + shl.b32 %r1533, %r1528, 23; + ex2.approx.ftz.f32 %r1534, %r1532; + mul.f32 %r1535, %r1534, %r1533; + fma.rn.ftz.f32 %r1536, %r1022, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1537, %r1536; + fma.rm.ftz.f32 %r1538, %r1537, %r373, %r372; + add.f32 %r1539, %r1538, 0fCB40007F; + neg.f32 %r1540, %r1539; + fma.rn.ftz.f32 %r1541, %r1022, %r377, %r1540; + fma.rn.ftz.f32 %r1542, %r1022, %r379, %r1541; + shl.b32 %r1543, %r1538, 23; + ex2.approx.ftz.f32 %r1544, %r1542; + mul.f32 %r1545, %r1544, %r1543; + fma.rn.ftz.f32 %r1546, %r1023, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1547, %r1546; + fma.rm.ftz.f32 %r1548, %r1547, %r373, %r372; + add.f32 %r1549, %r1548, 0fCB40007F; + neg.f32 %r1550, %r1549; + fma.rn.ftz.f32 %r1551, %r1023, %r377, %r1550; + fma.rn.ftz.f32 %r1552, %r1023, %r379, %r1551; + shl.b32 %r1553, %r1548, 23; + ex2.approx.ftz.f32 %r1554, %r1552; + mul.f32 %r1555, %r1554, %r1553; + fma.rn.ftz.f32 %r1556, %r1024, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1557, %r1556; + fma.rm.ftz.f32 %r1558, %r1557, %r373, %r372; + add.f32 %r1559, %r1558, 0fCB40007F; + neg.f32 %r1560, %r1559; + fma.rn.ftz.f32 %r1561, %r1024, %r377, %r1560; + fma.rn.ftz.f32 %r1562, %r1024, %r379, %r1561; + shl.b32 %r1563, %r1558, 23; + ex2.approx.ftz.f32 %r1564, %r1562; + mul.f32 %r1565, %r1564, %r1563; + fma.rn.ftz.f32 %r1566, %r1025, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1567, %r1566; + fma.rm.ftz.f32 %r1568, %r1567, %r373, %r372; + add.f32 %r1569, %r1568, 0fCB40007F; + neg.f32 %r1570, %r1569; + fma.rn.ftz.f32 %r1571, %r1025, %r377, %r1570; + fma.rn.ftz.f32 %r1572, %r1025, %r379, %r1571; + shl.b32 %r1573, %r1568, 23; + ex2.approx.ftz.f32 %r1574, %r1572; + mul.f32 %r1575, %r1574, %r1573; + fma.rn.ftz.f32 %r1576, %r1026, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1577, %r1576; + fma.rm.ftz.f32 %r1578, %r1577, %r373, %r372; + add.f32 %r1579, %r1578, 0fCB40007F; + neg.f32 %r1580, %r1579; + fma.rn.ftz.f32 %r1581, %r1026, %r377, %r1580; + fma.rn.ftz.f32 %r1582, %r1026, %r379, %r1581; + shl.b32 %r1583, %r1578, 23; + ex2.approx.ftz.f32 %r1584, %r1582; + mul.f32 %r1585, %r1584, %r1583; + fma.rn.ftz.f32 %r1586, %r1027, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1587, %r1586; + fma.rm.ftz.f32 %r1588, %r1587, %r373, %r372; + add.f32 %r1589, %r1588, 0fCB40007F; + neg.f32 %r1590, %r1589; + fma.rn.ftz.f32 %r1591, %r1027, %r377, %r1590; + fma.rn.ftz.f32 %r1592, %r1027, %r379, %r1591; + shl.b32 %r1593, %r1588, 23; + ex2.approx.ftz.f32 %r1594, %r1592; + mul.f32 %r1595, %r1594, %r1593; + fma.rn.ftz.f32 %r1596, %r1028, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1597, %r1596; + fma.rm.ftz.f32 %r1598, %r1597, %r373, %r372; + add.f32 %r1599, %r1598, 0fCB40007F; + neg.f32 %r1600, %r1599; + fma.rn.ftz.f32 %r1601, %r1028, %r377, %r1600; + fma.rn.ftz.f32 %r1602, %r1028, %r379, %r1601; + shl.b32 %r1603, %r1598, 23; + ex2.approx.ftz.f32 %r1604, %r1602; + mul.f32 %r1605, %r1604, %r1603; + fma.rn.ftz.f32 %r1606, %r1029, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1607, %r1606; + fma.rm.ftz.f32 %r1608, %r1607, %r373, %r372; + add.f32 %r1609, %r1608, 0fCB40007F; + neg.f32 %r1610, %r1609; + fma.rn.ftz.f32 %r1611, %r1029, %r377, %r1610; + fma.rn.ftz.f32 %r1612, %r1029, %r379, %r1611; + shl.b32 %r1613, %r1608, 23; + ex2.approx.ftz.f32 %r1614, %r1612; + mul.f32 %r1615, %r1614, %r1613; + fma.rn.ftz.f32 %r1616, %r1030, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1617, %r1616; + fma.rm.ftz.f32 %r1618, %r1617, %r373, %r372; + fma.rn.ftz.f32 %r1619, %r1031, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1620, %r1619; + fma.rm.ftz.f32 %r1621, %r1620, %r373, %r372; + fma.rn.ftz.f32 %r1622, %r1032, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1623, %r1622; + fma.rm.ftz.f32 %r1624, %r1623, %r373, %r372; + fma.rn.ftz.f32 %r1625, %r1033, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1626, %r1625; + fma.rm.ftz.f32 %r1627, %r1626, %r373, %r372; + fma.rn.ftz.f32 %r1628, %r1034, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1629, %r1628; + fma.rm.ftz.f32 %r1630, %r1629, %r373, %r372; + fma.rn.ftz.f32 %r1631, %r1035, %r369, %r368; + cvt.ftz.sat.f32.f32 %r1632, %r1631; + fma.rm.ftz.f32 %r1633, %r1632, %r373, %r372; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.f32 %r1634, %r383, 0f00000000, %r1045; + selp.f32 %r1635, 0f3F800000, %r1634, %p39; + fma.rn.f32 %r1636, %r393, 0f00000000, %r1055; + selp.f32 %r1637, 0f3F800000, %r1636, %p40; + fma.rn.f32 %r1638, %r403, 0f00000000, %r1065; + selp.f32 %r1639, 0f3F800000, %r1638, %p41; + fma.rn.f32 %r1640, %r413, 0f00000000, %r1075; + selp.f32 %r1641, 0f3F800000, %r1640, %p42; + fma.rn.f32 %r1642, %r423, 0f00000000, %r1085; + selp.f32 %r1643, 0f3F800000, %r1642, %p43; + fma.rn.f32 %r1644, %r433, 0f00000000, %r1095; + selp.f32 %r1645, 0f3F800000, %r1644, %p44; + fma.rn.f32 %r1646, %r443, 0f00000000, %r1105; + selp.f32 %r1647, 0f3F800000, %r1646, %p45; + fma.rn.f32 %r1648, %r453, 0f00000000, %r1115; + selp.f32 %r1649, 0f3F800000, %r1648, %p46; + fma.rn.f32 %r1650, %r463, 0f00000000, %r1125; + selp.f32 %r1651, 0f3F800000, %r1650, %p47; + fma.rn.f32 %r1652, %r473, 0f00000000, %r1135; + selp.f32 %r1653, 0f3F800000, %r1652, %p48; + fma.rn.f32 %r1654, %r483, 0f00000000, %r1145; + selp.f32 %r1655, 0f3F800000, %r1654, %p49; + fma.rn.f32 %r1656, %r493, 0f00000000, %r1155; + selp.f32 %r1657, 0f3F800000, %r1656, %p50; + fma.rn.f32 %r1658, %r503, 0f00000000, %r1165; + selp.f32 %r1659, 0f3F800000, %r1658, %p51; + fma.rn.f32 %r1660, %r513, 0f00000000, %r1175; + selp.f32 %r1661, 0f3F800000, %r1660, %p52; + fma.rn.f32 %r1662, %r523, 0f00000000, %r1185; + selp.f32 %r1663, 0f3F800000, %r1662, %p53; + fma.rn.f32 %r1664, %r533, 0f00000000, %r1195; + selp.f32 %r1665, 0f3F800000, %r1664, %p54; + fma.rn.f32 %r1666, %r543, 0f00000000, %r1205; + selp.f32 %r1667, 0f3F800000, %r1666, %p55; + fma.rn.f32 %r1668, %r553, 0f00000000, %r1215; + selp.f32 %r1669, 0f3F800000, %r1668, %p56; + fma.rn.f32 %r1670, %r563, 0f00000000, %r1225; + selp.f32 %r1671, 0f3F800000, %r1670, %p57; + fma.rn.f32 %r1672, %r573, 0f00000000, %r1235; + selp.f32 %r1673, 0f3F800000, %r1672, %p58; + fma.rn.f32 %r1674, %r583, 0f00000000, %r1245; + selp.f32 %r1675, 0f3F800000, %r1674, %p59; + fma.rn.f32 %r1676, %r593, 0f00000000, %r1255; + selp.f32 %r1677, 0f3F800000, %r1676, %p60; + fma.rn.f32 %r1678, %r603, 0f00000000, %r1265; + selp.f32 %r1679, 0f3F800000, %r1678, %p61; + fma.rn.f32 %r1680, %r613, 0f00000000, %r1275; + selp.f32 %r1681, 0f3F800000, %r1680, %p62; + fma.rn.f32 %r1682, %r623, 0f00000000, %r1285; + selp.f32 %r1683, 0f3F800000, %r1682, %p63; + fma.rn.f32 %r1684, %r633, 0f00000000, %r1295; + selp.f32 %r1685, 0f3F800000, %r1684, %p64; + fma.rn.f32 %r1686, %r643, 0f00000000, %r1305; + selp.f32 %r1687, 0f3F800000, %r1686, %p65; + fma.rn.f32 %r1688, %r653, 0f00000000, %r1315; + selp.f32 %r1689, 0f3F800000, %r1688, %p66; + fma.rn.f32 %r1690, %r663, 0f00000000, %r1325; + selp.f32 %r1691, 0f3F800000, %r1690, %p67; + fma.rn.f32 %r1692, %r673, 0f00000000, %r1335; + selp.f32 %r1693, 0f3F800000, %r1692, %p68; + fma.rn.f32 %r1694, %r683, 0f00000000, %r1345; + selp.f32 %r1695, 0f3F800000, %r1694, %p69; + fma.rn.f32 %r1696, %r693, 0f00000000, %r1355; + selp.f32 %r1697, 0f3F800000, %r1696, %p70; + fma.rn.f32 %r1698, %r703, 0f00000000, %r1365; + selp.f32 %r1699, 0f3F800000, %r1698, %p71; + fma.rn.f32 %r1700, %r713, 0f00000000, %r1375; + selp.f32 %r1701, 0f3F800000, %r1700, %p72; + fma.rn.f32 %r1702, %r723, 0f00000000, %r1385; + selp.f32 %r1703, 0f3F800000, %r1702, %p73; + fma.rn.f32 %r1704, %r733, 0f00000000, %r1395; + selp.f32 %r1705, 0f3F800000, %r1704, %p74; + fma.rn.f32 %r1706, %r743, 0f00000000, %r1405; + selp.f32 %r1707, 0f3F800000, %r1706, %p75; + fma.rn.f32 %r1708, %r753, 0f00000000, %r1415; + selp.f32 %r1709, 0f3F800000, %r1708, %p76; + fma.rn.f32 %r1710, %r763, 0f00000000, %r1425; + selp.f32 %r1711, 0f3F800000, %r1710, %p77; + fma.rn.f32 %r1712, %r773, 0f00000000, %r1435; + selp.f32 %r1713, 0f3F800000, %r1712, %p78; + fma.rn.f32 %r1714, %r783, 0f00000000, %r1445; + selp.f32 %r1715, 0f3F800000, %r1714, %p79; + fma.rn.f32 %r1716, %r793, 0f00000000, %r1455; + selp.f32 %r1717, 0f3F800000, %r1716, %p80; + fma.rn.f32 %r1718, %r803, 0f00000000, %r1465; + selp.f32 %r1719, 0f3F800000, %r1718, %p81; + fma.rn.f32 %r1720, %r813, 0f00000000, %r1475; + selp.f32 %r1721, 0f3F800000, %r1720, %p82; + fma.rn.f32 %r1722, %r823, 0f00000000, %r1485; + selp.f32 %r1723, 0f3F800000, %r1722, %p83; + fma.rn.f32 %r1724, %r833, 0f00000000, %r1495; + selp.f32 %r1725, 0f3F800000, %r1724, %p84; + fma.rn.f32 %r1726, %r843, 0f00000000, %r1505; + selp.f32 %r1727, 0f3F800000, %r1726, %p85; + fma.rn.f32 %r1728, %r853, 0f00000000, %r1515; + selp.f32 %r1729, 0f3F800000, %r1728, %p86; + fma.rn.f32 %r1730, %r863, 0f00000000, %r1525; + selp.f32 %r1731, 0f3F800000, %r1730, %p87; + fma.rn.f32 %r1732, %r873, 0f00000000, %r1535; + selp.f32 %r1733, 0f3F800000, %r1732, %p88; + fma.rn.f32 %r1734, %r883, 0f00000000, %r1545; + selp.f32 %r1735, 0f3F800000, %r1734, %p89; + fma.rn.f32 %r1736, %r893, 0f00000000, %r1555; + selp.f32 %r1737, 0f3F800000, %r1736, %p90; + fma.rn.f32 %r1738, %r903, 0f00000000, %r1565; + selp.f32 %r1739, 0f3F800000, %r1738, %p91; + fma.rn.f32 %r1740, %r913, 0f00000000, %r1575; + selp.f32 %r1741, 0f3F800000, %r1740, %p92; + fma.rn.f32 %r1742, %r923, 0f00000000, %r1585; + selp.f32 %r1743, 0f3F800000, %r1742, %p93; + fma.rn.f32 %r1744, %r933, 0f00000000, %r1595; + selp.f32 %r1745, 0f3F800000, %r1744, %p94; + fma.rn.f32 %r1746, %r943, 0f00000000, %r1605; + selp.f32 %r1747, 0f3F800000, %r1746, %p95; + fma.rn.f32 %r1748, %r953, 0f00000000, %r1615; + selp.f32 %r1749, 0f3F800000, %r1748, %p96; +$L__tmp2: + .loc 1 45 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:45:54 + selp.f32 %r1750, %r295, 0fFF800000, %p8; + selp.f32 %r1751, %r296, 0fFF800000, %p8; + selp.f32 %r1752, %r297, 0fFF800000, %p8; + selp.f32 %r1753, %r298, 0fFF800000, %p8; + selp.f32 %r1754, %r299, 0fFF800000, %p8; + selp.f32 %r1755, %r300, 0fFF800000, %p8; + selp.f32 %r1756, %r301, 0fFF800000, %p8; + selp.f32 %r1757, %r302, 0fFF800000, %p8; + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r1758, %r1747, 0f00000000, %p8; + selp.f32 %r1759, %r1749, 0f00000000, %p8; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.bf16 %p97, %rs1, %rs2; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.bf16 %p98, %rs1, %rs1; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1760, %r239, %r240, %p98; + selp.f32 %r1761, %r239, %r1760, %p97; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p99, %r1761, %r241; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p100, %r1761, %r1761; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1762, %r1761, %r241, %p100; + selp.f32 %r1763, %r1761, %r1762, %p99; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p101, %r1763, %r242; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p102, %r1763, %r1763; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1764, %r1763, %r242, %p102; + selp.f32 %r1765, %r1763, %r1764, %p101; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p103, %r1765, %r243; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p104, %r1765, %r1765; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1766, %r1765, %r243, %p104; + selp.f32 %r1767, %r1765, %r1766, %p103; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p105, %r1767, %r244; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p106, %r1767, %r1767; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1768, %r1767, %r244, %p106; + selp.f32 %r1769, %r1767, %r1768, %p105; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p107, %r1769, %r245; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p108, %r1769, %r1769; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1770, %r1769, %r245, %p108; + selp.f32 %r1771, %r1769, %r1770, %p107; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p109, %r1771, %r246; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p110, %r1771, %r1771; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1772, %r1771, %r246, %p110; + selp.f32 %r1773, %r1771, %r1772, %p109; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p111, %r1773, %r247; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p112, %r1773, %r1773; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1774, %r1773, %r247, %p112; + selp.f32 %r1775, %r1773, %r1774, %p111; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p113, %r1775, %r248; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p114, %r1775, %r1775; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1776, %r1775, %r248, %p114; + selp.f32 %r1777, %r1775, %r1776, %p113; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p115, %r1777, %r249; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p116, %r1777, %r1777; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1778, %r1777, %r249, %p116; + selp.f32 %r1779, %r1777, %r1778, %p115; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p117, %r1779, %r250; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p118, %r1779, %r1779; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1780, %r1779, %r250, %p118; + selp.f32 %r1781, %r1779, %r1780, %p117; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p119, %r1781, %r251; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p120, %r1781, %r1781; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1782, %r1781, %r251, %p120; + selp.f32 %r1783, %r1781, %r1782, %p119; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p121, %r1783, %r252; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p122, %r1783, %r1783; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1784, %r1783, %r252, %p122; + selp.f32 %r1785, %r1783, %r1784, %p121; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p123, %r1785, %r253; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p124, %r1785, %r1785; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1786, %r1785, %r253, %p124; + selp.f32 %r1787, %r1785, %r1786, %p123; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p125, %r1787, %r254; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p126, %r1787, %r1787; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1788, %r1787, %r254, %p126; + selp.f32 %r1789, %r1787, %r1788, %p125; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p127, %r1789, %r255; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p128, %r1789, %r1789; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1790, %r1789, %r255, %p128; + selp.f32 %r1791, %r1789, %r1790, %p127; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p129, %r1791, %r256; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p130, %r1791, %r1791; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1792, %r1791, %r256, %p130; + selp.f32 %r1793, %r1791, %r1792, %p129; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p131, %r1793, %r257; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p132, %r1793, %r1793; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1794, %r1793, %r257, %p132; + selp.f32 %r1795, %r1793, %r1794, %p131; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p133, %r1795, %r258; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p134, %r1795, %r1795; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1796, %r1795, %r258, %p134; + selp.f32 %r1797, %r1795, %r1796, %p133; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p135, %r1797, %r259; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p136, %r1797, %r1797; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1798, %r1797, %r259, %p136; + selp.f32 %r1799, %r1797, %r1798, %p135; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p137, %r1799, %r260; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p138, %r1799, %r1799; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1800, %r1799, %r260, %p138; + selp.f32 %r1801, %r1799, %r1800, %p137; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p139, %r1801, %r261; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p140, %r1801, %r1801; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1802, %r1801, %r261, %p140; + selp.f32 %r1803, %r1801, %r1802, %p139; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p141, %r1803, %r262; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p142, %r1803, %r1803; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1804, %r1803, %r262, %p142; + selp.f32 %r1805, %r1803, %r1804, %p141; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p143, %r1805, %r263; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p144, %r1805, %r1805; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1806, %r1805, %r263, %p144; + selp.f32 %r1807, %r1805, %r1806, %p143; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p145, %r1807, %r264; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p146, %r1807, %r1807; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1808, %r1807, %r264, %p146; + selp.f32 %r1809, %r1807, %r1808, %p145; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p147, %r1809, %r265; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p148, %r1809, %r1809; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1810, %r1809, %r265, %p148; + selp.f32 %r1811, %r1809, %r1810, %p147; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p149, %r1811, %r266; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p150, %r1811, %r1811; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1812, %r1811, %r266, %p150; + selp.f32 %r1813, %r1811, %r1812, %p149; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p151, %r1813, %r267; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p152, %r1813, %r1813; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1814, %r1813, %r267, %p152; + selp.f32 %r1815, %r1813, %r1814, %p151; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p153, %r1815, %r268; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p154, %r1815, %r1815; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1816, %r1815, %r268, %p154; + selp.f32 %r1817, %r1815, %r1816, %p153; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p155, %r1817, %r269; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p156, %r1817, %r1817; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1818, %r1817, %r269, %p156; + selp.f32 %r1819, %r1817, %r1818, %p155; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p157, %r1819, %r270; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p158, %r1819, %r1819; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1820, %r1819, %r270, %p158; + selp.f32 %r1821, %r1819, %r1820, %p157; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p159, %r1821, %r271; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p160, %r1821, %r1821; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1822, %r1821, %r271, %p160; + selp.f32 %r1823, %r1821, %r1822, %p159; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p161, %r1823, %r272; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p162, %r1823, %r1823; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1824, %r1823, %r272, %p162; + selp.f32 %r1825, %r1823, %r1824, %p161; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p163, %r1825, %r273; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p164, %r1825, %r1825; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1826, %r1825, %r273, %p164; + selp.f32 %r1827, %r1825, %r1826, %p163; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p165, %r1827, %r274; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p166, %r1827, %r1827; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1828, %r1827, %r274, %p166; + selp.f32 %r1829, %r1827, %r1828, %p165; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p167, %r1829, %r275; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p168, %r1829, %r1829; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1830, %r1829, %r275, %p168; + selp.f32 %r1831, %r1829, %r1830, %p167; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p169, %r1831, %r276; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p170, %r1831, %r1831; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1832, %r1831, %r276, %p170; + selp.f32 %r1833, %r1831, %r1832, %p169; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p171, %r1833, %r277; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p172, %r1833, %r1833; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1834, %r1833, %r277, %p172; + selp.f32 %r1835, %r1833, %r1834, %p171; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p173, %r1835, %r278; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p174, %r1835, %r1835; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1836, %r1835, %r278, %p174; + selp.f32 %r1837, %r1835, %r1836, %p173; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p175, %r1837, %r279; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p176, %r1837, %r1837; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1838, %r1837, %r279, %p176; + selp.f32 %r1839, %r1837, %r1838, %p175; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p177, %r1839, %r280; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p178, %r1839, %r1839; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1840, %r1839, %r280, %p178; + selp.f32 %r1841, %r1839, %r1840, %p177; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p179, %r1841, %r281; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p180, %r1841, %r1841; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1842, %r1841, %r281, %p180; + selp.f32 %r1843, %r1841, %r1842, %p179; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p181, %r1843, %r282; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p182, %r1843, %r1843; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1844, %r1843, %r282, %p182; + selp.f32 %r1845, %r1843, %r1844, %p181; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p183, %r1845, %r283; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p184, %r1845, %r1845; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1846, %r1845, %r283, %p184; + selp.f32 %r1847, %r1845, %r1846, %p183; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p185, %r1847, %r284; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p186, %r1847, %r1847; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1848, %r1847, %r284, %p186; + selp.f32 %r1849, %r1847, %r1848, %p185; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p187, %r1849, %r285; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p188, %r1849, %r1849; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1850, %r1849, %r285, %p188; + selp.f32 %r1851, %r1849, %r1850, %p187; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p189, %r1851, %r286; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p190, %r1851, %r1851; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1852, %r1851, %r286, %p190; + selp.f32 %r1853, %r1851, %r1852, %p189; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p191, %r1853, %r287; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p192, %r1853, %r1853; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1854, %r1853, %r287, %p192; + selp.f32 %r1855, %r1853, %r1854, %p191; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p193, %r1855, %r288; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p194, %r1855, %r1855; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1856, %r1855, %r288, %p194; + selp.f32 %r1857, %r1855, %r1856, %p193; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p195, %r1857, %r289; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p196, %r1857, %r1857; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1858, %r1857, %r289, %p196; + selp.f32 %r1859, %r1857, %r1858, %p195; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p197, %r1859, %r290; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p198, %r1859, %r1859; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1860, %r1859, %r290, %p198; + selp.f32 %r1861, %r1859, %r1860, %p197; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p199, %r1861, %r291; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p200, %r1861, %r1861; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1862, %r1861, %r291, %p200; + selp.f32 %r1863, %r1861, %r1862, %p199; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p201, %r1863, %r292; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p202, %r1863, %r1863; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1864, %r1863, %r292, %p202; + selp.f32 %r1865, %r1863, %r1864, %p201; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p203, %r1865, %r293; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p204, %r1865, %r1865; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1866, %r1865, %r293, %p204; + selp.f32 %r1867, %r1865, %r1866, %p203; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p205, %r1867, %r294; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p206, %r1867, %r1867; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1868, %r1867, %r294, %p206; + selp.f32 %r1869, %r1867, %r1868, %p205; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p207, %r1869, %r1750; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p208, %r1869, %r1869; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1870, %r1869, %r1750, %p208; + selp.f32 %r1871, %r1869, %r1870, %p207; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p209, %r1871, %r1751; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p210, %r1871, %r1871; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1872, %r1871, %r1751, %p210; + selp.f32 %r1873, %r1871, %r1872, %p209; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p211, %r1873, %r1752; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p212, %r1873, %r1873; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1874, %r1873, %r1752, %p212; + selp.f32 %r1875, %r1873, %r1874, %p211; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p213, %r1875, %r1753; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p214, %r1875, %r1875; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1876, %r1875, %r1753, %p214; + selp.f32 %r1877, %r1875, %r1876, %p213; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p215, %r1877, %r1754; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p216, %r1877, %r1877; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1878, %r1877, %r1754, %p216; + selp.f32 %r1879, %r1877, %r1878, %p215; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p217, %r1879, %r1755; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p218, %r1879, %r1879; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1880, %r1879, %r1755, %p218; + selp.f32 %r1881, %r1879, %r1880, %p217; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p219, %r1881, %r1756; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p220, %r1881, %r1881; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1882, %r1881, %r1756, %p220; + selp.f32 %r1883, %r1881, %r1882, %p219; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p221, %r1883, %r1757; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p222, %r1883, %r1883; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1884, %r1883, %r1757, %p222; + selp.f32 %r1885, %r1883, %r1884, %p221; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1886, %r1885, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p223, %r1885, %r1886; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p224, %r1885, %r1885; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1887, %r1885, %r1886, %p223; + selp.f32 %r1888, %r1885, %r1887, %p224; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1889, %r1888, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p225, %r1888, %r1889; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p226, %r1888, %r1888; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1890, %r1888, %r1889, %p226; + selp.f32 %r1891, %r1888, %r1890, %p225; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1892, %r1891, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p227, %r1891, %r1892; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p228, %r1891, %r1891; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1893, %r1891, %r1892, %p228; + selp.f32 %r1894, %r1891, %r1893, %p227; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1895, %r1894, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p229, %r1894, %r1895; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p230, %r1894, %r1894; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1896, %r1894, %r1895, %p230; + selp.f32 %r1897, %r1894, %r1896, %p229; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1898, %r1897, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p231, %r1897, %r1898; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p232, %r1897, %r1897; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p9, %r207, 0; + shr.u32 %r1899, %r206, 3; + and.b32 %r1900, %r1899, 60; + mov.b32 %r1901, global_smem; + add.s32 %r65, %r1901, %r1900; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r1902, %r1897, %r1898, %p232; + selp.b32 %r66, %r1897, %r1902, %p231; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p9 st.shared.b32 [ %r65 + 0 ], %r66; + // end inline asm + bar.sync 0; + setp.lt.u32 %p10, %r206, 16; + shl.b32 %r1903, %r206, 2; + add.s32 %r68, %r1901, %r1903; + // begin inline asm + @%p10 ld.shared.b32 %r67, [ %r68 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1904, %r67, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p233, %r67, %r1904; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p234, %r67, %r67; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1905, %r67, %r1904, %p233; + selp.f32 %r1906, %r67, %r1905, %p234; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1907, %r1906, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p235, %r1906, %r1907; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p236, %r1906, %r1906; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1908, %r1906, %r1907, %p236; + selp.f32 %r1909, %r1906, %r1908, %p235; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1910, %r1909, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p237, %r1909, %r1910; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p238, %r1909, %r1909; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1911, %r1909, %r1910, %p238; + selp.f32 %r1912, %r1909, %r1911, %p237; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r1913, %r1912, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.gt.f32 %p239, %r1912, %r1913; + .loc 2 112 21 // triton_helpers.py:112:21 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.nan.f32 %p240, %r1912, %r1912; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.b32 %p11, %r206, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.b32 %r1914, %r1912, %r1913, %p240; + selp.b32 %r70, %r1912, %r1914, %p239; + .loc 2 123 29 // triton_helpers.py:123:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p11 st.shared.b32 [ %r68 + 0 ], %r70; + // end inline asm + bar.sync 0; + ld.shared.b32 %r1915, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + setp.eq.f32 %p241, %r1915, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + sub.f32 %r1916, %r239, %r1915; + sub.f32 %r1917, %r240, %r1915; + sub.f32 %r1918, %r241, %r1915; + sub.f32 %r1919, %r242, %r1915; + sub.f32 %r1920, %r243, %r1915; + sub.f32 %r1921, %r244, %r1915; + sub.f32 %r1922, %r245, %r1915; + sub.f32 %r1923, %r246, %r1915; + sub.f32 %r1924, %r247, %r1915; + sub.f32 %r1925, %r248, %r1915; + sub.f32 %r1926, %r249, %r1915; + sub.f32 %r1927, %r250, %r1915; + sub.f32 %r1928, %r251, %r1915; + sub.f32 %r1929, %r252, %r1915; + sub.f32 %r1930, %r253, %r1915; + sub.f32 %r1931, %r254, %r1915; + sub.f32 %r1932, %r255, %r1915; + sub.f32 %r1933, %r256, %r1915; + sub.f32 %r1934, %r257, %r1915; + sub.f32 %r1935, %r258, %r1915; + sub.f32 %r1936, %r259, %r1915; + sub.f32 %r1937, %r260, %r1915; + sub.f32 %r1938, %r261, %r1915; + sub.f32 %r1939, %r262, %r1915; + sub.f32 %r1940, %r263, %r1915; + sub.f32 %r1941, %r264, %r1915; + sub.f32 %r1942, %r265, %r1915; + sub.f32 %r1943, %r266, %r1915; + sub.f32 %r1944, %r267, %r1915; + sub.f32 %r1945, %r268, %r1915; + sub.f32 %r1946, %r269, %r1915; + sub.f32 %r1947, %r270, %r1915; + sub.f32 %r1948, %r271, %r1915; + sub.f32 %r1949, %r272, %r1915; + sub.f32 %r1950, %r273, %r1915; + sub.f32 %r1951, %r274, %r1915; + sub.f32 %r1952, %r275, %r1915; + sub.f32 %r1953, %r276, %r1915; + sub.f32 %r1954, %r277, %r1915; + sub.f32 %r1955, %r278, %r1915; + sub.f32 %r1956, %r279, %r1915; + sub.f32 %r1957, %r280, %r1915; + sub.f32 %r1958, %r281, %r1915; + sub.f32 %r1959, %r282, %r1915; + sub.f32 %r1960, %r283, %r1915; + sub.f32 %r1961, %r284, %r1915; + sub.f32 %r1962, %r285, %r1915; + sub.f32 %r1963, %r286, %r1915; + sub.f32 %r1964, %r287, %r1915; + sub.f32 %r1965, %r288, %r1915; + sub.f32 %r1966, %r289, %r1915; + sub.f32 %r1967, %r290, %r1915; + sub.f32 %r1968, %r291, %r1915; + sub.f32 %r1969, %r292, %r1915; + sub.f32 %r1970, %r293, %r1915; + sub.f32 %r1971, %r294, %r1915; + sub.f32 %r1972, %r1750, %r1915; + sub.f32 %r1973, %r1751, %r1915; + sub.f32 %r1974, %r1752, %r1915; + sub.f32 %r1975, %r1753, %r1915; + sub.f32 %r1976, %r1754, %r1915; + sub.f32 %r1977, %r1755, %r1915; + sub.f32 %r1978, %r1756, %r1915; + sub.f32 %r1979, %r1757, %r1915; + .loc 2 180 58 // triton_helpers.py:180:58 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + selp.f32 %r1980, 0f00000000, %r1916, %p241; + selp.f32 %r1981, 0f00000000, %r1917, %p241; + selp.f32 %r1982, 0f00000000, %r1918, %p241; + selp.f32 %r1983, 0f00000000, %r1919, %p241; + selp.f32 %r1984, 0f00000000, %r1920, %p241; + selp.f32 %r1985, 0f00000000, %r1921, %p241; + selp.f32 %r1986, 0f00000000, %r1922, %p241; + selp.f32 %r1987, 0f00000000, %r1923, %p241; + selp.f32 %r1988, 0f00000000, %r1924, %p241; + selp.f32 %r1989, 0f00000000, %r1925, %p241; + selp.f32 %r1990, 0f00000000, %r1926, %p241; + selp.f32 %r1991, 0f00000000, %r1927, %p241; + selp.f32 %r1992, 0f00000000, %r1928, %p241; + selp.f32 %r1993, 0f00000000, %r1929, %p241; + selp.f32 %r1994, 0f00000000, %r1930, %p241; + selp.f32 %r1995, 0f00000000, %r1931, %p241; + selp.f32 %r1996, 0f00000000, %r1932, %p241; + selp.f32 %r1997, 0f00000000, %r1933, %p241; + selp.f32 %r1998, 0f00000000, %r1934, %p241; + selp.f32 %r1999, 0f00000000, %r1935, %p241; + selp.f32 %r2000, 0f00000000, %r1936, %p241; + selp.f32 %r2001, 0f00000000, %r1937, %p241; + selp.f32 %r2002, 0f00000000, %r1938, %p241; + selp.f32 %r2003, 0f00000000, %r1939, %p241; + selp.f32 %r2004, 0f00000000, %r1940, %p241; + selp.f32 %r2005, 0f00000000, %r1941, %p241; + selp.f32 %r2006, 0f00000000, %r1942, %p241; + selp.f32 %r2007, 0f00000000, %r1943, %p241; + selp.f32 %r2008, 0f00000000, %r1944, %p241; + selp.f32 %r2009, 0f00000000, %r1945, %p241; + selp.f32 %r2010, 0f00000000, %r1946, %p241; + selp.f32 %r2011, 0f00000000, %r1947, %p241; + selp.f32 %r2012, 0f00000000, %r1948, %p241; + selp.f32 %r2013, 0f00000000, %r1949, %p241; + selp.f32 %r2014, 0f00000000, %r1950, %p241; + selp.f32 %r2015, 0f00000000, %r1951, %p241; + selp.f32 %r2016, 0f00000000, %r1952, %p241; + selp.f32 %r2017, 0f00000000, %r1953, %p241; + selp.f32 %r2018, 0f00000000, %r1954, %p241; + selp.f32 %r2019, 0f00000000, %r1955, %p241; + selp.f32 %r2020, 0f00000000, %r1956, %p241; + selp.f32 %r2021, 0f00000000, %r1957, %p241; + selp.f32 %r2022, 0f00000000, %r1958, %p241; + selp.f32 %r2023, 0f00000000, %r1959, %p241; + selp.f32 %r2024, 0f00000000, %r1960, %p241; + selp.f32 %r2025, 0f00000000, %r1961, %p241; + selp.f32 %r2026, 0f00000000, %r1962, %p241; + selp.f32 %r2027, 0f00000000, %r1963, %p241; + selp.f32 %r2028, 0f00000000, %r1964, %p241; + selp.f32 %r2029, 0f00000000, %r1965, %p241; + selp.f32 %r2030, 0f00000000, %r1966, %p241; + selp.f32 %r2031, 0f00000000, %r1967, %p241; + selp.f32 %r2032, 0f00000000, %r1968, %p241; + selp.f32 %r2033, 0f00000000, %r1969, %p241; + selp.f32 %r2034, 0f00000000, %r1970, %p241; + selp.f32 %r2035, 0f00000000, %r1971, %p241; + selp.f32 %r2036, 0f00000000, %r1972, %p241; + selp.f32 %r2037, 0f00000000, %r1973, %p241; + selp.f32 %r2038, 0f00000000, %r1974, %p241; + selp.f32 %r2039, 0f00000000, %r1975, %p241; + selp.f32 %r2040, 0f00000000, %r1976, %p241; + selp.f32 %r2041, 0f00000000, %r1977, %p241; + selp.f32 %r2042, 0f00000000, %r1978, %p241; + selp.f32 %r2043, 0f00000000, %r1979, %p241; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.ftz.f32 %r2044, %r1980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2045, %r2044; + fma.rm.ftz.f32 %r2046, %r2045, %r373, %r372; + add.f32 %r2047, %r2046, 0fCB40007F; + neg.f32 %r2048, %r2047; + fma.rn.ftz.f32 %r2049, %r1980, %r377, %r2048; + fma.rn.ftz.f32 %r2050, %r1980, %r379, %r2049; + shl.b32 %r2051, %r2046, 23; + ex2.approx.ftz.f32 %r2052, %r2050; + mul.f32 %r2053, %r2052, %r2051; + fma.rn.ftz.f32 %r2054, %r1981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2055, %r2054; + fma.rm.ftz.f32 %r2056, %r2055, %r373, %r372; + add.f32 %r2057, %r2056, 0fCB40007F; + neg.f32 %r2058, %r2057; + fma.rn.ftz.f32 %r2059, %r1981, %r377, %r2058; + fma.rn.ftz.f32 %r2060, %r1981, %r379, %r2059; + shl.b32 %r2061, %r2056, 23; + ex2.approx.ftz.f32 %r2062, %r2060; + mul.f32 %r2063, %r2062, %r2061; + fma.rn.ftz.f32 %r2064, %r1982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2065, %r2064; + fma.rm.ftz.f32 %r2066, %r2065, %r373, %r372; + add.f32 %r2067, %r2066, 0fCB40007F; + neg.f32 %r2068, %r2067; + fma.rn.ftz.f32 %r2069, %r1982, %r377, %r2068; + fma.rn.ftz.f32 %r2070, %r1982, %r379, %r2069; + shl.b32 %r2071, %r2066, 23; + ex2.approx.ftz.f32 %r2072, %r2070; + mul.f32 %r2073, %r2072, %r2071; + fma.rn.ftz.f32 %r2074, %r1983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2075, %r2074; + fma.rm.ftz.f32 %r2076, %r2075, %r373, %r372; + add.f32 %r2077, %r2076, 0fCB40007F; + neg.f32 %r2078, %r2077; + fma.rn.ftz.f32 %r2079, %r1983, %r377, %r2078; + fma.rn.ftz.f32 %r2080, %r1983, %r379, %r2079; + shl.b32 %r2081, %r2076, 23; + ex2.approx.ftz.f32 %r2082, %r2080; + mul.f32 %r2083, %r2082, %r2081; + fma.rn.ftz.f32 %r2084, %r1984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2085, %r2084; + fma.rm.ftz.f32 %r2086, %r2085, %r373, %r372; + add.f32 %r2087, %r2086, 0fCB40007F; + neg.f32 %r2088, %r2087; + fma.rn.ftz.f32 %r2089, %r1984, %r377, %r2088; + fma.rn.ftz.f32 %r2090, %r1984, %r379, %r2089; + shl.b32 %r2091, %r2086, 23; + ex2.approx.ftz.f32 %r2092, %r2090; + mul.f32 %r2093, %r2092, %r2091; + fma.rn.ftz.f32 %r2094, %r1985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2095, %r2094; + fma.rm.ftz.f32 %r2096, %r2095, %r373, %r372; + add.f32 %r2097, %r2096, 0fCB40007F; + neg.f32 %r2098, %r2097; + fma.rn.ftz.f32 %r2099, %r1985, %r377, %r2098; + fma.rn.ftz.f32 %r2100, %r1985, %r379, %r2099; + shl.b32 %r2101, %r2096, 23; + ex2.approx.ftz.f32 %r2102, %r2100; + mul.f32 %r2103, %r2102, %r2101; + fma.rn.ftz.f32 %r2104, %r1986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2105, %r2104; + fma.rm.ftz.f32 %r2106, %r2105, %r373, %r372; + add.f32 %r2107, %r2106, 0fCB40007F; + neg.f32 %r2108, %r2107; + fma.rn.ftz.f32 %r2109, %r1986, %r377, %r2108; + fma.rn.ftz.f32 %r2110, %r1986, %r379, %r2109; + shl.b32 %r2111, %r2106, 23; + ex2.approx.ftz.f32 %r2112, %r2110; + mul.f32 %r2113, %r2112, %r2111; + fma.rn.ftz.f32 %r2114, %r1987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2115, %r2114; + fma.rm.ftz.f32 %r2116, %r2115, %r373, %r372; + add.f32 %r2117, %r2116, 0fCB40007F; + neg.f32 %r2118, %r2117; + fma.rn.ftz.f32 %r2119, %r1987, %r377, %r2118; + fma.rn.ftz.f32 %r2120, %r1987, %r379, %r2119; + shl.b32 %r2121, %r2116, 23; + ex2.approx.ftz.f32 %r2122, %r2120; + mul.f32 %r2123, %r2122, %r2121; + fma.rn.ftz.f32 %r2124, %r1988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2125, %r2124; + fma.rm.ftz.f32 %r2126, %r2125, %r373, %r372; + add.f32 %r2127, %r2126, 0fCB40007F; + neg.f32 %r2128, %r2127; + fma.rn.ftz.f32 %r2129, %r1988, %r377, %r2128; + fma.rn.ftz.f32 %r2130, %r1988, %r379, %r2129; + shl.b32 %r2131, %r2126, 23; + ex2.approx.ftz.f32 %r2132, %r2130; + mul.f32 %r2133, %r2132, %r2131; + fma.rn.ftz.f32 %r2134, %r1989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2135, %r2134; + fma.rm.ftz.f32 %r2136, %r2135, %r373, %r372; + add.f32 %r2137, %r2136, 0fCB40007F; + neg.f32 %r2138, %r2137; + fma.rn.ftz.f32 %r2139, %r1989, %r377, %r2138; + fma.rn.ftz.f32 %r2140, %r1989, %r379, %r2139; + shl.b32 %r2141, %r2136, 23; + ex2.approx.ftz.f32 %r2142, %r2140; + mul.f32 %r2143, %r2142, %r2141; + fma.rn.ftz.f32 %r2144, %r1990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2145, %r2144; + fma.rm.ftz.f32 %r2146, %r2145, %r373, %r372; + add.f32 %r2147, %r2146, 0fCB40007F; + neg.f32 %r2148, %r2147; + fma.rn.ftz.f32 %r2149, %r1990, %r377, %r2148; + fma.rn.ftz.f32 %r2150, %r1990, %r379, %r2149; + shl.b32 %r2151, %r2146, 23; + ex2.approx.ftz.f32 %r2152, %r2150; + mul.f32 %r2153, %r2152, %r2151; + fma.rn.ftz.f32 %r2154, %r1991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2155, %r2154; + fma.rm.ftz.f32 %r2156, %r2155, %r373, %r372; + add.f32 %r2157, %r2156, 0fCB40007F; + neg.f32 %r2158, %r2157; + fma.rn.ftz.f32 %r2159, %r1991, %r377, %r2158; + fma.rn.ftz.f32 %r2160, %r1991, %r379, %r2159; + shl.b32 %r2161, %r2156, 23; + ex2.approx.ftz.f32 %r2162, %r2160; + mul.f32 %r2163, %r2162, %r2161; + fma.rn.ftz.f32 %r2164, %r1992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2165, %r2164; + fma.rm.ftz.f32 %r2166, %r2165, %r373, %r372; + add.f32 %r2167, %r2166, 0fCB40007F; + neg.f32 %r2168, %r2167; + fma.rn.ftz.f32 %r2169, %r1992, %r377, %r2168; + fma.rn.ftz.f32 %r2170, %r1992, %r379, %r2169; + shl.b32 %r2171, %r2166, 23; + ex2.approx.ftz.f32 %r2172, %r2170; + mul.f32 %r2173, %r2172, %r2171; + fma.rn.ftz.f32 %r2174, %r1993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2175, %r2174; + fma.rm.ftz.f32 %r2176, %r2175, %r373, %r372; + add.f32 %r2177, %r2176, 0fCB40007F; + neg.f32 %r2178, %r2177; + fma.rn.ftz.f32 %r2179, %r1993, %r377, %r2178; + fma.rn.ftz.f32 %r2180, %r1993, %r379, %r2179; + shl.b32 %r2181, %r2176, 23; + ex2.approx.ftz.f32 %r2182, %r2180; + mul.f32 %r2183, %r2182, %r2181; + fma.rn.ftz.f32 %r2184, %r1994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2185, %r2184; + fma.rm.ftz.f32 %r2186, %r2185, %r373, %r372; + add.f32 %r2187, %r2186, 0fCB40007F; + neg.f32 %r2188, %r2187; + fma.rn.ftz.f32 %r2189, %r1994, %r377, %r2188; + fma.rn.ftz.f32 %r2190, %r1994, %r379, %r2189; + shl.b32 %r2191, %r2186, 23; + ex2.approx.ftz.f32 %r2192, %r2190; + mul.f32 %r2193, %r2192, %r2191; + fma.rn.ftz.f32 %r2194, %r1995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2195, %r2194; + fma.rm.ftz.f32 %r2196, %r2195, %r373, %r372; + add.f32 %r2197, %r2196, 0fCB40007F; + neg.f32 %r2198, %r2197; + fma.rn.ftz.f32 %r2199, %r1995, %r377, %r2198; + fma.rn.ftz.f32 %r2200, %r1995, %r379, %r2199; + shl.b32 %r2201, %r2196, 23; + ex2.approx.ftz.f32 %r2202, %r2200; + mul.f32 %r2203, %r2202, %r2201; + fma.rn.ftz.f32 %r2204, %r1996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2205, %r2204; + fma.rm.ftz.f32 %r2206, %r2205, %r373, %r372; + add.f32 %r2207, %r2206, 0fCB40007F; + neg.f32 %r2208, %r2207; + fma.rn.ftz.f32 %r2209, %r1996, %r377, %r2208; + fma.rn.ftz.f32 %r2210, %r1996, %r379, %r2209; + shl.b32 %r2211, %r2206, 23; + ex2.approx.ftz.f32 %r2212, %r2210; + mul.f32 %r2213, %r2212, %r2211; + fma.rn.ftz.f32 %r2214, %r1997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2215, %r2214; + fma.rm.ftz.f32 %r2216, %r2215, %r373, %r372; + add.f32 %r2217, %r2216, 0fCB40007F; + neg.f32 %r2218, %r2217; + fma.rn.ftz.f32 %r2219, %r1997, %r377, %r2218; + fma.rn.ftz.f32 %r2220, %r1997, %r379, %r2219; + shl.b32 %r2221, %r2216, 23; + ex2.approx.ftz.f32 %r2222, %r2220; + mul.f32 %r2223, %r2222, %r2221; + fma.rn.ftz.f32 %r2224, %r1998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2225, %r2224; + fma.rm.ftz.f32 %r2226, %r2225, %r373, %r372; + add.f32 %r2227, %r2226, 0fCB40007F; + neg.f32 %r2228, %r2227; + fma.rn.ftz.f32 %r2229, %r1998, %r377, %r2228; + fma.rn.ftz.f32 %r2230, %r1998, %r379, %r2229; + shl.b32 %r2231, %r2226, 23; + ex2.approx.ftz.f32 %r2232, %r2230; + mul.f32 %r2233, %r2232, %r2231; + fma.rn.ftz.f32 %r2234, %r1999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2235, %r2234; + fma.rm.ftz.f32 %r2236, %r2235, %r373, %r372; + add.f32 %r2237, %r2236, 0fCB40007F; + neg.f32 %r2238, %r2237; + fma.rn.ftz.f32 %r2239, %r1999, %r377, %r2238; + fma.rn.ftz.f32 %r2240, %r1999, %r379, %r2239; + shl.b32 %r2241, %r2236, 23; + ex2.approx.ftz.f32 %r2242, %r2240; + mul.f32 %r2243, %r2242, %r2241; + fma.rn.ftz.f32 %r2244, %r2000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2245, %r2244; + fma.rm.ftz.f32 %r2246, %r2245, %r373, %r372; + add.f32 %r2247, %r2246, 0fCB40007F; + neg.f32 %r2248, %r2247; + fma.rn.ftz.f32 %r2249, %r2000, %r377, %r2248; + fma.rn.ftz.f32 %r2250, %r2000, %r379, %r2249; + shl.b32 %r2251, %r2246, 23; + ex2.approx.ftz.f32 %r2252, %r2250; + mul.f32 %r2253, %r2252, %r2251; + fma.rn.ftz.f32 %r2254, %r2001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2255, %r2254; + fma.rm.ftz.f32 %r2256, %r2255, %r373, %r372; + add.f32 %r2257, %r2256, 0fCB40007F; + neg.f32 %r2258, %r2257; + fma.rn.ftz.f32 %r2259, %r2001, %r377, %r2258; + fma.rn.ftz.f32 %r2260, %r2001, %r379, %r2259; + shl.b32 %r2261, %r2256, 23; + ex2.approx.ftz.f32 %r2262, %r2260; + mul.f32 %r2263, %r2262, %r2261; + fma.rn.ftz.f32 %r2264, %r2002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2265, %r2264; + fma.rm.ftz.f32 %r2266, %r2265, %r373, %r372; + add.f32 %r2267, %r2266, 0fCB40007F; + neg.f32 %r2268, %r2267; + fma.rn.ftz.f32 %r2269, %r2002, %r377, %r2268; + fma.rn.ftz.f32 %r2270, %r2002, %r379, %r2269; + shl.b32 %r2271, %r2266, 23; + ex2.approx.ftz.f32 %r2272, %r2270; + mul.f32 %r2273, %r2272, %r2271; + fma.rn.ftz.f32 %r2274, %r2003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2275, %r2274; + fma.rm.ftz.f32 %r2276, %r2275, %r373, %r372; + add.f32 %r2277, %r2276, 0fCB40007F; + neg.f32 %r2278, %r2277; + fma.rn.ftz.f32 %r2279, %r2003, %r377, %r2278; + fma.rn.ftz.f32 %r2280, %r2003, %r379, %r2279; + shl.b32 %r2281, %r2276, 23; + ex2.approx.ftz.f32 %r2282, %r2280; + mul.f32 %r2283, %r2282, %r2281; + fma.rn.ftz.f32 %r2284, %r2004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2285, %r2284; + fma.rm.ftz.f32 %r2286, %r2285, %r373, %r372; + add.f32 %r2287, %r2286, 0fCB40007F; + neg.f32 %r2288, %r2287; + fma.rn.ftz.f32 %r2289, %r2004, %r377, %r2288; + fma.rn.ftz.f32 %r2290, %r2004, %r379, %r2289; + shl.b32 %r2291, %r2286, 23; + ex2.approx.ftz.f32 %r2292, %r2290; + mul.f32 %r2293, %r2292, %r2291; + fma.rn.ftz.f32 %r2294, %r2005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2295, %r2294; + fma.rm.ftz.f32 %r2296, %r2295, %r373, %r372; + add.f32 %r2297, %r2296, 0fCB40007F; + neg.f32 %r2298, %r2297; + fma.rn.ftz.f32 %r2299, %r2005, %r377, %r2298; + fma.rn.ftz.f32 %r2300, %r2005, %r379, %r2299; + shl.b32 %r2301, %r2296, 23; + ex2.approx.ftz.f32 %r2302, %r2300; + mul.f32 %r2303, %r2302, %r2301; + fma.rn.ftz.f32 %r2304, %r2006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2305, %r2304; + fma.rm.ftz.f32 %r2306, %r2305, %r373, %r372; + add.f32 %r2307, %r2306, 0fCB40007F; + neg.f32 %r2308, %r2307; + fma.rn.ftz.f32 %r2309, %r2006, %r377, %r2308; + fma.rn.ftz.f32 %r2310, %r2006, %r379, %r2309; + shl.b32 %r2311, %r2306, 23; + ex2.approx.ftz.f32 %r2312, %r2310; + mul.f32 %r2313, %r2312, %r2311; + fma.rn.ftz.f32 %r2314, %r2007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2315, %r2314; + fma.rm.ftz.f32 %r2316, %r2315, %r373, %r372; + add.f32 %r2317, %r2316, 0fCB40007F; + neg.f32 %r2318, %r2317; + fma.rn.ftz.f32 %r2319, %r2007, %r377, %r2318; + fma.rn.ftz.f32 %r2320, %r2007, %r379, %r2319; + shl.b32 %r2321, %r2316, 23; + ex2.approx.ftz.f32 %r2322, %r2320; + mul.f32 %r2323, %r2322, %r2321; + fma.rn.ftz.f32 %r2324, %r2008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2325, %r2324; + fma.rm.ftz.f32 %r2326, %r2325, %r373, %r372; + add.f32 %r2327, %r2326, 0fCB40007F; + neg.f32 %r2328, %r2327; + fma.rn.ftz.f32 %r2329, %r2008, %r377, %r2328; + fma.rn.ftz.f32 %r2330, %r2008, %r379, %r2329; + shl.b32 %r2331, %r2326, 23; + ex2.approx.ftz.f32 %r2332, %r2330; + mul.f32 %r2333, %r2332, %r2331; + fma.rn.ftz.f32 %r2334, %r2009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2335, %r2334; + fma.rm.ftz.f32 %r2336, %r2335, %r373, %r372; + add.f32 %r2337, %r2336, 0fCB40007F; + neg.f32 %r2338, %r2337; + fma.rn.ftz.f32 %r2339, %r2009, %r377, %r2338; + fma.rn.ftz.f32 %r2340, %r2009, %r379, %r2339; + shl.b32 %r2341, %r2336, 23; + ex2.approx.ftz.f32 %r2342, %r2340; + mul.f32 %r2343, %r2342, %r2341; + fma.rn.ftz.f32 %r2344, %r2010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2345, %r2344; + fma.rm.ftz.f32 %r2346, %r2345, %r373, %r372; + add.f32 %r2347, %r2346, 0fCB40007F; + neg.f32 %r2348, %r2347; + fma.rn.ftz.f32 %r2349, %r2010, %r377, %r2348; + fma.rn.ftz.f32 %r2350, %r2010, %r379, %r2349; + shl.b32 %r2351, %r2346, 23; + ex2.approx.ftz.f32 %r2352, %r2350; + mul.f32 %r2353, %r2352, %r2351; + fma.rn.ftz.f32 %r2354, %r2011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2355, %r2354; + fma.rm.ftz.f32 %r2356, %r2355, %r373, %r372; + add.f32 %r2357, %r2356, 0fCB40007F; + neg.f32 %r2358, %r2357; + fma.rn.ftz.f32 %r2359, %r2011, %r377, %r2358; + fma.rn.ftz.f32 %r2360, %r2011, %r379, %r2359; + shl.b32 %r2361, %r2356, 23; + ex2.approx.ftz.f32 %r2362, %r2360; + mul.f32 %r2363, %r2362, %r2361; + fma.rn.ftz.f32 %r2364, %r2012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2365, %r2364; + fma.rm.ftz.f32 %r2366, %r2365, %r373, %r372; + add.f32 %r2367, %r2366, 0fCB40007F; + neg.f32 %r2368, %r2367; + fma.rn.ftz.f32 %r2369, %r2012, %r377, %r2368; + fma.rn.ftz.f32 %r2370, %r2012, %r379, %r2369; + shl.b32 %r2371, %r2366, 23; + ex2.approx.ftz.f32 %r2372, %r2370; + mul.f32 %r2373, %r2372, %r2371; + fma.rn.ftz.f32 %r2374, %r2013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2375, %r2374; + fma.rm.ftz.f32 %r2376, %r2375, %r373, %r372; + add.f32 %r2377, %r2376, 0fCB40007F; + neg.f32 %r2378, %r2377; + fma.rn.ftz.f32 %r2379, %r2013, %r377, %r2378; + fma.rn.ftz.f32 %r2380, %r2013, %r379, %r2379; + shl.b32 %r2381, %r2376, 23; + ex2.approx.ftz.f32 %r2382, %r2380; + mul.f32 %r2383, %r2382, %r2381; + fma.rn.ftz.f32 %r2384, %r2014, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2385, %r2384; + fma.rm.ftz.f32 %r2386, %r2385, %r373, %r372; + add.f32 %r2387, %r2386, 0fCB40007F; + neg.f32 %r2388, %r2387; + fma.rn.ftz.f32 %r2389, %r2014, %r377, %r2388; + fma.rn.ftz.f32 %r2390, %r2014, %r379, %r2389; + shl.b32 %r2391, %r2386, 23; + ex2.approx.ftz.f32 %r2392, %r2390; + mul.f32 %r2393, %r2392, %r2391; + fma.rn.ftz.f32 %r2394, %r2015, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2395, %r2394; + fma.rm.ftz.f32 %r2396, %r2395, %r373, %r372; + add.f32 %r2397, %r2396, 0fCB40007F; + neg.f32 %r2398, %r2397; + fma.rn.ftz.f32 %r2399, %r2015, %r377, %r2398; + fma.rn.ftz.f32 %r2400, %r2015, %r379, %r2399; + shl.b32 %r2401, %r2396, 23; + ex2.approx.ftz.f32 %r2402, %r2400; + mul.f32 %r2403, %r2402, %r2401; + fma.rn.ftz.f32 %r2404, %r2016, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2405, %r2404; + fma.rm.ftz.f32 %r2406, %r2405, %r373, %r372; + add.f32 %r2407, %r2406, 0fCB40007F; + neg.f32 %r2408, %r2407; + fma.rn.ftz.f32 %r2409, %r2016, %r377, %r2408; + fma.rn.ftz.f32 %r2410, %r2016, %r379, %r2409; + shl.b32 %r2411, %r2406, 23; + ex2.approx.ftz.f32 %r2412, %r2410; + mul.f32 %r2413, %r2412, %r2411; + fma.rn.ftz.f32 %r2414, %r2017, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2415, %r2414; + fma.rm.ftz.f32 %r2416, %r2415, %r373, %r372; + add.f32 %r2417, %r2416, 0fCB40007F; + neg.f32 %r2418, %r2417; + fma.rn.ftz.f32 %r2419, %r2017, %r377, %r2418; + fma.rn.ftz.f32 %r2420, %r2017, %r379, %r2419; + shl.b32 %r2421, %r2416, 23; + ex2.approx.ftz.f32 %r2422, %r2420; + mul.f32 %r2423, %r2422, %r2421; + fma.rn.ftz.f32 %r2424, %r2018, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2425, %r2424; + fma.rm.ftz.f32 %r2426, %r2425, %r373, %r372; + add.f32 %r2427, %r2426, 0fCB40007F; + neg.f32 %r2428, %r2427; + fma.rn.ftz.f32 %r2429, %r2018, %r377, %r2428; + fma.rn.ftz.f32 %r2430, %r2018, %r379, %r2429; + shl.b32 %r2431, %r2426, 23; + ex2.approx.ftz.f32 %r2432, %r2430; + mul.f32 %r2433, %r2432, %r2431; + fma.rn.ftz.f32 %r2434, %r2019, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2435, %r2434; + fma.rm.ftz.f32 %r2436, %r2435, %r373, %r372; + add.f32 %r2437, %r2436, 0fCB40007F; + neg.f32 %r2438, %r2437; + fma.rn.ftz.f32 %r2439, %r2019, %r377, %r2438; + fma.rn.ftz.f32 %r2440, %r2019, %r379, %r2439; + shl.b32 %r2441, %r2436, 23; + ex2.approx.ftz.f32 %r2442, %r2440; + mul.f32 %r2443, %r2442, %r2441; + fma.rn.ftz.f32 %r2444, %r2020, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2445, %r2444; + fma.rm.ftz.f32 %r2446, %r2445, %r373, %r372; + add.f32 %r2447, %r2446, 0fCB40007F; + neg.f32 %r2448, %r2447; + fma.rn.ftz.f32 %r2449, %r2020, %r377, %r2448; + fma.rn.ftz.f32 %r2450, %r2020, %r379, %r2449; + shl.b32 %r2451, %r2446, 23; + ex2.approx.ftz.f32 %r2452, %r2450; + mul.f32 %r2453, %r2452, %r2451; + fma.rn.ftz.f32 %r2454, %r2021, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2455, %r2454; + fma.rm.ftz.f32 %r2456, %r2455, %r373, %r372; + add.f32 %r2457, %r2456, 0fCB40007F; + neg.f32 %r2458, %r2457; + fma.rn.ftz.f32 %r2459, %r2021, %r377, %r2458; + fma.rn.ftz.f32 %r2460, %r2021, %r379, %r2459; + shl.b32 %r2461, %r2456, 23; + ex2.approx.ftz.f32 %r2462, %r2460; + mul.f32 %r2463, %r2462, %r2461; + fma.rn.ftz.f32 %r2464, %r2022, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2465, %r2464; + fma.rm.ftz.f32 %r2466, %r2465, %r373, %r372; + add.f32 %r2467, %r2466, 0fCB40007F; + neg.f32 %r2468, %r2467; + fma.rn.ftz.f32 %r2469, %r2022, %r377, %r2468; + fma.rn.ftz.f32 %r2470, %r2022, %r379, %r2469; + shl.b32 %r2471, %r2466, 23; + ex2.approx.ftz.f32 %r2472, %r2470; + mul.f32 %r2473, %r2472, %r2471; + fma.rn.ftz.f32 %r2474, %r2023, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2475, %r2474; + fma.rm.ftz.f32 %r2476, %r2475, %r373, %r372; + add.f32 %r2477, %r2476, 0fCB40007F; + neg.f32 %r2478, %r2477; + fma.rn.ftz.f32 %r2479, %r2023, %r377, %r2478; + fma.rn.ftz.f32 %r2480, %r2023, %r379, %r2479; + shl.b32 %r2481, %r2476, 23; + ex2.approx.ftz.f32 %r2482, %r2480; + mul.f32 %r2483, %r2482, %r2481; + fma.rn.ftz.f32 %r2484, %r2024, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2485, %r2484; + fma.rm.ftz.f32 %r2486, %r2485, %r373, %r372; + add.f32 %r2487, %r2486, 0fCB40007F; + neg.f32 %r2488, %r2487; + fma.rn.ftz.f32 %r2489, %r2024, %r377, %r2488; + fma.rn.ftz.f32 %r2490, %r2024, %r379, %r2489; + shl.b32 %r2491, %r2486, 23; + ex2.approx.ftz.f32 %r2492, %r2490; + mul.f32 %r2493, %r2492, %r2491; + fma.rn.ftz.f32 %r2494, %r2025, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2495, %r2494; + fma.rm.ftz.f32 %r2496, %r2495, %r373, %r372; + add.f32 %r2497, %r2496, 0fCB40007F; + neg.f32 %r2498, %r2497; + fma.rn.ftz.f32 %r2499, %r2025, %r377, %r2498; + fma.rn.ftz.f32 %r2500, %r2025, %r379, %r2499; + shl.b32 %r2501, %r2496, 23; + ex2.approx.ftz.f32 %r2502, %r2500; + mul.f32 %r2503, %r2502, %r2501; + fma.rn.ftz.f32 %r2504, %r2026, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2505, %r2504; + fma.rm.ftz.f32 %r2506, %r2505, %r373, %r372; + add.f32 %r2507, %r2506, 0fCB40007F; + neg.f32 %r2508, %r2507; + fma.rn.ftz.f32 %r2509, %r2026, %r377, %r2508; + fma.rn.ftz.f32 %r2510, %r2026, %r379, %r2509; + shl.b32 %r2511, %r2506, 23; + ex2.approx.ftz.f32 %r2512, %r2510; + mul.f32 %r2513, %r2512, %r2511; + fma.rn.ftz.f32 %r2514, %r2027, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2515, %r2514; + fma.rm.ftz.f32 %r2516, %r2515, %r373, %r372; + add.f32 %r2517, %r2516, 0fCB40007F; + neg.f32 %r2518, %r2517; + fma.rn.ftz.f32 %r2519, %r2027, %r377, %r2518; + fma.rn.ftz.f32 %r2520, %r2027, %r379, %r2519; + shl.b32 %r2521, %r2516, 23; + ex2.approx.ftz.f32 %r2522, %r2520; + mul.f32 %r2523, %r2522, %r2521; + fma.rn.ftz.f32 %r2524, %r2028, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2525, %r2524; + fma.rm.ftz.f32 %r2526, %r2525, %r373, %r372; + add.f32 %r2527, %r2526, 0fCB40007F; + neg.f32 %r2528, %r2527; + fma.rn.ftz.f32 %r2529, %r2028, %r377, %r2528; + fma.rn.ftz.f32 %r2530, %r2028, %r379, %r2529; + shl.b32 %r2531, %r2526, 23; + ex2.approx.ftz.f32 %r2532, %r2530; + mul.f32 %r2533, %r2532, %r2531; + fma.rn.ftz.f32 %r2534, %r2029, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2535, %r2534; + fma.rm.ftz.f32 %r2536, %r2535, %r373, %r372; + add.f32 %r2537, %r2536, 0fCB40007F; + neg.f32 %r2538, %r2537; + fma.rn.ftz.f32 %r2539, %r2029, %r377, %r2538; + fma.rn.ftz.f32 %r2540, %r2029, %r379, %r2539; + shl.b32 %r2541, %r2536, 23; + ex2.approx.ftz.f32 %r2542, %r2540; + mul.f32 %r2543, %r2542, %r2541; + fma.rn.ftz.f32 %r2544, %r2030, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2545, %r2544; + fma.rm.ftz.f32 %r2546, %r2545, %r373, %r372; + add.f32 %r2547, %r2546, 0fCB40007F; + neg.f32 %r2548, %r2547; + fma.rn.ftz.f32 %r2549, %r2030, %r377, %r2548; + fma.rn.ftz.f32 %r2550, %r2030, %r379, %r2549; + shl.b32 %r2551, %r2546, 23; + ex2.approx.ftz.f32 %r2552, %r2550; + mul.f32 %r2553, %r2552, %r2551; + fma.rn.ftz.f32 %r2554, %r2031, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2555, %r2554; + fma.rm.ftz.f32 %r2556, %r2555, %r373, %r372; + add.f32 %r2557, %r2556, 0fCB40007F; + neg.f32 %r2558, %r2557; + fma.rn.ftz.f32 %r2559, %r2031, %r377, %r2558; + fma.rn.ftz.f32 %r2560, %r2031, %r379, %r2559; + shl.b32 %r2561, %r2556, 23; + ex2.approx.ftz.f32 %r2562, %r2560; + mul.f32 %r2563, %r2562, %r2561; + fma.rn.ftz.f32 %r2564, %r2032, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2565, %r2564; + fma.rm.ftz.f32 %r2566, %r2565, %r373, %r372; + add.f32 %r2567, %r2566, 0fCB40007F; + neg.f32 %r2568, %r2567; + fma.rn.ftz.f32 %r2569, %r2032, %r377, %r2568; + fma.rn.ftz.f32 %r2570, %r2032, %r379, %r2569; + shl.b32 %r2571, %r2566, 23; + ex2.approx.ftz.f32 %r2572, %r2570; + mul.f32 %r2573, %r2572, %r2571; + fma.rn.ftz.f32 %r2574, %r2033, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2575, %r2574; + fma.rm.ftz.f32 %r2576, %r2575, %r373, %r372; + add.f32 %r2577, %r2576, 0fCB40007F; + neg.f32 %r2578, %r2577; + fma.rn.ftz.f32 %r2579, %r2033, %r377, %r2578; + fma.rn.ftz.f32 %r2580, %r2033, %r379, %r2579; + shl.b32 %r2581, %r2576, 23; + ex2.approx.ftz.f32 %r2582, %r2580; + mul.f32 %r2583, %r2582, %r2581; + fma.rn.ftz.f32 %r2584, %r2034, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2585, %r2584; + fma.rm.ftz.f32 %r2586, %r2585, %r373, %r372; + add.f32 %r2587, %r2586, 0fCB40007F; + neg.f32 %r2588, %r2587; + fma.rn.ftz.f32 %r2589, %r2034, %r377, %r2588; + fma.rn.ftz.f32 %r2590, %r2034, %r379, %r2589; + shl.b32 %r2591, %r2586, 23; + ex2.approx.ftz.f32 %r2592, %r2590; + mul.f32 %r2593, %r2592, %r2591; + fma.rn.ftz.f32 %r2594, %r2035, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2595, %r2594; + fma.rm.ftz.f32 %r2596, %r2595, %r373, %r372; + add.f32 %r2597, %r2596, 0fCB40007F; + neg.f32 %r2598, %r2597; + fma.rn.ftz.f32 %r2599, %r2035, %r377, %r2598; + fma.rn.ftz.f32 %r2600, %r2035, %r379, %r2599; + shl.b32 %r2601, %r2596, 23; + ex2.approx.ftz.f32 %r2602, %r2600; + mul.f32 %r2603, %r2602, %r2601; + fma.rn.ftz.f32 %r2604, %r2036, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2605, %r2604; + fma.rm.ftz.f32 %r2606, %r2605, %r373, %r372; + add.f32 %r2607, %r2606, 0fCB40007F; + neg.f32 %r2608, %r2607; + fma.rn.ftz.f32 %r2609, %r2036, %r377, %r2608; + fma.rn.ftz.f32 %r2610, %r2036, %r379, %r2609; + shl.b32 %r2611, %r2606, 23; + ex2.approx.ftz.f32 %r2612, %r2610; + mul.f32 %r2613, %r2612, %r2611; + fma.rn.ftz.f32 %r2614, %r2037, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2615, %r2614; + fma.rm.ftz.f32 %r2616, %r2615, %r373, %r372; + add.f32 %r2617, %r2616, 0fCB40007F; + neg.f32 %r2618, %r2617; + fma.rn.ftz.f32 %r2619, %r2037, %r377, %r2618; + fma.rn.ftz.f32 %r2620, %r2037, %r379, %r2619; + shl.b32 %r2621, %r2616, 23; + ex2.approx.ftz.f32 %r2622, %r2620; + mul.f32 %r2623, %r2622, %r2621; + fma.rn.ftz.f32 %r2624, %r2038, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2625, %r2624; + fma.rm.ftz.f32 %r2626, %r2625, %r373, %r372; + fma.rn.ftz.f32 %r2627, %r2039, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2628, %r2627; + fma.rm.ftz.f32 %r2629, %r2628, %r373, %r372; + fma.rn.ftz.f32 %r2630, %r2040, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2631, %r2630; + fma.rm.ftz.f32 %r2632, %r2631, %r373, %r372; + fma.rn.ftz.f32 %r2633, %r2041, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2634, %r2633; + fma.rm.ftz.f32 %r2635, %r2634, %r373, %r372; + fma.rn.ftz.f32 %r2636, %r2042, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2637, %r2636; + fma.rm.ftz.f32 %r2638, %r2637, %r373, %r372; + fma.rn.ftz.f32 %r2639, %r2043, %r369, %r368; + cvt.ftz.sat.f32.f32 %r2640, %r2639; + fma.rm.ftz.f32 %r2641, %r2640, %r373, %r372; + .loc 2 181 31 // triton_helpers.py:181:31 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mul.f32 %r2642, %r1637, %r2063; + mov.b32 %r2643, -8323200; +$L__tmp4: + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.bf16x2 %p242|%p243, %r58, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 %rd67, {%r956, %r959}; + cvt.u32.u64 %r2644, %rd67; + add.f32 %r2645, %r956, 0fCB40007F; + neg.f32 %r2646, %r2645; + fma.rn.ftz.f32 %r2647, %r362, %r377, %r2646; + fma.rn.ftz.f32 %r2648, %r362, %r379, %r2647; + ex2.approx.ftz.f32 %r2649, %r2648; + add.f32 %r2650, %r959, 0fCB40007F; + neg.f32 %r2651, %r2650; + fma.rn.ftz.f32 %r2652, %r363, %r377, %r2651; + fma.rn.ftz.f32 %r2653, %r363, %r379, %r2652; + shl.b64 %rd68, %rd67, 23; + and.b64 %rd69, %rd68, -36028797018963968; + shl.b32 %r2654, %r2644, 23; + cvt.u64.u32 %rd70, %r2654; + or.b64 %rd71, %rd70, %rd69; + ex2.approx.ftz.f32 %r2655, %r2653; + mov.b64 {%r2656, %r2657}, %rd71; + mul.f32 %r2658, %r2649, %r2656; + mul.f32 %r2659, %r2655, %r2657; + mov.b64 %rd72, {%r1618, %r1621}; + cvt.u32.u64 %r2660, %rd72; + add.f32 %r2661, %r1618, 0fCB40007F; + neg.f32 %r2662, %r2661; + fma.rn.ftz.f32 %r2663, %r1030, %r377, %r2662; + fma.rn.ftz.f32 %r2664, %r1030, %r379, %r2663; + ex2.approx.ftz.f32 %r2665, %r2664; + add.f32 %r2666, %r1621, 0fCB40007F; + neg.f32 %r2667, %r2666; + fma.rn.ftz.f32 %r2668, %r1031, %r377, %r2667; + fma.rn.ftz.f32 %r2669, %r1031, %r379, %r2668; + shl.b64 %rd73, %rd72, 23; + and.b64 %rd74, %rd73, -36028797018963968; + shl.b32 %r2670, %r2660, 23; + cvt.u64.u32 %rd75, %r2670; + or.b64 %rd76, %rd75, %rd74; + ex2.approx.ftz.f32 %r2671, %r2669; + mov.b64 {%r2672, %r2673}, %rd76; + mul.f32 %r2674, %r2665, %r2672; + mul.f32 %r2675, %r2671, %r2673; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.f32 %r2676, %r2659, 0f00000000, %r2675; + fma.rn.f32 %r2677, %r2658, 0f00000000, %r2674; + selp.f32 %r2678, 0f3F800000, %r2677, %p242; + selp.f32 %r2679, 0f3F800000, %r2676, %p243; +$L__tmp5: + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r2680, %r2679, 0f00000000, %p8; + selp.f32 %r2681, %r2678, 0f00000000, %p8; +$L__tmp6: + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd77, {%r2626, %r2629}; + cvt.u32.u64 %r2682, %rd77; + add.f32 %r2683, %r2626, 0fCB40007F; + neg.f32 %r2684, %r2683; + fma.rn.ftz.f32 %r2685, %r2038, %r377, %r2684; + fma.rn.ftz.f32 %r2686, %r2038, %r379, %r2685; + ex2.approx.ftz.f32 %r2687, %r2686; + add.f32 %r2688, %r2629, 0fCB40007F; + neg.f32 %r2689, %r2688; + fma.rn.ftz.f32 %r2690, %r2039, %r377, %r2689; + fma.rn.ftz.f32 %r2691, %r2039, %r379, %r2690; + shl.b64 %rd78, %rd77, 23; + and.b64 %rd79, %rd78, -36028797018963968; + shl.b32 %r2692, %r2682, 23; + cvt.u64.u32 %rd80, %r2692; + or.b64 %rd81, %rd80, %rd79; + ex2.approx.ftz.f32 %r2693, %r2691; + mov.b64 {%r2694, %r2695}, %rd81; + mul.f32 %r2696, %r2693, %r2695; + mul.f32 %r2697, %r2687, %r2694; +$L__tmp7: + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.bf16x2 %p244|%p245, %r59, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 %rd82, {%r962, %r965}; + cvt.u32.u64 %r2698, %rd82; + add.f32 %r2699, %r962, 0fCB40007F; + neg.f32 %r2700, %r2699; + fma.rn.ftz.f32 %r2701, %r364, %r377, %r2700; + fma.rn.ftz.f32 %r2702, %r364, %r379, %r2701; + ex2.approx.ftz.f32 %r2703, %r2702; + add.f32 %r2704, %r965, 0fCB40007F; + neg.f32 %r2705, %r2704; + fma.rn.ftz.f32 %r2706, %r365, %r377, %r2705; + fma.rn.ftz.f32 %r2707, %r365, %r379, %r2706; + shl.b64 %rd83, %rd82, 23; + and.b64 %rd84, %rd83, -36028797018963968; + shl.b32 %r2708, %r2698, 23; + cvt.u64.u32 %rd85, %r2708; + or.b64 %rd86, %rd85, %rd84; + ex2.approx.ftz.f32 %r2709, %r2707; + mov.b64 {%r2710, %r2711}, %rd86; + mul.f32 %r2712, %r2703, %r2710; + mul.f32 %r2713, %r2709, %r2711; + mov.b64 %rd87, {%r1624, %r1627}; + cvt.u32.u64 %r2714, %rd87; + add.f32 %r2715, %r1624, 0fCB40007F; + neg.f32 %r2716, %r2715; + fma.rn.ftz.f32 %r2717, %r1032, %r377, %r2716; + fma.rn.ftz.f32 %r2718, %r1032, %r379, %r2717; + ex2.approx.ftz.f32 %r2719, %r2718; + add.f32 %r2720, %r1627, 0fCB40007F; + neg.f32 %r2721, %r2720; + fma.rn.ftz.f32 %r2722, %r1033, %r377, %r2721; + fma.rn.ftz.f32 %r2723, %r1033, %r379, %r2722; + shl.b64 %rd88, %rd87, 23; + and.b64 %rd89, %rd88, -36028797018963968; + shl.b32 %r2724, %r2714, 23; + cvt.u64.u32 %rd90, %r2724; + or.b64 %rd91, %rd90, %rd89; + ex2.approx.ftz.f32 %r2725, %r2723; + mov.b64 {%r2726, %r2727}, %rd91; + mul.f32 %r2728, %r2719, %r2726; + mul.f32 %r2729, %r2725, %r2727; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.f32 %r2730, %r2713, 0f00000000, %r2729; + fma.rn.f32 %r2731, %r2712, 0f00000000, %r2728; + selp.f32 %r2732, 0f3F800000, %r2731, %p244; + selp.f32 %r2733, 0f3F800000, %r2730, %p245; +$L__tmp8: + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r2734, %r2733, 0f00000000, %p8; + selp.f32 %r2735, %r2732, 0f00000000, %p8; +$L__tmp9: + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd92, {%r2632, %r2635}; + cvt.u32.u64 %r2736, %rd92; + add.f32 %r2737, %r2632, 0fCB40007F; + neg.f32 %r2738, %r2737; + fma.rn.ftz.f32 %r2739, %r2040, %r377, %r2738; + fma.rn.ftz.f32 %r2740, %r2040, %r379, %r2739; + ex2.approx.ftz.f32 %r2741, %r2740; + add.f32 %r2742, %r2635, 0fCB40007F; + neg.f32 %r2743, %r2742; + fma.rn.ftz.f32 %r2744, %r2041, %r377, %r2743; + fma.rn.ftz.f32 %r2745, %r2041, %r379, %r2744; + shl.b64 %rd93, %rd92, 23; + and.b64 %rd94, %rd93, -36028797018963968; + shl.b32 %r2746, %r2736, 23; + cvt.u64.u32 %rd95, %r2746; + or.b64 %rd96, %rd95, %rd94; + ex2.approx.ftz.f32 %r2747, %r2745; + mov.b64 {%r2748, %r2749}, %rd96; + mul.f32 %r2750, %r2747, %r2749; + mul.f32 %r2751, %r2741, %r2748; +$L__tmp10: + .loc 2 196 19 // triton_helpers.py:196:19 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + setp.eq.bf16x2 %p246|%p247, %r60, %r2643; + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + mov.b64 %rd97, {%r968, %r971}; + cvt.u32.u64 %r2752, %rd97; + add.f32 %r2753, %r968, 0fCB40007F; + neg.f32 %r2754, %r2753; + fma.rn.ftz.f32 %r2755, %r366, %r377, %r2754; + fma.rn.ftz.f32 %r2756, %r366, %r379, %r2755; + ex2.approx.ftz.f32 %r2757, %r2756; + add.f32 %r2758, %r971, 0fCB40007F; + neg.f32 %r2759, %r2758; + fma.rn.ftz.f32 %r2760, %r367, %r377, %r2759; + fma.rn.ftz.f32 %r2761, %r367, %r379, %r2760; + shl.b64 %rd98, %rd97, 23; + and.b64 %rd99, %rd98, -36028797018963968; + shl.b32 %r2762, %r2752, 23; + cvt.u64.u32 %rd100, %r2762; + or.b64 %rd101, %rd100, %rd99; + ex2.approx.ftz.f32 %r2763, %r2761; + mov.b64 {%r2764, %r2765}, %rd101; + mul.f32 %r2766, %r2757, %r2764; + mul.f32 %r2767, %r2763, %r2765; + mov.b64 %rd102, {%r1630, %r1633}; + cvt.u32.u64 %r2768, %rd102; + add.f32 %r2769, %r1630, 0fCB40007F; + neg.f32 %r2770, %r2769; + fma.rn.ftz.f32 %r2771, %r1034, %r377, %r2770; + fma.rn.ftz.f32 %r2772, %r1034, %r379, %r2771; + ex2.approx.ftz.f32 %r2773, %r2772; + add.f32 %r2774, %r1633, 0fCB40007F; + neg.f32 %r2775, %r2774; + fma.rn.ftz.f32 %r2776, %r1035, %r377, %r2775; + fma.rn.ftz.f32 %r2777, %r1035, %r379, %r2776; + shl.b64 %rd103, %rd102, 23; + and.b64 %rd104, %rd103, -36028797018963968; + shl.b32 %r2778, %r2768, 23; + cvt.u64.u32 %rd105, %r2778; + or.b64 %rd106, %rd105, %rd104; + ex2.approx.ftz.f32 %r2779, %r2777; + mov.b64 {%r2780, %r2781}, %rd106; + mul.f32 %r2782, %r2773, %r2780; + mul.f32 %r2783, %r2779, %r2781; + .loc 2 205 36 // triton_helpers.py:205:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:42:40 ] + fma.rn.f32 %r2784, %r2767, 0f00000000, %r2783; + fma.rn.f32 %r2785, %r2766, 0f00000000, %r2782; + selp.f32 %r2786, 0f3F800000, %r2785, %p246; + selp.f32 %r2787, 0f3F800000, %r2784, %p247; +$L__tmp11: + .loc 1 46 54 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:46:54 + selp.f32 %r2788, %r2787, 0f00000000, %p8; + selp.f32 %r2789, %r2786, 0f00000000, %p8; +$L__tmp12: + .loc 2 173 29 // triton_helpers.py:173:29 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + mov.b64 %rd107, {%r2638, %r2641}; + cvt.u32.u64 %r2790, %rd107; + add.f32 %r2791, %r2638, 0fCB40007F; + neg.f32 %r2792, %r2791; + fma.rn.ftz.f32 %r2793, %r2042, %r377, %r2792; + fma.rn.ftz.f32 %r2794, %r2042, %r379, %r2793; + ex2.approx.ftz.f32 %r2795, %r2794; + add.f32 %r2796, %r2641, 0fCB40007F; + neg.f32 %r2797, %r2796; + fma.rn.ftz.f32 %r2798, %r2043, %r377, %r2797; + fma.rn.ftz.f32 %r2799, %r2043, %r379, %r2798; + shl.b64 %rd108, %rd107, 23; + and.b64 %rd109, %rd108, -36028797018963968; + shl.b32 %r2800, %r2790, 23; + cvt.u64.u32 %rd110, %r2800; + or.b64 %rd111, %rd110, %rd109; + ex2.approx.ftz.f32 %r2801, %r2799; + mov.b64 {%r2802, %r2803}, %rd111; + mul.f32 %r2804, %r2801, %r2803; + mul.f32 %r2805, %r2795, %r2802; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + fma.rn.f32 %r2806, %r1635, %r2053, %r2642; + fma.rn.f32 %r2807, %r1639, %r2073, %r2806; + fma.rn.f32 %r2808, %r1641, %r2083, %r2807; + fma.rn.f32 %r2809, %r1643, %r2093, %r2808; + fma.rn.f32 %r2810, %r1645, %r2103, %r2809; + fma.rn.f32 %r2811, %r1647, %r2113, %r2810; + fma.rn.f32 %r2812, %r1649, %r2123, %r2811; + fma.rn.f32 %r2813, %r1651, %r2133, %r2812; + fma.rn.f32 %r2814, %r1653, %r2143, %r2813; + fma.rn.f32 %r2815, %r1655, %r2153, %r2814; + fma.rn.f32 %r2816, %r1657, %r2163, %r2815; + fma.rn.f32 %r2817, %r1659, %r2173, %r2816; + fma.rn.f32 %r2818, %r1661, %r2183, %r2817; + fma.rn.f32 %r2819, %r1663, %r2193, %r2818; + fma.rn.f32 %r2820, %r1665, %r2203, %r2819; + fma.rn.f32 %r2821, %r1667, %r2213, %r2820; + fma.rn.f32 %r2822, %r1669, %r2223, %r2821; + fma.rn.f32 %r2823, %r1671, %r2233, %r2822; + fma.rn.f32 %r2824, %r1673, %r2243, %r2823; + fma.rn.f32 %r2825, %r1675, %r2253, %r2824; + fma.rn.f32 %r2826, %r1677, %r2263, %r2825; + fma.rn.f32 %r2827, %r1679, %r2273, %r2826; + fma.rn.f32 %r2828, %r1681, %r2283, %r2827; + fma.rn.f32 %r2829, %r1683, %r2293, %r2828; + fma.rn.f32 %r2830, %r1685, %r2303, %r2829; + fma.rn.f32 %r2831, %r1687, %r2313, %r2830; + fma.rn.f32 %r2832, %r1689, %r2323, %r2831; + fma.rn.f32 %r2833, %r1691, %r2333, %r2832; + fma.rn.f32 %r2834, %r1693, %r2343, %r2833; + fma.rn.f32 %r2835, %r1695, %r2353, %r2834; + fma.rn.f32 %r2836, %r1697, %r2363, %r2835; + fma.rn.f32 %r2837, %r1699, %r2373, %r2836; + fma.rn.f32 %r2838, %r1701, %r2383, %r2837; + fma.rn.f32 %r2839, %r1703, %r2393, %r2838; + fma.rn.f32 %r2840, %r1705, %r2403, %r2839; + fma.rn.f32 %r2841, %r1707, %r2413, %r2840; + fma.rn.f32 %r2842, %r1709, %r2423, %r2841; + fma.rn.f32 %r2843, %r1711, %r2433, %r2842; + fma.rn.f32 %r2844, %r1713, %r2443, %r2843; + fma.rn.f32 %r2845, %r1715, %r2453, %r2844; + fma.rn.f32 %r2846, %r1717, %r2463, %r2845; + fma.rn.f32 %r2847, %r1719, %r2473, %r2846; + fma.rn.f32 %r2848, %r1721, %r2483, %r2847; + fma.rn.f32 %r2849, %r1723, %r2493, %r2848; + fma.rn.f32 %r2850, %r1725, %r2503, %r2849; + fma.rn.f32 %r2851, %r1727, %r2513, %r2850; + fma.rn.f32 %r2852, %r1729, %r2523, %r2851; + fma.rn.f32 %r2853, %r1731, %r2533, %r2852; + fma.rn.f32 %r2854, %r1733, %r2543, %r2853; + fma.rn.f32 %r2855, %r1735, %r2553, %r2854; + fma.rn.f32 %r2856, %r1737, %r2563, %r2855; + fma.rn.f32 %r2857, %r1739, %r2573, %r2856; + fma.rn.f32 %r2858, %r1741, %r2583, %r2857; + fma.rn.f32 %r2859, %r1743, %r2593, %r2858; + fma.rn.f32 %r2860, %r1745, %r2603, %r2859; + fma.rn.f32 %r2861, %r1758, %r2613, %r2860; + fma.rn.f32 %r2862, %r1759, %r2623, %r2861; + fma.rn.f32 %r2863, %r2681, %r2697, %r2862; + fma.rn.f32 %r2864, %r2680, %r2696, %r2863; + fma.rn.f32 %r2865, %r2735, %r2751, %r2864; + fma.rn.f32 %r2866, %r2734, %r2750, %r2865; + fma.rn.f32 %r2867, %r2789, %r2805, %r2866; + fma.rn.f32 %r2868, %r2788, %r2804, %r2867; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2869, %r2868, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2870, %r2868, %r2869; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2871, %r2870, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2872, %r2870, %r2871; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2873, %r2872, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2874, %r2872, %r2873; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2875, %r2874, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2876, %r2874, %r2875; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2877, %r2876, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r72, %r2876, %r2877; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p9 st.shared.b32 [ %r65 + 0 ], %r72; + // end inline asm + bar.sync 0; + // begin inline asm + @%p10 ld.shared.b32 %r73, [ %r68 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r2878, %r73, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2879, %r73, %r2878; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2880, %r2879, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2881, %r2879, %r2880; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2882, %r2881, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r2883, %r2881, %r2882; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + shfl.sync.bfly.b32 %r2884, %r2883, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + add.f32 %r76, %r2883, %r2884; + .loc 3 291 36 // standard.py:291:36 @[ cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:49:33 ] + // begin inline asm + @%p11 st.shared.b32 [ %r68 + 0 ], %r76; + // end inline asm + bar.sync 0; + ld.shared.b32 %r2885, [global_smem]; +$L__tmp13: + .loc 1 58 52 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:52 + // begin inline asm + mov.u64 %rd27, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd27, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r77, %r5; + mov.u32 %r78, %r5; + mov.u32 %r79, %r5; + mov.u32 %r80, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r77, %r78, %r79, %r80 }, [ %rd2 + 0 ], %rd27; + // end inline asm + mov.b32 {%rs66, %rs67}, %r77; + mov.b32 {%rs68, %rs69}, %r78; + mov.b32 {%rs70, %rs71}, %r79; + mov.b32 {%rs72, %rs73}, %r80; + // begin inline asm + mov.u64 %rd30, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd30, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r85, %r5; + mov.u32 %r86, %r5; + mov.u32 %r87, %r5; + mov.u32 %r88, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r85, %r86, %r87, %r88 }, [ %rd5 + 0 ], %rd30; + // end inline asm + mov.b32 {%rs74, %rs75}, %r85; + mov.b32 {%rs76, %rs77}, %r86; + mov.b32 {%rs78, %rs79}, %r87; + mov.b32 {%rs80, %rs81}, %r88; + // begin inline asm + mov.u64 %rd33, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd33, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r93, %r5; + mov.u32 %r94, %r5; + mov.u32 %r95, %r5; + mov.u32 %r96, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r93, %r94, %r95, %r96 }, [ %rd8 + 0 ], %rd33; + // end inline asm + mov.b32 {%rs82, %rs83}, %r93; + mov.b32 {%rs84, %rs85}, %r94; + mov.b32 {%rs86, %rs87}, %r95; + mov.b32 {%rs88, %rs89}, %r96; + // begin inline asm + mov.u64 %rd36, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd36, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r101, %r5; + mov.u32 %r102, %r5; + mov.u32 %r103, %r5; + mov.u32 %r104, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r101, %r102, %r103, %r104 }, [ %rd11 + 0 ], %rd36; + // end inline asm + mov.b32 {%rs90, %rs91}, %r101; + mov.b32 {%rs92, %rs93}, %r102; + mov.b32 {%rs94, %rs95}, %r103; + mov.b32 {%rs96, %rs97}, %r104; + // begin inline asm + mov.u64 %rd39, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd39, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r109, %r5; + mov.u32 %r110, %r5; + mov.u32 %r111, %r5; + mov.u32 %r112, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r109, %r110, %r111, %r112 }, [ %rd14 + 0 ], %rd39; + // end inline asm + mov.b32 {%rs98, %rs99}, %r109; + mov.b32 {%rs100, %rs101}, %r110; + mov.b32 {%rs102, %rs103}, %r111; + mov.b32 {%rs104, %rs105}, %r112; + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r117, %r5; + mov.u32 %r118, %r5; + mov.u32 %r119, %r5; + mov.u32 %r120, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r117, %r118, %r119, %r120 }, [ %rd17 + 0 ], %rd42; + // end inline asm + mov.b32 {%rs106, %rs107}, %r117; + mov.b32 {%rs108, %rs109}, %r118; + mov.b32 {%rs110, %rs111}, %r119; + mov.b32 {%rs112, %rs113}, %r120; + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r125, %r5; + mov.u32 %r126, %r5; + mov.u32 %r127, %r5; + mov.u32 %r128, %r5; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r125, %r126, %r127, %r128 }, [ %rd20 + 0 ], %rd45; + // end inline asm + mov.b32 {%rs114, %rs115}, %r125; + mov.b32 {%rs116, %rs117}, %r126; + mov.b32 {%rs118, %rs119}, %r127; + mov.b32 {%rs120, %rs121}, %r128; + // begin inline asm + mov.u64 %rd48, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd48, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r133, %r5; + mov.u32 %r134, %r5; + mov.u32 %r135, %r5; + mov.u32 %r136, %r5; + @%p8 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r133, %r134, %r135, %r136 }, [ %rd23 + 0 ], %rd48; + // end inline asm + mov.b32 {%rs122, %rs123}, %r133; + mov.b32 {%rs124, %rs125}, %r134; + mov.b32 {%rs126, %rs127}, %r135; + mov.b32 {%rs128, %rs129}, %r136; + .loc 1 58 106 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:58:106 + cvt.f32.bf16 %r2886, %rs66; + cvt.f32.bf16 %r2887, %rs67; + cvt.f32.bf16 %r2888, %rs68; + cvt.f32.bf16 %r2889, %rs69; + cvt.f32.bf16 %r2890, %rs70; + cvt.f32.bf16 %r2891, %rs71; + cvt.f32.bf16 %r2892, %rs72; + cvt.f32.bf16 %r2893, %rs73; + cvt.f32.bf16 %r2894, %rs74; + cvt.f32.bf16 %r2895, %rs75; + cvt.f32.bf16 %r2896, %rs76; + cvt.f32.bf16 %r2897, %rs77; + cvt.f32.bf16 %r2898, %rs78; + cvt.f32.bf16 %r2899, %rs79; + cvt.f32.bf16 %r2900, %rs80; + cvt.f32.bf16 %r2901, %rs81; + cvt.f32.bf16 %r2902, %rs82; + cvt.f32.bf16 %r2903, %rs83; + cvt.f32.bf16 %r2904, %rs84; + cvt.f32.bf16 %r2905, %rs85; + cvt.f32.bf16 %r2906, %rs86; + cvt.f32.bf16 %r2907, %rs87; + cvt.f32.bf16 %r2908, %rs88; + cvt.f32.bf16 %r2909, %rs89; + cvt.f32.bf16 %r2910, %rs90; + cvt.f32.bf16 %r2911, %rs91; + cvt.f32.bf16 %r2912, %rs92; + cvt.f32.bf16 %r2913, %rs93; + cvt.f32.bf16 %r2914, %rs94; + cvt.f32.bf16 %r2915, %rs95; + cvt.f32.bf16 %r2916, %rs96; + cvt.f32.bf16 %r2917, %rs97; + cvt.f32.bf16 %r2918, %rs98; + cvt.f32.bf16 %r2919, %rs99; + cvt.f32.bf16 %r2920, %rs100; + cvt.f32.bf16 %r2921, %rs101; + cvt.f32.bf16 %r2922, %rs102; + cvt.f32.bf16 %r2923, %rs103; + cvt.f32.bf16 %r2924, %rs104; + cvt.f32.bf16 %r2925, %rs105; + cvt.f32.bf16 %r2926, %rs106; + cvt.f32.bf16 %r2927, %rs107; + cvt.f32.bf16 %r2928, %rs108; + cvt.f32.bf16 %r2929, %rs109; + cvt.f32.bf16 %r2930, %rs110; + cvt.f32.bf16 %r2931, %rs111; + cvt.f32.bf16 %r2932, %rs112; + cvt.f32.bf16 %r2933, %rs113; + cvt.f32.bf16 %r2934, %rs114; + cvt.f32.bf16 %r2935, %rs115; + cvt.f32.bf16 %r2936, %rs116; + cvt.f32.bf16 %r2937, %rs117; + cvt.f32.bf16 %r2938, %rs118; + cvt.f32.bf16 %r2939, %rs119; + cvt.f32.bf16 %r2940, %rs120; + cvt.f32.bf16 %r2941, %rs121; + cvt.f32.bf16 %r2942, %rs122; + cvt.f32.bf16 %r2943, %rs123; + cvt.f32.bf16 %r2944, %rs124; + cvt.f32.bf16 %r2945, %rs125; + cvt.f32.bf16 %r2946, %rs126; + cvt.f32.bf16 %r2947, %rs127; + cvt.f32.bf16 %r2948, %rs128; + cvt.f32.bf16 %r2949, %rs129; + .loc 1 60 22 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:60:22 + sub.f32 %r2950, %r2886, %r1915; + sub.f32 %r2951, %r2887, %r1915; + sub.f32 %r2952, %r2888, %r1915; + sub.f32 %r2953, %r2889, %r1915; + sub.f32 %r2954, %r2890, %r1915; + sub.f32 %r2955, %r2891, %r1915; + sub.f32 %r2956, %r2892, %r1915; + sub.f32 %r2957, %r2893, %r1915; + sub.f32 %r2958, %r2894, %r1915; + sub.f32 %r2959, %r2895, %r1915; + sub.f32 %r2960, %r2896, %r1915; + sub.f32 %r2961, %r2897, %r1915; + sub.f32 %r2962, %r2898, %r1915; + sub.f32 %r2963, %r2899, %r1915; + sub.f32 %r2964, %r2900, %r1915; + sub.f32 %r2965, %r2901, %r1915; + sub.f32 %r2966, %r2902, %r1915; + sub.f32 %r2967, %r2903, %r1915; + sub.f32 %r2968, %r2904, %r1915; + sub.f32 %r2969, %r2905, %r1915; + sub.f32 %r2970, %r2906, %r1915; + sub.f32 %r2971, %r2907, %r1915; + sub.f32 %r2972, %r2908, %r1915; + sub.f32 %r2973, %r2909, %r1915; + sub.f32 %r2974, %r2910, %r1915; + sub.f32 %r2975, %r2911, %r1915; + sub.f32 %r2976, %r2912, %r1915; + sub.f32 %r2977, %r2913, %r1915; + sub.f32 %r2978, %r2914, %r1915; + sub.f32 %r2979, %r2915, %r1915; + sub.f32 %r2980, %r2916, %r1915; + sub.f32 %r2981, %r2917, %r1915; + sub.f32 %r2982, %r2918, %r1915; + sub.f32 %r2983, %r2919, %r1915; + sub.f32 %r2984, %r2920, %r1915; + sub.f32 %r2985, %r2921, %r1915; + sub.f32 %r2986, %r2922, %r1915; + sub.f32 %r2987, %r2923, %r1915; + sub.f32 %r2988, %r2924, %r1915; + sub.f32 %r2989, %r2925, %r1915; + sub.f32 %r2990, %r2926, %r1915; + sub.f32 %r2991, %r2927, %r1915; + sub.f32 %r2992, %r2928, %r1915; + sub.f32 %r2993, %r2929, %r1915; + sub.f32 %r2994, %r2930, %r1915; + sub.f32 %r2995, %r2931, %r1915; + sub.f32 %r2996, %r2932, %r1915; + sub.f32 %r2997, %r2933, %r1915; + sub.f32 %r2998, %r2934, %r1915; + sub.f32 %r2999, %r2935, %r1915; + sub.f32 %r3000, %r2936, %r1915; + sub.f32 %r3001, %r2937, %r1915; + sub.f32 %r3002, %r2938, %r1915; + sub.f32 %r3003, %r2939, %r1915; + sub.f32 %r3004, %r2940, %r1915; + sub.f32 %r3005, %r2941, %r1915; + sub.f32 %r3006, %r2942, %r1915; + sub.f32 %r3007, %r2943, %r1915; + sub.f32 %r3008, %r2944, %r1915; + sub.f32 %r3009, %r2945, %r1915; + sub.f32 %r3010, %r2946, %r1915; + sub.f32 %r3011, %r2947, %r1915; + sub.f32 %r3012, %r2948, %r1915; + sub.f32 %r3013, %r2949, %r1915; + .loc 1 61 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:61:29 + fma.rn.ftz.f32 %r3014, %r2950, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3015, %r3014; + fma.rm.ftz.f32 %r3016, %r3015, %r373, %r372; + add.f32 %r3017, %r3016, 0fCB40007F; + neg.f32 %r3018, %r3017; + fma.rn.ftz.f32 %r3019, %r2950, %r377, %r3018; + fma.rn.ftz.f32 %r3020, %r2950, %r379, %r3019; + shl.b32 %r3021, %r3016, 23; + ex2.approx.ftz.f32 %r3022, %r3020; + mul.f32 %r3023, %r3022, %r3021; + fma.rn.ftz.f32 %r3024, %r2951, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3025, %r3024; + fma.rm.ftz.f32 %r3026, %r3025, %r373, %r372; + add.f32 %r3027, %r3026, 0fCB40007F; + neg.f32 %r3028, %r3027; + fma.rn.ftz.f32 %r3029, %r2951, %r377, %r3028; + fma.rn.ftz.f32 %r3030, %r2951, %r379, %r3029; + shl.b32 %r3031, %r3026, 23; + ex2.approx.ftz.f32 %r3032, %r3030; + mul.f32 %r3033, %r3032, %r3031; + fma.rn.ftz.f32 %r3034, %r2952, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3035, %r3034; + fma.rm.ftz.f32 %r3036, %r3035, %r373, %r372; + add.f32 %r3037, %r3036, 0fCB40007F; + neg.f32 %r3038, %r3037; + fma.rn.ftz.f32 %r3039, %r2952, %r377, %r3038; + fma.rn.ftz.f32 %r3040, %r2952, %r379, %r3039; + shl.b32 %r3041, %r3036, 23; + ex2.approx.ftz.f32 %r3042, %r3040; + mul.f32 %r3043, %r3042, %r3041; + fma.rn.ftz.f32 %r3044, %r2953, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3045, %r3044; + fma.rm.ftz.f32 %r3046, %r3045, %r373, %r372; + add.f32 %r3047, %r3046, 0fCB40007F; + neg.f32 %r3048, %r3047; + fma.rn.ftz.f32 %r3049, %r2953, %r377, %r3048; + fma.rn.ftz.f32 %r3050, %r2953, %r379, %r3049; + shl.b32 %r3051, %r3046, 23; + ex2.approx.ftz.f32 %r3052, %r3050; + mul.f32 %r3053, %r3052, %r3051; + fma.rn.ftz.f32 %r3054, %r2954, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3055, %r3054; + fma.rm.ftz.f32 %r3056, %r3055, %r373, %r372; + add.f32 %r3057, %r3056, 0fCB40007F; + neg.f32 %r3058, %r3057; + fma.rn.ftz.f32 %r3059, %r2954, %r377, %r3058; + fma.rn.ftz.f32 %r3060, %r2954, %r379, %r3059; + shl.b32 %r3061, %r3056, 23; + ex2.approx.ftz.f32 %r3062, %r3060; + mul.f32 %r3063, %r3062, %r3061; + fma.rn.ftz.f32 %r3064, %r2955, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3065, %r3064; + fma.rm.ftz.f32 %r3066, %r3065, %r373, %r372; + add.f32 %r3067, %r3066, 0fCB40007F; + neg.f32 %r3068, %r3067; + fma.rn.ftz.f32 %r3069, %r2955, %r377, %r3068; + fma.rn.ftz.f32 %r3070, %r2955, %r379, %r3069; + shl.b32 %r3071, %r3066, 23; + ex2.approx.ftz.f32 %r3072, %r3070; + mul.f32 %r3073, %r3072, %r3071; + fma.rn.ftz.f32 %r3074, %r2956, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3075, %r3074; + fma.rm.ftz.f32 %r3076, %r3075, %r373, %r372; + add.f32 %r3077, %r3076, 0fCB40007F; + neg.f32 %r3078, %r3077; + fma.rn.ftz.f32 %r3079, %r2956, %r377, %r3078; + fma.rn.ftz.f32 %r3080, %r2956, %r379, %r3079; + shl.b32 %r3081, %r3076, 23; + ex2.approx.ftz.f32 %r3082, %r3080; + mul.f32 %r3083, %r3082, %r3081; + fma.rn.ftz.f32 %r3084, %r2957, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3085, %r3084; + fma.rm.ftz.f32 %r3086, %r3085, %r373, %r372; + add.f32 %r3087, %r3086, 0fCB40007F; + neg.f32 %r3088, %r3087; + fma.rn.ftz.f32 %r3089, %r2957, %r377, %r3088; + fma.rn.ftz.f32 %r3090, %r2957, %r379, %r3089; + shl.b32 %r3091, %r3086, 23; + ex2.approx.ftz.f32 %r3092, %r3090; + mul.f32 %r3093, %r3092, %r3091; + fma.rn.ftz.f32 %r3094, %r2958, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3095, %r3094; + fma.rm.ftz.f32 %r3096, %r3095, %r373, %r372; + add.f32 %r3097, %r3096, 0fCB40007F; + neg.f32 %r3098, %r3097; + fma.rn.ftz.f32 %r3099, %r2958, %r377, %r3098; + fma.rn.ftz.f32 %r3100, %r2958, %r379, %r3099; + shl.b32 %r3101, %r3096, 23; + ex2.approx.ftz.f32 %r3102, %r3100; + mul.f32 %r3103, %r3102, %r3101; + fma.rn.ftz.f32 %r3104, %r2959, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3105, %r3104; + fma.rm.ftz.f32 %r3106, %r3105, %r373, %r372; + add.f32 %r3107, %r3106, 0fCB40007F; + neg.f32 %r3108, %r3107; + fma.rn.ftz.f32 %r3109, %r2959, %r377, %r3108; + fma.rn.ftz.f32 %r3110, %r2959, %r379, %r3109; + shl.b32 %r3111, %r3106, 23; + ex2.approx.ftz.f32 %r3112, %r3110; + mul.f32 %r3113, %r3112, %r3111; + fma.rn.ftz.f32 %r3114, %r2960, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3115, %r3114; + fma.rm.ftz.f32 %r3116, %r3115, %r373, %r372; + add.f32 %r3117, %r3116, 0fCB40007F; + neg.f32 %r3118, %r3117; + fma.rn.ftz.f32 %r3119, %r2960, %r377, %r3118; + fma.rn.ftz.f32 %r3120, %r2960, %r379, %r3119; + shl.b32 %r3121, %r3116, 23; + ex2.approx.ftz.f32 %r3122, %r3120; + mul.f32 %r3123, %r3122, %r3121; + fma.rn.ftz.f32 %r3124, %r2961, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3125, %r3124; + fma.rm.ftz.f32 %r3126, %r3125, %r373, %r372; + add.f32 %r3127, %r3126, 0fCB40007F; + neg.f32 %r3128, %r3127; + fma.rn.ftz.f32 %r3129, %r2961, %r377, %r3128; + fma.rn.ftz.f32 %r3130, %r2961, %r379, %r3129; + shl.b32 %r3131, %r3126, 23; + ex2.approx.ftz.f32 %r3132, %r3130; + mul.f32 %r3133, %r3132, %r3131; + fma.rn.ftz.f32 %r3134, %r2962, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3135, %r3134; + fma.rm.ftz.f32 %r3136, %r3135, %r373, %r372; + add.f32 %r3137, %r3136, 0fCB40007F; + neg.f32 %r3138, %r3137; + fma.rn.ftz.f32 %r3139, %r2962, %r377, %r3138; + fma.rn.ftz.f32 %r3140, %r2962, %r379, %r3139; + shl.b32 %r3141, %r3136, 23; + ex2.approx.ftz.f32 %r3142, %r3140; + mul.f32 %r3143, %r3142, %r3141; + fma.rn.ftz.f32 %r3144, %r2963, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3145, %r3144; + fma.rm.ftz.f32 %r3146, %r3145, %r373, %r372; + add.f32 %r3147, %r3146, 0fCB40007F; + neg.f32 %r3148, %r3147; + fma.rn.ftz.f32 %r3149, %r2963, %r377, %r3148; + fma.rn.ftz.f32 %r3150, %r2963, %r379, %r3149; + shl.b32 %r3151, %r3146, 23; + ex2.approx.ftz.f32 %r3152, %r3150; + mul.f32 %r3153, %r3152, %r3151; + fma.rn.ftz.f32 %r3154, %r2964, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3155, %r3154; + fma.rm.ftz.f32 %r3156, %r3155, %r373, %r372; + add.f32 %r3157, %r3156, 0fCB40007F; + neg.f32 %r3158, %r3157; + fma.rn.ftz.f32 %r3159, %r2964, %r377, %r3158; + fma.rn.ftz.f32 %r3160, %r2964, %r379, %r3159; + shl.b32 %r3161, %r3156, 23; + ex2.approx.ftz.f32 %r3162, %r3160; + mul.f32 %r3163, %r3162, %r3161; + fma.rn.ftz.f32 %r3164, %r2965, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3165, %r3164; + fma.rm.ftz.f32 %r3166, %r3165, %r373, %r372; + add.f32 %r3167, %r3166, 0fCB40007F; + neg.f32 %r3168, %r3167; + fma.rn.ftz.f32 %r3169, %r2965, %r377, %r3168; + fma.rn.ftz.f32 %r3170, %r2965, %r379, %r3169; + shl.b32 %r3171, %r3166, 23; + ex2.approx.ftz.f32 %r3172, %r3170; + mul.f32 %r3173, %r3172, %r3171; + fma.rn.ftz.f32 %r3174, %r2966, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3175, %r3174; + fma.rm.ftz.f32 %r3176, %r3175, %r373, %r372; + add.f32 %r3177, %r3176, 0fCB40007F; + neg.f32 %r3178, %r3177; + fma.rn.ftz.f32 %r3179, %r2966, %r377, %r3178; + fma.rn.ftz.f32 %r3180, %r2966, %r379, %r3179; + shl.b32 %r3181, %r3176, 23; + ex2.approx.ftz.f32 %r3182, %r3180; + mul.f32 %r3183, %r3182, %r3181; + fma.rn.ftz.f32 %r3184, %r2967, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3185, %r3184; + fma.rm.ftz.f32 %r3186, %r3185, %r373, %r372; + add.f32 %r3187, %r3186, 0fCB40007F; + neg.f32 %r3188, %r3187; + fma.rn.ftz.f32 %r3189, %r2967, %r377, %r3188; + fma.rn.ftz.f32 %r3190, %r2967, %r379, %r3189; + shl.b32 %r3191, %r3186, 23; + ex2.approx.ftz.f32 %r3192, %r3190; + mul.f32 %r3193, %r3192, %r3191; + fma.rn.ftz.f32 %r3194, %r2968, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3195, %r3194; + fma.rm.ftz.f32 %r3196, %r3195, %r373, %r372; + add.f32 %r3197, %r3196, 0fCB40007F; + neg.f32 %r3198, %r3197; + fma.rn.ftz.f32 %r3199, %r2968, %r377, %r3198; + fma.rn.ftz.f32 %r3200, %r2968, %r379, %r3199; + shl.b32 %r3201, %r3196, 23; + ex2.approx.ftz.f32 %r3202, %r3200; + mul.f32 %r3203, %r3202, %r3201; + fma.rn.ftz.f32 %r3204, %r2969, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3205, %r3204; + fma.rm.ftz.f32 %r3206, %r3205, %r373, %r372; + add.f32 %r3207, %r3206, 0fCB40007F; + neg.f32 %r3208, %r3207; + fma.rn.ftz.f32 %r3209, %r2969, %r377, %r3208; + fma.rn.ftz.f32 %r3210, %r2969, %r379, %r3209; + shl.b32 %r3211, %r3206, 23; + ex2.approx.ftz.f32 %r3212, %r3210; + mul.f32 %r3213, %r3212, %r3211; + fma.rn.ftz.f32 %r3214, %r2970, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3215, %r3214; + fma.rm.ftz.f32 %r3216, %r3215, %r373, %r372; + add.f32 %r3217, %r3216, 0fCB40007F; + neg.f32 %r3218, %r3217; + fma.rn.ftz.f32 %r3219, %r2970, %r377, %r3218; + fma.rn.ftz.f32 %r3220, %r2970, %r379, %r3219; + shl.b32 %r3221, %r3216, 23; + ex2.approx.ftz.f32 %r3222, %r3220; + mul.f32 %r3223, %r3222, %r3221; + fma.rn.ftz.f32 %r3224, %r2971, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3225, %r3224; + fma.rm.ftz.f32 %r3226, %r3225, %r373, %r372; + add.f32 %r3227, %r3226, 0fCB40007F; + neg.f32 %r3228, %r3227; + fma.rn.ftz.f32 %r3229, %r2971, %r377, %r3228; + fma.rn.ftz.f32 %r3230, %r2971, %r379, %r3229; + shl.b32 %r3231, %r3226, 23; + ex2.approx.ftz.f32 %r3232, %r3230; + mul.f32 %r3233, %r3232, %r3231; + fma.rn.ftz.f32 %r3234, %r2972, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3235, %r3234; + fma.rm.ftz.f32 %r3236, %r3235, %r373, %r372; + add.f32 %r3237, %r3236, 0fCB40007F; + neg.f32 %r3238, %r3237; + fma.rn.ftz.f32 %r3239, %r2972, %r377, %r3238; + fma.rn.ftz.f32 %r3240, %r2972, %r379, %r3239; + shl.b32 %r3241, %r3236, 23; + ex2.approx.ftz.f32 %r3242, %r3240; + mul.f32 %r3243, %r3242, %r3241; + fma.rn.ftz.f32 %r3244, %r2973, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3245, %r3244; + fma.rm.ftz.f32 %r3246, %r3245, %r373, %r372; + add.f32 %r3247, %r3246, 0fCB40007F; + neg.f32 %r3248, %r3247; + fma.rn.ftz.f32 %r3249, %r2973, %r377, %r3248; + fma.rn.ftz.f32 %r3250, %r2973, %r379, %r3249; + shl.b32 %r3251, %r3246, 23; + ex2.approx.ftz.f32 %r3252, %r3250; + mul.f32 %r3253, %r3252, %r3251; + fma.rn.ftz.f32 %r3254, %r2974, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3255, %r3254; + fma.rm.ftz.f32 %r3256, %r3255, %r373, %r372; + add.f32 %r3257, %r3256, 0fCB40007F; + neg.f32 %r3258, %r3257; + fma.rn.ftz.f32 %r3259, %r2974, %r377, %r3258; + fma.rn.ftz.f32 %r3260, %r2974, %r379, %r3259; + shl.b32 %r3261, %r3256, 23; + ex2.approx.ftz.f32 %r3262, %r3260; + mul.f32 %r3263, %r3262, %r3261; + fma.rn.ftz.f32 %r3264, %r2975, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3265, %r3264; + fma.rm.ftz.f32 %r3266, %r3265, %r373, %r372; + add.f32 %r3267, %r3266, 0fCB40007F; + neg.f32 %r3268, %r3267; + fma.rn.ftz.f32 %r3269, %r2975, %r377, %r3268; + fma.rn.ftz.f32 %r3270, %r2975, %r379, %r3269; + shl.b32 %r3271, %r3266, 23; + ex2.approx.ftz.f32 %r3272, %r3270; + mul.f32 %r3273, %r3272, %r3271; + fma.rn.ftz.f32 %r3274, %r2976, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3275, %r3274; + fma.rm.ftz.f32 %r3276, %r3275, %r373, %r372; + add.f32 %r3277, %r3276, 0fCB40007F; + neg.f32 %r3278, %r3277; + fma.rn.ftz.f32 %r3279, %r2976, %r377, %r3278; + fma.rn.ftz.f32 %r3280, %r2976, %r379, %r3279; + shl.b32 %r3281, %r3276, 23; + ex2.approx.ftz.f32 %r3282, %r3280; + mul.f32 %r3283, %r3282, %r3281; + fma.rn.ftz.f32 %r3284, %r2977, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3285, %r3284; + fma.rm.ftz.f32 %r3286, %r3285, %r373, %r372; + add.f32 %r3287, %r3286, 0fCB40007F; + neg.f32 %r3288, %r3287; + fma.rn.ftz.f32 %r3289, %r2977, %r377, %r3288; + fma.rn.ftz.f32 %r3290, %r2977, %r379, %r3289; + shl.b32 %r3291, %r3286, 23; + ex2.approx.ftz.f32 %r3292, %r3290; + mul.f32 %r3293, %r3292, %r3291; + fma.rn.ftz.f32 %r3294, %r2978, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3295, %r3294; + fma.rm.ftz.f32 %r3296, %r3295, %r373, %r372; + add.f32 %r3297, %r3296, 0fCB40007F; + neg.f32 %r3298, %r3297; + fma.rn.ftz.f32 %r3299, %r2978, %r377, %r3298; + fma.rn.ftz.f32 %r3300, %r2978, %r379, %r3299; + shl.b32 %r3301, %r3296, 23; + ex2.approx.ftz.f32 %r3302, %r3300; + mul.f32 %r3303, %r3302, %r3301; + fma.rn.ftz.f32 %r3304, %r2979, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3305, %r3304; + fma.rm.ftz.f32 %r3306, %r3305, %r373, %r372; + add.f32 %r3307, %r3306, 0fCB40007F; + neg.f32 %r3308, %r3307; + fma.rn.ftz.f32 %r3309, %r2979, %r377, %r3308; + fma.rn.ftz.f32 %r3310, %r2979, %r379, %r3309; + shl.b32 %r3311, %r3306, 23; + ex2.approx.ftz.f32 %r3312, %r3310; + mul.f32 %r3313, %r3312, %r3311; + fma.rn.ftz.f32 %r3314, %r2980, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3315, %r3314; + fma.rm.ftz.f32 %r3316, %r3315, %r373, %r372; + add.f32 %r3317, %r3316, 0fCB40007F; + neg.f32 %r3318, %r3317; + fma.rn.ftz.f32 %r3319, %r2980, %r377, %r3318; + fma.rn.ftz.f32 %r3320, %r2980, %r379, %r3319; + shl.b32 %r3321, %r3316, 23; + ex2.approx.ftz.f32 %r3322, %r3320; + mul.f32 %r3323, %r3322, %r3321; + fma.rn.ftz.f32 %r3324, %r2981, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3325, %r3324; + fma.rm.ftz.f32 %r3326, %r3325, %r373, %r372; + add.f32 %r3327, %r3326, 0fCB40007F; + neg.f32 %r3328, %r3327; + fma.rn.ftz.f32 %r3329, %r2981, %r377, %r3328; + fma.rn.ftz.f32 %r3330, %r2981, %r379, %r3329; + shl.b32 %r3331, %r3326, 23; + ex2.approx.ftz.f32 %r3332, %r3330; + mul.f32 %r3333, %r3332, %r3331; + fma.rn.ftz.f32 %r3334, %r2982, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3335, %r3334; + fma.rm.ftz.f32 %r3336, %r3335, %r373, %r372; + add.f32 %r3337, %r3336, 0fCB40007F; + neg.f32 %r3338, %r3337; + fma.rn.ftz.f32 %r3339, %r2982, %r377, %r3338; + fma.rn.ftz.f32 %r3340, %r2982, %r379, %r3339; + shl.b32 %r3341, %r3336, 23; + ex2.approx.ftz.f32 %r3342, %r3340; + mul.f32 %r3343, %r3342, %r3341; + fma.rn.ftz.f32 %r3344, %r2983, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3345, %r3344; + fma.rm.ftz.f32 %r3346, %r3345, %r373, %r372; + add.f32 %r3347, %r3346, 0fCB40007F; + neg.f32 %r3348, %r3347; + fma.rn.ftz.f32 %r3349, %r2983, %r377, %r3348; + fma.rn.ftz.f32 %r3350, %r2983, %r379, %r3349; + shl.b32 %r3351, %r3346, 23; + ex2.approx.ftz.f32 %r3352, %r3350; + mul.f32 %r3353, %r3352, %r3351; + fma.rn.ftz.f32 %r3354, %r2984, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3355, %r3354; + fma.rm.ftz.f32 %r3356, %r3355, %r373, %r372; + add.f32 %r3357, %r3356, 0fCB40007F; + neg.f32 %r3358, %r3357; + fma.rn.ftz.f32 %r3359, %r2984, %r377, %r3358; + fma.rn.ftz.f32 %r3360, %r2984, %r379, %r3359; + shl.b32 %r3361, %r3356, 23; + ex2.approx.ftz.f32 %r3362, %r3360; + mul.f32 %r3363, %r3362, %r3361; + fma.rn.ftz.f32 %r3364, %r2985, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3365, %r3364; + fma.rm.ftz.f32 %r3366, %r3365, %r373, %r372; + add.f32 %r3367, %r3366, 0fCB40007F; + neg.f32 %r3368, %r3367; + fma.rn.ftz.f32 %r3369, %r2985, %r377, %r3368; + fma.rn.ftz.f32 %r3370, %r2985, %r379, %r3369; + shl.b32 %r3371, %r3366, 23; + ex2.approx.ftz.f32 %r3372, %r3370; + mul.f32 %r3373, %r3372, %r3371; + fma.rn.ftz.f32 %r3374, %r2986, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3375, %r3374; + fma.rm.ftz.f32 %r3376, %r3375, %r373, %r372; + add.f32 %r3377, %r3376, 0fCB40007F; + neg.f32 %r3378, %r3377; + fma.rn.ftz.f32 %r3379, %r2986, %r377, %r3378; + fma.rn.ftz.f32 %r3380, %r2986, %r379, %r3379; + shl.b32 %r3381, %r3376, 23; + ex2.approx.ftz.f32 %r3382, %r3380; + mul.f32 %r3383, %r3382, %r3381; + fma.rn.ftz.f32 %r3384, %r2987, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3385, %r3384; + fma.rm.ftz.f32 %r3386, %r3385, %r373, %r372; + add.f32 %r3387, %r3386, 0fCB40007F; + neg.f32 %r3388, %r3387; + fma.rn.ftz.f32 %r3389, %r2987, %r377, %r3388; + fma.rn.ftz.f32 %r3390, %r2987, %r379, %r3389; + shl.b32 %r3391, %r3386, 23; + ex2.approx.ftz.f32 %r3392, %r3390; + mul.f32 %r3393, %r3392, %r3391; + fma.rn.ftz.f32 %r3394, %r2988, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3395, %r3394; + fma.rm.ftz.f32 %r3396, %r3395, %r373, %r372; + add.f32 %r3397, %r3396, 0fCB40007F; + neg.f32 %r3398, %r3397; + fma.rn.ftz.f32 %r3399, %r2988, %r377, %r3398; + fma.rn.ftz.f32 %r3400, %r2988, %r379, %r3399; + shl.b32 %r3401, %r3396, 23; + ex2.approx.ftz.f32 %r3402, %r3400; + mul.f32 %r3403, %r3402, %r3401; + fma.rn.ftz.f32 %r3404, %r2989, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3405, %r3404; + fma.rm.ftz.f32 %r3406, %r3405, %r373, %r372; + add.f32 %r3407, %r3406, 0fCB40007F; + neg.f32 %r3408, %r3407; + fma.rn.ftz.f32 %r3409, %r2989, %r377, %r3408; + fma.rn.ftz.f32 %r3410, %r2989, %r379, %r3409; + shl.b32 %r3411, %r3406, 23; + ex2.approx.ftz.f32 %r3412, %r3410; + mul.f32 %r3413, %r3412, %r3411; + fma.rn.ftz.f32 %r3414, %r2990, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3415, %r3414; + fma.rm.ftz.f32 %r3416, %r3415, %r373, %r372; + add.f32 %r3417, %r3416, 0fCB40007F; + neg.f32 %r3418, %r3417; + fma.rn.ftz.f32 %r3419, %r2990, %r377, %r3418; + fma.rn.ftz.f32 %r3420, %r2990, %r379, %r3419; + shl.b32 %r3421, %r3416, 23; + ex2.approx.ftz.f32 %r3422, %r3420; + mul.f32 %r3423, %r3422, %r3421; + fma.rn.ftz.f32 %r3424, %r2991, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3425, %r3424; + fma.rm.ftz.f32 %r3426, %r3425, %r373, %r372; + add.f32 %r3427, %r3426, 0fCB40007F; + neg.f32 %r3428, %r3427; + fma.rn.ftz.f32 %r3429, %r2991, %r377, %r3428; + fma.rn.ftz.f32 %r3430, %r2991, %r379, %r3429; + shl.b32 %r3431, %r3426, 23; + ex2.approx.ftz.f32 %r3432, %r3430; + mul.f32 %r3433, %r3432, %r3431; + fma.rn.ftz.f32 %r3434, %r2992, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3435, %r3434; + fma.rm.ftz.f32 %r3436, %r3435, %r373, %r372; + add.f32 %r3437, %r3436, 0fCB40007F; + neg.f32 %r3438, %r3437; + fma.rn.ftz.f32 %r3439, %r2992, %r377, %r3438; + fma.rn.ftz.f32 %r3440, %r2992, %r379, %r3439; + shl.b32 %r3441, %r3436, 23; + ex2.approx.ftz.f32 %r3442, %r3440; + mul.f32 %r3443, %r3442, %r3441; + fma.rn.ftz.f32 %r3444, %r2993, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3445, %r3444; + fma.rm.ftz.f32 %r3446, %r3445, %r373, %r372; + add.f32 %r3447, %r3446, 0fCB40007F; + neg.f32 %r3448, %r3447; + fma.rn.ftz.f32 %r3449, %r2993, %r377, %r3448; + fma.rn.ftz.f32 %r3450, %r2993, %r379, %r3449; + shl.b32 %r3451, %r3446, 23; + ex2.approx.ftz.f32 %r3452, %r3450; + mul.f32 %r3453, %r3452, %r3451; + fma.rn.ftz.f32 %r3454, %r2994, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3455, %r3454; + fma.rm.ftz.f32 %r3456, %r3455, %r373, %r372; + add.f32 %r3457, %r3456, 0fCB40007F; + neg.f32 %r3458, %r3457; + fma.rn.ftz.f32 %r3459, %r2994, %r377, %r3458; + fma.rn.ftz.f32 %r3460, %r2994, %r379, %r3459; + shl.b32 %r3461, %r3456, 23; + ex2.approx.ftz.f32 %r3462, %r3460; + mul.f32 %r3463, %r3462, %r3461; + fma.rn.ftz.f32 %r3464, %r2995, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3465, %r3464; + fma.rm.ftz.f32 %r3466, %r3465, %r373, %r372; + add.f32 %r3467, %r3466, 0fCB40007F; + neg.f32 %r3468, %r3467; + fma.rn.ftz.f32 %r3469, %r2995, %r377, %r3468; + fma.rn.ftz.f32 %r3470, %r2995, %r379, %r3469; + shl.b32 %r3471, %r3466, 23; + ex2.approx.ftz.f32 %r3472, %r3470; + mul.f32 %r3473, %r3472, %r3471; + fma.rn.ftz.f32 %r3474, %r2996, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3475, %r3474; + fma.rm.ftz.f32 %r3476, %r3475, %r373, %r372; + add.f32 %r3477, %r3476, 0fCB40007F; + neg.f32 %r3478, %r3477; + fma.rn.ftz.f32 %r3479, %r2996, %r377, %r3478; + fma.rn.ftz.f32 %r3480, %r2996, %r379, %r3479; + shl.b32 %r3481, %r3476, 23; + ex2.approx.ftz.f32 %r3482, %r3480; + mul.f32 %r3483, %r3482, %r3481; + fma.rn.ftz.f32 %r3484, %r2997, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3485, %r3484; + fma.rm.ftz.f32 %r3486, %r3485, %r373, %r372; + add.f32 %r3487, %r3486, 0fCB40007F; + neg.f32 %r3488, %r3487; + fma.rn.ftz.f32 %r3489, %r2997, %r377, %r3488; + fma.rn.ftz.f32 %r3490, %r2997, %r379, %r3489; + shl.b32 %r3491, %r3486, 23; + ex2.approx.ftz.f32 %r3492, %r3490; + mul.f32 %r3493, %r3492, %r3491; + fma.rn.ftz.f32 %r3494, %r2998, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3495, %r3494; + fma.rm.ftz.f32 %r3496, %r3495, %r373, %r372; + add.f32 %r3497, %r3496, 0fCB40007F; + neg.f32 %r3498, %r3497; + fma.rn.ftz.f32 %r3499, %r2998, %r377, %r3498; + fma.rn.ftz.f32 %r3500, %r2998, %r379, %r3499; + shl.b32 %r3501, %r3496, 23; + ex2.approx.ftz.f32 %r3502, %r3500; + mul.f32 %r3503, %r3502, %r3501; + fma.rn.ftz.f32 %r3504, %r2999, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3505, %r3504; + fma.rm.ftz.f32 %r3506, %r3505, %r373, %r372; + add.f32 %r3507, %r3506, 0fCB40007F; + neg.f32 %r3508, %r3507; + fma.rn.ftz.f32 %r3509, %r2999, %r377, %r3508; + fma.rn.ftz.f32 %r3510, %r2999, %r379, %r3509; + shl.b32 %r3511, %r3506, 23; + ex2.approx.ftz.f32 %r3512, %r3510; + mul.f32 %r3513, %r3512, %r3511; + fma.rn.ftz.f32 %r3514, %r3000, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3515, %r3514; + fma.rm.ftz.f32 %r3516, %r3515, %r373, %r372; + add.f32 %r3517, %r3516, 0fCB40007F; + neg.f32 %r3518, %r3517; + fma.rn.ftz.f32 %r3519, %r3000, %r377, %r3518; + fma.rn.ftz.f32 %r3520, %r3000, %r379, %r3519; + shl.b32 %r3521, %r3516, 23; + ex2.approx.ftz.f32 %r3522, %r3520; + mul.f32 %r3523, %r3522, %r3521; + fma.rn.ftz.f32 %r3524, %r3001, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3525, %r3524; + fma.rm.ftz.f32 %r3526, %r3525, %r373, %r372; + add.f32 %r3527, %r3526, 0fCB40007F; + neg.f32 %r3528, %r3527; + fma.rn.ftz.f32 %r3529, %r3001, %r377, %r3528; + fma.rn.ftz.f32 %r3530, %r3001, %r379, %r3529; + shl.b32 %r3531, %r3526, 23; + ex2.approx.ftz.f32 %r3532, %r3530; + mul.f32 %r3533, %r3532, %r3531; + fma.rn.ftz.f32 %r3534, %r3002, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3535, %r3534; + fma.rm.ftz.f32 %r3536, %r3535, %r373, %r372; + add.f32 %r3537, %r3536, 0fCB40007F; + neg.f32 %r3538, %r3537; + fma.rn.ftz.f32 %r3539, %r3002, %r377, %r3538; + fma.rn.ftz.f32 %r3540, %r3002, %r379, %r3539; + shl.b32 %r3541, %r3536, 23; + ex2.approx.ftz.f32 %r3542, %r3540; + mul.f32 %r3543, %r3542, %r3541; + fma.rn.ftz.f32 %r3544, %r3003, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3545, %r3544; + fma.rm.ftz.f32 %r3546, %r3545, %r373, %r372; + add.f32 %r3547, %r3546, 0fCB40007F; + neg.f32 %r3548, %r3547; + fma.rn.ftz.f32 %r3549, %r3003, %r377, %r3548; + fma.rn.ftz.f32 %r3550, %r3003, %r379, %r3549; + shl.b32 %r3551, %r3546, 23; + ex2.approx.ftz.f32 %r3552, %r3550; + mul.f32 %r3553, %r3552, %r3551; + fma.rn.ftz.f32 %r3554, %r3004, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3555, %r3554; + fma.rm.ftz.f32 %r3556, %r3555, %r373, %r372; + add.f32 %r3557, %r3556, 0fCB40007F; + neg.f32 %r3558, %r3557; + fma.rn.ftz.f32 %r3559, %r3004, %r377, %r3558; + fma.rn.ftz.f32 %r3560, %r3004, %r379, %r3559; + shl.b32 %r3561, %r3556, 23; + ex2.approx.ftz.f32 %r3562, %r3560; + mul.f32 %r3563, %r3562, %r3561; + fma.rn.ftz.f32 %r3564, %r3005, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3565, %r3564; + fma.rm.ftz.f32 %r3566, %r3565, %r373, %r372; + add.f32 %r3567, %r3566, 0fCB40007F; + neg.f32 %r3568, %r3567; + fma.rn.ftz.f32 %r3569, %r3005, %r377, %r3568; + fma.rn.ftz.f32 %r3570, %r3005, %r379, %r3569; + shl.b32 %r3571, %r3566, 23; + ex2.approx.ftz.f32 %r3572, %r3570; + mul.f32 %r3573, %r3572, %r3571; + fma.rn.ftz.f32 %r3574, %r3006, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3575, %r3574; + fma.rm.ftz.f32 %r3576, %r3575, %r373, %r372; + add.f32 %r3577, %r3576, 0fCB40007F; + neg.f32 %r3578, %r3577; + fma.rn.ftz.f32 %r3579, %r3006, %r377, %r3578; + fma.rn.ftz.f32 %r3580, %r3006, %r379, %r3579; + shl.b32 %r3581, %r3576, 23; + ex2.approx.ftz.f32 %r3582, %r3580; + mul.f32 %r3583, %r3582, %r3581; + fma.rn.ftz.f32 %r3584, %r3007, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3585, %r3584; + fma.rm.ftz.f32 %r3586, %r3585, %r373, %r372; + add.f32 %r3587, %r3586, 0fCB40007F; + neg.f32 %r3588, %r3587; + fma.rn.ftz.f32 %r3589, %r3007, %r377, %r3588; + fma.rn.ftz.f32 %r3590, %r3007, %r379, %r3589; + shl.b32 %r3591, %r3586, 23; + ex2.approx.ftz.f32 %r3592, %r3590; + mul.f32 %r3593, %r3592, %r3591; + fma.rn.ftz.f32 %r3594, %r3008, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3595, %r3594; + fma.rm.ftz.f32 %r3596, %r3595, %r373, %r372; + add.f32 %r3597, %r3596, 0fCB40007F; + neg.f32 %r3598, %r3597; + fma.rn.ftz.f32 %r3599, %r3008, %r377, %r3598; + fma.rn.ftz.f32 %r3600, %r3008, %r379, %r3599; + shl.b32 %r3601, %r3596, 23; + ex2.approx.ftz.f32 %r3602, %r3600; + mul.f32 %r3603, %r3602, %r3601; + fma.rn.ftz.f32 %r3604, %r3009, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3605, %r3604; + fma.rm.ftz.f32 %r3606, %r3605, %r373, %r372; + add.f32 %r3607, %r3606, 0fCB40007F; + neg.f32 %r3608, %r3607; + fma.rn.ftz.f32 %r3609, %r3009, %r377, %r3608; + fma.rn.ftz.f32 %r3610, %r3009, %r379, %r3609; + shl.b32 %r3611, %r3606, 23; + ex2.approx.ftz.f32 %r3612, %r3610; + mul.f32 %r3613, %r3612, %r3611; + fma.rn.ftz.f32 %r3614, %r3010, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3615, %r3614; + fma.rm.ftz.f32 %r3616, %r3615, %r373, %r372; + add.f32 %r3617, %r3616, 0fCB40007F; + neg.f32 %r3618, %r3617; + fma.rn.ftz.f32 %r3619, %r3010, %r377, %r3618; + fma.rn.ftz.f32 %r3620, %r3010, %r379, %r3619; + shl.b32 %r3621, %r3616, 23; + ex2.approx.ftz.f32 %r3622, %r3620; + mul.f32 %r3623, %r3622, %r3621; + fma.rn.ftz.f32 %r3624, %r3011, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3625, %r3624; + fma.rm.ftz.f32 %r3626, %r3625, %r373, %r372; + add.f32 %r3627, %r3626, 0fCB40007F; + neg.f32 %r3628, %r3627; + fma.rn.ftz.f32 %r3629, %r3011, %r377, %r3628; + fma.rn.ftz.f32 %r3630, %r3011, %r379, %r3629; + shl.b32 %r3631, %r3626, 23; + ex2.approx.ftz.f32 %r3632, %r3630; + mul.f32 %r3633, %r3632, %r3631; + fma.rn.ftz.f32 %r3634, %r3012, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3635, %r3634; + fma.rm.ftz.f32 %r3636, %r3635, %r373, %r372; + add.f32 %r3637, %r3636, 0fCB40007F; + neg.f32 %r3638, %r3637; + fma.rn.ftz.f32 %r3639, %r3012, %r377, %r3638; + fma.rn.ftz.f32 %r3640, %r3012, %r379, %r3639; + shl.b32 %r3641, %r3636, 23; + ex2.approx.ftz.f32 %r3642, %r3640; + mul.f32 %r3643, %r3642, %r3641; + fma.rn.ftz.f32 %r3644, %r3013, %r369, %r368; + cvt.ftz.sat.f32.f32 %r3645, %r3644; + fma.rm.ftz.f32 %r3646, %r3645, %r373, %r372; + add.f32 %r3647, %r3646, 0fCB40007F; + neg.f32 %r3648, %r3647; + fma.rn.ftz.f32 %r3649, %r3013, %r377, %r3648; + fma.rn.ftz.f32 %r3650, %r3013, %r379, %r3649; + shl.b32 %r3651, %r3646, 23; + ex2.approx.ftz.f32 %r3652, %r3650; + mul.f32 %r3653, %r3652, %r3651; + .loc 1 62 23 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:62:23 + div.full.f32 %r3654, %r3023, %r2885; + div.full.f32 %r3655, %r3033, %r2885; + div.full.f32 %r3656, %r3043, %r2885; + div.full.f32 %r3657, %r3053, %r2885; + div.full.f32 %r3658, %r3063, %r2885; + div.full.f32 %r3659, %r3073, %r2885; + div.full.f32 %r3660, %r3083, %r2885; + div.full.f32 %r3661, %r3093, %r2885; + div.full.f32 %r3662, %r3103, %r2885; + div.full.f32 %r3663, %r3113, %r2885; + div.full.f32 %r3664, %r3123, %r2885; + div.full.f32 %r3665, %r3133, %r2885; + div.full.f32 %r3666, %r3143, %r2885; + div.full.f32 %r3667, %r3153, %r2885; + div.full.f32 %r3668, %r3163, %r2885; + div.full.f32 %r3669, %r3173, %r2885; + div.full.f32 %r3670, %r3183, %r2885; + div.full.f32 %r3671, %r3193, %r2885; + div.full.f32 %r3672, %r3203, %r2885; + div.full.f32 %r3673, %r3213, %r2885; + div.full.f32 %r3674, %r3223, %r2885; + div.full.f32 %r3675, %r3233, %r2885; + div.full.f32 %r3676, %r3243, %r2885; + div.full.f32 %r3677, %r3253, %r2885; + div.full.f32 %r3678, %r3263, %r2885; + div.full.f32 %r3679, %r3273, %r2885; + div.full.f32 %r3680, %r3283, %r2885; + div.full.f32 %r3681, %r3293, %r2885; + div.full.f32 %r3682, %r3303, %r2885; + div.full.f32 %r3683, %r3313, %r2885; + div.full.f32 %r3684, %r3323, %r2885; + div.full.f32 %r3685, %r3333, %r2885; + div.full.f32 %r3686, %r3343, %r2885; + div.full.f32 %r3687, %r3353, %r2885; + div.full.f32 %r3688, %r3363, %r2885; + div.full.f32 %r3689, %r3373, %r2885; + div.full.f32 %r3690, %r3383, %r2885; + div.full.f32 %r3691, %r3393, %r2885; + div.full.f32 %r3692, %r3403, %r2885; + div.full.f32 %r3693, %r3413, %r2885; + div.full.f32 %r3694, %r3423, %r2885; + div.full.f32 %r3695, %r3433, %r2885; + div.full.f32 %r3696, %r3443, %r2885; + div.full.f32 %r3697, %r3453, %r2885; + div.full.f32 %r3698, %r3463, %r2885; + div.full.f32 %r3699, %r3473, %r2885; + div.full.f32 %r3700, %r3483, %r2885; + div.full.f32 %r3701, %r3493, %r2885; + div.full.f32 %r3702, %r3503, %r2885; + div.full.f32 %r3703, %r3513, %r2885; + div.full.f32 %r3704, %r3523, %r2885; + div.full.f32 %r3705, %r3533, %r2885; + div.full.f32 %r3706, %r3543, %r2885; + div.full.f32 %r3707, %r3553, %r2885; + div.full.f32 %r3708, %r3563, %r2885; + div.full.f32 %r3709, %r3573, %r2885; + div.full.f32 %r3710, %r3583, %r2885; + div.full.f32 %r3711, %r3593, %r2885; + div.full.f32 %r3712, %r3603, %r2885; + div.full.f32 %r3713, %r3613, %r2885; + div.full.f32 %r3714, %r3623, %r2885; + div.full.f32 %r3715, %r3633, %r2885; + div.full.f32 %r3716, %r3643, %r2885; + div.full.f32 %r3717, %r3653, %r2885; + .loc 1 63 29 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:29 + mad.wide.s32 %rd49, %r222, 4, %rd66; + mad.wide.s32 %rd50, %r224, 4, %rd66; + mad.wide.s32 %rd51, %r225, 4, %rd66; + mad.wide.s32 %rd52, %r226, 4, %rd66; + mad.wide.s32 %rd53, %r227, 4, %rd66; + mad.wide.s32 %rd54, %r228, 4, %rd66; + mad.wide.s32 %rd55, %r229, 4, %rd66; + mad.wide.s32 %rd56, %r230, 4, %rd66; + mad.wide.s32 %rd57, %r231, 4, %rd66; + mad.wide.s32 %rd58, %r232, 4, %rd66; + mad.wide.s32 %rd59, %r233, 4, %rd66; + mad.wide.s32 %rd60, %r234, 4, %rd66; + mad.wide.s32 %rd61, %r235, 4, %rd66; + mad.wide.s32 %rd62, %r236, 4, %rd66; + mad.wide.s32 %rd63, %r237, 4, %rd66; + mad.wide.s32 %rd64, %r238, 4, %rd66; + .loc 1 63 53 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:63:53 + bar.sync 0; + shl.b32 %r3718, %r208, 4; + add.s32 %r3719, %r1901, %r3718; + st.shared.v4.b32 [%r3719], {%r3654, %r3655, %r3656, %r3657}; + xor.b32 %r3720, %r3718, 64; + add.s32 %r3721, %r1901, %r3720; + st.shared.v4.b32 [%r3721+8192], {%r3658, %r3659, %r3660, %r3661}; + bar.sync 0; + shl.b32 %r3722, %r206, 3; + and.b32 %r3723, %r3722, 4080; + and.b32 %r3724, %r206, 1; + neg.s32 %r3725, %r3724; + and.b32 %r3726, %r3725, 8256; + xor.b32 %r3727, %r3726, %r3723; + add.s32 %r3728, %r1901, %r3727; + ld.shared.v4.b32 {%r141, %r142, %r143, %r144}, [%r3728]; + ld.shared.v4.b32 {%r145, %r146, %r147, %r148}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3662, %r3663, %r3664, %r3665}; + st.shared.v4.b32 [%r3721+8192], {%r3666, %r3667, %r3668, %r3669}; + bar.sync 0; + ld.shared.v4.b32 {%r149, %r150, %r151, %r152}, [%r3728]; + ld.shared.v4.b32 {%r153, %r154, %r155, %r156}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3670, %r3671, %r3672, %r3673}; + st.shared.v4.b32 [%r3721+8192], {%r3674, %r3675, %r3676, %r3677}; + bar.sync 0; + ld.shared.v4.b32 {%r157, %r158, %r159, %r160}, [%r3728]; + ld.shared.v4.b32 {%r161, %r162, %r163, %r164}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3678, %r3679, %r3680, %r3681}; + st.shared.v4.b32 [%r3721+8192], {%r3682, %r3683, %r3684, %r3685}; + bar.sync 0; + ld.shared.v4.b32 {%r165, %r166, %r167, %r168}, [%r3728]; + ld.shared.v4.b32 {%r169, %r170, %r171, %r172}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3686, %r3687, %r3688, %r3689}; + st.shared.v4.b32 [%r3721+8192], {%r3690, %r3691, %r3692, %r3693}; + bar.sync 0; + ld.shared.v4.b32 {%r173, %r174, %r175, %r176}, [%r3728]; + ld.shared.v4.b32 {%r177, %r178, %r179, %r180}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3694, %r3695, %r3696, %r3697}; + st.shared.v4.b32 [%r3721+8192], {%r3698, %r3699, %r3700, %r3701}; + bar.sync 0; + ld.shared.v4.b32 {%r181, %r182, %r183, %r184}, [%r3728]; + ld.shared.v4.b32 {%r185, %r186, %r187, %r188}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3702, %r3703, %r3704, %r3705}; + st.shared.v4.b32 [%r3721+8192], {%r3706, %r3707, %r3708, %r3709}; + bar.sync 0; + ld.shared.v4.b32 {%r189, %r190, %r191, %r192}, [%r3728]; + ld.shared.v4.b32 {%r193, %r194, %r195, %r196}, [%r3728+4096]; + bar.sync 0; + st.shared.v4.b32 [%r3719], {%r3710, %r3711, %r3712, %r3713}; + st.shared.v4.b32 [%r3721+8192], {%r3714, %r3715, %r3716, %r3717}; + bar.sync 0; + ld.shared.v4.b32 {%r197, %r198, %r199, %r200}, [%r3728]; + ld.shared.v4.b32 {%r201, %r202, %r203, %r204}, [%r3728+4096]; + // begin inline asm + @%p1 st.global.v4.b32 [ %rd49 + 0 ], { %r141, %r142, %r143, %r144 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd50 + 0 ], { %r145, %r146, %r147, %r148 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd51 + 0 ], { %r149, %r150, %r151, %r152 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd52 + 0 ], { %r153, %r154, %r155, %r156 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd53 + 0 ], { %r157, %r158, %r159, %r160 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd54 + 0 ], { %r161, %r162, %r163, %r164 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd55 + 0 ], { %r165, %r166, %r167, %r168 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd56 + 0 ], { %r169, %r170, %r171, %r172 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd57 + 0 ], { %r173, %r174, %r175, %r176 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd58 + 0 ], { %r177, %r178, %r179, %r180 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd59 + 0 ], { %r181, %r182, %r183, %r184 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd60 + 0 ], { %r185, %r186, %r187, %r188 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd61 + 0 ], { %r189, %r190, %r191, %r192 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd62 + 0 ], { %r193, %r194, %r195, %r196 }; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd63 + 0 ], { %r197, %r198, %r199, %r200 }; + // end inline asm + // begin inline asm + @%p38 st.global.v4.b32 [ %rd64 + 0 ], { %r201, %r202, %r203, %r204 }; + // end inline asm + .loc 1 52 4 // cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py:52:4 + ret; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 99 +.b8 122 +.b8 109 +.b8 109 +.b8 114 +.b8 108 +.b8 117 +.b8 51 +.b8 117 +.b8 54 +.b8 55 +.b8 106 +.b8 110 +.b8 51 +.b8 103 +.b8 105 +.b8 116 +.b8 102 +.b8 119 +.b8 121 +.b8 117 +.b8 51 +.b8 106 +.b8 105 +.b8 102 +.b8 51 +.b8 98 +.b8 51 +.b8 112 +.b8 119 +.b8 111 +.b8 120 +.b8 106 +.b8 51 +.b8 55 +.b8 120 +.b8 112 +.b8 52 +.b8 100 +.b8 100 +.b8 97 +.b8 99 +.b8 107 +.b8 99 +.b8 117 +.b8 102 +.b8 119 +.b8 105 +.b8 113 +.b8 114 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 99 +.b8 122 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..fcf16353637b7a9a1a3cf036d76e64ead00b151b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 4096 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x32768xi1> loc(#loc119) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32> -> tensor<1x32768xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_32768__(1,)cconstexpr_fp32_"() : () -> tensor<1x32768xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c32768_i32 = arith.constant 32768 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c32768_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32768xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x32768xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x32768xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x32768xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x32768xf32> to tensor<1x32768xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_32768S_fp32S1_32768S_fp32S1_32768S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x32768xf32>, tensor<1x32768xf32>, tensor<1x32768xf32>) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_32768S_fp32S1_32768S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x32768xf32>, tensor<1x32768xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c32768_i32_13 = arith.constant 32768 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c32768_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x32768xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x32768xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x32768xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x32768xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x32768xf32> to tensor<1x32768xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x32768xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x32768xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x32768xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x32768x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_32768__(1,)cconstexpr_fp32_"() -> tensor<1x32768xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x32768xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x32768xf32> loc(#loc47) + tt.return %0 : tensor<1x32768xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_32768S_fp32S1_32768S_fp32S1_32768S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x32768xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x32768xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x32768xf32> loc("rhs_max"(#loc48))) -> (tensor<1x32768xf32>, tensor<1x32768xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_32768S_fp32S1_32768S__(%lhs_max, %rhs_max) : (tensor<1x32768xf32>, tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x32768xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x32768xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x32768xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x32768xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x32768xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x32768xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x32768xf32> loc(#loc61) + %1 = ub.poison : tensor<1x32768xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x32768xf32>, tensor<1x32768xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_32768S_fp32S1_32768S__(%a: tensor<1x32768xf32> loc("a"(#loc62)), %b: tensor<1x32768xf32> loc("b"(#loc62))) -> tensor<1x32768xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x32768xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_32768S__(%a) : (tensor<1x32768xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x32768xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x32768xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x32768xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x32768xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x32768xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc69) + tt.return %2 : tensor<1x32768xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x32768xf32> loc(#loc71) + tt.return %3 : tensor<1x32768xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_32768S__(%x: tensor<1x32768xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_32768S__(%x) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_32768S__(%x: tensor<1x32768xf32> loc("x"(#loc76))) -> tensor<1x32768xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x32768xf32> loc(#loc78) + tt.return %4 : tensor<1x32768xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x32768xf32> loc(#loc80) + tt.return %5 : tensor<1x32768xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%x: tensor<1x32768xf32> loc("x"(#loc81))) -> tensor<1x32768xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc82) + tt.return %0 : tensor<1x32768xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x32768xf32> loc(#loc84) + tt.return %1 : tensor<1x32768xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_32768S_fp32S1_32768S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x32768xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x32768xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_32768S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x32768xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x32768xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_32768S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x32768xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_32768S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_32768S__(1,)cconstexpr_1_"(%a: tensor<1x32768xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_32768S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x32768xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..c976c8f59457aef67d69770e8b8f0594e30e92c4 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,201 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc1 = loc(unknown) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc49 = loc("in_ptr0"(#loc)) +#loc50 = loc("out_ptr2"(#loc)) +#loc51 = loc("xnumel"(#loc)) +#loc52 = loc("r0_numel"(#loc)) +#loc74 = loc("out_max"(#loc27)) +#loc83 = loc("out_sum"(#loc38)) +#loc102 = loc(callsite(#loc74 at #loc28)) +#loc110 = loc(callsite(#loc83 at #loc28)) +#loc116 = loc(callsite(#loc1 at #loc102)) +#loc119 = loc(callsite(#loc1 at #loc110)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x32768xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<1x32768xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x32768xbf16, #blocked1> loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<1x32768xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc53) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc54) + %r0_base_6 = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc54) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32768xi32, #blocked1> loc(#loc54) + %r0_base_8 = tt.expand_dims %r0_base_6 {axis = 0 : i32} : tensor<32768xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32768xi32, #blocked> loc(#loc54) + %r0_mask = arith.cmpi slt, %r0_base_7, %cst_0 : tensor<1x32768xi32, #blocked1> loc(#loc55) + %r0_mask_9 = arith.cmpi slt, %r0_base_8, %cst : tensor<1x32768xi32, #blocked> loc(#loc55) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc56) + %tmp0_10 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32, #blocked1> loc(#loc90) + %tmp0_11 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32, #blocked> loc(#loc90) + %tmp0_12 = arith.addi %r0_base_7, %tmp0_10 : tensor<1x32768xi32, #blocked1> loc(#loc57) + %tmp0_13 = arith.addi %r0_base_8, %tmp0_11 : tensor<1x32768xi32, #blocked> loc(#loc57) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr, #blocked1> loc(#loc58) + %tmp0_15 = tt.addptr %tmp0_14, %tmp0_12 : tensor<1x32768x!tt.ptr, #blocked1>, tensor<1x32768xi32, #blocked1> loc(#loc58) + %tmp0_16 = tt.load %tmp0_15, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr, #blocked1> loc(#loc59) + %tmp0_17 = arith.extf %tmp0_16 : tensor<1x32768xbf16, #blocked1> to tensor<1x32768xf32, #blocked1> loc(#loc60) + %mask = arith.cmpf ogt, %cst_5, %tmp0_17 : tensor<1x32768xf32, #blocked1> loc(#loc111) + %out_max = arith.select %mask, %cst_5, %tmp0_17 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc112) + %lhs_scale = arith.cmpf oeq, %out_max, %cst_5 : tensor<1x32768xf32, #blocked1> loc(#loc93) + %lhs_scale_18 = arith.subf %cst_5, %out_max : tensor<1x32768xf32, #blocked1> loc(#loc94) + %lhs_scale_19 = tt.extern_elementwise %lhs_scale_18 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc113) + %lhs_scale_20 = arith.select %lhs_scale, %cst_3, %lhs_scale_19 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc96) + %rhs_scale = arith.subf %tmp0_17, %out_max : tensor<1x32768xf32, #blocked1> loc(#loc97) + %rhs_scale_21 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc114) + %rhs_scale_22 = arith.select %lhs_scale, %cst_3, %rhs_scale_21 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc99) + %out_sum = arith.mulf %lhs_scale_20, %cst_4 : tensor<1x32768xf32, #blocked1> loc(#loc100) + %out_sum_23 = arith.addf %out_sum, %rhs_scale_22 : tensor<1x32768xf32, #blocked1> loc(#loc101) + %_tmp3_max = arith.select %r0_mask, %out_max, %cst_5 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc72) + %_tmp3_sum = arith.select %r0_mask, %out_sum_23, %cst_4 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc73) + %out_max_24 = "tt.reduce"(%_tmp3_max) <{axis = 1 : i32}> ({ + ^bb0(%out_max_34: f32 loc(callsite(#loc1 at #loc102)), %out_max_35: f32 loc(callsite(#loc1 at #loc102))): + %mask_36 = arith.cmpf ogt, %out_max_34, %out_max_35 : f32 loc(#loc120) + %mask_37 = arith.cmpf une, %out_max_34, %out_max_34 : f32 loc(#loc121) + %mask_38 = arith.ori %mask_36, %mask_37 : i1 loc(#loc122) + %out_max_39 = arith.select %mask_38, %out_max_34, %out_max_35 : f32 loc(#loc123) + tt.reduce.return %out_max_39 : f32 loc(#loc115) + }) : (tensor<1x32768xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc115) + %out_max_keepdim = tt.expand_dims %out_max_24 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc104) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_2 : tensor<1x1xf32, #blocked1> loc(#loc105) + %delta_25 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked1> -> tensor<1x32768xf32, #blocked1> loc(#loc106) + %delta_26 = arith.subf %_tmp3_max, %delta_25 : tensor<1x32768xf32, #blocked1> loc(#loc106) + %delta_27 = tt.broadcast %delta : tensor<1x1xi1, #blocked1> -> tensor<1x32768xi1, #blocked1> loc(#loc107) + %delta_28 = arith.select %delta_27, %cst_4, %delta_26 : tensor<1x32768xi1, #blocked1>, tensor<1x32768xf32, #blocked1> loc(#loc107) + %out_sum_29 = tt.extern_elementwise %delta_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc117) + %out_sum_30 = arith.mulf %_tmp3_sum, %out_sum_29 : tensor<1x32768xf32, #blocked1> loc(#loc109) + %out_sum_31 = "tt.reduce"(%out_sum_30) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_34: f32 loc(callsite(#loc1 at #loc110)), %out_sum_35: f32 loc(callsite(#loc1 at #loc110))): + %out_sum_36 = arith.addf %out_sum_34, %out_sum_35 : f32 loc(#loc124) + tt.reduce.return %out_sum_36 : f32 loc(#loc118) + }) : (tensor<1x32768xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc118) + %tmp4 = tt.expand_dims %out_sum_31 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc84) + %tmp5 = tt.load %tmp0_15, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr, #blocked1> loc(#loc85) + %tmp5_32 = arith.extf %tmp5 : tensor<1x32768xbf16, #blocked1> to tensor<1x32768xf32, #blocked1> loc(#loc86) + %tmp7 = arith.subf %tmp5_32, %delta_25 : tensor<1x32768xf32, #blocked1> loc(#loc87) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32, #blocked1>) -> tensor<1x32768xf32, #blocked1> loc(#loc88) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked1> -> tensor<1x32768xf32, #blocked1> loc(#loc89) + %tmp9_33 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32, #blocked1> loc(#loc89) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr, #blocked> loc(#loc46) + %1 = tt.addptr %0, %tmp0_13 : tensor<1x32768x!tt.ptr, #blocked>, tensor<1x32768xi32, #blocked> loc(#loc46) + %2 = ttg.convert_layout %tmp9_33 : tensor<1x32768xf32, #blocked1> -> tensor<1x32768xf32, #blocked> loc(#loc47) + tt.store %1, %2, %r0_mask_9 : tensor<1x32768x!tt.ptr, #blocked> loc(#loc47) + tt.return loc(#loc48) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc10 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc11 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc53 = loc("xoffset"(#loc2)) +#loc54 = loc("r0_base"(#loc3)) +#loc55 = loc("r0_mask"(#loc4)) +#loc56 = loc("tmp0"(#loc5)) +#loc57 = loc("tmp0"(#loc6)) +#loc58 = loc("tmp0"(#loc7)) +#loc59 = loc("tmp0"(#loc8)) +#loc60 = loc("tmp0"(#loc9)) +#loc61 = loc("mask"(#loc10)) +#loc62 = loc("out_max"(#loc11)) +#loc63 = loc("lhs_scale"(#loc14)) +#loc64 = loc("lhs_scale"(#loc15)) +#loc65 = loc("lhs_scale"(#loc17)) +#loc66 = loc("lhs_scale"(#loc18)) +#loc67 = loc("rhs_scale"(#loc19)) +#loc68 = loc("rhs_scale"(#loc20)) +#loc69 = loc("rhs_scale"(#loc21)) +#loc70 = loc("out_sum"(#loc22)) +#loc71 = loc("out_sum"(#loc23)) +#loc72 = loc("_tmp3_max"(#loc24)) +#loc73 = loc("_tmp3_sum"(#loc25)) +#loc75 = loc("mask"(#loc29)) +#loc76 = loc("mask"(#loc30)) +#loc77 = loc("out_max_keepdim"(#loc31)) +#loc78 = loc("delta"(#loc32)) +#loc79 = loc("delta"(#loc33)) +#loc80 = loc("delta"(#loc34)) +#loc81 = loc("out_sum"(#loc35)) +#loc82 = loc("out_sum"(#loc36)) +#loc84 = loc("tmp4"(#loc40)) +#loc85 = loc("tmp5"(#loc41)) +#loc86 = loc("tmp5"(#loc42)) +#loc87 = loc("tmp7"(#loc43)) +#loc88 = loc("tmp8"(#loc44)) +#loc89 = loc("tmp9"(#loc45)) +#loc90 = loc(fused[#loc57, #loc56]) +#loc91 = loc("mask"(#loc61)) +#loc92 = loc(callsite(#loc62 at #loc12)) +#loc93 = loc(callsite(#loc63 at #loc12)) +#loc94 = loc(callsite(#loc64 at #loc12)) +#loc95 = loc(callsite(#loc65 at #loc12)) +#loc96 = loc(callsite(#loc66 at #loc12)) +#loc97 = loc(callsite(#loc67 at #loc12)) +#loc98 = loc(callsite(#loc68 at #loc12)) +#loc99 = loc(callsite(#loc69 at #loc12)) +#loc100 = loc(callsite(#loc70 at #loc12)) +#loc101 = loc(callsite(#loc71 at #loc12)) +#loc103 = loc("mask"(#loc76)) +#loc104 = loc(callsite(#loc77 at #loc28)) +#loc105 = loc(callsite(#loc78 at #loc28)) +#loc106 = loc(callsite(#loc79 at #loc28)) +#loc107 = loc(callsite(#loc80 at #loc28)) +#loc108 = loc(callsite(#loc81 at #loc28)) +#loc109 = loc(callsite(#loc82 at #loc28)) +#loc111 = loc(callsite(#loc91 at #loc92)) +#loc112 = loc(callsite(#loc13 at #loc92)) +#loc113 = loc(callsite(#loc16 at #loc95)) +#loc114 = loc(callsite(#loc16 at #loc98)) +#loc115 = loc(callsite(#loc26 at #loc102)) +#loc117 = loc(callsite(#loc16 at #loc108)) +#loc118 = loc(callsite(#loc37 at #loc110)) +#loc120 = loc(callsite(#loc91 at #loc115)) +#loc121 = loc(callsite(#loc75 at #loc115)) +#loc122 = loc(callsite(#loc103 at #loc115)) +#loc123 = loc(callsite(#loc13 at #loc115)) +#loc124 = loc(callsite(#loc39 at #loc118)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..04e94ec78cbaa2e3bf52eae6f1703285d508df22 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZEALXMKAD2NIID6WEZPSGCYCWT2KVT2MI4XFCYLFLKZA6ZQUDYJA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,195 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":18:0) +#loc1 = loc(unknown) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":49:33) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc50 = loc("in_ptr0"(#loc)) +#loc51 = loc("out_ptr2"(#loc)) +#loc52 = loc("xnumel"(#loc)) +#loc53 = loc("r0_numel"(#loc)) +#loc78 = loc("out_max"(#loc30)) +#loc86 = loc("out_sum"(#loc39)) +#loc106 = loc(callsite(#loc78 at #loc3)) +#loc113 = loc(callsite(#loc86 at #loc3)) +#loc119 = loc(callsite(#loc1 at #loc106)) +#loc122 = loc(callsite(#loc1 at #loc113)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc93) + %cst = arith.constant dense<1.000000e+00> : tensor<1x32768xf32> loc(#loc55) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x32768xf32> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x32768xbf16> loc(#loc1) + %cst_2 = arith.constant dense<32000> : tensor<1x32768xi32> loc(#loc1) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x32768xf32> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc56) + %r0_base = tt.make_range {end = 32768 : i32, start = 0 : i32} : tensor<32768xi32> loc(#loc57) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<32768xi32> -> tensor<1x32768xi32> loc(#loc58) + %r0_mask = arith.cmpi slt, %r0_base_4, %cst_2 : tensor<1x32768xi32> loc(#loc59) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc60) + %tmp0_5 = tt.splat %tmp0 : i32 -> tensor<1x32768xi32> loc(#loc94) + %tmp0_6 = arith.addi %r0_base_4, %tmp0_5 : tensor<1x32768xi32> loc(#loc61) + %tmp0_7 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc62) + %tmp0_8 = tt.addptr %tmp0_7, %tmp0_6 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc62) + %tmp0_9 = tt.load %tmp0_8, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x32768x!tt.ptr> loc(#loc63) + %tmp0_10 = arith.extf %tmp0_9 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc64) + %mask = arith.cmpf ogt, %cst_3, %tmp0_10 : tensor<1x32768xf32> loc(#loc114) + %out_max = arith.select %mask, %cst_3, %tmp0_10 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc115) + %lhs_scale = arith.cmpf oeq, %out_max, %cst_3 : tensor<1x32768xf32> loc(#loc97) + %lhs_scale_11 = arith.subf %cst_3, %out_max : tensor<1x32768xf32> loc(#loc98) + %lhs_scale_12 = tt.extern_elementwise %lhs_scale_11 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc116) + %lhs_scale_13 = arith.select %lhs_scale, %cst, %lhs_scale_12 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc100) + %rhs_scale = arith.subf %tmp0_10, %out_max : tensor<1x32768xf32> loc(#loc101) + %rhs_scale_14 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc117) + %rhs_scale_15 = arith.select %lhs_scale, %cst, %rhs_scale_14 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc103) + %out_sum = arith.mulf %lhs_scale_13, %cst_0 : tensor<1x32768xf32> loc(#loc104) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x32768xf32> loc(#loc105) + %_tmp3_max = arith.select %r0_mask, %out_max, %cst_3 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc76) + %_tmp3_sum = arith.select %r0_mask, %out_sum_16, %cst_0 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc77) + %out_max_17 = "tt.reduce"(%_tmp3_max) <{axis = 1 : i32}> ({ + ^bb0(%out_max_28: f32 loc(callsite(#loc1 at #loc106)), %out_max_29: f32 loc(callsite(#loc1 at #loc106))): + %mask_30 = arith.cmpf ogt, %out_max_28, %out_max_29 : f32 loc(#loc123) + %mask_31 = arith.cmpf une, %out_max_28, %out_max_28 : f32 loc(#loc124) + %mask_32 = arith.ori %mask_30, %mask_31 : i1 loc(#loc125) + %out_max_33 = arith.select %mask_32, %out_max_28, %out_max_29 : f32 loc(#loc126) + tt.reduce.return %out_max_33 : f32 loc(#loc118) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc118) + %out_max_keepdim = tt.expand_dims %out_max_17 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc108) + %delta_18 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc93) + %delta_19 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc109) + %delta_20 = arith.subf %_tmp3_max, %delta_19 : tensor<1x32768xf32> loc(#loc109) + %delta_21 = tt.broadcast %delta_18 : tensor<1x1xi1> -> tensor<1x32768xi1> loc(#loc110) + %delta_22 = arith.select %delta_21, %cst_0, %delta_20 : tensor<1x32768xi1>, tensor<1x32768xf32> loc(#loc110) + %out_sum_23 = tt.extern_elementwise %delta_22 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc120) + %out_sum_24 = arith.mulf %_tmp3_sum, %out_sum_23 : tensor<1x32768xf32> loc(#loc112) + %out_sum_25 = "tt.reduce"(%out_sum_24) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_28: f32 loc(callsite(#loc1 at #loc113)), %out_sum_29: f32 loc(callsite(#loc1 at #loc113))): + %out_sum_30 = arith.addf %out_sum_28, %out_sum_29 : f32 loc(#loc127) + tt.reduce.return %out_sum_30 : f32 loc(#loc121) + }) : (tensor<1x32768xf32>) -> tensor<1xf32> loc(#loc121) + %tmp4 = tt.expand_dims %out_sum_25 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc87) + %tmp5 = tt.load %tmp0_8, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x32768x!tt.ptr> loc(#loc88) + %tmp5_26 = arith.extf %tmp5 : tensor<1x32768xbf16> to tensor<1x32768xf32> loc(#loc89) + %tmp7 = arith.subf %tmp5_26, %delta_19 : tensor<1x32768xf32> loc(#loc90) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x32768xf32>) -> tensor<1x32768xf32> loc(#loc91) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x32768xf32> loc(#loc92) + %tmp9_27 = arith.divf %tmp8, %tmp9 : tensor<1x32768xf32> loc(#loc92) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x32768x!tt.ptr> loc(#loc47) + %1 = tt.addptr %0, %tmp0_6 : tensor<1x32768x!tt.ptr>, tensor<1x32768xi32> loc(#loc47) + tt.store %1, %tmp9_27, %r0_mask : tensor<1x32768x!tt.ptr> loc(#loc48) + tt.return loc(#loc49) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":42:40) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":23:28) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":26:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":33:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:47) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:41) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:34) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:52) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":37:105) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":45:54) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":46:54) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":51:16) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:52) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":58:106) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":60:22) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":61:29) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":62:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":63:53) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/cz/cczmmrlu3u67jn3gitfwyu3jif3b3pwoxj37xp4ddackcufwiqri.py":52:4) +#loc54 = loc("delta"(#loc2)) +#loc55 = loc(callsite(#loc1 at #loc4)) +#loc56 = loc("xoffset"(#loc5)) +#loc57 = loc("r0_base"(#loc6)) +#loc58 = loc("r0_base"(#loc7)) +#loc59 = loc("r0_mask"(#loc8)) +#loc60 = loc("tmp0"(#loc9)) +#loc61 = loc("tmp0"(#loc10)) +#loc62 = loc("tmp0"(#loc11)) +#loc63 = loc("tmp0"(#loc12)) +#loc64 = loc("tmp0"(#loc13)) +#loc65 = loc("mask"(#loc14)) +#loc66 = loc("out_max"(#loc15)) +#loc67 = loc("lhs_scale"(#loc17)) +#loc68 = loc("lhs_scale"(#loc18)) +#loc69 = loc("lhs_scale"(#loc20)) +#loc70 = loc("lhs_scale"(#loc21)) +#loc71 = loc("rhs_scale"(#loc22)) +#loc72 = loc("rhs_scale"(#loc23)) +#loc73 = loc("rhs_scale"(#loc24)) +#loc74 = loc("out_sum"(#loc25)) +#loc75 = loc("out_sum"(#loc26)) +#loc76 = loc("_tmp3_max"(#loc27)) +#loc77 = loc("_tmp3_sum"(#loc28)) +#loc79 = loc("mask"(#loc31)) +#loc80 = loc("mask"(#loc32)) +#loc81 = loc("out_max_keepdim"(#loc33)) +#loc82 = loc("delta"(#loc34)) +#loc83 = loc("delta"(#loc35)) +#loc84 = loc("out_sum"(#loc36)) +#loc85 = loc("out_sum"(#loc37)) +#loc87 = loc("tmp4"(#loc41)) +#loc88 = loc("tmp5"(#loc42)) +#loc89 = loc("tmp5"(#loc43)) +#loc90 = loc("tmp7"(#loc44)) +#loc91 = loc("tmp8"(#loc45)) +#loc92 = loc("tmp9"(#loc46)) +#loc93 = loc(callsite(#loc54 at #loc3)) +#loc94 = loc(fused[#loc61, #loc60]) +#loc95 = loc("mask"(#loc65)) +#loc96 = loc(callsite(#loc66 at #loc4)) +#loc97 = loc(callsite(#loc67 at #loc4)) +#loc98 = loc(callsite(#loc68 at #loc4)) +#loc99 = loc(callsite(#loc69 at #loc4)) +#loc100 = loc(callsite(#loc70 at #loc4)) +#loc101 = loc(callsite(#loc71 at #loc4)) +#loc102 = loc(callsite(#loc72 at #loc4)) +#loc103 = loc(callsite(#loc73 at #loc4)) +#loc104 = loc(callsite(#loc74 at #loc4)) +#loc105 = loc(callsite(#loc75 at #loc4)) +#loc107 = loc("mask"(#loc80)) +#loc108 = loc(callsite(#loc81 at #loc3)) +#loc109 = loc(callsite(#loc82 at #loc3)) +#loc110 = loc(callsite(#loc83 at #loc3)) +#loc111 = loc(callsite(#loc84 at #loc3)) +#loc112 = loc(callsite(#loc85 at #loc3)) +#loc114 = loc(callsite(#loc95 at #loc96)) +#loc115 = loc(callsite(#loc16 at #loc96)) +#loc116 = loc(callsite(#loc19 at #loc99)) +#loc117 = loc(callsite(#loc19 at #loc102)) +#loc118 = loc(callsite(#loc29 at #loc106)) +#loc120 = loc(callsite(#loc19 at #loc111)) +#loc121 = loc(callsite(#loc38 at #loc113)) +#loc123 = loc(callsite(#loc95 at #loc118)) +#loc124 = loc(callsite(#loc79 at #loc118)) +#loc125 = loc(callsite(#loc107 at #loc118)) +#loc126 = loc(callsite(#loc16 at #loc118)) +#loc127 = loc(callsite(#loc40 at #loc121)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5d45650382f404c5e928ce5f4a3ba9583d52a296 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/__grp__triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.cubin", "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5ac27e76f0f4ef1d2814ef6057b620f64d6612d7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.json @@ -0,0 +1 @@ +{"hash": "cbeeb05ece6de899def1fce9a8d0fb3230af73641873680820e75339d05b1c88", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16384, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..0b4f611fd1fa373eff27966d61a5f74f6726677f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.llir @@ -0,0 +1,5386 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) readnone captures(none) %5) local_unnamed_addr #0 !dbg !5 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %9 = and i32 %8, 511, !dbg !9 + %10 = shl nuw nsw i32 %9, 3, !dbg !9 + %11 = or disjoint i32 %10, 4096, !dbg !9 + %12 = or disjoint i32 %10, 8192, !dbg !9 + %13 = or disjoint i32 %10, 12288, !dbg !9 + %14 = mul i32 %7, 32000, !dbg !10 + %15 = zext nneg i32 %13 to i64, !dbg !11 + br label %16, !dbg !11 + +16: ; preds = %6, %16 + %17 = phi i1 [ true, %6 ], [ false, %16 ] + %indvars.iv = phi i64 [ 0, %6 ], [ 16384, %16 ] + %18 = phi float [ 0.000000e+00, %6 ], [ %1512, %16 ] + %19 = phi float [ 0.000000e+00, %6 ], [ %1513, %16 ] + %20 = phi float [ 0.000000e+00, %6 ], [ %1514, %16 ] + %21 = phi float [ 0.000000e+00, %6 ], [ %1515, %16 ] + %22 = phi float [ 0.000000e+00, %6 ], [ %1516, %16 ] + %23 = phi float [ 0.000000e+00, %6 ], [ %1517, %16 ] + %24 = phi float [ 0.000000e+00, %6 ], [ %1518, %16 ] + %25 = phi float [ 0.000000e+00, %6 ], [ %1519, %16 ] + %26 = phi float [ 0.000000e+00, %6 ], [ %1820, %16 ] + %27 = phi float [ 0.000000e+00, %6 ], [ %1821, %16 ] + %28 = phi float [ 0xFFF0000000000000, %6 ], [ %1813, %16 ] + %29 = phi float [ 0xFFF0000000000000, %6 ], [ %1814, %16 ] + %30 = phi <2 x float> [ zeroinitializer, %6 ], [ %1824, %16 ] + %31 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1819, %16 ] + %32 = phi <2 x float> [ zeroinitializer, %6 ], [ %1823, %16 ] + %33 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1818, %16 ] + %34 = phi <2 x float> [ zeroinitializer, %6 ], [ %1822, %16 ] + %35 = phi <2 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %1817, %16 ] + %36 = phi <16 x float> [ zeroinitializer, %6 ], [ %1511, %16 ] + %37 = phi <16 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %876, %16 ] + %38 = phi <8 x float> [ splat (float 0xFFF0000000000000), %6 ], [ %127, %16 ] + %39 = or disjoint i64 %indvars.iv, %15, !dbg !12 + %40 = icmp samesign ult i64 %39, 32000, !dbg !13 + %41 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %42 = or disjoint i32 %10, %41, !dbg !14 + %43 = add i32 %42, %14, !dbg !14 + %44 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %45 = or disjoint i32 %11, %44, !dbg !14 + %46 = add i32 %45, %14, !dbg !14 + %47 = trunc nuw nsw i64 %indvars.iv to i32, !dbg !14 + %48 = or disjoint i32 %12, %47, !dbg !14 + %49 = add i32 %48, %14, !dbg !14 + %50 = trunc nuw nsw i64 %39 to i32, !dbg !14 + %51 = add i32 %14, %50, !dbg !14 + %52 = sext i32 %43 to i64, !dbg !15 + %53 = getelementptr bfloat, ptr addrspace(1) %0, i64 %52, !dbg !15 + %54 = sext i32 %46 to i64, !dbg !15 + %55 = getelementptr bfloat, ptr addrspace(1) %0, i64 %54, !dbg !15 + %56 = sext i32 %49 to i64, !dbg !15 + %57 = getelementptr bfloat, ptr addrspace(1) %0, i64 %56, !dbg !15 + %58 = sext i32 %51 to i64, !dbg !15 + %59 = getelementptr bfloat, ptr addrspace(1) %0, i64 %58, !dbg !15 + %60 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %61 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %53, i64 %60, i1 true) #6, !dbg !16 + %62 = extractvalue { i32, i32, i32, i32 } %61, 0, !dbg !16 + %63 = bitcast i32 %62 to <2 x bfloat>, !dbg !16 + %64 = extractvalue { i32, i32, i32, i32 } %61, 1, !dbg !16 + %65 = bitcast i32 %64 to <2 x bfloat>, !dbg !16 + %66 = extractvalue { i32, i32, i32, i32 } %61, 2, !dbg !16 + %67 = bitcast i32 %66 to <2 x bfloat>, !dbg !16 + %68 = extractvalue { i32, i32, i32, i32 } %61, 3, !dbg !16 + %69 = bitcast i32 %68 to <2 x bfloat>, !dbg !16 + %70 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %71 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %55, i64 %70, i1 true) #6, !dbg !16 + %72 = extractvalue { i32, i32, i32, i32 } %71, 0, !dbg !16 + %73 = bitcast i32 %72 to <2 x bfloat>, !dbg !16 + %74 = extractvalue { i32, i32, i32, i32 } %71, 1, !dbg !16 + %75 = bitcast i32 %74 to <2 x bfloat>, !dbg !16 + %76 = extractvalue { i32, i32, i32, i32 } %71, 2, !dbg !16 + %77 = bitcast i32 %76 to <2 x bfloat>, !dbg !16 + %78 = extractvalue { i32, i32, i32, i32 } %71, 3, !dbg !16 + %79 = bitcast i32 %78 to <2 x bfloat>, !dbg !16 + %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %81 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %57, i64 %80, i1 true) #6, !dbg !16 + %82 = extractvalue { i32, i32, i32, i32 } %81, 0, !dbg !16 + %83 = bitcast i32 %82 to <2 x bfloat>, !dbg !16 + %84 = extractvalue { i32, i32, i32, i32 } %81, 1, !dbg !16 + %85 = bitcast i32 %84 to <2 x bfloat>, !dbg !16 + %86 = extractvalue { i32, i32, i32, i32 } %81, 2, !dbg !16 + %87 = bitcast i32 %86 to <2 x bfloat>, !dbg !16 + %88 = extractvalue { i32, i32, i32, i32 } %81, 3, !dbg !16 + %89 = bitcast i32 %88 to <2 x bfloat>, !dbg !16 + %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #6, !dbg !16 + %91 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %59, i64 %90, i1 %40) #6, !dbg !16 + %92 = extractvalue { i32, i32, i32, i32 } %91, 0, !dbg !16 + %93 = bitcast i32 %92 to <2 x bfloat>, !dbg !16 + %94 = extractvalue { i32, i32, i32, i32 } %91, 1, !dbg !16 + %95 = bitcast i32 %94 to <2 x bfloat>, !dbg !16 + %96 = extractvalue { i32, i32, i32, i32 } %91, 2, !dbg !16 + %97 = bitcast i32 %96 to <2 x bfloat>, !dbg !16 + %98 = extractvalue { i32, i32, i32, i32 } %91, 3, !dbg !16 + %99 = bitcast i32 %98 to <2 x bfloat>, !dbg !16 + %100 = extractelement <2 x bfloat> %93, i64 0, !dbg !16 + %101 = extractelement <2 x bfloat> %93, i64 1, !dbg !16 + %102 = fpext bfloat %100 to float, !dbg !17 + %103 = fpext bfloat %101 to float, !dbg !17 + %104 = fcmp ogt float %28, %102, !dbg !18 + %105 = fcmp ogt float %29, %103, !dbg !18 + %106 = fcmp uno <16 x float> %37, zeroinitializer, !dbg !22 + %107 = fcmp uno <8 x float> %38, zeroinitializer, !dbg !22 + %108 = fcmp uno float %28, 0.000000e+00, !dbg !22 + %109 = fcmp uno float %29, 0.000000e+00, !dbg !22 + %110 = fcmp uno <2 x float> %35, zeroinitializer, !dbg !22 + %111 = fcmp uno <2 x float> %33, zeroinitializer, !dbg !22 + %112 = fcmp uno <2 x float> %31, zeroinitializer, !dbg !22 + %113 = or i1 %108, %104, !dbg !23 + %114 = or i1 %109, %105, !dbg !23 + %115 = shufflevector <2 x bfloat> %83, <2 x bfloat> %85, <8 x i32> , !dbg !17 + %116 = shufflevector <2 x bfloat> %87, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %117 = shufflevector <8 x bfloat> %115, <8 x bfloat> %116, <8 x i32> , !dbg !17 + %118 = shufflevector <2 x bfloat> %87, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %119 = shufflevector <8 x bfloat> %117, <8 x bfloat> %118, <8 x i32> , !dbg !17 + %120 = shufflevector <2 x bfloat> %89, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %121 = shufflevector <8 x bfloat> %119, <8 x bfloat> %120, <8 x i32> , !dbg !17 + %122 = shufflevector <2 x bfloat> %89, <2 x bfloat> poison, <8 x i32> , !dbg !17 + %123 = shufflevector <8 x bfloat> %121, <8 x bfloat> %122, <8 x i32> , !dbg !17 + %124 = fpext <8 x bfloat> %123 to <8 x float>, !dbg !17 + %125 = fcmp ogt <8 x float> %38, %124, !dbg !18 + %126 = or <8 x i1> %107, %125, !dbg !23 + %127 = select <8 x i1> %126, <8 x float> %38, <8 x float> %124, !dbg !24 + %128 = select i1 %113, float %28, float %102, !dbg !24 + %129 = select i1 %114, float %29, float %103, !dbg !24 + %130 = extractelement <8 x float> %127, i64 0, !dbg !25 + %131 = fcmp oeq float %130, 0xFFF0000000000000, !dbg !26 + %132 = extractelement <8 x float> %127, i64 1, !dbg !25 + %133 = fcmp oeq float %132, 0xFFF0000000000000, !dbg !26 + %134 = extractelement <8 x float> %127, i64 2, !dbg !25 + %135 = fcmp oeq float %134, 0xFFF0000000000000, !dbg !26 + %136 = extractelement <8 x float> %127, i64 3, !dbg !25 + %137 = fcmp oeq float %136, 0xFFF0000000000000, !dbg !26 + %138 = extractelement <8 x float> %127, i64 4, !dbg !25 + %139 = fcmp oeq float %138, 0xFFF0000000000000, !dbg !26 + %140 = extractelement <8 x float> %127, i64 5, !dbg !25 + %141 = fcmp oeq float %140, 0xFFF0000000000000, !dbg !26 + %142 = extractelement <8 x float> %127, i64 6, !dbg !25 + %143 = fcmp oeq float %142, 0xFFF0000000000000, !dbg !26 + %144 = extractelement <8 x float> %127, i64 7, !dbg !25 + %145 = fcmp oeq float %144, 0xFFF0000000000000, !dbg !26 + %146 = fcmp oeq float %128, 0xFFF0000000000000, !dbg !26 + %147 = fcmp oeq float %129, 0xFFF0000000000000, !dbg !26 + %foldExtExtBinop = fsub <8 x float> %38, %127, !dbg !27 + %148 = extractelement <8 x float> %foldExtExtBinop, i64 0, !dbg !27 + %foldExtExtBinop1527 = fsub <8 x float> %38, %127, !dbg !27 + %149 = extractelement <8 x float> %foldExtExtBinop1527, i64 1, !dbg !27 + %foldExtExtBinop1529 = fsub <8 x float> %38, %127, !dbg !27 + %150 = extractelement <8 x float> %foldExtExtBinop1529, i64 2, !dbg !27 + %foldExtExtBinop1531 = fsub <8 x float> %38, %127, !dbg !27 + %151 = extractelement <8 x float> %foldExtExtBinop1531, i64 3, !dbg !27 + %foldExtExtBinop1533 = fsub <8 x float> %38, %127, !dbg !27 + %152 = extractelement <8 x float> %foldExtExtBinop1533, i64 4, !dbg !27 + %foldExtExtBinop1535 = fsub <8 x float> %38, %127, !dbg !27 + %153 = extractelement <8 x float> %foldExtExtBinop1535, i64 5, !dbg !27 + %foldExtExtBinop1537 = fsub <8 x float> %38, %127, !dbg !27 + %154 = extractelement <8 x float> %foldExtExtBinop1537, i64 6, !dbg !27 + %foldExtExtBinop1539 = fsub <8 x float> %38, %127, !dbg !27 + %155 = extractelement <8 x float> %foldExtExtBinop1539, i64 7, !dbg !27 + %156 = fsub float %28, %128, !dbg !27 + %157 = fsub float %29, %129, !dbg !27 + %158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i694 = icmp eq i32 %158, 0, !dbg !28 + %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i696 = icmp eq i32 %159, 0, !dbg !28 + %160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %161 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i700 = icmp eq i32 %161, 0, !dbg !28 + %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i702 = icmp eq i32 %162, 0, !dbg !28 + %163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i704 = icmp eq i32 %163, 0, !dbg !28 + %164 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i706 = icmp eq i32 %164, 0, !dbg !28 + %165 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i710 = icmp eq i32 %166, 0, !dbg !28 + %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i712 = icmp eq i32 %167, 0, !dbg !28 + %168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i714 = icmp eq i32 %168, 0, !dbg !28 + %169 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i716 = icmp eq i32 %169, 0, !dbg !28 + %170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i720 = icmp eq i32 %171, 0, !dbg !28 + %172 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i722 = icmp eq i32 %172, 0, !dbg !28 + %173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i724 = icmp eq i32 %173, 0, !dbg !28 + %174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i726 = icmp eq i32 %174, 0, !dbg !28 + %175 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i730 = icmp eq i32 %176, 0, !dbg !28 + %177 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i732 = icmp eq i32 %177, 0, !dbg !28 + %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i734 = icmp eq i32 %178, 0, !dbg !28 + %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i736 = icmp eq i32 %179, 0, !dbg !28 + %180 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %181 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i740 = icmp eq i32 %181, 0, !dbg !28 + %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i742 = icmp eq i32 %182, 0, !dbg !28 + %183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i744 = icmp eq i32 %183, 0, !dbg !28 + %184 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i746 = icmp eq i32 %184, 0, !dbg !28 + %185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i750 = icmp eq i32 %186, 0, !dbg !28 + %187 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i752 = icmp eq i32 %187, 0, !dbg !28 + %188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i754 = icmp eq i32 %188, 0, !dbg !28 + %189 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i756 = icmp eq i32 %189, 0, !dbg !28 + %190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %191 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i760 = icmp eq i32 %191, 0, !dbg !28 + %192 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i762 = icmp eq i32 %192, 0, !dbg !28 + %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i764 = icmp eq i32 %193, 0, !dbg !28 + %194 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i766 = icmp eq i32 %194, 0, !dbg !28 + %195 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i770 = icmp eq i32 %196, 0, !dbg !28 + %197 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i772 = icmp eq i32 %197, 0, !dbg !28 + %198 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i774 = icmp eq i32 %198, 0, !dbg !28 + %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i776 = icmp eq i32 %199, 0, !dbg !28 + %200 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i780 = icmp eq i32 %201, 0, !dbg !28 + %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i782 = icmp eq i32 %202, 0, !dbg !28 + %203 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i784 = icmp eq i32 %203, 0, !dbg !28 + %204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i786 = icmp eq i32 %204, 0, !dbg !28 + %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %206 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i790 = icmp eq i32 %206, 0, !dbg !28 + %207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i792 = icmp eq i32 %207, 0, !dbg !28 + %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i794 = icmp eq i32 %208, 0, !dbg !28 + %209 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i796 = icmp eq i32 %209, 0, !dbg !28 + %210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i800 = icmp eq i32 %211, 0, !dbg !28 + %212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i802 = icmp eq i32 %212, 0, !dbg !28 + %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i804 = icmp eq i32 %213, 0, !dbg !28 + %214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i806 = icmp eq i32 %214, 0, !dbg !28 + %215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i810 = icmp eq i32 %216, 0, !dbg !28 + %217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i812 = icmp eq i32 %217, 0, !dbg !28 + %218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i814 = icmp eq i32 %218, 0, !dbg !28 + %219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i816 = icmp eq i32 %219, 0, !dbg !28 + %220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %221 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i820 = icmp eq i32 %221, 0, !dbg !28 + %222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i822 = icmp eq i32 %222, 0, !dbg !28 + %223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i824 = icmp eq i32 %223, 0, !dbg !28 + %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i826 = icmp eq i32 %224, 0, !dbg !28 + %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i830 = icmp eq i32 %226, 0, !dbg !28 + %227 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i832 = icmp eq i32 %227, 0, !dbg !28 + %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i834 = icmp eq i32 %228, 0, !dbg !28 + %229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i836 = icmp eq i32 %229, 0, !dbg !28 + %230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i840 = icmp eq i32 %231, 0, !dbg !28 + %232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i842 = icmp eq i32 %232, 0, !dbg !28 + %233 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i844 = icmp eq i32 %233, 0, !dbg !28 + %234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i846 = icmp eq i32 %234, 0, !dbg !28 + %235 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i850 = icmp eq i32 %236, 0, !dbg !28 + %237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i852 = icmp eq i32 %237, 0, !dbg !28 + %238 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i854 = icmp eq i32 %238, 0, !dbg !28 + %239 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %240 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i855 = select i1 %.not.i854, float %240, float %239, !dbg !28 + %241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i856 = icmp eq i32 %241, 0, !dbg !28 + %242 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i855) #6, !dbg !28 + %243 = tail call float @llvm.nvvm.saturate.f(float %.02.i855) #6, !dbg !28 + %.03.i857 = select i1 %.not1.i856, float %243, float %242, !dbg !28 + %244 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i858 = icmp eq i32 %244, 0, !dbg !28 + %245 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i857, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %246 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i857, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i859 = select i1 %.not2.i858, float %246, float %245, !dbg !28 + %247 = fadd float %.04.i859, 0xC168000FE0000000, !dbg !28 + %248 = fneg float %247, !dbg !28 + %249 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i860 = icmp eq i32 %249, 0, !dbg !28 + %250 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3FF7154760000000, float %248) #6, !dbg !28 + %251 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3FF7154760000000, float %248) #6, !dbg !28 + %.0.i861 = select i1 %.not3.i860, float %251, float %250, !dbg !28 + %252 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i862 = icmp eq i32 %252, 0, !dbg !28 + %253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %148, float 0x3E54AE0C00000000, float %.0.i861) #6, !dbg !28 + %254 = tail call float @llvm.nvvm.fma.rn.f(float %148, float 0x3E54AE0C00000000, float %.0.i861) #6, !dbg !28 + %.01.i863 = select i1 %.not4.i862, float %254, float %253, !dbg !28 + %255 = bitcast float %.04.i859 to i32, !dbg !28 + %256 = shl i32 %255, 23, !dbg !28 + %257 = bitcast i32 %256 to float, !dbg !28 + %258 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i863) #6, !dbg !28 + %259 = fmul float %258, %257, !dbg !28 + %260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i864 = icmp eq i32 %260, 0, !dbg !28 + %261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %262 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i865 = select i1 %.not.i864, float %262, float %261, !dbg !28 + %263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i866 = icmp eq i32 %263, 0, !dbg !28 + %264 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i865) #6, !dbg !28 + %265 = tail call float @llvm.nvvm.saturate.f(float %.02.i865) #6, !dbg !28 + %.03.i867 = select i1 %.not1.i866, float %265, float %264, !dbg !28 + %266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i868 = icmp eq i32 %266, 0, !dbg !28 + %267 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i867, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %268 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i867, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i869 = select i1 %.not2.i868, float %268, float %267, !dbg !28 + %269 = fadd float %.04.i869, 0xC168000FE0000000, !dbg !28 + %270 = fneg float %269, !dbg !28 + %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i870 = icmp eq i32 %271, 0, !dbg !28 + %272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3FF7154760000000, float %270) #6, !dbg !28 + %273 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3FF7154760000000, float %270) #6, !dbg !28 + %.0.i871 = select i1 %.not3.i870, float %273, float %272, !dbg !28 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i872 = icmp eq i32 %274, 0, !dbg !28 + %275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %149, float 0x3E54AE0C00000000, float %.0.i871) #6, !dbg !28 + %276 = tail call float @llvm.nvvm.fma.rn.f(float %149, float 0x3E54AE0C00000000, float %.0.i871) #6, !dbg !28 + %.01.i873 = select i1 %.not4.i872, float %276, float %275, !dbg !28 + %277 = bitcast float %.04.i869 to i32, !dbg !28 + %278 = shl i32 %277, 23, !dbg !28 + %279 = bitcast i32 %278 to float, !dbg !28 + %280 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i873) #6, !dbg !28 + %281 = fmul float %280, %279, !dbg !28 + %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i874 = icmp eq i32 %282, 0, !dbg !28 + %283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %284 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i875 = select i1 %.not.i874, float %284, float %283, !dbg !28 + %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i876 = icmp eq i32 %285, 0, !dbg !28 + %286 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i875) #6, !dbg !28 + %287 = tail call float @llvm.nvvm.saturate.f(float %.02.i875) #6, !dbg !28 + %.03.i877 = select i1 %.not1.i876, float %287, float %286, !dbg !28 + %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i878 = icmp eq i32 %288, 0, !dbg !28 + %289 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i877, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %290 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i877, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i879 = select i1 %.not2.i878, float %290, float %289, !dbg !28 + %291 = fadd float %.04.i879, 0xC168000FE0000000, !dbg !28 + %292 = fneg float %291, !dbg !28 + %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i880 = icmp eq i32 %293, 0, !dbg !28 + %294 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3FF7154760000000, float %292) #6, !dbg !28 + %295 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3FF7154760000000, float %292) #6, !dbg !28 + %.0.i881 = select i1 %.not3.i880, float %295, float %294, !dbg !28 + %296 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i882 = icmp eq i32 %296, 0, !dbg !28 + %297 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %150, float 0x3E54AE0C00000000, float %.0.i881) #6, !dbg !28 + %298 = tail call float @llvm.nvvm.fma.rn.f(float %150, float 0x3E54AE0C00000000, float %.0.i881) #6, !dbg !28 + %.01.i883 = select i1 %.not4.i882, float %298, float %297, !dbg !28 + %299 = bitcast float %.04.i879 to i32, !dbg !28 + %300 = shl i32 %299, 23, !dbg !28 + %301 = bitcast i32 %300 to float, !dbg !28 + %302 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i883) #6, !dbg !28 + %303 = fmul float %302, %301, !dbg !28 + %304 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i884 = icmp eq i32 %304, 0, !dbg !28 + %305 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %306 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i885 = select i1 %.not.i884, float %306, float %305, !dbg !28 + %307 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i886 = icmp eq i32 %307, 0, !dbg !28 + %308 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i885) #6, !dbg !28 + %309 = tail call float @llvm.nvvm.saturate.f(float %.02.i885) #6, !dbg !28 + %.03.i887 = select i1 %.not1.i886, float %309, float %308, !dbg !28 + %310 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i888 = icmp eq i32 %310, 0, !dbg !28 + %311 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i887, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %312 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i887, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i889 = select i1 %.not2.i888, float %312, float %311, !dbg !28 + %313 = fadd float %.04.i889, 0xC168000FE0000000, !dbg !28 + %314 = fneg float %313, !dbg !28 + %315 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i890 = icmp eq i32 %315, 0, !dbg !28 + %316 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3FF7154760000000, float %314) #6, !dbg !28 + %317 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3FF7154760000000, float %314) #6, !dbg !28 + %.0.i891 = select i1 %.not3.i890, float %317, float %316, !dbg !28 + %318 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i892 = icmp eq i32 %318, 0, !dbg !28 + %319 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %151, float 0x3E54AE0C00000000, float %.0.i891) #6, !dbg !28 + %320 = tail call float @llvm.nvvm.fma.rn.f(float %151, float 0x3E54AE0C00000000, float %.0.i891) #6, !dbg !28 + %.01.i893 = select i1 %.not4.i892, float %320, float %319, !dbg !28 + %321 = bitcast float %.04.i889 to i32, !dbg !28 + %322 = shl i32 %321, 23, !dbg !28 + %323 = bitcast i32 %322 to float, !dbg !28 + %324 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i893) #6, !dbg !28 + %325 = fmul float %324, %323, !dbg !28 + %326 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i894 = icmp eq i32 %326, 0, !dbg !28 + %327 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %328 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i895 = select i1 %.not.i894, float %328, float %327, !dbg !28 + %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i896 = icmp eq i32 %329, 0, !dbg !28 + %330 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i895) #6, !dbg !28 + %331 = tail call float @llvm.nvvm.saturate.f(float %.02.i895) #6, !dbg !28 + %.03.i897 = select i1 %.not1.i896, float %331, float %330, !dbg !28 + %332 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i898 = icmp eq i32 %332, 0, !dbg !28 + %333 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i897, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %334 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i897, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i899 = select i1 %.not2.i898, float %334, float %333, !dbg !28 + %335 = fadd float %.04.i899, 0xC168000FE0000000, !dbg !28 + %336 = fneg float %335, !dbg !28 + %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i900 = icmp eq i32 %337, 0, !dbg !28 + %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3FF7154760000000, float %336) #6, !dbg !28 + %339 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3FF7154760000000, float %336) #6, !dbg !28 + %.0.i901 = select i1 %.not3.i900, float %339, float %338, !dbg !28 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i902 = icmp eq i32 %340, 0, !dbg !28 + %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %152, float 0x3E54AE0C00000000, float %.0.i901) #6, !dbg !28 + %342 = tail call float @llvm.nvvm.fma.rn.f(float %152, float 0x3E54AE0C00000000, float %.0.i901) #6, !dbg !28 + %.01.i903 = select i1 %.not4.i902, float %342, float %341, !dbg !28 + %343 = bitcast float %.04.i899 to i32, !dbg !28 + %344 = shl i32 %343, 23, !dbg !28 + %345 = bitcast i32 %344 to float, !dbg !28 + %346 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i903) #6, !dbg !28 + %347 = fmul float %346, %345, !dbg !28 + %348 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i904 = icmp eq i32 %348, 0, !dbg !28 + %349 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %350 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i905 = select i1 %.not.i904, float %350, float %349, !dbg !28 + %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i906 = icmp eq i32 %351, 0, !dbg !28 + %352 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i905) #6, !dbg !28 + %353 = tail call float @llvm.nvvm.saturate.f(float %.02.i905) #6, !dbg !28 + %.03.i907 = select i1 %.not1.i906, float %353, float %352, !dbg !28 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i908 = icmp eq i32 %354, 0, !dbg !28 + %355 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i907, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %356 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i907, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i909 = select i1 %.not2.i908, float %356, float %355, !dbg !28 + %357 = fadd float %.04.i909, 0xC168000FE0000000, !dbg !28 + %358 = fneg float %357, !dbg !28 + %359 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i910 = icmp eq i32 %359, 0, !dbg !28 + %360 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3FF7154760000000, float %358) #6, !dbg !28 + %361 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3FF7154760000000, float %358) #6, !dbg !28 + %.0.i911 = select i1 %.not3.i910, float %361, float %360, !dbg !28 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i912 = icmp eq i32 %362, 0, !dbg !28 + %363 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %153, float 0x3E54AE0C00000000, float %.0.i911) #6, !dbg !28 + %364 = tail call float @llvm.nvvm.fma.rn.f(float %153, float 0x3E54AE0C00000000, float %.0.i911) #6, !dbg !28 + %.01.i913 = select i1 %.not4.i912, float %364, float %363, !dbg !28 + %365 = bitcast float %.04.i909 to i32, !dbg !28 + %366 = shl i32 %365, 23, !dbg !28 + %367 = bitcast i32 %366 to float, !dbg !28 + %368 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i913) #6, !dbg !28 + %369 = fmul float %368, %367, !dbg !28 + %370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i914 = icmp eq i32 %370, 0, !dbg !28 + %371 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %372 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i915 = select i1 %.not.i914, float %372, float %371, !dbg !28 + %373 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i916 = icmp eq i32 %373, 0, !dbg !28 + %374 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i915) #6, !dbg !28 + %375 = tail call float @llvm.nvvm.saturate.f(float %.02.i915) #6, !dbg !28 + %.03.i917 = select i1 %.not1.i916, float %375, float %374, !dbg !28 + %376 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i918 = icmp eq i32 %376, 0, !dbg !28 + %377 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i917, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %378 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i917, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i919 = select i1 %.not2.i918, float %378, float %377, !dbg !28 + %379 = fadd float %.04.i919, 0xC168000FE0000000, !dbg !28 + %380 = fneg float %379, !dbg !28 + %381 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i920 = icmp eq i32 %381, 0, !dbg !28 + %382 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3FF7154760000000, float %380) #6, !dbg !28 + %383 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3FF7154760000000, float %380) #6, !dbg !28 + %.0.i921 = select i1 %.not3.i920, float %383, float %382, !dbg !28 + %384 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i922 = icmp eq i32 %384, 0, !dbg !28 + %385 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %154, float 0x3E54AE0C00000000, float %.0.i921) #6, !dbg !28 + %386 = tail call float @llvm.nvvm.fma.rn.f(float %154, float 0x3E54AE0C00000000, float %.0.i921) #6, !dbg !28 + %.01.i923 = select i1 %.not4.i922, float %386, float %385, !dbg !28 + %387 = bitcast float %.04.i919 to i32, !dbg !28 + %388 = shl i32 %387, 23, !dbg !28 + %389 = bitcast i32 %388 to float, !dbg !28 + %390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i923) #6, !dbg !28 + %391 = fmul float %390, %389, !dbg !28 + %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i924 = icmp eq i32 %392, 0, !dbg !28 + %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %394 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i925 = select i1 %.not.i924, float %394, float %393, !dbg !28 + %395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i926 = icmp eq i32 %395, 0, !dbg !28 + %396 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i925) #6, !dbg !28 + %397 = tail call float @llvm.nvvm.saturate.f(float %.02.i925) #6, !dbg !28 + %.03.i927 = select i1 %.not1.i926, float %397, float %396, !dbg !28 + %398 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i928 = icmp eq i32 %398, 0, !dbg !28 + %399 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i927, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %400 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i927, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i929 = select i1 %.not2.i928, float %400, float %399, !dbg !28 + %401 = fadd float %.04.i929, 0xC168000FE0000000, !dbg !28 + %402 = fneg float %401, !dbg !28 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i930 = icmp eq i32 %403, 0, !dbg !28 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3FF7154760000000, float %402) #6, !dbg !28 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3FF7154760000000, float %402) #6, !dbg !28 + %.0.i931 = select i1 %.not3.i930, float %405, float %404, !dbg !28 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i932 = icmp eq i32 %406, 0, !dbg !28 + %407 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %155, float 0x3E54AE0C00000000, float %.0.i931) #6, !dbg !28 + %408 = tail call float @llvm.nvvm.fma.rn.f(float %155, float 0x3E54AE0C00000000, float %.0.i931) #6, !dbg !28 + %.01.i933 = select i1 %.not4.i932, float %408, float %407, !dbg !28 + %409 = bitcast float %.04.i929 to i32, !dbg !28 + %410 = shl i32 %409, 23, !dbg !28 + %411 = bitcast i32 %410 to float, !dbg !28 + %412 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i933) #6, !dbg !28 + %413 = fmul float %412, %411, !dbg !28 + %414 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i934 = icmp eq i32 %414, 0, !dbg !28 + %415 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %416 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i935 = select i1 %.not.i934, float %416, float %415, !dbg !28 + %417 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i936 = icmp eq i32 %417, 0, !dbg !28 + %418 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i935) #6, !dbg !28 + %419 = tail call float @llvm.nvvm.saturate.f(float %.02.i935) #6, !dbg !28 + %.03.i937 = select i1 %.not1.i936, float %419, float %418, !dbg !28 + %420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i938 = icmp eq i32 %420, 0, !dbg !28 + %421 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i937, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %422 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i937, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i939 = select i1 %.not2.i938, float %422, float %421, !dbg !28 + %423 = fadd float %.04.i939, 0xC168000FE0000000, !dbg !28 + %424 = fneg float %423, !dbg !28 + %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i940 = icmp eq i32 %425, 0, !dbg !28 + %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3FF7154760000000, float %424) #6, !dbg !28 + %427 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3FF7154760000000, float %424) #6, !dbg !28 + %.0.i941 = select i1 %.not3.i940, float %427, float %426, !dbg !28 + %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i942 = icmp eq i32 %428, 0, !dbg !28 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %156, float 0x3E54AE0C00000000, float %.0.i941) #6, !dbg !28 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %156, float 0x3E54AE0C00000000, float %.0.i941) #6, !dbg !28 + %.01.i943 = select i1 %.not4.i942, float %430, float %429, !dbg !28 + %431 = bitcast float %.04.i939 to i32, !dbg !28 + %432 = shl i32 %431, 23, !dbg !28 + %433 = bitcast i32 %432 to float, !dbg !28 + %434 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i943) #6, !dbg !28 + %435 = fmul float %434, %433, !dbg !28 + %436 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i944 = icmp eq i32 %436, 0, !dbg !28 + %437 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %438 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i945 = select i1 %.not.i944, float %438, float %437, !dbg !28 + %439 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i946 = icmp eq i32 %439, 0, !dbg !28 + %440 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i945) #6, !dbg !28 + %441 = tail call float @llvm.nvvm.saturate.f(float %.02.i945) #6, !dbg !28 + %.03.i947 = select i1 %.not1.i946, float %441, float %440, !dbg !28 + %442 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i948 = icmp eq i32 %442, 0, !dbg !28 + %443 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i947, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %444 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i947, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i949 = select i1 %.not2.i948, float %444, float %443, !dbg !28 + %445 = fadd float %.04.i949, 0xC168000FE0000000, !dbg !28 + %446 = fneg float %445, !dbg !28 + %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i950 = icmp eq i32 %447, 0, !dbg !28 + %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3FF7154760000000, float %446) #6, !dbg !28 + %449 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3FF7154760000000, float %446) #6, !dbg !28 + %.0.i951 = select i1 %.not3.i950, float %449, float %448, !dbg !28 + %450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i952 = icmp eq i32 %450, 0, !dbg !28 + %451 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %157, float 0x3E54AE0C00000000, float %.0.i951) #6, !dbg !28 + %452 = tail call float @llvm.nvvm.fma.rn.f(float %157, float 0x3E54AE0C00000000, float %.0.i951) #6, !dbg !28 + %.01.i953 = select i1 %.not4.i952, float %452, float %451, !dbg !28 + %453 = bitcast float %.04.i949 to i32, !dbg !28 + %454 = shl i32 %453, 23, !dbg !28 + %455 = bitcast i32 %454 to float, !dbg !28 + %456 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i953) #6, !dbg !28 + %457 = fmul float %456, %455, !dbg !28 + %458 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i954 = icmp eq i32 %458, 0, !dbg !28 + %459 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i956 = icmp eq i32 %459, 0, !dbg !28 + %460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %461 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i960 = icmp eq i32 %461, 0, !dbg !28 + %462 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i962 = icmp eq i32 %462, 0, !dbg !28 + %463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i964 = icmp eq i32 %463, 0, !dbg !28 + %464 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i966 = icmp eq i32 %464, 0, !dbg !28 + %465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %466 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i970 = icmp eq i32 %466, 0, !dbg !28 + %467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i972 = icmp eq i32 %467, 0, !dbg !28 + %468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i974 = icmp eq i32 %468, 0, !dbg !28 + %469 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i976 = icmp eq i32 %469, 0, !dbg !28 + %470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i980 = icmp eq i32 %471, 0, !dbg !28 + %472 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i982 = icmp eq i32 %472, 0, !dbg !28 + %473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i984 = icmp eq i32 %473, 0, !dbg !28 + %474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i986 = icmp eq i32 %474, 0, !dbg !28 + %475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i990 = icmp eq i32 %476, 0, !dbg !28 + %477 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i992 = icmp eq i32 %477, 0, !dbg !28 + %478 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i994 = icmp eq i32 %478, 0, !dbg !28 + %479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i996 = icmp eq i32 %479, 0, !dbg !28 + %480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %481 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1000 = icmp eq i32 %481, 0, !dbg !28 + %482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1002 = icmp eq i32 %482, 0, !dbg !28 + %483 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1004 = icmp eq i32 %483, 0, !dbg !28 + %484 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1006 = icmp eq i32 %484, 0, !dbg !28 + %485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %486 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1010 = icmp eq i32 %486, 0, !dbg !28 + %487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1012 = icmp eq i32 %487, 0, !dbg !28 + %488 = select i1 %131, float 1.000000e+00, float %259, !dbg !29 + %489 = select i1 %133, float 1.000000e+00, float %281, !dbg !29 + %490 = select i1 %135, float 1.000000e+00, float %303, !dbg !29 + %491 = select i1 %137, float 1.000000e+00, float %325, !dbg !29 + %492 = select i1 %139, float 1.000000e+00, float %347, !dbg !29 + %493 = select i1 %141, float 1.000000e+00, float %369, !dbg !29 + %494 = select i1 %143, float 1.000000e+00, float %391, !dbg !29 + %495 = select i1 %145, float 1.000000e+00, float %413, !dbg !29 + %496 = select i1 %146, float 1.000000e+00, float %435, !dbg !29 + %497 = select i1 %147, float 1.000000e+00, float %457, !dbg !29 + %foldExtExtBinop1541 = fsub <8 x float> %124, %127, !dbg !25 + %498 = extractelement <8 x float> %foldExtExtBinop1541, i64 0, !dbg !25 + %foldExtExtBinop1543 = fsub <8 x float> %124, %127, !dbg !25 + %499 = extractelement <8 x float> %foldExtExtBinop1543, i64 1, !dbg !25 + %foldExtExtBinop1545 = fsub <8 x float> %124, %127, !dbg !25 + %500 = extractelement <8 x float> %foldExtExtBinop1545, i64 2, !dbg !25 + %foldExtExtBinop1547 = fsub <8 x float> %124, %127, !dbg !25 + %501 = extractelement <8 x float> %foldExtExtBinop1547, i64 3, !dbg !25 + %foldExtExtBinop1549 = fsub <8 x float> %124, %127, !dbg !25 + %502 = extractelement <8 x float> %foldExtExtBinop1549, i64 4, !dbg !25 + %foldExtExtBinop1551 = fsub <8 x float> %124, %127, !dbg !25 + %503 = extractelement <8 x float> %foldExtExtBinop1551, i64 5, !dbg !25 + %foldExtExtBinop1553 = fsub <8 x float> %124, %127, !dbg !25 + %504 = extractelement <8 x float> %foldExtExtBinop1553, i64 6, !dbg !25 + %foldExtExtBinop1555 = fsub <8 x float> %124, %127, !dbg !25 + %505 = extractelement <8 x float> %foldExtExtBinop1555, i64 7, !dbg !25 + %506 = fsub float %102, %128, !dbg !25 + %507 = fsub float %103, %129, !dbg !25 + %508 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1014 = icmp eq i32 %508, 0, !dbg !28 + %509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1016 = icmp eq i32 %509, 0, !dbg !28 + %510 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %511 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1020 = icmp eq i32 %511, 0, !dbg !28 + %512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1022 = icmp eq i32 %512, 0, !dbg !28 + %513 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1024 = icmp eq i32 %513, 0, !dbg !28 + %514 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1026 = icmp eq i32 %514, 0, !dbg !28 + %515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %516 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1030 = icmp eq i32 %516, 0, !dbg !28 + %517 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1032 = icmp eq i32 %517, 0, !dbg !28 + %518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1034 = icmp eq i32 %518, 0, !dbg !28 + %519 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1036 = icmp eq i32 %519, 0, !dbg !28 + %520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1040 = icmp eq i32 %521, 0, !dbg !28 + %522 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1042 = icmp eq i32 %522, 0, !dbg !28 + %523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1044 = icmp eq i32 %523, 0, !dbg !28 + %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1046 = icmp eq i32 %524, 0, !dbg !28 + %525 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1050 = icmp eq i32 %526, 0, !dbg !28 + %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1052 = icmp eq i32 %527, 0, !dbg !28 + %528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1054 = icmp eq i32 %528, 0, !dbg !28 + %529 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1056 = icmp eq i32 %529, 0, !dbg !28 + %530 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1060 = icmp eq i32 %531, 0, !dbg !28 + %532 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1062 = icmp eq i32 %532, 0, !dbg !28 + %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1064 = icmp eq i32 %533, 0, !dbg !28 + %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1066 = icmp eq i32 %534, 0, !dbg !28 + %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1070 = icmp eq i32 %536, 0, !dbg !28 + %537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1072 = icmp eq i32 %537, 0, !dbg !28 + %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1074 = icmp eq i32 %538, 0, !dbg !28 + %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1076 = icmp eq i32 %539, 0, !dbg !28 + %540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %541 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1080 = icmp eq i32 %541, 0, !dbg !28 + %542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1082 = icmp eq i32 %542, 0, !dbg !28 + %543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1084 = icmp eq i32 %543, 0, !dbg !28 + %544 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1086 = icmp eq i32 %544, 0, !dbg !28 + %545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %546 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1090 = icmp eq i32 %546, 0, !dbg !28 + %547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1092 = icmp eq i32 %547, 0, !dbg !28 + %548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1094 = icmp eq i32 %548, 0, !dbg !28 + %549 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1096 = icmp eq i32 %549, 0, !dbg !28 + %550 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %551 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1100 = icmp eq i32 %551, 0, !dbg !28 + %552 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1102 = icmp eq i32 %552, 0, !dbg !28 + %553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1104 = icmp eq i32 %553, 0, !dbg !28 + %554 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1106 = icmp eq i32 %554, 0, !dbg !28 + %555 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1110 = icmp eq i32 %556, 0, !dbg !28 + %557 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1112 = icmp eq i32 %557, 0, !dbg !28 + %558 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1114 = icmp eq i32 %558, 0, !dbg !28 + %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1116 = icmp eq i32 %559, 0, !dbg !28 + %560 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %561 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1120 = icmp eq i32 %561, 0, !dbg !28 + %562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1122 = icmp eq i32 %562, 0, !dbg !28 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1124 = icmp eq i32 %563, 0, !dbg !28 + %564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1126 = icmp eq i32 %564, 0, !dbg !28 + %565 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1130 = icmp eq i32 %566, 0, !dbg !28 + %567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1132 = icmp eq i32 %567, 0, !dbg !28 + %568 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1134 = icmp eq i32 %568, 0, !dbg !28 + %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1136 = icmp eq i32 %569, 0, !dbg !28 + %570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %571 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1140 = icmp eq i32 %571, 0, !dbg !28 + %572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1142 = icmp eq i32 %572, 0, !dbg !28 + %573 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1144 = icmp eq i32 %573, 0, !dbg !28 + %574 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1146 = icmp eq i32 %574, 0, !dbg !28 + %575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %576 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1150 = icmp eq i32 %576, 0, !dbg !28 + %577 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1152 = icmp eq i32 %577, 0, !dbg !28 + %578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1154 = icmp eq i32 %578, 0, !dbg !28 + %579 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1156 = icmp eq i32 %579, 0, !dbg !28 + %580 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1160 = icmp eq i32 %581, 0, !dbg !28 + %582 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1162 = icmp eq i32 %582, 0, !dbg !28 + %583 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1164 = icmp eq i32 %583, 0, !dbg !28 + %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1166 = icmp eq i32 %584, 0, !dbg !28 + %585 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1170 = icmp eq i32 %586, 0, !dbg !28 + %587 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1172 = icmp eq i32 %587, 0, !dbg !28 + %588 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1174 = icmp eq i32 %588, 0, !dbg !28 + %589 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %590 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1175 = select i1 %.not.i1174, float %590, float %589, !dbg !28 + %591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1176 = icmp eq i32 %591, 0, !dbg !28 + %592 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1175) #6, !dbg !28 + %593 = tail call float @llvm.nvvm.saturate.f(float %.02.i1175) #6, !dbg !28 + %.03.i1177 = select i1 %.not1.i1176, float %593, float %592, !dbg !28 + %594 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1178 = icmp eq i32 %594, 0, !dbg !28 + %595 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %596 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1179 = select i1 %.not2.i1178, float %596, float %595, !dbg !28 + %597 = fadd float %.04.i1179, 0xC168000FE0000000, !dbg !28 + %598 = fneg float %597, !dbg !28 + %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1180 = icmp eq i32 %599, 0, !dbg !28 + %600 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3FF7154760000000, float %598) #6, !dbg !28 + %601 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3FF7154760000000, float %598) #6, !dbg !28 + %.0.i1181 = select i1 %.not3.i1180, float %601, float %600, !dbg !28 + %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1182 = icmp eq i32 %602, 0, !dbg !28 + %603 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %498, float 0x3E54AE0C00000000, float %.0.i1181) #6, !dbg !28 + %604 = tail call float @llvm.nvvm.fma.rn.f(float %498, float 0x3E54AE0C00000000, float %.0.i1181) #6, !dbg !28 + %.01.i1183 = select i1 %.not4.i1182, float %604, float %603, !dbg !28 + %605 = bitcast float %.04.i1179 to i32, !dbg !28 + %606 = shl i32 %605, 23, !dbg !28 + %607 = bitcast i32 %606 to float, !dbg !28 + %608 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1183) #6, !dbg !28 + %609 = fmul float %608, %607, !dbg !28 + %610 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1184 = icmp eq i32 %610, 0, !dbg !28 + %611 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %612 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1185 = select i1 %.not.i1184, float %612, float %611, !dbg !28 + %613 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1186 = icmp eq i32 %613, 0, !dbg !28 + %614 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1185) #6, !dbg !28 + %615 = tail call float @llvm.nvvm.saturate.f(float %.02.i1185) #6, !dbg !28 + %.03.i1187 = select i1 %.not1.i1186, float %615, float %614, !dbg !28 + %616 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1188 = icmp eq i32 %616, 0, !dbg !28 + %617 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %618 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1189 = select i1 %.not2.i1188, float %618, float %617, !dbg !28 + %619 = fadd float %.04.i1189, 0xC168000FE0000000, !dbg !28 + %620 = fneg float %619, !dbg !28 + %621 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1190 = icmp eq i32 %621, 0, !dbg !28 + %622 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3FF7154760000000, float %620) #6, !dbg !28 + %623 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3FF7154760000000, float %620) #6, !dbg !28 + %.0.i1191 = select i1 %.not3.i1190, float %623, float %622, !dbg !28 + %624 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1192 = icmp eq i32 %624, 0, !dbg !28 + %625 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %499, float 0x3E54AE0C00000000, float %.0.i1191) #6, !dbg !28 + %626 = tail call float @llvm.nvvm.fma.rn.f(float %499, float 0x3E54AE0C00000000, float %.0.i1191) #6, !dbg !28 + %.01.i1193 = select i1 %.not4.i1192, float %626, float %625, !dbg !28 + %627 = bitcast float %.04.i1189 to i32, !dbg !28 + %628 = shl i32 %627, 23, !dbg !28 + %629 = bitcast i32 %628 to float, !dbg !28 + %630 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1193) #6, !dbg !28 + %631 = fmul float %630, %629, !dbg !28 + %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1194 = icmp eq i32 %632, 0, !dbg !28 + %633 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %634 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1195 = select i1 %.not.i1194, float %634, float %633, !dbg !28 + %635 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1196 = icmp eq i32 %635, 0, !dbg !28 + %636 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1195) #6, !dbg !28 + %637 = tail call float @llvm.nvvm.saturate.f(float %.02.i1195) #6, !dbg !28 + %.03.i1197 = select i1 %.not1.i1196, float %637, float %636, !dbg !28 + %638 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1198 = icmp eq i32 %638, 0, !dbg !28 + %639 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %640 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1199 = select i1 %.not2.i1198, float %640, float %639, !dbg !28 + %641 = fadd float %.04.i1199, 0xC168000FE0000000, !dbg !28 + %642 = fneg float %641, !dbg !28 + %643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1200 = icmp eq i32 %643, 0, !dbg !28 + %644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3FF7154760000000, float %642) #6, !dbg !28 + %645 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3FF7154760000000, float %642) #6, !dbg !28 + %.0.i1201 = select i1 %.not3.i1200, float %645, float %644, !dbg !28 + %646 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1202 = icmp eq i32 %646, 0, !dbg !28 + %647 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %500, float 0x3E54AE0C00000000, float %.0.i1201) #6, !dbg !28 + %648 = tail call float @llvm.nvvm.fma.rn.f(float %500, float 0x3E54AE0C00000000, float %.0.i1201) #6, !dbg !28 + %.01.i1203 = select i1 %.not4.i1202, float %648, float %647, !dbg !28 + %649 = bitcast float %.04.i1199 to i32, !dbg !28 + %650 = shl i32 %649, 23, !dbg !28 + %651 = bitcast i32 %650 to float, !dbg !28 + %652 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1203) #6, !dbg !28 + %653 = fmul float %652, %651, !dbg !28 + %654 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1204 = icmp eq i32 %654, 0, !dbg !28 + %655 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %656 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1205 = select i1 %.not.i1204, float %656, float %655, !dbg !28 + %657 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1206 = icmp eq i32 %657, 0, !dbg !28 + %658 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1205) #6, !dbg !28 + %659 = tail call float @llvm.nvvm.saturate.f(float %.02.i1205) #6, !dbg !28 + %.03.i1207 = select i1 %.not1.i1206, float %659, float %658, !dbg !28 + %660 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1208 = icmp eq i32 %660, 0, !dbg !28 + %661 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %662 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1209 = select i1 %.not2.i1208, float %662, float %661, !dbg !28 + %663 = fadd float %.04.i1209, 0xC168000FE0000000, !dbg !28 + %664 = fneg float %663, !dbg !28 + %665 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1210 = icmp eq i32 %665, 0, !dbg !28 + %666 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3FF7154760000000, float %664) #6, !dbg !28 + %667 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3FF7154760000000, float %664) #6, !dbg !28 + %.0.i1211 = select i1 %.not3.i1210, float %667, float %666, !dbg !28 + %668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1212 = icmp eq i32 %668, 0, !dbg !28 + %669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %501, float 0x3E54AE0C00000000, float %.0.i1211) #6, !dbg !28 + %670 = tail call float @llvm.nvvm.fma.rn.f(float %501, float 0x3E54AE0C00000000, float %.0.i1211) #6, !dbg !28 + %.01.i1213 = select i1 %.not4.i1212, float %670, float %669, !dbg !28 + %671 = bitcast float %.04.i1209 to i32, !dbg !28 + %672 = shl i32 %671, 23, !dbg !28 + %673 = bitcast i32 %672 to float, !dbg !28 + %674 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1213) #6, !dbg !28 + %675 = fmul float %674, %673, !dbg !28 + %676 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1214 = icmp eq i32 %676, 0, !dbg !28 + %677 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %678 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1215 = select i1 %.not.i1214, float %678, float %677, !dbg !28 + %679 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1216 = icmp eq i32 %679, 0, !dbg !28 + %680 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1215) #6, !dbg !28 + %681 = tail call float @llvm.nvvm.saturate.f(float %.02.i1215) #6, !dbg !28 + %.03.i1217 = select i1 %.not1.i1216, float %681, float %680, !dbg !28 + %682 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1218 = icmp eq i32 %682, 0, !dbg !28 + %683 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %684 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1219 = select i1 %.not2.i1218, float %684, float %683, !dbg !28 + %685 = fadd float %.04.i1219, 0xC168000FE0000000, !dbg !28 + %686 = fneg float %685, !dbg !28 + %687 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1220 = icmp eq i32 %687, 0, !dbg !28 + %688 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3FF7154760000000, float %686) #6, !dbg !28 + %689 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3FF7154760000000, float %686) #6, !dbg !28 + %.0.i1221 = select i1 %.not3.i1220, float %689, float %688, !dbg !28 + %690 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1222 = icmp eq i32 %690, 0, !dbg !28 + %691 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %502, float 0x3E54AE0C00000000, float %.0.i1221) #6, !dbg !28 + %692 = tail call float @llvm.nvvm.fma.rn.f(float %502, float 0x3E54AE0C00000000, float %.0.i1221) #6, !dbg !28 + %.01.i1223 = select i1 %.not4.i1222, float %692, float %691, !dbg !28 + %693 = bitcast float %.04.i1219 to i32, !dbg !28 + %694 = shl i32 %693, 23, !dbg !28 + %695 = bitcast i32 %694 to float, !dbg !28 + %696 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1223) #6, !dbg !28 + %697 = fmul float %696, %695, !dbg !28 + %698 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1224 = icmp eq i32 %698, 0, !dbg !28 + %699 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %700 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1225 = select i1 %.not.i1224, float %700, float %699, !dbg !28 + %701 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1226 = icmp eq i32 %701, 0, !dbg !28 + %702 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1225) #6, !dbg !28 + %703 = tail call float @llvm.nvvm.saturate.f(float %.02.i1225) #6, !dbg !28 + %.03.i1227 = select i1 %.not1.i1226, float %703, float %702, !dbg !28 + %704 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1228 = icmp eq i32 %704, 0, !dbg !28 + %705 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %706 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1229 = select i1 %.not2.i1228, float %706, float %705, !dbg !28 + %707 = fadd float %.04.i1229, 0xC168000FE0000000, !dbg !28 + %708 = fneg float %707, !dbg !28 + %709 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1230 = icmp eq i32 %709, 0, !dbg !28 + %710 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3FF7154760000000, float %708) #6, !dbg !28 + %711 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3FF7154760000000, float %708) #6, !dbg !28 + %.0.i1231 = select i1 %.not3.i1230, float %711, float %710, !dbg !28 + %712 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1232 = icmp eq i32 %712, 0, !dbg !28 + %713 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %503, float 0x3E54AE0C00000000, float %.0.i1231) #6, !dbg !28 + %714 = tail call float @llvm.nvvm.fma.rn.f(float %503, float 0x3E54AE0C00000000, float %.0.i1231) #6, !dbg !28 + %.01.i1233 = select i1 %.not4.i1232, float %714, float %713, !dbg !28 + %715 = bitcast float %.04.i1229 to i32, !dbg !28 + %716 = shl i32 %715, 23, !dbg !28 + %717 = bitcast i32 %716 to float, !dbg !28 + %718 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1233) #6, !dbg !28 + %719 = fmul float %718, %717, !dbg !28 + %720 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1234 = icmp eq i32 %720, 0, !dbg !28 + %721 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %722 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1235 = select i1 %.not.i1234, float %722, float %721, !dbg !28 + %723 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1236 = icmp eq i32 %723, 0, !dbg !28 + %724 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1235) #6, !dbg !28 + %725 = tail call float @llvm.nvvm.saturate.f(float %.02.i1235) #6, !dbg !28 + %.03.i1237 = select i1 %.not1.i1236, float %725, float %724, !dbg !28 + %726 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1238 = icmp eq i32 %726, 0, !dbg !28 + %727 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %728 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1239 = select i1 %.not2.i1238, float %728, float %727, !dbg !28 + %729 = fadd float %.04.i1239, 0xC168000FE0000000, !dbg !28 + %730 = fneg float %729, !dbg !28 + %731 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1240 = icmp eq i32 %731, 0, !dbg !28 + %732 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3FF7154760000000, float %730) #6, !dbg !28 + %733 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3FF7154760000000, float %730) #6, !dbg !28 + %.0.i1241 = select i1 %.not3.i1240, float %733, float %732, !dbg !28 + %734 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1242 = icmp eq i32 %734, 0, !dbg !28 + %735 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %504, float 0x3E54AE0C00000000, float %.0.i1241) #6, !dbg !28 + %736 = tail call float @llvm.nvvm.fma.rn.f(float %504, float 0x3E54AE0C00000000, float %.0.i1241) #6, !dbg !28 + %.01.i1243 = select i1 %.not4.i1242, float %736, float %735, !dbg !28 + %737 = bitcast float %.04.i1239 to i32, !dbg !28 + %738 = shl i32 %737, 23, !dbg !28 + %739 = bitcast i32 %738 to float, !dbg !28 + %740 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1243) #6, !dbg !28 + %741 = fmul float %740, %739, !dbg !28 + %742 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1244 = icmp eq i32 %742, 0, !dbg !28 + %743 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %744 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1245 = select i1 %.not.i1244, float %744, float %743, !dbg !28 + %745 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1246 = icmp eq i32 %745, 0, !dbg !28 + %746 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1245) #6, !dbg !28 + %747 = tail call float @llvm.nvvm.saturate.f(float %.02.i1245) #6, !dbg !28 + %.03.i1247 = select i1 %.not1.i1246, float %747, float %746, !dbg !28 + %748 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1248 = icmp eq i32 %748, 0, !dbg !28 + %749 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %750 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1249 = select i1 %.not2.i1248, float %750, float %749, !dbg !28 + %751 = fadd float %.04.i1249, 0xC168000FE0000000, !dbg !28 + %752 = fneg float %751, !dbg !28 + %753 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1250 = icmp eq i32 %753, 0, !dbg !28 + %754 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3FF7154760000000, float %752) #6, !dbg !28 + %755 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3FF7154760000000, float %752) #6, !dbg !28 + %.0.i1251 = select i1 %.not3.i1250, float %755, float %754, !dbg !28 + %756 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1252 = icmp eq i32 %756, 0, !dbg !28 + %757 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %505, float 0x3E54AE0C00000000, float %.0.i1251) #6, !dbg !28 + %758 = tail call float @llvm.nvvm.fma.rn.f(float %505, float 0x3E54AE0C00000000, float %.0.i1251) #6, !dbg !28 + %.01.i1253 = select i1 %.not4.i1252, float %758, float %757, !dbg !28 + %759 = bitcast float %.04.i1249 to i32, !dbg !28 + %760 = shl i32 %759, 23, !dbg !28 + %761 = bitcast i32 %760 to float, !dbg !28 + %762 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1253) #6, !dbg !28 + %763 = fmul float %762, %761, !dbg !28 + %764 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1254 = icmp eq i32 %764, 0, !dbg !28 + %765 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %766 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1255 = select i1 %.not.i1254, float %766, float %765, !dbg !28 + %767 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1256 = icmp eq i32 %767, 0, !dbg !28 + %768 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1255) #6, !dbg !28 + %769 = tail call float @llvm.nvvm.saturate.f(float %.02.i1255) #6, !dbg !28 + %.03.i1257 = select i1 %.not1.i1256, float %769, float %768, !dbg !28 + %770 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1258 = icmp eq i32 %770, 0, !dbg !28 + %771 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %772 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1259 = select i1 %.not2.i1258, float %772, float %771, !dbg !28 + %773 = fadd float %.04.i1259, 0xC168000FE0000000, !dbg !28 + %774 = fneg float %773, !dbg !28 + %775 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1260 = icmp eq i32 %775, 0, !dbg !28 + %776 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3FF7154760000000, float %774) #6, !dbg !28 + %777 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3FF7154760000000, float %774) #6, !dbg !28 + %.0.i1261 = select i1 %.not3.i1260, float %777, float %776, !dbg !28 + %778 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1262 = icmp eq i32 %778, 0, !dbg !28 + %779 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %506, float 0x3E54AE0C00000000, float %.0.i1261) #6, !dbg !28 + %780 = tail call float @llvm.nvvm.fma.rn.f(float %506, float 0x3E54AE0C00000000, float %.0.i1261) #6, !dbg !28 + %.01.i1263 = select i1 %.not4.i1262, float %780, float %779, !dbg !28 + %781 = bitcast float %.04.i1259 to i32, !dbg !28 + %782 = shl i32 %781, 23, !dbg !28 + %783 = bitcast i32 %782 to float, !dbg !28 + %784 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1263) #6, !dbg !28 + %785 = fmul float %784, %783, !dbg !28 + %786 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1264 = icmp eq i32 %786, 0, !dbg !28 + %787 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %788 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1265 = select i1 %.not.i1264, float %788, float %787, !dbg !28 + %789 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1266 = icmp eq i32 %789, 0, !dbg !28 + %790 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1265) #6, !dbg !28 + %791 = tail call float @llvm.nvvm.saturate.f(float %.02.i1265) #6, !dbg !28 + %.03.i1267 = select i1 %.not1.i1266, float %791, float %790, !dbg !28 + %792 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not2.i1268 = icmp eq i32 %792, 0, !dbg !28 + %793 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %794 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %.04.i1269 = select i1 %.not2.i1268, float %794, float %793, !dbg !28 + %795 = fadd float %.04.i1269, 0xC168000FE0000000, !dbg !28 + %796 = fneg float %795, !dbg !28 + %797 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1270 = icmp eq i32 %797, 0, !dbg !28 + %798 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3FF7154760000000, float %796) #6, !dbg !28 + %799 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3FF7154760000000, float %796) #6, !dbg !28 + %.0.i1271 = select i1 %.not3.i1270, float %799, float %798, !dbg !28 + %800 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1272 = icmp eq i32 %800, 0, !dbg !28 + %801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %507, float 0x3E54AE0C00000000, float %.0.i1271) #6, !dbg !28 + %802 = tail call float @llvm.nvvm.fma.rn.f(float %507, float 0x3E54AE0C00000000, float %.0.i1271) #6, !dbg !28 + %.01.i1273 = select i1 %.not4.i1272, float %802, float %801, !dbg !28 + %803 = bitcast float %.04.i1269 to i32, !dbg !28 + %804 = shl i32 %803, 23, !dbg !28 + %805 = bitcast i32 %804 to float, !dbg !28 + %806 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1273) #6, !dbg !28 + %807 = fmul float %806, %805, !dbg !28 + %808 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1274 = icmp eq i32 %808, 0, !dbg !28 + %809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1276 = icmp eq i32 %809, 0, !dbg !28 + %810 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %811 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1280 = icmp eq i32 %811, 0, !dbg !28 + %812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1282 = icmp eq i32 %812, 0, !dbg !28 + %813 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1284 = icmp eq i32 %813, 0, !dbg !28 + %814 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1286 = icmp eq i32 %814, 0, !dbg !28 + %815 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %816 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1290 = icmp eq i32 %816, 0, !dbg !28 + %817 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1292 = icmp eq i32 %817, 0, !dbg !28 + %818 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1294 = icmp eq i32 %818, 0, !dbg !28 + %819 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1296 = icmp eq i32 %819, 0, !dbg !28 + %820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %821 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1300 = icmp eq i32 %821, 0, !dbg !28 + %822 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1302 = icmp eq i32 %822, 0, !dbg !28 + %823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1304 = icmp eq i32 %823, 0, !dbg !28 + %824 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1306 = icmp eq i32 %824, 0, !dbg !28 + %825 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1310 = icmp eq i32 %826, 0, !dbg !28 + %827 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1312 = icmp eq i32 %827, 0, !dbg !28 + %828 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1314 = icmp eq i32 %828, 0, !dbg !28 + %829 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1316 = icmp eq i32 %829, 0, !dbg !28 + %830 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1320 = icmp eq i32 %831, 0, !dbg !28 + %832 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1322 = icmp eq i32 %832, 0, !dbg !28 + %833 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not.i1324 = icmp eq i32 %833, 0, !dbg !28 + %834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not1.i1326 = icmp eq i32 %834, 0, !dbg !28 + %835 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %836 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not3.i1330 = icmp eq i32 %836, 0, !dbg !28 + %837 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !28 + %.not4.i1332 = icmp eq i32 %837, 0, !dbg !28 + %838 = select i1 %131, float 1.000000e+00, float %609, !dbg !30 + %839 = select i1 %133, float 1.000000e+00, float %631, !dbg !30 + %840 = select i1 %135, float 1.000000e+00, float %653, !dbg !30 + %841 = select i1 %137, float 1.000000e+00, float %675, !dbg !30 + %842 = select i1 %139, float 1.000000e+00, float %697, !dbg !30 + %843 = select i1 %141, float 1.000000e+00, float %719, !dbg !30 + %844 = select i1 %143, float 1.000000e+00, float %741, !dbg !30 + %845 = select i1 %145, float 1.000000e+00, float %763, !dbg !30 + %846 = select i1 %146, float 1.000000e+00, float %785, !dbg !30 + %847 = select i1 %147, float 1.000000e+00, float %807, !dbg !30 + %848 = shufflevector <2 x bfloat> %63, <2 x bfloat> %65, <16 x i32> , !dbg !17 + %849 = shufflevector <2 x bfloat> %67, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %850 = shufflevector <16 x bfloat> %848, <16 x bfloat> %849, <16 x i32> , !dbg !17 + %851 = shufflevector <2 x bfloat> %67, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %852 = shufflevector <16 x bfloat> %850, <16 x bfloat> %851, <16 x i32> , !dbg !17 + %853 = shufflevector <2 x bfloat> %69, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %854 = shufflevector <16 x bfloat> %852, <16 x bfloat> %853, <16 x i32> , !dbg !17 + %855 = shufflevector <2 x bfloat> %69, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %856 = shufflevector <16 x bfloat> %854, <16 x bfloat> %855, <16 x i32> , !dbg !17 + %857 = shufflevector <2 x bfloat> %73, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %858 = shufflevector <16 x bfloat> %856, <16 x bfloat> %857, <16 x i32> , !dbg !17 + %859 = shufflevector <2 x bfloat> %73, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %860 = shufflevector <16 x bfloat> %858, <16 x bfloat> %859, <16 x i32> , !dbg !17 + %861 = shufflevector <2 x bfloat> %75, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %862 = shufflevector <16 x bfloat> %860, <16 x bfloat> %861, <16 x i32> , !dbg !17 + %863 = shufflevector <2 x bfloat> %75, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %864 = shufflevector <16 x bfloat> %862, <16 x bfloat> %863, <16 x i32> , !dbg !17 + %865 = shufflevector <2 x bfloat> %77, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %866 = shufflevector <16 x bfloat> %864, <16 x bfloat> %865, <16 x i32> , !dbg !17 + %867 = shufflevector <2 x bfloat> %77, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %868 = shufflevector <16 x bfloat> %866, <16 x bfloat> %867, <16 x i32> , !dbg !17 + %869 = shufflevector <2 x bfloat> %79, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %870 = shufflevector <16 x bfloat> %868, <16 x bfloat> %869, <16 x i32> , !dbg !17 + %871 = shufflevector <2 x bfloat> %79, <2 x bfloat> poison, <16 x i32> , !dbg !17 + %872 = shufflevector <16 x bfloat> %870, <16 x bfloat> %871, <16 x i32> , !dbg !17 + %873 = fpext <16 x bfloat> %872 to <16 x float>, !dbg !17 + %874 = fcmp ogt <16 x float> %37, %873, !dbg !18 + %875 = or <16 x i1> %106, %874, !dbg !23 + %876 = select <16 x i1> %875, <16 x float> %37, <16 x float> %873, !dbg !24 + %877 = fcmp oeq <16 x float> %876, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1557 = fsub <16 x float> %37, %876, !dbg !27 + %878 = extractelement <16 x float> %foldExtExtBinop1557, i64 0, !dbg !27 + %foldExtExtBinop1559 = fsub <16 x float> %37, %876, !dbg !27 + %879 = extractelement <16 x float> %foldExtExtBinop1559, i64 1, !dbg !27 + %foldExtExtBinop1561 = fsub <16 x float> %37, %876, !dbg !27 + %880 = extractelement <16 x float> %foldExtExtBinop1561, i64 2, !dbg !27 + %foldExtExtBinop1563 = fsub <16 x float> %37, %876, !dbg !27 + %881 = extractelement <16 x float> %foldExtExtBinop1563, i64 3, !dbg !27 + %foldExtExtBinop1565 = fsub <16 x float> %37, %876, !dbg !27 + %882 = extractelement <16 x float> %foldExtExtBinop1565, i64 4, !dbg !27 + %foldExtExtBinop1567 = fsub <16 x float> %37, %876, !dbg !27 + %883 = extractelement <16 x float> %foldExtExtBinop1567, i64 5, !dbg !27 + %foldExtExtBinop1569 = fsub <16 x float> %37, %876, !dbg !27 + %884 = extractelement <16 x float> %foldExtExtBinop1569, i64 6, !dbg !27 + %foldExtExtBinop1571 = fsub <16 x float> %37, %876, !dbg !27 + %885 = extractelement <16 x float> %foldExtExtBinop1571, i64 7, !dbg !27 + %foldExtExtBinop1573 = fsub <16 x float> %37, %876, !dbg !27 + %886 = extractelement <16 x float> %foldExtExtBinop1573, i64 8, !dbg !27 + %foldExtExtBinop1575 = fsub <16 x float> %37, %876, !dbg !27 + %887 = extractelement <16 x float> %foldExtExtBinop1575, i64 9, !dbg !27 + %foldExtExtBinop1577 = fsub <16 x float> %37, %876, !dbg !27 + %888 = extractelement <16 x float> %foldExtExtBinop1577, i64 10, !dbg !27 + %foldExtExtBinop1579 = fsub <16 x float> %37, %876, !dbg !27 + %889 = extractelement <16 x float> %foldExtExtBinop1579, i64 11, !dbg !27 + %foldExtExtBinop1581 = fsub <16 x float> %37, %876, !dbg !27 + %890 = extractelement <16 x float> %foldExtExtBinop1581, i64 12, !dbg !27 + %foldExtExtBinop1583 = fsub <16 x float> %37, %876, !dbg !27 + %891 = extractelement <16 x float> %foldExtExtBinop1583, i64 13, !dbg !27 + %foldExtExtBinop1585 = fsub <16 x float> %37, %876, !dbg !27 + %892 = extractelement <16 x float> %foldExtExtBinop1585, i64 14, !dbg !27 + %foldExtExtBinop1587 = fsub <16 x float> %37, %876, !dbg !27 + %893 = extractelement <16 x float> %foldExtExtBinop1587, i64 15, !dbg !27 + %894 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %895 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i695 = select i1 %.not.i694, float %895, float %894, !dbg !28 + %896 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i695) #6, !dbg !28 + %897 = tail call float @llvm.nvvm.saturate.f(float %.02.i695) #6, !dbg !28 + %.03.i697 = select i1 %.not1.i696, float %897, float %896, !dbg !28 + %898 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i697, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %899 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i697, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %900 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %901 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i705 = select i1 %.not.i704, float %901, float %900, !dbg !28 + %902 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i705) #6, !dbg !28 + %903 = tail call float @llvm.nvvm.saturate.f(float %.02.i705) #6, !dbg !28 + %.03.i707 = select i1 %.not1.i706, float %903, float %902, !dbg !28 + %904 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i707, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %905 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i707, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %906 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %907 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i715 = select i1 %.not.i714, float %907, float %906, !dbg !28 + %908 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i715) #6, !dbg !28 + %909 = tail call float @llvm.nvvm.saturate.f(float %.02.i715) #6, !dbg !28 + %.03.i717 = select i1 %.not1.i716, float %909, float %908, !dbg !28 + %910 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i717, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %911 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i717, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %912 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %913 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i725 = select i1 %.not.i724, float %913, float %912, !dbg !28 + %914 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i725) #6, !dbg !28 + %915 = tail call float @llvm.nvvm.saturate.f(float %.02.i725) #6, !dbg !28 + %.03.i727 = select i1 %.not1.i726, float %915, float %914, !dbg !28 + %916 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i727, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %917 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i727, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %918 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %919 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i735 = select i1 %.not.i734, float %919, float %918, !dbg !28 + %920 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i735) #6, !dbg !28 + %921 = tail call float @llvm.nvvm.saturate.f(float %.02.i735) #6, !dbg !28 + %.03.i737 = select i1 %.not1.i736, float %921, float %920, !dbg !28 + %922 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i737, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %923 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i737, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %924 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %925 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i745 = select i1 %.not.i744, float %925, float %924, !dbg !28 + %926 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i745) #6, !dbg !28 + %927 = tail call float @llvm.nvvm.saturate.f(float %.02.i745) #6, !dbg !28 + %.03.i747 = select i1 %.not1.i746, float %927, float %926, !dbg !28 + %928 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i747, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %929 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i747, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %930 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %931 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i755 = select i1 %.not.i754, float %931, float %930, !dbg !28 + %932 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i755) #6, !dbg !28 + %933 = tail call float @llvm.nvvm.saturate.f(float %.02.i755) #6, !dbg !28 + %.03.i757 = select i1 %.not1.i756, float %933, float %932, !dbg !28 + %934 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i757, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %935 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i757, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %936 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %937 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i765 = select i1 %.not.i764, float %937, float %936, !dbg !28 + %938 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i765) #6, !dbg !28 + %939 = tail call float @llvm.nvvm.saturate.f(float %.02.i765) #6, !dbg !28 + %.03.i767 = select i1 %.not1.i766, float %939, float %938, !dbg !28 + %940 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i767, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %941 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i767, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %942 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %943 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i775 = select i1 %.not.i774, float %943, float %942, !dbg !28 + %944 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i775) #6, !dbg !28 + %945 = tail call float @llvm.nvvm.saturate.f(float %.02.i775) #6, !dbg !28 + %.03.i777 = select i1 %.not1.i776, float %945, float %944, !dbg !28 + %946 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i777, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %947 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i777, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %948 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %949 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i785 = select i1 %.not.i784, float %949, float %948, !dbg !28 + %950 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i785) #6, !dbg !28 + %951 = tail call float @llvm.nvvm.saturate.f(float %.02.i785) #6, !dbg !28 + %.03.i787 = select i1 %.not1.i786, float %951, float %950, !dbg !28 + %952 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i787, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %953 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i787, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %954 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %955 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i795 = select i1 %.not.i794, float %955, float %954, !dbg !28 + %956 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i795) #6, !dbg !28 + %957 = tail call float @llvm.nvvm.saturate.f(float %.02.i795) #6, !dbg !28 + %.03.i797 = select i1 %.not1.i796, float %957, float %956, !dbg !28 + %958 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i797, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %959 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i797, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %960 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %961 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i805 = select i1 %.not.i804, float %961, float %960, !dbg !28 + %962 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i805) #6, !dbg !28 + %963 = tail call float @llvm.nvvm.saturate.f(float %.02.i805) #6, !dbg !28 + %.03.i807 = select i1 %.not1.i806, float %963, float %962, !dbg !28 + %964 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i807, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %965 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i807, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %966 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %967 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i815 = select i1 %.not.i814, float %967, float %966, !dbg !28 + %968 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i815) #6, !dbg !28 + %969 = tail call float @llvm.nvvm.saturate.f(float %.02.i815) #6, !dbg !28 + %.03.i817 = select i1 %.not1.i816, float %969, float %968, !dbg !28 + %970 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i817, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %971 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i817, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %972 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %973 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i825 = select i1 %.not.i824, float %973, float %972, !dbg !28 + %974 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i825) #6, !dbg !28 + %975 = tail call float @llvm.nvvm.saturate.f(float %.02.i825) #6, !dbg !28 + %.03.i827 = select i1 %.not1.i826, float %975, float %974, !dbg !28 + %976 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i827, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %977 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i827, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %978 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %979 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i835 = select i1 %.not.i834, float %979, float %978, !dbg !28 + %980 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i835) #6, !dbg !28 + %981 = tail call float @llvm.nvvm.saturate.f(float %.02.i835) #6, !dbg !28 + %.03.i837 = select i1 %.not1.i836, float %981, float %980, !dbg !28 + %982 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i837, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %983 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i837, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %984 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %985 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i845 = select i1 %.not.i844, float %985, float %984, !dbg !28 + %986 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i845) #6, !dbg !28 + %987 = tail call float @llvm.nvvm.saturate.f(float %.02.i845) #6, !dbg !28 + %.03.i847 = select i1 %.not1.i846, float %987, float %986, !dbg !28 + %988 = insertelement <16 x i32> poison, i32 %160, i64 0, !dbg !28 + %989 = insertelement <16 x i32> %988, i32 %165, i64 1, !dbg !28 + %990 = insertelement <16 x i32> %989, i32 %170, i64 2, !dbg !28 + %991 = insertelement <16 x i32> %990, i32 %175, i64 3, !dbg !28 + %992 = insertelement <16 x i32> %991, i32 %180, i64 4, !dbg !28 + %993 = insertelement <16 x i32> %992, i32 %185, i64 5, !dbg !28 + %994 = insertelement <16 x i32> %993, i32 %190, i64 6, !dbg !28 + %995 = insertelement <16 x i32> %994, i32 %195, i64 7, !dbg !28 + %996 = insertelement <16 x i32> %995, i32 %200, i64 8, !dbg !28 + %997 = insertelement <16 x i32> %996, i32 %205, i64 9, !dbg !28 + %998 = insertelement <16 x i32> %997, i32 %210, i64 10, !dbg !28 + %999 = insertelement <16 x i32> %998, i32 %215, i64 11, !dbg !28 + %1000 = insertelement <16 x i32> %999, i32 %220, i64 12, !dbg !28 + %1001 = insertelement <16 x i32> %1000, i32 %225, i64 13, !dbg !28 + %1002 = insertelement <16 x i32> %1001, i32 %230, i64 14, !dbg !28 + %1003 = insertelement <16 x i32> %1002, i32 %235, i64 15, !dbg !28 + %1004 = icmp eq <16 x i32> %1003, zeroinitializer, !dbg !28 + %1005 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i847, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1006 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i847, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1007 = insertelement <16 x float> poison, float %899, i64 0, !dbg !28 + %1008 = insertelement <16 x float> %1007, float %905, i64 1, !dbg !28 + %1009 = insertelement <16 x float> %1008, float %911, i64 2, !dbg !28 + %1010 = insertelement <16 x float> %1009, float %917, i64 3, !dbg !28 + %1011 = insertelement <16 x float> %1010, float %923, i64 4, !dbg !28 + %1012 = insertelement <16 x float> %1011, float %929, i64 5, !dbg !28 + %1013 = insertelement <16 x float> %1012, float %935, i64 6, !dbg !28 + %1014 = insertelement <16 x float> %1013, float %941, i64 7, !dbg !28 + %1015 = insertelement <16 x float> %1014, float %947, i64 8, !dbg !28 + %1016 = insertelement <16 x float> %1015, float %953, i64 9, !dbg !28 + %1017 = insertelement <16 x float> %1016, float %959, i64 10, !dbg !28 + %1018 = insertelement <16 x float> %1017, float %965, i64 11, !dbg !28 + %1019 = insertelement <16 x float> %1018, float %971, i64 12, !dbg !28 + %1020 = insertelement <16 x float> %1019, float %977, i64 13, !dbg !28 + %1021 = insertelement <16 x float> %1020, float %983, i64 14, !dbg !28 + %1022 = insertelement <16 x float> %1021, float %1006, i64 15, !dbg !28 + %1023 = insertelement <16 x float> poison, float %898, i64 0, !dbg !28 + %1024 = insertelement <16 x float> %1023, float %904, i64 1, !dbg !28 + %1025 = insertelement <16 x float> %1024, float %910, i64 2, !dbg !28 + %1026 = insertelement <16 x float> %1025, float %916, i64 3, !dbg !28 + %1027 = insertelement <16 x float> %1026, float %922, i64 4, !dbg !28 + %1028 = insertelement <16 x float> %1027, float %928, i64 5, !dbg !28 + %1029 = insertelement <16 x float> %1028, float %934, i64 6, !dbg !28 + %1030 = insertelement <16 x float> %1029, float %940, i64 7, !dbg !28 + %1031 = insertelement <16 x float> %1030, float %946, i64 8, !dbg !28 + %1032 = insertelement <16 x float> %1031, float %952, i64 9, !dbg !28 + %1033 = insertelement <16 x float> %1032, float %958, i64 10, !dbg !28 + %1034 = insertelement <16 x float> %1033, float %964, i64 11, !dbg !28 + %1035 = insertelement <16 x float> %1034, float %970, i64 12, !dbg !28 + %1036 = insertelement <16 x float> %1035, float %976, i64 13, !dbg !28 + %1037 = insertelement <16 x float> %1036, float %982, i64 14, !dbg !28 + %1038 = insertelement <16 x float> %1037, float %1005, i64 15, !dbg !28 + %1039 = select <16 x i1> %1004, <16 x float> %1022, <16 x float> %1038, !dbg !28 + %1040 = extractelement <16 x float> %1039, i64 0, !dbg !28 + %1041 = fadd float %1040, 0xC168000FE0000000, !dbg !28 + %1042 = fneg float %1041, !dbg !28 + %1043 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3FF7154760000000, float %1042) #6, !dbg !28 + %1044 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3FF7154760000000, float %1042) #6, !dbg !28 + %.0.i701 = select i1 %.not3.i700, float %1044, float %1043, !dbg !28 + %1045 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %878, float 0x3E54AE0C00000000, float %.0.i701) #6, !dbg !28 + %1046 = tail call float @llvm.nvvm.fma.rn.f(float %878, float 0x3E54AE0C00000000, float %.0.i701) #6, !dbg !28 + %.01.i703 = select i1 %.not4.i702, float %1046, float %1045, !dbg !28 + %1047 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i703) #6, !dbg !28 + %1048 = extractelement <16 x float> %1039, i64 1, !dbg !28 + %1049 = fadd float %1048, 0xC168000FE0000000, !dbg !28 + %1050 = fneg float %1049, !dbg !28 + %1051 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3FF7154760000000, float %1050) #6, !dbg !28 + %1052 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3FF7154760000000, float %1050) #6, !dbg !28 + %.0.i711 = select i1 %.not3.i710, float %1052, float %1051, !dbg !28 + %1053 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %879, float 0x3E54AE0C00000000, float %.0.i711) #6, !dbg !28 + %1054 = tail call float @llvm.nvvm.fma.rn.f(float %879, float 0x3E54AE0C00000000, float %.0.i711) #6, !dbg !28 + %.01.i713 = select i1 %.not4.i712, float %1054, float %1053, !dbg !28 + %1055 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i713) #6, !dbg !28 + %1056 = extractelement <16 x float> %1039, i64 2, !dbg !28 + %1057 = fadd float %1056, 0xC168000FE0000000, !dbg !28 + %1058 = fneg float %1057, !dbg !28 + %1059 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3FF7154760000000, float %1058) #6, !dbg !28 + %1060 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3FF7154760000000, float %1058) #6, !dbg !28 + %.0.i721 = select i1 %.not3.i720, float %1060, float %1059, !dbg !28 + %1061 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %880, float 0x3E54AE0C00000000, float %.0.i721) #6, !dbg !28 + %1062 = tail call float @llvm.nvvm.fma.rn.f(float %880, float 0x3E54AE0C00000000, float %.0.i721) #6, !dbg !28 + %.01.i723 = select i1 %.not4.i722, float %1062, float %1061, !dbg !28 + %1063 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i723) #6, !dbg !28 + %1064 = extractelement <16 x float> %1039, i64 3, !dbg !28 + %1065 = fadd float %1064, 0xC168000FE0000000, !dbg !28 + %1066 = fneg float %1065, !dbg !28 + %1067 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3FF7154760000000, float %1066) #6, !dbg !28 + %1068 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3FF7154760000000, float %1066) #6, !dbg !28 + %.0.i731 = select i1 %.not3.i730, float %1068, float %1067, !dbg !28 + %1069 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %881, float 0x3E54AE0C00000000, float %.0.i731) #6, !dbg !28 + %1070 = tail call float @llvm.nvvm.fma.rn.f(float %881, float 0x3E54AE0C00000000, float %.0.i731) #6, !dbg !28 + %.01.i733 = select i1 %.not4.i732, float %1070, float %1069, !dbg !28 + %1071 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i733) #6, !dbg !28 + %1072 = extractelement <16 x float> %1039, i64 4, !dbg !28 + %1073 = fadd float %1072, 0xC168000FE0000000, !dbg !28 + %1074 = fneg float %1073, !dbg !28 + %1075 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3FF7154760000000, float %1074) #6, !dbg !28 + %1076 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3FF7154760000000, float %1074) #6, !dbg !28 + %.0.i741 = select i1 %.not3.i740, float %1076, float %1075, !dbg !28 + %1077 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %882, float 0x3E54AE0C00000000, float %.0.i741) #6, !dbg !28 + %1078 = tail call float @llvm.nvvm.fma.rn.f(float %882, float 0x3E54AE0C00000000, float %.0.i741) #6, !dbg !28 + %.01.i743 = select i1 %.not4.i742, float %1078, float %1077, !dbg !28 + %1079 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i743) #6, !dbg !28 + %1080 = extractelement <16 x float> %1039, i64 5, !dbg !28 + %1081 = fadd float %1080, 0xC168000FE0000000, !dbg !28 + %1082 = fneg float %1081, !dbg !28 + %1083 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3FF7154760000000, float %1082) #6, !dbg !28 + %1084 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3FF7154760000000, float %1082) #6, !dbg !28 + %.0.i751 = select i1 %.not3.i750, float %1084, float %1083, !dbg !28 + %1085 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %883, float 0x3E54AE0C00000000, float %.0.i751) #6, !dbg !28 + %1086 = tail call float @llvm.nvvm.fma.rn.f(float %883, float 0x3E54AE0C00000000, float %.0.i751) #6, !dbg !28 + %.01.i753 = select i1 %.not4.i752, float %1086, float %1085, !dbg !28 + %1087 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i753) #6, !dbg !28 + %1088 = extractelement <16 x float> %1039, i64 6, !dbg !28 + %1089 = fadd float %1088, 0xC168000FE0000000, !dbg !28 + %1090 = fneg float %1089, !dbg !28 + %1091 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3FF7154760000000, float %1090) #6, !dbg !28 + %1092 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3FF7154760000000, float %1090) #6, !dbg !28 + %.0.i761 = select i1 %.not3.i760, float %1092, float %1091, !dbg !28 + %1093 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %884, float 0x3E54AE0C00000000, float %.0.i761) #6, !dbg !28 + %1094 = tail call float @llvm.nvvm.fma.rn.f(float %884, float 0x3E54AE0C00000000, float %.0.i761) #6, !dbg !28 + %.01.i763 = select i1 %.not4.i762, float %1094, float %1093, !dbg !28 + %1095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i763) #6, !dbg !28 + %1096 = extractelement <16 x float> %1039, i64 7, !dbg !28 + %1097 = fadd float %1096, 0xC168000FE0000000, !dbg !28 + %1098 = fneg float %1097, !dbg !28 + %1099 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3FF7154760000000, float %1098) #6, !dbg !28 + %1100 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3FF7154760000000, float %1098) #6, !dbg !28 + %.0.i771 = select i1 %.not3.i770, float %1100, float %1099, !dbg !28 + %1101 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %885, float 0x3E54AE0C00000000, float %.0.i771) #6, !dbg !28 + %1102 = tail call float @llvm.nvvm.fma.rn.f(float %885, float 0x3E54AE0C00000000, float %.0.i771) #6, !dbg !28 + %.01.i773 = select i1 %.not4.i772, float %1102, float %1101, !dbg !28 + %1103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i773) #6, !dbg !28 + %1104 = extractelement <16 x float> %1039, i64 8, !dbg !28 + %1105 = fadd float %1104, 0xC168000FE0000000, !dbg !28 + %1106 = fneg float %1105, !dbg !28 + %1107 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3FF7154760000000, float %1106) #6, !dbg !28 + %1108 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3FF7154760000000, float %1106) #6, !dbg !28 + %.0.i781 = select i1 %.not3.i780, float %1108, float %1107, !dbg !28 + %1109 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %886, float 0x3E54AE0C00000000, float %.0.i781) #6, !dbg !28 + %1110 = tail call float @llvm.nvvm.fma.rn.f(float %886, float 0x3E54AE0C00000000, float %.0.i781) #6, !dbg !28 + %.01.i783 = select i1 %.not4.i782, float %1110, float %1109, !dbg !28 + %1111 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i783) #6, !dbg !28 + %1112 = extractelement <16 x float> %1039, i64 9, !dbg !28 + %1113 = fadd float %1112, 0xC168000FE0000000, !dbg !28 + %1114 = fneg float %1113, !dbg !28 + %1115 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3FF7154760000000, float %1114) #6, !dbg !28 + %1116 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3FF7154760000000, float %1114) #6, !dbg !28 + %.0.i791 = select i1 %.not3.i790, float %1116, float %1115, !dbg !28 + %1117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %887, float 0x3E54AE0C00000000, float %.0.i791) #6, !dbg !28 + %1118 = tail call float @llvm.nvvm.fma.rn.f(float %887, float 0x3E54AE0C00000000, float %.0.i791) #6, !dbg !28 + %.01.i793 = select i1 %.not4.i792, float %1118, float %1117, !dbg !28 + %1119 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i793) #6, !dbg !28 + %1120 = extractelement <16 x float> %1039, i64 10, !dbg !28 + %1121 = fadd float %1120, 0xC168000FE0000000, !dbg !28 + %1122 = fneg float %1121, !dbg !28 + %1123 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3FF7154760000000, float %1122) #6, !dbg !28 + %1124 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3FF7154760000000, float %1122) #6, !dbg !28 + %.0.i801 = select i1 %.not3.i800, float %1124, float %1123, !dbg !28 + %1125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %888, float 0x3E54AE0C00000000, float %.0.i801) #6, !dbg !28 + %1126 = tail call float @llvm.nvvm.fma.rn.f(float %888, float 0x3E54AE0C00000000, float %.0.i801) #6, !dbg !28 + %.01.i803 = select i1 %.not4.i802, float %1126, float %1125, !dbg !28 + %1127 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i803) #6, !dbg !28 + %1128 = extractelement <16 x float> %1039, i64 11, !dbg !28 + %1129 = fadd float %1128, 0xC168000FE0000000, !dbg !28 + %1130 = fneg float %1129, !dbg !28 + %1131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3FF7154760000000, float %1130) #6, !dbg !28 + %1132 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3FF7154760000000, float %1130) #6, !dbg !28 + %.0.i811 = select i1 %.not3.i810, float %1132, float %1131, !dbg !28 + %1133 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %889, float 0x3E54AE0C00000000, float %.0.i811) #6, !dbg !28 + %1134 = tail call float @llvm.nvvm.fma.rn.f(float %889, float 0x3E54AE0C00000000, float %.0.i811) #6, !dbg !28 + %.01.i813 = select i1 %.not4.i812, float %1134, float %1133, !dbg !28 + %1135 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i813) #6, !dbg !28 + %1136 = extractelement <16 x float> %1039, i64 12, !dbg !28 + %1137 = fadd float %1136, 0xC168000FE0000000, !dbg !28 + %1138 = fneg float %1137, !dbg !28 + %1139 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3FF7154760000000, float %1138) #6, !dbg !28 + %1140 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3FF7154760000000, float %1138) #6, !dbg !28 + %.0.i821 = select i1 %.not3.i820, float %1140, float %1139, !dbg !28 + %1141 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %890, float 0x3E54AE0C00000000, float %.0.i821) #6, !dbg !28 + %1142 = tail call float @llvm.nvvm.fma.rn.f(float %890, float 0x3E54AE0C00000000, float %.0.i821) #6, !dbg !28 + %.01.i823 = select i1 %.not4.i822, float %1142, float %1141, !dbg !28 + %1143 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i823) #6, !dbg !28 + %1144 = extractelement <16 x float> %1039, i64 13, !dbg !28 + %1145 = fadd float %1144, 0xC168000FE0000000, !dbg !28 + %1146 = fneg float %1145, !dbg !28 + %1147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3FF7154760000000, float %1146) #6, !dbg !28 + %1148 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3FF7154760000000, float %1146) #6, !dbg !28 + %.0.i831 = select i1 %.not3.i830, float %1148, float %1147, !dbg !28 + %1149 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %891, float 0x3E54AE0C00000000, float %.0.i831) #6, !dbg !28 + %1150 = tail call float @llvm.nvvm.fma.rn.f(float %891, float 0x3E54AE0C00000000, float %.0.i831) #6, !dbg !28 + %.01.i833 = select i1 %.not4.i832, float %1150, float %1149, !dbg !28 + %1151 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i833) #6, !dbg !28 + %1152 = extractelement <16 x float> %1039, i64 14, !dbg !28 + %1153 = fadd float %1152, 0xC168000FE0000000, !dbg !28 + %1154 = fneg float %1153, !dbg !28 + %1155 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3FF7154760000000, float %1154) #6, !dbg !28 + %1156 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3FF7154760000000, float %1154) #6, !dbg !28 + %.0.i841 = select i1 %.not3.i840, float %1156, float %1155, !dbg !28 + %1157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %892, float 0x3E54AE0C00000000, float %.0.i841) #6, !dbg !28 + %1158 = tail call float @llvm.nvvm.fma.rn.f(float %892, float 0x3E54AE0C00000000, float %.0.i841) #6, !dbg !28 + %.01.i843 = select i1 %.not4.i842, float %1158, float %1157, !dbg !28 + %1159 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i843) #6, !dbg !28 + %1160 = extractelement <16 x float> %1039, i64 15, !dbg !28 + %1161 = fadd float %1160, 0xC168000FE0000000, !dbg !28 + %1162 = fneg float %1161, !dbg !28 + %1163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3FF7154760000000, float %1162) #6, !dbg !28 + %1164 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3FF7154760000000, float %1162) #6, !dbg !28 + %.0.i851 = select i1 %.not3.i850, float %1164, float %1163, !dbg !28 + %1165 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %893, float 0x3E54AE0C00000000, float %.0.i851) #6, !dbg !28 + %1166 = tail call float @llvm.nvvm.fma.rn.f(float %893, float 0x3E54AE0C00000000, float %.0.i851) #6, !dbg !28 + %.01.i853 = select i1 %.not4.i852, float %1166, float %1165, !dbg !28 + %1167 = bitcast <16 x float> %1039 to <16 x i32>, !dbg !28 + %1168 = shl <16 x i32> %1167, splat (i32 23), !dbg !28 + %1169 = bitcast <16 x i32> %1168 to <16 x float>, !dbg !28 + %1170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i853) #6, !dbg !28 + %1171 = insertelement <16 x float> poison, float %1047, i64 0, !dbg !28 + %1172 = insertelement <16 x float> %1171, float %1055, i64 1, !dbg !28 + %1173 = insertelement <16 x float> %1172, float %1063, i64 2, !dbg !28 + %1174 = insertelement <16 x float> %1173, float %1071, i64 3, !dbg !28 + %1175 = insertelement <16 x float> %1174, float %1079, i64 4, !dbg !28 + %1176 = insertelement <16 x float> %1175, float %1087, i64 5, !dbg !28 + %1177 = insertelement <16 x float> %1176, float %1095, i64 6, !dbg !28 + %1178 = insertelement <16 x float> %1177, float %1103, i64 7, !dbg !28 + %1179 = insertelement <16 x float> %1178, float %1111, i64 8, !dbg !28 + %1180 = insertelement <16 x float> %1179, float %1119, i64 9, !dbg !28 + %1181 = insertelement <16 x float> %1180, float %1127, i64 10, !dbg !28 + %1182 = insertelement <16 x float> %1181, float %1135, i64 11, !dbg !28 + %1183 = insertelement <16 x float> %1182, float %1143, i64 12, !dbg !28 + %1184 = insertelement <16 x float> %1183, float %1151, i64 13, !dbg !28 + %1185 = insertelement <16 x float> %1184, float %1159, i64 14, !dbg !28 + %1186 = insertelement <16 x float> %1185, float %1170, i64 15, !dbg !28 + %1187 = fmul <16 x float> %1186, %1169, !dbg !28 + %1188 = select <16 x i1> %877, <16 x float> splat (float 1.000000e+00), <16 x float> %1187, !dbg !29 + %foldExtExtBinop1589 = fsub <16 x float> %873, %876, !dbg !25 + %1189 = extractelement <16 x float> %foldExtExtBinop1589, i64 0, !dbg !25 + %foldExtExtBinop1591 = fsub <16 x float> %873, %876, !dbg !25 + %1190 = extractelement <16 x float> %foldExtExtBinop1591, i64 1, !dbg !25 + %foldExtExtBinop1593 = fsub <16 x float> %873, %876, !dbg !25 + %1191 = extractelement <16 x float> %foldExtExtBinop1593, i64 2, !dbg !25 + %foldExtExtBinop1595 = fsub <16 x float> %873, %876, !dbg !25 + %1192 = extractelement <16 x float> %foldExtExtBinop1595, i64 3, !dbg !25 + %foldExtExtBinop1597 = fsub <16 x float> %873, %876, !dbg !25 + %1193 = extractelement <16 x float> %foldExtExtBinop1597, i64 4, !dbg !25 + %foldExtExtBinop1599 = fsub <16 x float> %873, %876, !dbg !25 + %1194 = extractelement <16 x float> %foldExtExtBinop1599, i64 5, !dbg !25 + %foldExtExtBinop1601 = fsub <16 x float> %873, %876, !dbg !25 + %1195 = extractelement <16 x float> %foldExtExtBinop1601, i64 6, !dbg !25 + %foldExtExtBinop1603 = fsub <16 x float> %873, %876, !dbg !25 + %1196 = extractelement <16 x float> %foldExtExtBinop1603, i64 7, !dbg !25 + %foldExtExtBinop1605 = fsub <16 x float> %873, %876, !dbg !25 + %1197 = extractelement <16 x float> %foldExtExtBinop1605, i64 8, !dbg !25 + %foldExtExtBinop1607 = fsub <16 x float> %873, %876, !dbg !25 + %1198 = extractelement <16 x float> %foldExtExtBinop1607, i64 9, !dbg !25 + %foldExtExtBinop1609 = fsub <16 x float> %873, %876, !dbg !25 + %1199 = extractelement <16 x float> %foldExtExtBinop1609, i64 10, !dbg !25 + %foldExtExtBinop1611 = fsub <16 x float> %873, %876, !dbg !25 + %1200 = extractelement <16 x float> %foldExtExtBinop1611, i64 11, !dbg !25 + %foldExtExtBinop1613 = fsub <16 x float> %873, %876, !dbg !25 + %1201 = extractelement <16 x float> %foldExtExtBinop1613, i64 12, !dbg !25 + %foldExtExtBinop1615 = fsub <16 x float> %873, %876, !dbg !25 + %1202 = extractelement <16 x float> %foldExtExtBinop1615, i64 13, !dbg !25 + %foldExtExtBinop1617 = fsub <16 x float> %873, %876, !dbg !25 + %1203 = extractelement <16 x float> %foldExtExtBinop1617, i64 14, !dbg !25 + %foldExtExtBinop1619 = fsub <16 x float> %873, %876, !dbg !25 + %1204 = extractelement <16 x float> %foldExtExtBinop1619, i64 15, !dbg !25 + %1205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1206 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1015 = select i1 %.not.i1014, float %1206, float %1205, !dbg !28 + %1207 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1015) #6, !dbg !28 + %1208 = tail call float @llvm.nvvm.saturate.f(float %.02.i1015) #6, !dbg !28 + %.03.i1017 = select i1 %.not1.i1016, float %1208, float %1207, !dbg !28 + %1209 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1017, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1210 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1017, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1211 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1212 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1025 = select i1 %.not.i1024, float %1212, float %1211, !dbg !28 + %1213 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1025) #6, !dbg !28 + %1214 = tail call float @llvm.nvvm.saturate.f(float %.02.i1025) #6, !dbg !28 + %.03.i1027 = select i1 %.not1.i1026, float %1214, float %1213, !dbg !28 + %1215 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1027, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1216 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1027, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1218 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1035 = select i1 %.not.i1034, float %1218, float %1217, !dbg !28 + %1219 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1035) #6, !dbg !28 + %1220 = tail call float @llvm.nvvm.saturate.f(float %.02.i1035) #6, !dbg !28 + %.03.i1037 = select i1 %.not1.i1036, float %1220, float %1219, !dbg !28 + %1221 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1037, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1222 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1037, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1224 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1045 = select i1 %.not.i1044, float %1224, float %1223, !dbg !28 + %1225 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1045) #6, !dbg !28 + %1226 = tail call float @llvm.nvvm.saturate.f(float %.02.i1045) #6, !dbg !28 + %.03.i1047 = select i1 %.not1.i1046, float %1226, float %1225, !dbg !28 + %1227 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1047, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1228 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1047, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1230 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1055 = select i1 %.not.i1054, float %1230, float %1229, !dbg !28 + %1231 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1055) #6, !dbg !28 + %1232 = tail call float @llvm.nvvm.saturate.f(float %.02.i1055) #6, !dbg !28 + %.03.i1057 = select i1 %.not1.i1056, float %1232, float %1231, !dbg !28 + %1233 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1057, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1234 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1057, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1236 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1065 = select i1 %.not.i1064, float %1236, float %1235, !dbg !28 + %1237 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1065) #6, !dbg !28 + %1238 = tail call float @llvm.nvvm.saturate.f(float %.02.i1065) #6, !dbg !28 + %.03.i1067 = select i1 %.not1.i1066, float %1238, float %1237, !dbg !28 + %1239 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1067, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1240 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1067, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1242 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1075 = select i1 %.not.i1074, float %1242, float %1241, !dbg !28 + %1243 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1075) #6, !dbg !28 + %1244 = tail call float @llvm.nvvm.saturate.f(float %.02.i1075) #6, !dbg !28 + %.03.i1077 = select i1 %.not1.i1076, float %1244, float %1243, !dbg !28 + %1245 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1077, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1246 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1077, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1247 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1248 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1085 = select i1 %.not.i1084, float %1248, float %1247, !dbg !28 + %1249 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1085) #6, !dbg !28 + %1250 = tail call float @llvm.nvvm.saturate.f(float %.02.i1085) #6, !dbg !28 + %.03.i1087 = select i1 %.not1.i1086, float %1250, float %1249, !dbg !28 + %1251 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1087, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1252 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1087, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1253 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1254 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1095 = select i1 %.not.i1094, float %1254, float %1253, !dbg !28 + %1255 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1095) #6, !dbg !28 + %1256 = tail call float @llvm.nvvm.saturate.f(float %.02.i1095) #6, !dbg !28 + %.03.i1097 = select i1 %.not1.i1096, float %1256, float %1255, !dbg !28 + %1257 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1097, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1258 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1097, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1259 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1260 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1105 = select i1 %.not.i1104, float %1260, float %1259, !dbg !28 + %1261 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1105) #6, !dbg !28 + %1262 = tail call float @llvm.nvvm.saturate.f(float %.02.i1105) #6, !dbg !28 + %.03.i1107 = select i1 %.not1.i1106, float %1262, float %1261, !dbg !28 + %1263 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1264 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1265 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1266 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1115 = select i1 %.not.i1114, float %1266, float %1265, !dbg !28 + %1267 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1115) #6, !dbg !28 + %1268 = tail call float @llvm.nvvm.saturate.f(float %.02.i1115) #6, !dbg !28 + %.03.i1117 = select i1 %.not1.i1116, float %1268, float %1267, !dbg !28 + %1269 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1270 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1272 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1125 = select i1 %.not.i1124, float %1272, float %1271, !dbg !28 + %1273 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1125) #6, !dbg !28 + %1274 = tail call float @llvm.nvvm.saturate.f(float %.02.i1125) #6, !dbg !28 + %.03.i1127 = select i1 %.not1.i1126, float %1274, float %1273, !dbg !28 + %1275 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1276 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1277 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1278 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1135 = select i1 %.not.i1134, float %1278, float %1277, !dbg !28 + %1279 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1135) #6, !dbg !28 + %1280 = tail call float @llvm.nvvm.saturate.f(float %.02.i1135) #6, !dbg !28 + %.03.i1137 = select i1 %.not1.i1136, float %1280, float %1279, !dbg !28 + %1281 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1282 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1284 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1145 = select i1 %.not.i1144, float %1284, float %1283, !dbg !28 + %1285 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1145) #6, !dbg !28 + %1286 = tail call float @llvm.nvvm.saturate.f(float %.02.i1145) #6, !dbg !28 + %.03.i1147 = select i1 %.not1.i1146, float %1286, float %1285, !dbg !28 + %1287 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1288 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1290 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1155 = select i1 %.not.i1154, float %1290, float %1289, !dbg !28 + %1291 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1155) #6, !dbg !28 + %1292 = tail call float @llvm.nvvm.saturate.f(float %.02.i1155) #6, !dbg !28 + %.03.i1157 = select i1 %.not1.i1156, float %1292, float %1291, !dbg !28 + %1293 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1294 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1296 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1165 = select i1 %.not.i1164, float %1296, float %1295, !dbg !28 + %1297 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1165) #6, !dbg !28 + %1298 = tail call float @llvm.nvvm.saturate.f(float %.02.i1165) #6, !dbg !28 + %.03.i1167 = select i1 %.not1.i1166, float %1298, float %1297, !dbg !28 + %1299 = insertelement <16 x i32> poison, i32 %510, i64 0, !dbg !28 + %1300 = insertelement <16 x i32> %1299, i32 %515, i64 1, !dbg !28 + %1301 = insertelement <16 x i32> %1300, i32 %520, i64 2, !dbg !28 + %1302 = insertelement <16 x i32> %1301, i32 %525, i64 3, !dbg !28 + %1303 = insertelement <16 x i32> %1302, i32 %530, i64 4, !dbg !28 + %1304 = insertelement <16 x i32> %1303, i32 %535, i64 5, !dbg !28 + %1305 = insertelement <16 x i32> %1304, i32 %540, i64 6, !dbg !28 + %1306 = insertelement <16 x i32> %1305, i32 %545, i64 7, !dbg !28 + %1307 = insertelement <16 x i32> %1306, i32 %550, i64 8, !dbg !28 + %1308 = insertelement <16 x i32> %1307, i32 %555, i64 9, !dbg !28 + %1309 = insertelement <16 x i32> %1308, i32 %560, i64 10, !dbg !28 + %1310 = insertelement <16 x i32> %1309, i32 %565, i64 11, !dbg !28 + %1311 = insertelement <16 x i32> %1310, i32 %570, i64 12, !dbg !28 + %1312 = insertelement <16 x i32> %1311, i32 %575, i64 13, !dbg !28 + %1313 = insertelement <16 x i32> %1312, i32 %580, i64 14, !dbg !28 + %1314 = insertelement <16 x i32> %1313, i32 %585, i64 15, !dbg !28 + %1315 = icmp eq <16 x i32> %1314, zeroinitializer, !dbg !28 + %1316 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1317 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1318 = insertelement <16 x float> poison, float %1210, i64 0, !dbg !28 + %1319 = insertelement <16 x float> %1318, float %1216, i64 1, !dbg !28 + %1320 = insertelement <16 x float> %1319, float %1222, i64 2, !dbg !28 + %1321 = insertelement <16 x float> %1320, float %1228, i64 3, !dbg !28 + %1322 = insertelement <16 x float> %1321, float %1234, i64 4, !dbg !28 + %1323 = insertelement <16 x float> %1322, float %1240, i64 5, !dbg !28 + %1324 = insertelement <16 x float> %1323, float %1246, i64 6, !dbg !28 + %1325 = insertelement <16 x float> %1324, float %1252, i64 7, !dbg !28 + %1326 = insertelement <16 x float> %1325, float %1258, i64 8, !dbg !28 + %1327 = insertelement <16 x float> %1326, float %1264, i64 9, !dbg !28 + %1328 = insertelement <16 x float> %1327, float %1270, i64 10, !dbg !28 + %1329 = insertelement <16 x float> %1328, float %1276, i64 11, !dbg !28 + %1330 = insertelement <16 x float> %1329, float %1282, i64 12, !dbg !28 + %1331 = insertelement <16 x float> %1330, float %1288, i64 13, !dbg !28 + %1332 = insertelement <16 x float> %1331, float %1294, i64 14, !dbg !28 + %1333 = insertelement <16 x float> %1332, float %1317, i64 15, !dbg !28 + %1334 = insertelement <16 x float> poison, float %1209, i64 0, !dbg !28 + %1335 = insertelement <16 x float> %1334, float %1215, i64 1, !dbg !28 + %1336 = insertelement <16 x float> %1335, float %1221, i64 2, !dbg !28 + %1337 = insertelement <16 x float> %1336, float %1227, i64 3, !dbg !28 + %1338 = insertelement <16 x float> %1337, float %1233, i64 4, !dbg !28 + %1339 = insertelement <16 x float> %1338, float %1239, i64 5, !dbg !28 + %1340 = insertelement <16 x float> %1339, float %1245, i64 6, !dbg !28 + %1341 = insertelement <16 x float> %1340, float %1251, i64 7, !dbg !28 + %1342 = insertelement <16 x float> %1341, float %1257, i64 8, !dbg !28 + %1343 = insertelement <16 x float> %1342, float %1263, i64 9, !dbg !28 + %1344 = insertelement <16 x float> %1343, float %1269, i64 10, !dbg !28 + %1345 = insertelement <16 x float> %1344, float %1275, i64 11, !dbg !28 + %1346 = insertelement <16 x float> %1345, float %1281, i64 12, !dbg !28 + %1347 = insertelement <16 x float> %1346, float %1287, i64 13, !dbg !28 + %1348 = insertelement <16 x float> %1347, float %1293, i64 14, !dbg !28 + %1349 = insertelement <16 x float> %1348, float %1316, i64 15, !dbg !28 + %1350 = select <16 x i1> %1315, <16 x float> %1333, <16 x float> %1349, !dbg !28 + %1351 = extractelement <16 x float> %1350, i64 0, !dbg !28 + %1352 = fadd float %1351, 0xC168000FE0000000, !dbg !28 + %1353 = fneg float %1352, !dbg !28 + %1354 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3FF7154760000000, float %1353) #6, !dbg !28 + %1355 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3FF7154760000000, float %1353) #6, !dbg !28 + %.0.i1021 = select i1 %.not3.i1020, float %1355, float %1354, !dbg !28 + %1356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1189, float 0x3E54AE0C00000000, float %.0.i1021) #6, !dbg !28 + %1357 = tail call float @llvm.nvvm.fma.rn.f(float %1189, float 0x3E54AE0C00000000, float %.0.i1021) #6, !dbg !28 + %.01.i1023 = select i1 %.not4.i1022, float %1357, float %1356, !dbg !28 + %1358 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1023) #6, !dbg !28 + %1359 = extractelement <16 x float> %1350, i64 1, !dbg !28 + %1360 = fadd float %1359, 0xC168000FE0000000, !dbg !28 + %1361 = fneg float %1360, !dbg !28 + %1362 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3FF7154760000000, float %1361) #6, !dbg !28 + %1363 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3FF7154760000000, float %1361) #6, !dbg !28 + %.0.i1031 = select i1 %.not3.i1030, float %1363, float %1362, !dbg !28 + %1364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1190, float 0x3E54AE0C00000000, float %.0.i1031) #6, !dbg !28 + %1365 = tail call float @llvm.nvvm.fma.rn.f(float %1190, float 0x3E54AE0C00000000, float %.0.i1031) #6, !dbg !28 + %.01.i1033 = select i1 %.not4.i1032, float %1365, float %1364, !dbg !28 + %1366 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1033) #6, !dbg !28 + %1367 = extractelement <16 x float> %1350, i64 2, !dbg !28 + %1368 = fadd float %1367, 0xC168000FE0000000, !dbg !28 + %1369 = fneg float %1368, !dbg !28 + %1370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3FF7154760000000, float %1369) #6, !dbg !28 + %1371 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3FF7154760000000, float %1369) #6, !dbg !28 + %.0.i1041 = select i1 %.not3.i1040, float %1371, float %1370, !dbg !28 + %1372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1191, float 0x3E54AE0C00000000, float %.0.i1041) #6, !dbg !28 + %1373 = tail call float @llvm.nvvm.fma.rn.f(float %1191, float 0x3E54AE0C00000000, float %.0.i1041) #6, !dbg !28 + %.01.i1043 = select i1 %.not4.i1042, float %1373, float %1372, !dbg !28 + %1374 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1043) #6, !dbg !28 + %1375 = extractelement <16 x float> %1350, i64 3, !dbg !28 + %1376 = fadd float %1375, 0xC168000FE0000000, !dbg !28 + %1377 = fneg float %1376, !dbg !28 + %1378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3FF7154760000000, float %1377) #6, !dbg !28 + %1379 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3FF7154760000000, float %1377) #6, !dbg !28 + %.0.i1051 = select i1 %.not3.i1050, float %1379, float %1378, !dbg !28 + %1380 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1192, float 0x3E54AE0C00000000, float %.0.i1051) #6, !dbg !28 + %1381 = tail call float @llvm.nvvm.fma.rn.f(float %1192, float 0x3E54AE0C00000000, float %.0.i1051) #6, !dbg !28 + %.01.i1053 = select i1 %.not4.i1052, float %1381, float %1380, !dbg !28 + %1382 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1053) #6, !dbg !28 + %1383 = extractelement <16 x float> %1350, i64 4, !dbg !28 + %1384 = fadd float %1383, 0xC168000FE0000000, !dbg !28 + %1385 = fneg float %1384, !dbg !28 + %1386 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3FF7154760000000, float %1385) #6, !dbg !28 + %1387 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3FF7154760000000, float %1385) #6, !dbg !28 + %.0.i1061 = select i1 %.not3.i1060, float %1387, float %1386, !dbg !28 + %1388 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1193, float 0x3E54AE0C00000000, float %.0.i1061) #6, !dbg !28 + %1389 = tail call float @llvm.nvvm.fma.rn.f(float %1193, float 0x3E54AE0C00000000, float %.0.i1061) #6, !dbg !28 + %.01.i1063 = select i1 %.not4.i1062, float %1389, float %1388, !dbg !28 + %1390 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1063) #6, !dbg !28 + %1391 = extractelement <16 x float> %1350, i64 5, !dbg !28 + %1392 = fadd float %1391, 0xC168000FE0000000, !dbg !28 + %1393 = fneg float %1392, !dbg !28 + %1394 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3FF7154760000000, float %1393) #6, !dbg !28 + %1395 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3FF7154760000000, float %1393) #6, !dbg !28 + %.0.i1071 = select i1 %.not3.i1070, float %1395, float %1394, !dbg !28 + %1396 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1194, float 0x3E54AE0C00000000, float %.0.i1071) #6, !dbg !28 + %1397 = tail call float @llvm.nvvm.fma.rn.f(float %1194, float 0x3E54AE0C00000000, float %.0.i1071) #6, !dbg !28 + %.01.i1073 = select i1 %.not4.i1072, float %1397, float %1396, !dbg !28 + %1398 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1073) #6, !dbg !28 + %1399 = extractelement <16 x float> %1350, i64 6, !dbg !28 + %1400 = fadd float %1399, 0xC168000FE0000000, !dbg !28 + %1401 = fneg float %1400, !dbg !28 + %1402 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3FF7154760000000, float %1401) #6, !dbg !28 + %1403 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3FF7154760000000, float %1401) #6, !dbg !28 + %.0.i1081 = select i1 %.not3.i1080, float %1403, float %1402, !dbg !28 + %1404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1195, float 0x3E54AE0C00000000, float %.0.i1081) #6, !dbg !28 + %1405 = tail call float @llvm.nvvm.fma.rn.f(float %1195, float 0x3E54AE0C00000000, float %.0.i1081) #6, !dbg !28 + %.01.i1083 = select i1 %.not4.i1082, float %1405, float %1404, !dbg !28 + %1406 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1083) #6, !dbg !28 + %1407 = extractelement <16 x float> %1350, i64 7, !dbg !28 + %1408 = fadd float %1407, 0xC168000FE0000000, !dbg !28 + %1409 = fneg float %1408, !dbg !28 + %1410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3FF7154760000000, float %1409) #6, !dbg !28 + %1411 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3FF7154760000000, float %1409) #6, !dbg !28 + %.0.i1091 = select i1 %.not3.i1090, float %1411, float %1410, !dbg !28 + %1412 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1196, float 0x3E54AE0C00000000, float %.0.i1091) #6, !dbg !28 + %1413 = tail call float @llvm.nvvm.fma.rn.f(float %1196, float 0x3E54AE0C00000000, float %.0.i1091) #6, !dbg !28 + %.01.i1093 = select i1 %.not4.i1092, float %1413, float %1412, !dbg !28 + %1414 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1093) #6, !dbg !28 + %1415 = extractelement <16 x float> %1350, i64 8, !dbg !28 + %1416 = fadd float %1415, 0xC168000FE0000000, !dbg !28 + %1417 = fneg float %1416, !dbg !28 + %1418 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3FF7154760000000, float %1417) #6, !dbg !28 + %1419 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3FF7154760000000, float %1417) #6, !dbg !28 + %.0.i1101 = select i1 %.not3.i1100, float %1419, float %1418, !dbg !28 + %1420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1197, float 0x3E54AE0C00000000, float %.0.i1101) #6, !dbg !28 + %1421 = tail call float @llvm.nvvm.fma.rn.f(float %1197, float 0x3E54AE0C00000000, float %.0.i1101) #6, !dbg !28 + %.01.i1103 = select i1 %.not4.i1102, float %1421, float %1420, !dbg !28 + %1422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1103) #6, !dbg !28 + %1423 = extractelement <16 x float> %1350, i64 9, !dbg !28 + %1424 = fadd float %1423, 0xC168000FE0000000, !dbg !28 + %1425 = fneg float %1424, !dbg !28 + %1426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3FF7154760000000, float %1425) #6, !dbg !28 + %1427 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3FF7154760000000, float %1425) #6, !dbg !28 + %.0.i1111 = select i1 %.not3.i1110, float %1427, float %1426, !dbg !28 + %1428 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1198, float 0x3E54AE0C00000000, float %.0.i1111) #6, !dbg !28 + %1429 = tail call float @llvm.nvvm.fma.rn.f(float %1198, float 0x3E54AE0C00000000, float %.0.i1111) #6, !dbg !28 + %.01.i1113 = select i1 %.not4.i1112, float %1429, float %1428, !dbg !28 + %1430 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1113) #6, !dbg !28 + %1431 = extractelement <16 x float> %1350, i64 10, !dbg !28 + %1432 = fadd float %1431, 0xC168000FE0000000, !dbg !28 + %1433 = fneg float %1432, !dbg !28 + %1434 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3FF7154760000000, float %1433) #6, !dbg !28 + %1435 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3FF7154760000000, float %1433) #6, !dbg !28 + %.0.i1121 = select i1 %.not3.i1120, float %1435, float %1434, !dbg !28 + %1436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1199, float 0x3E54AE0C00000000, float %.0.i1121) #6, !dbg !28 + %1437 = tail call float @llvm.nvvm.fma.rn.f(float %1199, float 0x3E54AE0C00000000, float %.0.i1121) #6, !dbg !28 + %.01.i1123 = select i1 %.not4.i1122, float %1437, float %1436, !dbg !28 + %1438 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1123) #6, !dbg !28 + %1439 = extractelement <16 x float> %1350, i64 11, !dbg !28 + %1440 = fadd float %1439, 0xC168000FE0000000, !dbg !28 + %1441 = fneg float %1440, !dbg !28 + %1442 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3FF7154760000000, float %1441) #6, !dbg !28 + %1443 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3FF7154760000000, float %1441) #6, !dbg !28 + %.0.i1131 = select i1 %.not3.i1130, float %1443, float %1442, !dbg !28 + %1444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1200, float 0x3E54AE0C00000000, float %.0.i1131) #6, !dbg !28 + %1445 = tail call float @llvm.nvvm.fma.rn.f(float %1200, float 0x3E54AE0C00000000, float %.0.i1131) #6, !dbg !28 + %.01.i1133 = select i1 %.not4.i1132, float %1445, float %1444, !dbg !28 + %1446 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1133) #6, !dbg !28 + %1447 = extractelement <16 x float> %1350, i64 12, !dbg !28 + %1448 = fadd float %1447, 0xC168000FE0000000, !dbg !28 + %1449 = fneg float %1448, !dbg !28 + %1450 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3FF7154760000000, float %1449) #6, !dbg !28 + %1451 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3FF7154760000000, float %1449) #6, !dbg !28 + %.0.i1141 = select i1 %.not3.i1140, float %1451, float %1450, !dbg !28 + %1452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1201, float 0x3E54AE0C00000000, float %.0.i1141) #6, !dbg !28 + %1453 = tail call float @llvm.nvvm.fma.rn.f(float %1201, float 0x3E54AE0C00000000, float %.0.i1141) #6, !dbg !28 + %.01.i1143 = select i1 %.not4.i1142, float %1453, float %1452, !dbg !28 + %1454 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1143) #6, !dbg !28 + %1455 = extractelement <16 x float> %1350, i64 13, !dbg !28 + %1456 = fadd float %1455, 0xC168000FE0000000, !dbg !28 + %1457 = fneg float %1456, !dbg !28 + %1458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3FF7154760000000, float %1457) #6, !dbg !28 + %1459 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3FF7154760000000, float %1457) #6, !dbg !28 + %.0.i1151 = select i1 %.not3.i1150, float %1459, float %1458, !dbg !28 + %1460 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1202, float 0x3E54AE0C00000000, float %.0.i1151) #6, !dbg !28 + %1461 = tail call float @llvm.nvvm.fma.rn.f(float %1202, float 0x3E54AE0C00000000, float %.0.i1151) #6, !dbg !28 + %.01.i1153 = select i1 %.not4.i1152, float %1461, float %1460, !dbg !28 + %1462 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1153) #6, !dbg !28 + %1463 = extractelement <16 x float> %1350, i64 14, !dbg !28 + %1464 = fadd float %1463, 0xC168000FE0000000, !dbg !28 + %1465 = fneg float %1464, !dbg !28 + %1466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3FF7154760000000, float %1465) #6, !dbg !28 + %1467 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3FF7154760000000, float %1465) #6, !dbg !28 + %.0.i1161 = select i1 %.not3.i1160, float %1467, float %1466, !dbg !28 + %1468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1203, float 0x3E54AE0C00000000, float %.0.i1161) #6, !dbg !28 + %1469 = tail call float @llvm.nvvm.fma.rn.f(float %1203, float 0x3E54AE0C00000000, float %.0.i1161) #6, !dbg !28 + %.01.i1163 = select i1 %.not4.i1162, float %1469, float %1468, !dbg !28 + %1470 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1163) #6, !dbg !28 + %1471 = extractelement <16 x float> %1350, i64 15, !dbg !28 + %1472 = fadd float %1471, 0xC168000FE0000000, !dbg !28 + %1473 = fneg float %1472, !dbg !28 + %1474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3FF7154760000000, float %1473) #6, !dbg !28 + %1475 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3FF7154760000000, float %1473) #6, !dbg !28 + %.0.i1171 = select i1 %.not3.i1170, float %1475, float %1474, !dbg !28 + %1476 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1204, float 0x3E54AE0C00000000, float %.0.i1171) #6, !dbg !28 + %1477 = tail call float @llvm.nvvm.fma.rn.f(float %1204, float 0x3E54AE0C00000000, float %.0.i1171) #6, !dbg !28 + %.01.i1173 = select i1 %.not4.i1172, float %1477, float %1476, !dbg !28 + %1478 = bitcast <16 x float> %1350 to <16 x i32>, !dbg !28 + %1479 = shl <16 x i32> %1478, splat (i32 23), !dbg !28 + %1480 = bitcast <16 x i32> %1479 to <16 x float>, !dbg !28 + %1481 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1173) #6, !dbg !28 + %1482 = insertelement <16 x float> poison, float %1358, i64 0, !dbg !28 + %1483 = insertelement <16 x float> %1482, float %1366, i64 1, !dbg !28 + %1484 = insertelement <16 x float> %1483, float %1374, i64 2, !dbg !28 + %1485 = insertelement <16 x float> %1484, float %1382, i64 3, !dbg !28 + %1486 = insertelement <16 x float> %1485, float %1390, i64 4, !dbg !28 + %1487 = insertelement <16 x float> %1486, float %1398, i64 5, !dbg !28 + %1488 = insertelement <16 x float> %1487, float %1406, i64 6, !dbg !28 + %1489 = insertelement <16 x float> %1488, float %1414, i64 7, !dbg !28 + %1490 = insertelement <16 x float> %1489, float %1422, i64 8, !dbg !28 + %1491 = insertelement <16 x float> %1490, float %1430, i64 9, !dbg !28 + %1492 = insertelement <16 x float> %1491, float %1438, i64 10, !dbg !28 + %1493 = insertelement <16 x float> %1492, float %1446, i64 11, !dbg !28 + %1494 = insertelement <16 x float> %1493, float %1454, i64 12, !dbg !28 + %1495 = insertelement <16 x float> %1494, float %1462, i64 13, !dbg !28 + %1496 = insertelement <16 x float> %1495, float %1470, i64 14, !dbg !28 + %1497 = insertelement <16 x float> %1496, float %1481, i64 15, !dbg !28 + %1498 = fmul <16 x float> %1497, %1480, !dbg !28 + %1499 = select <16 x i1> %877, <16 x float> splat (float 1.000000e+00), <16 x float> %1498, !dbg !30 + %1500 = fmul <16 x float> %36, %1188, !dbg !31 + %1501 = fmul float %18, %488, !dbg !31 + %1502 = fmul float %19, %489, !dbg !31 + %1503 = fmul float %20, %490, !dbg !31 + %1504 = fmul float %21, %491, !dbg !31 + %1505 = fmul float %22, %492, !dbg !31 + %1506 = fmul float %23, %493, !dbg !31 + %1507 = fmul float %24, %494, !dbg !31 + %1508 = fmul float %25, %495, !dbg !31 + %1509 = fmul float %26, %496, !dbg !31 + %1510 = fmul float %27, %497, !dbg !31 + %1511 = fadd <16 x float> %1500, %1499, !dbg !32 + %1512 = fadd float %1501, %838, !dbg !32 + %1513 = fadd float %1502, %839, !dbg !32 + %1514 = fadd float %1503, %840, !dbg !32 + %1515 = fadd float %1504, %841, !dbg !32 + %1516 = fadd float %1505, %842, !dbg !32 + %1517 = fadd float %1506, %843, !dbg !32 + %1518 = fadd float %1507, %844, !dbg !32 + %1519 = fadd float %1508, %845, !dbg !32 + %1520 = fadd float %1509, %846, !dbg !32 + %1521 = fadd float %1510, %847, !dbg !32 + %1522 = fpext <2 x bfloat> %95 to <2 x float>, !dbg !17 + %1523 = fcmp ogt <2 x float> %35, %1522, !dbg !18 + %1524 = or <2 x i1> %110, %1523, !dbg !23 + %1525 = select <2 x i1> %1524, <2 x float> %35, <2 x float> %1522, !dbg !24 + %1526 = fcmp oeq <2 x float> %1525, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1621 = fsub <2 x float> %35, %1525, !dbg !27 + %1527 = extractelement <2 x float> %foldExtExtBinop1621, i64 0, !dbg !27 + %foldExtExtBinop1623 = fsub <2 x float> %35, %1525, !dbg !27 + %1528 = extractelement <2 x float> %foldExtExtBinop1623, i64 1, !dbg !27 + %1529 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1530 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i955 = select i1 %.not.i954, float %1530, float %1529, !dbg !28 + %1531 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i955) #6, !dbg !28 + %1532 = tail call float @llvm.nvvm.saturate.f(float %.02.i955) #6, !dbg !28 + %.03.i957 = select i1 %.not1.i956, float %1532, float %1531, !dbg !28 + %1533 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i957, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1534 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i957, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1536 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i965 = select i1 %.not.i964, float %1536, float %1535, !dbg !28 + %1537 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i965) #6, !dbg !28 + %1538 = tail call float @llvm.nvvm.saturate.f(float %.02.i965) #6, !dbg !28 + %.03.i967 = select i1 %.not1.i966, float %1538, float %1537, !dbg !28 + %1539 = insertelement <2 x i32> poison, i32 %460, i64 0, !dbg !28 + %1540 = insertelement <2 x i32> %1539, i32 %465, i64 1, !dbg !28 + %1541 = icmp eq <2 x i32> %1540, zeroinitializer, !dbg !28 + %1542 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i967, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1543 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i967, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1544 = insertelement <2 x float> poison, float %1534, i64 0, !dbg !28 + %1545 = insertelement <2 x float> %1544, float %1543, i64 1, !dbg !28 + %1546 = insertelement <2 x float> poison, float %1533, i64 0, !dbg !28 + %1547 = insertelement <2 x float> %1546, float %1542, i64 1, !dbg !28 + %1548 = select <2 x i1> %1541, <2 x float> %1545, <2 x float> %1547, !dbg !28 + %1549 = extractelement <2 x float> %1548, i64 0, !dbg !28 + %1550 = fadd float %1549, 0xC168000FE0000000, !dbg !28 + %1551 = fneg float %1550, !dbg !28 + %1552 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3FF7154760000000, float %1551) #6, !dbg !28 + %1553 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3FF7154760000000, float %1551) #6, !dbg !28 + %.0.i961 = select i1 %.not3.i960, float %1553, float %1552, !dbg !28 + %1554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1527, float 0x3E54AE0C00000000, float %.0.i961) #6, !dbg !28 + %1555 = tail call float @llvm.nvvm.fma.rn.f(float %1527, float 0x3E54AE0C00000000, float %.0.i961) #6, !dbg !28 + %.01.i963 = select i1 %.not4.i962, float %1555, float %1554, !dbg !28 + %1556 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i963) #6, !dbg !28 + %1557 = extractelement <2 x float> %1548, i64 1, !dbg !28 + %1558 = fadd float %1557, 0xC168000FE0000000, !dbg !28 + %1559 = fneg float %1558, !dbg !28 + %1560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3FF7154760000000, float %1559) #6, !dbg !28 + %1561 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3FF7154760000000, float %1559) #6, !dbg !28 + %.0.i971 = select i1 %.not3.i970, float %1561, float %1560, !dbg !28 + %1562 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1528, float 0x3E54AE0C00000000, float %.0.i971) #6, !dbg !28 + %1563 = tail call float @llvm.nvvm.fma.rn.f(float %1528, float 0x3E54AE0C00000000, float %.0.i971) #6, !dbg !28 + %.01.i973 = select i1 %.not4.i972, float %1563, float %1562, !dbg !28 + %1564 = bitcast <2 x float> %1548 to <2 x i32>, !dbg !28 + %1565 = shl <2 x i32> %1564, splat (i32 23), !dbg !28 + %1566 = bitcast <2 x i32> %1565 to <2 x float>, !dbg !28 + %1567 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i973) #6, !dbg !28 + %1568 = insertelement <2 x float> poison, float %1556, i64 0, !dbg !28 + %1569 = insertelement <2 x float> %1568, float %1567, i64 1, !dbg !28 + %1570 = fmul <2 x float> %1569, %1566, !dbg !28 + %1571 = select <2 x i1> %1526, <2 x float> splat (float 1.000000e+00), <2 x float> %1570, !dbg !29 + %foldExtExtBinop1625 = fsub <2 x float> %1522, %1525, !dbg !25 + %1572 = extractelement <2 x float> %foldExtExtBinop1625, i64 0, !dbg !25 + %foldExtExtBinop1627 = fsub <2 x float> %1522, %1525, !dbg !25 + %1573 = extractelement <2 x float> %foldExtExtBinop1627, i64 1, !dbg !25 + %1574 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1575 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1275 = select i1 %.not.i1274, float %1575, float %1574, !dbg !28 + %1576 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1275) #6, !dbg !28 + %1577 = tail call float @llvm.nvvm.saturate.f(float %.02.i1275) #6, !dbg !28 + %.03.i1277 = select i1 %.not1.i1276, float %1577, float %1576, !dbg !28 + %1578 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1579 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1580 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1581 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1285 = select i1 %.not.i1284, float %1581, float %1580, !dbg !28 + %1582 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1285) #6, !dbg !28 + %1583 = tail call float @llvm.nvvm.saturate.f(float %.02.i1285) #6, !dbg !28 + %.03.i1287 = select i1 %.not1.i1286, float %1583, float %1582, !dbg !28 + %1584 = insertelement <2 x i32> poison, i32 %810, i64 0, !dbg !28 + %1585 = insertelement <2 x i32> %1584, i32 %815, i64 1, !dbg !28 + %1586 = icmp eq <2 x i32> %1585, zeroinitializer, !dbg !28 + %1587 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1588 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1589 = insertelement <2 x float> poison, float %1579, i64 0, !dbg !28 + %1590 = insertelement <2 x float> %1589, float %1588, i64 1, !dbg !28 + %1591 = insertelement <2 x float> poison, float %1578, i64 0, !dbg !28 + %1592 = insertelement <2 x float> %1591, float %1587, i64 1, !dbg !28 + %1593 = select <2 x i1> %1586, <2 x float> %1590, <2 x float> %1592, !dbg !28 + %1594 = extractelement <2 x float> %1593, i64 0, !dbg !28 + %1595 = fadd float %1594, 0xC168000FE0000000, !dbg !28 + %1596 = fneg float %1595, !dbg !28 + %1597 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3FF7154760000000, float %1596) #6, !dbg !28 + %1598 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3FF7154760000000, float %1596) #6, !dbg !28 + %.0.i1281 = select i1 %.not3.i1280, float %1598, float %1597, !dbg !28 + %1599 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1572, float 0x3E54AE0C00000000, float %.0.i1281) #6, !dbg !28 + %1600 = tail call float @llvm.nvvm.fma.rn.f(float %1572, float 0x3E54AE0C00000000, float %.0.i1281) #6, !dbg !28 + %.01.i1283 = select i1 %.not4.i1282, float %1600, float %1599, !dbg !28 + %1601 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1283) #6, !dbg !28 + %1602 = extractelement <2 x float> %1593, i64 1, !dbg !28 + %1603 = fadd float %1602, 0xC168000FE0000000, !dbg !28 + %1604 = fneg float %1603, !dbg !28 + %1605 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3FF7154760000000, float %1604) #6, !dbg !28 + %1606 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3FF7154760000000, float %1604) #6, !dbg !28 + %.0.i1291 = select i1 %.not3.i1290, float %1606, float %1605, !dbg !28 + %1607 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1573, float 0x3E54AE0C00000000, float %.0.i1291) #6, !dbg !28 + %1608 = tail call float @llvm.nvvm.fma.rn.f(float %1573, float 0x3E54AE0C00000000, float %.0.i1291) #6, !dbg !28 + %.01.i1293 = select i1 %.not4.i1292, float %1608, float %1607, !dbg !28 + %1609 = bitcast <2 x float> %1593 to <2 x i32>, !dbg !28 + %1610 = shl <2 x i32> %1609, splat (i32 23), !dbg !28 + %1611 = bitcast <2 x i32> %1610 to <2 x float>, !dbg !28 + %1612 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1293) #6, !dbg !28 + %1613 = insertelement <2 x float> poison, float %1601, i64 0, !dbg !28 + %1614 = insertelement <2 x float> %1613, float %1612, i64 1, !dbg !28 + %1615 = fmul <2 x float> %1614, %1611, !dbg !28 + %1616 = select <2 x i1> %1526, <2 x float> splat (float 1.000000e+00), <2 x float> %1615, !dbg !30 + %1617 = fmul <2 x float> %34, %1571, !dbg !31 + %1618 = fadd <2 x float> %1617, %1616, !dbg !32 + %1619 = fpext <2 x bfloat> %97 to <2 x float>, !dbg !17 + %1620 = fcmp ogt <2 x float> %33, %1619, !dbg !18 + %1621 = or <2 x i1> %111, %1620, !dbg !23 + %1622 = select <2 x i1> %1621, <2 x float> %33, <2 x float> %1619, !dbg !24 + %1623 = fcmp oeq <2 x float> %1622, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1629 = fsub <2 x float> %33, %1622, !dbg !27 + %1624 = extractelement <2 x float> %foldExtExtBinop1629, i64 0, !dbg !27 + %foldExtExtBinop1631 = fsub <2 x float> %33, %1622, !dbg !27 + %1625 = extractelement <2 x float> %foldExtExtBinop1631, i64 1, !dbg !27 + %1626 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1627 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i975 = select i1 %.not.i974, float %1627, float %1626, !dbg !28 + %1628 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i975) #6, !dbg !28 + %1629 = tail call float @llvm.nvvm.saturate.f(float %.02.i975) #6, !dbg !28 + %.03.i977 = select i1 %.not1.i976, float %1629, float %1628, !dbg !28 + %1630 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i977, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1631 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i977, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1632 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1633 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i985 = select i1 %.not.i984, float %1633, float %1632, !dbg !28 + %1634 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i985) #6, !dbg !28 + %1635 = tail call float @llvm.nvvm.saturate.f(float %.02.i985) #6, !dbg !28 + %.03.i987 = select i1 %.not1.i986, float %1635, float %1634, !dbg !28 + %1636 = insertelement <2 x i32> poison, i32 %470, i64 0, !dbg !28 + %1637 = insertelement <2 x i32> %1636, i32 %475, i64 1, !dbg !28 + %1638 = icmp eq <2 x i32> %1637, zeroinitializer, !dbg !28 + %1639 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i987, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1640 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i987, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1641 = insertelement <2 x float> poison, float %1631, i64 0, !dbg !28 + %1642 = insertelement <2 x float> %1641, float %1640, i64 1, !dbg !28 + %1643 = insertelement <2 x float> poison, float %1630, i64 0, !dbg !28 + %1644 = insertelement <2 x float> %1643, float %1639, i64 1, !dbg !28 + %1645 = select <2 x i1> %1638, <2 x float> %1642, <2 x float> %1644, !dbg !28 + %1646 = extractelement <2 x float> %1645, i64 0, !dbg !28 + %1647 = fadd float %1646, 0xC168000FE0000000, !dbg !28 + %1648 = fneg float %1647, !dbg !28 + %1649 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3FF7154760000000, float %1648) #6, !dbg !28 + %1650 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3FF7154760000000, float %1648) #6, !dbg !28 + %.0.i981 = select i1 %.not3.i980, float %1650, float %1649, !dbg !28 + %1651 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1624, float 0x3E54AE0C00000000, float %.0.i981) #6, !dbg !28 + %1652 = tail call float @llvm.nvvm.fma.rn.f(float %1624, float 0x3E54AE0C00000000, float %.0.i981) #6, !dbg !28 + %.01.i983 = select i1 %.not4.i982, float %1652, float %1651, !dbg !28 + %1653 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i983) #6, !dbg !28 + %1654 = extractelement <2 x float> %1645, i64 1, !dbg !28 + %1655 = fadd float %1654, 0xC168000FE0000000, !dbg !28 + %1656 = fneg float %1655, !dbg !28 + %1657 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3FF7154760000000, float %1656) #6, !dbg !28 + %1658 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3FF7154760000000, float %1656) #6, !dbg !28 + %.0.i991 = select i1 %.not3.i990, float %1658, float %1657, !dbg !28 + %1659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1625, float 0x3E54AE0C00000000, float %.0.i991) #6, !dbg !28 + %1660 = tail call float @llvm.nvvm.fma.rn.f(float %1625, float 0x3E54AE0C00000000, float %.0.i991) #6, !dbg !28 + %.01.i993 = select i1 %.not4.i992, float %1660, float %1659, !dbg !28 + %1661 = bitcast <2 x float> %1645 to <2 x i32>, !dbg !28 + %1662 = shl <2 x i32> %1661, splat (i32 23), !dbg !28 + %1663 = bitcast <2 x i32> %1662 to <2 x float>, !dbg !28 + %1664 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i993) #6, !dbg !28 + %1665 = insertelement <2 x float> poison, float %1653, i64 0, !dbg !28 + %1666 = insertelement <2 x float> %1665, float %1664, i64 1, !dbg !28 + %1667 = fmul <2 x float> %1666, %1663, !dbg !28 + %1668 = select <2 x i1> %1623, <2 x float> splat (float 1.000000e+00), <2 x float> %1667, !dbg !29 + %foldExtExtBinop1633 = fsub <2 x float> %1619, %1622, !dbg !25 + %1669 = extractelement <2 x float> %foldExtExtBinop1633, i64 0, !dbg !25 + %foldExtExtBinop1635 = fsub <2 x float> %1619, %1622, !dbg !25 + %1670 = extractelement <2 x float> %foldExtExtBinop1635, i64 1, !dbg !25 + %1671 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1672 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1295 = select i1 %.not.i1294, float %1672, float %1671, !dbg !28 + %1673 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1295) #6, !dbg !28 + %1674 = tail call float @llvm.nvvm.saturate.f(float %.02.i1295) #6, !dbg !28 + %.03.i1297 = select i1 %.not1.i1296, float %1674, float %1673, !dbg !28 + %1675 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1676 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1677 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1678 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1305 = select i1 %.not.i1304, float %1678, float %1677, !dbg !28 + %1679 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1305) #6, !dbg !28 + %1680 = tail call float @llvm.nvvm.saturate.f(float %.02.i1305) #6, !dbg !28 + %.03.i1307 = select i1 %.not1.i1306, float %1680, float %1679, !dbg !28 + %1681 = insertelement <2 x i32> poison, i32 %820, i64 0, !dbg !28 + %1682 = insertelement <2 x i32> %1681, i32 %825, i64 1, !dbg !28 + %1683 = icmp eq <2 x i32> %1682, zeroinitializer, !dbg !28 + %1684 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1685 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1686 = insertelement <2 x float> poison, float %1676, i64 0, !dbg !28 + %1687 = insertelement <2 x float> %1686, float %1685, i64 1, !dbg !28 + %1688 = insertelement <2 x float> poison, float %1675, i64 0, !dbg !28 + %1689 = insertelement <2 x float> %1688, float %1684, i64 1, !dbg !28 + %1690 = select <2 x i1> %1683, <2 x float> %1687, <2 x float> %1689, !dbg !28 + %1691 = extractelement <2 x float> %1690, i64 0, !dbg !28 + %1692 = fadd float %1691, 0xC168000FE0000000, !dbg !28 + %1693 = fneg float %1692, !dbg !28 + %1694 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3FF7154760000000, float %1693) #6, !dbg !28 + %1695 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3FF7154760000000, float %1693) #6, !dbg !28 + %.0.i1301 = select i1 %.not3.i1300, float %1695, float %1694, !dbg !28 + %1696 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1669, float 0x3E54AE0C00000000, float %.0.i1301) #6, !dbg !28 + %1697 = tail call float @llvm.nvvm.fma.rn.f(float %1669, float 0x3E54AE0C00000000, float %.0.i1301) #6, !dbg !28 + %.01.i1303 = select i1 %.not4.i1302, float %1697, float %1696, !dbg !28 + %1698 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1303) #6, !dbg !28 + %1699 = extractelement <2 x float> %1690, i64 1, !dbg !28 + %1700 = fadd float %1699, 0xC168000FE0000000, !dbg !28 + %1701 = fneg float %1700, !dbg !28 + %1702 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3FF7154760000000, float %1701) #6, !dbg !28 + %1703 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3FF7154760000000, float %1701) #6, !dbg !28 + %.0.i1311 = select i1 %.not3.i1310, float %1703, float %1702, !dbg !28 + %1704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1670, float 0x3E54AE0C00000000, float %.0.i1311) #6, !dbg !28 + %1705 = tail call float @llvm.nvvm.fma.rn.f(float %1670, float 0x3E54AE0C00000000, float %.0.i1311) #6, !dbg !28 + %.01.i1313 = select i1 %.not4.i1312, float %1705, float %1704, !dbg !28 + %1706 = bitcast <2 x float> %1690 to <2 x i32>, !dbg !28 + %1707 = shl <2 x i32> %1706, splat (i32 23), !dbg !28 + %1708 = bitcast <2 x i32> %1707 to <2 x float>, !dbg !28 + %1709 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1313) #6, !dbg !28 + %1710 = insertelement <2 x float> poison, float %1698, i64 0, !dbg !28 + %1711 = insertelement <2 x float> %1710, float %1709, i64 1, !dbg !28 + %1712 = fmul <2 x float> %1711, %1708, !dbg !28 + %1713 = select <2 x i1> %1623, <2 x float> splat (float 1.000000e+00), <2 x float> %1712, !dbg !30 + %1714 = fmul <2 x float> %32, %1668, !dbg !31 + %1715 = fadd <2 x float> %1714, %1713, !dbg !32 + %1716 = fpext <2 x bfloat> %99 to <2 x float>, !dbg !17 + %1717 = fcmp ogt <2 x float> %31, %1716, !dbg !18 + %1718 = or <2 x i1> %112, %1717, !dbg !23 + %1719 = select <2 x i1> %1718, <2 x float> %31, <2 x float> %1716, !dbg !24 + %1720 = fcmp oeq <2 x float> %1719, splat (float 0xFFF0000000000000), !dbg !26 + %foldExtExtBinop1637 = fsub <2 x float> %31, %1719, !dbg !27 + %1721 = extractelement <2 x float> %foldExtExtBinop1637, i64 0, !dbg !27 + %foldExtExtBinop1639 = fsub <2 x float> %31, %1719, !dbg !27 + %1722 = extractelement <2 x float> %foldExtExtBinop1639, i64 1, !dbg !27 + %1723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1724 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i995 = select i1 %.not.i994, float %1724, float %1723, !dbg !28 + %1725 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i995) #6, !dbg !28 + %1726 = tail call float @llvm.nvvm.saturate.f(float %.02.i995) #6, !dbg !28 + %.03.i997 = select i1 %.not1.i996, float %1726, float %1725, !dbg !28 + %1727 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i997, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1728 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i997, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1729 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1730 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1005 = select i1 %.not.i1004, float %1730, float %1729, !dbg !28 + %1731 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1005) #6, !dbg !28 + %1732 = tail call float @llvm.nvvm.saturate.f(float %.02.i1005) #6, !dbg !28 + %.03.i1007 = select i1 %.not1.i1006, float %1732, float %1731, !dbg !28 + %1733 = insertelement <2 x i32> poison, i32 %480, i64 0, !dbg !28 + %1734 = insertelement <2 x i32> %1733, i32 %485, i64 1, !dbg !28 + %1735 = icmp eq <2 x i32> %1734, zeroinitializer, !dbg !28 + %1736 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1007, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1737 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1007, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1738 = insertelement <2 x float> poison, float %1728, i64 0, !dbg !28 + %1739 = insertelement <2 x float> %1738, float %1737, i64 1, !dbg !28 + %1740 = insertelement <2 x float> poison, float %1727, i64 0, !dbg !28 + %1741 = insertelement <2 x float> %1740, float %1736, i64 1, !dbg !28 + %1742 = select <2 x i1> %1735, <2 x float> %1739, <2 x float> %1741, !dbg !28 + %1743 = extractelement <2 x float> %1742, i64 0, !dbg !28 + %1744 = fadd float %1743, 0xC168000FE0000000, !dbg !28 + %1745 = fneg float %1744, !dbg !28 + %1746 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3FF7154760000000, float %1745) #6, !dbg !28 + %1747 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3FF7154760000000, float %1745) #6, !dbg !28 + %.0.i1001 = select i1 %.not3.i1000, float %1747, float %1746, !dbg !28 + %1748 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1721, float 0x3E54AE0C00000000, float %.0.i1001) #6, !dbg !28 + %1749 = tail call float @llvm.nvvm.fma.rn.f(float %1721, float 0x3E54AE0C00000000, float %.0.i1001) #6, !dbg !28 + %.01.i1003 = select i1 %.not4.i1002, float %1749, float %1748, !dbg !28 + %1750 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1003) #6, !dbg !28 + %1751 = extractelement <2 x float> %1742, i64 1, !dbg !28 + %1752 = fadd float %1751, 0xC168000FE0000000, !dbg !28 + %1753 = fneg float %1752, !dbg !28 + %1754 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3FF7154760000000, float %1753) #6, !dbg !28 + %1755 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3FF7154760000000, float %1753) #6, !dbg !28 + %.0.i1011 = select i1 %.not3.i1010, float %1755, float %1754, !dbg !28 + %1756 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1722, float 0x3E54AE0C00000000, float %.0.i1011) #6, !dbg !28 + %1757 = tail call float @llvm.nvvm.fma.rn.f(float %1722, float 0x3E54AE0C00000000, float %.0.i1011) #6, !dbg !28 + %.01.i1013 = select i1 %.not4.i1012, float %1757, float %1756, !dbg !28 + %1758 = bitcast <2 x float> %1742 to <2 x i32>, !dbg !28 + %1759 = shl <2 x i32> %1758, splat (i32 23), !dbg !28 + %1760 = bitcast <2 x i32> %1759 to <2 x float>, !dbg !28 + %1761 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1013) #6, !dbg !28 + %1762 = insertelement <2 x float> poison, float %1750, i64 0, !dbg !28 + %1763 = insertelement <2 x float> %1762, float %1761, i64 1, !dbg !28 + %1764 = fmul <2 x float> %1763, %1760, !dbg !28 + %1765 = select <2 x i1> %1720, <2 x float> splat (float 1.000000e+00), <2 x float> %1764, !dbg !29 + %foldExtExtBinop1641 = fsub <2 x float> %1716, %1719, !dbg !25 + %1766 = extractelement <2 x float> %foldExtExtBinop1641, i64 0, !dbg !25 + %foldExtExtBinop1643 = fsub <2 x float> %1716, %1719, !dbg !25 + %1767 = extractelement <2 x float> %foldExtExtBinop1643, i64 1, !dbg !25 + %1768 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1769 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1315 = select i1 %.not.i1314, float %1769, float %1768, !dbg !28 + %1770 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1315) #6, !dbg !28 + %1771 = tail call float @llvm.nvvm.saturate.f(float %.02.i1315) #6, !dbg !28 + %.03.i1317 = select i1 %.not1.i1316, float %1771, float %1770, !dbg !28 + %1772 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1773 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1774 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %1775 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !28 + %.02.i1325 = select i1 %.not.i1324, float %1775, float %1774, !dbg !28 + %1776 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i1325) #6, !dbg !28 + %1777 = tail call float @llvm.nvvm.saturate.f(float %.02.i1325) #6, !dbg !28 + %.03.i1327 = select i1 %.not1.i1326, float %1777, float %1776, !dbg !28 + %1778 = insertelement <2 x i32> poison, i32 %830, i64 0, !dbg !28 + %1779 = insertelement <2 x i32> %1778, i32 %835, i64 1, !dbg !28 + %1780 = icmp eq <2 x i32> %1779, zeroinitializer, !dbg !28 + %1781 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i1327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1782 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i1327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !28 + %1783 = insertelement <2 x float> poison, float %1773, i64 0, !dbg !28 + %1784 = insertelement <2 x float> %1783, float %1782, i64 1, !dbg !28 + %1785 = insertelement <2 x float> poison, float %1772, i64 0, !dbg !28 + %1786 = insertelement <2 x float> %1785, float %1781, i64 1, !dbg !28 + %1787 = select <2 x i1> %1780, <2 x float> %1784, <2 x float> %1786, !dbg !28 + %1788 = extractelement <2 x float> %1787, i64 0, !dbg !28 + %1789 = fadd float %1788, 0xC168000FE0000000, !dbg !28 + %1790 = fneg float %1789, !dbg !28 + %1791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3FF7154760000000, float %1790) #6, !dbg !28 + %1792 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3FF7154760000000, float %1790) #6, !dbg !28 + %.0.i1321 = select i1 %.not3.i1320, float %1792, float %1791, !dbg !28 + %1793 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1766, float 0x3E54AE0C00000000, float %.0.i1321) #6, !dbg !28 + %1794 = tail call float @llvm.nvvm.fma.rn.f(float %1766, float 0x3E54AE0C00000000, float %.0.i1321) #6, !dbg !28 + %.01.i1323 = select i1 %.not4.i1322, float %1794, float %1793, !dbg !28 + %1795 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1323) #6, !dbg !28 + %1796 = extractelement <2 x float> %1787, i64 1, !dbg !28 + %1797 = fadd float %1796, 0xC168000FE0000000, !dbg !28 + %1798 = fneg float %1797, !dbg !28 + %1799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3FF7154760000000, float %1798) #6, !dbg !28 + %1800 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3FF7154760000000, float %1798) #6, !dbg !28 + %.0.i1331 = select i1 %.not3.i1330, float %1800, float %1799, !dbg !28 + %1801 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %1767, float 0x3E54AE0C00000000, float %.0.i1331) #6, !dbg !28 + %1802 = tail call float @llvm.nvvm.fma.rn.f(float %1767, float 0x3E54AE0C00000000, float %.0.i1331) #6, !dbg !28 + %.01.i1333 = select i1 %.not4.i1332, float %1802, float %1801, !dbg !28 + %1803 = bitcast <2 x float> %1787 to <2 x i32>, !dbg !28 + %1804 = shl <2 x i32> %1803, splat (i32 23), !dbg !28 + %1805 = bitcast <2 x i32> %1804 to <2 x float>, !dbg !28 + %1806 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i1333) #6, !dbg !28 + %1807 = insertelement <2 x float> poison, float %1795, i64 0, !dbg !28 + %1808 = insertelement <2 x float> %1807, float %1806, i64 1, !dbg !28 + %1809 = fmul <2 x float> %1808, %1805, !dbg !28 + %1810 = select <2 x i1> %1720, <2 x float> splat (float 1.000000e+00), <2 x float> %1809, !dbg !30 + %1811 = fmul <2 x float> %30, %1765, !dbg !31 + %1812 = fadd <2 x float> %1811, %1810, !dbg !32 + %1813 = select i1 %40, float %128, float %28, !dbg !33 + %1814 = select i1 %40, float %129, float %29, !dbg !33 + %1815 = insertelement <2 x i1> poison, i1 %40, i64 0, !dbg !33 + %1816 = shufflevector <2 x i1> %1815, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !33 + %1817 = select <2 x i1> %1816, <2 x float> %1525, <2 x float> %35, !dbg !33 + %1818 = select <2 x i1> %1816, <2 x float> %1622, <2 x float> %33, !dbg !33 + %1819 = select <2 x i1> %1816, <2 x float> %1719, <2 x float> %31, !dbg !33 + %1820 = select i1 %40, float %1520, float %26, !dbg !34 + %1821 = select i1 %40, float %1521, float %27, !dbg !34 + %1822 = select <2 x i1> %1816, <2 x float> %1618, <2 x float> %34, !dbg !34 + %1823 = select <2 x i1> %1816, <2 x float> %1715, <2 x float> %32, !dbg !34 + %1824 = select <2 x i1> %1816, <2 x float> %1812, <2 x float> %30, !dbg !34 + br i1 %17, label %16, label %1825, !dbg !11 + +1825: ; preds = %16 + %1826 = extractelement <16 x float> %876, i64 15, !dbg !25 + %1827 = extractelement <16 x float> %876, i64 14, !dbg !25 + %1828 = extractelement <16 x float> %876, i64 13, !dbg !25 + %1829 = extractelement <16 x float> %876, i64 12, !dbg !25 + %1830 = extractelement <16 x float> %876, i64 11, !dbg !25 + %1831 = extractelement <16 x float> %876, i64 10, !dbg !25 + %1832 = extractelement <16 x float> %876, i64 9, !dbg !25 + %1833 = extractelement <16 x float> %876, i64 8, !dbg !25 + %1834 = extractelement <16 x float> %876, i64 7, !dbg !25 + %1835 = extractelement <16 x float> %876, i64 6, !dbg !25 + %1836 = extractelement <16 x float> %876, i64 5, !dbg !25 + %1837 = extractelement <16 x float> %876, i64 4, !dbg !25 + %1838 = extractelement <16 x float> %876, i64 3, !dbg !25 + %1839 = extractelement <16 x float> %876, i64 2, !dbg !25 + %1840 = extractelement <16 x float> %876, i64 1, !dbg !25 + %1841 = extractelement <16 x float> %876, i64 0, !dbg !25 + %1842 = and i32 %8, 31, !dbg !9 + %1843 = lshr i32 %8, 5, !dbg !9 + %1844 = shl nuw nsw i32 %9, 2, !dbg !9 + %1845 = fcmp ogt float %1841, %1840, !dbg !35 + %1846 = fcmp uno float %1841, 0.000000e+00, !dbg !37 + %1847 = or i1 %1845, %1846, !dbg !38 + %1848 = select i1 %1847, float %1841, float %1840, !dbg !39 + %1849 = fcmp ogt float %1848, %1839, !dbg !35 + %1850 = fcmp uno float %1848, 0.000000e+00, !dbg !37 + %1851 = or i1 %1849, %1850, !dbg !38 + %1852 = select i1 %1851, float %1848, float %1839, !dbg !39 + %1853 = fcmp ogt float %1852, %1838, !dbg !35 + %1854 = fcmp uno float %1852, 0.000000e+00, !dbg !37 + %1855 = or i1 %1853, %1854, !dbg !38 + %1856 = select i1 %1855, float %1852, float %1838, !dbg !39 + %1857 = fcmp ogt float %1856, %1837, !dbg !35 + %1858 = fcmp uno float %1856, 0.000000e+00, !dbg !37 + %1859 = or i1 %1857, %1858, !dbg !38 + %1860 = select i1 %1859, float %1856, float %1837, !dbg !39 + %1861 = fcmp ogt float %1860, %1836, !dbg !35 + %1862 = fcmp uno float %1860, 0.000000e+00, !dbg !37 + %1863 = or i1 %1861, %1862, !dbg !38 + %1864 = select i1 %1863, float %1860, float %1836, !dbg !39 + %1865 = fcmp ogt float %1864, %1835, !dbg !35 + %1866 = fcmp uno float %1864, 0.000000e+00, !dbg !37 + %1867 = or i1 %1865, %1866, !dbg !38 + %1868 = select i1 %1867, float %1864, float %1835, !dbg !39 + %1869 = fcmp ogt float %1868, %1834, !dbg !35 + %1870 = fcmp uno float %1868, 0.000000e+00, !dbg !37 + %1871 = or i1 %1869, %1870, !dbg !38 + %1872 = select i1 %1871, float %1868, float %1834, !dbg !39 + %1873 = fcmp ogt float %1872, %1833, !dbg !35 + %1874 = fcmp uno float %1872, 0.000000e+00, !dbg !37 + %1875 = or i1 %1873, %1874, !dbg !38 + %1876 = select i1 %1875, float %1872, float %1833, !dbg !39 + %1877 = fcmp ogt float %1876, %1832, !dbg !35 + %1878 = fcmp uno float %1876, 0.000000e+00, !dbg !37 + %1879 = or i1 %1877, %1878, !dbg !38 + %1880 = select i1 %1879, float %1876, float %1832, !dbg !39 + %1881 = fcmp ogt float %1880, %1831, !dbg !35 + %1882 = fcmp uno float %1880, 0.000000e+00, !dbg !37 + %1883 = or i1 %1881, %1882, !dbg !38 + %1884 = select i1 %1883, float %1880, float %1831, !dbg !39 + %1885 = fcmp ogt float %1884, %1830, !dbg !35 + %1886 = fcmp uno float %1884, 0.000000e+00, !dbg !37 + %1887 = or i1 %1885, %1886, !dbg !38 + %1888 = select i1 %1887, float %1884, float %1830, !dbg !39 + %1889 = fcmp ogt float %1888, %1829, !dbg !35 + %1890 = fcmp uno float %1888, 0.000000e+00, !dbg !37 + %1891 = or i1 %1889, %1890, !dbg !38 + %1892 = select i1 %1891, float %1888, float %1829, !dbg !39 + %1893 = fcmp ogt float %1892, %1828, !dbg !35 + %1894 = fcmp uno float %1892, 0.000000e+00, !dbg !37 + %1895 = or i1 %1893, %1894, !dbg !38 + %1896 = select i1 %1895, float %1892, float %1828, !dbg !39 + %1897 = fcmp ogt float %1896, %1827, !dbg !35 + %1898 = fcmp uno float %1896, 0.000000e+00, !dbg !37 + %1899 = or i1 %1897, %1898, !dbg !38 + %1900 = select i1 %1899, float %1896, float %1827, !dbg !39 + %1901 = fcmp ogt float %1900, %1826, !dbg !35 + %1902 = fcmp uno float %1900, 0.000000e+00, !dbg !37 + %1903 = or i1 %1901, %1902, !dbg !38 + %1904 = select i1 %1903, float %1900, float %1826, !dbg !39 + %1905 = fcmp ogt float %1904, %130, !dbg !35 + %1906 = fcmp uno float %1904, 0.000000e+00, !dbg !37 + %1907 = or i1 %1905, %1906, !dbg !38 + %1908 = select i1 %1907, float %1904, float %130, !dbg !39 + %1909 = fcmp ogt float %1908, %132, !dbg !35 + %1910 = fcmp uno float %1908, 0.000000e+00, !dbg !37 + %1911 = or i1 %1909, %1910, !dbg !38 + %1912 = select i1 %1911, float %1908, float %132, !dbg !39 + %1913 = fcmp ogt float %1912, %134, !dbg !35 + %1914 = fcmp uno float %1912, 0.000000e+00, !dbg !37 + %1915 = or i1 %1913, %1914, !dbg !38 + %1916 = select i1 %1915, float %1912, float %134, !dbg !39 + %1917 = fcmp ogt float %1916, %136, !dbg !35 + %1918 = fcmp uno float %1916, 0.000000e+00, !dbg !37 + %1919 = or i1 %1917, %1918, !dbg !38 + %1920 = select i1 %1919, float %1916, float %136, !dbg !39 + %1921 = fcmp ogt float %1920, %138, !dbg !35 + %1922 = fcmp uno float %1920, 0.000000e+00, !dbg !37 + %1923 = or i1 %1921, %1922, !dbg !38 + %1924 = select i1 %1923, float %1920, float %138, !dbg !39 + %1925 = fcmp ogt float %1924, %140, !dbg !35 + %1926 = fcmp uno float %1924, 0.000000e+00, !dbg !37 + %1927 = or i1 %1925, %1926, !dbg !38 + %1928 = select i1 %1927, float %1924, float %140, !dbg !39 + %1929 = fcmp ogt float %1928, %142, !dbg !35 + %1930 = fcmp uno float %1928, 0.000000e+00, !dbg !37 + %1931 = or i1 %1929, %1930, !dbg !38 + %1932 = select i1 %1931, float %1928, float %142, !dbg !39 + %1933 = fcmp ogt float %1932, %144, !dbg !35 + %1934 = fcmp uno float %1932, 0.000000e+00, !dbg !37 + %1935 = or i1 %1933, %1934, !dbg !38 + %1936 = select i1 %1935, float %1932, float %144, !dbg !39 + %1937 = fcmp ogt float %1936, %1813, !dbg !35 + %1938 = fcmp uno float %1936, 0.000000e+00, !dbg !37 + %1939 = or i1 %1937, %1938, !dbg !38 + %1940 = select i1 %1939, float %1936, float %1813, !dbg !39 + %1941 = fcmp ogt float %1940, %1814, !dbg !35 + %1942 = fcmp uno float %1940, 0.000000e+00, !dbg !37 + %1943 = or i1 %1941, %1942, !dbg !38 + %1944 = select i1 %1943, float %1940, float %1814, !dbg !39 + %1945 = extractelement <2 x float> %1817, i64 0, !dbg !35 + %1946 = fcmp ogt float %1944, %1945, !dbg !35 + %1947 = fcmp uno float %1944, 0.000000e+00, !dbg !37 + %1948 = or i1 %1946, %1947, !dbg !38 + %1949 = select i1 %1948, float %1944, float %1945, !dbg !39 + %1950 = extractelement <2 x float> %1817, i64 1, !dbg !35 + %1951 = fcmp ogt float %1949, %1950, !dbg !35 + %1952 = fcmp uno float %1949, 0.000000e+00, !dbg !37 + %1953 = or i1 %1951, %1952, !dbg !38 + %1954 = select i1 %1953, float %1949, float %1950, !dbg !39 + %1955 = extractelement <2 x float> %1818, i64 0, !dbg !35 + %1956 = fcmp ogt float %1954, %1955, !dbg !35 + %1957 = fcmp uno float %1954, 0.000000e+00, !dbg !37 + %1958 = or i1 %1956, %1957, !dbg !38 + %1959 = select i1 %1958, float %1954, float %1955, !dbg !39 + %1960 = extractelement <2 x float> %1818, i64 1, !dbg !35 + %1961 = fcmp ogt float %1959, %1960, !dbg !35 + %1962 = fcmp uno float %1959, 0.000000e+00, !dbg !37 + %1963 = or i1 %1961, %1962, !dbg !38 + %1964 = select i1 %1963, float %1959, float %1960, !dbg !39 + %1965 = extractelement <2 x float> %1819, i64 0, !dbg !35 + %1966 = fcmp ogt float %1964, %1965, !dbg !35 + %1967 = fcmp uno float %1964, 0.000000e+00, !dbg !37 + %1968 = or i1 %1966, %1967, !dbg !38 + %1969 = select i1 %1968, float %1964, float %1965, !dbg !39 + %1970 = extractelement <2 x float> %1819, i64 1, !dbg !35 + %1971 = fcmp ogt float %1969, %1970, !dbg !35 + %1972 = fcmp uno float %1969, 0.000000e+00, !dbg !37 + %1973 = or i1 %1971, %1972, !dbg !38 + %1974 = select i1 %1973, float %1969, float %1970, !dbg !39 + %1975 = bitcast float %1974 to i32, !dbg !40 + %1976 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1975, i32 16, i32 31), !dbg !40 + %1977 = bitcast i32 %1976 to float, !dbg !40 + %1978 = fcmp ogt float %1974, %1977, !dbg !35 + %1979 = fcmp uno float %1974, 0.000000e+00, !dbg !37 + %1980 = or i1 %1979, %1978, !dbg !38 + %1981 = select i1 %1980, float %1974, float %1977, !dbg !39 + %1982 = bitcast float %1981 to i32, !dbg !40 + %1983 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1982, i32 8, i32 31), !dbg !40 + %1984 = bitcast i32 %1983 to float, !dbg !40 + %1985 = fcmp ogt float %1981, %1984, !dbg !35 + %1986 = fcmp uno float %1981, 0.000000e+00, !dbg !37 + %1987 = or i1 %1985, %1986, !dbg !38 + %1988 = select i1 %1987, float %1981, float %1984, !dbg !39 + %1989 = bitcast float %1988 to i32, !dbg !40 + %1990 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1989, i32 4, i32 31), !dbg !40 + %1991 = bitcast i32 %1990 to float, !dbg !40 + %1992 = fcmp ogt float %1988, %1991, !dbg !35 + %1993 = fcmp uno float %1988, 0.000000e+00, !dbg !37 + %1994 = or i1 %1992, %1993, !dbg !38 + %1995 = select i1 %1994, float %1988, float %1991, !dbg !39 + %1996 = bitcast float %1995 to i32, !dbg !40 + %1997 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1996, i32 2, i32 31), !dbg !40 + %1998 = bitcast i32 %1997 to float, !dbg !40 + %1999 = fcmp ogt float %1995, %1998, !dbg !35 + %2000 = fcmp uno float %1995, 0.000000e+00, !dbg !37 + %2001 = or i1 %1999, %2000, !dbg !38 + %2002 = select i1 %2001, float %1995, float %1998, !dbg !39 + %2003 = bitcast float %2002 to i32, !dbg !40 + %2004 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2003, i32 1, i32 31), !dbg !40 + %2005 = bitcast i32 %2004 to float, !dbg !40 + %2006 = fcmp ogt float %2002, %2005, !dbg !35 + %2007 = fcmp uno float %2002, 0.000000e+00, !dbg !37 + %2008 = or i1 %2006, %2007, !dbg !38 + %2009 = and i32 %1843, 15, !dbg !40 + %2010 = icmp eq i32 %1842, 0, !dbg !40 + %2011 = getelementptr float, ptr addrspace(3) @global_smem, i32 %2009, !dbg !40 + %2012 = select i1 %2008, i32 %2003, i32 %2004, !dbg !39 + %2013 = insertelement <1 x i32> poison, i32 %2012, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2011, <1 x i32> %2013, i1 %2010) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %2014 = icmp samesign ult i32 %8, 16, !dbg !40 + %2015 = getelementptr float, ptr addrspace(3) @global_smem, i32 %8, !dbg !40 + %2016 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %2015, i1 %2014) #6, !dbg !40 + %2017 = bitcast i32 %2016 to float, !dbg !40 + %2018 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2016, i32 8, i32 31), !dbg !40 + %2019 = bitcast i32 %2018 to float, !dbg !40 + %2020 = fcmp ogt float %2017, %2019, !dbg !35 + %2021 = fcmp uno float %2017, 0.000000e+00, !dbg !37 + %2022 = or i1 %2021, %2020, !dbg !38 + %2023 = select i1 %2022, float %2017, float %2019, !dbg !39 + %2024 = bitcast float %2023 to i32, !dbg !40 + %2025 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2024, i32 4, i32 31), !dbg !40 + %2026 = bitcast i32 %2025 to float, !dbg !40 + %2027 = fcmp ogt float %2023, %2026, !dbg !35 + %2028 = fcmp uno float %2023, 0.000000e+00, !dbg !37 + %2029 = or i1 %2027, %2028, !dbg !38 + %2030 = select i1 %2029, float %2023, float %2026, !dbg !39 + %2031 = bitcast float %2030 to i32, !dbg !40 + %2032 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2031, i32 2, i32 31), !dbg !40 + %2033 = bitcast i32 %2032 to float, !dbg !40 + %2034 = fcmp ogt float %2030, %2033, !dbg !35 + %2035 = fcmp uno float %2030, 0.000000e+00, !dbg !37 + %2036 = or i1 %2034, %2035, !dbg !38 + %2037 = select i1 %2036, float %2030, float %2033, !dbg !39 + %2038 = bitcast float %2037 to i32, !dbg !40 + %2039 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2038, i32 1, i32 31), !dbg !40 + %2040 = bitcast i32 %2039 to float, !dbg !40 + %2041 = fcmp ogt float %2037, %2040, !dbg !35 + %2042 = fcmp uno float %2037, 0.000000e+00, !dbg !37 + %2043 = or i1 %2041, %2042, !dbg !38 + %2044 = icmp eq i32 %8, 0, !dbg !40 + %2045 = select i1 %2043, i32 %2038, i32 %2039, !dbg !39 + %2046 = insertelement <1 x i32> poison, i32 %2045, i64 0, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2015, <1 x i32> %2046, i1 %2044) #6, !dbg !40 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !40 + %2047 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !40 + %2048 = fcmp oeq float %2047, 0xFFF0000000000000, !dbg !41 + %2049 = fsub float %1841, %2047, !dbg !42 + %2050 = fsub float %1840, %2047, !dbg !42 + %2051 = fsub float %1839, %2047, !dbg !42 + %2052 = fsub float %1838, %2047, !dbg !42 + %2053 = fsub float %1837, %2047, !dbg !42 + %2054 = fsub float %1836, %2047, !dbg !42 + %2055 = fsub float %1835, %2047, !dbg !42 + %2056 = fsub float %1834, %2047, !dbg !42 + %2057 = fsub float %1833, %2047, !dbg !42 + %2058 = fsub float %1832, %2047, !dbg !42 + %2059 = fsub float %1831, %2047, !dbg !42 + %2060 = fsub float %1830, %2047, !dbg !42 + %2061 = fsub float %1829, %2047, !dbg !42 + %2062 = fsub float %1828, %2047, !dbg !42 + %2063 = fsub float %1827, %2047, !dbg !42 + %2064 = fsub float %1826, %2047, !dbg !42 + %2065 = fsub float %130, %2047, !dbg !42 + %2066 = fsub float %132, %2047, !dbg !42 + %2067 = fsub float %134, %2047, !dbg !42 + %2068 = fsub float %136, %2047, !dbg !42 + %2069 = fsub float %138, %2047, !dbg !42 + %2070 = fsub float %140, %2047, !dbg !42 + %2071 = fsub float %142, %2047, !dbg !42 + %2072 = fsub float %144, %2047, !dbg !42 + %2073 = fsub float %1813, %2047, !dbg !42 + %2074 = fsub float %1814, %2047, !dbg !42 + %2075 = fsub float %1945, %2047, !dbg !42 + %2076 = fsub float %1950, %2047, !dbg !42 + %2077 = fsub float %1955, %2047, !dbg !42 + %2078 = fsub float %1960, %2047, !dbg !42 + %2079 = fsub float %1965, %2047, !dbg !42 + %2080 = fsub float %1970, %2047, !dbg !42 + %2081 = select i1 %2048, float 0.000000e+00, float %2049, !dbg !43 + %2082 = select i1 %2048, float 0.000000e+00, float %2050, !dbg !43 + %2083 = select i1 %2048, float 0.000000e+00, float %2051, !dbg !43 + %2084 = select i1 %2048, float 0.000000e+00, float %2052, !dbg !43 + %2085 = select i1 %2048, float 0.000000e+00, float %2053, !dbg !43 + %2086 = select i1 %2048, float 0.000000e+00, float %2054, !dbg !43 + %2087 = select i1 %2048, float 0.000000e+00, float %2055, !dbg !43 + %2088 = select i1 %2048, float 0.000000e+00, float %2056, !dbg !43 + %2089 = select i1 %2048, float 0.000000e+00, float %2057, !dbg !43 + %2090 = select i1 %2048, float 0.000000e+00, float %2058, !dbg !43 + %2091 = select i1 %2048, float 0.000000e+00, float %2059, !dbg !43 + %2092 = select i1 %2048, float 0.000000e+00, float %2060, !dbg !43 + %2093 = select i1 %2048, float 0.000000e+00, float %2061, !dbg !43 + %2094 = select i1 %2048, float 0.000000e+00, float %2062, !dbg !43 + %2095 = select i1 %2048, float 0.000000e+00, float %2063, !dbg !43 + %2096 = select i1 %2048, float 0.000000e+00, float %2064, !dbg !43 + %2097 = select i1 %2048, float 0.000000e+00, float %2065, !dbg !43 + %2098 = select i1 %2048, float 0.000000e+00, float %2066, !dbg !43 + %2099 = select i1 %2048, float 0.000000e+00, float %2067, !dbg !43 + %2100 = select i1 %2048, float 0.000000e+00, float %2068, !dbg !43 + %2101 = select i1 %2048, float 0.000000e+00, float %2069, !dbg !43 + %2102 = select i1 %2048, float 0.000000e+00, float %2070, !dbg !43 + %2103 = select i1 %2048, float 0.000000e+00, float %2071, !dbg !43 + %2104 = select i1 %2048, float 0.000000e+00, float %2072, !dbg !43 + %2105 = select i1 %2048, float 0.000000e+00, float %2073, !dbg !43 + %2106 = select i1 %2048, float 0.000000e+00, float %2074, !dbg !43 + %2107 = select i1 %2048, float 0.000000e+00, float %2075, !dbg !43 + %2108 = select i1 %2048, float 0.000000e+00, float %2076, !dbg !43 + %2109 = select i1 %2048, float 0.000000e+00, float %2077, !dbg !43 + %2110 = select i1 %2048, float 0.000000e+00, float %2078, !dbg !43 + %2111 = select i1 %2048, float 0.000000e+00, float %2079, !dbg !43 + %2112 = select i1 %2048, float 0.000000e+00, float %2080, !dbg !43 + %2113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i = icmp eq i32 %2113, 0, !dbg !44 + %2114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2115 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i = select i1 %.not.i, float %2115, float %2114, !dbg !44 + %2116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i = icmp eq i32 %2116, 0, !dbg !44 + %2117 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i) #6, !dbg !44 + %2118 = tail call float @llvm.nvvm.saturate.f(float %.02.i) #6, !dbg !44 + %.03.i = select i1 %.not1.i, float %2118, float %2117, !dbg !44 + %2119 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i = icmp eq i32 %2119, 0, !dbg !44 + %2120 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2121 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i = select i1 %.not2.i, float %2121, float %2120, !dbg !44 + %2122 = fadd float %.04.i, 0xC168000FE0000000, !dbg !44 + %2123 = fneg float %2122, !dbg !44 + %2124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i = icmp eq i32 %2124, 0, !dbg !44 + %2125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3FF7154760000000, float %2123) #6, !dbg !44 + %2126 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3FF7154760000000, float %2123) #6, !dbg !44 + %.0.i = select i1 %.not3.i, float %2126, float %2125, !dbg !44 + %2127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i = icmp eq i32 %2127, 0, !dbg !44 + %2128 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2081, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %2129 = tail call float @llvm.nvvm.fma.rn.f(float %2081, float 0x3E54AE0C00000000, float %.0.i) #6, !dbg !44 + %.01.i = select i1 %.not4.i, float %2129, float %2128, !dbg !44 + %2130 = bitcast float %.04.i to i32, !dbg !44 + %2131 = shl i32 %2130, 23, !dbg !44 + %2132 = bitcast i32 %2131 to float, !dbg !44 + %2133 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i) #6, !dbg !44 + %2134 = fmul float %2133, %2132, !dbg !44 + %2135 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i64 = icmp eq i32 %2135, 0, !dbg !44 + %2136 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2137 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i65 = select i1 %.not.i64, float %2137, float %2136, !dbg !44 + %2138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i66 = icmp eq i32 %2138, 0, !dbg !44 + %2139 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i65) #6, !dbg !44 + %2140 = tail call float @llvm.nvvm.saturate.f(float %.02.i65) #6, !dbg !44 + %.03.i67 = select i1 %.not1.i66, float %2140, float %2139, !dbg !44 + %2141 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i68 = icmp eq i32 %2141, 0, !dbg !44 + %2142 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i67, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2143 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i67, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i69 = select i1 %.not2.i68, float %2143, float %2142, !dbg !44 + %2144 = fadd float %.04.i69, 0xC168000FE0000000, !dbg !44 + %2145 = fneg float %2144, !dbg !44 + %2146 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i70 = icmp eq i32 %2146, 0, !dbg !44 + %2147 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3FF7154760000000, float %2145) #6, !dbg !44 + %2148 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3FF7154760000000, float %2145) #6, !dbg !44 + %.0.i71 = select i1 %.not3.i70, float %2148, float %2147, !dbg !44 + %2149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i72 = icmp eq i32 %2149, 0, !dbg !44 + %2150 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2082, float 0x3E54AE0C00000000, float %.0.i71) #6, !dbg !44 + %2151 = tail call float @llvm.nvvm.fma.rn.f(float %2082, float 0x3E54AE0C00000000, float %.0.i71) #6, !dbg !44 + %.01.i73 = select i1 %.not4.i72, float %2151, float %2150, !dbg !44 + %2152 = bitcast float %.04.i69 to i32, !dbg !44 + %2153 = shl i32 %2152, 23, !dbg !44 + %2154 = bitcast i32 %2153 to float, !dbg !44 + %2155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i73) #6, !dbg !44 + %2156 = fmul float %2155, %2154, !dbg !44 + %2157 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i74 = icmp eq i32 %2157, 0, !dbg !44 + %2158 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2159 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i75 = select i1 %.not.i74, float %2159, float %2158, !dbg !44 + %2160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i76 = icmp eq i32 %2160, 0, !dbg !44 + %2161 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i75) #6, !dbg !44 + %2162 = tail call float @llvm.nvvm.saturate.f(float %.02.i75) #6, !dbg !44 + %.03.i77 = select i1 %.not1.i76, float %2162, float %2161, !dbg !44 + %2163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i78 = icmp eq i32 %2163, 0, !dbg !44 + %2164 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i77, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2165 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i77, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i79 = select i1 %.not2.i78, float %2165, float %2164, !dbg !44 + %2166 = fadd float %.04.i79, 0xC168000FE0000000, !dbg !44 + %2167 = fneg float %2166, !dbg !44 + %2168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i80 = icmp eq i32 %2168, 0, !dbg !44 + %2169 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3FF7154760000000, float %2167) #6, !dbg !44 + %2170 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3FF7154760000000, float %2167) #6, !dbg !44 + %.0.i81 = select i1 %.not3.i80, float %2170, float %2169, !dbg !44 + %2171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i82 = icmp eq i32 %2171, 0, !dbg !44 + %2172 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2083, float 0x3E54AE0C00000000, float %.0.i81) #6, !dbg !44 + %2173 = tail call float @llvm.nvvm.fma.rn.f(float %2083, float 0x3E54AE0C00000000, float %.0.i81) #6, !dbg !44 + %.01.i83 = select i1 %.not4.i82, float %2173, float %2172, !dbg !44 + %2174 = bitcast float %.04.i79 to i32, !dbg !44 + %2175 = shl i32 %2174, 23, !dbg !44 + %2176 = bitcast i32 %2175 to float, !dbg !44 + %2177 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i83) #6, !dbg !44 + %2178 = fmul float %2177, %2176, !dbg !44 + %2179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i84 = icmp eq i32 %2179, 0, !dbg !44 + %2180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2181 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i85 = select i1 %.not.i84, float %2181, float %2180, !dbg !44 + %2182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i86 = icmp eq i32 %2182, 0, !dbg !44 + %2183 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i85) #6, !dbg !44 + %2184 = tail call float @llvm.nvvm.saturate.f(float %.02.i85) #6, !dbg !44 + %.03.i87 = select i1 %.not1.i86, float %2184, float %2183, !dbg !44 + %2185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i88 = icmp eq i32 %2185, 0, !dbg !44 + %2186 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i87, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2187 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i87, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i89 = select i1 %.not2.i88, float %2187, float %2186, !dbg !44 + %2188 = fadd float %.04.i89, 0xC168000FE0000000, !dbg !44 + %2189 = fneg float %2188, !dbg !44 + %2190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i90 = icmp eq i32 %2190, 0, !dbg !44 + %2191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3FF7154760000000, float %2189) #6, !dbg !44 + %2192 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3FF7154760000000, float %2189) #6, !dbg !44 + %.0.i91 = select i1 %.not3.i90, float %2192, float %2191, !dbg !44 + %2193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i92 = icmp eq i32 %2193, 0, !dbg !44 + %2194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2084, float 0x3E54AE0C00000000, float %.0.i91) #6, !dbg !44 + %2195 = tail call float @llvm.nvvm.fma.rn.f(float %2084, float 0x3E54AE0C00000000, float %.0.i91) #6, !dbg !44 + %.01.i93 = select i1 %.not4.i92, float %2195, float %2194, !dbg !44 + %2196 = bitcast float %.04.i89 to i32, !dbg !44 + %2197 = shl i32 %2196, 23, !dbg !44 + %2198 = bitcast i32 %2197 to float, !dbg !44 + %2199 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i93) #6, !dbg !44 + %2200 = fmul float %2199, %2198, !dbg !44 + %2201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i94 = icmp eq i32 %2201, 0, !dbg !44 + %2202 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2203 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i95 = select i1 %.not.i94, float %2203, float %2202, !dbg !44 + %2204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i96 = icmp eq i32 %2204, 0, !dbg !44 + %2205 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i95) #6, !dbg !44 + %2206 = tail call float @llvm.nvvm.saturate.f(float %.02.i95) #6, !dbg !44 + %.03.i97 = select i1 %.not1.i96, float %2206, float %2205, !dbg !44 + %2207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i98 = icmp eq i32 %2207, 0, !dbg !44 + %2208 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i97, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2209 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i97, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i99 = select i1 %.not2.i98, float %2209, float %2208, !dbg !44 + %2210 = fadd float %.04.i99, 0xC168000FE0000000, !dbg !44 + %2211 = fneg float %2210, !dbg !44 + %2212 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i100 = icmp eq i32 %2212, 0, !dbg !44 + %2213 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3FF7154760000000, float %2211) #6, !dbg !44 + %2214 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3FF7154760000000, float %2211) #6, !dbg !44 + %.0.i101 = select i1 %.not3.i100, float %2214, float %2213, !dbg !44 + %2215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i102 = icmp eq i32 %2215, 0, !dbg !44 + %2216 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2085, float 0x3E54AE0C00000000, float %.0.i101) #6, !dbg !44 + %2217 = tail call float @llvm.nvvm.fma.rn.f(float %2085, float 0x3E54AE0C00000000, float %.0.i101) #6, !dbg !44 + %.01.i103 = select i1 %.not4.i102, float %2217, float %2216, !dbg !44 + %2218 = bitcast float %.04.i99 to i32, !dbg !44 + %2219 = shl i32 %2218, 23, !dbg !44 + %2220 = bitcast i32 %2219 to float, !dbg !44 + %2221 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i103) #6, !dbg !44 + %2222 = fmul float %2221, %2220, !dbg !44 + %2223 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i104 = icmp eq i32 %2223, 0, !dbg !44 + %2224 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2225 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i105 = select i1 %.not.i104, float %2225, float %2224, !dbg !44 + %2226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i106 = icmp eq i32 %2226, 0, !dbg !44 + %2227 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i105) #6, !dbg !44 + %2228 = tail call float @llvm.nvvm.saturate.f(float %.02.i105) #6, !dbg !44 + %.03.i107 = select i1 %.not1.i106, float %2228, float %2227, !dbg !44 + %2229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i108 = icmp eq i32 %2229, 0, !dbg !44 + %2230 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2231 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i107, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i109 = select i1 %.not2.i108, float %2231, float %2230, !dbg !44 + %2232 = fadd float %.04.i109, 0xC168000FE0000000, !dbg !44 + %2233 = fneg float %2232, !dbg !44 + %2234 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i110 = icmp eq i32 %2234, 0, !dbg !44 + %2235 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3FF7154760000000, float %2233) #6, !dbg !44 + %2236 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3FF7154760000000, float %2233) #6, !dbg !44 + %.0.i111 = select i1 %.not3.i110, float %2236, float %2235, !dbg !44 + %2237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i112 = icmp eq i32 %2237, 0, !dbg !44 + %2238 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2086, float 0x3E54AE0C00000000, float %.0.i111) #6, !dbg !44 + %2239 = tail call float @llvm.nvvm.fma.rn.f(float %2086, float 0x3E54AE0C00000000, float %.0.i111) #6, !dbg !44 + %.01.i113 = select i1 %.not4.i112, float %2239, float %2238, !dbg !44 + %2240 = bitcast float %.04.i109 to i32, !dbg !44 + %2241 = shl i32 %2240, 23, !dbg !44 + %2242 = bitcast i32 %2241 to float, !dbg !44 + %2243 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i113) #6, !dbg !44 + %2244 = fmul float %2243, %2242, !dbg !44 + %2245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i114 = icmp eq i32 %2245, 0, !dbg !44 + %2246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2247 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i115 = select i1 %.not.i114, float %2247, float %2246, !dbg !44 + %2248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i116 = icmp eq i32 %2248, 0, !dbg !44 + %2249 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i115) #6, !dbg !44 + %2250 = tail call float @llvm.nvvm.saturate.f(float %.02.i115) #6, !dbg !44 + %.03.i117 = select i1 %.not1.i116, float %2250, float %2249, !dbg !44 + %2251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i118 = icmp eq i32 %2251, 0, !dbg !44 + %2252 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2253 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i117, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i119 = select i1 %.not2.i118, float %2253, float %2252, !dbg !44 + %2254 = fadd float %.04.i119, 0xC168000FE0000000, !dbg !44 + %2255 = fneg float %2254, !dbg !44 + %2256 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i120 = icmp eq i32 %2256, 0, !dbg !44 + %2257 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3FF7154760000000, float %2255) #6, !dbg !44 + %2258 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3FF7154760000000, float %2255) #6, !dbg !44 + %.0.i121 = select i1 %.not3.i120, float %2258, float %2257, !dbg !44 + %2259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i122 = icmp eq i32 %2259, 0, !dbg !44 + %2260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2087, float 0x3E54AE0C00000000, float %.0.i121) #6, !dbg !44 + %2261 = tail call float @llvm.nvvm.fma.rn.f(float %2087, float 0x3E54AE0C00000000, float %.0.i121) #6, !dbg !44 + %.01.i123 = select i1 %.not4.i122, float %2261, float %2260, !dbg !44 + %2262 = bitcast float %.04.i119 to i32, !dbg !44 + %2263 = shl i32 %2262, 23, !dbg !44 + %2264 = bitcast i32 %2263 to float, !dbg !44 + %2265 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i123) #6, !dbg !44 + %2266 = fmul float %2265, %2264, !dbg !44 + %2267 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i124 = icmp eq i32 %2267, 0, !dbg !44 + %2268 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2269 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i125 = select i1 %.not.i124, float %2269, float %2268, !dbg !44 + %2270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i126 = icmp eq i32 %2270, 0, !dbg !44 + %2271 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i125) #6, !dbg !44 + %2272 = tail call float @llvm.nvvm.saturate.f(float %.02.i125) #6, !dbg !44 + %.03.i127 = select i1 %.not1.i126, float %2272, float %2271, !dbg !44 + %2273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i128 = icmp eq i32 %2273, 0, !dbg !44 + %2274 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2275 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i127, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i129 = select i1 %.not2.i128, float %2275, float %2274, !dbg !44 + %2276 = fadd float %.04.i129, 0xC168000FE0000000, !dbg !44 + %2277 = fneg float %2276, !dbg !44 + %2278 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i130 = icmp eq i32 %2278, 0, !dbg !44 + %2279 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3FF7154760000000, float %2277) #6, !dbg !44 + %2280 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3FF7154760000000, float %2277) #6, !dbg !44 + %.0.i131 = select i1 %.not3.i130, float %2280, float %2279, !dbg !44 + %2281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i132 = icmp eq i32 %2281, 0, !dbg !44 + %2282 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2088, float 0x3E54AE0C00000000, float %.0.i131) #6, !dbg !44 + %2283 = tail call float @llvm.nvvm.fma.rn.f(float %2088, float 0x3E54AE0C00000000, float %.0.i131) #6, !dbg !44 + %.01.i133 = select i1 %.not4.i132, float %2283, float %2282, !dbg !44 + %2284 = bitcast float %.04.i129 to i32, !dbg !44 + %2285 = shl i32 %2284, 23, !dbg !44 + %2286 = bitcast i32 %2285 to float, !dbg !44 + %2287 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i133) #6, !dbg !44 + %2288 = fmul float %2287, %2286, !dbg !44 + %2289 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i134 = icmp eq i32 %2289, 0, !dbg !44 + %2290 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2291 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i135 = select i1 %.not.i134, float %2291, float %2290, !dbg !44 + %2292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i136 = icmp eq i32 %2292, 0, !dbg !44 + %2293 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i135) #6, !dbg !44 + %2294 = tail call float @llvm.nvvm.saturate.f(float %.02.i135) #6, !dbg !44 + %.03.i137 = select i1 %.not1.i136, float %2294, float %2293, !dbg !44 + %2295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i138 = icmp eq i32 %2295, 0, !dbg !44 + %2296 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2297 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i137, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i139 = select i1 %.not2.i138, float %2297, float %2296, !dbg !44 + %2298 = fadd float %.04.i139, 0xC168000FE0000000, !dbg !44 + %2299 = fneg float %2298, !dbg !44 + %2300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i140 = icmp eq i32 %2300, 0, !dbg !44 + %2301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3FF7154760000000, float %2299) #6, !dbg !44 + %2302 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3FF7154760000000, float %2299) #6, !dbg !44 + %.0.i141 = select i1 %.not3.i140, float %2302, float %2301, !dbg !44 + %2303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i142 = icmp eq i32 %2303, 0, !dbg !44 + %2304 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2089, float 0x3E54AE0C00000000, float %.0.i141) #6, !dbg !44 + %2305 = tail call float @llvm.nvvm.fma.rn.f(float %2089, float 0x3E54AE0C00000000, float %.0.i141) #6, !dbg !44 + %.01.i143 = select i1 %.not4.i142, float %2305, float %2304, !dbg !44 + %2306 = bitcast float %.04.i139 to i32, !dbg !44 + %2307 = shl i32 %2306, 23, !dbg !44 + %2308 = bitcast i32 %2307 to float, !dbg !44 + %2309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i143) #6, !dbg !44 + %2310 = fmul float %2309, %2308, !dbg !44 + %2311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i144 = icmp eq i32 %2311, 0, !dbg !44 + %2312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2313 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i145 = select i1 %.not.i144, float %2313, float %2312, !dbg !44 + %2314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i146 = icmp eq i32 %2314, 0, !dbg !44 + %2315 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i145) #6, !dbg !44 + %2316 = tail call float @llvm.nvvm.saturate.f(float %.02.i145) #6, !dbg !44 + %.03.i147 = select i1 %.not1.i146, float %2316, float %2315, !dbg !44 + %2317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i148 = icmp eq i32 %2317, 0, !dbg !44 + %2318 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2319 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i147, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i149 = select i1 %.not2.i148, float %2319, float %2318, !dbg !44 + %2320 = fadd float %.04.i149, 0xC168000FE0000000, !dbg !44 + %2321 = fneg float %2320, !dbg !44 + %2322 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i150 = icmp eq i32 %2322, 0, !dbg !44 + %2323 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3FF7154760000000, float %2321) #6, !dbg !44 + %2324 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3FF7154760000000, float %2321) #6, !dbg !44 + %.0.i151 = select i1 %.not3.i150, float %2324, float %2323, !dbg !44 + %2325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i152 = icmp eq i32 %2325, 0, !dbg !44 + %2326 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2090, float 0x3E54AE0C00000000, float %.0.i151) #6, !dbg !44 + %2327 = tail call float @llvm.nvvm.fma.rn.f(float %2090, float 0x3E54AE0C00000000, float %.0.i151) #6, !dbg !44 + %.01.i153 = select i1 %.not4.i152, float %2327, float %2326, !dbg !44 + %2328 = bitcast float %.04.i149 to i32, !dbg !44 + %2329 = shl i32 %2328, 23, !dbg !44 + %2330 = bitcast i32 %2329 to float, !dbg !44 + %2331 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i153) #6, !dbg !44 + %2332 = fmul float %2331, %2330, !dbg !44 + %2333 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i154 = icmp eq i32 %2333, 0, !dbg !44 + %2334 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2335 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i155 = select i1 %.not.i154, float %2335, float %2334, !dbg !44 + %2336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i156 = icmp eq i32 %2336, 0, !dbg !44 + %2337 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i155) #6, !dbg !44 + %2338 = tail call float @llvm.nvvm.saturate.f(float %.02.i155) #6, !dbg !44 + %.03.i157 = select i1 %.not1.i156, float %2338, float %2337, !dbg !44 + %2339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i158 = icmp eq i32 %2339, 0, !dbg !44 + %2340 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2341 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i157, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i159 = select i1 %.not2.i158, float %2341, float %2340, !dbg !44 + %2342 = fadd float %.04.i159, 0xC168000FE0000000, !dbg !44 + %2343 = fneg float %2342, !dbg !44 + %2344 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i160 = icmp eq i32 %2344, 0, !dbg !44 + %2345 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3FF7154760000000, float %2343) #6, !dbg !44 + %2346 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3FF7154760000000, float %2343) #6, !dbg !44 + %.0.i161 = select i1 %.not3.i160, float %2346, float %2345, !dbg !44 + %2347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i162 = icmp eq i32 %2347, 0, !dbg !44 + %2348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2091, float 0x3E54AE0C00000000, float %.0.i161) #6, !dbg !44 + %2349 = tail call float @llvm.nvvm.fma.rn.f(float %2091, float 0x3E54AE0C00000000, float %.0.i161) #6, !dbg !44 + %.01.i163 = select i1 %.not4.i162, float %2349, float %2348, !dbg !44 + %2350 = bitcast float %.04.i159 to i32, !dbg !44 + %2351 = shl i32 %2350, 23, !dbg !44 + %2352 = bitcast i32 %2351 to float, !dbg !44 + %2353 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i163) #6, !dbg !44 + %2354 = fmul float %2353, %2352, !dbg !44 + %2355 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i164 = icmp eq i32 %2355, 0, !dbg !44 + %2356 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2357 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i165 = select i1 %.not.i164, float %2357, float %2356, !dbg !44 + %2358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i166 = icmp eq i32 %2358, 0, !dbg !44 + %2359 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i165) #6, !dbg !44 + %2360 = tail call float @llvm.nvvm.saturate.f(float %.02.i165) #6, !dbg !44 + %.03.i167 = select i1 %.not1.i166, float %2360, float %2359, !dbg !44 + %2361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i168 = icmp eq i32 %2361, 0, !dbg !44 + %2362 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2363 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i167, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i169 = select i1 %.not2.i168, float %2363, float %2362, !dbg !44 + %2364 = fadd float %.04.i169, 0xC168000FE0000000, !dbg !44 + %2365 = fneg float %2364, !dbg !44 + %2366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i170 = icmp eq i32 %2366, 0, !dbg !44 + %2367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3FF7154760000000, float %2365) #6, !dbg !44 + %2368 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3FF7154760000000, float %2365) #6, !dbg !44 + %.0.i171 = select i1 %.not3.i170, float %2368, float %2367, !dbg !44 + %2369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i172 = icmp eq i32 %2369, 0, !dbg !44 + %2370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2092, float 0x3E54AE0C00000000, float %.0.i171) #6, !dbg !44 + %2371 = tail call float @llvm.nvvm.fma.rn.f(float %2092, float 0x3E54AE0C00000000, float %.0.i171) #6, !dbg !44 + %.01.i173 = select i1 %.not4.i172, float %2371, float %2370, !dbg !44 + %2372 = bitcast float %.04.i169 to i32, !dbg !44 + %2373 = shl i32 %2372, 23, !dbg !44 + %2374 = bitcast i32 %2373 to float, !dbg !44 + %2375 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i173) #6, !dbg !44 + %2376 = fmul float %2375, %2374, !dbg !44 + %2377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i174 = icmp eq i32 %2377, 0, !dbg !44 + %2378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2379 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i175 = select i1 %.not.i174, float %2379, float %2378, !dbg !44 + %2380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i176 = icmp eq i32 %2380, 0, !dbg !44 + %2381 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i175) #6, !dbg !44 + %2382 = tail call float @llvm.nvvm.saturate.f(float %.02.i175) #6, !dbg !44 + %.03.i177 = select i1 %.not1.i176, float %2382, float %2381, !dbg !44 + %2383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i178 = icmp eq i32 %2383, 0, !dbg !44 + %2384 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2385 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i177, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i179 = select i1 %.not2.i178, float %2385, float %2384, !dbg !44 + %2386 = fadd float %.04.i179, 0xC168000FE0000000, !dbg !44 + %2387 = fneg float %2386, !dbg !44 + %2388 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i180 = icmp eq i32 %2388, 0, !dbg !44 + %2389 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3FF7154760000000, float %2387) #6, !dbg !44 + %2390 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3FF7154760000000, float %2387) #6, !dbg !44 + %.0.i181 = select i1 %.not3.i180, float %2390, float %2389, !dbg !44 + %2391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i182 = icmp eq i32 %2391, 0, !dbg !44 + %2392 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2093, float 0x3E54AE0C00000000, float %.0.i181) #6, !dbg !44 + %2393 = tail call float @llvm.nvvm.fma.rn.f(float %2093, float 0x3E54AE0C00000000, float %.0.i181) #6, !dbg !44 + %.01.i183 = select i1 %.not4.i182, float %2393, float %2392, !dbg !44 + %2394 = bitcast float %.04.i179 to i32, !dbg !44 + %2395 = shl i32 %2394, 23, !dbg !44 + %2396 = bitcast i32 %2395 to float, !dbg !44 + %2397 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i183) #6, !dbg !44 + %2398 = fmul float %2397, %2396, !dbg !44 + %2399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i184 = icmp eq i32 %2399, 0, !dbg !44 + %2400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2401 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i185 = select i1 %.not.i184, float %2401, float %2400, !dbg !44 + %2402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i186 = icmp eq i32 %2402, 0, !dbg !44 + %2403 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i185) #6, !dbg !44 + %2404 = tail call float @llvm.nvvm.saturate.f(float %.02.i185) #6, !dbg !44 + %.03.i187 = select i1 %.not1.i186, float %2404, float %2403, !dbg !44 + %2405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i188 = icmp eq i32 %2405, 0, !dbg !44 + %2406 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2407 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i187, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i189 = select i1 %.not2.i188, float %2407, float %2406, !dbg !44 + %2408 = fadd float %.04.i189, 0xC168000FE0000000, !dbg !44 + %2409 = fneg float %2408, !dbg !44 + %2410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i190 = icmp eq i32 %2410, 0, !dbg !44 + %2411 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3FF7154760000000, float %2409) #6, !dbg !44 + %2412 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3FF7154760000000, float %2409) #6, !dbg !44 + %.0.i191 = select i1 %.not3.i190, float %2412, float %2411, !dbg !44 + %2413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i192 = icmp eq i32 %2413, 0, !dbg !44 + %2414 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2094, float 0x3E54AE0C00000000, float %.0.i191) #6, !dbg !44 + %2415 = tail call float @llvm.nvvm.fma.rn.f(float %2094, float 0x3E54AE0C00000000, float %.0.i191) #6, !dbg !44 + %.01.i193 = select i1 %.not4.i192, float %2415, float %2414, !dbg !44 + %2416 = bitcast float %.04.i189 to i32, !dbg !44 + %2417 = shl i32 %2416, 23, !dbg !44 + %2418 = bitcast i32 %2417 to float, !dbg !44 + %2419 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i193) #6, !dbg !44 + %2420 = fmul float %2419, %2418, !dbg !44 + %2421 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i194 = icmp eq i32 %2421, 0, !dbg !44 + %2422 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2423 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i195 = select i1 %.not.i194, float %2423, float %2422, !dbg !44 + %2424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i196 = icmp eq i32 %2424, 0, !dbg !44 + %2425 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i195) #6, !dbg !44 + %2426 = tail call float @llvm.nvvm.saturate.f(float %.02.i195) #6, !dbg !44 + %.03.i197 = select i1 %.not1.i196, float %2426, float %2425, !dbg !44 + %2427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i198 = icmp eq i32 %2427, 0, !dbg !44 + %2428 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2429 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i197, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i199 = select i1 %.not2.i198, float %2429, float %2428, !dbg !44 + %2430 = fadd float %.04.i199, 0xC168000FE0000000, !dbg !44 + %2431 = fneg float %2430, !dbg !44 + %2432 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i200 = icmp eq i32 %2432, 0, !dbg !44 + %2433 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3FF7154760000000, float %2431) #6, !dbg !44 + %2434 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3FF7154760000000, float %2431) #6, !dbg !44 + %.0.i201 = select i1 %.not3.i200, float %2434, float %2433, !dbg !44 + %2435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i202 = icmp eq i32 %2435, 0, !dbg !44 + %2436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2095, float 0x3E54AE0C00000000, float %.0.i201) #6, !dbg !44 + %2437 = tail call float @llvm.nvvm.fma.rn.f(float %2095, float 0x3E54AE0C00000000, float %.0.i201) #6, !dbg !44 + %.01.i203 = select i1 %.not4.i202, float %2437, float %2436, !dbg !44 + %2438 = bitcast float %.04.i199 to i32, !dbg !44 + %2439 = shl i32 %2438, 23, !dbg !44 + %2440 = bitcast i32 %2439 to float, !dbg !44 + %2441 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i203) #6, !dbg !44 + %2442 = fmul float %2441, %2440, !dbg !44 + %2443 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i204 = icmp eq i32 %2443, 0, !dbg !44 + %2444 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2445 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i205 = select i1 %.not.i204, float %2445, float %2444, !dbg !44 + %2446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i206 = icmp eq i32 %2446, 0, !dbg !44 + %2447 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i205) #6, !dbg !44 + %2448 = tail call float @llvm.nvvm.saturate.f(float %.02.i205) #6, !dbg !44 + %.03.i207 = select i1 %.not1.i206, float %2448, float %2447, !dbg !44 + %2449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i208 = icmp eq i32 %2449, 0, !dbg !44 + %2450 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2451 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i207, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i209 = select i1 %.not2.i208, float %2451, float %2450, !dbg !44 + %2452 = fadd float %.04.i209, 0xC168000FE0000000, !dbg !44 + %2453 = fneg float %2452, !dbg !44 + %2454 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i210 = icmp eq i32 %2454, 0, !dbg !44 + %2455 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3FF7154760000000, float %2453) #6, !dbg !44 + %2456 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3FF7154760000000, float %2453) #6, !dbg !44 + %.0.i211 = select i1 %.not3.i210, float %2456, float %2455, !dbg !44 + %2457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i212 = icmp eq i32 %2457, 0, !dbg !44 + %2458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2096, float 0x3E54AE0C00000000, float %.0.i211) #6, !dbg !44 + %2459 = tail call float @llvm.nvvm.fma.rn.f(float %2096, float 0x3E54AE0C00000000, float %.0.i211) #6, !dbg !44 + %.01.i213 = select i1 %.not4.i212, float %2459, float %2458, !dbg !44 + %2460 = bitcast float %.04.i209 to i32, !dbg !44 + %2461 = shl i32 %2460, 23, !dbg !44 + %2462 = bitcast i32 %2461 to float, !dbg !44 + %2463 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i213) #6, !dbg !44 + %2464 = fmul float %2463, %2462, !dbg !44 + %2465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i214 = icmp eq i32 %2465, 0, !dbg !44 + %2466 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2467 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i215 = select i1 %.not.i214, float %2467, float %2466, !dbg !44 + %2468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i216 = icmp eq i32 %2468, 0, !dbg !44 + %2469 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i215) #6, !dbg !44 + %2470 = tail call float @llvm.nvvm.saturate.f(float %.02.i215) #6, !dbg !44 + %.03.i217 = select i1 %.not1.i216, float %2470, float %2469, !dbg !44 + %2471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i218 = icmp eq i32 %2471, 0, !dbg !44 + %2472 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2473 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i217, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i219 = select i1 %.not2.i218, float %2473, float %2472, !dbg !44 + %2474 = fadd float %.04.i219, 0xC168000FE0000000, !dbg !44 + %2475 = fneg float %2474, !dbg !44 + %2476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i220 = icmp eq i32 %2476, 0, !dbg !44 + %2477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3FF7154760000000, float %2475) #6, !dbg !44 + %2478 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3FF7154760000000, float %2475) #6, !dbg !44 + %.0.i221 = select i1 %.not3.i220, float %2478, float %2477, !dbg !44 + %2479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i222 = icmp eq i32 %2479, 0, !dbg !44 + %2480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2097, float 0x3E54AE0C00000000, float %.0.i221) #6, !dbg !44 + %2481 = tail call float @llvm.nvvm.fma.rn.f(float %2097, float 0x3E54AE0C00000000, float %.0.i221) #6, !dbg !44 + %.01.i223 = select i1 %.not4.i222, float %2481, float %2480, !dbg !44 + %2482 = bitcast float %.04.i219 to i32, !dbg !44 + %2483 = shl i32 %2482, 23, !dbg !44 + %2484 = bitcast i32 %2483 to float, !dbg !44 + %2485 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i223) #6, !dbg !44 + %2486 = fmul float %2485, %2484, !dbg !44 + %2487 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i224 = icmp eq i32 %2487, 0, !dbg !44 + %2488 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2489 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i225 = select i1 %.not.i224, float %2489, float %2488, !dbg !44 + %2490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i226 = icmp eq i32 %2490, 0, !dbg !44 + %2491 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i225) #6, !dbg !44 + %2492 = tail call float @llvm.nvvm.saturate.f(float %.02.i225) #6, !dbg !44 + %.03.i227 = select i1 %.not1.i226, float %2492, float %2491, !dbg !44 + %2493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i228 = icmp eq i32 %2493, 0, !dbg !44 + %2494 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2495 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i227, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i229 = select i1 %.not2.i228, float %2495, float %2494, !dbg !44 + %2496 = fadd float %.04.i229, 0xC168000FE0000000, !dbg !44 + %2497 = fneg float %2496, !dbg !44 + %2498 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i230 = icmp eq i32 %2498, 0, !dbg !44 + %2499 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3FF7154760000000, float %2497) #6, !dbg !44 + %2500 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3FF7154760000000, float %2497) #6, !dbg !44 + %.0.i231 = select i1 %.not3.i230, float %2500, float %2499, !dbg !44 + %2501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i232 = icmp eq i32 %2501, 0, !dbg !44 + %2502 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2098, float 0x3E54AE0C00000000, float %.0.i231) #6, !dbg !44 + %2503 = tail call float @llvm.nvvm.fma.rn.f(float %2098, float 0x3E54AE0C00000000, float %.0.i231) #6, !dbg !44 + %.01.i233 = select i1 %.not4.i232, float %2503, float %2502, !dbg !44 + %2504 = bitcast float %.04.i229 to i32, !dbg !44 + %2505 = shl i32 %2504, 23, !dbg !44 + %2506 = bitcast i32 %2505 to float, !dbg !44 + %2507 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i233) #6, !dbg !44 + %2508 = fmul float %2507, %2506, !dbg !44 + %2509 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i234 = icmp eq i32 %2509, 0, !dbg !44 + %2510 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2511 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i235 = select i1 %.not.i234, float %2511, float %2510, !dbg !44 + %2512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i236 = icmp eq i32 %2512, 0, !dbg !44 + %2513 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i235) #6, !dbg !44 + %2514 = tail call float @llvm.nvvm.saturate.f(float %.02.i235) #6, !dbg !44 + %.03.i237 = select i1 %.not1.i236, float %2514, float %2513, !dbg !44 + %2515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i238 = icmp eq i32 %2515, 0, !dbg !44 + %2516 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2517 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i237, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i239 = select i1 %.not2.i238, float %2517, float %2516, !dbg !44 + %2518 = fadd float %.04.i239, 0xC168000FE0000000, !dbg !44 + %2519 = fneg float %2518, !dbg !44 + %2520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i240 = icmp eq i32 %2520, 0, !dbg !44 + %2521 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3FF7154760000000, float %2519) #6, !dbg !44 + %2522 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3FF7154760000000, float %2519) #6, !dbg !44 + %.0.i241 = select i1 %.not3.i240, float %2522, float %2521, !dbg !44 + %2523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i242 = icmp eq i32 %2523, 0, !dbg !44 + %2524 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2099, float 0x3E54AE0C00000000, float %.0.i241) #6, !dbg !44 + %2525 = tail call float @llvm.nvvm.fma.rn.f(float %2099, float 0x3E54AE0C00000000, float %.0.i241) #6, !dbg !44 + %.01.i243 = select i1 %.not4.i242, float %2525, float %2524, !dbg !44 + %2526 = bitcast float %.04.i239 to i32, !dbg !44 + %2527 = shl i32 %2526, 23, !dbg !44 + %2528 = bitcast i32 %2527 to float, !dbg !44 + %2529 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i243) #6, !dbg !44 + %2530 = fmul float %2529, %2528, !dbg !44 + %2531 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i244 = icmp eq i32 %2531, 0, !dbg !44 + %2532 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2533 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i245 = select i1 %.not.i244, float %2533, float %2532, !dbg !44 + %2534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i246 = icmp eq i32 %2534, 0, !dbg !44 + %2535 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i245) #6, !dbg !44 + %2536 = tail call float @llvm.nvvm.saturate.f(float %.02.i245) #6, !dbg !44 + %.03.i247 = select i1 %.not1.i246, float %2536, float %2535, !dbg !44 + %2537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i248 = icmp eq i32 %2537, 0, !dbg !44 + %2538 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2539 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i247, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i249 = select i1 %.not2.i248, float %2539, float %2538, !dbg !44 + %2540 = fadd float %.04.i249, 0xC168000FE0000000, !dbg !44 + %2541 = fneg float %2540, !dbg !44 + %2542 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i250 = icmp eq i32 %2542, 0, !dbg !44 + %2543 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3FF7154760000000, float %2541) #6, !dbg !44 + %2544 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3FF7154760000000, float %2541) #6, !dbg !44 + %.0.i251 = select i1 %.not3.i250, float %2544, float %2543, !dbg !44 + %2545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i252 = icmp eq i32 %2545, 0, !dbg !44 + %2546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2100, float 0x3E54AE0C00000000, float %.0.i251) #6, !dbg !44 + %2547 = tail call float @llvm.nvvm.fma.rn.f(float %2100, float 0x3E54AE0C00000000, float %.0.i251) #6, !dbg !44 + %.01.i253 = select i1 %.not4.i252, float %2547, float %2546, !dbg !44 + %2548 = bitcast float %.04.i249 to i32, !dbg !44 + %2549 = shl i32 %2548, 23, !dbg !44 + %2550 = bitcast i32 %2549 to float, !dbg !44 + %2551 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i253) #6, !dbg !44 + %2552 = fmul float %2551, %2550, !dbg !44 + %2553 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i254 = icmp eq i32 %2553, 0, !dbg !44 + %2554 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2555 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i255 = select i1 %.not.i254, float %2555, float %2554, !dbg !44 + %2556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i256 = icmp eq i32 %2556, 0, !dbg !44 + %2557 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i255) #6, !dbg !44 + %2558 = tail call float @llvm.nvvm.saturate.f(float %.02.i255) #6, !dbg !44 + %.03.i257 = select i1 %.not1.i256, float %2558, float %2557, !dbg !44 + %2559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i258 = icmp eq i32 %2559, 0, !dbg !44 + %2560 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2561 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i257, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i259 = select i1 %.not2.i258, float %2561, float %2560, !dbg !44 + %2562 = fadd float %.04.i259, 0xC168000FE0000000, !dbg !44 + %2563 = fneg float %2562, !dbg !44 + %2564 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i260 = icmp eq i32 %2564, 0, !dbg !44 + %2565 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3FF7154760000000, float %2563) #6, !dbg !44 + %2566 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3FF7154760000000, float %2563) #6, !dbg !44 + %.0.i261 = select i1 %.not3.i260, float %2566, float %2565, !dbg !44 + %2567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i262 = icmp eq i32 %2567, 0, !dbg !44 + %2568 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2101, float 0x3E54AE0C00000000, float %.0.i261) #6, !dbg !44 + %2569 = tail call float @llvm.nvvm.fma.rn.f(float %2101, float 0x3E54AE0C00000000, float %.0.i261) #6, !dbg !44 + %.01.i263 = select i1 %.not4.i262, float %2569, float %2568, !dbg !44 + %2570 = bitcast float %.04.i259 to i32, !dbg !44 + %2571 = shl i32 %2570, 23, !dbg !44 + %2572 = bitcast i32 %2571 to float, !dbg !44 + %2573 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i263) #6, !dbg !44 + %2574 = fmul float %2573, %2572, !dbg !44 + %2575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i264 = icmp eq i32 %2575, 0, !dbg !44 + %2576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2577 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i265 = select i1 %.not.i264, float %2577, float %2576, !dbg !44 + %2578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i266 = icmp eq i32 %2578, 0, !dbg !44 + %2579 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i265) #6, !dbg !44 + %2580 = tail call float @llvm.nvvm.saturate.f(float %.02.i265) #6, !dbg !44 + %.03.i267 = select i1 %.not1.i266, float %2580, float %2579, !dbg !44 + %2581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i268 = icmp eq i32 %2581, 0, !dbg !44 + %2582 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2583 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i267, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i269 = select i1 %.not2.i268, float %2583, float %2582, !dbg !44 + %2584 = fadd float %.04.i269, 0xC168000FE0000000, !dbg !44 + %2585 = fneg float %2584, !dbg !44 + %2586 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i270 = icmp eq i32 %2586, 0, !dbg !44 + %2587 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3FF7154760000000, float %2585) #6, !dbg !44 + %2588 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3FF7154760000000, float %2585) #6, !dbg !44 + %.0.i271 = select i1 %.not3.i270, float %2588, float %2587, !dbg !44 + %2589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i272 = icmp eq i32 %2589, 0, !dbg !44 + %2590 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2102, float 0x3E54AE0C00000000, float %.0.i271) #6, !dbg !44 + %2591 = tail call float @llvm.nvvm.fma.rn.f(float %2102, float 0x3E54AE0C00000000, float %.0.i271) #6, !dbg !44 + %.01.i273 = select i1 %.not4.i272, float %2591, float %2590, !dbg !44 + %2592 = bitcast float %.04.i269 to i32, !dbg !44 + %2593 = shl i32 %2592, 23, !dbg !44 + %2594 = bitcast i32 %2593 to float, !dbg !44 + %2595 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i273) #6, !dbg !44 + %2596 = fmul float %2595, %2594, !dbg !44 + %2597 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i274 = icmp eq i32 %2597, 0, !dbg !44 + %2598 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2599 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i275 = select i1 %.not.i274, float %2599, float %2598, !dbg !44 + %2600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i276 = icmp eq i32 %2600, 0, !dbg !44 + %2601 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i275) #6, !dbg !44 + %2602 = tail call float @llvm.nvvm.saturate.f(float %.02.i275) #6, !dbg !44 + %.03.i277 = select i1 %.not1.i276, float %2602, float %2601, !dbg !44 + %2603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i278 = icmp eq i32 %2603, 0, !dbg !44 + %2604 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2605 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i277, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i279 = select i1 %.not2.i278, float %2605, float %2604, !dbg !44 + %2606 = fadd float %.04.i279, 0xC168000FE0000000, !dbg !44 + %2607 = fneg float %2606, !dbg !44 + %2608 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i280 = icmp eq i32 %2608, 0, !dbg !44 + %2609 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3FF7154760000000, float %2607) #6, !dbg !44 + %2610 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3FF7154760000000, float %2607) #6, !dbg !44 + %.0.i281 = select i1 %.not3.i280, float %2610, float %2609, !dbg !44 + %2611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i282 = icmp eq i32 %2611, 0, !dbg !44 + %2612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2103, float 0x3E54AE0C00000000, float %.0.i281) #6, !dbg !44 + %2613 = tail call float @llvm.nvvm.fma.rn.f(float %2103, float 0x3E54AE0C00000000, float %.0.i281) #6, !dbg !44 + %.01.i283 = select i1 %.not4.i282, float %2613, float %2612, !dbg !44 + %2614 = bitcast float %.04.i279 to i32, !dbg !44 + %2615 = shl i32 %2614, 23, !dbg !44 + %2616 = bitcast i32 %2615 to float, !dbg !44 + %2617 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i283) #6, !dbg !44 + %2618 = fmul float %2617, %2616, !dbg !44 + %2619 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i284 = icmp eq i32 %2619, 0, !dbg !44 + %2620 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2621 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i285 = select i1 %.not.i284, float %2621, float %2620, !dbg !44 + %2622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i286 = icmp eq i32 %2622, 0, !dbg !44 + %2623 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i285) #6, !dbg !44 + %2624 = tail call float @llvm.nvvm.saturate.f(float %.02.i285) #6, !dbg !44 + %.03.i287 = select i1 %.not1.i286, float %2624, float %2623, !dbg !44 + %2625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i288 = icmp eq i32 %2625, 0, !dbg !44 + %2626 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2627 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i287, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i289 = select i1 %.not2.i288, float %2627, float %2626, !dbg !44 + %2628 = fadd float %.04.i289, 0xC168000FE0000000, !dbg !44 + %2629 = fneg float %2628, !dbg !44 + %2630 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i290 = icmp eq i32 %2630, 0, !dbg !44 + %2631 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3FF7154760000000, float %2629) #6, !dbg !44 + %2632 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3FF7154760000000, float %2629) #6, !dbg !44 + %.0.i291 = select i1 %.not3.i290, float %2632, float %2631, !dbg !44 + %2633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i292 = icmp eq i32 %2633, 0, !dbg !44 + %2634 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2104, float 0x3E54AE0C00000000, float %.0.i291) #6, !dbg !44 + %2635 = tail call float @llvm.nvvm.fma.rn.f(float %2104, float 0x3E54AE0C00000000, float %.0.i291) #6, !dbg !44 + %.01.i293 = select i1 %.not4.i292, float %2635, float %2634, !dbg !44 + %2636 = bitcast float %.04.i289 to i32, !dbg !44 + %2637 = shl i32 %2636, 23, !dbg !44 + %2638 = bitcast i32 %2637 to float, !dbg !44 + %2639 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i293) #6, !dbg !44 + %2640 = fmul float %2639, %2638, !dbg !44 + %2641 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i294 = icmp eq i32 %2641, 0, !dbg !44 + %2642 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2643 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i295 = select i1 %.not.i294, float %2643, float %2642, !dbg !44 + %2644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i296 = icmp eq i32 %2644, 0, !dbg !44 + %2645 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i295) #6, !dbg !44 + %2646 = tail call float @llvm.nvvm.saturate.f(float %.02.i295) #6, !dbg !44 + %.03.i297 = select i1 %.not1.i296, float %2646, float %2645, !dbg !44 + %2647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i298 = icmp eq i32 %2647, 0, !dbg !44 + %2648 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2649 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i297, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i299 = select i1 %.not2.i298, float %2649, float %2648, !dbg !44 + %2650 = fadd float %.04.i299, 0xC168000FE0000000, !dbg !44 + %2651 = fneg float %2650, !dbg !44 + %2652 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i300 = icmp eq i32 %2652, 0, !dbg !44 + %2653 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3FF7154760000000, float %2651) #6, !dbg !44 + %2654 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3FF7154760000000, float %2651) #6, !dbg !44 + %.0.i301 = select i1 %.not3.i300, float %2654, float %2653, !dbg !44 + %2655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i302 = icmp eq i32 %2655, 0, !dbg !44 + %2656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2105, float 0x3E54AE0C00000000, float %.0.i301) #6, !dbg !44 + %2657 = tail call float @llvm.nvvm.fma.rn.f(float %2105, float 0x3E54AE0C00000000, float %.0.i301) #6, !dbg !44 + %.01.i303 = select i1 %.not4.i302, float %2657, float %2656, !dbg !44 + %2658 = bitcast float %.04.i299 to i32, !dbg !44 + %2659 = shl i32 %2658, 23, !dbg !44 + %2660 = bitcast i32 %2659 to float, !dbg !44 + %2661 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i303) #6, !dbg !44 + %2662 = fmul float %2661, %2660, !dbg !44 + %2663 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i304 = icmp eq i32 %2663, 0, !dbg !44 + %2664 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2665 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i305 = select i1 %.not.i304, float %2665, float %2664, !dbg !44 + %2666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i306 = icmp eq i32 %2666, 0, !dbg !44 + %2667 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i305) #6, !dbg !44 + %2668 = tail call float @llvm.nvvm.saturate.f(float %.02.i305) #6, !dbg !44 + %.03.i307 = select i1 %.not1.i306, float %2668, float %2667, !dbg !44 + %2669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not2.i308 = icmp eq i32 %2669, 0, !dbg !44 + %2670 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2671 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i307, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %.04.i309 = select i1 %.not2.i308, float %2671, float %2670, !dbg !44 + %2672 = fadd float %.04.i309, 0xC168000FE0000000, !dbg !44 + %2673 = fneg float %2672, !dbg !44 + %2674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i310 = icmp eq i32 %2674, 0, !dbg !44 + %2675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3FF7154760000000, float %2673) #6, !dbg !44 + %2676 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3FF7154760000000, float %2673) #6, !dbg !44 + %.0.i311 = select i1 %.not3.i310, float %2676, float %2675, !dbg !44 + %2677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i312 = icmp eq i32 %2677, 0, !dbg !44 + %2678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2106, float 0x3E54AE0C00000000, float %.0.i311) #6, !dbg !44 + %2679 = tail call float @llvm.nvvm.fma.rn.f(float %2106, float 0x3E54AE0C00000000, float %.0.i311) #6, !dbg !44 + %.01.i313 = select i1 %.not4.i312, float %2679, float %2678, !dbg !44 + %2680 = bitcast float %.04.i309 to i32, !dbg !44 + %2681 = shl i32 %2680, 23, !dbg !44 + %2682 = bitcast i32 %2681 to float, !dbg !44 + %2683 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i313) #6, !dbg !44 + %2684 = fmul float %2683, %2682, !dbg !44 + %2685 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i314 = icmp eq i32 %2685, 0, !dbg !44 + %2686 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2687 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i315 = select i1 %.not.i314, float %2687, float %2686, !dbg !44 + %2688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i316 = icmp eq i32 %2688, 0, !dbg !44 + %2689 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i315) #6, !dbg !44 + %2690 = tail call float @llvm.nvvm.saturate.f(float %.02.i315) #6, !dbg !44 + %.03.i317 = select i1 %.not1.i316, float %2690, float %2689, !dbg !44 + %2691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2692 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2693 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i317, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i320 = icmp eq i32 %2694, 0, !dbg !44 + %2695 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i322 = icmp eq i32 %2695, 0, !dbg !44 + %2696 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i324 = icmp eq i32 %2696, 0, !dbg !44 + %2697 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2698 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i325 = select i1 %.not.i324, float %2698, float %2697, !dbg !44 + %2699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i326 = icmp eq i32 %2699, 0, !dbg !44 + %2700 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i325) #6, !dbg !44 + %2701 = tail call float @llvm.nvvm.saturate.f(float %.02.i325) #6, !dbg !44 + %.03.i327 = select i1 %.not1.i326, float %2701, float %2700, !dbg !44 + %2702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2703 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2704 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i327, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2705 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i330 = icmp eq i32 %2705, 0, !dbg !44 + %2706 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i332 = icmp eq i32 %2706, 0, !dbg !44 + %2707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i334 = icmp eq i32 %2707, 0, !dbg !44 + %2708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2709 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i335 = select i1 %.not.i334, float %2709, float %2708, !dbg !44 + %2710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i336 = icmp eq i32 %2710, 0, !dbg !44 + %2711 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i335) #6, !dbg !44 + %2712 = tail call float @llvm.nvvm.saturate.f(float %.02.i335) #6, !dbg !44 + %.03.i337 = select i1 %.not1.i336, float %2712, float %2711, !dbg !44 + %2713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2714 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i337, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2715 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i337, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i340 = icmp eq i32 %2716, 0, !dbg !44 + %2717 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i342 = icmp eq i32 %2717, 0, !dbg !44 + %2718 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i344 = icmp eq i32 %2718, 0, !dbg !44 + %2719 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2720 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i345 = select i1 %.not.i344, float %2720, float %2719, !dbg !44 + %2721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i346 = icmp eq i32 %2721, 0, !dbg !44 + %2722 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i345) #6, !dbg !44 + %2723 = tail call float @llvm.nvvm.saturate.f(float %.02.i345) #6, !dbg !44 + %.03.i347 = select i1 %.not1.i346, float %2723, float %2722, !dbg !44 + %2724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2725 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i347, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2726 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i347, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2727 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i350 = icmp eq i32 %2727, 0, !dbg !44 + %2728 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i352 = icmp eq i32 %2728, 0, !dbg !44 + %2729 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i354 = icmp eq i32 %2729, 0, !dbg !44 + %2730 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2731 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i355 = select i1 %.not.i354, float %2731, float %2730, !dbg !44 + %2732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i356 = icmp eq i32 %2732, 0, !dbg !44 + %2733 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i355) #6, !dbg !44 + %2734 = tail call float @llvm.nvvm.saturate.f(float %.02.i355) #6, !dbg !44 + %.03.i357 = select i1 %.not1.i356, float %2734, float %2733, !dbg !44 + %2735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2736 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i357, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2737 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i357, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i360 = icmp eq i32 %2738, 0, !dbg !44 + %2739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i362 = icmp eq i32 %2739, 0, !dbg !44 + %2740 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not.i364 = icmp eq i32 %2740, 0, !dbg !44 + %2741 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %2742 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !44 + %.02.i365 = select i1 %.not.i364, float %2742, float %2741, !dbg !44 + %2743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not1.i366 = icmp eq i32 %2743, 0, !dbg !44 + %2744 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i365) #6, !dbg !44 + %2745 = tail call float @llvm.nvvm.saturate.f(float %.02.i365) #6, !dbg !44 + %.03.i367 = select i1 %.not1.i366, float %2745, float %2744, !dbg !44 + %2746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %2747 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i367, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2748 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i367, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !44 + %2749 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not3.i370 = icmp eq i32 %2749, 0, !dbg !44 + %2750 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !44 + %.not4.i372 = icmp eq i32 %2750, 0, !dbg !44 + %2751 = extractelement <16 x float> %1511, i64 0, !dbg !45 + %2752 = fmul float %2751, %2134, !dbg !45 + %2753 = extractelement <16 x float> %1511, i64 1, !dbg !45 + %2754 = fmul float %2753, %2156, !dbg !45 + %2755 = extractelement <16 x float> %1511, i64 2, !dbg !45 + %2756 = fmul float %2755, %2178, !dbg !45 + %2757 = extractelement <16 x float> %1511, i64 3, !dbg !45 + %2758 = fmul float %2757, %2200, !dbg !45 + %2759 = extractelement <16 x float> %1511, i64 4, !dbg !45 + %2760 = fmul float %2759, %2222, !dbg !45 + %2761 = extractelement <16 x float> %1511, i64 5, !dbg !45 + %2762 = fmul float %2761, %2244, !dbg !45 + %2763 = extractelement <16 x float> %1511, i64 6, !dbg !45 + %2764 = fmul float %2763, %2266, !dbg !45 + %2765 = extractelement <16 x float> %1511, i64 7, !dbg !45 + %2766 = fmul float %2765, %2288, !dbg !45 + %2767 = extractelement <16 x float> %1511, i64 8, !dbg !45 + %2768 = fmul float %2767, %2310, !dbg !45 + %2769 = extractelement <16 x float> %1511, i64 9, !dbg !45 + %2770 = fmul float %2769, %2332, !dbg !45 + %2771 = extractelement <16 x float> %1511, i64 10, !dbg !45 + %2772 = fmul float %2771, %2354, !dbg !45 + %2773 = extractelement <16 x float> %1511, i64 11, !dbg !45 + %2774 = fmul float %2773, %2376, !dbg !45 + %2775 = extractelement <16 x float> %1511, i64 12, !dbg !45 + %2776 = fmul float %2775, %2398, !dbg !45 + %2777 = extractelement <16 x float> %1511, i64 13, !dbg !45 + %2778 = fmul float %2777, %2420, !dbg !45 + %2779 = extractelement <16 x float> %1511, i64 14, !dbg !45 + %2780 = fmul float %2779, %2442, !dbg !45 + %2781 = extractelement <16 x float> %1511, i64 15, !dbg !45 + %2782 = fmul float %2781, %2464, !dbg !45 + %2783 = fmul float %1512, %2486, !dbg !45 + %2784 = fmul float %1513, %2508, !dbg !45 + %2785 = fmul float %1514, %2530, !dbg !45 + %2786 = fmul float %1515, %2552, !dbg !45 + %2787 = fmul float %1516, %2574, !dbg !45 + %2788 = fmul float %1517, %2596, !dbg !45 + %2789 = fmul float %1518, %2618, !dbg !45 + %2790 = fmul float %1519, %2640, !dbg !45 + %2791 = fmul float %1820, %2662, !dbg !45 + %2792 = fmul float %1821, %2684, !dbg !45 + %2793 = insertelement <2 x i32> poison, i32 %2691, i64 0, !dbg !44 + %2794 = insertelement <2 x i32> %2793, i32 %2702, i64 1, !dbg !44 + %2795 = icmp eq <2 x i32> %2794, zeroinitializer, !dbg !44 + %2796 = insertelement <2 x float> poison, float %2693, i64 0, !dbg !44 + %2797 = insertelement <2 x float> %2796, float %2704, i64 1, !dbg !44 + %2798 = insertelement <2 x float> poison, float %2692, i64 0, !dbg !44 + %2799 = insertelement <2 x float> %2798, float %2703, i64 1, !dbg !44 + %2800 = select <2 x i1> %2795, <2 x float> %2797, <2 x float> %2799, !dbg !44 + %2801 = extractelement <2 x float> %2800, i64 0, !dbg !44 + %2802 = fadd float %2801, 0xC168000FE0000000, !dbg !44 + %2803 = fneg float %2802, !dbg !44 + %2804 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3FF7154760000000, float %2803) #6, !dbg !44 + %2805 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3FF7154760000000, float %2803) #6, !dbg !44 + %.0.i321 = select i1 %.not3.i320, float %2805, float %2804, !dbg !44 + %2806 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2107, float 0x3E54AE0C00000000, float %.0.i321) #6, !dbg !44 + %2807 = tail call float @llvm.nvvm.fma.rn.f(float %2107, float 0x3E54AE0C00000000, float %.0.i321) #6, !dbg !44 + %.01.i323 = select i1 %.not4.i322, float %2807, float %2806, !dbg !44 + %2808 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i323) #6, !dbg !44 + %2809 = extractelement <2 x float> %2800, i64 1, !dbg !44 + %2810 = fadd float %2809, 0xC168000FE0000000, !dbg !44 + %2811 = fneg float %2810, !dbg !44 + %2812 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3FF7154760000000, float %2811) #6, !dbg !44 + %2813 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3FF7154760000000, float %2811) #6, !dbg !44 + %.0.i331 = select i1 %.not3.i330, float %2813, float %2812, !dbg !44 + %2814 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2108, float 0x3E54AE0C00000000, float %.0.i331) #6, !dbg !44 + %2815 = tail call float @llvm.nvvm.fma.rn.f(float %2108, float 0x3E54AE0C00000000, float %.0.i331) #6, !dbg !44 + %.01.i333 = select i1 %.not4.i332, float %2815, float %2814, !dbg !44 + %2816 = bitcast <2 x float> %2800 to <2 x i32>, !dbg !44 + %2817 = shl <2 x i32> %2816, splat (i32 23), !dbg !44 + %2818 = bitcast <2 x i32> %2817 to <2 x float>, !dbg !44 + %2819 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i333) #6, !dbg !44 + %2820 = insertelement <2 x float> poison, float %2808, i64 0, !dbg !44 + %2821 = insertelement <2 x float> %2820, float %2819, i64 1, !dbg !44 + %2822 = fmul <2 x float> %2821, %2818, !dbg !44 + %2823 = fmul <2 x float> %1822, %2822, !dbg !45 + %2824 = insertelement <2 x i32> poison, i32 %2713, i64 0, !dbg !44 + %2825 = insertelement <2 x i32> %2824, i32 %2724, i64 1, !dbg !44 + %2826 = icmp eq <2 x i32> %2825, zeroinitializer, !dbg !44 + %2827 = insertelement <2 x float> poison, float %2715, i64 0, !dbg !44 + %2828 = insertelement <2 x float> %2827, float %2726, i64 1, !dbg !44 + %2829 = insertelement <2 x float> poison, float %2714, i64 0, !dbg !44 + %2830 = insertelement <2 x float> %2829, float %2725, i64 1, !dbg !44 + %2831 = select <2 x i1> %2826, <2 x float> %2828, <2 x float> %2830, !dbg !44 + %2832 = extractelement <2 x float> %2831, i64 0, !dbg !44 + %2833 = fadd float %2832, 0xC168000FE0000000, !dbg !44 + %2834 = fneg float %2833, !dbg !44 + %2835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3FF7154760000000, float %2834) #6, !dbg !44 + %2836 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3FF7154760000000, float %2834) #6, !dbg !44 + %.0.i341 = select i1 %.not3.i340, float %2836, float %2835, !dbg !44 + %2837 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2109, float 0x3E54AE0C00000000, float %.0.i341) #6, !dbg !44 + %2838 = tail call float @llvm.nvvm.fma.rn.f(float %2109, float 0x3E54AE0C00000000, float %.0.i341) #6, !dbg !44 + %.01.i343 = select i1 %.not4.i342, float %2838, float %2837, !dbg !44 + %2839 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i343) #6, !dbg !44 + %2840 = extractelement <2 x float> %2831, i64 1, !dbg !44 + %2841 = fadd float %2840, 0xC168000FE0000000, !dbg !44 + %2842 = fneg float %2841, !dbg !44 + %2843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3FF7154760000000, float %2842) #6, !dbg !44 + %2844 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3FF7154760000000, float %2842) #6, !dbg !44 + %.0.i351 = select i1 %.not3.i350, float %2844, float %2843, !dbg !44 + %2845 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2110, float 0x3E54AE0C00000000, float %.0.i351) #6, !dbg !44 + %2846 = tail call float @llvm.nvvm.fma.rn.f(float %2110, float 0x3E54AE0C00000000, float %.0.i351) #6, !dbg !44 + %.01.i353 = select i1 %.not4.i352, float %2846, float %2845, !dbg !44 + %2847 = bitcast <2 x float> %2831 to <2 x i32>, !dbg !44 + %2848 = shl <2 x i32> %2847, splat (i32 23), !dbg !44 + %2849 = bitcast <2 x i32> %2848 to <2 x float>, !dbg !44 + %2850 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i353) #6, !dbg !44 + %2851 = insertelement <2 x float> poison, float %2839, i64 0, !dbg !44 + %2852 = insertelement <2 x float> %2851, float %2850, i64 1, !dbg !44 + %2853 = fmul <2 x float> %2852, %2849, !dbg !44 + %2854 = fmul <2 x float> %1823, %2853, !dbg !45 + %2855 = insertelement <2 x i32> poison, i32 %2735, i64 0, !dbg !44 + %2856 = insertelement <2 x i32> %2855, i32 %2746, i64 1, !dbg !44 + %2857 = icmp eq <2 x i32> %2856, zeroinitializer, !dbg !44 + %2858 = insertelement <2 x float> poison, float %2737, i64 0, !dbg !44 + %2859 = insertelement <2 x float> %2858, float %2748, i64 1, !dbg !44 + %2860 = insertelement <2 x float> poison, float %2736, i64 0, !dbg !44 + %2861 = insertelement <2 x float> %2860, float %2747, i64 1, !dbg !44 + %2862 = select <2 x i1> %2857, <2 x float> %2859, <2 x float> %2861, !dbg !44 + %2863 = extractelement <2 x float> %2862, i64 0, !dbg !44 + %2864 = fadd float %2863, 0xC168000FE0000000, !dbg !44 + %2865 = fneg float %2864, !dbg !44 + %2866 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3FF7154760000000, float %2865) #6, !dbg !44 + %2867 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3FF7154760000000, float %2865) #6, !dbg !44 + %.0.i361 = select i1 %.not3.i360, float %2867, float %2866, !dbg !44 + %2868 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2111, float 0x3E54AE0C00000000, float %.0.i361) #6, !dbg !44 + %2869 = tail call float @llvm.nvvm.fma.rn.f(float %2111, float 0x3E54AE0C00000000, float %.0.i361) #6, !dbg !44 + %.01.i363 = select i1 %.not4.i362, float %2869, float %2868, !dbg !44 + %2870 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i363) #6, !dbg !44 + %2871 = extractelement <2 x float> %2862, i64 1, !dbg !44 + %2872 = fadd float %2871, 0xC168000FE0000000, !dbg !44 + %2873 = fneg float %2872, !dbg !44 + %2874 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3FF7154760000000, float %2873) #6, !dbg !44 + %2875 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3FF7154760000000, float %2873) #6, !dbg !44 + %.0.i371 = select i1 %.not3.i370, float %2875, float %2874, !dbg !44 + %2876 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %2112, float 0x3E54AE0C00000000, float %.0.i371) #6, !dbg !44 + %2877 = tail call float @llvm.nvvm.fma.rn.f(float %2112, float 0x3E54AE0C00000000, float %.0.i371) #6, !dbg !44 + %.01.i373 = select i1 %.not4.i372, float %2877, float %2876, !dbg !44 + %2878 = bitcast <2 x float> %2862 to <2 x i32>, !dbg !44 + %2879 = shl <2 x i32> %2878, splat (i32 23), !dbg !44 + %2880 = bitcast <2 x i32> %2879 to <2 x float>, !dbg !44 + %2881 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i373) #6, !dbg !44 + %2882 = insertelement <2 x float> poison, float %2870, i64 0, !dbg !44 + %2883 = insertelement <2 x float> %2882, float %2881, i64 1, !dbg !44 + %2884 = fmul <2 x float> %2883, %2880, !dbg !44 + %2885 = fmul <2 x float> %1824, %2884, !dbg !45 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2886 = fadd float %2752, %2754, !dbg !49 + %2887 = fadd float %2886, %2756, !dbg !49 + %2888 = fadd float %2887, %2758, !dbg !49 + %2889 = fadd float %2888, %2760, !dbg !49 + %2890 = fadd float %2889, %2762, !dbg !49 + %2891 = fadd float %2890, %2764, !dbg !49 + %2892 = fadd float %2891, %2766, !dbg !49 + %2893 = fadd float %2892, %2768, !dbg !49 + %2894 = fadd float %2893, %2770, !dbg !49 + %2895 = fadd float %2894, %2772, !dbg !49 + %2896 = fadd float %2895, %2774, !dbg !49 + %2897 = fadd float %2896, %2776, !dbg !49 + %2898 = fadd float %2897, %2778, !dbg !49 + %2899 = fadd float %2898, %2780, !dbg !49 + %2900 = fadd float %2899, %2782, !dbg !49 + %2901 = fadd float %2900, %2783, !dbg !49 + %2902 = fadd float %2901, %2784, !dbg !49 + %2903 = fadd float %2902, %2785, !dbg !49 + %2904 = fadd float %2903, %2786, !dbg !49 + %2905 = fadd float %2904, %2787, !dbg !49 + %2906 = fadd float %2905, %2788, !dbg !49 + %2907 = fadd float %2906, %2789, !dbg !49 + %2908 = fadd float %2907, %2790, !dbg !49 + %2909 = fadd float %2908, %2791, !dbg !49 + %2910 = fadd float %2909, %2792, !dbg !49 + %2911 = extractelement <2 x float> %2823, i64 0, !dbg !49 + %2912 = fadd float %2910, %2911, !dbg !49 + %2913 = extractelement <2 x float> %2823, i64 1, !dbg !49 + %2914 = fadd float %2912, %2913, !dbg !49 + %2915 = extractelement <2 x float> %2854, i64 0, !dbg !49 + %2916 = fadd float %2914, %2915, !dbg !49 + %2917 = extractelement <2 x float> %2854, i64 1, !dbg !49 + %2918 = fadd float %2916, %2917, !dbg !49 + %2919 = extractelement <2 x float> %2885, i64 0, !dbg !49 + %2920 = fadd float %2918, %2919, !dbg !49 + %2921 = extractelement <2 x float> %2885, i64 1, !dbg !49 + %2922 = fadd float %2920, %2921, !dbg !49 + %2923 = bitcast float %2922 to i32, !dbg !46 + %2924 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2923, i32 16, i32 31), !dbg !46 + %2925 = bitcast i32 %2924 to float, !dbg !46 + %2926 = fadd float %2922, %2925, !dbg !49 + %2927 = bitcast float %2926 to i32, !dbg !46 + %2928 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2927, i32 8, i32 31), !dbg !46 + %2929 = bitcast i32 %2928 to float, !dbg !46 + %2930 = fadd float %2926, %2929, !dbg !49 + %2931 = bitcast float %2930 to i32, !dbg !46 + %2932 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2931, i32 4, i32 31), !dbg !46 + %2933 = bitcast i32 %2932 to float, !dbg !46 + %2934 = fadd float %2930, %2933, !dbg !49 + %2935 = bitcast float %2934 to i32, !dbg !46 + %2936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2935, i32 2, i32 31), !dbg !46 + %2937 = bitcast i32 %2936 to float, !dbg !46 + %2938 = fadd float %2934, %2937, !dbg !49 + %2939 = bitcast float %2938 to i32, !dbg !46 + %2940 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2939, i32 1, i32 31), !dbg !46 + %2941 = bitcast i32 %2940 to float, !dbg !46 + %2942 = fadd float %2938, %2941, !dbg !49 + %2943 = bitcast float %2942 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2011, <1 x i32> %2943, i1 %2010) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2944 = tail call i32 asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %2015, i1 %2014) #6, !dbg !46 + %2945 = bitcast i32 %2944 to float, !dbg !46 + %2946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2944, i32 8, i32 31), !dbg !46 + %2947 = bitcast i32 %2946 to float, !dbg !46 + %2948 = fadd float %2945, %2947, !dbg !49 + %2949 = bitcast float %2948 to i32, !dbg !46 + %2950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2949, i32 4, i32 31), !dbg !46 + %2951 = bitcast i32 %2950 to float, !dbg !46 + %2952 = fadd float %2948, %2951, !dbg !49 + %2953 = bitcast float %2952 to i32, !dbg !46 + %2954 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2953, i32 2, i32 31), !dbg !46 + %2955 = bitcast i32 %2954 to float, !dbg !46 + %2956 = fadd float %2952, %2955, !dbg !49 + %2957 = bitcast float %2956 to i32, !dbg !46 + %2958 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2957, i32 1, i32 31), !dbg !46 + %2959 = bitcast i32 %2958 to float, !dbg !46 + %2960 = fadd float %2956, %2959, !dbg !49 + %2961 = bitcast float %2960 to <1 x i32>, !dbg !46 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %2015, <1 x i32> %2961, i1 %2044) #6, !dbg !46 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !46 + %2962 = load float, ptr addrspace(3) @global_smem, align 16, !dbg !46 + %2963 = shl nuw nsw i32 %9, 4 + %2964 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2963 + %2965 = xor i32 %2963, 8256 + %2966 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2965 + %2967 = shl nuw nsw i32 %8, 3 + %2968 = and i32 %2967, 4080 + %2969 = and i32 %8, 1 + %2970 = icmp eq i32 %2969, 0 + %2971 = select i1 %2970, i32 0, i32 8256 + %2972 = xor i32 %2971, %2968 + %2973 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %2972 + %2974 = getelementptr inbounds nuw i8, ptr addrspace(3) %2973, i32 4096 + %2975 = zext nneg i32 %1844 to i64, !dbg !50 + br label %2976, !dbg !50 + +2976: ; preds = %1825, %2976 + %2977 = phi i1 [ true, %1825 ], [ false, %2976 ] + %indvars.iv1398 = phi i64 [ 0, %1825 ], [ 16384, %2976 ] + %2978 = or disjoint i64 %indvars.iv1398, %15, !dbg !51 + %2979 = or disjoint i64 %indvars.iv1398, %2975, !dbg !51 + %2980 = or disjoint i64 %2979, 14336, !dbg !51 + %2981 = icmp samesign ult i64 %2978, 32000, !dbg !52 + %2982 = icmp samesign ult i64 %2980, 32000, !dbg !52 + %2983 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2984 = or disjoint i32 %10, %2983, !dbg !53 + %2985 = add i32 %2984, %14, !dbg !53 + %2986 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2987 = or disjoint i32 %11, %2986, !dbg !53 + %2988 = add i32 %2987, %14, !dbg !53 + %2989 = trunc nuw nsw i64 %indvars.iv1398 to i32, !dbg !53 + %2990 = or disjoint i32 %12, %2989, !dbg !53 + %2991 = add i32 %2990, %14, !dbg !53 + %2992 = trunc nuw nsw i64 %2978 to i32, !dbg !53 + %2993 = add i32 %14, %2992, !dbg !53 + %2994 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %2995 = add i32 %14, %2994, !dbg !53 + %2996 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %2997 = or disjoint i32 %2996, 2048, !dbg !53 + %2998 = add i32 %2997, %14, !dbg !53 + %2999 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3000 = or disjoint i32 %2999, 4096, !dbg !53 + %3001 = add i32 %3000, %14, !dbg !53 + %3002 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3003 = or disjoint i32 %3002, 6144, !dbg !53 + %3004 = add i32 %3003, %14, !dbg !53 + %3005 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3006 = or disjoint i32 %3005, 8192, !dbg !53 + %3007 = add i32 %3006, %14, !dbg !53 + %3008 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3009 = or disjoint i32 %3008, 10240, !dbg !53 + %3010 = add i32 %3009, %14, !dbg !53 + %3011 = trunc nuw nsw i64 %2979 to i32, !dbg !53 + %3012 = or disjoint i32 %3011, 12288, !dbg !53 + %3013 = add i32 %3012, %14, !dbg !53 + %3014 = trunc nuw nsw i64 %2980 to i32, !dbg !53 + %3015 = add i32 %14, %3014, !dbg !53 + %3016 = sext i32 %2985 to i64, !dbg !54 + %3017 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3016, !dbg !54 + %3018 = sext i32 %2988 to i64, !dbg !54 + %3019 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3018, !dbg !54 + %3020 = sext i32 %2991 to i64, !dbg !54 + %3021 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3020, !dbg !54 + %3022 = sext i32 %2993 to i64, !dbg !54 + %3023 = getelementptr bfloat, ptr addrspace(1) %0, i64 %3022, !dbg !54 + %3024 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3025 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3017, i64 %3024, i1 true) #6, !dbg !55 + %3026 = extractvalue { i32, i32, i32, i32 } %3025, 0, !dbg !55 + %3027 = bitcast i32 %3026 to <2 x bfloat>, !dbg !55 + %3028 = extractvalue { i32, i32, i32, i32 } %3025, 1, !dbg !55 + %3029 = bitcast i32 %3028 to <2 x bfloat>, !dbg !55 + %3030 = extractvalue { i32, i32, i32, i32 } %3025, 2, !dbg !55 + %3031 = bitcast i32 %3030 to <2 x bfloat>, !dbg !55 + %3032 = extractvalue { i32, i32, i32, i32 } %3025, 3, !dbg !55 + %3033 = bitcast i32 %3032 to <2 x bfloat>, !dbg !55 + %3034 = extractelement <2 x bfloat> %3027, i64 0, !dbg !55 + %3035 = extractelement <2 x bfloat> %3027, i64 1, !dbg !55 + %3036 = extractelement <2 x bfloat> %3029, i64 0, !dbg !55 + %3037 = extractelement <2 x bfloat> %3029, i64 1, !dbg !55 + %3038 = extractelement <2 x bfloat> %3031, i64 0, !dbg !55 + %3039 = extractelement <2 x bfloat> %3031, i64 1, !dbg !55 + %3040 = extractelement <2 x bfloat> %3033, i64 0, !dbg !55 + %3041 = extractelement <2 x bfloat> %3033, i64 1, !dbg !55 + %3042 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3043 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3019, i64 %3042, i1 true) #6, !dbg !55 + %3044 = extractvalue { i32, i32, i32, i32 } %3043, 0, !dbg !55 + %3045 = bitcast i32 %3044 to <2 x bfloat>, !dbg !55 + %3046 = extractvalue { i32, i32, i32, i32 } %3043, 1, !dbg !55 + %3047 = bitcast i32 %3046 to <2 x bfloat>, !dbg !55 + %3048 = extractvalue { i32, i32, i32, i32 } %3043, 2, !dbg !55 + %3049 = bitcast i32 %3048 to <2 x bfloat>, !dbg !55 + %3050 = extractvalue { i32, i32, i32, i32 } %3043, 3, !dbg !55 + %3051 = bitcast i32 %3050 to <2 x bfloat>, !dbg !55 + %3052 = extractelement <2 x bfloat> %3045, i64 0, !dbg !55 + %3053 = extractelement <2 x bfloat> %3045, i64 1, !dbg !55 + %3054 = extractelement <2 x bfloat> %3047, i64 0, !dbg !55 + %3055 = extractelement <2 x bfloat> %3047, i64 1, !dbg !55 + %3056 = extractelement <2 x bfloat> %3049, i64 0, !dbg !55 + %3057 = extractelement <2 x bfloat> %3049, i64 1, !dbg !55 + %3058 = extractelement <2 x bfloat> %3051, i64 0, !dbg !55 + %3059 = extractelement <2 x bfloat> %3051, i64 1, !dbg !55 + %3060 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3061 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3021, i64 %3060, i1 true) #6, !dbg !55 + %3062 = extractvalue { i32, i32, i32, i32 } %3061, 0, !dbg !55 + %3063 = bitcast i32 %3062 to <2 x bfloat>, !dbg !55 + %3064 = extractvalue { i32, i32, i32, i32 } %3061, 1, !dbg !55 + %3065 = bitcast i32 %3064 to <2 x bfloat>, !dbg !55 + %3066 = extractvalue { i32, i32, i32, i32 } %3061, 2, !dbg !55 + %3067 = bitcast i32 %3066 to <2 x bfloat>, !dbg !55 + %3068 = extractvalue { i32, i32, i32, i32 } %3061, 3, !dbg !55 + %3069 = bitcast i32 %3068 to <2 x bfloat>, !dbg !55 + %3070 = extractelement <2 x bfloat> %3063, i64 0, !dbg !55 + %3071 = extractelement <2 x bfloat> %3063, i64 1, !dbg !55 + %3072 = extractelement <2 x bfloat> %3065, i64 0, !dbg !55 + %3073 = extractelement <2 x bfloat> %3065, i64 1, !dbg !55 + %3074 = extractelement <2 x bfloat> %3067, i64 0, !dbg !55 + %3075 = extractelement <2 x bfloat> %3067, i64 1, !dbg !55 + %3076 = extractelement <2 x bfloat> %3069, i64 0, !dbg !55 + %3077 = extractelement <2 x bfloat> %3069, i64 1, !dbg !55 + %3078 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #6, !dbg !55 + %3079 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$10 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ], $9;", "=r,=r,=r,=r,r,r,r,r,l,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %3023, i64 %3078, i1 %2981) #6, !dbg !55 + %3080 = extractvalue { i32, i32, i32, i32 } %3079, 0, !dbg !55 + %3081 = bitcast i32 %3080 to <2 x bfloat>, !dbg !55 + %3082 = extractvalue { i32, i32, i32, i32 } %3079, 1, !dbg !55 + %3083 = bitcast i32 %3082 to <2 x bfloat>, !dbg !55 + %3084 = extractvalue { i32, i32, i32, i32 } %3079, 2, !dbg !55 + %3085 = bitcast i32 %3084 to <2 x bfloat>, !dbg !55 + %3086 = extractvalue { i32, i32, i32, i32 } %3079, 3, !dbg !55 + %3087 = bitcast i32 %3086 to <2 x bfloat>, !dbg !55 + %3088 = extractelement <2 x bfloat> %3081, i64 0, !dbg !55 + %3089 = extractelement <2 x bfloat> %3081, i64 1, !dbg !55 + %3090 = extractelement <2 x bfloat> %3083, i64 0, !dbg !55 + %3091 = extractelement <2 x bfloat> %3083, i64 1, !dbg !55 + %3092 = extractelement <2 x bfloat> %3085, i64 0, !dbg !55 + %3093 = extractelement <2 x bfloat> %3085, i64 1, !dbg !55 + %3094 = extractelement <2 x bfloat> %3087, i64 0, !dbg !55 + %3095 = extractelement <2 x bfloat> %3087, i64 1, !dbg !55 + %3096 = fpext bfloat %3034 to float, !dbg !56 + %3097 = fpext bfloat %3035 to float, !dbg !56 + %3098 = fpext bfloat %3036 to float, !dbg !56 + %3099 = fpext bfloat %3037 to float, !dbg !56 + %3100 = fpext bfloat %3038 to float, !dbg !56 + %3101 = fpext bfloat %3039 to float, !dbg !56 + %3102 = fpext bfloat %3040 to float, !dbg !56 + %3103 = fpext bfloat %3041 to float, !dbg !56 + %3104 = fpext bfloat %3052 to float, !dbg !56 + %3105 = fpext bfloat %3053 to float, !dbg !56 + %3106 = fpext bfloat %3054 to float, !dbg !56 + %3107 = fpext bfloat %3055 to float, !dbg !56 + %3108 = fpext bfloat %3056 to float, !dbg !56 + %3109 = fpext bfloat %3057 to float, !dbg !56 + %3110 = fpext bfloat %3058 to float, !dbg !56 + %3111 = fpext bfloat %3059 to float, !dbg !56 + %3112 = fpext bfloat %3070 to float, !dbg !56 + %3113 = fpext bfloat %3071 to float, !dbg !56 + %3114 = fpext bfloat %3072 to float, !dbg !56 + %3115 = fpext bfloat %3073 to float, !dbg !56 + %3116 = fpext bfloat %3074 to float, !dbg !56 + %3117 = fpext bfloat %3075 to float, !dbg !56 + %3118 = fpext bfloat %3076 to float, !dbg !56 + %3119 = fpext bfloat %3077 to float, !dbg !56 + %3120 = fpext bfloat %3088 to float, !dbg !56 + %3121 = fpext bfloat %3089 to float, !dbg !56 + %3122 = fpext bfloat %3090 to float, !dbg !56 + %3123 = fpext bfloat %3091 to float, !dbg !56 + %3124 = fpext bfloat %3092 to float, !dbg !56 + %3125 = fpext bfloat %3093 to float, !dbg !56 + %3126 = fpext bfloat %3094 to float, !dbg !56 + %3127 = fpext bfloat %3095 to float, !dbg !56 + %3128 = fsub float %3096, %2047, !dbg !57 + %3129 = fsub float %3097, %2047, !dbg !57 + %3130 = fsub float %3098, %2047, !dbg !57 + %3131 = fsub float %3099, %2047, !dbg !57 + %3132 = fsub float %3100, %2047, !dbg !57 + %3133 = fsub float %3101, %2047, !dbg !57 + %3134 = fsub float %3102, %2047, !dbg !57 + %3135 = fsub float %3103, %2047, !dbg !57 + %3136 = fsub float %3104, %2047, !dbg !57 + %3137 = fsub float %3105, %2047, !dbg !57 + %3138 = fsub float %3106, %2047, !dbg !57 + %3139 = fsub float %3107, %2047, !dbg !57 + %3140 = fsub float %3108, %2047, !dbg !57 + %3141 = fsub float %3109, %2047, !dbg !57 + %3142 = fsub float %3110, %2047, !dbg !57 + %3143 = fsub float %3111, %2047, !dbg !57 + %3144 = fsub float %3112, %2047, !dbg !57 + %3145 = fsub float %3113, %2047, !dbg !57 + %3146 = fsub float %3114, %2047, !dbg !57 + %3147 = fsub float %3115, %2047, !dbg !57 + %3148 = fsub float %3116, %2047, !dbg !57 + %3149 = fsub float %3117, %2047, !dbg !57 + %3150 = fsub float %3118, %2047, !dbg !57 + %3151 = fsub float %3119, %2047, !dbg !57 + %3152 = fsub float %3120, %2047, !dbg !57 + %3153 = fsub float %3121, %2047, !dbg !57 + %3154 = fsub float %3122, %2047, !dbg !57 + %3155 = fsub float %3123, %2047, !dbg !57 + %3156 = fsub float %3124, %2047, !dbg !57 + %3157 = fsub float %3125, %2047, !dbg !57 + %3158 = fsub float %3126, %2047, !dbg !57 + %3159 = fsub float %3127, %2047, !dbg !57 + %3160 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i374 = icmp eq i32 %3160, 0, !dbg !58 + %3161 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3162 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i375 = select i1 %.not.i374, float %3162, float %3161, !dbg !58 + %3163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i376 = icmp eq i32 %3163, 0, !dbg !58 + %3164 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i375) #6, !dbg !58 + %3165 = tail call float @llvm.nvvm.saturate.f(float %.02.i375) #6, !dbg !58 + %.03.i377 = select i1 %.not1.i376, float %3165, float %3164, !dbg !58 + %3166 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i378 = icmp eq i32 %3166, 0, !dbg !58 + %3167 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i377, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3168 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i377, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i379 = select i1 %.not2.i378, float %3168, float %3167, !dbg !58 + %3169 = fadd float %.04.i379, 0xC168000FE0000000, !dbg !58 + %3170 = fneg float %3169, !dbg !58 + %3171 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i380 = icmp eq i32 %3171, 0, !dbg !58 + %3172 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3FF7154760000000, float %3170) #6, !dbg !58 + %3173 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3FF7154760000000, float %3170) #6, !dbg !58 + %.0.i381 = select i1 %.not3.i380, float %3173, float %3172, !dbg !58 + %3174 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i382 = icmp eq i32 %3174, 0, !dbg !58 + %3175 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3128, float 0x3E54AE0C00000000, float %.0.i381) #6, !dbg !58 + %3176 = tail call float @llvm.nvvm.fma.rn.f(float %3128, float 0x3E54AE0C00000000, float %.0.i381) #6, !dbg !58 + %.01.i383 = select i1 %.not4.i382, float %3176, float %3175, !dbg !58 + %3177 = bitcast float %.04.i379 to i32, !dbg !58 + %3178 = shl i32 %3177, 23, !dbg !58 + %3179 = bitcast i32 %3178 to float, !dbg !58 + %3180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i383) #6, !dbg !58 + %3181 = fmul float %3180, %3179, !dbg !58 + %3182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i384 = icmp eq i32 %3182, 0, !dbg !58 + %3183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3184 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i385 = select i1 %.not.i384, float %3184, float %3183, !dbg !58 + %3185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i386 = icmp eq i32 %3185, 0, !dbg !58 + %3186 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i385) #6, !dbg !58 + %3187 = tail call float @llvm.nvvm.saturate.f(float %.02.i385) #6, !dbg !58 + %.03.i387 = select i1 %.not1.i386, float %3187, float %3186, !dbg !58 + %3188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i388 = icmp eq i32 %3188, 0, !dbg !58 + %3189 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i387, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3190 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i387, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i389 = select i1 %.not2.i388, float %3190, float %3189, !dbg !58 + %3191 = fadd float %.04.i389, 0xC168000FE0000000, !dbg !58 + %3192 = fneg float %3191, !dbg !58 + %3193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i390 = icmp eq i32 %3193, 0, !dbg !58 + %3194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3FF7154760000000, float %3192) #6, !dbg !58 + %3195 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3FF7154760000000, float %3192) #6, !dbg !58 + %.0.i391 = select i1 %.not3.i390, float %3195, float %3194, !dbg !58 + %3196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i392 = icmp eq i32 %3196, 0, !dbg !58 + %3197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3129, float 0x3E54AE0C00000000, float %.0.i391) #6, !dbg !58 + %3198 = tail call float @llvm.nvvm.fma.rn.f(float %3129, float 0x3E54AE0C00000000, float %.0.i391) #6, !dbg !58 + %.01.i393 = select i1 %.not4.i392, float %3198, float %3197, !dbg !58 + %3199 = bitcast float %.04.i389 to i32, !dbg !58 + %3200 = shl i32 %3199, 23, !dbg !58 + %3201 = bitcast i32 %3200 to float, !dbg !58 + %3202 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i393) #6, !dbg !58 + %3203 = fmul float %3202, %3201, !dbg !58 + %3204 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i394 = icmp eq i32 %3204, 0, !dbg !58 + %3205 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3206 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i395 = select i1 %.not.i394, float %3206, float %3205, !dbg !58 + %3207 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i396 = icmp eq i32 %3207, 0, !dbg !58 + %3208 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i395) #6, !dbg !58 + %3209 = tail call float @llvm.nvvm.saturate.f(float %.02.i395) #6, !dbg !58 + %.03.i397 = select i1 %.not1.i396, float %3209, float %3208, !dbg !58 + %3210 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i398 = icmp eq i32 %3210, 0, !dbg !58 + %3211 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i397, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3212 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i397, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i399 = select i1 %.not2.i398, float %3212, float %3211, !dbg !58 + %3213 = fadd float %.04.i399, 0xC168000FE0000000, !dbg !58 + %3214 = fneg float %3213, !dbg !58 + %3215 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i400 = icmp eq i32 %3215, 0, !dbg !58 + %3216 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3FF7154760000000, float %3214) #6, !dbg !58 + %3217 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3FF7154760000000, float %3214) #6, !dbg !58 + %.0.i401 = select i1 %.not3.i400, float %3217, float %3216, !dbg !58 + %3218 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i402 = icmp eq i32 %3218, 0, !dbg !58 + %3219 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3130, float 0x3E54AE0C00000000, float %.0.i401) #6, !dbg !58 + %3220 = tail call float @llvm.nvvm.fma.rn.f(float %3130, float 0x3E54AE0C00000000, float %.0.i401) #6, !dbg !58 + %.01.i403 = select i1 %.not4.i402, float %3220, float %3219, !dbg !58 + %3221 = bitcast float %.04.i399 to i32, !dbg !58 + %3222 = shl i32 %3221, 23, !dbg !58 + %3223 = bitcast i32 %3222 to float, !dbg !58 + %3224 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i403) #6, !dbg !58 + %3225 = fmul float %3224, %3223, !dbg !58 + %3226 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i404 = icmp eq i32 %3226, 0, !dbg !58 + %3227 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3228 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i405 = select i1 %.not.i404, float %3228, float %3227, !dbg !58 + %3229 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i406 = icmp eq i32 %3229, 0, !dbg !58 + %3230 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i405) #6, !dbg !58 + %3231 = tail call float @llvm.nvvm.saturate.f(float %.02.i405) #6, !dbg !58 + %.03.i407 = select i1 %.not1.i406, float %3231, float %3230, !dbg !58 + %3232 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i408 = icmp eq i32 %3232, 0, !dbg !58 + %3233 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i407, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3234 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i407, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i409 = select i1 %.not2.i408, float %3234, float %3233, !dbg !58 + %3235 = fadd float %.04.i409, 0xC168000FE0000000, !dbg !58 + %3236 = fneg float %3235, !dbg !58 + %3237 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i410 = icmp eq i32 %3237, 0, !dbg !58 + %3238 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3FF7154760000000, float %3236) #6, !dbg !58 + %3239 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3FF7154760000000, float %3236) #6, !dbg !58 + %.0.i411 = select i1 %.not3.i410, float %3239, float %3238, !dbg !58 + %3240 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i412 = icmp eq i32 %3240, 0, !dbg !58 + %3241 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3131, float 0x3E54AE0C00000000, float %.0.i411) #6, !dbg !58 + %3242 = tail call float @llvm.nvvm.fma.rn.f(float %3131, float 0x3E54AE0C00000000, float %.0.i411) #6, !dbg !58 + %.01.i413 = select i1 %.not4.i412, float %3242, float %3241, !dbg !58 + %3243 = bitcast float %.04.i409 to i32, !dbg !58 + %3244 = shl i32 %3243, 23, !dbg !58 + %3245 = bitcast i32 %3244 to float, !dbg !58 + %3246 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i413) #6, !dbg !58 + %3247 = fmul float %3246, %3245, !dbg !58 + %3248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i414 = icmp eq i32 %3248, 0, !dbg !58 + %3249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3250 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i415 = select i1 %.not.i414, float %3250, float %3249, !dbg !58 + %3251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i416 = icmp eq i32 %3251, 0, !dbg !58 + %3252 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i415) #6, !dbg !58 + %3253 = tail call float @llvm.nvvm.saturate.f(float %.02.i415) #6, !dbg !58 + %.03.i417 = select i1 %.not1.i416, float %3253, float %3252, !dbg !58 + %3254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i418 = icmp eq i32 %3254, 0, !dbg !58 + %3255 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i417, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3256 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i417, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i419 = select i1 %.not2.i418, float %3256, float %3255, !dbg !58 + %3257 = fadd float %.04.i419, 0xC168000FE0000000, !dbg !58 + %3258 = fneg float %3257, !dbg !58 + %3259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i420 = icmp eq i32 %3259, 0, !dbg !58 + %3260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3FF7154760000000, float %3258) #6, !dbg !58 + %3261 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3FF7154760000000, float %3258) #6, !dbg !58 + %.0.i421 = select i1 %.not3.i420, float %3261, float %3260, !dbg !58 + %3262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i422 = icmp eq i32 %3262, 0, !dbg !58 + %3263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3132, float 0x3E54AE0C00000000, float %.0.i421) #6, !dbg !58 + %3264 = tail call float @llvm.nvvm.fma.rn.f(float %3132, float 0x3E54AE0C00000000, float %.0.i421) #6, !dbg !58 + %.01.i423 = select i1 %.not4.i422, float %3264, float %3263, !dbg !58 + %3265 = bitcast float %.04.i419 to i32, !dbg !58 + %3266 = shl i32 %3265, 23, !dbg !58 + %3267 = bitcast i32 %3266 to float, !dbg !58 + %3268 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i423) #6, !dbg !58 + %3269 = fmul float %3268, %3267, !dbg !58 + %3270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i424 = icmp eq i32 %3270, 0, !dbg !58 + %3271 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3272 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i425 = select i1 %.not.i424, float %3272, float %3271, !dbg !58 + %3273 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i426 = icmp eq i32 %3273, 0, !dbg !58 + %3274 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i425) #6, !dbg !58 + %3275 = tail call float @llvm.nvvm.saturate.f(float %.02.i425) #6, !dbg !58 + %.03.i427 = select i1 %.not1.i426, float %3275, float %3274, !dbg !58 + %3276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i428 = icmp eq i32 %3276, 0, !dbg !58 + %3277 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i427, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3278 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i427, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i429 = select i1 %.not2.i428, float %3278, float %3277, !dbg !58 + %3279 = fadd float %.04.i429, 0xC168000FE0000000, !dbg !58 + %3280 = fneg float %3279, !dbg !58 + %3281 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i430 = icmp eq i32 %3281, 0, !dbg !58 + %3282 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3FF7154760000000, float %3280) #6, !dbg !58 + %3283 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3FF7154760000000, float %3280) #6, !dbg !58 + %.0.i431 = select i1 %.not3.i430, float %3283, float %3282, !dbg !58 + %3284 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i432 = icmp eq i32 %3284, 0, !dbg !58 + %3285 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3133, float 0x3E54AE0C00000000, float %.0.i431) #6, !dbg !58 + %3286 = tail call float @llvm.nvvm.fma.rn.f(float %3133, float 0x3E54AE0C00000000, float %.0.i431) #6, !dbg !58 + %.01.i433 = select i1 %.not4.i432, float %3286, float %3285, !dbg !58 + %3287 = bitcast float %.04.i429 to i32, !dbg !58 + %3288 = shl i32 %3287, 23, !dbg !58 + %3289 = bitcast i32 %3288 to float, !dbg !58 + %3290 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i433) #6, !dbg !58 + %3291 = fmul float %3290, %3289, !dbg !58 + %3292 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i434 = icmp eq i32 %3292, 0, !dbg !58 + %3293 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3294 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i435 = select i1 %.not.i434, float %3294, float %3293, !dbg !58 + %3295 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i436 = icmp eq i32 %3295, 0, !dbg !58 + %3296 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i435) #6, !dbg !58 + %3297 = tail call float @llvm.nvvm.saturate.f(float %.02.i435) #6, !dbg !58 + %.03.i437 = select i1 %.not1.i436, float %3297, float %3296, !dbg !58 + %3298 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i438 = icmp eq i32 %3298, 0, !dbg !58 + %3299 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i437, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3300 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i437, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i439 = select i1 %.not2.i438, float %3300, float %3299, !dbg !58 + %3301 = fadd float %.04.i439, 0xC168000FE0000000, !dbg !58 + %3302 = fneg float %3301, !dbg !58 + %3303 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i440 = icmp eq i32 %3303, 0, !dbg !58 + %3304 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3FF7154760000000, float %3302) #6, !dbg !58 + %3305 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3FF7154760000000, float %3302) #6, !dbg !58 + %.0.i441 = select i1 %.not3.i440, float %3305, float %3304, !dbg !58 + %3306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i442 = icmp eq i32 %3306, 0, !dbg !58 + %3307 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3134, float 0x3E54AE0C00000000, float %.0.i441) #6, !dbg !58 + %3308 = tail call float @llvm.nvvm.fma.rn.f(float %3134, float 0x3E54AE0C00000000, float %.0.i441) #6, !dbg !58 + %.01.i443 = select i1 %.not4.i442, float %3308, float %3307, !dbg !58 + %3309 = bitcast float %.04.i439 to i32, !dbg !58 + %3310 = shl i32 %3309, 23, !dbg !58 + %3311 = bitcast i32 %3310 to float, !dbg !58 + %3312 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i443) #6, !dbg !58 + %3313 = fmul float %3312, %3311, !dbg !58 + %3314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i444 = icmp eq i32 %3314, 0, !dbg !58 + %3315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3316 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i445 = select i1 %.not.i444, float %3316, float %3315, !dbg !58 + %3317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i446 = icmp eq i32 %3317, 0, !dbg !58 + %3318 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i445) #6, !dbg !58 + %3319 = tail call float @llvm.nvvm.saturate.f(float %.02.i445) #6, !dbg !58 + %.03.i447 = select i1 %.not1.i446, float %3319, float %3318, !dbg !58 + %3320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i448 = icmp eq i32 %3320, 0, !dbg !58 + %3321 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i447, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3322 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i447, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i449 = select i1 %.not2.i448, float %3322, float %3321, !dbg !58 + %3323 = fadd float %.04.i449, 0xC168000FE0000000, !dbg !58 + %3324 = fneg float %3323, !dbg !58 + %3325 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i450 = icmp eq i32 %3325, 0, !dbg !58 + %3326 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3FF7154760000000, float %3324) #6, !dbg !58 + %3327 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3FF7154760000000, float %3324) #6, !dbg !58 + %.0.i451 = select i1 %.not3.i450, float %3327, float %3326, !dbg !58 + %3328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i452 = icmp eq i32 %3328, 0, !dbg !58 + %3329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3135, float 0x3E54AE0C00000000, float %.0.i451) #6, !dbg !58 + %3330 = tail call float @llvm.nvvm.fma.rn.f(float %3135, float 0x3E54AE0C00000000, float %.0.i451) #6, !dbg !58 + %.01.i453 = select i1 %.not4.i452, float %3330, float %3329, !dbg !58 + %3331 = bitcast float %.04.i449 to i32, !dbg !58 + %3332 = shl i32 %3331, 23, !dbg !58 + %3333 = bitcast i32 %3332 to float, !dbg !58 + %3334 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i453) #6, !dbg !58 + %3335 = fmul float %3334, %3333, !dbg !58 + %3336 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i454 = icmp eq i32 %3336, 0, !dbg !58 + %3337 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3338 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i455 = select i1 %.not.i454, float %3338, float %3337, !dbg !58 + %3339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i456 = icmp eq i32 %3339, 0, !dbg !58 + %3340 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i455) #6, !dbg !58 + %3341 = tail call float @llvm.nvvm.saturate.f(float %.02.i455) #6, !dbg !58 + %.03.i457 = select i1 %.not1.i456, float %3341, float %3340, !dbg !58 + %3342 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i458 = icmp eq i32 %3342, 0, !dbg !58 + %3343 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i457, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3344 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i457, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i459 = select i1 %.not2.i458, float %3344, float %3343, !dbg !58 + %3345 = fadd float %.04.i459, 0xC168000FE0000000, !dbg !58 + %3346 = fneg float %3345, !dbg !58 + %3347 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i460 = icmp eq i32 %3347, 0, !dbg !58 + %3348 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3FF7154760000000, float %3346) #6, !dbg !58 + %3349 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3FF7154760000000, float %3346) #6, !dbg !58 + %.0.i461 = select i1 %.not3.i460, float %3349, float %3348, !dbg !58 + %3350 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i462 = icmp eq i32 %3350, 0, !dbg !58 + %3351 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3136, float 0x3E54AE0C00000000, float %.0.i461) #6, !dbg !58 + %3352 = tail call float @llvm.nvvm.fma.rn.f(float %3136, float 0x3E54AE0C00000000, float %.0.i461) #6, !dbg !58 + %.01.i463 = select i1 %.not4.i462, float %3352, float %3351, !dbg !58 + %3353 = bitcast float %.04.i459 to i32, !dbg !58 + %3354 = shl i32 %3353, 23, !dbg !58 + %3355 = bitcast i32 %3354 to float, !dbg !58 + %3356 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i463) #6, !dbg !58 + %3357 = fmul float %3356, %3355, !dbg !58 + %3358 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i464 = icmp eq i32 %3358, 0, !dbg !58 + %3359 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3360 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i465 = select i1 %.not.i464, float %3360, float %3359, !dbg !58 + %3361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i466 = icmp eq i32 %3361, 0, !dbg !58 + %3362 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i465) #6, !dbg !58 + %3363 = tail call float @llvm.nvvm.saturate.f(float %.02.i465) #6, !dbg !58 + %.03.i467 = select i1 %.not1.i466, float %3363, float %3362, !dbg !58 + %3364 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i468 = icmp eq i32 %3364, 0, !dbg !58 + %3365 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i467, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3366 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i467, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i469 = select i1 %.not2.i468, float %3366, float %3365, !dbg !58 + %3367 = fadd float %.04.i469, 0xC168000FE0000000, !dbg !58 + %3368 = fneg float %3367, !dbg !58 + %3369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i470 = icmp eq i32 %3369, 0, !dbg !58 + %3370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3FF7154760000000, float %3368) #6, !dbg !58 + %3371 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3FF7154760000000, float %3368) #6, !dbg !58 + %.0.i471 = select i1 %.not3.i470, float %3371, float %3370, !dbg !58 + %3372 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i472 = icmp eq i32 %3372, 0, !dbg !58 + %3373 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3137, float 0x3E54AE0C00000000, float %.0.i471) #6, !dbg !58 + %3374 = tail call float @llvm.nvvm.fma.rn.f(float %3137, float 0x3E54AE0C00000000, float %.0.i471) #6, !dbg !58 + %.01.i473 = select i1 %.not4.i472, float %3374, float %3373, !dbg !58 + %3375 = bitcast float %.04.i469 to i32, !dbg !58 + %3376 = shl i32 %3375, 23, !dbg !58 + %3377 = bitcast i32 %3376 to float, !dbg !58 + %3378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i473) #6, !dbg !58 + %3379 = fmul float %3378, %3377, !dbg !58 + %3380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i474 = icmp eq i32 %3380, 0, !dbg !58 + %3381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3382 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i475 = select i1 %.not.i474, float %3382, float %3381, !dbg !58 + %3383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i476 = icmp eq i32 %3383, 0, !dbg !58 + %3384 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i475) #6, !dbg !58 + %3385 = tail call float @llvm.nvvm.saturate.f(float %.02.i475) #6, !dbg !58 + %.03.i477 = select i1 %.not1.i476, float %3385, float %3384, !dbg !58 + %3386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i478 = icmp eq i32 %3386, 0, !dbg !58 + %3387 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i477, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3388 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i477, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i479 = select i1 %.not2.i478, float %3388, float %3387, !dbg !58 + %3389 = fadd float %.04.i479, 0xC168000FE0000000, !dbg !58 + %3390 = fneg float %3389, !dbg !58 + %3391 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i480 = icmp eq i32 %3391, 0, !dbg !58 + %3392 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3FF7154760000000, float %3390) #6, !dbg !58 + %3393 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3FF7154760000000, float %3390) #6, !dbg !58 + %.0.i481 = select i1 %.not3.i480, float %3393, float %3392, !dbg !58 + %3394 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i482 = icmp eq i32 %3394, 0, !dbg !58 + %3395 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3138, float 0x3E54AE0C00000000, float %.0.i481) #6, !dbg !58 + %3396 = tail call float @llvm.nvvm.fma.rn.f(float %3138, float 0x3E54AE0C00000000, float %.0.i481) #6, !dbg !58 + %.01.i483 = select i1 %.not4.i482, float %3396, float %3395, !dbg !58 + %3397 = bitcast float %.04.i479 to i32, !dbg !58 + %3398 = shl i32 %3397, 23, !dbg !58 + %3399 = bitcast i32 %3398 to float, !dbg !58 + %3400 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i483) #6, !dbg !58 + %3401 = fmul float %3400, %3399, !dbg !58 + %3402 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i484 = icmp eq i32 %3402, 0, !dbg !58 + %3403 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3404 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i485 = select i1 %.not.i484, float %3404, float %3403, !dbg !58 + %3405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i486 = icmp eq i32 %3405, 0, !dbg !58 + %3406 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i485) #6, !dbg !58 + %3407 = tail call float @llvm.nvvm.saturate.f(float %.02.i485) #6, !dbg !58 + %.03.i487 = select i1 %.not1.i486, float %3407, float %3406, !dbg !58 + %3408 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i488 = icmp eq i32 %3408, 0, !dbg !58 + %3409 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i487, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3410 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i487, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i489 = select i1 %.not2.i488, float %3410, float %3409, !dbg !58 + %3411 = fadd float %.04.i489, 0xC168000FE0000000, !dbg !58 + %3412 = fneg float %3411, !dbg !58 + %3413 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i490 = icmp eq i32 %3413, 0, !dbg !58 + %3414 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3FF7154760000000, float %3412) #6, !dbg !58 + %3415 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3FF7154760000000, float %3412) #6, !dbg !58 + %.0.i491 = select i1 %.not3.i490, float %3415, float %3414, !dbg !58 + %3416 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i492 = icmp eq i32 %3416, 0, !dbg !58 + %3417 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3139, float 0x3E54AE0C00000000, float %.0.i491) #6, !dbg !58 + %3418 = tail call float @llvm.nvvm.fma.rn.f(float %3139, float 0x3E54AE0C00000000, float %.0.i491) #6, !dbg !58 + %.01.i493 = select i1 %.not4.i492, float %3418, float %3417, !dbg !58 + %3419 = bitcast float %.04.i489 to i32, !dbg !58 + %3420 = shl i32 %3419, 23, !dbg !58 + %3421 = bitcast i32 %3420 to float, !dbg !58 + %3422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i493) #6, !dbg !58 + %3423 = fmul float %3422, %3421, !dbg !58 + %3424 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i494 = icmp eq i32 %3424, 0, !dbg !58 + %3425 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3426 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i495 = select i1 %.not.i494, float %3426, float %3425, !dbg !58 + %3427 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i496 = icmp eq i32 %3427, 0, !dbg !58 + %3428 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i495) #6, !dbg !58 + %3429 = tail call float @llvm.nvvm.saturate.f(float %.02.i495) #6, !dbg !58 + %.03.i497 = select i1 %.not1.i496, float %3429, float %3428, !dbg !58 + %3430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i498 = icmp eq i32 %3430, 0, !dbg !58 + %3431 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i497, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3432 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i497, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i499 = select i1 %.not2.i498, float %3432, float %3431, !dbg !58 + %3433 = fadd float %.04.i499, 0xC168000FE0000000, !dbg !58 + %3434 = fneg float %3433, !dbg !58 + %3435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i500 = icmp eq i32 %3435, 0, !dbg !58 + %3436 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3FF7154760000000, float %3434) #6, !dbg !58 + %3437 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3FF7154760000000, float %3434) #6, !dbg !58 + %.0.i501 = select i1 %.not3.i500, float %3437, float %3436, !dbg !58 + %3438 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i502 = icmp eq i32 %3438, 0, !dbg !58 + %3439 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3140, float 0x3E54AE0C00000000, float %.0.i501) #6, !dbg !58 + %3440 = tail call float @llvm.nvvm.fma.rn.f(float %3140, float 0x3E54AE0C00000000, float %.0.i501) #6, !dbg !58 + %.01.i503 = select i1 %.not4.i502, float %3440, float %3439, !dbg !58 + %3441 = bitcast float %.04.i499 to i32, !dbg !58 + %3442 = shl i32 %3441, 23, !dbg !58 + %3443 = bitcast i32 %3442 to float, !dbg !58 + %3444 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i503) #6, !dbg !58 + %3445 = fmul float %3444, %3443, !dbg !58 + %3446 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i504 = icmp eq i32 %3446, 0, !dbg !58 + %3447 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3448 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i505 = select i1 %.not.i504, float %3448, float %3447, !dbg !58 + %3449 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i506 = icmp eq i32 %3449, 0, !dbg !58 + %3450 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i505) #6, !dbg !58 + %3451 = tail call float @llvm.nvvm.saturate.f(float %.02.i505) #6, !dbg !58 + %.03.i507 = select i1 %.not1.i506, float %3451, float %3450, !dbg !58 + %3452 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i508 = icmp eq i32 %3452, 0, !dbg !58 + %3453 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i507, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3454 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i507, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i509 = select i1 %.not2.i508, float %3454, float %3453, !dbg !58 + %3455 = fadd float %.04.i509, 0xC168000FE0000000, !dbg !58 + %3456 = fneg float %3455, !dbg !58 + %3457 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i510 = icmp eq i32 %3457, 0, !dbg !58 + %3458 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3FF7154760000000, float %3456) #6, !dbg !58 + %3459 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3FF7154760000000, float %3456) #6, !dbg !58 + %.0.i511 = select i1 %.not3.i510, float %3459, float %3458, !dbg !58 + %3460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i512 = icmp eq i32 %3460, 0, !dbg !58 + %3461 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3141, float 0x3E54AE0C00000000, float %.0.i511) #6, !dbg !58 + %3462 = tail call float @llvm.nvvm.fma.rn.f(float %3141, float 0x3E54AE0C00000000, float %.0.i511) #6, !dbg !58 + %.01.i513 = select i1 %.not4.i512, float %3462, float %3461, !dbg !58 + %3463 = bitcast float %.04.i509 to i32, !dbg !58 + %3464 = shl i32 %3463, 23, !dbg !58 + %3465 = bitcast i32 %3464 to float, !dbg !58 + %3466 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i513) #6, !dbg !58 + %3467 = fmul float %3466, %3465, !dbg !58 + %3468 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i514 = icmp eq i32 %3468, 0, !dbg !58 + %3469 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3470 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i515 = select i1 %.not.i514, float %3470, float %3469, !dbg !58 + %3471 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i516 = icmp eq i32 %3471, 0, !dbg !58 + %3472 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i515) #6, !dbg !58 + %3473 = tail call float @llvm.nvvm.saturate.f(float %.02.i515) #6, !dbg !58 + %.03.i517 = select i1 %.not1.i516, float %3473, float %3472, !dbg !58 + %3474 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i518 = icmp eq i32 %3474, 0, !dbg !58 + %3475 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i517, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3476 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i517, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i519 = select i1 %.not2.i518, float %3476, float %3475, !dbg !58 + %3477 = fadd float %.04.i519, 0xC168000FE0000000, !dbg !58 + %3478 = fneg float %3477, !dbg !58 + %3479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i520 = icmp eq i32 %3479, 0, !dbg !58 + %3480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3FF7154760000000, float %3478) #6, !dbg !58 + %3481 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3FF7154760000000, float %3478) #6, !dbg !58 + %.0.i521 = select i1 %.not3.i520, float %3481, float %3480, !dbg !58 + %3482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i522 = icmp eq i32 %3482, 0, !dbg !58 + %3483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3142, float 0x3E54AE0C00000000, float %.0.i521) #6, !dbg !58 + %3484 = tail call float @llvm.nvvm.fma.rn.f(float %3142, float 0x3E54AE0C00000000, float %.0.i521) #6, !dbg !58 + %.01.i523 = select i1 %.not4.i522, float %3484, float %3483, !dbg !58 + %3485 = bitcast float %.04.i519 to i32, !dbg !58 + %3486 = shl i32 %3485, 23, !dbg !58 + %3487 = bitcast i32 %3486 to float, !dbg !58 + %3488 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i523) #6, !dbg !58 + %3489 = fmul float %3488, %3487, !dbg !58 + %3490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i524 = icmp eq i32 %3490, 0, !dbg !58 + %3491 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3492 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i525 = select i1 %.not.i524, float %3492, float %3491, !dbg !58 + %3493 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i526 = icmp eq i32 %3493, 0, !dbg !58 + %3494 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i525) #6, !dbg !58 + %3495 = tail call float @llvm.nvvm.saturate.f(float %.02.i525) #6, !dbg !58 + %.03.i527 = select i1 %.not1.i526, float %3495, float %3494, !dbg !58 + %3496 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i528 = icmp eq i32 %3496, 0, !dbg !58 + %3497 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i527, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3498 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i527, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i529 = select i1 %.not2.i528, float %3498, float %3497, !dbg !58 + %3499 = fadd float %.04.i529, 0xC168000FE0000000, !dbg !58 + %3500 = fneg float %3499, !dbg !58 + %3501 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i530 = icmp eq i32 %3501, 0, !dbg !58 + %3502 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3FF7154760000000, float %3500) #6, !dbg !58 + %3503 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3FF7154760000000, float %3500) #6, !dbg !58 + %.0.i531 = select i1 %.not3.i530, float %3503, float %3502, !dbg !58 + %3504 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i532 = icmp eq i32 %3504, 0, !dbg !58 + %3505 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3143, float 0x3E54AE0C00000000, float %.0.i531) #6, !dbg !58 + %3506 = tail call float @llvm.nvvm.fma.rn.f(float %3143, float 0x3E54AE0C00000000, float %.0.i531) #6, !dbg !58 + %.01.i533 = select i1 %.not4.i532, float %3506, float %3505, !dbg !58 + %3507 = bitcast float %.04.i529 to i32, !dbg !58 + %3508 = shl i32 %3507, 23, !dbg !58 + %3509 = bitcast i32 %3508 to float, !dbg !58 + %3510 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i533) #6, !dbg !58 + %3511 = fmul float %3510, %3509, !dbg !58 + %3512 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i534 = icmp eq i32 %3512, 0, !dbg !58 + %3513 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3514 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i535 = select i1 %.not.i534, float %3514, float %3513, !dbg !58 + %3515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i536 = icmp eq i32 %3515, 0, !dbg !58 + %3516 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i535) #6, !dbg !58 + %3517 = tail call float @llvm.nvvm.saturate.f(float %.02.i535) #6, !dbg !58 + %.03.i537 = select i1 %.not1.i536, float %3517, float %3516, !dbg !58 + %3518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i538 = icmp eq i32 %3518, 0, !dbg !58 + %3519 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i537, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3520 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i537, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i539 = select i1 %.not2.i538, float %3520, float %3519, !dbg !58 + %3521 = fadd float %.04.i539, 0xC168000FE0000000, !dbg !58 + %3522 = fneg float %3521, !dbg !58 + %3523 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i540 = icmp eq i32 %3523, 0, !dbg !58 + %3524 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3FF7154760000000, float %3522) #6, !dbg !58 + %3525 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3FF7154760000000, float %3522) #6, !dbg !58 + %.0.i541 = select i1 %.not3.i540, float %3525, float %3524, !dbg !58 + %3526 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i542 = icmp eq i32 %3526, 0, !dbg !58 + %3527 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3144, float 0x3E54AE0C00000000, float %.0.i541) #6, !dbg !58 + %3528 = tail call float @llvm.nvvm.fma.rn.f(float %3144, float 0x3E54AE0C00000000, float %.0.i541) #6, !dbg !58 + %.01.i543 = select i1 %.not4.i542, float %3528, float %3527, !dbg !58 + %3529 = bitcast float %.04.i539 to i32, !dbg !58 + %3530 = shl i32 %3529, 23, !dbg !58 + %3531 = bitcast i32 %3530 to float, !dbg !58 + %3532 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i543) #6, !dbg !58 + %3533 = fmul float %3532, %3531, !dbg !58 + %3534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i544 = icmp eq i32 %3534, 0, !dbg !58 + %3535 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3536 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i545 = select i1 %.not.i544, float %3536, float %3535, !dbg !58 + %3537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i546 = icmp eq i32 %3537, 0, !dbg !58 + %3538 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i545) #6, !dbg !58 + %3539 = tail call float @llvm.nvvm.saturate.f(float %.02.i545) #6, !dbg !58 + %.03.i547 = select i1 %.not1.i546, float %3539, float %3538, !dbg !58 + %3540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i548 = icmp eq i32 %3540, 0, !dbg !58 + %3541 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i547, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3542 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i547, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i549 = select i1 %.not2.i548, float %3542, float %3541, !dbg !58 + %3543 = fadd float %.04.i549, 0xC168000FE0000000, !dbg !58 + %3544 = fneg float %3543, !dbg !58 + %3545 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i550 = icmp eq i32 %3545, 0, !dbg !58 + %3546 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3FF7154760000000, float %3544) #6, !dbg !58 + %3547 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3FF7154760000000, float %3544) #6, !dbg !58 + %.0.i551 = select i1 %.not3.i550, float %3547, float %3546, !dbg !58 + %3548 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i552 = icmp eq i32 %3548, 0, !dbg !58 + %3549 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3145, float 0x3E54AE0C00000000, float %.0.i551) #6, !dbg !58 + %3550 = tail call float @llvm.nvvm.fma.rn.f(float %3145, float 0x3E54AE0C00000000, float %.0.i551) #6, !dbg !58 + %.01.i553 = select i1 %.not4.i552, float %3550, float %3549, !dbg !58 + %3551 = bitcast float %.04.i549 to i32, !dbg !58 + %3552 = shl i32 %3551, 23, !dbg !58 + %3553 = bitcast i32 %3552 to float, !dbg !58 + %3554 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i553) #6, !dbg !58 + %3555 = fmul float %3554, %3553, !dbg !58 + %3556 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i554 = icmp eq i32 %3556, 0, !dbg !58 + %3557 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3558 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i555 = select i1 %.not.i554, float %3558, float %3557, !dbg !58 + %3559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i556 = icmp eq i32 %3559, 0, !dbg !58 + %3560 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i555) #6, !dbg !58 + %3561 = tail call float @llvm.nvvm.saturate.f(float %.02.i555) #6, !dbg !58 + %.03.i557 = select i1 %.not1.i556, float %3561, float %3560, !dbg !58 + %3562 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i558 = icmp eq i32 %3562, 0, !dbg !58 + %3563 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i557, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3564 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i557, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i559 = select i1 %.not2.i558, float %3564, float %3563, !dbg !58 + %3565 = fadd float %.04.i559, 0xC168000FE0000000, !dbg !58 + %3566 = fneg float %3565, !dbg !58 + %3567 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i560 = icmp eq i32 %3567, 0, !dbg !58 + %3568 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3FF7154760000000, float %3566) #6, !dbg !58 + %3569 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3FF7154760000000, float %3566) #6, !dbg !58 + %.0.i561 = select i1 %.not3.i560, float %3569, float %3568, !dbg !58 + %3570 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i562 = icmp eq i32 %3570, 0, !dbg !58 + %3571 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3146, float 0x3E54AE0C00000000, float %.0.i561) #6, !dbg !58 + %3572 = tail call float @llvm.nvvm.fma.rn.f(float %3146, float 0x3E54AE0C00000000, float %.0.i561) #6, !dbg !58 + %.01.i563 = select i1 %.not4.i562, float %3572, float %3571, !dbg !58 + %3573 = bitcast float %.04.i559 to i32, !dbg !58 + %3574 = shl i32 %3573, 23, !dbg !58 + %3575 = bitcast i32 %3574 to float, !dbg !58 + %3576 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i563) #6, !dbg !58 + %3577 = fmul float %3576, %3575, !dbg !58 + %3578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i564 = icmp eq i32 %3578, 0, !dbg !58 + %3579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3580 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i565 = select i1 %.not.i564, float %3580, float %3579, !dbg !58 + %3581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i566 = icmp eq i32 %3581, 0, !dbg !58 + %3582 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i565) #6, !dbg !58 + %3583 = tail call float @llvm.nvvm.saturate.f(float %.02.i565) #6, !dbg !58 + %.03.i567 = select i1 %.not1.i566, float %3583, float %3582, !dbg !58 + %3584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i568 = icmp eq i32 %3584, 0, !dbg !58 + %3585 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i567, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3586 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i567, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i569 = select i1 %.not2.i568, float %3586, float %3585, !dbg !58 + %3587 = fadd float %.04.i569, 0xC168000FE0000000, !dbg !58 + %3588 = fneg float %3587, !dbg !58 + %3589 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i570 = icmp eq i32 %3589, 0, !dbg !58 + %3590 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3FF7154760000000, float %3588) #6, !dbg !58 + %3591 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3FF7154760000000, float %3588) #6, !dbg !58 + %.0.i571 = select i1 %.not3.i570, float %3591, float %3590, !dbg !58 + %3592 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i572 = icmp eq i32 %3592, 0, !dbg !58 + %3593 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3147, float 0x3E54AE0C00000000, float %.0.i571) #6, !dbg !58 + %3594 = tail call float @llvm.nvvm.fma.rn.f(float %3147, float 0x3E54AE0C00000000, float %.0.i571) #6, !dbg !58 + %.01.i573 = select i1 %.not4.i572, float %3594, float %3593, !dbg !58 + %3595 = bitcast float %.04.i569 to i32, !dbg !58 + %3596 = shl i32 %3595, 23, !dbg !58 + %3597 = bitcast i32 %3596 to float, !dbg !58 + %3598 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i573) #6, !dbg !58 + %3599 = fmul float %3598, %3597, !dbg !58 + %3600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i574 = icmp eq i32 %3600, 0, !dbg !58 + %3601 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3602 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i575 = select i1 %.not.i574, float %3602, float %3601, !dbg !58 + %3603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i576 = icmp eq i32 %3603, 0, !dbg !58 + %3604 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i575) #6, !dbg !58 + %3605 = tail call float @llvm.nvvm.saturate.f(float %.02.i575) #6, !dbg !58 + %.03.i577 = select i1 %.not1.i576, float %3605, float %3604, !dbg !58 + %3606 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i578 = icmp eq i32 %3606, 0, !dbg !58 + %3607 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i577, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3608 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i577, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i579 = select i1 %.not2.i578, float %3608, float %3607, !dbg !58 + %3609 = fadd float %.04.i579, 0xC168000FE0000000, !dbg !58 + %3610 = fneg float %3609, !dbg !58 + %3611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i580 = icmp eq i32 %3611, 0, !dbg !58 + %3612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3FF7154760000000, float %3610) #6, !dbg !58 + %3613 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3FF7154760000000, float %3610) #6, !dbg !58 + %.0.i581 = select i1 %.not3.i580, float %3613, float %3612, !dbg !58 + %3614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i582 = icmp eq i32 %3614, 0, !dbg !58 + %3615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3148, float 0x3E54AE0C00000000, float %.0.i581) #6, !dbg !58 + %3616 = tail call float @llvm.nvvm.fma.rn.f(float %3148, float 0x3E54AE0C00000000, float %.0.i581) #6, !dbg !58 + %.01.i583 = select i1 %.not4.i582, float %3616, float %3615, !dbg !58 + %3617 = bitcast float %.04.i579 to i32, !dbg !58 + %3618 = shl i32 %3617, 23, !dbg !58 + %3619 = bitcast i32 %3618 to float, !dbg !58 + %3620 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i583) #6, !dbg !58 + %3621 = fmul float %3620, %3619, !dbg !58 + %3622 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i584 = icmp eq i32 %3622, 0, !dbg !58 + %3623 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3624 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i585 = select i1 %.not.i584, float %3624, float %3623, !dbg !58 + %3625 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i586 = icmp eq i32 %3625, 0, !dbg !58 + %3626 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i585) #6, !dbg !58 + %3627 = tail call float @llvm.nvvm.saturate.f(float %.02.i585) #6, !dbg !58 + %.03.i587 = select i1 %.not1.i586, float %3627, float %3626, !dbg !58 + %3628 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i588 = icmp eq i32 %3628, 0, !dbg !58 + %3629 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i587, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3630 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i587, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i589 = select i1 %.not2.i588, float %3630, float %3629, !dbg !58 + %3631 = fadd float %.04.i589, 0xC168000FE0000000, !dbg !58 + %3632 = fneg float %3631, !dbg !58 + %3633 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i590 = icmp eq i32 %3633, 0, !dbg !58 + %3634 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3FF7154760000000, float %3632) #6, !dbg !58 + %3635 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3FF7154760000000, float %3632) #6, !dbg !58 + %.0.i591 = select i1 %.not3.i590, float %3635, float %3634, !dbg !58 + %3636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i592 = icmp eq i32 %3636, 0, !dbg !58 + %3637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3149, float 0x3E54AE0C00000000, float %.0.i591) #6, !dbg !58 + %3638 = tail call float @llvm.nvvm.fma.rn.f(float %3149, float 0x3E54AE0C00000000, float %.0.i591) #6, !dbg !58 + %.01.i593 = select i1 %.not4.i592, float %3638, float %3637, !dbg !58 + %3639 = bitcast float %.04.i589 to i32, !dbg !58 + %3640 = shl i32 %3639, 23, !dbg !58 + %3641 = bitcast i32 %3640 to float, !dbg !58 + %3642 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i593) #6, !dbg !58 + %3643 = fmul float %3642, %3641, !dbg !58 + %3644 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i594 = icmp eq i32 %3644, 0, !dbg !58 + %3645 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3646 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i595 = select i1 %.not.i594, float %3646, float %3645, !dbg !58 + %3647 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i596 = icmp eq i32 %3647, 0, !dbg !58 + %3648 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i595) #6, !dbg !58 + %3649 = tail call float @llvm.nvvm.saturate.f(float %.02.i595) #6, !dbg !58 + %.03.i597 = select i1 %.not1.i596, float %3649, float %3648, !dbg !58 + %3650 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i598 = icmp eq i32 %3650, 0, !dbg !58 + %3651 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i597, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3652 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i597, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i599 = select i1 %.not2.i598, float %3652, float %3651, !dbg !58 + %3653 = fadd float %.04.i599, 0xC168000FE0000000, !dbg !58 + %3654 = fneg float %3653, !dbg !58 + %3655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i600 = icmp eq i32 %3655, 0, !dbg !58 + %3656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3FF7154760000000, float %3654) #6, !dbg !58 + %3657 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3FF7154760000000, float %3654) #6, !dbg !58 + %.0.i601 = select i1 %.not3.i600, float %3657, float %3656, !dbg !58 + %3658 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i602 = icmp eq i32 %3658, 0, !dbg !58 + %3659 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3150, float 0x3E54AE0C00000000, float %.0.i601) #6, !dbg !58 + %3660 = tail call float @llvm.nvvm.fma.rn.f(float %3150, float 0x3E54AE0C00000000, float %.0.i601) #6, !dbg !58 + %.01.i603 = select i1 %.not4.i602, float %3660, float %3659, !dbg !58 + %3661 = bitcast float %.04.i599 to i32, !dbg !58 + %3662 = shl i32 %3661, 23, !dbg !58 + %3663 = bitcast i32 %3662 to float, !dbg !58 + %3664 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i603) #6, !dbg !58 + %3665 = fmul float %3664, %3663, !dbg !58 + %3666 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i604 = icmp eq i32 %3666, 0, !dbg !58 + %3667 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3668 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i605 = select i1 %.not.i604, float %3668, float %3667, !dbg !58 + %3669 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i606 = icmp eq i32 %3669, 0, !dbg !58 + %3670 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i605) #6, !dbg !58 + %3671 = tail call float @llvm.nvvm.saturate.f(float %.02.i605) #6, !dbg !58 + %.03.i607 = select i1 %.not1.i606, float %3671, float %3670, !dbg !58 + %3672 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i608 = icmp eq i32 %3672, 0, !dbg !58 + %3673 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i607, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3674 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i607, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i609 = select i1 %.not2.i608, float %3674, float %3673, !dbg !58 + %3675 = fadd float %.04.i609, 0xC168000FE0000000, !dbg !58 + %3676 = fneg float %3675, !dbg !58 + %3677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i610 = icmp eq i32 %3677, 0, !dbg !58 + %3678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3FF7154760000000, float %3676) #6, !dbg !58 + %3679 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3FF7154760000000, float %3676) #6, !dbg !58 + %.0.i611 = select i1 %.not3.i610, float %3679, float %3678, !dbg !58 + %3680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i612 = icmp eq i32 %3680, 0, !dbg !58 + %3681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3151, float 0x3E54AE0C00000000, float %.0.i611) #6, !dbg !58 + %3682 = tail call float @llvm.nvvm.fma.rn.f(float %3151, float 0x3E54AE0C00000000, float %.0.i611) #6, !dbg !58 + %.01.i613 = select i1 %.not4.i612, float %3682, float %3681, !dbg !58 + %3683 = bitcast float %.04.i609 to i32, !dbg !58 + %3684 = shl i32 %3683, 23, !dbg !58 + %3685 = bitcast i32 %3684 to float, !dbg !58 + %3686 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i613) #6, !dbg !58 + %3687 = fmul float %3686, %3685, !dbg !58 + %3688 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i614 = icmp eq i32 %3688, 0, !dbg !58 + %3689 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3690 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i615 = select i1 %.not.i614, float %3690, float %3689, !dbg !58 + %3691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i616 = icmp eq i32 %3691, 0, !dbg !58 + %3692 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i615) #6, !dbg !58 + %3693 = tail call float @llvm.nvvm.saturate.f(float %.02.i615) #6, !dbg !58 + %.03.i617 = select i1 %.not1.i616, float %3693, float %3692, !dbg !58 + %3694 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i618 = icmp eq i32 %3694, 0, !dbg !58 + %3695 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i617, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3696 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i617, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i619 = select i1 %.not2.i618, float %3696, float %3695, !dbg !58 + %3697 = fadd float %.04.i619, 0xC168000FE0000000, !dbg !58 + %3698 = fneg float %3697, !dbg !58 + %3699 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i620 = icmp eq i32 %3699, 0, !dbg !58 + %3700 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3FF7154760000000, float %3698) #6, !dbg !58 + %3701 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3FF7154760000000, float %3698) #6, !dbg !58 + %.0.i621 = select i1 %.not3.i620, float %3701, float %3700, !dbg !58 + %3702 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i622 = icmp eq i32 %3702, 0, !dbg !58 + %3703 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3152, float 0x3E54AE0C00000000, float %.0.i621) #6, !dbg !58 + %3704 = tail call float @llvm.nvvm.fma.rn.f(float %3152, float 0x3E54AE0C00000000, float %.0.i621) #6, !dbg !58 + %.01.i623 = select i1 %.not4.i622, float %3704, float %3703, !dbg !58 + %3705 = bitcast float %.04.i619 to i32, !dbg !58 + %3706 = shl i32 %3705, 23, !dbg !58 + %3707 = bitcast i32 %3706 to float, !dbg !58 + %3708 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i623) #6, !dbg !58 + %3709 = fmul float %3708, %3707, !dbg !58 + %3710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i624 = icmp eq i32 %3710, 0, !dbg !58 + %3711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3712 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i625 = select i1 %.not.i624, float %3712, float %3711, !dbg !58 + %3713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i626 = icmp eq i32 %3713, 0, !dbg !58 + %3714 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i625) #6, !dbg !58 + %3715 = tail call float @llvm.nvvm.saturate.f(float %.02.i625) #6, !dbg !58 + %.03.i627 = select i1 %.not1.i626, float %3715, float %3714, !dbg !58 + %3716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i628 = icmp eq i32 %3716, 0, !dbg !58 + %3717 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i627, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3718 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i627, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i629 = select i1 %.not2.i628, float %3718, float %3717, !dbg !58 + %3719 = fadd float %.04.i629, 0xC168000FE0000000, !dbg !58 + %3720 = fneg float %3719, !dbg !58 + %3721 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i630 = icmp eq i32 %3721, 0, !dbg !58 + %3722 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3FF7154760000000, float %3720) #6, !dbg !58 + %3723 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3FF7154760000000, float %3720) #6, !dbg !58 + %.0.i631 = select i1 %.not3.i630, float %3723, float %3722, !dbg !58 + %3724 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i632 = icmp eq i32 %3724, 0, !dbg !58 + %3725 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3153, float 0x3E54AE0C00000000, float %.0.i631) #6, !dbg !58 + %3726 = tail call float @llvm.nvvm.fma.rn.f(float %3153, float 0x3E54AE0C00000000, float %.0.i631) #6, !dbg !58 + %.01.i633 = select i1 %.not4.i632, float %3726, float %3725, !dbg !58 + %3727 = bitcast float %.04.i629 to i32, !dbg !58 + %3728 = shl i32 %3727, 23, !dbg !58 + %3729 = bitcast i32 %3728 to float, !dbg !58 + %3730 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i633) #6, !dbg !58 + %3731 = fmul float %3730, %3729, !dbg !58 + %3732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i634 = icmp eq i32 %3732, 0, !dbg !58 + %3733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3734 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i635 = select i1 %.not.i634, float %3734, float %3733, !dbg !58 + %3735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i636 = icmp eq i32 %3735, 0, !dbg !58 + %3736 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i635) #6, !dbg !58 + %3737 = tail call float @llvm.nvvm.saturate.f(float %.02.i635) #6, !dbg !58 + %.03.i637 = select i1 %.not1.i636, float %3737, float %3736, !dbg !58 + %3738 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i638 = icmp eq i32 %3738, 0, !dbg !58 + %3739 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i637, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3740 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i637, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i639 = select i1 %.not2.i638, float %3740, float %3739, !dbg !58 + %3741 = fadd float %.04.i639, 0xC168000FE0000000, !dbg !58 + %3742 = fneg float %3741, !dbg !58 + %3743 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i640 = icmp eq i32 %3743, 0, !dbg !58 + %3744 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3FF7154760000000, float %3742) #6, !dbg !58 + %3745 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3FF7154760000000, float %3742) #6, !dbg !58 + %.0.i641 = select i1 %.not3.i640, float %3745, float %3744, !dbg !58 + %3746 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i642 = icmp eq i32 %3746, 0, !dbg !58 + %3747 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3154, float 0x3E54AE0C00000000, float %.0.i641) #6, !dbg !58 + %3748 = tail call float @llvm.nvvm.fma.rn.f(float %3154, float 0x3E54AE0C00000000, float %.0.i641) #6, !dbg !58 + %.01.i643 = select i1 %.not4.i642, float %3748, float %3747, !dbg !58 + %3749 = bitcast float %.04.i639 to i32, !dbg !58 + %3750 = shl i32 %3749, 23, !dbg !58 + %3751 = bitcast i32 %3750 to float, !dbg !58 + %3752 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i643) #6, !dbg !58 + %3753 = fmul float %3752, %3751, !dbg !58 + %3754 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i644 = icmp eq i32 %3754, 0, !dbg !58 + %3755 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3756 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i645 = select i1 %.not.i644, float %3756, float %3755, !dbg !58 + %3757 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i646 = icmp eq i32 %3757, 0, !dbg !58 + %3758 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i645) #6, !dbg !58 + %3759 = tail call float @llvm.nvvm.saturate.f(float %.02.i645) #6, !dbg !58 + %.03.i647 = select i1 %.not1.i646, float %3759, float %3758, !dbg !58 + %3760 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i648 = icmp eq i32 %3760, 0, !dbg !58 + %3761 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i647, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3762 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i647, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i649 = select i1 %.not2.i648, float %3762, float %3761, !dbg !58 + %3763 = fadd float %.04.i649, 0xC168000FE0000000, !dbg !58 + %3764 = fneg float %3763, !dbg !58 + %3765 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i650 = icmp eq i32 %3765, 0, !dbg !58 + %3766 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3FF7154760000000, float %3764) #6, !dbg !58 + %3767 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3FF7154760000000, float %3764) #6, !dbg !58 + %.0.i651 = select i1 %.not3.i650, float %3767, float %3766, !dbg !58 + %3768 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i652 = icmp eq i32 %3768, 0, !dbg !58 + %3769 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3155, float 0x3E54AE0C00000000, float %.0.i651) #6, !dbg !58 + %3770 = tail call float @llvm.nvvm.fma.rn.f(float %3155, float 0x3E54AE0C00000000, float %.0.i651) #6, !dbg !58 + %.01.i653 = select i1 %.not4.i652, float %3770, float %3769, !dbg !58 + %3771 = bitcast float %.04.i649 to i32, !dbg !58 + %3772 = shl i32 %3771, 23, !dbg !58 + %3773 = bitcast i32 %3772 to float, !dbg !58 + %3774 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i653) #6, !dbg !58 + %3775 = fmul float %3774, %3773, !dbg !58 + %3776 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i654 = icmp eq i32 %3776, 0, !dbg !58 + %3777 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3778 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i655 = select i1 %.not.i654, float %3778, float %3777, !dbg !58 + %3779 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i656 = icmp eq i32 %3779, 0, !dbg !58 + %3780 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i655) #6, !dbg !58 + %3781 = tail call float @llvm.nvvm.saturate.f(float %.02.i655) #6, !dbg !58 + %.03.i657 = select i1 %.not1.i656, float %3781, float %3780, !dbg !58 + %3782 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i658 = icmp eq i32 %3782, 0, !dbg !58 + %3783 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i657, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3784 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i657, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i659 = select i1 %.not2.i658, float %3784, float %3783, !dbg !58 + %3785 = fadd float %.04.i659, 0xC168000FE0000000, !dbg !58 + %3786 = fneg float %3785, !dbg !58 + %3787 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i660 = icmp eq i32 %3787, 0, !dbg !58 + %3788 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3FF7154760000000, float %3786) #6, !dbg !58 + %3789 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3FF7154760000000, float %3786) #6, !dbg !58 + %.0.i661 = select i1 %.not3.i660, float %3789, float %3788, !dbg !58 + %3790 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i662 = icmp eq i32 %3790, 0, !dbg !58 + %3791 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3156, float 0x3E54AE0C00000000, float %.0.i661) #6, !dbg !58 + %3792 = tail call float @llvm.nvvm.fma.rn.f(float %3156, float 0x3E54AE0C00000000, float %.0.i661) #6, !dbg !58 + %.01.i663 = select i1 %.not4.i662, float %3792, float %3791, !dbg !58 + %3793 = bitcast float %.04.i659 to i32, !dbg !58 + %3794 = shl i32 %3793, 23, !dbg !58 + %3795 = bitcast i32 %3794 to float, !dbg !58 + %3796 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i663) #6, !dbg !58 + %3797 = fmul float %3796, %3795, !dbg !58 + %3798 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i664 = icmp eq i32 %3798, 0, !dbg !58 + %3799 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3800 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i665 = select i1 %.not.i664, float %3800, float %3799, !dbg !58 + %3801 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i666 = icmp eq i32 %3801, 0, !dbg !58 + %3802 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i665) #6, !dbg !58 + %3803 = tail call float @llvm.nvvm.saturate.f(float %.02.i665) #6, !dbg !58 + %.03.i667 = select i1 %.not1.i666, float %3803, float %3802, !dbg !58 + %3804 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i668 = icmp eq i32 %3804, 0, !dbg !58 + %3805 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i667, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3806 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i667, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i669 = select i1 %.not2.i668, float %3806, float %3805, !dbg !58 + %3807 = fadd float %.04.i669, 0xC168000FE0000000, !dbg !58 + %3808 = fneg float %3807, !dbg !58 + %3809 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i670 = icmp eq i32 %3809, 0, !dbg !58 + %3810 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3FF7154760000000, float %3808) #6, !dbg !58 + %3811 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3FF7154760000000, float %3808) #6, !dbg !58 + %.0.i671 = select i1 %.not3.i670, float %3811, float %3810, !dbg !58 + %3812 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i672 = icmp eq i32 %3812, 0, !dbg !58 + %3813 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3157, float 0x3E54AE0C00000000, float %.0.i671) #6, !dbg !58 + %3814 = tail call float @llvm.nvvm.fma.rn.f(float %3157, float 0x3E54AE0C00000000, float %.0.i671) #6, !dbg !58 + %.01.i673 = select i1 %.not4.i672, float %3814, float %3813, !dbg !58 + %3815 = bitcast float %.04.i669 to i32, !dbg !58 + %3816 = shl i32 %3815, 23, !dbg !58 + %3817 = bitcast i32 %3816 to float, !dbg !58 + %3818 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i673) #6, !dbg !58 + %3819 = fmul float %3818, %3817, !dbg !58 + %3820 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i674 = icmp eq i32 %3820, 0, !dbg !58 + %3821 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3822 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i675 = select i1 %.not.i674, float %3822, float %3821, !dbg !58 + %3823 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i676 = icmp eq i32 %3823, 0, !dbg !58 + %3824 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i675) #6, !dbg !58 + %3825 = tail call float @llvm.nvvm.saturate.f(float %.02.i675) #6, !dbg !58 + %.03.i677 = select i1 %.not1.i676, float %3825, float %3824, !dbg !58 + %3826 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i678 = icmp eq i32 %3826, 0, !dbg !58 + %3827 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i677, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3828 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i677, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i679 = select i1 %.not2.i678, float %3828, float %3827, !dbg !58 + %3829 = fadd float %.04.i679, 0xC168000FE0000000, !dbg !58 + %3830 = fneg float %3829, !dbg !58 + %3831 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i680 = icmp eq i32 %3831, 0, !dbg !58 + %3832 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3FF7154760000000, float %3830) #6, !dbg !58 + %3833 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3FF7154760000000, float %3830) #6, !dbg !58 + %.0.i681 = select i1 %.not3.i680, float %3833, float %3832, !dbg !58 + %3834 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i682 = icmp eq i32 %3834, 0, !dbg !58 + %3835 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3158, float 0x3E54AE0C00000000, float %.0.i681) #6, !dbg !58 + %3836 = tail call float @llvm.nvvm.fma.rn.f(float %3158, float 0x3E54AE0C00000000, float %.0.i681) #6, !dbg !58 + %.01.i683 = select i1 %.not4.i682, float %3836, float %3835, !dbg !58 + %3837 = bitcast float %.04.i679 to i32, !dbg !58 + %3838 = shl i32 %3837, 23, !dbg !58 + %3839 = bitcast i32 %3838 to float, !dbg !58 + %3840 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i683) #6, !dbg !58 + %3841 = fmul float %3840, %3839, !dbg !58 + %3842 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i684 = icmp eq i32 %3842, 0, !dbg !58 + %3843 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %3844 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3F777313A0000000, float 5.000000e-01) #6, !dbg !58 + %.02.i685 = select i1 %.not.i684, float %3844, float %3843, !dbg !58 + %3845 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not1.i686 = icmp eq i32 %3845, 0, !dbg !58 + %3846 = tail call float @llvm.nvvm.saturate.ftz.f(float %.02.i685) #6, !dbg !58 + %3847 = tail call float @llvm.nvvm.saturate.f(float %.02.i685) #6, !dbg !58 + %.03.i687 = select i1 %.not1.i686, float %3847, float %3846, !dbg !58 + %3848 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not2.i688 = icmp eq i32 %3848, 0, !dbg !58 + %3849 = tail call float @llvm.nvvm.fma.rm.ftz.f(float %.03.i687, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %3850 = tail call float @llvm.nvvm.fma.rm.f(float %.03.i687, float 2.520000e+02, float 0x4168000020000000) #6, !dbg !58 + %.04.i689 = select i1 %.not2.i688, float %3850, float %3849, !dbg !58 + %3851 = fadd float %.04.i689, 0xC168000FE0000000, !dbg !58 + %3852 = fneg float %3851, !dbg !58 + %3853 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not3.i690 = icmp eq i32 %3853, 0, !dbg !58 + %3854 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3FF7154760000000, float %3852) #6, !dbg !58 + %3855 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3FF7154760000000, float %3852) #6, !dbg !58 + %.0.i691 = select i1 %.not3.i690, float %3855, float %3854, !dbg !58 + %3856 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not4.i692 = icmp eq i32 %3856, 0, !dbg !58 + %3857 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %3159, float 0x3E54AE0C00000000, float %.0.i691) #6, !dbg !58 + %3858 = tail call float @llvm.nvvm.fma.rn.f(float %3159, float 0x3E54AE0C00000000, float %.0.i691) #6, !dbg !58 + %.01.i693 = select i1 %.not4.i692, float %3858, float %3857, !dbg !58 + %3859 = bitcast float %.04.i689 to i32, !dbg !58 + %3860 = shl i32 %3859, 23, !dbg !58 + %3861 = bitcast i32 %3860 to float, !dbg !58 + %3862 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.01.i693) #6, !dbg !58 + %3863 = fmul float %3862, %3861, !dbg !58 + %3864 = tail call float @llvm.nvvm.div.full(float %3181, float %2962), !dbg !59 + %3865 = tail call float @llvm.nvvm.div.full(float %3203, float %2962), !dbg !59 + %3866 = tail call float @llvm.nvvm.div.full(float %3225, float %2962), !dbg !59 + %3867 = tail call float @llvm.nvvm.div.full(float %3247, float %2962), !dbg !59 + %3868 = tail call float @llvm.nvvm.div.full(float %3269, float %2962), !dbg !59 + %3869 = tail call float @llvm.nvvm.div.full(float %3291, float %2962), !dbg !59 + %3870 = tail call float @llvm.nvvm.div.full(float %3313, float %2962), !dbg !59 + %3871 = tail call float @llvm.nvvm.div.full(float %3335, float %2962), !dbg !59 + %3872 = tail call float @llvm.nvvm.div.full(float %3357, float %2962), !dbg !59 + %3873 = tail call float @llvm.nvvm.div.full(float %3379, float %2962), !dbg !59 + %3874 = tail call float @llvm.nvvm.div.full(float %3401, float %2962), !dbg !59 + %3875 = tail call float @llvm.nvvm.div.full(float %3423, float %2962), !dbg !59 + %3876 = tail call float @llvm.nvvm.div.full(float %3445, float %2962), !dbg !59 + %3877 = tail call float @llvm.nvvm.div.full(float %3467, float %2962), !dbg !59 + %3878 = tail call float @llvm.nvvm.div.full(float %3489, float %2962), !dbg !59 + %3879 = tail call float @llvm.nvvm.div.full(float %3511, float %2962), !dbg !59 + %3880 = tail call float @llvm.nvvm.div.full(float %3533, float %2962), !dbg !59 + %3881 = tail call float @llvm.nvvm.div.full(float %3555, float %2962), !dbg !59 + %3882 = tail call float @llvm.nvvm.div.full(float %3577, float %2962), !dbg !59 + %3883 = tail call float @llvm.nvvm.div.full(float %3599, float %2962), !dbg !59 + %3884 = tail call float @llvm.nvvm.div.full(float %3621, float %2962), !dbg !59 + %3885 = tail call float @llvm.nvvm.div.full(float %3643, float %2962), !dbg !59 + %3886 = tail call float @llvm.nvvm.div.full(float %3665, float %2962), !dbg !59 + %3887 = tail call float @llvm.nvvm.div.full(float %3687, float %2962), !dbg !59 + %3888 = tail call float @llvm.nvvm.div.full(float %3709, float %2962), !dbg !59 + %3889 = tail call float @llvm.nvvm.div.full(float %3731, float %2962), !dbg !59 + %3890 = tail call float @llvm.nvvm.div.full(float %3753, float %2962), !dbg !59 + %3891 = tail call float @llvm.nvvm.div.full(float %3775, float %2962), !dbg !59 + %3892 = tail call float @llvm.nvvm.div.full(float %3797, float %2962), !dbg !59 + %3893 = tail call float @llvm.nvvm.div.full(float %3819, float %2962), !dbg !59 + %3894 = tail call float @llvm.nvvm.div.full(float %3841, float %2962), !dbg !59 + %3895 = tail call float @llvm.nvvm.div.full(float %3863, float %2962), !dbg !59 + %3896 = sext i32 %2995 to i64, !dbg !60 + %3897 = getelementptr float, ptr addrspace(1) %1, i64 %3896, !dbg !60 + %3898 = sext i32 %2998 to i64, !dbg !60 + %3899 = getelementptr float, ptr addrspace(1) %1, i64 %3898, !dbg !60 + %3900 = sext i32 %3001 to i64, !dbg !60 + %3901 = getelementptr float, ptr addrspace(1) %1, i64 %3900, !dbg !60 + %3902 = sext i32 %3004 to i64, !dbg !60 + %3903 = getelementptr float, ptr addrspace(1) %1, i64 %3902, !dbg !60 + %3904 = sext i32 %3007 to i64, !dbg !60 + %3905 = getelementptr float, ptr addrspace(1) %1, i64 %3904, !dbg !60 + %3906 = sext i32 %3010 to i64, !dbg !60 + %3907 = getelementptr float, ptr addrspace(1) %1, i64 %3906, !dbg !60 + %3908 = sext i32 %3013 to i64, !dbg !60 + %3909 = getelementptr float, ptr addrspace(1) %1, i64 %3908, !dbg !60 + %3910 = sext i32 %3015 to i64, !dbg !60 + %3911 = getelementptr float, ptr addrspace(1) %1, i64 %3910, !dbg !60 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3912 = insertelement <4 x float> poison, float %3864, i64 0, !dbg !61 + %3913 = insertelement <4 x float> %3912, float %3865, i64 1, !dbg !61 + %3914 = insertelement <4 x float> %3913, float %3866, i64 2, !dbg !61 + %3915 = insertelement <4 x float> %3914, float %3867, i64 3, !dbg !61 + store <4 x float> %3915, ptr addrspace(3) %2964, align 16, !dbg !61 + %3916 = insertelement <4 x float> poison, float %3868, i64 0, !dbg !61 + %3917 = insertelement <4 x float> %3916, float %3869, i64 1, !dbg !61 + %3918 = insertelement <4 x float> %3917, float %3870, i64 2, !dbg !61 + %3919 = insertelement <4 x float> %3918, float %3871, i64 3, !dbg !61 + store <4 x float> %3919, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3920 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3921 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3922 = insertelement <4 x float> poison, float %3872, i64 0, !dbg !61 + %3923 = insertelement <4 x float> %3922, float %3873, i64 1, !dbg !61 + %3924 = insertelement <4 x float> %3923, float %3874, i64 2, !dbg !61 + %3925 = insertelement <4 x float> %3924, float %3875, i64 3, !dbg !61 + store <4 x float> %3925, ptr addrspace(3) %2964, align 16, !dbg !61 + %3926 = insertelement <4 x float> poison, float %3876, i64 0, !dbg !61 + %3927 = insertelement <4 x float> %3926, float %3877, i64 1, !dbg !61 + %3928 = insertelement <4 x float> %3927, float %3878, i64 2, !dbg !61 + %3929 = insertelement <4 x float> %3928, float %3879, i64 3, !dbg !61 + store <4 x float> %3929, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3930 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3931 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3932 = insertelement <4 x float> poison, float %3880, i64 0, !dbg !61 + %3933 = insertelement <4 x float> %3932, float %3881, i64 1, !dbg !61 + %3934 = insertelement <4 x float> %3933, float %3882, i64 2, !dbg !61 + %3935 = insertelement <4 x float> %3934, float %3883, i64 3, !dbg !61 + store <4 x float> %3935, ptr addrspace(3) %2964, align 16, !dbg !61 + %3936 = insertelement <4 x float> poison, float %3884, i64 0, !dbg !61 + %3937 = insertelement <4 x float> %3936, float %3885, i64 1, !dbg !61 + %3938 = insertelement <4 x float> %3937, float %3886, i64 2, !dbg !61 + %3939 = insertelement <4 x float> %3938, float %3887, i64 3, !dbg !61 + store <4 x float> %3939, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3940 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3941 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3942 = insertelement <4 x float> poison, float %3888, i64 0, !dbg !61 + %3943 = insertelement <4 x float> %3942, float %3889, i64 1, !dbg !61 + %3944 = insertelement <4 x float> %3943, float %3890, i64 2, !dbg !61 + %3945 = insertelement <4 x float> %3944, float %3891, i64 3, !dbg !61 + store <4 x float> %3945, ptr addrspace(3) %2964, align 16, !dbg !61 + %3946 = insertelement <4 x float> poison, float %3892, i64 0, !dbg !61 + %3947 = insertelement <4 x float> %3946, float %3893, i64 1, !dbg !61 + %3948 = insertelement <4 x float> %3947, float %3894, i64 2, !dbg !61 + %3949 = insertelement <4 x float> %3948, float %3895, i64 3, !dbg !61 + store <4 x float> %3949, ptr addrspace(3) %2966, align 16, !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %3950 = load <4 x i32>, ptr addrspace(3) %2973, align 16, !dbg !61 + %3951 = load <4 x i32>, ptr addrspace(3) %2974, align 16, !dbg !61 + %.extract = extractelement <4 x i32> %3920, i64 0, !dbg !61 + %.extract33 = extractelement <4 x i32> %3920, i64 1, !dbg !61 + %.extract34 = extractelement <4 x i32> %3920, i64 2, !dbg !61 + %.extract35 = extractelement <4 x i32> %3920, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract, i32 %.extract33, i32 %.extract34, i32 %.extract35, ptr addrspace(1) %3897, i1 true) #6, !dbg !61 + %.extract36 = extractelement <4 x i32> %3921, i64 0, !dbg !61 + %.extract37 = extractelement <4 x i32> %3921, i64 1, !dbg !61 + %.extract38 = extractelement <4 x i32> %3921, i64 2, !dbg !61 + %.extract39 = extractelement <4 x i32> %3921, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract36, i32 %.extract37, i32 %.extract38, i32 %.extract39, ptr addrspace(1) %3899, i1 true) #6, !dbg !61 + %.extract40 = extractelement <4 x i32> %3930, i64 0, !dbg !61 + %.extract41 = extractelement <4 x i32> %3930, i64 1, !dbg !61 + %.extract42 = extractelement <4 x i32> %3930, i64 2, !dbg !61 + %.extract43 = extractelement <4 x i32> %3930, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract40, i32 %.extract41, i32 %.extract42, i32 %.extract43, ptr addrspace(1) %3901, i1 true) #6, !dbg !61 + %.extract44 = extractelement <4 x i32> %3931, i64 0, !dbg !61 + %.extract45 = extractelement <4 x i32> %3931, i64 1, !dbg !61 + %.extract46 = extractelement <4 x i32> %3931, i64 2, !dbg !61 + %.extract47 = extractelement <4 x i32> %3931, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract44, i32 %.extract45, i32 %.extract46, i32 %.extract47, ptr addrspace(1) %3903, i1 true) #6, !dbg !61 + %.extract48 = extractelement <4 x i32> %3940, i64 0, !dbg !61 + %.extract49 = extractelement <4 x i32> %3940, i64 1, !dbg !61 + %.extract50 = extractelement <4 x i32> %3940, i64 2, !dbg !61 + %.extract51 = extractelement <4 x i32> %3940, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract48, i32 %.extract49, i32 %.extract50, i32 %.extract51, ptr addrspace(1) %3905, i1 true) #6, !dbg !61 + %.extract52 = extractelement <4 x i32> %3941, i64 0, !dbg !61 + %.extract53 = extractelement <4 x i32> %3941, i64 1, !dbg !61 + %.extract54 = extractelement <4 x i32> %3941, i64 2, !dbg !61 + %.extract55 = extractelement <4 x i32> %3941, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract52, i32 %.extract53, i32 %.extract54, i32 %.extract55, ptr addrspace(1) %3907, i1 true) #6, !dbg !61 + %.extract56 = extractelement <4 x i32> %3950, i64 0, !dbg !61 + %.extract57 = extractelement <4 x i32> %3950, i64 1, !dbg !61 + %.extract58 = extractelement <4 x i32> %3950, i64 2, !dbg !61 + %.extract59 = extractelement <4 x i32> %3950, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract56, i32 %.extract57, i32 %.extract58, i32 %.extract59, ptr addrspace(1) %3909, i1 true) #6, !dbg !61 + %.extract60 = extractelement <4 x i32> %3951, i64 0, !dbg !61 + %.extract61 = extractelement <4 x i32> %3951, i64 1, !dbg !61 + %.extract62 = extractelement <4 x i32> %3951, i64 2, !dbg !61 + %.extract63 = extractelement <4 x i32> %3951, i64 3, !dbg !61 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %.extract60, i32 %.extract61, i32 %.extract62, i32 %.extract63, ptr addrspace(1) %3911, i1 %2982) #6, !dbg !61 + br i1 %2977, label %2976, label %3952, !dbg !50 + +3952: ; preds = %2976 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.ftz.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.saturate.f(float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rm.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", linkageName: "triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 23, column: 28, scope: !5) +!9 = !DILocation(line: 26, column: 37, scope: !5) +!10 = !DILocation(line: 37, column: 47, scope: !5) +!11 = !DILocation(line: 31, column: 40, scope: !5) +!12 = !DILocation(line: 32, column: 31, scope: !5) +!13 = !DILocation(line: 33, column: 29, scope: !5) +!14 = !DILocation(line: 37, column: 41, scope: !5) +!15 = !DILocation(line: 37, column: 34, scope: !5) +!16 = !DILocation(line: 37, column: 52, scope: !5) +!17 = !DILocation(line: 37, column: 105, scope: !5) +!18 = !DILocation(line: 110, column: 15, scope: !19, inlinedAt: !21) +!19 = distinct !DILexicalBlockFile(scope: !5, file: !20, discriminator: 0) +!20 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!21 = !DILocation(line: 42, column: 40, scope: !5) +!22 = !DILocation(line: 112, column: 21, scope: !19, inlinedAt: !21) +!23 = !DILocation(line: 112, column: 16, scope: !19, inlinedAt: !21) +!24 = !DILocation(line: 113, column: 29, scope: !19, inlinedAt: !21) +!25 = !DILocation(line: 199, column: 53, scope: !19, inlinedAt: !21) +!26 = !DILocation(line: 196, column: 19, scope: !19, inlinedAt: !21) +!27 = !DILocation(line: 196, column: 53, scope: !19, inlinedAt: !21) +!28 = !DILocation(line: 173, column: 29, scope: !19, inlinedAt: !21) +!29 = !DILocation(line: 196, column: 39, scope: !19, inlinedAt: !21) +!30 = !DILocation(line: 199, column: 39, scope: !19, inlinedAt: !21) +!31 = !DILocation(line: 205, column: 24, scope: !19, inlinedAt: !21) +!32 = !DILocation(line: 205, column: 36, scope: !19, inlinedAt: !21) +!33 = !DILocation(line: 45, column: 54, scope: !5) +!34 = !DILocation(line: 46, column: 54, scope: !5) +!35 = !DILocation(line: 110, column: 15, scope: !19, inlinedAt: !36) +!36 = !DILocation(line: 49, column: 33, scope: !5) +!37 = !DILocation(line: 112, column: 21, scope: !19, inlinedAt: !36) +!38 = !DILocation(line: 112, column: 16, scope: !19, inlinedAt: !36) +!39 = !DILocation(line: 113, column: 29, scope: !19, inlinedAt: !36) +!40 = !DILocation(line: 123, column: 29, scope: !19, inlinedAt: !36) +!41 = !DILocation(line: 180, column: 40, scope: !19, inlinedAt: !36) +!42 = !DILocation(line: 180, column: 68, scope: !19, inlinedAt: !36) +!43 = !DILocation(line: 180, column: 58, scope: !19, inlinedAt: !36) +!44 = !DILocation(line: 173, column: 29, scope: !19, inlinedAt: !36) +!45 = !DILocation(line: 181, column: 31, scope: !19, inlinedAt: !36) +!46 = !DILocation(line: 291, column: 36, scope: !47, inlinedAt: !36) +!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0) +!48 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!49 = !DILocation(line: 261, column: 15, scope: !47, inlinedAt: !36) +!50 = !DILocation(line: 52, column: 40, scope: !5) +!51 = !DILocation(line: 53, column: 31, scope: !5) +!52 = !DILocation(line: 54, column: 29, scope: !5) +!53 = !DILocation(line: 58, column: 41, scope: !5) +!54 = !DILocation(line: 58, column: 34, scope: !5) +!55 = !DILocation(line: 58, column: 52, scope: !5) +!56 = !DILocation(line: 58, column: 106, scope: !5) +!57 = !DILocation(line: 60, column: 22, scope: !5) +!58 = !DILocation(line: 61, column: 29, scope: !5) +!59 = !DILocation(line: 62, column: 23, scope: !5) +!60 = !DILocation(line: 63, column: 29, scope: !5) +!61 = !DILocation(line: 63, column: 53, scope: !5) +!62 = !DILocation(line: 52, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6b218b9212549a1e8176f0bddb532ddecc10f889 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ptx @@ -0,0 +1,3186 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 // -- Begin function triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0 +.visible .entry triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0( + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_2, + .param .u32 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_3, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_5 +) +.reqntid 512 +{ + .reg .pred %p<208>; + .reg .b16 %rs<65>; + .reg .b32 %r<2200>; + .reg .b64 %rd<274>; + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:18:0 + +// %bb.0: + ld.param.b64 %rd58, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_1]; + ld.param.b64 %rd57, [triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0_param_0]; +$L__tmp0: + .loc 1 23 28 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:23:28 + mov.u32 %r46, %ctaid.x; + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 511; + shl.b32 %r3, %r2, 3; + or.b32 %r4, %r3, 4096; + or.b32 %r5, %r3, 8192; + or.b32 %r47, %r3, 12288; + .loc 1 37 47 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:47 + mul.lo.s32 %r6, %r46, 32000; + mov.b32 %r2198, 0fFF800000; + mov.b64 %rd248, {%r2198, %r2198}; + mov.b32 %r2188, 0f00000000; + mov.b64 %rd247, {%r2188, %r2188}; + .loc 1 31 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:31:40 + cvt.u64.u32 %rd1, %r47; + mov.b64 %rd246, 0; + mov.pred %p4, -1; + mov.pred %p206, %p4; + mov.b32 %r2189, %r2188; + mov.b32 %r2190, %r2188; + mov.b32 %r2191, %r2188; + mov.b32 %r2192, %r2188; + mov.b32 %r2193, %r2188; + mov.b32 %r2194, %r2188; + mov.b32 %r2195, %r2188; + mov.b32 %r2196, %r2188; + mov.b32 %r2197, %r2188; + mov.b32 %r2199, %r2198; + mov.b64 %rd249, %rd247; + mov.b64 %rd250, %rd248; + mov.b64 %rd251, %rd247; + mov.b64 %rd252, %rd248; + mov.b64 %rd253, %rd247; + mov.b64 %rd254, %rd247; + mov.b64 %rd255, %rd247; + mov.b64 %rd256, %rd247; + mov.b64 %rd257, %rd247; + mov.b64 %rd258, %rd247; + mov.b64 %rd259, %rd247; + mov.b64 %rd260, %rd247; + mov.b64 %rd261, %rd248; + mov.b64 %rd262, %rd248; + mov.b64 %rd263, %rd248; + mov.b64 %rd264, %rd248; + mov.b64 %rd265, %rd248; + mov.b64 %rd266, %rd248; + mov.b64 %rd267, %rd248; + mov.b64 %rd268, %rd248; + mov.b64 %rd269, %rd248; + mov.b64 %rd270, %rd248; + mov.b64 %rd271, %rd248; + mov.b64 %rd272, %rd248; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:0:40 + mov.pred %p1, %p206; + .loc 1 32 31 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:32:31 + or.b64 %rd95, %rd246, %rd1; + .loc 1 33 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:33:29 + setp.lt.u64 %p7, %rd95, 32000; + .loc 1 37 41 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:41 + cvt.u32.u64 %r80, %rd246; + or.b32 %r81, %r3, %r80; + add.s32 %r82, %r81, %r6; + or.b32 %r83, %r4, %r80; + add.s32 %r84, %r83, %r6; + or.b32 %r85, %r5, %r80; + add.s32 %r86, %r85, %r6; + cvt.u32.u64 %r87, %rd95; + add.s32 %r88, %r6, %r87; + .loc 1 37 34 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:34 + mad.wide.s32 %rd83, %r82, 2, %rd57; + mad.wide.s32 %rd86, %r84, 2, %rd57; + mad.wide.s32 %rd89, %r86, 2, %rd57; + mad.wide.s32 %rd92, %r88, 2, %rd57; + .loc 1 37 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:52 + // begin inline asm + mov.u64 %rd82, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd82, 1.0; + // end inline asm + mov.b32 %r52, 0; + // begin inline asm + mov.u32 %r48, %r52; + mov.u32 %r49, %r52; + mov.u32 %r50, %r52; + mov.u32 %r51, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd83 + 0 ], %rd82; + // end inline asm + // begin inline asm + mov.u64 %rd85, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd85, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r56, %r52; + mov.u32 %r57, %r52; + mov.u32 %r58, %r52; + mov.u32 %r59, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd86 + 0 ], %rd85; + // end inline asm + // begin inline asm + mov.u64 %rd88, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd88, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r64, %r52; + mov.u32 %r65, %r52; + mov.u32 %r66, %r52; + mov.u32 %r67, %r52; + @%p4 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r64, %r65, %r66, %r67 }, [ %rd89 + 0 ], %rd88; + // end inline asm + // begin inline asm + mov.u64 %rd91, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd91, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r72, %r52; + mov.u32 %r73, %r52; + mov.u32 %r74, %r52; + mov.u32 %r75, %r52; + @%p7 ld.global.L1::evict_last.L2::cache_hint.v4.b32 { %r72, %r73, %r74, %r75 }, [ %rd92 + 0 ], %rd91; + // end inline asm + mov.b32 {%rs1, %rs2}, %r72; + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + cvt.f32.bf16 %r89, %rs1; + cvt.f32.bf16 %r90, %rs2; +$L__tmp1: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p9, %r2198, %r89; + setp.gt.f32 %p10, %r2199, %r90; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r91, %r92}, %rd265; + setp.nan.f32 %p11, %r91, %r91; + setp.nan.f32 %p12, %r92, %r92; + mov.b64 {%r93, %r94}, %rd266; + setp.nan.f32 %p13, %r93, %r93; + setp.nan.f32 %p14, %r94, %r94; + mov.b64 {%r95, %r96}, %rd267; + setp.nan.f32 %p15, %r95, %r95; + setp.nan.f32 %p16, %r96, %r96; + mov.b64 {%r97, %r98}, %rd268; + setp.nan.f32 %p17, %r97, %r97; + setp.nan.f32 %p18, %r98, %r98; + mov.b64 {%r99, %r100}, %rd261; + setp.nan.f32 %p19, %r99, %r99; + setp.nan.f32 %p20, %r100, %r100; + mov.b64 {%r101, %r102}, %rd262; + setp.nan.f32 %p21, %r101, %r101; + setp.nan.f32 %p22, %r102, %r102; + mov.b64 {%r103, %r104}, %rd263; + setp.nan.f32 %p23, %r103, %r103; + setp.nan.f32 %p24, %r104, %r104; + mov.b64 {%r105, %r106}, %rd264; + setp.nan.f32 %p25, %r105, %r105; + setp.nan.f32 %p26, %r106, %r106; + mov.b64 {%r107, %r108}, %rd269; + setp.nan.f32 %p27, %r107, %r107; + setp.nan.f32 %p28, %r108, %r108; + mov.b64 {%r109, %r110}, %rd270; + setp.nan.f32 %p29, %r109, %r109; + setp.nan.f32 %p30, %r110, %r110; + mov.b64 {%r111, %r112}, %rd271; + setp.nan.f32 %p31, %r111, %r111; + setp.nan.f32 %p32, %r112, %r112; + mov.b64 {%r113, %r114}, %rd272; + setp.nan.f32 %p33, %r113, %r113; + setp.nan.f32 %p34, %r114, %r114; + setp.nan.f32 %p35, %r2198, %r2198; + setp.nan.f32 %p36, %r2199, %r2199; + mov.b64 {%r115, %r116}, %rd252; + setp.nan.f32 %p37, %r115, %r115; + setp.nan.f32 %p38, %r116, %r116; + mov.b64 {%r117, %r118}, %rd250; + setp.nan.f32 %p39, %r117, %r117; + setp.nan.f32 %p40, %r118, %r118; + mov.b64 {%r119, %r120}, %rd248; + setp.nan.f32 %p41, %r119, %r119; + setp.nan.f32 %p42, %r120, %r120; +$L__tmp2: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs3, %rs4}, %r67; + cvt.f32.bf16 %r121, %rs4; + cvt.f32.bf16 %r122, %rs3; + mov.b32 {%rs5, %rs6}, %r66; + cvt.f32.bf16 %r123, %rs6; + cvt.f32.bf16 %r124, %rs5; + mov.b32 {%rs7, %rs8}, %r65; + cvt.f32.bf16 %r125, %rs8; + cvt.f32.bf16 %r126, %rs7; + mov.b32 {%rs9, %rs10}, %r64; + cvt.f32.bf16 %r127, %rs10; + cvt.f32.bf16 %r128, %rs9; +$L__tmp3: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p43, %r107, %r128; + setp.gt.f32 %p44, %r108, %r127; + setp.gt.f32 %p45, %r109, %r126; + setp.gt.f32 %p46, %r110, %r125; + setp.gt.f32 %p47, %r111, %r124; + setp.gt.f32 %p48, %r112, %r123; + setp.gt.f32 %p49, %r113, %r122; + setp.gt.f32 %p50, %r114, %r121; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r129, %r114, %r121, %p50; + selp.f32 %r26, %r114, %r129, %p34; + selp.f32 %r130, %r113, %r122, %p49; + selp.f32 %r25, %r113, %r130, %p33; + mov.b64 %rd272, {%r25, %r26}; + selp.f32 %r131, %r112, %r123, %p48; + selp.f32 %r24, %r112, %r131, %p32; + selp.f32 %r132, %r111, %r124, %p47; + selp.f32 %r23, %r111, %r132, %p31; + mov.b64 %rd271, {%r23, %r24}; + selp.f32 %r133, %r110, %r125, %p46; + selp.f32 %r22, %r110, %r133, %p30; + selp.f32 %r134, %r109, %r126, %p45; + selp.f32 %r21, %r109, %r134, %p29; + mov.b64 %rd270, {%r21, %r22}; + selp.f32 %r135, %r108, %r127, %p44; + selp.f32 %r20, %r108, %r135, %p28; + selp.f32 %r136, %r107, %r128, %p43; + selp.f32 %r19, %r107, %r136, %p27; + mov.b64 %rd269, {%r19, %r20}; + selp.f32 %r137, %r2198, %r89, %p9; + selp.f32 %r138, %r2198, %r137, %p35; + selp.f32 %r139, %r2199, %r90, %p10; + selp.f32 %r140, %r2199, %r139, %p36; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p51, %r19, 0fFF800000; + setp.eq.f32 %p52, %r20, 0fFF800000; + setp.eq.f32 %p53, %r21, 0fFF800000; + setp.eq.f32 %p54, %r22, 0fFF800000; + setp.eq.f32 %p55, %r23, 0fFF800000; + setp.eq.f32 %p56, %r24, 0fFF800000; + setp.eq.f32 %p57, %r25, 0fFF800000; + setp.eq.f32 %p58, %r26, 0fFF800000; + setp.eq.f32 %p59, %r138, 0fFF800000; + setp.eq.f32 %p60, %r140, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r141, %r114, %r26; + sub.f32 %r142, %r113, %r25; + sub.f32 %r143, %r112, %r24; + sub.f32 %r144, %r111, %r23; + sub.f32 %r145, %r110, %r22; + sub.f32 %r146, %r109, %r21; + sub.f32 %r147, %r108, %r20; + sub.f32 %r148, %r107, %r19; + sub.f32 %r149, %r2198, %r138; + sub.f32 %r150, %r2199, %r140; + mov.b32 %r151, 0f3F000000; + mov.b32 %r152, 0f3BBB989D; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r153, %r148, %r152, %r151; + cvt.ftz.sat.f32.f32 %r154, %r153; + mov.b32 %r155, 0f4B400001; + mov.b32 %r156, 0f437C0000; + fma.rm.ftz.f32 %r157, %r154, %r156, %r155; + add.f32 %r158, %r157, 0fCB40007F; + neg.f32 %r159, %r158; + mov.b32 %r160, 0f3FB8AA3B; + fma.rn.ftz.f32 %r161, %r148, %r160, %r159; + mov.b32 %r162, 0f32A57060; + fma.rn.ftz.f32 %r163, %r148, %r162, %r161; + shl.b32 %r164, %r157, 23; + ex2.approx.ftz.f32 %r165, %r163; + mul.f32 %r166, %r165, %r164; + fma.rn.ftz.f32 %r167, %r147, %r152, %r151; + cvt.ftz.sat.f32.f32 %r168, %r167; + fma.rm.ftz.f32 %r169, %r168, %r156, %r155; + add.f32 %r170, %r169, 0fCB40007F; + neg.f32 %r171, %r170; + fma.rn.ftz.f32 %r172, %r147, %r160, %r171; + fma.rn.ftz.f32 %r173, %r147, %r162, %r172; + shl.b32 %r174, %r169, 23; + ex2.approx.ftz.f32 %r175, %r173; + mul.f32 %r176, %r175, %r174; + fma.rn.ftz.f32 %r177, %r146, %r152, %r151; + cvt.ftz.sat.f32.f32 %r178, %r177; + fma.rm.ftz.f32 %r179, %r178, %r156, %r155; + add.f32 %r180, %r179, 0fCB40007F; + neg.f32 %r181, %r180; + fma.rn.ftz.f32 %r182, %r146, %r160, %r181; + fma.rn.ftz.f32 %r183, %r146, %r162, %r182; + shl.b32 %r184, %r179, 23; + ex2.approx.ftz.f32 %r185, %r183; + mul.f32 %r186, %r185, %r184; + fma.rn.ftz.f32 %r187, %r145, %r152, %r151; + cvt.ftz.sat.f32.f32 %r188, %r187; + fma.rm.ftz.f32 %r189, %r188, %r156, %r155; + add.f32 %r190, %r189, 0fCB40007F; + neg.f32 %r191, %r190; + fma.rn.ftz.f32 %r192, %r145, %r160, %r191; + fma.rn.ftz.f32 %r193, %r145, %r162, %r192; + shl.b32 %r194, %r189, 23; + ex2.approx.ftz.f32 %r195, %r193; + mul.f32 %r196, %r195, %r194; + fma.rn.ftz.f32 %r197, %r144, %r152, %r151; + cvt.ftz.sat.f32.f32 %r198, %r197; + fma.rm.ftz.f32 %r199, %r198, %r156, %r155; + add.f32 %r200, %r199, 0fCB40007F; + neg.f32 %r201, %r200; + fma.rn.ftz.f32 %r202, %r144, %r160, %r201; + fma.rn.ftz.f32 %r203, %r144, %r162, %r202; + shl.b32 %r204, %r199, 23; + ex2.approx.ftz.f32 %r205, %r203; + mul.f32 %r206, %r205, %r204; + fma.rn.ftz.f32 %r207, %r143, %r152, %r151; + cvt.ftz.sat.f32.f32 %r208, %r207; + fma.rm.ftz.f32 %r209, %r208, %r156, %r155; + add.f32 %r210, %r209, 0fCB40007F; + neg.f32 %r211, %r210; + fma.rn.ftz.f32 %r212, %r143, %r160, %r211; + fma.rn.ftz.f32 %r213, %r143, %r162, %r212; + shl.b32 %r214, %r209, 23; + ex2.approx.ftz.f32 %r215, %r213; + mul.f32 %r216, %r215, %r214; + fma.rn.ftz.f32 %r217, %r142, %r152, %r151; + cvt.ftz.sat.f32.f32 %r218, %r217; + fma.rm.ftz.f32 %r219, %r218, %r156, %r155; + add.f32 %r220, %r219, 0fCB40007F; + neg.f32 %r221, %r220; + fma.rn.ftz.f32 %r222, %r142, %r160, %r221; + fma.rn.ftz.f32 %r223, %r142, %r162, %r222; + shl.b32 %r224, %r219, 23; + ex2.approx.ftz.f32 %r225, %r223; + mul.f32 %r226, %r225, %r224; + fma.rn.ftz.f32 %r227, %r141, %r152, %r151; + cvt.ftz.sat.f32.f32 %r228, %r227; + fma.rm.ftz.f32 %r229, %r228, %r156, %r155; + add.f32 %r230, %r229, 0fCB40007F; + neg.f32 %r231, %r230; + fma.rn.ftz.f32 %r232, %r141, %r160, %r231; + fma.rn.ftz.f32 %r233, %r141, %r162, %r232; + shl.b32 %r234, %r229, 23; + ex2.approx.ftz.f32 %r235, %r233; + mul.f32 %r236, %r235, %r234; + fma.rn.ftz.f32 %r237, %r149, %r152, %r151; + cvt.ftz.sat.f32.f32 %r238, %r237; + fma.rm.ftz.f32 %r239, %r238, %r156, %r155; + add.f32 %r240, %r239, 0fCB40007F; + neg.f32 %r241, %r240; + fma.rn.ftz.f32 %r242, %r149, %r160, %r241; + fma.rn.ftz.f32 %r243, %r149, %r162, %r242; + shl.b32 %r244, %r239, 23; + ex2.approx.ftz.f32 %r245, %r243; + mul.f32 %r246, %r245, %r244; + fma.rn.ftz.f32 %r247, %r150, %r152, %r151; + cvt.ftz.sat.f32.f32 %r248, %r247; + fma.rm.ftz.f32 %r249, %r248, %r156, %r155; + add.f32 %r250, %r249, 0fCB40007F; + neg.f32 %r251, %r250; + fma.rn.ftz.f32 %r252, %r150, %r160, %r251; + fma.rn.ftz.f32 %r253, %r150, %r162, %r252; + shl.b32 %r254, %r249, 23; + ex2.approx.ftz.f32 %r255, %r253; + mul.f32 %r256, %r255, %r254; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r257, 0f3F800000, %r166, %p51; + selp.f32 %r258, 0f3F800000, %r176, %p52; + selp.f32 %r259, 0f3F800000, %r186, %p53; + selp.f32 %r260, 0f3F800000, %r196, %p54; + selp.f32 %r261, 0f3F800000, %r206, %p55; + selp.f32 %r262, 0f3F800000, %r216, %p56; + selp.f32 %r263, 0f3F800000, %r226, %p57; + selp.f32 %r264, 0f3F800000, %r236, %p58; + selp.f32 %r265, 0f3F800000, %r246, %p59; + selp.f32 %r266, 0f3F800000, %r256, %p60; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r267, %r121, %r26; + sub.f32 %r268, %r122, %r25; + sub.f32 %r269, %r123, %r24; + sub.f32 %r270, %r124, %r23; + sub.f32 %r271, %r125, %r22; + sub.f32 %r272, %r126, %r21; + sub.f32 %r273, %r127, %r20; + sub.f32 %r274, %r128, %r19; + sub.f32 %r275, %r89, %r138; + sub.f32 %r276, %r90, %r140; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r277, %r274, %r152, %r151; + cvt.ftz.sat.f32.f32 %r278, %r277; + fma.rm.ftz.f32 %r279, %r278, %r156, %r155; + add.f32 %r280, %r279, 0fCB40007F; + neg.f32 %r281, %r280; + fma.rn.ftz.f32 %r282, %r274, %r160, %r281; + fma.rn.ftz.f32 %r283, %r274, %r162, %r282; + shl.b32 %r284, %r279, 23; + ex2.approx.ftz.f32 %r285, %r283; + mul.f32 %r286, %r285, %r284; + fma.rn.ftz.f32 %r287, %r273, %r152, %r151; + cvt.ftz.sat.f32.f32 %r288, %r287; + fma.rm.ftz.f32 %r289, %r288, %r156, %r155; + add.f32 %r290, %r289, 0fCB40007F; + neg.f32 %r291, %r290; + fma.rn.ftz.f32 %r292, %r273, %r160, %r291; + fma.rn.ftz.f32 %r293, %r273, %r162, %r292; + shl.b32 %r294, %r289, 23; + ex2.approx.ftz.f32 %r295, %r293; + mul.f32 %r296, %r295, %r294; + fma.rn.ftz.f32 %r297, %r272, %r152, %r151; + cvt.ftz.sat.f32.f32 %r298, %r297; + fma.rm.ftz.f32 %r299, %r298, %r156, %r155; + add.f32 %r300, %r299, 0fCB40007F; + neg.f32 %r301, %r300; + fma.rn.ftz.f32 %r302, %r272, %r160, %r301; + fma.rn.ftz.f32 %r303, %r272, %r162, %r302; + shl.b32 %r304, %r299, 23; + ex2.approx.ftz.f32 %r305, %r303; + mul.f32 %r306, %r305, %r304; + fma.rn.ftz.f32 %r307, %r271, %r152, %r151; + cvt.ftz.sat.f32.f32 %r308, %r307; + fma.rm.ftz.f32 %r309, %r308, %r156, %r155; + add.f32 %r310, %r309, 0fCB40007F; + neg.f32 %r311, %r310; + fma.rn.ftz.f32 %r312, %r271, %r160, %r311; + fma.rn.ftz.f32 %r313, %r271, %r162, %r312; + shl.b32 %r314, %r309, 23; + ex2.approx.ftz.f32 %r315, %r313; + mul.f32 %r316, %r315, %r314; + fma.rn.ftz.f32 %r317, %r270, %r152, %r151; + cvt.ftz.sat.f32.f32 %r318, %r317; + fma.rm.ftz.f32 %r319, %r318, %r156, %r155; + add.f32 %r320, %r319, 0fCB40007F; + neg.f32 %r321, %r320; + fma.rn.ftz.f32 %r322, %r270, %r160, %r321; + fma.rn.ftz.f32 %r323, %r270, %r162, %r322; + shl.b32 %r324, %r319, 23; + ex2.approx.ftz.f32 %r325, %r323; + mul.f32 %r326, %r325, %r324; + fma.rn.ftz.f32 %r327, %r269, %r152, %r151; + cvt.ftz.sat.f32.f32 %r328, %r327; + fma.rm.ftz.f32 %r329, %r328, %r156, %r155; + add.f32 %r330, %r329, 0fCB40007F; + neg.f32 %r331, %r330; + fma.rn.ftz.f32 %r332, %r269, %r160, %r331; + fma.rn.ftz.f32 %r333, %r269, %r162, %r332; + shl.b32 %r334, %r329, 23; + ex2.approx.ftz.f32 %r335, %r333; + mul.f32 %r336, %r335, %r334; + fma.rn.ftz.f32 %r337, %r268, %r152, %r151; + cvt.ftz.sat.f32.f32 %r338, %r337; + fma.rm.ftz.f32 %r339, %r338, %r156, %r155; + add.f32 %r340, %r339, 0fCB40007F; + neg.f32 %r341, %r340; + fma.rn.ftz.f32 %r342, %r268, %r160, %r341; + fma.rn.ftz.f32 %r343, %r268, %r162, %r342; + shl.b32 %r344, %r339, 23; + ex2.approx.ftz.f32 %r345, %r343; + mul.f32 %r346, %r345, %r344; + fma.rn.ftz.f32 %r347, %r267, %r152, %r151; + cvt.ftz.sat.f32.f32 %r348, %r347; + fma.rm.ftz.f32 %r349, %r348, %r156, %r155; + add.f32 %r350, %r349, 0fCB40007F; + neg.f32 %r351, %r350; + fma.rn.ftz.f32 %r352, %r267, %r160, %r351; + fma.rn.ftz.f32 %r353, %r267, %r162, %r352; + shl.b32 %r354, %r349, 23; + ex2.approx.ftz.f32 %r355, %r353; + mul.f32 %r356, %r355, %r354; + fma.rn.ftz.f32 %r357, %r275, %r152, %r151; + cvt.ftz.sat.f32.f32 %r358, %r357; + fma.rm.ftz.f32 %r359, %r358, %r156, %r155; + add.f32 %r360, %r359, 0fCB40007F; + neg.f32 %r361, %r360; + fma.rn.ftz.f32 %r362, %r275, %r160, %r361; + fma.rn.ftz.f32 %r363, %r275, %r162, %r362; + shl.b32 %r364, %r359, 23; + ex2.approx.ftz.f32 %r365, %r363; + mul.f32 %r366, %r365, %r364; + fma.rn.ftz.f32 %r367, %r276, %r152, %r151; + cvt.ftz.sat.f32.f32 %r368, %r367; + fma.rm.ftz.f32 %r369, %r368, %r156, %r155; + add.f32 %r370, %r369, 0fCB40007F; + neg.f32 %r371, %r370; + fma.rn.ftz.f32 %r372, %r276, %r160, %r371; + fma.rn.ftz.f32 %r373, %r276, %r162, %r372; + shl.b32 %r374, %r369, 23; + ex2.approx.ftz.f32 %r375, %r373; + mul.f32 %r376, %r375, %r374; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r377, 0f3F800000, %r286, %p51; + selp.f32 %r378, 0f3F800000, %r296, %p52; + selp.f32 %r379, 0f3F800000, %r306, %p53; + selp.f32 %r380, 0f3F800000, %r316, %p54; + selp.f32 %r381, 0f3F800000, %r326, %p55; + selp.f32 %r382, 0f3F800000, %r336, %p56; + selp.f32 %r383, 0f3F800000, %r346, %p57; + selp.f32 %r384, 0f3F800000, %r356, %p58; + selp.f32 %r385, 0f3F800000, %r366, %p59; + selp.f32 %r386, 0f3F800000, %r376, %p60; +$L__tmp4: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs11, %rs12}, %r51; + cvt.f32.bf16 %r387, %rs12; + cvt.f32.bf16 %r388, %rs11; + mov.b32 {%rs13, %rs14}, %r50; + cvt.f32.bf16 %r389, %rs14; + cvt.f32.bf16 %r390, %rs13; + mov.b32 {%rs15, %rs16}, %r49; + cvt.f32.bf16 %r391, %rs16; + cvt.f32.bf16 %r392, %rs15; + mov.b32 {%rs17, %rs18}, %r48; + cvt.f32.bf16 %r393, %rs18; + cvt.f32.bf16 %r394, %rs17; + mov.b32 {%rs19, %rs20}, %r59; + cvt.f32.bf16 %r395, %rs20; + cvt.f32.bf16 %r396, %rs19; + mov.b32 {%rs21, %rs22}, %r58; + cvt.f32.bf16 %r397, %rs22; + cvt.f32.bf16 %r398, %rs21; + mov.b32 {%rs23, %rs24}, %r57; + cvt.f32.bf16 %r399, %rs24; + cvt.f32.bf16 %r400, %rs23; + mov.b32 {%rs25, %rs26}, %r56; + cvt.f32.bf16 %r401, %rs26; + cvt.f32.bf16 %r402, %rs25; +$L__tmp5: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p61, %r91, %r402; + setp.gt.f32 %p62, %r92, %r401; + setp.gt.f32 %p63, %r93, %r400; + setp.gt.f32 %p64, %r94, %r399; + setp.gt.f32 %p65, %r95, %r398; + setp.gt.f32 %p66, %r96, %r397; + setp.gt.f32 %p67, %r97, %r396; + setp.gt.f32 %p68, %r98, %r395; + setp.gt.f32 %p69, %r99, %r394; + setp.gt.f32 %p70, %r100, %r393; + setp.gt.f32 %p71, %r101, %r392; + setp.gt.f32 %p72, %r102, %r391; + setp.gt.f32 %p73, %r103, %r390; + setp.gt.f32 %p74, %r104, %r389; + setp.gt.f32 %p75, %r105, %r388; + setp.gt.f32 %p76, %r106, %r387; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r403, %r106, %r387, %p76; + selp.f32 %r404, %r106, %r403, %p26; + selp.f32 %r405, %r105, %r388, %p75; + selp.f32 %r406, %r105, %r405, %p25; + mov.b64 %rd264, {%r406, %r404}; + selp.f32 %r407, %r104, %r389, %p74; + selp.f32 %r408, %r104, %r407, %p24; + selp.f32 %r409, %r103, %r390, %p73; + selp.f32 %r410, %r103, %r409, %p23; + mov.b64 %rd263, {%r410, %r408}; + selp.f32 %r411, %r102, %r391, %p72; + selp.f32 %r412, %r102, %r411, %p22; + selp.f32 %r413, %r101, %r392, %p71; + selp.f32 %r414, %r101, %r413, %p21; + mov.b64 %rd262, {%r414, %r412}; + selp.f32 %r415, %r100, %r393, %p70; + selp.f32 %r416, %r100, %r415, %p20; + selp.f32 %r417, %r99, %r394, %p69; + selp.f32 %r418, %r99, %r417, %p19; + mov.b64 %rd261, {%r418, %r416}; + selp.f32 %r419, %r98, %r395, %p68; + selp.f32 %r420, %r98, %r419, %p18; + selp.f32 %r421, %r97, %r396, %p67; + selp.f32 %r422, %r97, %r421, %p17; + mov.b64 %rd268, {%r422, %r420}; + selp.f32 %r423, %r96, %r397, %p66; + selp.f32 %r424, %r96, %r423, %p16; + selp.f32 %r425, %r95, %r398, %p65; + selp.f32 %r426, %r95, %r425, %p15; + mov.b64 %rd267, {%r426, %r424}; + selp.f32 %r427, %r94, %r399, %p64; + selp.f32 %r428, %r94, %r427, %p14; + selp.f32 %r429, %r93, %r400, %p63; + selp.f32 %r430, %r93, %r429, %p13; + mov.b64 %rd266, {%r430, %r428}; + selp.f32 %r431, %r92, %r401, %p62; + selp.f32 %r432, %r92, %r431, %p12; + selp.f32 %r433, %r91, %r402, %p61; + selp.f32 %r434, %r91, %r433, %p11; + mov.b64 %rd265, {%r434, %r432}; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p77, %r404, 0fFF800000; + setp.eq.f32 %p78, %r406, 0fFF800000; + setp.eq.f32 %p79, %r408, 0fFF800000; + setp.eq.f32 %p80, %r410, 0fFF800000; + setp.eq.f32 %p81, %r412, 0fFF800000; + setp.eq.f32 %p82, %r414, 0fFF800000; + setp.eq.f32 %p83, %r416, 0fFF800000; + setp.eq.f32 %p84, %r418, 0fFF800000; + setp.eq.f32 %p85, %r420, 0fFF800000; + setp.eq.f32 %p86, %r422, 0fFF800000; + setp.eq.f32 %p87, %r424, 0fFF800000; + setp.eq.f32 %p88, %r426, 0fFF800000; + setp.eq.f32 %p89, %r428, 0fFF800000; + setp.eq.f32 %p90, %r430, 0fFF800000; + setp.eq.f32 %p91, %r432, 0fFF800000; + setp.eq.f32 %p92, %r434, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r435, %r98, %r420; + sub.f32 %r436, %r97, %r422; + sub.f32 %r437, %r96, %r424; + sub.f32 %r438, %r95, %r426; + sub.f32 %r439, %r94, %r428; + sub.f32 %r440, %r93, %r430; + sub.f32 %r441, %r92, %r432; + sub.f32 %r442, %r91, %r434; + sub.f32 %r443, %r106, %r404; + sub.f32 %r444, %r105, %r406; + sub.f32 %r445, %r104, %r408; + sub.f32 %r446, %r103, %r410; + sub.f32 %r447, %r102, %r412; + sub.f32 %r448, %r101, %r414; + sub.f32 %r449, %r100, %r416; + sub.f32 %r450, %r99, %r418; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r451, %r450, %r152, %r151; + cvt.ftz.sat.f32.f32 %r452, %r451; + fma.rm.ftz.f32 %r453, %r452, %r156, %r155; + fma.rn.ftz.f32 %r454, %r449, %r152, %r151; + cvt.ftz.sat.f32.f32 %r455, %r454; + fma.rm.ftz.f32 %r456, %r455, %r156, %r155; + fma.rn.ftz.f32 %r457, %r448, %r152, %r151; + cvt.ftz.sat.f32.f32 %r458, %r457; + fma.rm.ftz.f32 %r459, %r458, %r156, %r155; + fma.rn.ftz.f32 %r460, %r447, %r152, %r151; + cvt.ftz.sat.f32.f32 %r461, %r460; + fma.rm.ftz.f32 %r462, %r461, %r156, %r155; + fma.rn.ftz.f32 %r463, %r446, %r152, %r151; + cvt.ftz.sat.f32.f32 %r464, %r463; + fma.rm.ftz.f32 %r465, %r464, %r156, %r155; + fma.rn.ftz.f32 %r466, %r445, %r152, %r151; + cvt.ftz.sat.f32.f32 %r467, %r466; + fma.rm.ftz.f32 %r468, %r467, %r156, %r155; + fma.rn.ftz.f32 %r469, %r444, %r152, %r151; + cvt.ftz.sat.f32.f32 %r470, %r469; + fma.rm.ftz.f32 %r471, %r470, %r156, %r155; + fma.rn.ftz.f32 %r472, %r443, %r152, %r151; + cvt.ftz.sat.f32.f32 %r473, %r472; + fma.rm.ftz.f32 %r474, %r473, %r156, %r155; + fma.rn.ftz.f32 %r475, %r442, %r152, %r151; + cvt.ftz.sat.f32.f32 %r476, %r475; + fma.rm.ftz.f32 %r477, %r476, %r156, %r155; + fma.rn.ftz.f32 %r478, %r441, %r152, %r151; + cvt.ftz.sat.f32.f32 %r479, %r478; + fma.rm.ftz.f32 %r480, %r479, %r156, %r155; + fma.rn.ftz.f32 %r481, %r440, %r152, %r151; + cvt.ftz.sat.f32.f32 %r482, %r481; + fma.rm.ftz.f32 %r483, %r482, %r156, %r155; + fma.rn.ftz.f32 %r484, %r439, %r152, %r151; + cvt.ftz.sat.f32.f32 %r485, %r484; + fma.rm.ftz.f32 %r486, %r485, %r156, %r155; + fma.rn.ftz.f32 %r487, %r438, %r152, %r151; + cvt.ftz.sat.f32.f32 %r488, %r487; + fma.rm.ftz.f32 %r489, %r488, %r156, %r155; + fma.rn.ftz.f32 %r490, %r437, %r152, %r151; + cvt.ftz.sat.f32.f32 %r491, %r490; + fma.rm.ftz.f32 %r492, %r491, %r156, %r155; + fma.rn.ftz.f32 %r493, %r436, %r152, %r151; + cvt.ftz.sat.f32.f32 %r494, %r493; + fma.rm.ftz.f32 %r495, %r494, %r156, %r155; + fma.rn.ftz.f32 %r496, %r435, %r152, %r151; + cvt.ftz.sat.f32.f32 %r497, %r496; + fma.rm.ftz.f32 %r498, %r497, %r156, %r155; + mov.b64 %rd96, {%r471, %r474}; + cvt.u32.u64 %r499, %rd96; + mov.b64 %rd97, {%r465, %r468}; + cvt.u32.u64 %r500, %rd97; + mov.b64 %rd98, {%r459, %r462}; + cvt.u32.u64 %r501, %rd98; + mov.b64 %rd99, {%r453, %r456}; + cvt.u32.u64 %r502, %rd99; + mov.b64 %rd100, {%r495, %r498}; + cvt.u32.u64 %r503, %rd100; + mov.b64 %rd101, {%r489, %r492}; + cvt.u32.u64 %r504, %rd101; + mov.b64 %rd102, {%r483, %r486}; + cvt.u32.u64 %r505, %rd102; + mov.b64 %rd103, {%r477, %r480}; + cvt.u32.u64 %r506, %rd103; + add.f32 %r507, %r453, 0fCB40007F; + neg.f32 %r508, %r507; + fma.rn.ftz.f32 %r509, %r450, %r160, %r508; + fma.rn.ftz.f32 %r510, %r450, %r162, %r509; + ex2.approx.ftz.f32 %r511, %r510; + add.f32 %r512, %r456, 0fCB40007F; + neg.f32 %r513, %r512; + fma.rn.ftz.f32 %r514, %r449, %r160, %r513; + fma.rn.ftz.f32 %r515, %r449, %r162, %r514; + ex2.approx.ftz.f32 %r516, %r515; + add.f32 %r517, %r459, 0fCB40007F; + neg.f32 %r518, %r517; + fma.rn.ftz.f32 %r519, %r448, %r160, %r518; + fma.rn.ftz.f32 %r520, %r448, %r162, %r519; + ex2.approx.ftz.f32 %r521, %r520; + add.f32 %r522, %r462, 0fCB40007F; + neg.f32 %r523, %r522; + fma.rn.ftz.f32 %r524, %r447, %r160, %r523; + fma.rn.ftz.f32 %r525, %r447, %r162, %r524; + ex2.approx.ftz.f32 %r526, %r525; + add.f32 %r527, %r465, 0fCB40007F; + neg.f32 %r528, %r527; + fma.rn.ftz.f32 %r529, %r446, %r160, %r528; + fma.rn.ftz.f32 %r530, %r446, %r162, %r529; + ex2.approx.ftz.f32 %r531, %r530; + add.f32 %r532, %r468, 0fCB40007F; + neg.f32 %r533, %r532; + fma.rn.ftz.f32 %r534, %r445, %r160, %r533; + fma.rn.ftz.f32 %r535, %r445, %r162, %r534; + ex2.approx.ftz.f32 %r536, %r535; + add.f32 %r537, %r471, 0fCB40007F; + neg.f32 %r538, %r537; + fma.rn.ftz.f32 %r539, %r444, %r160, %r538; + fma.rn.ftz.f32 %r540, %r444, %r162, %r539; + ex2.approx.ftz.f32 %r541, %r540; + add.f32 %r542, %r474, 0fCB40007F; + neg.f32 %r543, %r542; + fma.rn.ftz.f32 %r544, %r443, %r160, %r543; + fma.rn.ftz.f32 %r545, %r443, %r162, %r544; + ex2.approx.ftz.f32 %r546, %r545; + add.f32 %r547, %r477, 0fCB40007F; + neg.f32 %r548, %r547; + fma.rn.ftz.f32 %r549, %r442, %r160, %r548; + fma.rn.ftz.f32 %r550, %r442, %r162, %r549; + ex2.approx.ftz.f32 %r551, %r550; + add.f32 %r552, %r480, 0fCB40007F; + neg.f32 %r553, %r552; + fma.rn.ftz.f32 %r554, %r441, %r160, %r553; + fma.rn.ftz.f32 %r555, %r441, %r162, %r554; + ex2.approx.ftz.f32 %r556, %r555; + add.f32 %r557, %r483, 0fCB40007F; + neg.f32 %r558, %r557; + fma.rn.ftz.f32 %r559, %r440, %r160, %r558; + fma.rn.ftz.f32 %r560, %r440, %r162, %r559; + ex2.approx.ftz.f32 %r561, %r560; + add.f32 %r562, %r486, 0fCB40007F; + neg.f32 %r563, %r562; + fma.rn.ftz.f32 %r564, %r439, %r160, %r563; + fma.rn.ftz.f32 %r565, %r439, %r162, %r564; + ex2.approx.ftz.f32 %r566, %r565; + add.f32 %r567, %r489, 0fCB40007F; + neg.f32 %r568, %r567; + fma.rn.ftz.f32 %r569, %r438, %r160, %r568; + fma.rn.ftz.f32 %r570, %r438, %r162, %r569; + ex2.approx.ftz.f32 %r571, %r570; + add.f32 %r572, %r492, 0fCB40007F; + neg.f32 %r573, %r572; + fma.rn.ftz.f32 %r574, %r437, %r160, %r573; + fma.rn.ftz.f32 %r575, %r437, %r162, %r574; + ex2.approx.ftz.f32 %r576, %r575; + add.f32 %r577, %r495, 0fCB40007F; + neg.f32 %r578, %r577; + fma.rn.ftz.f32 %r579, %r436, %r160, %r578; + fma.rn.ftz.f32 %r580, %r436, %r162, %r579; + ex2.approx.ftz.f32 %r581, %r580; + add.f32 %r582, %r498, 0fCB40007F; + neg.f32 %r583, %r582; + fma.rn.ftz.f32 %r584, %r435, %r160, %r583; + fma.rn.ftz.f32 %r585, %r435, %r162, %r584; + shl.b64 %rd104, %rd103, 23; + and.b64 %rd105, %rd104, -36028797018963968; + shl.b32 %r586, %r506, 23; + cvt.u64.u32 %rd106, %r586; + or.b64 %rd107, %rd106, %rd105; + shl.b64 %rd108, %rd102, 23; + and.b64 %rd109, %rd108, -36028797018963968; + shl.b32 %r587, %r505, 23; + cvt.u64.u32 %rd110, %r587; + or.b64 %rd111, %rd110, %rd109; + shl.b64 %rd112, %rd101, 23; + and.b64 %rd113, %rd112, -36028797018963968; + shl.b32 %r588, %r504, 23; + cvt.u64.u32 %rd114, %r588; + or.b64 %rd115, %rd114, %rd113; + shl.b64 %rd116, %rd100, 23; + and.b64 %rd117, %rd116, -36028797018963968; + shl.b32 %r589, %r503, 23; + cvt.u64.u32 %rd118, %r589; + or.b64 %rd119, %rd118, %rd117; + shl.b64 %rd120, %rd99, 23; + and.b64 %rd121, %rd120, -36028797018963968; + shl.b32 %r590, %r502, 23; + cvt.u64.u32 %rd122, %r590; + or.b64 %rd123, %rd122, %rd121; + shl.b64 %rd124, %rd98, 23; + and.b64 %rd125, %rd124, -36028797018963968; + shl.b32 %r591, %r501, 23; + cvt.u64.u32 %rd126, %r591; + or.b64 %rd127, %rd126, %rd125; + shl.b64 %rd128, %rd97, 23; + and.b64 %rd129, %rd128, -36028797018963968; + shl.b32 %r592, %r500, 23; + cvt.u64.u32 %rd130, %r592; + or.b64 %rd131, %rd130, %rd129; + shl.b64 %rd132, %rd96, 23; + and.b64 %rd133, %rd132, -36028797018963968; + shl.b32 %r593, %r499, 23; + cvt.u64.u32 %rd134, %r593; + or.b64 %rd135, %rd134, %rd133; + ex2.approx.ftz.f32 %r594, %r585; + mov.b64 {%r595, %r596}, %rd135; + mul.f32 %r597, %r546, %r596; + mul.f32 %r598, %r541, %r595; + mov.b64 {%r599, %r600}, %rd131; + mul.f32 %r601, %r536, %r600; + mul.f32 %r602, %r531, %r599; + mov.b64 {%r603, %r604}, %rd127; + mul.f32 %r605, %r526, %r604; + mul.f32 %r606, %r521, %r603; + mov.b64 {%r607, %r608}, %rd123; + mul.f32 %r609, %r516, %r608; + mul.f32 %r610, %r511, %r607; + mov.b64 {%r611, %r612}, %rd119; + mul.f32 %r613, %r594, %r612; + mul.f32 %r614, %r581, %r611; + mov.b64 {%r615, %r616}, %rd115; + mul.f32 %r617, %r576, %r616; + mul.f32 %r618, %r571, %r615; + mov.b64 {%r619, %r620}, %rd111; + mul.f32 %r621, %r566, %r620; + mul.f32 %r622, %r561, %r619; + mov.b64 {%r623, %r624}, %rd107; + mul.f32 %r625, %r556, %r624; + mul.f32 %r626, %r551, %r623; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r627, 0f3F800000, %r626, %p92; + selp.f32 %r628, 0f3F800000, %r625, %p91; + selp.f32 %r629, 0f3F800000, %r622, %p90; + selp.f32 %r630, 0f3F800000, %r621, %p89; + selp.f32 %r631, 0f3F800000, %r618, %p88; + selp.f32 %r632, 0f3F800000, %r617, %p87; + selp.f32 %r633, 0f3F800000, %r614, %p86; + selp.f32 %r634, 0f3F800000, %r613, %p85; + selp.f32 %r635, 0f3F800000, %r610, %p84; + selp.f32 %r636, 0f3F800000, %r609, %p83; + selp.f32 %r637, 0f3F800000, %r606, %p82; + selp.f32 %r638, 0f3F800000, %r605, %p81; + selp.f32 %r639, 0f3F800000, %r602, %p80; + selp.f32 %r640, 0f3F800000, %r601, %p79; + selp.f32 %r641, 0f3F800000, %r598, %p78; + selp.f32 %r642, 0f3F800000, %r597, %p77; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r643, %r395, %r420; + sub.f32 %r644, %r396, %r422; + sub.f32 %r645, %r397, %r424; + sub.f32 %r646, %r398, %r426; + sub.f32 %r647, %r399, %r428; + sub.f32 %r648, %r400, %r430; + sub.f32 %r649, %r401, %r432; + sub.f32 %r650, %r402, %r434; + sub.f32 %r651, %r387, %r404; + sub.f32 %r652, %r388, %r406; + sub.f32 %r653, %r389, %r408; + sub.f32 %r654, %r390, %r410; + sub.f32 %r655, %r391, %r412; + sub.f32 %r656, %r392, %r414; + sub.f32 %r657, %r393, %r416; + sub.f32 %r658, %r394, %r418; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r659, %r658, %r152, %r151; + cvt.ftz.sat.f32.f32 %r660, %r659; + fma.rm.ftz.f32 %r661, %r660, %r156, %r155; + fma.rn.ftz.f32 %r662, %r657, %r152, %r151; + cvt.ftz.sat.f32.f32 %r663, %r662; + fma.rm.ftz.f32 %r664, %r663, %r156, %r155; + fma.rn.ftz.f32 %r665, %r656, %r152, %r151; + cvt.ftz.sat.f32.f32 %r666, %r665; + fma.rm.ftz.f32 %r667, %r666, %r156, %r155; + fma.rn.ftz.f32 %r668, %r655, %r152, %r151; + cvt.ftz.sat.f32.f32 %r669, %r668; + fma.rm.ftz.f32 %r670, %r669, %r156, %r155; + fma.rn.ftz.f32 %r671, %r654, %r152, %r151; + cvt.ftz.sat.f32.f32 %r672, %r671; + fma.rm.ftz.f32 %r673, %r672, %r156, %r155; + fma.rn.ftz.f32 %r674, %r653, %r152, %r151; + cvt.ftz.sat.f32.f32 %r675, %r674; + fma.rm.ftz.f32 %r676, %r675, %r156, %r155; + fma.rn.ftz.f32 %r677, %r652, %r152, %r151; + cvt.ftz.sat.f32.f32 %r678, %r677; + fma.rm.ftz.f32 %r679, %r678, %r156, %r155; + fma.rn.ftz.f32 %r680, %r651, %r152, %r151; + cvt.ftz.sat.f32.f32 %r681, %r680; + fma.rm.ftz.f32 %r682, %r681, %r156, %r155; + fma.rn.ftz.f32 %r683, %r650, %r152, %r151; + cvt.ftz.sat.f32.f32 %r684, %r683; + fma.rm.ftz.f32 %r685, %r684, %r156, %r155; + fma.rn.ftz.f32 %r686, %r649, %r152, %r151; + cvt.ftz.sat.f32.f32 %r687, %r686; + fma.rm.ftz.f32 %r688, %r687, %r156, %r155; + fma.rn.ftz.f32 %r689, %r648, %r152, %r151; + cvt.ftz.sat.f32.f32 %r690, %r689; + fma.rm.ftz.f32 %r691, %r690, %r156, %r155; + fma.rn.ftz.f32 %r692, %r647, %r152, %r151; + cvt.ftz.sat.f32.f32 %r693, %r692; + fma.rm.ftz.f32 %r694, %r693, %r156, %r155; + fma.rn.ftz.f32 %r695, %r646, %r152, %r151; + cvt.ftz.sat.f32.f32 %r696, %r695; + fma.rm.ftz.f32 %r697, %r696, %r156, %r155; + fma.rn.ftz.f32 %r698, %r645, %r152, %r151; + cvt.ftz.sat.f32.f32 %r699, %r698; + fma.rm.ftz.f32 %r700, %r699, %r156, %r155; + fma.rn.ftz.f32 %r701, %r644, %r152, %r151; + cvt.ftz.sat.f32.f32 %r702, %r701; + fma.rm.ftz.f32 %r703, %r702, %r156, %r155; + fma.rn.ftz.f32 %r704, %r643, %r152, %r151; + cvt.ftz.sat.f32.f32 %r705, %r704; + fma.rm.ftz.f32 %r706, %r705, %r156, %r155; + mov.b64 %rd136, {%r679, %r682}; + cvt.u32.u64 %r707, %rd136; + mov.b64 %rd137, {%r673, %r676}; + cvt.u32.u64 %r708, %rd137; + mov.b64 %rd138, {%r667, %r670}; + cvt.u32.u64 %r709, %rd138; + mov.b64 %rd139, {%r661, %r664}; + cvt.u32.u64 %r710, %rd139; + mov.b64 %rd140, {%r703, %r706}; + cvt.u32.u64 %r711, %rd140; + mov.b64 %rd141, {%r697, %r700}; + cvt.u32.u64 %r712, %rd141; + mov.b64 %rd142, {%r691, %r694}; + cvt.u32.u64 %r713, %rd142; + mov.b64 %rd143, {%r685, %r688}; + cvt.u32.u64 %r714, %rd143; + add.f32 %r715, %r661, 0fCB40007F; + neg.f32 %r716, %r715; + fma.rn.ftz.f32 %r717, %r658, %r160, %r716; + fma.rn.ftz.f32 %r718, %r658, %r162, %r717; + ex2.approx.ftz.f32 %r719, %r718; + add.f32 %r720, %r664, 0fCB40007F; + neg.f32 %r721, %r720; + fma.rn.ftz.f32 %r722, %r657, %r160, %r721; + fma.rn.ftz.f32 %r723, %r657, %r162, %r722; + ex2.approx.ftz.f32 %r724, %r723; + add.f32 %r725, %r667, 0fCB40007F; + neg.f32 %r726, %r725; + fma.rn.ftz.f32 %r727, %r656, %r160, %r726; + fma.rn.ftz.f32 %r728, %r656, %r162, %r727; + ex2.approx.ftz.f32 %r729, %r728; + add.f32 %r730, %r670, 0fCB40007F; + neg.f32 %r731, %r730; + fma.rn.ftz.f32 %r732, %r655, %r160, %r731; + fma.rn.ftz.f32 %r733, %r655, %r162, %r732; + ex2.approx.ftz.f32 %r734, %r733; + add.f32 %r735, %r673, 0fCB40007F; + neg.f32 %r736, %r735; + fma.rn.ftz.f32 %r737, %r654, %r160, %r736; + fma.rn.ftz.f32 %r738, %r654, %r162, %r737; + ex2.approx.ftz.f32 %r739, %r738; + add.f32 %r740, %r676, 0fCB40007F; + neg.f32 %r741, %r740; + fma.rn.ftz.f32 %r742, %r653, %r160, %r741; + fma.rn.ftz.f32 %r743, %r653, %r162, %r742; + ex2.approx.ftz.f32 %r744, %r743; + add.f32 %r745, %r679, 0fCB40007F; + neg.f32 %r746, %r745; + fma.rn.ftz.f32 %r747, %r652, %r160, %r746; + fma.rn.ftz.f32 %r748, %r652, %r162, %r747; + ex2.approx.ftz.f32 %r749, %r748; + add.f32 %r750, %r682, 0fCB40007F; + neg.f32 %r751, %r750; + fma.rn.ftz.f32 %r752, %r651, %r160, %r751; + fma.rn.ftz.f32 %r753, %r651, %r162, %r752; + ex2.approx.ftz.f32 %r754, %r753; + add.f32 %r755, %r685, 0fCB40007F; + neg.f32 %r756, %r755; + fma.rn.ftz.f32 %r757, %r650, %r160, %r756; + fma.rn.ftz.f32 %r758, %r650, %r162, %r757; + ex2.approx.ftz.f32 %r759, %r758; + add.f32 %r760, %r688, 0fCB40007F; + neg.f32 %r761, %r760; + fma.rn.ftz.f32 %r762, %r649, %r160, %r761; + fma.rn.ftz.f32 %r763, %r649, %r162, %r762; + ex2.approx.ftz.f32 %r764, %r763; + add.f32 %r765, %r691, 0fCB40007F; + neg.f32 %r766, %r765; + fma.rn.ftz.f32 %r767, %r648, %r160, %r766; + fma.rn.ftz.f32 %r768, %r648, %r162, %r767; + ex2.approx.ftz.f32 %r769, %r768; + add.f32 %r770, %r694, 0fCB40007F; + neg.f32 %r771, %r770; + fma.rn.ftz.f32 %r772, %r647, %r160, %r771; + fma.rn.ftz.f32 %r773, %r647, %r162, %r772; + ex2.approx.ftz.f32 %r774, %r773; + add.f32 %r775, %r697, 0fCB40007F; + neg.f32 %r776, %r775; + fma.rn.ftz.f32 %r777, %r646, %r160, %r776; + fma.rn.ftz.f32 %r778, %r646, %r162, %r777; + ex2.approx.ftz.f32 %r779, %r778; + add.f32 %r780, %r700, 0fCB40007F; + neg.f32 %r781, %r780; + fma.rn.ftz.f32 %r782, %r645, %r160, %r781; + fma.rn.ftz.f32 %r783, %r645, %r162, %r782; + ex2.approx.ftz.f32 %r784, %r783; + add.f32 %r785, %r703, 0fCB40007F; + neg.f32 %r786, %r785; + fma.rn.ftz.f32 %r787, %r644, %r160, %r786; + fma.rn.ftz.f32 %r788, %r644, %r162, %r787; + ex2.approx.ftz.f32 %r789, %r788; + add.f32 %r790, %r706, 0fCB40007F; + neg.f32 %r791, %r790; + fma.rn.ftz.f32 %r792, %r643, %r160, %r791; + fma.rn.ftz.f32 %r793, %r643, %r162, %r792; + shl.b64 %rd144, %rd143, 23; + and.b64 %rd145, %rd144, -36028797018963968; + shl.b32 %r794, %r714, 23; + cvt.u64.u32 %rd146, %r794; + or.b64 %rd147, %rd146, %rd145; + shl.b64 %rd148, %rd142, 23; + and.b64 %rd149, %rd148, -36028797018963968; + shl.b32 %r795, %r713, 23; + cvt.u64.u32 %rd150, %r795; + or.b64 %rd151, %rd150, %rd149; + shl.b64 %rd152, %rd141, 23; + and.b64 %rd153, %rd152, -36028797018963968; + shl.b32 %r796, %r712, 23; + cvt.u64.u32 %rd154, %r796; + or.b64 %rd155, %rd154, %rd153; + shl.b64 %rd156, %rd140, 23; + and.b64 %rd157, %rd156, -36028797018963968; + shl.b32 %r797, %r711, 23; + cvt.u64.u32 %rd158, %r797; + or.b64 %rd159, %rd158, %rd157; + shl.b64 %rd160, %rd139, 23; + and.b64 %rd161, %rd160, -36028797018963968; + shl.b32 %r798, %r710, 23; + cvt.u64.u32 %rd162, %r798; + or.b64 %rd163, %rd162, %rd161; + shl.b64 %rd164, %rd138, 23; + and.b64 %rd165, %rd164, -36028797018963968; + shl.b32 %r799, %r709, 23; + cvt.u64.u32 %rd166, %r799; + or.b64 %rd167, %rd166, %rd165; + shl.b64 %rd168, %rd137, 23; + and.b64 %rd169, %rd168, -36028797018963968; + shl.b32 %r800, %r708, 23; + cvt.u64.u32 %rd170, %r800; + or.b64 %rd171, %rd170, %rd169; + shl.b64 %rd172, %rd136, 23; + and.b64 %rd173, %rd172, -36028797018963968; + shl.b32 %r801, %r707, 23; + cvt.u64.u32 %rd174, %r801; + or.b64 %rd175, %rd174, %rd173; + ex2.approx.ftz.f32 %r802, %r793; + mov.b64 {%r803, %r804}, %rd175; + mul.f32 %r805, %r754, %r804; + mul.f32 %r806, %r749, %r803; + mov.b64 {%r807, %r808}, %rd171; + mul.f32 %r809, %r744, %r808; + mul.f32 %r810, %r739, %r807; + mov.b64 {%r811, %r812}, %rd167; + mul.f32 %r813, %r734, %r812; + mul.f32 %r814, %r729, %r811; + mov.b64 {%r815, %r816}, %rd163; + mul.f32 %r817, %r724, %r816; + mul.f32 %r818, %r719, %r815; + mov.b64 {%r819, %r820}, %rd159; + mul.f32 %r821, %r802, %r820; + mul.f32 %r822, %r789, %r819; + mov.b64 {%r823, %r824}, %rd155; + mul.f32 %r825, %r784, %r824; + mul.f32 %r826, %r779, %r823; + mov.b64 {%r827, %r828}, %rd151; + mul.f32 %r829, %r774, %r828; + mul.f32 %r830, %r769, %r827; + mov.b64 {%r831, %r832}, %rd147; + mul.f32 %r833, %r764, %r832; + mul.f32 %r834, %r759, %r831; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r835, 0f3F800000, %r834, %p92; + selp.f32 %r836, 0f3F800000, %r833, %p91; + selp.f32 %r837, 0f3F800000, %r830, %p90; + selp.f32 %r838, 0f3F800000, %r829, %p89; + selp.f32 %r839, 0f3F800000, %r826, %p88; + selp.f32 %r840, 0f3F800000, %r825, %p87; + selp.f32 %r841, 0f3F800000, %r822, %p86; + selp.f32 %r842, 0f3F800000, %r821, %p85; + selp.f32 %r843, 0f3F800000, %r818, %p84; + selp.f32 %r844, 0f3F800000, %r817, %p83; + selp.f32 %r845, 0f3F800000, %r814, %p82; + selp.f32 %r846, 0f3F800000, %r813, %p81; + selp.f32 %r847, 0f3F800000, %r810, %p80; + selp.f32 %r848, 0f3F800000, %r809, %p79; + selp.f32 %r849, 0f3F800000, %r806, %p78; + selp.f32 %r850, 0f3F800000, %r805, %p77; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r851, %r852}, %rd256; + fma.rn.f32 %r853, %r852, %r642, %r850; + fma.rn.f32 %r854, %r851, %r641, %r849; + mov.b64 %rd256, {%r854, %r853}; + mov.b64 {%r855, %r856}, %rd255; + fma.rn.f32 %r857, %r856, %r640, %r848; + fma.rn.f32 %r858, %r855, %r639, %r847; + mov.b64 %rd255, {%r858, %r857}; + mov.b64 {%r859, %r860}, %rd254; + fma.rn.f32 %r861, %r860, %r638, %r846; + fma.rn.f32 %r862, %r859, %r637, %r845; + mov.b64 %rd254, {%r862, %r861}; + mov.b64 {%r863, %r864}, %rd253; + fma.rn.f32 %r865, %r864, %r636, %r844; + fma.rn.f32 %r866, %r863, %r635, %r843; + mov.b64 %rd253, {%r866, %r865}; + mov.b64 {%r867, %r868}, %rd260; + fma.rn.f32 %r869, %r868, %r634, %r842; + fma.rn.f32 %r870, %r867, %r633, %r841; + mov.b64 %rd260, {%r870, %r869}; + mov.b64 {%r871, %r872}, %rd259; + fma.rn.f32 %r873, %r872, %r632, %r840; + fma.rn.f32 %r874, %r871, %r631, %r839; + mov.b64 %rd259, {%r874, %r873}; + mov.b64 {%r875, %r876}, %rd258; + fma.rn.f32 %r877, %r876, %r630, %r838; + fma.rn.f32 %r878, %r875, %r629, %r837; + mov.b64 %rd258, {%r878, %r877}; + mov.b64 {%r879, %r880}, %rd257; + fma.rn.f32 %r881, %r880, %r628, %r836; + fma.rn.f32 %r882, %r879, %r627, %r835; + mov.b64 %rd257, {%r882, %r881}; + fma.rn.f32 %r2188, %r2188, %r257, %r377; + fma.rn.f32 %r2189, %r2189, %r258, %r378; + fma.rn.f32 %r2190, %r2190, %r259, %r379; + fma.rn.f32 %r2191, %r2191, %r260, %r380; + fma.rn.f32 %r2192, %r2192, %r261, %r381; + fma.rn.f32 %r2193, %r2193, %r262, %r382; + fma.rn.f32 %r2194, %r2194, %r263, %r383; + fma.rn.f32 %r2195, %r2195, %r264, %r384; + fma.rn.f32 %r883, %r2196, %r265, %r385; + fma.rn.f32 %r884, %r2197, %r266, %r386; +$L__tmp6: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs27, %rs28}, %r73; + cvt.f32.bf16 %r885, %rs28; + cvt.f32.bf16 %r886, %rs27; +$L__tmp7: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p93, %r115, %r886; + setp.gt.f32 %p94, %r116, %r885; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r887, %r116, %r885, %p94; + selp.f32 %r888, %r116, %r887, %p38; + selp.f32 %r889, %r115, %r886, %p93; + selp.f32 %r890, %r115, %r889, %p37; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p95, %r890, 0fFF800000; + setp.eq.f32 %p96, %r888, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r891, %r116, %r888; + sub.f32 %r892, %r115, %r890; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r893, %r892, %r152, %r151; + cvt.ftz.sat.f32.f32 %r894, %r893; + fma.rm.ftz.f32 %r895, %r894, %r156, %r155; + fma.rn.ftz.f32 %r896, %r891, %r152, %r151; + cvt.ftz.sat.f32.f32 %r897, %r896; + fma.rm.ftz.f32 %r898, %r897, %r156, %r155; + mov.b64 %rd176, {%r895, %r898}; + cvt.u32.u64 %r899, %rd176; + add.f32 %r900, %r895, 0fCB40007F; + neg.f32 %r901, %r900; + fma.rn.ftz.f32 %r902, %r892, %r160, %r901; + fma.rn.ftz.f32 %r903, %r892, %r162, %r902; + ex2.approx.ftz.f32 %r904, %r903; + add.f32 %r905, %r898, 0fCB40007F; + neg.f32 %r906, %r905; + fma.rn.ftz.f32 %r907, %r891, %r160, %r906; + fma.rn.ftz.f32 %r908, %r891, %r162, %r907; + shl.b64 %rd177, %rd176, 23; + and.b64 %rd178, %rd177, -36028797018963968; + shl.b32 %r909, %r899, 23; + cvt.u64.u32 %rd179, %r909; + or.b64 %rd180, %rd179, %rd178; + ex2.approx.ftz.f32 %r910, %r908; + mov.b64 {%r911, %r912}, %rd180; + mul.f32 %r913, %r904, %r911; + mul.f32 %r914, %r910, %r912; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r915, 0f3F800000, %r914, %p96; + selp.f32 %r916, 0f3F800000, %r913, %p95; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r917, %r885, %r888; + sub.f32 %r918, %r886, %r890; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r919, %r918, %r152, %r151; + cvt.ftz.sat.f32.f32 %r920, %r919; + fma.rm.ftz.f32 %r921, %r920, %r156, %r155; + fma.rn.ftz.f32 %r922, %r917, %r152, %r151; + cvt.ftz.sat.f32.f32 %r923, %r922; + fma.rm.ftz.f32 %r924, %r923, %r156, %r155; + mov.b64 %rd181, {%r921, %r924}; + cvt.u32.u64 %r925, %rd181; + add.f32 %r926, %r921, 0fCB40007F; + neg.f32 %r927, %r926; + fma.rn.ftz.f32 %r928, %r918, %r160, %r927; + fma.rn.ftz.f32 %r929, %r918, %r162, %r928; + ex2.approx.ftz.f32 %r930, %r929; + add.f32 %r931, %r924, 0fCB40007F; + neg.f32 %r932, %r931; + fma.rn.ftz.f32 %r933, %r917, %r160, %r932; + fma.rn.ftz.f32 %r934, %r917, %r162, %r933; + shl.b64 %rd182, %rd181, 23; + and.b64 %rd183, %rd182, -36028797018963968; + shl.b32 %r935, %r925, 23; + cvt.u64.u32 %rd184, %r935; + or.b64 %rd185, %rd184, %rd183; + ex2.approx.ftz.f32 %r936, %r934; + mov.b64 {%r937, %r938}, %rd185; + mul.f32 %r939, %r930, %r937; + mul.f32 %r940, %r936, %r938; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r941, 0f3F800000, %r940, %p96; + selp.f32 %r942, 0f3F800000, %r939, %p95; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r943, %r944}, %rd251; + fma.rn.f32 %r945, %r943, %r916, %r942; + fma.rn.f32 %r946, %r944, %r915, %r941; +$L__tmp8: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs29, %rs30}, %r74; + cvt.f32.bf16 %r947, %rs30; + cvt.f32.bf16 %r948, %rs29; +$L__tmp9: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p97, %r117, %r948; + setp.gt.f32 %p98, %r118, %r947; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r949, %r118, %r947, %p98; + selp.f32 %r950, %r118, %r949, %p40; + selp.f32 %r951, %r117, %r948, %p97; + selp.f32 %r952, %r117, %r951, %p39; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p99, %r952, 0fFF800000; + setp.eq.f32 %p100, %r950, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r953, %r118, %r950; + sub.f32 %r954, %r117, %r952; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r955, %r954, %r152, %r151; + cvt.ftz.sat.f32.f32 %r956, %r955; + fma.rm.ftz.f32 %r957, %r956, %r156, %r155; + fma.rn.ftz.f32 %r958, %r953, %r152, %r151; + cvt.ftz.sat.f32.f32 %r959, %r958; + fma.rm.ftz.f32 %r960, %r959, %r156, %r155; + mov.b64 %rd186, {%r957, %r960}; + cvt.u32.u64 %r961, %rd186; + add.f32 %r962, %r957, 0fCB40007F; + neg.f32 %r963, %r962; + fma.rn.ftz.f32 %r964, %r954, %r160, %r963; + fma.rn.ftz.f32 %r965, %r954, %r162, %r964; + ex2.approx.ftz.f32 %r966, %r965; + add.f32 %r967, %r960, 0fCB40007F; + neg.f32 %r968, %r967; + fma.rn.ftz.f32 %r969, %r953, %r160, %r968; + fma.rn.ftz.f32 %r970, %r953, %r162, %r969; + shl.b64 %rd187, %rd186, 23; + and.b64 %rd188, %rd187, -36028797018963968; + shl.b32 %r971, %r961, 23; + cvt.u64.u32 %rd189, %r971; + or.b64 %rd190, %rd189, %rd188; + ex2.approx.ftz.f32 %r972, %r970; + mov.b64 {%r973, %r974}, %rd190; + mul.f32 %r975, %r966, %r973; + mul.f32 %r976, %r972, %r974; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r977, 0f3F800000, %r976, %p100; + selp.f32 %r978, 0f3F800000, %r975, %p99; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r979, %r947, %r950; + sub.f32 %r980, %r948, %r952; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r981, %r980, %r152, %r151; + cvt.ftz.sat.f32.f32 %r982, %r981; + fma.rm.ftz.f32 %r983, %r982, %r156, %r155; + fma.rn.ftz.f32 %r984, %r979, %r152, %r151; + cvt.ftz.sat.f32.f32 %r985, %r984; + fma.rm.ftz.f32 %r986, %r985, %r156, %r155; + mov.b64 %rd191, {%r983, %r986}; + cvt.u32.u64 %r987, %rd191; + add.f32 %r988, %r983, 0fCB40007F; + neg.f32 %r989, %r988; + fma.rn.ftz.f32 %r990, %r980, %r160, %r989; + fma.rn.ftz.f32 %r991, %r980, %r162, %r990; + ex2.approx.ftz.f32 %r992, %r991; + add.f32 %r993, %r986, 0fCB40007F; + neg.f32 %r994, %r993; + fma.rn.ftz.f32 %r995, %r979, %r160, %r994; + fma.rn.ftz.f32 %r996, %r979, %r162, %r995; + shl.b64 %rd192, %rd191, 23; + and.b64 %rd193, %rd192, -36028797018963968; + shl.b32 %r997, %r987, 23; + cvt.u64.u32 %rd194, %r997; + or.b64 %rd195, %rd194, %rd193; + ex2.approx.ftz.f32 %r998, %r996; + mov.b64 {%r999, %r1000}, %rd195; + mul.f32 %r1001, %r992, %r999; + mul.f32 %r1002, %r998, %r1000; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r1003, 0f3F800000, %r1002, %p100; + selp.f32 %r1004, 0f3F800000, %r1001, %p99; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r1005, %r1006}, %rd249; + fma.rn.f32 %r1007, %r1005, %r978, %r1004; + fma.rn.f32 %r1008, %r1006, %r977, %r1003; +$L__tmp10: + .loc 1 37 105 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:37:105 + mov.b32 {%rs31, %rs32}, %r75; + cvt.f32.bf16 %r1009, %rs32; + cvt.f32.bf16 %r1010, %rs31; +$L__tmp11: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.gt.f32 %p101, %r119, %r1010; + setp.gt.f32 %p102, %r120, %r1009; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r1011, %r120, %r1009, %p102; + selp.f32 %r1012, %r120, %r1011, %p42; + selp.f32 %r1013, %r119, %r1010, %p101; + selp.f32 %r1014, %r119, %r1013, %p41; + .loc 2 196 19 // triton_helpers.py:196:19 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + setp.eq.f32 %p103, %r1014, 0fFF800000; + setp.eq.f32 %p104, %r1012, 0fFF800000; + .loc 2 196 53 // triton_helpers.py:196:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r1015, %r120, %r1012; + sub.f32 %r1016, %r119, %r1014; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r1017, %r1016, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1018, %r1017; + fma.rm.ftz.f32 %r1019, %r1018, %r156, %r155; + fma.rn.ftz.f32 %r1020, %r1015, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1021, %r1020; + fma.rm.ftz.f32 %r1022, %r1021, %r156, %r155; + mov.b64 %rd196, {%r1019, %r1022}; + cvt.u32.u64 %r1023, %rd196; + add.f32 %r1024, %r1019, 0fCB40007F; + neg.f32 %r1025, %r1024; + fma.rn.ftz.f32 %r1026, %r1016, %r160, %r1025; + fma.rn.ftz.f32 %r1027, %r1016, %r162, %r1026; + ex2.approx.ftz.f32 %r1028, %r1027; + add.f32 %r1029, %r1022, 0fCB40007F; + neg.f32 %r1030, %r1029; + fma.rn.ftz.f32 %r1031, %r1015, %r160, %r1030; + fma.rn.ftz.f32 %r1032, %r1015, %r162, %r1031; + shl.b64 %rd197, %rd196, 23; + and.b64 %rd198, %rd197, -36028797018963968; + shl.b32 %r1033, %r1023, 23; + cvt.u64.u32 %rd199, %r1033; + or.b64 %rd200, %rd199, %rd198; + ex2.approx.ftz.f32 %r1034, %r1032; + mov.b64 {%r1035, %r1036}, %rd200; + mul.f32 %r1037, %r1028, %r1035; + mul.f32 %r1038, %r1034, %r1036; + .loc 2 196 39 // triton_helpers.py:196:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r1039, 0f3F800000, %r1038, %p104; + selp.f32 %r1040, 0f3F800000, %r1037, %p103; + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + sub.f32 %r1041, %r1009, %r1012; + sub.f32 %r1042, %r1010, %r1014; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + fma.rn.ftz.f32 %r1043, %r1042, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1044, %r1043; + fma.rm.ftz.f32 %r1045, %r1044, %r156, %r155; + fma.rn.ftz.f32 %r1046, %r1041, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1047, %r1046; + fma.rm.ftz.f32 %r1048, %r1047, %r156, %r155; + mov.b64 %rd201, {%r1045, %r1048}; + cvt.u32.u64 %r1049, %rd201; + add.f32 %r1050, %r1045, 0fCB40007F; + neg.f32 %r1051, %r1050; + fma.rn.ftz.f32 %r1052, %r1042, %r160, %r1051; + fma.rn.ftz.f32 %r1053, %r1042, %r162, %r1052; + ex2.approx.ftz.f32 %r1054, %r1053; + add.f32 %r1055, %r1048, 0fCB40007F; + neg.f32 %r1056, %r1055; + fma.rn.ftz.f32 %r1057, %r1041, %r160, %r1056; + fma.rn.ftz.f32 %r1058, %r1041, %r162, %r1057; + shl.b64 %rd202, %rd201, 23; + and.b64 %rd203, %rd202, -36028797018963968; + shl.b32 %r1059, %r1049, 23; + cvt.u64.u32 %rd204, %r1059; + or.b64 %rd205, %rd204, %rd203; + ex2.approx.ftz.f32 %r1060, %r1058; + mov.b64 {%r1061, %r1062}, %rd205; + mul.f32 %r1063, %r1054, %r1061; + mul.f32 %r1064, %r1060, %r1062; + .loc 2 199 39 // triton_helpers.py:199:39 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + selp.f32 %r1065, 0f3F800000, %r1064, %p104; + selp.f32 %r1066, 0f3F800000, %r1063, %p103; + .loc 2 205 36 // triton_helpers.py:205:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r1067, %r1068}, %rd247; + fma.rn.f32 %r1069, %r1067, %r1040, %r1066; + fma.rn.f32 %r1070, %r1068, %r1039, %r1065; +$L__tmp12: + .loc 1 45 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:45:54 + selp.f32 %r2198, %r138, %r2198, %p7; + selp.f32 %r2199, %r140, %r2199, %p7; + selp.f32 %r1071, %r888, %r116, %p7; + selp.f32 %r1072, %r890, %r115, %p7; + mov.b64 %rd252, {%r1072, %r1071}; + selp.f32 %r1073, %r950, %r118, %p7; + selp.f32 %r1074, %r952, %r117, %p7; + mov.b64 %rd250, {%r1074, %r1073}; + selp.f32 %r1075, %r1012, %r120, %p7; + selp.f32 %r1076, %r1014, %r119, %p7; + mov.b64 %rd248, {%r1076, %r1075}; + .loc 1 46 54 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:46:54 + selp.f32 %r2196, %r883, %r2196, %p7; + selp.f32 %r2197, %r884, %r2197, %p7; + selp.f32 %r1077, %r946, %r944, %p7; + selp.f32 %r1078, %r945, %r943, %p7; + mov.b64 %rd251, {%r1078, %r1077}; + selp.f32 %r1079, %r1008, %r1006, %p7; + selp.f32 %r1080, %r1007, %r1005, %p7; + mov.b64 %rd249, {%r1080, %r1079}; + selp.f32 %r1081, %r1070, %r1068, %p7; + selp.f32 %r1082, %r1069, %r1067, %p7; + mov.b64 %rd247, {%r1082, %r1081}; + mov.b64 %rd246, 16384; + mov.pred %p206, 0; + .loc 1 31 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:31:40 + @%p1 bra $L__BB0_1; +// %bb.2: +$L__tmp13: + .loc 2 199 53 // triton_helpers.py:199:53 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:42:40 ] + mov.b64 {%r1095, %r1096}, %rd268; + mov.b64 {%r1097, %r1098}, %rd267; + mov.b64 {%r1099, %r1100}, %rd266; + mov.b64 {%r1101, %r1102}, %rd265; + mov.b64 {%r1103, %r1104}, %rd264; + mov.b64 {%r1105, %r1106}, %rd263; + mov.b64 {%r1107, %r1108}, %rd262; + mov.b64 {%r1109, %r1110}, %rd261; +$L__tmp14: + .loc 1 26 37 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:26:37 + and.b32 %r1111, %r1, 31; +$L__tmp15: + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p112, %r1109, %r1110; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p113, %r1109, %r1109; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1112, %r1109, %r1110, %p113; + selp.f32 %r1113, %r1109, %r1112, %p112; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p114, %r1113, %r1107; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p115, %r1113, %r1113; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1114, %r1113, %r1107, %p115; + selp.f32 %r1115, %r1113, %r1114, %p114; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p116, %r1115, %r1108; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p117, %r1115, %r1115; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1116, %r1115, %r1108, %p117; + selp.f32 %r1117, %r1115, %r1116, %p116; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p118, %r1117, %r1105; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p119, %r1117, %r1117; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1118, %r1117, %r1105, %p119; + selp.f32 %r1119, %r1117, %r1118, %p118; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p120, %r1119, %r1106; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p121, %r1119, %r1119; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1120, %r1119, %r1106, %p121; + selp.f32 %r1121, %r1119, %r1120, %p120; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p122, %r1121, %r1103; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p123, %r1121, %r1121; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1122, %r1121, %r1103, %p123; + selp.f32 %r1123, %r1121, %r1122, %p122; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p124, %r1123, %r1104; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p125, %r1123, %r1123; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1124, %r1123, %r1104, %p125; + selp.f32 %r1125, %r1123, %r1124, %p124; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p126, %r1125, %r1101; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p127, %r1125, %r1125; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1126, %r1125, %r1101, %p127; + selp.f32 %r1127, %r1125, %r1126, %p126; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p128, %r1127, %r1102; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p129, %r1127, %r1127; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1128, %r1127, %r1102, %p129; + selp.f32 %r1129, %r1127, %r1128, %p128; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p130, %r1129, %r1099; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p131, %r1129, %r1129; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1130, %r1129, %r1099, %p131; + selp.f32 %r1131, %r1129, %r1130, %p130; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p132, %r1131, %r1100; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p133, %r1131, %r1131; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1132, %r1131, %r1100, %p133; + selp.f32 %r1133, %r1131, %r1132, %p132; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p134, %r1133, %r1097; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p135, %r1133, %r1133; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1134, %r1133, %r1097, %p135; + selp.f32 %r1135, %r1133, %r1134, %p134; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p136, %r1135, %r1098; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p137, %r1135, %r1135; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1136, %r1135, %r1098, %p137; + selp.f32 %r1137, %r1135, %r1136, %p136; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p138, %r1137, %r1095; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p139, %r1137, %r1137; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1138, %r1137, %r1095, %p139; + selp.f32 %r1139, %r1137, %r1138, %p138; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p140, %r1139, %r1096; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p141, %r1139, %r1139; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1140, %r1139, %r1096, %p141; + selp.f32 %r1141, %r1139, %r1140, %p140; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p142, %r1141, %r19; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p143, %r1141, %r1141; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1142, %r1141, %r19, %p143; + selp.f32 %r1143, %r1141, %r1142, %p142; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p144, %r1143, %r20; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p145, %r1143, %r1143; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1144, %r1143, %r20, %p145; + selp.f32 %r1145, %r1143, %r1144, %p144; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p146, %r1145, %r21; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p147, %r1145, %r1145; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1146, %r1145, %r21, %p147; + selp.f32 %r1147, %r1145, %r1146, %p146; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p148, %r1147, %r22; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p149, %r1147, %r1147; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1148, %r1147, %r22, %p149; + selp.f32 %r1149, %r1147, %r1148, %p148; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p150, %r1149, %r23; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p151, %r1149, %r1149; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1150, %r1149, %r23, %p151; + selp.f32 %r1151, %r1149, %r1150, %p150; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p152, %r1151, %r24; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p153, %r1151, %r1151; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1152, %r1151, %r24, %p153; + selp.f32 %r1153, %r1151, %r1152, %p152; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p154, %r1153, %r25; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p155, %r1153, %r1153; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1154, %r1153, %r25, %p155; + selp.f32 %r1155, %r1153, %r1154, %p154; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p156, %r1155, %r26; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p157, %r1155, %r1155; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1156, %r1155, %r26, %p157; + selp.f32 %r1157, %r1155, %r1156, %p156; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p158, %r1157, %r2198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p159, %r1157, %r1157; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1158, %r1157, %r2198, %p159; + selp.f32 %r1159, %r1157, %r1158, %p158; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p160, %r1159, %r2199; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p161, %r1159, %r1159; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1160, %r1159, %r2199, %p161; + selp.f32 %r1161, %r1159, %r1160, %p160; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1162, %r1163}, %rd252; + setp.gt.f32 %p162, %r1161, %r1162; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p163, %r1161, %r1161; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1164, %r1161, %r1162, %p163; + selp.f32 %r1165, %r1161, %r1164, %p162; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p164, %r1165, %r1163; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p165, %r1165, %r1165; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1166, %r1165, %r1163, %p165; + selp.f32 %r1167, %r1165, %r1166, %p164; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1168, %r1169}, %rd250; + setp.gt.f32 %p166, %r1167, %r1168; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p167, %r1167, %r1167; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1170, %r1167, %r1168, %p167; + selp.f32 %r1171, %r1167, %r1170, %p166; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p168, %r1171, %r1169; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p169, %r1171, %r1171; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1172, %r1171, %r1169, %p169; + selp.f32 %r1173, %r1171, %r1172, %p168; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1174, %r1175}, %rd248; + setp.gt.f32 %p170, %r1173, %r1174; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p171, %r1173, %r1173; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1176, %r1173, %r1174, %p171; + selp.f32 %r1177, %r1173, %r1176, %p170; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p172, %r1177, %r1175; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p173, %r1177, %r1177; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1178, %r1177, %r1175, %p173; + selp.f32 %r1179, %r1177, %r1178, %p172; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1180, %r1179, 16, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p174, %r1179, %r1180; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p175, %r1179, %r1179; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1181, %r1179, %r1180, %p174; + selp.f32 %r1182, %r1179, %r1181, %p175; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1183, %r1182, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p176, %r1182, %r1183; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p177, %r1182, %r1182; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1184, %r1182, %r1183, %p177; + selp.f32 %r1185, %r1182, %r1184, %p176; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1186, %r1185, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p178, %r1185, %r1186; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p179, %r1185, %r1185; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1187, %r1185, %r1186, %p179; + selp.f32 %r1188, %r1185, %r1187, %p178; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1189, %r1188, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p180, %r1188, %r1189; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p181, %r1188, %r1188; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1190, %r1188, %r1189, %p181; + selp.f32 %r1191, %r1188, %r1190, %p180; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1192, %r1191, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p182, %r1191, %r1192; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p183, %r1191, %r1191; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p105, %r1111, 0; + shr.u32 %r1193, %r1, 3; + and.b32 %r1194, %r1193, 60; + mov.b32 %r1195, global_smem; + add.s32 %r1083, %r1195, %r1194; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r1196, %r1191, %r1192, %p183; + selp.b32 %r1084, %r1191, %r1196, %p182; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p105 st.shared.b32 [ %r1083 + 0 ], %r1084; + // end inline asm + bar.sync 0; + setp.lt.u32 %p106, %r1, 16; + shl.b32 %r1197, %r1, 2; + add.s32 %r1086, %r1195, %r1197; + // begin inline asm + @%p106 ld.shared.b32 %r1085, [ %r1086 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1198, %r1085, 8, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p184, %r1085, %r1198; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p185, %r1085, %r1085; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1199, %r1085, %r1198, %p184; + selp.f32 %r1200, %r1085, %r1199, %p185; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1201, %r1200, 4, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p186, %r1200, %r1201; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p187, %r1200, %r1200; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1202, %r1200, %r1201, %p187; + selp.f32 %r1203, %r1200, %r1202, %p186; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1204, %r1203, 2, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p188, %r1203, %r1204; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p189, %r1203, %r1203; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1205, %r1203, %r1204, %p189; + selp.f32 %r1206, %r1203, %r1205, %p188; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1207, %r1206, 1, 31, -1; + .loc 2 110 15 // triton_helpers.py:110:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.gt.f32 %p190, %r1206, %r1207; + .loc 2 112 21 // triton_helpers.py:112:21 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.nan.f32 %p191, %r1206, %r1206; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.b32 %p107, %r1, 0; + .loc 2 113 29 // triton_helpers.py:113:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.b32 %r1208, %r1206, %r1207, %p191; + selp.b32 %r1088, %r1206, %r1208, %p190; + .loc 2 123 29 // triton_helpers.py:123:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p107 st.shared.b32 [ %r1086 + 0 ], %r1088; + // end inline asm + bar.sync 0; + ld.shared.b32 %r39, [global_smem]; + .loc 2 180 40 // triton_helpers.py:180:40 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + setp.eq.f32 %p192, %r39, 0fFF800000; + .loc 2 180 68 // triton_helpers.py:180:68 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + sub.f32 %r1209, %r1109, %r39; + sub.f32 %r1210, %r1110, %r39; + sub.f32 %r1211, %r1107, %r39; + sub.f32 %r1212, %r1108, %r39; + sub.f32 %r1213, %r1105, %r39; + sub.f32 %r1214, %r1106, %r39; + sub.f32 %r1215, %r1103, %r39; + sub.f32 %r1216, %r1104, %r39; + sub.f32 %r1217, %r1101, %r39; + sub.f32 %r1218, %r1102, %r39; + sub.f32 %r1219, %r1099, %r39; + sub.f32 %r1220, %r1100, %r39; + sub.f32 %r1221, %r1097, %r39; + sub.f32 %r1222, %r1098, %r39; + sub.f32 %r1223, %r1095, %r39; + sub.f32 %r1224, %r1096, %r39; + sub.f32 %r1225, %r19, %r39; + sub.f32 %r1226, %r20, %r39; + sub.f32 %r1227, %r21, %r39; + sub.f32 %r1228, %r22, %r39; + sub.f32 %r1229, %r23, %r39; + sub.f32 %r1230, %r24, %r39; + sub.f32 %r1231, %r25, %r39; + sub.f32 %r1232, %r26, %r39; + sub.f32 %r1233, %r2198, %r39; + sub.f32 %r1234, %r2199, %r39; + sub.f32 %r1235, %r1162, %r39; + sub.f32 %r1236, %r1163, %r39; + sub.f32 %r1237, %r1168, %r39; + sub.f32 %r1238, %r1169, %r39; + sub.f32 %r1239, %r1174, %r39; + sub.f32 %r1240, %r1175, %r39; + .loc 2 180 58 // triton_helpers.py:180:58 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + selp.f32 %r1241, 0f00000000, %r1209, %p192; + selp.f32 %r1242, 0f00000000, %r1210, %p192; + selp.f32 %r1243, 0f00000000, %r1211, %p192; + selp.f32 %r1244, 0f00000000, %r1212, %p192; + selp.f32 %r1245, 0f00000000, %r1213, %p192; + selp.f32 %r1246, 0f00000000, %r1214, %p192; + selp.f32 %r1247, 0f00000000, %r1215, %p192; + selp.f32 %r1248, 0f00000000, %r1216, %p192; + selp.f32 %r1249, 0f00000000, %r1217, %p192; + selp.f32 %r1250, 0f00000000, %r1218, %p192; + selp.f32 %r1251, 0f00000000, %r1219, %p192; + selp.f32 %r1252, 0f00000000, %r1220, %p192; + selp.f32 %r1253, 0f00000000, %r1221, %p192; + selp.f32 %r1254, 0f00000000, %r1222, %p192; + selp.f32 %r1255, 0f00000000, %r1223, %p192; + selp.f32 %r1256, 0f00000000, %r1224, %p192; + selp.f32 %r1257, 0f00000000, %r1225, %p192; + selp.f32 %r1258, 0f00000000, %r1226, %p192; + selp.f32 %r1259, 0f00000000, %r1227, %p192; + selp.f32 %r1260, 0f00000000, %r1228, %p192; + selp.f32 %r1261, 0f00000000, %r1229, %p192; + selp.f32 %r1262, 0f00000000, %r1230, %p192; + selp.f32 %r1263, 0f00000000, %r1231, %p192; + selp.f32 %r1264, 0f00000000, %r1232, %p192; + selp.f32 %r1265, 0f00000000, %r1233, %p192; + selp.f32 %r1266, 0f00000000, %r1234, %p192; + selp.f32 %r1267, 0f00000000, %r1235, %p192; + selp.f32 %r1268, 0f00000000, %r1236, %p192; + selp.f32 %r1269, 0f00000000, %r1237, %p192; + selp.f32 %r1270, 0f00000000, %r1238, %p192; + selp.f32 %r1271, 0f00000000, %r1239, %p192; + selp.f32 %r1272, 0f00000000, %r1240, %p192; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.ftz.f32 %r1275, %r1241, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1276, %r1275; + fma.rm.ftz.f32 %r1279, %r1276, %r156, %r155; + add.f32 %r1280, %r1279, 0fCB40007F; + neg.f32 %r1281, %r1280; + fma.rn.ftz.f32 %r1283, %r1241, %r160, %r1281; + fma.rn.ftz.f32 %r1285, %r1241, %r162, %r1283; + shl.b32 %r1286, %r1279, 23; + ex2.approx.ftz.f32 %r1287, %r1285; + mul.f32 %r1288, %r1287, %r1286; + fma.rn.ftz.f32 %r1289, %r1242, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1290, %r1289; + fma.rm.ftz.f32 %r1291, %r1290, %r156, %r155; + add.f32 %r1292, %r1291, 0fCB40007F; + neg.f32 %r1293, %r1292; + fma.rn.ftz.f32 %r1294, %r1242, %r160, %r1293; + fma.rn.ftz.f32 %r1295, %r1242, %r162, %r1294; + shl.b32 %r1296, %r1291, 23; + ex2.approx.ftz.f32 %r1297, %r1295; + mul.f32 %r1298, %r1297, %r1296; + fma.rn.ftz.f32 %r1299, %r1243, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1300, %r1299; + fma.rm.ftz.f32 %r1301, %r1300, %r156, %r155; + add.f32 %r1302, %r1301, 0fCB40007F; + neg.f32 %r1303, %r1302; + fma.rn.ftz.f32 %r1304, %r1243, %r160, %r1303; + fma.rn.ftz.f32 %r1305, %r1243, %r162, %r1304; + shl.b32 %r1306, %r1301, 23; + ex2.approx.ftz.f32 %r1307, %r1305; + mul.f32 %r1308, %r1307, %r1306; + fma.rn.ftz.f32 %r1309, %r1244, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1310, %r1309; + fma.rm.ftz.f32 %r1311, %r1310, %r156, %r155; + add.f32 %r1312, %r1311, 0fCB40007F; + neg.f32 %r1313, %r1312; + fma.rn.ftz.f32 %r1314, %r1244, %r160, %r1313; + fma.rn.ftz.f32 %r1315, %r1244, %r162, %r1314; + shl.b32 %r1316, %r1311, 23; + ex2.approx.ftz.f32 %r1317, %r1315; + mul.f32 %r1318, %r1317, %r1316; + fma.rn.ftz.f32 %r1319, %r1245, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1320, %r1319; + fma.rm.ftz.f32 %r1321, %r1320, %r156, %r155; + add.f32 %r1322, %r1321, 0fCB40007F; + neg.f32 %r1323, %r1322; + fma.rn.ftz.f32 %r1324, %r1245, %r160, %r1323; + fma.rn.ftz.f32 %r1325, %r1245, %r162, %r1324; + shl.b32 %r1326, %r1321, 23; + ex2.approx.ftz.f32 %r1327, %r1325; + mul.f32 %r1328, %r1327, %r1326; + fma.rn.ftz.f32 %r1329, %r1246, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1330, %r1329; + fma.rm.ftz.f32 %r1331, %r1330, %r156, %r155; + add.f32 %r1332, %r1331, 0fCB40007F; + neg.f32 %r1333, %r1332; + fma.rn.ftz.f32 %r1334, %r1246, %r160, %r1333; + fma.rn.ftz.f32 %r1335, %r1246, %r162, %r1334; + shl.b32 %r1336, %r1331, 23; + ex2.approx.ftz.f32 %r1337, %r1335; + mul.f32 %r1338, %r1337, %r1336; + fma.rn.ftz.f32 %r1339, %r1247, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1340, %r1339; + fma.rm.ftz.f32 %r1341, %r1340, %r156, %r155; + add.f32 %r1342, %r1341, 0fCB40007F; + neg.f32 %r1343, %r1342; + fma.rn.ftz.f32 %r1344, %r1247, %r160, %r1343; + fma.rn.ftz.f32 %r1345, %r1247, %r162, %r1344; + shl.b32 %r1346, %r1341, 23; + ex2.approx.ftz.f32 %r1347, %r1345; + mul.f32 %r1348, %r1347, %r1346; + fma.rn.ftz.f32 %r1349, %r1248, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1350, %r1349; + fma.rm.ftz.f32 %r1351, %r1350, %r156, %r155; + add.f32 %r1352, %r1351, 0fCB40007F; + neg.f32 %r1353, %r1352; + fma.rn.ftz.f32 %r1354, %r1248, %r160, %r1353; + fma.rn.ftz.f32 %r1355, %r1248, %r162, %r1354; + shl.b32 %r1356, %r1351, 23; + ex2.approx.ftz.f32 %r1357, %r1355; + mul.f32 %r1358, %r1357, %r1356; + fma.rn.ftz.f32 %r1359, %r1249, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1360, %r1359; + fma.rm.ftz.f32 %r1361, %r1360, %r156, %r155; + add.f32 %r1362, %r1361, 0fCB40007F; + neg.f32 %r1363, %r1362; + fma.rn.ftz.f32 %r1364, %r1249, %r160, %r1363; + fma.rn.ftz.f32 %r1365, %r1249, %r162, %r1364; + shl.b32 %r1366, %r1361, 23; + ex2.approx.ftz.f32 %r1367, %r1365; + mul.f32 %r1368, %r1367, %r1366; + fma.rn.ftz.f32 %r1369, %r1250, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1370, %r1369; + fma.rm.ftz.f32 %r1371, %r1370, %r156, %r155; + add.f32 %r1372, %r1371, 0fCB40007F; + neg.f32 %r1373, %r1372; + fma.rn.ftz.f32 %r1374, %r1250, %r160, %r1373; + fma.rn.ftz.f32 %r1375, %r1250, %r162, %r1374; + shl.b32 %r1376, %r1371, 23; + ex2.approx.ftz.f32 %r1377, %r1375; + mul.f32 %r1378, %r1377, %r1376; + fma.rn.ftz.f32 %r1379, %r1251, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1380, %r1379; + fma.rm.ftz.f32 %r1381, %r1380, %r156, %r155; + add.f32 %r1382, %r1381, 0fCB40007F; + neg.f32 %r1383, %r1382; + fma.rn.ftz.f32 %r1384, %r1251, %r160, %r1383; + fma.rn.ftz.f32 %r1385, %r1251, %r162, %r1384; + shl.b32 %r1386, %r1381, 23; + ex2.approx.ftz.f32 %r1387, %r1385; + mul.f32 %r1388, %r1387, %r1386; + fma.rn.ftz.f32 %r1389, %r1252, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1390, %r1389; + fma.rm.ftz.f32 %r1391, %r1390, %r156, %r155; + add.f32 %r1392, %r1391, 0fCB40007F; + neg.f32 %r1393, %r1392; + fma.rn.ftz.f32 %r1394, %r1252, %r160, %r1393; + fma.rn.ftz.f32 %r1395, %r1252, %r162, %r1394; + shl.b32 %r1396, %r1391, 23; + ex2.approx.ftz.f32 %r1397, %r1395; + mul.f32 %r1398, %r1397, %r1396; + fma.rn.ftz.f32 %r1399, %r1253, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1400, %r1399; + fma.rm.ftz.f32 %r1401, %r1400, %r156, %r155; + add.f32 %r1402, %r1401, 0fCB40007F; + neg.f32 %r1403, %r1402; + fma.rn.ftz.f32 %r1404, %r1253, %r160, %r1403; + fma.rn.ftz.f32 %r1405, %r1253, %r162, %r1404; + shl.b32 %r1406, %r1401, 23; + ex2.approx.ftz.f32 %r1407, %r1405; + mul.f32 %r1408, %r1407, %r1406; + fma.rn.ftz.f32 %r1409, %r1254, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1410, %r1409; + fma.rm.ftz.f32 %r1411, %r1410, %r156, %r155; + add.f32 %r1412, %r1411, 0fCB40007F; + neg.f32 %r1413, %r1412; + fma.rn.ftz.f32 %r1414, %r1254, %r160, %r1413; + fma.rn.ftz.f32 %r1415, %r1254, %r162, %r1414; + shl.b32 %r1416, %r1411, 23; + ex2.approx.ftz.f32 %r1417, %r1415; + mul.f32 %r1418, %r1417, %r1416; + fma.rn.ftz.f32 %r1419, %r1255, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1420, %r1419; + fma.rm.ftz.f32 %r1421, %r1420, %r156, %r155; + add.f32 %r1422, %r1421, 0fCB40007F; + neg.f32 %r1423, %r1422; + fma.rn.ftz.f32 %r1424, %r1255, %r160, %r1423; + fma.rn.ftz.f32 %r1425, %r1255, %r162, %r1424; + shl.b32 %r1426, %r1421, 23; + ex2.approx.ftz.f32 %r1427, %r1425; + mul.f32 %r1428, %r1427, %r1426; + fma.rn.ftz.f32 %r1429, %r1256, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1430, %r1429; + fma.rm.ftz.f32 %r1431, %r1430, %r156, %r155; + add.f32 %r1432, %r1431, 0fCB40007F; + neg.f32 %r1433, %r1432; + fma.rn.ftz.f32 %r1434, %r1256, %r160, %r1433; + fma.rn.ftz.f32 %r1435, %r1256, %r162, %r1434; + shl.b32 %r1436, %r1431, 23; + ex2.approx.ftz.f32 %r1437, %r1435; + mul.f32 %r1438, %r1437, %r1436; + fma.rn.ftz.f32 %r1439, %r1257, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1440, %r1439; + fma.rm.ftz.f32 %r1441, %r1440, %r156, %r155; + add.f32 %r1442, %r1441, 0fCB40007F; + neg.f32 %r1443, %r1442; + fma.rn.ftz.f32 %r1444, %r1257, %r160, %r1443; + fma.rn.ftz.f32 %r1445, %r1257, %r162, %r1444; + shl.b32 %r1446, %r1441, 23; + ex2.approx.ftz.f32 %r1447, %r1445; + mul.f32 %r1448, %r1447, %r1446; + fma.rn.ftz.f32 %r1449, %r1258, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1450, %r1449; + fma.rm.ftz.f32 %r1451, %r1450, %r156, %r155; + add.f32 %r1452, %r1451, 0fCB40007F; + neg.f32 %r1453, %r1452; + fma.rn.ftz.f32 %r1454, %r1258, %r160, %r1453; + fma.rn.ftz.f32 %r1455, %r1258, %r162, %r1454; + shl.b32 %r1456, %r1451, 23; + ex2.approx.ftz.f32 %r1457, %r1455; + mul.f32 %r1458, %r1457, %r1456; + fma.rn.ftz.f32 %r1459, %r1259, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1460, %r1459; + fma.rm.ftz.f32 %r1461, %r1460, %r156, %r155; + add.f32 %r1462, %r1461, 0fCB40007F; + neg.f32 %r1463, %r1462; + fma.rn.ftz.f32 %r1464, %r1259, %r160, %r1463; + fma.rn.ftz.f32 %r1465, %r1259, %r162, %r1464; + shl.b32 %r1466, %r1461, 23; + ex2.approx.ftz.f32 %r1467, %r1465; + mul.f32 %r1468, %r1467, %r1466; + fma.rn.ftz.f32 %r1469, %r1260, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1470, %r1469; + fma.rm.ftz.f32 %r1471, %r1470, %r156, %r155; + add.f32 %r1472, %r1471, 0fCB40007F; + neg.f32 %r1473, %r1472; + fma.rn.ftz.f32 %r1474, %r1260, %r160, %r1473; + fma.rn.ftz.f32 %r1475, %r1260, %r162, %r1474; + shl.b32 %r1476, %r1471, 23; + ex2.approx.ftz.f32 %r1477, %r1475; + mul.f32 %r1478, %r1477, %r1476; + fma.rn.ftz.f32 %r1479, %r1261, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1480, %r1479; + fma.rm.ftz.f32 %r1481, %r1480, %r156, %r155; + add.f32 %r1482, %r1481, 0fCB40007F; + neg.f32 %r1483, %r1482; + fma.rn.ftz.f32 %r1484, %r1261, %r160, %r1483; + fma.rn.ftz.f32 %r1485, %r1261, %r162, %r1484; + shl.b32 %r1486, %r1481, 23; + ex2.approx.ftz.f32 %r1487, %r1485; + mul.f32 %r1488, %r1487, %r1486; + fma.rn.ftz.f32 %r1489, %r1262, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1490, %r1489; + fma.rm.ftz.f32 %r1491, %r1490, %r156, %r155; + add.f32 %r1492, %r1491, 0fCB40007F; + neg.f32 %r1493, %r1492; + fma.rn.ftz.f32 %r1494, %r1262, %r160, %r1493; + fma.rn.ftz.f32 %r1495, %r1262, %r162, %r1494; + shl.b32 %r1496, %r1491, 23; + ex2.approx.ftz.f32 %r1497, %r1495; + mul.f32 %r1498, %r1497, %r1496; + fma.rn.ftz.f32 %r1499, %r1263, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1500, %r1499; + fma.rm.ftz.f32 %r1501, %r1500, %r156, %r155; + add.f32 %r1502, %r1501, 0fCB40007F; + neg.f32 %r1503, %r1502; + fma.rn.ftz.f32 %r1504, %r1263, %r160, %r1503; + fma.rn.ftz.f32 %r1505, %r1263, %r162, %r1504; + shl.b32 %r1506, %r1501, 23; + ex2.approx.ftz.f32 %r1507, %r1505; + mul.f32 %r1508, %r1507, %r1506; + fma.rn.ftz.f32 %r1509, %r1264, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1510, %r1509; + fma.rm.ftz.f32 %r1511, %r1510, %r156, %r155; + add.f32 %r1512, %r1511, 0fCB40007F; + neg.f32 %r1513, %r1512; + fma.rn.ftz.f32 %r1514, %r1264, %r160, %r1513; + fma.rn.ftz.f32 %r1515, %r1264, %r162, %r1514; + shl.b32 %r1516, %r1511, 23; + ex2.approx.ftz.f32 %r1517, %r1515; + mul.f32 %r1518, %r1517, %r1516; + fma.rn.ftz.f32 %r1519, %r1265, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1520, %r1519; + fma.rm.ftz.f32 %r1521, %r1520, %r156, %r155; + add.f32 %r1522, %r1521, 0fCB40007F; + neg.f32 %r1523, %r1522; + fma.rn.ftz.f32 %r1524, %r1265, %r160, %r1523; + fma.rn.ftz.f32 %r1525, %r1265, %r162, %r1524; + shl.b32 %r1526, %r1521, 23; + ex2.approx.ftz.f32 %r1527, %r1525; + mul.f32 %r1528, %r1527, %r1526; + fma.rn.ftz.f32 %r1529, %r1266, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1530, %r1529; + fma.rm.ftz.f32 %r1531, %r1530, %r156, %r155; + add.f32 %r1532, %r1531, 0fCB40007F; + neg.f32 %r1533, %r1532; + fma.rn.ftz.f32 %r1534, %r1266, %r160, %r1533; + fma.rn.ftz.f32 %r1535, %r1266, %r162, %r1534; + shl.b32 %r1536, %r1531, 23; + ex2.approx.ftz.f32 %r1537, %r1535; + mul.f32 %r1538, %r1537, %r1536; + fma.rn.ftz.f32 %r1539, %r1267, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1540, %r1539; + fma.rm.ftz.f32 %r1541, %r1540, %r156, %r155; + fma.rn.ftz.f32 %r1542, %r1268, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1543, %r1542; + fma.rm.ftz.f32 %r1544, %r1543, %r156, %r155; + fma.rn.ftz.f32 %r1545, %r1269, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1546, %r1545; + fma.rm.ftz.f32 %r1547, %r1546, %r156, %r155; + fma.rn.ftz.f32 %r1548, %r1270, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1549, %r1548; + fma.rm.ftz.f32 %r1550, %r1549, %r156, %r155; + fma.rn.ftz.f32 %r1551, %r1271, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1552, %r1551; + fma.rm.ftz.f32 %r1553, %r1552, %r156, %r155; + fma.rn.ftz.f32 %r1554, %r1272, %r152, %r151; + cvt.ftz.sat.f32.f32 %r1555, %r1554; + fma.rm.ftz.f32 %r1556, %r1555, %r156, %r155; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1557, %r1558}, %rd253; + mul.f32 %r1559, %r1558, %r1298; + mov.b64 {%r1560, %r1561}, %rd254; + mov.b64 {%r1562, %r1563}, %rd255; + mov.b64 {%r1564, %r1565}, %rd256; + mov.b64 {%r1566, %r1567}, %rd257; + mov.b64 {%r1568, %r1569}, %rd258; + mov.b64 {%r1570, %r1571}, %rd259; + mov.b64 {%r1572, %r1573}, %rd260; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd207, {%r1541, %r1544}; + cvt.u32.u64 %r1574, %rd207; + add.f32 %r1575, %r1541, 0fCB40007F; + neg.f32 %r1576, %r1575; + fma.rn.ftz.f32 %r1577, %r1267, %r160, %r1576; + fma.rn.ftz.f32 %r1578, %r1267, %r162, %r1577; + ex2.approx.ftz.f32 %r1579, %r1578; + add.f32 %r1580, %r1544, 0fCB40007F; + neg.f32 %r1581, %r1580; + fma.rn.ftz.f32 %r1582, %r1268, %r160, %r1581; + fma.rn.ftz.f32 %r1583, %r1268, %r162, %r1582; + shl.b64 %rd208, %rd207, 23; + and.b64 %rd209, %rd208, -36028797018963968; + shl.b32 %r1584, %r1574, 23; + cvt.u64.u32 %rd210, %r1584; + or.b64 %rd211, %rd210, %rd209; + ex2.approx.ftz.f32 %r1585, %r1583; + mov.b64 {%r1586, %r1587}, %rd211; + mul.f32 %r1588, %r1585, %r1587; + mul.f32 %r1589, %r1579, %r1586; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1590, %r1591}, %rd251; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd212, {%r1547, %r1550}; + cvt.u32.u64 %r1592, %rd212; + add.f32 %r1593, %r1547, 0fCB40007F; + neg.f32 %r1594, %r1593; + fma.rn.ftz.f32 %r1595, %r1269, %r160, %r1594; + fma.rn.ftz.f32 %r1596, %r1269, %r162, %r1595; + ex2.approx.ftz.f32 %r1597, %r1596; + add.f32 %r1598, %r1550, 0fCB40007F; + neg.f32 %r1599, %r1598; + fma.rn.ftz.f32 %r1600, %r1270, %r160, %r1599; + fma.rn.ftz.f32 %r1601, %r1270, %r162, %r1600; + shl.b64 %rd213, %rd212, 23; + and.b64 %rd214, %rd213, -36028797018963968; + shl.b32 %r1602, %r1592, 23; + cvt.u64.u32 %rd215, %r1602; + or.b64 %rd216, %rd215, %rd214; + ex2.approx.ftz.f32 %r1603, %r1601; + mov.b64 {%r1604, %r1605}, %rd216; + mul.f32 %r1606, %r1603, %r1605; + mul.f32 %r1607, %r1597, %r1604; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1608, %r1609}, %rd249; + .loc 2 173 29 // triton_helpers.py:173:29 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 %rd217, {%r1553, %r1556}; + cvt.u32.u64 %r1610, %rd217; + add.f32 %r1611, %r1553, 0fCB40007F; + neg.f32 %r1612, %r1611; + fma.rn.ftz.f32 %r1613, %r1271, %r160, %r1612; + fma.rn.ftz.f32 %r1614, %r1271, %r162, %r1613; + ex2.approx.ftz.f32 %r1615, %r1614; + add.f32 %r1616, %r1556, 0fCB40007F; + neg.f32 %r1617, %r1616; + fma.rn.ftz.f32 %r1618, %r1272, %r160, %r1617; + fma.rn.ftz.f32 %r1619, %r1272, %r162, %r1618; + shl.b64 %rd218, %rd217, 23; + and.b64 %rd219, %rd218, -36028797018963968; + shl.b32 %r1620, %r1610, 23; + cvt.u64.u32 %rd220, %r1620; + or.b64 %rd221, %rd220, %rd219; + ex2.approx.ftz.f32 %r1621, %r1619; + mov.b64 {%r1622, %r1623}, %rd221; + mul.f32 %r1624, %r1621, %r1623; + mul.f32 %r1625, %r1615, %r1622; + .loc 2 181 31 // triton_helpers.py:181:31 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + mov.b64 {%r1626, %r1627}, %rd247; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + bar.sync 0; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + fma.rn.f32 %r1628, %r1557, %r1288, %r1559; + fma.rn.f32 %r1629, %r1560, %r1308, %r1628; + fma.rn.f32 %r1630, %r1561, %r1318, %r1629; + fma.rn.f32 %r1631, %r1562, %r1328, %r1630; + fma.rn.f32 %r1632, %r1563, %r1338, %r1631; + fma.rn.f32 %r1633, %r1564, %r1348, %r1632; + fma.rn.f32 %r1634, %r1565, %r1358, %r1633; + fma.rn.f32 %r1635, %r1566, %r1368, %r1634; + fma.rn.f32 %r1636, %r1567, %r1378, %r1635; + fma.rn.f32 %r1637, %r1568, %r1388, %r1636; + fma.rn.f32 %r1638, %r1569, %r1398, %r1637; + fma.rn.f32 %r1639, %r1570, %r1408, %r1638; + fma.rn.f32 %r1640, %r1571, %r1418, %r1639; + fma.rn.f32 %r1641, %r1572, %r1428, %r1640; + fma.rn.f32 %r1642, %r1573, %r1438, %r1641; + fma.rn.f32 %r1643, %r2188, %r1448, %r1642; + fma.rn.f32 %r1644, %r2189, %r1458, %r1643; + fma.rn.f32 %r1645, %r2190, %r1468, %r1644; + fma.rn.f32 %r1646, %r2191, %r1478, %r1645; + fma.rn.f32 %r1647, %r2192, %r1488, %r1646; + fma.rn.f32 %r1648, %r2193, %r1498, %r1647; + fma.rn.f32 %r1649, %r2194, %r1508, %r1648; + fma.rn.f32 %r1650, %r2195, %r1518, %r1649; + fma.rn.f32 %r1651, %r2196, %r1528, %r1650; + fma.rn.f32 %r1652, %r2197, %r1538, %r1651; + fma.rn.f32 %r1653, %r1590, %r1589, %r1652; + fma.rn.f32 %r1654, %r1591, %r1588, %r1653; + fma.rn.f32 %r1655, %r1608, %r1607, %r1654; + fma.rn.f32 %r1656, %r1609, %r1606, %r1655; + fma.rn.f32 %r1657, %r1626, %r1625, %r1656; + fma.rn.f32 %r1658, %r1627, %r1624, %r1657; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1659, %r1658, 16, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1660, %r1658, %r1659; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1661, %r1660, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1662, %r1660, %r1661; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1663, %r1662, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1664, %r1662, %r1663; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1665, %r1664, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1666, %r1664, %r1665; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1667, %r1666, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1090, %r1666, %r1667; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p105 st.shared.b32 [ %r1083 + 0 ], %r1090; + // end inline asm + bar.sync 0; + // begin inline asm + @%p106 ld.shared.b32 %r1091, [ %r1086 + 0 ]; + // end inline asm + shfl.sync.bfly.b32 %r1668, %r1091, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1669, %r1091, %r1668; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1670, %r1669, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1671, %r1669, %r1670; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1672, %r1671, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1673, %r1671, %r1672; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + shfl.sync.bfly.b32 %r1674, %r1673, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + add.f32 %r1094, %r1673, %r1674; + .loc 3 291 36 // standard.py:291:36 @[ c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:49:33 ] + // begin inline asm + @%p107 st.shared.b32 [ %r1086 + 0 ], %r1094; + // end inline asm + bar.sync 0; + ld.shared.b32 %r40, [global_smem]; + shl.b32 %r1675, %r2, 4; + add.s32 %r41, %r1195, %r1675; + xor.b32 %r1676, %r1675, 64; + add.s32 %r42, %r1195, %r1676; + shl.b32 %r1677, %r1, 3; + and.b32 %r1678, %r1677, 4080; + and.b32 %r1679, %r1, 1; + neg.s32 %r1680, %r1679; + and.b32 %r1681, %r1680, 8256; + xor.b32 %r1682, %r1681, %r1678; + add.s32 %r43, %r1195, %r1682; +$L__tmp16: + .loc 1 52 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:40 + mul.wide.u32 %rd55, %r2, 4; + mov.b64 %rd273, 0; + mov.pred %p193, -1; + mov.pred %p207, %p193; +$L__BB0_3: // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:0:40 + mov.pred %p2, %p207; + .loc 1 53 31 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:53:31 + or.b64 %rd243, %rd273, %rd1; + or.b64 %rd244, %rd273, %rd55; + or.b64 %rd245, %rd244, 14336; + .loc 1 54 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:54:29 + setp.lt.u64 %p196, %rd243, 32000; + setp.lt.u64 %p204, %rd245, 32000; + .loc 1 58 41 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:41 + cvt.u32.u64 %r1747, %rd273; + or.b32 %r1748, %r3, %r1747; + add.s32 %r1749, %r1748, %r6; + or.b32 %r1750, %r4, %r1747; + add.s32 %r1751, %r1750, %r6; + or.b32 %r1752, %r5, %r1747; + add.s32 %r1753, %r1752, %r6; + cvt.u32.u64 %r1754, %rd243; + add.s32 %r1755, %r6, %r1754; + cvt.u32.u64 %r1756, %rd244; + add.s32 %r1757, %r6, %r1756; + add.s32 %r1758, %r1757, 2048; + add.s32 %r1759, %r1757, 4096; + add.s32 %r1760, %r1757, 6144; + add.s32 %r1761, %r1757, 8192; + add.s32 %r1762, %r1757, 10240; + add.s32 %r1763, %r1757, 12288; + cvt.u32.u64 %r1764, %rd245; + add.s32 %r1765, %r6, %r1764; + .loc 1 58 34 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:34 + mad.wide.s32 %rd223, %r1749, 2, %rd57; + mad.wide.s32 %rd226, %r1751, 2, %rd57; + mad.wide.s32 %rd229, %r1753, 2, %rd57; + mad.wide.s32 %rd232, %r1755, 2, %rd57; + .loc 1 58 52 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:52 + // begin inline asm + mov.u64 %rd222, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd222, 1.0; + // end inline asm + mov.b32 %r1687, 0; + // begin inline asm + mov.u32 %r1683, %r1687; + mov.u32 %r1684, %r1687; + mov.u32 %r1685, %r1687; + mov.u32 %r1686, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1683, %r1684, %r1685, %r1686 }, [ %rd223 + 0 ], %rd222; + // end inline asm + mov.b32 {%rs33, %rs34}, %r1683; + mov.b32 {%rs35, %rs36}, %r1684; + mov.b32 {%rs37, %rs38}, %r1685; + mov.b32 {%rs39, %rs40}, %r1686; + // begin inline asm + mov.u64 %rd225, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd225, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1691, %r1687; + mov.u32 %r1692, %r1687; + mov.u32 %r1693, %r1687; + mov.u32 %r1694, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1691, %r1692, %r1693, %r1694 }, [ %rd226 + 0 ], %rd225; + // end inline asm + mov.b32 {%rs41, %rs42}, %r1691; + mov.b32 {%rs43, %rs44}, %r1692; + mov.b32 {%rs45, %rs46}, %r1693; + mov.b32 {%rs47, %rs48}, %r1694; + // begin inline asm + mov.u64 %rd228, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd228, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1699, %r1687; + mov.u32 %r1700, %r1687; + mov.u32 %r1701, %r1687; + mov.u32 %r1702, %r1687; + @%p193 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1699, %r1700, %r1701, %r1702 }, [ %rd229 + 0 ], %rd228; + // end inline asm + mov.b32 {%rs49, %rs50}, %r1699; + mov.b32 {%rs51, %rs52}, %r1700; + mov.b32 {%rs53, %rs54}, %r1701; + mov.b32 {%rs55, %rs56}, %r1702; + // begin inline asm + mov.u64 %rd231, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd231, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r1707, %r1687; + mov.u32 %r1708, %r1687; + mov.u32 %r1709, %r1687; + mov.u32 %r1710, %r1687; + @%p196 ld.global.L1::evict_first.L2::cache_hint.v4.b32 { %r1707, %r1708, %r1709, %r1710 }, [ %rd232 + 0 ], %rd231; + // end inline asm + mov.b32 {%rs57, %rs58}, %r1707; + mov.b32 {%rs59, %rs60}, %r1708; + mov.b32 {%rs61, %rs62}, %r1709; + mov.b32 {%rs63, %rs64}, %r1710; + .loc 1 58 106 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:58:106 + cvt.f32.bf16 %r1766, %rs33; + cvt.f32.bf16 %r1767, %rs34; + cvt.f32.bf16 %r1768, %rs35; + cvt.f32.bf16 %r1769, %rs36; + cvt.f32.bf16 %r1770, %rs37; + cvt.f32.bf16 %r1771, %rs38; + cvt.f32.bf16 %r1772, %rs39; + cvt.f32.bf16 %r1773, %rs40; + cvt.f32.bf16 %r1774, %rs41; + cvt.f32.bf16 %r1775, %rs42; + cvt.f32.bf16 %r1776, %rs43; + cvt.f32.bf16 %r1777, %rs44; + cvt.f32.bf16 %r1778, %rs45; + cvt.f32.bf16 %r1779, %rs46; + cvt.f32.bf16 %r1780, %rs47; + cvt.f32.bf16 %r1781, %rs48; + cvt.f32.bf16 %r1782, %rs49; + cvt.f32.bf16 %r1783, %rs50; + cvt.f32.bf16 %r1784, %rs51; + cvt.f32.bf16 %r1785, %rs52; + cvt.f32.bf16 %r1786, %rs53; + cvt.f32.bf16 %r1787, %rs54; + cvt.f32.bf16 %r1788, %rs55; + cvt.f32.bf16 %r1789, %rs56; + cvt.f32.bf16 %r1790, %rs57; + cvt.f32.bf16 %r1791, %rs58; + cvt.f32.bf16 %r1792, %rs59; + cvt.f32.bf16 %r1793, %rs60; + cvt.f32.bf16 %r1794, %rs61; + cvt.f32.bf16 %r1795, %rs62; + cvt.f32.bf16 %r1796, %rs63; + cvt.f32.bf16 %r1797, %rs64; + .loc 1 60 22 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:60:22 + sub.f32 %r1798, %r1766, %r39; + sub.f32 %r1799, %r1767, %r39; + sub.f32 %r1800, %r1768, %r39; + sub.f32 %r1801, %r1769, %r39; + sub.f32 %r1802, %r1770, %r39; + sub.f32 %r1803, %r1771, %r39; + sub.f32 %r1804, %r1772, %r39; + sub.f32 %r1805, %r1773, %r39; + sub.f32 %r1806, %r1774, %r39; + sub.f32 %r1807, %r1775, %r39; + sub.f32 %r1808, %r1776, %r39; + sub.f32 %r1809, %r1777, %r39; + sub.f32 %r1810, %r1778, %r39; + sub.f32 %r1811, %r1779, %r39; + sub.f32 %r1812, %r1780, %r39; + sub.f32 %r1813, %r1781, %r39; + sub.f32 %r1814, %r1782, %r39; + sub.f32 %r1815, %r1783, %r39; + sub.f32 %r1816, %r1784, %r39; + sub.f32 %r1817, %r1785, %r39; + sub.f32 %r1818, %r1786, %r39; + sub.f32 %r1819, %r1787, %r39; + sub.f32 %r1820, %r1788, %r39; + sub.f32 %r1821, %r1789, %r39; + sub.f32 %r1822, %r1790, %r39; + sub.f32 %r1823, %r1791, %r39; + sub.f32 %r1824, %r1792, %r39; + sub.f32 %r1825, %r1793, %r39; + sub.f32 %r1826, %r1794, %r39; + sub.f32 %r1827, %r1795, %r39; + sub.f32 %r1828, %r1796, %r39; + sub.f32 %r1829, %r1797, %r39; + mov.b32 %r1830, 0f3F000000; + mov.b32 %r1831, 0f3BBB989D; + .loc 1 61 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:61:29 + fma.rn.ftz.f32 %r1832, %r1798, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1833, %r1832; + mov.b32 %r1834, 0f4B400001; + mov.b32 %r1835, 0f437C0000; + fma.rm.ftz.f32 %r1836, %r1833, %r1835, %r1834; + add.f32 %r1837, %r1836, 0fCB40007F; + neg.f32 %r1838, %r1837; + mov.b32 %r1839, 0f3FB8AA3B; + fma.rn.ftz.f32 %r1840, %r1798, %r1839, %r1838; + mov.b32 %r1841, 0f32A57060; + fma.rn.ftz.f32 %r1842, %r1798, %r1841, %r1840; + shl.b32 %r1843, %r1836, 23; + ex2.approx.ftz.f32 %r1844, %r1842; + mul.f32 %r1845, %r1844, %r1843; + fma.rn.ftz.f32 %r1846, %r1799, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1847, %r1846; + fma.rm.ftz.f32 %r1848, %r1847, %r1835, %r1834; + add.f32 %r1849, %r1848, 0fCB40007F; + neg.f32 %r1850, %r1849; + fma.rn.ftz.f32 %r1851, %r1799, %r1839, %r1850; + fma.rn.ftz.f32 %r1852, %r1799, %r1841, %r1851; + shl.b32 %r1853, %r1848, 23; + ex2.approx.ftz.f32 %r1854, %r1852; + mul.f32 %r1855, %r1854, %r1853; + fma.rn.ftz.f32 %r1856, %r1800, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1857, %r1856; + fma.rm.ftz.f32 %r1858, %r1857, %r1835, %r1834; + add.f32 %r1859, %r1858, 0fCB40007F; + neg.f32 %r1860, %r1859; + fma.rn.ftz.f32 %r1861, %r1800, %r1839, %r1860; + fma.rn.ftz.f32 %r1862, %r1800, %r1841, %r1861; + shl.b32 %r1863, %r1858, 23; + ex2.approx.ftz.f32 %r1864, %r1862; + mul.f32 %r1865, %r1864, %r1863; + fma.rn.ftz.f32 %r1866, %r1801, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1867, %r1866; + fma.rm.ftz.f32 %r1868, %r1867, %r1835, %r1834; + add.f32 %r1869, %r1868, 0fCB40007F; + neg.f32 %r1870, %r1869; + fma.rn.ftz.f32 %r1871, %r1801, %r1839, %r1870; + fma.rn.ftz.f32 %r1872, %r1801, %r1841, %r1871; + shl.b32 %r1873, %r1868, 23; + ex2.approx.ftz.f32 %r1874, %r1872; + mul.f32 %r1875, %r1874, %r1873; + fma.rn.ftz.f32 %r1876, %r1802, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1877, %r1876; + fma.rm.ftz.f32 %r1878, %r1877, %r1835, %r1834; + add.f32 %r1879, %r1878, 0fCB40007F; + neg.f32 %r1880, %r1879; + fma.rn.ftz.f32 %r1881, %r1802, %r1839, %r1880; + fma.rn.ftz.f32 %r1882, %r1802, %r1841, %r1881; + shl.b32 %r1883, %r1878, 23; + ex2.approx.ftz.f32 %r1884, %r1882; + mul.f32 %r1885, %r1884, %r1883; + fma.rn.ftz.f32 %r1886, %r1803, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1887, %r1886; + fma.rm.ftz.f32 %r1888, %r1887, %r1835, %r1834; + add.f32 %r1889, %r1888, 0fCB40007F; + neg.f32 %r1890, %r1889; + fma.rn.ftz.f32 %r1891, %r1803, %r1839, %r1890; + fma.rn.ftz.f32 %r1892, %r1803, %r1841, %r1891; + shl.b32 %r1893, %r1888, 23; + ex2.approx.ftz.f32 %r1894, %r1892; + mul.f32 %r1895, %r1894, %r1893; + fma.rn.ftz.f32 %r1896, %r1804, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1897, %r1896; + fma.rm.ftz.f32 %r1898, %r1897, %r1835, %r1834; + add.f32 %r1899, %r1898, 0fCB40007F; + neg.f32 %r1900, %r1899; + fma.rn.ftz.f32 %r1901, %r1804, %r1839, %r1900; + fma.rn.ftz.f32 %r1902, %r1804, %r1841, %r1901; + shl.b32 %r1903, %r1898, 23; + ex2.approx.ftz.f32 %r1904, %r1902; + mul.f32 %r1905, %r1904, %r1903; + fma.rn.ftz.f32 %r1906, %r1805, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1907, %r1906; + fma.rm.ftz.f32 %r1908, %r1907, %r1835, %r1834; + add.f32 %r1909, %r1908, 0fCB40007F; + neg.f32 %r1910, %r1909; + fma.rn.ftz.f32 %r1911, %r1805, %r1839, %r1910; + fma.rn.ftz.f32 %r1912, %r1805, %r1841, %r1911; + shl.b32 %r1913, %r1908, 23; + ex2.approx.ftz.f32 %r1914, %r1912; + mul.f32 %r1915, %r1914, %r1913; + fma.rn.ftz.f32 %r1916, %r1806, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1917, %r1916; + fma.rm.ftz.f32 %r1918, %r1917, %r1835, %r1834; + add.f32 %r1919, %r1918, 0fCB40007F; + neg.f32 %r1920, %r1919; + fma.rn.ftz.f32 %r1921, %r1806, %r1839, %r1920; + fma.rn.ftz.f32 %r1922, %r1806, %r1841, %r1921; + shl.b32 %r1923, %r1918, 23; + ex2.approx.ftz.f32 %r1924, %r1922; + mul.f32 %r1925, %r1924, %r1923; + fma.rn.ftz.f32 %r1926, %r1807, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1927, %r1926; + fma.rm.ftz.f32 %r1928, %r1927, %r1835, %r1834; + add.f32 %r1929, %r1928, 0fCB40007F; + neg.f32 %r1930, %r1929; + fma.rn.ftz.f32 %r1931, %r1807, %r1839, %r1930; + fma.rn.ftz.f32 %r1932, %r1807, %r1841, %r1931; + shl.b32 %r1933, %r1928, 23; + ex2.approx.ftz.f32 %r1934, %r1932; + mul.f32 %r1935, %r1934, %r1933; + fma.rn.ftz.f32 %r1936, %r1808, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1937, %r1936; + fma.rm.ftz.f32 %r1938, %r1937, %r1835, %r1834; + add.f32 %r1939, %r1938, 0fCB40007F; + neg.f32 %r1940, %r1939; + fma.rn.ftz.f32 %r1941, %r1808, %r1839, %r1940; + fma.rn.ftz.f32 %r1942, %r1808, %r1841, %r1941; + shl.b32 %r1943, %r1938, 23; + ex2.approx.ftz.f32 %r1944, %r1942; + mul.f32 %r1945, %r1944, %r1943; + fma.rn.ftz.f32 %r1946, %r1809, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1947, %r1946; + fma.rm.ftz.f32 %r1948, %r1947, %r1835, %r1834; + add.f32 %r1949, %r1948, 0fCB40007F; + neg.f32 %r1950, %r1949; + fma.rn.ftz.f32 %r1951, %r1809, %r1839, %r1950; + fma.rn.ftz.f32 %r1952, %r1809, %r1841, %r1951; + shl.b32 %r1953, %r1948, 23; + ex2.approx.ftz.f32 %r1954, %r1952; + mul.f32 %r1955, %r1954, %r1953; + fma.rn.ftz.f32 %r1956, %r1810, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1957, %r1956; + fma.rm.ftz.f32 %r1958, %r1957, %r1835, %r1834; + add.f32 %r1959, %r1958, 0fCB40007F; + neg.f32 %r1960, %r1959; + fma.rn.ftz.f32 %r1961, %r1810, %r1839, %r1960; + fma.rn.ftz.f32 %r1962, %r1810, %r1841, %r1961; + shl.b32 %r1963, %r1958, 23; + ex2.approx.ftz.f32 %r1964, %r1962; + mul.f32 %r1965, %r1964, %r1963; + fma.rn.ftz.f32 %r1966, %r1811, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1967, %r1966; + fma.rm.ftz.f32 %r1968, %r1967, %r1835, %r1834; + add.f32 %r1969, %r1968, 0fCB40007F; + neg.f32 %r1970, %r1969; + fma.rn.ftz.f32 %r1971, %r1811, %r1839, %r1970; + fma.rn.ftz.f32 %r1972, %r1811, %r1841, %r1971; + shl.b32 %r1973, %r1968, 23; + ex2.approx.ftz.f32 %r1974, %r1972; + mul.f32 %r1975, %r1974, %r1973; + fma.rn.ftz.f32 %r1976, %r1812, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1977, %r1976; + fma.rm.ftz.f32 %r1978, %r1977, %r1835, %r1834; + add.f32 %r1979, %r1978, 0fCB40007F; + neg.f32 %r1980, %r1979; + fma.rn.ftz.f32 %r1981, %r1812, %r1839, %r1980; + fma.rn.ftz.f32 %r1982, %r1812, %r1841, %r1981; + shl.b32 %r1983, %r1978, 23; + ex2.approx.ftz.f32 %r1984, %r1982; + mul.f32 %r1985, %r1984, %r1983; + fma.rn.ftz.f32 %r1986, %r1813, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1987, %r1986; + fma.rm.ftz.f32 %r1988, %r1987, %r1835, %r1834; + add.f32 %r1989, %r1988, 0fCB40007F; + neg.f32 %r1990, %r1989; + fma.rn.ftz.f32 %r1991, %r1813, %r1839, %r1990; + fma.rn.ftz.f32 %r1992, %r1813, %r1841, %r1991; + shl.b32 %r1993, %r1988, 23; + ex2.approx.ftz.f32 %r1994, %r1992; + mul.f32 %r1995, %r1994, %r1993; + fma.rn.ftz.f32 %r1996, %r1814, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r1997, %r1996; + fma.rm.ftz.f32 %r1998, %r1997, %r1835, %r1834; + add.f32 %r1999, %r1998, 0fCB40007F; + neg.f32 %r2000, %r1999; + fma.rn.ftz.f32 %r2001, %r1814, %r1839, %r2000; + fma.rn.ftz.f32 %r2002, %r1814, %r1841, %r2001; + shl.b32 %r2003, %r1998, 23; + ex2.approx.ftz.f32 %r2004, %r2002; + mul.f32 %r2005, %r2004, %r2003; + fma.rn.ftz.f32 %r2006, %r1815, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2007, %r2006; + fma.rm.ftz.f32 %r2008, %r2007, %r1835, %r1834; + add.f32 %r2009, %r2008, 0fCB40007F; + neg.f32 %r2010, %r2009; + fma.rn.ftz.f32 %r2011, %r1815, %r1839, %r2010; + fma.rn.ftz.f32 %r2012, %r1815, %r1841, %r2011; + shl.b32 %r2013, %r2008, 23; + ex2.approx.ftz.f32 %r2014, %r2012; + mul.f32 %r2015, %r2014, %r2013; + fma.rn.ftz.f32 %r2016, %r1816, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2017, %r2016; + fma.rm.ftz.f32 %r2018, %r2017, %r1835, %r1834; + add.f32 %r2019, %r2018, 0fCB40007F; + neg.f32 %r2020, %r2019; + fma.rn.ftz.f32 %r2021, %r1816, %r1839, %r2020; + fma.rn.ftz.f32 %r2022, %r1816, %r1841, %r2021; + shl.b32 %r2023, %r2018, 23; + ex2.approx.ftz.f32 %r2024, %r2022; + mul.f32 %r2025, %r2024, %r2023; + fma.rn.ftz.f32 %r2026, %r1817, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2027, %r2026; + fma.rm.ftz.f32 %r2028, %r2027, %r1835, %r1834; + add.f32 %r2029, %r2028, 0fCB40007F; + neg.f32 %r2030, %r2029; + fma.rn.ftz.f32 %r2031, %r1817, %r1839, %r2030; + fma.rn.ftz.f32 %r2032, %r1817, %r1841, %r2031; + shl.b32 %r2033, %r2028, 23; + ex2.approx.ftz.f32 %r2034, %r2032; + mul.f32 %r2035, %r2034, %r2033; + fma.rn.ftz.f32 %r2036, %r1818, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2037, %r2036; + fma.rm.ftz.f32 %r2038, %r2037, %r1835, %r1834; + add.f32 %r2039, %r2038, 0fCB40007F; + neg.f32 %r2040, %r2039; + fma.rn.ftz.f32 %r2041, %r1818, %r1839, %r2040; + fma.rn.ftz.f32 %r2042, %r1818, %r1841, %r2041; + shl.b32 %r2043, %r2038, 23; + ex2.approx.ftz.f32 %r2044, %r2042; + mul.f32 %r2045, %r2044, %r2043; + fma.rn.ftz.f32 %r2046, %r1819, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2047, %r2046; + fma.rm.ftz.f32 %r2048, %r2047, %r1835, %r1834; + add.f32 %r2049, %r2048, 0fCB40007F; + neg.f32 %r2050, %r2049; + fma.rn.ftz.f32 %r2051, %r1819, %r1839, %r2050; + fma.rn.ftz.f32 %r2052, %r1819, %r1841, %r2051; + shl.b32 %r2053, %r2048, 23; + ex2.approx.ftz.f32 %r2054, %r2052; + mul.f32 %r2055, %r2054, %r2053; + fma.rn.ftz.f32 %r2056, %r1820, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2057, %r2056; + fma.rm.ftz.f32 %r2058, %r2057, %r1835, %r1834; + add.f32 %r2059, %r2058, 0fCB40007F; + neg.f32 %r2060, %r2059; + fma.rn.ftz.f32 %r2061, %r1820, %r1839, %r2060; + fma.rn.ftz.f32 %r2062, %r1820, %r1841, %r2061; + shl.b32 %r2063, %r2058, 23; + ex2.approx.ftz.f32 %r2064, %r2062; + mul.f32 %r2065, %r2064, %r2063; + fma.rn.ftz.f32 %r2066, %r1821, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2067, %r2066; + fma.rm.ftz.f32 %r2068, %r2067, %r1835, %r1834; + add.f32 %r2069, %r2068, 0fCB40007F; + neg.f32 %r2070, %r2069; + fma.rn.ftz.f32 %r2071, %r1821, %r1839, %r2070; + fma.rn.ftz.f32 %r2072, %r1821, %r1841, %r2071; + shl.b32 %r2073, %r2068, 23; + ex2.approx.ftz.f32 %r2074, %r2072; + mul.f32 %r2075, %r2074, %r2073; + fma.rn.ftz.f32 %r2076, %r1822, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2077, %r2076; + fma.rm.ftz.f32 %r2078, %r2077, %r1835, %r1834; + add.f32 %r2079, %r2078, 0fCB40007F; + neg.f32 %r2080, %r2079; + fma.rn.ftz.f32 %r2081, %r1822, %r1839, %r2080; + fma.rn.ftz.f32 %r2082, %r1822, %r1841, %r2081; + shl.b32 %r2083, %r2078, 23; + ex2.approx.ftz.f32 %r2084, %r2082; + mul.f32 %r2085, %r2084, %r2083; + fma.rn.ftz.f32 %r2086, %r1823, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2087, %r2086; + fma.rm.ftz.f32 %r2088, %r2087, %r1835, %r1834; + add.f32 %r2089, %r2088, 0fCB40007F; + neg.f32 %r2090, %r2089; + fma.rn.ftz.f32 %r2091, %r1823, %r1839, %r2090; + fma.rn.ftz.f32 %r2092, %r1823, %r1841, %r2091; + shl.b32 %r2093, %r2088, 23; + ex2.approx.ftz.f32 %r2094, %r2092; + mul.f32 %r2095, %r2094, %r2093; + fma.rn.ftz.f32 %r2096, %r1824, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2097, %r2096; + fma.rm.ftz.f32 %r2098, %r2097, %r1835, %r1834; + add.f32 %r2099, %r2098, 0fCB40007F; + neg.f32 %r2100, %r2099; + fma.rn.ftz.f32 %r2101, %r1824, %r1839, %r2100; + fma.rn.ftz.f32 %r2102, %r1824, %r1841, %r2101; + shl.b32 %r2103, %r2098, 23; + ex2.approx.ftz.f32 %r2104, %r2102; + mul.f32 %r2105, %r2104, %r2103; + fma.rn.ftz.f32 %r2106, %r1825, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2107, %r2106; + fma.rm.ftz.f32 %r2108, %r2107, %r1835, %r1834; + add.f32 %r2109, %r2108, 0fCB40007F; + neg.f32 %r2110, %r2109; + fma.rn.ftz.f32 %r2111, %r1825, %r1839, %r2110; + fma.rn.ftz.f32 %r2112, %r1825, %r1841, %r2111; + shl.b32 %r2113, %r2108, 23; + ex2.approx.ftz.f32 %r2114, %r2112; + mul.f32 %r2115, %r2114, %r2113; + fma.rn.ftz.f32 %r2116, %r1826, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2117, %r2116; + fma.rm.ftz.f32 %r2118, %r2117, %r1835, %r1834; + add.f32 %r2119, %r2118, 0fCB40007F; + neg.f32 %r2120, %r2119; + fma.rn.ftz.f32 %r2121, %r1826, %r1839, %r2120; + fma.rn.ftz.f32 %r2122, %r1826, %r1841, %r2121; + shl.b32 %r2123, %r2118, 23; + ex2.approx.ftz.f32 %r2124, %r2122; + mul.f32 %r2125, %r2124, %r2123; + fma.rn.ftz.f32 %r2126, %r1827, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2127, %r2126; + fma.rm.ftz.f32 %r2128, %r2127, %r1835, %r1834; + add.f32 %r2129, %r2128, 0fCB40007F; + neg.f32 %r2130, %r2129; + fma.rn.ftz.f32 %r2131, %r1827, %r1839, %r2130; + fma.rn.ftz.f32 %r2132, %r1827, %r1841, %r2131; + shl.b32 %r2133, %r2128, 23; + ex2.approx.ftz.f32 %r2134, %r2132; + mul.f32 %r2135, %r2134, %r2133; + fma.rn.ftz.f32 %r2136, %r1828, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2137, %r2136; + fma.rm.ftz.f32 %r2138, %r2137, %r1835, %r1834; + add.f32 %r2139, %r2138, 0fCB40007F; + neg.f32 %r2140, %r2139; + fma.rn.ftz.f32 %r2141, %r1828, %r1839, %r2140; + fma.rn.ftz.f32 %r2142, %r1828, %r1841, %r2141; + shl.b32 %r2143, %r2138, 23; + ex2.approx.ftz.f32 %r2144, %r2142; + mul.f32 %r2145, %r2144, %r2143; + fma.rn.ftz.f32 %r2146, %r1829, %r1831, %r1830; + cvt.ftz.sat.f32.f32 %r2147, %r2146; + fma.rm.ftz.f32 %r2148, %r2147, %r1835, %r1834; + add.f32 %r2149, %r2148, 0fCB40007F; + neg.f32 %r2150, %r2149; + fma.rn.ftz.f32 %r2151, %r1829, %r1839, %r2150; + fma.rn.ftz.f32 %r2152, %r1829, %r1841, %r2151; + shl.b32 %r2153, %r2148, 23; + ex2.approx.ftz.f32 %r2154, %r2152; + mul.f32 %r2155, %r2154, %r2153; + .loc 1 62 23 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:62:23 + div.full.f32 %r2156, %r1845, %r40; + div.full.f32 %r2157, %r1855, %r40; + div.full.f32 %r2158, %r1865, %r40; + div.full.f32 %r2159, %r1875, %r40; + div.full.f32 %r2160, %r1885, %r40; + div.full.f32 %r2161, %r1895, %r40; + div.full.f32 %r2162, %r1905, %r40; + div.full.f32 %r2163, %r1915, %r40; + div.full.f32 %r2164, %r1925, %r40; + div.full.f32 %r2165, %r1935, %r40; + div.full.f32 %r2166, %r1945, %r40; + div.full.f32 %r2167, %r1955, %r40; + div.full.f32 %r2168, %r1965, %r40; + div.full.f32 %r2169, %r1975, %r40; + div.full.f32 %r2170, %r1985, %r40; + div.full.f32 %r2171, %r1995, %r40; + div.full.f32 %r2172, %r2005, %r40; + div.full.f32 %r2173, %r2015, %r40; + div.full.f32 %r2174, %r2025, %r40; + div.full.f32 %r2175, %r2035, %r40; + div.full.f32 %r2176, %r2045, %r40; + div.full.f32 %r2177, %r2055, %r40; + div.full.f32 %r2178, %r2065, %r40; + div.full.f32 %r2179, %r2075, %r40; + div.full.f32 %r2180, %r2085, %r40; + div.full.f32 %r2181, %r2095, %r40; + div.full.f32 %r2182, %r2105, %r40; + div.full.f32 %r2183, %r2115, %r40; + div.full.f32 %r2184, %r2125, %r40; + div.full.f32 %r2185, %r2135, %r40; + div.full.f32 %r2186, %r2145, %r40; + div.full.f32 %r2187, %r2155, %r40; + .loc 1 63 29 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:29 + mad.wide.s32 %rd234, %r1757, 4, %rd58; + mad.wide.s32 %rd235, %r1758, 4, %rd58; + mad.wide.s32 %rd236, %r1759, 4, %rd58; + mad.wide.s32 %rd237, %r1760, 4, %rd58; + mad.wide.s32 %rd238, %r1761, 4, %rd58; + mad.wide.s32 %rd239, %r1762, 4, %rd58; + mad.wide.s32 %rd240, %r1763, 4, %rd58; + mad.wide.s32 %rd241, %r1765, 4, %rd58; + .loc 1 63 53 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:63:53 + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2156, %r2157, %r2158, %r2159}; + st.shared.v4.b32 [%r42+8192], {%r2160, %r2161, %r2162, %r2163}; + bar.sync 0; + ld.shared.v4.b32 {%r1715, %r1716, %r1717, %r1718}, [%r43]; + ld.shared.v4.b32 {%r1719, %r1720, %r1721, %r1722}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2164, %r2165, %r2166, %r2167}; + st.shared.v4.b32 [%r42+8192], {%r2168, %r2169, %r2170, %r2171}; + bar.sync 0; + ld.shared.v4.b32 {%r1723, %r1724, %r1725, %r1726}, [%r43]; + ld.shared.v4.b32 {%r1727, %r1728, %r1729, %r1730}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2172, %r2173, %r2174, %r2175}; + st.shared.v4.b32 [%r42+8192], {%r2176, %r2177, %r2178, %r2179}; + bar.sync 0; + ld.shared.v4.b32 {%r1731, %r1732, %r1733, %r1734}, [%r43]; + ld.shared.v4.b32 {%r1735, %r1736, %r1737, %r1738}, [%r43+4096]; + bar.sync 0; + st.shared.v4.b32 [%r41], {%r2180, %r2181, %r2182, %r2183}; + st.shared.v4.b32 [%r42+8192], {%r2184, %r2185, %r2186, %r2187}; + bar.sync 0; + ld.shared.v4.b32 {%r1739, %r1740, %r1741, %r1742}, [%r43]; + ld.shared.v4.b32 {%r1743, %r1744, %r1745, %r1746}, [%r43+4096]; + // begin inline asm + @%p193 st.global.v4.b32 [ %rd234 + 0 ], { %r1715, %r1716, %r1717, %r1718 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd235 + 0 ], { %r1719, %r1720, %r1721, %r1722 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd236 + 0 ], { %r1723, %r1724, %r1725, %r1726 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd237 + 0 ], { %r1727, %r1728, %r1729, %r1730 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd238 + 0 ], { %r1731, %r1732, %r1733, %r1734 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd239 + 0 ], { %r1735, %r1736, %r1737, %r1738 }; + // end inline asm + // begin inline asm + @%p193 st.global.v4.b32 [ %rd240 + 0 ], { %r1739, %r1740, %r1741, %r1742 }; + // end inline asm + // begin inline asm + @%p204 st.global.v4.b32 [ %rd241 + 0 ], { %r1743, %r1744, %r1745, %r1746 }; + // end inline asm + mov.b64 %rd273, 16384; + mov.pred %p207, 0; + .loc 1 52 40 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:40 + @%p2 bra $L__BB0_3; +// %bb.4: + .loc 1 52 4 // c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py:52:4 + ret; +$L__tmp17: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 276 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10d DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 52 +.b8 105 +.b8 119 +.b8 110 +.b8 104 +.b8 115 +.b8 102 +.b8 53 +.b8 107 +.b8 102 +.b8 109 +.b8 109 +.b8 55 +.b8 106 +.b8 110 +.b8 122 +.b8 114 +.b8 107 +.b8 121 +.b8 105 +.b8 118 +.b8 52 +.b8 120 +.b8 51 +.b8 121 +.b8 97 +.b8 104 +.b8 106 +.b8 111 +.b8 103 +.b8 54 +.b8 100 +.b8 121 +.b8 104 +.b8 102 +.b8 52 +.b8 112 +.b8 114 +.b8 109 +.b8 50 +.b8 99 +.b8 106 +.b8 100 +.b8 105 +.b8 53 +.b8 120 +.b8 104 +.b8 108 +.b8 108 +.b8 120 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 52 +.b8 105 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x46 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 101 +.b8 120 +.b8 112 +.b8 95 +.b8 112 +.b8 114 +.b8 101 +.b8 112 +.b8 97 +.b8 114 +.b8 101 +.b8 95 +.b8 115 +.b8 111 +.b8 102 +.b8 116 +.b8 109 +.b8 97 +.b8 120 +.b8 95 +.b8 111 +.b8 110 +.b8 108 +.b8 105 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 98 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xd1:0x46 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xe6:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp14 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 42 // DW_AT_call_line +.b8 40 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfe:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp15 // DW_AT_low_pc +.b64 $L__tmp16 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 49 // DW_AT_call_line +.b8 33 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source new file mode 100644 index 0000000000000000000000000000000000000000..00ef6430c6048078094767a875d3502dd06d5371 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.source @@ -0,0 +1,449 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":186:0) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":109:0) +#loc68 = loc(unknown) +#loc72 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc76 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc81 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":169:0) +#loc85 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":177:0) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":122:0) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc108 = loc("in_ptr0"(#loc)) +#loc109 = loc("out_ptr2"(#loc)) +#loc110 = loc("xnumel"(#loc)) +#loc111 = loc("r0_numel"(#loc)) +#loc146 = loc("lhs_max"(#loc48)) +#loc147 = loc("lhs_sum"(#loc48)) +#loc148 = loc("rhs_max"(#loc48)) +#loc160 = loc("a"(#loc62)) +#loc161 = loc("b"(#loc62)) +#loc165 = loc("x"(#loc72)) +#loc166 = loc("x"(#loc76)) +#loc167 = loc("x"(#loc81)) +#loc168 = loc("lhs_max"(#loc85)) +#loc169 = loc("lhs_sum"(#loc85)) +#loc178 = loc("a"(#loc96)) +#loc179 = loc("input"(#loc100)) +#loc180 = loc("a"(#loc104)) +#loc181 = loc("b"(#loc104)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 16384 : i32 loc(#loc112) + %r0_numel_1 = arith.constant 32000 : i32 loc(#loc113) + %xoffset = tt.get_program_id x : i32 loc(#loc114) + %xoffset_2 = arith.constant 1 : i32 loc(#loc115) + %xoffset_3 = arith.constant 1 : i32 loc(#loc115) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc115) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc116) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc117) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc118) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc118) + %xmask = arith.constant true loc(#loc119) + %xmask_8 = arith.constant dense : tensor<1x16384xi1> loc(#loc119) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32> loc(#loc120) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32> -> tensor<1x16384xi32> loc(#loc121) + %_tmp3_max = arith.constant 0xFF800000 : f32 loc(#loc122) + %_tmp3_max_10 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc122) + %_tmp3_sum = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16384__(1,)cconstexpr_fp32_"() : () -> tensor<1x16384xf32> loc(#loc123) + %c0_i32 = arith.constant 0 : i32 loc(#loc13) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc13) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc13) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc13) + %2 = arith.bitcast %c16384_i32 : i32 to i32 loc(#loc13) + %3 = ub.poison : i32 loc(#loc13) + %_tmp3_sum_11:2 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp3_max_14 = %_tmp3_max_10, %_tmp3_sum_15 = %_tmp3_sum) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc125) + %r0_index_16 = arith.addi %r0_index, %r0_base_9 : tensor<1x16384xi32> loc(#loc125) + %r0_mask = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc126) + %r0_mask_17 = arith.cmpi slt, %r0_index_16, %r0_mask : tensor<1x16384xi32> loc(#loc126) + %tmp0 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_18 = arith.constant 32000 : i32 loc(#loc127) + %tmp0_19 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc127) + %tmp0_20 = arith.muli %tmp0_19, %xindex_7 : tensor<1x1xi32> loc(#loc127) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc128) + %tmp0_22 = arith.addi %r0_index_16, %tmp0_21 : tensor<1x16384xi32> loc(#loc128) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc129) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc129) + %tmp0_25 = arith.constant 0.000000e+00 : f32 loc(#loc130) + %tmp0_26 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc130) + %tmp0_27 = arith.truncf %tmp0_26 : tensor<1x16384xf32> to tensor<1x16384xbf16> loc(#loc130) + %tmp0_28 = tt.load %tmp0_24, %r0_mask_17, %tmp0_27 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr> loc(#loc130) + %tmp0_29 = arith.extf %tmp0_28 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc131) + %9:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_16384S_fp32S1_16384S_fp32S1_16384S__(3,)cconstexpr_False_"(%_tmp3_max_14, %_tmp3_sum_15, %tmp0_29) : (tensor<1x16384xf32>, tensor<1x16384xf32>, tensor<1x16384xf32>) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) loc(#loc21) + %_tmp3_max_30 = arith.select %r0_mask_17, %9#0, %_tmp3_max_14 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc132) + %_tmp3_sum_31 = arith.select %r0_mask_17, %9#1, %_tmp3_sum_15 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc133) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc24) + } loc(#loc182) + %4:2 = tt.call @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_16384S_fp32S1_16384S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%_tmp3_sum_11#0, %_tmp3_sum_11#1) : (tensor<1x16384xf32>, tensor<1x16384xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc25) + %tmp3 = tt.expand_dims %4#0 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc134) + %tmp4 = tt.expand_dims %4#1 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc135) + %c0_i32_12 = arith.constant 0 : i32 loc(#loc28) + %c16384_i32_13 = arith.constant 16384 : i32 loc(#loc28) + %5 = arith.bitcast %c0_i32_12 : i32 to i32 loc(#loc28) + %6 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc28) + %7 = arith.bitcast %c16384_i32_13 : i32 to i32 loc(#loc28) + %8 = ub.poison : i32 loc(#loc28) + scf.for %r0_offset = %5 to %6 step %7 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc136) + %r0_index_14 = arith.addi %r0_index, %r0_base_9 : tensor<1x16384xi32> loc(#loc136) + %r0_mask = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc137) + %r0_mask_15 = arith.cmpi slt, %r0_index_14, %r0_mask : tensor<1x16384xi32> loc(#loc137) + %tmp5 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_16 = arith.constant 32000 : i32 loc(#loc138) + %tmp5_17 = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc138) + %tmp5_18 = arith.muli %tmp5_17, %xindex_7 : tensor<1x1xi32> loc(#loc138) + %tmp5_19 = tt.broadcast %tmp5_18 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc139) + %tmp5_20 = arith.addi %r0_index_14, %tmp5_19 : tensor<1x16384xi32> loc(#loc139) + %tmp5_21 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc140) + %tmp5_22 = tt.addptr %tmp5_21, %tmp5_20 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc140) + %tmp5_23 = arith.constant 0.000000e+00 : f32 loc(#loc141) + %tmp5_24 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc141) + %tmp5_25 = arith.truncf %tmp5_24 : tensor<1x16384xf32> to tensor<1x16384xbf16> loc(#loc141) + %tmp5_26 = tt.load %tmp5_22, %r0_mask_15, %tmp5_25 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr> loc(#loc141) + %tmp5_27 = arith.extf %tmp5_26 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc142) + %tmp7 = tt.broadcast %tmp3 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc143) + %tmp7_28 = arith.subf %tmp5_27, %tmp7 : tensor<1x16384xf32> loc(#loc143) + %tmp8 = tt.extern_elementwise %tmp7_28 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc144) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc145) + %tmp9_29 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32> loc(#loc145) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc39) + %c32000_i32_30 = arith.constant 32000 : i32 loc(#loc39) + %cst = arith.constant dense<32000> : tensor<1x1xi32> loc(#loc39) + %9 = arith.muli %cst, %xindex_7 : tensor<1x1xi32> loc(#loc39) + %10 = tt.broadcast %9 : tensor<1x1xi32> -> tensor<1x16384xi32> loc(#loc40) + %11 = arith.addi %r0_index_14, %10 : tensor<1x16384xi32> loc(#loc40) + %12 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc41) + %13 = tt.addptr %12, %11 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc41) + tt.store %13, %tmp9_29, %r0_mask_15 : tensor<1x16384x!tt.ptr> loc(#loc42) + } loc(#loc28) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16384__(1,)cconstexpr_fp32_"() -> tensor<1x16384xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc45) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc45) + tt.return %cst_0 : tensor<1x16384xf32> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16384xf32> loc(#loc47) + tt.return %0 : tensor<1x16384xf32> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_combine__fp32S1_16384S_fp32S1_16384S_fp32S1_16384S__(3,)cconstexpr_False_"(%lhs_max: tensor<1x16384xf32> loc("lhs_max"(#loc48)), %lhs_sum: tensor<1x16384xf32> loc("lhs_sum"(#loc48)), %rhs_max: tensor<1x16384xf32> loc("rhs_max"(#loc48))) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) attributes {noinline = false} { + %out_max = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32S1_16384S_fp32S1_16384S__(%lhs_max, %rhs_max) : (tensor<1x16384xf32>, tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc149) + %lhs_scale = arith.constant 0xFF800000 : f32 loc(#loc150) + %lhs_scale_0 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc150) + %lhs_scale_1 = arith.cmpf oeq, %out_max, %lhs_scale_0 : tensor<1x16384xf32> loc(#loc150) + %lhs_scale_2 = arith.subf %lhs_max, %out_max : tensor<1x16384xf32> loc(#loc151) + %lhs_scale_3 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%lhs_scale_2) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc152) + %lhs_scale_4 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_5 = arith.constant 1.000000e+00 : f32 loc(#loc153) + %lhs_scale_6 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc153) + %lhs_scale_7 = arith.select %lhs_scale_1, %lhs_scale_6, %lhs_scale_3 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc153) + %rhs_scale = arith.constant 0xFF800000 : f32 loc(#loc154) + %rhs_scale_8 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc154) + %rhs_scale_9 = arith.cmpf oeq, %out_max, %rhs_scale_8 : tensor<1x16384xf32> loc(#loc154) + %rhs_scale_10 = arith.subf %rhs_max, %out_max : tensor<1x16384xf32> loc(#loc155) + %rhs_scale_11 = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%rhs_scale_10) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc156) + %rhs_scale_12 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_13 = arith.constant 1.000000e+00 : f32 loc(#loc157) + %rhs_scale_14 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc157) + %rhs_scale_15 = arith.select %rhs_scale_9, %rhs_scale_14, %rhs_scale_11 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc157) + %out_sum = arith.mulf %lhs_sum, %lhs_scale_7 : tensor<1x16384xf32> loc(#loc158) + %out_sum_16 = arith.addf %out_sum, %rhs_scale_15 : tensor<1x16384xf32> loc(#loc159) + tt.return %out_max, %out_sum_16 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc60) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16384xf32> loc(#loc61) + %1 = ub.poison : tensor<1x16384xf32> loc(#loc61) + tt.return %0, %1 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc61) + } loc(#loc48) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32S1_16384S_fp32S1_16384S__(%a: tensor<1x16384xf32> loc("a"(#loc62)), %b: tensor<1x16384xf32> loc("b"(#loc62))) -> tensor<1x16384xf32> attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : tensor<1x16384xf32> loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_16384S__(%a) : (tensor<1x16384xf32>) -> i1 loc(#loc64) + %1 = scf.if %0 -> (tensor<1x16384xi1>) { + %mask_0 = arith.cmpf une, %a, %a : tensor<1x16384xf32> loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : tensor<1x16384xi1> loc(#loc184) + scf.yield %mask_1 : tensor<1x16384xi1> loc(#loc184) + } else { + scf.yield %mask : tensor<1x16384xi1> loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc69) + tt.return %2 : tensor<1x16384xf32> loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16384xf32> loc(#loc71) + tt.return %3 : tensor<1x16384xf32> loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32S1_16384S__(%x: tensor<1x16384xf32> loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_16384S__(%x) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32S1_16384S__(%x: tensor<1x16384xf32> loc("x"(#loc76))) -> tensor<1x16384xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc78) + %3 = tt.broadcast %2 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc78) + %4 = arith.addf %x, %3 : tensor<1x16384xf32> loc(#loc78) + tt.return %4 : tensor<1x16384xf32> loc(#loc79) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16384xf32> loc(#loc80) + tt.return %5 : tensor<1x16384xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc45) + %cst = arith.constant dense : tensor<1xi1> loc(#loc45) + tt.return %cst : tensor<1xi1> loc(#loc46) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc47) + tt.return %0 : tensor<1xi1> loc(#loc47) + } loc(#loc44) + tt.func private @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%x: tensor<1x16384xf32> loc("x"(#loc81))) -> tensor<1x16384xf32> attributes {noinline = false} { + %0 = tt.extern_elementwise %x {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc82) + tt.return %0 : tensor<1x16384xf32> loc(#loc83) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16384xf32> loc(#loc84) + tt.return %1 : tensor<1x16384xf32> loc(#loc84) + } loc(#loc81) + tt.func private @"torch._inductor.runtime.triton_helpers.online_softmax_reduce__fp32S1_16384S_fp32S1_16384S__(2,)cconstexpr_1__(3,)cconstexpr_False_"(%lhs_max: tensor<1x16384xf32> loc("lhs_max"(#loc85)), %lhs_sum: tensor<1x16384xf32> loc("lhs_sum"(#loc85))) -> (tensor<1xf32>, tensor<1xf32>) attributes {noinline = false} { + %out_max = tt.call @"torch._inductor.runtime.triton_helpers.max2__fp32S1_16384S__(1,)cconstexpr_1_"(%lhs_max) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc170) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc171) + %delta = arith.constant 0xFF800000 : f32 loc(#loc172) + %delta_0 = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc172) + %delta_1 = arith.cmpf oeq, %out_max_keepdim, %delta_0 : tensor<1x1xf32> loc(#loc172) + %delta_2 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc173) + %delta_3 = arith.subf %lhs_max, %delta_2 : tensor<1x16384xf32> loc(#loc173) + %delta_4 = arith.constant 0 : i32 loc(#loc174) + %delta_5 = arith.constant 0.000000e+00 : f32 loc(#loc174) + %delta_6 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc174) + %delta_7 = tt.broadcast %delta_1 : tensor<1x1xi1> -> tensor<1x16384xi1> loc(#loc174) + %delta_8 = arith.select %delta_7, %delta_6, %delta_3 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc174) + %out_sum = tt.call @"torch._inductor.runtime.triton_helpers.exp__fp32S1_16384S__(1,)cconstexpr_False_"(%delta_8) : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc175) + %out_sum_9 = arith.mulf %lhs_sum, %out_sum : tensor<1x16384xf32> loc(#loc176) + %out_sum_10 = tt.call @"triton.language.standard.sum__fp32S1_16384S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%out_sum_9) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc177) + tt.return %out_max, %out_sum_10 : tensor<1xf32>, tensor<1xf32> loc(#loc94) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xf32> loc(#loc95) + %1 = ub.poison : tensor<1xf32> loc(#loc95) + tt.return %0, %1 : tensor<1xf32>, tensor<1xf32> loc(#loc95) + } loc(#loc85) + tt.func private @"torch._inductor.runtime.triton_helpers.max2__fp32S1_16384S__(1,)cconstexpr_1_"(%a: tensor<1x16384xf32> loc("a"(#loc96))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%a) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc97) + tt.reduce.return %2 : f32 loc(#loc97) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc97) + tt.return %0 : tensor<1xf32> loc(#loc98) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc99) + tt.return %1 : tensor<1xf32> loc(#loc99) + } loc(#loc96) + tt.func private @torch._inductor.runtime.triton_helpers.maximum__fp32_fp32__(%a: f32 loc("a"(#loc62)), %b: f32 loc("b"(#loc62))) -> f32 attributes {noinline = false} { + %mask = arith.cmpf ogt, %a, %b : f32 loc(#loc183) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%a) : (f32) -> i1 loc(#loc64) + %1 = scf.if %0 -> (i1) { + %mask_0 = arith.cmpf une, %a, %a : f32 loc(#loc163) + %mask_1 = arith.ori %mask, %mask_0 : i1 loc(#loc184) + scf.yield %mask_1 : i1 loc(#loc184) + } else { + scf.yield %mask : i1 loc(#loc68) + } loc(#loc65) + %2 = arith.select %1, %a, %b : f32 loc(#loc69) + tt.return %2 : f32 loc(#loc70) + ^bb1: // no predecessors + %3 = ub.poison : f32 loc(#loc71) + tt.return %3 : f32 loc(#loc71) + } loc(#loc62) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__fp32__(%x: f32 loc("x"(#loc72))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x) : (f32) -> tensor<1xf32> loc(#loc73) + %true = arith.constant true loc(#loc74) + tt.return %true : i1 loc(#loc74) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc75) + tt.return %1 : i1 loc(#loc75) + } loc(#loc72) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__fp32__(%x: f32 loc("x"(#loc76))) -> tensor<1xf32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc77) + %1 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf32> loc(#loc78) + %2 = tt.splat %x : f32 -> tensor<1xf32> loc(#loc78) + %3 = arith.addf %2, %1 : tensor<1xf32> loc(#loc78) + tt.return %3 : tensor<1xf32> loc(#loc79) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1xf32> loc(#loc80) + tt.return %4 : tensor<1xf32> loc(#loc80) + } loc(#loc76) + tt.func private @"triton.language.standard.sum__fp32S1_16384S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16384xf32> loc("input"(#loc100))) -> tensor<1xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc101) + tt.reduce.return %2 : f32 loc(#loc101) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc101) + tt.return %0 : tensor<1xf32> loc(#loc102) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xf32> loc(#loc103) + tt.return %1 : tensor<1xf32> loc(#loc103) + } loc(#loc100) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc104)), %b: f32 loc("b"(#loc104))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc105) + tt.return %0 : f32 loc(#loc106) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc107) + tt.return %1 : f32 loc(#loc107) + } loc(#loc104) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":29:59) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":30:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":50:16) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:42) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:36) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:11) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":206:4) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:19) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":111:7) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc69 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc70 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:11) +#loc71 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:4) +#loc73 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc74 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc75 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc77 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc78 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc79 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc80 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc82 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc83 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:15) +#loc84 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":170:4) +#loc86 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc87 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc88 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc89 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:11) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":182:4) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:11) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:4) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc112 = loc("xnumel"(#loc1)) +#loc113 = loc("r0_numel"(#loc2)) +#loc114 = loc("xoffset"(#loc3)) +#loc115 = loc("xoffset"(#loc4)) +#loc116 = loc("xindex"(#loc5)) +#loc117 = loc("xindex"(#loc6)) +#loc118 = loc("xindex"(#loc7)) +#loc119 = loc("xmask"(#loc8)) +#loc120 = loc("r0_base"(#loc9)) +#loc121 = loc("r0_base"(#loc10)) +#loc122 = loc("_tmp3_max"(#loc11)) +#loc123 = loc("_tmp3_sum"(#loc12)) +#loc124 = loc("_tmp3_max"(#loc13)) +#loc125 = loc("r0_index"(#loc14)) +#loc126 = loc("r0_mask"(#loc15)) +#loc127 = loc("tmp0"(#loc16)) +#loc128 = loc("tmp0"(#loc17)) +#loc129 = loc("tmp0"(#loc18)) +#loc130 = loc("tmp0"(#loc19)) +#loc131 = loc("tmp0"(#loc20)) +#loc132 = loc("_tmp3_max"(#loc22)) +#loc133 = loc("_tmp3_sum"(#loc23)) +#loc134 = loc("tmp3"(#loc26)) +#loc135 = loc("tmp4"(#loc27)) +#loc136 = loc("r0_index"(#loc29)) +#loc137 = loc("r0_mask"(#loc30)) +#loc138 = loc("tmp5"(#loc31)) +#loc139 = loc("tmp5"(#loc32)) +#loc140 = loc("tmp5"(#loc33)) +#loc141 = loc("tmp5"(#loc34)) +#loc142 = loc("tmp5"(#loc35)) +#loc143 = loc("tmp7"(#loc36)) +#loc144 = loc("tmp8"(#loc37)) +#loc145 = loc("tmp9"(#loc38)) +#loc149 = loc("out_max"(#loc49)) +#loc150 = loc("lhs_scale"(#loc50)) +#loc151 = loc("lhs_scale"(#loc51)) +#loc152 = loc("lhs_scale"(#loc52)) +#loc153 = loc("lhs_scale"(#loc53)) +#loc154 = loc("rhs_scale"(#loc54)) +#loc155 = loc("rhs_scale"(#loc55)) +#loc156 = loc("rhs_scale"(#loc56)) +#loc157 = loc("rhs_scale"(#loc57)) +#loc158 = loc("out_sum"(#loc58)) +#loc159 = loc("out_sum"(#loc59)) +#loc162 = loc("mask"(#loc63)) +#loc163 = loc("mask"(#loc66)) +#loc164 = loc("mask"(#loc67)) +#loc170 = loc("out_max"(#loc86)) +#loc171 = loc("out_max_keepdim"(#loc87)) +#loc172 = loc("delta"(#loc88)) +#loc173 = loc("delta"(#loc89)) +#loc174 = loc("delta"(#loc90)) +#loc175 = loc("out_sum"(#loc91)) +#loc176 = loc("out_sum"(#loc92)) +#loc177 = loc("out_sum"(#loc93)) +#loc182 = loc("_tmp3_sum"(#loc124)) +#loc183 = loc("mask"(#loc162)) +#loc184 = loc("mask"(#loc164)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..1e302f4c1ef5b4f00d741efcc00dac5e806b57f8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttgir @@ -0,0 +1,239 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("out_ptr2"(#loc)) +#loc60 = loc("xnumel"(#loc)) +#loc61 = loc("r0_numel"(#loc)) +#loc87 = loc("out_max"(#loc32)) +#loc94 = loc("out_sum"(#loc41)) +#loc120 = loc(callsite(#loc87 at #loc33)) +#loc127 = loc(callsite(#loc94 at #loc33)) +#loc136 = loc(callsite(#loc1 at #loc120)) +#loc139 = loc(callsite(#loc1 at #loc127)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32000> : tensor<1x16384xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32000> : tensor<1x16384xi32, #blocked1> loc(#loc1) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x16384xbf16, #blocked1> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc1) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc1) + %cst_2 = arith.constant dense<0xFF800000> : tensor<1x1xf32, #blocked1> loc(#loc1) + %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %cst_5 = arith.constant dense<0xFF800000> : tensor<1x16384xf32, #blocked1> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc62) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> loc(#loc63) + %r0_base_6 = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc63) + %r0_base_7 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x16384xi32, #blocked1> loc(#loc63) + %r0_base_8 = tt.expand_dims %r0_base_6 {axis = 0 : i32} : tensor<16384xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16384xi32, #blocked> loc(#loc63) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc64) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc106) + %tmp0_10 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr, #blocked1> loc(#loc66) + %_tmp3_sum:2 = scf.for %_tmp3_sum_17 = %c0_i32 to %c32000_i32 step %c16384_i32 iter_args(%arg5 = %cst_5, %arg6 = %cst_4) -> (tensor<1x16384xf32, #blocked1>, tensor<1x16384xf32, #blocked1>) : i32 { + %r0_index = tt.splat %_tmp3_sum_17 : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc68) + %r0_index_18 = arith.addi %r0_index, %r0_base_7 : tensor<1x16384xi32, #blocked1> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_18, %cst_0 : tensor<1x16384xi32, #blocked1> loc(#loc69) + %tmp0_19 = arith.addi %r0_index_18, %tmp0_9 : tensor<1x16384xi32, #blocked1> loc(#loc65) + %tmp0_20 = tt.addptr %tmp0_10, %tmp0_19 : tensor<1x16384x!tt.ptr, #blocked1>, tensor<1x16384xi32, #blocked1> loc(#loc66) + %tmp0_21 = tt.load %tmp0_20, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr, #blocked1> loc(#loc70) + %tmp0_22 = arith.extf %tmp0_21 : tensor<1x16384xbf16, #blocked1> to tensor<1x16384xf32, #blocked1> loc(#loc71) + %mask = arith.cmpf ogt, %arg5, %tmp0_22 : tensor<1x16384xf32, #blocked1> loc(#loc129) + %mask_23 = arith.cmpf une, %arg5, %arg5 : tensor<1x16384xf32, #blocked1> loc(#loc130) + %mask_24 = arith.ori %mask, %mask_23 : tensor<1x16384xi1, #blocked1> loc(#loc131) + %out_max_25 = arith.select %mask_24, %arg5, %tmp0_22 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc132) + %lhs_scale = arith.cmpf oeq, %out_max_25, %cst_5 : tensor<1x16384xf32, #blocked1> loc(#loc111) + %lhs_scale_26 = arith.subf %arg5, %out_max_25 : tensor<1x16384xf32, #blocked1> loc(#loc112) + %lhs_scale_27 = tt.extern_elementwise %lhs_scale_26 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc133) + %lhs_scale_28 = arith.select %lhs_scale, %cst_3, %lhs_scale_27 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc114) + %rhs_scale = arith.subf %tmp0_22, %out_max_25 : tensor<1x16384xf32, #blocked1> loc(#loc115) + %rhs_scale_29 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc134) + %rhs_scale_30 = arith.select %lhs_scale, %cst_3, %rhs_scale_29 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc117) + %out_sum_31 = arith.mulf %arg6, %lhs_scale_28 : tensor<1x16384xf32, #blocked1> loc(#loc118) + %out_sum_32 = arith.addf %out_sum_31, %rhs_scale_30 : tensor<1x16384xf32, #blocked1> loc(#loc119) + %_tmp3_max = arith.select %r0_mask, %out_max_25, %arg5 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc85) + %_tmp3_sum_33 = arith.select %r0_mask, %out_sum_32, %arg6 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc86) + scf.yield %_tmp3_max, %_tmp3_sum_33 : tensor<1x16384xf32, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc30) + } loc(#loc107) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_17: f32 loc(callsite(#loc1 at #loc120)), %out_max_18: f32 loc(callsite(#loc1 at #loc120))): + %mask = arith.cmpf ogt, %out_max_17, %out_max_18 : f32 loc(#loc140) + %mask_19 = arith.cmpf une, %out_max_17, %out_max_17 : f32 loc(#loc141) + %mask_20 = arith.ori %mask, %mask_19 : i1 loc(#loc142) + %out_max_21 = arith.select %mask_20, %out_max_17, %out_max_18 : f32 loc(#loc143) + tt.reduce.return %out_max_21 : f32 loc(#loc135) + }) : (tensor<1x16384xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc135) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc121) + %delta = arith.cmpf oeq, %out_max_keepdim, %cst_2 : tensor<1x1xf32, #blocked1> loc(#loc122) + %delta_11 = tt.broadcast %out_max_keepdim : tensor<1x1xf32, #blocked1> -> tensor<1x16384xf32, #blocked1> loc(#loc123) + %delta_12 = arith.subf %_tmp3_sum#0, %delta_11 : tensor<1x16384xf32, #blocked1> loc(#loc123) + %delta_13 = tt.broadcast %delta : tensor<1x1xi1, #blocked1> -> tensor<1x16384xi1, #blocked1> loc(#loc124) + %delta_14 = arith.select %delta_13, %cst_4, %delta_12 : tensor<1x16384xi1, #blocked1>, tensor<1x16384xf32, #blocked1> loc(#loc124) + %out_sum = tt.extern_elementwise %delta_14 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc137) + %out_sum_15 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x16384xf32, #blocked1> loc(#loc126) + %out_sum_16 = "tt.reduce"(%out_sum_15) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_17: f32 loc(callsite(#loc1 at #loc127)), %out_sum_18: f32 loc(callsite(#loc1 at #loc127))): + %out_sum_19 = arith.addf %out_sum_17, %out_sum_18 : f32 loc(#loc144) + tt.reduce.return %out_sum_19 : f32 loc(#loc138) + }) : (tensor<1x16384xf32, #blocked1>) -> tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc138) + %tmp4 = tt.expand_dims %out_sum_16 {axis = 1 : i32} : tensor<1xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1xf32, #blocked1> loc(#loc95) + %tmp5 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32, #blocked> loc(#loc128) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32, #blocked1> -> tensor<1x16384xf32, #blocked1> loc(#loc98) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr, #blocked> loc(#loc47) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32, #blocked1> loc(#loc99) + %r0_index_17 = tt.splat %r0_offset : i32 -> tensor<1x16384xi32, #blocked> loc(#loc99) + %r0_index_18 = arith.addi %r0_index, %r0_base_7 : tensor<1x16384xi32, #blocked1> loc(#loc99) + %r0_index_19 = arith.addi %r0_index_17, %r0_base_8 : tensor<1x16384xi32, #blocked> loc(#loc99) + %r0_mask = arith.cmpi slt, %r0_index_18, %cst_0 : tensor<1x16384xi32, #blocked1> loc(#loc100) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %cst : tensor<1x16384xi32, #blocked> loc(#loc100) + %tmp5_21 = arith.addi %r0_index_18, %tmp0_9 : tensor<1x16384xi32, #blocked1> loc(#loc96) + %tmp5_22 = arith.addi %r0_index_19, %tmp5 : tensor<1x16384xi32, #blocked> loc(#loc96) + %tmp5_23 = tt.addptr %tmp0_10, %tmp5_21 : tensor<1x16384x!tt.ptr, #blocked1>, tensor<1x16384xi32, #blocked1> loc(#loc101) + %tmp5_24 = tt.load %tmp5_23, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr, #blocked1> loc(#loc102) + %tmp5_25 = arith.extf %tmp5_24 : tensor<1x16384xbf16, #blocked1> to tensor<1x16384xf32, #blocked1> loc(#loc103) + %tmp7 = arith.subf %tmp5_25, %delta_11 : tensor<1x16384xf32, #blocked1> loc(#loc104) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32, #blocked1>) -> tensor<1x16384xf32, #blocked1> loc(#loc105) + %tmp9_26 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32, #blocked1> loc(#loc98) + %1 = tt.addptr %0, %tmp5_22 : tensor<1x16384x!tt.ptr, #blocked>, tensor<1x16384xi32, #blocked> loc(#loc47) + %2 = ttg.convert_layout %tmp9_26 : tensor<1x16384xf32, #blocked1> -> tensor<1x16384xf32, #blocked> loc(#loc56) + tt.store %1, %2, %r0_mask_20 : tensor<1x16384x!tt.ptr, #blocked> loc(#loc56) + } loc(#loc48) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc13 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc62 = loc("xoffset"(#loc2)) +#loc63 = loc("r0_base"(#loc3)) +#loc64 = loc("tmp0"(#loc4)) +#loc65 = loc("tmp0"(#loc5)) +#loc66 = loc("tmp0"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("mask"(#loc12)) +#loc73 = loc("out_max"(#loc13)) +#loc74 = loc("mask"(#loc15)) +#loc75 = loc("mask"(#loc16)) +#loc76 = loc("lhs_scale"(#loc18)) +#loc77 = loc("lhs_scale"(#loc19)) +#loc78 = loc("lhs_scale"(#loc21)) +#loc79 = loc("lhs_scale"(#loc22)) +#loc80 = loc("rhs_scale"(#loc23)) +#loc81 = loc("rhs_scale"(#loc24)) +#loc82 = loc("rhs_scale"(#loc25)) +#loc83 = loc("out_sum"(#loc26)) +#loc84 = loc("out_sum"(#loc27)) +#loc85 = loc("_tmp3_max"(#loc28)) +#loc86 = loc("_tmp3_sum"(#loc29)) +#loc88 = loc("out_max_keepdim"(#loc34)) +#loc89 = loc("delta"(#loc35)) +#loc90 = loc("delta"(#loc36)) +#loc91 = loc("delta"(#loc37)) +#loc92 = loc("out_sum"(#loc38)) +#loc93 = loc("out_sum"(#loc39)) +#loc95 = loc("tmp4"(#loc43)) +#loc96 = loc("tmp5"(#loc44)) +#loc97 = loc("tmp5"(#loc45)) +#loc98 = loc("tmp9"(#loc46)) +#loc99 = loc("r0_index"(#loc49)) +#loc100 = loc("r0_mask"(#loc50)) +#loc101 = loc("tmp5"(#loc51)) +#loc102 = loc("tmp5"(#loc52)) +#loc103 = loc("tmp5"(#loc53)) +#loc104 = loc("tmp7"(#loc54)) +#loc105 = loc("tmp8"(#loc55)) +#loc106 = loc(fused[#loc65, #loc64]) +#loc107 = loc("_tmp3_sum"(#loc67)) +#loc108 = loc("mask"(#loc72)) +#loc109 = loc(callsite(#loc73 at #loc14)) +#loc110 = loc("mask"(#loc75)) +#loc111 = loc(callsite(#loc76 at #loc14)) +#loc112 = loc(callsite(#loc77 at #loc14)) +#loc113 = loc(callsite(#loc78 at #loc14)) +#loc114 = loc(callsite(#loc79 at #loc14)) +#loc115 = loc(callsite(#loc80 at #loc14)) +#loc116 = loc(callsite(#loc81 at #loc14)) +#loc117 = loc(callsite(#loc82 at #loc14)) +#loc118 = loc(callsite(#loc83 at #loc14)) +#loc119 = loc(callsite(#loc84 at #loc14)) +#loc121 = loc(callsite(#loc88 at #loc33)) +#loc122 = loc(callsite(#loc89 at #loc33)) +#loc123 = loc(callsite(#loc90 at #loc33)) +#loc124 = loc(callsite(#loc91 at #loc33)) +#loc125 = loc(callsite(#loc92 at #loc33)) +#loc126 = loc(callsite(#loc93 at #loc33)) +#loc128 = loc(fused[#loc96, #loc97]) +#loc129 = loc(callsite(#loc108 at #loc109)) +#loc130 = loc(callsite(#loc74 at #loc109)) +#loc131 = loc(callsite(#loc110 at #loc109)) +#loc132 = loc(callsite(#loc17 at #loc109)) +#loc133 = loc(callsite(#loc20 at #loc113)) +#loc134 = loc(callsite(#loc20 at #loc116)) +#loc135 = loc(callsite(#loc31 at #loc120)) +#loc137 = loc(callsite(#loc20 at #loc125)) +#loc138 = loc(callsite(#loc40 at #loc127)) +#loc140 = loc(callsite(#loc108 at #loc135)) +#loc141 = loc(callsite(#loc74 at #loc135)) +#loc142 = loc(callsite(#loc110 at #loc135)) +#loc143 = loc(callsite(#loc17 at #loc135)) +#loc144 = loc(callsite(#loc42 at #loc138)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..822bac303a0108ed8142054803e3c922742e1a81 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZPXLAXWONXUJTXXR7TU2RUH3GIYK643EDBZWQCBA45JTTUC3DSEA/triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0.ttir @@ -0,0 +1,233 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":18:0) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":49:33) +#loc3 = loc(unknown) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":178:28) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:58) +#loc59 = loc("in_ptr0"(#loc)) +#loc60 = loc("out_ptr2"(#loc)) +#loc61 = loc("xnumel"(#loc)) +#loc62 = loc("r0_numel"(#loc)) +#loc90 = loc("out_max"(#loc35)) +#loc96 = loc("out_sum"(#loc42)) +#loc123 = loc(callsite(#loc90 at #loc2)) +#loc129 = loc(callsite(#loc96 at #loc2)) +#loc138 = loc(callsite(#loc3 at #loc123)) +#loc141 = loc(callsite(#loc3 at #loc129)) +module { + tt.func public @triton_red_fused__softmax__to_copy_exp_prepare_softmax_online_sub_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr2"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %delta = arith.constant dense<0xFF800000> : tensor<1x1xf32> loc(#loc108) + %cst = arith.constant dense<1.000000e+00> : tensor<1x16384xf32> loc(#loc3) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x16384xf32> loc(#loc3) + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x16384xbf16> loc(#loc3) + %c16384_i32 = arith.constant 16384 : i32 loc(#loc3) + %c32000_i32 = arith.constant 32000 : i32 loc(#loc3) + %c0_i32 = arith.constant 0 : i32 loc(#loc3) + %cst_2 = arith.constant dense<32000> : tensor<1x16384xi32> loc(#loc3) + %cst_3 = arith.constant dense<0xFF800000> : tensor<1x16384xf32> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc64) + %r0_base = tt.make_range {end = 16384 : i32, start = 0 : i32} : tensor<16384xi32> loc(#loc65) + %r0_base_4 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16384xi32> -> tensor<1x16384xi32> loc(#loc66) + %_tmp3_sum:2 = scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 iter_args(%_tmp3_max = %cst_3, %_tmp3_sum_12 = %cst_0) -> (tensor<1x16384xf32>, tensor<1x16384xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc68) + %r0_index_13 = arith.addi %r0_index, %r0_base_4 : tensor<1x16384xi32> loc(#loc68) + %r0_mask = arith.cmpi slt, %r0_index_13, %cst_2 : tensor<1x16384xi32> loc(#loc69) + %tmp0 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc70) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x16384xi32> loc(#loc110) + %tmp0_15 = arith.addi %r0_index_13, %tmp0_14 : tensor<1x16384xi32> loc(#loc71) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc72) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc72) + %tmp0_18 = tt.load %tmp0_17, %r0_mask, %cst_1 evictionPolicy = evict_last : tensor<1x16384x!tt.ptr> loc(#loc73) + %tmp0_19 = arith.extf %tmp0_18 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc74) + %mask = arith.cmpf ogt, %_tmp3_max, %tmp0_19 : tensor<1x16384xf32> loc(#loc131) + %mask_20 = arith.cmpf une, %_tmp3_max, %_tmp3_max : tensor<1x16384xf32> loc(#loc132) + %mask_21 = arith.ori %mask, %mask_20 : tensor<1x16384xi1> loc(#loc133) + %out_max_22 = arith.select %mask_21, %_tmp3_max, %tmp0_19 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc134) + %lhs_scale = arith.cmpf oeq, %out_max_22, %cst_3 : tensor<1x16384xf32> loc(#loc114) + %lhs_scale_23 = arith.subf %_tmp3_max, %out_max_22 : tensor<1x16384xf32> loc(#loc115) + %lhs_scale_24 = tt.extern_elementwise %lhs_scale_23 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc135) + %lhs_scale_25 = arith.select %lhs_scale, %cst, %lhs_scale_24 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc117) + %rhs_scale = arith.subf %tmp0_19, %out_max_22 : tensor<1x16384xf32> loc(#loc118) + %rhs_scale_26 = tt.extern_elementwise %rhs_scale {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc136) + %rhs_scale_27 = arith.select %lhs_scale, %cst, %rhs_scale_26 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc120) + %out_sum_28 = arith.mulf %_tmp3_sum_12, %lhs_scale_25 : tensor<1x16384xf32> loc(#loc121) + %out_sum_29 = arith.addf %out_sum_28, %rhs_scale_27 : tensor<1x16384xf32> loc(#loc122) + %_tmp3_max_30 = arith.select %r0_mask, %out_max_22, %_tmp3_max : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc88) + %_tmp3_sum_31 = arith.select %r0_mask, %out_sum_29, %_tmp3_sum_12 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc89) + scf.yield %_tmp3_max_30, %_tmp3_sum_31 : tensor<1x16384xf32>, tensor<1x16384xf32> loc(#loc33) + } loc(#loc109) + %out_max = "tt.reduce"(%_tmp3_sum#0) <{axis = 1 : i32}> ({ + ^bb0(%out_max_12: f32 loc(callsite(#loc3 at #loc123)), %out_max_13: f32 loc(callsite(#loc3 at #loc123))): + %mask = arith.cmpf ogt, %out_max_12, %out_max_13 : f32 loc(#loc142) + %mask_14 = arith.cmpf une, %out_max_12, %out_max_12 : f32 loc(#loc143) + %mask_15 = arith.ori %mask, %mask_14 : i1 loc(#loc144) + %out_max_16 = arith.select %mask_15, %out_max_12, %out_max_13 : f32 loc(#loc145) + tt.reduce.return %out_max_16 : f32 loc(#loc137) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc137) + %out_max_keepdim = tt.expand_dims %out_max {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc124) + %delta_5 = arith.cmpf oeq, %out_max_keepdim, %delta : tensor<1x1xf32> loc(#loc108) + %delta_6 = tt.broadcast %out_max_keepdim : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc125) + %delta_7 = arith.subf %_tmp3_sum#0, %delta_6 : tensor<1x16384xf32> loc(#loc125) + %delta_8 = tt.broadcast %delta_5 : tensor<1x1xi1> -> tensor<1x16384xi1> loc(#loc126) + %delta_9 = arith.select %delta_8, %cst_0, %delta_7 : tensor<1x16384xi1>, tensor<1x16384xf32> loc(#loc126) + %out_sum = tt.extern_elementwise %delta_9 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc139) + %out_sum_10 = arith.mulf %_tmp3_sum#1, %out_sum : tensor<1x16384xf32> loc(#loc128) + %out_sum_11 = "tt.reduce"(%out_sum_10) <{axis = 1 : i32}> ({ + ^bb0(%out_sum_12: f32 loc(callsite(#loc3 at #loc129)), %out_sum_13: f32 loc(callsite(#loc3 at #loc129))): + %out_sum_14 = arith.addf %out_sum_12, %out_sum_13 : f32 loc(#loc146) + tt.reduce.return %out_sum_14 : f32 loc(#loc140) + }) : (tensor<1x16384xf32>) -> tensor<1xf32> loc(#loc140) + %tmp4 = tt.expand_dims %out_sum_11 {axis = 1 : i32} : tensor<1xf32> -> tensor<1x1xf32> loc(#loc97) + scf.for %r0_offset = %c0_i32 to %c32000_i32 step %c16384_i32 : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x16384xi32> loc(#loc98) + %r0_index_12 = arith.addi %r0_index, %r0_base_4 : tensor<1x16384xi32> loc(#loc98) + %r0_mask = arith.cmpi slt, %r0_index_12, %cst_2 : tensor<1x16384xi32> loc(#loc99) + %tmp5 = arith.muli %xoffset, %c32000_i32 : i32 loc(#loc100) + %tmp5_13 = tt.splat %tmp5 : i32 -> tensor<1x16384xi32> loc(#loc130) + %tmp5_14 = arith.addi %r0_index_12, %tmp5_13 : tensor<1x16384xi32> loc(#loc101) + %tmp5_15 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc102) + %tmp5_16 = tt.addptr %tmp5_15, %tmp5_14 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc102) + %tmp5_17 = tt.load %tmp5_16, %r0_mask, %cst_1 evictionPolicy = evict_first : tensor<1x16384x!tt.ptr> loc(#loc103) + %tmp5_18 = arith.extf %tmp5_17 : tensor<1x16384xbf16> to tensor<1x16384xf32> loc(#loc104) + %tmp7 = arith.subf %tmp5_18, %delta_6 : tensor<1x16384xf32> loc(#loc105) + %tmp8 = tt.extern_elementwise %tmp7 {libname = "", libpath = "", pure = true, symbol = "__nv_expf"} : (tensor<1x16384xf32>) -> tensor<1x16384xf32> loc(#loc106) + %tmp9 = tt.broadcast %tmp4 : tensor<1x1xf32> -> tensor<1x16384xf32> loc(#loc107) + %tmp9_19 = arith.divf %tmp8, %tmp9 : tensor<1x16384xf32> loc(#loc107) + %0 = tt.splat %out_ptr2 : !tt.ptr -> tensor<1x16384x!tt.ptr> loc(#loc56) + %1 = tt.addptr %0, %tmp5_14 : tensor<1x16384x!tt.ptr>, tensor<1x16384xi32> loc(#loc56) + tt.store %1, %tmp9_19, %r0_mask : tensor<1x16384x!tt.ptr> loc(#loc57) + } loc(#loc45) + tt.return loc(#loc58) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":31:40) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":32:31) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":33:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:47) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:41) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:52) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":37:105) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":110:15) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":193:31) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":42:40) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:21) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":112:16) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":113:29) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:19) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:53) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":173:29) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:62) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":196:39) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:53) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:62) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":199:39) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:24) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":205:36) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":45:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:54) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":46:8) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":123:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":179:46) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:68) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":180:58) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:42) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":181:31) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":51:16) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:40) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":53:31) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":54:29) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:47) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:41) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:34) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:52) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":58:106) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":60:22) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":61:29) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":62:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:29) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":63:53) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/4i/c4iwnhsf5kfmm7jnzrkyiv4x3yahjog6dyhf4prm2cjdi5xhllx2.py":52:4) +#loc63 = loc("delta"(#loc1)) +#loc64 = loc("xoffset"(#loc4)) +#loc65 = loc("r0_base"(#loc5)) +#loc66 = loc("r0_base"(#loc6)) +#loc67 = loc("_tmp3_max"(#loc7)) +#loc68 = loc("r0_index"(#loc8)) +#loc69 = loc("r0_mask"(#loc9)) +#loc70 = loc("tmp0"(#loc10)) +#loc71 = loc("tmp0"(#loc11)) +#loc72 = loc("tmp0"(#loc12)) +#loc73 = loc("tmp0"(#loc13)) +#loc74 = loc("tmp0"(#loc14)) +#loc75 = loc("mask"(#loc15)) +#loc76 = loc("out_max"(#loc16)) +#loc77 = loc("mask"(#loc18)) +#loc78 = loc("mask"(#loc19)) +#loc79 = loc("lhs_scale"(#loc21)) +#loc80 = loc("lhs_scale"(#loc22)) +#loc81 = loc("lhs_scale"(#loc24)) +#loc82 = loc("lhs_scale"(#loc25)) +#loc83 = loc("rhs_scale"(#loc26)) +#loc84 = loc("rhs_scale"(#loc27)) +#loc85 = loc("rhs_scale"(#loc28)) +#loc86 = loc("out_sum"(#loc29)) +#loc87 = loc("out_sum"(#loc30)) +#loc88 = loc("_tmp3_max"(#loc31)) +#loc89 = loc("_tmp3_sum"(#loc32)) +#loc91 = loc("out_max_keepdim"(#loc36)) +#loc92 = loc("delta"(#loc37)) +#loc93 = loc("delta"(#loc38)) +#loc94 = loc("out_sum"(#loc39)) +#loc95 = loc("out_sum"(#loc40)) +#loc97 = loc("tmp4"(#loc44)) +#loc98 = loc("r0_index"(#loc46)) +#loc99 = loc("r0_mask"(#loc47)) +#loc100 = loc("tmp5"(#loc48)) +#loc101 = loc("tmp5"(#loc49)) +#loc102 = loc("tmp5"(#loc50)) +#loc103 = loc("tmp5"(#loc51)) +#loc104 = loc("tmp5"(#loc52)) +#loc105 = loc("tmp7"(#loc53)) +#loc106 = loc("tmp8"(#loc54)) +#loc107 = loc("tmp9"(#loc55)) +#loc108 = loc(callsite(#loc63 at #loc2)) +#loc109 = loc("_tmp3_sum"(#loc67)) +#loc110 = loc(fused[#loc71, #loc70]) +#loc111 = loc("mask"(#loc75)) +#loc112 = loc(callsite(#loc76 at #loc17)) +#loc113 = loc("mask"(#loc78)) +#loc114 = loc(callsite(#loc79 at #loc17)) +#loc115 = loc(callsite(#loc80 at #loc17)) +#loc116 = loc(callsite(#loc81 at #loc17)) +#loc117 = loc(callsite(#loc82 at #loc17)) +#loc118 = loc(callsite(#loc83 at #loc17)) +#loc119 = loc(callsite(#loc84 at #loc17)) +#loc120 = loc(callsite(#loc85 at #loc17)) +#loc121 = loc(callsite(#loc86 at #loc17)) +#loc122 = loc(callsite(#loc87 at #loc17)) +#loc124 = loc(callsite(#loc91 at #loc2)) +#loc125 = loc(callsite(#loc92 at #loc2)) +#loc126 = loc(callsite(#loc93 at #loc2)) +#loc127 = loc(callsite(#loc94 at #loc2)) +#loc128 = loc(callsite(#loc95 at #loc2)) +#loc130 = loc(fused[#loc101, #loc100]) +#loc131 = loc(callsite(#loc111 at #loc112)) +#loc132 = loc(callsite(#loc77 at #loc112)) +#loc133 = loc(callsite(#loc113 at #loc112)) +#loc134 = loc(callsite(#loc20 at #loc112)) +#loc135 = loc(callsite(#loc23 at #loc116)) +#loc136 = loc(callsite(#loc23 at #loc119)) +#loc137 = loc(callsite(#loc34 at #loc123)) +#loc139 = loc(callsite(#loc23 at #loc127)) +#loc140 = loc(callsite(#loc41 at #loc129)) +#loc142 = loc(callsite(#loc111 at #loc137)) +#loc143 = loc(callsite(#loc77 at #loc137)) +#loc144 = loc(callsite(#loc113 at #loc137)) +#loc145 = loc(callsite(#loc20 at #loc137)) +#loc146 = loc(callsite(#loc43 at #loc140)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/__grp__triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/__grp__triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..0238fef6dc28b332e38b9e0be40941070dc0474e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/__grp__triton_tem_fused_0.json @@ -0,0 +1 @@ +{"child_paths": {"triton_tem_fused_0.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.source", "triton_tem_fused_0.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttir", "triton_tem_fused_0.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttgir", "triton_tem_fused_0.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.llir", "triton_tem_fused_0.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ptx", "triton_tem_fused_0.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.cubin", "triton_tem_fused_0.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.json b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6735826db7a5207ad6829d62ba5667ad6dc5bc84 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.json @@ -0,0 +1 @@ +{"hash": "cca552c5e62118d98c056f99f6a409588dbbad9ea752de25d8817306e00ee94a", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 3, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 131072, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_tem_fused_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..3b0da35c39d997c8336ea8dc450f6faef37c781a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.llir @@ -0,0 +1,5464 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +; Function Attrs: nounwind +define ptx_kernel void @triton_tem_fused_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) readnone captures(none) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, i32 %11, i32 %12, ptr addrspace(1) readnone captures(none) %13, ptr addrspace(1) readnone captures(none) %14) local_unnamed_addr #0 !dbg !5 { + %16 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !8 + %17 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !dbg !9 + %18 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !dbg !10 + %19 = and i32 %17, 7, !dbg !11 + %20 = shl i32 %17, 23, !dbg !12 + %21 = shl nuw nsw i32 %18, 7, !dbg !13 + %22 = or disjoint i32 %20, %21, !dbg !14 + %23 = shl nuw nsw i32 %19, 10, !dbg !15 + %24 = shl nuw nsw i32 %18, 5, !dbg !16 + %25 = and i32 %24, 2097024, !dbg !16 + %reass.add = add nuw nsw i32 %23, %25, !dbg !17 + %reass.mul = mul i32 %reass.add, %11, !dbg !17 + %26 = sext i32 %22 to i64, !dbg !18 + %27 = getelementptr bfloat, ptr addrspace(1) %0, i64 %26, !dbg !18 + %28 = sext i32 %reass.mul to i64, !dbg !19 + %29 = getelementptr bfloat, ptr addrspace(1) %1, i64 %28, !dbg !19 + %30 = getelementptr bfloat, ptr addrspace(1) %2, i64 %28, !dbg !20 + %31 = shl nuw nsw i32 %19, 4, !dbg !21 + %32 = add nuw i32 %31, %16, !dbg !22 + %reass.mul389 = mul i32 %32, %12, !dbg !23 + %33 = shl i32 %16, 7, !dbg !24 + %34 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !25 + %35 = lshr i32 %34, 5, !dbg !25 + %36 = and i32 %34, 240, !dbg !25 + %37 = lshr exact i32 %36, 4, !dbg !25 + %38 = or disjoint i32 %37, 16, !dbg !25 + %39 = or disjoint i32 %37, 32, !dbg !25 + %40 = or disjoint i32 %37, 48, !dbg !25 + %41 = lshr i32 %34, 1, !dbg !25 + %42 = and i32 %41, 112, !dbg !25 + %43 = lshr i32 %34, 2, !dbg !25 + %44 = and i32 %43, 7, !dbg !25 + %45 = or disjoint i32 %42, %44, !dbg !25 + %46 = or disjoint i32 %37, %33, !dbg !26 + %47 = or disjoint i32 %38, %33, !dbg !26 + %48 = or disjoint i32 %39, %33, !dbg !26 + %49 = or disjoint i32 %40, %33, !dbg !26 + %50 = or disjoint i32 %46, 64, !dbg !26 + %51 = or disjoint i32 %46, 80, !dbg !26 + %52 = or disjoint i32 %46, 96, !dbg !26 + %53 = or disjoint i32 %46, 112, !dbg !26 + %54 = shl i32 %46, 12, !dbg !27 + %55 = shl i32 %47, 12, !dbg !27 + %56 = shl i32 %48, 12, !dbg !27 + %57 = shl i32 %49, 12, !dbg !27 + %58 = shl i32 %50, 12, !dbg !27 + %59 = shl i32 %51, 12, !dbg !27 + %60 = shl i32 %52, 12, !dbg !27 + %61 = shl i32 %53, 12, !dbg !27 + %62 = sext i32 %54 to i64, !dbg !30 + %63 = getelementptr bfloat, ptr addrspace(1) %27, i64 %62, !dbg !30 + %64 = sext i32 %55 to i64, !dbg !30 + %65 = getelementptr bfloat, ptr addrspace(1) %27, i64 %64, !dbg !30 + %66 = sext i32 %56 to i64, !dbg !30 + %67 = getelementptr bfloat, ptr addrspace(1) %27, i64 %66, !dbg !30 + %68 = sext i32 %57 to i64, !dbg !30 + %69 = getelementptr bfloat, ptr addrspace(1) %27, i64 %68, !dbg !30 + %70 = sext i32 %58 to i64, !dbg !30 + %71 = getelementptr bfloat, ptr addrspace(1) %27, i64 %70, !dbg !30 + %72 = sext i32 %59 to i64, !dbg !30 + %73 = getelementptr bfloat, ptr addrspace(1) %27, i64 %72, !dbg !30 + %74 = sext i32 %60 to i64, !dbg !30 + %75 = getelementptr bfloat, ptr addrspace(1) %27, i64 %74, !dbg !30 + %76 = sext i32 %61 to i64, !dbg !30 + %77 = getelementptr bfloat, ptr addrspace(1) %27, i64 %76, !dbg !30 + %78 = shl nuw nsw i32 %34, 3, !dbg !31 + %79 = and i32 %78, 120, !dbg !31 + %80 = zext nneg i32 %79 to i64, !dbg !32 + %81 = getelementptr bfloat, ptr addrspace(1) %63, i64 %80, !dbg !32 + %82 = getelementptr bfloat, ptr addrspace(1) %65, i64 %80, !dbg !32 + %83 = getelementptr bfloat, ptr addrspace(1) %67, i64 %80, !dbg !32 + %84 = getelementptr bfloat, ptr addrspace(1) %69, i64 %80, !dbg !32 + %85 = getelementptr bfloat, ptr addrspace(1) %71, i64 %80, !dbg !32 + %86 = getelementptr bfloat, ptr addrspace(1) %73, i64 %80, !dbg !32 + %87 = getelementptr bfloat, ptr addrspace(1) %75, i64 %80, !dbg !32 + %88 = getelementptr bfloat, ptr addrspace(1) %77, i64 %80, !dbg !32 + %89 = icmp slt i32 %46, 2048, !dbg !33 + %90 = icmp slt i32 %47, 2048, !dbg !33 + %91 = icmp slt i32 %48, 2048, !dbg !33 + %92 = icmp slt i32 %49, 2048, !dbg !33 + %93 = icmp slt i32 %50, 2048, !dbg !33 + %94 = icmp slt i32 %51, 2048, !dbg !33 + %95 = icmp slt i32 %52, 2048, !dbg !33 + %96 = icmp slt i32 %53, 2048, !dbg !33 + %97 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %81, i1 %89) #2, !dbg !34 + %98 = extractvalue { i32, i32, i32, i32 } %97, 0, !dbg !34 + %99 = extractvalue { i32, i32, i32, i32 } %97, 1, !dbg !34 + %100 = extractvalue { i32, i32, i32, i32 } %97, 2, !dbg !34 + %101 = extractvalue { i32, i32, i32, i32 } %97, 3, !dbg !34 + %102 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %82, i1 %90) #2, !dbg !34 + %103 = extractvalue { i32, i32, i32, i32 } %102, 0, !dbg !34 + %104 = extractvalue { i32, i32, i32, i32 } %102, 1, !dbg !34 + %105 = extractvalue { i32, i32, i32, i32 } %102, 2, !dbg !34 + %106 = extractvalue { i32, i32, i32, i32 } %102, 3, !dbg !34 + %107 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %83, i1 %91) #2, !dbg !34 + %108 = extractvalue { i32, i32, i32, i32 } %107, 0, !dbg !34 + %109 = extractvalue { i32, i32, i32, i32 } %107, 1, !dbg !34 + %110 = extractvalue { i32, i32, i32, i32 } %107, 2, !dbg !34 + %111 = extractvalue { i32, i32, i32, i32 } %107, 3, !dbg !34 + %112 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %84, i1 %92) #2, !dbg !34 + %113 = extractvalue { i32, i32, i32, i32 } %112, 0, !dbg !34 + %114 = extractvalue { i32, i32, i32, i32 } %112, 1, !dbg !34 + %115 = extractvalue { i32, i32, i32, i32 } %112, 2, !dbg !34 + %116 = extractvalue { i32, i32, i32, i32 } %112, 3, !dbg !34 + %117 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %85, i1 %93) #2, !dbg !34 + %118 = extractvalue { i32, i32, i32, i32 } %117, 0, !dbg !34 + %119 = extractvalue { i32, i32, i32, i32 } %117, 1, !dbg !34 + %120 = extractvalue { i32, i32, i32, i32 } %117, 2, !dbg !34 + %121 = extractvalue { i32, i32, i32, i32 } %117, 3, !dbg !34 + %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %86, i1 %94) #2, !dbg !34 + %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !34 + %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !34 + %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !34 + %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !34 + %127 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %87, i1 %95) #2, !dbg !34 + %128 = extractvalue { i32, i32, i32, i32 } %127, 0, !dbg !34 + %129 = extractvalue { i32, i32, i32, i32 } %127, 1, !dbg !34 + %130 = extractvalue { i32, i32, i32, i32 } %127, 2, !dbg !34 + %131 = extractvalue { i32, i32, i32, i32 } %127, 3, !dbg !34 + %132 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, $4;\0A\09mov.u32 $1, $5;\0A\09mov.u32 $2, $6;\0A\09mov.u32 $3, $7;\0A\09@$9 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $8 + 0 ];", "=r,=r,=r,=r,r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %88, i1 %96) #2, !dbg !34 + %133 = extractvalue { i32, i32, i32, i32 } %132, 0, !dbg !34 + %134 = extractvalue { i32, i32, i32, i32 } %132, 1, !dbg !34 + %135 = extractvalue { i32, i32, i32, i32 } %132, 2, !dbg !34 + %136 = extractvalue { i32, i32, i32, i32 } %132, 3, !dbg !34 + %137 = and i32 %34, 7, !dbg !34 + %138 = shl nuw nsw i32 %137, 4, !dbg !34 + %139 = shl nuw nsw i32 %36, 3, !dbg !34 + %140 = and i32 %34, 112, !dbg !34 + %141 = and i32 %34, 8, !dbg !34 + %142 = shl nuw nsw i32 %141, 11, !dbg !34 + %143 = or disjoint i32 %138, %139, !dbg !34 + %144 = xor i32 %143, %140, !dbg !34 + %145 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 %142, !dbg !34 + %146 = getelementptr inbounds nuw i8, ptr addrspace(3) %145, i32 %144, !dbg !34 + %147 = insertelement <4 x i32> poison, i32 %98, i64 0, !dbg !34 + %148 = insertelement <4 x i32> %147, i32 %99, i64 1, !dbg !34 + %149 = insertelement <4 x i32> %148, i32 %100, i64 2, !dbg !34 + %150 = insertelement <4 x i32> %149, i32 %101, i64 3, !dbg !34 + store <4 x i32> %150, ptr addrspace(3) %146, align 16, !dbg !34 + %151 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 2048, !dbg !34 + %152 = insertelement <4 x i32> poison, i32 %103, i64 0, !dbg !34 + %153 = insertelement <4 x i32> %152, i32 %104, i64 1, !dbg !34 + %154 = insertelement <4 x i32> %153, i32 %105, i64 2, !dbg !34 + %155 = insertelement <4 x i32> %154, i32 %106, i64 3, !dbg !34 + store <4 x i32> %155, ptr addrspace(3) %151, align 16, !dbg !34 + %156 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 4096, !dbg !34 + %157 = insertelement <4 x i32> poison, i32 %108, i64 0, !dbg !34 + %158 = insertelement <4 x i32> %157, i32 %109, i64 1, !dbg !34 + %159 = insertelement <4 x i32> %158, i32 %110, i64 2, !dbg !34 + %160 = insertelement <4 x i32> %159, i32 %111, i64 3, !dbg !34 + store <4 x i32> %160, ptr addrspace(3) %156, align 16, !dbg !34 + %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 6144, !dbg !34 + %162 = insertelement <4 x i32> poison, i32 %113, i64 0, !dbg !34 + %163 = insertelement <4 x i32> %162, i32 %114, i64 1, !dbg !34 + %164 = insertelement <4 x i32> %163, i32 %115, i64 2, !dbg !34 + %165 = insertelement <4 x i32> %164, i32 %116, i64 3, !dbg !34 + store <4 x i32> %165, ptr addrspace(3) %161, align 16, !dbg !34 + %166 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 8192, !dbg !34 + %167 = insertelement <4 x i32> poison, i32 %118, i64 0, !dbg !34 + %168 = insertelement <4 x i32> %167, i32 %119, i64 1, !dbg !34 + %169 = insertelement <4 x i32> %168, i32 %120, i64 2, !dbg !34 + %170 = insertelement <4 x i32> %169, i32 %121, i64 3, !dbg !34 + store <4 x i32> %170, ptr addrspace(3) %166, align 16, !dbg !34 + %171 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 10240, !dbg !34 + %172 = insertelement <4 x i32> poison, i32 %123, i64 0, !dbg !34 + %173 = insertelement <4 x i32> %172, i32 %124, i64 1, !dbg !34 + %174 = insertelement <4 x i32> %173, i32 %125, i64 2, !dbg !34 + %175 = insertelement <4 x i32> %174, i32 %126, i64 3, !dbg !34 + store <4 x i32> %175, ptr addrspace(3) %171, align 16, !dbg !34 + %176 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 12288, !dbg !34 + %177 = insertelement <4 x i32> poison, i32 %128, i64 0, !dbg !34 + %178 = insertelement <4 x i32> %177, i32 %129, i64 1, !dbg !34 + %179 = insertelement <4 x i32> %178, i32 %130, i64 2, !dbg !34 + %180 = insertelement <4 x i32> %179, i32 %131, i64 3, !dbg !34 + store <4 x i32> %180, ptr addrspace(3) %176, align 16, !dbg !34 + %181 = getelementptr inbounds nuw i8, ptr addrspace(3) %146, i32 14336, !dbg !34 + %182 = insertelement <4 x i32> poison, i32 %133, i64 0, !dbg !34 + %183 = insertelement <4 x i32> %182, i32 %134, i64 1, !dbg !34 + %184 = insertelement <4 x i32> %183, i32 %135, i64 2, !dbg !34 + %185 = insertelement <4 x i32> %184, i32 %136, i64 3, !dbg !34 + store <4 x i32> %185, ptr addrspace(3) %181, align 16, !dbg !34 + %186 = sext i32 %reass.mul389 to i64, !dbg !35 + %187 = getelementptr i32, ptr addrspace(1) %6, i64 %186, !dbg !35 + %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %187) #2, !dbg !36 + %189 = shl i32 %188, 7, !dbg !37 + %190 = sext i32 %32 to i64, !dbg !38 + %191 = getelementptr i32, ptr addrspace(1) %5, i64 %190, !dbg !38 + %192 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %191) #2, !dbg !39 + %193 = shl i32 %192, 1, !dbg !40 + %194 = add i32 %11, 63, !dbg !41 + %195 = sdiv i32 %194, 64, !dbg !45 + %196 = tail call i32 @llvm.smax.i32(i32 %195, i32 1), !dbg !46 + %197 = tail call i32 @llvm.smin.i32(i32 %193, i32 %196), !dbg !47 + %198 = and i32 %34, 3, !dbg !48 + %199 = shl nuw nsw i32 %198, 1, !dbg !48 + %200 = or disjoint i32 %199, 1, !dbg !48 + %201 = insertelement <2 x i32> poison, i32 %199, i64 0, !dbg !48 + %202 = shufflevector <2 x i32> %201, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !48 + %203 = or disjoint <2 x i32> %202, , !dbg !48 + %204 = insertelement <4 x i32> poison, i32 %199, i64 0, !dbg !48 + %205 = shufflevector <4 x i32> %204, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !48 + %206 = or disjoint <4 x i32> %205, , !dbg !48 + %207 = insertelement <8 x i32> poison, i32 %199, i64 0, !dbg !48 + %208 = shufflevector <8 x i32> %207, <8 x i32> poison, <8 x i32> zeroinitializer, !dbg !48 + %209 = or disjoint <8 x i32> %208, , !dbg !48 + %210 = or disjoint i32 %45, %33, !dbg !26 + %211 = or disjoint i32 %210, 8, !dbg !26 + %212 = insertelement <2 x i32> poison, i32 %211, i64 0, !dbg !49 + %213 = insertelement <2 x i32> %212, i32 %210, i64 1, !dbg !49 + %214 = srem <2 x i32> %213, splat (i32 2048), !dbg !49 + %215 = shufflevector <2 x i32> %214, <2 x i32> poison, <32 x i32> , !dbg !49 + %216 = zext nneg i32 %17 to i64, !dbg !51 + %217 = getelementptr i64, ptr addrspace(1) %9, i64 %216, !dbg !51 + %218 = icmp sgt i32 %193, 0, !dbg !52 + %219 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %217, i1 %218) #2, !dbg !53 + %220 = extractelement <2 x i32> %214, i64 1, !dbg !54 + %221 = sext i32 %220 to i64, !dbg !54 + %222 = extractelement <2 x i32> %214, i64 0, !dbg !54 + %223 = sext i32 %222 to i64, !dbg !54 + %224 = icmp sgt i64 %219, %221, !dbg !55 + %225 = icmp sgt i64 %219, %223, !dbg !55 + %226 = or disjoint i32 %189, %37, !dbg !56 + %227 = or disjoint i32 %189, %38, !dbg !56 + %228 = or disjoint i32 %189, %39, !dbg !56 + %229 = or disjoint i32 %189, %40, !dbg !56 + %230 = shl i32 %226, 7, !dbg !57 + %231 = shl i32 %227, 7, !dbg !57 + %232 = shl i32 %228, 7, !dbg !57 + %233 = shl i32 %229, 7, !dbg !57 + %234 = sext i32 %230 to i64, !dbg !58 + %235 = getelementptr bfloat, ptr addrspace(1) %29, i64 %234, !dbg !58 + %236 = sext i32 %231 to i64, !dbg !58 + %237 = getelementptr bfloat, ptr addrspace(1) %29, i64 %236, !dbg !58 + %238 = sext i32 %232 to i64, !dbg !58 + %239 = getelementptr bfloat, ptr addrspace(1) %29, i64 %238, !dbg !58 + %240 = sext i32 %233 to i64, !dbg !58 + %241 = getelementptr bfloat, ptr addrspace(1) %29, i64 %240, !dbg !58 + %242 = getelementptr bfloat, ptr addrspace(1) %235, i64 %80, !dbg !59 + %243 = getelementptr bfloat, ptr addrspace(1) %237, i64 %80, !dbg !59 + %244 = getelementptr bfloat, ptr addrspace(1) %239, i64 %80, !dbg !59 + %245 = getelementptr bfloat, ptr addrspace(1) %241, i64 %80, !dbg !59 + %246 = icmp slt i32 %226, %11, !dbg !60 + %247 = icmp slt i32 %227, %11, !dbg !60 + %248 = icmp slt i32 %228, %11, !dbg !60 + %249 = icmp slt i32 %229, %11, !dbg !60 + %250 = and i1 %218, %246, !dbg !52 + %251 = and i1 %218, %247, !dbg !52 + %252 = and i1 %218, %248, !dbg !52 + %253 = and i1 %218, %249, !dbg !52 + %254 = shl nuw nsw i32 %141, 10, !dbg !61 + %255 = or disjoint i32 %144, %254, !dbg !61 + %256 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %255, !dbg !61 + %257 = select i1 %250, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %256, ptr addrspace(1) %242, i32 %257) #2, !dbg !61 + %258 = or disjoint i32 %255, 2048, !dbg !61 + %259 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %258, !dbg !61 + %260 = select i1 %251, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %259, ptr addrspace(1) %243, i32 %260) #2, !dbg !61 + %261 = or disjoint i32 %255, 4096, !dbg !61 + %262 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %261, !dbg !61 + %263 = select i1 %252, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %262, ptr addrspace(1) %244, i32 %263) #2, !dbg !61 + %264 = or disjoint i32 %255, 6144, !dbg !61 + %265 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %264, !dbg !61 + %266 = select i1 %253, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %245, i32 %266) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %267 = getelementptr bfloat, ptr addrspace(1) %30, i64 %234, !dbg !58 + %268 = getelementptr bfloat, ptr addrspace(1) %30, i64 %236, !dbg !58 + %269 = getelementptr bfloat, ptr addrspace(1) %30, i64 %238, !dbg !58 + %270 = getelementptr bfloat, ptr addrspace(1) %30, i64 %240, !dbg !58 + %271 = getelementptr bfloat, ptr addrspace(1) %267, i64 %80, !dbg !59 + %272 = getelementptr bfloat, ptr addrspace(1) %268, i64 %80, !dbg !59 + %273 = getelementptr bfloat, ptr addrspace(1) %269, i64 %80, !dbg !59 + %274 = getelementptr bfloat, ptr addrspace(1) %270, i64 %80, !dbg !59 + %275 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %275, ptr addrspace(1) %271, i32 %257) #2, !dbg !61 + %276 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %276, ptr addrspace(1) %272, i32 %260) #2, !dbg !61 + %277 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %273, i32 %263) #2, !dbg !61 + %278 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %274, i32 %266) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %279 = icmp sgt i32 %197, 1, !dbg !52 + %280 = or disjoint i32 %189, 64, !dbg !62 + %281 = or disjoint i32 %280, %37, !dbg !56 + %282 = or disjoint i32 %280, %38, !dbg !56 + %283 = or disjoint i32 %280, %39, !dbg !56 + %284 = or disjoint i32 %280, %40, !dbg !56 + %285 = shl i32 %281, 7, !dbg !57 + %286 = shl i32 %282, 7, !dbg !57 + %287 = shl i32 %283, 7, !dbg !57 + %288 = shl i32 %284, 7, !dbg !57 + %289 = sext i32 %285 to i64, !dbg !58 + %290 = getelementptr bfloat, ptr addrspace(1) %29, i64 %289, !dbg !58 + %291 = sext i32 %286 to i64, !dbg !58 + %292 = getelementptr bfloat, ptr addrspace(1) %29, i64 %291, !dbg !58 + %293 = sext i32 %287 to i64, !dbg !58 + %294 = getelementptr bfloat, ptr addrspace(1) %29, i64 %293, !dbg !58 + %295 = sext i32 %288 to i64, !dbg !58 + %296 = getelementptr bfloat, ptr addrspace(1) %29, i64 %295, !dbg !58 + %297 = getelementptr bfloat, ptr addrspace(1) %290, i64 %80, !dbg !59 + %298 = getelementptr bfloat, ptr addrspace(1) %292, i64 %80, !dbg !59 + %299 = getelementptr bfloat, ptr addrspace(1) %294, i64 %80, !dbg !59 + %300 = getelementptr bfloat, ptr addrspace(1) %296, i64 %80, !dbg !59 + %301 = icmp slt i32 %281, %11, !dbg !60 + %302 = icmp slt i32 %282, %11, !dbg !60 + %303 = icmp slt i32 %283, %11, !dbg !60 + %304 = icmp slt i32 %284, %11, !dbg !60 + %305 = and i1 %279, %301, !dbg !52 + %306 = and i1 %279, %302, !dbg !52 + %307 = and i1 %279, %303, !dbg !52 + %308 = and i1 %279, %304, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %309 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %255, !dbg !61 + %310 = select i1 %305, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %309, ptr addrspace(1) %297, i32 %310) #2, !dbg !61 + %311 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %258, !dbg !61 + %312 = select i1 %306, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %311, ptr addrspace(1) %298, i32 %312) #2, !dbg !61 + %313 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %261, !dbg !61 + %314 = select i1 %307, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %313, ptr addrspace(1) %299, i32 %314) #2, !dbg !61 + %315 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %264, !dbg !61 + %316 = select i1 %308, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %315, ptr addrspace(1) %300, i32 %316) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %317 = getelementptr bfloat, ptr addrspace(1) %30, i64 %289, !dbg !58 + %318 = getelementptr bfloat, ptr addrspace(1) %30, i64 %291, !dbg !58 + %319 = getelementptr bfloat, ptr addrspace(1) %30, i64 %293, !dbg !58 + %320 = getelementptr bfloat, ptr addrspace(1) %30, i64 %295, !dbg !58 + %321 = getelementptr bfloat, ptr addrspace(1) %317, i64 %80, !dbg !59 + %322 = getelementptr bfloat, ptr addrspace(1) %318, i64 %80, !dbg !59 + %323 = getelementptr bfloat, ptr addrspace(1) %319, i64 %80, !dbg !59 + %324 = getelementptr bfloat, ptr addrspace(1) %320, i64 %80, !dbg !59 + %325 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %325, ptr addrspace(1) %321, i32 %310) #2, !dbg !61 + %326 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %326, ptr addrspace(1) %322, i32 %312) #2, !dbg !61 + %327 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %327, ptr addrspace(1) %323, i32 %314) #2, !dbg !61 + %328 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %328, ptr addrspace(1) %324, i32 %316) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !63 + %invariant.gep = getelementptr bfloat, ptr addrspace(1) %29, i64 %80, !dbg !52 + %invariant.gep401 = getelementptr bfloat, ptr addrspace(1) %30, i64 %80, !dbg !52 + br i1 %218, label %.lr.ph, label %._crit_edge, !dbg !52 + +.lr.ph: ; preds = %15 + %329 = insertelement <16 x i32> poison, i32 %189, i64 0, !dbg !64 + %330 = shufflevector <16 x i32> %329, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !64 + %331 = insertelement <16 x i32> poison, i32 %200, i64 14, !dbg !64 + %332 = insertelement <16 x i32> %331, i32 %199, i64 15, !dbg !64 + %333 = shufflevector <8 x i32> %209, <8 x i32> poison, <16 x i32> , !dbg !64 + %334 = shufflevector <16 x i32> %333, <16 x i32> %332, <16 x i32> , !dbg !64 + %335 = shufflevector <4 x i32> %206, <4 x i32> poison, <16 x i32> , !dbg !64 + %336 = shufflevector <16 x i32> %334, <16 x i32> %335, <16 x i32> , !dbg !64 + %337 = shufflevector <2 x i32> %203, <2 x i32> poison, <16 x i32> , !dbg !64 + %338 = shufflevector <16 x i32> %336, <16 x i32> %337, <16 x i32> , !dbg !64 + %339 = or disjoint <16 x i32> %330, %338, !dbg !64 + %340 = add nsw i32 %197, -2 + %341 = add nsw i32 %197, -1 + %smax = tail call i32 @llvm.smax.i32(i32 %197, i32 1), !dbg !52 + %342 = insertelement <16 x i32> poison, i32 %11, i64 0, !dbg !49 + %343 = shufflevector <16 x i32> %342, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !49 + %344 = insertelement <16 x i64> poison, i64 %219, i64 0, !dbg !65 + %345 = shufflevector <16 x i64> %344, <16 x i64> poison, <16 x i32> zeroinitializer, !dbg !65 + br label %346, !dbg !52 + +346: ; preds = %.lr.ph, %__nv_exp2f.exit355 + %347 = phi i32 [ 64, %.lr.ph ], [ %2006, %__nv_exp2f.exit355 ] + %348 = phi i32 [ -1, %.lr.ph ], [ %424, %__nv_exp2f.exit355 ] + %349 = phi i32 [ 1, %.lr.ph ], [ %2010, %__nv_exp2f.exit355 ] + %350 = phi i32 [ 64, %.lr.ph ], [ %2007, %__nv_exp2f.exit355 ] + %351 = phi float [ 0.000000e+00, %.lr.ph ], [ %1573, %__nv_exp2f.exit355 ] + %352 = phi float [ 0.000000e+00, %.lr.ph ], [ %1574, %__nv_exp2f.exit355 ] + %353 = phi float [ 0.000000e+00, %.lr.ph ], [ %1920, %__nv_exp2f.exit355 ] + %354 = phi float [ 0.000000e+00, %.lr.ph ], [ %1921, %__nv_exp2f.exit355 ] + %355 = phi float [ 0.000000e+00, %.lr.ph ], [ %1922, %__nv_exp2f.exit355 ] + %356 = phi float [ 0.000000e+00, %.lr.ph ], [ %1923, %__nv_exp2f.exit355 ] + %357 = phi float [ 0.000000e+00, %.lr.ph ], [ %1924, %__nv_exp2f.exit355 ] + %358 = phi float [ 0.000000e+00, %.lr.ph ], [ %1925, %__nv_exp2f.exit355 ] + %359 = phi float [ 0.000000e+00, %.lr.ph ], [ %1926, %__nv_exp2f.exit355 ] + %360 = phi float [ 0.000000e+00, %.lr.ph ], [ %1927, %__nv_exp2f.exit355 ] + %361 = phi float [ 0.000000e+00, %.lr.ph ], [ %1928, %__nv_exp2f.exit355 ] + %362 = phi float [ 0.000000e+00, %.lr.ph ], [ %1929, %__nv_exp2f.exit355 ] + %363 = phi float [ 0.000000e+00, %.lr.ph ], [ %1930, %__nv_exp2f.exit355 ] + %364 = phi float [ 0.000000e+00, %.lr.ph ], [ %1931, %__nv_exp2f.exit355 ] + %365 = phi float [ 0.000000e+00, %.lr.ph ], [ %1932, %__nv_exp2f.exit355 ] + %366 = phi float [ 0.000000e+00, %.lr.ph ], [ %1933, %__nv_exp2f.exit355 ] + %367 = phi float [ 0.000000e+00, %.lr.ph ], [ %1934, %__nv_exp2f.exit355 ] + %368 = phi float [ 0.000000e+00, %.lr.ph ], [ %1935, %__nv_exp2f.exit355 ] + %369 = phi float [ 0.000000e+00, %.lr.ph ], [ %1936, %__nv_exp2f.exit355 ] + %370 = phi float [ 0.000000e+00, %.lr.ph ], [ %1937, %__nv_exp2f.exit355 ] + %371 = phi float [ 0.000000e+00, %.lr.ph ], [ %1938, %__nv_exp2f.exit355 ] + %372 = phi float [ 0.000000e+00, %.lr.ph ], [ %1939, %__nv_exp2f.exit355 ] + %373 = phi float [ 0.000000e+00, %.lr.ph ], [ %1940, %__nv_exp2f.exit355 ] + %374 = phi float [ 0.000000e+00, %.lr.ph ], [ %1941, %__nv_exp2f.exit355 ] + %375 = phi float [ 0.000000e+00, %.lr.ph ], [ %1942, %__nv_exp2f.exit355 ] + %376 = phi float [ 0.000000e+00, %.lr.ph ], [ %1943, %__nv_exp2f.exit355 ] + %377 = phi float [ 0.000000e+00, %.lr.ph ], [ %1944, %__nv_exp2f.exit355 ] + %378 = phi float [ 0.000000e+00, %.lr.ph ], [ %1945, %__nv_exp2f.exit355 ] + %379 = phi float [ 0.000000e+00, %.lr.ph ], [ %1946, %__nv_exp2f.exit355 ] + %380 = phi float [ 0.000000e+00, %.lr.ph ], [ %1947, %__nv_exp2f.exit355 ] + %381 = phi float [ 0.000000e+00, %.lr.ph ], [ %1948, %__nv_exp2f.exit355 ] + %382 = phi float [ 0.000000e+00, %.lr.ph ], [ %1949, %__nv_exp2f.exit355 ] + %383 = phi float [ 0.000000e+00, %.lr.ph ], [ %1950, %__nv_exp2f.exit355 ] + %384 = phi float [ 0.000000e+00, %.lr.ph ], [ %1951, %__nv_exp2f.exit355 ] + %385 = phi float [ 0.000000e+00, %.lr.ph ], [ %1952, %__nv_exp2f.exit355 ] + %386 = phi float [ 0.000000e+00, %.lr.ph ], [ %1953, %__nv_exp2f.exit355 ] + %387 = phi float [ 0.000000e+00, %.lr.ph ], [ %1954, %__nv_exp2f.exit355 ] + %388 = phi float [ 0.000000e+00, %.lr.ph ], [ %1955, %__nv_exp2f.exit355 ] + %389 = phi float [ 0.000000e+00, %.lr.ph ], [ %1956, %__nv_exp2f.exit355 ] + %390 = phi float [ 0.000000e+00, %.lr.ph ], [ %1957, %__nv_exp2f.exit355 ] + %391 = phi float [ 0.000000e+00, %.lr.ph ], [ %1958, %__nv_exp2f.exit355 ] + %392 = phi float [ 0.000000e+00, %.lr.ph ], [ %1959, %__nv_exp2f.exit355 ] + %393 = phi float [ 0.000000e+00, %.lr.ph ], [ %1960, %__nv_exp2f.exit355 ] + %394 = phi float [ 0.000000e+00, %.lr.ph ], [ %1961, %__nv_exp2f.exit355 ] + %395 = phi float [ 0.000000e+00, %.lr.ph ], [ %1962, %__nv_exp2f.exit355 ] + %396 = phi float [ 0.000000e+00, %.lr.ph ], [ %1963, %__nv_exp2f.exit355 ] + %397 = phi float [ 0.000000e+00, %.lr.ph ], [ %1964, %__nv_exp2f.exit355 ] + %398 = phi float [ 0.000000e+00, %.lr.ph ], [ %1965, %__nv_exp2f.exit355 ] + %399 = phi float [ 0.000000e+00, %.lr.ph ], [ %1966, %__nv_exp2f.exit355 ] + %400 = phi float [ 0.000000e+00, %.lr.ph ], [ %1967, %__nv_exp2f.exit355 ] + %401 = phi float [ 0.000000e+00, %.lr.ph ], [ %1968, %__nv_exp2f.exit355 ] + %402 = phi float [ 0.000000e+00, %.lr.ph ], [ %1969, %__nv_exp2f.exit355 ] + %403 = phi float [ 0.000000e+00, %.lr.ph ], [ %1970, %__nv_exp2f.exit355 ] + %404 = phi float [ 0.000000e+00, %.lr.ph ], [ %1971, %__nv_exp2f.exit355 ] + %405 = phi float [ 0.000000e+00, %.lr.ph ], [ %1972, %__nv_exp2f.exit355 ] + %406 = phi float [ 0.000000e+00, %.lr.ph ], [ %1973, %__nv_exp2f.exit355 ] + %407 = phi float [ 0.000000e+00, %.lr.ph ], [ %1974, %__nv_exp2f.exit355 ] + %408 = phi float [ 0.000000e+00, %.lr.ph ], [ %1975, %__nv_exp2f.exit355 ] + %409 = phi float [ 0.000000e+00, %.lr.ph ], [ %1976, %__nv_exp2f.exit355 ] + %410 = phi float [ 0.000000e+00, %.lr.ph ], [ %1977, %__nv_exp2f.exit355 ] + %411 = phi float [ 0.000000e+00, %.lr.ph ], [ %1978, %__nv_exp2f.exit355 ] + %412 = phi float [ 0.000000e+00, %.lr.ph ], [ %1979, %__nv_exp2f.exit355 ] + %413 = phi float [ 0.000000e+00, %.lr.ph ], [ %1980, %__nv_exp2f.exit355 ] + %414 = phi float [ 0.000000e+00, %.lr.ph ], [ %1981, %__nv_exp2f.exit355 ] + %415 = phi float [ 0.000000e+00, %.lr.ph ], [ %1982, %__nv_exp2f.exit355 ] + %416 = phi float [ 0.000000e+00, %.lr.ph ], [ %1983, %__nv_exp2f.exit355 ] + %417 = phi i32 [ 0, %.lr.ph ], [ %1987, %__nv_exp2f.exit355 ] + %418 = phi <2 x float> [ splat (float 0xFFF0000000000000), %.lr.ph ], [ %1312, %__nv_exp2f.exit355 ] + %419 = phi <16 x i32> [ %339, %.lr.ph ], [ %1986, %__nv_exp2f.exit355 ] + %420 = icmp slt i32 %417, %340, !dbg !52 + %421 = icmp slt i32 %417, %341, !dbg !52 + %422 = add i32 %348, 1, !dbg !52 + %423 = icmp sgt i32 %422, 2, !dbg !52 + %424 = select i1 %423, i32 0, i32 %422, !dbg !52 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !61 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %425 = shl i32 %424, 13, !dbg !61 + %426 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %425, !dbg !61 + %427 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %35, i32 0, i32 31), !dbg !63 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !63 + %428 = shl i32 %427, 11, !dbg !63 + %429 = and i32 %428, 8192, !dbg !63 + %430 = add i32 %429, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !63 + %431 = lshr exact i32 %430, 4, !dbg !63 + %432 = and i32 %431, 16383, !dbg !63 + %433 = zext nneg i32 %432 to i64, !dbg !63 + %434 = or disjoint i64 %433, 4611686293372403712, !dbg !63 + %435 = ptrtoint ptr addrspace(3) %426 to i32, !dbg !63 + %436 = lshr exact i32 %435, 4, !dbg !63 + %437 = and i32 %436, 16383, !dbg !63 + %438 = zext nneg i32 %437 to i64, !dbg !63 + %439 = or disjoint i64 %438, 4611686293338849280, !dbg !63 + %440 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %434, i64 %439) #2, !dbg !63 + %441 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !63 + %442 = lshr exact i32 %441, 4, !dbg !63 + %443 = and i32 %442, 16383, !dbg !63 + %444 = zext nneg i32 %443 to i64, !dbg !63 + %445 = or disjoint i64 %444, 4611686293372403712, !dbg !63 + %446 = add i32 %435, 32, !dbg !63 + %447 = lshr exact i32 %446, 4, !dbg !63 + %448 = and i32 %447, 16383, !dbg !63 + %449 = zext nneg i32 %448 to i64, !dbg !63 + %450 = or disjoint i64 %449, 4611686293338849280, !dbg !63 + %451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 0, !dbg !63 + %452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 1, !dbg !63 + %453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 2, !dbg !63 + %454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 3, !dbg !63 + %455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 4, !dbg !63 + %456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 5, !dbg !63 + %457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 6, !dbg !63 + %458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 7, !dbg !63 + %459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 8, !dbg !63 + %460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 9, !dbg !63 + %461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 10, !dbg !63 + %462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 11, !dbg !63 + %463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 12, !dbg !63 + %464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 13, !dbg !63 + %465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 14, !dbg !63 + %466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 15, !dbg !63 + %467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 16, !dbg !63 + %468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 17, !dbg !63 + %469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 18, !dbg !63 + %470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 19, !dbg !63 + %471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 20, !dbg !63 + %472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 21, !dbg !63 + %473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 22, !dbg !63 + %474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 23, !dbg !63 + %475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 24, !dbg !63 + %476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 25, !dbg !63 + %477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 26, !dbg !63 + %478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 27, !dbg !63 + %479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 28, !dbg !63 + %480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 29, !dbg !63 + %481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 30, !dbg !63 + %482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %440, 31, !dbg !63 + %483 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %451, float %452, float %453, float %454, float %455, float %456, float %457, float %458, float %459, float %460, float %461, float %462, float %463, float %464, float %465, float %466, float %467, float %468, float %469, float %470, float %471, float %472, float %473, float %474, float %475, float %476, float %477, float %478, float %479, float %480, float %481, float %482, i64 %445, i64 %450, i1 true) #2, !dbg !63 + %484 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !63 + %485 = lshr exact i32 %484, 4, !dbg !63 + %486 = and i32 %485, 16383, !dbg !63 + %487 = zext nneg i32 %486 to i64, !dbg !63 + %488 = or disjoint i64 %487, 4611686293372403712, !dbg !63 + %489 = add i32 %435, 64, !dbg !63 + %490 = lshr exact i32 %489, 4, !dbg !63 + %491 = and i32 %490, 16383, !dbg !63 + %492 = zext nneg i32 %491 to i64, !dbg !63 + %493 = or disjoint i64 %492, 4611686293338849280, !dbg !63 + %494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 0, !dbg !63 + %495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 1, !dbg !63 + %496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 2, !dbg !63 + %497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 3, !dbg !63 + %498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 4, !dbg !63 + %499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 5, !dbg !63 + %500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 6, !dbg !63 + %501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 7, !dbg !63 + %502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 8, !dbg !63 + %503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 9, !dbg !63 + %504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 10, !dbg !63 + %505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 11, !dbg !63 + %506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 12, !dbg !63 + %507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 13, !dbg !63 + %508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 14, !dbg !63 + %509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 15, !dbg !63 + %510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 16, !dbg !63 + %511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 17, !dbg !63 + %512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 18, !dbg !63 + %513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 19, !dbg !63 + %514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 20, !dbg !63 + %515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 21, !dbg !63 + %516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 22, !dbg !63 + %517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 23, !dbg !63 + %518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 24, !dbg !63 + %519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 25, !dbg !63 + %520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 26, !dbg !63 + %521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 27, !dbg !63 + %522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 28, !dbg !63 + %523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 29, !dbg !63 + %524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 30, !dbg !63 + %525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %483, 31, !dbg !63 + %526 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %494, float %495, float %496, float %497, float %498, float %499, float %500, float %501, float %502, float %503, float %504, float %505, float %506, float %507, float %508, float %509, float %510, float %511, float %512, float %513, float %514, float %515, float %516, float %517, float %518, float %519, float %520, float %521, float %522, float %523, float %524, float %525, i64 %488, i64 %493, i1 true) #2, !dbg !63 + %527 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !63 + %528 = lshr exact i32 %527, 4, !dbg !63 + %529 = and i32 %528, 16383, !dbg !63 + %530 = zext nneg i32 %529 to i64, !dbg !63 + %531 = or disjoint i64 %530, 4611686293372403712, !dbg !63 + %532 = add i32 %435, 96, !dbg !63 + %533 = lshr exact i32 %532, 4, !dbg !63 + %534 = and i32 %533, 16383, !dbg !63 + %535 = zext nneg i32 %534 to i64, !dbg !63 + %536 = or disjoint i64 %535, 4611686293338849280, !dbg !63 + %537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 0, !dbg !63 + %538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 1, !dbg !63 + %539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 2, !dbg !63 + %540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 3, !dbg !63 + %541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 4, !dbg !63 + %542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 5, !dbg !63 + %543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 6, !dbg !63 + %544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 7, !dbg !63 + %545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 8, !dbg !63 + %546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 9, !dbg !63 + %547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 10, !dbg !63 + %548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 11, !dbg !63 + %549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 12, !dbg !63 + %550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 13, !dbg !63 + %551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 14, !dbg !63 + %552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 15, !dbg !63 + %553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 16, !dbg !63 + %554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 17, !dbg !63 + %555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 18, !dbg !63 + %556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 19, !dbg !63 + %557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 20, !dbg !63 + %558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 21, !dbg !63 + %559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 22, !dbg !63 + %560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 23, !dbg !63 + %561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 24, !dbg !63 + %562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 25, !dbg !63 + %563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 26, !dbg !63 + %564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 27, !dbg !63 + %565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 28, !dbg !63 + %566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 29, !dbg !63 + %567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 30, !dbg !63 + %568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %526, 31, !dbg !63 + %569 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %537, float %538, float %539, float %540, float %541, float %542, float %543, float %544, float %545, float %546, float %547, float %548, float %549, float %550, float %551, float %552, float %553, float %554, float %555, float %556, float %557, float %558, float %559, float %560, float %561, float %562, float %563, float %564, float %565, float %566, float %567, float %568, i64 %531, i64 %536, i1 true) #2, !dbg !63 + %570 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !63 + %571 = lshr exact i32 %570, 4, !dbg !63 + %572 = and i32 %571, 16383, !dbg !63 + %573 = zext nneg i32 %572 to i64, !dbg !63 + %574 = or disjoint i64 %573, 4611686293372403712, !dbg !63 + %575 = add i32 %435, 8192, !dbg !63 + %576 = lshr exact i32 %575, 4, !dbg !63 + %577 = and i32 %576, 16383, !dbg !63 + %578 = zext nneg i32 %577 to i64, !dbg !63 + %579 = or disjoint i64 %578, 4611686293338849280, !dbg !63 + %580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 0, !dbg !63 + %581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 1, !dbg !63 + %582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 2, !dbg !63 + %583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 3, !dbg !63 + %584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 4, !dbg !63 + %585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 5, !dbg !63 + %586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 6, !dbg !63 + %587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 7, !dbg !63 + %588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 8, !dbg !63 + %589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 9, !dbg !63 + %590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 10, !dbg !63 + %591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 11, !dbg !63 + %592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 12, !dbg !63 + %593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 13, !dbg !63 + %594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 14, !dbg !63 + %595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 15, !dbg !63 + %596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 16, !dbg !63 + %597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 17, !dbg !63 + %598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 18, !dbg !63 + %599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 19, !dbg !63 + %600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 20, !dbg !63 + %601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 21, !dbg !63 + %602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 22, !dbg !63 + %603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 23, !dbg !63 + %604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 24, !dbg !63 + %605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 25, !dbg !63 + %606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 26, !dbg !63 + %607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 27, !dbg !63 + %608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 28, !dbg !63 + %609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 29, !dbg !63 + %610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 30, !dbg !63 + %611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %569, 31, !dbg !63 + %612 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %580, float %581, float %582, float %583, float %584, float %585, float %586, float %587, float %588, float %589, float %590, float %591, float %592, float %593, float %594, float %595, float %596, float %597, float %598, float %599, float %600, float %601, float %602, float %603, float %604, float %605, float %606, float %607, float %608, float %609, float %610, float %611, i64 %574, i64 %579, i1 true) #2, !dbg !63 + %613 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !63 + %614 = lshr exact i32 %613, 4, !dbg !63 + %615 = and i32 %614, 16383, !dbg !63 + %616 = zext nneg i32 %615 to i64, !dbg !63 + %617 = or disjoint i64 %616, 4611686293372403712, !dbg !63 + %618 = add i32 %435, 8224, !dbg !63 + %619 = lshr exact i32 %618, 4, !dbg !63 + %620 = and i32 %619, 16383, !dbg !63 + %621 = zext nneg i32 %620 to i64, !dbg !63 + %622 = or disjoint i64 %621, 4611686293338849280, !dbg !63 + %623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 0, !dbg !63 + %624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 1, !dbg !63 + %625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 2, !dbg !63 + %626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 3, !dbg !63 + %627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 4, !dbg !63 + %628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 5, !dbg !63 + %629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 6, !dbg !63 + %630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 7, !dbg !63 + %631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 8, !dbg !63 + %632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 9, !dbg !63 + %633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 10, !dbg !63 + %634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 11, !dbg !63 + %635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 12, !dbg !63 + %636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 13, !dbg !63 + %637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 14, !dbg !63 + %638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 15, !dbg !63 + %639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 16, !dbg !63 + %640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 17, !dbg !63 + %641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 18, !dbg !63 + %642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 19, !dbg !63 + %643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 20, !dbg !63 + %644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 21, !dbg !63 + %645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 22, !dbg !63 + %646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 23, !dbg !63 + %647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 24, !dbg !63 + %648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 25, !dbg !63 + %649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 26, !dbg !63 + %650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 27, !dbg !63 + %651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 28, !dbg !63 + %652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 29, !dbg !63 + %653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 30, !dbg !63 + %654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %612, 31, !dbg !63 + %655 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %623, float %624, float %625, float %626, float %627, float %628, float %629, float %630, float %631, float %632, float %633, float %634, float %635, float %636, float %637, float %638, float %639, float %640, float %641, float %642, float %643, float %644, float %645, float %646, float %647, float %648, float %649, float %650, float %651, float %652, float %653, float %654, i64 %617, i64 %622, i1 true) #2, !dbg !63 + %656 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !63 + %657 = lshr exact i32 %656, 4, !dbg !63 + %658 = and i32 %657, 16383, !dbg !63 + %659 = zext nneg i32 %658 to i64, !dbg !63 + %660 = or disjoint i64 %659, 4611686293372403712, !dbg !63 + %661 = add i32 %435, 8256, !dbg !63 + %662 = lshr exact i32 %661, 4, !dbg !63 + %663 = and i32 %662, 16383, !dbg !63 + %664 = zext nneg i32 %663 to i64, !dbg !63 + %665 = or disjoint i64 %664, 4611686293338849280, !dbg !63 + %666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 0, !dbg !63 + %667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 1, !dbg !63 + %668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 2, !dbg !63 + %669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 3, !dbg !63 + %670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 4, !dbg !63 + %671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 5, !dbg !63 + %672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 6, !dbg !63 + %673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 7, !dbg !63 + %674 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 8, !dbg !63 + %675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 9, !dbg !63 + %676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 10, !dbg !63 + %677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 11, !dbg !63 + %678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 12, !dbg !63 + %679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 13, !dbg !63 + %680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 14, !dbg !63 + %681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 15, !dbg !63 + %682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 16, !dbg !63 + %683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 17, !dbg !63 + %684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 18, !dbg !63 + %685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 19, !dbg !63 + %686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 20, !dbg !63 + %687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 21, !dbg !63 + %688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 22, !dbg !63 + %689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 23, !dbg !63 + %690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 24, !dbg !63 + %691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 25, !dbg !63 + %692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 26, !dbg !63 + %693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 27, !dbg !63 + %694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 28, !dbg !63 + %695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 29, !dbg !63 + %696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 30, !dbg !63 + %697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %655, 31, !dbg !63 + %698 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %666, float %667, float %668, float %669, float %670, float %671, float %672, float %673, float %674, float %675, float %676, float %677, float %678, float %679, float %680, float %681, float %682, float %683, float %684, float %685, float %686, float %687, float %688, float %689, float %690, float %691, float %692, float %693, float %694, float %695, float %696, float %697, i64 %660, i64 %665, i1 true) #2, !dbg !63 + %699 = add i32 %429, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !63 + %700 = lshr exact i32 %699, 4, !dbg !63 + %701 = and i32 %700, 16383, !dbg !63 + %702 = zext nneg i32 %701 to i64, !dbg !63 + %703 = or disjoint i64 %702, 4611686293372403712, !dbg !63 + %704 = add i32 %435, 8288, !dbg !63 + %705 = lshr exact i32 %704, 4, !dbg !63 + %706 = and i32 %705, 16383, !dbg !63 + %707 = zext nneg i32 %706 to i64, !dbg !63 + %708 = or disjoint i64 %707, 4611686293338849280, !dbg !63 + %709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 0, !dbg !63 + %710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 1, !dbg !63 + %711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 2, !dbg !63 + %712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 3, !dbg !63 + %713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 4, !dbg !63 + %714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 5, !dbg !63 + %715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 6, !dbg !63 + %716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 7, !dbg !63 + %717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 8, !dbg !63 + %718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 9, !dbg !63 + %719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 10, !dbg !63 + %720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 11, !dbg !63 + %721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 12, !dbg !63 + %722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 13, !dbg !63 + %723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 14, !dbg !63 + %724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 15, !dbg !63 + %725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 16, !dbg !63 + %726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 17, !dbg !63 + %727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 18, !dbg !63 + %728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 19, !dbg !63 + %729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 20, !dbg !63 + %730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 21, !dbg !63 + %731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 22, !dbg !63 + %732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 23, !dbg !63 + %733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 24, !dbg !63 + %734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 25, !dbg !63 + %735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 26, !dbg !63 + %736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 27, !dbg !63 + %737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 28, !dbg !63 + %738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 29, !dbg !63 + %739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 30, !dbg !63 + %740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %698, 31, !dbg !63 + %741 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %709, float %710, float %711, float %712, float %713, float %714, float %715, float %716, float %717, float %718, float %719, float %720, float %721, float %722, float %723, float %724, float %725, float %726, float %727, float %728, float %729, float %730, float %731, float %732, float %733, float %734, float %735, float %736, float %737, float %738, float %739, float %740, i64 %703, i64 %708, i1 true) #2, !dbg !63 + %742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 0, !dbg !63 + %743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 1, !dbg !63 + %744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 2, !dbg !63 + %745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 3, !dbg !63 + %746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 4, !dbg !63 + %747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 5, !dbg !63 + %748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 6, !dbg !63 + %749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 7, !dbg !63 + %750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 8, !dbg !63 + %751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 9, !dbg !63 + %752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 10, !dbg !63 + %753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 11, !dbg !63 + %754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 12, !dbg !63 + %755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 13, !dbg !63 + %756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 14, !dbg !63 + %757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 15, !dbg !63 + %758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 16, !dbg !63 + %759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 17, !dbg !63 + %760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 18, !dbg !63 + %761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 19, !dbg !63 + %762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 20, !dbg !63 + %763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 21, !dbg !63 + %764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 22, !dbg !63 + %765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 23, !dbg !63 + %766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 24, !dbg !63 + %767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 25, !dbg !63 + %768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 26, !dbg !63 + %769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 27, !dbg !63 + %770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 28, !dbg !63 + %771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 29, !dbg !63 + %772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 30, !dbg !63 + %773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %741, 31, !dbg !63 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !63 + %774 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %742, float %743, float %744, float %745, float %746, float %747, float %748, float %749, float %750, float %751, float %752, float %753, float %754, float %755, float %756, float %757, float %758, float %759, float %760, float %761, float %762, float %763, float %764, float %765, float %766, float %767, float %768, float %769, float %770, float %771, float %772, float %773, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %426, i32 0, i32 0, float %353, float %354, float %355, float %356, float %357, float %358, float %359, float %360, float %361, float %362, float %363, float %364, float %365, float %366, float %367, float %368, float %369, float %370, float %371, float %372, float %373, float %374, float %375, float %376, float %377, float %378, float %379, float %380, float %381, float %382, float %383, float %384, float %385, float %386, float %387, float %388, float %389, float %390, float %391, float %392, float %393, float %394, float %395, float %396, float %397, float %398, float %399, float %400, float %401, float %402, float %403, float %404, float %405, float %406, float %407, float %408, float %409, float %410, float %411, float %412, float %413, float %414, float %415, float %416) #2, !dbg !63 + %775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 0, !dbg !63 + %776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 1, !dbg !63 + %777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 2, !dbg !63 + %778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 3, !dbg !63 + %779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 4, !dbg !63 + %780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 5, !dbg !63 + %781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 6, !dbg !63 + %782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 7, !dbg !63 + %783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 8, !dbg !63 + %784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 9, !dbg !63 + %785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 10, !dbg !63 + %786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 11, !dbg !63 + %787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 12, !dbg !63 + %788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 13, !dbg !63 + %789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 14, !dbg !63 + %790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 15, !dbg !63 + %791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 16, !dbg !63 + %792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 17, !dbg !63 + %793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 18, !dbg !63 + %794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 19, !dbg !63 + %795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 20, !dbg !63 + %796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 21, !dbg !63 + %797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 22, !dbg !63 + %798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 23, !dbg !63 + %799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 24, !dbg !63 + %800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 25, !dbg !63 + %801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 26, !dbg !63 + %802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 27, !dbg !63 + %803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 28, !dbg !63 + %804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 29, !dbg !63 + %805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 30, !dbg !63 + %806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 31, !dbg !63 + %807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 38, !dbg !63 + %808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 39, !dbg !63 + %809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 40, !dbg !63 + %810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 41, !dbg !63 + %811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 42, !dbg !63 + %812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 43, !dbg !63 + %813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 44, !dbg !63 + %814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 45, !dbg !63 + %815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 46, !dbg !63 + %816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 47, !dbg !63 + %817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 48, !dbg !63 + %818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 49, !dbg !63 + %819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 50, !dbg !63 + %820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 51, !dbg !63 + %821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 52, !dbg !63 + %822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 53, !dbg !63 + %823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 54, !dbg !63 + %824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 55, !dbg !63 + %825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 56, !dbg !63 + %826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 57, !dbg !63 + %827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 58, !dbg !63 + %828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 59, !dbg !63 + %829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 60, !dbg !63 + %830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 61, !dbg !63 + %831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 62, !dbg !63 + %832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 63, !dbg !63 + %833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 64, !dbg !63 + %834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 65, !dbg !63 + %835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 66, !dbg !63 + %836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 67, !dbg !63 + %837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 68, !dbg !63 + %838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 69, !dbg !63 + %839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 70, !dbg !63 + %840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 71, !dbg !63 + %841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 72, !dbg !63 + %842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 73, !dbg !63 + %843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 74, !dbg !63 + %844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 75, !dbg !63 + %845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 76, !dbg !63 + %846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 77, !dbg !63 + %847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 78, !dbg !63 + %848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 79, !dbg !63 + %849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 80, !dbg !63 + %850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 81, !dbg !63 + %851 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 82, !dbg !63 + %852 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 83, !dbg !63 + %853 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 84, !dbg !63 + %854 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 85, !dbg !63 + %855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 86, !dbg !63 + %856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 87, !dbg !63 + %857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 88, !dbg !63 + %858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 89, !dbg !63 + %859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 90, !dbg !63 + %860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 91, !dbg !63 + %861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 92, !dbg !63 + %862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 93, !dbg !63 + %863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 94, !dbg !63 + %864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 95, !dbg !63 + %865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 96, !dbg !63 + %866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 97, !dbg !63 + %867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 98, !dbg !63 + %868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 99, !dbg !63 + %869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 100, !dbg !63 + %870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %774, 101, !dbg !63 + %871 = fmul float %775, 0x3FB6A09E60000000, !dbg !66 + %872 = fmul float %776, 0x3FB6A09E60000000, !dbg !66 + %873 = fmul float %777, 0x3FB6A09E60000000, !dbg !66 + %874 = fmul float %778, 0x3FB6A09E60000000, !dbg !66 + %875 = fmul float %779, 0x3FB6A09E60000000, !dbg !66 + %876 = fmul float %780, 0x3FB6A09E60000000, !dbg !66 + %877 = fmul float %781, 0x3FB6A09E60000000, !dbg !66 + %878 = fmul float %782, 0x3FB6A09E60000000, !dbg !66 + %879 = fmul float %783, 0x3FB6A09E60000000, !dbg !66 + %880 = fmul float %784, 0x3FB6A09E60000000, !dbg !66 + %881 = fmul float %785, 0x3FB6A09E60000000, !dbg !66 + %882 = fmul float %786, 0x3FB6A09E60000000, !dbg !66 + %883 = fmul float %787, 0x3FB6A09E60000000, !dbg !66 + %884 = fmul float %788, 0x3FB6A09E60000000, !dbg !66 + %885 = fmul float %789, 0x3FB6A09E60000000, !dbg !66 + %886 = fmul float %790, 0x3FB6A09E60000000, !dbg !66 + %887 = fmul float %791, 0x3FB6A09E60000000, !dbg !66 + %888 = fmul float %792, 0x3FB6A09E60000000, !dbg !66 + %889 = fmul float %793, 0x3FB6A09E60000000, !dbg !66 + %890 = fmul float %794, 0x3FB6A09E60000000, !dbg !66 + %891 = fmul float %795, 0x3FB6A09E60000000, !dbg !66 + %892 = fmul float %796, 0x3FB6A09E60000000, !dbg !66 + %893 = fmul float %797, 0x3FB6A09E60000000, !dbg !66 + %894 = fmul float %798, 0x3FB6A09E60000000, !dbg !66 + %895 = fmul float %799, 0x3FB6A09E60000000, !dbg !66 + %896 = fmul float %800, 0x3FB6A09E60000000, !dbg !66 + %897 = fmul float %801, 0x3FB6A09E60000000, !dbg !66 + %898 = fmul float %802, 0x3FB6A09E60000000, !dbg !66 + %899 = fmul float %803, 0x3FB6A09E60000000, !dbg !66 + %900 = fmul float %804, 0x3FB6A09E60000000, !dbg !66 + %901 = fmul float %805, 0x3FB6A09E60000000, !dbg !66 + %902 = fmul float %806, 0x3FB6A09E60000000, !dbg !66 + %903 = extractelement <16 x i32> %419, i64 15, !dbg !67 + %904 = icmp slt i32 %903, %11, !dbg !67 + %905 = extractelement <16 x i32> %419, i64 14, !dbg !67 + %906 = icmp slt i32 %905, %11, !dbg !67 + %907 = extractelement <16 x i32> %419, i64 13, !dbg !67 + %908 = icmp slt i32 %907, %11, !dbg !67 + %909 = extractelement <16 x i32> %419, i64 12, !dbg !67 + %910 = icmp slt i32 %909, %11, !dbg !67 + %911 = extractelement <16 x i32> %419, i64 11, !dbg !67 + %912 = icmp slt i32 %911, %11, !dbg !67 + %913 = extractelement <16 x i32> %419, i64 10, !dbg !67 + %914 = icmp slt i32 %913, %11, !dbg !67 + %915 = extractelement <16 x i32> %419, i64 9, !dbg !67 + %916 = icmp slt i32 %915, %11, !dbg !67 + %917 = extractelement <16 x i32> %419, i64 8, !dbg !67 + %918 = icmp slt i32 %917, %11, !dbg !67 + %919 = extractelement <16 x i32> %419, i64 7, !dbg !67 + %920 = icmp slt i32 %919, %11, !dbg !67 + %921 = extractelement <16 x i32> %419, i64 6, !dbg !67 + %922 = icmp slt i32 %921, %11, !dbg !67 + %923 = extractelement <16 x i32> %419, i64 5, !dbg !67 + %924 = icmp slt i32 %923, %11, !dbg !67 + %925 = extractelement <16 x i32> %419, i64 4, !dbg !67 + %926 = icmp slt i32 %925, %11, !dbg !67 + %927 = extractelement <16 x i32> %419, i64 3, !dbg !67 + %928 = icmp slt i32 %927, %11, !dbg !67 + %929 = extractelement <16 x i32> %419, i64 2, !dbg !67 + %930 = icmp slt i32 %929, %11, !dbg !67 + %931 = extractelement <16 x i32> %419, i64 1, !dbg !67 + %932 = icmp slt i32 %931, %11, !dbg !67 + %933 = extractelement <16 x i32> %419, i64 0, !dbg !67 + %934 = icmp slt i32 %933, %11, !dbg !67 + %935 = srem <16 x i32> %419, %343, !dbg !49 + %936 = shufflevector <16 x i32> %935, <16 x i32> poison, <32 x i32> , !dbg !49 + %937 = extractelement <16 x i32> %935, i64 15, !dbg !68 + %938 = icmp sge i32 %220, %937, !dbg !69 + %939 = extractelement <16 x i32> %935, i64 14, !dbg !68 + %940 = icmp sge i32 %220, %939, !dbg !69 + %941 = icmp sge i32 %222, %937, !dbg !69 + %942 = icmp sge i32 %222, %939, !dbg !69 + %943 = extractelement <16 x i32> %935, i64 13, !dbg !68 + %944 = icmp sge i32 %220, %943, !dbg !69 + %945 = extractelement <16 x i32> %935, i64 12, !dbg !68 + %946 = icmp sge i32 %220, %945, !dbg !69 + %947 = icmp sge i32 %222, %943, !dbg !69 + %948 = icmp sge i32 %222, %945, !dbg !69 + %949 = extractelement <16 x i32> %935, i64 11, !dbg !68 + %950 = icmp sge i32 %220, %949, !dbg !69 + %951 = extractelement <16 x i32> %935, i64 10, !dbg !68 + %952 = icmp sge i32 %220, %951, !dbg !69 + %953 = icmp sge i32 %222, %949, !dbg !69 + %954 = icmp sge i32 %222, %951, !dbg !69 + %955 = extractelement <16 x i32> %935, i64 9, !dbg !68 + %956 = icmp sge i32 %220, %955, !dbg !69 + %957 = extractelement <16 x i32> %935, i64 8, !dbg !68 + %958 = icmp sge i32 %220, %957, !dbg !69 + %959 = icmp sge i32 %222, %955, !dbg !69 + %960 = icmp sge i32 %222, %957, !dbg !69 + %961 = extractelement <16 x i32> %935, i64 7, !dbg !68 + %962 = icmp sge i32 %220, %961, !dbg !69 + %963 = extractelement <16 x i32> %935, i64 6, !dbg !68 + %964 = icmp sge i32 %220, %963, !dbg !69 + %965 = icmp sge i32 %222, %961, !dbg !69 + %966 = icmp sge i32 %222, %963, !dbg !69 + %967 = extractelement <16 x i32> %935, i64 5, !dbg !68 + %968 = icmp sge i32 %220, %967, !dbg !69 + %969 = extractelement <16 x i32> %935, i64 4, !dbg !68 + %970 = icmp sge i32 %220, %969, !dbg !69 + %971 = icmp sge i32 %222, %967, !dbg !69 + %972 = icmp sge i32 %222, %969, !dbg !69 + %973 = extractelement <16 x i32> %935, i64 3, !dbg !68 + %974 = icmp sge i32 %220, %973, !dbg !69 + %975 = extractelement <16 x i32> %935, i64 2, !dbg !68 + %976 = icmp sge i32 %220, %975, !dbg !69 + %977 = icmp sge i32 %222, %973, !dbg !69 + %978 = icmp sge i32 %222, %975, !dbg !69 + %979 = extractelement <16 x i32> %935, i64 1, !dbg !68 + %980 = icmp sge i32 %220, %979, !dbg !69 + %981 = extractelement <16 x i32> %935, i64 0, !dbg !68 + %982 = icmp sge i32 %220, %981, !dbg !69 + %983 = icmp sge i32 %222, %979, !dbg !69 + %984 = icmp sge i32 %222, %981, !dbg !69 + %985 = and i1 %224, %938, !dbg !70 + %986 = and i1 %224, %940, !dbg !70 + %987 = and i1 %225, %941, !dbg !70 + %988 = and i1 %225, %942, !dbg !70 + %989 = and i1 %224, %944, !dbg !70 + %990 = and i1 %224, %946, !dbg !70 + %991 = and i1 %225, %947, !dbg !70 + %992 = and i1 %225, %948, !dbg !70 + %993 = and i1 %224, %950, !dbg !70 + %994 = and i1 %224, %952, !dbg !70 + %995 = and i1 %225, %953, !dbg !70 + %996 = and i1 %225, %954, !dbg !70 + %997 = and i1 %224, %956, !dbg !70 + %998 = and i1 %224, %958, !dbg !70 + %999 = and i1 %225, %959, !dbg !70 + %1000 = and i1 %225, %960, !dbg !70 + %1001 = and i1 %224, %962, !dbg !70 + %1002 = and i1 %224, %964, !dbg !70 + %1003 = and i1 %225, %965, !dbg !70 + %1004 = and i1 %225, %966, !dbg !70 + %1005 = and i1 %224, %968, !dbg !70 + %1006 = and i1 %224, %970, !dbg !70 + %1007 = and i1 %225, %971, !dbg !70 + %1008 = and i1 %225, %972, !dbg !70 + %1009 = and i1 %224, %974, !dbg !70 + %1010 = and i1 %224, %976, !dbg !70 + %1011 = and i1 %225, %977, !dbg !70 + %1012 = and i1 %225, %978, !dbg !70 + %1013 = and i1 %224, %980, !dbg !70 + %1014 = and i1 %224, %982, !dbg !70 + %1015 = and i1 %225, %983, !dbg !70 + %1016 = and i1 %225, %984, !dbg !70 + %1017 = icmp sgt i32 %937, 2047, !dbg !71 + %1018 = icmp sgt i32 %939, 2047, !dbg !71 + %1019 = icmp sgt i32 %943, 2047, !dbg !71 + %1020 = icmp sgt i32 %945, 2047, !dbg !71 + %1021 = icmp sgt i32 %949, 2047, !dbg !71 + %1022 = icmp sgt i32 %951, 2047, !dbg !71 + %1023 = icmp sgt i32 %955, 2047, !dbg !71 + %1024 = icmp sgt i32 %957, 2047, !dbg !71 + %1025 = icmp sgt i32 %961, 2047, !dbg !71 + %1026 = icmp sgt i32 %963, 2047, !dbg !71 + %1027 = icmp sgt i32 %967, 2047, !dbg !71 + %1028 = icmp sgt i32 %969, 2047, !dbg !71 + %1029 = icmp sgt i32 %973, 2047, !dbg !71 + %1030 = icmp sgt i32 %975, 2047, !dbg !71 + %1031 = icmp sgt i32 %979, 2047, !dbg !71 + %1032 = icmp sgt i32 %981, 2047, !dbg !71 + %1033 = trunc <16 x i32> %935 to <16 x i16>, !dbg !72 + %1034 = and <16 x i16> %1033, splat (i16 2047), !dbg !72 + %1035 = zext nneg <16 x i16> %1034 to <16 x i64>, !dbg !65 + %1036 = icmp sgt <16 x i64> %345, %1035, !dbg !65 + %1037 = extractelement <16 x i1> %1036, i64 15, !dbg !73 + %1038 = and i1 %1017, %1037, !dbg !73 + %1039 = extractelement <16 x i1> %1036, i64 14, !dbg !73 + %1040 = and i1 %1018, %1039, !dbg !73 + %1041 = extractelement <16 x i1> %1036, i64 13, !dbg !73 + %1042 = and i1 %1019, %1041, !dbg !73 + %1043 = extractelement <16 x i1> %1036, i64 12, !dbg !73 + %1044 = and i1 %1020, %1043, !dbg !73 + %1045 = extractelement <16 x i1> %1036, i64 11, !dbg !73 + %1046 = and i1 %1021, %1045, !dbg !73 + %1047 = extractelement <16 x i1> %1036, i64 10, !dbg !73 + %1048 = and i1 %1022, %1047, !dbg !73 + %1049 = extractelement <16 x i1> %1036, i64 9, !dbg !73 + %1050 = and i1 %1023, %1049, !dbg !73 + %1051 = extractelement <16 x i1> %1036, i64 8, !dbg !73 + %1052 = and i1 %1024, %1051, !dbg !73 + %1053 = extractelement <16 x i1> %1036, i64 7, !dbg !73 + %1054 = and i1 %1025, %1053, !dbg !73 + %1055 = extractelement <16 x i1> %1036, i64 6, !dbg !73 + %1056 = and i1 %1026, %1055, !dbg !73 + %1057 = extractelement <16 x i1> %1036, i64 5, !dbg !73 + %1058 = and i1 %1027, %1057, !dbg !73 + %1059 = extractelement <16 x i1> %1036, i64 4, !dbg !73 + %1060 = and i1 %1028, %1059, !dbg !73 + %1061 = extractelement <16 x i1> %1036, i64 3, !dbg !73 + %1062 = and i1 %1029, %1061, !dbg !73 + %1063 = extractelement <16 x i1> %1036, i64 2, !dbg !73 + %1064 = and i1 %1030, %1063, !dbg !73 + %1065 = extractelement <16 x i1> %1036, i64 1, !dbg !73 + %1066 = and i1 %1031, %1065, !dbg !73 + %1067 = extractelement <16 x i1> %1036, i64 0, !dbg !73 + %1068 = and i1 %1032, %1067, !dbg !73 + %1069 = sub <32 x i32> %936, %215, !dbg !74 + %1070 = and <32 x i32> %1069, splat (i32 2047), !dbg !75 + %1071 = icmp eq <32 x i32> %1070, zeroinitializer, !dbg !76 + %1072 = extractelement <32 x i1> %1071, i64 31, !dbg !77 + %1073 = and i1 %1072, %1038, !dbg !77 + %1074 = extractelement <32 x i1> %1071, i64 30, !dbg !77 + %1075 = and i1 %1074, %1040, !dbg !77 + %1076 = extractelement <32 x i1> %1071, i64 29, !dbg !77 + %1077 = and i1 %1076, %1038, !dbg !77 + %1078 = extractelement <32 x i1> %1071, i64 28, !dbg !77 + %1079 = and i1 %1078, %1040, !dbg !77 + %1080 = extractelement <32 x i1> %1071, i64 27, !dbg !77 + %1081 = and i1 %1080, %1042, !dbg !77 + %1082 = extractelement <32 x i1> %1071, i64 26, !dbg !77 + %1083 = and i1 %1082, %1044, !dbg !77 + %1084 = extractelement <32 x i1> %1071, i64 25, !dbg !77 + %1085 = and i1 %1084, %1042, !dbg !77 + %1086 = extractelement <32 x i1> %1071, i64 24, !dbg !77 + %1087 = and i1 %1086, %1044, !dbg !77 + %1088 = extractelement <32 x i1> %1071, i64 23, !dbg !77 + %1089 = and i1 %1088, %1046, !dbg !77 + %1090 = extractelement <32 x i1> %1071, i64 22, !dbg !77 + %1091 = and i1 %1090, %1048, !dbg !77 + %1092 = extractelement <32 x i1> %1071, i64 21, !dbg !77 + %1093 = and i1 %1092, %1046, !dbg !77 + %1094 = extractelement <32 x i1> %1071, i64 20, !dbg !77 + %1095 = and i1 %1094, %1048, !dbg !77 + %1096 = extractelement <32 x i1> %1071, i64 19, !dbg !77 + %1097 = and i1 %1096, %1050, !dbg !77 + %1098 = extractelement <32 x i1> %1071, i64 18, !dbg !77 + %1099 = and i1 %1098, %1052, !dbg !77 + %1100 = extractelement <32 x i1> %1071, i64 17, !dbg !77 + %1101 = and i1 %1100, %1050, !dbg !77 + %1102 = extractelement <32 x i1> %1071, i64 16, !dbg !77 + %1103 = and i1 %1102, %1052, !dbg !77 + %1104 = extractelement <32 x i1> %1071, i64 15, !dbg !77 + %1105 = and i1 %1104, %1054, !dbg !77 + %1106 = extractelement <32 x i1> %1071, i64 14, !dbg !77 + %1107 = and i1 %1106, %1056, !dbg !77 + %1108 = extractelement <32 x i1> %1071, i64 13, !dbg !77 + %1109 = and i1 %1108, %1054, !dbg !77 + %1110 = extractelement <32 x i1> %1071, i64 12, !dbg !77 + %1111 = and i1 %1110, %1056, !dbg !77 + %1112 = extractelement <32 x i1> %1071, i64 11, !dbg !77 + %1113 = and i1 %1112, %1058, !dbg !77 + %1114 = extractelement <32 x i1> %1071, i64 10, !dbg !77 + %1115 = and i1 %1114, %1060, !dbg !77 + %1116 = extractelement <32 x i1> %1071, i64 9, !dbg !77 + %1117 = and i1 %1116, %1058, !dbg !77 + %1118 = extractelement <32 x i1> %1071, i64 8, !dbg !77 + %1119 = and i1 %1118, %1060, !dbg !77 + %1120 = extractelement <32 x i1> %1071, i64 7, !dbg !77 + %1121 = and i1 %1120, %1062, !dbg !77 + %1122 = extractelement <32 x i1> %1071, i64 6, !dbg !77 + %1123 = and i1 %1122, %1064, !dbg !77 + %1124 = extractelement <32 x i1> %1071, i64 5, !dbg !77 + %1125 = and i1 %1124, %1062, !dbg !77 + %1126 = extractelement <32 x i1> %1071, i64 4, !dbg !77 + %1127 = and i1 %1126, %1064, !dbg !77 + %1128 = extractelement <32 x i1> %1071, i64 3, !dbg !77 + %1129 = and i1 %1128, %1066, !dbg !77 + %1130 = extractelement <32 x i1> %1071, i64 2, !dbg !77 + %1131 = and i1 %1130, %1068, !dbg !77 + %1132 = extractelement <32 x i1> %1071, i64 1, !dbg !77 + %1133 = and i1 %1132, %1066, !dbg !77 + %1134 = extractelement <32 x i1> %1071, i64 0, !dbg !77 + %1135 = and i1 %1134, %1068, !dbg !77 + %1136 = or i1 %985, %1073, !dbg !78 + %1137 = or i1 %986, %1075, !dbg !78 + %1138 = or i1 %987, %1077, !dbg !78 + %1139 = or i1 %988, %1079, !dbg !78 + %1140 = or i1 %989, %1081, !dbg !78 + %1141 = or i1 %990, %1083, !dbg !78 + %1142 = or i1 %991, %1085, !dbg !78 + %1143 = or i1 %992, %1087, !dbg !78 + %1144 = or i1 %993, %1089, !dbg !78 + %1145 = or i1 %994, %1091, !dbg !78 + %1146 = or i1 %995, %1093, !dbg !78 + %1147 = or i1 %996, %1095, !dbg !78 + %1148 = or i1 %997, %1097, !dbg !78 + %1149 = or i1 %998, %1099, !dbg !78 + %1150 = or i1 %999, %1101, !dbg !78 + %1151 = or i1 %1000, %1103, !dbg !78 + %1152 = or i1 %1001, %1105, !dbg !78 + %1153 = or i1 %1002, %1107, !dbg !78 + %1154 = or i1 %1003, %1109, !dbg !78 + %1155 = or i1 %1004, %1111, !dbg !78 + %1156 = or i1 %1005, %1113, !dbg !78 + %1157 = or i1 %1006, %1115, !dbg !78 + %1158 = or i1 %1007, %1117, !dbg !78 + %1159 = or i1 %1008, %1119, !dbg !78 + %1160 = or i1 %1009, %1121, !dbg !78 + %1161 = or i1 %1010, %1123, !dbg !78 + %1162 = or i1 %1011, %1125, !dbg !78 + %1163 = or i1 %1012, %1127, !dbg !78 + %1164 = or i1 %1013, %1129, !dbg !78 + %1165 = or i1 %1014, %1131, !dbg !78 + %1166 = or i1 %1015, %1133, !dbg !78 + %1167 = or i1 %1016, %1135, !dbg !78 + %1168 = select i1 %904, i1 %1136, i1 false, !dbg !79 + %1169 = select i1 %906, i1 %1137, i1 false, !dbg !79 + %1170 = select i1 %904, i1 %1138, i1 false, !dbg !79 + %1171 = select i1 %906, i1 %1139, i1 false, !dbg !79 + %1172 = select i1 %908, i1 %1140, i1 false, !dbg !79 + %1173 = select i1 %910, i1 %1141, i1 false, !dbg !79 + %1174 = select i1 %908, i1 %1142, i1 false, !dbg !79 + %1175 = select i1 %910, i1 %1143, i1 false, !dbg !79 + %1176 = select i1 %912, i1 %1144, i1 false, !dbg !79 + %1177 = select i1 %914, i1 %1145, i1 false, !dbg !79 + %1178 = select i1 %912, i1 %1146, i1 false, !dbg !79 + %1179 = select i1 %914, i1 %1147, i1 false, !dbg !79 + %1180 = select i1 %916, i1 %1148, i1 false, !dbg !79 + %1181 = select i1 %918, i1 %1149, i1 false, !dbg !79 + %1182 = select i1 %916, i1 %1150, i1 false, !dbg !79 + %1183 = select i1 %918, i1 %1151, i1 false, !dbg !79 + %1184 = select i1 %920, i1 %1152, i1 false, !dbg !79 + %1185 = select i1 %922, i1 %1153, i1 false, !dbg !79 + %1186 = select i1 %920, i1 %1154, i1 false, !dbg !79 + %1187 = select i1 %922, i1 %1155, i1 false, !dbg !79 + %1188 = select i1 %924, i1 %1156, i1 false, !dbg !79 + %1189 = select i1 %926, i1 %1157, i1 false, !dbg !79 + %1190 = select i1 %924, i1 %1158, i1 false, !dbg !79 + %1191 = select i1 %926, i1 %1159, i1 false, !dbg !79 + %1192 = select i1 %928, i1 %1160, i1 false, !dbg !79 + %1193 = select i1 %930, i1 %1161, i1 false, !dbg !79 + %1194 = select i1 %928, i1 %1162, i1 false, !dbg !79 + %1195 = select i1 %930, i1 %1163, i1 false, !dbg !79 + %1196 = select i1 %932, i1 %1164, i1 false, !dbg !79 + %1197 = select i1 %934, i1 %1165, i1 false, !dbg !79 + %1198 = select i1 %932, i1 %1166, i1 false, !dbg !79 + %1199 = select i1 %934, i1 %1167, i1 false, !dbg !79 + %1200 = fmul float %871, 0x3FF7154760000000, !dbg !80 + %1201 = select i1 %1168, float %1200, float 0xFFF0000000000000, !dbg !81 + %1202 = fmul float %872, 0x3FF7154760000000, !dbg !80 + %1203 = select i1 %1169, float %1202, float 0xFFF0000000000000, !dbg !81 + %1204 = fmul float %873, 0x3FF7154760000000, !dbg !80 + %1205 = select i1 %1170, float %1204, float 0xFFF0000000000000, !dbg !81 + %1206 = fmul float %874, 0x3FF7154760000000, !dbg !80 + %1207 = select i1 %1171, float %1206, float 0xFFF0000000000000, !dbg !81 + %1208 = fmul float %875, 0x3FF7154760000000, !dbg !80 + %1209 = select i1 %1172, float %1208, float 0xFFF0000000000000, !dbg !81 + %1210 = fmul float %876, 0x3FF7154760000000, !dbg !80 + %1211 = select i1 %1173, float %1210, float 0xFFF0000000000000, !dbg !81 + %1212 = fmul float %877, 0x3FF7154760000000, !dbg !80 + %1213 = select i1 %1174, float %1212, float 0xFFF0000000000000, !dbg !81 + %1214 = fmul float %878, 0x3FF7154760000000, !dbg !80 + %1215 = select i1 %1175, float %1214, float 0xFFF0000000000000, !dbg !81 + %1216 = fmul float %879, 0x3FF7154760000000, !dbg !80 + %1217 = select i1 %1176, float %1216, float 0xFFF0000000000000, !dbg !81 + %1218 = fmul float %880, 0x3FF7154760000000, !dbg !80 + %1219 = select i1 %1177, float %1218, float 0xFFF0000000000000, !dbg !81 + %1220 = fmul float %881, 0x3FF7154760000000, !dbg !80 + %1221 = select i1 %1178, float %1220, float 0xFFF0000000000000, !dbg !81 + %1222 = fmul float %882, 0x3FF7154760000000, !dbg !80 + %1223 = select i1 %1179, float %1222, float 0xFFF0000000000000, !dbg !81 + %1224 = fmul float %883, 0x3FF7154760000000, !dbg !80 + %1225 = select i1 %1180, float %1224, float 0xFFF0000000000000, !dbg !81 + %1226 = fmul float %884, 0x3FF7154760000000, !dbg !80 + %1227 = select i1 %1181, float %1226, float 0xFFF0000000000000, !dbg !81 + %1228 = fmul float %885, 0x3FF7154760000000, !dbg !80 + %1229 = select i1 %1182, float %1228, float 0xFFF0000000000000, !dbg !81 + %1230 = fmul float %886, 0x3FF7154760000000, !dbg !80 + %1231 = select i1 %1183, float %1230, float 0xFFF0000000000000, !dbg !81 + %1232 = fmul float %887, 0x3FF7154760000000, !dbg !80 + %1233 = select i1 %1184, float %1232, float 0xFFF0000000000000, !dbg !81 + %1234 = fmul float %888, 0x3FF7154760000000, !dbg !80 + %1235 = select i1 %1185, float %1234, float 0xFFF0000000000000, !dbg !81 + %1236 = fmul float %889, 0x3FF7154760000000, !dbg !80 + %1237 = select i1 %1186, float %1236, float 0xFFF0000000000000, !dbg !81 + %1238 = fmul float %890, 0x3FF7154760000000, !dbg !80 + %1239 = select i1 %1187, float %1238, float 0xFFF0000000000000, !dbg !81 + %1240 = fmul float %891, 0x3FF7154760000000, !dbg !80 + %1241 = select i1 %1188, float %1240, float 0xFFF0000000000000, !dbg !81 + %1242 = fmul float %892, 0x3FF7154760000000, !dbg !80 + %1243 = select i1 %1189, float %1242, float 0xFFF0000000000000, !dbg !81 + %1244 = fmul float %893, 0x3FF7154760000000, !dbg !80 + %1245 = select i1 %1190, float %1244, float 0xFFF0000000000000, !dbg !81 + %1246 = fmul float %894, 0x3FF7154760000000, !dbg !80 + %1247 = select i1 %1191, float %1246, float 0xFFF0000000000000, !dbg !81 + %1248 = fmul float %895, 0x3FF7154760000000, !dbg !80 + %1249 = select i1 %1192, float %1248, float 0xFFF0000000000000, !dbg !81 + %1250 = fmul float %896, 0x3FF7154760000000, !dbg !80 + %1251 = select i1 %1193, float %1250, float 0xFFF0000000000000, !dbg !81 + %1252 = fmul float %897, 0x3FF7154760000000, !dbg !80 + %1253 = select i1 %1194, float %1252, float 0xFFF0000000000000, !dbg !81 + %1254 = fmul float %898, 0x3FF7154760000000, !dbg !80 + %1255 = select i1 %1195, float %1254, float 0xFFF0000000000000, !dbg !81 + %1256 = fmul float %899, 0x3FF7154760000000, !dbg !80 + %1257 = select i1 %1196, float %1256, float 0xFFF0000000000000, !dbg !81 + %1258 = fmul float %900, 0x3FF7154760000000, !dbg !80 + %1259 = select i1 %1197, float %1258, float 0xFFF0000000000000, !dbg !81 + %1260 = fmul float %901, 0x3FF7154760000000, !dbg !80 + %1261 = select i1 %1198, float %1260, float 0xFFF0000000000000, !dbg !81 + %1262 = fmul float %902, 0x3FF7154760000000, !dbg !80 + %1263 = select i1 %1199, float %1262, float 0xFFF0000000000000, !dbg !81 + %1264 = tail call float @llvm.maxnum.f32(float %1201, float %1203), !dbg !82 + %1265 = tail call float @llvm.maxnum.f32(float %1205, float %1207), !dbg !82 + %1266 = tail call float @llvm.maxnum.f32(float %1264, float %1209), !dbg !82 + %1267 = tail call float @llvm.maxnum.f32(float %1266, float %1211), !dbg !82 + %1268 = tail call float @llvm.maxnum.f32(float %1265, float %1213), !dbg !82 + %1269 = tail call float @llvm.maxnum.f32(float %1268, float %1215), !dbg !82 + %1270 = tail call float @llvm.maxnum.f32(float %1267, float %1217), !dbg !82 + %1271 = tail call float @llvm.maxnum.f32(float %1270, float %1219), !dbg !82 + %1272 = tail call float @llvm.maxnum.f32(float %1269, float %1221), !dbg !82 + %1273 = tail call float @llvm.maxnum.f32(float %1272, float %1223), !dbg !82 + %1274 = tail call float @llvm.maxnum.f32(float %1271, float %1225), !dbg !82 + %1275 = tail call float @llvm.maxnum.f32(float %1274, float %1227), !dbg !82 + %1276 = tail call float @llvm.maxnum.f32(float %1273, float %1229), !dbg !82 + %1277 = tail call float @llvm.maxnum.f32(float %1276, float %1231), !dbg !82 + %1278 = tail call float @llvm.maxnum.f32(float %1275, float %1233), !dbg !82 + %1279 = tail call float @llvm.maxnum.f32(float %1278, float %1235), !dbg !82 + %1280 = tail call float @llvm.maxnum.f32(float %1277, float %1237), !dbg !82 + %1281 = tail call float @llvm.maxnum.f32(float %1280, float %1239), !dbg !82 + %1282 = tail call float @llvm.maxnum.f32(float %1279, float %1241), !dbg !82 + %1283 = tail call float @llvm.maxnum.f32(float %1282, float %1243), !dbg !82 + %1284 = tail call float @llvm.maxnum.f32(float %1281, float %1245), !dbg !82 + %1285 = tail call float @llvm.maxnum.f32(float %1284, float %1247), !dbg !82 + %1286 = tail call float @llvm.maxnum.f32(float %1283, float %1249), !dbg !82 + %1287 = tail call float @llvm.maxnum.f32(float %1286, float %1251), !dbg !82 + %1288 = tail call float @llvm.maxnum.f32(float %1285, float %1253), !dbg !82 + %1289 = tail call float @llvm.maxnum.f32(float %1288, float %1255), !dbg !82 + %1290 = tail call float @llvm.maxnum.f32(float %1287, float %1257), !dbg !82 + %1291 = tail call float @llvm.maxnum.f32(float %1290, float %1259), !dbg !82 + %1292 = tail call float @llvm.maxnum.f32(float %1289, float %1261), !dbg !82 + %1293 = tail call float @llvm.maxnum.f32(float %1292, float %1263), !dbg !82 + %1294 = bitcast float %1291 to i32, !dbg !83 + %1295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1294, i32 2, i32 31), !dbg !83 + %1296 = bitcast i32 %1295 to float, !dbg !83 + %1297 = bitcast float %1293 to i32, !dbg !83 + %1298 = tail call float @llvm.maxnum.f32(float %1291, float %1296), !dbg !82 + %1299 = bitcast float %1298 to i32, !dbg !83 + %1300 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1299, i32 1, i32 31), !dbg !83 + %1301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1297, i32 2, i32 31), !dbg !83 + %1302 = bitcast i32 %1301 to float, !dbg !83 + %1303 = tail call float @llvm.maxnum.f32(float %1293, float %1302), !dbg !82 + %1304 = bitcast float %1303 to i32, !dbg !83 + %1305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1304, i32 1, i32 31), !dbg !83 + %1306 = insertelement <2 x i32> poison, i32 %1300, i64 0, !dbg !83 + %1307 = insertelement <2 x i32> %1306, i32 %1305, i64 1, !dbg !83 + %1308 = bitcast <2 x i32> %1307 to <2 x float>, !dbg !83 + %1309 = insertelement <2 x float> poison, float %1298, i64 0, !dbg !82 + %1310 = insertelement <2 x float> %1309, float %1303, i64 1, !dbg !82 + %1311 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %1310, <2 x float> %1308), !dbg !82 + %1312 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %418, <2 x float> %1311), !dbg !84 + %1313 = extractelement <2 x float> %1312, i64 0, !dbg !85 + %1314 = fcmp oeq float %1313, 0xFFF0000000000000, !dbg !86 + %1315 = extractelement <2 x float> %1312, i64 1, !dbg !85 + %1316 = fcmp oeq float %1315, 0xFFF0000000000000, !dbg !86 + %1317 = select i1 %1314, float 0.000000e+00, float %1313, !dbg !85 + %1318 = select i1 %1316, float 0.000000e+00, float %1315, !dbg !85 + %1319 = extractelement <2 x float> %418, i64 0, !dbg !87 + %1320 = fsub float %1319, %1317, !dbg !87 + %1321 = extractelement <2 x float> %418, i64 1, !dbg !87 + %1322 = fsub float %1321, %1318, !dbg !87 + %1323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i254 = icmp eq i32 %1323, 0, !dbg !88 + br i1 %.not.i254, label %1326, label %1324, !dbg !88 + +1324: ; preds = %346 + %1325 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1320) #2, !dbg !88 + br label %__nv_exp2f.exit256, !dbg !88 + +1326: ; preds = %346 + %1327 = tail call float @llvm.nvvm.ex2.approx.f(float %1320) #2, !dbg !88 + br label %__nv_exp2f.exit256, !dbg !88 + +__nv_exp2f.exit256: ; preds = %1324, %1326 + %.0.i255 = phi float [ %1325, %1324 ], [ %1327, %1326 ], !dbg !88 + %1328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !88 + %.not.i257 = icmp eq i32 %1328, 0, !dbg !88 + br i1 %.not.i257, label %1331, label %1329, !dbg !88 + +1329: ; preds = %__nv_exp2f.exit256 + %1330 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1322) #2, !dbg !88 + br label %__nv_exp2f.exit259, !dbg !88 + +1331: ; preds = %__nv_exp2f.exit256 + %1332 = tail call float @llvm.nvvm.ex2.approx.f(float %1322) #2, !dbg !88 + br label %__nv_exp2f.exit259, !dbg !88 + +__nv_exp2f.exit259: ; preds = %1329, %1331 + %.0.i258 = phi float [ %1330, %1329 ], [ %1332, %1331 ], !dbg !88 + %1333 = fsub float %1201, %1317, !dbg !89 + %1334 = fsub float %1203, %1317, !dbg !89 + %1335 = fsub float %1205, %1318, !dbg !89 + %1336 = fsub float %1207, %1318, !dbg !89 + %1337 = fsub float %1209, %1317, !dbg !89 + %1338 = fsub float %1211, %1317, !dbg !89 + %1339 = fsub float %1213, %1318, !dbg !89 + %1340 = fsub float %1215, %1318, !dbg !89 + %1341 = fsub float %1217, %1317, !dbg !89 + %1342 = fsub float %1219, %1317, !dbg !89 + %1343 = fsub float %1221, %1318, !dbg !89 + %1344 = fsub float %1223, %1318, !dbg !89 + %1345 = fsub float %1225, %1317, !dbg !89 + %1346 = fsub float %1227, %1317, !dbg !89 + %1347 = fsub float %1229, %1318, !dbg !89 + %1348 = fsub float %1231, %1318, !dbg !89 + %1349 = fsub float %1233, %1317, !dbg !89 + %1350 = fsub float %1235, %1317, !dbg !89 + %1351 = fsub float %1237, %1318, !dbg !89 + %1352 = fsub float %1239, %1318, !dbg !89 + %1353 = fsub float %1241, %1317, !dbg !89 + %1354 = fsub float %1243, %1317, !dbg !89 + %1355 = fsub float %1245, %1318, !dbg !89 + %1356 = fsub float %1247, %1318, !dbg !89 + %1357 = fsub float %1249, %1317, !dbg !89 + %1358 = fsub float %1251, %1317, !dbg !89 + %1359 = fsub float %1253, %1318, !dbg !89 + %1360 = fsub float %1255, %1318, !dbg !89 + %1361 = fsub float %1257, %1317, !dbg !89 + %1362 = fsub float %1259, %1317, !dbg !89 + %1363 = fsub float %1261, %1318, !dbg !89 + %1364 = fsub float %1263, %1318, !dbg !89 + %1365 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i260 = icmp eq i32 %1365, 0, !dbg !90 + br i1 %.not.i260, label %1368, label %1366, !dbg !90 + +1366: ; preds = %__nv_exp2f.exit259 + %1367 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1333) #2, !dbg !90 + br label %__nv_exp2f.exit262, !dbg !90 + +1368: ; preds = %__nv_exp2f.exit259 + %1369 = tail call float @llvm.nvvm.ex2.approx.f(float %1333) #2, !dbg !90 + br label %__nv_exp2f.exit262, !dbg !90 + +__nv_exp2f.exit262: ; preds = %1366, %1368 + %.0.i261 = phi float [ %1367, %1366 ], [ %1369, %1368 ], !dbg !90 + %1370 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i263 = icmp eq i32 %1370, 0, !dbg !90 + br i1 %.not.i263, label %1373, label %1371, !dbg !90 + +1371: ; preds = %__nv_exp2f.exit262 + %1372 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1334) #2, !dbg !90 + br label %__nv_exp2f.exit265, !dbg !90 + +1373: ; preds = %__nv_exp2f.exit262 + %1374 = tail call float @llvm.nvvm.ex2.approx.f(float %1334) #2, !dbg !90 + br label %__nv_exp2f.exit265, !dbg !90 + +__nv_exp2f.exit265: ; preds = %1371, %1373 + %.0.i264 = phi float [ %1372, %1371 ], [ %1374, %1373 ], !dbg !90 + %1375 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i266 = icmp eq i32 %1375, 0, !dbg !90 + br i1 %.not.i266, label %1378, label %1376, !dbg !90 + +1376: ; preds = %__nv_exp2f.exit265 + %1377 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1335) #2, !dbg !90 + br label %__nv_exp2f.exit268, !dbg !90 + +1378: ; preds = %__nv_exp2f.exit265 + %1379 = tail call float @llvm.nvvm.ex2.approx.f(float %1335) #2, !dbg !90 + br label %__nv_exp2f.exit268, !dbg !90 + +__nv_exp2f.exit268: ; preds = %1376, %1378 + %.0.i267 = phi float [ %1377, %1376 ], [ %1379, %1378 ], !dbg !90 + %1380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i269 = icmp eq i32 %1380, 0, !dbg !90 + br i1 %.not.i269, label %1383, label %1381, !dbg !90 + +1381: ; preds = %__nv_exp2f.exit268 + %1382 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1336) #2, !dbg !90 + br label %__nv_exp2f.exit271, !dbg !90 + +1383: ; preds = %__nv_exp2f.exit268 + %1384 = tail call float @llvm.nvvm.ex2.approx.f(float %1336) #2, !dbg !90 + br label %__nv_exp2f.exit271, !dbg !90 + +__nv_exp2f.exit271: ; preds = %1381, %1383 + %.0.i270 = phi float [ %1382, %1381 ], [ %1384, %1383 ], !dbg !90 + %1385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i272 = icmp eq i32 %1385, 0, !dbg !90 + br i1 %.not.i272, label %1388, label %1386, !dbg !90 + +1386: ; preds = %__nv_exp2f.exit271 + %1387 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1337) #2, !dbg !90 + br label %__nv_exp2f.exit274, !dbg !90 + +1388: ; preds = %__nv_exp2f.exit271 + %1389 = tail call float @llvm.nvvm.ex2.approx.f(float %1337) #2, !dbg !90 + br label %__nv_exp2f.exit274, !dbg !90 + +__nv_exp2f.exit274: ; preds = %1386, %1388 + %.0.i273 = phi float [ %1387, %1386 ], [ %1389, %1388 ], !dbg !90 + %1390 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i275 = icmp eq i32 %1390, 0, !dbg !90 + br i1 %.not.i275, label %1393, label %1391, !dbg !90 + +1391: ; preds = %__nv_exp2f.exit274 + %1392 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1338) #2, !dbg !90 + br label %__nv_exp2f.exit277, !dbg !90 + +1393: ; preds = %__nv_exp2f.exit274 + %1394 = tail call float @llvm.nvvm.ex2.approx.f(float %1338) #2, !dbg !90 + br label %__nv_exp2f.exit277, !dbg !90 + +__nv_exp2f.exit277: ; preds = %1391, %1393 + %.0.i276 = phi float [ %1392, %1391 ], [ %1394, %1393 ], !dbg !90 + %1395 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i278 = icmp eq i32 %1395, 0, !dbg !90 + br i1 %.not.i278, label %1398, label %1396, !dbg !90 + +1396: ; preds = %__nv_exp2f.exit277 + %1397 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1339) #2, !dbg !90 + br label %__nv_exp2f.exit280, !dbg !90 + +1398: ; preds = %__nv_exp2f.exit277 + %1399 = tail call float @llvm.nvvm.ex2.approx.f(float %1339) #2, !dbg !90 + br label %__nv_exp2f.exit280, !dbg !90 + +__nv_exp2f.exit280: ; preds = %1396, %1398 + %.0.i279 = phi float [ %1397, %1396 ], [ %1399, %1398 ], !dbg !90 + %1400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i281 = icmp eq i32 %1400, 0, !dbg !90 + br i1 %.not.i281, label %1403, label %1401, !dbg !90 + +1401: ; preds = %__nv_exp2f.exit280 + %1402 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1340) #2, !dbg !90 + br label %__nv_exp2f.exit283, !dbg !90 + +1403: ; preds = %__nv_exp2f.exit280 + %1404 = tail call float @llvm.nvvm.ex2.approx.f(float %1340) #2, !dbg !90 + br label %__nv_exp2f.exit283, !dbg !90 + +__nv_exp2f.exit283: ; preds = %1401, %1403 + %.0.i282 = phi float [ %1402, %1401 ], [ %1404, %1403 ], !dbg !90 + %1405 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i284 = icmp eq i32 %1405, 0, !dbg !90 + br i1 %.not.i284, label %1408, label %1406, !dbg !90 + +1406: ; preds = %__nv_exp2f.exit283 + %1407 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1341) #2, !dbg !90 + br label %__nv_exp2f.exit286, !dbg !90 + +1408: ; preds = %__nv_exp2f.exit283 + %1409 = tail call float @llvm.nvvm.ex2.approx.f(float %1341) #2, !dbg !90 + br label %__nv_exp2f.exit286, !dbg !90 + +__nv_exp2f.exit286: ; preds = %1406, %1408 + %.0.i285 = phi float [ %1407, %1406 ], [ %1409, %1408 ], !dbg !90 + %1410 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i287 = icmp eq i32 %1410, 0, !dbg !90 + br i1 %.not.i287, label %1413, label %1411, !dbg !90 + +1411: ; preds = %__nv_exp2f.exit286 + %1412 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1342) #2, !dbg !90 + br label %__nv_exp2f.exit289, !dbg !90 + +1413: ; preds = %__nv_exp2f.exit286 + %1414 = tail call float @llvm.nvvm.ex2.approx.f(float %1342) #2, !dbg !90 + br label %__nv_exp2f.exit289, !dbg !90 + +__nv_exp2f.exit289: ; preds = %1411, %1413 + %.0.i288 = phi float [ %1412, %1411 ], [ %1414, %1413 ], !dbg !90 + %1415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i290 = icmp eq i32 %1415, 0, !dbg !90 + br i1 %.not.i290, label %1418, label %1416, !dbg !90 + +1416: ; preds = %__nv_exp2f.exit289 + %1417 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1343) #2, !dbg !90 + br label %__nv_exp2f.exit292, !dbg !90 + +1418: ; preds = %__nv_exp2f.exit289 + %1419 = tail call float @llvm.nvvm.ex2.approx.f(float %1343) #2, !dbg !90 + br label %__nv_exp2f.exit292, !dbg !90 + +__nv_exp2f.exit292: ; preds = %1416, %1418 + %.0.i291 = phi float [ %1417, %1416 ], [ %1419, %1418 ], !dbg !90 + %1420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i293 = icmp eq i32 %1420, 0, !dbg !90 + br i1 %.not.i293, label %1423, label %1421, !dbg !90 + +1421: ; preds = %__nv_exp2f.exit292 + %1422 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1344) #2, !dbg !90 + br label %__nv_exp2f.exit295, !dbg !90 + +1423: ; preds = %__nv_exp2f.exit292 + %1424 = tail call float @llvm.nvvm.ex2.approx.f(float %1344) #2, !dbg !90 + br label %__nv_exp2f.exit295, !dbg !90 + +__nv_exp2f.exit295: ; preds = %1421, %1423 + %.0.i294 = phi float [ %1422, %1421 ], [ %1424, %1423 ], !dbg !90 + %1425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i296 = icmp eq i32 %1425, 0, !dbg !90 + br i1 %.not.i296, label %1428, label %1426, !dbg !90 + +1426: ; preds = %__nv_exp2f.exit295 + %1427 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1345) #2, !dbg !90 + br label %__nv_exp2f.exit298, !dbg !90 + +1428: ; preds = %__nv_exp2f.exit295 + %1429 = tail call float @llvm.nvvm.ex2.approx.f(float %1345) #2, !dbg !90 + br label %__nv_exp2f.exit298, !dbg !90 + +__nv_exp2f.exit298: ; preds = %1426, %1428 + %.0.i297 = phi float [ %1427, %1426 ], [ %1429, %1428 ], !dbg !90 + %1430 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i299 = icmp eq i32 %1430, 0, !dbg !90 + br i1 %.not.i299, label %1433, label %1431, !dbg !90 + +1431: ; preds = %__nv_exp2f.exit298 + %1432 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1346) #2, !dbg !90 + br label %__nv_exp2f.exit301, !dbg !90 + +1433: ; preds = %__nv_exp2f.exit298 + %1434 = tail call float @llvm.nvvm.ex2.approx.f(float %1346) #2, !dbg !90 + br label %__nv_exp2f.exit301, !dbg !90 + +__nv_exp2f.exit301: ; preds = %1431, %1433 + %.0.i300 = phi float [ %1432, %1431 ], [ %1434, %1433 ], !dbg !90 + %1435 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i302 = icmp eq i32 %1435, 0, !dbg !90 + br i1 %.not.i302, label %1438, label %1436, !dbg !90 + +1436: ; preds = %__nv_exp2f.exit301 + %1437 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1347) #2, !dbg !90 + br label %__nv_exp2f.exit304, !dbg !90 + +1438: ; preds = %__nv_exp2f.exit301 + %1439 = tail call float @llvm.nvvm.ex2.approx.f(float %1347) #2, !dbg !90 + br label %__nv_exp2f.exit304, !dbg !90 + +__nv_exp2f.exit304: ; preds = %1436, %1438 + %.0.i303 = phi float [ %1437, %1436 ], [ %1439, %1438 ], !dbg !90 + %1440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i305 = icmp eq i32 %1440, 0, !dbg !90 + br i1 %.not.i305, label %1443, label %1441, !dbg !90 + +1441: ; preds = %__nv_exp2f.exit304 + %1442 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1348) #2, !dbg !90 + br label %__nv_exp2f.exit307, !dbg !90 + +1443: ; preds = %__nv_exp2f.exit304 + %1444 = tail call float @llvm.nvvm.ex2.approx.f(float %1348) #2, !dbg !90 + br label %__nv_exp2f.exit307, !dbg !90 + +__nv_exp2f.exit307: ; preds = %1441, %1443 + %.0.i306 = phi float [ %1442, %1441 ], [ %1444, %1443 ], !dbg !90 + %1445 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i308 = icmp eq i32 %1445, 0, !dbg !90 + br i1 %.not.i308, label %1448, label %1446, !dbg !90 + +1446: ; preds = %__nv_exp2f.exit307 + %1447 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1349) #2, !dbg !90 + br label %__nv_exp2f.exit310, !dbg !90 + +1448: ; preds = %__nv_exp2f.exit307 + %1449 = tail call float @llvm.nvvm.ex2.approx.f(float %1349) #2, !dbg !90 + br label %__nv_exp2f.exit310, !dbg !90 + +__nv_exp2f.exit310: ; preds = %1446, %1448 + %.0.i309 = phi float [ %1447, %1446 ], [ %1449, %1448 ], !dbg !90 + %1450 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i311 = icmp eq i32 %1450, 0, !dbg !90 + br i1 %.not.i311, label %1453, label %1451, !dbg !90 + +1451: ; preds = %__nv_exp2f.exit310 + %1452 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1350) #2, !dbg !90 + br label %__nv_exp2f.exit313, !dbg !90 + +1453: ; preds = %__nv_exp2f.exit310 + %1454 = tail call float @llvm.nvvm.ex2.approx.f(float %1350) #2, !dbg !90 + br label %__nv_exp2f.exit313, !dbg !90 + +__nv_exp2f.exit313: ; preds = %1451, %1453 + %.0.i312 = phi float [ %1452, %1451 ], [ %1454, %1453 ], !dbg !90 + %1455 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i314 = icmp eq i32 %1455, 0, !dbg !90 + br i1 %.not.i314, label %1458, label %1456, !dbg !90 + +1456: ; preds = %__nv_exp2f.exit313 + %1457 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1351) #2, !dbg !90 + br label %__nv_exp2f.exit316, !dbg !90 + +1458: ; preds = %__nv_exp2f.exit313 + %1459 = tail call float @llvm.nvvm.ex2.approx.f(float %1351) #2, !dbg !90 + br label %__nv_exp2f.exit316, !dbg !90 + +__nv_exp2f.exit316: ; preds = %1456, %1458 + %.0.i315 = phi float [ %1457, %1456 ], [ %1459, %1458 ], !dbg !90 + %1460 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i317 = icmp eq i32 %1460, 0, !dbg !90 + br i1 %.not.i317, label %1463, label %1461, !dbg !90 + +1461: ; preds = %__nv_exp2f.exit316 + %1462 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1352) #2, !dbg !90 + br label %__nv_exp2f.exit319, !dbg !90 + +1463: ; preds = %__nv_exp2f.exit316 + %1464 = tail call float @llvm.nvvm.ex2.approx.f(float %1352) #2, !dbg !90 + br label %__nv_exp2f.exit319, !dbg !90 + +__nv_exp2f.exit319: ; preds = %1461, %1463 + %.0.i318 = phi float [ %1462, %1461 ], [ %1464, %1463 ], !dbg !90 + %1465 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i320 = icmp eq i32 %1465, 0, !dbg !90 + br i1 %.not.i320, label %1468, label %1466, !dbg !90 + +1466: ; preds = %__nv_exp2f.exit319 + %1467 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1353) #2, !dbg !90 + br label %__nv_exp2f.exit322, !dbg !90 + +1468: ; preds = %__nv_exp2f.exit319 + %1469 = tail call float @llvm.nvvm.ex2.approx.f(float %1353) #2, !dbg !90 + br label %__nv_exp2f.exit322, !dbg !90 + +__nv_exp2f.exit322: ; preds = %1466, %1468 + %.0.i321 = phi float [ %1467, %1466 ], [ %1469, %1468 ], !dbg !90 + %1470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i323 = icmp eq i32 %1470, 0, !dbg !90 + br i1 %.not.i323, label %1473, label %1471, !dbg !90 + +1471: ; preds = %__nv_exp2f.exit322 + %1472 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1354) #2, !dbg !90 + br label %__nv_exp2f.exit325, !dbg !90 + +1473: ; preds = %__nv_exp2f.exit322 + %1474 = tail call float @llvm.nvvm.ex2.approx.f(float %1354) #2, !dbg !90 + br label %__nv_exp2f.exit325, !dbg !90 + +__nv_exp2f.exit325: ; preds = %1471, %1473 + %.0.i324 = phi float [ %1472, %1471 ], [ %1474, %1473 ], !dbg !90 + %1475 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i326 = icmp eq i32 %1475, 0, !dbg !90 + br i1 %.not.i326, label %1478, label %1476, !dbg !90 + +1476: ; preds = %__nv_exp2f.exit325 + %1477 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1355) #2, !dbg !90 + br label %__nv_exp2f.exit328, !dbg !90 + +1478: ; preds = %__nv_exp2f.exit325 + %1479 = tail call float @llvm.nvvm.ex2.approx.f(float %1355) #2, !dbg !90 + br label %__nv_exp2f.exit328, !dbg !90 + +__nv_exp2f.exit328: ; preds = %1476, %1478 + %.0.i327 = phi float [ %1477, %1476 ], [ %1479, %1478 ], !dbg !90 + %1480 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i329 = icmp eq i32 %1480, 0, !dbg !90 + br i1 %.not.i329, label %1483, label %1481, !dbg !90 + +1481: ; preds = %__nv_exp2f.exit328 + %1482 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1356) #2, !dbg !90 + br label %__nv_exp2f.exit331, !dbg !90 + +1483: ; preds = %__nv_exp2f.exit328 + %1484 = tail call float @llvm.nvvm.ex2.approx.f(float %1356) #2, !dbg !90 + br label %__nv_exp2f.exit331, !dbg !90 + +__nv_exp2f.exit331: ; preds = %1481, %1483 + %.0.i330 = phi float [ %1482, %1481 ], [ %1484, %1483 ], !dbg !90 + %1485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i332 = icmp eq i32 %1485, 0, !dbg !90 + br i1 %.not.i332, label %1488, label %1486, !dbg !90 + +1486: ; preds = %__nv_exp2f.exit331 + %1487 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1357) #2, !dbg !90 + br label %__nv_exp2f.exit334, !dbg !90 + +1488: ; preds = %__nv_exp2f.exit331 + %1489 = tail call float @llvm.nvvm.ex2.approx.f(float %1357) #2, !dbg !90 + br label %__nv_exp2f.exit334, !dbg !90 + +__nv_exp2f.exit334: ; preds = %1486, %1488 + %.0.i333 = phi float [ %1487, %1486 ], [ %1489, %1488 ], !dbg !90 + %1490 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i335 = icmp eq i32 %1490, 0, !dbg !90 + br i1 %.not.i335, label %1493, label %1491, !dbg !90 + +1491: ; preds = %__nv_exp2f.exit334 + %1492 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1358) #2, !dbg !90 + br label %__nv_exp2f.exit337, !dbg !90 + +1493: ; preds = %__nv_exp2f.exit334 + %1494 = tail call float @llvm.nvvm.ex2.approx.f(float %1358) #2, !dbg !90 + br label %__nv_exp2f.exit337, !dbg !90 + +__nv_exp2f.exit337: ; preds = %1491, %1493 + %.0.i336 = phi float [ %1492, %1491 ], [ %1494, %1493 ], !dbg !90 + %1495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i338 = icmp eq i32 %1495, 0, !dbg !90 + br i1 %.not.i338, label %1498, label %1496, !dbg !90 + +1496: ; preds = %__nv_exp2f.exit337 + %1497 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1359) #2, !dbg !90 + br label %__nv_exp2f.exit340, !dbg !90 + +1498: ; preds = %__nv_exp2f.exit337 + %1499 = tail call float @llvm.nvvm.ex2.approx.f(float %1359) #2, !dbg !90 + br label %__nv_exp2f.exit340, !dbg !90 + +__nv_exp2f.exit340: ; preds = %1496, %1498 + %.0.i339 = phi float [ %1497, %1496 ], [ %1499, %1498 ], !dbg !90 + %1500 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i341 = icmp eq i32 %1500, 0, !dbg !90 + br i1 %.not.i341, label %1503, label %1501, !dbg !90 + +1501: ; preds = %__nv_exp2f.exit340 + %1502 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1360) #2, !dbg !90 + br label %__nv_exp2f.exit343, !dbg !90 + +1503: ; preds = %__nv_exp2f.exit340 + %1504 = tail call float @llvm.nvvm.ex2.approx.f(float %1360) #2, !dbg !90 + br label %__nv_exp2f.exit343, !dbg !90 + +__nv_exp2f.exit343: ; preds = %1501, %1503 + %.0.i342 = phi float [ %1502, %1501 ], [ %1504, %1503 ], !dbg !90 + %1505 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i344 = icmp eq i32 %1505, 0, !dbg !90 + br i1 %.not.i344, label %1508, label %1506, !dbg !90 + +1506: ; preds = %__nv_exp2f.exit343 + %1507 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1361) #2, !dbg !90 + br label %__nv_exp2f.exit346, !dbg !90 + +1508: ; preds = %__nv_exp2f.exit343 + %1509 = tail call float @llvm.nvvm.ex2.approx.f(float %1361) #2, !dbg !90 + br label %__nv_exp2f.exit346, !dbg !90 + +__nv_exp2f.exit346: ; preds = %1506, %1508 + %.0.i345 = phi float [ %1507, %1506 ], [ %1509, %1508 ], !dbg !90 + %1510 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i347 = icmp eq i32 %1510, 0, !dbg !90 + br i1 %.not.i347, label %1513, label %1511, !dbg !90 + +1511: ; preds = %__nv_exp2f.exit346 + %1512 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1362) #2, !dbg !90 + br label %__nv_exp2f.exit349, !dbg !90 + +1513: ; preds = %__nv_exp2f.exit346 + %1514 = tail call float @llvm.nvvm.ex2.approx.f(float %1362) #2, !dbg !90 + br label %__nv_exp2f.exit349, !dbg !90 + +__nv_exp2f.exit349: ; preds = %1511, %1513 + %.0.i348 = phi float [ %1512, %1511 ], [ %1514, %1513 ], !dbg !90 + %1515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i350 = icmp eq i32 %1515, 0, !dbg !90 + br i1 %.not.i350, label %1518, label %1516, !dbg !90 + +1516: ; preds = %__nv_exp2f.exit349 + %1517 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1363) #2, !dbg !90 + br label %__nv_exp2f.exit352, !dbg !90 + +1518: ; preds = %__nv_exp2f.exit349 + %1519 = tail call float @llvm.nvvm.ex2.approx.f(float %1363) #2, !dbg !90 + br label %__nv_exp2f.exit352, !dbg !90 + +__nv_exp2f.exit352: ; preds = %1516, %1518 + %.0.i351 = phi float [ %1517, %1516 ], [ %1519, %1518 ], !dbg !90 + %1520 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !90 + %.not.i353 = icmp eq i32 %1520, 0, !dbg !90 + br i1 %.not.i353, label %1523, label %1521, !dbg !90 + +1521: ; preds = %__nv_exp2f.exit352 + %1522 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %1364) #2, !dbg !90 + br label %__nv_exp2f.exit355, !dbg !90 + +1523: ; preds = %__nv_exp2f.exit352 + %1524 = tail call float @llvm.nvvm.ex2.approx.f(float %1364) #2, !dbg !90 + br label %__nv_exp2f.exit355, !dbg !90 + +__nv_exp2f.exit355: ; preds = %1521, %1523 + %.0.i354 = phi float [ %1522, %1521 ], [ %1524, %1523 ], !dbg !90 + %1525 = fmul float %351, %.0.i255, !dbg !91 + %1526 = fmul float %352, %.0.i258, !dbg !91 + %1527 = fadd float %.0.i261, %.0.i264, !dbg !92 + %1528 = fadd float %.0.i267, %.0.i270, !dbg !92 + %1529 = fadd float %1527, %.0.i273, !dbg !92 + %1530 = fadd float %1529, %.0.i276, !dbg !92 + %1531 = fadd float %1528, %.0.i279, !dbg !92 + %1532 = fadd float %1531, %.0.i282, !dbg !92 + %1533 = fadd float %1530, %.0.i285, !dbg !92 + %1534 = fadd float %1533, %.0.i288, !dbg !92 + %1535 = fadd float %1532, %.0.i291, !dbg !92 + %1536 = fadd float %1535, %.0.i294, !dbg !92 + %1537 = fadd float %1534, %.0.i297, !dbg !92 + %1538 = fadd float %1537, %.0.i300, !dbg !92 + %1539 = fadd float %1536, %.0.i303, !dbg !92 + %1540 = fadd float %1539, %.0.i306, !dbg !92 + %1541 = fadd float %1538, %.0.i309, !dbg !92 + %1542 = fadd float %1541, %.0.i312, !dbg !92 + %1543 = fadd float %1540, %.0.i315, !dbg !92 + %1544 = fadd float %1543, %.0.i318, !dbg !92 + %1545 = fadd float %1542, %.0.i321, !dbg !92 + %1546 = fadd float %1545, %.0.i324, !dbg !92 + %1547 = fadd float %1544, %.0.i327, !dbg !92 + %1548 = fadd float %1547, %.0.i330, !dbg !92 + %1549 = fadd float %1546, %.0.i333, !dbg !92 + %1550 = fadd float %1549, %.0.i336, !dbg !92 + %1551 = fadd float %1548, %.0.i339, !dbg !92 + %1552 = fadd float %1551, %.0.i342, !dbg !92 + %1553 = fadd float %1550, %.0.i345, !dbg !92 + %1554 = fadd float %1553, %.0.i348, !dbg !92 + %1555 = fadd float %1552, %.0.i351, !dbg !92 + %1556 = fadd float %1555, %.0.i354, !dbg !92 + %1557 = bitcast float %1554 to i32, !dbg !93 + %1558 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1557, i32 2, i32 31), !dbg !93 + %1559 = bitcast i32 %1558 to float, !dbg !93 + %1560 = fadd float %1554, %1559, !dbg !92 + %1561 = bitcast float %1560 to i32, !dbg !93 + %1562 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1561, i32 1, i32 31), !dbg !93 + %1563 = bitcast i32 %1562 to float, !dbg !93 + %1564 = fadd float %1560, %1563, !dbg !92 + %1565 = bitcast float %1556 to i32, !dbg !93 + %1566 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1565, i32 2, i32 31), !dbg !93 + %1567 = bitcast i32 %1566 to float, !dbg !93 + %1568 = fadd float %1556, %1567, !dbg !92 + %1569 = bitcast float %1568 to i32, !dbg !93 + %1570 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1569, i32 1, i32 31), !dbg !93 + %1571 = bitcast i32 %1570 to float, !dbg !93 + %1572 = fadd float %1568, %1571, !dbg !92 + %1573 = fadd float %1525, %1564, !dbg !94 + %1574 = fadd float %1526, %1572, !dbg !94 + %1575 = fmul float %807, %.0.i255, !dbg !95 + %1576 = fmul float %808, %.0.i255, !dbg !95 + %1577 = fmul float %809, %.0.i258, !dbg !95 + %1578 = fmul float %810, %.0.i258, !dbg !95 + %1579 = fmul float %811, %.0.i255, !dbg !95 + %1580 = fmul float %812, %.0.i255, !dbg !95 + %1581 = fmul float %813, %.0.i258, !dbg !95 + %1582 = fmul float %814, %.0.i258, !dbg !95 + %1583 = fmul float %815, %.0.i255, !dbg !95 + %1584 = fmul float %816, %.0.i255, !dbg !95 + %1585 = fmul float %817, %.0.i258, !dbg !95 + %1586 = fmul float %818, %.0.i258, !dbg !95 + %1587 = fmul float %819, %.0.i255, !dbg !95 + %1588 = fmul float %820, %.0.i255, !dbg !95 + %1589 = fmul float %821, %.0.i258, !dbg !95 + %1590 = fmul float %822, %.0.i258, !dbg !95 + %1591 = fmul float %823, %.0.i255, !dbg !95 + %1592 = fmul float %824, %.0.i255, !dbg !95 + %1593 = fmul float %825, %.0.i258, !dbg !95 + %1594 = fmul float %826, %.0.i258, !dbg !95 + %1595 = fmul float %827, %.0.i255, !dbg !95 + %1596 = fmul float %828, %.0.i255, !dbg !95 + %1597 = fmul float %829, %.0.i258, !dbg !95 + %1598 = fmul float %830, %.0.i258, !dbg !95 + %1599 = fmul float %831, %.0.i255, !dbg !95 + %1600 = fmul float %832, %.0.i255, !dbg !95 + %1601 = fmul float %833, %.0.i258, !dbg !95 + %1602 = fmul float %834, %.0.i258, !dbg !95 + %1603 = fmul float %835, %.0.i255, !dbg !95 + %1604 = fmul float %836, %.0.i255, !dbg !95 + %1605 = fmul float %837, %.0.i258, !dbg !95 + %1606 = fmul float %838, %.0.i258, !dbg !95 + %1607 = fmul float %839, %.0.i255, !dbg !95 + %1608 = fmul float %840, %.0.i255, !dbg !95 + %1609 = fmul float %841, %.0.i258, !dbg !95 + %1610 = fmul float %842, %.0.i258, !dbg !95 + %1611 = fmul float %843, %.0.i255, !dbg !95 + %1612 = fmul float %844, %.0.i255, !dbg !95 + %1613 = fmul float %845, %.0.i258, !dbg !95 + %1614 = fmul float %846, %.0.i258, !dbg !95 + %1615 = fmul float %847, %.0.i255, !dbg !95 + %1616 = fmul float %848, %.0.i255, !dbg !95 + %1617 = fmul float %849, %.0.i258, !dbg !95 + %1618 = fmul float %850, %.0.i258, !dbg !95 + %1619 = fmul float %851, %.0.i255, !dbg !95 + %1620 = fmul float %852, %.0.i255, !dbg !95 + %1621 = fmul float %853, %.0.i258, !dbg !95 + %1622 = fmul float %854, %.0.i258, !dbg !95 + %1623 = fmul float %855, %.0.i255, !dbg !95 + %1624 = fmul float %856, %.0.i255, !dbg !95 + %1625 = fmul float %857, %.0.i258, !dbg !95 + %1626 = fmul float %858, %.0.i258, !dbg !95 + %1627 = fmul float %859, %.0.i255, !dbg !95 + %1628 = fmul float %860, %.0.i255, !dbg !95 + %1629 = fmul float %861, %.0.i258, !dbg !95 + %1630 = fmul float %862, %.0.i258, !dbg !95 + %1631 = fmul float %863, %.0.i255, !dbg !95 + %1632 = fmul float %864, %.0.i255, !dbg !95 + %1633 = fmul float %865, %.0.i258, !dbg !95 + %1634 = fmul float %866, %.0.i258, !dbg !95 + %1635 = fmul float %867, %.0.i255, !dbg !95 + %1636 = fmul float %868, %.0.i255, !dbg !95 + %1637 = fmul float %869, %.0.i258, !dbg !95 + %1638 = fmul float %870, %.0.i258, !dbg !95 + %1639 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %425, !dbg !61 + %1640 = insertelement <2 x float> poison, float %.0.i261, i64 0, !dbg !96 + %1641 = insertelement <2 x float> %1640, float %.0.i264, i64 1, !dbg !96 + %1642 = fptrunc <2 x float> %1641 to <2 x bfloat>, !dbg !96 + %1643 = insertelement <2 x float> poison, float %.0.i267, i64 0, !dbg !96 + %1644 = insertelement <2 x float> %1643, float %.0.i270, i64 1, !dbg !96 + %1645 = fptrunc <2 x float> %1644 to <2 x bfloat>, !dbg !96 + %1646 = insertelement <2 x float> poison, float %.0.i273, i64 0, !dbg !96 + %1647 = insertelement <2 x float> %1646, float %.0.i276, i64 1, !dbg !96 + %1648 = fptrunc <2 x float> %1647 to <2 x bfloat>, !dbg !96 + %1649 = insertelement <2 x float> poison, float %.0.i279, i64 0, !dbg !96 + %1650 = insertelement <2 x float> %1649, float %.0.i282, i64 1, !dbg !96 + %1651 = fptrunc <2 x float> %1650 to <2 x bfloat>, !dbg !96 + %1652 = insertelement <2 x float> poison, float %.0.i285, i64 0, !dbg !96 + %1653 = insertelement <2 x float> %1652, float %.0.i288, i64 1, !dbg !96 + %1654 = fptrunc <2 x float> %1653 to <2 x bfloat>, !dbg !96 + %1655 = insertelement <2 x float> poison, float %.0.i291, i64 0, !dbg !96 + %1656 = insertelement <2 x float> %1655, float %.0.i294, i64 1, !dbg !96 + %1657 = fptrunc <2 x float> %1656 to <2 x bfloat>, !dbg !96 + %1658 = insertelement <2 x float> poison, float %.0.i297, i64 0, !dbg !96 + %1659 = insertelement <2 x float> %1658, float %.0.i300, i64 1, !dbg !96 + %1660 = fptrunc <2 x float> %1659 to <2 x bfloat>, !dbg !96 + %1661 = insertelement <2 x float> poison, float %.0.i303, i64 0, !dbg !96 + %1662 = insertelement <2 x float> %1661, float %.0.i306, i64 1, !dbg !96 + %1663 = fptrunc <2 x float> %1662 to <2 x bfloat>, !dbg !96 + %1664 = insertelement <2 x float> poison, float %.0.i309, i64 0, !dbg !96 + %1665 = insertelement <2 x float> %1664, float %.0.i312, i64 1, !dbg !96 + %1666 = fptrunc <2 x float> %1665 to <2 x bfloat>, !dbg !96 + %1667 = insertelement <2 x float> poison, float %.0.i315, i64 0, !dbg !96 + %1668 = insertelement <2 x float> %1667, float %.0.i318, i64 1, !dbg !96 + %1669 = fptrunc <2 x float> %1668 to <2 x bfloat>, !dbg !96 + %1670 = insertelement <2 x float> poison, float %.0.i321, i64 0, !dbg !96 + %1671 = insertelement <2 x float> %1670, float %.0.i324, i64 1, !dbg !96 + %1672 = fptrunc <2 x float> %1671 to <2 x bfloat>, !dbg !96 + %1673 = insertelement <2 x float> poison, float %.0.i327, i64 0, !dbg !96 + %1674 = insertelement <2 x float> %1673, float %.0.i330, i64 1, !dbg !96 + %1675 = fptrunc <2 x float> %1674 to <2 x bfloat>, !dbg !96 + %1676 = insertelement <2 x float> poison, float %.0.i333, i64 0, !dbg !96 + %1677 = insertelement <2 x float> %1676, float %.0.i336, i64 1, !dbg !96 + %1678 = fptrunc <2 x float> %1677 to <2 x bfloat>, !dbg !96 + %1679 = insertelement <2 x float> poison, float %.0.i339, i64 0, !dbg !96 + %1680 = insertelement <2 x float> %1679, float %.0.i342, i64 1, !dbg !96 + %1681 = fptrunc <2 x float> %1680 to <2 x bfloat>, !dbg !96 + %1682 = insertelement <2 x float> poison, float %.0.i345, i64 0, !dbg !96 + %1683 = insertelement <2 x float> %1682, float %.0.i348, i64 1, !dbg !96 + %1684 = fptrunc <2 x float> %1683 to <2 x bfloat>, !dbg !96 + %1685 = insertelement <2 x float> poison, float %.0.i351, i64 0, !dbg !96 + %1686 = insertelement <2 x float> %1685, float %.0.i354, i64 1, !dbg !96 + %1687 = fptrunc <2 x float> %1686 to <2 x bfloat>, !dbg !96 + %1688 = bitcast <2 x bfloat> %1642 to i32, !dbg !97 + %1689 = bitcast <2 x bfloat> %1645 to i32, !dbg !97 + %1690 = bitcast <2 x bfloat> %1648 to i32, !dbg !97 + %1691 = bitcast <2 x bfloat> %1651 to i32, !dbg !97 + %1692 = bitcast <2 x bfloat> %1654 to i32, !dbg !97 + %1693 = bitcast <2 x bfloat> %1657 to i32, !dbg !97 + %1694 = bitcast <2 x bfloat> %1660 to i32, !dbg !97 + %1695 = bitcast <2 x bfloat> %1663 to i32, !dbg !97 + %1696 = bitcast <2 x bfloat> %1666 to i32, !dbg !97 + %1697 = bitcast <2 x bfloat> %1669 to i32, !dbg !97 + %1698 = bitcast <2 x bfloat> %1672 to i32, !dbg !97 + %1699 = bitcast <2 x bfloat> %1675 to i32, !dbg !97 + %1700 = bitcast <2 x bfloat> %1678 to i32, !dbg !97 + %1701 = bitcast <2 x bfloat> %1681 to i32, !dbg !97 + %1702 = bitcast <2 x bfloat> %1684 to i32, !dbg !97 + %1703 = bitcast <2 x bfloat> %1687 to i32, !dbg !97 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !97 + %1704 = ptrtoint ptr addrspace(3) %1639 to i32, !dbg !97 + %1705 = lshr exact i32 %1704, 4, !dbg !97 + %1706 = and i32 %1705, 16383, !dbg !97 + %1707 = zext nneg i32 %1706 to i64, !dbg !97 + %1708 = or disjoint i64 %1707, 4611686293338849280, !dbg !97 + %1709 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1575, float %1576, float %1577, float %1578, float %1579, float %1580, float %1581, float %1582, float %1583, float %1584, float %1585, float %1586, float %1587, float %1588, float %1589, float %1590, float %1591, float %1592, float %1593, float %1594, float %1595, float %1596, float %1597, float %1598, float %1599, float %1600, float %1601, float %1602, float %1603, float %1604, float %1605, float %1606, float %1607, float %1608, float %1609, float %1610, float %1611, float %1612, float %1613, float %1614, float %1615, float %1616, float %1617, float %1618, float %1619, float %1620, float %1621, float %1622, float %1623, float %1624, float %1625, float %1626, float %1627, float %1628, float %1629, float %1630, float %1631, float %1632, float %1633, float %1634, float %1635, float %1636, float %1637, float %1638, i32 %1688, i32 %1689, i32 %1690, i32 %1691, i64 %1708, i1 true) #2, !dbg !97 + %1710 = add i32 %1704, 2048, !dbg !97 + %1711 = lshr exact i32 %1710, 4, !dbg !97 + %1712 = and i32 %1711, 16383, !dbg !97 + %1713 = zext nneg i32 %1712 to i64, !dbg !97 + %1714 = or disjoint i64 %1713, 4611686293338849280, !dbg !97 + %1715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 0, !dbg !97 + %1716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 1, !dbg !97 + %1717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 2, !dbg !97 + %1718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 3, !dbg !97 + %1719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 4, !dbg !97 + %1720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 5, !dbg !97 + %1721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 6, !dbg !97 + %1722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 7, !dbg !97 + %1723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 8, !dbg !97 + %1724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 9, !dbg !97 + %1725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 10, !dbg !97 + %1726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 11, !dbg !97 + %1727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 12, !dbg !97 + %1728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 13, !dbg !97 + %1729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 14, !dbg !97 + %1730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 15, !dbg !97 + %1731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 16, !dbg !97 + %1732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 17, !dbg !97 + %1733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 18, !dbg !97 + %1734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 19, !dbg !97 + %1735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 20, !dbg !97 + %1736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 21, !dbg !97 + %1737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 22, !dbg !97 + %1738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 23, !dbg !97 + %1739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 24, !dbg !97 + %1740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 25, !dbg !97 + %1741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 26, !dbg !97 + %1742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 27, !dbg !97 + %1743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 28, !dbg !97 + %1744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 29, !dbg !97 + %1745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 30, !dbg !97 + %1746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 31, !dbg !97 + %1747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 32, !dbg !97 + %1748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 33, !dbg !97 + %1749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 34, !dbg !97 + %1750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 35, !dbg !97 + %1751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 36, !dbg !97 + %1752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 37, !dbg !97 + %1753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 38, !dbg !97 + %1754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 39, !dbg !97 + %1755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 40, !dbg !97 + %1756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 41, !dbg !97 + %1757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 42, !dbg !97 + %1758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 43, !dbg !97 + %1759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 44, !dbg !97 + %1760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 45, !dbg !97 + %1761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 46, !dbg !97 + %1762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 47, !dbg !97 + %1763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 48, !dbg !97 + %1764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 49, !dbg !97 + %1765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 50, !dbg !97 + %1766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 51, !dbg !97 + %1767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 52, !dbg !97 + %1768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 53, !dbg !97 + %1769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 54, !dbg !97 + %1770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 55, !dbg !97 + %1771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 56, !dbg !97 + %1772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 57, !dbg !97 + %1773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 58, !dbg !97 + %1774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 59, !dbg !97 + %1775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 60, !dbg !97 + %1776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 61, !dbg !97 + %1777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 62, !dbg !97 + %1778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1709, 63, !dbg !97 + %1779 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1715, float %1716, float %1717, float %1718, float %1719, float %1720, float %1721, float %1722, float %1723, float %1724, float %1725, float %1726, float %1727, float %1728, float %1729, float %1730, float %1731, float %1732, float %1733, float %1734, float %1735, float %1736, float %1737, float %1738, float %1739, float %1740, float %1741, float %1742, float %1743, float %1744, float %1745, float %1746, float %1747, float %1748, float %1749, float %1750, float %1751, float %1752, float %1753, float %1754, float %1755, float %1756, float %1757, float %1758, float %1759, float %1760, float %1761, float %1762, float %1763, float %1764, float %1765, float %1766, float %1767, float %1768, float %1769, float %1770, float %1771, float %1772, float %1773, float %1774, float %1775, float %1776, float %1777, float %1778, i32 %1692, i32 %1693, i32 %1694, i32 %1695, i64 %1714, i1 true) #2, !dbg !97 + %1780 = add i32 %1704, 4096, !dbg !97 + %1781 = lshr exact i32 %1780, 4, !dbg !97 + %1782 = and i32 %1781, 16383, !dbg !97 + %1783 = zext nneg i32 %1782 to i64, !dbg !97 + %1784 = or disjoint i64 %1783, 4611686293338849280, !dbg !97 + %1785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 0, !dbg !97 + %1786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 1, !dbg !97 + %1787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 2, !dbg !97 + %1788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 3, !dbg !97 + %1789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 4, !dbg !97 + %1790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 5, !dbg !97 + %1791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 6, !dbg !97 + %1792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 7, !dbg !97 + %1793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 8, !dbg !97 + %1794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 9, !dbg !97 + %1795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 10, !dbg !97 + %1796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 11, !dbg !97 + %1797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 12, !dbg !97 + %1798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 13, !dbg !97 + %1799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 14, !dbg !97 + %1800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 15, !dbg !97 + %1801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 16, !dbg !97 + %1802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 17, !dbg !97 + %1803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 18, !dbg !97 + %1804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 19, !dbg !97 + %1805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 20, !dbg !97 + %1806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 21, !dbg !97 + %1807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 22, !dbg !97 + %1808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 23, !dbg !97 + %1809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 24, !dbg !97 + %1810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 25, !dbg !97 + %1811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 26, !dbg !97 + %1812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 27, !dbg !97 + %1813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 28, !dbg !97 + %1814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 29, !dbg !97 + %1815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 30, !dbg !97 + %1816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 31, !dbg !97 + %1817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 32, !dbg !97 + %1818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 33, !dbg !97 + %1819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 34, !dbg !97 + %1820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 35, !dbg !97 + %1821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 36, !dbg !97 + %1822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 37, !dbg !97 + %1823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 38, !dbg !97 + %1824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 39, !dbg !97 + %1825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 40, !dbg !97 + %1826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 41, !dbg !97 + %1827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 42, !dbg !97 + %1828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 43, !dbg !97 + %1829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 44, !dbg !97 + %1830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 45, !dbg !97 + %1831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 46, !dbg !97 + %1832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 47, !dbg !97 + %1833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 48, !dbg !97 + %1834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 49, !dbg !97 + %1835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 50, !dbg !97 + %1836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 51, !dbg !97 + %1837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 52, !dbg !97 + %1838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 53, !dbg !97 + %1839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 54, !dbg !97 + %1840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 55, !dbg !97 + %1841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 56, !dbg !97 + %1842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 57, !dbg !97 + %1843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 58, !dbg !97 + %1844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 59, !dbg !97 + %1845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 60, !dbg !97 + %1846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 61, !dbg !97 + %1847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 62, !dbg !97 + %1848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1779, 63, !dbg !97 + %1849 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1785, float %1786, float %1787, float %1788, float %1789, float %1790, float %1791, float %1792, float %1793, float %1794, float %1795, float %1796, float %1797, float %1798, float %1799, float %1800, float %1801, float %1802, float %1803, float %1804, float %1805, float %1806, float %1807, float %1808, float %1809, float %1810, float %1811, float %1812, float %1813, float %1814, float %1815, float %1816, float %1817, float %1818, float %1819, float %1820, float %1821, float %1822, float %1823, float %1824, float %1825, float %1826, float %1827, float %1828, float %1829, float %1830, float %1831, float %1832, float %1833, float %1834, float %1835, float %1836, float %1837, float %1838, float %1839, float %1840, float %1841, float %1842, float %1843, float %1844, float %1845, float %1846, float %1847, float %1848, i32 %1696, i32 %1697, i32 %1698, i32 %1699, i64 %1784, i1 true) #2, !dbg !97 + %1850 = add i32 %1704, 6144, !dbg !97 + %1851 = lshr exact i32 %1850, 4, !dbg !97 + %1852 = and i32 %1851, 16383, !dbg !97 + %1853 = zext nneg i32 %1852 to i64, !dbg !97 + %1854 = or disjoint i64 %1853, 4611686293338849280, !dbg !97 + %1855 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 0, !dbg !97 + %1856 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 1, !dbg !97 + %1857 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 2, !dbg !97 + %1858 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 3, !dbg !97 + %1859 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 4, !dbg !97 + %1860 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 5, !dbg !97 + %1861 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 6, !dbg !97 + %1862 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 7, !dbg !97 + %1863 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 8, !dbg !97 + %1864 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 9, !dbg !97 + %1865 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 10, !dbg !97 + %1866 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 11, !dbg !97 + %1867 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 12, !dbg !97 + %1868 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 13, !dbg !97 + %1869 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 14, !dbg !97 + %1870 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 15, !dbg !97 + %1871 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 16, !dbg !97 + %1872 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 17, !dbg !97 + %1873 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 18, !dbg !97 + %1874 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 19, !dbg !97 + %1875 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 20, !dbg !97 + %1876 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 21, !dbg !97 + %1877 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 22, !dbg !97 + %1878 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 23, !dbg !97 + %1879 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 24, !dbg !97 + %1880 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 25, !dbg !97 + %1881 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 26, !dbg !97 + %1882 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 27, !dbg !97 + %1883 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 28, !dbg !97 + %1884 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 29, !dbg !97 + %1885 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 30, !dbg !97 + %1886 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 31, !dbg !97 + %1887 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 32, !dbg !97 + %1888 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 33, !dbg !97 + %1889 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 34, !dbg !97 + %1890 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 35, !dbg !97 + %1891 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 36, !dbg !97 + %1892 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 37, !dbg !97 + %1893 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 38, !dbg !97 + %1894 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 39, !dbg !97 + %1895 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 40, !dbg !97 + %1896 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 41, !dbg !97 + %1897 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 42, !dbg !97 + %1898 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 43, !dbg !97 + %1899 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 44, !dbg !97 + %1900 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 45, !dbg !97 + %1901 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 46, !dbg !97 + %1902 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 47, !dbg !97 + %1903 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 48, !dbg !97 + %1904 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 49, !dbg !97 + %1905 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 50, !dbg !97 + %1906 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 51, !dbg !97 + %1907 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 52, !dbg !97 + %1908 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 53, !dbg !97 + %1909 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 54, !dbg !97 + %1910 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 55, !dbg !97 + %1911 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 56, !dbg !97 + %1912 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 57, !dbg !97 + %1913 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 58, !dbg !97 + %1914 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 59, !dbg !97 + %1915 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 60, !dbg !97 + %1916 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 61, !dbg !97 + %1917 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 62, !dbg !97 + %1918 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1849, 63, !dbg !97 + %1919 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %1855, float %1856, float %1857, float %1858, float %1859, float %1860, float %1861, float %1862, float %1863, float %1864, float %1865, float %1866, float %1867, float %1868, float %1869, float %1870, float %1871, float %1872, float %1873, float %1874, float %1875, float %1876, float %1877, float %1878, float %1879, float %1880, float %1881, float %1882, float %1883, float %1884, float %1885, float %1886, float %1887, float %1888, float %1889, float %1890, float %1891, float %1892, float %1893, float %1894, float %1895, float %1896, float %1897, float %1898, float %1899, float %1900, float %1901, float %1902, float %1903, float %1904, float %1905, float %1906, float %1907, float %1908, float %1909, float %1910, float %1911, float %1912, float %1913, float %1914, float %1915, float %1916, float %1917, float %1918, i32 %1700, i32 %1701, i32 %1702, i32 %1703, i64 %1854, i1 true) #2, !dbg !97 + %1920 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 0, !dbg !97 + %1921 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 1, !dbg !97 + %1922 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 2, !dbg !97 + %1923 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 3, !dbg !97 + %1924 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 4, !dbg !97 + %1925 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 5, !dbg !97 + %1926 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 6, !dbg !97 + %1927 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 7, !dbg !97 + %1928 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 8, !dbg !97 + %1929 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 9, !dbg !97 + %1930 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 10, !dbg !97 + %1931 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 11, !dbg !97 + %1932 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 12, !dbg !97 + %1933 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 13, !dbg !97 + %1934 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 14, !dbg !97 + %1935 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 15, !dbg !97 + %1936 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 16, !dbg !97 + %1937 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 17, !dbg !97 + %1938 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 18, !dbg !97 + %1939 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 19, !dbg !97 + %1940 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 20, !dbg !97 + %1941 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 21, !dbg !97 + %1942 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 22, !dbg !97 + %1943 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 23, !dbg !97 + %1944 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 24, !dbg !97 + %1945 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 25, !dbg !97 + %1946 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 26, !dbg !97 + %1947 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 27, !dbg !97 + %1948 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 28, !dbg !97 + %1949 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 29, !dbg !97 + %1950 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 30, !dbg !97 + %1951 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 31, !dbg !97 + %1952 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 32, !dbg !97 + %1953 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 33, !dbg !97 + %1954 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 34, !dbg !97 + %1955 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 35, !dbg !97 + %1956 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 36, !dbg !97 + %1957 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 37, !dbg !97 + %1958 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 38, !dbg !97 + %1959 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 39, !dbg !97 + %1960 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 40, !dbg !97 + %1961 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 41, !dbg !97 + %1962 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 42, !dbg !97 + %1963 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 43, !dbg !97 + %1964 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 44, !dbg !97 + %1965 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 45, !dbg !97 + %1966 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 46, !dbg !97 + %1967 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 47, !dbg !97 + %1968 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 48, !dbg !97 + %1969 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 49, !dbg !97 + %1970 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 50, !dbg !97 + %1971 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 51, !dbg !97 + %1972 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 52, !dbg !97 + %1973 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 53, !dbg !97 + %1974 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 54, !dbg !97 + %1975 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 55, !dbg !97 + %1976 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 56, !dbg !97 + %1977 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 57, !dbg !97 + %1978 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 58, !dbg !97 + %1979 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 59, !dbg !97 + %1980 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 60, !dbg !97 + %1981 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 61, !dbg !97 + %1982 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 62, !dbg !97 + %1983 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %1919, 63, !dbg !97 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !97 + %1984 = insertelement <16 x i32> poison, i32 %347, i64 0, !dbg !98 + %1985 = shufflevector <16 x i32> %1984, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !98 + %1986 = add <16 x i32> %1985, %419, !dbg !98 + %1987 = add nuw nsw i32 %417, 1, !dbg !52 + %1988 = lshr i32 %1987, 1, !dbg !99 + %1989 = zext nneg i32 %1988 to i64, !dbg !100 + %1990 = getelementptr i32, ptr addrspace(1) %187, i64 %1989, !dbg !100 + %1991 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !101 + %1992 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1990, i64 %1991, i1 %421) #2, !dbg !101 + %1993 = add nuw nsw i32 %1988, 1, !dbg !102 + %1994 = icmp slt i32 %1993, %192, !dbg !103 + %1995 = getelementptr i8, ptr addrspace(1) %1990, i64 4, !dbg !104 + %1996 = and i1 %421, %1994, !dbg !52 + %1997 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !105 + %1998 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %1995, i64 %1997, i1 %1996) #2, !dbg !105 + %1999 = and i32 %417, 1, !dbg !106 + %2000 = sub i32 %1998, %1992, !dbg !107 + %2001 = shl i32 %2000, 7, !dbg !108 + %2002 = add i32 %2001, -64, !dbg !109 + %2003 = xor i32 %1999, 1, !dbg !110 + %2004 = mul nuw nsw i32 %2002, %2003, !dbg !110 + %2005 = shl nuw nsw i32 %1999, 6, !dbg !111 + %2006 = add i32 %2004, %2005, !dbg !112 + %2007 = add i32 %2006, %350, !dbg !113 + %2008 = add i32 %349, 1, !dbg !52 + %2009 = icmp sgt i32 %2008, 2, !dbg !52 + %2010 = select i1 %2009, i32 0, i32 %2008, !dbg !52 + %2011 = add i32 %2007, %189, !dbg !62 + %2012 = add i32 %2011, %37, !dbg !56 + %2013 = add i32 %2011, %38, !dbg !56 + %2014 = add i32 %2011, %39, !dbg !56 + %2015 = add i32 %2011, %40, !dbg !56 + %2016 = shl i32 %2012, 7, !dbg !57 + %2017 = shl i32 %2013, 7, !dbg !57 + %2018 = shl i32 %2014, 7, !dbg !57 + %2019 = shl i32 %2015, 7, !dbg !57 + %2020 = sext i32 %2016 to i64, !dbg !58 + %2021 = sext i32 %2017 to i64, !dbg !58 + %2022 = sext i32 %2018 to i64, !dbg !58 + %2023 = sext i32 %2019 to i64, !dbg !58 + %gep = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2020, !dbg !59 + %gep396 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2021, !dbg !59 + %gep398 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2022, !dbg !59 + %gep400 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %2023, !dbg !59 + %2024 = icmp slt i32 %2012, %11, !dbg !60 + %2025 = icmp slt i32 %2013, %11, !dbg !60 + %2026 = icmp slt i32 %2014, %11, !dbg !60 + %2027 = icmp slt i32 %2015, %11, !dbg !60 + %2028 = shl i32 %2010, 13, !dbg !61 + %2029 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2028, !dbg !61 + %2030 = and i1 %420, %2024, !dbg !52 + %2031 = and i1 %420, %2025, !dbg !52 + %2032 = and i1 %420, %2026, !dbg !52 + %2033 = and i1 %420, %2027, !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !61 + %2034 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %255, !dbg !61 + %2035 = select i1 %2030, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2034, ptr addrspace(1) %gep, i32 %2035) #2, !dbg !61 + %2036 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %258, !dbg !61 + %2037 = select i1 %2031, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2036, ptr addrspace(1) %gep396, i32 %2037) #2, !dbg !61 + %2038 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %261, !dbg !61 + %2039 = select i1 %2032, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2038, ptr addrspace(1) %gep398, i32 %2039) #2, !dbg !61 + %2040 = getelementptr inbounds nuw i8, ptr addrspace(3) %2029, i32 %264, !dbg !61 + %2041 = select i1 %2033, i32 16, i32 0, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2040, ptr addrspace(1) %gep400, i32 %2041) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %gep402 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2020, !dbg !59 + %gep404 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2021, !dbg !59 + %gep406 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2022, !dbg !59 + %gep408 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %2023, !dbg !59 + %2042 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2028, !dbg !61 + %2043 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %255, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %2043, ptr addrspace(1) %gep402, i32 %2035) #2, !dbg !61 + %2044 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %258, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2044, ptr addrspace(1) %gep404, i32 %2037) #2, !dbg !61 + %2045 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %261, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2045, ptr addrspace(1) %gep406, i32 %2039) #2, !dbg !61 + %2046 = getelementptr inbounds nuw i8, ptr addrspace(3) %2042, i32 %264, !dbg !61 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %2046, ptr addrspace(1) %gep408, i32 %2041) #2, !dbg !61 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !61 + %exitcond.not = icmp eq i32 %1987, %smax, !dbg !52 + br i1 %exitcond.not, label %._crit_edge, label %346, !dbg !52 + +._crit_edge: ; preds = %__nv_exp2f.exit355, %15 + %2047 = phi float [ 0.000000e+00, %15 ], [ %1920, %__nv_exp2f.exit355 ], !dbg !114 + %2048 = phi float [ 0.000000e+00, %15 ], [ %1921, %__nv_exp2f.exit355 ], !dbg !114 + %2049 = phi float [ 0.000000e+00, %15 ], [ %1922, %__nv_exp2f.exit355 ], !dbg !114 + %2050 = phi float [ 0.000000e+00, %15 ], [ %1923, %__nv_exp2f.exit355 ], !dbg !114 + %2051 = phi float [ 0.000000e+00, %15 ], [ %1924, %__nv_exp2f.exit355 ], !dbg !114 + %2052 = phi float [ 0.000000e+00, %15 ], [ %1925, %__nv_exp2f.exit355 ], !dbg !114 + %2053 = phi float [ 0.000000e+00, %15 ], [ %1926, %__nv_exp2f.exit355 ], !dbg !114 + %2054 = phi float [ 0.000000e+00, %15 ], [ %1927, %__nv_exp2f.exit355 ], !dbg !114 + %2055 = phi float [ 0.000000e+00, %15 ], [ %1928, %__nv_exp2f.exit355 ], !dbg !114 + %2056 = phi float [ 0.000000e+00, %15 ], [ %1929, %__nv_exp2f.exit355 ], !dbg !114 + %2057 = phi float [ 0.000000e+00, %15 ], [ %1930, %__nv_exp2f.exit355 ], !dbg !114 + %2058 = phi float [ 0.000000e+00, %15 ], [ %1931, %__nv_exp2f.exit355 ], !dbg !114 + %2059 = phi float [ 0.000000e+00, %15 ], [ %1932, %__nv_exp2f.exit355 ], !dbg !114 + %2060 = phi float [ 0.000000e+00, %15 ], [ %1933, %__nv_exp2f.exit355 ], !dbg !114 + %2061 = phi float [ 0.000000e+00, %15 ], [ %1934, %__nv_exp2f.exit355 ], !dbg !114 + %2062 = phi float [ 0.000000e+00, %15 ], [ %1935, %__nv_exp2f.exit355 ], !dbg !114 + %2063 = phi float [ 0.000000e+00, %15 ], [ %1936, %__nv_exp2f.exit355 ], !dbg !114 + %2064 = phi float [ 0.000000e+00, %15 ], [ %1937, %__nv_exp2f.exit355 ], !dbg !114 + %2065 = phi float [ 0.000000e+00, %15 ], [ %1938, %__nv_exp2f.exit355 ], !dbg !114 + %2066 = phi float [ 0.000000e+00, %15 ], [ %1939, %__nv_exp2f.exit355 ], !dbg !114 + %2067 = phi float [ 0.000000e+00, %15 ], [ %1940, %__nv_exp2f.exit355 ], !dbg !114 + %2068 = phi float [ 0.000000e+00, %15 ], [ %1941, %__nv_exp2f.exit355 ], !dbg !114 + %2069 = phi float [ 0.000000e+00, %15 ], [ %1942, %__nv_exp2f.exit355 ], !dbg !114 + %2070 = phi float [ 0.000000e+00, %15 ], [ %1943, %__nv_exp2f.exit355 ], !dbg !114 + %2071 = phi float [ 0.000000e+00, %15 ], [ %1944, %__nv_exp2f.exit355 ], !dbg !114 + %2072 = phi float [ 0.000000e+00, %15 ], [ %1945, %__nv_exp2f.exit355 ], !dbg !114 + %2073 = phi float [ 0.000000e+00, %15 ], [ %1946, %__nv_exp2f.exit355 ], !dbg !114 + %2074 = phi float [ 0.000000e+00, %15 ], [ %1947, %__nv_exp2f.exit355 ], !dbg !114 + %2075 = phi float [ 0.000000e+00, %15 ], [ %1948, %__nv_exp2f.exit355 ], !dbg !114 + %2076 = phi float [ 0.000000e+00, %15 ], [ %1949, %__nv_exp2f.exit355 ], !dbg !114 + %2077 = phi float [ 0.000000e+00, %15 ], [ %1950, %__nv_exp2f.exit355 ], !dbg !114 + %2078 = phi float [ 0.000000e+00, %15 ], [ %1951, %__nv_exp2f.exit355 ], !dbg !114 + %2079 = phi float [ 0.000000e+00, %15 ], [ %1952, %__nv_exp2f.exit355 ], !dbg !114 + %2080 = phi float [ 0.000000e+00, %15 ], [ %1953, %__nv_exp2f.exit355 ], !dbg !114 + %2081 = phi float [ 0.000000e+00, %15 ], [ %1954, %__nv_exp2f.exit355 ], !dbg !114 + %2082 = phi float [ 0.000000e+00, %15 ], [ %1955, %__nv_exp2f.exit355 ], !dbg !114 + %2083 = phi float [ 0.000000e+00, %15 ], [ %1956, %__nv_exp2f.exit355 ], !dbg !114 + %2084 = phi float [ 0.000000e+00, %15 ], [ %1957, %__nv_exp2f.exit355 ], !dbg !114 + %2085 = phi float [ 0.000000e+00, %15 ], [ %1958, %__nv_exp2f.exit355 ], !dbg !114 + %2086 = phi float [ 0.000000e+00, %15 ], [ %1959, %__nv_exp2f.exit355 ], !dbg !114 + %2087 = phi float [ 0.000000e+00, %15 ], [ %1960, %__nv_exp2f.exit355 ], !dbg !114 + %2088 = phi float [ 0.000000e+00, %15 ], [ %1961, %__nv_exp2f.exit355 ], !dbg !114 + %2089 = phi float [ 0.000000e+00, %15 ], [ %1962, %__nv_exp2f.exit355 ], !dbg !114 + %2090 = phi float [ 0.000000e+00, %15 ], [ %1963, %__nv_exp2f.exit355 ], !dbg !114 + %2091 = phi float [ 0.000000e+00, %15 ], [ %1964, %__nv_exp2f.exit355 ], !dbg !114 + %2092 = phi float [ 0.000000e+00, %15 ], [ %1965, %__nv_exp2f.exit355 ], !dbg !114 + %2093 = phi float [ 0.000000e+00, %15 ], [ %1966, %__nv_exp2f.exit355 ], !dbg !114 + %2094 = phi float [ 0.000000e+00, %15 ], [ %1967, %__nv_exp2f.exit355 ], !dbg !114 + %2095 = phi float [ 0.000000e+00, %15 ], [ %1968, %__nv_exp2f.exit355 ], !dbg !114 + %2096 = phi float [ 0.000000e+00, %15 ], [ %1969, %__nv_exp2f.exit355 ], !dbg !114 + %2097 = phi float [ 0.000000e+00, %15 ], [ %1970, %__nv_exp2f.exit355 ], !dbg !114 + %2098 = phi float [ 0.000000e+00, %15 ], [ %1971, %__nv_exp2f.exit355 ], !dbg !114 + %2099 = phi float [ 0.000000e+00, %15 ], [ %1972, %__nv_exp2f.exit355 ], !dbg !114 + %2100 = phi float [ 0.000000e+00, %15 ], [ %1973, %__nv_exp2f.exit355 ], !dbg !114 + %2101 = phi float [ 0.000000e+00, %15 ], [ %1974, %__nv_exp2f.exit355 ], !dbg !114 + %2102 = phi float [ 0.000000e+00, %15 ], [ %1975, %__nv_exp2f.exit355 ], !dbg !114 + %2103 = phi float [ 0.000000e+00, %15 ], [ %1976, %__nv_exp2f.exit355 ], !dbg !114 + %2104 = phi float [ 0.000000e+00, %15 ], [ %1977, %__nv_exp2f.exit355 ], !dbg !114 + %2105 = phi float [ 0.000000e+00, %15 ], [ %1978, %__nv_exp2f.exit355 ], !dbg !114 + %2106 = phi float [ 0.000000e+00, %15 ], [ %1979, %__nv_exp2f.exit355 ], !dbg !114 + %2107 = phi float [ 0.000000e+00, %15 ], [ %1980, %__nv_exp2f.exit355 ], !dbg !114 + %2108 = phi float [ 0.000000e+00, %15 ], [ %1981, %__nv_exp2f.exit355 ], !dbg !114 + %2109 = phi float [ 0.000000e+00, %15 ], [ %1982, %__nv_exp2f.exit355 ], !dbg !114 + %2110 = phi float [ 0.000000e+00, %15 ], [ %1983, %__nv_exp2f.exit355 ], !dbg !114 + %2111 = phi float [ 0.000000e+00, %15 ], [ %1573, %__nv_exp2f.exit355 ] + %2112 = phi float [ 0.000000e+00, %15 ], [ %1574, %__nv_exp2f.exit355 ] + %2113 = phi <2 x float> [ splat (float 0xFFF0000000000000), %15 ], [ %1312, %__nv_exp2f.exit355 ] + %2114 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %2047, float %2048, float %2049, float %2050, float %2051, float %2052, float %2053, float %2054, float %2055, float %2056, float %2057, float %2058, float %2059, float %2060, float %2061, float %2062, float %2063, float %2064, float %2065, float %2066, float %2067, float %2068, float %2069, float %2070, float %2071, float %2072, float %2073, float %2074, float %2075, float %2076, float %2077, float %2078, float %2079, float %2080, float %2081, float %2082, float %2083, float %2084, float %2085, float %2086, float %2087, float %2088, float %2089, float %2090, float %2091, float %2092, float %2093, float %2094, float %2095, float %2096, float %2097, float %2098, float %2099, float %2100, float %2101, float %2102, float %2103, float %2104, float %2105, float %2106, float %2107, float %2108, float %2109, float %2110) #2, !dbg !52 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !52 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !52 + %2115 = getelementptr i32, ptr addrspace(1) %8, i64 %186, !dbg !115 + %2116 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2115) #2, !dbg !116 + %2117 = shl i32 %2116, 7, !dbg !117 + %2118 = getelementptr i32, ptr addrspace(1) %7, i64 %190, !dbg !118 + %2119 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l"(ptr addrspace(1) %2118) #2, !dbg !119 + %2120 = shl i32 %2119, 1, !dbg !120 + %2121 = tail call i32 @llvm.smin.i32(i32 %2120, i32 %196), !dbg !121 + %2122 = icmp sgt i32 %2120, 0, !dbg !122 + %2123 = or disjoint i32 %2117, %37, !dbg !124 + %2124 = or disjoint i32 %2117, %38, !dbg !124 + %2125 = or disjoint i32 %2117, %39, !dbg !124 + %2126 = or disjoint i32 %2117, %40, !dbg !124 + %2127 = shl i32 %2123, 7, !dbg !125 + %2128 = shl i32 %2124, 7, !dbg !125 + %2129 = shl i32 %2125, 7, !dbg !125 + %2130 = shl i32 %2126, 7, !dbg !125 + %2131 = sext i32 %2127 to i64, !dbg !126 + %2132 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2131, !dbg !126 + %2133 = sext i32 %2128 to i64, !dbg !126 + %2134 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2133, !dbg !126 + %2135 = sext i32 %2129 to i64, !dbg !126 + %2136 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2135, !dbg !126 + %2137 = sext i32 %2130 to i64, !dbg !126 + %2138 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2137, !dbg !126 + %2139 = getelementptr bfloat, ptr addrspace(1) %2132, i64 %80, !dbg !127 + %2140 = getelementptr bfloat, ptr addrspace(1) %2134, i64 %80, !dbg !127 + %2141 = getelementptr bfloat, ptr addrspace(1) %2136, i64 %80, !dbg !127 + %2142 = getelementptr bfloat, ptr addrspace(1) %2138, i64 %80, !dbg !127 + %2143 = icmp slt i32 %2123, %11, !dbg !128 + %2144 = icmp slt i32 %2124, %11, !dbg !128 + %2145 = icmp slt i32 %2125, %11, !dbg !128 + %2146 = icmp slt i32 %2126, %11, !dbg !128 + %2147 = and i1 %2122, %2143, !dbg !122 + %2148 = and i1 %2122, %2144, !dbg !122 + %2149 = and i1 %2122, %2145, !dbg !122 + %2150 = and i1 %2122, %2146, !dbg !122 + %2151 = select i1 %2147, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %256, ptr addrspace(1) %2139, i32 %2151) #2, !dbg !129 + %2152 = select i1 %2148, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %259, ptr addrspace(1) %2140, i32 %2152) #2, !dbg !129 + %2153 = select i1 %2149, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %262, ptr addrspace(1) %2141, i32 %2153) #2, !dbg !129 + %2154 = select i1 %2150, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %265, ptr addrspace(1) %2142, i32 %2154) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2155 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2131, !dbg !126 + %2156 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2133, !dbg !126 + %2157 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2135, !dbg !126 + %2158 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2137, !dbg !126 + %2159 = getelementptr bfloat, ptr addrspace(1) %2155, i64 %80, !dbg !127 + %2160 = getelementptr bfloat, ptr addrspace(1) %2156, i64 %80, !dbg !127 + %2161 = getelementptr bfloat, ptr addrspace(1) %2157, i64 %80, !dbg !127 + %2162 = getelementptr bfloat, ptr addrspace(1) %2158, i64 %80, !dbg !127 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %275, ptr addrspace(1) %2159, i32 %2151) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %276, ptr addrspace(1) %2160, i32 %2152) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %277, ptr addrspace(1) %2161, i32 %2153) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %278, ptr addrspace(1) %2162, i32 %2154) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2163 = icmp sgt i32 %2121, 1, !dbg !122 + %2164 = or disjoint i32 %2117, 64, !dbg !130 + %2165 = or disjoint i32 %2164, %37, !dbg !124 + %2166 = or disjoint i32 %2164, %38, !dbg !124 + %2167 = or disjoint i32 %2164, %39, !dbg !124 + %2168 = or disjoint i32 %2164, %40, !dbg !124 + %2169 = shl i32 %2165, 7, !dbg !125 + %2170 = shl i32 %2166, 7, !dbg !125 + %2171 = shl i32 %2167, 7, !dbg !125 + %2172 = shl i32 %2168, 7, !dbg !125 + %2173 = sext i32 %2169 to i64, !dbg !126 + %2174 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2173, !dbg !126 + %2175 = sext i32 %2170 to i64, !dbg !126 + %2176 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2175, !dbg !126 + %2177 = sext i32 %2171 to i64, !dbg !126 + %2178 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2177, !dbg !126 + %2179 = sext i32 %2172 to i64, !dbg !126 + %2180 = getelementptr bfloat, ptr addrspace(1) %29, i64 %2179, !dbg !126 + %2181 = getelementptr bfloat, ptr addrspace(1) %2174, i64 %80, !dbg !127 + %2182 = getelementptr bfloat, ptr addrspace(1) %2176, i64 %80, !dbg !127 + %2183 = getelementptr bfloat, ptr addrspace(1) %2178, i64 %80, !dbg !127 + %2184 = getelementptr bfloat, ptr addrspace(1) %2180, i64 %80, !dbg !127 + %2185 = icmp slt i32 %2165, %11, !dbg !128 + %2186 = icmp slt i32 %2166, %11, !dbg !128 + %2187 = icmp slt i32 %2167, %11, !dbg !128 + %2188 = icmp slt i32 %2168, %11, !dbg !128 + %2189 = and i1 %2163, %2185, !dbg !122 + %2190 = and i1 %2163, %2186, !dbg !122 + %2191 = and i1 %2163, %2187, !dbg !122 + %2192 = and i1 %2163, %2188, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %2193 = select i1 %2189, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %309, ptr addrspace(1) %2181, i32 %2193) #2, !dbg !129 + %2194 = select i1 %2190, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %311, ptr addrspace(1) %2182, i32 %2194) #2, !dbg !129 + %2195 = select i1 %2191, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %313, ptr addrspace(1) %2183, i32 %2195) #2, !dbg !129 + %2196 = select i1 %2192, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %315, ptr addrspace(1) %2184, i32 %2196) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %2197 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2173, !dbg !126 + %2198 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2175, !dbg !126 + %2199 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2177, !dbg !126 + %2200 = getelementptr bfloat, ptr addrspace(1) %30, i64 %2179, !dbg !126 + %2201 = getelementptr bfloat, ptr addrspace(1) %2197, i64 %80, !dbg !127 + %2202 = getelementptr bfloat, ptr addrspace(1) %2198, i64 %80, !dbg !127 + %2203 = getelementptr bfloat, ptr addrspace(1) %2199, i64 %80, !dbg !127 + %2204 = getelementptr bfloat, ptr addrspace(1) %2200, i64 %80, !dbg !127 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %325, ptr addrspace(1) %2201, i32 %2193) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %326, ptr addrspace(1) %2202, i32 %2194) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %327, ptr addrspace(1) %2203, i32 %2195) #2, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %328, ptr addrspace(1) %2204, i32 %2196) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + tail call void asm sideeffect "fence.proxy.async.shared::cta;", ""() #2, !dbg !131 + %2205 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 0, !dbg !122 + %2206 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 1, !dbg !122 + %2207 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 2, !dbg !122 + %2208 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 3, !dbg !122 + %2209 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 4, !dbg !122 + %2210 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 5, !dbg !122 + %2211 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 6, !dbg !122 + %2212 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 7, !dbg !122 + %2213 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 8, !dbg !122 + %2214 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 9, !dbg !122 + %2215 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 10, !dbg !122 + %2216 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 11, !dbg !122 + %2217 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 12, !dbg !122 + %2218 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 13, !dbg !122 + %2219 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 14, !dbg !122 + %2220 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 15, !dbg !122 + %2221 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 16, !dbg !122 + %2222 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 17, !dbg !122 + %2223 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 18, !dbg !122 + %2224 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 19, !dbg !122 + %2225 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 20, !dbg !122 + %2226 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 21, !dbg !122 + %2227 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 22, !dbg !122 + %2228 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 23, !dbg !122 + %2229 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 24, !dbg !122 + %2230 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 25, !dbg !122 + %2231 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 26, !dbg !122 + %2232 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 27, !dbg !122 + %2233 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 28, !dbg !122 + %2234 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 29, !dbg !122 + %2235 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 30, !dbg !122 + %2236 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 31, !dbg !122 + %2237 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 32, !dbg !122 + %2238 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 33, !dbg !122 + %2239 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 34, !dbg !122 + %2240 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 35, !dbg !122 + %2241 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 36, !dbg !122 + %2242 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 37, !dbg !122 + %2243 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 38, !dbg !122 + %2244 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 39, !dbg !122 + %2245 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 40, !dbg !122 + %2246 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 41, !dbg !122 + %2247 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 42, !dbg !122 + %2248 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 43, !dbg !122 + %2249 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 44, !dbg !122 + %2250 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 45, !dbg !122 + %2251 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 46, !dbg !122 + %2252 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 47, !dbg !122 + %2253 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 48, !dbg !122 + %2254 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 49, !dbg !122 + %2255 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 50, !dbg !122 + %2256 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 51, !dbg !122 + %2257 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 52, !dbg !122 + %2258 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 53, !dbg !122 + %2259 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 54, !dbg !122 + %2260 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 55, !dbg !122 + %2261 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 56, !dbg !122 + %2262 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 57, !dbg !122 + %2263 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 58, !dbg !122 + %2264 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 59, !dbg !122 + %2265 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 60, !dbg !122 + %2266 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 61, !dbg !122 + %2267 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 62, !dbg !122 + %2268 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 63, !dbg !122 + br i1 %2122, label %.lr.ph460, label %._crit_edge461, !dbg !122 + +.lr.ph460: ; preds = %._crit_edge + %2269 = insertelement <16 x i32> poison, i32 %2117, i64 0, !dbg !132 + %2270 = shufflevector <16 x i32> %2269, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !132 + %2271 = shufflevector <2 x i32> %203, <2 x i32> poison, <16 x i32> , !dbg !132 + %2272 = insertelement <16 x i32> %2271, i32 %200, i64 14, !dbg !132 + %2273 = insertelement <16 x i32> %2272, i32 %199, i64 15, !dbg !132 + %2274 = shufflevector <8 x i32> %209, <8 x i32> poison, <16 x i32> , !dbg !132 + %2275 = shufflevector <16 x i32> %2274, <16 x i32> %2273, <16 x i32> , !dbg !132 + %2276 = shufflevector <4 x i32> %206, <4 x i32> poison, <16 x i32> , !dbg !132 + %2277 = shufflevector <16 x i32> %2275, <16 x i32> %2276, <16 x i32> , !dbg !132 + %2278 = or disjoint <16 x i32> %2270, %2277, !dbg !132 + %2279 = add nsw i32 %2121, -2 + %2280 = add nsw i32 %2121, -1 + %smax471 = tail call i32 @llvm.smax.i32(i32 %2121, i32 1), !dbg !122 + %2281 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 0, !dbg !122 + %2282 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 1, !dbg !122 + %2283 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 2, !dbg !122 + %2284 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 3, !dbg !122 + %2285 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 4, !dbg !122 + %2286 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 5, !dbg !122 + %2287 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 6, !dbg !122 + %2288 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 7, !dbg !122 + %2289 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 8, !dbg !122 + %2290 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 9, !dbg !122 + %2291 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 10, !dbg !122 + %2292 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 11, !dbg !122 + %2293 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 12, !dbg !122 + %2294 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 13, !dbg !122 + %2295 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 14, !dbg !122 + %2296 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 15, !dbg !122 + %2297 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 16, !dbg !122 + %2298 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 17, !dbg !122 + %2299 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 18, !dbg !122 + %2300 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 19, !dbg !122 + %2301 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 20, !dbg !122 + %2302 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 21, !dbg !122 + %2303 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 22, !dbg !122 + %2304 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 23, !dbg !122 + %2305 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 24, !dbg !122 + %2306 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 25, !dbg !122 + %2307 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 26, !dbg !122 + %2308 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 27, !dbg !122 + %2309 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 28, !dbg !122 + %2310 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 29, !dbg !122 + %2311 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 30, !dbg !122 + %2312 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 31, !dbg !122 + %2313 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 32, !dbg !122 + %2314 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 33, !dbg !122 + %2315 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 34, !dbg !122 + %2316 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 35, !dbg !122 + %2317 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 36, !dbg !122 + %2318 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 37, !dbg !122 + %2319 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 38, !dbg !122 + %2320 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 39, !dbg !122 + %2321 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 40, !dbg !122 + %2322 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 41, !dbg !122 + %2323 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 42, !dbg !122 + %2324 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 43, !dbg !122 + %2325 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 44, !dbg !122 + %2326 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 45, !dbg !122 + %2327 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 46, !dbg !122 + %2328 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 47, !dbg !122 + %2329 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 48, !dbg !122 + %2330 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 49, !dbg !122 + %2331 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 50, !dbg !122 + %2332 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 51, !dbg !122 + %2333 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 52, !dbg !122 + %2334 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 53, !dbg !122 + %2335 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 54, !dbg !122 + %2336 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 55, !dbg !122 + %2337 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 56, !dbg !122 + %2338 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 57, !dbg !122 + %2339 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 58, !dbg !122 + %2340 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 59, !dbg !122 + %2341 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 60, !dbg !122 + %2342 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 61, !dbg !122 + %2343 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 62, !dbg !122 + %2344 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2114, 63, !dbg !122 + br label %2345, !dbg !122 + +2345: ; preds = %.lr.ph460, %__nv_exp2f.exit253 + %2346 = phi i32 [ 64, %.lr.ph460 ], [ %3674, %__nv_exp2f.exit253 ] + %2347 = phi i32 [ -1, %.lr.ph460 ], [ %2357, %__nv_exp2f.exit253 ] + %2348 = phi i32 [ 1, %.lr.ph460 ], [ %3678, %__nv_exp2f.exit253 ] + %2349 = phi i32 [ 64, %.lr.ph460 ], [ %3675, %__nv_exp2f.exit253 ] + %.pn = phi float [ %2281, %.lr.ph460 ], [ %3588, %__nv_exp2f.exit253 ] + %.pn479 = phi float [ %2282, %.lr.ph460 ], [ %3589, %__nv_exp2f.exit253 ] + %.pn480 = phi float [ %2283, %.lr.ph460 ], [ %3590, %__nv_exp2f.exit253 ] + %.pn481 = phi float [ %2284, %.lr.ph460 ], [ %3591, %__nv_exp2f.exit253 ] + %.pn482 = phi float [ %2285, %.lr.ph460 ], [ %3592, %__nv_exp2f.exit253 ] + %.pn483 = phi float [ %2286, %.lr.ph460 ], [ %3593, %__nv_exp2f.exit253 ] + %.pn484 = phi float [ %2287, %.lr.ph460 ], [ %3594, %__nv_exp2f.exit253 ] + %.pn485 = phi float [ %2288, %.lr.ph460 ], [ %3595, %__nv_exp2f.exit253 ] + %.pn486 = phi float [ %2289, %.lr.ph460 ], [ %3596, %__nv_exp2f.exit253 ] + %.pn487 = phi float [ %2290, %.lr.ph460 ], [ %3597, %__nv_exp2f.exit253 ] + %.pn488 = phi float [ %2291, %.lr.ph460 ], [ %3598, %__nv_exp2f.exit253 ] + %.pn489 = phi float [ %2292, %.lr.ph460 ], [ %3599, %__nv_exp2f.exit253 ] + %.pn490 = phi float [ %2293, %.lr.ph460 ], [ %3600, %__nv_exp2f.exit253 ] + %.pn491 = phi float [ %2294, %.lr.ph460 ], [ %3601, %__nv_exp2f.exit253 ] + %.pn492 = phi float [ %2295, %.lr.ph460 ], [ %3602, %__nv_exp2f.exit253 ] + %.pn493 = phi float [ %2296, %.lr.ph460 ], [ %3603, %__nv_exp2f.exit253 ] + %.pn494 = phi float [ %2297, %.lr.ph460 ], [ %3604, %__nv_exp2f.exit253 ] + %.pn495 = phi float [ %2298, %.lr.ph460 ], [ %3605, %__nv_exp2f.exit253 ] + %.pn496 = phi float [ %2299, %.lr.ph460 ], [ %3606, %__nv_exp2f.exit253 ] + %.pn497 = phi float [ %2300, %.lr.ph460 ], [ %3607, %__nv_exp2f.exit253 ] + %.pn498 = phi float [ %2301, %.lr.ph460 ], [ %3608, %__nv_exp2f.exit253 ] + %.pn499 = phi float [ %2302, %.lr.ph460 ], [ %3609, %__nv_exp2f.exit253 ] + %.pn500 = phi float [ %2303, %.lr.ph460 ], [ %3610, %__nv_exp2f.exit253 ] + %.pn501 = phi float [ %2304, %.lr.ph460 ], [ %3611, %__nv_exp2f.exit253 ] + %.pn502 = phi float [ %2305, %.lr.ph460 ], [ %3612, %__nv_exp2f.exit253 ] + %.pn503 = phi float [ %2306, %.lr.ph460 ], [ %3613, %__nv_exp2f.exit253 ] + %.pn504 = phi float [ %2307, %.lr.ph460 ], [ %3614, %__nv_exp2f.exit253 ] + %.pn505 = phi float [ %2308, %.lr.ph460 ], [ %3615, %__nv_exp2f.exit253 ] + %.pn506 = phi float [ %2309, %.lr.ph460 ], [ %3616, %__nv_exp2f.exit253 ] + %.pn507 = phi float [ %2310, %.lr.ph460 ], [ %3617, %__nv_exp2f.exit253 ] + %.pn508 = phi float [ %2311, %.lr.ph460 ], [ %3618, %__nv_exp2f.exit253 ] + %.pn509 = phi float [ %2312, %.lr.ph460 ], [ %3619, %__nv_exp2f.exit253 ] + %.pn510 = phi float [ %2313, %.lr.ph460 ], [ %3620, %__nv_exp2f.exit253 ] + %.pn511 = phi float [ %2314, %.lr.ph460 ], [ %3621, %__nv_exp2f.exit253 ] + %.pn512 = phi float [ %2315, %.lr.ph460 ], [ %3622, %__nv_exp2f.exit253 ] + %.pn513 = phi float [ %2316, %.lr.ph460 ], [ %3623, %__nv_exp2f.exit253 ] + %.pn514 = phi float [ %2317, %.lr.ph460 ], [ %3624, %__nv_exp2f.exit253 ] + %.pn515 = phi float [ %2318, %.lr.ph460 ], [ %3625, %__nv_exp2f.exit253 ] + %.pn516 = phi float [ %2319, %.lr.ph460 ], [ %3626, %__nv_exp2f.exit253 ] + %.pn517 = phi float [ %2320, %.lr.ph460 ], [ %3627, %__nv_exp2f.exit253 ] + %.pn518 = phi float [ %2321, %.lr.ph460 ], [ %3628, %__nv_exp2f.exit253 ] + %.pn519 = phi float [ %2322, %.lr.ph460 ], [ %3629, %__nv_exp2f.exit253 ] + %.pn520 = phi float [ %2323, %.lr.ph460 ], [ %3630, %__nv_exp2f.exit253 ] + %.pn521 = phi float [ %2324, %.lr.ph460 ], [ %3631, %__nv_exp2f.exit253 ] + %.pn522 = phi float [ %2325, %.lr.ph460 ], [ %3632, %__nv_exp2f.exit253 ] + %.pn523 = phi float [ %2326, %.lr.ph460 ], [ %3633, %__nv_exp2f.exit253 ] + %.pn524 = phi float [ %2327, %.lr.ph460 ], [ %3634, %__nv_exp2f.exit253 ] + %.pn525 = phi float [ %2328, %.lr.ph460 ], [ %3635, %__nv_exp2f.exit253 ] + %.pn526 = phi float [ %2329, %.lr.ph460 ], [ %3636, %__nv_exp2f.exit253 ] + %.pn527 = phi float [ %2330, %.lr.ph460 ], [ %3637, %__nv_exp2f.exit253 ] + %.pn528 = phi float [ %2331, %.lr.ph460 ], [ %3638, %__nv_exp2f.exit253 ] + %.pn529 = phi float [ %2332, %.lr.ph460 ], [ %3639, %__nv_exp2f.exit253 ] + %.pn530 = phi float [ %2333, %.lr.ph460 ], [ %3640, %__nv_exp2f.exit253 ] + %.pn531 = phi float [ %2334, %.lr.ph460 ], [ %3641, %__nv_exp2f.exit253 ] + %.pn532 = phi float [ %2335, %.lr.ph460 ], [ %3642, %__nv_exp2f.exit253 ] + %.pn533 = phi float [ %2336, %.lr.ph460 ], [ %3643, %__nv_exp2f.exit253 ] + %.pn534 = phi float [ %2337, %.lr.ph460 ], [ %3644, %__nv_exp2f.exit253 ] + %.pn535 = phi float [ %2338, %.lr.ph460 ], [ %3645, %__nv_exp2f.exit253 ] + %.pn536 = phi float [ %2339, %.lr.ph460 ], [ %3646, %__nv_exp2f.exit253 ] + %.pn537 = phi float [ %2340, %.lr.ph460 ], [ %3647, %__nv_exp2f.exit253 ] + %.pn538 = phi float [ %2341, %.lr.ph460 ], [ %3648, %__nv_exp2f.exit253 ] + %.pn539 = phi float [ %2342, %.lr.ph460 ], [ %3649, %__nv_exp2f.exit253 ] + %.pn540 = phi float [ %2343, %.lr.ph460 ], [ %3650, %__nv_exp2f.exit253 ] + %.pn541 = phi float [ %2344, %.lr.ph460 ], [ %3651, %__nv_exp2f.exit253 ] + %2350 = phi i32 [ 0, %.lr.ph460 ], [ %3655, %__nv_exp2f.exit253 ] + %.pn546 = phi float [ %2111, %.lr.ph460 ], [ %3241, %__nv_exp2f.exit253 ] + %.pn544 = phi float [ %2112, %.lr.ph460 ], [ %3242, %__nv_exp2f.exit253 ] + %2351 = phi <2 x float> [ %2113, %.lr.ph460 ], [ %2980, %__nv_exp2f.exit253 ] + %2352 = phi <16 x i32> [ %2278, %.lr.ph460 ], [ %3654, %__nv_exp2f.exit253 ] + %2353 = icmp slt i32 %2350, %2279, !dbg !122 + %2354 = icmp slt i32 %2350, %2280, !dbg !122 + %2355 = add i32 %2347, 1, !dbg !122 + %2356 = icmp sgt i32 %2355, 2, !dbg !122 + %2357 = select i1 %2356, i32 0, i32 %2355, !dbg !122 + tail call void @llvm.nvvm.cp.async.wait.group(i32 2), !dbg !129 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %2358 = shl i32 %2357, 13, !dbg !129 + %2359 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %2358, !dbg !129 + %2360 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 -1, i32 %35, i32 0, i32 31), !dbg !131 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !131 + %2361 = shl i32 %2360, 11, !dbg !131 + %2362 = and i32 %2361, 8192, !dbg !131 + %2363 = add i32 %2362, ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), !dbg !131 + %2364 = lshr exact i32 %2363, 4, !dbg !131 + %2365 = and i32 %2364, 16383, !dbg !131 + %2366 = zext nneg i32 %2365 to i64, !dbg !131 + %2367 = or disjoint i64 %2366, 4611686293372403712, !dbg !131 + %2368 = ptrtoint ptr addrspace(3) %2359 to i32, !dbg !131 + %2369 = lshr exact i32 %2368, 4, !dbg !131 + %2370 = and i32 %2369, 16383, !dbg !131 + %2371 = zext nneg i32 %2370 to i64, !dbg !131 + %2372 = or disjoint i64 %2371, 4611686293338849280, !dbg !131 + %2373 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $32, $33, 0, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,l,l"(i64 %2367, i64 %2372) #2, !dbg !131 + %2374 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 32), !dbg !131 + %2375 = lshr exact i32 %2374, 4, !dbg !131 + %2376 = and i32 %2375, 16383, !dbg !131 + %2377 = zext nneg i32 %2376 to i64, !dbg !131 + %2378 = or disjoint i64 %2377, 4611686293372403712, !dbg !131 + %2379 = add i32 %2368, 32, !dbg !131 + %2380 = lshr exact i32 %2379, 4, !dbg !131 + %2381 = and i32 %2380, 16383, !dbg !131 + %2382 = zext nneg i32 %2381 to i64, !dbg !131 + %2383 = or disjoint i64 %2382, 4611686293338849280, !dbg !131 + %2384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 0, !dbg !131 + %2385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 1, !dbg !131 + %2386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 2, !dbg !131 + %2387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 3, !dbg !131 + %2388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 4, !dbg !131 + %2389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 5, !dbg !131 + %2390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 6, !dbg !131 + %2391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 7, !dbg !131 + %2392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 8, !dbg !131 + %2393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 9, !dbg !131 + %2394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 10, !dbg !131 + %2395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 11, !dbg !131 + %2396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 12, !dbg !131 + %2397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 13, !dbg !131 + %2398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 14, !dbg !131 + %2399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 15, !dbg !131 + %2400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 16, !dbg !131 + %2401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 17, !dbg !131 + %2402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 18, !dbg !131 + %2403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 19, !dbg !131 + %2404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 20, !dbg !131 + %2405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 21, !dbg !131 + %2406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 22, !dbg !131 + %2407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 23, !dbg !131 + %2408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 24, !dbg !131 + %2409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 25, !dbg !131 + %2410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 26, !dbg !131 + %2411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 27, !dbg !131 + %2412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 28, !dbg !131 + %2413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 29, !dbg !131 + %2414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 30, !dbg !131 + %2415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2373, 31, !dbg !131 + %2416 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2384, float %2385, float %2386, float %2387, float %2388, float %2389, float %2390, float %2391, float %2392, float %2393, float %2394, float %2395, float %2396, float %2397, float %2398, float %2399, float %2400, float %2401, float %2402, float %2403, float %2404, float %2405, float %2406, float %2407, float %2408, float %2409, float %2410, float %2411, float %2412, float %2413, float %2414, float %2415, i64 %2378, i64 %2383, i1 true) #2, !dbg !131 + %2417 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 64), !dbg !131 + %2418 = lshr exact i32 %2417, 4, !dbg !131 + %2419 = and i32 %2418, 16383, !dbg !131 + %2420 = zext nneg i32 %2419 to i64, !dbg !131 + %2421 = or disjoint i64 %2420, 4611686293372403712, !dbg !131 + %2422 = add i32 %2368, 64, !dbg !131 + %2423 = lshr exact i32 %2422, 4, !dbg !131 + %2424 = and i32 %2423, 16383, !dbg !131 + %2425 = zext nneg i32 %2424 to i64, !dbg !131 + %2426 = or disjoint i64 %2425, 4611686293338849280, !dbg !131 + %2427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 0, !dbg !131 + %2428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 1, !dbg !131 + %2429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 2, !dbg !131 + %2430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 3, !dbg !131 + %2431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 4, !dbg !131 + %2432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 5, !dbg !131 + %2433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 6, !dbg !131 + %2434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 7, !dbg !131 + %2435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 8, !dbg !131 + %2436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 9, !dbg !131 + %2437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 10, !dbg !131 + %2438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 11, !dbg !131 + %2439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 12, !dbg !131 + %2440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 13, !dbg !131 + %2441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 14, !dbg !131 + %2442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 15, !dbg !131 + %2443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 16, !dbg !131 + %2444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 17, !dbg !131 + %2445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 18, !dbg !131 + %2446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 19, !dbg !131 + %2447 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 20, !dbg !131 + %2448 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 21, !dbg !131 + %2449 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 22, !dbg !131 + %2450 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 23, !dbg !131 + %2451 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 24, !dbg !131 + %2452 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 25, !dbg !131 + %2453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 26, !dbg !131 + %2454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 27, !dbg !131 + %2455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 28, !dbg !131 + %2456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 29, !dbg !131 + %2457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 30, !dbg !131 + %2458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2416, 31, !dbg !131 + %2459 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2427, float %2428, float %2429, float %2430, float %2431, float %2432, float %2433, float %2434, float %2435, float %2436, float %2437, float %2438, float %2439, float %2440, float %2441, float %2442, float %2443, float %2444, float %2445, float %2446, float %2447, float %2448, float %2449, float %2450, float %2451, float %2452, float %2453, float %2454, float %2455, float %2456, float %2457, float %2458, i64 %2421, i64 %2426, i1 true) #2, !dbg !131 + %2460 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 96), !dbg !131 + %2461 = lshr exact i32 %2460, 4, !dbg !131 + %2462 = and i32 %2461, 16383, !dbg !131 + %2463 = zext nneg i32 %2462 to i64, !dbg !131 + %2464 = or disjoint i64 %2463, 4611686293372403712, !dbg !131 + %2465 = add i32 %2368, 96, !dbg !131 + %2466 = lshr exact i32 %2465, 4, !dbg !131 + %2467 = and i32 %2466, 16383, !dbg !131 + %2468 = zext nneg i32 %2467 to i64, !dbg !131 + %2469 = or disjoint i64 %2468, 4611686293338849280, !dbg !131 + %2470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 0, !dbg !131 + %2471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 1, !dbg !131 + %2472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 2, !dbg !131 + %2473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 3, !dbg !131 + %2474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 4, !dbg !131 + %2475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 5, !dbg !131 + %2476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 6, !dbg !131 + %2477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 7, !dbg !131 + %2478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 8, !dbg !131 + %2479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 9, !dbg !131 + %2480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 10, !dbg !131 + %2481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 11, !dbg !131 + %2482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 12, !dbg !131 + %2483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 13, !dbg !131 + %2484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 14, !dbg !131 + %2485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 15, !dbg !131 + %2486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 16, !dbg !131 + %2487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 17, !dbg !131 + %2488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 18, !dbg !131 + %2489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 19, !dbg !131 + %2490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 20, !dbg !131 + %2491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 21, !dbg !131 + %2492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 22, !dbg !131 + %2493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 23, !dbg !131 + %2494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 24, !dbg !131 + %2495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 25, !dbg !131 + %2496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 26, !dbg !131 + %2497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 27, !dbg !131 + %2498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 28, !dbg !131 + %2499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 29, !dbg !131 + %2500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 30, !dbg !131 + %2501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2459, 31, !dbg !131 + %2502 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2470, float %2471, float %2472, float %2473, float %2474, float %2475, float %2476, float %2477, float %2478, float %2479, float %2480, float %2481, float %2482, float %2483, float %2484, float %2485, float %2486, float %2487, float %2488, float %2489, float %2490, float %2491, float %2492, float %2493, float %2494, float %2495, float %2496, float %2497, float %2498, float %2499, float %2500, float %2501, i64 %2464, i64 %2469, i1 true) #2, !dbg !131 + %2503 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16384), !dbg !131 + %2504 = lshr exact i32 %2503, 4, !dbg !131 + %2505 = and i32 %2504, 16383, !dbg !131 + %2506 = zext nneg i32 %2505 to i64, !dbg !131 + %2507 = or disjoint i64 %2506, 4611686293372403712, !dbg !131 + %2508 = add i32 %2368, 8192, !dbg !131 + %2509 = lshr exact i32 %2508, 4, !dbg !131 + %2510 = and i32 %2509, 16383, !dbg !131 + %2511 = zext nneg i32 %2510 to i64, !dbg !131 + %2512 = or disjoint i64 %2511, 4611686293338849280, !dbg !131 + %2513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 0, !dbg !131 + %2514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 1, !dbg !131 + %2515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 2, !dbg !131 + %2516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 3, !dbg !131 + %2517 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 4, !dbg !131 + %2518 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 5, !dbg !131 + %2519 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 6, !dbg !131 + %2520 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 7, !dbg !131 + %2521 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 8, !dbg !131 + %2522 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 9, !dbg !131 + %2523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 10, !dbg !131 + %2524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 11, !dbg !131 + %2525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 12, !dbg !131 + %2526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 13, !dbg !131 + %2527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 14, !dbg !131 + %2528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 15, !dbg !131 + %2529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 16, !dbg !131 + %2530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 17, !dbg !131 + %2531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 18, !dbg !131 + %2532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 19, !dbg !131 + %2533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 20, !dbg !131 + %2534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 21, !dbg !131 + %2535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 22, !dbg !131 + %2536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 23, !dbg !131 + %2537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 24, !dbg !131 + %2538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 25, !dbg !131 + %2539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 26, !dbg !131 + %2540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 27, !dbg !131 + %2541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 28, !dbg !131 + %2542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 29, !dbg !131 + %2543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 30, !dbg !131 + %2544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2502, 31, !dbg !131 + %2545 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2513, float %2514, float %2515, float %2516, float %2517, float %2518, float %2519, float %2520, float %2521, float %2522, float %2523, float %2524, float %2525, float %2526, float %2527, float %2528, float %2529, float %2530, float %2531, float %2532, float %2533, float %2534, float %2535, float %2536, float %2537, float %2538, float %2539, float %2540, float %2541, float %2542, float %2543, float %2544, i64 %2507, i64 %2512, i1 true) #2, !dbg !131 + %2546 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16416), !dbg !131 + %2547 = lshr exact i32 %2546, 4, !dbg !131 + %2548 = and i32 %2547, 16383, !dbg !131 + %2549 = zext nneg i32 %2548 to i64, !dbg !131 + %2550 = or disjoint i64 %2549, 4611686293372403712, !dbg !131 + %2551 = add i32 %2368, 8224, !dbg !131 + %2552 = lshr exact i32 %2551, 4, !dbg !131 + %2553 = and i32 %2552, 16383, !dbg !131 + %2554 = zext nneg i32 %2553 to i64, !dbg !131 + %2555 = or disjoint i64 %2554, 4611686293338849280, !dbg !131 + %2556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 0, !dbg !131 + %2557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 1, !dbg !131 + %2558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 2, !dbg !131 + %2559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 3, !dbg !131 + %2560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 4, !dbg !131 + %2561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 5, !dbg !131 + %2562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 6, !dbg !131 + %2563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 7, !dbg !131 + %2564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 8, !dbg !131 + %2565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 9, !dbg !131 + %2566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 10, !dbg !131 + %2567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 11, !dbg !131 + %2568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 12, !dbg !131 + %2569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 13, !dbg !131 + %2570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 14, !dbg !131 + %2571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 15, !dbg !131 + %2572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 16, !dbg !131 + %2573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 17, !dbg !131 + %2574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 18, !dbg !131 + %2575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 19, !dbg !131 + %2576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 20, !dbg !131 + %2577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 21, !dbg !131 + %2578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 22, !dbg !131 + %2579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 23, !dbg !131 + %2580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 24, !dbg !131 + %2581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 25, !dbg !131 + %2582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 26, !dbg !131 + %2583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 27, !dbg !131 + %2584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 28, !dbg !131 + %2585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 29, !dbg !131 + %2586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 30, !dbg !131 + %2587 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2545, 31, !dbg !131 + %2588 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2556, float %2557, float %2558, float %2559, float %2560, float %2561, float %2562, float %2563, float %2564, float %2565, float %2566, float %2567, float %2568, float %2569, float %2570, float %2571, float %2572, float %2573, float %2574, float %2575, float %2576, float %2577, float %2578, float %2579, float %2580, float %2581, float %2582, float %2583, float %2584, float %2585, float %2586, float %2587, i64 %2550, i64 %2555, i1 true) #2, !dbg !131 + %2589 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16448), !dbg !131 + %2590 = lshr exact i32 %2589, 4, !dbg !131 + %2591 = and i32 %2590, 16383, !dbg !131 + %2592 = zext nneg i32 %2591 to i64, !dbg !131 + %2593 = or disjoint i64 %2592, 4611686293372403712, !dbg !131 + %2594 = add i32 %2368, 8256, !dbg !131 + %2595 = lshr exact i32 %2594, 4, !dbg !131 + %2596 = and i32 %2595, 16383, !dbg !131 + %2597 = zext nneg i32 %2596 to i64, !dbg !131 + %2598 = or disjoint i64 %2597, 4611686293338849280, !dbg !131 + %2599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 0, !dbg !131 + %2600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 1, !dbg !131 + %2601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 2, !dbg !131 + %2602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 3, !dbg !131 + %2603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 4, !dbg !131 + %2604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 5, !dbg !131 + %2605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 6, !dbg !131 + %2606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 7, !dbg !131 + %2607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 8, !dbg !131 + %2608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 9, !dbg !131 + %2609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 10, !dbg !131 + %2610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 11, !dbg !131 + %2611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 12, !dbg !131 + %2612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 13, !dbg !131 + %2613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 14, !dbg !131 + %2614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 15, !dbg !131 + %2615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 16, !dbg !131 + %2616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 17, !dbg !131 + %2617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 18, !dbg !131 + %2618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 19, !dbg !131 + %2619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 20, !dbg !131 + %2620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 21, !dbg !131 + %2621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 22, !dbg !131 + %2622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 23, !dbg !131 + %2623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 24, !dbg !131 + %2624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 25, !dbg !131 + %2625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 26, !dbg !131 + %2626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 27, !dbg !131 + %2627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 28, !dbg !131 + %2628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 29, !dbg !131 + %2629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 30, !dbg !131 + %2630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2588, 31, !dbg !131 + %2631 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2599, float %2600, float %2601, float %2602, float %2603, float %2604, float %2605, float %2606, float %2607, float %2608, float %2609, float %2610, float %2611, float %2612, float %2613, float %2614, float %2615, float %2616, float %2617, float %2618, float %2619, float %2620, float %2621, float %2622, float %2623, float %2624, float %2625, float %2626, float %2627, float %2628, float %2629, float %2630, i64 %2593, i64 %2598, i1 true) #2, !dbg !131 + %2632 = add i32 %2362, add (i32 ptrtoint (ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304) to i32), i32 16480), !dbg !131 + %2633 = lshr exact i32 %2632, 4, !dbg !131 + %2634 = and i32 %2633, 16383, !dbg !131 + %2635 = zext nneg i32 %2634 to i64, !dbg !131 + %2636 = or disjoint i64 %2635, 4611686293372403712, !dbg !131 + %2637 = add i32 %2368, 8288, !dbg !131 + %2638 = lshr exact i32 %2637, 4, !dbg !131 + %2639 = and i32 %2638, 16383, !dbg !131 + %2640 = zext nneg i32 %2639 to i64, !dbg !131 + %2641 = or disjoint i64 %2640, 4611686293338849280, !dbg !131 + %2642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 0, !dbg !131 + %2643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 1, !dbg !131 + %2644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 2, !dbg !131 + %2645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 3, !dbg !131 + %2646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 4, !dbg !131 + %2647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 5, !dbg !131 + %2648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 6, !dbg !131 + %2649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 7, !dbg !131 + %2650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 8, !dbg !131 + %2651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 9, !dbg !131 + %2652 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 10, !dbg !131 + %2653 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 11, !dbg !131 + %2654 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 12, !dbg !131 + %2655 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 13, !dbg !131 + %2656 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 14, !dbg !131 + %2657 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 15, !dbg !131 + %2658 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 16, !dbg !131 + %2659 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 17, !dbg !131 + %2660 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 18, !dbg !131 + %2661 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 19, !dbg !131 + %2662 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 20, !dbg !131 + %2663 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 21, !dbg !131 + %2664 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 22, !dbg !131 + %2665 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 23, !dbg !131 + %2666 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 24, !dbg !131 + %2667 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 25, !dbg !131 + %2668 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 26, !dbg !131 + %2669 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 27, !dbg !131 + %2670 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 28, !dbg !131 + %2671 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 29, !dbg !131 + %2672 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 30, !dbg !131 + %2673 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2631, 31, !dbg !131 + %2674 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31}, $64, $65, $66, 1, 1, 0, 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,b"(float %2642, float %2643, float %2644, float %2645, float %2646, float %2647, float %2648, float %2649, float %2650, float %2651, float %2652, float %2653, float %2654, float %2655, float %2656, float %2657, float %2658, float %2659, float %2660, float %2661, float %2662, float %2663, float %2664, float %2665, float %2666, float %2667, float %2668, float %2669, float %2670, float %2671, float %2672, float %2673, i64 %2636, i64 %2641, i1 true) #2, !dbg !131 + %2675 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 0, !dbg !131 + %2676 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 1, !dbg !131 + %2677 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 2, !dbg !131 + %2678 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 3, !dbg !131 + %2679 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 4, !dbg !131 + %2680 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 5, !dbg !131 + %2681 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 6, !dbg !131 + %2682 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 7, !dbg !131 + %2683 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 8, !dbg !131 + %2684 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 9, !dbg !131 + %2685 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 10, !dbg !131 + %2686 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 11, !dbg !131 + %2687 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 12, !dbg !131 + %2688 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 13, !dbg !131 + %2689 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 14, !dbg !131 + %2690 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 15, !dbg !131 + %2691 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 16, !dbg !131 + %2692 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 17, !dbg !131 + %2693 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 18, !dbg !131 + %2694 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 19, !dbg !131 + %2695 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 20, !dbg !131 + %2696 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 21, !dbg !131 + %2697 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 22, !dbg !131 + %2698 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 23, !dbg !131 + %2699 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 24, !dbg !131 + %2700 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 25, !dbg !131 + %2701 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 26, !dbg !131 + %2702 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 27, !dbg !131 + %2703 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 28, !dbg !131 + %2704 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 29, !dbg !131 + %2705 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 30, !dbg !131 + %2706 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2674, 31, !dbg !131 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !131 + %2707 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63,$64,$65,$66,$67,$68,$69,$70,$71,$72,$73,$74,$75,$76,$77,$78,$79,$80,$81,$82,$83,$84,$85,$86,$87,$88,$89,$90,$91,$92,$93,$94,$95,$96,$97,$98,$99,$100,$101\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101"(float %2675, float %2676, float %2677, float %2678, float %2679, float %2680, float %2681, float %2682, float %2683, float %2684, float %2685, float %2686, float %2687, float %2688, float %2689, float %2690, float %2691, float %2692, float %2693, float %2694, float %2695, float %2696, float %2697, float %2698, float %2699, float %2700, float %2701, float %2702, float %2703, float %2704, float %2705, float %2706, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 98304), i32 0, i32 0, ptr addrspace(3) %2359, i32 0, i32 0, float %.pn, float %.pn479, float %.pn480, float %.pn481, float %.pn482, float %.pn483, float %.pn484, float %.pn485, float %.pn486, float %.pn487, float %.pn488, float %.pn489, float %.pn490, float %.pn491, float %.pn492, float %.pn493, float %.pn494, float %.pn495, float %.pn496, float %.pn497, float %.pn498, float %.pn499, float %.pn500, float %.pn501, float %.pn502, float %.pn503, float %.pn504, float %.pn505, float %.pn506, float %.pn507, float %.pn508, float %.pn509, float %.pn510, float %.pn511, float %.pn512, float %.pn513, float %.pn514, float %.pn515, float %.pn516, float %.pn517, float %.pn518, float %.pn519, float %.pn520, float %.pn521, float %.pn522, float %.pn523, float %.pn524, float %.pn525, float %.pn526, float %.pn527, float %.pn528, float %.pn529, float %.pn530, float %.pn531, float %.pn532, float %.pn533, float %.pn534, float %.pn535, float %.pn536, float %.pn537, float %.pn538, float %.pn539, float %.pn540, float %.pn541) #2, !dbg !131 + %2708 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 0, !dbg !131 + %2709 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 1, !dbg !131 + %2710 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 2, !dbg !131 + %2711 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 3, !dbg !131 + %2712 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 4, !dbg !131 + %2713 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 5, !dbg !131 + %2714 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 6, !dbg !131 + %2715 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 7, !dbg !131 + %2716 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 8, !dbg !131 + %2717 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 9, !dbg !131 + %2718 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 10, !dbg !131 + %2719 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 11, !dbg !131 + %2720 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 12, !dbg !131 + %2721 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 13, !dbg !131 + %2722 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 14, !dbg !131 + %2723 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 15, !dbg !131 + %2724 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 16, !dbg !131 + %2725 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 17, !dbg !131 + %2726 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 18, !dbg !131 + %2727 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 19, !dbg !131 + %2728 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 20, !dbg !131 + %2729 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 21, !dbg !131 + %2730 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 22, !dbg !131 + %2731 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 23, !dbg !131 + %2732 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 24, !dbg !131 + %2733 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 25, !dbg !131 + %2734 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 26, !dbg !131 + %2735 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 27, !dbg !131 + %2736 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 28, !dbg !131 + %2737 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 29, !dbg !131 + %2738 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 30, !dbg !131 + %2739 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 31, !dbg !131 + %2740 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 38, !dbg !131 + %2741 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 39, !dbg !131 + %2742 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 40, !dbg !131 + %2743 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 41, !dbg !131 + %2744 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 42, !dbg !131 + %2745 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 43, !dbg !131 + %2746 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 44, !dbg !131 + %2747 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 45, !dbg !131 + %2748 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 46, !dbg !131 + %2749 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 47, !dbg !131 + %2750 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 48, !dbg !131 + %2751 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 49, !dbg !131 + %2752 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 50, !dbg !131 + %2753 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 51, !dbg !131 + %2754 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 52, !dbg !131 + %2755 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 53, !dbg !131 + %2756 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 54, !dbg !131 + %2757 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 55, !dbg !131 + %2758 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 56, !dbg !131 + %2759 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 57, !dbg !131 + %2760 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 58, !dbg !131 + %2761 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 59, !dbg !131 + %2762 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 60, !dbg !131 + %2763 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 61, !dbg !131 + %2764 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 62, !dbg !131 + %2765 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 63, !dbg !131 + %2766 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 64, !dbg !131 + %2767 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 65, !dbg !131 + %2768 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 66, !dbg !131 + %2769 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 67, !dbg !131 + %2770 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 68, !dbg !131 + %2771 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 69, !dbg !131 + %2772 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 70, !dbg !131 + %2773 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 71, !dbg !131 + %2774 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 72, !dbg !131 + %2775 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 73, !dbg !131 + %2776 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 74, !dbg !131 + %2777 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 75, !dbg !131 + %2778 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 76, !dbg !131 + %2779 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 77, !dbg !131 + %2780 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 78, !dbg !131 + %2781 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 79, !dbg !131 + %2782 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 80, !dbg !131 + %2783 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 81, !dbg !131 + %2784 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 82, !dbg !131 + %2785 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 83, !dbg !131 + %2786 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 84, !dbg !131 + %2787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 85, !dbg !131 + %2788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 86, !dbg !131 + %2789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 87, !dbg !131 + %2790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 88, !dbg !131 + %2791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 89, !dbg !131 + %2792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 90, !dbg !131 + %2793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 91, !dbg !131 + %2794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 92, !dbg !131 + %2795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 93, !dbg !131 + %2796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 94, !dbg !131 + %2797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 95, !dbg !131 + %2798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 96, !dbg !131 + %2799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 97, !dbg !131 + %2800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 98, !dbg !131 + %2801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 99, !dbg !131 + %2802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 100, !dbg !131 + %2803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, ptr addrspace(3), i32, i32, ptr addrspace(3), i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %2707, 101, !dbg !131 + %2804 = fmul float %2708, 0x3FB6A09E60000000, !dbg !133 + %2805 = fmul float %2709, 0x3FB6A09E60000000, !dbg !133 + %2806 = fmul float %2710, 0x3FB6A09E60000000, !dbg !133 + %2807 = fmul float %2711, 0x3FB6A09E60000000, !dbg !133 + %2808 = fmul float %2712, 0x3FB6A09E60000000, !dbg !133 + %2809 = fmul float %2713, 0x3FB6A09E60000000, !dbg !133 + %2810 = fmul float %2714, 0x3FB6A09E60000000, !dbg !133 + %2811 = fmul float %2715, 0x3FB6A09E60000000, !dbg !133 + %2812 = fmul float %2716, 0x3FB6A09E60000000, !dbg !133 + %2813 = fmul float %2717, 0x3FB6A09E60000000, !dbg !133 + %2814 = fmul float %2718, 0x3FB6A09E60000000, !dbg !133 + %2815 = fmul float %2719, 0x3FB6A09E60000000, !dbg !133 + %2816 = fmul float %2720, 0x3FB6A09E60000000, !dbg !133 + %2817 = fmul float %2721, 0x3FB6A09E60000000, !dbg !133 + %2818 = fmul float %2722, 0x3FB6A09E60000000, !dbg !133 + %2819 = fmul float %2723, 0x3FB6A09E60000000, !dbg !133 + %2820 = fmul float %2724, 0x3FB6A09E60000000, !dbg !133 + %2821 = fmul float %2725, 0x3FB6A09E60000000, !dbg !133 + %2822 = fmul float %2726, 0x3FB6A09E60000000, !dbg !133 + %2823 = fmul float %2727, 0x3FB6A09E60000000, !dbg !133 + %2824 = fmul float %2728, 0x3FB6A09E60000000, !dbg !133 + %2825 = fmul float %2729, 0x3FB6A09E60000000, !dbg !133 + %2826 = fmul float %2730, 0x3FB6A09E60000000, !dbg !133 + %2827 = fmul float %2731, 0x3FB6A09E60000000, !dbg !133 + %2828 = fmul float %2732, 0x3FB6A09E60000000, !dbg !133 + %2829 = fmul float %2733, 0x3FB6A09E60000000, !dbg !133 + %2830 = fmul float %2734, 0x3FB6A09E60000000, !dbg !133 + %2831 = fmul float %2735, 0x3FB6A09E60000000, !dbg !133 + %2832 = fmul float %2736, 0x3FB6A09E60000000, !dbg !133 + %2833 = fmul float %2737, 0x3FB6A09E60000000, !dbg !133 + %2834 = fmul float %2738, 0x3FB6A09E60000000, !dbg !133 + %2835 = fmul float %2739, 0x3FB6A09E60000000, !dbg !133 + %2836 = extractelement <16 x i32> %2352, i64 15, !dbg !134 + %2837 = icmp slt i32 %2836, %11, !dbg !134 + %2838 = extractelement <16 x i32> %2352, i64 14, !dbg !134 + %2839 = icmp slt i32 %2838, %11, !dbg !134 + %2840 = extractelement <16 x i32> %2352, i64 13, !dbg !134 + %2841 = icmp slt i32 %2840, %11, !dbg !134 + %2842 = extractelement <16 x i32> %2352, i64 12, !dbg !134 + %2843 = icmp slt i32 %2842, %11, !dbg !134 + %2844 = extractelement <16 x i32> %2352, i64 11, !dbg !134 + %2845 = icmp slt i32 %2844, %11, !dbg !134 + %2846 = extractelement <16 x i32> %2352, i64 10, !dbg !134 + %2847 = icmp slt i32 %2846, %11, !dbg !134 + %2848 = extractelement <16 x i32> %2352, i64 9, !dbg !134 + %2849 = icmp slt i32 %2848, %11, !dbg !134 + %2850 = extractelement <16 x i32> %2352, i64 8, !dbg !134 + %2851 = icmp slt i32 %2850, %11, !dbg !134 + %2852 = extractelement <16 x i32> %2352, i64 7, !dbg !134 + %2853 = icmp slt i32 %2852, %11, !dbg !134 + %2854 = extractelement <16 x i32> %2352, i64 6, !dbg !134 + %2855 = icmp slt i32 %2854, %11, !dbg !134 + %2856 = extractelement <16 x i32> %2352, i64 5, !dbg !134 + %2857 = icmp slt i32 %2856, %11, !dbg !134 + %2858 = extractelement <16 x i32> %2352, i64 4, !dbg !134 + %2859 = icmp slt i32 %2858, %11, !dbg !134 + %2860 = extractelement <16 x i32> %2352, i64 3, !dbg !134 + %2861 = icmp slt i32 %2860, %11, !dbg !134 + %2862 = extractelement <16 x i32> %2352, i64 2, !dbg !134 + %2863 = icmp slt i32 %2862, %11, !dbg !134 + %2864 = extractelement <16 x i32> %2352, i64 1, !dbg !134 + %2865 = icmp slt i32 %2864, %11, !dbg !134 + %2866 = extractelement <16 x i32> %2352, i64 0, !dbg !134 + %2867 = icmp slt i32 %2866, %11, !dbg !134 + %2868 = fmul float %2804, 0x3FF7154760000000, !dbg !135 + %2869 = select i1 %2837, float %2868, float 0xFFF0000000000000, !dbg !136 + %2870 = fmul float %2805, 0x3FF7154760000000, !dbg !135 + %2871 = select i1 %2839, float %2870, float 0xFFF0000000000000, !dbg !136 + %2872 = fmul float %2806, 0x3FF7154760000000, !dbg !135 + %2873 = select i1 %2837, float %2872, float 0xFFF0000000000000, !dbg !136 + %2874 = fmul float %2807, 0x3FF7154760000000, !dbg !135 + %2875 = select i1 %2839, float %2874, float 0xFFF0000000000000, !dbg !136 + %2876 = fmul float %2808, 0x3FF7154760000000, !dbg !135 + %2877 = select i1 %2841, float %2876, float 0xFFF0000000000000, !dbg !136 + %2878 = fmul float %2809, 0x3FF7154760000000, !dbg !135 + %2879 = select i1 %2843, float %2878, float 0xFFF0000000000000, !dbg !136 + %2880 = fmul float %2810, 0x3FF7154760000000, !dbg !135 + %2881 = select i1 %2841, float %2880, float 0xFFF0000000000000, !dbg !136 + %2882 = fmul float %2811, 0x3FF7154760000000, !dbg !135 + %2883 = select i1 %2843, float %2882, float 0xFFF0000000000000, !dbg !136 + %2884 = fmul float %2812, 0x3FF7154760000000, !dbg !135 + %2885 = select i1 %2845, float %2884, float 0xFFF0000000000000, !dbg !136 + %2886 = fmul float %2813, 0x3FF7154760000000, !dbg !135 + %2887 = select i1 %2847, float %2886, float 0xFFF0000000000000, !dbg !136 + %2888 = fmul float %2814, 0x3FF7154760000000, !dbg !135 + %2889 = select i1 %2845, float %2888, float 0xFFF0000000000000, !dbg !136 + %2890 = fmul float %2815, 0x3FF7154760000000, !dbg !135 + %2891 = select i1 %2847, float %2890, float 0xFFF0000000000000, !dbg !136 + %2892 = fmul float %2816, 0x3FF7154760000000, !dbg !135 + %2893 = select i1 %2849, float %2892, float 0xFFF0000000000000, !dbg !136 + %2894 = fmul float %2817, 0x3FF7154760000000, !dbg !135 + %2895 = select i1 %2851, float %2894, float 0xFFF0000000000000, !dbg !136 + %2896 = fmul float %2818, 0x3FF7154760000000, !dbg !135 + %2897 = select i1 %2849, float %2896, float 0xFFF0000000000000, !dbg !136 + %2898 = fmul float %2819, 0x3FF7154760000000, !dbg !135 + %2899 = select i1 %2851, float %2898, float 0xFFF0000000000000, !dbg !136 + %2900 = fmul float %2820, 0x3FF7154760000000, !dbg !135 + %2901 = select i1 %2853, float %2900, float 0xFFF0000000000000, !dbg !136 + %2902 = fmul float %2821, 0x3FF7154760000000, !dbg !135 + %2903 = select i1 %2855, float %2902, float 0xFFF0000000000000, !dbg !136 + %2904 = fmul float %2822, 0x3FF7154760000000, !dbg !135 + %2905 = select i1 %2853, float %2904, float 0xFFF0000000000000, !dbg !136 + %2906 = fmul float %2823, 0x3FF7154760000000, !dbg !135 + %2907 = select i1 %2855, float %2906, float 0xFFF0000000000000, !dbg !136 + %2908 = fmul float %2824, 0x3FF7154760000000, !dbg !135 + %2909 = select i1 %2857, float %2908, float 0xFFF0000000000000, !dbg !136 + %2910 = fmul float %2825, 0x3FF7154760000000, !dbg !135 + %2911 = select i1 %2859, float %2910, float 0xFFF0000000000000, !dbg !136 + %2912 = fmul float %2826, 0x3FF7154760000000, !dbg !135 + %2913 = select i1 %2857, float %2912, float 0xFFF0000000000000, !dbg !136 + %2914 = fmul float %2827, 0x3FF7154760000000, !dbg !135 + %2915 = select i1 %2859, float %2914, float 0xFFF0000000000000, !dbg !136 + %2916 = fmul float %2828, 0x3FF7154760000000, !dbg !135 + %2917 = select i1 %2861, float %2916, float 0xFFF0000000000000, !dbg !136 + %2918 = fmul float %2829, 0x3FF7154760000000, !dbg !135 + %2919 = select i1 %2863, float %2918, float 0xFFF0000000000000, !dbg !136 + %2920 = fmul float %2830, 0x3FF7154760000000, !dbg !135 + %2921 = select i1 %2861, float %2920, float 0xFFF0000000000000, !dbg !136 + %2922 = fmul float %2831, 0x3FF7154760000000, !dbg !135 + %2923 = select i1 %2863, float %2922, float 0xFFF0000000000000, !dbg !136 + %2924 = fmul float %2832, 0x3FF7154760000000, !dbg !135 + %2925 = select i1 %2865, float %2924, float 0xFFF0000000000000, !dbg !136 + %2926 = fmul float %2833, 0x3FF7154760000000, !dbg !135 + %2927 = select i1 %2867, float %2926, float 0xFFF0000000000000, !dbg !136 + %2928 = fmul float %2834, 0x3FF7154760000000, !dbg !135 + %2929 = select i1 %2865, float %2928, float 0xFFF0000000000000, !dbg !136 + %2930 = fmul float %2835, 0x3FF7154760000000, !dbg !135 + %2931 = select i1 %2867, float %2930, float 0xFFF0000000000000, !dbg !136 + %2932 = tail call float @llvm.maxnum.f32(float %2869, float %2871), !dbg !137 + %2933 = tail call float @llvm.maxnum.f32(float %2873, float %2875), !dbg !137 + %2934 = tail call float @llvm.maxnum.f32(float %2932, float %2877), !dbg !137 + %2935 = tail call float @llvm.maxnum.f32(float %2934, float %2879), !dbg !137 + %2936 = tail call float @llvm.maxnum.f32(float %2933, float %2881), !dbg !137 + %2937 = tail call float @llvm.maxnum.f32(float %2936, float %2883), !dbg !137 + %2938 = tail call float @llvm.maxnum.f32(float %2935, float %2885), !dbg !137 + %2939 = tail call float @llvm.maxnum.f32(float %2938, float %2887), !dbg !137 + %2940 = tail call float @llvm.maxnum.f32(float %2937, float %2889), !dbg !137 + %2941 = tail call float @llvm.maxnum.f32(float %2940, float %2891), !dbg !137 + %2942 = tail call float @llvm.maxnum.f32(float %2939, float %2893), !dbg !137 + %2943 = tail call float @llvm.maxnum.f32(float %2942, float %2895), !dbg !137 + %2944 = tail call float @llvm.maxnum.f32(float %2941, float %2897), !dbg !137 + %2945 = tail call float @llvm.maxnum.f32(float %2944, float %2899), !dbg !137 + %2946 = tail call float @llvm.maxnum.f32(float %2943, float %2901), !dbg !137 + %2947 = tail call float @llvm.maxnum.f32(float %2946, float %2903), !dbg !137 + %2948 = tail call float @llvm.maxnum.f32(float %2945, float %2905), !dbg !137 + %2949 = tail call float @llvm.maxnum.f32(float %2948, float %2907), !dbg !137 + %2950 = tail call float @llvm.maxnum.f32(float %2947, float %2909), !dbg !137 + %2951 = tail call float @llvm.maxnum.f32(float %2950, float %2911), !dbg !137 + %2952 = tail call float @llvm.maxnum.f32(float %2949, float %2913), !dbg !137 + %2953 = tail call float @llvm.maxnum.f32(float %2952, float %2915), !dbg !137 + %2954 = tail call float @llvm.maxnum.f32(float %2951, float %2917), !dbg !137 + %2955 = tail call float @llvm.maxnum.f32(float %2954, float %2919), !dbg !137 + %2956 = tail call float @llvm.maxnum.f32(float %2953, float %2921), !dbg !137 + %2957 = tail call float @llvm.maxnum.f32(float %2956, float %2923), !dbg !137 + %2958 = tail call float @llvm.maxnum.f32(float %2955, float %2925), !dbg !137 + %2959 = tail call float @llvm.maxnum.f32(float %2958, float %2927), !dbg !137 + %2960 = tail call float @llvm.maxnum.f32(float %2957, float %2929), !dbg !137 + %2961 = tail call float @llvm.maxnum.f32(float %2960, float %2931), !dbg !137 + %2962 = bitcast float %2959 to i32, !dbg !138 + %2963 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2962, i32 2, i32 31), !dbg !138 + %2964 = bitcast i32 %2963 to float, !dbg !138 + %2965 = bitcast float %2961 to i32, !dbg !138 + %2966 = tail call float @llvm.maxnum.f32(float %2959, float %2964), !dbg !137 + %2967 = bitcast float %2966 to i32, !dbg !138 + %2968 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2967, i32 1, i32 31), !dbg !138 + %2969 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2965, i32 2, i32 31), !dbg !138 + %2970 = bitcast i32 %2969 to float, !dbg !138 + %2971 = tail call float @llvm.maxnum.f32(float %2961, float %2970), !dbg !137 + %2972 = bitcast float %2971 to i32, !dbg !138 + %2973 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %2972, i32 1, i32 31), !dbg !138 + %2974 = insertelement <2 x i32> poison, i32 %2968, i64 0, !dbg !138 + %2975 = insertelement <2 x i32> %2974, i32 %2973, i64 1, !dbg !138 + %2976 = bitcast <2 x i32> %2975 to <2 x float>, !dbg !138 + %2977 = insertelement <2 x float> poison, float %2966, i64 0, !dbg !137 + %2978 = insertelement <2 x float> %2977, float %2971, i64 1, !dbg !137 + %2979 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2978, <2 x float> %2976), !dbg !137 + %2980 = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %2351, <2 x float> %2979), !dbg !139 + %2981 = extractelement <2 x float> %2980, i64 0, !dbg !140 + %2982 = fcmp oeq float %2981, 0xFFF0000000000000, !dbg !141 + %2983 = extractelement <2 x float> %2980, i64 1, !dbg !140 + %2984 = fcmp oeq float %2983, 0xFFF0000000000000, !dbg !141 + %2985 = select i1 %2982, float 0.000000e+00, float %2981, !dbg !140 + %2986 = select i1 %2984, float 0.000000e+00, float %2983, !dbg !140 + %2987 = extractelement <2 x float> %2351, i64 0, !dbg !142 + %2988 = fsub float %2987, %2985, !dbg !142 + %2989 = extractelement <2 x float> %2351, i64 1, !dbg !142 + %2990 = fsub float %2989, %2986, !dbg !142 + %2991 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !143 + %.not.i = icmp eq i32 %2991, 0, !dbg !143 + br i1 %.not.i, label %2994, label %2992, !dbg !143 + +2992: ; preds = %2345 + %2993 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2988) #2, !dbg !143 + br label %__nv_exp2f.exit, !dbg !143 + +2994: ; preds = %2345 + %2995 = tail call float @llvm.nvvm.ex2.approx.f(float %2988) #2, !dbg !143 + br label %__nv_exp2f.exit, !dbg !143 + +__nv_exp2f.exit: ; preds = %2992, %2994 + %.0.i = phi float [ %2993, %2992 ], [ %2995, %2994 ], !dbg !143 + %2996 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !143 + %.not.i155 = icmp eq i32 %2996, 0, !dbg !143 + br i1 %.not.i155, label %2999, label %2997, !dbg !143 + +2997: ; preds = %__nv_exp2f.exit + %2998 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %2990) #2, !dbg !143 + br label %__nv_exp2f.exit157, !dbg !143 + +2999: ; preds = %__nv_exp2f.exit + %3000 = tail call float @llvm.nvvm.ex2.approx.f(float %2990) #2, !dbg !143 + br label %__nv_exp2f.exit157, !dbg !143 + +__nv_exp2f.exit157: ; preds = %2997, %2999 + %.0.i156 = phi float [ %2998, %2997 ], [ %3000, %2999 ], !dbg !143 + %3001 = fsub float %2869, %2985, !dbg !144 + %3002 = fsub float %2871, %2985, !dbg !144 + %3003 = fsub float %2873, %2986, !dbg !144 + %3004 = fsub float %2875, %2986, !dbg !144 + %3005 = fsub float %2877, %2985, !dbg !144 + %3006 = fsub float %2879, %2985, !dbg !144 + %3007 = fsub float %2881, %2986, !dbg !144 + %3008 = fsub float %2883, %2986, !dbg !144 + %3009 = fsub float %2885, %2985, !dbg !144 + %3010 = fsub float %2887, %2985, !dbg !144 + %3011 = fsub float %2889, %2986, !dbg !144 + %3012 = fsub float %2891, %2986, !dbg !144 + %3013 = fsub float %2893, %2985, !dbg !144 + %3014 = fsub float %2895, %2985, !dbg !144 + %3015 = fsub float %2897, %2986, !dbg !144 + %3016 = fsub float %2899, %2986, !dbg !144 + %3017 = fsub float %2901, %2985, !dbg !144 + %3018 = fsub float %2903, %2985, !dbg !144 + %3019 = fsub float %2905, %2986, !dbg !144 + %3020 = fsub float %2907, %2986, !dbg !144 + %3021 = fsub float %2909, %2985, !dbg !144 + %3022 = fsub float %2911, %2985, !dbg !144 + %3023 = fsub float %2913, %2986, !dbg !144 + %3024 = fsub float %2915, %2986, !dbg !144 + %3025 = fsub float %2917, %2985, !dbg !144 + %3026 = fsub float %2919, %2985, !dbg !144 + %3027 = fsub float %2921, %2986, !dbg !144 + %3028 = fsub float %2923, %2986, !dbg !144 + %3029 = fsub float %2925, %2985, !dbg !144 + %3030 = fsub float %2927, %2985, !dbg !144 + %3031 = fsub float %2929, %2986, !dbg !144 + %3032 = fsub float %2931, %2986, !dbg !144 + %3033 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i158 = icmp eq i32 %3033, 0, !dbg !145 + br i1 %.not.i158, label %3036, label %3034, !dbg !145 + +3034: ; preds = %__nv_exp2f.exit157 + %3035 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3001) #2, !dbg !145 + br label %__nv_exp2f.exit160, !dbg !145 + +3036: ; preds = %__nv_exp2f.exit157 + %3037 = tail call float @llvm.nvvm.ex2.approx.f(float %3001) #2, !dbg !145 + br label %__nv_exp2f.exit160, !dbg !145 + +__nv_exp2f.exit160: ; preds = %3034, %3036 + %.0.i159 = phi float [ %3035, %3034 ], [ %3037, %3036 ], !dbg !145 + %3038 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i161 = icmp eq i32 %3038, 0, !dbg !145 + br i1 %.not.i161, label %3041, label %3039, !dbg !145 + +3039: ; preds = %__nv_exp2f.exit160 + %3040 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3002) #2, !dbg !145 + br label %__nv_exp2f.exit163, !dbg !145 + +3041: ; preds = %__nv_exp2f.exit160 + %3042 = tail call float @llvm.nvvm.ex2.approx.f(float %3002) #2, !dbg !145 + br label %__nv_exp2f.exit163, !dbg !145 + +__nv_exp2f.exit163: ; preds = %3039, %3041 + %.0.i162 = phi float [ %3040, %3039 ], [ %3042, %3041 ], !dbg !145 + %3043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i164 = icmp eq i32 %3043, 0, !dbg !145 + br i1 %.not.i164, label %3046, label %3044, !dbg !145 + +3044: ; preds = %__nv_exp2f.exit163 + %3045 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3003) #2, !dbg !145 + br label %__nv_exp2f.exit166, !dbg !145 + +3046: ; preds = %__nv_exp2f.exit163 + %3047 = tail call float @llvm.nvvm.ex2.approx.f(float %3003) #2, !dbg !145 + br label %__nv_exp2f.exit166, !dbg !145 + +__nv_exp2f.exit166: ; preds = %3044, %3046 + %.0.i165 = phi float [ %3045, %3044 ], [ %3047, %3046 ], !dbg !145 + %3048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i167 = icmp eq i32 %3048, 0, !dbg !145 + br i1 %.not.i167, label %3051, label %3049, !dbg !145 + +3049: ; preds = %__nv_exp2f.exit166 + %3050 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3004) #2, !dbg !145 + br label %__nv_exp2f.exit169, !dbg !145 + +3051: ; preds = %__nv_exp2f.exit166 + %3052 = tail call float @llvm.nvvm.ex2.approx.f(float %3004) #2, !dbg !145 + br label %__nv_exp2f.exit169, !dbg !145 + +__nv_exp2f.exit169: ; preds = %3049, %3051 + %.0.i168 = phi float [ %3050, %3049 ], [ %3052, %3051 ], !dbg !145 + %3053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i170 = icmp eq i32 %3053, 0, !dbg !145 + br i1 %.not.i170, label %3056, label %3054, !dbg !145 + +3054: ; preds = %__nv_exp2f.exit169 + %3055 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3005) #2, !dbg !145 + br label %__nv_exp2f.exit172, !dbg !145 + +3056: ; preds = %__nv_exp2f.exit169 + %3057 = tail call float @llvm.nvvm.ex2.approx.f(float %3005) #2, !dbg !145 + br label %__nv_exp2f.exit172, !dbg !145 + +__nv_exp2f.exit172: ; preds = %3054, %3056 + %.0.i171 = phi float [ %3055, %3054 ], [ %3057, %3056 ], !dbg !145 + %3058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i173 = icmp eq i32 %3058, 0, !dbg !145 + br i1 %.not.i173, label %3061, label %3059, !dbg !145 + +3059: ; preds = %__nv_exp2f.exit172 + %3060 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3006) #2, !dbg !145 + br label %__nv_exp2f.exit175, !dbg !145 + +3061: ; preds = %__nv_exp2f.exit172 + %3062 = tail call float @llvm.nvvm.ex2.approx.f(float %3006) #2, !dbg !145 + br label %__nv_exp2f.exit175, !dbg !145 + +__nv_exp2f.exit175: ; preds = %3059, %3061 + %.0.i174 = phi float [ %3060, %3059 ], [ %3062, %3061 ], !dbg !145 + %3063 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i176 = icmp eq i32 %3063, 0, !dbg !145 + br i1 %.not.i176, label %3066, label %3064, !dbg !145 + +3064: ; preds = %__nv_exp2f.exit175 + %3065 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3007) #2, !dbg !145 + br label %__nv_exp2f.exit178, !dbg !145 + +3066: ; preds = %__nv_exp2f.exit175 + %3067 = tail call float @llvm.nvvm.ex2.approx.f(float %3007) #2, !dbg !145 + br label %__nv_exp2f.exit178, !dbg !145 + +__nv_exp2f.exit178: ; preds = %3064, %3066 + %.0.i177 = phi float [ %3065, %3064 ], [ %3067, %3066 ], !dbg !145 + %3068 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i179 = icmp eq i32 %3068, 0, !dbg !145 + br i1 %.not.i179, label %3071, label %3069, !dbg !145 + +3069: ; preds = %__nv_exp2f.exit178 + %3070 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3008) #2, !dbg !145 + br label %__nv_exp2f.exit181, !dbg !145 + +3071: ; preds = %__nv_exp2f.exit178 + %3072 = tail call float @llvm.nvvm.ex2.approx.f(float %3008) #2, !dbg !145 + br label %__nv_exp2f.exit181, !dbg !145 + +__nv_exp2f.exit181: ; preds = %3069, %3071 + %.0.i180 = phi float [ %3070, %3069 ], [ %3072, %3071 ], !dbg !145 + %3073 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i182 = icmp eq i32 %3073, 0, !dbg !145 + br i1 %.not.i182, label %3076, label %3074, !dbg !145 + +3074: ; preds = %__nv_exp2f.exit181 + %3075 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3009) #2, !dbg !145 + br label %__nv_exp2f.exit184, !dbg !145 + +3076: ; preds = %__nv_exp2f.exit181 + %3077 = tail call float @llvm.nvvm.ex2.approx.f(float %3009) #2, !dbg !145 + br label %__nv_exp2f.exit184, !dbg !145 + +__nv_exp2f.exit184: ; preds = %3074, %3076 + %.0.i183 = phi float [ %3075, %3074 ], [ %3077, %3076 ], !dbg !145 + %3078 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i185 = icmp eq i32 %3078, 0, !dbg !145 + br i1 %.not.i185, label %3081, label %3079, !dbg !145 + +3079: ; preds = %__nv_exp2f.exit184 + %3080 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3010) #2, !dbg !145 + br label %__nv_exp2f.exit187, !dbg !145 + +3081: ; preds = %__nv_exp2f.exit184 + %3082 = tail call float @llvm.nvvm.ex2.approx.f(float %3010) #2, !dbg !145 + br label %__nv_exp2f.exit187, !dbg !145 + +__nv_exp2f.exit187: ; preds = %3079, %3081 + %.0.i186 = phi float [ %3080, %3079 ], [ %3082, %3081 ], !dbg !145 + %3083 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i188 = icmp eq i32 %3083, 0, !dbg !145 + br i1 %.not.i188, label %3086, label %3084, !dbg !145 + +3084: ; preds = %__nv_exp2f.exit187 + %3085 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3011) #2, !dbg !145 + br label %__nv_exp2f.exit190, !dbg !145 + +3086: ; preds = %__nv_exp2f.exit187 + %3087 = tail call float @llvm.nvvm.ex2.approx.f(float %3011) #2, !dbg !145 + br label %__nv_exp2f.exit190, !dbg !145 + +__nv_exp2f.exit190: ; preds = %3084, %3086 + %.0.i189 = phi float [ %3085, %3084 ], [ %3087, %3086 ], !dbg !145 + %3088 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i191 = icmp eq i32 %3088, 0, !dbg !145 + br i1 %.not.i191, label %3091, label %3089, !dbg !145 + +3089: ; preds = %__nv_exp2f.exit190 + %3090 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3012) #2, !dbg !145 + br label %__nv_exp2f.exit193, !dbg !145 + +3091: ; preds = %__nv_exp2f.exit190 + %3092 = tail call float @llvm.nvvm.ex2.approx.f(float %3012) #2, !dbg !145 + br label %__nv_exp2f.exit193, !dbg !145 + +__nv_exp2f.exit193: ; preds = %3089, %3091 + %.0.i192 = phi float [ %3090, %3089 ], [ %3092, %3091 ], !dbg !145 + %3093 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i194 = icmp eq i32 %3093, 0, !dbg !145 + br i1 %.not.i194, label %3096, label %3094, !dbg !145 + +3094: ; preds = %__nv_exp2f.exit193 + %3095 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3013) #2, !dbg !145 + br label %__nv_exp2f.exit196, !dbg !145 + +3096: ; preds = %__nv_exp2f.exit193 + %3097 = tail call float @llvm.nvvm.ex2.approx.f(float %3013) #2, !dbg !145 + br label %__nv_exp2f.exit196, !dbg !145 + +__nv_exp2f.exit196: ; preds = %3094, %3096 + %.0.i195 = phi float [ %3095, %3094 ], [ %3097, %3096 ], !dbg !145 + %3098 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i197 = icmp eq i32 %3098, 0, !dbg !145 + br i1 %.not.i197, label %3101, label %3099, !dbg !145 + +3099: ; preds = %__nv_exp2f.exit196 + %3100 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3014) #2, !dbg !145 + br label %__nv_exp2f.exit199, !dbg !145 + +3101: ; preds = %__nv_exp2f.exit196 + %3102 = tail call float @llvm.nvvm.ex2.approx.f(float %3014) #2, !dbg !145 + br label %__nv_exp2f.exit199, !dbg !145 + +__nv_exp2f.exit199: ; preds = %3099, %3101 + %.0.i198 = phi float [ %3100, %3099 ], [ %3102, %3101 ], !dbg !145 + %3103 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i200 = icmp eq i32 %3103, 0, !dbg !145 + br i1 %.not.i200, label %3106, label %3104, !dbg !145 + +3104: ; preds = %__nv_exp2f.exit199 + %3105 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3015) #2, !dbg !145 + br label %__nv_exp2f.exit202, !dbg !145 + +3106: ; preds = %__nv_exp2f.exit199 + %3107 = tail call float @llvm.nvvm.ex2.approx.f(float %3015) #2, !dbg !145 + br label %__nv_exp2f.exit202, !dbg !145 + +__nv_exp2f.exit202: ; preds = %3104, %3106 + %.0.i201 = phi float [ %3105, %3104 ], [ %3107, %3106 ], !dbg !145 + %3108 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i203 = icmp eq i32 %3108, 0, !dbg !145 + br i1 %.not.i203, label %3111, label %3109, !dbg !145 + +3109: ; preds = %__nv_exp2f.exit202 + %3110 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3016) #2, !dbg !145 + br label %__nv_exp2f.exit205, !dbg !145 + +3111: ; preds = %__nv_exp2f.exit202 + %3112 = tail call float @llvm.nvvm.ex2.approx.f(float %3016) #2, !dbg !145 + br label %__nv_exp2f.exit205, !dbg !145 + +__nv_exp2f.exit205: ; preds = %3109, %3111 + %.0.i204 = phi float [ %3110, %3109 ], [ %3112, %3111 ], !dbg !145 + %3113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i206 = icmp eq i32 %3113, 0, !dbg !145 + br i1 %.not.i206, label %3116, label %3114, !dbg !145 + +3114: ; preds = %__nv_exp2f.exit205 + %3115 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3017) #2, !dbg !145 + br label %__nv_exp2f.exit208, !dbg !145 + +3116: ; preds = %__nv_exp2f.exit205 + %3117 = tail call float @llvm.nvvm.ex2.approx.f(float %3017) #2, !dbg !145 + br label %__nv_exp2f.exit208, !dbg !145 + +__nv_exp2f.exit208: ; preds = %3114, %3116 + %.0.i207 = phi float [ %3115, %3114 ], [ %3117, %3116 ], !dbg !145 + %3118 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i209 = icmp eq i32 %3118, 0, !dbg !145 + br i1 %.not.i209, label %3121, label %3119, !dbg !145 + +3119: ; preds = %__nv_exp2f.exit208 + %3120 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3018) #2, !dbg !145 + br label %__nv_exp2f.exit211, !dbg !145 + +3121: ; preds = %__nv_exp2f.exit208 + %3122 = tail call float @llvm.nvvm.ex2.approx.f(float %3018) #2, !dbg !145 + br label %__nv_exp2f.exit211, !dbg !145 + +__nv_exp2f.exit211: ; preds = %3119, %3121 + %.0.i210 = phi float [ %3120, %3119 ], [ %3122, %3121 ], !dbg !145 + %3123 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i212 = icmp eq i32 %3123, 0, !dbg !145 + br i1 %.not.i212, label %3126, label %3124, !dbg !145 + +3124: ; preds = %__nv_exp2f.exit211 + %3125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3019) #2, !dbg !145 + br label %__nv_exp2f.exit214, !dbg !145 + +3126: ; preds = %__nv_exp2f.exit211 + %3127 = tail call float @llvm.nvvm.ex2.approx.f(float %3019) #2, !dbg !145 + br label %__nv_exp2f.exit214, !dbg !145 + +__nv_exp2f.exit214: ; preds = %3124, %3126 + %.0.i213 = phi float [ %3125, %3124 ], [ %3127, %3126 ], !dbg !145 + %3128 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i215 = icmp eq i32 %3128, 0, !dbg !145 + br i1 %.not.i215, label %3131, label %3129, !dbg !145 + +3129: ; preds = %__nv_exp2f.exit214 + %3130 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3020) #2, !dbg !145 + br label %__nv_exp2f.exit217, !dbg !145 + +3131: ; preds = %__nv_exp2f.exit214 + %3132 = tail call float @llvm.nvvm.ex2.approx.f(float %3020) #2, !dbg !145 + br label %__nv_exp2f.exit217, !dbg !145 + +__nv_exp2f.exit217: ; preds = %3129, %3131 + %.0.i216 = phi float [ %3130, %3129 ], [ %3132, %3131 ], !dbg !145 + %3133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i218 = icmp eq i32 %3133, 0, !dbg !145 + br i1 %.not.i218, label %3136, label %3134, !dbg !145 + +3134: ; preds = %__nv_exp2f.exit217 + %3135 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3021) #2, !dbg !145 + br label %__nv_exp2f.exit220, !dbg !145 + +3136: ; preds = %__nv_exp2f.exit217 + %3137 = tail call float @llvm.nvvm.ex2.approx.f(float %3021) #2, !dbg !145 + br label %__nv_exp2f.exit220, !dbg !145 + +__nv_exp2f.exit220: ; preds = %3134, %3136 + %.0.i219 = phi float [ %3135, %3134 ], [ %3137, %3136 ], !dbg !145 + %3138 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i221 = icmp eq i32 %3138, 0, !dbg !145 + br i1 %.not.i221, label %3141, label %3139, !dbg !145 + +3139: ; preds = %__nv_exp2f.exit220 + %3140 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3022) #2, !dbg !145 + br label %__nv_exp2f.exit223, !dbg !145 + +3141: ; preds = %__nv_exp2f.exit220 + %3142 = tail call float @llvm.nvvm.ex2.approx.f(float %3022) #2, !dbg !145 + br label %__nv_exp2f.exit223, !dbg !145 + +__nv_exp2f.exit223: ; preds = %3139, %3141 + %.0.i222 = phi float [ %3140, %3139 ], [ %3142, %3141 ], !dbg !145 + %3143 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i224 = icmp eq i32 %3143, 0, !dbg !145 + br i1 %.not.i224, label %3146, label %3144, !dbg !145 + +3144: ; preds = %__nv_exp2f.exit223 + %3145 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3023) #2, !dbg !145 + br label %__nv_exp2f.exit226, !dbg !145 + +3146: ; preds = %__nv_exp2f.exit223 + %3147 = tail call float @llvm.nvvm.ex2.approx.f(float %3023) #2, !dbg !145 + br label %__nv_exp2f.exit226, !dbg !145 + +__nv_exp2f.exit226: ; preds = %3144, %3146 + %.0.i225 = phi float [ %3145, %3144 ], [ %3147, %3146 ], !dbg !145 + %3148 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i227 = icmp eq i32 %3148, 0, !dbg !145 + br i1 %.not.i227, label %3151, label %3149, !dbg !145 + +3149: ; preds = %__nv_exp2f.exit226 + %3150 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3024) #2, !dbg !145 + br label %__nv_exp2f.exit229, !dbg !145 + +3151: ; preds = %__nv_exp2f.exit226 + %3152 = tail call float @llvm.nvvm.ex2.approx.f(float %3024) #2, !dbg !145 + br label %__nv_exp2f.exit229, !dbg !145 + +__nv_exp2f.exit229: ; preds = %3149, %3151 + %.0.i228 = phi float [ %3150, %3149 ], [ %3152, %3151 ], !dbg !145 + %3153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i230 = icmp eq i32 %3153, 0, !dbg !145 + br i1 %.not.i230, label %3156, label %3154, !dbg !145 + +3154: ; preds = %__nv_exp2f.exit229 + %3155 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3025) #2, !dbg !145 + br label %__nv_exp2f.exit232, !dbg !145 + +3156: ; preds = %__nv_exp2f.exit229 + %3157 = tail call float @llvm.nvvm.ex2.approx.f(float %3025) #2, !dbg !145 + br label %__nv_exp2f.exit232, !dbg !145 + +__nv_exp2f.exit232: ; preds = %3154, %3156 + %.0.i231 = phi float [ %3155, %3154 ], [ %3157, %3156 ], !dbg !145 + %3158 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i233 = icmp eq i32 %3158, 0, !dbg !145 + br i1 %.not.i233, label %3161, label %3159, !dbg !145 + +3159: ; preds = %__nv_exp2f.exit232 + %3160 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3026) #2, !dbg !145 + br label %__nv_exp2f.exit235, !dbg !145 + +3161: ; preds = %__nv_exp2f.exit232 + %3162 = tail call float @llvm.nvvm.ex2.approx.f(float %3026) #2, !dbg !145 + br label %__nv_exp2f.exit235, !dbg !145 + +__nv_exp2f.exit235: ; preds = %3159, %3161 + %.0.i234 = phi float [ %3160, %3159 ], [ %3162, %3161 ], !dbg !145 + %3163 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i236 = icmp eq i32 %3163, 0, !dbg !145 + br i1 %.not.i236, label %3166, label %3164, !dbg !145 + +3164: ; preds = %__nv_exp2f.exit235 + %3165 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3027) #2, !dbg !145 + br label %__nv_exp2f.exit238, !dbg !145 + +3166: ; preds = %__nv_exp2f.exit235 + %3167 = tail call float @llvm.nvvm.ex2.approx.f(float %3027) #2, !dbg !145 + br label %__nv_exp2f.exit238, !dbg !145 + +__nv_exp2f.exit238: ; preds = %3164, %3166 + %.0.i237 = phi float [ %3165, %3164 ], [ %3167, %3166 ], !dbg !145 + %3168 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i239 = icmp eq i32 %3168, 0, !dbg !145 + br i1 %.not.i239, label %3171, label %3169, !dbg !145 + +3169: ; preds = %__nv_exp2f.exit238 + %3170 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3028) #2, !dbg !145 + br label %__nv_exp2f.exit241, !dbg !145 + +3171: ; preds = %__nv_exp2f.exit238 + %3172 = tail call float @llvm.nvvm.ex2.approx.f(float %3028) #2, !dbg !145 + br label %__nv_exp2f.exit241, !dbg !145 + +__nv_exp2f.exit241: ; preds = %3169, %3171 + %.0.i240 = phi float [ %3170, %3169 ], [ %3172, %3171 ], !dbg !145 + %3173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i242 = icmp eq i32 %3173, 0, !dbg !145 + br i1 %.not.i242, label %3176, label %3174, !dbg !145 + +3174: ; preds = %__nv_exp2f.exit241 + %3175 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3029) #2, !dbg !145 + br label %__nv_exp2f.exit244, !dbg !145 + +3176: ; preds = %__nv_exp2f.exit241 + %3177 = tail call float @llvm.nvvm.ex2.approx.f(float %3029) #2, !dbg !145 + br label %__nv_exp2f.exit244, !dbg !145 + +__nv_exp2f.exit244: ; preds = %3174, %3176 + %.0.i243 = phi float [ %3175, %3174 ], [ %3177, %3176 ], !dbg !145 + %3178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i245 = icmp eq i32 %3178, 0, !dbg !145 + br i1 %.not.i245, label %3181, label %3179, !dbg !145 + +3179: ; preds = %__nv_exp2f.exit244 + %3180 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3030) #2, !dbg !145 + br label %__nv_exp2f.exit247, !dbg !145 + +3181: ; preds = %__nv_exp2f.exit244 + %3182 = tail call float @llvm.nvvm.ex2.approx.f(float %3030) #2, !dbg !145 + br label %__nv_exp2f.exit247, !dbg !145 + +__nv_exp2f.exit247: ; preds = %3179, %3181 + %.0.i246 = phi float [ %3180, %3179 ], [ %3182, %3181 ], !dbg !145 + %3183 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i248 = icmp eq i32 %3183, 0, !dbg !145 + br i1 %.not.i248, label %3186, label %3184, !dbg !145 + +3184: ; preds = %__nv_exp2f.exit247 + %3185 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3031) #2, !dbg !145 + br label %__nv_exp2f.exit250, !dbg !145 + +3186: ; preds = %__nv_exp2f.exit247 + %3187 = tail call float @llvm.nvvm.ex2.approx.f(float %3031) #2, !dbg !145 + br label %__nv_exp2f.exit250, !dbg !145 + +__nv_exp2f.exit250: ; preds = %3184, %3186 + %.0.i249 = phi float [ %3185, %3184 ], [ %3187, %3186 ], !dbg !145 + %3188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !145 + %.not.i251 = icmp eq i32 %3188, 0, !dbg !145 + br i1 %.not.i251, label %3191, label %3189, !dbg !145 + +3189: ; preds = %__nv_exp2f.exit250 + %3190 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %3032) #2, !dbg !145 + br label %__nv_exp2f.exit253, !dbg !145 + +3191: ; preds = %__nv_exp2f.exit250 + %3192 = tail call float @llvm.nvvm.ex2.approx.f(float %3032) #2, !dbg !145 + br label %__nv_exp2f.exit253, !dbg !145 + +__nv_exp2f.exit253: ; preds = %3189, %3191 + %.0.i252 = phi float [ %3190, %3189 ], [ %3192, %3191 ], !dbg !145 + %3193 = fmul float %.pn546, %.0.i, !dbg !146 + %3194 = fmul float %.pn544, %.0.i156, !dbg !146 + %3195 = fadd float %.0.i159, %.0.i162, !dbg !147 + %3196 = fadd float %.0.i165, %.0.i168, !dbg !147 + %3197 = fadd float %3195, %.0.i171, !dbg !147 + %3198 = fadd float %3197, %.0.i174, !dbg !147 + %3199 = fadd float %3196, %.0.i177, !dbg !147 + %3200 = fadd float %3199, %.0.i180, !dbg !147 + %3201 = fadd float %3198, %.0.i183, !dbg !147 + %3202 = fadd float %3201, %.0.i186, !dbg !147 + %3203 = fadd float %3200, %.0.i189, !dbg !147 + %3204 = fadd float %3203, %.0.i192, !dbg !147 + %3205 = fadd float %3202, %.0.i195, !dbg !147 + %3206 = fadd float %3205, %.0.i198, !dbg !147 + %3207 = fadd float %3204, %.0.i201, !dbg !147 + %3208 = fadd float %3207, %.0.i204, !dbg !147 + %3209 = fadd float %3206, %.0.i207, !dbg !147 + %3210 = fadd float %3209, %.0.i210, !dbg !147 + %3211 = fadd float %3208, %.0.i213, !dbg !147 + %3212 = fadd float %3211, %.0.i216, !dbg !147 + %3213 = fadd float %3210, %.0.i219, !dbg !147 + %3214 = fadd float %3213, %.0.i222, !dbg !147 + %3215 = fadd float %3212, %.0.i225, !dbg !147 + %3216 = fadd float %3215, %.0.i228, !dbg !147 + %3217 = fadd float %3214, %.0.i231, !dbg !147 + %3218 = fadd float %3217, %.0.i234, !dbg !147 + %3219 = fadd float %3216, %.0.i237, !dbg !147 + %3220 = fadd float %3219, %.0.i240, !dbg !147 + %3221 = fadd float %3218, %.0.i243, !dbg !147 + %3222 = fadd float %3221, %.0.i246, !dbg !147 + %3223 = fadd float %3220, %.0.i249, !dbg !147 + %3224 = fadd float %3223, %.0.i252, !dbg !147 + %3225 = bitcast float %3222 to i32, !dbg !148 + %3226 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3225, i32 2, i32 31), !dbg !148 + %3227 = bitcast i32 %3226 to float, !dbg !148 + %3228 = fadd float %3222, %3227, !dbg !147 + %3229 = bitcast float %3228 to i32, !dbg !148 + %3230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3229, i32 1, i32 31), !dbg !148 + %3231 = bitcast i32 %3230 to float, !dbg !148 + %3232 = fadd float %3228, %3231, !dbg !147 + %3233 = bitcast float %3224 to i32, !dbg !148 + %3234 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3233, i32 2, i32 31), !dbg !148 + %3235 = bitcast i32 %3234 to float, !dbg !148 + %3236 = fadd float %3224, %3235, !dbg !147 + %3237 = bitcast float %3236 to i32, !dbg !148 + %3238 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %3237, i32 1, i32 31), !dbg !148 + %3239 = bitcast i32 %3238 to float, !dbg !148 + %3240 = fadd float %3236, %3239, !dbg !147 + %3241 = fadd float %3193, %3232, !dbg !149 + %3242 = fadd float %3194, %3240, !dbg !149 + %3243 = fmul float %2740, %.0.i, !dbg !150 + %3244 = fmul float %2741, %.0.i, !dbg !150 + %3245 = fmul float %2742, %.0.i156, !dbg !150 + %3246 = fmul float %2743, %.0.i156, !dbg !150 + %3247 = fmul float %2744, %.0.i, !dbg !150 + %3248 = fmul float %2745, %.0.i, !dbg !150 + %3249 = fmul float %2746, %.0.i156, !dbg !150 + %3250 = fmul float %2747, %.0.i156, !dbg !150 + %3251 = fmul float %2748, %.0.i, !dbg !150 + %3252 = fmul float %2749, %.0.i, !dbg !150 + %3253 = fmul float %2750, %.0.i156, !dbg !150 + %3254 = fmul float %2751, %.0.i156, !dbg !150 + %3255 = fmul float %2752, %.0.i, !dbg !150 + %3256 = fmul float %2753, %.0.i, !dbg !150 + %3257 = fmul float %2754, %.0.i156, !dbg !150 + %3258 = fmul float %2755, %.0.i156, !dbg !150 + %3259 = fmul float %2756, %.0.i, !dbg !150 + %3260 = fmul float %2757, %.0.i, !dbg !150 + %3261 = fmul float %2758, %.0.i156, !dbg !150 + %3262 = fmul float %2759, %.0.i156, !dbg !150 + %3263 = fmul float %2760, %.0.i, !dbg !150 + %3264 = fmul float %2761, %.0.i, !dbg !150 + %3265 = fmul float %2762, %.0.i156, !dbg !150 + %3266 = fmul float %2763, %.0.i156, !dbg !150 + %3267 = fmul float %2764, %.0.i, !dbg !150 + %3268 = fmul float %2765, %.0.i, !dbg !150 + %3269 = fmul float %2766, %.0.i156, !dbg !150 + %3270 = fmul float %2767, %.0.i156, !dbg !150 + %3271 = fmul float %2768, %.0.i, !dbg !150 + %3272 = fmul float %2769, %.0.i, !dbg !150 + %3273 = fmul float %2770, %.0.i156, !dbg !150 + %3274 = fmul float %2771, %.0.i156, !dbg !150 + %3275 = fmul float %2772, %.0.i, !dbg !150 + %3276 = fmul float %2773, %.0.i, !dbg !150 + %3277 = fmul float %2774, %.0.i156, !dbg !150 + %3278 = fmul float %2775, %.0.i156, !dbg !150 + %3279 = fmul float %2776, %.0.i, !dbg !150 + %3280 = fmul float %2777, %.0.i, !dbg !150 + %3281 = fmul float %2778, %.0.i156, !dbg !150 + %3282 = fmul float %2779, %.0.i156, !dbg !150 + %3283 = fmul float %2780, %.0.i, !dbg !150 + %3284 = fmul float %2781, %.0.i, !dbg !150 + %3285 = fmul float %2782, %.0.i156, !dbg !150 + %3286 = fmul float %2783, %.0.i156, !dbg !150 + %3287 = fmul float %2784, %.0.i, !dbg !150 + %3288 = fmul float %2785, %.0.i, !dbg !150 + %3289 = fmul float %2786, %.0.i156, !dbg !150 + %3290 = fmul float %2787, %.0.i156, !dbg !150 + %3291 = fmul float %2788, %.0.i, !dbg !150 + %3292 = fmul float %2789, %.0.i, !dbg !150 + %3293 = fmul float %2790, %.0.i156, !dbg !150 + %3294 = fmul float %2791, %.0.i156, !dbg !150 + %3295 = fmul float %2792, %.0.i, !dbg !150 + %3296 = fmul float %2793, %.0.i, !dbg !150 + %3297 = fmul float %2794, %.0.i156, !dbg !150 + %3298 = fmul float %2795, %.0.i156, !dbg !150 + %3299 = fmul float %2796, %.0.i, !dbg !150 + %3300 = fmul float %2797, %.0.i, !dbg !150 + %3301 = fmul float %2798, %.0.i156, !dbg !150 + %3302 = fmul float %2799, %.0.i156, !dbg !150 + %3303 = fmul float %2800, %.0.i, !dbg !150 + %3304 = fmul float %2801, %.0.i, !dbg !150 + %3305 = fmul float %2802, %.0.i156, !dbg !150 + %3306 = fmul float %2803, %.0.i156, !dbg !150 + %3307 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %2358, !dbg !129 + %3308 = insertelement <2 x float> poison, float %.0.i159, i64 0, !dbg !151 + %3309 = insertelement <2 x float> %3308, float %.0.i162, i64 1, !dbg !151 + %3310 = fptrunc <2 x float> %3309 to <2 x bfloat>, !dbg !151 + %3311 = insertelement <2 x float> poison, float %.0.i165, i64 0, !dbg !151 + %3312 = insertelement <2 x float> %3311, float %.0.i168, i64 1, !dbg !151 + %3313 = fptrunc <2 x float> %3312 to <2 x bfloat>, !dbg !151 + %3314 = insertelement <2 x float> poison, float %.0.i171, i64 0, !dbg !151 + %3315 = insertelement <2 x float> %3314, float %.0.i174, i64 1, !dbg !151 + %3316 = fptrunc <2 x float> %3315 to <2 x bfloat>, !dbg !151 + %3317 = insertelement <2 x float> poison, float %.0.i177, i64 0, !dbg !151 + %3318 = insertelement <2 x float> %3317, float %.0.i180, i64 1, !dbg !151 + %3319 = fptrunc <2 x float> %3318 to <2 x bfloat>, !dbg !151 + %3320 = insertelement <2 x float> poison, float %.0.i183, i64 0, !dbg !151 + %3321 = insertelement <2 x float> %3320, float %.0.i186, i64 1, !dbg !151 + %3322 = fptrunc <2 x float> %3321 to <2 x bfloat>, !dbg !151 + %3323 = insertelement <2 x float> poison, float %.0.i189, i64 0, !dbg !151 + %3324 = insertelement <2 x float> %3323, float %.0.i192, i64 1, !dbg !151 + %3325 = fptrunc <2 x float> %3324 to <2 x bfloat>, !dbg !151 + %3326 = insertelement <2 x float> poison, float %.0.i195, i64 0, !dbg !151 + %3327 = insertelement <2 x float> %3326, float %.0.i198, i64 1, !dbg !151 + %3328 = fptrunc <2 x float> %3327 to <2 x bfloat>, !dbg !151 + %3329 = insertelement <2 x float> poison, float %.0.i201, i64 0, !dbg !151 + %3330 = insertelement <2 x float> %3329, float %.0.i204, i64 1, !dbg !151 + %3331 = fptrunc <2 x float> %3330 to <2 x bfloat>, !dbg !151 + %3332 = insertelement <2 x float> poison, float %.0.i207, i64 0, !dbg !151 + %3333 = insertelement <2 x float> %3332, float %.0.i210, i64 1, !dbg !151 + %3334 = fptrunc <2 x float> %3333 to <2 x bfloat>, !dbg !151 + %3335 = insertelement <2 x float> poison, float %.0.i213, i64 0, !dbg !151 + %3336 = insertelement <2 x float> %3335, float %.0.i216, i64 1, !dbg !151 + %3337 = fptrunc <2 x float> %3336 to <2 x bfloat>, !dbg !151 + %3338 = insertelement <2 x float> poison, float %.0.i219, i64 0, !dbg !151 + %3339 = insertelement <2 x float> %3338, float %.0.i222, i64 1, !dbg !151 + %3340 = fptrunc <2 x float> %3339 to <2 x bfloat>, !dbg !151 + %3341 = insertelement <2 x float> poison, float %.0.i225, i64 0, !dbg !151 + %3342 = insertelement <2 x float> %3341, float %.0.i228, i64 1, !dbg !151 + %3343 = fptrunc <2 x float> %3342 to <2 x bfloat>, !dbg !151 + %3344 = insertelement <2 x float> poison, float %.0.i231, i64 0, !dbg !151 + %3345 = insertelement <2 x float> %3344, float %.0.i234, i64 1, !dbg !151 + %3346 = fptrunc <2 x float> %3345 to <2 x bfloat>, !dbg !151 + %3347 = insertelement <2 x float> poison, float %.0.i237, i64 0, !dbg !151 + %3348 = insertelement <2 x float> %3347, float %.0.i240, i64 1, !dbg !151 + %3349 = fptrunc <2 x float> %3348 to <2 x bfloat>, !dbg !151 + %3350 = insertelement <2 x float> poison, float %.0.i243, i64 0, !dbg !151 + %3351 = insertelement <2 x float> %3350, float %.0.i246, i64 1, !dbg !151 + %3352 = fptrunc <2 x float> %3351 to <2 x bfloat>, !dbg !151 + %3353 = insertelement <2 x float> poison, float %.0.i249, i64 0, !dbg !151 + %3354 = insertelement <2 x float> %3353, float %.0.i252, i64 1, !dbg !151 + %3355 = fptrunc <2 x float> %3354 to <2 x bfloat>, !dbg !151 + %3356 = bitcast <2 x bfloat> %3310 to i32, !dbg !152 + %3357 = bitcast <2 x bfloat> %3313 to i32, !dbg !152 + %3358 = bitcast <2 x bfloat> %3316 to i32, !dbg !152 + %3359 = bitcast <2 x bfloat> %3319 to i32, !dbg !152 + %3360 = bitcast <2 x bfloat> %3322 to i32, !dbg !152 + %3361 = bitcast <2 x bfloat> %3325 to i32, !dbg !152 + %3362 = bitcast <2 x bfloat> %3328 to i32, !dbg !152 + %3363 = bitcast <2 x bfloat> %3331 to i32, !dbg !152 + %3364 = bitcast <2 x bfloat> %3334 to i32, !dbg !152 + %3365 = bitcast <2 x bfloat> %3337 to i32, !dbg !152 + %3366 = bitcast <2 x bfloat> %3340 to i32, !dbg !152 + %3367 = bitcast <2 x bfloat> %3343 to i32, !dbg !152 + %3368 = bitcast <2 x bfloat> %3346 to i32, !dbg !152 + %3369 = bitcast <2 x bfloat> %3349 to i32, !dbg !152 + %3370 = bitcast <2 x bfloat> %3352 to i32, !dbg !152 + %3371 = bitcast <2 x bfloat> %3355 to i32, !dbg !152 + tail call void @llvm.nvvm.wgmma.fence.sync.aligned(), !dbg !152 + %3372 = ptrtoint ptr addrspace(3) %3307 to i32, !dbg !152 + %3373 = lshr exact i32 %3372, 4, !dbg !152 + %3374 = and i32 %3373, 16383, !dbg !152 + %3375 = zext nneg i32 %3374 to i64, !dbg !152 + %3376 = or disjoint i64 %3375, 4611686293338849280, !dbg !152 + %3377 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3243, float %3244, float %3245, float %3246, float %3247, float %3248, float %3249, float %3250, float %3251, float %3252, float %3253, float %3254, float %3255, float %3256, float %3257, float %3258, float %3259, float %3260, float %3261, float %3262, float %3263, float %3264, float %3265, float %3266, float %3267, float %3268, float %3269, float %3270, float %3271, float %3272, float %3273, float %3274, float %3275, float %3276, float %3277, float %3278, float %3279, float %3280, float %3281, float %3282, float %3283, float %3284, float %3285, float %3286, float %3287, float %3288, float %3289, float %3290, float %3291, float %3292, float %3293, float %3294, float %3295, float %3296, float %3297, float %3298, float %3299, float %3300, float %3301, float %3302, float %3303, float %3304, float %3305, float %3306, i32 %3356, i32 %3357, i32 %3358, i32 %3359, i64 %3376, i1 true) #2, !dbg !152 + %3378 = add i32 %3372, 2048, !dbg !152 + %3379 = lshr exact i32 %3378, 4, !dbg !152 + %3380 = and i32 %3379, 16383, !dbg !152 + %3381 = zext nneg i32 %3380 to i64, !dbg !152 + %3382 = or disjoint i64 %3381, 4611686293338849280, !dbg !152 + %3383 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 0, !dbg !152 + %3384 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 1, !dbg !152 + %3385 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 2, !dbg !152 + %3386 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 3, !dbg !152 + %3387 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 4, !dbg !152 + %3388 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 5, !dbg !152 + %3389 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 6, !dbg !152 + %3390 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 7, !dbg !152 + %3391 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 8, !dbg !152 + %3392 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 9, !dbg !152 + %3393 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 10, !dbg !152 + %3394 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 11, !dbg !152 + %3395 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 12, !dbg !152 + %3396 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 13, !dbg !152 + %3397 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 14, !dbg !152 + %3398 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 15, !dbg !152 + %3399 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 16, !dbg !152 + %3400 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 17, !dbg !152 + %3401 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 18, !dbg !152 + %3402 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 19, !dbg !152 + %3403 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 20, !dbg !152 + %3404 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 21, !dbg !152 + %3405 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 22, !dbg !152 + %3406 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 23, !dbg !152 + %3407 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 24, !dbg !152 + %3408 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 25, !dbg !152 + %3409 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 26, !dbg !152 + %3410 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 27, !dbg !152 + %3411 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 28, !dbg !152 + %3412 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 29, !dbg !152 + %3413 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 30, !dbg !152 + %3414 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 31, !dbg !152 + %3415 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 32, !dbg !152 + %3416 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 33, !dbg !152 + %3417 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 34, !dbg !152 + %3418 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 35, !dbg !152 + %3419 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 36, !dbg !152 + %3420 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 37, !dbg !152 + %3421 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 38, !dbg !152 + %3422 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 39, !dbg !152 + %3423 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 40, !dbg !152 + %3424 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 41, !dbg !152 + %3425 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 42, !dbg !152 + %3426 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 43, !dbg !152 + %3427 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 44, !dbg !152 + %3428 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 45, !dbg !152 + %3429 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 46, !dbg !152 + %3430 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 47, !dbg !152 + %3431 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 48, !dbg !152 + %3432 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 49, !dbg !152 + %3433 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 50, !dbg !152 + %3434 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 51, !dbg !152 + %3435 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 52, !dbg !152 + %3436 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 53, !dbg !152 + %3437 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 54, !dbg !152 + %3438 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 55, !dbg !152 + %3439 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 56, !dbg !152 + %3440 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 57, !dbg !152 + %3441 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 58, !dbg !152 + %3442 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 59, !dbg !152 + %3443 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 60, !dbg !152 + %3444 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 61, !dbg !152 + %3445 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 62, !dbg !152 + %3446 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3377, 63, !dbg !152 + %3447 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3383, float %3384, float %3385, float %3386, float %3387, float %3388, float %3389, float %3390, float %3391, float %3392, float %3393, float %3394, float %3395, float %3396, float %3397, float %3398, float %3399, float %3400, float %3401, float %3402, float %3403, float %3404, float %3405, float %3406, float %3407, float %3408, float %3409, float %3410, float %3411, float %3412, float %3413, float %3414, float %3415, float %3416, float %3417, float %3418, float %3419, float %3420, float %3421, float %3422, float %3423, float %3424, float %3425, float %3426, float %3427, float %3428, float %3429, float %3430, float %3431, float %3432, float %3433, float %3434, float %3435, float %3436, float %3437, float %3438, float %3439, float %3440, float %3441, float %3442, float %3443, float %3444, float %3445, float %3446, i32 %3360, i32 %3361, i32 %3362, i32 %3363, i64 %3382, i1 true) #2, !dbg !152 + %3448 = add i32 %3372, 4096, !dbg !152 + %3449 = lshr exact i32 %3448, 4, !dbg !152 + %3450 = and i32 %3449, 16383, !dbg !152 + %3451 = zext nneg i32 %3450 to i64, !dbg !152 + %3452 = or disjoint i64 %3451, 4611686293338849280, !dbg !152 + %3453 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 0, !dbg !152 + %3454 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 1, !dbg !152 + %3455 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 2, !dbg !152 + %3456 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 3, !dbg !152 + %3457 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 4, !dbg !152 + %3458 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 5, !dbg !152 + %3459 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 6, !dbg !152 + %3460 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 7, !dbg !152 + %3461 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 8, !dbg !152 + %3462 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 9, !dbg !152 + %3463 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 10, !dbg !152 + %3464 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 11, !dbg !152 + %3465 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 12, !dbg !152 + %3466 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 13, !dbg !152 + %3467 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 14, !dbg !152 + %3468 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 15, !dbg !152 + %3469 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 16, !dbg !152 + %3470 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 17, !dbg !152 + %3471 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 18, !dbg !152 + %3472 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 19, !dbg !152 + %3473 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 20, !dbg !152 + %3474 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 21, !dbg !152 + %3475 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 22, !dbg !152 + %3476 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 23, !dbg !152 + %3477 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 24, !dbg !152 + %3478 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 25, !dbg !152 + %3479 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 26, !dbg !152 + %3480 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 27, !dbg !152 + %3481 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 28, !dbg !152 + %3482 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 29, !dbg !152 + %3483 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 30, !dbg !152 + %3484 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 31, !dbg !152 + %3485 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 32, !dbg !152 + %3486 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 33, !dbg !152 + %3487 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 34, !dbg !152 + %3488 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 35, !dbg !152 + %3489 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 36, !dbg !152 + %3490 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 37, !dbg !152 + %3491 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 38, !dbg !152 + %3492 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 39, !dbg !152 + %3493 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 40, !dbg !152 + %3494 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 41, !dbg !152 + %3495 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 42, !dbg !152 + %3496 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 43, !dbg !152 + %3497 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 44, !dbg !152 + %3498 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 45, !dbg !152 + %3499 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 46, !dbg !152 + %3500 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 47, !dbg !152 + %3501 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 48, !dbg !152 + %3502 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 49, !dbg !152 + %3503 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 50, !dbg !152 + %3504 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 51, !dbg !152 + %3505 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 52, !dbg !152 + %3506 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 53, !dbg !152 + %3507 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 54, !dbg !152 + %3508 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 55, !dbg !152 + %3509 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 56, !dbg !152 + %3510 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 57, !dbg !152 + %3511 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 58, !dbg !152 + %3512 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 59, !dbg !152 + %3513 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 60, !dbg !152 + %3514 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 61, !dbg !152 + %3515 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 62, !dbg !152 + %3516 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3447, 63, !dbg !152 + %3517 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3453, float %3454, float %3455, float %3456, float %3457, float %3458, float %3459, float %3460, float %3461, float %3462, float %3463, float %3464, float %3465, float %3466, float %3467, float %3468, float %3469, float %3470, float %3471, float %3472, float %3473, float %3474, float %3475, float %3476, float %3477, float %3478, float %3479, float %3480, float %3481, float %3482, float %3483, float %3484, float %3485, float %3486, float %3487, float %3488, float %3489, float %3490, float %3491, float %3492, float %3493, float %3494, float %3495, float %3496, float %3497, float %3498, float %3499, float %3500, float %3501, float %3502, float %3503, float %3504, float %3505, float %3506, float %3507, float %3508, float %3509, float %3510, float %3511, float %3512, float %3513, float %3514, float %3515, float %3516, i32 %3364, i32 %3365, i32 %3366, i32 %3367, i64 %3452, i1 true) #2, !dbg !152 + %3518 = add i32 %3372, 6144, !dbg !152 + %3519 = lshr exact i32 %3518, 4, !dbg !152 + %3520 = and i32 %3519, 16383, !dbg !152 + %3521 = zext nneg i32 %3520 to i64, !dbg !152 + %3522 = or disjoint i64 %3521, 4611686293338849280, !dbg !152 + %3523 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 0, !dbg !152 + %3524 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 1, !dbg !152 + %3525 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 2, !dbg !152 + %3526 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 3, !dbg !152 + %3527 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 4, !dbg !152 + %3528 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 5, !dbg !152 + %3529 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 6, !dbg !152 + %3530 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 7, !dbg !152 + %3531 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 8, !dbg !152 + %3532 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 9, !dbg !152 + %3533 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 10, !dbg !152 + %3534 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 11, !dbg !152 + %3535 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 12, !dbg !152 + %3536 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 13, !dbg !152 + %3537 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 14, !dbg !152 + %3538 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 15, !dbg !152 + %3539 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 16, !dbg !152 + %3540 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 17, !dbg !152 + %3541 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 18, !dbg !152 + %3542 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 19, !dbg !152 + %3543 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 20, !dbg !152 + %3544 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 21, !dbg !152 + %3545 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 22, !dbg !152 + %3546 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 23, !dbg !152 + %3547 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 24, !dbg !152 + %3548 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 25, !dbg !152 + %3549 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 26, !dbg !152 + %3550 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 27, !dbg !152 + %3551 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 28, !dbg !152 + %3552 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 29, !dbg !152 + %3553 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 30, !dbg !152 + %3554 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 31, !dbg !152 + %3555 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 32, !dbg !152 + %3556 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 33, !dbg !152 + %3557 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 34, !dbg !152 + %3558 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 35, !dbg !152 + %3559 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 36, !dbg !152 + %3560 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 37, !dbg !152 + %3561 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 38, !dbg !152 + %3562 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 39, !dbg !152 + %3563 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 40, !dbg !152 + %3564 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 41, !dbg !152 + %3565 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 42, !dbg !152 + %3566 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 43, !dbg !152 + %3567 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 44, !dbg !152 + %3568 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 45, !dbg !152 + %3569 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 46, !dbg !152 + %3570 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 47, !dbg !152 + %3571 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 48, !dbg !152 + %3572 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 49, !dbg !152 + %3573 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 50, !dbg !152 + %3574 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 51, !dbg !152 + %3575 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 52, !dbg !152 + %3576 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 53, !dbg !152 + %3577 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 54, !dbg !152 + %3578 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 55, !dbg !152 + %3579 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 56, !dbg !152 + %3580 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 57, !dbg !152 + %3581 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 58, !dbg !152 + %3582 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 59, !dbg !152 + %3583 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 60, !dbg !152 + %3584 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 61, !dbg !152 + %3585 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 62, !dbg !152 + %3586 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3517, 63, !dbg !152 + %3587 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {$0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63}, {$128,$129,$130,$131}, $132, $133, 1, 1, 1;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,r,r,r,r,l,b"(float %3523, float %3524, float %3525, float %3526, float %3527, float %3528, float %3529, float %3530, float %3531, float %3532, float %3533, float %3534, float %3535, float %3536, float %3537, float %3538, float %3539, float %3540, float %3541, float %3542, float %3543, float %3544, float %3545, float %3546, float %3547, float %3548, float %3549, float %3550, float %3551, float %3552, float %3553, float %3554, float %3555, float %3556, float %3557, float %3558, float %3559, float %3560, float %3561, float %3562, float %3563, float %3564, float %3565, float %3566, float %3567, float %3568, float %3569, float %3570, float %3571, float %3572, float %3573, float %3574, float %3575, float %3576, float %3577, float %3578, float %3579, float %3580, float %3581, float %3582, float %3583, float %3584, float %3585, float %3586, i32 %3368, i32 %3369, i32 %3370, i32 %3371, i64 %3522, i1 true) #2, !dbg !152 + %3588 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 0, !dbg !152 + %3589 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 1, !dbg !152 + %3590 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 2, !dbg !152 + %3591 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 3, !dbg !152 + %3592 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 4, !dbg !152 + %3593 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 5, !dbg !152 + %3594 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 6, !dbg !152 + %3595 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 7, !dbg !152 + %3596 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 8, !dbg !152 + %3597 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 9, !dbg !152 + %3598 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 10, !dbg !152 + %3599 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 11, !dbg !152 + %3600 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 12, !dbg !152 + %3601 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 13, !dbg !152 + %3602 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 14, !dbg !152 + %3603 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 15, !dbg !152 + %3604 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 16, !dbg !152 + %3605 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 17, !dbg !152 + %3606 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 18, !dbg !152 + %3607 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 19, !dbg !152 + %3608 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 20, !dbg !152 + %3609 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 21, !dbg !152 + %3610 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 22, !dbg !152 + %3611 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 23, !dbg !152 + %3612 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 24, !dbg !152 + %3613 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 25, !dbg !152 + %3614 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 26, !dbg !152 + %3615 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 27, !dbg !152 + %3616 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 28, !dbg !152 + %3617 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 29, !dbg !152 + %3618 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 30, !dbg !152 + %3619 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 31, !dbg !152 + %3620 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 32, !dbg !152 + %3621 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 33, !dbg !152 + %3622 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 34, !dbg !152 + %3623 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 35, !dbg !152 + %3624 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 36, !dbg !152 + %3625 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 37, !dbg !152 + %3626 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 38, !dbg !152 + %3627 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 39, !dbg !152 + %3628 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 40, !dbg !152 + %3629 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 41, !dbg !152 + %3630 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 42, !dbg !152 + %3631 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 43, !dbg !152 + %3632 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 44, !dbg !152 + %3633 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 45, !dbg !152 + %3634 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 46, !dbg !152 + %3635 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 47, !dbg !152 + %3636 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 48, !dbg !152 + %3637 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 49, !dbg !152 + %3638 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 50, !dbg !152 + %3639 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 51, !dbg !152 + %3640 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 52, !dbg !152 + %3641 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 53, !dbg !152 + %3642 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 54, !dbg !152 + %3643 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 55, !dbg !152 + %3644 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 56, !dbg !152 + %3645 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 57, !dbg !152 + %3646 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 58, !dbg !152 + %3647 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 59, !dbg !152 + %3648 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 60, !dbg !152 + %3649 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 61, !dbg !152 + %3650 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 62, !dbg !152 + %3651 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3587, 63, !dbg !152 + tail call void @llvm.nvvm.wgmma.commit_group.sync.aligned(), !dbg !152 + %3652 = insertelement <16 x i32> poison, i32 %2346, i64 0, !dbg !153 + %3653 = shufflevector <16 x i32> %3652, <16 x i32> poison, <16 x i32> zeroinitializer, !dbg !153 + %3654 = add <16 x i32> %3653, %2352, !dbg !153 + %3655 = add nuw nsw i32 %2350, 1, !dbg !122 + %3656 = lshr i32 %3655, 1, !dbg !154 + %3657 = zext nneg i32 %3656 to i64, !dbg !155 + %3658 = getelementptr i32, ptr addrspace(1) %2115, i64 %3657, !dbg !155 + %3659 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !156 + %3660 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3658, i64 %3659, i1 %2354) #2, !dbg !156 + %3661 = add nuw nsw i32 %3656, 1, !dbg !157 + %3662 = icmp slt i32 %3661, %2119, !dbg !158 + %3663 = getelementptr i8, ptr addrspace(1) %3658, i64 4, !dbg !159 + %3664 = and i1 %2354, %3662, !dbg !122 + %3665 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #2, !dbg !160 + %3666 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %3663, i64 %3665, i1 %3664) #2, !dbg !160 + %3667 = and i32 %2350, 1, !dbg !161 + %3668 = sub i32 %3666, %3660, !dbg !162 + %3669 = shl i32 %3668, 7, !dbg !163 + %3670 = add i32 %3669, -64, !dbg !164 + %3671 = xor i32 %3667, 1, !dbg !165 + %3672 = mul nuw nsw i32 %3670, %3671, !dbg !165 + %3673 = shl nuw nsw i32 %3667, 6, !dbg !166 + %3674 = add i32 %3672, %3673, !dbg !167 + %3675 = add i32 %3674, %2349, !dbg !168 + %3676 = add i32 %2348, 1, !dbg !122 + %3677 = icmp sgt i32 %3676, 2, !dbg !122 + %3678 = select i1 %3677, i32 0, i32 %3676, !dbg !122 + %3679 = add i32 %3675, %2117, !dbg !130 + %3680 = add i32 %3679, %37, !dbg !124 + %3681 = add i32 %3679, %38, !dbg !124 + %3682 = add i32 %3679, %39, !dbg !124 + %3683 = add i32 %3679, %40, !dbg !124 + %3684 = shl i32 %3680, 7, !dbg !125 + %3685 = shl i32 %3681, 7, !dbg !125 + %3686 = shl i32 %3682, 7, !dbg !125 + %3687 = shl i32 %3683, 7, !dbg !125 + %3688 = sext i32 %3684 to i64, !dbg !126 + %3689 = sext i32 %3685 to i64, !dbg !126 + %3690 = sext i32 %3686 to i64, !dbg !126 + %3691 = sext i32 %3687 to i64, !dbg !126 + %gep428 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3688, !dbg !127 + %gep430 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3689, !dbg !127 + %gep432 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3690, !dbg !127 + %gep434 = getelementptr bfloat, ptr addrspace(1) %invariant.gep, i64 %3691, !dbg !127 + %3692 = icmp slt i32 %3680, %11, !dbg !128 + %3693 = icmp slt i32 %3681, %11, !dbg !128 + %3694 = icmp slt i32 %3682, %11, !dbg !128 + %3695 = icmp slt i32 %3683, %11, !dbg !128 + %3696 = shl i32 %3678, 13, !dbg !129 + %3697 = getelementptr bfloat, ptr addrspace(3) @global_smem, i32 %3696, !dbg !129 + %3698 = and i1 %2353, %3692, !dbg !122 + %3699 = and i1 %2353, %3693, !dbg !122 + %3700 = and i1 %2353, %3694, !dbg !122 + %3701 = and i1 %2353, %3695, !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !129 + %3702 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %255, !dbg !129 + %3703 = select i1 %3698, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3702, ptr addrspace(1) %gep428, i32 %3703) #2, !dbg !129 + %3704 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %258, !dbg !129 + %3705 = select i1 %3699, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3704, ptr addrspace(1) %gep430, i32 %3705) #2, !dbg !129 + %3706 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %261, !dbg !129 + %3707 = select i1 %3700, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3706, ptr addrspace(1) %gep432, i32 %3707) #2, !dbg !129 + %3708 = getelementptr inbounds nuw i8, ptr addrspace(3) %3697, i32 %264, !dbg !129 + %3709 = select i1 %3701, i32 16, i32 0, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3708, ptr addrspace(1) %gep434, i32 %3709) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %gep436 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3688, !dbg !127 + %gep438 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3689, !dbg !127 + %gep440 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3690, !dbg !127 + %gep442 = getelementptr bfloat, ptr addrspace(1) %invariant.gep401, i64 %3691, !dbg !127 + %3710 = getelementptr bfloat, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 49152), i32 %3696, !dbg !129 + %3711 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %255, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) %3711, ptr addrspace(1) %gep436, i32 %3703) #2, !dbg !129 + %3712 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %258, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3712, ptr addrspace(1) %gep438, i32 %3705) #2, !dbg !129 + %3713 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %261, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3713, ptr addrspace(1) %gep440, i32 %3707) #2, !dbg !129 + %3714 = getelementptr inbounds nuw i8, ptr addrspace(3) %3710, i32 %264, !dbg !129 + tail call void asm sideeffect "cp.async.cg.shared.global [ $0 + 0 ], [ $1 + 0 ], 0x10, $2;", "r,l,r"(ptr addrspace(3) nonnull %3714, ptr addrspace(1) %gep442, i32 %3709) #2, !dbg !129 + tail call void @llvm.nvvm.cp.async.commit.group(), !dbg !129 + %exitcond472.not = icmp eq i32 %3655, %smax471, !dbg !122 + br i1 %exitcond472.not, label %._crit_edge461, label %2345, !dbg !122 + +._crit_edge461: ; preds = %__nv_exp2f.exit253, %._crit_edge + %3715 = phi float [ %2205, %._crit_edge ], [ %3588, %__nv_exp2f.exit253 ], !dbg !50 + %3716 = phi float [ %2206, %._crit_edge ], [ %3589, %__nv_exp2f.exit253 ], !dbg !50 + %3717 = phi float [ %2207, %._crit_edge ], [ %3590, %__nv_exp2f.exit253 ], !dbg !50 + %3718 = phi float [ %2208, %._crit_edge ], [ %3591, %__nv_exp2f.exit253 ], !dbg !50 + %3719 = phi float [ %2209, %._crit_edge ], [ %3592, %__nv_exp2f.exit253 ], !dbg !50 + %3720 = phi float [ %2210, %._crit_edge ], [ %3593, %__nv_exp2f.exit253 ], !dbg !50 + %3721 = phi float [ %2211, %._crit_edge ], [ %3594, %__nv_exp2f.exit253 ], !dbg !50 + %3722 = phi float [ %2212, %._crit_edge ], [ %3595, %__nv_exp2f.exit253 ], !dbg !50 + %3723 = phi float [ %2213, %._crit_edge ], [ %3596, %__nv_exp2f.exit253 ], !dbg !50 + %3724 = phi float [ %2214, %._crit_edge ], [ %3597, %__nv_exp2f.exit253 ], !dbg !50 + %3725 = phi float [ %2215, %._crit_edge ], [ %3598, %__nv_exp2f.exit253 ], !dbg !50 + %3726 = phi float [ %2216, %._crit_edge ], [ %3599, %__nv_exp2f.exit253 ], !dbg !50 + %3727 = phi float [ %2217, %._crit_edge ], [ %3600, %__nv_exp2f.exit253 ], !dbg !50 + %3728 = phi float [ %2218, %._crit_edge ], [ %3601, %__nv_exp2f.exit253 ], !dbg !50 + %3729 = phi float [ %2219, %._crit_edge ], [ %3602, %__nv_exp2f.exit253 ], !dbg !50 + %3730 = phi float [ %2220, %._crit_edge ], [ %3603, %__nv_exp2f.exit253 ], !dbg !50 + %3731 = phi float [ %2221, %._crit_edge ], [ %3604, %__nv_exp2f.exit253 ], !dbg !50 + %3732 = phi float [ %2222, %._crit_edge ], [ %3605, %__nv_exp2f.exit253 ], !dbg !50 + %3733 = phi float [ %2223, %._crit_edge ], [ %3606, %__nv_exp2f.exit253 ], !dbg !50 + %3734 = phi float [ %2224, %._crit_edge ], [ %3607, %__nv_exp2f.exit253 ], !dbg !50 + %3735 = phi float [ %2225, %._crit_edge ], [ %3608, %__nv_exp2f.exit253 ], !dbg !50 + %3736 = phi float [ %2226, %._crit_edge ], [ %3609, %__nv_exp2f.exit253 ], !dbg !50 + %3737 = phi float [ %2227, %._crit_edge ], [ %3610, %__nv_exp2f.exit253 ], !dbg !50 + %3738 = phi float [ %2228, %._crit_edge ], [ %3611, %__nv_exp2f.exit253 ], !dbg !50 + %3739 = phi float [ %2229, %._crit_edge ], [ %3612, %__nv_exp2f.exit253 ], !dbg !50 + %3740 = phi float [ %2230, %._crit_edge ], [ %3613, %__nv_exp2f.exit253 ], !dbg !50 + %3741 = phi float [ %2231, %._crit_edge ], [ %3614, %__nv_exp2f.exit253 ], !dbg !50 + %3742 = phi float [ %2232, %._crit_edge ], [ %3615, %__nv_exp2f.exit253 ], !dbg !50 + %3743 = phi float [ %2233, %._crit_edge ], [ %3616, %__nv_exp2f.exit253 ], !dbg !50 + %3744 = phi float [ %2234, %._crit_edge ], [ %3617, %__nv_exp2f.exit253 ], !dbg !50 + %3745 = phi float [ %2235, %._crit_edge ], [ %3618, %__nv_exp2f.exit253 ], !dbg !50 + %3746 = phi float [ %2236, %._crit_edge ], [ %3619, %__nv_exp2f.exit253 ], !dbg !50 + %3747 = phi float [ %2237, %._crit_edge ], [ %3620, %__nv_exp2f.exit253 ], !dbg !50 + %3748 = phi float [ %2238, %._crit_edge ], [ %3621, %__nv_exp2f.exit253 ], !dbg !50 + %3749 = phi float [ %2239, %._crit_edge ], [ %3622, %__nv_exp2f.exit253 ], !dbg !50 + %3750 = phi float [ %2240, %._crit_edge ], [ %3623, %__nv_exp2f.exit253 ], !dbg !50 + %3751 = phi float [ %2241, %._crit_edge ], [ %3624, %__nv_exp2f.exit253 ], !dbg !50 + %3752 = phi float [ %2242, %._crit_edge ], [ %3625, %__nv_exp2f.exit253 ], !dbg !50 + %3753 = phi float [ %2243, %._crit_edge ], [ %3626, %__nv_exp2f.exit253 ], !dbg !50 + %3754 = phi float [ %2244, %._crit_edge ], [ %3627, %__nv_exp2f.exit253 ], !dbg !50 + %3755 = phi float [ %2245, %._crit_edge ], [ %3628, %__nv_exp2f.exit253 ], !dbg !50 + %3756 = phi float [ %2246, %._crit_edge ], [ %3629, %__nv_exp2f.exit253 ], !dbg !50 + %3757 = phi float [ %2247, %._crit_edge ], [ %3630, %__nv_exp2f.exit253 ], !dbg !50 + %3758 = phi float [ %2248, %._crit_edge ], [ %3631, %__nv_exp2f.exit253 ], !dbg !50 + %3759 = phi float [ %2249, %._crit_edge ], [ %3632, %__nv_exp2f.exit253 ], !dbg !50 + %3760 = phi float [ %2250, %._crit_edge ], [ %3633, %__nv_exp2f.exit253 ], !dbg !50 + %3761 = phi float [ %2251, %._crit_edge ], [ %3634, %__nv_exp2f.exit253 ], !dbg !50 + %3762 = phi float [ %2252, %._crit_edge ], [ %3635, %__nv_exp2f.exit253 ], !dbg !50 + %3763 = phi float [ %2253, %._crit_edge ], [ %3636, %__nv_exp2f.exit253 ], !dbg !50 + %3764 = phi float [ %2254, %._crit_edge ], [ %3637, %__nv_exp2f.exit253 ], !dbg !50 + %3765 = phi float [ %2255, %._crit_edge ], [ %3638, %__nv_exp2f.exit253 ], !dbg !50 + %3766 = phi float [ %2256, %._crit_edge ], [ %3639, %__nv_exp2f.exit253 ], !dbg !50 + %3767 = phi float [ %2257, %._crit_edge ], [ %3640, %__nv_exp2f.exit253 ], !dbg !50 + %3768 = phi float [ %2258, %._crit_edge ], [ %3641, %__nv_exp2f.exit253 ], !dbg !50 + %3769 = phi float [ %2259, %._crit_edge ], [ %3642, %__nv_exp2f.exit253 ], !dbg !50 + %3770 = phi float [ %2260, %._crit_edge ], [ %3643, %__nv_exp2f.exit253 ], !dbg !50 + %3771 = phi float [ %2261, %._crit_edge ], [ %3644, %__nv_exp2f.exit253 ], !dbg !50 + %3772 = phi float [ %2262, %._crit_edge ], [ %3645, %__nv_exp2f.exit253 ], !dbg !50 + %3773 = phi float [ %2263, %._crit_edge ], [ %3646, %__nv_exp2f.exit253 ], !dbg !50 + %3774 = phi float [ %2264, %._crit_edge ], [ %3647, %__nv_exp2f.exit253 ], !dbg !50 + %3775 = phi float [ %2265, %._crit_edge ], [ %3648, %__nv_exp2f.exit253 ], !dbg !50 + %3776 = phi float [ %2266, %._crit_edge ], [ %3649, %__nv_exp2f.exit253 ], !dbg !50 + %3777 = phi float [ %2267, %._crit_edge ], [ %3650, %__nv_exp2f.exit253 ], !dbg !50 + %3778 = phi float [ %2268, %._crit_edge ], [ %3651, %__nv_exp2f.exit253 ], !dbg !50 + %3779 = phi float [ %2111, %._crit_edge ], [ %3241, %__nv_exp2f.exit253 ], !dbg !50 + %3780 = phi float [ %2112, %._crit_edge ], [ %3242, %__nv_exp2f.exit253 ], !dbg !50 + %3781 = phi <2 x float> [ %2113, %._crit_edge ], [ %2980, %__nv_exp2f.exit253 ], !dbg !50 + %3782 = tail call { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } asm sideeffect "// wait for regs: $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24,$25,$26,$27,$28,$29,$30,$31,$32,$33,$34,$35,$36,$37,$38,$39,$40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$50,$51,$52,$53,$54,$55,$56,$57,$58,$59,$60,$61,$62,$63\0A\09wgmma.wait_group.sync.aligned 0;", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63"(float %3715, float %3716, float %3717, float %3718, float %3719, float %3720, float %3721, float %3722, float %3723, float %3724, float %3725, float %3726, float %3727, float %3728, float %3729, float %3730, float %3731, float %3732, float %3733, float %3734, float %3735, float %3736, float %3737, float %3738, float %3739, float %3740, float %3741, float %3742, float %3743, float %3744, float %3745, float %3746, float %3747, float %3748, float %3749, float %3750, float %3751, float %3752, float %3753, float %3754, float %3755, float %3756, float %3757, float %3758, float %3759, float %3760, float %3761, float %3762, float %3763, float %3764, float %3765, float %3766, float %3767, float %3768, float %3769, float %3770, float %3771, float %3772, float %3773, float %3774, float %3775, float %3776, float %3777, float %3778) #2, !dbg !122 + tail call void @llvm.nvvm.cp.async.wait.group(i32 0), !dbg !122 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !122 + %3783 = fcmp oeq float %3779, 0.000000e+00, !dbg !169 + %3784 = fcmp oeq float %3780, 0.000000e+00, !dbg !169 + %3785 = select i1 %3783, float 1.000000e+00, float %3779, !dbg !170 + %3786 = select i1 %3784, float 1.000000e+00, float %3780, !dbg !170 + %3787 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 0, !dbg !171 + %3788 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 1, !dbg !171 + %3789 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 2, !dbg !171 + %3790 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 3, !dbg !171 + %3791 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 4, !dbg !171 + %3792 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 5, !dbg !171 + %3793 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 6, !dbg !171 + %3794 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 7, !dbg !171 + %3795 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 8, !dbg !171 + %3796 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 9, !dbg !171 + %3797 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 10, !dbg !171 + %3798 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 11, !dbg !171 + %3799 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 12, !dbg !171 + %3800 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 13, !dbg !171 + %3801 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 14, !dbg !171 + %3802 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 15, !dbg !171 + %3803 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 16, !dbg !171 + %3804 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 17, !dbg !171 + %3805 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 18, !dbg !171 + %3806 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 19, !dbg !171 + %3807 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 20, !dbg !171 + %3808 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 21, !dbg !171 + %3809 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 22, !dbg !171 + %3810 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 23, !dbg !171 + %3811 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 24, !dbg !171 + %3812 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 25, !dbg !171 + %3813 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 26, !dbg !171 + %3814 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 27, !dbg !171 + %3815 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 28, !dbg !171 + %3816 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 29, !dbg !171 + %3817 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 30, !dbg !171 + %3818 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 31, !dbg !171 + %3819 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 32, !dbg !171 + %3820 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 33, !dbg !171 + %3821 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 34, !dbg !171 + %3822 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 35, !dbg !171 + %3823 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 36, !dbg !171 + %3824 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 37, !dbg !171 + %3825 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 38, !dbg !171 + %3826 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 39, !dbg !171 + %3827 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 40, !dbg !171 + %3828 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 41, !dbg !171 + %3829 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 42, !dbg !171 + %3830 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 43, !dbg !171 + %3831 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 44, !dbg !171 + %3832 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 45, !dbg !171 + %3833 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 46, !dbg !171 + %3834 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 47, !dbg !171 + %3835 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 48, !dbg !171 + %3836 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 49, !dbg !171 + %3837 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 50, !dbg !171 + %3838 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 51, !dbg !171 + %3839 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 52, !dbg !171 + %3840 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 53, !dbg !171 + %3841 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 54, !dbg !171 + %3842 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 55, !dbg !171 + %3843 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 56, !dbg !171 + %3844 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 57, !dbg !171 + %3845 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 58, !dbg !171 + %3846 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 59, !dbg !171 + %3847 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 60, !dbg !171 + %3848 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 61, !dbg !171 + %3849 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 62, !dbg !171 + %3850 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %3782, 63, !dbg !171 + %3851 = tail call float @llvm.nvvm.div.full(float %3787, float %3785), !dbg !171 + %3852 = tail call float @llvm.nvvm.div.full(float %3788, float %3785), !dbg !171 + %3853 = tail call float @llvm.nvvm.div.full(float %3789, float %3786), !dbg !171 + %3854 = tail call float @llvm.nvvm.div.full(float %3790, float %3786), !dbg !171 + %3855 = tail call float @llvm.nvvm.div.full(float %3791, float %3785), !dbg !171 + %3856 = tail call float @llvm.nvvm.div.full(float %3792, float %3785), !dbg !171 + %3857 = tail call float @llvm.nvvm.div.full(float %3793, float %3786), !dbg !171 + %3858 = tail call float @llvm.nvvm.div.full(float %3794, float %3786), !dbg !171 + %3859 = tail call float @llvm.nvvm.div.full(float %3795, float %3785), !dbg !171 + %3860 = tail call float @llvm.nvvm.div.full(float %3796, float %3785), !dbg !171 + %3861 = tail call float @llvm.nvvm.div.full(float %3797, float %3786), !dbg !171 + %3862 = tail call float @llvm.nvvm.div.full(float %3798, float %3786), !dbg !171 + %3863 = tail call float @llvm.nvvm.div.full(float %3799, float %3785), !dbg !171 + %3864 = tail call float @llvm.nvvm.div.full(float %3800, float %3785), !dbg !171 + %3865 = tail call float @llvm.nvvm.div.full(float %3801, float %3786), !dbg !171 + %3866 = tail call float @llvm.nvvm.div.full(float %3802, float %3786), !dbg !171 + %3867 = tail call float @llvm.nvvm.div.full(float %3803, float %3785), !dbg !171 + %3868 = tail call float @llvm.nvvm.div.full(float %3804, float %3785), !dbg !171 + %3869 = tail call float @llvm.nvvm.div.full(float %3805, float %3786), !dbg !171 + %3870 = tail call float @llvm.nvvm.div.full(float %3806, float %3786), !dbg !171 + %3871 = tail call float @llvm.nvvm.div.full(float %3807, float %3785), !dbg !171 + %3872 = tail call float @llvm.nvvm.div.full(float %3808, float %3785), !dbg !171 + %3873 = tail call float @llvm.nvvm.div.full(float %3809, float %3786), !dbg !171 + %3874 = tail call float @llvm.nvvm.div.full(float %3810, float %3786), !dbg !171 + %3875 = tail call float @llvm.nvvm.div.full(float %3811, float %3785), !dbg !171 + %3876 = tail call float @llvm.nvvm.div.full(float %3812, float %3785), !dbg !171 + %3877 = tail call float @llvm.nvvm.div.full(float %3813, float %3786), !dbg !171 + %3878 = tail call float @llvm.nvvm.div.full(float %3814, float %3786), !dbg !171 + %3879 = tail call float @llvm.nvvm.div.full(float %3815, float %3785), !dbg !171 + %3880 = tail call float @llvm.nvvm.div.full(float %3816, float %3785), !dbg !171 + %3881 = tail call float @llvm.nvvm.div.full(float %3817, float %3786), !dbg !171 + %3882 = tail call float @llvm.nvvm.div.full(float %3818, float %3786), !dbg !171 + %3883 = tail call float @llvm.nvvm.div.full(float %3819, float %3785), !dbg !171 + %3884 = tail call float @llvm.nvvm.div.full(float %3820, float %3785), !dbg !171 + %3885 = tail call float @llvm.nvvm.div.full(float %3821, float %3786), !dbg !171 + %3886 = tail call float @llvm.nvvm.div.full(float %3822, float %3786), !dbg !171 + %3887 = tail call float @llvm.nvvm.div.full(float %3823, float %3785), !dbg !171 + %3888 = tail call float @llvm.nvvm.div.full(float %3824, float %3785), !dbg !171 + %3889 = tail call float @llvm.nvvm.div.full(float %3825, float %3786), !dbg !171 + %3890 = tail call float @llvm.nvvm.div.full(float %3826, float %3786), !dbg !171 + %3891 = tail call float @llvm.nvvm.div.full(float %3827, float %3785), !dbg !171 + %3892 = tail call float @llvm.nvvm.div.full(float %3828, float %3785), !dbg !171 + %3893 = tail call float @llvm.nvvm.div.full(float %3829, float %3786), !dbg !171 + %3894 = tail call float @llvm.nvvm.div.full(float %3830, float %3786), !dbg !171 + %3895 = tail call float @llvm.nvvm.div.full(float %3831, float %3785), !dbg !171 + %3896 = tail call float @llvm.nvvm.div.full(float %3832, float %3785), !dbg !171 + %3897 = tail call float @llvm.nvvm.div.full(float %3833, float %3786), !dbg !171 + %3898 = tail call float @llvm.nvvm.div.full(float %3834, float %3786), !dbg !171 + %3899 = tail call float @llvm.nvvm.div.full(float %3835, float %3785), !dbg !171 + %3900 = tail call float @llvm.nvvm.div.full(float %3836, float %3785), !dbg !171 + %3901 = tail call float @llvm.nvvm.div.full(float %3837, float %3786), !dbg !171 + %3902 = tail call float @llvm.nvvm.div.full(float %3838, float %3786), !dbg !171 + %3903 = tail call float @llvm.nvvm.div.full(float %3839, float %3785), !dbg !171 + %3904 = tail call float @llvm.nvvm.div.full(float %3840, float %3785), !dbg !171 + %3905 = tail call float @llvm.nvvm.div.full(float %3841, float %3786), !dbg !171 + %3906 = tail call float @llvm.nvvm.div.full(float %3842, float %3786), !dbg !171 + %3907 = tail call float @llvm.nvvm.div.full(float %3843, float %3785), !dbg !171 + %3908 = tail call float @llvm.nvvm.div.full(float %3844, float %3785), !dbg !171 + %3909 = tail call float @llvm.nvvm.div.full(float %3845, float %3786), !dbg !171 + %3910 = tail call float @llvm.nvvm.div.full(float %3846, float %3786), !dbg !171 + %3911 = tail call float @llvm.nvvm.div.full(float %3847, float %3785), !dbg !171 + %3912 = tail call float @llvm.nvvm.div.full(float %3848, float %3785), !dbg !171 + %3913 = tail call float @llvm.nvvm.div.full(float %3849, float %3786), !dbg !171 + %3914 = tail call float @llvm.nvvm.div.full(float %3850, float %3786), !dbg !171 + %3915 = or disjoint i32 %79, %21, !dbg !172 + %3916 = or disjoint i32 %3915, %20, !dbg !173 + %3917 = add i32 %3916, %54, !dbg !174 + %3918 = add i32 %3916, %55, !dbg !174 + %3919 = add i32 %3916, %56, !dbg !174 + %3920 = add i32 %3916, %57, !dbg !174 + %3921 = add i32 %3916, %58, !dbg !174 + %3922 = add i32 %3916, %59, !dbg !174 + %3923 = add i32 %3916, %60, !dbg !174 + %3924 = add i32 %3916, %61, !dbg !174 + %3925 = sext i32 %3917 to i64, !dbg !175 + %3926 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3925, !dbg !175 + %3927 = sext i32 %3918 to i64, !dbg !175 + %3928 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3927, !dbg !175 + %3929 = sext i32 %3919 to i64, !dbg !175 + %3930 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3929, !dbg !175 + %3931 = sext i32 %3920 to i64, !dbg !175 + %3932 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3931, !dbg !175 + %3933 = sext i32 %3921 to i64, !dbg !175 + %3934 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3933, !dbg !175 + %3935 = sext i32 %3922 to i64, !dbg !175 + %3936 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3935, !dbg !175 + %3937 = sext i32 %3923 to i64, !dbg !175 + %3938 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3937, !dbg !175 + %3939 = sext i32 %3924 to i64, !dbg !175 + %3940 = getelementptr bfloat, ptr addrspace(1) %10, i64 %3939, !dbg !175 + %3941 = insertelement <2 x float> poison, float %3851, i64 0, !dbg !176 + %3942 = insertelement <2 x float> %3941, float %3852, i64 1, !dbg !176 + %3943 = fptrunc <2 x float> %3942 to <2 x bfloat>, !dbg !176 + %3944 = insertelement <2 x float> poison, float %3853, i64 0, !dbg !176 + %3945 = insertelement <2 x float> %3944, float %3854, i64 1, !dbg !176 + %3946 = fptrunc <2 x float> %3945 to <2 x bfloat>, !dbg !176 + %3947 = insertelement <2 x float> poison, float %3855, i64 0, !dbg !176 + %3948 = insertelement <2 x float> %3947, float %3856, i64 1, !dbg !176 + %3949 = fptrunc <2 x float> %3948 to <2 x bfloat>, !dbg !176 + %3950 = insertelement <2 x float> poison, float %3857, i64 0, !dbg !176 + %3951 = insertelement <2 x float> %3950, float %3858, i64 1, !dbg !176 + %3952 = fptrunc <2 x float> %3951 to <2 x bfloat>, !dbg !176 + %3953 = insertelement <2 x float> poison, float %3859, i64 0, !dbg !176 + %3954 = insertelement <2 x float> %3953, float %3860, i64 1, !dbg !176 + %3955 = fptrunc <2 x float> %3954 to <2 x bfloat>, !dbg !176 + %3956 = insertelement <2 x float> poison, float %3861, i64 0, !dbg !176 + %3957 = insertelement <2 x float> %3956, float %3862, i64 1, !dbg !176 + %3958 = fptrunc <2 x float> %3957 to <2 x bfloat>, !dbg !176 + %3959 = insertelement <2 x float> poison, float %3863, i64 0, !dbg !176 + %3960 = insertelement <2 x float> %3959, float %3864, i64 1, !dbg !176 + %3961 = fptrunc <2 x float> %3960 to <2 x bfloat>, !dbg !176 + %3962 = insertelement <2 x float> poison, float %3865, i64 0, !dbg !176 + %3963 = insertelement <2 x float> %3962, float %3866, i64 1, !dbg !176 + %3964 = fptrunc <2 x float> %3963 to <2 x bfloat>, !dbg !176 + %3965 = insertelement <2 x float> poison, float %3867, i64 0, !dbg !176 + %3966 = insertelement <2 x float> %3965, float %3868, i64 1, !dbg !176 + %3967 = fptrunc <2 x float> %3966 to <2 x bfloat>, !dbg !176 + %3968 = insertelement <2 x float> poison, float %3869, i64 0, !dbg !176 + %3969 = insertelement <2 x float> %3968, float %3870, i64 1, !dbg !176 + %3970 = fptrunc <2 x float> %3969 to <2 x bfloat>, !dbg !176 + %3971 = insertelement <2 x float> poison, float %3871, i64 0, !dbg !176 + %3972 = insertelement <2 x float> %3971, float %3872, i64 1, !dbg !176 + %3973 = fptrunc <2 x float> %3972 to <2 x bfloat>, !dbg !176 + %3974 = insertelement <2 x float> poison, float %3873, i64 0, !dbg !176 + %3975 = insertelement <2 x float> %3974, float %3874, i64 1, !dbg !176 + %3976 = fptrunc <2 x float> %3975 to <2 x bfloat>, !dbg !176 + %3977 = insertelement <2 x float> poison, float %3875, i64 0, !dbg !176 + %3978 = insertelement <2 x float> %3977, float %3876, i64 1, !dbg !176 + %3979 = fptrunc <2 x float> %3978 to <2 x bfloat>, !dbg !176 + %3980 = insertelement <2 x float> poison, float %3877, i64 0, !dbg !176 + %3981 = insertelement <2 x float> %3980, float %3878, i64 1, !dbg !176 + %3982 = fptrunc <2 x float> %3981 to <2 x bfloat>, !dbg !176 + %3983 = insertelement <2 x float> poison, float %3879, i64 0, !dbg !176 + %3984 = insertelement <2 x float> %3983, float %3880, i64 1, !dbg !176 + %3985 = fptrunc <2 x float> %3984 to <2 x bfloat>, !dbg !176 + %3986 = insertelement <2 x float> poison, float %3881, i64 0, !dbg !176 + %3987 = insertelement <2 x float> %3986, float %3882, i64 1, !dbg !176 + %3988 = fptrunc <2 x float> %3987 to <2 x bfloat>, !dbg !176 + %3989 = insertelement <2 x float> poison, float %3883, i64 0, !dbg !176 + %3990 = insertelement <2 x float> %3989, float %3884, i64 1, !dbg !176 + %3991 = fptrunc <2 x float> %3990 to <2 x bfloat>, !dbg !176 + %3992 = insertelement <2 x float> poison, float %3885, i64 0, !dbg !176 + %3993 = insertelement <2 x float> %3992, float %3886, i64 1, !dbg !176 + %3994 = fptrunc <2 x float> %3993 to <2 x bfloat>, !dbg !176 + %3995 = insertelement <2 x float> poison, float %3887, i64 0, !dbg !176 + %3996 = insertelement <2 x float> %3995, float %3888, i64 1, !dbg !176 + %3997 = fptrunc <2 x float> %3996 to <2 x bfloat>, !dbg !176 + %3998 = insertelement <2 x float> poison, float %3889, i64 0, !dbg !176 + %3999 = insertelement <2 x float> %3998, float %3890, i64 1, !dbg !176 + %4000 = fptrunc <2 x float> %3999 to <2 x bfloat>, !dbg !176 + %4001 = insertelement <2 x float> poison, float %3891, i64 0, !dbg !176 + %4002 = insertelement <2 x float> %4001, float %3892, i64 1, !dbg !176 + %4003 = fptrunc <2 x float> %4002 to <2 x bfloat>, !dbg !176 + %4004 = insertelement <2 x float> poison, float %3893, i64 0, !dbg !176 + %4005 = insertelement <2 x float> %4004, float %3894, i64 1, !dbg !176 + %4006 = fptrunc <2 x float> %4005 to <2 x bfloat>, !dbg !176 + %4007 = insertelement <2 x float> poison, float %3895, i64 0, !dbg !176 + %4008 = insertelement <2 x float> %4007, float %3896, i64 1, !dbg !176 + %4009 = fptrunc <2 x float> %4008 to <2 x bfloat>, !dbg !176 + %4010 = insertelement <2 x float> poison, float %3897, i64 0, !dbg !176 + %4011 = insertelement <2 x float> %4010, float %3898, i64 1, !dbg !176 + %4012 = fptrunc <2 x float> %4011 to <2 x bfloat>, !dbg !176 + %4013 = insertelement <2 x float> poison, float %3899, i64 0, !dbg !176 + %4014 = insertelement <2 x float> %4013, float %3900, i64 1, !dbg !176 + %4015 = fptrunc <2 x float> %4014 to <2 x bfloat>, !dbg !176 + %4016 = insertelement <2 x float> poison, float %3901, i64 0, !dbg !176 + %4017 = insertelement <2 x float> %4016, float %3902, i64 1, !dbg !176 + %4018 = fptrunc <2 x float> %4017 to <2 x bfloat>, !dbg !176 + %4019 = insertelement <2 x float> poison, float %3903, i64 0, !dbg !176 + %4020 = insertelement <2 x float> %4019, float %3904, i64 1, !dbg !176 + %4021 = fptrunc <2 x float> %4020 to <2 x bfloat>, !dbg !176 + %4022 = insertelement <2 x float> poison, float %3905, i64 0, !dbg !176 + %4023 = insertelement <2 x float> %4022, float %3906, i64 1, !dbg !176 + %4024 = fptrunc <2 x float> %4023 to <2 x bfloat>, !dbg !176 + %4025 = insertelement <2 x float> poison, float %3907, i64 0, !dbg !176 + %4026 = insertelement <2 x float> %4025, float %3908, i64 1, !dbg !176 + %4027 = fptrunc <2 x float> %4026 to <2 x bfloat>, !dbg !176 + %4028 = insertelement <2 x float> poison, float %3909, i64 0, !dbg !176 + %4029 = insertelement <2 x float> %4028, float %3910, i64 1, !dbg !176 + %4030 = fptrunc <2 x float> %4029 to <2 x bfloat>, !dbg !176 + %4031 = insertelement <2 x float> poison, float %3911, i64 0, !dbg !176 + %4032 = insertelement <2 x float> %4031, float %3912, i64 1, !dbg !176 + %4033 = fptrunc <2 x float> %4032 to <2 x bfloat>, !dbg !176 + %4034 = insertelement <2 x float> poison, float %3913, i64 0, !dbg !176 + %4035 = insertelement <2 x float> %4034, float %3914, i64 1, !dbg !176 + %4036 = fptrunc <2 x float> %4035 to <2 x bfloat>, !dbg !176 + %4037 = shl nuw nsw i32 %198, 13, !dbg !176 + %4038 = shl nuw nsw i32 %34, 5, !dbg !176 + %4039 = and i32 %4038, 7264, !dbg !176 + %4040 = and i32 %34, 24, !dbg !176 + %4041 = shl nuw nsw i32 %4040, 4, !dbg !176 + %4042 = shl nuw nsw i32 %34, 2, !dbg !176 + %4043 = and i32 %4042, 16, !dbg !176 + %4044 = or disjoint i32 %4037, %4043, !dbg !176 + %4045 = or disjoint i32 %4039, %4041, !dbg !176 + %4046 = or disjoint i32 %4044, %4045, !dbg !176 + %4047 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4046, !dbg !176 + %4048 = bitcast <2 x bfloat> %3943 to i32, !dbg !176 + %4049 = bitcast <2 x bfloat> %3949 to i32, !dbg !176 + %4050 = bitcast <2 x bfloat> %3955 to i32, !dbg !176 + %4051 = bitcast <2 x bfloat> %3961 to i32, !dbg !176 + %4052 = insertelement <4 x i32> poison, i32 %4048, i64 0, !dbg !176 + %4053 = insertelement <4 x i32> %4052, i32 %4049, i64 1, !dbg !176 + %4054 = insertelement <4 x i32> %4053, i32 %4050, i64 2, !dbg !176 + %4055 = insertelement <4 x i32> %4054, i32 %4051, i64 3, !dbg !176 + store <4 x i32> %4055, ptr addrspace(3) %4047, align 16, !dbg !176 + %4056 = getelementptr inbounds nuw i8, ptr addrspace(3) %4047, i32 512, !dbg !176 + %4057 = bitcast <2 x bfloat> %3946 to i32, !dbg !176 + %4058 = bitcast <2 x bfloat> %3952 to i32, !dbg !176 + %4059 = bitcast <2 x bfloat> %3958 to i32, !dbg !176 + %4060 = bitcast <2 x bfloat> %3964 to i32, !dbg !176 + %4061 = insertelement <4 x i32> poison, i32 %4057, i64 0, !dbg !176 + %4062 = insertelement <4 x i32> %4061, i32 %4058, i64 1, !dbg !176 + %4063 = insertelement <4 x i32> %4062, i32 %4059, i64 2, !dbg !176 + %4064 = insertelement <4 x i32> %4063, i32 %4060, i64 3, !dbg !176 + store <4 x i32> %4064, ptr addrspace(3) %4056, align 16, !dbg !176 + %4065 = xor i32 %4046, 32, !dbg !176 + %4066 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4065, !dbg !176 + %4067 = bitcast <2 x bfloat> %3967 to i32, !dbg !176 + %4068 = bitcast <2 x bfloat> %3973 to i32, !dbg !176 + %4069 = bitcast <2 x bfloat> %3979 to i32, !dbg !176 + %4070 = bitcast <2 x bfloat> %3985 to i32, !dbg !176 + %4071 = insertelement <4 x i32> poison, i32 %4067, i64 0, !dbg !176 + %4072 = insertelement <4 x i32> %4071, i32 %4068, i64 1, !dbg !176 + %4073 = insertelement <4 x i32> %4072, i32 %4069, i64 2, !dbg !176 + %4074 = insertelement <4 x i32> %4073, i32 %4070, i64 3, !dbg !176 + store <4 x i32> %4074, ptr addrspace(3) %4066, align 16, !dbg !176 + %4075 = getelementptr inbounds nuw i8, ptr addrspace(3) %4066, i32 512, !dbg !176 + %4076 = bitcast <2 x bfloat> %3970 to i32, !dbg !176 + %4077 = bitcast <2 x bfloat> %3976 to i32, !dbg !176 + %4078 = bitcast <2 x bfloat> %3982 to i32, !dbg !176 + %4079 = bitcast <2 x bfloat> %3988 to i32, !dbg !176 + %4080 = insertelement <4 x i32> poison, i32 %4076, i64 0, !dbg !176 + %4081 = insertelement <4 x i32> %4080, i32 %4077, i64 1, !dbg !176 + %4082 = insertelement <4 x i32> %4081, i32 %4078, i64 2, !dbg !176 + %4083 = insertelement <4 x i32> %4082, i32 %4079, i64 3, !dbg !176 + store <4 x i32> %4083, ptr addrspace(3) %4075, align 16, !dbg !176 + %4084 = xor i32 %4046, 64, !dbg !176 + %4085 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4084, !dbg !176 + %4086 = bitcast <2 x bfloat> %3991 to i32, !dbg !176 + %4087 = bitcast <2 x bfloat> %3997 to i32, !dbg !176 + %4088 = bitcast <2 x bfloat> %4003 to i32, !dbg !176 + %4089 = bitcast <2 x bfloat> %4009 to i32, !dbg !176 + %4090 = insertelement <4 x i32> poison, i32 %4086, i64 0, !dbg !176 + %4091 = insertelement <4 x i32> %4090, i32 %4087, i64 1, !dbg !176 + %4092 = insertelement <4 x i32> %4091, i32 %4088, i64 2, !dbg !176 + %4093 = insertelement <4 x i32> %4092, i32 %4089, i64 3, !dbg !176 + store <4 x i32> %4093, ptr addrspace(3) %4085, align 16, !dbg !176 + %4094 = getelementptr inbounds nuw i8, ptr addrspace(3) %4085, i32 512, !dbg !176 + %4095 = bitcast <2 x bfloat> %3994 to i32, !dbg !176 + %4096 = bitcast <2 x bfloat> %4000 to i32, !dbg !176 + %4097 = bitcast <2 x bfloat> %4006 to i32, !dbg !176 + %4098 = bitcast <2 x bfloat> %4012 to i32, !dbg !176 + %4099 = insertelement <4 x i32> poison, i32 %4095, i64 0, !dbg !176 + %4100 = insertelement <4 x i32> %4099, i32 %4096, i64 1, !dbg !176 + %4101 = insertelement <4 x i32> %4100, i32 %4097, i64 2, !dbg !176 + %4102 = insertelement <4 x i32> %4101, i32 %4098, i64 3, !dbg !176 + store <4 x i32> %4102, ptr addrspace(3) %4094, align 16, !dbg !176 + %4103 = xor i32 %4046, 96, !dbg !176 + %4104 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4103, !dbg !176 + %4105 = bitcast <2 x bfloat> %4015 to i32, !dbg !176 + %4106 = bitcast <2 x bfloat> %4021 to i32, !dbg !176 + %4107 = bitcast <2 x bfloat> %4027 to i32, !dbg !176 + %4108 = bitcast <2 x bfloat> %4033 to i32, !dbg !176 + %4109 = insertelement <4 x i32> poison, i32 %4105, i64 0, !dbg !176 + %4110 = insertelement <4 x i32> %4109, i32 %4106, i64 1, !dbg !176 + %4111 = insertelement <4 x i32> %4110, i32 %4107, i64 2, !dbg !176 + %4112 = insertelement <4 x i32> %4111, i32 %4108, i64 3, !dbg !176 + store <4 x i32> %4112, ptr addrspace(3) %4104, align 16, !dbg !176 + %4113 = getelementptr inbounds nuw i8, ptr addrspace(3) %4104, i32 512, !dbg !176 + %4114 = bitcast <2 x bfloat> %4018 to i32, !dbg !176 + %4115 = bitcast <2 x bfloat> %4024 to i32, !dbg !176 + %4116 = bitcast <2 x bfloat> %4030 to i32, !dbg !176 + %4117 = bitcast <2 x bfloat> %4036 to i32, !dbg !176 + %4118 = insertelement <4 x i32> poison, i32 %4114, i64 0, !dbg !176 + %4119 = insertelement <4 x i32> %4118, i32 %4115, i64 1, !dbg !176 + %4120 = insertelement <4 x i32> %4119, i32 %4116, i64 2, !dbg !176 + %4121 = insertelement <4 x i32> %4120, i32 %4117, i64 3, !dbg !176 + store <4 x i32> %4121, ptr addrspace(3) %4113, align 16, !dbg !176 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !176 + %4122 = shl nuw nsw i32 %4040, 10, !dbg !176 + %4123 = shl nuw nsw i32 %198, 5, !dbg !176 + %4124 = and i32 %34, 252, !dbg !176 + %4125 = shl nuw nsw i32 %4124, 2, !dbg !176 + %4126 = or disjoint i32 %4122, %4123, !dbg !176 + %4127 = xor i32 %4126, %4125, !dbg !176 + %4128 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4127, !dbg !176 + %4129 = ptrtoint ptr addrspace(3) %4128 to i32, !dbg !176 + %4130 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4129) #2, !dbg !176 + %4131 = extractvalue { i32, i32, i32, i32 } %4130, 0, !dbg !176 + %4132 = extractvalue { i32, i32, i32, i32 } %4130, 1, !dbg !176 + %4133 = extractvalue { i32, i32, i32, i32 } %4130, 2, !dbg !176 + %4134 = extractvalue { i32, i32, i32, i32 } %4130, 3, !dbg !176 + %4135 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 1024, !dbg !176 + %4136 = ptrtoint ptr addrspace(3) %4135 to i32, !dbg !176 + %4137 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4136) #2, !dbg !176 + %4138 = extractvalue { i32, i32, i32, i32 } %4137, 0, !dbg !176 + %4139 = extractvalue { i32, i32, i32, i32 } %4137, 1, !dbg !176 + %4140 = extractvalue { i32, i32, i32, i32 } %4137, 2, !dbg !176 + %4141 = extractvalue { i32, i32, i32, i32 } %4137, 3, !dbg !176 + %4142 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 2048, !dbg !176 + %4143 = ptrtoint ptr addrspace(3) %4142 to i32, !dbg !176 + %4144 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4143) #2, !dbg !176 + %4145 = extractvalue { i32, i32, i32, i32 } %4144, 0, !dbg !176 + %4146 = extractvalue { i32, i32, i32, i32 } %4144, 1, !dbg !176 + %4147 = extractvalue { i32, i32, i32, i32 } %4144, 2, !dbg !176 + %4148 = extractvalue { i32, i32, i32, i32 } %4144, 3, !dbg !176 + %4149 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 3072, !dbg !176 + %4150 = ptrtoint ptr addrspace(3) %4149 to i32, !dbg !176 + %4151 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4150) #2, !dbg !176 + %4152 = extractvalue { i32, i32, i32, i32 } %4151, 0, !dbg !176 + %4153 = extractvalue { i32, i32, i32, i32 } %4151, 1, !dbg !176 + %4154 = extractvalue { i32, i32, i32, i32 } %4151, 2, !dbg !176 + %4155 = extractvalue { i32, i32, i32, i32 } %4151, 3, !dbg !176 + %4156 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 4096, !dbg !176 + %4157 = ptrtoint ptr addrspace(3) %4156 to i32, !dbg !176 + %4158 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4157) #2, !dbg !176 + %4159 = extractvalue { i32, i32, i32, i32 } %4158, 0, !dbg !176 + %4160 = extractvalue { i32, i32, i32, i32 } %4158, 1, !dbg !176 + %4161 = extractvalue { i32, i32, i32, i32 } %4158, 2, !dbg !176 + %4162 = extractvalue { i32, i32, i32, i32 } %4158, 3, !dbg !176 + %4163 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 5120, !dbg !176 + %4164 = ptrtoint ptr addrspace(3) %4163 to i32, !dbg !176 + %4165 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4164) #2, !dbg !176 + %4166 = extractvalue { i32, i32, i32, i32 } %4165, 0, !dbg !176 + %4167 = extractvalue { i32, i32, i32, i32 } %4165, 1, !dbg !176 + %4168 = extractvalue { i32, i32, i32, i32 } %4165, 2, !dbg !176 + %4169 = extractvalue { i32, i32, i32, i32 } %4165, 3, !dbg !176 + %4170 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 6144, !dbg !176 + %4171 = ptrtoint ptr addrspace(3) %4170 to i32, !dbg !176 + %4172 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4171) #2, !dbg !176 + %4173 = extractvalue { i32, i32, i32, i32 } %4172, 0, !dbg !176 + %4174 = extractvalue { i32, i32, i32, i32 } %4172, 1, !dbg !176 + %4175 = extractvalue { i32, i32, i32, i32 } %4172, 2, !dbg !176 + %4176 = extractvalue { i32, i32, i32, i32 } %4172, 3, !dbg !176 + %4177 = getelementptr inbounds nuw i8, ptr addrspace(3) %4128, i32 7168, !dbg !176 + %4178 = ptrtoint ptr addrspace(3) %4177 to i32, !dbg !176 + %4179 = tail call { i32, i32, i32, i32 } asm sideeffect "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {$0, $1, $2, $3}, [$4];", "=r,=r,=r,=r,r"(i32 %4178) #2, !dbg !176 + %4180 = extractvalue { i32, i32, i32, i32 } %4179, 0, !dbg !176 + %4181 = extractvalue { i32, i32, i32, i32 } %4179, 1, !dbg !176 + %4182 = extractvalue { i32, i32, i32, i32 } %4179, 2, !dbg !176 + %4183 = extractvalue { i32, i32, i32, i32 } %4179, 3, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4131, i32 %4132, i32 %4133, i32 %4134, ptr addrspace(1) %3926, i1 %89) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4138, i32 %4139, i32 %4140, i32 %4141, ptr addrspace(1) %3928, i1 %90) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4145, i32 %4146, i32 %4147, i32 %4148, ptr addrspace(1) %3930, i1 %91) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4152, i32 %4153, i32 %4154, i32 %4155, ptr addrspace(1) %3932, i1 %92) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4159, i32 %4160, i32 %4161, i32 %4162, ptr addrspace(1) %3934, i1 %93) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4166, i32 %4167, i32 %4168, i32 %4169, ptr addrspace(1) %3936, i1 %94) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4173, i32 %4174, i32 %4175, i32 %4176, ptr addrspace(1) %3938, i1 %95) #2, !dbg !176 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %4180, i32 %4181, i32 %4182, i32 %4183, ptr addrspace(1) %3940, i1 %96) #2, !dbg !176 + %4184 = fcmp olt float %3785, 0x3810000000000000, !dbg !177 + %4185 = fmul float %3785, 0x4160000000000000, !dbg !177 + %.02.i = select i1 %4184, float %4185, float %3785, !dbg !177 + %i.i.0.i = select i1 %4184, float -2.300000e+01, float 0.000000e+00, !dbg !177 + %4186 = bitcast float %.02.i to i32, !dbg !177 + %4187 = add i32 %4186, -1060439283, !dbg !177 + %4188 = and i32 %4187, -8388608, !dbg !177 + %4189 = sub i32 %4186, %4188, !dbg !177 + %4190 = bitcast i32 %4189 to float, !dbg !177 + %4191 = sitofp i32 %4188 to float, !dbg !177 + %4192 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not.i356 = icmp eq i32 %4192, 0, !dbg !177 + %4193 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4191, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !177 + %4194 = tail call float @llvm.nvvm.fma.rn.f(float %4191, float 0x3E80000000000000, float %i.i.0.i) #2, !dbg !177 + %.08.i = select i1 %.not.i356, float %4194, float %4193, !dbg !177 + %4195 = fadd float %4190, -1.000000e+00, !dbg !177 + %4196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not1.i = icmp eq i32 %4196, 0, !dbg !177 + %4197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4195, float 0xBFC58FE600000000) #2, !dbg !177 + %4198 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4195, float 0xBFC58FE600000000) #2, !dbg !177 + %.010.i = select i1 %.not1.i, float %4198, float %4197, !dbg !177 + %4199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not2.i = icmp eq i32 %4199, 0, !dbg !177 + %4200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %4195, float 0x3FC5F9E540000000) #2, !dbg !177 + %4201 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %4195, float 0x3FC5F9E540000000) #2, !dbg !177 + %.011.i = select i1 %.not2.i, float %4201, float %4200, !dbg !177 + %4202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not3.i = icmp eq i32 %4202, 0, !dbg !177 + %4203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %4195, float 0xBFC6E9C860000000) #2, !dbg !177 + %4204 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %4195, float 0xBFC6E9C860000000) #2, !dbg !177 + %.012.i = select i1 %.not3.i, float %4204, float %4203, !dbg !177 + %4205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not4.i = icmp eq i32 %4205, 0, !dbg !177 + %4206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %4195, float 0x3FCA417E80000000) #2, !dbg !177 + %4207 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %4195, float 0x3FCA417E80000000) #2, !dbg !177 + %.09.i = select i1 %.not4.i, float %4207, float %4206, !dbg !177 + %4208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not5.i = icmp eq i32 %4208, 0, !dbg !177 + %4209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %4195, float 0xBFCEC79160000000) #2, !dbg !177 + %4210 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %4195, float 0xBFCEC79160000000) #2, !dbg !177 + %.05.i = select i1 %.not5.i, float %4210, float %4209, !dbg !177 + %4211 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not6.i = icmp eq i32 %4211, 0, !dbg !177 + %4212 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %4195, float 0x3FD277F320000000) #2, !dbg !177 + %4213 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %4195, float 0x3FD277F320000000) #2, !dbg !177 + %.01.i = select i1 %.not6.i, float %4213, float %4212, !dbg !177 + %4214 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not7.i = icmp eq i32 %4214, 0, !dbg !177 + %4215 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %4195, float 0xBFD7154920000000) #2, !dbg !177 + %4216 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %4195, float 0xBFD7154920000000) #2, !dbg !177 + %.0.i357 = select i1 %.not7.i, float %4216, float %4215, !dbg !177 + %4217 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not8.i = icmp eq i32 %4217, 0, !dbg !177 + %4218 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i357, float %4195, float 0x3FDEC70940000000) #2, !dbg !177 + %4219 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i357, float %4195, float 0x3FDEC70940000000) #2, !dbg !177 + %.07.i = select i1 %.not8.i, float %4219, float %4218, !dbg !177 + %4220 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not9.i = icmp eq i32 %4220, 0, !dbg !177 + %4221 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %4195, float 0xBFE7154760000000) #2, !dbg !177 + %4222 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %4195, float 0xBFE7154760000000) #2, !dbg !177 + %.06.i = select i1 %.not9.i, float %4222, float %4221, !dbg !177 + %4223 = fmul float %4195, %.06.i, !dbg !177 + %4224 = fmul float %4195, %4223, !dbg !177 + %4225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not10.i = icmp eq i32 %4225, 0, !dbg !177 + %4226 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4195, float 0x3FF7154760000000, float %4224) #2, !dbg !177 + %4227 = tail call float @llvm.nvvm.fma.rn.f(float %4195, float 0x3FF7154760000000, float %4224) #2, !dbg !177 + %.04.i = select i1 %.not10.i, float %4227, float %4226, !dbg !177 + %4228 = fadd float %.08.i, %.04.i, !dbg !177 + %4229 = icmp ugt i32 %4186, 2139095039, !dbg !177 + br i1 %4229, label %__nv_fmaf_rn.exit.i.i, label %__nv_log2f.exit, !dbg !177 + +__nv_fmaf_rn.exit.i.i: ; preds = %._crit_edge461 + %4230 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not11.i = icmp eq i32 %4230, 0, !dbg !177 + %4231 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %4232 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %.03.i = select i1 %.not11.i, float %4232, float %4231, !dbg !177 + br label %__nv_log2f.exit, !dbg !177 + +__nv_log2f.exit: ; preds = %._crit_edge461, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %4228, %._crit_edge461 ], !dbg !177 + %4233 = fcmp olt float %3786, 0x3810000000000000, !dbg !177 + %4234 = fmul float %3786, 0x4160000000000000, !dbg !177 + %.02.i358 = select i1 %4233, float %4234, float %3786, !dbg !177 + %i.i.0.i359 = select i1 %4233, float -2.300000e+01, float 0.000000e+00, !dbg !177 + %4235 = bitcast float %.02.i358 to i32, !dbg !177 + %4236 = add i32 %4235, -1060439283, !dbg !177 + %4237 = and i32 %4236, -8388608, !dbg !177 + %4238 = sub i32 %4235, %4237, !dbg !177 + %4239 = bitcast i32 %4238 to float, !dbg !177 + %4240 = sitofp i32 %4237 to float, !dbg !177 + %4241 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not.i360 = icmp eq i32 %4241, 0, !dbg !177 + %4242 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4240, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !177 + %4243 = tail call float @llvm.nvvm.fma.rn.f(float %4240, float 0x3E80000000000000, float %i.i.0.i359) #2, !dbg !177 + %.08.i361 = select i1 %.not.i360, float %4243, float %4242, !dbg !177 + %4244 = fadd float %4239, -1.000000e+00, !dbg !177 + %4245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not1.i362 = icmp eq i32 %4245, 0, !dbg !177 + %4246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0x3FB8D64FE0000000, float %4244, float 0xBFC58FE600000000) #2, !dbg !177 + %4247 = tail call float @llvm.nvvm.fma.rn.f(float 0x3FB8D64FE0000000, float %4244, float 0xBFC58FE600000000) #2, !dbg !177 + %.010.i363 = select i1 %.not1.i362, float %4247, float %4246, !dbg !177 + %4248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not2.i364 = icmp eq i32 %4248, 0, !dbg !177 + %4249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i363, float %4244, float 0x3FC5F9E540000000) #2, !dbg !177 + %4250 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i363, float %4244, float 0x3FC5F9E540000000) #2, !dbg !177 + %.011.i365 = select i1 %.not2.i364, float %4250, float %4249, !dbg !177 + %4251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not3.i366 = icmp eq i32 %4251, 0, !dbg !177 + %4252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i365, float %4244, float 0xBFC6E9C860000000) #2, !dbg !177 + %4253 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i365, float %4244, float 0xBFC6E9C860000000) #2, !dbg !177 + %.012.i367 = select i1 %.not3.i366, float %4253, float %4252, !dbg !177 + %4254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not4.i368 = icmp eq i32 %4254, 0, !dbg !177 + %4255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i367, float %4244, float 0x3FCA417E80000000) #2, !dbg !177 + %4256 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i367, float %4244, float 0x3FCA417E80000000) #2, !dbg !177 + %.09.i369 = select i1 %.not4.i368, float %4256, float %4255, !dbg !177 + %4257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not5.i370 = icmp eq i32 %4257, 0, !dbg !177 + %4258 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i369, float %4244, float 0xBFCEC79160000000) #2, !dbg !177 + %4259 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i369, float %4244, float 0xBFCEC79160000000) #2, !dbg !177 + %.05.i371 = select i1 %.not5.i370, float %4259, float %4258, !dbg !177 + %4260 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not6.i372 = icmp eq i32 %4260, 0, !dbg !177 + %4261 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i371, float %4244, float 0x3FD277F320000000) #2, !dbg !177 + %4262 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i371, float %4244, float 0x3FD277F320000000) #2, !dbg !177 + %.01.i373 = select i1 %.not6.i372, float %4262, float %4261, !dbg !177 + %4263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not7.i374 = icmp eq i32 %4263, 0, !dbg !177 + %4264 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i373, float %4244, float 0xBFD7154920000000) #2, !dbg !177 + %4265 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i373, float %4244, float 0xBFD7154920000000) #2, !dbg !177 + %.0.i375 = select i1 %.not7.i374, float %4265, float %4264, !dbg !177 + %4266 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not8.i376 = icmp eq i32 %4266, 0, !dbg !177 + %4267 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i375, float %4244, float 0x3FDEC70940000000) #2, !dbg !177 + %4268 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i375, float %4244, float 0x3FDEC70940000000) #2, !dbg !177 + %.07.i377 = select i1 %.not8.i376, float %4268, float %4267, !dbg !177 + %4269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not9.i378 = icmp eq i32 %4269, 0, !dbg !177 + %4270 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i377, float %4244, float 0xBFE7154760000000) #2, !dbg !177 + %4271 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i377, float %4244, float 0xBFE7154760000000) #2, !dbg !177 + %.06.i379 = select i1 %.not9.i378, float %4271, float %4270, !dbg !177 + %4272 = fmul float %4244, %.06.i379, !dbg !177 + %4273 = fmul float %4244, %4272, !dbg !177 + %4274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not10.i380 = icmp eq i32 %4274, 0, !dbg !177 + %4275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %4244, float 0x3FF7154760000000, float %4273) #2, !dbg !177 + %4276 = tail call float @llvm.nvvm.fma.rn.f(float %4244, float 0x3FF7154760000000, float %4273) #2, !dbg !177 + %.04.i381 = select i1 %.not10.i380, float %4276, float %4275, !dbg !177 + %4277 = fadd float %.08.i361, %.04.i381, !dbg !177 + %4278 = icmp ugt i32 %4235, 2139095039, !dbg !177 + br i1 %4278, label %__nv_fmaf_rn.exit.i.i384, label %__nv_log2f.exit387, !dbg !177 + +__nv_fmaf_rn.exit.i.i384: ; preds = %__nv_log2f.exit + %4279 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #2, !dbg !177 + %.not11.i385 = icmp eq i32 %4279, 0, !dbg !177 + %4280 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %4281 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i358, float 0x7FF0000000000000, float 0x7FF0000000000000) #2, !dbg !177 + %.03.i386 = select i1 %.not11.i385, float %4281, float %4280, !dbg !177 + br label %__nv_log2f.exit387, !dbg !177 + +__nv_log2f.exit387: ; preds = %__nv_log2f.exit, %__nv_fmaf_rn.exit.i.i384 + %r.i.0.i382 = phi float [ %.03.i386, %__nv_fmaf_rn.exit.i.i384 ], [ %4277, %__nv_log2f.exit ], !dbg !177 + %4282 = insertelement <2 x float> poison, float %.02.i, i64 0, !dbg !177 + %4283 = insertelement <2 x float> %4282, float %.02.i358, i64 1, !dbg !177 + %4284 = fcmp oeq <2 x float> %4283, zeroinitializer, !dbg !177 + %4285 = shl nuw i32 %17, 16, !dbg !178 + %4286 = shl nuw nsw i32 %18, 11, !dbg !178 + %4287 = add i32 %4285, %4286, !dbg !178 + %4288 = sext i32 %4287 to i64, !dbg !179 + %4289 = getelementptr float, ptr addrspace(1) %3, i64 %4288, !dbg !179 + %4290 = and i32 %34, 127, !dbg !25 + %4291 = or disjoint i32 %33, %4290, !dbg !26 + %4292 = sext i32 %4291 to i64, !dbg !180 + %4293 = getelementptr float, ptr addrspace(1) %4289, i64 %4292, !dbg !180 + %4294 = insertelement <2 x float> poison, float %r.i.0.i, i64 0, !dbg !177 + %4295 = insertelement <2 x float> %4294, float %r.i.0.i382, i64 1, !dbg !177 + %4296 = select <2 x i1> %4284, <2 x float> splat (float 0xFFF0000000000000), <2 x float> %4295, !dbg !177 + %4297 = fadd <2 x float> %3781, %4296, !dbg !181 + %4298 = icmp slt i32 %4291, 2048, !dbg !182 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !183 + %4299 = shl nuw nsw i32 %4124, 1, !dbg !183 + %4300 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4299, !dbg !183 + store <2 x float> %4297, ptr addrspace(3) %4300, align 8, !dbg !183 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !183 + %4301 = shl nuw nsw i32 %137, 3, !dbg !183 + %4302 = shl nuw nsw i32 %140, 2, !dbg !183 + %4303 = lshr exact i32 %141, 1, !dbg !183 + %4304 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %4301, !dbg !183 + %4305 = getelementptr inbounds nuw i8, ptr addrspace(3) %4304, i32 %4302, !dbg !183 + %4306 = getelementptr inbounds nuw i8, ptr addrspace(3) %4305, i32 %4303, !dbg !183 + %4307 = load i32, ptr addrspace(3) %4306, align 4, !dbg !183 + %4308 = and i32 %34, 128, !dbg !183 + %4309 = icmp eq i32 %4308, 0, !dbg !183 + %4310 = and i1 %4309, %4298, !dbg !183 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %4307, ptr addrspace(1) %4293, i1 %4310) #2, !dbg !183 + ret void, !dbg !184 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smax.i32(i32, i32) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.smin.i32(i32, i32) #1 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.commit.group() #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nounwind +declare void @llvm.nvvm.cp.async.wait.group(i32 immarg) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.div.full(float, float) #4 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32) #5 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.fence.sync.aligned() #6 + +; Function Attrs: convergent nounwind +declare void @llvm.nvvm.wgmma.commit_group.sync.aligned() #6 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #5 + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #7 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.f(float) #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #8 + +attributes #0 = { nounwind "nvvm.reqntid"="256" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nounwind } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #5 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #6 = { convergent nounwind } +attributes #7 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #8 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = distinct !DISubprogram(name: "triton_tem_fused_0", linkageName: "triton_tem_fused_0", scope: !1, file: !1, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 97, column: 28, scope: !5) +!9 = !DILocation(line: 98, column: 27, scope: !5) +!10 = !DILocation(line: 99, column: 27, scope: !5) +!11 = !DILocation(line: 103, column: 23, scope: !5) +!12 = !DILocation(line: 107, column: 24, scope: !5) +!13 = !DILocation(line: 107, column: 45, scope: !5) +!14 = !DILocation(line: 107, column: 36, scope: !5) +!15 = !DILocation(line: 108, column: 25, scope: !5) +!16 = !DILocation(line: 108, column: 47, scope: !5) +!17 = !DILocation(line: 108, column: 37, scope: !5) +!18 = !DILocation(line: 111, column: 12, scope: !5) +!19 = !DILocation(line: 112, column: 12, scope: !5) +!20 = !DILocation(line: 113, column: 12, scope: !5) +!21 = !DILocation(line: 142, column: 51, scope: !5) +!22 = !DILocation(line: 142, column: 74, scope: !5) +!23 = !DILocation(line: 143, column: 64, scope: !5) +!24 = !DILocation(line: 144, column: 23, scope: !5) +!25 = !DILocation(line: 144, column: 46, scope: !5) +!26 = !DILocation(line: 144, column: 33, scope: !5) +!27 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !29) +!28 = distinct !DILexicalBlockFile(scope: !5, file: !1, discriminator: 0) +!29 = !DILocation(line: 146, column: 101, scope: !5) +!30 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !29) +!31 = !DILocation(line: 284, column: 56, scope: !28, inlinedAt: !29) +!32 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !29) +!33 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !29) +!34 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !29) +!35 = !DILocation(line: 151, column: 26, scope: !5) +!36 = !DILocation(line: 152, column: 23, scope: !5) +!37 = !DILocation(line: 152, column: 37, scope: !5) +!38 = !DILocation(line: 153, column: 42, scope: !5) +!39 = !DILocation(line: 153, column: 28, scope: !5) +!40 = !DILocation(line: 154, column: 45, scope: !5) +!41 = !DILocation(line: 41, column: 22, scope: !42, inlinedAt: !44) +!42 = distinct !DILexicalBlockFile(scope: !5, file: !43, discriminator: 0) +!43 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!44 = !DILocation(line: 154, column: 92, scope: !5) +!45 = !DILocation(line: 41, column: 28, scope: !42, inlinedAt: !44) +!46 = !DILocation(line: 154, column: 102, scope: !5) +!47 = !DILocation(line: 154, column: 65, scope: !5) +!48 = !DILocation(line: 159, column: 37, scope: !5) +!49 = !DILocation(line: 257, column: 21, scope: !28, inlinedAt: !50) +!50 = !DILocation(line: 172, column: 41, scope: !5) +!51 = !DILocation(line: 376, column: 33, scope: !28, inlinedAt: !50) +!52 = !DILocation(line: 502, column: 40, scope: !28, inlinedAt: !50) +!53 = !DILocation(line: 376, column: 23, scope: !28, inlinedAt: !50) +!54 = !DILocation(line: 378, column: 23, scope: !28, inlinedAt: !50) +!55 = !DILocation(line: 379, column: 23, scope: !28, inlinedAt: !50) +!56 = !DILocation(line: 346, column: 35, scope: !28, inlinedAt: !50) +!57 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !50) +!58 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !50) +!59 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !50) +!60 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !50) +!61 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !50) +!62 = !DILocation(line: 342, column: 32, scope: !28, inlinedAt: !50) +!63 = !DILocation(line: 351, column: 19, scope: !28, inlinedAt: !50) +!64 = !DILocation(line: 159, column: 24, scope: !5) +!65 = !DILocation(line: 395, column: 24, scope: !28, inlinedAt: !50) +!66 = !DILocation(line: 353, column: 14, scope: !28, inlinedAt: !50) +!67 = !DILocation(line: 367, column: 44, scope: !28, inlinedAt: !50) +!68 = !DILocation(line: 393, column: 39, scope: !28, inlinedAt: !50) +!69 = !DILocation(line: 373, column: 23, scope: !28, inlinedAt: !50) +!70 = !DILocation(line: 381, column: 23, scope: !28, inlinedAt: !50) +!71 = !DILocation(line: 384, column: 24, scope: !28, inlinedAt: !50) +!72 = !DILocation(line: 394, column: 25, scope: !28, inlinedAt: !50) +!73 = !DILocation(line: 396, column: 24, scope: !28, inlinedAt: !50) +!74 = !DILocation(line: 397, column: 23, scope: !28, inlinedAt: !50) +!75 = !DILocation(line: 404, column: 39, scope: !28, inlinedAt: !50) +!76 = !DILocation(line: 405, column: 25, scope: !28, inlinedAt: !50) +!77 = !DILocation(line: 406, column: 24, scope: !28, inlinedAt: !50) +!78 = !DILocation(line: 407, column: 24, scope: !28, inlinedAt: !50) +!79 = !DILocation(line: 412, column: 73, scope: !28, inlinedAt: !50) +!80 = !DILocation(line: 417, column: 27, scope: !28, inlinedAt: !50) +!81 = !DILocation(line: 414, column: 69, scope: !28, inlinedAt: !50) +!82 = !DILocation(line: 168, column: 27, scope: !42, inlinedAt: !50) +!83 = !DILocation(line: 189, column: 40, scope: !42, inlinedAt: !50) +!84 = !DILocation(line: 421, column: 27, scope: !28, inlinedAt: !50) +!85 = !DILocation(line: 424, column: 51, scope: !28, inlinedAt: !50) +!86 = !DILocation(line: 423, column: 35, scope: !28, inlinedAt: !50) +!87 = !DILocation(line: 428, column: 31, scope: !28, inlinedAt: !50) +!88 = !DILocation(line: 428, column: 25, scope: !28, inlinedAt: !50) +!89 = !DILocation(line: 429, column: 39, scope: !28, inlinedAt: !50) +!90 = !DILocation(line: 429, column: 21, scope: !28, inlinedAt: !50) +!91 = !DILocation(line: 434, column: 16, scope: !28, inlinedAt: !50) +!92 = !DILocation(line: 261, column: 15, scope: !42, inlinedAt: !50) +!93 = !DILocation(line: 291, column: 36, scope: !42, inlinedAt: !50) +!94 = !DILocation(line: 434, column: 24, scope: !28, inlinedAt: !50) +!95 = !DILocation(line: 436, column: 16, scope: !28, inlinedAt: !50) +!96 = !DILocation(line: 440, column: 22, scope: !28, inlinedAt: !50) +!97 = !DILocation(line: 440, column: 44, scope: !28, inlinedAt: !50) +!98 = !DILocation(line: 548, column: 26, scope: !28, inlinedAt: !50) +!99 = !DILocation(line: 247, column: 33, scope: !28, inlinedAt: !50) +!100 = !DILocation(line: 248, column: 38, scope: !28, inlinedAt: !50) +!101 = !DILocation(line: 248, column: 24, scope: !28, inlinedAt: !50) +!102 = !DILocation(line: 249, column: 109, scope: !28, inlinedAt: !50) +!103 = !DILocation(line: 249, column: 113, scope: !28, inlinedAt: !50) +!104 = !DILocation(line: 249, column: 55, scope: !28, inlinedAt: !50) +!105 = !DILocation(line: 249, column: 25, scope: !28, inlinedAt: !50) +!106 = !DILocation(line: 250, column: 35, scope: !28, inlinedAt: !50) +!107 = !DILocation(line: 251, column: 34, scope: !28, inlinedAt: !50) +!108 = !DILocation(line: 251, column: 48, scope: !28, inlinedAt: !50) +!109 = !DILocation(line: 251, column: 63, scope: !28, inlinedAt: !50) +!110 = !DILocation(line: 252, column: 29, scope: !28, inlinedAt: !50) +!111 = !DILocation(line: 252, column: 61, scope: !28, inlinedAt: !50) +!112 = !DILocation(line: 252, column: 42, scope: !28, inlinedAt: !50) +!113 = !DILocation(line: 549, column: 21, scope: !28, inlinedAt: !50) +!114 = !DILocation(line: 136, column: 19, scope: !5) +!115 = !DILocation(line: 181, column: 35, scope: !5) +!116 = !DILocation(line: 182, column: 27, scope: !5) +!117 = !DILocation(line: 182, column: 41, scope: !5) +!118 = !DILocation(line: 183, column: 51, scope: !5) +!119 = !DILocation(line: 183, column: 32, scope: !5) +!120 = !DILocation(line: 184, column: 49, scope: !5) +!121 = !DILocation(line: 184, column: 69, scope: !5) +!122 = !DILocation(line: 502, column: 40, scope: !28, inlinedAt: !123) +!123 = !DILocation(line: 198, column: 45, scope: !5) +!124 = !DILocation(line: 346, column: 35, scope: !28, inlinedAt: !123) +!125 = !DILocation(line: 284, column: 38, scope: !28, inlinedAt: !123) +!126 = !DILocation(line: 284, column: 20, scope: !28, inlinedAt: !123) +!127 = !DILocation(line: 284, column: 49, scope: !28, inlinedAt: !123) +!128 = !DILocation(line: 292, column: 52, scope: !28, inlinedAt: !123) +!129 = !DILocation(line: 292, column: 23, scope: !28, inlinedAt: !123) +!130 = !DILocation(line: 342, column: 32, scope: !28, inlinedAt: !123) +!131 = !DILocation(line: 351, column: 19, scope: !28, inlinedAt: !123) +!132 = !DILocation(line: 186, column: 28, scope: !5) +!133 = !DILocation(line: 353, column: 14, scope: !28, inlinedAt: !123) +!134 = !DILocation(line: 367, column: 44, scope: !28, inlinedAt: !123) +!135 = !DILocation(line: 417, column: 27, scope: !28, inlinedAt: !123) +!136 = !DILocation(line: 367, column: 69, scope: !28, inlinedAt: !123) +!137 = !DILocation(line: 168, column: 27, scope: !42, inlinedAt: !123) +!138 = !DILocation(line: 189, column: 40, scope: !42, inlinedAt: !123) +!139 = !DILocation(line: 421, column: 27, scope: !28, inlinedAt: !123) +!140 = !DILocation(line: 424, column: 51, scope: !28, inlinedAt: !123) +!141 = !DILocation(line: 423, column: 35, scope: !28, inlinedAt: !123) +!142 = !DILocation(line: 428, column: 31, scope: !28, inlinedAt: !123) +!143 = !DILocation(line: 428, column: 25, scope: !28, inlinedAt: !123) +!144 = !DILocation(line: 429, column: 39, scope: !28, inlinedAt: !123) +!145 = !DILocation(line: 429, column: 21, scope: !28, inlinedAt: !123) +!146 = !DILocation(line: 434, column: 16, scope: !28, inlinedAt: !123) +!147 = !DILocation(line: 261, column: 15, scope: !42, inlinedAt: !123) +!148 = !DILocation(line: 291, column: 36, scope: !42, inlinedAt: !123) +!149 = !DILocation(line: 434, column: 24, scope: !28, inlinedAt: !123) +!150 = !DILocation(line: 436, column: 16, scope: !28, inlinedAt: !123) +!151 = !DILocation(line: 440, column: 22, scope: !28, inlinedAt: !123) +!152 = !DILocation(line: 440, column: 44, scope: !28, inlinedAt: !123) +!153 = !DILocation(line: 548, column: 26, scope: !28, inlinedAt: !123) +!154 = !DILocation(line: 247, column: 33, scope: !28, inlinedAt: !123) +!155 = !DILocation(line: 248, column: 38, scope: !28, inlinedAt: !123) +!156 = !DILocation(line: 248, column: 24, scope: !28, inlinedAt: !123) +!157 = !DILocation(line: 249, column: 109, scope: !28, inlinedAt: !123) +!158 = !DILocation(line: 249, column: 113, scope: !28, inlinedAt: !123) +!159 = !DILocation(line: 249, column: 55, scope: !28, inlinedAt: !123) +!160 = !DILocation(line: 249, column: 25, scope: !28, inlinedAt: !123) +!161 = !DILocation(line: 250, column: 35, scope: !28, inlinedAt: !123) +!162 = !DILocation(line: 251, column: 34, scope: !28, inlinedAt: !123) +!163 = !DILocation(line: 251, column: 48, scope: !28, inlinedAt: !123) +!164 = !DILocation(line: 251, column: 63, scope: !28, inlinedAt: !123) +!165 = !DILocation(line: 252, column: 29, scope: !28, inlinedAt: !123) +!166 = !DILocation(line: 252, column: 61, scope: !28, inlinedAt: !123) +!167 = !DILocation(line: 252, column: 42, scope: !28, inlinedAt: !123) +!168 = !DILocation(line: 549, column: 21, scope: !28, inlinedAt: !123) +!169 = !DILocation(line: 206, column: 26, scope: !5) +!170 = !DILocation(line: 206, column: 34, scope: !5) +!171 = !DILocation(line: 208, column: 16, scope: !5) +!172 = !DILocation(line: 218, column: 49, scope: !5) +!173 = !DILocation(line: 218, column: 62, scope: !5) +!174 = !DILocation(line: 218, column: 75, scope: !5) +!175 = !DILocation(line: 218, column: 25, scope: !5) +!176 = !DILocation(line: 218, column: 109, scope: !5) +!177 = !DILocation(line: 223, column: 33, scope: !5) +!178 = !DILocation(line: 222, column: 32, scope: !5) +!179 = !DILocation(line: 222, column: 23, scope: !5) +!180 = !DILocation(line: 222, column: 40, scope: !5) +!181 = !DILocation(line: 223, column: 20, scope: !5) +!182 = !DILocation(line: 227, column: 48, scope: !5) +!183 = !DILocation(line: 227, column: 29, scope: !5) +!184 = !DILocation(line: 229, column: 4, scope: !5) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b88d28277c922cd58513862769dda590c2100798 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ptx @@ -0,0 +1,3420 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_tem_fused_0 // -- Begin function triton_tem_fused_0 +.extern .shared .align 16 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90}; + // @triton_tem_fused_0 +.visible .entry triton_tem_fused_0( + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_0, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_1, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_2, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_3, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_4, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_5, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_6, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_7, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_8, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_9, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_10, + .param .u32 triton_tem_fused_0_param_11, + .param .u32 triton_tem_fused_0_param_12, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_13, + .param .u64 .ptr .global .align 1 triton_tem_fused_0_param_14 +) +.reqntid 256 +{ + .reg .pred %p<387>; + .reg .b16 %rs<70>; + .reg .b32 %r<5064>; + .reg .b64 %rd<293>; + .loc 1 18 0 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:18:0 + +// %bb.0: + ld.param.b32 %r712, [triton_tem_fused_0_param_11]; + ld.param.b64 %rd37, [triton_tem_fused_0_param_8]; + ld.param.b64 %rd36, [triton_tem_fused_0_param_7]; + ld.param.b64 %rd68, [triton_tem_fused_0_param_0]; +$L__tmp0: + .loc 1 97 28 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:97:28 + mov.u32 %r812, %ctaid.x; + ld.param.b64 %rd69, [triton_tem_fused_0_param_1]; + .loc 1 98 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:98:27 + mov.u32 %r1, %ctaid.y; + ld.param.b64 %rd70, [triton_tem_fused_0_param_2]; + .loc 1 99 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:99:27 + mov.u32 %r2, %ctaid.z; + .loc 1 103 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:103:23 + and.b32 %r813, %r1, 7; + .loc 1 107 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:107:24 + shl.b32 %r3, %r1, 23; + ld.param.b64 %rd71, [triton_tem_fused_0_param_5]; + .loc 1 107 45 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:107:45 + shl.b32 %r4, %r2, 7; + ld.param.b64 %rd72, [triton_tem_fused_0_param_6]; + .loc 1 107 36 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:107:36 + or.b32 %r814, %r3, %r4; + .loc 1 108 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:108:25 + shl.b32 %r815, %r813, 10; + .loc 1 108 47 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:108:47 + shl.b32 %r816, %r2, 5; + ld.param.b64 %rd73, [triton_tem_fused_0_param_9]; + and.b32 %r817, %r816, 2097024; + .loc 1 108 37 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:108:37 + add.s32 %r818, %r815, %r817; + mul.lo.s32 %r819, %r818, %r712; + ld.param.b32 %r820, [triton_tem_fused_0_param_12]; + .loc 1 111 12 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:111:12 + mad.wide.s32 %rd74, %r814, 2, %rd68; + .loc 1 112 12 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:112:12 + mul.wide.s32 %rd75, %r819, 2; + add.s64 %rd1, %rd69, %rd75; + .loc 1 113 12 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:113:12 + add.s64 %rd2, %rd70, %rd75; + .loc 1 142 51 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:142:51 + shl.b32 %r821, %r813, 4; + .loc 1 142 74 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:142:74 + add.s32 %r822, %r821, %r812; + .loc 1 143 64 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:143:64 + mul.lo.s32 %r823, %r822, %r820; + .loc 1 144 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:144:23 + shl.b32 %r5, %r812, 7; + .loc 1 144 46 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:144:46 + mov.u32 %r6, %tid.x; + shr.u32 %r7, %r6, 5; + and.b32 %r824, %r6, 240; + bfe.u32 %r8, %r6, 4, 4; + or.b32 %r9, %r8, 16; + or.b32 %r10, %r8, 32; + or.b32 %r11, %r8, 48; + .loc 1 144 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:144:33 + or.b32 %r12, %r8, %r5; + or.b32 %r13, %r9, %r5; + or.b32 %r14, %r10, %r5; + or.b32 %r15, %r11, %r5; + or.b32 %r16, %r12, 64; + or.b32 %r17, %r12, 80; + or.b32 %r18, %r12, 96; + or.b32 %r19, %r12, 112; +$L__tmp1: + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + shl.b32 %r20, %r12, 12; + shl.b32 %r21, %r13, 12; + shl.b32 %r22, %r14, 12; + shl.b32 %r23, %r15, 12; + shl.b32 %r24, %r16, 12; + shl.b32 %r25, %r17, 12; + shl.b32 %r26, %r18, 12; + shl.b32 %r27, %r19, 12; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + mad.wide.s32 %rd76, %r20, 2, %rd74; + mad.wide.s32 %rd77, %r21, 2, %rd74; + mad.wide.s32 %rd78, %r22, 2, %rd74; + mad.wide.s32 %rd79, %r23, 2, %rd74; + mad.wide.s32 %rd80, %r24, 2, %rd74; + mad.wide.s32 %rd81, %r25, 2, %rd74; + mad.wide.s32 %rd82, %r26, 2, %rd74; + mad.wide.s32 %rd83, %r27, 2, %rd74; + .loc 1 284 56 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:56 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + shl.b32 %r829, %r6, 3; + and.b32 %r830, %r829, 120; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + cvt.u64.u32 %rd3, %r830; + mul.wide.u32 %rd84, %r830, 2; + add.s64 %rd39, %rd76, %rd84; + add.s64 %rd40, %rd77, %rd84; + add.s64 %rd41, %rd78, %rd84; + add.s64 %rd42, %rd79, %rd84; + add.s64 %rd43, %rd80, %rd84; + add.s64 %rd44, %rd81, %rd84; + add.s64 %rd45, %rd82, %rd84; + add.s64 %rd46, %rd83, %rd84; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + setp.lt.s32 %p1, %r12, 2048; + setp.lt.s32 %p2, %r13, 2048; + setp.lt.s32 %p3, %r14, 2048; + setp.lt.s32 %p4, %r15, 2048; + setp.lt.s32 %p5, %r16, 2048; + setp.lt.s32 %p6, %r17, 2048; + setp.lt.s32 %p7, %r18, 2048; + setp.lt.s32 %p8, %r19, 2048; + mov.b32 %r4826, 0; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:146:101 ] + // begin inline asm + mov.u32 %r713, %r4826; + mov.u32 %r714, %r4826; + mov.u32 %r715, %r4826; + mov.u32 %r716, %r4826; + @%p1 ld.global.v4.b32 { %r713, %r714, %r715, %r716 }, [ %rd39 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r721, %r4826; + mov.u32 %r722, %r4826; + mov.u32 %r723, %r4826; + mov.u32 %r724, %r4826; + @%p2 ld.global.v4.b32 { %r721, %r722, %r723, %r724 }, [ %rd40 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r729, %r4826; + mov.u32 %r730, %r4826; + mov.u32 %r731, %r4826; + mov.u32 %r732, %r4826; + @%p3 ld.global.v4.b32 { %r729, %r730, %r731, %r732 }, [ %rd41 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r737, %r4826; + mov.u32 %r738, %r4826; + mov.u32 %r739, %r4826; + mov.u32 %r740, %r4826; + @%p4 ld.global.v4.b32 { %r737, %r738, %r739, %r740 }, [ %rd42 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r745, %r4826; + mov.u32 %r746, %r4826; + mov.u32 %r747, %r4826; + mov.u32 %r748, %r4826; + @%p5 ld.global.v4.b32 { %r745, %r746, %r747, %r748 }, [ %rd43 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r753, %r4826; + mov.u32 %r754, %r4826; + mov.u32 %r755, %r4826; + mov.u32 %r756, %r4826; + @%p6 ld.global.v4.b32 { %r753, %r754, %r755, %r756 }, [ %rd44 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r761, %r4826; + mov.u32 %r762, %r4826; + mov.u32 %r763, %r4826; + mov.u32 %r764, %r4826; + @%p7 ld.global.v4.b32 { %r761, %r762, %r763, %r764 }, [ %rd45 + 0 ]; + // end inline asm + // begin inline asm + mov.u32 %r769, %r4826; + mov.u32 %r770, %r4826; + mov.u32 %r771, %r4826; + mov.u32 %r772, %r4826; + @%p8 ld.global.v4.b32 { %r769, %r770, %r771, %r772 }, [ %rd46 + 0 ]; + // end inline asm + and.b32 %r28, %r6, 7; + shl.b32 %r831, %r28, 4; + shl.b32 %r832, %r824, 3; + and.b32 %r29, %r6, 112; + and.b32 %r30, %r6, 8; + shl.b32 %r833, %r30, 11; + or.b32 %r834, %r831, %r832; + xor.b32 %r835, %r834, %r29; + mov.b32 %r836, global_smem; + add.s32 %r837, %r836, %r833; + add.s32 %r838, %r837, %r835; + st.shared.v4.b32 [%r838+98304], {%r713, %r714, %r715, %r716}; + st.shared.v4.b32 [%r838+100352], {%r721, %r722, %r723, %r724}; + st.shared.v4.b32 [%r838+102400], {%r729, %r730, %r731, %r732}; + st.shared.v4.b32 [%r838+104448], {%r737, %r738, %r739, %r740}; + st.shared.v4.b32 [%r838+106496], {%r745, %r746, %r747, %r748}; + st.shared.v4.b32 [%r838+108544], {%r753, %r754, %r755, %r756}; + st.shared.v4.b32 [%r838+110592], {%r761, %r762, %r763, %r764}; + st.shared.v4.b32 [%r838+112640], {%r769, %r770, %r771, %r772}; +$L__tmp2: + .loc 1 151 26 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:151:26 + cvt.s64.s32 %rd4, %r823; + mad.wide.s32 %rd47, %r823, 4, %rd72; + .loc 1 152 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:152:23 + // begin inline asm + mov.u32 %r777, 0x0; + ld.global.b32 { %r777 }, [ %rd47 + 0 ]; + // end inline asm + .loc 1 152 37 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:152:37 + shl.b32 %r31, %r777, 7; + .loc 1 153 42 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:153:42 + cvt.s64.s32 %rd6, %r822; + mad.wide.s32 %rd48, %r822, 4, %rd71; + .loc 1 153 28 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:153:28 + // begin inline asm + mov.u32 %r778, 0x0; + ld.global.b32 { %r778 }, [ %rd48 + 0 ]; + // end inline asm + .loc 1 154 45 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:154:45 + shl.b32 %r839, %r778, 1; +$L__tmp3: + .loc 2 41 22 // standard.py:41:22 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:154:92 ] + add.s32 %r840, %r712, 63; + .loc 2 41 28 // standard.py:41:28 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:154:92 ] + shr.s32 %r841, %r840, 31; + shr.u32 %r842, %r841, 26; + add.s32 %r843, %r840, %r842; + shr.s32 %r844, %r843, 6; +$L__tmp4: + .loc 1 154 102 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:154:102 + max.s32 %r33, %r844, 1; + .loc 1 154 65 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:154:65 + min.s32 %r34, %r839, %r33; + .loc 1 159 37 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:159:37 + and.b32 %r35, %r6, 3; + shl.b32 %r36, %r35, 1; + or.b32 %r37, %r36, 1; + or.b32 %r39, %r36, 8; + or.b32 %r38, %r36, 9; + or.b32 %r43, %r36, 16; + or.b32 %r42, %r36, 17; + or.b32 %r41, %r36, 24; + or.b32 %r40, %r36, 25; + or.b32 %r51, %r36, 32; + or.b32 %r50, %r36, 33; + or.b32 %r49, %r36, 40; + or.b32 %r48, %r36, 41; + or.b32 %r47, %r36, 48; + or.b32 %r46, %r36, 49; + or.b32 %r45, %r36, 56; + or.b32 %r44, %r36, 57; +$L__tmp5: + .loc 1 376 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:376:33 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mad.wide.u32 %rd50, %r1, 8, %rd73; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p10, %r839, 1; + setp.gt.s32 %p9, %r839, 0; + .loc 1 376 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:376:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + // begin inline asm + mov.u64 %rd7, 0x0; + @%p9 ld.global.b64 { %rd7 }, [ %rd50 + 0 ]; + // end inline asm + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + or.b32 %r853, %r31, %r8; + or.b32 %r854, %r31, %r9; + or.b32 %r855, %r31, %r10; + or.b32 %r856, %r31, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r857, %r853, 7; + shl.b32 %r858, %r854, 7; + shl.b32 %r859, %r855, 7; + shl.b32 %r860, %r856, 7; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.wide.s32 %rd85, %r857, 2; + mul.wide.s32 %rd86, %r858, 2; + mul.wide.s32 %rd87, %r859, 2; + mul.wide.s32 %rd88, %r860, 2; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd10, %rd1, %rd84; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd51, %rd10, %rd85; + add.s64 %rd52, %rd10, %rd86; + add.s64 %rd53, %rd10, %rd87; + add.s64 %rd54, %rd10, %rd88; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p11, %r853, %r712; + setp.lt.s32 %p12, %r854, %r712; + setp.lt.s32 %p13, %r855, %r712; + setp.lt.s32 %p14, %r856, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r861, %r30, 10; + or.b32 %r84, %r835, %r861; + add.s32 %r2726, %r836, %r84; + selp.b32 %r862, 16, 0, %p11; + selp.b32 %r788, %r862, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2726 + 0 ], [ %rd51 + 0 ], 0x10, %r788; + // end inline asm + add.s32 %r2728, %r2726, 2048; + selp.b32 %r863, 16, 0, %p12; + selp.b32 %r790, %r863, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2728 + 0 ], [ %rd52 + 0 ], 0x10, %r790; + // end inline asm + add.s32 %r2730, %r2726, 4096; + selp.b32 %r864, 16, 0, %p13; + selp.b32 %r792, %r864, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2730 + 0 ], [ %rd53 + 0 ], 0x10, %r792; + // end inline asm + add.s32 %r2732, %r2726, 6144; + selp.b32 %r865, 16, 0, %p14; + selp.b32 %r794, %r865, 0, %p9; + // begin inline asm + cp.async.cg.shared.global [ %r2732 + 0 ], [ %rd54 + 0 ], 0x10, %r794; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd11, %rd2, %rd84; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd55, %rd11, %rd85; + add.s64 %rd56, %rd11, %rd86; + add.s64 %rd57, %rd11, %rd87; + add.s64 %rd58, %rd11, %rd88; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r787, %r2726, 49152; + // begin inline asm + cp.async.cg.shared.global [ %r787 + 0 ], [ %rd55 + 0 ], 0x10, %r788; + // end inline asm + add.s32 %r789, %r2726, 51200; + // begin inline asm + cp.async.cg.shared.global [ %r789 + 0 ], [ %rd56 + 0 ], 0x10, %r790; + // end inline asm + add.s32 %r791, %r2726, 53248; + // begin inline asm + cp.async.cg.shared.global [ %r791 + 0 ], [ %rd57 + 0 ], 0x10, %r792; + // end inline asm + add.s32 %r793, %r2726, 55296; + // begin inline asm + cp.async.cg.shared.global [ %r793 + 0 ], [ %rd58 + 0 ], 0x10, %r794; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.gt.s32 %p15, %r34, 1; + .loc 1 342 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:342:32 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + or.b32 %r866, %r31, 64; + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + or.b32 %r867, %r866, %r8; + or.b32 %r868, %r866, %r9; + or.b32 %r869, %r866, %r10; + or.b32 %r870, %r866, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r871, %r867, 7; + shl.b32 %r872, %r868, 7; + shl.b32 %r873, %r869, 7; + shl.b32 %r874, %r870, 7; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.wide.s32 %rd89, %r871, 2; + mul.wide.s32 %rd90, %r872, 2; + mul.wide.s32 %rd91, %r873, 2; + mul.wide.s32 %rd92, %r874, 2; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd59, %rd10, %rd89; + add.s64 %rd60, %rd10, %rd90; + add.s64 %rd61, %rd10, %rd91; + add.s64 %rd62, %rd10, %rd92; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p16, %r867, %r712; + setp.lt.s32 %p17, %r868, %r712; + setp.lt.s32 %p18, %r869, %r712; + setp.lt.s32 %p19, %r870, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + bar.sync 0; + add.s32 %r795, %r2726, 16384; + selp.b32 %r875, 16, 0, %p16; + selp.b32 %r804, %r875, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r795 + 0 ], [ %rd59 + 0 ], 0x10, %r804; + // end inline asm + add.s32 %r797, %r2726, 18432; + selp.b32 %r876, 16, 0, %p17; + selp.b32 %r806, %r876, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r797 + 0 ], [ %rd60 + 0 ], 0x10, %r806; + // end inline asm + add.s32 %r799, %r2726, 20480; + selp.b32 %r877, 16, 0, %p18; + selp.b32 %r808, %r877, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r799 + 0 ], [ %rd61 + 0 ], 0x10, %r808; + // end inline asm + add.s32 %r801, %r2726, 22528; + selp.b32 %r878, 16, 0, %p19; + selp.b32 %r810, %r878, 0, %p15; + // begin inline asm + cp.async.cg.shared.global [ %r801 + 0 ], [ %rd62 + 0 ], 0x10, %r810; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd63, %rd11, %rd89; + add.s64 %rd64, %rd11, %rd90; + add.s64 %rd65, %rd11, %rd91; + add.s64 %rd66, %rd11, %rd92; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r803, %r2726, 65536; + // begin inline asm + cp.async.cg.shared.global [ %r803 + 0 ], [ %rd63 + 0 ], 0x10, %r804; + // end inline asm + add.s32 %r805, %r2726, 67584; + // begin inline asm + cp.async.cg.shared.global [ %r805 + 0 ], [ %rd64 + 0 ], 0x10, %r806; + // end inline asm + add.s32 %r807, %r2726, 69632; + // begin inline asm + cp.async.cg.shared.global [ %r807 + 0 ], [ %rd65 + 0 ], 0x10, %r808; + // end inline asm + add.s32 %r809, %r2726, 71680; + // begin inline asm + cp.async.cg.shared.global [ %r809 + 0 ], [ %rd66 + 0 ], 0x10, %r810; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:351:19 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + mov.b32 %r879, 0fFF800000; + mov.b64 %rd292, {%r879, %r879}; + mov.b32 %r1702, 0f00000000; + mov.b32 %r1703, %r1702; + mov.b32 %r1704, %r1702; + mov.b32 %r1705, %r1702; + mov.b32 %r1706, %r1702; + mov.b32 %r1707, %r1702; + mov.b32 %r1708, %r1702; + mov.b32 %r1709, %r1702; + mov.b32 %r1710, %r1702; + mov.b32 %r1711, %r1702; + mov.b32 %r1712, %r1702; + mov.b32 %r1713, %r1702; + mov.b32 %r1714, %r1702; + mov.b32 %r1715, %r1702; + mov.b32 %r1716, %r1702; + mov.b32 %r1717, %r1702; + mov.b32 %r1718, %r1702; + mov.b32 %r1719, %r1702; + mov.b32 %r1720, %r1702; + mov.b32 %r1721, %r1702; + mov.b32 %r1722, %r1702; + mov.b32 %r1723, %r1702; + mov.b32 %r1724, %r1702; + mov.b32 %r1725, %r1702; + mov.b32 %r1726, %r1702; + mov.b32 %r1727, %r1702; + mov.b32 %r1728, %r1702; + mov.b32 %r1729, %r1702; + mov.b32 %r1730, %r1702; + mov.b32 %r1731, %r1702; + mov.b32 %r1732, %r1702; + mov.b32 %r1733, %r1702; + mov.b32 %r1734, %r1702; + mov.b32 %r1735, %r1702; + mov.b32 %r1736, %r1702; + mov.b32 %r1737, %r1702; + mov.b32 %r1738, %r1702; + mov.b32 %r1739, %r1702; + mov.b32 %r1740, %r1702; + mov.b32 %r1741, %r1702; + mov.b32 %r1742, %r1702; + mov.b32 %r1743, %r1702; + mov.b32 %r1744, %r1702; + mov.b32 %r1745, %r1702; + mov.b32 %r1746, %r1702; + mov.b32 %r1747, %r1702; + mov.b32 %r1748, %r1702; + mov.b32 %r1749, %r1702; + mov.b32 %r1750, %r1702; + mov.b32 %r1751, %r1702; + mov.b32 %r1752, %r1702; + mov.b32 %r1753, %r1702; + mov.b32 %r1754, %r1702; + mov.b32 %r1755, %r1702; + mov.b32 %r1756, %r1702; + mov.b32 %r1757, %r1702; + mov.b32 %r1758, %r1702; + mov.b32 %r1759, %r1702; + mov.b32 %r1760, %r1702; + mov.b32 %r1761, %r1702; + mov.b32 %r1762, %r1702; + mov.b32 %r1763, %r1702; + mov.b32 %r1764, %r1702; + mov.b32 %r1765, %r1702; + mov.b32 %r5060, %r1702; + mov.b32 %r5061, %r1702; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + @%p10 bra $L__BB0_3; +$L__tmp6: +// %bb.1: // %.lr.ph + .loc 1 0 0 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0 + shr.u32 %r825, %r6, 1; + and.b32 %r826, %r825, 112; + bfe.u32 %r827, %r6, 2, 3; + or.b32 %r828, %r826, %r827; + or.b32 %r845, %r828, %r5; + or.b32 %r846, %r845, 8; + bfe.s32 %r847, %r812, 24, 1; + shr.u32 %r848, %r847, 21; + add.s32 %r849, %r846, %r848; + and.b32 %r850, %r849, -2048; + sub.s32 %r81, %r846, %r850; + add.s32 %r851, %r845, %r848; + and.b32 %r852, %r851, -2048; + sub.s32 %r83, %r845, %r852; + cvt.s64.s32 %rd8, %r83; + cvt.s64.s32 %rd9, %r81; + .loc 1 159 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:159:24 + or.b32 %r4842, %r31, %r36; + or.b32 %r4841, %r31, %r37; + or.b32 %r4839, %r31, %r38; + or.b32 %r4835, %r31, %r40; + or.b32 %r4827, %r31, %r44; + or.b32 %r4840, %r31, %r39; + or.b32 %r4836, %r31, %r41; + or.b32 %r4828, %r31, %r45; + or.b32 %r4837, %r31, %r42; + or.b32 %r4829, %r31, %r46; + or.b32 %r4838, %r31, %r43; + or.b32 %r4830, %r31, %r47; + or.b32 %r4831, %r31, %r48; + or.b32 %r4832, %r31, %r49; + or.b32 %r4833, %r31, %r50; + or.b32 %r4834, %r31, %r51; + add.s32 %r117, %r34, -2; + add.s32 %r118, %r34, -1; +$L__tmp7: + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + max.s32 %r119, %r34, 1; + mov.b64 %rd292, {%r879, %r879}; + mov.b32 %r5060, 0f00000000; + mov.b32 %r4758, 1; + mov.b32 %r4757, -1; + mov.b32 %r4756, 64; + mov.b32 %r4759, %r4756; + mov.b32 %r5061, %r5060; + mov.b32 %r1702, %r5060; + mov.b32 %r1703, %r5060; + mov.b32 %r1704, %r5060; + mov.b32 %r1705, %r5060; + mov.b32 %r1706, %r5060; + mov.b32 %r1707, %r5060; + mov.b32 %r1708, %r5060; + mov.b32 %r1709, %r5060; + mov.b32 %r1710, %r5060; + mov.b32 %r1711, %r5060; + mov.b32 %r1712, %r5060; + mov.b32 %r1713, %r5060; + mov.b32 %r1714, %r5060; + mov.b32 %r1715, %r5060; + mov.b32 %r1716, %r5060; + mov.b32 %r1717, %r5060; + mov.b32 %r1718, %r5060; + mov.b32 %r1719, %r5060; + mov.b32 %r1720, %r5060; + mov.b32 %r1721, %r5060; + mov.b32 %r1722, %r5060; + mov.b32 %r1723, %r5060; + mov.b32 %r1724, %r5060; + mov.b32 %r1725, %r5060; + mov.b32 %r1726, %r5060; + mov.b32 %r1727, %r5060; + mov.b32 %r1728, %r5060; + mov.b32 %r1729, %r5060; + mov.b32 %r1730, %r5060; + mov.b32 %r1731, %r5060; + mov.b32 %r1732, %r5060; + mov.b32 %r1733, %r5060; + mov.b32 %r1734, %r5060; + mov.b32 %r1735, %r5060; + mov.b32 %r1736, %r5060; + mov.b32 %r1737, %r5060; + mov.b32 %r1738, %r5060; + mov.b32 %r1739, %r5060; + mov.b32 %r1740, %r5060; + mov.b32 %r1741, %r5060; + mov.b32 %r1742, %r5060; + mov.b32 %r1743, %r5060; + mov.b32 %r1744, %r5060; + mov.b32 %r1745, %r5060; + mov.b32 %r1746, %r5060; + mov.b32 %r1747, %r5060; + mov.b32 %r1748, %r5060; + mov.b32 %r1749, %r5060; + mov.b32 %r1750, %r5060; + mov.b32 %r1751, %r5060; + mov.b32 %r1752, %r5060; + mov.b32 %r1753, %r5060; + mov.b32 %r1754, %r5060; + mov.b32 %r1755, %r5060; + mov.b32 %r1756, %r5060; + mov.b32 %r1757, %r5060; + mov.b32 %r1758, %r5060; + mov.b32 %r1759, %r5060; + mov.b32 %r1760, %r5060; + mov.b32 %r1761, %r5060; + mov.b32 %r1762, %r5060; + mov.b32 %r1763, %r5060; + mov.b32 %r1764, %r5060; + mov.b32 %r1765, %r5060; +$L__BB0_2: // %__nv_exp2f.exit256 + // =>This Inner Loop Header: Depth=1 + .loc 1 0 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0:40 + cvt.u32.u64 %r2116, %rd9; + cvt.u32.u64 %r2117, %rd8; + .loc 1 379 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:379:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.gt.s64 %p33, %rd7, %rd9; + setp.gt.s64 %p34, %rd7, %rd8; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p35, %r4826, %r117; + setp.lt.s32 %p31, %r4826, %r118; + add.s32 %r2118, %r4757, 1; + setp.gt.s32 %p36, %r2118, 2; + selp.b32 %r4757, 0, %r2118, %p36; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r2119, %r4757, 14; + add.s32 %r1401, %r836, %r2119; + .loc 1 351 19 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:351:19 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.idx.b32 %r2121, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r2122, %r2121, 11; + and.b32 %r2123, %r2122, 8192; + add.s32 %r1398, %r836, 98304; + add.s32 %r2124, %r2123, %r1398; + bfe.u32 %r2125, %r2124, 4, 14; + cvt.u64.u32 %rd128, %r2125; + or.b64 %rd94, %rd128, 4611686293372403712; + bfe.u32 %r2126, %r1401, 4, 14; + cvt.u64.u32 %rd129, %r2126; + or.b64 %rd95, %rd129, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd94, %rd95, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r2127, %r2124, 32; + bfe.u32 %r2128, %r2127, 4, 14; + cvt.u64.u32 %rd130, %r2128; + or.b64 %rd96, %rd130, 4611686293372403712; + add.s32 %r2129, %r1401, 32; + bfe.u32 %r2130, %r2129, 4, 14; + cvt.u64.u32 %rd131, %r2130; + or.b64 %rd97, %rd131, 4611686293338849280; + mov.pred %p20, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd96, %rd97, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2131, %r2124, 64; + bfe.u32 %r2132, %r2131, 4, 14; + cvt.u64.u32 %rd132, %r2132; + or.b64 %rd98, %rd132, 4611686293372403712; + add.s32 %r2133, %r1401, 64; + bfe.u32 %r2134, %r2133, 4, 14; + cvt.u64.u32 %rd133, %r2134; + or.b64 %rd99, %rd133, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd98, %rd99, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2135, %r2124, 96; + bfe.u32 %r2136, %r2135, 4, 14; + cvt.u64.u32 %rd134, %r2136; + or.b64 %rd100, %rd134, 4611686293372403712; + add.s32 %r2137, %r1401, 96; + bfe.u32 %r2138, %r2137, 4, 14; + cvt.u64.u32 %rd135, %r2138; + or.b64 %rd101, %rd135, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd100, %rd101, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2139, %r2124, 16384; + bfe.u32 %r2140, %r2139, 4, 14; + cvt.u64.u32 %rd136, %r2140; + or.b64 %rd102, %rd136, 4611686293372403712; + add.s32 %r2141, %r1401, 8192; + bfe.u32 %r2142, %r2141, 4, 14; + cvt.u64.u32 %rd137, %r2142; + or.b64 %rd103, %rd137, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd102, %rd103, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2143, %r2124, 16416; + bfe.u32 %r2144, %r2143, 4, 14; + cvt.u64.u32 %rd138, %r2144; + or.b64 %rd104, %rd138, 4611686293372403712; + add.s32 %r2145, %r1401, 8224; + bfe.u32 %r2146, %r2145, 4, 14; + cvt.u64.u32 %rd139, %r2146; + or.b64 %rd105, %rd139, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd104, %rd105, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2147, %r2124, 16448; + bfe.u32 %r2148, %r2147, 4, 14; + cvt.u64.u32 %rd140, %r2148; + or.b64 %rd106, %rd140, 4611686293372403712; + add.s32 %r2149, %r1401, 8256; + bfe.u32 %r2150, %r2149, 4, 14; + cvt.u64.u32 %rd141, %r2150; + or.b64 %rd107, %rd141, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd106, %rd107, %p20, 1, 1, 0, 0; + // end inline asm + add.s32 %r2151, %r2124, 16480; + bfe.u32 %r2152, %r2151, 4, 14; + cvt.u64.u32 %rd142, %r2152; + or.b64 %rd108, %rd142, 4611686293372403712; + add.s32 %r2153, %r1401, 8288; + bfe.u32 %r2154, %r2153, 4, 14; + cvt.u64.u32 %rd143, %r2154; + or.b64 %rd109, %rd143, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013}, %rd108, %rd109, %p20, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r1399, 0; + mov.b32 %r1400, %r1399; + mov.b32 %r1402, %r1399; + mov.b32 %r1403, %r1399; + // begin inline asm + // wait for regs: %r982,%r983,%r984,%r985,%r986,%r987,%r988,%r989,%r990,%r991,%r992,%r993,%r994,%r995,%r996,%r997,%r998,%r999,%r1000,%r1001,%r1002,%r1003,%r1004,%r1005,%r1006,%r1007,%r1008,%r1009,%r1010,%r1011,%r1012,%r1013,%r1398,%r1399,%r1400,%r1401,%r1402,%r1403,%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:353:14 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2155, %r982, 0f3DB504F3; + mul.f32 %r2156, %r983, 0f3DB504F3; + mul.f32 %r2157, %r984, 0f3DB504F3; + mul.f32 %r2158, %r985, 0f3DB504F3; + mul.f32 %r2159, %r986, 0f3DB504F3; + mul.f32 %r2160, %r987, 0f3DB504F3; + mul.f32 %r2161, %r988, 0f3DB504F3; + mul.f32 %r2162, %r989, 0f3DB504F3; + mul.f32 %r2163, %r990, 0f3DB504F3; + mul.f32 %r2164, %r991, 0f3DB504F3; + mul.f32 %r2165, %r992, 0f3DB504F3; + mul.f32 %r2166, %r993, 0f3DB504F3; + mul.f32 %r2167, %r994, 0f3DB504F3; + mul.f32 %r2168, %r995, 0f3DB504F3; + mul.f32 %r2169, %r996, 0f3DB504F3; + mul.f32 %r2170, %r997, 0f3DB504F3; + mul.f32 %r2171, %r998, 0f3DB504F3; + mul.f32 %r2172, %r999, 0f3DB504F3; + mul.f32 %r2173, %r1000, 0f3DB504F3; + mul.f32 %r2174, %r1001, 0f3DB504F3; + mul.f32 %r2175, %r1002, 0f3DB504F3; + mul.f32 %r2176, %r1003, 0f3DB504F3; + mul.f32 %r2177, %r1004, 0f3DB504F3; + mul.f32 %r2178, %r1005, 0f3DB504F3; + mul.f32 %r2179, %r1006, 0f3DB504F3; + mul.f32 %r2180, %r1007, 0f3DB504F3; + mul.f32 %r2181, %r1008, 0f3DB504F3; + mul.f32 %r2182, %r1009, 0f3DB504F3; + mul.f32 %r2183, %r1010, 0f3DB504F3; + mul.f32 %r2184, %r1011, 0f3DB504F3; + mul.f32 %r2185, %r1012, 0f3DB504F3; + mul.f32 %r2186, %r1013, 0f3DB504F3; + .loc 1 367 44 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:44 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p37, %r4842, %r712; + setp.lt.s32 %p38, %r4841, %r712; + setp.lt.s32 %p39, %r4840, %r712; + setp.lt.s32 %p40, %r4839, %r712; + setp.lt.s32 %p41, %r4838, %r712; + setp.lt.s32 %p42, %r4837, %r712; + setp.lt.s32 %p43, %r4836, %r712; + setp.lt.s32 %p44, %r4835, %r712; + setp.lt.s32 %p45, %r4834, %r712; + setp.lt.s32 %p46, %r4833, %r712; + setp.lt.s32 %p47, %r4832, %r712; + setp.lt.s32 %p48, %r4831, %r712; + setp.lt.s32 %p49, %r4830, %r712; + setp.lt.s32 %p50, %r4829, %r712; + setp.lt.s32 %p51, %r4828, %r712; + setp.lt.s32 %p52, %r4827, %r712; + .loc 1 257 21 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:257:21 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + rem.s32 %r2187, %r4827, %r712; + rem.s32 %r2188, %r4828, %r712; + rem.s32 %r2189, %r4829, %r712; + rem.s32 %r2190, %r4830, %r712; + rem.s32 %r2191, %r4831, %r712; + rem.s32 %r2192, %r4832, %r712; + rem.s32 %r2193, %r4833, %r712; + rem.s32 %r2194, %r4834, %r712; + rem.s32 %r2195, %r4835, %r712; + rem.s32 %r2196, %r4836, %r712; + rem.s32 %r2197, %r4837, %r712; + rem.s32 %r2198, %r4838, %r712; + rem.s32 %r2199, %r4839, %r712; + rem.s32 %r2200, %r4840, %r712; + rem.s32 %r2201, %r4841, %r712; + rem.s32 %r2202, %r4842, %r712; + .loc 1 373 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:373:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.ge.s32 %p53, %r2117, %r2202; + setp.ge.s32 %p54, %r2117, %r2201; + setp.ge.s32 %p55, %r2116, %r2202; + setp.ge.s32 %p56, %r2116, %r2201; + setp.ge.s32 %p57, %r2117, %r2200; + setp.ge.s32 %p58, %r2117, %r2199; + setp.ge.s32 %p59, %r2116, %r2200; + setp.ge.s32 %p60, %r2116, %r2199; + setp.ge.s32 %p61, %r2117, %r2198; + setp.ge.s32 %p62, %r2117, %r2197; + setp.ge.s32 %p63, %r2116, %r2198; + setp.ge.s32 %p64, %r2116, %r2197; + setp.ge.s32 %p65, %r2117, %r2196; + setp.ge.s32 %p66, %r2117, %r2195; + setp.ge.s32 %p67, %r2116, %r2196; + setp.ge.s32 %p68, %r2116, %r2195; + setp.ge.s32 %p69, %r2117, %r2194; + setp.ge.s32 %p70, %r2117, %r2193; + setp.ge.s32 %p71, %r2116, %r2194; + setp.ge.s32 %p72, %r2116, %r2193; + setp.ge.s32 %p73, %r2117, %r2192; + setp.ge.s32 %p74, %r2117, %r2191; + setp.ge.s32 %p75, %r2116, %r2192; + setp.ge.s32 %p76, %r2116, %r2191; + setp.ge.s32 %p77, %r2117, %r2190; + setp.ge.s32 %p78, %r2117, %r2189; + setp.ge.s32 %p79, %r2116, %r2190; + setp.ge.s32 %p80, %r2116, %r2189; + setp.ge.s32 %p81, %r2117, %r2188; + setp.ge.s32 %p82, %r2117, %r2187; + setp.ge.s32 %p83, %r2116, %r2188; + setp.ge.s32 %p84, %r2116, %r2187; + .loc 1 381 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:381:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.pred %p85, %p34, %p53; + and.pred %p86, %p34, %p54; + and.pred %p87, %p33, %p55; + and.pred %p88, %p33, %p56; + and.pred %p89, %p34, %p57; + and.pred %p90, %p34, %p58; + and.pred %p91, %p33, %p59; + and.pred %p92, %p33, %p60; + and.pred %p93, %p34, %p61; + and.pred %p94, %p34, %p62; + and.pred %p95, %p33, %p63; + and.pred %p96, %p33, %p64; + and.pred %p97, %p34, %p65; + and.pred %p98, %p34, %p66; + and.pred %p99, %p33, %p67; + and.pred %p100, %p33, %p68; + and.pred %p101, %p34, %p69; + and.pred %p102, %p34, %p70; + and.pred %p103, %p33, %p71; + and.pred %p104, %p33, %p72; + and.pred %p105, %p34, %p73; + and.pred %p106, %p34, %p74; + and.pred %p107, %p33, %p75; + and.pred %p108, %p33, %p76; + and.pred %p109, %p34, %p77; + and.pred %p110, %p34, %p78; + and.pred %p111, %p33, %p79; + and.pred %p112, %p33, %p80; + and.pred %p113, %p34, %p81; + and.pred %p114, %p34, %p82; + and.pred %p115, %p33, %p83; + and.pred %p116, %p33, %p84; + .loc 1 384 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:384:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.gt.s32 %p117, %r2202, 2047; + setp.gt.s32 %p118, %r2201, 2047; + setp.gt.s32 %p119, %r2200, 2047; + setp.gt.s32 %p120, %r2199, 2047; + setp.gt.s32 %p121, %r2198, 2047; + setp.gt.s32 %p122, %r2197, 2047; + setp.gt.s32 %p123, %r2196, 2047; + setp.gt.s32 %p124, %r2195, 2047; + setp.gt.s32 %p125, %r2194, 2047; + setp.gt.s32 %p126, %r2193, 2047; + setp.gt.s32 %p127, %r2192, 2047; + setp.gt.s32 %p128, %r2191, 2047; + setp.gt.s32 %p129, %r2190, 2047; + setp.gt.s32 %p130, %r2189, 2047; + setp.gt.s32 %p131, %r2188, 2047; + setp.gt.s32 %p132, %r2187, 2047; + .loc 1 394 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:394:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + prmt.b32 %r2203, %r2201, %r2202, 0x5410U; + prmt.b32 %r2204, %r2199, %r2200, 0x5410U; + prmt.b32 %r2205, %r2197, %r2198, 0x5410U; + prmt.b32 %r2206, %r2195, %r2196, 0x5410U; + prmt.b32 %r2207, %r2193, %r2194, 0x5410U; + prmt.b32 %r2208, %r2191, %r2192, 0x5410U; + prmt.b32 %r2209, %r2189, %r2190, 0x5410U; + prmt.b32 %r2210, %r2187, %r2188, 0x5410U; + and.b32 %r2211, %r2210, 134154239; + and.b32 %r2212, %r2209, 134154239; + and.b32 %r2213, %r2208, 134154239; + and.b32 %r2214, %r2207, 134154239; + and.b32 %r2215, %r2206, 134154239; + and.b32 %r2216, %r2205, 134154239; + and.b32 %r2217, %r2204, 134154239; + and.b32 %r2218, %r2203, 134154239; + .loc 1 395 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:395:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mov.b32 {%rs1, %rs2}, %r2218; + cvt.u64.u16 %rd144, %rs2; + cvt.u64.u16 %rd145, %rs1; + mov.b32 {%rs3, %rs4}, %r2217; + cvt.u64.u16 %rd146, %rs4; + cvt.u64.u16 %rd147, %rs3; + mov.b32 {%rs5, %rs6}, %r2216; + cvt.u64.u16 %rd148, %rs6; + cvt.u64.u16 %rd149, %rs5; + mov.b32 {%rs7, %rs8}, %r2215; + cvt.u64.u16 %rd150, %rs8; + cvt.u64.u16 %rd151, %rs7; + mov.b32 {%rs9, %rs10}, %r2214; + cvt.u64.u16 %rd152, %rs10; + cvt.u64.u16 %rd153, %rs9; + mov.b32 {%rs11, %rs12}, %r2213; + cvt.u64.u16 %rd154, %rs12; + cvt.u64.u16 %rd155, %rs11; + mov.b32 {%rs13, %rs14}, %r2212; + cvt.u64.u16 %rd156, %rs14; + cvt.u64.u16 %rd157, %rs13; + mov.b32 {%rs15, %rs16}, %r2211; + cvt.u64.u16 %rd158, %rs16; + cvt.u64.u16 %rd159, %rs15; + setp.gt.s64 %p133, %rd7, %rd159; + setp.gt.s64 %p134, %rd7, %rd158; + setp.gt.s64 %p135, %rd7, %rd157; + setp.gt.s64 %p136, %rd7, %rd156; + setp.gt.s64 %p137, %rd7, %rd155; + setp.gt.s64 %p138, %rd7, %rd154; + setp.gt.s64 %p139, %rd7, %rd153; + setp.gt.s64 %p140, %rd7, %rd152; + setp.gt.s64 %p141, %rd7, %rd151; + setp.gt.s64 %p142, %rd7, %rd150; + setp.gt.s64 %p143, %rd7, %rd149; + setp.gt.s64 %p144, %rd7, %rd148; + setp.gt.s64 %p145, %rd7, %rd147; + setp.gt.s64 %p146, %rd7, %rd146; + setp.gt.s64 %p147, %rd7, %rd145; + setp.gt.s64 %p148, %rd7, %rd144; + .loc 1 396 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:396:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.pred %p149, %p117, %p148; + and.pred %p150, %p118, %p147; + and.pred %p151, %p119, %p146; + and.pred %p152, %p120, %p145; + and.pred %p153, %p121, %p144; + and.pred %p154, %p122, %p143; + and.pred %p155, %p123, %p142; + and.pred %p156, %p124, %p141; + and.pred %p157, %p125, %p140; + and.pred %p158, %p126, %p139; + and.pred %p159, %p127, %p138; + and.pred %p160, %p128, %p137; + and.pred %p161, %p129, %p136; + and.pred %p162, %p130, %p135; + and.pred %p163, %p131, %p134; + and.pred %p164, %p132, %p133; + .loc 1 397 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:397:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + sub.s32 %r2219, %r2191, %r83; + sub.s32 %r2220, %r2192, %r83; + sub.s32 %r2221, %r2191, %r81; + sub.s32 %r2222, %r2192, %r81; + sub.s32 %r2223, %r2193, %r83; + sub.s32 %r2224, %r2194, %r83; + sub.s32 %r2225, %r2193, %r81; + sub.s32 %r2226, %r2194, %r81; + sub.s32 %r2227, %r2187, %r83; + sub.s32 %r2228, %r2188, %r83; + sub.s32 %r2229, %r2187, %r81; + sub.s32 %r2230, %r2188, %r81; + sub.s32 %r2231, %r2189, %r83; + sub.s32 %r2232, %r2190, %r83; + sub.s32 %r2233, %r2189, %r81; + sub.s32 %r2234, %r2190, %r81; + sub.s32 %r2235, %r2195, %r81; + sub.s32 %r2236, %r2196, %r81; + sub.s32 %r2237, %r2195, %r83; + sub.s32 %r2238, %r2196, %r83; + sub.s32 %r2239, %r2197, %r81; + sub.s32 %r2240, %r2198, %r81; + sub.s32 %r2241, %r2197, %r83; + sub.s32 %r2242, %r2198, %r83; + sub.s32 %r2243, %r2199, %r81; + sub.s32 %r2244, %r2200, %r81; + sub.s32 %r2245, %r2199, %r83; + sub.s32 %r2246, %r2200, %r83; + sub.s32 %r2247, %r2201, %r81; + sub.s32 %r2248, %r2202, %r81; + sub.s32 %r2249, %r2201, %r83; + sub.s32 %r2250, %r2202, %r83; + .loc 1 404 39 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:404:39 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.b32 %r2251, %r2250, 2047; + and.b32 %r2252, %r2249, 2047; + and.b32 %r2253, %r2248, 2047; + and.b32 %r2254, %r2247, 2047; + and.b32 %r2255, %r2246, 2047; + and.b32 %r2256, %r2245, 2047; + and.b32 %r2257, %r2244, 2047; + and.b32 %r2258, %r2243, 2047; + and.b32 %r2259, %r2242, 2047; + and.b32 %r2260, %r2241, 2047; + and.b32 %r2261, %r2240, 2047; + and.b32 %r2262, %r2239, 2047; + and.b32 %r2263, %r2238, 2047; + and.b32 %r2264, %r2237, 2047; + and.b32 %r2265, %r2236, 2047; + and.b32 %r2266, %r2235, 2047; + and.b32 %r2267, %r2234, 2047; + and.b32 %r2268, %r2233, 2047; + and.b32 %r2269, %r2232, 2047; + and.b32 %r2270, %r2231, 2047; + and.b32 %r2271, %r2230, 2047; + and.b32 %r2272, %r2229, 2047; + and.b32 %r2273, %r2228, 2047; + and.b32 %r2274, %r2227, 2047; + and.b32 %r2275, %r2226, 2047; + and.b32 %r2276, %r2225, 2047; + and.b32 %r2277, %r2224, 2047; + and.b32 %r2278, %r2223, 2047; + and.b32 %r2279, %r2222, 2047; + and.b32 %r2280, %r2221, 2047; + and.b32 %r2281, %r2220, 2047; + and.b32 %r2282, %r2219, 2047; + .loc 1 405 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:405:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.eq.b32 %p165, %r2282, 0; + selp.b16 %rs17, 1, 0, %p165; + shl.b16 %rs18, %rs17, 2; + setp.eq.b32 %p166, %r2281, 0; + selp.b16 %rs19, -1, 0, %p166; + shl.b16 %rs20, %rs19, 3; + or.b16 %rs21, %rs20, %rs18; + setp.eq.b32 %p167, %r2280, 0; + selp.b16 %rs22, 1, 0, %p167; + setp.eq.b32 %p168, %r2279, 0; + selp.b16 %rs23, -1, 0, %p168; + shl.b16 %rs24, %rs23, 1; + or.b16 %rs25, %rs22, %rs24; + and.b16 %rs26, %rs25, 3; + or.b16 %rs27, %rs26, %rs21; + and.b16 %rs28, %rs27, 15; + shl.b16 %rs29, %rs28, 8; + setp.eq.b32 %p169, %r2278, 0; + selp.b16 %rs30, 1, 0, %p169; + shl.b16 %rs31, %rs30, 2; + setp.eq.b32 %p170, %r2277, 0; + selp.b16 %rs32, -1, 0, %p170; + shl.b16 %rs33, %rs32, 3; + or.b16 %rs34, %rs33, %rs31; + setp.eq.b32 %p171, %r2276, 0; + selp.b16 %rs35, 1, 0, %p171; + setp.eq.b32 %p172, %r2275, 0; + selp.b16 %rs36, -1, 0, %p172; + shl.b16 %rs37, %rs36, 1; + or.b16 %rs38, %rs35, %rs37; + and.b16 %rs39, %rs38, 3; + or.b16 %rs40, %rs39, %rs34; + shl.b16 %rs41, %rs40, 12; + or.b16 %rs42, %rs41, %rs29; + setp.eq.b32 %p173, %r2274, 0; + selp.b16 %rs43, 1, 0, %p173; + shl.b16 %rs44, %rs43, 2; + setp.eq.b32 %p174, %r2273, 0; + selp.b16 %rs45, -1, 0, %p174; + shl.b16 %rs46, %rs45, 3; + or.b16 %rs47, %rs46, %rs44; + setp.eq.b32 %p175, %r2272, 0; + selp.b16 %rs48, 1, 0, %p175; + setp.eq.b32 %p176, %r2271, 0; + selp.b16 %rs49, -1, 0, %p176; + shl.b16 %rs50, %rs49, 1; + or.b16 %rs51, %rs48, %rs50; + and.b16 %rs52, %rs51, 3; + or.b16 %rs53, %rs52, %rs47; + and.b16 %rs54, %rs53, 15; + setp.eq.b32 %p177, %r2270, 0; + selp.b16 %rs55, 1, 0, %p177; + shl.b16 %rs56, %rs55, 2; + setp.eq.b32 %p178, %r2269, 0; + selp.b16 %rs57, -1, 0, %p178; + shl.b16 %rs58, %rs57, 3; + or.b16 %rs59, %rs58, %rs56; + setp.eq.b32 %p179, %r2268, 0; + selp.b16 %rs60, 1, 0, %p179; + setp.eq.b32 %p180, %r2267, 0; + selp.b16 %rs61, -1, 0, %p180; + shl.b16 %rs62, %rs61, 1; + or.b16 %rs63, %rs60, %rs62; + and.b16 %rs64, %rs63, 3; + or.b16 %rs65, %rs64, %rs59; + shl.b16 %rs66, %rs65, 4; + or.b16 %rs67, %rs54, %rs66; + and.b16 %rs68, %rs67, 255; + or.b16 %rs69, %rs68, %rs42; + cvt.u32.u16 %r2283, %rs69; + setp.eq.b32 %p181, %r2266, 0; + setp.eq.b32 %p182, %r2265, 0; + setp.eq.b32 %p183, %r2264, 0; + setp.eq.b32 %p184, %r2263, 0; + setp.eq.b32 %p185, %r2262, 0; + setp.eq.b32 %p186, %r2261, 0; + setp.eq.b32 %p187, %r2260, 0; + setp.eq.b32 %p188, %r2259, 0; + setp.eq.b32 %p189, %r2258, 0; + setp.eq.b32 %p190, %r2257, 0; + setp.eq.b32 %p191, %r2256, 0; + setp.eq.b32 %p192, %r2255, 0; + setp.eq.b32 %p193, %r2254, 0; + setp.eq.b32 %p194, %r2253, 0; + setp.eq.b32 %p195, %r2252, 0; + setp.eq.b32 %p196, %r2251, 0; + .loc 1 406 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:406:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.pred %p197, %p196, %p149; + and.pred %p198, %p195, %p150; + and.pred %p199, %p194, %p149; + and.pred %p200, %p193, %p150; + and.pred %p201, %p192, %p151; + and.pred %p202, %p191, %p152; + and.pred %p203, %p190, %p151; + and.pred %p204, %p189, %p152; + and.pred %p205, %p188, %p153; + and.pred %p206, %p187, %p154; + and.pred %p207, %p186, %p153; + and.pred %p208, %p185, %p154; + and.pred %p209, %p184, %p155; + and.pred %p210, %p183, %p156; + and.pred %p211, %p182, %p155; + and.pred %p212, %p181, %p156; + shr.u32 %r2284, %r2283, 15; + and.b32 %r2285, %r2284, 1; + setp.ne.b32 %p213, %r2285, 0; + and.pred %p214, %p213, %p157; + shr.u32 %r2286, %r2283, 14; + and.b32 %r2287, %r2286, 1; + setp.ne.b32 %p215, %r2287, 0; + and.pred %p216, %p215, %p158; + shr.u32 %r2288, %r2283, 13; + and.b32 %r2289, %r2288, 1; + setp.ne.b32 %p217, %r2289, 0; + and.pred %p218, %p217, %p157; + shr.u32 %r2290, %r2283, 12; + and.b32 %r2291, %r2290, 1; + setp.ne.b32 %p219, %r2291, 0; + and.pred %p220, %p219, %p158; + shr.u32 %r2292, %r2283, 11; + and.b32 %r2293, %r2292, 1; + setp.ne.b32 %p221, %r2293, 0; + and.pred %p222, %p221, %p159; + shr.u32 %r2294, %r2283, 10; + and.b32 %r2295, %r2294, 1; + setp.ne.b32 %p223, %r2295, 0; + and.pred %p224, %p223, %p160; + shr.u32 %r2296, %r2283, 9; + and.b32 %r2297, %r2296, 1; + setp.ne.b32 %p225, %r2297, 0; + and.pred %p226, %p225, %p159; + shr.u32 %r2298, %r2283, 8; + and.b32 %r2299, %r2298, 1; + setp.ne.b32 %p227, %r2299, 0; + and.pred %p228, %p227, %p160; + shr.u32 %r2300, %r2283, 7; + and.b32 %r2301, %r2300, 1; + setp.ne.b32 %p229, %r2301, 0; + and.pred %p230, %p229, %p161; + shr.u32 %r2302, %r2283, 6; + and.b32 %r2303, %r2302, 1; + setp.ne.b32 %p231, %r2303, 0; + and.pred %p232, %p231, %p162; + shr.u32 %r2304, %r2283, 5; + and.b32 %r2305, %r2304, 1; + setp.ne.b32 %p233, %r2305, 0; + and.pred %p234, %p233, %p161; + shr.u32 %r2306, %r2283, 4; + and.b32 %r2307, %r2306, 1; + setp.ne.b32 %p235, %r2307, 0; + and.pred %p236, %p235, %p162; + shr.u32 %r2308, %r2283, 3; + and.b32 %r2309, %r2308, 1; + setp.ne.b32 %p237, %r2309, 0; + and.pred %p238, %p237, %p163; + shr.u32 %r2310, %r2283, 2; + and.b32 %r2311, %r2310, 1; + setp.ne.b32 %p239, %r2311, 0; + and.pred %p240, %p239, %p164; + shr.u32 %r2312, %r2283, 1; + and.b32 %r2313, %r2312, 1; + setp.ne.b32 %p241, %r2313, 0; + and.pred %p242, %p241, %p163; + and.pred %p243, %p175, %p164; + .loc 1 407 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:407:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + or.pred %p244, %p85, %p197; + or.pred %p246, %p86, %p198; + or.pred %p248, %p87, %p199; + or.pred %p250, %p88, %p200; + or.pred %p252, %p89, %p201; + or.pred %p254, %p90, %p202; + or.pred %p256, %p91, %p203; + or.pred %p258, %p92, %p204; + or.pred %p260, %p93, %p205; + or.pred %p262, %p94, %p206; + or.pred %p264, %p95, %p207; + or.pred %p266, %p96, %p208; + or.pred %p268, %p97, %p209; + or.pred %p270, %p98, %p210; + or.pred %p272, %p99, %p211; + or.pred %p274, %p100, %p212; + or.pred %p276, %p101, %p214; + or.pred %p278, %p102, %p216; + or.pred %p280, %p103, %p218; + or.pred %p282, %p104, %p220; + or.pred %p284, %p105, %p222; + or.pred %p286, %p106, %p224; + or.pred %p288, %p107, %p226; + or.pred %p290, %p108, %p228; + or.pred %p292, %p109, %p230; + or.pred %p294, %p110, %p232; + or.pred %p296, %p111, %p234; + or.pred %p298, %p112, %p236; + or.pred %p300, %p113, %p238; + or.pred %p302, %p114, %p240; + or.pred %p304, %p115, %p242; + or.pred %p306, %p116, %p243; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2314, %r2155, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2315, %r2314, 0fFF800000, %p244; + selp.f32 %r2316, %r2315, 0fFF800000, %p37; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2317, %r2156, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2318, %r2317, 0fFF800000, %p246; + selp.f32 %r2319, %r2318, 0fFF800000, %p38; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2320, %r2157, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2321, %r2320, 0fFF800000, %p248; + selp.f32 %r2322, %r2321, 0fFF800000, %p37; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2323, %r2158, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2324, %r2323, 0fFF800000, %p250; + selp.f32 %r2325, %r2324, 0fFF800000, %p38; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2326, %r2159, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2327, %r2326, 0fFF800000, %p252; + selp.f32 %r2328, %r2327, 0fFF800000, %p39; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2329, %r2160, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2330, %r2329, 0fFF800000, %p254; + selp.f32 %r2331, %r2330, 0fFF800000, %p40; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2332, %r2161, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2333, %r2332, 0fFF800000, %p256; + selp.f32 %r2334, %r2333, 0fFF800000, %p39; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2335, %r2162, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2336, %r2335, 0fFF800000, %p258; + selp.f32 %r2337, %r2336, 0fFF800000, %p40; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2338, %r2163, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2339, %r2338, 0fFF800000, %p260; + selp.f32 %r2340, %r2339, 0fFF800000, %p41; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2341, %r2164, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2342, %r2341, 0fFF800000, %p262; + selp.f32 %r2343, %r2342, 0fFF800000, %p42; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2344, %r2165, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2345, %r2344, 0fFF800000, %p264; + selp.f32 %r2346, %r2345, 0fFF800000, %p41; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2347, %r2166, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2348, %r2347, 0fFF800000, %p266; + selp.f32 %r2349, %r2348, 0fFF800000, %p42; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2350, %r2167, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2351, %r2350, 0fFF800000, %p268; + selp.f32 %r2352, %r2351, 0fFF800000, %p43; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2353, %r2168, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2354, %r2353, 0fFF800000, %p270; + selp.f32 %r2355, %r2354, 0fFF800000, %p44; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2356, %r2169, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2357, %r2356, 0fFF800000, %p272; + selp.f32 %r2358, %r2357, 0fFF800000, %p43; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2359, %r2170, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2360, %r2359, 0fFF800000, %p274; + selp.f32 %r2361, %r2360, 0fFF800000, %p44; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2362, %r2171, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2363, %r2362, 0fFF800000, %p276; + selp.f32 %r2364, %r2363, 0fFF800000, %p45; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2365, %r2172, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2366, %r2365, 0fFF800000, %p278; + selp.f32 %r2367, %r2366, 0fFF800000, %p46; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2368, %r2173, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2369, %r2368, 0fFF800000, %p280; + selp.f32 %r2370, %r2369, 0fFF800000, %p45; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2371, %r2174, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2372, %r2371, 0fFF800000, %p282; + selp.f32 %r2373, %r2372, 0fFF800000, %p46; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2374, %r2175, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2375, %r2374, 0fFF800000, %p284; + selp.f32 %r2376, %r2375, 0fFF800000, %p47; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2377, %r2176, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2378, %r2377, 0fFF800000, %p286; + selp.f32 %r2379, %r2378, 0fFF800000, %p48; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2380, %r2177, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2381, %r2380, 0fFF800000, %p288; + selp.f32 %r2382, %r2381, 0fFF800000, %p47; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2383, %r2178, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2384, %r2383, 0fFF800000, %p290; + selp.f32 %r2385, %r2384, 0fFF800000, %p48; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2386, %r2179, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2387, %r2386, 0fFF800000, %p292; + selp.f32 %r2388, %r2387, 0fFF800000, %p49; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2389, %r2180, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2390, %r2389, 0fFF800000, %p294; + selp.f32 %r2391, %r2390, 0fFF800000, %p50; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2392, %r2181, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2393, %r2392, 0fFF800000, %p296; + selp.f32 %r2394, %r2393, 0fFF800000, %p49; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2395, %r2182, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2396, %r2395, 0fFF800000, %p298; + selp.f32 %r2397, %r2396, 0fFF800000, %p50; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2398, %r2183, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2399, %r2398, 0fFF800000, %p300; + selp.f32 %r2400, %r2399, 0fFF800000, %p51; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2401, %r2184, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2402, %r2401, 0fFF800000, %p302; + selp.f32 %r2403, %r2402, 0fFF800000, %p52; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2404, %r2185, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2405, %r2404, 0fFF800000, %p304; + selp.f32 %r2406, %r2405, 0fFF800000, %p51; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r2407, %r2186, 0f3FB8AA3B; + .loc 1 414 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:414:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2408, %r2407, 0fFF800000, %p306; + selp.f32 %r2409, %r2408, 0fFF800000, %p52; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + max.f32 %r2410, %r2316, %r2319; + max.f32 %r2411, %r2322, %r2325; + max.f32 %r2412, %r2410, %r2328; + max.f32 %r2413, %r2412, %r2331; + max.f32 %r2414, %r2411, %r2334; + max.f32 %r2415, %r2414, %r2337; + max.f32 %r2416, %r2413, %r2340; + max.f32 %r2417, %r2416, %r2343; + max.f32 %r2418, %r2415, %r2346; + max.f32 %r2419, %r2418, %r2349; + max.f32 %r2420, %r2417, %r2352; + max.f32 %r2421, %r2420, %r2355; + max.f32 %r2422, %r2419, %r2358; + max.f32 %r2423, %r2422, %r2361; + max.f32 %r2424, %r2421, %r2364; + max.f32 %r2425, %r2424, %r2367; + max.f32 %r2426, %r2423, %r2370; + max.f32 %r2427, %r2426, %r2373; + max.f32 %r2428, %r2425, %r2376; + max.f32 %r2429, %r2428, %r2379; + max.f32 %r2430, %r2427, %r2382; + max.f32 %r2431, %r2430, %r2385; + max.f32 %r2432, %r2429, %r2388; + max.f32 %r2433, %r2432, %r2391; + max.f32 %r2434, %r2431, %r2394; + max.f32 %r2435, %r2434, %r2397; + max.f32 %r2436, %r2433, %r2400; + max.f32 %r2437, %r2436, %r2403; + max.f32 %r2438, %r2435, %r2406; + max.f32 %r2439, %r2438, %r2409; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2440, %r2437, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + max.f32 %r2441, %r2437, %r2440; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2442, %r2441, 1, 31, -1; + shfl.sync.bfly.b32 %r2443, %r2439, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + max.f32 %r2444, %r2439, %r2443; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2445, %r2444, 1, 31, -1; + cvt.u64.u32 %rd160, %r2442; + cvt.u64.u32 %rd161, %r2445; + shl.b64 %rd162, %rd161, 32; + or.b64 %rd163, %rd160, %rd162; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mov.b64 {%r2446, %r2447}, %rd163; + max.f32 %r2448, %r2441, %r2446; + max.f32 %r2449, %r2444, %r2447; + .loc 1 421 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:421:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mov.b64 {%r2450, %r2451}, %rd292; + max.f32 %r2452, %r2451, %r2449; + max.f32 %r2453, %r2450, %r2448; + mov.b64 %rd292, {%r2453, %r2452}; + .loc 1 423 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:423:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.eq.f32 %p308, %r2453, 0fFF800000; + setp.eq.f32 %p309, %r2452, 0fFF800000; + .loc 1 424 51 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:424:51 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + selp.f32 %r2454, 0f00000000, %r2453, %p308; + selp.f32 %r2455, 0f00000000, %r2452, %p309; + .loc 1 428 31 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:428:31 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + sub.f32 %r2456, %r2450, %r2454; + sub.f32 %r2457, %r2451, %r2455; + .loc 1 428 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:428:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + ex2.approx.ftz.f32 %r2458, %r2456; + ex2.approx.ftz.f32 %r2459, %r2457; + .loc 1 429 39 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:429:39 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + sub.f32 %r2460, %r2316, %r2454; + sub.f32 %r2461, %r2319, %r2454; + sub.f32 %r2462, %r2322, %r2455; + sub.f32 %r2463, %r2325, %r2455; + sub.f32 %r2464, %r2328, %r2454; + sub.f32 %r2465, %r2331, %r2454; + sub.f32 %r2466, %r2334, %r2455; + sub.f32 %r2467, %r2337, %r2455; + sub.f32 %r2468, %r2340, %r2454; + sub.f32 %r2469, %r2343, %r2454; + sub.f32 %r2470, %r2346, %r2455; + sub.f32 %r2471, %r2349, %r2455; + sub.f32 %r2472, %r2352, %r2454; + sub.f32 %r2473, %r2355, %r2454; + sub.f32 %r2474, %r2358, %r2455; + sub.f32 %r2475, %r2361, %r2455; + sub.f32 %r2476, %r2364, %r2454; + sub.f32 %r2477, %r2367, %r2454; + sub.f32 %r2478, %r2370, %r2455; + sub.f32 %r2479, %r2373, %r2455; + sub.f32 %r2480, %r2376, %r2454; + sub.f32 %r2481, %r2379, %r2454; + sub.f32 %r2482, %r2382, %r2455; + sub.f32 %r2483, %r2385, %r2455; + sub.f32 %r2484, %r2388, %r2454; + sub.f32 %r2485, %r2391, %r2454; + sub.f32 %r2486, %r2394, %r2455; + sub.f32 %r2487, %r2397, %r2455; + sub.f32 %r2488, %r2400, %r2454; + sub.f32 %r2489, %r2403, %r2454; + sub.f32 %r2490, %r2406, %r2455; + sub.f32 %r2491, %r2409, %r2455; + .loc 1 429 21 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:429:21 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + ex2.approx.ftz.f32 %r2492, %r2460; + ex2.approx.ftz.f32 %r2493, %r2461; + ex2.approx.ftz.f32 %r2494, %r2462; + ex2.approx.ftz.f32 %r2495, %r2463; + ex2.approx.ftz.f32 %r2496, %r2464; + ex2.approx.ftz.f32 %r2497, %r2465; + ex2.approx.ftz.f32 %r2498, %r2466; + ex2.approx.ftz.f32 %r2499, %r2467; + ex2.approx.ftz.f32 %r2500, %r2468; + ex2.approx.ftz.f32 %r2501, %r2469; + ex2.approx.ftz.f32 %r2502, %r2470; + ex2.approx.ftz.f32 %r2503, %r2471; + ex2.approx.ftz.f32 %r2504, %r2472; + ex2.approx.ftz.f32 %r2505, %r2473; + ex2.approx.ftz.f32 %r2506, %r2474; + ex2.approx.ftz.f32 %r2507, %r2475; + ex2.approx.ftz.f32 %r2508, %r2476; + ex2.approx.ftz.f32 %r2509, %r2477; + ex2.approx.ftz.f32 %r2510, %r2478; + ex2.approx.ftz.f32 %r2511, %r2479; + ex2.approx.ftz.f32 %r2512, %r2480; + ex2.approx.ftz.f32 %r2513, %r2481; + ex2.approx.ftz.f32 %r2514, %r2482; + ex2.approx.ftz.f32 %r2515, %r2483; + ex2.approx.ftz.f32 %r2516, %r2484; + ex2.approx.ftz.f32 %r2517, %r2485; + ex2.approx.ftz.f32 %r2518, %r2486; + ex2.approx.ftz.f32 %r2519, %r2487; + ex2.approx.ftz.f32 %r2520, %r2488; + ex2.approx.ftz.f32 %r2521, %r2489; + ex2.approx.ftz.f32 %r2522, %r2490; + ex2.approx.ftz.f32 %r2523, %r2491; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.f32 %r2524, %r2492, %r2493; + add.f32 %r2525, %r2494, %r2495; + add.f32 %r2526, %r2524, %r2496; + add.f32 %r2527, %r2526, %r2497; + add.f32 %r2528, %r2525, %r2498; + add.f32 %r2529, %r2528, %r2499; + add.f32 %r2530, %r2527, %r2500; + add.f32 %r2531, %r2530, %r2501; + add.f32 %r2532, %r2529, %r2502; + add.f32 %r2533, %r2532, %r2503; + add.f32 %r2534, %r2531, %r2504; + add.f32 %r2535, %r2534, %r2505; + add.f32 %r2536, %r2533, %r2506; + add.f32 %r2537, %r2536, %r2507; + add.f32 %r2538, %r2535, %r2508; + add.f32 %r2539, %r2538, %r2509; + add.f32 %r2540, %r2537, %r2510; + add.f32 %r2541, %r2540, %r2511; + add.f32 %r2542, %r2539, %r2512; + add.f32 %r2543, %r2542, %r2513; + add.f32 %r2544, %r2541, %r2514; + add.f32 %r2545, %r2544, %r2515; + add.f32 %r2546, %r2543, %r2516; + add.f32 %r2547, %r2546, %r2517; + add.f32 %r2548, %r2545, %r2518; + add.f32 %r2549, %r2548, %r2519; + add.f32 %r2550, %r2547, %r2520; + add.f32 %r2551, %r2550, %r2521; + add.f32 %r2552, %r2549, %r2522; + add.f32 %r2553, %r2552, %r2523; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2554, %r2551, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.f32 %r2555, %r2551, %r2554; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2556, %r2555, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.f32 %r2557, %r2555, %r2556; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2558, %r2553, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.f32 %r2559, %r2553, %r2558; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shfl.sync.bfly.b32 %r2560, %r2559, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.f32 %r2561, %r2559, %r2560; + .loc 1 434 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:434:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + fma.rn.f32 %r5060, %r5060, %r2458, %r2557; + fma.rn.f32 %r5061, %r5061, %r2459, %r2561; + .loc 1 436 16 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:436:16 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.f32 %r1702, %r1702, %r2458; + mul.f32 %r1703, %r1703, %r2458; + mul.f32 %r1704, %r1704, %r2459; + mul.f32 %r1705, %r1705, %r2459; + mul.f32 %r1706, %r1706, %r2458; + mul.f32 %r1707, %r1707, %r2458; + mul.f32 %r1708, %r1708, %r2459; + mul.f32 %r1709, %r1709, %r2459; + mul.f32 %r1710, %r1710, %r2458; + mul.f32 %r1711, %r1711, %r2458; + mul.f32 %r1712, %r1712, %r2459; + mul.f32 %r1713, %r1713, %r2459; + mul.f32 %r1714, %r1714, %r2458; + mul.f32 %r1715, %r1715, %r2458; + mul.f32 %r1716, %r1716, %r2459; + mul.f32 %r1717, %r1717, %r2459; + mul.f32 %r1718, %r1718, %r2458; + mul.f32 %r1719, %r1719, %r2458; + mul.f32 %r1720, %r1720, %r2459; + mul.f32 %r1721, %r1721, %r2459; + mul.f32 %r1722, %r1722, %r2458; + mul.f32 %r1723, %r1723, %r2458; + mul.f32 %r1724, %r1724, %r2459; + mul.f32 %r1725, %r1725, %r2459; + mul.f32 %r1726, %r1726, %r2458; + mul.f32 %r1727, %r1727, %r2458; + mul.f32 %r1728, %r1728, %r2459; + mul.f32 %r1729, %r1729, %r2459; + mul.f32 %r1730, %r1730, %r2458; + mul.f32 %r1731, %r1731, %r2458; + mul.f32 %r1732, %r1732, %r2459; + mul.f32 %r1733, %r1733, %r2459; + mul.f32 %r1734, %r1734, %r2458; + mul.f32 %r1735, %r1735, %r2458; + mul.f32 %r1736, %r1736, %r2459; + mul.f32 %r1737, %r1737, %r2459; + mul.f32 %r1738, %r1738, %r2458; + mul.f32 %r1739, %r1739, %r2458; + mul.f32 %r1740, %r1740, %r2459; + mul.f32 %r1741, %r1741, %r2459; + mul.f32 %r1742, %r1742, %r2458; + mul.f32 %r1743, %r1743, %r2458; + mul.f32 %r1744, %r1744, %r2459; + mul.f32 %r1745, %r1745, %r2459; + mul.f32 %r1746, %r1746, %r2458; + mul.f32 %r1747, %r1747, %r2458; + mul.f32 %r1748, %r1748, %r2459; + mul.f32 %r1749, %r1749, %r2459; + mul.f32 %r1750, %r1750, %r2458; + mul.f32 %r1751, %r1751, %r2458; + mul.f32 %r1752, %r1752, %r2459; + mul.f32 %r1753, %r1753, %r2459; + mul.f32 %r1754, %r1754, %r2458; + mul.f32 %r1755, %r1755, %r2458; + mul.f32 %r1756, %r1756, %r2459; + mul.f32 %r1757, %r1757, %r2459; + mul.f32 %r1758, %r1758, %r2458; + mul.f32 %r1759, %r1759, %r2458; + mul.f32 %r1760, %r1760, %r2459; + mul.f32 %r1761, %r1761, %r2459; + mul.f32 %r1762, %r1762, %r2458; + mul.f32 %r1763, %r1763, %r2458; + mul.f32 %r1764, %r1764, %r2459; + mul.f32 %r1765, %r1765, %r2459; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2562, %r836, 49152; + add.s32 %r2563, %r2562, %r2119; + .loc 1 440 22 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:440:22 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + cvt.rn.bf16x2.f32 %r1698, %r2493, %r2492; + cvt.rn.bf16x2.f32 %r1699, %r2495, %r2494; + cvt.rn.bf16x2.f32 %r1700, %r2497, %r2496; + cvt.rn.bf16x2.f32 %r1701, %r2499, %r2498; + cvt.rn.bf16x2.f32 %r1830, %r2501, %r2500; + cvt.rn.bf16x2.f32 %r1831, %r2503, %r2502; + cvt.rn.bf16x2.f32 %r1832, %r2505, %r2504; + cvt.rn.bf16x2.f32 %r1833, %r2507, %r2506; + cvt.rn.bf16x2.f32 %r1962, %r2509, %r2508; + cvt.rn.bf16x2.f32 %r1963, %r2511, %r2510; + cvt.rn.bf16x2.f32 %r1964, %r2513, %r2512; + cvt.rn.bf16x2.f32 %r1965, %r2515, %r2514; + cvt.rn.bf16x2.f32 %r2094, %r2517, %r2516; + cvt.rn.bf16x2.f32 %r2095, %r2519, %r2518; + cvt.rn.bf16x2.f32 %r2096, %r2521, %r2520; + cvt.rn.bf16x2.f32 %r2097, %r2523, %r2522; + .loc 1 440 44 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:440:44 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + wgmma.fence.sync.aligned; + bfe.u32 %r2564, %r2563, 4, 14; + cvt.u64.u32 %rd164, %r2564; + or.b64 %rd110, %rd164, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1698,%r1699,%r1700,%r1701}, %rd110, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2565, %r2563, 2048; + bfe.u32 %r2566, %r2565, 4, 14; + cvt.u64.u32 %rd165, %r2566; + or.b64 %rd111, %rd165, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1830,%r1831,%r1832,%r1833}, %rd111, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2567, %r2563, 4096; + bfe.u32 %r2568, %r2567, 4, 14; + cvt.u64.u32 %rd166, %r2568; + or.b64 %rd112, %rd166, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r1962,%r1963,%r1964,%r1965}, %rd112, %p20, 1, 1, 1; + // end inline asm + add.s32 %r2569, %r2563, 6144; + bfe.u32 %r2570, %r2569, 4, 14; + cvt.u64.u32 %rd167, %r2570; + or.b64 %rd113, %rd167, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r2094,%r2095,%r2096,%r2097}, %rd113, %p20, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:548:26 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r4827, %r4756, %r4827; + add.s32 %r4828, %r4756, %r4828; + add.s32 %r4829, %r4756, %r4829; + add.s32 %r4830, %r4756, %r4830; + add.s32 %r4831, %r4756, %r4831; + add.s32 %r4832, %r4756, %r4832; + add.s32 %r4833, %r4756, %r4833; + add.s32 %r4834, %r4756, %r4834; + add.s32 %r4835, %r4756, %r4835; + add.s32 %r4836, %r4756, %r4836; + add.s32 %r4837, %r4756, %r4837; + add.s32 %r4838, %r4756, %r4838; + add.s32 %r4839, %r4756, %r4839; + add.s32 %r4840, %r4756, %r4840; + add.s32 %r4841, %r4756, %r4841; + add.s32 %r4842, %r4756, %r4842; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r306, %r4826, 1; + .loc 1 247 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:247:33 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shr.u32 %r2571, %r306, 1; + .loc 1 248 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:248:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mad.wide.u32 %rd115, %r2571, 4, %rd47; + .loc 1 248 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:248:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + // begin inline asm + mov.u64 %rd114, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd114, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2098, 0x0; + @%p31 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2098 }, [ %rd115 + 0 ], %rd114; + // end inline asm + .loc 1 249 109 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:109 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2572, %r2571, 1; + .loc 1 249 113 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:113 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p310, %r2572, %r778; + .loc 1 249 55 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:55 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd118, %rd115, 4; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.pred %p32, %p31, %p310; + .loc 1 249 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + // begin inline asm + mov.u64 %rd117, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd117, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r2099, 0x0; + @%p32 ld.global.L1::evict_last.L2::cache_hint.b32 { %r2099 }, [ %rd118 + 0 ], %rd117; + // end inline asm + .loc 1 250 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:250:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + and.b32 %r2573, %r4826, 1; + .loc 1 251 34 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:34 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + sub.s32 %r2574, %r2099, %r2098; + .loc 1 251 48 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:48 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r2575, %r2574, 7; + .loc 1 251 63 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:63 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2576, %r2575, -64; + .loc 1 252 29 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:29 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + xor.b32 %r2577, %r2573, 1; + .loc 1 252 61 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:61 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r2578, %r2573, 6; + .loc 1 252 42 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:42 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mad.lo.s32 %r4756, %r2576, %r2577, %r2578; + .loc 1 549 21 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:549:21 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r4759, %r4756, %r4759; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2579, %r4758, 1; + setp.gt.s32 %p311, %r2579, 2; + selp.b32 %r4758, 0, %r2579, %p311; + .loc 1 342 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:342:32 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2580, %r4759, %r31; + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2581, %r2580, %r8; + add.s32 %r2582, %r2580, %r9; + add.s32 %r2583, %r2580, %r10; + add.s32 %r2584, %r2580, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r2585, %r2581, 7; + shl.b32 %r2586, %r2582, 7; + shl.b32 %r2587, %r2583, 7; + shl.b32 %r2588, %r2584, 7; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + mul.wide.s32 %rd168, %r2585, 2; + add.s64 %rd120, %rd10, %rd168; + mul.wide.s32 %rd169, %r2586, 2; + add.s64 %rd121, %rd10, %rd169; + mul.wide.s32 %rd170, %r2587, 2; + add.s64 %rd122, %rd10, %rd170; + mul.wide.s32 %rd171, %r2588, 2; + add.s64 %rd123, %rd10, %rd171; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.lt.s32 %p312, %r2581, %r712; + setp.lt.s32 %p313, %r2582, %r712; + setp.lt.s32 %p314, %r2583, %r712; + setp.lt.s32 %p315, %r2584, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + shl.b32 %r2589, %r4758, 14; + add.s32 %r2590, %r836, %r2589; + bar.sync 0; + add.s32 %r2100, %r2590, %r84; + selp.b32 %r2591, 16, 0, %p312; + selp.b32 %r2109, %r2591, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2100 + 0 ], [ %rd120 + 0 ], 0x10, %r2109; + // end inline asm + add.s32 %r2102, %r2100, 2048; + selp.b32 %r2592, 16, 0, %p313; + selp.b32 %r2111, %r2592, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2102 + 0 ], [ %rd121 + 0 ], 0x10, %r2111; + // end inline asm + add.s32 %r2104, %r2100, 4096; + selp.b32 %r2593, 16, 0, %p314; + selp.b32 %r2113, %r2593, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2104 + 0 ], [ %rd122 + 0 ], 0x10, %r2113; + // end inline asm + add.s32 %r2106, %r2100, 6144; + selp.b32 %r2594, 16, 0, %p315; + selp.b32 %r2115, %r2594, 0, %p35; + // begin inline asm + cp.async.cg.shared.global [ %r2106 + 0 ], [ %rd123 + 0 ], 0x10, %r2115; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s64 %rd124, %rd11, %rd168; + add.s64 %rd125, %rd11, %rd169; + add.s64 %rd126, %rd11, %rd170; + add.s64 %rd127, %rd11, %rd171; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + add.s32 %r2595, %r2562, %r2589; + add.s32 %r2108, %r2595, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r2108 + 0 ], [ %rd124 + 0 ], 0x10, %r2109; + // end inline asm + add.s32 %r2110, %r2108, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r2110 + 0 ], [ %rd125 + 0 ], 0x10, %r2111; + // end inline asm + add.s32 %r2112, %r2108, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r2112 + 0 ], [ %rd126 + 0 ], 0x10, %r2113; + // end inline asm + add.s32 %r2114, %r2108, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r2114 + 0 ], [ %rd127 + 0 ], 0x10, %r2115; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + setp.ne.b32 %p316, %r119, %r306; + mov.b32 %r4826, %r306; + @%p316 bra $L__BB0_2; +$L__BB0_3: // %._crit_edge + .loc 1 0 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0:40 + ld.param.b64 %rd38, [triton_tem_fused_0_param_10]; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:172:41 ] + // begin inline asm + // wait for regs: %r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp8: + .loc 1 181 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:181:35 + shl.b64 %rd190, %rd4, 2; + add.s64 %rd172, %rd37, %rd190; + .loc 1 182 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:182:27 + // begin inline asm + mov.u32 %r2724, 0x0; + ld.global.b32 { %r2724 }, [ %rd172 + 0 ]; + // end inline asm + .loc 1 182 41 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:182:41 + shl.b32 %r376, %r2724, 7; + .loc 1 183 51 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:183:51 + shl.b64 %rd191, %rd6, 2; + add.s64 %rd173, %rd36, %rd191; + .loc 1 183 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:183:32 + // begin inline asm + mov.u32 %r2725, 0x0; + ld.global.b32 { %r2725 }, [ %rd173 + 0 ]; + // end inline asm + .loc 1 184 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:184:49 + shl.b32 %r2758, %r2725, 1; + .loc 1 184 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:184:69 + min.s32 %r378, %r2758, %r33; +$L__tmp9: + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p317, %r2758, 1; + setp.gt.s32 %p318, %r2758, 0; + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + or.b32 %r2759, %r376, %r8; + or.b32 %r2760, %r376, %r9; + or.b32 %r2761, %r376, %r10; + or.b32 %r2762, %r376, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r2763, %r2759, 7; + shl.b32 %r2764, %r2760, 7; + shl.b32 %r2765, %r2761, 7; + shl.b32 %r2766, %r2762, 7; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.wide.s32 %rd192, %r2763, 2; + add.s64 %rd193, %rd1, %rd192; + mul.wide.s32 %rd194, %r2764, 2; + add.s64 %rd195, %rd1, %rd194; + mul.wide.s32 %rd196, %r2765, 2; + add.s64 %rd197, %rd1, %rd196; + mul.wide.s32 %rd198, %r2766, 2; + add.s64 %rd199, %rd1, %rd198; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b64 %rd200, %rd3, 1; + add.s64 %rd174, %rd193, %rd200; + add.s64 %rd175, %rd195, %rd200; + add.s64 %rd176, %rd197, %rd200; + add.s64 %rd177, %rd199, %rd200; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p319, %r2759, %r712; + setp.lt.s32 %p320, %r2760, %r712; + setp.lt.s32 %p321, %r2761, %r712; + setp.lt.s32 %p322, %r2762, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.b32 %r2767, 16, 0, %p319; + selp.b32 %r2735, %r2767, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2726 + 0 ], [ %rd174 + 0 ], 0x10, %r2735; + // end inline asm + selp.b32 %r2768, 16, 0, %p320; + selp.b32 %r2737, %r2768, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2728 + 0 ], [ %rd175 + 0 ], 0x10, %r2737; + // end inline asm + selp.b32 %r2769, 16, 0, %p321; + selp.b32 %r2739, %r2769, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2730 + 0 ], [ %rd176 + 0 ], 0x10, %r2739; + // end inline asm + selp.b32 %r2770, 16, 0, %p322; + selp.b32 %r2741, %r2770, 0, %p318; + // begin inline asm + cp.async.cg.shared.global [ %r2732 + 0 ], [ %rd177 + 0 ], 0x10, %r2741; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd201, %rd2, %rd192; + add.s64 %rd202, %rd2, %rd194; + add.s64 %rd203, %rd2, %rd196; + add.s64 %rd204, %rd2, %rd198; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd178, %rd201, %rd200; + add.s64 %rd179, %rd202, %rd200; + add.s64 %rd180, %rd203, %rd200; + add.s64 %rd181, %rd204, %rd200; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r787 + 0 ], [ %rd178 + 0 ], 0x10, %r2735; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r789 + 0 ], [ %rd179 + 0 ], 0x10, %r2737; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r791 + 0 ], [ %rd180 + 0 ], 0x10, %r2739; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r793 + 0 ], [ %rd181 + 0 ], 0x10, %r2741; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.gt.s32 %p323, %r378, 1; + .loc 1 342 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:342:32 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + or.b32 %r2771, %r376, 64; + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + or.b32 %r2772, %r2771, %r8; + or.b32 %r2773, %r2771, %r9; + or.b32 %r2774, %r2771, %r10; + or.b32 %r2775, %r2771, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r2776, %r2772, 7; + shl.b32 %r2777, %r2773, 7; + shl.b32 %r2778, %r2774, 7; + shl.b32 %r2779, %r2775, 7; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.wide.s32 %rd205, %r2776, 2; + add.s64 %rd206, %rd1, %rd205; + mul.wide.s32 %rd207, %r2777, 2; + add.s64 %rd208, %rd1, %rd207; + mul.wide.s32 %rd209, %r2778, 2; + add.s64 %rd210, %rd1, %rd209; + mul.wide.s32 %rd211, %r2779, 2; + add.s64 %rd212, %rd1, %rd211; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd182, %rd206, %rd200; + add.s64 %rd183, %rd208, %rd200; + add.s64 %rd184, %rd210, %rd200; + add.s64 %rd185, %rd212, %rd200; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p324, %r2772, %r712; + setp.lt.s32 %p325, %r2773, %r712; + setp.lt.s32 %p326, %r2774, %r712; + setp.lt.s32 %p327, %r2775, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + bar.sync 0; + selp.b32 %r2780, 16, 0, %p324; + selp.b32 %r2751, %r2780, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r795 + 0 ], [ %rd182 + 0 ], 0x10, %r2751; + // end inline asm + selp.b32 %r2781, 16, 0, %p325; + selp.b32 %r2753, %r2781, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r797 + 0 ], [ %rd183 + 0 ], 0x10, %r2753; + // end inline asm + selp.b32 %r2782, 16, 0, %p326; + selp.b32 %r2755, %r2782, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r799 + 0 ], [ %rd184 + 0 ], 0x10, %r2755; + // end inline asm + selp.b32 %r2783, 16, 0, %p327; + selp.b32 %r2757, %r2783, 0, %p323; + // begin inline asm + cp.async.cg.shared.global [ %r801 + 0 ], [ %rd185 + 0 ], 0x10, %r2757; + // end inline asm + cp.async.commit_group; + .loc 1 284 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:20 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd213, %rd2, %rd205; + add.s64 %rd214, %rd2, %rd207; + add.s64 %rd215, %rd2, %rd209; + add.s64 %rd216, %rd2, %rd211; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd186, %rd213, %rd200; + add.s64 %rd187, %rd214, %rd200; + add.s64 %rd188, %rd215, %rd200; + add.s64 %rd189, %rd216, %rd200; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + cp.async.cg.shared.global [ %r803 + 0 ], [ %rd186 + 0 ], 0x10, %r2751; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r805 + 0 ], [ %rd187 + 0 ], 0x10, %r2753; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r807 + 0 ], [ %rd188 + 0 ], 0x10, %r2755; + // end inline asm + // begin inline asm + cp.async.cg.shared.global [ %r809 + 0 ], [ %rd189 + 0 ], 0x10, %r2757; + // end inline asm + cp.async.commit_group; + .loc 1 351 19 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:351:19 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + fence.proxy.async.shared::cta; + // end inline asm + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + @%p317 bra $L__BB0_6; +$L__tmp10: +// %bb.4: // %.lr.ph460 + .loc 1 186 28 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:186:28 + or.b32 %r4995, %r376, %r36; + or.b32 %r4994, %r376, %r37; + or.b32 %r4992, %r376, %r38; + or.b32 %r4988, %r376, %r40; + or.b32 %r4980, %r376, %r44; + or.b32 %r4993, %r376, %r39; + or.b32 %r4989, %r376, %r41; + or.b32 %r4981, %r376, %r45; + or.b32 %r4990, %r376, %r42; + or.b32 %r4982, %r376, %r46; + or.b32 %r4991, %r376, %r43; + or.b32 %r4983, %r376, %r47; + or.b32 %r4984, %r376, %r48; + or.b32 %r4985, %r376, %r49; + or.b32 %r4986, %r376, %r50; + or.b32 %r4987, %r376, %r51; + add.s32 %r459, %r378, -2; + add.s32 %r460, %r378, -1; +$L__tmp11: + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + max.s32 %r461, %r378, 1; + mov.b32 %r3403, 0; + mov.b32 %r4911, 1; + mov.b32 %r4910, -1; + mov.b32 %r4909, 64; + mov.b32 %r4912, %r4909; + mov.b32 %r4977, %r3403; +$L__BB0_5: // %__nv_exp2f.exit + // =>This Inner Loop Header: Depth=1 + setp.lt.s32 %p341, %r4977, %r459; + setp.lt.s32 %p339, %r4977, %r460; + add.s32 %r4018, %r4910, 1; + setp.gt.s32 %p342, %r4018, 2; + selp.b32 %r4910, 0, %r4018, %p342; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + cp.async.wait_group 2; + bar.sync 0; + shl.b32 %r4019, %r4910, 14; + add.s32 %r3303, %r836, %r4019; + .loc 1 351 19 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:351:19 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.idx.b32 %r4021, %r7, 0, 31, -1; + wgmma.fence.sync.aligned; + shl.b32 %r4022, %r4021, 11; + and.b32 %r4023, %r4022, 8192; + add.s32 %r3300, %r836, 98304; + add.s32 %r4024, %r4023, %r3300; + bfe.u32 %r4025, %r4024, 4, 14; + cvt.u64.u32 %rd251, %r4025; + or.b64 %rd217, %rd251, 4611686293372403712; + bfe.u32 %r4026, %r3303, 4, 14; + cvt.u64.u32 %rd252, %r4026; + or.b64 %rd218, %rd252, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd217, %rd218, 0, 1, 1, 0, 0; + // end inline asm + add.s32 %r4027, %r4024, 32; + bfe.u32 %r4028, %r4027, 4, 14; + cvt.u64.u32 %rd253, %r4028; + or.b64 %rd219, %rd253, 4611686293372403712; + add.s32 %r4029, %r3303, 32; + bfe.u32 %r4030, %r4029, 4, 14; + cvt.u64.u32 %rd254, %r4030; + or.b64 %rd220, %rd254, 4611686293338849280; + mov.pred %p328, -1; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd219, %rd220, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4031, %r4024, 64; + bfe.u32 %r4032, %r4031, 4, 14; + cvt.u64.u32 %rd255, %r4032; + or.b64 %rd221, %rd255, 4611686293372403712; + add.s32 %r4033, %r3303, 64; + bfe.u32 %r4034, %r4033, 4, 14; + cvt.u64.u32 %rd256, %r4034; + or.b64 %rd222, %rd256, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd221, %rd222, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4035, %r4024, 96; + bfe.u32 %r4036, %r4035, 4, 14; + cvt.u64.u32 %rd257, %r4036; + or.b64 %rd223, %rd257, 4611686293372403712; + add.s32 %r4037, %r3303, 96; + bfe.u32 %r4038, %r4037, 4, 14; + cvt.u64.u32 %rd258, %r4038; + or.b64 %rd224, %rd258, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd223, %rd224, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4039, %r4024, 16384; + bfe.u32 %r4040, %r4039, 4, 14; + cvt.u64.u32 %rd259, %r4040; + or.b64 %rd225, %rd259, 4611686293372403712; + add.s32 %r4041, %r3303, 8192; + bfe.u32 %r4042, %r4041, 4, 14; + cvt.u64.u32 %rd260, %r4042; + or.b64 %rd226, %rd260, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd225, %rd226, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4043, %r4024, 16416; + bfe.u32 %r4044, %r4043, 4, 14; + cvt.u64.u32 %rd261, %r4044; + or.b64 %rd227, %rd261, 4611686293372403712; + add.s32 %r4045, %r3303, 8224; + bfe.u32 %r4046, %r4045, 4, 14; + cvt.u64.u32 %rd262, %r4046; + or.b64 %rd228, %rd262, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd227, %rd228, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4047, %r4024, 16448; + bfe.u32 %r4048, %r4047, 4, 14; + cvt.u64.u32 %rd263, %r4048; + or.b64 %rd229, %rd263, 4611686293372403712; + add.s32 %r4049, %r3303, 8256; + bfe.u32 %r4050, %r4049, 4, 14; + cvt.u64.u32 %rd264, %r4050; + or.b64 %rd230, %rd264, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd229, %rd230, %p328, 1, 1, 0, 0; + // end inline asm + add.s32 %r4051, %r4024, 16480; + bfe.u32 %r4052, %r4051, 4, 14; + cvt.u64.u32 %rd265, %r4052; + or.b64 %rd231, %rd265, 4611686293372403712; + add.s32 %r4053, %r3303, 8288; + bfe.u32 %r4054, %r4053, 4, 14; + cvt.u64.u32 %rd266, %r4054; + or.b64 %rd232, %rd266, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n64k16.f32.bf16.bf16 {%r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915}, %rd231, %rd232, %p328, 1, 1, 0, 0; + // end inline asm + wgmma.commit_group.sync.aligned; + mov.b32 %r3305, %r3403; + mov.b32 %r3301, %r3403; + mov.b32 %r3302, %r3403; + mov.b32 %r3304, %r3403; + // begin inline asm + // wait for regs: %r2884,%r2885,%r2886,%r2887,%r2888,%r2889,%r2890,%r2891,%r2892,%r2893,%r2894,%r2895,%r2896,%r2897,%r2898,%r2899,%r2900,%r2901,%r2902,%r2903,%r2904,%r2905,%r2906,%r2907,%r2908,%r2909,%r2910,%r2911,%r2912,%r2913,%r2914,%r2915,%r3300,%r3301,%r3302,%r3303,%r3304,%r3305,%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + .loc 1 353 14 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:353:14 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4055, %r2884, 0f3DB504F3; + mul.f32 %r4056, %r2885, 0f3DB504F3; + mul.f32 %r4057, %r2886, 0f3DB504F3; + mul.f32 %r4058, %r2887, 0f3DB504F3; + mul.f32 %r4059, %r2888, 0f3DB504F3; + mul.f32 %r4060, %r2889, 0f3DB504F3; + mul.f32 %r4061, %r2890, 0f3DB504F3; + mul.f32 %r4062, %r2891, 0f3DB504F3; + mul.f32 %r4063, %r2892, 0f3DB504F3; + mul.f32 %r4064, %r2893, 0f3DB504F3; + mul.f32 %r4065, %r2894, 0f3DB504F3; + mul.f32 %r4066, %r2895, 0f3DB504F3; + mul.f32 %r4067, %r2896, 0f3DB504F3; + mul.f32 %r4068, %r2897, 0f3DB504F3; + mul.f32 %r4069, %r2898, 0f3DB504F3; + mul.f32 %r4070, %r2899, 0f3DB504F3; + mul.f32 %r4071, %r2900, 0f3DB504F3; + mul.f32 %r4072, %r2901, 0f3DB504F3; + mul.f32 %r4073, %r2902, 0f3DB504F3; + mul.f32 %r4074, %r2903, 0f3DB504F3; + mul.f32 %r4075, %r2904, 0f3DB504F3; + mul.f32 %r4076, %r2905, 0f3DB504F3; + mul.f32 %r4077, %r2906, 0f3DB504F3; + mul.f32 %r4078, %r2907, 0f3DB504F3; + mul.f32 %r4079, %r2908, 0f3DB504F3; + mul.f32 %r4080, %r2909, 0f3DB504F3; + mul.f32 %r4081, %r2910, 0f3DB504F3; + mul.f32 %r4082, %r2911, 0f3DB504F3; + mul.f32 %r4083, %r2912, 0f3DB504F3; + mul.f32 %r4084, %r2913, 0f3DB504F3; + mul.f32 %r4085, %r2914, 0f3DB504F3; + mul.f32 %r4086, %r2915, 0f3DB504F3; + .loc 1 367 44 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:44 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p343, %r4995, %r712; + setp.lt.s32 %p344, %r4994, %r712; + setp.lt.s32 %p345, %r4993, %r712; + setp.lt.s32 %p346, %r4992, %r712; + setp.lt.s32 %p347, %r4991, %r712; + setp.lt.s32 %p348, %r4990, %r712; + setp.lt.s32 %p349, %r4989, %r712; + setp.lt.s32 %p350, %r4988, %r712; + setp.lt.s32 %p351, %r4987, %r712; + setp.lt.s32 %p352, %r4986, %r712; + setp.lt.s32 %p353, %r4985, %r712; + setp.lt.s32 %p354, %r4984, %r712; + setp.lt.s32 %p355, %r4983, %r712; + setp.lt.s32 %p356, %r4982, %r712; + setp.lt.s32 %p357, %r4981, %r712; + setp.lt.s32 %p358, %r4980, %r712; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4087, %r4055, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4088, %r4087, 0fFF800000, %p343; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4089, %r4056, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4090, %r4089, 0fFF800000, %p344; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4091, %r4057, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4092, %r4091, 0fFF800000, %p343; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4093, %r4058, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4094, %r4093, 0fFF800000, %p344; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4095, %r4059, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4096, %r4095, 0fFF800000, %p345; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4097, %r4060, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4098, %r4097, 0fFF800000, %p346; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4099, %r4061, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4100, %r4099, 0fFF800000, %p345; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4101, %r4062, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4102, %r4101, 0fFF800000, %p346; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4103, %r4063, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4104, %r4103, 0fFF800000, %p347; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4105, %r4064, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4106, %r4105, 0fFF800000, %p348; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4107, %r4065, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4108, %r4107, 0fFF800000, %p347; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4109, %r4066, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4110, %r4109, 0fFF800000, %p348; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4111, %r4067, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4112, %r4111, 0fFF800000, %p349; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4113, %r4068, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4114, %r4113, 0fFF800000, %p350; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4115, %r4069, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4116, %r4115, 0fFF800000, %p349; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4117, %r4070, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4118, %r4117, 0fFF800000, %p350; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4119, %r4071, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4120, %r4119, 0fFF800000, %p351; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4121, %r4072, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4122, %r4121, 0fFF800000, %p352; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4123, %r4073, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4124, %r4123, 0fFF800000, %p351; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4125, %r4074, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4126, %r4125, 0fFF800000, %p352; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4127, %r4075, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4128, %r4127, 0fFF800000, %p353; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4129, %r4076, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4130, %r4129, 0fFF800000, %p354; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4131, %r4077, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4132, %r4131, 0fFF800000, %p353; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4133, %r4078, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4134, %r4133, 0fFF800000, %p354; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4135, %r4079, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4136, %r4135, 0fFF800000, %p355; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4137, %r4080, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4138, %r4137, 0fFF800000, %p356; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4139, %r4081, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4140, %r4139, 0fFF800000, %p355; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4141, %r4082, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4142, %r4141, 0fFF800000, %p356; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4143, %r4083, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4144, %r4143, 0fFF800000, %p357; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4145, %r4084, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4146, %r4145, 0fFF800000, %p358; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4147, %r4085, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4148, %r4147, 0fFF800000, %p357; + .loc 1 417 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:417:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r4149, %r4086, 0f3FB8AA3B; + .loc 1 367 69 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:367:69 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4150, %r4149, 0fFF800000, %p358; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + max.f32 %r4151, %r4088, %r4090; + max.f32 %r4152, %r4092, %r4094; + max.f32 %r4153, %r4151, %r4096; + max.f32 %r4154, %r4153, %r4098; + max.f32 %r4155, %r4152, %r4100; + max.f32 %r4156, %r4155, %r4102; + max.f32 %r4157, %r4154, %r4104; + max.f32 %r4158, %r4157, %r4106; + max.f32 %r4159, %r4156, %r4108; + max.f32 %r4160, %r4159, %r4110; + max.f32 %r4161, %r4158, %r4112; + max.f32 %r4162, %r4161, %r4114; + max.f32 %r4163, %r4160, %r4116; + max.f32 %r4164, %r4163, %r4118; + max.f32 %r4165, %r4162, %r4120; + max.f32 %r4166, %r4165, %r4122; + max.f32 %r4167, %r4164, %r4124; + max.f32 %r4168, %r4167, %r4126; + max.f32 %r4169, %r4166, %r4128; + max.f32 %r4170, %r4169, %r4130; + max.f32 %r4171, %r4168, %r4132; + max.f32 %r4172, %r4171, %r4134; + max.f32 %r4173, %r4170, %r4136; + max.f32 %r4174, %r4173, %r4138; + max.f32 %r4175, %r4172, %r4140; + max.f32 %r4176, %r4175, %r4142; + max.f32 %r4177, %r4174, %r4144; + max.f32 %r4178, %r4177, %r4146; + max.f32 %r4179, %r4176, %r4148; + max.f32 %r4180, %r4179, %r4150; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4181, %r4178, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + max.f32 %r4182, %r4178, %r4181; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4183, %r4182, 1, 31, -1; + shfl.sync.bfly.b32 %r4184, %r4180, 2, 31, -1; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + max.f32 %r4185, %r4180, %r4184; + .loc 2 189 40 // standard.py:189:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4186, %r4185, 1, 31, -1; + cvt.u64.u32 %rd267, %r4183; + cvt.u64.u32 %rd268, %r4186; + shl.b64 %rd269, %rd268, 32; + or.b64 %rd270, %rd267, %rd269; + .loc 2 168 27 // standard.py:168:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mov.b64 {%r4187, %r4188}, %rd270; + max.f32 %r4189, %r4182, %r4187; + max.f32 %r4190, %r4185, %r4188; + .loc 1 421 27 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:421:27 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mov.b64 {%r4191, %r4192}, %rd292; + max.f32 %r4193, %r4192, %r4190; + max.f32 %r4194, %r4191, %r4189; + mov.b64 %rd292, {%r4194, %r4193}; + .loc 1 423 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:423:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.eq.f32 %p359, %r4194, 0fFF800000; + setp.eq.f32 %p360, %r4193, 0fFF800000; + .loc 1 424 51 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:424:51 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + selp.f32 %r4195, 0f00000000, %r4194, %p359; + selp.f32 %r4196, 0f00000000, %r4193, %p360; + .loc 1 428 31 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:428:31 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + sub.f32 %r4197, %r4191, %r4195; + sub.f32 %r4198, %r4192, %r4196; + .loc 1 428 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:428:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + ex2.approx.ftz.f32 %r4199, %r4197; + ex2.approx.ftz.f32 %r4200, %r4198; + .loc 1 429 39 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:429:39 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + sub.f32 %r4201, %r4088, %r4195; + sub.f32 %r4202, %r4090, %r4195; + sub.f32 %r4203, %r4092, %r4196; + sub.f32 %r4204, %r4094, %r4196; + sub.f32 %r4205, %r4096, %r4195; + sub.f32 %r4206, %r4098, %r4195; + sub.f32 %r4207, %r4100, %r4196; + sub.f32 %r4208, %r4102, %r4196; + sub.f32 %r4209, %r4104, %r4195; + sub.f32 %r4210, %r4106, %r4195; + sub.f32 %r4211, %r4108, %r4196; + sub.f32 %r4212, %r4110, %r4196; + sub.f32 %r4213, %r4112, %r4195; + sub.f32 %r4214, %r4114, %r4195; + sub.f32 %r4215, %r4116, %r4196; + sub.f32 %r4216, %r4118, %r4196; + sub.f32 %r4217, %r4120, %r4195; + sub.f32 %r4218, %r4122, %r4195; + sub.f32 %r4219, %r4124, %r4196; + sub.f32 %r4220, %r4126, %r4196; + sub.f32 %r4221, %r4128, %r4195; + sub.f32 %r4222, %r4130, %r4195; + sub.f32 %r4223, %r4132, %r4196; + sub.f32 %r4224, %r4134, %r4196; + sub.f32 %r4225, %r4136, %r4195; + sub.f32 %r4226, %r4138, %r4195; + sub.f32 %r4227, %r4140, %r4196; + sub.f32 %r4228, %r4142, %r4196; + sub.f32 %r4229, %r4144, %r4195; + sub.f32 %r4230, %r4146, %r4195; + sub.f32 %r4231, %r4148, %r4196; + sub.f32 %r4232, %r4150, %r4196; + .loc 1 429 21 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:429:21 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + ex2.approx.ftz.f32 %r4233, %r4201; + ex2.approx.ftz.f32 %r4234, %r4202; + ex2.approx.ftz.f32 %r4235, %r4203; + ex2.approx.ftz.f32 %r4236, %r4204; + ex2.approx.ftz.f32 %r4237, %r4205; + ex2.approx.ftz.f32 %r4238, %r4206; + ex2.approx.ftz.f32 %r4239, %r4207; + ex2.approx.ftz.f32 %r4240, %r4208; + ex2.approx.ftz.f32 %r4241, %r4209; + ex2.approx.ftz.f32 %r4242, %r4210; + ex2.approx.ftz.f32 %r4243, %r4211; + ex2.approx.ftz.f32 %r4244, %r4212; + ex2.approx.ftz.f32 %r4245, %r4213; + ex2.approx.ftz.f32 %r4246, %r4214; + ex2.approx.ftz.f32 %r4247, %r4215; + ex2.approx.ftz.f32 %r4248, %r4216; + ex2.approx.ftz.f32 %r4249, %r4217; + ex2.approx.ftz.f32 %r4250, %r4218; + ex2.approx.ftz.f32 %r4251, %r4219; + ex2.approx.ftz.f32 %r4252, %r4220; + ex2.approx.ftz.f32 %r4253, %r4221; + ex2.approx.ftz.f32 %r4254, %r4222; + ex2.approx.ftz.f32 %r4255, %r4223; + ex2.approx.ftz.f32 %r4256, %r4224; + ex2.approx.ftz.f32 %r4257, %r4225; + ex2.approx.ftz.f32 %r4258, %r4226; + ex2.approx.ftz.f32 %r4259, %r4227; + ex2.approx.ftz.f32 %r4260, %r4228; + ex2.approx.ftz.f32 %r4261, %r4229; + ex2.approx.ftz.f32 %r4262, %r4230; + ex2.approx.ftz.f32 %r4263, %r4231; + ex2.approx.ftz.f32 %r4264, %r4232; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.f32 %r4265, %r4233, %r4234; + add.f32 %r4266, %r4235, %r4236; + add.f32 %r4267, %r4265, %r4237; + add.f32 %r4268, %r4267, %r4238; + add.f32 %r4269, %r4266, %r4239; + add.f32 %r4270, %r4269, %r4240; + add.f32 %r4271, %r4268, %r4241; + add.f32 %r4272, %r4271, %r4242; + add.f32 %r4273, %r4270, %r4243; + add.f32 %r4274, %r4273, %r4244; + add.f32 %r4275, %r4272, %r4245; + add.f32 %r4276, %r4275, %r4246; + add.f32 %r4277, %r4274, %r4247; + add.f32 %r4278, %r4277, %r4248; + add.f32 %r4279, %r4276, %r4249; + add.f32 %r4280, %r4279, %r4250; + add.f32 %r4281, %r4278, %r4251; + add.f32 %r4282, %r4281, %r4252; + add.f32 %r4283, %r4280, %r4253; + add.f32 %r4284, %r4283, %r4254; + add.f32 %r4285, %r4282, %r4255; + add.f32 %r4286, %r4285, %r4256; + add.f32 %r4287, %r4284, %r4257; + add.f32 %r4288, %r4287, %r4258; + add.f32 %r4289, %r4286, %r4259; + add.f32 %r4290, %r4289, %r4260; + add.f32 %r4291, %r4288, %r4261; + add.f32 %r4292, %r4291, %r4262; + add.f32 %r4293, %r4290, %r4263; + add.f32 %r4294, %r4293, %r4264; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4295, %r4292, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.f32 %r4296, %r4292, %r4295; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4297, %r4296, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.f32 %r4298, %r4296, %r4297; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4299, %r4294, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.f32 %r4300, %r4294, %r4299; + .loc 2 291 36 // standard.py:291:36 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shfl.sync.bfly.b32 %r4301, %r4300, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.f32 %r4302, %r4300, %r4301; + .loc 1 434 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:434:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + fma.rn.f32 %r5060, %r5060, %r4199, %r4298; + fma.rn.f32 %r5061, %r5061, %r4200, %r4302; + .loc 1 436 16 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:436:16 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.f32 %r1702, %r1702, %r4199; + mul.f32 %r1703, %r1703, %r4199; + mul.f32 %r1704, %r1704, %r4200; + mul.f32 %r1705, %r1705, %r4200; + mul.f32 %r1706, %r1706, %r4199; + mul.f32 %r1707, %r1707, %r4199; + mul.f32 %r1708, %r1708, %r4200; + mul.f32 %r1709, %r1709, %r4200; + mul.f32 %r1710, %r1710, %r4199; + mul.f32 %r1711, %r1711, %r4199; + mul.f32 %r1712, %r1712, %r4200; + mul.f32 %r1713, %r1713, %r4200; + mul.f32 %r1714, %r1714, %r4199; + mul.f32 %r1715, %r1715, %r4199; + mul.f32 %r1716, %r1716, %r4200; + mul.f32 %r1717, %r1717, %r4200; + mul.f32 %r1718, %r1718, %r4199; + mul.f32 %r1719, %r1719, %r4199; + mul.f32 %r1720, %r1720, %r4200; + mul.f32 %r1721, %r1721, %r4200; + mul.f32 %r1722, %r1722, %r4199; + mul.f32 %r1723, %r1723, %r4199; + mul.f32 %r1724, %r1724, %r4200; + mul.f32 %r1725, %r1725, %r4200; + mul.f32 %r1726, %r1726, %r4199; + mul.f32 %r1727, %r1727, %r4199; + mul.f32 %r1728, %r1728, %r4200; + mul.f32 %r1729, %r1729, %r4200; + mul.f32 %r1730, %r1730, %r4199; + mul.f32 %r1731, %r1731, %r4199; + mul.f32 %r1732, %r1732, %r4200; + mul.f32 %r1733, %r1733, %r4200; + mul.f32 %r1734, %r1734, %r4199; + mul.f32 %r1735, %r1735, %r4199; + mul.f32 %r1736, %r1736, %r4200; + mul.f32 %r1737, %r1737, %r4200; + mul.f32 %r1738, %r1738, %r4199; + mul.f32 %r1739, %r1739, %r4199; + mul.f32 %r1740, %r1740, %r4200; + mul.f32 %r1741, %r1741, %r4200; + mul.f32 %r1742, %r1742, %r4199; + mul.f32 %r1743, %r1743, %r4199; + mul.f32 %r1744, %r1744, %r4200; + mul.f32 %r1745, %r1745, %r4200; + mul.f32 %r1746, %r1746, %r4199; + mul.f32 %r1747, %r1747, %r4199; + mul.f32 %r1748, %r1748, %r4200; + mul.f32 %r1749, %r1749, %r4200; + mul.f32 %r1750, %r1750, %r4199; + mul.f32 %r1751, %r1751, %r4199; + mul.f32 %r1752, %r1752, %r4200; + mul.f32 %r1753, %r1753, %r4200; + mul.f32 %r1754, %r1754, %r4199; + mul.f32 %r1755, %r1755, %r4199; + mul.f32 %r1756, %r1756, %r4200; + mul.f32 %r1757, %r1757, %r4200; + mul.f32 %r1758, %r1758, %r4199; + mul.f32 %r1759, %r1759, %r4199; + mul.f32 %r1760, %r1760, %r4200; + mul.f32 %r1761, %r1761, %r4200; + mul.f32 %r1762, %r1762, %r4199; + mul.f32 %r1763, %r1763, %r4199; + mul.f32 %r1764, %r1764, %r4200; + mul.f32 %r1765, %r1765, %r4200; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4303, %r836, 49152; + add.s32 %r4304, %r4303, %r4019; + .loc 1 440 22 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:440:22 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + cvt.rn.bf16x2.f32 %r3600, %r4234, %r4233; + cvt.rn.bf16x2.f32 %r3601, %r4236, %r4235; + cvt.rn.bf16x2.f32 %r3602, %r4238, %r4237; + cvt.rn.bf16x2.f32 %r3603, %r4240, %r4239; + cvt.rn.bf16x2.f32 %r3732, %r4242, %r4241; + cvt.rn.bf16x2.f32 %r3733, %r4244, %r4243; + cvt.rn.bf16x2.f32 %r3734, %r4246, %r4245; + cvt.rn.bf16x2.f32 %r3735, %r4248, %r4247; + cvt.rn.bf16x2.f32 %r3864, %r4250, %r4249; + cvt.rn.bf16x2.f32 %r3865, %r4252, %r4251; + cvt.rn.bf16x2.f32 %r3866, %r4254, %r4253; + cvt.rn.bf16x2.f32 %r3867, %r4256, %r4255; + cvt.rn.bf16x2.f32 %r3996, %r4258, %r4257; + cvt.rn.bf16x2.f32 %r3997, %r4260, %r4259; + cvt.rn.bf16x2.f32 %r3998, %r4262, %r4261; + cvt.rn.bf16x2.f32 %r3999, %r4264, %r4263; + .loc 1 440 44 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:440:44 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + wgmma.fence.sync.aligned; + bfe.u32 %r4305, %r4304, 4, 14; + cvt.u64.u32 %rd271, %r4305; + or.b64 %rd233, %rd271, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3600,%r3601,%r3602,%r3603}, %rd233, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4306, %r4304, 2048; + bfe.u32 %r4307, %r4306, 4, 14; + cvt.u64.u32 %rd272, %r4307; + or.b64 %rd234, %rd272, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3732,%r3733,%r3734,%r3735}, %rd234, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4308, %r4304, 4096; + bfe.u32 %r4309, %r4308, 4, 14; + cvt.u64.u32 %rd273, %r4309; + or.b64 %rd235, %rd273, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3864,%r3865,%r3866,%r3867}, %rd235, %p328, 1, 1, 1; + // end inline asm + add.s32 %r4310, %r4304, 6144; + bfe.u32 %r4311, %r4310, 4, 14; + cvt.u64.u32 %rd274, %r4311; + or.b64 %rd236, %rd274, 4611686293338849280; + // begin inline asm + wgmma.mma_async.sync.aligned.m64n128k16.f32.bf16.bf16 {%r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765}, {%r3996,%r3997,%r3998,%r3999}, %rd236, %p328, 1, 1, 1; + // end inline asm + wgmma.commit_group.sync.aligned; + .loc 1 548 26 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:548:26 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4980, %r4909, %r4980; + add.s32 %r4981, %r4909, %r4981; + add.s32 %r4982, %r4909, %r4982; + add.s32 %r4983, %r4909, %r4983; + add.s32 %r4984, %r4909, %r4984; + add.s32 %r4985, %r4909, %r4985; + add.s32 %r4986, %r4909, %r4986; + add.s32 %r4987, %r4909, %r4987; + add.s32 %r4988, %r4909, %r4988; + add.s32 %r4989, %r4909, %r4989; + add.s32 %r4990, %r4909, %r4990; + add.s32 %r4991, %r4909, %r4991; + add.s32 %r4992, %r4909, %r4992; + add.s32 %r4993, %r4909, %r4993; + add.s32 %r4994, %r4909, %r4994; + add.s32 %r4995, %r4909, %r4995; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r632, %r4977, 1; + .loc 1 247 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:247:33 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shr.u32 %r4312, %r632, 1; + .loc 1 248 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:248:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mad.wide.u32 %rd238, %r4312, 4, %rd172; + .loc 1 248 24 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:248:24 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + mov.u64 %rd237, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd237, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4000, 0x0; + @%p339 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4000 }, [ %rd238 + 0 ], %rd237; + // end inline asm + .loc 1 249 109 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:109 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4313, %r4312, 1; + .loc 1 249 113 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:113 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p361, %r4313, %r2725; + .loc 1 249 55 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:55 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd241, %rd238, 4; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + and.pred %p340, %p339, %p361; + .loc 1 249 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:249:25 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + mov.u64 %rd240, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd240, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r4001, 0x0; + @%p340 ld.global.L1::evict_last.L2::cache_hint.b32 { %r4001 }, [ %rd241 + 0 ], %rd240; + // end inline asm + .loc 1 250 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:250:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + and.b32 %r4314, %r4977, 1; + .loc 1 251 34 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:34 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + sub.s32 %r4315, %r4001, %r4000; + .loc 1 251 48 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:48 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r4316, %r4315, 7; + .loc 1 251 63 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:251:63 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4317, %r4316, -64; + .loc 1 252 29 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:29 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + xor.b32 %r4318, %r4314, 1; + .loc 1 252 61 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:61 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r4319, %r4314, 6; + .loc 1 252 42 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:252:42 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mad.lo.s32 %r4909, %r4317, %r4318, %r4319; + .loc 1 549 21 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:549:21 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4912, %r4909, %r4912; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4320, %r4911, 1; + setp.gt.s32 %p362, %r4320, 2; + selp.b32 %r4911, 0, %r4320, %p362; + .loc 1 342 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:342:32 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4321, %r4912, %r376; + .loc 1 346 35 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:346:35 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4322, %r4321, %r8; + add.s32 %r4323, %r4321, %r9; + add.s32 %r4324, %r4321, %r10; + add.s32 %r4325, %r4321, %r11; + .loc 1 284 38 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:38 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r4326, %r4322, 7; + shl.b32 %r4327, %r4323, 7; + shl.b32 %r4328, %r4324, 7; + shl.b32 %r4329, %r4325, 7; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + mul.wide.s32 %rd275, %r4326, 2; + add.s64 %rd243, %rd10, %rd275; + mul.wide.s32 %rd276, %r4327, 2; + add.s64 %rd244, %rd10, %rd276; + mul.wide.s32 %rd277, %r4328, 2; + add.s64 %rd245, %rd10, %rd277; + mul.wide.s32 %rd278, %r4329, 2; + add.s64 %rd246, %rd10, %rd278; + .loc 1 292 52 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:52 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.lt.s32 %p363, %r4322, %r712; + setp.lt.s32 %p364, %r4323, %r712; + setp.lt.s32 %p365, %r4324, %r712; + setp.lt.s32 %p366, %r4325, %r712; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + shl.b32 %r4330, %r4911, 14; + add.s32 %r4331, %r836, %r4330; + bar.sync 0; + add.s32 %r4002, %r4331, %r84; + selp.b32 %r4332, 16, 0, %p363; + selp.b32 %r4011, %r4332, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4002 + 0 ], [ %rd243 + 0 ], 0x10, %r4011; + // end inline asm + add.s32 %r4004, %r4002, 2048; + selp.b32 %r4333, 16, 0, %p364; + selp.b32 %r4013, %r4333, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4004 + 0 ], [ %rd244 + 0 ], 0x10, %r4013; + // end inline asm + add.s32 %r4006, %r4002, 4096; + selp.b32 %r4334, 16, 0, %p365; + selp.b32 %r4015, %r4334, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4006 + 0 ], [ %rd245 + 0 ], 0x10, %r4015; + // end inline asm + add.s32 %r4008, %r4002, 6144; + selp.b32 %r4335, 16, 0, %p366; + selp.b32 %r4017, %r4335, 0, %p341; + // begin inline asm + cp.async.cg.shared.global [ %r4008 + 0 ], [ %rd246 + 0 ], 0x10, %r4017; + // end inline asm + cp.async.commit_group; + .loc 1 284 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:284:49 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s64 %rd247, %rd11, %rd275; + add.s64 %rd248, %rd11, %rd276; + add.s64 %rd249, %rd11, %rd277; + add.s64 %rd250, %rd11, %rd278; + .loc 1 292 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:292:23 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + add.s32 %r4336, %r4303, %r4330; + add.s32 %r4010, %r4336, %r84; + // begin inline asm + cp.async.cg.shared.global [ %r4010 + 0 ], [ %rd247 + 0 ], 0x10, %r4011; + // end inline asm + add.s32 %r4012, %r4010, 2048; + // begin inline asm + cp.async.cg.shared.global [ %r4012 + 0 ], [ %rd248 + 0 ], 0x10, %r4013; + // end inline asm + add.s32 %r4014, %r4010, 4096; + // begin inline asm + cp.async.cg.shared.global [ %r4014 + 0 ], [ %rd249 + 0 ], 0x10, %r4015; + // end inline asm + add.s32 %r4016, %r4010, 6144; + // begin inline asm + cp.async.cg.shared.global [ %r4016 + 0 ], [ %rd250 + 0 ], 0x10, %r4017; + // end inline asm + cp.async.commit_group; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + setp.ne.b32 %p367, %r461, %r632; + mov.b32 %r4977, %r632; + @%p367 bra $L__BB0_5; +$L__BB0_6: // %._crit_edge461 + .loc 1 0 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0:40 + cvt.u32.u64 %r4537, %rd3; + .loc 1 502 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:502:40 @[ cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:198:45 ] + // begin inline asm + // wait for regs: %r1702,%r1703,%r1704,%r1705,%r1706,%r1707,%r1708,%r1709,%r1710,%r1711,%r1712,%r1713,%r1714,%r1715,%r1716,%r1717,%r1718,%r1719,%r1720,%r1721,%r1722,%r1723,%r1724,%r1725,%r1726,%r1727,%r1728,%r1729,%r1730,%r1731,%r1732,%r1733,%r1734,%r1735,%r1736,%r1737,%r1738,%r1739,%r1740,%r1741,%r1742,%r1743,%r1744,%r1745,%r1746,%r1747,%r1748,%r1749,%r1750,%r1751,%r1752,%r1753,%r1754,%r1755,%r1756,%r1757,%r1758,%r1759,%r1760,%r1761,%r1762,%r1763,%r1764,%r1765 + wgmma.wait_group.sync.aligned 0; + // end inline asm + cp.async.wait_group 0; + bar.sync 0; +$L__tmp12: + .loc 1 206 26 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:206:26 + setp.eq.f32 %p376, %r5060, 0f00000000; + setp.eq.f32 %p377, %r5061, 0f00000000; + .loc 1 206 34 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:206:34 + selp.f32 %r4538, 0f3F800000, %r5060, %p376; + selp.f32 %r702, 0f3F800000, %r5061, %p377; + .loc 1 208 16 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:208:16 + div.full.f32 %r4539, %r1702, %r4538; + div.full.f32 %r4540, %r1703, %r4538; + div.full.f32 %r4541, %r1704, %r702; + div.full.f32 %r4542, %r1705, %r702; + div.full.f32 %r4543, %r1706, %r4538; + div.full.f32 %r4544, %r1707, %r4538; + div.full.f32 %r4545, %r1708, %r702; + div.full.f32 %r4546, %r1709, %r702; + div.full.f32 %r4547, %r1710, %r4538; + div.full.f32 %r4548, %r1711, %r4538; + div.full.f32 %r4549, %r1712, %r702; + div.full.f32 %r4550, %r1713, %r702; + div.full.f32 %r4551, %r1714, %r4538; + div.full.f32 %r4552, %r1715, %r4538; + div.full.f32 %r4553, %r1716, %r702; + div.full.f32 %r4554, %r1717, %r702; + div.full.f32 %r4555, %r1718, %r4538; + div.full.f32 %r4556, %r1719, %r4538; + div.full.f32 %r4557, %r1720, %r702; + div.full.f32 %r4558, %r1721, %r702; + div.full.f32 %r4559, %r1722, %r4538; + div.full.f32 %r4560, %r1723, %r4538; + div.full.f32 %r4561, %r1724, %r702; + div.full.f32 %r4562, %r1725, %r702; + div.full.f32 %r4563, %r1726, %r4538; + div.full.f32 %r4564, %r1727, %r4538; + div.full.f32 %r4565, %r1728, %r702; + div.full.f32 %r4566, %r1729, %r702; + div.full.f32 %r4567, %r1730, %r4538; + div.full.f32 %r4568, %r1731, %r4538; + div.full.f32 %r4569, %r1732, %r702; + div.full.f32 %r4570, %r1733, %r702; + div.full.f32 %r4571, %r1734, %r4538; + div.full.f32 %r4572, %r1735, %r4538; + div.full.f32 %r4573, %r1736, %r702; + div.full.f32 %r4574, %r1737, %r702; + div.full.f32 %r4575, %r1738, %r4538; + div.full.f32 %r4576, %r1739, %r4538; + div.full.f32 %r4577, %r1740, %r702; + div.full.f32 %r4578, %r1741, %r702; + div.full.f32 %r4579, %r1742, %r4538; + div.full.f32 %r4580, %r1743, %r4538; + div.full.f32 %r4581, %r1744, %r702; + div.full.f32 %r4582, %r1745, %r702; + div.full.f32 %r4583, %r1746, %r4538; + div.full.f32 %r4584, %r1747, %r4538; + div.full.f32 %r4585, %r1748, %r702; + div.full.f32 %r4586, %r1749, %r702; + div.full.f32 %r4587, %r1750, %r4538; + div.full.f32 %r4588, %r1751, %r4538; + div.full.f32 %r4589, %r1752, %r702; + div.full.f32 %r4590, %r1753, %r702; + div.full.f32 %r4591, %r1754, %r4538; + div.full.f32 %r4592, %r1755, %r4538; + div.full.f32 %r4593, %r1756, %r702; + div.full.f32 %r4594, %r1757, %r702; + div.full.f32 %r4595, %r1758, %r4538; + div.full.f32 %r4596, %r1759, %r4538; + div.full.f32 %r4597, %r1760, %r702; + div.full.f32 %r4598, %r1761, %r702; + div.full.f32 %r4599, %r1762, %r4538; + div.full.f32 %r4600, %r1763, %r4538; + div.full.f32 %r4601, %r1764, %r702; + div.full.f32 %r4602, %r1765, %r702; + .loc 1 218 49 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:218:49 + or.b32 %r4603, %r4537, %r4; + .loc 1 218 62 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:218:62 + or.b32 %r4604, %r4603, %r3; + .loc 1 218 75 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:218:75 + add.s32 %r4605, %r4604, %r20; + add.s32 %r4606, %r4604, %r21; + add.s32 %r4607, %r4604, %r22; + add.s32 %r4608, %r4604, %r23; + add.s32 %r4609, %r4604, %r24; + add.s32 %r4610, %r4604, %r25; + add.s32 %r4611, %r4604, %r26; + add.s32 %r4612, %r4604, %r27; + .loc 1 218 25 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:218:25 + mad.wide.s32 %rd279, %r4605, 2, %rd38; + mad.wide.s32 %rd280, %r4606, 2, %rd38; + mad.wide.s32 %rd281, %r4607, 2, %rd38; + mad.wide.s32 %rd282, %r4608, 2, %rd38; + mad.wide.s32 %rd283, %r4609, 2, %rd38; + mad.wide.s32 %rd284, %r4610, 2, %rd38; + mad.wide.s32 %rd285, %r4611, 2, %rd38; + mad.wide.s32 %rd286, %r4612, 2, %rd38; + .loc 1 218 109 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:218:109 + cvt.rn.bf16x2.f32 %r4613, %r4540, %r4539; + cvt.rn.bf16x2.f32 %r4614, %r4542, %r4541; + cvt.rn.bf16x2.f32 %r4615, %r4544, %r4543; + cvt.rn.bf16x2.f32 %r4616, %r4546, %r4545; + cvt.rn.bf16x2.f32 %r4617, %r4548, %r4547; + cvt.rn.bf16x2.f32 %r4618, %r4550, %r4549; + cvt.rn.bf16x2.f32 %r4619, %r4552, %r4551; + cvt.rn.bf16x2.f32 %r4620, %r4554, %r4553; + cvt.rn.bf16x2.f32 %r4621, %r4556, %r4555; + cvt.rn.bf16x2.f32 %r4622, %r4558, %r4557; + cvt.rn.bf16x2.f32 %r4623, %r4560, %r4559; + cvt.rn.bf16x2.f32 %r4624, %r4562, %r4561; + cvt.rn.bf16x2.f32 %r4625, %r4564, %r4563; + cvt.rn.bf16x2.f32 %r4626, %r4566, %r4565; + cvt.rn.bf16x2.f32 %r4627, %r4568, %r4567; + cvt.rn.bf16x2.f32 %r4628, %r4570, %r4569; + cvt.rn.bf16x2.f32 %r4629, %r4572, %r4571; + cvt.rn.bf16x2.f32 %r4630, %r4574, %r4573; + cvt.rn.bf16x2.f32 %r4631, %r4576, %r4575; + cvt.rn.bf16x2.f32 %r4632, %r4578, %r4577; + cvt.rn.bf16x2.f32 %r4633, %r4580, %r4579; + cvt.rn.bf16x2.f32 %r4634, %r4582, %r4581; + cvt.rn.bf16x2.f32 %r4635, %r4584, %r4583; + cvt.rn.bf16x2.f32 %r4636, %r4586, %r4585; + cvt.rn.bf16x2.f32 %r4637, %r4588, %r4587; + cvt.rn.bf16x2.f32 %r4638, %r4590, %r4589; + cvt.rn.bf16x2.f32 %r4639, %r4592, %r4591; + cvt.rn.bf16x2.f32 %r4640, %r4594, %r4593; + cvt.rn.bf16x2.f32 %r4641, %r4596, %r4595; + cvt.rn.bf16x2.f32 %r4642, %r4598, %r4597; + cvt.rn.bf16x2.f32 %r4643, %r4600, %r4599; + cvt.rn.bf16x2.f32 %r4644, %r4602, %r4601; + shl.b32 %r4645, %r35, 13; + shl.b32 %r4646, %r6, 5; + and.b32 %r4647, %r4646, 7264; + and.b32 %r4648, %r6, 24; + shl.b32 %r4649, %r4648, 4; + shl.b32 %r4650, %r6, 2; + and.b32 %r4651, %r4650, 16; + or.b32 %r4652, %r4645, %r4651; + or.b32 %r4653, %r4647, %r4649; + or.b32 %r4654, %r4652, %r4653; + add.s32 %r4656, %r836, %r4654; + st.shared.v4.b32 [%r4656], {%r4613, %r4615, %r4617, %r4619}; + st.shared.v4.b32 [%r4656+512], {%r4614, %r4616, %r4618, %r4620}; + xor.b32 %r4657, %r4654, 32; + add.s32 %r4658, %r836, %r4657; + st.shared.v4.b32 [%r4658], {%r4621, %r4623, %r4625, %r4627}; + st.shared.v4.b32 [%r4658+512], {%r4622, %r4624, %r4626, %r4628}; + xor.b32 %r4659, %r4654, 64; + add.s32 %r4660, %r836, %r4659; + st.shared.v4.b32 [%r4660], {%r4629, %r4631, %r4633, %r4635}; + st.shared.v4.b32 [%r4660+512], {%r4630, %r4632, %r4634, %r4636}; + xor.b32 %r4661, %r4654, 96; + add.s32 %r4662, %r836, %r4661; + st.shared.v4.b32 [%r4662], {%r4637, %r4639, %r4641, %r4643}; + st.shared.v4.b32 [%r4662+512], {%r4638, %r4640, %r4642, %r4644}; + bar.sync 0; + shl.b32 %r4663, %r4648, 10; + shl.b32 %r4664, %r35, 5; + and.b32 %r703, %r6, 252; + shl.b32 %r4665, %r703, 2; + or.b32 %r4666, %r4663, %r4664; + xor.b32 %r4667, %r4666, %r4665; + add.s32 %r4469, %r836, %r4667; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4465, %r4466, %r4467, %r4468}, [%r4469]; + // end inline asm + add.s32 %r4474, %r4469, 1024; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4470, %r4471, %r4472, %r4473}, [%r4474]; + // end inline asm + add.s32 %r4479, %r4469, 2048; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4475, %r4476, %r4477, %r4478}, [%r4479]; + // end inline asm + add.s32 %r4484, %r4469, 3072; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4480, %r4481, %r4482, %r4483}, [%r4484]; + // end inline asm + add.s32 %r4489, %r4469, 4096; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4485, %r4486, %r4487, %r4488}, [%r4489]; + // end inline asm + add.s32 %r4494, %r4469, 5120; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4490, %r4491, %r4492, %r4493}, [%r4494]; + // end inline asm + add.s32 %r4499, %r4469, 6144; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4495, %r4496, %r4497, %r4498}, [%r4499]; + // end inline asm + add.s32 %r4504, %r4469, 7168; + // begin inline asm + ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r4500, %r4501, %r4502, %r4503}, [%r4504]; + // end inline asm + // begin inline asm + @%p1 st.global.v4.b32 [ %rd279 + 0 ], { %r4465, %r4466, %r4467, %r4468 }; + // end inline asm + // begin inline asm + @%p2 st.global.v4.b32 [ %rd280 + 0 ], { %r4470, %r4471, %r4472, %r4473 }; + // end inline asm + // begin inline asm + @%p3 st.global.v4.b32 [ %rd281 + 0 ], { %r4475, %r4476, %r4477, %r4478 }; + // end inline asm + // begin inline asm + @%p4 st.global.v4.b32 [ %rd282 + 0 ], { %r4480, %r4481, %r4482, %r4483 }; + // end inline asm + // begin inline asm + @%p5 st.global.v4.b32 [ %rd283 + 0 ], { %r4485, %r4486, %r4487, %r4488 }; + // end inline asm + // begin inline asm + @%p6 st.global.v4.b32 [ %rd284 + 0 ], { %r4490, %r4491, %r4492, %r4493 }; + // end inline asm + // begin inline asm + @%p7 st.global.v4.b32 [ %rd285 + 0 ], { %r4495, %r4496, %r4497, %r4498 }; + // end inline asm + // begin inline asm + @%p8 st.global.v4.b32 [ %rd286 + 0 ], { %r4500, %r4501, %r4502, %r4503 }; + // end inline asm + .loc 1 223 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:223:33 + setp.lt.f32 %p378, %r4538, 0f00800000; + mul.f32 %r4668, %r4538, 0f4B000000; + selp.f32 %r704, %r4668, %r4538, %p378; + selp.f32 %r4669, 0fC1B80000, 0f00000000, %p378; + add.s32 %r4670, %r704, -1060439283; + and.b32 %r4671, %r4670, -8388608; + sub.s32 %r4672, %r704, %r4671; + cvt.rn.f32.s32 %r4673, %r4671; + mov.b32 %r4674, 0f34000000; + fma.rn.ftz.f32 %r4675, %r4673, %r4674, %r4669; + add.f32 %r4676, %r4672, 0fBF800000; + mov.b32 %r4677, 0fBE2C7F30; + mov.b32 %r4678, 0f3DC6B27F; + fma.rn.ftz.f32 %r4679, %r4678, %r4676, %r4677; + mov.b32 %r4680, 0f3E2FCF2A; + fma.rn.ftz.f32 %r4681, %r4679, %r4676, %r4680; + mov.b32 %r4682, 0fBE374E43; + fma.rn.ftz.f32 %r4683, %r4681, %r4676, %r4682; + mov.b32 %r4684, 0f3E520BF4; + fma.rn.ftz.f32 %r4685, %r4683, %r4676, %r4684; + mov.b32 %r4686, 0fBE763C8B; + fma.rn.ftz.f32 %r4687, %r4685, %r4676, %r4686; + mov.b32 %r4688, 0f3E93BF99; + fma.rn.ftz.f32 %r4689, %r4687, %r4676, %r4688; + mov.b32 %r4690, 0fBEB8AA49; + fma.rn.ftz.f32 %r4691, %r4689, %r4676, %r4690; + mov.b32 %r4692, 0f3EF6384A; + fma.rn.ftz.f32 %r4693, %r4691, %r4676, %r4692; + mov.b32 %r4694, 0fBF38AA3B; + fma.rn.ftz.f32 %r4695, %r4693, %r4676, %r4694; + mul.f32 %r4696, %r4676, %r4695; + mul.f32 %r4697, %r4676, %r4696; + mov.b32 %r4698, 0f3FB8AA3B; + fma.rn.ftz.f32 %r4699, %r4676, %r4698, %r4697; + add.f32 %r5062, %r4675, %r4699; + setp.lt.u32 %p379, %r704, 2139095040; + mov.b32 %r4700, 0f7F800000; + @%p379 bra $L__BB0_8; +// %bb.7: // %__nv_fmaf_rn.exit.i.i + .loc 1 0 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0:33 + fma.rn.ftz.f32 %r5062, %r704, %r4700, %r4700; +$L__BB0_8: // %__nv_log2f.exit + ld.param.b64 %rd35, [triton_tem_fused_0_param_3]; + .loc 1 223 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:223:33 + setp.lt.f32 %p380, %r702, 0f00800000; + mul.f32 %r4701, %r702, 0f4B000000; + selp.f32 %r708, %r4701, %r702, %p380; + selp.f32 %r4702, 0fC1B80000, 0f00000000, %p380; + add.s32 %r4703, %r708, -1060439283; + and.b32 %r4704, %r4703, -8388608; + sub.s32 %r4705, %r708, %r4704; + cvt.rn.f32.s32 %r4706, %r4704; + fma.rn.ftz.f32 %r4708, %r4706, %r4674, %r4702; + add.f32 %r4709, %r4705, 0fBF800000; + fma.rn.ftz.f32 %r4712, %r4678, %r4709, %r4677; + fma.rn.ftz.f32 %r4714, %r4712, %r4709, %r4680; + fma.rn.ftz.f32 %r4716, %r4714, %r4709, %r4682; + fma.rn.ftz.f32 %r4718, %r4716, %r4709, %r4684; + fma.rn.ftz.f32 %r4720, %r4718, %r4709, %r4686; + fma.rn.ftz.f32 %r4722, %r4720, %r4709, %r4688; + fma.rn.ftz.f32 %r4724, %r4722, %r4709, %r4690; + fma.rn.ftz.f32 %r4726, %r4724, %r4709, %r4692; + fma.rn.ftz.f32 %r4728, %r4726, %r4709, %r4694; + mul.f32 %r4729, %r4709, %r4728; + mul.f32 %r4730, %r4709, %r4729; + fma.rn.ftz.f32 %r4732, %r4709, %r4698, %r4730; + add.f32 %r5063, %r4708, %r4732; + setp.lt.u32 %p381, %r708, 2139095040; + @%p381 bra $L__BB0_10; +// %bb.9: // %__nv_fmaf_rn.exit.i.i384 + .loc 1 0 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:0:33 + fma.rn.ftz.f32 %r5063, %r708, %r4700, %r4700; +$L__BB0_10: // %__nv_log2f.exit387 + .loc 1 223 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:223:33 + setp.eq.f32 %p383, %r708, 0f00000000; + setp.eq.f32 %p384, %r704, 0f00000000; + .loc 1 222 32 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:222:32 + shl.b32 %r4735, %r1, 16; + shl.b32 %r4736, %r2, 11; + add.s32 %r4737, %r4735, %r4736; + .loc 1 222 23 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:222:23 + mad.wide.s32 %rd288, %r4737, 4, %rd35; + .loc 1 144 46 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:144:46 + and.b32 %r4738, %r6, 127; + .loc 1 144 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:144:33 + or.b32 %r4739, %r5, %r4738; + .loc 1 222 40 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:222:40 + mad.wide.s32 %rd287, %r4739, 4, %rd288; + .loc 1 223 33 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:223:33 + selp.f32 %r4740, 0fFF800000, %r5062, %p384; + selp.f32 %r4741, 0fFF800000, %r5063, %p383; + .loc 1 223 20 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:223:20 + mov.b64 {%r4742, %r4743}, %rd292; + add.f32 %r4744, %r4743, %r4741; + add.f32 %r4745, %r4742, %r4740; + .loc 1 227 48 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:227:48 + setp.lt.s32 %p385, %r4739, 2048; + .loc 1 227 29 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:227:29 + bar.sync 0; + shl.b32 %r4746, %r703, 1; + add.s32 %r4748, %r836, %r4746; + st.shared.v2.b32 [%r4748], {%r4745, %r4744}; + bar.sync 0; + shl.b32 %r4749, %r28, 3; + shl.b32 %r4750, %r29, 2; + shr.u32 %r4751, %r30, 1; + add.s32 %r4752, %r836, %r4749; + add.s32 %r4753, %r4752, %r4750; + add.s32 %r4754, %r4753, %r4751; + ld.shared.b32 %r4734, [%r4754]; + and.b32 %r4755, %r6, 128; + setp.eq.b32 %p386, %r4755, 0; + and.pred %p382, %p386, %p385; + // begin inline asm + @%p382 st.global.b32 [ %rd287 + 0 ], { %r4734 }; + // end inline asm + .loc 1 229 4 // cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py:229:4 + ret; +$L__tmp13: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 275 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x10c DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 102 +.b8 99 +.b8 108 +.b8 114 +.b8 102 +.b8 122 +.b8 106 +.b8 108 +.b8 51 +.b8 116 +.b8 117 +.b8 98 +.b8 122 +.b8 100 +.b8 106 +.b8 97 +.b8 54 +.b8 97 +.b8 109 +.b8 98 +.b8 115 +.b8 107 +.b8 106 +.b8 120 +.b8 107 +.b8 54 +.b8 117 +.b8 55 +.b8 101 +.b8 97 +.b8 119 +.b8 108 +.b8 106 +.b8 99 +.b8 121 +.b8 117 +.b8 53 +.b8 52 +.b8 103 +.b8 112 +.b8 54 +.b8 102 +.b8 118 +.b8 53 +.b8 103 +.b8 52 +.b8 116 +.b8 54 +.b8 122 +.b8 108 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 102 +.b8 99 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x15 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 116 +.b8 101 +.b8 109 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa0:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xb5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 146 // DW_AT_call_line +.b8 101 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xcd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 154 // DW_AT_call_line +.b8 92 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xe5:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 172 // DW_AT_call_line +.b8 41 // DW_AT_call_column +.b8 4 // Abbrev [4] 0xfd:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp12 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 198 // DW_AT_call_line +.b8 45 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.source b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.source new file mode 100644 index 0000000000000000000000000000000000000000..eee45d695e634dc9ac9e273aa3deb132ccf4352b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.source @@ -0,0 +1,1200 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":18:0) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":271:0) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":32:0) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":448:0) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":299:0) +#loc226 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":256:0) +#loc230 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":175:0) +#loc232 = loc(unknown) +#loc235 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":167:0) +#loc239 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc243 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc247 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":240:0) +#loc268 = loc("arg_Q"(#loc)) +#loc269 = loc("arg_K"(#loc)) +#loc270 = loc("arg_V"(#loc)) +#loc271 = loc("arg_LSE"(#loc)) +#loc272 = loc("arg_MAX"(#loc)) +#loc273 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc274 = loc("arg_KV_IDX"(#loc)) +#loc275 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc276 = loc("arg_FULL_KV_IDX"(#loc)) +#loc277 = loc("in_ptr9"(#loc)) +#loc278 = loc("out_ptr0"(#loc)) +#loc279 = loc("ks0"(#loc)) +#loc280 = loc("ks1"(#loc)) +#loc377 = loc("ptr"(#loc125)) +#loc378 = loc("offs_m"(#loc125)) +#loc379 = loc("offs_n"(#loc125)) +#loc380 = loc("stride_m"(#loc125)) +#loc381 = loc("stride_n"(#loc125)) +#loc382 = loc("M_LEN"(#loc125)) +#loc389 = loc("x"(#loc137)) +#loc390 = loc("arg_Q"(#loc143)) +#loc391 = loc("arg_K"(#loc143)) +#loc392 = loc("arg_V"(#loc143)) +#loc393 = loc("arg_LSE"(#loc143)) +#loc394 = loc("arg_MAX"(#loc143)) +#loc395 = loc("arg_KV_NUM_BLKS"(#loc143)) +#loc396 = loc("arg_KV_IDX"(#loc143)) +#loc397 = loc("arg_FULL_KV_NUM_BLKS"(#loc143)) +#loc398 = loc("arg_FULL_KV_IDX"(#loc143)) +#loc399 = loc("in_ptr9"(#loc143)) +#loc400 = loc("out_ptr0"(#loc143)) +#loc401 = loc("ks0"(#loc143)) +#loc402 = loc("ks1"(#loc143)) +#loc403 = loc("q"(#loc143)) +#loc404 = loc("K"(#loc143)) +#loc405 = loc("V"(#loc143)) +#loc406 = loc("Q_LEN"(#loc143)) +#loc407 = loc("KV_LEN"(#loc143)) +#loc408 = loc("acc"(#loc143)) +#loc409 = loc("l_i"(#loc143)) +#loc410 = loc("m_i"(#loc143)) +#loc411 = loc("off_z"(#loc143)) +#loc412 = loc("off_h"(#loc143)) +#loc413 = loc("offs_m"(#loc143)) +#loc414 = loc("offs_n"(#loc143)) +#loc415 = loc("kv_start"(#loc143)) +#loc416 = loc("kv_indices"(#loc143)) +#loc417 = loc("kv_num_blocks"(#loc143)) +#loc418 = loc("block_n_end"(#loc143)) +#loc419 = loc("stride_kk"(#loc143)) +#loc420 = loc("stride_kn"(#loc143)) +#loc421 = loc("stride_vn"(#loc143)) +#loc422 = loc("stride_vk"(#loc143)) +#loc428 = loc("arg_Q"(#loc153)) +#loc429 = loc("arg_K"(#loc153)) +#loc430 = loc("arg_V"(#loc153)) +#loc431 = loc("arg_LSE"(#loc153)) +#loc432 = loc("arg_MAX"(#loc153)) +#loc433 = loc("arg_KV_NUM_BLKS"(#loc153)) +#loc434 = loc("arg_KV_IDX"(#loc153)) +#loc435 = loc("arg_FULL_KV_NUM_BLKS"(#loc153)) +#loc436 = loc("arg_FULL_KV_IDX"(#loc153)) +#loc437 = loc("in_ptr9"(#loc153)) +#loc438 = loc("out_ptr0"(#loc153)) +#loc439 = loc("ks0"(#loc153)) +#loc440 = loc("ks1"(#loc153)) +#loc441 = loc("q"(#loc153)) +#loc442 = loc("K"(#loc153)) +#loc443 = loc("V"(#loc153)) +#loc444 = loc("Q_LEN"(#loc153)) +#loc445 = loc("KV_LEN"(#loc153)) +#loc446 = loc("acc"(#loc153)) +#loc447 = loc("l_i"(#loc153)) +#loc448 = loc("m_i"(#loc153)) +#loc449 = loc("off_z"(#loc153)) +#loc450 = loc("off_h"(#loc153)) +#loc451 = loc("offs_m"(#loc153)) +#loc452 = loc("offs_n"(#loc153)) +#loc453 = loc("kv_start"(#loc153)) +#loc454 = loc("kv_offset"(#loc153)) +#loc455 = loc("stride_kk"(#loc153)) +#loc456 = loc("stride_kn"(#loc153)) +#loc457 = loc("stride_vn"(#loc153)) +#loc458 = loc("stride_vk"(#loc153)) +#loc529 = loc("indices"(#loc226)) +#loc530 = loc("max_len"(#loc226)) +#loc531 = loc("input"(#loc230)) +#loc532 = loc("a"(#loc235)) +#loc533 = loc("b"(#loc235)) +#loc534 = loc("input"(#loc239)) +#loc535 = loc("a"(#loc243)) +#loc536 = loc("b"(#loc243)) +#loc537 = loc("loop_iter"(#loc247)) +#loc538 = loc("col_indices"(#loc247)) +#loc539 = loc("total_blocks"(#loc247)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4096_i32 = arith.constant 4096 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc2) + %c1024_i32_0 = arith.constant 1024 : i32 loc(#loc2) + %0 = arith.muli %c1024_i32_0, %ks0 : i32 loc(#loc2) + %c128_i32_1 = arith.constant 128 : i32 loc(#loc3) + %c128_i32_2 = arith.constant 128 : i32 loc(#loc3) + %1 = arith.muli %c128_i32_2, %ks0 : i32 loc(#loc3) + %c128_i32_3 = arith.constant 128 : i32 loc(#loc4) + %c1_i32_4 = arith.constant 1 : i32 loc(#loc4) + %c1024_i32_5 = arith.constant 1024 : i32 loc(#loc5) + %c1024_i32_6 = arith.constant 1024 : i32 loc(#loc5) + %2 = arith.muli %c1024_i32_6, %ks0 : i32 loc(#loc5) + %c128_i32_7 = arith.constant 128 : i32 loc(#loc6) + %c128_i32_8 = arith.constant 128 : i32 loc(#loc6) + %3 = arith.muli %c128_i32_8, %ks0 : i32 loc(#loc6) + %c128_i32_9 = arith.constant 128 : i32 loc(#loc7) + %c1_i32_10 = arith.constant 1 : i32 loc(#loc7) + %ZQ = arith.constant 8 : i32 loc(#loc281) + %HQ = arith.constant 32 : i32 loc(#loc282) + %Q_LEN = arith.constant 2048 : i32 loc(#loc283) + %ZKV = arith.constant 8 : i32 loc(#loc284) + %q_start = tt.get_program_id x : i32 loc(#loc285) + %off_zq = tt.get_program_id y : i32 loc(#loc286) + %off_hq = tt.get_program_id z : i32 loc(#loc287) + %off_zkv = arith.remsi %off_zq, %ZKV : i32 loc(#loc288) + %off_hkv = arith.constant 4 : i32 loc(#loc289) + %off_hkv_11 = arith.constant 4 : i32 loc(#loc289) + %off_hkv_12 = arith.divsi %off_hq, %off_hkv_11 : i32 loc(#loc289) + %off_g = arith.constant 4 : i32 loc(#loc290) + %off_g_13 = arith.constant 4 : i32 loc(#loc290) + %off_g_14 = arith.remsi %off_hq, %off_g_13 : i32 loc(#loc290) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc291) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc292) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc293) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc294) + %k_offset_17 = arith.muli %off_hkv_12, %1 : i32 loc(#loc295) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc296) + %v_offset = arith.muli %off_zkv, %2 : i32 loc(#loc297) + %v_offset_19 = arith.muli %off_hkv_12, %3 : i32 loc(#loc298) + %v_offset_20 = arith.addi %v_offset, %v_offset_19 : i32 loc(#loc299) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc300) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc301) + %V = tt.addptr %arg_V, %v_offset_20 : !tt.ptr, i32 loc(#loc302) + %SPARSE_Z = arith.constant 8 : i32 loc(#loc303) + %SPARSE_HQ = arith.constant 1 : i32 loc(#loc304) + %sparse_idx_z = arith.remsi %off_zq, %SPARSE_Z : i32 loc(#loc305) + %sparse_idx_hq = arith.remsi %off_hq, %SPARSE_HQ : i32 loc(#loc306) + %stride_kv_num_blks_h = arith.constant 16 : i32 loc(#loc307) + %stride_kv_idx_h = arith.constant 16 : i32 loc(#loc308) + %stride_kv_idx_h_21 = arith.constant 16 : i32 loc(#loc308) + %stride_kv_idx_h_22 = arith.muli %stride_kv_idx_h_21, %ks1 : i32 loc(#loc308) + %m_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc309) + %m_i_23 = arith.constant 0x7F800000 : f32 loc(#loc310) + %m_i_24 = arith.constant 0x7F800000 : f32 loc(#loc310) + %m_i_25 = arith.constant dense<0x7F800000> : tensor<128xf32> loc(#loc310) + %m_i_26 = arith.subf %m_i, %m_i_25 : tensor<128xf32> loc(#loc310) + %l_i = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128xf32> loc(#loc311) + %acc = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() : () -> tensor<128x128xf32> loc(#loc312) + %offs_m = arith.constant 128 : i32 loc(#loc313) + %offs_m_27 = arith.constant 128 : i32 loc(#loc313) + %offs_m_28 = arith.muli %q_start, %offs_m_27 : i32 loc(#loc313) + %offs_m_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc314) + %offs_m_30 = tt.splat %offs_m_28 : i32 -> tensor<128xi32> loc(#loc315) + %offs_m_31 = arith.addi %offs_m_30, %offs_m_29 : tensor<128xi32> loc(#loc315) + %sparse_hz_offset = arith.muli %sparse_idx_z, %SPARSE_HQ : i32 loc(#loc316) + %sparse_hz_offset_32 = arith.addi %sparse_hz_offset, %sparse_idx_hq : i32 loc(#loc317) + %sparse_kv_num_blks_offset = arith.muli %sparse_hz_offset_32, %stride_kv_num_blks_h : i32 loc(#loc318) + %sparse_kv_num_blks_offset_33 = arith.constant 1 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_34 = arith.constant 1 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_35 = arith.divsi %q_start, %sparse_kv_num_blks_offset_34 : i32 loc(#loc319) + %sparse_kv_num_blks_offset_36 = arith.addi %sparse_kv_num_blks_offset, %sparse_kv_num_blks_offset_35 : i32 loc(#loc320) + %sparse_kv_idx_offset = arith.muli %sparse_hz_offset_32, %stride_kv_idx_h_22 : i32 loc(#loc321) + %sparse_kv_idx_offset_37 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_idx_offset_38 = arith.constant 1 : i32 loc(#loc322) + %sparse_kv_idx_offset_39 = arith.divsi %q_start, %sparse_kv_idx_offset_38 : i32 loc(#loc322) + %sparse_kv_idx_offset_40 = arith.muli %sparse_kv_idx_offset_39, %ks1 : i32 loc(#loc323) + %sparse_kv_idx_offset_41 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_40 : i32 loc(#loc324) + %offs_m_42 = arith.constant 128 : i32 loc(#loc325) + %offs_m_43 = arith.constant 128 : i32 loc(#loc325) + %offs_m_44 = arith.muli %q_start, %offs_m_43 : i32 loc(#loc325) + %offs_m_45 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc326) + %offs_m_46 = tt.splat %offs_m_44 : i32 -> tensor<128xi32> loc(#loc327) + %offs_m_47 = arith.addi %offs_m_46, %offs_m_45 : tensor<128xi32> loc(#loc327) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc328) + %q = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%Q, %offs_m_47, %offs_k, %c4096_i32, %c1_i32, %Q_LEN) : (!tt.ptr, tensor<128xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<128x128xbf16> loc(#loc329) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc330) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc331) + %kv_start_48 = arith.constant 128 : i32 loc(#loc332) + %kv_start_49 = arith.constant 128 : i32 loc(#loc332) + %kv_start_50 = arith.muli %kv_start, %kv_start_49 : i32 loc(#loc332) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc333) + %kv_num_blocks_51 = tt.load %kv_num_blocks : !tt.ptr loc(#loc334) + %block_n_end = arith.constant 2 : i32 loc(#loc335) + %block_n_end_52 = arith.constant 2 : i32 loc(#loc335) + %block_n_end_53 = arith.muli %kv_num_blocks_51, %block_n_end_52 : i32 loc(#loc335) + %block_n_end_54 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc336) + %block_n_end_55 = arith.constant 1 : i32 loc(#loc337) + %block_n_end_56 = arith.maxsi %block_n_end_54, %block_n_end_55 : i32 loc(#loc337) + %block_n_end_57 = arith.minsi %block_n_end_53, %block_n_end_56 : i32 loc(#loc338) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc339) + %offs_n_58 = tt.splat %kv_start_50 : i32 -> tensor<64xi32> loc(#loc340) + %offs_n_59 = arith.addi %offs_n_58, %offs_n : tensor<64xi32> loc(#loc340) + %4 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc68) + %5 = tt.expand_dims %offs_n_59 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc69) + %6:3 = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %ks0, %acc, %l_i, %m_i_26, %off_zq, %off_hq, %4, %5, %kv_start_50, %kv_indices, %kv_num_blocks_51, %block_n_end_57, %c1_i32_4, %c128_i32_3, %c128_i32_9, %c1_i32_10) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc70) + %kv_indices_60 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_41 : !tt.ptr, i32 loc(#loc341) + %kv_start_61 = tt.load %kv_indices_60 : !tt.ptr loc(#loc342) + %kv_start_62 = arith.constant 128 : i32 loc(#loc343) + %kv_start_63 = arith.constant 128 : i32 loc(#loc343) + %kv_start_64 = arith.muli %kv_start_61, %kv_start_63 : i32 loc(#loc343) + %kv_num_blocks_65 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_36 : !tt.ptr, i32 loc(#loc344) + %kv_num_blocks_66 = tt.load %kv_num_blocks_65 : !tt.ptr loc(#loc345) + %block_n_end_67 = arith.constant 2 : i32 loc(#loc346) + %block_n_end_68 = arith.constant 2 : i32 loc(#loc346) + %block_n_end_69 = arith.muli %kv_num_blocks_66, %block_n_end_68 : i32 loc(#loc346) + %block_n_end_70 = tt.call @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%ks0) : (i32) -> i32 loc(#loc347) + %block_n_end_71 = arith.constant 1 : i32 loc(#loc348) + %block_n_end_72 = arith.maxsi %block_n_end_70, %block_n_end_71 : i32 loc(#loc348) + %block_n_end_73 = arith.minsi %block_n_end_69, %block_n_end_72 : i32 loc(#loc349) + %offs_n_74 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc350) + %offs_n_75 = tt.splat %kv_start_64 : i32 -> tensor<64xi32> loc(#loc351) + %offs_n_76 = arith.addi %offs_n_75, %offs_n_74 : tensor<64xi32> loc(#loc351) + %7 = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc82) + %8 = tt.expand_dims %offs_n_76 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc83) + %9:3 = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %ks0, %6#0, %6#1, %6#2, %off_zq, %off_hq, %7, %8, %kv_start_64, %kv_indices_60, %kv_num_blocks_66, %block_n_end_73, %c1_i32_4, %c128_i32_3, %c128_i32_9, %c1_i32_10) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, !tt.ptr, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc84) + %l_i_77 = arith.constant 0.000000e+00 : f32 loc(#loc352) + %l_i_78 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc352) + %l_i_79 = arith.cmpf oeq, %9#1, %l_i_78 : tensor<128xf32> loc(#loc352) + %l_i_80 = arith.constant 1 : i32 loc(#loc353) + %l_i_81 = arith.constant 1.000000e+00 : f32 loc(#loc353) + %l_i_82 = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc353) + %l_i_83 = arith.select %l_i_79, %l_i_82, %9#1 : tensor<128xi1>, tensor<128xf32> loc(#loc353) + %acc_84 = tt.expand_dims %l_i_83 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc354) + %acc_85 = tt.broadcast %acc_84 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc355) + %acc_86 = arith.divf %9#0, %acc_85 : tensor<128x128xf32> loc(#loc355) + %idx_zq = tt.get_program_id y : i32 loc(#loc356) + %idx_hq = tt.get_program_id z : i32 loc(#loc357) + %idx_m = tt.expand_dims %offs_m_47 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc358) + %idx_d = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc359) + %idx_d_87 = tt.expand_dims %idx_d {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc360) + %mask = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc361) + %mask_88 = arith.cmpi slt, %idx_m, %mask : tensor<128x1xi32> loc(#loc361) + %mask_89 = arith.constant 128 : i32 loc(#loc362) + %mask_90 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc362) + %mask_91 = arith.cmpi slt, %idx_d_87, %mask_90 : tensor<1x128xi32> loc(#loc362) + %mask_92 = tt.broadcast %mask_88 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc363) + %mask_93 = tt.broadcast %mask_91 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc363) + %mask_94 = arith.andi %mask_92, %mask_93 : tensor<128x128xi1> loc(#loc363) + %xindex = arith.constant 128 : i32 loc(#loc364) + %xindex_95 = arith.constant 128 : i32 loc(#loc364) + %xindex_96 = arith.constant dense<128> : tensor<128x1xi32> loc(#loc364) + %xindex_97 = arith.muli %xindex_96, %idx_m : tensor<128x1xi32> loc(#loc364) + %xindex_98 = tt.broadcast %idx_d_87 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc365) + %xindex_99 = tt.broadcast %xindex_97 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc365) + %xindex_100 = arith.addi %xindex_98, %xindex_99 : tensor<128x128xi32> loc(#loc365) + %xindex_101 = arith.constant 262144 : i32 loc(#loc366) + %xindex_102 = arith.constant 262144 : i32 loc(#loc366) + %xindex_103 = arith.muli %xindex_102, %idx_hq : i32 loc(#loc366) + %xindex_104 = tt.splat %xindex_103 : i32 -> tensor<128x128xi32> loc(#loc367) + %xindex_105 = arith.addi %xindex_100, %xindex_104 : tensor<128x128xi32> loc(#loc367) + %xindex_106 = arith.constant 8388608 : i32 loc(#loc368) + %xindex_107 = arith.constant 8388608 : i32 loc(#loc368) + %xindex_108 = arith.muli %xindex_107, %idx_zq : i32 loc(#loc368) + %xindex_109 = tt.splat %xindex_108 : i32 -> tensor<128x128xi32> loc(#loc369) + %xindex_110 = arith.addi %xindex_105, %xindex_109 : tensor<128x128xi32> loc(#loc369) + %c128_i32_111 = arith.constant 128 : i32 loc(#loc103) + %c128_i32_112 = arith.constant 128 : i32 loc(#loc103) + %10 = arith.muli %c128_i32_112, %idx_hq : i32 loc(#loc103) + %11 = tt.splat %10 : i32 -> tensor<1x128xi32> loc(#loc104) + %12 = arith.addi %idx_d_87, %11 : tensor<1x128xi32> loc(#loc104) + %c4096_i32_113 = arith.constant 4096 : i32 loc(#loc105) + %c4096_i32_114 = arith.constant 4096 : i32 loc(#loc105) + %cst = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc105) + %13 = arith.muli %cst, %idx_m : tensor<128x1xi32> loc(#loc105) + %14 = tt.broadcast %12 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc106) + %15 = tt.broadcast %13 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc106) + %16 = arith.addi %14, %15 : tensor<128x128xi32> loc(#loc106) + %c8388608_i32_115 = arith.constant 8388608 : i32 loc(#loc107) + %c8388608_i32_116 = arith.constant 8388608 : i32 loc(#loc107) + %17 = arith.muli %c8388608_i32_116, %idx_zq : i32 loc(#loc107) + %18 = tt.splat %17 : i32 -> tensor<128x128xi32> loc(#loc108) + %19 = arith.addi %16, %18 : tensor<128x128xi32> loc(#loc108) + %20 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc109) + %21 = tt.addptr %20, %19 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc109) + %22 = arith.truncf %acc_86 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc110) + tt.store %21, %22, %mask_94 : tensor<128x128x!tt.ptr> loc(#loc110) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc370) + %off_hz_117 = arith.addi %off_hz, %off_hq : i32 loc(#loc371) + %l_ptrs = arith.muli %off_hz_117, %Q_LEN : i32 loc(#loc372) + %l_ptrs_118 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc373) + %l_ptrs_119 = tt.splat %l_ptrs_118 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc374) + %l_ptrs_120 = tt.addptr %l_ptrs_119, %offs_m_47 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc374) + %lse = math.log2 %l_i_83 : tensor<128xf32> loc(#loc375) + %lse_121 = arith.addf %9#2, %lse : tensor<128xf32> loc(#loc376) + %cst_122 = arith.constant dense<2048> : tensor<128xi32> loc(#loc118) + %23 = arith.cmpi slt, %offs_m_47, %cst_122 : tensor<128xi32> loc(#loc118) + tt.store %l_ptrs_120, %lse_121, %23 : tensor<128x!tt.ptr> loc(#loc119) + tt.return loc(#loc120) + } loc(#loc) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc122) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc122) + tt.return %cst_0 : tensor<128xf32> loc(#loc123) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128xf32> loc(#loc124) + tt.return %0 : tensor<128xf32> loc(#loc124) + } loc(#loc121) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_128__(0, 1)cconstexpr_128__(1,)cconstexpr_fp32_"() -> tensor<128x128xf32> attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 loc(#loc122) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc122) + tt.return %cst_0 : tensor<128x128xf32> loc(#loc123) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc124) + tt.return %0 : tensor<128x128xf32> loc(#loc124) + } loc(#loc121) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S128S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc125)), %offs_m: tensor<128xi32> loc("offs_m"(#loc125)), %offs_n: tensor<128xi32> loc("offs_n"(#loc125)), %stride_m: i32 loc("stride_m"(#loc125)), %stride_n: i32 loc("stride_n"(#loc125)), %M_LEN: i32 loc("M_LEN"(#loc125))) -> tensor<128x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc383) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<128x1xi32> loc(#loc384) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<128x1xi32> loc(#loc384) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc385) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc385) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc386) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc387) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc387) + %ptr_8 = tt.broadcast %ptr_4 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc388) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc388) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc388) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc132) + %1 = tt.splat %M_LEN : i32 -> tensor<128x1xi32> loc(#loc133) + %2 = arith.cmpi slt, %0, %1 : tensor<128x1xi32> loc(#loc133) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc134) + %3 = tt.broadcast %2 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc134) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc134) + %4 = arith.truncf %cst_11 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc134) + %5 = tt.load %ptr_10, %3, %4 : tensor<128x128x!tt.ptr> loc(#loc134) + tt.return %5 : tensor<128x128xbf16> loc(#loc135) + ^bb1: // no predecessors + %6 = ub.poison : tensor<128x128xbf16> loc(#loc136) + tt.return %6 : tensor<128x128xbf16> loc(#loc136) + } loc(#loc125) + tt.func private @"triton.language.standard.cdiv__i32__(1,)cconstexpr_64_"(%x: i32 loc("x"(#loc137))) -> i32 attributes {noinline = false} { + %c64_i32 = arith.constant 64 : i32 loc(#loc138) + %c64_i32_0 = arith.constant 64 : i32 loc(#loc138) + %0 = arith.addi %x, %c64_i32_0 : i32 loc(#loc138) + %c1_i32 = arith.constant 1 : i32 loc(#loc139) + %c1_i32_1 = arith.constant 1 : i32 loc(#loc139) + %1 = arith.subi %0, %c1_i32_1 : i32 loc(#loc139) + %c64_i32_2 = arith.constant 64 : i32 loc(#loc140) + %c64_i32_3 = arith.constant 64 : i32 loc(#loc140) + %2 = arith.divsi %1, %c64_i32_3 : i32 loc(#loc140) + tt.return %2 : i32 loc(#loc141) + ^bb1: // no predecessors + %3 = ub.poison : i32 loc(#loc142) + tt.return %3 : i32 loc(#loc142) + } loc(#loc137) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_False_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc143)), %arg_K: !tt.ptr loc("arg_K"(#loc143)), %arg_V: !tt.ptr loc("arg_V"(#loc143)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc143)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc143)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc143)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc143)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc143)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc143)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc143)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc143)), %ks0: i32 loc("ks0"(#loc143)), %ks1: i32 loc("ks1"(#loc143)), %q: tensor<128x128xbf16> loc("q"(#loc143)), %K: !tt.ptr loc("K"(#loc143)), %V: !tt.ptr loc("V"(#loc143)), %Q_LEN: i32 loc("Q_LEN"(#loc143)), %KV_LEN: i32 loc("KV_LEN"(#loc143)), %acc: tensor<128x128xf32> loc("acc"(#loc143)), %l_i: tensor<128xf32> loc("l_i"(#loc143)), %m_i: tensor<128xf32> loc("m_i"(#loc143)), %off_z: i32 loc("off_z"(#loc143)), %off_h: i32 loc("off_h"(#loc143)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc143)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc143)), %kv_start: i32 loc("kv_start"(#loc143)), %kv_indices: !tt.ptr loc("kv_indices"(#loc143)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc143)), %block_n_end: i32 loc("block_n_end"(#loc143)), %stride_kk: i32 loc("stride_kk"(#loc143)), %stride_kn: i32 loc("stride_kn"(#loc143)), %stride_vn: i32 loc("stride_vn"(#loc143)), %stride_vk: i32 loc("stride_vk"(#loc143))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc423) + %c0_i32 = arith.constant 0 : i32 loc(#loc145) + %c1_i32 = arith.constant 1 : i32 loc(#loc145) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc145) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc145) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc145) + %3 = ub.poison : i32 loc(#loc145) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_False__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc146) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc425) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc426) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc426) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc427) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc150) + } loc(#loc561) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc151) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc152) + %5 = ub.poison : tensor<128xf32> loc(#loc152) + %6 = ub.poison : tensor<128xf32> loc(#loc152) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc152) + } loc(#loc143) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_False__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc153)), %arg_K: !tt.ptr loc("arg_K"(#loc153)), %arg_V: !tt.ptr loc("arg_V"(#loc153)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc153)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc153)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc153)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc153)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc153)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc153)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc153)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc153)), %ks0: i32 loc("ks0"(#loc153)), %ks1: i32 loc("ks1"(#loc153)), %q: tensor<128x128xbf16> loc("q"(#loc153)), %K: !tt.ptr loc("K"(#loc153)), %V: !tt.ptr loc("V"(#loc153)), %Q_LEN: i32 loc("Q_LEN"(#loc153)), %KV_LEN: i32 loc("KV_LEN"(#loc153)), %acc: tensor<128x128xf32> loc("acc"(#loc153)), %l_i: tensor<128xf32> loc("l_i"(#loc153)), %m_i: tensor<128xf32> loc("m_i"(#loc153)), %off_z: i32 loc("off_z"(#loc153)), %off_h: i32 loc("off_h"(#loc153)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc153)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc153)), %kv_start: i32 loc("kv_start"(#loc153)), %kv_offset: i32 loc("kv_offset"(#loc153)), %stride_kk: i32 loc("stride_kk"(#loc153)), %stride_kn: i32 loc("stride_kn"(#loc153)), %stride_vn: i32 loc("stride_vn"(#loc153)), %stride_vk: i32 loc("stride_vk"(#loc153))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc459) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc460) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc461) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc462) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc462) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc463) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc464) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc465) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc465) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc465) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc466) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc466) + %m = tt.call @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc467) + %n = tt.call @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc468) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc469) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc469) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc470) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc470) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc470) + %tmp1 = arith.constant false loc(#loc471) + %tmp1_15 = arith.constant dense : tensor<1xi1> loc(#loc471) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc472) + %tmp4_16 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc472) + %tmp4_17 = arith.cmpi sge, %tmp4, %tmp4_16 : tensor<128x64xi32> loc(#loc472) + %tmp5 = arith.extsi %n : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc473) + %tmp7 = tt.addptr %in_ptr9, %off_z : !tt.ptr, i32 loc(#loc474) + %tmp7_18 = tt.load %tmp7 : !tt.ptr loc(#loc475) + %tmp8 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc476) + %tmp8_19 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc476) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc477) + %tmp10 = tt.splat %tmp7_18 : i64 -> tensor<128x1xi64> loc(#loc478) + %tmp10_20 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc478) + %tmp11 = tt.broadcast %tmp8_19 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc479) + %tmp11_21 = tt.broadcast %tmp10_20 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc479) + %tmp11_22 = arith.andi %tmp11, %tmp11_21 : tensor<128x64xi1> loc(#loc479) + %tmp12 = arith.andi %tmp4_17, %tmp11_22 : tensor<128x64xi1> loc(#loc480) + %tmp13 = tt.expand_dims %tmp1_15 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc481) + %tmp13_23 = tt.broadcast %tmp13 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc481) + %tmp13_24 = arith.ori %tmp13_23, %tmp12 : tensor<128x64xi1> loc(#loc481) + %tmp14 = arith.constant 2048 : i32 loc(#loc482) + %tmp14_25 = arith.constant dense<2048> : tensor<1xi32> loc(#loc482) + %tmp15 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc483) + %tmp15_26 = tt.broadcast %tmp15 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc483) + %tmp15_27 = arith.cmpi sge, %n, %tmp15_26 : tensor<1x64xi32> loc(#loc483) + %tmp16 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc484) + %tmp16_28 = tt.broadcast %tmp16 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc484) + %tmp16_29 = arith.remsi %n, %tmp16_28 : tensor<1x64xi32> loc(#loc484) + %tmp17 = arith.constant 0 : i32 loc(#loc485) + %tmp17_30 = arith.constant dense<0> : tensor<1xi32> loc(#loc485) + %tmp18 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc486) + %tmp18_31 = tt.broadcast %tmp18 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc486) + %tmp18_32 = arith.cmpi ne, %tmp16_29, %tmp18_31 : tensor<1x64xi32> loc(#loc486) + %tmp19 = arith.constant 0 : i32 loc(#loc487) + %tmp19_33 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc487) + %tmp19_34 = arith.cmpi slt, %tmp16_29, %tmp19_33 : tensor<1x64xi32> loc(#loc487) + %tmp20 = arith.constant 0 : i32 loc(#loc488) + %tmp20_35 = arith.constant dense<0> : tensor<1xi32> loc(#loc488) + %tmp20_36 = arith.cmpi slt, %tmp14_25, %tmp20_35 : tensor<1xi32> loc(#loc488) + %tmp21 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc489) + %tmp21_37 = tt.broadcast %tmp21 : tensor<1x1xi1> -> tensor<1x64xi1> loc(#loc489) + %tmp21_38 = arith.cmpi ne, %tmp19_34, %tmp21_37 : tensor<1x64xi1> loc(#loc489) + %tmp22 = arith.andi %tmp18_32, %tmp21_38 : tensor<1x64xi1> loc(#loc490) + %tmp23 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc491) + %tmp23_39 = tt.broadcast %tmp23 : tensor<1x1xi32> -> tensor<1x64xi32> loc(#loc491) + %tmp23_40 = arith.addi %tmp16_29, %tmp23_39 : tensor<1x64xi32> loc(#loc491) + %tmp24 = arith.select %tmp22, %tmp23_40, %tmp16_29 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc492) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc493) + %tmp26 = tt.splat %tmp7_18 : i64 -> tensor<1x64xi64> loc(#loc494) + %tmp26_41 = arith.cmpi slt, %tmp25, %tmp26 : tensor<1x64xi64> loc(#loc494) + %tmp27 = arith.andi %tmp15_27, %tmp26_41 : tensor<1x64xi1> loc(#loc495) + %tmp28 = tt.broadcast %n : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc496) + %tmp28_42 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc496) + %tmp28_43 = arith.subi %tmp28, %tmp28_42 : tensor<128x64xi32> loc(#loc496) + %tmp29 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc497) + %tmp29_44 = tt.broadcast %tmp29 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc497) + %tmp29_45 = arith.remsi %tmp28_43, %tmp29_44 : tensor<128x64xi32> loc(#loc497) + %tmp30 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc498) + %tmp30_46 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc498) + %tmp30_47 = arith.cmpi ne, %tmp29_45, %tmp30_46 : tensor<128x64xi32> loc(#loc498) + %tmp31 = arith.constant 0 : i32 loc(#loc499) + %tmp31_48 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc499) + %tmp31_49 = arith.cmpi slt, %tmp29_45, %tmp31_48 : tensor<128x64xi32> loc(#loc499) + %tmp32 = tt.expand_dims %tmp20_36 {axis = 0 : i32} : tensor<1xi1> -> tensor<1x1xi1> loc(#loc500) + %tmp32_50 = tt.broadcast %tmp32 : tensor<1x1xi1> -> tensor<128x64xi1> loc(#loc500) + %tmp32_51 = arith.cmpi ne, %tmp31_49, %tmp32_50 : tensor<128x64xi1> loc(#loc500) + %tmp33 = arith.andi %tmp30_47, %tmp32_51 : tensor<128x64xi1> loc(#loc501) + %tmp34 = tt.expand_dims %tmp14_25 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc502) + %tmp34_52 = tt.broadcast %tmp34 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc502) + %tmp34_53 = arith.addi %tmp29_45, %tmp34_52 : tensor<128x64xi32> loc(#loc502) + %tmp35 = arith.select %tmp33, %tmp34_53, %tmp29_45 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc503) + %tmp36 = tt.expand_dims %tmp17_30 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc504) + %tmp36_54 = tt.broadcast %tmp36 : tensor<1x1xi32> -> tensor<128x64xi32> loc(#loc504) + %tmp36_55 = arith.cmpi eq, %tmp35, %tmp36_54 : tensor<128x64xi32> loc(#loc504) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc505) + %tmp37_56 = arith.andi %tmp37, %tmp36_55 : tensor<128x64xi1> loc(#loc505) + %tmp38 = arith.ori %tmp13_24, %tmp37_56 : tensor<128x64xi1> loc(#loc506) + %mask_mod_output = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc507) + %mask_mod_output_57 = arith.cmpi slt, %offs_n, %mask_mod_output : tensor<1x64xi32> loc(#loc507) + %mask_mod_output_58 = arith.constant false loc(#loc508) + %mask_mod_output_59 = arith.constant false loc(#loc508) + %mask_mod_output_60 = arith.constant dense : tensor<128x64xi1> loc(#loc508) + %mask_mod_output_61 = tt.broadcast %mask_mod_output_57 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc508) + %mask_mod_output_62 = arith.select %mask_mod_output_61, %tmp38, %mask_mod_output_60 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc508) + %post_mod_scores_63 = arith.constant 0xFF800000 : f32 loc(#loc509) + %post_mod_scores_64 = arith.constant 0xFF800000 : f32 loc(#loc509) + %post_mod_scores_65 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc509) + %post_mod_scores_66 = arith.select %mask_mod_output_62, %post_mod_scores_14, %post_mod_scores_65 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc509) + %post_mod_scores_67 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_68 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_69 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc510) + %post_mod_scores_70 = arith.mulf %post_mod_scores_66, %post_mod_scores_69 : tensor<128x64xf32> loc(#loc510) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_70) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc511) + %m_ij_71 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc512) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc513) + %masked_out_rows_72 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc513) + %masked_out_rows_73 = arith.cmpf oeq, %m_ij_71, %masked_out_rows_72 : tensor<128xf32> loc(#loc513) + %m_ij_masked = arith.constant 0 : i32 loc(#loc514) + %m_ij_masked_74 = arith.constant 0.000000e+00 : f32 loc(#loc514) + %m_ij_masked_75 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc514) + %m_ij_masked_76 = arith.select %masked_out_rows_73, %m_ij_masked_75, %m_ij_71 : tensor<128xi1>, tensor<128xf32> loc(#loc514) + %alpha = arith.subf %m_i, %m_ij_masked_76 : tensor<128xf32> loc(#loc515) + %alpha_77 = math.exp2 %alpha : tensor<128xf32> loc(#loc516) + %p = tt.expand_dims %m_ij_masked_76 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc517) + %p_78 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc518) + %p_79 = arith.subf %post_mod_scores_70, %p_78 : tensor<128x64xf32> loc(#loc518) + %p_80 = math.exp2 %p_79 : tensor<128x64xf32> loc(#loc519) + %l_i_81 = arith.mulf %l_i, %alpha_77 : tensor<128xf32> loc(#loc520) + %l_i_82 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_80) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc521) + %l_i_83 = arith.addf %l_i_81, %l_i_82 : tensor<128xf32> loc(#loc522) + %acc_84 = tt.expand_dims %alpha_77 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc523) + %acc_85 = tt.broadcast %acc_84 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc524) + %acc_86 = arith.mulf %acc, %acc_85 : tensor<128x128xf32> loc(#loc524) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc525) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc526) + %acc_87 = arith.truncf %p_80 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc527) + %acc_88 = arith.constant 0.000000e+00 : f32 loc(#loc528) + %acc_89 = tt.dot %acc_87, %v, %acc_86, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc528) + tt.return %acc_89, %l_i_83, %m_ij_71 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc224) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc225) + %1 = ub.poison : tensor<128xf32> loc(#loc225) + %2 = ub.poison : tensor<128xf32> loc(#loc225) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + } loc(#loc153) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%ptr: !tt.ptr loc("ptr"(#loc125)), %offs_m: tensor<64xi32> loc("offs_m"(#loc125)), %offs_n: tensor<128xi32> loc("offs_n"(#loc125)), %stride_m: i32 loc("stride_m"(#loc125)), %stride_n: i32 loc("stride_n"(#loc125)), %M_LEN: i32 loc("M_LEN"(#loc125))) -> tensor<64x128xbf16> attributes {noinline = false} { + %ptr_0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc383) + %ptr_1 = tt.splat %stride_m : i32 -> tensor<64x1xi32> loc(#loc384) + %ptr_2 = arith.muli %ptr_0, %ptr_1 : tensor<64x1xi32> loc(#loc384) + %ptr_3 = tt.splat %ptr : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc385) + %ptr_4 = tt.addptr %ptr_3, %ptr_2 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc385) + %ptr_5 = tt.expand_dims %offs_n {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc386) + %ptr_6 = tt.splat %stride_n : i32 -> tensor<1x128xi32> loc(#loc387) + %ptr_7 = arith.muli %ptr_5, %ptr_6 : tensor<1x128xi32> loc(#loc387) + %ptr_8 = tt.broadcast %ptr_4 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc388) + %ptr_9 = tt.broadcast %ptr_7 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc388) + %ptr_10 = tt.addptr %ptr_8, %ptr_9 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc388) + %0 = tt.expand_dims %offs_m {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc132) + %1 = tt.splat %M_LEN : i32 -> tensor<64x1xi32> loc(#loc133) + %2 = arith.cmpi slt, %0, %1 : tensor<64x1xi32> loc(#loc133) + %cst = arith.constant 0.000000e+00 : f32 loc(#loc134) + %3 = tt.broadcast %2 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc134) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xf32> loc(#loc134) + %4 = arith.truncf %cst_11 : tensor<64x128xf32> to tensor<64x128xbf16> loc(#loc134) + %5 = tt.load %ptr_10, %3, %4 : tensor<64x128x!tt.ptr> loc(#loc134) + tt.return %5 : tensor<64x128xbf16> loc(#loc135) + ^bb1: // no predecessors + %6 = ub.poison : tensor<64x128xbf16> loc(#loc136) + tt.return %6 : tensor<64x128xbf16> loc(#loc136) + } loc(#loc125) + tt.func private @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S128_1S_i32__(%indices: tensor<128x1xi32> loc("indices"(#loc226)), %max_len: i32 loc("max_len"(#loc226))) -> tensor<128x1xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<128x1xi32> loc(#loc227) + %1 = arith.remsi %indices, %0 : tensor<128x1xi32> loc(#loc227) + tt.return %1 : tensor<128x1xi32> loc(#loc228) + ^bb1: // no predecessors + %2 = ub.poison : tensor<128x1xi32> loc(#loc229) + tt.return %2 : tensor<128x1xi32> loc(#loc229) + } loc(#loc226) + tt.func private @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S1_64S_i32__(%indices: tensor<1x64xi32> loc("indices"(#loc226)), %max_len: i32 loc("max_len"(#loc226))) -> tensor<1x64xi32> attributes {noinline = false} { + %0 = tt.splat %max_len : i32 -> tensor<1x64xi32> loc(#loc227) + %1 = arith.remsi %indices, %0 : tensor<1x64xi32> loc(#loc227) + tt.return %1 : tensor<1x64xi32> loc(#loc228) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x64xi32> loc(#loc229) + tt.return %2 : tensor<1x64xi32> loc(#loc229) + } loc(#loc226) + tt.func private @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%input: tensor<128x64xf32> loc("input"(#loc230))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._elementwise_max__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc231) + tt.reduce.return %2 : f32 loc(#loc231) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc231) + tt.return %0 : tensor<128xf32> loc(#loc233) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc234) + tt.return %1 : tensor<128xf32> loc(#loc234) + } loc(#loc230) + tt.func private @triton.language.standard._elementwise_max__fp32_fp32__(%a: f32 loc("a"(#loc235)), %b: f32 loc("b"(#loc235))) -> f32 attributes {noinline = false} { + %0 = arith.maxnumf %a, %b : f32 loc(#loc236) + tt.return %0 : f32 loc(#loc237) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc238) + tt.return %1 : f32 loc(#loc238) + } loc(#loc235) + tt.func private @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x64xf32> loc("input"(#loc239))) -> tensor<128xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc240) + tt.reduce.return %2 : f32 loc(#loc240) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc240) + tt.return %0 : tensor<128xf32> loc(#loc241) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128xf32> loc(#loc242) + tt.return %1 : tensor<128xf32> loc(#loc242) + } loc(#loc239) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc243)), %b: f32 loc("b"(#loc243))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc244) + tt.return %0 : f32 loc(#loc245) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc246) + tt.return %1 : f32 loc(#loc246) + } loc(#loc243) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%loop_iter: i32 loc("loop_iter"(#loc247)), %col_indices: !tt.ptr loc("col_indices"(#loc247)), %total_blocks: i32 loc("total_blocks"(#loc247))) -> i32 attributes {noinline = false} { + %cur_block_idx = arith.constant 2 : i32 loc(#loc540) + %cur_block_idx_0 = arith.constant 2 : i32 loc(#loc540) + %cur_block_idx_1 = arith.divsi %loop_iter, %cur_block_idx_0 : i32 loc(#loc540) + %cur_block = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc541) + %cur_block_2 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc542) + %next_block = arith.constant 1 : i32 loc(#loc543) + %next_block_3 = arith.constant 1 : i32 loc(#loc543) + %next_block_4 = arith.addi %cur_block_idx_1, %next_block_3 : i32 loc(#loc543) + %next_block_5 = arith.cmpi slt, %next_block_4, %total_blocks : i32 loc(#loc544) + %next_block_6 = tt.addptr %col_indices, %cur_block_idx_1 : !tt.ptr, i32 loc(#loc545) + %next_block_7 = arith.constant 1 : i32 loc(#loc546) + %next_block_8 = tt.addptr %next_block_6, %next_block_7 : !tt.ptr, i32 loc(#loc546) + %next_block_9 = tt.load %next_block_8, %next_block_5 evictionPolicy = evict_last : !tt.ptr loc(#loc547) + %needs_jump = arith.constant 1 : i32 loc(#loc548) + %needs_jump_10 = arith.constant 1 : i32 loc(#loc548) + %needs_jump_11 = arith.addi %loop_iter, %needs_jump_10 : i32 loc(#loc548) + %needs_jump_12 = arith.constant 2 : i32 loc(#loc549) + %needs_jump_13 = arith.constant 2 : i32 loc(#loc549) + %needs_jump_14 = arith.remsi %needs_jump_11, %needs_jump_13 : i32 loc(#loc549) + %needs_jump_15 = arith.constant 0 : i32 loc(#loc550) + %needs_jump_16 = arith.cmpi eq, %needs_jump_14, %needs_jump_15 : i32 loc(#loc550) + %jump_to_block = arith.subi %next_block_9, %cur_block_2 : i32 loc(#loc551) + %jump_to_block_17 = arith.constant 128 : i32 loc(#loc552) + %jump_to_block_18 = arith.constant 128 : i32 loc(#loc552) + %jump_to_block_19 = arith.muli %jump_to_block, %jump_to_block_18 : i32 loc(#loc552) + %jump_to_block_20 = arith.constant 64 : i32 loc(#loc553) + %jump_to_block_21 = arith.constant 64 : i32 loc(#loc553) + %jump_to_block_22 = arith.subi %jump_to_block_19, %jump_to_block_21 : i32 loc(#loc553) + %offset = arith.extui %needs_jump_16 : i1 to i32 loc(#loc554) + %offset_23 = arith.muli %jump_to_block_22, %offset : i32 loc(#loc554) + %offset_24 = arith.constant 1 : i32 loc(#loc555) + %offset_25 = arith.constant 1 : i32 loc(#loc555) + %offset_26 = arith.extui %needs_jump_16 : i1 to i32 loc(#loc555) + %offset_27 = arith.subi %offset_25, %offset_26 : i32 loc(#loc555) + %offset_28 = arith.constant 64 : i32 loc(#loc556) + %offset_29 = arith.constant 64 : i32 loc(#loc556) + %offset_30 = arith.muli %offset_27, %offset_29 : i32 loc(#loc556) + %offset_31 = arith.addi %offset_23, %offset_30 : i32 loc(#loc557) + tt.return %offset_31 : i32 loc(#loc266) + ^bb1: // no predecessors + %0 = ub.poison : i32 loc(#loc267) + tt.return %0 : i32 loc(#loc267) + } loc(#loc247) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_inner__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_Pi32_i32_i32_i32_i32_i32_i32__(16,)cNone_(17,)cNone_(30,)cconstexpr_0__(32,)cconstexpr_bf16__(37,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc143)), %arg_K: !tt.ptr loc("arg_K"(#loc143)), %arg_V: !tt.ptr loc("arg_V"(#loc143)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc143)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc143)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc143)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc143)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc143)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc143)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc143)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc143)), %ks0: i32 loc("ks0"(#loc143)), %ks1: i32 loc("ks1"(#loc143)), %q: tensor<128x128xbf16> loc("q"(#loc143)), %K: !tt.ptr loc("K"(#loc143)), %V: !tt.ptr loc("V"(#loc143)), %Q_LEN: i32 loc("Q_LEN"(#loc143)), %KV_LEN: i32 loc("KV_LEN"(#loc143)), %acc: tensor<128x128xf32> loc("acc"(#loc143)), %l_i: tensor<128xf32> loc("l_i"(#loc143)), %m_i: tensor<128xf32> loc("m_i"(#loc143)), %off_z: i32 loc("off_z"(#loc143)), %off_h: i32 loc("off_h"(#loc143)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc143)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc143)), %kv_start: i32 loc("kv_start"(#loc143)), %kv_indices: !tt.ptr loc("kv_indices"(#loc143)), %kv_num_blocks: i32 loc("kv_num_blocks"(#loc143)), %block_n_end: i32 loc("block_n_end"(#loc143)), %stride_kk: i32 loc("stride_kk"(#loc143)), %stride_kn: i32 loc("stride_kn"(#loc143)), %stride_vn: i32 loc("stride_vn"(#loc143)), %stride_vk: i32 loc("stride_vk"(#loc143))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_offset = arith.constant 0 : i32 loc(#loc423) + %c0_i32 = arith.constant 0 : i32 loc(#loc145) + %c1_i32 = arith.constant 1 : i32 loc(#loc145) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc145) + %1 = arith.bitcast %block_n_end : i32 to i32 loc(#loc145) + %2 = arith.bitcast %c1_i32 : i32 to i32 loc(#loc145) + %3 = ub.poison : i32 loc(#loc145) + %kv_offset_0:5 = scf.for %start_n = %0 to %1 step %2 iter_args(%acc_1 = %acc, %l_i_2 = %l_i, %m_i_3 = %m_i, %offs_n_4 = %offs_n, %kv_offset_5 = %kv_offset) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %7:3 = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_True__(36,)cconstexpr_True_"(%arg_Q, %arg_K, %arg_V, %arg_LSE, %arg_MAX, %arg_KV_NUM_BLKS, %arg_KV_IDX, %arg_FULL_KV_NUM_BLKS, %arg_FULL_KV_IDX, %in_ptr9, %out_ptr0, %ks0, %ks1, %q, %K, %V, %Q_LEN, %KV_LEN, %acc_1, %l_i_2, %m_i_3, %off_z, %off_h, %offs_m, %offs_n_4, %kv_start, %kv_offset_5, %stride_kk, %stride_kn, %stride_vn, %stride_vk) : (!tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xbf16>, !tt.ptr, !tt.ptr, i32, i32, tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, i32, i32, tensor<128x1xi32>, tensor<1x64xi32>, i32, i32, i32, i32, i32, i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) loc(#loc146) + %offset = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_offset_for_next_block__i32_Pi32_i32__(3,)cconstexpr_128__(4,)cconstexpr_2__(5,)cconstexpr_64__(6,)cconstexpr_False_"(%start_n, %kv_indices, %kv_num_blocks) : (i32, !tt.ptr, i32) -> i32 loc(#loc425) + %offs_n_6 = tt.splat %offset : i32 -> tensor<1x64xi32> loc(#loc426) + %offs_n_7 = arith.addi %offs_n_4, %offs_n_6 : tensor<1x64xi32> loc(#loc426) + %kv_offset_8 = arith.addi %kv_offset_5, %offset : i32 loc(#loc427) + scf.yield %7#0, %7#1, %7#2, %offs_n_7, %kv_offset_8 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc150) + } loc(#loc561) + tt.return %kv_offset_0#0, %kv_offset_0#1, %kv_offset_0#2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc151) + ^bb1: // no predecessors + %4 = ub.poison : tensor<128x128xf32> loc(#loc152) + %5 = ub.poison : tensor<128xf32> loc(#loc152) + %6 = ub.poison : tensor<128xf32> loc(#loc152) + tt.return %4, %5, %6 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc152) + } loc(#loc143) + tt.func private @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.forward_block_mn__Pbf16_Pbf16_Pbf16_Pfp32_Pfp32_Pi32_Pi32_Pi32_Pi32_Pi64_Pbf16_i32_i32_bf16S128_128S_Pbf16_Pbf16_i32_i32_fp32S128_128S_fp32S128S_fp32S128S_i32_i32_i32S128_1S_i32S1_64S_i32_i32_i32_i32_i32_i32__(16,)cconstexpr_None__(17,)cconstexpr_None__(29,)cconstexpr_bf16__(30,)cconstexpr_1_d_44269504__(35,)cconstexpr_True__(36,)cconstexpr_True_"(%arg_Q: !tt.ptr loc("arg_Q"(#loc153)), %arg_K: !tt.ptr loc("arg_K"(#loc153)), %arg_V: !tt.ptr loc("arg_V"(#loc153)), %arg_LSE: !tt.ptr loc("arg_LSE"(#loc153)), %arg_MAX: !tt.ptr loc("arg_MAX"(#loc153)), %arg_KV_NUM_BLKS: !tt.ptr loc("arg_KV_NUM_BLKS"(#loc153)), %arg_KV_IDX: !tt.ptr loc("arg_KV_IDX"(#loc153)), %arg_FULL_KV_NUM_BLKS: !tt.ptr loc("arg_FULL_KV_NUM_BLKS"(#loc153)), %arg_FULL_KV_IDX: !tt.ptr loc("arg_FULL_KV_IDX"(#loc153)), %in_ptr9: !tt.ptr loc("in_ptr9"(#loc153)), %out_ptr0: !tt.ptr loc("out_ptr0"(#loc153)), %ks0: i32 loc("ks0"(#loc153)), %ks1: i32 loc("ks1"(#loc153)), %q: tensor<128x128xbf16> loc("q"(#loc153)), %K: !tt.ptr loc("K"(#loc153)), %V: !tt.ptr loc("V"(#loc153)), %Q_LEN: i32 loc("Q_LEN"(#loc153)), %KV_LEN: i32 loc("KV_LEN"(#loc153)), %acc: tensor<128x128xf32> loc("acc"(#loc153)), %l_i: tensor<128xf32> loc("l_i"(#loc153)), %m_i: tensor<128xf32> loc("m_i"(#loc153)), %off_z: i32 loc("off_z"(#loc153)), %off_h: i32 loc("off_h"(#loc153)), %offs_m: tensor<128x1xi32> loc("offs_m"(#loc153)), %offs_n: tensor<1x64xi32> loc("offs_n"(#loc153)), %kv_start: i32 loc("kv_start"(#loc153)), %kv_offset: i32 loc("kv_offset"(#loc153)), %stride_kk: i32 loc("stride_kk"(#loc153)), %stride_kn: i32 loc("stride_kn"(#loc153)), %stride_vn: i32 loc("stride_vn"(#loc153)), %stride_vk: i32 loc("stride_vk"(#loc153))) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>) attributes {noinline = false} { + %kv_base_offset = arith.addi %kv_start, %kv_offset : i32 loc(#loc459) + %offs_k = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc460) + %offs_n_load = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc461) + %offs_n_load_0 = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc462) + %offs_n_load_1 = arith.addi %offs_n_load_0, %offs_n_load : tensor<64xi32> loc(#loc462) + %k = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%K, %offs_n_load_1, %offs_k, %stride_kn, %stride_kk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc463) + %k_2 = tt.trans %k {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc464) + %qk = arith.constant 0.000000e+00 : f32 loc(#loc465) + %qk_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc465) + %qk_4 = tt.dot %q, %k_2, %qk_3, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc465) + %qk_5 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_6 = arith.constant 0.0883883461 : f32 loc(#loc466) + %qk_7 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc466) + %qk_8 = arith.mulf %qk_4, %qk_7 : tensor<128x64xf32> loc(#loc466) + %m = tt.call @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S128_1S_i32__(%offs_m, %Q_LEN) : (tensor<128x1xi32>, i32) -> tensor<128x1xi32> loc(#loc467) + %n = tt.call @torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.get_bounded_indices__i32S1_64S_i32__(%offs_n, %KV_LEN) : (tensor<1x64xi32>, i32) -> tensor<1x64xi32> loc(#loc468) + %post_mod_scores = tt.splat %KV_LEN : i32 -> tensor<1x64xi32> loc(#loc469) + %post_mod_scores_9 = arith.cmpi slt, %offs_n, %post_mod_scores : tensor<1x64xi32> loc(#loc469) + %post_mod_scores_10 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_11 = arith.constant 0xFF800000 : f32 loc(#loc470) + %post_mod_scores_12 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc470) + %post_mod_scores_13 = tt.broadcast %post_mod_scores_9 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc470) + %post_mod_scores_14 = arith.select %post_mod_scores_13, %qk_8, %post_mod_scores_12 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc470) + %post_mod_scores_15 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_16 = arith.constant 1.44269502 : f32 loc(#loc510) + %post_mod_scores_17 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc510) + %post_mod_scores_18 = arith.mulf %post_mod_scores_14, %post_mod_scores_17 : tensor<128x64xf32> loc(#loc510) + %m_ij = tt.call @"triton.language.standard.max__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cconstexpr_True__(4,)cconstexpr_False_"(%post_mod_scores_18) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc511) + %m_ij_19 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc512) + %masked_out_rows = arith.constant 0xFF800000 : f32 loc(#loc513) + %masked_out_rows_20 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc513) + %masked_out_rows_21 = arith.cmpf oeq, %m_ij_19, %masked_out_rows_20 : tensor<128xf32> loc(#loc513) + %m_ij_masked = arith.constant 0 : i32 loc(#loc514) + %m_ij_masked_22 = arith.constant 0.000000e+00 : f32 loc(#loc514) + %m_ij_masked_23 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc514) + %m_ij_masked_24 = arith.select %masked_out_rows_21, %m_ij_masked_23, %m_ij_19 : tensor<128xi1>, tensor<128xf32> loc(#loc514) + %alpha = arith.subf %m_i, %m_ij_masked_24 : tensor<128xf32> loc(#loc515) + %alpha_25 = math.exp2 %alpha : tensor<128xf32> loc(#loc516) + %p = tt.expand_dims %m_ij_masked_24 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc517) + %p_26 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc518) + %p_27 = arith.subf %post_mod_scores_18, %p_26 : tensor<128x64xf32> loc(#loc518) + %p_28 = math.exp2 %p_27 : tensor<128x64xf32> loc(#loc519) + %l_i_29 = arith.mulf %l_i, %alpha_25 : tensor<128xf32> loc(#loc520) + %l_i_30 = tt.call @"triton.language.standard.sum__fp32S128_64S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%p_28) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc521) + %l_i_31 = arith.addf %l_i_29, %l_i_30 : tensor<128xf32> loc(#loc522) + %acc_32 = tt.expand_dims %alpha_25 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc523) + %acc_33 = tt.broadcast %acc_32 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc524) + %acc_34 = arith.mulf %acc, %acc_33 : tensor<128x128xf32> loc(#loc524) + %offs_v = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc525) + %v = tt.call @"torch._inductor.runtime.compile_tasks.cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.load_checked_2d__Pbf16_i32S64S_i32S128S_i32_i32_i32__(5,)cconstexpr_False__(6,)cconstexpr_True__(8,)cconstexpr_128_"(%V, %offs_n_load_1, %offs_v, %stride_vn, %stride_vk, %KV_LEN) : (!tt.ptr, tensor<64xi32>, tensor<128xi32>, i32, i32, i32) -> tensor<64x128xbf16> loc(#loc526) + %acc_35 = arith.truncf %p_28 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc527) + %acc_36 = arith.constant 0.000000e+00 : f32 loc(#loc528) + %acc_37 = tt.dot %acc_35, %v, %acc_34, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc528) + tt.return %acc_37, %l_i_31, %m_ij_19 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc224) + ^bb1: // no predecessors + %0 = ub.poison : tensor<128x128xf32> loc(#loc225) + %1 = ub.poison : tensor<128xf32> loc(#loc225) + %2 = ub.poison : tensor<128xf32> loc(#loc225) + tt.return %0, %1, %2 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32> loc(#loc225) + } loc(#loc153) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":85:49) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:49) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":87:54) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":87:63) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":87:49) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":89:9) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":90:9) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":91:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":92:10) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":97:28) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":98:27) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":99:27) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":103:23) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":104:24) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":105:21) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:36) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:25) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:47) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:37) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":109:25) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":109:47) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":109:37) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":111:12) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":112:12) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":113:12) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":120:15) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":121:16) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":123:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":124:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":129:27) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":130:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":134:19) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":134:50) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":135:19) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":136:19) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":138:23) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":138:46) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":138:33) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":141:38) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":141:50) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:51) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:85) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:74) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:46) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:76) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:97) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:64) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:23) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:46) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:33) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":145:26) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":146:101) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":151:26) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:37) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:42) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:28) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:45) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:92) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:102) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:65) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:37) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:24) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":167:31) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":167:48) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":172:41) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":181:35) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:27) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:41) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:51) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:49) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:96) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:106) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:69) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":186:41) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":186:28) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":193:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":193:52) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":198:45) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:26) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:34) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:20) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:16) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":209:27) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":210:27) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":211:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":212:25) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":212:45) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:38) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:30) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:21) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:40) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:33) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:57) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":217:49) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:53) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:49) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:67) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:62) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:83) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:75) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:109) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:26) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:31) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:32) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:23) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:40) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:33) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:20) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:48) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:29) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":229:4) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:27) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:38) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:20) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:56) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:67) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:49) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:41) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:52) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:23) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:15) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":287:4) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:16) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:11) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:4) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":499:16) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":502:40) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":538:16) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":545:63) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":548:26) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":549:21) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":549:8) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":552:11) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":552:4) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":342:32) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":345:26) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":346:48) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":346:35) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":347:107) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":349:17) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":351:19) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":353:14) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":358:36) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":359:36) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:44) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:69) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":370:35) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":373:23) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":374:23) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:33) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:23) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":377:22) +#loc172 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":378:23) +#loc173 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":379:23) +#loc174 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":380:23) +#loc175 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":381:23) +#loc176 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":382:23) +#loc177 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":383:35) +#loc178 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":384:24) +#loc179 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":385:24) +#loc180 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":386:32) +#loc181 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":387:25) +#loc182 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":388:92) +#loc183 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":389:92) +#loc184 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":390:25) +#loc185 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":391:24) +#loc186 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":392:24) +#loc187 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":393:39) +#loc188 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":394:25) +#loc189 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":395:24) +#loc190 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":396:24) +#loc191 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":397:23) +#loc192 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":398:25) +#loc193 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":399:25) +#loc194 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":400:92) +#loc195 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":401:25) +#loc196 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":402:24) +#loc197 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":403:24) +#loc198 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":404:39) +#loc199 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":405:25) +#loc200 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":406:24) +#loc201 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":407:24) +#loc202 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":412:48) +#loc203 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":412:73) +#loc204 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":414:69) +#loc205 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":417:27) +#loc206 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:51) +#loc207 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:27) +#loc208 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":423:35) +#loc209 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":424:51) +#loc210 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:31) +#loc211 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:25) +#loc212 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:51) +#loc213 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:39) +#loc214 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:21) +#loc215 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:16) +#loc216 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:34) +#loc217 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:24) +#loc218 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:22) +#loc219 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:16) +#loc220 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":438:26) +#loc221 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":439:107) +#loc222 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:22) +#loc223 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:44) +#loc224 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":445:11) +#loc225 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":445:4) +#loc227 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":257:21) +#loc228 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":257:11) +#loc229 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":257:4) +#loc231 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc233 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:15) +#loc234 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":177:4) +#loc236 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc237 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:11) +#loc238 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:4) +#loc240 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc241 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc242 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc244 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc245 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc246 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc248 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":247:33) +#loc249 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:38) +#loc250 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:24) +#loc251 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:109) +#loc252 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:113) +#loc253 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:39) +#loc254 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:55) +#loc255 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:25) +#loc256 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:30) +#loc257 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:35) +#loc258 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:60) +#loc259 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:34) +#loc260 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:48) +#loc261 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:63) +#loc262 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:29) +#loc263 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:47) +#loc264 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:61) +#loc265 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:42) +#loc266 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":253:11) +#loc267 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":253:4) +#loc281 = loc("ZQ"(#loc8)) +#loc282 = loc("HQ"(#loc9)) +#loc283 = loc("Q_LEN"(#loc10)) +#loc284 = loc("ZKV"(#loc11)) +#loc285 = loc("q_start"(#loc12)) +#loc286 = loc("off_zq"(#loc13)) +#loc287 = loc("off_hq"(#loc14)) +#loc288 = loc("off_zkv"(#loc15)) +#loc289 = loc("off_hkv"(#loc16)) +#loc290 = loc("off_g"(#loc17)) +#loc291 = loc("q_offset"(#loc18)) +#loc292 = loc("q_offset"(#loc19)) +#loc293 = loc("q_offset"(#loc20)) +#loc294 = loc("k_offset"(#loc21)) +#loc295 = loc("k_offset"(#loc22)) +#loc296 = loc("k_offset"(#loc23)) +#loc297 = loc("v_offset"(#loc24)) +#loc298 = loc("v_offset"(#loc25)) +#loc299 = loc("v_offset"(#loc26)) +#loc300 = loc("Q"(#loc27)) +#loc301 = loc("K"(#loc28)) +#loc302 = loc("V"(#loc29)) +#loc303 = loc("SPARSE_Z"(#loc30)) +#loc304 = loc("SPARSE_HQ"(#loc31)) +#loc305 = loc("sparse_idx_z"(#loc32)) +#loc306 = loc("sparse_idx_hq"(#loc33)) +#loc307 = loc("stride_kv_num_blks_h"(#loc34)) +#loc308 = loc("stride_kv_idx_h"(#loc35)) +#loc309 = loc("m_i"(#loc36)) +#loc310 = loc("m_i"(#loc37)) +#loc311 = loc("l_i"(#loc38)) +#loc312 = loc("acc"(#loc39)) +#loc313 = loc("offs_m"(#loc40)) +#loc314 = loc("offs_m"(#loc41)) +#loc315 = loc("offs_m"(#loc42)) +#loc316 = loc("sparse_hz_offset"(#loc43)) +#loc317 = loc("sparse_hz_offset"(#loc44)) +#loc318 = loc("sparse_kv_num_blks_offset"(#loc45)) +#loc319 = loc("sparse_kv_num_blks_offset"(#loc46)) +#loc320 = loc("sparse_kv_num_blks_offset"(#loc47)) +#loc321 = loc("sparse_kv_idx_offset"(#loc48)) +#loc322 = loc("sparse_kv_idx_offset"(#loc49)) +#loc323 = loc("sparse_kv_idx_offset"(#loc50)) +#loc324 = loc("sparse_kv_idx_offset"(#loc51)) +#loc325 = loc("offs_m"(#loc52)) +#loc326 = loc("offs_m"(#loc53)) +#loc327 = loc("offs_m"(#loc54)) +#loc328 = loc("offs_k"(#loc55)) +#loc329 = loc("q"(#loc56)) +#loc330 = loc("kv_indices"(#loc57)) +#loc331 = loc("kv_start"(#loc58)) +#loc332 = loc("kv_start"(#loc59)) +#loc333 = loc("kv_num_blocks"(#loc60)) +#loc334 = loc("kv_num_blocks"(#loc61)) +#loc335 = loc("block_n_end"(#loc62)) +#loc336 = loc("block_n_end"(#loc63)) +#loc337 = loc("block_n_end"(#loc64)) +#loc338 = loc("block_n_end"(#loc65)) +#loc339 = loc("offs_n"(#loc66)) +#loc340 = loc("offs_n"(#loc67)) +#loc341 = loc("kv_indices"(#loc71)) +#loc342 = loc("kv_start"(#loc72)) +#loc343 = loc("kv_start"(#loc73)) +#loc344 = loc("kv_num_blocks"(#loc74)) +#loc345 = loc("kv_num_blocks"(#loc75)) +#loc346 = loc("block_n_end"(#loc76)) +#loc347 = loc("block_n_end"(#loc77)) +#loc348 = loc("block_n_end"(#loc78)) +#loc349 = loc("block_n_end"(#loc79)) +#loc350 = loc("offs_n"(#loc80)) +#loc351 = loc("offs_n"(#loc81)) +#loc352 = loc("l_i"(#loc85)) +#loc353 = loc("l_i"(#loc86)) +#loc354 = loc("acc"(#loc87)) +#loc355 = loc("acc"(#loc88)) +#loc356 = loc("idx_zq"(#loc89)) +#loc357 = loc("idx_hq"(#loc90)) +#loc358 = loc("idx_m"(#loc91)) +#loc359 = loc("idx_d"(#loc92)) +#loc360 = loc("idx_d"(#loc93)) +#loc361 = loc("mask"(#loc94)) +#loc362 = loc("mask"(#loc95)) +#loc363 = loc("mask"(#loc96)) +#loc364 = loc("xindex"(#loc97)) +#loc365 = loc("xindex"(#loc98)) +#loc366 = loc("xindex"(#loc99)) +#loc367 = loc("xindex"(#loc100)) +#loc368 = loc("xindex"(#loc101)) +#loc369 = loc("xindex"(#loc102)) +#loc370 = loc("off_hz"(#loc111)) +#loc371 = loc("off_hz"(#loc112)) +#loc372 = loc("l_ptrs"(#loc113)) +#loc373 = loc("l_ptrs"(#loc114)) +#loc374 = loc("l_ptrs"(#loc115)) +#loc375 = loc("lse"(#loc116)) +#loc376 = loc("lse"(#loc117)) +#loc383 = loc("ptr"(#loc126)) +#loc384 = loc("ptr"(#loc127)) +#loc385 = loc("ptr"(#loc128)) +#loc386 = loc("ptr"(#loc129)) +#loc387 = loc("ptr"(#loc130)) +#loc388 = loc("ptr"(#loc131)) +#loc423 = loc("kv_offset"(#loc144)) +#loc424 = loc("acc"(#loc145)) +#loc425 = loc("offset"(#loc147)) +#loc426 = loc("offs_n"(#loc148)) +#loc427 = loc("kv_offset"(#loc149)) +#loc459 = loc("kv_base_offset"(#loc154)) +#loc460 = loc("offs_k"(#loc155)) +#loc461 = loc("offs_n_load"(#loc156)) +#loc462 = loc("offs_n_load"(#loc157)) +#loc463 = loc("k"(#loc158)) +#loc464 = loc("k"(#loc159)) +#loc465 = loc("qk"(#loc160)) +#loc466 = loc("qk"(#loc161)) +#loc467 = loc("m"(#loc162)) +#loc468 = loc("n"(#loc163)) +#loc469 = loc("post_mod_scores"(#loc164)) +#loc470 = loc("post_mod_scores"(#loc165)) +#loc471 = loc("tmp1"(#loc166)) +#loc472 = loc("tmp4"(#loc167)) +#loc473 = loc("tmp5"(#loc168)) +#loc474 = loc("tmp7"(#loc169)) +#loc475 = loc("tmp7"(#loc170)) +#loc476 = loc("tmp8"(#loc171)) +#loc477 = loc("tmp9"(#loc172)) +#loc478 = loc("tmp10"(#loc173)) +#loc479 = loc("tmp11"(#loc174)) +#loc480 = loc("tmp12"(#loc175)) +#loc481 = loc("tmp13"(#loc176)) +#loc482 = loc("tmp14"(#loc177)) +#loc483 = loc("tmp15"(#loc178)) +#loc484 = loc("tmp16"(#loc179)) +#loc485 = loc("tmp17"(#loc180)) +#loc486 = loc("tmp18"(#loc181)) +#loc487 = loc("tmp19"(#loc182)) +#loc488 = loc("tmp20"(#loc183)) +#loc489 = loc("tmp21"(#loc184)) +#loc490 = loc("tmp22"(#loc185)) +#loc491 = loc("tmp23"(#loc186)) +#loc492 = loc("tmp24"(#loc187)) +#loc493 = loc("tmp25"(#loc188)) +#loc494 = loc("tmp26"(#loc189)) +#loc495 = loc("tmp27"(#loc190)) +#loc496 = loc("tmp28"(#loc191)) +#loc497 = loc("tmp29"(#loc192)) +#loc498 = loc("tmp30"(#loc193)) +#loc499 = loc("tmp31"(#loc194)) +#loc500 = loc("tmp32"(#loc195)) +#loc501 = loc("tmp33"(#loc196)) +#loc502 = loc("tmp34"(#loc197)) +#loc503 = loc("tmp35"(#loc198)) +#loc504 = loc("tmp36"(#loc199)) +#loc505 = loc("tmp37"(#loc200)) +#loc506 = loc("tmp38"(#loc201)) +#loc507 = loc("mask_mod_output"(#loc202)) +#loc508 = loc("mask_mod_output"(#loc203)) +#loc509 = loc("post_mod_scores"(#loc204)) +#loc510 = loc("post_mod_scores"(#loc205)) +#loc511 = loc("m_ij"(#loc206)) +#loc512 = loc("m_ij"(#loc207)) +#loc513 = loc("masked_out_rows"(#loc208)) +#loc514 = loc("m_ij_masked"(#loc209)) +#loc515 = loc("alpha"(#loc210)) +#loc516 = loc("alpha"(#loc211)) +#loc517 = loc("p"(#loc212)) +#loc518 = loc("p"(#loc213)) +#loc519 = loc("p"(#loc214)) +#loc520 = loc("l_i"(#loc215)) +#loc521 = loc("l_i"(#loc216)) +#loc522 = loc("l_i"(#loc217)) +#loc523 = loc("acc"(#loc218)) +#loc524 = loc("acc"(#loc219)) +#loc525 = loc("offs_v"(#loc220)) +#loc526 = loc("v"(#loc221)) +#loc527 = loc("acc"(#loc222)) +#loc528 = loc("acc"(#loc223)) +#loc540 = loc("cur_block_idx"(#loc248)) +#loc541 = loc("cur_block"(#loc249)) +#loc542 = loc("cur_block"(#loc250)) +#loc543 = loc("next_block"(#loc251)) +#loc544 = loc("next_block"(#loc252)) +#loc545 = loc("next_block"(#loc253)) +#loc546 = loc("next_block"(#loc254)) +#loc547 = loc("next_block"(#loc255)) +#loc548 = loc("needs_jump"(#loc256)) +#loc549 = loc("needs_jump"(#loc257)) +#loc550 = loc("needs_jump"(#loc258)) +#loc551 = loc("jump_to_block"(#loc259)) +#loc552 = loc("jump_to_block"(#loc260)) +#loc553 = loc("jump_to_block"(#loc261)) +#loc554 = loc("offset"(#loc262)) +#loc555 = loc("offset"(#loc263)) +#loc556 = loc("offset"(#loc264)) +#loc557 = loc("offset"(#loc265)) +#loc558 = loc("l_i"(#loc424)) +#loc559 = loc("m_i"(#loc558)) +#loc560 = loc("offs_n"(#loc559)) +#loc561 = loc("kv_offset"(#loc560)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..90ada1beed7efba663666d5e70413c650b37a48e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttgir @@ -0,0 +1,983 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":18:0) +#loc1 = loc(unknown) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":538:16) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":172:41) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:51) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:34) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":198:45) +#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}> +#mma1 = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 128, 16]}> +#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> +#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}> +#smem = #ttg.shared_memory +#loc167 = loc("arg_Q"(#loc)) +#loc168 = loc("arg_K"(#loc)) +#loc169 = loc("arg_V"(#loc)) +#loc170 = loc("arg_LSE"(#loc)) +#loc171 = loc("arg_MAX"(#loc)) +#loc172 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc173 = loc("arg_KV_IDX"(#loc)) +#loc174 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc175 = loc("arg_FULL_KV_IDX"(#loc)) +#loc176 = loc("in_ptr9"(#loc)) +#loc177 = loc("out_ptr0"(#loc)) +#loc178 = loc("ks0"(#loc)) +#loc179 = loc("ks1"(#loc)) +#loc221 = loc(callsite(#loc50 at #loc51)) +#loc265 = loc("m_ij"(#loc97)) +#loc275 = loc("l_i"(#loc109)) +#loc309 = loc(callsite(#loc50 at #loc145)) +#loc376 = loc(callsite(#loc265 at #loc221)) +#loc386 = loc(callsite(#loc275 at #loc221)) +#loc405 = loc(callsite(#loc265 at #loc309)) +#loc415 = loc(callsite(#loc275 at #loc309)) +#loc437 = loc(callsite(#loc1 at #loc376)) +#loc439 = loc(callsite(#loc1 at #loc386)) +#loc467 = loc(callsite(#loc1 at #loc405)) +#loc469 = loc(callsite(#loc1 at #loc415)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<128x1xi32, #mma> loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_2 = arith.constant dense : tensor<128x64xi1, #mma> loc(#loc1) + %cst_3 = arith.constant dense<2048> : tensor<128x64xi32, #mma> loc(#loc1) + %cst_4 = arith.constant dense<2048> : tensor<1x64xi32, #mma> loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<4096> : tensor<128x1xi32, #blocked> loc(#loc1) + %cst_8 = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_9 = arith.constant dense<2048> : tensor<128x1xi32, #blocked> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x128xbf16, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<2048> : tensor<128xi32, #blocked1> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_13 = arith.constant dense<0.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_14 = arith.constant dense<1.000000e+00> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %cst_15 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> loc(#loc1) + %cst_16 = arith.constant dense<0.0883883461> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_17 = arith.constant dense<0xFF800000> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_18 = arith.constant dense<1.44269502> : tensor<128x64xf32, #mma> loc(#loc1) + %cst_19 = arith.constant dense<0xFF800000> : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc1) + %c-1_i32 = arith.constant -1 : i32 loc(#loc1) + %c3_i32 = arith.constant 3 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc2) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc3) + %q_start = tt.get_program_id x : i32 loc(#loc180) + %off_zq = tt.get_program_id y : i32 loc(#loc181) + %off_hq = tt.get_program_id z : i32 loc(#loc182) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc183) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc184) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc185) + %q_offset_20 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc186) + %q_offset_21 = arith.addi %q_offset, %q_offset_20 : i32 loc(#loc187) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc188) + %k_offset_22 = arith.muli %off_hkv, %1 : i32 loc(#loc189) + %k_offset_23 = arith.addi %k_offset, %k_offset_22 : i32 loc(#loc190) + %Q = tt.addptr %arg_Q, %q_offset_21 : !tt.ptr, i32 loc(#loc191) + %K = tt.addptr %arg_K, %k_offset_23 : !tt.ptr, i32 loc(#loc192) + %V = tt.addptr %arg_V, %k_offset_23 : !tt.ptr, i32 loc(#loc193) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc194) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc195) + %sparse_kv_num_blks_offset_24 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc196) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc197) + %sparse_kv_idx_offset_25 = arith.muli %q_start, %ks1 : i32 loc(#loc198) + %sparse_kv_idx_offset_26 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_25 : i32 loc(#loc199) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc200) + %offs_m_27 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc201) + %offs_m_28 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc201) + %offs_m_29 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked1> loc(#loc201) + %offs_m_30 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc202) + %offs_m_31 = tt.splat %offs_m : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc202) + %offs_m_32 = tt.splat %offs_m : i32 -> tensor<128xi32, #blocked1> loc(#loc202) + %offs_m_33 = arith.addi %offs_m_30, %offs_m_27 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc202) + %offs_m_34 = arith.addi %offs_m_31, %offs_m_28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc202) + %offs_m_35 = arith.addi %offs_m_32, %offs_m_29 : tensor<128xi32, #blocked1> loc(#loc202) + %ptr = tt.expand_dims %offs_m_33 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked> loc(#loc323) + %ptr_36 = tt.expand_dims %offs_m_34 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xi32, #mma> loc(#loc323) + %ptr_37 = arith.muli %ptr, %cst_7 : tensor<128x1xi32, #blocked> loc(#loc324) + %ptr_38 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> loc(#loc325) + %ptr_39 = tt.addptr %ptr_38, %ptr_37 : tensor<128x1x!tt.ptr, #blocked>, tensor<128x1xi32, #blocked> loc(#loc325) + %ptr_40 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc326) + %ptr_41 = tt.expand_dims %ptr_40 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc326) + %ptr_42 = tt.broadcast %ptr_39 : tensor<128x1x!tt.ptr, #blocked> -> tensor<128x128x!tt.ptr, #blocked> loc(#loc327) + %ptr_43 = tt.broadcast %ptr_41 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc327) + %ptr_44 = tt.addptr %ptr_42, %ptr_43 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc327) + %q = arith.cmpi slt, %ptr, %cst_9 : tensor<128x1xi32, #blocked> loc(#loc328) + %q_45 = tt.broadcast %q : tensor<128x1xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc329) + %q_46 = tt.load %ptr_44, %q_45, %cst_10 : tensor<128x128x!tt.ptr, #blocked> loc(#loc329) + %q_47 = ttg.local_alloc %q_46 : (tensor<128x128xbf16, #blocked>) -> !ttg.memdesc<128x128xbf16, #shared, #smem> loc(#loc329) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_26 : !tt.ptr, i32 loc(#loc209) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc210) + %kv_start_48 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc211) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_24 : !tt.ptr, i32 loc(#loc212) + %kv_num_blocks_49 = tt.load %kv_num_blocks : !tt.ptr loc(#loc213) + %block_n_end = arith.muli %kv_num_blocks_49, %c2_i32 : i32 loc(#loc214) + %block_n_end_50 = arith.addi %ks0, %c63_i32 : i32 loc(#loc330) + %block_n_end_51 = arith.divsi %block_n_end_50, %c64_i32 : i32 loc(#loc331) + %block_n_end_52 = arith.maxsi %block_n_end_51, %c1_i32 : i32 loc(#loc216) + %block_n_end_53 = arith.minsi %block_n_end, %block_n_end_52 : i32 loc(#loc217) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc218) + %offs_n_54 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc218) + %offs_n_55 = tt.splat %kv_start_48 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc219) + %offs_n_56 = arith.addi %offs_n_55, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc219) + %2 = tt.expand_dims %offs_n_56 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc48) + %ptr_57 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc424) + %ptr_58 = tt.broadcast %ptr_41 : tensor<1x128xi32, #blocked> -> tensor<64x128xi32, #blocked> loc(#loc425) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32, #blocked> loc(#loc426) + %m = arith.remsi %ptr_36, %cst : tensor<128x1xi32, #mma> loc(#loc427) + %n = tt.splat %ks0 : i32 -> tensor<1x64xi32, #mma> loc(#loc428) + %tmp4 = tt.broadcast %m : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc335) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc336) + %kv_offset = arith.cmpi sgt, %block_n_end_53, %c0_i32 : i32 loc(#loc493) + %tmp7_59 = tt.load %tmp7, %kv_offset : !tt.ptr loc(#loc338) + %tmp8 = tt.splat %tmp7_59 : i64 -> tensor<1x64xi64, #mma> loc(#loc339) + %tmp9 = arith.extsi %m : tensor<128x1xi32, #mma> to tensor<128x1xi64, #mma> loc(#loc340) + %tmp10 = tt.splat %tmp7_59 : i64 -> tensor<128x1xi64, #mma> loc(#loc341) + %tmp10_60 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64, #mma> loc(#loc341) + %tmp11 = tt.broadcast %tmp10_60 : tensor<128x1xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc342) + %ptr_61 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked> loc(#loc430) + %k_62 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc431) + %v = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc432) + %offs_n_load = tt.splat %kv_start_48 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_63 = arith.addi %offs_n_load, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_64 = tt.expand_dims %offs_n_load_63 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_65 = arith.muli %ptr_64, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_66 = tt.addptr %ptr_57, %ptr_65 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_67 = tt.broadcast %ptr_66 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_68 = tt.addptr %ptr_67, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_69 = arith.cmpi slt, %ptr_64, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_70 = tt.broadcast %k_69 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_71 = ttg.memdesc_index %k_62[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_72 = tt.splat %kv_offset : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_73 = arith.andi %kv_offset_72, %k_70 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_74 = ttg.async_copy_global_to_local %ptr_68, %k_71 mask %kv_offset_73 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_75 = ttg.async_commit_group tokens %k_74 loc(#loc431) + %ptr_76 = tt.addptr %ptr_61, %ptr_65 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_77 = tt.broadcast %ptr_76 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_78 = tt.addptr %ptr_77, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_79 = ttg.memdesc_index %v[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_80 = ttg.async_copy_global_to_local %ptr_78, %v_79 mask %kv_offset_73 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_81 = ttg.async_commit_group tokens %v_80 loc(#loc432) + %kv_offset_82 = arith.cmpi sgt, %block_n_end_53, %c1_i32 : i32 loc(#loc493) + %kv_base_offset = arith.addi %kv_start_48, %c64_i32 : i32 loc(#loc345) + %offs_n_load_83 = tt.splat %kv_base_offset : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_84 = arith.addi %offs_n_load_83, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_85 = tt.expand_dims %offs_n_load_84 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_86 = arith.muli %ptr_85, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_87 = tt.addptr %ptr_57, %ptr_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_88 = tt.broadcast %ptr_87 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_89 = tt.addptr %ptr_88, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_90 = arith.cmpi slt, %ptr_85, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_91 = tt.broadcast %k_90 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_92 = ttg.memdesc_index %k_62[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_93 = tt.splat %kv_offset_82 : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_94 = arith.andi %kv_offset_93, %k_91 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_95 = ttg.async_copy_global_to_local %ptr_89, %k_92 mask %kv_offset_94 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_96 = ttg.async_commit_group tokens %k_95 loc(#loc431) + %ptr_97 = tt.addptr %ptr_61, %ptr_86 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_98 = tt.broadcast %ptr_97 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_99 = tt.addptr %ptr_98, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_100 = ttg.memdesc_index %v[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_101 = ttg.async_copy_global_to_local %ptr_99, %v_100 mask %kv_offset_94 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_102 = ttg.async_commit_group tokens %v_101 loc(#loc432) + ttng.fence_async_shared {bCluster = false} loc(#loc346) + %kv_offset_103:12 = scf.for %kv_offset_174 = %c0_i32 to %block_n_end_53 step %c1_i32 iter_args(%acc_175 = %cst_15, %arg15 = %cst_13, %arg16 = %cst_19, %arg17 = %c64_i32, %arg18 = %2, %arg19 = %c1_i32, %arg20 = %c-1_i32, %k_176 = %k_75, %k_177 = %k_96, %v_178 = %v_81, %v_179 = %v_102, %arg25 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_180 = arith.subi %block_n_end_53, %c2_i32 : i32 loc(#loc493) + %kv_offset_181 = arith.cmpi slt, %kv_offset_174, %kv_offset_180 : i32 loc(#loc493) + %kv_offset_182 = arith.subi %block_n_end_53, %c1_i32 : i32 loc(#loc493) + %kv_offset_183 = arith.cmpi slt, %kv_offset_174, %kv_offset_182 : i32 loc(#loc493) + %kv_offset_184 = arith.addi %arg20, %c1_i32 : i32 loc(#loc493) + %kv_offset_185 = arith.cmpi sge, %kv_offset_184, %c3_i32 : i32 loc(#loc493) + %kv_offset_186 = arith.select %kv_offset_185, %c0_i32, %kv_offset_184 : i32 loc(#loc493) + %k_187 = ttg.async_wait %k_176, %v_178 {num = 2 : i32} loc(#loc431) + %k_188 = ttg.memdesc_index %k_62[%kv_offset_186] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_189 = ttg.memdesc_trans %k_188 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc347) + %qk = ttng.warp_group_dot %q_47, %k_189, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc346) + %qk_190:4 = ttng.warp_group_dot_wait %qk, %q_47, %k_189, %acc_175 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc346) + %qk_191 = arith.mulf %qk_190#0, %cst_16 : tensor<128x64xf32, #mma> loc(#loc348) + %n_192 = arith.remsi %arg18, %n : tensor<1x64xi32, #mma> loc(#loc428) + %post_mod_scores = arith.cmpi slt, %arg18, %n : tensor<1x64xi32, #mma> loc(#loc349) + %post_mod_scores_193 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc350) + %post_mod_scores_194 = arith.select %post_mod_scores_193, %qk_191, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc350) + %tmp4_195 = tt.broadcast %n_192 : tensor<1x64xi32, #mma> -> tensor<128x64xi32, #mma> loc(#loc335) + %tmp4_196 = arith.cmpi sge, %tmp4, %tmp4_195 : tensor<128x64xi32, #mma> loc(#loc335) + %tmp5 = arith.extsi %n_192 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc351) + %tmp8_197 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64, #mma> loc(#loc339) + %tmp11_198 = tt.broadcast %tmp8_197 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc342) + %tmp11_199 = arith.andi %tmp11_198, %tmp11 : tensor<128x64xi1, #mma> loc(#loc342) + %tmp12 = arith.andi %tmp4_196, %tmp11_199 : tensor<128x64xi1, #mma> loc(#loc352) + %tmp15 = arith.cmpi sge, %n_192, %cst_4 : tensor<1x64xi32, #mma> loc(#loc353) + %tmp16 = arith.remsi %n_192, %cst_4 : tensor<1x64xi32, #mma> loc(#loc354) + %tmp18 = arith.cmpi ne, %tmp16, %cst_0 : tensor<1x64xi32, #mma> loc(#loc355) + %tmp19 = arith.cmpi slt, %tmp16, %cst_0 : tensor<1x64xi32, #mma> loc(#loc356) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1, #mma> loc(#loc357) + %tmp23 = arith.addi %tmp16, %cst_4 : tensor<1x64xi32, #mma> loc(#loc358) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1, #mma>, tensor<1x64xi32, #mma> loc(#loc359) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32, #mma> to tensor<1x64xi64, #mma> loc(#loc360) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64, #mma> loc(#loc361) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1, #mma> loc(#loc362) + %tmp28 = arith.subi %tmp4_195, %tmp4 : tensor<128x64xi32, #mma> loc(#loc363) + %tmp29 = arith.remsi %tmp28, %cst_3 : tensor<128x64xi32, #mma> loc(#loc364) + %tmp30 = arith.cmpi ne, %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc365) + %tmp31 = arith.cmpi slt, %tmp29, %cst_1 : tensor<128x64xi32, #mma> loc(#loc366) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1, #mma> loc(#loc367) + %tmp34 = arith.addi %tmp29, %cst_3 : tensor<128x64xi32, #mma> loc(#loc368) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1, #mma>, tensor<128x64xi32, #mma> loc(#loc369) + %tmp36 = arith.cmpi eq, %tmp35, %cst_1 : tensor<128x64xi32, #mma> loc(#loc370) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc371) + %tmp37_200 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1, #mma> loc(#loc371) + %tmp38 = arith.ori %tmp12, %tmp37_200 : tensor<128x64xi1, #mma> loc(#loc372) + %mask_mod_output = arith.select %post_mod_scores_193, %tmp38, %cst_2 : tensor<128x64xi1, #mma>, tensor<128x64xi1, #mma> loc(#loc373) + %post_mod_scores_201 = arith.select %mask_mod_output, %post_mod_scores_194, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc374) + %post_mod_scores_202 = arith.mulf %post_mod_scores_201, %cst_18 : tensor<128x64xf32, #mma> loc(#loc375) + %m_ij = "tt.reduce"(%post_mod_scores_202) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_260: f32 loc(callsite(#loc1 at #loc376)), %m_ij_261: f32 loc(callsite(#loc1 at #loc376))): + %m_ij_262 = arith.maxnumf %m_ij_260, %m_ij_261 : f32 loc(#loc488) + tt.reduce.return %m_ij_262 : f32 loc(#loc436) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc436) + %m_ij_203 = arith.maxnumf %arg16, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc377) + %masked_out_rows = arith.cmpf oeq, %m_ij_203, %cst_19 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc378) + %m_ij_masked = arith.select %masked_out_rows, %cst_13, %m_ij_203 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc379) + %alpha = arith.subf %arg16, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc380) + %alpha_204 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc381) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc382) + %p_205 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc383) + %p_206 = arith.subf %post_mod_scores_202, %p_205 : tensor<128x64xf32, #mma> loc(#loc383) + %p_207 = math.exp2 %p_206 : tensor<128x64xf32, #mma> loc(#loc384) + %l_i_208 = arith.mulf %arg15, %alpha_204 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc385) + %l_i_209 = "tt.reduce"(%p_207) <{axis = 1 : i32}> ({ + ^bb0(%l_i_260: f32 loc(callsite(#loc1 at #loc386)), %l_i_261: f32 loc(callsite(#loc1 at #loc386))): + %l_i_262 = arith.addf %l_i_260, %l_i_261 : f32 loc(#loc489) + tt.reduce.return %l_i_262 : f32 loc(#loc438) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc438) + %l_i_210 = arith.addf %l_i_208, %l_i_209 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc387) + %acc_211 = tt.expand_dims %alpha_204 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc388) + %acc_212 = ttg.convert_layout %acc_211 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc389) + %acc_213 = tt.broadcast %acc_212 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc389) + %acc_214 = arith.mulf %qk_190#3, %acc_213 : tensor<128x128xf32, #mma1> loc(#loc389) + %v_215 = ttg.memdesc_index %v[%kv_offset_186] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %acc_216 = arith.truncf %p_207 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc390) + %acc_217 = ttg.convert_layout %acc_216 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc390) + %acc_218 = ttng.warp_group_dot %acc_217, %v_215, %acc_214 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc391) + %offs_n_219 = tt.splat %arg25 : i32 -> tensor<1x64xi32, #mma> loc(#loc392) + %offs_n_220 = arith.addi %arg18, %offs_n_219 : tensor<1x64xi32, #mma> loc(#loc392) + %kv_offset_221 = arith.addi %kv_offset_174, %c1_i32 : i32 loc(#loc493) + %cur_block_idx = arith.divsi %kv_offset_221, %c2_i32 : i32 loc(#loc440) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc441) + %cur_block_222 = tt.load %cur_block, %kv_offset_183 evictionPolicy = evict_last : !tt.ptr loc(#loc442) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc443) + %next_block_223 = arith.cmpi slt, %next_block, %kv_num_blocks_49 : i32 loc(#loc444) + %next_block_224 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc445) + %kv_offset_225 = arith.andi %kv_offset_183, %next_block_223 : i1 loc(#loc493) + %next_block_226 = tt.load %next_block_224, %kv_offset_225 evictionPolicy = evict_last : !tt.ptr loc(#loc446) + %needs_jump = arith.addi %kv_offset_174, %c2_i32 : i32 loc(#loc447) + %needs_jump_227 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc448) + %needs_jump_228 = arith.cmpi eq, %needs_jump_227, %c0_i32 : i32 loc(#loc449) + %jump_to_block = arith.subi %next_block_226, %cur_block_222 : i32 loc(#loc450) + %jump_to_block_229 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc451) + %jump_to_block_230 = arith.subi %jump_to_block_229, %c64_i32 : i32 loc(#loc452) + %offset = arith.extui %needs_jump_228 : i1 to i32 loc(#loc453) + %offset_231 = arith.muli %jump_to_block_230, %offset : i32 loc(#loc453) + %offset_232 = arith.subi %c1_i32, %offset : i32 loc(#loc454) + %offset_233 = arith.muli %offset_232, %c64_i32 : i32 loc(#loc455) + %offset_234 = arith.addi %offset_231, %offset_233 : i32 loc(#loc456) + %kv_offset_235 = arith.addi %arg17, %offset_234 : i32 loc(#loc394) + %kv_offset_236 = arith.addi %arg19, %c1_i32 : i32 loc(#loc493) + %kv_offset_237 = arith.cmpi sge, %kv_offset_236, %c3_i32 : i32 loc(#loc493) + %kv_offset_238 = arith.select %kv_offset_237, %c0_i32, %kv_offset_236 : i32 loc(#loc493) + %kv_base_offset_239 = arith.addi %kv_start_48, %kv_offset_235 : i32 loc(#loc345) + %offs_n_load_240 = tt.splat %kv_base_offset_239 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %offs_n_load_241 = arith.addi %offs_n_load_240, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc344) + %ptr_242 = tt.expand_dims %offs_n_load_241 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc433) + %ptr_243 = arith.muli %ptr_242, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc434) + %ptr_244 = tt.addptr %ptr_57, %ptr_243 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc424) + %ptr_245 = tt.broadcast %ptr_244 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc425) + %ptr_246 = tt.addptr %ptr_245, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc425) + %k_247 = arith.cmpi slt, %ptr_242, %k : tensor<64x1xi32, #blocked> loc(#loc426) + %k_248 = tt.broadcast %k_247 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc431) + %k_249 = ttg.memdesc_index %k_62[%kv_offset_238] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %kv_offset_250 = tt.splat %kv_offset_181 : i1 -> tensor<64x128xi1, #blocked> loc(#loc493) + %kv_offset_251 = arith.andi %kv_offset_250, %k_248 : tensor<64x128xi1, #blocked> loc(#loc493) + %k_252 = ttg.async_copy_global_to_local %ptr_246, %k_249 mask %kv_offset_251 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc431) + %k_253 = ttg.async_commit_group tokens %k_252 loc(#loc431) + %ptr_254 = tt.addptr %ptr_61, %ptr_243 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc430) + %ptr_255 = tt.broadcast %ptr_254 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc435) + %ptr_256 = tt.addptr %ptr_255, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc435) + %v_257 = ttg.memdesc_index %v[%kv_offset_238] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_258 = ttg.async_copy_global_to_local %ptr_256, %v_257 mask %kv_offset_251 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc432) + %v_259 = ttg.async_commit_group tokens %v_258 loc(#loc432) + scf.yield %acc_218, %l_i_210, %m_ij_203, %kv_offset_235, %offs_n_220, %kv_offset_238, %kv_offset_186, %k_177, %k_253, %v_179, %v_259, %offset_234 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc493) + } loc(#loc493) + %kv_offset_104 = ttng.warp_group_dot_wait %kv_offset_103#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc493) + %kv_offset_105 = ttg.async_wait {num = 0 : i32} loc(#loc493) + ttg.local_dealloc %v : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc493) + ttg.local_dealloc %k_62 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc493) + %kv_indices_106 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_26 : !tt.ptr, i32 loc(#loc301) + %kv_start_107 = tt.load %kv_indices_106 : !tt.ptr loc(#loc302) + %kv_start_108 = arith.muli %kv_start_107, %c128_i32 : i32 loc(#loc303) + %kv_num_blocks_109 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_24 : !tt.ptr, i32 loc(#loc304) + %kv_num_blocks_110 = tt.load %kv_num_blocks_109 : !tt.ptr loc(#loc305) + %block_n_end_111 = arith.muli %kv_num_blocks_110, %c2_i32 : i32 loc(#loc306) + %block_n_end_112 = arith.minsi %block_n_end_111, %block_n_end_52 : i32 loc(#loc307) + %offs_n_113 = tt.splat %kv_start_108 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc308) + %offs_n_114 = arith.addi %offs_n_113, %offs_n : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> loc(#loc308) + %3 = tt.expand_dims %offs_n_114 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #mma}>> -> tensor<1x64xi32, #mma> loc(#loc144) + %k_115 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc457) + %v_116 = ttg.local_alloc : () -> !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc458) + %kv_offset_117 = arith.cmpi sgt, %block_n_end_112, %c0_i32 : i32 loc(#loc494) + %offs_n_load_118 = tt.splat %kv_start_108 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_119 = arith.addi %offs_n_load_118, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_120 = tt.expand_dims %offs_n_load_119 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_121 = arith.muli %ptr_120, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_122 = tt.addptr %ptr_57, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_123 = tt.broadcast %ptr_122 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_124 = tt.addptr %ptr_123, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_125 = arith.cmpi slt, %ptr_120, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_126 = tt.broadcast %k_125 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_127 = ttg.memdesc_index %k_115[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_128 = tt.splat %kv_offset_117 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_129 = arith.andi %kv_offset_128, %k_126 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_130 = ttg.async_copy_global_to_local %ptr_124, %k_127 mask %kv_offset_129 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_131 = ttg.async_commit_group tokens %k_130 loc(#loc457) + %ptr_132 = tt.addptr %ptr_61, %ptr_121 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_133 = tt.broadcast %ptr_132 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_134 = tt.addptr %ptr_133, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_135 = ttg.memdesc_index %v_116[%c0_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_136 = ttg.async_copy_global_to_local %ptr_134, %v_135 mask %kv_offset_129 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_137 = ttg.async_commit_group tokens %v_136 loc(#loc458) + %kv_offset_138 = arith.cmpi sgt, %block_n_end_112, %c1_i32 : i32 loc(#loc494) + %kv_base_offset_139 = arith.addi %kv_start_108, %c64_i32 : i32 loc(#loc398) + %offs_n_load_140 = tt.splat %kv_base_offset_139 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_141 = arith.addi %offs_n_load_140, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_142 = tt.expand_dims %offs_n_load_141 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_143 = arith.muli %ptr_142, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_144 = tt.addptr %ptr_57, %ptr_143 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_145 = tt.broadcast %ptr_144 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_146 = tt.addptr %ptr_145, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_147 = arith.cmpi slt, %ptr_142, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_148 = tt.broadcast %k_147 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_149 = ttg.memdesc_index %k_115[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_150 = tt.splat %kv_offset_138 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_151 = arith.andi %kv_offset_150, %k_148 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_152 = ttg.async_copy_global_to_local %ptr_146, %k_149 mask %kv_offset_151 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_153 = ttg.async_commit_group tokens %k_152 loc(#loc457) + %ptr_154 = tt.addptr %ptr_61, %ptr_143 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_155 = tt.broadcast %ptr_154 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_156 = tt.addptr %ptr_155, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_157 = ttg.memdesc_index %v_116[%c1_i32] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_158 = ttg.async_copy_global_to_local %ptr_156, %v_157 mask %kv_offset_151 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_159 = ttg.async_commit_group tokens %v_158 loc(#loc458) + ttng.fence_async_shared {bCluster = false} loc(#loc399) + %kv_offset_160:12 = scf.for %kv_offset_174 = %c0_i32 to %block_n_end_112 step %c1_i32 iter_args(%kv_offset_175 = %kv_offset_104, %kv_offset_176 = %kv_offset_103#1, %kv_offset_177 = %kv_offset_103#2, %arg17 = %c64_i32, %arg18 = %3, %arg19 = %c1_i32, %arg20 = %c-1_i32, %k_178 = %k_131, %k_179 = %k_153, %v_180 = %v_137, %v_181 = %v_159, %arg25 = %c64_i32) -> (tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32) : i32 { + %kv_offset_182 = arith.subi %block_n_end_112, %c2_i32 : i32 loc(#loc494) + %kv_offset_183 = arith.cmpi slt, %kv_offset_174, %kv_offset_182 : i32 loc(#loc494) + %kv_offset_184 = arith.subi %block_n_end_112, %c1_i32 : i32 loc(#loc494) + %kv_offset_185 = arith.cmpi slt, %kv_offset_174, %kv_offset_184 : i32 loc(#loc494) + %kv_offset_186 = arith.addi %arg20, %c1_i32 : i32 loc(#loc494) + %kv_offset_187 = arith.cmpi sge, %kv_offset_186, %c3_i32 : i32 loc(#loc494) + %kv_offset_188 = arith.select %kv_offset_187, %c0_i32, %kv_offset_186 : i32 loc(#loc494) + %k_189 = ttg.async_wait %k_178, %v_180 {num = 2 : i32} loc(#loc457) + %k_190 = ttg.memdesc_index %k_115[%kv_offset_188] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_191 = ttg.memdesc_trans %k_190 {order = array} : !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> loc(#loc400) + %qk = ttng.warp_group_dot %q_47, %k_191, %cst_5 {inputPrecision = 0 : i32, isAsync = true} : !ttg.memdesc<128x128xbf16, #shared, #smem> * !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64> -> tensor<128x64xf32, #mma> loc(#loc399) + %qk_192:4 = ttng.warp_group_dot_wait %qk, %q_47, %k_191, %kv_offset_175 {pendings = 0 : i32} : tensor<128x64xf32, #mma>, !ttg.memdesc<128x128xbf16, #shared, #smem>, !ttg.memdesc<128x64xbf16, #shared1, #smem, mutable, 3x128x64>, tensor<128x128xf32, #mma1> loc(#loc399) + %qk_193 = arith.mulf %qk_192#0, %cst_16 : tensor<128x64xf32, #mma> loc(#loc401) + %post_mod_scores = arith.cmpi slt, %arg18, %n : tensor<1x64xi32, #mma> loc(#loc402) + %post_mod_scores_194 = tt.broadcast %post_mod_scores : tensor<1x64xi1, #mma> -> tensor<128x64xi1, #mma> loc(#loc403) + %post_mod_scores_195 = arith.select %post_mod_scores_194, %qk_193, %cst_17 : tensor<128x64xi1, #mma>, tensor<128x64xf32, #mma> loc(#loc403) + %post_mod_scores_196 = arith.mulf %post_mod_scores_195, %cst_18 : tensor<128x64xf32, #mma> loc(#loc404) + %m_ij = "tt.reduce"(%post_mod_scores_196) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_254: f32 loc(callsite(#loc1 at #loc405)), %m_ij_255: f32 loc(callsite(#loc1 at #loc405))): + %m_ij_256 = arith.maxnumf %m_ij_254, %m_ij_255 : f32 loc(#loc490) + tt.reduce.return %m_ij_256 : f32 loc(#loc466) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc466) + %m_ij_197 = arith.maxnumf %kv_offset_177, %m_ij : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc406) + %masked_out_rows = arith.cmpf oeq, %m_ij_197, %cst_19 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc407) + %m_ij_masked = arith.select %masked_out_rows, %cst_13, %m_ij_197 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc408) + %alpha = arith.subf %kv_offset_177, %m_ij_masked : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc409) + %alpha_198 = math.exp2 %alpha : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc410) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc411) + %p_199 = tt.broadcast %p : tensor<128x1xf32, #mma> -> tensor<128x64xf32, #mma> loc(#loc412) + %p_200 = arith.subf %post_mod_scores_196, %p_199 : tensor<128x64xf32, #mma> loc(#loc412) + %p_201 = math.exp2 %p_200 : tensor<128x64xf32, #mma> loc(#loc413) + %l_i_202 = arith.mulf %kv_offset_176, %alpha_198 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc414) + %l_i_203 = "tt.reduce"(%p_201) <{axis = 1 : i32}> ({ + ^bb0(%l_i_254: f32 loc(callsite(#loc1 at #loc415)), %l_i_255: f32 loc(callsite(#loc1 at #loc415))): + %l_i_256 = arith.addf %l_i_254, %l_i_255 : f32 loc(#loc491) + tt.reduce.return %l_i_256 : f32 loc(#loc468) + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc468) + %l_i_204 = arith.addf %l_i_202, %l_i_203 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc416) + %acc_205 = tt.expand_dims %alpha_198 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc417) + %acc_206 = ttg.convert_layout %acc_205 : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc418) + %acc_207 = tt.broadcast %acc_206 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc418) + %acc_208 = arith.mulf %qk_192#3, %acc_207 : tensor<128x128xf32, #mma1> loc(#loc418) + %v_209 = ttg.memdesc_index %v_116[%kv_offset_188] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %acc_210 = arith.truncf %p_201 : tensor<128x64xf32, #mma> to tensor<128x64xbf16, #mma> loc(#loc419) + %acc_211 = ttg.convert_layout %acc_210 : tensor<128x64xbf16, #mma> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> loc(#loc419) + %acc_212 = ttng.warp_group_dot %acc_211, %v_209, %acc_208 {inputPrecision = 0 : i32, isAsync = true} : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 2}>> * !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> -> tensor<128x128xf32, #mma1> loc(#loc420) + %offs_n_213 = tt.splat %arg25 : i32 -> tensor<1x64xi32, #mma> loc(#loc421) + %offs_n_214 = arith.addi %arg18, %offs_n_213 : tensor<1x64xi32, #mma> loc(#loc421) + %kv_offset_215 = arith.addi %kv_offset_174, %c1_i32 : i32 loc(#loc494) + %cur_block_idx = arith.divsi %kv_offset_215, %c2_i32 : i32 loc(#loc470) + %cur_block = tt.addptr %kv_indices_106, %cur_block_idx : !tt.ptr, i32 loc(#loc471) + %cur_block_216 = tt.load %cur_block, %kv_offset_185 evictionPolicy = evict_last : !tt.ptr loc(#loc472) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc473) + %next_block_217 = arith.cmpi slt, %next_block, %kv_num_blocks_110 : i32 loc(#loc474) + %next_block_218 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc475) + %kv_offset_219 = arith.andi %kv_offset_185, %next_block_217 : i1 loc(#loc494) + %next_block_220 = tt.load %next_block_218, %kv_offset_219 evictionPolicy = evict_last : !tt.ptr loc(#loc476) + %needs_jump = arith.addi %kv_offset_174, %c2_i32 : i32 loc(#loc477) + %needs_jump_221 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc478) + %needs_jump_222 = arith.cmpi eq, %needs_jump_221, %c0_i32 : i32 loc(#loc479) + %jump_to_block = arith.subi %next_block_220, %cur_block_216 : i32 loc(#loc480) + %jump_to_block_223 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc481) + %jump_to_block_224 = arith.subi %jump_to_block_223, %c64_i32 : i32 loc(#loc482) + %offset = arith.extui %needs_jump_222 : i1 to i32 loc(#loc483) + %offset_225 = arith.muli %jump_to_block_224, %offset : i32 loc(#loc483) + %offset_226 = arith.subi %c1_i32, %offset : i32 loc(#loc484) + %offset_227 = arith.muli %offset_226, %c64_i32 : i32 loc(#loc485) + %offset_228 = arith.addi %offset_225, %offset_227 : i32 loc(#loc486) + %kv_offset_229 = arith.addi %arg17, %offset_228 : i32 loc(#loc423) + %kv_offset_230 = arith.addi %arg19, %c1_i32 : i32 loc(#loc494) + %kv_offset_231 = arith.cmpi sge, %kv_offset_230, %c3_i32 : i32 loc(#loc494) + %kv_offset_232 = arith.select %kv_offset_231, %c0_i32, %kv_offset_230 : i32 loc(#loc494) + %kv_base_offset_233 = arith.addi %kv_start_108, %kv_offset_229 : i32 loc(#loc398) + %offs_n_load_234 = tt.splat %kv_base_offset_233 : i32 -> tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %offs_n_load_235 = arith.addi %offs_n_load_234, %offs_n_54 : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc397) + %ptr_236 = tt.expand_dims %offs_n_load_235 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc459) + %ptr_237 = arith.muli %ptr_236, %cst_6 : tensor<64x1xi32, #blocked> loc(#loc460) + %ptr_238 = tt.addptr %ptr_57, %ptr_237 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc461) + %ptr_239 = tt.broadcast %ptr_238 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc462) + %ptr_240 = tt.addptr %ptr_239, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc462) + %k_241 = arith.cmpi slt, %ptr_236, %k : tensor<64x1xi32, #blocked> loc(#loc463) + %k_242 = tt.broadcast %k_241 : tensor<64x1xi1, #blocked> -> tensor<64x128xi1, #blocked> loc(#loc457) + %k_243 = ttg.memdesc_index %k_115[%kv_offset_232] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %kv_offset_244 = tt.splat %kv_offset_183 : i1 -> tensor<64x128xi1, #blocked> loc(#loc494) + %kv_offset_245 = arith.andi %kv_offset_244, %k_242 : tensor<64x128xi1, #blocked> loc(#loc494) + %k_246 = ttg.async_copy_global_to_local %ptr_240, %k_243 mask %kv_offset_245 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc457) + %k_247 = ttg.async_commit_group tokens %k_246 loc(#loc457) + %ptr_248 = tt.addptr %ptr_61, %ptr_237 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> loc(#loc464) + %ptr_249 = tt.broadcast %ptr_248 : tensor<64x1x!tt.ptr, #blocked> -> tensor<64x128x!tt.ptr, #blocked> loc(#loc465) + %ptr_250 = tt.addptr %ptr_249, %ptr_58 : tensor<64x128x!tt.ptr, #blocked>, tensor<64x128xi32, #blocked> loc(#loc465) + %v_251 = ttg.memdesc_index %v_116[%kv_offset_232] : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> -> !ttg.memdesc<64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_252 = ttg.async_copy_global_to_local %ptr_250, %v_251 mask %kv_offset_245 other %cst_11 : tensor<64x128x!tt.ptr, #blocked> -> <64x128xbf16, #shared, #smem, mutable, 3x64x128> loc(#loc458) + %v_253 = ttg.async_commit_group tokens %v_252 loc(#loc458) + scf.yield %acc_212, %l_i_204, %m_ij_197, %kv_offset_229, %offs_n_214, %kv_offset_232, %kv_offset_188, %k_179, %k_247, %v_181, %v_253, %offset_228 : tensor<128x128xf32, #mma1>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, i32, tensor<1x64xi32, #mma>, i32, i32, !ttg.async.token, !ttg.async.token, !ttg.async.token, !ttg.async.token, i32 loc(#loc494) + } loc(#loc494) + %kv_offset_161 = ttng.warp_group_dot_wait %kv_offset_160#0 {pendings = 0 : i32} : tensor<128x128xf32, #mma1> loc(#loc494) + %kv_offset_162 = ttg.async_wait {num = 0 : i32} loc(#loc494) + ttg.local_dealloc %v_116 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc494) + ttg.local_dealloc %k_115 : !ttg.memdesc<3x64x128xbf16, #shared, #smem, mutable> loc(#loc494) + %l_i = arith.cmpf oeq, %kv_offset_160#1, %cst_13 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc310) + %l_i_163 = arith.select %l_i, %cst_14, %kv_offset_160#1 : tensor<128xi1, #ttg.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc311) + %acc = tt.expand_dims %l_i_163 {axis = 1 : i32} : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128x1xf32, #mma> loc(#loc312) + %acc_164 = ttg.convert_layout %acc : tensor<128x1xf32, #mma> -> tensor<128x1xf32, #mma1> loc(#loc313) + %acc_165 = tt.broadcast %acc_164 : tensor<128x1xf32, #mma1> -> tensor<128x128xf32, #mma1> loc(#loc313) + %acc_166 = arith.divf %kv_offset_161, %acc_165 : tensor<128x128xf32, #mma1> loc(#loc313) + %mask = arith.cmpi slt, %ptr_41, %cst_8 : tensor<1x128xi32, #blocked> loc(#loc314) + %mask_167 = tt.broadcast %mask : tensor<1x128xi1, #blocked> -> tensor<128x128xi1, #blocked> loc(#loc315) + %mask_168 = arith.andi %q_45, %mask_167 : tensor<128x128xi1, #blocked> loc(#loc315) + %4 = tt.splat %q_offset_20 : i32 -> tensor<1x128xi32, #blocked> loc(#loc152) + %5 = arith.addi %ptr_41, %4 : tensor<1x128xi32, #blocked> loc(#loc152) + %6 = tt.broadcast %5 : tensor<1x128xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc153) + %7 = tt.broadcast %ptr_37 : tensor<128x1xi32, #blocked> -> tensor<128x128xi32, #blocked> loc(#loc153) + %8 = arith.addi %6, %7 : tensor<128x128xi32, #blocked> loc(#loc153) + %9 = tt.splat %q_offset : i32 -> tensor<128x128xi32, #blocked> loc(#loc154) + %10 = arith.addi %8, %9 : tensor<128x128xi32, #blocked> loc(#loc154) + %11 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr, #blocked> loc(#loc155) + %12 = tt.addptr %11, %10 : tensor<128x128x!tt.ptr, #blocked>, tensor<128x128xi32, #blocked> loc(#loc155) + %13 = arith.truncf %acc_166 : tensor<128x128xf32, #mma1> to tensor<128x128xbf16, #mma1> loc(#loc156) + %14 = ttg.convert_layout %13 : tensor<128x128xbf16, #mma1> -> tensor<128x128xbf16, #blocked> loc(#loc156) + tt.store %12, %14, %mask_168 : tensor<128x128x!tt.ptr, #blocked> loc(#loc156) + %off_hz = arith.muli %off_zq, %c32_i32 : i32 loc(#loc316) + %off_hz_169 = arith.addi %off_hz, %off_hq : i32 loc(#loc317) + %l_ptrs = arith.muli %off_hz_169, %c2048_i32 : i32 loc(#loc318) + %l_ptrs_170 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc319) + %l_ptrs_171 = tt.splat %l_ptrs_170 : !tt.ptr -> tensor<128x!tt.ptr, #blocked1> loc(#loc320) + %l_ptrs_172 = tt.addptr %l_ptrs_171, %offs_m_35 : tensor<128x!tt.ptr, #blocked1>, tensor<128xi32, #blocked1> loc(#loc320) + %lse = math.log2 %l_i_163 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc321) + %lse_173 = arith.addf %kv_offset_160#2, %lse : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> loc(#loc322) + %15 = arith.cmpi slt, %offs_m_35, %cst_12 : tensor<128xi32, #blocked1> loc(#loc164) + %16 = ttg.convert_layout %lse_173 : tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>> -> tensor<128xf32, #blocked1> loc(#loc165) + tt.store %l_ptrs_172, %16, %15 : tensor<128x!tt.ptr, #blocked1> loc(#loc165) + tt.return loc(#loc166) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:54) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:63) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":97:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":98:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":99:27) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":103:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":104:24) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:24) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:45) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:36) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:25) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:47) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":111:12) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":112:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":113:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":130:25) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:51) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:74) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:46) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:97) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:64) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:46) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:33) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:27) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":146:101) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:38) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:56) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:49) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:52) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":151:26) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:23) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:37) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:42) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:28) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:45) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:92) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:102) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:65) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:37) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:24) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":167:48) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":347:107) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":257:21) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":358:36) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":359:36) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":373:23) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:33) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":502:40) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:23) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":377:22) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":378:23) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":379:23) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":380:23) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":439:107) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":346:35) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":342:32) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":351:19) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":349:17) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":353:14) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:44) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:69) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":374:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":381:23) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":384:24) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":385:24) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":387:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":388:92) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":391:24) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":392:24) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":393:39) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":394:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":395:24) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":396:24) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":397:23) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":398:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":399:25) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":400:92) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":402:24) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":403:24) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":404:39) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":405:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":406:24) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":407:24) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":412:73) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":414:69) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":417:27) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:27) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":423:35) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":424:51) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:31) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:51) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:39) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:21) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:16) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:24) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:22) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:16) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:22) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:44) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":548:26) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":247:33) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":545:63) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:38) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:24) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:109) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:113) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:55) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:25) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:30) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:35) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:60) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:34) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:48) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:63) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:29) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:47) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:61) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:42) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":549:21) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":181:35) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:27) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:41) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:51) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:32) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:49) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:69) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":186:28) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":193:52) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:26) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:34) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:20) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:16) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:38) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:30) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:49) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:62) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:75) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:25) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:109) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:26) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:31) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:32) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:23) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:40) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:33) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:20) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:48) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:29) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":229:4) +#loc180 = loc("q_start"(#loc4)) +#loc181 = loc("off_zq"(#loc5)) +#loc182 = loc("off_hq"(#loc6)) +#loc183 = loc("off_zkv"(#loc7)) +#loc184 = loc("off_hkv"(#loc8)) +#loc185 = loc("q_offset"(#loc9)) +#loc186 = loc("q_offset"(#loc10)) +#loc187 = loc("q_offset"(#loc11)) +#loc188 = loc("k_offset"(#loc12)) +#loc189 = loc("k_offset"(#loc13)) +#loc190 = loc("k_offset"(#loc14)) +#loc191 = loc("Q"(#loc15)) +#loc192 = loc("K"(#loc16)) +#loc193 = loc("V"(#loc17)) +#loc194 = loc("stride_kv_idx_h"(#loc18)) +#loc195 = loc("sparse_kv_num_blks_offset"(#loc19)) +#loc196 = loc("sparse_kv_num_blks_offset"(#loc20)) +#loc197 = loc("sparse_kv_idx_offset"(#loc21)) +#loc198 = loc("sparse_kv_idx_offset"(#loc22)) +#loc199 = loc("sparse_kv_idx_offset"(#loc23)) +#loc200 = loc("offs_m"(#loc24)) +#loc201 = loc("offs_m"(#loc25)) +#loc202 = loc("offs_m"(#loc26)) +#loc203 = loc("ptr"(#loc27)) +#loc204 = loc("q"(#loc28)) +#loc205 = loc("ptr"(#loc29)) +#loc206 = loc("ptr"(#loc30)) +#loc207 = loc("ptr"(#loc31)) +#loc208 = loc("ptr"(#loc32)) +#loc209 = loc("kv_indices"(#loc35)) +#loc210 = loc("kv_start"(#loc36)) +#loc211 = loc("kv_start"(#loc37)) +#loc212 = loc("kv_num_blocks"(#loc38)) +#loc213 = loc("kv_num_blocks"(#loc39)) +#loc214 = loc("block_n_end"(#loc40)) +#loc215 = loc("block_n_end"(#loc42)) +#loc216 = loc("block_n_end"(#loc44)) +#loc217 = loc("block_n_end"(#loc45)) +#loc218 = loc("offs_n"(#loc46)) +#loc219 = loc("offs_n"(#loc47)) +#loc220 = loc("k"(#loc49)) +#loc222 = loc("m"(#loc53)) +#loc223 = loc("n"(#loc54)) +#loc224 = loc("tmp4"(#loc55)) +#loc225 = loc("tmp7"(#loc56)) +#loc226 = loc("acc"(#loc57)) +#loc227 = loc("tmp7"(#loc58)) +#loc228 = loc("tmp8"(#loc59)) +#loc229 = loc("tmp9"(#loc60)) +#loc230 = loc("tmp10"(#loc61)) +#loc231 = loc("tmp11"(#loc62)) +#loc232 = loc("v"(#loc63)) +#loc233 = loc("offs_n_load"(#loc64)) +#loc234 = loc("kv_base_offset"(#loc65)) +#loc235 = loc("qk"(#loc66)) +#loc236 = loc("k"(#loc67)) +#loc237 = loc("qk"(#loc68)) +#loc238 = loc("post_mod_scores"(#loc69)) +#loc239 = loc("post_mod_scores"(#loc70)) +#loc240 = loc("tmp5"(#loc71)) +#loc241 = loc("tmp12"(#loc72)) +#loc242 = loc("tmp15"(#loc73)) +#loc243 = loc("tmp16"(#loc74)) +#loc244 = loc("tmp18"(#loc75)) +#loc245 = loc("tmp19"(#loc76)) +#loc246 = loc("tmp22"(#loc77)) +#loc247 = loc("tmp23"(#loc78)) +#loc248 = loc("tmp24"(#loc79)) +#loc249 = loc("tmp25"(#loc80)) +#loc250 = loc("tmp26"(#loc81)) +#loc251 = loc("tmp27"(#loc82)) +#loc252 = loc("tmp28"(#loc83)) +#loc253 = loc("tmp29"(#loc84)) +#loc254 = loc("tmp30"(#loc85)) +#loc255 = loc("tmp31"(#loc86)) +#loc256 = loc("tmp33"(#loc87)) +#loc257 = loc("tmp34"(#loc88)) +#loc258 = loc("tmp35"(#loc89)) +#loc259 = loc("tmp36"(#loc90)) +#loc260 = loc("tmp37"(#loc91)) +#loc261 = loc("tmp38"(#loc92)) +#loc262 = loc("mask_mod_output"(#loc93)) +#loc263 = loc("post_mod_scores"(#loc94)) +#loc264 = loc("post_mod_scores"(#loc95)) +#loc266 = loc("m_ij"(#loc99)) +#loc267 = loc("masked_out_rows"(#loc100)) +#loc268 = loc("m_ij_masked"(#loc101)) +#loc269 = loc("alpha"(#loc102)) +#loc270 = loc("alpha"(#loc103)) +#loc271 = loc("p"(#loc104)) +#loc272 = loc("p"(#loc105)) +#loc273 = loc("p"(#loc106)) +#loc274 = loc("l_i"(#loc107)) +#loc276 = loc("l_i"(#loc111)) +#loc277 = loc("acc"(#loc112)) +#loc278 = loc("acc"(#loc113)) +#loc279 = loc("acc"(#loc114)) +#loc280 = loc("acc"(#loc115)) +#loc281 = loc("offs_n"(#loc116)) +#loc282 = loc("cur_block_idx"(#loc117)) +#loc283 = loc("offset"(#loc118)) +#loc284 = loc("cur_block"(#loc119)) +#loc285 = loc("cur_block"(#loc120)) +#loc286 = loc("next_block"(#loc121)) +#loc287 = loc("next_block"(#loc122)) +#loc288 = loc("next_block"(#loc123)) +#loc289 = loc("next_block"(#loc124)) +#loc290 = loc("needs_jump"(#loc125)) +#loc291 = loc("needs_jump"(#loc126)) +#loc292 = loc("needs_jump"(#loc127)) +#loc293 = loc("jump_to_block"(#loc128)) +#loc294 = loc("jump_to_block"(#loc129)) +#loc295 = loc("jump_to_block"(#loc130)) +#loc296 = loc("offset"(#loc131)) +#loc297 = loc("offset"(#loc132)) +#loc298 = loc("offset"(#loc133)) +#loc299 = loc("offset"(#loc134)) +#loc300 = loc("kv_offset"(#loc135)) +#loc301 = loc("kv_indices"(#loc136)) +#loc302 = loc("kv_start"(#loc137)) +#loc303 = loc("kv_start"(#loc138)) +#loc304 = loc("kv_num_blocks"(#loc139)) +#loc305 = loc("kv_num_blocks"(#loc140)) +#loc306 = loc("block_n_end"(#loc141)) +#loc307 = loc("block_n_end"(#loc142)) +#loc308 = loc("offs_n"(#loc143)) +#loc310 = loc("l_i"(#loc146)) +#loc311 = loc("l_i"(#loc147)) +#loc312 = loc("acc"(#loc148)) +#loc313 = loc("acc"(#loc149)) +#loc314 = loc("mask"(#loc150)) +#loc315 = loc("mask"(#loc151)) +#loc316 = loc("off_hz"(#loc157)) +#loc317 = loc("off_hz"(#loc158)) +#loc318 = loc("l_ptrs"(#loc159)) +#loc319 = loc("l_ptrs"(#loc160)) +#loc320 = loc("l_ptrs"(#loc161)) +#loc321 = loc("lse"(#loc162)) +#loc322 = loc("lse"(#loc163)) +#loc323 = loc(callsite(#loc203 at #loc204)) +#loc324 = loc(callsite(#loc205 at #loc204)) +#loc325 = loc(callsite(#loc206 at #loc204)) +#loc326 = loc(callsite(#loc207 at #loc204)) +#loc327 = loc(callsite(#loc208 at #loc204)) +#loc328 = loc(callsite(#loc33 at #loc204)) +#loc329 = loc(callsite(#loc34 at #loc204)) +#loc330 = loc(callsite(#loc41 at #loc215)) +#loc331 = loc(callsite(#loc43 at #loc215)) +#loc332 = loc(callsite(#loc220 at #loc221)) +#loc333 = loc(callsite(#loc222 at #loc221)) +#loc334 = loc(callsite(#loc223 at #loc221)) +#loc335 = loc(callsite(#loc224 at #loc221)) +#loc336 = loc(callsite(#loc225 at #loc221)) +#loc337 = loc("l_i"(#loc226)) +#loc338 = loc(callsite(#loc227 at #loc221)) +#loc339 = loc(callsite(#loc228 at #loc221)) +#loc340 = loc(callsite(#loc229 at #loc221)) +#loc341 = loc(callsite(#loc230 at #loc221)) +#loc342 = loc(callsite(#loc231 at #loc221)) +#loc343 = loc(callsite(#loc232 at #loc221)) +#loc344 = loc(callsite(#loc233 at #loc221)) +#loc345 = loc(callsite(#loc234 at #loc221)) +#loc346 = loc(callsite(#loc235 at #loc221)) +#loc347 = loc(callsite(#loc236 at #loc221)) +#loc348 = loc(callsite(#loc237 at #loc221)) +#loc349 = loc(callsite(#loc238 at #loc221)) +#loc350 = loc(callsite(#loc239 at #loc221)) +#loc351 = loc(callsite(#loc240 at #loc221)) +#loc352 = loc(callsite(#loc241 at #loc221)) +#loc353 = loc(callsite(#loc242 at #loc221)) +#loc354 = loc(callsite(#loc243 at #loc221)) +#loc355 = loc(callsite(#loc244 at #loc221)) +#loc356 = loc(callsite(#loc245 at #loc221)) +#loc357 = loc(callsite(#loc246 at #loc221)) +#loc358 = loc(callsite(#loc247 at #loc221)) +#loc359 = loc(callsite(#loc248 at #loc221)) +#loc360 = loc(callsite(#loc249 at #loc221)) +#loc361 = loc(callsite(#loc250 at #loc221)) +#loc362 = loc(callsite(#loc251 at #loc221)) +#loc363 = loc(callsite(#loc252 at #loc221)) +#loc364 = loc(callsite(#loc253 at #loc221)) +#loc365 = loc(callsite(#loc254 at #loc221)) +#loc366 = loc(callsite(#loc255 at #loc221)) +#loc367 = loc(callsite(#loc256 at #loc221)) +#loc368 = loc(callsite(#loc257 at #loc221)) +#loc369 = loc(callsite(#loc258 at #loc221)) +#loc370 = loc(callsite(#loc259 at #loc221)) +#loc371 = loc(callsite(#loc260 at #loc221)) +#loc372 = loc(callsite(#loc261 at #loc221)) +#loc373 = loc(callsite(#loc262 at #loc221)) +#loc374 = loc(callsite(#loc263 at #loc221)) +#loc375 = loc(callsite(#loc264 at #loc221)) +#loc377 = loc(callsite(#loc266 at #loc221)) +#loc378 = loc(callsite(#loc267 at #loc221)) +#loc379 = loc(callsite(#loc268 at #loc221)) +#loc380 = loc(callsite(#loc269 at #loc221)) +#loc381 = loc(callsite(#loc270 at #loc221)) +#loc382 = loc(callsite(#loc271 at #loc221)) +#loc383 = loc(callsite(#loc272 at #loc221)) +#loc384 = loc(callsite(#loc273 at #loc221)) +#loc385 = loc(callsite(#loc274 at #loc221)) +#loc387 = loc(callsite(#loc276 at #loc221)) +#loc388 = loc(callsite(#loc277 at #loc221)) +#loc389 = loc(callsite(#loc278 at #loc221)) +#loc390 = loc(callsite(#loc279 at #loc221)) +#loc391 = loc(callsite(#loc280 at #loc221)) +#loc392 = loc(callsite(#loc281 at #loc51)) +#loc393 = loc(callsite(#loc283 at #loc51)) +#loc394 = loc(callsite(#loc300 at #loc51)) +#loc395 = loc(callsite(#loc220 at #loc309)) +#loc396 = loc(callsite(#loc232 at #loc309)) +#loc397 = loc(callsite(#loc233 at #loc309)) +#loc398 = loc(callsite(#loc234 at #loc309)) +#loc399 = loc(callsite(#loc235 at #loc309)) +#loc400 = loc(callsite(#loc236 at #loc309)) +#loc401 = loc(callsite(#loc237 at #loc309)) +#loc402 = loc(callsite(#loc238 at #loc309)) +#loc403 = loc(callsite(#loc239 at #loc309)) +#loc404 = loc(callsite(#loc264 at #loc309)) +#loc406 = loc(callsite(#loc266 at #loc309)) +#loc407 = loc(callsite(#loc267 at #loc309)) +#loc408 = loc(callsite(#loc268 at #loc309)) +#loc409 = loc(callsite(#loc269 at #loc309)) +#loc410 = loc(callsite(#loc270 at #loc309)) +#loc411 = loc(callsite(#loc271 at #loc309)) +#loc412 = loc(callsite(#loc272 at #loc309)) +#loc413 = loc(callsite(#loc273 at #loc309)) +#loc414 = loc(callsite(#loc274 at #loc309)) +#loc416 = loc(callsite(#loc276 at #loc309)) +#loc417 = loc(callsite(#loc277 at #loc309)) +#loc418 = loc(callsite(#loc278 at #loc309)) +#loc419 = loc(callsite(#loc279 at #loc309)) +#loc420 = loc(callsite(#loc280 at #loc309)) +#loc421 = loc(callsite(#loc281 at #loc145)) +#loc422 = loc(callsite(#loc283 at #loc145)) +#loc423 = loc(callsite(#loc300 at #loc145)) +#loc424 = loc(callsite(#loc206 at #loc332)) +#loc425 = loc(callsite(#loc208 at #loc332)) +#loc426 = loc(callsite(#loc33 at #loc332)) +#loc427 = loc(callsite(#loc52 at #loc333)) +#loc428 = loc(callsite(#loc52 at #loc334)) +#loc429 = loc("m_i"(#loc337)) +#loc430 = loc(callsite(#loc206 at #loc343)) +#loc431 = loc(callsite(#loc34 at #loc332)) +#loc432 = loc(callsite(#loc34 at #loc343)) +#loc433 = loc(callsite(#loc203 at #loc332)) +#loc434 = loc(callsite(#loc205 at #loc332)) +#loc435 = loc(callsite(#loc208 at #loc343)) +#loc436 = loc(callsite(#loc96 at #loc376)) +#loc438 = loc(callsite(#loc108 at #loc386)) +#loc440 = loc(callsite(#loc282 at #loc393)) +#loc441 = loc(callsite(#loc284 at #loc393)) +#loc442 = loc(callsite(#loc285 at #loc393)) +#loc443 = loc(callsite(#loc286 at #loc393)) +#loc444 = loc(callsite(#loc287 at #loc393)) +#loc445 = loc(callsite(#loc288 at #loc393)) +#loc446 = loc(callsite(#loc289 at #loc393)) +#loc447 = loc(callsite(#loc290 at #loc393)) +#loc448 = loc(callsite(#loc291 at #loc393)) +#loc449 = loc(callsite(#loc292 at #loc393)) +#loc450 = loc(callsite(#loc293 at #loc393)) +#loc451 = loc(callsite(#loc294 at #loc393)) +#loc452 = loc(callsite(#loc295 at #loc393)) +#loc453 = loc(callsite(#loc296 at #loc393)) +#loc454 = loc(callsite(#loc297 at #loc393)) +#loc455 = loc(callsite(#loc298 at #loc393)) +#loc456 = loc(callsite(#loc299 at #loc393)) +#loc457 = loc(callsite(#loc34 at #loc395)) +#loc458 = loc(callsite(#loc34 at #loc396)) +#loc459 = loc(callsite(#loc203 at #loc395)) +#loc460 = loc(callsite(#loc205 at #loc395)) +#loc461 = loc(callsite(#loc206 at #loc395)) +#loc462 = loc(callsite(#loc208 at #loc395)) +#loc463 = loc(callsite(#loc33 at #loc395)) +#loc464 = loc(callsite(#loc206 at #loc396)) +#loc465 = loc(callsite(#loc208 at #loc396)) +#loc466 = loc(callsite(#loc96 at #loc405)) +#loc468 = loc(callsite(#loc108 at #loc415)) +#loc470 = loc(callsite(#loc282 at #loc422)) +#loc471 = loc(callsite(#loc284 at #loc422)) +#loc472 = loc(callsite(#loc285 at #loc422)) +#loc473 = loc(callsite(#loc286 at #loc422)) +#loc474 = loc(callsite(#loc287 at #loc422)) +#loc475 = loc(callsite(#loc288 at #loc422)) +#loc476 = loc(callsite(#loc289 at #loc422)) +#loc477 = loc(callsite(#loc290 at #loc422)) +#loc478 = loc(callsite(#loc291 at #loc422)) +#loc479 = loc(callsite(#loc292 at #loc422)) +#loc480 = loc(callsite(#loc293 at #loc422)) +#loc481 = loc(callsite(#loc294 at #loc422)) +#loc482 = loc(callsite(#loc295 at #loc422)) +#loc483 = loc(callsite(#loc296 at #loc422)) +#loc484 = loc(callsite(#loc297 at #loc422)) +#loc485 = loc(callsite(#loc298 at #loc422)) +#loc486 = loc(callsite(#loc299 at #loc422)) +#loc487 = loc("offs_n"(#loc429)) +#loc488 = loc(callsite(#loc98 at #loc436)) +#loc489 = loc(callsite(#loc110 at #loc438)) +#loc490 = loc(callsite(#loc98 at #loc466)) +#loc491 = loc(callsite(#loc110 at #loc468)) +#loc492 = loc("kv_offset"(#loc487)) +#loc493 = loc(callsite(#loc492 at #loc51)) +#loc494 = loc(callsite(#loc492 at #loc145)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0b4ff220e8c63637c4fbe8959ef22aca2b54a5bc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/6/ZSSVFRPGEEMNTDAFN6M7NJAJLCG3XLM6U5JN4JOYQFZQNYAO5FFA/triton_tem_fused_0.ttir @@ -0,0 +1,829 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":18:0) +#loc1 = loc(unknown) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":172:41) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":538:16) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:51) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:34) +#loc153 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":198:45) +#loc172 = loc("arg_Q"(#loc)) +#loc173 = loc("arg_K"(#loc)) +#loc174 = loc("arg_V"(#loc)) +#loc175 = loc("arg_LSE"(#loc)) +#loc176 = loc("arg_MAX"(#loc)) +#loc177 = loc("arg_KV_NUM_BLKS"(#loc)) +#loc178 = loc("arg_KV_IDX"(#loc)) +#loc179 = loc("arg_FULL_KV_NUM_BLKS"(#loc)) +#loc180 = loc("arg_FULL_KV_IDX"(#loc)) +#loc181 = loc("in_ptr9"(#loc)) +#loc182 = loc("out_ptr0"(#loc)) +#loc183 = loc("ks0"(#loc)) +#loc184 = loc("ks1"(#loc)) +#loc233 = loc(callsite(#loc59 at #loc2)) +#loc275 = loc("m_ij"(#loc103)) +#loc285 = loc("l_i"(#loc115)) +#loc321 = loc(callsite(#loc59 at #loc153)) +#loc387 = loc(callsite(#loc275 at #loc233)) +#loc397 = loc(callsite(#loc285 at #loc233)) +#loc416 = loc(callsite(#loc275 at #loc321)) +#loc426 = loc(callsite(#loc285 at #loc321)) +#loc446 = loc(callsite(#loc1 at #loc387)) +#loc448 = loc(callsite(#loc1 at #loc397)) +#loc476 = loc(callsite(#loc1 at #loc416)) +#loc478 = loc(callsite(#loc1 at #loc426)) +module { + tt.func public @triton_tem_fused_0(%arg_Q: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_Q"(#loc)), %arg_K: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_K"(#loc)), %arg_V: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_V"(#loc)), %arg_LSE: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_LSE"(#loc)), %arg_MAX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_MAX"(#loc)), %arg_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_NUM_BLKS"(#loc)), %arg_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_KV_IDX"(#loc)), %arg_FULL_KV_NUM_BLKS: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_NUM_BLKS"(#loc)), %arg_FULL_KV_IDX: !tt.ptr {tt.divisibility = 16 : i32} loc("arg_FULL_KV_IDX"(#loc)), %in_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr9"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i32 loc("ks0"(#loc)), %ks1: i32 loc("ks1"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<64x1xi32> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xbf16> loc(#loc1) + %cst_1 = arith.constant dense<2048> : tensor<128x64xi32> loc(#loc185) + %cst_2 = arith.constant dense<2048> : tensor<1x64xi32> loc(#loc185) + %cst_3 = arith.constant dense<0xFF800000> : tensor<128xf32> loc(#loc1) + %cst_4 = arith.constant dense<1.44269502> : tensor<128x64xf32> loc(#loc1) + %cst_5 = arith.constant dense : tensor<128x64xi1> loc(#loc185) + %cst_6 = arith.constant dense<0> : tensor<128x64xi32> loc(#loc185) + %cst_7 = arith.constant dense<0> : tensor<1x64xi32> loc(#loc185) + %cst_8 = arith.constant dense<0xFF800000> : tensor<128x64xf32> loc(#loc1) + %cst_9 = arith.constant dense<0.0883883461> : tensor<128x64xf32> loc(#loc1) + %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x64xf32> loc(#loc1) + %c63_i32 = arith.constant 63 : i32 loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %q = arith.constant dense<0.000000e+00> : tensor<128x128xbf16> loc(#loc334) + %acc = arith.constant dense<0.000000e+00> : tensor<128x128xf32> loc(#loc335) + %cst_11 = arith.constant dense<2048> : tensor<128xi32> loc(#loc7) + %cst_12 = arith.constant dense<4096> : tensor<128x1xi32> loc(#loc1) + %mask = arith.constant dense<128> : tensor<1x128xi32> loc(#loc188) + %cst_13 = arith.constant dense<2048> : tensor<128x1xi32> loc(#loc1) + %l_i = arith.constant dense<1.000000e+00> : tensor<128xf32> loc(#loc189) + %cst_14 = arith.constant dense<0.000000e+00> : tensor<128xf32> loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %Q_LEN = arith.constant 2048 : i32 loc(#loc190) + %HQ = arith.constant 32 : i32 loc(#loc191) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %c8388608_i32 = arith.constant 8388608 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c1_i32 = arith.constant 1 : i32 loc(#loc1) + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %0 = arith.muli %ks0, %c1024_i32 : i32 loc(#loc12) + %1 = arith.muli %ks0, %c128_i32 : i32 loc(#loc13) + %q_start = tt.get_program_id x : i32 loc(#loc192) + %off_zq = tt.get_program_id y : i32 loc(#loc193) + %off_hq = tt.get_program_id z : i32 loc(#loc194) + %off_zkv = arith.remsi %off_zq, %c8_i32 : i32 loc(#loc195) + %off_hkv = arith.divsi %off_hq, %c4_i32 : i32 loc(#loc196) + %q_offset = arith.muli %off_zq, %c8388608_i32 : i32 loc(#loc197) + %q_offset_15 = arith.muli %off_hq, %c128_i32 : i32 loc(#loc198) + %q_offset_16 = arith.addi %q_offset, %q_offset_15 : i32 loc(#loc199) + %k_offset = arith.muli %off_zkv, %0 : i32 loc(#loc200) + %k_offset_17 = arith.muli %off_hkv, %1 : i32 loc(#loc201) + %k_offset_18 = arith.addi %k_offset, %k_offset_17 : i32 loc(#loc202) + %Q = tt.addptr %arg_Q, %q_offset_16 : !tt.ptr, i32 loc(#loc203) + %K = tt.addptr %arg_K, %k_offset_18 : !tt.ptr, i32 loc(#loc204) + %V = tt.addptr %arg_V, %k_offset_18 : !tt.ptr, i32 loc(#loc205) + %stride_kv_idx_h = arith.muli %ks1, %c16_i32 : i32 loc(#loc206) + %sparse_kv_num_blks_offset = arith.muli %off_zkv, %c16_i32 : i32 loc(#loc207) + %sparse_kv_num_blks_offset_19 = arith.addi %sparse_kv_num_blks_offset, %q_start : i32 loc(#loc208) + %sparse_kv_idx_offset = arith.muli %off_zkv, %stride_kv_idx_h : i32 loc(#loc209) + %sparse_kv_idx_offset_20 = arith.muli %q_start, %ks1 : i32 loc(#loc210) + %sparse_kv_idx_offset_21 = arith.addi %sparse_kv_idx_offset, %sparse_kv_idx_offset_20 : i32 loc(#loc211) + %offs_m = arith.muli %q_start, %c128_i32 : i32 loc(#loc212) + %offs_m_22 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc213) + %offs_m_23 = tt.splat %offs_m : i32 -> tensor<128xi32> loc(#loc214) + %offs_m_24 = arith.addi %offs_m_23, %offs_m_22 : tensor<128xi32> loc(#loc214) + %ptr = tt.expand_dims %offs_m_24 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32> loc(#loc336) + %ptr_25 = arith.muli %ptr, %cst_12 : tensor<128x1xi32> loc(#loc337) + %ptr_26 = tt.splat %Q : !tt.ptr -> tensor<128x1x!tt.ptr> loc(#loc338) + %ptr_27 = tt.addptr %ptr_26, %ptr_25 : tensor<128x1x!tt.ptr>, tensor<128x1xi32> loc(#loc338) + %ptr_28 = tt.expand_dims %offs_m_22 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc339) + %ptr_29 = tt.broadcast %ptr_27 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr> loc(#loc340) + %ptr_30 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc340) + %ptr_31 = tt.addptr %ptr_29, %ptr_30 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc340) + %q_32 = arith.cmpi slt, %ptr, %cst_13 : tensor<128x1xi32> loc(#loc341) + %q_33 = tt.broadcast %q_32 : tensor<128x1xi1> -> tensor<128x128xi1> loc(#loc334) + %q_34 = tt.load %ptr_31, %q_33, %q : tensor<128x128x!tt.ptr> loc(#loc334) + %kv_indices = tt.addptr %arg_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc220) + %kv_start = tt.load %kv_indices : !tt.ptr loc(#loc221) + %kv_start_35 = arith.muli %kv_start, %c128_i32 : i32 loc(#loc222) + %kv_num_blocks = tt.addptr %arg_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc223) + %kv_num_blocks_36 = tt.load %kv_num_blocks : !tt.ptr loc(#loc224) + %block_n_end = arith.muli %kv_num_blocks_36, %c2_i32 : i32 loc(#loc225) + %block_n_end_37 = arith.addi %ks0, %c63_i32 : i32 loc(#loc342) + %block_n_end_38 = arith.divsi %block_n_end_37, %c64_i32 : i32 loc(#loc343) + %block_n_end_39 = arith.maxsi %block_n_end_38, %c1_i32 : i32 loc(#loc227) + %block_n_end_40 = arith.minsi %block_n_end, %block_n_end_39 : i32 loc(#loc228) + %offs_n = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc229) + %offs_n_41 = tt.splat %kv_start_35 : i32 -> tensor<64xi32> loc(#loc230) + %offs_n_42 = arith.addi %offs_n_41, %offs_n : tensor<64xi32> loc(#loc230) + %2 = tt.expand_dims %offs_n_42 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc56) + %kv_offset:5 = scf.for %start_n = %c0_i32 to %block_n_end_40 step %c1_i32 iter_args(%acc_66 = %acc, %l_i_67 = %cst_14, %m_i = %cst_3, %offs_n_68 = %2, %kv_offset_69 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_35, %kv_offset_69 : i32 loc(#loc345) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc346) + %offs_n_load_70 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc346) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc437) + %ptr_72 = arith.muli %ptr_71, %cst : tensor<64x1xi32> loc(#loc438) + %ptr_73 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc439) + %ptr_74 = tt.addptr %ptr_73, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc439) + %ptr_75 = tt.broadcast %ptr_74 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc440) + %ptr_76 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc440) + %ptr_77 = tt.addptr %ptr_75, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc440) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc441) + %k_78 = arith.cmpi slt, %ptr_71, %k : tensor<64x1xi32> loc(#loc441) + %k_79 = tt.broadcast %k_78 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc442) + %k_80 = tt.load %ptr_77, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc442) + %k_81 = tt.trans %k_80 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc348) + %qk = tt.dot %q_34, %k_81, %cst_10, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc349) + %qk_82 = arith.mulf %qk, %cst_9 : tensor<128x64xf32> loc(#loc350) + %m = arith.remsi %ptr, %cst_13 : tensor<128x1xi32> loc(#loc443) + %n = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc444) + %n_83 = arith.remsi %offs_n_68, %n : tensor<1x64xi32> loc(#loc444) + %post_mod_scores = arith.cmpi slt, %offs_n_68, %n : tensor<1x64xi32> loc(#loc353) + %post_mod_scores_84 = tt.broadcast %post_mod_scores : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc354) + %post_mod_scores_85 = arith.select %post_mod_scores_84, %qk_82, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc354) + %tmp4 = tt.broadcast %m : tensor<128x1xi32> -> tensor<128x64xi32> loc(#loc355) + %tmp4_86 = tt.broadcast %n_83 : tensor<1x64xi32> -> tensor<128x64xi32> loc(#loc355) + %tmp4_87 = arith.cmpi sge, %tmp4, %tmp4_86 : tensor<128x64xi32> loc(#loc355) + %tmp5 = arith.extsi %n_83 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc356) + %tmp7 = tt.addptr %in_ptr9, %off_zq : !tt.ptr, i32 loc(#loc357) + %tmp7_88 = tt.load %tmp7 : !tt.ptr loc(#loc358) + %tmp8 = tt.splat %tmp7_88 : i64 -> tensor<1x64xi64> loc(#loc359) + %tmp8_89 = arith.cmpi slt, %tmp5, %tmp8 : tensor<1x64xi64> loc(#loc359) + %tmp9 = arith.extsi %m : tensor<128x1xi32> to tensor<128x1xi64> loc(#loc360) + %tmp10 = tt.splat %tmp7_88 : i64 -> tensor<128x1xi64> loc(#loc361) + %tmp10_90 = arith.cmpi slt, %tmp9, %tmp10 : tensor<128x1xi64> loc(#loc361) + %tmp11 = tt.broadcast %tmp8_89 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc362) + %tmp11_91 = tt.broadcast %tmp10_90 : tensor<128x1xi1> -> tensor<128x64xi1> loc(#loc362) + %tmp11_92 = arith.andi %tmp11, %tmp11_91 : tensor<128x64xi1> loc(#loc362) + %tmp12 = arith.andi %tmp4_87, %tmp11_92 : tensor<128x64xi1> loc(#loc363) + %tmp15 = arith.cmpi sge, %n_83, %cst_2 : tensor<1x64xi32> loc(#loc364) + %tmp16 = arith.remsi %n_83, %cst_2 : tensor<1x64xi32> loc(#loc365) + %tmp18 = arith.cmpi ne, %tmp16, %cst_7 : tensor<1x64xi32> loc(#loc366) + %tmp19 = arith.cmpi slt, %tmp16, %cst_7 : tensor<1x64xi32> loc(#loc367) + %tmp22 = arith.andi %tmp18, %tmp19 : tensor<1x64xi1> loc(#loc368) + %tmp23 = arith.addi %tmp16, %cst_2 : tensor<1x64xi32> loc(#loc369) + %tmp24 = arith.select %tmp22, %tmp23, %tmp16 : tensor<1x64xi1>, tensor<1x64xi32> loc(#loc370) + %tmp25 = arith.extsi %tmp24 : tensor<1x64xi32> to tensor<1x64xi64> loc(#loc371) + %tmp26 = arith.cmpi slt, %tmp25, %tmp8 : tensor<1x64xi64> loc(#loc372) + %tmp27 = arith.andi %tmp15, %tmp26 : tensor<1x64xi1> loc(#loc373) + %tmp28 = arith.subi %tmp4_86, %tmp4 : tensor<128x64xi32> loc(#loc374) + %tmp29 = arith.remsi %tmp28, %cst_1 : tensor<128x64xi32> loc(#loc375) + %tmp30 = arith.cmpi ne, %tmp29, %cst_6 : tensor<128x64xi32> loc(#loc376) + %tmp31 = arith.cmpi slt, %tmp29, %cst_6 : tensor<128x64xi32> loc(#loc377) + %tmp33 = arith.andi %tmp30, %tmp31 : tensor<128x64xi1> loc(#loc378) + %tmp34 = arith.addi %tmp29, %cst_1 : tensor<128x64xi32> loc(#loc379) + %tmp35 = arith.select %tmp33, %tmp34, %tmp29 : tensor<128x64xi1>, tensor<128x64xi32> loc(#loc380) + %tmp36 = arith.cmpi eq, %tmp35, %cst_6 : tensor<128x64xi32> loc(#loc381) + %tmp37 = tt.broadcast %tmp27 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc382) + %tmp37_93 = arith.andi %tmp37, %tmp36 : tensor<128x64xi1> loc(#loc382) + %tmp38 = arith.ori %tmp12, %tmp37_93 : tensor<128x64xi1> loc(#loc383) + %mask_mod_output = arith.select %post_mod_scores_84, %tmp38, %cst_5 : tensor<128x64xi1>, tensor<128x64xi1> loc(#loc384) + %post_mod_scores_94 = arith.select %mask_mod_output, %post_mod_scores_85, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc385) + %post_mod_scores_95 = arith.mulf %post_mod_scores_94, %cst_4 : tensor<128x64xf32> loc(#loc386) + %m_ij = "tt.reduce"(%post_mod_scores_95) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_128: f32 loc(callsite(#loc1 at #loc387)), %m_ij_129: f32 loc(callsite(#loc1 at #loc387))): + %m_ij_130 = arith.maxnumf %m_ij_128, %m_ij_129 : f32 loc(#loc500) + tt.reduce.return %m_ij_130 : f32 loc(#loc445) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc445) + %m_ij_96 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc388) + %masked_out_rows = arith.cmpf oeq, %m_ij_96, %cst_3 : tensor<128xf32> loc(#loc389) + %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_96 : tensor<128xi1>, tensor<128xf32> loc(#loc390) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc391) + %alpha_97 = math.exp2 %alpha : tensor<128xf32> loc(#loc392) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc393) + %p_98 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc394) + %p_99 = arith.subf %post_mod_scores_95, %p_98 : tensor<128x64xf32> loc(#loc394) + %p_100 = math.exp2 %p_99 : tensor<128x64xf32> loc(#loc395) + %l_i_101 = arith.mulf %l_i_67, %alpha_97 : tensor<128xf32> loc(#loc396) + %l_i_102 = "tt.reduce"(%p_100) <{axis = 1 : i32}> ({ + ^bb0(%l_i_128: f32 loc(callsite(#loc1 at #loc397)), %l_i_129: f32 loc(callsite(#loc1 at #loc397))): + %l_i_130 = arith.addf %l_i_128, %l_i_129 : f32 loc(#loc501) + tt.reduce.return %l_i_130 : f32 loc(#loc447) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc447) + %l_i_103 = arith.addf %l_i_101, %l_i_102 : tensor<128xf32> loc(#loc398) + %acc_104 = tt.expand_dims %alpha_97 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc399) + %acc_105 = tt.broadcast %acc_104 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc400) + %acc_106 = arith.mulf %acc_66, %acc_105 : tensor<128x128xf32> loc(#loc400) + %ptr_107 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc449) + %ptr_108 = tt.addptr %ptr_107, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc449) + %ptr_109 = tt.broadcast %ptr_108 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc450) + %ptr_110 = tt.addptr %ptr_109, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc450) + %v = tt.load %ptr_110, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc451) + %acc_111 = arith.truncf %p_100 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc402) + %acc_112 = tt.dot %acc_111, %v, %acc_106, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc403) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc452) + %cur_block = tt.addptr %kv_indices, %cur_block_idx : !tt.ptr, i32 loc(#loc453) + %cur_block_113 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc454) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc455) + %next_block_114 = arith.cmpi slt, %next_block, %kv_num_blocks_36 : i32 loc(#loc456) + %next_block_115 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc457) + %next_block_116 = tt.load %next_block_115, %next_block_114 evictionPolicy = evict_last : !tt.ptr loc(#loc458) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc459) + %needs_jump_117 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc460) + %needs_jump_118 = arith.cmpi eq, %needs_jump_117, %c0_i32 : i32 loc(#loc461) + %jump_to_block = arith.subi %next_block_116, %cur_block_113 : i32 loc(#loc462) + %jump_to_block_119 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc463) + %jump_to_block_120 = arith.subi %jump_to_block_119, %c64_i32 : i32 loc(#loc464) + %offset = arith.extui %needs_jump_118 : i1 to i32 loc(#loc465) + %offset_121 = arith.muli %jump_to_block_120, %offset : i32 loc(#loc465) + %offset_122 = arith.subi %c1_i32, %offset : i32 loc(#loc466) + %offset_123 = arith.muli %offset_122, %c64_i32 : i32 loc(#loc467) + %offset_124 = arith.addi %offset_121, %offset_123 : i32 loc(#loc468) + %offs_n_125 = tt.splat %offset_124 : i32 -> tensor<1x64xi32> loc(#loc405) + %offs_n_126 = arith.addi %offs_n_68, %offs_n_125 : tensor<1x64xi32> loc(#loc405) + %kv_offset_127 = arith.addi %kv_offset_69, %offset_124 : i32 loc(#loc406) + scf.yield %acc_112, %l_i_103, %m_ij_96, %offs_n_126, %kv_offset_127 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc312) + } loc(#loc505) + %kv_indices_43 = tt.addptr %arg_FULL_KV_IDX, %sparse_kv_idx_offset_21 : !tt.ptr, i32 loc(#loc313) + %kv_start_44 = tt.load %kv_indices_43 : !tt.ptr loc(#loc314) + %kv_start_45 = arith.muli %kv_start_44, %c128_i32 : i32 loc(#loc315) + %kv_num_blocks_46 = tt.addptr %arg_FULL_KV_NUM_BLKS, %sparse_kv_num_blks_offset_19 : !tt.ptr, i32 loc(#loc316) + %kv_num_blocks_47 = tt.load %kv_num_blocks_46 : !tt.ptr loc(#loc317) + %block_n_end_48 = arith.muli %kv_num_blocks_47, %c2_i32 : i32 loc(#loc318) + %block_n_end_49 = arith.minsi %block_n_end_48, %block_n_end_39 : i32 loc(#loc319) + %offs_n_50 = tt.splat %kv_start_45 : i32 -> tensor<64xi32> loc(#loc320) + %offs_n_51 = arith.addi %offs_n_50, %offs_n : tensor<64xi32> loc(#loc320) + %3 = tt.expand_dims %offs_n_51 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> loc(#loc152) + %kv_offset_52:5 = scf.for %start_n = %c0_i32 to %block_n_end_49 step %c1_i32 iter_args(%acc_66 = %kv_offset#0, %l_i_67 = %kv_offset#1, %m_i = %kv_offset#2, %offs_n_68 = %3, %kv_offset_69 = %c0_i32) -> (tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32) : i32 { + %kv_base_offset = arith.addi %kv_start_45, %kv_offset_69 : i32 loc(#loc407) + %offs_n_load = tt.splat %kv_base_offset : i32 -> tensor<64xi32> loc(#loc408) + %offs_n_load_70 = arith.addi %offs_n_load, %offs_n : tensor<64xi32> loc(#loc408) + %ptr_71 = tt.expand_dims %offs_n_load_70 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc469) + %ptr_72 = arith.muli %ptr_71, %cst : tensor<64x1xi32> loc(#loc470) + %ptr_73 = tt.splat %K : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc471) + %ptr_74 = tt.addptr %ptr_73, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc471) + %ptr_75 = tt.broadcast %ptr_74 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc472) + %ptr_76 = tt.broadcast %ptr_28 : tensor<1x128xi32> -> tensor<64x128xi32> loc(#loc472) + %ptr_77 = tt.addptr %ptr_75, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc472) + %k = tt.splat %ks0 : i32 -> tensor<64x1xi32> loc(#loc473) + %k_78 = arith.cmpi slt, %ptr_71, %k : tensor<64x1xi32> loc(#loc473) + %k_79 = tt.broadcast %k_78 : tensor<64x1xi1> -> tensor<64x128xi1> loc(#loc474) + %k_80 = tt.load %ptr_77, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc474) + %k_81 = tt.trans %k_80 {order = array} : tensor<64x128xbf16> -> tensor<128x64xbf16> loc(#loc410) + %qk = tt.dot %q_34, %k_81, %cst_10, inputPrecision = tf32 : tensor<128x128xbf16> * tensor<128x64xbf16> -> tensor<128x64xf32> loc(#loc411) + %qk_82 = arith.mulf %qk, %cst_9 : tensor<128x64xf32> loc(#loc412) + %post_mod_scores = tt.splat %ks0 : i32 -> tensor<1x64xi32> loc(#loc413) + %post_mod_scores_83 = arith.cmpi slt, %offs_n_68, %post_mod_scores : tensor<1x64xi32> loc(#loc413) + %post_mod_scores_84 = tt.broadcast %post_mod_scores_83 : tensor<1x64xi1> -> tensor<128x64xi1> loc(#loc414) + %post_mod_scores_85 = arith.select %post_mod_scores_84, %qk_82, %cst_8 : tensor<128x64xi1>, tensor<128x64xf32> loc(#loc414) + %post_mod_scores_86 = arith.mulf %post_mod_scores_85, %cst_4 : tensor<128x64xf32> loc(#loc415) + %m_ij = "tt.reduce"(%post_mod_scores_86) <{axis = 1 : i32}> ({ + ^bb0(%m_ij_119: f32 loc(callsite(#loc1 at #loc416)), %m_ij_120: f32 loc(callsite(#loc1 at #loc416))): + %m_ij_121 = arith.maxnumf %m_ij_119, %m_ij_120 : f32 loc(#loc502) + tt.reduce.return %m_ij_121 : f32 loc(#loc475) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc475) + %m_ij_87 = arith.maxnumf %m_i, %m_ij : tensor<128xf32> loc(#loc417) + %masked_out_rows = arith.cmpf oeq, %m_ij_87, %cst_3 : tensor<128xf32> loc(#loc418) + %m_ij_masked = arith.select %masked_out_rows, %cst_14, %m_ij_87 : tensor<128xi1>, tensor<128xf32> loc(#loc419) + %alpha = arith.subf %m_i, %m_ij_masked : tensor<128xf32> loc(#loc420) + %alpha_88 = math.exp2 %alpha : tensor<128xf32> loc(#loc421) + %p = tt.expand_dims %m_ij_masked {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc422) + %p_89 = tt.broadcast %p : tensor<128x1xf32> -> tensor<128x64xf32> loc(#loc423) + %p_90 = arith.subf %post_mod_scores_86, %p_89 : tensor<128x64xf32> loc(#loc423) + %p_91 = math.exp2 %p_90 : tensor<128x64xf32> loc(#loc424) + %l_i_92 = arith.mulf %l_i_67, %alpha_88 : tensor<128xf32> loc(#loc425) + %l_i_93 = "tt.reduce"(%p_91) <{axis = 1 : i32}> ({ + ^bb0(%l_i_119: f32 loc(callsite(#loc1 at #loc426)), %l_i_120: f32 loc(callsite(#loc1 at #loc426))): + %l_i_121 = arith.addf %l_i_119, %l_i_120 : f32 loc(#loc503) + tt.reduce.return %l_i_121 : f32 loc(#loc477) + }) : (tensor<128x64xf32>) -> tensor<128xf32> loc(#loc477) + %l_i_94 = arith.addf %l_i_92, %l_i_93 : tensor<128xf32> loc(#loc427) + %acc_95 = tt.expand_dims %alpha_88 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc428) + %acc_96 = tt.broadcast %acc_95 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc429) + %acc_97 = arith.mulf %acc_66, %acc_96 : tensor<128x128xf32> loc(#loc429) + %ptr_98 = tt.splat %V : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc479) + %ptr_99 = tt.addptr %ptr_98, %ptr_72 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc479) + %ptr_100 = tt.broadcast %ptr_99 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr> loc(#loc480) + %ptr_101 = tt.addptr %ptr_100, %ptr_76 : tensor<64x128x!tt.ptr>, tensor<64x128xi32> loc(#loc480) + %v = tt.load %ptr_101, %k_79, %cst_0 : tensor<64x128x!tt.ptr> loc(#loc481) + %acc_102 = arith.truncf %p_91 : tensor<128x64xf32> to tensor<128x64xbf16> loc(#loc431) + %acc_103 = tt.dot %acc_102, %v, %acc_97, inputPrecision = tf32 : tensor<128x64xbf16> * tensor<64x128xbf16> -> tensor<128x128xf32> loc(#loc432) + %cur_block_idx = arith.divsi %start_n, %c2_i32 : i32 loc(#loc482) + %cur_block = tt.addptr %kv_indices_43, %cur_block_idx : !tt.ptr, i32 loc(#loc483) + %cur_block_104 = tt.load %cur_block evictionPolicy = evict_last : !tt.ptr loc(#loc484) + %next_block = arith.addi %cur_block_idx, %c1_i32 : i32 loc(#loc485) + %next_block_105 = arith.cmpi slt, %next_block, %kv_num_blocks_47 : i32 loc(#loc486) + %next_block_106 = tt.addptr %cur_block, %c1_i32 : !tt.ptr, i32 loc(#loc487) + %next_block_107 = tt.load %next_block_106, %next_block_105 evictionPolicy = evict_last : !tt.ptr loc(#loc488) + %needs_jump = arith.addi %start_n, %c1_i32 : i32 loc(#loc489) + %needs_jump_108 = arith.remsi %needs_jump, %c2_i32 : i32 loc(#loc490) + %needs_jump_109 = arith.cmpi eq, %needs_jump_108, %c0_i32 : i32 loc(#loc491) + %jump_to_block = arith.subi %next_block_107, %cur_block_104 : i32 loc(#loc492) + %jump_to_block_110 = arith.muli %jump_to_block, %c128_i32 : i32 loc(#loc493) + %jump_to_block_111 = arith.subi %jump_to_block_110, %c64_i32 : i32 loc(#loc494) + %offset = arith.extui %needs_jump_109 : i1 to i32 loc(#loc495) + %offset_112 = arith.muli %jump_to_block_111, %offset : i32 loc(#loc495) + %offset_113 = arith.subi %c1_i32, %offset : i32 loc(#loc496) + %offset_114 = arith.muli %offset_113, %c64_i32 : i32 loc(#loc497) + %offset_115 = arith.addi %offset_112, %offset_114 : i32 loc(#loc498) + %offs_n_116 = tt.splat %offset_115 : i32 -> tensor<1x64xi32> loc(#loc434) + %offs_n_117 = arith.addi %offs_n_68, %offs_n_116 : tensor<1x64xi32> loc(#loc434) + %kv_offset_118 = arith.addi %kv_offset_69, %offset_115 : i32 loc(#loc435) + scf.yield %acc_103, %l_i_94, %m_ij_87, %offs_n_117, %kv_offset_118 : tensor<128x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x64xi32>, i32 loc(#loc322) + } loc(#loc506) + %l_i_53 = arith.cmpf oeq, %kv_offset_52#1, %cst_14 : tensor<128xf32> loc(#loc323) + %l_i_54 = arith.select %l_i_53, %l_i, %kv_offset_52#1 : tensor<128xi1>, tensor<128xf32> loc(#loc189) + %acc_55 = tt.expand_dims %l_i_54 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32> loc(#loc324) + %acc_56 = tt.broadcast %acc_55 : tensor<128x1xf32> -> tensor<128x128xf32> loc(#loc325) + %acc_57 = arith.divf %kv_offset_52#0, %acc_56 : tensor<128x128xf32> loc(#loc325) + %mask_58 = arith.cmpi slt, %ptr_28, %mask : tensor<1x128xi32> loc(#loc188) + %mask_59 = tt.broadcast %mask_58 : tensor<1x128xi1> -> tensor<128x128xi1> loc(#loc326) + %mask_60 = arith.andi %q_33, %mask_59 : tensor<128x128xi1> loc(#loc326) + %4 = tt.splat %q_offset_15 : i32 -> tensor<1x128xi32> loc(#loc158) + %5 = arith.addi %ptr_28, %4 : tensor<1x128xi32> loc(#loc158) + %6 = tt.broadcast %5 : tensor<1x128xi32> -> tensor<128x128xi32> loc(#loc159) + %7 = tt.broadcast %ptr_25 : tensor<128x1xi32> -> tensor<128x128xi32> loc(#loc159) + %8 = arith.addi %6, %7 : tensor<128x128xi32> loc(#loc159) + %9 = tt.splat %q_offset : i32 -> tensor<128x128xi32> loc(#loc160) + %10 = arith.addi %8, %9 : tensor<128x128xi32> loc(#loc160) + %11 = tt.splat %out_ptr0 : !tt.ptr -> tensor<128x128x!tt.ptr> loc(#loc161) + %12 = tt.addptr %11, %10 : tensor<128x128x!tt.ptr>, tensor<128x128xi32> loc(#loc161) + %13 = arith.truncf %acc_57 : tensor<128x128xf32> to tensor<128x128xbf16> loc(#loc162) + tt.store %12, %13, %mask_60 : tensor<128x128x!tt.ptr> loc(#loc162) + %off_hz = arith.muli %off_zq, %HQ : i32 loc(#loc327) + %off_hz_61 = arith.addi %off_hz, %off_hq : i32 loc(#loc328) + %l_ptrs = arith.muli %off_hz_61, %Q_LEN : i32 loc(#loc329) + %l_ptrs_62 = tt.addptr %arg_LSE, %l_ptrs : !tt.ptr, i32 loc(#loc330) + %l_ptrs_63 = tt.splat %l_ptrs_62 : !tt.ptr -> tensor<128x!tt.ptr> loc(#loc331) + %l_ptrs_64 = tt.addptr %l_ptrs_63, %offs_m_24 : tensor<128x!tt.ptr>, tensor<128xi32> loc(#loc331) + %lse = math.log2 %l_i_54 : tensor<128xf32> loc(#loc332) + %lse_65 = arith.addf %kv_offset_52#2, %lse : tensor<128xf32> loc(#loc333) + %14 = arith.cmpi slt, %offs_m_24, %cst_11 : tensor<128xi32> loc(#loc7) + tt.store %l_ptrs_64, %lse_65, %14 : tensor<128x!tt.ptr> loc(#loc170) + tt.return loc(#loc171) + } loc(#loc) +} loc(#loc) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:23) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":146:101) +#loc5 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":136:19) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:48) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:38) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:34) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":91:12) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":90:9) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:54) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":86:63) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":97:28) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":98:27) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":99:27) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":103:23) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":104:24) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:24) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":107:36) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:25) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:47) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":108:37) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":111:12) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":112:12) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":113:12) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":130:25) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:51) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":142:74) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:46) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:97) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":143:64) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:46) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":144:33) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:27) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:38) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:20) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:56) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":284:49) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":292:52) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":151:26) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:23) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":152:37) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:42) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":153:28) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:45) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:22) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:92) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":41:28) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:102) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":154:65) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:37) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":159:24) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":167:48) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":502:40) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":342:32) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":346:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":347:107) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":349:17) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":351:19) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":353:14) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":257:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":358:36) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":359:36) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:44) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":367:69) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":373:23) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":374:23) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:33) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":376:23) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":377:22) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":378:23) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":379:23) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":380:23) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":381:23) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":384:24) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":385:24) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":387:25) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":388:92) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":391:24) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":392:24) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":393:39) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":394:25) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":395:24) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":396:24) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":397:23) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":398:25) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":399:25) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":400:92) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":402:24) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":403:24) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":404:39) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":405:25) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":406:24) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":407:24) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":412:73) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":414:69) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":417:27) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":189:40) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":168:27) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":421:27) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":423:35) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":424:51) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:31) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":428:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:51) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:39) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":429:21) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:16) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":434:24) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:22) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":436:16) +#loc120 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":439:107) +#loc121 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:22) +#loc122 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":440:44) +#loc123 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":247:33) +#loc124 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":545:63) +#loc125 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:38) +#loc126 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":248:24) +#loc127 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:109) +#loc128 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:113) +#loc129 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:55) +#loc130 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":249:25) +#loc131 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:30) +#loc132 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:35) +#loc133 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":250:60) +#loc134 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:34) +#loc135 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:48) +#loc136 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":251:63) +#loc137 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:29) +#loc138 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:47) +#loc139 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:61) +#loc140 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":252:42) +#loc141 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":548:26) +#loc142 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":549:21) +#loc143 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":549:8) +#loc144 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":181:35) +#loc145 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:27) +#loc146 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":182:41) +#loc147 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:51) +#loc148 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":183:32) +#loc149 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:49) +#loc150 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":184:69) +#loc151 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":186:28) +#loc152 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":193:52) +#loc154 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":206:26) +#loc155 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:20) +#loc156 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":208:16) +#loc157 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":214:30) +#loc158 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:49) +#loc159 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:62) +#loc160 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:75) +#loc161 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:25) +#loc162 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":218:109) +#loc163 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:26) +#loc164 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":221:31) +#loc165 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:32) +#loc166 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:23) +#loc167 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":222:40) +#loc168 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:33) +#loc169 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":223:20) +#loc170 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":227:29) +#loc171 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fc/cfclrfzjl3tubzdja6ambskjxk6u7eawljcyu54gp6fv5g4t6zls.py":229:4) +#loc185 = loc(callsite(#loc1 at #loc2)) +#loc186 = loc("q"(#loc4)) +#loc187 = loc("acc"(#loc6)) +#loc188 = loc("mask"(#loc8)) +#loc189 = loc("l_i"(#loc9)) +#loc190 = loc("Q_LEN"(#loc10)) +#loc191 = loc("HQ"(#loc11)) +#loc192 = loc("q_start"(#loc14)) +#loc193 = loc("off_zq"(#loc15)) +#loc194 = loc("off_hq"(#loc16)) +#loc195 = loc("off_zkv"(#loc17)) +#loc196 = loc("off_hkv"(#loc18)) +#loc197 = loc("q_offset"(#loc19)) +#loc198 = loc("q_offset"(#loc20)) +#loc199 = loc("q_offset"(#loc21)) +#loc200 = loc("k_offset"(#loc22)) +#loc201 = loc("k_offset"(#loc23)) +#loc202 = loc("k_offset"(#loc24)) +#loc203 = loc("Q"(#loc25)) +#loc204 = loc("K"(#loc26)) +#loc205 = loc("V"(#loc27)) +#loc206 = loc("stride_kv_idx_h"(#loc28)) +#loc207 = loc("sparse_kv_num_blks_offset"(#loc29)) +#loc208 = loc("sparse_kv_num_blks_offset"(#loc30)) +#loc209 = loc("sparse_kv_idx_offset"(#loc31)) +#loc210 = loc("sparse_kv_idx_offset"(#loc32)) +#loc211 = loc("sparse_kv_idx_offset"(#loc33)) +#loc212 = loc("offs_m"(#loc34)) +#loc213 = loc("offs_m"(#loc35)) +#loc214 = loc("offs_m"(#loc36)) +#loc215 = loc("ptr"(#loc37)) +#loc216 = loc("ptr"(#loc38)) +#loc217 = loc("ptr"(#loc39)) +#loc218 = loc("ptr"(#loc40)) +#loc219 = loc("ptr"(#loc41)) +#loc220 = loc("kv_indices"(#loc43)) +#loc221 = loc("kv_start"(#loc44)) +#loc222 = loc("kv_start"(#loc45)) +#loc223 = loc("kv_num_blocks"(#loc46)) +#loc224 = loc("kv_num_blocks"(#loc47)) +#loc225 = loc("block_n_end"(#loc48)) +#loc226 = loc("block_n_end"(#loc50)) +#loc227 = loc("block_n_end"(#loc52)) +#loc228 = loc("block_n_end"(#loc53)) +#loc229 = loc("offs_n"(#loc54)) +#loc230 = loc("offs_n"(#loc55)) +#loc231 = loc("acc"(#loc57)) +#loc232 = loc("kv_base_offset"(#loc58)) +#loc234 = loc("offs_n_load"(#loc60)) +#loc235 = loc("k"(#loc61)) +#loc236 = loc("k"(#loc62)) +#loc237 = loc("qk"(#loc63)) +#loc238 = loc("qk"(#loc64)) +#loc239 = loc("m"(#loc66)) +#loc240 = loc("n"(#loc67)) +#loc241 = loc("post_mod_scores"(#loc68)) +#loc242 = loc("post_mod_scores"(#loc69)) +#loc243 = loc("tmp4"(#loc70)) +#loc244 = loc("tmp5"(#loc71)) +#loc245 = loc("tmp7"(#loc72)) +#loc246 = loc("tmp7"(#loc73)) +#loc247 = loc("tmp8"(#loc74)) +#loc248 = loc("tmp9"(#loc75)) +#loc249 = loc("tmp10"(#loc76)) +#loc250 = loc("tmp11"(#loc77)) +#loc251 = loc("tmp12"(#loc78)) +#loc252 = loc("tmp15"(#loc79)) +#loc253 = loc("tmp16"(#loc80)) +#loc254 = loc("tmp18"(#loc81)) +#loc255 = loc("tmp19"(#loc82)) +#loc256 = loc("tmp22"(#loc83)) +#loc257 = loc("tmp23"(#loc84)) +#loc258 = loc("tmp24"(#loc85)) +#loc259 = loc("tmp25"(#loc86)) +#loc260 = loc("tmp26"(#loc87)) +#loc261 = loc("tmp27"(#loc88)) +#loc262 = loc("tmp28"(#loc89)) +#loc263 = loc("tmp29"(#loc90)) +#loc264 = loc("tmp30"(#loc91)) +#loc265 = loc("tmp31"(#loc92)) +#loc266 = loc("tmp33"(#loc93)) +#loc267 = loc("tmp34"(#loc94)) +#loc268 = loc("tmp35"(#loc95)) +#loc269 = loc("tmp36"(#loc96)) +#loc270 = loc("tmp37"(#loc97)) +#loc271 = loc("tmp38"(#loc98)) +#loc272 = loc("mask_mod_output"(#loc99)) +#loc273 = loc("post_mod_scores"(#loc100)) +#loc274 = loc("post_mod_scores"(#loc101)) +#loc276 = loc("m_ij"(#loc105)) +#loc277 = loc("masked_out_rows"(#loc106)) +#loc278 = loc("m_ij_masked"(#loc107)) +#loc279 = loc("alpha"(#loc108)) +#loc280 = loc("alpha"(#loc109)) +#loc281 = loc("p"(#loc110)) +#loc282 = loc("p"(#loc111)) +#loc283 = loc("p"(#loc112)) +#loc284 = loc("l_i"(#loc113)) +#loc286 = loc("l_i"(#loc117)) +#loc287 = loc("acc"(#loc118)) +#loc288 = loc("acc"(#loc119)) +#loc289 = loc("v"(#loc120)) +#loc290 = loc("acc"(#loc121)) +#loc291 = loc("acc"(#loc122)) +#loc292 = loc("cur_block_idx"(#loc123)) +#loc293 = loc("offset"(#loc124)) +#loc294 = loc("cur_block"(#loc125)) +#loc295 = loc("cur_block"(#loc126)) +#loc296 = loc("next_block"(#loc127)) +#loc297 = loc("next_block"(#loc128)) +#loc298 = loc("next_block"(#loc129)) +#loc299 = loc("next_block"(#loc130)) +#loc300 = loc("needs_jump"(#loc131)) +#loc301 = loc("needs_jump"(#loc132)) +#loc302 = loc("needs_jump"(#loc133)) +#loc303 = loc("jump_to_block"(#loc134)) +#loc304 = loc("jump_to_block"(#loc135)) +#loc305 = loc("jump_to_block"(#loc136)) +#loc306 = loc("offset"(#loc137)) +#loc307 = loc("offset"(#loc138)) +#loc308 = loc("offset"(#loc139)) +#loc309 = loc("offset"(#loc140)) +#loc310 = loc("offs_n"(#loc141)) +#loc311 = loc("kv_offset"(#loc142)) +#loc312 = loc(callsite(#loc143 at #loc2)) +#loc313 = loc("kv_indices"(#loc144)) +#loc314 = loc("kv_start"(#loc145)) +#loc315 = loc("kv_start"(#loc146)) +#loc316 = loc("kv_num_blocks"(#loc147)) +#loc317 = loc("kv_num_blocks"(#loc148)) +#loc318 = loc("block_n_end"(#loc149)) +#loc319 = loc("block_n_end"(#loc150)) +#loc320 = loc("offs_n"(#loc151)) +#loc322 = loc(callsite(#loc143 at #loc153)) +#loc323 = loc("l_i"(#loc154)) +#loc324 = loc("acc"(#loc155)) +#loc325 = loc("acc"(#loc156)) +#loc326 = loc("mask"(#loc157)) +#loc327 = loc("off_hz"(#loc163)) +#loc328 = loc("off_hz"(#loc164)) +#loc329 = loc("l_ptrs"(#loc165)) +#loc330 = loc("l_ptrs"(#loc166)) +#loc331 = loc("l_ptrs"(#loc167)) +#loc332 = loc("lse"(#loc168)) +#loc333 = loc("lse"(#loc169)) +#loc334 = loc(callsite(#loc3 at #loc186)) +#loc335 = loc(callsite(#loc5 at #loc187)) +#loc336 = loc(callsite(#loc215 at #loc186)) +#loc337 = loc(callsite(#loc216 at #loc186)) +#loc338 = loc(callsite(#loc217 at #loc186)) +#loc339 = loc(callsite(#loc218 at #loc186)) +#loc340 = loc(callsite(#loc219 at #loc186)) +#loc341 = loc(callsite(#loc42 at #loc186)) +#loc342 = loc(callsite(#loc49 at #loc226)) +#loc343 = loc(callsite(#loc51 at #loc226)) +#loc344 = loc("l_i"(#loc231)) +#loc345 = loc(callsite(#loc232 at #loc233)) +#loc346 = loc(callsite(#loc234 at #loc233)) +#loc347 = loc(callsite(#loc235 at #loc233)) +#loc348 = loc(callsite(#loc236 at #loc233)) +#loc349 = loc(callsite(#loc237 at #loc233)) +#loc350 = loc(callsite(#loc238 at #loc233)) +#loc351 = loc(callsite(#loc239 at #loc233)) +#loc352 = loc(callsite(#loc240 at #loc233)) +#loc353 = loc(callsite(#loc241 at #loc233)) +#loc354 = loc(callsite(#loc242 at #loc233)) +#loc355 = loc(callsite(#loc243 at #loc233)) +#loc356 = loc(callsite(#loc244 at #loc233)) +#loc357 = loc(callsite(#loc245 at #loc233)) +#loc358 = loc(callsite(#loc246 at #loc233)) +#loc359 = loc(callsite(#loc247 at #loc233)) +#loc360 = loc(callsite(#loc248 at #loc233)) +#loc361 = loc(callsite(#loc249 at #loc233)) +#loc362 = loc(callsite(#loc250 at #loc233)) +#loc363 = loc(callsite(#loc251 at #loc233)) +#loc364 = loc(callsite(#loc252 at #loc233)) +#loc365 = loc(callsite(#loc253 at #loc233)) +#loc366 = loc(callsite(#loc254 at #loc233)) +#loc367 = loc(callsite(#loc255 at #loc233)) +#loc368 = loc(callsite(#loc256 at #loc233)) +#loc369 = loc(callsite(#loc257 at #loc233)) +#loc370 = loc(callsite(#loc258 at #loc233)) +#loc371 = loc(callsite(#loc259 at #loc233)) +#loc372 = loc(callsite(#loc260 at #loc233)) +#loc373 = loc(callsite(#loc261 at #loc233)) +#loc374 = loc(callsite(#loc262 at #loc233)) +#loc375 = loc(callsite(#loc263 at #loc233)) +#loc376 = loc(callsite(#loc264 at #loc233)) +#loc377 = loc(callsite(#loc265 at #loc233)) +#loc378 = loc(callsite(#loc266 at #loc233)) +#loc379 = loc(callsite(#loc267 at #loc233)) +#loc380 = loc(callsite(#loc268 at #loc233)) +#loc381 = loc(callsite(#loc269 at #loc233)) +#loc382 = loc(callsite(#loc270 at #loc233)) +#loc383 = loc(callsite(#loc271 at #loc233)) +#loc384 = loc(callsite(#loc272 at #loc233)) +#loc385 = loc(callsite(#loc273 at #loc233)) +#loc386 = loc(callsite(#loc274 at #loc233)) +#loc388 = loc(callsite(#loc276 at #loc233)) +#loc389 = loc(callsite(#loc277 at #loc233)) +#loc390 = loc(callsite(#loc278 at #loc233)) +#loc391 = loc(callsite(#loc279 at #loc233)) +#loc392 = loc(callsite(#loc280 at #loc233)) +#loc393 = loc(callsite(#loc281 at #loc233)) +#loc394 = loc(callsite(#loc282 at #loc233)) +#loc395 = loc(callsite(#loc283 at #loc233)) +#loc396 = loc(callsite(#loc284 at #loc233)) +#loc398 = loc(callsite(#loc286 at #loc233)) +#loc399 = loc(callsite(#loc287 at #loc233)) +#loc400 = loc(callsite(#loc288 at #loc233)) +#loc401 = loc(callsite(#loc289 at #loc233)) +#loc402 = loc(callsite(#loc290 at #loc233)) +#loc403 = loc(callsite(#loc291 at #loc233)) +#loc404 = loc(callsite(#loc293 at #loc2)) +#loc405 = loc(callsite(#loc310 at #loc2)) +#loc406 = loc(callsite(#loc311 at #loc2)) +#loc407 = loc(callsite(#loc232 at #loc321)) +#loc408 = loc(callsite(#loc234 at #loc321)) +#loc409 = loc(callsite(#loc235 at #loc321)) +#loc410 = loc(callsite(#loc236 at #loc321)) +#loc411 = loc(callsite(#loc237 at #loc321)) +#loc412 = loc(callsite(#loc238 at #loc321)) +#loc413 = loc(callsite(#loc241 at #loc321)) +#loc414 = loc(callsite(#loc242 at #loc321)) +#loc415 = loc(callsite(#loc274 at #loc321)) +#loc417 = loc(callsite(#loc276 at #loc321)) +#loc418 = loc(callsite(#loc277 at #loc321)) +#loc419 = loc(callsite(#loc278 at #loc321)) +#loc420 = loc(callsite(#loc279 at #loc321)) +#loc421 = loc(callsite(#loc280 at #loc321)) +#loc422 = loc(callsite(#loc281 at #loc321)) +#loc423 = loc(callsite(#loc282 at #loc321)) +#loc424 = loc(callsite(#loc283 at #loc321)) +#loc425 = loc(callsite(#loc284 at #loc321)) +#loc427 = loc(callsite(#loc286 at #loc321)) +#loc428 = loc(callsite(#loc287 at #loc321)) +#loc429 = loc(callsite(#loc288 at #loc321)) +#loc430 = loc(callsite(#loc289 at #loc321)) +#loc431 = loc(callsite(#loc290 at #loc321)) +#loc432 = loc(callsite(#loc291 at #loc321)) +#loc433 = loc(callsite(#loc293 at #loc153)) +#loc434 = loc(callsite(#loc310 at #loc153)) +#loc435 = loc(callsite(#loc311 at #loc153)) +#loc436 = loc("m_i"(#loc344)) +#loc437 = loc(callsite(#loc215 at #loc347)) +#loc438 = loc(callsite(#loc216 at #loc347)) +#loc439 = loc(callsite(#loc217 at #loc347)) +#loc440 = loc(callsite(#loc219 at #loc347)) +#loc441 = loc(callsite(#loc42 at #loc347)) +#loc442 = loc(callsite(#loc3 at #loc347)) +#loc443 = loc(callsite(#loc65 at #loc351)) +#loc444 = loc(callsite(#loc65 at #loc352)) +#loc445 = loc(callsite(#loc102 at #loc387)) +#loc447 = loc(callsite(#loc114 at #loc397)) +#loc449 = loc(callsite(#loc217 at #loc401)) +#loc450 = loc(callsite(#loc219 at #loc401)) +#loc451 = loc(callsite(#loc3 at #loc401)) +#loc452 = loc(callsite(#loc292 at #loc404)) +#loc453 = loc(callsite(#loc294 at #loc404)) +#loc454 = loc(callsite(#loc295 at #loc404)) +#loc455 = loc(callsite(#loc296 at #loc404)) +#loc456 = loc(callsite(#loc297 at #loc404)) +#loc457 = loc(callsite(#loc298 at #loc404)) +#loc458 = loc(callsite(#loc299 at #loc404)) +#loc459 = loc(callsite(#loc300 at #loc404)) +#loc460 = loc(callsite(#loc301 at #loc404)) +#loc461 = loc(callsite(#loc302 at #loc404)) +#loc462 = loc(callsite(#loc303 at #loc404)) +#loc463 = loc(callsite(#loc304 at #loc404)) +#loc464 = loc(callsite(#loc305 at #loc404)) +#loc465 = loc(callsite(#loc306 at #loc404)) +#loc466 = loc(callsite(#loc307 at #loc404)) +#loc467 = loc(callsite(#loc308 at #loc404)) +#loc468 = loc(callsite(#loc309 at #loc404)) +#loc469 = loc(callsite(#loc215 at #loc409)) +#loc470 = loc(callsite(#loc216 at #loc409)) +#loc471 = loc(callsite(#loc217 at #loc409)) +#loc472 = loc(callsite(#loc219 at #loc409)) +#loc473 = loc(callsite(#loc42 at #loc409)) +#loc474 = loc(callsite(#loc3 at #loc409)) +#loc475 = loc(callsite(#loc102 at #loc416)) +#loc477 = loc(callsite(#loc114 at #loc426)) +#loc479 = loc(callsite(#loc217 at #loc430)) +#loc480 = loc(callsite(#loc219 at #loc430)) +#loc481 = loc(callsite(#loc3 at #loc430)) +#loc482 = loc(callsite(#loc292 at #loc433)) +#loc483 = loc(callsite(#loc294 at #loc433)) +#loc484 = loc(callsite(#loc295 at #loc433)) +#loc485 = loc(callsite(#loc296 at #loc433)) +#loc486 = loc(callsite(#loc297 at #loc433)) +#loc487 = loc(callsite(#loc298 at #loc433)) +#loc488 = loc(callsite(#loc299 at #loc433)) +#loc489 = loc(callsite(#loc300 at #loc433)) +#loc490 = loc(callsite(#loc301 at #loc433)) +#loc491 = loc(callsite(#loc302 at #loc433)) +#loc492 = loc(callsite(#loc303 at #loc433)) +#loc493 = loc(callsite(#loc304 at #loc433)) +#loc494 = loc(callsite(#loc305 at #loc433)) +#loc495 = loc(callsite(#loc306 at #loc433)) +#loc496 = loc(callsite(#loc307 at #loc433)) +#loc497 = loc(callsite(#loc308 at #loc433)) +#loc498 = loc(callsite(#loc309 at #loc433)) +#loc499 = loc("offs_n"(#loc436)) +#loc500 = loc(callsite(#loc104 at #loc445)) +#loc501 = loc(callsite(#loc116 at #loc447)) +#loc502 = loc(callsite(#loc104 at #loc475)) +#loc503 = loc(callsite(#loc116 at #loc477)) +#loc504 = loc("kv_offset"(#loc499)) +#loc505 = loc(callsite(#loc504 at #loc2)) +#loc506 = loc(callsite(#loc504 at #loc153)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4b7c8056918bbdbe13fe22f3d7f1c4d40eac11c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2e407c86b48fa24511652dde1376aec978918a9f Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f63e3783c40a4f8173585ceb6a84f45083fc2ae5 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "ebfdec6a3c082a052340cf1ce53a4289947fc953ecb0b9bf91b98341040940f3", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 2, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 1024, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..95328530ebd6a0530e689df49a0a73441f873495 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1774 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 102, 104, 47, 99, 102, 104, 119, 55, 106, 115, 114, 53, 115, 107, 55, 52, 107, 99, 104, 100, 104, 113, 111, 103, 51, 113, 121, 113, 55, 103, 101, 51, 113, 106, 113, 109, 109, 113, 119, 121, 51, 55, 108, 114, 50, 98, 101, 52, 108, 112, 121, 107, 122, 120, 51, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 102, 104, 47, 99, 102, 104, 119, 55, 106, 115, 114, 53, 115, 107, 55, 52, 107, 99, 104, 100, 104, 113, 111, 103, 51, 113, 121, 113, 55, 103, 101, 51, 113, 106, 113, 109, 109, 113, 119, 121, 51, 55, 108, 114, 50, 98, 101, 52, 108, 112, 121, 107, 122, 120, 51, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<204>; + .reg .b32 %r<600>; + .reg .b64 %rd<78>; + .loc 1 18 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:18:0 + +// %bb.0: + ld.param.b64 %rd14, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:24:28 + mov.u32 %r16, %ctaid.x; + .loc 1 24 33 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:24:33 + shl.b32 %r1, %r16, 3; + .loc 1 25 44 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 56; + bfe.u32 %r17, %r2, 3, 3; + and.b32 %r4, %r2, 7; + .loc 1 25 23 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:25:23 + or.b32 %r5, %r17, %r1; + .loc 1 26 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:26:21 + setp.gt.s32 %p2, %r5, 31; + setp.lt.s32 %p1, %r5, 32; + .loc 1 27 38 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:27:38 + shl.b32 %r6, %r4, 1; + or.b32 %r7, %r6, 1; + .loc 1 34 40 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:40 + shl.b32 %r18, %r5, 4; + .loc 1 34 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:37 + or.b32 %r19, %r18, %r6; + .loc 1 34 30 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:30 + mad.wide.s32 %rd13, %r19, 8, %rd14; + .loc 1 34 45 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p1 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + .loc 1 39 18 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:39:18 + add.s64 %rd15, %rd11, -1; + setp.gt.u64 %p3, %rd15, 16382; + setp.lt.u64 %p4, %rd15, 16383; + add.s64 %rd16, %rd12, -1; + setp.lt.u64 %p5, %rd16, 16383; + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + selp.b32 %r20, 1, 0, %p4; + selp.b32 %r21, 1, 0, %p5; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r22, %r2, 1; + shr.u32 %r23, %r2, 1; + bfe.u32 %r24, %r2, 1, 1; + shr.u32 %r25, %r2, 2; + bfe.u32 %r26, %r2, 2, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r27, %r22, 1; + xor.b32 %r28, %r24, 1; + xor.b32 %r29, %r26, 1; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p6, %p5, %p3; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ne.b32 %p7, %r22, 0; + xor.pred %p8, %p6, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r30, %r20, %r21; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r31, %r30, 0, %p8; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r32, %r31, %r20; + xor.b32 %r33, %r31, %r21; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r34, %r7, %r6; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r35, %r34, 0, %p8; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r36, %r35, %r6; + xor.b32 %r37, %r35, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r38, %r32, %r27; + mul.lo.s32 %r39, %r33, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r40, %r38, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r41, %r38, %r40; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r42, %r39, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r43, %r39, %r42; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r44, %r32, %r22; + mul.lo.s32 %r45, %r33, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r46, %r44, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r47, %r44, %r46; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r45, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r49, %r45, %r48; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r50, %r36, %r27; + mul.lo.s32 %r51, %r37, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r52, %r50, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r53, %r50, %r52; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r51, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r55, %r51, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r56, %r36, %r22; + mul.lo.s32 %r57, %r37, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r58, %r56, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r59, %r56, %r58; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r60, %r57, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r61, %r57, %r60; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r62, %r23, 1; + setp.ne.b32 %p9, %r62, 0; + setp.ge.s32 %p10, %r41, %r47; + setp.ge.s32 %p11, %r43, %r49; + setp.ne.b32 %p12, %r43, %r49; + setp.ne.b32 %p13, %r41, %r47; + setp.le.s32 %p14, %r55, %r61; + setp.le.s32 %p15, %r53, %r59; + or.pred %p16, %p13, %p15; + or.pred %p17, %p12, %p14; + and.pred %p18, %p11, %p17; + and.pred %p19, %p10, %p16; + xor.pred %p20, %p19, %p9; + xor.pred %p21, %p18, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r63, %r41, %r47; + xor.b32 %r64, %r43, %r49; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r65, 0, %r64, %p21; + selp.b32 %r66, 0, %r63, %p20; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r67, %r66, %r32; + xor.b32 %r68, %r65, %r33; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r69, %r53, %r59; + xor.b32 %r70, %r55, %r61; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r71, 0, %r70, %p21; + selp.b32 %r72, 0, %r69, %p20; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r73, %r72, %r36; + xor.b32 %r74, %r71, %r37; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p22, %r67, %r68; + setp.ne.b32 %p23, %r67, %r68; + setp.le.s32 %p24, %r73, %r74; + or.pred %p25, %p23, %p24; + and.pred %p26, %p22, %p25; + xor.pred %p27, %p26, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r75, %r67, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r76, 0, %r75, %p27; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r77, %r73, %r74; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r78, 0, %r77, %p27; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r79, %r78, %r73; + xor.b32 %r80, %r78, %r74; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r81, %r79, %r28; + mul.lo.s32 %r82, %r80, %r28; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r83, %r79, %r24; + mul.lo.s32 %r84, %r80, %r24; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r85, %r25, 1; + setp.ne.b32 %p28, %r85, 0; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r86, %r76, %r68; + xor.b32 %r87, %r76, %r67; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r88, %r87, %r28; + mul.lo.s32 %r89, %r86, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r90, %r88, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r91, %r88, %r90; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r92, %r89, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r93, %r89, %r92; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r94, %r87, %r24; + mul.lo.s32 %r95, %r86, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r96, %r94, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r97, %r94, %r96; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r95, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r99, %r95, %r98; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r81, 2, 31, -1; + shfl.sync.bfly.b32 %r101, %r82, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r102, %r82, %r101; + add.s32 %r103, %r81, %r100; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r83, 2, 31, -1; + shfl.sync.bfly.b32 %r105, %r84, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r106, %r84, %r105; + add.s32 %r107, %r83, %r104; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p29, %r93, %r99; + setp.ge.s32 %p30, %r91, %r97; + setp.ne.b32 %p31, %r91, %r97; + setp.ne.b32 %p32, %r93, %r99; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r108, %r93, %r99; + xor.b32 %r109, %r91, %r97; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.le.s32 %p33, %r103, %r107; + setp.le.s32 %p34, %r102, %r106; + or.pred %p35, %p32, %p34; + or.pred %p36, %p31, %p33; + and.pred %p37, %p30, %p36; + and.pred %p38, %p29, %p35; + xor.pred %p39, %p38, %p28; + xor.pred %p40, %p37, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r110, 0, %r109, %p40; + selp.b32 %r111, 0, %r108, %p39; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r112, %r111, %r86; + xor.b32 %r113, %r110, %r87; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r114, %r102, %r106; + xor.b32 %r115, %r103, %r107; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r116, 0, %r115, %p40; + selp.b32 %r117, 0, %r114, %p39; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r118, %r117, %r80; + xor.b32 %r119, %r116, %r79; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r120, %r113, %r27; + mul.lo.s32 %r121, %r112, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r123, %r120, %r122; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r125, %r121, %r124; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r126, %r113, %r22; + mul.lo.s32 %r127, %r112, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r126, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r129, %r126, %r128; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r130, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r131, %r127, %r130; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r132, %r119, %r27; + mul.lo.s32 %r133, %r118, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r135, %r132, %r134; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r137, %r133, %r136; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r138, %r119, %r22; + mul.lo.s32 %r139, %r118, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r138, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r141, %r138, %r140; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r142, %r139, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r143, %r139, %r142; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p41, %r123, %r129; + setp.ge.s32 %p42, %r125, %r131; + setp.ne.b32 %p43, %r125, %r131; + setp.ne.b32 %p44, %r123, %r129; + setp.le.s32 %p45, %r137, %r143; + setp.le.s32 %p46, %r135, %r141; + or.pred %p47, %p44, %p46; + or.pred %p48, %p43, %p45; + and.pred %p49, %p42, %p48; + and.pred %p50, %p41, %p47; + xor.pred %p51, %p50, %p28; + xor.pred %p52, %p49, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r144, %r123, %r129; + xor.b32 %r145, %r125, %r131; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r146, 0, %r145, %p52; + selp.b32 %r147, 0, %r144, %p51; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r148, %r147, %r113; + xor.b32 %r149, %r146, %r112; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r150, %r135, %r141; + xor.b32 %r151, %r137, %r143; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r152, 0, %r151, %p52; + selp.b32 %r153, 0, %r150, %p51; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r154, %r153, %r119; + xor.b32 %r155, %r152, %r118; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p53, %r148, %r149; + setp.ne.b32 %p54, %r148, %r149; + setp.le.s32 %p55, %r154, %r155; + or.pred %p56, %p54, %p55; + and.pred %p57, %p53, %p56; + xor.pred %p58, %p57, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r156, %r148, %r149; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r157, 0, %r156, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r158, %r157, %r149; + xor.b32 %r159, %r157, %r148; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r160, %r154, %r155; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r161, 0, %r160, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r162, %r161, %r155; + xor.b32 %r163, %r161, %r154; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r164, %r159, %r29; + mul.lo.s32 %r165, %r158, %r29; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r166, %r164, 4, 31, -1; + shfl.sync.bfly.b32 %r167, %r165, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r168, %r164, %r166; + add.s32 %r169, %r165, %r167; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r170, %r159, %r26; + mul.lo.s32 %r171, %r158, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r172, %r170, 4, 31, -1; + shfl.sync.bfly.b32 %r173, %r171, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r174, %r170, %r172; + add.s32 %r175, %r171, %r173; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r176, %r163, %r29; + mul.lo.s32 %r177, %r162, %r29; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r178, %r176, 4, 31, -1; + shfl.sync.bfly.b32 %r179, %r177, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r180, %r177, %r179; + add.s32 %r181, %r176, %r178; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r182, %r163, %r26; + mul.lo.s32 %r183, %r162, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r184, %r182, 4, 31, -1; + shfl.sync.bfly.b32 %r185, %r183, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r186, %r183, %r185; + add.s32 %r187, %r182, %r184; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p59, %r169, %r175; + setp.lt.s32 %p60, %r168, %r174; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p61, %r168, %r174; + setp.eq.b32 %p62, %r169, %r175; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p63, %r181, %r187; + setp.gt.s32 %p64, %r180, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p65, %p62, %p64; + and.pred %p66, %p61, %p63; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p67, %p60, %p66; + or.pred %p68, %p59, %p65; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r188, %r168, %r174; + xor.b32 %r189, %r169, %r175; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r190, %r189, 0, %p68; + selp.b32 %r191, %r188, 0, %p67; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r192, %r191, %r159; + xor.b32 %r193, %r190, %r158; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r194, %r180, %r186; + xor.b32 %r195, %r181, %r187; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r196, %r195, 0, %p67; + selp.b32 %r197, %r194, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r198, %r197, %r162; + xor.b32 %r199, %r196, %r163; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r200, %r193, %r28; + mul.lo.s32 %r201, %r192, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r202, %r201, 2, 31, -1; + shfl.sync.bfly.b32 %r203, %r200, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r204, %r201, %r202; + add.s32 %r205, %r200, %r203; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r206, %r193, %r24; + mul.lo.s32 %r207, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1; + shfl.sync.bfly.b32 %r209, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r210, %r207, %r208; + add.s32 %r211, %r206, %r209; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r212, %r199, %r28; + mul.lo.s32 %r213, %r198, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r212, 2, 31, -1; + shfl.sync.bfly.b32 %r215, %r213, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r216, %r213, %r215; + add.s32 %r217, %r212, %r214; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r218, %r199, %r24; + mul.lo.s32 %r219, %r198, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r220, %r218, 2, 31, -1; + shfl.sync.bfly.b32 %r221, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r222, %r219, %r221; + add.s32 %r223, %r218, %r220; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p69, %r205, %r211; + setp.lt.s32 %p70, %r204, %r210; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p71, %r204, %r210; + setp.eq.b32 %p72, %r205, %r211; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p73, %r217, %r223; + setp.gt.s32 %p74, %r216, %r222; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p75, %p72, %p74; + and.pred %p76, %p71, %p73; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p77, %p70, %p76; + or.pred %p78, %p69, %p75; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r224, %r204, %r210; + xor.b32 %r225, %r205, %r211; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r226, %r225, 0, %p78; + selp.b32 %r227, %r224, 0, %p77; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r228, %r227, %r192; + xor.b32 %r229, %r226, %r193; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r230, %r217, %r223; + xor.b32 %r231, %r216, %r222; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r232, %r231, 0, %p78; + selp.b32 %r233, %r230, 0, %p77; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r234, %r233, %r199; + xor.b32 %r235, %r232, %r198; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r236, %r229, %r27; + mul.lo.s32 %r237, %r228, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r237, 1, 31, -1; + shfl.sync.bfly.b32 %r239, %r236, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r240, %r237, %r238; + add.s32 %r241, %r236, %r239; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r242, %r229, %r22; + mul.lo.s32 %r243, %r228, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r244, %r243, 1, 31, -1; + shfl.sync.bfly.b32 %r245, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r246, %r243, %r244; + add.s32 %r247, %r242, %r245; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r248, %r235, %r27; + mul.lo.s32 %r249, %r234, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r250, %r249, 1, 31, -1; + shfl.sync.bfly.b32 %r251, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r252, %r248, %r251; + add.s32 %r253, %r249, %r250; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r254, %r235, %r22; + mul.lo.s32 %r255, %r234, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r256, %r255, 1, 31, -1; + shfl.sync.bfly.b32 %r257, %r254, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r258, %r254, %r257; + add.s32 %r259, %r255, %r256; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p79, %r241, %r247; + setp.lt.s32 %p80, %r240, %r246; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p81, %r240, %r246; + setp.eq.b32 %p82, %r241, %r247; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p83, %r253, %r259; + setp.gt.s32 %p84, %r252, %r258; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p85, %p82, %p84; + and.pred %p86, %p81, %p83; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p87, %p80, %p86; + or.pred %p88, %p79, %p85; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r260, %r240, %r246; + xor.b32 %r261, %r241, %r247; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r262, %r261, 0, %p88; + selp.b32 %r263, %r260, 0, %p87; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r264, %r263, %r228; + xor.b32 %r265, %r262, %r229; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r266, %r253, %r259; + xor.b32 %r267, %r252, %r258; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r268, %r267, 0, %p88; + selp.b32 %r269, %r266, 0, %p87; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r270, %r269, %r234; + xor.b32 %r271, %r268, %r235; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p89, %r264, %r265; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p90, %r264, %r265; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p91, %r270, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r272, %r270, %r271; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r273, %r272, 0, %p91; + selp.b32 %r274, %r273, 0, %p90; + selp.b32 %r275, %r272, %r274, %p89; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r8, %r275, %r270; + xor.b32 %r9, %r275, %r271; +$L__tmp2: + .loc 1 47 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:47:20 + setp.ne.b64 %p92, %rd11, 16384; + setp.eq.b64 %p93, %rd11, 16384; + setp.eq.b64 %p94, %rd12, 16384; + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + selp.b32 %r276, 1, 0, %p93; + selp.b32 %r277, 1, 0, %p94; +$L__tmp3: + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p95, %p94, %p92; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.pred %p96, %p95, %p7; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r278, %r276, %r277; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r279, %r278, 0, %p96; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r280, %r279, %r276; + xor.b32 %r281, %r279, %r277; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r282, %r34, 0, %p96; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r283, %r282, %r6; + xor.b32 %r284, %r282, %r7; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r285, %r280, %r27; + mul.lo.s32 %r286, %r281, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r287, %r285, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r288, %r287, %r285; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r289, %r286, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r290, %r289, %r286; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r291, %r280, %r22; + mul.lo.s32 %r292, %r281, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r293, %r291, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r294, %r293, %r291; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r295, %r292, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r296, %r295, %r292; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r297, %r283, %r27; + mul.lo.s32 %r298, %r284, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r299, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r300, %r299, %r297; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r301, %r298, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r302, %r301, %r298; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r303, %r283, %r22; + mul.lo.s32 %r304, %r284, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r305, %r303, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r306, %r305, %r303; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r307, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r308, %r307, %r304; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p97, %r288, %r294; + setp.ge.s32 %p98, %r290, %r296; + setp.ne.b32 %p99, %r290, %r296; + setp.ne.b32 %p100, %r288, %r294; + setp.le.s32 %p101, %r302, %r308; + setp.le.s32 %p102, %r300, %r306; + or.pred %p103, %p100, %p102; + or.pred %p104, %p99, %p101; + and.pred %p105, %p98, %p104; + and.pred %p106, %p97, %p103; + xor.pred %p107, %p106, %p9; + xor.pred %p108, %p105, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r309, %r296, %r290; + xor.b32 %r310, %r306, %r300; + xor.b32 %r311, %r294, %r288; + xor.b32 %r312, %r308, %r302; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r313, 0, %r309, %p108; + selp.b32 %r314, 0, %r310, %p107; + selp.b32 %r315, 0, %r311, %p107; + selp.b32 %r316, 0, %r312, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r317, %r314, %r283; + xor.b32 %r318, %r313, %r281; + xor.b32 %r319, %r316, %r284; + xor.b32 %r320, %r315, %r280; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p109, %r320, %r318; + setp.ne.b32 %p110, %r318, %r320; + setp.le.s32 %p111, %r317, %r319; + or.pred %p112, %p110, %p111; + and.pred %p113, %p109, %p112; + xor.pred %p114, %p113, %p9; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r321, %r318, %r320; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r322, 0, %r321, %p114; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r323, %r322, %r320; + xor.b32 %r324, %r322, %r318; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r325, %r319, %r317; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r326, 0, %r325, %p114; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r327, %r326, %r317; + xor.b32 %r328, %r326, %r319; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r329, %r323, %r28; + mul.lo.s32 %r330, %r324, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r331, %r329, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r332, %r329, %r331; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r330, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r334, %r330, %r333; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r335, %r323, %r24; + mul.lo.s32 %r336, %r324, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r337, %r335, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r338, %r335, %r337; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r336, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r340, %r336, %r339; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r341, %r327, %r28; + mul.lo.s32 %r342, %r328, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r343, %r341, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r344, %r341, %r343; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r345, %r342, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r346, %r342, %r345; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r347, %r327, %r24; + mul.lo.s32 %r348, %r328, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r349, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r350, %r347, %r349; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r348, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r352, %r348, %r351; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p115, %r334, %r340; + setp.ge.s32 %p116, %r332, %r338; + setp.ne.b32 %p117, %r332, %r338; + setp.ne.b32 %p118, %r334, %r340; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r353, %r332, %r338; + xor.b32 %r354, %r334, %r340; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.le.s32 %p119, %r344, %r350; + setp.le.s32 %p120, %r346, %r352; + or.pred %p121, %p118, %p120; + or.pred %p122, %p117, %p119; + and.pred %p123, %p116, %p122; + and.pred %p124, %p115, %p121; + xor.pred %p125, %p124, %p28; + xor.pred %p126, %p123, %p28; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r355, 0, %r353, %p126; + selp.b32 %r356, 0, %r354, %p125; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r357, %r355, %r323; + xor.b32 %r358, %r356, %r324; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r359, %r346, %r352; + xor.b32 %r360, %r344, %r350; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r361, 0, %r360, %p126; + selp.b32 %r362, 0, %r359, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r363, %r362, %r328; + xor.b32 %r364, %r361, %r327; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r365, %r357, %r27; + mul.lo.s32 %r366, %r358, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r367, %r365, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r368, %r365, %r367; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r366, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r370, %r366, %r369; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r371, %r357, %r22; + mul.lo.s32 %r372, %r358, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r373, %r371, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r374, %r371, %r373; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r372, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r376, %r372, %r375; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r377, %r364, %r27; + mul.lo.s32 %r378, %r363, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r379, %r377, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r380, %r377, %r379; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r381, %r378, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r382, %r378, %r381; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r383, %r364, %r22; + mul.lo.s32 %r384, %r363, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r385, %r383, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r386, %r383, %r385; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r387, %r384, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r388, %r384, %r387; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p127, %r368, %r374; + setp.ge.s32 %p128, %r370, %r376; + setp.ne.b32 %p129, %r370, %r376; + setp.ne.b32 %p130, %r368, %r374; + setp.le.s32 %p131, %r382, %r388; + setp.le.s32 %p132, %r380, %r386; + or.pred %p133, %p130, %p132; + or.pred %p134, %p129, %p131; + and.pred %p135, %p128, %p134; + and.pred %p136, %p127, %p133; + xor.pred %p137, %p136, %p28; + xor.pred %p138, %p135, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r389, %r368, %r374; + xor.b32 %r390, %r370, %r376; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r391, 0, %r390, %p138; + selp.b32 %r392, 0, %r389, %p137; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r393, %r392, %r357; + xor.b32 %r394, %r391, %r358; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r395, %r380, %r386; + xor.b32 %r396, %r382, %r388; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r397, 0, %r396, %p138; + selp.b32 %r398, 0, %r395, %p137; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r399, %r398, %r364; + xor.b32 %r400, %r397, %r363; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p139, %r393, %r394; + setp.ne.b32 %p140, %r393, %r394; + setp.le.s32 %p141, %r399, %r400; + or.pred %p142, %p140, %p141; + and.pred %p143, %p139, %p142; + xor.pred %p144, %p143, %p28; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r401, %r393, %r394; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r402, 0, %r401, %p144; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r403, %r402, %r393; + xor.b32 %r404, %r402, %r394; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r405, %r399, %r400; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r406, 0, %r405, %p144; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r407, %r406, %r399; + xor.b32 %r408, %r406, %r400; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r409, %r403, %r29; + mul.lo.s32 %r410, %r404, %r29; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r411, %r409, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r412, %r409, %r411; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r413, %r410, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r414, %r410, %r413; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r415, %r403, %r26; + mul.lo.s32 %r416, %r404, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r417, %r415, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r418, %r415, %r417; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r419, %r416, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r420, %r416, %r419; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r421, %r407, %r29; + mul.lo.s32 %r422, %r408, %r29; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r423, %r421, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r424, %r421, %r423; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r425, %r422, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r426, %r422, %r425; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r427, %r407, %r26; + mul.lo.s32 %r428, %r408, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r429, %r427, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r430, %r427, %r429; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r431, %r428, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r432, %r428, %r431; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p145, %r412, %r418; + setp.lt.s32 %p146, %r414, %r420; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p147, %r414, %r420; + setp.eq.b32 %p148, %r412, %r418; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.gt.s32 %p149, %r426, %r432; + setp.gt.s32 %p150, %r424, %r430; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p151, %p148, %p150; + and.pred %p152, %p147, %p149; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p153, %p146, %p152; + or.pred %p154, %p145, %p151; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r433, %r414, %r420; + xor.b32 %r434, %r412, %r418; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r435, %r434, 0, %p154; + selp.b32 %r436, %r433, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r437, %r436, %r404; + xor.b32 %r438, %r435, %r403; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r439, %r424, %r430; + xor.b32 %r440, %r426, %r432; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r441, %r439, 0, %p154; + selp.b32 %r442, %r440, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r443, %r441, %r407; + xor.b32 %r444, %r442, %r408; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r445, %r438, %r28; + mul.lo.s32 %r446, %r437, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r447, %r445, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r448, %r445, %r447; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r449, %r446, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r450, %r446, %r449; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r451, %r438, %r24; + mul.lo.s32 %r452, %r437, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r453, %r451, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r454, %r451, %r453; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r455, %r452, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r456, %r452, %r455; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r457, %r443, %r28; + mul.lo.s32 %r458, %r444, %r28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r459, %r457, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r460, %r457, %r459; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r461, %r458, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r462, %r458, %r461; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r463, %r443, %r24; + mul.lo.s32 %r464, %r444, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r465, %r463, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r466, %r463, %r465; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r467, %r464, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r468, %r464, %r467; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p155, %r448, %r454; + setp.lt.s32 %p156, %r450, %r456; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p157, %r450, %r456; + setp.eq.b32 %p158, %r448, %r454; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.gt.s32 %p159, %r462, %r468; + setp.gt.s32 %p160, %r460, %r466; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p161, %p158, %p160; + and.pred %p162, %p157, %p159; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p163, %p156, %p162; + or.pred %p164, %p155, %p161; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r469, %r450, %r456; + xor.b32 %r470, %r448, %r454; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r471, %r470, 0, %p164; + selp.b32 %r472, %r469, 0, %p163; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r473, %r472, %r437; + xor.b32 %r474, %r471, %r438; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r475, %r460, %r466; + xor.b32 %r476, %r462, %r468; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r477, %r475, 0, %p164; + selp.b32 %r478, %r476, 0, %p163; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r479, %r477, %r443; + xor.b32 %r480, %r478, %r444; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r481, %r474, %r27; + mul.lo.s32 %r482, %r473, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r483, %r481, 1, 31, -1; + shfl.sync.bfly.b32 %r485, %r482, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r487, %r474, %r22; + mul.lo.s32 %r488, %r473, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r489, %r487, 1, 31, -1; + shfl.sync.bfly.b32 %r491, %r488, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r493, %r479, %r27; + mul.lo.s32 %r494, %r480, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r495, %r493, 1, 31, -1; + shfl.sync.bfly.b32 %r497, %r494, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r499, %r479, %r22; + mul.lo.s32 %r500, %r480, %r22; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r501, %r499, 1, 31, -1; + shfl.sync.bfly.b32 %r503, %r500, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:54:35 + and.pred %p178, %p1, %p4; + selp.b64 %rd17, 1, 0, %p178; + and.pred %p179, %p1, %p5; + selp.b64 %rd18, 1, 0, %p179; +$L__tmp5: + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd19, %rd17, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + cvt.u32.u64 %r521, %rd19; + shfl.sync.bfly.b32 %r522, %r521, 4, 31, -1; + mov.b32 %r523, 0; + shfl.sync.bfly.b32 %r524, %r523, 4, 31, -1; + cvt.u64.u32 %rd20, %r522; + cvt.u64.u32 %rd21, %r524; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd24, %rd19, %rd23; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r525}, %rd24; + cvt.u32.u64 %r526, %rd24; + shfl.sync.bfly.b32 %r527, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r528, %r525, 2, 31, -1; + cvt.u64.u32 %rd25, %r527; + cvt.u64.u32 %rd26, %r528; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r529}, %rd29; + cvt.u32.u64 %r530, %rd29; + shfl.sync.bfly.b32 %r531, %r530, 1, 31, -1; + shfl.sync.bfly.b32 %r532, %r529, 1, 31, -1; + cvt.u64.u32 %rd30, %r531; + cvt.u64.u32 %rd31, %r532; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; +$L__tmp6: + .loc 1 60 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:60:21 + mov.b32 %r533, global_smem; + add.s32 %r534, %r533, %r3; + st.shared.b64 [%r534], %rd34; + bar.sync 0; + shl.b32 %r535, %r4, 3; + add.s32 %r536, %r533, %r535; + ld.shared.b64 %rd2, [%r536]; + .loc 1 58 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:58:35 + and.pred %p180, %p1, %p93; + selp.b64 %rd35, 1, 0, %p180; + and.pred %p181, %p1, %p94; + selp.b64 %rd36, 1, 0, %p181; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd37, %rd35, %rd36; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + cvt.u32.u64 %r537, %rd37; + shfl.sync.bfly.b32 %r538, %r537, 4, 31, -1; + shfl.sync.bfly.b32 %r539, %r523, 4, 31, -1; + cvt.u64.u32 %rd38, %r538; + cvt.u64.u32 %rd39, %r539; + shl.b64 %rd40, %rd39, 32; + or.b64 %rd41, %rd38, %rd40; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd42, %rd37, %rd41; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r540}, %rd42; + cvt.u32.u64 %r541, %rd42; + shfl.sync.bfly.b32 %r542, %r541, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r540, 2, 31, -1; + cvt.u64.u32 %rd43, %r542; + cvt.u64.u32 %rd44, %r543; + shl.b64 %rd45, %rd44, 32; + or.b64 %rd46, %rd43, %rd45; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd47, %rd42, %rd46; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r544}, %rd47; + cvt.u32.u64 %r545, %rd47; + shfl.sync.bfly.b32 %r546, %r545, 1, 31, -1; + shfl.sync.bfly.b32 %r547, %r544, 1, 31, -1; + cvt.u64.u32 %rd48, %r546; + cvt.u64.u32 %rd49, %r547; + shl.b64 %rd50, %rd49, 32; + or.b64 %rd51, %rd48, %rd50; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd3, %rd47, %rd51; +$L__tmp8: + .loc 1 61 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:61:21 + bar.sync 0; + st.shared.b64 [%r534], %rd3; + bar.sync 0; + .loc 1 60 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:60:21 + cvt.u32.u64 %r548, %rd34; + .loc 1 64 19 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:64:19 + setp.lt.s32 %p182, %r6, %r548; + setp.lt.s32 %p183, %r7, %r548; + .loc 1 66 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:66:35 + selp.b32 %r549, %r8, 16, %p182; + selp.b32 %r550, %r9, 16, %p183; + .loc 1 68 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:68:20 + add.s32 %r551, %r549, 17; + add.s32 %r552, %r550, 17; + .loc 1 69 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:69:20 + setp.lt.s32 %p184, %r549, 0; + setp.lt.s32 %p185, %r550, 0; + .loc 1 70 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:70:35 + selp.b32 %r12, %r551, %r549, %p184; + selp.b32 %r13, %r552, %r550, %p185; + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + max.u32 %r553, %r12, %r13; + setp.lt.u32 %p186, %r553, 17; + or.pred %p187, %p2, %p186; + @%p187 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + add.s32 %r484, %r481, %r483; + add.s32 %r486, %r482, %r485; + add.s32 %r490, %r487, %r489; + add.s32 %r492, %r488, %r491; + add.s32 %r496, %r493, %r495; + add.s32 %r498, %r494, %r497; + add.s32 %r502, %r499, %r501; + add.s32 %r504, %r500, %r503; + setp.lt.s32 %p165, %r486, %r492; + setp.lt.s32 %p166, %r484, %r490; + setp.eq.b32 %p167, %r484, %r490; + setp.eq.b32 %p168, %r486, %r492; + setp.gt.s32 %p169, %r496, %r502; + setp.gt.s32 %p170, %r498, %r504; + and.pred %p171, %p168, %p170; + and.pred %p172, %p167, %p169; + or.pred %p173, %p166, %p172; + or.pred %p174, %p165, %p171; + xor.b32 %r505, %r484, %r490; + xor.b32 %r506, %r486, %r492; + selp.b32 %r507, %r506, 0, %p174; + selp.b32 %r508, %r505, 0, %p173; + xor.b32 %r509, %r508, %r474; + xor.b32 %r510, %r507, %r473; + xor.b32 %r511, %r496, %r502; + xor.b32 %r512, %r498, %r504; + selp.b32 %r513, %r511, 0, %p173; + selp.b32 %r514, %r512, 0, %p174; + xor.b32 %r515, %r513, %r479; + xor.b32 %r516, %r514, %r480; + setp.lt.s32 %p175, %r509, %r510; + setp.eq.b32 %p176, %r509, %r510; + setp.gt.s32 %p177, %r515, %r516; + xor.b32 %r517, %r515, %r516; + selp.b32 %r518, %r517, 0, %p177; + selp.b32 %r519, %r518, 0, %p176; + selp.b32 %r520, %r517, %r519, %p175; + xor.b32 %r10, %r520, %r515; + xor.b32 %r11, %r520, %r516; + .loc 1 61 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:61:21 + ld.shared.b64 %rd4, [%r536]; + cvt.u32.u64 %r554, %rd3; + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + bar.sync 0; + .loc 1 75 19 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:75:19 + setp.lt.s32 %p189, %r6, %r554; + setp.lt.s32 %p190, %r7, %r554; + .loc 1 76 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:76:35 + selp.b32 %r555, %r10, 16, %p189; + selp.b32 %r556, %r11, 16, %p190; + .loc 1 77 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:77:20 + add.s32 %r557, %r555, 17; + add.s32 %r558, %r556, 17; + .loc 1 78 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:78:20 + setp.lt.s32 %p191, %r555, 0; + setp.lt.s32 %p192, %r556, 0; + .loc 1 79 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:79:35 + selp.b32 %r14, %r557, %r555, %p191; + selp.b32 %r15, %r558, %r556, %p192; + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + max.u32 %r559, %r14, %r15; + setp.lt.u32 %p193, %r559, 17; + or.pred %p194, %p2, %p193; + @%p194 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r19; + .loc 1 61 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:61:21 + cvt.u32.u64 %r561, %rd4; + .loc 1 60 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:60:21 + cvt.u32.u64 %r560, %rd2; + .loc 1 25 23 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:25:23 + or.b32 %r570, %r1, %r4; + .loc 1 26 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:26:21 + setp.lt.s32 %p198, %r570, 32; + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + bar.sync 0; + .loc 1 81 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:81:25 + mul.wide.s32 %rd60, %r570, 4; + add.s64 %rd52, %rd5, %rd60; + .loc 1 81 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:81:37 + setp.eq.b32 %p203, %r3, 0; + and.pred %p195, %p203, %p198; + // begin inline asm + @%p195 st.global.b32 [ %rd52 + 0 ], { %r560 }; + // end inline asm + .loc 1 82 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:82:25 + add.s64 %rd53, %rd6, %rd60; + .loc 1 82 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:82:37 + // begin inline asm + @%p195 st.global.b32 [ %rd53 + 0 ], { %r561 }; + // end inline asm + .loc 1 83 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd54, %rd7, %rd61; + .loc 1 83 47 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:83:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd54 + 0 ], { %r8, %r9 }; + // end inline asm + .loc 1 84 52 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:52 + mul.lo.s32 %r571, %r5, 17; + .loc 1 84 49 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:49 + add.s32 %r572, %r12, %r571; + add.s32 %r573, %r13, %r571; + .loc 1 84 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:25 + mad.wide.s32 %rd62, %r572, 4, %rd8; + mad.wide.s32 %rd63, %r573, 4, %rd8; + .loc 1 84 85 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:85 + bar.sync 0; + shl.b32 %r574, %r2, 5; + and.b32 %r575, %r574, 96; + and.b32 %r576, %r2, 48; + shl.b32 %r577, %r576, 3; + shl.b32 %r578, %r2, 1; + and.b32 %r579, %r578, 120; + or.b32 %r580, %r575, %r577; + xor.b32 %r581, %r580, %r579; + add.s32 %r583, %r533, %r581; + st.shared.b64 [%r583], %rd62; + xor.b32 %r584, %r581, 8; + add.s32 %r585, %r533, %r584; + st.shared.b64 [%r585+512], %rd63; + bar.sync 0; + shl.b32 %r586, %r2, 6; + and.b32 %r587, %r586, 384; + shl.b32 %r588, %r4, 4; + shl.b32 %r589, %r576, 1; + bfe.s32 %r590, %r2, 3, 1; + and.b32 %r591, %r590, 520; + or.b32 %r592, %r587, %r588; + xor.b32 %r593, %r592, %r589; + or.b32 %r594, %r593, %r591; + add.s32 %r595, %r533, %r594; + ld.shared.b64 %rd55, [%r595]; + xor.b32 %r596, %r594, 8; + add.s32 %r597, %r533, %r596; + ld.shared.b64 %rd56, [%r597]; + mov.b32 %r564, 1; + // begin inline asm + @%p198 st.global.b32 [ %rd55 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd56 + 0 ], { %r564 }; + // end inline asm + .loc 1 85 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:85:25 + add.s64 %rd57, %rd9, %rd61; + .loc 1 85 47 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:85:47 + // begin inline asm + @%p1 st.global.v2.b32 [ %rd57 + 0 ], { %r10, %r11 }; + // end inline asm + .loc 1 86 49 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:49 + add.s32 %r598, %r14, %r571; + add.s32 %r599, %r15, %r571; + .loc 1 86 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:25 + mad.wide.s32 %rd64, %r598, 4, %rd10; + mad.wide.s32 %rd65, %r599, 4, %rd10; + .loc 1 86 85 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:85 + bar.sync 0; + st.shared.b64 [%r583], %rd64; + st.shared.b64 [%r585+512], %rd65; + bar.sync 0; + ld.shared.b64 %rd58, [%r595]; + ld.shared.b64 %rd59, [%r597]; + // begin inline asm + @%p198 st.global.b32 [ %rd58 + 0 ], { %r564 }; + // end inline asm + // begin inline asm + @%p198 st.global.b32 [ %rd59 + 0 ], { %r564 }; + // end inline asm + .loc 1 86 4 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd72, assertFunc_0; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param3], %rd73; + mov.b64 %rd74, assertFile_0; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param1], %rd75; + mov.b64 %rd76, assertMessage_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param0], %rd77; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd66, assertFunc_1; + cvta.global.u64 %rd67, %rd66; + st.param.b64 [param3], %rd67; + mov.b64 %rd68, assertFile_1; + cvta.global.u64 %rd69, %rd68; + st.param.b64 [param1], %rd69; + mov.b64 %rd70, assertMessage_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param0], %rd71; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp9: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 102 +.b8 104 +.b8 119 +.b8 55 +.b8 106 +.b8 115 +.b8 114 +.b8 53 +.b8 115 +.b8 107 +.b8 55 +.b8 52 +.b8 107 +.b8 99 +.b8 104 +.b8 100 +.b8 104 +.b8 113 +.b8 111 +.b8 103 +.b8 51 +.b8 113 +.b8 121 +.b8 113 +.b8 55 +.b8 103 +.b8 101 +.b8 51 +.b8 113 +.b8 106 +.b8 113 +.b8 109 +.b8 109 +.b8 113 +.b8 119 +.b8 121 +.b8 51 +.b8 55 +.b8 108 +.b8 114 +.b8 50 +.b8 98 +.b8 101 +.b8 52 +.b8 108 +.b8 112 +.b8 121 +.b8 107 +.b8 122 +.b8 120 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 102 +.b8 104 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp8 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..00df6ae5bc66e2a85dccc64454a2defd01f22f6b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 8 : i32 loc(#loc205) + %xoffset_3 = arith.constant 8 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<8x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<8x1xi32> loc(#loc208) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<8x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<8x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<8x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<8x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<8x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<8x16xf32> to tensor<8x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<8x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<8x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<8x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<8x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<8x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<8x16xi1> to tensor<8x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<8x16xi8> to tensor<8x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<8x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<8x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<8x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<8x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<8x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<8x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<8x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<8x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<8x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<8x16xi32> to tensor<8x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<8x16xi64> to tensor<8x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<8x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<8x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<8x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<8x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<8x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<8x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<8x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<8x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<8x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<8x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<8x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<8x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<8x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<8x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<8x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<8x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<8x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<8x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<8x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<8x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<8x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<8x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc91)), %idxs: tensor<8x16xi16> loc("idxs"(#loc91))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<8x16xi32>, tensor<8x16xi16>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc94) + %5 = ub.poison : tensor<8x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i16S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi16> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi16>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc102) + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i16S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi16> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<64x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<64x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<64x2x1xi16>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<8x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<8x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<8x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<8x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<8x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<8x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<8x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<8x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<8x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S8_16S__(%idxs) : (tensor<8x16xi16>) -> tensor<8x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<8x16xi16> to tensor<8x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi32> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x1xi16> loc("input"(#loc166))) -> tensor<64x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc167) + tt.return %0 : tensor<64x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x1xi32> loc(#loc169) + tt.return %1 : tensor<64x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S8_16S__(%x: tensor<8x16xi32> loc("x"(#loc179))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<8x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<8x16xi32> loc(#loc181) + tt.return %4 : tensor<8x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<8x16xi32> loc(#loc183) + tt.return %5 : tensor<8x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S8_16S__(%input: tensor<8x16xi32> loc("input"(#loc188))) -> tensor<8x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<8x16xi32> loc(#loc189) + tt.return %0 : tensor<8x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi32> loc(#loc191) + tt.return %1 : tensor<8x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<8x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi32> loc(#loc185) + tt.return %cst : tensor<8x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi32> loc(#loc187) + tt.return %0 : tensor<8x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S8_16S__(%input: tensor<8x16xi16> loc("input"(#loc188))) -> tensor<8x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<8x16xi16> loc(#loc189) + tt.return %0 : tensor<8x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x16xi16> loc(#loc191) + tt.return %1 : tensor<8x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_8__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<8x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<8x16xi16> loc(#loc185) + tt.return %cst : tensor<8x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<8x16xi16> loc(#loc187) + tt.return %0 : tensor<8x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<8x16xi32> loc(#loc102) + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x2xi32> loc("input"(#loc166))) -> tensor<32x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc167) + tt.return %0 : tensor<32x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x2xi32> loc(#loc169) + tt.return %1 : tensor<32x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<8x16xi32>, tensor<8x16xi32>, tensor<8x16xi32>) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<8x16xi32> loc(#loc102) + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: tensor<8x16xi32> loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<8x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<8x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<8x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<8x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<16x2x4xi32> loc("input"(#loc166))) -> tensor<16x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc167) + tt.return %0 : tensor<16x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<16x4xi32> loc(#loc169) + tt.return %1 : tensor<16x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S8_16S_i32S8_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc95)), %idxs: tensor<8x16xi32> loc("idxs"(#loc95))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<8x16xi32>, tensor<8x16xi32>, i1) -> (tensor<8x16xi32>, tensor<8x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc102) + %5 = ub.poison : tensor<8x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<8x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S8_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x8xi32> loc("input"(#loc166))) -> tensor<8x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc167) + tt.return %0 : tensor<8x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x8xi32> loc(#loc169) + tt.return %1 : tensor<8x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<16x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<16x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<16x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<16x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S16_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S8_16S_i32S8_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<8x16xi32> loc("x"(#loc103)), %idxs: tensor<8x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<8x16xi32>, tensor<8x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<8x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<8x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<8x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<8x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<8x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<8x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<8x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<8x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<8x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<8x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<8x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S8_16S__(%ileft_13) : (tensor<8x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<8x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<8x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<8x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<8x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<8x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<8x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<8x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<8x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<8x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<8x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<8x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<8x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<8x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%x) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<8x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<8x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S8_16S__(%idxs) : (tensor<8x16xi32>) -> tensor<8x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<8x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<8x16xi32> loc(#loc165) + %5 = ub.poison : tensor<8x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<8x16xi32>, tensor<8x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S8_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x16xi64> loc("input"(#loc166))) -> tensor<8xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc167) + tt.return %0 : tensor<8xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8xi64> loc(#loc169) + tt.return %1 : tensor<8xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4ec2172e4142021c8789e457a6ee68f86d602829 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [4, 8], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 2], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<8x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<32> : tensor<8x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<32> : tensor<8x1xi32, #blocked> loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<8x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<8x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<8x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<8x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<8x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<8x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<8x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<8x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<8x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<8x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<8x1xi1, #blocked5> -> tensor<8x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<8x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<8x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<8x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16, #blocked> -> tensor<64x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<64x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<64x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<64x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<64x2x1xi16, #blocked4> to tensor<64x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<8x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<8x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<8x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<8x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<8x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<8x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<8x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<8x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<8x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<8x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<8x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<8x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<8x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<8x16xi1, #blocked> to tensor<8x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<8x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<8x16xi32, #blocked> -> tensor<8x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<8x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<8x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<8x2x8xi32, #blocked1>) -> tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<8x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<8x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<8x1x8xi32, #blocked1> -> tensor<8x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<8x2x8xi32, #blocked1> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<8x16xi32, #blocked> -> tensor<16x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<16x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<16x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<16x2x4xi32, #blocked2>) -> tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<16x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<16x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<16x1x4xi32, #blocked2> -> tensor<16x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<16x2x4xi32, #blocked2> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<8x16xi32, #blocked> -> tensor<32x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<32x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<32x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x2xi32, #blocked3>) -> tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<32x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<32x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<32x1x2xi32, #blocked3> -> tensor<32x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<32x2x2xi32, #blocked3> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<8x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<8x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<8x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<8x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<8x16xi32, #blocked> -> tensor<64x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<64x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<64x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x1xi32, #blocked4>) -> tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<64x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<64x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<64x1x1xi32, #blocked4> -> tensor<64x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<64x2x1xi32, #blocked4> -> tensor<8x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<8x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<8x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<8x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<8x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<8x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1, #blocked> to tensor<8x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<8x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<8x1xi64, #blocked5> to tensor<8x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<8x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<8x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<8x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<8x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<8x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1, #blocked>, tensor<8x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<8x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<8x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<8x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<8x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<8x1x!tt.ptr, #blocked5>, tensor<8x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<8x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<8x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<8x1xi32, #blocked> -> tensor<8x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<8x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<8x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<8x16x!tt.ptr, #blocked> -> tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<8x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..95bbc5b852dcbde77d9e74b76c5abe45c9aacaab --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/5P66Y2R4BAVAKI2AZ4OOKOSCRGKH7SKT5SYLTP4RXGBUCBAJIDZQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<8x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<8x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<8x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<8x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<8x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<8x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<8x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<8x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<8x1xi32> loc(#loc1) + %xmask = arith.constant dense<32> : tensor<8x1xi32> loc(#loc129) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32> -> tensor<8x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<8x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<8x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<8x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<8x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<8x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<8x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<8x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<8x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<8x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<8x16xi16> -> tensor<64x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<64x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<64x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<64x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<64x2x1xi16> to tensor<64x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<8x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<8x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<8x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<8x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<8x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<8x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<8x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<8x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<8x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<8x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<8x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<8x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<8x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<8x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<8x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<8x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<8x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<8x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<8x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<8x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<8x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<16x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<8x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<8x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<8x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<8x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<8x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<8x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<8x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<8x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<8x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<8x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<8x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<8x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<8x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<8x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<8x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<8x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<8x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<8x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<8x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<8x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<8x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<8x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<8x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<8x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<8x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<8x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<8x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<8x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<8x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<8x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<16x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<16x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<16x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<16x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<8x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<8x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<8x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<8x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<8x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<8x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<32x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<32x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<32x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<32x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<8x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<8x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<8x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<8x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<8x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<8x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<64x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<64x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<64x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<64x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<8x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<8x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<8x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<8x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<8x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<8x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<8x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<8x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<8x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<8x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<8x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<8x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<8x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<8x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<8x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<8x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<8x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<8x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<8x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<8x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<8x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<8x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<8x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<8x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<8x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<8x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<8x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<8x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<8x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<8x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<8x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<8x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<8x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<8x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<8x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<8x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<8x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<8x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<8x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<8x16xi1> to tensor<8x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<8x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<8x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<8x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<8x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<8x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<8x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<8x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<8x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<8x16xi32> -> tensor<8x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<8x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<8x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<8x2x8xi32>) -> tensor<8x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<8x8xi32> -> tensor<8x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<8x1x8xi32> -> tensor<8x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<8x2x8xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<8x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<8x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<8x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<8x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<8x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<8x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<16x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<16x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<8x16xi32> -> tensor<16x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<16x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<16x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<16x2x4xi32>) -> tensor<16x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<16x4xi32> -> tensor<16x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<16x1x4xi32> -> tensor<16x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<16x2x4xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<8x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<8x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<8x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<8x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<8x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<8x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<32x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<32x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<8x16xi32> -> tensor<32x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<32x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<32x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x2xi32>) -> tensor<32x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<32x2xi32> -> tensor<32x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<32x1x2xi32> -> tensor<32x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<32x2x2xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<8x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<8x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<8x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<8x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<8x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<8x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<64x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<64x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<8x16xi32> -> tensor<64x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<64x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<64x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x1xi32>) -> tensor<64x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<64x1xi32> -> tensor<64x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<64x1x1xi32> -> tensor<64x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<64x2x1xi32> -> tensor<8x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<8x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<8x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<8x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<8x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<8x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<8x16xi1> to tensor<8x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<8x16xi1>, tensor<8x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<8x16xi64>) -> tensor<8xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<8xi64> -> tensor<8x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<8x1xi64> to tensor<8x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<8x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<8x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<8x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<8x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<8x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<8x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<8x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<8x1xi1> -> tensor<8x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<8x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<8x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<8x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<8x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<8x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<8x16xi1>, tensor<8x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<8x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<8x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<8x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<8x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<8x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<8x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<8x1x!tt.ptr>, tensor<8x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<8x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<8x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<8x1xi32> -> tensor<8x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<8x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<8x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<8x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<8x16x!tt.ptr>, tensor<8x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<8x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..554762f9f2fd13b86f0e11f31ad25f1eb3e76d6b --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "0c26ceae614c16280db457c0b301ae5b8479049d2cfeca61a0e3f0197645a333", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 8, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 256, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bda4eca640c073eeb586de52baba00540d532a2a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ptx @@ -0,0 +1,391 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 256 +{ + .reg .pred %p<5>; + .reg .b16 %rs<5>; + .reg .b32 %r<52>; + .reg .b64 %rd<25>; + .loc 1 18 0 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_2]; + ld.param.b64 %rd7, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:23:28 + mov.u32 %r7, %ctaid.x; + .loc 1 23 33 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:23:33 + shl.b32 %r1, %r7, 6; + .loc 1 24 44 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:44 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 252; + bfe.u32 %r8, %r2, 2, 6; + .loc 1 24 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:23 + or.b32 %r9, %r8, %r1; + .loc 1 26 37 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:26:37 + and.b32 %r10, %r2, 3; + .loc 1 29 21 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:29:21 + bfe.s32 %r11, %r7, 25, 1; + shr.u32 %r12, %r11, 21; + add.s32 %r13, %r9, %r12; + shr.s32 %r14, %r13, 11; + .loc 1 29 29 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:29:29 + shr.u32 %r15, %r14, 27; + add.s32 %r16, %r14, %r15; + and.b32 %r17, %r16, 33554400; + sub.s32 %r18, %r14, %r17; + .loc 1 30 19 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:30:19 + shr.u32 %r19, %r11, 16; + add.s32 %r20, %r9, %r19; + .loc 1 39 45 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:45 + shl.b32 %r21, %r18, 7; + .loc 1 39 68 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:68 + shl.b32 %r22, %r20, 7; + and.b32 %r23, %r22, -8388608; + .loc 1 33 40 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:33:40 + cvt.u64.u32 %rd11, %r10; + shl.b32 %r24, %r7, 13; + shl.b32 %r25, %r8, 7; + or.b32 %r26, %r24, %r25; + cvt.s64.s32 %rd12, %r26; + or.b64 %rd13, %rd12, %rd11; + shl.b64 %rd14, %rd13, 1; + add.s64 %rd23, %rd10, %rd14; + shl.b32 %r27, %r7, 18; + add.s32 %r28, %r23, %r27; + shl.b32 %r29, %r8, 12; + or.b32 %r30, %r28, %r29; + add.s32 %r31, %r30, %r21; + or.b32 %r32, %r31, %r10; + shl.b32 %r33, %r14, 23; + sub.s32 %r34, %r32, %r33; + cvt.u64.u32 %rd2, %r34; + mov.b32 %r51, 0f00000000; + mov.b64 %rd24, -4; +$L__BB0_1: // =>This Inner Loop Header: Depth=1 + .loc 1 39 34 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:34 + add.s64 %rd21, %rd2, %rd24; + cvt.u32.u64 %r35, %rd21; + add.s32 %r36, %r35, 4; + mad.wide.s32 %rd16, %r36, 2, %rd7; + .loc 1 39 73 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:73 + // begin inline asm + mov.u64 %rd15, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd15, 1.0; + // end inline asm + mov.b16 %rs2, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u16 %rs1, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs1 }, [ %rd16 + 0 ], %rd15; + // end inline asm + .loc 1 39 127 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:127 + cvt.f32.bf16 %r37, %rs1; + .loc 1 40 50 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:50 + // begin inline asm + mov.u64 %rd18, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd18, 1.0; + // end inline asm + // begin inline asm + mov.u16 %rs3, %rs2; + @%p1 ld.global.L1::evict_first.L2::cache_hint.b16 { %rs3 }, [ %rd23 + 0 ], %rd18; + // end inline asm + .loc 1 40 104 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:104 + cvt.f32.bf16 %r38, %rs3; + .loc 1 43 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:43:23 + fma.rn.f32 %r51, %r37, %r38, %r51; + .loc 1 33 40 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:33:40 + add.s64 %rd24, %rd24, 4; + add.s64 %rd23, %rd23, 8; + setp.lt.u64 %p3, %rd24, 124; + @%p3 bra $L__BB0_1; +// %bb.2: + .loc 1 24 44 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:44 + and.b32 %r40, %r2, 63; + .loc 1 24 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:23 + or.b32 %r41, %r1, %r40; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r42, %r51, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r43, %r51, %r42; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r44, %r43, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r45, %r43, %r44; +$L__tmp2: + .loc 1 45 28 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:28 + mov.b32 %r46, global_smem; + add.s32 %r47, %r46, %r3; + st.shared.b32 [%r47], %r45; + bar.sync 0; + shl.b32 %r48, %r40, 2; + add.s32 %r49, %r46, %r48; + ld.shared.b32 %r39, [%r49]; + .loc 1 49 25 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:25 + mad.wide.s32 %rd22, %r41, 4, %rd8; + .loc 1 49 36 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:36 + and.b32 %r50, %r2, 192; + setp.eq.b32 %p4, %r50, 0; + // begin inline asm + @%p4 st.global.b32 [ %rd22 + 0 ], { %r39 }; + // end inline asm + .loc 1 49 4 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 108 +.b8 53 +.b8 119 +.b8 51 +.b8 103 +.b8 98 +.b8 115 +.b8 111 +.b8 105 +.b8 119 +.b8 111 +.b8 115 +.b8 110 +.b8 119 +.b8 107 +.b8 110 +.b8 102 +.b8 97 +.b8 114 +.b8 98 +.b8 53 +.b8 53 +.b8 108 +.b8 107 +.b8 108 +.b8 112 +.b8 102 +.b8 110 +.b8 116 +.b8 112 +.b8 117 +.b8 122 +.b8 54 +.b8 113 +.b8 52 +.b8 106 +.b8 106 +.b8 117 +.b8 105 +.b8 51 +.b8 53 +.b8 121 +.b8 105 +.b8 50 +.b8 119 +.b8 100 +.b8 119 +.b8 101 +.b8 112 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 98 +.b8 108 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source new file mode 100644 index 0000000000000000000000000000000000000000..8c5d30a2baeca5ee569f1dd4550a99ffe0e32de1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.source @@ -0,0 +1,222 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":18:0) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc46 = loc(unknown) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc53 = loc("in_ptr0"(#loc)) +#loc54 = loc("in_ptr1"(#loc)) +#loc55 = loc("out_ptr1"(#loc)) +#loc56 = loc("xnumel"(#loc)) +#loc57 = loc("r0_numel"(#loc)) +#loc97 = loc("input"(#loc44)) +#loc98 = loc("a"(#loc49)) +#loc99 = loc("b"(#loc49)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 524288 : i32 loc(#loc58) + %r0_numel_1 = arith.constant 128 : i32 loc(#loc59) + %xoffset = tt.get_program_id x : i32 loc(#loc60) + %xoffset_2 = arith.constant 64 : i32 loc(#loc61) + %xoffset_3 = arith.constant 64 : i32 loc(#loc61) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc61) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> loc(#loc62) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> loc(#loc63) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<64x1xi32> loc(#loc64) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<64x1xi32> loc(#loc64) + %xmask = arith.constant true loc(#loc65) + %xmask_8 = arith.constant dense : tensor<64x4xi1> loc(#loc65) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc66) + %r0_base_9 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32> -> tensor<1x4xi32> loc(#loc67) + %x0 = arith.constant 2048 : i32 loc(#loc68) + %x0_10 = arith.constant 2048 : i32 loc(#loc68) + %x0_11 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc68) + %x0_12 = arith.remsi %xindex_7, %x0_11 : tensor<64x1xi32> loc(#loc68) + %x1 = arith.constant 2048 : i32 loc(#loc69) + %x1_13 = arith.constant 2048 : i32 loc(#loc69) + %x1_14 = arith.constant dense<2048> : tensor<64x1xi32> loc(#loc69) + %x1_15 = arith.divsi %xindex_7, %x1_14 : tensor<64x1xi32> loc(#loc69) + %x1_16 = arith.constant 32 : i32 loc(#loc70) + %x1_17 = arith.constant 32 : i32 loc(#loc70) + %x1_18 = arith.constant dense<32> : tensor<64x1xi32> loc(#loc70) + %x1_19 = arith.remsi %x1_15, %x1_18 : tensor<64x1xi32> loc(#loc70) + %x2 = arith.constant 65536 : i32 loc(#loc71) + %x2_20 = arith.constant 65536 : i32 loc(#loc71) + %x2_21 = arith.constant dense<65536> : tensor<64x1xi32> loc(#loc71) + %x2_22 = arith.divsi %xindex_7, %x2_21 : tensor<64x1xi32> loc(#loc71) + %_tmp4 = arith.constant 0.000000e+00 : f32 loc(#loc72) + %_tmp4_23 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc72) + %c0_i32 = arith.constant 0 : i32 loc(#loc16) + %c4_i32 = arith.constant 4 : i32 loc(#loc16) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc16) + %1 = arith.bitcast %r0_numel_1 : i32 to i32 loc(#loc16) + %2 = arith.bitcast %c4_i32 : i32 to i32 loc(#loc16) + %3 = ub.poison : i32 loc(#loc16) + %_tmp4_24 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp4_27 = %_tmp4_23) -> (tensor<64x4xf32>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32> loc(#loc74) + %r0_index_28 = arith.addi %r0_index, %r0_base_9 : tensor<1x4xi32> loc(#loc74) + %r0_mask = arith.constant dense<128> : tensor<1x4xi32> loc(#loc75) + %r0_mask_29 = arith.cmpi slt, %r0_index_28, %r0_mask : tensor<1x4xi32> loc(#loc75) + %tmp0 = arith.constant 128 : i32 loc(#loc76) + %tmp0_30 = arith.constant 128 : i32 loc(#loc76) + %tmp0_31 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc76) + %tmp0_32 = arith.muli %tmp0_31, %x1_19 : tensor<64x1xi32> loc(#loc76) + %tmp0_33 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_34 = tt.broadcast %tmp0_32 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc77) + %tmp0_35 = arith.addi %tmp0_33, %tmp0_34 : tensor<64x4xi32> loc(#loc77) + %tmp0_36 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_37 = arith.constant 4096 : i32 loc(#loc78) + %tmp0_38 = arith.constant dense<4096> : tensor<64x1xi32> loc(#loc78) + %tmp0_39 = arith.muli %tmp0_38, %x0_12 : tensor<64x1xi32> loc(#loc78) + %tmp0_40 = tt.broadcast %tmp0_39 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc79) + %tmp0_41 = arith.addi %tmp0_35, %tmp0_40 : tensor<64x4xi32> loc(#loc79) + %tmp0_42 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_43 = arith.constant 8388608 : i32 loc(#loc80) + %tmp0_44 = arith.constant dense<8388608> : tensor<64x1xi32> loc(#loc80) + %tmp0_45 = arith.muli %tmp0_44, %x2_22 : tensor<64x1xi32> loc(#loc80) + %tmp0_46 = tt.broadcast %tmp0_45 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc81) + %tmp0_47 = arith.addi %tmp0_41, %tmp0_46 : tensor<64x4xi32> loc(#loc81) + %tmp0_48 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc82) + %tmp0_49 = tt.addptr %tmp0_48, %tmp0_47 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc82) + %tmp0_50 = arith.constant 0.000000e+00 : f32 loc(#loc83) + %tmp0_51 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc83) + %tmp0_52 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc83) + %tmp0_53 = arith.truncf %tmp0_52 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc83) + %tmp0_54 = tt.load %tmp0_49, %tmp0_51, %tmp0_53 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc83) + %tmp0_55 = arith.extf %tmp0_54 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc84) + %tmp1 = arith.constant 128 : i32 loc(#loc85) + %tmp1_56 = arith.constant 128 : i32 loc(#loc85) + %tmp1_57 = arith.constant dense<128> : tensor<64x1xi32> loc(#loc85) + %tmp1_58 = arith.muli %tmp1_57, %xindex_7 : tensor<64x1xi32> loc(#loc85) + %tmp1_59 = tt.broadcast %r0_index_28 : tensor<1x4xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_60 = tt.broadcast %tmp1_58 : tensor<64x1xi32> -> tensor<64x4xi32> loc(#loc86) + %tmp1_61 = arith.addi %tmp1_59, %tmp1_60 : tensor<64x4xi32> loc(#loc86) + %tmp1_62 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr> loc(#loc87) + %tmp1_63 = tt.addptr %tmp1_62, %tmp1_61 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> loc(#loc87) + %tmp1_64 = arith.constant 0.000000e+00 : f32 loc(#loc88) + %tmp1_65 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc88) + %tmp1_66 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> loc(#loc88) + %tmp1_67 = arith.truncf %tmp1_66 : tensor<64x4xf32> to tensor<64x4xbf16> loc(#loc88) + %tmp1_68 = tt.load %tmp1_63, %tmp1_65, %tmp1_67 evictionPolicy = evict_first : tensor<64x4x!tt.ptr> loc(#loc88) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x4xbf16> to tensor<64x4xf32> loc(#loc89) + %tmp2 = arith.mulf %tmp0_55, %tmp1_69 : tensor<64x4xf32> loc(#loc90) + %tmp5 = arith.addf %_tmp4_27, %tmp2 : tensor<64x4xf32> loc(#loc91) + %_tmp4_70 = tt.broadcast %r0_mask_29 : tensor<1x4xi1> -> tensor<64x4xi1> loc(#loc92) + %_tmp4_71 = arith.select %_tmp4_70, %tmp5, %_tmp4_27 : tensor<64x4xi1>, tensor<64x4xf32> loc(#loc92) + scf.yield %_tmp4_71 : tensor<64x4xf32> loc(#loc36) + } loc(#loc73) + %tmp4 = tt.call @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp4_24) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc93) + %tmp4_25 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> loc(#loc94) + %tmp7 = arith.constant 0.000000e+00 : f32 loc(#loc95) + %tmp8 = arith.constant dense<0.000000e+00> : tensor<64x1xf32> loc(#loc96) + %tmp8_26 = arith.subf %tmp4_25, %tmp8 : tensor<64x1xf32> loc(#loc96) + %4 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr> loc(#loc41) + %5 = tt.addptr %4, %xindex_7 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> loc(#loc41) + tt.store %5, %tmp8_26 : tensor<64x1x!tt.ptr> loc(#loc42) + tt.return loc(#loc43) + } loc(#loc) + tt.func private @"triton.language.standard.sum__fp32S64_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x4xf32> loc("input"(#loc44))) -> tensor<64xf32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: f32 loc(unknown), %arg2: f32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__fp32_fp32__(%arg1, %arg2) : (f32, f32) -> f32 loc(#loc45) + tt.reduce.return %2 : f32 loc(#loc45) + }) : (tensor<64x4xf32>) -> tensor<64xf32> loc(#loc45) + tt.return %0 : tensor<64xf32> loc(#loc47) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64xf32> loc(#loc48) + tt.return %1 : tensor<64xf32> loc(#loc48) + } loc(#loc44) + tt.func private @triton.language.standard._sum_combine__fp32_fp32__(%a: f32 loc("a"(#loc49)), %b: f32 loc("b"(#loc49))) -> f32 attributes {noinline = false} { + %0 = arith.addf %a, %b : f32 loc(#loc50) + tt.return %0 : f32 loc(#loc51) + ^bb1: // no predecessors + %1 = ub.poison : f32 loc(#loc52) + tt.return %1 : f32 loc(#loc52) + } loc(#loc49) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":25:46) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":30:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":32:43) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":33:40) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":34:31) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":35:29) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:45) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:41) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:55) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:50) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:68) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:60) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:73) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:127) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:45) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:41) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:34) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:50) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:104) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":41:22) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":43:23) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:40) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:8) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:28) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":47:11) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":48:18) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:25) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:36) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:4) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc58 = loc("xnumel"(#loc1)) +#loc59 = loc("r0_numel"(#loc2)) +#loc60 = loc("xoffset"(#loc3)) +#loc61 = loc("xoffset"(#loc4)) +#loc62 = loc("xindex"(#loc5)) +#loc63 = loc("xindex"(#loc6)) +#loc64 = loc("xindex"(#loc7)) +#loc65 = loc("xmask"(#loc8)) +#loc66 = loc("r0_base"(#loc9)) +#loc67 = loc("r0_base"(#loc10)) +#loc68 = loc("x0"(#loc11)) +#loc69 = loc("x1"(#loc12)) +#loc70 = loc("x1"(#loc13)) +#loc71 = loc("x2"(#loc14)) +#loc72 = loc("_tmp4"(#loc15)) +#loc73 = loc("_tmp4"(#loc16)) +#loc74 = loc("r0_index"(#loc17)) +#loc75 = loc("r0_mask"(#loc18)) +#loc76 = loc("tmp0"(#loc19)) +#loc77 = loc("tmp0"(#loc20)) +#loc78 = loc("tmp0"(#loc21)) +#loc79 = loc("tmp0"(#loc22)) +#loc80 = loc("tmp0"(#loc23)) +#loc81 = loc("tmp0"(#loc24)) +#loc82 = loc("tmp0"(#loc25)) +#loc83 = loc("tmp0"(#loc26)) +#loc84 = loc("tmp0"(#loc27)) +#loc85 = loc("tmp1"(#loc28)) +#loc86 = loc("tmp1"(#loc29)) +#loc87 = loc("tmp1"(#loc30)) +#loc88 = loc("tmp1"(#loc31)) +#loc89 = loc("tmp1"(#loc32)) +#loc90 = loc("tmp2"(#loc33)) +#loc91 = loc("tmp5"(#loc34)) +#loc92 = loc("_tmp4"(#loc35)) +#loc93 = loc("tmp4"(#loc37)) +#loc94 = loc("tmp4"(#loc38)) +#loc95 = loc("tmp7"(#loc39)) +#loc96 = loc("tmp8"(#loc40)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..19b0f12d73502c486c0b7a4c0cddae2d98e75f3f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/BQTM5LTBJQLCQDNUK7ALGANOLOCHSBE5FT7MUYNA4PYBS5SFUMZQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,155 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":18:0) +#loc1 = loc(unknown) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:25) +#loc39 = loc("in_ptr0"(#loc)) +#loc40 = loc("in_ptr1"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc73 = loc("tmp4"(#loc33)) +#loc76 = loc(callsite(#loc1 at #loc73)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<2048> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<32> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<65536> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<4096> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<8388608> : tensor<64x1xi32, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc44) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc45) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc46) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc46) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc46) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc46) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc47) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc47) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc47) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc48) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc48) + %x0 = arith.remsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc49) + %x1 = arith.divsi %xindex_14, %cst : tensor<64x1xi32, #blocked> loc(#loc50) + %x1_17 = arith.remsi %x1, %cst_0 : tensor<64x1xi32, #blocked> loc(#loc51) + %x2 = arith.divsi %xindex_14, %cst_1 : tensor<64x1xi32, #blocked> loc(#loc52) + %tmp0 = arith.muli %x1_17, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc53) + %tmp0_18 = tt.broadcast %tmp0 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_19 = arith.muli %x0, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc55) + %tmp0_20 = tt.broadcast %tmp0_19 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_21 = arith.muli %x2, %cst_4 : tensor<64x1xi32, #blocked> loc(#loc57) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_2 : tensor<64x1xi32, #blocked> loc(#loc60) + %tmp1_24 = tt.broadcast %tmp1 : tensor<64x1xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_25 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc62) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_28 = %cst_7) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc64) + %r0_index_29 = arith.addi %r0_index, %r0_base_16 : tensor<1x4xi32, #blocked> loc(#loc64) + %r0_mask = arith.cmpi slt, %r0_index_29, %cst_6 : tensor<1x4xi32, #blocked> loc(#loc65) + %tmp0_30 = tt.broadcast %r0_index_29 : tensor<1x4xi32, #blocked> -> tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_31 = arith.addi %tmp0_30, %tmp0_18 : tensor<64x4xi32, #blocked> loc(#loc54) + %tmp0_32 = arith.addi %tmp0_31, %tmp0_20 : tensor<64x4xi32, #blocked> loc(#loc56) + %tmp0_33 = arith.addi %tmp0_32, %tmp0_22 : tensor<64x4xi32, #blocked> loc(#loc58) + %tmp0_34 = tt.addptr %tmp0_23, %tmp0_33 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc59) + %tmp0_35 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc66) + %tmp0_36 = tt.load %tmp0_34, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc66) + %tmp0_37 = arith.extf %tmp0_36 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc67) + %tmp1_38 = arith.addi %tmp0_30, %tmp1_24 : tensor<64x4xi32, #blocked> loc(#loc61) + %tmp1_39 = tt.addptr %tmp1_25, %tmp1_38 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> loc(#loc62) + %tmp1_40 = tt.load %tmp1_39, %tmp0_35, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc68) + %tmp1_41 = arith.extf %tmp1_40 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc69) + %tmp2 = arith.mulf %tmp0_37, %tmp1_41 : tensor<64x4xf32, #blocked> loc(#loc70) + %tmp5 = arith.addf %_tmp4_28, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc71) + %_tmp4_42 = arith.select %tmp0_35, %tmp5, %_tmp4_28 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc72) + scf.yield %_tmp4_42 : tensor<64x4xf32, #blocked> loc(#loc31) + } loc(#loc63) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_28: f32 loc(callsite(#loc1 at #loc73)), %tmp4_29: f32 loc(callsite(#loc1 at #loc73))): + %tmp4_30 = arith.addf %tmp4_28, %tmp4_29 : f32 loc(#loc77) + tt.reduce.return %tmp4_30 : f32 loc(#loc75) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc75) + %tmp4_26 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc74) + %tmp4_27 = tt.expand_dims %tmp4_26 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc74) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc36) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc36) + tt.store %1, %tmp4_27 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc37) + tt.return loc(#loc38) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:41) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:55) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:50) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:68) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:60) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:45) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:41) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":33:40) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":34:31) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":35:29) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:73) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:127) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:8) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:28) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:25) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:36) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:4) +#loc44 = loc("xoffset"(#loc2)) +#loc45 = loc("xoffset"(#loc3)) +#loc46 = loc("xindex"(#loc4)) +#loc47 = loc("xindex"(#loc5)) +#loc48 = loc("r0_base"(#loc6)) +#loc49 = loc("x0"(#loc7)) +#loc50 = loc("x1"(#loc8)) +#loc51 = loc("x1"(#loc9)) +#loc52 = loc("x2"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp1"(#loc18)) +#loc61 = loc("tmp1"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("_tmp4"(#loc21)) +#loc64 = loc("r0_index"(#loc22)) +#loc65 = loc("r0_mask"(#loc23)) +#loc66 = loc("tmp0"(#loc24)) +#loc67 = loc("tmp0"(#loc25)) +#loc68 = loc("tmp1"(#loc26)) +#loc69 = loc("tmp1"(#loc27)) +#loc70 = loc("tmp2"(#loc28)) +#loc71 = loc("tmp5"(#loc29)) +#loc72 = loc("_tmp4"(#loc30)) +#loc74 = loc("tmp4"(#loc35)) +#loc75 = loc(callsite(#loc32 at #loc73)) +#loc77 = loc(callsite(#loc34 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2b15fc97526898acfabd8ac861ec84669344885c Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ba053c60a955b83d36a1f6296a066d6581f27134 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.json @@ -0,0 +1 @@ +{"hash": "1b178f5dfe84e5bcbc00bc20de563fd4427bac43d930736557805e3c7c715574", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 16, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_zeros_0"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir new file mode 100644 index 0000000000000000000000000000000000000000..ef4cf4bfa3a9aa6dcb676790cb20ddc1292097c2 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.llir @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_zeros_0(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4, ptr addrspace(1) readnone captures(none) %5, ptr addrspace(1) readnone captures(none) %6) local_unnamed_addr #0 !dbg !4 { + %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %9 = shl i32 %8, 2, !dbg !8 + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %11 = and i32 %10, 96, !dbg !9 + %12 = lshr exact i32 %11, 5, !dbg !9 + %13 = and i32 %10, 3, !dbg !9 + %14 = or disjoint i32 %12, %9, !dbg !10 + %15 = or disjoint i32 %9, %13, !dbg !10 + %16 = shl nuw nsw i32 %10, 2, !dbg !11 + %17 = and i32 %16, 124, !dbg !11 + %18 = sdiv i32 %14, 2048, !dbg !12 + %19 = mul i32 %18, 2048, !dbg !13 + %.decomposed = sub i32 %14, %19, !dbg !13 + %20 = srem i32 %18, 32, !dbg !14 + %21 = sdiv i32 %14, 65536, !dbg !15 + %22 = shl nsw i32 %20, 7, !dbg !16 + %23 = shl nsw i32 %.decomposed, 12, !dbg !17 + %24 = shl i32 %21, 23, !dbg !18 + %25 = or disjoint i32 %23, %17, !dbg !19 + %26 = add i32 %25, %24, !dbg !20 + %27 = add i32 %26, %22, !dbg !21 + %28 = sext i32 %27 to i64, !dbg !22 + %29 = getelementptr bfloat, ptr addrspace(1) %0, i64 %28, !dbg !22 + %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !23 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %29, i64 %30, i1 true) #4, !dbg !23 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !23 + %33 = bitcast i32 %32 to <2 x bfloat>, !dbg !23 + %34 = extractvalue { i32, i32 } %31, 1, !dbg !23 + %35 = bitcast i32 %34 to <2 x bfloat>, !dbg !23 + %36 = shl i32 %14, 7, !dbg !24 + %37 = or disjoint i32 %36, %17, !dbg !25 + %38 = sext i32 %37 to i64, !dbg !26 + %39 = getelementptr bfloat, ptr addrspace(1) %1, i64 %38, !dbg !26 + %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_first.b64 $0, 1.0;", "=l"() #4, !dbg !27 + %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, $2;\0A\09mov.u32 $1, $3;\0A\09@$6 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { $0, $1 }, [ $4 + 0 ], $5;", "=r,=r,r,r,l,l,b"(i32 0, i32 0, ptr addrspace(1) %39, i64 %40, i1 true) #4, !dbg !27 + %42 = extractvalue { i32, i32 } %41, 0, !dbg !27 + %43 = bitcast i32 %42 to <2 x bfloat>, !dbg !27 + %44 = extractvalue { i32, i32 } %41, 1, !dbg !27 + %45 = bitcast i32 %44 to <2 x bfloat>, !dbg !27 + %46 = fpext <2 x bfloat> %33 to <2 x float>, !dbg !28 + %47 = fpext <2 x bfloat> %43 to <2 x float>, !dbg !29 + %48 = fmul <2 x float> %46, %47, !dbg !30 + %49 = fadd <2 x float> %48, zeroinitializer, !dbg !31 + %50 = fpext <2 x bfloat> %35 to <2 x float>, !dbg !28 + %51 = fpext <2 x bfloat> %45 to <2 x float>, !dbg !29 + %52 = fmul <2 x float> %50, %51, !dbg !30 + %53 = fadd <2 x float> %52, zeroinitializer, !dbg !31 + %shift = shufflevector <2 x float> %49, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop = fadd <2 x float> %49, %shift, !dbg !32 + %foldExtExtBinop2 = fadd <2 x float> %53, %foldExtExtBinop, !dbg !32 + %shift4 = shufflevector <2 x float> %53, <2 x float> poison, <2 x i32> , !dbg !32 + %foldExtExtBinop5 = fadd <2 x float> %shift4, %foldExtExtBinop2, !dbg !32 + %54 = extractelement <2 x float> %foldExtExtBinop5, i64 0, !dbg !32 + %55 = bitcast float %54 to i32, !dbg !36 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 16, i32 31), !dbg !36 + %57 = bitcast i32 %56 to float, !dbg !36 + %58 = fadd float %54, %57, !dbg !32 + %59 = bitcast float %58 to i32, !dbg !36 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 8, i32 31), !dbg !36 + %61 = bitcast i32 %60 to float, !dbg !36 + %62 = fadd float %58, %61, !dbg !32 + %63 = bitcast float %62 to i32, !dbg !36 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 4, i32 31), !dbg !36 + %65 = bitcast i32 %64 to float, !dbg !36 + %66 = fadd float %62, %65, !dbg !32 + %67 = bitcast float %66 to i32, !dbg !36 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 2, i32 31), !dbg !36 + %69 = bitcast i32 %68 to float, !dbg !36 + %70 = fadd float %66, %69, !dbg !32 + %71 = bitcast float %70 to i32, !dbg !36 + %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 1, i32 31), !dbg !36 + %73 = bitcast i32 %72 to float, !dbg !36 + %74 = fadd float %70, %73, !dbg !32 + %75 = lshr exact i32 %11, 3, !dbg !37 + %76 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %75, !dbg !37 + store float %74, ptr addrspace(3) %76, align 4, !dbg !37 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !37 + %77 = shl nuw nsw i32 %13, 2, !dbg !37 + %78 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %77, !dbg !37 + %79 = load i32, ptr addrspace(3) %78, align 4, !dbg !37 + %80 = sext i32 %15 to i64, !dbg !38 + %81 = getelementptr float, ptr addrspace(1) %2, i64 %80, !dbg !38 + %82 = and i32 %10, 124, !dbg !39 + %83 = icmp eq i32 %82, 0, !dbg !39 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %79, ptr addrspace(1) %81, i1 %83) #4, !dbg !39 + ret void, !dbg !40 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="128" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_zeros_0", linkageName: "triton_red_fused_zeros_0", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 23, column: 28, scope: !4) +!8 = !DILocation(line: 23, column: 33, scope: !4) +!9 = !DILocation(line: 24, column: 44, scope: !4) +!10 = !DILocation(line: 24, column: 23, scope: !4) +!11 = !DILocation(line: 26, column: 37, scope: !4) +!12 = !DILocation(line: 29, column: 21, scope: !4) +!13 = !DILocation(line: 28, column: 19, scope: !4) +!14 = !DILocation(line: 29, column: 29, scope: !4) +!15 = !DILocation(line: 30, column: 19, scope: !4) +!16 = !DILocation(line: 39, column: 45, scope: !4) +!17 = !DILocation(line: 39, column: 55, scope: !4) +!18 = !DILocation(line: 39, column: 68, scope: !4) +!19 = !DILocation(line: 39, column: 41, scope: !4) +!20 = !DILocation(line: 39, column: 50, scope: !4) +!21 = !DILocation(line: 39, column: 60, scope: !4) +!22 = !DILocation(line: 39, column: 34, scope: !4) +!23 = !DILocation(line: 39, column: 73, scope: !4) +!24 = !DILocation(line: 40, column: 45, scope: !4) +!25 = !DILocation(line: 40, column: 41, scope: !4) +!26 = !DILocation(line: 40, column: 34, scope: !4) +!27 = !DILocation(line: 40, column: 50, scope: !4) +!28 = !DILocation(line: 39, column: 127, scope: !4) +!29 = !DILocation(line: 40, column: 104, scope: !4) +!30 = !DILocation(line: 41, column: 22, scope: !4) +!31 = !DILocation(line: 43, column: 23, scope: !4) +!32 = !DILocation(line: 261, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !4, file: !34, discriminator: 0) +!34 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!35 = !DILocation(line: 45, column: 25, scope: !4) +!36 = !DILocation(line: 291, column: 36, scope: !33, inlinedAt: !35) +!37 = !DILocation(line: 45, column: 28, scope: !4) +!38 = !DILocation(line: 49, column: 25, scope: !4) +!39 = !DILocation(line: 49, column: 36, scope: !4) +!40 = !DILocation(line: 49, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx new file mode 100644 index 0000000000000000000000000000000000000000..172522e7c9a845417072331c37d714f70b751dc6 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ptx @@ -0,0 +1,412 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_zeros_0 // -- Begin function triton_red_fused_zeros_0 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_zeros_0 +.visible .entry triton_red_fused_zeros_0( + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_2, + .param .u32 triton_red_fused_zeros_0_param_3, + .param .u32 triton_red_fused_zeros_0_param_4, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused_zeros_0_param_6 +) +.reqntid 128 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<72>; + .reg .b64 %rd<11>; + .loc 1 18 0 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:18:0 + +// %bb.0: + ld.param.b64 %rd8, [triton_red_fused_zeros_0_param_0]; + ld.param.b64 %rd9, [triton_red_fused_zeros_0_param_1]; +$L__tmp0: + .loc 1 23 28 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:23:28 + mov.u32 %r10, %ctaid.x; + .loc 1 23 33 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:23:33 + shl.b32 %r11, %r10, 2; + ld.param.b64 %rd10, [triton_red_fused_zeros_0_param_2]; + .loc 1 24 44 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:44 + mov.u32 %r12, %tid.x; + and.b32 %r13, %r12, 96; + bfe.u32 %r14, %r12, 5, 2; + and.b32 %r15, %r12, 3; + .loc 1 24 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:24:23 + or.b32 %r16, %r14, %r11; + or.b32 %r17, %r11, %r15; + .loc 1 26 37 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:26:37 + shl.b32 %r18, %r12, 2; + and.b32 %r19, %r18, 124; + .loc 1 29 21 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:29:21 + bfe.s32 %r20, %r10, 29, 1; + shr.u32 %r21, %r20, 21; + add.s32 %r22, %r16, %r21; + shr.s32 %r23, %r22, 11; + .loc 1 28 19 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:28:19 + and.b32 %r24, %r22, 1046528; + sub.s32 %r25, %r16, %r24; + .loc 1 29 29 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:29:29 + shr.u32 %r26, %r23, 27; + add.s32 %r27, %r23, %r26; + and.b32 %r28, %r27, 33554400; + sub.s32 %r29, %r23, %r28; + .loc 1 30 19 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:30:19 + shr.u32 %r30, %r20, 16; + add.s32 %r31, %r16, %r30; + .loc 1 39 45 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:45 + shl.b32 %r32, %r29, 7; + .loc 1 39 55 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:55 + shl.b32 %r33, %r25, 12; + .loc 1 39 68 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:68 + shl.b32 %r34, %r31, 7; + and.b32 %r35, %r34, -8388608; + .loc 1 39 41 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:41 + or.b32 %r36, %r33, %r19; + .loc 1 39 50 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:50 + add.s32 %r37, %r36, %r35; + .loc 1 39 60 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:60 + add.s32 %r38, %r37, %r32; + .loc 1 39 34 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:34 + mad.wide.s32 %rd2, %r38, 2, %rd8; + .loc 1 39 73 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:73 + // begin inline asm + mov.u64 %rd3, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd3, 1.0; + // end inline asm + mov.b32 %r3, 0; + mov.pred %p1, -1; + // begin inline asm + mov.u32 %r1, %r3; + mov.u32 %r2, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r1, %r2 }, [ %rd2 + 0 ], %rd3; + // end inline asm + .loc 1 40 45 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:45 + shl.b32 %r39, %r16, 7; + .loc 1 40 41 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:41 + or.b32 %r40, %r39, %r19; + .loc 1 40 34 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:34 + mad.wide.s32 %rd5, %r40, 2, %rd9; + .loc 1 40 50 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:50 + // begin inline asm + mov.u64 %rd6, 0x0; + createpolicy.fractional.L2::evict_first.b64 %rd6, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r5, %r3; + mov.u32 %r6, %r3; + @%p1 ld.global.L1::evict_first.L2::cache_hint.v2.b32 { %r5, %r6 }, [ %rd5 + 0 ], %rd6; + // end inline asm + .loc 1 39 127 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:127 + mov.b32 {%rs1, %rs2}, %r1; + cvt.f32.bf16 %r41, %rs1; + cvt.f32.bf16 %r42, %rs2; + .loc 1 40 104 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:104 + mov.b32 {%rs3, %rs4}, %r5; + cvt.f32.bf16 %r43, %rs3; + cvt.f32.bf16 %r44, %rs4; + .loc 1 43 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:43:23 + fma.rn.f32 %r45, %r42, %r44, 0f00000000; + fma.rn.f32 %r46, %r41, %r43, 0f00000000; + .loc 1 39 127 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:39:127 + mov.b32 {%rs5, %rs6}, %r2; + cvt.f32.bf16 %r47, %rs5; + cvt.f32.bf16 %r48, %rs6; + .loc 1 40 104 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:40:104 + mov.b32 {%rs7, %rs8}, %r6; + cvt.f32.bf16 %r49, %rs7; + cvt.f32.bf16 %r50, %rs8; + .loc 1 43 23 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:43:23 + fma.rn.f32 %r51, %r48, %r50, 0f00000000; + fma.rn.f32 %r52, %r47, %r49, 0f00000000; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r53, %r46, %r45; + add.f32 %r54, %r52, %r53; + add.f32 %r55, %r51, %r54; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r57, %r55, %r56; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r59, %r57, %r58; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r61, %r59, %r60; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r62, %r61, 2, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r63, %r61, %r62; + .loc 2 291 36 // standard.py:291:36 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + shfl.sync.bfly.b32 %r64, %r63, 1, 31, -1; + .loc 2 261 15 // standard.py:261:15 @[ cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:25 ] + add.f32 %r65, %r63, %r64; +$L__tmp2: + .loc 1 45 28 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:45:28 + shr.u32 %r66, %r13, 3; + mov.b32 %r67, global_smem; + add.s32 %r68, %r67, %r66; + st.shared.b32 [%r68], %r65; + bar.sync 0; + shl.b32 %r69, %r15, 2; + add.s32 %r70, %r67, %r69; + ld.shared.b32 %r9, [%r70]; + .loc 1 49 25 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:25 + mad.wide.s32 %rd7, %r17, 4, %rd10; + .loc 1 49 36 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:36 + and.b32 %r71, %r12, 124; + setp.eq.b32 %p3, %r71, 0; + // begin inline asm + @%p3 st.global.b32 [ %rd7 + 0 ], { %r9 }; + // end inline asm + .loc 1 49 4 // cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py:49:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 209 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xca DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 98 +.b8 108 +.b8 53 +.b8 119 +.b8 51 +.b8 103 +.b8 98 +.b8 115 +.b8 111 +.b8 105 +.b8 119 +.b8 111 +.b8 115 +.b8 110 +.b8 119 +.b8 107 +.b8 110 +.b8 102 +.b8 97 +.b8 114 +.b8 98 +.b8 53 +.b8 53 +.b8 108 +.b8 107 +.b8 108 +.b8 112 +.b8 102 +.b8 110 +.b8 116 +.b8 112 +.b8 117 +.b8 122 +.b8 54 +.b8 113 +.b8 52 +.b8 106 +.b8 106 +.b8 117 +.b8 105 +.b8 51 +.b8 53 +.b8 121 +.b8 105 +.b8 50 +.b8 119 +.b8 100 +.b8 119 +.b8 101 +.b8 112 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 98 +.b8 108 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x1b DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 48 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xa6:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xbb:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 45 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..cb110af8a9fb44e055f0d7c194355f8db291cfff --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,142 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("out_ptr1"(#loc)) +#loc39 = loc("xnumel"(#loc)) +#loc40 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp4"(#loc30)) +#loc71 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<128> : tensor<1x128xi32, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<8388608> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<65536> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<32> : tensor<4x1xi32, #blocked> loc(#loc1) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xbf16, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc41) + %xoffset_8 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc42) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc43) + %xindex_9 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc43) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked> loc(#loc43) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xi32, #blocked1> loc(#loc43) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<4x1xi32, #blocked1> loc(#loc44) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<4x1xi32, #blocked> loc(#loc44) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<4x1xi32, #blocked1> loc(#loc44) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_16 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> loc(#loc45) + %x0 = arith.remsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc46) + %x1 = arith.divsi %xindex_14, %cst_5 : tensor<4x1xi32, #blocked> loc(#loc47) + %x1_17 = arith.remsi %x1, %cst_4 : tensor<4x1xi32, #blocked> loc(#loc48) + %x2 = arith.divsi %xindex_14, %cst_3 : tensor<4x1xi32, #blocked> loc(#loc49) + %r0_mask = arith.cmpi slt, %r0_base_16, %cst : tensor<1x128xi32, #blocked> loc(#loc50) + %tmp0 = arith.muli %x1_17, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc51) + %tmp0_18 = tt.broadcast %r0_base_16 : tensor<1x128xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_19 = tt.broadcast %tmp0 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_20 = arith.addi %tmp0_18, %tmp0_19 : tensor<4x128xi32, #blocked> loc(#loc52) + %tmp0_21 = arith.muli %x0, %cst_1 : tensor<4x1xi32, #blocked> loc(#loc53) + %tmp0_22 = tt.broadcast %tmp0_21 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_23 = arith.addi %tmp0_20, %tmp0_22 : tensor<4x128xi32, #blocked> loc(#loc54) + %tmp0_24 = arith.muli %x2, %cst_2 : tensor<4x1xi32, #blocked> loc(#loc55) + %tmp0_25 = tt.broadcast %tmp0_24 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_26 = arith.addi %tmp0_23, %tmp0_25 : tensor<4x128xi32, #blocked> loc(#loc56) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc57) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc57) + %tmp0_29 = tt.broadcast %r0_mask : tensor<1x128xi1, #blocked> -> tensor<4x128xi1, #blocked> loc(#loc58) + %tmp0_30 = tt.load %tmp0_28, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc58) + %tmp0_31 = arith.extf %tmp0_30 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc59) + %tmp1 = arith.muli %xindex_14, %cst_0 : tensor<4x1xi32, #blocked> loc(#loc60) + %tmp1_32 = tt.broadcast %tmp1 : tensor<4x1xi32, #blocked> -> tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_33 = arith.addi %tmp0_18, %tmp1_32 : tensor<4x128xi32, #blocked> loc(#loc61) + %tmp1_34 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr, #blocked> loc(#loc62) + %tmp1_35 = tt.addptr %tmp1_34, %tmp1_33 : tensor<4x128x!tt.ptr, #blocked>, tensor<4x128xi32, #blocked> loc(#loc62) + %tmp1_36 = tt.load %tmp1_35, %tmp0_29, %cst_6 evictionPolicy = evict_first : tensor<4x128x!tt.ptr, #blocked> loc(#loc63) + %tmp1_37 = arith.extf %tmp1_36 : tensor<4x128xbf16, #blocked> to tensor<4x128xf32, #blocked> loc(#loc64) + %tmp2 = arith.mulf %tmp0_31, %tmp1_37 : tensor<4x128xf32, #blocked> loc(#loc65) + %tmp5 = arith.addf %tmp2, %cst_7 : tensor<4x128xf32, #blocked> loc(#loc66) + %_tmp4 = arith.select %tmp0_29, %tmp5, %cst_7 : tensor<4x128xi1, #blocked>, tensor<4x128xf32, #blocked> loc(#loc67) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_40: f32 loc(callsite(#loc1 at #loc68)), %tmp4_41: f32 loc(callsite(#loc1 at #loc68))): + %tmp4_42 = arith.addf %tmp4_40, %tmp4_41 : f32 loc(#loc72) + tt.reduce.return %tmp4_42 : f32 loc(#loc70) + }) : (tensor<4x128xf32, #blocked>) -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc70) + %tmp4_38 = ttg.convert_layout %tmp4 : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc69) + %tmp4_39 = tt.expand_dims %tmp4_38 {axis = 1 : i32} : tensor<4xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<4x1xf32, #blocked1> loc(#loc69) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr, #blocked1> loc(#loc33) + %1 = tt.addptr %0, %xindex_15 : tensor<4x1x!tt.ptr, #blocked1>, tensor<4x1xi32, #blocked1> loc(#loc33) + tt.store %1, %tmp4_39 : tensor<4x1x!tt.ptr, #blocked1> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":28:19) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:29) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":30:19) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":35:29) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:45) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:41) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:55) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:68) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:60) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:34) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:73) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:127) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:45) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:41) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:34) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:50) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:104) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":41:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":43:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:40) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:4) +#loc41 = loc("xoffset"(#loc2)) +#loc42 = loc("xoffset"(#loc3)) +#loc43 = loc("xindex"(#loc4)) +#loc44 = loc("xindex"(#loc5)) +#loc45 = loc("r0_base"(#loc6)) +#loc46 = loc("x0"(#loc7)) +#loc47 = loc("x1"(#loc8)) +#loc48 = loc("x1"(#loc9)) +#loc49 = loc("x2"(#loc10)) +#loc50 = loc("r0_mask"(#loc11)) +#loc51 = loc("tmp0"(#loc12)) +#loc52 = loc("tmp0"(#loc13)) +#loc53 = loc("tmp0"(#loc14)) +#loc54 = loc("tmp0"(#loc15)) +#loc55 = loc("tmp0"(#loc16)) +#loc56 = loc("tmp0"(#loc17)) +#loc57 = loc("tmp0"(#loc18)) +#loc58 = loc("tmp0"(#loc19)) +#loc59 = loc("tmp0"(#loc20)) +#loc60 = loc("tmp1"(#loc21)) +#loc61 = loc("tmp1"(#loc22)) +#loc62 = loc("tmp1"(#loc23)) +#loc63 = loc("tmp1"(#loc24)) +#loc64 = loc("tmp1"(#loc25)) +#loc65 = loc("tmp2"(#loc26)) +#loc66 = loc("tmp5"(#loc27)) +#loc67 = loc("_tmp4"(#loc28)) +#loc69 = loc("tmp4"(#loc32)) +#loc70 = loc(callsite(#loc29 at #loc68)) +#loc72 = loc(callsite(#loc31 at #loc70)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir new file mode 100644 index 0000000000000000000000000000000000000000..8a64e2127d8febd4f44a53330d11f2d4cd84a6e8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/DMLY6XP6QTS3ZPAAXQQN4VR72RBHXLCD3EYHGZKXQBPDY7DRKV2A/triton_red_fused_zeros_0.ttir @@ -0,0 +1,139 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":18:0) +#loc1 = loc(unknown) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:25) +#loc38 = loc("in_ptr0"(#loc)) +#loc39 = loc("in_ptr1"(#loc)) +#loc40 = loc("out_ptr1"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc72 = loc("tmp4"(#loc32)) +#loc75 = loc(callsite(#loc1 at #loc72)) +module { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<4x128xbf16> loc(#loc1) + %cst_0 = arith.constant dense<8388608> : tensor<4x1xi32> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<4x1xi32> loc(#loc1) + %cst_2 = arith.constant dense<128> : tensor<4x1xi32> loc(#loc1) + %cst_3 = arith.constant dense<128> : tensor<1x128xi32> loc(#loc1) + %cst_4 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> loc(#loc1) + %x2 = arith.constant dense<65536> : tensor<4x1xi32> loc(#loc43) + %x1 = arith.constant dense<32> : tensor<4x1xi32> loc(#loc44) + %cst_5 = arith.constant dense<2048> : tensor<4x1xi32> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xoffset_6 = arith.muli %xoffset, %c4_i32 : i32 loc(#loc46) + %xindex = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> loc(#loc47) + %xindex_7 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<4xi32> -> tensor<4x1xi32> loc(#loc48) + %xindex_8 = tt.splat %xoffset_6 : i32 -> tensor<4x1xi32> loc(#loc49) + %xindex_9 = arith.addi %xindex_8, %xindex_7 : tensor<4x1xi32> loc(#loc49) + %r0_base = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> loc(#loc50) + %r0_base_10 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32> loc(#loc51) + %x0 = arith.remsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc52) + %x1_11 = arith.divsi %xindex_9, %cst_5 : tensor<4x1xi32> loc(#loc53) + %x1_12 = arith.remsi %x1_11, %x1 : tensor<4x1xi32> loc(#loc44) + %x2_13 = arith.divsi %xindex_9, %x2 : tensor<4x1xi32> loc(#loc43) + %r0_mask = arith.cmpi slt, %r0_base_10, %cst_3 : tensor<1x128xi32> loc(#loc54) + %tmp0 = arith.muli %x1_12, %cst_2 : tensor<4x1xi32> loc(#loc55) + %tmp0_14 = tt.broadcast %r0_base_10 : tensor<1x128xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_15 = tt.broadcast %tmp0 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc56) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<4x128xi32> loc(#loc56) + %tmp0_17 = arith.muli %x0, %cst_1 : tensor<4x1xi32> loc(#loc57) + %tmp0_18 = tt.broadcast %tmp0_17 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc58) + %tmp0_19 = arith.addi %tmp0_16, %tmp0_18 : tensor<4x128xi32> loc(#loc58) + %tmp0_20 = arith.muli %x2_13, %cst_0 : tensor<4x1xi32> loc(#loc59) + %tmp0_21 = tt.broadcast %tmp0_20 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc60) + %tmp0_22 = arith.addi %tmp0_19, %tmp0_21 : tensor<4x128xi32> loc(#loc60) + %tmp0_23 = tt.splat %in_ptr0 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc61) + %tmp0_24 = tt.addptr %tmp0_23, %tmp0_22 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc61) + %tmp0_25 = tt.broadcast %r0_mask : tensor<1x128xi1> -> tensor<4x128xi1> loc(#loc62) + %tmp0_26 = tt.load %tmp0_24, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc62) + %tmp0_27 = arith.extf %tmp0_26 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc63) + %tmp1 = arith.muli %xindex_9, %cst_2 : tensor<4x1xi32> loc(#loc64) + %tmp1_28 = tt.broadcast %tmp1 : tensor<4x1xi32> -> tensor<4x128xi32> loc(#loc65) + %tmp1_29 = arith.addi %tmp0_14, %tmp1_28 : tensor<4x128xi32> loc(#loc65) + %tmp1_30 = tt.splat %in_ptr1 : !tt.ptr -> tensor<4x128x!tt.ptr> loc(#loc66) + %tmp1_31 = tt.addptr %tmp1_30, %tmp1_29 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> loc(#loc66) + %tmp1_32 = tt.load %tmp1_31, %tmp0_25, %cst evictionPolicy = evict_first : tensor<4x128x!tt.ptr> loc(#loc67) + %tmp1_33 = arith.extf %tmp1_32 : tensor<4x128xbf16> to tensor<4x128xf32> loc(#loc68) + %tmp2 = arith.mulf %tmp0_27, %tmp1_33 : tensor<4x128xf32> loc(#loc69) + %tmp5 = arith.addf %tmp2, %cst_4 : tensor<4x128xf32> loc(#loc70) + %_tmp4 = arith.select %tmp0_25, %tmp5, %cst_4 : tensor<4x128xi1>, tensor<4x128xf32> loc(#loc71) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_35: f32 loc(callsite(#loc1 at #loc72)), %tmp4_36: f32 loc(callsite(#loc1 at #loc72))): + %tmp4_37 = arith.addf %tmp4_35, %tmp4_36 : f32 loc(#loc76) + tt.reduce.return %tmp4_37 : f32 loc(#loc74) + }) : (tensor<4x128xf32>) -> tensor<4xf32> loc(#loc74) + %tmp4_34 = tt.expand_dims %tmp4 {axis = 1 : i32} : tensor<4xf32> -> tensor<4x1xf32> loc(#loc73) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<4x1x!tt.ptr> loc(#loc35) + %1 = tt.addptr %0, %xindex_9 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> loc(#loc35) + tt.store %1, %tmp4_34 : tensor<4x1x!tt.ptr> loc(#loc36) + tt.return loc(#loc37) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":30:19) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:29) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":23:33) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:36) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:44) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":24:23) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:27) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":26:37) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":28:19) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":29:21) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":35:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:45) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:41) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:55) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:50) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:68) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:60) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:73) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":39:127) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:45) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:41) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:50) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":40:104) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":41:22) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":43:23) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":44:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":45:28) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:25) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:36) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bl/cbl5w3gbsoiwosnwknfarb55lklpfntpuz6q4jjui35yi2wdwepo.py":49:4) +#loc43 = loc("x2"(#loc2)) +#loc44 = loc("x1"(#loc3)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("xoffset"(#loc5)) +#loc47 = loc("xindex"(#loc6)) +#loc48 = loc("xindex"(#loc7)) +#loc49 = loc("xindex"(#loc8)) +#loc50 = loc("r0_base"(#loc9)) +#loc51 = loc("r0_base"(#loc10)) +#loc52 = loc("x0"(#loc11)) +#loc53 = loc("x1"(#loc12)) +#loc54 = loc("r0_mask"(#loc13)) +#loc55 = loc("tmp0"(#loc14)) +#loc56 = loc("tmp0"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp0"(#loc21)) +#loc63 = loc("tmp0"(#loc22)) +#loc64 = loc("tmp1"(#loc23)) +#loc65 = loc("tmp1"(#loc24)) +#loc66 = loc("tmp1"(#loc25)) +#loc67 = loc("tmp1"(#loc26)) +#loc68 = loc("tmp1"(#loc27)) +#loc69 = loc("tmp2"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc("_tmp4"(#loc30)) +#loc73 = loc("tmp4"(#loc34)) +#loc74 = loc(callsite(#loc31 at #loc72)) +#loc76 = loc(callsite(#loc33 at #loc74)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aff8818c0360273741f91c88952de34f6ec04b19 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..63a1ae0b292bab489027df88030a63e771797c10 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ffc570e253857448771e8107fc02cc3a37b0af51 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"hash": "216454436a742975997ad2492b746bae3cb4daa170e2ff11d7e2252d07da3898", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 4, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 4096, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..95098fde01dbd6b5aff06bef5fa83952d62e8a5e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,1533 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8], align 16 + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = shl i32 %12, 5, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 124, !dbg !12 + %16 = lshr exact i32 %15, 2, !dbg !12 + %17 = and i32 %14, 31, !dbg !12 + %18 = or disjoint i32 %16, %13, !dbg !13 + %19 = icmp slt i32 %18, 128, !dbg !14 + %20 = and i32 %14, 3, !dbg !15 + %21 = shl i32 %18, 4, !dbg !16 + %22 = and i32 %14, 1, !dbg !17 + %23 = lshr i32 %14, 1, !dbg !17 + %.lobit = and i32 %23, 1, !dbg !17 + %24 = xor i32 %22, 1, !dbg !21 + %25 = xor i32 %.lobit, 1, !dbg !21 + %26 = trunc i32 %14 to i1, !dbg !22 + %27 = trunc i32 %23 to i1, !dbg !22 + %28 = insertelement <2 x i1> poison, i1 %27, i64 0, !dbg !23 + %29 = shufflevector <2 x i1> %28, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !23 + %30 = insertelement <4 x i32> poison, i32 %25, i64 0, !dbg !25 + %31 = shufflevector <4 x i32> %30, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !25 + %32 = insertelement <4 x i32> poison, i32 %.lobit, i64 0, !dbg !26 + %33 = shufflevector <4 x i32> %32, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !26 + %34 = insertelement <4 x i32> poison, i32 %24, i64 0, !dbg !27 + %35 = shufflevector <4 x i32> %34, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !27 + %36 = insertelement <4 x i32> poison, i32 %22, i64 0, !dbg !28 + %37 = shufflevector <4 x i32> %36, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !28 + %38 = insertelement <2 x i1> poison, i1 %26, i64 0, !dbg !22 + %39 = shufflevector <2 x i1> %38, <2 x i1> poison, <2 x i32> zeroinitializer, !dbg !22 + %40 = insertelement <2 x i32> poison, i32 %24, i64 0, !dbg !29 + %41 = shufflevector <2 x i32> %40, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !29 + %42 = insertelement <2 x i32> poison, i32 %22, i64 0, !dbg !30 + %43 = shufflevector <2 x i32> %42, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !30 + %44 = insertelement <2 x i32> %42, i32 %24, i64 1, !dbg !31 + %45 = insertelement <2 x i32> %40, i32 %22, i64 1, !dbg !32 + %46 = insertelement <4 x i1> poison, i1 %19, i64 0, !dbg !33 + %47 = shufflevector <4 x i1> %46, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !33 + %48 = shl nuw nsw i32 %15, 1, !dbg !34 + %49 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %48, !dbg !34 + %50 = shl nuw nsw i32 %17, 3, !dbg !34 + %51 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %50, !dbg !34 + %52 = shl nuw nsw i32 %20, 2, !dbg !15 + %53 = or disjoint i32 %52, 1, !dbg !15 + %54 = or disjoint i32 %52, 2, !dbg !15 + %55 = or disjoint i32 %52, 3, !dbg !15 + %56 = or disjoint i32 %21, %52, !dbg !35 + %57 = or disjoint i32 %21, %54, !dbg !35 + %58 = sext i32 %56 to i64, !dbg !36 + %59 = getelementptr i64, ptr addrspace(1) %0, i64 %58, !dbg !36 + %60 = sext i32 %57 to i64, !dbg !36 + %61 = getelementptr i64, ptr addrspace(1) %0, i64 %60, !dbg !36 + %62 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %59, i1 %19) #6, !dbg !37 + %63 = extractvalue { i64, i64 } %62, 0, !dbg !37 + %64 = extractvalue { i64, i64 } %62, 1, !dbg !37 + %65 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %61, i1 %19) #6, !dbg !37 + %66 = extractvalue { i64, i64 } %65, 0, !dbg !37 + %67 = extractvalue { i64, i64 } %65, 1, !dbg !37 + %68 = xor i32 %53, %52, !dbg !38 + %69 = xor i32 %54, %55, !dbg !38 + %70 = insertelement <4 x i64> poison, i64 %63, i64 0, !dbg !39 + %71 = insertelement <4 x i64> %70, i64 %64, i64 1, !dbg !39 + %72 = insertelement <4 x i64> %71, i64 %66, i64 2, !dbg !39 + %73 = insertelement <4 x i64> %72, i64 %67, i64 3, !dbg !39 + %74 = add <4 x i64> %73, splat (i64 -1), !dbg !39 + %75 = icmp ult <4 x i64> %74, splat (i64 16383), !dbg !39 + %76 = extractelement <4 x i1> %75, i64 0, !dbg !40 + %77 = zext i1 %76 to i32, !dbg !41 + %78 = extractelement <4 x i1> %75, i64 1, !dbg !40 + %79 = zext i1 %78 to i32, !dbg !41 + %80 = extractelement <4 x i1> %75, i64 2, !dbg !42 + %81 = zext i1 %80 to i32, !dbg !41 + %82 = extractelement <4 x i1> %75, i64 3, !dbg !42 + %83 = zext i1 %82 to i32, !dbg !41 + %84 = xor i1 %76, true, !dbg !40 + %85 = and i1 %78, %84, !dbg !40 + %.not = xor i1 %82, true, !dbg !42 + %86 = or i1 %80, %.not, !dbg !42 + %87 = xor i32 %77, %79, !dbg !43 + %88 = xor i32 %81, %83, !dbg !43 + %89 = select i1 %85, i32 %87, i32 0, !dbg !44 + %90 = select i1 %86, i32 %88, i32 0, !dbg !44 + %91 = xor i32 %89, %77, !dbg !45 + %92 = xor i32 %89, %79, !dbg !45 + %93 = xor i32 %90, %81, !dbg !45 + %94 = xor i32 %90, %83, !dbg !45 + %95 = select i1 %85, i32 %68, i32 0, !dbg !46 + %96 = select i1 %86, i32 %69, i32 0, !dbg !46 + %97 = xor i32 %95, %52, !dbg !47 + %98 = xor i32 %95, %53, !dbg !47 + %99 = xor i32 %96, %54, !dbg !47 + %100 = xor i32 %96, %55, !dbg !47 + %101 = insertelement <2 x i32> poison, i32 %91, i64 0, !dbg !22 + %102 = insertelement <2 x i32> %101, i32 %92, i64 1, !dbg !22 + %103 = insertelement <2 x i32> poison, i32 %93, i64 0, !dbg !22 + %104 = insertelement <2 x i32> %103, i32 %94, i64 1, !dbg !22 + %105 = icmp samesign uge <2 x i32> %102, %104, !dbg !22 + %106 = insertelement <2 x i32> %104, i32 %92, i64 1, !dbg !22 + %107 = insertelement <2 x i32> %102, i32 %94, i64 1, !dbg !22 + %108 = icmp ne <2 x i32> %106, %107, !dbg !22 + %109 = insertelement <2 x i32> poison, i32 %97, i64 0, !dbg !22 + %110 = insertelement <2 x i32> %109, i32 %98, i64 1, !dbg !22 + %111 = insertelement <2 x i32> poison, i32 %99, i64 0, !dbg !22 + %112 = insertelement <2 x i32> %111, i32 %100, i64 1, !dbg !22 + %113 = icmp ule <2 x i32> %110, %112, !dbg !22 + %114 = or <2 x i1> %113, %108, !dbg !22 + %115 = and <2 x i1> %105, %114, !dbg !22 + %116 = xor <2 x i1> %115, %39, !dbg !22 + %117 = xor <2 x i32> %107, %106, !dbg !43 + %118 = select <2 x i1> %116, <2 x i32> zeroinitializer, <2 x i32> %117, !dbg !44 + %119 = extractelement <2 x i32> %118, i64 0, !dbg !45 + %120 = xor i32 %119, %91, !dbg !45 + %121 = extractelement <2 x i32> %118, i64 1, !dbg !45 + %122 = xor i32 %121, %92, !dbg !45 + %123 = xor <2 x i32> %118, %104, !dbg !45 + %124 = xor i32 %99, %97, !dbg !38 + %125 = xor i32 %100, %98, !dbg !38 + %126 = extractelement <2 x i1> %116, i64 0, !dbg !46 + %127 = select i1 %126, i32 0, i32 %124, !dbg !46 + %128 = extractelement <2 x i1> %116, i64 1, !dbg !46 + %129 = select i1 %128, i32 0, i32 %125, !dbg !46 + %130 = xor i32 %127, %97, !dbg !47 + %131 = xor i32 %129, %98, !dbg !47 + %132 = xor i32 %127, %99, !dbg !47 + %133 = xor i32 %129, %100, !dbg !47 + %134 = icmp samesign uge i32 %120, %122, !dbg !22 + %135 = icmp ne i32 %120, %122, !dbg !22 + %136 = icmp samesign ule i32 %130, %131, !dbg !22 + %137 = or i1 %135, %136, !dbg !22 + %138 = and i1 %134, %137, !dbg !22 + %.not3 = xor i1 %138, %26, !dbg !22 + %139 = extractelement <2 x i32> %123, i64 0, !dbg !22 + %140 = extractelement <2 x i32> %123, i64 1, !dbg !22 + %141 = icmp samesign uge i32 %139, %140, !dbg !22 + %142 = icmp ne i32 %139, %140, !dbg !22 + %143 = icmp samesign ule i32 %132, %133, !dbg !22 + %144 = or i1 %142, %143, !dbg !22 + %145 = and i1 %141, %144, !dbg !22 + %.not4 = xor i1 %145, %26, !dbg !22 + %146 = xor i32 %120, %122, !dbg !43 + %147 = xor i32 %139, %140, !dbg !43 + %148 = select i1 %.not3, i32 0, i32 %146, !dbg !44 + %149 = select i1 %.not4, i32 0, i32 %147, !dbg !44 + %150 = xor i32 %148, %120, !dbg !45 + %151 = xor i32 %148, %122, !dbg !45 + %152 = xor i32 %149, %139, !dbg !45 + %153 = xor i32 %149, %140, !dbg !45 + %154 = xor i32 %130, %131, !dbg !38 + %155 = xor i32 %132, %133, !dbg !38 + %156 = select i1 %.not3, i32 0, i32 %154, !dbg !46 + %157 = select i1 %.not4, i32 0, i32 %155, !dbg !46 + %158 = xor i32 %156, %130, !dbg !47 + %159 = xor i32 %156, %131, !dbg !47 + %160 = xor i32 %157, %132, !dbg !47 + %161 = xor i32 %157, %133, !dbg !47 + %162 = mul nuw nsw i32 %150, %24, !dbg !29 + %163 = mul nuw nsw i32 %151, %24, !dbg !29 + %164 = mul nuw nsw i32 %152, %24, !dbg !29 + %165 = mul nuw nsw i32 %153, %24, !dbg !29 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 1, i32 31), !dbg !48 + %167 = add i32 %162, %166, !dbg !51 + %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !48 + %169 = add i32 %163, %168, !dbg !51 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 1, i32 31), !dbg !48 + %171 = add i32 %164, %170, !dbg !51 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 1, i32 31), !dbg !48 + %173 = add i32 %165, %172, !dbg !51 + %174 = mul nuw nsw i32 %150, %22, !dbg !30 + %175 = mul nuw nsw i32 %151, %22, !dbg !30 + %176 = mul nuw nsw i32 %152, %22, !dbg !30 + %177 = mul nuw nsw i32 %153, %22, !dbg !30 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !48 + %179 = add i32 %174, %178, !dbg !51 + %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 1, i32 31), !dbg !48 + %181 = add i32 %175, %180, !dbg !51 + %182 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 1, i32 31), !dbg !48 + %183 = add i32 %176, %182, !dbg !51 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !48 + %185 = add i32 %177, %184, !dbg !51 + %186 = mul nuw nsw i32 %158, %24, !dbg !32 + %187 = mul nuw nsw i32 %159, %24, !dbg !32 + %188 = mul nuw nsw i32 %160, %24, !dbg !32 + %189 = mul nuw nsw i32 %161, %24, !dbg !32 + %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 1, i32 31), !dbg !48 + %191 = add i32 %186, %190, !dbg !51 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %187, i32 1, i32 31), !dbg !48 + %193 = add i32 %187, %192, !dbg !51 + %194 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 1, i32 31), !dbg !48 + %195 = add i32 %188, %194, !dbg !51 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !48 + %197 = add i32 %189, %196, !dbg !51 + %198 = mul nuw nsw i32 %158, %22, !dbg !31 + %199 = mul nuw nsw i32 %159, %22, !dbg !31 + %200 = mul nuw nsw i32 %160, %22, !dbg !31 + %201 = mul nuw nsw i32 %161, %22, !dbg !31 + %202 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %198, i32 1, i32 31), !dbg !48 + %203 = add i32 %198, %202, !dbg !51 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !48 + %205 = add i32 %199, %204, !dbg !51 + %206 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 1, i32 31), !dbg !48 + %207 = add i32 %200, %206, !dbg !51 + %208 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 1, i32 31), !dbg !48 + %209 = add i32 %201, %208, !dbg !51 + %210 = icmp sge i32 %167, %179, !dbg !22 + %211 = icmp ne i32 %167, %179, !dbg !22 + %212 = icmp sle i32 %191, %203, !dbg !22 + %213 = or i1 %211, %212, !dbg !22 + %214 = and i1 %210, %213, !dbg !22 + %.not5 = xor i1 %214, %27, !dbg !22 + %215 = icmp sge i32 %169, %181, !dbg !22 + %216 = icmp ne i32 %169, %181, !dbg !22 + %217 = icmp sle i32 %193, %205, !dbg !22 + %218 = or i1 %216, %217, !dbg !22 + %219 = and i1 %215, %218, !dbg !22 + %.not6 = xor i1 %219, %27, !dbg !22 + %220 = icmp sge i32 %171, %183, !dbg !22 + %221 = icmp ne i32 %171, %183, !dbg !22 + %222 = icmp sle i32 %195, %207, !dbg !22 + %223 = or i1 %221, %222, !dbg !22 + %224 = and i1 %220, %223, !dbg !22 + %.not7 = xor i1 %224, %27, !dbg !22 + %225 = icmp sge i32 %173, %185, !dbg !22 + %226 = icmp ne i32 %173, %185, !dbg !22 + %227 = icmp sle i32 %197, %209, !dbg !22 + %228 = or i1 %226, %227, !dbg !22 + %229 = and i1 %225, %228, !dbg !22 + %.not8 = xor i1 %229, %27, !dbg !22 + %230 = xor i32 %167, %179, !dbg !43 + %231 = xor i32 %169, %181, !dbg !43 + %232 = xor i32 %171, %183, !dbg !43 + %233 = xor i32 %173, %185, !dbg !43 + %234 = select i1 %.not5, i32 0, i32 %230, !dbg !44 + %235 = select i1 %.not6, i32 0, i32 %231, !dbg !44 + %236 = select i1 %.not7, i32 0, i32 %232, !dbg !44 + %237 = select i1 %.not8, i32 0, i32 %233, !dbg !44 + %238 = xor i32 %234, %150, !dbg !45 + %239 = xor i32 %235, %151, !dbg !45 + %240 = xor i32 %236, %152, !dbg !45 + %241 = xor i32 %237, %153, !dbg !45 + %242 = xor i32 %191, %203, !dbg !38 + %243 = xor i32 %193, %205, !dbg !38 + %244 = xor i32 %195, %207, !dbg !38 + %245 = xor i32 %197, %209, !dbg !38 + %246 = select i1 %.not5, i32 0, i32 %242, !dbg !46 + %247 = select i1 %.not6, i32 0, i32 %243, !dbg !46 + %248 = select i1 %.not7, i32 0, i32 %244, !dbg !46 + %249 = select i1 %.not8, i32 0, i32 %245, !dbg !46 + %250 = xor i32 %246, %158, !dbg !47 + %251 = xor i32 %247, %159, !dbg !47 + %252 = xor i32 %248, %160, !dbg !47 + %253 = xor i32 %249, %161, !dbg !47 + %254 = icmp sge i32 %238, %240, !dbg !22 + %255 = icmp ne i32 %238, %240, !dbg !22 + %256 = icmp sle i32 %250, %252, !dbg !22 + %257 = or i1 %255, %256, !dbg !22 + %258 = and i1 %254, %257, !dbg !22 + %.not9 = xor i1 %258, %27, !dbg !22 + %259 = icmp sge i32 %239, %241, !dbg !22 + %260 = icmp ne i32 %239, %241, !dbg !22 + %261 = icmp sle i32 %251, %253, !dbg !22 + %262 = or i1 %260, %261, !dbg !22 + %263 = and i1 %259, %262, !dbg !22 + %.not10 = xor i1 %263, %27, !dbg !22 + %264 = xor i32 %238, %240, !dbg !43 + %265 = xor i32 %239, %241, !dbg !43 + %266 = select i1 %.not9, i32 0, i32 %264, !dbg !44 + %267 = select i1 %.not10, i32 0, i32 %265, !dbg !44 + %268 = xor i32 %266, %238, !dbg !45 + %269 = xor i32 %267, %239, !dbg !45 + %270 = xor i32 %266, %240, !dbg !45 + %271 = xor i32 %267, %241, !dbg !45 + %272 = xor i32 %250, %252, !dbg !38 + %273 = xor i32 %251, %253, !dbg !38 + %274 = select i1 %.not9, i32 0, i32 %272, !dbg !46 + %275 = select i1 %.not10, i32 0, i32 %273, !dbg !46 + %276 = xor i32 %274, %250, !dbg !47 + %277 = xor i32 %275, %251, !dbg !47 + %278 = xor i32 %274, %252, !dbg !47 + %279 = xor i32 %275, %253, !dbg !47 + %280 = icmp sge i32 %268, %269, !dbg !22 + %281 = icmp ne i32 %268, %269, !dbg !22 + %282 = icmp sle i32 %276, %277, !dbg !22 + %283 = or i1 %281, %282, !dbg !22 + %284 = and i1 %280, %283, !dbg !22 + %.not11 = xor i1 %284, %27, !dbg !22 + %285 = icmp sge i32 %270, %271, !dbg !22 + %286 = icmp ne i32 %270, %271, !dbg !22 + %287 = icmp sle i32 %278, %279, !dbg !22 + %288 = or i1 %286, %287, !dbg !22 + %289 = and i1 %285, %288, !dbg !22 + %.not12 = xor i1 %289, %27, !dbg !22 + %290 = xor i32 %268, %269, !dbg !43 + %291 = xor i32 %270, %271, !dbg !43 + %292 = select i1 %.not11, i32 0, i32 %290, !dbg !44 + %293 = select i1 %.not12, i32 0, i32 %291, !dbg !44 + %294 = xor i32 %276, %277, !dbg !38 + %295 = xor i32 %278, %279, !dbg !38 + %296 = select i1 %.not11, i32 0, i32 %294, !dbg !46 + %297 = select i1 %.not12, i32 0, i32 %295, !dbg !46 + %298 = xor i32 %296, %276, !dbg !47 + %299 = xor i32 %296, %277, !dbg !47 + %300 = xor i32 %297, %278, !dbg !47 + %301 = xor i32 %297, %279, !dbg !47 + %302 = mul nuw nsw i32 %298, %25, !dbg !32 + %303 = mul nuw nsw i32 %299, %25, !dbg !32 + %304 = mul nuw nsw i32 %300, %25, !dbg !32 + %305 = mul nuw nsw i32 %301, %25, !dbg !32 + %306 = mul nuw nsw i32 %298, %.lobit, !dbg !31 + %307 = mul nuw nsw i32 %299, %.lobit, !dbg !31 + %308 = mul nuw nsw i32 %300, %.lobit, !dbg !31 + %309 = mul nuw nsw i32 %301, %.lobit, !dbg !31 + %310 = insertelement <2 x i32> poison, i32 %292, i64 0, !dbg !45 + %311 = shufflevector <2 x i32> %310, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %312 = insertelement <2 x i32> poison, i32 %269, i64 0, !dbg !45 + %313 = insertelement <2 x i32> %312, i32 %268, i64 1, !dbg !45 + %314 = xor <2 x i32> %311, %313, !dbg !45 + %315 = insertelement <2 x i32> poison, i32 %293, i64 0, !dbg !45 + %316 = shufflevector <2 x i32> %315, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %317 = insertelement <2 x i32> poison, i32 %271, i64 0, !dbg !45 + %318 = insertelement <2 x i32> %317, i32 %270, i64 1, !dbg !45 + %319 = xor <2 x i32> %316, %318, !dbg !45 + %320 = extractelement <2 x i32> %314, i64 1, !dbg !30 + %321 = mul nuw nsw i32 %320, %25, !dbg !29 + %322 = extractelement <2 x i32> %314, i64 0, !dbg !30 + %323 = mul nuw nsw i32 %322, %25, !dbg !29 + %324 = extractelement <2 x i32> %319, i64 1, !dbg !30 + %325 = mul nuw nsw i32 %324, %25, !dbg !29 + %326 = extractelement <2 x i32> %319, i64 0, !dbg !30 + %327 = mul nuw nsw i32 %326, %25, !dbg !29 + %328 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 2, i32 31), !dbg !48 + %329 = add i32 %321, %328, !dbg !51 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %323, i32 2, i32 31), !dbg !48 + %331 = add i32 %323, %330, !dbg !51 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 2, i32 31), !dbg !48 + %333 = add i32 %325, %332, !dbg !51 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %327, i32 2, i32 31), !dbg !48 + %335 = add i32 %327, %334, !dbg !51 + %336 = mul nuw nsw i32 %320, %.lobit, !dbg !30 + %337 = mul nuw nsw i32 %322, %.lobit, !dbg !30 + %338 = mul nuw nsw i32 %324, %.lobit, !dbg !30 + %339 = mul nuw nsw i32 %326, %.lobit, !dbg !30 + %340 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !48 + %341 = add i32 %336, %340, !dbg !51 + %342 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 2, i32 31), !dbg !48 + %343 = add i32 %337, %342, !dbg !51 + %344 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %338, i32 2, i32 31), !dbg !48 + %345 = add i32 %338, %344, !dbg !51 + %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %339, i32 2, i32 31), !dbg !48 + %347 = add i32 %339, %346, !dbg !51 + %348 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !48 + %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 2, i32 31), !dbg !48 + %350 = insertelement <2 x i32> poison, i32 %303, i64 0, !dbg !51 + %351 = insertelement <2 x i32> %350, i32 %302, i64 1, !dbg !51 + %352 = insertelement <2 x i32> poison, i32 %349, i64 0, !dbg !51 + %353 = insertelement <2 x i32> %352, i32 %348, i64 1, !dbg !51 + %354 = add <2 x i32> %351, %353, !dbg !51 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 2, i32 31), !dbg !48 + %356 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %305, i32 2, i32 31), !dbg !48 + %357 = insertelement <2 x i32> poison, i32 %305, i64 0, !dbg !51 + %358 = insertelement <2 x i32> %357, i32 %304, i64 1, !dbg !51 + %359 = insertelement <2 x i32> poison, i32 %356, i64 0, !dbg !51 + %360 = insertelement <2 x i32> %359, i32 %355, i64 1, !dbg !51 + %361 = add <2 x i32> %358, %360, !dbg !51 + %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 2, i32 31), !dbg !48 + %363 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 2, i32 31), !dbg !48 + %364 = insertelement <2 x i32> poison, i32 %307, i64 0, !dbg !51 + %365 = insertelement <2 x i32> %364, i32 %306, i64 1, !dbg !51 + %366 = insertelement <2 x i32> poison, i32 %363, i64 0, !dbg !51 + %367 = insertelement <2 x i32> %366, i32 %362, i64 1, !dbg !51 + %368 = add <2 x i32> %365, %367, !dbg !51 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !48 + %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %309, i32 2, i32 31), !dbg !48 + %371 = insertelement <2 x i32> poison, i32 %309, i64 0, !dbg !51 + %372 = insertelement <2 x i32> %371, i32 %308, i64 1, !dbg !51 + %373 = insertelement <2 x i32> poison, i32 %370, i64 0, !dbg !51 + %374 = insertelement <2 x i32> %373, i32 %369, i64 1, !dbg !51 + %375 = add <2 x i32> %372, %374, !dbg !51 + %376 = insertelement <2 x i32> poison, i32 %331, i64 0, !dbg !40 + %377 = insertelement <2 x i32> %376, i32 %329, i64 1, !dbg !40 + %378 = insertelement <2 x i32> poison, i32 %343, i64 0, !dbg !40 + %379 = insertelement <2 x i32> %378, i32 %341, i64 1, !dbg !40 + %380 = icmp slt <2 x i32> %377, %379, !dbg !40 + %381 = icmp eq <2 x i32> %377, %379, !dbg !52 + %382 = icmp sgt <2 x i32> %354, %368, !dbg !53 + %383 = and <2 x i1> %381, %382, !dbg !54 + %384 = or <2 x i1> %380, %383, !dbg !55 + %385 = insertelement <2 x i32> poison, i32 %335, i64 0, !dbg !40 + %386 = insertelement <2 x i32> %385, i32 %333, i64 1, !dbg !40 + %387 = insertelement <2 x i32> poison, i32 %347, i64 0, !dbg !40 + %388 = insertelement <2 x i32> %387, i32 %345, i64 1, !dbg !40 + %389 = icmp slt <2 x i32> %386, %388, !dbg !40 + %390 = icmp eq <2 x i32> %386, %388, !dbg !52 + %391 = icmp sgt <2 x i32> %361, %375, !dbg !53 + %392 = and <2 x i1> %390, %391, !dbg !54 + %393 = or <2 x i1> %389, %392, !dbg !55 + %394 = xor <2 x i32> %377, %379, !dbg !43 + %395 = xor <2 x i32> %386, %388, !dbg !43 + %396 = select <2 x i1> %384, <2 x i32> %394, <2 x i32> zeroinitializer, !dbg !44 + %397 = select <2 x i1> %393, <2 x i32> %395, <2 x i32> zeroinitializer, !dbg !44 + %398 = xor <2 x i32> %396, %314, !dbg !45 + %399 = xor <2 x i32> %397, %319, !dbg !45 + %400 = xor <2 x i32> %354, %368, !dbg !38 + %401 = xor <2 x i32> %361, %375, !dbg !38 + %402 = select <2 x i1> %384, <2 x i32> %400, <2 x i32> zeroinitializer, !dbg !46 + %403 = select <2 x i1> %393, <2 x i32> %401, <2 x i32> zeroinitializer, !dbg !46 + %404 = insertelement <2 x i32> poison, i32 %299, i64 0, !dbg !47 + %405 = insertelement <2 x i32> %404, i32 %298, i64 1, !dbg !47 + %406 = xor <2 x i32> %402, %405, !dbg !47 + %407 = insertelement <2 x i32> poison, i32 %301, i64 0, !dbg !47 + %408 = insertelement <2 x i32> %407, i32 %300, i64 1, !dbg !47 + %409 = xor <2 x i32> %403, %408, !dbg !47 + %410 = mul nuw nsw <2 x i32> %398, %41, !dbg !29 + %411 = mul nuw nsw <2 x i32> %399, %41, !dbg !29 + %412 = extractelement <2 x i32> %410, i64 1, !dbg !48 + %413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 1, i32 31), !dbg !48 + %414 = extractelement <2 x i32> %410, i64 0, !dbg !48 + %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !48 + %416 = insertelement <2 x i32> poison, i32 %415, i64 0, !dbg !51 + %417 = insertelement <2 x i32> %416, i32 %413, i64 1, !dbg !51 + %418 = add <2 x i32> %410, %417, !dbg !51 + %419 = extractelement <2 x i32> %411, i64 1, !dbg !48 + %420 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %419, i32 1, i32 31), !dbg !48 + %421 = extractelement <2 x i32> %411, i64 0, !dbg !48 + %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 1, i32 31), !dbg !48 + %423 = insertelement <2 x i32> poison, i32 %422, i64 0, !dbg !51 + %424 = insertelement <2 x i32> %423, i32 %420, i64 1, !dbg !51 + %425 = add <2 x i32> %411, %424, !dbg !51 + %426 = mul nuw nsw <2 x i32> %398, %43, !dbg !30 + %427 = mul nuw nsw <2 x i32> %399, %43, !dbg !30 + %428 = extractelement <2 x i32> %426, i64 1, !dbg !48 + %429 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %428, i32 1, i32 31), !dbg !48 + %430 = extractelement <2 x i32> %426, i64 0, !dbg !48 + %431 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %430, i32 1, i32 31), !dbg !48 + %432 = insertelement <2 x i32> poison, i32 %431, i64 0, !dbg !51 + %433 = insertelement <2 x i32> %432, i32 %429, i64 1, !dbg !51 + %434 = add <2 x i32> %426, %433, !dbg !51 + %435 = extractelement <2 x i32> %427, i64 1, !dbg !48 + %436 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %435, i32 1, i32 31), !dbg !48 + %437 = extractelement <2 x i32> %427, i64 0, !dbg !48 + %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 1, i32 31), !dbg !48 + %439 = insertelement <2 x i32> poison, i32 %438, i64 0, !dbg !51 + %440 = insertelement <2 x i32> %439, i32 %436, i64 1, !dbg !51 + %441 = add <2 x i32> %427, %440, !dbg !51 + %442 = mul nuw nsw <2 x i32> %406, %41, !dbg !32 + %443 = extractelement <2 x i32> %442, i64 1, !dbg !48 + %444 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %443, i32 1, i32 31), !dbg !48 + %445 = mul nuw nsw <2 x i32> %409, %41, !dbg !32 + %446 = extractelement <2 x i32> %442, i64 0, !dbg !48 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 1, i32 31), !dbg !48 + %448 = insertelement <2 x i32> poison, i32 %447, i64 0, !dbg !51 + %449 = insertelement <2 x i32> %448, i32 %444, i64 1, !dbg !51 + %450 = add <2 x i32> %442, %449, !dbg !51 + %451 = mul nuw nsw <2 x i32> %409, %44, !dbg !31 + %452 = extractelement <2 x i32> %445, i64 1, !dbg !48 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 1, i32 31), !dbg !48 + %454 = mul nuw nsw <2 x i32> %409, %45, !dbg !32 + %455 = extractelement <2 x i32> %445, i64 0, !dbg !48 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 1, i32 31), !dbg !48 + %457 = insertelement <2 x i32> poison, i32 %456, i64 0, !dbg !51 + %458 = insertelement <2 x i32> %457, i32 %453, i64 1, !dbg !51 + %459 = add <2 x i32> %445, %458, !dbg !51 + %460 = mul nuw nsw <2 x i32> %406, %43, !dbg !31 + %461 = mul nuw nsw <2 x i32> %409, %43, !dbg !31 + %462 = extractelement <2 x i32> %460, i64 1, !dbg !48 + %463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 1, i32 31), !dbg !48 + %464 = extractelement <2 x i32> %460, i64 0, !dbg !48 + %465 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %464, i32 1, i32 31), !dbg !48 + %466 = insertelement <2 x i32> poison, i32 %465, i64 0, !dbg !51 + %467 = insertelement <2 x i32> %466, i32 %463, i64 1, !dbg !51 + %468 = add <2 x i32> %460, %467, !dbg !51 + %469 = extractelement <2 x i32> %461, i64 1, !dbg !48 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !48 + %471 = insertelement <2 x i32> %458, i32 %470, i64 1, !dbg !51 + %472 = add <2 x i32> %454, %471, !dbg !51 + %473 = extractelement <2 x i32> %461, i64 0, !dbg !48 + %474 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %473, i32 1, i32 31), !dbg !48 + %475 = insertelement <2 x i32> %471, i32 %474, i64 0, !dbg !51 + %476 = add <2 x i32> %475, %461, !dbg !51 + %477 = insertelement <2 x i32> %475, i32 %453, i64 1, !dbg !51 + %478 = add <2 x i32> %477, %451, !dbg !51 + %479 = icmp slt <2 x i32> %418, %434, !dbg !40 + %480 = icmp slt <2 x i32> %425, %441, !dbg !40 + %481 = icmp eq <2 x i32> %418, %434, !dbg !52 + %482 = icmp eq <2 x i32> %425, %441, !dbg !52 + %483 = icmp sgt <2 x i32> %450, %468, !dbg !53 + %484 = icmp sgt <2 x i32> %459, %476, !dbg !53 + %485 = and <2 x i1> %481, %483, !dbg !54 + %486 = and <2 x i1> %482, %484, !dbg !54 + %487 = or <2 x i1> %479, %485, !dbg !55 + %488 = or <2 x i1> %480, %486, !dbg !55 + %489 = xor <2 x i32> %418, %434, !dbg !43 + %490 = xor <2 x i32> %425, %441, !dbg !43 + %491 = select <2 x i1> %487, <2 x i32> %489, <2 x i32> zeroinitializer, !dbg !44 + %492 = select <2 x i1> %488, <2 x i32> %490, <2 x i32> zeroinitializer, !dbg !44 + %493 = xor <2 x i32> %491, %398, !dbg !45 + %494 = xor <2 x i32> %492, %399, !dbg !45 + %495 = xor <2 x i32> %450, %468, !dbg !38 + %496 = xor <2 x i32> %478, %472, !dbg !38 + %497 = select <2 x i1> %487, <2 x i32> %495, <2 x i32> zeroinitializer, !dbg !46 + %498 = select <2 x i1> %488, <2 x i32> %496, <2 x i32> zeroinitializer, !dbg !46 + %499 = xor <2 x i32> %497, %406, !dbg !47 + %500 = xor <2 x i32> %498, %409, !dbg !47 + %501 = icmp slt <2 x i32> %493, %494, !dbg !40 + %502 = icmp eq <2 x i32> %493, %494, !dbg !52 + %503 = icmp sgt <2 x i32> %499, %500, !dbg !53 + %504 = and <2 x i1> %502, %503, !dbg !54 + %505 = or <2 x i1> %501, %504, !dbg !55 + %506 = xor <2 x i32> %493, %494, !dbg !43 + %507 = select <2 x i1> %505, <2 x i32> %506, <2 x i32> zeroinitializer, !dbg !44 + %508 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %509 = shufflevector <2 x i32> %493, <2 x i32> %494, <2 x i32> , !dbg !45 + %510 = xor <2 x i32> %508, %509, !dbg !45 + %511 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> , !dbg !45 + %512 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %513 = xor <2 x i32> %511, %512, !dbg !45 + %514 = shufflevector <2 x i32> %507, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !45 + %515 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %516 = xor <2 x i32> %514, %515, !dbg !45 + %517 = shufflevector <2 x i32> %494, <2 x i32> %493, <2 x i32> , !dbg !45 + %518 = xor <2 x i32> %507, %517, !dbg !45 + %519 = xor <2 x i32> %499, %500, !dbg !38 + %520 = select <2 x i1> %505, <2 x i32> %519, <2 x i32> zeroinitializer, !dbg !46 + %521 = xor <2 x i32> %520, %499, !dbg !47 + %522 = xor <2 x i32> %520, %500, !dbg !47 + %523 = icmp slt <2 x i32> %513, %516, !dbg !40 + %524 = icmp eq <2 x i32> %518, %510, !dbg !52 + %525 = shufflevector <2 x i32> %521, <2 x i32> %522, <2 x i32> , !dbg !53 + %526 = shufflevector <2 x i32> %522, <2 x i32> %521, <2 x i32> , !dbg !53 + %527 = icmp sgt <2 x i32> %525, %526, !dbg !53 + %528 = and <2 x i1> %524, %527, !dbg !54 + %529 = or <2 x i1> %523, %528, !dbg !55 + %530 = xor <2 x i32> %526, %525, !dbg !38 + %531 = select <2 x i1> %529, <2 x i32> %530, <2 x i32> zeroinitializer, !dbg !46 + %532 = shufflevector <2 x i32> %531, <2 x i32> poison, <4 x i32> , !dbg !46 + %533 = shufflevector <2 x i32> %521, <2 x i32> %522, <4 x i32> , !dbg !47 + %534 = xor <4 x i32> %532, %533, !dbg !47 + %535 = select <4 x i1> %47, <4 x i1> %75, <4 x i1> zeroinitializer, !dbg !33 + %536 = bitcast <4 x i1> %535 to i4, !dbg !56 + %537 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %536), !dbg !56 + %538 = zext nneg i4 %537 to i64, !dbg !56 + %539 = zext nneg i4 %537 to i32, !dbg !58 + %540 = icmp eq <4 x i64> %73, splat (i64 16384), !dbg !59 + %541 = extractelement <4 x i1> %540, i64 0, !dbg !60 + %542 = zext i1 %541 to i32, !dbg !41 + %543 = extractelement <4 x i1> %540, i64 1, !dbg !60 + %544 = zext i1 %543 to i32, !dbg !41 + %545 = extractelement <4 x i1> %540, i64 2, !dbg !61 + %546 = zext i1 %545 to i32, !dbg !41 + %547 = extractelement <4 x i1> %540, i64 3, !dbg !61 + %548 = zext i1 %547 to i32, !dbg !41 + %549 = xor i1 %541, true, !dbg !60 + %550 = and i1 %543, %549, !dbg !60 + %.not13 = xor i1 %547, true, !dbg !61 + %551 = or i1 %545, %.not13, !dbg !61 + %552 = xor i32 %542, %544, !dbg !62 + %553 = xor i32 %546, %548, !dbg !62 + %554 = select i1 %550, i32 %552, i32 0, !dbg !63 + %555 = select i1 %551, i32 %553, i32 0, !dbg !63 + %556 = xor i32 %554, %542, !dbg !64 + %557 = xor i32 %554, %544, !dbg !64 + %558 = xor i32 %555, %546, !dbg !64 + %559 = xor i32 %555, %548, !dbg !64 + %560 = select i1 %550, i32 %68, i32 0, !dbg !65 + %561 = select i1 %551, i32 %69, i32 0, !dbg !65 + %562 = xor i32 %560, %52, !dbg !66 + %563 = xor i32 %560, %53, !dbg !66 + %564 = xor i32 %561, %54, !dbg !66 + %565 = xor i32 %561, %55, !dbg !66 + %566 = insertelement <2 x i32> poison, i32 %556, i64 0, !dbg !23 + %567 = insertelement <2 x i32> %566, i32 %557, i64 1, !dbg !23 + %568 = insertelement <2 x i32> poison, i32 %558, i64 0, !dbg !23 + %569 = insertelement <2 x i32> %568, i32 %559, i64 1, !dbg !23 + %570 = icmp samesign uge <2 x i32> %567, %569, !dbg !23 + %571 = insertelement <2 x i32> %569, i32 %557, i64 1, !dbg !23 + %572 = insertelement <2 x i32> %567, i32 %559, i64 1, !dbg !23 + %573 = icmp ne <2 x i32> %571, %572, !dbg !23 + %574 = insertelement <2 x i32> poison, i32 %562, i64 0, !dbg !23 + %575 = insertelement <2 x i32> %574, i32 %563, i64 1, !dbg !23 + %576 = insertelement <2 x i32> poison, i32 %564, i64 0, !dbg !23 + %577 = insertelement <2 x i32> %576, i32 %565, i64 1, !dbg !23 + %578 = icmp ule <2 x i32> %575, %577, !dbg !23 + %579 = or <2 x i1> %578, %573, !dbg !23 + %580 = and <2 x i1> %570, %579, !dbg !23 + %581 = xor <2 x i1> %580, %39, !dbg !23 + %582 = xor <2 x i32> %572, %571, !dbg !62 + %583 = select <2 x i1> %581, <2 x i32> zeroinitializer, <2 x i32> %582, !dbg !63 + %584 = extractelement <2 x i32> %583, i64 0, !dbg !64 + %585 = xor i32 %584, %556, !dbg !64 + %586 = extractelement <2 x i32> %583, i64 1, !dbg !64 + %587 = xor i32 %586, %557, !dbg !64 + %588 = xor <2 x i32> %583, %569, !dbg !64 + %589 = xor i32 %564, %562, !dbg !67 + %590 = xor i32 %565, %563, !dbg !67 + %591 = extractelement <2 x i1> %581, i64 0, !dbg !65 + %592 = select i1 %591, i32 0, i32 %589, !dbg !65 + %593 = extractelement <2 x i1> %581, i64 1, !dbg !65 + %594 = select i1 %593, i32 0, i32 %590, !dbg !65 + %595 = xor i32 %592, %562, !dbg !66 + %596 = xor i32 %594, %563, !dbg !66 + %597 = xor i32 %592, %564, !dbg !66 + %598 = xor i32 %594, %565, !dbg !66 + %599 = icmp samesign uge i32 %585, %587, !dbg !23 + %600 = icmp ne i32 %585, %587, !dbg !23 + %601 = icmp samesign ule i32 %595, %596, !dbg !23 + %602 = or i1 %600, %601, !dbg !23 + %603 = and i1 %599, %602, !dbg !23 + %.not16 = xor i1 %603, %26, !dbg !23 + %604 = extractelement <2 x i32> %588, i64 0, !dbg !23 + %605 = extractelement <2 x i32> %588, i64 1, !dbg !23 + %606 = icmp samesign uge i32 %604, %605, !dbg !23 + %607 = icmp ne i32 %604, %605, !dbg !23 + %608 = icmp samesign ule i32 %597, %598, !dbg !23 + %609 = or i1 %607, %608, !dbg !23 + %610 = and i1 %606, %609, !dbg !23 + %.not17 = xor i1 %610, %26, !dbg !23 + %611 = xor i32 %585, %587, !dbg !62 + %612 = xor i32 %604, %605, !dbg !62 + %613 = select i1 %.not16, i32 0, i32 %611, !dbg !63 + %614 = select i1 %.not17, i32 0, i32 %612, !dbg !63 + %615 = xor i32 %613, %585, !dbg !64 + %616 = xor i32 %613, %587, !dbg !64 + %617 = xor i32 %614, %604, !dbg !64 + %618 = xor i32 %614, %605, !dbg !64 + %619 = xor i32 %595, %596, !dbg !67 + %620 = xor i32 %597, %598, !dbg !67 + %621 = select i1 %.not16, i32 0, i32 %619, !dbg !65 + %622 = select i1 %.not17, i32 0, i32 %620, !dbg !65 + %623 = mul nuw nsw i32 %615, %24, !dbg !27 + %624 = mul nuw nsw i32 %616, %24, !dbg !27 + %625 = mul nuw nsw i32 %617, %24, !dbg !27 + %626 = mul nuw nsw i32 %618, %24, !dbg !27 + %627 = mul nuw nsw i32 %615, %22, !dbg !28 + %628 = mul nuw nsw i32 %616, %22, !dbg !28 + %629 = mul nuw nsw i32 %617, %22, !dbg !28 + %630 = mul nuw nsw i32 %618, %22, !dbg !28 + %631 = insertelement <4 x i32> poison, i32 %622, i64 0, !dbg !66 + %632 = insertelement <4 x i32> %631, i32 %621, i64 1, !dbg !66 + %633 = shufflevector <4 x i32> %632, <4 x i32> poison, <4 x i32> , !dbg !66 + %634 = insertelement <4 x i32> poison, i32 %595, i64 0, !dbg !66 + %635 = insertelement <4 x i32> %634, i32 %596, i64 1, !dbg !66 + %636 = insertelement <4 x i32> %635, i32 %597, i64 2, !dbg !66 + %637 = insertelement <4 x i32> %636, i32 %598, i64 3, !dbg !66 + %638 = xor <4 x i32> %633, %637, !dbg !66 + %639 = extractelement <4 x i32> %638, i64 0, !dbg !26 + %640 = mul nuw nsw i32 %639, %24, !dbg !25 + %641 = extractelement <4 x i32> %638, i64 1, !dbg !26 + %642 = mul nuw nsw i32 %641, %24, !dbg !25 + %643 = extractelement <4 x i32> %638, i64 2, !dbg !26 + %644 = mul nuw nsw i32 %643, %24, !dbg !25 + %645 = extractelement <4 x i32> %638, i64 3, !dbg !26 + %646 = mul nuw nsw i32 %645, %24, !dbg !25 + %647 = mul nuw nsw i32 %639, %22, !dbg !26 + %648 = mul nuw nsw i32 %641, %22, !dbg !26 + %649 = mul nuw nsw i32 %643, %22, !dbg !26 + %650 = mul nuw nsw i32 %645, %22, !dbg !26 + %651 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !68 + %652 = add i32 %651, %623, !dbg !69 + %653 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %624, i32 1, i32 31), !dbg !68 + %654 = add i32 %653, %624, !dbg !69 + %655 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %625, i32 1, i32 31), !dbg !68 + %656 = add i32 %655, %625, !dbg !69 + %657 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %626, i32 1, i32 31), !dbg !68 + %658 = add i32 %657, %626, !dbg !69 + %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %627, i32 1, i32 31), !dbg !68 + %660 = add i32 %659, %627, !dbg !69 + %661 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %628, i32 1, i32 31), !dbg !68 + %662 = add i32 %661, %628, !dbg !69 + %663 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %629, i32 1, i32 31), !dbg !68 + %664 = add i32 %663, %629, !dbg !69 + %665 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %630, i32 1, i32 31), !dbg !68 + %666 = add i32 %665, %630, !dbg !69 + %667 = icmp sge i32 %654, %662, !dbg !23 + %668 = icmp ne i32 %654, %662, !dbg !23 + %669 = insertelement <2 x i32> poison, i32 %652, i64 0, !dbg !23 + %670 = insertelement <2 x i32> %669, i32 %656, i64 1, !dbg !23 + %671 = insertelement <2 x i32> poison, i32 %660, i64 0, !dbg !23 + %672 = insertelement <2 x i32> %671, i32 %664, i64 1, !dbg !23 + %673 = icmp sge <2 x i32> %670, %672, !dbg !23 + %674 = icmp ne <2 x i32> %670, %672, !dbg !23 + %675 = icmp sge i32 %658, %666, !dbg !23 + %676 = icmp ne i32 %658, %666, !dbg !23 + %677 = xor i32 %660, %652, !dbg !62 + %678 = xor i32 %662, %654, !dbg !62 + %679 = xor i32 %664, %656, !dbg !62 + %680 = xor i32 %666, %658, !dbg !62 + %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 1, i32 31), !dbg !68 + %682 = add i32 %681, %640, !dbg !69 + %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %642, i32 1, i32 31), !dbg !68 + %684 = add i32 %683, %642, !dbg !69 + %685 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %644, i32 1, i32 31), !dbg !68 + %686 = add i32 %685, %644, !dbg !69 + %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %646, i32 1, i32 31), !dbg !68 + %688 = add i32 %687, %646, !dbg !69 + %689 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %647, i32 1, i32 31), !dbg !68 + %690 = add i32 %689, %647, !dbg !69 + %691 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %648, i32 1, i32 31), !dbg !68 + %692 = add i32 %691, %648, !dbg !69 + %693 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %649, i32 1, i32 31), !dbg !68 + %694 = add i32 %693, %649, !dbg !69 + %695 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %650, i32 1, i32 31), !dbg !68 + %696 = add i32 %695, %650, !dbg !69 + %697 = insertelement <2 x i32> poison, i32 %682, i64 0, !dbg !23 + %698 = insertelement <2 x i32> %697, i32 %686, i64 1, !dbg !23 + %699 = insertelement <2 x i32> poison, i32 %690, i64 0, !dbg !23 + %700 = insertelement <2 x i32> %699, i32 %694, i64 1, !dbg !23 + %701 = icmp sle <2 x i32> %698, %700, !dbg !23 + %702 = or <2 x i1> %674, %701, !dbg !23 + %703 = and <2 x i1> %673, %702, !dbg !23 + %704 = xor <2 x i1> %703, %29, !dbg !23 + %705 = insertelement <2 x i32> poison, i32 %677, i64 0, !dbg !63 + %706 = insertelement <2 x i32> %705, i32 %679, i64 1, !dbg !63 + %707 = select <2 x i1> %704, <2 x i32> zeroinitializer, <2 x i32> %706, !dbg !63 + %708 = insertelement <2 x i32> poison, i32 %615, i64 0, !dbg !64 + %709 = insertelement <2 x i32> %708, i32 %617, i64 1, !dbg !64 + %710 = xor <2 x i32> %707, %709, !dbg !64 + %711 = insertelement <2 x i32> poison, i32 %688, i64 0, !dbg !23 + %712 = insertelement <2 x i32> %711, i32 %684, i64 1, !dbg !23 + %713 = insertelement <2 x i32> poison, i32 %696, i64 0, !dbg !23 + %714 = insertelement <2 x i32> %713, i32 %692, i64 1, !dbg !23 + %715 = icmp sle <2 x i32> %712, %714, !dbg !23 + %716 = insertelement <2 x i1> poison, i1 %676, i64 0, !dbg !23 + %717 = insertelement <2 x i1> %716, i1 %668, i64 1, !dbg !23 + %718 = or <2 x i1> %717, %715, !dbg !23 + %719 = insertelement <2 x i1> poison, i1 %675, i64 0, !dbg !23 + %720 = insertelement <2 x i1> %719, i1 %667, i64 1, !dbg !23 + %721 = and <2 x i1> %720, %718, !dbg !23 + %722 = xor <2 x i1> %721, %29, !dbg !23 + %723 = insertelement <2 x i32> poison, i32 %680, i64 0, !dbg !63 + %724 = insertelement <2 x i32> %723, i32 %678, i64 1, !dbg !63 + %725 = select <2 x i1> %722, <2 x i32> zeroinitializer, <2 x i32> %724, !dbg !63 + %726 = insertelement <2 x i32> poison, i32 %618, i64 0, !dbg !64 + %727 = insertelement <2 x i32> %726, i32 %616, i64 1, !dbg !64 + %728 = xor <2 x i32> %725, %727, !dbg !64 + %729 = xor i32 %690, %682, !dbg !67 + %730 = xor i32 %692, %684, !dbg !67 + %731 = xor i32 %694, %686, !dbg !67 + %732 = xor i32 %696, %688, !dbg !67 + %733 = shufflevector <2 x i1> %704, <2 x i1> %722, <4 x i32> , !dbg !65 + %734 = insertelement <4 x i32> poison, i32 %729, i64 0, !dbg !65 + %735 = insertelement <4 x i32> %734, i32 %730, i64 1, !dbg !65 + %736 = insertelement <4 x i32> %735, i32 %731, i64 2, !dbg !65 + %737 = insertelement <4 x i32> %736, i32 %732, i64 3, !dbg !65 + %738 = select <4 x i1> %733, <4 x i32> zeroinitializer, <4 x i32> %737, !dbg !65 + %739 = xor <4 x i32> %738, %638, !dbg !66 + %740 = extractelement <2 x i32> %710, i64 0, !dbg !23 + %741 = extractelement <2 x i32> %710, i64 1, !dbg !23 + %742 = icmp sge i32 %740, %741, !dbg !23 + %743 = icmp ne i32 %740, %741, !dbg !23 + %shift = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %744 = icmp sle <4 x i32> %739, %shift, !dbg !23 + %745 = extractelement <4 x i1> %744, i64 0, !dbg !23 + %746 = or i1 %743, %745, !dbg !23 + %747 = and i1 %742, %746, !dbg !23 + %748 = extractelement <2 x i32> %728, i64 0, !dbg !23 + %749 = extractelement <2 x i32> %728, i64 1, !dbg !23 + %750 = icmp sge i32 %749, %748, !dbg !23 + %751 = icmp ne i32 %749, %748, !dbg !23 + %shift53 = shufflevector <4 x i32> %739, <4 x i32> poison, <4 x i32> , !dbg !23 + %752 = icmp sle <4 x i32> %739, %shift53, !dbg !23 + %753 = extractelement <4 x i1> %752, i64 1, !dbg !23 + %754 = or i1 %751, %753, !dbg !23 + %755 = and i1 %750, %754, !dbg !23 + %756 = insertelement <2 x i1> poison, i1 %755, i64 0, !dbg !23 + %757 = insertelement <2 x i1> %756, i1 %747, i64 1, !dbg !23 + %758 = xor <2 x i1> %757, %29, !dbg !23 + %759 = xor i32 %741, %740, !dbg !62 + %760 = xor i32 %748, %749, !dbg !62 + %761 = extractelement <2 x i1> %758, i64 1, !dbg !63 + %762 = select i1 %761, i32 0, i32 %759, !dbg !63 + %763 = extractelement <2 x i1> %758, i64 0, !dbg !63 + %764 = select i1 %763, i32 0, i32 %760, !dbg !63 + %765 = xor i32 %762, %740, !dbg !64 + %766 = xor i32 %764, %749, !dbg !64 + %767 = xor i32 %762, %741, !dbg !64 + %768 = xor i32 %764, %748, !dbg !64 + %769 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %770 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !67 + %771 = xor <2 x i32> %769, %770, !dbg !67 + %772 = select <2 x i1> %758, <2 x i32> zeroinitializer, <2 x i32> %771, !dbg !65 + %773 = shufflevector <2 x i32> %772, <2 x i32> poison, <4 x i32> , !dbg !65 + %774 = shufflevector <2 x i32> %772, <2 x i32> poison, <2 x i32> , !dbg !66 + %775 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %776 = xor <2 x i32> %774, %775, !dbg !66 + %777 = shufflevector <4 x i32> %739, <4 x i32> poison, <2 x i32> , !dbg !66 + %778 = xor <2 x i32> %772, %777, !dbg !66 + %779 = icmp sge i32 %765, %766, !dbg !23 + %780 = icmp ne i32 %765, %766, !dbg !23 + %781 = extractelement <2 x i32> %778, i64 1, !dbg !23 + %782 = extractelement <2 x i32> %776, i64 1, !dbg !23 + %783 = icmp sle i32 %781, %782, !dbg !23 + %784 = or i1 %780, %783, !dbg !23 + %785 = and i1 %779, %784, !dbg !23 + %.not24 = xor i1 %785, %27, !dbg !23 + %786 = icmp sge i32 %767, %768, !dbg !23 + %787 = icmp ne i32 %767, %768, !dbg !23 + %788 = extractelement <2 x i32> %778, i64 0, !dbg !23 + %789 = extractelement <2 x i32> %776, i64 0, !dbg !23 + %790 = icmp sle i32 %789, %788, !dbg !23 + %791 = or i1 %787, %790, !dbg !23 + %792 = and i1 %786, %791, !dbg !23 + %.not25 = xor i1 %792, %27, !dbg !23 + %793 = xor i32 %766, %765, !dbg !62 + %794 = xor i32 %768, %767, !dbg !62 + %795 = select i1 %.not24, i32 0, i32 %793, !dbg !63 + %796 = select i1 %.not25, i32 0, i32 %794, !dbg !63 + %797 = insertelement <2 x i32> poison, i32 %796, i64 0, !dbg !64 + %798 = insertelement <2 x i32> %797, i32 %795, i64 1, !dbg !64 + %799 = insertelement <2 x i32> poison, i32 %767, i64 0, !dbg !64 + %800 = insertelement <2 x i32> %799, i32 %765, i64 1, !dbg !64 + %801 = xor <2 x i32> %798, %800, !dbg !64 + %802 = insertelement <2 x i32> poison, i32 %768, i64 0, !dbg !64 + %803 = insertelement <2 x i32> %802, i32 %766, i64 1, !dbg !64 + %804 = xor <2 x i32> %798, %803, !dbg !64 + %805 = xor i32 %782, %781, !dbg !67 + %806 = xor i32 %788, %789, !dbg !67 + %807 = insertelement <2 x i1> poison, i1 %.not25, i64 0, !dbg !65 + %808 = insertelement <2 x i1> %807, i1 %.not24, i64 1, !dbg !65 + %809 = insertelement <2 x i32> poison, i32 %806, i64 0, !dbg !65 + %810 = insertelement <2 x i32> %809, i32 %805, i64 1, !dbg !65 + %811 = select <2 x i1> %808, <2 x i32> zeroinitializer, <2 x i32> %810, !dbg !65 + %812 = shufflevector <2 x i32> %811, <2 x i32> poison, <4 x i32> , !dbg !66 + %813 = xor <4 x i32> %773, %812, !dbg !66 + %814 = xor <2 x i32> %811, %776, !dbg !66 + %815 = xor <2 x i32> %811, %778, !dbg !66 + %816 = extractelement <2 x i32> %801, i64 1, !dbg !28 + %817 = mul nuw nsw i32 %816, %25, !dbg !27 + %818 = extractelement <2 x i32> %804, i64 1, !dbg !28 + %819 = mul nuw nsw i32 %818, %25, !dbg !27 + %820 = extractelement <2 x i32> %801, i64 0, !dbg !28 + %821 = mul nuw nsw i32 %820, %25, !dbg !27 + %822 = extractelement <2 x i32> %804, i64 0, !dbg !28 + %823 = mul nuw nsw i32 %822, %25, !dbg !27 + %824 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %817, i32 2, i32 31), !dbg !68 + %825 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %819, i32 2, i32 31), !dbg !68 + %826 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %821, i32 2, i32 31), !dbg !68 + %827 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %823, i32 2, i32 31), !dbg !68 + %828 = insertelement <4 x i32> poison, i32 %817, i64 0, !dbg !69 + %829 = insertelement <4 x i32> %828, i32 %819, i64 1, !dbg !69 + %830 = insertelement <4 x i32> %829, i32 %821, i64 2, !dbg !69 + %831 = insertelement <4 x i32> %830, i32 %823, i64 3, !dbg !69 + %832 = insertelement <4 x i32> poison, i32 %824, i64 0, !dbg !69 + %833 = insertelement <4 x i32> %832, i32 %825, i64 1, !dbg !69 + %834 = insertelement <4 x i32> %833, i32 %826, i64 2, !dbg !69 + %835 = insertelement <4 x i32> %834, i32 %827, i64 3, !dbg !69 + %836 = add <4 x i32> %831, %835, !dbg !69 + %837 = mul nuw nsw i32 %816, %.lobit, !dbg !28 + %838 = mul nuw nsw i32 %818, %.lobit, !dbg !28 + %839 = mul nuw nsw i32 %820, %.lobit, !dbg !28 + %840 = mul nuw nsw i32 %822, %.lobit, !dbg !28 + %841 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 2, i32 31), !dbg !68 + %842 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %838, i32 2, i32 31), !dbg !68 + %843 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %839, i32 2, i32 31), !dbg !68 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %840, i32 2, i32 31), !dbg !68 + %845 = insertelement <4 x i32> poison, i32 %837, i64 0, !dbg !69 + %846 = insertelement <4 x i32> %845, i32 %838, i64 1, !dbg !69 + %847 = insertelement <4 x i32> %846, i32 %839, i64 2, !dbg !69 + %848 = insertelement <4 x i32> %847, i32 %840, i64 3, !dbg !69 + %849 = insertelement <4 x i32> poison, i32 %841, i64 0, !dbg !69 + %850 = insertelement <4 x i32> %849, i32 %842, i64 1, !dbg !69 + %851 = insertelement <4 x i32> %850, i32 %843, i64 2, !dbg !69 + %852 = insertelement <4 x i32> %851, i32 %844, i64 3, !dbg !69 + %853 = add <4 x i32> %848, %852, !dbg !69 + %854 = shufflevector <2 x i32> %815, <2 x i32> %814, <4 x i32> , !dbg !25 + %855 = mul nuw nsw <4 x i32> %854, %31, !dbg !25 + %856 = extractelement <4 x i32> %855, i64 0, !dbg !68 + %857 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %856, i32 2, i32 31), !dbg !68 + %858 = extractelement <4 x i32> %855, i64 1, !dbg !68 + %859 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %858, i32 2, i32 31), !dbg !68 + %860 = extractelement <4 x i32> %855, i64 2, !dbg !68 + %861 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %860, i32 2, i32 31), !dbg !68 + %862 = extractelement <4 x i32> %855, i64 3, !dbg !68 + %863 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %862, i32 2, i32 31), !dbg !68 + %864 = insertelement <4 x i32> poison, i32 %857, i64 0, !dbg !69 + %865 = insertelement <4 x i32> %864, i32 %859, i64 1, !dbg !69 + %866 = insertelement <4 x i32> %865, i32 %861, i64 2, !dbg !69 + %867 = insertelement <4 x i32> %866, i32 %863, i64 3, !dbg !69 + %868 = add <4 x i32> %855, %867, !dbg !69 + %869 = mul nuw nsw <4 x i32> %854, %33, !dbg !26 + %870 = extractelement <4 x i32> %869, i64 0, !dbg !68 + %871 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %870, i32 2, i32 31), !dbg !68 + %872 = extractelement <4 x i32> %869, i64 1, !dbg !68 + %873 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %872, i32 2, i32 31), !dbg !68 + %874 = extractelement <4 x i32> %869, i64 2, !dbg !68 + %875 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %874, i32 2, i32 31), !dbg !68 + %876 = extractelement <4 x i32> %869, i64 3, !dbg !68 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 2, i32 31), !dbg !68 + %878 = insertelement <4 x i32> poison, i32 %871, i64 0, !dbg !69 + %879 = insertelement <4 x i32> %878, i32 %873, i64 1, !dbg !69 + %880 = insertelement <4 x i32> %879, i32 %875, i64 2, !dbg !69 + %881 = insertelement <4 x i32> %880, i32 %877, i64 3, !dbg !69 + %882 = add <4 x i32> %869, %881, !dbg !69 + %883 = icmp slt <4 x i32> %836, %853, !dbg !60 + %884 = icmp eq <4 x i32> %836, %853, !dbg !70 + %885 = icmp sgt <4 x i32> %868, %882, !dbg !71 + %886 = and <4 x i1> %884, %885, !dbg !72 + %887 = or <4 x i1> %883, %886, !dbg !73 + %888 = or <4 x i1> %883, %886, !dbg !73 + %889 = shufflevector <4 x i1> %888, <4 x i1> poison, <2 x i32> , !dbg !73 + %890 = or <4 x i1> %883, %886, !dbg !73 + %891 = shufflevector <4 x i1> %890, <4 x i1> poison, <2 x i32> , !dbg !73 + %foldExtExtBinop = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop55 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop57 = xor <4 x i32> %836, %853, !dbg !62 + %foldExtExtBinop59 = xor <4 x i32> %836, %853, !dbg !62 + %892 = shufflevector <2 x i1> %889, <2 x i1> %891, <2 x i32> , !dbg !63 + %893 = shufflevector <4 x i32> %foldExtExtBinop57, <4 x i32> %foldExtExtBinop, <2 x i32> , !dbg !63 + %894 = select <2 x i1> %892, <2 x i32> %893, <2 x i32> zeroinitializer, !dbg !63 + %895 = shufflevector <2 x i1> %891, <2 x i1> %889, <2 x i32> , !dbg !63 + %896 = shufflevector <4 x i32> %foldExtExtBinop59, <4 x i32> %foldExtExtBinop55, <2 x i32> , !dbg !63 + %897 = select <2 x i1> %895, <2 x i32> %896, <2 x i32> zeroinitializer, !dbg !63 + %898 = shufflevector <2 x i32> %894, <2 x i32> %897, <2 x i32> , !dbg !64 + %899 = shufflevector <2 x i32> %801, <2 x i32> %804, <2 x i32> , !dbg !64 + %900 = xor <2 x i32> %898, %899, !dbg !64 + %901 = xor <2 x i32> %894, %801, !dbg !64 + %902 = xor <2 x i32> %897, %804, !dbg !64 + %903 = shufflevector <2 x i32> %897, <2 x i32> %894, <2 x i32> , !dbg !64 + %904 = shufflevector <2 x i32> %804, <2 x i32> %801, <2 x i32> , !dbg !64 + %905 = xor <2 x i32> %903, %904, !dbg !64 + %906 = xor <4 x i32> %868, %882, !dbg !67 + %907 = xor <4 x i32> %868, %882, !dbg !67 + %908 = shufflevector <4 x i32> %907, <4 x i32> poison, <2 x i32> , !dbg !67 + %909 = xor <4 x i32> %868, %882, !dbg !67 + %910 = shufflevector <4 x i32> %909, <4 x i32> poison, <2 x i32> , !dbg !67 + %911 = select <4 x i1> %887, <4 x i32> %906, <4 x i32> zeroinitializer, !dbg !65 + %912 = select <2 x i1> %889, <2 x i32> %908, <2 x i32> zeroinitializer, !dbg !65 + %913 = select <2 x i1> %891, <2 x i32> %910, <2 x i32> zeroinitializer, !dbg !65 + %914 = xor <4 x i32> %813, %911, !dbg !66 + %915 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %916 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %917 = xor <2 x i32> %915, %916, !dbg !66 + %918 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %919 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %920 = xor <2 x i32> %918, %919, !dbg !66 + %921 = shufflevector <2 x i32> %913, <2 x i32> %912, <2 x i32> , !dbg !66 + %922 = shufflevector <2 x i32> %815, <2 x i32> %814, <2 x i32> , !dbg !66 + %923 = xor <2 x i32> %921, %922, !dbg !66 + %924 = shufflevector <2 x i32> %912, <2 x i32> %913, <2 x i32> , !dbg !66 + %925 = shufflevector <2 x i32> %814, <2 x i32> %815, <2 x i32> , !dbg !66 + %926 = xor <2 x i32> %924, %925, !dbg !66 + %927 = shufflevector <2 x i32> %901, <2 x i32> %902, <4 x i32> , !dbg !27 + %928 = mul nuw nsw <4 x i32> %927, %35, !dbg !27 + %929 = extractelement <4 x i32> %928, i64 0, !dbg !68 + %930 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %929, i32 1, i32 31), !dbg !68 + %931 = extractelement <4 x i32> %928, i64 1, !dbg !68 + %932 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %931, i32 1, i32 31), !dbg !68 + %933 = extractelement <4 x i32> %928, i64 2, !dbg !68 + %934 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %933, i32 1, i32 31), !dbg !68 + %935 = extractelement <4 x i32> %928, i64 3, !dbg !68 + %936 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %935, i32 1, i32 31), !dbg !68 + %937 = insertelement <4 x i32> poison, i32 %930, i64 0, !dbg !69 + %938 = insertelement <4 x i32> %937, i32 %932, i64 1, !dbg !69 + %939 = insertelement <4 x i32> %938, i32 %934, i64 2, !dbg !69 + %940 = insertelement <4 x i32> %939, i32 %936, i64 3, !dbg !69 + %941 = add <4 x i32> %928, %940, !dbg !69 + %942 = mul nuw nsw <4 x i32> %927, %37, !dbg !28 + %943 = extractelement <4 x i32> %942, i64 0, !dbg !68 + %944 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %943, i32 1, i32 31), !dbg !68 + %945 = extractelement <4 x i32> %942, i64 1, !dbg !68 + %946 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %945, i32 1, i32 31), !dbg !68 + %947 = extractelement <4 x i32> %942, i64 2, !dbg !68 + %948 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %947, i32 1, i32 31), !dbg !68 + %949 = extractelement <4 x i32> %942, i64 3, !dbg !68 + %950 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %949, i32 1, i32 31), !dbg !68 + %951 = insertelement <4 x i32> poison, i32 %944, i64 0, !dbg !69 + %952 = insertelement <4 x i32> %951, i32 %946, i64 1, !dbg !69 + %953 = insertelement <4 x i32> %952, i32 %948, i64 2, !dbg !69 + %954 = insertelement <4 x i32> %953, i32 %950, i64 3, !dbg !69 + %955 = add <4 x i32> %942, %954, !dbg !69 + %956 = extractelement <2 x i32> %926, i64 1, !dbg !26 + %957 = mul nuw nsw i32 %956, %24, !dbg !25 + %958 = extractelement <2 x i32> %923, i64 1, !dbg !26 + %959 = mul nuw nsw i32 %958, %24, !dbg !25 + %960 = extractelement <2 x i32> %926, i64 0, !dbg !26 + %961 = mul nuw nsw i32 %960, %24, !dbg !25 + %962 = extractelement <2 x i32> %923, i64 0, !dbg !26 + %963 = mul nuw nsw i32 %962, %24, !dbg !25 + %964 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %957, i32 1, i32 31), !dbg !68 + %965 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %959, i32 1, i32 31), !dbg !68 + %966 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %961, i32 1, i32 31), !dbg !68 + %967 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %963, i32 1, i32 31), !dbg !68 + %968 = insertelement <2 x i32> poison, i32 %961, i64 0, !dbg !69 + %969 = insertelement <2 x i32> %968, i32 %959, i64 1, !dbg !69 + %970 = insertelement <2 x i32> poison, i32 %966, i64 0, !dbg !69 + %971 = insertelement <2 x i32> %970, i32 %965, i64 1, !dbg !69 + %972 = add <2 x i32> %969, %971, !dbg !69 + %973 = insertelement <4 x i32> poison, i32 %964, i64 0, !dbg !69 + %974 = insertelement <4 x i32> %973, i32 %965, i64 1, !dbg !69 + %975 = insertelement <4 x i32> %974, i32 %966, i64 2, !dbg !69 + %976 = insertelement <4 x i32> %975, i32 %967, i64 3, !dbg !69 + %977 = insertelement <4 x i32> poison, i32 %957, i64 0, !dbg !69 + %978 = insertelement <4 x i32> %977, i32 %959, i64 1, !dbg !69 + %979 = insertelement <4 x i32> %978, i32 %961, i64 2, !dbg !69 + %980 = insertelement <4 x i32> %979, i32 %963, i64 3, !dbg !69 + %981 = add <4 x i32> %976, %980, !dbg !69 + %982 = insertelement <2 x i32> poison, i32 %963, i64 0, !dbg !69 + %983 = insertelement <2 x i32> %982, i32 %957, i64 1, !dbg !69 + %984 = insertelement <2 x i32> poison, i32 %967, i64 0, !dbg !69 + %985 = insertelement <2 x i32> %984, i32 %964, i64 1, !dbg !69 + %986 = add <2 x i32> %983, %985, !dbg !69 + %987 = mul nuw nsw i32 %956, %22, !dbg !26 + %988 = mul nuw nsw i32 %958, %22, !dbg !26 + %989 = mul nuw nsw i32 %960, %22, !dbg !26 + %990 = mul nuw nsw i32 %962, %22, !dbg !26 + %991 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %987, i32 1, i32 31), !dbg !68 + %992 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %988, i32 1, i32 31), !dbg !68 + %993 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %989, i32 1, i32 31), !dbg !68 + %994 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %990, i32 1, i32 31), !dbg !68 + %995 = insertelement <2 x i32> poison, i32 %993, i64 0, !dbg !69 + %996 = insertelement <2 x i32> %995, i32 %992, i64 1, !dbg !69 + %997 = insertelement <2 x i32> poison, i32 %989, i64 0, !dbg !69 + %998 = insertelement <2 x i32> %997, i32 %988, i64 1, !dbg !69 + %999 = add <2 x i32> %996, %998, !dbg !69 + %1000 = insertelement <4 x i32> poison, i32 %991, i64 0, !dbg !69 + %1001 = insertelement <4 x i32> %1000, i32 %992, i64 1, !dbg !69 + %1002 = insertelement <4 x i32> %1001, i32 %993, i64 2, !dbg !69 + %1003 = insertelement <4 x i32> %1002, i32 %994, i64 3, !dbg !69 + %1004 = insertelement <4 x i32> poison, i32 %987, i64 0, !dbg !69 + %1005 = insertelement <4 x i32> %1004, i32 %988, i64 1, !dbg !69 + %1006 = insertelement <4 x i32> %1005, i32 %989, i64 2, !dbg !69 + %1007 = insertelement <4 x i32> %1006, i32 %990, i64 3, !dbg !69 + %1008 = add <4 x i32> %1003, %1007, !dbg !69 + %1009 = insertelement <2 x i32> poison, i32 %994, i64 0, !dbg !69 + %1010 = insertelement <2 x i32> %1009, i32 %991, i64 1, !dbg !69 + %1011 = insertelement <2 x i32> poison, i32 %990, i64 0, !dbg !69 + %1012 = insertelement <2 x i32> %1011, i32 %987, i64 1, !dbg !69 + %1013 = add <2 x i32> %1010, %1012, !dbg !69 + %1014 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1015 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1016 = shufflevector <4 x i1> %1015, <4 x i1> poison, <2 x i32> , !dbg !60 + %1017 = icmp slt <4 x i32> %941, %955, !dbg !60 + %1018 = shufflevector <4 x i1> %1017, <4 x i1> poison, <2 x i32> , !dbg !60 + %1019 = icmp eq <4 x i32> %941, %955, !dbg !70 + %1020 = icmp sgt <4 x i32> %981, %1008, !dbg !71 + %1021 = and <4 x i1> %1019, %1020, !dbg !72 + %1022 = and <4 x i1> %1019, %1020, !dbg !72 + %1023 = shufflevector <4 x i1> %1022, <4 x i1> poison, <2 x i32> , !dbg !72 + %1024 = and <4 x i1> %1019, %1020, !dbg !72 + %1025 = shufflevector <4 x i1> %1024, <4 x i1> poison, <2 x i32> , !dbg !72 + %1026 = or <4 x i1> %1014, %1021, !dbg !73 + %1027 = or <2 x i1> %1016, %1023, !dbg !73 + %1028 = or <2 x i1> %1018, %1025, !dbg !73 + %1029 = shufflevector <2 x i1> %1018, <2 x i1> %1016, <2 x i32> , !dbg !73 + %1030 = shufflevector <2 x i1> %1025, <2 x i1> %1023, <2 x i32> , !dbg !73 + %1031 = or <2 x i1> %1029, %1030, !dbg !73 + %1032 = shufflevector <2 x i1> %1016, <2 x i1> %1018, <2 x i32> , !dbg !73 + %1033 = shufflevector <2 x i1> %1023, <2 x i1> %1025, <2 x i32> , !dbg !73 + %1034 = or <2 x i1> %1032, %1033, !dbg !73 + %1035 = xor <4 x i32> %941, %955, !dbg !62 + %1036 = shufflevector <4 x i32> %1035, <4 x i32> poison, <2 x i32> , !dbg !62 + %1037 = xor <4 x i32> %941, %955, !dbg !62 + %1038 = shufflevector <4 x i32> %1037, <4 x i32> poison, <2 x i32> , !dbg !62 + %1039 = shufflevector <2 x i1> %1034, <2 x i1> %1031, <2 x i32> , !dbg !63 + %1040 = shufflevector <2 x i32> %1036, <2 x i32> %1038, <2 x i32> , !dbg !63 + %1041 = select <2 x i1> %1039, <2 x i32> %1040, <2 x i32> zeroinitializer, !dbg !63 + %1042 = select <2 x i1> %1034, <2 x i32> %1036, <2 x i32> zeroinitializer, !dbg !63 + %1043 = select <2 x i1> %1031, <2 x i32> %1038, <2 x i32> zeroinitializer, !dbg !63 + %1044 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !63 + %1045 = shufflevector <2 x i32> %1038, <2 x i32> %1036, <2 x i32> , !dbg !63 + %1046 = select <2 x i1> %1044, <2 x i32> %1045, <2 x i32> zeroinitializer, !dbg !63 + %1047 = xor <2 x i32> %1041, %900, !dbg !64 + %1048 = xor <2 x i32> %1042, %901, !dbg !64 + %1049 = xor <2 x i32> %1043, %902, !dbg !64 + %1050 = xor <2 x i32> %1046, %905, !dbg !64 + %1051 = shufflevector <2 x i32> %1013, <2 x i32> %999, <4 x i32> , !dbg !67 + %1052 = shufflevector <2 x i32> %986, <2 x i32> %972, <4 x i32> , !dbg !67 + %1053 = xor <4 x i32> %1051, %1052, !dbg !67 + %1054 = xor <2 x i32> %999, %972, !dbg !67 + %1055 = xor <2 x i32> %1013, %986, !dbg !67 + %1056 = select <4 x i1> %1026, <4 x i32> %1053, <4 x i32> zeroinitializer, !dbg !65 + %1057 = select <2 x i1> %1027, <2 x i32> %1054, <2 x i32> zeroinitializer, !dbg !65 + %1058 = select <2 x i1> %1028, <2 x i32> %1055, <2 x i32> zeroinitializer, !dbg !65 + %1059 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1060 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1061 = select <2 x i1> %1059, <2 x i32> %1060, <2 x i32> zeroinitializer, !dbg !65 + %1062 = shufflevector <2 x i1> %1031, <2 x i1> %1034, <2 x i32> , !dbg !65 + %1063 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1064 = select <2 x i1> %1062, <2 x i32> %1063, <2 x i32> zeroinitializer, !dbg !65 + %1065 = shufflevector <2 x i32> %1055, <2 x i32> %1054, <2 x i32> , !dbg !65 + %1066 = select <2 x i1> %1031, <2 x i32> %1065, <2 x i32> zeroinitializer, !dbg !65 + %1067 = shufflevector <2 x i32> %1054, <2 x i32> %1055, <2 x i32> , !dbg !65 + %1068 = select <2 x i1> %1034, <2 x i32> %1067, <2 x i32> zeroinitializer, !dbg !65 + %1069 = xor <4 x i32> %914, %1056, !dbg !66 + %1070 = xor <2 x i32> %1061, %917, !dbg !66 + %1071 = xor <2 x i32> %1064, %920, !dbg !66 + %1072 = xor <2 x i32> %1066, %923, !dbg !66 + %1073 = xor <2 x i32> %1068, %926, !dbg !66 + %1074 = extractelement <2 x i32> %1048, i64 0, !dbg !60 + %1075 = extractelement <2 x i32> %1048, i64 1, !dbg !60 + %1076 = icmp slt i32 %1075, %1074, !dbg !60 + %1077 = extractelement <2 x i32> %1049, i64 0, !dbg !60 + %1078 = extractelement <2 x i32> %1049, i64 1, !dbg !60 + %1079 = icmp slt i32 %1078, %1077, !dbg !60 + %1080 = icmp eq i32 %1075, %1074, !dbg !70 + %1081 = icmp eq i32 %1078, %1077, !dbg !70 + %shift61 = shufflevector <2 x i32> %1073, <2 x i32> poison, <2 x i32> , !dbg !71 + %1082 = icmp sgt <2 x i32> %shift61, %1073, !dbg !71 + %1083 = extractelement <2 x i1> %1082, i64 0, !dbg !71 + %shift62 = shufflevector <2 x i32> %1072, <2 x i32> poison, <2 x i32> , !dbg !71 + %1084 = icmp sgt <2 x i32> %shift62, %1072, !dbg !71 + %1085 = extractelement <2 x i1> %1084, i64 0, !dbg !71 + %1086 = and i1 %1080, %1083, !dbg !72 + %1087 = and i1 %1081, %1085, !dbg !72 + %1088 = insertelement <2 x i1> poison, i1 %1076, i64 0, !dbg !73 + %1089 = insertelement <2 x i1> %1088, i1 %1079, i64 1, !dbg !73 + %1090 = insertelement <2 x i1> poison, i1 %1086, i64 0, !dbg !73 + %1091 = insertelement <2 x i1> %1090, i1 %1087, i64 1, !dbg !73 + %1092 = or <2 x i1> %1089, %1091, !dbg !73 + %1093 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1094 = shufflevector <2 x i32> %1048, <2 x i32> %1049, <2 x i32> , !dbg !62 + %1095 = xor <2 x i32> %1093, %1094, !dbg !62 + %1096 = select <2 x i1> %1092, <2 x i32> %1095, <2 x i32> zeroinitializer, !dbg !63 + %1097 = xor <2 x i32> %1096, %1047, !dbg !64 + %1098 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !64 + %1099 = xor <2 x i32> %1098, %1048, !dbg !64 + %1100 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1101 = shufflevector <2 x i32> %1096, <2 x i32> poison, <2 x i32> , !dbg !64 + %1102 = xor <2 x i32> %1101, %1049, !dbg !64 + %1103 = xor <2 x i32> %1100, %1050, !dbg !64 + %1104 = xor <2 x i32> %1071, %1070, !dbg !67 + %1105 = shufflevector <2 x i1> %1092, <2 x i1> poison, <2 x i32> , !dbg !65 + %1106 = select <2 x i1> %1105, <2 x i32> %1104, <2 x i32> zeroinitializer, !dbg !65 + %1107 = shufflevector <2 x i32> %1106, <2 x i32> poison, <4 x i32> , !dbg !66 + %1108 = xor <4 x i32> %1069, %1107, !dbg !66 + %1109 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1110 = xor <2 x i32> %1057, %1109, !dbg !66 + %1111 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> zeroinitializer, !dbg !66 + %1112 = xor <2 x i32> %1111, %1072, !dbg !66 + %1113 = shufflevector <2 x i32> %1106, <2 x i32> poison, <2 x i32> , !dbg !66 + %1114 = xor <2 x i32> %1113, %1073, !dbg !66 + %1115 = icmp slt <2 x i32> %1099, %1102, !dbg !60 + %1116 = icmp eq <2 x i32> %1097, %1103, !dbg !70 + %1117 = icmp sgt <2 x i32> %1114, %1112, !dbg !71 + %1118 = and <2 x i1> %1116, %1117, !dbg !72 + %1119 = or <2 x i1> %1115, %1118, !dbg !73 + %1120 = xor <2 x i32> %1110, %1058, !dbg !67 + %1121 = xor <2 x i32> %1120, %814, !dbg !67 + %1122 = xor <2 x i32> %1121, %912, !dbg !67 + %1123 = xor <2 x i32> %1122, %815, !dbg !67 + %1124 = xor <2 x i32> %1123, %913, !dbg !67 + %1125 = xor <2 x i32> %1124, %1106, !dbg !67 + %1126 = select <2 x i1> %1119, <2 x i32> %1125, <2 x i32> zeroinitializer, !dbg !65 + %1127 = shufflevector <2 x i32> %1126, <2 x i32> poison, <4 x i32> , !dbg !65 + %1128 = xor <4 x i32> %1108, %1127, !dbg !66 + %1129 = xor <4 x i32> %1128, %739, !dbg !66 + %1130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %539, i32 2, i32 31), !dbg !58 + %1131 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !58 + %1132 = insertelement <2 x i32> poison, i32 %1130, i64 0, !dbg !58 + %1133 = insertelement <2 x i32> %1132, i32 %1131, i64 1, !dbg !58 + %1134 = bitcast <2 x i32> %1133 to i64, !dbg !58 + %1135 = add i64 %538, %1134, !dbg !56 + %extelt.offset = lshr i64 %1135, 32, !dbg !58 + %1136 = trunc nuw i64 %extelt.offset to i32, !dbg !58 + %1137 = trunc i64 %1135 to i32, !dbg !58 + %1138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1137, i32 1, i32 31), !dbg !58 + %1139 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1136, i32 1, i32 31), !dbg !58 + %1140 = insertelement <2 x i32> poison, i32 %1138, i64 0, !dbg !58 + %1141 = insertelement <2 x i32> %1140, i32 %1139, i64 1, !dbg !58 + %1142 = bitcast <2 x i32> %1141 to i64, !dbg !58 + %1143 = add i64 %1135, %1142, !dbg !56 + %1144 = insertelement <1 x i64> poison, i64 %1143, i64 0, !dbg !34 + store <1 x i64> %1144, ptr addrspace(3) %49, align 8, !dbg !34 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !34 + %1145 = load i64, ptr addrspace(3) %51, align 8, !dbg !34 + %1146 = select <4 x i1> %47, <4 x i1> %540, <4 x i1> zeroinitializer, !dbg !74 + %1147 = bitcast <4 x i1> %1146 to i4, !dbg !75 + %1148 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %1147), !dbg !75 + %1149 = zext nneg i4 %1148 to i64, !dbg !75 + %1150 = zext nneg i4 %1148 to i32, !dbg !77 + %1151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1150, i32 2, i32 31), !dbg !77 + %1152 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 2, i32 31), !dbg !77 + %1153 = insertelement <2 x i32> poison, i32 %1151, i64 0, !dbg !77 + %1154 = insertelement <2 x i32> %1153, i32 %1152, i64 1, !dbg !77 + %1155 = bitcast <2 x i32> %1154 to i64, !dbg !77 + %1156 = add i64 %1149, %1155, !dbg !75 + %extelt.offset34 = lshr i64 %1156, 32, !dbg !77 + %1157 = trunc nuw i64 %extelt.offset34 to i32, !dbg !77 + %1158 = trunc i64 %1156 to i32, !dbg !77 + %1159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1158, i32 1, i32 31), !dbg !77 + %1160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %1157, i32 1, i32 31), !dbg !77 + %1161 = insertelement <2 x i32> poison, i32 %1159, i64 0, !dbg !77 + %1162 = insertelement <2 x i32> %1161, i32 %1160, i64 1, !dbg !77 + %1163 = bitcast <2 x i32> %1162 to i64, !dbg !77 + %1164 = add i64 %1156, %1163, !dbg !75 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1165 = insertelement <1 x i64> poison, i64 %1164, i64 0, !dbg !78 + store <1 x i64> %1165, ptr addrspace(3) %49, align 8, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !78 + %1166 = load i64, ptr addrspace(3) %51, align 8, !dbg !78 + %1167 = trunc i64 %1143 to i32, !dbg !34 + %1168 = insertelement <4 x i32> poison, i32 %55, i64 0, !dbg !79 + %1169 = insertelement <4 x i32> %1168, i32 %54, i64 1, !dbg !79 + %1170 = insertelement <4 x i32> %1169, i32 %53, i64 2, !dbg !79 + %1171 = insertelement <4 x i32> %1170, i32 %52, i64 3, !dbg !79 + %1172 = insertelement <4 x i32> poison, i32 %1167, i64 0, !dbg !79 + %1173 = shufflevector <4 x i32> %1172, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !79 + %1174 = icmp slt <4 x i32> %1171, %1173, !dbg !79 + %1175 = select <4 x i1> %1174, <4 x i32> %534, <4 x i32> splat (i32 16), !dbg !80 + %1176 = add <4 x i32> %1175, splat (i32 17), !dbg !81 + %1177 = icmp slt <4 x i32> %1175, zeroinitializer, !dbg !82 + %1178 = select <4 x i1> %1177, <4 x i32> %1176, <4 x i32> %1175, !dbg !83 + %1179 = icmp ugt <4 x i32> %1178, splat (i32 16), !dbg !84 + %shift63 = shufflevector <4 x i1> %1179, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop64 = or <4 x i1> %shift63, %1179, !dbg !85 + %shift66 = shufflevector <4 x i1> %foldExtExtBinop64, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop67 = or <4 x i1> %1179, %shift66, !dbg !85 + %shift69 = shufflevector <4 x i1> %foldExtExtBinop67, <4 x i1> poison, <4 x i32> , !dbg !85 + %foldExtExtBinop70 = or <4 x i1> %1179, %shift69, !dbg !85 + %1180 = extractelement <4 x i1> %foldExtExtBinop70, i64 0, !dbg !85 + %1181 = and i1 %19, %1180, !dbg !85 + br i1 %1181, label %1182, label %1183, !dbg !85 + +1182: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !85 + unreachable, !dbg !85 + +1183: ; preds = %11 + %1184 = trunc i64 %1164 to i32, !dbg !78 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !85 + %1185 = insertelement <4 x i32> poison, i32 %52, i64 0, !dbg !86 + %1186 = insertelement <4 x i32> %1185, i32 %53, i64 1, !dbg !86 + %1187 = insertelement <4 x i32> %1186, i32 %54, i64 2, !dbg !86 + %1188 = insertelement <4 x i32> %1187, i32 %55, i64 3, !dbg !86 + %1189 = insertelement <4 x i32> poison, i32 %1184, i64 0, !dbg !86 + %1190 = shufflevector <4 x i32> %1189, <4 x i32> poison, <4 x i32> zeroinitializer, !dbg !86 + %1191 = icmp slt <4 x i32> %1188, %1190, !dbg !86 + %1192 = select <4 x i1> %1191, <4 x i32> %1129, <4 x i32> splat (i32 16), !dbg !87 + %1193 = add <4 x i32> %1192, splat (i32 17), !dbg !88 + %1194 = icmp slt <4 x i32> %1192, zeroinitializer, !dbg !89 + %1195 = select <4 x i1> %1194, <4 x i32> %1193, <4 x i32> %1192, !dbg !90 + %1196 = icmp ugt <4 x i32> %1195, splat (i32 16), !dbg !91 + %1197 = bitcast <4 x i1> %1196 to i4, !dbg !92 + %1198 = icmp ne i4 %1197, 0, !dbg !92 + %1199 = and i1 %19, %1198, !dbg !92 + br i1 %1199, label %1200, label %1201, !dbg !92 + +1200: ; preds = %1183 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !92 + unreachable, !dbg !92 + +1201: ; preds = %1183 + %1202 = trunc i64 %1166 to i32, !dbg !78 + %1203 = trunc i64 %1145 to i32, !dbg !34 + %1204 = or disjoint i32 %13, %17, !dbg !13 + %1205 = icmp slt i32 %1204, 128, !dbg !14 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !92 + %1206 = sext i32 %1204 to i64, !dbg !93 + %1207 = getelementptr i32, ptr addrspace(1) %1, i64 %1206, !dbg !93 + %1208 = and i32 %14, 96, !dbg !94 + %1209 = icmp eq i32 %1208, 0, !dbg !94 + %1210 = and i1 %1209, %1205, !dbg !94 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1203, ptr addrspace(1) %1207, i1 %1210) #6, !dbg !94 + %1211 = getelementptr i32, ptr addrspace(1) %2, i64 %1206, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %1202, ptr addrspace(1) %1211, i1 %1210) #6, !dbg !96 + %1212 = getelementptr i32, ptr addrspace(1) %3, i64 %58, !dbg !97 + %1213 = extractelement <4 x i32> %534, i64 0, !dbg !98 + %1214 = extractelement <4 x i32> %534, i64 1, !dbg !98 + %1215 = extractelement <4 x i32> %534, i64 2, !dbg !98 + %1216 = extractelement <4 x i32> %534, i64 3, !dbg !98 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1216, i32 %1215, i32 %1214, i32 %1213, ptr addrspace(1) %1212, i1 %19) #6, !dbg !98 + %1217 = mul i32 %18, 17, !dbg !99 + %1218 = extractelement <4 x i32> %1178, i64 3, !dbg !100 + %1219 = add i32 %1218, %1217, !dbg !100 + %1220 = extractelement <4 x i32> %1178, i64 2, !dbg !100 + %1221 = add i32 %1220, %1217, !dbg !100 + %1222 = extractelement <4 x i32> %1178, i64 1, !dbg !100 + %1223 = add i32 %1222, %1217, !dbg !100 + %1224 = extractelement <4 x i32> %1178, i64 0, !dbg !100 + %1225 = add i32 %1224, %1217, !dbg !100 + %1226 = sext i32 %1219 to i64, !dbg !101 + %1227 = getelementptr i32, ptr addrspace(1) %4, i64 %1226, !dbg !101 + %1228 = sext i32 %1221 to i64, !dbg !101 + %1229 = getelementptr i32, ptr addrspace(1) %4, i64 %1228, !dbg !101 + %1230 = sext i32 %1223 to i64, !dbg !101 + %1231 = getelementptr i32, ptr addrspace(1) %4, i64 %1230, !dbg !101 + %1232 = sext i32 %1225 to i64, !dbg !101 + %1233 = getelementptr i32, ptr addrspace(1) %4, i64 %1232, !dbg !101 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1234 = ptrtoint ptr addrspace(1) %1227 to i64, !dbg !102 + %1235 = ptrtoint ptr addrspace(1) %1229 to i64, !dbg !102 + %1236 = ptrtoint ptr addrspace(1) %1231 to i64, !dbg !102 + %1237 = ptrtoint ptr addrspace(1) %1233 to i64, !dbg !102 + %1238 = and i32 %14, 48, !dbg !102 + %1239 = shl nuw nsw i32 %1238, 6, !dbg !102 + %1240 = shl nuw nsw i32 %14, 3, !dbg !102 + %1241 = and i32 %1240, 120, !dbg !102 + %1242 = lshr exact i32 %1238, 1, !dbg !102 + %1243 = shl nuw nsw i32 %14, 1, !dbg !102 + %1244 = and i32 %1243, 128, !dbg !102 + %1245 = or disjoint i32 %1239, %1241, !dbg !102 + %1246 = xor i32 %1245, %1242, !dbg !102 + %1247 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1244, !dbg !102 + %1248 = getelementptr inbounds nuw i8, ptr addrspace(3) %1247, i32 %1246, !dbg !102 + %1249 = insertelement <1 x i64> poison, i64 %1234, i64 0, !dbg !102 + store <1 x i64> %1249, ptr addrspace(3) %1248, align 8, !dbg !102 + %1250 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 256, !dbg !102 + %1251 = insertelement <1 x i64> poison, i64 %1235, i64 0, !dbg !102 + store <1 x i64> %1251, ptr addrspace(3) %1250, align 8, !dbg !102 + %1252 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 512, !dbg !102 + %1253 = insertelement <1 x i64> poison, i64 %1236, i64 0, !dbg !102 + store <1 x i64> %1253, ptr addrspace(3) %1252, align 8, !dbg !102 + %1254 = getelementptr inbounds nuw i8, ptr addrspace(3) %1248, i32 768, !dbg !102 + %1255 = insertelement <1 x i64> poison, i64 %1237, i64 0, !dbg !102 + store <1 x i64> %1255, ptr addrspace(3) %1254, align 8, !dbg !102 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !102 + %1256 = and i32 %14, 12, !dbg !102 + %1257 = shl nuw nsw i32 %1256, 8, !dbg !102 + %1258 = shl nuw nsw i32 %20, 5, !dbg !102 + %1259 = and i32 %1240, 896, !dbg !102 + %1260 = shl nuw nsw i32 %1256, 1, !dbg !102 + %1261 = or disjoint i32 %1257, %1258, !dbg !102 + %1262 = or disjoint i32 %1261, %1259, !dbg !102 + %1263 = or disjoint i32 %1262, %1260, !dbg !102 + %1264 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1263, !dbg !102 + %1265 = load i64, ptr addrspace(3) %1264, align 8, !dbg !102 + %1266 = xor i32 %1263, 8, !dbg !102 + %1267 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1266, !dbg !102 + %1268 = load i64, ptr addrspace(3) %1267, align 8, !dbg !102 + %1269 = xor i32 %1263, 16, !dbg !102 + %1270 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1269, !dbg !102 + %1271 = load i64, ptr addrspace(3) %1270, align 8, !dbg !102 + %1272 = xor i32 %1263, 24, !dbg !102 + %1273 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %1272, !dbg !102 + %1274 = load i64, ptr addrspace(3) %1273, align 8, !dbg !102 + %1275 = inttoptr i64 %1265 to ptr addrspace(1), !dbg !102 + %1276 = inttoptr i64 %1268 to ptr addrspace(1), !dbg !102 + %1277 = inttoptr i64 %1271 to ptr addrspace(1), !dbg !102 + %1278 = inttoptr i64 %1274 to ptr addrspace(1), !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1275, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1276, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1277, i1 %1205) #6, !dbg !102 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1278, i1 %1205) #6, !dbg !102 + %1279 = getelementptr i32, ptr addrspace(1) %5, i64 %58, !dbg !103 + %1280 = extractelement <4 x i32> %1129, i64 0, !dbg !104 + %1281 = extractelement <4 x i32> %1129, i64 1, !dbg !104 + %1282 = extractelement <4 x i32> %1129, i64 2, !dbg !104 + %1283 = extractelement <4 x i32> %1129, i64 3, !dbg !104 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1280, i32 %1281, i32 %1282, i32 %1283, ptr addrspace(1) %1279, i1 %19) #6, !dbg !104 + %1284 = extractelement <4 x i32> %1195, i64 0, !dbg !105 + %1285 = add i32 %1284, %1217, !dbg !105 + %1286 = extractelement <4 x i32> %1195, i64 1, !dbg !105 + %1287 = add i32 %1286, %1217, !dbg !105 + %1288 = extractelement <4 x i32> %1195, i64 2, !dbg !105 + %1289 = add i32 %1288, %1217, !dbg !105 + %1290 = extractelement <4 x i32> %1195, i64 3, !dbg !105 + %1291 = add i32 %1290, %1217, !dbg !105 + %1292 = sext i32 %1285 to i64, !dbg !106 + %1293 = getelementptr i32, ptr addrspace(1) %6, i64 %1292, !dbg !106 + %1294 = sext i32 %1287 to i64, !dbg !106 + %1295 = getelementptr i32, ptr addrspace(1) %6, i64 %1294, !dbg !106 + %1296 = sext i32 %1289 to i64, !dbg !106 + %1297 = getelementptr i32, ptr addrspace(1) %6, i64 %1296, !dbg !106 + %1298 = sext i32 %1291 to i64, !dbg !106 + %1299 = getelementptr i32, ptr addrspace(1) %6, i64 %1298, !dbg !106 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1300 = ptrtoint ptr addrspace(1) %1293 to i64, !dbg !107 + %1301 = ptrtoint ptr addrspace(1) %1295 to i64, !dbg !107 + %1302 = ptrtoint ptr addrspace(1) %1297 to i64, !dbg !107 + %1303 = ptrtoint ptr addrspace(1) %1299 to i64, !dbg !107 + %1304 = insertelement <1 x i64> poison, i64 %1300, i64 0, !dbg !107 + store <1 x i64> %1304, ptr addrspace(3) %1248, align 8, !dbg !107 + %1305 = insertelement <1 x i64> poison, i64 %1301, i64 0, !dbg !107 + store <1 x i64> %1305, ptr addrspace(3) %1250, align 8, !dbg !107 + %1306 = insertelement <1 x i64> poison, i64 %1302, i64 0, !dbg !107 + store <1 x i64> %1306, ptr addrspace(3) %1252, align 8, !dbg !107 + %1307 = insertelement <1 x i64> poison, i64 %1303, i64 0, !dbg !107 + store <1 x i64> %1307, ptr addrspace(3) %1254, align 8, !dbg !107 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !107 + %1308 = load i64, ptr addrspace(3) %1264, align 8, !dbg !107 + %1309 = load i64, ptr addrspace(3) %1267, align 8, !dbg !107 + %1310 = load i64, ptr addrspace(3) %1270, align 8, !dbg !107 + %1311 = load i64, ptr addrspace(3) %1273, align 8, !dbg !107 + %1312 = inttoptr i64 %1308 to ptr addrspace(1), !dbg !107 + %1313 = inttoptr i64 %1309 to ptr addrspace(1), !dbg !107 + %1314 = inttoptr i64 %1310 to ptr addrspace(1), !dbg !107 + %1315 = inttoptr i64 %1311 to ptr addrspace(1), !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1312, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1313, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1314, i1 %1205) #6, !dbg !107 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %1315, i1 %1205) #6, !dbg !107 + ret void, !dbg !108 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i4 @llvm.ctpop.i4(i4) #5 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="128" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 24, column: 33, scope: !9) +!12 = !DILocation(line: 25, column: 44, scope: !9) +!13 = !DILocation(line: 25, column: 23, scope: !9) +!14 = !DILocation(line: 26, column: 21, scope: !9) +!15 = !DILocation(line: 27, column: 38, scope: !9) +!16 = !DILocation(line: 34, column: 40, scope: !9) +!17 = !DILocation(line: 627, column: 44, scope: !18, inlinedAt: !20) +!18 = distinct !DILexicalBlockFile(scope: !9, file: !19, discriminator: 0) +!19 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!20 = !DILocation(line: 46, column: 71, scope: !9) +!21 = !DILocation(line: 537, column: 21, scope: !18, inlinedAt: !20) +!22 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !20) +!23 = !DILocation(line: 599, column: 28, scope: !18, inlinedAt: !24) +!24 = !DILocation(line: 51, column: 71, scope: !9) +!25 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !24) +!26 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !24) +!27 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !24) +!28 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !24) +!29 = !DILocation(line: 538, column: 40, scope: !18, inlinedAt: !20) +!30 = !DILocation(line: 539, column: 41, scope: !18, inlinedAt: !20) +!31 = !DILocation(line: 551, column: 23, scope: !18, inlinedAt: !20) +!32 = !DILocation(line: 548, column: 23, scope: !18, inlinedAt: !20) +!33 = !DILocation(line: 54, column: 35, scope: !9) +!34 = !DILocation(line: 60, column: 21, scope: !9) +!35 = !DILocation(line: 34, column: 37, scope: !9) +!36 = !DILocation(line: 34, column: 30, scope: !9) +!37 = !DILocation(line: 34, column: 45, scope: !9) +!38 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !20) +!39 = !DILocation(line: 39, column: 18, scope: !9) +!40 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !20) +!41 = !DILocation(line: 0, scope: !9) +!42 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !20) +!43 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !20) +!44 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !20) +!45 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !20) +!46 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !20) +!47 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !20) +!48 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !20) +!49 = distinct !DILexicalBlockFile(scope: !9, file: !50, discriminator: 0) +!50 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!51 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !20) +!52 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !20) +!53 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !20) +!54 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !20) +!55 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !20) +!56 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !57) +!57 = !DILocation(line: 55, column: 26, scope: !9) +!58 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !57) +!59 = !DILocation(line: 47, column: 20, scope: !9) +!60 = !DILocation(line: 574, column: 22, scope: !18, inlinedAt: !24) +!61 = !DILocation(line: 599, column: 19, scope: !18, inlinedAt: !24) +!62 = !DILocation(line: 600, column: 38, scope: !18, inlinedAt: !24) +!63 = !DILocation(line: 600, column: 46, scope: !18, inlinedAt: !24) +!64 = !DILocation(line: 600, column: 15, scope: !18, inlinedAt: !24) +!65 = !DILocation(line: 601, column: 59, scope: !18, inlinedAt: !24) +!66 = !DILocation(line: 601, column: 22, scope: !18, inlinedAt: !24) +!67 = !DILocation(line: 601, column: 48, scope: !18, inlinedAt: !24) +!68 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !24) +!69 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !24) +!70 = !DILocation(line: 591, column: 21, scope: !18, inlinedAt: !24) +!71 = !DILocation(line: 594, column: 40, scope: !18, inlinedAt: !24) +!72 = !DILocation(line: 594, column: 29, scope: !18, inlinedAt: !24) +!73 = !DILocation(line: 594, column: 23, scope: !18, inlinedAt: !24) +!74 = !DILocation(line: 58, column: 35, scope: !9) +!75 = !DILocation(line: 261, column: 15, scope: !49, inlinedAt: !76) +!76 = !DILocation(line: 59, column: 26, scope: !9) +!77 = !DILocation(line: 291, column: 36, scope: !49, inlinedAt: !76) +!78 = !DILocation(line: 61, column: 21, scope: !9) +!79 = !DILocation(line: 64, column: 19, scope: !9) +!80 = !DILocation(line: 66, column: 35, scope: !9) +!81 = !DILocation(line: 68, column: 20, scope: !9) +!82 = !DILocation(line: 69, column: 20, scope: !9) +!83 = !DILocation(line: 70, column: 35, scope: !9) +!84 = !DILocation(line: 71, column: 38, scope: !9) +!85 = !DILocation(line: 71, column: 63, scope: !9) +!86 = !DILocation(line: 75, column: 19, scope: !9) +!87 = !DILocation(line: 76, column: 35, scope: !9) +!88 = !DILocation(line: 77, column: 20, scope: !9) +!89 = !DILocation(line: 78, column: 20, scope: !9) +!90 = !DILocation(line: 79, column: 35, scope: !9) +!91 = !DILocation(line: 80, column: 38, scope: !9) +!92 = !DILocation(line: 80, column: 63, scope: !9) +!93 = !DILocation(line: 81, column: 25, scope: !9) +!94 = !DILocation(line: 81, column: 37, scope: !9) +!95 = !DILocation(line: 82, column: 25, scope: !9) +!96 = !DILocation(line: 82, column: 37, scope: !9) +!97 = !DILocation(line: 83, column: 25, scope: !9) +!98 = !DILocation(line: 83, column: 47, scope: !9) +!99 = !DILocation(line: 84, column: 52, scope: !9) +!100 = !DILocation(line: 84, column: 49, scope: !9) +!101 = !DILocation(line: 84, column: 25, scope: !9) +!102 = !DILocation(line: 84, column: 85, scope: !9) +!103 = !DILocation(line: 85, column: 25, scope: !9) +!104 = !DILocation(line: 85, column: 47, scope: !9) +!105 = !DILocation(line: 86, column: 49, scope: !9) +!106 = !DILocation(line: 86, column: 25, scope: !9) +!107 = !DILocation(line: 86, column: 85, scope: !9) +!108 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..2526340281753587d389f1a323f72922b4747289 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,2103 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 54, 113, 47, 99, 54, 113, 118, 101, 119, 121, 100, 105, 118, 103, 52, 98, 118, 99, 52, 113, 121, 122, 102, 103, 121, 98, 120, 97, 54, 117, 115, 109, 52, 102, 55, 108, 98, 105, 114, 116, 103, 114, 51, 55, 110, 104, 109, 108, 119, 102, 101, 100, 54, 112, 53, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 54, 113, 47, 99, 54, 113, 118, 101, 119, 121, 100, 105, 118, 103, 52, 98, 118, 99, 52, 113, 121, 122, 102, 103, 121, 98, 120, 97, 54, 117, 115, 109, 52, 102, 55, 108, 98, 105, 114, 116, 103, 114, 51, 55, 110, 104, 109, 108, 119, 102, 101, 100, 54, 112, 53, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; +.extern .shared .align 16 .b8 global_smem[]; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 128 +{ + .reg .pred %p<324>; + .reg .b16 %rs<37>; + .reg .b32 %r<812>; + .reg .b64 %rd<82>; + .loc 1 18 0 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:18:0 +$L__func_begin0: + .loc 1 18 0 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:18:0 + +// %bb.0: + ld.param.b64 %rd17, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:24:28 + mov.u32 %r26, %ctaid.x; + .loc 1 24 33 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:24:33 + shl.b32 %r1, %r26, 5; + .loc 1 25 44 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:25:44 + mov.u32 %r2, %tid.x; + and.b32 %r27, %r2, 124; + bfe.u32 %r28, %r2, 2, 5; + and.b32 %r3, %r2, 31; + .loc 1 25 23 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:25:23 + or.b32 %r4, %r28, %r1; + .loc 1 26 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:26:21 + setp.gt.s32 %p3, %r4, 127; + setp.lt.s32 %p2, %r4, 128; + .loc 1 27 38 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:27:38 + and.b32 %r5, %r2, 3; + .loc 1 34 40 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:34:40 + shl.b32 %r29, %r4, 4; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.b32 %r30, %r2, 1; + shr.u32 %r31, %r2, 1; + bfe.u32 %r32, %r2, 1, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r33, %r30, 1; + xor.b32 %r34, %r32, 1; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ne.b32 %p4, %r30, 0; + and.b32 %r35, %r31, 1; + setp.ne.b32 %p5, %r35, 0; +$L__tmp2: + .loc 1 60 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:60:21 + shl.b32 %r36, %r27, 1; + mov.b32 %r37, global_smem; + add.s32 %r38, %r37, %r36; + shl.b32 %r39, %r3, 3; + add.s32 %r40, %r37, %r39; + .loc 1 27 38 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:27:38 + shl.b32 %r6, %r5, 2; + or.b32 %r7, %r6, 1; + or.b32 %r8, %r6, 2; + or.b32 %r9, %r6, 3; + .loc 1 34 37 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:34:37 + or.b32 %r41, %r29, %r6; + .loc 1 34 30 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:34:30 + mad.wide.s32 %rd13, %r41, 8, %rd17; + cvt.s64.s32 %rd18, %r29; + cvt.u64.u32 %rd19, %r6; + or.b64 %rd20, %rd18, %rd19; + shl.b64 %rd21, %rd20, 3; + add.s64 %rd22, %rd17, %rd21; + add.s64 %rd16, %rd22, 16; + .loc 1 34 45 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + mov.u64 %rd12, 0x0; + @%p2 ld.global.v2.b64 { %rd11, %rd12 }, [ %rd13 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd14, 0x0; + mov.u64 %rd15, 0x0; + @%p2 ld.global.v2.b64 { %rd14, %rd15 }, [ %rd16 + 0 ]; + // end inline asm +$L__tmp3: + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r42, %r7, %r6; + xor.b32 %r43, %r8, %r9; +$L__tmp4: + .loc 1 39 18 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:39:18 + add.s64 %rd23, %rd12, -1; + add.s64 %rd24, %rd14, -1; + add.s64 %rd25, %rd11, -1; + add.s64 %rd26, %rd15, -1; + setp.gt.u64 %p6, %rd26, 16382; + setp.gt.u64 %p7, %rd25, 16382; + setp.lt.u64 %p8, %rd26, 16383; + setp.lt.u64 %p9, %rd24, 16383; + setp.lt.u64 %p10, %rd23, 16383; + setp.lt.u64 %p11, %rd25, 16383; + .loc 1 0 0 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:0 + selp.b32 %r44, 1, 0, %p11; + selp.b32 %r45, 1, 0, %p10; + selp.b32 %r46, 1, 0, %p9; + selp.b32 %r47, 1, 0, %p8; +$L__tmp5: + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.pred %p12, %p10, %p7; + .loc 2 599 19 // triton_helpers.py:599:19 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + or.pred %p13, %p9, %p6; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r48, %r44, %r45; + xor.b32 %r49, %r46, %r47; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r50, %r48, 0, %p12; + selp.b32 %r51, %r49, 0, %p13; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r52, %r50, %r44; + xor.b32 %r53, %r50, %r45; + xor.b32 %r54, %r51, %r46; + xor.b32 %r55, %r51, %r47; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r56, %r42, 0, %p12; + selp.b32 %r57, %r43, 0, %p13; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r58, %r56, %r6; + xor.b32 %r59, %r56, %r7; + xor.b32 %r60, %r57, %r8; + xor.b32 %r61, %r57, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ge.u32 %p14, %r52, %r54; + setp.ge.u32 %p15, %r53, %r55; + setp.ne.b32 %p16, %r53, %r55; + setp.ne.b32 %p17, %r54, %r52; + setp.le.u32 %p18, %r59, %r61; + setp.le.u32 %p19, %r58, %r60; + or.pred %p20, %p19, %p17; + or.pred %p21, %p18, %p16; + and.pred %p22, %p15, %p21; + and.pred %p23, %p14, %p20; + xor.pred %p24, %p23, %p4; + xor.pred %p25, %p22, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r62, %r52, %r54; + xor.b32 %r63, %r55, %r53; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r64, 0, %r63, %p25; + selp.b32 %r65, 0, %r62, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r66, %r65, %r52; + xor.b32 %r67, %r64, %r53; + xor.b32 %r68, %r64, %r55; + xor.b32 %r69, %r65, %r54; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r70, %r60, %r58; + xor.b32 %r71, %r61, %r59; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r72, 0, %r70, %p24; + selp.b32 %r73, 0, %r71, %p25; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r74, %r72, %r58; + xor.b32 %r75, %r73, %r59; + xor.b32 %r76, %r72, %r60; + xor.b32 %r77, %r73, %r61; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ge.u32 %p26, %r66, %r67; + setp.ne.b32 %p27, %r66, %r67; + setp.le.u32 %p28, %r74, %r75; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p4; + setp.ge.u32 %p32, %r69, %r68; + setp.ne.b32 %p33, %r69, %r68; + setp.le.u32 %p34, %r76, %r77; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r78, %r66, %r67; + xor.b32 %r79, %r69, %r68; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r80, 0, %r78, %p31; + selp.b32 %r81, 0, %r79, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r82, %r80, %r66; + xor.b32 %r83, %r80, %r67; + xor.b32 %r84, %r81, %r69; + xor.b32 %r85, %r81, %r68; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r86, %r74, %r75; + xor.b32 %r87, %r76, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r88, 0, %r86, %p31; + selp.b32 %r89, 0, %r87, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r90, %r88, %r74; + xor.b32 %r91, %r88, %r75; + xor.b32 %r92, %r89, %r76; + xor.b32 %r93, %r89, %r77; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r94, %r82, %r33; + mul.lo.s32 %r95, %r83, %r33; + mul.lo.s32 %r96, %r84, %r33; + mul.lo.s32 %r97, %r85, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r98, %r94, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r99, %r94, %r98; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r100, %r95, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r101, %r95, %r100; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r102, %r96, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r103, %r96, %r102; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r97, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r105, %r97, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r106, %r82, %r30; + mul.lo.s32 %r107, %r83, %r30; + mul.lo.s32 %r108, %r84, %r30; + mul.lo.s32 %r109, %r85, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r106, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r111, %r106, %r110; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r112, %r107, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r113, %r107, %r112; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r114, %r108, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r115, %r108, %r114; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r116, %r109, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r117, %r109, %r116; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r118, %r90, %r33; + mul.lo.s32 %r119, %r91, %r33; + mul.lo.s32 %r120, %r92, %r33; + mul.lo.s32 %r121, %r93, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r118, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r123, %r118, %r122; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r124, %r119, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r125, %r119, %r124; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r126, %r120, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r127, %r120, %r126; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r129, %r121, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r130, %r90, %r30; + mul.lo.s32 %r131, %r91, %r30; + mul.lo.s32 %r132, %r92, %r30; + mul.lo.s32 %r133, %r93, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r134, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r135, %r130, %r134; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r136, %r131, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r137, %r131, %r136; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r138, %r132, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r139, %r132, %r138; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r133, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r141, %r133, %r140; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ge.s32 %p38, %r99, %r111; + setp.ne.b32 %p39, %r99, %r111; + setp.le.s32 %p40, %r123, %r135; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p5; + setp.ge.s32 %p44, %r101, %r113; + setp.ne.b32 %p45, %r101, %r113; + setp.le.s32 %p46, %r125, %r137; + or.pred %p47, %p45, %p46; + and.pred %p48, %p44, %p47; + xor.pred %p49, %p48, %p5; + setp.ge.s32 %p50, %r103, %r115; + setp.ne.b32 %p51, %r103, %r115; + setp.le.s32 %p52, %r127, %r139; + or.pred %p53, %p51, %p52; + and.pred %p54, %p50, %p53; + xor.pred %p55, %p54, %p5; + setp.ge.s32 %p56, %r105, %r117; + setp.ne.b32 %p57, %r105, %r117; + setp.le.s32 %p58, %r129, %r141; + or.pred %p59, %p57, %p58; + and.pred %p60, %p56, %p59; + xor.pred %p61, %p60, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r142, %r99, %r111; + xor.b32 %r143, %r101, %r113; + xor.b32 %r144, %r103, %r115; + xor.b32 %r145, %r105, %r117; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r146, 0, %r142, %p43; + selp.b32 %r147, 0, %r143, %p49; + selp.b32 %r148, 0, %r144, %p55; + selp.b32 %r149, 0, %r145, %p61; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r150, %r146, %r82; + xor.b32 %r151, %r147, %r83; + xor.b32 %r152, %r148, %r84; + xor.b32 %r153, %r149, %r85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r154, %r123, %r135; + xor.b32 %r155, %r125, %r137; + xor.b32 %r156, %r127, %r139; + xor.b32 %r157, %r129, %r141; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r158, 0, %r154, %p43; + selp.b32 %r159, 0, %r155, %p49; + selp.b32 %r160, 0, %r156, %p55; + selp.b32 %r161, 0, %r157, %p61; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r162, %r158, %r90; + xor.b32 %r163, %r159, %r91; + xor.b32 %r164, %r160, %r92; + xor.b32 %r165, %r161, %r93; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ge.s32 %p62, %r150, %r152; + setp.ne.b32 %p63, %r150, %r152; + setp.le.s32 %p64, %r162, %r164; + or.pred %p65, %p63, %p64; + and.pred %p66, %p62, %p65; + xor.pred %p67, %p66, %p5; + setp.ge.s32 %p68, %r151, %r153; + setp.ne.b32 %p69, %r151, %r153; + setp.le.s32 %p70, %r163, %r165; + or.pred %p71, %p69, %p70; + and.pred %p72, %p68, %p71; + xor.pred %p73, %p72, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r166, %r150, %r152; + xor.b32 %r167, %r151, %r153; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r168, 0, %r166, %p67; + selp.b32 %r169, 0, %r167, %p73; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r170, %r168, %r150; + xor.b32 %r171, %r169, %r151; + xor.b32 %r172, %r168, %r152; + xor.b32 %r173, %r169, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r174, %r162, %r164; + xor.b32 %r175, %r163, %r165; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r176, 0, %r174, %p67; + selp.b32 %r177, 0, %r175, %p73; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r178, %r176, %r162; + xor.b32 %r179, %r177, %r163; + xor.b32 %r180, %r176, %r164; + xor.b32 %r181, %r177, %r165; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.ge.s32 %p74, %r170, %r171; + setp.ne.b32 %p75, %r170, %r171; + setp.le.s32 %p76, %r178, %r179; + or.pred %p77, %p75, %p76; + and.pred %p78, %p74, %p77; + xor.pred %p79, %p78, %p5; + setp.ge.s32 %p80, %r172, %r173; + setp.ne.b32 %p81, %r172, %r173; + setp.le.s32 %p82, %r180, %r181; + or.pred %p83, %p81, %p82; + and.pred %p84, %p80, %p83; + xor.pred %p85, %p84, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r182, %r170, %r171; + xor.b32 %r183, %r172, %r173; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r184, 0, %r182, %p79; + selp.b32 %r185, 0, %r183, %p85; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r186, %r178, %r179; + xor.b32 %r187, %r180, %r181; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r188, 0, %r186, %p79; + selp.b32 %r189, 0, %r187, %p85; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r190, %r188, %r178; + xor.b32 %r191, %r188, %r179; + xor.b32 %r192, %r189, %r180; + xor.b32 %r193, %r189, %r181; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r194, %r190, %r34; + mul.lo.s32 %r195, %r191, %r34; + mul.lo.s32 %r196, %r192, %r34; + mul.lo.s32 %r197, %r193, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r198, %r190, %r32; + mul.lo.s32 %r199, %r191, %r32; + mul.lo.s32 %r200, %r192, %r32; + mul.lo.s32 %r201, %r193, %r32; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r202, %r184, %r171; + xor.b32 %r203, %r184, %r170; + xor.b32 %r204, %r185, %r173; + xor.b32 %r205, %r185, %r172; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r206, %r203, %r34; + mul.lo.s32 %r207, %r202, %r34; + mul.lo.s32 %r208, %r205, %r34; + mul.lo.s32 %r209, %r204, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r210, %r206, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r211, %r206, %r210; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r212, %r207, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r213, %r207, %r212; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r214, %r208, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r215, %r208, %r214; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r216, %r209, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r217, %r209, %r216; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r218, %r203, %r32; + mul.lo.s32 %r219, %r202, %r32; + mul.lo.s32 %r220, %r205, %r32; + mul.lo.s32 %r221, %r204, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r222, %r218, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r223, %r218, %r222; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r224, %r219, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r225, %r219, %r224; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r226, %r220, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r227, %r220, %r226; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r228, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r229, %r221, %r228; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r230, %r194, 2, 31, -1; + shfl.sync.bfly.b32 %r231, %r195, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r232, %r195, %r231; + add.s32 %r233, %r194, %r230; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r234, %r196, 2, 31, -1; + shfl.sync.bfly.b32 %r235, %r197, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r236, %r197, %r235; + add.s32 %r237, %r196, %r234; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r238, %r198, 2, 31, -1; + shfl.sync.bfly.b32 %r239, %r199, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r240, %r199, %r239; + add.s32 %r241, %r198, %r238; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r242, %r200, 2, 31, -1; + shfl.sync.bfly.b32 %r243, %r201, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r244, %r201, %r243; + add.s32 %r245, %r200, %r242; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.lt.s32 %p86, %r213, %r225; + setp.lt.s32 %p87, %r211, %r223; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.eq.b32 %p88, %r211, %r223; + setp.eq.b32 %p89, %r213, %r225; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.gt.s32 %p90, %r233, %r241; + setp.gt.s32 %p91, %r232, %r240; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.pred %p92, %p89, %p91; + and.pred %p93, %p88, %p90; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + or.pred %p94, %p87, %p93; + or.pred %p95, %p86, %p92; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.lt.s32 %p96, %r217, %r229; + setp.lt.s32 %p97, %r215, %r227; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.eq.b32 %p98, %r215, %r227; + setp.eq.b32 %p99, %r217, %r229; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.gt.s32 %p100, %r237, %r245; + setp.gt.s32 %p101, %r236, %r244; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.pred %p102, %p99, %p101; + and.pred %p103, %p98, %p100; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + or.pred %p104, %p97, %p103; + or.pred %p105, %p96, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r246, %r211, %r223; + xor.b32 %r247, %r213, %r225; + xor.b32 %r248, %r215, %r227; + xor.b32 %r249, %r217, %r229; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r250, %r247, 0, %p95; + selp.b32 %r251, %r246, 0, %p94; + selp.b32 %r252, %r249, 0, %p105; + selp.b32 %r253, %r248, 0, %p104; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r254, %r251, %r203; + xor.b32 %r255, %r250, %r202; + xor.b32 %r256, %r253, %r205; + xor.b32 %r257, %r252, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r258, %r233, %r241; + xor.b32 %r259, %r232, %r240; + xor.b32 %r260, %r237, %r245; + xor.b32 %r261, %r236, %r244; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r262, %r259, 0, %p95; + selp.b32 %r263, %r258, 0, %p94; + selp.b32 %r264, %r261, 0, %p105; + selp.b32 %r265, %r260, 0, %p104; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r266, %r263, %r190; + xor.b32 %r267, %r262, %r191; + xor.b32 %r268, %r265, %r192; + xor.b32 %r269, %r264, %r193; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r270, %r255, %r33; + mul.lo.s32 %r271, %r254, %r33; + mul.lo.s32 %r272, %r257, %r33; + mul.lo.s32 %r273, %r256, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r274, %r271, 1, 31, -1; + shfl.sync.bfly.b32 %r275, %r270, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r276, %r271, %r274; + add.s32 %r277, %r270, %r275; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r278, %r273, 1, 31, -1; + shfl.sync.bfly.b32 %r279, %r272, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r280, %r273, %r278; + add.s32 %r281, %r272, %r279; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r282, %r255, %r30; + mul.lo.s32 %r283, %r254, %r30; + mul.lo.s32 %r284, %r257, %r30; + mul.lo.s32 %r285, %r256, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r286, %r283, 1, 31, -1; + shfl.sync.bfly.b32 %r287, %r282, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r288, %r283, %r286; + add.s32 %r289, %r282, %r287; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r290, %r285, 1, 31, -1; + shfl.sync.bfly.b32 %r291, %r284, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r292, %r285, %r290; + add.s32 %r293, %r284, %r291; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r294, %r267, %r33; + mul.lo.s32 %r295, %r266, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r296, %r295, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r297, %r269, %r33; + mul.lo.s32 %r298, %r268, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r299, %r294, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r300, %r294, %r299; + add.s32 %r301, %r295, %r296; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r302, %r269, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r303, %r298, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r304, %r268, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r305, %r297, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r306, %r297, %r305; + add.s32 %r307, %r298, %r303; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + mul.lo.s32 %r308, %r267, %r30; + mul.lo.s32 %r309, %r266, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r310, %r309, 1, 31, -1; + shfl.sync.bfly.b32 %r311, %r308, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r312, %r308, %r311; + add.s32 %r313, %r309, %r310; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r314, %r304, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r315, %r304, %r314; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + shfl.sync.bfly.b32 %r316, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + add.s32 %r317, %r316, %r302; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.lt.s32 %p106, %r277, %r289; + setp.lt.s32 %p107, %r276, %r288; + setp.lt.s32 %p108, %r281, %r293; + setp.lt.s32 %p109, %r280, %r292; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.eq.b32 %p110, %r276, %r288; + setp.eq.b32 %p111, %r277, %r289; + setp.eq.b32 %p112, %r280, %r292; + setp.eq.b32 %p113, %r281, %r293; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.gt.s32 %p114, %r301, %r313; + setp.gt.s32 %p115, %r300, %r312; + setp.gt.s32 %p116, %r307, %r315; + setp.gt.s32 %p117, %r306, %r317; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.pred %p118, %p111, %p115; + and.pred %p119, %p110, %p114; + and.pred %p120, %p113, %p117; + and.pred %p121, %p112, %p116; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + or.pred %p122, %p107, %p119; + or.pred %p123, %p106, %p118; + or.pred %p124, %p109, %p121; + or.pred %p125, %p108, %p120; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r318, %r276, %r288; + xor.b32 %r319, %r277, %r289; + xor.b32 %r320, %r280, %r292; + xor.b32 %r321, %r281, %r293; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r322, %r319, 0, %p123; + selp.b32 %r323, %r318, 0, %p122; + selp.b32 %r324, %r321, 0, %p125; + selp.b32 %r325, %r320, 0, %p124; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r326, %r323, %r254; + xor.b32 %r327, %r322, %r255; + xor.b32 %r328, %r325, %r256; + xor.b32 %r329, %r324, %r257; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r330, %r300, %r312; + xor.b32 %r331, %r301, %r313; + xor.b32 %r332, %r317, %r306; + xor.b32 %r333, %r307, %r315; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r334, %r331, 0, %p122; + selp.b32 %r335, %r330, 0, %p123; + selp.b32 %r336, %r333, 0, %p124; + selp.b32 %r337, %r332, 0, %p125; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r338, %r335, %r267; + xor.b32 %r339, %r334, %r266; + xor.b32 %r340, %r337, %r269; + xor.b32 %r341, %r336, %r268; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.lt.s32 %p126, %r327, %r329; + setp.lt.s32 %p127, %r326, %r328; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.eq.b32 %p128, %r326, %r328; + setp.eq.b32 %p129, %r327, %r329; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.gt.s32 %p130, %r339, %r341; + setp.gt.s32 %p131, %r338, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + and.pred %p132, %p129, %p131; + and.pred %p133, %p128, %p130; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + or.pred %p134, %p127, %p133; + or.pred %p135, %p126, %p132; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r342, %r326, %r328; + xor.b32 %r343, %r327, %r329; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r344, %r343, 0, %p135; + selp.b32 %r345, %r342, 0, %p134; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r346, %r345, %r328; + xor.b32 %r347, %r344, %r327; + xor.b32 %r348, %r345, %r326; + xor.b32 %r349, %r344, %r329; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r350, %r339, %r341; + xor.b32 %r351, %r338, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r352, %r351, 0, %p135; + selp.b32 %r353, %r350, 0, %p134; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r354, %r353, %r339; + xor.b32 %r355, %r352, %r338; + xor.b32 %r356, %r353, %r341; + xor.b32 %r357, %r352, %r340; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.lt.s32 %p136, %r348, %r347; + setp.lt.s32 %p137, %r346, %r349; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.eq.b32 %p138, %r348, %r347; + setp.eq.b32 %p139, %r349, %r346; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + setp.gt.s32 %p140, %r354, %r355; + setp.gt.s32 %p141, %r356, %r357; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r358, %r355, %r354; + xor.b32 %r359, %r357, %r356; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + selp.b32 %r360, %r359, 0, %p141; + selp.b32 %r361, %r360, 0, %p139; + selp.b32 %r362, %r359, %r361, %p137; + selp.b32 %r363, %r358, 0, %p140; + selp.b32 %r364, %r363, 0, %p138; + selp.b32 %r365, %r358, %r364, %p136; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:46:71 ] + xor.b32 %r12, %r365, %r355; + xor.b32 %r13, %r365, %r354; + xor.b32 %r11, %r362, %r356; + xor.b32 %r10, %r362, %r357; +$L__tmp6: + .loc 1 54 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:54:35 + and.pred %p142, %p2, %p9; + selp.b16 %rs1, 1, 0, %p142; + shl.b16 %rs2, %rs1, 2; + and.pred %p143, %p2, %p8; + selp.b16 %rs3, -1, 0, %p143; + shl.b16 %rs4, %rs3, 3; + or.b16 %rs5, %rs4, %rs2; + and.pred %p144, %p2, %p11; + selp.b16 %rs6, 1, 0, %p144; + and.pred %p145, %p2, %p10; + selp.b16 %rs7, -1, 0, %p145; + shl.b16 %rs8, %rs7, 1; + or.b16 %rs9, %rs6, %rs8; + and.b16 %rs10, %rs9, 3; + or.b16 %rs11, %rs10, %rs5; +$L__tmp7: + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:55:26 ] + and.b16 %rs12, %rs11, 15; + cvt.u32.u16 %r366, %rs12; + popc.b32 %r367, %r366; + cvt.u64.u32 %rd27, %r367; +$L__tmp8: + .loc 1 47 20 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:47:20 + setp.ne.b64 %p146, %rd15, 16384; + setp.ne.b64 %p147, %rd11, 16384; + setp.eq.b64 %p148, %rd15, 16384; + setp.eq.b64 %p149, %rd14, 16384; + setp.eq.b64 %p150, %rd12, 16384; + setp.eq.b64 %p151, %rd11, 16384; + .loc 1 0 0 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:0 + selp.b32 %r368, 1, 0, %p151; + selp.b32 %r369, 1, 0, %p150; + selp.b32 %r370, 1, 0, %p149; + selp.b32 %r371, 1, 0, %p148; +$L__tmp9: + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + and.pred %p152, %p150, %p147; + .loc 2 599 19 // triton_helpers.py:599:19 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + or.pred %p153, %p149, %p146; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r372, %r368, %r369; + xor.b32 %r373, %r370, %r371; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r374, %r372, 0, %p152; + selp.b32 %r375, %r373, 0, %p153; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r376, %r374, %r368; + xor.b32 %r377, %r374, %r369; + xor.b32 %r378, %r375, %r370; + xor.b32 %r379, %r375, %r371; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r380, %r42, 0, %p152; + selp.b32 %r381, %r43, 0, %p153; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r382, %r380, %r6; + xor.b32 %r383, %r380, %r7; + xor.b32 %r384, %r381, %r8; + xor.b32 %r385, %r381, %r9; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.ge.u32 %p154, %r376, %r378; + setp.ge.u32 %p155, %r377, %r379; + setp.ne.b32 %p156, %r377, %r379; + setp.ne.b32 %p157, %r378, %r376; + setp.le.u32 %p158, %r383, %r385; + setp.le.u32 %p159, %r382, %r384; + or.pred %p160, %p159, %p157; + or.pred %p161, %p158, %p156; + and.pred %p162, %p155, %p161; + and.pred %p163, %p154, %p160; + xor.pred %p164, %p163, %p4; + xor.pred %p165, %p162, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r386, %r376, %r378; + xor.b32 %r387, %r379, %r377; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r388, 0, %r387, %p165; + selp.b32 %r389, 0, %r386, %p164; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r390, %r389, %r376; + xor.b32 %r391, %r388, %r377; + xor.b32 %r392, %r388, %r379; + xor.b32 %r393, %r389, %r378; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r394, %r384, %r382; + xor.b32 %r395, %r385, %r383; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r396, 0, %r394, %p164; + selp.b32 %r397, 0, %r395, %p165; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r398, %r396, %r382; + xor.b32 %r399, %r397, %r383; + xor.b32 %r400, %r396, %r384; + xor.b32 %r401, %r397, %r385; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.ge.u32 %p166, %r390, %r391; + setp.ne.b32 %p167, %r390, %r391; + setp.le.u32 %p168, %r398, %r399; + or.pred %p169, %p167, %p168; + and.pred %p170, %p166, %p169; + xor.pred %p171, %p170, %p4; + setp.ge.u32 %p172, %r393, %r392; + setp.ne.b32 %p173, %r393, %r392; + setp.le.u32 %p174, %r400, %r401; + or.pred %p175, %p173, %p174; + and.pred %p176, %p172, %p175; + xor.pred %p177, %p176, %p4; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r402, %r390, %r391; + xor.b32 %r403, %r393, %r392; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r404, 0, %r402, %p171; + selp.b32 %r405, 0, %r403, %p177; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r406, %r404, %r390; + xor.b32 %r407, %r404, %r391; + xor.b32 %r408, %r405, %r393; + xor.b32 %r409, %r405, %r392; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r410, %r398, %r399; + xor.b32 %r411, %r400, %r401; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r412, 0, %r410, %p171; + selp.b32 %r413, 0, %r411, %p177; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r414, %r406, %r33; + mul.lo.s32 %r415, %r407, %r33; + mul.lo.s32 %r416, %r408, %r33; + mul.lo.s32 %r417, %r409, %r33; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r418, %r406, %r30; + mul.lo.s32 %r419, %r407, %r30; + mul.lo.s32 %r420, %r408, %r30; + mul.lo.s32 %r421, %r409, %r30; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r422, %r413, %r401; + xor.b32 %r423, %r413, %r400; + xor.b32 %r424, %r412, %r399; + xor.b32 %r425, %r412, %r398; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r426, %r425, %r33; + mul.lo.s32 %r427, %r424, %r33; + mul.lo.s32 %r428, %r423, %r33; + mul.lo.s32 %r429, %r422, %r33; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r430, %r425, %r30; + mul.lo.s32 %r431, %r424, %r30; + mul.lo.s32 %r432, %r423, %r30; + mul.lo.s32 %r433, %r422, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r434, %r414, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r435, %r434, %r414; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r436, %r415, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r437, %r436, %r415; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r438, %r416, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r439, %r438, %r416; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r440, %r417, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r441, %r440, %r417; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r442, %r418, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r443, %r442, %r418; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r444, %r419, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r445, %r444, %r419; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r446, %r420, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r447, %r446, %r420; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r448, %r421, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r449, %r448, %r421; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.ge.s32 %p178, %r437, %r445; + setp.ne.b32 %p179, %r437, %r445; + setp.ge.s32 %p180, %r435, %r443; + setp.ge.s32 %p181, %r439, %r447; + setp.ne.b32 %p182, %r439, %r447; + setp.ne.b32 %p183, %r435, %r443; + setp.ge.s32 %p184, %r441, %r449; + setp.ne.b32 %p185, %r441, %r449; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r450, %r443, %r435; + xor.b32 %r451, %r445, %r437; + xor.b32 %r452, %r447, %r439; + xor.b32 %r453, %r449, %r441; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r454, %r426, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r455, %r454, %r426; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r456, %r427, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r457, %r456, %r427; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r458, %r428, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r459, %r458, %r428; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r460, %r429, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r461, %r460, %r429; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r462, %r430, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r463, %r462, %r430; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r464, %r431, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r465, %r464, %r431; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r466, %r432, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r467, %r466, %r432; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r468, %r433, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r469, %r468, %r433; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.le.s32 %p186, %r459, %r467; + setp.le.s32 %p187, %r455, %r463; + or.pred %p188, %p183, %p187; + or.pred %p189, %p182, %p186; + and.pred %p190, %p181, %p189; + and.pred %p191, %p180, %p188; + xor.pred %p192, %p191, %p5; + xor.pred %p193, %p190, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r470, 0, %r452, %p193; + selp.b32 %r471, 0, %r450, %p192; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r472, %r471, %r406; + xor.b32 %r473, %r470, %r408; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.le.s32 %p194, %r461, %r469; + setp.le.s32 %p195, %r457, %r465; + or.pred %p196, %p179, %p195; + or.pred %p197, %p185, %p194; + and.pred %p198, %p184, %p197; + and.pred %p199, %p178, %p196; + xor.pred %p200, %p199, %p5; + xor.pred %p201, %p198, %p5; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r474, 0, %r453, %p201; + selp.b32 %r475, 0, %r451, %p200; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r476, %r475, %r407; + xor.b32 %r477, %r474, %r409; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r478, %r463, %r455; + xor.b32 %r479, %r465, %r457; + xor.b32 %r480, %r467, %r459; + xor.b32 %r481, %r469, %r461; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r482, 0, %r481, %p201; + selp.b32 %r483, 0, %r479, %p200; + selp.b32 %r484, 0, %r480, %p193; + selp.b32 %r485, 0, %r478, %p192; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r486, %r485, %r425; + xor.b32 %r487, %r484, %r423; + xor.b32 %r488, %r483, %r424; + xor.b32 %r489, %r482, %r422; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.ge.s32 %p202, %r472, %r473; + setp.ne.b32 %p203, %r472, %r473; + setp.le.s32 %p204, %r488, %r489; + setp.le.s32 %p205, %r486, %r487; + or.pred %p206, %p203, %p205; + and.pred %p207, %p202, %p206; + setp.ge.s32 %p208, %r476, %r477; + setp.ne.b32 %p209, %r476, %r477; + or.pred %p210, %p209, %p204; + and.pred %p211, %p208, %p210; + xor.pred %p212, %p211, %p5; + xor.pred %p213, %p207, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r490, %r473, %r472; + xor.b32 %r491, %r477, %r476; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r492, 0, %r490, %p213; + selp.b32 %r493, 0, %r491, %p212; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r494, %r492, %r472; + xor.b32 %r495, %r493, %r476; + xor.b32 %r496, %r492, %r473; + xor.b32 %r497, %r493, %r477; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r498, %r487, %r486; + xor.b32 %r499, %r489, %r488; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r500, 0, %r499, %p212; + selp.b32 %r501, 0, %r498, %p213; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r502, %r501, %r487; + xor.b32 %r503, %r500, %r488; + xor.b32 %r504, %r500, %r489; + xor.b32 %r505, %r501, %r486; + .loc 2 599 28 // triton_helpers.py:599:28 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.ge.s32 %p214, %r494, %r495; + setp.ne.b32 %p215, %r494, %r495; + setp.le.s32 %p216, %r505, %r503; + or.pred %p217, %p215, %p216; + and.pred %p218, %p214, %p217; + xor.pred %p219, %p218, %p5; + setp.ge.s32 %p220, %r496, %r497; + setp.ne.b32 %p221, %r496, %r497; + setp.le.s32 %p222, %r502, %r504; + or.pred %p223, %p221, %p222; + and.pred %p224, %p220, %p223; + xor.pred %p225, %p224, %p5; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r506, %r495, %r494; + xor.b32 %r507, %r497, %r496; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r508, 0, %r506, %p219; + selp.b32 %r509, 0, %r507, %p225; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r510, %r509, %r496; + xor.b32 %r511, %r508, %r494; + xor.b32 %r512, %r509, %r497; + xor.b32 %r513, %r508, %r495; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r514, %r503, %r505; + xor.b32 %r515, %r504, %r502; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r516, 0, %r515, %p225; + selp.b32 %r517, 0, %r514, %p219; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r522, %r517, %r503; + xor.b32 %r523, %r516, %r502; + xor.b32 %r524, %r517, %r505; + xor.b32 %r525, %r516, %r504; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r526, %r511, %r34; + mul.lo.s32 %r527, %r513, %r34; + mul.lo.s32 %r528, %r510, %r34; + mul.lo.s32 %r529, %r512, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r530, %r526, 2, 31, -1; + shfl.sync.bfly.b32 %r531, %r527, 2, 31, -1; + shfl.sync.bfly.b32 %r532, %r528, 2, 31, -1; + shfl.sync.bfly.b32 %r533, %r529, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r534, %r527, %r531; + add.s32 %r535, %r529, %r533; + add.s32 %r536, %r526, %r530; + add.s32 %r537, %r528, %r532; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r538, %r511, %r32; + mul.lo.s32 %r539, %r513, %r32; + mul.lo.s32 %r540, %r510, %r32; + mul.lo.s32 %r541, %r512, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r542, %r538, 2, 31, -1; + shfl.sync.bfly.b32 %r543, %r539, 2, 31, -1; + shfl.sync.bfly.b32 %r544, %r540, 2, 31, -1; + shfl.sync.bfly.b32 %r545, %r541, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r546, %r539, %r543; + add.s32 %r547, %r541, %r545; + add.s32 %r548, %r538, %r542; + add.s32 %r549, %r540, %r544; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r550, %r525, %r34; + mul.lo.s32 %r551, %r523, %r34; + mul.lo.s32 %r552, %r522, %r34; + mul.lo.s32 %r553, %r524, %r34; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r554, %r553, 2, 31, -1; + shfl.sync.bfly.b32 %r555, %r552, 2, 31, -1; + shfl.sync.bfly.b32 %r556, %r551, 2, 31, -1; + shfl.sync.bfly.b32 %r557, %r550, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r558, %r551, %r556; + add.s32 %r559, %r553, %r554; + add.s32 %r560, %r550, %r557; + add.s32 %r561, %r552, %r555; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r562, %r525, %r32; + mul.lo.s32 %r563, %r523, %r32; + mul.lo.s32 %r564, %r522, %r32; + mul.lo.s32 %r565, %r524, %r32; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r566, %r565, 2, 31, -1; + shfl.sync.bfly.b32 %r567, %r564, 2, 31, -1; + shfl.sync.bfly.b32 %r568, %r563, 2, 31, -1; + shfl.sync.bfly.b32 %r569, %r562, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + add.s32 %r570, %r563, %r568; + add.s32 %r571, %r565, %r566; + add.s32 %r572, %r562, %r569; + add.s32 %r573, %r564, %r567; + .loc 2 574 22 // triton_helpers.py:574:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.lt.s32 %p226, %r537, %r549; + setp.lt.s32 %p227, %r536, %r548; + setp.lt.s32 %p228, %r535, %r547; + setp.lt.s32 %p229, %r534, %r546; + .loc 2 591 21 // triton_helpers.py:591:21 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.eq.b32 %p230, %r534, %r546; + setp.eq.b32 %p231, %r535, %r547; + setp.eq.b32 %p232, %r536, %r548; + setp.eq.b32 %p233, %r537, %r549; + .loc 2 594 40 // triton_helpers.py:594:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + setp.gt.s32 %p234, %r561, %r573; + setp.gt.s32 %p235, %r560, %r572; + setp.gt.s32 %p236, %r559, %r571; + setp.gt.s32 %p237, %r558, %r570; + .loc 2 594 29 // triton_helpers.py:594:29 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + and.pred %p238, %p233, %p237; + and.pred %p239, %p232, %p236; + and.pred %p240, %p231, %p235; + and.pred %p241, %p230, %p234; + .loc 2 594 23 // triton_helpers.py:594:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + or.pred %p242, %p229, %p241; + or.pred %p243, %p228, %p240; + or.pred %p244, %p227, %p239; + or.pred %p245, %p226, %p238; + .loc 2 600 38 // triton_helpers.py:600:38 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r574, %r534, %r546; + xor.b32 %r575, %r535, %r547; + xor.b32 %r576, %r536, %r548; + xor.b32 %r577, %r537, %r549; + .loc 2 600 46 // triton_helpers.py:600:46 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r578, %r577, 0, %p245; + selp.b32 %r579, %r576, 0, %p244; + selp.b32 %r580, %r575, 0, %p243; + selp.b32 %r581, %r574, 0, %p242; + .loc 2 600 15 // triton_helpers.py:600:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r582, %r581, %r513; + xor.b32 %r583, %r579, %r511; + xor.b32 %r584, %r578, %r510; + xor.b32 %r585, %r580, %r512; + .loc 2 601 48 // triton_helpers.py:601:48 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r586, %r561, %r573; + xor.b32 %r587, %r558, %r570; + xor.b32 %r588, %r560, %r572; + xor.b32 %r589, %r559, %r571; + .loc 2 601 59 // triton_helpers.py:601:59 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + selp.b32 %r590, %r589, 0, %p244; + selp.b32 %r591, %r588, 0, %p243; + selp.b32 %r592, %r587, 0, %p245; + selp.b32 %r593, %r586, 0, %p242; + .loc 2 601 22 // triton_helpers.py:601:22 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + xor.b32 %r598, %r593, %r522; + xor.b32 %r599, %r590, %r524; + xor.b32 %r600, %r591, %r525; + xor.b32 %r601, %r592, %r523; + .loc 2 538 40 // triton_helpers.py:538:40 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r602, %r585, %r33; + mul.lo.s32 %r603, %r584, %r33; + mul.lo.s32 %r604, %r582, %r33; + mul.lo.s32 %r605, %r583, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r606, %r605, 1, 31, -1; + shfl.sync.bfly.b32 %r607, %r604, 1, 31, -1; + shfl.sync.bfly.b32 %r608, %r603, 1, 31, -1; + shfl.sync.bfly.b32 %r609, %r602, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r614, %r585, %r30; + mul.lo.s32 %r615, %r584, %r30; + mul.lo.s32 %r616, %r582, %r30; + mul.lo.s32 %r617, %r583, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r618, %r617, 1, 31, -1; + shfl.sync.bfly.b32 %r619, %r616, 1, 31, -1; + shfl.sync.bfly.b32 %r620, %r615, 1, 31, -1; + shfl.sync.bfly.b32 %r621, %r614, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r626, %r599, %r33; + mul.lo.s32 %r627, %r598, %r33; + mul.lo.s32 %r628, %r601, %r33; + mul.lo.s32 %r629, %r600, %r33; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r630, %r626, 1, 31, -1; + shfl.sync.bfly.b32 %r631, %r627, 1, 31, -1; + shfl.sync.bfly.b32 %r632, %r628, 1, 31, -1; + shfl.sync.bfly.b32 %r633, %r629, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + mul.lo.s32 %r638, %r599, %r30; + mul.lo.s32 %r639, %r598, %r30; + mul.lo.s32 %r640, %r601, %r30; + mul.lo.s32 %r641, %r600, %r30; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:51:71 ] + shfl.sync.bfly.b32 %r642, %r638, 1, 31, -1; + shfl.sync.bfly.b32 %r643, %r639, 1, 31, -1; + shfl.sync.bfly.b32 %r644, %r640, 1, 31, -1; + shfl.sync.bfly.b32 %r645, %r641, 1, 31, -1; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:55:26 ] + shfl.sync.bfly.b32 %r720, %r367, 2, 31, -1; + mov.b32 %r721, 0; + shfl.sync.bfly.b32 %r722, %r721, 2, 31, -1; + cvt.u64.u32 %rd28, %r720; + cvt.u64.u32 %rd29, %r722; + shl.b64 %rd30, %rd29, 32; + or.b64 %rd31, %rd28, %rd30; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:55:26 ] + add.s64 %rd32, %rd27, %rd31; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:55:26 ] + mov.b64 {_, %r723}, %rd32; + cvt.u32.u64 %r724, %rd32; + shfl.sync.bfly.b32 %r725, %r724, 1, 31, -1; + shfl.sync.bfly.b32 %r726, %r723, 1, 31, -1; + cvt.u64.u32 %rd33, %r725; + cvt.u64.u32 %rd34, %r726; + shl.b64 %rd35, %rd34, 32; + or.b64 %rd36, %rd33, %rd35; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:55:26 ] + add.s64 %rd37, %rd32, %rd36; +$L__tmp11: + .loc 1 60 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:60:21 + st.shared.b64 [%r38], %rd37; + bar.sync 0; + ld.shared.b64 %rd2, [%r40]; + .loc 1 58 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:58:35 + and.pred %p282, %p2, %p149; + selp.b16 %rs13, 1, 0, %p282; + shl.b16 %rs14, %rs13, 2; + and.pred %p283, %p2, %p148; + selp.b16 %rs15, -1, 0, %p283; + shl.b16 %rs16, %rs15, 3; + or.b16 %rs17, %rs16, %rs14; + and.pred %p284, %p2, %p151; + selp.b16 %rs18, 1, 0, %p284; + and.pred %p285, %p2, %p150; + selp.b16 %rs19, -1, 0, %p285; + shl.b16 %rs20, %rs19, 1; + or.b16 %rs21, %rs18, %rs20; + and.b16 %rs22, %rs21, 3; + or.b16 %rs23, %rs22, %rs17; +$L__tmp12: + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:59:26 ] + and.b16 %rs24, %rs23, 15; + cvt.u32.u16 %r727, %rs24; + popc.b32 %r728, %r727; + cvt.u64.u32 %rd38, %r728; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:59:26 ] + shfl.sync.bfly.b32 %r729, %r728, 2, 31, -1; + shfl.sync.bfly.b32 %r730, %r721, 2, 31, -1; + cvt.u64.u32 %rd39, %r729; + cvt.u64.u32 %rd40, %r730; + shl.b64 %rd41, %rd40, 32; + or.b64 %rd42, %rd39, %rd41; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:59:26 ] + add.s64 %rd43, %rd38, %rd42; + .loc 3 291 36 // standard.py:291:36 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:59:26 ] + mov.b64 {_, %r731}, %rd43; + cvt.u32.u64 %r732, %rd43; + shfl.sync.bfly.b32 %r733, %r732, 1, 31, -1; + shfl.sync.bfly.b32 %r734, %r731, 1, 31, -1; + cvt.u64.u32 %rd44, %r733; + cvt.u64.u32 %rd45, %r734; + shl.b64 %rd46, %rd45, 32; + or.b64 %rd47, %rd44, %rd46; + .loc 3 261 15 // standard.py:261:15 @[ c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:59:26 ] + add.s64 %rd3, %rd43, %rd47; +$L__tmp13: + .loc 1 61 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:61:21 + bar.sync 0; + st.shared.b64 [%r38], %rd3; + bar.sync 0; + .loc 1 60 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:60:21 + cvt.u32.u64 %r735, %rd37; + .loc 1 64 19 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:64:19 + setp.lt.s32 %p286, %r7, %r735; + setp.lt.s32 %p287, %r6, %r735; + setp.lt.s32 %p288, %r8, %r735; + setp.lt.s32 %p289, %r9, %r735; + .loc 1 66 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:66:35 + selp.b32 %r736, %r10, 16, %p289; + selp.b32 %r737, %r11, 16, %p288; + selp.b32 %r738, %r13, 16, %p287; + selp.b32 %r739, %r12, 16, %p286; + .loc 1 68 20 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:68:20 + add.s32 %r740, %r739, 17; + add.s32 %r741, %r738, 17; + add.s32 %r742, %r737, 17; + add.s32 %r743, %r736, 17; + .loc 1 69 20 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:69:20 + setp.lt.s32 %p290, %r739, 0; + setp.lt.s32 %p291, %r738, 0; + setp.lt.s32 %p292, %r737, 0; + setp.lt.s32 %p293, %r736, 0; + .loc 1 70 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:70:35 + selp.b32 %r18, %r743, %r736, %p293; + selp.b32 %r19, %r742, %r737, %p292; + selp.b32 %r21, %r741, %r738, %p291; + selp.b32 %r20, %r740, %r739, %p290; + .loc 1 71 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:71:63 + max.u32 %r744, %r21, %r20; + max.u32 %r745, %r19, %r744; + max.u32 %r746, %r18, %r745; + setp.lt.u32 %p294, %r746, 17; + or.pred %p295, %p3, %p294; + @%p295 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + .loc 1 0 0 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:0 + xor.b32 %r518, %r501, %r517; + xor.b32 %r519, %r500, %r516; + xor.b32 %r520, %r501, %r516; + xor.b32 %r521, %r500, %r517; + xor.b32 %r594, %r521, %r593; + xor.b32 %r595, %r520, %r592; + xor.b32 %r596, %r519, %r591; + xor.b32 %r597, %r518, %r590; + add.s32 %r610, %r604, %r607; + add.s32 %r611, %r602, %r609; + add.s32 %r612, %r605, %r606; + add.s32 %r613, %r603, %r608; + add.s32 %r622, %r616, %r619; + add.s32 %r623, %r614, %r621; + add.s32 %r624, %r617, %r618; + add.s32 %r625, %r615, %r620; + add.s32 %r634, %r628, %r632; + add.s32 %r635, %r627, %r631; + add.s32 %r636, %r626, %r630; + add.s32 %r637, %r629, %r633; + add.s32 %r646, %r644, %r640; + add.s32 %r647, %r643, %r639; + add.s32 %r648, %r642, %r638; + add.s32 %r649, %r645, %r641; + setp.lt.s32 %p246, %r613, %r625; + setp.lt.s32 %p247, %r612, %r624; + setp.lt.s32 %p248, %r611, %r623; + setp.lt.s32 %p249, %r610, %r622; + setp.eq.b32 %p250, %r610, %r622; + setp.eq.b32 %p251, %r611, %r623; + setp.eq.b32 %p252, %r612, %r624; + setp.eq.b32 %p253, %r613, %r625; + setp.gt.s32 %p254, %r635, %r647; + setp.gt.s32 %p255, %r637, %r649; + setp.gt.s32 %p256, %r636, %r648; + setp.gt.s32 %p257, %r634, %r646; + and.pred %p258, %p253, %p257; + and.pred %p259, %p252, %p256; + and.pred %p260, %p251, %p255; + and.pred %p261, %p250, %p254; + or.pred %p262, %p249, %p261; + or.pred %p263, %p248, %p260; + or.pred %p264, %p247, %p259; + or.pred %p265, %p246, %p258; + xor.b32 %r650, %r610, %r622; + xor.b32 %r651, %r611, %r623; + xor.b32 %r652, %r612, %r624; + xor.b32 %r653, %r613, %r625; + selp.b32 %r654, %r653, 0, %p265; + selp.b32 %r655, %r652, 0, %p264; + selp.b32 %r656, %r651, 0, %p263; + selp.b32 %r657, %r650, 0, %p262; + xor.b32 %r658, %r654, %r584; + xor.b32 %r659, %r655, %r583; + xor.b32 %r660, %r657, %r582; + xor.b32 %r661, %r656, %r585; + xor.b32 %r662, %r646, %r634; + xor.b32 %r663, %r647, %r635; + xor.b32 %r664, %r648, %r636; + xor.b32 %r665, %r649, %r637; + selp.b32 %r666, %r663, 0, %p262; + selp.b32 %r667, %r662, 0, %p265; + selp.b32 %r668, %r665, 0, %p263; + selp.b32 %r669, %r664, 0, %p264; + xor.b32 %r670, %r597, %r669; + xor.b32 %r671, %r596, %r668; + xor.b32 %r672, %r669, %r599; + xor.b32 %r673, %r667, %r601; + xor.b32 %r674, %r666, %r598; + xor.b32 %r675, %r668, %r600; + setp.lt.s32 %p266, %r659, %r658; + setp.lt.s32 %p267, %r660, %r661; + setp.eq.b32 %p268, %r659, %r658; + setp.eq.b32 %p269, %r660, %r661; + setp.gt.s32 %p270, %r672, %r673; + setp.gt.s32 %p271, %r674, %r675; + and.pred %p272, %p268, %p270; + and.pred %p273, %p269, %p271; + or.pred %p274, %p266, %p272; + or.pred %p275, %p267, %p273; + xor.b32 %r676, %r658, %r659; + xor.b32 %r677, %r661, %r660; + selp.b32 %r678, %r677, 0, %p275; + selp.b32 %r679, %r676, 0, %p274; + xor.b32 %r680, %r679, %r658; + xor.b32 %r681, %r678, %r660; + xor.b32 %r682, %r679, %r659; + xor.b32 %r683, %r678, %r661; + xor.b32 %r684, %r675, %r674; + xor.b32 %r685, %r673, %r672; + selp.b32 %r686, %r685, 0, %p274; + selp.b32 %r687, %r684, 0, %p275; + xor.b32 %r688, %r671, %r687; + xor.b32 %r689, %r667, %r686; + xor.b32 %r690, %r689, %r595; + xor.b32 %r691, %r666, %r687; + xor.b32 %r692, %r691, %r594; + xor.b32 %r693, %r670, %r686; + xor.b32 %r694, %r687, %r675; + xor.b32 %r695, %r687, %r674; + xor.b32 %r696, %r686, %r673; + xor.b32 %r697, %r686, %r672; + setp.lt.s32 %p276, %r682, %r681; + setp.lt.s32 %p277, %r680, %r683; + setp.eq.b32 %p278, %r681, %r682; + setp.eq.b32 %p279, %r680, %r683; + setp.gt.s32 %p280, %r697, %r695; + setp.gt.s32 %p281, %r696, %r694; + xor.b32 %r698, %r689, %r668; + xor.b32 %r699, %r691, %r669; + xor.b32 %r700, %r699, %r522; + xor.b32 %r701, %r698, %r523; + xor.b32 %r702, %r701, %r592; + xor.b32 %r703, %r700, %r593; + xor.b32 %r704, %r703, %r524; + xor.b32 %r705, %r702, %r525; + xor.b32 %r706, %r705, %r591; + xor.b32 %r707, %r704, %r590; + xor.b32 %r708, %r707, %r686; + xor.b32 %r709, %r706, %r687; + selp.b32 %r710, %r709, 0, %p281; + selp.b32 %r711, %r710, 0, %p279; + selp.b32 %r712, %r709, %r711, %p277; + selp.b32 %r713, %r708, 0, %p280; + selp.b32 %r714, %r713, 0, %p278; + selp.b32 %r715, %r708, %r714, %p276; + xor.b32 %r716, %r693, %r715; + xor.b32 %r717, %r692, %r715; + xor.b32 %r718, %r690, %r712; + xor.b32 %r719, %r688, %r712; + xor.b32 %r17, %r719, %r489; + xor.b32 %r16, %r718, %r487; + xor.b32 %r15, %r717, %r488; + xor.b32 %r14, %r716, %r486; + .loc 1 61 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:61:21 + ld.shared.b64 %rd4, [%r40]; + cvt.u32.u64 %r747, %rd3; + .loc 1 71 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:71:63 + bar.sync 0; + .loc 1 75 19 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:75:19 + setp.lt.s32 %p297, %r8, %r747; + setp.lt.s32 %p298, %r9, %r747; + setp.lt.s32 %p299, %r6, %r747; + setp.lt.s32 %p300, %r7, %r747; + .loc 1 76 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:76:35 + selp.b32 %r748, %r15, 16, %p300; + selp.b32 %r749, %r14, 16, %p299; + selp.b32 %r750, %r17, 16, %p298; + selp.b32 %r751, %r16, 16, %p297; + .loc 1 77 20 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:77:20 + add.s32 %r752, %r751, 17; + add.s32 %r753, %r750, 17; + add.s32 %r754, %r749, 17; + add.s32 %r755, %r748, 17; + .loc 1 78 20 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:78:20 + setp.lt.s32 %p301, %r751, 0; + setp.lt.s32 %p302, %r750, 0; + setp.lt.s32 %p303, %r749, 0; + setp.lt.s32 %p304, %r748, 0; + .loc 1 79 35 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:79:35 + selp.b32 %r23, %r755, %r748, %p304; + selp.b32 %r22, %r754, %r749, %p303; + selp.b32 %r25, %r753, %r750, %p302; + selp.b32 %r24, %r752, %r751, %p301; + .loc 1 80 38 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:80:38 + setp.gt.u32 %p305, %r24, 16; + selp.b16 %rs25, 1, 0, %p305; + shl.b16 %rs26, %rs25, 2; + setp.gt.u32 %p306, %r25, 16; + selp.b16 %rs27, -1, 0, %p306; + shl.b16 %rs28, %rs27, 3; + or.b16 %rs29, %rs28, %rs26; + setp.gt.u32 %p307, %r22, 16; + selp.b16 %rs30, 1, 0, %p307; + setp.gt.u32 %p308, %r23, 16; + selp.b16 %rs31, -1, 0, %p308; + shl.b16 %rs32, %rs31, 1; + or.b16 %rs33, %rs30, %rs32; + and.b16 %rs34, %rs33, 3; + or.b16 %rs35, %rs34, %rs29; + .loc 1 80 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:80:63 + and.b16 %rs36, %rs35, 15; + setp.eq.b16 %p309, %rs36, 0; + or.pred %p310, %p3, %p309; + @%p310 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r41; + .loc 1 61 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:61:21 + cvt.u32.u64 %r757, %rd4; + .loc 1 60 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:60:21 + cvt.u32.u64 %r756, %rd2; + .loc 1 25 23 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:25:23 + or.b32 %r774, %r1, %r3; + .loc 1 26 21 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:26:21 + setp.lt.s32 %p314, %r774, 128; + .loc 1 80 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:80:63 + bar.sync 0; + .loc 1 81 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:81:25 + mul.wide.s32 %rd60, %r774, 4; + add.s64 %rd48, %rd5, %rd60; + .loc 1 81 37 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:81:37 + and.b32 %r775, %r2, 96; + setp.eq.b32 %p323, %r775, 0; + and.pred %p311, %p323, %p314; + // begin inline asm + @%p311 st.global.b32 [ %rd48 + 0 ], { %r756 }; + // end inline asm + .loc 1 82 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:82:25 + add.s64 %rd49, %rd6, %rd60; + .loc 1 82 37 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:82:37 + // begin inline asm + @%p311 st.global.b32 [ %rd49 + 0 ], { %r757 }; + // end inline asm + .loc 1 83 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:83:25 + shl.b64 %rd61, %rd1, 2; + add.s64 %rd50, %rd7, %rd61; + .loc 1 83 47 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:83:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd50 + 0 ], { %r13, %r12, %r11, %r10 }; + // end inline asm + .loc 1 84 52 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:84:52 + mul.lo.s32 %r776, %r4, 17; + .loc 1 84 49 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:84:49 + add.s32 %r777, %r21, %r776; + add.s32 %r778, %r20, %r776; + add.s32 %r779, %r19, %r776; + add.s32 %r780, %r18, %r776; + .loc 1 84 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:84:25 + mad.wide.s32 %rd62, %r777, 4, %rd8; + mad.wide.s32 %rd63, %r778, 4, %rd8; + mad.wide.s32 %rd64, %r779, 4, %rd8; + mad.wide.s32 %rd65, %r780, 4, %rd8; + .loc 1 84 85 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:84:85 + bar.sync 0; + and.b32 %r781, %r2, 48; + shl.b32 %r782, %r781, 6; + shl.b32 %r783, %r2, 3; + and.b32 %r784, %r783, 120; + shr.u32 %r785, %r781, 1; + shl.b32 %r786, %r2, 1; + and.b32 %r787, %r786, 128; + or.b32 %r788, %r782, %r784; + xor.b32 %r789, %r788, %r785; + add.s32 %r791, %r37, %r787; + add.s32 %r792, %r791, %r789; + st.shared.b64 [%r792], %rd62; + st.shared.b64 [%r792+256], %rd63; + st.shared.b64 [%r792+512], %rd64; + st.shared.b64 [%r792+768], %rd65; + bar.sync 0; + and.b32 %r793, %r2, 12; + shl.b32 %r794, %r793, 8; + shl.b32 %r795, %r5, 5; + and.b32 %r796, %r783, 896; + shl.b32 %r797, %r793, 1; + or.b32 %r798, %r794, %r795; + or.b32 %r799, %r798, %r796; + or.b32 %r800, %r799, %r797; + add.s32 %r801, %r37, %r800; + ld.shared.b64 %rd51, [%r801]; + xor.b32 %r802, %r800, 8; + add.s32 %r803, %r37, %r802; + ld.shared.b64 %rd52, [%r803]; + xor.b32 %r804, %r800, 16; + add.s32 %r805, %r37, %r804; + ld.shared.b64 %rd53, [%r805]; + xor.b32 %r806, %r800, 24; + add.s32 %r807, %r37, %r806; + ld.shared.b64 %rd54, [%r807]; + mov.b32 %r762, 1; + // begin inline asm + @%p314 st.global.b32 [ %rd51 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd52 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd53 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd54 + 0 ], { %r762 }; + // end inline asm + .loc 1 85 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:85:25 + add.s64 %rd55, %rd9, %rd61; + .loc 1 85 47 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:85:47 + // begin inline asm + @%p2 st.global.v4.b32 [ %rd55 + 0 ], { %r14, %r15, %r16, %r17 }; + // end inline asm + .loc 1 86 49 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:86:49 + add.s32 %r808, %r22, %r776; + add.s32 %r809, %r23, %r776; + add.s32 %r810, %r24, %r776; + add.s32 %r811, %r25, %r776; + .loc 1 86 25 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:86:25 + mad.wide.s32 %rd66, %r808, 4, %rd10; + mad.wide.s32 %rd67, %r809, 4, %rd10; + mad.wide.s32 %rd68, %r810, 4, %rd10; + mad.wide.s32 %rd69, %r811, 4, %rd10; + .loc 1 86 85 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:86:85 + bar.sync 0; + st.shared.b64 [%r792], %rd66; + st.shared.b64 [%r792+256], %rd67; + st.shared.b64 [%r792+512], %rd68; + st.shared.b64 [%r792+768], %rd69; + bar.sync 0; + ld.shared.b64 %rd56, [%r801]; + ld.shared.b64 %rd57, [%r803]; + ld.shared.b64 %rd58, [%r805]; + ld.shared.b64 %rd59, [%r807]; + // begin inline asm + @%p314 st.global.b32 [ %rd56 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd57 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd58 + 0 ], { %r762 }; + // end inline asm + // begin inline asm + @%p314 st.global.b32 [ %rd59 + 0 ], { %r762 }; + // end inline asm + .loc 1 86 4 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:71:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd76, assertFunc_0; + cvta.global.u64 %rd77, %rd76; + st.param.b64 [param3], %rd77; + mov.b64 %rd78, assertFile_0; + cvta.global.u64 %rd79, %rd78; + st.param.b64 [param1], %rd79; + mov.b64 %rd80, assertMessage_0; + cvta.global.u64 %rd81, %rd80; + st.param.b64 [param0], %rd81; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__BB0_3: + .loc 1 80 63 // c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py:80:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd70, assertFunc_1; + cvta.global.u64 %rd71, %rd70; + st.param.b64 [param3], %rd71; + mov.b64 %rd72, assertFile_1; + cvta.global.u64 %rd73, %rd72; + st.param.b64 [param1], %rd73; + mov.b64 %rd74, assertMessage_1; + cvta.global.u64 %rd75, %rd74; + st.param.b64 [param0], %rd75; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__tmp14: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 376 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x171 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 54 +.b8 113 +.b8 118 +.b8 101 +.b8 119 +.b8 121 +.b8 100 +.b8 105 +.b8 118 +.b8 103 +.b8 52 +.b8 98 +.b8 118 +.b8 99 +.b8 52 +.b8 113 +.b8 121 +.b8 122 +.b8 102 +.b8 103 +.b8 121 +.b8 98 +.b8 120 +.b8 97 +.b8 54 +.b8 117 +.b8 115 +.b8 109 +.b8 52 +.b8 102 +.b8 55 +.b8 108 +.b8 98 +.b8 105 +.b8 114 +.b8 116 +.b8 103 +.b8 114 +.b8 51 +.b8 55 +.b8 110 +.b8 104 +.b8 109 +.b8 108 +.b8 119 +.b8 102 +.b8 101 +.b8 100 +.b8 54 +.b8 112 +.b8 53 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 54 +.b8 113 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x76 DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp6 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp11 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp12 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..d11fedd6538208842180230fb88bd3dd8743f84d --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1405 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":18:0) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc141 = loc(unknown) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc193 = loc("in_ptr0"(#loc)) +#loc194 = loc("out_ptr4"(#loc)) +#loc195 = loc("out_ptr5"(#loc)) +#loc196 = loc("out_ptr6"(#loc)) +#loc197 = loc("out_ptr7"(#loc)) +#loc198 = loc("out_ptr8"(#loc)) +#loc199 = loc("out_ptr9"(#loc)) +#loc200 = loc("xnumel"(#loc)) +#loc201 = loc("r0_numel"(#loc)) +#loc257 = loc("x"(#loc91)) +#loc258 = loc("idxs"(#loc91)) +#loc259 = loc("x"(#loc95)) +#loc260 = loc("idxs"(#loc95)) +#loc265 = loc("x"(#loc103)) +#loc266 = loc("idxs"(#loc103)) +#loc267 = loc("flip"(#loc103)) +#loc323 = loc("input"(#loc166)) +#loc324 = loc("a"(#loc170)) +#loc325 = loc("b"(#loc170)) +#loc327 = loc("x"(#loc175)) +#loc328 = loc("x"(#loc179)) +#loc329 = loc("input"(#loc188)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 128 : i32 loc(#loc202) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc203) + %xoffset = tt.get_program_id x : i32 loc(#loc204) + %xoffset_2 = arith.constant 32 : i32 loc(#loc205) + %xoffset_3 = arith.constant 32 : i32 loc(#loc205) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc205) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc206) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc207) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<32x1xi32> loc(#loc208) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<32x1xi32> loc(#loc208) + %xmask = arith.constant dense<128> : tensor<32x1xi32> loc(#loc209) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<32x1xi32> loc(#loc209) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc210) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc211) + %r0_offset = arith.constant 0 : i32 loc(#loc212) + %r0_mask = arith.constant true loc(#loc213) + %r0_mask_10 = arith.constant dense : tensor<32x16xi1> loc(#loc213) + %tmp0 = arith.constant 16 : i32 loc(#loc214) + %tmp0_11 = arith.constant 16 : i32 loc(#loc214) + %tmp0_12 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc214) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<32x1xi32> loc(#loc214) + %tmp0_14 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_15 = tt.broadcast %tmp0_13 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc215) + %tmp0_16 = arith.addi %tmp0_14, %tmp0_15 : tensor<32x16xi32> loc(#loc215) + %tmp0_17 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc216) + %tmp0_18 = tt.addptr %tmp0_17, %tmp0_16 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc216) + %tmp0_19 = arith.constant 0.000000e+00 : f32 loc(#loc217) + %tmp0_20 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc217) + %tmp0_21 = arith.constant dense<0.000000e+00> : tensor<32x16xf32> loc(#loc217) + %tmp0_22 = arith.fptosi %tmp0_21 : tensor<32x16xf32> to tensor<32x16xi64> loc(#loc217) + %tmp0_23 = tt.load %tmp0_18, %tmp0_20, %tmp0_22 : tensor<32x16x!tt.ptr> loc(#loc217) + %tmp1 = arith.constant 0 : i64 loc(#loc218) + %tmp1_24 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc218) + %tmp2 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc219) + %tmp2_25 = arith.cmpi sgt, %tmp0_23, %tmp2 : tensor<32x16xi64> loc(#loc219) + %tmp3 = arith.constant 16384 : i64 loc(#loc220) + %tmp3_26 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc220) + %tmp4 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc221) + %tmp4_27 = arith.cmpi slt, %tmp0_23, %tmp4 : tensor<32x16xi64> loc(#loc221) + %tmp5 = arith.andi %tmp2_25, %tmp4_27 : tensor<32x16xi1> loc(#loc222) + %tmp6 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc223) + %tmp7 = arith.extsi %tmp6 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc224) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc225) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc226) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc26) + %tmp14 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc227) + %tmp14_28 = arith.cmpi eq, %tmp0_23, %tmp14 : tensor<32x16xi64> loc(#loc227) + %tmp15 = arith.extui %tmp14_28 : tensor<32x16xi1> to tensor<32x16xi8> loc(#loc228) + %tmp16 = arith.extsi %tmp15 : tensor<32x16xi8> to tensor<32x16xi32> loc(#loc229) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp11) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc30) + %tmp20 = arith.extsi %tmp7 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc230) + %tmp23 = arith.constant 0 : i32 loc(#loc231) + %tmp23_29 = arith.constant 0 : i64 loc(#loc231) + %tmp23_30 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc231) + %tmp23_31 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc231) + %tmp23_32 = arith.select %tmp23_31, %tmp20, %tmp23_30 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc231) + %tmp24 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_32) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc232) + %tmp24_33 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc233) + %tmp25 = arith.extsi %tmp16 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc234) + %tmp28 = arith.constant 0 : i32 loc(#loc235) + %tmp28_34 = arith.constant 0 : i64 loc(#loc235) + %tmp28_35 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc235) + %tmp28_36 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc235) + %tmp28_37 = arith.select %tmp28_36, %tmp25, %tmp28_35 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc235) + %tmp29 = tt.call @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_37) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc236) + %tmp29_38 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc237) + %tmp30 = arith.trunci %tmp24_33 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc238) + %tmp31 = arith.trunci %tmp29_38 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc239) + %tmp32 = arith.extsi %0#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc240) + %tmp33 = arith.trunci %tmp32 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc241) + %tmp34 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_39 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc242) + %tmp34_40 = arith.cmpi slt, %tmp34, %tmp34_39 : tensor<32x16xi32> loc(#loc242) + %tmp35 = arith.constant 16 : i32 loc(#loc243) + %tmp35_41 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc243) + %tmp36 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc244) + %tmp36_42 = arith.select %tmp34_40, %tmp33, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc244) + %tmp37 = arith.constant 17 : i32 loc(#loc245) + %tmp37_43 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc245) + %tmp38 = arith.addi %tmp36_42, %tmp37_43 : tensor<32x16xi32> loc(#loc246) + %tmp39 = arith.constant 0 : i32 loc(#loc247) + %tmp39_44 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc247) + %tmp39_45 = arith.cmpi slt, %tmp36_42, %tmp39_44 : tensor<32x16xi32> loc(#loc247) + %tmp40 = arith.select %tmp39_45, %tmp38, %tmp36_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc248) + %c0_i32 = arith.constant 0 : i32 loc(#loc50) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc50) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<32x16xi32> loc(#loc50) + %c17_i32 = arith.constant 17 : i32 loc(#loc51) + %cst_46 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc51) + %3 = arith.cmpi slt, %tmp40, %cst_46 : tensor<32x16xi32> loc(#loc51) + %4 = arith.andi %2, %3 : tensor<32x16xi1> loc(#loc52) + %true = arith.constant true loc(#loc53) + %cst_47 = arith.constant dense : tensor<32x1xi1> loc(#loc53) + %5 = arith.xori %xmask_8, %cst_47 : tensor<32x1xi1> loc(#loc53) + %6 = tt.broadcast %5 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc54) + %7 = arith.ori %4, %6 : tensor<32x16xi1> loc(#loc54) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc55) + %tmp42 = arith.constant 1 : i32 loc(#loc249) + %tmp42_48 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc249) + %tmp43 = arith.extsi %1#1 : tensor<32x16xi32> to tensor<32x16xi64> loc(#loc250) + %tmp44 = arith.trunci %tmp43 : tensor<32x16xi64> to tensor<32x16xi32> loc(#loc251) + %tmp45 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_49 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc252) + %tmp45_50 = arith.cmpi slt, %tmp45, %tmp45_49 : tensor<32x16xi32> loc(#loc252) + %tmp46 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc253) + %tmp46_51 = arith.select %tmp45_50, %tmp44, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc253) + %tmp47 = arith.addi %tmp46_51, %tmp37_43 : tensor<32x16xi32> loc(#loc254) + %tmp48 = arith.constant 0 : i32 loc(#loc255) + %tmp48_52 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc255) + %tmp48_53 = arith.cmpi slt, %tmp46_51, %tmp48_52 : tensor<32x16xi32> loc(#loc255) + %tmp49 = arith.select %tmp48_53, %tmp47, %tmp46_51 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc256) + %c0_i32_54 = arith.constant 0 : i32 loc(#loc64) + %cst_55 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc64) + %8 = arith.cmpi sle, %cst_55, %tmp49 : tensor<32x16xi32> loc(#loc64) + %c17_i32_56 = arith.constant 17 : i32 loc(#loc65) + %cst_57 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc65) + %9 = arith.cmpi slt, %tmp49, %cst_57 : tensor<32x16xi32> loc(#loc65) + %10 = arith.andi %8, %9 : tensor<32x16xi1> loc(#loc66) + %true_58 = arith.constant true loc(#loc67) + %cst_59 = arith.constant dense : tensor<32x1xi1> loc(#loc67) + %11 = arith.xori %xmask_8, %cst_59 : tensor<32x1xi1> loc(#loc67) + %12 = tt.broadcast %11 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc68) + %13 = arith.ori %10, %12 : tensor<32x16xi1> loc(#loc68) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc69) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc70) + %15 = tt.addptr %14, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc70) + tt.store %15, %tmp30, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc71) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc72) + %17 = tt.addptr %16, %xindex_7 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc72) + tt.store %17, %tmp31, %xmask_8 : tensor<32x1x!tt.ptr> loc(#loc73) + %c16_i32 = arith.constant 16 : i32 loc(#loc74) + %c16_i32_60 = arith.constant 16 : i32 loc(#loc74) + %cst_61 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc74) + %18 = arith.muli %cst_61, %xindex_7 : tensor<32x1xi32> loc(#loc74) + %19 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc75) + %20 = tt.broadcast %18 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc75) + %21 = arith.addi %19, %20 : tensor<32x16xi32> loc(#loc75) + %22 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc76) + %23 = tt.addptr %22, %21 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc76) + %24 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc77) + tt.store %23, %tmp33, %24 : tensor<32x16x!tt.ptr> loc(#loc77) + %c17_i32_62 = arith.constant 17 : i32 loc(#loc78) + %c17_i32_63 = arith.constant 17 : i32 loc(#loc78) + %cst_64 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc78) + %25 = arith.muli %cst_64, %xindex_7 : tensor<32x1xi32> loc(#loc78) + %26 = tt.broadcast %25 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc79) + %27 = arith.addi %tmp40, %26 : tensor<32x16xi32> loc(#loc79) + %28 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc80) + %29 = tt.addptr %28, %27 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc80) + %cst_65 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc81) + %30 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc81) + tt.store %29, %cst_65, %30 : tensor<32x16x!tt.ptr> loc(#loc81) + %c16_i32_66 = arith.constant 16 : i32 loc(#loc82) + %c16_i32_67 = arith.constant 16 : i32 loc(#loc82) + %cst_68 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc82) + %31 = arith.muli %cst_68, %xindex_7 : tensor<32x1xi32> loc(#loc82) + %32 = tt.broadcast %r0_index_9 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc83) + %33 = tt.broadcast %31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc83) + %34 = arith.addi %32, %33 : tensor<32x16xi32> loc(#loc83) + %35 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc84) + %36 = tt.addptr %35, %34 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc84) + %37 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc85) + tt.store %36, %tmp44, %37 : tensor<32x16x!tt.ptr> loc(#loc85) + %c17_i32_69 = arith.constant 17 : i32 loc(#loc86) + %c17_i32_70 = arith.constant 17 : i32 loc(#loc86) + %cst_71 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc86) + %38 = arith.muli %cst_71, %xindex_7 : tensor<32x1xi32> loc(#loc86) + %39 = tt.broadcast %38 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc87) + %40 = arith.addi %tmp49, %39 : tensor<32x16xi32> loc(#loc87) + %41 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc88) + %42 = tt.addptr %41, %40 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc88) + %cst_72 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc89) + %43 = tt.broadcast %xmask_8 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc89) + tt.store %42, %cst_72, %43 : tensor<32x16x!tt.ptr> loc(#loc89) + tt.return loc(#loc90) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc91)), %idxs: tensor<32x16xi16> loc("idxs"(#loc91))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<32x16xi32>, tensor<32x16xi16>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc92) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc93) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc94) + %5 = ub.poison : tensor<32x16xi32> loc(#loc94) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc94) + } loc(#loc91) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i16S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi16> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi16>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %0#0, %0#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc102) + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %1, %2 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i16S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi16> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc282) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc283) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc284) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<256x2x1xi16> loc(#loc284) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc288) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc289) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<256x2x1xi16> loc(#loc289) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<256x2x1xi16>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_27 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_28 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_49 = arith.constant true loc(#loc300) + %cond_50 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<32x16xi1> loc(#loc300) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<32x16xi1> loc(#loc301) + %cond_53 = arith.ori %cond, %cond_52 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_53 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_50 = arith.ori %eq, %eq_49 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_50 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc306) + %cond_30 = arith.andi %3, %cond_29 : tensor<32x16xi1> loc(#loc307) + %cond_31 = arith.ori %1, %cond_30 : tensor<32x16xi1> loc(#loc308) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc309) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<32x16xi1> loc(#loc310) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<32x16xi1> loc(#loc311) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<32x16xi1> loc(#loc312) + %cond_36 = arith.extui %cond_35 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_37 = arith.xori %cond_36, %flip : tensor<32x16xi32> loc(#loc313) + %cond_38 = arith.constant 0 : i32 loc(#loc314) + %cond_39 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_43 = arith.xori %x, %ret_42 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<32x16xi32> loc(#loc319) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S32_16S__(%idxs) : (tensor<32x16xi16>) -> tensor<32x16xi16> loc(#loc320) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc321) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_47 = arith.extsi %idxs : tensor<32x16xi16> to tensor<32x16xi32> loc(#loc322) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_43, %new_idxs_48 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi32> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc170)), %b: i32 loc("b"(#loc170))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc171) + tt.return %0 : i32 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc173) + tt.return %1 : i32 loc(#loc173) + } loc(#loc170) + tt.func private @"triton.language.standard.sum__i16S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<256x2x1xi16> loc("input"(#loc166))) -> tensor<256x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc326) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc167) + tt.return %0 : tensor<256x1xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<256x1xi32> loc(#loc169) + tt.return %1 : tensor<256x1xi32> loc(#loc169) + } loc(#loc166) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc175))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc176) + %false = arith.constant false loc(#loc177) + tt.return %false : i1 loc(#loc177) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc178) + tt.return %1 : i1 loc(#loc178) + } loc(#loc175) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S32_16S__(%x: tensor<32x16xi32> loc("x"(#loc179))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc180) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc181) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc181) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<32x16xi32> loc(#loc181) + %4 = arith.addi %x, %3 : tensor<32x16xi32> loc(#loc181) + tt.return %4 : tensor<32x16xi32> loc(#loc182) + ^bb1: // no predecessors + %5 = ub.poison : tensor<32x16xi32> loc(#loc183) + tt.return %5 : tensor<32x16xi32> loc(#loc183) + } loc(#loc179) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc185) + %cst = arith.constant dense : tensor<1xi1> loc(#loc185) + tt.return %cst : tensor<1xi1> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc187) + tt.return %0 : tensor<1xi1> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i32S32_16S__(%input: tensor<32x16xi32> loc("input"(#loc188))) -> tensor<32x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<32x16xi32> loc(#loc189) + tt.return %0 : tensor<32x16xi32> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi32> loc(#loc191) + tt.return %1 : tensor<32x16xi32> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<32x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi32> loc(#loc185) + tt.return %cst : tensor<32x16xi32> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi32> loc(#loc187) + tt.return %0 : tensor<32x16xi32> loc(#loc187) + } loc(#loc184) + tt.func private @triton.language.standard.zeros_like__i16S32_16S__(%input: tensor<32x16xi16> loc("input"(#loc188))) -> tensor<32x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<32x16xi16> loc(#loc189) + tt.return %0 : tensor<32x16xi16> loc(#loc190) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x16xi16> loc(#loc191) + tt.return %1 : tensor<32x16xi16> loc(#loc191) + } loc(#loc188) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_32__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<32x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc185) + %cst = arith.constant dense<0> : tensor<32x16xi16> loc(#loc185) + tt.return %cst : tensor<32x16xi16> loc(#loc186) + ^bb1: // no predecessors + %0 = ub.poison : tensor<32x16xi16> loc(#loc187) + tt.return %0 : tensor<32x16xi16> loc(#loc187) + } loc(#loc184) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %1#0, %1#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %2 = ub.poison : tensor<32x16xi32> loc(#loc102) + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %2, %3 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<128x2x2xi32> loc("input"(#loc166))) -> tensor<128x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc167) + tt.return %0 : tensor<128x2xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<128x2xi32> loc(#loc169) + tt.return %1 : tensor<128x2xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc261) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc262) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc262) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc263) + %flip_3 = tt.reshape %flip_2 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc264) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<32x16xi32>, tensor<32x16xi32>, tensor<32x16xi32>) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %2#0, %2#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %3 = ub.poison : tensor<32x16xi32> loc(#loc102) + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %3, %4 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: tensor<32x16xi32> loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_45 = arith.constant true loc(#loc300) + %cond_46 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<32x16xi1> loc(#loc300) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<32x16xi1> loc(#loc301) + %cond_49 = arith.ori %cond, %cond_48 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_49 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_46 = arith.ori %eq, %eq_45 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_46 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = arith.extui %cond_33 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc313) + %cond_35 = arith.xori %cond_34, %flip : tensor<32x16xi32> loc(#loc313) + %cond_36 = arith.constant 0 : i32 loc(#loc314) + %cond_37 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc314) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<32x16xi32> loc(#loc314) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_41 = arith.xori %x, %ret_40 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_41, %new_idxs_44 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<64x2x4xi32> loc("input"(#loc166))) -> tensor<64x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc167) + tt.return %0 : tensor<64x4xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<64x4xi32> loc(#loc169) + tt.return %1 : tensor<64x4xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S32_16S_i32S32_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc95)), %idxs: tensor<32x16xi32> loc("idxs"(#loc95))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc330) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<32x16xi32>, tensor<32x16xi32>, i1) -> (tensor<32x16xi32>, tensor<32x16xi32>) loc(#loc100) + tt.return %3#0, %3#1 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc101) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc102) + %5 = ub.poison : tensor<32x16xi32> loc(#loc102) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc102) + } loc(#loc95) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<32x2x8xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<32x2x8xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<32x2x8xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<32x2x8xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i32S32_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x2x8xi32> loc("input"(#loc166))) -> tensor<32x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc167) + tt.reduce.return %2 : i32 loc(#loc167) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc167) + tt.return %0 : tensor<32x8xi32> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32x8xi32> loc(#loc169) + tt.return %1 : tensor<32x8xi32> loc(#loc169) + } loc(#loc166) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<64x2x4xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<64x2x4xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<64x2x4xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<64x2x4xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S64_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<128x2x2xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<128x2x2xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<128x2x2xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<128x2x2xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S128_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S32_16S_i32S32_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<32x16xi32> loc("x"(#loc103)), %idxs: tensor<32x16xi32> loc("idxs"(#loc103)), %flip: i1 loc("flip"(#loc103))) -> (tensor<32x16xi32>, tensor<32x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc268) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc269) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc270) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc270) + %left_mask = arith.constant 1 : i32 loc(#loc271) + %left_mask_2 = arith.constant 1 : i32 loc(#loc271) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc271) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc271) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc272) + %ileft_5 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc272) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc273) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc274) + %ileft_8 = tt.broadcast %ileft_7 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc275) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc276) + %iright_9 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc276) + %iright_10 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc277) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc278) + %iright_12 = tt.broadcast %iright_11 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc279) + %ileft_13 = tt.reshape %ileft_8 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %iright_14 = tt.reshape %iright_12 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc281) + %y_idx = tt.reshape %idxs : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc282) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc284) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<256x2x1xi32> loc(#loc284) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc285) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc286) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc287) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc289) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<256x2x1xi32> loc(#loc289) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S256_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc290) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc291) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc293) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc294) + %left_valid_mask = arith.constant true loc(#loc295) + %left_valid_mask_25 = arith.constant dense : tensor<32x16xi1> loc(#loc295) + %right_valid_mask = arith.constant true loc(#loc296) + %right_valid_mask_26 = arith.constant dense : tensor<32x16xi1> loc(#loc296) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<32x16xi32> loc(#loc297) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<32x16xi32> loc(#loc298) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc331) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc136) + %1 = scf.if %0 -> (tensor<32x16xi1>) { + %cond_42 = arith.constant true loc(#loc300) + %cond_43 = arith.constant dense : tensor<32x16xi1> loc(#loc300) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<32x16xi1> loc(#loc300) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<32x16xi1> loc(#loc301) + %cond_46 = arith.ori %cond, %cond_45 : tensor<32x16xi1> loc(#loc332) + scf.yield %cond_46 : tensor<32x16xi1> loc(#loc332) + } else { + scf.yield %cond : tensor<32x16xi1> loc(#loc141) + } loc(#loc137) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc333) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S32_16S__(%ileft_13) : (tensor<32x16xi32>) -> i1 loc(#loc143) + %3 = scf.if %2 -> (tensor<32x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<32x16xi1> loc(#loc304) + %eq_43 = arith.ori %eq, %eq_42 : tensor<32x16xi1> loc(#loc334) + scf.yield %eq_43 : tensor<32x16xi1> loc(#loc334) + } else { + scf.yield %eq : tensor<32x16xi1> loc(#loc141) + } loc(#loc144) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc306) + %cond_28 = arith.andi %3, %cond_27 : tensor<32x16xi1> loc(#loc307) + %cond_29 = arith.ori %1, %cond_28 : tensor<32x16xi1> loc(#loc308) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc309) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<32x16xi1> loc(#loc310) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<32x16xi1> loc(#loc311) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<32x16xi1> loc(#loc312) + %cond_34 = tt.splat %flip : i1 -> tensor<32x16xi1> loc(#loc313) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<32x16xi1> loc(#loc313) + %ret = arith.xori %ileft_13, %iright_14 : tensor<32x16xi32> loc(#loc315) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%x) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc316) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc317) + %ret_38 = arith.xori %x, %ret_37 : tensor<32x16xi32> loc(#loc318) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<32x16xi32> loc(#loc319) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S32_16S__(%idxs) : (tensor<32x16xi32>) -> tensor<32x16xi32> loc(#loc320) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc321) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<32x16xi32> loc(#loc322) + tt.return %ret_38, %new_idxs_41 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc164) + ^bb1: // no predecessors + %4 = ub.poison : tensor<32x16xi32> loc(#loc165) + %5 = ub.poison : tensor<32x16xi32> loc(#loc165) + tt.return %4, %5 : tensor<32x16xi32>, tensor<32x16xi32> loc(#loc165) + } loc(#loc103) + tt.func private @"triton.language.standard.sum__i64S32_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<32x16xi64> loc("input"(#loc166))) -> tensor<32xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc167) + tt.reduce.return %2 : i64 loc(#loc167) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc167) + tt.return %0 : tensor<32xi64> loc(#loc168) + ^bb1: // no predecessors + %1 = ub.poison : tensor<32xi64> loc(#loc169) + tt.return %1 : tensor<32xi64> loc(#loc169) + } loc(#loc166) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc170)), %b: i64 loc("b"(#loc170))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc171) + tt.return %0 : i64 loc(#loc172) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc173) + tt.return %1 : i64 loc(#loc173) + } loc(#loc170) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":45:34) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":46:71) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":47:20) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":48:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":49:21) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":51:71) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":52:20) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":54:35) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:26) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:29) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":56:21) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":58:35) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:26) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:29) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":60:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":61:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":62:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":63:21) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":64:19) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":65:32) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":66:35) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":67:44) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":68:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":69:20) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":70:35) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:28) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:46) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:38) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:55) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:53) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:63) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":72:31) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":73:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":74:21) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":75:19) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":76:35) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":77:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":78:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":79:35) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:28) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:46) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:38) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:55) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:53) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:63) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:25) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:37) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:25) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:37) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:32) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:25) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:47) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:52) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:49) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:25) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:85) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:32) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:25) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:47) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:52) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:49) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:25) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:85) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:4) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc140 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc192 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc202 = loc("xnumel"(#loc1)) +#loc203 = loc("r0_numel"(#loc2)) +#loc204 = loc("xoffset"(#loc3)) +#loc205 = loc("xoffset"(#loc4)) +#loc206 = loc("xindex"(#loc5)) +#loc207 = loc("xindex"(#loc6)) +#loc208 = loc("xindex"(#loc7)) +#loc209 = loc("xmask"(#loc8)) +#loc210 = loc("r0_index"(#loc9)) +#loc211 = loc("r0_index"(#loc10)) +#loc212 = loc("r0_offset"(#loc11)) +#loc213 = loc("r0_mask"(#loc12)) +#loc214 = loc("tmp0"(#loc13)) +#loc215 = loc("tmp0"(#loc14)) +#loc216 = loc("tmp0"(#loc15)) +#loc217 = loc("tmp0"(#loc16)) +#loc218 = loc("tmp1"(#loc17)) +#loc219 = loc("tmp2"(#loc18)) +#loc220 = loc("tmp3"(#loc19)) +#loc221 = loc("tmp4"(#loc20)) +#loc222 = loc("tmp5"(#loc21)) +#loc223 = loc("tmp6"(#loc22)) +#loc224 = loc("tmp7"(#loc23)) +#loc225 = loc("tmp9"(#loc24)) +#loc226 = loc("tmp11"(#loc25)) +#loc227 = loc("tmp14"(#loc27)) +#loc228 = loc("tmp15"(#loc28)) +#loc229 = loc("tmp16"(#loc29)) +#loc230 = loc("tmp20"(#loc31)) +#loc231 = loc("tmp23"(#loc32)) +#loc232 = loc("tmp24"(#loc33)) +#loc233 = loc("tmp24"(#loc34)) +#loc234 = loc("tmp25"(#loc35)) +#loc235 = loc("tmp28"(#loc36)) +#loc236 = loc("tmp29"(#loc37)) +#loc237 = loc("tmp29"(#loc38)) +#loc238 = loc("tmp30"(#loc39)) +#loc239 = loc("tmp31"(#loc40)) +#loc240 = loc("tmp32"(#loc41)) +#loc241 = loc("tmp33"(#loc42)) +#loc242 = loc("tmp34"(#loc43)) +#loc243 = loc("tmp35"(#loc44)) +#loc244 = loc("tmp36"(#loc45)) +#loc245 = loc("tmp37"(#loc46)) +#loc246 = loc("tmp38"(#loc47)) +#loc247 = loc("tmp39"(#loc48)) +#loc248 = loc("tmp40"(#loc49)) +#loc249 = loc("tmp42"(#loc56)) +#loc250 = loc("tmp43"(#loc57)) +#loc251 = loc("tmp44"(#loc58)) +#loc252 = loc("tmp45"(#loc59)) +#loc253 = loc("tmp46"(#loc60)) +#loc254 = loc("tmp47"(#loc61)) +#loc255 = loc("tmp48"(#loc62)) +#loc256 = loc("tmp49"(#loc63)) +#loc261 = loc("flip"(#loc96)) +#loc262 = loc("flip"(#loc97)) +#loc263 = loc("flip"(#loc98)) +#loc264 = loc("flip"(#loc99)) +#loc268 = loc("y"(#loc104)) +#loc269 = loc("right_mask"(#loc105)) +#loc270 = loc("right_mask"(#loc106)) +#loc271 = loc("left_mask"(#loc107)) +#loc272 = loc("ileft"(#loc108)) +#loc273 = loc("ileft"(#loc109)) +#loc274 = loc("ileft"(#loc110)) +#loc275 = loc("ileft"(#loc111)) +#loc276 = loc("iright"(#loc112)) +#loc277 = loc("iright"(#loc113)) +#loc278 = loc("iright"(#loc114)) +#loc279 = loc("iright"(#loc115)) +#loc280 = loc("ileft"(#loc116)) +#loc281 = loc("iright"(#loc117)) +#loc282 = loc("y_idx"(#loc118)) +#loc283 = loc("left_idx"(#loc119)) +#loc284 = loc("left_idx"(#loc120)) +#loc285 = loc("left_idx"(#loc121)) +#loc286 = loc("left_idx"(#loc122)) +#loc287 = loc("left_idx"(#loc123)) +#loc288 = loc("right_idx"(#loc124)) +#loc289 = loc("right_idx"(#loc125)) +#loc290 = loc("right_idx"(#loc126)) +#loc291 = loc("right_idx"(#loc127)) +#loc292 = loc("right_idx"(#loc128)) +#loc293 = loc("left_idx"(#loc129)) +#loc294 = loc("right_idx"(#loc130)) +#loc295 = loc("left_valid_mask"(#loc131)) +#loc296 = loc("right_valid_mask"(#loc132)) +#loc297 = loc("left_isnan"(#loc133)) +#loc298 = loc("right_isnan"(#loc134)) +#loc299 = loc("cond"(#loc135)) +#loc300 = loc("cond"(#loc138)) +#loc301 = loc("cond"(#loc139)) +#loc302 = loc("cond"(#loc140)) +#loc303 = loc("eq"(#loc142)) +#loc304 = loc("eq"(#loc145)) +#loc305 = loc("eq"(#loc146)) +#loc306 = loc("cond"(#loc147)) +#loc307 = loc("cond"(#loc148)) +#loc308 = loc("cond"(#loc149)) +#loc309 = loc("cond"(#loc150)) +#loc310 = loc("cond"(#loc151)) +#loc311 = loc("cond"(#loc152)) +#loc312 = loc("cond"(#loc153)) +#loc313 = loc("cond"(#loc154)) +#loc314 = loc("cond"(#loc155)) +#loc315 = loc("ret"(#loc156)) +#loc316 = loc("ret"(#loc157)) +#loc317 = loc("ret"(#loc158)) +#loc318 = loc("ret"(#loc159)) +#loc319 = loc("new_idxs"(#loc160)) +#loc320 = loc("new_idxs"(#loc161)) +#loc321 = loc("new_idxs"(#loc162)) +#loc322 = loc("new_idxs"(#loc163)) +#loc326 = loc("input"(#loc174)) +#loc330 = loc("flip"(#loc192)) +#loc331 = loc("cond"(#loc299)) +#loc332 = loc("cond"(#loc302)) +#loc333 = loc("eq"(#loc303)) +#loc334 = loc("eq"(#loc305)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f306453b99098052147aa33021a7a5625bf4ce6c --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1480 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [8, 2, 2], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 4], threadsPerWarp = [16, 2, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 2, 2], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [2, 2, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":18:0) +#loc1 = loc(unknown) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":46:71) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":51:71) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:26) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:26) +#loc117 = loc("in_ptr0"(#loc)) +#loc118 = loc("out_ptr4"(#loc)) +#loc119 = loc("out_ptr5"(#loc)) +#loc120 = loc("out_ptr6"(#loc)) +#loc121 = loc("out_ptr7"(#loc)) +#loc122 = loc("out_ptr8"(#loc)) +#loc123 = loc("out_ptr9"(#loc)) +#loc124 = loc("xnumel"(#loc)) +#loc125 = loc("r0_numel"(#loc)) +#loc144 = loc(callsite(#loc20 at #loc21)) +#loc150 = loc("ileft"(#loc29)) +#loc154 = loc("iright"(#loc34)) +#loc163 = loc("left_idx"(#loc43)) +#loc168 = loc("right_idx"(#loc48)) +#loc189 = loc(callsite(#loc20 at #loc69)) +#loc192 = loc("tmp24"(#loc72)) +#loc197 = loc("tmp29"(#loc77)) +#loc214 = loc(callsite(#loc25 at #loc144)) +#loc218 = loc(callsite(#loc25 at #loc189)) +#loc221 = loc(callsite(#loc1 at #loc192)) +#loc224 = loc(callsite(#loc1 at #loc197)) +#loc228 = loc(callsite(#loc150 at #loc214)) +#loc232 = loc(callsite(#loc154 at #loc214)) +#loc240 = loc(callsite(#loc163 at #loc214)) +#loc245 = loc(callsite(#loc168 at #loc214)) +#loc265 = loc(callsite(#loc150 at #loc218)) +#loc269 = loc(callsite(#loc154 at #loc218)) +#loc287 = loc(callsite(#loc163 at #loc218)) +#loc291 = loc(callsite(#loc168 at #loc218)) +#loc301 = loc(callsite(#loc1 at #loc228)) +#loc303 = loc(callsite(#loc1 at #loc232)) +#loc306 = loc(callsite(#loc1 at #loc240)) +#loc309 = loc(callsite(#loc1 at #loc245)) +#loc311 = loc(callsite(#loc1 at #loc265)) +#loc313 = loc(callsite(#loc1 at #loc269)) +#loc315 = loc(callsite(#loc1 at #loc287)) +#loc317 = loc(callsite(#loc1 at #loc291)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense : tensor<32x1xi1, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<17> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x1xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<32x1xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<128> : tensor<32x1xi32, #blocked> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %cst_8 = arith.constant dense<1> : tensor<32x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<0> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_10 = arith.constant dense<17> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_11 = arith.constant dense<16> : tensor<32x16xi32, #blocked> loc(#loc1) + %cst_12 = arith.constant dense<16384> : tensor<32x16xi64, #blocked> loc(#loc1) + %cst_13 = arith.constant dense<0> : tensor<32x16xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc126) + %xoffset_14 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc127) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc128) + %xindex_15 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc128) + %xindex_16 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> loc(#loc128) + %xindex_17 = tt.expand_dims %xindex_15 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi32, #blocked5> loc(#loc128) + %xindex_18 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_19 = tt.splat %xoffset_14 : i32 -> tensor<32x1xi32, #blocked5> loc(#loc129) + %xindex_20 = arith.addi %xindex_18, %xindex_16 : tensor<32x1xi32, #blocked> loc(#loc129) + %xindex_21 = arith.addi %xindex_19, %xindex_17 : tensor<32x1xi32, #blocked5> loc(#loc129) + %xmask = arith.cmpi slt, %xindex_20, %cst_7 : tensor<32x1xi32, #blocked> loc(#loc130) + %xmask_22 = arith.cmpi slt, %xindex_21, %cst_6 : tensor<32x1xi32, #blocked5> loc(#loc130) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc131) + %r0_index_23 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc131) + %tmp0 = arith.muli %xindex_20, %cst_5 : tensor<32x1xi32, #blocked> loc(#loc132) + %tmp0_24 = tt.broadcast %r0_index_23 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_25 = tt.broadcast %tmp0 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_26 = arith.addi %tmp0_24, %tmp0_25 : tensor<32x16xi32, #blocked> loc(#loc133) + %tmp0_27 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc134) + %tmp0_28 = tt.addptr %tmp0_27, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc134) + %tmp0_29 = tt.broadcast %xmask : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc135) + %tmp0_30 = tt.broadcast %xmask_22 : tensor<32x1xi1, #blocked5> -> tensor<32x16xi1, #blocked5> loc(#loc135) + %tmp0_31 = tt.load %tmp0_28, %tmp0_29, %cst_13 : tensor<32x16x!tt.ptr, #blocked> loc(#loc135) + %tmp2 = arith.cmpi sgt, %tmp0_31, %cst_13 : tensor<32x16xi64, #blocked> loc(#loc136) + %tmp4 = arith.cmpi slt, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc137) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1, #blocked> loc(#loc138) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc210) + %tmp9 = arith.trunci %r0_index_23 : tensor<1x16xi32, #blocked> to tensor<1x16xi16, #blocked> loc(#loc141) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16, #blocked> -> tensor<32x16xi16, #blocked> loc(#loc142) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc211) + %flip_32 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc211) + %flip_33 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc211) + %flip_34 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc211) + %flip_35 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc211) + %flip_36 = tt.expand_dims %flip_32 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc211) + %flip_37 = tt.expand_dims %flip_33 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc211) + %flip_38 = tt.expand_dims %flip_34 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc211) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc211) + %flip_40 = tt.expand_dims %flip_36 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc211) + %flip_41 = tt.expand_dims %flip_37 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc211) + %flip_42 = tt.expand_dims %flip_38 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc211) + %flip_43 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc212) + %flip_44 = tt.reshape %flip_43 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y = tt.reshape %tmp7 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %left_mask = arith.subi %cst_4, %flip_40 : tensor<1x2x1xi32, #blocked4> loc(#loc226) + %left_mask_45 = arith.subi %cst_3, %flip_39 : tensor<1x2x1xi32, #blocked3> loc(#loc226) + %left_mask_46 = arith.subi %cst_2, %flip_41 : tensor<1x2x1xi32, #blocked2> loc(#loc226) + %left_mask_47 = arith.subi %cst_1, %flip_42 : tensor<1x2x1xi32, #blocked1> loc(#loc226) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_48 = arith.muli %y, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_49 = "tt.reduce"(%ileft_48) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_50 = tt.expand_dims %ileft_49 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_51 = tt.broadcast %ileft_50 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright = tt.broadcast %flip_40 : tensor<1x2x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_52 = arith.muli %y, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_53 = "tt.reduce"(%iright_52) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_54 = tt.expand_dims %iright_53 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_55 = tt.broadcast %iright_54 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_56 = tt.reshape %ileft_51 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_57 = tt.reshape %iright_55 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16, #blocked> -> tensor<256x2x1xi16, #blocked4> loc(#loc237) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc238) + %left_idx_58 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc239) + %left_idx_59 = arith.muli %y_idx, %left_idx_58 : tensor<256x2x1xi16, #blocked4> loc(#loc239) + %input = arith.extsi %left_idx_59 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc304) + %left_idx_60 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_61 = tt.expand_dims %left_idx_60 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_62 = tt.broadcast %left_idx_61 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx = arith.trunci %flip_40 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc243) + %right_idx_63 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<256x2x1xi16, #blocked4> loc(#loc244) + %right_idx_64 = arith.muli %y_idx, %right_idx_63 : tensor<256x2x1xi16, #blocked4> loc(#loc244) + %input_65 = arith.extsi %right_idx_64 : tensor<256x2x1xi16, #blocked4> to tensor<256x2x1xi32, #blocked4> loc(#loc307) + %right_idx_66 = "tt.reduce"(%input_65) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_67 = tt.expand_dims %right_idx_66 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_68 = tt.broadcast %right_idx_67 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_69 = tt.reshape %left_idx_62 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_70 = tt.reshape %right_idx_68 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond = arith.cmpi slt, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq = arith.cmpi eq, %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_71 = arith.cmpi sgt, %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_72 = arith.andi %eq, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_73 = arith.ori %cond, %cond_72 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_74 = arith.extui %cond_73 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_75 = arith.xori %cond_74, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_76 = arith.cmpi ne, %cond_75, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret = arith.xori %ileft_56, %iright_57 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_77 = arith.select %cond_76, %ret, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_78 = arith.xori %tmp7, %ret_77 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs = arith.xori %left_idx_69, %right_idx_70 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_79 = arith.select %cond_76, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_80 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked> to tensor<1x16xi32, #blocked> loc(#loc262) + %new_idxs_81 = tt.broadcast %new_idxs_80 : tensor<1x16xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc262) + %new_idxs_82 = arith.xori %new_idxs_81, %new_idxs_79 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_83 = tt.broadcast %flip_41 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc212) + %flip_84 = tt.reshape %flip_83 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_85 = tt.reshape %ret_78 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_86 = tt.broadcast %left_mask_45 : tensor<1x2x1xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_87 = arith.muli %y_85, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_88 = "tt.reduce"(%ileft_87) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_89 = tt.expand_dims %ileft_88 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_90 = tt.broadcast %ileft_89 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_91 = arith.muli %y_85, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_92 = "tt.reduce"(%iright_91) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_93 = tt.expand_dims %iright_92 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_94 = tt.broadcast %iright_93 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_95 = tt.reshape %ileft_90 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_96 = tt.reshape %iright_94 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_97 = tt.reshape %new_idxs_82 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_98 = arith.muli %y_idx_97, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_99 = "tt.reduce"(%left_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_100 = tt.expand_dims %left_idx_99 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_101 = tt.broadcast %left_idx_100 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_102 = arith.muli %y_idx_97, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_103 = "tt.reduce"(%right_idx_102) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_104 = tt.expand_dims %right_idx_103 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_105 = tt.broadcast %right_idx_104 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_106 = tt.reshape %left_idx_101 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_107 = tt.reshape %right_idx_105 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_108 = arith.cmpi slt, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_109 = arith.cmpi eq, %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_110 = arith.cmpi sgt, %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_111 = arith.andi %eq_109, %cond_110 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_112 = arith.ori %cond_108, %cond_111 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_113 = arith.extui %cond_112 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_114 = arith.xori %cond_113, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_115 = arith.cmpi ne, %cond_114, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_116 = arith.xori %ileft_95, %iright_96 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_117 = arith.select %cond_115, %ret_116, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_118 = arith.xori %ret_78, %ret_117 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_119 = arith.xori %left_idx_106, %right_idx_107 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_120 = arith.select %cond_115, %new_idxs_119, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_121 = arith.xori %new_idxs_82, %new_idxs_120 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_122 = tt.reshape %ret_118 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_123 = arith.muli %y_122, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_124 = "tt.reduce"(%ileft_123) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_125 = tt.expand_dims %ileft_124 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_126 = tt.broadcast %ileft_125 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_127 = arith.muli %y_122, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_128 = "tt.reduce"(%iright_127) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_129 = tt.expand_dims %iright_128 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_130 = tt.broadcast %iright_129 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_131 = tt.reshape %ileft_126 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_132 = tt.reshape %iright_130 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_133 = tt.reshape %new_idxs_121 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_134 = arith.muli %y_idx_133, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_135 = "tt.reduce"(%left_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_136 = tt.expand_dims %left_idx_135 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_137 = tt.broadcast %left_idx_136 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_138 = arith.muli %y_idx_133, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_139 = "tt.reduce"(%right_idx_138) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_140 = tt.expand_dims %right_idx_139 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_141 = tt.broadcast %right_idx_140 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_142 = tt.reshape %left_idx_137 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_143 = tt.reshape %right_idx_141 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_144 = arith.cmpi slt, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_145 = arith.cmpi eq, %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_146 = arith.cmpi sgt, %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_147 = arith.andi %eq_145, %cond_146 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_148 = arith.ori %cond_144, %cond_147 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_149 = arith.extui %cond_148 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_150 = arith.xori %cond_149, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_151 = arith.cmpi ne, %cond_150, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_152 = arith.xori %ileft_131, %iright_132 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_153 = arith.select %cond_151, %ret_152, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_154 = arith.xori %ret_118, %ret_153 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_155 = arith.xori %left_idx_142, %right_idx_143 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_156 = arith.select %cond_151, %new_idxs_155, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_157 = arith.xori %new_idxs_121, %new_idxs_156 : tensor<32x16xi32, #blocked> loc(#loc262) + %flip_158 = tt.broadcast %flip_42 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc212) + %flip_159 = tt.reshape %flip_158 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc213) + %y_160 = tt.reshape %ret_154 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_161 = tt.broadcast %left_mask_46 : tensor<1x2x1xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_162 = arith.muli %y_160, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_163 = "tt.reduce"(%ileft_162) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_164 = tt.expand_dims %ileft_163 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_165 = tt.broadcast %ileft_164 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_166 = arith.muli %y_160, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_167 = "tt.reduce"(%iright_166) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_168 = tt.expand_dims %iright_167 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_169 = tt.broadcast %iright_168 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_170 = tt.reshape %ileft_165 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_171 = tt.reshape %iright_169 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_172 = tt.reshape %new_idxs_157 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_173 = arith.muli %y_idx_172, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_174 = "tt.reduce"(%left_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_175 = tt.expand_dims %left_idx_174 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_176 = tt.broadcast %left_idx_175 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_177 = arith.muli %y_idx_172, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_178 = "tt.reduce"(%right_idx_177) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_179 = tt.expand_dims %right_idx_178 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_180 = tt.broadcast %right_idx_179 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_181 = tt.reshape %left_idx_176 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_182 = tt.reshape %right_idx_180 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_183 = arith.cmpi slt, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_184 = arith.cmpi eq, %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_185 = arith.cmpi sgt, %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_186 = arith.andi %eq_184, %cond_185 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_187 = arith.ori %cond_183, %cond_186 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_188 = arith.extui %cond_187 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_189 = arith.xori %cond_188, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_190 = arith.cmpi ne, %cond_189, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_191 = arith.xori %ileft_170, %iright_171 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_192 = arith.select %cond_190, %ret_191, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_193 = arith.xori %ret_154, %ret_192 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_194 = arith.xori %left_idx_181, %right_idx_182 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_195 = arith.select %cond_190, %new_idxs_194, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_196 = arith.xori %new_idxs_157, %new_idxs_195 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_197 = tt.reshape %ret_193 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_198 = arith.muli %y_197, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_199 = "tt.reduce"(%ileft_198) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_200 = tt.expand_dims %ileft_199 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_201 = tt.broadcast %ileft_200 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_202 = arith.muli %y_197, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_203 = "tt.reduce"(%iright_202) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_204 = tt.expand_dims %iright_203 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_205 = tt.broadcast %iright_204 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_206 = tt.reshape %ileft_201 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_207 = tt.reshape %iright_205 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_208 = tt.reshape %new_idxs_196 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_209 = arith.muli %y_idx_208, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_210 = "tt.reduce"(%left_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_211 = tt.expand_dims %left_idx_210 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_212 = tt.broadcast %left_idx_211 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_213 = arith.muli %y_idx_208, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_214 = "tt.reduce"(%right_idx_213) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_215 = tt.expand_dims %right_idx_214 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_216 = tt.broadcast %right_idx_215 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_217 = tt.reshape %left_idx_212 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_218 = tt.reshape %right_idx_216 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_219 = arith.cmpi slt, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_220 = arith.cmpi eq, %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_221 = arith.cmpi sgt, %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_222 = arith.andi %eq_220, %cond_221 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_223 = arith.ori %cond_219, %cond_222 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_224 = arith.extui %cond_223 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_225 = arith.xori %cond_224, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_226 = arith.cmpi ne, %cond_225, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_227 = arith.xori %ileft_206, %iright_207 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_228 = arith.select %cond_226, %ret_227, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_229 = arith.xori %ret_193, %ret_228 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_230 = arith.xori %left_idx_217, %right_idx_218 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_231 = arith.select %cond_226, %new_idxs_230, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_232 = arith.xori %new_idxs_196, %new_idxs_231 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_233 = tt.reshape %ret_229 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_234 = arith.muli %y_233, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_235 = "tt.reduce"(%ileft_234) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_236 = tt.expand_dims %ileft_235 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_237 = tt.broadcast %ileft_236 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_238 = arith.muli %y_233, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_239 = "tt.reduce"(%iright_238) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_240 = tt.expand_dims %iright_239 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_241 = tt.broadcast %iright_240 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_242 = tt.reshape %ileft_237 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_243 = tt.reshape %iright_241 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_244 = tt.reshape %new_idxs_232 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_245 = arith.muli %y_idx_244, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_246 = "tt.reduce"(%left_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_247 = tt.expand_dims %left_idx_246 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_248 = tt.broadcast %left_idx_247 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_249 = arith.muli %y_idx_244, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_250 = "tt.reduce"(%right_idx_249) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_251 = tt.expand_dims %right_idx_250 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_252 = tt.broadcast %right_idx_251 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_253 = tt.reshape %left_idx_248 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_254 = tt.reshape %right_idx_252 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_255 = arith.cmpi slt, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_256 = arith.cmpi eq, %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_257 = arith.cmpi sgt, %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_258 = arith.andi %eq_256, %cond_257 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_259 = arith.ori %cond_255, %cond_258 : tensor<32x16xi1, #blocked> loc(#loc254) + %cond_260 = arith.extui %cond_259 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc255) + %cond_261 = arith.xori %cond_260, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc255) + %cond_262 = arith.cmpi ne, %cond_261, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc256) + %ret_263 = arith.xori %ileft_242, %iright_243 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_264 = arith.select %cond_262, %ret_263, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_265 = arith.xori %ret_229, %ret_264 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_266 = arith.xori %left_idx_253, %right_idx_254 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_267 = arith.select %cond_262, %new_idxs_266, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_268 = arith.xori %new_idxs_232, %new_idxs_267 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_269 = tt.reshape %ret_265 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc225) + %ileft_270 = tt.broadcast %left_mask_47 : tensor<1x2x1xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_271 = arith.muli %y_269, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc227) + %ileft_272 = "tt.reduce"(%ileft_271) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc300) + %ileft_273 = tt.expand_dims %ileft_272 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc229) + %ileft_274 = tt.broadcast %ileft_273 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc230) + %iright_275 = arith.muli %y_269, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc231) + %iright_276 = "tt.reduce"(%iright_275) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %iright_277 = tt.expand_dims %iright_276 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc233) + %iright_278 = tt.broadcast %iright_277 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc234) + %ileft_279 = tt.reshape %ileft_274 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_280 = tt.reshape %iright_278 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_281 = tt.reshape %new_idxs_268 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc237) + %left_idx_282 = arith.muli %y_idx_281, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc239) + %left_idx_283 = "tt.reduce"(%left_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %left_idx_284 = tt.expand_dims %left_idx_283 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc241) + %left_idx_285 = tt.broadcast %left_idx_284 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc242) + %right_idx_286 = arith.muli %y_idx_281, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc244) + %right_idx_287 = "tt.reduce"(%right_idx_286) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc308) + %right_idx_288 = tt.expand_dims %right_idx_287 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc246) + %right_idx_289 = tt.broadcast %right_idx_288 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc247) + %left_idx_290 = tt.reshape %left_idx_285 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_291 = tt.reshape %right_idx_289 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_292 = arith.cmpi slt, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_293 = arith.cmpi eq, %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_294 = arith.cmpi sgt, %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_295 = arith.andi %eq_293, %cond_294 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_296 = arith.ori %cond_292, %cond_295 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_297 = arith.xori %ileft_279, %iright_280 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_298 = arith.select %cond_296, %ret_297, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_299 = arith.xori %ret_265, %ret_298 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_300 = arith.xori %left_idx_290, %right_idx_291 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_301 = arith.select %cond_296, %new_idxs_300, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_302 = arith.xori %new_idxs_268, %new_idxs_301 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_303 = tt.reshape %ret_299 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc225) + %ileft_304 = arith.muli %y_303, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc227) + %ileft_305 = "tt.reduce"(%ileft_304) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc300) + %ileft_306 = tt.expand_dims %ileft_305 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc229) + %ileft_307 = tt.broadcast %ileft_306 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc230) + %iright_308 = arith.muli %y_303, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc231) + %iright_309 = "tt.reduce"(%iright_308) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %iright_310 = tt.expand_dims %iright_309 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc233) + %iright_311 = tt.broadcast %iright_310 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc234) + %ileft_312 = tt.reshape %ileft_307 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_313 = tt.reshape %iright_311 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_314 = tt.reshape %new_idxs_302 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc237) + %left_idx_315 = arith.muli %y_idx_314, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc239) + %left_idx_316 = "tt.reduce"(%left_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %left_idx_317 = tt.expand_dims %left_idx_316 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc241) + %left_idx_318 = tt.broadcast %left_idx_317 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc242) + %right_idx_319 = arith.muli %y_idx_314, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc244) + %right_idx_320 = "tt.reduce"(%right_idx_319) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc308) + %right_idx_321 = tt.expand_dims %right_idx_320 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc246) + %right_idx_322 = tt.broadcast %right_idx_321 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc247) + %left_idx_323 = tt.reshape %left_idx_318 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_324 = tt.reshape %right_idx_322 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_325 = arith.cmpi slt, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_326 = arith.cmpi eq, %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_327 = arith.cmpi sgt, %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_328 = arith.andi %eq_326, %cond_327 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_329 = arith.ori %cond_325, %cond_328 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_330 = arith.xori %ileft_312, %iright_313 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_331 = arith.select %cond_329, %ret_330, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_332 = arith.xori %ret_299, %ret_331 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_333 = arith.xori %left_idx_323, %right_idx_324 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_334 = arith.select %cond_329, %new_idxs_333, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_335 = arith.xori %new_idxs_302, %new_idxs_334 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_336 = tt.reshape %ret_332 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc225) + %ileft_337 = arith.muli %y_336, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc227) + %ileft_338 = "tt.reduce"(%ileft_337) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc300) + %ileft_339 = tt.expand_dims %ileft_338 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc229) + %ileft_340 = tt.broadcast %ileft_339 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc230) + %iright_341 = arith.muli %y_336, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc231) + %iright_342 = "tt.reduce"(%iright_341) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %iright_343 = tt.expand_dims %iright_342 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc233) + %iright_344 = tt.broadcast %iright_343 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc234) + %ileft_345 = tt.reshape %ileft_340 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_346 = tt.reshape %iright_344 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_347 = tt.reshape %new_idxs_335 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc237) + %left_idx_348 = arith.muli %y_idx_347, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc239) + %left_idx_349 = "tt.reduce"(%left_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %left_idx_350 = tt.expand_dims %left_idx_349 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc241) + %left_idx_351 = tt.broadcast %left_idx_350 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc242) + %right_idx_352 = arith.muli %y_idx_347, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc244) + %right_idx_353 = "tt.reduce"(%right_idx_352) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc308) + %right_idx_354 = tt.expand_dims %right_idx_353 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc246) + %right_idx_355 = tt.broadcast %right_idx_354 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc247) + %left_idx_356 = tt.reshape %left_idx_351 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_357 = tt.reshape %right_idx_355 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_358 = arith.cmpi slt, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_359 = arith.cmpi eq, %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_360 = arith.cmpi sgt, %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_361 = arith.andi %eq_359, %cond_360 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_362 = arith.ori %cond_358, %cond_361 : tensor<32x16xi1, #blocked> loc(#loc254) + %ret_363 = arith.xori %ileft_345, %iright_346 : tensor<32x16xi32, #blocked> loc(#loc257) + %ret_364 = arith.select %cond_362, %ret_363, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc258) + %ret_365 = arith.xori %ret_332, %ret_364 : tensor<32x16xi32, #blocked> loc(#loc259) + %new_idxs_366 = arith.xori %left_idx_356, %right_idx_357 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_367 = arith.select %cond_362, %new_idxs_366, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_368 = arith.xori %new_idxs_335, %new_idxs_367 : tensor<32x16xi32, #blocked> loc(#loc262) + %y_369 = tt.reshape %ret_365 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc225) + %ileft_370 = arith.muli %y_369, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc227) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc228)), %ileft_742: i32 loc(callsite(#loc1 at #loc228))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc318) + tt.reduce.return %ileft_743 : i32 loc(#loc300) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc300) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc229) + %ileft_373 = tt.broadcast %ileft_372 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc230) + %iright_374 = arith.muli %y_369, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc231) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc232)), %iright_742: i32 loc(callsite(#loc1 at #loc232))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc319) + tt.reduce.return %iright_743 : i32 loc(#loc302) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc233) + %iright_377 = tt.broadcast %iright_376 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc234) + %ileft_378 = tt.reshape %ileft_373 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc235) + %iright_379 = tt.reshape %iright_377 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc236) + %y_idx_380 = tt.reshape %new_idxs_368 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc237) + %left_idx_381 = arith.muli %y_idx_380, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc239) + %left_idx_382 = "tt.reduce"(%left_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc240)), %left_idx_742: i32 loc(callsite(#loc1 at #loc240))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc320) + tt.reduce.return %left_idx_743 : i32 loc(#loc305) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %left_idx_383 = tt.expand_dims %left_idx_382 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc241) + %left_idx_384 = tt.broadcast %left_idx_383 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc242) + %right_idx_385 = arith.muli %y_idx_380, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc244) + %right_idx_386 = "tt.reduce"(%right_idx_385) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc245)), %right_idx_742: i32 loc(callsite(#loc1 at #loc245))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc321) + tt.reduce.return %right_idx_743 : i32 loc(#loc308) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc308) + %right_idx_387 = tt.expand_dims %right_idx_386 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc246) + %right_idx_388 = tt.broadcast %right_idx_387 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc247) + %left_idx_389 = tt.reshape %left_idx_384 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc248) + %right_idx_390 = tt.reshape %right_idx_388 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc249) + %cond_391 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc250) + %eq_392 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<32x16xi32, #blocked> loc(#loc251) + %cond_393 = arith.cmpi sgt, %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc252) + %cond_394 = arith.andi %eq_392, %cond_393 : tensor<32x16xi1, #blocked> loc(#loc253) + %cond_395 = arith.ori %cond_391, %cond_394 : tensor<32x16xi1, #blocked> loc(#loc254) + %new_idxs_396 = arith.xori %left_idx_389, %right_idx_390 : tensor<32x16xi32, #blocked> loc(#loc260) + %new_idxs_397 = arith.select %cond_395, %new_idxs_396, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc261) + %new_idxs_398 = arith.xori %new_idxs_368, %new_idxs_397 : tensor<32x16xi32, #blocked> loc(#loc262) + %tmp14 = arith.cmpi eq, %tmp0_31, %cst_12 : tensor<32x16xi64, #blocked> loc(#loc186) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc217) + %y_399 = tt.reshape %tmp16 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_400 = arith.muli %y_399, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_401 = "tt.reduce"(%ileft_400) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_402 = tt.expand_dims %ileft_401 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_403 = tt.broadcast %ileft_402 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_404 = arith.muli %y_399, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_405 = "tt.reduce"(%iright_404) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_406 = tt.expand_dims %iright_405 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_407 = tt.broadcast %iright_406 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_408 = tt.reshape %ileft_403 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_409 = tt.reshape %iright_407 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %cond_410 = arith.cmpi slt, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_411 = arith.cmpi eq, %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_412 = arith.andi %eq_411, %cond_71 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_413 = arith.ori %cond_410, %cond_412 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_414 = arith.extui %cond_413 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_415 = arith.xori %cond_414, %flip_44 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_416 = arith.cmpi ne, %cond_415, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_417 = arith.xori %ileft_408, %iright_409 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_418 = arith.select %cond_416, %ret_417, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_419 = arith.xori %tmp16, %ret_418 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_420 = arith.select %cond_416, %new_idxs, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_421 = arith.xori %new_idxs_81, %new_idxs_420 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_422 = tt.reshape %ret_419 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_423 = arith.muli %y_422, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_424 = "tt.reduce"(%ileft_423) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_425 = tt.expand_dims %ileft_424 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_426 = tt.broadcast %ileft_425 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_427 = arith.muli %y_422, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_428 = "tt.reduce"(%iright_427) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_429 = tt.expand_dims %iright_428 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_430 = tt.broadcast %iright_429 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_431 = tt.reshape %ileft_426 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_432 = tt.reshape %iright_430 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_433 = tt.reshape %new_idxs_421 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = arith.muli %y_idx_433, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_435 = "tt.reduce"(%left_idx_434) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_436 = tt.expand_dims %left_idx_435 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_437 = tt.broadcast %left_idx_436 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = arith.muli %y_idx_433, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_439 = "tt.reduce"(%right_idx_438) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_440 = tt.expand_dims %right_idx_439 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_441 = tt.broadcast %right_idx_440 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_442 = tt.reshape %left_idx_437 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_443 = tt.reshape %right_idx_441 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_444 = arith.cmpi slt, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_445 = arith.cmpi eq, %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_446 = arith.cmpi sgt, %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_447 = arith.andi %eq_445, %cond_446 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_448 = arith.ori %cond_444, %cond_447 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_449 = arith.extui %cond_448 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_450 = arith.xori %cond_449, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_451 = arith.cmpi ne, %cond_450, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_452 = arith.xori %ileft_431, %iright_432 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_453 = arith.select %cond_451, %ret_452, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_454 = arith.xori %ret_419, %ret_453 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_455 = arith.xori %left_idx_442, %right_idx_443 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_456 = arith.select %cond_451, %new_idxs_455, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_457 = arith.xori %new_idxs_421, %new_idxs_456 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_458 = tt.reshape %ret_454 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_459 = arith.muli %y_458, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_460 = "tt.reduce"(%ileft_459) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_461 = tt.expand_dims %ileft_460 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_462 = tt.broadcast %ileft_461 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_463 = arith.muli %y_458, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_464 = "tt.reduce"(%iright_463) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_465 = tt.expand_dims %iright_464 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_466 = tt.broadcast %iright_465 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_467 = tt.reshape %ileft_462 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_468 = tt.reshape %iright_466 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_469 = tt.reshape %new_idxs_457 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = arith.muli %y_idx_469, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_471 = "tt.reduce"(%left_idx_470) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_472 = tt.expand_dims %left_idx_471 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_473 = tt.broadcast %left_idx_472 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = arith.muli %y_idx_469, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_475 = "tt.reduce"(%right_idx_474) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_476 = tt.expand_dims %right_idx_475 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_477 = tt.broadcast %right_idx_476 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_478 = tt.reshape %left_idx_473 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_479 = tt.reshape %right_idx_477 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_480 = arith.cmpi slt, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_481 = arith.cmpi eq, %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_482 = arith.cmpi sgt, %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_483 = arith.andi %eq_481, %cond_482 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_484 = arith.ori %cond_480, %cond_483 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_485 = arith.extui %cond_484 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_486 = arith.xori %cond_485, %flip_84 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_487 = arith.cmpi ne, %cond_486, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_488 = arith.xori %ileft_467, %iright_468 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_489 = arith.select %cond_487, %ret_488, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_490 = arith.xori %ret_454, %ret_489 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_491 = arith.xori %left_idx_478, %right_idx_479 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_492 = arith.select %cond_487, %new_idxs_491, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_493 = arith.xori %new_idxs_457, %new_idxs_492 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_494 = tt.reshape %ret_490 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_495 = arith.muli %y_494, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_496 = "tt.reduce"(%ileft_495) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_497 = tt.expand_dims %ileft_496 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_498 = tt.broadcast %ileft_497 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_499 = arith.muli %y_494, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_500 = "tt.reduce"(%iright_499) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_501 = tt.expand_dims %iright_500 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_502 = tt.broadcast %iright_501 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_503 = tt.reshape %ileft_498 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_504 = tt.reshape %iright_502 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_505 = tt.reshape %new_idxs_493 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = arith.muli %y_idx_505, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_507 = "tt.reduce"(%left_idx_506) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_508 = tt.expand_dims %left_idx_507 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_509 = tt.broadcast %left_idx_508 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = arith.muli %y_idx_505, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_511 = "tt.reduce"(%right_idx_510) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_512 = tt.expand_dims %right_idx_511 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_513 = tt.broadcast %right_idx_512 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_514 = tt.reshape %left_idx_509 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_515 = tt.reshape %right_idx_513 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_516 = arith.cmpi slt, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_517 = arith.cmpi eq, %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_518 = arith.cmpi sgt, %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_519 = arith.andi %eq_517, %cond_518 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_520 = arith.ori %cond_516, %cond_519 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_521 = arith.extui %cond_520 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_522 = arith.xori %cond_521, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_523 = arith.cmpi ne, %cond_522, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_524 = arith.xori %ileft_503, %iright_504 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_525 = arith.select %cond_523, %ret_524, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_526 = arith.xori %ret_490, %ret_525 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_527 = arith.xori %left_idx_514, %right_idx_515 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_528 = arith.select %cond_523, %new_idxs_527, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_529 = arith.xori %new_idxs_493, %new_idxs_528 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_530 = tt.reshape %ret_526 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_531 = arith.muli %y_530, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_532 = "tt.reduce"(%ileft_531) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_533 = tt.expand_dims %ileft_532 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_534 = tt.broadcast %ileft_533 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_535 = arith.muli %y_530, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_536 = "tt.reduce"(%iright_535) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_537 = tt.expand_dims %iright_536 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_538 = tt.broadcast %iright_537 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_539 = tt.reshape %ileft_534 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_540 = tt.reshape %iright_538 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_541 = tt.reshape %new_idxs_529 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = arith.muli %y_idx_541, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_543 = "tt.reduce"(%left_idx_542) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_544 = tt.expand_dims %left_idx_543 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_545 = tt.broadcast %left_idx_544 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = arith.muli %y_idx_541, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_547 = "tt.reduce"(%right_idx_546) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_548 = tt.expand_dims %right_idx_547 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_549 = tt.broadcast %right_idx_548 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_550 = tt.reshape %left_idx_545 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_551 = tt.reshape %right_idx_549 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_552 = arith.cmpi slt, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_553 = arith.cmpi eq, %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_554 = arith.cmpi sgt, %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_555 = arith.andi %eq_553, %cond_554 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_556 = arith.ori %cond_552, %cond_555 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_557 = arith.extui %cond_556 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_558 = arith.xori %cond_557, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_559 = arith.cmpi ne, %cond_558, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_560 = arith.xori %ileft_539, %iright_540 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_561 = arith.select %cond_559, %ret_560, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_562 = arith.xori %ret_526, %ret_561 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_563 = arith.xori %left_idx_550, %right_idx_551 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_564 = arith.select %cond_559, %new_idxs_563, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_565 = arith.xori %new_idxs_529, %new_idxs_564 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_566 = tt.reshape %ret_562 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_567 = arith.muli %y_566, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_568 = "tt.reduce"(%ileft_567) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_569 = tt.expand_dims %ileft_568 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_570 = tt.broadcast %ileft_569 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_571 = arith.muli %y_566, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_572 = "tt.reduce"(%iright_571) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_573 = tt.expand_dims %iright_572 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_574 = tt.broadcast %iright_573 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_575 = tt.reshape %ileft_570 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_576 = tt.reshape %iright_574 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_577 = tt.reshape %new_idxs_565 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = arith.muli %y_idx_577, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_579 = "tt.reduce"(%left_idx_578) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_580 = tt.expand_dims %left_idx_579 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_581 = tt.broadcast %left_idx_580 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = arith.muli %y_idx_577, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_583 = "tt.reduce"(%right_idx_582) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_584 = tt.expand_dims %right_idx_583 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_585 = tt.broadcast %right_idx_584 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_586 = tt.reshape %left_idx_581 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_587 = tt.reshape %right_idx_585 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_588 = arith.cmpi slt, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_589 = arith.cmpi eq, %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_590 = arith.cmpi sgt, %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_591 = arith.andi %eq_589, %cond_590 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_592 = arith.ori %cond_588, %cond_591 : tensor<32x16xi1, #blocked> loc(#loc277) + %cond_593 = arith.extui %cond_592 : tensor<32x16xi1, #blocked> to tensor<32x16xi32, #blocked> loc(#loc278) + %cond_594 = arith.xori %cond_593, %flip_159 : tensor<32x16xi32, #blocked> loc(#loc278) + %cond_595 = arith.cmpi ne, %cond_594, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc279) + %ret_596 = arith.xori %ileft_575, %iright_576 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_597 = arith.select %cond_595, %ret_596, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_598 = arith.xori %ret_562, %ret_597 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_599 = arith.xori %left_idx_586, %right_idx_587 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_600 = arith.select %cond_595, %new_idxs_599, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_601 = arith.xori %new_idxs_565, %new_idxs_600 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_602 = tt.reshape %ret_598 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc263) + %ileft_603 = arith.muli %y_602, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc264) + %ileft_604 = "tt.reduce"(%ileft_603) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc310) + %ileft_605 = tt.expand_dims %ileft_604 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc266) + %ileft_606 = tt.broadcast %ileft_605 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc267) + %iright_607 = arith.muli %y_602, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc268) + %iright_608 = "tt.reduce"(%iright_607) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc312) + %iright_609 = tt.expand_dims %iright_608 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc270) + %iright_610 = tt.broadcast %iright_609 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc271) + %ileft_611 = tt.reshape %ileft_606 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_612 = tt.reshape %iright_610 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_613 = tt.reshape %new_idxs_601 : tensor<32x16xi32, #blocked> -> tensor<32x2x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = arith.muli %y_idx_613, %ileft_270 : tensor<32x2x8xi32, #blocked1> loc(#loc286) + %left_idx_615 = "tt.reduce"(%left_idx_614) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc314) + %left_idx_616 = tt.expand_dims %left_idx_615 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc288) + %left_idx_617 = tt.broadcast %left_idx_616 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = arith.muli %y_idx_613, %flip_158 : tensor<32x2x8xi32, #blocked1> loc(#loc290) + %right_idx_619 = "tt.reduce"(%right_idx_618) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<32x2x8xi32, #blocked1>) -> tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc316) + %right_idx_620 = tt.expand_dims %right_idx_619 {axis = 1 : i32} : tensor<32x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1x8xi32, #blocked1> loc(#loc292) + %right_idx_621 = tt.broadcast %right_idx_620 : tensor<32x1x8xi32, #blocked1> -> tensor<32x2x8xi32, #blocked1> loc(#loc293) + %left_idx_622 = tt.reshape %left_idx_617 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_623 = tt.reshape %right_idx_621 : tensor<32x2x8xi32, #blocked1> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_624 = arith.cmpi slt, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_625 = arith.cmpi eq, %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_626 = arith.cmpi sgt, %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_627 = arith.andi %eq_625, %cond_626 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_628 = arith.ori %cond_624, %cond_627 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_629 = arith.xori %ileft_611, %iright_612 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_630 = arith.select %cond_628, %ret_629, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_631 = arith.xori %ret_598, %ret_630 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_632 = arith.xori %left_idx_622, %right_idx_623 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_633 = arith.select %cond_628, %new_idxs_632, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_634 = arith.xori %new_idxs_601, %new_idxs_633 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_635 = tt.reshape %ret_631 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc263) + %ileft_636 = arith.muli %y_635, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc264) + %ileft_637 = "tt.reduce"(%ileft_636) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc310) + %ileft_638 = tt.expand_dims %ileft_637 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc266) + %ileft_639 = tt.broadcast %ileft_638 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc267) + %iright_640 = arith.muli %y_635, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc268) + %iright_641 = "tt.reduce"(%iright_640) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc312) + %iright_642 = tt.expand_dims %iright_641 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc270) + %iright_643 = tt.broadcast %iright_642 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc271) + %ileft_644 = tt.reshape %ileft_639 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_645 = tt.reshape %iright_643 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_646 = tt.reshape %new_idxs_634 : tensor<32x16xi32, #blocked> -> tensor<64x2x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = arith.muli %y_idx_646, %ileft_161 : tensor<64x2x4xi32, #blocked2> loc(#loc286) + %left_idx_648 = "tt.reduce"(%left_idx_647) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc314) + %left_idx_649 = tt.expand_dims %left_idx_648 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc288) + %left_idx_650 = tt.broadcast %left_idx_649 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = arith.muli %y_idx_646, %flip_83 : tensor<64x2x4xi32, #blocked2> loc(#loc290) + %right_idx_652 = "tt.reduce"(%right_idx_651) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<64x2x4xi32, #blocked2>) -> tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc316) + %right_idx_653 = tt.expand_dims %right_idx_652 {axis = 1 : i32} : tensor<64x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<64x1x4xi32, #blocked2> loc(#loc292) + %right_idx_654 = tt.broadcast %right_idx_653 : tensor<64x1x4xi32, #blocked2> -> tensor<64x2x4xi32, #blocked2> loc(#loc293) + %left_idx_655 = tt.reshape %left_idx_650 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_656 = tt.reshape %right_idx_654 : tensor<64x2x4xi32, #blocked2> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_657 = arith.cmpi slt, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_658 = arith.cmpi eq, %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_659 = arith.cmpi sgt, %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_660 = arith.andi %eq_658, %cond_659 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_661 = arith.ori %cond_657, %cond_660 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_662 = arith.xori %ileft_644, %iright_645 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_663 = arith.select %cond_661, %ret_662, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_664 = arith.xori %ret_631, %ret_663 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_665 = arith.xori %left_idx_655, %right_idx_656 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_666 = arith.select %cond_661, %new_idxs_665, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_667 = arith.xori %new_idxs_634, %new_idxs_666 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_668 = tt.reshape %ret_664 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc263) + %ileft_669 = arith.muli %y_668, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc264) + %ileft_670 = "tt.reduce"(%ileft_669) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc310) + %ileft_671 = tt.expand_dims %ileft_670 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc266) + %ileft_672 = tt.broadcast %ileft_671 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc267) + %iright_673 = arith.muli %y_668, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc268) + %iright_674 = "tt.reduce"(%iright_673) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc312) + %iright_675 = tt.expand_dims %iright_674 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc270) + %iright_676 = tt.broadcast %iright_675 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc271) + %ileft_677 = tt.reshape %ileft_672 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_678 = tt.reshape %iright_676 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_679 = tt.reshape %new_idxs_667 : tensor<32x16xi32, #blocked> -> tensor<128x2x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = arith.muli %y_idx_679, %ileft_86 : tensor<128x2x2xi32, #blocked3> loc(#loc286) + %left_idx_681 = "tt.reduce"(%left_idx_680) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc314) + %left_idx_682 = tt.expand_dims %left_idx_681 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc288) + %left_idx_683 = tt.broadcast %left_idx_682 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = arith.muli %y_idx_679, %flip_43 : tensor<128x2x2xi32, #blocked3> loc(#loc290) + %right_idx_685 = "tt.reduce"(%right_idx_684) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<128x2x2xi32, #blocked3>) -> tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc316) + %right_idx_686 = tt.expand_dims %right_idx_685 {axis = 1 : i32} : tensor<128x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1x2xi32, #blocked3> loc(#loc292) + %right_idx_687 = tt.broadcast %right_idx_686 : tensor<128x1x2xi32, #blocked3> -> tensor<128x2x2xi32, #blocked3> loc(#loc293) + %left_idx_688 = tt.reshape %left_idx_683 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_689 = tt.reshape %right_idx_687 : tensor<128x2x2xi32, #blocked3> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_690 = arith.cmpi slt, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_691 = arith.cmpi eq, %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_692 = arith.cmpi sgt, %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_693 = arith.andi %eq_691, %cond_692 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_694 = arith.ori %cond_690, %cond_693 : tensor<32x16xi1, #blocked> loc(#loc277) + %ret_695 = arith.xori %ileft_677, %iright_678 : tensor<32x16xi32, #blocked> loc(#loc280) + %ret_696 = arith.select %cond_694, %ret_695, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc281) + %ret_697 = arith.xori %ret_664, %ret_696 : tensor<32x16xi32, #blocked> loc(#loc282) + %new_idxs_698 = arith.xori %left_idx_688, %right_idx_689 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_699 = arith.select %cond_694, %new_idxs_698, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_700 = arith.xori %new_idxs_667, %new_idxs_699 : tensor<32x16xi32, #blocked> loc(#loc284) + %y_701 = tt.reshape %ret_697 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc263) + %ileft_702 = arith.muli %y_701, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc264) + %ileft_703 = "tt.reduce"(%ileft_702) <{axis = 1 : i32}> ({ + ^bb0(%ileft_741: i32 loc(callsite(#loc1 at #loc265)), %ileft_742: i32 loc(callsite(#loc1 at #loc265))): + %ileft_743 = arith.addi %ileft_741, %ileft_742 : i32 loc(#loc322) + tt.reduce.return %ileft_743 : i32 loc(#loc310) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc310) + %ileft_704 = tt.expand_dims %ileft_703 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc266) + %ileft_705 = tt.broadcast %ileft_704 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc267) + %iright_706 = arith.muli %y_701, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc268) + %iright_707 = "tt.reduce"(%iright_706) <{axis = 1 : i32}> ({ + ^bb0(%iright_741: i32 loc(callsite(#loc1 at #loc269)), %iright_742: i32 loc(callsite(#loc1 at #loc269))): + %iright_743 = arith.addi %iright_741, %iright_742 : i32 loc(#loc323) + tt.reduce.return %iright_743 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc312) + %iright_708 = tt.expand_dims %iright_707 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc270) + %iright_709 = tt.broadcast %iright_708 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc271) + %ileft_710 = tt.reshape %ileft_705 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc272) + %iright_711 = tt.reshape %iright_709 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc273) + %y_idx_712 = tt.reshape %new_idxs_700 : tensor<32x16xi32, #blocked> -> tensor<256x2x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = arith.muli %y_idx_712, %ileft : tensor<256x2x1xi32, #blocked4> loc(#loc286) + %left_idx_714 = "tt.reduce"(%left_idx_713) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_741: i32 loc(callsite(#loc1 at #loc287)), %left_idx_742: i32 loc(callsite(#loc1 at #loc287))): + %left_idx_743 = arith.addi %left_idx_741, %left_idx_742 : i32 loc(#loc324) + tt.reduce.return %left_idx_743 : i32 loc(#loc314) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc314) + %left_idx_715 = tt.expand_dims %left_idx_714 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc288) + %left_idx_716 = tt.broadcast %left_idx_715 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = arith.muli %y_idx_712, %iright : tensor<256x2x1xi32, #blocked4> loc(#loc290) + %right_idx_718 = "tt.reduce"(%right_idx_717) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_741: i32 loc(callsite(#loc1 at #loc291)), %right_idx_742: i32 loc(callsite(#loc1 at #loc291))): + %right_idx_743 = arith.addi %right_idx_741, %right_idx_742 : i32 loc(#loc325) + tt.reduce.return %right_idx_743 : i32 loc(#loc316) + }) : (tensor<256x2x1xi32, #blocked4>) -> tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc316) + %right_idx_719 = tt.expand_dims %right_idx_718 {axis = 1 : i32} : tensor<256x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<256x1x1xi32, #blocked4> loc(#loc292) + %right_idx_720 = tt.broadcast %right_idx_719 : tensor<256x1x1xi32, #blocked4> -> tensor<256x2x1xi32, #blocked4> loc(#loc293) + %left_idx_721 = tt.reshape %left_idx_716 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc294) + %right_idx_722 = tt.reshape %right_idx_720 : tensor<256x2x1xi32, #blocked4> -> tensor<32x16xi32, #blocked> loc(#loc295) + %cond_723 = arith.cmpi slt, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc274) + %eq_724 = arith.cmpi eq, %ileft_710, %iright_711 : tensor<32x16xi32, #blocked> loc(#loc275) + %cond_725 = arith.cmpi sgt, %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc296) + %cond_726 = arith.andi %eq_724, %cond_725 : tensor<32x16xi1, #blocked> loc(#loc276) + %cond_727 = arith.ori %cond_723, %cond_726 : tensor<32x16xi1, #blocked> loc(#loc277) + %new_idxs_728 = arith.xori %left_idx_721, %right_idx_722 : tensor<32x16xi32, #blocked> loc(#loc297) + %new_idxs_729 = arith.select %cond_727, %new_idxs_728, %cst_9 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc283) + %new_idxs_730 = arith.xori %new_idxs_700, %new_idxs_729 : tensor<32x16xi32, #blocked> loc(#loc284) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc219) + %tmp23 = arith.select %tmp0_29, %tmp20, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc191) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_741: i64 loc(callsite(#loc1 at #loc192)), %tmp24_742: i64 loc(callsite(#loc1 at #loc192))): + %tmp24_743 = arith.addi %tmp24_741, %tmp24_742 : i64 loc(#loc298) + tt.reduce.return %tmp24_743 : i64 loc(#loc220) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc220) + %tmp30 = ttg.convert_layout %tmp24 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc193) + %tmp24_731 = tt.expand_dims %tmp30 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc194) + %tmp24_732 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc194) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1, #blocked> to tensor<32x16xi64, #blocked> loc(#loc222) + %tmp28 = arith.select %tmp0_29, %tmp25, %cst_13 : tensor<32x16xi1, #blocked>, tensor<32x16xi64, #blocked> loc(#loc196) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_741: i64 loc(callsite(#loc1 at #loc197)), %tmp29_742: i64 loc(callsite(#loc1 at #loc197))): + %tmp29_743 = arith.addi %tmp29_741, %tmp29_742 : i64 loc(#loc299) + tt.reduce.return %tmp29_743 : i64 loc(#loc223) + }) : (tensor<32x16xi64, #blocked>) -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc223) + %tmp31 = ttg.convert_layout %tmp29 : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc198) + %tmp29_733 = tt.expand_dims %tmp31 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<32x1xi64, #blocked5> loc(#loc199) + %tmp29_734 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi64, #blocked> loc(#loc199) + %tmp30_735 = arith.trunci %tmp24_731 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc193) + %tmp30_736 = arith.trunci %tmp24_732 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc193) + %tmp31_737 = arith.trunci %tmp29_733 : tensor<32x1xi64, #blocked5> to tensor<32x1xi32, #blocked5> loc(#loc198) + %tmp31_738 = arith.trunci %tmp29_734 : tensor<32x1xi64, #blocked> to tensor<32x1xi32, #blocked> loc(#loc198) + %tmp34 = tt.broadcast %tmp30_736 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc200) + %tmp34_739 = arith.cmpi slt, %tmp0_24, %tmp34 : tensor<32x16xi32, #blocked> loc(#loc200) + %tmp36 = arith.select %tmp34_739, %new_idxs_398, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc201) + %tmp38 = arith.addi %tmp36, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc202) + %tmp39 = arith.cmpi slt, %tmp36, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc203) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc204) + %0 = arith.cmpi sge, %tmp40, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc85) + %1 = arith.cmpi slt, %tmp40, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc86) + %2 = arith.andi %0, %1 : tensor<32x16xi1, #blocked> loc(#loc87) + %3 = arith.xori %xmask, %cst : tensor<32x1xi1, #blocked> loc(#loc88) + %4 = tt.broadcast %3 : tensor<32x1xi1, #blocked> -> tensor<32x16xi1, #blocked> loc(#loc89) + %5 = arith.ori %2, %4 : tensor<32x16xi1, #blocked> loc(#loc89) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1, #blocked> loc(#loc90) + %tmp45 = tt.broadcast %tmp31_738 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc205) + %tmp45_740 = arith.cmpi slt, %tmp0_24, %tmp45 : tensor<32x16xi32, #blocked> loc(#loc205) + %tmp46 = arith.select %tmp45_740, %new_idxs_730, %cst_11 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc206) + %tmp47 = arith.addi %tmp46, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc207) + %tmp48 = arith.cmpi slt, %tmp46, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc208) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1, #blocked>, tensor<32x16xi32, #blocked> loc(#loc209) + %6 = arith.cmpi sge, %tmp49, %cst_9 : tensor<32x16xi32, #blocked> loc(#loc96) + %7 = arith.cmpi slt, %tmp49, %cst_10 : tensor<32x16xi32, #blocked> loc(#loc97) + %8 = arith.andi %6, %7 : tensor<32x16xi1, #blocked> loc(#loc98) + %9 = arith.ori %8, %4 : tensor<32x16xi1, #blocked> loc(#loc99) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1, #blocked> loc(#loc100) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc101) + %11 = tt.addptr %10, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc101) + tt.store %11, %tmp30_735, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc102) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked5> loc(#loc103) + %13 = tt.addptr %12, %xindex_21 : tensor<32x1x!tt.ptr, #blocked5>, tensor<32x1xi32, #blocked5> loc(#loc103) + tt.store %13, %tmp31_737, %xmask_22 : tensor<32x1x!tt.ptr, #blocked5> loc(#loc104) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc105) + %15 = tt.addptr %14, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc105) + tt.store %15, %new_idxs_398, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc106) + %16 = arith.muli %xindex_20, %cst_0 : tensor<32x1xi32, #blocked> loc(#loc107) + %17 = tt.broadcast %16 : tensor<32x1xi32, #blocked> -> tensor<32x16xi32, #blocked> loc(#loc108) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32, #blocked> loc(#loc108) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc109) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc109) + %21 = ttg.convert_layout %20 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + tt.store %21, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc110) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc111) + %23 = tt.addptr %22, %tmp0_26 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc111) + tt.store %23, %new_idxs_730, %tmp0_29 : tensor<32x16x!tt.ptr, #blocked> loc(#loc112) + %24 = arith.addi %tmp49, %17 : tensor<32x16xi32, #blocked> loc(#loc113) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> loc(#loc114) + %26 = tt.addptr %25, %24 : tensor<32x16x!tt.ptr, #blocked>, tensor<32x16xi32, #blocked> loc(#loc114) + %27 = ttg.convert_layout %26 : tensor<32x16x!tt.ptr, #blocked> -> tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.store %27, %cst_8, %tmp0_30 : tensor<32x16x!tt.ptr, #blocked5> loc(#loc115) + tt.return loc(#loc116) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":26:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":27:38) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:40) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:30) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:45) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":36:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":38:18) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":39:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":41:19) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":40:19) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":43:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":45:34) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":47:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":49:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":48:21) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":52:20) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":54:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":60:21) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:29) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":56:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":58:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":61:21) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:29) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":64:19) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":66:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":68:20) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":69:20) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":70:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:28) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:46) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:38) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:55) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:53) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:63) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":75:19) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":76:35) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":77:20) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":78:20) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":79:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:28) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:46) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:38) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:53) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:63) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:37) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:47) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:52) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:49) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:85) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:25) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:47) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:49) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:85) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:4) +#loc126 = loc("xoffset"(#loc2)) +#loc127 = loc("xoffset"(#loc3)) +#loc128 = loc("xindex"(#loc4)) +#loc129 = loc("xindex"(#loc5)) +#loc130 = loc("xmask"(#loc6)) +#loc131 = loc("r0_index"(#loc7)) +#loc132 = loc("tmp0"(#loc8)) +#loc133 = loc("tmp0"(#loc9)) +#loc134 = loc("tmp0"(#loc10)) +#loc135 = loc("tmp0"(#loc11)) +#loc136 = loc("tmp2"(#loc12)) +#loc137 = loc("tmp4"(#loc13)) +#loc138 = loc("tmp5"(#loc14)) +#loc139 = loc("tmp7"(#loc15)) +#loc140 = loc("tmp6"(#loc16)) +#loc141 = loc("tmp9"(#loc17)) +#loc142 = loc("tmp11"(#loc18)) +#loc143 = loc("flip"(#loc19)) +#loc145 = loc("flip"(#loc22)) +#loc146 = loc("flip"(#loc23)) +#loc147 = loc("y"(#loc24)) +#loc148 = loc("left_mask"(#loc26)) +#loc149 = loc("ileft"(#loc27)) +#loc151 = loc("ileft"(#loc31)) +#loc152 = loc("ileft"(#loc32)) +#loc153 = loc("iright"(#loc33)) +#loc155 = loc("iright"(#loc35)) +#loc156 = loc("iright"(#loc36)) +#loc157 = loc("ileft"(#loc37)) +#loc158 = loc("iright"(#loc38)) +#loc159 = loc("y_idx"(#loc39)) +#loc160 = loc("left_idx"(#loc40)) +#loc161 = loc("left_idx"(#loc41)) +#loc162 = loc("input"(#loc42)) +#loc164 = loc("left_idx"(#loc44)) +#loc165 = loc("left_idx"(#loc45)) +#loc166 = loc("right_idx"(#loc46)) +#loc167 = loc("right_idx"(#loc47)) +#loc169 = loc("right_idx"(#loc49)) +#loc170 = loc("right_idx"(#loc50)) +#loc171 = loc("left_idx"(#loc51)) +#loc172 = loc("right_idx"(#loc52)) +#loc173 = loc("cond"(#loc53)) +#loc174 = loc("eq"(#loc54)) +#loc175 = loc("cond"(#loc55)) +#loc176 = loc("cond"(#loc56)) +#loc177 = loc("cond"(#loc57)) +#loc178 = loc("cond"(#loc58)) +#loc179 = loc("cond"(#loc59)) +#loc180 = loc("ret"(#loc60)) +#loc181 = loc("ret"(#loc61)) +#loc182 = loc("ret"(#loc62)) +#loc183 = loc("new_idxs"(#loc63)) +#loc184 = loc("new_idxs"(#loc64)) +#loc185 = loc("new_idxs"(#loc65)) +#loc186 = loc("tmp14"(#loc66)) +#loc187 = loc("tmp16"(#loc67)) +#loc188 = loc("tmp15"(#loc68)) +#loc190 = loc("tmp20"(#loc70)) +#loc191 = loc("tmp23"(#loc71)) +#loc193 = loc("tmp30"(#loc73)) +#loc194 = loc("tmp24"(#loc74)) +#loc195 = loc("tmp25"(#loc75)) +#loc196 = loc("tmp28"(#loc76)) +#loc198 = loc("tmp31"(#loc78)) +#loc199 = loc("tmp29"(#loc79)) +#loc200 = loc("tmp34"(#loc80)) +#loc201 = loc("tmp36"(#loc81)) +#loc202 = loc("tmp38"(#loc82)) +#loc203 = loc("tmp39"(#loc83)) +#loc204 = loc("tmp40"(#loc84)) +#loc205 = loc("tmp45"(#loc91)) +#loc206 = loc("tmp46"(#loc92)) +#loc207 = loc("tmp47"(#loc93)) +#loc208 = loc("tmp48"(#loc94)) +#loc209 = loc("tmp49"(#loc95)) +#loc210 = loc(fused[#loc139, #loc140]) +#loc211 = loc(callsite(#loc143 at #loc144)) +#loc212 = loc(callsite(#loc145 at #loc144)) +#loc213 = loc(callsite(#loc146 at #loc144)) +#loc215 = loc("cond"(#loc173)) +#loc216 = loc("eq"(#loc174)) +#loc217 = loc(fused[#loc187, #loc188]) +#loc219 = loc(fused[#loc190, #loc139, #loc140]) +#loc220 = loc(callsite(#loc28 at #loc192)) +#loc222 = loc(fused[#loc195, #loc187, #loc188]) +#loc223 = loc(callsite(#loc28 at #loc197)) +#loc225 = loc(callsite(#loc147 at #loc214)) +#loc226 = loc(callsite(#loc148 at #loc214)) +#loc227 = loc(callsite(#loc149 at #loc214)) +#loc229 = loc(callsite(#loc151 at #loc214)) +#loc230 = loc(callsite(#loc152 at #loc214)) +#loc231 = loc(callsite(#loc153 at #loc214)) +#loc233 = loc(callsite(#loc155 at #loc214)) +#loc234 = loc(callsite(#loc156 at #loc214)) +#loc235 = loc(callsite(#loc157 at #loc214)) +#loc236 = loc(callsite(#loc158 at #loc214)) +#loc237 = loc(callsite(#loc159 at #loc214)) +#loc238 = loc(callsite(#loc160 at #loc214)) +#loc239 = loc(callsite(#loc161 at #loc214)) +#loc241 = loc(callsite(#loc164 at #loc214)) +#loc242 = loc(callsite(#loc165 at #loc214)) +#loc243 = loc(callsite(#loc166 at #loc214)) +#loc244 = loc(callsite(#loc167 at #loc214)) +#loc246 = loc(callsite(#loc169 at #loc214)) +#loc247 = loc(callsite(#loc170 at #loc214)) +#loc248 = loc(callsite(#loc171 at #loc214)) +#loc249 = loc(callsite(#loc172 at #loc214)) +#loc250 = loc(callsite(#loc215 at #loc214)) +#loc251 = loc(callsite(#loc216 at #loc214)) +#loc252 = loc(callsite(#loc175 at #loc214)) +#loc253 = loc(callsite(#loc176 at #loc214)) +#loc254 = loc(callsite(#loc177 at #loc214)) +#loc255 = loc(callsite(#loc178 at #loc214)) +#loc256 = loc(callsite(#loc179 at #loc214)) +#loc257 = loc(callsite(#loc180 at #loc214)) +#loc258 = loc(callsite(#loc181 at #loc214)) +#loc259 = loc(callsite(#loc182 at #loc214)) +#loc260 = loc(callsite(#loc183 at #loc214)) +#loc261 = loc(callsite(#loc184 at #loc214)) +#loc262 = loc(callsite(#loc185 at #loc214)) +#loc263 = loc(callsite(#loc147 at #loc218)) +#loc264 = loc(callsite(#loc149 at #loc218)) +#loc266 = loc(callsite(#loc151 at #loc218)) +#loc267 = loc(callsite(#loc152 at #loc218)) +#loc268 = loc(callsite(#loc153 at #loc218)) +#loc270 = loc(callsite(#loc155 at #loc218)) +#loc271 = loc(callsite(#loc156 at #loc218)) +#loc272 = loc(callsite(#loc157 at #loc218)) +#loc273 = loc(callsite(#loc158 at #loc218)) +#loc274 = loc(callsite(#loc215 at #loc218)) +#loc275 = loc(callsite(#loc216 at #loc218)) +#loc276 = loc(callsite(#loc176 at #loc218)) +#loc277 = loc(callsite(#loc177 at #loc218)) +#loc278 = loc(callsite(#loc178 at #loc218)) +#loc279 = loc(callsite(#loc179 at #loc218)) +#loc280 = loc(callsite(#loc180 at #loc218)) +#loc281 = loc(callsite(#loc181 at #loc218)) +#loc282 = loc(callsite(#loc182 at #loc218)) +#loc283 = loc(callsite(#loc184 at #loc218)) +#loc284 = loc(callsite(#loc185 at #loc218)) +#loc285 = loc(callsite(#loc159 at #loc218)) +#loc286 = loc(callsite(#loc161 at #loc218)) +#loc288 = loc(callsite(#loc164 at #loc218)) +#loc289 = loc(callsite(#loc165 at #loc218)) +#loc290 = loc(callsite(#loc167 at #loc218)) +#loc292 = loc(callsite(#loc169 at #loc218)) +#loc293 = loc(callsite(#loc170 at #loc218)) +#loc294 = loc(callsite(#loc171 at #loc218)) +#loc295 = loc(callsite(#loc172 at #loc218)) +#loc296 = loc(callsite(#loc175 at #loc218)) +#loc297 = loc(callsite(#loc183 at #loc218)) +#loc298 = loc(callsite(#loc30 at #loc220)) +#loc299 = loc(callsite(#loc30 at #loc223)) +#loc300 = loc(callsite(#loc28 at #loc228)) +#loc302 = loc(callsite(#loc28 at #loc232)) +#loc304 = loc(callsite(#loc162 at #loc240)) +#loc305 = loc(callsite(#loc28 at #loc240)) +#loc307 = loc(callsite(#loc162 at #loc245)) +#loc308 = loc(callsite(#loc28 at #loc245)) +#loc310 = loc(callsite(#loc28 at #loc265)) +#loc312 = loc(callsite(#loc28 at #loc269)) +#loc314 = loc(callsite(#loc28 at #loc287)) +#loc316 = loc(callsite(#loc28 at #loc291)) +#loc318 = loc(callsite(#loc30 at #loc300)) +#loc319 = loc(callsite(#loc30 at #loc302)) +#loc320 = loc(callsite(#loc30 at #loc305)) +#loc321 = loc(callsite(#loc30 at #loc308)) +#loc322 = loc(callsite(#loc30 at #loc310)) +#loc323 = loc(callsite(#loc30 at #loc312)) +#loc324 = loc(callsite(#loc30 at #loc314)) +#loc325 = loc(callsite(#loc30 at #loc316)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ef3bad67983cd7df32546544eef4fa7172b440f9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/EFSFIQ3KOQUXLGL22JESW5DLVY6LJWVBODRP6EOX4ISS2B62HCMA/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1451 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":18:0) +#loc1 = loc(unknown) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":46:71) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":51:71) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:26) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:26) +#loc120 = loc("in_ptr0"(#loc)) +#loc121 = loc("out_ptr4"(#loc)) +#loc122 = loc("out_ptr5"(#loc)) +#loc123 = loc("out_ptr6"(#loc)) +#loc124 = loc("out_ptr7"(#loc)) +#loc125 = loc("out_ptr8"(#loc)) +#loc126 = loc("out_ptr9"(#loc)) +#loc127 = loc("xnumel"(#loc)) +#loc128 = loc("r0_numel"(#loc)) +#loc149 = loc(callsite(#loc22 at #loc23)) +#loc156 = loc("ileft"(#loc32)) +#loc160 = loc("iright"(#loc37)) +#loc169 = loc("left_idx"(#loc46)) +#loc174 = loc("right_idx"(#loc51)) +#loc195 = loc(callsite(#loc22 at #loc72)) +#loc198 = loc("tmp24"(#loc75)) +#loc202 = loc("tmp29"(#loc79)) +#loc221 = loc(callsite(#loc28 at #loc149)) +#loc225 = loc(callsite(#loc28 at #loc195)) +#loc228 = loc(callsite(#loc1 at #loc198)) +#loc231 = loc(callsite(#loc1 at #loc202)) +#loc235 = loc(callsite(#loc156 at #loc221)) +#loc239 = loc(callsite(#loc160 at #loc221)) +#loc247 = loc(callsite(#loc169 at #loc221)) +#loc252 = loc(callsite(#loc174 at #loc221)) +#loc272 = loc(callsite(#loc156 at #loc225)) +#loc276 = loc(callsite(#loc160 at #loc225)) +#loc294 = loc(callsite(#loc169 at #loc225)) +#loc298 = loc(callsite(#loc174 at #loc225)) +#loc308 = loc(callsite(#loc1 at #loc235)) +#loc310 = loc(callsite(#loc1 at #loc239)) +#loc313 = loc(callsite(#loc1 at #loc247)) +#loc316 = loc(callsite(#loc1 at #loc252)) +#loc318 = loc(callsite(#loc1 at #loc272)) +#loc320 = loc(callsite(#loc1 at #loc276)) +#loc322 = loc(callsite(#loc1 at #loc294)) +#loc324 = loc(callsite(#loc1 at #loc298)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<32x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<17> : tensor<32x1xi32> loc(#loc1) + %cst_2 = arith.constant dense : tensor<32x1xi1> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<32x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<17> : tensor<32x16xi32> loc(#loc1) + %cst_5 = arith.constant dense<16> : tensor<32x16xi32> loc(#loc1) + %cst_6 = arith.constant dense<16384> : tensor<32x16xi64> loc(#loc1) + %cst_7 = arith.constant dense<0> : tensor<32x16xi64> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<32x1xi32> loc(#loc1) + %xmask = arith.constant dense<128> : tensor<32x1xi32> loc(#loc129) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc130) + %xoffset_9 = arith.muli %xoffset, %c32_i32 : i32 loc(#loc131) + %xindex = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc132) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32> loc(#loc133) + %xindex_11 = tt.splat %xoffset_9 : i32 -> tensor<32x1xi32> loc(#loc134) + %xindex_12 = arith.addi %xindex_11, %xindex_10 : tensor<32x1xi32> loc(#loc134) + %xmask_13 = arith.cmpi slt, %xindex_12, %xmask : tensor<32x1xi32> loc(#loc129) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc135) + %r0_index_14 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc136) + %tmp0 = arith.muli %xindex_12, %cst_8 : tensor<32x1xi32> loc(#loc137) + %tmp0_15 = tt.broadcast %r0_index_14 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_16 = tt.broadcast %tmp0 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc138) + %tmp0_17 = arith.addi %tmp0_15, %tmp0_16 : tensor<32x16xi32> loc(#loc138) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc139) + %tmp0_19 = tt.addptr %tmp0_18, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc139) + %tmp0_20 = tt.broadcast %xmask_13 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc140) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_7 : tensor<32x16x!tt.ptr> loc(#loc140) + %tmp2 = arith.cmpi sgt, %tmp0_21, %cst_7 : tensor<32x16xi64> loc(#loc141) + %tmp4 = arith.cmpi slt, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc142) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<32x16xi1> loc(#loc143) + %tmp7 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc216) + %tmp9 = arith.trunci %r0_index_14 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc146) + %tmp11 = tt.broadcast %tmp9 : tensor<1x16xi16> -> tensor<32x16xi16> loc(#loc147) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc217) + %flip_22 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc218) + %flip_23 = tt.expand_dims %flip_22 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc218) + %flip_24 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc219) + %flip_25 = tt.reshape %flip_24 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc220) + %y = tt.reshape %tmp7 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %left_mask = arith.subi %cst, %flip_23 : tensor<1x2x1xi32> loc(#loc233) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc234) + %ileft_26 = arith.muli %y, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_27 = "tt.reduce"(%ileft_26) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_28 = tt.expand_dims %ileft_27 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_29 = tt.broadcast %ileft_28 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<256x2x1xi32> loc(#loc238) + %iright_30 = arith.muli %y, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_31 = "tt.reduce"(%iright_30) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_32 = tt.expand_dims %iright_31 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_33 = tt.broadcast %iright_32 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_34 = tt.reshape %ileft_29 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_35 = tt.reshape %iright_33 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx = tt.reshape %tmp11 : tensor<32x16xi16> -> tensor<256x2x1xi16> loc(#loc244) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc245) + %left_idx_36 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc246) + %left_idx_37 = arith.muli %y_idx, %left_idx_36 : tensor<256x2x1xi16> loc(#loc246) + %input = arith.extsi %left_idx_37 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc311) + %left_idx_38 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_39 = tt.expand_dims %left_idx_38 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_40 = tt.broadcast %left_idx_39 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx = arith.trunci %flip_23 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc250) + %right_idx_41 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<256x2x1xi16> loc(#loc251) + %right_idx_42 = arith.muli %y_idx, %right_idx_41 : tensor<256x2x1xi16> loc(#loc251) + %input_43 = arith.extsi %right_idx_42 : tensor<256x2x1xi16> to tensor<256x2x1xi32> loc(#loc314) + %right_idx_44 = "tt.reduce"(%input_43) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_45 = tt.expand_dims %right_idx_44 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_46 = tt.broadcast %right_idx_45 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_47 = tt.reshape %left_idx_40 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_48 = tt.reshape %right_idx_46 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond = arith.cmpi slt, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc257) + %eq = arith.cmpi eq, %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc258) + %cond_49 = arith.cmpi sgt, %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc259) + %cond_50 = arith.andi %eq, %cond_49 : tensor<32x16xi1> loc(#loc260) + %cond_51 = arith.ori %cond, %cond_50 : tensor<32x16xi1> loc(#loc261) + %cond_52 = arith.extui %cond_51 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_53 = arith.xori %cond_52, %flip_25 : tensor<32x16xi32> loc(#loc262) + %cond_54 = arith.cmpi ne, %cond_53, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret = arith.xori %ileft_34, %iright_35 : tensor<32x16xi32> loc(#loc264) + %ret_55 = arith.select %cond_54, %ret, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_56 = arith.xori %tmp7, %ret_55 : tensor<32x16xi32> loc(#loc266) + %new_idxs = arith.xori %left_idx_47, %right_idx_48 : tensor<32x16xi32> loc(#loc267) + %new_idxs_57 = arith.select %cond_54, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_58 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc269) + %new_idxs_59 = tt.broadcast %new_idxs_58 : tensor<1x16xi32> -> tensor<32x16xi32> loc(#loc269) + %new_idxs_60 = arith.xori %new_idxs_59, %new_idxs_57 : tensor<32x16xi32> loc(#loc269) + %flip_61 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc219) + %flip_62 = tt.reshape %flip_61 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc220) + %y_63 = tt.reshape %ret_56 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_64 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<128x2x2xi32> loc(#loc234) + %ileft_65 = arith.muli %y_63, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_66 = "tt.reduce"(%ileft_65) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_67 = tt.expand_dims %ileft_66 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_68 = tt.broadcast %ileft_67 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_69 = arith.muli %y_63, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_70 = "tt.reduce"(%iright_69) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_71 = tt.expand_dims %iright_70 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_72 = tt.broadcast %iright_71 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_73 = tt.reshape %ileft_68 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_74 = tt.reshape %iright_72 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_75 = tt.reshape %new_idxs_60 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_76 = arith.muli %y_idx_75, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_77 = "tt.reduce"(%left_idx_76) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_78 = tt.expand_dims %left_idx_77 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_79 = tt.broadcast %left_idx_78 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_80 = arith.muli %y_idx_75, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_81 = "tt.reduce"(%right_idx_80) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_82 = tt.expand_dims %right_idx_81 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_83 = tt.broadcast %right_idx_82 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_84 = tt.reshape %left_idx_79 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_85 = tt.reshape %right_idx_83 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_86 = arith.cmpi slt, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc257) + %eq_87 = arith.cmpi eq, %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc258) + %cond_88 = arith.cmpi sgt, %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc259) + %cond_89 = arith.andi %eq_87, %cond_88 : tensor<32x16xi1> loc(#loc260) + %cond_90 = arith.ori %cond_86, %cond_89 : tensor<32x16xi1> loc(#loc261) + %cond_91 = arith.extui %cond_90 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_92 = arith.xori %cond_91, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_93 = arith.cmpi ne, %cond_92, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_94 = arith.xori %ileft_73, %iright_74 : tensor<32x16xi32> loc(#loc264) + %ret_95 = arith.select %cond_93, %ret_94, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_96 = arith.xori %ret_56, %ret_95 : tensor<32x16xi32> loc(#loc266) + %new_idxs_97 = arith.xori %left_idx_84, %right_idx_85 : tensor<32x16xi32> loc(#loc267) + %new_idxs_98 = arith.select %cond_93, %new_idxs_97, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_99 = arith.xori %new_idxs_60, %new_idxs_98 : tensor<32x16xi32> loc(#loc269) + %y_100 = tt.reshape %ret_96 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_101 = arith.muli %y_100, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_102 = "tt.reduce"(%ileft_101) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_103 = tt.expand_dims %ileft_102 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_104 = tt.broadcast %ileft_103 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_105 = arith.muli %y_100, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_106 = "tt.reduce"(%iright_105) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_107 = tt.expand_dims %iright_106 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_108 = tt.broadcast %iright_107 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_109 = tt.reshape %ileft_104 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_110 = tt.reshape %iright_108 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_111 = tt.reshape %new_idxs_99 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_112 = arith.muli %y_idx_111, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_113 = "tt.reduce"(%left_idx_112) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_114 = tt.expand_dims %left_idx_113 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_115 = tt.broadcast %left_idx_114 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_116 = arith.muli %y_idx_111, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_117 = "tt.reduce"(%right_idx_116) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_118 = tt.expand_dims %right_idx_117 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_119 = tt.broadcast %right_idx_118 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_120 = tt.reshape %left_idx_115 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_121 = tt.reshape %right_idx_119 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_122 = arith.cmpi slt, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc257) + %eq_123 = arith.cmpi eq, %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc258) + %cond_124 = arith.cmpi sgt, %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc259) + %cond_125 = arith.andi %eq_123, %cond_124 : tensor<32x16xi1> loc(#loc260) + %cond_126 = arith.ori %cond_122, %cond_125 : tensor<32x16xi1> loc(#loc261) + %cond_127 = arith.extui %cond_126 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_128 = arith.xori %cond_127, %flip_62 : tensor<32x16xi32> loc(#loc262) + %cond_129 = arith.cmpi ne, %cond_128, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_130 = arith.xori %ileft_109, %iright_110 : tensor<32x16xi32> loc(#loc264) + %ret_131 = arith.select %cond_129, %ret_130, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_132 = arith.xori %ret_96, %ret_131 : tensor<32x16xi32> loc(#loc266) + %new_idxs_133 = arith.xori %left_idx_120, %right_idx_121 : tensor<32x16xi32> loc(#loc267) + %new_idxs_134 = arith.select %cond_129, %new_idxs_133, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_135 = arith.xori %new_idxs_99, %new_idxs_134 : tensor<32x16xi32> loc(#loc269) + %flip_136 = tt.broadcast %flip_23 : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc219) + %flip_137 = tt.reshape %flip_136 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc220) + %y_138 = tt.reshape %ret_132 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_139 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<64x2x4xi32> loc(#loc234) + %ileft_140 = arith.muli %y_138, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_141 = "tt.reduce"(%ileft_140) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_142 = tt.expand_dims %ileft_141 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_143 = tt.broadcast %ileft_142 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_144 = arith.muli %y_138, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_145 = "tt.reduce"(%iright_144) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_146 = tt.expand_dims %iright_145 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_147 = tt.broadcast %iright_146 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_148 = tt.reshape %ileft_143 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_149 = tt.reshape %iright_147 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_150 = tt.reshape %new_idxs_135 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_151 = arith.muli %y_idx_150, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_152 = "tt.reduce"(%left_idx_151) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_153 = tt.expand_dims %left_idx_152 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_154 = tt.broadcast %left_idx_153 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_155 = arith.muli %y_idx_150, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_156 = "tt.reduce"(%right_idx_155) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_157 = tt.expand_dims %right_idx_156 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_158 = tt.broadcast %right_idx_157 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_159 = tt.reshape %left_idx_154 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_160 = tt.reshape %right_idx_158 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_161 = arith.cmpi slt, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc257) + %eq_162 = arith.cmpi eq, %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc258) + %cond_163 = arith.cmpi sgt, %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc259) + %cond_164 = arith.andi %eq_162, %cond_163 : tensor<32x16xi1> loc(#loc260) + %cond_165 = arith.ori %cond_161, %cond_164 : tensor<32x16xi1> loc(#loc261) + %cond_166 = arith.extui %cond_165 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_167 = arith.xori %cond_166, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_168 = arith.cmpi ne, %cond_167, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_169 = arith.xori %ileft_148, %iright_149 : tensor<32x16xi32> loc(#loc264) + %ret_170 = arith.select %cond_168, %ret_169, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_171 = arith.xori %ret_132, %ret_170 : tensor<32x16xi32> loc(#loc266) + %new_idxs_172 = arith.xori %left_idx_159, %right_idx_160 : tensor<32x16xi32> loc(#loc267) + %new_idxs_173 = arith.select %cond_168, %new_idxs_172, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_174 = arith.xori %new_idxs_135, %new_idxs_173 : tensor<32x16xi32> loc(#loc269) + %y_175 = tt.reshape %ret_171 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_176 = arith.muli %y_175, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_177 = "tt.reduce"(%ileft_176) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_178 = tt.expand_dims %ileft_177 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_179 = tt.broadcast %ileft_178 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_180 = arith.muli %y_175, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_181 = "tt.reduce"(%iright_180) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_182 = tt.expand_dims %iright_181 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_183 = tt.broadcast %iright_182 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_184 = tt.reshape %ileft_179 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_185 = tt.reshape %iright_183 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_186 = tt.reshape %new_idxs_174 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_187 = arith.muli %y_idx_186, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_188 = "tt.reduce"(%left_idx_187) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_189 = tt.expand_dims %left_idx_188 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_190 = tt.broadcast %left_idx_189 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_191 = arith.muli %y_idx_186, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_192 = "tt.reduce"(%right_idx_191) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_193 = tt.expand_dims %right_idx_192 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_194 = tt.broadcast %right_idx_193 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_195 = tt.reshape %left_idx_190 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_196 = tt.reshape %right_idx_194 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_197 = arith.cmpi slt, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc257) + %eq_198 = arith.cmpi eq, %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc258) + %cond_199 = arith.cmpi sgt, %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc259) + %cond_200 = arith.andi %eq_198, %cond_199 : tensor<32x16xi1> loc(#loc260) + %cond_201 = arith.ori %cond_197, %cond_200 : tensor<32x16xi1> loc(#loc261) + %cond_202 = arith.extui %cond_201 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_203 = arith.xori %cond_202, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_204 = arith.cmpi ne, %cond_203, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_205 = arith.xori %ileft_184, %iright_185 : tensor<32x16xi32> loc(#loc264) + %ret_206 = arith.select %cond_204, %ret_205, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_207 = arith.xori %ret_171, %ret_206 : tensor<32x16xi32> loc(#loc266) + %new_idxs_208 = arith.xori %left_idx_195, %right_idx_196 : tensor<32x16xi32> loc(#loc267) + %new_idxs_209 = arith.select %cond_204, %new_idxs_208, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_210 = arith.xori %new_idxs_174, %new_idxs_209 : tensor<32x16xi32> loc(#loc269) + %y_211 = tt.reshape %ret_207 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_212 = arith.muli %y_211, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_213 = "tt.reduce"(%ileft_212) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_214 = tt.expand_dims %ileft_213 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_215 = tt.broadcast %ileft_214 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_216 = arith.muli %y_211, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_217 = "tt.reduce"(%iright_216) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_218 = tt.expand_dims %iright_217 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_219 = tt.broadcast %iright_218 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_220 = tt.reshape %ileft_215 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_221 = tt.reshape %iright_219 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_222 = tt.reshape %new_idxs_210 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_223 = arith.muli %y_idx_222, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_224 = "tt.reduce"(%left_idx_223) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_225 = tt.expand_dims %left_idx_224 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_226 = tt.broadcast %left_idx_225 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_227 = arith.muli %y_idx_222, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_228 = "tt.reduce"(%right_idx_227) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_229 = tt.expand_dims %right_idx_228 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_230 = tt.broadcast %right_idx_229 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_231 = tt.reshape %left_idx_226 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_232 = tt.reshape %right_idx_230 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_233 = arith.cmpi slt, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc257) + %eq_234 = arith.cmpi eq, %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc258) + %cond_235 = arith.cmpi sgt, %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc259) + %cond_236 = arith.andi %eq_234, %cond_235 : tensor<32x16xi1> loc(#loc260) + %cond_237 = arith.ori %cond_233, %cond_236 : tensor<32x16xi1> loc(#loc261) + %cond_238 = arith.extui %cond_237 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc262) + %cond_239 = arith.xori %cond_238, %flip_137 : tensor<32x16xi32> loc(#loc262) + %cond_240 = arith.cmpi ne, %cond_239, %cst_3 : tensor<32x16xi32> loc(#loc263) + %ret_241 = arith.xori %ileft_220, %iright_221 : tensor<32x16xi32> loc(#loc264) + %ret_242 = arith.select %cond_240, %ret_241, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_243 = arith.xori %ret_207, %ret_242 : tensor<32x16xi32> loc(#loc266) + %new_idxs_244 = arith.xori %left_idx_231, %right_idx_232 : tensor<32x16xi32> loc(#loc267) + %new_idxs_245 = arith.select %cond_240, %new_idxs_244, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_246 = arith.xori %new_idxs_210, %new_idxs_245 : tensor<32x16xi32> loc(#loc269) + %y_247 = tt.reshape %ret_243 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc232) + %ileft_248 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<32x2x8xi32> loc(#loc234) + %ileft_249 = arith.muli %y_247, %ileft_248 : tensor<32x2x8xi32> loc(#loc234) + %ileft_250 = "tt.reduce"(%ileft_249) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc307) + %ileft_251 = tt.expand_dims %ileft_250 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc236) + %ileft_252 = tt.broadcast %ileft_251 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc237) + %iright_253 = arith.muli %y_247, %flip_136 : tensor<32x2x8xi32> loc(#loc238) + %iright_254 = "tt.reduce"(%iright_253) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc309) + %iright_255 = tt.expand_dims %iright_254 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc240) + %iright_256 = tt.broadcast %iright_255 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc241) + %ileft_257 = tt.reshape %ileft_252 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_258 = tt.reshape %iright_256 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_259 = tt.reshape %new_idxs_246 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc244) + %left_idx_260 = arith.muli %y_idx_259, %ileft_248 : tensor<32x2x8xi32> loc(#loc246) + %left_idx_261 = "tt.reduce"(%left_idx_260) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc312) + %left_idx_262 = tt.expand_dims %left_idx_261 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc248) + %left_idx_263 = tt.broadcast %left_idx_262 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc249) + %right_idx_264 = arith.muli %y_idx_259, %flip_136 : tensor<32x2x8xi32> loc(#loc251) + %right_idx_265 = "tt.reduce"(%right_idx_264) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc315) + %right_idx_266 = tt.expand_dims %right_idx_265 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc253) + %right_idx_267 = tt.broadcast %right_idx_266 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc254) + %left_idx_268 = tt.reshape %left_idx_263 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_269 = tt.reshape %right_idx_267 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_270 = arith.cmpi slt, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc257) + %eq_271 = arith.cmpi eq, %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc258) + %cond_272 = arith.cmpi sgt, %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc259) + %cond_273 = arith.andi %eq_271, %cond_272 : tensor<32x16xi1> loc(#loc260) + %cond_274 = arith.ori %cond_270, %cond_273 : tensor<32x16xi1> loc(#loc261) + %ret_275 = arith.xori %ileft_257, %iright_258 : tensor<32x16xi32> loc(#loc264) + %ret_276 = arith.select %cond_274, %ret_275, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_277 = arith.xori %ret_243, %ret_276 : tensor<32x16xi32> loc(#loc266) + %new_idxs_278 = arith.xori %left_idx_268, %right_idx_269 : tensor<32x16xi32> loc(#loc267) + %new_idxs_279 = arith.select %cond_274, %new_idxs_278, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_280 = arith.xori %new_idxs_246, %new_idxs_279 : tensor<32x16xi32> loc(#loc269) + %y_281 = tt.reshape %ret_277 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc232) + %ileft_282 = arith.muli %y_281, %ileft_139 : tensor<64x2x4xi32> loc(#loc234) + %ileft_283 = "tt.reduce"(%ileft_282) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc307) + %ileft_284 = tt.expand_dims %ileft_283 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc236) + %ileft_285 = tt.broadcast %ileft_284 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc237) + %iright_286 = arith.muli %y_281, %flip_61 : tensor<64x2x4xi32> loc(#loc238) + %iright_287 = "tt.reduce"(%iright_286) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc309) + %iright_288 = tt.expand_dims %iright_287 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc240) + %iright_289 = tt.broadcast %iright_288 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc241) + %ileft_290 = tt.reshape %ileft_285 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_291 = tt.reshape %iright_289 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_292 = tt.reshape %new_idxs_280 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc244) + %left_idx_293 = arith.muli %y_idx_292, %ileft_139 : tensor<64x2x4xi32> loc(#loc246) + %left_idx_294 = "tt.reduce"(%left_idx_293) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc312) + %left_idx_295 = tt.expand_dims %left_idx_294 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc248) + %left_idx_296 = tt.broadcast %left_idx_295 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc249) + %right_idx_297 = arith.muli %y_idx_292, %flip_61 : tensor<64x2x4xi32> loc(#loc251) + %right_idx_298 = "tt.reduce"(%right_idx_297) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc315) + %right_idx_299 = tt.expand_dims %right_idx_298 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc253) + %right_idx_300 = tt.broadcast %right_idx_299 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc254) + %left_idx_301 = tt.reshape %left_idx_296 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_302 = tt.reshape %right_idx_300 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_303 = arith.cmpi slt, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc257) + %eq_304 = arith.cmpi eq, %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc258) + %cond_305 = arith.cmpi sgt, %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc259) + %cond_306 = arith.andi %eq_304, %cond_305 : tensor<32x16xi1> loc(#loc260) + %cond_307 = arith.ori %cond_303, %cond_306 : tensor<32x16xi1> loc(#loc261) + %ret_308 = arith.xori %ileft_290, %iright_291 : tensor<32x16xi32> loc(#loc264) + %ret_309 = arith.select %cond_307, %ret_308, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_310 = arith.xori %ret_277, %ret_309 : tensor<32x16xi32> loc(#loc266) + %new_idxs_311 = arith.xori %left_idx_301, %right_idx_302 : tensor<32x16xi32> loc(#loc267) + %new_idxs_312 = arith.select %cond_307, %new_idxs_311, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_313 = arith.xori %new_idxs_280, %new_idxs_312 : tensor<32x16xi32> loc(#loc269) + %y_314 = tt.reshape %ret_310 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc232) + %ileft_315 = arith.muli %y_314, %ileft_64 : tensor<128x2x2xi32> loc(#loc234) + %ileft_316 = "tt.reduce"(%ileft_315) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc307) + %ileft_317 = tt.expand_dims %ileft_316 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc236) + %ileft_318 = tt.broadcast %ileft_317 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc237) + %iright_319 = arith.muli %y_314, %flip_24 : tensor<128x2x2xi32> loc(#loc238) + %iright_320 = "tt.reduce"(%iright_319) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc309) + %iright_321 = tt.expand_dims %iright_320 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc240) + %iright_322 = tt.broadcast %iright_321 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc241) + %ileft_323 = tt.reshape %ileft_318 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_324 = tt.reshape %iright_322 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_325 = tt.reshape %new_idxs_313 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc244) + %left_idx_326 = arith.muli %y_idx_325, %ileft_64 : tensor<128x2x2xi32> loc(#loc246) + %left_idx_327 = "tt.reduce"(%left_idx_326) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc312) + %left_idx_328 = tt.expand_dims %left_idx_327 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc248) + %left_idx_329 = tt.broadcast %left_idx_328 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc249) + %right_idx_330 = arith.muli %y_idx_325, %flip_24 : tensor<128x2x2xi32> loc(#loc251) + %right_idx_331 = "tt.reduce"(%right_idx_330) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc315) + %right_idx_332 = tt.expand_dims %right_idx_331 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc253) + %right_idx_333 = tt.broadcast %right_idx_332 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc254) + %left_idx_334 = tt.reshape %left_idx_329 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_335 = tt.reshape %right_idx_333 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_336 = arith.cmpi slt, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc257) + %eq_337 = arith.cmpi eq, %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc258) + %cond_338 = arith.cmpi sgt, %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc259) + %cond_339 = arith.andi %eq_337, %cond_338 : tensor<32x16xi1> loc(#loc260) + %cond_340 = arith.ori %cond_336, %cond_339 : tensor<32x16xi1> loc(#loc261) + %ret_341 = arith.xori %ileft_323, %iright_324 : tensor<32x16xi32> loc(#loc264) + %ret_342 = arith.select %cond_340, %ret_341, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc265) + %ret_343 = arith.xori %ret_310, %ret_342 : tensor<32x16xi32> loc(#loc266) + %new_idxs_344 = arith.xori %left_idx_334, %right_idx_335 : tensor<32x16xi32> loc(#loc267) + %new_idxs_345 = arith.select %cond_340, %new_idxs_344, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_346 = arith.xori %new_idxs_313, %new_idxs_345 : tensor<32x16xi32> loc(#loc269) + %y_347 = tt.reshape %ret_343 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc232) + %ileft_348 = arith.muli %y_347, %ileft : tensor<256x2x1xi32> loc(#loc234) + %ileft_349 = "tt.reduce"(%ileft_348) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc235)), %ileft_714: i32 loc(callsite(#loc1 at #loc235))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc325) + tt.reduce.return %ileft_715 : i32 loc(#loc307) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc307) + %ileft_350 = tt.expand_dims %ileft_349 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc236) + %ileft_351 = tt.broadcast %ileft_350 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc237) + %iright_352 = arith.muli %y_347, %iright : tensor<256x2x1xi32> loc(#loc238) + %iright_353 = "tt.reduce"(%iright_352) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc239)), %iright_714: i32 loc(callsite(#loc1 at #loc239))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc326) + tt.reduce.return %iright_715 : i32 loc(#loc309) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc309) + %iright_354 = tt.expand_dims %iright_353 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc240) + %iright_355 = tt.broadcast %iright_354 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc241) + %ileft_356 = tt.reshape %ileft_351 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc242) + %iright_357 = tt.reshape %iright_355 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc243) + %y_idx_358 = tt.reshape %new_idxs_346 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc244) + %left_idx_359 = arith.muli %y_idx_358, %ileft : tensor<256x2x1xi32> loc(#loc246) + %left_idx_360 = "tt.reduce"(%left_idx_359) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc247)), %left_idx_714: i32 loc(callsite(#loc1 at #loc247))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc327) + tt.reduce.return %left_idx_715 : i32 loc(#loc312) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc312) + %left_idx_361 = tt.expand_dims %left_idx_360 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc248) + %left_idx_362 = tt.broadcast %left_idx_361 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc249) + %right_idx_363 = arith.muli %y_idx_358, %iright : tensor<256x2x1xi32> loc(#loc251) + %right_idx_364 = "tt.reduce"(%right_idx_363) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc252)), %right_idx_714: i32 loc(callsite(#loc1 at #loc252))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc328) + tt.reduce.return %right_idx_715 : i32 loc(#loc315) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc315) + %right_idx_365 = tt.expand_dims %right_idx_364 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc253) + %right_idx_366 = tt.broadcast %right_idx_365 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc254) + %left_idx_367 = tt.reshape %left_idx_362 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc255) + %right_idx_368 = tt.reshape %right_idx_366 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc256) + %cond_369 = arith.cmpi slt, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc257) + %eq_370 = arith.cmpi eq, %ileft_356, %iright_357 : tensor<32x16xi32> loc(#loc258) + %cond_371 = arith.cmpi sgt, %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc259) + %cond_372 = arith.andi %eq_370, %cond_371 : tensor<32x16xi1> loc(#loc260) + %cond_373 = arith.ori %cond_369, %cond_372 : tensor<32x16xi1> loc(#loc261) + %new_idxs_374 = arith.xori %left_idx_367, %right_idx_368 : tensor<32x16xi32> loc(#loc267) + %new_idxs_375 = arith.select %cond_373, %new_idxs_374, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc268) + %new_idxs_376 = arith.xori %new_idxs_346, %new_idxs_375 : tensor<32x16xi32> loc(#loc269) + %tmp14 = arith.cmpi eq, %tmp0_21, %cst_6 : tensor<32x16xi64> loc(#loc192) + %tmp16 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc224) + %y_377 = tt.reshape %tmp16 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_378 = arith.muli %y_377, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_379 = "tt.reduce"(%ileft_378) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_380 = tt.expand_dims %ileft_379 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_381 = tt.broadcast %ileft_380 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_382 = arith.muli %y_377, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_383 = "tt.reduce"(%iright_382) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_384 = tt.expand_dims %iright_383 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_385 = tt.broadcast %iright_384 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_386 = tt.reshape %ileft_381 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_387 = tt.reshape %iright_385 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %cond_388 = arith.cmpi slt, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc281) + %eq_389 = arith.cmpi eq, %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc282) + %cond_390 = arith.andi %eq_389, %cond_49 : tensor<32x16xi1> loc(#loc283) + %cond_391 = arith.ori %cond_388, %cond_390 : tensor<32x16xi1> loc(#loc284) + %cond_392 = arith.extui %cond_391 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_393 = arith.xori %cond_392, %flip_25 : tensor<32x16xi32> loc(#loc285) + %cond_394 = arith.cmpi ne, %cond_393, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_395 = arith.xori %ileft_386, %iright_387 : tensor<32x16xi32> loc(#loc287) + %ret_396 = arith.select %cond_394, %ret_395, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_397 = arith.xori %tmp16, %ret_396 : tensor<32x16xi32> loc(#loc289) + %new_idxs_398 = arith.select %cond_394, %new_idxs, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_399 = arith.xori %new_idxs_59, %new_idxs_398 : tensor<32x16xi32> loc(#loc291) + %y_400 = tt.reshape %ret_397 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_401 = arith.muli %y_400, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_402 = "tt.reduce"(%ileft_401) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_403 = tt.expand_dims %ileft_402 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_404 = tt.broadcast %ileft_403 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_405 = arith.muli %y_400, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_406 = "tt.reduce"(%iright_405) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_407 = tt.expand_dims %iright_406 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_408 = tt.broadcast %iright_407 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_409 = tt.reshape %ileft_404 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_410 = tt.reshape %iright_408 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_411 = tt.reshape %new_idxs_399 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_412 = arith.muli %y_idx_411, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_413 = "tt.reduce"(%left_idx_412) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_414 = tt.expand_dims %left_idx_413 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_415 = tt.broadcast %left_idx_414 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_416 = arith.muli %y_idx_411, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_417 = "tt.reduce"(%right_idx_416) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_418 = tt.expand_dims %right_idx_417 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_419 = tt.broadcast %right_idx_418 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_420 = tt.reshape %left_idx_415 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_421 = tt.reshape %right_idx_419 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_422 = arith.cmpi slt, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc281) + %eq_423 = arith.cmpi eq, %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc282) + %cond_424 = arith.cmpi sgt, %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc303) + %cond_425 = arith.andi %eq_423, %cond_424 : tensor<32x16xi1> loc(#loc283) + %cond_426 = arith.ori %cond_422, %cond_425 : tensor<32x16xi1> loc(#loc284) + %cond_427 = arith.extui %cond_426 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_428 = arith.xori %cond_427, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_429 = arith.cmpi ne, %cond_428, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_430 = arith.xori %ileft_409, %iright_410 : tensor<32x16xi32> loc(#loc287) + %ret_431 = arith.select %cond_429, %ret_430, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_432 = arith.xori %ret_397, %ret_431 : tensor<32x16xi32> loc(#loc289) + %new_idxs_433 = arith.xori %left_idx_420, %right_idx_421 : tensor<32x16xi32> loc(#loc304) + %new_idxs_434 = arith.select %cond_429, %new_idxs_433, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_435 = arith.xori %new_idxs_399, %new_idxs_434 : tensor<32x16xi32> loc(#loc291) + %y_436 = tt.reshape %ret_432 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_437 = arith.muli %y_436, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_438 = "tt.reduce"(%ileft_437) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_439 = tt.expand_dims %ileft_438 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_440 = tt.broadcast %ileft_439 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_441 = arith.muli %y_436, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_442 = "tt.reduce"(%iright_441) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_443 = tt.expand_dims %iright_442 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_444 = tt.broadcast %iright_443 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_445 = tt.reshape %ileft_440 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_446 = tt.reshape %iright_444 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_447 = tt.reshape %new_idxs_435 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_448 = arith.muli %y_idx_447, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_449 = "tt.reduce"(%left_idx_448) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_450 = tt.expand_dims %left_idx_449 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_451 = tt.broadcast %left_idx_450 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_452 = arith.muli %y_idx_447, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_453 = "tt.reduce"(%right_idx_452) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_454 = tt.expand_dims %right_idx_453 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_455 = tt.broadcast %right_idx_454 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_456 = tt.reshape %left_idx_451 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_457 = tt.reshape %right_idx_455 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_458 = arith.cmpi slt, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc281) + %eq_459 = arith.cmpi eq, %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc282) + %cond_460 = arith.cmpi sgt, %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc303) + %cond_461 = arith.andi %eq_459, %cond_460 : tensor<32x16xi1> loc(#loc283) + %cond_462 = arith.ori %cond_458, %cond_461 : tensor<32x16xi1> loc(#loc284) + %cond_463 = arith.extui %cond_462 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_464 = arith.xori %cond_463, %flip_62 : tensor<32x16xi32> loc(#loc285) + %cond_465 = arith.cmpi ne, %cond_464, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_466 = arith.xori %ileft_445, %iright_446 : tensor<32x16xi32> loc(#loc287) + %ret_467 = arith.select %cond_465, %ret_466, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_468 = arith.xori %ret_432, %ret_467 : tensor<32x16xi32> loc(#loc289) + %new_idxs_469 = arith.xori %left_idx_456, %right_idx_457 : tensor<32x16xi32> loc(#loc304) + %new_idxs_470 = arith.select %cond_465, %new_idxs_469, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_471 = arith.xori %new_idxs_435, %new_idxs_470 : tensor<32x16xi32> loc(#loc291) + %y_472 = tt.reshape %ret_468 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_473 = arith.muli %y_472, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_474 = "tt.reduce"(%ileft_473) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_475 = tt.expand_dims %ileft_474 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_476 = tt.broadcast %ileft_475 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_477 = arith.muli %y_472, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_478 = "tt.reduce"(%iright_477) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_479 = tt.expand_dims %iright_478 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_480 = tt.broadcast %iright_479 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_481 = tt.reshape %ileft_476 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_482 = tt.reshape %iright_480 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_483 = tt.reshape %new_idxs_471 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_484 = arith.muli %y_idx_483, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_485 = "tt.reduce"(%left_idx_484) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_486 = tt.expand_dims %left_idx_485 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_487 = tt.broadcast %left_idx_486 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_488 = arith.muli %y_idx_483, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_489 = "tt.reduce"(%right_idx_488) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_490 = tt.expand_dims %right_idx_489 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_491 = tt.broadcast %right_idx_490 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_492 = tt.reshape %left_idx_487 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_493 = tt.reshape %right_idx_491 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_494 = arith.cmpi slt, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc281) + %eq_495 = arith.cmpi eq, %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc282) + %cond_496 = arith.cmpi sgt, %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc303) + %cond_497 = arith.andi %eq_495, %cond_496 : tensor<32x16xi1> loc(#loc283) + %cond_498 = arith.ori %cond_494, %cond_497 : tensor<32x16xi1> loc(#loc284) + %cond_499 = arith.extui %cond_498 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_500 = arith.xori %cond_499, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_501 = arith.cmpi ne, %cond_500, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_502 = arith.xori %ileft_481, %iright_482 : tensor<32x16xi32> loc(#loc287) + %ret_503 = arith.select %cond_501, %ret_502, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_504 = arith.xori %ret_468, %ret_503 : tensor<32x16xi32> loc(#loc289) + %new_idxs_505 = arith.xori %left_idx_492, %right_idx_493 : tensor<32x16xi32> loc(#loc304) + %new_idxs_506 = arith.select %cond_501, %new_idxs_505, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_507 = arith.xori %new_idxs_471, %new_idxs_506 : tensor<32x16xi32> loc(#loc291) + %y_508 = tt.reshape %ret_504 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_509 = arith.muli %y_508, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_510 = "tt.reduce"(%ileft_509) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_511 = tt.expand_dims %ileft_510 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_512 = tt.broadcast %ileft_511 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_513 = arith.muli %y_508, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_514 = "tt.reduce"(%iright_513) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_515 = tt.expand_dims %iright_514 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_516 = tt.broadcast %iright_515 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_517 = tt.reshape %ileft_512 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_518 = tt.reshape %iright_516 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_519 = tt.reshape %new_idxs_507 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_520 = arith.muli %y_idx_519, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_521 = "tt.reduce"(%left_idx_520) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_522 = tt.expand_dims %left_idx_521 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_523 = tt.broadcast %left_idx_522 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_524 = arith.muli %y_idx_519, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_525 = "tt.reduce"(%right_idx_524) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_526 = tt.expand_dims %right_idx_525 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_527 = tt.broadcast %right_idx_526 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_528 = tt.reshape %left_idx_523 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_529 = tt.reshape %right_idx_527 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_530 = arith.cmpi slt, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc281) + %eq_531 = arith.cmpi eq, %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc282) + %cond_532 = arith.cmpi sgt, %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc303) + %cond_533 = arith.andi %eq_531, %cond_532 : tensor<32x16xi1> loc(#loc283) + %cond_534 = arith.ori %cond_530, %cond_533 : tensor<32x16xi1> loc(#loc284) + %cond_535 = arith.extui %cond_534 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_536 = arith.xori %cond_535, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_537 = arith.cmpi ne, %cond_536, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_538 = arith.xori %ileft_517, %iright_518 : tensor<32x16xi32> loc(#loc287) + %ret_539 = arith.select %cond_537, %ret_538, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_540 = arith.xori %ret_504, %ret_539 : tensor<32x16xi32> loc(#loc289) + %new_idxs_541 = arith.xori %left_idx_528, %right_idx_529 : tensor<32x16xi32> loc(#loc304) + %new_idxs_542 = arith.select %cond_537, %new_idxs_541, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_543 = arith.xori %new_idxs_507, %new_idxs_542 : tensor<32x16xi32> loc(#loc291) + %y_544 = tt.reshape %ret_540 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_545 = arith.muli %y_544, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_546 = "tt.reduce"(%ileft_545) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_547 = tt.expand_dims %ileft_546 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_548 = tt.broadcast %ileft_547 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_549 = arith.muli %y_544, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_550 = "tt.reduce"(%iright_549) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_551 = tt.expand_dims %iright_550 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_552 = tt.broadcast %iright_551 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_553 = tt.reshape %ileft_548 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_554 = tt.reshape %iright_552 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_555 = tt.reshape %new_idxs_543 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_556 = arith.muli %y_idx_555, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_557 = "tt.reduce"(%left_idx_556) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_558 = tt.expand_dims %left_idx_557 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_559 = tt.broadcast %left_idx_558 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_560 = arith.muli %y_idx_555, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_561 = "tt.reduce"(%right_idx_560) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_562 = tt.expand_dims %right_idx_561 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_563 = tt.broadcast %right_idx_562 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_564 = tt.reshape %left_idx_559 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_565 = tt.reshape %right_idx_563 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_566 = arith.cmpi slt, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc281) + %eq_567 = arith.cmpi eq, %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc282) + %cond_568 = arith.cmpi sgt, %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc303) + %cond_569 = arith.andi %eq_567, %cond_568 : tensor<32x16xi1> loc(#loc283) + %cond_570 = arith.ori %cond_566, %cond_569 : tensor<32x16xi1> loc(#loc284) + %cond_571 = arith.extui %cond_570 : tensor<32x16xi1> to tensor<32x16xi32> loc(#loc285) + %cond_572 = arith.xori %cond_571, %flip_137 : tensor<32x16xi32> loc(#loc285) + %cond_573 = arith.cmpi ne, %cond_572, %cst_3 : tensor<32x16xi32> loc(#loc286) + %ret_574 = arith.xori %ileft_553, %iright_554 : tensor<32x16xi32> loc(#loc287) + %ret_575 = arith.select %cond_573, %ret_574, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_576 = arith.xori %ret_540, %ret_575 : tensor<32x16xi32> loc(#loc289) + %new_idxs_577 = arith.xori %left_idx_564, %right_idx_565 : tensor<32x16xi32> loc(#loc304) + %new_idxs_578 = arith.select %cond_573, %new_idxs_577, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_579 = arith.xori %new_idxs_543, %new_idxs_578 : tensor<32x16xi32> loc(#loc291) + %y_580 = tt.reshape %ret_576 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc270) + %ileft_581 = arith.muli %y_580, %ileft_248 : tensor<32x2x8xi32> loc(#loc271) + %ileft_582 = "tt.reduce"(%ileft_581) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc317) + %ileft_583 = tt.expand_dims %ileft_582 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc273) + %ileft_584 = tt.broadcast %ileft_583 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc274) + %iright_585 = arith.muli %y_580, %flip_136 : tensor<32x2x8xi32> loc(#loc275) + %iright_586 = "tt.reduce"(%iright_585) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc319) + %iright_587 = tt.expand_dims %iright_586 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc277) + %iright_588 = tt.broadcast %iright_587 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc278) + %ileft_589 = tt.reshape %ileft_584 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_590 = tt.reshape %iright_588 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_591 = tt.reshape %new_idxs_579 : tensor<32x16xi32> -> tensor<32x2x8xi32> loc(#loc292) + %left_idx_592 = arith.muli %y_idx_591, %ileft_248 : tensor<32x2x8xi32> loc(#loc293) + %left_idx_593 = "tt.reduce"(%left_idx_592) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc321) + %left_idx_594 = tt.expand_dims %left_idx_593 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc295) + %left_idx_595 = tt.broadcast %left_idx_594 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc296) + %right_idx_596 = arith.muli %y_idx_591, %flip_136 : tensor<32x2x8xi32> loc(#loc297) + %right_idx_597 = "tt.reduce"(%right_idx_596) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<32x2x8xi32>) -> tensor<32x8xi32> loc(#loc323) + %right_idx_598 = tt.expand_dims %right_idx_597 {axis = 1 : i32} : tensor<32x8xi32> -> tensor<32x1x8xi32> loc(#loc299) + %right_idx_599 = tt.broadcast %right_idx_598 : tensor<32x1x8xi32> -> tensor<32x2x8xi32> loc(#loc300) + %left_idx_600 = tt.reshape %left_idx_595 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_601 = tt.reshape %right_idx_599 : tensor<32x2x8xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_602 = arith.cmpi slt, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc281) + %eq_603 = arith.cmpi eq, %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc282) + %cond_604 = arith.cmpi sgt, %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc303) + %cond_605 = arith.andi %eq_603, %cond_604 : tensor<32x16xi1> loc(#loc283) + %cond_606 = arith.ori %cond_602, %cond_605 : tensor<32x16xi1> loc(#loc284) + %ret_607 = arith.xori %ileft_589, %iright_590 : tensor<32x16xi32> loc(#loc287) + %ret_608 = arith.select %cond_606, %ret_607, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_609 = arith.xori %ret_576, %ret_608 : tensor<32x16xi32> loc(#loc289) + %new_idxs_610 = arith.xori %left_idx_600, %right_idx_601 : tensor<32x16xi32> loc(#loc304) + %new_idxs_611 = arith.select %cond_606, %new_idxs_610, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_612 = arith.xori %new_idxs_579, %new_idxs_611 : tensor<32x16xi32> loc(#loc291) + %y_613 = tt.reshape %ret_609 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc270) + %ileft_614 = arith.muli %y_613, %ileft_139 : tensor<64x2x4xi32> loc(#loc271) + %ileft_615 = "tt.reduce"(%ileft_614) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc317) + %ileft_616 = tt.expand_dims %ileft_615 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc273) + %ileft_617 = tt.broadcast %ileft_616 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc274) + %iright_618 = arith.muli %y_613, %flip_61 : tensor<64x2x4xi32> loc(#loc275) + %iright_619 = "tt.reduce"(%iright_618) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc319) + %iright_620 = tt.expand_dims %iright_619 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc277) + %iright_621 = tt.broadcast %iright_620 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc278) + %ileft_622 = tt.reshape %ileft_617 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_623 = tt.reshape %iright_621 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_624 = tt.reshape %new_idxs_612 : tensor<32x16xi32> -> tensor<64x2x4xi32> loc(#loc292) + %left_idx_625 = arith.muli %y_idx_624, %ileft_139 : tensor<64x2x4xi32> loc(#loc293) + %left_idx_626 = "tt.reduce"(%left_idx_625) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc321) + %left_idx_627 = tt.expand_dims %left_idx_626 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc295) + %left_idx_628 = tt.broadcast %left_idx_627 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc296) + %right_idx_629 = arith.muli %y_idx_624, %flip_61 : tensor<64x2x4xi32> loc(#loc297) + %right_idx_630 = "tt.reduce"(%right_idx_629) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<64x2x4xi32>) -> tensor<64x4xi32> loc(#loc323) + %right_idx_631 = tt.expand_dims %right_idx_630 {axis = 1 : i32} : tensor<64x4xi32> -> tensor<64x1x4xi32> loc(#loc299) + %right_idx_632 = tt.broadcast %right_idx_631 : tensor<64x1x4xi32> -> tensor<64x2x4xi32> loc(#loc300) + %left_idx_633 = tt.reshape %left_idx_628 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_634 = tt.reshape %right_idx_632 : tensor<64x2x4xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_635 = arith.cmpi slt, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc281) + %eq_636 = arith.cmpi eq, %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc282) + %cond_637 = arith.cmpi sgt, %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc303) + %cond_638 = arith.andi %eq_636, %cond_637 : tensor<32x16xi1> loc(#loc283) + %cond_639 = arith.ori %cond_635, %cond_638 : tensor<32x16xi1> loc(#loc284) + %ret_640 = arith.xori %ileft_622, %iright_623 : tensor<32x16xi32> loc(#loc287) + %ret_641 = arith.select %cond_639, %ret_640, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_642 = arith.xori %ret_609, %ret_641 : tensor<32x16xi32> loc(#loc289) + %new_idxs_643 = arith.xori %left_idx_633, %right_idx_634 : tensor<32x16xi32> loc(#loc304) + %new_idxs_644 = arith.select %cond_639, %new_idxs_643, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_645 = arith.xori %new_idxs_612, %new_idxs_644 : tensor<32x16xi32> loc(#loc291) + %y_646 = tt.reshape %ret_642 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc270) + %ileft_647 = arith.muli %y_646, %ileft_64 : tensor<128x2x2xi32> loc(#loc271) + %ileft_648 = "tt.reduce"(%ileft_647) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc317) + %ileft_649 = tt.expand_dims %ileft_648 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc273) + %ileft_650 = tt.broadcast %ileft_649 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc274) + %iright_651 = arith.muli %y_646, %flip_24 : tensor<128x2x2xi32> loc(#loc275) + %iright_652 = "tt.reduce"(%iright_651) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc319) + %iright_653 = tt.expand_dims %iright_652 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc277) + %iright_654 = tt.broadcast %iright_653 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc278) + %ileft_655 = tt.reshape %ileft_650 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_656 = tt.reshape %iright_654 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_657 = tt.reshape %new_idxs_645 : tensor<32x16xi32> -> tensor<128x2x2xi32> loc(#loc292) + %left_idx_658 = arith.muli %y_idx_657, %ileft_64 : tensor<128x2x2xi32> loc(#loc293) + %left_idx_659 = "tt.reduce"(%left_idx_658) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc321) + %left_idx_660 = tt.expand_dims %left_idx_659 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc295) + %left_idx_661 = tt.broadcast %left_idx_660 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc296) + %right_idx_662 = arith.muli %y_idx_657, %flip_24 : tensor<128x2x2xi32> loc(#loc297) + %right_idx_663 = "tt.reduce"(%right_idx_662) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<128x2x2xi32>) -> tensor<128x2xi32> loc(#loc323) + %right_idx_664 = tt.expand_dims %right_idx_663 {axis = 1 : i32} : tensor<128x2xi32> -> tensor<128x1x2xi32> loc(#loc299) + %right_idx_665 = tt.broadcast %right_idx_664 : tensor<128x1x2xi32> -> tensor<128x2x2xi32> loc(#loc300) + %left_idx_666 = tt.reshape %left_idx_661 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_667 = tt.reshape %right_idx_665 : tensor<128x2x2xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_668 = arith.cmpi slt, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc281) + %eq_669 = arith.cmpi eq, %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc282) + %cond_670 = arith.cmpi sgt, %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc303) + %cond_671 = arith.andi %eq_669, %cond_670 : tensor<32x16xi1> loc(#loc283) + %cond_672 = arith.ori %cond_668, %cond_671 : tensor<32x16xi1> loc(#loc284) + %ret_673 = arith.xori %ileft_655, %iright_656 : tensor<32x16xi32> loc(#loc287) + %ret_674 = arith.select %cond_672, %ret_673, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc288) + %ret_675 = arith.xori %ret_642, %ret_674 : tensor<32x16xi32> loc(#loc289) + %new_idxs_676 = arith.xori %left_idx_666, %right_idx_667 : tensor<32x16xi32> loc(#loc304) + %new_idxs_677 = arith.select %cond_672, %new_idxs_676, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_678 = arith.xori %new_idxs_645, %new_idxs_677 : tensor<32x16xi32> loc(#loc291) + %y_679 = tt.reshape %ret_675 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc270) + %ileft_680 = arith.muli %y_679, %ileft : tensor<256x2x1xi32> loc(#loc271) + %ileft_681 = "tt.reduce"(%ileft_680) <{axis = 1 : i32}> ({ + ^bb0(%ileft_713: i32 loc(callsite(#loc1 at #loc272)), %ileft_714: i32 loc(callsite(#loc1 at #loc272))): + %ileft_715 = arith.addi %ileft_713, %ileft_714 : i32 loc(#loc329) + tt.reduce.return %ileft_715 : i32 loc(#loc317) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc317) + %ileft_682 = tt.expand_dims %ileft_681 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc273) + %ileft_683 = tt.broadcast %ileft_682 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc274) + %iright_684 = arith.muli %y_679, %iright : tensor<256x2x1xi32> loc(#loc275) + %iright_685 = "tt.reduce"(%iright_684) <{axis = 1 : i32}> ({ + ^bb0(%iright_713: i32 loc(callsite(#loc1 at #loc276)), %iright_714: i32 loc(callsite(#loc1 at #loc276))): + %iright_715 = arith.addi %iright_713, %iright_714 : i32 loc(#loc330) + tt.reduce.return %iright_715 : i32 loc(#loc319) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc319) + %iright_686 = tt.expand_dims %iright_685 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc277) + %iright_687 = tt.broadcast %iright_686 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc278) + %ileft_688 = tt.reshape %ileft_683 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc279) + %iright_689 = tt.reshape %iright_687 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc280) + %y_idx_690 = tt.reshape %new_idxs_678 : tensor<32x16xi32> -> tensor<256x2x1xi32> loc(#loc292) + %left_idx_691 = arith.muli %y_idx_690, %ileft : tensor<256x2x1xi32> loc(#loc293) + %left_idx_692 = "tt.reduce"(%left_idx_691) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_713: i32 loc(callsite(#loc1 at #loc294)), %left_idx_714: i32 loc(callsite(#loc1 at #loc294))): + %left_idx_715 = arith.addi %left_idx_713, %left_idx_714 : i32 loc(#loc331) + tt.reduce.return %left_idx_715 : i32 loc(#loc321) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc321) + %left_idx_693 = tt.expand_dims %left_idx_692 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc295) + %left_idx_694 = tt.broadcast %left_idx_693 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc296) + %right_idx_695 = arith.muli %y_idx_690, %iright : tensor<256x2x1xi32> loc(#loc297) + %right_idx_696 = "tt.reduce"(%right_idx_695) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_713: i32 loc(callsite(#loc1 at #loc298)), %right_idx_714: i32 loc(callsite(#loc1 at #loc298))): + %right_idx_715 = arith.addi %right_idx_713, %right_idx_714 : i32 loc(#loc332) + tt.reduce.return %right_idx_715 : i32 loc(#loc323) + }) : (tensor<256x2x1xi32>) -> tensor<256x1xi32> loc(#loc323) + %right_idx_697 = tt.expand_dims %right_idx_696 {axis = 1 : i32} : tensor<256x1xi32> -> tensor<256x1x1xi32> loc(#loc299) + %right_idx_698 = tt.broadcast %right_idx_697 : tensor<256x1x1xi32> -> tensor<256x2x1xi32> loc(#loc300) + %left_idx_699 = tt.reshape %left_idx_694 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc301) + %right_idx_700 = tt.reshape %right_idx_698 : tensor<256x2x1xi32> -> tensor<32x16xi32> loc(#loc302) + %cond_701 = arith.cmpi slt, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc281) + %eq_702 = arith.cmpi eq, %ileft_688, %iright_689 : tensor<32x16xi32> loc(#loc282) + %cond_703 = arith.cmpi sgt, %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc303) + %cond_704 = arith.andi %eq_702, %cond_703 : tensor<32x16xi1> loc(#loc283) + %cond_705 = arith.ori %cond_701, %cond_704 : tensor<32x16xi1> loc(#loc284) + %new_idxs_706 = arith.xori %left_idx_699, %right_idx_700 : tensor<32x16xi32> loc(#loc304) + %new_idxs_707 = arith.select %cond_705, %new_idxs_706, %cst_3 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc290) + %new_idxs_708 = arith.xori %new_idxs_678, %new_idxs_707 : tensor<32x16xi32> loc(#loc291) + %tmp20 = arith.extui %tmp5 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc226) + %tmp23 = arith.select %tmp0_20, %tmp20, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc197) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_713: i64 loc(callsite(#loc1 at #loc198)), %tmp24_714: i64 loc(callsite(#loc1 at #loc198))): + %tmp24_715 = arith.addi %tmp24_713, %tmp24_714 : i64 loc(#loc305) + tt.reduce.return %tmp24_715 : i64 loc(#loc227) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc227) + %tmp24_709 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc199) + %tmp25 = arith.extui %tmp14 : tensor<32x16xi1> to tensor<32x16xi64> loc(#loc229) + %tmp28 = arith.select %tmp0_20, %tmp25, %cst_7 : tensor<32x16xi1>, tensor<32x16xi64> loc(#loc201) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_713: i64 loc(callsite(#loc1 at #loc202)), %tmp29_714: i64 loc(callsite(#loc1 at #loc202))): + %tmp29_715 = arith.addi %tmp29_713, %tmp29_714 : i64 loc(#loc306) + tt.reduce.return %tmp29_715 : i64 loc(#loc230) + }) : (tensor<32x16xi64>) -> tensor<32xi64> loc(#loc230) + %tmp29_710 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<32xi64> -> tensor<32x1xi64> loc(#loc203) + %tmp30 = arith.trunci %tmp24_709 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc204) + %tmp31 = arith.trunci %tmp29_710 : tensor<32x1xi64> to tensor<32x1xi32> loc(#loc205) + %tmp34 = tt.broadcast %tmp30 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc206) + %tmp34_711 = arith.cmpi slt, %tmp0_15, %tmp34 : tensor<32x16xi32> loc(#loc206) + %tmp36 = arith.select %tmp34_711, %new_idxs_376, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc207) + %tmp38 = arith.addi %tmp36, %cst_4 : tensor<32x16xi32> loc(#loc208) + %tmp39 = arith.cmpi slt, %tmp36, %cst_3 : tensor<32x16xi32> loc(#loc209) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc210) + %0 = arith.cmpi sge, %tmp40, %cst_3 : tensor<32x16xi32> loc(#loc88) + %1 = arith.cmpi slt, %tmp40, %cst_4 : tensor<32x16xi32> loc(#loc89) + %2 = arith.andi %0, %1 : tensor<32x16xi1> loc(#loc90) + %3 = arith.xori %xmask_13, %cst_2 : tensor<32x1xi1> loc(#loc91) + %4 = tt.broadcast %3 : tensor<32x1xi1> -> tensor<32x16xi1> loc(#loc92) + %5 = arith.ori %2, %4 : tensor<32x16xi1> loc(#loc92) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<32x16xi1> loc(#loc93) + %tmp45 = tt.broadcast %tmp31 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc211) + %tmp45_712 = arith.cmpi slt, %tmp0_15, %tmp45 : tensor<32x16xi32> loc(#loc211) + %tmp46 = arith.select %tmp45_712, %new_idxs_708, %cst_5 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc212) + %tmp47 = arith.addi %tmp46, %cst_4 : tensor<32x16xi32> loc(#loc213) + %tmp48 = arith.cmpi slt, %tmp46, %cst_3 : tensor<32x16xi32> loc(#loc214) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<32x16xi1>, tensor<32x16xi32> loc(#loc215) + %6 = arith.cmpi sge, %tmp49, %cst_3 : tensor<32x16xi32> loc(#loc99) + %7 = arith.cmpi slt, %tmp49, %cst_4 : tensor<32x16xi32> loc(#loc100) + %8 = arith.andi %6, %7 : tensor<32x16xi1> loc(#loc101) + %9 = arith.ori %8, %4 : tensor<32x16xi1> loc(#loc102) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<32x16xi1> loc(#loc103) + %10 = tt.splat %out_ptr4 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc104) + %11 = tt.addptr %10, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc104) + tt.store %11, %tmp30, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc105) + %12 = tt.splat %out_ptr5 : !tt.ptr -> tensor<32x1x!tt.ptr> loc(#loc106) + %13 = tt.addptr %12, %xindex_12 : tensor<32x1x!tt.ptr>, tensor<32x1xi32> loc(#loc106) + tt.store %13, %tmp31, %xmask_13 : tensor<32x1x!tt.ptr> loc(#loc107) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc108) + %15 = tt.addptr %14, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc108) + tt.store %15, %new_idxs_376, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc109) + %16 = arith.muli %xindex_12, %cst_1 : tensor<32x1xi32> loc(#loc110) + %17 = tt.broadcast %16 : tensor<32x1xi32> -> tensor<32x16xi32> loc(#loc111) + %18 = arith.addi %tmp40, %17 : tensor<32x16xi32> loc(#loc111) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc112) + %20 = tt.addptr %19, %18 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc112) + tt.store %20, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc113) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc114) + %22 = tt.addptr %21, %tmp0_17 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc114) + tt.store %22, %new_idxs_708, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc115) + %23 = arith.addi %tmp49, %17 : tensor<32x16xi32> loc(#loc116) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<32x16x!tt.ptr> loc(#loc117) + %25 = tt.addptr %24, %23 : tensor<32x16x!tt.ptr>, tensor<32x16xi32> loc(#loc117) + tt.store %25, %cst_0, %tmp0_20 : tensor<32x16x!tt.ptr> loc(#loc118) + tt.return loc(#loc119) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":27:28) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":27:38) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:40) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:37) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:30) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":34:45) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":36:18) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":38:18) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":39:18) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":41:19) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":40:19) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":43:19) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":45:34) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc68 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":47:20) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":49:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":48:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":52:20) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":54:35) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":55:29) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":56:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":58:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":59:29) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":60:21) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":61:21) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":64:19) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":66:35) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":68:20) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":69:20) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":70:35) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:28) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:46) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:38) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:55) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:53) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":71:63) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":75:19) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":76:35) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":77:20) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":78:20) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":79:35) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:28) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:46) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:38) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:53) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":80:63) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:25) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":81:37) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:25) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":82:37) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:25) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":83:47) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:52) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":84:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:25) +#loc115 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":85:47) +#loc116 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:49) +#loc117 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:25) +#loc118 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:85) +#loc119 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/6q/c6qvewydivg4bvc4qyzfgybxa6usm4f7lbirtgr37nhmlwfed6p5.py":86:4) +#loc129 = loc("xmask"(#loc2)) +#loc130 = loc("xoffset"(#loc3)) +#loc131 = loc("xoffset"(#loc4)) +#loc132 = loc("xindex"(#loc5)) +#loc133 = loc("xindex"(#loc6)) +#loc134 = loc("xindex"(#loc7)) +#loc135 = loc("r0_index"(#loc8)) +#loc136 = loc("r0_index"(#loc9)) +#loc137 = loc("tmp0"(#loc10)) +#loc138 = loc("tmp0"(#loc11)) +#loc139 = loc("tmp0"(#loc12)) +#loc140 = loc("tmp0"(#loc13)) +#loc141 = loc("tmp2"(#loc14)) +#loc142 = loc("tmp4"(#loc15)) +#loc143 = loc("tmp5"(#loc16)) +#loc144 = loc("tmp7"(#loc17)) +#loc145 = loc("tmp6"(#loc18)) +#loc146 = loc("tmp9"(#loc19)) +#loc147 = loc("tmp11"(#loc20)) +#loc148 = loc("flip"(#loc21)) +#loc150 = loc("flip"(#loc24)) +#loc151 = loc("flip"(#loc25)) +#loc152 = loc("flip"(#loc26)) +#loc153 = loc("y"(#loc27)) +#loc154 = loc("left_mask"(#loc29)) +#loc155 = loc("ileft"(#loc30)) +#loc157 = loc("ileft"(#loc34)) +#loc158 = loc("ileft"(#loc35)) +#loc159 = loc("iright"(#loc36)) +#loc161 = loc("iright"(#loc38)) +#loc162 = loc("iright"(#loc39)) +#loc163 = loc("ileft"(#loc40)) +#loc164 = loc("iright"(#loc41)) +#loc165 = loc("y_idx"(#loc42)) +#loc166 = loc("left_idx"(#loc43)) +#loc167 = loc("left_idx"(#loc44)) +#loc168 = loc("input"(#loc45)) +#loc170 = loc("left_idx"(#loc47)) +#loc171 = loc("left_idx"(#loc48)) +#loc172 = loc("right_idx"(#loc49)) +#loc173 = loc("right_idx"(#loc50)) +#loc175 = loc("right_idx"(#loc52)) +#loc176 = loc("right_idx"(#loc53)) +#loc177 = loc("left_idx"(#loc54)) +#loc178 = loc("right_idx"(#loc55)) +#loc179 = loc("cond"(#loc56)) +#loc180 = loc("eq"(#loc57)) +#loc181 = loc("cond"(#loc58)) +#loc182 = loc("cond"(#loc59)) +#loc183 = loc("cond"(#loc60)) +#loc184 = loc("cond"(#loc61)) +#loc185 = loc("cond"(#loc62)) +#loc186 = loc("ret"(#loc63)) +#loc187 = loc("ret"(#loc64)) +#loc188 = loc("ret"(#loc65)) +#loc189 = loc("new_idxs"(#loc66)) +#loc190 = loc("new_idxs"(#loc67)) +#loc191 = loc("new_idxs"(#loc68)) +#loc192 = loc("tmp14"(#loc69)) +#loc193 = loc("tmp16"(#loc70)) +#loc194 = loc("tmp15"(#loc71)) +#loc196 = loc("tmp20"(#loc73)) +#loc197 = loc("tmp23"(#loc74)) +#loc199 = loc("tmp24"(#loc76)) +#loc200 = loc("tmp25"(#loc77)) +#loc201 = loc("tmp28"(#loc78)) +#loc203 = loc("tmp29"(#loc80)) +#loc204 = loc("tmp30"(#loc81)) +#loc205 = loc("tmp31"(#loc82)) +#loc206 = loc("tmp34"(#loc83)) +#loc207 = loc("tmp36"(#loc84)) +#loc208 = loc("tmp38"(#loc85)) +#loc209 = loc("tmp39"(#loc86)) +#loc210 = loc("tmp40"(#loc87)) +#loc211 = loc("tmp45"(#loc94)) +#loc212 = loc("tmp46"(#loc95)) +#loc213 = loc("tmp47"(#loc96)) +#loc214 = loc("tmp48"(#loc97)) +#loc215 = loc("tmp49"(#loc98)) +#loc216 = loc(fused[#loc144, #loc145]) +#loc217 = loc(callsite(#loc148 at #loc149)) +#loc218 = loc(callsite(#loc150 at #loc149)) +#loc219 = loc(callsite(#loc151 at #loc149)) +#loc220 = loc(callsite(#loc152 at #loc149)) +#loc222 = loc("cond"(#loc179)) +#loc223 = loc("eq"(#loc180)) +#loc224 = loc(fused[#loc193, #loc194]) +#loc226 = loc(fused[#loc196, #loc144, #loc145]) +#loc227 = loc(callsite(#loc31 at #loc198)) +#loc229 = loc(fused[#loc200, #loc193, #loc194]) +#loc230 = loc(callsite(#loc31 at #loc202)) +#loc232 = loc(callsite(#loc153 at #loc221)) +#loc233 = loc(callsite(#loc154 at #loc221)) +#loc234 = loc(callsite(#loc155 at #loc221)) +#loc236 = loc(callsite(#loc157 at #loc221)) +#loc237 = loc(callsite(#loc158 at #loc221)) +#loc238 = loc(callsite(#loc159 at #loc221)) +#loc240 = loc(callsite(#loc161 at #loc221)) +#loc241 = loc(callsite(#loc162 at #loc221)) +#loc242 = loc(callsite(#loc163 at #loc221)) +#loc243 = loc(callsite(#loc164 at #loc221)) +#loc244 = loc(callsite(#loc165 at #loc221)) +#loc245 = loc(callsite(#loc166 at #loc221)) +#loc246 = loc(callsite(#loc167 at #loc221)) +#loc248 = loc(callsite(#loc170 at #loc221)) +#loc249 = loc(callsite(#loc171 at #loc221)) +#loc250 = loc(callsite(#loc172 at #loc221)) +#loc251 = loc(callsite(#loc173 at #loc221)) +#loc253 = loc(callsite(#loc175 at #loc221)) +#loc254 = loc(callsite(#loc176 at #loc221)) +#loc255 = loc(callsite(#loc177 at #loc221)) +#loc256 = loc(callsite(#loc178 at #loc221)) +#loc257 = loc(callsite(#loc222 at #loc221)) +#loc258 = loc(callsite(#loc223 at #loc221)) +#loc259 = loc(callsite(#loc181 at #loc221)) +#loc260 = loc(callsite(#loc182 at #loc221)) +#loc261 = loc(callsite(#loc183 at #loc221)) +#loc262 = loc(callsite(#loc184 at #loc221)) +#loc263 = loc(callsite(#loc185 at #loc221)) +#loc264 = loc(callsite(#loc186 at #loc221)) +#loc265 = loc(callsite(#loc187 at #loc221)) +#loc266 = loc(callsite(#loc188 at #loc221)) +#loc267 = loc(callsite(#loc189 at #loc221)) +#loc268 = loc(callsite(#loc190 at #loc221)) +#loc269 = loc(callsite(#loc191 at #loc221)) +#loc270 = loc(callsite(#loc153 at #loc225)) +#loc271 = loc(callsite(#loc155 at #loc225)) +#loc273 = loc(callsite(#loc157 at #loc225)) +#loc274 = loc(callsite(#loc158 at #loc225)) +#loc275 = loc(callsite(#loc159 at #loc225)) +#loc277 = loc(callsite(#loc161 at #loc225)) +#loc278 = loc(callsite(#loc162 at #loc225)) +#loc279 = loc(callsite(#loc163 at #loc225)) +#loc280 = loc(callsite(#loc164 at #loc225)) +#loc281 = loc(callsite(#loc222 at #loc225)) +#loc282 = loc(callsite(#loc223 at #loc225)) +#loc283 = loc(callsite(#loc182 at #loc225)) +#loc284 = loc(callsite(#loc183 at #loc225)) +#loc285 = loc(callsite(#loc184 at #loc225)) +#loc286 = loc(callsite(#loc185 at #loc225)) +#loc287 = loc(callsite(#loc186 at #loc225)) +#loc288 = loc(callsite(#loc187 at #loc225)) +#loc289 = loc(callsite(#loc188 at #loc225)) +#loc290 = loc(callsite(#loc190 at #loc225)) +#loc291 = loc(callsite(#loc191 at #loc225)) +#loc292 = loc(callsite(#loc165 at #loc225)) +#loc293 = loc(callsite(#loc167 at #loc225)) +#loc295 = loc(callsite(#loc170 at #loc225)) +#loc296 = loc(callsite(#loc171 at #loc225)) +#loc297 = loc(callsite(#loc173 at #loc225)) +#loc299 = loc(callsite(#loc175 at #loc225)) +#loc300 = loc(callsite(#loc176 at #loc225)) +#loc301 = loc(callsite(#loc177 at #loc225)) +#loc302 = loc(callsite(#loc178 at #loc225)) +#loc303 = loc(callsite(#loc181 at #loc225)) +#loc304 = loc(callsite(#loc189 at #loc225)) +#loc305 = loc(callsite(#loc33 at #loc227)) +#loc306 = loc(callsite(#loc33 at #loc230)) +#loc307 = loc(callsite(#loc31 at #loc235)) +#loc309 = loc(callsite(#loc31 at #loc239)) +#loc311 = loc(callsite(#loc168 at #loc247)) +#loc312 = loc(callsite(#loc31 at #loc247)) +#loc314 = loc(callsite(#loc168 at #loc252)) +#loc315 = loc(callsite(#loc31 at #loc252)) +#loc317 = loc(callsite(#loc31 at #loc272)) +#loc319 = loc(callsite(#loc31 at #loc276)) +#loc321 = loc(callsite(#loc31 at #loc294)) +#loc323 = loc(callsite(#loc31 at #loc298)) +#loc325 = loc(callsite(#loc33 at #loc307)) +#loc326 = loc(callsite(#loc33 at #loc309)) +#loc327 = loc(callsite(#loc33 at #loc312)) +#loc328 = loc(callsite(#loc33 at #loc315)) +#loc329 = loc(callsite(#loc33 at #loc317)) +#loc330 = loc(callsite(#loc33 at #loc319)) +#loc331 = loc(callsite(#loc33 at #loc321)) +#loc332 = loc(callsite(#loc33 at #loc323)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..54cd45eae62c1ee84979f636dd77a728daf33105 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/FM6CWX5NDNUSSOQWH3EGE2NL4IM53NMNXM4NI2FUVI2663Y6EMSQ/triton_red_fused_zeros_0.ttgir @@ -0,0 +1,233 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":18:0) +#loc1 = loc(unknown) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":45:25) +#loc58 = loc("in_ptr0"(#loc)) +#loc59 = loc("in_ptr1"(#loc)) +#loc60 = loc("out_ptr1"(#loc)) +#loc61 = loc("ks0"(#loc)) +#loc62 = loc("ks1"(#loc)) +#loc63 = loc("xnumel"(#loc)) +#loc64 = loc("r0_numel"(#loc)) +#loc109 = loc("tmp4"(#loc52)) +#loc120 = loc(callsite(#loc1 at #loc109)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_zeros_0(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 {tt.divisibility = 16 : i32} loc("ks1"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<32> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<128> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<4096> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> loc(#loc1) + %cst_3 = arith.constant dense<0> : tensor<64x1xi32, #blocked> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<64x1xi64, #blocked> loc(#loc1) + %c0_i64 = arith.constant 0 : i64 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked> loc(#loc1) + %c4_i32 = arith.constant 4 : i32 loc(#loc1) + %c128_i32 = arith.constant 128 : i32 loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c4096_i64 = arith.constant 4096 : i64 loc(#loc1) + %cst_6 = arith.constant dense<128> : tensor<1x4xi32, #blocked> loc(#loc1) + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> loc(#loc1) + %c64_i32 = arith.constant 64 : i32 loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc65) + %xoffset_8 = arith.muli %xoffset, %c64_i32 : i32 loc(#loc66) + %xindex = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc67) + %xindex_9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc67) + %xindex_10 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> loc(#loc67) + %xindex_11 = tt.expand_dims %xindex_9 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> loc(#loc67) + %xindex_12 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_13 = tt.splat %xoffset_8 : i32 -> tensor<64x1xi32, #blocked1> loc(#loc68) + %xindex_14 = arith.addi %xindex_12, %xindex_10 : tensor<64x1xi32, #blocked> loc(#loc68) + %xindex_15 = arith.addi %xindex_13, %xindex_11 : tensor<64x1xi32, #blocked1> loc(#loc68) + %xmask = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_16 = tt.splat %xnumel : i32 -> tensor<64x1xi32, #blocked1> loc(#loc69) + %xmask_17 = arith.cmpi slt, %xindex_14, %xmask : tensor<64x1xi32, #blocked> loc(#loc69) + %xmask_18 = arith.cmpi slt, %xindex_15, %xmask_16 : tensor<64x1xi32, #blocked1> loc(#loc69) + %r0_base = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc70) + %r0_base_19 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x4xi32, #blocked> loc(#loc70) + %x0 = arith.extsi %xindex_14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> loc(#loc71) + %x0_20 = tt.splat %ks0 : i64 -> tensor<64x1xi64, #blocked> loc(#loc71) + %x0_21 = arith.remsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc71) + %x1 = arith.divsi %x0, %x0_20 : tensor<64x1xi64, #blocked> loc(#loc72) + %x1_22 = arith.remsi %x1, %cst : tensor<64x1xi64, #blocked> loc(#loc73) + %x2 = tt.splat %ks1 : i64 -> tensor<64x1xi64, #blocked> loc(#loc74) + %x2_23 = arith.divsi %x0, %x2 : tensor<64x1xi64, #blocked> loc(#loc74) + %fixed = arith.cmpi ne, %x0_21, %cst_2 : tensor<64x1xi64, #blocked> loc(#loc111) + %fixed_24 = arith.subi %x1, %cst_4 : tensor<64x1xi64, #blocked> loc(#loc112) + %fixed_25 = arith.select %fixed, %fixed_24, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc113) + %x5 = arith.cmpi slt, %xindex_14, %cst_3 : tensor<64x1xi32, #blocked> loc(#loc114) + %x5_26 = arith.cmpi slt, %ks0, %c0_i64 : i64 loc(#loc115) + %x5_27 = tt.splat %x5_26 : i1 -> tensor<64x1xi1, #blocked> loc(#loc116) + %x5_28 = arith.cmpi ne, %x5, %x5_27 : tensor<64x1xi1, #blocked> loc(#loc116) + %x5_29 = arith.select %x5_28, %fixed_25, %x1 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> loc(#loc117) + %tmp0 = arith.muli %x1_22, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc79) + %tmp0_30 = tt.broadcast %tmp0 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_31 = arith.muli %x0_21, %cst_1 : tensor<64x1xi64, #blocked> loc(#loc81) + %tmp0_32 = tt.broadcast %tmp0_31 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc82) + %tmp0_33 = arith.muli %ks0, %c4096_i64 : i64 loc(#loc83) + %tmp0_34 = tt.splat %tmp0_33 : i64 -> tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_35 = arith.muli %tmp0_34, %x2_23 : tensor<64x1xi64, #blocked> loc(#loc84) + %tmp0_36 = tt.broadcast %tmp0_35 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc85) + %tmp0_37 = tt.splat %in_ptr0 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc86) + %tmp0_38 = tt.broadcast %xmask_17 : tensor<64x1xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc87) + %tmp1 = arith.muli %x0_21, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc88) + %tmp1_39 = tt.broadcast %tmp1 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc89) + %tmp1_40 = arith.muli %x5_29, %cst_0 : tensor<64x1xi64, #blocked> loc(#loc90) + %tmp1_41 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc91) + %tmp1_42 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc92) + %tmp1_43 = arith.extui %tmp1_42 : i1 to i64 loc(#loc93) + %tmp1_44 = arith.muli %ks0, %tmp1_43 : i64 loc(#loc93) + %tmp1_45 = arith.extui %tmp1_41 : i1 to i64 loc(#loc118) + %tmp1_46 = arith.addi %tmp1_45, %tmp1_44 : i64 loc(#loc94) + %tmp1_47 = tt.splat %tmp1_46 : i64 -> tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_48 = arith.muli %tmp1_40, %tmp1_47 : tensor<64x1xi64, #blocked> loc(#loc96) + %tmp1_49 = tt.broadcast %tmp1_48 : tensor<64x1xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc97) + %tmp1_50 = tt.splat %in_ptr1 : !tt.ptr -> tensor<64x4x!tt.ptr, #blocked> loc(#loc98) + %_tmp4 = scf.for %r0_offset = %c0_i32 to %c128_i32 step %c4_i32 iter_args(%_tmp4_53 = %cst_7) -> (tensor<64x4xf32, #blocked>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x4xi32, #blocked> loc(#loc100) + %r0_index_54 = arith.addi %r0_index, %r0_base_19 : tensor<1x4xi32, #blocked> loc(#loc100) + %r0_mask = arith.cmpi slt, %r0_index_54, %cst_6 : tensor<1x4xi32, #blocked> loc(#loc101) + %tmp0_55 = arith.extsi %r0_index_54 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> loc(#loc80) + %tmp0_56 = tt.broadcast %tmp0_55 : tensor<1x4xi64, #blocked> -> tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_57 = arith.addi %tmp0_56, %tmp0_30 : tensor<64x4xi64, #blocked> loc(#loc80) + %tmp0_58 = arith.addi %tmp0_57, %tmp0_32 : tensor<64x4xi64, #blocked> loc(#loc82) + %tmp0_59 = arith.addi %tmp0_58, %tmp0_36 : tensor<64x4xi64, #blocked> loc(#loc85) + %tmp0_60 = tt.addptr %tmp0_37, %tmp0_59 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> loc(#loc86) + %tmp0_61 = tt.broadcast %r0_mask : tensor<1x4xi1, #blocked> -> tensor<64x4xi1, #blocked> loc(#loc87) + %tmp0_62 = arith.andi %tmp0_61, %tmp0_38 : tensor<64x4xi1, #blocked> loc(#loc87) + %tmp0_63 = tt.load %tmp0_60, %tmp0_62, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc102) + %tmp0_64 = arith.extf %tmp0_63 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc103) + %tmp1_65 = arith.addi %tmp0_56, %tmp1_39 : tensor<64x4xi64, #blocked> loc(#loc89) + %tmp1_66 = arith.addi %tmp1_65, %tmp1_49 : tensor<64x4xi64, #blocked> loc(#loc97) + %tmp1_67 = tt.addptr %tmp1_50, %tmp1_66 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> loc(#loc98) + %tmp1_68 = tt.load %tmp1_67, %tmp0_62, %cst_5 evictionPolicy = evict_first : tensor<64x4x!tt.ptr, #blocked> loc(#loc104) + %tmp1_69 = arith.extf %tmp1_68 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked> loc(#loc105) + %tmp2 = arith.mulf %tmp0_64, %tmp1_69 : tensor<64x4xf32, #blocked> loc(#loc106) + %tmp5 = arith.addf %_tmp4_53, %tmp2 : tensor<64x4xf32, #blocked> loc(#loc107) + %_tmp4_70 = arith.select %tmp0_62, %tmp5, %_tmp4_53 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> loc(#loc108) + scf.yield %_tmp4_70 : tensor<64x4xf32, #blocked> loc(#loc50) + } loc(#loc99) + %tmp4 = "tt.reduce"(%_tmp4) <{axis = 1 : i32}> ({ + ^bb0(%tmp4_53: f32 loc(callsite(#loc1 at #loc109)), %tmp4_54: f32 loc(callsite(#loc1 at #loc109))): + %tmp4_55 = arith.addf %tmp4_53, %tmp4_54 : f32 loc(#loc121) + tt.reduce.return %tmp4_55 : f32 loc(#loc119) + }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc119) + %tmp4_51 = ttg.convert_layout %tmp4 : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc110) + %tmp4_52 = tt.expand_dims %tmp4_51 {axis = 1 : i32} : tensor<64xf32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xf32, #blocked1> loc(#loc110) + %0 = tt.splat %out_ptr1 : !tt.ptr -> tensor<64x1x!tt.ptr, #blocked1> loc(#loc55) + %1 = tt.addptr %0, %xindex_15 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> loc(#loc55) + tt.store %1, %tmp4_52, %xmask_18 : tensor<64x1x!tt.ptr, #blocked1> loc(#loc56) + tt.return loc(#loc57) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":23:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":23:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":24:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":25:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":27:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":28:21) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":28:28) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":29:19) +#loc12 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":30:51) +#loc14 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:44) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":74:47) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:25) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:36) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:32) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":75:47) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:45) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:41) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:55) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:50) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:65) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:69) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:60) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:34) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:84) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:45) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:41) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:54) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:73) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:99) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:90) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:81) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:65) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:58) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:50) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:34) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":33:40) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":34:31) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":35:29) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:74) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":39:136) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:106) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":40:168) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":41:22) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":43:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":44:48) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":44:8) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":45:28) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":49:25) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":49:36) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/bc/cbcjmmdd4rwzphcjmnxlmi32as4rv3cdhd37zaajsactabdidubh.py":49:4) +#loc65 = loc("xoffset"(#loc2)) +#loc66 = loc("xoffset"(#loc3)) +#loc67 = loc("xindex"(#loc4)) +#loc68 = loc("xindex"(#loc5)) +#loc69 = loc("xmask"(#loc6)) +#loc70 = loc("r0_base"(#loc7)) +#loc71 = loc("x0"(#loc8)) +#loc72 = loc("x1"(#loc9)) +#loc73 = loc("x1"(#loc10)) +#loc74 = loc("x2"(#loc11)) +#loc75 = loc("fixed"(#loc12)) +#loc76 = loc("x5"(#loc13)) +#loc77 = loc("fixed"(#loc14)) +#loc78 = loc("fixed"(#loc15)) +#loc79 = loc("tmp0"(#loc20)) +#loc80 = loc("tmp0"(#loc21)) +#loc81 = loc("tmp0"(#loc22)) +#loc82 = loc("tmp0"(#loc23)) +#loc83 = loc("tmp0"(#loc24)) +#loc84 = loc("tmp0"(#loc25)) +#loc85 = loc("tmp0"(#loc26)) +#loc86 = loc("tmp0"(#loc27)) +#loc87 = loc("tmp0"(#loc28)) +#loc88 = loc("tmp1"(#loc29)) +#loc89 = loc("tmp1"(#loc30)) +#loc90 = loc("tmp1"(#loc31)) +#loc91 = loc("tmp1"(#loc32)) +#loc92 = loc("tmp1"(#loc33)) +#loc93 = loc("tmp1"(#loc34)) +#loc94 = loc("tmp1"(#loc35)) +#loc95 = loc("tmp1"(#loc36)) +#loc96 = loc("tmp1"(#loc37)) +#loc97 = loc("tmp1"(#loc38)) +#loc98 = loc("tmp1"(#loc39)) +#loc99 = loc("_tmp4"(#loc40)) +#loc100 = loc("r0_index"(#loc41)) +#loc101 = loc("r0_mask"(#loc42)) +#loc102 = loc("tmp0"(#loc43)) +#loc103 = loc("tmp0"(#loc44)) +#loc104 = loc("tmp1"(#loc45)) +#loc105 = loc("tmp1"(#loc46)) +#loc106 = loc("tmp2"(#loc47)) +#loc107 = loc("tmp5"(#loc48)) +#loc108 = loc("_tmp4"(#loc49)) +#loc110 = loc("tmp4"(#loc54)) +#loc111 = loc(callsite(#loc75 at #loc76)) +#loc112 = loc(callsite(#loc77 at #loc76)) +#loc113 = loc(callsite(#loc78 at #loc76)) +#loc114 = loc(callsite(#loc16 at #loc76)) +#loc115 = loc(callsite(#loc17 at #loc76)) +#loc116 = loc(callsite(#loc18 at #loc76)) +#loc117 = loc(callsite(#loc19 at #loc76)) +#loc118 = loc(fused[#loc94, #loc95]) +#loc119 = loc(callsite(#loc51 at #loc109)) +#loc121 = loc(callsite(#loc53 at #loc119)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..84e4cde881f9f8e54244d069d0227e1b78e7afb7 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir new file mode 100644 index 0000000000000000000000000000000000000000..06b0db79257fcf646a8c41d1594041799c10a7a8 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.llir @@ -0,0 +1,206 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused__to_copy_clone_slice_sum_transpose_5(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2, i64 %3, i32 %4, i32 %5, ptr addrspace(1) readnone captures(none) %6, ptr addrspace(1) readnone captures(none) %7) local_unnamed_addr #0 !dbg !4 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %10 = shl i32 %9, 3, !dbg !8 + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %12 = and i32 %11, 7, !dbg !9 + %13 = or disjoint i32 %10, %12, !dbg !10 + %14 = icmp slt i32 %13, %4, !dbg !11 + %.fr = freeze i1 %14 + %15 = lshr i32 %11, 3, !dbg !12 + %16 = and i32 %15, 7, !dbg !12 + %17 = or disjoint i32 %16, 8, !dbg !12 + %18 = sext i32 %13 to i64, !dbg !13 + %.frozen = freeze i64 %2, !dbg !14 + %19 = sdiv i64 %18, %.frozen, !dbg !14 + %20 = mul i64 %19, %.frozen, !dbg !13 + %.decomposed = sub i64 %18, %20, !dbg !13 + %21 = icmp sgt i32 %5, 0, !dbg !15 + br i1 %21, label %.lr.ph, label %._crit_edge, !dbg !15 + +.lr.ph: ; preds = %8 + %22 = mul i64 %3, %2, !dbg !16 + %23 = mul i64 %22, %19, !dbg !17 + %24 = getelementptr i32, ptr addrspace(1) %0, i64 %.decomposed + %invariant.gep = getelementptr i32, ptr addrspace(1) %24, i64 %23, !dbg !15 + br i1 %.fr, label %.lr.ph.split, label %.lr.ph.split.us + +.lr.ph.split.us: ; preds = %.lr.ph, %.lr.ph.split.us + %25 = phi i32 [ %36, %.lr.ph.split.us ], [ 0, %.lr.ph ] + %26 = or disjoint i32 %25, %16, !dbg !18 + %27 = or disjoint i32 %17, %25, !dbg !18 + %28 = sext i32 %26 to i64, !dbg !19 + %29 = sext i32 %27 to i64, !dbg !19 + %30 = mul i64 %2, %28, !dbg !19 + %31 = mul i64 %2, %29, !dbg !19 + %gep.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %30, !dbg !20 + %gep5.us = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %31, !dbg !20 + %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %33 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep.us, i64 %32, i1 false) #4, !dbg !21 + %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %35 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep5.us, i64 %34, i1 false) #4, !dbg !21 + %36 = add i32 %25, 16, !dbg !15 + %37 = icmp slt i32 %36, %5, !dbg !15 + br i1 %37, label %.lr.ph.split.us, label %._crit_edge, !dbg !15 + +.lr.ph.split: ; preds = %.lr.ph, %.lr.ph.split + %38 = phi i64 [ %53, %.lr.ph.split ], [ 0, %.lr.ph ] + %39 = phi i64 [ %55, %.lr.ph.split ], [ 0, %.lr.ph ] + %40 = phi i32 [ %56, %.lr.ph.split ], [ 0, %.lr.ph ] + %41 = or disjoint i32 %40, %16, !dbg !18 + %42 = or disjoint i32 %17, %40, !dbg !18 + %43 = icmp slt i32 %41, %5, !dbg !22 + %44 = icmp slt i32 %42, %5, !dbg !22 + %45 = sext i32 %41 to i64, !dbg !19 + %46 = sext i32 %42 to i64, !dbg !19 + %47 = mul i64 %2, %45, !dbg !19 + %48 = mul i64 %2, %46, !dbg !19 + %gep = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %47, !dbg !20 + %gep5 = getelementptr i32, ptr addrspace(1) %invariant.gep, i64 %48, !dbg !20 + %49 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep, i64 %49, i1 %43) #4, !dbg !21 + %51 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #4, !dbg !21 + %52 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b32 { $0 }, [ $1 + 0 ], $2;", "=r,l,l,b"(ptr addrspace(1) %gep5, i64 %51, i1 %44) #4, !dbg !21 + %narrow = select i1 %43, i32 %50, i32 0, !dbg !23 + %spec.select = sext i32 %narrow to i64, !dbg !23 + %53 = add i64 %38, %spec.select, !dbg !23 + %narrow6 = select i1 %44, i32 %52, i32 0, !dbg !23 + %54 = sext i32 %narrow6 to i64, !dbg !23 + %55 = add i64 %39, %54, !dbg !23 + %56 = add i32 %40, 16, !dbg !15 + %57 = icmp slt i32 %56, %5, !dbg !15 + br i1 %57, label %.lr.ph.split, label %._crit_edge, !dbg !15 + +._crit_edge: ; preds = %.lr.ph.split.us, %.lr.ph.split, %8 + %58 = phi i64 [ 0, %8 ], [ %53, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %59 = phi i64 [ 0, %8 ], [ %55, %.lr.ph.split ], [ 0, %.lr.ph.split.us ], !dbg !24 + %60 = and i32 %11, 24, !dbg !9 + %61 = lshr i32 %11, 5, !dbg !9 + %62 = add i64 %58, %59, !dbg !25 + %extelt.offset = lshr i64 %62, 32, !dbg !29 + %63 = trunc nuw i64 %extelt.offset to i32, !dbg !29 + %64 = trunc i64 %62 to i32, !dbg !29 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 16, i32 31), !dbg !29 + %66 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 16, i32 31), !dbg !29 + %67 = insertelement <2 x i32> poison, i32 %65, i64 0, !dbg !29 + %68 = insertelement <2 x i32> %67, i32 %66, i64 1, !dbg !29 + %69 = bitcast <2 x i32> %68 to i64, !dbg !29 + %70 = add i64 %62, %69, !dbg !25 + %extelt.offset2 = lshr i64 %70, 32, !dbg !29 + %71 = trunc nuw i64 %extelt.offset2 to i32, !dbg !29 + %72 = trunc i64 %70 to i32, !dbg !29 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 8, i32 31), !dbg !29 + %74 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !29 + %75 = insertelement <2 x i32> poison, i32 %73, i64 0, !dbg !29 + %76 = insertelement <2 x i32> %75, i32 %74, i64 1, !dbg !29 + %77 = bitcast <2 x i32> %76 to i64, !dbg !29 + %78 = add i64 %70, %77, !dbg !25 + %79 = and i32 %61, 1, !dbg !29 + %80 = icmp eq i32 %60, 0, !dbg !29 + %.idx = shl nuw nsw i32 %12, 4, !dbg !29 + %81 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx, !dbg !29 + %82 = getelementptr i64, ptr addrspace(3) %81, i32 %79, !dbg !29 + %83 = insertelement <1 x i64> poison, i64 %78, i64 0, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %82, <1 x i64> %83, i1 %80) #4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %84 = icmp samesign ult i32 %11, 16, !dbg !29 + %85 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %11, !dbg !29 + %86 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %85, i1 %84) #4, !dbg !29 + %extelt.offset3 = lshr i64 %86, 32, !dbg !29 + %87 = trunc nuw i64 %extelt.offset3 to i32, !dbg !29 + %88 = trunc i64 %86 to i32, !dbg !29 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !29 + %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !29 + %91 = insertelement <2 x i32> poison, i32 %89, i64 0, !dbg !29 + %92 = insertelement <2 x i32> %91, i32 %90, i64 1, !dbg !29 + %93 = bitcast <2 x i32> %92 to i64, !dbg !29 + %94 = add i64 %86, %93, !dbg !25 + %95 = and i32 %11, 1009, !dbg !29 + %96 = icmp eq i32 %95, 0, !dbg !29 + %97 = insertelement <1 x i64> poison, i64 %94, i64 0, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %85, <1 x i64> %97, i1 %96) #4, !dbg !29 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !29 + %98 = load i64, ptr addrspace(3) %81, align 16, !dbg !29 + %99 = trunc i64 %98 to i32, !dbg !30 + %100 = icmp slt i64 %2, 2, !dbg !31 + %101 = icmp sgt i64 %2, 1, !dbg !32 + %102 = select i1 %101, i64 %2, i64 0, !dbg !33 + %103 = zext i1 %100 to i64, !dbg !34 + %104 = add i64 %102, %103, !dbg !35 + %105 = mul i64 %19, %104, !dbg !36 + %106 = getelementptr i32, ptr addrspace(1) %1, i64 %.decomposed, !dbg !37 + %107 = getelementptr i32, ptr addrspace(1) %106, i64 %105, !dbg !37 + %108 = and i32 %11, 56, !dbg !38 + %109 = icmp eq i32 %108, 0, !dbg !38 + %110 = and i1 %109, %.fr, !dbg !38 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %99, ptr addrspace(1) %107, i1 %110) #4, !dbg !38 + ret void, !dbg !39 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +attributes #0 = { nounwind "nvvm.reqntid"="64" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", linkageName: "triton_red_fused__to_copy_clone_slice_sum_transpose_5", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 21, column: 28, scope: !4) +!8 = !DILocation(line: 21, column: 33, scope: !4) +!9 = !DILocation(line: 22, column: 44, scope: !4) +!10 = !DILocation(line: 22, column: 23, scope: !4) +!11 = !DILocation(line: 23, column: 21, scope: !4) +!12 = !DILocation(line: 24, column: 37, scope: !4) +!13 = !DILocation(line: 26, column: 19, scope: !4) +!14 = !DILocation(line: 27, column: 19, scope: !4) +!15 = !DILocation(line: 30, column: 40, scope: !4) +!16 = !DILocation(line: 36, column: 54, scope: !4) +!17 = !DILocation(line: 36, column: 58, scope: !4) +!18 = !DILocation(line: 31, column: 31, scope: !4) +!19 = !DILocation(line: 36, column: 43, scope: !4) +!20 = !DILocation(line: 36, column: 34, scope: !4) +!21 = !DILocation(line: 36, column: 63, scope: !4) +!22 = !DILocation(line: 32, column: 29, scope: !4) +!23 = !DILocation(line: 40, column: 48, scope: !4) +!24 = !DILocation(line: 28, column: 43, scope: !4) +!25 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !4, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 41, column: 25, scope: !4) +!29 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !28) +!30 = !DILocation(line: 42, column: 19, scope: !4) +!31 = !DILocation(line: 43, column: 49, scope: !4) +!32 = !DILocation(line: 43, column: 75, scope: !4) +!33 = !DILocation(line: 43, column: 66, scope: !4) +!34 = !DILocation(line: 43, scope: !4) +!35 = !DILocation(line: 43, column: 57, scope: !4) +!36 = !DILocation(line: 43, column: 34, scope: !4) +!37 = !DILocation(line: 43, column: 25, scope: !4) +!38 = !DILocation(line: 43, column: 88, scope: !4) +!39 = !DILocation(line: 43, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0f7f117f9a8bea18c53258f13d11b2c86d4585c9 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ptx @@ -0,0 +1,536 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused__to_copy_clone_slice_sum_transpose_5 // -- Begin function triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused__to_copy_clone_slice_sum_transpose_5 +.visible .entry triton_red_fused__to_copy_clone_slice_sum_transpose_5( + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2, + .param .u64 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4, + .param .u32 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_7 +) +.reqntid 64 +{ + .reg .pred %p<17>; + .reg .b32 %r<57>; + .reg .b64 %rd<87>; + .loc 1 18 0 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:18:0 +$L__func_begin0: + .loc 1 18 0 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:18:0 + +// %bb.0: + ld.param.b32 %r8, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_5]; + ld.param.b64 %rd16, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_2]; +$L__tmp0: + .loc 1 21 28 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:21:28 + mov.u32 %r9, %ctaid.x; + .loc 1 21 33 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:21:33 + shl.b32 %r10, %r9, 3; + .loc 1 22 44 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:22:44 + mov.u32 %r1, %tid.x; + ld.param.b32 %r11, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_4]; + and.b32 %r2, %r1, 7; + .loc 1 22 23 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:22:23 + or.b32 %r13, %r10, %r2; + .loc 1 26 19 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:26:19 + cvt.s64.s32 %rd1, %r13; + .loc 1 27 19 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:27:19 + or.b64 %rd19, %rd1, %rd16; + and.b64 %rd20, %rd19, -4294967296; + setp.ne.b64 %p2, %rd20, 0; + @%p2 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: + div.s64 %rd82, %rd1, %rd16; + bra.uni $L__BB0_3; +$L__BB0_1: + cvt.u32.u64 %r15, %rd16; + cvt.u32.u64 %r16, %rd1; + div.u32 %r17, %r16, %r15; + cvt.u64.u32 %rd82, %r17; +$L__BB0_3: + .loc 1 0 19 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:0:19 + ld.param.b64 %rd15, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_1]; + setp.lt.s32 %p1, %r13, %r11; + .loc 1 26 19 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:26:19 + mul.lo.s64 %rd22, %rd82, %rd16; + sub.s64 %rd6, %rd1, %rd22; + .loc 1 30 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:30:40 + setp.lt.s32 %p3, %r8, 1; + mov.b64 %rd85, 0; + shl.b64 %rd81, %rd6, 2; + mov.b64 %rd86, %rd85; + @%p3 bra $L__BB0_9; +// %bb.4: // %.lr.ph + .loc 1 0 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:0:40 + ld.param.b64 %rd17, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_3]; + ld.param.b64 %rd14, [triton_red_fused__to_copy_clone_slice_sum_transpose_5_param_0]; + bfe.u32 %r3, %r1, 3, 3; + .loc 1 36 54 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:54 + mul.lo.s64 %rd23, %rd17, %rd16; + .loc 1 36 58 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:58 + mul.lo.s64 %rd24, %rd23, %rd82; + add.s64 %rd26, %rd14, %rd81; + .loc 1 30 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:30:40 + shl.b64 %rd27, %rd24, 2; + add.s64 %rd7, %rd26, %rd27; + @%p1 bra $L__BB0_7; + bra.uni $L__BB0_5; +$L__BB0_7: // %.lr.ph.split.preheader + .loc 1 0 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:0:40 + mov.b32 %r56, 0; + mov.b64 %rd85, 0; + mov.b64 %rd86, %rd85; +$L__BB0_8: // %.lr.ph.split + // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:31:31 + add.s32 %r26, %r3, %r56; + .loc 1 32 29 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:32:29 + add.s32 %r27, %r26, 8; + setp.lt.s32 %p7, %r26, %r8; + setp.lt.s32 %p8, %r27, %r8; + .loc 1 36 43 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:43 + cvt.s64.s32 %rd48, %r26; + cvt.s64.s32 %rd49, %r27; + mul.lo.s64 %rd50, %rd16, %rd48; + mul.lo.s64 %rd51, %rd16, %rd49; + .loc 1 36 34 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:34 + shl.b64 %rd52, %rd50, 2; + add.s64 %rd43, %rd7, %rd52; + shl.b64 %rd53, %rd51, 2; + add.s64 %rd46, %rd7, %rd53; + .loc 1 36 63 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:63 + // begin inline asm + mov.u64 %rd42, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd42, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r24, 0x0; + @%p7 ld.global.L1::evict_last.L2::cache_hint.b32 { %r24 }, [ %rd43 + 0 ], %rd42; + // end inline asm + // begin inline asm + mov.u64 %rd45, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd45, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r25, 0x0; + @%p8 ld.global.L1::evict_last.L2::cache_hint.b32 { %r25 }, [ %rd46 + 0 ], %rd45; + // end inline asm + .loc 1 40 48 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:40:48 + selp.b32 %r28, %r24, 0, %p7; + cvt.s64.s32 %rd54, %r28; + add.s64 %rd85, %rd85, %rd54; + selp.b32 %r29, %r25, 0, %p8; + cvt.s64.s32 %rd55, %r29; + add.s64 %rd86, %rd86, %rd55; + .loc 1 30 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:30:40 + add.s32 %r56, %r56, 16; + setp.lt.s32 %p9, %r56, %r8; + @%p9 bra $L__BB0_8; + bra.uni $L__BB0_9; +$L__BB0_5: // %.lr.ph.split.us.preheader + .loc 1 0 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:0:40 + mov.b32 %r55, 0; +$L__BB0_6: // %.lr.ph.split.us + // =>This Inner Loop Header: Depth=1 + .loc 1 31 31 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:31:31 + add.s32 %r21, %r3, %r55; + .loc 1 36 43 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:43 + add.s32 %r22, %r21, 8; + cvt.s64.s32 %rd35, %r21; + cvt.s64.s32 %rd36, %r22; + mul.lo.s64 %rd37, %rd16, %rd35; + mul.lo.s64 %rd38, %rd16, %rd36; + .loc 1 36 34 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:34 + shl.b64 %rd39, %rd37, 2; + add.s64 %rd29, %rd7, %rd39; + shl.b64 %rd40, %rd38, 2; + add.s64 %rd32, %rd7, %rd40; + .loc 1 36 63 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:36:63 + // begin inline asm + mov.u64 %rd28, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd28, 1.0; + // end inline asm + mov.pred %p4, 0; + // begin inline asm + mov.u32 %r19, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r19 }, [ %rd29 + 0 ], %rd28; + // end inline asm + // begin inline asm + mov.u64 %rd31, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd31, 1.0; + // end inline asm + // begin inline asm + mov.u32 %r20, 0x0; + @%p4 ld.global.L1::evict_last.L2::cache_hint.b32 { %r20 }, [ %rd32 + 0 ], %rd31; + // end inline asm + .loc 1 30 40 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:30:40 + add.s32 %r55, %r55, 16; + setp.lt.s32 %p6, %r55, %r8; + mov.b64 %rd86, %rd85; + @%p6 bra $L__BB0_6; +$L__BB0_9: // %._crit_edge + .loc 1 22 44 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:22:44 + and.b32 %r34, %r1, 24; +$L__tmp1: + .loc 2 261 15 // standard.py:261:15 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + add.s64 %rd60, %rd85, %rd86; + .loc 2 291 36 // standard.py:291:36 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + mov.b64 {_, %r35}, %rd60; + cvt.u32.u64 %r36, %rd60; + shfl.sync.bfly.b32 %r37, %r36, 16, 31, -1; + shfl.sync.bfly.b32 %r38, %r35, 16, 31, -1; + cvt.u64.u32 %rd61, %r37; + cvt.u64.u32 %rd62, %r38; + shl.b64 %rd63, %rd62, 32; + or.b64 %rd64, %rd61, %rd63; + .loc 2 261 15 // standard.py:261:15 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + add.s64 %rd65, %rd60, %rd64; + .loc 2 291 36 // standard.py:291:36 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + mov.b64 {_, %r39}, %rd65; + cvt.u32.u64 %r40, %rd65; + shfl.sync.bfly.b32 %r41, %r40, 8, 31, -1; + shfl.sync.bfly.b32 %r42, %r39, 8, 31, -1; + cvt.u64.u32 %rd66, %r41; + cvt.u64.u32 %rd67, %r42; + shl.b64 %rd68, %rd67, 32; + or.b64 %rd69, %rd66, %rd68; + .loc 2 261 15 // standard.py:261:15 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + add.s64 %rd56, %rd65, %rd69; + .loc 2 291 36 // standard.py:291:36 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + setp.eq.b32 %p10, %r34, 0; + shl.b32 %r43, %r2, 4; + mov.b32 %r44, global_smem; + add.s32 %r45, %r44, %r43; + shr.u32 %r46, %r1, 2; + and.b32 %r47, %r46, 8; + add.s32 %r30, %r45, %r47; + // begin inline asm + @%p10 st.shared.b64 [ %r30 + 0 ], %rd56; + // end inline asm + bar.sync 0; + setp.lt.u32 %p11, %r1, 16; + shl.b32 %r48, %r1, 3; + add.s32 %r31, %r44, %r48; + // begin inline asm + @%p11 ld.shared.b64 %rd57, [ %r31 + 0 ]; + // end inline asm + mov.b64 {_, %r49}, %rd57; + cvt.u32.u64 %r50, %rd57; + shfl.sync.bfly.b32 %r51, %r50, 1, 31, -1; + shfl.sync.bfly.b32 %r52, %r49, 1, 31, -1; + cvt.u64.u32 %rd70, %r51; + cvt.u64.u32 %rd71, %r52; + shl.b64 %rd72, %rd71, 32; + or.b64 %rd73, %rd70, %rd72; + .loc 2 261 15 // standard.py:261:15 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + add.s64 %rd58, %rd57, %rd73; + .loc 2 291 36 // standard.py:291:36 @[ chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:41:25 ] + and.b32 %r53, %r1, 1009; + setp.eq.b32 %p12, %r53, 0; + // begin inline asm + @%p12 st.shared.b64 [ %r31 + 0 ], %rd58; + // end inline asm + bar.sync 0; + ld.shared.b32 %r33, [%r45]; +$L__tmp2: + .loc 1 43 49 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:49 + setp.lt.s64 %p14, %rd16, 2; + .loc 1 43 75 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:75 + setp.gt.s64 %p15, %rd16, 1; + .loc 1 43 66 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:66 + selp.b64 %rd74, %rd16, 0, %p15; + .loc 1 43 0 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43 + selp.b64 %rd75, 1, 0, %p14; + .loc 1 43 57 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:57 + add.s64 %rd76, %rd74, %rd75; + .loc 1 43 34 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:34 + mul.lo.s64 %rd77, %rd82, %rd76; + .loc 1 43 25 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:25 + add.s64 %rd79, %rd15, %rd81; + shl.b64 %rd80, %rd77, 2; + add.s64 %rd59, %rd79, %rd80; + .loc 1 43 88 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:88 + and.b32 %r54, %r1, 56; + setp.eq.b32 %p16, %r54, 0; + and.pred %p13, %p16, %p1; + // begin inline asm + @%p13 st.global.b32 [ %rd59 + 0 ], { %r33 }; + // end inline asm + .loc 1 43 4 // chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py:43:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 238 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 104 +.b8 108 +.b8 55 +.b8 54 +.b8 114 +.b8 121 +.b8 101 +.b8 117 +.b8 120 +.b8 121 +.b8 51 +.b8 118 +.b8 113 +.b8 104 +.b8 110 +.b8 55 +.b8 122 +.b8 122 +.b8 102 +.b8 111 +.b8 112 +.b8 50 +.b8 97 +.b8 110 +.b8 120 +.b8 104 +.b8 120 +.b8 50 +.b8 117 +.b8 120 +.b8 54 +.b8 118 +.b8 110 +.b8 115 +.b8 104 +.b8 111 +.b8 109 +.b8 111 +.b8 55 +.b8 118 +.b8 97 +.b8 51 +.b8 111 +.b8 104 +.b8 118 +.b8 112 +.b8 53 +.b8 100 +.b8 98 +.b8 50 +.b8 111 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 104 +.b8 108 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x38 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 99 +.b8 108 +.b8 111 +.b8 110 +.b8 101 +.b8 95 +.b8 115 +.b8 108 +.b8 105 +.b8 99 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 116 +.b8 114 +.b8 97 +.b8 110 +.b8 115 +.b8 112 +.b8 111 +.b8 115 +.b8 101 +.b8 95 +.b8 53 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xc3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xd8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 41 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..07d08e1c917de06fbdcb3ac650e73cd97887d0f1 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/JOVMEF5UB3XPAO2GMKSDIGWCQSZP2YWR3UL465XFLBODYHCLBU5A/triton_red_fused__to_copy_clone_slice_sum_transpose_5.ttgir @@ -0,0 +1,147 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 2], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":18:0) +#loc1 = loc(unknown) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":41:25) +#loc40 = loc("in_ptr0"(#loc)) +#loc41 = loc("out_ptr1"(#loc)) +#loc42 = loc("ks0"(#loc)) +#loc43 = loc("ks1"(#loc)) +#loc44 = loc("xnumel"(#loc)) +#loc45 = loc("r0_numel"(#loc)) +#loc68 = loc("tmp3"(#loc26)) +#loc73 = loc(callsite(#loc1 at #loc68)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused__to_copy_clone_slice_sum_transpose_5(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr1"(#loc)), %ks0: i64 loc("ks0"(#loc)), %ks1: i64 loc("ks1"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<8x16xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c1_i64 = arith.constant 1 : i64 loc(#loc1) + %c8_i32 = arith.constant 8 : i32 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<8x16xi32, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc46) + %xoffset_1 = arith.muli %xoffset, %c8_i32 : i32 loc(#loc47) + %xindex = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc48) + %xindex_2 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<8xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi32, #blocked> loc(#loc48) + %xindex_3 = tt.splat %xoffset_1 : i32 -> tensor<8x1xi32, #blocked> loc(#loc49) + %xindex_4 = arith.addi %xindex_3, %xindex_2 : tensor<8x1xi32, #blocked> loc(#loc49) + %xmask = tt.splat %xnumel : i32 -> tensor<8x1xi32, #blocked> loc(#loc50) + %xmask_5 = arith.cmpi slt, %xindex_4, %xmask : tensor<8x1xi32, #blocked> loc(#loc50) + %r0_base = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc51) + %r0_base_6 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc51) + %x0 = arith.extsi %xindex_4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked> loc(#loc52) + %x0_7 = tt.splat %ks0 : i64 -> tensor<8x1xi64, #blocked> loc(#loc52) + %x0_8 = arith.remsi %x0, %x0_7 : tensor<8x1xi64, #blocked> loc(#loc52) + %x1 = arith.divsi %x0, %x0_7 : tensor<8x1xi64, #blocked> loc(#loc53) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0 = tt.splat %ks0 : i64 -> tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_9 = tt.broadcast %x0_8 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_10 = arith.muli %ks0, %ks1 : i64 loc(#loc57) + %tmp0_11 = tt.splat %tmp0_10 : i64 -> tensor<8x1xi64, #blocked> loc(#loc58) + %tmp0_12 = arith.muli %tmp0_11, %x1 : tensor<8x1xi64, #blocked> loc(#loc58) + %tmp0_13 = tt.broadcast %tmp0_12 : tensor<8x1xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc59) + %tmp0_14 = tt.splat %in_ptr0 : !tt.ptr -> tensor<8x16x!tt.ptr, #blocked> loc(#loc60) + %tmp0_15 = tt.broadcast %xmask_5 : tensor<8x1xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc61) + %_tmp3 = scf.for %_tmp3_17 = %c0_i32 to %r0_numel step %c16_i32 iter_args(%_tmp3_18 = %cst) -> (tensor<8x16xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp3_17 : i32 -> tensor<1x16xi32, #blocked> loc(#loc63) + %r0_index_19 = arith.addi %r0_index, %r0_base_6 : tensor<1x16xi32, #blocked> loc(#loc63) + %r0_mask_20 = arith.cmpi slt, %r0_index_19, %r0_mask : tensor<1x16xi32, #blocked> loc(#loc54) + %tmp0_21 = arith.extsi %r0_index_19 : tensor<1x16xi32, #blocked> to tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_22 = arith.muli %tmp0, %tmp0_21 : tensor<1x16xi64, #blocked> loc(#loc55) + %tmp0_23 = tt.broadcast %tmp0_22 : tensor<1x16xi64, #blocked> -> tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_24 = arith.addi %tmp0_9, %tmp0_23 : tensor<8x16xi64, #blocked> loc(#loc56) + %tmp0_25 = arith.addi %tmp0_24, %tmp0_13 : tensor<8x16xi64, #blocked> loc(#loc59) + %tmp0_26 = tt.addptr %tmp0_14, %tmp0_25 : tensor<8x16x!tt.ptr, #blocked>, tensor<8x16xi64, #blocked> loc(#loc60) + %tmp0_27 = tt.broadcast %r0_mask_20 : tensor<1x16xi1, #blocked> -> tensor<8x16xi1, #blocked> loc(#loc61) + %tmp0_28 = arith.andi %tmp0_27, %tmp0_15 : tensor<8x16xi1, #blocked> loc(#loc61) + %tmp0_29 = tt.load %tmp0_26, %tmp0_28, %cst_0 evictionPolicy = evict_last : tensor<8x16x!tt.ptr, #blocked> loc(#loc64) + %tmp1 = arith.extsi %tmp0_29 : tensor<8x16xi32, #blocked> to tensor<8x16xi64, #blocked> loc(#loc65) + %tmp4 = arith.addi %_tmp3_18, %tmp1 : tensor<8x16xi64, #blocked> loc(#loc66) + %_tmp3_30 = arith.select %tmp0_28, %tmp4, %_tmp3_18 : tensor<8x16xi1, #blocked>, tensor<8x16xi64, #blocked> loc(#loc67) + scf.yield %_tmp3_30 : tensor<8x16xi64, #blocked> loc(#loc24) + } loc(#loc62) + %tmp3 = "tt.reduce"(%_tmp3) <{axis = 1 : i32}> ({ + ^bb0(%tmp3_17: i64 loc(callsite(#loc1 at #loc68)), %tmp3_18: i64 loc(callsite(#loc1 at #loc68))): + %tmp3_19 = arith.addi %tmp3_17, %tmp3_18 : i64 loc(#loc74) + tt.reduce.return %tmp3_19 : i64 loc(#loc72) + }) : (tensor<8x16xi64, #blocked>) -> tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc72) + %tmp3_16 = tt.expand_dims %tmp3 {axis = 1 : i32} : tensor<8xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<8x1xi64, #blocked> loc(#loc69) + %tmp5 = arith.trunci %tmp3_16 : tensor<8x1xi64, #blocked> to tensor<8x1xi32, #blocked> loc(#loc70) + %0 = arith.cmpi sle, %ks0, %c1_i64 : i64 loc(#loc30) + %1 = arith.cmpi sgt, %ks0, %c1_i64 : i64 loc(#loc31) + %2 = arith.extui %1 : i1 to i64 loc(#loc32) + %3 = arith.muli %ks0, %2 : i64 loc(#loc32) + %4 = arith.extui %0 : i1 to i64 loc(#loc71) + %5 = arith.addi %4, %3 : i64 loc(#loc33) + %6 = tt.splat %5 : i64 -> tensor<8x1xi64, #blocked> loc(#loc35) + %7 = arith.muli %x1, %6 : tensor<8x1xi64, #blocked> loc(#loc35) + %8 = arith.addi %x0_8, %7 : tensor<8x1xi64, #blocked> loc(#loc36) + %9 = tt.splat %out_ptr1 : !tt.ptr -> tensor<8x1x!tt.ptr, #blocked> loc(#loc37) + %10 = tt.addptr %9, %8 : tensor<8x1x!tt.ptr, #blocked>, tensor<8x1xi64, #blocked> loc(#loc37) + tt.store %10, %tmp5, %xmask_5 : tensor<8x1x!tt.ptr, #blocked> loc(#loc38) + tt.return loc(#loc39) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":21:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":21:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":22:44) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":22:23) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":23:21) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":24:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":26:19) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":27:19) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":32:29) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:43) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:39) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:54) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:58) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:50) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:34) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:73) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":30:40) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":31:31) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":36:63) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":37:23) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":39:23) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":40:48) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":40:8) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":41:28) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":42:19) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:49) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:75) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:66) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:57) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:41) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:34) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:30) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:25) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:88) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/hl/chl76ryeuxy3vqhn7zzfop2anxhx2ux6vnshomo7va3ohvp5db2o.py":43:4) +#loc46 = loc("xoffset"(#loc2)) +#loc47 = loc("xoffset"(#loc3)) +#loc48 = loc("xindex"(#loc4)) +#loc49 = loc("xindex"(#loc5)) +#loc50 = loc("xmask"(#loc6)) +#loc51 = loc("r0_base"(#loc7)) +#loc52 = loc("x0"(#loc8)) +#loc53 = loc("x1"(#loc9)) +#loc54 = loc("r0_mask"(#loc10)) +#loc55 = loc("tmp0"(#loc11)) +#loc56 = loc("tmp0"(#loc12)) +#loc57 = loc("tmp0"(#loc13)) +#loc58 = loc("tmp0"(#loc14)) +#loc59 = loc("tmp0"(#loc15)) +#loc60 = loc("tmp0"(#loc16)) +#loc61 = loc("tmp0"(#loc17)) +#loc62 = loc("_tmp3"(#loc18)) +#loc63 = loc("r0_index"(#loc19)) +#loc64 = loc("tmp0"(#loc20)) +#loc65 = loc("tmp1"(#loc21)) +#loc66 = loc("tmp4"(#loc22)) +#loc67 = loc("_tmp3"(#loc23)) +#loc69 = loc("tmp3"(#loc28)) +#loc70 = loc("tmp5"(#loc29)) +#loc71 = loc(fused[#loc33, #loc34]) +#loc72 = loc(callsite(#loc25 at #loc68)) +#loc74 = loc(callsite(#loc27 at #loc72)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..756efc8ed787aa2a2a2c29700b7cbeff25a00029 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/__grp__triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_red_fused_eq_mul_squeeze_sum_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.source", "triton_red_fused_eq_mul_squeeze_sum_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttir", "triton_red_fused_eq_mul_squeeze_sum_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttgir", "triton_red_fused_eq_mul_squeeze_sum_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.llir", "triton_red_fused_eq_mul_squeeze_sum_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ptx", "triton_red_fused_eq_mul_squeeze_sum_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.cubin", "triton_red_fused_eq_mul_squeeze_sum_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..10e7b2ba516f0a0d5fc45d25d394b844b2ff9c49 Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.json new file mode 100644 index 0000000000000000000000000000000000000000..487b9f7dd57ba85fc3c69acd3b2af000084d49a0 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.json @@ -0,0 +1 @@ +{"hash": "57807a2af8d8cda7dd5c426812f78ddd6d3e1b3ed08ac308d0a3e735392fa14e", "target": {"backend": "cuda", "arch": 90, "warp_size": 32}, "num_warps": 16, "num_ctas": 1, "num_stages": 1, "warp_size": 32, "maxnreg": null, "cluster_dims": [1, 1, 1], "ptx_version": null, "ptx_options": null, "ir_override": null, "enable_fp_fusion": true, "launch_cooperative_grid": false, "launch_pdl": false, "supported_fp8_dtypes": ["fp8e4b15", "fp8e4nv", "fp8e5"], "deprecated_fp8_dot_operand_dtypes": ["fp8e4b15"], "default_dot_input_precision": "tf32", "allowed_dot_input_precisions": ["tf32", "tf32x3", "ieee"], "max_num_imprecise_acc_default": 1073741824, "extern_libs": [["libdevice", "/workspace/specforge/lib/python3.11/site-packages/triton/backends/nvidia/lib/libdevice.10.bc"]], "debug": true, "backend_name": "cuda", "sanitize_overflow": false, "arch": "sm90", "instrumentation_mode": "", "triton_version": "3.5.1", "tensordesc_meta": [], "shared": 128, "tmem_size": 0, "global_scratch_size": 0, "global_scratch_align": 1, "profile_scratch_size": 0, "profile_scratch_align": 1, "name": "triton_red_fused_eq_mul_squeeze_sum_2"} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..b4525be32600dac1d117e24d92617e9ecc639992 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.llir @@ -0,0 +1,312 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@global_smem = external addrspace(3) global [0 x i8], align 16 + +; Function Attrs: nounwind +define ptx_kernel void @triton_red_fused_eq_mul_squeeze_sum_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i64 %4, i32 %5, i32 %6, ptr addrspace(1) readnone captures(none) %7, ptr addrspace(1) readnone captures(none) %8) local_unnamed_addr #0 !dbg !4 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !7 + %11 = icmp samesign ult i32 %10, 2, !dbg !8 + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !9 + %13 = and i32 %12, 511, !dbg !9 + %14 = shl i64 %4, 2, !dbg !10 + %15 = zext nneg i32 %10 to i64, !dbg !11 + %16 = mul i64 %14, %15, !dbg !11 + %17 = icmp sgt i32 %6, 0, !dbg !12 + br i1 %17, label %.lr.ph.preheader, label %._crit_edge, !dbg !12 + +.lr.ph.preheader: ; preds = %9 + %18 = insertelement <4 x i32> poison, i32 %6, i64 0 + %19 = shufflevector <4 x i32> %18, <4 x i32> poison, <4 x i32> zeroinitializer + %20 = insertelement <2 x i64> , i64 %4, i64 0 + %21 = insertelement <4 x i1> poison, i1 %11, i64 0 + %22 = shufflevector <4 x i1> %21, <4 x i1> poison, <4 x i32> zeroinitializer + br label %.lr.ph, !dbg !12 + +.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph + %23 = phi i32 [ %132, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %24 = phi <4 x i64> [ %131, %.lr.ph ], [ zeroinitializer, %.lr.ph.preheader ] + %25 = or disjoint i32 %12, %23, !dbg !13 + %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %27 = or disjoint i32 %23, %13, !dbg !13 + %28 = or i32 %25, 512, !dbg !13 + %29 = or disjoint i32 %27, 1024, !dbg !13 + %30 = or i32 %25, 1536, !dbg !13 + %31 = insertelement <4 x i32> poison, i32 %27, i64 0, !dbg !15 + %32 = insertelement <4 x i32> %31, i32 %28, i64 1, !dbg !15 + %33 = insertelement <4 x i32> %32, i32 %29, i64 2, !dbg !15 + %34 = insertelement <4 x i32> %33, i32 %30, i64 3, !dbg !15 + %35 = icmp slt <4 x i32> %34, %19, !dbg !15 + %36 = sext i32 %27 to i64, !dbg !16 + %37 = sext i32 %28 to i64, !dbg !16 + %38 = sext i32 %29 to i64, !dbg !16 + %39 = sext i32 %30 to i64, !dbg !16 + %40 = add i64 %16, %36, !dbg !16 + %41 = add i64 %16, %37, !dbg !16 + %42 = add i64 %16, %38, !dbg !16 + %43 = add i64 %16, %39, !dbg !16 + %44 = sdiv i64 %40, %4, !dbg !17 + %45 = sdiv i64 %41, %4, !dbg !17 + %46 = sdiv i64 %42, %4, !dbg !17 + %47 = sdiv i64 %43, %4, !dbg !17 + %48 = insertelement <2 x i64> poison, i64 %36, i64 0, !dbg !18 + %49 = insertelement <2 x i64> %48, i64 %44, i64 1, !dbg !18 + %50 = srem <2 x i64> %49, %20, !dbg !18 + %51 = extractelement <2 x i64> %50, i64 1, !dbg !19 + %52 = mul i64 %51, %4, !dbg !19 + %53 = insertelement <2 x i64> poison, i64 %37, i64 0, !dbg !18 + %54 = insertelement <2 x i64> %53, i64 %45, i64 1, !dbg !18 + %55 = srem <2 x i64> %54, %20, !dbg !18 + %56 = extractelement <2 x i64> %55, i64 1, !dbg !19 + %57 = mul i64 %56, %4, !dbg !19 + %58 = insertelement <2 x i64> poison, i64 %38, i64 0, !dbg !18 + %59 = insertelement <2 x i64> %58, i64 %46, i64 1, !dbg !18 + %60 = srem <2 x i64> %59, %20, !dbg !18 + %61 = extractelement <2 x i64> %60, i64 1, !dbg !19 + %62 = mul i64 %61, %4, !dbg !19 + %63 = insertelement <2 x i64> poison, i64 %39, i64 0, !dbg !18 + %64 = insertelement <2 x i64> %63, i64 %47, i64 1, !dbg !18 + %65 = srem <2 x i64> %64, %20, !dbg !18 + %66 = extractelement <2 x i64> %65, i64 1, !dbg !19 + %67 = mul i64 %66, %4, !dbg !19 + %68 = extractelement <2 x i64> %50, i64 0, !dbg !20 + %69 = add i64 %68, %52, !dbg !20 + %70 = extractelement <2 x i64> %55, i64 0, !dbg !20 + %71 = add i64 %70, %57, !dbg !20 + %72 = extractelement <2 x i64> %60, i64 0, !dbg !20 + %73 = add i64 %72, %62, !dbg !20 + %74 = extractelement <2 x i64> %65, i64 0, !dbg !20 + %75 = add i64 %74, %67, !dbg !20 + %76 = getelementptr i64, ptr addrspace(1) %0, i64 %69, !dbg !21 + %77 = getelementptr i64, ptr addrspace(1) %0, i64 %71, !dbg !21 + %78 = getelementptr i64, ptr addrspace(1) %0, i64 %73, !dbg !21 + %79 = getelementptr i64, ptr addrspace(1) %0, i64 %75, !dbg !21 + %80 = and <4 x i1> %22, %35, !dbg !22 + %81 = extractelement <4 x i1> %80, i64 0, !dbg !23 + %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %76, i64 %26, i1 %81) #5, !dbg !14 + %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %84 = extractelement <4 x i1> %80, i64 1, !dbg !23 + %85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %77, i64 %83, i1 %84) #5, !dbg !14 + %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %87 = extractelement <4 x i1> %80, i64 2, !dbg !23 + %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %78, i64 %86, i1 %87) #5, !dbg !14 + %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !14 + %90 = extractelement <4 x i1> %80, i64 3, !dbg !23 + %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %79, i64 %89, i1 %90) #5, !dbg !14 + %92 = getelementptr i64, ptr addrspace(1) %1, i64 %69, !dbg !24 + %93 = getelementptr i64, ptr addrspace(1) %1, i64 %71, !dbg !24 + %94 = getelementptr i64, ptr addrspace(1) %1, i64 %73, !dbg !24 + %95 = getelementptr i64, ptr addrspace(1) %1, i64 %75, !dbg !24 + %96 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !25 + %97 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %92, i64 %96, i1 %81) #5, !dbg !25 + %98 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !25 + %99 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %93, i64 %98, i1 %84) #5, !dbg !25 + %100 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !25 + %101 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %94, i64 %100, i1 %87) #5, !dbg !25 + %102 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !25 + %103 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %95, i64 %102, i1 %90) #5, !dbg !25 + %104 = getelementptr i64, ptr addrspace(1) %2, i64 %69, !dbg !26 + %105 = getelementptr i64, ptr addrspace(1) %2, i64 %71, !dbg !26 + %106 = getelementptr i64, ptr addrspace(1) %2, i64 %73, !dbg !26 + %107 = getelementptr i64, ptr addrspace(1) %2, i64 %75, !dbg !26 + %108 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !23 + %109 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %104, i64 %108, i1 %81) #5, !dbg !23 + %110 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !23 + %111 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %105, i64 %110, i1 %84) #5, !dbg !23 + %112 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !23 + %113 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %106, i64 %112, i1 %87) #5, !dbg !23 + %114 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09createpolicy.fractional.L2::evict_last.b64 $0, 1.0;", "=l"() #5, !dbg !23 + %115 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$3 ld.global.L1::evict_last.L2::cache_hint.b64 { $0 }, [ $1 + 0 ], $2;", "=l,l,l,b"(ptr addrspace(1) %107, i64 %114, i1 %90) #5, !dbg !23 + %116 = insertelement <4 x i64> poison, i64 %82, i64 0, !dbg !27 + %117 = insertelement <4 x i64> %116, i64 %85, i64 1, !dbg !27 + %118 = insertelement <4 x i64> %117, i64 %88, i64 2, !dbg !27 + %119 = insertelement <4 x i64> %118, i64 %91, i64 3, !dbg !27 + %120 = insertelement <4 x i64> poison, i64 %97, i64 0, !dbg !27 + %121 = insertelement <4 x i64> %120, i64 %99, i64 1, !dbg !27 + %122 = insertelement <4 x i64> %121, i64 %101, i64 2, !dbg !27 + %123 = insertelement <4 x i64> %122, i64 %103, i64 3, !dbg !27 + %124 = icmp eq <4 x i64> %119, %123, !dbg !27 + %125 = select <4 x i1> %80, <4 x i1> %124, <4 x i1> zeroinitializer, !dbg !28 + %126 = insertelement <4 x i64> poison, i64 %109, i64 0, !dbg !28 + %127 = insertelement <4 x i64> %126, i64 %111, i64 1, !dbg !28 + %128 = insertelement <4 x i64> %127, i64 %113, i64 2, !dbg !28 + %129 = insertelement <4 x i64> %128, i64 %115, i64 3, !dbg !28 + %130 = select <4 x i1> %125, <4 x i64> %129, <4 x i64> zeroinitializer, !dbg !28 + %131 = add <4 x i64> %130, %24, !dbg !28 + %132 = add i32 %23, 2048, !dbg !12 + %133 = icmp slt i32 %132, %6, !dbg !12 + br i1 %133, label %.lr.ph, label %._crit_edge.loopexit, !dbg !12 + +._crit_edge.loopexit: ; preds = %.lr.ph + %134 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %131), !dbg !9 + br label %._crit_edge, !dbg !9 + +._crit_edge: ; preds = %._crit_edge.loopexit, %9 + %135 = phi i64 [ 0, %9 ], [ %134, %._crit_edge.loopexit ], !dbg !29 + %136 = and i32 %12, 31, !dbg !9 + %137 = lshr i32 %12, 5, !dbg !9 + %extelt.offset = lshr i64 %135, 32, !dbg !33 + %138 = trunc nuw i64 %extelt.offset to i32, !dbg !33 + %139 = trunc i64 %135 to i32, !dbg !33 + %140 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %139, i32 16, i32 31), !dbg !33 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %138, i32 16, i32 31), !dbg !33 + %142 = insertelement <2 x i32> poison, i32 %140, i64 0, !dbg !33 + %143 = insertelement <2 x i32> %142, i32 %141, i64 1, !dbg !33 + %144 = bitcast <2 x i32> %143 to i64, !dbg !33 + %145 = add i64 %135, %144, !dbg !29 + %extelt.offset2 = lshr i64 %145, 32, !dbg !33 + %146 = trunc nuw i64 %extelt.offset2 to i32, !dbg !33 + %147 = trunc i64 %145 to i32, !dbg !33 + %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 8, i32 31), !dbg !33 + %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %146, i32 8, i32 31), !dbg !33 + %150 = insertelement <2 x i32> poison, i32 %148, i64 0, !dbg !33 + %151 = insertelement <2 x i32> %150, i32 %149, i64 1, !dbg !33 + %152 = bitcast <2 x i32> %151 to i64, !dbg !33 + %153 = add i64 %145, %152, !dbg !29 + %extelt.offset3 = lshr i64 %153, 32, !dbg !33 + %154 = trunc nuw i64 %extelt.offset3 to i32, !dbg !33 + %155 = trunc i64 %153 to i32, !dbg !33 + %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 4, i32 31), !dbg !33 + %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 4, i32 31), !dbg !33 + %158 = insertelement <2 x i32> poison, i32 %156, i64 0, !dbg !33 + %159 = insertelement <2 x i32> %158, i32 %157, i64 1, !dbg !33 + %160 = bitcast <2 x i32> %159 to i64, !dbg !33 + %161 = add i64 %153, %160, !dbg !29 + %extelt.offset4 = lshr i64 %161, 32, !dbg !33 + %162 = trunc nuw i64 %extelt.offset4 to i32, !dbg !33 + %163 = trunc i64 %161 to i32, !dbg !33 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 2, i32 31), !dbg !33 + %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %162, i32 2, i32 31), !dbg !33 + %166 = insertelement <2 x i32> poison, i32 %164, i64 0, !dbg !33 + %167 = insertelement <2 x i32> %166, i32 %165, i64 1, !dbg !33 + %168 = bitcast <2 x i32> %167 to i64, !dbg !33 + %169 = add i64 %161, %168, !dbg !29 + %extelt.offset5 = lshr i64 %169, 32, !dbg !33 + %170 = trunc nuw i64 %extelt.offset5 to i32, !dbg !33 + %171 = trunc i64 %169 to i32, !dbg !33 + %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 1, i32 31), !dbg !33 + %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 1, i32 31), !dbg !33 + %174 = insertelement <2 x i32> poison, i32 %172, i64 0, !dbg !33 + %175 = insertelement <2 x i32> %174, i32 %173, i64 1, !dbg !33 + %176 = bitcast <2 x i32> %175 to i64, !dbg !33 + %177 = add i64 %169, %176, !dbg !29 + %178 = and i32 %137, 15, !dbg !33 + %179 = icmp eq i32 %136, 0, !dbg !33 + %180 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %178, !dbg !33 + %181 = insertelement <1 x i64> poison, i64 %177, i64 0, !dbg !33 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %180, <1 x i64> %181, i1 %179) #5, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33 + %182 = icmp samesign ult i32 %12, 16, !dbg !33 + %183 = getelementptr i64, ptr addrspace(3) @global_smem, i32 %12, !dbg !33 + %184 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %183, i1 %182) #5, !dbg !33 + %extelt.offset6 = lshr i64 %184, 32, !dbg !33 + %185 = trunc nuw i64 %extelt.offset6 to i32, !dbg !33 + %186 = trunc i64 %184 to i32, !dbg !33 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 8, i32 31), !dbg !33 + %188 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %185, i32 8, i32 31), !dbg !33 + %189 = insertelement <2 x i32> poison, i32 %187, i64 0, !dbg !33 + %190 = insertelement <2 x i32> %189, i32 %188, i64 1, !dbg !33 + %191 = bitcast <2 x i32> %190 to i64, !dbg !33 + %192 = add i64 %184, %191, !dbg !29 + %extelt.offset7 = lshr i64 %192, 32, !dbg !33 + %193 = trunc nuw i64 %extelt.offset7 to i32, !dbg !33 + %194 = trunc i64 %192 to i32, !dbg !33 + %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !33 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %193, i32 4, i32 31), !dbg !33 + %197 = insertelement <2 x i32> poison, i32 %195, i64 0, !dbg !33 + %198 = insertelement <2 x i32> %197, i32 %196, i64 1, !dbg !33 + %199 = bitcast <2 x i32> %198 to i64, !dbg !33 + %200 = add i64 %192, %199, !dbg !29 + %extelt.offset8 = lshr i64 %200, 32, !dbg !33 + %201 = trunc nuw i64 %extelt.offset8 to i32, !dbg !33 + %202 = trunc i64 %200 to i32, !dbg !33 + %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 2, i32 31), !dbg !33 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %201, i32 2, i32 31), !dbg !33 + %205 = insertelement <2 x i32> poison, i32 %203, i64 0, !dbg !33 + %206 = insertelement <2 x i32> %205, i32 %204, i64 1, !dbg !33 + %207 = bitcast <2 x i32> %206 to i64, !dbg !33 + %208 = add i64 %200, %207, !dbg !29 + %extelt.offset9 = lshr i64 %208, 32, !dbg !33 + %209 = trunc nuw i64 %extelt.offset9 to i32, !dbg !33 + %210 = trunc i64 %208 to i32, !dbg !33 + %211 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %210, i32 1, i32 31), !dbg !33 + %212 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 1, i32 31), !dbg !33 + %213 = insertelement <2 x i32> poison, i32 %211, i64 0, !dbg !33 + %214 = insertelement <2 x i32> %213, i32 %212, i64 1, !dbg !33 + %215 = bitcast <2 x i32> %214 to i64, !dbg !33 + %216 = add i64 %208, %215, !dbg !29 + %217 = icmp eq i32 %12, 0, !dbg !33 + %218 = insertelement <1 x i64> poison, i64 %216, i64 0, !dbg !33 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %183, <1 x i64> %218, i1 %217) #5, !dbg !33 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !33 + %219 = load i64, ptr addrspace(3) @global_smem, align 16, !dbg !33 + %220 = getelementptr i64, ptr addrspace(1) %3, i64 %15, !dbg !34 + %221 = icmp eq i32 %13, 0, !dbg !35 + %222 = and i1 %11, %221, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %219, ptr addrspace(1) %220, i1 %222) #5, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #3 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #4 + +attributes #0 = { nounwind "nvvm.reqntid"="512" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { convergent nocallback nounwind } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = distinct !DISubprogram(name: "triton_red_fused_eq_mul_squeeze_sum_2", linkageName: "triton_red_fused_eq_mul_squeeze_sum_2", scope: !1, file: !1, line: 18, type: !5, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 22, column: 28, scope: !4) +!8 = !DILocation(line: 24, column: 21, scope: !4) +!9 = !DILocation(line: 25, column: 37, scope: !4) +!10 = !DILocation(line: 35, column: 51, scope: !4) +!11 = !DILocation(line: 35, column: 55, scope: !4) +!12 = !DILocation(line: 29, column: 40, scope: !4) +!13 = !DILocation(line: 30, column: 31, scope: !4) +!14 = !DILocation(line: 35, column: 92, scope: !4) +!15 = !DILocation(line: 31, column: 29, scope: !4) +!16 = !DILocation(line: 35, column: 49, scope: !4) +!17 = !DILocation(line: 35, column: 62, scope: !4) +!18 = !DILocation(line: 35, column: 84, scope: !4) +!19 = !DILocation(line: 35, column: 40, scope: !4) +!20 = !DILocation(line: 35, column: 77, scope: !4) +!21 = !DILocation(line: 35, column: 34, scope: !4) +!22 = !DILocation(line: 35, column: 102, scope: !4) +!23 = !DILocation(line: 37, column: 92, scope: !4) +!24 = !DILocation(line: 36, column: 34, scope: !4) +!25 = !DILocation(line: 36, column: 92, scope: !4) +!26 = !DILocation(line: 37, column: 34, scope: !4) +!27 = !DILocation(line: 38, column: 23, scope: !4) +!28 = !DILocation(line: 43, column: 48, scope: !4) +!29 = !DILocation(line: 261, column: 15, scope: !30, inlinedAt: !32) +!30 = distinct !DILexicalBlockFile(scope: !4, file: !31, discriminator: 0) +!31 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!32 = !DILocation(line: 44, column: 25, scope: !4) +!33 = !DILocation(line: 291, column: 36, scope: !30, inlinedAt: !32) +!34 = !DILocation(line: 45, column: 25, scope: !4) +!35 = !DILocation(line: 45, column: 36, scope: !4) +!36 = !DILocation(line: 45, column: 4, scope: !4) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1edb16ca64a7f2404fa156190fa2609ae6642080 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ptx @@ -0,0 +1,703 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_red_fused_eq_mul_squeeze_sum_2 // -- Begin function triton_red_fused_eq_mul_squeeze_sum_2 +.extern .shared .align 16 .b8 global_smem[]; + // @triton_red_fused_eq_mul_squeeze_sum_2 +.visible .entry triton_red_fused_eq_mul_squeeze_sum_2( + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_0, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_1, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_2, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_3, + .param .u64 triton_red_fused_eq_mul_squeeze_sum_2_param_4, + .param .u32 triton_red_fused_eq_mul_squeeze_sum_2_param_5, + .param .u32 triton_red_fused_eq_mul_squeeze_sum_2_param_6, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_7, + .param .u64 .ptr .global .align 1 triton_red_fused_eq_mul_squeeze_sum_2_param_8 +) +.reqntid 512 +{ + .reg .pred %p<37>; + .reg .b32 %r<77>; + .reg .b64 %rd<187>; + .loc 1 18 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:18:0 + +// %bb.0: + ld.param.b32 %r13, [triton_red_fused_eq_mul_squeeze_sum_2_param_6]; + ld.param.b64 %rd39, [triton_red_fused_eq_mul_squeeze_sum_2_param_3]; +$L__tmp0: + .loc 1 22 28 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:22:28 + mov.u32 %r14, %ctaid.x; + .loc 1 25 37 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:25:37 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 511; + .loc 1 35 55 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:55 + cvt.u64.u32 %rd1, %r14; + .loc 1 29 40 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:29:40 + setp.lt.s32 %p5, %r13, 1; + mov.b64 %rd186, 0; + cvt.u32.u64 %r75, %rd1; + @%p5 bra $L__BB0_16; +// %bb.1: // %.lr.ph.preheader + .loc 1 0 40 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:0:40 + ld.param.b64 %rd40, [triton_red_fused_eq_mul_squeeze_sum_2_param_4]; + ld.param.b64 %rd38, [triton_red_fused_eq_mul_squeeze_sum_2_param_2]; + ld.param.b64 %rd37, [triton_red_fused_eq_mul_squeeze_sum_2_param_1]; + ld.param.b64 %rd36, [triton_red_fused_eq_mul_squeeze_sum_2_param_0]; + mul.lo.s64 %rd42, %rd40, %rd1; + shl.b64 %rd2, %rd42, 2; + .loc 1 24 21 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:24:21 + setp.lt.u32 %p1, %r75, 2; + mov.b64 %rd4, 8; + mov.b64 %rd178, 0; + mov.b32 %r76, 0; + mov.b64 %rd179, %rd178; + mov.b64 %rd180, %rd178; + mov.b64 %rd181, %rd178; + bra.uni $L__BB0_2; +$L__BB0_13: // in Loop: Header=BB0_2 Depth=1 + .loc 1 35 62 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:62 + div.s64 %rd185, %rd17, %rd40; +$L__BB0_14: // in Loop: Header=BB0_2 Depth=1 + .loc 1 31 29 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:31:29 + setp.lt.s32 %p22, %r8, %r13; + setp.lt.s32 %p23, %r9, %r13; + setp.lt.s32 %p24, %r10, %r13; + setp.lt.s32 %p25, %r11, %r13; + .loc 1 35 84 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:84 + rem.s64 %rd103, %rd10, %rd40; + rem.s64 %rd104, %rd182, %rd4; + rem.s64 %rd105, %rd11, %rd40; + rem.s64 %rd106, %rd183, %rd4; + rem.s64 %rd107, %rd12, %rd40; + rem.s64 %rd108, %rd184, %rd4; + rem.s64 %rd109, %rd13, %rd40; + rem.s64 %rd110, %rd185, %rd4; + .loc 1 35 77 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:77 + mad.lo.s64 %rd111, %rd104, %rd40, %rd103; + mad.lo.s64 %rd112, %rd106, %rd40, %rd105; + mad.lo.s64 %rd113, %rd108, %rd40, %rd107; + mad.lo.s64 %rd114, %rd110, %rd40, %rd109; + .loc 1 35 34 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:34 + shl.b64 %rd115, %rd111, 3; + add.s64 %rd57, %rd36, %rd115; + shl.b64 %rd116, %rd112, 3; + add.s64 %rd61, %rd36, %rd116; + shl.b64 %rd117, %rd113, 3; + add.s64 %rd65, %rd36, %rd117; + shl.b64 %rd118, %rd114, 3; + add.s64 %rd69, %rd36, %rd118; + .loc 1 35 102 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:102 + and.pred %p17, %p1, %p25; + and.pred %p16, %p1, %p24; + and.pred %p15, %p1, %p23; + and.pred %p14, %p1, %p22; + .loc 1 35 92 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:92 + // begin inline asm + mov.u64 %rd56, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd56 }, [ %rd57 + 0 ], %rd47; + // end inline asm + // begin inline asm + mov.u64 %rd59, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd59, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd60, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd60 }, [ %rd61 + 0 ], %rd59; + // end inline asm + // begin inline asm + mov.u64 %rd63, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd63, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd64, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd64 }, [ %rd65 + 0 ], %rd63; + // end inline asm + // begin inline asm + mov.u64 %rd67, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd67, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd68, 0x0; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd68 }, [ %rd69 + 0 ], %rd67; + // end inline asm + .loc 1 36 34 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:36:34 + add.s64 %rd73, %rd37, %rd115; + add.s64 %rd77, %rd37, %rd116; + add.s64 %rd81, %rd37, %rd117; + add.s64 %rd85, %rd37, %rd118; + .loc 1 36 92 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:36:92 + // begin inline asm + mov.u64 %rd71, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd71, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd72, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd72 }, [ %rd73 + 0 ], %rd71; + // end inline asm + // begin inline asm + mov.u64 %rd75, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd75, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd76, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd76 }, [ %rd77 + 0 ], %rd75; + // end inline asm + // begin inline asm + mov.u64 %rd79, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd79, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd80, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd80 }, [ %rd81 + 0 ], %rd79; + // end inline asm + // begin inline asm + mov.u64 %rd83, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd83, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd84, 0x0; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd84 }, [ %rd85 + 0 ], %rd83; + // end inline asm + .loc 1 37 34 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:37:34 + add.s64 %rd89, %rd38, %rd115; + add.s64 %rd93, %rd38, %rd116; + add.s64 %rd97, %rd38, %rd117; + add.s64 %rd101, %rd38, %rd118; + .loc 1 37 92 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:37:92 + // begin inline asm + mov.u64 %rd87, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd87, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd88, 0x0; + @%p14 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd88 }, [ %rd89 + 0 ], %rd87; + // end inline asm + // begin inline asm + mov.u64 %rd91, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd91, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd92, 0x0; + @%p15 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd92 }, [ %rd93 + 0 ], %rd91; + // end inline asm + // begin inline asm + mov.u64 %rd95, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd95, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd96, 0x0; + @%p16 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd96 }, [ %rd97 + 0 ], %rd95; + // end inline asm + // begin inline asm + mov.u64 %rd99, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd99, 1.0; + // end inline asm + // begin inline asm + mov.u64 %rd100, 0x0; + @%p17 ld.global.L1::evict_last.L2::cache_hint.b64 { %rd100 }, [ %rd101 + 0 ], %rd99; + // end inline asm + .loc 1 38 23 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:38:23 + setp.eq.b64 %p26, %rd56, %rd72; + setp.eq.b64 %p27, %rd60, %rd76; + setp.eq.b64 %p28, %rd64, %rd80; + setp.eq.b64 %p29, %rd68, %rd84; + .loc 1 43 48 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:43:48 + selp.b64 %rd119, %rd100, 0, %p29; + selp.b64 %rd120, %rd119, 0, %p17; + selp.b64 %rd121, %rd96, 0, %p28; + selp.b64 %rd122, %rd121, 0, %p16; + selp.b64 %rd123, %rd92, 0, %p27; + selp.b64 %rd124, %rd123, 0, %p15; + selp.b64 %rd125, %rd88, 0, %p26; + selp.b64 %rd126, %rd125, 0, %p14; + add.s64 %rd178, %rd126, %rd178; + add.s64 %rd179, %rd124, %rd179; + add.s64 %rd180, %rd122, %rd180; + add.s64 %rd181, %rd120, %rd181; + .loc 1 29 40 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:29:40 + add.s32 %r76, %r76, 2048; + setp.lt.s32 %p30, %r76, %r13; + @%p30 bra $L__BB0_2; + bra.uni $L__BB0_15; +$L__BB0_2: // %.lr.ph + // =>This Inner Loop Header: Depth=1 + .loc 1 35 92 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:92 + add.s32 %r17, %r1, %r76; + // begin inline asm + mov.u64 %rd47, 0x0; + createpolicy.fractional.L2::evict_last.b64 %rd47, 1.0; + // end inline asm + .loc 1 30 31 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:30:31 + add.s32 %r8, %r2, %r76; + or.b32 %r9, %r17, 512; + .loc 1 35 49 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:49 + cvt.s64.s32 %rd10, %r8; + cvt.s64.s32 %rd11, %r9; + add.s64 %rd14, %rd2, %rd10; + add.s64 %rd15, %rd2, %rd11; + .loc 1 35 62 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:62 + or.b64 %rd48, %rd14, %rd40; + and.b64 %rd49, %rd48, -4294967296; + setp.ne.b64 %p6, %rd49, 0; + @%p6 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: // in Loop: Header=BB0_2 Depth=1 + div.s64 %rd182, %rd14, %rd40; + bra.uni $L__BB0_5; +$L__BB0_3: // in Loop: Header=BB0_2 Depth=1 + cvt.u32.u64 %r18, %rd40; + cvt.u32.u64 %r19, %rd14; + div.u32 %r20, %r19, %r18; + cvt.u64.u32 %rd182, %r20; +$L__BB0_5: // in Loop: Header=BB0_2 Depth=1 + .loc 1 0 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:0 + add.s32 %r10, %r8, 1024; + .loc 1 35 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35 + cvt.s64.s32 %rd12, %r10; + add.s64 %rd16, %rd2, %rd12; + .loc 1 35 62 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:62 + or.b64 %rd50, %rd15, %rd40; + and.b64 %rd51, %rd50, -4294967296; + setp.ne.b64 %p7, %rd51, 0; + @%p7 bra $L__BB0_7; + bra.uni $L__BB0_6; +$L__BB0_7: // in Loop: Header=BB0_2 Depth=1 + div.s64 %rd183, %rd15, %rd40; + bra.uni $L__BB0_8; +$L__BB0_6: // in Loop: Header=BB0_2 Depth=1 + cvt.u32.u64 %r21, %rd40; + cvt.u32.u64 %r22, %rd15; + div.u32 %r23, %r22, %r21; + cvt.u64.u32 %rd183, %r23; +$L__BB0_8: // in Loop: Header=BB0_2 Depth=1 + .loc 1 0 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:0 + or.b32 %r11, %r17, 1536; + .loc 1 35 0 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35 + cvt.s64.s32 %rd13, %r11; + add.s64 %rd17, %rd2, %rd13; + .loc 1 35 62 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:35:62 + or.b64 %rd52, %rd16, %rd40; + and.b64 %rd53, %rd52, -4294967296; + setp.ne.b64 %p8, %rd53, 0; + @%p8 bra $L__BB0_10; + bra.uni $L__BB0_9; +$L__BB0_10: // in Loop: Header=BB0_2 Depth=1 + div.s64 %rd184, %rd16, %rd40; + bra.uni $L__BB0_11; +$L__BB0_9: // in Loop: Header=BB0_2 Depth=1 + cvt.u32.u64 %r24, %rd40; + cvt.u32.u64 %r25, %rd16; + div.u32 %r26, %r25, %r24; + cvt.u64.u32 %rd184, %r26; +$L__BB0_11: // in Loop: Header=BB0_2 Depth=1 + or.b64 %rd54, %rd17, %rd40; + and.b64 %rd55, %rd54, -4294967296; + setp.ne.b64 %p9, %rd55, 0; + @%p9 bra $L__BB0_13; +// %bb.12: // in Loop: Header=BB0_2 Depth=1 + cvt.u32.u64 %r27, %rd40; + cvt.u32.u64 %r28, %rd17; + div.u32 %r29, %r28, %r27; + cvt.u64.u32 %rd185, %r29; + bra.uni $L__BB0_14; +$L__BB0_15: // %._crit_edge.loopexit + .loc 1 25 37 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:25:37 + add.s64 %rd127, %rd178, %rd180; + add.s64 %rd128, %rd179, %rd181; + add.s64 %rd186, %rd127, %rd128; +$L__BB0_16: // %._crit_edge + .loc 1 24 21 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:24:21 + setp.lt.u32 %p35, %r75, 2; + .loc 1 25 37 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:25:37 + and.b32 %r34, %r1, 31; +$L__tmp1: + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r35}, %rd186; + cvt.u32.u64 %r36, %rd186; + shfl.sync.bfly.b32 %r37, %r36, 16, 31, -1; + shfl.sync.bfly.b32 %r38, %r35, 16, 31, -1; + cvt.u64.u32 %rd134, %r37; + cvt.u64.u32 %rd135, %r38; + shl.b64 %rd136, %rd135, 32; + or.b64 %rd137, %rd134, %rd136; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd138, %rd186, %rd137; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r39}, %rd138; + cvt.u32.u64 %r40, %rd138; + shfl.sync.bfly.b32 %r41, %r40, 8, 31, -1; + shfl.sync.bfly.b32 %r42, %r39, 8, 31, -1; + cvt.u64.u32 %rd139, %r41; + cvt.u64.u32 %rd140, %r42; + shl.b64 %rd141, %rd140, 32; + or.b64 %rd142, %rd139, %rd141; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd143, %rd138, %rd142; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r43}, %rd143; + cvt.u32.u64 %r44, %rd143; + shfl.sync.bfly.b32 %r45, %r44, 4, 31, -1; + shfl.sync.bfly.b32 %r46, %r43, 4, 31, -1; + cvt.u64.u32 %rd144, %r45; + cvt.u64.u32 %rd145, %r46; + shl.b64 %rd146, %rd145, 32; + or.b64 %rd147, %rd144, %rd146; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd148, %rd143, %rd147; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r47}, %rd148; + cvt.u32.u64 %r48, %rd148; + shfl.sync.bfly.b32 %r49, %r48, 2, 31, -1; + shfl.sync.bfly.b32 %r50, %r47, 2, 31, -1; + cvt.u64.u32 %rd149, %r49; + cvt.u64.u32 %rd150, %r50; + shl.b64 %rd151, %rd150, 32; + or.b64 %rd152, %rd149, %rd151; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd153, %rd148, %rd152; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r51}, %rd153; + cvt.u32.u64 %r52, %rd153; + shfl.sync.bfly.b32 %r53, %r52, 1, 31, -1; + shfl.sync.bfly.b32 %r54, %r51, 1, 31, -1; + cvt.u64.u32 %rd154, %r53; + cvt.u64.u32 %rd155, %r54; + shl.b64 %rd156, %rd155, 32; + or.b64 %rd157, %rd154, %rd156; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd129, %rd153, %rd157; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + setp.eq.b32 %p31, %r34, 0; + shr.u32 %r55, %r1, 2; + and.b32 %r56, %r55, 120; + mov.b32 %r57, global_smem; + add.s32 %r30, %r57, %r56; + // begin inline asm + @%p31 st.shared.b64 [ %r30 + 0 ], %rd129; + // end inline asm + bar.sync 0; + setp.lt.u32 %p32, %r1, 16; + shl.b32 %r58, %r1, 3; + add.s32 %r31, %r57, %r58; + // begin inline asm + @%p32 ld.shared.b64 %rd130, [ %r31 + 0 ]; + // end inline asm + mov.b64 {_, %r59}, %rd130; + cvt.u32.u64 %r60, %rd130; + shfl.sync.bfly.b32 %r61, %r60, 8, 31, -1; + shfl.sync.bfly.b32 %r62, %r59, 8, 31, -1; + cvt.u64.u32 %rd158, %r61; + cvt.u64.u32 %rd159, %r62; + shl.b64 %rd160, %rd159, 32; + or.b64 %rd161, %rd158, %rd160; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd162, %rd130, %rd161; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r63}, %rd162; + cvt.u32.u64 %r64, %rd162; + shfl.sync.bfly.b32 %r65, %r64, 4, 31, -1; + shfl.sync.bfly.b32 %r66, %r63, 4, 31, -1; + cvt.u64.u32 %rd163, %r65; + cvt.u64.u32 %rd164, %r66; + shl.b64 %rd165, %rd164, 32; + or.b64 %rd166, %rd163, %rd165; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd167, %rd162, %rd166; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r67}, %rd167; + cvt.u32.u64 %r68, %rd167; + shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1; + shfl.sync.bfly.b32 %r70, %r67, 2, 31, -1; + cvt.u64.u32 %rd168, %r69; + cvt.u64.u32 %rd169, %r70; + shl.b64 %rd170, %rd169, 32; + or.b64 %rd171, %rd168, %rd170; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd172, %rd167, %rd171; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + mov.b64 {_, %r71}, %rd172; + cvt.u32.u64 %r72, %rd172; + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; + shfl.sync.bfly.b32 %r74, %r71, 1, 31, -1; + cvt.u64.u32 %rd173, %r73; + cvt.u64.u32 %rd174, %r74; + shl.b64 %rd175, %rd174, 32; + or.b64 %rd176, %rd173, %rd175; + .loc 2 261 15 // standard.py:261:15 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + add.s64 %rd131, %rd172, %rd176; + .loc 2 291 36 // standard.py:291:36 @[ cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:44:25 ] + setp.eq.b32 %p33, %r1, 0; + // begin inline asm + @%p33 st.shared.b64 [ %r31 + 0 ], %rd131; + // end inline asm + bar.sync 0; + ld.shared.b64 %rd132, [global_smem]; +$L__tmp2: + .loc 1 45 25 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:45:25 + shl.b64 %rd177, %rd1, 3; + add.s64 %rd133, %rd39, %rd177; + .loc 1 45 36 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:45:36 + setp.eq.b32 %p36, %r2, 0; + and.pred %p34, %p35, %p36; + // begin inline asm + @%p34 st.global.b64 [ %rd133 + 0 ], { %rd132 }; + // end inline asm + .loc 1 45 4 // cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py:45:4 + ret; +$L__tmp3: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 222 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0xd7 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 100 +.b8 50 +.b8 105 +.b8 116 +.b8 107 +.b8 116 +.b8 113 +.b8 111 +.b8 114 +.b8 120 +.b8 51 +.b8 108 +.b8 53 +.b8 111 +.b8 100 +.b8 54 +.b8 118 +.b8 113 +.b8 122 +.b8 122 +.b8 51 +.b8 109 +.b8 55 +.b8 51 +.b8 113 +.b8 113 +.b8 111 +.b8 106 +.b8 103 +.b8 110 +.b8 110 +.b8 108 +.b8 103 +.b8 122 +.b8 108 +.b8 52 +.b8 55 +.b8 101 +.b8 104 +.b8 104 +.b8 112 +.b8 120 +.b8 116 +.b8 117 +.b8 99 +.b8 108 +.b8 107 +.b8 105 +.b8 54 +.b8 52 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 100 +.b8 50 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x28 DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 114 +.b8 101 +.b8 100 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 109 +.b8 117 +.b8 108 +.b8 95 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0xb3:0x2e DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0xc8:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 44 // DW_AT_call_line +.b8 25 // DW_AT_call_column +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.source new file mode 100644 index 0000000000000000000000000000000000000000..c7b3b4cb70098ff1b17d197fbdb2622869b6a12f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.source @@ -0,0 +1,282 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":18:0) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc61 = loc(unknown) +#loc64 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc68 = loc("in_ptr0"(#loc)) +#loc69 = loc("in_ptr1"(#loc)) +#loc70 = loc("in_ptr2"(#loc)) +#loc71 = loc("out_ptr0"(#loc)) +#loc72 = loc("ks0"(#loc)) +#loc73 = loc("xnumel"(#loc)) +#loc74 = loc("r0_numel"(#loc)) +#loc129 = loc("input"(#loc59)) +#loc130 = loc("a"(#loc64)) +#loc131 = loc("b"(#loc64)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 2 : i32 loc(#loc75) + %xoffset = tt.get_program_id x : i32 loc(#loc76) + %xoffset_1 = arith.constant 1 : i32 loc(#loc77) + %xoffset_2 = arith.constant 1 : i32 loc(#loc77) + %xoffset_3 = arith.muli %xoffset, %xoffset_2 : i32 loc(#loc77) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc78) + %xindex_4 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc79) + %xindex_5 = tt.splat %xoffset_3 : i32 -> tensor<1x1xi32> loc(#loc80) + %xindex_6 = arith.addi %xindex_5, %xindex_4 : tensor<1x1xi32> loc(#loc80) + %xmask = arith.constant dense<2> : tensor<1x1xi32> loc(#loc81) + %xmask_7 = arith.cmpi slt, %xindex_6, %xmask : tensor<1x1xi32> loc(#loc81) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc82) + %r0_base_8 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc83) + %_tmp7 = arith.constant 0 : i64 loc(#loc84) + %_tmp7_9 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc84) + %c0_i32 = arith.constant 0 : i32 loc(#loc11) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc11) + %0 = arith.bitcast %c0_i32 : i32 to i32 loc(#loc11) + %1 = arith.bitcast %r0_numel : i32 to i32 loc(#loc11) + %2 = arith.bitcast %c2048_i32 : i32 to i32 loc(#loc11) + %3 = ub.poison : i32 loc(#loc11) + %_tmp7_10 = scf.for %r0_offset = %0 to %1 step %2 iter_args(%_tmp7_12 = %_tmp7_9) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc86) + %r0_index_13 = arith.addi %r0_index, %r0_base_8 : tensor<1x2048xi32> loc(#loc86) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc87) + %r0_mask_14 = arith.cmpi slt, %r0_index_13, %r0_mask : tensor<1x2048xi32> loc(#loc87) + %tmp0 = arith.constant 4 : i32 loc(#loc88) + %tmp0_15 = arith.constant 4 : i64 loc(#loc88) + %tmp0_16 = arith.muli %tmp0_15, %ks0 : i64 loc(#loc88) + %tmp0_17 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc89) + %tmp0_18 = tt.splat %tmp0_16 : i64 -> tensor<1x1xi64> loc(#loc89) + %tmp0_19 = arith.muli %tmp0_18, %tmp0_17 : tensor<1x1xi64> loc(#loc89) + %tmp0_20 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc90) + %tmp0_21 = tt.broadcast %tmp0_19 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc90) + %tmp0_22 = arith.addi %tmp0_20, %tmp0_21 : tensor<1x2048xi64> loc(#loc90) + %tmp0_23 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc91) + %tmp0_24 = arith.divsi %tmp0_22, %tmp0_23 : tensor<1x2048xi64> loc(#loc91) + %tmp0_25 = arith.constant 8 : i32 loc(#loc92) + %tmp0_26 = arith.constant 8 : i64 loc(#loc92) + %tmp0_27 = arith.constant dense<8> : tensor<1x2048xi64> loc(#loc92) + %tmp0_28 = arith.remsi %tmp0_24, %tmp0_27 : tensor<1x2048xi64> loc(#loc92) + %tmp0_29 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc93) + %tmp0_30 = arith.muli %tmp0_29, %tmp0_28 : tensor<1x2048xi64> loc(#loc93) + %tmp0_31 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc94) + %tmp0_32 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc94) + %tmp0_33 = arith.remsi %tmp0_31, %tmp0_32 : tensor<1x2048xi64> loc(#loc94) + %tmp0_34 = arith.addi %tmp0_30, %tmp0_33 : tensor<1x2048xi64> loc(#loc95) + %tmp0_35 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc96) + %tmp0_36 = tt.addptr %tmp0_35, %tmp0_34 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc96) + %tmp0_37 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc97) + %tmp0_38 = arith.andi %r0_mask_14, %tmp0_37 : tensor<1x2048xi1> loc(#loc97) + %tmp0_39 = arith.constant 0.000000e+00 : f32 loc(#loc98) + %tmp0_40 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc98) + %tmp0_41 = arith.fptosi %tmp0_40 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc98) + %tmp0_42 = tt.load %tmp0_36, %tmp0_38, %tmp0_41 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc98) + %tmp1 = arith.constant 4 : i32 loc(#loc99) + %tmp1_43 = arith.constant 4 : i64 loc(#loc99) + %tmp1_44 = arith.muli %tmp1_43, %ks0 : i64 loc(#loc99) + %tmp1_45 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc100) + %tmp1_46 = tt.splat %tmp1_44 : i64 -> tensor<1x1xi64> loc(#loc100) + %tmp1_47 = arith.muli %tmp1_46, %tmp1_45 : tensor<1x1xi64> loc(#loc100) + %tmp1_48 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc101) + %tmp1_49 = tt.broadcast %tmp1_47 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc101) + %tmp1_50 = arith.addi %tmp1_48, %tmp1_49 : tensor<1x2048xi64> loc(#loc101) + %tmp1_51 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc102) + %tmp1_52 = arith.divsi %tmp1_50, %tmp1_51 : tensor<1x2048xi64> loc(#loc102) + %tmp1_53 = arith.constant 8 : i32 loc(#loc103) + %tmp1_54 = arith.constant 8 : i64 loc(#loc103) + %tmp1_55 = arith.constant dense<8> : tensor<1x2048xi64> loc(#loc103) + %tmp1_56 = arith.remsi %tmp1_52, %tmp1_55 : tensor<1x2048xi64> loc(#loc103) + %tmp1_57 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc104) + %tmp1_58 = arith.muli %tmp1_57, %tmp1_56 : tensor<1x2048xi64> loc(#loc104) + %tmp1_59 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc105) + %tmp1_60 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc105) + %tmp1_61 = arith.remsi %tmp1_59, %tmp1_60 : tensor<1x2048xi64> loc(#loc105) + %tmp1_62 = arith.addi %tmp1_58, %tmp1_61 : tensor<1x2048xi64> loc(#loc106) + %tmp1_63 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc107) + %tmp1_64 = tt.addptr %tmp1_63, %tmp1_62 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc107) + %tmp1_65 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc108) + %tmp1_66 = arith.andi %r0_mask_14, %tmp1_65 : tensor<1x2048xi1> loc(#loc108) + %tmp1_67 = arith.constant 0.000000e+00 : f32 loc(#loc109) + %tmp1_68 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc109) + %tmp1_69 = arith.fptosi %tmp1_68 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc109) + %tmp1_70 = tt.load %tmp1_64, %tmp1_66, %tmp1_69 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc109) + %tmp4 = arith.constant 4 : i32 loc(#loc110) + %tmp4_71 = arith.constant 4 : i64 loc(#loc110) + %tmp4_72 = arith.muli %tmp4_71, %ks0 : i64 loc(#loc110) + %tmp4_73 = arith.extsi %xindex_6 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc111) + %tmp4_74 = tt.splat %tmp4_72 : i64 -> tensor<1x1xi64> loc(#loc111) + %tmp4_75 = arith.muli %tmp4_74, %tmp4_73 : tensor<1x1xi64> loc(#loc111) + %tmp4_76 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc112) + %tmp4_77 = tt.broadcast %tmp4_75 : tensor<1x1xi64> -> tensor<1x2048xi64> loc(#loc112) + %tmp4_78 = arith.addi %tmp4_76, %tmp4_77 : tensor<1x2048xi64> loc(#loc112) + %tmp4_79 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc113) + %tmp4_80 = arith.divsi %tmp4_78, %tmp4_79 : tensor<1x2048xi64> loc(#loc113) + %tmp4_81 = arith.constant 8 : i32 loc(#loc114) + %tmp4_82 = arith.constant 8 : i64 loc(#loc114) + %tmp4_83 = arith.constant dense<8> : tensor<1x2048xi64> loc(#loc114) + %tmp4_84 = arith.remsi %tmp4_80, %tmp4_83 : tensor<1x2048xi64> loc(#loc114) + %tmp4_85 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc115) + %tmp4_86 = arith.muli %tmp4_85, %tmp4_84 : tensor<1x2048xi64> loc(#loc115) + %tmp4_87 = arith.extsi %r0_index_13 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc116) + %tmp4_88 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc116) + %tmp4_89 = arith.remsi %tmp4_87, %tmp4_88 : tensor<1x2048xi64> loc(#loc116) + %tmp4_90 = arith.addi %tmp4_86, %tmp4_89 : tensor<1x2048xi64> loc(#loc117) + %tmp4_91 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc118) + %tmp4_92 = tt.addptr %tmp4_91, %tmp4_90 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc118) + %tmp4_93 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc119) + %tmp4_94 = arith.andi %r0_mask_14, %tmp4_93 : tensor<1x2048xi1> loc(#loc119) + %tmp4_95 = arith.constant 0.000000e+00 : f32 loc(#loc120) + %tmp4_96 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32> loc(#loc120) + %tmp4_97 = arith.fptosi %tmp4_96 : tensor<1x2048xf32> to tensor<1x2048xi64> loc(#loc120) + %tmp4_98 = tt.load %tmp4_92, %tmp4_94, %tmp4_97 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc120) + %tmp2 = arith.cmpi eq, %tmp0_42, %tmp1_70 : tensor<1x2048xi64> loc(#loc121) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc122) + %tmp5 = arith.muli %tmp3, %tmp4_98 : tensor<1x2048xi64> loc(#loc123) + %tmp8 = arith.addi %_tmp7_12, %tmp5 : tensor<1x2048xi64> loc(#loc124) + %_tmp7_99 = tt.broadcast %xmask_7 : tensor<1x1xi1> -> tensor<1x2048xi1> loc(#loc125) + %_tmp7_100 = arith.andi %r0_mask_14, %_tmp7_99 : tensor<1x2048xi1> loc(#loc125) + %_tmp7_101 = arith.select %_tmp7_100, %tmp8, %_tmp7_12 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc126) + scf.yield %_tmp7_101 : tensor<1x2048xi64> loc(#loc53) + } loc(#loc85) + %tmp7 = tt.call @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%_tmp7_10) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc127) + %tmp7_11 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc128) + %4 = tt.splat %out_ptr0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc56) + %5 = tt.addptr %4, %xindex_6 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc56) + tt.store %5, %tmp7_11, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc57) + tt.return loc(#loc58) + } loc(#loc) + tt.func private @"triton.language.standard.sum__i64S1_2048S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2048xi64> loc("input"(#loc59))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc60) + tt.reduce.return %2 : i64 loc(#loc60) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc60) + tt.return %0 : tensor<1xi64> loc(#loc62) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc63) + tt.return %1 : tensor<1xi64> loc(#loc63) + } loc(#loc59) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc64)), %b: i64 loc("b"(#loc64))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc65) + tt.return %0 : i64 loc(#loc66) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc67) + tt.return %1 : i64 loc(#loc67) + } loc(#loc64) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":22:33) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":23:36) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":23:44) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":23:23) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":24:21) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":25:27) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":25:37) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":28:43) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":29:40) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":30:31) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":31:29) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:51) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:55) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:49) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:62) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:69) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:40) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:84) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:77) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:102) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:92) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:51) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:55) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:49) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:62) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:69) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:40) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:84) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:77) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:34) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:102) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:92) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:51) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:55) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:49) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:62) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:69) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:40) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:84) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:77) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:34) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:102) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:92) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":38:23) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":39:23) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":40:22) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":42:23) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:35) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:48) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:8) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:25) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:28) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:25) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:36) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:4) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc65 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc66 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc67 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc75 = loc("xnumel"(#loc1)) +#loc76 = loc("xoffset"(#loc2)) +#loc77 = loc("xoffset"(#loc3)) +#loc78 = loc("xindex"(#loc4)) +#loc79 = loc("xindex"(#loc5)) +#loc80 = loc("xindex"(#loc6)) +#loc81 = loc("xmask"(#loc7)) +#loc82 = loc("r0_base"(#loc8)) +#loc83 = loc("r0_base"(#loc9)) +#loc84 = loc("_tmp7"(#loc10)) +#loc85 = loc("_tmp7"(#loc11)) +#loc86 = loc("r0_index"(#loc12)) +#loc87 = loc("r0_mask"(#loc13)) +#loc88 = loc("tmp0"(#loc14)) +#loc89 = loc("tmp0"(#loc15)) +#loc90 = loc("tmp0"(#loc16)) +#loc91 = loc("tmp0"(#loc17)) +#loc92 = loc("tmp0"(#loc18)) +#loc93 = loc("tmp0"(#loc19)) +#loc94 = loc("tmp0"(#loc20)) +#loc95 = loc("tmp0"(#loc21)) +#loc96 = loc("tmp0"(#loc22)) +#loc97 = loc("tmp0"(#loc23)) +#loc98 = loc("tmp0"(#loc24)) +#loc99 = loc("tmp1"(#loc25)) +#loc100 = loc("tmp1"(#loc26)) +#loc101 = loc("tmp1"(#loc27)) +#loc102 = loc("tmp1"(#loc28)) +#loc103 = loc("tmp1"(#loc29)) +#loc104 = loc("tmp1"(#loc30)) +#loc105 = loc("tmp1"(#loc31)) +#loc106 = loc("tmp1"(#loc32)) +#loc107 = loc("tmp1"(#loc33)) +#loc108 = loc("tmp1"(#loc34)) +#loc109 = loc("tmp1"(#loc35)) +#loc110 = loc("tmp4"(#loc36)) +#loc111 = loc("tmp4"(#loc37)) +#loc112 = loc("tmp4"(#loc38)) +#loc113 = loc("tmp4"(#loc39)) +#loc114 = loc("tmp4"(#loc40)) +#loc115 = loc("tmp4"(#loc41)) +#loc116 = loc("tmp4"(#loc42)) +#loc117 = loc("tmp4"(#loc43)) +#loc118 = loc("tmp4"(#loc44)) +#loc119 = loc("tmp4"(#loc45)) +#loc120 = loc("tmp4"(#loc46)) +#loc121 = loc("tmp2"(#loc47)) +#loc122 = loc("tmp3"(#loc48)) +#loc123 = loc("tmp5"(#loc49)) +#loc124 = loc("tmp8"(#loc50)) +#loc125 = loc("_tmp7"(#loc51)) +#loc126 = loc("_tmp7"(#loc52)) +#loc127 = loc("tmp7"(#loc54)) +#loc128 = loc("tmp7"(#loc55)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8f2047e9184d5c2bff8a66a9a2f4940049abb0f7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttgir @@ -0,0 +1,137 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 16], order = [0, 1]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":18:0) +#loc1 = loc(unknown) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:25) +#loc36 = loc("in_ptr0"(#loc)) +#loc37 = loc("in_ptr1"(#loc)) +#loc38 = loc("in_ptr2"(#loc)) +#loc39 = loc("out_ptr0"(#loc)) +#loc40 = loc("ks0"(#loc)) +#loc41 = loc("xnumel"(#loc)) +#loc42 = loc("r0_numel"(#loc)) +#loc69 = loc("tmp7"(#loc30)) +#loc74 = loc(callsite(#loc1 at #loc69)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<8> : tensor<1x2048xi64, #blocked> loc(#loc1) + %c0_i32 = arith.constant 0 : i32 loc(#loc1) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc1) + %c2_i32 = arith.constant 2 : i32 loc(#loc1) + %c4_i64 = arith.constant 4 : i64 loc(#loc1) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64, #blocked> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc43) + %xmask = arith.cmpi slt, %xoffset, %c2_i32 : i32 loc(#loc44) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc45) + %r0_base_1 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x2048xi32, #blocked> loc(#loc45) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32, #blocked> loc(#loc46) + %tmp0 = arith.muli %ks0, %c4_i64 : i64 loc(#loc47) + %tmp0_2 = arith.extsi %xoffset : i32 to i64 loc(#loc48) + %tmp0_3 = arith.muli %tmp0, %tmp0_2 : i64 loc(#loc48) + %tmp0_4 = tt.splat %tmp0_3 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc71) + %tmp0_5 = tt.splat %ks0 : i64 -> tensor<1x2048xi64, #blocked> loc(#loc50) + %tmp0_6 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc51) + %tmp0_7 = tt.splat %xmask : i1 -> tensor<1x2048xi1, #blocked> loc(#loc72) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc53) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr, #blocked> loc(#loc54) + %_tmp7 = scf.for %_tmp7_9 = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%arg8 = %cst_0) -> (tensor<1x2048xi64, #blocked>) : i32 { + %r0_index = tt.splat %_tmp7_9 : i32 -> tensor<1x2048xi32, #blocked> loc(#loc56) + %r0_index_10 = arith.addi %r0_index, %r0_base_1 : tensor<1x2048xi32, #blocked> loc(#loc56) + %r0_mask_11 = arith.cmpi slt, %r0_index_10, %r0_mask : tensor<1x2048xi32, #blocked> loc(#loc46) + %tmp0_12 = arith.extsi %r0_index_10 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc49) + %tmp0_13 = arith.addi %tmp0_12, %tmp0_4 : tensor<1x2048xi64, #blocked> loc(#loc49) + %tmp0_14 = arith.divsi %tmp0_13, %tmp0_5 : tensor<1x2048xi64, #blocked> loc(#loc50) + %tmp0_15 = arith.remsi %tmp0_14, %cst : tensor<1x2048xi64, #blocked> loc(#loc57) + %tmp0_16 = arith.muli %tmp0_5, %tmp0_15 : tensor<1x2048xi64, #blocked> loc(#loc58) + %tmp0_17 = arith.remsi %tmp0_12, %tmp0_5 : tensor<1x2048xi64, #blocked> loc(#loc59) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<1x2048xi64, #blocked> loc(#loc60) + %tmp0_19 = tt.addptr %tmp0_6, %tmp0_18 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc51) + %tmp0_20 = arith.andi %r0_mask_11, %tmp0_7 : tensor<1x2048xi1, #blocked> loc(#loc52) + %tmp0_21 = tt.load %tmp0_19, %tmp0_20, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc61) + %tmp1_22 = tt.addptr %tmp1, %tmp0_18 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc53) + %tmp1_23 = tt.load %tmp1_22, %tmp0_20, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc62) + %tmp4_24 = tt.addptr %tmp4, %tmp0_18 : tensor<1x2048x!tt.ptr, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc54) + %tmp4_25 = tt.load %tmp4_24, %tmp0_20, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr, #blocked> loc(#loc63) + %tmp2 = arith.cmpi eq, %tmp0_21, %tmp1_23 : tensor<1x2048xi64, #blocked> loc(#loc64) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked> loc(#loc65) + %tmp5 = arith.muli %tmp3, %tmp4_25 : tensor<1x2048xi64, #blocked> loc(#loc66) + %tmp8 = arith.addi %arg8, %tmp5 : tensor<1x2048xi64, #blocked> loc(#loc67) + %_tmp7_26 = arith.select %tmp0_20, %tmp8, %arg8 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked> loc(#loc68) + scf.yield %_tmp7_26 : tensor<1x2048xi64, #blocked> loc(#loc28) + } loc(#loc55) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_9: i64 loc(callsite(#loc1 at #loc69)), %tmp7_10: i64 loc(callsite(#loc1 at #loc69))): + %tmp7_11 = arith.addi %tmp7_9, %tmp7_10 : i64 loc(#loc75) + tt.reduce.return %tmp7_11 : i64 loc(#loc73) + }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc73) + %tmp7_8 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc70) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc33) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc34) + %2 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc34) + tt.store %1, %tmp7_8, %2 : tensor<1x1x!tt.ptr, #blocked> loc(#loc34) + tt.return loc(#loc35) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":22:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":24:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":25:37) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":31:29) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:51) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:55) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:49) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:62) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:34) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:102) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:34) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:34) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":29:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":30:31) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:69) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:40) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:84) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:77) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:92) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:92) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:92) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":38:23) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":39:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":40:22) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":42:23) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:48) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:8) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:28) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:25) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:36) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:4) +#loc43 = loc("xoffset"(#loc2)) +#loc44 = loc("xmask"(#loc3)) +#loc45 = loc("r0_base"(#loc4)) +#loc46 = loc("r0_mask"(#loc5)) +#loc47 = loc("tmp0"(#loc6)) +#loc48 = loc("tmp0"(#loc7)) +#loc49 = loc("tmp0"(#loc8)) +#loc50 = loc("tmp0"(#loc9)) +#loc51 = loc("tmp0"(#loc10)) +#loc52 = loc("tmp0"(#loc11)) +#loc53 = loc("tmp1"(#loc12)) +#loc54 = loc("tmp4"(#loc13)) +#loc55 = loc("_tmp7"(#loc14)) +#loc56 = loc("r0_index"(#loc15)) +#loc57 = loc("tmp0"(#loc16)) +#loc58 = loc("tmp0"(#loc17)) +#loc59 = loc("tmp0"(#loc18)) +#loc60 = loc("tmp0"(#loc19)) +#loc61 = loc("tmp0"(#loc20)) +#loc62 = loc("tmp1"(#loc21)) +#loc63 = loc("tmp4"(#loc22)) +#loc64 = loc("tmp2"(#loc23)) +#loc65 = loc("tmp3"(#loc24)) +#loc66 = loc("tmp5"(#loc25)) +#loc67 = loc("tmp8"(#loc26)) +#loc68 = loc("_tmp7"(#loc27)) +#loc70 = loc("tmp7"(#loc32)) +#loc71 = loc(fused[#loc49, #loc48]) +#loc72 = loc(fused[#loc52, #loc44]) +#loc73 = loc(callsite(#loc29 at #loc69)) +#loc75 = loc(callsite(#loc31 at #loc73)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fa10867b1afc55be2226e6f70b1e97fde3dfe3a7 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/K6AHUKXY3DG2PXK4IJUBF54N3VWT4GZ62CFMGCGQUPTTKOJPUFHA/triton_red_fused_eq_mul_squeeze_sum_2.ttir @@ -0,0 +1,138 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":18:0) +#loc3 = loc(unknown) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:25) +#loc37 = loc("in_ptr0"(#loc)) +#loc38 = loc("in_ptr1"(#loc)) +#loc39 = loc("in_ptr2"(#loc)) +#loc40 = loc("out_ptr0"(#loc)) +#loc41 = loc("ks0"(#loc)) +#loc42 = loc("xnumel"(#loc)) +#loc43 = loc("r0_numel"(#loc)) +#loc71 = loc("tmp7"(#loc31)) +#loc76 = loc(callsite(#loc3 at #loc71)) +module { + tt.func public @triton_red_fused_eq_mul_squeeze_sum_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %in_ptr1: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr1"(#loc)), %in_ptr2: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr2"(#loc)), %out_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr0"(#loc)), %ks0: i64 loc("ks0"(#loc)), %xnumel: i32 loc("xnumel"(#loc)), %r0_numel: i32 loc("r0_numel"(#loc))) attributes {noinline = false} { + %xmask = arith.constant 2 : i32 loc(#loc44) + %c2048_i32 = arith.constant 2048 : i32 loc(#loc2) + %c0_i32 = arith.constant 0 : i32 loc(#loc2) + %cst = arith.constant dense<8> : tensor<1x2048xi64> loc(#loc3) + %c4_i64 = arith.constant 4 : i64 loc(#loc3) + %cst_0 = arith.constant dense<0> : tensor<1x2048xi64> loc(#loc3) + %xoffset = tt.get_program_id x : i32 loc(#loc45) + %xmask_1 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc44) + %xmask_2 = tt.splat %xmask_1 : i1 -> tensor<1x1xi1> loc(#loc44) + %r0_base = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32> loc(#loc46) + %r0_base_3 = tt.expand_dims %r0_base {axis = 0 : i32} : tensor<2048xi32> -> tensor<1x2048xi32> loc(#loc47) + %_tmp7 = scf.for %r0_offset = %c0_i32 to %r0_numel step %c2048_i32 iter_args(%_tmp7_5 = %cst_0) -> (tensor<1x2048xi64>) : i32 { + %r0_index = tt.splat %r0_offset : i32 -> tensor<1x2048xi32> loc(#loc49) + %r0_index_6 = arith.addi %r0_index, %r0_base_3 : tensor<1x2048xi32> loc(#loc49) + %r0_mask = tt.splat %r0_numel : i32 -> tensor<1x2048xi32> loc(#loc50) + %r0_mask_7 = arith.cmpi slt, %r0_index_6, %r0_mask : tensor<1x2048xi32> loc(#loc50) + %tmp0 = arith.muli %ks0, %c4_i64 : i64 loc(#loc51) + %tmp0_8 = arith.extsi %xoffset : i32 to i64 loc(#loc52) + %tmp0_9 = arith.muli %tmp0, %tmp0_8 : i64 loc(#loc52) + %tmp0_10 = arith.extsi %r0_index_6 : tensor<1x2048xi32> to tensor<1x2048xi64> loc(#loc53) + %tmp0_11 = tt.splat %tmp0_9 : i64 -> tensor<1x2048xi64> loc(#loc73) + %tmp0_12 = arith.addi %tmp0_10, %tmp0_11 : tensor<1x2048xi64> loc(#loc53) + %tmp0_13 = tt.splat %ks0 : i64 -> tensor<1x2048xi64> loc(#loc54) + %tmp0_14 = arith.divsi %tmp0_12, %tmp0_13 : tensor<1x2048xi64> loc(#loc54) + %tmp0_15 = arith.remsi %tmp0_14, %cst : tensor<1x2048xi64> loc(#loc55) + %tmp0_16 = arith.muli %tmp0_13, %tmp0_15 : tensor<1x2048xi64> loc(#loc56) + %tmp0_17 = arith.remsi %tmp0_10, %tmp0_13 : tensor<1x2048xi64> loc(#loc57) + %tmp0_18 = arith.addi %tmp0_16, %tmp0_17 : tensor<1x2048xi64> loc(#loc58) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc59) + %tmp0_20 = tt.addptr %tmp0_19, %tmp0_18 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc59) + %tmp0_21 = tt.splat %xmask_1 : i1 -> tensor<1x2048xi1> loc(#loc74) + %tmp0_22 = arith.andi %r0_mask_7, %tmp0_21 : tensor<1x2048xi1> loc(#loc60) + %tmp0_23 = tt.load %tmp0_20, %tmp0_22, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc61) + %tmp1 = tt.splat %in_ptr1 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc62) + %tmp1_24 = tt.addptr %tmp1, %tmp0_18 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc62) + %tmp1_25 = tt.load %tmp1_24, %tmp0_22, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc63) + %tmp4 = tt.splat %in_ptr2 : !tt.ptr -> tensor<1x2048x!tt.ptr> loc(#loc64) + %tmp4_26 = tt.addptr %tmp4, %tmp0_18 : tensor<1x2048x!tt.ptr>, tensor<1x2048xi64> loc(#loc64) + %tmp4_27 = tt.load %tmp4_26, %tmp0_22, %cst_0 evictionPolicy = evict_last : tensor<1x2048x!tt.ptr> loc(#loc65) + %tmp2 = arith.cmpi eq, %tmp0_23, %tmp1_25 : tensor<1x2048xi64> loc(#loc66) + %tmp3 = arith.extui %tmp2 : tensor<1x2048xi1> to tensor<1x2048xi64> loc(#loc67) + %tmp5 = arith.muli %tmp3, %tmp4_27 : tensor<1x2048xi64> loc(#loc68) + %tmp8 = arith.addi %_tmp7_5, %tmp5 : tensor<1x2048xi64> loc(#loc69) + %_tmp7_28 = arith.select %tmp0_22, %tmp8, %_tmp7_5 : tensor<1x2048xi1>, tensor<1x2048xi64> loc(#loc70) + scf.yield %_tmp7_28 : tensor<1x2048xi64> loc(#loc29) + } loc(#loc48) + %tmp7 = "tt.reduce"(%_tmp7) <{axis = 1 : i32}> ({ + ^bb0(%tmp7_5: i64 loc(callsite(#loc3 at #loc71)), %tmp7_6: i64 loc(callsite(#loc3 at #loc71))): + %tmp7_7 = arith.addi %tmp7_5, %tmp7_6 : i64 loc(#loc77) + tt.reduce.return %tmp7_7 : i64 loc(#loc75) + }) : (tensor<1x2048xi64>) -> tensor<1xi64> loc(#loc75) + %tmp7_4 = tt.expand_dims %tmp7 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc72) + %0 = tt.addptr %out_ptr0, %xoffset : !tt.ptr, i32 loc(#loc34) + %1 = tt.splat %0 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc34) + tt.store %1, %tmp7_4, %xmask_2 : tensor<1x1x!tt.ptr> loc(#loc35) + tt.return loc(#loc36) + } loc(#loc) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":24:21) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":29:40) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":22:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":25:27) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":25:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":30:31) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":31:29) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:51) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:55) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:49) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:62) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:69) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:40) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:84) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:77) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:34) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:102) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":35:92) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:34) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":36:92) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:34) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":37:92) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":38:23) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":39:23) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":40:22) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":42:23) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:48) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":43:8) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":44:28) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:25) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:36) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/d2/cd2itktqorx3l5od6vqzz3m73qqojgnnlgzl47ehhpxtuclki64i.py":45:4) +#loc44 = loc("xmask"(#loc1)) +#loc45 = loc("xoffset"(#loc4)) +#loc46 = loc("r0_base"(#loc5)) +#loc47 = loc("r0_base"(#loc6)) +#loc48 = loc("_tmp7"(#loc2)) +#loc49 = loc("r0_index"(#loc7)) +#loc50 = loc("r0_mask"(#loc8)) +#loc51 = loc("tmp0"(#loc9)) +#loc52 = loc("tmp0"(#loc10)) +#loc53 = loc("tmp0"(#loc11)) +#loc54 = loc("tmp0"(#loc12)) +#loc55 = loc("tmp0"(#loc13)) +#loc56 = loc("tmp0"(#loc14)) +#loc57 = loc("tmp0"(#loc15)) +#loc58 = loc("tmp0"(#loc16)) +#loc59 = loc("tmp0"(#loc17)) +#loc60 = loc("tmp0"(#loc18)) +#loc61 = loc("tmp0"(#loc19)) +#loc62 = loc("tmp1"(#loc20)) +#loc63 = loc("tmp1"(#loc21)) +#loc64 = loc("tmp4"(#loc22)) +#loc65 = loc("tmp4"(#loc23)) +#loc66 = loc("tmp2"(#loc24)) +#loc67 = loc("tmp3"(#loc25)) +#loc68 = loc("tmp5"(#loc26)) +#loc69 = loc("tmp8"(#loc27)) +#loc70 = loc("_tmp7"(#loc28)) +#loc72 = loc("tmp7"(#loc33)) +#loc73 = loc(fused[#loc53, #loc52]) +#loc74 = loc(fused[#loc60, #loc44]) +#loc75 = loc(callsite(#loc30 at #loc71)) +#loc77 = loc(callsite(#loc32 at #loc75)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json new file mode 100644 index 0000000000000000000000000000000000000000..afd185d0a80a0e06c74d6e24f08449ab890d444e --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/__grp__triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json @@ -0,0 +1 @@ +{"child_paths": {"triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin", "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json": "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.json"}} \ No newline at end of file diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin new file mode 100644 index 0000000000000000000000000000000000000000..93b0f87cb1833bfc9c3c3d334d9adf43c787da0d Binary files /dev/null and b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.cubin differ diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir new file mode 100644 index 0000000000000000000000000000000000000000..87d53d74dd0c08672a456306acbced6c9ad0d3aa --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.llir @@ -0,0 +1,840 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target datalayout = "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" + +@assertFunc_1 = internal constant [8 x i8] c"unknown\00" +@assertFile_1 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py\00" +@assertMessage_1 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp49 < 17\00" +@assertFunc_0 = internal constant [8 x i8] c"unknown\00" +@assertFile_0 = internal constant [114 x i8] c"/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py\00" +@assertMessage_0 = internal constant [37 x i8] c"index out of bounds: 0 <= tmp40 < 17\00" + +; Function Attrs: noreturn +declare !dbg !5 void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr #0 + +define ptx_kernel void @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8, ptr addrspace(1) readnone captures(none) %9, ptr addrspace(1) readnone captures(none) %10) local_unnamed_addr #1 !dbg !9 { + %12 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !10 + %13 = icmp samesign ult i32 %12, 32, !dbg !11 + %14 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12 + %15 = and i32 %14, 15, !dbg !12 + %16 = shl i32 %12, 4, !dbg !13 + %17 = or disjoint i32 %15, %16, !dbg !14 + %18 = sext i32 %17 to i64, !dbg !15 + %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 %13) #5, !dbg !16 + %22 = add i64 %20, -1, !dbg !17 + %23 = icmp ult i64 %22, 16383, !dbg !17 + %24 = add i64 %21, -1, !dbg !17 + %25 = icmp ult i64 %24, 16383, !dbg !17 + %26 = zext i1 %23 to i32, !dbg !18 + %27 = lshr i32 %14, 1, !dbg !19 + %.lobit = and i32 %27, 1, !dbg !19 + %28 = and i32 %14, 1, !dbg !19 + %29 = lshr i32 %14, 2, !dbg !19 + %.lobit1 = and i32 %29, 1, !dbg !19 + %30 = lshr i32 %14, 3, !dbg !19 + %.lobit2 = and i32 %30, 1, !dbg !19 + %31 = xor i32 %28, 1, !dbg !23 + %32 = xor i32 %.lobit, 1, !dbg !23 + %33 = xor i32 %.lobit1, 1, !dbg !23 + %34 = xor i32 %.lobit2, 1, !dbg !23 + %35 = select i1 %23, i32 %31, i32 0, !dbg !24 + %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 1, i32 31), !dbg !25 + %37 = add i32 %35, %36, !dbg !28 + %38 = select i1 %23, i32 %28, i32 0, !dbg !29 + %39 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %38, i32 1, i32 31), !dbg !25 + %40 = add i32 %38, %39, !dbg !28 + %41 = mul nuw nsw i32 %31, %15, !dbg !30 + %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !25 + %43 = add i32 %42, %41, !dbg !28 + %44 = mul nuw nsw i32 %15, %28, !dbg !31 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 1, i32 31), !dbg !25 + %46 = add i32 %45, %44, !dbg !28 + %47 = icmp slt i32 %37, %40, !dbg !32 + %48 = icmp eq i32 %37, %40, !dbg !33 + %49 = icmp sgt i32 %43, %46, !dbg !34 + %50 = and i1 %48, %49, !dbg !35 + %51 = or i1 %47, %50, !dbg !36 + %52 = trunc i32 %27 to i1, !dbg !37 + %53 = xor i1 %51, %52, !dbg !37 + %54 = xor i32 %37, %40, !dbg !38 + %55 = select i1 %53, i32 %54, i32 0, !dbg !39 + %56 = xor i32 %55, %26, !dbg !40 + %57 = xor i32 %46, %43, !dbg !41 + %58 = select i1 %53, i32 %57, i32 0, !dbg !42 + %59 = xor i32 %58, %15, !dbg !43 + %60 = mul nuw nsw i32 %56, %32, !dbg !24 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !25 + %62 = add i32 %60, %61, !dbg !28 + %63 = mul nuw nsw i32 %56, %.lobit, !dbg !29 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 2, i32 31), !dbg !25 + %65 = add i32 %63, %64, !dbg !28 + %66 = mul nuw nsw i32 %59, %32, !dbg !30 + %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 2, i32 31), !dbg !25 + %68 = add i32 %66, %67, !dbg !28 + %69 = mul nuw nsw i32 %59, %.lobit, !dbg !31 + %70 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %69, i32 2, i32 31), !dbg !25 + %71 = add i32 %69, %70, !dbg !28 + %72 = trunc i32 %29 to i1, !dbg !37 + %73 = icmp sge i32 %62, %65, !dbg !37 + %74 = icmp ne i32 %62, %65, !dbg !37 + %75 = icmp sle i32 %68, %71, !dbg !37 + %76 = or i1 %74, %75, !dbg !37 + %77 = and i1 %73, %76, !dbg !37 + %.not = xor i1 %77, %72, !dbg !37 + %78 = xor i32 %62, %65, !dbg !38 + %79 = select i1 %.not, i32 0, i32 %78, !dbg !39 + %80 = xor i32 %79, %56, !dbg !40 + %81 = xor i32 %68, %71, !dbg !41 + %82 = select i1 %.not, i32 0, i32 %81, !dbg !42 + %83 = xor i32 %82, %59, !dbg !43 + %84 = mul nuw nsw i32 %80, %31, !dbg !24 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 1, i32 31), !dbg !25 + %86 = add i32 %84, %85, !dbg !28 + %87 = mul nuw nsw i32 %80, %28, !dbg !29 + %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 1, i32 31), !dbg !25 + %89 = add i32 %87, %88, !dbg !28 + %90 = mul nuw nsw i32 %83, %31, !dbg !30 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 1, i32 31), !dbg !25 + %92 = add i32 %90, %91, !dbg !28 + %93 = mul nuw nsw i32 %83, %28, !dbg !31 + %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 1, i32 31), !dbg !25 + %95 = add i32 %93, %94, !dbg !28 + %96 = icmp sge i32 %86, %89, !dbg !37 + %97 = icmp ne i32 %86, %89, !dbg !37 + %98 = icmp sle i32 %92, %95, !dbg !37 + %99 = or i1 %97, %98, !dbg !37 + %100 = and i1 %96, %99, !dbg !37 + %.not3 = xor i1 %100, %72, !dbg !37 + %101 = xor i32 %86, %89, !dbg !38 + %102 = select i1 %.not3, i32 0, i32 %101, !dbg !39 + %103 = xor i32 %102, %80, !dbg !40 + %104 = xor i32 %92, %95, !dbg !41 + %105 = select i1 %.not3, i32 0, i32 %104, !dbg !42 + %106 = xor i32 %105, %83, !dbg !43 + %107 = mul nuw nsw i32 %103, %33, !dbg !24 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 4, i32 31), !dbg !25 + %109 = add i32 %107, %108, !dbg !28 + %110 = mul nuw nsw i32 %103, %.lobit1, !dbg !29 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 4, i32 31), !dbg !25 + %112 = add i32 %110, %111, !dbg !28 + %113 = mul nuw nsw i32 %106, %33, !dbg !30 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 4, i32 31), !dbg !25 + %115 = add i32 %113, %114, !dbg !28 + %116 = mul nuw nsw i32 %106, %.lobit1, !dbg !31 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !25 + %118 = add i32 %116, %117, !dbg !28 + %119 = trunc i32 %30 to i1, !dbg !37 + %120 = icmp sge i32 %109, %112, !dbg !37 + %121 = icmp ne i32 %109, %112, !dbg !37 + %122 = icmp sle i32 %115, %118, !dbg !37 + %123 = or i1 %121, %122, !dbg !37 + %124 = and i1 %120, %123, !dbg !37 + %.not4 = xor i1 %124, %119, !dbg !37 + %125 = xor i32 %109, %112, !dbg !38 + %126 = select i1 %.not4, i32 0, i32 %125, !dbg !39 + %127 = xor i32 %126, %103, !dbg !40 + %128 = xor i32 %115, %118, !dbg !41 + %129 = select i1 %.not4, i32 0, i32 %128, !dbg !42 + %130 = xor i32 %129, %106, !dbg !43 + %131 = mul nuw nsw i32 %127, %32, !dbg !24 + %132 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %131, i32 2, i32 31), !dbg !25 + %133 = add i32 %131, %132, !dbg !28 + %134 = mul nuw nsw i32 %127, %.lobit, !dbg !29 + %135 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %134, i32 2, i32 31), !dbg !25 + %136 = add i32 %134, %135, !dbg !28 + %137 = mul nuw nsw i32 %130, %32, !dbg !30 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 2, i32 31), !dbg !25 + %139 = add i32 %137, %138, !dbg !28 + %140 = mul nuw nsw i32 %130, %.lobit, !dbg !31 + %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !25 + %142 = add i32 %140, %141, !dbg !28 + %143 = icmp sge i32 %133, %136, !dbg !37 + %144 = icmp ne i32 %133, %136, !dbg !37 + %145 = icmp sle i32 %139, %142, !dbg !37 + %146 = or i1 %144, %145, !dbg !37 + %147 = and i1 %143, %146, !dbg !37 + %.not5 = xor i1 %147, %119, !dbg !37 + %148 = xor i32 %133, %136, !dbg !38 + %149 = select i1 %.not5, i32 0, i32 %148, !dbg !39 + %150 = xor i32 %149, %127, !dbg !40 + %151 = xor i32 %139, %142, !dbg !41 + %152 = select i1 %.not5, i32 0, i32 %151, !dbg !42 + %153 = xor i32 %152, %130, !dbg !43 + %154 = mul nuw nsw i32 %150, %31, !dbg !24 + %155 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %154, i32 1, i32 31), !dbg !25 + %156 = add i32 %154, %155, !dbg !28 + %157 = mul nuw nsw i32 %150, %28, !dbg !29 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !25 + %159 = add i32 %157, %158, !dbg !28 + %160 = mul nuw nsw i32 %153, %31, !dbg !30 + %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 1, i32 31), !dbg !25 + %162 = add i32 %160, %161, !dbg !28 + %163 = mul nuw nsw i32 %153, %28, !dbg !31 + %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 1, i32 31), !dbg !25 + %165 = add i32 %163, %164, !dbg !28 + %166 = icmp sge i32 %156, %159, !dbg !37 + %167 = icmp ne i32 %156, %159, !dbg !37 + %168 = icmp sle i32 %162, %165, !dbg !37 + %169 = or i1 %167, %168, !dbg !37 + %170 = and i1 %166, %169, !dbg !37 + %.not6 = xor i1 %170, %119, !dbg !37 + %171 = xor i32 %156, %159, !dbg !38 + %172 = select i1 %.not6, i32 0, i32 %171, !dbg !39 + %173 = xor i32 %172, %150, !dbg !40 + %174 = xor i32 %162, %165, !dbg !41 + %175 = select i1 %.not6, i32 0, i32 %174, !dbg !42 + %176 = xor i32 %175, %153, !dbg !43 + %177 = mul nuw nsw i32 %173, %34, !dbg !24 + %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 8, i32 31), !dbg !25 + %179 = add i32 %177, %178, !dbg !28 + %180 = mul nuw nsw i32 %173, %.lobit2, !dbg !29 + %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !25 + %182 = add i32 %180, %181, !dbg !28 + %183 = mul nuw nsw i32 %176, %34, !dbg !30 + %184 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %183, i32 8, i32 31), !dbg !25 + %185 = add i32 %183, %184, !dbg !28 + %186 = mul nuw nsw i32 %176, %.lobit2, !dbg !31 + %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 8, i32 31), !dbg !25 + %188 = add i32 %186, %187, !dbg !28 + %189 = icmp slt i32 %179, %182, !dbg !32 + %190 = icmp eq i32 %179, %182, !dbg !33 + %191 = icmp sgt i32 %185, %188, !dbg !34 + %192 = and i1 %190, %191, !dbg !35 + %193 = or i1 %189, %192, !dbg !36 + %194 = xor i32 %179, %182, !dbg !38 + %195 = select i1 %193, i32 %194, i32 0, !dbg !39 + %196 = xor i32 %195, %173, !dbg !40 + %197 = xor i32 %185, %188, !dbg !41 + %198 = select i1 %193, i32 %197, i32 0, !dbg !42 + %199 = xor i32 %198, %176, !dbg !43 + %200 = mul nuw nsw i32 %196, %33, !dbg !24 + %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !25 + %202 = add i32 %200, %201, !dbg !28 + %203 = mul nuw nsw i32 %196, %.lobit1, !dbg !29 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 4, i32 31), !dbg !25 + %205 = add i32 %203, %204, !dbg !28 + %206 = mul nuw nsw i32 %199, %33, !dbg !30 + %207 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %206, i32 4, i32 31), !dbg !25 + %208 = add i32 %206, %207, !dbg !28 + %209 = mul nuw nsw i32 %199, %.lobit1, !dbg !31 + %210 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %209, i32 4, i32 31), !dbg !25 + %211 = add i32 %209, %210, !dbg !28 + %212 = icmp slt i32 %202, %205, !dbg !32 + %213 = icmp eq i32 %202, %205, !dbg !33 + %214 = icmp sgt i32 %208, %211, !dbg !34 + %215 = and i1 %213, %214, !dbg !35 + %216 = or i1 %212, %215, !dbg !36 + %217 = xor i32 %202, %205, !dbg !38 + %218 = select i1 %216, i32 %217, i32 0, !dbg !39 + %219 = xor i32 %218, %196, !dbg !40 + %220 = xor i32 %208, %211, !dbg !41 + %221 = select i1 %216, i32 %220, i32 0, !dbg !42 + %222 = xor i32 %221, %199, !dbg !43 + %223 = mul nuw nsw i32 %219, %32, !dbg !24 + %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 2, i32 31), !dbg !25 + %225 = add i32 %223, %224, !dbg !28 + %226 = mul nuw nsw i32 %219, %.lobit, !dbg !29 + %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 2, i32 31), !dbg !25 + %228 = add i32 %226, %227, !dbg !28 + %229 = mul nuw nsw i32 %222, %32, !dbg !30 + %230 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %229, i32 2, i32 31), !dbg !25 + %231 = add i32 %229, %230, !dbg !28 + %232 = mul nuw nsw i32 %222, %.lobit, !dbg !31 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 2, i32 31), !dbg !25 + %234 = add i32 %232, %233, !dbg !28 + %235 = icmp slt i32 %225, %228, !dbg !32 + %236 = icmp eq i32 %225, %228, !dbg !33 + %237 = icmp sgt i32 %231, %234, !dbg !34 + %238 = and i1 %236, %237, !dbg !35 + %239 = or i1 %235, %238, !dbg !36 + %240 = xor i32 %225, %228, !dbg !38 + %241 = select i1 %239, i32 %240, i32 0, !dbg !39 + %242 = xor i32 %241, %219, !dbg !40 + %243 = xor i32 %231, %234, !dbg !41 + %244 = select i1 %239, i32 %243, i32 0, !dbg !42 + %245 = xor i32 %244, %222, !dbg !43 + %246 = mul nuw nsw i32 %242, %31, !dbg !24 + %247 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %246, i32 1, i32 31), !dbg !25 + %248 = add i32 %246, %247, !dbg !28 + %249 = mul nuw nsw i32 %242, %28, !dbg !29 + %250 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %249, i32 1, i32 31), !dbg !25 + %251 = add i32 %249, %250, !dbg !28 + %252 = mul nuw nsw i32 %245, %31, !dbg !30 + %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !25 + %254 = add i32 %252, %253, !dbg !28 + %255 = mul nuw nsw i32 %245, %28, !dbg !31 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !25 + %257 = add i32 %255, %256, !dbg !28 + %258 = icmp slt i32 %248, %251, !dbg !32 + %259 = icmp eq i32 %248, %251, !dbg !33 + %260 = icmp sgt i32 %254, %257, !dbg !34 + %261 = and i1 %259, %260, !dbg !35 + %262 = or i1 %258, %261, !dbg !36 + %263 = xor i32 %254, %257, !dbg !41 + %264 = select i1 %262, i32 %263, i32 0, !dbg !42 + %265 = xor i32 %264, %245, !dbg !43 + %266 = icmp eq i64 %20, 16384, !dbg !44 + %267 = icmp eq i64 %21, 16384, !dbg !44 + %268 = zext i1 %266 to i32, !dbg !18 + %269 = select i1 %266, i32 %31, i32 0, !dbg !45 + %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 1, i32 31), !dbg !47 + %271 = add i32 %270, %269, !dbg !48 + %272 = select i1 %266, i32 %28, i32 0, !dbg !49 + %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !47 + %274 = add i32 %273, %272, !dbg !48 + %275 = icmp slt i32 %271, %274, !dbg !50 + %276 = icmp eq i32 %271, %274, !dbg !51 + %277 = and i1 %49, %276, !dbg !52 + %278 = or i1 %275, %277, !dbg !53 + %279 = xor i1 %278, %52, !dbg !54 + %280 = xor i32 %274, %271, !dbg !55 + %281 = select i1 %279, i32 %280, i32 0, !dbg !56 + %282 = xor i32 %281, %268, !dbg !57 + %283 = select i1 %279, i32 %57, i32 0, !dbg !58 + %284 = xor i32 %283, %15, !dbg !59 + %285 = mul nuw nsw i32 %282, %32, !dbg !45 + %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 2, i32 31), !dbg !47 + %287 = add i32 %285, %286, !dbg !48 + %288 = mul nuw nsw i32 %282, %.lobit, !dbg !49 + %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !47 + %290 = add i32 %288, %289, !dbg !48 + %291 = mul nuw nsw i32 %284, %32, !dbg !60 + %292 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %291, i32 2, i32 31), !dbg !47 + %293 = add i32 %291, %292, !dbg !48 + %294 = mul nuw nsw i32 %284, %.lobit, !dbg !61 + %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !47 + %296 = add i32 %294, %295, !dbg !48 + %297 = icmp sge i32 %287, %290, !dbg !54 + %298 = icmp ne i32 %287, %290, !dbg !54 + %299 = icmp sle i32 %293, %296, !dbg !54 + %300 = or i1 %298, %299, !dbg !54 + %301 = and i1 %297, %300, !dbg !54 + %.not8 = xor i1 %301, %72, !dbg !54 + %302 = xor i32 %287, %290, !dbg !55 + %303 = select i1 %.not8, i32 0, i32 %302, !dbg !56 + %304 = xor i32 %303, %282, !dbg !57 + %305 = xor i32 %293, %296, !dbg !62 + %306 = select i1 %.not8, i32 0, i32 %305, !dbg !58 + %307 = xor i32 %306, %284, !dbg !59 + %308 = mul nuw nsw i32 %304, %31, !dbg !45 + %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 1, i32 31), !dbg !47 + %310 = add i32 %308, %309, !dbg !48 + %311 = mul nuw nsw i32 %304, %28, !dbg !49 + %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 1, i32 31), !dbg !47 + %313 = add i32 %311, %312, !dbg !48 + %314 = mul nuw nsw i32 %307, %31, !dbg !60 + %315 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %314, i32 1, i32 31), !dbg !47 + %316 = add i32 %314, %315, !dbg !48 + %317 = mul nuw nsw i32 %307, %28, !dbg !61 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 1, i32 31), !dbg !47 + %319 = add i32 %317, %318, !dbg !48 + %320 = icmp sge i32 %310, %313, !dbg !54 + %321 = icmp ne i32 %310, %313, !dbg !54 + %322 = icmp sle i32 %316, %319, !dbg !54 + %323 = or i1 %321, %322, !dbg !54 + %324 = and i1 %320, %323, !dbg !54 + %.not9 = xor i1 %324, %72, !dbg !54 + %325 = xor i32 %310, %313, !dbg !55 + %326 = select i1 %.not9, i32 0, i32 %325, !dbg !56 + %327 = xor i32 %326, %304, !dbg !57 + %328 = xor i32 %316, %319, !dbg !62 + %329 = select i1 %.not9, i32 0, i32 %328, !dbg !58 + %330 = xor i32 %329, %307, !dbg !59 + %331 = mul nuw nsw i32 %327, %33, !dbg !45 + %332 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %331, i32 4, i32 31), !dbg !47 + %333 = add i32 %331, %332, !dbg !48 + %334 = mul nuw nsw i32 %327, %.lobit1, !dbg !49 + %335 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %334, i32 4, i32 31), !dbg !47 + %336 = add i32 %334, %335, !dbg !48 + %337 = mul nuw nsw i32 %330, %33, !dbg !60 + %338 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %337, i32 4, i32 31), !dbg !47 + %339 = add i32 %337, %338, !dbg !48 + %340 = mul nuw nsw i32 %330, %.lobit1, !dbg !61 + %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 4, i32 31), !dbg !47 + %342 = add i32 %340, %341, !dbg !48 + %343 = icmp sge i32 %333, %336, !dbg !54 + %344 = icmp ne i32 %333, %336, !dbg !54 + %345 = icmp sle i32 %339, %342, !dbg !54 + %346 = or i1 %344, %345, !dbg !54 + %347 = and i1 %343, %346, !dbg !54 + %.not10 = xor i1 %347, %119, !dbg !54 + %348 = xor i32 %333, %336, !dbg !55 + %349 = select i1 %.not10, i32 0, i32 %348, !dbg !56 + %350 = xor i32 %349, %327, !dbg !57 + %351 = xor i32 %339, %342, !dbg !62 + %352 = select i1 %.not10, i32 0, i32 %351, !dbg !58 + %353 = xor i32 %352, %330, !dbg !59 + %354 = mul nuw nsw i32 %350, %32, !dbg !45 + %355 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %354, i32 2, i32 31), !dbg !47 + %356 = add i32 %354, %355, !dbg !48 + %357 = mul nuw nsw i32 %350, %.lobit, !dbg !49 + %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !47 + %359 = add i32 %357, %358, !dbg !48 + %360 = mul nuw nsw i32 %353, %32, !dbg !60 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 2, i32 31), !dbg !47 + %362 = add i32 %360, %361, !dbg !48 + %363 = mul nuw nsw i32 %353, %.lobit, !dbg !61 + %364 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %363, i32 2, i32 31), !dbg !47 + %365 = add i32 %363, %364, !dbg !48 + %366 = icmp sge i32 %356, %359, !dbg !54 + %367 = icmp ne i32 %356, %359, !dbg !54 + %368 = icmp sle i32 %362, %365, !dbg !54 + %369 = or i1 %367, %368, !dbg !54 + %370 = and i1 %366, %369, !dbg !54 + %.not11 = xor i1 %370, %119, !dbg !54 + %371 = xor i32 %356, %359, !dbg !55 + %372 = select i1 %.not11, i32 0, i32 %371, !dbg !56 + %373 = xor i32 %372, %350, !dbg !57 + %374 = xor i32 %362, %365, !dbg !62 + %375 = select i1 %.not11, i32 0, i32 %374, !dbg !58 + %376 = xor i32 %375, %353, !dbg !59 + %377 = mul nuw nsw i32 %373, %31, !dbg !45 + %378 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %377, i32 1, i32 31), !dbg !47 + %379 = add i32 %377, %378, !dbg !48 + %380 = mul nuw nsw i32 %373, %28, !dbg !49 + %381 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %380, i32 1, i32 31), !dbg !47 + %382 = add i32 %380, %381, !dbg !48 + %383 = mul nuw nsw i32 %376, %31, !dbg !60 + %384 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %383, i32 1, i32 31), !dbg !47 + %385 = add i32 %383, %384, !dbg !48 + %386 = mul nuw nsw i32 %376, %28, !dbg !61 + %387 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %386, i32 1, i32 31), !dbg !47 + %388 = add i32 %386, %387, !dbg !48 + %389 = icmp sge i32 %379, %382, !dbg !54 + %390 = icmp ne i32 %379, %382, !dbg !54 + %391 = icmp sle i32 %385, %388, !dbg !54 + %392 = or i1 %390, %391, !dbg !54 + %393 = and i1 %389, %392, !dbg !54 + %.not12 = xor i1 %393, %119, !dbg !54 + %394 = xor i32 %379, %382, !dbg !55 + %395 = select i1 %.not12, i32 0, i32 %394, !dbg !56 + %396 = xor i32 %395, %373, !dbg !57 + %397 = xor i32 %385, %388, !dbg !62 + %398 = select i1 %.not12, i32 0, i32 %397, !dbg !58 + %399 = xor i32 %398, %376, !dbg !59 + %400 = mul nuw nsw i32 %396, %34, !dbg !45 + %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 8, i32 31), !dbg !47 + %402 = add i32 %400, %401, !dbg !48 + %403 = mul nuw nsw i32 %396, %.lobit2, !dbg !49 + %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 8, i32 31), !dbg !47 + %405 = add i32 %403, %404, !dbg !48 + %406 = mul nuw nsw i32 %399, %34, !dbg !60 + %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 8, i32 31), !dbg !47 + %408 = add i32 %406, %407, !dbg !48 + %409 = mul nuw nsw i32 %399, %.lobit2, !dbg !61 + %410 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %409, i32 8, i32 31), !dbg !47 + %411 = add i32 %409, %410, !dbg !48 + %412 = icmp slt i32 %402, %405, !dbg !50 + %413 = icmp eq i32 %402, %405, !dbg !51 + %414 = icmp sgt i32 %408, %411, !dbg !63 + %415 = and i1 %413, %414, !dbg !52 + %416 = or i1 %412, %415, !dbg !53 + %417 = xor i32 %402, %405, !dbg !55 + %418 = select i1 %416, i32 %417, i32 0, !dbg !56 + %419 = xor i32 %418, %396, !dbg !57 + %420 = xor i32 %408, %411, !dbg !62 + %421 = select i1 %416, i32 %420, i32 0, !dbg !58 + %422 = xor i32 %421, %399, !dbg !59 + %423 = mul nuw nsw i32 %419, %33, !dbg !45 + %424 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %423, i32 4, i32 31), !dbg !47 + %425 = add i32 %423, %424, !dbg !48 + %426 = mul nuw nsw i32 %419, %.lobit1, !dbg !49 + %427 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %426, i32 4, i32 31), !dbg !47 + %428 = add i32 %426, %427, !dbg !48 + %429 = mul nuw nsw i32 %422, %33, !dbg !60 + %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 4, i32 31), !dbg !47 + %431 = add i32 %429, %430, !dbg !48 + %432 = mul nuw nsw i32 %422, %.lobit1, !dbg !61 + %433 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %432, i32 4, i32 31), !dbg !47 + %434 = add i32 %432, %433, !dbg !48 + %435 = icmp slt i32 %425, %428, !dbg !50 + %436 = icmp eq i32 %425, %428, !dbg !51 + %437 = icmp sgt i32 %431, %434, !dbg !63 + %438 = and i1 %436, %437, !dbg !52 + %439 = or i1 %435, %438, !dbg !53 + %440 = xor i32 %425, %428, !dbg !55 + %441 = select i1 %439, i32 %440, i32 0, !dbg !56 + %442 = xor i32 %441, %419, !dbg !57 + %443 = xor i32 %431, %434, !dbg !62 + %444 = select i1 %439, i32 %443, i32 0, !dbg !58 + %445 = xor i32 %444, %422, !dbg !59 + %446 = mul nuw nsw i32 %442, %32, !dbg !45 + %447 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %446, i32 2, i32 31), !dbg !47 + %448 = add i32 %446, %447, !dbg !48 + %449 = mul nuw nsw i32 %442, %.lobit, !dbg !49 + %450 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %449, i32 2, i32 31), !dbg !47 + %451 = add i32 %449, %450, !dbg !48 + %452 = mul nuw nsw i32 %445, %32, !dbg !60 + %453 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %452, i32 2, i32 31), !dbg !47 + %454 = add i32 %452, %453, !dbg !48 + %455 = mul nuw nsw i32 %445, %.lobit, !dbg !61 + %456 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %455, i32 2, i32 31), !dbg !47 + %457 = add i32 %455, %456, !dbg !48 + %458 = icmp slt i32 %448, %451, !dbg !50 + %459 = icmp eq i32 %448, %451, !dbg !51 + %460 = icmp sgt i32 %454, %457, !dbg !63 + %461 = and i1 %459, %460, !dbg !52 + %462 = or i1 %458, %461, !dbg !53 + %463 = xor i32 %448, %451, !dbg !55 + %464 = select i1 %462, i32 %463, i32 0, !dbg !56 + %465 = xor i32 %464, %442, !dbg !57 + %466 = xor i32 %454, %457, !dbg !62 + %467 = select i1 %462, i32 %466, i32 0, !dbg !58 + %468 = xor i32 %467, %445, !dbg !59 + %469 = mul nuw nsw i32 %465, %31, !dbg !45 + %470 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %469, i32 1, i32 31), !dbg !47 + %471 = add i32 %469, %470, !dbg !48 + %472 = mul nuw nsw i32 %465, %28, !dbg !49 + %473 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %472, i32 1, i32 31), !dbg !47 + %474 = add i32 %472, %473, !dbg !48 + %475 = mul nuw nsw i32 %468, %31, !dbg !60 + %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 1, i32 31), !dbg !47 + %477 = add i32 %475, %476, !dbg !48 + %478 = mul nuw nsw i32 %468, %28, !dbg !61 + %479 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %478, i32 1, i32 31), !dbg !47 + %480 = add i32 %478, %479, !dbg !48 + %481 = icmp slt i32 %471, %474, !dbg !50 + %482 = icmp eq i32 %471, %474, !dbg !51 + %483 = icmp sgt i32 %477, %480, !dbg !63 + %484 = and i1 %482, %483, !dbg !52 + %485 = or i1 %481, %484, !dbg !53 + %486 = xor i32 %477, %480, !dbg !62 + %487 = select i1 %485, i32 %486, i32 0, !dbg !58 + %488 = xor i32 %487, %468, !dbg !59 + %narrow = select i1 %13, i1 %23, i1 false, !dbg !64 + %489 = zext i1 %narrow to i64, !dbg !64 + %narrow13 = select i1 %13, i1 %25, i1 false, !dbg !64 + %490 = zext i1 %narrow13 to i64, !dbg !64 + %491 = zext i1 %narrow to i32, !dbg !65 + %492 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %491, i32 8, i32 31), !dbg !65 + %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %494 = insertelement <2 x i32> poison, i32 %492, i64 0, !dbg !65 + %495 = insertelement <2 x i32> %494, i32 %493, i64 1, !dbg !65 + %496 = bitcast <2 x i32> %495 to i64, !dbg !65 + %497 = add i64 %496, %489, !dbg !67 + %extelt.offset = lshr i64 %497, 32, !dbg !65 + %498 = trunc nuw i64 %extelt.offset to i32, !dbg !65 + %499 = trunc i64 %497 to i32, !dbg !65 + %500 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %499, i32 4, i32 31), !dbg !65 + %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %498, i32 4, i32 31), !dbg !65 + %502 = insertelement <2 x i32> poison, i32 %500, i64 0, !dbg !65 + %503 = insertelement <2 x i32> %502, i32 %501, i64 1, !dbg !65 + %504 = bitcast <2 x i32> %503 to i64, !dbg !65 + %505 = add i64 %497, %504, !dbg !67 + %extelt.offset14 = lshr i64 %505, 32, !dbg !65 + %506 = trunc nuw i64 %extelt.offset14 to i32, !dbg !65 + %507 = trunc i64 %505 to i32, !dbg !65 + %508 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %507, i32 2, i32 31), !dbg !65 + %509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %506, i32 2, i32 31), !dbg !65 + %510 = insertelement <2 x i32> poison, i32 %508, i64 0, !dbg !65 + %511 = insertelement <2 x i32> %510, i32 %509, i64 1, !dbg !65 + %512 = bitcast <2 x i32> %511 to i64, !dbg !65 + %513 = add i64 %505, %512, !dbg !67 + %extelt.offset15 = lshr i64 %513, 32, !dbg !65 + %514 = trunc nuw i64 %extelt.offset15 to i32, !dbg !65 + %515 = trunc i64 %513 to i32, !dbg !65 + %516 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %515, i32 1, i32 31), !dbg !65 + %517 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %514, i32 1, i32 31), !dbg !65 + %518 = insertelement <2 x i32> poison, i32 %516, i64 0, !dbg !65 + %519 = insertelement <2 x i32> %518, i32 %517, i64 1, !dbg !65 + %520 = bitcast <2 x i32> %519 to i64, !dbg !65 + %521 = add i64 %513, %520, !dbg !67 + %522 = zext i1 %narrow13 to i32, !dbg !65 + %523 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %522, i32 8, i32 31), !dbg !65 + %524 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !65 + %525 = insertelement <2 x i32> poison, i32 %523, i64 0, !dbg !65 + %526 = insertelement <2 x i32> %525, i32 %524, i64 1, !dbg !65 + %527 = bitcast <2 x i32> %526 to i64, !dbg !65 + %528 = add i64 %527, %490, !dbg !67 + %extelt.offset17 = lshr i64 %528, 32, !dbg !65 + %529 = trunc nuw i64 %extelt.offset17 to i32, !dbg !65 + %530 = trunc i64 %528 to i32, !dbg !65 + %531 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %530, i32 4, i32 31), !dbg !65 + %532 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 4, i32 31), !dbg !65 + %533 = insertelement <2 x i32> poison, i32 %531, i64 0, !dbg !65 + %534 = insertelement <2 x i32> %533, i32 %532, i64 1, !dbg !65 + %535 = bitcast <2 x i32> %534 to i64, !dbg !65 + %536 = add i64 %528, %535, !dbg !67 + %extelt.offset18 = lshr i64 %536, 32, !dbg !65 + %537 = trunc nuw i64 %extelt.offset18 to i32, !dbg !65 + %538 = trunc i64 %536 to i32, !dbg !65 + %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 2, i32 31), !dbg !65 + %540 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %537, i32 2, i32 31), !dbg !65 + %541 = insertelement <2 x i32> poison, i32 %539, i64 0, !dbg !65 + %542 = insertelement <2 x i32> %541, i32 %540, i64 1, !dbg !65 + %543 = bitcast <2 x i32> %542 to i64, !dbg !65 + %544 = add i64 %536, %543, !dbg !67 + %extelt.offset19 = lshr i64 %544, 32, !dbg !65 + %545 = trunc nuw i64 %extelt.offset19 to i32, !dbg !65 + %546 = trunc i64 %544 to i32, !dbg !65 + %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 1, i32 31), !dbg !65 + %548 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %545, i32 1, i32 31), !dbg !65 + %narrow20 = select i1 %13, i1 %266, i1 false, !dbg !68 + %549 = zext i1 %narrow20 to i64, !dbg !68 + %narrow21 = select i1 %13, i1 %267, i1 false, !dbg !68 + %550 = zext i1 %narrow21 to i64, !dbg !68 + %551 = zext i1 %narrow20 to i32, !dbg !69 + %552 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %551, i32 8, i32 31), !dbg !69 + %553 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %554 = insertelement <2 x i32> poison, i32 %552, i64 0, !dbg !69 + %555 = insertelement <2 x i32> %554, i32 %553, i64 1, !dbg !69 + %556 = bitcast <2 x i32> %555 to i64, !dbg !69 + %557 = add i64 %556, %549, !dbg !71 + %extelt.offset23 = lshr i64 %557, 32, !dbg !69 + %558 = trunc nuw i64 %extelt.offset23 to i32, !dbg !69 + %559 = trunc i64 %557 to i32, !dbg !69 + %560 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %559, i32 4, i32 31), !dbg !69 + %561 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %558, i32 4, i32 31), !dbg !69 + %562 = insertelement <2 x i32> poison, i32 %560, i64 0, !dbg !69 + %563 = insertelement <2 x i32> %562, i32 %561, i64 1, !dbg !69 + %564 = bitcast <2 x i32> %563 to i64, !dbg !69 + %565 = add i64 %557, %564, !dbg !71 + %extelt.offset24 = lshr i64 %565, 32, !dbg !69 + %566 = trunc nuw i64 %extelt.offset24 to i32, !dbg !69 + %567 = trunc i64 %565 to i32, !dbg !69 + %568 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %567, i32 2, i32 31), !dbg !69 + %569 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %566, i32 2, i32 31), !dbg !69 + %570 = insertelement <2 x i32> poison, i32 %568, i64 0, !dbg !69 + %571 = insertelement <2 x i32> %570, i32 %569, i64 1, !dbg !69 + %572 = bitcast <2 x i32> %571 to i64, !dbg !69 + %573 = add i64 %565, %572, !dbg !71 + %extelt.offset25 = lshr i64 %573, 32, !dbg !69 + %574 = trunc nuw i64 %extelt.offset25 to i32, !dbg !69 + %575 = trunc i64 %573 to i32, !dbg !69 + %576 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %575, i32 1, i32 31), !dbg !69 + %577 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %574, i32 1, i32 31), !dbg !69 + %578 = zext i1 %narrow21 to i32, !dbg !69 + %579 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %578, i32 8, i32 31), !dbg !69 + %580 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 8, i32 31), !dbg !69 + %581 = insertelement <2 x i32> poison, i32 %579, i64 0, !dbg !69 + %582 = insertelement <2 x i32> %581, i32 %580, i64 1, !dbg !69 + %583 = bitcast <2 x i32> %582 to i64, !dbg !69 + %584 = add i64 %583, %550, !dbg !71 + %extelt.offset27 = lshr i64 %584, 32, !dbg !69 + %585 = trunc nuw i64 %extelt.offset27 to i32, !dbg !69 + %586 = trunc i64 %584 to i32, !dbg !69 + %587 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %586, i32 4, i32 31), !dbg !69 + %588 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %585, i32 4, i32 31), !dbg !69 + %589 = insertelement <2 x i32> poison, i32 %587, i64 0, !dbg !69 + %590 = insertelement <2 x i32> %589, i32 %588, i64 1, !dbg !69 + %591 = bitcast <2 x i32> %590 to i64, !dbg !69 + %592 = add i64 %584, %591, !dbg !71 + %extelt.offset28 = lshr i64 %592, 32, !dbg !69 + %593 = trunc nuw i64 %extelt.offset28 to i32, !dbg !69 + %594 = trunc i64 %592 to i32, !dbg !69 + %595 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %594, i32 2, i32 31), !dbg !69 + %596 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %593, i32 2, i32 31), !dbg !69 + %597 = insertelement <2 x i32> poison, i32 %595, i64 0, !dbg !69 + %598 = insertelement <2 x i32> %597, i32 %596, i64 1, !dbg !69 + %599 = bitcast <2 x i32> %598 to i64, !dbg !69 + %600 = add i64 %592, %599, !dbg !71 + %extelt.offset29 = lshr i64 %600, 32, !dbg !69 + %601 = trunc nuw i64 %extelt.offset29 to i32, !dbg !69 + %602 = trunc i64 %600 to i32, !dbg !69 + %603 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %602, i32 1, i32 31), !dbg !69 + %604 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %601, i32 1, i32 31), !dbg !69 + %605 = trunc i64 %521 to i32, !dbg !72 + %606 = icmp slt i32 %15, %605, !dbg !73 + %607 = select i1 %606, i32 %265, i32 16, !dbg !74 + %608 = add i32 %607, 17, !dbg !75 + %609 = icmp slt i32 %607, 0, !dbg !76 + %610 = select i1 %609, i32 %608, i32 %607, !dbg !77 + %611 = icmp ult i32 %610, 17, !dbg !78 + %612 = icmp samesign ugt i32 %12, 31, !dbg !18 + %613 = or i1 %612, %611, !dbg !79 + br i1 %613, label %615, label %614, !dbg !80 + +614: ; preds = %11 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 71, ptr nonnull @assertFunc_0, i64 1), !dbg !80 + unreachable, !dbg !80 + +615: ; preds = %11 + %616 = insertelement <2 x i32> poison, i32 %576, i64 0, !dbg !69 + %617 = insertelement <2 x i32> %616, i32 %577, i64 1, !dbg !69 + %618 = bitcast <2 x i32> %617 to i64, !dbg !69 + %619 = add i64 %573, %618, !dbg !71 + %620 = trunc i64 %619 to i32, !dbg !81 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !80 + %621 = icmp slt i32 %15, %620, !dbg !82 + %622 = select i1 %621, i32 %488, i32 16, !dbg !83 + %623 = add i32 %622, 17, !dbg !84 + %624 = icmp slt i32 %622, 0, !dbg !85 + %625 = select i1 %624, i32 %623, i32 %622, !dbg !86 + %626 = icmp ult i32 %625, 17, !dbg !87 + %627 = or i1 %612, %626, !dbg !88 + br i1 %627, label %629, label %628, !dbg !89 + +628: ; preds = %615 + tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 80, ptr nonnull @assertFunc_1, i64 1), !dbg !89 + unreachable, !dbg !89 + +629: ; preds = %615 + %630 = insertelement <2 x i32> poison, i32 %603, i64 0, !dbg !69 + %631 = insertelement <2 x i32> %630, i32 %604, i64 1, !dbg !69 + %632 = bitcast <2 x i32> %631 to i64, !dbg !69 + %633 = add i64 %600, %632, !dbg !71 + %634 = trunc i64 %633 to i32, !dbg !81 + %635 = insertelement <2 x i32> poison, i32 %547, i64 0, !dbg !65 + %636 = insertelement <2 x i32> %635, i32 %548, i64 1, !dbg !65 + %637 = bitcast <2 x i32> %636 to i64, !dbg !65 + %638 = add i64 %544, %637, !dbg !67 + %639 = trunc i64 %638 to i32, !dbg !72 + tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0), !dbg !89 + %640 = zext nneg i32 %12 to i64, !dbg !90 + %641 = getelementptr i32, ptr addrspace(1) %1, i64 %640, !dbg !90 + %642 = and i32 %14, 63, !dbg !91 + %643 = icmp eq i32 %642, 0, !dbg !91 + %644 = and i1 %13, %643, !dbg !91 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %639, ptr addrspace(1) %641, i1 %644) #5, !dbg !91 + %645 = getelementptr i32, ptr addrspace(1) %2, i64 %640, !dbg !92 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %634, ptr addrspace(1) %645, i1 %644) #5, !dbg !93 + %646 = getelementptr i32, ptr addrspace(1) %3, i64 %18, !dbg !94 + %647 = and i32 %14, 48, !dbg !95 + %648 = icmp eq i32 %647, 0, !dbg !95 + %649 = and i1 %13, %648, !dbg !95 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %265, ptr addrspace(1) %646, i1 %649) #5, !dbg !95 + %650 = mul i32 %12, 17, !dbg !96 + %651 = add i32 %610, %650, !dbg !97 + %652 = sext i32 %651 to i64, !dbg !98 + %653 = getelementptr i32, ptr addrspace(1) %4, i64 %652, !dbg !98 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %653, i1 %649) #5, !dbg !99 + %654 = getelementptr i32, ptr addrspace(1) %5, i64 %18, !dbg !100 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %488, ptr addrspace(1) %654, i1 %649) #5, !dbg !101 + %655 = add i32 %625, %650, !dbg !102 + %656 = sext i32 %655 to i64, !dbg !103 + %657 = getelementptr i32, ptr addrspace(1) %6, i64 %656, !dbg !103 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 1, ptr addrspace(1) %657, i1 %649) #5, !dbg !104 + ret void, !dbg !105 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #3 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #4 + +attributes #0 = { noreturn } +attributes #1 = { "nvvm.reqntid"="64" } +attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #4 = { convergent nocallback nounwind } +attributes #5 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py", directory: "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!4 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!5 = !DISubprogram(name: "__assertfail", linkageName: "__assertfail", scope: !6, file: !6, type: !7, spFlags: DISPFlagOptimized) +!6 = !DIFile(filename: "", directory: "") +!7 = !DISubroutineType(cc: DW_CC_normal, types: !8) +!8 = !{} +!9 = distinct !DISubprogram(name: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", linkageName: "triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2", scope: !1, file: !1, line: 18, type: !7, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DILocation(line: 24, column: 28, scope: !9) +!11 = !DILocation(line: 26, column: 21, scope: !9) +!12 = !DILocation(line: 27, column: 38, scope: !9) +!13 = !DILocation(line: 34, column: 40, scope: !9) +!14 = !DILocation(line: 34, column: 37, scope: !9) +!15 = !DILocation(line: 34, column: 30, scope: !9) +!16 = !DILocation(line: 34, column: 45, scope: !9) +!17 = !DILocation(line: 39, column: 18, scope: !9) +!18 = !DILocation(line: 0, scope: !9) +!19 = !DILocation(line: 627, column: 44, scope: !20, inlinedAt: !22) +!20 = distinct !DILexicalBlockFile(scope: !9, file: !21, discriminator: 0) +!21 = !DIFile(filename: "triton_helpers.py", directory: "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime") +!22 = !DILocation(line: 46, column: 71, scope: !9) +!23 = !DILocation(line: 537, column: 21, scope: !20, inlinedAt: !22) +!24 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !22) +!25 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !22) +!26 = distinct !DILexicalBlockFile(scope: !9, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/workspace/specforge/lib/python3.11/site-packages/triton/language") +!28 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !22) +!29 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !22) +!30 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !22) +!31 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !22) +!32 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !22) +!33 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !22) +!34 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !22) +!35 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !22) +!36 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !22) +!37 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !22) +!38 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !22) +!39 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !22) +!40 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !22) +!41 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !22) +!42 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !22) +!43 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !22) +!44 = !DILocation(line: 47, column: 20, scope: !9) +!45 = !DILocation(line: 538, column: 40, scope: !20, inlinedAt: !46) +!46 = !DILocation(line: 51, column: 71, scope: !9) +!47 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !46) +!48 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !46) +!49 = !DILocation(line: 539, column: 41, scope: !20, inlinedAt: !46) +!50 = !DILocation(line: 574, column: 22, scope: !20, inlinedAt: !46) +!51 = !DILocation(line: 591, column: 21, scope: !20, inlinedAt: !46) +!52 = !DILocation(line: 594, column: 29, scope: !20, inlinedAt: !46) +!53 = !DILocation(line: 594, column: 23, scope: !20, inlinedAt: !46) +!54 = !DILocation(line: 599, column: 28, scope: !20, inlinedAt: !46) +!55 = !DILocation(line: 600, column: 38, scope: !20, inlinedAt: !46) +!56 = !DILocation(line: 600, column: 46, scope: !20, inlinedAt: !46) +!57 = !DILocation(line: 600, column: 15, scope: !20, inlinedAt: !46) +!58 = !DILocation(line: 601, column: 59, scope: !20, inlinedAt: !46) +!59 = !DILocation(line: 601, column: 22, scope: !20, inlinedAt: !46) +!60 = !DILocation(line: 548, column: 23, scope: !20, inlinedAt: !46) +!61 = !DILocation(line: 551, column: 23, scope: !20, inlinedAt: !46) +!62 = !DILocation(line: 601, column: 48, scope: !20, inlinedAt: !46) +!63 = !DILocation(line: 594, column: 40, scope: !20, inlinedAt: !46) +!64 = !DILocation(line: 54, column: 35, scope: !9) +!65 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !66) +!66 = !DILocation(line: 55, column: 26, scope: !9) +!67 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !66) +!68 = !DILocation(line: 58, column: 35, scope: !9) +!69 = !DILocation(line: 291, column: 36, scope: !26, inlinedAt: !70) +!70 = !DILocation(line: 59, column: 26, scope: !9) +!71 = !DILocation(line: 261, column: 15, scope: !26, inlinedAt: !70) +!72 = !DILocation(line: 60, column: 21, scope: !9) +!73 = !DILocation(line: 64, column: 19, scope: !9) +!74 = !DILocation(line: 66, column: 35, scope: !9) +!75 = !DILocation(line: 68, column: 20, scope: !9) +!76 = !DILocation(line: 69, column: 20, scope: !9) +!77 = !DILocation(line: 70, column: 35, scope: !9) +!78 = !DILocation(line: 71, column: 38, scope: !9) +!79 = !DILocation(line: 71, column: 53, scope: !9) +!80 = !DILocation(line: 71, column: 63, scope: !9) +!81 = !DILocation(line: 61, column: 21, scope: !9) +!82 = !DILocation(line: 75, column: 19, scope: !9) +!83 = !DILocation(line: 76, column: 35, scope: !9) +!84 = !DILocation(line: 77, column: 20, scope: !9) +!85 = !DILocation(line: 78, column: 20, scope: !9) +!86 = !DILocation(line: 79, column: 35, scope: !9) +!87 = !DILocation(line: 80, column: 38, scope: !9) +!88 = !DILocation(line: 80, column: 53, scope: !9) +!89 = !DILocation(line: 80, column: 63, scope: !9) +!90 = !DILocation(line: 81, column: 25, scope: !9) +!91 = !DILocation(line: 81, column: 37, scope: !9) +!92 = !DILocation(line: 82, column: 25, scope: !9) +!93 = !DILocation(line: 82, column: 37, scope: !9) +!94 = !DILocation(line: 83, column: 25, scope: !9) +!95 = !DILocation(line: 83, column: 47, scope: !9) +!96 = !DILocation(line: 84, column: 52, scope: !9) +!97 = !DILocation(line: 84, column: 49, scope: !9) +!98 = !DILocation(line: 84, column: 25, scope: !9) +!99 = !DILocation(line: 84, column: 85, scope: !9) +!100 = !DILocation(line: 85, column: 25, scope: !9) +!101 = !DILocation(line: 85, column: 47, scope: !9) +!102 = !DILocation(line: 86, column: 49, scope: !9) +!103 = !DILocation(line: 86, column: 25, scope: !9) +!104 = !DILocation(line: 86, column: 85, scope: !9) +!105 = !DILocation(line: 86, column: 4, scope: !9) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ab19f6303d2e93e667594d59e0e1097b8d25a39f --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ptx @@ -0,0 +1,1673 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.7 +.target sm_90a +.address_size 64 + + // .globl triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 // -- Begin function triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +.noreturn; +.global .align 1 .b8 assertFunc_1[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_1[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 102, 104, 47, 99, 102, 104, 119, 55, 106, 115, 114, 53, 115, 107, 55, 52, 107, 99, 104, 100, 104, 113, 111, 103, 51, 113, 121, 113, 55, 103, 101, 51, 113, 106, 113, 109, 109, 113, 119, 121, 51, 55, 108, 114, 50, 98, 101, 52, 108, 112, 121, 107, 122, 120, 51, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 57, 32, 60, 32, 49, 55}; +.global .align 1 .b8 assertFunc_0[8] = {117, 110, 107, 110, 111, 119, 110}; +.global .align 1 .b8 assertFile_0[114] = {47, 119, 111, 114, 107, 115, 112, 97, 99, 101, 47, 104, 97, 110, 114, 117, 105, 47, 83, 112, 101, 99, 70, 111, 114, 103, 101, 45, 101, 120, 116, 47, 99, 97, 99, 104, 101, 47, 99, 111, 109, 112, 105, 108, 101, 100, 95, 107, 101, 114, 110, 101, 108, 115, 47, 102, 104, 47, 99, 102, 104, 119, 55, 106, 115, 114, 53, 115, 107, 55, 52, 107, 99, 104, 100, 104, 113, 111, 103, 51, 113, 121, 113, 55, 103, 101, 51, 113, 106, 113, 109, 109, 113, 119, 121, 51, 55, 108, 114, 50, 98, 101, 52, 108, 112, 121, 107, 122, 120, 51, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[37] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 52, 48, 32, 60, 32, 49, 55}; + // @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2 +.visible .entry triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2( + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_7, + .param .u32 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_8, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_9, + .param .u64 .ptr .global .align 1 triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_10 +) +.reqntid 64 +{ + .reg .pred %p<140>; + .reg .b32 %r<453>; + .reg .b64 %rd<107>; + .loc 1 18 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:18:0 +$L__func_begin0: + .loc 1 18 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:18:0 + +// %bb.0: + ld.param.b64 %rd15, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_0]; +$L__tmp0: + .loc 1 24 28 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:24:28 + mov.u32 %r1, %ctaid.x; + .loc 1 26 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:26:21 + setp.lt.u32 %p2, %r1, 32; + .loc 1 27 38 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:27:38 + mov.u32 %r2, %tid.x; + and.b32 %r3, %r2, 15; + .loc 1 34 40 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:40 + shl.b32 %r14, %r1, 4; + .loc 1 34 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:37 + or.b32 %r15, %r3, %r14; + .loc 1 34 30 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:30 + mad.wide.s32 %rd12, %r15, 8, %rd15; + .loc 1 34 45 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:34:45 + // begin inline asm + mov.u64 %rd11, 0x0; + @%p2 ld.global.b64 { %rd11 }, [ %rd12 + 0 ]; + // end inline asm + // begin inline asm + mov.u64 %rd13, 0x0; + @%p2 ld.global.b64 { %rd13 }, [ %rd12 + 0 ]; + // end inline asm + .loc 1 39 18 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:39:18 + add.s64 %rd16, %rd11, -1; + setp.lt.u64 %p3, %rd16, 16383; + add.s64 %rd17, %rd13, -1; + setp.lt.u64 %p4, %rd17, 16383; + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + selp.b32 %r16, 1, 0, %p3; +$L__tmp1: + .loc 2 627 44 // triton_helpers.py:627:44 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shr.u32 %r17, %r2, 1; + bfe.u32 %r18, %r2, 1, 1; + and.b32 %r19, %r2, 1; + shr.u32 %r20, %r2, 2; + bfe.u32 %r21, %r2, 2, 1; + shr.u32 %r22, %r2, 3; + bfe.u32 %r23, %r2, 3, 1; + .loc 2 537 21 // triton_helpers.py:537:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r24, %r19, 1; + xor.b32 %r25, %r18, 1; + xor.b32 %r26, %r21, 1; + xor.b32 %r27, %r23, 1; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r28, %r24, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r29, %r28, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r30, %r28, %r29; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r31, %r19, 0, %p3; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r33, %r31, %r32; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r34, %r24, %r3; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r35, %r34, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r36, %r35, %r34; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r37, %r3, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r38, %r37, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r39, %r38, %r37; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p5, %r30, %r33; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p6, %r30, %r33; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p7, %r36, %r39; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p8, %p6, %p7; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p9, %p5, %p8; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r40, %r17, 1; + setp.ne.b32 %p10, %r40, 0; + xor.pred %p11, %p9, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r41, %r30, %r33; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r42, %r41, 0, %p11; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r43, %r42, %r16; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r44, %r39, %r36; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r45, %r44, 0, %p11; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r46, %r45, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r47, %r43, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r48, %r47, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r49, %r47, %r48; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r50, %r43, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r51, %r50, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r52, %r50, %r51; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r53, %r46, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r54, %r53, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r55, %r53, %r54; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r56, %r46, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r58, %r56, %r57; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r59, %r20, 1; + setp.ne.b32 %p12, %r59, 0; + setp.ge.s32 %p13, %r49, %r52; + setp.ne.b32 %p14, %r49, %r52; + setp.le.s32 %p15, %r55, %r58; + or.pred %p16, %p14, %p15; + and.pred %p17, %p13, %p16; + xor.pred %p18, %p17, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r60, %r49, %r52; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r61, 0, %r60, %p18; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r62, %r61, %r43; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r63, %r55, %r58; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r64, 0, %r63, %p18; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r65, %r64, %r46; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r66, %r62, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r67, %r66, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r68, %r66, %r67; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r69, %r62, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r70, %r69, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r71, %r69, %r70; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r72, %r65, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r74, %r72, %r73; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r75, %r65, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r77, %r75, %r76; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p19, %r68, %r71; + setp.ne.b32 %p20, %r68, %r71; + setp.le.s32 %p21, %r74, %r77; + or.pred %p22, %p20, %p21; + and.pred %p23, %p19, %p22; + xor.pred %p24, %p23, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r78, %r68, %r71; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r79, 0, %r78, %p24; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r80, %r79, %r62; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r81, %r74, %r77; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r82, 0, %r81, %p24; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r83, %r82, %r65; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r84, %r80, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r86, %r84, %r85; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r87, %r80, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r88, %r87, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r89, %r87, %r88; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r90, %r83, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r91, %r90, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r92, %r90, %r91; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r93, %r83, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r94, %r93, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r95, %r93, %r94; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.b32 %r96, %r22, 1; + setp.ne.b32 %p25, %r96, 0; + setp.ge.s32 %p26, %r86, %r89; + setp.ne.b32 %p27, %r86, %r89; + setp.le.s32 %p28, %r92, %r95; + or.pred %p29, %p27, %p28; + and.pred %p30, %p26, %p29; + xor.pred %p31, %p30, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r97, %r86, %r89; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r98, 0, %r97, %p31; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r99, %r98, %r80; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r100, %r92, %r95; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r101, 0, %r100, %p31; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r102, %r101, %r83; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r103, %r99, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r105, %r103, %r104; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r106, %r99, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r107, %r106, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r108, %r106, %r107; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r109, %r102, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r110, %r109, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r111, %r109, %r110; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r112, %r102, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r113, %r112, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r114, %r112, %r113; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p32, %r105, %r108; + setp.ne.b32 %p33, %r105, %r108; + setp.le.s32 %p34, %r111, %r114; + or.pred %p35, %p33, %p34; + and.pred %p36, %p32, %p35; + xor.pred %p37, %p36, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r115, %r105, %r108; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r116, 0, %r115, %p37; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r117, %r116, %r99; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r118, %r111, %r114; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r119, 0, %r118, %p37; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r120, %r119, %r102; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r121, %r117, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r122, %r121, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r123, %r121, %r122; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r124, %r117, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r125, %r124, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r126, %r124, %r125; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r127, %r120, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r128, %r127, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r129, %r127, %r128; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r130, %r120, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r132, %r130, %r131; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.ge.s32 %p38, %r123, %r126; + setp.ne.b32 %p39, %r123, %r126; + setp.le.s32 %p40, %r129, %r132; + or.pred %p41, %p39, %p40; + and.pred %p42, %p38, %p41; + xor.pred %p43, %p42, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r133, %r123, %r126; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r134, 0, %r133, %p43; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r135, %r134, %r117; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r136, %r129, %r132; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r137, 0, %r136, %p43; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r138, %r137, %r120; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r139, %r135, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r141, %r139, %r140; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r142, %r135, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r143, %r142, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r144, %r142, %r143; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r145, %r138, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r146, %r145, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r147, %r145, %r146; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r148, %r138, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r149, %r148, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r150, %r148, %r149; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p44, %r141, %r144; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p45, %r141, %r144; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p46, %r147, %r150; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p47, %p45, %p46; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p48, %p44, %p47; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r151, %r141, %r144; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r152, %r151, 0, %p48; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r153, %r152, %r135; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r154, %r147, %r150; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r155, %r154, 0, %p48; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r156, %r155, %r138; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r157, %r153, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r158, %r157, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r159, %r157, %r158; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r160, %r153, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r161, %r160, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r162, %r160, %r161; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r163, %r156, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r164, %r163, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r165, %r163, %r164; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r166, %r156, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r168, %r166, %r167; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p49, %r159, %r162; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p50, %r159, %r162; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p51, %r165, %r168; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p52, %p50, %p51; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p53, %p49, %p52; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r169, %r159, %r162; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r170, %r169, 0, %p53; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r171, %r170, %r153; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r172, %r165, %r168; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r173, %r172, 0, %p53; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r174, %r173, %r156; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r175, %r171, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r176, %r175, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r177, %r175, %r176; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r178, %r171, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r180, %r178, %r179; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r181, %r174, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r182, %r181, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r183, %r181, %r182; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r184, %r174, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r185, %r184, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r186, %r184, %r185; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p54, %r177, %r180; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p55, %r177, %r180; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p56, %r183, %r186; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + and.pred %p57, %p55, %p56; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + or.pred %p58, %p54, %p57; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r187, %r177, %r180; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r188, %r187, 0, %p58; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r189, %r188, %r171; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r190, %r183, %r186; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r191, %r190, 0, %p58; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r192, %r191, %r174; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r193, %r189, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r194, %r193, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r195, %r193, %r194; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r196, %r189, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r197, %r196, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r198, %r196, %r197; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r199, %r192, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r200, %r199, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r201, %r199, %r200; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + mul.lo.s32 %r202, %r192, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + shfl.sync.bfly.b32 %r203, %r202, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + add.s32 %r204, %r202, %r203; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.lt.s32 %p59, %r195, %r198; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.eq.b32 %p60, %r195, %r198; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + setp.gt.s32 %p61, %r201, %r204; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r205, %r201, %r204; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + selp.b32 %r206, %r205, 0, %p61; + selp.b32 %r207, %r206, 0, %p60; + selp.b32 %r208, %r205, %r207, %p59; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:46:71 ] + xor.b32 %r4, %r208, %r192; +$L__tmp2: + .loc 1 47 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:47:20 + setp.eq.b64 %p62, %rd11, 16384; + setp.eq.b64 %p63, %rd13, 16384; + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + selp.b32 %r209, 1, 0, %p62; +$L__tmp3: + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r210, %r24, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r212, %r211, %r210; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r213, %r19, 0, %p62; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r214, %r213, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r215, %r214, %r213; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p64, %r212, %r215; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p65, %r212, %r215; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p66, %p7, %p65; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p67, %p64, %p66; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.pred %p68, %p67, %p10; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r216, %r215, %r212; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r217, %r216, 0, %p68; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r218, %r217, %r209; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r219, %r44, 0, %p68; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r220, %r219, %r3; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r221, %r218, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r222, %r221, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r223, %r221, %r222; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r224, %r218, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r226, %r224, %r225; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r227, %r220, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r228, %r227, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r229, %r227, %r228; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r230, %r220, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r231, %r230, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r232, %r230, %r231; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p69, %r223, %r226; + setp.ne.b32 %p70, %r223, %r226; + setp.le.s32 %p71, %r229, %r232; + or.pred %p72, %p70, %p71; + and.pred %p73, %p69, %p72; + xor.pred %p74, %p73, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r233, %r223, %r226; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r234, 0, %r233, %p74; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r235, %r234, %r218; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r236, %r229, %r232; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r237, 0, %r236, %p74; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r238, %r237, %r220; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r239, %r235, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r240, %r239, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r241, %r239, %r240; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r242, %r235, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r244, %r242, %r243; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r245, %r238, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r246, %r245, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r247, %r245, %r246; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r248, %r238, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r249, %r248, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r250, %r248, %r249; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p75, %r241, %r244; + setp.ne.b32 %p76, %r241, %r244; + setp.le.s32 %p77, %r247, %r250; + or.pred %p78, %p76, %p77; + and.pred %p79, %p75, %p78; + xor.pred %p80, %p79, %p12; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r251, %r241, %r244; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r252, 0, %r251, %p80; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r253, %r252, %r235; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r254, %r247, %r250; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r255, 0, %r254, %p80; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r256, %r255, %r238; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r257, %r253, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r258, %r257, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r259, %r257, %r258; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r260, %r253, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r261, %r260, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r262, %r260, %r261; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r263, %r256, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r264, %r263, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r265, %r263, %r264; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r266, %r256, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r267, %r266, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r268, %r266, %r267; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p81, %r259, %r262; + setp.ne.b32 %p82, %r259, %r262; + setp.le.s32 %p83, %r265, %r268; + or.pred %p84, %p82, %p83; + and.pred %p85, %p81, %p84; + xor.pred %p86, %p85, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r269, %r259, %r262; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r270, 0, %r269, %p86; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r271, %r270, %r253; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r272, %r265, %r268; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r273, 0, %r272, %p86; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r274, %r273, %r256; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r275, %r271, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r276, %r275, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r277, %r275, %r276; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r278, %r271, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r280, %r278, %r279; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r281, %r274, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r282, %r281, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r283, %r281, %r282; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r284, %r274, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r285, %r284, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r286, %r284, %r285; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p87, %r277, %r280; + setp.ne.b32 %p88, %r277, %r280; + setp.le.s32 %p89, %r283, %r286; + or.pred %p90, %p88, %p89; + and.pred %p91, %p87, %p90; + xor.pred %p92, %p91, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r287, %r277, %r280; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r288, 0, %r287, %p92; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r289, %r288, %r271; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r290, %r283, %r286; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r291, 0, %r290, %p92; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r292, %r291, %r274; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r293, %r289, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r294, %r293, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r295, %r293, %r294; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r296, %r289, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r297, %r296, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r298, %r296, %r297; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r299, %r292, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r300, %r299, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r301, %r299, %r300; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r302, %r292, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r303, %r302, 1, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r304, %r302, %r303; + .loc 2 599 28 // triton_helpers.py:599:28 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.ge.s32 %p93, %r295, %r298; + setp.ne.b32 %p94, %r295, %r298; + setp.le.s32 %p95, %r301, %r304; + or.pred %p96, %p94, %p95; + and.pred %p97, %p93, %p96; + xor.pred %p98, %p97, %p25; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r305, %r295, %r298; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r306, 0, %r305, %p98; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r307, %r306, %r289; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r308, %r301, %r304; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r309, 0, %r308, %p98; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r310, %r309, %r292; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r311, %r307, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r312, %r311, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r313, %r311, %r312; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r314, %r307, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r315, %r314, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r316, %r314, %r315; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r317, %r310, %r27; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r318, %r317, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r319, %r317, %r318; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r320, %r310, %r23; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r321, %r320, 8, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r322, %r320, %r321; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p99, %r313, %r316; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p100, %r313, %r316; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.gt.s32 %p101, %r319, %r322; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p102, %p100, %p101; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p103, %p99, %p102; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r323, %r313, %r316; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r324, %r323, 0, %p103; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r325, %r324, %r307; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r326, %r319, %r322; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r327, %r326, 0, %p103; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r328, %r327, %r310; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r329, %r325, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r330, %r329, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r331, %r329, %r330; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r332, %r325, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r333, %r332, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r334, %r332, %r333; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r335, %r328, %r26; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r336, %r335, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r337, %r335, %r336; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r338, %r328, %r21; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r339, %r338, 4, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r340, %r338, %r339; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p104, %r331, %r334; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p105, %r331, %r334; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.gt.s32 %p106, %r337, %r340; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p107, %p105, %p106; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p108, %p104, %p107; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r341, %r331, %r334; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r342, %r341, 0, %p108; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r343, %r342, %r325; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r344, %r337, %r340; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r345, %r344, 0, %p108; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r346, %r345, %r328; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r347, %r343, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r348, %r347, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r349, %r347, %r348; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r350, %r343, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r351, %r350, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r352, %r350, %r351; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r353, %r346, %r25; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r354, %r353, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r355, %r353, %r354; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r356, %r346, %r18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r357, %r356, 2, 31, -1; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + add.s32 %r358, %r356, %r357; + .loc 2 574 22 // triton_helpers.py:574:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.lt.s32 %p109, %r349, %r352; + .loc 2 591 21 // triton_helpers.py:591:21 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.eq.b32 %p110, %r349, %r352; + .loc 2 594 40 // triton_helpers.py:594:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + setp.gt.s32 %p111, %r355, %r358; + .loc 2 594 29 // triton_helpers.py:594:29 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + and.pred %p112, %p110, %p111; + .loc 2 594 23 // triton_helpers.py:594:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + or.pred %p113, %p109, %p112; + .loc 2 600 38 // triton_helpers.py:600:38 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r359, %r349, %r352; + .loc 2 600 46 // triton_helpers.py:600:46 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r360, %r359, 0, %p113; + .loc 2 600 15 // triton_helpers.py:600:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r361, %r360, %r343; + .loc 2 601 48 // triton_helpers.py:601:48 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r362, %r355, %r358; + .loc 2 601 59 // triton_helpers.py:601:59 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + selp.b32 %r363, %r362, 0, %p113; + .loc 2 601 22 // triton_helpers.py:601:22 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + xor.b32 %r364, %r363, %r346; + .loc 2 538 40 // triton_helpers.py:538:40 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r365, %r361, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r366, %r365, 1, 31, -1; + .loc 2 539 41 // triton_helpers.py:539:41 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r368, %r361, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r369, %r368, 1, 31, -1; + .loc 2 548 23 // triton_helpers.py:548:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r371, %r364, %r24; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r372, %r371, 1, 31, -1; + .loc 2 551 23 // triton_helpers.py:551:23 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + mul.lo.s32 %r374, %r364, %r19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:51:71 ] + shfl.sync.bfly.b32 %r375, %r374, 1, 31, -1; +$L__tmp4: + .loc 1 54 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:54:35 + and.pred %p117, %p2, %p3; + selp.b64 %rd18, 1, 0, %p117; + and.pred %p118, %p2, %p4; + selp.b64 %rd19, 1, 0, %p118; +$L__tmp5: + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + selp.b32 %r381, 1, 0, %p117; + shfl.sync.bfly.b32 %r382, %r381, 8, 31, -1; + mov.b32 %r383, 0; + shfl.sync.bfly.b32 %r384, %r383, 8, 31, -1; + cvt.u64.u32 %rd20, %r382; + cvt.u64.u32 %rd21, %r384; + shl.b64 %rd22, %rd21, 32; + or.b64 %rd23, %rd20, %rd22; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd24, %rd23, %rd18; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r385}, %rd24; + cvt.u32.u64 %r386, %rd24; + shfl.sync.bfly.b32 %r387, %r386, 4, 31, -1; + shfl.sync.bfly.b32 %r388, %r385, 4, 31, -1; + cvt.u64.u32 %rd25, %r387; + cvt.u64.u32 %rd26, %r388; + shl.b64 %rd27, %rd26, 32; + or.b64 %rd28, %rd25, %rd27; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd29, %rd24, %rd28; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r389}, %rd29; + cvt.u32.u64 %r390, %rd29; + shfl.sync.bfly.b32 %r391, %r390, 2, 31, -1; + shfl.sync.bfly.b32 %r392, %r389, 2, 31, -1; + cvt.u64.u32 %rd30, %r391; + cvt.u64.u32 %rd31, %r392; + shl.b64 %rd32, %rd31, 32; + or.b64 %rd33, %rd30, %rd32; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd34, %rd29, %rd33; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r393}, %rd34; + cvt.u32.u64 %r394, %rd34; + shfl.sync.bfly.b32 %r395, %r394, 1, 31, -1; + shfl.sync.bfly.b32 %r396, %r393, 1, 31, -1; + cvt.u64.u32 %rd35, %r395; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd36, %rd34, %rd35; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + selp.b32 %r397, 1, 0, %p118; + shfl.sync.bfly.b32 %r398, %r397, 8, 31, -1; + shfl.sync.bfly.b32 %r399, %r383, 8, 31, -1; + cvt.u64.u32 %rd37, %r398; + cvt.u64.u32 %rd38, %r399; + shl.b64 %rd39, %rd38, 32; + or.b64 %rd40, %rd37, %rd39; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd41, %rd40, %rd19; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r400}, %rd41; + cvt.u32.u64 %r401, %rd41; + shfl.sync.bfly.b32 %r402, %r401, 4, 31, -1; + shfl.sync.bfly.b32 %r403, %r400, 4, 31, -1; + cvt.u64.u32 %rd42, %r402; + cvt.u64.u32 %rd43, %r403; + shl.b64 %rd44, %rd43, 32; + or.b64 %rd45, %rd42, %rd44; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd46, %rd41, %rd45; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r404}, %rd46; + cvt.u32.u64 %r405, %rd46; + shfl.sync.bfly.b32 %r406, %r405, 2, 31, -1; + shfl.sync.bfly.b32 %r407, %r404, 2, 31, -1; + cvt.u64.u32 %rd47, %r406; + cvt.u64.u32 %rd48, %r407; + shl.b64 %rd49, %rd48, 32; + or.b64 %rd50, %rd47, %rd49; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd2, %rd46, %rd50; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + mov.b64 {_, %r408}, %rd2; + cvt.u32.u64 %r409, %rd2; + shfl.sync.bfly.b32 %r6, %r409, 1, 31, -1; + shfl.sync.bfly.b32 %r7, %r408, 1, 31, -1; +$L__tmp6: + .loc 1 58 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:58:35 + and.pred %p119, %p2, %p62; + selp.b64 %rd51, 1, 0, %p119; + and.pred %p120, %p2, %p63; + selp.b64 %rd52, 1, 0, %p120; +$L__tmp7: + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + selp.b32 %r410, 1, 0, %p119; + shfl.sync.bfly.b32 %r411, %r410, 8, 31, -1; + shfl.sync.bfly.b32 %r412, %r383, 8, 31, -1; + cvt.u64.u32 %rd53, %r411; + cvt.u64.u32 %rd54, %r412; + shl.b64 %rd55, %rd54, 32; + or.b64 %rd56, %rd53, %rd55; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd57, %rd56, %rd51; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r413}, %rd57; + cvt.u32.u64 %r414, %rd57; + shfl.sync.bfly.b32 %r415, %r414, 4, 31, -1; + shfl.sync.bfly.b32 %r416, %r413, 4, 31, -1; + cvt.u64.u32 %rd58, %r415; + cvt.u64.u32 %rd59, %r416; + shl.b64 %rd60, %rd59, 32; + or.b64 %rd61, %rd58, %rd60; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd62, %rd57, %rd61; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r417}, %rd62; + cvt.u32.u64 %r418, %rd62; + shfl.sync.bfly.b32 %r419, %r418, 2, 31, -1; + shfl.sync.bfly.b32 %r420, %r417, 2, 31, -1; + cvt.u64.u32 %rd63, %r419; + cvt.u64.u32 %rd64, %r420; + shl.b64 %rd65, %rd64, 32; + or.b64 %rd66, %rd63, %rd65; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd3, %rd62, %rd66; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r421}, %rd3; + cvt.u32.u64 %r422, %rd3; + shfl.sync.bfly.b32 %r8, %r422, 1, 31, -1; + shfl.sync.bfly.b32 %r9, %r421, 1, 31, -1; + selp.b32 %r423, 1, 0, %p120; + shfl.sync.bfly.b32 %r424, %r423, 8, 31, -1; + shfl.sync.bfly.b32 %r425, %r383, 8, 31, -1; + cvt.u64.u32 %rd67, %r424; + cvt.u64.u32 %rd68, %r425; + shl.b64 %rd69, %rd68, 32; + or.b64 %rd70, %rd67, %rd69; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd71, %rd70, %rd52; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r426}, %rd71; + cvt.u32.u64 %r427, %rd71; + shfl.sync.bfly.b32 %r428, %r427, 4, 31, -1; + shfl.sync.bfly.b32 %r429, %r426, 4, 31, -1; + cvt.u64.u32 %rd72, %r428; + cvt.u64.u32 %rd73, %r429; + shl.b64 %rd74, %rd73, 32; + or.b64 %rd75, %rd72, %rd74; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd76, %rd71, %rd75; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r430}, %rd76; + cvt.u32.u64 %r431, %rd76; + shfl.sync.bfly.b32 %r432, %r431, 2, 31, -1; + shfl.sync.bfly.b32 %r433, %r430, 2, 31, -1; + cvt.u64.u32 %rd77, %r432; + cvt.u64.u32 %rd78, %r433; + shl.b64 %rd79, %rd78, 32; + or.b64 %rd80, %rd77, %rd79; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd4, %rd76, %rd80; + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + mov.b64 {_, %r434}, %rd4; + cvt.u32.u64 %r435, %rd4; + shfl.sync.bfly.b32 %r10, %r435, 1, 31, -1; + shfl.sync.bfly.b32 %r11, %r434, 1, 31, -1; +$L__tmp8: + .loc 1 60 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:60:21 + cvt.u32.u64 %r436, %rd36; + .loc 1 64 19 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:64:19 + setp.lt.s32 %p121, %r3, %r436; + .loc 1 66 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:66:35 + selp.b32 %r437, %r4, 16, %p121; + .loc 1 68 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:68:20 + add.s32 %r438, %r437, 17; + .loc 1 69 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:69:20 + setp.lt.s32 %p122, %r437, 0; + .loc 1 70 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:70:35 + selp.b32 %r12, %r438, %r437, %p122; + .loc 1 71 38 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:38 + setp.lt.u32 %p123, %r12, 17; + .loc 1 0 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 + setp.gt.u32 %p124, %r1, 31; + .loc 1 71 53 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:53 + or.pred %p125, %p124, %p123; + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + @%p125 bra $L__BB0_2; + bra.uni $L__BB0_1; +$L__BB0_2: +$L__tmp9: + .loc 1 18 0 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:18 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0 ] + add.s32 %r367, %r365, %r366; + add.s32 %r370, %r368, %r369; + add.s32 %r373, %r371, %r372; + add.s32 %r376, %r374, %r375; + setp.lt.s32 %p114, %r367, %r370; + setp.eq.b32 %p115, %r367, %r370; + setp.gt.s32 %p116, %r373, %r376; + xor.b32 %r377, %r373, %r376; + selp.b32 %r378, %r377, 0, %p116; + selp.b32 %r379, %r378, 0, %p115; + selp.b32 %r380, %r377, %r379, %p114; + xor.b32 %r5, %r380, %r364; +$L__tmp10: + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + cvt.u64.u32 %rd87, %r8; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd88, %rd3, %rd87; +$L__tmp11: + .loc 1 61 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:61:21 + cvt.u32.u64 %r439, %rd88; + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + bar.sync 0; + .loc 1 75 19 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:75:19 + setp.lt.s32 %p127, %r3, %r439; + .loc 1 76 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:76:35 + selp.b32 %r440, %r5, 16, %p127; + .loc 1 77 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:77:20 + add.s32 %r441, %r440, 17; + .loc 1 78 20 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:78:20 + setp.lt.s32 %p128, %r440, 0; + .loc 1 79 35 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:79:35 + selp.b32 %r13, %r441, %r440, %p128; + .loc 1 80 38 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:38 + setp.lt.u32 %p129, %r13, 17; + .loc 1 80 53 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:53 + or.pred %p130, %p124, %p129; + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + @%p130 bra $L__BB0_4; + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 0 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:0:63 + ld.param.b64 %rd10, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_6]; + ld.param.b64 %rd9, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_5]; + ld.param.b64 %rd8, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_4]; + ld.param.b64 %rd7, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_3]; + ld.param.b64 %rd6, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_2]; + ld.param.b64 %rd5, [triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2_param_1]; + cvt.s64.s32 %rd1, %r15; +$L__tmp12: + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + cvt.u64.u32 %rd101, %r10; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:59:26 ] + add.s64 %rd102, %rd4, %rd101; +$L__tmp13: + .loc 1 61 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:61:21 + cvt.u32.u64 %r443, %rd102; +$L__tmp14: + .loc 3 291 36 // standard.py:291:36 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + cvt.u64.u32 %rd103, %r6; + .loc 3 261 15 // standard.py:261:15 @[ cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:55:26 ] + add.s64 %rd104, %rd2, %rd103; +$L__tmp15: + .loc 1 60 21 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:60:21 + cvt.u32.u64 %r442, %rd104; + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + bar.sync 0; + .loc 1 81 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:81:25 + mul.wide.u32 %rd105, %r1, 4; + add.s64 %rd95, %rd5, %rd105; + .loc 1 81 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:81:37 + and.b32 %r448, %r2, 63; + setp.eq.b32 %p138, %r448, 0; + and.pred %p131, %p2, %p138; + // begin inline asm + @%p131 st.global.b32 [ %rd95 + 0 ], { %r442 }; + // end inline asm + .loc 1 82 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:82:25 + add.s64 %rd96, %rd6, %rd105; + .loc 1 82 37 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:82:37 + // begin inline asm + @%p131 st.global.b32 [ %rd96 + 0 ], { %r443 }; + // end inline asm + .loc 1 83 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:83:25 + shl.b64 %rd106, %rd1, 2; + add.s64 %rd97, %rd7, %rd106; + .loc 1 83 47 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:83:47 + and.b32 %r449, %r2, 48; + setp.eq.b32 %p139, %r449, 0; + and.pred %p133, %p2, %p139; + // begin inline asm + @%p133 st.global.b32 [ %rd97 + 0 ], { %r4 }; + // end inline asm + .loc 1 84 52 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:52 + mul.lo.s32 %r450, %r1, 17; + .loc 1 84 49 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:49 + add.s32 %r451, %r12, %r450; + .loc 1 84 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:25 + mad.wide.s32 %rd98, %r451, 4, %rd8; + mov.b32 %r445, 1; + .loc 1 84 85 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:84:85 + // begin inline asm + @%p133 st.global.b32 [ %rd98 + 0 ], { %r445 }; + // end inline asm + .loc 1 85 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:85:25 + add.s64 %rd99, %rd9, %rd106; + .loc 1 85 47 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:85:47 + // begin inline asm + @%p133 st.global.b32 [ %rd99 + 0 ], { %r5 }; + // end inline asm + .loc 1 86 49 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:49 + add.s32 %r452, %r13, %r450; + .loc 1 86 25 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:25 + mad.wide.s32 %rd100, %r452, 4, %rd10; + .loc 1 86 85 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:85 + // begin inline asm + @%p133 st.global.b32 [ %rd100 + 0 ], { %r445 }; + // end inline asm + .loc 1 86 4 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:86:4 + ret; +$L__BB0_1: + .loc 1 71 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:71:63 + { // callseq 0, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd81, assertFunc_0; + cvta.global.u64 %rd82, %rd81; + st.param.b64 [param3], %rd82; + mov.b64 %rd83, assertFile_0; + cvta.global.u64 %rd84, %rd83; + st.param.b64 [param1], %rd84; + mov.b64 %rd85, assertMessage_0; + cvta.global.u64 %rd86, %rd85; + st.param.b64 [param0], %rd86; + st.param.b64 [param4], 1; + st.param.b32 [param2], 71; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 0 + trap; +$L__BB0_3: + .loc 1 80 63 // cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py:80:63 + { // callseq 1, 0 + .param .b64 param0; + .param .b64 param1; + .param .b32 param2; + .param .b64 param3; + .param .b64 param4; + mov.b64 %rd89, assertFunc_1; + cvta.global.u64 %rd90, %rd89; + st.param.b64 [param3], %rd90; + mov.b64 %rd91, assertFile_1; + cvta.global.u64 %rd92, %rd91; + st.param.b64 [param1], %rd92; + mov.b64 %rd93, assertMessage_1; + cvta.global.u64 %rd94, %rd93; + st.param.b64 [param0], %rd94; + st.param.b64 [param4], 1; + st.param.b32 [param2], 80; + call.uni __assertfail, (param0, param1, param2, param3, param4); + } // callseq 1 + trap; +$L__tmp16: +$L__func_end0: + // -- End function +} + .file 1 "/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py" + .file 2 "/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py" + .file 3 "/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 // Abbreviation Code +.b8 17 // DW_TAG_compile_unit +.b8 1 // DW_CHILDREN_yes +.b8 37 // DW_AT_producer +.b8 8 // DW_FORM_string +.b8 19 // DW_AT_language +.b8 5 // DW_FORM_data2 +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 16 // DW_AT_stmt_list +.b8 6 // DW_FORM_data4 +.b8 27 // DW_AT_comp_dir +.b8 8 // DW_FORM_string +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 2 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 0 // DW_CHILDREN_no +.b8 3 // DW_AT_name +.b8 8 // DW_FORM_string +.b8 32 // DW_AT_inline +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 3 // Abbreviation Code +.b8 46 // DW_TAG_subprogram +.b8 1 // DW_CHILDREN_yes +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 4 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 87 // DW_AT_call_column +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 5 // Abbreviation Code +.b8 29 // DW_TAG_inlined_subroutine +.b8 0 // DW_CHILDREN_no +.b8 49 // DW_AT_abstract_origin +.b8 19 // DW_FORM_ref4 +.b8 17 // DW_AT_low_pc +.b8 1 // DW_FORM_addr +.b8 18 // DW_AT_high_pc +.b8 1 // DW_FORM_addr +.b8 88 // DW_AT_call_file +.b8 11 // DW_FORM_data1 +.b8 89 // DW_AT_call_line +.b8 11 // DW_FORM_data1 +.b8 0 // EOM(1) +.b8 0 // EOM(2) +.b8 0 // EOM(3) + } + .section .debug_info + { +.b32 399 // Length of Unit +.b8 2 // DWARF version number +.b8 0 +.b32 .debug_abbrev // Offset Into Abbrev. Section +.b8 8 // Address Size (in bytes) +.b8 1 // Abbrev [1] 0xb:0x188 DW_TAG_compile_unit +.b8 116 // DW_AT_producer +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 // DW_AT_language +.b8 0 +.b8 99 // DW_AT_name +.b8 102 +.b8 104 +.b8 119 +.b8 55 +.b8 106 +.b8 115 +.b8 114 +.b8 53 +.b8 115 +.b8 107 +.b8 55 +.b8 52 +.b8 107 +.b8 99 +.b8 104 +.b8 100 +.b8 104 +.b8 113 +.b8 111 +.b8 103 +.b8 51 +.b8 113 +.b8 121 +.b8 113 +.b8 55 +.b8 103 +.b8 101 +.b8 51 +.b8 113 +.b8 106 +.b8 113 +.b8 109 +.b8 109 +.b8 113 +.b8 119 +.b8 121 +.b8 51 +.b8 55 +.b8 108 +.b8 114 +.b8 50 +.b8 98 +.b8 101 +.b8 52 +.b8 108 +.b8 112 +.b8 121 +.b8 107 +.b8 122 +.b8 120 +.b8 51 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line // DW_AT_stmt_list +.b8 47 // DW_AT_comp_dir +.b8 119 +.b8 111 +.b8 114 +.b8 107 +.b8 115 +.b8 112 +.b8 97 +.b8 99 +.b8 101 +.b8 47 +.b8 104 +.b8 97 +.b8 110 +.b8 114 +.b8 117 +.b8 105 +.b8 47 +.b8 83 +.b8 112 +.b8 101 +.b8 99 +.b8 70 +.b8 111 +.b8 114 +.b8 103 +.b8 101 +.b8 45 +.b8 101 +.b8 120 +.b8 116 +.b8 47 +.b8 99 +.b8 97 +.b8 99 +.b8 104 +.b8 101 +.b8 47 +.b8 99 +.b8 111 +.b8 109 +.b8 112 +.b8 105 +.b8 108 +.b8 101 +.b8 100 +.b8 95 +.b8 107 +.b8 101 +.b8 114 +.b8 110 +.b8 101 +.b8 108 +.b8 115 +.b8 47 +.b8 102 +.b8 104 +.b8 0 +.b8 2 // Abbrev [2] 0x8b:0x7a DW_TAG_subprogram +.b8 116 // DW_AT_name +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 112 +.b8 101 +.b8 114 +.b8 95 +.b8 102 +.b8 117 +.b8 115 +.b8 101 +.b8 100 +.b8 95 +.b8 95 +.b8 116 +.b8 111 +.b8 95 +.b8 99 +.b8 111 +.b8 112 +.b8 121 +.b8 95 +.b8 97 +.b8 114 +.b8 97 +.b8 110 +.b8 103 +.b8 101 +.b8 95 +.b8 98 +.b8 105 +.b8 116 +.b8 119 +.b8 105 +.b8 115 +.b8 101 +.b8 95 +.b8 97 +.b8 110 +.b8 100 +.b8 95 +.b8 101 +.b8 113 +.b8 95 +.b8 103 +.b8 116 +.b8 95 +.b8 105 +.b8 110 +.b8 100 +.b8 101 +.b8 120 +.b8 95 +.b8 112 +.b8 117 +.b8 116 +.b8 95 +.b8 108 +.b8 116 +.b8 95 +.b8 110 +.b8 101 +.b8 119 +.b8 95 +.b8 122 +.b8 101 +.b8 114 +.b8 111 +.b8 115 +.b8 95 +.b8 115 +.b8 99 +.b8 97 +.b8 108 +.b8 97 +.b8 114 +.b8 95 +.b8 116 +.b8 101 +.b8 110 +.b8 115 +.b8 111 +.b8 114 +.b8 95 +.b8 115 +.b8 111 +.b8 114 +.b8 116 +.b8 95 +.b8 115 +.b8 117 +.b8 109 +.b8 95 +.b8 117 +.b8 110 +.b8 115 +.b8 113 +.b8 117 +.b8 101 +.b8 101 +.b8 122 +.b8 101 +.b8 95 +.b8 118 +.b8 105 +.b8 101 +.b8 119 +.b8 95 +.b8 119 +.b8 104 +.b8 101 +.b8 114 +.b8 101 +.b8 95 +.b8 50 +.b8 0 +.b8 1 // DW_AT_inline +.b8 3 // Abbrev [3] 0x105:0x8d DW_TAG_subprogram +.b64 $L__func_begin0 // DW_AT_low_pc +.b64 $L__func_end0 // DW_AT_high_pc +.b32 139 // DW_AT_abstract_origin +.b8 4 // Abbrev [4] 0x11a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp1 // DW_AT_low_pc +.b64 $L__tmp2 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 46 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x132:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp3 // DW_AT_low_pc +.b64 $L__tmp4 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 51 // DW_AT_call_line +.b8 71 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x14a:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp5 // DW_AT_low_pc +.b64 $L__tmp15 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 55 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 4 // Abbrev [4] 0x162:0x18 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp7 // DW_AT_low_pc +.b64 $L__tmp13 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 59 // DW_AT_call_line +.b8 26 // DW_AT_call_column +.b8 5 // Abbrev [5] 0x17a:0x17 DW_TAG_inlined_subroutine +.b32 139 // DW_AT_abstract_origin +.b64 $L__tmp9 // DW_AT_low_pc +.b64 $L__tmp10 // DW_AT_high_pc +.b8 1 // DW_AT_call_file +.b8 0 // DW_AT_call_line +.b8 0 // End Of Children Mark +.b8 0 // End Of Children Mark + } + .section .debug_macinfo { } diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source new file mode 100644 index 0000000000000000000000000000000000000000..7fe239bbbbfde849c05a56e8b84f27eea5c7cd6a --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.source @@ -0,0 +1,1397 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc90 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":640:0) +#loc94 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":607:0) +#loc102 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":518:0) +#loc140 = loc(unknown) +#loc165 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":285:0) +#loc169 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":260:0) +#loc174 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":86:0) +#loc178 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":63:0) +#loc187 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":131:0) +#loc192 = loc("in_ptr0"(#loc)) +#loc193 = loc("out_ptr4"(#loc)) +#loc194 = loc("out_ptr5"(#loc)) +#loc195 = loc("out_ptr6"(#loc)) +#loc196 = loc("out_ptr7"(#loc)) +#loc197 = loc("out_ptr8"(#loc)) +#loc198 = loc("out_ptr9"(#loc)) +#loc199 = loc("xnumel"(#loc)) +#loc200 = loc("r0_numel"(#loc)) +#loc255 = loc("x"(#loc90)) +#loc256 = loc("idxs"(#loc90)) +#loc257 = loc("x"(#loc94)) +#loc258 = loc("idxs"(#loc94)) +#loc263 = loc("x"(#loc102)) +#loc264 = loc("idxs"(#loc102)) +#loc265 = loc("flip"(#loc102)) +#loc321 = loc("input"(#loc165)) +#loc322 = loc("a"(#loc169)) +#loc323 = loc("b"(#loc169)) +#loc325 = loc("x"(#loc174)) +#loc326 = loc("x"(#loc178)) +#loc327 = loc("input"(#loc187)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %xnumel_0 = arith.constant 32 : i32 loc(#loc201) + %r0_numel_1 = arith.constant 16 : i32 loc(#loc202) + %xoffset = tt.get_program_id x : i32 loc(#loc203) + %xoffset_2 = arith.constant 1 : i32 loc(#loc204) + %xoffset_3 = arith.constant 1 : i32 loc(#loc204) + %xoffset_4 = arith.muli %xoffset, %xoffset_3 : i32 loc(#loc204) + %xindex = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc205) + %xindex_5 = tt.expand_dims %xindex {axis = 1 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc206) + %xindex_6 = tt.splat %xoffset_4 : i32 -> tensor<1x1xi32> loc(#loc207) + %xindex_7 = arith.addi %xindex_6, %xindex_5 : tensor<1x1xi32> loc(#loc207) + %xmask = arith.constant dense<32> : tensor<1x1xi32> loc(#loc208) + %xmask_8 = arith.cmpi slt, %xindex_7, %xmask : tensor<1x1xi32> loc(#loc208) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc209) + %r0_index_9 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc210) + %r0_offset = arith.constant 0 : i32 loc(#loc211) + %r0_mask = arith.constant true loc(#loc212) + %r0_mask_10 = arith.constant dense : tensor<1x16xi1> loc(#loc212) + %tmp0 = arith.constant 16 : i32 loc(#loc213) + %tmp0_11 = arith.constant 16 : i32 loc(#loc213) + %tmp0_12 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc213) + %tmp0_13 = arith.muli %tmp0_12, %xindex_7 : tensor<1x1xi32> loc(#loc213) + %tmp0_14 = tt.broadcast %tmp0_13 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc214) + %tmp0_15 = arith.addi %r0_index_9, %tmp0_14 : tensor<1x16xi32> loc(#loc214) + %tmp0_16 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc215) + %tmp0_17 = tt.addptr %tmp0_16, %tmp0_15 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc215) + %tmp0_18 = arith.constant 0.000000e+00 : f32 loc(#loc216) + %tmp0_19 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc216) + %tmp0_20 = arith.constant dense<0.000000e+00> : tensor<1x16xf32> loc(#loc216) + %tmp0_21 = arith.fptosi %tmp0_20 : tensor<1x16xf32> to tensor<1x16xi64> loc(#loc216) + %tmp0_22 = tt.load %tmp0_17, %tmp0_19, %tmp0_21 : tensor<1x16x!tt.ptr> loc(#loc216) + %tmp1 = arith.constant 0 : i64 loc(#loc217) + %tmp1_23 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc217) + %tmp2 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc218) + %tmp2_24 = arith.cmpi sgt, %tmp0_22, %tmp2 : tensor<1x16xi64> loc(#loc218) + %tmp3 = arith.constant 16384 : i64 loc(#loc219) + %tmp3_25 = arith.constant dense<16384> : tensor<1x1xi64> loc(#loc219) + %tmp4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc220) + %tmp4_26 = arith.cmpi slt, %tmp0_22, %tmp4 : tensor<1x16xi64> loc(#loc220) + %tmp5 = arith.andi %tmp2_24, %tmp4_26 : tensor<1x16xi1> loc(#loc221) + %tmp6 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc222) + %tmp7 = arith.extsi %tmp6 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc223) + %tmp9 = arith.trunci %r0_index_9 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc224) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp7, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc25) + %tmp14 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc225) + %tmp14_27 = arith.cmpi eq, %tmp0_22, %tmp14 : tensor<1x16xi64> loc(#loc225) + %tmp15 = arith.extui %tmp14_27 : tensor<1x16xi1> to tensor<1x16xi8> loc(#loc226) + %tmp16 = arith.extsi %tmp15 : tensor<1x16xi8> to tensor<1x16xi32> loc(#loc227) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%tmp16, %tmp9) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc29) + %tmp20 = arith.extsi %tmp7 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc228) + %tmp23 = arith.constant 0 : i32 loc(#loc229) + %tmp23_28 = arith.constant 0 : i64 loc(#loc229) + %tmp23_29 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc229) + %tmp23_30 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc229) + %tmp23_31 = arith.select %tmp23_30, %tmp20, %tmp23_29 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc229) + %tmp24 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp23_31) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc230) + %tmp24_32 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc231) + %tmp25 = arith.extsi %tmp16 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc232) + %tmp28 = arith.constant 0 : i32 loc(#loc233) + %tmp28_33 = arith.constant 0 : i64 loc(#loc233) + %tmp28_34 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc233) + %tmp28_35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc233) + %tmp28_36 = arith.select %tmp28_35, %tmp25, %tmp28_34 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc233) + %tmp29 = tt.call @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%tmp28_36) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc234) + %tmp29_37 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc235) + %tmp30 = arith.trunci %tmp24_32 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc236) + %tmp31 = arith.trunci %tmp29_37 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc237) + %tmp32 = arith.extsi %0#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc238) + %tmp33 = arith.trunci %tmp32 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc239) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc240) + %tmp34_38 = arith.cmpi slt, %r0_index_9, %tmp34 : tensor<1x16xi32> loc(#loc240) + %tmp35 = arith.constant 16 : i32 loc(#loc241) + %tmp35_39 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc241) + %tmp36 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc242) + %tmp36_40 = arith.select %tmp34_38, %tmp33, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc242) + %tmp37 = arith.constant 17 : i32 loc(#loc243) + %tmp37_41 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc243) + %tmp38 = arith.addi %tmp36_40, %tmp37_41 : tensor<1x16xi32> loc(#loc244) + %tmp39 = arith.constant 0 : i32 loc(#loc245) + %tmp39_42 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc245) + %tmp39_43 = arith.cmpi slt, %tmp36_40, %tmp39_42 : tensor<1x16xi32> loc(#loc245) + %tmp40 = arith.select %tmp39_43, %tmp38, %tmp36_40 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc246) + %c0_i32 = arith.constant 0 : i32 loc(#loc49) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc49) + %2 = arith.cmpi sle, %cst, %tmp40 : tensor<1x16xi32> loc(#loc49) + %c17_i32 = arith.constant 17 : i32 loc(#loc50) + %cst_44 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc50) + %3 = arith.cmpi slt, %tmp40, %cst_44 : tensor<1x16xi32> loc(#loc50) + %4 = arith.andi %2, %3 : tensor<1x16xi1> loc(#loc51) + %true = arith.constant true loc(#loc52) + %cst_45 = arith.constant dense : tensor<1x1xi1> loc(#loc52) + %5 = arith.xori %xmask_8, %cst_45 : tensor<1x1xi1> loc(#loc52) + %6 = tt.broadcast %5 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc53) + %7 = arith.ori %4, %6 : tensor<1x16xi1> loc(#loc53) + tt.assert %7, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc54) + %tmp42 = arith.constant 1 : i32 loc(#loc247) + %tmp42_46 = arith.constant dense<1> : tensor<1x1xi32> loc(#loc247) + %tmp43 = arith.extsi %1#1 : tensor<1x16xi32> to tensor<1x16xi64> loc(#loc248) + %tmp44 = arith.trunci %tmp43 : tensor<1x16xi64> to tensor<1x16xi32> loc(#loc249) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc250) + %tmp45_47 = arith.cmpi slt, %r0_index_9, %tmp45 : tensor<1x16xi32> loc(#loc250) + %tmp46 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc251) + %tmp46_48 = arith.select %tmp45_47, %tmp44, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc251) + %tmp47 = arith.addi %tmp46_48, %tmp37_41 : tensor<1x16xi32> loc(#loc252) + %tmp48 = arith.constant 0 : i32 loc(#loc253) + %tmp48_49 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc253) + %tmp48_50 = arith.cmpi slt, %tmp46_48, %tmp48_49 : tensor<1x16xi32> loc(#loc253) + %tmp49 = arith.select %tmp48_50, %tmp47, %tmp46_48 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc254) + %c0_i32_51 = arith.constant 0 : i32 loc(#loc63) + %cst_52 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc63) + %8 = arith.cmpi sle, %cst_52, %tmp49 : tensor<1x16xi32> loc(#loc63) + %c17_i32_53 = arith.constant 17 : i32 loc(#loc64) + %cst_54 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc64) + %9 = arith.cmpi slt, %tmp49, %cst_54 : tensor<1x16xi32> loc(#loc64) + %10 = arith.andi %8, %9 : tensor<1x16xi1> loc(#loc65) + %true_55 = arith.constant true loc(#loc66) + %cst_56 = arith.constant dense : tensor<1x1xi1> loc(#loc66) + %11 = arith.xori %xmask_8, %cst_56 : tensor<1x1xi1> loc(#loc66) + %12 = tt.broadcast %11 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc67) + %13 = arith.ori %10, %12 : tensor<1x16xi1> loc(#loc67) + tt.assert %13, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc68) + %14 = tt.splat %out_ptr4 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc69) + %15 = tt.addptr %14, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc69) + tt.store %15, %tmp30, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc70) + %16 = tt.splat %out_ptr5 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc71) + %17 = tt.addptr %16, %xindex_7 : tensor<1x1x!tt.ptr>, tensor<1x1xi32> loc(#loc71) + tt.store %17, %tmp31, %xmask_8 : tensor<1x1x!tt.ptr> loc(#loc72) + %c16_i32 = arith.constant 16 : i32 loc(#loc73) + %c16_i32_57 = arith.constant 16 : i32 loc(#loc73) + %cst_58 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc73) + %18 = arith.muli %cst_58, %xindex_7 : tensor<1x1xi32> loc(#loc73) + %19 = tt.broadcast %18 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc74) + %20 = arith.addi %r0_index_9, %19 : tensor<1x16xi32> loc(#loc74) + %21 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc75) + %22 = tt.addptr %21, %20 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc75) + %23 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc76) + tt.store %22, %tmp33, %23 : tensor<1x16x!tt.ptr> loc(#loc76) + %c17_i32_59 = arith.constant 17 : i32 loc(#loc77) + %c17_i32_60 = arith.constant 17 : i32 loc(#loc77) + %cst_61 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc77) + %24 = arith.muli %cst_61, %xindex_7 : tensor<1x1xi32> loc(#loc77) + %25 = tt.broadcast %24 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc78) + %26 = arith.addi %tmp40, %25 : tensor<1x16xi32> loc(#loc78) + %27 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc79) + %28 = tt.addptr %27, %26 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc79) + %cst_62 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc80) + %29 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc80) + tt.store %28, %cst_62, %29 : tensor<1x16x!tt.ptr> loc(#loc80) + %c16_i32_63 = arith.constant 16 : i32 loc(#loc81) + %c16_i32_64 = arith.constant 16 : i32 loc(#loc81) + %cst_65 = arith.constant dense<16> : tensor<1x1xi32> loc(#loc81) + %30 = arith.muli %cst_65, %xindex_7 : tensor<1x1xi32> loc(#loc81) + %31 = tt.broadcast %30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc82) + %32 = arith.addi %r0_index_9, %31 : tensor<1x16xi32> loc(#loc82) + %33 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc83) + %34 = tt.addptr %33, %32 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc83) + %35 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc84) + tt.store %34, %tmp44, %35 : tensor<1x16x!tt.ptr> loc(#loc84) + %c17_i32_66 = arith.constant 17 : i32 loc(#loc85) + %c17_i32_67 = arith.constant 17 : i32 loc(#loc85) + %cst_68 = arith.constant dense<17> : tensor<1x1xi32> loc(#loc85) + %36 = arith.muli %cst_68, %xindex_7 : tensor<1x1xi32> loc(#loc85) + %37 = tt.broadcast %36 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc86) + %38 = arith.addi %tmp49, %37 : tensor<1x16xi32> loc(#loc86) + %39 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc87) + %40 = tt.addptr %39, %38 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc87) + %cst_69 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc88) + %41 = tt.broadcast %xmask_8 : tensor<1x1xi1> -> tensor<1x16xi1> loc(#loc88) + tt.store %40, %cst_69, %41 : tensor<1x16x!tt.ptr> loc(#loc88) + tt.return loc(#loc89) + } loc(#loc) + tt.func private @"torch._inductor.runtime.triton_helpers.sort_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc90)), %idxs: tensor<1x16xi16> loc("idxs"(#loc90))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs) : (tensor<1x16xi32>, tensor<1x16xi16>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1) : (tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc91) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc92) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc93) + %5 = ub.poison : tensor<1x16xi32> loc(#loc93) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc93) + } loc(#loc90) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i16S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_1__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi16> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi16>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %0#0, %0#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc101) + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %1, %2 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i16S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi16> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc280) + %left_idx = arith.trunci %left_mask_4 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc281) + %left_idx_15 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc282) + %left_idx_16 = arith.muli %y_idx, %left_idx_15 : tensor<8x2x1xi16> loc(#loc282) + %left_idx_17 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_16) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_18 = tt.expand_dims %left_idx_17 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_19 = tt.broadcast %left_idx_18 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = arith.trunci %right_mask_1 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc286) + %right_idx_20 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc287) + %right_idx_21 = arith.muli %y_idx, %right_idx_20 : tensor<8x2x1xi16> loc(#loc287) + %right_idx_22 = tt.call @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_21) : (tensor<8x2x1xi16>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_23 = tt.expand_dims %right_idx_22 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_24 = tt.broadcast %right_idx_23 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_25 = tt.reshape %left_idx_19 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_26 = tt.reshape %right_idx_24 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_27 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_28 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_49 = arith.constant true loc(#loc298) + %cond_50 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_51 = arith.xori %left_isnan, %cond_50 : tensor<1x16xi1> loc(#loc298) + %cond_52 = arith.andi %right_isnan, %cond_51 : tensor<1x16xi1> loc(#loc299) + %cond_53 = arith.ori %cond, %cond_52 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_53 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_49 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_50 = arith.ori %eq, %eq_49 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_50 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_29 = arith.cmpi sgt, %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc304) + %cond_30 = arith.andi %3, %cond_29 : tensor<1x16xi1> loc(#loc305) + %cond_31 = arith.ori %1, %cond_30 : tensor<1x16xi1> loc(#loc306) + %cond_32 = arith.cmpi ugt, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc307) + %cond_33 = arith.cmpi eq, %right_valid_mask_28, %left_valid_mask_27 : tensor<1x16xi1> loc(#loc308) + %cond_34 = arith.andi %cond_33, %cond_31 : tensor<1x16xi1> loc(#loc309) + %cond_35 = arith.ori %cond_32, %cond_34 : tensor<1x16xi1> loc(#loc310) + %cond_36 = arith.extui %cond_35 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_37 = arith.xori %cond_36, %flip : tensor<1x16xi32> loc(#loc311) + %cond_38 = arith.constant 0 : i32 loc(#loc312) + %cond_39 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_40 = arith.cmpi ne, %cond_37, %cond_39 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_41 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_42 = arith.select %cond_40, %ret, %ret_41 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_43 = arith.xori %x, %ret_42 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_25, %right_idx_26 : tensor<1x16xi32> loc(#loc317) + %new_idxs_44 = tt.call @triton.language.standard.zeros_like__i16S1_16S__(%idxs) : (tensor<1x16xi16>) -> tensor<1x16xi16> loc(#loc318) + %new_idxs_45 = arith.extsi %new_idxs_44 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc319) + %new_idxs_46 = arith.select %cond_40, %new_idxs, %new_idxs_45 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_47 = arith.extsi %idxs : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc320) + %new_idxs_48 = arith.xori %new_idxs_47, %new_idxs_46 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_43, %new_idxs_48 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi32> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i32_i32__(%a: i32 loc("a"(#loc169)), %b: i32 loc("b"(#loc169))) -> i32 attributes {noinline = false} { + %0 = arith.addi %a, %b : i32 loc(#loc170) + tt.return %0 : i32 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i32 loc(#loc172) + tt.return %1 : i32 loc(#loc172) + } loc(#loc169) + tt.func private @"triton.language.standard.sum__i16S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<8x2x1xi16> loc("input"(#loc165))) -> tensor<8x1xi32> attributes {noinline = false} { + %input_0 = arith.extsi %input : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc324) + %0 = "tt.reduce"(%input_0) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc166) + tt.return %0 : tensor<8x1xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<8x1xi32> loc(#loc168) + tt.return %1 : tensor<8x1xi32> loc(#loc168) + } loc(#loc165) + tt.func private @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc174))) -> i1 attributes {noinline = false} { + %0 = tt.call @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc175) + %false = arith.constant false loc(#loc176) + tt.return %false : i1 loc(#loc176) + ^bb1: // no predecessors + %1 = ub.poison : i1 loc(#loc177) + tt.return %1 : i1 loc(#loc177) + } loc(#loc174) + tt.func private @torch._inductor.runtime.triton_helpers.promote_to_tensor__i32S1_16S__(%x: tensor<1x16xi32> loc("x"(#loc178))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc179) + %1 = arith.extui %0 : tensor<1xi1> to tensor<1xi32> loc(#loc180) + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<1xi32> -> tensor<1x1xi32> loc(#loc180) + %3 = tt.broadcast %2 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc180) + %4 = arith.addi %x, %3 : tensor<1x16xi32> loc(#loc180) + tt.return %4 : tensor<1x16xi32> loc(#loc181) + ^bb1: // no predecessors + %5 = ub.poison : tensor<1x16xi32> loc(#loc182) + tt.return %5 : tensor<1x16xi32> loc(#loc182) + } loc(#loc178) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(1,)cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} { + %false = arith.constant false loc(#loc184) + %cst = arith.constant dense : tensor<1xi1> loc(#loc184) + tt.return %cst : tensor<1xi1> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1xi1> loc(#loc186) + tt.return %0 : tensor<1xi1> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i32S1_16S__(%input: tensor<1x16xi32> loc("input"(#loc187))) -> tensor<1x16xi32> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() : () -> tensor<1x16xi32> loc(#loc188) + tt.return %0 : tensor<1x16xi32> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi32> loc(#loc190) + tt.return %1 : tensor<1x16xi32> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int32_"() -> tensor<1x16xi32> attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi32> loc(#loc184) + tt.return %cst : tensor<1x16xi32> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi32> loc(#loc186) + tt.return %0 : tensor<1x16xi32> loc(#loc186) + } loc(#loc183) + tt.func private @triton.language.standard.zeros_like__i16S1_16S__(%input: tensor<1x16xi16> loc("input"(#loc187))) -> tensor<1x16xi16> attributes {noinline = false} { + %0 = tt.call @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() : () -> tensor<1x16xi16> loc(#loc188) + tt.return %0 : tensor<1x16xi16> loc(#loc189) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x16xi16> loc(#loc190) + tt.return %1 : tensor<1x16xi16> loc(#loc190) + } loc(#loc187) + tt.func private @"triton.language.standard.zeros____(0, 0)cconstexpr_1__(0, 1)cconstexpr_16__(1,)cconstexpr_int16_"() -> tensor<1x16xi16> attributes {noinline = false} { + %c0_i16 = arith.constant 0 : i16 loc(#loc184) + %cst = arith.constant dense<0> : tensor<1x16xi16> loc(#loc184) + tt.return %cst : tensor<1x16xi16> loc(#loc185) + ^bb1: // no predecessors + %0 = ub.poison : tensor<1x16xi16> loc(#loc186) + tt.return %0 : tensor<1x16xi16> loc(#loc186) + } loc(#loc183) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_2__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %1#0, %1#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %2 = ub.poison : tensor<1x16xi32> loc(#loc101) + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %2, %3 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<4x2x2xi32> loc("input"(#loc165))) -> tensor<4x2xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc166) + tt.return %0 : tensor<4x2xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<4x2xi32> loc(#loc168) + tt.return %1 : tensor<4x2xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_3__(4,)cconstexpr_True__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc259) + %flip_0 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc260) + %flip_1 = tt.expand_dims %flip_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc260) + %flip_2 = tt.broadcast %flip_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc261) + %flip_3 = tt.reshape %flip_2 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc262) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip_3) : (tensor<1x16xi32>, tensor<1x16xi32>, tensor<1x16xi32>) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %2#0, %2#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %3 = ub.poison : tensor<1x16xi32> loc(#loc101) + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %3, %4 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: tensor<1x16xi32> loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_45 = arith.constant true loc(#loc298) + %cond_46 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_47 = arith.xori %left_isnan, %cond_46 : tensor<1x16xi1> loc(#loc298) + %cond_48 = arith.andi %right_isnan, %cond_47 : tensor<1x16xi1> loc(#loc299) + %cond_49 = arith.ori %cond, %cond_48 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_49 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_45 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_46 = arith.ori %eq, %eq_45 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_46 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = arith.extui %cond_33 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc311) + %cond_35 = arith.xori %cond_34, %flip : tensor<1x16xi32> loc(#loc311) + %cond_36 = arith.constant 0 : i32 loc(#loc312) + %cond_37 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc312) + %cond_38 = arith.cmpi ne, %cond_35, %cond_37 : tensor<1x16xi32> loc(#loc312) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_40 = arith.select %cond_38, %ret, %ret_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_41 = arith.xori %x, %ret_40 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_42 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_43 = arith.select %cond_38, %new_idxs, %new_idxs_42 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_44 = arith.xori %idxs, %new_idxs_43 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_41, %new_idxs_44 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<2x2x4xi32> loc("input"(#loc165))) -> tensor<2x4xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc166) + tt.return %0 : tensor<2x4xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<2x4xi32> loc(#loc168) + tt.return %1 : tensor<2x4xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._bitonic_merge_with_index__i32S1_16S_i32S1_16S__(2,)cconstexpr_None__(3,)cconstexpr_4__(4,)cconstexpr_False__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc94)), %idxs: tensor<1x16xi32> loc("idxs"(#loc94))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %flip = arith.constant false loc(#loc328) + %0:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x, %idxs, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %1:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%0#0, %0#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %2:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%1#0, %1#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + %3:2 = tt.call @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%2#0, %2#1, %flip) : (tensor<1x16xi32>, tensor<1x16xi32>, i1) -> (tensor<1x16xi32>, tensor<1x16xi32>) loc(#loc99) + tt.return %3#0, %3#1 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc100) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc101) + %5 = ub.poison : tensor<1x16xi32> loc(#loc101) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc101) + } loc(#loc94) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_0__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<1x2x8xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<1x2x8xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<1x2x8xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<1x2x8xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i32S1_2_8S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x2x8xi32> loc("input"(#loc165))) -> tensor<1x8xi32> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i32 loc(unknown), %arg2: i32 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i32_i32__(%arg1, %arg2) : (i32, i32) -> i32 loc(#loc166) + tt.reduce.return %2 : i32 loc(#loc166) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc166) + tt.return %0 : tensor<1x8xi32> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1x8xi32> loc(#loc168) + tt.return %1 : tensor<1x8xi32> loc(#loc168) + } loc(#loc165) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_1__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<2x2x4xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<2x2x4xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<2x2x4xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<2x2x4xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S2_2_4S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_2__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<4x2x2xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<4x2x2xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<4x2x2xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<4x2x2xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S4_2_2S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"torch._inductor.runtime.triton_helpers._compare_and_swap_with_index__i32S1_16S_i32S1_16S_u1__(2,)cconstexpr_None__(4,)cconstexpr_3__(5,)cconstexpr_4__(6,)cconstexpr_True__(7,)cconstexpr_True_"(%x: tensor<1x16xi32> loc("x"(#loc102)), %idxs: tensor<1x16xi32> loc("idxs"(#loc102)), %flip: i1 loc("flip"(#loc102))) -> (tensor<1x16xi32>, tensor<1x16xi32>) attributes {noinline = false} { + %y = tt.reshape %x : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc266) + %right_mask = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc267) + %right_mask_0 = tt.expand_dims %right_mask {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc268) + %right_mask_1 = tt.expand_dims %right_mask_0 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc268) + %left_mask = arith.constant 1 : i32 loc(#loc269) + %left_mask_2 = arith.constant 1 : i32 loc(#loc269) + %left_mask_3 = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc269) + %left_mask_4 = arith.subi %left_mask_3, %right_mask_1 : tensor<1x2x1xi32> loc(#loc269) + %ileft = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc270) + %ileft_5 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc270) + %ileft_6 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%ileft_5) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc271) + %ileft_7 = tt.expand_dims %ileft_6 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc272) + %ileft_8 = tt.broadcast %ileft_7 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc273) + %iright = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc274) + %iright_9 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc274) + %iright_10 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%iright_9) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc275) + %iright_11 = tt.expand_dims %iright_10 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc276) + %iright_12 = tt.broadcast %iright_11 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc277) + %ileft_13 = tt.reshape %ileft_8 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc278) + %iright_14 = tt.reshape %iright_12 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc279) + %y_idx = tt.reshape %idxs : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc280) + %left_idx = tt.broadcast %left_mask_4 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc282) + %left_idx_15 = arith.muli %y_idx, %left_idx : tensor<8x2x1xi32> loc(#loc282) + %left_idx_16 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%left_idx_15) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc283) + %left_idx_17 = tt.expand_dims %left_idx_16 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc284) + %left_idx_18 = tt.broadcast %left_idx_17 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc285) + %right_idx = tt.broadcast %right_mask_1 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc287) + %right_idx_19 = arith.muli %y_idx, %right_idx : tensor<8x2x1xi32> loc(#loc287) + %right_idx_20 = tt.call @"triton.language.standard.sum__i32S8_2_1S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%right_idx_19) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc288) + %right_idx_21 = tt.expand_dims %right_idx_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %right_idx_22 = tt.broadcast %right_idx_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %left_idx_23 = tt.reshape %left_idx_18 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc291) + %right_idx_24 = tt.reshape %right_idx_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc292) + %left_valid_mask = arith.constant true loc(#loc293) + %left_valid_mask_25 = arith.constant dense : tensor<1x16xi1> loc(#loc293) + %right_valid_mask = arith.constant true loc(#loc294) + %right_valid_mask_26 = arith.constant dense : tensor<1x16xi1> loc(#loc294) + %left_isnan = arith.cmpi ne, %ileft_13, %ileft_13 : tensor<1x16xi32> loc(#loc295) + %right_isnan = arith.cmpi ne, %iright_14, %iright_14 : tensor<1x16xi32> loc(#loc296) + %cond = arith.cmpi slt, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc329) + %0 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc135) + %1 = scf.if %0 -> (tensor<1x16xi1>) { + %cond_42 = arith.constant true loc(#loc298) + %cond_43 = arith.constant dense : tensor<1x16xi1> loc(#loc298) + %cond_44 = arith.xori %left_isnan, %cond_43 : tensor<1x16xi1> loc(#loc298) + %cond_45 = arith.andi %right_isnan, %cond_44 : tensor<1x16xi1> loc(#loc299) + %cond_46 = arith.ori %cond, %cond_45 : tensor<1x16xi1> loc(#loc330) + scf.yield %cond_46 : tensor<1x16xi1> loc(#loc330) + } else { + scf.yield %cond : tensor<1x16xi1> loc(#loc140) + } loc(#loc136) + %eq = arith.cmpi eq, %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc331) + %2 = tt.call @torch._inductor.runtime.triton_helpers.is_floating__i32S1_16S__(%ileft_13) : (tensor<1x16xi32>) -> i1 loc(#loc142) + %3 = scf.if %2 -> (tensor<1x16xi1>) { + %eq_42 = arith.andi %left_isnan, %right_isnan : tensor<1x16xi1> loc(#loc302) + %eq_43 = arith.ori %eq, %eq_42 : tensor<1x16xi1> loc(#loc332) + scf.yield %eq_43 : tensor<1x16xi1> loc(#loc332) + } else { + scf.yield %eq : tensor<1x16xi1> loc(#loc140) + } loc(#loc143) + %cond_27 = arith.cmpi sgt, %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc304) + %cond_28 = arith.andi %3, %cond_27 : tensor<1x16xi1> loc(#loc305) + %cond_29 = arith.ori %1, %cond_28 : tensor<1x16xi1> loc(#loc306) + %cond_30 = arith.cmpi ugt, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc307) + %cond_31 = arith.cmpi eq, %right_valid_mask_26, %left_valid_mask_25 : tensor<1x16xi1> loc(#loc308) + %cond_32 = arith.andi %cond_31, %cond_29 : tensor<1x16xi1> loc(#loc309) + %cond_33 = arith.ori %cond_30, %cond_32 : tensor<1x16xi1> loc(#loc310) + %cond_34 = tt.splat %flip : i1 -> tensor<1x16xi1> loc(#loc311) + %cond_35 = arith.xori %cond_33, %cond_34 : tensor<1x16xi1> loc(#loc311) + %ret = arith.xori %ileft_13, %iright_14 : tensor<1x16xi32> loc(#loc313) + %ret_36 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%x) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc314) + %ret_37 = arith.select %cond_35, %ret, %ret_36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc315) + %ret_38 = arith.xori %x, %ret_37 : tensor<1x16xi32> loc(#loc316) + %new_idxs = arith.xori %left_idx_23, %right_idx_24 : tensor<1x16xi32> loc(#loc317) + %new_idxs_39 = tt.call @triton.language.standard.zeros_like__i32S1_16S__(%idxs) : (tensor<1x16xi32>) -> tensor<1x16xi32> loc(#loc318) + %new_idxs_40 = arith.select %cond_35, %new_idxs, %new_idxs_39 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc319) + %new_idxs_41 = arith.xori %idxs, %new_idxs_40 : tensor<1x16xi32> loc(#loc320) + tt.return %ret_38, %new_idxs_41 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc163) + ^bb1: // no predecessors + %4 = ub.poison : tensor<1x16xi32> loc(#loc164) + %5 = ub.poison : tensor<1x16xi32> loc(#loc164) + tt.return %4, %5 : tensor<1x16xi32>, tensor<1x16xi32> loc(#loc164) + } loc(#loc102) + tt.func private @"triton.language.standard.sum__i64S1_16S__(1,)cconstexpr_1__(2,)cconstexpr_False__(3,)cNone"(%input: tensor<1x16xi64> loc("input"(#loc165))) -> tensor<1xi64> attributes {noinline = false} { + %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%arg1: i64 loc(unknown), %arg2: i64 loc(unknown)): + %2 = tt.call @triton.language.standard._sum_combine__i64_i64__(%arg1, %arg2) : (i64, i64) -> i64 loc(#loc166) + tt.reduce.return %2 : i64 loc(#loc166) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc166) + tt.return %0 : tensor<1xi64> loc(#loc167) + ^bb1: // no predecessors + %1 = ub.poison : tensor<1xi64> loc(#loc168) + tt.return %1 : tensor<1xi64> loc(#loc168) + } loc(#loc165) + tt.func private @triton.language.standard._sum_combine__i64_i64__(%a: i64 loc("a"(#loc169)), %b: i64 loc("b"(#loc169))) -> i64 attributes {noinline = false} { + %0 = arith.addi %a, %b : i64 loc(#loc170) + tt.return %0 : i64 loc(#loc171) + ^bb1: // no predecessors + %1 = ub.poison : i64 loc(#loc172) + tt.return %1 : i64 loc(#loc172) + } loc(#loc169) +} loc(#loc) +#loc1 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":19:13) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":20:15) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:33) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:36) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:44) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":25:23) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:28) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":28:16) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":29:48) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc16 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":35:30) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc19 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":37:34) +#loc20 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc21 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc22 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc23 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc24 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc25 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc26 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc27 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc28 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc29 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc30 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc31 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc32 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc33 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc34 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc35 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc36 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc37 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc38 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc39 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc40 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":62:21) +#loc41 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":63:21) +#loc42 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc43 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":65:32) +#loc44 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc45 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":67:44) +#loc46 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc47 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc48 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc49 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc50 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc51 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc52 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc53 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc54 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc55 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":72:31) +#loc56 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":73:21) +#loc57 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":74:21) +#loc58 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc59 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc60 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc61 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:55) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:35) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:32) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:35) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:32) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:52) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc91 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc92 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:11) +#loc93 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":668:4) +#loc95 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc96 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc97 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc98 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc99 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc100 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:11) +#loc101 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":636:4) +#loc103 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc104 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:30) +#loc105 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":536:33) +#loc106 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc107 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc108 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc109 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc110 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc111 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc112 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc113 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc114 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc115 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc116 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc117 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc118 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc119 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc120 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc121 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc122 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc123 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc124 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc125 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc126 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc127 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc128 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc129 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc130 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":558:49) +#loc131 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":559:50) +#loc132 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":570:25) +#loc133 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":571:27) +#loc134 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc135 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:23) +#loc136 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":575:11) +#loc137 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:47) +#loc138 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:46) +#loc139 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":579:31) +#loc141 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc142 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:23) +#loc143 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":592:11) +#loc144 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:36) +#loc145 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":593:23) +#loc146 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc147 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc148 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc149 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":596:31) +#loc150 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:29) +#loc151 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:48) +#loc152 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":597:8) +#loc153 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc154 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc155 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc156 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:60) +#loc157 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc158 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc159 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc160 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:73) +#loc161 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc162 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc163 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:11) +#loc164 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":603:4) +#loc166 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc167 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:11) +#loc168 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:4) +#loc170 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc171 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:11) +#loc172 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:4) +#loc173 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc175 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:29) +#loc176 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:11) +#loc177 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":87:4) +#loc179 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:30) +#loc180 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:15) +#loc181 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:11) +#loc182 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":65:4) +#loc183 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":118:0) +#loc184 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:31) +#loc185 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:11) +#loc186 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":127:4) +#loc188 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:30) +#loc189 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:11) +#loc190 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":138:4) +#loc191 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":630:15) +#loc201 = loc("xnumel"(#loc1)) +#loc202 = loc("r0_numel"(#loc2)) +#loc203 = loc("xoffset"(#loc3)) +#loc204 = loc("xoffset"(#loc4)) +#loc205 = loc("xindex"(#loc5)) +#loc206 = loc("xindex"(#loc6)) +#loc207 = loc("xindex"(#loc7)) +#loc208 = loc("xmask"(#loc8)) +#loc209 = loc("r0_index"(#loc9)) +#loc210 = loc("r0_index"(#loc10)) +#loc211 = loc("r0_offset"(#loc11)) +#loc212 = loc("r0_mask"(#loc12)) +#loc213 = loc("tmp0"(#loc13)) +#loc214 = loc("tmp0"(#loc14)) +#loc215 = loc("tmp0"(#loc15)) +#loc216 = loc("tmp0"(#loc16)) +#loc217 = loc("tmp1"(#loc17)) +#loc218 = loc("tmp2"(#loc18)) +#loc219 = loc("tmp3"(#loc19)) +#loc220 = loc("tmp4"(#loc20)) +#loc221 = loc("tmp5"(#loc21)) +#loc222 = loc("tmp6"(#loc22)) +#loc223 = loc("tmp7"(#loc23)) +#loc224 = loc("tmp9"(#loc24)) +#loc225 = loc("tmp14"(#loc26)) +#loc226 = loc("tmp15"(#loc27)) +#loc227 = loc("tmp16"(#loc28)) +#loc228 = loc("tmp20"(#loc30)) +#loc229 = loc("tmp23"(#loc31)) +#loc230 = loc("tmp24"(#loc32)) +#loc231 = loc("tmp24"(#loc33)) +#loc232 = loc("tmp25"(#loc34)) +#loc233 = loc("tmp28"(#loc35)) +#loc234 = loc("tmp29"(#loc36)) +#loc235 = loc("tmp29"(#loc37)) +#loc236 = loc("tmp30"(#loc38)) +#loc237 = loc("tmp31"(#loc39)) +#loc238 = loc("tmp32"(#loc40)) +#loc239 = loc("tmp33"(#loc41)) +#loc240 = loc("tmp34"(#loc42)) +#loc241 = loc("tmp35"(#loc43)) +#loc242 = loc("tmp36"(#loc44)) +#loc243 = loc("tmp37"(#loc45)) +#loc244 = loc("tmp38"(#loc46)) +#loc245 = loc("tmp39"(#loc47)) +#loc246 = loc("tmp40"(#loc48)) +#loc247 = loc("tmp42"(#loc55)) +#loc248 = loc("tmp43"(#loc56)) +#loc249 = loc("tmp44"(#loc57)) +#loc250 = loc("tmp45"(#loc58)) +#loc251 = loc("tmp46"(#loc59)) +#loc252 = loc("tmp47"(#loc60)) +#loc253 = loc("tmp48"(#loc61)) +#loc254 = loc("tmp49"(#loc62)) +#loc259 = loc("flip"(#loc95)) +#loc260 = loc("flip"(#loc96)) +#loc261 = loc("flip"(#loc97)) +#loc262 = loc("flip"(#loc98)) +#loc266 = loc("y"(#loc103)) +#loc267 = loc("right_mask"(#loc104)) +#loc268 = loc("right_mask"(#loc105)) +#loc269 = loc("left_mask"(#loc106)) +#loc270 = loc("ileft"(#loc107)) +#loc271 = loc("ileft"(#loc108)) +#loc272 = loc("ileft"(#loc109)) +#loc273 = loc("ileft"(#loc110)) +#loc274 = loc("iright"(#loc111)) +#loc275 = loc("iright"(#loc112)) +#loc276 = loc("iright"(#loc113)) +#loc277 = loc("iright"(#loc114)) +#loc278 = loc("ileft"(#loc115)) +#loc279 = loc("iright"(#loc116)) +#loc280 = loc("y_idx"(#loc117)) +#loc281 = loc("left_idx"(#loc118)) +#loc282 = loc("left_idx"(#loc119)) +#loc283 = loc("left_idx"(#loc120)) +#loc284 = loc("left_idx"(#loc121)) +#loc285 = loc("left_idx"(#loc122)) +#loc286 = loc("right_idx"(#loc123)) +#loc287 = loc("right_idx"(#loc124)) +#loc288 = loc("right_idx"(#loc125)) +#loc289 = loc("right_idx"(#loc126)) +#loc290 = loc("right_idx"(#loc127)) +#loc291 = loc("left_idx"(#loc128)) +#loc292 = loc("right_idx"(#loc129)) +#loc293 = loc("left_valid_mask"(#loc130)) +#loc294 = loc("right_valid_mask"(#loc131)) +#loc295 = loc("left_isnan"(#loc132)) +#loc296 = loc("right_isnan"(#loc133)) +#loc297 = loc("cond"(#loc134)) +#loc298 = loc("cond"(#loc137)) +#loc299 = loc("cond"(#loc138)) +#loc300 = loc("cond"(#loc139)) +#loc301 = loc("eq"(#loc141)) +#loc302 = loc("eq"(#loc144)) +#loc303 = loc("eq"(#loc145)) +#loc304 = loc("cond"(#loc146)) +#loc305 = loc("cond"(#loc147)) +#loc306 = loc("cond"(#loc148)) +#loc307 = loc("cond"(#loc149)) +#loc308 = loc("cond"(#loc150)) +#loc309 = loc("cond"(#loc151)) +#loc310 = loc("cond"(#loc152)) +#loc311 = loc("cond"(#loc153)) +#loc312 = loc("cond"(#loc154)) +#loc313 = loc("ret"(#loc155)) +#loc314 = loc("ret"(#loc156)) +#loc315 = loc("ret"(#loc157)) +#loc316 = loc("ret"(#loc158)) +#loc317 = loc("new_idxs"(#loc159)) +#loc318 = loc("new_idxs"(#loc160)) +#loc319 = loc("new_idxs"(#loc161)) +#loc320 = loc("new_idxs"(#loc162)) +#loc324 = loc("input"(#loc173)) +#loc328 = loc("flip"(#loc191)) +#loc329 = loc("cond"(#loc297)) +#loc330 = loc("cond"(#loc300)) +#loc331 = loc("eq"(#loc301)) +#loc332 = loc("eq"(#loc303)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..11bf9eec524c596482797830e0d50a30af7f8028 --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttgir @@ -0,0 +1,1487 @@ +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked1 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [2, 2, 8], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked2 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [4, 2, 4], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked3 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [8, 2, 2], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked4 = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [16, 2, 1], warpsPerCTA = [2, 1, 1], order = [2, 1, 0]}> +#blocked5 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 16], warpsPerCTA = [2, 1], order = [1, 0]}> +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc1 = loc(unknown) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc17 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc113 = loc("in_ptr0"(#loc)) +#loc114 = loc("out_ptr4"(#loc)) +#loc115 = loc("out_ptr5"(#loc)) +#loc116 = loc("out_ptr6"(#loc)) +#loc117 = loc("out_ptr7"(#loc)) +#loc118 = loc("out_ptr8"(#loc)) +#loc119 = loc("out_ptr9"(#loc)) +#loc120 = loc("xnumel"(#loc)) +#loc121 = loc("r0_numel"(#loc)) +#loc136 = loc(callsite(#loc16 at #loc17)) +#loc142 = loc("ileft"(#loc25)) +#loc146 = loc("iright"(#loc30)) +#loc155 = loc("left_idx"(#loc39)) +#loc160 = loc("right_idx"(#loc44)) +#loc181 = loc(callsite(#loc16 at #loc65)) +#loc184 = loc("tmp24"(#loc68)) +#loc188 = loc("tmp29"(#loc72)) +#loc210 = loc(callsite(#loc21 at #loc136)) +#loc214 = loc(callsite(#loc21 at #loc181)) +#loc217 = loc(callsite(#loc1 at #loc184)) +#loc220 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc142 at #loc210)) +#loc229 = loc(callsite(#loc146 at #loc210)) +#loc237 = loc(callsite(#loc155 at #loc210)) +#loc242 = loc(callsite(#loc160 at #loc210)) +#loc262 = loc(callsite(#loc142 at #loc214)) +#loc266 = loc(callsite(#loc146 at #loc214)) +#loc284 = loc(callsite(#loc155 at #loc214)) +#loc288 = loc(callsite(#loc160 at #loc214)) +#loc298 = loc(callsite(#loc1 at #loc225)) +#loc300 = loc(callsite(#loc1 at #loc229)) +#loc303 = loc(callsite(#loc1 at #loc237)) +#loc306 = loc(callsite(#loc1 at #loc242)) +#loc308 = loc(callsite(#loc1 at #loc262)) +#loc310 = loc(callsite(#loc1 at #loc266)) +#loc312 = loc(callsite(#loc1 at #loc284)) +#loc314 = loc(callsite(#loc1 at #loc288)) +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_0 = arith.constant dense<16384> : tensor<1x16xi64, #blocked> loc(#loc1) + %cst_1 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked1> loc(#loc1) + %cst_2 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked2> loc(#loc1) + %cst_3 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked3> loc(#loc1) + %cst_4 = arith.constant dense<1> : tensor<1x2x1xi32, #blocked4> loc(#loc1) + %c32_i32 = arith.constant 32 : i32 loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %cst_5 = arith.constant dense<1> : tensor<1x16xi32, #blocked> loc(#loc1) + %cst_6 = arith.constant dense<0> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_7 = arith.constant dense<17> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_8 = arith.constant dense<16> : tensor<1x16xi32, #blocked5> loc(#loc1) + %cst_9 = arith.constant dense<16384> : tensor<1x16xi64, #blocked5> loc(#loc1) + %cst_10 = arith.constant dense<0> : tensor<1x16xi64, #blocked5> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc122) + %xmask = arith.cmpi slt, %xoffset, %c32_i32 : i32 loc(#loc123) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> loc(#loc124) + %r0_index_11 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc124) + %r0_index_12 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> -> tensor<1x16xi32, #blocked5> loc(#loc124) + %r0_index_13 = tt.expand_dims %r0_index_11 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> loc(#loc124) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc125) + %tmp0_14 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc204) + %tmp0_15 = tt.splat %tmp0 : i32 -> tensor<1x16xi32, #blocked> loc(#loc204) + %tmp0_16 = arith.addi %r0_index_12, %tmp0_14 : tensor<1x16xi32, #blocked5> loc(#loc126) + %tmp0_17 = arith.addi %r0_index_13, %tmp0_15 : tensor<1x16xi32, #blocked> loc(#loc126) + %tmp0_18 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc127) + %tmp0_19 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> loc(#loc127) + %tmp0_20 = tt.addptr %tmp0_18, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc127) + %tmp0_21 = tt.addptr %tmp0_19, %tmp0_17 : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> loc(#loc127) + %tmp0_22 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked5> loc(#loc205) + %tmp0_23 = tt.splat %xmask : i1 -> tensor<1x16xi1, #blocked> loc(#loc205) + %tmp0_24 = tt.load %tmp0_20, %tmp0_22, %cst_10 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc128) + %tmp0_25 = tt.load %tmp0_21, %tmp0_23, %cst : tensor<1x16x!tt.ptr, #blocked> loc(#loc128) + %tmp2 = arith.cmpi sgt, %tmp0_24, %cst_10 : tensor<1x16xi64, #blocked5> loc(#loc129) + %tmp2_26 = arith.cmpi sgt, %tmp0_25, %cst : tensor<1x16xi64, #blocked> loc(#loc129) + %tmp4 = arith.cmpi slt, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc130) + %tmp4_27 = arith.cmpi slt, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc130) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1, #blocked5> loc(#loc131) + %tmp5_28 = arith.andi %tmp2_26, %tmp4_27 : tensor<1x16xi1, #blocked> loc(#loc131) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc206) + %tmp9 = arith.trunci %r0_index_12 : tensor<1x16xi32, #blocked5> to tensor<1x16xi16, #blocked5> loc(#loc134) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> loc(#loc207) + %flip_29 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> loc(#loc207) + %flip_30 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> loc(#loc207) + %flip_31 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> loc(#loc207) + %flip_32 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked3}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> loc(#loc207) + %flip_33 = tt.expand_dims %flip_29 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked4}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> loc(#loc207) + %flip_34 = tt.expand_dims %flip_30 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked2}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> loc(#loc207) + %flip_35 = tt.expand_dims %flip_31 {axis = 0 : i32} : tensor<2xi32, #ttg.slice<{dim = 0, parent = #ttg.slice<{dim = 2, parent = #blocked1}>}>> -> tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> loc(#loc207) + %flip_36 = tt.expand_dims %flip_32 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked3}>> -> tensor<1x2x1xi32, #blocked3> loc(#loc207) + %flip_37 = tt.expand_dims %flip_33 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked4}>> -> tensor<1x2x1xi32, #blocked4> loc(#loc207) + %flip_38 = tt.expand_dims %flip_34 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked2}>> -> tensor<1x2x1xi32, #blocked2> loc(#loc207) + %flip_39 = tt.expand_dims %flip_35 {axis = 2 : i32} : tensor<1x2xi32, #ttg.slice<{dim = 2, parent = #blocked1}>> -> tensor<1x2x1xi32, #blocked1> loc(#loc207) + %flip_40 = tt.broadcast %flip_36 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc208) + %flip_41 = tt.reshape %flip_40 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y = tt.reshape %tmp7 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %left_mask = arith.subi %cst_4, %flip_37 : tensor<1x2x1xi32, #blocked4> loc(#loc223) + %left_mask_42 = arith.subi %cst_3, %flip_36 : tensor<1x2x1xi32, #blocked3> loc(#loc223) + %left_mask_43 = arith.subi %cst_2, %flip_38 : tensor<1x2x1xi32, #blocked2> loc(#loc223) + %left_mask_44 = arith.subi %cst_1, %flip_39 : tensor<1x2x1xi32, #blocked1> loc(#loc223) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_45 = arith.muli %y, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_46 = "tt.reduce"(%ileft_45) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_47 = tt.expand_dims %ileft_46 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_48 = tt.broadcast %ileft_47 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright = tt.broadcast %flip_37 : tensor<1x2x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_49 = arith.muli %y, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_50 = "tt.reduce"(%iright_49) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_51 = tt.expand_dims %iright_50 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_52 = tt.broadcast %iright_51 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_53 = tt.reshape %ileft_48 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_54 = tt.reshape %iright_52 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16, #blocked5> -> tensor<8x2x1xi16, #blocked4> loc(#loc234) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc235) + %left_idx_55 = tt.broadcast %left_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc236) + %left_idx_56 = arith.muli %y_idx, %left_idx_55 : tensor<8x2x1xi16, #blocked4> loc(#loc236) + %input = arith.extsi %left_idx_56 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc301) + %left_idx_57 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_58 = tt.expand_dims %left_idx_57 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_59 = tt.broadcast %left_idx_58 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx = arith.trunci %flip_37 : tensor<1x2x1xi32, #blocked4> to tensor<1x2x1xi16, #blocked4> loc(#loc240) + %right_idx_60 = tt.broadcast %right_idx : tensor<1x2x1xi16, #blocked4> -> tensor<8x2x1xi16, #blocked4> loc(#loc241) + %right_idx_61 = arith.muli %y_idx, %right_idx_60 : tensor<8x2x1xi16, #blocked4> loc(#loc241) + %input_62 = arith.extsi %right_idx_61 : tensor<8x2x1xi16, #blocked4> to tensor<8x2x1xi32, #blocked4> loc(#loc304) + %right_idx_63 = "tt.reduce"(%input_62) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_64 = tt.expand_dims %right_idx_63 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_65 = tt.broadcast %right_idx_64 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_66 = tt.reshape %left_idx_59 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_67 = tt.reshape %right_idx_65 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond = arith.cmpi slt, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq = arith.cmpi eq, %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_68 = arith.cmpi sgt, %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_69 = arith.andi %eq, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_70 = arith.ori %cond, %cond_69 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_71 = arith.extui %cond_70 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_72 = arith.xori %cond_71, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_73 = arith.cmpi ne, %cond_72, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret = arith.xori %ileft_53, %iright_54 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_74 = arith.select %cond_73, %ret, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_75 = arith.xori %tmp7, %ret_74 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs = arith.xori %left_idx_66, %right_idx_67 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_76 = arith.select %cond_73, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_77 = arith.extsi %tmp9 : tensor<1x16xi16, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc259) + %new_idxs_78 = arith.xori %new_idxs_77, %new_idxs_76 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_79 = tt.broadcast %flip_38 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc208) + %flip_80 = tt.reshape %flip_79 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_81 = tt.reshape %ret_75 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_82 = tt.broadcast %left_mask_42 : tensor<1x2x1xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_83 = arith.muli %y_81, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_84 = "tt.reduce"(%ileft_83) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_85 = tt.expand_dims %ileft_84 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_86 = tt.broadcast %ileft_85 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_87 = arith.muli %y_81, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_88 = "tt.reduce"(%iright_87) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_89 = tt.expand_dims %iright_88 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_90 = tt.broadcast %iright_89 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_91 = tt.reshape %ileft_86 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_92 = tt.reshape %iright_90 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_93 = tt.reshape %new_idxs_78 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_94 = arith.muli %y_idx_93, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_95 = "tt.reduce"(%left_idx_94) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_96 = tt.expand_dims %left_idx_95 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_97 = tt.broadcast %left_idx_96 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_98 = arith.muli %y_idx_93, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_99 = "tt.reduce"(%right_idx_98) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_100 = tt.expand_dims %right_idx_99 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_101 = tt.broadcast %right_idx_100 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_102 = tt.reshape %left_idx_97 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_103 = tt.reshape %right_idx_101 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_104 = arith.cmpi slt, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_105 = arith.cmpi eq, %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_106 = arith.cmpi sgt, %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_107 = arith.andi %eq_105, %cond_106 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_108 = arith.ori %cond_104, %cond_107 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_109 = arith.extui %cond_108 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_110 = arith.xori %cond_109, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_111 = arith.cmpi ne, %cond_110, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_112 = arith.xori %ileft_91, %iright_92 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_113 = arith.select %cond_111, %ret_112, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_114 = arith.xori %ret_75, %ret_113 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_115 = arith.xori %left_idx_102, %right_idx_103 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_116 = arith.select %cond_111, %new_idxs_115, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_117 = arith.xori %new_idxs_78, %new_idxs_116 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_118 = tt.reshape %ret_114 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_119 = arith.muli %y_118, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_120 = "tt.reduce"(%ileft_119) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_121 = tt.expand_dims %ileft_120 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_122 = tt.broadcast %ileft_121 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_123 = arith.muli %y_118, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_124 = "tt.reduce"(%iright_123) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_125 = tt.expand_dims %iright_124 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_126 = tt.broadcast %iright_125 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_127 = tt.reshape %ileft_122 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_128 = tt.reshape %iright_126 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_129 = tt.reshape %new_idxs_117 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_130 = arith.muli %y_idx_129, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_131 = "tt.reduce"(%left_idx_130) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_132 = tt.expand_dims %left_idx_131 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_133 = tt.broadcast %left_idx_132 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_134 = arith.muli %y_idx_129, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_135 = "tt.reduce"(%right_idx_134) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_136 = tt.expand_dims %right_idx_135 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_137 = tt.broadcast %right_idx_136 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_138 = tt.reshape %left_idx_133 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_139 = tt.reshape %right_idx_137 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_140 = arith.cmpi slt, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_141 = arith.cmpi eq, %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_142 = arith.cmpi sgt, %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_143 = arith.andi %eq_141, %cond_142 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_144 = arith.ori %cond_140, %cond_143 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_145 = arith.extui %cond_144 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_146 = arith.xori %cond_145, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_147 = arith.cmpi ne, %cond_146, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_148 = arith.xori %ileft_127, %iright_128 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_149 = arith.select %cond_147, %ret_148, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_150 = arith.xori %ret_114, %ret_149 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_151 = arith.xori %left_idx_138, %right_idx_139 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_152 = arith.select %cond_147, %new_idxs_151, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_153 = arith.xori %new_idxs_117, %new_idxs_152 : tensor<1x16xi32, #blocked5> loc(#loc259) + %flip_154 = tt.broadcast %flip_39 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc208) + %flip_155 = tt.reshape %flip_154 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc209) + %y_156 = tt.reshape %ret_150 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_157 = tt.broadcast %left_mask_43 : tensor<1x2x1xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_158 = arith.muli %y_156, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_159 = "tt.reduce"(%ileft_158) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_160 = tt.expand_dims %ileft_159 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_161 = tt.broadcast %ileft_160 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_162 = arith.muli %y_156, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_163 = "tt.reduce"(%iright_162) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_164 = tt.expand_dims %iright_163 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_165 = tt.broadcast %iright_164 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_166 = tt.reshape %ileft_161 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_167 = tt.reshape %iright_165 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_168 = tt.reshape %new_idxs_153 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_169 = arith.muli %y_idx_168, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_170 = "tt.reduce"(%left_idx_169) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_171 = tt.expand_dims %left_idx_170 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_172 = tt.broadcast %left_idx_171 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_173 = arith.muli %y_idx_168, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_174 = "tt.reduce"(%right_idx_173) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_175 = tt.expand_dims %right_idx_174 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_176 = tt.broadcast %right_idx_175 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_177 = tt.reshape %left_idx_172 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_178 = tt.reshape %right_idx_176 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_179 = arith.cmpi slt, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_180 = arith.cmpi eq, %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_181 = arith.cmpi sgt, %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_182 = arith.andi %eq_180, %cond_181 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_183 = arith.ori %cond_179, %cond_182 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_184 = arith.extui %cond_183 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_185 = arith.xori %cond_184, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_186 = arith.cmpi ne, %cond_185, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_187 = arith.xori %ileft_166, %iright_167 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_188 = arith.select %cond_186, %ret_187, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_189 = arith.xori %ret_150, %ret_188 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_190 = arith.xori %left_idx_177, %right_idx_178 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_191 = arith.select %cond_186, %new_idxs_190, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_192 = arith.xori %new_idxs_153, %new_idxs_191 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_193 = tt.reshape %ret_189 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_194 = arith.muli %y_193, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_195 = "tt.reduce"(%ileft_194) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_196 = tt.expand_dims %ileft_195 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_197 = tt.broadcast %ileft_196 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_198 = arith.muli %y_193, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_199 = "tt.reduce"(%iright_198) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_200 = tt.expand_dims %iright_199 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_201 = tt.broadcast %iright_200 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_202 = tt.reshape %ileft_197 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_203 = tt.reshape %iright_201 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_204 = tt.reshape %new_idxs_192 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_205 = arith.muli %y_idx_204, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_206 = "tt.reduce"(%left_idx_205) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_207 = tt.expand_dims %left_idx_206 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_208 = tt.broadcast %left_idx_207 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_209 = arith.muli %y_idx_204, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_210 = "tt.reduce"(%right_idx_209) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_211 = tt.expand_dims %right_idx_210 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_212 = tt.broadcast %right_idx_211 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_213 = tt.reshape %left_idx_208 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_214 = tt.reshape %right_idx_212 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_215 = arith.cmpi slt, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_216 = arith.cmpi eq, %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_217 = arith.cmpi sgt, %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_218 = arith.andi %eq_216, %cond_217 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_219 = arith.ori %cond_215, %cond_218 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_220 = arith.extui %cond_219 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_221 = arith.xori %cond_220, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_222 = arith.cmpi ne, %cond_221, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_223 = arith.xori %ileft_202, %iright_203 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_224 = arith.select %cond_222, %ret_223, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_225 = arith.xori %ret_189, %ret_224 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_226 = arith.xori %left_idx_213, %right_idx_214 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_227 = arith.select %cond_222, %new_idxs_226, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_228 = arith.xori %new_idxs_192, %new_idxs_227 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_229 = tt.reshape %ret_225 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_230 = arith.muli %y_229, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_231 = "tt.reduce"(%ileft_230) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_232 = tt.expand_dims %ileft_231 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_233 = tt.broadcast %ileft_232 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_234 = arith.muli %y_229, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_235 = "tt.reduce"(%iright_234) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_236 = tt.expand_dims %iright_235 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_237 = tt.broadcast %iright_236 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_238 = tt.reshape %ileft_233 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_239 = tt.reshape %iright_237 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_240 = tt.reshape %new_idxs_228 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_241 = arith.muli %y_idx_240, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_242 = "tt.reduce"(%left_idx_241) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_243 = tt.expand_dims %left_idx_242 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_244 = tt.broadcast %left_idx_243 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_245 = arith.muli %y_idx_240, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_246 = "tt.reduce"(%right_idx_245) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_247 = tt.expand_dims %right_idx_246 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_248 = tt.broadcast %right_idx_247 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_249 = tt.reshape %left_idx_244 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_250 = tt.reshape %right_idx_248 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_251 = arith.cmpi slt, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_252 = arith.cmpi eq, %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_253 = arith.cmpi sgt, %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_254 = arith.andi %eq_252, %cond_253 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_255 = arith.ori %cond_251, %cond_254 : tensor<1x16xi1, #blocked5> loc(#loc251) + %cond_256 = arith.extui %cond_255 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_257 = arith.xori %cond_256, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc252) + %cond_258 = arith.cmpi ne, %cond_257, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc253) + %ret_259 = arith.xori %ileft_238, %iright_239 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_260 = arith.select %cond_258, %ret_259, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_261 = arith.xori %ret_225, %ret_260 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_262 = arith.xori %left_idx_249, %right_idx_250 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_263 = arith.select %cond_258, %new_idxs_262, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_264 = arith.xori %new_idxs_228, %new_idxs_263 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_265 = tt.reshape %ret_261 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc222) + %ileft_266 = tt.broadcast %left_mask_44 : tensor<1x2x1xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_267 = arith.muli %y_265, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc224) + %ileft_268 = "tt.reduce"(%ileft_267) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc297) + %ileft_269 = tt.expand_dims %ileft_268 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc226) + %ileft_270 = tt.broadcast %ileft_269 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc227) + %iright_271 = arith.muli %y_265, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc228) + %iright_272 = "tt.reduce"(%iright_271) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc299) + %iright_273 = tt.expand_dims %iright_272 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc230) + %iright_274 = tt.broadcast %iright_273 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc231) + %ileft_275 = tt.reshape %ileft_270 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_276 = tt.reshape %iright_274 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_277 = tt.reshape %new_idxs_264 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc234) + %left_idx_278 = arith.muli %y_idx_277, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc236) + %left_idx_279 = "tt.reduce"(%left_idx_278) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc302) + %left_idx_280 = tt.expand_dims %left_idx_279 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc238) + %left_idx_281 = tt.broadcast %left_idx_280 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc239) + %right_idx_282 = arith.muli %y_idx_277, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc241) + %right_idx_283 = "tt.reduce"(%right_idx_282) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc305) + %right_idx_284 = tt.expand_dims %right_idx_283 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc243) + %right_idx_285 = tt.broadcast %right_idx_284 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc244) + %left_idx_286 = tt.reshape %left_idx_281 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_287 = tt.reshape %right_idx_285 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_288 = arith.cmpi slt, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_289 = arith.cmpi eq, %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_290 = arith.cmpi sgt, %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_291 = arith.andi %eq_289, %cond_290 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_292 = arith.ori %cond_288, %cond_291 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_293 = arith.xori %ileft_275, %iright_276 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_294 = arith.select %cond_292, %ret_293, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_295 = arith.xori %ret_261, %ret_294 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_296 = arith.xori %left_idx_286, %right_idx_287 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_297 = arith.select %cond_292, %new_idxs_296, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_298 = arith.xori %new_idxs_264, %new_idxs_297 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_299 = tt.reshape %ret_295 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc222) + %ileft_300 = arith.muli %y_299, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc224) + %ileft_301 = "tt.reduce"(%ileft_300) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc297) + %ileft_302 = tt.expand_dims %ileft_301 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc226) + %ileft_303 = tt.broadcast %ileft_302 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc227) + %iright_304 = arith.muli %y_299, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc228) + %iright_305 = "tt.reduce"(%iright_304) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc299) + %iright_306 = tt.expand_dims %iright_305 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc230) + %iright_307 = tt.broadcast %iright_306 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc231) + %ileft_308 = tt.reshape %ileft_303 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_309 = tt.reshape %iright_307 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_310 = tt.reshape %new_idxs_298 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc234) + %left_idx_311 = arith.muli %y_idx_310, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc236) + %left_idx_312 = "tt.reduce"(%left_idx_311) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc302) + %left_idx_313 = tt.expand_dims %left_idx_312 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc238) + %left_idx_314 = tt.broadcast %left_idx_313 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc239) + %right_idx_315 = arith.muli %y_idx_310, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc241) + %right_idx_316 = "tt.reduce"(%right_idx_315) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc305) + %right_idx_317 = tt.expand_dims %right_idx_316 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc243) + %right_idx_318 = tt.broadcast %right_idx_317 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc244) + %left_idx_319 = tt.reshape %left_idx_314 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_320 = tt.reshape %right_idx_318 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_321 = arith.cmpi slt, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_322 = arith.cmpi eq, %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_323 = arith.cmpi sgt, %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_324 = arith.andi %eq_322, %cond_323 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_325 = arith.ori %cond_321, %cond_324 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_326 = arith.xori %ileft_308, %iright_309 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_327 = arith.select %cond_325, %ret_326, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_328 = arith.xori %ret_295, %ret_327 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_329 = arith.xori %left_idx_319, %right_idx_320 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_330 = arith.select %cond_325, %new_idxs_329, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_331 = arith.xori %new_idxs_298, %new_idxs_330 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_332 = tt.reshape %ret_328 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc222) + %ileft_333 = arith.muli %y_332, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc224) + %ileft_334 = "tt.reduce"(%ileft_333) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc297) + %ileft_335 = tt.expand_dims %ileft_334 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc226) + %ileft_336 = tt.broadcast %ileft_335 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc227) + %iright_337 = arith.muli %y_332, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc228) + %iright_338 = "tt.reduce"(%iright_337) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc299) + %iright_339 = tt.expand_dims %iright_338 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc230) + %iright_340 = tt.broadcast %iright_339 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc231) + %ileft_341 = tt.reshape %ileft_336 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_342 = tt.reshape %iright_340 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_343 = tt.reshape %new_idxs_331 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc234) + %left_idx_344 = arith.muli %y_idx_343, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc236) + %left_idx_345 = "tt.reduce"(%left_idx_344) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc302) + %left_idx_346 = tt.expand_dims %left_idx_345 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc238) + %left_idx_347 = tt.broadcast %left_idx_346 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc239) + %right_idx_348 = arith.muli %y_idx_343, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc241) + %right_idx_349 = "tt.reduce"(%right_idx_348) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc305) + %right_idx_350 = tt.expand_dims %right_idx_349 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc243) + %right_idx_351 = tt.broadcast %right_idx_350 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc244) + %left_idx_352 = tt.reshape %left_idx_347 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_353 = tt.reshape %right_idx_351 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_354 = arith.cmpi slt, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_355 = arith.cmpi eq, %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_356 = arith.cmpi sgt, %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_357 = arith.andi %eq_355, %cond_356 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_358 = arith.ori %cond_354, %cond_357 : tensor<1x16xi1, #blocked5> loc(#loc251) + %ret_359 = arith.xori %ileft_341, %iright_342 : tensor<1x16xi32, #blocked5> loc(#loc254) + %ret_360 = arith.select %cond_358, %ret_359, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc255) + %ret_361 = arith.xori %ret_328, %ret_360 : tensor<1x16xi32, #blocked5> loc(#loc256) + %new_idxs_362 = arith.xori %left_idx_352, %right_idx_353 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_363 = arith.select %cond_358, %new_idxs_362, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_364 = arith.xori %new_idxs_331, %new_idxs_363 : tensor<1x16xi32, #blocked5> loc(#loc259) + %y_365 = tt.reshape %ret_361 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc222) + %ileft_366 = arith.muli %y_365, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc224) + %ileft_367 = "tt.reduce"(%ileft_366) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc225)), %ileft_744: i32 loc(callsite(#loc1 at #loc225))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc315) + tt.reduce.return %ileft_745 : i32 loc(#loc297) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc297) + %ileft_368 = tt.expand_dims %ileft_367 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc226) + %ileft_369 = tt.broadcast %ileft_368 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc227) + %iright_370 = arith.muli %y_365, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc228) + %iright_371 = "tt.reduce"(%iright_370) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc229)), %iright_744: i32 loc(callsite(#loc1 at #loc229))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc316) + tt.reduce.return %iright_745 : i32 loc(#loc299) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc299) + %iright_372 = tt.expand_dims %iright_371 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc230) + %iright_373 = tt.broadcast %iright_372 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc231) + %ileft_374 = tt.reshape %ileft_369 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc232) + %iright_375 = tt.reshape %iright_373 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc233) + %y_idx_376 = tt.reshape %new_idxs_364 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc234) + %left_idx_377 = arith.muli %y_idx_376, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc236) + %left_idx_378 = "tt.reduce"(%left_idx_377) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc237)), %left_idx_744: i32 loc(callsite(#loc1 at #loc237))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc317) + tt.reduce.return %left_idx_745 : i32 loc(#loc302) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc302) + %left_idx_379 = tt.expand_dims %left_idx_378 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc238) + %left_idx_380 = tt.broadcast %left_idx_379 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc239) + %right_idx_381 = arith.muli %y_idx_376, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc241) + %right_idx_382 = "tt.reduce"(%right_idx_381) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc242)), %right_idx_744: i32 loc(callsite(#loc1 at #loc242))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc318) + tt.reduce.return %right_idx_745 : i32 loc(#loc305) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc305) + %right_idx_383 = tt.expand_dims %right_idx_382 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc243) + %right_idx_384 = tt.broadcast %right_idx_383 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc244) + %left_idx_385 = tt.reshape %left_idx_380 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc245) + %right_idx_386 = tt.reshape %right_idx_384 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc246) + %cond_387 = arith.cmpi slt, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc247) + %eq_388 = arith.cmpi eq, %ileft_374, %iright_375 : tensor<1x16xi32, #blocked5> loc(#loc248) + %cond_389 = arith.cmpi sgt, %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc249) + %cond_390 = arith.andi %eq_388, %cond_389 : tensor<1x16xi1, #blocked5> loc(#loc250) + %cond_391 = arith.ori %cond_387, %cond_390 : tensor<1x16xi1, #blocked5> loc(#loc251) + %new_idxs_392 = arith.xori %left_idx_385, %right_idx_386 : tensor<1x16xi32, #blocked5> loc(#loc257) + %new_idxs_393 = arith.select %cond_391, %new_idxs_392, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc258) + %new_idxs_394 = arith.xori %new_idxs_364, %new_idxs_393 : tensor<1x16xi32, #blocked5> loc(#loc259) + %tmp14 = arith.cmpi eq, %tmp0_24, %cst_9 : tensor<1x16xi64, #blocked5> loc(#loc178) + %tmp14_395 = arith.cmpi eq, %tmp0_25, %cst_0 : tensor<1x16xi64, #blocked> loc(#loc178) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc213) + %y_396 = tt.reshape %tmp16 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_397 = arith.muli %y_396, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_398 = "tt.reduce"(%ileft_397) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_399 = tt.expand_dims %ileft_398 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_400 = tt.broadcast %ileft_399 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_401 = arith.muli %y_396, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_402 = "tt.reduce"(%iright_401) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_403 = tt.expand_dims %iright_402 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_404 = tt.broadcast %iright_403 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_405 = tt.reshape %ileft_400 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_406 = tt.reshape %iright_404 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %cond_407 = arith.cmpi slt, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_408 = arith.cmpi eq, %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_409 = arith.andi %eq_408, %cond_68 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_410 = arith.ori %cond_407, %cond_409 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_411 = arith.extui %cond_410 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_412 = arith.xori %cond_411, %flip_41 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_413 = arith.cmpi ne, %cond_412, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_414 = arith.xori %ileft_405, %iright_406 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_415 = arith.select %cond_413, %ret_414, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_416 = arith.xori %tmp16, %ret_415 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_417 = arith.select %cond_413, %new_idxs, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_418 = arith.xori %new_idxs_77, %new_idxs_417 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_419 = tt.reshape %ret_416 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_420 = arith.muli %y_419, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_421 = "tt.reduce"(%ileft_420) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_422 = tt.expand_dims %ileft_421 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_423 = tt.broadcast %ileft_422 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_424 = arith.muli %y_419, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_425 = "tt.reduce"(%iright_424) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_426 = tt.expand_dims %iright_425 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_427 = tt.broadcast %iright_426 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_428 = tt.reshape %ileft_423 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_429 = tt.reshape %iright_427 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_430 = tt.reshape %new_idxs_418 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_431 = arith.muli %y_idx_430, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_432 = "tt.reduce"(%left_idx_431) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_433 = tt.expand_dims %left_idx_432 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_434 = tt.broadcast %left_idx_433 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_435 = arith.muli %y_idx_430, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_436 = "tt.reduce"(%right_idx_435) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_437 = tt.expand_dims %right_idx_436 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_438 = tt.broadcast %right_idx_437 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_439 = tt.reshape %left_idx_434 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_440 = tt.reshape %right_idx_438 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_441 = arith.cmpi slt, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_442 = arith.cmpi eq, %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_443 = arith.cmpi sgt, %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_444 = arith.andi %eq_442, %cond_443 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_445 = arith.ori %cond_441, %cond_444 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_446 = arith.extui %cond_445 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_447 = arith.xori %cond_446, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_448 = arith.cmpi ne, %cond_447, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_449 = arith.xori %ileft_428, %iright_429 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_450 = arith.select %cond_448, %ret_449, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_451 = arith.xori %ret_416, %ret_450 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_452 = arith.xori %left_idx_439, %right_idx_440 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_453 = arith.select %cond_448, %new_idxs_452, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_454 = arith.xori %new_idxs_418, %new_idxs_453 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_455 = tt.reshape %ret_451 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_456 = arith.muli %y_455, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_457 = "tt.reduce"(%ileft_456) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_458 = tt.expand_dims %ileft_457 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_459 = tt.broadcast %ileft_458 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_460 = arith.muli %y_455, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_461 = "tt.reduce"(%iright_460) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_462 = tt.expand_dims %iright_461 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_463 = tt.broadcast %iright_462 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_464 = tt.reshape %ileft_459 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_465 = tt.reshape %iright_463 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_466 = tt.reshape %new_idxs_454 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_467 = arith.muli %y_idx_466, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_468 = "tt.reduce"(%left_idx_467) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_469 = tt.expand_dims %left_idx_468 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_470 = tt.broadcast %left_idx_469 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_471 = arith.muli %y_idx_466, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_472 = "tt.reduce"(%right_idx_471) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_473 = tt.expand_dims %right_idx_472 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_474 = tt.broadcast %right_idx_473 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_475 = tt.reshape %left_idx_470 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_476 = tt.reshape %right_idx_474 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_477 = arith.cmpi slt, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_478 = arith.cmpi eq, %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_479 = arith.cmpi sgt, %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_480 = arith.andi %eq_478, %cond_479 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_481 = arith.ori %cond_477, %cond_480 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_482 = arith.extui %cond_481 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_483 = arith.xori %cond_482, %flip_80 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_484 = arith.cmpi ne, %cond_483, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_485 = arith.xori %ileft_464, %iright_465 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_486 = arith.select %cond_484, %ret_485, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_487 = arith.xori %ret_451, %ret_486 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_488 = arith.xori %left_idx_475, %right_idx_476 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_489 = arith.select %cond_484, %new_idxs_488, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_490 = arith.xori %new_idxs_454, %new_idxs_489 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_491 = tt.reshape %ret_487 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_492 = arith.muli %y_491, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_493 = "tt.reduce"(%ileft_492) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_494 = tt.expand_dims %ileft_493 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_495 = tt.broadcast %ileft_494 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_496 = arith.muli %y_491, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_497 = "tt.reduce"(%iright_496) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_498 = tt.expand_dims %iright_497 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_499 = tt.broadcast %iright_498 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_500 = tt.reshape %ileft_495 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_501 = tt.reshape %iright_499 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_502 = tt.reshape %new_idxs_490 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_503 = arith.muli %y_idx_502, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_504 = "tt.reduce"(%left_idx_503) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_505 = tt.expand_dims %left_idx_504 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_506 = tt.broadcast %left_idx_505 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_507 = arith.muli %y_idx_502, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_508 = "tt.reduce"(%right_idx_507) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_509 = tt.expand_dims %right_idx_508 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_510 = tt.broadcast %right_idx_509 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_511 = tt.reshape %left_idx_506 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_512 = tt.reshape %right_idx_510 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_513 = arith.cmpi slt, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_514 = arith.cmpi eq, %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_515 = arith.cmpi sgt, %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_516 = arith.andi %eq_514, %cond_515 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_517 = arith.ori %cond_513, %cond_516 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_518 = arith.extui %cond_517 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_519 = arith.xori %cond_518, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_520 = arith.cmpi ne, %cond_519, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_521 = arith.xori %ileft_500, %iright_501 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_522 = arith.select %cond_520, %ret_521, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_523 = arith.xori %ret_487, %ret_522 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_524 = arith.xori %left_idx_511, %right_idx_512 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_525 = arith.select %cond_520, %new_idxs_524, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_526 = arith.xori %new_idxs_490, %new_idxs_525 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_527 = tt.reshape %ret_523 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_528 = arith.muli %y_527, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_529 = "tt.reduce"(%ileft_528) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_530 = tt.expand_dims %ileft_529 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_531 = tt.broadcast %ileft_530 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_532 = arith.muli %y_527, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_533 = "tt.reduce"(%iright_532) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_534 = tt.expand_dims %iright_533 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_535 = tt.broadcast %iright_534 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_536 = tt.reshape %ileft_531 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_537 = tt.reshape %iright_535 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_538 = tt.reshape %new_idxs_526 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_539 = arith.muli %y_idx_538, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_540 = "tt.reduce"(%left_idx_539) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_541 = tt.expand_dims %left_idx_540 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_542 = tt.broadcast %left_idx_541 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_543 = arith.muli %y_idx_538, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_544 = "tt.reduce"(%right_idx_543) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_545 = tt.expand_dims %right_idx_544 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_546 = tt.broadcast %right_idx_545 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_547 = tt.reshape %left_idx_542 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_548 = tt.reshape %right_idx_546 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_549 = arith.cmpi slt, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_550 = arith.cmpi eq, %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_551 = arith.cmpi sgt, %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_552 = arith.andi %eq_550, %cond_551 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_553 = arith.ori %cond_549, %cond_552 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_554 = arith.extui %cond_553 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_555 = arith.xori %cond_554, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_556 = arith.cmpi ne, %cond_555, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_557 = arith.xori %ileft_536, %iright_537 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_558 = arith.select %cond_556, %ret_557, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_559 = arith.xori %ret_523, %ret_558 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_560 = arith.xori %left_idx_547, %right_idx_548 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_561 = arith.select %cond_556, %new_idxs_560, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_562 = arith.xori %new_idxs_526, %new_idxs_561 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_563 = tt.reshape %ret_559 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_564 = arith.muli %y_563, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_565 = "tt.reduce"(%ileft_564) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_566 = tt.expand_dims %ileft_565 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_567 = tt.broadcast %ileft_566 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_568 = arith.muli %y_563, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_569 = "tt.reduce"(%iright_568) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_570 = tt.expand_dims %iright_569 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_571 = tt.broadcast %iright_570 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_572 = tt.reshape %ileft_567 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_573 = tt.reshape %iright_571 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_574 = tt.reshape %new_idxs_562 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_575 = arith.muli %y_idx_574, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_576 = "tt.reduce"(%left_idx_575) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_577 = tt.expand_dims %left_idx_576 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_578 = tt.broadcast %left_idx_577 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_579 = arith.muli %y_idx_574, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_580 = "tt.reduce"(%right_idx_579) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_581 = tt.expand_dims %right_idx_580 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_582 = tt.broadcast %right_idx_581 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_583 = tt.reshape %left_idx_578 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_584 = tt.reshape %right_idx_582 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_585 = arith.cmpi slt, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_586 = arith.cmpi eq, %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_587 = arith.cmpi sgt, %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_588 = arith.andi %eq_586, %cond_587 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_589 = arith.ori %cond_585, %cond_588 : tensor<1x16xi1, #blocked5> loc(#loc274) + %cond_590 = arith.extui %cond_589 : tensor<1x16xi1, #blocked5> to tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_591 = arith.xori %cond_590, %flip_155 : tensor<1x16xi32, #blocked5> loc(#loc275) + %cond_592 = arith.cmpi ne, %cond_591, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc276) + %ret_593 = arith.xori %ileft_572, %iright_573 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_594 = arith.select %cond_592, %ret_593, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_595 = arith.xori %ret_559, %ret_594 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_596 = arith.xori %left_idx_583, %right_idx_584 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_597 = arith.select %cond_592, %new_idxs_596, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_598 = arith.xori %new_idxs_562, %new_idxs_597 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_599 = tt.reshape %ret_595 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc260) + %ileft_600 = arith.muli %y_599, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc261) + %ileft_601 = "tt.reduce"(%ileft_600) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc307) + %ileft_602 = tt.expand_dims %ileft_601 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc263) + %ileft_603 = tt.broadcast %ileft_602 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc264) + %iright_604 = arith.muli %y_599, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc265) + %iright_605 = "tt.reduce"(%iright_604) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc309) + %iright_606 = tt.expand_dims %iright_605 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc267) + %iright_607 = tt.broadcast %iright_606 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc268) + %ileft_608 = tt.reshape %ileft_603 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_609 = tt.reshape %iright_607 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_610 = tt.reshape %new_idxs_598 : tensor<1x16xi32, #blocked5> -> tensor<1x2x8xi32, #blocked1> loc(#loc282) + %left_idx_611 = arith.muli %y_idx_610, %ileft_266 : tensor<1x2x8xi32, #blocked1> loc(#loc283) + %left_idx_612 = "tt.reduce"(%left_idx_611) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc311) + %left_idx_613 = tt.expand_dims %left_idx_612 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc285) + %left_idx_614 = tt.broadcast %left_idx_613 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc286) + %right_idx_615 = arith.muli %y_idx_610, %flip_154 : tensor<1x2x8xi32, #blocked1> loc(#loc287) + %right_idx_616 = "tt.reduce"(%right_idx_615) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32, #blocked1>) -> tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> loc(#loc313) + %right_idx_617 = tt.expand_dims %right_idx_616 {axis = 1 : i32} : tensor<1x8xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<1x1x8xi32, #blocked1> loc(#loc289) + %right_idx_618 = tt.broadcast %right_idx_617 : tensor<1x1x8xi32, #blocked1> -> tensor<1x2x8xi32, #blocked1> loc(#loc290) + %left_idx_619 = tt.reshape %left_idx_614 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_620 = tt.reshape %right_idx_618 : tensor<1x2x8xi32, #blocked1> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_621 = arith.cmpi slt, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_622 = arith.cmpi eq, %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_623 = arith.cmpi sgt, %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_624 = arith.andi %eq_622, %cond_623 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_625 = arith.ori %cond_621, %cond_624 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_626 = arith.xori %ileft_608, %iright_609 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_627 = arith.select %cond_625, %ret_626, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_628 = arith.xori %ret_595, %ret_627 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_629 = arith.xori %left_idx_619, %right_idx_620 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_630 = arith.select %cond_625, %new_idxs_629, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_631 = arith.xori %new_idxs_598, %new_idxs_630 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_632 = tt.reshape %ret_628 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc260) + %ileft_633 = arith.muli %y_632, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc261) + %ileft_634 = "tt.reduce"(%ileft_633) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc307) + %ileft_635 = tt.expand_dims %ileft_634 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc263) + %ileft_636 = tt.broadcast %ileft_635 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc264) + %iright_637 = arith.muli %y_632, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc265) + %iright_638 = "tt.reduce"(%iright_637) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc309) + %iright_639 = tt.expand_dims %iright_638 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc267) + %iright_640 = tt.broadcast %iright_639 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc268) + %ileft_641 = tt.reshape %ileft_636 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_642 = tt.reshape %iright_640 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_643 = tt.reshape %new_idxs_631 : tensor<1x16xi32, #blocked5> -> tensor<2x2x4xi32, #blocked2> loc(#loc282) + %left_idx_644 = arith.muli %y_idx_643, %ileft_157 : tensor<2x2x4xi32, #blocked2> loc(#loc283) + %left_idx_645 = "tt.reduce"(%left_idx_644) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc311) + %left_idx_646 = tt.expand_dims %left_idx_645 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc285) + %left_idx_647 = tt.broadcast %left_idx_646 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc286) + %right_idx_648 = arith.muli %y_idx_643, %flip_79 : tensor<2x2x4xi32, #blocked2> loc(#loc287) + %right_idx_649 = "tt.reduce"(%right_idx_648) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32, #blocked2>) -> tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> loc(#loc313) + %right_idx_650 = tt.expand_dims %right_idx_649 {axis = 1 : i32} : tensor<2x4xi32, #ttg.slice<{dim = 1, parent = #blocked2}>> -> tensor<2x1x4xi32, #blocked2> loc(#loc289) + %right_idx_651 = tt.broadcast %right_idx_650 : tensor<2x1x4xi32, #blocked2> -> tensor<2x2x4xi32, #blocked2> loc(#loc290) + %left_idx_652 = tt.reshape %left_idx_647 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_653 = tt.reshape %right_idx_651 : tensor<2x2x4xi32, #blocked2> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_654 = arith.cmpi slt, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_655 = arith.cmpi eq, %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_656 = arith.cmpi sgt, %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_657 = arith.andi %eq_655, %cond_656 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_658 = arith.ori %cond_654, %cond_657 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_659 = arith.xori %ileft_641, %iright_642 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_660 = arith.select %cond_658, %ret_659, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_661 = arith.xori %ret_628, %ret_660 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_662 = arith.xori %left_idx_652, %right_idx_653 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_663 = arith.select %cond_658, %new_idxs_662, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_664 = arith.xori %new_idxs_631, %new_idxs_663 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_665 = tt.reshape %ret_661 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc260) + %ileft_666 = arith.muli %y_665, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc261) + %ileft_667 = "tt.reduce"(%ileft_666) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc307) + %ileft_668 = tt.expand_dims %ileft_667 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc263) + %ileft_669 = tt.broadcast %ileft_668 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc264) + %iright_670 = arith.muli %y_665, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc265) + %iright_671 = "tt.reduce"(%iright_670) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc309) + %iright_672 = tt.expand_dims %iright_671 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc267) + %iright_673 = tt.broadcast %iright_672 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc268) + %ileft_674 = tt.reshape %ileft_669 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_675 = tt.reshape %iright_673 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_676 = tt.reshape %new_idxs_664 : tensor<1x16xi32, #blocked5> -> tensor<4x2x2xi32, #blocked3> loc(#loc282) + %left_idx_677 = arith.muli %y_idx_676, %ileft_82 : tensor<4x2x2xi32, #blocked3> loc(#loc283) + %left_idx_678 = "tt.reduce"(%left_idx_677) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc311) + %left_idx_679 = tt.expand_dims %left_idx_678 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc285) + %left_idx_680 = tt.broadcast %left_idx_679 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc286) + %right_idx_681 = arith.muli %y_idx_676, %flip_40 : tensor<4x2x2xi32, #blocked3> loc(#loc287) + %right_idx_682 = "tt.reduce"(%right_idx_681) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32, #blocked3>) -> tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> loc(#loc313) + %right_idx_683 = tt.expand_dims %right_idx_682 {axis = 1 : i32} : tensor<4x2xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<4x1x2xi32, #blocked3> loc(#loc289) + %right_idx_684 = tt.broadcast %right_idx_683 : tensor<4x1x2xi32, #blocked3> -> tensor<4x2x2xi32, #blocked3> loc(#loc290) + %left_idx_685 = tt.reshape %left_idx_680 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_686 = tt.reshape %right_idx_684 : tensor<4x2x2xi32, #blocked3> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_687 = arith.cmpi slt, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_688 = arith.cmpi eq, %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_689 = arith.cmpi sgt, %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_690 = arith.andi %eq_688, %cond_689 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_691 = arith.ori %cond_687, %cond_690 : tensor<1x16xi1, #blocked5> loc(#loc274) + %ret_692 = arith.xori %ileft_674, %iright_675 : tensor<1x16xi32, #blocked5> loc(#loc277) + %ret_693 = arith.select %cond_691, %ret_692, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc278) + %ret_694 = arith.xori %ret_661, %ret_693 : tensor<1x16xi32, #blocked5> loc(#loc279) + %new_idxs_695 = arith.xori %left_idx_685, %right_idx_686 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_696 = arith.select %cond_691, %new_idxs_695, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_697 = arith.xori %new_idxs_664, %new_idxs_696 : tensor<1x16xi32, #blocked5> loc(#loc281) + %y_698 = tt.reshape %ret_694 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc260) + %ileft_699 = arith.muli %y_698, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc261) + %ileft_700 = "tt.reduce"(%ileft_699) <{axis = 1 : i32}> ({ + ^bb0(%ileft_743: i32 loc(callsite(#loc1 at #loc262)), %ileft_744: i32 loc(callsite(#loc1 at #loc262))): + %ileft_745 = arith.addi %ileft_743, %ileft_744 : i32 loc(#loc319) + tt.reduce.return %ileft_745 : i32 loc(#loc307) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc307) + %ileft_701 = tt.expand_dims %ileft_700 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc263) + %ileft_702 = tt.broadcast %ileft_701 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc264) + %iright_703 = arith.muli %y_698, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc265) + %iright_704 = "tt.reduce"(%iright_703) <{axis = 1 : i32}> ({ + ^bb0(%iright_743: i32 loc(callsite(#loc1 at #loc266)), %iright_744: i32 loc(callsite(#loc1 at #loc266))): + %iright_745 = arith.addi %iright_743, %iright_744 : i32 loc(#loc320) + tt.reduce.return %iright_745 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc309) + %iright_705 = tt.expand_dims %iright_704 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc267) + %iright_706 = tt.broadcast %iright_705 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc268) + %ileft_707 = tt.reshape %ileft_702 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc269) + %iright_708 = tt.reshape %iright_706 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc270) + %y_idx_709 = tt.reshape %new_idxs_697 : tensor<1x16xi32, #blocked5> -> tensor<8x2x1xi32, #blocked4> loc(#loc282) + %left_idx_710 = arith.muli %y_idx_709, %ileft : tensor<8x2x1xi32, #blocked4> loc(#loc283) + %left_idx_711 = "tt.reduce"(%left_idx_710) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_743: i32 loc(callsite(#loc1 at #loc284)), %left_idx_744: i32 loc(callsite(#loc1 at #loc284))): + %left_idx_745 = arith.addi %left_idx_743, %left_idx_744 : i32 loc(#loc321) + tt.reduce.return %left_idx_745 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc311) + %left_idx_712 = tt.expand_dims %left_idx_711 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc285) + %left_idx_713 = tt.broadcast %left_idx_712 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc286) + %right_idx_714 = arith.muli %y_idx_709, %iright : tensor<8x2x1xi32, #blocked4> loc(#loc287) + %right_idx_715 = "tt.reduce"(%right_idx_714) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_743: i32 loc(callsite(#loc1 at #loc288)), %right_idx_744: i32 loc(callsite(#loc1 at #loc288))): + %right_idx_745 = arith.addi %right_idx_743, %right_idx_744 : i32 loc(#loc322) + tt.reduce.return %right_idx_745 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32, #blocked4>) -> tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> loc(#loc313) + %right_idx_716 = tt.expand_dims %right_idx_715 {axis = 1 : i32} : tensor<8x1xi32, #ttg.slice<{dim = 1, parent = #blocked4}>> -> tensor<8x1x1xi32, #blocked4> loc(#loc289) + %right_idx_717 = tt.broadcast %right_idx_716 : tensor<8x1x1xi32, #blocked4> -> tensor<8x2x1xi32, #blocked4> loc(#loc290) + %left_idx_718 = tt.reshape %left_idx_713 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc291) + %right_idx_719 = tt.reshape %right_idx_717 : tensor<8x2x1xi32, #blocked4> -> tensor<1x16xi32, #blocked5> loc(#loc292) + %cond_720 = arith.cmpi slt, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc271) + %eq_721 = arith.cmpi eq, %ileft_707, %iright_708 : tensor<1x16xi32, #blocked5> loc(#loc272) + %cond_722 = arith.cmpi sgt, %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc293) + %cond_723 = arith.andi %eq_721, %cond_722 : tensor<1x16xi1, #blocked5> loc(#loc273) + %cond_724 = arith.ori %cond_720, %cond_723 : tensor<1x16xi1, #blocked5> loc(#loc274) + %new_idxs_725 = arith.xori %left_idx_718, %right_idx_719 : tensor<1x16xi32, #blocked5> loc(#loc294) + %new_idxs_726 = arith.select %cond_724, %new_idxs_725, %cst_6 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc280) + %new_idxs_727 = arith.xori %new_idxs_697, %new_idxs_726 : tensor<1x16xi32, #blocked5> loc(#loc281) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc215) + %tmp20_728 = arith.extui %tmp5_28 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc215) + %tmp23 = arith.select %tmp0_22, %tmp20, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc183) + %tmp23_729 = arith.select %tmp0_23, %tmp20_728, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc183) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc216) + %tmp24_730 = "tt.reduce"(%tmp23_729) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_743: i64 loc(callsite(#loc1 at #loc184)), %tmp24_744: i64 loc(callsite(#loc1 at #loc184))): + %tmp24_745 = arith.addi %tmp24_743, %tmp24_744 : i64 loc(#loc295) + tt.reduce.return %tmp24_745 : i64 loc(#loc216) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc216) + %tmp24_731 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc185) + %tmp24_732 = tt.expand_dims %tmp24_730 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc185) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1, #blocked5> to tensor<1x16xi64, #blocked5> loc(#loc218) + %tmp25_733 = arith.extui %tmp14_395 : tensor<1x16xi1, #blocked> to tensor<1x16xi64, #blocked> loc(#loc218) + %tmp28 = arith.select %tmp0_22, %tmp25, %cst_10 : tensor<1x16xi1, #blocked5>, tensor<1x16xi64, #blocked5> loc(#loc187) + %tmp28_734 = arith.select %tmp0_23, %tmp25_733, %cst : tensor<1x16xi1, #blocked>, tensor<1x16xi64, #blocked> loc(#loc187) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked5>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> loc(#loc219) + %tmp29_735 = "tt.reduce"(%tmp28_734) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_743: i64 loc(callsite(#loc1 at #loc188)), %tmp29_744: i64 loc(callsite(#loc1 at #loc188))): + %tmp29_745 = arith.addi %tmp29_743, %tmp29_744 : i64 loc(#loc296) + tt.reduce.return %tmp29_745 : i64 loc(#loc219) + }) : (tensor<1x16xi64, #blocked>) -> tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> loc(#loc219) + %tmp29_736 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked5}>> -> tensor<1x1xi64, #blocked5> loc(#loc189) + %tmp29_737 = tt.expand_dims %tmp29_735 {axis = 1 : i32} : tensor<1xi64, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<1x1xi64, #blocked> loc(#loc189) + %tmp30 = arith.trunci %tmp24_731 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc190) + %tmp30_738 = arith.trunci %tmp24_732 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc190) + %tmp31 = arith.trunci %tmp29_736 : tensor<1x1xi64, #blocked5> to tensor<1x1xi32, #blocked5> loc(#loc191) + %tmp31_739 = arith.trunci %tmp29_737 : tensor<1x1xi64, #blocked> to tensor<1x1xi32, #blocked> loc(#loc191) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp34_740 = arith.cmpi slt, %r0_index_12, %tmp34 : tensor<1x16xi32, #blocked5> loc(#loc192) + %tmp36 = arith.select %tmp34_740, %new_idxs_394, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc193) + %tmp38 = arith.addi %tmp36, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc194) + %tmp39 = arith.cmpi slt, %tmp36, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc195) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc196) + %0 = arith.cmpi sge, %tmp40, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc81) + %1 = arith.cmpi slt, %tmp40, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc82) + %2 = arith.andi %0, %1 : tensor<1x16xi1, #blocked5> loc(#loc83) + %xmask_741 = arith.cmpi sge, %xoffset, %c32_i32 : i32 loc(#loc221) + %3 = tt.splat %xmask_741 : i1 -> tensor<1x16xi1, #blocked5> loc(#loc197) + %4 = arith.ori %2, %3 : tensor<1x16xi1, #blocked5> loc(#loc85) + tt.assert %4, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1, #blocked5> loc(#loc86) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32, #blocked5> -> tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp45_742 = arith.cmpi slt, %r0_index_12, %tmp45 : tensor<1x16xi32, #blocked5> loc(#loc198) + %tmp46 = arith.select %tmp45_742, %new_idxs_727, %cst_8 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc199) + %tmp47 = arith.addi %tmp46, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc200) + %tmp48 = arith.cmpi slt, %tmp46, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc201) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc202) + %5 = arith.cmpi sge, %tmp49, %cst_6 : tensor<1x16xi32, #blocked5> loc(#loc92) + %6 = arith.cmpi slt, %tmp49, %cst_7 : tensor<1x16xi32, #blocked5> loc(#loc93) + %7 = arith.andi %5, %6 : tensor<1x16xi1, #blocked5> loc(#loc94) + %8 = arith.ori %7, %3 : tensor<1x16xi1, #blocked5> loc(#loc95) + tt.assert %8, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1, #blocked5> loc(#loc96) + %9 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc97) + %10 = tt.splat %9 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %11 = tt.splat %xmask : i1 -> tensor<1x1xi1, #blocked> loc(#loc98) + tt.store %10, %tmp30_738, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc98) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc99) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + tt.store %13, %tmp31_739, %11 : tensor<1x1x!tt.ptr, #blocked> loc(#loc100) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc101) + %15 = tt.addptr %14, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc101) + tt.store %15, %new_idxs_394, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc102) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc103) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32, #blocked5> loc(#loc203) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32, #blocked5> loc(#loc104) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc105) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc105) + %21 = ttg.convert_layout %20 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + tt.store %21, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc106) + %22 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc107) + %23 = tt.addptr %22, %tmp0_16 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc107) + tt.store %23, %new_idxs_727, %tmp0_22 : tensor<1x16x!tt.ptr, #blocked5> loc(#loc108) + %24 = arith.addi %tmp49, %17 : tensor<1x16xi32, #blocked5> loc(#loc109) + %25 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked5> loc(#loc110) + %26 = tt.addptr %25, %24 : tensor<1x16x!tt.ptr, #blocked5>, tensor<1x16xi32, #blocked5> loc(#loc110) + %27 = ttg.convert_layout %26 : tensor<1x16x!tt.ptr, #blocked5> -> tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.store %27, %cst_5, %tmp0_23 : tensor<1x16x!tt.ptr, #blocked> loc(#loc111) + tt.return loc(#loc112) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc15 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc18 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc62 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc63 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc122 = loc("xoffset"(#loc2)) +#loc123 = loc("xmask"(#loc3)) +#loc124 = loc("r0_index"(#loc4)) +#loc125 = loc("tmp0"(#loc5)) +#loc126 = loc("tmp0"(#loc6)) +#loc127 = loc("tmp0"(#loc7)) +#loc128 = loc("tmp0"(#loc8)) +#loc129 = loc("tmp2"(#loc9)) +#loc130 = loc("tmp4"(#loc10)) +#loc131 = loc("tmp5"(#loc11)) +#loc132 = loc("tmp7"(#loc12)) +#loc133 = loc("tmp6"(#loc13)) +#loc134 = loc("tmp9"(#loc14)) +#loc135 = loc("flip"(#loc15)) +#loc137 = loc("flip"(#loc18)) +#loc138 = loc("flip"(#loc19)) +#loc139 = loc("y"(#loc20)) +#loc140 = loc("left_mask"(#loc22)) +#loc141 = loc("ileft"(#loc23)) +#loc143 = loc("ileft"(#loc27)) +#loc144 = loc("ileft"(#loc28)) +#loc145 = loc("iright"(#loc29)) +#loc147 = loc("iright"(#loc31)) +#loc148 = loc("iright"(#loc32)) +#loc149 = loc("ileft"(#loc33)) +#loc150 = loc("iright"(#loc34)) +#loc151 = loc("y_idx"(#loc35)) +#loc152 = loc("left_idx"(#loc36)) +#loc153 = loc("left_idx"(#loc37)) +#loc154 = loc("input"(#loc38)) +#loc156 = loc("left_idx"(#loc40)) +#loc157 = loc("left_idx"(#loc41)) +#loc158 = loc("right_idx"(#loc42)) +#loc159 = loc("right_idx"(#loc43)) +#loc161 = loc("right_idx"(#loc45)) +#loc162 = loc("right_idx"(#loc46)) +#loc163 = loc("left_idx"(#loc47)) +#loc164 = loc("right_idx"(#loc48)) +#loc165 = loc("cond"(#loc49)) +#loc166 = loc("eq"(#loc50)) +#loc167 = loc("cond"(#loc51)) +#loc168 = loc("cond"(#loc52)) +#loc169 = loc("cond"(#loc53)) +#loc170 = loc("cond"(#loc54)) +#loc171 = loc("cond"(#loc55)) +#loc172 = loc("ret"(#loc56)) +#loc173 = loc("ret"(#loc57)) +#loc174 = loc("ret"(#loc58)) +#loc175 = loc("new_idxs"(#loc59)) +#loc176 = loc("new_idxs"(#loc60)) +#loc177 = loc("new_idxs"(#loc61)) +#loc178 = loc("tmp14"(#loc62)) +#loc179 = loc("tmp16"(#loc63)) +#loc180 = loc("tmp15"(#loc64)) +#loc182 = loc("tmp20"(#loc66)) +#loc183 = loc("tmp23"(#loc67)) +#loc185 = loc("tmp24"(#loc69)) +#loc186 = loc("tmp25"(#loc70)) +#loc187 = loc("tmp28"(#loc71)) +#loc189 = loc("tmp29"(#loc73)) +#loc190 = loc("tmp30"(#loc74)) +#loc191 = loc("tmp31"(#loc75)) +#loc192 = loc("tmp34"(#loc76)) +#loc193 = loc("tmp36"(#loc77)) +#loc194 = loc("tmp38"(#loc78)) +#loc195 = loc("tmp39"(#loc79)) +#loc196 = loc("tmp40"(#loc80)) +#loc197 = loc(fused[#loc85, #loc84]) +#loc198 = loc("tmp45"(#loc87)) +#loc199 = loc("tmp46"(#loc88)) +#loc200 = loc("tmp47"(#loc89)) +#loc201 = loc("tmp48"(#loc90)) +#loc202 = loc("tmp49"(#loc91)) +#loc203 = loc(fused[#loc104, #loc103]) +#loc204 = loc(fused[#loc126, #loc125]) +#loc205 = loc(fused[#loc128, #loc123]) +#loc206 = loc(fused[#loc132, #loc133]) +#loc207 = loc(callsite(#loc135 at #loc136)) +#loc208 = loc(callsite(#loc137 at #loc136)) +#loc209 = loc(callsite(#loc138 at #loc136)) +#loc211 = loc("cond"(#loc165)) +#loc212 = loc("eq"(#loc166)) +#loc213 = loc(fused[#loc179, #loc180]) +#loc215 = loc(fused[#loc182, #loc132, #loc133]) +#loc216 = loc(callsite(#loc24 at #loc184)) +#loc218 = loc(fused[#loc186, #loc179, #loc180]) +#loc219 = loc(callsite(#loc24 at #loc188)) +#loc221 = loc(fused[#loc84, #loc123]) +#loc222 = loc(callsite(#loc139 at #loc210)) +#loc223 = loc(callsite(#loc140 at #loc210)) +#loc224 = loc(callsite(#loc141 at #loc210)) +#loc226 = loc(callsite(#loc143 at #loc210)) +#loc227 = loc(callsite(#loc144 at #loc210)) +#loc228 = loc(callsite(#loc145 at #loc210)) +#loc230 = loc(callsite(#loc147 at #loc210)) +#loc231 = loc(callsite(#loc148 at #loc210)) +#loc232 = loc(callsite(#loc149 at #loc210)) +#loc233 = loc(callsite(#loc150 at #loc210)) +#loc234 = loc(callsite(#loc151 at #loc210)) +#loc235 = loc(callsite(#loc152 at #loc210)) +#loc236 = loc(callsite(#loc153 at #loc210)) +#loc238 = loc(callsite(#loc156 at #loc210)) +#loc239 = loc(callsite(#loc157 at #loc210)) +#loc240 = loc(callsite(#loc158 at #loc210)) +#loc241 = loc(callsite(#loc159 at #loc210)) +#loc243 = loc(callsite(#loc161 at #loc210)) +#loc244 = loc(callsite(#loc162 at #loc210)) +#loc245 = loc(callsite(#loc163 at #loc210)) +#loc246 = loc(callsite(#loc164 at #loc210)) +#loc247 = loc(callsite(#loc211 at #loc210)) +#loc248 = loc(callsite(#loc212 at #loc210)) +#loc249 = loc(callsite(#loc167 at #loc210)) +#loc250 = loc(callsite(#loc168 at #loc210)) +#loc251 = loc(callsite(#loc169 at #loc210)) +#loc252 = loc(callsite(#loc170 at #loc210)) +#loc253 = loc(callsite(#loc171 at #loc210)) +#loc254 = loc(callsite(#loc172 at #loc210)) +#loc255 = loc(callsite(#loc173 at #loc210)) +#loc256 = loc(callsite(#loc174 at #loc210)) +#loc257 = loc(callsite(#loc175 at #loc210)) +#loc258 = loc(callsite(#loc176 at #loc210)) +#loc259 = loc(callsite(#loc177 at #loc210)) +#loc260 = loc(callsite(#loc139 at #loc214)) +#loc261 = loc(callsite(#loc141 at #loc214)) +#loc263 = loc(callsite(#loc143 at #loc214)) +#loc264 = loc(callsite(#loc144 at #loc214)) +#loc265 = loc(callsite(#loc145 at #loc214)) +#loc267 = loc(callsite(#loc147 at #loc214)) +#loc268 = loc(callsite(#loc148 at #loc214)) +#loc269 = loc(callsite(#loc149 at #loc214)) +#loc270 = loc(callsite(#loc150 at #loc214)) +#loc271 = loc(callsite(#loc211 at #loc214)) +#loc272 = loc(callsite(#loc212 at #loc214)) +#loc273 = loc(callsite(#loc168 at #loc214)) +#loc274 = loc(callsite(#loc169 at #loc214)) +#loc275 = loc(callsite(#loc170 at #loc214)) +#loc276 = loc(callsite(#loc171 at #loc214)) +#loc277 = loc(callsite(#loc172 at #loc214)) +#loc278 = loc(callsite(#loc173 at #loc214)) +#loc279 = loc(callsite(#loc174 at #loc214)) +#loc280 = loc(callsite(#loc176 at #loc214)) +#loc281 = loc(callsite(#loc177 at #loc214)) +#loc282 = loc(callsite(#loc151 at #loc214)) +#loc283 = loc(callsite(#loc153 at #loc214)) +#loc285 = loc(callsite(#loc156 at #loc214)) +#loc286 = loc(callsite(#loc157 at #loc214)) +#loc287 = loc(callsite(#loc159 at #loc214)) +#loc289 = loc(callsite(#loc161 at #loc214)) +#loc290 = loc(callsite(#loc162 at #loc214)) +#loc291 = loc(callsite(#loc163 at #loc214)) +#loc292 = loc(callsite(#loc164 at #loc214)) +#loc293 = loc(callsite(#loc167 at #loc214)) +#loc294 = loc(callsite(#loc175 at #loc214)) +#loc295 = loc(callsite(#loc26 at #loc216)) +#loc296 = loc(callsite(#loc26 at #loc219)) +#loc297 = loc(callsite(#loc24 at #loc225)) +#loc299 = loc(callsite(#loc24 at #loc229)) +#loc301 = loc(callsite(#loc154 at #loc237)) +#loc302 = loc(callsite(#loc24 at #loc237)) +#loc304 = loc(callsite(#loc154 at #loc242)) +#loc305 = loc(callsite(#loc24 at #loc242)) +#loc307 = loc(callsite(#loc24 at #loc262)) +#loc309 = loc(callsite(#loc24 at #loc266)) +#loc311 = loc(callsite(#loc24 at #loc284)) +#loc313 = loc(callsite(#loc24 at #loc288)) +#loc315 = loc(callsite(#loc26 at #loc297)) +#loc316 = loc(callsite(#loc26 at #loc299)) +#loc317 = loc(callsite(#loc26 at #loc302)) +#loc318 = loc(callsite(#loc26 at #loc305)) +#loc319 = loc(callsite(#loc26 at #loc307)) +#loc320 = loc(callsite(#loc26 at #loc309)) +#loc321 = loc(callsite(#loc26 at #loc311)) +#loc322 = loc(callsite(#loc26 at #loc313)) diff --git a/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir new file mode 100644 index 0000000000000000000000000000000000000000..f57113df6b34541280e93e35ea931ca9c83448fc --- /dev/null +++ b/SpecForge-ext/cache/compiled_kernels/triton/7/NFXDDTZIEQAGR5JBWWCXV73QZILXJMIVVJVPZH3CHAIULDAND5UQ/triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2.ttir @@ -0,0 +1,1437 @@ +#loc = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":18:0) +#loc1 = loc(unknown) +#loc17 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":662:12) +#loc18 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":46:71) +#loc23 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":634:73) +#loc27 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:51) +#loc32 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:53) +#loc41 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:50) +#loc46 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:51) +#loc67 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":51:71) +#loc70 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:26) +#loc74 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:26) +#loc115 = loc("in_ptr0"(#loc)) +#loc116 = loc("out_ptr4"(#loc)) +#loc117 = loc("out_ptr5"(#loc)) +#loc118 = loc("out_ptr6"(#loc)) +#loc119 = loc("out_ptr7"(#loc)) +#loc120 = loc("out_ptr8"(#loc)) +#loc121 = loc("out_ptr9"(#loc)) +#loc122 = loc("xnumel"(#loc)) +#loc123 = loc("r0_numel"(#loc)) +#loc139 = loc(callsite(#loc17 at #loc18)) +#loc146 = loc("ileft"(#loc27)) +#loc150 = loc("iright"(#loc32)) +#loc159 = loc("left_idx"(#loc41)) +#loc164 = loc("right_idx"(#loc46)) +#loc185 = loc(callsite(#loc17 at #loc67)) +#loc188 = loc("tmp24"(#loc70)) +#loc192 = loc("tmp29"(#loc74)) +#loc215 = loc(callsite(#loc23 at #loc139)) +#loc219 = loc(callsite(#loc23 at #loc185)) +#loc222 = loc(callsite(#loc1 at #loc188)) +#loc225 = loc(callsite(#loc1 at #loc192)) +#loc229 = loc(callsite(#loc146 at #loc215)) +#loc233 = loc(callsite(#loc150 at #loc215)) +#loc241 = loc(callsite(#loc159 at #loc215)) +#loc246 = loc(callsite(#loc164 at #loc215)) +#loc266 = loc(callsite(#loc146 at #loc219)) +#loc270 = loc(callsite(#loc150 at #loc219)) +#loc288 = loc(callsite(#loc159 at #loc219)) +#loc292 = loc(callsite(#loc164 at #loc219)) +#loc302 = loc(callsite(#loc1 at #loc229)) +#loc304 = loc(callsite(#loc1 at #loc233)) +#loc307 = loc(callsite(#loc1 at #loc241)) +#loc310 = loc(callsite(#loc1 at #loc246)) +#loc312 = loc(callsite(#loc1 at #loc266)) +#loc314 = loc(callsite(#loc1 at #loc270)) +#loc316 = loc(callsite(#loc1 at #loc288)) +#loc318 = loc(callsite(#loc1 at #loc292)) +module { + tt.func public @triton_per_fused__to_copy_arange_bitwise_and_eq_gt_index_put_lt_new_zeros_scalar_tensor_sort_sum_unsqueeze_view_where_2(%in_ptr0: !tt.ptr {tt.divisibility = 16 : i32} loc("in_ptr0"(#loc)), %out_ptr4: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr4"(#loc)), %out_ptr5: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr5"(#loc)), %out_ptr6: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr6"(#loc)), %out_ptr7: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr7"(#loc)), %out_ptr8: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr8"(#loc)), %out_ptr9: !tt.ptr {tt.divisibility = 16 : i32} loc("out_ptr9"(#loc)), %xnumel: i32 {tt.divisibility = 16 : i32} loc("xnumel"(#loc)), %r0_numel: i32 {tt.divisibility = 16 : i32} loc("r0_numel"(#loc))) attributes {noinline = false} { + %c17_i32 = arith.constant 17 : i32 loc(#loc1) + %true = arith.constant true loc(#loc1) + %c16_i32 = arith.constant 16 : i32 loc(#loc1) + %xmask = arith.constant 32 : i32 loc(#loc124) + %cst = arith.constant dense<1> : tensor<1x2x1xi32> loc(#loc1) + %cst_0 = arith.constant dense<1> : tensor<1x16xi32> loc(#loc1) + %cst_1 = arith.constant dense<0> : tensor<1x16xi32> loc(#loc1) + %cst_2 = arith.constant dense<17> : tensor<1x16xi32> loc(#loc1) + %cst_3 = arith.constant dense<16> : tensor<1x16xi32> loc(#loc1) + %cst_4 = arith.constant dense<16384> : tensor<1x16xi64> loc(#loc1) + %cst_5 = arith.constant dense<0> : tensor<1x16xi64> loc(#loc1) + %xoffset = tt.get_program_id x : i32 loc(#loc125) + %xmask_6 = arith.cmpi slt, %xoffset, %xmask : i32 loc(#loc124) + %xmask_7 = tt.splat %xmask_6 : i1 -> tensor<1x1xi1> loc(#loc124) + %r0_index = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> loc(#loc126) + %r0_index_8 = tt.expand_dims %r0_index {axis = 0 : i32} : tensor<16xi32> -> tensor<1x16xi32> loc(#loc127) + %tmp0 = arith.muli %xoffset, %c16_i32 : i32 loc(#loc128) + %tmp0_9 = tt.splat %tmp0 : i32 -> tensor<1x16xi32> loc(#loc208) + %tmp0_10 = arith.addi %r0_index_8, %tmp0_9 : tensor<1x16xi32> loc(#loc129) + %tmp0_11 = tt.splat %in_ptr0 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc130) + %tmp0_12 = tt.addptr %tmp0_11, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc130) + %tmp0_13 = tt.splat %xmask_6 : i1 -> tensor<1x16xi1> loc(#loc209) + %tmp0_14 = tt.load %tmp0_12, %tmp0_13, %cst_5 : tensor<1x16x!tt.ptr> loc(#loc131) + %tmp2 = arith.cmpi sgt, %tmp0_14, %cst_5 : tensor<1x16xi64> loc(#loc132) + %tmp4 = arith.cmpi slt, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc133) + %tmp5 = arith.andi %tmp2, %tmp4 : tensor<1x16xi1> loc(#loc134) + %tmp7 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc210) + %tmp9 = arith.trunci %r0_index_8 : tensor<1x16xi32> to tensor<1x16xi16> loc(#loc137) + %flip = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32> loc(#loc211) + %flip_15 = tt.expand_dims %flip {axis = 0 : i32} : tensor<2xi32> -> tensor<1x2xi32> loc(#loc212) + %flip_16 = tt.expand_dims %flip_15 {axis = 2 : i32} : tensor<1x2xi32> -> tensor<1x2x1xi32> loc(#loc212) + %flip_17 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc213) + %flip_18 = tt.reshape %flip_17 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc214) + %y = tt.reshape %tmp7 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %left_mask = arith.subi %cst, %flip_16 : tensor<1x2x1xi32> loc(#loc227) + %ileft = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc228) + %ileft_19 = arith.muli %y, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_20 = "tt.reduce"(%ileft_19) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_21 = tt.expand_dims %ileft_20 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_22 = tt.broadcast %ileft_21 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<8x2x1xi32> loc(#loc232) + %iright_23 = arith.muli %y, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_24 = "tt.reduce"(%iright_23) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_25 = tt.expand_dims %iright_24 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_26 = tt.broadcast %iright_25 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_27 = tt.reshape %ileft_22 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_28 = tt.reshape %iright_26 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx = tt.reshape %tmp9 : tensor<1x16xi16> -> tensor<8x2x1xi16> loc(#loc238) + %left_idx = arith.trunci %left_mask : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc239) + %left_idx_29 = tt.broadcast %left_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc240) + %left_idx_30 = arith.muli %y_idx, %left_idx_29 : tensor<8x2x1xi16> loc(#loc240) + %input = arith.extsi %left_idx_30 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc305) + %left_idx_31 = "tt.reduce"(%input) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_32 = tt.expand_dims %left_idx_31 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_33 = tt.broadcast %left_idx_32 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx = arith.trunci %flip_16 : tensor<1x2x1xi32> to tensor<1x2x1xi16> loc(#loc244) + %right_idx_34 = tt.broadcast %right_idx : tensor<1x2x1xi16> -> tensor<8x2x1xi16> loc(#loc245) + %right_idx_35 = arith.muli %y_idx, %right_idx_34 : tensor<8x2x1xi16> loc(#loc245) + %input_36 = arith.extsi %right_idx_35 : tensor<8x2x1xi16> to tensor<8x2x1xi32> loc(#loc308) + %right_idx_37 = "tt.reduce"(%input_36) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_38 = tt.expand_dims %right_idx_37 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_39 = tt.broadcast %right_idx_38 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_40 = tt.reshape %left_idx_33 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_41 = tt.reshape %right_idx_39 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond = arith.cmpi slt, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc251) + %eq = arith.cmpi eq, %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc252) + %cond_42 = arith.cmpi sgt, %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc253) + %cond_43 = arith.andi %eq, %cond_42 : tensor<1x16xi1> loc(#loc254) + %cond_44 = arith.ori %cond, %cond_43 : tensor<1x16xi1> loc(#loc255) + %cond_45 = arith.extui %cond_44 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_46 = arith.xori %cond_45, %flip_18 : tensor<1x16xi32> loc(#loc256) + %cond_47 = arith.cmpi ne, %cond_46, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret = arith.xori %ileft_27, %iright_28 : tensor<1x16xi32> loc(#loc258) + %ret_48 = arith.select %cond_47, %ret, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_49 = arith.xori %tmp7, %ret_48 : tensor<1x16xi32> loc(#loc260) + %new_idxs = arith.xori %left_idx_40, %right_idx_41 : tensor<1x16xi32> loc(#loc261) + %new_idxs_50 = arith.select %cond_47, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_51 = arith.extsi %tmp9 : tensor<1x16xi16> to tensor<1x16xi32> loc(#loc263) + %new_idxs_52 = arith.xori %new_idxs_51, %new_idxs_50 : tensor<1x16xi32> loc(#loc263) + %flip_53 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc213) + %flip_54 = tt.reshape %flip_53 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc214) + %y_55 = tt.reshape %ret_49 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_56 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<4x2x2xi32> loc(#loc228) + %ileft_57 = arith.muli %y_55, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_58 = "tt.reduce"(%ileft_57) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_59 = tt.expand_dims %ileft_58 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_60 = tt.broadcast %ileft_59 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_61 = arith.muli %y_55, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_62 = "tt.reduce"(%iright_61) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_63 = tt.expand_dims %iright_62 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_64 = tt.broadcast %iright_63 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_65 = tt.reshape %ileft_60 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_66 = tt.reshape %iright_64 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_67 = tt.reshape %new_idxs_52 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_68 = arith.muli %y_idx_67, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_69 = "tt.reduce"(%left_idx_68) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_70 = tt.expand_dims %left_idx_69 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_71 = tt.broadcast %left_idx_70 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_72 = arith.muli %y_idx_67, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_73 = "tt.reduce"(%right_idx_72) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_74 = tt.expand_dims %right_idx_73 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_75 = tt.broadcast %right_idx_74 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_76 = tt.reshape %left_idx_71 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_77 = tt.reshape %right_idx_75 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_78 = arith.cmpi slt, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc251) + %eq_79 = arith.cmpi eq, %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc252) + %cond_80 = arith.cmpi sgt, %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc253) + %cond_81 = arith.andi %eq_79, %cond_80 : tensor<1x16xi1> loc(#loc254) + %cond_82 = arith.ori %cond_78, %cond_81 : tensor<1x16xi1> loc(#loc255) + %cond_83 = arith.extui %cond_82 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_84 = arith.xori %cond_83, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_85 = arith.cmpi ne, %cond_84, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_86 = arith.xori %ileft_65, %iright_66 : tensor<1x16xi32> loc(#loc258) + %ret_87 = arith.select %cond_85, %ret_86, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_88 = arith.xori %ret_49, %ret_87 : tensor<1x16xi32> loc(#loc260) + %new_idxs_89 = arith.xori %left_idx_76, %right_idx_77 : tensor<1x16xi32> loc(#loc261) + %new_idxs_90 = arith.select %cond_85, %new_idxs_89, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_91 = arith.xori %new_idxs_52, %new_idxs_90 : tensor<1x16xi32> loc(#loc263) + %y_92 = tt.reshape %ret_88 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_93 = arith.muli %y_92, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_94 = "tt.reduce"(%ileft_93) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_95 = tt.expand_dims %ileft_94 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_96 = tt.broadcast %ileft_95 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_97 = arith.muli %y_92, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_98 = "tt.reduce"(%iright_97) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_99 = tt.expand_dims %iright_98 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_100 = tt.broadcast %iright_99 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_101 = tt.reshape %ileft_96 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_102 = tt.reshape %iright_100 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_103 = tt.reshape %new_idxs_91 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_104 = arith.muli %y_idx_103, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_105 = "tt.reduce"(%left_idx_104) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_106 = tt.expand_dims %left_idx_105 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_107 = tt.broadcast %left_idx_106 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_108 = arith.muli %y_idx_103, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_109 = "tt.reduce"(%right_idx_108) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_110 = tt.expand_dims %right_idx_109 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_111 = tt.broadcast %right_idx_110 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_112 = tt.reshape %left_idx_107 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_113 = tt.reshape %right_idx_111 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_114 = arith.cmpi slt, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc251) + %eq_115 = arith.cmpi eq, %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc252) + %cond_116 = arith.cmpi sgt, %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc253) + %cond_117 = arith.andi %eq_115, %cond_116 : tensor<1x16xi1> loc(#loc254) + %cond_118 = arith.ori %cond_114, %cond_117 : tensor<1x16xi1> loc(#loc255) + %cond_119 = arith.extui %cond_118 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_120 = arith.xori %cond_119, %flip_54 : tensor<1x16xi32> loc(#loc256) + %cond_121 = arith.cmpi ne, %cond_120, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_122 = arith.xori %ileft_101, %iright_102 : tensor<1x16xi32> loc(#loc258) + %ret_123 = arith.select %cond_121, %ret_122, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_124 = arith.xori %ret_88, %ret_123 : tensor<1x16xi32> loc(#loc260) + %new_idxs_125 = arith.xori %left_idx_112, %right_idx_113 : tensor<1x16xi32> loc(#loc261) + %new_idxs_126 = arith.select %cond_121, %new_idxs_125, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_127 = arith.xori %new_idxs_91, %new_idxs_126 : tensor<1x16xi32> loc(#loc263) + %flip_128 = tt.broadcast %flip_16 : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc213) + %flip_129 = tt.reshape %flip_128 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc214) + %y_130 = tt.reshape %ret_124 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_131 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<2x2x4xi32> loc(#loc228) + %ileft_132 = arith.muli %y_130, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_133 = "tt.reduce"(%ileft_132) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_134 = tt.expand_dims %ileft_133 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_135 = tt.broadcast %ileft_134 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_136 = arith.muli %y_130, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_137 = "tt.reduce"(%iright_136) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_138 = tt.expand_dims %iright_137 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_139 = tt.broadcast %iright_138 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_140 = tt.reshape %ileft_135 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_141 = tt.reshape %iright_139 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_142 = tt.reshape %new_idxs_127 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_143 = arith.muli %y_idx_142, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_144 = "tt.reduce"(%left_idx_143) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_145 = tt.expand_dims %left_idx_144 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_146 = tt.broadcast %left_idx_145 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_147 = arith.muli %y_idx_142, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_148 = "tt.reduce"(%right_idx_147) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_149 = tt.expand_dims %right_idx_148 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_150 = tt.broadcast %right_idx_149 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_151 = tt.reshape %left_idx_146 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_152 = tt.reshape %right_idx_150 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_153 = arith.cmpi slt, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc251) + %eq_154 = arith.cmpi eq, %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc252) + %cond_155 = arith.cmpi sgt, %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc253) + %cond_156 = arith.andi %eq_154, %cond_155 : tensor<1x16xi1> loc(#loc254) + %cond_157 = arith.ori %cond_153, %cond_156 : tensor<1x16xi1> loc(#loc255) + %cond_158 = arith.extui %cond_157 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_159 = arith.xori %cond_158, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_160 = arith.cmpi ne, %cond_159, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_161 = arith.xori %ileft_140, %iright_141 : tensor<1x16xi32> loc(#loc258) + %ret_162 = arith.select %cond_160, %ret_161, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_163 = arith.xori %ret_124, %ret_162 : tensor<1x16xi32> loc(#loc260) + %new_idxs_164 = arith.xori %left_idx_151, %right_idx_152 : tensor<1x16xi32> loc(#loc261) + %new_idxs_165 = arith.select %cond_160, %new_idxs_164, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_166 = arith.xori %new_idxs_127, %new_idxs_165 : tensor<1x16xi32> loc(#loc263) + %y_167 = tt.reshape %ret_163 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_168 = arith.muli %y_167, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_169 = "tt.reduce"(%ileft_168) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_170 = tt.expand_dims %ileft_169 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_171 = tt.broadcast %ileft_170 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_172 = arith.muli %y_167, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_173 = "tt.reduce"(%iright_172) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_174 = tt.expand_dims %iright_173 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_175 = tt.broadcast %iright_174 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_176 = tt.reshape %ileft_171 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_177 = tt.reshape %iright_175 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_178 = tt.reshape %new_idxs_166 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_179 = arith.muli %y_idx_178, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_180 = "tt.reduce"(%left_idx_179) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_181 = tt.expand_dims %left_idx_180 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_182 = tt.broadcast %left_idx_181 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_183 = arith.muli %y_idx_178, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_184 = "tt.reduce"(%right_idx_183) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_185 = tt.expand_dims %right_idx_184 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_186 = tt.broadcast %right_idx_185 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_187 = tt.reshape %left_idx_182 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_188 = tt.reshape %right_idx_186 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_189 = arith.cmpi slt, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc251) + %eq_190 = arith.cmpi eq, %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc252) + %cond_191 = arith.cmpi sgt, %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc253) + %cond_192 = arith.andi %eq_190, %cond_191 : tensor<1x16xi1> loc(#loc254) + %cond_193 = arith.ori %cond_189, %cond_192 : tensor<1x16xi1> loc(#loc255) + %cond_194 = arith.extui %cond_193 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_195 = arith.xori %cond_194, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_196 = arith.cmpi ne, %cond_195, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_197 = arith.xori %ileft_176, %iright_177 : tensor<1x16xi32> loc(#loc258) + %ret_198 = arith.select %cond_196, %ret_197, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_199 = arith.xori %ret_163, %ret_198 : tensor<1x16xi32> loc(#loc260) + %new_idxs_200 = arith.xori %left_idx_187, %right_idx_188 : tensor<1x16xi32> loc(#loc261) + %new_idxs_201 = arith.select %cond_196, %new_idxs_200, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_202 = arith.xori %new_idxs_166, %new_idxs_201 : tensor<1x16xi32> loc(#loc263) + %y_203 = tt.reshape %ret_199 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_204 = arith.muli %y_203, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_205 = "tt.reduce"(%ileft_204) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_206 = tt.expand_dims %ileft_205 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_207 = tt.broadcast %ileft_206 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_208 = arith.muli %y_203, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_209 = "tt.reduce"(%iright_208) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_210 = tt.expand_dims %iright_209 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_211 = tt.broadcast %iright_210 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_212 = tt.reshape %ileft_207 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_213 = tt.reshape %iright_211 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_214 = tt.reshape %new_idxs_202 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_215 = arith.muli %y_idx_214, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_216 = "tt.reduce"(%left_idx_215) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_217 = tt.expand_dims %left_idx_216 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_218 = tt.broadcast %left_idx_217 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_219 = arith.muli %y_idx_214, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_220 = "tt.reduce"(%right_idx_219) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_221 = tt.expand_dims %right_idx_220 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_222 = tt.broadcast %right_idx_221 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_223 = tt.reshape %left_idx_218 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_224 = tt.reshape %right_idx_222 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_225 = arith.cmpi slt, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc251) + %eq_226 = arith.cmpi eq, %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc252) + %cond_227 = arith.cmpi sgt, %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc253) + %cond_228 = arith.andi %eq_226, %cond_227 : tensor<1x16xi1> loc(#loc254) + %cond_229 = arith.ori %cond_225, %cond_228 : tensor<1x16xi1> loc(#loc255) + %cond_230 = arith.extui %cond_229 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc256) + %cond_231 = arith.xori %cond_230, %flip_129 : tensor<1x16xi32> loc(#loc256) + %cond_232 = arith.cmpi ne, %cond_231, %cst_1 : tensor<1x16xi32> loc(#loc257) + %ret_233 = arith.xori %ileft_212, %iright_213 : tensor<1x16xi32> loc(#loc258) + %ret_234 = arith.select %cond_232, %ret_233, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_235 = arith.xori %ret_199, %ret_234 : tensor<1x16xi32> loc(#loc260) + %new_idxs_236 = arith.xori %left_idx_223, %right_idx_224 : tensor<1x16xi32> loc(#loc261) + %new_idxs_237 = arith.select %cond_232, %new_idxs_236, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_238 = arith.xori %new_idxs_202, %new_idxs_237 : tensor<1x16xi32> loc(#loc263) + %y_239 = tt.reshape %ret_235 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc226) + %ileft_240 = tt.broadcast %left_mask : tensor<1x2x1xi32> -> tensor<1x2x8xi32> loc(#loc228) + %ileft_241 = arith.muli %y_239, %ileft_240 : tensor<1x2x8xi32> loc(#loc228) + %ileft_242 = "tt.reduce"(%ileft_241) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc301) + %ileft_243 = tt.expand_dims %ileft_242 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc230) + %ileft_244 = tt.broadcast %ileft_243 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc231) + %iright_245 = arith.muli %y_239, %flip_128 : tensor<1x2x8xi32> loc(#loc232) + %iright_246 = "tt.reduce"(%iright_245) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc303) + %iright_247 = tt.expand_dims %iright_246 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc234) + %iright_248 = tt.broadcast %iright_247 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc235) + %ileft_249 = tt.reshape %ileft_244 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_250 = tt.reshape %iright_248 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_251 = tt.reshape %new_idxs_238 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc238) + %left_idx_252 = arith.muli %y_idx_251, %ileft_240 : tensor<1x2x8xi32> loc(#loc240) + %left_idx_253 = "tt.reduce"(%left_idx_252) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc306) + %left_idx_254 = tt.expand_dims %left_idx_253 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc242) + %left_idx_255 = tt.broadcast %left_idx_254 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc243) + %right_idx_256 = arith.muli %y_idx_251, %flip_128 : tensor<1x2x8xi32> loc(#loc245) + %right_idx_257 = "tt.reduce"(%right_idx_256) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc309) + %right_idx_258 = tt.expand_dims %right_idx_257 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc247) + %right_idx_259 = tt.broadcast %right_idx_258 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc248) + %left_idx_260 = tt.reshape %left_idx_255 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_261 = tt.reshape %right_idx_259 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_262 = arith.cmpi slt, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc251) + %eq_263 = arith.cmpi eq, %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc252) + %cond_264 = arith.cmpi sgt, %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc253) + %cond_265 = arith.andi %eq_263, %cond_264 : tensor<1x16xi1> loc(#loc254) + %cond_266 = arith.ori %cond_262, %cond_265 : tensor<1x16xi1> loc(#loc255) + %ret_267 = arith.xori %ileft_249, %iright_250 : tensor<1x16xi32> loc(#loc258) + %ret_268 = arith.select %cond_266, %ret_267, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_269 = arith.xori %ret_235, %ret_268 : tensor<1x16xi32> loc(#loc260) + %new_idxs_270 = arith.xori %left_idx_260, %right_idx_261 : tensor<1x16xi32> loc(#loc261) + %new_idxs_271 = arith.select %cond_266, %new_idxs_270, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_272 = arith.xori %new_idxs_238, %new_idxs_271 : tensor<1x16xi32> loc(#loc263) + %y_273 = tt.reshape %ret_269 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc226) + %ileft_274 = arith.muli %y_273, %ileft_131 : tensor<2x2x4xi32> loc(#loc228) + %ileft_275 = "tt.reduce"(%ileft_274) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc301) + %ileft_276 = tt.expand_dims %ileft_275 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc230) + %ileft_277 = tt.broadcast %ileft_276 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc231) + %iright_278 = arith.muli %y_273, %flip_53 : tensor<2x2x4xi32> loc(#loc232) + %iright_279 = "tt.reduce"(%iright_278) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc303) + %iright_280 = tt.expand_dims %iright_279 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc234) + %iright_281 = tt.broadcast %iright_280 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc235) + %ileft_282 = tt.reshape %ileft_277 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_283 = tt.reshape %iright_281 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_284 = tt.reshape %new_idxs_272 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc238) + %left_idx_285 = arith.muli %y_idx_284, %ileft_131 : tensor<2x2x4xi32> loc(#loc240) + %left_idx_286 = "tt.reduce"(%left_idx_285) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc306) + %left_idx_287 = tt.expand_dims %left_idx_286 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc242) + %left_idx_288 = tt.broadcast %left_idx_287 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc243) + %right_idx_289 = arith.muli %y_idx_284, %flip_53 : tensor<2x2x4xi32> loc(#loc245) + %right_idx_290 = "tt.reduce"(%right_idx_289) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc309) + %right_idx_291 = tt.expand_dims %right_idx_290 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc247) + %right_idx_292 = tt.broadcast %right_idx_291 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc248) + %left_idx_293 = tt.reshape %left_idx_288 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_294 = tt.reshape %right_idx_292 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_295 = arith.cmpi slt, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc251) + %eq_296 = arith.cmpi eq, %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc252) + %cond_297 = arith.cmpi sgt, %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc253) + %cond_298 = arith.andi %eq_296, %cond_297 : tensor<1x16xi1> loc(#loc254) + %cond_299 = arith.ori %cond_295, %cond_298 : tensor<1x16xi1> loc(#loc255) + %ret_300 = arith.xori %ileft_282, %iright_283 : tensor<1x16xi32> loc(#loc258) + %ret_301 = arith.select %cond_299, %ret_300, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_302 = arith.xori %ret_269, %ret_301 : tensor<1x16xi32> loc(#loc260) + %new_idxs_303 = arith.xori %left_idx_293, %right_idx_294 : tensor<1x16xi32> loc(#loc261) + %new_idxs_304 = arith.select %cond_299, %new_idxs_303, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_305 = arith.xori %new_idxs_272, %new_idxs_304 : tensor<1x16xi32> loc(#loc263) + %y_306 = tt.reshape %ret_302 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc226) + %ileft_307 = arith.muli %y_306, %ileft_56 : tensor<4x2x2xi32> loc(#loc228) + %ileft_308 = "tt.reduce"(%ileft_307) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc301) + %ileft_309 = tt.expand_dims %ileft_308 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc230) + %ileft_310 = tt.broadcast %ileft_309 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc231) + %iright_311 = arith.muli %y_306, %flip_17 : tensor<4x2x2xi32> loc(#loc232) + %iright_312 = "tt.reduce"(%iright_311) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc303) + %iright_313 = tt.expand_dims %iright_312 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc234) + %iright_314 = tt.broadcast %iright_313 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc235) + %ileft_315 = tt.reshape %ileft_310 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_316 = tt.reshape %iright_314 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_317 = tt.reshape %new_idxs_305 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc238) + %left_idx_318 = arith.muli %y_idx_317, %ileft_56 : tensor<4x2x2xi32> loc(#loc240) + %left_idx_319 = "tt.reduce"(%left_idx_318) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc306) + %left_idx_320 = tt.expand_dims %left_idx_319 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc242) + %left_idx_321 = tt.broadcast %left_idx_320 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc243) + %right_idx_322 = arith.muli %y_idx_317, %flip_17 : tensor<4x2x2xi32> loc(#loc245) + %right_idx_323 = "tt.reduce"(%right_idx_322) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc309) + %right_idx_324 = tt.expand_dims %right_idx_323 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc247) + %right_idx_325 = tt.broadcast %right_idx_324 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc248) + %left_idx_326 = tt.reshape %left_idx_321 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_327 = tt.reshape %right_idx_325 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_328 = arith.cmpi slt, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc251) + %eq_329 = arith.cmpi eq, %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc252) + %cond_330 = arith.cmpi sgt, %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc253) + %cond_331 = arith.andi %eq_329, %cond_330 : tensor<1x16xi1> loc(#loc254) + %cond_332 = arith.ori %cond_328, %cond_331 : tensor<1x16xi1> loc(#loc255) + %ret_333 = arith.xori %ileft_315, %iright_316 : tensor<1x16xi32> loc(#loc258) + %ret_334 = arith.select %cond_332, %ret_333, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc259) + %ret_335 = arith.xori %ret_302, %ret_334 : tensor<1x16xi32> loc(#loc260) + %new_idxs_336 = arith.xori %left_idx_326, %right_idx_327 : tensor<1x16xi32> loc(#loc261) + %new_idxs_337 = arith.select %cond_332, %new_idxs_336, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_338 = arith.xori %new_idxs_305, %new_idxs_337 : tensor<1x16xi32> loc(#loc263) + %y_339 = tt.reshape %ret_335 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc226) + %ileft_340 = arith.muli %y_339, %ileft : tensor<8x2x1xi32> loc(#loc228) + %ileft_341 = "tt.reduce"(%ileft_340) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc229)), %ileft_706: i32 loc(callsite(#loc1 at #loc229))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc319) + tt.reduce.return %ileft_707 : i32 loc(#loc301) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc301) + %ileft_342 = tt.expand_dims %ileft_341 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc230) + %ileft_343 = tt.broadcast %ileft_342 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc231) + %iright_344 = arith.muli %y_339, %iright : tensor<8x2x1xi32> loc(#loc232) + %iright_345 = "tt.reduce"(%iright_344) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc233)), %iright_706: i32 loc(callsite(#loc1 at #loc233))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc320) + tt.reduce.return %iright_707 : i32 loc(#loc303) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc303) + %iright_346 = tt.expand_dims %iright_345 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc234) + %iright_347 = tt.broadcast %iright_346 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc235) + %ileft_348 = tt.reshape %ileft_343 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc236) + %iright_349 = tt.reshape %iright_347 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc237) + %y_idx_350 = tt.reshape %new_idxs_338 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc238) + %left_idx_351 = arith.muli %y_idx_350, %ileft : tensor<8x2x1xi32> loc(#loc240) + %left_idx_352 = "tt.reduce"(%left_idx_351) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc241)), %left_idx_706: i32 loc(callsite(#loc1 at #loc241))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc321) + tt.reduce.return %left_idx_707 : i32 loc(#loc306) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc306) + %left_idx_353 = tt.expand_dims %left_idx_352 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc242) + %left_idx_354 = tt.broadcast %left_idx_353 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc243) + %right_idx_355 = arith.muli %y_idx_350, %iright : tensor<8x2x1xi32> loc(#loc245) + %right_idx_356 = "tt.reduce"(%right_idx_355) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc246)), %right_idx_706: i32 loc(callsite(#loc1 at #loc246))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc322) + tt.reduce.return %right_idx_707 : i32 loc(#loc309) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc309) + %right_idx_357 = tt.expand_dims %right_idx_356 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc247) + %right_idx_358 = tt.broadcast %right_idx_357 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc248) + %left_idx_359 = tt.reshape %left_idx_354 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc249) + %right_idx_360 = tt.reshape %right_idx_358 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc250) + %cond_361 = arith.cmpi slt, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc251) + %eq_362 = arith.cmpi eq, %ileft_348, %iright_349 : tensor<1x16xi32> loc(#loc252) + %cond_363 = arith.cmpi sgt, %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc253) + %cond_364 = arith.andi %eq_362, %cond_363 : tensor<1x16xi1> loc(#loc254) + %cond_365 = arith.ori %cond_361, %cond_364 : tensor<1x16xi1> loc(#loc255) + %new_idxs_366 = arith.xori %left_idx_359, %right_idx_360 : tensor<1x16xi32> loc(#loc261) + %new_idxs_367 = arith.select %cond_365, %new_idxs_366, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc262) + %new_idxs_368 = arith.xori %new_idxs_338, %new_idxs_367 : tensor<1x16xi32> loc(#loc263) + %tmp14 = arith.cmpi eq, %tmp0_14, %cst_4 : tensor<1x16xi64> loc(#loc182) + %tmp16 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc218) + %y_369 = tt.reshape %tmp16 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_370 = arith.muli %y_369, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_371 = "tt.reduce"(%ileft_370) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_372 = tt.expand_dims %ileft_371 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_373 = tt.broadcast %ileft_372 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_374 = arith.muli %y_369, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_375 = "tt.reduce"(%iright_374) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_376 = tt.expand_dims %iright_375 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_377 = tt.broadcast %iright_376 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_378 = tt.reshape %ileft_373 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_379 = tt.reshape %iright_377 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %cond_380 = arith.cmpi slt, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc275) + %eq_381 = arith.cmpi eq, %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc276) + %cond_382 = arith.andi %eq_381, %cond_42 : tensor<1x16xi1> loc(#loc277) + %cond_383 = arith.ori %cond_380, %cond_382 : tensor<1x16xi1> loc(#loc278) + %cond_384 = arith.extui %cond_383 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_385 = arith.xori %cond_384, %flip_18 : tensor<1x16xi32> loc(#loc279) + %cond_386 = arith.cmpi ne, %cond_385, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_387 = arith.xori %ileft_378, %iright_379 : tensor<1x16xi32> loc(#loc281) + %ret_388 = arith.select %cond_386, %ret_387, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_389 = arith.xori %tmp16, %ret_388 : tensor<1x16xi32> loc(#loc283) + %new_idxs_390 = arith.select %cond_386, %new_idxs, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_391 = arith.xori %new_idxs_51, %new_idxs_390 : tensor<1x16xi32> loc(#loc285) + %y_392 = tt.reshape %ret_389 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_393 = arith.muli %y_392, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_394 = "tt.reduce"(%ileft_393) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_395 = tt.expand_dims %ileft_394 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_396 = tt.broadcast %ileft_395 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_397 = arith.muli %y_392, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_398 = "tt.reduce"(%iright_397) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_399 = tt.expand_dims %iright_398 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_400 = tt.broadcast %iright_399 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_401 = tt.reshape %ileft_396 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_402 = tt.reshape %iright_400 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_403 = tt.reshape %new_idxs_391 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_404 = arith.muli %y_idx_403, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_405 = "tt.reduce"(%left_idx_404) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_406 = tt.expand_dims %left_idx_405 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_407 = tt.broadcast %left_idx_406 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_408 = arith.muli %y_idx_403, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_409 = "tt.reduce"(%right_idx_408) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_410 = tt.expand_dims %right_idx_409 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_411 = tt.broadcast %right_idx_410 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_412 = tt.reshape %left_idx_407 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_413 = tt.reshape %right_idx_411 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_414 = arith.cmpi slt, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc275) + %eq_415 = arith.cmpi eq, %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc276) + %cond_416 = arith.cmpi sgt, %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc297) + %cond_417 = arith.andi %eq_415, %cond_416 : tensor<1x16xi1> loc(#loc277) + %cond_418 = arith.ori %cond_414, %cond_417 : tensor<1x16xi1> loc(#loc278) + %cond_419 = arith.extui %cond_418 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_420 = arith.xori %cond_419, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_421 = arith.cmpi ne, %cond_420, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_422 = arith.xori %ileft_401, %iright_402 : tensor<1x16xi32> loc(#loc281) + %ret_423 = arith.select %cond_421, %ret_422, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_424 = arith.xori %ret_389, %ret_423 : tensor<1x16xi32> loc(#loc283) + %new_idxs_425 = arith.xori %left_idx_412, %right_idx_413 : tensor<1x16xi32> loc(#loc298) + %new_idxs_426 = arith.select %cond_421, %new_idxs_425, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_427 = arith.xori %new_idxs_391, %new_idxs_426 : tensor<1x16xi32> loc(#loc285) + %y_428 = tt.reshape %ret_424 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_429 = arith.muli %y_428, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_430 = "tt.reduce"(%ileft_429) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_431 = tt.expand_dims %ileft_430 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_432 = tt.broadcast %ileft_431 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_433 = arith.muli %y_428, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_434 = "tt.reduce"(%iright_433) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_435 = tt.expand_dims %iright_434 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_436 = tt.broadcast %iright_435 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_437 = tt.reshape %ileft_432 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_438 = tt.reshape %iright_436 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_439 = tt.reshape %new_idxs_427 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_440 = arith.muli %y_idx_439, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_441 = "tt.reduce"(%left_idx_440) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_442 = tt.expand_dims %left_idx_441 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_443 = tt.broadcast %left_idx_442 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_444 = arith.muli %y_idx_439, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_445 = "tt.reduce"(%right_idx_444) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_446 = tt.expand_dims %right_idx_445 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_447 = tt.broadcast %right_idx_446 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_448 = tt.reshape %left_idx_443 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_449 = tt.reshape %right_idx_447 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_450 = arith.cmpi slt, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc275) + %eq_451 = arith.cmpi eq, %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc276) + %cond_452 = arith.cmpi sgt, %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc297) + %cond_453 = arith.andi %eq_451, %cond_452 : tensor<1x16xi1> loc(#loc277) + %cond_454 = arith.ori %cond_450, %cond_453 : tensor<1x16xi1> loc(#loc278) + %cond_455 = arith.extui %cond_454 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_456 = arith.xori %cond_455, %flip_54 : tensor<1x16xi32> loc(#loc279) + %cond_457 = arith.cmpi ne, %cond_456, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_458 = arith.xori %ileft_437, %iright_438 : tensor<1x16xi32> loc(#loc281) + %ret_459 = arith.select %cond_457, %ret_458, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_460 = arith.xori %ret_424, %ret_459 : tensor<1x16xi32> loc(#loc283) + %new_idxs_461 = arith.xori %left_idx_448, %right_idx_449 : tensor<1x16xi32> loc(#loc298) + %new_idxs_462 = arith.select %cond_457, %new_idxs_461, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_463 = arith.xori %new_idxs_427, %new_idxs_462 : tensor<1x16xi32> loc(#loc285) + %y_464 = tt.reshape %ret_460 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_465 = arith.muli %y_464, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_466 = "tt.reduce"(%ileft_465) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_467 = tt.expand_dims %ileft_466 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_468 = tt.broadcast %ileft_467 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_469 = arith.muli %y_464, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_470 = "tt.reduce"(%iright_469) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_471 = tt.expand_dims %iright_470 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_472 = tt.broadcast %iright_471 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_473 = tt.reshape %ileft_468 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_474 = tt.reshape %iright_472 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_475 = tt.reshape %new_idxs_463 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_476 = arith.muli %y_idx_475, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_477 = "tt.reduce"(%left_idx_476) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_478 = tt.expand_dims %left_idx_477 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_479 = tt.broadcast %left_idx_478 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_480 = arith.muli %y_idx_475, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_481 = "tt.reduce"(%right_idx_480) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_482 = tt.expand_dims %right_idx_481 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_483 = tt.broadcast %right_idx_482 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_484 = tt.reshape %left_idx_479 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_485 = tt.reshape %right_idx_483 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_486 = arith.cmpi slt, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc275) + %eq_487 = arith.cmpi eq, %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc276) + %cond_488 = arith.cmpi sgt, %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc297) + %cond_489 = arith.andi %eq_487, %cond_488 : tensor<1x16xi1> loc(#loc277) + %cond_490 = arith.ori %cond_486, %cond_489 : tensor<1x16xi1> loc(#loc278) + %cond_491 = arith.extui %cond_490 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_492 = arith.xori %cond_491, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_493 = arith.cmpi ne, %cond_492, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_494 = arith.xori %ileft_473, %iright_474 : tensor<1x16xi32> loc(#loc281) + %ret_495 = arith.select %cond_493, %ret_494, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_496 = arith.xori %ret_460, %ret_495 : tensor<1x16xi32> loc(#loc283) + %new_idxs_497 = arith.xori %left_idx_484, %right_idx_485 : tensor<1x16xi32> loc(#loc298) + %new_idxs_498 = arith.select %cond_493, %new_idxs_497, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_499 = arith.xori %new_idxs_463, %new_idxs_498 : tensor<1x16xi32> loc(#loc285) + %y_500 = tt.reshape %ret_496 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_501 = arith.muli %y_500, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_502 = "tt.reduce"(%ileft_501) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_503 = tt.expand_dims %ileft_502 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_504 = tt.broadcast %ileft_503 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_505 = arith.muli %y_500, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_506 = "tt.reduce"(%iright_505) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_507 = tt.expand_dims %iright_506 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_508 = tt.broadcast %iright_507 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_509 = tt.reshape %ileft_504 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_510 = tt.reshape %iright_508 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_511 = tt.reshape %new_idxs_499 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_512 = arith.muli %y_idx_511, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_513 = "tt.reduce"(%left_idx_512) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_514 = tt.expand_dims %left_idx_513 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_515 = tt.broadcast %left_idx_514 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_516 = arith.muli %y_idx_511, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_517 = "tt.reduce"(%right_idx_516) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_518 = tt.expand_dims %right_idx_517 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_519 = tt.broadcast %right_idx_518 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_520 = tt.reshape %left_idx_515 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_521 = tt.reshape %right_idx_519 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_522 = arith.cmpi slt, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc275) + %eq_523 = arith.cmpi eq, %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc276) + %cond_524 = arith.cmpi sgt, %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc297) + %cond_525 = arith.andi %eq_523, %cond_524 : tensor<1x16xi1> loc(#loc277) + %cond_526 = arith.ori %cond_522, %cond_525 : tensor<1x16xi1> loc(#loc278) + %cond_527 = arith.extui %cond_526 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_528 = arith.xori %cond_527, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_529 = arith.cmpi ne, %cond_528, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_530 = arith.xori %ileft_509, %iright_510 : tensor<1x16xi32> loc(#loc281) + %ret_531 = arith.select %cond_529, %ret_530, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_532 = arith.xori %ret_496, %ret_531 : tensor<1x16xi32> loc(#loc283) + %new_idxs_533 = arith.xori %left_idx_520, %right_idx_521 : tensor<1x16xi32> loc(#loc298) + %new_idxs_534 = arith.select %cond_529, %new_idxs_533, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_535 = arith.xori %new_idxs_499, %new_idxs_534 : tensor<1x16xi32> loc(#loc285) + %y_536 = tt.reshape %ret_532 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_537 = arith.muli %y_536, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_538 = "tt.reduce"(%ileft_537) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_539 = tt.expand_dims %ileft_538 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_540 = tt.broadcast %ileft_539 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_541 = arith.muli %y_536, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_542 = "tt.reduce"(%iright_541) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_543 = tt.expand_dims %iright_542 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_544 = tt.broadcast %iright_543 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_545 = tt.reshape %ileft_540 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_546 = tt.reshape %iright_544 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_547 = tt.reshape %new_idxs_535 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_548 = arith.muli %y_idx_547, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_549 = "tt.reduce"(%left_idx_548) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_550 = tt.expand_dims %left_idx_549 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_551 = tt.broadcast %left_idx_550 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_552 = arith.muli %y_idx_547, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_553 = "tt.reduce"(%right_idx_552) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_554 = tt.expand_dims %right_idx_553 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_555 = tt.broadcast %right_idx_554 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_556 = tt.reshape %left_idx_551 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_557 = tt.reshape %right_idx_555 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_558 = arith.cmpi slt, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc275) + %eq_559 = arith.cmpi eq, %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc276) + %cond_560 = arith.cmpi sgt, %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc297) + %cond_561 = arith.andi %eq_559, %cond_560 : tensor<1x16xi1> loc(#loc277) + %cond_562 = arith.ori %cond_558, %cond_561 : tensor<1x16xi1> loc(#loc278) + %cond_563 = arith.extui %cond_562 : tensor<1x16xi1> to tensor<1x16xi32> loc(#loc279) + %cond_564 = arith.xori %cond_563, %flip_129 : tensor<1x16xi32> loc(#loc279) + %cond_565 = arith.cmpi ne, %cond_564, %cst_1 : tensor<1x16xi32> loc(#loc280) + %ret_566 = arith.xori %ileft_545, %iright_546 : tensor<1x16xi32> loc(#loc281) + %ret_567 = arith.select %cond_565, %ret_566, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_568 = arith.xori %ret_532, %ret_567 : tensor<1x16xi32> loc(#loc283) + %new_idxs_569 = arith.xori %left_idx_556, %right_idx_557 : tensor<1x16xi32> loc(#loc298) + %new_idxs_570 = arith.select %cond_565, %new_idxs_569, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_571 = arith.xori %new_idxs_535, %new_idxs_570 : tensor<1x16xi32> loc(#loc285) + %y_572 = tt.reshape %ret_568 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc264) + %ileft_573 = arith.muli %y_572, %ileft_240 : tensor<1x2x8xi32> loc(#loc265) + %ileft_574 = "tt.reduce"(%ileft_573) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc311) + %ileft_575 = tt.expand_dims %ileft_574 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc267) + %ileft_576 = tt.broadcast %ileft_575 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc268) + %iright_577 = arith.muli %y_572, %flip_128 : tensor<1x2x8xi32> loc(#loc269) + %iright_578 = "tt.reduce"(%iright_577) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc313) + %iright_579 = tt.expand_dims %iright_578 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc271) + %iright_580 = tt.broadcast %iright_579 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc272) + %ileft_581 = tt.reshape %ileft_576 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_582 = tt.reshape %iright_580 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_583 = tt.reshape %new_idxs_571 : tensor<1x16xi32> -> tensor<1x2x8xi32> loc(#loc286) + %left_idx_584 = arith.muli %y_idx_583, %ileft_240 : tensor<1x2x8xi32> loc(#loc287) + %left_idx_585 = "tt.reduce"(%left_idx_584) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc315) + %left_idx_586 = tt.expand_dims %left_idx_585 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc289) + %left_idx_587 = tt.broadcast %left_idx_586 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc290) + %right_idx_588 = arith.muli %y_idx_583, %flip_128 : tensor<1x2x8xi32> loc(#loc291) + %right_idx_589 = "tt.reduce"(%right_idx_588) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<1x2x8xi32>) -> tensor<1x8xi32> loc(#loc317) + %right_idx_590 = tt.expand_dims %right_idx_589 {axis = 1 : i32} : tensor<1x8xi32> -> tensor<1x1x8xi32> loc(#loc293) + %right_idx_591 = tt.broadcast %right_idx_590 : tensor<1x1x8xi32> -> tensor<1x2x8xi32> loc(#loc294) + %left_idx_592 = tt.reshape %left_idx_587 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_593 = tt.reshape %right_idx_591 : tensor<1x2x8xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_594 = arith.cmpi slt, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc275) + %eq_595 = arith.cmpi eq, %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc276) + %cond_596 = arith.cmpi sgt, %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc297) + %cond_597 = arith.andi %eq_595, %cond_596 : tensor<1x16xi1> loc(#loc277) + %cond_598 = arith.ori %cond_594, %cond_597 : tensor<1x16xi1> loc(#loc278) + %ret_599 = arith.xori %ileft_581, %iright_582 : tensor<1x16xi32> loc(#loc281) + %ret_600 = arith.select %cond_598, %ret_599, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_601 = arith.xori %ret_568, %ret_600 : tensor<1x16xi32> loc(#loc283) + %new_idxs_602 = arith.xori %left_idx_592, %right_idx_593 : tensor<1x16xi32> loc(#loc298) + %new_idxs_603 = arith.select %cond_598, %new_idxs_602, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_604 = arith.xori %new_idxs_571, %new_idxs_603 : tensor<1x16xi32> loc(#loc285) + %y_605 = tt.reshape %ret_601 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc264) + %ileft_606 = arith.muli %y_605, %ileft_131 : tensor<2x2x4xi32> loc(#loc265) + %ileft_607 = "tt.reduce"(%ileft_606) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc311) + %ileft_608 = tt.expand_dims %ileft_607 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc267) + %ileft_609 = tt.broadcast %ileft_608 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc268) + %iright_610 = arith.muli %y_605, %flip_53 : tensor<2x2x4xi32> loc(#loc269) + %iright_611 = "tt.reduce"(%iright_610) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc313) + %iright_612 = tt.expand_dims %iright_611 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc271) + %iright_613 = tt.broadcast %iright_612 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc272) + %ileft_614 = tt.reshape %ileft_609 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_615 = tt.reshape %iright_613 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_616 = tt.reshape %new_idxs_604 : tensor<1x16xi32> -> tensor<2x2x4xi32> loc(#loc286) + %left_idx_617 = arith.muli %y_idx_616, %ileft_131 : tensor<2x2x4xi32> loc(#loc287) + %left_idx_618 = "tt.reduce"(%left_idx_617) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc315) + %left_idx_619 = tt.expand_dims %left_idx_618 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc289) + %left_idx_620 = tt.broadcast %left_idx_619 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc290) + %right_idx_621 = arith.muli %y_idx_616, %flip_53 : tensor<2x2x4xi32> loc(#loc291) + %right_idx_622 = "tt.reduce"(%right_idx_621) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<2x2x4xi32>) -> tensor<2x4xi32> loc(#loc317) + %right_idx_623 = tt.expand_dims %right_idx_622 {axis = 1 : i32} : tensor<2x4xi32> -> tensor<2x1x4xi32> loc(#loc293) + %right_idx_624 = tt.broadcast %right_idx_623 : tensor<2x1x4xi32> -> tensor<2x2x4xi32> loc(#loc294) + %left_idx_625 = tt.reshape %left_idx_620 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_626 = tt.reshape %right_idx_624 : tensor<2x2x4xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_627 = arith.cmpi slt, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc275) + %eq_628 = arith.cmpi eq, %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc276) + %cond_629 = arith.cmpi sgt, %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc297) + %cond_630 = arith.andi %eq_628, %cond_629 : tensor<1x16xi1> loc(#loc277) + %cond_631 = arith.ori %cond_627, %cond_630 : tensor<1x16xi1> loc(#loc278) + %ret_632 = arith.xori %ileft_614, %iright_615 : tensor<1x16xi32> loc(#loc281) + %ret_633 = arith.select %cond_631, %ret_632, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_634 = arith.xori %ret_601, %ret_633 : tensor<1x16xi32> loc(#loc283) + %new_idxs_635 = arith.xori %left_idx_625, %right_idx_626 : tensor<1x16xi32> loc(#loc298) + %new_idxs_636 = arith.select %cond_631, %new_idxs_635, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_637 = arith.xori %new_idxs_604, %new_idxs_636 : tensor<1x16xi32> loc(#loc285) + %y_638 = tt.reshape %ret_634 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc264) + %ileft_639 = arith.muli %y_638, %ileft_56 : tensor<4x2x2xi32> loc(#loc265) + %ileft_640 = "tt.reduce"(%ileft_639) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc311) + %ileft_641 = tt.expand_dims %ileft_640 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc267) + %ileft_642 = tt.broadcast %ileft_641 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc268) + %iright_643 = arith.muli %y_638, %flip_17 : tensor<4x2x2xi32> loc(#loc269) + %iright_644 = "tt.reduce"(%iright_643) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc313) + %iright_645 = tt.expand_dims %iright_644 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc271) + %iright_646 = tt.broadcast %iright_645 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc272) + %ileft_647 = tt.reshape %ileft_642 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_648 = tt.reshape %iright_646 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_649 = tt.reshape %new_idxs_637 : tensor<1x16xi32> -> tensor<4x2x2xi32> loc(#loc286) + %left_idx_650 = arith.muli %y_idx_649, %ileft_56 : tensor<4x2x2xi32> loc(#loc287) + %left_idx_651 = "tt.reduce"(%left_idx_650) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc315) + %left_idx_652 = tt.expand_dims %left_idx_651 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc289) + %left_idx_653 = tt.broadcast %left_idx_652 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc290) + %right_idx_654 = arith.muli %y_idx_649, %flip_17 : tensor<4x2x2xi32> loc(#loc291) + %right_idx_655 = "tt.reduce"(%right_idx_654) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<4x2x2xi32>) -> tensor<4x2xi32> loc(#loc317) + %right_idx_656 = tt.expand_dims %right_idx_655 {axis = 1 : i32} : tensor<4x2xi32> -> tensor<4x1x2xi32> loc(#loc293) + %right_idx_657 = tt.broadcast %right_idx_656 : tensor<4x1x2xi32> -> tensor<4x2x2xi32> loc(#loc294) + %left_idx_658 = tt.reshape %left_idx_653 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_659 = tt.reshape %right_idx_657 : tensor<4x2x2xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_660 = arith.cmpi slt, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc275) + %eq_661 = arith.cmpi eq, %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc276) + %cond_662 = arith.cmpi sgt, %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc297) + %cond_663 = arith.andi %eq_661, %cond_662 : tensor<1x16xi1> loc(#loc277) + %cond_664 = arith.ori %cond_660, %cond_663 : tensor<1x16xi1> loc(#loc278) + %ret_665 = arith.xori %ileft_647, %iright_648 : tensor<1x16xi32> loc(#loc281) + %ret_666 = arith.select %cond_664, %ret_665, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc282) + %ret_667 = arith.xori %ret_634, %ret_666 : tensor<1x16xi32> loc(#loc283) + %new_idxs_668 = arith.xori %left_idx_658, %right_idx_659 : tensor<1x16xi32> loc(#loc298) + %new_idxs_669 = arith.select %cond_664, %new_idxs_668, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_670 = arith.xori %new_idxs_637, %new_idxs_669 : tensor<1x16xi32> loc(#loc285) + %y_671 = tt.reshape %ret_667 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc264) + %ileft_672 = arith.muli %y_671, %ileft : tensor<8x2x1xi32> loc(#loc265) + %ileft_673 = "tt.reduce"(%ileft_672) <{axis = 1 : i32}> ({ + ^bb0(%ileft_705: i32 loc(callsite(#loc1 at #loc266)), %ileft_706: i32 loc(callsite(#loc1 at #loc266))): + %ileft_707 = arith.addi %ileft_705, %ileft_706 : i32 loc(#loc323) + tt.reduce.return %ileft_707 : i32 loc(#loc311) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc311) + %ileft_674 = tt.expand_dims %ileft_673 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc267) + %ileft_675 = tt.broadcast %ileft_674 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc268) + %iright_676 = arith.muli %y_671, %iright : tensor<8x2x1xi32> loc(#loc269) + %iright_677 = "tt.reduce"(%iright_676) <{axis = 1 : i32}> ({ + ^bb0(%iright_705: i32 loc(callsite(#loc1 at #loc270)), %iright_706: i32 loc(callsite(#loc1 at #loc270))): + %iright_707 = arith.addi %iright_705, %iright_706 : i32 loc(#loc324) + tt.reduce.return %iright_707 : i32 loc(#loc313) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc313) + %iright_678 = tt.expand_dims %iright_677 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc271) + %iright_679 = tt.broadcast %iright_678 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc272) + %ileft_680 = tt.reshape %ileft_675 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc273) + %iright_681 = tt.reshape %iright_679 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc274) + %y_idx_682 = tt.reshape %new_idxs_670 : tensor<1x16xi32> -> tensor<8x2x1xi32> loc(#loc286) + %left_idx_683 = arith.muli %y_idx_682, %ileft : tensor<8x2x1xi32> loc(#loc287) + %left_idx_684 = "tt.reduce"(%left_idx_683) <{axis = 1 : i32}> ({ + ^bb0(%left_idx_705: i32 loc(callsite(#loc1 at #loc288)), %left_idx_706: i32 loc(callsite(#loc1 at #loc288))): + %left_idx_707 = arith.addi %left_idx_705, %left_idx_706 : i32 loc(#loc325) + tt.reduce.return %left_idx_707 : i32 loc(#loc315) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc315) + %left_idx_685 = tt.expand_dims %left_idx_684 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc289) + %left_idx_686 = tt.broadcast %left_idx_685 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc290) + %right_idx_687 = arith.muli %y_idx_682, %iright : tensor<8x2x1xi32> loc(#loc291) + %right_idx_688 = "tt.reduce"(%right_idx_687) <{axis = 1 : i32}> ({ + ^bb0(%right_idx_705: i32 loc(callsite(#loc1 at #loc292)), %right_idx_706: i32 loc(callsite(#loc1 at #loc292))): + %right_idx_707 = arith.addi %right_idx_705, %right_idx_706 : i32 loc(#loc326) + tt.reduce.return %right_idx_707 : i32 loc(#loc317) + }) : (tensor<8x2x1xi32>) -> tensor<8x1xi32> loc(#loc317) + %right_idx_689 = tt.expand_dims %right_idx_688 {axis = 1 : i32} : tensor<8x1xi32> -> tensor<8x1x1xi32> loc(#loc293) + %right_idx_690 = tt.broadcast %right_idx_689 : tensor<8x1x1xi32> -> tensor<8x2x1xi32> loc(#loc294) + %left_idx_691 = tt.reshape %left_idx_686 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc295) + %right_idx_692 = tt.reshape %right_idx_690 : tensor<8x2x1xi32> -> tensor<1x16xi32> loc(#loc296) + %cond_693 = arith.cmpi slt, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc275) + %eq_694 = arith.cmpi eq, %ileft_680, %iright_681 : tensor<1x16xi32> loc(#loc276) + %cond_695 = arith.cmpi sgt, %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc297) + %cond_696 = arith.andi %eq_694, %cond_695 : tensor<1x16xi1> loc(#loc277) + %cond_697 = arith.ori %cond_693, %cond_696 : tensor<1x16xi1> loc(#loc278) + %new_idxs_698 = arith.xori %left_idx_691, %right_idx_692 : tensor<1x16xi32> loc(#loc298) + %new_idxs_699 = arith.select %cond_697, %new_idxs_698, %cst_1 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc284) + %new_idxs_700 = arith.xori %new_idxs_670, %new_idxs_699 : tensor<1x16xi32> loc(#loc285) + %tmp20 = arith.extui %tmp5 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc220) + %tmp23 = arith.select %tmp0_13, %tmp20, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc187) + %tmp24 = "tt.reduce"(%tmp23) <{axis = 1 : i32}> ({ + ^bb0(%tmp24_705: i64 loc(callsite(#loc1 at #loc188)), %tmp24_706: i64 loc(callsite(#loc1 at #loc188))): + %tmp24_707 = arith.addi %tmp24_705, %tmp24_706 : i64 loc(#loc299) + tt.reduce.return %tmp24_707 : i64 loc(#loc221) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc221) + %tmp24_701 = tt.expand_dims %tmp24 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc189) + %tmp25 = arith.extui %tmp14 : tensor<1x16xi1> to tensor<1x16xi64> loc(#loc223) + %tmp28 = arith.select %tmp0_13, %tmp25, %cst_5 : tensor<1x16xi1>, tensor<1x16xi64> loc(#loc191) + %tmp29 = "tt.reduce"(%tmp28) <{axis = 1 : i32}> ({ + ^bb0(%tmp29_705: i64 loc(callsite(#loc1 at #loc192)), %tmp29_706: i64 loc(callsite(#loc1 at #loc192))): + %tmp29_707 = arith.addi %tmp29_705, %tmp29_706 : i64 loc(#loc300) + tt.reduce.return %tmp29_707 : i64 loc(#loc224) + }) : (tensor<1x16xi64>) -> tensor<1xi64> loc(#loc224) + %tmp29_702 = tt.expand_dims %tmp29 {axis = 1 : i32} : tensor<1xi64> -> tensor<1x1xi64> loc(#loc193) + %tmp30 = arith.trunci %tmp24_701 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc194) + %tmp31 = arith.trunci %tmp29_702 : tensor<1x1xi64> to tensor<1x1xi32> loc(#loc195) + %tmp34 = tt.broadcast %tmp30 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc196) + %tmp34_703 = arith.cmpi slt, %r0_index_8, %tmp34 : tensor<1x16xi32> loc(#loc196) + %tmp36 = arith.select %tmp34_703, %new_idxs_368, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc197) + %tmp38 = arith.addi %tmp36, %cst_2 : tensor<1x16xi32> loc(#loc198) + %tmp39 = arith.cmpi slt, %tmp36, %cst_1 : tensor<1x16xi32> loc(#loc199) + %tmp40 = arith.select %tmp39, %tmp38, %tmp36 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc200) + %0 = arith.cmpi sge, %tmp40, %cst_1 : tensor<1x16xi32> loc(#loc83) + %1 = arith.cmpi slt, %tmp40, %cst_2 : tensor<1x16xi32> loc(#loc84) + %2 = arith.andi %0, %1 : tensor<1x16xi1> loc(#loc85) + %3 = arith.xori %xmask_6, %true : i1 loc(#loc86) + %4 = tt.splat %3 : i1 -> tensor<1x16xi1> loc(#loc201) + %5 = arith.ori %2, %4 : tensor<1x16xi1> loc(#loc87) + tt.assert %5, "index out of bounds: 0 <= tmp40 < 17" : tensor<1x16xi1> loc(#loc88) + %tmp45 = tt.broadcast %tmp31 : tensor<1x1xi32> -> tensor<1x16xi32> loc(#loc202) + %tmp45_704 = arith.cmpi slt, %r0_index_8, %tmp45 : tensor<1x16xi32> loc(#loc202) + %tmp46 = arith.select %tmp45_704, %new_idxs_700, %cst_3 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc203) + %tmp47 = arith.addi %tmp46, %cst_2 : tensor<1x16xi32> loc(#loc204) + %tmp48 = arith.cmpi slt, %tmp46, %cst_1 : tensor<1x16xi32> loc(#loc205) + %tmp49 = arith.select %tmp48, %tmp47, %tmp46 : tensor<1x16xi1>, tensor<1x16xi32> loc(#loc206) + %6 = arith.cmpi sge, %tmp49, %cst_1 : tensor<1x16xi32> loc(#loc94) + %7 = arith.cmpi slt, %tmp49, %cst_2 : tensor<1x16xi32> loc(#loc95) + %8 = arith.andi %6, %7 : tensor<1x16xi1> loc(#loc96) + %9 = arith.ori %8, %4 : tensor<1x16xi1> loc(#loc97) + tt.assert %9, "index out of bounds: 0 <= tmp49 < 17" : tensor<1x16xi1> loc(#loc98) + %10 = tt.addptr %out_ptr4, %xoffset : !tt.ptr, i32 loc(#loc99) + %11 = tt.splat %10 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc99) + tt.store %11, %tmp30, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc100) + %12 = tt.addptr %out_ptr5, %xoffset : !tt.ptr, i32 loc(#loc101) + %13 = tt.splat %12 : !tt.ptr -> tensor<1x1x!tt.ptr> loc(#loc101) + tt.store %13, %tmp31, %xmask_7 : tensor<1x1x!tt.ptr> loc(#loc102) + %14 = tt.splat %out_ptr6 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc103) + %15 = tt.addptr %14, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc103) + tt.store %15, %new_idxs_368, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc104) + %16 = arith.muli %xoffset, %c17_i32 : i32 loc(#loc105) + %17 = tt.splat %16 : i32 -> tensor<1x16xi32> loc(#loc207) + %18 = arith.addi %tmp40, %17 : tensor<1x16xi32> loc(#loc106) + %19 = tt.splat %out_ptr7 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc107) + %20 = tt.addptr %19, %18 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc107) + tt.store %20, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc108) + %21 = tt.splat %out_ptr8 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc109) + %22 = tt.addptr %21, %tmp0_10 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc109) + tt.store %22, %new_idxs_700, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc110) + %23 = arith.addi %tmp49, %17 : tensor<1x16xi32> loc(#loc111) + %24 = tt.splat %out_ptr9 : !tt.ptr -> tensor<1x16x!tt.ptr> loc(#loc112) + %25 = tt.addptr %24, %23 : tensor<1x16x!tt.ptr>, tensor<1x16xi32> loc(#loc112) + tt.store %25, %cst_0, %tmp0_13 : tensor<1x16x!tt.ptr> loc(#loc113) + tt.return loc(#loc114) + } loc(#loc) +} loc(#loc) +#loc2 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":26:21) +#loc3 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":24:28) +#loc4 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:28) +#loc5 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":27:38) +#loc6 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:40) +#loc7 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:37) +#loc8 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:30) +#loc9 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":34:45) +#loc10 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":36:18) +#loc11 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":38:18) +#loc12 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":39:18) +#loc13 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":41:19) +#loc14 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":40:19) +#loc15 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":43:19) +#loc16 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:41) +#loc19 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:44) +#loc20 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:60) +#loc21 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":627:68) +#loc22 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":533:22) +#loc24 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":537:21) +#loc25 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:40) +#loc26 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":291:36) +#loc28 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":261:15) +#loc29 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:65) +#loc30 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":538:78) +#loc31 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:41) +#loc33 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:67) +#loc34 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":539:80) +#loc35 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":540:30) +#loc36 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":541:32) +#loc37 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":546:29) +#loc38 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:36) +#loc39 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:23) +#loc40 = loc("/workspace/specforge/lib/python3.11/site-packages/triton/language/standard.py":290:25) +#loc42 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:53) +#loc43 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":548:66) +#loc44 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:37) +#loc45 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:23) +#loc47 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:54) +#loc48 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":551:67) +#loc49 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":553:36) +#loc50 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":554:38) +#loc51 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":574:22) +#loc52 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":591:21) +#loc53 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:40) +#loc54 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:29) +#loc55 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":594:23) +#loc56 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:19) +#loc57 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":599:28) +#loc58 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:38) +#loc59 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:46) +#loc60 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":600:15) +#loc61 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:48) +#loc62 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:59) +#loc63 = loc("/workspace/specforge/lib/python3.11/site-packages/torch/_inductor/runtime/triton_helpers.py":601:22) +#loc64 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":47:20) +#loc65 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":49:21) +#loc66 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":48:21) +#loc68 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":52:20) +#loc69 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":54:35) +#loc71 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":55:29) +#loc72 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":56:21) +#loc73 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":58:35) +#loc75 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":59:29) +#loc76 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":60:21) +#loc77 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":61:21) +#loc78 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":64:19) +#loc79 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":66:35) +#loc80 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":68:20) +#loc81 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":69:20) +#loc82 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":70:35) +#loc83 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:28) +#loc84 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:46) +#loc85 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:38) +#loc86 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:55) +#loc87 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:53) +#loc88 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":71:63) +#loc89 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":75:19) +#loc90 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":76:35) +#loc91 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":77:20) +#loc92 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":78:20) +#loc93 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":79:35) +#loc94 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:28) +#loc95 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:46) +#loc96 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:38) +#loc97 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:53) +#loc98 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":80:63) +#loc99 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:25) +#loc100 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":81:37) +#loc101 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:25) +#loc102 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":82:37) +#loc103 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:25) +#loc104 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":83:47) +#loc105 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:52) +#loc106 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:49) +#loc107 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:25) +#loc108 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":84:85) +#loc109 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:25) +#loc110 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":85:47) +#loc111 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:49) +#loc112 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:25) +#loc113 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:85) +#loc114 = loc("/workspace/hanrui/SpecForge-ext/cache/compiled_kernels/fh/cfhw7jsr5sk74kchdhqog3qyq7ge3qjqmmqwy37lr2be4lpykzx3.py":86:4) +#loc124 = loc("xmask"(#loc2)) +#loc125 = loc("xoffset"(#loc3)) +#loc126 = loc("r0_index"(#loc4)) +#loc127 = loc("r0_index"(#loc5)) +#loc128 = loc("tmp0"(#loc6)) +#loc129 = loc("tmp0"(#loc7)) +#loc130 = loc("tmp0"(#loc8)) +#loc131 = loc("tmp0"(#loc9)) +#loc132 = loc("tmp2"(#loc10)) +#loc133 = loc("tmp4"(#loc11)) +#loc134 = loc("tmp5"(#loc12)) +#loc135 = loc("tmp7"(#loc13)) +#loc136 = loc("tmp6"(#loc14)) +#loc137 = loc("tmp9"(#loc15)) +#loc138 = loc("flip"(#loc16)) +#loc140 = loc("flip"(#loc19)) +#loc141 = loc("flip"(#loc20)) +#loc142 = loc("flip"(#loc21)) +#loc143 = loc("y"(#loc22)) +#loc144 = loc("left_mask"(#loc24)) +#loc145 = loc("ileft"(#loc25)) +#loc147 = loc("ileft"(#loc29)) +#loc148 = loc("ileft"(#loc30)) +#loc149 = loc("iright"(#loc31)) +#loc151 = loc("iright"(#loc33)) +#loc152 = loc("iright"(#loc34)) +#loc153 = loc("ileft"(#loc35)) +#loc154 = loc("iright"(#loc36)) +#loc155 = loc("y_idx"(#loc37)) +#loc156 = loc("left_idx"(#loc38)) +#loc157 = loc("left_idx"(#loc39)) +#loc158 = loc("input"(#loc40)) +#loc160 = loc("left_idx"(#loc42)) +#loc161 = loc("left_idx"(#loc43)) +#loc162 = loc("right_idx"(#loc44)) +#loc163 = loc("right_idx"(#loc45)) +#loc165 = loc("right_idx"(#loc47)) +#loc166 = loc("right_idx"(#loc48)) +#loc167 = loc("left_idx"(#loc49)) +#loc168 = loc("right_idx"(#loc50)) +#loc169 = loc("cond"(#loc51)) +#loc170 = loc("eq"(#loc52)) +#loc171 = loc("cond"(#loc53)) +#loc172 = loc("cond"(#loc54)) +#loc173 = loc("cond"(#loc55)) +#loc174 = loc("cond"(#loc56)) +#loc175 = loc("cond"(#loc57)) +#loc176 = loc("ret"(#loc58)) +#loc177 = loc("ret"(#loc59)) +#loc178 = loc("ret"(#loc60)) +#loc179 = loc("new_idxs"(#loc61)) +#loc180 = loc("new_idxs"(#loc62)) +#loc181 = loc("new_idxs"(#loc63)) +#loc182 = loc("tmp14"(#loc64)) +#loc183 = loc("tmp16"(#loc65)) +#loc184 = loc("tmp15"(#loc66)) +#loc186 = loc("tmp20"(#loc68)) +#loc187 = loc("tmp23"(#loc69)) +#loc189 = loc("tmp24"(#loc71)) +#loc190 = loc("tmp25"(#loc72)) +#loc191 = loc("tmp28"(#loc73)) +#loc193 = loc("tmp29"(#loc75)) +#loc194 = loc("tmp30"(#loc76)) +#loc195 = loc("tmp31"(#loc77)) +#loc196 = loc("tmp34"(#loc78)) +#loc197 = loc("tmp36"(#loc79)) +#loc198 = loc("tmp38"(#loc80)) +#loc199 = loc("tmp39"(#loc81)) +#loc200 = loc("tmp40"(#loc82)) +#loc201 = loc(fused[#loc87, #loc86]) +#loc202 = loc("tmp45"(#loc89)) +#loc203 = loc("tmp46"(#loc90)) +#loc204 = loc("tmp47"(#loc91)) +#loc205 = loc("tmp48"(#loc92)) +#loc206 = loc("tmp49"(#loc93)) +#loc207 = loc(fused[#loc106, #loc105]) +#loc208 = loc(fused[#loc129, #loc128]) +#loc209 = loc(fused[#loc131, #loc124]) +#loc210 = loc(fused[#loc135, #loc136]) +#loc211 = loc(callsite(#loc138 at #loc139)) +#loc212 = loc(callsite(#loc140 at #loc139)) +#loc213 = loc(callsite(#loc141 at #loc139)) +#loc214 = loc(callsite(#loc142 at #loc139)) +#loc216 = loc("cond"(#loc169)) +#loc217 = loc("eq"(#loc170)) +#loc218 = loc(fused[#loc183, #loc184]) +#loc220 = loc(fused[#loc186, #loc135, #loc136]) +#loc221 = loc(callsite(#loc26 at #loc188)) +#loc223 = loc(fused[#loc190, #loc183, #loc184]) +#loc224 = loc(callsite(#loc26 at #loc192)) +#loc226 = loc(callsite(#loc143 at #loc215)) +#loc227 = loc(callsite(#loc144 at #loc215)) +#loc228 = loc(callsite(#loc145 at #loc215)) +#loc230 = loc(callsite(#loc147 at #loc215)) +#loc231 = loc(callsite(#loc148 at #loc215)) +#loc232 = loc(callsite(#loc149 at #loc215)) +#loc234 = loc(callsite(#loc151 at #loc215)) +#loc235 = loc(callsite(#loc152 at #loc215)) +#loc236 = loc(callsite(#loc153 at #loc215)) +#loc237 = loc(callsite(#loc154 at #loc215)) +#loc238 = loc(callsite(#loc155 at #loc215)) +#loc239 = loc(callsite(#loc156 at #loc215)) +#loc240 = loc(callsite(#loc157 at #loc215)) +#loc242 = loc(callsite(#loc160 at #loc215)) +#loc243 = loc(callsite(#loc161 at #loc215)) +#loc244 = loc(callsite(#loc162 at #loc215)) +#loc245 = loc(callsite(#loc163 at #loc215)) +#loc247 = loc(callsite(#loc165 at #loc215)) +#loc248 = loc(callsite(#loc166 at #loc215)) +#loc249 = loc(callsite(#loc167 at #loc215)) +#loc250 = loc(callsite(#loc168 at #loc215)) +#loc251 = loc(callsite(#loc216 at #loc215)) +#loc252 = loc(callsite(#loc217 at #loc215)) +#loc253 = loc(callsite(#loc171 at #loc215)) +#loc254 = loc(callsite(#loc172 at #loc215)) +#loc255 = loc(callsite(#loc173 at #loc215)) +#loc256 = loc(callsite(#loc174 at #loc215)) +#loc257 = loc(callsite(#loc175 at #loc215)) +#loc258 = loc(callsite(#loc176 at #loc215)) +#loc259 = loc(callsite(#loc177 at #loc215)) +#loc260 = loc(callsite(#loc178 at #loc215)) +#loc261 = loc(callsite(#loc179 at #loc215)) +#loc262 = loc(callsite(#loc180 at #loc215)) +#loc263 = loc(callsite(#loc181 at #loc215)) +#loc264 = loc(callsite(#loc143 at #loc219)) +#loc265 = loc(callsite(#loc145 at #loc219)) +#loc267 = loc(callsite(#loc147 at #loc219)) +#loc268 = loc(callsite(#loc148 at #loc219)) +#loc269 = loc(callsite(#loc149 at #loc219)) +#loc271 = loc(callsite(#loc151 at #loc219)) +#loc272 = loc(callsite(#loc152 at #loc219)) +#loc273 = loc(callsite(#loc153 at #loc219)) +#loc274 = loc(callsite(#loc154 at #loc219)) +#loc275 = loc(callsite(#loc216 at #loc219)) +#loc276 = loc(callsite(#loc217 at #loc219)) +#loc277 = loc(callsite(#loc172 at #loc219)) +#loc278 = loc(callsite(#loc173 at #loc219)) +#loc279 = loc(callsite(#loc174 at #loc219)) +#loc280 = loc(callsite(#loc175 at #loc219)) +#loc281 = loc(callsite(#loc176 at #loc219)) +#loc282 = loc(callsite(#loc177 at #loc219)) +#loc283 = loc(callsite(#loc178 at #loc219)) +#loc284 = loc(callsite(#loc180 at #loc219)) +#loc285 = loc(callsite(#loc181 at #loc219)) +#loc286 = loc(callsite(#loc155 at #loc219)) +#loc287 = loc(callsite(#loc157 at #loc219)) +#loc289 = loc(callsite(#loc160 at #loc219)) +#loc290 = loc(callsite(#loc161 at #loc219)) +#loc291 = loc(callsite(#loc163 at #loc219)) +#loc293 = loc(callsite(#loc165 at #loc219)) +#loc294 = loc(callsite(#loc166 at #loc219)) +#loc295 = loc(callsite(#loc167 at #loc219)) +#loc296 = loc(callsite(#loc168 at #loc219)) +#loc297 = loc(callsite(#loc171 at #loc219)) +#loc298 = loc(callsite(#loc179 at #loc219)) +#loc299 = loc(callsite(#loc28 at #loc221)) +#loc300 = loc(callsite(#loc28 at #loc224)) +#loc301 = loc(callsite(#loc26 at #loc229)) +#loc303 = loc(callsite(#loc26 at #loc233)) +#loc305 = loc(callsite(#loc158 at #loc241)) +#loc306 = loc(callsite(#loc26 at #loc241)) +#loc308 = loc(callsite(#loc158 at #loc246)) +#loc309 = loc(callsite(#loc26 at #loc246)) +#loc311 = loc(callsite(#loc26 at #loc266)) +#loc313 = loc(callsite(#loc26 at #loc270)) +#loc315 = loc(callsite(#loc26 at #loc288)) +#loc317 = loc(callsite(#loc26 at #loc292)) +#loc319 = loc(callsite(#loc28 at #loc301)) +#loc320 = loc(callsite(#loc28 at #loc303)) +#loc321 = loc(callsite(#loc28 at #loc306)) +#loc322 = loc(callsite(#loc28 at #loc309)) +#loc323 = loc(callsite(#loc28 at #loc311)) +#loc324 = loc(callsite(#loc28 at #loc313)) +#loc325 = loc(callsite(#loc28 at #loc315)) +#loc326 = loc(callsite(#loc28 at #loc317))